1833 files changed, 276305 insertions, 55659 deletions
diff --git a/libavcodec/012v.c b/libavcodec/012v.c
new file mode 100644
index 0000000..b5a4066
--- /dev/null
+++ b/libavcodec/012v.c
@@ -0,0 +1,155 @@
+/*
+ * 012v decoder
+ *
+ * Copyright (C) 2012 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+#include "libavutil/intreadwrite.h"
+
+static av_cold int zero12v_decode_init(AVCodecContext *avctx)
+{
+    /* Fixed output layout: 10 significant bits in 16-bit planar 4:2:2. */
+    avctx->bits_per_raw_sample = 10;
+    avctx->pix_fmt             = AV_PIX_FMT_YUV422P16;
+    /* The 'a12v' tag only calls avpriv_request_sample(); init still succeeds. */
+    if (avctx->codec_tag == MKTAG('a', '1', '2', 'v'))
+        avpriv_request_sample(avctx, "transparency");
+    return 0;
+}
+
+/**
+ * Decode one 012v packet into a planar 16-bit 4:2:2 frame.
+ * Every 32-bit little-endian word holds three 10-bit samples; four words
+ * (16 bytes) unpack to 6 luma, 3 U and 3 V samples, each stored in the
+ * upper 10 bits of a 16-bit output sample.
+ * @return avpkt->size on success, a negative error code on failure
+ */
+static int zero12v_decode_frame(AVCodecContext *avctx, void *data,
+                                int *got_frame, AVPacket *avpkt)
+{
+    int line, ret;
+    const int width = avctx->width;
+    AVFrame *pic = data;
+    uint16_t *y, *u, *v;
+    const uint8_t *line_end, *src = avpkt->data;
+    int stride = avctx->width * 8 / 3; /* 16 packed bytes per 6 pixels */
+
+    if (width <= 1 || avctx->height <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "Dimensions %dx%d not supported.\n", width, avctx->height);
+        return AVERROR_INVALIDDATA;
+    }
+    /* For '012v', use the packet's per-line size when it divides evenly and is large enough. */
+    if (   avctx->codec_tag == MKTAG('0', '1', '2', 'v')
+        && avpkt->size % avctx->height == 0
+        && avpkt->size / avctx->height * 3 >= width * 8)
+        stride = avpkt->size / avctx->height;
+
+    if (avpkt->size < avctx->height * stride) {
+        av_log(avctx, AV_LOG_ERROR, "Packet too small: %d instead of %d\n",
+               avpkt->size, avctx->height * stride);
+        return AVERROR_INVALIDDATA;
+    }
+    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
+        return ret;
+
+    pic->pict_type = AV_PICTURE_TYPE_I;
+    pic->key_frame = 1;
+
+    line_end = avpkt->data + stride;
+    for (line = 0; line < avctx->height; line++) {
+        uint16_t y_temp[6] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
+        uint16_t u_temp[3] = {0x8000, 0x8000, 0x8000};
+        uint16_t v_temp[3] = {0x8000, 0x8000, 0x8000};
+        int x;
+        y = (uint16_t *)(pic->data[0] + line * pic->linesize[0]);
+        u = (uint16_t *)(pic->data[1] + line * pic->linesize[1]);
+        v = (uint16_t *)(pic->data[2] + line * pic->linesize[2]);
+
+        for (x = 0; x < width; x += 6) {
+            uint32_t t;
+            /* A full group would overrun the row or the input line: unpack into scratch. */
+            if (width - x < 6 || line_end - src < 16) {
+                y = y_temp;
+                u = u_temp;
+                v = v_temp;
+            }
+            if (line_end - src < 4)
+                break;
+            t = AV_RL32(src); /* bits 0-9: U, 10-19: Y, 20-29: V */
+            src += 4;
+            *u++ = t <<  6 & 0xFFC0;
+            *y++ = t >>  4 & 0xFFC0;
+            *v++ = t >> 14 & 0xFFC0;
+
+            if (line_end - src < 4)
+                break;
+            t = AV_RL32(src); /* bits 0-9: Y, 10-19: U, 20-29: Y */
+            src += 4;
+            *y++ = t <<  6 & 0xFFC0;
+            *u++ = t >>  4 & 0xFFC0;
+            *y++ = t >> 14 & 0xFFC0;
+
+            if (line_end - src < 4)
+                break;
+            t = AV_RL32(src); /* bits 0-9: V, 10-19: Y, 20-29: U */
+            src += 4;
+            *v++ = t <<  6 & 0xFFC0;
+            *y++ = t >>  4 & 0xFFC0;
+            *u++ = t >> 14 & 0xFFC0;
+
+            if (line_end - src < 4)
+                break;
+            t = AV_RL32(src); /* bits 0-9: Y, 10-19: V, 20-29: Y */
+            src += 4;
+            *y++ = t <<  6 & 0xFFC0;
+            *v++ = t >>  4 & 0xFFC0;
+            *y++ = t >> 14 & 0xFFC0;
+
+            if (width - x < 6)
+                break;
+        }
+        /* Flush scratch: width - x luma and (width - x + 1) / 2 whole chroma samples. */
+        if (x < width) {
+            y = x   + (uint16_t *)(pic->data[0] + line * pic->linesize[0]);
+            u = x/2 + (uint16_t *)(pic->data[1] + line * pic->linesize[1]);
+            v = x/2 + (uint16_t *)(pic->data[2] + line * pic->linesize[2]);
+            memcpy(y, y_temp, sizeof(*y) * (width - x));
+            memcpy(u, u_temp, sizeof(*u) * ((width - x + 1) / 2));
+            memcpy(v, v_temp, sizeof(*v) * ((width - x + 1) / 2));
+        }
+
+        line_end += stride;
+        src = line_end - stride;
+    }
+
+    *got_frame = 1;
+    return avpkt->size;
+}
+
+AVCodec ff_zero12v_decoder = {
+    .name         = "012v",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed 4:2:2 10-bit"),
+    .id           = AV_CODEC_ID_012V,
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .capabilities = AV_CODEC_CAP_DR1,
+    .init         = zero12v_decode_init,  /* sets pix_fmt and bit depth */
+    .decode       = zero12v_decode_frame, /* unpacks one packet per call */
+};
diff --git a/libavcodec/4xm.c b/libavcodec/4xm.c
index b2d4db2..a7a757a 100644
--- a/libavcodec/4xm.c
+++ b/libavcodec/4xm.c
@@ -2,20 +2,20 @@
  * 4XM codec
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
 
 #include <inttypes.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/frame.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/intreadwrite.h"
@@ -36,6 +37,7 @@
 #include "get_bits.h"
 #include "internal.h"
 
+
 #define BLOCK_TYPE_VLC_BITS 5
 #define ACDC_VLC_BITS 9
 
@@ -289,7 +291,7 @@ static void init_mv(FourXContext *f, int linesize)
     }
 #endif
 
-static inline void mcdc(uint16_t *dst, uint16_t *src, int log2w,
+static inline void mcdc(uint16_t *dst, const uint16_t *src, int log2w,
                         int h, int stride, int scale, unsigned dc)
 {
     int i;
@@ -333,36 +335,32 @@ static inline void mcdc(uint16_t *dst, uint16_t *src, int log2w,
         }
         break;
     default:
-        break;
+        av_assert0(0);
     }
 }
 
-static int decode_p_block(FourXContext *f, uint16_t *dst, uint16_t *src,
+static int decode_p_block(FourXContext *f, uint16_t *dst, const uint16_t *src,
                           int log2w, int log2h, int stride)
 {
     int index, h, code, ret, scale = 1;
     uint16_t *start, *end;
     unsigned dc = 0;
 
-    if (log2h < 0 || log2w < 0)
-        return AVERROR_INVALIDDATA;
+    av_assert0(log2w >= 0 && log2h >= 0);
 
     index = size2index[log2h][log2w];
-    if (index < 0)
-        return AVERROR_INVALIDDATA;
+    av_assert0(index >= 0);
 
     h     = 1 << log2h;
     code  = get_vlc2(&f->gb, block_type_vlc[1 - (f->version > 1)][index].table,
                      BLOCK_TYPE_VLC_BITS, 1);
-    if (code < 0 || code > 6)
-        return AVERROR_INVALIDDATA;
+    av_assert0(code >= 0 && code <= 6);
 
     start = f->last_frame_buffer;
     end   = start + stride * (f->avctx->height - h + 1) - (1 << log2w);
 
     if (code == 1) {
-        if (--log2h < 0)
-            return AVERROR_INVALIDDATA;
+        log2h--;
         if ((ret = decode_p_block(f, dst, src, log2w, log2h, stride)) < 0)
             return ret;
         return decode_p_block(f, dst + (stride << log2h),
@@ -376,24 +374,42 @@ static int decode_p_block(FourXContext *f, uint16_t *dst, uint16_t *src,
                               src + (1 << log2w),
                               log2w, log2h, stride);
     } else if (code == 6) {
+        if (bytestream2_get_bytes_left(&f->g2) < 4) {
+            av_log(f->avctx, AV_LOG_ERROR, "wordstream overread\n");
+            return AVERROR_INVALIDDATA;
+        }
         if (log2w) {
-            dst[0]      = bytestream2_get_le16(&f->g2);
-            dst[1]      = bytestream2_get_le16(&f->g2);
+            dst[0]      = bytestream2_get_le16u(&f->g2);
+            dst[1]      = bytestream2_get_le16u(&f->g2);
         } else {
-            dst[0]      = bytestream2_get_le16(&f->g2);
-            dst[stride] = bytestream2_get_le16(&f->g2);
+            dst[0]      = bytestream2_get_le16u(&f->g2);
+            dst[stride] = bytestream2_get_le16u(&f->g2);
         }
         return 0;
     }
 
+    if ((code&3)==0 && bytestream2_get_bytes_left(&f->g) < 1) {
+        av_log(f->avctx, AV_LOG_ERROR, "bytestream overread\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     if (code == 0) {
         src  += f->mv[bytestream2_get_byte(&f->g)];
     } else if (code == 3 && f->version >= 2) {
         return 0;
     } else if (code == 4) {
         src  += f->mv[bytestream2_get_byte(&f->g)];
+        if (bytestream2_get_bytes_left(&f->g2) < 2){
+            av_log(f->avctx, AV_LOG_ERROR, "wordstream overread\n");
+            return AVERROR_INVALIDDATA;
+        }
         dc    = bytestream2_get_le16(&f->g2);
     } else if (code == 5) {
+        if (bytestream2_get_bytes_left(&f->g2) < 2){
+            av_log(f->avctx, AV_LOG_ERROR, "wordstream overread\n");
+            return AVERROR_INVALIDDATA;
+        }
+        av_assert0(start <= src && src <= end);
         scale = 0;
         dc    = bytestream2_get_le16(&f->g2);
     }
@@ -422,9 +438,9 @@ static int decode_p_frame(FourXContext *f, const uint8_t *buf, int length)
     src = f->last_frame_buffer;
 
     if (f->version > 1) {
-        if (length < 20)
-            return AVERROR_INVALIDDATA;
         extra           = 20;
+        if (length < extra)
+            return AVERROR_INVALIDDATA;
         bitstream_size  = AV_RL32(buf + 8);
         wordstream_size = AV_RL32(buf + 12);
         bytestream_size = AV_RL32(buf + 16);
@@ -435,24 +451,21 @@ static int decode_p_frame(FourXContext *f, const uint8_t *buf, int length)
         bytestream_size = FFMAX(length - bitstream_size - wordstream_size, 0);
     }
 
-    if (bitstream_size + bytestream_size + wordstream_size + extra != length
-        || bitstream_size  > (1 << 26)
-        || bytestream_size > (1 << 26)
-        || wordstream_size > (1 << 26)) {
-        av_log(f->avctx, AV_LOG_ERROR, "lengths %d %d %d %d\n",
-               bitstream_size, bytestream_size, wordstream_size,
-               bitstream_size + bytestream_size + wordstream_size - length);
+    if (bitstream_size > length || bitstream_size >= INT_MAX/8 ||
+        bytestream_size > length - bitstream_size ||
+        wordstream_size > length - bytestream_size - bitstream_size ||
+        extra > length - bytestream_size - bitstream_size - wordstream_size) {
+        av_log(f->avctx, AV_LOG_ERROR, "lengths %d %d %d %d\n", bitstream_size, bytestream_size, wordstream_size,
+        bitstream_size+ bytestream_size+ wordstream_size - length);
         return AVERROR_INVALIDDATA;
     }
 
-    av_fast_malloc(&f->bitstream_buffer, &f->bitstream_buffer_size,
-                   bitstream_size + AV_INPUT_BUFFER_PADDING_SIZE);
+    av_fast_padded_malloc(&f->bitstream_buffer, &f->bitstream_buffer_size,
+                          bitstream_size);
     if (!f->bitstream_buffer)
         return AVERROR(ENOMEM);
     f->bbdsp.bswap_buf(f->bitstream_buffer, (const uint32_t *) (buf + extra),
                        bitstream_size / 4);
-    memset((uint8_t*)f->bitstream_buffer + bitstream_size,
-           0, AV_INPUT_BUFFER_PADDING_SIZE);
     init_get_bits(&f->gb, f->bitstream_buffer, 8 * bitstream_size);
 
     wordstream_offset = extra + bitstream_size;
@@ -483,10 +496,17 @@ static int decode_i_block(FourXContext *f, int16_t *block)
 {
     int code, i, j, level, val;
 
+    if (get_bits_left(&f->gb) < 2){
+        av_log(f->avctx, AV_LOG_ERROR, "%d bits left before decode_i_block()\n", get_bits_left(&f->gb));
+        return -1;
+    }
+
     /* DC coef */
     val = get_vlc2(&f->pre_gb, f->pre_vlc.table, ACDC_VLC_BITS, 3);
-    if (val >> 4)
+    if (val >> 4) {
         av_log(f->avctx, AV_LOG_ERROR, "error dc run != 0\n");
+        return AVERROR_INVALIDDATA;
+    }
 
     if (val)
         val = get_xbits(&f->gb, val);
@@ -504,7 +524,12 @@ static int decode_i_block(FourXContext *f, int16_t *block)
         if (code == 0xf0) {
             i += 16;
         } else {
-            level = get_xbits(&f->gb, code & 0xf);
+            if (code & 0xf) {
+                level = get_xbits(&f->gb, code & 0xf);
+            } else {
+                av_log(f->avctx, AV_LOG_ERROR, "0 coeff\n");
+                return AVERROR_INVALIDDATA;
+            }
             i    += code >> 4;
             if (i >= 64) {
                 av_log(f->avctx, AV_LOG_ERROR, "run %d oveflow\n", i);
@@ -584,7 +609,7 @@ static int decode_i_mb(FourXContext *f)
 
 static const uint8_t *read_huffman_tables(FourXContext *f,
                                           const uint8_t * const buf,
-                                          int len)
+                                          int buf_size)
 {
     int frequency[512] = { 0 };
     uint8_t flag[512];
@@ -593,6 +618,7 @@ static const uint8_t *read_huffman_tables(FourXContext *f,
     int bits_tab[257];
     int start, end;
     const uint8_t *ptr = buf;
+    const uint8_t *ptr_end = buf + buf_size;
     int j;
 
     memset(up, -1, sizeof(up));
@@ -602,10 +628,10 @@ static const uint8_t *read_huffman_tables(FourXContext *f,
     for (;;) {
         int i;
 
-        len -= end - start + 1;
-
-        if (end < start || len < 0)
+        if (ptr_end - ptr < FFMAX(end - start + 1, 0) + 1) {
+            av_log(f->avctx, AV_LOG_ERROR, "invalid data in read_huffman_tables\n");
             return NULL;
+        }
 
         for (i = start; i <= end; i++)
             frequency[i] = *ptr++;
@@ -613,9 +639,6 @@ static const uint8_t *read_huffman_tables(FourXContext *f,
         if (start == 0)
             break;
 
-        if (--len < 0)
-            return NULL;
-
         end = *ptr++;
     }
     frequency[256] = 1;
@@ -623,6 +646,11 @@ static const uint8_t *read_huffman_tables(FourXContext *f,
     while ((ptr - buf) & 3)
         ptr++; // 4byte align
 
+    if (ptr > ptr_end) {
+        av_log(f->avctx, AV_LOG_ERROR, "ptr overflow in read_huffman_tables\n");
+        return NULL;
+    }
+
     for (j = 257; j < 512; j++) {
         int min_freq[2] = { 256 * 256, 256 * 256 };
         int smallest[2] = { 0, 0 };
@@ -691,6 +719,7 @@ static int decode_i2_frame(FourXContext *f, const uint8_t *buf, int length)
     const int height = f->avctx->height;
     const int mbs    = (FFALIGN(width, 16) >> 4) * (FFALIGN(height, 16) >> 4);
     uint16_t *dst    = f->frame_buffer;
+    const uint8_t *buf_end = buf + length;
     GetByteContext g3;
 
     if (length < mbs * 8) {
@@ -702,6 +731,8 @@ static int decode_i2_frame(FourXContext *f, const uint8_t *buf, int length)
     for (y = 0; y < height; y += 16) {
         for (x = 0; x < width; x += 16) {
             unsigned int color[4] = { 0 }, bits;
+            if (buf_end - buf < 8)
+                return -1;
             // warning following is purely guessed ...
             color[0] = bytestream2_get_le16u(&g3);
             color[1] = bytestream2_get_le16u(&g3);
@@ -735,7 +766,6 @@ static int decode_i_frame(FourXContext *f, const uint8_t *buf, int length)
     const int width  = f->avctx->width;
     const int height = f->avctx->height;
     const unsigned int bitstream_size = AV_RL32(buf);
-    int token_count av_unused;
     unsigned int prestream_size;
     const uint8_t *prestream;
 
@@ -747,7 +777,6 @@ static int decode_i_frame(FourXContext *f, const uint8_t *buf, int length)
         return AVERROR_INVALIDDATA;
     }
 
-    token_count    =     AV_RL32(buf + bitstream_size + 8);
     prestream_size = 4 * AV_RL32(buf + bitstream_size + 4);
     prestream      =             buf + bitstream_size + 12;
 
@@ -764,18 +793,18 @@ static int decode_i_frame(FourXContext *f, const uint8_t *buf, int length)
         return AVERROR_INVALIDDATA;
     }
 
+    av_assert0(prestream <= buf + length);
+
     init_get_bits(&f->gb, buf + 4, 8 * bitstream_size);
 
     prestream_size = length + buf - prestream;
 
-    av_fast_malloc(&f->bitstream_buffer, &f->bitstream_buffer_size,
-                   prestream_size + AV_INPUT_BUFFER_PADDING_SIZE);
+    av_fast_padded_malloc(&f->bitstream_buffer, &f->bitstream_buffer_size,
+                          prestream_size);
     if (!f->bitstream_buffer)
         return AVERROR(ENOMEM);
     f->bbdsp.bswap_buf(f->bitstream_buffer, (const uint32_t *) prestream,
                        prestream_size / 4);
-    memset((uint8_t*)f->bitstream_buffer + prestream_size,
-           0, AV_INPUT_BUFFER_PADDING_SIZE);
     init_get_bits(&f->pre_gb, f->bitstream_buffer, 8 * prestream_size);
 
     f->last_dc = 0 * 128 * 8 * 8;
@@ -807,11 +836,7 @@ static int decode_frame(AVCodecContext *avctx, void *data,
     if (buf_size < 20)
         return AVERROR_INVALIDDATA;
 
-    if (avctx->width % 16 || avctx->height % 16) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Dimensions non-multiple of 16 are invalid.\n");
-        return AVERROR_INVALIDDATA;
-    }
+    av_assert0(avctx->width % 16 == 0 && avctx->height % 16 == 0);
 
     if (buf_size < AV_RL32(buf + 4) + 8) {
         av_log(f->avctx, AV_LOG_ERROR, "size mismatch %d %"PRIu32"\n",
@@ -827,9 +852,19 @@ static int decode_frame(AVCodecContext *avctx, void *data,
         const int data_size  = buf_size - 20;
         CFrameBuffer *cfrm;
 
+        if (f->version <= 1) {
+            av_log(f->avctx, AV_LOG_ERROR, "cfrm in version %d\n", f->version);
+            return AVERROR_INVALIDDATA;
+        }
+
         id         = AV_RL32(buf + 12);
         whole_size = AV_RL32(buf + 16);
 
+        if (data_size < 0 || whole_size < 0) {
+            av_log(f->avctx, AV_LOG_ERROR, "sizes invalid\n");
+            return AVERROR_INVALIDDATA;
+        }
+
         for (i = 0; i < CFRAME_BUFFER_COUNT; i++)
             if (f->cfrm[i].id && f->cfrm[i].id < avctx->frame_number)
                 av_log(f->avctx, AV_LOG_ERROR, "lost c frame %d\n",
@@ -848,11 +883,14 @@ static int decode_frame(AVCodecContext *avctx, void *data,
         }
         cfrm = &f->cfrm[i];
 
+        if (data_size > UINT_MAX -  cfrm->size - AV_INPUT_BUFFER_PADDING_SIZE)
+            return AVERROR_INVALIDDATA;
+
         cfrm->data = av_fast_realloc(cfrm->data, &cfrm->allocated_size,
                                      cfrm->size + data_size + AV_INPUT_BUFFER_PADDING_SIZE);
         // explicit check needed as memcpy below might not catch a NULL
         if (!cfrm->data) {
-            av_log(f->avctx, AV_LOG_ERROR, "realloc failure");
+            av_log(f->avctx, AV_LOG_ERROR, "realloc failure\n");
             return AVERROR(ENOMEM);
         }
 
@@ -879,24 +917,27 @@ static int decode_frame(AVCodecContext *avctx, void *data,
         frame_size = buf_size - 12;
     }
 
-
-    if ((ret = ff_get_buffer(avctx, picture, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, picture, 0)) < 0)
         return ret;
-    }
 
     if (frame_4cc == AV_RL32("ifr2")) {
         picture->pict_type = AV_PICTURE_TYPE_I;
-        if ((ret = decode_i2_frame(f, buf - 4, frame_size + 4)) < 0)
+        if ((ret = decode_i2_frame(f, buf - 4, frame_size + 4)) < 0) {
+            av_log(f->avctx, AV_LOG_ERROR, "decode i2 frame failed\n");
             return ret;
+        }
     } else if (frame_4cc == AV_RL32("ifrm")) {
         picture->pict_type = AV_PICTURE_TYPE_I;
-        if ((ret = decode_i_frame(f, buf, frame_size)) < 0)
+        if ((ret = decode_i_frame(f, buf, frame_size)) < 0) {
+            av_log(f->avctx, AV_LOG_ERROR, "decode i frame failed\n");
             return ret;
+        }
     } else if (frame_4cc == AV_RL32("pfrm") || frame_4cc == AV_RL32("pfr2")) {
         picture->pict_type = AV_PICTURE_TYPE_P;
-        if ((ret = decode_p_frame(f, buf, frame_size)) < 0)
+        if ((ret = decode_p_frame(f, buf, frame_size)) < 0) {
+            av_log(f->avctx, AV_LOG_ERROR, "decode p frame failed\n");
             return ret;
+        }
     } else if (frame_4cc == AV_RL32("snd_")) {
         av_log(avctx, AV_LOG_ERROR, "ignoring snd_ chunk length:%d\n",
                buf_size);
@@ -946,6 +987,10 @@ static av_cold int decode_init(AVCodecContext *avctx)
         av_log(avctx, AV_LOG_ERROR, "extradata wrong or missing\n");
         return AVERROR_INVALIDDATA;
     }
+    if((avctx->width % 16) || (avctx->height % 16)) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported width/height\n");
+        return AVERROR_INVALIDDATA;
+    }
 
     ret = av_image_check_size(avctx->width, avctx->height, 0, avctx);
     if (ret < 0)
diff --git a/libavcodec/8bps.c b/libavcodec/8bps.c
index 7ba2b31..46344e0 100644
--- a/libavcodec/8bps.c
+++ b/libavcodec/8bps.c
@@ -2,20 +2,20 @@
  * Quicktime Planar RGB (8BPS) Video Decoder
  * Copyright (C) 2003 Roberto Togni
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
  *
  * Supports: PAL8 (RGB 8bpp, paletted)
  *         : BGR24 (RGB 24bpp) (can also output it as RGB32)
- *         : RGB32 (RGB 32bpp, 4th plane is probably alpha and it's ignored)
+ *         : RGB32 (RGB 32bpp, 4th plane is alpha)
  */
 
 #include <stdio.h>
@@ -65,27 +65,18 @@ static int decode_frame(AVCodecContext *avctx, void *data,
     unsigned int dlen, p, row;
     const unsigned char *lp, *dp, *ep;
     unsigned char count;
-    unsigned int px_inc;
     unsigned int planes     = c->planes;
     unsigned char *planemap = c->planemap;
     int ret;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     ep = encoded + buf_size;
 
     /* Set data pointer after line lengths */
     dp = encoded + planes * (height << 1);
 
-    /* Ignore alpha plane, don't know what to do with it */
-    if (planes == 4)
-        planes--;
-
-    px_inc = planes + (avctx->pix_fmt == AV_PIX_FMT_RGB32);
-
     for (p = 0; p < planes; p++) {
         /* Lines length pointer for this plane */
         lp = encoded + p * (height << 1);
@@ -104,21 +95,21 @@ static int decode_frame(AVCodecContext *avctx, void *data,
                 if ((count = *dp++) <= 127) {
                     count++;
                     dlen -= count + 1;
-                    if (pixptr_end - pixptr < count * px_inc)
+                    if (pixptr_end - pixptr < count * planes)
                         break;
                     if (ep - dp < count)
                         return AVERROR_INVALIDDATA;
                     while (count--) {
                         *pixptr = *dp++;
-                        pixptr += px_inc;
+                        pixptr += planes;
                     }
                 } else {
                     count = 257 - count;
-                    if (pixptr_end - pixptr < count * px_inc)
+                    if (pixptr_end - pixptr < count * planes)
                         break;
                     while (count--) {
                         *pixptr = *dp;
-                        pixptr += px_inc;
+                        pixptr += planes;
                     }
                     dp++;
                     dlen -= 2;
@@ -179,7 +170,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
         c->planemap[0] = HAVE_BIGENDIAN ? 1 : 2; // 1st plane is red
         c->planemap[1] = HAVE_BIGENDIAN ? 2 : 1; // 2nd plane is green
         c->planemap[2] = HAVE_BIGENDIAN ? 3 : 0; // 3rd plane is blue
-        c->planemap[3] = HAVE_BIGENDIAN ? 0 : 3; // 4th plane is alpha???
+        c->planemap[3] = HAVE_BIGENDIAN ? 0 : 3; // 4th plane is alpha
     }
     return 0;
 }
diff --git a/libavcodec/8svx.c b/libavcodec/8svx.c
index fe90b16..edc945c 100644
--- a/libavcodec/8svx.c
+++ b/libavcodec/8svx.c
@@ -1,21 +1,21 @@
 /*
- * 8SVX audio decoder
  * Copyright (C) 2008 Jaikrishnan Menon
+ * Copyright (C) 2011 Stefano Sabatini
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,8 +26,18 @@
  *
  * supports: fibonacci delta encoding
  *         : exponential encoding
+ *
+ * For more information about the 8SVX format:
+ * http://netghost.narod.ru/gff/vendspec/iff/iff.txt
+ * http://sox.sourceforge.net/AudioFormats-11.html
+ * http://aminet.net/package/mus/misc/wavepak
+ * http://amigan.1emu.net/reg/8SVX.txt
+ *
+ * Samples can be found here:
+ * http://aminet.net/mods/smpl/
  */
 
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "internal.h"
 #include "libavutil/common.h"
@@ -44,18 +54,17 @@ typedef struct EightSvxContext {
     int data_idx;
 } EightSvxContext;
 
-static const int8_t fibonacci[16]   = { -34, -21, -13,  -8, -5, -3, -2, -1,
-                                          0,   1,   2,   3,  5,  8, 13, 21 };
-static const int8_t exponential[16] = { -128, -64, -32, -16, -8, -4, -2, -1,
-                                           0,   1,   2,   4,  8, 16, 32, 64 };
+static const int8_t fibonacci[16]   = { -34,  -21, -13,  -8, -5, -3, -2, -1, 0, 1, 2, 3, 5, 8,  13, 21 };
+static const int8_t exponential[16] = { -128, -64, -32, -16, -8, -4, -2, -1, 0, 1, 2, 4, 8, 16, 32, 64 };
 
-#define MAX_FRAME_SIZE 32768
+#define MAX_FRAME_SIZE 2048
 
 /**
  * Delta decode the compressed values in src, and put the resulting
  * decoded samples in dst.
  *
  * @param[in,out] state starting value. it is saved for use in the next call.
+ * @param table delta sequence table
  */
 static void delta_decode(uint8_t *dst, const uint8_t *src, int src_size,
                          uint8_t *state, const int8_t *table)
@@ -73,12 +82,6 @@ static void delta_decode(uint8_t *dst, const uint8_t *src, int src_size,
     *state = val;
 }
 
-static void raw_decode(uint8_t *dst, const int8_t *src, int src_size)
-{
-    while (src_size--)
-        *dst++ = *src++ + 128;
-}
-
 /** decode a frame */
 static int eightsvx_decode_frame(AVCodecContext *avctx, void *data,
                                  int *got_frame_ptr, AVPacket *avpkt)
@@ -87,27 +90,23 @@ static int eightsvx_decode_frame(AVCodecContext *avctx, void *data,
     AVFrame *frame       = data;
     int buf_size;
     int ch, ret;
-    int is_compr = (avctx->codec_id != AV_CODEC_ID_PCM_S8_PLANAR);
+    int hdr_size = 2;
 
-    /* for the first packet, copy data to buffer */
-    if (avpkt->data) {
-        int hdr_size  = is_compr ? 2 : 0;
-        int chan_size = (avpkt->size - hdr_size * avctx->channels) / avctx->channels;
+    /* decode and interleave the first packet */
+    if (!esc->data[0] && avpkt) {
+        int chan_size = avpkt->size / avctx->channels - hdr_size;
 
-        if (avpkt->size < hdr_size * avctx->channels) {
-            av_log(avctx, AV_LOG_ERROR, "packet size is too small\n");
-            return AVERROR_INVALIDDATA;
+        if (avpkt->size % avctx->channels) {
+            av_log(avctx, AV_LOG_WARNING, "Packet with odd size, ignoring last byte\n");
         }
-        if (esc->data[0]) {
-            av_log(avctx, AV_LOG_ERROR, "unexpected data after first packet\n");
+        if (avpkt->size < (hdr_size + 1) * avctx->channels) {
+            av_log(avctx, AV_LOG_ERROR, "packet size is too small\n");
             return AVERROR_INVALIDDATA;
         }
 
-        if (is_compr) {
         esc->fib_acc[0] = avpkt->data[1] + 128;
         if (avctx->channels == 2)
             esc->fib_acc[1] = avpkt->data[2+chan_size+1] + 128;
-        }
 
         esc->data_idx  = 0;
         esc->data_size = chan_size;
@@ -136,30 +135,22 @@ static int eightsvx_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     /* get output buffer */
-    frame->nb_samples = buf_size * (is_compr + 1);
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    frame->nb_samples = buf_size * 2;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     for (ch = 0; ch < avctx->channels; ch++) {
-        if (is_compr) {
-            delta_decode(frame->data[ch], &esc->data[ch][esc->data_idx],
-                         buf_size, &esc->fib_acc[ch], esc->table);
-        } else {
-            raw_decode(frame->data[ch], &esc->data[ch][esc->data_idx],
-                       buf_size);
-        }
+        delta_decode(frame->data[ch], &esc->data[ch][esc->data_idx],
+                     buf_size, &esc->fib_acc[ch], esc->table);
     }
 
     esc->data_idx += buf_size;
 
     *got_frame_ptr = 1;
 
-    return avpkt->size;
+    return ((avctx->frame_number == 0)*hdr_size + buf_size)*avctx->channels;
 }
 
-/** initialize 8svx decoder */
 static av_cold int eightsvx_decode_init(AVCodecContext *avctx)
 {
     EightSvxContext *esc = avctx->priv_data;
@@ -169,17 +160,12 @@ static av_cold int eightsvx_decode_init(AVCodecContext *avctx)
         return AVERROR_INVALIDDATA;
     }
 
-    switch(avctx->codec->id) {
-        case AV_CODEC_ID_8SVX_FIB:
-          esc->table = fibonacci;
-          break;
-        case AV_CODEC_ID_8SVX_EXP:
-          esc->table = exponential;
-          break;
-        case AV_CODEC_ID_PCM_S8_PLANAR:
-            break;
-        default:
-          return AVERROR_INVALIDDATA;
+    switch (avctx->codec->id) {
+    case AV_CODEC_ID_8SVX_FIB: esc->table = fibonacci;    break;
+    case AV_CODEC_ID_8SVX_EXP: esc->table = exponential;  break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Invalid codec id %d.\n", avctx->codec->id);
+        return AVERROR_INVALIDDATA;
     }
     avctx->sample_fmt = AV_SAMPLE_FMT_U8P;
 
@@ -192,10 +178,13 @@ static av_cold int eightsvx_decode_close(AVCodecContext *avctx)
 
     av_freep(&esc->data[0]);
     av_freep(&esc->data[1]);
+    esc->data_size = 0;
+    esc->data_idx = 0;
 
     return 0;
 }
 
+#if CONFIG_EIGHTSVX_FIB_DECODER
 AVCodec ff_eightsvx_fib_decoder = {
   .name           = "8svx_fib",
   .long_name      = NULL_IF_CONFIG_SMALL("8SVX fibonacci"),
@@ -203,13 +192,14 @@ AVCodec ff_eightsvx_fib_decoder = {
   .id             = AV_CODEC_ID_8SVX_FIB,
   .priv_data_size = sizeof (EightSvxContext),
   .init           = eightsvx_decode_init,
-  .close          = eightsvx_decode_close,
   .decode         = eightsvx_decode_frame,
-  .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1,
+  .close          = eightsvx_decode_close,
+  .capabilities   = AV_CODEC_CAP_DR1,
   .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_U8P,
                                                     AV_SAMPLE_FMT_NONE },
 };
-
+#endif
+#if CONFIG_EIGHTSVX_EXP_DECODER
 AVCodec ff_eightsvx_exp_decoder = {
   .name           = "8svx_exp",
   .long_name      = NULL_IF_CONFIG_SMALL("8SVX exponential"),
@@ -217,23 +207,10 @@ AVCodec ff_eightsvx_exp_decoder = {
   .id             = AV_CODEC_ID_8SVX_EXP,
   .priv_data_size = sizeof (EightSvxContext),
   .init           = eightsvx_decode_init,
-  .close          = eightsvx_decode_close,
   .decode         = eightsvx_decode_frame,
-  .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1,
+  .close          = eightsvx_decode_close,
+  .capabilities   = AV_CODEC_CAP_DR1,
   .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_U8P,
                                                     AV_SAMPLE_FMT_NONE },
 };
-
-AVCodec ff_pcm_s8_planar_decoder = {
-    .name           = "pcm_s8_planar",
-    .long_name      = NULL_IF_CONFIG_SMALL("PCM signed 8-bit planar"),
-    .type           = AVMEDIA_TYPE_AUDIO,
-    .id             = AV_CODEC_ID_PCM_S8_PLANAR,
-    .priv_data_size = sizeof(EightSvxContext),
-    .init           = eightsvx_decode_init,
-    .close          = eightsvx_decode_close,
-    .decode         = eightsvx_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1,
-    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_U8P,
-                                                      AV_SAMPLE_FMT_NONE },
-};
+#endif
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index d12c52e..91df1ac 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -1,20 +1,27 @@
+include $(SUBDIR)../config.mak
+
 NAME = avcodec
 
 HEADERS = avcodec.h                                                     \
+          avdct.h                                                       \
           avfft.h                                                       \
           d3d11va.h                                                     \
           dirac.h                                                       \
           dv_profile.h                                                  \
           dxva2.h                                                       \
+          jni.h                                                         \
           qsv.h                                                         \
           vaapi.h                                                       \
           vda.h                                                         \
           vdpau.h                                                       \
           version.h                                                     \
+          videotoolbox.h                                                \
           vorbis_parser.h                                               \
           xvmc.h                                                        \
 
 OBJS = allcodecs.o                                                      \
+       audioconvert.o                                                   \
+       avdct.o                                                          \
        avpacket.o                                                       \
        avpicture.o                                                      \
        bitstream.o                                                      \
@@ -26,13 +33,15 @@ OBJS = allcodecs.o                                                      \
        dirac.o                                                          \
        dv_profile.o                                                     \
        imgconvert.o                                                     \
-       log2_tab.o                                                       \
+       jni.o                                                            \
        mathtables.o                                                     \
        options.o                                                        \
        parser.o                                                         \
        profiles.o                                                       \
        qsv_api.o                                                        \
        raw.o                                                            \
+       resample.o                                                       \
+       resample2.o                                                      \
        utils.o                                                          \
        vorbis_parser.o                                                  \
        xiph.o                                                           \
@@ -45,13 +54,16 @@ OBJS-$(CONFIG_AUDIODSP)                += audiodsp.o
 OBJS-$(CONFIG_BLOCKDSP)                += blockdsp.o
 OBJS-$(CONFIG_BSWAPDSP)                += bswapdsp.o
 OBJS-$(CONFIG_CABAC)                   += cabac.o
+OBJS-$(CONFIG_CRYSTALHD)               += crystalhd.o
 OBJS-$(CONFIG_DCT)                     += dct.o dct32_fixed.o dct32_float.o
 OBJS-$(CONFIG_ERROR_RESILIENCE)        += error_resilience.o
+OBJS-$(CONFIG_EXIF)                    += exif.o tiff_common.o
 OBJS-$(CONFIG_FAANDCT)                 += faandct.o
 OBJS-$(CONFIG_FAANIDCT)                += faanidct.o
 OBJS-$(CONFIG_FDCTDSP)                 += fdctdsp.o jfdctfst.o jfdctint.o
 FFT-OBJS-$(CONFIG_HARDCODED_TABLES)    += cos_tables.o cos_fixed_tables.o
 OBJS-$(CONFIG_FFT)                     += avfft.o fft_fixed.o fft_float.o \
+                                          fft_fixed_32.o fft_init_table.o \
                                           $(FFT-OBJS-yes)
 OBJS-$(CONFIG_FLACDSP)                 += flacdsp.o
 OBJS-$(CONFIG_FMTCONVERT)              += fmtconvert.o
@@ -61,6 +73,7 @@ OBJS-$(CONFIG_H264CHROMA)              += h264chroma.o
 OBJS-$(CONFIG_H264DSP)                 += h264dsp.o h264idct.o
 OBJS-$(CONFIG_H264PRED)                += h264pred.o
 OBJS-$(CONFIG_H264QPEL)                += h264qpel.o
+OBJS-$(CONFIG_H264_VIDEOTOOLBOX_ENCODER) += videotoolboxenc.o
 OBJS-$(CONFIG_HPELDSP)                 += hpeldsp.o
 OBJS-$(CONFIG_HUFFMAN)                 += huffman.o
 OBJS-$(CONFIG_HUFFYUVDSP)              += huffyuvdsp.o
@@ -70,13 +83,17 @@ OBJS-$(CONFIG_IIRFILTER)               += iirfilter.o
 OBJS-$(CONFIG_IMDCT15)                 += imdct15.o
 OBJS-$(CONFIG_INTRAX8)                 += intrax8.o intrax8dsp.o
 OBJS-$(CONFIG_IVIDSP)                  += ivi_dsp.o
+OBJS-$(CONFIG_JNI)                     += ffjni.o jni.o
 OBJS-$(CONFIG_JPEGTABLES)              += jpegtables.o
 OBJS-$(CONFIG_LIBXVID)                 += libxvid_rc.o
+OBJS-$(CONFIG_LLAUDDSP)                += lossless_audiodsp.o
+OBJS-$(CONFIG_LLVIDDSP)                += lossless_videodsp.o
 OBJS-$(CONFIG_LPC)                     += lpc.o
 OBJS-$(CONFIG_LSP)                     += lsp.o
 OBJS-$(CONFIG_LZF)                     += lzf.o
-OBJS-$(CONFIG_MDCT)                    += mdct_fixed.o mdct_float.o
+OBJS-$(CONFIG_MDCT)                    += mdct_fixed.o mdct_float.o mdct_fixed_32.o
 OBJS-$(CONFIG_ME_CMP)                  += me_cmp.o
+OBJS-$(CONFIG_MEDIACODEC)              += mediacodecdec.o mediacodec_wrapper.o mediacodec_sw_buffer.o
 OBJS-$(CONFIG_MPEG_ER)                 += mpeg_er.o
 OBJS-$(CONFIG_MPEGAUDIO)               += mpegaudio.o mpegaudiodata.o   \
                                           mpegaudiodecheader.o
@@ -97,7 +114,8 @@ OBJS-$(CONFIG_RANGECODER)              += rangecoder.o
 RDFT-OBJS-$(CONFIG_HARDCODED_TABLES)   += sin_tables.o
 OBJS-$(CONFIG_RDFT)                    += rdft.o $(RDFT-OBJS-yes)
 OBJS-$(CONFIG_RV34DSP)                 += rv34dsp.o
-OBJS-$(CONFIG_SINEWIN)                 += sinewin.o
+OBJS-$(CONFIG_SHARED)                  += log2_tab.o reverse.o
+OBJS-$(CONFIG_SINEWIN)                 += sinewin.o sinewin_fixed.o
 OBJS-$(CONFIG_SNAPPY)                  += snappy.o
 OBJS-$(CONFIG_STARTCODE)               += startcode.o
 OBJS-$(CONFIG_TEXTUREDSP)              += texturedsp.o
@@ -113,21 +131,30 @@ OBJS-$(CONFIG_WMA_FREQS)               += wma_freqs.o
 OBJS-$(CONFIG_WMV2DSP)                 += wmv2dsp.o
 
 # decoders/encoders
+OBJS-$(CONFIG_ZERO12V_DECODER)         += 012v.o
 OBJS-$(CONFIG_A64MULTI_ENCODER)        += a64multienc.o elbg.o
 OBJS-$(CONFIG_A64MULTI5_ENCODER)       += a64multienc.o elbg.o
-OBJS-$(CONFIG_AAC_DECODER)             += aacdec.o aactab.o aacsbr.o aacps.o \
+OBJS-$(CONFIG_AAC_DECODER)             += aacdec.o aactab.o aacsbr.o aacps_float.o \
+                                          aacadtsdec.o mpeg4audio.o kbdwin.o \
+                                          sbrdsp.o aacpsdsp_float.o cbrt_data.o
+OBJS-$(CONFIG_AAC_FIXED_DECODER)       += aacdec_fixed.o aactab.o aacsbr_fixed.o aacps_fixed.o \
                                           aacadtsdec.o mpeg4audio.o kbdwin.o \
-                                          sbrdsp.o aacpsdsp.o
-OBJS-$(CONFIG_AAC_ENCODER)             += aacenc.o aaccoder.o    \
+                                          sbrdsp_fixed.o aacpsdsp_fixed.o cbrt_data_fixed.o
+OBJS-$(CONFIG_AAC_ENCODER)             += aacenc.o aaccoder.o aacenctab.o    \
                                           aacpsy.o aactab.o      \
-                                          psymodel.o mpeg4audio.o kbdwin.o
+                                          aacenc_is.o \
+                                          aacenc_tns.o \
+                                          aacenc_ltp.o \
+                                          aacenc_pred.o \
+                                          psymodel.o mpeg4audio.o kbdwin.o cbrt_data.o
 OBJS-$(CONFIG_AASC_DECODER)            += aasc.o msrledec.o
-OBJS-$(CONFIG_AC3_DECODER)             += ac3dec.o ac3dec_data.o ac3.o kbdwin.o
+OBJS-$(CONFIG_AC3_DECODER)             += ac3dec_float.o ac3dec_data.o ac3.o kbdwin.o
+OBJS-$(CONFIG_AC3_FIXED_DECODER)       += ac3dec_fixed.o ac3dec_data.o ac3.o kbdwin.o
 OBJS-$(CONFIG_AC3_ENCODER)             += ac3enc_float.o ac3enc.o ac3tab.o \
                                           ac3.o kbdwin.o
 OBJS-$(CONFIG_AC3_FIXED_ENCODER)       += ac3enc_fixed.o ac3enc.o ac3tab.o ac3.o
 OBJS-$(CONFIG_AIC_DECODER)             += aic.o
-OBJS-$(CONFIG_ALAC_DECODER)            += alac.o alac_data.o
+OBJS-$(CONFIG_ALAC_DECODER)            += alac.o alac_data.o alacdsp.o
 OBJS-$(CONFIG_ALAC_ENCODER)            += alacenc.o alac_data.o
 OBJS-$(CONFIG_ALIAS_PIX_DECODER)       += aliaspixdec.o
 OBJS-$(CONFIG_ALIAS_PIX_ENCODER)       += aliaspixenc.o
@@ -140,9 +167,17 @@ OBJS-$(CONFIG_AMRWB_DECODER)           += amrwbdec.o celp_filters.o   \
                                           celp_math.o acelp_filters.o \
                                           acelp_vectors.o             \
                                           acelp_pitch_delay.o
+OBJS-$(CONFIG_AMV_ENCODER)             += mjpegenc.o mjpegenc_common.o \
+                                          mpegvideo_enc.o motion_est.o \
+                                          ratecontrol.o mpeg12data.o   \
+                                          mpegvideo.o
 OBJS-$(CONFIG_ANM_DECODER)             += anm.o
 OBJS-$(CONFIG_ANSI_DECODER)            += ansi.o cga_data.o
 OBJS-$(CONFIG_APE_DECODER)             += apedec.o
+OBJS-$(CONFIG_APNG_DECODER)            += png.o pngdec.o pngdsp.o
+OBJS-$(CONFIG_APNG_ENCODER)            += png.o pngenc.o
+OBJS-$(CONFIG_SSA_DECODER)             += assdec.o ass.o
+OBJS-$(CONFIG_SSA_ENCODER)             += assenc.o ass.o
 OBJS-$(CONFIG_ASS_DECODER)             += assdec.o ass.o
 OBJS-$(CONFIG_ASS_ENCODER)             += assenc.o ass.o
 OBJS-$(CONFIG_ASV1_DECODER)            += asvdec.o asv.o mpeg12data.o
@@ -155,12 +190,20 @@ OBJS-$(CONFIG_ATRAC3P_DECODER)         += atrac3plusdec.o atrac3plus.o \
                                           atrac3plusdsp.o atrac.o
 OBJS-$(CONFIG_AURA_DECODER)            += cyuv.o
 OBJS-$(CONFIG_AURA2_DECODER)           += aura.o
+OBJS-$(CONFIG_AVRN_DECODER)            += avrndec.o mjpegdec.o
+OBJS-$(CONFIG_AVRP_DECODER)            += r210dec.o
+OBJS-$(CONFIG_AVRP_ENCODER)            += r210enc.o
 OBJS-$(CONFIG_AVS_DECODER)             += avs.o
+OBJS-$(CONFIG_AVUI_DECODER)            += avuidec.o
+OBJS-$(CONFIG_AVUI_ENCODER)            += avuienc.o
+OBJS-$(CONFIG_AYUV_DECODER)            += v408dec.o
+OBJS-$(CONFIG_AYUV_ENCODER)            += v408enc.o
 OBJS-$(CONFIG_BETHSOFTVID_DECODER)     += bethsoftvideo.o
 OBJS-$(CONFIG_BFI_DECODER)             += bfi.o
 OBJS-$(CONFIG_BINK_DECODER)            += bink.o binkdsp.o
 OBJS-$(CONFIG_BINKAUDIO_DCT_DECODER)   += binkaudio.o
 OBJS-$(CONFIG_BINKAUDIO_RDFT_DECODER)  += binkaudio.o
+OBJS-$(CONFIG_BINTEXT_DECODER)         += bintext.o cga_data.o
 OBJS-$(CONFIG_BMP_DECODER)             += bmp.o msrledec.o
 OBJS-$(CONFIG_BMP_ENCODER)             += bmpenc.o
 OBJS-$(CONFIG_BMV_AUDIO_DECODER)       += bmvaudio.o
@@ -169,39 +212,52 @@ OBJS-$(CONFIG_BRENDER_PIX_DECODER)     += brenderpix.o
 OBJS-$(CONFIG_C93_DECODER)             += c93.o
 OBJS-$(CONFIG_CAVS_DECODER)            += cavs.o cavsdec.o cavsdsp.o \
                                           cavsdata.o mpeg12data.o
+OBJS-$(CONFIG_CCAPTION_DECODER)        += ccaption_dec.o
 OBJS-$(CONFIG_CDGRAPHICS_DECODER)      += cdgraphics.o
 OBJS-$(CONFIG_CDXL_DECODER)            += cdxl.o
+OBJS-$(CONFIG_CFHD_DECODER)            += cfhd.o cfhddata.o
 OBJS-$(CONFIG_CINEPAK_DECODER)         += cinepak.o
+OBJS-$(CONFIG_CINEPAK_ENCODER)         += cinepakenc.o elbg.o
 OBJS-$(CONFIG_CLJR_DECODER)            += cljrdec.o
 OBJS-$(CONFIG_CLJR_ENCODER)            += cljrenc.o
 OBJS-$(CONFIG_CLLC_DECODER)            += cllc.o canopus.o
 OBJS-$(CONFIG_COMFORTNOISE_DECODER)    += cngdec.o celp_filters.o
 OBJS-$(CONFIG_COMFORTNOISE_ENCODER)    += cngenc.o
 OBJS-$(CONFIG_COOK_DECODER)            += cook.o
+OBJS-$(CONFIG_CPIA_DECODER)            += cpia.o
 OBJS-$(CONFIG_CSCD_DECODER)            += cscd.o
 OBJS-$(CONFIG_CYUV_DECODER)            += cyuv.o
-OBJS-$(CONFIG_DCA_DECODER)             += dcadec.o dca.o dcadsp.o      \
-                                          dcadata.o dca_exss.o         \
-                                          dca_xll.o synth_filter.o
+OBJS-$(CONFIG_DCA_DECODER)             += dcadec.o dca.o dcadata.o dcahuff.o \
+                                          dca_core.o dca_exss.o dca_xll.o dca_lbr.o \
+                                          dcadsp.o dcadct.o synth_filter.o
+OBJS-$(CONFIG_DCA_ENCODER)             += dcaenc.o dca.o dcadata.o
 OBJS-$(CONFIG_DDS_DECODER)             += dds.o
+OBJS-$(CONFIG_DIRAC_DECODER)           += diracdec.o dirac.o diracdsp.o diractab.o \
+                                          dirac_arith.o mpeg12data.o dirac_dwt.o
 OBJS-$(CONFIG_DFA_DECODER)             += dfa.o
 OBJS-$(CONFIG_DNXHD_DECODER)           += dnxhddec.o dnxhddata.o
 OBJS-$(CONFIG_DNXHD_ENCODER)           += dnxhdenc.o dnxhddata.o
 OBJS-$(CONFIG_DPX_DECODER)             += dpx.o
 OBJS-$(CONFIG_DPX_ENCODER)             += dpxenc.o
+OBJS-$(CONFIG_DSD_LSBF_DECODER)        += dsddec.o dsd.o
+OBJS-$(CONFIG_DSD_MSBF_DECODER)        += dsddec.o dsd.o
+OBJS-$(CONFIG_DSD_LSBF_PLANAR_DECODER) += dsddec.o dsd.o
+OBJS-$(CONFIG_DSD_MSBF_PLANAR_DECODER) += dsddec.o dsd.o
 OBJS-$(CONFIG_DSICINAUDIO_DECODER)     += dsicinaudio.o
 OBJS-$(CONFIG_DSICINVIDEO_DECODER)     += dsicinvideo.o
 OBJS-$(CONFIG_DSS_SP_DECODER)          += dss_sp.o
+OBJS-$(CONFIG_DST_DECODER)             += dstdec.o dsd.o
 OBJS-$(CONFIG_DVBSUB_DECODER)          += dvbsubdec.o
 OBJS-$(CONFIG_DVBSUB_ENCODER)          += dvbsub.o
 OBJS-$(CONFIG_DVDSUB_DECODER)          += dvdsubdec.o
 OBJS-$(CONFIG_DVDSUB_ENCODER)          += dvdsubenc.o
+OBJS-$(CONFIG_DVAUDIO_DECODER)         += dvaudiodec.o
 OBJS-$(CONFIG_DVVIDEO_DECODER)         += dvdec.o dv.o dvdata.o
 OBJS-$(CONFIG_DVVIDEO_ENCODER)         += dvenc.o dv.o dvdata.o
 OBJS-$(CONFIG_DXA_DECODER)             += dxa.o
 OBJS-$(CONFIG_DXTORY_DECODER)          += dxtory.o
 OBJS-$(CONFIG_DXV_DECODER)             += dxv.o
-OBJS-$(CONFIG_EAC3_DECODER)            += eac3dec.o eac3_data.o
+OBJS-$(CONFIG_EAC3_DECODER)            += eac3_data.o
 OBJS-$(CONFIG_EAC3_ENCODER)            += eac3enc.o eac3_data.o
 OBJS-$(CONFIG_EACMV_DECODER)           += eacmv.o
 OBJS-$(CONFIG_EAMAD_DECODER)           += eamad.o eaidct.o mpeg12.o \
@@ -214,14 +270,17 @@ OBJS-$(CONFIG_EIGHTSVX_EXP_DECODER)    += 8svx.o
 OBJS-$(CONFIG_EIGHTSVX_FIB_DECODER)    += 8svx.o
 OBJS-$(CONFIG_ESCAPE124_DECODER)       += escape124.o
 OBJS-$(CONFIG_ESCAPE130_DECODER)       += escape130.o
+OBJS-$(CONFIG_EVRC_DECODER)            += evrcdec.o acelp_vectors.o lsp.o
 OBJS-$(CONFIG_EXR_DECODER)             += exr.o
 OBJS-$(CONFIG_FFV1_DECODER)            += ffv1dec.o ffv1.o
 OBJS-$(CONFIG_FFV1_ENCODER)            += ffv1enc.o ffv1.o
+OBJS-$(CONFIG_FFWAVESYNTH_DECODER)     += ffwavesynth.o
 OBJS-$(CONFIG_FIC_DECODER)             += fic.o
 OBJS-$(CONFIG_FLAC_DECODER)            += flacdec.o flacdata.o flac.o
-OBJS-$(CONFIG_FLAC_ENCODER)            += flacenc.o flacdata.o flac.o
+OBJS-$(CONFIG_FLAC_ENCODER)            += flacenc.o flacdata.o flac.o vorbis_data.o
 OBJS-$(CONFIG_FLASHSV_DECODER)         += flashsv.o
 OBJS-$(CONFIG_FLASHSV_ENCODER)         += flashsvenc.o
+OBJS-$(CONFIG_FLASHSV2_ENCODER)        += flashsv2enc.o
 OBJS-$(CONFIG_FLASHSV2_DECODER)        += flashsv.o
 OBJS-$(CONFIG_FLIC_DECODER)            += flicvideo.o
 OBJS-$(CONFIG_FOURXM_DECODER)          += 4xm.o
@@ -229,11 +288,10 @@ OBJS-$(CONFIG_FRAPS_DECODER)           += fraps.o
 OBJS-$(CONFIG_FRWU_DECODER)            += frwu.o
 OBJS-$(CONFIG_G2M_DECODER)             += g2meet.o elsdec.o
 OBJS-$(CONFIG_G723_1_DECODER)          += g723_1dec.o g723_1.o \
-                                          acelp_vectors.o celp_filters.o \
-                                          celp_math.o
+                                          acelp_vectors.o celp_filters.o celp_math.o
 OBJS-$(CONFIG_G723_1_ENCODER)          += g723_1enc.o g723_1.o \
-                                          acelp_vectors.o celp_filters.o \
-                                          celp_math.o
+                                          acelp_vectors.o celp_filters.o celp_math.o
+OBJS-$(CONFIG_G729_DECODER)            += g729dec.o lsp.o celp_math.o acelp_filters.o acelp_pitch_delay.o acelp_vectors.o g729postfilter.o
 OBJS-$(CONFIG_GIF_DECODER)             += gifdec.o lzw.o
 OBJS-$(CONFIG_GIF_ENCODER)             += gif.o lzwenc.o
 OBJS-$(CONFIG_GSM_DECODER)             += gsmdec.o gsmdec_data.o msgsmdec.o
@@ -251,6 +309,7 @@ OBJS-$(CONFIG_H264_DECODER)            += h264.o h264_cabac.o h264_cavlc.o \
                                           h264_refs.o h264_sei.o \
                                           h264_slice.o h264data.o h264_parse.o \
                                           h2645_parse.o
+OBJS-$(CONFIG_H264_MEDIACODEC_DECODER) += mediacodecdec_h264.o
 OBJS-$(CONFIG_HAP_DECODER)             += hapdec.o hap.o
 OBJS-$(CONFIG_HAP_ENCODER)             += hapenc.o hap.o
 OBJS-$(CONFIG_HEVC_DECODER)            += hevc.o hevc_mvs.o hevc_ps.o hevc_sei.o \
@@ -263,15 +322,19 @@ OBJS-$(CONFIG_HQX_DECODER)             += hqx.o hqxvlc.o hqxdsp.o canopus.o
 OBJS-$(CONFIG_HUFFYUV_DECODER)         += huffyuv.o huffyuvdec.o
 OBJS-$(CONFIG_HUFFYUV_ENCODER)         += huffyuv.o huffyuvenc.o
 OBJS-$(CONFIG_IDCIN_DECODER)           += idcinvideo.o
-OBJS-$(CONFIG_IFF_BYTERUN1_DECODER)    += iff.o
+OBJS-$(CONFIG_IDF_DECODER)             += bintext.o cga_data.o
 OBJS-$(CONFIG_IFF_ILBM_DECODER)        += iff.o
 OBJS-$(CONFIG_IMC_DECODER)             += imc.o
 OBJS-$(CONFIG_INDEO2_DECODER)          += indeo2.o
 OBJS-$(CONFIG_INDEO3_DECODER)          += indeo3.o
 OBJS-$(CONFIG_INDEO4_DECODER)          += indeo4.o ivi.o
 OBJS-$(CONFIG_INDEO5_DECODER)          += indeo5.o ivi.o
+OBJS-$(CONFIG_INTERPLAY_ACM_DECODER)   += interplayacm.o
 OBJS-$(CONFIG_INTERPLAY_DPCM_DECODER)  += dpcm.o
 OBJS-$(CONFIG_INTERPLAY_VIDEO_DECODER) += interplayvideo.o
+OBJS-$(CONFIG_JACOSUB_DECODER)         += jacosubdec.o ass.o
+OBJS-$(CONFIG_JPEG2000_ENCODER)        += j2kenc.o mqcenc.o mqc.o jpeg2000.o \
+                                          jpeg2000dwt.o
 OBJS-$(CONFIG_JPEG2000_DECODER)        += jpeg2000dec.o jpeg2000.o jpeg2000dsp.o \
                                           jpeg2000dwt.o mqcdec.o mqc.o
 OBJS-$(CONFIG_JPEGLS_DECODER)          += jpeglsdec.o jpegls.o
@@ -282,11 +345,14 @@ OBJS-$(CONFIG_KMVC_DECODER)            += kmvc.o
 OBJS-$(CONFIG_LAGARITH_DECODER)        += lagarith.o lagarithrac.o
 OBJS-$(CONFIG_LJPEG_ENCODER)           += ljpegenc.o mjpegenc_common.o
 OBJS-$(CONFIG_LOCO_DECODER)            += loco.o
+OBJS-$(CONFIG_M101_DECODER)            += m101.o
 OBJS-$(CONFIG_MACE3_DECODER)           += mace.o
 OBJS-$(CONFIG_MACE6_DECODER)           += mace.o
+OBJS-$(CONFIG_MAGICYUV_DECODER)        += magicyuv.o
 OBJS-$(CONFIG_MDEC_DECODER)            += mdec.o mpeg12.o mpeg12data.o
 OBJS-$(CONFIG_METASOUND_DECODER)       += metasound.o metasound_data.o \
                                           twinvq.o
+OBJS-$(CONFIG_MICRODVD_DECODER)        += microdvddec.o ass.o
 OBJS-$(CONFIG_MIMIC_DECODER)           += mimic.o
 OBJS-$(CONFIG_MJPEG_DECODER)           += mjpegdec.o
 OBJS-$(CONFIG_MJPEG_ENCODER)           += mjpegenc.o mjpegenc_common.o
@@ -295,10 +361,14 @@ OBJS-$(CONFIG_MJPEG_VAAPI_ENCODER)     += vaapi_encode_mjpeg.o
 OBJS-$(CONFIG_MLP_DECODER)             += mlpdec.o mlpdsp.o
 OBJS-$(CONFIG_MMVIDEO_DECODER)         += mmvideo.o
 OBJS-$(CONFIG_MOTIONPIXELS_DECODER)    += motionpixels.o
+OBJS-$(CONFIG_MOVTEXT_DECODER)         += movtextdec.o ass.o
+OBJS-$(CONFIG_MOVTEXT_ENCODER)         += movtextenc.o ass_split.o
 OBJS-$(CONFIG_MP1_DECODER)             += mpegaudiodec_fixed.o
 OBJS-$(CONFIG_MP1FLOAT_DECODER)        += mpegaudiodec_float.o
 OBJS-$(CONFIG_MP2_DECODER)             += mpegaudiodec_fixed.o
-OBJS-$(CONFIG_MP2_ENCODER)             += mpegaudioenc.o mpegaudio.o \
+OBJS-$(CONFIG_MP2_ENCODER)             += mpegaudioenc_float.o mpegaudio.o \
+                                          mpegaudiodata.o mpegaudiodsp_data.o
+OBJS-$(CONFIG_MP2FIXED_ENCODER)        += mpegaudioenc_fixed.o mpegaudio.o \
                                           mpegaudiodata.o mpegaudiodsp_data.o
 OBJS-$(CONFIG_MP2FLOAT_DECODER)        += mpegaudiodec_float.o
 OBJS-$(CONFIG_MP3_DECODER)             += mpegaudiodec_fixed.o
@@ -309,13 +379,13 @@ OBJS-$(CONFIG_MP3ON4_DECODER)          += mpegaudiodec_fixed.o mpeg4audio.o
 OBJS-$(CONFIG_MP3ON4FLOAT_DECODER)     += mpegaudiodec_float.o mpeg4audio.o
 OBJS-$(CONFIG_MPC7_DECODER)            += mpc7.o mpc.o
 OBJS-$(CONFIG_MPC8_DECODER)            += mpc8.o mpc.o
-OBJS-$(CONFIG_MPEG_XVMC_DECODER)       += mpegvideo_xvmc.o
+OBJS-$(CONFIG_MPEGVIDEO_DECODER)       += mpeg12dec.o mpeg12.o mpeg12data.o
 OBJS-$(CONFIG_MPEG1VIDEO_DECODER)      += mpeg12dec.o mpeg12.o mpeg12data.o
 OBJS-$(CONFIG_MPEG1VIDEO_ENCODER)      += mpeg12enc.o mpeg12.o
 OBJS-$(CONFIG_MPEG2VIDEO_DECODER)      += mpeg12dec.o mpeg12.o mpeg12data.o
 OBJS-$(CONFIG_MPEG2VIDEO_ENCODER)      += mpeg12enc.o mpeg12.o
-OBJS-$(CONFIG_MPEG2_MMAL_DECODER)      += mmaldec.o
 OBJS-$(CONFIG_MPEG4_DECODER)           += xvididct.o
+OBJS-$(CONFIG_MPL2_DECODER)            += mpl2dec.o ass.o
 OBJS-$(CONFIG_MSA1_DECODER)            += mss3.o
 OBJS-$(CONFIG_MSMPEG4V1_DECODER)       += msmpeg4dec.o msmpeg4.o msmpeg4data.o
 OBJS-$(CONFIG_MSMPEG4V2_DECODER)       += msmpeg4dec.o msmpeg4.o msmpeg4data.o
@@ -326,6 +396,7 @@ OBJS-$(CONFIG_MSRLE_DECODER)           += msrle.o msrledec.o
 OBJS-$(CONFIG_MSS1_DECODER)            += mss1.o mss12.o
 OBJS-$(CONFIG_MSS2_DECODER)            += mss2.o mss12.o mss2dsp.o wmv2data.o
 OBJS-$(CONFIG_MSVIDEO1_DECODER)        += msvideo1.o
+OBJS-$(CONFIG_MSVIDEO1_ENCODER)        += msvideo1enc.o elbg.o
 OBJS-$(CONFIG_MSZH_DECODER)            += lcldec.o
 OBJS-$(CONFIG_MTS2_DECODER)            += mss4.o
 OBJS-$(CONFIG_MVC1_DECODER)            += mvcdec.o
@@ -351,12 +422,16 @@ OBJS-$(CONFIG_PGMYUV_DECODER)          += pnmdec.o pnm.o
 OBJS-$(CONFIG_PGMYUV_ENCODER)          += pnmenc.o
 OBJS-$(CONFIG_PGSSUB_DECODER)          += pgssubdec.o
 OBJS-$(CONFIG_PICTOR_DECODER)          += pictordec.o cga_data.o
+OBJS-$(CONFIG_PJS_DECODER)             += textdec.o ass.o
 OBJS-$(CONFIG_PNG_DECODER)             += png.o pngdec.o pngdsp.o
 OBJS-$(CONFIG_PNG_ENCODER)             += png.o pngenc.o
 OBJS-$(CONFIG_PPM_DECODER)             += pnmdec.o pnm.o
 OBJS-$(CONFIG_PPM_ENCODER)             += pnmenc.o
-OBJS-$(CONFIG_PRORES_DECODER)          += proresdec.o proresdata.o proresdsp.o
-OBJS-$(CONFIG_PRORES_ENCODER)          += proresenc.o proresdata.o
+OBJS-$(CONFIG_PRORES_DECODER)          += proresdec2.o proresdsp.o proresdata.o
+OBJS-$(CONFIG_PRORES_LGPL_DECODER)     += proresdec_lgpl.o proresdsp.o proresdata.o
+OBJS-$(CONFIG_PRORES_ENCODER)          += proresenc_anatoliy.o
+OBJS-$(CONFIG_PRORES_AW_ENCODER)       += proresenc_anatoliy.o
+OBJS-$(CONFIG_PRORES_KS_ENCODER)       += proresenc_kostya.o proresdata.o
 OBJS-$(CONFIG_PTX_DECODER)             += ptx.o
 OBJS-$(CONFIG_QCELP_DECODER)           += qcelpdec.o                     \
                                           celp_filters.o acelp_vectors.o \
@@ -367,13 +442,16 @@ OBJS-$(CONFIG_QPEG_DECODER)            += qpeg.o
 OBJS-$(CONFIG_QTRLE_DECODER)           += qtrle.o
 OBJS-$(CONFIG_QTRLE_ENCODER)           += qtrleenc.o
 OBJS-$(CONFIG_R10K_DECODER)            += r210dec.o
+OBJS-$(CONFIG_R10K_ENCODER)            += r210enc.o
 OBJS-$(CONFIG_R210_DECODER)            += r210dec.o
+OBJS-$(CONFIG_R210_ENCODER)            += r210enc.o
 OBJS-$(CONFIG_RA_144_DECODER)          += ra144dec.o ra144.o celp_filters.o
 OBJS-$(CONFIG_RA_144_ENCODER)          += ra144enc.o ra144.o celp_filters.o
 OBJS-$(CONFIG_RA_288_DECODER)          += ra288.o celp_filters.o
 OBJS-$(CONFIG_RALF_DECODER)            += ralf.o
 OBJS-$(CONFIG_RAWVIDEO_DECODER)        += rawdec.o
 OBJS-$(CONFIG_RAWVIDEO_ENCODER)        += rawenc.o
+OBJS-$(CONFIG_REALTEXT_DECODER)        += realtextdec.o ass.o
 OBJS-$(CONFIG_RL2_DECODER)             += rl2.o
 OBJS-$(CONFIG_ROQ_DECODER)             += roqvideodec.o roqvideo.o
 OBJS-$(CONFIG_ROQ_ENCODER)             += roqvideoenc.o roqvideo.o elbg.o
@@ -387,12 +465,16 @@ OBJS-$(CONFIG_RV20_DECODER)            += rv10.o
 OBJS-$(CONFIG_RV20_ENCODER)            += rv20enc.o
 OBJS-$(CONFIG_RV30_DECODER)            += rv30.o rv34.o rv30dsp.o
 OBJS-$(CONFIG_RV40_DECODER)            += rv40.o rv34.o rv40dsp.o
+OBJS-$(CONFIG_SAMI_DECODER)            += samidec.o ass.o htmlsubtitles.o
 OBJS-$(CONFIG_S302M_DECODER)           += s302m.o
+OBJS-$(CONFIG_S302M_ENCODER)           += s302menc.o
 OBJS-$(CONFIG_SANM_DECODER)            += sanm.o
 OBJS-$(CONFIG_SCREENPRESSO_DECODER)    += screenpresso.o
+OBJS-$(CONFIG_SDX2_DPCM_DECODER)       += dpcm.o
 OBJS-$(CONFIG_SGI_DECODER)             += sgidec.o
 OBJS-$(CONFIG_SGI_ENCODER)             += sgienc.o rle.o
 OBJS-$(CONFIG_SGIRLE_DECODER)          += sgirledec.o
+OBJS-$(CONFIG_SHEERVIDEO_DECODER)      += sheervideo.o
 OBJS-$(CONFIG_SHORTEN_DECODER)         += shorten.o
 OBJS-$(CONFIG_SIPR_DECODER)            += sipr.o acelp_pitch_delay.o \
                                           celp_math.o acelp_vectors.o \
@@ -401,29 +483,48 @@ OBJS-$(CONFIG_SIPR_DECODER)            += sipr.o acelp_pitch_delay.o \
 OBJS-$(CONFIG_SMACKAUD_DECODER)        += smacker.o
 OBJS-$(CONFIG_SMACKER_DECODER)         += smacker.o
 OBJS-$(CONFIG_SMC_DECODER)             += smc.o
+OBJS-$(CONFIG_SMVJPEG_DECODER)         += smvjpegdec.o
+OBJS-$(CONFIG_SNOW_DECODER)            += snowdec.o snow.o snow_dwt.o
+OBJS-$(CONFIG_SNOW_ENCODER)            += snowenc.o snow.o snow_dwt.o             \
+                                          h263.o ituh263enc.o
 OBJS-$(CONFIG_SOL_DPCM_DECODER)        += dpcm.o
+OBJS-$(CONFIG_SONIC_DECODER)           += sonic.o
+OBJS-$(CONFIG_SONIC_ENCODER)           += sonic.o
+OBJS-$(CONFIG_SONIC_LS_ENCODER)        += sonic.o
 OBJS-$(CONFIG_SP5X_DECODER)            += sp5xdec.o
-OBJS-$(CONFIG_SRT_DECODER)             += srtdec.o ass.o
+OBJS-$(CONFIG_SRT_DECODER)             += srtdec.o ass.o htmlsubtitles.o
+OBJS-$(CONFIG_SRT_ENCODER)             += srtenc.o ass_split.o
+OBJS-$(CONFIG_STL_DECODER)             += textdec.o ass.o
+OBJS-$(CONFIG_SUBRIP_DECODER)          += srtdec.o ass.o htmlsubtitles.o
+OBJS-$(CONFIG_SUBRIP_ENCODER)          += srtenc.o ass_split.o
+OBJS-$(CONFIG_SUBVIEWER1_DECODER)      += textdec.o ass.o
+OBJS-$(CONFIG_SUBVIEWER_DECODER)       += subviewerdec.o ass.o
 OBJS-$(CONFIG_SUNRAST_DECODER)         += sunrast.o
 OBJS-$(CONFIG_SUNRAST_ENCODER)         += sunrastenc.o
 OBJS-$(CONFIG_SVQ1_DECODER)            += svq1dec.o svq1.o svq13.o h263data.o
 OBJS-$(CONFIG_SVQ1_ENCODER)            += svq1enc.o svq1.o  h263data.o  \
                                           h263.o ituh263enc.o
 OBJS-$(CONFIG_SVQ3_DECODER)            += svq3.o svq13.o mpegutils.o h264_parse.o h264data.o
-OBJS-$(CONFIG_TAK_DECODER)             += takdec.o tak.o
+OBJS-$(CONFIG_TEXT_DECODER)            += textdec.o ass.o
+OBJS-$(CONFIG_TEXT_ENCODER)            += srtenc.o ass_split.o
+OBJS-$(CONFIG_TAK_DECODER)             += takdec.o tak.o takdsp.o
 OBJS-$(CONFIG_TARGA_DECODER)           += targa.o
 OBJS-$(CONFIG_TARGA_ENCODER)           += targaenc.o rle.o
+OBJS-$(CONFIG_TARGA_Y216_DECODER)      += targa_y216dec.o
 OBJS-$(CONFIG_TDSC_DECODER)            += tdsc.o
 OBJS-$(CONFIG_TIERTEXSEQVIDEO_DECODER) += tiertexseqv.o
-OBJS-$(CONFIG_TIFF_DECODER)            += tiff.o lzw.o faxcompr.o
-OBJS-$(CONFIG_TIFF_ENCODER)            += tiffenc.o rle.o lzwenc.o
+OBJS-$(CONFIG_TIFF_DECODER)            += tiff.o lzw.o faxcompr.o tiff_data.o tiff_common.o
+OBJS-$(CONFIG_TIFF_ENCODER)            += tiffenc.o rle.o lzwenc.o tiff_data.o
 OBJS-$(CONFIG_TMV_DECODER)             += tmv.o cga_data.o
+OBJS-$(CONFIG_TRUEHD_DECODER)          += mlpdec.o mlpdsp.o
 OBJS-$(CONFIG_TRUEMOTION1_DECODER)     += truemotion1.o
 OBJS-$(CONFIG_TRUEMOTION2_DECODER)     += truemotion2.o
+OBJS-$(CONFIG_TRUEMOTION2RT_DECODER)   += truemotion2rt.o
 OBJS-$(CONFIG_TRUESPEECH_DECODER)      += truespeech.o
 OBJS-$(CONFIG_TSCC_DECODER)            += tscc.o msrledec.o
 OBJS-$(CONFIG_TSCC2_DECODER)           += tscc2.o
-OBJS-$(CONFIG_TTA_DECODER)             += tta.o
+OBJS-$(CONFIG_TTA_DECODER)             += tta.o ttadata.o ttadsp.o
+OBJS-$(CONFIG_TTA_ENCODER)             += ttaenc.o ttadata.o
 OBJS-$(CONFIG_TWINVQ_DECODER)          += twinvqdec.o twinvq.o
 OBJS-$(CONFIG_TXD_DECODER)             += txd.o
 OBJS-$(CONFIG_ULTI_DECODER)            += ulti.o
@@ -432,6 +533,10 @@ OBJS-$(CONFIG_UTVIDEO_ENCODER)         += utvideoenc.o utvideo.o
 OBJS-$(CONFIG_V210_DECODER)            += v210dec.o
 OBJS-$(CONFIG_V210_ENCODER)            += v210enc.o
 OBJS-$(CONFIG_V210X_DECODER)           += v210x.o
+OBJS-$(CONFIG_V308_DECODER)            += v308dec.o
+OBJS-$(CONFIG_V308_ENCODER)            += v308enc.o
+OBJS-$(CONFIG_V408_DECODER)            += v408dec.o
+OBJS-$(CONFIG_V408_ENCODER)            += v408enc.o
 OBJS-$(CONFIG_V410_DECODER)            += v410dec.o
 OBJS-$(CONFIG_V410_ENCODER)            += v410enc.o
 OBJS-$(CONFIG_VB_DECODER)              += vb.o
@@ -439,8 +544,10 @@ OBJS-$(CONFIG_VBLE_DECODER)            += vble.o
 OBJS-$(CONFIG_VC1_DECODER)             += vc1dec.o vc1_block.o vc1_loopfilter.o \
                                           vc1_mc.o vc1_pred.o vc1.o vc1data.o \
                                           msmpeg4dec.o msmpeg4.o msmpeg4data.o \
-                                          wmv2data.o
+                                          wmv2dsp.o wmv2data.o
 OBJS-$(CONFIG_VC1_MMAL_DECODER)        += mmaldec.o
+OBJS-$(CONFIG_VC1_QSV_DECODER)         += qsvdec_vc1.o
+OBJS-$(CONFIG_VC2_ENCODER)             += vc2enc.o vc2enc_dwt.o diractab.o
 OBJS-$(CONFIG_VCR1_DECODER)            += vcr1.o
 OBJS-$(CONFIG_VMDAUDIO_DECODER)        += vmdaudio.o
 OBJS-$(CONFIG_VMDVIDEO_DECODER)        += vmdvideo.o
@@ -455,11 +562,15 @@ OBJS-$(CONFIG_VP6_DECODER)             += vp6.o vp56.o vp56data.o \
                                           vp6dsp.o vp56rac.o
 OBJS-$(CONFIG_VP7_DECODER)             += vp8.o vp56rac.o
 OBJS-$(CONFIG_VP8_DECODER)             += vp8.o vp56rac.o
-OBJS-$(CONFIG_VP9_DECODER)             += vp9.o vp9data.o vp9dsp.o \
-                                          vp9block.o vp9prob.o vp9mvs.o vp56rac.o
+OBJS-$(CONFIG_VP9_DECODER)             += vp9.o vp9dsp.o vp56rac.o vp9dsp_8bpp.o \
+                                          vp9dsp_10bpp.o vp9dsp_12bpp.o
+OBJS-$(CONFIG_VPLAYER_DECODER)         += textdec.o ass.o
 OBJS-$(CONFIG_VQA_DECODER)             += vqavideo.o
 OBJS-$(CONFIG_WAVPACK_DECODER)         += wavpack.o
+OBJS-$(CONFIG_WAVPACK_ENCODER)         += wavpackenc.o
 OBJS-$(CONFIG_WEBP_DECODER)            += webp.o
+OBJS-$(CONFIG_WEBVTT_DECODER)          += webvttdec.o ass.o
+OBJS-$(CONFIG_WEBVTT_ENCODER)          += webvttenc.o ass_split.o
 OBJS-$(CONFIG_WMALOSSLESS_DECODER)     += wmalosslessdec.o wma_common.o
 OBJS-$(CONFIG_WMAPRO_DECODER)          += wmaprodec.o wma.o wma_common.o
 OBJS-$(CONFIG_WMAV1_DECODER)           += wmadec.o wma.o wma_common.o aactab.o
@@ -470,6 +581,7 @@ OBJS-$(CONFIG_WMAVOICE_DECODER)        += wmavoice.o \
                                           celp_filters.o \
                                           acelp_vectors.o acelp_filters.o
 OBJS-$(CONFIG_WMV1_DECODER)            += msmpeg4dec.o msmpeg4.o msmpeg4data.o
+OBJS-$(CONFIG_WMV1_ENCODER)            += msmpeg4enc.o
 OBJS-$(CONFIG_WMV2_DECODER)            += wmv2dec.o wmv2.o wmv2data.o \
                                           msmpeg4dec.o msmpeg4.o msmpeg4data.o
 OBJS-$(CONFIG_WMV2_ENCODER)            += wmv2enc.o wmv2.o wmv2data.o \
@@ -480,14 +592,23 @@ OBJS-$(CONFIG_WS_SND1_DECODER)         += ws-snd1.o
 OBJS-$(CONFIG_XAN_DPCM_DECODER)        += dpcm.o
 OBJS-$(CONFIG_XAN_WC3_DECODER)         += xan.o
 OBJS-$(CONFIG_XAN_WC4_DECODER)         += xxan.o
+OBJS-$(CONFIG_XBIN_DECODER)            += bintext.o cga_data.o
 OBJS-$(CONFIG_XBM_DECODER)             += xbmdec.o
 OBJS-$(CONFIG_XBM_ENCODER)             += xbmenc.o
+OBJS-$(CONFIG_XFACE_DECODER)           += xfacedec.o xface.o
+OBJS-$(CONFIG_XFACE_ENCODER)           += xfaceenc.o xface.o
 OBJS-$(CONFIG_XL_DECODER)              += xl.o
+OBJS-$(CONFIG_XMA1_DECODER)            += wmaprodec.o wma.o wma_common.o
+OBJS-$(CONFIG_XMA2_DECODER)            += wmaprodec.o wma.o wma_common.o
 OBJS-$(CONFIG_XSUB_DECODER)            += xsubdec.o
 OBJS-$(CONFIG_XSUB_ENCODER)            += xsubenc.o
 OBJS-$(CONFIG_XWD_DECODER)             += xwddec.o
 OBJS-$(CONFIG_XWD_ENCODER)             += xwdenc.o
+OBJS-$(CONFIG_Y41P_DECODER)            += y41pdec.o
+OBJS-$(CONFIG_Y41P_ENCODER)            += y41penc.o
 OBJS-$(CONFIG_YOP_DECODER)             += yop.o
+OBJS-$(CONFIG_YUV4_DECODER)            += yuv4dec.o
+OBJS-$(CONFIG_YUV4_ENCODER)            += yuv4enc.o
 OBJS-$(CONFIG_ZEROCODEC_DECODER)       += zerocodec.o
 OBJS-$(CONFIG_ZLIB_DECODER)            += lcldec.o
 OBJS-$(CONFIG_ZLIB_ENCODER)            += lclenc.o
@@ -512,13 +633,16 @@ OBJS-$(CONFIG_PCM_MULAW_DECODER)          += pcm.o
 OBJS-$(CONFIG_PCM_MULAW_ENCODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S8_DECODER)             += pcm.o
 OBJS-$(CONFIG_PCM_S8_ENCODER)             += pcm.o
-OBJS-$(CONFIG_PCM_S8_PLANAR_DECODER)      += 8svx.o
+OBJS-$(CONFIG_PCM_S8_PLANAR_DECODER)      += pcm.o
+OBJS-$(CONFIG_PCM_S8_PLANAR_ENCODER)      += pcm.o
 OBJS-$(CONFIG_PCM_S16BE_DECODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S16BE_ENCODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S16BE_PLANAR_DECODER)   += pcm.o
+OBJS-$(CONFIG_PCM_S16BE_PLANAR_ENCODER)   += pcm.o
 OBJS-$(CONFIG_PCM_S16LE_DECODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S16LE_ENCODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S16LE_PLANAR_DECODER)   += pcm.o
+OBJS-$(CONFIG_PCM_S16LE_PLANAR_ENCODER)   += pcm.o
 OBJS-$(CONFIG_PCM_S24BE_DECODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S24BE_ENCODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S24DAUD_DECODER)        += pcm.o
@@ -526,11 +650,13 @@ OBJS-$(CONFIG_PCM_S24DAUD_ENCODER)        += pcm.o
 OBJS-$(CONFIG_PCM_S24LE_DECODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S24LE_ENCODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S24LE_PLANAR_DECODER)   += pcm.o
+OBJS-$(CONFIG_PCM_S24LE_PLANAR_ENCODER)   += pcm.o
 OBJS-$(CONFIG_PCM_S32BE_DECODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S32BE_ENCODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S32LE_DECODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S32LE_ENCODER)          += pcm.o
 OBJS-$(CONFIG_PCM_S32LE_PLANAR_DECODER)   += pcm.o
+OBJS-$(CONFIG_PCM_S32LE_PLANAR_ENCODER)   += pcm.o
 OBJS-$(CONFIG_PCM_U8_DECODER)             += pcm.o
 OBJS-$(CONFIG_PCM_U8_ENCODER)             += pcm.o
 OBJS-$(CONFIG_PCM_U16BE_DECODER)          += pcm.o
@@ -550,7 +676,10 @@ OBJS-$(CONFIG_PCM_ZORK_DECODER)           += pcm.o
 OBJS-$(CONFIG_ADPCM_4XM_DECODER)          += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_ADX_DECODER)          += adxdec.o adx.o
 OBJS-$(CONFIG_ADPCM_ADX_ENCODER)          += adxenc.o adx.o
+OBJS-$(CONFIG_ADPCM_AFC_DECODER)          += adpcm.o adpcm_data.o
+OBJS-$(CONFIG_ADPCM_AICA_DECODER)         += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_CT_DECODER)           += adpcm.o adpcm_data.o
+OBJS-$(CONFIG_ADPCM_DTK_DECODER)          += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_EA_DECODER)           += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_EA_MAXIS_XA_DECODER)  += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_EA_R1_DECODER)        += adpcm.o adpcm_data.o
@@ -561,21 +690,27 @@ OBJS-$(CONFIG_ADPCM_G722_DECODER)         += g722.o g722dsp.o g722dec.o
 OBJS-$(CONFIG_ADPCM_G722_ENCODER)         += g722.o g722dsp.o g722enc.o
 OBJS-$(CONFIG_ADPCM_G726_DECODER)         += g726.o
 OBJS-$(CONFIG_ADPCM_G726_ENCODER)         += g726.o
+OBJS-$(CONFIG_ADPCM_G726LE_DECODER)       += g726.o
 OBJS-$(CONFIG_ADPCM_IMA_AMV_DECODER)      += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_APC_DECODER)      += adpcm.o adpcm_data.o
+OBJS-$(CONFIG_ADPCM_IMA_DAT4_DECODER)     += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_DK3_DECODER)      += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_DK4_DECODER)      += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_EA_EACS_DECODER)  += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_EA_SEAD_DECODER)  += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_ISS_DECODER)      += adpcm.o adpcm_data.o
+OBJS-$(CONFIG_ADPCM_IMA_OKI_DECODER)      += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_QT_DECODER)       += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_QT_ENCODER)       += adpcmenc.o adpcm_data.o
+OBJS-$(CONFIG_ADPCM_IMA_RAD_DECODER)      += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_SMJPEG_DECODER)   += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_WAV_DECODER)      += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_WAV_ENCODER)      += adpcmenc.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_IMA_WS_DECODER)       += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_MS_DECODER)           += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_MS_ENCODER)           += adpcmenc.o adpcm_data.o
+OBJS-$(CONFIG_ADPCM_MTAF_DECODER)         += adpcm.o adpcm_data.o
+OBJS-$(CONFIG_ADPCM_PSX_DECODER)          += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_SBPRO_2_DECODER)      += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_SBPRO_3_DECODER)      += adpcm.o adpcm_data.o
 OBJS-$(CONFIG_ADPCM_SBPRO_4_DECODER)      += adpcm.o adpcm_data.o
@@ -591,29 +726,41 @@ OBJS-$(CONFIG_ADPCM_YAMAHA_ENCODER)       += adpcmenc.o adpcm_data.o
 OBJS-$(CONFIG_D3D11VA)                    += dxva2.o
 OBJS-$(CONFIG_DXVA2)                      += dxva2.o
 OBJS-$(CONFIG_VAAPI)                      += vaapi.o
-OBJS-$(CONFIG_VDA)                        += vda.o
+OBJS-$(CONFIG_VDA)                        += vda.o videotoolbox.o
+OBJS-$(CONFIG_VIDEOTOOLBOX)               += videotoolbox.o
 OBJS-$(CONFIG_VDPAU)                      += vdpau.o
 
 OBJS-$(CONFIG_H263_VAAPI_HWACCEL)         += vaapi_mpeg4.o
+OBJS-$(CONFIG_H263_VIDEOTOOLBOX_HWACCEL)  += videotoolbox.o
 OBJS-$(CONFIG_H264_D3D11VA_HWACCEL)       += dxva2_h264.o
 OBJS-$(CONFIG_H264_DXVA2_HWACCEL)         += dxva2_h264.o
 OBJS-$(CONFIG_H264_VAAPI_HWACCEL)         += vaapi_h264.o
 OBJS-$(CONFIG_H264_VDA_HWACCEL)           += vda_h264.o
 OBJS-$(CONFIG_H264_VDPAU_HWACCEL)         += vdpau_h264.o
+OBJS-$(CONFIG_H264_VIDEOTOOLBOX_HWACCEL)  += videotoolbox.o
 OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL)       += dxva2_hevc.o
 OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL)         += dxva2_hevc.o
+OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL)         += vaapi_hevc.o
 OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL)         += vdpau_hevc.o
 OBJS-$(CONFIG_MPEG1_VDPAU_HWACCEL)        += vdpau_mpeg12.o
+OBJS-$(CONFIG_MPEG1_VIDEOTOOLBOX_HWACCEL) += videotoolbox.o
+OBJS-$(CONFIG_MPEG1_XVMC_HWACCEL)         += mpegvideo_xvmc.o
 OBJS-$(CONFIG_MPEG2_D3D11VA_HWACCEL)      += dxva2_mpeg2.o
 OBJS-$(CONFIG_MPEG2_DXVA2_HWACCEL)        += dxva2_mpeg2.o
 OBJS-$(CONFIG_MPEG2_VAAPI_HWACCEL)        += vaapi_mpeg2.o
 OBJS-$(CONFIG_MPEG2_VDPAU_HWACCEL)        += vdpau_mpeg12.o
+OBJS-$(CONFIG_MPEG2_VIDEOTOOLBOX_HWACCEL) += videotoolbox.o
+OBJS-$(CONFIG_MPEG2_XVMC_HWACCEL)         += mpegvideo_xvmc.o
 OBJS-$(CONFIG_MPEG4_VAAPI_HWACCEL)        += vaapi_mpeg4.o
 OBJS-$(CONFIG_MPEG4_VDPAU_HWACCEL)        += vdpau_mpeg4.o
+OBJS-$(CONFIG_MPEG4_VIDEOTOOLBOX_HWACCEL) += videotoolbox.o
 OBJS-$(CONFIG_VC1_D3D11VA_HWACCEL)        += dxva2_vc1.o
 OBJS-$(CONFIG_VC1_DXVA2_HWACCEL)          += dxva2_vc1.o
 OBJS-$(CONFIG_VC1_VAAPI_HWACCEL)          += vaapi_vc1.o
 OBJS-$(CONFIG_VC1_VDPAU_HWACCEL)          += vdpau_vc1.o
+OBJS-$(CONFIG_VP9_D3D11VA_HWACCEL)        += dxva2_vp9.o
+OBJS-$(CONFIG_VP9_DXVA2_HWACCEL)          += dxva2_vp9.o
+OBJS-$(CONFIG_VP9_VAAPI_HWACCEL)          += vaapi_vp9.o
 
 # hardware-accelerated decoding/encoding
 OBJS-$(CONFIG_NVENC)                      += nvenc.o
@@ -621,50 +768,88 @@ OBJS-$(CONFIG_QSV)                        += qsv.o
 OBJS-$(CONFIG_QSVDEC)                     += qsvdec.o
 OBJS-$(CONFIG_QSVENC)                     += qsvenc.o
 
+OBJS-$(CONFIG_H264_CUVID_DECODER)         += cuvid.o
 OBJS-$(CONFIG_H264_MMAL_DECODER)          += mmaldec.o
-OBJS-$(CONFIG_H264_NVENC_ENCODER)         += nvenc_h264.o
+OBJS-$(CONFIG_H264_VDA_DECODER)           += vda_h264_dec.o
 OBJS-$(CONFIG_H264_OMX_ENCODER)           += omx.o
 OBJS-$(CONFIG_H264_QSV_DECODER)           += qsvdec_h2645.o
 OBJS-$(CONFIG_H264_QSV_ENCODER)           += qsvenc_h264.o
 OBJS-$(CONFIG_H264_VAAPI_ENCODER)         += vaapi_encode_h264.o vaapi_encode_h26x.o
-OBJS-$(CONFIG_HEVC_NVENC_ENCODER)         += nvenc_hevc.o
+OBJS-$(CONFIG_HEVC_CUVID_DECODER)         += cuvid.o
 OBJS-$(CONFIG_HEVC_QSV_DECODER)           += qsvdec_h2645.o
 OBJS-$(CONFIG_HEVC_QSV_ENCODER)           += qsvenc_hevc.o hevc_ps_enc.o h2645_parse.o
 OBJS-$(CONFIG_HEVC_VAAPI_ENCODER)         += vaapi_encode_h265.o vaapi_encode_h26x.o
+OBJS-$(CONFIG_MPEG2_MMAL_DECODER)         += mmaldec.o
 OBJS-$(CONFIG_MPEG2_QSV_DECODER)          += qsvdec_mpeg2.o
 OBJS-$(CONFIG_MPEG2_QSV_ENCODER)          += qsvenc_mpeg2.o
 OBJS-$(CONFIG_MPEG4_OMX_ENCODER)          += omx.o
+OBJS-$(CONFIG_NVENC_ENCODER)              += nvenc_h264.o
+OBJS-$(CONFIG_NVENC_H264_ENCODER)         += nvenc_h264.o
+OBJS-$(CONFIG_NVENC_HEVC_ENCODER)         += nvenc_hevc.o
+OBJS-$(CONFIG_VC1_CUVID_DECODER)          += cuvid.o
+OBJS-$(CONFIG_VP8_CUVID_DECODER)          += cuvid.o
+OBJS-$(CONFIG_VP9_CUVID_DECODER)          += cuvid.o
 
 # libavformat dependencies
 OBJS-$(CONFIG_ISO_MEDIA)               += mpeg4audio.o mpegaudiodata.o
 
 OBJS-$(CONFIG_ADTS_MUXER)              += mpeg4audio.o
 OBJS-$(CONFIG_CAF_DEMUXER)             += ac3tab.o
-OBJS-$(CONFIG_FLAC_MUXER)              += flac.o flacdata.o
+OBJS-$(CONFIG_DNXHD_DEMUXER)           += dnxhddata.o
+OBJS-$(CONFIG_FLAC_DEMUXER)            += flac.o flacdata.o vorbis_data.o
+OBJS-$(CONFIG_FLAC_MUXER)              += flac.o flacdata.o vorbis_data.o
 OBJS-$(CONFIG_FLV_DEMUXER)             += mpeg4audio.o
 OBJS-$(CONFIG_GXF_DEMUXER)             += mpeg12data.o
 OBJS-$(CONFIG_IFF_DEMUXER)             += iff.o
 OBJS-$(CONFIG_LATM_MUXER)              += mpeg4audio.o
-OBJS-$(CONFIG_MATROSKA_AUDIO_MUXER)    += mpeg4audio.o                  \
+OBJS-$(CONFIG_MATROSKA_AUDIO_MUXER)    += mpeg4audio.o vorbis_data.o    \
                                           flac.o flacdata.o
-OBJS-$(CONFIG_MATROSKA_MUXER)          += flac.o flacdata.o
+OBJS-$(CONFIG_MATROSKA_MUXER)          += flac.o flacdata.o vorbis_data.o
 OBJS-$(CONFIG_MOV_DEMUXER)             += ac3tab.o
 OBJS-$(CONFIG_MP2_MUXER)               += mpegaudiodata.o mpegaudiodecheader.o
 OBJS-$(CONFIG_MP3_MUXER)               += mpegaudiodata.o mpegaudiodecheader.o
 OBJS-$(CONFIG_MPEGTS_MUXER)            += mpeg4audio.o
+OBJS-$(CONFIG_MXF_MUXER)               += dnxhddata.o
 OBJS-$(CONFIG_NUT_MUXER)               += mpegaudiodata.o
+OBJS-$(CONFIG_NUT_DEMUXER)             += mpegaudiodata.o mpeg4audio.o
+OBJS-$(CONFIG_OGA_MUXER)               += flac.o flacdata.o
 OBJS-$(CONFIG_OGG_DEMUXER)             += mpeg12data.o \
-                                          dirac.o
-OBJS-$(CONFIG_OGG_MUXER)               += flac.o flacdata.o
+                                          dirac.o vorbis_data.o
+OBJS-$(CONFIG_OGG_MUXER)               += flac.o flacdata.o \
+                                          vorbis_data.o
 OBJS-$(CONFIG_RTP_MUXER)               += mpeg4audio.o
 OBJS-$(CONFIG_SPDIF_DEMUXER)           += aacadtsdec.o mpeg4audio.o
 OBJS-$(CONFIG_SPDIF_MUXER)             += dca.o
 OBJS-$(CONFIG_TAK_DEMUXER)             += tak.o
 OBJS-$(CONFIG_WEBM_MUXER)              += mpeg4audio.o mpegaudiodata.o  \
-                                          flac.o flacdata.o
+                                          flac.o flacdata.o \
+                                          vorbis_data.o
+
+# libavfilter dependencies
+OBJS-$(CONFIG_ELBG_FILTER)             += elbg.o
 
 # external codec libraries
-OBJS-$(CONFIG_LIBDCADEC_DECODER)          += libdcadec.o dca.o
+OBJS-$(CONFIG_AAC_AT_DECODER)             += audiotoolboxdec.o
+OBJS-$(CONFIG_AC3_AT_DECODER)             += audiotoolboxdec.o
+OBJS-$(CONFIG_ADPCM_IMA_QT_AT_DECODER)    += audiotoolboxdec.o
+OBJS-$(CONFIG_ALAC_AT_DECODER)            += audiotoolboxdec.o
+OBJS-$(CONFIG_AMR_NB_AT_DECODER)          += audiotoolboxdec.o
+OBJS-$(CONFIG_EAC3_AT_DECODER)            += audiotoolboxdec.o
+OBJS-$(CONFIG_GSM_MS_AT_DECODER)          += audiotoolboxdec.o
+OBJS-$(CONFIG_ILBC_AT_DECODER)            += audiotoolboxdec.o
+OBJS-$(CONFIG_MP1_AT_DECODER)             += audiotoolboxdec.o mpegaudiodecheader.o
+OBJS-$(CONFIG_MP2_AT_DECODER)             += audiotoolboxdec.o mpegaudiodecheader.o
+OBJS-$(CONFIG_MP3_AT_DECODER)             += audiotoolboxdec.o mpegaudiodecheader.o
+OBJS-$(CONFIG_PCM_MULAW_AT_DECODER)       += audiotoolboxdec.o
+OBJS-$(CONFIG_PCM_ALAW_AT_DECODER)        += audiotoolboxdec.o
+OBJS-$(CONFIG_QDMC_AT_DECODER)            += audiotoolboxdec.o
+OBJS-$(CONFIG_QDM2_AT_DECODER)            += audiotoolboxdec.o
+OBJS-$(CONFIG_AAC_AT_ENCODER)             += audiotoolboxenc.o
+OBJS-$(CONFIG_ALAC_AT_ENCODER)            += audiotoolboxenc.o
+OBJS-$(CONFIG_ILBC_AT_ENCODER)            += audiotoolboxenc.o
+OBJS-$(CONFIG_PCM_ALAW_AT_ENCODER)        += audiotoolboxenc.o
+OBJS-$(CONFIG_PCM_MULAW_AT_ENCODER)       += audiotoolboxenc.o
+OBJS-$(CONFIG_LIBCELT_DECODER)            += libcelt_dec.o
 OBJS-$(CONFIG_LIBFAAC_ENCODER)            += libfaac.o
 OBJS-$(CONFIG_LIBFDK_AAC_DECODER)         += libfdk-aacdec.o
 OBJS-$(CONFIG_LIBFDK_AAC_ENCODER)         += libfdk-aacenc.o
@@ -690,25 +875,30 @@ OBJS-$(CONFIG_LIBSCHROEDINGER_DECODER)    += libschroedingerdec.o \
                                              libschroedinger.o
 OBJS-$(CONFIG_LIBSCHROEDINGER_ENCODER)    += libschroedingerenc.o \
                                              libschroedinger.o
+OBJS-$(CONFIG_LIBSHINE_ENCODER)           += libshine.o
 OBJS-$(CONFIG_LIBSPEEX_DECODER)           += libspeexdec.o
 OBJS-$(CONFIG_LIBSPEEX_ENCODER)           += libspeexenc.o
 OBJS-$(CONFIG_LIBTHEORA_ENCODER)          += libtheoraenc.o
 OBJS-$(CONFIG_LIBTWOLAME_ENCODER)         += libtwolame.o
-OBJS-$(CONFIG_LIBVO_AACENC_ENCODER)       += libvo-aacenc.o mpeg4audio.o
+OBJS-$(CONFIG_LIBUTVIDEO_DECODER)         += libutvideodec.o
+OBJS-$(CONFIG_LIBUTVIDEO_ENCODER)         += libutvideoenc.o
 OBJS-$(CONFIG_LIBVO_AMRWBENC_ENCODER)     += libvo-amrwbenc.o
-OBJS-$(CONFIG_LIBVORBIS_ENCODER)          += libvorbis.o \
+OBJS-$(CONFIG_LIBVORBIS_DECODER)          += libvorbisdec.o
+OBJS-$(CONFIG_LIBVORBIS_ENCODER)          += libvorbisenc.o \
                                              vorbis_data.o
-OBJS-$(CONFIG_LIBVPX_VP8_DECODER)         += libvpxdec.o libvpx.o
-OBJS-$(CONFIG_LIBVPX_VP8_ENCODER)         += libvpxenc.o libvpx.o
+OBJS-$(CONFIG_LIBVPX_VP8_DECODER)         += libvpxdec.o
+OBJS-$(CONFIG_LIBVPX_VP8_ENCODER)         += libvpxenc.o
 OBJS-$(CONFIG_LIBVPX_VP9_DECODER)         += libvpxdec.o libvpx.o
 OBJS-$(CONFIG_LIBVPX_VP9_ENCODER)         += libvpxenc.o libvpx.o
 OBJS-$(CONFIG_LIBWAVPACK_ENCODER)         += libwavpackenc.o
-OBJS-$(CONFIG_LIBWEBP_ENCODER)            += libwebpenc.o
+OBJS-$(CONFIG_LIBWEBP_ENCODER)            += libwebpenc_common.o libwebpenc.o
+OBJS-$(CONFIG_LIBWEBP_ANIM_ENCODER)       += libwebpenc_common.o libwebpenc_animencoder.o
 OBJS-$(CONFIG_LIBX262_ENCODER)            += libx264.o
 OBJS-$(CONFIG_LIBX264_ENCODER)            += libx264.o
 OBJS-$(CONFIG_LIBX265_ENCODER)            += libx265.o
 OBJS-$(CONFIG_LIBXAVS_ENCODER)            += libxavs.o
 OBJS-$(CONFIG_LIBXVID_ENCODER)            += libxvid.o
+OBJS-$(CONFIG_LIBZVBI_TELETEXT_DECODER)   += libzvbi-teletextdec.o
 
 # parsers
 OBJS-$(CONFIG_AAC_LATM_PARSER)         += latm_parser.o
@@ -720,13 +910,17 @@ OBJS-$(CONFIG_ADX_PARSER)              += adx_parser.o adx.o
 OBJS-$(CONFIG_BMP_PARSER)              += bmp_parser.o
 OBJS-$(CONFIG_CAVSVIDEO_PARSER)        += cavs_parser.o
 OBJS-$(CONFIG_COOK_PARSER)             += cook_parser.o
-OBJS-$(CONFIG_DCA_PARSER)              += dca_parser.o dca.o
+OBJS-$(CONFIG_DCA_PARSER)              += dca_parser.o dca_exss.o dca.o
 OBJS-$(CONFIG_DIRAC_PARSER)            += dirac_parser.o
 OBJS-$(CONFIG_DNXHD_PARSER)            += dnxhd_parser.o
 OBJS-$(CONFIG_DPX_PARSER)              += dpx_parser.o
+OBJS-$(CONFIG_DVAUDIO_PARSER)          += dvaudio_parser.o
 OBJS-$(CONFIG_DVBSUB_PARSER)           += dvbsub_parser.o
+OBJS-$(CONFIG_DVD_NAV_PARSER)          += dvd_nav_parser.o
 OBJS-$(CONFIG_DVDSUB_PARSER)           += dvdsub_parser.o
-OBJS-$(CONFIG_FLAC_PARSER)             += flac_parser.o flacdata.o flac.o
+OBJS-$(CONFIG_FLAC_PARSER)             += flac_parser.o flacdata.o flac.o \
+                                          vorbis_data.o
+OBJS-$(CONFIG_G729_PARSER)             += g729_parser.o
 OBJS-$(CONFIG_GSM_PARSER)              += gsm_parser.o
 OBJS-$(CONFIG_H261_PARSER)             += h261_parser.o
 OBJS-$(CONFIG_H263_PARSER)             += h263_parser.o
@@ -738,6 +932,7 @@ OBJS-$(CONFIG_MLP_PARSER)              += mlp_parser.o mlp.o
 OBJS-$(CONFIG_MPEG4VIDEO_PARSER)       += mpeg4video_parser.o h263.o \
                                           mpeg4videodec.o mpeg4video.o \
                                           ituh263dec.o h263dec.o h263data.o
+OBJS-$(CONFIG_PNG_PARSER)              += png_parser.o
 OBJS-$(CONFIG_MPEGAUDIO_PARSER)        += mpegaudio_parser.o \
                                           mpegaudiodecheader.o mpegaudiodata.o
 OBJS-$(CONFIG_MPEGVIDEO_PARSER)        += mpegvideo_parser.o    \
@@ -752,57 +947,90 @@ OBJS-$(CONFIG_VC1_PARSER)              += vc1_parser.o vc1.o vc1data.o  \
                                           simple_idct.o wmv2data.o
 OBJS-$(CONFIG_VP3_PARSER)              += vp3_parser.o
 OBJS-$(CONFIG_VP8_PARSER)              += vp8_parser.o
+OBJS-$(CONFIG_VP9_PARSER)              += vp9_parser.o
 
 # bitstream filters
 OBJS-$(CONFIG_AAC_ADTSTOASC_BSF)          += aac_adtstoasc_bsf.o aacadtsdec.o \
                                              mpeg4audio.o
 OBJS-$(CONFIG_CHOMP_BSF)                  += chomp_bsf.o
 OBJS-$(CONFIG_DUMP_EXTRADATA_BSF)         += dump_extradata_bsf.o
+OBJS-$(CONFIG_DCA_CORE_BSF)               += dca_core_bsf.o
 OBJS-$(CONFIG_H264_MP4TOANNEXB_BSF)       += h264_mp4toannexb_bsf.o
 OBJS-$(CONFIG_HEVC_MP4TOANNEXB_BSF)       += hevc_mp4toannexb_bsf.o
 OBJS-$(CONFIG_IMX_DUMP_HEADER_BSF)        += imx_dump_header_bsf.o
 OBJS-$(CONFIG_MJPEG2JPEG_BSF)             += mjpeg2jpeg_bsf.o
 OBJS-$(CONFIG_MJPEGA_DUMP_HEADER_BSF)     += mjpega_dump_header_bsf.o
+OBJS-$(CONFIG_MPEG4_UNPACK_BFRAMES_BSF)   += mpeg4_unpack_bframes_bsf.o
 OBJS-$(CONFIG_MOV2TEXTSUB_BSF)            += movsub_bsf.o
+OBJS-$(CONFIG_MP3_HEADER_DECOMPRESS_BSF)  += mp3_header_decompress_bsf.o \
+                                             mpegaudiodata.o
 OBJS-$(CONFIG_NOISE_BSF)                  += noise_bsf.o
 OBJS-$(CONFIG_REMOVE_EXTRADATA_BSF)       += remove_extradata_bsf.o
 OBJS-$(CONFIG_TEXT2MOVSUB_BSF)            += movsub_bsf.o
+OBJS-$(CONFIG_VP9_SUPERFRAME_BSF)         += vp9_superframe_bsf.o
 
 # thread libraries
 OBJS-$(HAVE_LIBC_MSVCRT)               += file_open.o
 OBJS-$(HAVE_THREADS)                   += pthread.o pthread_slice.o pthread_frame.o
 
+OBJS-$(CONFIG_FRAME_THREAD_ENCODER)    += frame_thread_encoder.o
+
+# Windows resource file
+SLIBOBJS-$(HAVE_GNU_WINDRES)           += avcodecres.o
+
 SKIPHEADERS                            += %_tablegen.h                  \
                                           %_tables.h                    \
-                                          aac_tablegen_decl.h           \
                                           fft-internal.h                \
                                           tableprint.h                  \
+                                          tableprint_vlc.h              \
+                                          aaccoder_twoloop.h            \
+                                          aaccoder_trellis.h            \
+                                          aacenc_quantization.h         \
+                                          aacenc_quantization_misc.h    \
                                           $(ARCH)/vp56_arith.h          \
 
 SKIPHEADERS-$(CONFIG_D3D11VA)          += d3d11va.h dxva2_internal.h
 SKIPHEADERS-$(CONFIG_DXVA2)            += dxva2.h dxva2_internal.h
+SKIPHEADERS-$(CONFIG_JNI)              += ffjni.h
 SKIPHEADERS-$(CONFIG_LIBSCHROEDINGER)  += libschroedinger.h
+SKIPHEADERS-$(CONFIG_LIBUTVIDEO)       += libutvideo.h
 SKIPHEADERS-$(CONFIG_LIBVPX)           += libvpx.h
-SKIPHEADERS-$(CONFIG_MPEG_XVMC_DECODER) += xvmc.h
+SKIPHEADERS-$(CONFIG_LIBWEBP_ENCODER)  += libwebpenc_common.h
+SKIPHEADERS-$(CONFIG_MEDIACODEC)       += mediacodecdec.h mediacodec_wrapper.h mediacodec_sw_buffer.h
 SKIPHEADERS-$(CONFIG_NVENC)            += nvenc.h
 SKIPHEADERS-$(CONFIG_QSV)              += qsv.h qsv_internal.h
 SKIPHEADERS-$(CONFIG_QSVDEC)           += qsvdec.h
 SKIPHEADERS-$(CONFIG_QSVENC)           += qsvenc.h
+SKIPHEADERS-$(CONFIG_XVMC)             += xvmc.h
 SKIPHEADERS-$(CONFIG_VAAPI)            += vaapi_encode.h vaapi_internal.h
-SKIPHEADERS-$(CONFIG_VDA)              += vda.h vda_internal.h
+SKIPHEADERS-$(CONFIG_VDA)              += vda.h vda_vt_internal.h
 SKIPHEADERS-$(CONFIG_VDPAU)            += vdpau.h vdpau_internal.h
+SKIPHEADERS-$(CONFIG_VIDEOTOOLBOX)     += videotoolbox.h vda_vt_internal.h
+
+TESTPROGS = imgconvert                                                  \
+            jpeg2000dwt                                                 \
+            mathops                                                    \
+            options                                                     \
+            utils                                                       \
 
-TESTPROGS-$(CONFIG_FFT)                   += fft fft-fixed
+TESTPROGS-$(CONFIG_CABAC)                 += cabac
+TESTPROGS-$(CONFIG_DCT)                   += avfft
+TESTPROGS-$(CONFIG_FFT)                   += fft fft-fixed fft-fixed32
 TESTPROGS-$(CONFIG_GOLOMB)                += golomb
 TESTPROGS-$(CONFIG_IDCTDSP)               += dct
 TESTPROGS-$(CONFIG_IIRFILTER)             += iirfilter
+TESTPROGS-$(HAVE_MMX)                     += motion
 TESTPROGS-$(CONFIG_RANGECODER)            += rangecoder
+TESTPROGS-$(CONFIG_SNOW_ENCODER)          += snowenc
 
 TESTOBJS = dctref.o
 
-HOSTPROGS = aac_tablegen                                                \
-            aacps_tablegen                                              \
+TOOLS = fourcc2pixfmt
+
+HOSTPROGS = aacps_tablegen                                              \
+            aacps_fixed_tablegen                                        \
             cbrt_tablegen                                               \
+            cbrt_fixed_tablegen                                         \
             cos_tablegen                                                \
             dv_tablegen                                                 \
             motionpixels_tablegen                                       \
@@ -810,6 +1038,7 @@ HOSTPROGS = aac_tablegen                                                \
             pcm_tablegen                                                \
             qdm2_tablegen                                               \
             sinewin_tablegen                                            \
+            sinewin_fixed_tablegen                                      \
 
 CLEANFILES = *_tables.c *_tables.h *_tablegen$(HOSTEXESUF)
 
@@ -828,8 +1057,9 @@ else
 $(SUBDIR)%_tablegen$(HOSTEXESUF): HOSTCFLAGS += -DCONFIG_SMALL=0
 endif
 
-GEN_HEADERS = cbrt_tables.h aacps_tables.h aac_tables.h dv_tables.h     \
-              sinewin_tables.h mpegaudio_tables.h motionpixels_tables.h \
+GEN_HEADERS = cbrt_tables.h cbrt_fixed_tables.h aacps_tables.h aacps_fixed_tables.h \
+              dv_tables.h     \
+              sinewin_tables.h sinewin_fixed_tables.h mpegaudio_tables.h motionpixels_tables.h \
               pcm_tables.h qdm2_tables.h
 GEN_HEADERS := $(addprefix $(SUBDIR), $(GEN_HEADERS))
 
@@ -837,9 +1067,11 @@ $(GEN_HEADERS): $(SUBDIR)%_tables.h: $(SUBDIR)%_tablegen$(HOSTEXESUF)
 	$(M)./$< > $@
 
 ifdef CONFIG_HARDCODED_TABLES
-$(SUBDIR)aacdec.o: $(SUBDIR)cbrt_tables.h
-$(SUBDIR)aacps.o: $(SUBDIR)aacps_tables.h
-$(SUBDIR)aactab.o: $(SUBDIR)aac_tables.h
+$(SUBDIR)cbrt_data.o: $(SUBDIR)cbrt_tables.h
+$(SUBDIR)cbrt_data_fixed.o: $(SUBDIR)cbrt_fixed_tables.h
+$(SUBDIR)aacps_float.o: $(SUBDIR)aacps_tables.h
+$(SUBDIR)aacps_fixed.o: $(SUBDIR)aacps_fixed_tables.h
+$(SUBDIR)aactab_fixed.o: $(SUBDIR)aac_fixed_tables.h
 $(SUBDIR)dvenc.o: $(SUBDIR)dv_tables.h
 $(SUBDIR)motionpixels.o: $(SUBDIR)motionpixels_tables.h
 $(SUBDIR)mpegaudiodec_fixed.o: $(SUBDIR)mpegaudio_tables.h
@@ -847,4 +1079,5 @@ $(SUBDIR)mpegaudiodec_float.o: $(SUBDIR)mpegaudio_tables.h
 $(SUBDIR)pcm.o: $(SUBDIR)pcm_tables.h
 $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h
 $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h
+$(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h
 endif
diff --git a/libavcodec/a64colors.h b/libavcodec/a64colors.h
index d977426..a9cdb6f 100644
--- a/libavcodec/a64colors.h
+++ b/libavcodec/a64colors.h
@@ -2,20 +2,20 @@
  * a64 video encoder - c64 colors in rgb (Pepto)
  * Copyright (c) 2009 Tobias Bindhammer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/a64enc.h b/libavcodec/a64enc.h
deleted file mode 100644
index 65c1d30..0000000
--- a/libavcodec/a64enc.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * a64 video encoder - basic headers
- * Copyright (c) 2009 Tobias Bindhammer
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * a64 video encoder - basic headers
- */
-
-#ifndef AVCODEC_A64ENC_H
-#define AVCODEC_A64ENC_H
-
-#include "libavutil/lfg.h"
-#include "avcodec.h"
-
-#define C64XRES 320
-#define C64YRES 200
-
-typedef struct A64Context {
-    /* variables for multicolor modes */
-    AVLFG randctx;
-    int mc_lifetime;
-    int mc_use_5col;
-    unsigned mc_frame_counter;
-    int *mc_meta_charset;
-    int *mc_charmap;
-    int *mc_best_cb;
-    int mc_luma_vals[5];
-    uint8_t *mc_charset;
-    uint8_t *mc_colram;
-    uint8_t *mc_palette;
-    int mc_pal_size;
-
-    /* pts of the next packet that will be output */
-    int64_t next_pts;
-} A64Context;
-
-#endif /* AVCODEC_A64ENC_H */
diff --git a/libavcodec/a64multienc.c b/libavcodec/a64multienc.c
index 5d8d162..91aac09 100644
--- a/libavcodec/a64multienc.c
+++ b/libavcodec/a64multienc.c
@@ -2,20 +2,20 @@
  * a64 video encoder - multicolor modes
  * Copyright (c) 2009 Tobias Bindhammer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,11 +24,11 @@
  * a64 video encoder - multicolor modes
  */
 
-#include "a64enc.h"
 #include "a64colors.h"
 #include "a64tables.h"
 #include "elbg.h"
 #include "internal.h"
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
 
@@ -37,6 +37,28 @@
 #define INTERLACED    1
 #define CROP_SCREENS  1
 
+#define C64XRES 320
+#define C64YRES 200
+
+typedef struct A64Context {
+    /* variables for multicolor modes */
+    AVLFG randctx;
+    int mc_lifetime;
+    int mc_use_5col;
+    unsigned mc_frame_counter;
+    int *mc_meta_charset;
+    int *mc_charmap;
+    int *mc_best_cb;
+    int mc_luma_vals[5];
+    uint8_t *mc_charset;
+    uint8_t *mc_colram;
+    uint8_t *mc_palette;
+    int mc_pal_size;
+
+    /* pts of the next packet that will be output */
+    int64_t next_pts;
+} A64Context;
+
 /* gray gradient */
 static const int mc_colors[5]={0x0,0xb,0xc,0xf,0x1};
 
@@ -58,9 +80,13 @@ static void to_meta_with_crop(AVCodecContext *avctx,
             for (y = blocky; y < blocky + 8 && y < C64YRES; y++) {
                 for (x = blockx; x < blockx + 8 && x < C64XRES; x += 2) {
                     if(x < width && y < height) {
-                        /* build average over 2 pixels */
-                        luma = (src[(x + 0 + y * p->linesize[0])] +
-                                src[(x + 1 + y * p->linesize[0])]) / 2;
+                        if (x + 1 < width) {
+                            /* build average over 2 pixels */
+                            luma = (src[(x + 0 + y * p->linesize[0])] +
+                                    src[(x + 1 + y * p->linesize[0])]) / 2;
+                        } else {
+                            luma = src[(x + y * p->linesize[0])];
+                        }
                         /* write blocks as linear data now so they are suitable for elbg */
                         dest[0] = luma;
                     }
@@ -166,11 +192,11 @@ static void render_charset(AVCodecContext *avctx, uint8_t *charset,
 static av_cold int a64multi_close_encoder(AVCodecContext *avctx)
 {
     A64Context *c = avctx->priv_data;
-    av_free(c->mc_meta_charset);
-    av_free(c->mc_best_cb);
-    av_free(c->mc_charset);
-    av_free(c->mc_charmap);
-    av_free(c->mc_colram);
+    av_freep(&c->mc_meta_charset);
+    av_freep(&c->mc_best_cb);
+    av_freep(&c->mc_charset);
+    av_freep(&c->mc_charmap);
+    av_freep(&c->mc_colram);
     return 0;
 }
 
@@ -199,9 +225,9 @@ static av_cold int a64multi_encode_init(AVCodecContext *avctx)
                            a64_palette[mc_colors[a]][2] * 0.11;
     }
 
-    if (!(c->mc_meta_charset = av_malloc(32000 * c->mc_lifetime * sizeof(int))) ||
+    if (!(c->mc_meta_charset = av_mallocz_array(c->mc_lifetime, 32000 * sizeof(int))) ||
        !(c->mc_best_cb       = av_malloc(CHARSET_CHARS * 32 * sizeof(int)))     ||
-       !(c->mc_charmap       = av_mallocz(1000 * c->mc_lifetime * sizeof(int))) ||
+       !(c->mc_charmap       = av_mallocz_array(c->mc_lifetime, 1000 * sizeof(int))) ||
        !(c->mc_colram        = av_mallocz(CHARSET_CHARS * sizeof(uint8_t)))     ||
        !(c->mc_charset       = av_malloc(0x800 * (INTERLACED+1) * sizeof(uint8_t)))) {
         av_log(avctx, AV_LOG_ERROR, "Failed to allocate buffer memory.\n");
@@ -217,12 +243,6 @@ static av_cold int a64multi_encode_init(AVCodecContext *avctx)
     AV_WB32(avctx->extradata, c->mc_lifetime);
     AV_WB32(avctx->extradata + 16, INTERLACED);
 
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-    avctx->coded_frame->key_frame = 1;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
     if (!avctx->codec_tag)
          avctx->codec_tag = AV_RL32("a64m");
 
@@ -247,7 +267,7 @@ static void a64_compress_colram(unsigned char *buf, int *charmap, uint8_t *colra
 }
 
 static int a64multi_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
-                                 const AVFrame *pict, int *got_packet)
+                                 const AVFrame *p, int *got_packet)
 {
     A64Context *c = avctx->priv_data;
 
@@ -257,7 +277,7 @@ static int a64multi_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     int b_width;
 
     int req_size, ret;
-    uint8_t *buf;
+    uint8_t *buf = NULL;
 
     int *charmap     = c->mc_charmap;
     uint8_t *colram  = c->mc_colram;
@@ -280,7 +300,7 @@ static int a64multi_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     }
 
     /* no data, means end encoding asap */
-    if (!pict) {
+    if (!p) {
         /* all done, end encoding */
         if (!c->mc_lifetime) return 0;
         /* no more frames in queue, prepare to flush remaining frames */
@@ -293,16 +313,10 @@ static int a64multi_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     } else {
         /* fill up mc_meta_charset with data until lifetime exceeds */
         if (c->mc_frame_counter < c->mc_lifetime) {
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-            avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-            avctx->coded_frame->key_frame = 1;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
-            to_meta_with_crop(avctx, pict, meta + 32000 * c->mc_frame_counter);
+            to_meta_with_crop(avctx, p, meta + 32000 * c->mc_frame_counter);
             c->mc_frame_counter++;
             if (c->next_pts == AV_NOPTS_VALUE)
-                c->next_pts = pict->pts;
+                c->next_pts = p->pts;
             /* lifetime is not reached so wait for next frame first */
             return 0;
         }
@@ -313,19 +327,17 @@ FF_ENABLE_DEPRECATION_WARNINGS
         req_size = 0;
         /* any frames to encode? */
         if (c->mc_lifetime) {
-            req_size = charset_size + c->mc_lifetime*(screen_size + colram_size);
-            if ((ret = ff_alloc_packet(pkt, req_size)) < 0) {
-                av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n", req_size);
+            int alloc_size = charset_size + c->mc_lifetime*(screen_size + colram_size);
+            if ((ret = ff_alloc_packet2(avctx, pkt, alloc_size, 0)) < 0)
                 return ret;
-            }
             buf = pkt->data;
 
             /* calc optimal new charset + charmaps */
-            ret = ff_init_elbg(meta, 32, 1000 * c->mc_lifetime, best_cb,
+            ret = avpriv_init_elbg(meta, 32, 1000 * c->mc_lifetime, best_cb,
                                CHARSET_CHARS, 50, charmap, &c->randctx);
             if (ret < 0)
                 return ret;
-            ret = ff_do_elbg(meta, 32, 1000 * c->mc_lifetime, best_cb,
+            ret = avpriv_do_elbg(meta, 32, 1000 * c->mc_lifetime, best_cb,
                              CHARSET_CHARS, 50, charmap, &c->randctx);
             if (ret < 0)
                 return ret;
@@ -338,7 +350,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
             /* advance pointers */
             buf      += charset_size;
-            charset  += charset_size;
+            req_size += charset_size;
         }
 
         /* write x frames to buf */
@@ -375,6 +387,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
         pkt->pts = pkt->dts = c->next_pts;
         c->next_pts         = AV_NOPTS_VALUE;
 
+        av_assert0(pkt->size >= req_size);
         pkt->size   = req_size;
         pkt->flags |= AV_PKT_FLAG_KEY;
         *got_packet = !!req_size;
@@ -382,6 +395,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
     return 0;
 }
 
+#if CONFIG_A64MULTI_ENCODER
 AVCodec ff_a64multi_encoder = {
     .name           = "a64multi",
     .long_name      = NULL_IF_CONFIG_SMALL("Multicolor charset for Commodore 64"),
@@ -394,7 +408,8 @@ AVCodec ff_a64multi_encoder = {
     .pix_fmts       = (const enum AVPixelFormat[]) {AV_PIX_FMT_GRAY8, AV_PIX_FMT_NONE},
     .capabilities   = AV_CODEC_CAP_DELAY,
 };
-
+#endif
+#if CONFIG_A64MULTI5_ENCODER
 AVCodec ff_a64multi5_encoder = {
     .name           = "a64multi5",
     .long_name      = NULL_IF_CONFIG_SMALL("Multicolor charset for Commodore 64, extended with 5th color (colram)"),
@@ -407,3 +422,4 @@ AVCodec ff_a64multi5_encoder = {
     .pix_fmts       = (const enum AVPixelFormat[]) {AV_PIX_FMT_GRAY8, AV_PIX_FMT_NONE},
     .capabilities   = AV_CODEC_CAP_DELAY,
 };
+#endif
diff --git a/libavcodec/a64tables.h b/libavcodec/a64tables.h
index b95c5ce..a955ef4 100644
--- a/libavcodec/a64tables.h
+++ b/libavcodec/a64tables.h
@@ -2,20 +2,20 @@
  * a64 video encoder - tables used by a64 encoders
  * Copyright (c) 2009 Tobias Bindhammer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aac.h b/libavcodec/aac.h
index fed6bf4..b1f4aa7 100644
--- a/libavcodec/aac.h
+++ b/libavcodec/aac.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2005-2006 Oded Shimon ( ods15 ods15 dyndns org )
  * Copyright (c) 2006-2007 Maxim Gavrilov ( maxim.gavrilov gmail com )
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,9 +30,14 @@
 #ifndef AVCODEC_AAC_H
 #define AVCODEC_AAC_H
 
+
+#include "aac_defines.h"
 #include "libavutil/float_dsp.h"
+#include "libavutil/fixed_dsp.h"
 #include "avcodec.h"
+#if !USE_FIXED
 #include "imdct15.h"
+#endif
 #include "fft.h"
 #include "mpeg4audio.h"
 #include "sbr.h"
@@ -45,6 +50,8 @@
 #define TNS_MAX_ORDER 20
 #define MAX_LTP_LONG_SFB 40
 
+#define CLIP_AVOIDANCE_FACTOR 0.95f
+
 enum RawDataBlockType {
     TYPE_SCE,
     TYPE_CPE,
@@ -76,12 +83,13 @@ enum BandType {
     ZERO_BT        = 0,     ///< Scalefactors and spectral data are all zero.
     FIRST_PAIR_BT  = 5,     ///< This and later band types encode two values (rather than four) with one code word.
     ESC_BT         = 11,    ///< Spectral data are coded with an escape sequence.
+    RESERVED_BT    = 12,    ///< Band types following are encoded differently from others.
     NOISE_BT       = 13,    ///< Spectral data are scaled white noise not coded in the bitstream.
-    INTENSITY_BT2  = 14,    ///< Scalefactor data are intensity stereo positions.
-    INTENSITY_BT   = 15,    ///< Scalefactor data are intensity stereo positions.
+    INTENSITY_BT2  = 14,    ///< Scalefactor data are intensity stereo positions (out of phase).
+    INTENSITY_BT   = 15,    ///< Scalefactor data are intensity stereo positions (in phase).
 };
 
-#define IS_CODEBOOK_UNSIGNED(x) ((x - 1) & 10)
+#define IS_CODEBOOK_UNSIGNED(x) (((x) - 1) & 10)
 
 enum ChannelPosition {
     AAC_CHANNEL_OFF   = 0,
@@ -125,12 +133,14 @@ typedef struct OutputConfiguration {
  * Predictor State
  */
 typedef struct PredictorState {
-    float cor0;
-    float cor1;
-    float var0;
-    float var1;
-    float r0;
-    float r1;
+    AAC_FLOAT cor0;
+    AAC_FLOAT cor1;
+    AAC_FLOAT var0;
+    AAC_FLOAT var1;
+    AAC_FLOAT r0;
+    AAC_FLOAT r1;
+    AAC_FLOAT k1;
+    AAC_FLOAT x_est;
 } PredictorState;
 
 #define MAX_PREDICTORS 672
@@ -141,13 +151,20 @@ typedef struct PredictorState {
 #define SCALE_MAX_DIFF   60    ///< maximum scalefactor difference allowed by standard
 #define SCALE_DIFF_ZERO  60    ///< codebook index corresponding to zero scalefactor indices difference
 
+#define POW_SF2_ZERO    200    ///< ff_aac_pow2sf_tab index corresponding to pow(2, 0);
+
+#define NOISE_PRE       256    ///< preamble for NOISE_BT, put in bitstream with the first noise band
+#define NOISE_PRE_BITS    9    ///< length of preamble
+#define NOISE_OFFSET     90    ///< subtracted from global gain, used as offset for the preamble
+
 /**
  * Long Term Prediction
  */
 typedef struct LongTermPrediction {
     int8_t present;
     int16_t lag;
-    float coef;
+    int coef_idx;
+    INTFLOAT coef;
     int8_t used[MAX_LTP_LONG_SFB];
 } LongTermPrediction;
 
@@ -169,7 +186,10 @@ typedef struct IndividualChannelStream {
     int predictor_present;
     int predictor_initialized;
     int predictor_reset_group;
+    int predictor_reset_count[31];  ///< used by encoder to count prediction resets
     uint8_t prediction_used[41];
+    uint8_t window_clipping[8]; ///< set if a certain window is near clipping
+    float clip_avoidance_factor; ///< set if any window is near clipping to the necessary attenuation factor to avoid it
 } IndividualChannelStream;
 
 /**
@@ -181,7 +201,8 @@ typedef struct TemporalNoiseShaping {
     int length[8][4];
     int direction[8][4];
     int order[8][4];
-    float coef[8][4][TNS_MAX_ORDER];
+    int coef_idx[8][4][TNS_MAX_ORDER];
+    INTFLOAT coef[8][4][TNS_MAX_ORDER];
 } TemporalNoiseShaping;
 
 /**
@@ -218,7 +239,7 @@ typedef struct ChannelCoupling {
     int ch_select[8];      /**< [0] shared list of gains; [1] list of gains for right channel;
                             *   [2] list of gains for left channel; [3] lists of gains for both channels
                             */
-    float gain[16][120];
+    INTFLOAT gain[16][120];
 } ChannelCoupling;
 
 /**
@@ -229,26 +250,36 @@ typedef struct SingleChannelElement {
     TemporalNoiseShaping tns;
     Pulse pulse;
     enum BandType band_type[128];                   ///< band types
+    enum BandType band_alt[128];                    ///< alternative band type (used by encoder)
     int band_type_run_end[120];                     ///< band type run end points
-    float sf[120];                                  ///< scalefactors
+    INTFLOAT sf[120];                               ///< scalefactors
     int sf_idx[128];                                ///< scalefactor indices (used by encoder)
     uint8_t zeroes[128];                            ///< band is not coded (used by encoder)
-    DECLARE_ALIGNED(32, float,   coeffs)[1024];     ///< coefficients for IMDCT
-    DECLARE_ALIGNED(32, float,   saved)[1536];      ///< overlap
-    DECLARE_ALIGNED(32, float,   ret_buf)[2048];    ///< PCM output buffer
-    DECLARE_ALIGNED(16, float,   ltp_state)[3072];  ///< time signal for LTP
+    uint8_t can_pns[128];                           ///< band is allowed to PNS (informative)
+    float  is_ener[128];                            ///< Intensity stereo pos (used by encoder)
+    float pns_ener[128];                            ///< Noise energy values (used by encoder)
+    DECLARE_ALIGNED(32, INTFLOAT, pcoeffs)[1024];   ///< coefficients for IMDCT, pristine
+    DECLARE_ALIGNED(32, INTFLOAT, coeffs)[1024];    ///< coefficients for IMDCT, maybe processed
+    DECLARE_ALIGNED(32, INTFLOAT, saved)[1536];     ///< overlap
+    DECLARE_ALIGNED(32, INTFLOAT, ret_buf)[2048];   ///< PCM output buffer
+    DECLARE_ALIGNED(16, INTFLOAT, ltp_state)[3072]; ///< time signal for LTP
+    DECLARE_ALIGNED(32, AAC_FLOAT, lcoeffs)[1024];  ///< MDCT of LTP coefficients (used by encoder)
+    DECLARE_ALIGNED(32, AAC_FLOAT, prcoeffs)[1024]; ///< Main prediction coefs (used by encoder)
     PredictorState predictor_state[MAX_PREDICTORS];
-    float *ret;                                     ///< PCM output
+    INTFLOAT *ret;                                  ///< PCM output
 } SingleChannelElement;
 
 /**
  * channel element - generic struct for SCE/CPE/CCE/LFE
  */
 typedef struct ChannelElement {
+    int present;
     // CPE specific
     int common_window;        ///< Set if channels share a common 'IndividualChannelStream' in bitstream.
     int     ms_mode;          ///< Signals mid/side stereo flags coding mode (used by encoder)
+    uint8_t is_mode;          ///< Set if any bands have been encoded using intensity stereo (used by encoder)
     uint8_t ms_mask[128];     ///< Set if mid/side stereo is used for each scalefactor window band
+    uint8_t is_mask[128];     ///< Set if intensity stereo is used (used by encoder)
     // shared
     SingleChannelElement ch[2];
     // CCE specific
@@ -259,7 +290,8 @@ typedef struct ChannelElement {
 /**
  * main AAC context
  */
-typedef struct AACContext {
+struct AACContext {
+    AVClass        *class;
     AVCodecContext *avctx;
     AVFrame *frame;
 
@@ -273,6 +305,7 @@ typedef struct AACContext {
     ChannelElement          *che[4][MAX_ELEM_ID];
     ChannelElement  *tag_che_map[4][MAX_ELEM_ID];
     int tags_mapped;
+    int warned_remapping_once;
     /** @} */
 
     /**
@@ -280,7 +313,7 @@ typedef struct AACContext {
      * (We do not want to have these on the stack.)
      * @{
      */
-    DECLARE_ALIGNED(32, float, buf_mdct)[1024];
+    DECLARE_ALIGNED(32, INTFLOAT, buf_mdct)[1024];
     /** @} */
 
     /**
@@ -291,8 +324,12 @@ typedef struct AACContext {
     FFTContext mdct_small;
     FFTContext mdct_ld;
     FFTContext mdct_ltp;
+#if USE_FIXED
+    AVFixedDSPContext *fdsp;
+#else
     IMDCT15Context *mdct480;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
+#endif /* USE_FIXED */
     int random_state;
     /** @} */
 
@@ -303,9 +340,33 @@ typedef struct AACContext {
     SingleChannelElement *output_element[MAX_CHANNELS]; ///< Points to each SingleChannelElement
     /** @} */
 
-    DECLARE_ALIGNED(32, float, temp)[128];
+
+    /**
+     * @name Japanese DTV specific extension
+     * @{
+     */
+    int force_dmono_mode;///< 0->not dmono, 1->use first channel, 2->use second channel
+    int dmono_mode;      ///< 0->not dmono, 1->use first channel, 2->use second channel
+    /** @} */
+
+    DECLARE_ALIGNED(32, INTFLOAT, temp)[128];
 
     OutputConfiguration oc[2];
-} AACContext;
+    int warned_num_aac_frames;
+
+    /* aacdec functions pointers */
+    void (*imdct_and_windowing)(AACContext *ac, SingleChannelElement *sce);
+    void (*apply_ltp)(AACContext *ac, SingleChannelElement *sce);
+    void (*apply_tns)(INTFLOAT coef[1024], TemporalNoiseShaping *tns,
+                      IndividualChannelStream *ics, int decode);
+    void (*windowing_and_mdct_ltp)(AACContext *ac, INTFLOAT *out,
+                                   INTFLOAT *in, IndividualChannelStream *ics);
+    void (*update_ltp)(AACContext *ac, SingleChannelElement *sce);
+    void (*vector_pow43)(int *coefs, int len);
+    void (*subband_scale)(int *dst, int *src, int scale, int offset, int len);
+
+};
+
+void ff_aacdec_init_mips(AACContext *c);
 
 #endif /* AVCODEC_AAC_H */
diff --git a/libavcodec/aac_ac3_parser.c b/libavcodec/aac_ac3_parser.c
index 806a826..c9ba6bf 100644
--- a/libavcodec/aac_ac3_parser.c
+++ b/libavcodec/aac_ac3_parser.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,6 +34,7 @@ int ff_aac_ac3_parse(AVCodecParserContext *s1,
     ParseContext *pc = &s->pc;
     int len, i;
     int new_frame_start;
+    int got_frame = 0;
 
 get_next:
     i=END_NOT_FOUND;
@@ -51,6 +52,7 @@ get_next:
             if(len<=0){
                 i=END_NOT_FOUND;
             }else{
+                got_frame = 1;
                 s->state=0;
                 i-= s->header_size -1;
                 s->remaining_size = len;
@@ -76,31 +78,34 @@ get_next:
     if(s->codec_id)
         avctx->codec_id = s->codec_id;
 
-    /* Due to backwards compatible HE-AAC the sample rate, channel count,
-       and total number of samples found in an AAC ADTS header are not
-       reliable. Bit rate is still accurate because the total frame duration in
-       seconds is still correct (as is the number of bits in the frame). */
-    if (avctx->codec_id != AV_CODEC_ID_AAC) {
-        avctx->sample_rate = s->sample_rate;
+    if (got_frame) {
+        /* Due to backwards compatible HE-AAC the sample rate, channel count,
+           and total number of samples found in an AAC ADTS header are not
+           reliable. Bit rate is still accurate because the total frame
+           duration in seconds is still correct (as is the number of bits in
+           the frame). */
+        if (avctx->codec_id != AV_CODEC_ID_AAC) {
+            avctx->sample_rate = s->sample_rate;
 
-        /* (E-)AC-3: allow downmixing to stereo or mono */
-        if (s->channels > 1 &&
-            avctx->request_channel_layout == AV_CH_LAYOUT_MONO) {
-            avctx->channels       = 1;
-            avctx->channel_layout = AV_CH_LAYOUT_MONO;
-        } else if (s->channels > 2 &&
-                   avctx->request_channel_layout == AV_CH_LAYOUT_STEREO) {
-            avctx->channels       = 2;
-            avctx->channel_layout = AV_CH_LAYOUT_STEREO;
-        } else {
-            avctx->channels = s->channels;
-            avctx->channel_layout = s->channel_layout;
+            /* (E-)AC-3: allow downmixing to stereo or mono */
+            if (s->channels > 1 &&
+                avctx->request_channel_layout == AV_CH_LAYOUT_MONO) {
+                avctx->channels       = 1;
+                avctx->channel_layout = AV_CH_LAYOUT_MONO;
+            } else if (s->channels > 2 &&
+                       avctx->request_channel_layout == AV_CH_LAYOUT_STEREO) {
+                avctx->channels       = 2;
+                avctx->channel_layout = AV_CH_LAYOUT_STEREO;
+            } else {
+                avctx->channels = s->channels;
+                avctx->channel_layout = s->channel_layout;
+            }
+            s1->duration = s->samples;
+            avctx->audio_service_type = s->service_type;
         }
-        s1->duration = s->samples;
-        avctx->audio_service_type = s->service_type;
-    }
 
-    avctx->bit_rate = s->bit_rate;
+        avctx->bit_rate = s->bit_rate;
+    }
 
     return i;
 }
diff --git a/libavcodec/aac_ac3_parser.h b/libavcodec/aac_ac3_parser.h
index 99286f0..c2506a5 100644
--- a/libavcodec/aac_ac3_parser.h
+++ b/libavcodec/aac_ac3_parser.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aac_adtstoasc_bsf.c b/libavcodec/aac_adtstoasc_bsf.c
index 9168e2b..48889fc 100644
--- a/libavcodec/aac_adtstoasc_bsf.c
+++ b/libavcodec/aac_adtstoasc_bsf.c
@@ -2,20 +2,20 @@
  * MPEG-2/4 AAC ADTS to MPEG-4 Audio Specific Configuration bitstream filter
  * Copyright (c) 2009 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -97,8 +97,7 @@ static int aac_adtstoasc_filter(AVBSFContext *bsfc, AVPacket *out)
             in->data += get_bits_count(&gb)/8;
         }
 
-        extradata = av_packet_new_side_data(in, AV_PKT_DATA_NEW_EXTRADATA,
-                                            2 + pce_size);
+        extradata = av_mallocz(2 + pce_size + AV_INPUT_BUFFER_PADDING_SIZE);
         if (!extradata) {
             ret = AVERROR(ENOMEM);
             goto fail;
@@ -116,6 +115,8 @@ static int aac_adtstoasc_filter(AVBSFContext *bsfc, AVPacket *out)
             memcpy(extradata + 2, pce_data, pce_size);
         }
 
+        bsfc->par_out->extradata = extradata;
+        bsfc->par_out->extradata_size = 2 + pce_size;
         ctx->first_frame_done = 1;
     }
 
diff --git a/libavcodec/aac_defines.h b/libavcodec/aac_defines.h
new file mode 100644
index 0000000..c12dc2f
--- /dev/null
+++ b/libavcodec/aac_defines.h
@@ -0,0 +1,114 @@
+/*
+ * AAC defines
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AAC_DEFINES_H
+#define AVCODEC_AAC_DEFINES_H
+
+#ifndef USE_FIXED
+#define USE_FIXED 0
+#endif
+
+#if USE_FIXED
+
+#include "libavutil/softfloat.h"
+
+#define FFT_FLOAT    0
+#define FFT_FIXED_32 1
+
+#define AAC_RENAME(x)       x ## _fixed
+#define AAC_RENAME_32(x)    x ## _fixed_32
+typedef int                 INTFLOAT;
+typedef int64_t             INT64FLOAT;
+typedef int16_t             SHORTFLOAT;
+typedef SoftFloat           AAC_FLOAT;
+typedef int                 AAC_SIGNE;
+#define FIXR(a)             ((int)((a) * 1 + 0.5))
+#define FIXR10(a)           ((int)((a) * 1024.0 + 0.5))
+#define Q23(a)              (int)((a) * 8388608.0 + 0.5)
+#define Q30(x)              (int)((x)*1073741824.0 + 0.5)
+#define Q31(x)              (int)((x)*2147483648.0 + 0.5)
+#define RANGE15(x)          (x)
+#define GET_GAIN(x, y)      ((-(y) * (1 << (x))) + 1024) /* multiply: left-shifting a negative value is undefined */
+#define AAC_MUL16(x, y)     (int)(((int64_t)(x) * (y) + 0x8000) >> 16)
+#define AAC_MUL26(x, y)     (int)(((int64_t)(x) * (y) + 0x2000000) >> 26)
+#define AAC_MUL30(x, y)     (int)(((int64_t)(x) * (y) + 0x20000000) >> 30)
+#define AAC_MUL31(x, y)     (int)(((int64_t)(x) * (y) + 0x40000000) >> 31)
+#define AAC_MADD28(x, y, a, b) (int)((((int64_t)(x) * (y)) + \
+                                      ((int64_t)(a) * (b)) + \
+                                        0x8000000) >> 28)
+#define AAC_MADD30(x, y, a, b) (int)((((int64_t)(x) * (y)) + \
+                                      ((int64_t)(a) * (b)) + \
+                                        0x20000000) >> 30)
+#define AAC_MADD30_V8(x, y, a, b, c, d, e, f) (int)((((int64_t)(x) * (y)) + \
+                                                     ((int64_t)(a) * (b)) + \
+                                                     ((int64_t)(c) * (d)) + \
+                                                     ((int64_t)(e) * (f)) + \
+                                                       0x20000000) >> 30)
+#define AAC_MSUB30(x, y, a, b) (int)((((int64_t)(x) * (y)) - \
+                                      ((int64_t)(a) * (b)) + \
+                                        0x20000000) >> 30)
+#define AAC_MSUB30_V8(x, y, a, b, c, d, e, f) (int)((((int64_t)(x) * (y)) + \
+                                                     ((int64_t)(a) * (b)) - \
+                                                     ((int64_t)(c) * (d)) - \
+                                                     ((int64_t)(e) * (f)) + \
+                                                       0x20000000) >> 30)
+#define AAC_MSUB31_V3(x, y, z)    (int)((((int64_t)(x) * (z)) - \
+                                      ((int64_t)(y) * (z)) + \
+                                        0x40000000) >> 31)
+#define AAC_HALF_SUM(x, y)  (((x) >> 1) + ((y) >> 1)) /* '+' binds tighter than '>>': halve each operand, then add */
+#define AAC_SRA_R(x, y)     (int)(((x) + (1 << ((y) - 1))) >> (y))
+
+#else
+
+#define FFT_FLOAT    1
+#define FFT_FIXED_32 0
+
+#define AAC_RENAME(x)       x
+#define AAC_RENAME_32(x)    x
+typedef float               INTFLOAT;
+typedef float               INT64FLOAT;
+typedef float               SHORTFLOAT;
+typedef float               AAC_FLOAT;
+typedef unsigned            AAC_SIGNE;
+#define FIXR(x)             ((float)(x))
+#define FIXR10(x)           ((float)(x))
+#define Q23(x)              ((float)(x))
+#define Q30(x)              ((float)(x))
+#define Q31(x)              ((float)(x))
+#define RANGE15(x)          (32768.0 * (x))
+#define GET_GAIN(x, y)      powf((x), -(y))
+#define AAC_MUL16(x, y)     ((x) * (y))
+#define AAC_MUL26(x, y)     ((x) * (y))
+#define AAC_MUL30(x, y)     ((x) * (y))
+#define AAC_MUL31(x, y)     ((x) * (y))
+#define AAC_MADD28(x, y, a, b) ((x) * (y) + (a) * (b))
+#define AAC_MADD30(x, y, a, b) ((x) * (y) + (a) * (b))
+#define AAC_MADD30_V8(x, y, a, b, c, d, e, f) ((x) * (y) + (a) * (b) + \
+                                               (c) * (d) + (e) * (f))
+#define AAC_MSUB30(x, y, a, b) ((x) * (y) - (a) * (b))
+#define AAC_MSUB30_V8(x, y, a, b, c, d, e, f) ((x) * (y) + (a) * (b) - \
+                                               (c) * (d) - (e) * (f))
+#define AAC_MSUB31_V3(x, y, z)    (((x) - (y)) * (z)) /* fully parenthesized so it is safe inside larger expressions */
+#define AAC_HALF_SUM(x, y)  (((x) + (y)) * 0.5f)       /* fully parenthesized so it is safe inside larger expressions */
+#define AAC_SRA_R(x, y)     (x)
+
+#endif /* USE_FIXED */
+
+#endif /* AVCODEC_AAC_DEFINES_H */
diff --git a/libavcodec/aac_parser.c b/libavcodec/aac_parser.c
index eae120a..0b868ed 100644
--- a/libavcodec/aac_parser.c
+++ b/libavcodec/aac_parser.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aac_tablegen.h b/libavcodec/aac_tablegen.h
deleted file mode 100644
index 8a05ec5..0000000
--- a/libavcodec/aac_tablegen.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Header file for hardcoded AAC tables
- *
- * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_AAC_TABLEGEN_H
-#define AVCODEC_AAC_TABLEGEN_H
-
-#include "aac_tablegen_decl.h"
-
-#if CONFIG_HARDCODED_TABLES
-#include "libavcodec/aac_tables.h"
-#else
-#include "libavutil/mathematics.h"
-float ff_aac_pow2sf_tab[428];
-
-void ff_aac_tableinit(void)
-{
-    int i;
-    for (i = 0; i < 428; i++)
-        ff_aac_pow2sf_tab[i] = pow(2, (i - POW_SF2_ZERO) / 4.0);
-}
-#endif /* CONFIG_HARDCODED_TABLES */
-
-#endif /* AVCODEC_AAC_TABLEGEN_H */
diff --git a/libavcodec/aac_tablegen_decl.h b/libavcodec/aac_tablegen_decl.h
deleted file mode 100644
index a5fd1cf..0000000
--- a/libavcodec/aac_tablegen_decl.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Header file for hardcoded AAC tables
- *
- * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_AAC_TABLEGEN_DECL_H
-#define AVCODEC_AAC_TABLEGEN_DECL_H
-
-#define POW_SF2_ZERO    200    ///< ff_aac_pow2sf_tab index corresponding to pow(2, 0);
-
-#if CONFIG_HARDCODED_TABLES
-#define ff_aac_tableinit()
-extern const float ff_aac_pow2sf_tab[428];
-#else
-void ff_aac_tableinit(void);
-extern       float ff_aac_pow2sf_tab[428];
-#endif /* CONFIG_HARDCODED_TABLES */
-
-#endif /* AVCODEC_AAC_TABLEGEN_DECL_H */
diff --git a/libavcodec/aacadtsdec.c b/libavcodec/aacadtsdec.c
index 2994bce..d0814ac 100644
--- a/libavcodec/aacadtsdec.c
+++ b/libavcodec/aacadtsdec.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2003 Michael Niedermayer
  * Copyright (c) 2009 Alex Converse
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aacadtsdec.h b/libavcodec/aacadtsdec.h
index 6319efc..d0584ef 100644
--- a/libavcodec/aacadtsdec.h
+++ b/libavcodec/aacadtsdec.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aaccoder.c b/libavcodec/aaccoder.c
index ee89148..bca1f59 100644
--- a/libavcodec/aaccoder.c
+++ b/libavcodec/aaccoder.c
@@ -2,20 +2,20 @@
  * AAC coefficients encoder
  * Copyright (C) 2008-2009 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,269 +33,34 @@
 #include "libavutil/libm.h" // brought forward to work around cygwin header breakage
 
 #include <float.h>
+
 #include "libavutil/mathematics.h"
+#include "mathops.h"
 #include "avcodec.h"
 #include "put_bits.h"
 #include "aac.h"
 #include "aacenc.h"
 #include "aactab.h"
+#include "aacenctab.h"
+#include "aacenc_utils.h"
+#include "aacenc_quantization.h"
 
-/** bits needed to code codebook run value for long windows */
-static const uint8_t run_value_bits_long[64] = {
-     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
-     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, 10,
-    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
-    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15
-};
-
-/** bits needed to code codebook run value for short windows */
-static const uint8_t run_value_bits_short[16] = {
-    3, 3, 3, 3, 3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 9
-};
-
-static const uint8_t *run_value_bits[2] = {
-    run_value_bits_long, run_value_bits_short
-};
-
+#include "aacenc_is.h"
+#include "aacenc_tns.h"
+#include "aacenc_ltp.h"
+#include "aacenc_pred.h"
 
-/**
- * Quantize one coefficient.
- * @return absolute value of the quantized coefficient
- * @see 3GPP TS26.403 5.6.2 "Scalefactor determination"
- */
-static av_always_inline int quant(float coef, const float Q)
-{
-    float a = coef * Q;
-    return sqrtf(a * sqrtf(a)) + 0.4054;
-}
+#include "libavcodec/aaccoder_twoloop.h"
 
-static void quantize_bands(int *out, const float *in, const float *scaled,
-                           int size, float Q34, int is_signed, int maxval)
-{
-    int i;
-    double qc;
-    for (i = 0; i < size; i++) {
-        qc = scaled[i] * Q34;
-        out[i] = (int)FFMIN(qc + 0.4054, (double)maxval);
-        if (is_signed && in[i] < 0.0f) {
-            out[i] = -out[i];
-        }
-    }
-}
+/* Parameter of f(x) = a*(lambda/100), defines the maximum fourier spread
+ * beyond which no PNS is used (since the SFBs contain tone rather than noise) */
+#define NOISE_SPREAD_THRESHOLD 0.9f
 
-static void abs_pow34_v(float *out, const float *in, const int size)
-{
-#ifndef USE_REALLY_FULL_SEARCH
-    int i;
-    for (i = 0; i < size; i++) {
-        float a = fabsf(in[i]);
-        out[i] = sqrtf(a * sqrtf(a));
-    }
-#endif /* USE_REALLY_FULL_SEARCH */
-}
+/* Parameter of f(x) = a*(100/lambda), defines how much PNS is allowed to
+ * replace low energy non zero bands */
+#define NOISE_LAMBDA_REPLACE 1.948f
 
-static const uint8_t aac_cb_range [12] = {0, 3, 3, 3, 3, 9, 9, 8, 8, 13, 13, 17};
-static const uint8_t aac_cb_maxval[12] = {0, 1, 1, 2, 2, 4, 4, 7, 7, 12, 12, 16};
-
-/**
- * Calculate rate distortion cost for quantizing with given codebook
- *
- * @return quantization distortion
- */
-static av_always_inline float quantize_and_encode_band_cost_template(
-                                struct AACEncContext *s,
-                                PutBitContext *pb, const float *in,
-                                const float *scaled, int size, int scale_idx,
-                                int cb, const float lambda, const float uplim,
-                                int *bits, int BT_ZERO, int BT_UNSIGNED,
-                                int BT_PAIR, int BT_ESC)
-{
-    const int q_idx = POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512;
-    const float Q   = ff_aac_pow2sf_tab [q_idx];
-    const float Q34 = ff_aac_pow34sf_tab[q_idx];
-    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
-    const float CLIPPED_ESCAPE = 165140.0f*IQ;
-    int i, j;
-    float cost = 0;
-    const int dim = BT_PAIR ? 2 : 4;
-    int resbits = 0;
-    const int range  = aac_cb_range[cb];
-    const int maxval = aac_cb_maxval[cb];
-    int off;
-
-    if (BT_ZERO) {
-        for (i = 0; i < size; i++)
-            cost += in[i]*in[i];
-        if (bits)
-            *bits = 0;
-        return cost * lambda;
-    }
-    if (!scaled) {
-        abs_pow34_v(s->scoefs, in, size);
-        scaled = s->scoefs;
-    }
-    quantize_bands(s->qcoefs, in, scaled, size, Q34, !BT_UNSIGNED, maxval);
-    if (BT_UNSIGNED) {
-        off = 0;
-    } else {
-        off = maxval;
-    }
-    for (i = 0; i < size; i += dim) {
-        const float *vec;
-        int *quants = s->qcoefs + i;
-        int curidx = 0;
-        int curbits;
-        float rd = 0.0f;
-        for (j = 0; j < dim; j++) {
-            curidx *= range;
-            curidx += quants[j] + off;
-        }
-        curbits =  ff_aac_spectral_bits[cb-1][curidx];
-        vec     = &ff_aac_codebook_vectors[cb-1][curidx*dim];
-        if (BT_UNSIGNED) {
-            for (j = 0; j < dim; j++) {
-                float t = fabsf(in[i+j]);
-                float di;
-                if (BT_ESC && vec[j] == 64.0f) { //FIXME: slow
-                    if (t >= CLIPPED_ESCAPE) {
-                        di = t - CLIPPED_ESCAPE;
-                        curbits += 21;
-                    } else {
-                        int c = av_clip_uintp2(quant(t, Q), 13);
-                        di = t - c*cbrtf(c)*IQ;
-                        curbits += av_log2(c)*2 - 4 + 1;
-                    }
-                } else {
-                    di = t - vec[j]*IQ;
-                }
-                if (vec[j] != 0.0f)
-                    curbits++;
-                rd += di*di;
-            }
-        } else {
-            for (j = 0; j < dim; j++) {
-                float di = in[i+j] - vec[j]*IQ;
-                rd += di*di;
-            }
-        }
-        cost    += rd * lambda + curbits;
-        resbits += curbits;
-        if (cost >= uplim)
-            return uplim;
-        if (pb) {
-            put_bits(pb, ff_aac_spectral_bits[cb-1][curidx], ff_aac_spectral_codes[cb-1][curidx]);
-            if (BT_UNSIGNED)
-                for (j = 0; j < dim; j++)
-                    if (ff_aac_codebook_vectors[cb-1][curidx*dim+j] != 0.0f)
-                        put_bits(pb, 1, in[i+j] < 0.0f);
-            if (BT_ESC) {
-                for (j = 0; j < 2; j++) {
-                    if (ff_aac_codebook_vectors[cb-1][curidx*2+j] == 64.0f) {
-                        int coef = av_clip_uintp2(quant(fabsf(in[i+j]), Q), 13);
-                        int len = av_log2(coef);
-
-                        put_bits(pb, len - 4 + 1, (1 << (len - 4 + 1)) - 2);
-                        put_bits(pb, len, coef & ((1 << len) - 1));
-                    }
-                }
-            }
-        }
-    }
-
-    if (bits)
-        *bits = resbits;
-    return cost;
-}
-
-#define QUANTIZE_AND_ENCODE_BAND_COST_FUNC(NAME, BT_ZERO, BT_UNSIGNED, BT_PAIR, BT_ESC) \
-static float quantize_and_encode_band_cost_ ## NAME(                                        \
-                                struct AACEncContext *s,                                \
-                                PutBitContext *pb, const float *in,                     \
-                                const float *scaled, int size, int scale_idx,           \
-                                int cb, const float lambda, const float uplim,          \
-                                int *bits) {                                            \
-    return quantize_and_encode_band_cost_template(                                      \
-                                s, pb, in, scaled, size, scale_idx,                     \
-                                BT_ESC ? ESC_BT : cb, lambda, uplim, bits,              \
-                                BT_ZERO, BT_UNSIGNED, BT_PAIR, BT_ESC);                 \
-}
-
-QUANTIZE_AND_ENCODE_BAND_COST_FUNC(ZERO,  1, 0, 0, 0)
-QUANTIZE_AND_ENCODE_BAND_COST_FUNC(SQUAD, 0, 0, 0, 0)
-QUANTIZE_AND_ENCODE_BAND_COST_FUNC(UQUAD, 0, 1, 0, 0)
-QUANTIZE_AND_ENCODE_BAND_COST_FUNC(SPAIR, 0, 0, 1, 0)
-QUANTIZE_AND_ENCODE_BAND_COST_FUNC(UPAIR, 0, 1, 1, 0)
-QUANTIZE_AND_ENCODE_BAND_COST_FUNC(ESC,   0, 1, 1, 1)
-
-static float (*const quantize_and_encode_band_cost_arr[])(
-                                struct AACEncContext *s,
-                                PutBitContext *pb, const float *in,
-                                const float *scaled, int size, int scale_idx,
-                                int cb, const float lambda, const float uplim,
-                                int *bits) = {
-    quantize_and_encode_band_cost_ZERO,
-    quantize_and_encode_band_cost_SQUAD,
-    quantize_and_encode_band_cost_SQUAD,
-    quantize_and_encode_band_cost_UQUAD,
-    quantize_and_encode_band_cost_UQUAD,
-    quantize_and_encode_band_cost_SPAIR,
-    quantize_and_encode_band_cost_SPAIR,
-    quantize_and_encode_band_cost_UPAIR,
-    quantize_and_encode_band_cost_UPAIR,
-    quantize_and_encode_band_cost_UPAIR,
-    quantize_and_encode_band_cost_UPAIR,
-    quantize_and_encode_band_cost_ESC,
-};
-
-#define quantize_and_encode_band_cost(                                  \
-                                s, pb, in, scaled, size, scale_idx, cb, \
-                                lambda, uplim, bits)                    \
-    quantize_and_encode_band_cost_arr[cb](                              \
-                                s, pb, in, scaled, size, scale_idx, cb, \
-                                lambda, uplim, bits)
-
-static float quantize_band_cost(struct AACEncContext *s, const float *in,
-                                const float *scaled, int size, int scale_idx,
-                                int cb, const float lambda, const float uplim,
-                                int *bits)
-{
-    return quantize_and_encode_band_cost(s, NULL, in, scaled, size, scale_idx,
-                                         cb, lambda, uplim, bits);
-}
-
-static void quantize_and_encode_band(struct AACEncContext *s, PutBitContext *pb,
-                                     const float *in, int size, int scale_idx,
-                                     int cb, const float lambda)
-{
-    quantize_and_encode_band_cost(s, pb, in, NULL, size, scale_idx, cb, lambda,
-                                  INFINITY, NULL);
-}
-
-static float find_max_val(int group_len, int swb_size, const float *scaled) {
-    float maxval = 0.0f;
-    int w2, i;
-    for (w2 = 0; w2 < group_len; w2++) {
-        for (i = 0; i < swb_size; i++) {
-            maxval = FFMAX(maxval, scaled[w2*128+i]);
-        }
-    }
-    return maxval;
-}
-
-static int find_min_book(float maxval, int sf) {
-    float Q = ff_aac_pow2sf_tab[POW_SF2_ZERO - sf + SCALE_ONE_POS - SCALE_DIV_512];
-    float Q34 = sqrtf(Q * sqrtf(Q));
-    int qmaxval, cb;
-    qmaxval = maxval * Q34 + 0.4054f;
-    if      (qmaxval ==  0) cb = 0;
-    else if (qmaxval ==  1) cb = 1;
-    else if (qmaxval ==  2) cb = 3;
-    else if (qmaxval <=  4) cb = 5;
-    else if (qmaxval <=  7) cb = 7;
-    else if (qmaxval <= 12) cb = 9;
-    else                    cb = 11;
-    return cb;
-}
+#include "libavcodec/aaccoder_trellis.h"
 
 /**
  * structure used in optimal codebook search
@@ -312,7 +77,7 @@ typedef struct BandCodingPath {
 static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce,
                                      int win, int group_len, const float lambda)
 {
-    BandCodingPath path[120][12];
+    BandCodingPath path[120][CB_TOT_ALL];
     int w, swb, cb, start, size;
     int i, j;
     const int max_sfb  = sce->ics.max_sfb;
@@ -325,7 +90,7 @@ static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce
 
     abs_pow34_v(s->scoefs, sce->coeffs, 1024);
     start = win*128;
-    for (cb = 0; cb < 12; cb++) {
+    for (cb = 0; cb < CB_TOT_ALL; cb++) {
         path[0][cb].cost     = 0.0f;
         path[0][cb].prev_idx = -1;
         path[0][cb].run      = 0;
@@ -333,7 +98,7 @@ static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce
     for (swb = 0; swb < max_sfb; swb++) {
         size = sce->ics.swb_sizes[swb];
         if (sce->zeroes[win*16 + swb]) {
-            for (cb = 0; cb < 12; cb++) {
+            for (cb = 0; cb < CB_TOT_ALL; cb++) {
                 path[swb+1][cb].prev_idx = cb;
                 path[swb+1][cb].cost     = path[swb][cb].cost;
                 path[swb+1][cb].run      = path[swb][cb].run + 1;
@@ -343,15 +108,22 @@ static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce
             int mincb = next_mincb;
             next_minrd = INFINITY;
             next_mincb = 0;
-            for (cb = 0; cb < 12; cb++) {
+            for (cb = 0; cb < CB_TOT_ALL; cb++) {
                 float cost_stay_here, cost_get_here;
                 float rd = 0.0f;
+                if (cb >= 12 && sce->band_type[win*16+swb] < aac_cb_out_map[cb] ||
+                    cb  < aac_cb_in_map[sce->band_type[win*16+swb]] && sce->band_type[win*16+swb] > aac_cb_out_map[cb]) {
+                    path[swb+1][cb].prev_idx = -1;
+                    path[swb+1][cb].cost     = INFINITY;
+                    path[swb+1][cb].run      = path[swb][cb].run + 1;
+                    continue;
+                }
                 for (w = 0; w < group_len; w++) {
                     FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(win+w)*16+swb];
-                    rd += quantize_band_cost(s, sce->coeffs + start + w*128,
-                                             s->scoefs + start + w*128, size,
-                                             sce->sf_idx[(win+w)*16+swb], cb,
-                                             lambda / band->threshold, INFINITY, NULL);
+                    rd += quantize_band_cost(s, &sce->coeffs[start + w*128],
+                                             &s->scoefs[start + w*128], size,
+                                             sce->sf_idx[(win+w)*16+swb], aac_cb_out_map[cb],
+                                             lambda / band->threshold, INFINITY, NULL, NULL, 0);
                 }
                 cost_stay_here = path[swb][cb].cost + rd;
                 cost_get_here  = minrd              + rd + run_bits + 4;
@@ -379,11 +151,12 @@ static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce
     //convert resulting path from backward-linked list
     stack_len = 0;
     idx       = 0;
-    for (cb = 1; cb < 12; cb++)
+    for (cb = 1; cb < CB_TOT_ALL; cb++)
         if (path[max_sfb][cb].cost < path[max_sfb][idx].cost)
             idx = cb;
     ppos = max_sfb;
     while (ppos > 0) {
+        av_assert1(idx >= 0);
         cb = idx;
         stackrun[stack_len] = path[ppos][cb].run;
         stackcb [stack_len] = cb;
@@ -394,12 +167,13 @@ static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce
     //perform actual band info encoding
     start = 0;
     for (i = stack_len - 1; i >= 0; i--) {
-        put_bits(&s->pb, 4, stackcb[i]);
+        cb = aac_cb_out_map[stackcb[i]];
+        put_bits(&s->pb, 4, cb);
         count = stackrun[i];
-        memset(sce->zeroes + win*16 + start, !stackcb[i], count);
+        memset(sce->zeroes + win*16 + start, !cb, count);
         //XXX: memset when band_type is also uint8_t
         for (j = 0; j < count; j++) {
-            sce->band_type[win*16 + start] =  stackcb[i];
+            sce->band_type[win*16 + start] = cb;
             start++;
         }
         while (count >= run_esc) {
@@ -410,147 +184,54 @@ static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce
     }
 }
 
-static void codebook_trellis_rate(AACEncContext *s, SingleChannelElement *sce,
-                                  int win, int group_len, const float lambda)
+
+typedef struct TrellisPath {
+    float cost;
+    int prev;
+} TrellisPath;
+
+#define TRELLIS_STAGES 121
+#define TRELLIS_STATES (SCALE_MAX_DIFF+1)
+
+static void set_special_band_scalefactors(AACEncContext *s, SingleChannelElement *sce)
 {
-    BandCodingPath path[120][12];
-    int w, swb, cb, start, size;
-    int i, j;
-    const int max_sfb  = sce->ics.max_sfb;
-    const int run_bits = sce->ics.num_windows == 1 ? 5 : 3;
-    const int run_esc  = (1 << run_bits) - 1;
-    int idx, ppos, count;
-    int stackrun[120], stackcb[120], stack_len;
-    float next_minbits = INFINITY;
-    int next_mincb = 0;
+    int w, g;
+    int prevscaler_n = -255, prevscaler_i = 0;
+    int bands = 0;
 
-    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
-    start = win*128;
-    for (cb = 0; cb < 12; cb++) {
-        path[0][cb].cost     = run_bits+4;
-        path[0][cb].prev_idx = -1;
-        path[0][cb].run      = 0;
-    }
-    for (swb = 0; swb < max_sfb; swb++) {
-        size = sce->ics.swb_sizes[swb];
-        if (sce->zeroes[win*16 + swb]) {
-            float cost_stay_here = path[swb][0].cost;
-            float cost_get_here  = next_minbits + run_bits + 4;
-            if (   run_value_bits[sce->ics.num_windows == 8][path[swb][0].run]
-                != run_value_bits[sce->ics.num_windows == 8][path[swb][0].run+1])
-                cost_stay_here += run_bits;
-            if (cost_get_here < cost_stay_here) {
-                path[swb+1][0].prev_idx = next_mincb;
-                path[swb+1][0].cost     = cost_get_here;
-                path[swb+1][0].run      = 1;
-            } else {
-                path[swb+1][0].prev_idx = 0;
-                path[swb+1][0].cost     = cost_stay_here;
-                path[swb+1][0].run      = path[swb][0].run + 1;
-            }
-            next_minbits = path[swb+1][0].cost;
-            next_mincb = 0;
-            for (cb = 1; cb < 12; cb++) {
-                path[swb+1][cb].cost = 61450;
-                path[swb+1][cb].prev_idx = -1;
-                path[swb+1][cb].run = 0;
-            }
-        } else {
-            float minbits = next_minbits;
-            int mincb = next_mincb;
-            int startcb = sce->band_type[win*16+swb];
-            next_minbits = INFINITY;
-            next_mincb = 0;
-            for (cb = 0; cb < startcb; cb++) {
-                path[swb+1][cb].cost = 61450;
-                path[swb+1][cb].prev_idx = -1;
-                path[swb+1][cb].run = 0;
-            }
-            for (cb = startcb; cb < 12; cb++) {
-                float cost_stay_here, cost_get_here;
-                float bits = 0.0f;
-                for (w = 0; w < group_len; w++) {
-                    bits += quantize_band_cost(s, sce->coeffs + start + w*128,
-                                               s->scoefs + start + w*128, size,
-                                               sce->sf_idx[(win+w)*16+swb], cb,
-                                               0, INFINITY, NULL);
-                }
-                cost_stay_here = path[swb][cb].cost + bits;
-                cost_get_here  = minbits            + bits + run_bits + 4;
-                if (   run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run]
-                    != run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run+1])
-                    cost_stay_here += run_bits;
-                if (cost_get_here < cost_stay_here) {
-                    path[swb+1][cb].prev_idx = mincb;
-                    path[swb+1][cb].cost     = cost_get_here;
-                    path[swb+1][cb].run      = 1;
-                } else {
-                    path[swb+1][cb].prev_idx = cb;
-                    path[swb+1][cb].cost     = cost_stay_here;
-                    path[swb+1][cb].run      = path[swb][cb].run + 1;
-                }
-                if (path[swb+1][cb].cost < next_minbits) {
-                    next_minbits = path[swb+1][cb].cost;
-                    next_mincb = cb;
-                }
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        for (g = 0;  g < sce->ics.num_swb; g++) {
+            if (sce->zeroes[w*16+g])
+                continue;
+            if (sce->band_type[w*16+g] == INTENSITY_BT || sce->band_type[w*16+g] == INTENSITY_BT2) {
+                sce->sf_idx[w*16+g] = av_clip(roundf(log2f(sce->is_ener[w*16+g])*2), -155, 100);
+                bands++;
+            } else if (sce->band_type[w*16+g] == NOISE_BT) {
+                sce->sf_idx[w*16+g] = av_clip(3+ceilf(log2f(sce->pns_ener[w*16+g])*2), -100, 155);
+                if (prevscaler_n == -255)
+                    prevscaler_n = sce->sf_idx[w*16+g];
+                bands++;
             }
         }
-        start += sce->ics.swb_sizes[swb];
     }
 
-    //convert resulting path from backward-linked list
-    stack_len = 0;
-    idx       = 0;
-    for (cb = 1; cb < 12; cb++)
-        if (path[max_sfb][cb].cost < path[max_sfb][idx].cost)
-            idx = cb;
-    ppos = max_sfb;
-    while (ppos > 0) {
-        assert(idx >= 0);
-        cb = idx;
-        stackrun[stack_len] = path[ppos][cb].run;
-        stackcb [stack_len] = cb;
-        idx = path[ppos-path[ppos][cb].run+1][cb].prev_idx;
-        ppos -= path[ppos][cb].run;
-        stack_len++;
-    }
-    //perform actual band info encoding
-    start = 0;
-    for (i = stack_len - 1; i >= 0; i--) {
-        put_bits(&s->pb, 4, stackcb[i]);
-        count = stackrun[i];
-        memset(sce->zeroes + win*16 + start, !stackcb[i], count);
-        //XXX: memset when band_type is also uint8_t
-        for (j = 0; j < count; j++) {
-            sce->band_type[win*16 + start] =  stackcb[i];
-            start++;
-        }
-        while (count >= run_esc) {
-            put_bits(&s->pb, run_bits, run_esc);
-            count -= run_esc;
+    if (!bands)
+        return;
+
+    /* Clip the scalefactor indices */
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        for (g = 0;  g < sce->ics.num_swb; g++) {
+            if (sce->zeroes[w*16+g])
+                continue;
+            if (sce->band_type[w*16+g] == INTENSITY_BT || sce->band_type[w*16+g] == INTENSITY_BT2) {
+                sce->sf_idx[w*16+g] = prevscaler_i = av_clip(sce->sf_idx[w*16+g], prevscaler_i - SCALE_MAX_DIFF, prevscaler_i + SCALE_MAX_DIFF);
+            } else if (sce->band_type[w*16+g] == NOISE_BT) {
+                sce->sf_idx[w*16+g] = prevscaler_n = av_clip(sce->sf_idx[w*16+g], prevscaler_n - SCALE_MAX_DIFF, prevscaler_n + SCALE_MAX_DIFF);
+            }
         }
-        put_bits(&s->pb, run_bits, count);
     }
 }
 
-/** Return the minimum scalefactor where the quantized coef does not clip. */
-static av_always_inline uint8_t coef2minsf(float coef) {
-    return av_clip_uint8(log2f(coef)*4 - 69 + SCALE_ONE_POS - SCALE_DIV_512);
-}
-
-/** Return the maximum scalefactor where the quantized coef is not zero. */
-static av_always_inline uint8_t coef2maxsf(float coef) {
-    return av_clip_uint8(log2f(coef)*4 +  6 + SCALE_ONE_POS - SCALE_DIV_512);
-}
-
-typedef struct TrellisPath {
-    float cost;
-    int prev;
-} TrellisPath;
-
-#define TRELLIS_STAGES 121
-#define TRELLIS_STATES (SCALE_MAX_DIFF+1)
-
 static void search_for_quantizers_anmr(AVCodecContext *avctx, AACEncContext *s,
                                        SingleChannelElement *sce,
                                        const float lambda)
@@ -582,9 +263,9 @@ static void search_for_quantizers_anmr(AVCodecContext *avctx, AACEncContext *s,
     }
 
     //minimum scalefactor index is when minimum nonzero coefficient after quantizing is not clipped
-    q0 = coef2minsf(q0f);
+    q0 = av_clip(coef2minsf(q0f), 0, SCALE_MAX_POS-1);
     //maximum scalefactor index is when maximum coefficient after quantizing is still not zero
-    q1 = coef2maxsf(q1f);
+    q1 = av_clip(coef2maxsf(q1f), 1, SCALE_MAX_POS);
     if (q1 - q0 > 60) {
         int q0low  = q0;
         int q1high = q1;
@@ -600,6 +281,12 @@ static void search_for_quantizers_anmr(AVCodecContext *avctx, AACEncContext *s,
             q1  = q1high;
         }
     }
+    // q0 == q1 isn't really a legal situation
+    if (q0 == q1) {
+        // the following is indirect but guarantees q1 != q0 && q1 near q0
+        q1 = av_clip(q0+1, 1, SCALE_MAX_POS);
+        q0 = av_clip(q1-1, 0, SCALE_MAX_POS - 1);
+    }
 
     for (i = 0; i < TRELLIS_STATES; i++) {
         paths[0][i].cost    = 0.0f;
@@ -616,7 +303,7 @@ static void search_for_quantizers_anmr(AVCodecContext *avctx, AACEncContext *s,
     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
         start = w*128;
         for (g = 0; g < sce->ics.num_swb; g++) {
-            const float *coefs = sce->coeffs + start;
+            const float *coefs = &sce->coeffs[start];
             float qmin, qmax;
             int nz = 0;
 
@@ -648,6 +335,10 @@ static void search_for_quantizers_anmr(AVCodecContext *avctx, AACEncContext *s,
                 maxscale = coef2maxsf(qmax);
                 minscale = av_clip(minscale - q0, 0, TRELLIS_STATES - 1);
                 maxscale = av_clip(maxscale - q0, 0, TRELLIS_STATES);
+                if (minscale == maxscale) {
+                    maxscale = av_clip(minscale+1, 1, TRELLIS_STATES);
+                    minscale = av_clip(maxscale-1, 0, TRELLIS_STATES - 1);
+                }
                 maxval = find_max_val(sce->ics.group_len[w], sce->ics.swb_sizes[g], s->scoefs+start);
                 for (q = minscale; q < maxscale; q++) {
                     float dist = 0;
@@ -655,7 +346,7 @@ static void search_for_quantizers_anmr(AVCodecContext *avctx, AACEncContext *s,
                     for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
                         FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
                         dist += quantize_band_cost(s, coefs + w2*128, s->scoefs + start + w2*128, sce->ics.swb_sizes[g],
-                                                   q + q0, cb, lambda / band->threshold, INFINITY, NULL);
+                                                   q + q0, cb, lambda / band->threshold, INFINITY, NULL, NULL, 0);
                     }
                     minrd = FFMIN(minrd, dist);
 
@@ -691,7 +382,7 @@ static void search_for_quantizers_anmr(AVCodecContext *avctx, AACEncContext *s,
     }
     while (idx) {
         sce->sf_idx[bandaddr[idx]] = minq + q0;
-        minq = paths[idx][minq].prev;
+        minq = FFMAX(paths[idx][minq].prev, 0);
         idx--;
     }
     //set the same quantizers inside window groups
@@ -701,440 +392,460 @@ static void search_for_quantizers_anmr(AVCodecContext *avctx, AACEncContext *s,
                 sce->sf_idx[(w+w2)*16+g] = sce->sf_idx[w*16+g];
 }
 
-/**
- * two-loop quantizers search taken from ISO 13818-7 Appendix C
- */
-static void search_for_quantizers_twoloop(AVCodecContext *avctx,
-                                          AACEncContext *s,
-                                          SingleChannelElement *sce,
-                                          const float lambda)
+static void search_for_quantizers_fast(AVCodecContext *avctx, AACEncContext *s,
+                                       SingleChannelElement *sce,
+                                       const float lambda) /* flags zero bands from the psy data, then forces every sf_idx to 140; avctx and lambda are not read */
 {
-    int start = 0, i, w, w2, g;
-    int destbits = avctx->bit_rate * 1024.0 / avctx->sample_rate / avctx->channels * (lambda / 120.f);
-    float dists[128] = { 0 }, uplims[128];
-    float maxvals[128];
-    int fflag, minscaler;
-    int its  = 0;
-    int allz = 0;
-    float minthr = INFINITY;
+    int i, w, w2, g;
+    int minq = 255; /* running minimum of the per-band estimates; only referenced by the disabled clip further down */
 
-    // for values above this the decoder might end up in an endless loop
-    // due to always having more bits than what can be encoded.
-    destbits = FFMIN(destbits, 5800);
-    //XXX: some heuristic to determine initial quantizers will reduce search time
-    //determine zero bands and upper limits
+    memset(sce->sf_idx, 0, sizeof(sce->sf_idx));
     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-        for (g = 0;  g < sce->ics.num_swb; g++) {
-            int nz = 0;
-            float uplim = 0.0f;
+        for (g = 0; g < sce->ics.num_swb; g++) {
             for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
                 FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
-                uplim += band->threshold;
-                if (band->energy <= band->threshold || band->threshold == 0.0f) {
+                if (band->energy <= band->threshold) { /* energy does not exceed the psy threshold: flag the band as zero */
+                    sce->sf_idx[(w+w2)*16+g] = 218;
                     sce->zeroes[(w+w2)*16+g] = 1;
-                    continue;
+                } else {
+                    sce->sf_idx[(w+w2)*16+g] = av_clip(SCALE_ONE_POS - SCALE_DIV_512 + log2f(band->threshold), 80, 218);
+                    sce->zeroes[(w+w2)*16+g] = 0;
                 }
-                nz = 1;
-            }
-            uplims[w*16+g] = uplim *512;
-            sce->zeroes[w*16+g] = !nz;
-            if (nz)
-                minthr = FFMIN(minthr, uplim);
-            allz |= nz;
-        }
-    }
-    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-        for (g = 0;  g < sce->ics.num_swb; g++) {
-            if (sce->zeroes[w*16+g]) {
-                sce->sf_idx[w*16+g] = SCALE_ONE_POS;
-                continue;
+                minq = FFMIN(minq, sce->sf_idx[(w+w2)*16+g]);
             }
-            sce->sf_idx[w*16+g] = SCALE_ONE_POS + FFMIN(log2f(uplims[w*16+g]/minthr)*4,59);
         }
     }
-
-    if (!allz)
-        return;
-    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
-
-    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-        start = w*128;
-        for (g = 0;  g < sce->ics.num_swb; g++) {
-            const float *scaled = s->scoefs + start;
-            maxvals[w*16+g] = find_max_val(sce->ics.group_len[w], sce->ics.swb_sizes[g], scaled);
-            start += sce->ics.swb_sizes[g];
-        }
+    for (i = 0; i < 128; i++) {
+        sce->sf_idx[i] = 140; /* replaces every per-band estimate computed above with one constant */
+        //av_clip(sce->sf_idx[i], minq, minq + SCALE_MAX_DIFF - 1);
     }
-
-    //perform two-loop search
-    //outer loop - improve quality
-    do {
-        int tbits, qstep;
-        minscaler = sce->sf_idx[0];
-        //inner loop - quantize spectrum to fit into given number of bits
-        qstep = its ? 1 : 32;
-        do {
-            int prev = -1;
-            tbits = 0;
-            fflag = 0;
-            for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-                start = w*128;
-                for (g = 0;  g < sce->ics.num_swb; g++) {
-                    const float *coefs = sce->coeffs + start;
-                    const float *scaled = s->scoefs + start;
-                    int bits = 0;
-                    int cb;
-                    float dist = 0.0f;
-
-                    if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
-                        start += sce->ics.swb_sizes[g];
-                        continue;
-                    }
-                    minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
-                    cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
-                    for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
-                        int b;
-                        dist += quantize_band_cost(s, coefs + w2*128,
-                                                   scaled + w2*128,
-                                                   sce->ics.swb_sizes[g],
-                                                   sce->sf_idx[w*16+g],
-                                                   cb,
-                                                   1.0f,
-                                                   INFINITY,
-                                                   &b);
-                        bits += b;
-                    }
-                    dists[w*16+g] = dist - bits;
-                    if (prev != -1) {
-                        bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
-                    }
-                    tbits += bits;
-                    start += sce->ics.swb_sizes[g];
-                    prev = sce->sf_idx[w*16+g];
-                }
-            }
-            if (tbits > destbits) {
-                for (i = 0; i < 128; i++)
-                    if (sce->sf_idx[i] < 218 - qstep)
-                        sce->sf_idx[i] += qstep;
-            } else {
-                for (i = 0; i < 128; i++)
-                    if (sce->sf_idx[i] > 60 - qstep)
-                        sce->sf_idx[i] -= qstep;
-            }
-            qstep >>= 1;
-            if (!qstep && tbits > destbits*1.02 && sce->sf_idx[0] < 217)
-                qstep = 1;
-        } while (qstep);
-
-        fflag = 0;
-        minscaler = av_clip(minscaler, 60, 255 - SCALE_MAX_DIFF);
-        for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-            for (g = 0; g < sce->ics.num_swb; g++) {
-                int prevsc = sce->sf_idx[w*16+g];
-                if (dists[w*16+g] > uplims[w*16+g] && sce->sf_idx[w*16+g] > 60) {
-                    if (find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]-1))
-                        sce->sf_idx[w*16+g]--;
-                    else //Try to make sure there is some energy in every band
-                        sce->sf_idx[w*16+g]-=2;
-                }
-                sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minscaler, minscaler + SCALE_MAX_DIFF);
-                sce->sf_idx[w*16+g] = FFMIN(sce->sf_idx[w*16+g], 219);
-                if (sce->sf_idx[w*16+g] != prevsc)
-                    fflag = 1;
-                sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
-            }
-        }
-        its++;
-    } while (fflag && its < 10);
+    //set the same quantizers inside window groups
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w])
+        for (g = 0;  g < sce->ics.num_swb; g++)
+            for (w2 = 1; w2 < sce->ics.group_len[w]; w2++)
+                sce->sf_idx[(w+w2)*16+g] = sce->sf_idx[w*16+g];
 }
 
-static void search_for_quantizers_faac(AVCodecContext *avctx, AACEncContext *s,
-                                       SingleChannelElement *sce,
-                                       const float lambda)
+static void search_for_pns(AACEncContext *s, AVCodecContext *avctx, SingleChannelElement *sce) /* per band: either switch it to NOISE_BT (clearing its zero flag) or leave it alone; pns_ener[] is written for bands inside the PNS range */
 {
-    int start = 0, i, w, w2, g;
-    float uplim[128], maxq[128];
-    int minq, maxsf;
-    float distfact = ((sce->ics.num_windows > 1) ? 85.80 : 147.84) / lambda;
-    int last = 0, lastband = 0, curband = 0;
-    float avg_energy = 0.0;
-    if (sce->ics.num_windows == 1) {
-        start = 0;
-        for (i = 0; i < 1024; i++) {
-            if (i - start >= sce->ics.swb_sizes[curband]) {
-                start += sce->ics.swb_sizes[curband];
-                curband++;
-            }
-            if (sce->coeffs[i]) {
-                avg_energy += sce->coeffs[i] * sce->coeffs[i];
-                last = i;
-                lastband = curband;
-            }
-        }
+    FFPsyBand *band;
+    int w, g, w2, i;
+    int wlen = 1024 / sce->ics.num_windows; /* per-window share of the 1024 coefficients */
+    int bandwidth, cutoff;
+    float *PNS = &s->scoefs[0*128], *PNS34 = &s->scoefs[1*128]; /* scratch: generated noise and its abs_pow34_v() output */
+    float *NOR34 = &s->scoefs[3*128]; /* scratch: abs_pow34_v() output of the original coefficients */
+    uint8_t nextband[128]; /* filled by ff_init_nextband_map() below */
+    const float lambda = s->lambda;
+    const float freq_mult = avctx->sample_rate*0.5f/wlen;
+    const float thr_mult = NOISE_LAMBDA_REPLACE*(100.0f/lambda);
+    const float spread_threshold = FFMIN(0.75f, NOISE_SPREAD_THRESHOLD*FFMAX(0.5f, lambda/100.f));
+    const float dist_bias = av_clipf(4.f * 120 / lambda, 0.25f, 4.0f);
+    const float pns_transient_energy_r = FFMIN(0.7f, lambda / 140.f);
+
+    int refbits = avctx->bit_rate * 1024.0 / avctx->sample_rate
+        / ((avctx->flags & CODEC_FLAG_QSCALE) ? 2.0f : avctx->channels)
+        * (lambda / 120.f);
+
+    /** Keep this in sync with twoloop's cutoff selection */
+    float rate_bandwidth_multiplier = 1.5f;
+    int prev = -1000, prev_sf = -1; /* prev: noise_sfi of the last band switched to PNS (-1000 = none yet); prev_sf: sf_idx of the last non-zeroed band that was not switched */
+    int frame_bit_rate = (avctx->flags & CODEC_FLAG_QSCALE)
+        ? (refbits * rate_bandwidth_multiplier * avctx->sample_rate / 1024)
+        : (avctx->bit_rate / avctx->channels);
+
+    frame_bit_rate *= 1.15f;
+
+    if (avctx->cutoff > 0) {
+        bandwidth = avctx->cutoff;
     } else {
-        for (w = 0; w < 8; w++) {
-            const float *coeffs = sce->coeffs + w*128;
-            start = 0;
-            for (i = 0; i < 128; i++) {
-                if (i - start >= sce->ics.swb_sizes[curband]) {
-                    start += sce->ics.swb_sizes[curband];
-                    curband++;
-                }
-                if (coeffs[i]) {
-                    avg_energy += coeffs[i] * coeffs[i];
-                    last = FFMAX(last, i);
-                    lastband = FFMAX(lastband, curband);
-                }
-            }
-        }
-    }
-    last++;
-    avg_energy /= last;
-    if (avg_energy == 0.0f) {
-        for (i = 0; i < FF_ARRAY_ELEMS(sce->sf_idx); i++)
-            sce->sf_idx[i] = SCALE_ONE_POS;
-        return;
+        bandwidth = FFMAX(3000, AAC_CUTOFF_FROM_BITRATE(frame_bit_rate, 1, avctx->sample_rate));
     }
+
+    cutoff = bandwidth * 2 * wlen / avctx->sample_rate; /* bandwidth expressed as a coefficient offset; compared against band start offsets below */
+
+    memcpy(sce->band_alt, sce->band_type, sizeof(sce->band_type)); /* snapshot of band_type taken before any band is switched to NOISE_BT here */
+    ff_init_nextband_map(sce, nextband);
     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-        start = w*128;
-        for (g = 0; g < sce->ics.num_swb; g++) {
-            float *coefs   = sce->coeffs + start;
-            const int size = sce->ics.swb_sizes[g];
-            int start2 = start, end2 = start + size, peakpos = start;
-            float maxval = -1, thr = 0.0f, t;
-            maxq[w*16+g] = 0.0f;
-            if (g > lastband) {
-                maxq[w*16+g] = 0.0f;
-                start += size;
-                for (w2 = 0; w2 < sce->ics.group_len[w]; w2++)
-                    memset(coefs + w2*128, 0, sizeof(coefs[0])*size);
+        int wstart = w*128;
+        for (g = 0;  g < sce->ics.num_swb; g++) {
+            int noise_sfi;
+            float dist1 = 0.0f, dist2 = 0.0f, noise_amp;
+            float pns_energy = 0.0f, pns_tgt_energy, energy_ratio, dist_thresh;
+            float sfb_energy = 0.0f, threshold = 0.0f, spread = 2.0f;
+            float min_energy = -1.0f, max_energy = 0.0f;
+            const int start = wstart+sce->ics.swb_offset[g];
+            const float freq = (start-wstart)*freq_mult;
+            const float freq_boost = FFMAX(0.88f*freq/NOISE_LOW_LIMIT, 1.0f);
+            if (freq < NOISE_LOW_LIMIT || (start-wstart) >= cutoff) { /* band starts below NOISE_LOW_LIMIT or at/after the cutoff: never considered for PNS */
+                if (!sce->zeroes[w*16+g])
+                    prev_sf = sce->sf_idx[w*16+g];
                 continue;
             }
             for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
-                for (i = 0; i < size; i++) {
-                    float t = coefs[w2*128+i]*coefs[w2*128+i];
-                    maxq[w*16+g] = FFMAX(maxq[w*16+g], fabsf(coefs[w2*128 + i]));
-                    thr += t;
-                    if (sce->ics.num_windows == 1 && maxval < t) {
-                        maxval  = t;
-                        peakpos = start+i;
-                    }
+                band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
+                sfb_energy += band->energy;
+                spread     = FFMIN(spread, band->spread);
+                threshold  += band->threshold;
+                if (!w2) {
+                    min_energy = max_energy = band->energy;
+                } else {
+                    min_energy = FFMIN(min_energy, band->energy);
+                    max_energy = FFMAX(max_energy, band->energy);
                 }
             }
-            if (sce->ics.num_windows == 1) {
-                start2 = FFMAX(peakpos - 2, start2);
-                end2   = FFMIN(peakpos + 3, end2);
-            } else {
-                start2 -= start;
-                end2   -= start;
-            }
-            start += size;
-            thr = pow(thr / (avg_energy * (end2 - start2)), 0.3 + 0.1*(lastband - g) / lastband);
-            t   = 1.0 - (1.0 * start2 / last);
-            uplim[w*16+g] = distfact / (1.4 * thr + t*t*t + 0.075);
-        }
-    }
-    memset(sce->sf_idx, 0, sizeof(sce->sf_idx));
-    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
-    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-        start = w*128;
-        for (g = 0;  g < sce->ics.num_swb; g++) {
-            const float *coefs  = sce->coeffs + start;
-            const float *scaled = s->scoefs   + start;
-            const int size      = sce->ics.swb_sizes[g];
-            int scf, prev_scf, step;
-            int min_scf = -1, max_scf = 256;
-            float curdiff;
-            if (maxq[w*16+g] < 21.544) {
-                sce->zeroes[w*16+g] = 1;
-                start += size;
+
+            /* Ramps down at ~8000Hz and loosens the dist threshold */
+            dist_thresh = av_clipf(2.5f*NOISE_LOW_LIMIT/freq, 0.5f, 2.5f) * dist_bias;
+
+            /* PNS is acceptable when all of these are true:
+             * 1. high spread energy (noise-like band)
+             * 2. near-threshold energy (high PE means the random nature of PNS content will be noticed)
+             * 3. on short window groups, all windows have similar energy (variations in energy would be destroyed by PNS)
+             *
+             * At this stage, point 2 is relaxed for zeroed bands near the noise threshold (hole avoidance is more important)
+             */
+            if ((!sce->zeroes[w*16+g] && !ff_sfdelta_can_remove_band(sce, nextband, prev_sf, w*16+g)) ||
+                ((sce->zeroes[w*16+g] || !sce->band_alt[w*16+g]) && sfb_energy < threshold*sqrtf(1.0f/freq_boost)) || spread < spread_threshold ||
+                (!sce->zeroes[w*16+g] && sce->band_alt[w*16+g] && sfb_energy > threshold*thr_mult*freq_boost) ||
+                min_energy < pns_transient_energy_r * max_energy ) {
+                sce->pns_ener[w*16+g] = sfb_energy;
+                if (!sce->zeroes[w*16+g])
+                    prev_sf = sce->sf_idx[w*16+g];
                 continue;
             }
-            sce->zeroes[w*16+g] = 0;
-            scf  = prev_scf = av_clip(SCALE_ONE_POS - SCALE_DIV_512 - log2f(1/maxq[w*16+g])*16/3, 60, 218);
-            step = 16;
-            for (;;) {
-                float dist = 0.0f;
-                int quant_max;
 
-                for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
-                    int b;
-                    dist += quantize_band_cost(s, coefs + w2*128,
-                                               scaled + w2*128,
-                                               sce->ics.swb_sizes[g],
-                                               scf,
-                                               ESC_BT,
-                                               lambda,
-                                               INFINITY,
-                                               &b);
-                    dist -= b;
-                }
-                dist *= 1.0f / 512.0f / lambda;
-                quant_max = quant(maxq[w*16+g], ff_aac_pow2sf_tab[POW_SF2_ZERO - scf + SCALE_ONE_POS - SCALE_DIV_512]);
-                if (quant_max >= 8191) { // too much, return to the previous quantizer
-                    sce->sf_idx[w*16+g] = prev_scf;
-                    break;
+            pns_tgt_energy = sfb_energy*FFMIN(1.0f, spread*spread);
+            noise_sfi = av_clip(roundf(log2f(pns_tgt_energy)*2), -100, 155); /* Quantize */
+            noise_amp = -ff_aac_pow2sf_tab[noise_sfi + POW_SF2_ZERO];    /* Dequantize */
+            if (prev != -1000) {
+                int noise_sfdiff = noise_sfi - prev + SCALE_DIFF_ZERO;
+                if (noise_sfdiff < 0 || noise_sfdiff > 2*SCALE_MAX_DIFF) { /* step from the previous PNS band falls outside [0, 2*SCALE_MAX_DIFF]: leave this band as it is */
+                    if (!sce->zeroes[w*16+g])
+                        prev_sf = sce->sf_idx[w*16+g];
+                    continue;
                 }
-                prev_scf = scf;
-                curdiff = fabsf(dist - uplim[w*16+g]);
-                if (curdiff <= 1.0f)
-                    step = 0;
-                else
-                    step = log2f(curdiff);
-                if (dist > uplim[w*16+g])
-                    step = -step;
-                scf += step;
-                scf = av_clip_uint8(scf);
-                step = scf - prev_scf;
-                if (FFABS(step) <= 1 || (step > 0 && scf >= max_scf) || (step < 0 && scf <= min_scf)) {
-                    sce->sf_idx[w*16+g] = av_clip(scf, min_scf, max_scf);
-                    break;
+            }
+            for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                float band_energy, scale, pns_senergy;
+                const int start_c = (w+w2)*128+sce->ics.swb_offset[g];
+                band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
+                for (i = 0; i < sce->ics.swb_sizes[g]; i+=2) { /* two coefficients per iteration: each av_bmg_get() call fills both rnd[] entries */
+                    double rnd[2];
+                    av_bmg_get(&s->lfg, rnd);
+                    PNS[i+0] = (float)rnd[0];
+                    PNS[i+1] = (float)rnd[1];
                 }
-                if (step > 0)
-                    min_scf = prev_scf;
-                else
-                    max_scf = prev_scf;
+                band_energy = s->fdsp->scalarproduct_float(PNS, PNS, sce->ics.swb_sizes[g]);
+                scale = noise_amp/sqrtf(band_energy);
+                s->fdsp->vector_fmul_scalar(PNS, PNS, scale, sce->ics.swb_sizes[g]);
+                pns_senergy = s->fdsp->scalarproduct_float(PNS, PNS, sce->ics.swb_sizes[g]);
+                pns_energy += pns_senergy;
+                abs_pow34_v(NOR34, &sce->coeffs[start_c], sce->ics.swb_sizes[g]);
+                abs_pow34_v(PNS34, PNS, sce->ics.swb_sizes[g]);
+                dist1 += quantize_band_cost(s, &sce->coeffs[start_c],
+                                            NOR34,
+                                            sce->ics.swb_sizes[g],
+                                            sce->sf_idx[(w+w2)*16+g],
+                                            sce->band_alt[(w+w2)*16+g],
+                                            lambda/band->threshold, INFINITY, NULL, NULL, 0);
+                /* Estimate rd on average as 5 bits for SF, 4 for the CB, plus spread energy * lambda/thr */
+                dist2 += band->energy/(band->spread*band->spread)*lambda*dist_thresh/band->threshold;
+            }
+            if (g && sce->band_type[w*16+g-1] == NOISE_BT) { /* previous band is already NOISE_BT: smaller fixed cost is added */
+                dist2 += 5;
+            } else {
+                dist2 += 9;
+            }
+            energy_ratio = pns_tgt_energy/pns_energy; /* Compensates for quantization error */
+            sce->pns_ener[w*16+g] = energy_ratio*pns_tgt_energy;
+            if (sce->zeroes[w*16+g] || !sce->band_alt[w*16+g] || (energy_ratio > 0.85f && energy_ratio < 1.25f && dist2 < dist1)) {
+                sce->band_type[w*16+g] = NOISE_BT;
+                sce->zeroes[w*16+g] = 0;
+                prev = noise_sfi;
+            } else {
+                if (!sce->zeroes[w*16+g])
+                    prev_sf = sce->sf_idx[w*16+g];
             }
-            start += size;
         }
     }
-    minq = sce->sf_idx[0] ? sce->sf_idx[0] : INT_MAX;
-    for (i = 1; i < 128; i++) {
-        if (!sce->sf_idx[i])
-            sce->sf_idx[i] = sce->sf_idx[i-1];
-        else
-            minq = FFMIN(minq, sce->sf_idx[i]);
-    }
-    if (minq == INT_MAX)
-        minq = 0;
-    minq = FFMIN(minq, SCALE_MAX_POS);
-    maxsf = FFMIN(minq + SCALE_MAX_DIFF, SCALE_MAX_POS);
-    for (i = 126; i >= 0; i--) {
-        if (!sce->sf_idx[i])
-            sce->sf_idx[i] = sce->sf_idx[i+1];
-        sce->sf_idx[i] = av_clip(sce->sf_idx[i], minq, maxsf);
-    }
 }
 
-static void search_for_quantizers_fast(AVCodecContext *avctx, AACEncContext *s,
-                                       SingleChannelElement *sce,
-                                       const float lambda)
+static void mark_pns(AACEncContext *s, AVCodecContext *avctx, SingleChannelElement *sce) /* per band: sets can_pns[] from psy energy, threshold and spread (recording pns_ener[] for bands in range); band_type is only copied to band_alt, never modified */
 {
-    int i, w, w2, g;
-    int minq = 255;
+    FFPsyBand *band;
+    int w, g, w2;
+    int wlen = 1024 / sce->ics.num_windows; /* per-window share of the 1024 coefficients */
+    int bandwidth, cutoff;
+    const float lambda = s->lambda;
+    const float freq_mult = avctx->sample_rate*0.5f/wlen;
+    const float spread_threshold = FFMIN(0.75f, NOISE_SPREAD_THRESHOLD*FFMAX(0.5f, lambda/100.f));
+    const float pns_transient_energy_r = FFMIN(0.7f, lambda / 140.f);
+
+    int refbits = avctx->bit_rate * 1024.0 / avctx->sample_rate
+        / ((avctx->flags & CODEC_FLAG_QSCALE) ? 2.0f : avctx->channels)
+        * (lambda / 120.f);
+
+    /** Keep this in sync with twoloop's cutoff selection */
+    float rate_bandwidth_multiplier = 1.5f;
+    int frame_bit_rate = (avctx->flags & CODEC_FLAG_QSCALE)
+        ? (refbits * rate_bandwidth_multiplier * avctx->sample_rate / 1024)
+        : (avctx->bit_rate / avctx->channels);
+
+    frame_bit_rate *= 1.15f;
+
+    if (avctx->cutoff > 0) {
+        bandwidth = avctx->cutoff;
+    } else {
+        bandwidth = FFMAX(3000, AAC_CUTOFF_FROM_BITRATE(frame_bit_rate, 1, avctx->sample_rate));
+    }
 
-    memset(sce->sf_idx, 0, sizeof(sce->sf_idx));
+    cutoff = bandwidth * 2 * wlen / avctx->sample_rate; /* bandwidth expressed as a coefficient offset; compared against band start offsets below */
+
+    memcpy(sce->band_alt, sce->band_type, sizeof(sce->band_type));
     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-        for (g = 0; g < sce->ics.num_swb; g++) {
+        for (g = 0;  g < sce->ics.num_swb; g++) {
+            float sfb_energy = 0.0f, threshold = 0.0f, spread = 2.0f;
+            float min_energy = -1.0f, max_energy = 0.0f;
+            const int start = sce->ics.swb_offset[g]; /* band offset inside a window; no per-window base is added here */
+            const float freq = start*freq_mult;
+            const float freq_boost = FFMAX(0.88f*freq/NOISE_LOW_LIMIT, 1.0f);
+            if (freq < NOISE_LOW_LIMIT || start >= cutoff) { /* band starts below NOISE_LOW_LIMIT or at/after the cutoff: PNS not allowed */
+                sce->can_pns[w*16+g] = 0;
+                continue;
+            }
             for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
-                FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
-                if (band->energy <= band->threshold) {
-                    sce->sf_idx[(w+w2)*16+g] = 218;
-                    sce->zeroes[(w+w2)*16+g] = 1;
+                band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
+                sfb_energy += band->energy;
+                spread     = FFMIN(spread, band->spread);
+                threshold  += band->threshold;
+                if (!w2) {
+                    min_energy = max_energy = band->energy;
                 } else {
-                    sce->sf_idx[(w+w2)*16+g] = av_clip(SCALE_ONE_POS - SCALE_DIV_512 + log2f(band->threshold), 80, 218);
-                    sce->zeroes[(w+w2)*16+g] = 0;
+                    min_energy = FFMIN(min_energy, band->energy);
+                    max_energy = FFMAX(max_energy, band->energy);
                 }
-                minq = FFMIN(minq, sce->sf_idx[(w+w2)*16+g]);
+            }
+
+            /* PNS is acceptable when all of these are true:
+             * 1. high spread energy (noise-like band)
+             * 2. near-threshold energy (high PE means the random nature of PNS content will be noticed)
+             * 3. on short window groups, all windows have similar energy (variations in energy would be destroyed by PNS)
+             */
+            sce->pns_ener[w*16+g] = sfb_energy;
+            if (sfb_energy < threshold*sqrtf(1.5f/freq_boost) || spread < spread_threshold || min_energy < pns_transient_energy_r * max_energy) {
+                sce->can_pns[w*16+g] = 0;
+            } else {
+                sce->can_pns[w*16+g] = 1;
             }
         }
     }
-    for (i = 0; i < 128; i++) {
-        sce->sf_idx[i] = 140;
-        //av_clip(sce->sf_idx[i], minq, minq + SCALE_MAX_DIFF - 1);
-    }
-    //set the same quantizers inside window groups
-    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w])
-        for (g = 0;  g < sce->ics.num_swb; g++)
-            for (w2 = 1; w2 < sce->ics.group_len[w]; w2++)
-                sce->sf_idx[(w+w2)*16+g] = sce->sf_idx[w*16+g];
 }
 
-static void search_for_ms(AACEncContext *s, ChannelElement *cpe,
-                          const float lambda)
+static void search_for_ms(AACEncContext *s, ChannelElement *cpe)
 {
-    int start = 0, i, w, w2, g;
+    int start = 0, i, w, w2, g, sid_sf_boost, prev_mid, prev_side;
+    uint8_t nextband0[128], nextband1[128];
     float M[128], S[128];
     float *L34 = s->scoefs, *R34 = s->scoefs + 128, *M34 = s->scoefs + 128*2, *S34 = s->scoefs + 128*3;
+    const float lambda = s->lambda;
+    const float mslambda = FFMIN(1.0f, lambda / 120.f);
     SingleChannelElement *sce0 = &cpe->ch[0];
     SingleChannelElement *sce1 = &cpe->ch[1];
     if (!cpe->common_window)
         return;
+
+    /** Scout out next nonzero bands */
+    ff_init_nextband_map(sce0, nextband0);
+    ff_init_nextband_map(sce1, nextband1);
+
+    prev_mid = sce0->sf_idx[0];
+    prev_side = sce1->sf_idx[0];
     for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
+        start = 0;
         for (g = 0;  g < sce0->ics.num_swb; g++) {
-            if (!cpe->ch[0].zeroes[w*16+g] && !cpe->ch[1].zeroes[w*16+g]) {
-                float dist1 = 0.0f, dist2 = 0.0f;
+            float bmax = bval2bmax(g * 17.0f / sce0->ics.num_swb) / 0.0045f;
+            if (!cpe->is_mask[w*16+g])
+                cpe->ms_mask[w*16+g] = 0;
+            if (!sce0->zeroes[w*16+g] && !sce1->zeroes[w*16+g] && !cpe->is_mask[w*16+g]) {
+                float Mmax = 0.0f, Smax = 0.0f;
+
+                /* Must compute mid/side SF and book for the whole window group */
                 for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
-                    FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
-                    FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
-                    float minthr = FFMIN(band0->threshold, band1->threshold);
-                    float maxthr = FFMAX(band0->threshold, band1->threshold);
                     for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
-                        M[i] = (sce0->coeffs[start+w2*128+i]
-                              + sce1->coeffs[start+w2*128+i]) * 0.5;
+                        M[i] = (sce0->coeffs[start+(w+w2)*128+i]
+                              + sce1->coeffs[start+(w+w2)*128+i]) * 0.5;
                         S[i] =  M[i]
-                              - sce1->coeffs[start+w2*128+i];
+                              - sce1->coeffs[start+(w+w2)*128+i];
+                    }
+                    abs_pow34_v(M34, M, sce0->ics.swb_sizes[g]);
+                    abs_pow34_v(S34, S, sce0->ics.swb_sizes[g]);
+                    for (i = 0; i < sce0->ics.swb_sizes[g]; i++ ) {
+                        Mmax = FFMAX(Mmax, M34[i]);
+                        Smax = FFMAX(Smax, S34[i]);
+                    }
+                }
+
+                for (sid_sf_boost = 0; sid_sf_boost < 4; sid_sf_boost++) {
+                    float dist1 = 0.0f, dist2 = 0.0f;
+                    int B0 = 0, B1 = 0;
+                    int minidx;
+                    int mididx, sididx;
+                    int midcb, sidcb;
+
+                    minidx = FFMIN(sce0->sf_idx[w*16+g], sce1->sf_idx[w*16+g]);
+                    mididx = av_clip(minidx, 0, SCALE_MAX_POS - SCALE_DIV_512);
+                    sididx = av_clip(minidx - sid_sf_boost * 3, 0, SCALE_MAX_POS - SCALE_DIV_512);
+                    if (sce0->band_type[w*16+g] != NOISE_BT && sce1->band_type[w*16+g] != NOISE_BT
+                        && (   !ff_sfdelta_can_replace(sce0, nextband0, prev_mid, mididx, w*16+g)
+                            || !ff_sfdelta_can_replace(sce1, nextband1, prev_side, sididx, w*16+g))) {
+                        /* scalefactor range violation, bad stuff, will decrease quality unacceptably */
+                        continue;
+                    }
+
+                    midcb = find_min_book(Mmax, mididx);
+                    sidcb = find_min_book(Smax, sididx);
+
+                    /* No CB can be zero */
+                    midcb = FFMAX(1,midcb);
+                    sidcb = FFMAX(1,sidcb);
+
+                    for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
+                        FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
+                        FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
+                        float minthr = FFMIN(band0->threshold, band1->threshold);
+                        int b1,b2,b3,b4;
+                        for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
+                            M[i] = (sce0->coeffs[start+(w+w2)*128+i]
+                                  + sce1->coeffs[start+(w+w2)*128+i]) * 0.5;
+                            S[i] =  M[i]
+                                  - sce1->coeffs[start+(w+w2)*128+i];
+                        }
+
+                        abs_pow34_v(L34, sce0->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
+                        abs_pow34_v(R34, sce1->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
+                        abs_pow34_v(M34, M,                         sce0->ics.swb_sizes[g]);
+                        abs_pow34_v(S34, S,                         sce0->ics.swb_sizes[g]);
+                        dist1 += quantize_band_cost(s, &sce0->coeffs[start + (w+w2)*128],
+                                                    L34,
+                                                    sce0->ics.swb_sizes[g],
+                                                    sce0->sf_idx[w*16+g],
+                                                    sce0->band_type[w*16+g],
+                                                    lambda / band0->threshold, INFINITY, &b1, NULL, 0);
+                        dist1 += quantize_band_cost(s, &sce1->coeffs[start + (w+w2)*128],
+                                                    R34,
+                                                    sce1->ics.swb_sizes[g],
+                                                    sce1->sf_idx[w*16+g],
+                                                    sce1->band_type[w*16+g],
+                                                    lambda / band1->threshold, INFINITY, &b2, NULL, 0);
+                        dist2 += quantize_band_cost(s, M,
+                                                    M34,
+                                                    sce0->ics.swb_sizes[g],
+                                                    mididx,
+                                                    midcb,
+                                                    lambda / minthr, INFINITY, &b3, NULL, 0);
+                        dist2 += quantize_band_cost(s, S,
+                                                    S34,
+                                                    sce1->ics.swb_sizes[g],
+                                                    sididx,
+                                                    sidcb,
+                                                    mslambda / (minthr * bmax), INFINITY, &b4, NULL, 0);
+                        B0 += b1+b2;
+                        B1 += b3+b4;
+                        dist1 -= b1+b2;
+                        dist2 -= b3+b4;
+                    }
+                    cpe->ms_mask[w*16+g] = dist2 <= dist1 && B1 < B0;
+                    if (cpe->ms_mask[w*16+g]) {
+                        if (sce0->band_type[w*16+g] != NOISE_BT && sce1->band_type[w*16+g] != NOISE_BT) {
+                            sce0->sf_idx[w*16+g] = mididx;
+                            sce1->sf_idx[w*16+g] = sididx;
+                            sce0->band_type[w*16+g] = midcb;
+                            sce1->band_type[w*16+g] = sidcb;
+                        } else if ((sce0->band_type[w*16+g] != NOISE_BT) ^ (sce1->band_type[w*16+g] != NOISE_BT)) {
+                            /* ms_mask unneeded, and it confuses some decoders */
+                            cpe->ms_mask[w*16+g] = 0;
+                        }
+                        break;
+                    } else if (B1 > B0) {
+                        /* More boost won't fix this */
+                        break;
                     }
-                    abs_pow34_v(L34, sce0->coeffs+start+w2*128, sce0->ics.swb_sizes[g]);
-                    abs_pow34_v(R34, sce1->coeffs+start+w2*128, sce0->ics.swb_sizes[g]);
-                    abs_pow34_v(M34, M,                         sce0->ics.swb_sizes[g]);
-                    abs_pow34_v(S34, S,                         sce0->ics.swb_sizes[g]);
-                    dist1 += quantize_band_cost(s, sce0->coeffs + start + w2*128,
-                                                L34,
-                                                sce0->ics.swb_sizes[g],
-                                                sce0->sf_idx[(w+w2)*16+g],
-                                                sce0->band_type[(w+w2)*16+g],
-                                                lambda / band0->threshold, INFINITY, NULL);
-                    dist1 += quantize_band_cost(s, sce1->coeffs + start + w2*128,
-                                                R34,
-                                                sce1->ics.swb_sizes[g],
-                                                sce1->sf_idx[(w+w2)*16+g],
-                                                sce1->band_type[(w+w2)*16+g],
-                                                lambda / band1->threshold, INFINITY, NULL);
-                    dist2 += quantize_band_cost(s, M,
-                                                M34,
-                                                sce0->ics.swb_sizes[g],
-                                                sce0->sf_idx[(w+w2)*16+g],
-                                                sce0->band_type[(w+w2)*16+g],
-                                                lambda / maxthr, INFINITY, NULL);
-                    dist2 += quantize_band_cost(s, S,
-                                                S34,
-                                                sce1->ics.swb_sizes[g],
-                                                sce1->sf_idx[(w+w2)*16+g],
-                                                sce1->band_type[(w+w2)*16+g],
-                                                lambda / minthr, INFINITY, NULL);
                 }
-                cpe->ms_mask[w*16+g] = dist2 < dist1;
             }
+            if (!sce0->zeroes[w*16+g] && sce0->band_type[w*16+g] < RESERVED_BT)
+                prev_mid = sce0->sf_idx[w*16+g];
+            if (!sce1->zeroes[w*16+g] && !cpe->is_mask[w*16+g] && sce1->band_type[w*16+g] < RESERVED_BT)
+                prev_side = sce1->sf_idx[w*16+g];
             start += sce0->ics.swb_sizes[g];
         }
     }
 }
 
-AACCoefficientsEncoder ff_aac_coders[] = {
-    {
-        search_for_quantizers_faac,
-        encode_window_bands_info,
-        quantize_and_encode_band,
-        search_for_ms,
-    },
-    {
+AACCoefficientsEncoder ff_aac_coders[AAC_CODER_NB] = { /* one entry per AAC_CODER_* index; the members of each entry are initialized positionally */
+    [AAC_CODER_ANMR] = {
         search_for_quantizers_anmr,
         encode_window_bands_info,
         quantize_and_encode_band,
+        ff_aac_encode_tns_info,
+        ff_aac_encode_ltp_info,
+        ff_aac_encode_main_pred,
+        ff_aac_adjust_common_pred,
+        ff_aac_adjust_common_ltp,
+        ff_aac_apply_main_pred,
+        ff_aac_apply_tns,
+        ff_aac_update_ltp,
+        ff_aac_ltp_insert_new_frame,
+        set_special_band_scalefactors,
+        search_for_pns,
+        mark_pns,
+        ff_aac_search_for_tns,
+        ff_aac_search_for_ltp,
         search_for_ms,
+        ff_aac_search_for_is,
+        ff_aac_search_for_pred,
     },
-    {
+    [AAC_CODER_TWOLOOP] = { /* the only entry whose second member is codebook_trellis_rate instead of encode_window_bands_info */
         search_for_quantizers_twoloop,
         codebook_trellis_rate,
         quantize_and_encode_band,
+        ff_aac_encode_tns_info,
+        ff_aac_encode_ltp_info,
+        ff_aac_encode_main_pred,
+        ff_aac_adjust_common_pred,
+        ff_aac_adjust_common_ltp,
+        ff_aac_apply_main_pred,
+        ff_aac_apply_tns,
+        ff_aac_update_ltp,
+        ff_aac_ltp_insert_new_frame,
+        set_special_band_scalefactors,
+        search_for_pns,
+        mark_pns,
+        ff_aac_search_for_tns,
+        ff_aac_search_for_ltp,
         search_for_ms,
+        ff_aac_search_for_is,
+        ff_aac_search_for_pred,
     },
-    {
+    [AAC_CODER_FAST] = {
         search_for_quantizers_fast,
         encode_window_bands_info,
         quantize_and_encode_band,
+        ff_aac_encode_tns_info,
+        ff_aac_encode_ltp_info,
+        ff_aac_encode_main_pred,
+        ff_aac_adjust_common_pred,
+        ff_aac_adjust_common_ltp,
+        ff_aac_apply_main_pred,
+        ff_aac_apply_tns,
+        ff_aac_update_ltp,
+        ff_aac_ltp_insert_new_frame,
+        set_special_band_scalefactors,
+        search_for_pns,
+        mark_pns,
+        ff_aac_search_for_tns,
+        ff_aac_search_for_ltp,
         search_for_ms,
+        ff_aac_search_for_is,
+        ff_aac_search_for_pred,
     },
 };
diff --git a/libavcodec/aaccoder_trellis.h b/libavcodec/aaccoder_trellis.h
new file mode 100644
index 0000000..0230052
--- /dev/null
+++ b/libavcodec/aaccoder_trellis.h
@@ -0,0 +1,192 @@
+/*
+ * AAC encoder trellis codebook selector
+ * Copyright (C) 2008-2009 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder trellis codebook selector
+ * @author Konstantin Shishkov
+ */
+
+/**
+ * This file contains a template for the codebook_trellis_rate selector function.
+ * The following functions from aacenc_quantization/util.h must already be
+ * declared by the including file. They are not included explicitly here so
+ * that alternative implementations can be provided:
+ *  - quantize_band_cost_bits
+ *  - abs_pow34_v
+ */
+
+#ifndef AVCODEC_AACCODER_TRELLIS_H
+#define AVCODEC_AACCODER_TRELLIS_H
+
+#include <float.h>
+#include "libavutil/mathematics.h"
+#include "avcodec.h"
+#include "put_bits.h"
+#include "aac.h"
+#include "aacenc.h"
+#include "aactab.h"
+#include "aacenctab.h"
+
+/**
+ * structure used in optimal codebook search
+ */
+typedef struct TrellisBandCodingPath {
+    int prev_idx; ///< codebook index this path used for the previous band, -1 if there is none
+    float cost;   ///< accumulated path cost (estimated bits)
+    int run;      ///< number of consecutive bands, ending at this one, on the same codebook (0 if unset)
+} TrellisBandCodingPath;
+
+/** Trellis search for the cheapest codebook sectioning of window group win; writes the section data to s->pb and updates sce->band_type / sce->zeroes. lambda is not referenced in the body. */
+static void codebook_trellis_rate(AACEncContext *s, SingleChannelElement *sce,
+                                  int win, int group_len, const float lambda)
+{
+    TrellisBandCodingPath path[120][CB_TOT_ALL]; /* path[n][cb]: best path over the first n bands that ends on codebook index cb */
+    int w, swb, cb, start, size;
+    int i, j;
+    const int max_sfb  = sce->ics.max_sfb;
+    const int run_bits = sce->ics.num_windows == 1 ? 5 : 3; /* width of one run-length field */
+    const int run_esc  = (1 << run_bits) - 1; /* all-ones run value, emitted as an escape for longer runs */
+    int idx, ppos, count;
+    int stackrun[120], stackcb[120], stack_len;
+    float next_minbits = INFINITY;
+    int next_mincb = 0;
+
+    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
+    start = win*128;
+    for (cb = 0; cb < CB_TOT_ALL; cb++) {
+        path[0][cb].cost     = run_bits+4; /* every path starts with one section header: a run field plus the 4-bit codebook */
+        path[0][cb].prev_idx = -1;
+        path[0][cb].run      = 0;
+    }
+    for (swb = 0; swb < max_sfb; swb++) {
+        size = sce->ics.swb_sizes[swb];
+        if (sce->zeroes[win*16 + swb]) { /* zeroed band: only codebook index 0 stays usable */
+            float cost_stay_here = path[swb][0].cost;
+            float cost_get_here  = next_minbits + run_bits + 4; /* switching costs a new section header */
+            if (   run_value_bits[sce->ics.num_windows == 8][path[swb][0].run]
+                != run_value_bits[sce->ics.num_windows == 8][path[swb][0].run+1])
+                cost_stay_here += run_bits; /* table entry changes between run and run+1: charge another run field */
+            if (cost_get_here < cost_stay_here) {
+                path[swb+1][0].prev_idx = next_mincb;
+                path[swb+1][0].cost     = cost_get_here;
+                path[swb+1][0].run      = 1;
+            } else {
+                path[swb+1][0].prev_idx = 0;
+                path[swb+1][0].cost     = cost_stay_here;
+                path[swb+1][0].run      = path[swb][0].run + 1;
+            }
+            next_minbits = path[swb+1][0].cost;
+            next_mincb = 0;
+            for (cb = 1; cb < CB_TOT_ALL; cb++) {
+                path[swb+1][cb].cost = 61450; /* large fixed cost so these entries lose any comparison against a real path */
+                path[swb+1][cb].prev_idx = -1;
+                path[swb+1][cb].run = 0;
+            }
+        } else {
+            float minbits = next_minbits; /* cheapest path cost after the previous band ... */
+            int mincb = next_mincb;       /* ... and the codebook index it ends on */
+            int startcb = sce->band_type[win*16+swb];
+            startcb = aac_cb_in_map[startcb]; /* presumably maps a band type to its trellis codebook index; table is defined elsewhere */
+            next_minbits = INFINITY;
+            next_mincb = 0;
+            for (cb = 0; cb < startcb; cb++) { /* indices below the band's current one are excluded */
+                path[swb+1][cb].cost = 61450;
+                path[swb+1][cb].prev_idx = -1;
+                path[swb+1][cb].run = 0;
+            }
+            for (cb = startcb; cb < CB_TOT_ALL; cb++) {
+                float cost_stay_here, cost_get_here;
+                float bits = 0.0f;
+                if (cb >= 12 && sce->band_type[win*16+swb] != aac_cb_out_map[cb]) { /* indices >= 12 are only kept when they map back to the band's current band_type */
+                    path[swb+1][cb].cost = 61450;
+                    path[swb+1][cb].prev_idx = -1;
+                    path[swb+1][cb].run = 0;
+                    continue;
+                }
+                for (w = 0; w < group_len; w++) { /* sum the bit cost of this band over every window of the group */
+                    bits += quantize_band_cost_bits(s, &sce->coeffs[start + w*128],
+                                               &s->scoefs[start + w*128], size,
+                                               sce->sf_idx[win*16+swb],
+                                               aac_cb_out_map[cb],
+                                               0, INFINITY, NULL, NULL, 0);
+                }
+                cost_stay_here = path[swb][cb].cost + bits;
+                cost_get_here  = minbits            + bits + run_bits + 4;
+                if (   run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run]
+                    != run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run+1])
+                    cost_stay_here += run_bits;
+                if (cost_get_here < cost_stay_here) {
+                    path[swb+1][cb].prev_idx = mincb;
+                    path[swb+1][cb].cost     = cost_get_here;
+                    path[swb+1][cb].run      = 1;
+                } else {
+                    path[swb+1][cb].prev_idx = cb;
+                    path[swb+1][cb].cost     = cost_stay_here;
+                    path[swb+1][cb].run      = path[swb][cb].run + 1;
+                }
+                if (path[swb+1][cb].cost < next_minbits) {
+                    next_minbits = path[swb+1][cb].cost;
+                    next_mincb = cb;
+                }
+            }
+        }
+        start += sce->ics.swb_sizes[swb];
+    }
+
+    //convert resulting path from backward-linked list
+    stack_len = 0;
+    idx       = 0;
+    for (cb = 1; cb < CB_TOT_ALL; cb++) /* pick the cheapest end state; ties keep the lowest index */
+        if (path[max_sfb][cb].cost < path[max_sfb][idx].cost)
+            idx = cb;
+    ppos = max_sfb;
+    while (ppos > 0) {
+        av_assert1(idx >= 0);
+        cb = idx;
+        stackrun[stack_len] = path[ppos][cb].run;
+        stackcb [stack_len] = cb;
+        idx = path[ppos-path[ppos][cb].run+1][cb].prev_idx; /* predecessor is stored on the first band of this run */
+        ppos -= path[ppos][cb].run; /* step back over the whole run */
+        stack_len++;
+    }
+    //perform actual band info encoding
+    start = 0; /* from here on start counts bands, not coefficients */
+    for (i = stack_len - 1; i >= 0; i--) { /* the stack was filled last-section-first, so walk it backwards */
+        cb = aac_cb_out_map[stackcb[i]];
+        put_bits(&s->pb, 4, cb);
+        count = stackrun[i];
+        memset(sce->zeroes + win*16 + start, !cb, count); /* zeroes[] becomes 1 only where the written codebook is 0 */
+        //XXX: memset when band_type is also uint8_t
+        for (j = 0; j < count; j++) {
+            sce->band_type[win*16 + start] = cb;
+            start++;
+        }
+        while (count >= run_esc) { /* long runs: emit escape values, then the remainder */
+            put_bits(&s->pb, run_bits, run_esc);
+            count -= run_esc;
+        }
+        put_bits(&s->pb, run_bits, count);
+    }
+}
+
+
+#endif /* AVCODEC_AACCODER_TRELLIS_H */
diff --git a/libavcodec/aaccoder_twoloop.h b/libavcodec/aaccoder_twoloop.h
new file mode 100644
index 0000000..42aea52
--- /dev/null
+++ b/libavcodec/aaccoder_twoloop.h
@@ -0,0 +1,763 @@
+/*
+ * AAC encoder twoloop coder
+ * Copyright (C) 2008-2009 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder twoloop coder
+ * @author Konstantin Shishkov, Claudio Freire
+ */
+
+/**
+ * This file contains a template for the twoloop coder function.
+ * The following functions from aacenc_quantization/util.h must already be
+ * declared by the including file. They are not included explicitly here so
+ * that alternative implementations can be provided:
+ *  - quantize_band_cost
+ *  - abs_pow34_v
+ *  - find_max_val
+ *  - find_min_book
+ *  - find_form_factor
+ */
+
+#ifndef AVCODEC_AACCODER_TWOLOOP_H
+#define AVCODEC_AACCODER_TWOLOOP_H
+
+#include <float.h>
+#include "libavutil/mathematics.h"
+#include "mathops.h"
+#include "avcodec.h"
+#include "put_bits.h"
+#include "aac.h"
+#include "aacenc.h"
+#include "aactab.h"
+#include "aacenctab.h"
+
+/** Frequency in Hz for lower limit of noise substitution **/
+#define NOISE_LOW_LIMIT 4000
+
+#define sclip(x) av_clip(x,60,218)
+
+/* Reflects the cost to change codebooks: 5 when band g-1 of window w is zeroed and has can_pns set, 9 otherwise (including g == 0) */
+static inline int ff_pns_bits(SingleChannelElement *sce, int w, int g)
+{
+    return (g && sce->zeroes[w*16+g-1] && sce->can_pns[w*16+g-1]) ? 5 : 9;
+}
+
+/**
+ * two-loop quantizers search taken from ISO 13818-7 Appendix C
+ */
+static void search_for_quantizers_twoloop(AVCodecContext *avctx,
+                                          AACEncContext *s,
+                                          SingleChannelElement *sce,
+                                          const float lambda)
+{
+    int start = 0, i, w, w2, g, recomprd;
+    int destbits = avctx->bit_rate * 1024.0 / avctx->sample_rate
+        / ((avctx->flags & CODEC_FLAG_QSCALE) ? 2.0f : avctx->channels)
+        * (lambda / 120.f);
+    int refbits = destbits;
+    int toomanybits, toofewbits;
+    char nzs[128];
+    uint8_t nextband[128];
+    int maxsf[128], minsf[128];
+    float dists[128] = { 0 }, qenergies[128] = { 0 }, uplims[128], euplims[128], energies[128];
+    float maxvals[128], spread_thr_r[128];
+    float min_spread_thr_r, max_spread_thr_r;
+
+    /**
+     * rdlambda controls the maximum tolerated distortion. Twoloop
+     * will keep iterating until it fails to lower it or it reaches
+     * ulimit * rdlambda. Keeping it low increases quality on difficult
+     * signals, but lower it too much, and bits will be taken from weak
+     * signals, creating "holes". A balance is necessary.
+     * rdmax and rdmin specify the relative deviation from rdlambda
+     * allowed for tonality compensation
+     */
+    float rdlambda = av_clipf(2.0f * 120.f / lambda, 0.0625f, 16.0f);
+    const float nzslope = 1.5f;
+    float rdmin = 0.03125f;
+    float rdmax = 1.0f;
+
+    /**
+     * sfoffs controls an offset of optimum allocation that will be
+     * applied based on lambda. Keep it real and modest, the loop
+     * will take care of the rest, this just accelerates convergence
+     */
+    float sfoffs = av_clipf(log2f(120.0f / lambda) * 4.0f, -5, 10);
+
+    int fflag, minscaler, maxscaler, nminscaler;
+    int its  = 0;
+    int maxits = 30;
+    int allz = 0;
+    int tbits;
+    int cutoff = 1024;
+    int pns_start_pos;
+    int prev;
+
+    /**
+     * zeroscale controls a multiplier of the threshold, if band energy
+     * is below this, a zero is forced. Keep it lower than 1, unless
+     * low lambda is used, because energy < threshold doesn't mean there's
+     * no audible signal outright, it's just energy. Also make it rise
+     * slower than rdlambda, as rdscale has due compensation with
+     * noisy band deprioritization below, whereas zeroing logic is rather dumb
+     */
+    float zeroscale;
+    if (lambda > 120.f) {
+        zeroscale = av_clipf(powf(120.f / lambda, 0.25f), 0.0625f, 1.0f);
+    } else {
+        zeroscale = 1.f;
+    }
+
+    if (s->psy.bitres.alloc >= 0) {
+        /**
+         * Psy granted us extra bits to use, from the reservoir
+         * adjust for lambda except what psy already did
+         */
+        destbits = s->psy.bitres.alloc
+            * (lambda / (avctx->global_quality ? avctx->global_quality : 120));
+    }
+
+    if (avctx->flags & CODEC_FLAG_QSCALE) {
+        /**
+         * Constant Q-scale doesn't compensate MS coding on its own
+         * No need to be overly precise, this only controls RD
+         * adjustment CB limits when going overboard
+         */
+        if (s->options.mid_side && s->cur_type == TYPE_CPE)
+            destbits *= 2;
+
+        /**
+         * When using a constant Q-scale, don't adjust bits, just use RD
+         * Don't let it go overboard, though... 8x psy target is enough
+         */
+        toomanybits = 5800;
+        toofewbits = destbits / 16;
+
+        /** Don't offset scalers, just RD */
+        sfoffs = sce->ics.num_windows - 1;
+        rdlambda = sqrtf(rdlambda);
+
+        /** search further */
+        maxits *= 2;
+    } else {
+        /* When using ABR, be strict, but a reasonable leeway is
+         * critical to allow RC to smoothly track desired bitrate
+         * without sudden quality drops that cause audible artifacts.
+         * Symmetry is also desirable, to avoid systematic bias.
+         */
+        toomanybits = destbits + destbits/8;
+        toofewbits = destbits - destbits/8;
+
+        sfoffs = 0;
+        rdlambda = sqrtf(rdlambda);
+    }
+
+    /** and zero out above cutoff frequency */
+    {
+        int wlen = 1024 / sce->ics.num_windows;
+        int bandwidth;
+
+        /**
+         * Scale, psy gives us constant quality, this LP only scales
+         * bitrate by lambda, so we save bits on subjectively unimportant HF
+         * rather than increase quantization noise. Adjust nominal bitrate
+         * to effective bitrate according to encoding parameters,
+         * AAC_CUTOFF_FROM_BITRATE is calibrated for effective bitrate.
+         */
+        float rate_bandwidth_multiplier = 1.5f;
+        int frame_bit_rate = (avctx->flags & CODEC_FLAG_QSCALE)
+            ? (refbits * rate_bandwidth_multiplier * avctx->sample_rate / 1024)
+            : (avctx->bit_rate / avctx->channels);
+
+        /** Compensate for extensions that increase efficiency */
+        if (s->options.pns || s->options.intensity_stereo)
+            frame_bit_rate *= 1.15f;
+
+        if (avctx->cutoff > 0) {
+            bandwidth = avctx->cutoff;
+        } else {
+            bandwidth = FFMAX(3000, AAC_CUTOFF_FROM_BITRATE(frame_bit_rate, 1, avctx->sample_rate));
+            s->psy.cutoff = bandwidth;
+        }
+
+        cutoff = bandwidth * 2 * wlen / avctx->sample_rate;
+        pns_start_pos = NOISE_LOW_LIMIT * 2 * wlen / avctx->sample_rate;
+    }
+
+    /**
+     * for values above this the decoder might end up in an endless loop
+     * due to always having more bits than what can be encoded.
+     */
+    destbits = FFMIN(destbits, 5800);
+    toomanybits = FFMIN(toomanybits, 5800);
+    toofewbits = FFMIN(toofewbits, 5800);
+    /**
+     * XXX: some heuristic to determine initial quantizers will reduce search time
+     * determine zero bands and upper distortion limits
+     */
+    min_spread_thr_r = -1;
+    max_spread_thr_r = -1;
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        for (g = start = 0;  g < sce->ics.num_swb; start += sce->ics.swb_sizes[g++]) {
+            int nz = 0;
+            float uplim = 0.0f, energy = 0.0f, spread = 0.0f;
+            for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
+                if (start >= cutoff || band->energy <= (band->threshold * zeroscale) || band->threshold == 0.0f) {
+                    sce->zeroes[(w+w2)*16+g] = 1;
+                    continue;
+                }
+                nz = 1;
+            }
+            if (!nz) {
+                uplim = 0.0f;
+            } else {
+                nz = 0;
+                for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                    FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
+                    if (band->energy <= (band->threshold * zeroscale) || band->threshold == 0.0f)
+                        continue;
+                    uplim += band->threshold;
+                    energy += band->energy;
+                    spread += band->spread;
+                    nz++;
+                }
+            }
+            uplims[w*16+g] = uplim;
+            energies[w*16+g] = energy;
+            nzs[w*16+g] = nz;
+            sce->zeroes[w*16+g] = !nz;
+            allz |= nz;
+            if (nz && sce->can_pns[w*16+g]) {
+                spread_thr_r[w*16+g] = energy * nz / (uplim * spread);
+                if (min_spread_thr_r < 0) {
+                    min_spread_thr_r = max_spread_thr_r = spread_thr_r[w*16+g];
+                } else {
+                    min_spread_thr_r = FFMIN(min_spread_thr_r, spread_thr_r[w*16+g]);
+                    max_spread_thr_r = FFMAX(max_spread_thr_r, spread_thr_r[w*16+g]);
+                }
+            }
+        }
+    }
+
+    /** Compute initial scalers */
+    minscaler = 65535;
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        for (g = 0;  g < sce->ics.num_swb; g++) {
+            if (sce->zeroes[w*16+g]) {
+                sce->sf_idx[w*16+g] = SCALE_ONE_POS;
+                continue;
+            }
+            /**
+             * log2f-to-distortion ratio is, technically, 2 (1.5db = 4, but it's power vs level so it's 2).
+             * But, as offsets are applied, low-frequency signals are too sensitive to the induced distortion,
+             * so we make scaling more conservative by choosing a lower log2f-to-distortion ratio, and thus
+             * more robust.
+             */
+            sce->sf_idx[w*16+g] = av_clip(
+                SCALE_ONE_POS
+                    + 1.75*log2f(FFMAX(0.00125f,uplims[w*16+g]) / sce->ics.swb_sizes[g])
+                    + sfoffs,
+                60, SCALE_MAX_POS);
+            minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
+        }
+    }
+
+    /** Clip */
+    minscaler = av_clip(minscaler, SCALE_ONE_POS - SCALE_DIV_512, SCALE_MAX_POS - SCALE_DIV_512);
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w])
+        for (g = 0;  g < sce->ics.num_swb; g++)
+            if (!sce->zeroes[w*16+g])
+                sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minscaler, minscaler + SCALE_MAX_DIFF - 1);
+
+    if (!allz)
+        return;
+    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
+    ff_quantize_band_cost_cache_init(s);
+
+    for (i = 0; i < sizeof(minsf) / sizeof(minsf[0]); ++i)
+        minsf[i] = 0;
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        start = w*128;
+        for (g = 0;  g < sce->ics.num_swb; g++) {
+            const float *scaled = s->scoefs + start;
+            int minsfidx;
+            maxvals[w*16+g] = find_max_val(sce->ics.group_len[w], sce->ics.swb_sizes[g], scaled);
+            if (maxvals[w*16+g] > 0) {
+                minsfidx = coef2minsf(maxvals[w*16+g]);
+                for (w2 = 0; w2 < sce->ics.group_len[w]; w2++)
+                    minsf[(w+w2)*16+g] = minsfidx;
+            }
+            start += sce->ics.swb_sizes[g];
+        }
+    }
+
+    /**
+     * Scale uplims to match rate distortion to quality
+     * by applying noisy band deprioritization and tonal band prioritization.
+     * Maxval-energy ratio gives us an idea of how noisy/tonal the band is.
+     * If maxval^2 ~ energy, then that band is mostly noise, and we can relax
+     * rate distortion requirements.
+     */
+    memcpy(euplims, uplims, sizeof(euplims));
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        /** psy already prioritizes transients to some extent */
+        float de_psy_factor = (sce->ics.num_windows > 1) ? 8.0f / sce->ics.group_len[w] : 1.0f;
+        start = w*128;
+        for (g = 0;  g < sce->ics.num_swb; g++) {
+            if (nzs[g] > 0) {
+                float cleanup_factor = ff_sqrf(av_clipf(start / (cutoff * 0.75f), 1.0f, 2.0f));
+                float energy2uplim = find_form_factor(
+                    sce->ics.group_len[w], sce->ics.swb_sizes[g],
+                    uplims[w*16+g] / (nzs[g] * sce->ics.swb_sizes[w]),
+                    sce->coeffs + start,
+                    nzslope * cleanup_factor);
+                energy2uplim *= de_psy_factor;
+                if (!(avctx->flags & CODEC_FLAG_QSCALE)) {
+                    /** In ABR, we need to prioritize less and let rate control do its thing */
+                    energy2uplim = sqrtf(energy2uplim);
+                }
+                energy2uplim = FFMAX(0.015625f, FFMIN(1.0f, energy2uplim));
+                uplims[w*16+g] *= av_clipf(rdlambda * energy2uplim, rdmin, rdmax)
+                                  * sce->ics.group_len[w];
+
+                energy2uplim = find_form_factor(
+                    sce->ics.group_len[w], sce->ics.swb_sizes[g],
+                    uplims[w*16+g] / (nzs[g] * sce->ics.swb_sizes[w]),
+                    sce->coeffs + start,
+                    2.0f);
+                energy2uplim *= de_psy_factor;
+                if (!(avctx->flags & CODEC_FLAG_QSCALE)) {
+                    /** In ABR, we need to prioritize less and let rate control do its thing */
+                    energy2uplim = sqrtf(energy2uplim);
+                }
+                energy2uplim = FFMAX(0.015625f, FFMIN(1.0f, energy2uplim));
+                euplims[w*16+g] *= av_clipf(rdlambda * energy2uplim * sce->ics.group_len[w],
+                    0.5f, 1.0f);
+            }
+            start += sce->ics.swb_sizes[g];
+        }
+    }
+
+    for (i = 0; i < sizeof(maxsf) / sizeof(maxsf[0]); ++i)
+        maxsf[i] = SCALE_MAX_POS;
+
+    //perform two-loop search
+    //outer loop - improve quality
+    do {
+        //inner loop - quantize spectrum to fit into given number of bits
+        int overdist;
+        int qstep = its ? 1 : 32;
+        do {
+            int changed = 0;
+            prev = -1;
+            recomprd = 0;
+            tbits = 0;
+            for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+                start = w*128;
+                for (g = 0;  g < sce->ics.num_swb; g++) {
+                    const float *coefs = &sce->coeffs[start];
+                    const float *scaled = &s->scoefs[start];
+                    int bits = 0;
+                    int cb;
+                    float dist = 0.0f;
+                    float qenergy = 0.0f;
+
+                    if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
+                        start += sce->ics.swb_sizes[g];
+                        if (sce->can_pns[w*16+g]) {
+                            /** PNS isn't free */
+                            tbits += ff_pns_bits(sce, w, g);
+                        }
+                        continue;
+                    }
+                    cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+                    for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                        int b;
+                        float sqenergy;
+                        dist += quantize_band_cost_cached(s, w + w2, g, coefs + w2*128,
+                                                   scaled + w2*128,
+                                                   sce->ics.swb_sizes[g],
+                                                   sce->sf_idx[w*16+g],
+                                                   cb,
+                                                   1.0f,
+                                                   INFINITY,
+                                                   &b, &sqenergy,
+                                                   0);
+                        bits += b;
+                        qenergy += sqenergy;
+                    }
+                    dists[w*16+g] = dist - bits;
+                    qenergies[w*16+g] = qenergy;
+                    if (prev != -1) {
+                        int sfdiff = av_clip(sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO, 0, 2*SCALE_MAX_DIFF);
+                        bits += ff_aac_scalefactor_bits[sfdiff];
+                    }
+                    tbits += bits;
+                    start += sce->ics.swb_sizes[g];
+                    prev = sce->sf_idx[w*16+g];
+                }
+            }
+            if (tbits > toomanybits) {
+                recomprd = 1;
+                for (i = 0; i < 128; i++) {
+                    if (sce->sf_idx[i] < (SCALE_MAX_POS - SCALE_DIV_512)) {
+                        int maxsf_i = (tbits > 5800) ? SCALE_MAX_POS : maxsf[i];
+                        int new_sf = FFMIN(maxsf_i, sce->sf_idx[i] + qstep);
+                        if (new_sf != sce->sf_idx[i]) {
+                            sce->sf_idx[i] = new_sf;
+                            changed = 1;
+                        }
+                    }
+                }
+            } else if (tbits < toofewbits) {
+                recomprd = 1;
+                for (i = 0; i < 128; i++) {
+                    if (sce->sf_idx[i] > SCALE_ONE_POS) {
+                        int new_sf = FFMAX3(minsf[i], SCALE_ONE_POS, sce->sf_idx[i] - qstep);
+                        if (new_sf != sce->sf_idx[i]) {
+                            sce->sf_idx[i] = new_sf;
+                            changed = 1;
+                        }
+                    }
+                }
+            }
+            qstep >>= 1;
+            if (!qstep && tbits > toomanybits && sce->sf_idx[0] < 217 && changed)
+                qstep = 1;
+        } while (qstep);
+
+        overdist = 1;
+        fflag = tbits < toofewbits;
+        for (i = 0; i < 2 && (overdist || recomprd); ++i) {
+            if (recomprd) {
+                /** Must recompute distortion */
+                prev = -1;
+                tbits = 0;
+                for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+                    start = w*128;
+                    for (g = 0;  g < sce->ics.num_swb; g++) {
+                        const float *coefs = sce->coeffs + start;
+                        const float *scaled = s->scoefs + start;
+                        int bits = 0;
+                        int cb;
+                        float dist = 0.0f;
+                        float qenergy = 0.0f;
+
+                        if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
+                            start += sce->ics.swb_sizes[g];
+                            if (sce->can_pns[w*16+g]) {
+                                /** PNS isn't free */
+                                tbits += ff_pns_bits(sce, w, g);
+                            }
+                            continue;
+                        }
+                        cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+                        for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                            int b;
+                            float sqenergy;
+                            dist += quantize_band_cost_cached(s, w + w2, g, coefs + w2*128,
+                                                    scaled + w2*128,
+                                                    sce->ics.swb_sizes[g],
+                                                    sce->sf_idx[w*16+g],
+                                                    cb,
+                                                    1.0f,
+                                                    INFINITY,
+                                                    &b, &sqenergy,
+                                                    0);
+                            bits += b;
+                            qenergy += sqenergy;
+                        }
+                        dists[w*16+g] = dist - bits;
+                        qenergies[w*16+g] = qenergy;
+                        if (prev != -1) {
+                            int sfdiff = av_clip(sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO, 0, 2*SCALE_MAX_DIFF);
+                            bits += ff_aac_scalefactor_bits[sfdiff];
+                        }
+                        tbits += bits;
+                        start += sce->ics.swb_sizes[g];
+                        prev = sce->sf_idx[w*16+g];
+                    }
+                }
+            }
+            if (!i && s->options.pns && its > maxits/2 && tbits > toofewbits) {
+                float maxoverdist = 0.0f;
+                float ovrfactor = 1.f+(maxits-its)*16.f/maxits;
+                overdist = recomprd = 0;
+                for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+                    for (g = start = 0;  g < sce->ics.num_swb; start += sce->ics.swb_sizes[g++]) {
+                        if (!sce->zeroes[w*16+g] && sce->sf_idx[w*16+g] > SCALE_ONE_POS && dists[w*16+g] > uplims[w*16+g]*ovrfactor) {
+                            float ovrdist = dists[w*16+g] / FFMAX(uplims[w*16+g],euplims[w*16+g]);
+                            maxoverdist = FFMAX(maxoverdist, ovrdist);
+                            overdist++;
+                        }
+                    }
+                }
+                if (overdist) {
+                    /* We have overdistorted bands, trade for zeroes (that can be noise)
+                     * Zero the bands in the lowest 1.25% spread-energy-threshold ranking
+                     */
+                    float minspread = max_spread_thr_r;
+                    float maxspread = min_spread_thr_r;
+                    float zspread;
+                    int zeroable = 0;
+                    int zeroed = 0;
+                    int maxzeroed, zloop;
+                    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+                        for (g = start = 0;  g < sce->ics.num_swb; start += sce->ics.swb_sizes[g++]) {
+                            if (start >= pns_start_pos && !sce->zeroes[w*16+g] && sce->can_pns[w*16+g]) {
+                                minspread = FFMIN(minspread, spread_thr_r[w*16+g]);
+                                maxspread = FFMAX(maxspread, spread_thr_r[w*16+g]);
+                                zeroable++;
+                            }
+                        }
+                    }
+                    zspread = (maxspread-minspread) * 0.0125f + minspread;
+                    /* Don't PNS everything even if allowed. It suppresses bit starvation signals from RC,
+                     * and forces the hand of the later search_for_pns step.
+                     * Instead, PNS a fraction of the spread_thr_r range depending on how starved for bits we are,
+                     * and leave further PNSing to search_for_pns if worthwhile.
+                     */
+                    zspread = FFMIN3(min_spread_thr_r * 8.f, zspread,
+                        ((toomanybits - tbits) * min_spread_thr_r + (tbits - toofewbits) * max_spread_thr_r) / (toomanybits - toofewbits + 1));
+                    maxzeroed = FFMIN(zeroable, FFMAX(1, (zeroable * its + maxits - 1) / (2 * maxits)));
+                    for (zloop = 0; zloop < 2; zloop++) {
+                        /* Two passes: first distorted stuff - two birds in one shot and all that,
+                         * then anything viable. Viable means not zero, but either CB=zero-able
+                         * (too high SF), not SF <= 1 (that means we'd be operating at very high
+                         * quality, we don't want PNS when doing VHQ), PNS allowed, and within
+                         * the lowest ranking percentile.
+                         */
+                        float loopovrfactor = (zloop) ? 1.0f : ovrfactor;
+                        int loopminsf = (zloop) ? (SCALE_ONE_POS - SCALE_DIV_512) : SCALE_ONE_POS;
+                        int mcb;
+                        for (g = sce->ics.num_swb-1; g > 0 && zeroed < maxzeroed; g--) {
+                            if (sce->ics.swb_offset[g] < pns_start_pos)
+                                continue;
+                            for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+                                if (!sce->zeroes[w*16+g] && sce->can_pns[w*16+g] && spread_thr_r[w*16+g] <= zspread
+                                    && sce->sf_idx[w*16+g] > loopminsf
+                                    && (dists[w*16+g] > loopovrfactor*uplims[w*16+g] || !(mcb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]))
+                                        || (mcb <= 1 && dists[w*16+g] > FFMIN(uplims[w*16+g], euplims[w*16+g]))) ) {
+                                    sce->zeroes[w*16+g] = 1;
+                                    sce->band_type[w*16+g] = 0;
+                                    zeroed++;
+                                }
+                            }
+                        }
+                    }
+                    if (zeroed)
+                        recomprd = fflag = 1;
+                } else {
+                    overdist = 0;
+                }
+            }
+        }
+
+        minscaler = SCALE_MAX_POS;
+        maxscaler = 0;
+        for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+            for (g = 0;  g < sce->ics.num_swb; g++) {
+                if (!sce->zeroes[w*16+g]) {
+                    minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
+                    maxscaler = FFMAX(maxscaler, sce->sf_idx[w*16+g]);
+                }
+            }
+        }
+
+        minscaler = nminscaler = av_clip(minscaler, SCALE_ONE_POS - SCALE_DIV_512, SCALE_MAX_POS - SCALE_DIV_512);
+        prev = -1;
+        for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+            /** Start with big steps, end up fine-tuning */
+            int depth = (its > maxits/2) ? ((its > maxits*2/3) ? 1 : 3) : 10;
+            int edepth = depth+2;
+            float uplmax = its / (maxits*0.25f) + 1.0f;
+            uplmax *= (tbits > destbits) ? FFMIN(2.0f, tbits / (float)FFMAX(1,destbits)) : 1.0f;
+            start = w * 128;
+            for (g = 0; g < sce->ics.num_swb; g++) {
+                int prevsc = sce->sf_idx[w*16+g];
+                if (prev < 0 && !sce->zeroes[w*16+g])
+                    prev = sce->sf_idx[0];
+                if (!sce->zeroes[w*16+g]) {
+                    const float *coefs = sce->coeffs + start;
+                    const float *scaled = s->scoefs + start;
+                    int cmb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+                    int mindeltasf = FFMAX(0, prev - SCALE_MAX_DIFF);
+                    int maxdeltasf = FFMIN(SCALE_MAX_POS - SCALE_DIV_512, prev + SCALE_MAX_DIFF);
+                    if ((!cmb || dists[w*16+g] > uplims[w*16+g]) && sce->sf_idx[w*16+g] > FFMAX(mindeltasf, minsf[w*16+g])) {
+                        /* Try to make sure there is some energy in every nonzero band
+                         * NOTE: This algorithm must be forcibly imbalanced, pushing harder
+                         *  on holes or more distorted bands at first, otherwise there's
+                         *  no net gain (since the next iteration will offset all bands
+                         *  in the opposite direction to compensate for extra bits)
+                         */
+                        for (i = 0; i < edepth && sce->sf_idx[w*16+g] > mindeltasf; ++i) {
+                            int cb, bits;
+                            float dist, qenergy;
+                            int mb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]-1);
+                            cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+                            dist = qenergy = 0.f;
+                            bits = 0;
+                            if (!cb) {
+                                maxsf[w*16+g] = FFMIN(sce->sf_idx[w*16+g]-1, maxsf[w*16+g]);
+                            } else if (i >= depth && dists[w*16+g] < euplims[w*16+g]) {
+                                break;
+                            }
+                            /* !g is the DC band, it's important, since quantization error here
+                             * applies to less than a cycle, it creates horrible intermodulation
+                             * distortion if it doesn't stick to what psy requests
+                             */
+                            if (!g && sce->ics.num_windows > 1 && dists[w*16+g] >= euplims[w*16+g])
+                                maxsf[w*16+g] = FFMIN(sce->sf_idx[w*16+g], maxsf[w*16+g]);
+                            for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                                int b;
+                                float sqenergy;
+                                dist += quantize_band_cost_cached(s, w + w2, g, coefs + w2*128,
+                                                        scaled + w2*128,
+                                                        sce->ics.swb_sizes[g],
+                                                        sce->sf_idx[w*16+g]-1,
+                                                        cb,
+                                                        1.0f,
+                                                        INFINITY,
+                                                        &b, &sqenergy,
+                                                        0);
+                                bits += b;
+                                qenergy += sqenergy;
+                            }
+                            sce->sf_idx[w*16+g]--;
+                            dists[w*16+g] = dist - bits;
+                            qenergies[w*16+g] = qenergy;
+                            if (mb && (sce->sf_idx[w*16+g] < mindeltasf || (
+                                    (dists[w*16+g] < FFMIN(uplmax*uplims[w*16+g], euplims[w*16+g]))
+                                    && (fabsf(qenergies[w*16+g]-energies[w*16+g]) < euplims[w*16+g])
+                                ) )) {
+                                break;
+                            }
+                        }
+                    } else if (tbits > toofewbits && sce->sf_idx[w*16+g] < FFMIN(maxdeltasf, maxsf[w*16+g])
+                            && (dists[w*16+g] < FFMIN(euplims[w*16+g], uplims[w*16+g]))
+                            && (fabsf(qenergies[w*16+g]-energies[w*16+g]) < euplims[w*16+g])
+                        ) {
+                        /** Um... over target. Save bits for more important stuff. */
+                        for (i = 0; i < depth && sce->sf_idx[w*16+g] < maxdeltasf; ++i) {
+                            int cb, bits;
+                            float dist, qenergy;
+                            cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]+1);
+                            if (cb > 0) {
+                                dist = qenergy = 0.f;
+                                bits = 0;
+                                for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                                    int b;
+                                    float sqenergy;
+                                    dist += quantize_band_cost_cached(s, w + w2, g, coefs + w2*128,
+                                                            scaled + w2*128,
+                                                            sce->ics.swb_sizes[g],
+                                                            sce->sf_idx[w*16+g]+1,
+                                                            cb,
+                                                            1.0f,
+                                                            INFINITY,
+                                                            &b, &sqenergy,
+                                                            0);
+                                    bits += b;
+                                    qenergy += sqenergy;
+                                }
+                                dist -= bits;
+                                if (dist < FFMIN(euplims[w*16+g], uplims[w*16+g])) {
+                                    sce->sf_idx[w*16+g]++;
+                                    dists[w*16+g] = dist;
+                                    qenergies[w*16+g] = qenergy;
+                                } else {
+                                    break;
+                                }
+                            } else {
+                                maxsf[w*16+g] = FFMIN(sce->sf_idx[w*16+g], maxsf[w*16+g]);
+                                break;
+                            }
+                        }
+                    }
+                    prev = sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], mindeltasf, maxdeltasf);
+                    if (sce->sf_idx[w*16+g] != prevsc)
+                        fflag = 1;
+                    nminscaler = FFMIN(nminscaler, sce->sf_idx[w*16+g]);
+                    sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+                }
+                start += sce->ics.swb_sizes[g];
+            }
+        }
+
+        /** SF difference limit violation risk. Must re-clamp. */
+        prev = -1;
+        for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+            for (g = 0; g < sce->ics.num_swb; g++) {
+                if (!sce->zeroes[w*16+g]) {
+                    int prevsf = sce->sf_idx[w*16+g];
+                    if (prev < 0)
+                        prev = prevsf;
+                    sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], prev - SCALE_MAX_DIFF, prev + SCALE_MAX_DIFF);
+                    sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+                    prev = sce->sf_idx[w*16+g];
+                    if (!fflag && prevsf != sce->sf_idx[w*16+g])
+                        fflag = 1;
+                }
+            }
+        }
+
+        its++;
+    } while (fflag && its < maxits);
+
+    /** Scout out next nonzero bands */
+    ff_init_nextband_map(sce, nextband);
+
+    prev = -1;
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        /** Make sure proper codebooks are set */
+        for (g = 0; g < sce->ics.num_swb; g++) {
+            if (!sce->zeroes[w*16+g]) {
+                sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+                if (sce->band_type[w*16+g] <= 0) {
+                    if (!ff_sfdelta_can_remove_band(sce, nextband, prev, w*16+g)) {
+                        /** Cannot zero out, make sure it's not attempted */
+                        sce->band_type[w*16+g] = 1;
+                    } else {
+                        sce->zeroes[w*16+g] = 1;
+                        sce->band_type[w*16+g] = 0;
+                    }
+                }
+            } else {
+                sce->band_type[w*16+g] = 0;
+            }
+            /** Check that there's no SF delta range violations */
+            if (!sce->zeroes[w*16+g]) {
+                if (prev != -1) {
+                    av_unused int sfdiff = sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO;
+                    av_assert1(sfdiff >= 0 && sfdiff <= 2*SCALE_MAX_DIFF);
+                } else if (sce->zeroes[0]) {
+                    /** Set global gain to something useful */
+                    sce->sf_idx[0] = sce->sf_idx[w*16+g];
+                }
+                prev = sce->sf_idx[w*16+g];
+            }
+        }
+    }
+}
+
+#endif /* AVCODEC_AACCODER_TWOLOOP_H */
diff --git a/libavcodec/aacdec.c b/libavcodec/aacdec.c
index 6a06062..ee9b4eb 100644
--- a/libavcodec/aacdec.c
+++ b/libavcodec/aacdec.c
@@ -8,20 +8,20 @@
  * Copyright (c) 2008-2010 Paul Kendall <paul@kcbbs.gen.nz>
  * Copyright (c) 2010      Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,55 +32,12 @@
  * @author Maxim Gavrilov ( maxim.gavrilov gmail com )
  */
 
-/*
- * supported tools
- *
- * Support?             Name
- * N (code in SoC repo) gain control
- * Y                    block switching
- * Y                    window shapes - standard
- * N                    window shapes - Low Delay
- * Y                    filterbank - standard
- * N (code in SoC repo) filterbank - Scalable Sample Rate
- * Y                    Temporal Noise Shaping
- * Y                    Long Term Prediction
- * Y                    intensity stereo
- * Y                    channel coupling
- * Y                    frequency domain prediction
- * Y                    Perceptual Noise Substitution
- * Y                    Mid/Side stereo
- * N                    Scalable Inverse AAC Quantization
- * N                    Frequency Selective Switch
- * N                    upsampling filter
- * Y                    quantization & coding - AAC
- * N                    quantization & coding - TwinVQ
- * N                    quantization & coding - BSAC
- * N                    AAC Error Resilience tools
- * N                    Error Resilience payload syntax
- * N                    Error Protection tool
- * N                    CELP
- * N                    Silence Compression
- * N                    HVXC
- * N                    HVXC 4kbits/s VR
- * N                    Structured Audio tools
- * N                    Structured Audio Sample Bank Format
- * N                    MIDI
- * N                    Harmonic and Individual Lines plus Noise
- * N                    Text-To-Speech Interface
- * Y                    Spectral Band Replication
- * Y (not in this code) Layer-1
- * Y (not in this code) Layer-2
- * Y (not in this code) Layer-3
- * N                    SinuSoidal Coding (Transient, Sinusoid, Noise)
- * Y                    Parametric Stereo
- * N                    Direct Stream Transfer
- *
- * Note: - HE AAC v1 comprises LC AAC with Spectral Band Replication.
- *       - HE AAC v2 comprises LC AAC with Spectral Band Replication and
-           Parametric Stereo.
- */
+#define FFT_FLOAT 1
+#define FFT_FIXED_32 0
+#define USE_FIXED 0
 
 #include "libavutil/float_dsp.h"
+#include "libavutil/opt.h"
 #include "avcodec.h"
 #include "internal.h"
 #include "get_bits.h"
@@ -93,14 +50,14 @@
 #include "aac.h"
 #include "aactab.h"
 #include "aacdectab.h"
-#include "cbrt_tablegen.h"
+#include "cbrt_data.h"
 #include "sbr.h"
 #include "aacsbr.h"
 #include "mpeg4audio.h"
 #include "aacadtsdec.h"
+#include "profiles.h"
 #include "libavutil/intfloat.h"
 
-#include <assert.h>
 #include <errno.h>
 #include <math.h>
 #include <stdint.h>
@@ -108,855 +65,10 @@
 
 #if ARCH_ARM
 #   include "arm/aac.h"
+#elif ARCH_MIPS
+#   include "mips/aacdec_mips.h"
 #endif
 
-#include "libavutil/thread.h"
-
-static VLC vlc_scalefactors;
-static VLC vlc_spectral[11];
-
-static const char overread_err[] = "Input buffer exhausted before END element found\n";
-
-static int count_channels(uint8_t (*layout)[3], int tags)
-{
-    int i, sum = 0;
-    for (i = 0; i < tags; i++) {
-        int syn_ele = layout[i][0];
-        int pos     = layout[i][2];
-        sum += (1 + (syn_ele == TYPE_CPE)) *
-               (pos != AAC_CHANNEL_OFF && pos != AAC_CHANNEL_CC);
-    }
-    return sum;
-}
-
-/**
- * Check for the channel element in the current channel position configuration.
- * If it exists, make sure the appropriate element is allocated and map the
- * channel order to match the internal Libav channel layout.
- *
- * @param   che_pos current channel position configuration
- * @param   type channel element type
- * @param   id channel element id
- * @param   channels count of the number of channels in the configuration
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static av_cold int che_configure(AACContext *ac,
-                                 enum ChannelPosition che_pos,
-                                 int type, int id, int *channels)
-{
-    if (che_pos) {
-        if (!ac->che[type][id]) {
-            if (!(ac->che[type][id] = av_mallocz(sizeof(ChannelElement))))
-                return AVERROR(ENOMEM);
-            ff_aac_sbr_ctx_init(ac, &ac->che[type][id]->sbr);
-        }
-        if (type != TYPE_CCE) {
-            if (*channels >= MAX_CHANNELS - 2)
-                return AVERROR_INVALIDDATA;
-            ac->output_element[(*channels)++] = &ac->che[type][id]->ch[0];
-            if (type == TYPE_CPE ||
-                (type == TYPE_SCE && ac->oc[1].m4ac.ps == 1)) {
-                ac->output_element[(*channels)++] = &ac->che[type][id]->ch[1];
-            }
-        }
-    } else {
-        if (ac->che[type][id])
-            ff_aac_sbr_ctx_close(&ac->che[type][id]->sbr);
-        av_freep(&ac->che[type][id]);
-    }
-    return 0;
-}
-
-static int frame_configure_elements(AVCodecContext *avctx)
-{
-    AACContext *ac = avctx->priv_data;
-    int type, id, ch, ret;
-
-    /* set channel pointers to internal buffers by default */
-    for (type = 0; type < 4; type++) {
-        for (id = 0; id < MAX_ELEM_ID; id++) {
-            ChannelElement *che = ac->che[type][id];
-            if (che) {
-                che->ch[0].ret = che->ch[0].ret_buf;
-                che->ch[1].ret = che->ch[1].ret_buf;
-            }
-        }
-    }
-
-    /* get output buffer */
-    av_frame_unref(ac->frame);
-    ac->frame->nb_samples = 2048;
-    if ((ret = ff_get_buffer(avctx, ac->frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return ret;
-    }
-
-    /* map output channel pointers to AVFrame data */
-    for (ch = 0; ch < avctx->channels; ch++) {
-        if (ac->output_element[ch])
-            ac->output_element[ch]->ret = (float *)ac->frame->extended_data[ch];
-    }
-
-    return 0;
-}
-
-struct elem_to_channel {
-    uint64_t av_position;
-    uint8_t syn_ele;
-    uint8_t elem_id;
-    uint8_t aac_position;
-};
-
-static int assign_pair(struct elem_to_channel e2c_vec[MAX_ELEM_ID],
-                       uint8_t (*layout_map)[3], int offset, uint64_t left,
-                       uint64_t right, int pos)
-{
-    if (layout_map[offset][0] == TYPE_CPE) {
-        e2c_vec[offset] = (struct elem_to_channel) {
-            .av_position  = left | right,
-            .syn_ele      = TYPE_CPE,
-            .elem_id      = layout_map[offset][1],
-            .aac_position = pos
-        };
-        return 1;
-    } else {
-        e2c_vec[offset] = (struct elem_to_channel) {
-            .av_position  = left,
-            .syn_ele      = TYPE_SCE,
-            .elem_id      = layout_map[offset][1],
-            .aac_position = pos
-        };
-        e2c_vec[offset + 1] = (struct elem_to_channel) {
-            .av_position  = right,
-            .syn_ele      = TYPE_SCE,
-            .elem_id      = layout_map[offset + 1][1],
-            .aac_position = pos
-        };
-        return 2;
-    }
-}
-
-static int count_paired_channels(uint8_t (*layout_map)[3], int tags, int pos,
-                                 int *current)
-{
-    int num_pos_channels = 0;
-    int first_cpe        = 0;
-    int sce_parity       = 0;
-    int i;
-    for (i = *current; i < tags; i++) {
-        if (layout_map[i][2] != pos)
-            break;
-        if (layout_map[i][0] == TYPE_CPE) {
-            if (sce_parity) {
-                if (pos == AAC_CHANNEL_FRONT && !first_cpe) {
-                    sce_parity = 0;
-                } else {
-                    return -1;
-                }
-            }
-            num_pos_channels += 2;
-            first_cpe         = 1;
-        } else {
-            num_pos_channels++;
-            sce_parity ^= 1;
-        }
-    }
-    if (sce_parity &&
-        ((pos == AAC_CHANNEL_FRONT && first_cpe) || pos == AAC_CHANNEL_SIDE))
-        return -1;
-    *current = i;
-    return num_pos_channels;
-}
-
-static uint64_t sniff_channel_order(uint8_t (*layout_map)[3], int tags)
-{
-    int i, n, total_non_cc_elements;
-    struct elem_to_channel e2c_vec[4 * MAX_ELEM_ID] = { { 0 } };
-    int num_front_channels, num_side_channels, num_back_channels;
-    uint64_t layout;
-
-    if (FF_ARRAY_ELEMS(e2c_vec) < tags)
-        return 0;
-
-    i = 0;
-    num_front_channels =
-        count_paired_channels(layout_map, tags, AAC_CHANNEL_FRONT, &i);
-    if (num_front_channels < 0)
-        return 0;
-    num_side_channels =
-        count_paired_channels(layout_map, tags, AAC_CHANNEL_SIDE, &i);
-    if (num_side_channels < 0)
-        return 0;
-    num_back_channels =
-        count_paired_channels(layout_map, tags, AAC_CHANNEL_BACK, &i);
-    if (num_back_channels < 0)
-        return 0;
-
-    if (num_side_channels == 0 && num_back_channels >= 4) {
-        num_side_channels = 2;
-        num_back_channels -= 2;
-    }
-
-    i = 0;
-    if (num_front_channels & 1) {
-        e2c_vec[i] = (struct elem_to_channel) {
-            .av_position  = AV_CH_FRONT_CENTER,
-            .syn_ele      = TYPE_SCE,
-            .elem_id      = layout_map[i][1],
-            .aac_position = AAC_CHANNEL_FRONT
-        };
-        i++;
-        num_front_channels--;
-    }
-    if (num_front_channels >= 4) {
-        i += assign_pair(e2c_vec, layout_map, i,
-                         AV_CH_FRONT_LEFT_OF_CENTER,
-                         AV_CH_FRONT_RIGHT_OF_CENTER,
-                         AAC_CHANNEL_FRONT);
-        num_front_channels -= 2;
-    }
-    if (num_front_channels >= 2) {
-        i += assign_pair(e2c_vec, layout_map, i,
-                         AV_CH_FRONT_LEFT,
-                         AV_CH_FRONT_RIGHT,
-                         AAC_CHANNEL_FRONT);
-        num_front_channels -= 2;
-    }
-    while (num_front_channels >= 2) {
-        i += assign_pair(e2c_vec, layout_map, i,
-                         UINT64_MAX,
-                         UINT64_MAX,
-                         AAC_CHANNEL_FRONT);
-        num_front_channels -= 2;
-    }
-
-    if (num_side_channels >= 2) {
-        i += assign_pair(e2c_vec, layout_map, i,
-                         AV_CH_SIDE_LEFT,
-                         AV_CH_SIDE_RIGHT,
-                         AAC_CHANNEL_FRONT);
-        num_side_channels -= 2;
-    }
-    while (num_side_channels >= 2) {
-        i += assign_pair(e2c_vec, layout_map, i,
-                         UINT64_MAX,
-                         UINT64_MAX,
-                         AAC_CHANNEL_SIDE);
-        num_side_channels -= 2;
-    }
-
-    while (num_back_channels >= 4) {
-        i += assign_pair(e2c_vec, layout_map, i,
-                         UINT64_MAX,
-                         UINT64_MAX,
-                         AAC_CHANNEL_BACK);
-        num_back_channels -= 2;
-    }
-    if (num_back_channels >= 2) {
-        i += assign_pair(e2c_vec, layout_map, i,
-                         AV_CH_BACK_LEFT,
-                         AV_CH_BACK_RIGHT,
-                         AAC_CHANNEL_BACK);
-        num_back_channels -= 2;
-    }
-    if (num_back_channels) {
-        e2c_vec[i] = (struct elem_to_channel) {
-            .av_position  = AV_CH_BACK_CENTER,
-            .syn_ele      = TYPE_SCE,
-            .elem_id      = layout_map[i][1],
-            .aac_position = AAC_CHANNEL_BACK
-        };
-        i++;
-        num_back_channels--;
-    }
-
-    if (i < tags && layout_map[i][2] == AAC_CHANNEL_LFE) {
-        e2c_vec[i] = (struct elem_to_channel) {
-            .av_position  = AV_CH_LOW_FREQUENCY,
-            .syn_ele      = TYPE_LFE,
-            .elem_id      = layout_map[i][1],
-            .aac_position = AAC_CHANNEL_LFE
-        };
-        i++;
-    }
-    while (i < tags && layout_map[i][2] == AAC_CHANNEL_LFE) {
-        e2c_vec[i] = (struct elem_to_channel) {
-            .av_position  = UINT64_MAX,
-            .syn_ele      = TYPE_LFE,
-            .elem_id      = layout_map[i][1],
-            .aac_position = AAC_CHANNEL_LFE
-        };
-        i++;
-    }
-
-    // Must choose a stable sort
-    total_non_cc_elements = n = i;
-    do {
-        int next_n = 0;
-        for (i = 1; i < n; i++)
-            if (e2c_vec[i - 1].av_position > e2c_vec[i].av_position) {
-                FFSWAP(struct elem_to_channel, e2c_vec[i - 1], e2c_vec[i]);
-                next_n = i;
-            }
-        n = next_n;
-    } while (n > 0);
-
-    layout = 0;
-    for (i = 0; i < total_non_cc_elements; i++) {
-        layout_map[i][0] = e2c_vec[i].syn_ele;
-        layout_map[i][1] = e2c_vec[i].elem_id;
-        layout_map[i][2] = e2c_vec[i].aac_position;
-        if (e2c_vec[i].av_position != UINT64_MAX) {
-            layout |= e2c_vec[i].av_position;
-        }
-    }
-
-    return layout;
-}
-
-/**
- * Save current output configuration if and only if it has been locked.
- */
-static void push_output_configuration(AACContext *ac) {
-    if (ac->oc[1].status == OC_LOCKED) {
-        ac->oc[0] = ac->oc[1];
-    }
-    ac->oc[1].status = OC_NONE;
-}
-
-/**
- * Restore the previous output configuration if and only if the current
- * configuration is unlocked.
- */
-static void pop_output_configuration(AACContext *ac) {
-    if (ac->oc[1].status != OC_LOCKED && ac->oc[0].status != OC_NONE) {
-        ac->oc[1] = ac->oc[0];
-        ac->avctx->channels = ac->oc[1].channels;
-        ac->avctx->channel_layout = ac->oc[1].channel_layout;
-    }
-}
-
-/**
- * Configure output channel order based on the current program
- * configuration element.
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int output_configure(AACContext *ac,
-                            uint8_t layout_map[MAX_ELEM_ID * 4][3], int tags,
-                            enum OCStatus oc_type, int get_new_frame)
-{
-    AVCodecContext *avctx = ac->avctx;
-    int i, channels = 0, ret;
-    uint64_t layout = 0;
-    uint8_t id_map[TYPE_END][MAX_ELEM_ID] = {{ 0 }};
-    uint8_t type_counts[TYPE_END] = { 0 };
-
-    if (ac->oc[1].layout_map != layout_map) {
-        memcpy(ac->oc[1].layout_map, layout_map, tags * sizeof(layout_map[0]));
-        ac->oc[1].layout_map_tags = tags;
-    }
-    for (i = 0; i < tags; i++) {
-        int type =         layout_map[i][0];
-        int id =           layout_map[i][1];
-        id_map[type][id] = type_counts[type]++;
-    }
-    // Try to sniff a reasonable channel order, otherwise output the
-    // channels in the order the PCE declared them.
-    if (avctx->request_channel_layout != AV_CH_LAYOUT_NATIVE)
-        layout = sniff_channel_order(layout_map, tags);
-    for (i = 0; i < tags; i++) {
-        int type =     layout_map[i][0];
-        int id =       layout_map[i][1];
-        int iid =      id_map[type][id];
-        int position = layout_map[i][2];
-        // Allocate or free elements depending on if they are in the
-        // current program configuration.
-        ret = che_configure(ac, position, type, iid, &channels);
-        if (ret < 0)
-            return ret;
-        ac->tag_che_map[type][id] = ac->che[type][iid];
-    }
-    if (ac->oc[1].m4ac.ps == 1 && channels == 2) {
-        if (layout == AV_CH_FRONT_CENTER) {
-            layout = AV_CH_FRONT_LEFT|AV_CH_FRONT_RIGHT;
-        } else {
-            layout = 0;
-        }
-    }
-
-    avctx->channel_layout = ac->oc[1].channel_layout = layout;
-    avctx->channels       = ac->oc[1].channels       = channels;
-    ac->oc[1].status = oc_type;
-
-    if (get_new_frame) {
-        if ((ret = frame_configure_elements(ac->avctx)) < 0)
-            return ret;
-    }
-
-    return 0;
-}
-
-/**
- * Set up channel positions based on a default channel configuration
- * as specified in table 1.17.
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int set_default_channel_config(AVCodecContext *avctx,
-                                      uint8_t (*layout_map)[3],
-                                      int *tags,
-                                      int channel_config)
-{
-    if (channel_config < 1 || (channel_config > 7 && channel_config < 11) ||
-        channel_config > 12) {
-        av_log(avctx, AV_LOG_ERROR,
-               "invalid default channel configuration (%d)\n",
-               channel_config);
-        return AVERROR_INVALIDDATA;
-    }
-    *tags = tags_per_config[channel_config];
-    memcpy(layout_map, aac_channel_layout_map[channel_config - 1],
-           *tags * sizeof(*layout_map));
-    return 0;
-}
-
-static ChannelElement *get_che(AACContext *ac, int type, int elem_id)
-{
-    /* For PCE based channel configurations map the channels solely based
-     * on tags. */
-    if (!ac->oc[1].m4ac.chan_config) {
-        return ac->tag_che_map[type][elem_id];
-    }
-    // Allow single CPE stereo files to be signalled with mono configuration.
-    if (!ac->tags_mapped && type == TYPE_CPE &&
-        ac->oc[1].m4ac.chan_config == 1) {
-        uint8_t layout_map[MAX_ELEM_ID*4][3];
-        int layout_map_tags;
-        push_output_configuration(ac);
-
-        if (set_default_channel_config(ac->avctx, layout_map,
-                                       &layout_map_tags, 2) < 0)
-            return NULL;
-        if (output_configure(ac, layout_map, layout_map_tags,
-                             OC_TRIAL_FRAME, 1) < 0)
-            return NULL;
-
-        ac->oc[1].m4ac.chan_config = 2;
-        ac->oc[1].m4ac.ps = 0;
-    }
-    // And vice-versa
-    if (!ac->tags_mapped && type == TYPE_SCE &&
-        ac->oc[1].m4ac.chan_config == 2) {
-        uint8_t layout_map[MAX_ELEM_ID * 4][3];
-        int layout_map_tags;
-        push_output_configuration(ac);
-
-        if (set_default_channel_config(ac->avctx, layout_map,
-                                       &layout_map_tags, 1) < 0)
-            return NULL;
-        if (output_configure(ac, layout_map, layout_map_tags,
-                             OC_TRIAL_FRAME, 1) < 0)
-            return NULL;
-
-        ac->oc[1].m4ac.chan_config = 1;
-        if (ac->oc[1].m4ac.sbr)
-            ac->oc[1].m4ac.ps = -1;
-    }
-    /* For indexed channel configurations map the channels solely based
-     * on position. */
-    switch (ac->oc[1].m4ac.chan_config) {
-    case 12:
-    case 7:
-        if (ac->tags_mapped == 3 && type == TYPE_CPE) {
-            ac->tags_mapped++;
-            return ac->tag_che_map[TYPE_CPE][elem_id] = ac->che[TYPE_CPE][2];
-        }
-    case 11:
-        if (ac->tags_mapped == 2 &&
-            ac->oc[1].m4ac.chan_config == 11 &&
-            type == TYPE_SCE) {
-            ac->tags_mapped++;
-            return ac->tag_che_map[TYPE_SCE][elem_id] = ac->che[TYPE_SCE][1];
-        }
-    case 6:
-        /* Some streams incorrectly code 5.1 audio as
-         * SCE[0] CPE[0] CPE[1] SCE[1]
-         * instead of
-         * SCE[0] CPE[0] CPE[1] LFE[0].
-         * If we seem to have encountered such a stream, transfer
-         * the LFE[0] element to the SCE[1]'s mapping */
-        if (ac->tags_mapped == tags_per_config[ac->oc[1].m4ac.chan_config] - 1 && (type == TYPE_LFE || type == TYPE_SCE)) {
-            ac->tags_mapped++;
-            return ac->tag_che_map[type][elem_id] = ac->che[TYPE_LFE][0];
-        }
-    case 5:
-        if (ac->tags_mapped == 2 && type == TYPE_CPE) {
-            ac->tags_mapped++;
-            return ac->tag_che_map[TYPE_CPE][elem_id] = ac->che[TYPE_CPE][1];
-        }
-    case 4:
-        if (ac->tags_mapped == 2 &&
-            ac->oc[1].m4ac.chan_config == 4 &&
-            type == TYPE_SCE) {
-            ac->tags_mapped++;
-            return ac->tag_che_map[TYPE_SCE][elem_id] = ac->che[TYPE_SCE][1];
-        }
-    case 3:
-    case 2:
-        if (ac->tags_mapped == (ac->oc[1].m4ac.chan_config != 2) &&
-            type == TYPE_CPE) {
-            ac->tags_mapped++;
-            return ac->tag_che_map[TYPE_CPE][elem_id] = ac->che[TYPE_CPE][0];
-        } else if (ac->oc[1].m4ac.chan_config == 2) {
-            return NULL;
-        }
-    case 1:
-        if (!ac->tags_mapped && type == TYPE_SCE) {
-            ac->tags_mapped++;
-            return ac->tag_che_map[TYPE_SCE][elem_id] = ac->che[TYPE_SCE][0];
-        }
-    default:
-        return NULL;
-    }
-}
-
-/**
- * Decode an array of 4 bit element IDs, optionally interleaved with a
- * stereo/mono switching bit.
- *
- * @param type speaker type/position for these channels
- */
-static void decode_channel_map(uint8_t layout_map[][3],
-                               enum ChannelPosition type,
-                               GetBitContext *gb, int n)
-{
-    while (n--) {
-        enum RawDataBlockType syn_ele;
-        switch (type) {
-        case AAC_CHANNEL_FRONT:
-        case AAC_CHANNEL_BACK:
-        case AAC_CHANNEL_SIDE:
-            syn_ele = get_bits1(gb);
-            break;
-        case AAC_CHANNEL_CC:
-            skip_bits1(gb);
-            syn_ele = TYPE_CCE;
-            break;
-        case AAC_CHANNEL_LFE:
-            syn_ele = TYPE_LFE;
-            break;
-        default:
-            // AAC_CHANNEL_OFF has no channel map
-            return;
-        }
-        layout_map[0][0] = syn_ele;
-        layout_map[0][1] = get_bits(gb, 4);
-        layout_map[0][2] = type;
-        layout_map++;
-    }
-}
-
-/**
- * Decode program configuration element; reference: table 4.2.
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int decode_pce(AVCodecContext *avctx, MPEG4AudioConfig *m4ac,
-                      uint8_t (*layout_map)[3],
-                      GetBitContext *gb)
-{
-    int num_front, num_side, num_back, num_lfe, num_assoc_data, num_cc;
-    int sampling_index;
-    int comment_len;
-    int tags;
-
-    skip_bits(gb, 2);  // object_type
-
-    sampling_index = get_bits(gb, 4);
-    if (m4ac->sampling_index != sampling_index)
-        av_log(avctx, AV_LOG_WARNING,
-               "Sample rate index in program config element does not "
-               "match the sample rate index configured by the container.\n");
-
-    num_front       = get_bits(gb, 4);
-    num_side        = get_bits(gb, 4);
-    num_back        = get_bits(gb, 4);
-    num_lfe         = get_bits(gb, 2);
-    num_assoc_data  = get_bits(gb, 3);
-    num_cc          = get_bits(gb, 4);
-
-    if (get_bits1(gb))
-        skip_bits(gb, 4); // mono_mixdown_tag
-    if (get_bits1(gb))
-        skip_bits(gb, 4); // stereo_mixdown_tag
-
-    if (get_bits1(gb))
-        skip_bits(gb, 3); // mixdown_coeff_index and pseudo_surround
-
-    decode_channel_map(layout_map       , AAC_CHANNEL_FRONT, gb, num_front);
-    tags = num_front;
-    decode_channel_map(layout_map + tags, AAC_CHANNEL_SIDE,  gb, num_side);
-    tags += num_side;
-    decode_channel_map(layout_map + tags, AAC_CHANNEL_BACK,  gb, num_back);
-    tags += num_back;
-    decode_channel_map(layout_map + tags, AAC_CHANNEL_LFE,   gb, num_lfe);
-    tags += num_lfe;
-
-    skip_bits_long(gb, 4 * num_assoc_data);
-
-    decode_channel_map(layout_map + tags, AAC_CHANNEL_CC,    gb, num_cc);
-    tags += num_cc;
-
-    align_get_bits(gb);
-
-    /* comment field, first byte is length */
-    comment_len = get_bits(gb, 8) * 8;
-    if (get_bits_left(gb) < comment_len) {
-        av_log(avctx, AV_LOG_ERROR, overread_err);
-        return AVERROR_INVALIDDATA;
-    }
-    skip_bits_long(gb, comment_len);
-    return tags;
-}
-
-/**
- * Decode GA "General Audio" specific configuration; reference: table 4.1.
- *
- * @param   ac          pointer to AACContext, may be null
- * @param   avctx       pointer to AVCCodecContext, used for logging
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int decode_ga_specific_config(AACContext *ac, AVCodecContext *avctx,
-                                     GetBitContext *gb,
-                                     MPEG4AudioConfig *m4ac,
-                                     int channel_config)
-{
-    int extension_flag, ret, ep_config, res_flags;
-    uint8_t layout_map[MAX_ELEM_ID*4][3];
-    int tags = 0;
-
-    if (get_bits1(gb)) { // frameLengthFlag
-        avpriv_request_sample(avctx, "960/120 MDCT window");
-        return AVERROR_PATCHWELCOME;
-    }
-    m4ac->frame_length_short = 0;
-
-    if (get_bits1(gb))       // dependsOnCoreCoder
-        skip_bits(gb, 14);   // coreCoderDelay
-    extension_flag = get_bits1(gb);
-
-    if (m4ac->object_type == AOT_AAC_SCALABLE ||
-        m4ac->object_type == AOT_ER_AAC_SCALABLE)
-        skip_bits(gb, 3);     // layerNr
-
-    if (channel_config == 0) {
-        skip_bits(gb, 4);  // element_instance_tag
-        tags = decode_pce(avctx, m4ac, layout_map, gb);
-        if (tags < 0)
-            return tags;
-    } else {
-        if ((ret = set_default_channel_config(avctx, layout_map,
-                                              &tags, channel_config)))
-            return ret;
-    }
-
-    if (count_channels(layout_map, tags) > 1) {
-        m4ac->ps = 0;
-    } else if (m4ac->sbr == 1 && m4ac->ps == -1)
-        m4ac->ps = 1;
-
-    if (ac && (ret = output_configure(ac, layout_map, tags, OC_GLOBAL_HDR, 0)))
-        return ret;
-
-    if (extension_flag) {
-        switch (m4ac->object_type) {
-        case AOT_ER_BSAC:
-            skip_bits(gb, 5);    // numOfSubFrame
-            skip_bits(gb, 11);   // layer_length
-            break;
-        case AOT_ER_AAC_LC:
-        case AOT_ER_AAC_LTP:
-        case AOT_ER_AAC_SCALABLE:
-        case AOT_ER_AAC_LD:
-            res_flags = get_bits(gb, 3);
-            if (res_flags) {
-                avpriv_report_missing_feature(avctx,
-                                              "AAC data resilience (flags %x)",
-                                              res_flags);
-                return AVERROR_PATCHWELCOME;
-            }
-            break;
-        }
-        skip_bits1(gb);    // extensionFlag3 (TBD in version 3)
-    }
-    switch (m4ac->object_type) {
-    case AOT_ER_AAC_LC:
-    case AOT_ER_AAC_LTP:
-    case AOT_ER_AAC_SCALABLE:
-    case AOT_ER_AAC_LD:
-        ep_config = get_bits(gb, 2);
-        if (ep_config) {
-            avpriv_report_missing_feature(avctx,
-                                          "epConfig %d", ep_config);
-            return AVERROR_PATCHWELCOME;
-        }
-    }
-    return 0;
-}
-
-static int decode_eld_specific_config(AACContext *ac, AVCodecContext *avctx,
-                                     GetBitContext *gb,
-                                     MPEG4AudioConfig *m4ac,
-                                     int channel_config)
-{
-    int ret, ep_config, res_flags;
-    uint8_t layout_map[MAX_ELEM_ID*4][3];
-    int tags = 0;
-    const int ELDEXT_TERM = 0;
-
-    m4ac->ps  = 0;
-    m4ac->sbr = 0;
-
-    m4ac->frame_length_short = get_bits1(gb);
-    res_flags = get_bits(gb, 3);
-    if (res_flags) {
-        avpriv_report_missing_feature(avctx,
-                                      "AAC data resilience (flags %x)",
-                                      res_flags);
-        return AVERROR_PATCHWELCOME;
-    }
-
-    if (get_bits1(gb)) { // ldSbrPresentFlag
-        avpriv_report_missing_feature(avctx,
-                                      "Low Delay SBR");
-        return AVERROR_PATCHWELCOME;
-    }
-
-    while (get_bits(gb, 4) != ELDEXT_TERM) {
-        int len = get_bits(gb, 4);
-        if (len == 15)
-            len += get_bits(gb, 8);
-        if (len == 15 + 255)
-            len += get_bits(gb, 16);
-        if (get_bits_left(gb) < len * 8 + 4) {
-            av_log(avctx, AV_LOG_ERROR, overread_err);
-            return AVERROR_INVALIDDATA;
-        }
-        skip_bits_long(gb, 8 * len);
-    }
-
-    if ((ret = set_default_channel_config(avctx, layout_map,
-                                          &tags, channel_config)))
-        return ret;
-
-    if (ac && (ret = output_configure(ac, layout_map, tags, OC_GLOBAL_HDR, 0)))
-        return ret;
-
-    ep_config = get_bits(gb, 2);
-    if (ep_config) {
-        avpriv_report_missing_feature(avctx,
-                                      "epConfig %d", ep_config);
-        return AVERROR_PATCHWELCOME;
-    }
-    return 0;
-}
-
-/**
- * Decode audio specific configuration; reference: table 1.13.
- *
- * @param   ac          pointer to AACContext, may be null
- * @param   avctx       pointer to AVCCodecContext, used for logging
- * @param   m4ac        pointer to MPEG4AudioConfig, used for parsing
- * @param   data        pointer to buffer holding an audio specific config
- * @param   bit_size    size of audio specific config or data in bits
- * @param   sync_extension look for an appended sync extension
- *
- * @return  Returns error status or number of consumed bits. <0 - error
- */
-static int decode_audio_specific_config(AACContext *ac,
-                                        AVCodecContext *avctx,
-                                        MPEG4AudioConfig *m4ac,
-                                        const uint8_t *data, int bit_size,
-                                        int sync_extension)
-{
-    GetBitContext gb;
-    int i, ret;
-
-    ff_dlog(avctx, "extradata size %d\n", avctx->extradata_size);
-    for (i = 0; i < avctx->extradata_size; i++)
-        ff_dlog(avctx, "%02x ", avctx->extradata[i]);
-    ff_dlog(avctx, "\n");
-
-    if ((ret = init_get_bits(&gb, data, bit_size)) < 0)
-        return ret;
-
-    if ((i = avpriv_mpeg4audio_get_config(m4ac, data, bit_size,
-                                          sync_extension)) < 0)
-        return AVERROR_INVALIDDATA;
-    if (m4ac->sampling_index > 12) {
-        av_log(avctx, AV_LOG_ERROR,
-               "invalid sampling rate index %d\n",
-               m4ac->sampling_index);
-        return AVERROR_INVALIDDATA;
-    }
-    if (m4ac->object_type == AOT_ER_AAC_LD &&
-        (m4ac->sampling_index < 3 || m4ac->sampling_index > 7)) {
-        av_log(avctx, AV_LOG_ERROR,
-               "invalid low delay sampling rate index %d\n",
-               m4ac->sampling_index);
-        return AVERROR_INVALIDDATA;
-    }
-
-    skip_bits_long(&gb, i);
-
-    switch (m4ac->object_type) {
-    case AOT_AAC_MAIN:
-    case AOT_AAC_LC:
-    case AOT_AAC_LTP:
-    case AOT_ER_AAC_LC:
-    case AOT_ER_AAC_LD:
-        if ((ret = decode_ga_specific_config(ac, avctx, &gb,
-                                            m4ac, m4ac->chan_config)) < 0)
-            return ret;
-        break;
-    case AOT_ER_AAC_ELD:
-        if ((ret = decode_eld_specific_config(ac, avctx, &gb,
-                                              m4ac, m4ac->chan_config)) < 0)
-            return ret;
-        break;
-    default:
-        avpriv_report_missing_feature(avctx,
-                                      "Audio object type %s%d",
-                                      m4ac->sbr == 1 ? "SBR+" : "",
-                                      m4ac->object_type);
-        return AVERROR(ENOSYS);
-    }
-
-    ff_dlog(avctx,
-            "AOT %d chan config %d sampling index %d (%d) SBR %d PS %d\n",
-            m4ac->object_type, m4ac->chan_config, m4ac->sampling_index,
-            m4ac->sample_rate, m4ac->sbr,
-            m4ac->ps);
-
-    return get_bits_count(&gb);
-}
-
-/**
- * linear congruential pseudorandom number generator
- *
- * @param   previous_val    pointer to the current state of the generator
- *
- * @return  Returns a 32-bit pseudorandom integer
- */
-static av_always_inline int lcg_random(int previous_val)
-{
-    union { unsigned u; int s; } v = { previous_val * 1664525u + 1013904223 };
-    return v.s;
-}
-
 static av_always_inline void reset_predict_state(PredictorState *ps)
 {
     ps->r0   = 0.0f;
@@ -967,509 +79,6 @@ static av_always_inline void reset_predict_state(PredictorState *ps)
     ps->var1 = 1.0f;
 }
 
-static void reset_all_predictors(PredictorState *ps)
-{
-    int i;
-    for (i = 0; i < MAX_PREDICTORS; i++)
-        reset_predict_state(&ps[i]);
-}
-
-static int sample_rate_idx (int rate)
-{
-         if (92017 <= rate) return 0;
-    else if (75132 <= rate) return 1;
-    else if (55426 <= rate) return 2;
-    else if (46009 <= rate) return 3;
-    else if (37566 <= rate) return 4;
-    else if (27713 <= rate) return 5;
-    else if (23004 <= rate) return 6;
-    else if (18783 <= rate) return 7;
-    else if (13856 <= rate) return 8;
-    else if (11502 <= rate) return 9;
-    else if (9391  <= rate) return 10;
-    else                    return 11;
-}
-
-static void reset_predictor_group(PredictorState *ps, int group_num)
-{
-    int i;
-    for (i = group_num - 1; i < MAX_PREDICTORS; i += 30)
-        reset_predict_state(&ps[i]);
-}
-
-#define AAC_INIT_VLC_STATIC(num, size)                                     \
-    INIT_VLC_STATIC(&vlc_spectral[num], 8, ff_aac_spectral_sizes[num],     \
-         ff_aac_spectral_bits[num], sizeof(ff_aac_spectral_bits[num][0]),  \
-                                    sizeof(ff_aac_spectral_bits[num][0]),  \
-        ff_aac_spectral_codes[num], sizeof(ff_aac_spectral_codes[num][0]), \
-                                    sizeof(ff_aac_spectral_codes[num][0]), \
-        size);
-
-static av_cold void aac_static_table_init(void)
-{
-    AAC_INIT_VLC_STATIC( 0, 304);
-    AAC_INIT_VLC_STATIC( 1, 270);
-    AAC_INIT_VLC_STATIC( 2, 550);
-    AAC_INIT_VLC_STATIC( 3, 300);
-    AAC_INIT_VLC_STATIC( 4, 328);
-    AAC_INIT_VLC_STATIC( 5, 294);
-    AAC_INIT_VLC_STATIC( 6, 306);
-    AAC_INIT_VLC_STATIC( 7, 268);
-    AAC_INIT_VLC_STATIC( 8, 510);
-    AAC_INIT_VLC_STATIC( 9, 366);
-    AAC_INIT_VLC_STATIC(10, 462);
-
-    ff_aac_sbr_init();
-
-    ff_aac_tableinit();
-
-    INIT_VLC_STATIC(&vlc_scalefactors, 7,
-                    FF_ARRAY_ELEMS(ff_aac_scalefactor_code),
-                    ff_aac_scalefactor_bits,
-                    sizeof(ff_aac_scalefactor_bits[0]),
-                    sizeof(ff_aac_scalefactor_bits[0]),
-                    ff_aac_scalefactor_code,
-                    sizeof(ff_aac_scalefactor_code[0]),
-                    sizeof(ff_aac_scalefactor_code[0]),
-                    352);
-
-
-    // window initialization
-    ff_kbd_window_init(ff_aac_kbd_long_1024, 4.0, 1024);
-    ff_kbd_window_init(ff_aac_kbd_short_128, 6.0, 128);
-    ff_init_ff_sine_windows(10);
-    ff_init_ff_sine_windows( 9);
-    ff_init_ff_sine_windows( 7);
-
-    cbrt_tableinit();
-}
-
-static AVOnce aac_init = AV_ONCE_INIT;
-
-static av_cold int aac_decode_init(AVCodecContext *avctx)
-{
-    AACContext *ac = avctx->priv_data;
-    int ret;
-
-    ret = ff_thread_once(&aac_init, &aac_static_table_init);
-    if (ret != 0)
-        return AVERROR_UNKNOWN;
-
-    ac->avctx = avctx;
-    ac->oc[1].m4ac.sample_rate = avctx->sample_rate;
-
-    avctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
-
-    if (avctx->extradata_size > 0) {
-        if ((ret = decode_audio_specific_config(ac, ac->avctx, &ac->oc[1].m4ac,
-                                                avctx->extradata,
-                                                avctx->extradata_size * 8,
-                                                1)) < 0)
-            return ret;
-    } else {
-        int sr, i;
-        uint8_t layout_map[MAX_ELEM_ID*4][3];
-        int layout_map_tags;
-
-        sr = sample_rate_idx(avctx->sample_rate);
-        ac->oc[1].m4ac.sampling_index = sr;
-        ac->oc[1].m4ac.channels = avctx->channels;
-        ac->oc[1].m4ac.sbr = -1;
-        ac->oc[1].m4ac.ps = -1;
-
-        for (i = 0; i < FF_ARRAY_ELEMS(ff_mpeg4audio_channels); i++)
-            if (ff_mpeg4audio_channels[i] == avctx->channels)
-                break;
-        if (i == FF_ARRAY_ELEMS(ff_mpeg4audio_channels)) {
-            i = 0;
-        }
-        ac->oc[1].m4ac.chan_config = i;
-
-        if (ac->oc[1].m4ac.chan_config) {
-            int ret = set_default_channel_config(avctx, layout_map,
-                &layout_map_tags, ac->oc[1].m4ac.chan_config);
-            if (!ret)
-                output_configure(ac, layout_map, layout_map_tags,
-                                 OC_GLOBAL_HDR, 0);
-            else if (avctx->err_recognition & AV_EF_EXPLODE)
-                return AVERROR_INVALIDDATA;
-        }
-    }
-
-    avpriv_float_dsp_init(&ac->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
-
-    ac->random_state = 0x1f2e3d4c;
-
-    ff_mdct_init(&ac->mdct,       11, 1, 1.0 / (32768.0 * 1024.0));
-    ff_mdct_init(&ac->mdct_ld,    10, 1, 1.0 / (32768.0 * 512.0));
-    ff_mdct_init(&ac->mdct_small,  8, 1, 1.0 / (32768.0 * 128.0));
-    ff_mdct_init(&ac->mdct_ltp,   11, 0, -2.0 * 32768.0);
-    ret = ff_imdct15_init(&ac->mdct480, 5);
-    if (ret < 0)
-        return ret;
-
-    return 0;
-}
-
-/**
- * Skip data_stream_element; reference: table 4.10.
- */
-static int skip_data_stream_element(AACContext *ac, GetBitContext *gb)
-{
-    int byte_align = get_bits1(gb);
-    int count = get_bits(gb, 8);
-    if (count == 255)
-        count += get_bits(gb, 8);
-    if (byte_align)
-        align_get_bits(gb);
-
-    if (get_bits_left(gb) < 8 * count) {
-        av_log(ac->avctx, AV_LOG_ERROR, overread_err);
-        return AVERROR_INVALIDDATA;
-    }
-    skip_bits_long(gb, 8 * count);
-    return 0;
-}
-
-static int decode_prediction(AACContext *ac, IndividualChannelStream *ics,
-                             GetBitContext *gb)
-{
-    int sfb;
-    if (get_bits1(gb)) {
-        ics->predictor_reset_group = get_bits(gb, 5);
-        if (ics->predictor_reset_group == 0 ||
-            ics->predictor_reset_group > 30) {
-            av_log(ac->avctx, AV_LOG_ERROR,
-                   "Invalid Predictor Reset Group.\n");
-            return AVERROR_INVALIDDATA;
-        }
-    }
-    for (sfb = 0; sfb < FFMIN(ics->max_sfb, ff_aac_pred_sfb_max[ac->oc[1].m4ac.sampling_index]); sfb++) {
-        ics->prediction_used[sfb] = get_bits1(gb);
-    }
-    return 0;
-}
-
-/**
- * Decode Long Term Prediction data; reference: table 4.xx.
- */
-static void decode_ltp(LongTermPrediction *ltp,
-                       GetBitContext *gb, uint8_t max_sfb)
-{
-    int sfb;
-
-    ltp->lag  = get_bits(gb, 11);
-    ltp->coef = ltp_coef[get_bits(gb, 3)];
-    for (sfb = 0; sfb < FFMIN(max_sfb, MAX_LTP_LONG_SFB); sfb++)
-        ltp->used[sfb] = get_bits1(gb);
-}
-
-/**
- * Decode Individual Channel Stream info; reference: table 4.6.
- */
-static int decode_ics_info(AACContext *ac, IndividualChannelStream *ics,
-                           GetBitContext *gb)
-{
-    const MPEG4AudioConfig *const m4ac = &ac->oc[1].m4ac;
-    const int aot = m4ac->object_type;
-    const int sampling_index = m4ac->sampling_index;
-    if (aot != AOT_ER_AAC_ELD) {
-        if (get_bits1(gb)) {
-            av_log(ac->avctx, AV_LOG_ERROR, "Reserved bit set.\n");
-            if (ac->avctx->err_recognition & AV_EF_BITSTREAM)
-                return AVERROR_INVALIDDATA;
-        }
-        ics->window_sequence[1] = ics->window_sequence[0];
-        ics->window_sequence[0] = get_bits(gb, 2);
-        if (aot == AOT_ER_AAC_LD &&
-            ics->window_sequence[0] != ONLY_LONG_SEQUENCE) {
-            av_log(ac->avctx, AV_LOG_ERROR,
-                   "AAC LD is only defined for ONLY_LONG_SEQUENCE but "
-                   "window sequence %d found.\n", ics->window_sequence[0]);
-            ics->window_sequence[0] = ONLY_LONG_SEQUENCE;
-            return AVERROR_INVALIDDATA;
-        }
-        ics->use_kb_window[1]   = ics->use_kb_window[0];
-        ics->use_kb_window[0]   = get_bits1(gb);
-    }
-    ics->num_window_groups  = 1;
-    ics->group_len[0]       = 1;
-    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
-        int i;
-        ics->max_sfb = get_bits(gb, 4);
-        for (i = 0; i < 7; i++) {
-            if (get_bits1(gb)) {
-                ics->group_len[ics->num_window_groups - 1]++;
-            } else {
-                ics->num_window_groups++;
-                ics->group_len[ics->num_window_groups - 1] = 1;
-            }
-        }
-        ics->num_windows       = 8;
-        ics->swb_offset        =    ff_swb_offset_128[sampling_index];
-        ics->num_swb           =   ff_aac_num_swb_128[sampling_index];
-        ics->tns_max_bands     = ff_tns_max_bands_128[sampling_index];
-        ics->predictor_present = 0;
-    } else {
-        ics->max_sfb           = get_bits(gb, 6);
-        ics->num_windows       = 1;
-        if (aot == AOT_ER_AAC_LD || aot == AOT_ER_AAC_ELD) {
-            if (m4ac->frame_length_short) {
-                ics->swb_offset    =     ff_swb_offset_480[sampling_index];
-                ics->num_swb       =    ff_aac_num_swb_480[sampling_index];
-                ics->tns_max_bands =  ff_tns_max_bands_480[sampling_index];
-            } else {
-                ics->swb_offset    =     ff_swb_offset_512[sampling_index];
-                ics->num_swb       =    ff_aac_num_swb_512[sampling_index];
-                ics->tns_max_bands =  ff_tns_max_bands_512[sampling_index];
-            }
-            if (!ics->num_swb || !ics->swb_offset)
-                return AVERROR_BUG;
-        } else {
-            ics->swb_offset    =    ff_swb_offset_1024[sampling_index];
-            ics->num_swb       =   ff_aac_num_swb_1024[sampling_index];
-            ics->tns_max_bands = ff_tns_max_bands_1024[sampling_index];
-        }
-        if (aot != AOT_ER_AAC_ELD) {
-            ics->predictor_present     = get_bits1(gb);
-            ics->predictor_reset_group = 0;
-        }
-        if (ics->predictor_present) {
-            if (aot == AOT_AAC_MAIN) {
-                if (decode_prediction(ac, ics, gb)) {
-                    return AVERROR_INVALIDDATA;
-                }
-            } else if (aot == AOT_AAC_LC ||
-                       aot == AOT_ER_AAC_LC) {
-                av_log(ac->avctx, AV_LOG_ERROR,
-                       "Prediction is not allowed in AAC-LC.\n");
-                return AVERROR_INVALIDDATA;
-            } else {
-                if (aot == AOT_ER_AAC_LD) {
-                    av_log(ac->avctx, AV_LOG_ERROR,
-                           "LTP in ER AAC LD not yet implemented.\n");
-                    return AVERROR_PATCHWELCOME;
-                }
-                if ((ics->ltp.present = get_bits(gb, 1)))
-                    decode_ltp(&ics->ltp, gb, ics->max_sfb);
-            }
-        }
-    }
-
-    if (ics->max_sfb > ics->num_swb) {
-        av_log(ac->avctx, AV_LOG_ERROR,
-               "Number of scalefactor bands in group (%d) "
-               "exceeds limit (%d).\n",
-               ics->max_sfb, ics->num_swb);
-        return AVERROR_INVALIDDATA;
-    }
-
-    return 0;
-}
-
-/**
- * Decode band types (section_data payload); reference: table 4.46.
- *
- * @param   band_type           array of the used band type
- * @param   band_type_run_end   array of the last scalefactor band of a band type run
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int decode_band_types(AACContext *ac, enum BandType band_type[120],
-                             int band_type_run_end[120], GetBitContext *gb,
-                             IndividualChannelStream *ics)
-{
-    int g, idx = 0;
-    const int bits = (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) ? 3 : 5;
-    for (g = 0; g < ics->num_window_groups; g++) {
-        int k = 0;
-        while (k < ics->max_sfb) {
-            uint8_t sect_end = k;
-            int sect_len_incr;
-            int sect_band_type = get_bits(gb, 4);
-            if (sect_band_type == 12) {
-                av_log(ac->avctx, AV_LOG_ERROR, "invalid band type\n");
-                return AVERROR_INVALIDDATA;
-            }
-            do {
-                sect_len_incr = get_bits(gb, bits);
-                sect_end += sect_len_incr;
-                if (get_bits_left(gb) < 0) {
-                    av_log(ac->avctx, AV_LOG_ERROR, overread_err);
-                    return AVERROR_INVALIDDATA;
-                }
-                if (sect_end > ics->max_sfb) {
-                    av_log(ac->avctx, AV_LOG_ERROR,
-                           "Number of bands (%d) exceeds limit (%d).\n",
-                           sect_end, ics->max_sfb);
-                    return AVERROR_INVALIDDATA;
-                }
-            } while (sect_len_incr == (1 << bits) - 1);
-            for (; k < sect_end; k++) {
-                band_type        [idx]   = sect_band_type;
-                band_type_run_end[idx++] = sect_end;
-            }
-        }
-    }
-    return 0;
-}
-
-/**
- * Decode scalefactors; reference: table 4.47.
- *
- * @param   global_gain         first scalefactor value as scalefactors are differentially coded
- * @param   band_type           array of the used band type
- * @param   band_type_run_end   array of the last scalefactor band of a band type run
- * @param   sf                  array of scalefactors or intensity stereo positions
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int decode_scalefactors(AACContext *ac, float sf[120], GetBitContext *gb,
-                               unsigned int global_gain,
-                               IndividualChannelStream *ics,
-                               enum BandType band_type[120],
-                               int band_type_run_end[120])
-{
-    int g, i, idx = 0;
-    int offset[3] = { global_gain, global_gain - 90, 0 };
-    int clipped_offset;
-    int noise_flag = 1;
-    for (g = 0; g < ics->num_window_groups; g++) {
-        for (i = 0; i < ics->max_sfb;) {
-            int run_end = band_type_run_end[idx];
-            if (band_type[idx] == ZERO_BT) {
-                for (; i < run_end; i++, idx++)
-                    sf[idx] = 0.0;
-            } else if ((band_type[idx] == INTENSITY_BT) ||
-                       (band_type[idx] == INTENSITY_BT2)) {
-                for (; i < run_end; i++, idx++) {
-                    offset[2] += get_vlc2(gb, vlc_scalefactors.table, 7, 3) - 60;
-                    clipped_offset = av_clip(offset[2], -155, 100);
-                    if (offset[2] != clipped_offset) {
-                        avpriv_request_sample(ac->avctx,
-                                              "If you heard an audible artifact, there may be a bug in the decoder. "
-                                              "Clipped intensity stereo position (%d -> %d)",
-                                              offset[2], clipped_offset);
-                    }
-                    sf[idx] = ff_aac_pow2sf_tab[-clipped_offset + POW_SF2_ZERO];
-                }
-            } else if (band_type[idx] == NOISE_BT) {
-                for (; i < run_end; i++, idx++) {
-                    if (noise_flag-- > 0)
-                        offset[1] += get_bits(gb, 9) - 256;
-                    else
-                        offset[1] += get_vlc2(gb, vlc_scalefactors.table, 7, 3) - 60;
-                    clipped_offset = av_clip(offset[1], -100, 155);
-                    if (offset[1] != clipped_offset) {
-                        avpriv_request_sample(ac->avctx,
-                                              "If you heard an audible artifact, there may be a bug in the decoder. "
-                                              "Clipped noise gain (%d -> %d)",
-                                              offset[1], clipped_offset);
-                    }
-                    sf[idx] = -ff_aac_pow2sf_tab[clipped_offset + POW_SF2_ZERO];
-                }
-            } else {
-                for (; i < run_end; i++, idx++) {
-                    offset[0] += get_vlc2(gb, vlc_scalefactors.table, 7, 3) - 60;
-                    if (offset[0] > 255U) {
-                        av_log(ac->avctx, AV_LOG_ERROR,
-                               "Scalefactor (%d) out of range.\n", offset[0]);
-                        return AVERROR_INVALIDDATA;
-                    }
-                    sf[idx] = -ff_aac_pow2sf_tab[offset[0] - 100 + POW_SF2_ZERO];
-                }
-            }
-        }
-    }
-    return 0;
-}
-
-/**
- * Decode pulse data; reference: table 4.7.
- */
-static int decode_pulses(Pulse *pulse, GetBitContext *gb,
-                         const uint16_t *swb_offset, int num_swb)
-{
-    int i, pulse_swb;
-    pulse->num_pulse = get_bits(gb, 2) + 1;
-    pulse_swb        = get_bits(gb, 6);
-    if (pulse_swb >= num_swb)
-        return -1;
-    pulse->pos[0]    = swb_offset[pulse_swb];
-    pulse->pos[0]   += get_bits(gb, 5);
-    if (pulse->pos[0] > 1023)
-        return -1;
-    pulse->amp[0]    = get_bits(gb, 4);
-    for (i = 1; i < pulse->num_pulse; i++) {
-        pulse->pos[i] = get_bits(gb, 5) + pulse->pos[i - 1];
-        if (pulse->pos[i] > 1023)
-            return -1;
-        pulse->amp[i] = get_bits(gb, 4);
-    }
-    return 0;
-}
-
-/**
- * Decode Temporal Noise Shaping data; reference: table 4.48.
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int decode_tns(AACContext *ac, TemporalNoiseShaping *tns,
-                      GetBitContext *gb, const IndividualChannelStream *ics)
-{
-    int w, filt, i, coef_len, coef_res, coef_compress;
-    const int is8 = ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE;
-    const int tns_max_order = is8 ? 7 : ac->oc[1].m4ac.object_type == AOT_AAC_MAIN ? 20 : 12;
-    for (w = 0; w < ics->num_windows; w++) {
-        if ((tns->n_filt[w] = get_bits(gb, 2 - is8))) {
-            coef_res = get_bits1(gb);
-
-            for (filt = 0; filt < tns->n_filt[w]; filt++) {
-                int tmp2_idx;
-                tns->length[w][filt] = get_bits(gb, 6 - 2 * is8);
-
-                if ((tns->order[w][filt] = get_bits(gb, 5 - 2 * is8)) > tns_max_order) {
-                    av_log(ac->avctx, AV_LOG_ERROR,
-                           "TNS filter order %d is greater than maximum %d.\n",
-                           tns->order[w][filt], tns_max_order);
-                    tns->order[w][filt] = 0;
-                    return AVERROR_INVALIDDATA;
-                }
-                if (tns->order[w][filt]) {
-                    tns->direction[w][filt] = get_bits1(gb);
-                    coef_compress = get_bits1(gb);
-                    coef_len = coef_res + 3 - coef_compress;
-                    tmp2_idx = 2 * coef_compress + coef_res;
-
-                    for (i = 0; i < tns->order[w][filt]; i++)
-                        tns->coef[w][filt][i] = tns_tmp2_map[tmp2_idx][get_bits(gb, coef_len)];
-                }
-            }
-        }
-    }
-    return 0;
-}
-
-/**
- * Decode Mid/Side data; reference: table 4.54.
- *
- * @param   ms_present  Indicates mid/side stereo presence. [0] mask is all 0s;
- *                      [1] mask is decoded from bitstream; [2] mask is all 1s;
- *                      [3] reserved for scalable AAC
- */
-static void decode_mid_side_stereo(ChannelElement *cpe, GetBitContext *gb,
-                                   int ms_present)
-{
-    int idx;
-    int max_idx = cpe->ch[0].ics.num_window_groups * cpe->ch[0].ics.max_sfb;
-    if (ms_present == 1) {
-        for (idx = 0; idx < max_idx; idx++)
-            cpe->ms_mask[idx] = get_bits1(gb);
-    } else if (ms_present == 2) {
-        memset(cpe->ms_mask, 1, max_idx * sizeof(cpe->ms_mask[0]));
-    }
-}
-
 #ifndef VMUL2
 static inline float *VMUL2(float *dst, const float *v, unsigned idx,
                            const float *scale)
@@ -1538,233 +147,6 @@ static inline float *VMUL4S(float *dst, const float *v, unsigned idx,
 }
 #endif
 
-/**
- * Decode spectral data; reference: table 4.50.
- * Dequantize and scale spectral data; reference: 4.6.3.3.
- *
- * @param   coef            array of dequantized, scaled spectral data
- * @param   sf              array of scalefactors or intensity stereo positions
- * @param   pulse_present   set if pulses are present
- * @param   pulse           pointer to pulse data struct
- * @param   band_type       array of the used band type
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int decode_spectrum_and_dequant(AACContext *ac, float coef[1024],
-                                       GetBitContext *gb, const float sf[120],
-                                       int pulse_present, const Pulse *pulse,
-                                       const IndividualChannelStream *ics,
-                                       enum BandType band_type[120])
-{
-    int i, k, g, idx = 0;
-    const int c = 1024 / ics->num_windows;
-    const uint16_t *offsets = ics->swb_offset;
-    float *coef_base = coef;
-
-    for (g = 0; g < ics->num_windows; g++)
-        memset(coef + g * 128 + offsets[ics->max_sfb], 0,
-               sizeof(float) * (c - offsets[ics->max_sfb]));
-
-    for (g = 0; g < ics->num_window_groups; g++) {
-        unsigned g_len = ics->group_len[g];
-
-        for (i = 0; i < ics->max_sfb; i++, idx++) {
-            const unsigned cbt_m1 = band_type[idx] - 1;
-            float *cfo = coef + offsets[i];
-            int off_len = offsets[i + 1] - offsets[i];
-            int group;
-
-            if (cbt_m1 >= INTENSITY_BT2 - 1) {
-                for (group = 0; group < g_len; group++, cfo+=128) {
-                    memset(cfo, 0, off_len * sizeof(float));
-                }
-            } else if (cbt_m1 == NOISE_BT - 1) {
-                for (group = 0; group < g_len; group++, cfo+=128) {
-                    float scale;
-                    float band_energy;
-
-                    for (k = 0; k < off_len; k++) {
-                        ac->random_state  = lcg_random(ac->random_state);
-                        cfo[k] = ac->random_state;
-                    }
-
-                    band_energy = ac->fdsp.scalarproduct_float(cfo, cfo, off_len);
-                    scale = sf[idx] / sqrtf(band_energy);
-                    ac->fdsp.vector_fmul_scalar(cfo, cfo, scale, off_len);
-                }
-            } else {
-                const float *vq = ff_aac_codebook_vector_vals[cbt_m1];
-                const uint16_t *cb_vector_idx = ff_aac_codebook_vector_idx[cbt_m1];
-                VLC_TYPE (*vlc_tab)[2] = vlc_spectral[cbt_m1].table;
-                OPEN_READER(re, gb);
-
-                switch (cbt_m1 >> 1) {
-                case 0:
-                    for (group = 0; group < g_len; group++, cfo+=128) {
-                        float *cf = cfo;
-                        int len = off_len;
-
-                        do {
-                            int code;
-                            unsigned cb_idx;
-
-                            UPDATE_CACHE(re, gb);
-                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
-                            cb_idx = cb_vector_idx[code];
-                            cf = VMUL4(cf, vq, cb_idx, sf + idx);
-                        } while (len -= 4);
-                    }
-                    break;
-
-                case 1:
-                    for (group = 0; group < g_len; group++, cfo+=128) {
-                        float *cf = cfo;
-                        int len = off_len;
-
-                        do {
-                            int code;
-                            unsigned nnz;
-                            unsigned cb_idx;
-                            uint32_t bits;
-
-                            UPDATE_CACHE(re, gb);
-                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
-                            cb_idx = cb_vector_idx[code];
-                            nnz = cb_idx >> 8 & 15;
-                            bits = nnz ? GET_CACHE(re, gb) : 0;
-                            LAST_SKIP_BITS(re, gb, nnz);
-                            cf = VMUL4S(cf, vq, cb_idx, bits, sf + idx);
-                        } while (len -= 4);
-                    }
-                    break;
-
-                case 2:
-                    for (group = 0; group < g_len; group++, cfo+=128) {
-                        float *cf = cfo;
-                        int len = off_len;
-
-                        do {
-                            int code;
-                            unsigned cb_idx;
-
-                            UPDATE_CACHE(re, gb);
-                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
-                            cb_idx = cb_vector_idx[code];
-                            cf = VMUL2(cf, vq, cb_idx, sf + idx);
-                        } while (len -= 2);
-                    }
-                    break;
-
-                case 3:
-                case 4:
-                    for (group = 0; group < g_len; group++, cfo+=128) {
-                        float *cf = cfo;
-                        int len = off_len;
-
-                        do {
-                            int code;
-                            unsigned nnz;
-                            unsigned cb_idx;
-                            unsigned sign;
-
-                            UPDATE_CACHE(re, gb);
-                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
-                            cb_idx = cb_vector_idx[code];
-                            nnz = cb_idx >> 8 & 15;
-                            sign = nnz ? SHOW_UBITS(re, gb, nnz) << (cb_idx >> 12) : 0;
-                            LAST_SKIP_BITS(re, gb, nnz);
-                            cf = VMUL2S(cf, vq, cb_idx, sign, sf + idx);
-                        } while (len -= 2);
-                    }
-                    break;
-
-                default:
-                    for (group = 0; group < g_len; group++, cfo+=128) {
-                        float *cf = cfo;
-                        uint32_t *icf = (uint32_t *) cf;
-                        int len = off_len;
-
-                        do {
-                            int code;
-                            unsigned nzt, nnz;
-                            unsigned cb_idx;
-                            uint32_t bits;
-                            int j;
-
-                            UPDATE_CACHE(re, gb);
-                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
-
-                            if (!code) {
-                                *icf++ = 0;
-                                *icf++ = 0;
-                                continue;
-                            }
-
-                            cb_idx = cb_vector_idx[code];
-                            nnz = cb_idx >> 12;
-                            nzt = cb_idx >> 8;
-                            bits = SHOW_UBITS(re, gb, nnz) << (32-nnz);
-                            LAST_SKIP_BITS(re, gb, nnz);
-
-                            for (j = 0; j < 2; j++) {
-                                if (nzt & 1<<j) {
-                                    uint32_t b;
-                                    int n;
-                                    /* The total length of escape_sequence must be < 22 bits according
-                                       to the specification (i.e. max is 111111110xxxxxxxxxxxx). */
-                                    UPDATE_CACHE(re, gb);
-                                    b = GET_CACHE(re, gb);
-                                    b = 31 - av_log2(~b);
-
-                                    if (b > 8) {
-                                        av_log(ac->avctx, AV_LOG_ERROR, "error in spectral data, ESC overflow\n");
-                                        return AVERROR_INVALIDDATA;
-                                    }
-
-                                    SKIP_BITS(re, gb, b + 1);
-                                    b += 4;
-                                    n = (1 << b) + SHOW_UBITS(re, gb, b);
-                                    LAST_SKIP_BITS(re, gb, b);
-                                    *icf++ = cbrt_tab[n] | (bits & 1U<<31);
-                                    bits <<= 1;
-                                } else {
-                                    unsigned v = ((const uint32_t*)vq)[cb_idx & 15];
-                                    *icf++ = (bits & 1U<<31) | v;
-                                    bits <<= !!v;
-                                }
-                                cb_idx >>= 4;
-                            }
-                        } while (len -= 2);
-
-                        ac->fdsp.vector_fmul_scalar(cfo, cfo, sf[idx], off_len);
-                    }
-                }
-
-                CLOSE_READER(re, gb);
-            }
-        }
-        coef += g_len << 7;
-    }
-
-    if (pulse_present) {
-        idx = 0;
-        for (i = 0; i < pulse->num_pulse; i++) {
-            float co = coef_base[ pulse->pos[i] ];
-            while (offsets[idx + 1] <= pulse->pos[i])
-                idx++;
-            if (band_type[idx] != NOISE_BT && sf[idx]) {
-                float ico = -pulse->amp[i];
-                if (co) {
-                    co /= sf[idx];
-                    ico = co / sqrtf(sqrtf(fabsf(co))) + (co > 0 ? -ico : ico);
-                }
-                coef_base[ pulse->pos[i] ] = cbrtf(fabsf(ico)) * ico * sf[idx];
-            }
-        }
-    }
-    return 0;
-}
-
 static av_always_inline float flt16_round(float pf)
 {
     union av_intfloat32 tmp;
@@ -1821,738 +203,6 @@ static av_always_inline void predict(PredictorState *ps, float *coef,
 }
 
 /**
- * Apply AAC-Main style frequency domain prediction.
- */
-static void apply_prediction(AACContext *ac, SingleChannelElement *sce)
-{
-    int sfb, k;
-
-    if (!sce->ics.predictor_initialized) {
-        reset_all_predictors(sce->predictor_state);
-        sce->ics.predictor_initialized = 1;
-    }
-
-    if (sce->ics.window_sequence[0] != EIGHT_SHORT_SEQUENCE) {
-        for (sfb = 0;
-             sfb < ff_aac_pred_sfb_max[ac->oc[1].m4ac.sampling_index];
-             sfb++) {
-            for (k = sce->ics.swb_offset[sfb];
-                 k < sce->ics.swb_offset[sfb + 1];
-                 k++) {
-                predict(&sce->predictor_state[k], &sce->coeffs[k],
-                        sce->ics.predictor_present &&
-                        sce->ics.prediction_used[sfb]);
-            }
-        }
-        if (sce->ics.predictor_reset_group)
-            reset_predictor_group(sce->predictor_state,
-                                  sce->ics.predictor_reset_group);
-    } else
-        reset_all_predictors(sce->predictor_state);
-}
-
-/**
- * Decode an individual_channel_stream payload; reference: table 4.44.
- *
- * @param   common_window   Channels have independent [0], or shared [1], Individual Channel Stream information.
- * @param   scale_flag      scalable [1] or non-scalable [0] AAC (Unused until scalable AAC is implemented.)
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int decode_ics(AACContext *ac, SingleChannelElement *sce,
-                      GetBitContext *gb, int common_window, int scale_flag)
-{
-    Pulse pulse;
-    TemporalNoiseShaping    *tns = &sce->tns;
-    IndividualChannelStream *ics = &sce->ics;
-    float *out = sce->coeffs;
-    int global_gain, eld_syntax, er_syntax, pulse_present = 0;
-    int ret;
-
-    eld_syntax = ac->oc[1].m4ac.object_type == AOT_ER_AAC_ELD;
-    er_syntax  = ac->oc[1].m4ac.object_type == AOT_ER_AAC_LC ||
-                 ac->oc[1].m4ac.object_type == AOT_ER_AAC_LTP ||
-                 ac->oc[1].m4ac.object_type == AOT_ER_AAC_LD ||
-                 ac->oc[1].m4ac.object_type == AOT_ER_AAC_ELD;
-
-    /* This assignment is to silence a GCC warning about the variable being used
-     * uninitialized when in fact it always is.
-     */
-    pulse.num_pulse = 0;
-
-    global_gain = get_bits(gb, 8);
-
-    if (!common_window && !scale_flag) {
-        if (decode_ics_info(ac, ics, gb) < 0)
-            return AVERROR_INVALIDDATA;
-    }
-
-    if ((ret = decode_band_types(ac, sce->band_type,
-                                 sce->band_type_run_end, gb, ics)) < 0)
-        return ret;
-    if ((ret = decode_scalefactors(ac, sce->sf, gb, global_gain, ics,
-                                  sce->band_type, sce->band_type_run_end)) < 0)
-        return ret;
-
-    pulse_present = 0;
-    if (!scale_flag) {
-        if (!eld_syntax && (pulse_present = get_bits1(gb))) {
-            if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
-                av_log(ac->avctx, AV_LOG_ERROR,
-                       "Pulse tool not allowed in eight short sequence.\n");
-                return AVERROR_INVALIDDATA;
-            }
-            if (decode_pulses(&pulse, gb, ics->swb_offset, ics->num_swb)) {
-                av_log(ac->avctx, AV_LOG_ERROR,
-                       "Pulse data corrupt or invalid.\n");
-                return AVERROR_INVALIDDATA;
-            }
-        }
-        tns->present = get_bits1(gb);
-        if (tns->present && !er_syntax)
-            if (decode_tns(ac, tns, gb, ics) < 0)
-                return AVERROR_INVALIDDATA;
-        if (!eld_syntax && get_bits1(gb)) {
-            avpriv_request_sample(ac->avctx, "SSR");
-            return AVERROR_PATCHWELCOME;
-        }
-        // I see no textual basis in the spec for this occurring after SSR gain
-        // control, but this is what both reference and real implementations do
-        if (tns->present && er_syntax)
-            if (decode_tns(ac, tns, gb, ics) < 0)
-                return AVERROR_INVALIDDATA;
-    }
-
-    if (decode_spectrum_and_dequant(ac, out, gb, sce->sf, pulse_present,
-                                    &pulse, ics, sce->band_type) < 0)
-        return AVERROR_INVALIDDATA;
-
-    if (ac->oc[1].m4ac.object_type == AOT_AAC_MAIN && !common_window)
-        apply_prediction(ac, sce);
-
-    return 0;
-}
-
-/**
- * Mid/Side stereo decoding; reference: 4.6.8.1.3.
- */
-static void apply_mid_side_stereo(AACContext *ac, ChannelElement *cpe)
-{
-    const IndividualChannelStream *ics = &cpe->ch[0].ics;
-    float *ch0 = cpe->ch[0].coeffs;
-    float *ch1 = cpe->ch[1].coeffs;
-    int g, i, group, idx = 0;
-    const uint16_t *offsets = ics->swb_offset;
-    for (g = 0; g < ics->num_window_groups; g++) {
-        for (i = 0; i < ics->max_sfb; i++, idx++) {
-            if (cpe->ms_mask[idx] &&
-                cpe->ch[0].band_type[idx] < NOISE_BT &&
-                cpe->ch[1].band_type[idx] < NOISE_BT) {
-                for (group = 0; group < ics->group_len[g]; group++) {
-                    ac->fdsp.butterflies_float(ch0 + group * 128 + offsets[i],
-                                               ch1 + group * 128 + offsets[i],
-                                               offsets[i+1] - offsets[i]);
-                }
-            }
-        }
-        ch0 += ics->group_len[g] * 128;
-        ch1 += ics->group_len[g] * 128;
-    }
-}
-
-/**
- * intensity stereo decoding; reference: 4.6.8.2.3
- *
- * @param   ms_present  Indicates mid/side stereo presence. [0] mask is all 0s;
- *                      [1] mask is decoded from bitstream; [2] mask is all 1s;
- *                      [3] reserved for scalable AAC
- */
-static void apply_intensity_stereo(AACContext *ac,
-                                   ChannelElement *cpe, int ms_present)
-{
-    const IndividualChannelStream *ics = &cpe->ch[1].ics;
-    SingleChannelElement         *sce1 = &cpe->ch[1];
-    float *coef0 = cpe->ch[0].coeffs, *coef1 = cpe->ch[1].coeffs;
-    const uint16_t *offsets = ics->swb_offset;
-    int g, group, i, idx = 0;
-    int c;
-    float scale;
-    for (g = 0; g < ics->num_window_groups; g++) {
-        for (i = 0; i < ics->max_sfb;) {
-            if (sce1->band_type[idx] == INTENSITY_BT ||
-                sce1->band_type[idx] == INTENSITY_BT2) {
-                const int bt_run_end = sce1->band_type_run_end[idx];
-                for (; i < bt_run_end; i++, idx++) {
-                    c = -1 + 2 * (sce1->band_type[idx] - 14);
-                    if (ms_present)
-                        c *= 1 - 2 * cpe->ms_mask[idx];
-                    scale = c * sce1->sf[idx];
-                    for (group = 0; group < ics->group_len[g]; group++)
-                        ac->fdsp.vector_fmul_scalar(coef1 + group * 128 + offsets[i],
-                                                    coef0 + group * 128 + offsets[i],
-                                                    scale,
-                                                    offsets[i + 1] - offsets[i]);
-                }
-            } else {
-                int bt_run_end = sce1->band_type_run_end[idx];
-                idx += bt_run_end - i;
-                i    = bt_run_end;
-            }
-        }
-        coef0 += ics->group_len[g] * 128;
-        coef1 += ics->group_len[g] * 128;
-    }
-}
-
-/**
- * Decode a channel_pair_element; reference: table 4.4.
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int decode_cpe(AACContext *ac, GetBitContext *gb, ChannelElement *cpe)
-{
-    int i, ret, common_window, ms_present = 0;
-    int eld_syntax = ac->oc[1].m4ac.object_type == AOT_ER_AAC_ELD;
-
-    common_window = eld_syntax || get_bits1(gb);
-    if (common_window) {
-        if (decode_ics_info(ac, &cpe->ch[0].ics, gb))
-            return AVERROR_INVALIDDATA;
-        i = cpe->ch[1].ics.use_kb_window[0];
-        cpe->ch[1].ics = cpe->ch[0].ics;
-        cpe->ch[1].ics.use_kb_window[1] = i;
-        if (cpe->ch[1].ics.predictor_present &&
-            (ac->oc[1].m4ac.object_type != AOT_AAC_MAIN))
-            if ((cpe->ch[1].ics.ltp.present = get_bits(gb, 1)))
-                decode_ltp(&cpe->ch[1].ics.ltp, gb, cpe->ch[1].ics.max_sfb);
-        ms_present = get_bits(gb, 2);
-        if (ms_present == 3) {
-            av_log(ac->avctx, AV_LOG_ERROR, "ms_present = 3 is reserved.\n");
-            return AVERROR_INVALIDDATA;
-        } else if (ms_present)
-            decode_mid_side_stereo(cpe, gb, ms_present);
-    }
-    if ((ret = decode_ics(ac, &cpe->ch[0], gb, common_window, 0)))
-        return ret;
-    if ((ret = decode_ics(ac, &cpe->ch[1], gb, common_window, 0)))
-        return ret;
-
-    if (common_window) {
-        if (ms_present)
-            apply_mid_side_stereo(ac, cpe);
-        if (ac->oc[1].m4ac.object_type == AOT_AAC_MAIN) {
-            apply_prediction(ac, &cpe->ch[0]);
-            apply_prediction(ac, &cpe->ch[1]);
-        }
-    }
-
-    apply_intensity_stereo(ac, cpe, ms_present);
-    return 0;
-}
-
-static const float cce_scale[] = {
-    1.09050773266525765921, //2^(1/8)
-    1.18920711500272106672, //2^(1/4)
-    M_SQRT2,
-    2,
-};
-
-/**
- * Decode coupling_channel_element; reference: table 4.8.
- *
- * @return  Returns error status. 0 - OK, !0 - error
- */
-static int decode_cce(AACContext *ac, GetBitContext *gb, ChannelElement *che)
-{
-    int num_gain = 0;
-    int c, g, sfb, ret;
-    int sign;
-    float scale;
-    SingleChannelElement *sce = &che->ch[0];
-    ChannelCoupling     *coup = &che->coup;
-
-    coup->coupling_point = 2 * get_bits1(gb);
-    coup->num_coupled = get_bits(gb, 3);
-    for (c = 0; c <= coup->num_coupled; c++) {
-        num_gain++;
-        coup->type[c] = get_bits1(gb) ? TYPE_CPE : TYPE_SCE;
-        coup->id_select[c] = get_bits(gb, 4);
-        if (coup->type[c] == TYPE_CPE) {
-            coup->ch_select[c] = get_bits(gb, 2);
-            if (coup->ch_select[c] == 3)
-                num_gain++;
-        } else
-            coup->ch_select[c] = 2;
-    }
-    coup->coupling_point += get_bits1(gb) || (coup->coupling_point >> 1);
-
-    sign  = get_bits(gb, 1);
-    scale = cce_scale[get_bits(gb, 2)];
-
-    if ((ret = decode_ics(ac, sce, gb, 0, 0)))
-        return ret;
-
-    for (c = 0; c < num_gain; c++) {
-        int idx  = 0;
-        int cge  = 1;
-        int gain = 0;
-        float gain_cache = 1.0;
-        if (c) {
-            cge = coup->coupling_point == AFTER_IMDCT ? 1 : get_bits1(gb);
-            gain = cge ? get_vlc2(gb, vlc_scalefactors.table, 7, 3) - 60: 0;
-            gain_cache = powf(scale, -gain);
-        }
-        if (coup->coupling_point == AFTER_IMDCT) {
-            coup->gain[c][0] = gain_cache;
-        } else {
-            for (g = 0; g < sce->ics.num_window_groups; g++) {
-                for (sfb = 0; sfb < sce->ics.max_sfb; sfb++, idx++) {
-                    if (sce->band_type[idx] != ZERO_BT) {
-                        if (!cge) {
-                            int t = get_vlc2(gb, vlc_scalefactors.table, 7, 3) - 60;
-                            if (t) {
-                                int s = 1;
-                                t = gain += t;
-                                if (sign) {
-                                    s  -= 2 * (t & 0x1);
-                                    t >>= 1;
-                                }
-                                gain_cache = powf(scale, -t) * s;
-                            }
-                        }
-                        coup->gain[c][idx] = gain_cache;
-                    }
-                }
-            }
-        }
-    }
-    return 0;
-}
-
-/**
- * Parse whether channels are to be excluded from Dynamic Range Compression; reference: table 4.53.
- *
- * @return  Returns number of bytes consumed.
- */
-static int decode_drc_channel_exclusions(DynamicRangeControl *che_drc,
-                                         GetBitContext *gb)
-{
-    int i;
-    int num_excl_chan = 0;
-
-    do {
-        for (i = 0; i < 7; i++)
-            che_drc->exclude_mask[num_excl_chan++] = get_bits1(gb);
-    } while (num_excl_chan < MAX_CHANNELS - 7 && get_bits1(gb));
-
-    return num_excl_chan / 7;
-}
-
-/**
- * Decode dynamic range information; reference: table 4.52.
- *
- * @return  Returns number of bytes consumed.
- */
-static int decode_dynamic_range(DynamicRangeControl *che_drc,
-                                GetBitContext *gb)
-{
-    int n             = 1;
-    int drc_num_bands = 1;
-    int i;
-
-    /* pce_tag_present? */
-    if (get_bits1(gb)) {
-        che_drc->pce_instance_tag  = get_bits(gb, 4);
-        skip_bits(gb, 4); // tag_reserved_bits
-        n++;
-    }
-
-    /* excluded_chns_present? */
-    if (get_bits1(gb)) {
-        n += decode_drc_channel_exclusions(che_drc, gb);
-    }
-
-    /* drc_bands_present? */
-    if (get_bits1(gb)) {
-        che_drc->band_incr            = get_bits(gb, 4);
-        che_drc->interpolation_scheme = get_bits(gb, 4);
-        n++;
-        drc_num_bands += che_drc->band_incr;
-        for (i = 0; i < drc_num_bands; i++) {
-            che_drc->band_top[i] = get_bits(gb, 8);
-            n++;
-        }
-    }
-
-    /* prog_ref_level_present? */
-    if (get_bits1(gb)) {
-        che_drc->prog_ref_level = get_bits(gb, 7);
-        skip_bits1(gb); // prog_ref_level_reserved_bits
-        n++;
-    }
-
-    for (i = 0; i < drc_num_bands; i++) {
-        che_drc->dyn_rng_sgn[i] = get_bits1(gb);
-        che_drc->dyn_rng_ctl[i] = get_bits(gb, 7);
-        n++;
-    }
-
-    return n;
-}
-
-/**
- * Decode extension data (incomplete); reference: table 4.51.
- *
- * @param   cnt length of TYPE_FIL syntactic element in bytes
- *
- * @return Returns number of bytes consumed
- */
-static int decode_extension_payload(AACContext *ac, GetBitContext *gb, int cnt,
-                                    ChannelElement *che, enum RawDataBlockType elem_type)
-{
-    int crc_flag = 0;
-    int res = cnt;
-    switch (get_bits(gb, 4)) { // extension type
-    case EXT_SBR_DATA_CRC:
-        crc_flag++;
-    case EXT_SBR_DATA:
-        if (!che) {
-            av_log(ac->avctx, AV_LOG_ERROR, "SBR was found before the first channel element.\n");
-            return res;
-        } else if (!ac->oc[1].m4ac.sbr) {
-            av_log(ac->avctx, AV_LOG_ERROR, "SBR signaled to be not-present but was found in the bitstream.\n");
-            skip_bits_long(gb, 8 * cnt - 4);
-            return res;
-        } else if (ac->oc[1].m4ac.sbr == -1 && ac->oc[1].status == OC_LOCKED) {
-            av_log(ac->avctx, AV_LOG_ERROR, "Implicit SBR was found with a first occurrence after the first frame.\n");
-            skip_bits_long(gb, 8 * cnt - 4);
-            return res;
-        } else if (ac->oc[1].m4ac.ps == -1 && ac->oc[1].status < OC_LOCKED && ac->avctx->channels == 1) {
-            ac->oc[1].m4ac.sbr = 1;
-            ac->oc[1].m4ac.ps = 1;
-            ac->avctx->profile = FF_PROFILE_AAC_HE_V2;
-            output_configure(ac, ac->oc[1].layout_map, ac->oc[1].layout_map_tags,
-                             ac->oc[1].status, 1);
-        } else {
-            ac->oc[1].m4ac.sbr = 1;
-            ac->avctx->profile = FF_PROFILE_AAC_HE;
-        }
-        res = ff_decode_sbr_extension(ac, &che->sbr, gb, crc_flag, cnt, elem_type);
-        break;
-    case EXT_DYNAMIC_RANGE:
-        res = decode_dynamic_range(&ac->che_drc, gb);
-        break;
-    case EXT_FILL:
-    case EXT_FILL_DATA:
-    case EXT_DATA_ELEMENT:
-    default:
-        skip_bits_long(gb, 8 * cnt - 4);
-        break;
-    };
-    return res;
-}
-
-/**
- * Decode Temporal Noise Shaping filter coefficients and apply all-pole filters; reference: 4.6.9.3.
- *
- * @param   decode  1 if tool is used normally, 0 if tool is used in LTP.
- * @param   coef    spectral coefficients
- */
-static void apply_tns(float coef[1024], TemporalNoiseShaping *tns,
-                      IndividualChannelStream *ics, int decode)
-{
-    const int mmm = FFMIN(ics->tns_max_bands, ics->max_sfb);
-    int w, filt, m, i;
-    int bottom, top, order, start, end, size, inc;
-    float lpc[TNS_MAX_ORDER];
-    float tmp[TNS_MAX_ORDER + 1];
-
-    for (w = 0; w < ics->num_windows; w++) {
-        bottom = ics->num_swb;
-        for (filt = 0; filt < tns->n_filt[w]; filt++) {
-            top    = bottom;
-            bottom = FFMAX(0, top - tns->length[w][filt]);
-            order  = tns->order[w][filt];
-            if (order == 0)
-                continue;
-
-            // tns_decode_coef
-            compute_lpc_coefs(tns->coef[w][filt], order, lpc, 0, 0, 0);
-
-            start = ics->swb_offset[FFMIN(bottom, mmm)];
-            end   = ics->swb_offset[FFMIN(   top, mmm)];
-            if ((size = end - start) <= 0)
-                continue;
-            if (tns->direction[w][filt]) {
-                inc = -1;
-                start = end - 1;
-            } else {
-                inc = 1;
-            }
-            start += w * 128;
-
-            if (decode) {
-                // ar filter
-                for (m = 0; m < size; m++, start += inc)
-                    for (i = 1; i <= FFMIN(m, order); i++)
-                        coef[start] -= coef[start - i * inc] * lpc[i - 1];
-            } else {
-                // ma filter
-                for (m = 0; m < size; m++, start += inc) {
-                    tmp[0] = coef[start];
-                    for (i = 1; i <= FFMIN(m, order); i++)
-                        coef[start] += tmp[i] * lpc[i - 1];
-                    for (i = order; i > 0; i--)
-                        tmp[i] = tmp[i - 1];
-                }
-            }
-        }
-    }
-}
-
-/**
- *  Apply windowing and MDCT to obtain the spectral
- *  coefficient from the predicted sample by LTP.
- */
-static void windowing_and_mdct_ltp(AACContext *ac, float *out,
-                                   float *in, IndividualChannelStream *ics)
-{
-    const float *lwindow      = ics->use_kb_window[0] ? ff_aac_kbd_long_1024 : ff_sine_1024;
-    const float *swindow      = ics->use_kb_window[0] ? ff_aac_kbd_short_128 : ff_sine_128;
-    const float *lwindow_prev = ics->use_kb_window[1] ? ff_aac_kbd_long_1024 : ff_sine_1024;
-    const float *swindow_prev = ics->use_kb_window[1] ? ff_aac_kbd_short_128 : ff_sine_128;
-
-    if (ics->window_sequence[0] != LONG_STOP_SEQUENCE) {
-        ac->fdsp.vector_fmul(in, in, lwindow_prev, 1024);
-    } else {
-        memset(in, 0, 448 * sizeof(float));
-        ac->fdsp.vector_fmul(in + 448, in + 448, swindow_prev, 128);
-    }
-    if (ics->window_sequence[0] != LONG_START_SEQUENCE) {
-        ac->fdsp.vector_fmul_reverse(in + 1024, in + 1024, lwindow, 1024);
-    } else {
-        ac->fdsp.vector_fmul_reverse(in + 1024 + 448, in + 1024 + 448, swindow, 128);
-        memset(in + 1024 + 576, 0, 448 * sizeof(float));
-    }
-    ac->mdct_ltp.mdct_calc(&ac->mdct_ltp, out, in);
-}
-
-/**
- * Apply the long term prediction
- */
-static void apply_ltp(AACContext *ac, SingleChannelElement *sce)
-{
-    const LongTermPrediction *ltp = &sce->ics.ltp;
-    const uint16_t *offsets = sce->ics.swb_offset;
-    int i, sfb;
-
-    if (sce->ics.window_sequence[0] != EIGHT_SHORT_SEQUENCE) {
-        float *predTime = sce->ret;
-        float *predFreq = ac->buf_mdct;
-        int16_t num_samples = 2048;
-
-        if (ltp->lag < 1024)
-            num_samples = ltp->lag + 1024;
-        for (i = 0; i < num_samples; i++)
-            predTime[i] = sce->ltp_state[i + 2048 - ltp->lag] * ltp->coef;
-        memset(&predTime[i], 0, (2048 - i) * sizeof(float));
-
-        windowing_and_mdct_ltp(ac, predFreq, predTime, &sce->ics);
-
-        if (sce->tns.present)
-            apply_tns(predFreq, &sce->tns, &sce->ics, 0);
-
-        for (sfb = 0; sfb < FFMIN(sce->ics.max_sfb, MAX_LTP_LONG_SFB); sfb++)
-            if (ltp->used[sfb])
-                for (i = offsets[sfb]; i < offsets[sfb + 1]; i++)
-                    sce->coeffs[i] += predFreq[i];
-    }
-}
-
-/**
- * Update the LTP buffer for next frame
- */
-static void update_ltp(AACContext *ac, SingleChannelElement *sce)
-{
-    IndividualChannelStream *ics = &sce->ics;
-    float *saved     = sce->saved;
-    float *saved_ltp = sce->coeffs;
-    const float *lwindow = ics->use_kb_window[0] ? ff_aac_kbd_long_1024 : ff_sine_1024;
-    const float *swindow = ics->use_kb_window[0] ? ff_aac_kbd_short_128 : ff_sine_128;
-    int i;
-
-    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
-        memcpy(saved_ltp,       saved, 512 * sizeof(float));
-        memset(saved_ltp + 576, 0,     448 * sizeof(float));
-        ac->fdsp.vector_fmul_reverse(saved_ltp + 448, ac->buf_mdct + 960,     &swindow[64],      64);
-        for (i = 0; i < 64; i++)
-            saved_ltp[i + 512] = ac->buf_mdct[1023 - i] * swindow[63 - i];
-    } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) {
-        memcpy(saved_ltp,       ac->buf_mdct + 512, 448 * sizeof(float));
-        memset(saved_ltp + 576, 0,                  448 * sizeof(float));
-        ac->fdsp.vector_fmul_reverse(saved_ltp + 448, ac->buf_mdct + 960,     &swindow[64],      64);
-        for (i = 0; i < 64; i++)
-            saved_ltp[i + 512] = ac->buf_mdct[1023 - i] * swindow[63 - i];
-    } else { // LONG_STOP or ONLY_LONG
-        ac->fdsp.vector_fmul_reverse(saved_ltp,       ac->buf_mdct + 512,     &lwindow[512],     512);
-        for (i = 0; i < 512; i++)
-            saved_ltp[i + 512] = ac->buf_mdct[1023 - i] * lwindow[511 - i];
-    }
-
-    memcpy(sce->ltp_state,      sce->ltp_state+1024, 1024 * sizeof(*sce->ltp_state));
-    memcpy(sce->ltp_state+1024, sce->ret,            1024 * sizeof(*sce->ltp_state));
-    memcpy(sce->ltp_state+2048, saved_ltp,           1024 * sizeof(*sce->ltp_state));
-}
-
-/**
- * Conduct IMDCT and windowing.
- */
-static void imdct_and_windowing(AACContext *ac, SingleChannelElement *sce)
-{
-    IndividualChannelStream *ics = &sce->ics;
-    float *in    = sce->coeffs;
-    float *out   = sce->ret;
-    float *saved = sce->saved;
-    const float *swindow      = ics->use_kb_window[0] ? ff_aac_kbd_short_128 : ff_sine_128;
-    const float *lwindow_prev = ics->use_kb_window[1] ? ff_aac_kbd_long_1024 : ff_sine_1024;
-    const float *swindow_prev = ics->use_kb_window[1] ? ff_aac_kbd_short_128 : ff_sine_128;
-    float *buf  = ac->buf_mdct;
-    float *temp = ac->temp;
-    int i;
-
-    // imdct
-    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
-        for (i = 0; i < 1024; i += 128)
-            ac->mdct_small.imdct_half(&ac->mdct_small, buf + i, in + i);
-    } else
-        ac->mdct.imdct_half(&ac->mdct, buf, in);
-
-    /* window overlapping
-     * NOTE: To simplify the overlapping code, all 'meaningless' short to long
-     * and long to short transitions are considered to be short to short
-     * transitions. This leaves just two cases (long to long and short to short)
-     * with a little special sauce for EIGHT_SHORT_SEQUENCE.
-     */
-    if ((ics->window_sequence[1] == ONLY_LONG_SEQUENCE || ics->window_sequence[1] == LONG_STOP_SEQUENCE) &&
-            (ics->window_sequence[0] == ONLY_LONG_SEQUENCE || ics->window_sequence[0] == LONG_START_SEQUENCE)) {
-        ac->fdsp.vector_fmul_window(    out,               saved,            buf,         lwindow_prev, 512);
-    } else {
-        memcpy(                         out,               saved,            448 * sizeof(float));
-
-        if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
-            ac->fdsp.vector_fmul_window(out + 448 + 0*128, saved + 448,      buf + 0*128, swindow_prev, 64);
-            ac->fdsp.vector_fmul_window(out + 448 + 1*128, buf + 0*128 + 64, buf + 1*128, swindow,      64);
-            ac->fdsp.vector_fmul_window(out + 448 + 2*128, buf + 1*128 + 64, buf + 2*128, swindow,      64);
-            ac->fdsp.vector_fmul_window(out + 448 + 3*128, buf + 2*128 + 64, buf + 3*128, swindow,      64);
-            ac->fdsp.vector_fmul_window(temp,              buf + 3*128 + 64, buf + 4*128, swindow,      64);
-            memcpy(                     out + 448 + 4*128, temp, 64 * sizeof(float));
-        } else {
-            ac->fdsp.vector_fmul_window(out + 448,         saved + 448,      buf,         swindow_prev, 64);
-            memcpy(                     out + 576,         buf + 64,         448 * sizeof(float));
-        }
-    }
-
-    // buffer update
-    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
-        memcpy(                     saved,       temp + 64,         64 * sizeof(float));
-        ac->fdsp.vector_fmul_window(saved + 64,  buf + 4*128 + 64, buf + 5*128, swindow, 64);
-        ac->fdsp.vector_fmul_window(saved + 192, buf + 5*128 + 64, buf + 6*128, swindow, 64);
-        ac->fdsp.vector_fmul_window(saved + 320, buf + 6*128 + 64, buf + 7*128, swindow, 64);
-        memcpy(                     saved + 448, buf + 7*128 + 64,  64 * sizeof(float));
-    } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) {
-        memcpy(                     saved,       buf + 512,        448 * sizeof(float));
-        memcpy(                     saved + 448, buf + 7*128 + 64,  64 * sizeof(float));
-    } else { // LONG_STOP or ONLY_LONG
-        memcpy(                     saved,       buf + 512,        512 * sizeof(float));
-    }
-}
-
-static void imdct_and_windowing_ld(AACContext *ac, SingleChannelElement *sce)
-{
-    IndividualChannelStream *ics = &sce->ics;
-    float *in    = sce->coeffs;
-    float *out   = sce->ret;
-    float *saved = sce->saved;
-    float *buf  = ac->buf_mdct;
-
-    // imdct
-    ac->mdct.imdct_half(&ac->mdct_ld, buf, in);
-
-    // window overlapping
-    if (ics->use_kb_window[1]) {
-        // AAC LD uses a low overlap sine window instead of a KBD window
-        memcpy(out, saved, 192 * sizeof(float));
-        ac->fdsp.vector_fmul_window(out + 192, saved + 192, buf, ff_sine_128, 64);
-        memcpy(                     out + 320, buf + 64, 192 * sizeof(float));
-    } else {
-        ac->fdsp.vector_fmul_window(out, saved, buf, ff_sine_512, 256);
-    }
-
-    // buffer update
-    memcpy(saved, buf + 256, 256 * sizeof(float));
-}
-
-static void imdct_and_windowing_eld(AACContext *ac, SingleChannelElement *sce)
-{
-    float *in    = sce->coeffs;
-    float *out   = sce->ret;
-    float *saved = sce->saved;
-    float *buf  = ac->buf_mdct;
-    int i;
-    const int n  = ac->oc[1].m4ac.frame_length_short ? 480 : 512;
-    const int n2 = n >> 1;
-    const int n4 = n >> 2;
-    const float *const window = n == 480 ? ff_aac_eld_window_480 :
-                                           ff_aac_eld_window_512;
-
-    // Inverse transform, mapped to the conventional IMDCT by
-    // Chivukula, R.K.; Reznik, Y.A.; Devarajan, V.,
-    // "Efficient algorithms for MPEG-4 AAC-ELD, AAC-LD and AAC-LC filterbanks,"
-    // Audio, Language and Image Processing, 2008. ICALIP 2008. International Conference on
-    // URL: http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4590245&isnumber=4589950
-    for (i = 0; i < n2; i+=2) {
-        float temp;
-        temp =  in[i    ]; in[i    ] = -in[n - 1 - i]; in[n - 1 - i] = temp;
-        temp = -in[i + 1]; in[i + 1] =  in[n - 2 - i]; in[n - 2 - i] = temp;
-    }
-    if (n == 480)
-        ac->mdct480->imdct_half(ac->mdct480, buf, in, 1, -1.f/(16*1024*960));
-    else
-        ac->mdct.imdct_half(&ac->mdct_ld, buf, in);
-    for (i = 0; i < n; i+=2) {
-        buf[i] = -buf[i];
-    }
-    // Like with the regular IMDCT at this point we still have the middle half
-    // of a transform but with even symmetry on the left and odd symmetry on
-    // the right
-
-    // window overlapping
-    // The spec says to use samples [0..511] but the reference decoder uses
-    // samples [128..639].
-    for (i = n4; i < n2; i ++) {
-        out[i - n4] =    buf[n2 - 1 - i]       * window[i       - n4] +
-                       saved[      i + n2]     * window[i +   n - n4] +
-                      -saved[  n + n2 - 1 - i] * window[i + 2*n - n4] +
-                      -saved[2*n + n2 + i]     * window[i + 3*n - n4];
-    }
-    for (i = 0; i < n2; i ++) {
-        out[n4 + i] =    buf[i]               * window[i + n2       - n4] +
-                      -saved[      n - 1 - i] * window[i + n2 +   n - n4] +
-                      -saved[  n + i]         * window[i + n2 + 2*n - n4] +
-                       saved[2*n + n - 1 - i] * window[i + n2 + 3*n - n4];
-    }
-    for (i = 0; i < n4; i ++) {
-        out[n2 + n4 + i] =    buf[      i + n2]     * window[i +   n - n4] +
-                           -saved[      n2 - 1 - i] * window[i + 2*n - n4] +
-                           -saved[  n + n2 + i]     * window[i + 3*n - n4];
-    }
-
-    // buffer update
-    memmove(saved + n, saved, 2 * n * sizeof(float));
-    memcpy( saved,       buf,     n * sizeof(float));
-}
-
-/**
  * Apply dependent channel coupling (applied before IMDCT).
  *
  * @param   index   index into coupling gain array
@@ -2607,441 +257,7 @@ static void apply_independent_coupling(AACContext *ac,
         dest[i] += gain * src[i];
 }
 
-/**
- * channel coupling transformation interface
- *
- * @param   apply_coupling_method   pointer to (in)dependent coupling function
- */
-static void apply_channel_coupling(AACContext *ac, ChannelElement *cc,
-                                   enum RawDataBlockType type, int elem_id,
-                                   enum CouplingPoint coupling_point,
-                                   void (*apply_coupling_method)(AACContext *ac, SingleChannelElement *target, ChannelElement *cce, int index))
-{
-    int i, c;
-
-    for (i = 0; i < MAX_ELEM_ID; i++) {
-        ChannelElement *cce = ac->che[TYPE_CCE][i];
-        int index = 0;
-
-        if (cce && cce->coup.coupling_point == coupling_point) {
-            ChannelCoupling *coup = &cce->coup;
-
-            for (c = 0; c <= coup->num_coupled; c++) {
-                if (coup->type[c] == type && coup->id_select[c] == elem_id) {
-                    if (coup->ch_select[c] != 1) {
-                        apply_coupling_method(ac, &cc->ch[0], cce, index);
-                        if (coup->ch_select[c] != 0)
-                            index++;
-                    }
-                    if (coup->ch_select[c] != 2)
-                        apply_coupling_method(ac, &cc->ch[1], cce, index++);
-                } else
-                    index += 1 + (coup->ch_select[c] == 3);
-            }
-        }
-    }
-}
-
-/**
- * Convert spectral data to float samples, applying all supported tools as appropriate.
- */
-static void spectral_to_sample(AACContext *ac)
-{
-    int i, type;
-    void (*imdct_and_window)(AACContext *ac, SingleChannelElement *sce);
-    switch (ac->oc[1].m4ac.object_type) {
-    case AOT_ER_AAC_LD:
-        imdct_and_window = imdct_and_windowing_ld;
-        break;
-    case AOT_ER_AAC_ELD:
-        imdct_and_window = imdct_and_windowing_eld;
-        break;
-    default:
-        imdct_and_window = imdct_and_windowing;
-    }
-    for (type = 3; type >= 0; type--) {
-        for (i = 0; i < MAX_ELEM_ID; i++) {
-            ChannelElement *che = ac->che[type][i];
-            if (che) {
-                if (type <= TYPE_CPE)
-                    apply_channel_coupling(ac, che, type, i, BEFORE_TNS, apply_dependent_coupling);
-                if (ac->oc[1].m4ac.object_type == AOT_AAC_LTP) {
-                    if (che->ch[0].ics.predictor_present) {
-                        if (che->ch[0].ics.ltp.present)
-                            apply_ltp(ac, &che->ch[0]);
-                        if (che->ch[1].ics.ltp.present && type == TYPE_CPE)
-                            apply_ltp(ac, &che->ch[1]);
-                    }
-                }
-                if (che->ch[0].tns.present)
-                    apply_tns(che->ch[0].coeffs, &che->ch[0].tns, &che->ch[0].ics, 1);
-                if (che->ch[1].tns.present)
-                    apply_tns(che->ch[1].coeffs, &che->ch[1].tns, &che->ch[1].ics, 1);
-                if (type <= TYPE_CPE)
-                    apply_channel_coupling(ac, che, type, i, BETWEEN_TNS_AND_IMDCT, apply_dependent_coupling);
-                if (type != TYPE_CCE || che->coup.coupling_point == AFTER_IMDCT) {
-                    imdct_and_window(ac, &che->ch[0]);
-                    if (ac->oc[1].m4ac.object_type == AOT_AAC_LTP)
-                        update_ltp(ac, &che->ch[0]);
-                    if (type == TYPE_CPE) {
-                        imdct_and_window(ac, &che->ch[1]);
-                        if (ac->oc[1].m4ac.object_type == AOT_AAC_LTP)
-                            update_ltp(ac, &che->ch[1]);
-                    }
-                    if (ac->oc[1].m4ac.sbr > 0) {
-                        ff_sbr_apply(ac, &che->sbr, type, che->ch[0].ret, che->ch[1].ret);
-                    }
-                }
-                if (type <= TYPE_CCE)
-                    apply_channel_coupling(ac, che, type, i, AFTER_IMDCT, apply_independent_coupling);
-            }
-        }
-    }
-}
-
-static int parse_adts_frame_header(AACContext *ac, GetBitContext *gb)
-{
-    int size;
-    AACADTSHeaderInfo hdr_info;
-    uint8_t layout_map[MAX_ELEM_ID*4][3];
-    int layout_map_tags, ret;
-
-    size = avpriv_aac_parse_header(gb, &hdr_info);
-    if (size > 0) {
-        if (hdr_info.num_aac_frames != 1) {
-            avpriv_report_missing_feature(ac->avctx,
-                                          "More than one AAC RDB per ADTS frame");
-            return AVERROR_PATCHWELCOME;
-        }
-        push_output_configuration(ac);
-        if (hdr_info.chan_config) {
-            ac->oc[1].m4ac.chan_config = hdr_info.chan_config;
-            if ((ret = set_default_channel_config(ac->avctx,
-                                                  layout_map,
-                                                  &layout_map_tags,
-                                                  hdr_info.chan_config)) < 0)
-                return ret;
-            if ((ret = output_configure(ac, layout_map, layout_map_tags,
-                                        FFMAX(ac->oc[1].status,
-                                              OC_TRIAL_FRAME), 0)) < 0)
-                return ret;
-        } else {
-            ac->oc[1].m4ac.chan_config = 0;
-        }
-        ac->oc[1].m4ac.sample_rate     = hdr_info.sample_rate;
-        ac->oc[1].m4ac.sampling_index  = hdr_info.sampling_index;
-        ac->oc[1].m4ac.object_type     = hdr_info.object_type;
-        ac->oc[1].m4ac.frame_length_short = 0;
-        if (ac->oc[0].status != OC_LOCKED ||
-            ac->oc[0].m4ac.chan_config != hdr_info.chan_config ||
-            ac->oc[0].m4ac.sample_rate != hdr_info.sample_rate) {
-            ac->oc[1].m4ac.sbr = -1;
-            ac->oc[1].m4ac.ps  = -1;
-        }
-        if (!hdr_info.crc_absent)
-            skip_bits(gb, 16);
-    }
-    return size;
-}
-
-static int aac_decode_er_frame(AVCodecContext *avctx, void *data,
-                               int *got_frame_ptr, GetBitContext *gb)
-{
-    AACContext *ac = avctx->priv_data;
-    const MPEG4AudioConfig *const m4ac = &ac->oc[1].m4ac;
-    ChannelElement *che;
-    int err, i;
-    int samples = m4ac->frame_length_short ? 960 : 1024;
-    int chan_config = m4ac->chan_config;
-    int aot = m4ac->object_type;
-
-    if (aot == AOT_ER_AAC_LD || aot == AOT_ER_AAC_ELD)
-        samples >>= 1;
-
-    ac->frame = data;
-
-    if ((err = frame_configure_elements(avctx)) < 0)
-        return err;
-
-    // The FF_PROFILE_AAC_* defines are all object_type - 1
-    // This may lead to an undefined profile being signaled
-    ac->avctx->profile = aot - 1;
-
-    ac->tags_mapped = 0;
-
-    if (chan_config < 0 || (chan_config >= 8 && chan_config < 11) || chan_config >= 13) {
-        avpriv_request_sample(avctx, "Unknown ER channel configuration %d",
-                              chan_config);
-        return AVERROR_INVALIDDATA;
-    }
-    for (i = 0; i < tags_per_config[chan_config]; i++) {
-        const int elem_type = aac_channel_layout_map[chan_config-1][i][0];
-        const int elem_id   = aac_channel_layout_map[chan_config-1][i][1];
-        if (!(che=get_che(ac, elem_type, elem_id))) {
-            av_log(ac->avctx, AV_LOG_ERROR,
-                   "channel element %d.%d is not allocated\n",
-                   elem_type, elem_id);
-            return AVERROR_INVALIDDATA;
-        }
-        if (aot != AOT_ER_AAC_ELD)
-            skip_bits(gb, 4);
-        switch (elem_type) {
-        case TYPE_SCE:
-            err = decode_ics(ac, &che->ch[0], gb, 0, 0);
-            break;
-        case TYPE_CPE:
-            err = decode_cpe(ac, gb, che);
-            break;
-        case TYPE_LFE:
-            err = decode_ics(ac, &che->ch[0], gb, 0, 0);
-            break;
-        }
-        if (err < 0)
-            return err;
-    }
-
-    spectral_to_sample(ac);
-
-    ac->frame->nb_samples = samples;
-    ac->frame->sample_rate = avctx->sample_rate;
-    *got_frame_ptr = 1;
-
-    skip_bits_long(gb, get_bits_left(gb));
-    return 0;
-}
-
-static int aac_decode_frame_int(AVCodecContext *avctx, void *data,
-                                int *got_frame_ptr, GetBitContext *gb)
-{
-    AACContext *ac = avctx->priv_data;
-    ChannelElement *che = NULL, *che_prev = NULL;
-    enum RawDataBlockType elem_type, elem_type_prev = TYPE_END;
-    int err, elem_id;
-    int samples = 0, multiplier, audio_found = 0, pce_found = 0;
-
-    ac->frame = data;
-
-    if (show_bits(gb, 12) == 0xfff) {
-        if ((err = parse_adts_frame_header(ac, gb)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "Error decoding AAC frame header.\n");
-            goto fail;
-        }
-        if (ac->oc[1].m4ac.sampling_index > 12) {
-            av_log(ac->avctx, AV_LOG_ERROR, "invalid sampling rate index %d\n", ac->oc[1].m4ac.sampling_index);
-            err = AVERROR_INVALIDDATA;
-            goto fail;
-        }
-    }
-
-    if (avctx->channels)
-        if ((err = frame_configure_elements(avctx)) < 0)
-            goto fail;
-
-    // The FF_PROFILE_AAC_* defines are all object_type - 1
-    // This may lead to an undefined profile being signaled
-    ac->avctx->profile = ac->oc[1].m4ac.object_type - 1;
-
-    ac->tags_mapped = 0;
-    // parse
-    while ((elem_type = get_bits(gb, 3)) != TYPE_END) {
-        elem_id = get_bits(gb, 4);
-
-        if (!avctx->channels && elem_type != TYPE_PCE) {
-            err = AVERROR_INVALIDDATA;
-            goto fail;
-        }
-
-        if (elem_type < TYPE_DSE) {
-            if (!(che=get_che(ac, elem_type, elem_id))) {
-                av_log(ac->avctx, AV_LOG_ERROR, "channel element %d.%d is not allocated\n",
-                       elem_type, elem_id);
-                err = AVERROR_INVALIDDATA;
-                goto fail;
-            }
-            samples = 1024;
-        }
-
-        switch (elem_type) {
-
-        case TYPE_SCE:
-            err = decode_ics(ac, &che->ch[0], gb, 0, 0);
-            audio_found = 1;
-            break;
-
-        case TYPE_CPE:
-            err = decode_cpe(ac, gb, che);
-            audio_found = 1;
-            break;
-
-        case TYPE_CCE:
-            err = decode_cce(ac, gb, che);
-            break;
-
-        case TYPE_LFE:
-            err = decode_ics(ac, &che->ch[0], gb, 0, 0);
-            audio_found = 1;
-            break;
-
-        case TYPE_DSE:
-            err = skip_data_stream_element(ac, gb);
-            break;
-
-        case TYPE_PCE: {
-            uint8_t layout_map[MAX_ELEM_ID*4][3];
-            int tags;
-            push_output_configuration(ac);
-            tags = decode_pce(avctx, &ac->oc[1].m4ac, layout_map, gb);
-            if (tags < 0) {
-                err = tags;
-                break;
-            }
-            if (pce_found) {
-                av_log(avctx, AV_LOG_ERROR,
-                       "Not evaluating a further program_config_element as this construct is dubious at best.\n");
-                pop_output_configuration(ac);
-            } else {
-                err = output_configure(ac, layout_map, tags, OC_TRIAL_PCE, 1);
-                pce_found = 1;
-            }
-            break;
-        }
-
-        case TYPE_FIL:
-            if (elem_id == 15)
-                elem_id += get_bits(gb, 8) - 1;
-            if (get_bits_left(gb) < 8 * elem_id) {
-                    av_log(avctx, AV_LOG_ERROR, overread_err);
-                    err = AVERROR_INVALIDDATA;
-                    goto fail;
-            }
-            while (elem_id > 0)
-                elem_id -= decode_extension_payload(ac, gb, elem_id, che_prev, elem_type_prev);
-            err = 0; /* FIXME */
-            break;
-
-        default:
-            err = AVERROR_BUG; /* should not happen, but keeps compiler happy */
-            break;
-        }
-
-        che_prev       = che;
-        elem_type_prev = elem_type;
-
-        if (err)
-            goto fail;
-
-        if (get_bits_left(gb) < 3) {
-            av_log(avctx, AV_LOG_ERROR, overread_err);
-            err = AVERROR_INVALIDDATA;
-            goto fail;
-        }
-    }
-
-    if (!avctx->channels) {
-        *got_frame_ptr = 0;
-        return 0;
-    }
-
-    spectral_to_sample(ac);
-
-    multiplier = (ac->oc[1].m4ac.sbr == 1) ? ac->oc[1].m4ac.ext_sample_rate > ac->oc[1].m4ac.sample_rate : 0;
-    samples <<= multiplier;
-
-    if (ac->oc[1].status && audio_found) {
-        avctx->sample_rate = ac->oc[1].m4ac.sample_rate << multiplier;
-        avctx->frame_size = samples;
-        ac->oc[1].status = OC_LOCKED;
-    }
-
-    if (samples) {
-        ac->frame->nb_samples = samples;
-        ac->frame->sample_rate = avctx->sample_rate;
-    }
-    *got_frame_ptr = !!samples;
-
-    return 0;
-fail:
-    pop_output_configuration(ac);
-    return err;
-}
-
-static int aac_decode_frame(AVCodecContext *avctx, void *data,
-                            int *got_frame_ptr, AVPacket *avpkt)
-{
-    AACContext *ac = avctx->priv_data;
-    const uint8_t *buf = avpkt->data;
-    int buf_size = avpkt->size;
-    GetBitContext gb;
-    int buf_consumed;
-    int buf_offset;
-    int err;
-    int new_extradata_size;
-    const uint8_t *new_extradata = av_packet_get_side_data(avpkt,
-                                       AV_PKT_DATA_NEW_EXTRADATA,
-                                       &new_extradata_size);
-
-    if (new_extradata) {
-        av_free(avctx->extradata);
-        avctx->extradata = av_mallocz(new_extradata_size +
-                                      AV_INPUT_BUFFER_PADDING_SIZE);
-        if (!avctx->extradata)
-            return AVERROR(ENOMEM);
-        avctx->extradata_size = new_extradata_size;
-        memcpy(avctx->extradata, new_extradata, new_extradata_size);
-        push_output_configuration(ac);
-        if (decode_audio_specific_config(ac, ac->avctx, &ac->oc[1].m4ac,
-                                         avctx->extradata,
-                                         avctx->extradata_size*8, 1) < 0) {
-            pop_output_configuration(ac);
-            return AVERROR_INVALIDDATA;
-        }
-    }
-
-    if ((err = init_get_bits(&gb, buf, buf_size * 8)) < 0)
-        return err;
-
-    switch (ac->oc[1].m4ac.object_type) {
-    case AOT_ER_AAC_LC:
-    case AOT_ER_AAC_LTP:
-    case AOT_ER_AAC_LD:
-    case AOT_ER_AAC_ELD:
-        err = aac_decode_er_frame(avctx, data, got_frame_ptr, &gb);
-        break;
-    default:
-        err = aac_decode_frame_int(avctx, data, got_frame_ptr, &gb);
-    }
-    if (err < 0)
-        return err;
-
-    buf_consumed = (get_bits_count(&gb) + 7) >> 3;
-    for (buf_offset = buf_consumed; buf_offset < buf_size; buf_offset++)
-        if (buf[buf_offset])
-            break;
-
-    return buf_size > buf_offset ? buf_consumed : buf_size;
-}
-
-static av_cold int aac_decode_close(AVCodecContext *avctx)
-{
-    AACContext *ac = avctx->priv_data;
-    int i, type;
-
-    for (i = 0; i < MAX_ELEM_ID; i++) {
-        for (type = 0; type < 4; type++) {
-            if (ac->che[type][i])
-                ff_aac_sbr_ctx_close(&ac->che[type][i]->sbr);
-            av_freep(&ac->che[type][i]);
-        }
-    }
-
-    ff_mdct_end(&ac->mdct);
-    ff_mdct_end(&ac->mdct_small);
-    ff_mdct_end(&ac->mdct_ld);
-    ff_mdct_end(&ac->mdct_ltp);
-    ff_imdct15_uninit(&ac->mdct480);
-    return 0;
-}
-
+#include "aacdec_template.c"
 
 #define LOAS_SYNC_WORD   0x2b7       ///< 11 bits LOAS sync word
 
@@ -3096,7 +312,11 @@ static int latm_decode_audio_specific_config(struct LATMContext *latmctx,
         ac->oc[1].m4ac.sample_rate != m4ac.sample_rate ||
         ac->oc[1].m4ac.chan_config != m4ac.chan_config) {
 
-        av_log(avctx, AV_LOG_INFO, "audio config changed\n");
+        if(latmctx->initialized) {
+            av_log(avctx, AV_LOG_INFO, "audio config changed\n");
+        } else {
+            av_log(avctx, AV_LOG_DEBUG, "initializing latmctx\n");
+        }
         latmctx->initialized = 0;
 
         esize = (bits_consumed+7) / 8;
@@ -3139,9 +359,9 @@ static int read_stream_mux_config(struct LATMContext *latmctx,
             return AVERROR_PATCHWELCOME;
         }
 
-        // for each program (which there is only on in DVB)
+        // for each program (of which there is only one in DVB)
 
-        // for each layer (which there is only on in DVB)
+        // for each layer (of which there is only one in DVB)
         if (get_bits(gb, 3)) {                   // numLayer
             avpriv_request_sample(latmctx->aac_ctx.avctx, "Multiple layers");
             return AVERROR_PATCHWELCOME;
@@ -3254,7 +474,7 @@ static int latm_decode_frame(AVCodecContext *avctx, void *out,
     int                 muxlength, err;
     GetBitContext       gb;
 
-    if ((err = init_get_bits(&gb, avpkt->data, avpkt->size * 8)) < 0)
+    if ((err = init_get_bits8(&gb, avpkt->data, avpkt->size)) < 0)
         return err;
 
     // check for LOAS sync word
@@ -3262,7 +482,7 @@ static int latm_decode_frame(AVCodecContext *avctx, void *out,
         return AVERROR_INVALIDDATA;
 
     muxlength = get_bits(&gb, 13) + 3;
-    // not enough data, the parser should have sorted this
+    // not enough data, the parser should have sorted this out
     if (muxlength > avpkt->size)
         return AVERROR_INVALIDDATA;
 
@@ -3277,7 +497,7 @@ static int latm_decode_frame(AVCodecContext *avctx, void *out,
             push_output_configuration(&latmctx->aac_ctx);
             if ((err = decode_audio_specific_config(
                     &latmctx->aac_ctx, avctx, &latmctx->aac_ctx.oc[1].m4ac,
-                    avctx->extradata, avctx->extradata_size*8, 1)) < 0) {
+                    avctx->extradata, avctx->extradata_size*8LL, 1)) < 0) {
                 pop_output_configuration(&latmctx->aac_ctx);
                 return err;
             }
@@ -3300,7 +520,7 @@ static int latm_decode_frame(AVCodecContext *avctx, void *out,
         err = aac_decode_er_frame(avctx, out, got_frame_ptr, &gb);
         break;
     default:
-        err = aac_decode_frame_int(avctx, out, got_frame_ptr, &gb);
+        err = aac_decode_frame_int(avctx, out, got_frame_ptr, &gb, avpkt);
     }
     if (err < 0)
         return err;
@@ -3319,7 +539,6 @@ static av_cold int latm_decode_init(AVCodecContext *avctx)
     return ret;
 }
 
-
 AVCodec ff_aac_decoder = {
     .name            = "aac",
     .long_name       = NULL_IF_CONFIG_SMALL("AAC (Advanced Audio Coding)"),
@@ -3335,6 +554,9 @@ AVCodec ff_aac_decoder = {
     .capabilities    = AV_CODEC_CAP_CHANNEL_CONF | AV_CODEC_CAP_DR1,
     .caps_internal   = FF_CODEC_CAP_INIT_THREADSAFE,
     .channel_layouts = aac_channel_layout,
+    .flush = flush,
+    .priv_class      = &aac_decoder_class,
+    .profiles        = NULL_IF_CONFIG_SMALL(ff_aac_profiles),
 };
 
 /*
@@ -3357,4 +579,6 @@ AVCodec ff_aac_latm_decoder = {
     .capabilities    = AV_CODEC_CAP_CHANNEL_CONF | AV_CODEC_CAP_DR1,
     .caps_internal   = FF_CODEC_CAP_INIT_THREADSAFE,
     .channel_layouts = aac_channel_layout,
+    .flush = flush,
+    .profiles        = NULL_IF_CONFIG_SMALL(ff_aac_profiles),
 };
diff --git a/libavcodec/aacdec_fixed.c b/libavcodec/aacdec_fixed.c
new file mode 100644
index 0000000..acb8178
--- /dev/null
+++ b/libavcodec/aacdec_fixed.c
@@ -0,0 +1,446 @@
+/*
+ * Copyright (c) 2013
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * AAC decoder fixed-point implementation
+ *
+ * Copyright (c) 2005-2006 Oded Shimon ( ods15 ods15 dyndns org )
+ * Copyright (c) 2006-2007 Maxim Gavrilov ( maxim.gavrilov gmail com )
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC decoder
+ * @author Oded Shimon  ( ods15 ods15 dyndns org )
+ * @author Maxim Gavrilov ( maxim.gavrilov gmail com )
+ *
+ * Fixed point implementation
+ * @author Stanislav Ocovaj ( stanislav.ocovaj imgtec com )
+ */
+
+#define FFT_FLOAT 0
+#define FFT_FIXED_32 1
+#define USE_FIXED 1
+
+#include "libavutil/fixed_dsp.h"
+#include "libavutil/opt.h"
+#include "avcodec.h"
+#include "internal.h"
+#include "get_bits.h"
+#include "fft.h"
+#include "lpc.h"
+#include "kbdwin.h"
+#include "sinewin.h"
+
+#include "aac.h"
+#include "aactab.h"
+#include "aacdectab.h"
+#include "cbrt_data.h"
+#include "sbr.h"
+#include "aacsbr.h"
+#include "mpeg4audio.h"
+#include "aacadtsdec.h"
+#include "profiles.h"
+#include "libavutil/intfloat.h"
+
+#include <math.h>
+#include <string.h>
+
+/** Put a predictor back into its start-up state. */
+static av_always_inline void reset_predict_state(PredictorState *ps)
+{
+    /* r0/r1 and cor0/cor1 restart at mant 0, exp 0;
+     * var0/var1 restart at mant 0x20000000, exp 1. */
+    const SoftFloat zero     = { .mant = 0,          .exp = 0 };
+    const SoftFloat init_var = { .mant = 0x20000000, .exp = 1 };
+
+    ps->r0   = zero;
+    ps->r1   = zero;
+    ps->cor0 = zero;
+    ps->cor1 = zero;
+    ps->var0 = init_var;
+    ps->var1 = init_var;
+}
+/* 2^(k/4) for k = 0..3, each halved before being passed to Q31(); the scaling helpers below index it with (s & 3). */
+static const int exp2tab[4] = { Q31(1.0000000000/2), Q31(1.1892071150/2), Q31(1.4142135624/2), Q31(1.6817928305/2) };  // 2^0, 2^0.25, 2^0.5, 2^0.75
+
+/** Write (idx & 15) - 4 and ((idx >> 4) & 15) - 4 to dst[0..1]; returns dst + 2. */
+static inline int *DEC_SPAIR(int *dst, unsigned idx)
+{
+    *dst++ = (idx      & 0xF) - 4;
+    *dst++ = (idx >> 4 & 0xF) - 4;
+    return dst;
+}
+
+/** Write the four 2-bit fields of idx, each minus 1, to dst[0..3]; returns dst + 4. */
+static inline int *DEC_SQUAD(int *dst, unsigned idx)
+{
+    int n;
+
+    for (n = 0; n < 4; n++)
+        dst[n] = ((idx >> (2 * n)) & 3) - 1;
+    return dst + 4;
+}
+/* Unpack two 4-bit magnitudes from idx into dst[0..1], each multiplied by a factor derived from sign (callers presumably pass sign < 4). */
+static inline int *DEC_UPAIR(int *dst, unsigned idx, unsigned sign)
+{
+    dst[0] = (idx & 15) * (1 - (sign & 0xFFFFFFFE));    // factor is 1 when sign < 2; for sign 2 or 3 it is 1 - 2, i.e. -1 after unsigned wrap-around
+    dst[1] = (idx >> 4 & 15) * (1 - ((sign & 1) << 1)); // factor is 1 when bit 0 of sign is clear, otherwise -1 after unsigned wrap-around
+
+    return dst + 2; // first slot after the two values written
+}
+/* Unpack four 2-bit magnitudes from idx into dst[0..3]; the top bit of sign negates each one, and bits 12..14 of idx decide whether sign moves on by one bit after each of the first three. */
+static inline int *DEC_UQUAD(int *dst, unsigned idx, unsigned sign)
+{
+    unsigned nz = idx >> 12;
+    /* (int)sign >> 31 is 0 or -1; it is doubled with "* 2" because left-shifting a negative value is undefined. */
+    dst[0] = (idx & 3) * (1 + (((int)sign >> 31) * 2));
+    sign <<= nz & 1;
+    nz >>= 1;
+    dst[1] = (idx >> 2 & 3) * (1 + (((int)sign >> 31) * 2));
+    sign <<= nz & 1;
+    nz >>= 1;
+    dst[2] = (idx >> 4 & 3) * (1 + (((int)sign >> 31) * 2));
+    sign <<= nz & 1;
+    nz >>= 1;
+    dst[3] = (idx >> 6 & 3) * (1 + (((int)sign >> 31) * 2));
+
+    return dst + 4;
+}
+
+/**
+ * Replace every coefficient by the ff_cbrt_tab_fixed entry of its magnitude, negated for negative input.
+ */
+static void vector_pow43(int *coefs, int len)
+{
+    int n;
+
+    for (n = 0; n < len; n++) {
+        const int v = coefs[n];
+        coefs[n] = v < 0 ? -(int)ff_cbrt_tab_fixed[-v]
+                         :  (int)ff_cbrt_tab_fixed[v];
+    }
+}
+
+/* Compute roughly (src[i] * exp2tab[|scale| & 3]) >> (32 + s) with rounding, where
+ * s = offset - (|scale| >> 2), and multiply by the sign of scale. A total shift the
+ * types cannot hold gives 0 (s > 31) or leaves dst untouched (s <= -32). */
+static void subband_scale(int *dst, int *src, int scale, int offset, int len)
+{
+    int ssign = scale < 0 ? -1 : 1;
+    int i, out, s = FFABS(scale), c = exp2tab[s & 3];
+    unsigned int round;
+
+    s = offset - (s >> 2);
+    if (s > 31) {
+        for (i=0; i<len; i++)
+            dst[i] = 0;                 // every value would be shifted out completely
+    } else if (s > 0) {
+        round = 1U << (s-1);
+        for (i=0; i<len; i++) {
+            out = (int)(((int64_t)src[i] * c) >> 32);
+            dst[i] = ((int)(out+round) >> s) * ssign;
+        }
+    } else if (s > -32) {
+        round = 1U << (s+31);           // 1U: the shift count reaches 31 when s is 0
+        for (i=0; i<len; i++)
+            dst[i] = (int)((int64_t)((int64_t)src[i] * c + round) >> (s+32)) * ssign;
+    }
+}
+
+/* Scale coefs[] in place like subband_scale(), with the multiplier exp2tab[|scale| & 3]
+ * divided by band_energy (pre-shifted to at most 0x7fff); s = 21 + nlz - (|scale| >> 2). */
+static void noise_scale(int *coefs, int scale, int band_energy, int len)
+{
+    int ssign = scale < 0 ? -1 : 1;
+    int i, out, nlz = 0, s = FFABS(scale), c = exp2tab[s & 3];
+    unsigned int round;
+
+    if (!band_energy)                   // would divide by zero below; leave coefs as they are
+        return;
+    while (band_energy > 0x7fff) {
+        band_energy >>= 1;
+        nlz++;
+    }
+    c /= band_energy;
+    s = 21 + nlz - (s >> 2);
+    if (s > 31) {
+        for (i=0; i<len; i++)
+            coefs[i] = 0;               // every value would be shifted out completely
+    } else if (s > 0) {
+        round = 1U << (s-1);
+        for (i=0; i<len; i++) {
+            out = (int)(((int64_t)coefs[i] * c) >> 32);
+            coefs[i] = ((int)(out+round) >> s) * ssign;
+        }
+    } else if (s > -32) {
+        round = 1U << (s+31);           // 1U: the shift count reaches 31 when s is 0
+        for (i=0; i<len; i++)
+            coefs[i] = (int)((int64_t)((int64_t)coefs[i] * c + round) >> (s+32)) * ssign;
+    }
+}
+/* Round |pf.mant| to the nearest multiple of 0x00400000 (ties go away from zero), keeping sign and exponent. */
+static av_always_inline SoftFloat flt16_round(SoftFloat pf)
+{
+    SoftFloat tmp;
+    int s;
+
+    tmp.exp = pf.exp;
+    s = pf.mant >> 31;                                  // sign mask: 0, or -1 for a negative mantissa (presumes a 32-bit mant and an arithmetic shift)
+    tmp.mant = (pf.mant ^ s) - s;                       // magnitude of the mantissa
+    tmp.mant = (tmp.mant + 0x00200000U) & 0xFFC00000U;  // add half a step, then clear the low 22 bits
+    tmp.mant = (tmp.mant ^ s) - s;                      // put the sign back
+
+    return tmp;
+}
+/* Round |pf.mant| to a multiple of 0x00400000 using the addend 0x001FFFFF plus a masked term; sign and exponent are kept. */
+static av_always_inline SoftFloat flt16_even(SoftFloat pf)
+{
+    SoftFloat tmp;
+    int s;
+    /* NOTE(review): in the addend below '>>' binds tighter than '&', so the extra term is (tmp.mant & 0x40), not bit 22 moved down to bit 0 - confirm before changing, predict() output depends on it. */
+    tmp.exp = pf.exp;
+    s = pf.mant >> 31;              // sign mask: 0, or -1 for a negative mantissa (presumes a 32-bit mant and an arithmetic shift)
+    tmp.mant = (pf.mant ^ s) - s;   // magnitude of the mantissa
+    tmp.mant = (tmp.mant + 0x001FFFFFU + (tmp.mant & 0x00400000U >> 16)) & 0xFFC00000U;
+    tmp.mant = (tmp.mant ^ s) - s;  // put the sign back
+
+    return tmp;
+}
+/* Truncate |pf.mant| to a multiple of 0x00400000 (towards zero), keeping sign and exponent. */
+static av_always_inline SoftFloat flt16_trunc(SoftFloat pf)
+{
+    SoftFloat pun;
+    int s;
+
+    pun.exp = pf.exp;
+    s = pf.mant >> 31;                  // sign mask: 0, or -1 for a negative mantissa (presumes a 32-bit mant and an arithmetic shift)
+    pun.mant = (pf.mant ^ s) - s;       // magnitude of the mantissa
+    pun.mant = pun.mant & 0xFFC00000U;  // clear the low 22 bits
+    pun.mant = (pun.mant ^ s) - s;      // put the sign back
+
+    return pun;
+}
+
+/* Form a prediction pv from the state in ps, add it (rounded) to *coef when
+ * output_enable is set, then update ps from the resulting *coef.
+ * A rounding shift of zero or less is applied as a left shift; it used to be undefined. */
+static av_always_inline void predict(PredictorState *ps, int *coef,
+                                     int output_enable)
+{
+    const SoftFloat a     = { 1023410176, 0 };  // 61.0 / 64
+    const SoftFloat alpha = {  973078528, 0 };  // 29.0 / 32
+    SoftFloat e0, e1, pv, k1, k2, tmp;
+    SoftFloat   r0 = ps->r0,     r1 = ps->r1;
+    SoftFloat cor0 = ps->cor0, cor1 = ps->cor1;
+    SoftFloat var0 = ps->var0, var1 = ps->var1;
+
+    if (var0.exp > 1 || (var0.exp == 1 && var0.mant > 0x20000000)) {
+        k1 = av_mul_sf(cor0, flt16_even(av_div_sf(a, var0)));
+    } else {                            // var0 is not above { 0x20000000, 1 }: no contribution
+        k1.mant = 0;
+        k1.exp = 0;
+    }
+
+    if (var1.exp > 1 || (var1.exp == 1 && var1.mant > 0x20000000)) {
+        k2 = av_mul_sf(cor1, flt16_even(av_div_sf(a, var1)));
+    } else {                            // var1 is not above { 0x20000000, 1 }: no contribution
+        k2.mant = 0;
+        k2.exp = 0;
+    }
+
+    tmp = av_mul_sf(k1, r0);
+    pv = flt16_round(av_add_sf(tmp, av_mul_sf(k2, r1)));
+    if (output_enable) {
+        int shift = 28 - pv.exp;
+
+        if (shift > 0 && shift < 31)
+            *coef += (pv.mant + (1 << (shift - 1))) >> shift;
+        else if (shift <= 0 && shift > -32)
+            *coef += (unsigned)pv.mant << -shift;   // unsigned: no undefined shift of a negative mantissa
+    }
+
+    e0 = av_int2sf(*coef, 2);
+    e1 = av_sub_sf(e0, tmp);
+
+    ps->cor1 = flt16_trunc(av_add_sf(av_mul_sf(alpha, cor1), av_mul_sf(r1, e1)));
+    tmp = av_add_sf(av_mul_sf(r1, r1), av_mul_sf(e1, e1));
+    tmp.exp--;
+    ps->var1 = flt16_trunc(av_add_sf(av_mul_sf(alpha, var1), tmp));
+    ps->cor0 = flt16_trunc(av_add_sf(av_mul_sf(alpha, cor0), av_mul_sf(r0, e0)));
+    tmp = av_add_sf(av_mul_sf(r0, r0), av_mul_sf(e0, e0));
+    tmp.exp--;
+    ps->var0 = flt16_trunc(av_add_sf(av_mul_sf(alpha, var0), tmp));
+
+    ps->r1 = flt16_trunc(av_mul_sf(a, av_sub_sf(r0, av_mul_sf(k1, e0))));
+    ps->r0 = flt16_trunc(av_mul_sf(a, e0));
+}
+
+/* 2^(k/8) for k = 0..7, converted with Q30(); the coupling functions below index it with (gain & 7). */
+static const int cce_scale_fixed[8] = {
+    Q30(1.0),          //2^(0/8)
+    Q30(1.0905077327), //2^(1/8)
+    Q30(1.1892071150), //2^(2/8)
+    Q30(1.2968395547), //2^(3/8)
+    Q30(1.4142135624), //2^(4/8)
+    Q30(1.5422108254), //2^(5/8)
+    Q30(1.6817928305), //2^(6/8)
+    Q30(1.8340080864), //2^(7/8)
+};
+
+/**
+ * Apply dependent channel coupling (applied before IMDCT): add the CCE
+ * spectrum, scaled per band by cce->coup.gain[index][], to target->coeffs.
+ * @param   index   index into coupling gain array
+ */
+static void apply_dependent_coupling_fixed(AACContext *ac,
+                                     SingleChannelElement *target,
+                                     ChannelElement *cce, int index)
+{
+    IndividualChannelStream *ics = &cce->ch[0].ics;
+    const uint16_t *offsets = ics->swb_offset;
+    int *dest = target->coeffs;
+    const int *src = cce->ch[0].coeffs;
+    int g, i, group, k, idx = 0;
+    if (ac->oc[1].m4ac.object_type == AOT_AAC_LTP) {
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "Dependent coupling is not supported together with LTP\n");
+        return;
+    }
+    for (g = 0; g < ics->num_window_groups; g++) {
+        for (i = 0; i < ics->max_sfb; i++, idx++) {
+            if (cce->ch[0].band_type[idx] != ZERO_BT) {
+                const int gain = cce->coup.gain[index][idx];
+                int shift, round, c, tmp;
+
+                if (gain < 0) {
+                    c = -cce_scale_fixed[-gain & 7];
+                    shift = (-gain-1024) >> 3;
+                } else {
+                    c = cce_scale_fixed[gain & 7];
+                    shift = (gain-1024) >> 3;
+                }
+
+                if (shift < -31) {
+                    /* a right shift by 32 or more always rounds to 0: nothing to add */
+                } else if (shift < 0) {
+                    shift = -shift;
+                    round = 1 << (shift - 1);
+
+                    for (group = 0; group < ics->group_len[g]; group++) {
+                        for (k = offsets[i]; k < offsets[i + 1]; k++) {
+                            tmp = (int)(((int64_t)src[group * 128 + k] * c + \
+                                       (int64_t)0x1000000000) >> 37);
+                            dest[group * 128 + k] += (tmp + round) >> shift;
+                        }
+                    }
+                } else {
+                    for (group = 0; group < ics->group_len[g]; group++) {
+                        for (k = offsets[i]; k < offsets[i + 1]; k++) {
+                            tmp = (int)(((int64_t)src[group * 128 + k] * c + \
+                                        (int64_t)0x1000000000) >> 37);
+                            dest[group * 128 + k] += tmp * (1U << shift); // multiply: "<<" on a negative tmp is undefined
+                        }
+                    }
+                }
+            }
+        }
+        dest += ics->group_len[g] * 128;
+        src  += ics->group_len[g] * 128;
+    }
+}
+
+/**
+ * Apply independent channel coupling (applied after IMDCT): add cce->ch[0].ret,
+ * scaled by the single gain cce->coup.gain[index][0], to target->ret.
+ * @param   index   index into coupling gain array
+ */
+static void apply_independent_coupling_fixed(AACContext *ac,
+                                       SingleChannelElement *target,
+                                       ChannelElement *cce, int index)
+{
+    int i, c, shift, round, tmp;
+    const int gain = cce->coup.gain[index][0];
+    const int *src = cce->ch[0].ret;
+    int *dest = target->ret;
+    const int len = 1024 << (ac->oc[1].m4ac.sbr == 1);
+
+    c = cce_scale_fixed[gain & 7];
+    shift = (gain-1024) >> 3;
+    if (shift < -31) {
+        /* a right shift by 32 or more always rounds to 0: nothing to add */
+    } else if (shift < 0) {
+        shift = -shift;
+        round = 1 << (shift - 1);
+        for (i = 0; i < len; i++) {
+            tmp = (int)(((int64_t)src[i] * c + (int64_t)0x1000000000) >> 37);
+            dest[i] += (tmp + round) >> shift;
+        }
+    } else {
+        for (i = 0; i < len; i++) {
+            tmp = (int)(((int64_t)src[i] * c + (int64_t)0x1000000000) >> 37);
+            dest[i] += tmp * (1U << shift);     // multiply: "<<" on a negative tmp is undefined
+        }
+    }
+}
+
+#include "aacdec_template.c"
+/* Codec descriptor for the "aac_fixed" decoder; it lists AV_SAMPLE_FMT_S32P as its only sample format. */
+AVCodec ff_aac_fixed_decoder = {
+    .name            = "aac_fixed",
+    .long_name       = NULL_IF_CONFIG_SMALL("AAC (Advanced Audio Coding)"),
+    .type            = AVMEDIA_TYPE_AUDIO,
+    .id              = AV_CODEC_ID_AAC,
+    .priv_data_size  = sizeof(AACContext),
+    .init            = aac_decode_init,
+    .close           = aac_decode_close,
+    .decode          = aac_decode_frame,
+    .sample_fmts     = (const enum AVSampleFormat[]) {
+        AV_SAMPLE_FMT_S32P, AV_SAMPLE_FMT_NONE
+    },
+    .capabilities    = AV_CODEC_CAP_CHANNEL_CONF | AV_CODEC_CAP_DR1,
+    .caps_internal   = FF_CODEC_CAP_INIT_THREADSAFE,
+    .channel_layouts = aac_channel_layout,
+    .profiles        = NULL_IF_CONFIG_SMALL(ff_aac_profiles),
+    .flush = flush,
+};
diff --git a/libavcodec/aacdec_template.c b/libavcodec/aacdec_template.c
new file mode 100644
index 0000000..883ed52
--- /dev/null
+++ b/libavcodec/aacdec_template.c
@@ -0,0 +1,3238 @@
+/*
+ * AAC decoder
+ * Copyright (c) 2005-2006 Oded Shimon ( ods15 ods15 dyndns org )
+ * Copyright (c) 2006-2007 Maxim Gavrilov ( maxim.gavrilov gmail com )
+ * Copyright (c) 2008-2013 Alex Converse <alex.converse@gmail.com>
+ *
+ * AAC LATM decoder
+ * Copyright (c) 2008-2010 Paul Kendall <paul@kcbbs.gen.nz>
+ * Copyright (c) 2010      Janne Grunau <janne-libav@jannau.net>
+ *
+ * AAC decoder fixed-point implementation
+ * Copyright (c) 2013
+ *      MIPS Technologies, Inc., California.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC decoder
+ * @author Oded Shimon  ( ods15 ods15 dyndns org )
+ * @author Maxim Gavrilov ( maxim.gavrilov gmail com )
+ *
+ * AAC decoder fixed-point implementation
+ * @author Stanislav Ocovaj ( stanislav.ocovaj imgtec com )
+ * @author Nedeljko Babic ( nedeljko.babic imgtec com )
+ */
+
+/*
+ * supported tools
+ *
+ * Support?                     Name
+ * N (code in SoC repo)         gain control
+ * Y                            block switching
+ * Y                            window shapes - standard
+ * N                            window shapes - Low Delay
+ * Y                            filterbank - standard
+ * N (code in SoC repo)         filterbank - Scalable Sample Rate
+ * Y                            Temporal Noise Shaping
+ * Y                            Long Term Prediction
+ * Y                            intensity stereo
+ * Y                            channel coupling
+ * Y                            frequency domain prediction
+ * Y                            Perceptual Noise Substitution
+ * Y                            Mid/Side stereo
+ * N                            Scalable Inverse AAC Quantization
+ * N                            Frequency Selective Switch
+ * N                            upsampling filter
+ * Y                            quantization & coding - AAC
+ * N                            quantization & coding - TwinVQ
+ * N                            quantization & coding - BSAC
+ * N                            AAC Error Resilience tools
+ * N                            Error Resilience payload syntax
+ * N                            Error Protection tool
+ * N                            CELP
+ * N                            Silence Compression
+ * N                            HVXC
+ * N                            HVXC 4kbits/s VR
+ * N                            Structured Audio tools
+ * N                            Structured Audio Sample Bank Format
+ * N                            MIDI
+ * N                            Harmonic and Individual Lines plus Noise
+ * N                            Text-To-Speech Interface
+ * Y                            Spectral Band Replication
+ * Y (not in this code)         Layer-1
+ * Y (not in this code)         Layer-2
+ * Y (not in this code)         Layer-3
+ * N                            SinuSoidal Coding (Transient, Sinusoid, Noise)
+ * Y                            Parametric Stereo
+ * N                            Direct Stream Transfer
+ * Y  (not in fixed point code) Enhanced AAC Low Delay (ER AAC ELD)
+ *
+ * Note: - HE AAC v1 comprises LC AAC with Spectral Band Replication.
+ *       - HE AAC v2 comprises LC AAC with Spectral Band Replication and
+ *         Parametric Stereo.
+ */
+
+#include "libavutil/thread.h"
+
+static VLC vlc_scalefactors;   // file-scope VLC table; NOTE(review): presumably filled during decoder init, which is not visible in this part
+static VLC vlc_spectral[11];   // eleven file-scope VLC tables; same caveat
+
+static int output_configure(AACContext *ac,    // forward declaration: pop_output_configuration() calls it before its definition
+                            uint8_t layout_map[MAX_ELEM_ID*4][3], int tags,
+                            enum OCStatus oc_type, int get_new_frame);
+
+#define overread_err "Input buffer exhausted before END element found\n"
+
+/** Count output channels: a CPE tag counts twice, tags positioned OFF or CC do not count. */
+static int count_channels(uint8_t (*layout)[3], int tags)
+{
+    int n, total = 0;
+    for (n = 0; n < tags; n++) {
+        const int position = layout[n][2];
+        if (position != AAC_CHANNEL_OFF && position != AAC_CHANNEL_CC)
+            total += layout[n][0] == TYPE_CPE ? 2 : 1;
+    }
+    return total;
+}
+
+/**
+ * Allocate or release the channel element (type, id) according to che_pos.
+ * A non-zero che_pos allocates the element if needed and, unless it is a
+ * CCE, appends its channel(s) to ac->output_element[]; zero frees it.
+ *
+ * @param   che_pos  channel position of the element, 0 if it is unused
+ * @param   type     channel element type (first index into ac->che)
+ * @param   id       channel element id (second index into ac->che)
+ * @param   channels in/out: output channels assigned so far
+ *
+ * @return  0 on success, AVERROR_INVALIDDATA or AVERROR(ENOMEM) on failure
+ */
+static av_cold int che_configure(AACContext *ac,
+                                 enum ChannelPosition che_pos,
+                                 int type, int id, int *channels)
+{
+    if (*channels >= MAX_CHANNELS)
+        return AVERROR_INVALIDDATA;
+    if (che_pos) {
+        if (!ac->che[type][id]) {
+            if (!(ac->che[type][id] = av_mallocz(sizeof(ChannelElement))))
+                return AVERROR(ENOMEM);
+            AAC_RENAME(ff_aac_sbr_ctx_init)(ac, &ac->che[type][id]->sbr);  // SBR context is set up only when the element is first allocated
+        }
+        if (type != TYPE_CCE) {  // CCEs are not given an output channel
+            if (*channels >= MAX_CHANNELS - (type == TYPE_CPE || (type == TYPE_SCE && ac->oc[1].m4ac.ps == 1))) {
+                av_log(ac->avctx, AV_LOG_ERROR, "Too many channels\n");
+                return AVERROR_INVALIDDATA;
+            }
+            ac->output_element[(*channels)++] = &ac->che[type][id]->ch[0];
+            if (type == TYPE_CPE ||
+                (type == TYPE_SCE && ac->oc[1].m4ac.ps == 1)) {  // a CPE, or an SCE with m4ac.ps == 1, also gets ch[1]
+                ac->output_element[(*channels)++] = &ac->che[type][id]->ch[1];
+            }
+        }
+    } else {
+        if (ac->che[type][id])
+            AAC_RENAME(ff_aac_sbr_ctx_close)(&ac->che[type][id]->sbr);
+        av_freep(&ac->che[type][id]);
+    }
+    return 0;
+}
+/* Reset all elements to their internal output buffers, get a new output frame and point the mapped channels into it; returns 0, 1 if avctx->channels is 0, or the ff_get_buffer() error. */
+static int frame_configure_elements(AVCodecContext *avctx)
+{
+    AACContext *ac = avctx->priv_data;
+    int type, id, ch, ret;
+
+    /* set channel pointers to internal buffers by default */
+    for (type = 0; type < 4; type++) {
+        for (id = 0; id < MAX_ELEM_ID; id++) {
+            ChannelElement *che = ac->che[type][id];
+            if (che) {
+                che->ch[0].ret = che->ch[0].ret_buf;
+                che->ch[1].ret = che->ch[1].ret_buf;
+            }
+        }
+    }
+
+    /* get output buffer */
+    av_frame_unref(ac->frame);
+    if (!avctx->channels)
+        return 1;                   // no channels configured: no buffer is requested
+
+    ac->frame->nb_samples = 2048;   // size requested from ff_get_buffer() below
+    if ((ret = ff_get_buffer(avctx, ac->frame, 0)) < 0)
+        return ret;
+
+    /* map output channel pointers to AVFrame data */
+    for (ch = 0; ch < avctx->channels; ch++) {
+        if (ac->output_element[ch])
+            ac->output_element[ch]->ret = (INTFLOAT *)ac->frame->extended_data[ch];
+    }
+
+    return 0;
+}
+/* One layout_map entry together with the output channel mask chosen for it; built and sorted in sniff_channel_order(). */
+struct elem_to_channel {
+    uint64_t av_position;   // AV_CH_* mask, or UINT64_MAX when no output position is assigned
+    uint8_t syn_ele;        // element type (TYPE_SCE, TYPE_CPE or TYPE_LFE)
+    uint8_t elem_id;        // element id, copied from layout_map[][1]
+    uint8_t aac_position;   // AAC_CHANNEL_* position, written back to layout_map[][2]
+};
+/* Describe a left/right pair starting at layout_map[offset]: one CPE entry (left | right), or two consecutive entries recorded as SCEs; returns the number of entries written (1 or 2). */
+static int assign_pair(struct elem_to_channel e2c_vec[MAX_ELEM_ID],
+                       uint8_t (*layout_map)[3], int offset, uint64_t left,
+                       uint64_t right, int pos)
+{
+    if (layout_map[offset][0] == TYPE_CPE) {
+        e2c_vec[offset] = (struct elem_to_channel) {
+            .av_position  = left | right,   // one element carries both channels
+            .syn_ele      = TYPE_CPE,
+            .elem_id      = layout_map[offset][1],
+            .aac_position = pos
+        };
+        return 1;
+    } else {    // anything that is not a CPE is recorded as two SCEs
+        e2c_vec[offset] = (struct elem_to_channel) {
+            .av_position  = left,
+            .syn_ele      = TYPE_SCE,
+            .elem_id      = layout_map[offset][1],
+            .aac_position = pos
+        };
+        e2c_vec[offset + 1] = (struct elem_to_channel) {
+            .av_position  = right,
+            .syn_ele      = TYPE_SCE,
+            .elem_id      = layout_map[offset + 1][1],
+            .aac_position = pos
+        };
+        return 2;
+    }
+}
+/* Count the channels of the consecutive layout_map entries at position pos, starting at *current; on success *current is advanced past them. Returns -1 for an SCE/CPE ordering it rejects. */
+static int count_paired_channels(uint8_t (*layout_map)[3], int tags, int pos,
+                                 int *current)
+{
+    int num_pos_channels = 0;
+    int first_cpe        = 0;   // set once a CPE has been seen
+    int sce_parity       = 0;   // 1 while an odd number of SCEs is pending
+    int i;
+    for (i = *current; i < tags; i++) {
+        if (layout_map[i][2] != pos)
+            break;
+        if (layout_map[i][0] == TYPE_CPE) {
+            if (sce_parity) {
+                if (pos == AAC_CHANNEL_FRONT && !first_cpe) {   // a single SCE before the first front CPE is accepted
+                    sce_parity = 0;
+                } else {
+                    return -1;
+                }
+            }
+            num_pos_channels += 2;
+            first_cpe         = 1;
+        } else {
+            num_pos_channels++;
+            sce_parity ^= 1;
+        }
+    }
+    if (sce_parity &&
+        ((pos == AAC_CHANNEL_FRONT && first_cpe) || pos == AAC_CHANNEL_SIDE))  // a leftover odd SCE is rejected for side, and for front after a CPE
+        return -1;
+    *current = i;
+    return num_pos_channels;
+}
+/* Assign output channel masks to the front/side/back/LFE entries of layout_map, reorder those entries by mask, and return the OR of the assigned masks (0 if the layout is rejected). */
+static uint64_t sniff_channel_order(uint8_t (*layout_map)[3], int tags)
+{
+    int i, n, total_non_cc_elements;
+    struct elem_to_channel e2c_vec[4 * MAX_ELEM_ID] = { { 0 } };
+    int num_front_channels, num_side_channels, num_back_channels;
+    uint64_t layout;
+
+    if (FF_ARRAY_ELEMS(e2c_vec) < tags)  // more tags than e2c_vec can hold
+        return 0;
+
+    i = 0;
+    num_front_channels =
+        count_paired_channels(layout_map, tags, AAC_CHANNEL_FRONT, &i);
+    if (num_front_channels < 0)
+        return 0;
+    num_side_channels =
+        count_paired_channels(layout_map, tags, AAC_CHANNEL_SIDE, &i);
+    if (num_side_channels < 0)
+        return 0;
+    num_back_channels =
+        count_paired_channels(layout_map, tags, AAC_CHANNEL_BACK, &i);
+    if (num_back_channels < 0)
+        return 0;
+
+    if (num_side_channels == 0 && num_back_channels >= 4) {  // no side group but 4+ back channels: count two of them as side
+        num_side_channels = 2;
+        num_back_channels -= 2;
+    }
+
+    i = 0;
+    if (num_front_channels & 1) {  // odd front count: the first entry becomes front centre
+        e2c_vec[i] = (struct elem_to_channel) {
+            .av_position  = AV_CH_FRONT_CENTER,
+            .syn_ele      = TYPE_SCE,
+            .elem_id      = layout_map[i][1],
+            .aac_position = AAC_CHANNEL_FRONT
+        };
+        i++;
+        num_front_channels--;
+    }
+    if (num_front_channels >= 4) {
+        i += assign_pair(e2c_vec, layout_map, i,
+                         AV_CH_FRONT_LEFT_OF_CENTER,
+                         AV_CH_FRONT_RIGHT_OF_CENTER,
+                         AAC_CHANNEL_FRONT);
+        num_front_channels -= 2;
+    }
+    if (num_front_channels >= 2) {
+        i += assign_pair(e2c_vec, layout_map, i,
+                         AV_CH_FRONT_LEFT,
+                         AV_CH_FRONT_RIGHT,
+                         AAC_CHANNEL_FRONT);
+        num_front_channels -= 2;
+    }
+    while (num_front_channels >= 2) {  // further front pairs get no output position
+        i += assign_pair(e2c_vec, layout_map, i,
+                         UINT64_MAX,
+                         UINT64_MAX,
+                         AAC_CHANNEL_FRONT);
+        num_front_channels -= 2;
+    }
+
+    if (num_side_channels >= 2) {
+        i += assign_pair(e2c_vec, layout_map, i,
+                         AV_CH_SIDE_LEFT,
+                         AV_CH_SIDE_RIGHT,
+                         AAC_CHANNEL_FRONT);  // NOTE(review): this side pair is tagged AAC_CHANNEL_FRONT while the loop below uses AAC_CHANNEL_SIDE - confirm this is intended
+        num_side_channels -= 2;
+    }
+    while (num_side_channels >= 2) {
+        i += assign_pair(e2c_vec, layout_map, i,
+                         UINT64_MAX,
+                         UINT64_MAX,
+                         AAC_CHANNEL_SIDE);
+        num_side_channels -= 2;
+    }
+
+    while (num_back_channels >= 4) {  // back pairs beyond the last one (and a possible single) get no output position
+        i += assign_pair(e2c_vec, layout_map, i,
+                         UINT64_MAX,
+                         UINT64_MAX,
+                         AAC_CHANNEL_BACK);
+        num_back_channels -= 2;
+    }
+    if (num_back_channels >= 2) {
+        i += assign_pair(e2c_vec, layout_map, i,
+                         AV_CH_BACK_LEFT,
+                         AV_CH_BACK_RIGHT,
+                         AAC_CHANNEL_BACK);
+        num_back_channels -= 2;
+    }
+    if (num_back_channels) {  // one channel left over: back centre
+        e2c_vec[i] = (struct elem_to_channel) {
+            .av_position  = AV_CH_BACK_CENTER,
+            .syn_ele      = TYPE_SCE,
+            .elem_id      = layout_map[i][1],
+            .aac_position = AAC_CHANNEL_BACK
+        };
+        i++;
+        num_back_channels--;
+    }
+
+    if (i < tags && layout_map[i][2] == AAC_CHANNEL_LFE) {  // the first LFE entry gets the LFE output position
+        e2c_vec[i] = (struct elem_to_channel) {
+            .av_position  = AV_CH_LOW_FREQUENCY,
+            .syn_ele      = TYPE_LFE,
+            .elem_id      = layout_map[i][1],
+            .aac_position = AAC_CHANNEL_LFE
+        };
+        i++;
+    }
+    while (i < tags && layout_map[i][2] == AAC_CHANNEL_LFE) {  // any further LFE entries get none
+        e2c_vec[i] = (struct elem_to_channel) {
+            .av_position  = UINT64_MAX,
+            .syn_ele      = TYPE_LFE,
+            .elem_id      = layout_map[i][1],
+            .aac_position = AAC_CHANNEL_LFE
+        };
+        i++;
+    }
+
+    // Must choose a stable sort
+    total_non_cc_elements = n = i;
+    do {  // bubble sort on av_position; n becomes the index of the last swap of each pass
+        int next_n = 0;
+        for (i = 1; i < n; i++)
+            if (e2c_vec[i - 1].av_position > e2c_vec[i].av_position) {
+                FFSWAP(struct elem_to_channel, e2c_vec[i - 1], e2c_vec[i]);
+                next_n = i;
+            }
+        n = next_n;
+    } while (n > 0);
+
+    layout = 0;
+    for (i = 0; i < total_non_cc_elements; i++) {  // write the sorted entries back over the start of layout_map
+        layout_map[i][0] = e2c_vec[i].syn_ele;
+        layout_map[i][1] = e2c_vec[i].elem_id;
+        layout_map[i][2] = e2c_vec[i].aac_position;
+        if (e2c_vec[i].av_position != UINT64_MAX) {  // UINT64_MAX marks entries without an output position
+            layout |= e2c_vec[i].av_position;
+        }
+    }
+
+    return layout;
+}
+
+/**
+ * Back up the current output configuration (oc[1]) into the saved slot
+ * (oc[0]) when the current one is locked or when nothing has been saved
+ * yet, then mark the current configuration as unset.
+ */
+static void push_output_configuration(AACContext *ac)
+{
+    const int current_locked = ac->oc[1].status == OC_LOCKED;
+    const int nothing_saved  = ac->oc[0].status == OC_NONE;
+
+    if (current_locked || nothing_saved)
+        ac->oc[0] = ac->oc[1];
+
+    ac->oc[1].status = OC_NONE;
+}
+
+/**
+ * Bring back the saved output configuration (oc[0]) as the current one.
+ *
+ * Nothing happens when the current configuration is locked or when no
+ * configuration has been saved.
+ */
+static void pop_output_configuration(AACContext *ac)
+{
+    if (ac->oc[1].status == OC_LOCKED || ac->oc[0].status == OC_NONE)
+        return;
+
+    ac->oc[1] = ac->oc[0];
+    ac->avctx->channels       = ac->oc[1].channels;
+    ac->avctx->channel_layout = ac->oc[1].channel_layout;
+    // Re-apply the restored layout map; no new frame is requested.
+    output_configure(ac, ac->oc[1].layout_map, ac->oc[1].layout_map_tags,
+                     ac->oc[1].status, 0);
+}
+
+/**
+ * Configure output channel order based on the current program
+ * configuration element.
+ *
+ * @param ac            decoder context; ac->oc[1], ac->tag_che_map and the
+ *                      channel/layout fields of ac->avctx are updated
+ * @param layout_map    one {syntax element type, element id, position}
+ *                      entry per element
+ * @param tags          number of valid entries in layout_map
+ * @param oc_type       status stored in ac->oc[1].status on success
+ * @param get_new_frame when non-zero, frame_configure_elements() is called
+ *                      once the elements have been configured
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int output_configure(AACContext *ac,
+                            uint8_t layout_map[MAX_ELEM_ID * 4][3], int tags,
+                            enum OCStatus oc_type, int get_new_frame)
+{
+    AVCodecContext *avctx = ac->avctx;
+    int i, channels = 0, ret;
+    uint64_t layout = 0;
+    uint8_t id_map[TYPE_END][MAX_ELEM_ID] = {{ 0 }};
+    uint8_t type_counts[TYPE_END] = { 0 };
+
+    // Keep our own copy of the map unless the caller handed us that copy.
+    if (ac->oc[1].layout_map != layout_map) {
+        memcpy(ac->oc[1].layout_map, layout_map, tags * sizeof(layout_map[0]));
+        ac->oc[1].layout_map_tags = tags;
+    }
+    // Remap each (type, id) pair to a dense per-type index.
+    for (i = 0; i < tags; i++) {
+        int type =         layout_map[i][0];
+        int id =           layout_map[i][1];
+        id_map[type][id] = type_counts[type]++;
+        if (id_map[type][id] >= MAX_ELEM_ID) {
+            // No trailing newline: the logging helper appends its own text.
+            avpriv_request_sample(ac->avctx, "Remapped id too large");
+            return AVERROR_PATCHWELCOME;
+        }
+    }
+    // Try to sniff a reasonable channel order, otherwise output the
+    // channels in the order the PCE declared them.
+    if (avctx->request_channel_layout != AV_CH_LAYOUT_NATIVE)
+        layout = sniff_channel_order(layout_map, tags);
+    for (i = 0; i < tags; i++) {
+        int type =     layout_map[i][0];
+        int id =       layout_map[i][1];
+        int iid =      id_map[type][id];
+        int position = layout_map[i][2];
+        // Allocate or free elements depending on if they are in the
+        // current program configuration.
+        ret = che_configure(ac, position, type, iid, &channels);
+        if (ret < 0)
+            return ret;
+        ac->tag_che_map[type][id] = ac->che[type][iid];
+    }
+    // With the PS flag set and two output channels, a lone front-center
+    // layout becomes front left/right; any other layout is dropped.
+    if (ac->oc[1].m4ac.ps == 1 && channels == 2) {
+        if (layout == AV_CH_FRONT_CENTER) {
+            layout = AV_CH_FRONT_LEFT|AV_CH_FRONT_RIGHT;
+        } else {
+            layout = 0;
+        }
+    }
+
+    // Only a non-zero layout overrides the value exported through avctx;
+    // the decoder's own copy always follows the computed value.
+    if (layout)
+        avctx->channel_layout = layout;
+    ac->oc[1].channel_layout = layout;
+    avctx->channels       = ac->oc[1].channels       = channels;
+    ac->oc[1].status = oc_type;
+
+    if (get_new_frame) {
+        if ((ret = frame_configure_elements(ac->avctx)) < 0)
+            return ret;
+    }
+
+    return 0;
+}
+
+/**
+ * Zero the `saved` buffer of both channels of every allocated channel
+ * element.
+ */
+static void flush(AVCodecContext *avctx)
+{
+    AACContext *ac = avctx->priv_data;
+    int elem_type;
+
+    for (elem_type = 3; elem_type >= 0; elem_type--) {
+        int id;
+
+        for (id = 0; id < MAX_ELEM_ID; id++) {
+            ChannelElement *che = ac->che[elem_type][id];
+            int ch;
+
+            if (!che)
+                continue;
+            for (ch = 0; ch < 2; ch++)
+                memset(che->ch[ch].saved, 0, sizeof(che->ch[ch].saved));
+        }
+    }
+}
+
+/**
+ * Fill a layout map from one of the predefined channel configurations
+ * (table 1.17).
+ *
+ * @param avctx          codec context; used for logging and for its
+ *                       strict_std_compliance setting
+ * @param layout_map     destination for the predefined layout entries
+ * @param tags           receives the number of entries copied
+ * @param channel_config predefined configuration index; 1-7, 11 and 12
+ *                       are accepted
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int set_default_channel_config(AVCodecContext *avctx,
+                                      uint8_t (*layout_map)[3],
+                                      int *tags,
+                                      int channel_config)
+{
+    const int in_low_range  = channel_config >= 1  && channel_config <= 7;
+    const int in_high_range = channel_config >= 11 && channel_config <= 12;
+    int entry_count;
+
+    if (!in_low_range && !in_high_range) {
+        av_log(avctx, AV_LOG_ERROR,
+               "invalid default channel configuration (%d)\n",
+               channel_config);
+        return AVERROR_INVALIDDATA;
+    }
+
+    entry_count = tags_per_config[channel_config];
+    *tags       = entry_count;
+    memcpy(layout_map, aac_channel_layout_map[channel_config - 1],
+           entry_count * sizeof(*layout_map));
+
+    /*
+     * The AAC specification defines 7.1(wide) as the default layout for
+     * 8-channel streams. At least the Nero AAC encoder nevertheless writes
+     * plain 7.1 streams with default channel config 7, putting the side
+     * channels of the source into the second AAC_CHANNEL_FRONT pair, and
+     * e.g. FAAD decodes that second pair as side channels, so such streams
+     * play back the way the encoder intended.
+     *
+     * Genuine 7.1(wide) streams are very rare, so unless strict compliance
+     * is requested the second front pair is treated as a side pair.
+     */
+    if (channel_config != 7 ||
+        avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT)
+        return 0;
+
+    av_log(avctx, AV_LOG_INFO, "Assuming an incorrectly encoded 7.1 channel layout"
+           " instead of a spec-compliant 7.1(wide) layout, use -strict %d to decode"
+           " according to the specification instead.\n", FF_COMPLIANCE_STRICT);
+    layout_map[2][2] = AAC_CHANNEL_SIDE;
+
+    return 0;
+}
+/**
+ * Look up the channel element that a syntax element (type, elem_id) of the
+ * current frame maps to.
+ *
+ * When chan_config is 0 the lookup uses ac->tag_che_map directly. Otherwise
+ * the element is chosen from ac->tags_mapped (the number of elements mapped
+ * so far) and the element type, and the result is recorded in
+ * ac->tag_che_map. A CPE arriving first under chan_config 1, or an SCE
+ * arriving first under chan_config 2, triggers a trial reconfiguration to
+ * the other default layout.
+ *
+ * @return the mapped channel element, or NULL when no rule matches or the
+ *         trial reconfiguration fails
+ */
+static ChannelElement *get_che(AACContext *ac, int type, int elem_id)
+{
+    /* For PCE based channel configurations map the channels solely based
+     * on tags. */
+    if (!ac->oc[1].m4ac.chan_config) {
+        return ac->tag_che_map[type][elem_id];
+    }
+    // Allow single CPE stereo files to be signalled with mono configuration.
+    if (!ac->tags_mapped && type == TYPE_CPE &&
+        ac->oc[1].m4ac.chan_config == 1) {
+        uint8_t layout_map[MAX_ELEM_ID*4][3];
+        int layout_map_tags;
+        push_output_configuration(ac); // back up the current configuration first
+
+        av_log(ac->avctx, AV_LOG_DEBUG, "mono with CPE\n");
+
+        if (set_default_channel_config(ac->avctx, layout_map,
+                                       &layout_map_tags, 2) < 0)
+            return NULL;
+        if (output_configure(ac, layout_map, layout_map_tags,
+                             OC_TRIAL_FRAME, 1) < 0)
+            return NULL;
+
+        ac->oc[1].m4ac.chan_config = 2;
+        ac->oc[1].m4ac.ps = 0;
+    }
+    // And vice-versa
+    if (!ac->tags_mapped && type == TYPE_SCE &&
+        ac->oc[1].m4ac.chan_config == 2) {
+        uint8_t layout_map[MAX_ELEM_ID * 4][3];
+        int layout_map_tags;
+        push_output_configuration(ac); // back up the current configuration first
+
+        av_log(ac->avctx, AV_LOG_DEBUG, "stereo with SCE\n");
+
+        if (set_default_channel_config(ac->avctx, layout_map,
+                                       &layout_map_tags, 1) < 0)
+            return NULL;
+        if (output_configure(ac, layout_map, layout_map_tags,
+                             OC_TRIAL_FRAME, 1) < 0)
+            return NULL;
+
+        ac->oc[1].m4ac.chan_config = 1;
+        if (ac->oc[1].m4ac.sbr)
+            ac->oc[1].m4ac.ps = -1;
+    }
+    /* For indexed channel configurations map the channels solely based
+     * on position. Every case below lacks a break on purpose: when its
+     * own rule does not match, control falls through to the rules of the
+     * smaller configurations. */
+    switch (ac->oc[1].m4ac.chan_config) {
+    case 12:
+    case 7:
+        if (ac->tags_mapped == 3 && type == TYPE_CPE) {
+            ac->tags_mapped++;
+            return ac->tag_che_map[TYPE_CPE][elem_id] = ac->che[TYPE_CPE][2];
+        }
+    case 11: // also entered by fallthrough from case 12/7
+        if (ac->tags_mapped == 2 &&
+            ac->oc[1].m4ac.chan_config == 11 &&
+            type == TYPE_SCE) {
+            ac->tags_mapped++;
+            return ac->tag_che_map[TYPE_SCE][elem_id] = ac->che[TYPE_SCE][1];
+        }
+    case 6: // also entered by fallthrough from the cases above
+        /* Some streams incorrectly code 5.1 audio as
+         * SCE[0] CPE[0] CPE[1] SCE[1]
+         * instead of
+         * SCE[0] CPE[0] CPE[1] LFE[0].
+         * If we seem to have encountered such a stream, transfer
+         * the LFE[0] element to the SCE[1]'s mapping */
+        if (ac->tags_mapped == tags_per_config[ac->oc[1].m4ac.chan_config] - 1 && (type == TYPE_LFE || type == TYPE_SCE)) {
+            if (!ac->warned_remapping_once && (type != TYPE_LFE || elem_id != 0)) {
+                av_log(ac->avctx, AV_LOG_WARNING,
+                   "This stream seems to incorrectly report its last channel as %s[%d], mapping to LFE[0]\n",
+                   type == TYPE_SCE ? "SCE" : "LFE", elem_id);
+                ac->warned_remapping_once++;
+            }
+            ac->tags_mapped++;
+            return ac->tag_che_map[type][elem_id] = ac->che[TYPE_LFE][0];
+        }
+    case 5: // also entered by fallthrough from the cases above
+        if (ac->tags_mapped == 2 && type == TYPE_CPE) {
+            ac->tags_mapped++;
+            return ac->tag_che_map[TYPE_CPE][elem_id] = ac->che[TYPE_CPE][1];
+        }
+    case 4: // also entered by fallthrough from the cases above
+        /* Some streams incorrectly code 4.0 audio as
+         * SCE[0] CPE[0] LFE[0]
+         * instead of
+         * SCE[0] CPE[0] SCE[1].
+         * If we seem to have encountered such a stream, transfer
+         * the SCE[1] element to the LFE[0]'s mapping */
+        if (ac->tags_mapped == tags_per_config[ac->oc[1].m4ac.chan_config] - 1 && (type == TYPE_LFE || type == TYPE_SCE)) {
+            if (!ac->warned_remapping_once && (type != TYPE_SCE || elem_id != 1)) {
+                av_log(ac->avctx, AV_LOG_WARNING,
+                   "This stream seems to incorrectly report its last channel as %s[%d], mapping to SCE[1]\n",
+                   type == TYPE_SCE ? "SCE" : "LFE", elem_id);
+                ac->warned_remapping_once++;
+            }
+            ac->tags_mapped++;
+            return ac->tag_che_map[type][elem_id] = ac->che[TYPE_SCE][1];
+        }
+        if (ac->tags_mapped == 2 &&
+            ac->oc[1].m4ac.chan_config == 4 &&
+            type == TYPE_SCE) {
+            ac->tags_mapped++;
+            return ac->tag_che_map[TYPE_SCE][elem_id] = ac->che[TYPE_SCE][1];
+        }
+    case 3:
+    case 2: // also entered by fallthrough from the cases above
+        if (ac->tags_mapped == (ac->oc[1].m4ac.chan_config != 2) && // CPE expected at index 0 for config 2, else index 1
+            type == TYPE_CPE) {
+            ac->tags_mapped++;
+            return ac->tag_che_map[TYPE_CPE][elem_id] = ac->che[TYPE_CPE][0];
+        } else if (ac->oc[1].m4ac.chan_config == 2) {
+            return NULL; // config 2 does not fall through to the SCE rule
+        }
+    case 1: // also entered by fallthrough from the cases above
+        if (!ac->tags_mapped && type == TYPE_SCE) {
+            ac->tags_mapped++;
+            return ac->tag_che_map[TYPE_SCE][elem_id] = ac->che[TYPE_SCE][0];
+        }
+    default: // unknown configuration, or no rule matched
+        return NULL;
+    }
+}
+
+/**
+ * Read n layout entries for one speaker position from the bitstream.
+ *
+ * Each entry carries a 4 bit element ID. For front, back and side
+ * positions a leading bit selects the syntax element type; for coupling
+ * channels one leading bit is skipped; LFE entries have no leading bit.
+ *
+ * @param layout_map first entry to fill; n consecutive entries are written
+ * @param type       speaker type/position for these channels
+ * @param gb         bit reader to consume from
+ * @param n          number of entries to read
+ */
+static void decode_channel_map(uint8_t layout_map[][3],
+                               enum ChannelPosition type,
+                               GetBitContext *gb, int n)
+{
+    for (; n--; layout_map++) {
+        enum RawDataBlockType syn_ele;
+
+        if (type == AAC_CHANNEL_FRONT ||
+            type == AAC_CHANNEL_BACK  ||
+            type == AAC_CHANNEL_SIDE) {
+            syn_ele = get_bits1(gb);
+        } else if (type == AAC_CHANNEL_CC) {
+            skip_bits1(gb);
+            syn_ele = TYPE_CCE;
+        } else if (type == AAC_CHANNEL_LFE) {
+            syn_ele = TYPE_LFE;
+        } else {
+            // AAC_CHANNEL_OFF has no channel map
+            av_assert0(0);
+        }
+
+        (*layout_map)[0] = syn_ele;
+        (*layout_map)[1] = get_bits(gb, 4);
+        (*layout_map)[2] = type;
+    }
+}
+
+/**
+ * Decode program configuration element; reference: table 4.2.
+ *
+ * @param avctx      codec context, used for logging
+ * @param m4ac       audio configuration; only its sampling_index is read,
+ *                   to warn when it differs from the one in the PCE
+ * @param layout_map output array receiving one {type, id, position} entry
+ *                   per front, side, back, LFE and coupling element
+ * @param gb         bit reader positioned at the object_type field
+ *
+ * @return  number of entries written to layout_map on success,
+ *          a negative AVERROR code on failure
+ */
+static int decode_pce(AVCodecContext *avctx, MPEG4AudioConfig *m4ac,
+                      uint8_t (*layout_map)[3],
+                      GetBitContext *gb)
+{
+    int num_front, num_side, num_back, num_lfe, num_assoc_data, num_cc;
+    int sampling_index;
+    int comment_len;
+    int tags;
+
+    skip_bits(gb, 2);  // object_type
+
+    sampling_index = get_bits(gb, 4);
+    if (m4ac->sampling_index != sampling_index)
+        av_log(avctx, AV_LOG_WARNING,
+               "Sample rate index in program config element does not "
+               "match the sample rate index configured by the container.\n");
+
+    num_front       = get_bits(gb, 4);
+    num_side        = get_bits(gb, 4);
+    num_back        = get_bits(gb, 4);
+    num_lfe         = get_bits(gb, 2);
+    num_assoc_data  = get_bits(gb, 3);
+    num_cc          = get_bits(gb, 4);
+
+    if (get_bits1(gb))
+        skip_bits(gb, 4); // mono_mixdown_tag
+    if (get_bits1(gb))
+        skip_bits(gb, 4); // stereo_mixdown_tag
+
+    if (get_bits1(gb))
+        skip_bits(gb, 3); // mixdown_coeff_index and pseudo_surround
+
+    /* Front, side, back and coupling entries take 5 bits each (one flag
+     * bit plus a 4 bit tag); LFE and associated data entries take 4 bits
+     * each. Make sure all of them are available before reading. */
+    if (get_bits_left(gb) < 5 * (num_front + num_side + num_back + num_cc) +
+                            4 * (num_lfe + num_assoc_data)) {
+        av_log(avctx, AV_LOG_ERROR, "decode_pce: " overread_err);
+        return AVERROR_INVALIDDATA;
+    }
+    decode_channel_map(layout_map       , AAC_CHANNEL_FRONT, gb, num_front);
+    tags = num_front;
+    decode_channel_map(layout_map + tags, AAC_CHANNEL_SIDE,  gb, num_side);
+    tags += num_side;
+    decode_channel_map(layout_map + tags, AAC_CHANNEL_BACK,  gb, num_back);
+    tags += num_back;
+    decode_channel_map(layout_map + tags, AAC_CHANNEL_LFE,   gb, num_lfe);
+    tags += num_lfe;
+
+    // Associated data element tags are not used, only skipped.
+    skip_bits_long(gb, 4 * num_assoc_data);
+
+    decode_channel_map(layout_map + tags, AAC_CHANNEL_CC,    gb, num_cc);
+    tags += num_cc;
+
+    align_get_bits(gb);
+
+    /* comment field, first byte is length */
+    comment_len = get_bits(gb, 8) * 8;
+    if (get_bits_left(gb) < comment_len) {
+        av_log(avctx, AV_LOG_ERROR, "decode_pce: " overread_err);
+        return AVERROR_INVALIDDATA;
+    }
+    skip_bits_long(gb, comment_len);
+    return tags;
+}
+
+/**
+ * Decode GA "General Audio" specific configuration; reference: table 4.1.
+ *
+ * The channel layout comes either from an in-band program configuration
+ * element (channel_config == 0) or from the predefined configuration
+ * selected by channel_config.
+ *
+ * @param   ac          pointer to AACContext, may be null; when non-null
+ *                      the decoded layout is applied via output_configure()
+ * @param   avctx       pointer to AVCodecContext, used for logging
+ * @param   gb          bit reader positioned at the frameLengthFlag
+ * @param   m4ac        audio configuration; object_type and sbr are read,
+ *                      frame_length_short and ps are written
+ * @param   channel_config channel configuration index, 0 selects the PCE
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int decode_ga_specific_config(AACContext *ac, AVCodecContext *avctx,
+                                     GetBitContext *gb,
+                                     MPEG4AudioConfig *m4ac,
+                                     int channel_config)
+{
+    int extension_flag, ret, ep_config, res_flags;
+    uint8_t layout_map[MAX_ELEM_ID*4][3];
+    int tags = 0;
+
+    if (get_bits1(gb)) { // frameLengthFlag
+        avpriv_request_sample(avctx, "960/120 MDCT window");
+        return AVERROR_PATCHWELCOME;
+    }
+    m4ac->frame_length_short = 0;
+
+    if (get_bits1(gb))       // dependsOnCoreCoder
+        skip_bits(gb, 14);   // coreCoderDelay
+    extension_flag = get_bits1(gb);
+
+    if (m4ac->object_type == AOT_AAC_SCALABLE ||
+        m4ac->object_type == AOT_ER_AAC_SCALABLE)
+        skip_bits(gb, 3);     // layerNr
+
+    if (channel_config == 0) { // layout is carried by a PCE in the config
+        skip_bits(gb, 4);  // element_instance_tag
+        tags = decode_pce(avctx, m4ac, layout_map, gb);
+        if (tags < 0)
+            return tags;
+    } else {
+        if ((ret = set_default_channel_config(avctx, layout_map,
+                                              &tags, channel_config)))
+            return ret;
+    }
+
+    if (count_channels(layout_map, tags) > 1) { // more than one channel: clear the PS flag
+        m4ac->ps = 0;
+    } else if (m4ac->sbr == 1 && m4ac->ps == -1) // ps still undecided (-1) with SBR signalled
+        m4ac->ps = 1;
+
+    if (ac && (ret = output_configure(ac, layout_map, tags, OC_GLOBAL_HDR, 0)))
+        return ret;
+
+    if (extension_flag) {
+        switch (m4ac->object_type) {
+        case AOT_ER_BSAC:
+            skip_bits(gb, 5);    // numOfSubFrame
+            skip_bits(gb, 11);   // layer_length
+            break;
+        case AOT_ER_AAC_LC:
+        case AOT_ER_AAC_LTP:
+        case AOT_ER_AAC_SCALABLE:
+        case AOT_ER_AAC_LD:
+            res_flags = get_bits(gb, 3); // any set resilience flag is unsupported
+            if (res_flags) {
+                avpriv_report_missing_feature(avctx,
+                                              "AAC data resilience (flags %x)",
+                                              res_flags);
+                return AVERROR_PATCHWELCOME;
+            }
+            break;
+        }
+        skip_bits1(gb);    // extensionFlag3 (TBD in version 3)
+    }
+    switch (m4ac->object_type) { // only the listed ER object types read epConfig here
+    case AOT_ER_AAC_LC:
+    case AOT_ER_AAC_LTP:
+    case AOT_ER_AAC_SCALABLE:
+    case AOT_ER_AAC_LD:
+        ep_config = get_bits(gb, 2);
+        if (ep_config) {
+            avpriv_report_missing_feature(avctx,
+                                          "epConfig %d", ep_config);
+            return AVERROR_PATCHWELCOME;
+        }
+    }
+    return 0;
+}
+/**
+ * Decode the ELD specific configuration.
+ *
+ * Clears the ps and sbr fields of m4ac, rejects configurations with data
+ * resilience flags, low delay SBR or a non-zero epConfig, skips all
+ * extension payloads and applies the predefined channel configuration.
+ *
+ * @param   ac          pointer to AACContext, may be null; when non-null
+ *                      the layout is applied via output_configure()
+ * @param   avctx       pointer to AVCodecContext, used for logging
+ * @param   gb          bit reader positioned at the frameLengthFlag
+ * @param   m4ac        audio configuration to update
+ * @param   channel_config predefined channel configuration index
+ *
+ * @return  0 on success, a non-zero error code otherwise
+ */
+static int decode_eld_specific_config(AACContext *ac, AVCodecContext *avctx,
+                                     GetBitContext *gb,
+                                     MPEG4AudioConfig *m4ac,
+                                     int channel_config)
+{
+    int ret, ep_config, res_flags;
+    uint8_t layout_map[MAX_ELEM_ID*4][3];
+    int tags = 0;
+    const int ELDEXT_TERM = 0; // extension type value that ends the extension loop
+
+    m4ac->ps  = 0;
+    m4ac->sbr = 0;
+#if USE_FIXED
+    if (get_bits1(gb)) { // frameLengthFlag
+        avpriv_request_sample(avctx, "960/120 MDCT window");
+        return AVERROR_PATCHWELCOME;
+    }
+#else
+    m4ac->frame_length_short = get_bits1(gb); // same bit as the frameLengthFlag read in the USE_FIXED branch
+#endif
+    res_flags = get_bits(gb, 3);
+    if (res_flags) {
+        avpriv_report_missing_feature(avctx,
+                                      "AAC data resilience (flags %x)",
+                                      res_flags);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    if (get_bits1(gb)) { // ldSbrPresentFlag
+        avpriv_report_missing_feature(avctx,
+                                      "Low Delay SBR");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    while (get_bits(gb, 4) != ELDEXT_TERM) { // 4 bit extension type
+        int len = get_bits(gb, 4); // payload length in bytes, extended by the escapes below
+        if (len == 15)
+            len += get_bits(gb, 8);
+        if (len == 15 + 255)
+            len += get_bits(gb, 16);
+        if (get_bits_left(gb) < len * 8 + 4) { // payload plus the next 4 bit type must fit
+            av_log(avctx, AV_LOG_ERROR, overread_err);
+            return AVERROR_INVALIDDATA;
+        }
+        skip_bits_long(gb, 8 * len);
+    }
+
+    if ((ret = set_default_channel_config(avctx, layout_map,
+                                          &tags, channel_config)))
+        return ret;
+
+    if (ac && (ret = output_configure(ac, layout_map, tags, OC_GLOBAL_HDR, 0)))
+        return ret;
+
+    ep_config = get_bits(gb, 2);
+    if (ep_config) {
+        avpriv_report_missing_feature(avctx,
+                                      "epConfig %d", ep_config);
+        return AVERROR_PATCHWELCOME;
+    }
+    return 0;
+}
+
+/**
+ * Decode audio specific configuration; reference: table 1.13.
+ *
+ * The common header is parsed by avpriv_mpeg4audio_get_config(); the
+ * object type specific part is then handed to the GA or ELD parser.
+ *
+ * @param   ac          pointer to AACContext, may be null
+ * @param   avctx       pointer to AVCodecContext, used for logging
+ * @param   m4ac        pointer to MPEG4AudioConfig, used for parsing
+ * @param   data        pointer to buffer holding an audio specific config
+ * @param   bit_size    size of audio specific config or data in bits;
+ *                      must be in the range 0..INT_MAX
+ * @param   sync_extension look for an appended sync extension
+ *
+ * @return  Returns error status or number of consumed bits. <0 - error
+ */
+static int decode_audio_specific_config(AACContext *ac,
+                                        AVCodecContext *avctx,
+                                        MPEG4AudioConfig *m4ac,
+                                        const uint8_t *data, int64_t bit_size,
+                                        int sync_extension)
+{
+    GetBitContext gb;
+    int i, ret;
+
+    if (bit_size < 0 || bit_size > INT_MAX) {
+        av_log(avctx, AV_LOG_ERROR, "Audio specific config size is invalid\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    ff_dlog(avctx, "audio specific config size %d\n", (int)bit_size >> 3);
+    for (i = 0; i < bit_size >> 3; i++) // debug dump of the whole bytes of the config
+        ff_dlog(avctx, "%02x ", data[i]);
+    ff_dlog(avctx, "\n");
+
+    if ((ret = init_get_bits(&gb, data, bit_size)) < 0)
+        return ret;
+
+    if ((i = avpriv_mpeg4audio_get_config(m4ac, data, bit_size,
+                                          sync_extension)) < 0)
+        return AVERROR_INVALIDDATA;
+    if (m4ac->sampling_index > 12) {
+        av_log(avctx, AV_LOG_ERROR,
+               "invalid sampling rate index %d\n",
+               m4ac->sampling_index);
+        return AVERROR_INVALIDDATA;
+    }
+    if (m4ac->object_type == AOT_ER_AAC_LD &&
+        (m4ac->sampling_index < 3 || m4ac->sampling_index > 7)) { // LD accepts indices 3..7 only
+        av_log(avctx, AV_LOG_ERROR,
+               "invalid low delay sampling rate index %d\n",
+               m4ac->sampling_index);
+        return AVERROR_INVALIDDATA;
+    }
+
+    skip_bits_long(&gb, i); // i is the non-negative result of avpriv_mpeg4audio_get_config(), presumably the bits it parsed
+
+    switch (m4ac->object_type) {
+    case AOT_AAC_MAIN:
+    case AOT_AAC_LC:
+    case AOT_AAC_LTP:
+    case AOT_ER_AAC_LC:
+    case AOT_ER_AAC_LD:
+        if ((ret = decode_ga_specific_config(ac, avctx, &gb,
+                                            m4ac, m4ac->chan_config)) < 0)
+            return ret;
+        break;
+    case AOT_ER_AAC_ELD:
+        if ((ret = decode_eld_specific_config(ac, avctx, &gb,
+                                              m4ac, m4ac->chan_config)) < 0)
+            return ret;
+        break;
+    default: // every other object type is reported as unsupported
+        avpriv_report_missing_feature(avctx,
+                                      "Audio object type %s%d",
+                                      m4ac->sbr == 1 ? "SBR+" : "",
+                                      m4ac->object_type);
+        return AVERROR(ENOSYS);
+    }
+
+    ff_dlog(avctx,
+            "AOT %d chan config %d sampling index %d (%d) SBR %d PS %d\n",
+            m4ac->object_type, m4ac->chan_config, m4ac->sampling_index,
+            m4ac->sample_rate, m4ac->sbr,
+            m4ac->ps);
+
+    return get_bits_count(&gb);
+}
+
+/**
+ * Advance a linear congruential pseudorandom number generator by one step.
+ *
+ * @param   previous_val    current state of the generator
+ *
+ * @return  the next state, reinterpreted as a signed integer
+ */
+static av_always_inline int lcg_random(unsigned previous_val)
+{
+    union {
+        unsigned as_unsigned;
+        int      as_signed;
+    } next;
+
+    next.as_unsigned = previous_val * 1664525u + 1013904223;
+    return next.as_signed;
+}
+
+/**
+ * Reset each of the MAX_PREDICTORS predictor states in ps.
+ */
+static void reset_all_predictors(PredictorState *ps)
+{
+    int idx = 0;
+
+    while (idx < MAX_PREDICTORS) {
+        reset_predict_state(&ps[idx]);
+        idx++;
+    }
+}
+
+/**
+ * Map a sample rate in Hz to a sampling index in the range 0..11.
+ *
+ * The result is the position of the first threshold, from highest to
+ * lowest, that the rate reaches; rates below every threshold give 11.
+ */
+static int sample_rate_idx (int rate)
+{
+    static const int lower_bounds[] = {
+        92017, 75132, 55426, 46009, 37566, 27713,
+        23004, 18783, 13856, 11502,  9391
+    };
+    const int bound_count = sizeof(lower_bounds) / sizeof(lower_bounds[0]);
+    int idx;
+
+    for (idx = 0; idx < bound_count; idx++) {
+        if (rate >= lower_bounds[idx])
+            break;
+    }
+    return idx;
+}
+
+/**
+ * Reset every 30th predictor state, starting at index group_num - 1.
+ */
+static void reset_predictor_group(PredictorState *ps, int group_num)
+{
+    int idx = group_num - 1;
+
+    while (idx < MAX_PREDICTORS) {
+        reset_predict_state(&ps[idx]);
+        idx += 30;
+    }
+}
+/*
+ * Initialise vlc_spectral[num] through INIT_VLC_STATIC from the entries
+ * ff_aac_spectral_sizes[num], ff_aac_spectral_bits[num] and
+ * ff_aac_spectral_codes[num]; `size` is forwarded as the last argument.
+ * The expansion already ends in a semicolon.
+ */
+#define AAC_INIT_VLC_STATIC(num, size)                                     \
+    INIT_VLC_STATIC(&vlc_spectral[num], 8, ff_aac_spectral_sizes[num],     \
+         ff_aac_spectral_bits[num], sizeof(ff_aac_spectral_bits[num][0]),  \
+                                    sizeof(ff_aac_spectral_bits[num][0]),  \
+        ff_aac_spectral_codes[num], sizeof(ff_aac_spectral_codes[num][0]), \
+                                    sizeof(ff_aac_spectral_codes[num][0]), \
+        size);
+
+static void aacdec_init(AACContext *ac);
+/**
+ * Initialise the static tables used by the decoder: the eleven spectral
+ * VLCs, the scalefactor VLC, the SBR tables, the KBD and sine windows and
+ * the cbrt table.
+ */
+static av_cold void aac_static_table_init(void)
+{
+    AAC_INIT_VLC_STATIC( 0, 304); // second argument is forwarded as the macro's `size`
+    AAC_INIT_VLC_STATIC( 1, 270);
+    AAC_INIT_VLC_STATIC( 2, 550);
+    AAC_INIT_VLC_STATIC( 3, 300);
+    AAC_INIT_VLC_STATIC( 4, 328);
+    AAC_INIT_VLC_STATIC( 5, 294);
+    AAC_INIT_VLC_STATIC( 6, 306);
+    AAC_INIT_VLC_STATIC( 7, 268);
+    AAC_INIT_VLC_STATIC( 8, 510);
+    AAC_INIT_VLC_STATIC( 9, 366);
+    AAC_INIT_VLC_STATIC(10, 462);
+
+    AAC_RENAME(ff_aac_sbr_init)();
+
+    ff_aac_tableinit();
+
+    INIT_VLC_STATIC(&vlc_scalefactors, 7,
+                    FF_ARRAY_ELEMS(ff_aac_scalefactor_code),
+                    ff_aac_scalefactor_bits,
+                    sizeof(ff_aac_scalefactor_bits[0]),
+                    sizeof(ff_aac_scalefactor_bits[0]),
+                    ff_aac_scalefactor_code,
+                    sizeof(ff_aac_scalefactor_code[0]),
+                    sizeof(ff_aac_scalefactor_code[0]),
+                    352);
+
+    // window initialization
+    AAC_RENAME(ff_kbd_window_init)(AAC_RENAME(ff_aac_kbd_long_1024), 4.0, 1024);
+    AAC_RENAME(ff_kbd_window_init)(AAC_RENAME(ff_aac_kbd_short_128), 6.0, 128);
+    AAC_RENAME(ff_init_ff_sine_windows)(10);
+    AAC_RENAME(ff_init_ff_sine_windows)( 9);
+    AAC_RENAME(ff_init_ff_sine_windows)( 7);
+
+    AAC_RENAME(ff_cbrt_tableinit)();
+}
+/* Once-control passed to ff_thread_once() together with aac_static_table_init(). */
+static AVOnce aac_table_init = AV_ONCE_INIT;
+/**
+ * Initialise the decoder context.
+ *
+ * Runs the static table initialisation through ff_thread_once(), selects
+ * the sample format, configures the output either from the extradata or,
+ * when there is none, from the sample rate and channel count in avctx,
+ * and sets up the DSP and MDCT contexts.
+ *
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+static av_cold int aac_decode_init(AVCodecContext *avctx)
+{
+    AACContext *ac = avctx->priv_data;
+    int ret;
+
+    ret = ff_thread_once(&aac_table_init, &aac_static_table_init);
+    if (ret != 0)
+        return AVERROR_UNKNOWN;
+
+    ac->avctx = avctx;
+    ac->oc[1].m4ac.sample_rate = avctx->sample_rate;
+
+    aacdec_init(ac);
+#if USE_FIXED
+    avctx->sample_fmt = AV_SAMPLE_FMT_S32P;
+#else
+    avctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
+#endif /* USE_FIXED */
+
+    if (avctx->extradata_size > 0) {
+        if ((ret = decode_audio_specific_config(ac, ac->avctx, &ac->oc[1].m4ac,
+                                                avctx->extradata,
+                                                avctx->extradata_size * 8LL,
+                                                1)) < 0)
+            return ret;
+    } else { // no extradata: derive a configuration from the avctx fields
+        int sr, i;
+        uint8_t layout_map[MAX_ELEM_ID*4][3];
+        int layout_map_tags;
+
+        sr = sample_rate_idx(avctx->sample_rate);
+        ac->oc[1].m4ac.sampling_index = sr;
+        ac->oc[1].m4ac.channels = avctx->channels;
+        ac->oc[1].m4ac.sbr = -1;
+        ac->oc[1].m4ac.ps = -1;
+
+        for (i = 0; i < FF_ARRAY_ELEMS(ff_mpeg4audio_channels); i++)
+            if (ff_mpeg4audio_channels[i] == avctx->channels)
+                break;
+        if (i == FF_ARRAY_ELEMS(ff_mpeg4audio_channels)) { // no table entry matches avctx->channels
+            i = 0;
+        }
+        ac->oc[1].m4ac.chan_config = i;
+
+        if (ac->oc[1].m4ac.chan_config) {
+            int ret = set_default_channel_config(avctx, layout_map, // NOTE: shadows the outer ret
+                &layout_map_tags, ac->oc[1].m4ac.chan_config);
+            if (!ret)
+                output_configure(ac, layout_map, layout_map_tags,
+                                 OC_GLOBAL_HDR, 0); // return value is not checked here
+            else if (avctx->err_recognition & AV_EF_EXPLODE)
+                return AVERROR_INVALIDDATA;
+        }
+    }
+
+    if (avctx->channels > MAX_CHANNELS) {
+        av_log(avctx, AV_LOG_ERROR, "Too many channels\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+#if USE_FIXED
+    ac->fdsp = avpriv_alloc_fixed_dsp(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+#else
+    ac->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+#endif /* USE_FIXED */
+    if (!ac->fdsp) {
+        return AVERROR(ENOMEM);
+    }
+
+    ac->random_state = 0x1f2e3d4c;
+
+    AAC_RENAME_32(ff_mdct_init)(&ac->mdct,       11, 1, 1.0 / RANGE15(1024.0)); // results of these four init calls are not checked
+    AAC_RENAME_32(ff_mdct_init)(&ac->mdct_ld,    10, 1, 1.0 / RANGE15(512.0));
+    AAC_RENAME_32(ff_mdct_init)(&ac->mdct_small,  8, 1, 1.0 / RANGE15(128.0));
+    AAC_RENAME_32(ff_mdct_init)(&ac->mdct_ltp,   11, 0, RANGE15(-2.0));
+#if !USE_FIXED
+    ret = ff_imdct15_init(&ac->mdct480, 5); // only set up in the non-fixed-point build
+    if (ret < 0)
+        return ret;
+#endif
+
+    return 0;
+}
+
+/**
+ * Skip data_stream_element; reference: table 4.10.
+ *
+ * @return 0 on success, AVERROR_INVALIDDATA when the announced payload
+ *         is longer than the remaining input
+ */
+static int skip_data_stream_element(AACContext *ac, GetBitContext *gb)
+{
+    const int align_requested = get_bits1(gb);
+    int payload_bytes         = get_bits(gb, 8);
+
+    // A length of 255 is followed by one more length byte.
+    if (payload_bytes == 255)
+        payload_bytes += get_bits(gb, 8);
+    if (align_requested)
+        align_get_bits(gb);
+
+    if (8 * payload_bytes > get_bits_left(gb)) {
+        av_log(ac->avctx, AV_LOG_ERROR, "skip_data_stream_element: "overread_err);
+        return AVERROR_INVALIDDATA;
+    }
+
+    skip_bits_long(gb, 8 * payload_bytes);
+    return 0;
+}
+
+/**
+ * Read the prediction side info of one channel: an optional 5 bit
+ * predictor reset group followed by one prediction_used flag per band.
+ *
+ * The number of flags is the smaller of ics->max_sfb and the
+ * ff_aac_pred_sfb_max entry for the current sampling index.
+ *
+ * @return 0 on success, AVERROR_INVALIDDATA when the reset group is
+ *         outside 1..30
+ */
+static int decode_prediction(AACContext *ac, IndividualChannelStream *ics,
+                             GetBitContext *gb)
+{
+    const int reset_signalled = get_bits1(gb);
+    int band, band_limit;
+
+    if (reset_signalled) {
+        ics->predictor_reset_group = get_bits(gb, 5);
+        if (!ics->predictor_reset_group ||
+            ics->predictor_reset_group > 30) {
+            av_log(ac->avctx, AV_LOG_ERROR,
+                   "Invalid Predictor Reset Group.\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    band_limit = FFMIN(ics->max_sfb,
+                       ff_aac_pred_sfb_max[ac->oc[1].m4ac.sampling_index]);
+    for (band = 0; band < band_limit; band++)
+        ics->prediction_used[band] = get_bits1(gb);
+
+    return 0;
+}
+
+/**
+ * Decode Long Term Prediction data; reference: table 4.xx.
+ *
+ * Reads an 11 bit lag, a 3 bit index into ltp_coef and one `used` flag
+ * for each of the first min(max_sfb, MAX_LTP_LONG_SFB) bands.
+ */
+static void decode_ltp(LongTermPrediction *ltp,
+                       GetBitContext *gb, uint8_t max_sfb)
+{
+    const int band_count = FFMIN(max_sfb, MAX_LTP_LONG_SFB);
+    int band;
+
+    ltp->lag  = get_bits(gb, 11);
+    ltp->coef = ltp_coef[get_bits(gb, 3)];
+
+    for (band = 0; band < band_count; band++)
+        ltp->used[band] = get_bits1(gb);
+}
+
+/**
+ * Decode Individual Channel Stream info; reference: table 4.6.
+ *
+ * Parses the window sequence and shape (absent for ER AAC ELD), max_sfb,
+ * the short-window grouping bits and the prediction / LTP side info, and
+ * selects the scalefactor band tables for the current window length and
+ * sampling index.
+ *
+ * @param   ac   decoder context; ac->oc[1].m4ac supplies the object type,
+ *               sampling index and frame_length_short
+ * @param   ics  channel stream info that is filled in
+ * @param   gb   bitstream reader
+ *
+ * @return  0 on success, a negative AVERROR code otherwise. Every failure
+ *          detected after max_sfb has been read resets ics->max_sfb to 0.
+ */
+static int decode_ics_info(AACContext *ac, IndividualChannelStream *ics,
+                           GetBitContext *gb)
+{
+    const MPEG4AudioConfig *const m4ac = &ac->oc[1].m4ac;
+    const int aot = m4ac->object_type;
+    const int sampling_index = m4ac->sampling_index;
+    /* Error code handed back by the shared cleanup path at "fail". */
+    int ret_fail = AVERROR_INVALIDDATA;
+
+    if (aot != AOT_ER_AAC_ELD) {
+        if (get_bits1(gb)) {
+            av_log(ac->avctx, AV_LOG_ERROR, "Reserved bit set.\n");
+            /* Only an error when AV_EF_BITSTREAM is set in err_recognition. */
+            if (ac->avctx->err_recognition & AV_EF_BITSTREAM)
+                return AVERROR_INVALIDDATA;
+        }
+        /* Slot [1] keeps the previous value, slot [0] receives the new one. */
+        ics->window_sequence[1] = ics->window_sequence[0];
+        ics->window_sequence[0] = get_bits(gb, 2);
+        if (aot == AOT_ER_AAC_LD &&
+            ics->window_sequence[0] != ONLY_LONG_SEQUENCE) {
+            av_log(ac->avctx, AV_LOG_ERROR,
+                   "AAC LD is only defined for ONLY_LONG_SEQUENCE but "
+                   "window sequence %d found.\n", ics->window_sequence[0]);
+            ics->window_sequence[0] = ONLY_LONG_SEQUENCE;
+            return AVERROR_INVALIDDATA;
+        }
+        ics->use_kb_window[1]   = ics->use_kb_window[0];
+        ics->use_kb_window[0]   = get_bits1(gb);
+    }
+    ics->num_window_groups  = 1;
+    ics->group_len[0]       = 1;
+    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        int i;
+        ics->max_sfb = get_bits(gb, 4);
+        /* Seven grouping bits: a set bit extends the current group by one
+         * window, a clear bit opens a new group of length one. */
+        for (i = 0; i < 7; i++) {
+            if (get_bits1(gb)) {
+                ics->group_len[ics->num_window_groups - 1]++;
+            } else {
+                ics->num_window_groups++;
+                ics->group_len[ics->num_window_groups - 1] = 1;
+            }
+        }
+        ics->num_windows       = 8;
+        ics->swb_offset        =    ff_swb_offset_128[sampling_index];
+        ics->num_swb           =   ff_aac_num_swb_128[sampling_index];
+        ics->tns_max_bands     = ff_tns_max_bands_128[sampling_index];
+        ics->predictor_present = 0;
+    } else {
+        ics->max_sfb           = get_bits(gb, 6);
+        ics->num_windows       = 1;
+        if (aot == AOT_ER_AAC_LD || aot == AOT_ER_AAC_ELD) {
+            if (m4ac->frame_length_short) {
+                ics->swb_offset    =     ff_swb_offset_480[sampling_index];
+                ics->num_swb       =    ff_aac_num_swb_480[sampling_index];
+                ics->tns_max_bands =  ff_tns_max_bands_480[sampling_index];
+            } else {
+                ics->swb_offset    =     ff_swb_offset_512[sampling_index];
+                ics->num_swb       =    ff_aac_num_swb_512[sampling_index];
+                ics->tns_max_bands =  ff_tns_max_bands_512[sampling_index];
+            }
+            if (!ics->num_swb || !ics->swb_offset) {
+                /* max_sfb was already read; leave through the cleanup path
+                 * so it is not kept alongside an unusable band table. */
+                ret_fail = AVERROR_BUG;
+                goto fail;
+            }
+        } else {
+            ics->swb_offset    =    ff_swb_offset_1024[sampling_index];
+            ics->num_swb       =   ff_aac_num_swb_1024[sampling_index];
+            ics->tns_max_bands = ff_tns_max_bands_1024[sampling_index];
+        }
+        if (aot != AOT_ER_AAC_ELD) {
+            ics->predictor_present     = get_bits1(gb);
+            ics->predictor_reset_group = 0;
+        }
+        if (ics->predictor_present) {
+            if (aot == AOT_AAC_MAIN) {
+                if (decode_prediction(ac, ics, gb)) {
+                    goto fail;
+                }
+            } else if (aot == AOT_AAC_LC ||
+                       aot == AOT_ER_AAC_LC) {
+                av_log(ac->avctx, AV_LOG_ERROR,
+                       "Prediction is not allowed in AAC-LC.\n");
+                goto fail;
+            } else {
+                if (aot == AOT_ER_AAC_LD) {
+                    av_log(ac->avctx, AV_LOG_ERROR,
+                           "LTP in ER AAC LD not yet implemented.\n");
+                    /* max_sfb has not been checked against num_swb yet, so
+                     * it must be reset rather than left behind as read. */
+                    ret_fail = AVERROR_PATCHWELCOME;
+                    goto fail;
+                }
+                if ((ics->ltp.present = get_bits(gb, 1)))
+                    decode_ltp(&ics->ltp, gb, ics->max_sfb);
+            }
+        }
+    }
+
+    if (ics->max_sfb > ics->num_swb) {
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "Number of scalefactor bands in group (%d) "
+               "exceeds limit (%d).\n",
+               ics->max_sfb, ics->num_swb);
+        goto fail;
+    }
+
+    return 0;
+fail:
+    /* Zero bands keeps every later per-band loop from running on bad data. */
+    ics->max_sfb = 0;
+    return ret_fail;
+}
+
+/**
+ * Decode band types (section_data payload); reference: table 4.46.
+ *
+ * For every window group the scalefactor bands are split into sections.
+ * Each section carries one 4-bit band type and a length coded as a sum of
+ * fixed-width increments; an all-ones increment means another one follows.
+ *
+ * @param   band_type           array of the used band type
+ * @param   band_type_run_end   array of the last scalefactor band of a band type run
+ * @param   ics                 supplies window sequence, group count and max_sfb
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int decode_band_types(AACContext *ac, enum BandType band_type[120],
+                             int band_type_run_end[120], GetBitContext *gb,
+                             IndividualChannelStream *ics)
+{
+    const int is_short    = ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE;
+    const int incr_bits   = is_short ? 3 : 5;
+    const int incr_escape = (1 << incr_bits) - 1;
+    int grp, out = 0;
+
+    for (grp = 0; grp < ics->num_window_groups; grp++) {
+        int sfb = 0;
+
+        while (sfb < ics->max_sfb) {
+            uint8_t section_end = sfb;
+            int incr;
+            int section_type = get_bits(gb, 4);
+
+            if (section_type == 12) {
+                av_log(ac->avctx, AV_LOG_ERROR, "invalid band type\n");
+                return AVERROR_INVALIDDATA;
+            }
+
+            /* Accumulate length increments until a non-escape one shows up. */
+            for (;;) {
+                incr         = get_bits(gb, incr_bits);
+                section_end += incr;
+                if (get_bits_left(gb) < 0) {
+                    av_log(ac->avctx, AV_LOG_ERROR, "decode_band_types: "overread_err);
+                    return AVERROR_INVALIDDATA;
+                }
+                if (section_end > ics->max_sfb) {
+                    av_log(ac->avctx, AV_LOG_ERROR,
+                           "Number of bands (%d) exceeds limit (%d).\n",
+                           section_end, ics->max_sfb);
+                    return AVERROR_INVALIDDATA;
+                }
+                if (incr != incr_escape)
+                    break;
+            }
+
+            /* Stamp type and run end onto every band of the section. */
+            while (sfb < section_end) {
+                band_type        [out] = section_type;
+                band_type_run_end[out] = section_end;
+                out++;
+                sfb++;
+            }
+        }
+    }
+    return 0;
+}
+
+/**
+ * Decode scalefactors; reference: table 4.47.
+ *
+ * Scalefactors, noise gains and intensity stereo positions are each coded
+ * as differences against their own running value; every band type run uses
+ * the running value that belongs to its type.
+ *
+ * @param   sf                  array of scalefactors or intensity stereo positions
+ * @param   global_gain         first scalefactor value as scalefactors are differentially coded
+ * @param   ics                 supplies the window group count and max_sfb
+ * @param   band_type           array of the used band type
+ * @param   band_type_run_end   array of the last scalefactor band of a band type run
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int decode_scalefactors(AACContext *ac, INTFLOAT sf[120], GetBitContext *gb,
+                               unsigned int global_gain,
+                               IndividualChannelStream *ics,
+                               enum BandType band_type[120],
+                               int band_type_run_end[120])
+{
+    /* Running values: regular scalefactor, noise gain, intensity position. */
+    int sf_acc      = global_gain;
+    int noise_acc   = global_gain - NOISE_OFFSET;
+    int is_acc      = 0;
+    /* The very first noise gain is a fixed-width field, later ones use the VLC. */
+    int first_noise = 1;
+    int clipped;
+    int grp, band, pos = 0;
+
+    for (grp = 0; grp < ics->num_window_groups; grp++) {
+        band = 0;
+        while (band < ics->max_sfb) {
+            const int run_end = band_type_run_end[pos];
+
+            switch (band_type[pos]) {
+            case ZERO_BT:
+                while (band < run_end) {
+                    sf[pos] = FIXR(0.);
+                    band++;
+                    pos++;
+                }
+                break;
+
+            case INTENSITY_BT:
+            case INTENSITY_BT2:
+                while (band < run_end) {
+                    is_acc += get_vlc2(gb, vlc_scalefactors.table, 7, 3) - SCALE_DIFF_ZERO;
+                    clipped = av_clip(is_acc, -155, 100);
+                    if (is_acc != clipped) {
+                        avpriv_request_sample(ac->avctx,
+                                              "If you heard an audible artifact, there may be a bug in the decoder. "
+                                              "Clipped intensity stereo position (%d -> %d)",
+                                              is_acc, clipped);
+                    }
+#if USE_FIXED
+                    sf[pos] = 100 - clipped;
+#else
+                    sf[pos] = ff_aac_pow2sf_tab[-clipped + POW_SF2_ZERO];
+#endif /* USE_FIXED */
+                    band++;
+                    pos++;
+                }
+                break;
+
+            case NOISE_BT:
+                while (band < run_end) {
+                    if (first_noise) {
+                        first_noise = 0;
+                        noise_acc += get_bits(gb, NOISE_PRE_BITS) - NOISE_PRE;
+                    } else {
+                        noise_acc += get_vlc2(gb, vlc_scalefactors.table, 7, 3) - SCALE_DIFF_ZERO;
+                    }
+                    clipped = av_clip(noise_acc, -100, 155);
+                    if (noise_acc != clipped) {
+                        avpriv_request_sample(ac->avctx,
+                                              "If you heard an audible artifact, there may be a bug in the decoder. "
+                                              "Clipped noise gain (%d -> %d)",
+                                              noise_acc, clipped);
+                    }
+#if USE_FIXED
+                    sf[pos] = -(100 + clipped);
+#else
+                    sf[pos] = -ff_aac_pow2sf_tab[clipped + POW_SF2_ZERO];
+#endif /* USE_FIXED */
+                    band++;
+                    pos++;
+                }
+                break;
+
+            default:
+                while (band < run_end) {
+                    sf_acc += get_vlc2(gb, vlc_scalefactors.table, 7, 3) - SCALE_DIFF_ZERO;
+                    /* Unsigned compare: negative values are rejected as well. */
+                    if (sf_acc > 255U) {
+                        av_log(ac->avctx, AV_LOG_ERROR,
+                               "Scalefactor (%d) out of range.\n", sf_acc);
+                        return AVERROR_INVALIDDATA;
+                    }
+#if USE_FIXED
+                    sf[pos] = -sf_acc;
+#else
+                    sf[pos] = -ff_aac_pow2sf_tab[sf_acc - 100 + POW_SF2_ZERO];
+#endif /* USE_FIXED */
+                    band++;
+                    pos++;
+                }
+                break;
+            }
+        }
+    }
+    return 0;
+}
+
+/**
+ * Decode pulse data; reference: table 4.7.
+ *
+ * The first pulse position is an offset into the start band; every further
+ * position is an offset from the pulse before it.
+ *
+ * @param   pulse       receives pulse count, positions and amplitudes
+ * @param   swb_offset  scalefactor band offset table, indexed up to num_swb
+ * @param   num_swb     number of scalefactor bands
+ *
+ * @return  0 on success, -1 if the start band or a pulse position is out of range
+ */
+static int decode_pulses(Pulse *pulse, GetBitContext *gb,
+                         const uint16_t *swb_offset, int num_swb)
+{
+    int start_swb;
+    int n = 1;
+
+    pulse->num_pulse = get_bits(gb, 2) + 1;
+    start_swb        = get_bits(gb, 6);
+    if (start_swb >= num_swb)
+        return -1;
+
+    pulse->pos[0]  = swb_offset[start_swb];
+    pulse->pos[0] += get_bits(gb, 5);
+    if (pulse->pos[0] >= swb_offset[num_swb])
+        return -1;
+    pulse->amp[0] = get_bits(gb, 4);
+
+    while (n < pulse->num_pulse) {
+        pulse->pos[n] = get_bits(gb, 5) + pulse->pos[n - 1];
+        if (pulse->pos[n] >= swb_offset[num_swb])
+            return -1;
+        pulse->amp[n] = get_bits(gb, 4);
+        n++;
+    }
+    return 0;
+}
+
+/**
+ * Decode Temporal Noise Shaping data; reference: table 4.48.
+ *
+ * Field widths shrink for eight-short sequences, and the highest accepted
+ * filter order depends on the window type and on the audio object type.
+ *
+ * @param   tns  receives filter count, length, order, direction and coefficients per window
+ * @param   ics  supplies the window sequence and the number of windows
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int decode_tns(AACContext *ac, TemporalNoiseShaping *tns,
+                      GetBitContext *gb, const IndividualChannelStream *ics)
+{
+    const int is8 = ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE;
+    const int tns_max_order = is8 ? 7 : ac->oc[1].m4ac.object_type == AOT_AAC_MAIN ? 20 : 12;
+    int win, f, c;
+    int coef_res, coef_compress, coef_len;
+
+    for (win = 0; win < ics->num_windows; win++) {
+        tns->n_filt[win] = get_bits(gb, 2 - is8);
+        if (!tns->n_filt[win])
+            continue;
+
+        /* One resolution bit is shared by all filters of the window. */
+        coef_res = get_bits1(gb);
+
+        for (f = 0; f < tns->n_filt[win]; f++) {
+            int map_idx;
+
+            tns->length[win][f] = get_bits(gb, 6 - 2 * is8);
+            tns->order[win][f]  = get_bits(gb, 5 - 2 * is8);
+
+            if (tns->order[win][f] > tns_max_order) {
+                av_log(ac->avctx, AV_LOG_ERROR,
+                       "TNS filter order %d is greater than maximum %d.\n",
+                       tns->order[win][f], tns_max_order);
+                tns->order[win][f] = 0;
+                return AVERROR_INVALIDDATA;
+            }
+            if (!tns->order[win][f])
+                continue;
+
+            tns->direction[win][f] = get_bits1(gb);
+            coef_compress          = get_bits1(gb);
+            coef_len               = coef_res + 3 - coef_compress;
+            map_idx                = 2 * coef_compress + coef_res;
+
+            for (c = 0; c < tns->order[win][f]; c++)
+                tns->coef[win][f][c] = tns_tmp2_map[map_idx][get_bits(gb, coef_len)];
+        }
+    }
+    return 0;
+}
+
+/**
+ * Decode Mid/Side data; reference: table 4.54.
+ *
+ * The mask covers num_window_groups * max_sfb bands of channel 0. Any
+ * ms_present value other than 1 or 2 leaves the mask untouched.
+ *
+ * @param   ms_present  Indicates mid/side stereo presence. [0] mask is all 0s;
+ *                      [1] mask is decoded from bitstream; [2] mask is all 1s;
+ *                      [3] reserved for scalable AAC
+ */
+static void decode_mid_side_stereo(ChannelElement *cpe, GetBitContext *gb,
+                                   int ms_present)
+{
+    const int band_count = cpe->ch[0].ics.num_window_groups * cpe->ch[0].ics.max_sfb;
+    int band;
+
+    switch (ms_present) {
+    case 1:
+        /* One explicit flag per band. */
+        for (band = 0; band < band_count; band++)
+            cpe->ms_mask[band] = get_bits1(gb);
+        break;
+    case 2:
+        /* No flags transmitted: every band is marked. */
+        memset(cpe->ms_mask, 1, band_count * sizeof(cpe->ms_mask[0]));
+        break;
+    default:
+        break;
+    }
+}
+
+/**
+ * Decode spectral data; reference: table 4.50.
+ * Dequantize and scale spectral data; reference: 4.6.3.3.
+ *
+ * Processing order: clear everything above the last coded band, decode each
+ * band according to its band type, add the pulse data if present and, in
+ * the fixed-point build only, run a final pass that applies vector_pow43
+ * and subband_scale to the codebook-coded bands.
+ *
+ * @param   ac              decoder context (random state, DSP helpers, logging)
+ * @param   coef            array of dequantized, scaled spectral data
+ * @param   gb              bitstream reader
+ * @param   sf              array of scalefactors or intensity stereo positions
+ * @param   pulse_present   set if pulses are present
+ * @param   pulse           pointer to pulse data struct
+ * @param   ics             supplies window/group layout, max_sfb and swb_offset
+ * @param   band_type       array of the used band type
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int decode_spectrum_and_dequant(AACContext *ac, INTFLOAT coef[1024],
+                                       GetBitContext *gb, const INTFLOAT sf[120],
+                                       int pulse_present, const Pulse *pulse,
+                                       const IndividualChannelStream *ics,
+                                       enum BandType band_type[120])
+{
+    int i, k, g, idx = 0;
+    /* Coefficients per window: 1024 split evenly over num_windows. */
+    const int c = 1024 / ics->num_windows;
+    const uint16_t *offsets = ics->swb_offset;
+    /* coef is advanced per group below; keep the start for the later passes. */
+    INTFLOAT *coef_base = coef;
+
+    /* Zero every window from the end of the last coded band to its end. */
+    for (g = 0; g < ics->num_windows; g++)
+        memset(coef + g * 128 + offsets[ics->max_sfb], 0,
+               sizeof(INTFLOAT) * (c - offsets[ics->max_sfb]));
+
+    for (g = 0; g < ics->num_window_groups; g++) {
+        unsigned g_len = ics->group_len[g];
+
+        for (i = 0; i < ics->max_sfb; i++, idx++) {
+            /* Unsigned on purpose: a band type of 0 wraps to UINT_MAX and is
+             * caught by the first test below (presumably ZERO_BT is 0 --
+             * confirm against the enum). */
+            const unsigned cbt_m1 = band_type[idx] - 1;
+            INTFLOAT *cfo = coef + offsets[i];
+            int off_len = offsets[i + 1] - offsets[i];
+            int group;
+
+            if (cbt_m1 >= INTENSITY_BT2 - 1) {
+                /* Nothing is coded for these bands: clear them in every
+                 * window of the group (windows are 128 coefficients apart). */
+                for (group = 0; group < (AAC_SIGNE)g_len; group++, cfo+=128) {
+                    memset(cfo, 0, off_len * sizeof(*cfo));
+                }
+            } else if (cbt_m1 == NOISE_BT - 1) {
+                /* Noise band: fill with pseudo-random values, then scale
+                 * using the band energy and sf[idx]. */
+                for (group = 0; group < (AAC_SIGNE)g_len; group++, cfo+=128) {
+#if !USE_FIXED
+                    float scale;
+#endif /* !USE_FIXED */
+                    INTFLOAT band_energy;
+
+                    for (k = 0; k < off_len; k++) {
+                        ac->random_state  = lcg_random(ac->random_state);
+#if USE_FIXED
+                        cfo[k] = ac->random_state >> 3;
+#else
+                        cfo[k] = ac->random_state;
+#endif /* USE_FIXED */
+                    }
+
+#if USE_FIXED
+                    band_energy = ac->fdsp->scalarproduct_fixed(cfo, cfo, off_len);
+                    band_energy = fixed_sqrt(band_energy, 31);
+                    noise_scale(cfo, sf[idx], band_energy, off_len);
+#else
+                    band_energy = ac->fdsp->scalarproduct_float(cfo, cfo, off_len);
+                    scale = sf[idx] / sqrtf(band_energy);
+                    ac->fdsp->vector_fmul_scalar(cfo, cfo, scale, off_len);
+#endif /* USE_FIXED */
+                }
+            } else {
+                /* Codebook-coded band. The bit reader state is cached in
+                 * locals by OPEN_READER and written back by CLOSE_READER. */
+#if !USE_FIXED
+                const float *vq = ff_aac_codebook_vector_vals[cbt_m1];
+#endif /* !USE_FIXED */
+                const uint16_t *cb_vector_idx = ff_aac_codebook_vector_idx[cbt_m1];
+                VLC_TYPE (*vlc_tab)[2] = vlc_spectral[cbt_m1].table;
+                OPEN_READER(re, gb);
+
+                /* Codebooks are handled in pairs that share a decoding
+                 * scheme. The do/while loops below assume off_len is a
+                 * positive multiple of the step (4 or 2). */
+                switch (cbt_m1 >> 1) {
+                case 0:
+                    /* Four coefficients per codeword, no extra bits read. */
+                    for (group = 0; group < (AAC_SIGNE)g_len; group++, cfo+=128) {
+                        INTFLOAT *cf = cfo;
+                        int len = off_len;
+
+                        do {
+                            int code;
+                            unsigned cb_idx;
+
+                            UPDATE_CACHE(re, gb);
+                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
+                            cb_idx = cb_vector_idx[code];
+#if USE_FIXED
+                            cf = DEC_SQUAD(cf, cb_idx);
+#else
+                            cf = VMUL4(cf, vq, cb_idx, sf + idx);
+#endif /* USE_FIXED */
+                        } while (len -= 4);
+                    }
+                    break;
+
+                case 1:
+                    /* Four coefficients per codeword; nnz further bits are
+                     * consumed after it (presumably one sign bit per
+                     * nonzero coefficient). */
+                    for (group = 0; group < (AAC_SIGNE)g_len; group++, cfo+=128) {
+                        INTFLOAT *cf = cfo;
+                        int len = off_len;
+
+                        do {
+                            int code;
+                            unsigned nnz;
+                            unsigned cb_idx;
+                            uint32_t bits;
+
+                            UPDATE_CACHE(re, gb);
+                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
+                            cb_idx = cb_vector_idx[code];
+                            nnz = cb_idx >> 8 & 15;
+                            bits = nnz ? GET_CACHE(re, gb) : 0;
+                            LAST_SKIP_BITS(re, gb, nnz);
+#if USE_FIXED
+                            cf = DEC_UQUAD(cf, cb_idx, bits);
+#else
+                            cf = VMUL4S(cf, vq, cb_idx, bits, sf + idx);
+#endif /* USE_FIXED */
+                        } while (len -= 4);
+                    }
+                    break;
+
+                case 2:
+                    /* Two coefficients per codeword, no extra bits read. */
+                    for (group = 0; group < (AAC_SIGNE)g_len; group++, cfo+=128) {
+                        INTFLOAT *cf = cfo;
+                        int len = off_len;
+
+                        do {
+                            int code;
+                            unsigned cb_idx;
+
+                            UPDATE_CACHE(re, gb);
+                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
+                            cb_idx = cb_vector_idx[code];
+#if USE_FIXED
+                            cf = DEC_SPAIR(cf, cb_idx);
+#else
+                            cf = VMUL2(cf, vq, cb_idx, sf + idx);
+#endif /* USE_FIXED */
+                        } while (len -= 2);
+                    }
+                    break;
+
+                case 3:
+                case 4:
+                    /* Two coefficients per codeword; nnz further bits are
+                     * read and shifted by the top nibble of cb_idx. */
+                    for (group = 0; group < (AAC_SIGNE)g_len; group++, cfo+=128) {
+                        INTFLOAT *cf = cfo;
+                        int len = off_len;
+
+                        do {
+                            int code;
+                            unsigned nnz;
+                            unsigned cb_idx;
+                            unsigned sign;
+
+                            UPDATE_CACHE(re, gb);
+                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
+                            cb_idx = cb_vector_idx[code];
+                            nnz = cb_idx >> 8 & 15;
+                            sign = nnz ? SHOW_UBITS(re, gb, nnz) << (cb_idx >> 12) : 0;
+                            LAST_SKIP_BITS(re, gb, nnz);
+#if USE_FIXED
+                            cf = DEC_UPAIR(cf, cb_idx, sign);
+#else
+                            cf = VMUL2S(cf, vq, cb_idx, sign, sf + idx);
+#endif /* USE_FIXED */
+                        } while (len -= 2);
+                    }
+                    break;
+
+                default:
+                    /* Remaining codebook: pairs whose values may each be
+                     * followed by an escape sequence. */
+                    for (group = 0; group < (AAC_SIGNE)g_len; group++, cfo+=128) {
+#if USE_FIXED
+                        int *icf = cfo;
+                        int v;
+#else
+                        float *cf = cfo;
+                        /* Float build writes raw 32-bit patterns so the sign
+                         * can be OR-ed into bit 31. */
+                        uint32_t *icf = (uint32_t *) cf;
+#endif /* USE_FIXED */
+                        int len = off_len;
+
+                        do {
+                            int code;
+                            unsigned nzt, nnz;
+                            unsigned cb_idx;
+                            uint32_t bits;
+                            int j;
+
+                            UPDATE_CACHE(re, gb);
+                            GET_VLC(code, re, gb, vlc_tab, 8, 2);
+
+                            /* Code 0 is an all-zero pair. "continue" in a
+                             * do/while jumps to the condition, so len is
+                             * still decremented. */
+                            if (!code) {
+                                *icf++ = 0;
+                                *icf++ = 0;
+                                continue;
+                            }
+
+                            cb_idx = cb_vector_idx[code];
+                            nnz = cb_idx >> 12;
+                            nzt = cb_idx >> 8;
+                            /* Left-align the nnz bits so the next one to use
+                             * is always bit 31. */
+                            bits = SHOW_UBITS(re, gb, nnz) << (32-nnz);
+                            LAST_SKIP_BITS(re, gb, nnz);
+
+                            for (j = 0; j < 2; j++) {
+                                /* Bit j of nzt selects the escape path for
+                                 * value j of the pair. */
+                                if (nzt & 1<<j) {
+                                    uint32_t b;
+                                    int n;
+                                    /* The total length of escape_sequence must be < 22 bits according
+                                       to the specification (i.e. max is 111111110xxxxxxxxxxxx). */
+                                    UPDATE_CACHE(re, gb);
+                                    b = GET_CACHE(re, gb);
+                                    /* Number of leading one bits in the cache. */
+                                    b = 31 - av_log2(~b);
+
+                                    if (b > 8) {
+                                        av_log(ac->avctx, AV_LOG_ERROR, "error in spectral data, ESC overflow\n");
+                                        return AVERROR_INVALIDDATA;
+                                    }
+
+                                    /* Skip the ones and the terminating zero,
+                                     * then read b + 4 bits below an implicit
+                                     * leading one. */
+                                    SKIP_BITS(re, gb, b + 1);
+                                    b += 4;
+                                    n = (1 << b) + SHOW_UBITS(re, gb, b);
+                                    LAST_SKIP_BITS(re, gb, b);
+#if USE_FIXED
+                                    v = n;
+                                    if (bits & 1U<<31)
+                                        v = -v;
+                                    *icf++ = v;
+#else
+                                    *icf++ = ff_cbrt_tab[n] | (bits & 1U<<31);
+#endif /* USE_FIXED */
+                                    bits <<= 1;
+                                } else {
+#if USE_FIXED
+                                    v = cb_idx & 15;
+                                    if (bits & 1U<<31)
+                                        v = -v;
+                                    *icf++ = v;
+#else
+                                    unsigned v = ((const uint32_t*)vq)[cb_idx & 15];
+                                    *icf++ = (bits & 1U<<31) | v;
+#endif /* USE_FIXED */
+                                    /* A sign bit is only used up when the
+                                     * value is nonzero. */
+                                    bits <<= !!v;
+                                }
+                                cb_idx >>= 4;
+                            }
+                        } while (len -= 2);
+#if !USE_FIXED
+                        ac->fdsp->vector_fmul_scalar(cfo, cfo, sf[idx], off_len);
+#endif /* !USE_FIXED */
+                    }
+                }
+
+                CLOSE_READER(re, gb);
+            }
+        }
+        /* Advance to the first window of the next group (g_len * 128). */
+        coef += g_len << 7;
+    }
+
+    if (pulse_present) {
+        idx = 0;
+        for (i = 0; i < pulse->num_pulse; i++) {
+            INTFLOAT co = coef_base[ pulse->pos[i] ];
+            /* Find the band that contains this pulse position. */
+            while (offsets[idx + 1] <= pulse->pos[i])
+                idx++;
+            /* Noise bands and bands with a zero scalefactor get no pulse. */
+            if (band_type[idx] != NOISE_BT && sf[idx]) {
+                INTFLOAT ico = -pulse->amp[i];
+#if USE_FIXED
+                if (co) {
+                    ico = co + (co > 0 ? -ico : ico);
+                }
+                coef_base[ pulse->pos[i] ] = ico;
+#else
+                /* Undo scaling and the 4/3 power, add the amplitude away
+                 * from zero, then re-apply the 4/3 power and the scale. */
+                if (co) {
+                    co /= sf[idx];
+                    ico = co / sqrtf(sqrtf(fabsf(co))) + (co > 0 ? -ico : ico);
+                }
+                coef_base[ pulse->pos[i] ] = cbrtf(fabsf(ico)) * ico * sf[idx];
+#endif /* USE_FIXED */
+            }
+        }
+    }
+#if USE_FIXED
+    /* Fixed-point build only: second pass over all bands, applying
+     * vector_pow43 and subband_scale to those with a band type below NOISE_BT
+     * (a band type of 0 wraps in cbt_m1 and is skipped). */
+    coef = coef_base;
+    idx = 0;
+    for (g = 0; g < ics->num_window_groups; g++) {
+        unsigned g_len = ics->group_len[g];
+
+        for (i = 0; i < ics->max_sfb; i++, idx++) {
+            const unsigned cbt_m1 = band_type[idx] - 1;
+            int *cfo = coef + offsets[i];
+            int off_len = offsets[i + 1] - offsets[i];
+            int group;
+
+            if (cbt_m1 < NOISE_BT - 1) {
+                for (group = 0; group < (int)g_len; group++, cfo+=128) {
+                    ac->vector_pow43(cfo, off_len);
+                    ac->subband_scale(cfo, cfo, sf[idx], 34, off_len);
+                }
+            }
+        }
+        coef += g_len << 7;
+    }
+#endif /* USE_FIXED */
+    return 0;
+}
+
+/**
+ * Apply AAC-Main style frequency domain prediction.
+ *
+ * Predictors are reset once before first use. Eight-short frames reset all
+ * predictors; any other frame runs predict() on every coefficient of the
+ * predictable bands and then honours a pending group reset.
+ */
+static void apply_prediction(AACContext *ac, SingleChannelElement *sce)
+{
+    int band, coef_idx;
+    int band_limit;
+
+    if (!sce->ics.predictor_initialized) {
+        reset_all_predictors(sce->predictor_state);
+        sce->ics.predictor_initialized = 1;
+    }
+
+    if (sce->ics.window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        reset_all_predictors(sce->predictor_state);
+        return;
+    }
+
+    band_limit = ff_aac_pred_sfb_max[ac->oc[1].m4ac.sampling_index];
+    for (band = 0; band < band_limit; band++) {
+        const int band_start = sce->ics.swb_offset[band];
+        const int band_end   = sce->ics.swb_offset[band + 1];
+        /* predict() is called either way; this flag is its third argument. */
+        const int use_pred   = sce->ics.predictor_present &&
+                               sce->ics.prediction_used[band];
+
+        for (coef_idx = band_start; coef_idx < band_end; coef_idx++)
+            predict(&sce->predictor_state[coef_idx], &sce->coeffs[coef_idx],
+                    use_pred);
+    }
+
+    if (sce->ics.predictor_reset_group)
+        reset_predictor_group(sce->predictor_state,
+                              sce->ics.predictor_reset_group);
+}
+
+/**
+ * Decode an individual_channel_stream payload; reference: table 4.44.
+ *
+ * Reads the global gain, the ICS info (unless it is shared or scalable),
+ * band types, scalefactors, optional pulse / TNS / gain control side info
+ * and finally the spectral data. AAC Main prediction is applied here only
+ * when the channel does not use a common window.
+ *
+ * @param   ac              decoder context
+ * @param   sce             channel element whose ics, tns, sf, band types and coeffs are filled in
+ * @param   gb              bitstream reader
+ * @param   common_window   Channels have independent [0], or shared [1], Individual Channel Stream information.
+ * @param   scale_flag      scalable [1] or non-scalable [0] AAC (Unused until scalable AAC is implemented.)
+ *
+ * @return  0 on success, otherwise the negative AVERROR code of the step
+ *          that failed. On failure sce->tns.present is cleared.
+ */
+static int decode_ics(AACContext *ac, SingleChannelElement *sce,
+                      GetBitContext *gb, int common_window, int scale_flag)
+{
+    Pulse pulse;
+    TemporalNoiseShaping    *tns = &sce->tns;
+    IndividualChannelStream *ics = &sce->ics;
+    INTFLOAT *out = sce->coeffs;
+    int global_gain, eld_syntax, er_syntax, pulse_present = 0;
+    int ret;
+
+    eld_syntax = ac->oc[1].m4ac.object_type == AOT_ER_AAC_ELD;
+    er_syntax  = ac->oc[1].m4ac.object_type == AOT_ER_AAC_LC ||
+                 ac->oc[1].m4ac.object_type == AOT_ER_AAC_LTP ||
+                 ac->oc[1].m4ac.object_type == AOT_ER_AAC_LD ||
+                 ac->oc[1].m4ac.object_type == AOT_ER_AAC_ELD;
+
+    /* This assignment is to silence a GCC warning about the variable being used
+     * uninitialized when in fact it always is.
+     */
+    pulse.num_pulse = 0;
+
+    global_gain = get_bits(gb, 8);
+
+    if (!common_window && !scale_flag) {
+        /* Keep the callee's code (e.g. AVERROR_PATCHWELCOME) instead of
+         * flattening it to AVERROR_INVALIDDATA. */
+        ret = decode_ics_info(ac, ics, gb);
+        if (ret < 0)
+            goto fail;
+    }
+
+    if ((ret = decode_band_types(ac, sce->band_type,
+                                 sce->band_type_run_end, gb, ics)) < 0)
+        goto fail;
+    if ((ret = decode_scalefactors(ac, sce->sf, gb, global_gain, ics,
+                                  sce->band_type, sce->band_type_run_end)) < 0)
+        goto fail;
+
+    if (!scale_flag) {
+        if (!eld_syntax && (pulse_present = get_bits1(gb))) {
+            if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+                av_log(ac->avctx, AV_LOG_ERROR,
+                       "Pulse tool not allowed in eight short sequence.\n");
+                ret = AVERROR_INVALIDDATA;
+                goto fail;
+            }
+            if (decode_pulses(&pulse, gb, ics->swb_offset, ics->num_swb)) {
+                av_log(ac->avctx, AV_LOG_ERROR,
+                       "Pulse data corrupt or invalid.\n");
+                ret = AVERROR_INVALIDDATA;
+                goto fail;
+            }
+        }
+        tns->present = get_bits1(gb);
+        if (tns->present && !er_syntax) {
+            ret = decode_tns(ac, tns, gb, ics);
+            if (ret < 0)
+                goto fail;
+        }
+        if (!eld_syntax && get_bits1(gb)) {
+            avpriv_request_sample(ac->avctx, "SSR");
+            ret = AVERROR_PATCHWELCOME;
+            goto fail;
+        }
+        // I see no textual basis in the spec for this occurring after SSR gain
+        // control, but this is what both reference and real implementations do
+        if (tns->present && er_syntax) {
+            ret = decode_tns(ac, tns, gb, ics);
+            if (ret < 0)
+                goto fail;
+        }
+    }
+
+    ret = decode_spectrum_and_dequant(ac, out, gb, sce->sf, pulse_present,
+                                      &pulse, ics, sce->band_type);
+    if (ret < 0)
+        goto fail;
+
+    if (ac->oc[1].m4ac.object_type == AOT_AAC_MAIN && !common_window)
+        apply_prediction(ac, sce);
+
+    return 0;
+fail:
+    /* The TNS flag may already be set while its filter data is missing or
+     * only partly parsed; do not leave it marked as present. */
+    tns->present = 0;
+    return ret;
+}
+
+/**
+ * Mid/Side stereo decoding; reference: 4.6.8.1.3.
+ *
+ * Runs the butterfly DSP routine on both channels for every band whose mask
+ * flag is set and whose band type is below NOISE_BT in both channels. The
+ * band layout is taken from channel 0.
+ */
+static void apply_mid_side_stereo(AACContext *ac, ChannelElement *cpe)
+{
+    const IndividualChannelStream *ics = &cpe->ch[0].ics;
+    const uint16_t *swb = ics->swb_offset;
+    INTFLOAT *first  = cpe->ch[0].coeffs;
+    INTFLOAT *second = cpe->ch[1].coeffs;
+    int grp, sfb, win, band = 0;
+
+    for (grp = 0; grp < ics->num_window_groups; grp++) {
+        for (sfb = 0; sfb < ics->max_sfb; sfb++, band++) {
+            if (cpe->ms_mask[band] &&
+                cpe->ch[0].band_type[band] < NOISE_BT &&
+                cpe->ch[1].band_type[band] < NOISE_BT) {
+                /* Same band in every window of the group, 128 apart. */
+                for (win = 0; win < ics->group_len[grp]; win++) {
+#if USE_FIXED
+                    ac->fdsp->butterflies_fixed(first  + win * 128 + swb[sfb],
+                                                second + win * 128 + swb[sfb],
+                                                swb[sfb + 1] - swb[sfb]);
+#else
+                    ac->fdsp->butterflies_float(first  + win * 128 + swb[sfb],
+                                                second + win * 128 + swb[sfb],
+                                                swb[sfb + 1] - swb[sfb]);
+#endif /* USE_FIXED */
+                }
+            }
+        }
+        first  += ics->group_len[grp] * 128;
+        second += ics->group_len[grp] * 128;
+    }
+}
+
+/**
+ * intensity stereo decoding; reference: 4.6.8.2.3
+ *
+ * For each intensity-coded band of channel 1, the band of channel 0 is
+ * scaled by a signed gain and written into channel 1. Runs of any other
+ * band type are skipped as a whole.
+ *
+ * @param   ms_present  Indicates mid/side stereo presence. [0] mask is all 0s;
+ *                      [1] mask is decoded from bitstream; [2] mask is all 1s;
+ *                      [3] reserved for scalable AAC
+ */
+static void apply_intensity_stereo(AACContext *ac,
+                                   ChannelElement *cpe, int ms_present)
+{
+    SingleChannelElement *second = &cpe->ch[1];
+    const IndividualChannelStream *ics = &second->ics;
+    const uint16_t *swb = ics->swb_offset;
+    INTFLOAT *src = cpe->ch[0].coeffs;
+    INTFLOAT *dst = cpe->ch[1].coeffs;
+    INTFLOAT gain;
+    int grp, sfb, win, band = 0;
+    int sign;
+
+    for (grp = 0; grp < ics->num_window_groups; grp++) {
+        sfb = 0;
+        while (sfb < ics->max_sfb) {
+            const int run_end = second->band_type_run_end[band];
+
+            if (second->band_type[band] != INTENSITY_BT &&
+                second->band_type[band] != INTENSITY_BT2) {
+                /* Not an intensity run: step over it in one go. */
+                band += run_end - sfb;
+                sfb   = run_end;
+                continue;
+            }
+
+            for (; sfb < run_end; sfb++, band++) {
+                /* -1 or +1 depending on which intensity type is used
+                 * (presumably INTENSITY_BT2 is 14 -- confirm against the
+                 * enum); a set M/S mask flag flips the sign. */
+                sign = -1 + 2 * (second->band_type[band] - 14);
+                if (ms_present)
+                    sign *= 1 - 2 * cpe->ms_mask[band];
+                gain = sign * second->sf[band];
+                for (win = 0; win < ics->group_len[grp]; win++) {
+#if USE_FIXED
+                    ac->subband_scale(dst + win * 128 + swb[sfb],
+                                      src + win * 128 + swb[sfb],
+                                      gain,
+                                      23,
+                                      swb[sfb + 1] - swb[sfb]);
+#else
+                    ac->fdsp->vector_fmul_scalar(dst + win * 128 + swb[sfb],
+                                                 src + win * 128 + swb[sfb],
+                                                 gain,
+                                                 swb[sfb + 1] - swb[sfb]);
+#endif /* USE_FIXED */
+                }
+            }
+        }
+        src += ics->group_len[grp] * 128;
+        dst += ics->group_len[grp] * 128;
+    }
+}
+
+/**
+ * Decode a channel_pair_element; reference: table 4.4.
+ *
+ * With a common window (always the case for ER AAC ELD) the ICS info is
+ * read once and copied to the second channel, followed by the M/S mask.
+ * Both channel streams are then decoded and the joint stereo tools applied.
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int decode_cpe(AACContext *ac, GetBitContext *gb, ChannelElement *cpe)
+{
+    const int eld_syntax = ac->oc[1].m4ac.object_type == AOT_ER_AAC_ELD;
+    SingleChannelElement *first  = &cpe->ch[0];
+    SingleChannelElement *second = &cpe->ch[1];
+    int ms_present = 0;
+    int common_window, saved_kb_window, ret;
+
+    common_window = eld_syntax || get_bits1(gb);
+    if (common_window) {
+        if (decode_ics_info(ac, &first->ics, gb))
+            return AVERROR_INVALIDDATA;
+
+        /* The struct copy would overwrite the second channel's own
+         * use_kb_window history, so carry its [0] entry over into [1]. */
+        saved_kb_window = second->ics.use_kb_window[0];
+        second->ics = first->ics;
+        second->ics.use_kb_window[1] = saved_kb_window;
+
+        if (second->ics.predictor_present &&
+            (ac->oc[1].m4ac.object_type != AOT_AAC_MAIN)) {
+            second->ics.ltp.present = get_bits(gb, 1);
+            if (second->ics.ltp.present)
+                decode_ltp(&second->ics.ltp, gb, second->ics.max_sfb);
+        }
+
+        ms_present = get_bits(gb, 2);
+        if (ms_present == 3) {
+            av_log(ac->avctx, AV_LOG_ERROR, "ms_present = 3 is reserved.\n");
+            return AVERROR_INVALIDDATA;
+        }
+        if (ms_present)
+            decode_mid_side_stereo(cpe, gb, ms_present);
+    }
+
+    ret = decode_ics(ac, first, gb, common_window, 0);
+    if (ret)
+        return ret;
+    ret = decode_ics(ac, second, gb, common_window, 0);
+    if (ret)
+        return ret;
+
+    if (common_window) {
+        if (ms_present)
+            apply_mid_side_stereo(ac, cpe);
+        if (ac->oc[1].m4ac.object_type == AOT_AAC_MAIN) {
+            apply_prediction(ac, first);
+            apply_prediction(ac, second);
+        }
+    }
+
+    apply_intensity_stereo(ac, cpe, ms_present);
+    return 0;
+}
+
+static const float cce_scale[] = { /* gain step bases; decode_cce() indexes AAC_RENAME(cce_scale) with a 2-bit field */
+    1.09050773266525765921, //2^(1/8)
+    1.18920711500272106672, //2^(1/4)
+    M_SQRT2,                //2^(1/2)
+    2,                      //2^1
+};
+
+/**
+ * Decode coupling_channel_element; reference: table 4.8.
+ *
+ * Reads the list of coupled targets, the two coupling point bits, the gain
+ * sign flag and the gain scale, decodes the coupling channel's own ICS into
+ * che->ch[0], and then fills coup->gain[] with one gain list per target
+ * (two lists for a CPE target whose ch_select is 3).
+ *
+ * @param   ac   decoder context
+ * @param   gb   bitstream reader
+ * @param   che  channel element holding the coupling channel (ch[0]) and
+ *               its coupling parameters (coup)
+ *
+ * @return  Returns error status. 0 - OK, !0 - error
+ */
+static int decode_cce(AACContext *ac, GetBitContext *gb, ChannelElement *che)
+{
+    int num_gain = 0;
+    int c, g, sfb, ret;
+    int sign;
+    INTFLOAT scale;
+    SingleChannelElement *sce = &che->ch[0];
+    ChannelCoupling     *coup = &che->coup;
+
+    coup->coupling_point = 2 * get_bits1(gb); /* first flag becomes bit 1 of coupling_point */
+    coup->num_coupled = get_bits(gb, 3);
+    for (c = 0; c <= coup->num_coupled; c++) { /* num_coupled + 1 targets are listed */
+        num_gain++;
+        coup->type[c] = get_bits1(gb) ? TYPE_CPE : TYPE_SCE;
+        coup->id_select[c] = get_bits(gb, 4);
+        if (coup->type[c] == TYPE_CPE) {
+            coup->ch_select[c] = get_bits(gb, 2);
+            if (coup->ch_select[c] == 3)
+                num_gain++; /* ch_select 3 adds a second gain list for this target */
+        } else
+            coup->ch_select[c] = 2; /* no ch_select bits are read for SCE targets */
+    }
+    coup->coupling_point += get_bits1(gb) || (coup->coupling_point >> 1); /* bit 0: second flag, forced to 1 when bit 1 is set */
+
+    sign  = get_bits(gb, 1);
+    scale = AAC_RENAME(cce_scale)[get_bits(gb, 2)];
+
+    if ((ret = decode_ics(ac, sce, gb, 0, 0)))
+        return ret;
+
+    for (c = 0; c < num_gain; c++) {
+        int idx  = 0;
+        int cge  = 1;
+        int gain = 0;
+        INTFLOAT gain_cache = FIXR10(1.);
+        if (c) { /* the first gain list keeps the defaults above and reads no common gain */
+            cge = coup->coupling_point == AFTER_IMDCT ? 1 : get_bits1(gb);
+            gain = cge ? get_vlc2(gb, vlc_scalefactors.table, 7, 3) - 60: 0; /* VLC value is offset by 60 */
+            gain_cache = GET_GAIN(scale, gain);
+        }
+        if (coup->coupling_point == AFTER_IMDCT) {
+            coup->gain[c][0] = gain_cache; /* a single gain per list in this mode */
+        } else {
+            for (g = 0; g < sce->ics.num_window_groups; g++) {
+                for (sfb = 0; sfb < sce->ics.max_sfb; sfb++, idx++) {
+                    if (sce->band_type[idx] != ZERO_BT) { /* ZERO_BT bands get no gain entry written */
+                        if (!cge) {
+                            int t = get_vlc2(gb, vlc_scalefactors.table, 7, 3) - 60;
+                            if (t) { /* a zero delta reuses the cached gain */
+                                int s = 1;
+                                t = gain += t; /* accumulate the delta into the running gain */
+                                if (sign) {
+                                    s  -= 2 * (t & 0x1); /* LSB set -> s becomes -1 */
+                                    t >>= 1;             /* remaining bits feed GET_GAIN */
+                                }
+                                gain_cache = GET_GAIN(scale, t) * s;
+                            }
+                        }
+                        coup->gain[c][idx] = gain_cache;
+                    }
+                }
+            }
+        }
+    }
+    return 0;
+}
+
+/**
+ * Read the Dynamic Range Compression channel exclusion mask; reference: table 4.53.
+ *
+ * Exclusion flags are read in groups of seven. After each group one more bit
+ * tells whether another group follows; that bit is not read once the mask is
+ * within seven entries of MAX_CHANNELS.
+ *
+ * @return  Returns number of bytes consumed.
+ */
+static int decode_drc_channel_exclusions(DynamicRangeControl *che_drc,
+                                         GetBitContext *gb)
+{
+    int filled = 0;
+    int more;
+
+    do {
+        const int group_end = filled + 7;
+
+        while (filled < group_end)
+            che_drc->exclude_mask[filled++] = get_bits1(gb);
+        more = filled < MAX_CHANNELS - 7 && get_bits1(gb);
+    } while (more);
+
+    return filled / 7;
+}
+
+/**
+ * Decode dynamic range information; reference: table 4.52.
+ *
+ * Every optional section is announced by a one-bit presence flag. The
+ * per-band sign/control pairs at the end are always read: for a single band,
+ * or for 1 + band_incr bands when the bands section is present.
+ *
+ * @return  Returns number of bytes consumed.
+ */
+static int decode_dynamic_range(DynamicRangeControl *che_drc,
+                                GetBitContext *gb)
+{
+    int consumed  = 1;
+    int num_bands = 1;
+    int band;
+
+    /* pce_tag_present? */
+    if (get_bits1(gb)) {
+        che_drc->pce_instance_tag = get_bits(gb, 4);
+        skip_bits(gb, 4); // tag_reserved_bits
+        consumed += 1;
+    }
+
+    /* excluded_chns_present? */
+    if (get_bits1(gb))
+        consumed += decode_drc_channel_exclusions(che_drc, gb);
+
+    /* drc_bands_present? */
+    if (get_bits1(gb)) {
+        che_drc->band_incr            = get_bits(gb, 4);
+        che_drc->interpolation_scheme = get_bits(gb, 4);
+        consumed  += 1;
+        num_bands += che_drc->band_incr;
+        band = 0;
+        while (band < num_bands) {
+            che_drc->band_top[band] = get_bits(gb, 8);
+            band++;
+            consumed++;
+        }
+    }
+
+    /* prog_ref_level_present? */
+    if (get_bits1(gb)) {
+        che_drc->prog_ref_level = get_bits(gb, 7);
+        skip_bits1(gb); // prog_ref_level_reserved_bits
+        consumed += 1;
+    }
+
+    /* one sign bit plus seven control bits per band */
+    band = 0;
+    while (band < num_bands) {
+        che_drc->dyn_rng_sgn[band] = get_bits1(gb);
+        che_drc->dyn_rng_ctl[band] = get_bits(gb, 7);
+        band++;
+        consumed++;
+    }
+
+    return consumed;
+}
+
+/**
+ * Parse a fill extension payload and look for an encoder identification string.
+ *
+ * Payloads shorter than 13 + 7*8 bits are skipped without being inspected.
+ * Otherwise the first 13 bits are discarded and up to 255 following whole
+ * bytes are copied into a NUL-terminated buffer. If that text matches
+ * "libfaac <major>.<minor>", skip_samples is set to 1024. Whatever is left
+ * of the payload is skipped.
+ *
+ * @param   ac   decoder context
+ * @param   gb   bitstream reader
+ * @param   len  payload length in bits
+ *
+ * @return  always 0
+ */
+static int decode_fill(AACContext *ac, GetBitContext *gb, int len) {
+    uint8_t buf[256];
+    int i, major, minor;
+
+    if (len < 13+7*8)
+        goto unknown;
+
+    skip_bits(gb, 13); len -= 13;
+
+    /* leave room for the terminating NUL */
+    for(i=0; i+1<sizeof(buf) && len>=8; i++, len-=8)
+        buf[i] = get_bits(gb, 8);
+
+    buf[i] = 0;
+    /* buf holds raw bytes; the text functions below expect char pointers */
+    if (ac->avctx->debug & FF_DEBUG_PICT_INFO)
+        av_log(ac->avctx, AV_LOG_DEBUG, "FILL:%s\n", (const char *)buf);
+
+    /* major/minor are only parsed to validate the pattern */
+    if (sscanf((const char *)buf, "libfaac %d.%d", &major, &minor) == 2){
+        ac->avctx->internal->skip_samples = 1024;
+    }
+
+unknown:
+    skip_bits_long(gb, len);
+
+    return 0;
+}
+
+/**
+ * Decode extension data (incomplete); reference: table 4.51.
+ *
+ * A 4-bit extension type is read first. SBR payloads (with or without CRC)
+ * are handed to the SBR decoder after the output configuration has been
+ * checked and, if needed, updated; dynamic range and fill payloads have their
+ * own parsers; every other type is skipped.
+ *
+ * @param   ac         decoder context
+ * @param   gb         bitstream reader
+ * @param   cnt        length of TYPE_FIL syntactic element in bytes
+ * @param   che        channel element the SBR data is attached to; may be
+ *                     NULL, in which case SBR payloads are rejected
+ * @param   elem_type  element type passed on to the SBR decoder
+ *
+ * @return Returns number of bytes consumed
+ */
+static int decode_extension_payload(AACContext *ac, GetBitContext *gb, int cnt,
+                                    ChannelElement *che, enum RawDataBlockType elem_type)
+{
+    int crc_flag = 0;
+    int res = cnt; /* default result unless a sub-parser reports its own count */
+    int type = get_bits(gb, 4);
+
+    if (ac->avctx->debug & FF_DEBUG_STARTCODE)
+        av_log(ac->avctx, AV_LOG_DEBUG, "extension type: %d len:%d\n", type, cnt);
+
+    switch (type) { // extension type
+    case EXT_SBR_DATA_CRC:
+        crc_flag++; /* fall through: the CRC variant shares the SBR handling below */
+    case EXT_SBR_DATA:
+        if (!che) {
+            av_log(ac->avctx, AV_LOG_ERROR, "SBR was found before the first channel element.\n");
+            return res; /* note: returns cnt without skipping the payload bits */
+        } else if (!ac->oc[1].m4ac.sbr) {
+            av_log(ac->avctx, AV_LOG_ERROR, "SBR signaled to be not-present but was found in the bitstream.\n");
+            skip_bits_long(gb, 8 * cnt - 4); /* the 4 type bits were already consumed */
+            return res;
+        } else if (ac->oc[1].m4ac.sbr == -1 && ac->oc[1].status == OC_LOCKED) {
+            av_log(ac->avctx, AV_LOG_ERROR, "Implicit SBR was found with a first occurrence after the first frame.\n");
+            skip_bits_long(gb, 8 * cnt - 4);
+            return res;
+        } else if (ac->oc[1].m4ac.ps == -1 && ac->oc[1].status < OC_LOCKED && ac->avctx->channels == 1) {
+            ac->oc[1].m4ac.sbr = 1;
+            ac->oc[1].m4ac.ps = 1;
+            ac->avctx->profile = FF_PROFILE_AAC_HE_V2;
+            output_configure(ac, ac->oc[1].layout_map, ac->oc[1].layout_map_tags,
+                             ac->oc[1].status, 1); /* return value is not checked here */
+        } else {
+            ac->oc[1].m4ac.sbr = 1;
+            ac->avctx->profile = FF_PROFILE_AAC_HE;
+        }
+        res = AAC_RENAME(ff_decode_sbr_extension)(ac, &che->sbr, gb, crc_flag, cnt, elem_type);
+        break;
+    case EXT_DYNAMIC_RANGE:
+        res = decode_dynamic_range(&ac->che_drc, gb);
+        break;
+    case EXT_FILL:
+        decode_fill(ac, gb, 8 * cnt - 4); /* consumes the rest of the payload; res stays cnt */
+        break;
+    case EXT_FILL_DATA:
+    case EXT_DATA_ELEMENT:
+    default:
+        skip_bits_long(gb, 8 * cnt - 4);
+        break;
+    };
+    return res;
+}
+
+/**
+ * Decode Temporal Noise Shaping filter coefficients and apply all-pole filters; reference: 4.6.9.3.
+ *
+ * For every window the filters are walked from the top scalefactor band
+ * downwards; each filter covers tns->length[w][filt] bands directly below
+ * the previous one. Filters of order 0, and filters whose band range is
+ * empty after clamping to min(tns_max_bands, max_sfb), are skipped. The
+ * coefficients are filtered in place: with an autoregressive filter when
+ * decode is set, with a moving-average filter otherwise.
+ *
+ * @param   coef    spectral coefficients
+ * @param   tns     TNS side information (filter count, length, order,
+ *                  direction and coefficients per window)
+ * @param   ics     window and scalefactor band layout of the channel
+ * @param   decode  1 if tool is used normally, 0 if tool is used in LTP.
+ */
+static void apply_tns(INTFLOAT coef[1024], TemporalNoiseShaping *tns,
+                      IndividualChannelStream *ics, int decode)
+{
+    const int mmm = FFMIN(ics->tns_max_bands, ics->max_sfb); /* highest band index a filter may touch */
+    int w, filt, m, i;
+    int bottom, top, order, start, end, size, inc;
+    INTFLOAT lpc[TNS_MAX_ORDER];
+    INTFLOAT tmp[TNS_MAX_ORDER+1];
+
+    for (w = 0; w < ics->num_windows; w++) {
+        bottom = ics->num_swb; /* the first filter starts at the top band */
+        for (filt = 0; filt < tns->n_filt[w]; filt++) {
+            top    = bottom;
+            bottom = FFMAX(0, top - tns->length[w][filt]);
+            order  = tns->order[w][filt];
+            if (order == 0)
+                continue;
+
+            // tns_decode_coef
+            AAC_RENAME(compute_lpc_coefs)(tns->coef[w][filt], order, lpc, 0, 0, 0);
+
+            start = ics->swb_offset[FFMIN(bottom, mmm)];
+            end   = ics->swb_offset[FFMIN(   top, mmm)];
+            if ((size = end - start) <= 0)
+                continue;
+            if (tns->direction[w][filt]) {
+                inc = -1;        /* filter runs from the top coefficient downwards */
+                start = end - 1;
+            } else {
+                inc = 1;
+            }
+            start += w * 128; /* window w starts at coefficient w * 128 */
+
+            if (decode) {
+                // ar filter
+                for (m = 0; m < size; m++, start += inc)
+                    for (i = 1; i <= FFMIN(m, order); i++) /* only taps with already-filtered history */
+                        coef[start] -= AAC_MUL26(coef[start - i * inc], lpc[i - 1]);
+            } else {
+                // ma filter
+                for (m = 0; m < size; m++, start += inc) {
+                    tmp[0] = coef[start]; /* keep the unfiltered input for later taps */
+                    for (i = 1; i <= FFMIN(m, order); i++)
+                        coef[start] += AAC_MUL26(tmp[i], lpc[i - 1]);
+                    for (i = order; i > 0; i--) /* shift the input history by one */
+                        tmp[i] = tmp[i - 1];
+                }
+            }
+        }
+    }
+}
+
+/**
+ *  Window the LTP-predicted time signal and run the forward MDCT on it to
+ *  obtain the predicted spectral coefficients.
+ *
+ *  The first 1024 input samples are shaped with the window selected by
+ *  use_kb_window[1], the second 1024 with the one selected by
+ *  use_kb_window[0]. LONG_STOP_SEQUENCE / LONG_START_SEQUENCE replace the
+ *  long window on their side by a 128-sample short window with zeroed flanks.
+ *  The input buffer is modified in place.
+ */
+static void windowing_and_mdct_ltp(AACContext *ac, INTFLOAT *out,
+                                   INTFLOAT *in, IndividualChannelStream *ics)
+{
+    INTFLOAT *const first  = in;
+    INTFLOAT *const second = in + 1024;
+
+    /* first half: window of the previous frame */
+    if (ics->window_sequence[0] == LONG_STOP_SEQUENCE) {
+        const INTFLOAT *short_prev = ics->use_kb_window[1] ? AAC_RENAME(ff_aac_kbd_short_128)
+                                                           : AAC_RENAME(ff_sine_128);
+
+        memset(first, 0, 448 * sizeof(*first));
+        ac->fdsp->vector_fmul(first + 448, first + 448, short_prev, 128);
+    } else {
+        const INTFLOAT *long_prev = ics->use_kb_window[1] ? AAC_RENAME(ff_aac_kbd_long_1024)
+                                                          : AAC_RENAME(ff_sine_1024);
+
+        ac->fdsp->vector_fmul(first, first, long_prev, 1024);
+    }
+
+    /* second half: window of the current frame, applied reversed */
+    if (ics->window_sequence[0] == LONG_START_SEQUENCE) {
+        const INTFLOAT *short_cur = ics->use_kb_window[0] ? AAC_RENAME(ff_aac_kbd_short_128)
+                                                          : AAC_RENAME(ff_sine_128);
+
+        ac->fdsp->vector_fmul_reverse(second + 448, second + 448, short_cur, 128);
+        memset(second + 576, 0, 448 * sizeof(*second));
+    } else {
+        const INTFLOAT *long_cur = ics->use_kb_window[0] ? AAC_RENAME(ff_aac_kbd_long_1024)
+                                                         : AAC_RENAME(ff_sine_1024);
+
+        ac->fdsp->vector_fmul_reverse(second, second, long_cur, 1024);
+    }
+
+    ac->mdct_ltp.mdct_calc(&ac->mdct_ltp, out, in);
+}
+
+/**
+ * Apply the long term prediction
+ *
+ * Does nothing for EIGHT_SHORT_SEQUENCE. Otherwise a 2048-sample predicted
+ * time signal is built in sce->ret from sce->ltp_state (delayed by ltp->lag
+ * and scaled by ltp->coef), windowed and transformed into ac->buf_mdct,
+ * optionally passed through TNS in its LTP mode, and added to sce->coeffs in
+ * every scalefactor band flagged in ltp->used.
+ *
+ * @param   ac   decoder context (provides buf_mdct and the DSP callbacks)
+ * @param   sce  channel whose coefficients receive the prediction
+ */
+static void apply_ltp(AACContext *ac, SingleChannelElement *sce)
+{
+    const LongTermPrediction *ltp = &sce->ics.ltp;
+    const uint16_t *offsets = sce->ics.swb_offset;
+    int i, sfb;
+
+    if (sce->ics.window_sequence[0] != EIGHT_SHORT_SEQUENCE) {
+        INTFLOAT *predTime = sce->ret;     /* output buffer reused as scratch here */
+        INTFLOAT *predFreq = ac->buf_mdct;
+        int16_t num_samples = 2048;
+
+        if (ltp->lag < 1024)
+            num_samples = ltp->lag + 1024; /* fewer samples are taken from the state for short lags */
+        for (i = 0; i < num_samples; i++)
+            predTime[i] = AAC_MUL30(sce->ltp_state[i + 2048 - ltp->lag], ltp->coef);
+        memset(&predTime[i], 0, (2048 - i) * sizeof(*predTime)); /* zero-fill the remainder */
+
+        ac->windowing_and_mdct_ltp(ac, predFreq, predTime, &sce->ics);
+
+        if (sce->tns.present)
+            ac->apply_tns(predFreq, &sce->tns, &sce->ics, 0); /* decode = 0: TNS in its LTP mode */
+
+        for (sfb = 0; sfb < FFMIN(sce->ics.max_sfb, MAX_LTP_LONG_SFB); sfb++)
+            if (ltp->used[sfb])
+                for (i = offsets[sfb]; i < offsets[sfb + 1]; i++)
+                    sce->coeffs[i] += predFreq[i];
+    }
+}
+
+/**
+ * Update the LTP buffer for next frame
+ *
+ * Builds a 1024-sample windowed tail in sce->coeffs (used as scratch here)
+ * from ac->buf_mdct and, for EIGHT_SHORT_SEQUENCE, sce->saved. Then
+ * sce->ltp_state is shifted: its middle third moves to the front, sce->ret
+ * becomes the middle third and the rebuilt tail becomes the last third.
+ *
+ * @param   ac   decoder context (provides buf_mdct and the DSP callbacks)
+ * @param   sce  channel whose LTP state is updated
+ */
+static void update_ltp(AACContext *ac, SingleChannelElement *sce)
+{
+    IndividualChannelStream *ics = &sce->ics;
+    INTFLOAT *saved     = sce->saved;
+    INTFLOAT *saved_ltp = sce->coeffs; /* coefficient buffer is overwritten from here on */
+    const INTFLOAT *lwindow = ics->use_kb_window[0] ? AAC_RENAME(ff_aac_kbd_long_1024) : AAC_RENAME(ff_sine_1024);
+    const INTFLOAT *swindow = ics->use_kb_window[0] ? AAC_RENAME(ff_aac_kbd_short_128) : AAC_RENAME(ff_sine_128);
+    int i;
+
+    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        memcpy(saved_ltp,       saved, 512 * sizeof(*saved_ltp));
+        memset(saved_ltp + 576, 0,     448 * sizeof(*saved_ltp));
+        ac->fdsp->vector_fmul_reverse(saved_ltp + 448, ac->buf_mdct + 960,     &swindow[64],      64); /* overwrites [448, 512) of the copy above */
+
+        for (i = 0; i < 64; i++)
+            saved_ltp[i + 512] = AAC_MUL31(ac->buf_mdct[1023 - i], swindow[63 - i]);
+    } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) { /* same as above except for the source of the first 448 samples */
+        memcpy(saved_ltp,       ac->buf_mdct + 512, 448 * sizeof(*saved_ltp));
+        memset(saved_ltp + 576, 0,                  448 * sizeof(*saved_ltp));
+        ac->fdsp->vector_fmul_reverse(saved_ltp + 448, ac->buf_mdct + 960,     &swindow[64],      64);
+
+        for (i = 0; i < 64; i++)
+            saved_ltp[i + 512] = AAC_MUL31(ac->buf_mdct[1023 - i], swindow[63 - i]);
+    } else { // LONG_STOP or ONLY_LONG
+        ac->fdsp->vector_fmul_reverse(saved_ltp,       ac->buf_mdct + 512,     &lwindow[512],     512);
+
+        for (i = 0; i < 512; i++)
+            saved_ltp[i + 512] = AAC_MUL31(ac->buf_mdct[1023 - i], lwindow[511 - i]);
+    }
+
+    memcpy(sce->ltp_state,      sce->ltp_state+1024, 1024 * sizeof(*sce->ltp_state)); /* middle third -> front */
+    memcpy(sce->ltp_state+1024, sce->ret,            1024 * sizeof(*sce->ltp_state)); /* current output -> middle third */
+    memcpy(sce->ltp_state+2048, saved_ltp,           1024 * sizeof(*sce->ltp_state)); /* rebuilt tail -> last third */
+}
+
+/**
+ * Conduct IMDCT and windowing.
+ *
+ * Transforms sce->coeffs into ac->buf_mdct (eight half IMDCTs on 128-sample
+ * strides for EIGHT_SHORT_SEQUENCE, one long half IMDCT otherwise),
+ * overlap-adds the result with sce->saved into sce->ret, and finally stores
+ * in sce->saved the tail that the next frame will overlap with.
+ *
+ * @param   ac   decoder context (provides buf_mdct, temp and the transforms)
+ * @param   sce  channel to transform
+ */
+static void imdct_and_windowing(AACContext *ac, SingleChannelElement *sce)
+{
+    IndividualChannelStream *ics = &sce->ics;
+    INTFLOAT *in    = sce->coeffs;
+    INTFLOAT *out   = sce->ret;
+    INTFLOAT *saved = sce->saved;
+    const INTFLOAT *swindow      = ics->use_kb_window[0] ? AAC_RENAME(ff_aac_kbd_short_128) : AAC_RENAME(ff_sine_128);
+    const INTFLOAT *lwindow_prev = ics->use_kb_window[1] ? AAC_RENAME(ff_aac_kbd_long_1024) : AAC_RENAME(ff_sine_1024);
+    const INTFLOAT *swindow_prev = ics->use_kb_window[1] ? AAC_RENAME(ff_aac_kbd_short_128) : AAC_RENAME(ff_sine_128);
+    INTFLOAT *buf  = ac->buf_mdct;
+    INTFLOAT *temp = ac->temp;
+    int i;
+
+    // imdct
+    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        for (i = 0; i < 1024; i += 128)
+            ac->mdct_small.imdct_half(&ac->mdct_small, buf + i, in + i);
+    } else {
+        ac->mdct.imdct_half(&ac->mdct, buf, in);
+#if USE_FIXED
+        for (i=0; i<1024; i++)
+          buf[i] = (buf[i] + 4) >> 3; /* rounded right shift by 3 */
+#endif /* USE_FIXED */
+    }
+
+    /* window overlapping
+     * NOTE: To simplify the overlapping code, all 'meaningless' short to long
+     * and long to short transitions are considered to be short to short
+     * transitions. This leaves just two cases (long to long and short to short)
+     * with a little special sauce for EIGHT_SHORT_SEQUENCE.
+     */
+    if ((ics->window_sequence[1] == ONLY_LONG_SEQUENCE || ics->window_sequence[1] == LONG_STOP_SEQUENCE) &&
+            (ics->window_sequence[0] == ONLY_LONG_SEQUENCE || ics->window_sequence[0] == LONG_START_SEQUENCE)) {
+        ac->fdsp->vector_fmul_window(    out,               saved,            buf,         lwindow_prev, 512);
+    } else {
+        memcpy(                         out,               saved,            448 * sizeof(*out)); /* first 448 samples pass through unwindowed */
+
+        if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+            ac->fdsp->vector_fmul_window(out + 448 + 0*128, saved + 448,      buf + 0*128, swindow_prev, 64);
+            ac->fdsp->vector_fmul_window(out + 448 + 1*128, buf + 0*128 + 64, buf + 1*128, swindow,      64);
+            ac->fdsp->vector_fmul_window(out + 448 + 2*128, buf + 1*128 + 64, buf + 2*128, swindow,      64);
+            ac->fdsp->vector_fmul_window(out + 448 + 3*128, buf + 2*128 + 64, buf + 3*128, swindow,      64);
+            ac->fdsp->vector_fmul_window(temp,              buf + 3*128 + 64, buf + 4*128, swindow,      64); /* this block straddles the frame end, so it goes through temp */
+            memcpy(                     out + 448 + 4*128, temp, 64 * sizeof(*out)); /* first half of temp completes this frame */
+        } else {
+            ac->fdsp->vector_fmul_window(out + 448,         saved + 448,      buf,         swindow_prev, 64);
+            memcpy(                     out + 576,         buf + 64,         448 * sizeof(*out));
+        }
+    }
+
+    // buffer update
+    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        memcpy(                     saved,       temp + 64,         64 * sizeof(*saved)); /* second half of temp, written in the overlap step above */
+        ac->fdsp->vector_fmul_window(saved + 64,  buf + 4*128 + 64, buf + 5*128, swindow, 64);
+        ac->fdsp->vector_fmul_window(saved + 192, buf + 5*128 + 64, buf + 6*128, swindow, 64);
+        ac->fdsp->vector_fmul_window(saved + 320, buf + 6*128 + 64, buf + 7*128, swindow, 64);
+        memcpy(                     saved + 448, buf + 7*128 + 64,  64 * sizeof(*saved));
+    } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) {
+        memcpy(                     saved,       buf + 512,        448 * sizeof(*saved));
+        memcpy(                     saved + 448, buf + 7*128 + 64,  64 * sizeof(*saved));
+    } else { // LONG_STOP or ONLY_LONG
+        memcpy(                     saved,       buf + 512,        512 * sizeof(*saved));
+    }
+}
+
+/**
+ * Conduct IMDCT and windowing for the AAC-LD object type.
+ *
+ * Runs the half IMDCT with the mdct_ld context from sce->coeffs into
+ * ac->buf_mdct, overlaps the result with sce->saved into sce->ret and keeps
+ * 256 samples of the transform output in sce->saved for the next frame.
+ */
+static void imdct_and_windowing_ld(AACContext *ac, SingleChannelElement *sce)
+{
+    INTFLOAT *const spectrum = sce->coeffs;
+    INTFLOAT *const pcm      = sce->ret;
+    INTFLOAT *const overlap  = sce->saved;
+    INTFLOAT *const work     = ac->buf_mdct;
+#if USE_FIXED
+    int k;
+#endif /* USE_FIXED */
+
+    // imdct
+    ac->mdct.imdct_half(&ac->mdct_ld, work, spectrum);
+
+#if USE_FIXED
+    /* rounded right shift by 2 */
+    for (k = 0; k < 1024; k++)
+        work[k] = (work[k] + 2) >> 2;
+#endif /* USE_FIXED */
+
+    // window overlapping
+    if (!sce->ics.use_kb_window[1]) {
+        ac->fdsp->vector_fmul_window(pcm, overlap, work, AAC_RENAME(ff_sine_512), 256);
+    } else {
+        // AAC LD uses a low overlap sine window instead of a KBD window
+        memcpy(pcm, overlap, 192 * sizeof(*pcm));
+        ac->fdsp->vector_fmul_window(pcm + 192, overlap + 192, work, AAC_RENAME(ff_sine_128), 64);
+        memcpy(pcm + 320, work + 64, 192 * sizeof(*pcm));
+    }
+
+    // buffer update
+    memcpy(overlap, work + 256, 256 * sizeof(*overlap));
+}
+
+/**
+ * Conduct IMDCT and windowing for the AAC-ELD object type.
+ *
+ * The frame length is 480 or 512 depending on frame_length_short. The
+ * coefficients in sce->coeffs are permuted in place, transformed into
+ * ac->buf_mdct, combined with three frames of history from sce->saved under
+ * the ELD window into sce->ret, and the history is then shifted by one frame.
+ */
+static void imdct_and_windowing_eld(AACContext *ac, SingleChannelElement *sce)
+{
+    INTFLOAT *const spec = sce->coeffs;
+    INTFLOAT *const pcm  = sce->ret;
+    INTFLOAT *const hist = sce->saved;
+    INTFLOAT *const work = ac->buf_mdct;
+    const int len     = ac->oc[1].m4ac.frame_length_short ? 480 : 512;
+    const int half    = len >> 1;
+    const int quarter = len >> 2;
+    const INTFLOAT *const window = len == 480 ? AAC_RENAME(ff_aac_eld_window_480)
+                                              : AAC_RENAME(ff_aac_eld_window_512);
+    int k;
+
+    // Inverse transform, mapped to the conventional IMDCT by
+    // Chivukula, R.K.; Reznik, Y.A.; Devarajan, V.,
+    // "Efficient algorithms for MPEG-4 AAC-ELD, AAC-LD and AAC-LC filterbanks,"
+    // International Conference on Audio, Language and Image Processing, ICALIP 2008.
+    // URL: http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4590245&isnumber=4589950
+    for (k = 0; k < half; k += 2) {
+        const INTFLOAT low_even = spec[k];
+        const INTFLOAT low_odd  = spec[k + 1];
+
+        spec[k]           = -spec[len - 1 - k];
+        spec[len - 1 - k] =  low_even;
+        spec[k + 1]       =  spec[len - 2 - k];
+        spec[len - 2 - k] = -low_odd;
+    }
+
+#if USE_FIXED
+    ac->mdct.imdct_half(&ac->mdct_ld, work, spec);
+    /* rounded right shift by 1 */
+    for (k = 0; k < 1024; k++)
+        work[k] = (work[k] + 1) >> 1;
+#else
+    if (len == 480)
+        ac->mdct480->imdct_half(ac->mdct480, work, spec, 1, -1.f/(16*1024*960));
+    else
+        ac->mdct.imdct_half(&ac->mdct_ld, work, spec);
+#endif /* USE_FIXED */
+
+    for (k = 0; k < len; k += 2)
+        work[k] = -work[k];
+    // Like with the regular IMDCT at this point we still have the middle half
+    // of a transform but with even symmetry on the left and odd symmetry on
+    // the right
+
+    // window overlapping
+    // The spec says to use samples [0..511] but the reference decoder uses
+    // samples [128..639].
+    for (k = quarter; k < half; k++) {
+        pcm[k - quarter] = AAC_MUL31(  work[      half - 1 - k], window[k           - quarter]) +
+                           AAC_MUL31(  hist[            k + half], window[k +   len - quarter]) +
+                           AAC_MUL31( -hist[len + half - 1 - k], window[k + 2*len - quarter]) +
+                           AAC_MUL31( -hist[  2*len + half + k], window[k + 3*len - quarter]);
+    }
+    for (k = 0; k < half; k++) {
+        pcm[quarter + k] = AAC_MUL31(  work[                  k], window[k + half         - quarter]) +
+                           AAC_MUL31( -hist[        len - 1 - k], window[k + half +   len - quarter]) +
+                           AAC_MUL31( -hist[            len + k], window[k + half + 2*len - quarter]) +
+                           AAC_MUL31(  hist[2*len + len - 1 - k], window[k + half + 3*len - quarter]);
+    }
+    for (k = 0; k < quarter; k++) {
+        pcm[half + quarter + k] = AAC_MUL31(  work[      k + half], window[k +   len - quarter]) +
+                                  AAC_MUL31( -hist[  half - 1 - k], window[k + 2*len - quarter]) +
+                                  AAC_MUL31( -hist[len + half + k], window[k + 3*len - quarter]);
+    }
+
+    // buffer update
+    memmove(hist + len, hist, 2 * len * sizeof(*hist));
+    memcpy( hist,       work,     len * sizeof(*hist));
+}
+
+/**
+ * channel coupling transformation interface
+ *
+ * Visits every allocated coupling channel element whose coupling point
+ * matches and, for each of its targets that names (type, elem_id), invokes
+ * the coupling function on the selected channel(s) of cc. A running gain
+ * list index is kept across all targets of a coupling element so each call
+ * receives the list that belongs to it.
+ *
+ * @param   apply_coupling_method   pointer to (in)dependent coupling function
+ */
+static void apply_channel_coupling(AACContext *ac, ChannelElement *cc,
+                                   enum RawDataBlockType type, int elem_id,
+                                   enum CouplingPoint coupling_point,
+                                   void (*apply_coupling_method)(AACContext *ac, SingleChannelElement *target, ChannelElement *cce, int index))
+{
+    int id;
+
+    for (id = 0; id < MAX_ELEM_ID; id++) {
+        ChannelElement *cce = ac->che[TYPE_CCE][id];
+        ChannelCoupling *coup;
+        int gain_idx = 0;
+        int t;
+
+        if (!cce || cce->coup.coupling_point != coupling_point)
+            continue;
+        coup = &cce->coup;
+
+        for (t = 0; t <= coup->num_coupled; t++) {
+            if (coup->type[t] != type || coup->id_select[t] != elem_id) {
+                /* not our element: step over its one or two gain lists */
+                gain_idx += coup->ch_select[t] == 3 ? 2 : 1;
+                continue;
+            }
+            if (coup->ch_select[t] != 1) {
+                apply_coupling_method(ac, &cc->ch[0], cce, gain_idx);
+                /* with ch_select 0 the second channel reuses the same list */
+                if (coup->ch_select[t] != 0)
+                    gain_idx++;
+            }
+            if (coup->ch_select[t] != 2) {
+                apply_coupling_method(ac, &cc->ch[1], cce, gain_idx);
+                gain_idx++;
+            }
+        }
+    }
+}
+
+/**
+ * Convert spectral data to samples, applying all supported tools as appropriate.
+ *
+ * Element types are visited from 3 down to 0. For every present element the
+ * tools run in this order: dependent coupling before TNS, LTP (AOT_AAC_LTP
+ * only), TNS, dependent coupling between TNS and IMDCT, IMDCT/windowing
+ * (plus LTP state update and SBR where enabled), and independent coupling
+ * after the IMDCT. Each processed element has its present flag cleared;
+ * allocated but absent elements are only logged.
+ *
+ * @param   ac       decoder context
+ * @param   samples  number of output samples per channel; only used by the
+ *                   fixed-point build to rescale the output
+ */
+static void spectral_to_sample(AACContext *ac, int samples)
+{
+    int i, type;
+    void (*imdct_and_window)(AACContext *ac, SingleChannelElement *sce);
+    switch (ac->oc[1].m4ac.object_type) {
+    case AOT_ER_AAC_LD:
+        imdct_and_window = imdct_and_windowing_ld;
+        break;
+    case AOT_ER_AAC_ELD:
+        imdct_and_window = imdct_and_windowing_eld;
+        break;
+    default:
+        imdct_and_window = ac->imdct_and_windowing;
+    }
+    for (type = 3; type >= 0; type--) {
+        for (i = 0; i < MAX_ELEM_ID; i++) {
+            ChannelElement *che = ac->che[type][i];
+            if (che && che->present) {
+                if (type <= TYPE_CPE)
+                    apply_channel_coupling(ac, che, type, i, BEFORE_TNS, AAC_RENAME(apply_dependent_coupling));
+                if (ac->oc[1].m4ac.object_type == AOT_AAC_LTP) {
+                    if (che->ch[0].ics.predictor_present) {
+                        if (che->ch[0].ics.ltp.present)
+                            ac->apply_ltp(ac, &che->ch[0]);
+                        if (che->ch[1].ics.ltp.present && type == TYPE_CPE)
+                            ac->apply_ltp(ac, &che->ch[1]);
+                    }
+                }
+                if (che->ch[0].tns.present)
+                    ac->apply_tns(che->ch[0].coeffs, &che->ch[0].tns, &che->ch[0].ics, 1);
+                if (che->ch[1].tns.present)
+                    ac->apply_tns(che->ch[1].coeffs, &che->ch[1].tns, &che->ch[1].ics, 1);
+                if (type <= TYPE_CPE)
+                    apply_channel_coupling(ac, che, type, i, BETWEEN_TNS_AND_IMDCT, AAC_RENAME(apply_dependent_coupling));
+                /* coupling channels are only transformed when they couple after the IMDCT */
+                if (type != TYPE_CCE || che->coup.coupling_point == AFTER_IMDCT) {
+                    imdct_and_window(ac, &che->ch[0]);
+                    if (ac->oc[1].m4ac.object_type == AOT_AAC_LTP)
+                        ac->update_ltp(ac, &che->ch[0]);
+                    if (type == TYPE_CPE) {
+                        imdct_and_window(ac, &che->ch[1]);
+                        if (ac->oc[1].m4ac.object_type == AOT_AAC_LTP)
+                            ac->update_ltp(ac, &che->ch[1]);
+                    }
+                    if (ac->oc[1].m4ac.sbr > 0) {
+                        AAC_RENAME(ff_sbr_apply)(ac, &che->sbr, type, che->ch[0].ret, che->ch[1].ret);
+                    }
+                }
+                if (type <= TYPE_CCE)
+                    apply_channel_coupling(ac, che, type, i, AFTER_IMDCT, AAC_RENAME(apply_independent_coupling));
+
+#if USE_FIXED
+                {
+                    int j;
+                    /* preparation for resampler
+                     * The scaled value is clipped to INT32_MAX - 0x8000 so that
+                     * adding the 0x8000 rounding offset cannot overflow a signed
+                     * 32-bit integer; scaling multiplies by 128 instead of
+                     * left-shifting a possibly negative value. */
+                    for(j = 0; j<samples; j++){
+                        che->ch[0].ret[j] = (int32_t)av_clip64((int64_t)che->ch[0].ret[j]*128, INT32_MIN, INT32_MAX-0x8000)+0x8000;
+                        if(type == TYPE_CPE)
+                            che->ch[1].ret[j] = (int32_t)av_clip64((int64_t)che->ch[1].ret[j]*128, INT32_MIN, INT32_MAX-0x8000)+0x8000;
+                    }
+                }
+#endif /* USE_FIXED */
+                che->present = 0;
+            } else if (che) {
+                av_log(ac->avctx, AV_LOG_VERBOSE, "ChannelElement %d.%d missing \n", type, i);
+            }
+        }
+    }
+}
+
+/**
+ * Parse an ADTS frame header and update the trial output configuration.
+ *
+ * When the header parses to a positive size, the current output
+ * configuration is pushed, the channel layout is (re)configured from the
+ * header's channel configuration (or set to dual mono when the configuration
+ * is 0, dmono_mode is set and no configuration exists yet), the stream
+ * parameters in oc[1] are refreshed and the CRC word, if present, is skipped.
+ *
+ * @return  the value returned by avpriv_aac_parse_header(), passed through
+ *          unchanged when it is not positive; a negative value when the
+ *          output configuration fails
+ */
+static int parse_adts_frame_header(AACContext *ac, GetBitContext *gb)
+{
+    AACADTSHeaderInfo hdr;
+    uint8_t layout_map[MAX_ELEM_ID*4][3];
+    int layout_map_tags, ret;
+    const int size = avpriv_aac_parse_header(gb, &hdr);
+
+    if (size <= 0)
+        return size;
+
+    if (hdr.num_aac_frames != 1 && !ac->warned_num_aac_frames) {
+        // This is 2 for "VLB " audio in NSV files.
+        // See samples/nsv/vlb_audio.
+        avpriv_report_missing_feature(ac->avctx,
+                                      "More than one AAC RDB per ADTS frame");
+        ac->warned_num_aac_frames = 1;
+    }
+
+    push_output_configuration(ac);
+
+    if (!hdr.chan_config) {
+        ac->oc[1].m4ac.chan_config = 0;
+        /**
+         * dual mono frames in Japanese DTV can have chan_config 0
+         * WITHOUT specifying PCE.
+         *  thus, set dual mono as default.
+         */
+        if (ac->dmono_mode && ac->oc[0].status == OC_NONE) {
+            layout_map_tags  = 2;
+            layout_map[0][0] = TYPE_SCE;
+            layout_map[0][1] = 0;
+            layout_map[0][2] = AAC_CHANNEL_FRONT;
+            layout_map[1][0] = TYPE_SCE;
+            layout_map[1][1] = 1;
+            layout_map[1][2] = AAC_CHANNEL_FRONT;
+            if (output_configure(ac, layout_map, layout_map_tags,
+                                 OC_TRIAL_FRAME, 0))
+                return -7;
+        }
+    } else {
+        ac->oc[1].m4ac.chan_config = hdr.chan_config;
+        ret = set_default_channel_config(ac->avctx, layout_map,
+                                         &layout_map_tags, hdr.chan_config);
+        if (ret < 0)
+            return ret;
+        ret = output_configure(ac, layout_map, layout_map_tags,
+                               FFMAX(ac->oc[1].status, OC_TRIAL_FRAME), 0);
+        if (ret < 0)
+            return ret;
+    }
+
+    ac->oc[1].m4ac.sample_rate        = hdr.sample_rate;
+    ac->oc[1].m4ac.sampling_index     = hdr.sampling_index;
+    ac->oc[1].m4ac.object_type        = hdr.object_type;
+    ac->oc[1].m4ac.frame_length_short = 0;
+
+    /* forget the SBR/PS state unless a locked configuration still matches */
+    if (ac->oc[0].status != OC_LOCKED ||
+        ac->oc[0].m4ac.chan_config != hdr.chan_config ||
+        ac->oc[0].m4ac.sample_rate != hdr.sample_rate) {
+        ac->oc[1].m4ac.sbr = -1;
+        ac->oc[1].m4ac.ps  = -1;
+    }
+
+    if (!hdr.crc_absent)
+        skip_bits(gb, 16);
+
+    return size;
+}
+
+/**
+ * Decode one frame of an error-resilient AAC stream.
+ *
+ * The elements to decode are taken from the channel layout table for the
+ * configured channel configuration rather than from the bitstream. Each one
+ * is decoded in turn, the spectral data is converted to samples, and the
+ * remaining bits of the packet are skipped.
+ *
+ * @param   avctx          codec context
+ * @param   data           output frame
+ * @param   got_frame_ptr  set to 1 once a frame has been produced
+ * @param   gb             bitstream reader
+ *
+ * @return  0 on success, a negative error code otherwise
+ */
+static int aac_decode_er_frame(AVCodecContext *avctx, void *data,
+                               int *got_frame_ptr, GetBitContext *gb)
+{
+    AACContext *ac = avctx->priv_data;
+    const MPEG4AudioConfig *const m4ac = &ac->oc[1].m4ac;
+    const int chan_config = m4ac->chan_config;
+    const int aot         = m4ac->object_type;
+    ChannelElement *che;
+    int samples = m4ac->frame_length_short ? 960 : 1024;
+    int err, tag;
+
+    /* LD and ELD frames carry half as many samples */
+    if (aot == AOT_ER_AAC_LD || aot == AOT_ER_AAC_ELD)
+        samples >>= 1;
+
+    ac->frame = data;
+
+    err = frame_configure_elements(avctx);
+    if (err < 0)
+        return err;
+
+    // The FF_PROFILE_AAC_* defines are all object_type - 1
+    // This may lead to an undefined profile being signaled
+    ac->avctx->profile = aot - 1;
+
+    ac->tags_mapped = 0;
+
+    if (chan_config < 0 || (chan_config >= 8 && chan_config < 11) || chan_config >= 13) {
+        avpriv_request_sample(avctx, "Unknown ER channel configuration %d",
+                              chan_config);
+        return AVERROR_INVALIDDATA;
+    }
+
+    for (tag = 0; tag < tags_per_config[chan_config]; tag++) {
+        const int elem_type = aac_channel_layout_map[chan_config-1][tag][0];
+        const int elem_id   = aac_channel_layout_map[chan_config-1][tag][1];
+
+        che = get_che(ac, elem_type, elem_id);
+        if (!che) {
+            av_log(ac->avctx, AV_LOG_ERROR,
+                   "channel element %d.%d is not allocated\n",
+                   elem_type, elem_id);
+            return AVERROR_INVALIDDATA;
+        }
+        che->present = 1;
+        if (aot != AOT_ER_AAC_ELD)
+            skip_bits(gb, 4);
+
+        switch (elem_type) {
+        case TYPE_SCE:
+        case TYPE_LFE:
+            /* both are a single channel stream */
+            err = decode_ics(ac, &che->ch[0], gb, 0, 0);
+            break;
+        case TYPE_CPE:
+            err = decode_cpe(ac, gb, che);
+            break;
+        }
+        if (err < 0)
+            return err;
+    }
+
+    spectral_to_sample(ac, samples);
+
+    if (samples && !ac->frame->data[0]) {
+        av_log(avctx, AV_LOG_ERROR, "no frame data found\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    ac->frame->nb_samples  = samples;
+    ac->frame->sample_rate = avctx->sample_rate;
+    *got_frame_ptr = 1;
+
+    skip_bits_long(gb, get_bits_left(gb));
+    return 0;
+}
+
+static int aac_decode_frame_int(AVCodecContext *avctx, void *data,
+                                int *got_frame_ptr, GetBitContext *gb, AVPacket *avpkt)
+{   /* Decode one frame of syntax elements, optionally preceded by an ADTS header; returns 0 or a negative error code. */
+    AACContext *ac = avctx->priv_data;
+    ChannelElement *che = NULL, *che_prev = NULL;
+    enum RawDataBlockType elem_type, elem_type_prev = TYPE_END;
+    int err, elem_id;
+    int samples = 0, multiplier, audio_found = 0, pce_found = 0;
+    int is_dmono, sce_count = 0;
+
+    ac->frame = data;   /* the caller-supplied output frame */
+
+    if (show_bits(gb, 12) == 0xfff) {   /* twelve set bits are treated as an ADTS syncword */
+        if ((err = parse_adts_frame_header(ac, gb)) < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Error decoding AAC frame header.\n");
+            goto fail;
+        }
+        if (ac->oc[1].m4ac.sampling_index > 12) {
+            av_log(ac->avctx, AV_LOG_ERROR, "invalid sampling rate index %d\n", ac->oc[1].m4ac.sampling_index);
+            err = AVERROR_INVALIDDATA;
+            goto fail;
+        }
+    }
+
+    if ((err = frame_configure_elements(avctx)) < 0)
+        goto fail;
+
+    // The FF_PROFILE_AAC_* defines are all object_type - 1
+    // This may lead to an undefined profile being signaled
+    ac->avctx->profile = ac->oc[1].m4ac.object_type - 1;
+
+    ac->tags_mapped = 0;
+    // parse syntax elements (3-bit type followed by a 4-bit id) until TYPE_END is read
+    while ((elem_type = get_bits(gb, 3)) != TYPE_END) {
+        elem_id = get_bits(gb, 4);
+
+        if (avctx->debug & FF_DEBUG_STARTCODE)
+            av_log(avctx, AV_LOG_DEBUG, "Elem type:%x id:%x\n", elem_type, elem_id);
+
+        if (!avctx->channels && elem_type != TYPE_PCE) {   /* with no channels configured yet only a PCE is accepted */
+            err = AVERROR_INVALIDDATA;
+            goto fail;
+        }
+
+        if (elem_type < TYPE_DSE) {   /* element types below TYPE_DSE need an allocated channel element */
+            if (!(che=get_che(ac, elem_type, elem_id))) {
+                av_log(ac->avctx, AV_LOG_ERROR, "channel element %d.%d is not allocated\n",
+                       elem_type, elem_id);
+                err = AVERROR_INVALIDDATA;
+                goto fail;
+            }
+            samples = 1024;   /* any such element makes this a 1024-sample frame (before the SBR multiplier) */
+            che->present = 1;
+        }
+
+        switch (elem_type) {
+
+        case TYPE_SCE:
+            err = decode_ics(ac, &che->ch[0], gb, 0, 0);
+            audio_found = 1;
+            sce_count++;   /* counted for the dual-mono check at the end */
+            break;
+
+        case TYPE_CPE:
+            err = decode_cpe(ac, gb, che);
+            audio_found = 1;
+            break;
+
+        case TYPE_CCE:
+            err = decode_cce(ac, gb, che);
+            break;
+
+        case TYPE_LFE:
+            err = decode_ics(ac, &che->ch[0], gb, 0, 0);
+            audio_found = 1;
+            break;
+
+        case TYPE_DSE:
+            err = skip_data_stream_element(ac, gb);
+            break;
+
+        case TYPE_PCE: {
+            uint8_t layout_map[MAX_ELEM_ID*4][3];
+            int tags;
+            push_output_configuration(ac);   /* NOTE(review): presumably saved so the fail path can restore it - confirm */
+            tags = decode_pce(avctx, &ac->oc[1].m4ac, layout_map, gb);
+            if (tags < 0) {
+                err = tags;
+                break;
+            }
+            if (pce_found) {   /* only the first PCE of a frame is applied */
+                av_log(avctx, AV_LOG_ERROR,
+                       "Not evaluating a further program_config_element as this construct is dubious at best.\n");
+            } else {
+                err = output_configure(ac, layout_map, tags, OC_TRIAL_PCE, 1);
+                if (!err)
+                    ac->oc[1].m4ac.chan_config = 0;   /* the PCE layout replaces any indexed channel configuration */
+                pce_found = 1;
+            }
+            break;
+        }
+
+        case TYPE_FIL:   /* here elem_id is reused as the payload length in bytes */
+            if (elem_id == 15)
+                elem_id += get_bits(gb, 8) - 1;   /* 15 is an escape: add an 8-bit count minus one */
+            if (get_bits_left(gb) < 8 * elem_id) {
+                    av_log(avctx, AV_LOG_ERROR, "TYPE_FIL: "overread_err);
+                    err = AVERROR_INVALIDDATA;
+                    goto fail;
+            }
+            while (elem_id > 0)   /* NOTE(review): the return value is presumably the number of bytes consumed - confirm */
+                elem_id -= decode_extension_payload(ac, gb, elem_id, che_prev, elem_type_prev);
+            err = 0; /* FIXME */
+            break;
+
+        default:
+            err = AVERROR_BUG; /* should not happen, but keeps compiler happy */
+            break;
+        }
+
+        che_prev       = che;         /* handed to decode_extension_payload() by a following fill element */
+        elem_type_prev = elem_type;
+
+        if (err)
+            goto fail;
+
+        if (get_bits_left(gb) < 3) {   /* the next loop iteration reads a 3-bit element type */
+            av_log(avctx, AV_LOG_ERROR, overread_err);
+            err = AVERROR_INVALIDDATA;
+            goto fail;
+        }
+    }
+
+    if (!avctx->channels) {   /* still no channels configured: report success without a frame */
+        *got_frame_ptr = 0;
+        return 0;
+    }
+
+    multiplier = (ac->oc[1].m4ac.sbr == 1) ? ac->oc[1].m4ac.ext_sample_rate > ac->oc[1].m4ac.sample_rate : 0;   /* 1 only when sbr == 1 and the extension rate is higher */
+    samples <<= multiplier;   /* doubles the sample count when multiplier is 1 */
+
+    spectral_to_sample(ac, samples);
+
+    if (ac->oc[1].status && audio_found) {   /* audio was decoded: publish the parameters and lock the configuration */
+        avctx->sample_rate = ac->oc[1].m4ac.sample_rate << multiplier;
+        avctx->frame_size = samples;
+        ac->oc[1].status = OC_LOCKED;
+    }
+
+    if (multiplier) {   /* output was doubled, so double the 32-bit little-endian value at the start of the skip-samples side data */
+        int side_size;
+        const uint8_t *side = av_packet_get_side_data(avpkt, AV_PKT_DATA_SKIP_SAMPLES, &side_size);
+        if (side && side_size>=4)
+            AV_WL32(side, 2*AV_RL32(side));
+    }
+
+    if (!ac->frame->data[0] && samples) {
+        av_log(avctx, AV_LOG_ERROR, "no frame data found\n");
+        err = AVERROR_INVALIDDATA;
+        goto fail;
+    }
+
+    if (samples) {
+        ac->frame->nb_samples = samples;
+        ac->frame->sample_rate = avctx->sample_rate;
+    } else
+        av_frame_unref(ac->frame);   /* nothing was decoded, so no frame is returned */
+    *got_frame_ptr = !!samples;
+
+    /* for dual-mono audio (SCE + SCE) */
+    is_dmono = ac->dmono_mode && sce_count == 2 &&
+               ac->oc[1].channel_layout == (AV_CH_FRONT_LEFT | AV_CH_FRONT_RIGHT);
+    if (is_dmono) {
+        if (ac->dmono_mode == 1)   /* mode 1: data[1] is made to alias data[0] */
+            ((AVFrame *)data)->data[1] =((AVFrame *)data)->data[0];
+        else if (ac->dmono_mode == 2)   /* mode 2: data[0] is made to alias data[1] */
+            ((AVFrame *)data)->data[0] =((AVFrame *)data)->data[1];
+    }
+
+    return 0;
+fail:   /* every error path ends here with err already set */
+    pop_output_configuration(ac);
+    return err;
+}
+
+static int aac_decode_frame(AVCodecContext *avctx, void *data,
+                            int *got_frame_ptr, AVPacket *avpkt)
+{
+    /* Packet entry point: returns the number of bytes used or a negative error code. */
+    AACContext *ac = avctx->priv_data;
+    const uint8_t *pkt = avpkt->data;
+    int pkt_size = avpkt->size;
+    GetBitContext gb;
+    int used, pos, ret, aot;
+    int extra_len;
+    const uint8_t *extra = av_packet_get_side_data(avpkt,
+                               AV_PKT_DATA_NEW_EXTRADATA,
+                               &extra_len);
+    int dualmono_len;
+    const uint8_t *dualmono = av_packet_get_side_data(avpkt,
+                                  AV_PKT_DATA_JP_DUALMONO,
+                                  &dualmono_len);
+
+    /* The constant 0 keeps this extradata-update path compiled but never executed. */
+    if (extra && 0) {
+        av_free(avctx->extradata);
+        avctx->extradata = av_mallocz(extra_len +
+                                      AV_INPUT_BUFFER_PADDING_SIZE);
+        if (!avctx->extradata)
+            return AVERROR(ENOMEM);
+        avctx->extradata_size = extra_len;
+        memcpy(avctx->extradata, extra, extra_len);
+        push_output_configuration(ac);
+        if (decode_audio_specific_config(ac, ac->avctx, &ac->oc[1].m4ac,
+                                         avctx->extradata,
+                                         avctx->extradata_size*8LL, 1) < 0) {
+            pop_output_configuration(ac);
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    /* Dual-mono mode: the forced option if set, else 1 + the side-data byte, else 0. */
+    ac->dmono_mode = (dualmono && dualmono_len > 0) ? 1 + *dualmono : 0;
+    if (ac->force_dmono_mode >= 0)
+        ac->dmono_mode = ac->force_dmono_mode;
+
+    /* Reject packet sizes that reach INT_MAX / 8 bytes. */
+    if (pkt_size >= INT_MAX / 8)
+        return AVERROR_INVALIDDATA;
+
+    ret = init_get_bits8(&gb, pkt, pkt_size);
+    if (ret < 0)
+        return ret;
+
+    /* The four ER object types go to the ER frame decoder, everything else to the generic one. */
+    aot = ac->oc[1].m4ac.object_type;
+    if (aot == AOT_ER_AAC_LC || aot == AOT_ER_AAC_LTP ||
+        aot == AOT_ER_AAC_LD || aot == AOT_ER_AAC_ELD)
+        ret = aac_decode_er_frame(avctx, data, got_frame_ptr, &gb);
+    else
+        ret = aac_decode_frame_int(avctx, data, got_frame_ptr, &gb, avpkt);
+    if (ret < 0)
+        return ret;
+
+    /* Round the bit position up to whole bytes, then step over any zero bytes that follow. */
+    used = (get_bits_count(&gb) + 7) >> 3;
+    pos  = used;
+    while (pos < pkt_size && !pkt[pos])
+        pos++;
+
+    /* If only zero bytes were left, the whole packet is reported as consumed. */
+    return pos < pkt_size ? used : pkt_size;
+}
+
+static av_cold int aac_decode_close(AVCodecContext *avctx)
+{   /* Free the channel elements, transform contexts and fdsp of the decoder; always returns 0. */
+    AACContext *ac = avctx->priv_data;
+    int id, kind;
+    for (id = 0; id < MAX_ELEM_ID; id++) {
+        for (kind = 0; kind < 4; kind++) {
+            ChannelElement **slot = &ac->che[kind][id];
+            if (*slot)   /* the SBR context is closed before the element that holds it is freed */
+                AAC_RENAME(ff_aac_sbr_ctx_close)(&(*slot)->sbr);
+            av_freep(slot);
+        }
+    }
+    /* The 480-point IMDCT is only uninitialized when USE_FIXED is 0. */
+    ff_mdct_end(&ac->mdct);
+    ff_mdct_end(&ac->mdct_small);
+    ff_mdct_end(&ac->mdct_ld);
+    ff_mdct_end(&ac->mdct_ltp);
+#if !USE_FIXED
+    ff_imdct15_uninit(&ac->mdct480);
+#endif
+    av_freep(&ac->fdsp);
+    return 0;
+}
+
+static void aacdec_init(AACContext *c)
+{
+    /* Point the context's function hooks at the implementations in this file. */
+    c->imdct_and_windowing    = imdct_and_windowing;
+    c->apply_ltp              = apply_ltp;
+    c->apply_tns              = apply_tns;
+    c->windowing_and_mdct_ltp = windowing_and_mdct_ltp;
+    c->update_ltp             = update_ltp;
+#if USE_FIXED
+    c->vector_pow43           = vector_pow43;
+    c->subband_scale          = subband_scale;
+#else
+    /* NOTE(review): the MIPS init runs last, presumably so it can replace hooks set above - confirm */
+    if (ARCH_MIPS)
+        ff_aacdec_init_mips(c);
+#endif /* USE_FIXED */
+}
+/**
+ * AVOptions for Japanese DTV specific extensions (ADTS only).
+ * AACDEC_FLAGS is parenthesized so that it always expands as a single operand. */
+#define AACDEC_FLAGS (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM)
+static const AVOption options[] = {
+    {"dual_mono_mode", "Select the channel to decode for dual mono",
+     offsetof(AACContext, force_dmono_mode), AV_OPT_TYPE_INT, {.i64=-1}, -1, 2,
+     AACDEC_FLAGS, "dual_mono_mode"},
+    /* Named values for dual_mono_mode; the decoder only applies force_dmono_mode when it is >= 0. */
+    {"auto", "autoselection",            0, AV_OPT_TYPE_CONST, {.i64=-1}, INT_MIN, INT_MAX, AACDEC_FLAGS, "dual_mono_mode"},
+    {"main", "Select Main/Left channel", 0, AV_OPT_TYPE_CONST, {.i64= 1}, INT_MIN, INT_MAX, AACDEC_FLAGS, "dual_mono_mode"},
+    {"sub" , "Select Sub/Right channel", 0, AV_OPT_TYPE_CONST, {.i64= 2}, INT_MIN, INT_MAX, AACDEC_FLAGS, "dual_mono_mode"},
+    {"both", "Select both channels",     0, AV_OPT_TYPE_CONST, {.i64= 0}, INT_MIN, INT_MAX, AACDEC_FLAGS, "dual_mono_mode"},
+
+    {NULL},
+};
+
+static const AVClass aac_decoder_class = {   /* AVClass instance that carries the option table of this decoder */
+    .class_name = "AAC decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,                  /* the dual_mono_mode table defined in this file */
+    .version    = LIBAVUTIL_VERSION_INT,
+};
diff --git a/libavcodec/aacdectab.h b/libavcodec/aacdectab.h
index b7c5f7e..baf51a7 100644
--- a/libavcodec/aacdectab.h
+++ b/libavcodec/aacdectab.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2005-2006 Oded Shimon ( ods15 ods15 dyndns org )
  * Copyright (c) 2006-2007 Maxim Gavrilov ( maxim.gavrilov gmail com )
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,49 +35,6 @@
 
 #include <stdint.h>
 
-/* @name ltp_coef
- * Table of the LTP coefficients
- */
-static const float ltp_coef[8] = {
-    0.570829, 0.696616, 0.813004, 0.911304,
-    0.984900, 1.067894, 1.194601, 1.369533,
-};
-
-/* @name tns_tmp2_map
- * Tables of the tmp2[] arrays of LPC coefficients used for TNS.
- * The suffix _M_N[] indicate the values of coef_compress and coef_res
- * respectively.
- * @{
- */
-static const float tns_tmp2_map_1_3[4] = {
-     0.00000000, -0.43388373,  0.64278758,  0.34202015,
-};
-
-static const float tns_tmp2_map_0_3[8] = {
-     0.00000000, -0.43388373, -0.78183150, -0.97492790,
-     0.98480773,  0.86602539,  0.64278758,  0.34202015,
-};
-
-static const float tns_tmp2_map_1_4[8] = {
-     0.00000000, -0.20791170, -0.40673664, -0.58778524,
-     0.67369562,  0.52643216,  0.36124167,  0.18374951,
-};
-
-static const float tns_tmp2_map_0_4[16] = {
-     0.00000000, -0.20791170, -0.40673664, -0.58778524,
-    -0.74314481, -0.86602539, -0.95105654, -0.99452192,
-     0.99573416,  0.96182561,  0.89516330,  0.79801720,
-     0.67369562,  0.52643216,  0.36124167,  0.18374951,
-};
-
-static const float * const tns_tmp2_map[4] = {
-    tns_tmp2_map_0_3,
-    tns_tmp2_map_0_4,
-    tns_tmp2_map_1_3,
-    tns_tmp2_map_1_4
-};
-// @}
-
 static const int8_t tags_per_config[16] = { 0, 1, 1, 2, 3, 3, 4, 5, 0, 0, 0, 4, 5, 0, 5, 0 };
 
 static const uint8_t aac_channel_layout_map[16][5][3] = {
diff --git a/libavcodec/aacenc.c b/libavcodec/aacenc.c
index c247c5b..2653cef 100644
--- a/libavcodec/aacenc.c
+++ b/libavcodec/aacenc.c
@@ -2,20 +2,20 @@
  * AAC encoder
  * Copyright (C) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,9 +27,10 @@
 /***********************************
  *              TODOs:
  * add sane pulse detection
- * add temporal noise shaping
  ***********************************/
 
+#include "libavutil/libm.h"
+#include "libavutil/thread.h"
 #include "libavutil/float_dsp.h"
 #include "libavutil/opt.h"
 #include "avcodec.h"
@@ -42,119 +43,12 @@
 #include "aac.h"
 #include "aactab.h"
 #include "aacenc.h"
+#include "aacenctab.h"
+#include "aacenc_utils.h"
 
 #include "psymodel.h"
 
-#define AAC_MAX_CHANNELS 6
-
-#define ERROR_IF(cond, ...) \
-    if (cond) { \
-        av_log(avctx, AV_LOG_ERROR, __VA_ARGS__); \
-        return AVERROR(EINVAL); \
-    }
-
-float ff_aac_pow34sf_tab[428];
-
-static const uint8_t swb_size_1024_96[] = {
-    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8,
-    12, 12, 12, 12, 12, 16, 16, 24, 28, 36, 44,
-    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-};
-
-static const uint8_t swb_size_1024_64[] = {
-    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8,
-    12, 12, 12, 16, 16, 16, 20, 24, 24, 28, 36,
-    40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40
-};
-
-static const uint8_t swb_size_1024_48[] = {
-    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8,
-    12, 12, 12, 12, 16, 16, 20, 20, 24, 24, 28, 28,
-    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-    96
-};
-
-static const uint8_t swb_size_1024_32[] = {
-    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8,
-    12, 12, 12, 12, 16, 16, 20, 20, 24, 24, 28, 28,
-    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
-};
-
-static const uint8_t swb_size_1024_24[] = {
-    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    12, 12, 12, 12, 16, 16, 16, 20, 20, 24, 24, 28, 28,
-    32, 36, 36, 40, 44, 48, 52, 52, 64, 64, 64, 64, 64
-};
-
-static const uint8_t swb_size_1024_16[] = {
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 16, 16, 16, 20, 20, 20, 24, 24, 28, 28,
-    32, 36, 40, 40, 44, 48, 52, 56, 60, 64, 64, 64
-};
-
-static const uint8_t swb_size_1024_8[] = {
-    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
-    16, 16, 16, 16, 16, 16, 16, 20, 20, 20, 20, 24, 24, 24, 28, 28,
-    32, 36, 36, 40, 44, 48, 52, 56, 60, 64, 80
-};
-
-static const uint8_t *swb_size_1024[] = {
-    swb_size_1024_96, swb_size_1024_96, swb_size_1024_64,
-    swb_size_1024_48, swb_size_1024_48, swb_size_1024_32,
-    swb_size_1024_24, swb_size_1024_24, swb_size_1024_16,
-    swb_size_1024_16, swb_size_1024_16, swb_size_1024_8
-};
-
-static const uint8_t swb_size_128_96[] = {
-    4, 4, 4, 4, 4, 4, 8, 8, 8, 16, 28, 36
-};
-
-static const uint8_t swb_size_128_48[] = {
-    4, 4, 4, 4, 4, 8, 8, 8, 12, 12, 12, 16, 16, 16
-};
-
-static const uint8_t swb_size_128_24[] = {
-    4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 12, 12, 16, 16, 20
-};
-
-static const uint8_t swb_size_128_16[] = {
-    4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 12, 12, 16, 20, 20
-};
-
-static const uint8_t swb_size_128_8[] = {
-    4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 12, 16, 20, 20
-};
-
-static const uint8_t *swb_size_128[] = {
-    /* the last entry on the following row is swb_size_128_64 but is a
-       duplicate of swb_size_128_96 */
-    swb_size_128_96, swb_size_128_96, swb_size_128_96,
-    swb_size_128_48, swb_size_128_48, swb_size_128_48,
-    swb_size_128_24, swb_size_128_24, swb_size_128_16,
-    swb_size_128_16, swb_size_128_16, swb_size_128_8
-};
-
-/** default channel configurations */
-static const uint8_t aac_chan_configs[6][5] = {
- {1, TYPE_SCE},                               // 1 channel  - single channel element
- {1, TYPE_CPE},                               // 2 channels - channel pair
- {2, TYPE_SCE, TYPE_CPE},                     // 3 channels - center + stereo
- {3, TYPE_SCE, TYPE_CPE, TYPE_SCE},           // 4 channels - front center + stereo + back center
- {3, TYPE_SCE, TYPE_CPE, TYPE_CPE},           // 5 channels - front center + stereo + back stereo
- {4, TYPE_SCE, TYPE_CPE, TYPE_CPE, TYPE_LFE}, // 6 channels - front center + stereo + back stereo + LFE
-};
-
-/**
- * Table to remap channels from Libav's default order to AAC order.
- */
-static const uint8_t aac_chan_maps[AAC_MAX_CHANNELS][AAC_MAX_CHANNELS] = {
-    { 0 },
-    { 0, 1 },
-    { 2, 0, 1 },
-    { 2, 0, 1, 3 },
-    { 2, 0, 1, 3, 4 },
-    { 2, 0, 1, 4, 5, 3 },
-};
+static AVOnce aac_table_init = AV_ONCE_INIT;
 
 /**
  * Make AAC audio config object.
@@ -164,11 +58,12 @@ static void put_audio_specific_config(AVCodecContext *avctx)
 {
     PutBitContext pb;
     AACEncContext *s = avctx->priv_data;
+    int channels = s->channels - (s->channels == 8 ? 1 : 0);
 
-    init_put_bits(&pb, avctx->extradata, avctx->extradata_size*8);
-    put_bits(&pb, 5, 2); //object type - AAC-LC
+    init_put_bits(&pb, avctx->extradata, avctx->extradata_size);
+    put_bits(&pb, 5, s->profile+1); //profile
     put_bits(&pb, 4, s->samplerate_index); //sample rate index
-    put_bits(&pb, 4, s->channels);
+    put_bits(&pb, 4, channels);
     //GASpecificConfig
     put_bits(&pb, 1, 0); //frame length - 1024 samples
     put_bits(&pb, 1, 0); //does not depend on core coder
@@ -181,6 +76,15 @@ static void put_audio_specific_config(AVCodecContext *avctx)
     flush_put_bits(&pb);
 }
 
+void ff_quantize_band_cost_cache_init(struct AACEncContext *s)
+{
+    /* Advance the generation; when the counter wraps to 0, clear the cache and restart at 1. */
+    if (++s->quantize_band_cost_cache_generation != 0)
+        return;
+    memset(s->quantize_band_cost_cache, 0, sizeof(s->quantize_band_cost_cache));
+    s->quantize_band_cost_cache_generation = 1;
+}
+
 #define WINDOW_FUNC(type) \
 static void apply_ ##type ##_window(AVFloatDSPContext *fdsp, \
                                     SingleChannelElement *sce, \
@@ -250,16 +154,17 @@ static void apply_window_and_mdct(AACEncContext *s, SingleChannelElement *sce,
                                   float *audio)
 {
     int i;
-    float *output = sce->ret_buf;
+    const float *output = sce->ret_buf;
 
-    apply_window[sce->ics.window_sequence[0]](&s->fdsp, sce, audio);
+    apply_window[sce->ics.window_sequence[0]](s->fdsp, sce, audio);
 
     if (sce->ics.window_sequence[0] != EIGHT_SHORT_SEQUENCE)
         s->mdct1024.mdct_calc(&s->mdct1024, sce->coeffs, output);
     else
         for (i = 0; i < 1024; i += 128)
-            s->mdct128.mdct_calc(&s->mdct128, sce->coeffs + i, output + i*2);
+            s->mdct128.mdct_calc(&s->mdct128, &sce->coeffs[i], output + i*2);
     memcpy(audio, audio + 1024, sizeof(audio[0]) * 1024);
+    memcpy(sce->pcoeffs, sce->coeffs, sizeof(sce->pcoeffs));
 }
 
 /**
@@ -275,7 +180,7 @@ static void put_ics_info(AACEncContext *s, IndividualChannelStream *info)
     put_bits(&s->pb, 1, info->use_kb_window[0]);
     if (info->window_sequence[0] != EIGHT_SHORT_SEQUENCE) {
         put_bits(&s->pb, 6, info->max_sfb);
-        put_bits(&s->pb, 1, 0);            // no prediction
+        put_bits(&s->pb, 1, !!info->predictor_present);
     } else {
         put_bits(&s->pb, 4, info->max_sfb);
         for (w = 1; w < 8; w++)
@@ -304,27 +209,18 @@ static void encode_ms_info(PutBitContext *pb, ChannelElement *cpe)
 static void adjust_frame_information(ChannelElement *cpe, int chans)
 {
     int i, w, w2, g, ch;
-    int start, maxsfb, cmaxsfb;
+    int maxsfb, cmaxsfb;
 
     for (ch = 0; ch < chans; ch++) {
         IndividualChannelStream *ics = &cpe->ch[ch].ics;
-        start = 0;
         maxsfb = 0;
         cpe->ch[ch].pulse.num_pulse = 0;
-        for (w = 0; w < ics->num_windows*16; w += 16) {
-            for (g = 0; g < ics->num_swb; g++) {
-                //apply M/S
-                if (cpe->common_window && !ch && cpe->ms_mask[w + g]) {
-                    for (i = 0; i < ics->swb_sizes[g]; i++) {
-                        cpe->ch[0].coeffs[start+i] = (cpe->ch[0].coeffs[start+i] + cpe->ch[1].coeffs[start+i]) / 2.0;
-                        cpe->ch[1].coeffs[start+i] =  cpe->ch[0].coeffs[start+i] - cpe->ch[1].coeffs[start+i];
-                    }
-                }
-                start += ics->swb_sizes[g];
+        for (w = 0; w < ics->num_windows; w += ics->group_len[w]) {
+            for (w2 =  0; w2 < ics->group_len[w]; w2++) {
+                for (cmaxsfb = ics->num_swb; cmaxsfb > 0 && cpe->ch[ch].zeroes[w*16+cmaxsfb-1]; cmaxsfb--)
+                    ;
+                maxsfb = FFMAX(maxsfb, cmaxsfb);
             }
-            for (cmaxsfb = ics->num_swb; cmaxsfb > 0 && cpe->ch[ch].zeroes[w+cmaxsfb-1]; cmaxsfb--)
-                ;
-            maxsfb = FFMAX(maxsfb, cmaxsfb);
         }
         ics->max_sfb = maxsfb;
 
@@ -360,6 +256,67 @@ static void adjust_frame_information(ChannelElement *cpe, int chans)
     }
 }
 
+static void apply_intensity_stereo(ChannelElement *cpe)
+{   /* For each band flagged in is_mask, fold both channels into channel 0 and zero channel 1. */
+    IndividualChannelStream *ics = &cpe->ch[0].ics;
+    int grp, win, sfb, k;
+    if (!cpe->common_window)
+        return;
+    for (grp = 0; grp < ics->num_windows; grp += ics->group_len[grp]) {
+        for (win = 0; win < ics->group_len[grp]; win++) {
+            int pos = (grp + win) * 128;
+            for (sfb = 0; sfb < ics->num_swb; pos += ics->swb_sizes[sfb], sfb++) {
+                const int band = grp*16 + sfb;
+                int sign;
+                float gain;
+                if (!cpe->is_mask[band])
+                    continue;
+                /* Band types 14 and 15 yield -1 and +1; a set ms_mask flips the sign. */
+                sign = -1 + 2 * (cpe->ch[1].band_type[band] - 14);
+                gain = cpe->ch[0].is_ener[band];
+                if (cpe->ms_mask[band])
+                    sign = -sign;
+                for (k = 0; k < ics->swb_sizes[sfb]; k++) {
+                    cpe->ch[0].coeffs[pos+k] = (cpe->ch[0].coeffs[pos+k] + sign*cpe->ch[1].coeffs[pos+k])*gain;
+                    cpe->ch[1].coeffs[pos+k] = 0.0f;
+                }
+            }
+        }
+    }
+}
+
+static void apply_mid_side_stereo(ChannelElement *cpe)
+{
+    /* Rewrite each M/S band of a common-window pair: ch0 = (ch0 + ch1) / 2, ch1 = new ch0 - old ch1. */
+    IndividualChannelStream *ics = &cpe->ch[0].ics;
+    int grp, win, sfb, k;
+
+    if (!cpe->common_window)
+        return;
+    for (grp = 0; grp < ics->num_windows; grp += ics->group_len[grp]) {
+        for (win = 0; win < ics->group_len[grp]; win++) {
+            int pos = (grp + win) * 128;
+            for (sfb = 0; sfb < ics->num_swb; pos += ics->swb_sizes[sfb], sfb++) {
+                const int band = grp*16 + sfb;
+                /* A set ms_mask alone is not enough: it is also used by
+                 * intensity stereo and PNS, so bands flagged in is_mask or
+                 * typed NOISE_BT and above in either channel are skipped. */
+                if (!cpe->ms_mask[band] || cpe->is_mask[band] ||
+                    cpe->ch[0].band_type[band] >= NOISE_BT ||
+                    cpe->ch[1].band_type[band] >= NOISE_BT)
+                    continue;
+                for (k = 0; k < ics->swb_sizes[sfb]; k++) {
+                    /* Both values are computed before either channel is overwritten. */
+                    const float mid  = (cpe->ch[0].coeffs[pos+k] + cpe->ch[1].coeffs[pos+k]) * 0.5f;
+                    const float side = mid - cpe->ch[1].coeffs[pos+k];
+                    cpe->ch[0].coeffs[pos+k] = mid;
+                    cpe->ch[1].coeffs[pos+k] = side;
+                }
+            }
+        }
+    }
+}
+
 /**
  * Encode scalefactor band coding type.
  */
@@ -367,6 +324,9 @@ static void encode_band_info(AACEncContext *s, SingleChannelElement *sce)
 {
     int w;
 
+    if (s->coder->set_special_band_scalefactors)
+        s->coder->set_special_band_scalefactors(s, sce);
+
     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w])
         s->coder->encode_window_bands_info(s, sce, w, sce->ics.group_len[w], s->lambda);
 }
@@ -377,16 +337,30 @@ static void encode_band_info(AACEncContext *s, SingleChannelElement *sce)
 static void encode_scale_factors(AVCodecContext *avctx, AACEncContext *s,
                                  SingleChannelElement *sce)
 {
-    int off = sce->sf_idx[0], diff;
+    int diff, off_sf = sce->sf_idx[0], off_pns = sce->sf_idx[0] - NOISE_OFFSET;
+    int off_is = 0, noise_flag = 1;
     int i, w;
 
     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
         for (i = 0; i < sce->ics.max_sfb; i++) {
             if (!sce->zeroes[w*16 + i]) {
-                diff = sce->sf_idx[w*16 + i] - off + SCALE_DIFF_ZERO;
-                if (diff < 0 || diff > 120)
-                    av_log(avctx, AV_LOG_ERROR, "Scalefactor difference is too big to be coded\n");
-                off = sce->sf_idx[w*16 + i];
+                if (sce->band_type[w*16 + i] == NOISE_BT) {
+                    diff = sce->sf_idx[w*16 + i] - off_pns;
+                    off_pns = sce->sf_idx[w*16 + i];
+                    if (noise_flag-- > 0) {
+                        put_bits(&s->pb, NOISE_PRE_BITS, diff + NOISE_PRE);
+                        continue;
+                    }
+                } else if (sce->band_type[w*16 + i] == INTENSITY_BT  ||
+                           sce->band_type[w*16 + i] == INTENSITY_BT2) {
+                    diff = sce->sf_idx[w*16 + i] - off_is;
+                    off_is = sce->sf_idx[w*16 + i];
+                } else {
+                    diff = sce->sf_idx[w*16 + i] - off_sf;
+                    off_sf = sce->sf_idx[w*16 + i];
+                }
+                diff += SCALE_DIFF_ZERO;
+                av_assert0(diff >= 0 && diff <= 120);
                 put_bits(&s->pb, ff_aac_scalefactor_bits[diff], ff_aac_scalefactor_code[diff]);
             }
         }
@@ -426,18 +400,41 @@ static void encode_spectral_coeffs(AACEncContext *s, SingleChannelElement *sce)
                 start += sce->ics.swb_sizes[i];
                 continue;
             }
-            for (w2 = w; w2 < w + sce->ics.group_len[w]; w2++)
-                s->coder->quantize_and_encode_band(s, &s->pb, sce->coeffs + start + w2*128,
-                                                   sce->ics.swb_sizes[i],
+            for (w2 = w; w2 < w + sce->ics.group_len[w]; w2++) {
+                s->coder->quantize_and_encode_band(s, &s->pb,
+                                                   &sce->coeffs[start + w2*128],
+                                                   NULL, sce->ics.swb_sizes[i],
                                                    sce->sf_idx[w*16 + i],
                                                    sce->band_type[w*16 + i],
-                                                   s->lambda);
+                                                   s->lambda,
+                                                   sce->ics.window_clipping[w]);
+            }
             start += sce->ics.swb_sizes[i];
         }
     }
 }
 
 /**
+ * Downscale spectral coefficients for near-clipping windows to avoid artifacts
+ */
+static void avoid_clipping(AACEncContext *s, SingleChannelElement *sce)
+{
+    /* When the factor is below 1, scale the bands under max_sfb in every window by it. */
+    IndividualChannelStream *ics = &sce->ics;
+    int win, sfb, k, pos;
+
+    if (!(ics->clip_avoidance_factor < 1.0f))
+        return;
+    for (win = 0; win < ics->num_windows; win++) {
+        pos = win*128;
+        for (sfb = 0; sfb < ics->max_sfb; pos += ics->swb_sizes[sfb], sfb++) {
+            for (k = 0; k < ics->swb_sizes[sfb]; k++)
+                sce->coeffs[pos + k] *= ics->clip_avoidance_factor;
+        }
+    }
+}
+
+/**
  * Encode one channel of audio data.
  */
 static int encode_individual_channel(AVCodecContext *avctx, AACEncContext *s,
@@ -445,12 +442,19 @@ static int encode_individual_channel(AVCodecContext *avctx, AACEncContext *s,
                                      int common_window)
 {
     put_bits(&s->pb, 8, sce->sf_idx[0]);
-    if (!common_window)
+    if (!common_window) {
         put_ics_info(s, &sce->ics);
+        if (s->coder->encode_main_pred)
+            s->coder->encode_main_pred(s, sce);
+        if (s->coder->encode_ltp_info)
+            s->coder->encode_ltp_info(s, sce, 0);
+    }
     encode_band_info(s, sce);
     encode_scale_factors(avctx, s, sce);
     encode_pulses(s, &sce->pulse);
-    put_bits(&s->pb, 1, 0); //tns
+    put_bits(&s->pb, 1, !!sce->tns.present);
+    if (s->coder->encode_tns_info)
+        s->coder->encode_tns_info(s, sce);
     put_bits(&s->pb, 1, 0); //ssr
     encode_spectral_coeffs(s, sce);
     return 0;
@@ -478,7 +482,7 @@ static void put_bitstream_info(AACEncContext *s, const char *name)
 
 /*
  * Copy input samples.
- * Channels are reordered from Libav's default order to AAC order.
+ * Channels are reordered from libavcodec's default order to AAC order.
  */
 static void copy_input_samples(AACEncContext *s, const AVFrame *frame)
 {
@@ -508,9 +512,12 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     AACEncContext *s = avctx->priv_data;
     float **samples = s->planar_samples, *samples2, *la, *overlap;
     ChannelElement *cpe;
-    int i, ch, w, g, chans, tag, start_ch, ret;
+    SingleChannelElement *sce;
+    IndividualChannelStream *ics;
+    int i, its, ch, w, chans, tag, start_ch, ret, frame_bits;
+    int target_bits, rate_bits, too_many_bits, too_few_bits;
+    int ms_mode = 0, is_mode = 0, tns_mode = 0, pred_mode = 0;
     int chan_el_counter[4];
-    int frame_bits;
     FFPsyWindowInfo windows[AAC_MAX_CHANNELS];
 
     if (s->last_frame == 2)
@@ -536,18 +543,22 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         chans    = tag == TYPE_CPE ? 2 : 1;
         cpe      = &s->cpe[i];
         for (ch = 0; ch < chans; ch++) {
-            IndividualChannelStream *ics = &cpe->ch[ch].ics;
-            int cur_channel = start_ch + ch;
-            overlap  = &samples[cur_channel][0];
+            int k;
+            float clip_avoidance_factor;
+            sce = &cpe->ch[ch];
+            ics = &sce->ics;
+            s->cur_channel = start_ch + ch;
+            overlap  = &samples[s->cur_channel][0];
             samples2 = overlap + 1024;
             la       = samples2 + (448+64);
             if (!frame)
                 la = NULL;
             if (tag == TYPE_LFE) {
-                wi[ch].window_type[0] = ONLY_LONG_SEQUENCE;
+                wi[ch].window_type[0] = wi[ch].window_type[1] = ONLY_LONG_SEQUENCE;
                 wi[ch].window_shape   = 0;
                 wi[ch].num_windows    = 1;
                 wi[ch].grouping[0]    = 1;
+                wi[ch].clipping[0]    = 0;
 
                 /* Only the lowest 12 coefficients are used in a LFE channel.
                  * The expression below results in only the bottom 8 coefficients
@@ -555,7 +566,7 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
                  */
                 ics->num_swb = s->samplerate_index >= 8 ? 1 : 3;
             } else {
-                wi[ch] = s->psy.model->window(&s->psy, samples2, la, cur_channel,
+                wi[ch] = s->psy.model->window(&s->psy, samples2, la, s->cur_channel,
                                               ics->window_sequence[0]);
             }
             ics->window_sequence[1] = ics->window_sequence[0];
@@ -565,24 +576,71 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
             ics->num_windows        = wi[ch].num_windows;
             ics->swb_sizes          = s->psy.bands    [ics->num_windows == 8];
             ics->num_swb            = tag == TYPE_LFE ? ics->num_swb : s->psy.num_bands[ics->num_windows == 8];
+            ics->max_sfb            = FFMIN(ics->max_sfb, ics->num_swb);
+            ics->swb_offset         = wi[ch].window_type[0] == EIGHT_SHORT_SEQUENCE ?
+                                        ff_swb_offset_128 [s->samplerate_index]:
+                                        ff_swb_offset_1024[s->samplerate_index];
+            ics->tns_max_bands      = wi[ch].window_type[0] == EIGHT_SHORT_SEQUENCE ?
+                                        ff_tns_max_bands_128 [s->samplerate_index]:
+                                        ff_tns_max_bands_1024[s->samplerate_index];
+
             for (w = 0; w < ics->num_windows; w++)
                 ics->group_len[w] = wi[ch].grouping[w];
 
-            apply_window_and_mdct(s, &cpe->ch[ch], overlap);
+            /* Calculate input sample maximums and evaluate clipping risk */
+            clip_avoidance_factor = 0.0f;
+            for (w = 0; w < ics->num_windows; w++) {
+                const float *wbuf = overlap + w * 128;
+                const int wlen = 2048 / ics->num_windows;
+                float max = 0;
+                int j;
+                /* mdct input is 2 * output */
+                for (j = 0; j < wlen; j++)
+                    max = FFMAX(max, fabsf(wbuf[j]));
+                wi[ch].clipping[w] = max;
+            }
+            for (w = 0; w < ics->num_windows; w++) {
+                if (wi[ch].clipping[w] > CLIP_AVOIDANCE_FACTOR) {
+                    ics->window_clipping[w] = 1;
+                    clip_avoidance_factor = FFMAX(clip_avoidance_factor, wi[ch].clipping[w]);
+                } else {
+                    ics->window_clipping[w] = 0;
+                }
+            }
+            if (clip_avoidance_factor > CLIP_AVOIDANCE_FACTOR) {
+                ics->clip_avoidance_factor = CLIP_AVOIDANCE_FACTOR / clip_avoidance_factor;
+            } else {
+                ics->clip_avoidance_factor = 1.0f;
+            }
+
+            apply_window_and_mdct(s, sce, overlap);
+
+            if (s->options.ltp && s->coder->update_ltp) {
+                s->coder->update_ltp(s, sce);
+                apply_window[sce->ics.window_sequence[0]](s->fdsp, sce, &sce->ltp_state[0]);
+                s->mdct1024.mdct_calc(&s->mdct1024, sce->lcoeffs, sce->ret_buf);
+            }
+
+            for (k = 0; k < 1024; k++) {
+                if (!isfinite(cpe->ch[ch].coeffs[k])) {
+                    av_log(avctx, AV_LOG_ERROR, "Input contains NaN/+-Inf\n");
+                    return AVERROR(EINVAL);
+                }
+            }
+            avoid_clipping(s, sce);
         }
         start_ch += chans;
     }
-    if ((ret = ff_alloc_packet(avpkt, 768 * s->channels))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, 8192 * s->channels, 0)) < 0)
         return ret;
-    }
-
+    frame_bits = its = 0;
     do {
         init_put_bits(&s->pb, avpkt->data, avpkt->size);
 
         if ((avctx->frame_number & 0xFF)==1 && !(avctx->flags & AV_CODEC_FLAG_BITEXACT))
             put_bitstream_info(s, LIBAVCODEC_IDENT);
         start_ch = 0;
+        target_bits = 0;
         memset(chan_el_counter, 0, sizeof(chan_el_counter));
         for (i = 0; i < s->chan_map[0]; i++) {
             FFPsyWindowInfo* wi = windows + start_ch;
@@ -590,16 +648,39 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
             tag      = s->chan_map[i+1];
             chans    = tag == TYPE_CPE ? 2 : 1;
             cpe      = &s->cpe[i];
+            cpe->common_window = 0;
+            memset(cpe->is_mask, 0, sizeof(cpe->is_mask));
+            memset(cpe->ms_mask, 0, sizeof(cpe->ms_mask));
             put_bits(&s->pb, 3, tag);
             put_bits(&s->pb, 4, chan_el_counter[tag]++);
-            for (ch = 0; ch < chans; ch++)
-                coeffs[ch] = cpe->ch[ch].coeffs;
+            for (ch = 0; ch < chans; ch++) {
+                sce = &cpe->ch[ch];
+                coeffs[ch] = sce->coeffs;
+                sce->ics.predictor_present = 0;
+                sce->ics.ltp.present = 0;
+                memset(sce->ics.ltp.used, 0, sizeof(sce->ics.ltp.used));
+                memset(sce->ics.prediction_used, 0, sizeof(sce->ics.prediction_used));
+                memset(&sce->tns, 0, sizeof(TemporalNoiseShaping));
+                for (w = 0; w < 128; w++)
+                    if (sce->band_type[w] > RESERVED_BT)
+                        sce->band_type[w] = 0;
+            }
+            s->psy.bitres.alloc = -1;
+            s->psy.bitres.bits = s->last_frame_pb_count / s->channels;
             s->psy.model->analyze(&s->psy, start_ch, coeffs, wi);
+            if (s->psy.bitres.alloc > 0) {
+                /* Lambda unused here on purpose, we need to take psy's unscaled allocation */
+                target_bits += s->psy.bitres.alloc
+                    * (s->lambda / (avctx->global_quality ? avctx->global_quality : 120));
+                s->psy.bitres.alloc /= chans;
+            }
+            s->cur_type = tag;
             for (ch = 0; ch < chans; ch++) {
                 s->cur_channel = start_ch + ch;
+                if (s->options.pns && s->coder->mark_pns)
+                    s->coder->mark_pns(s, avctx, &cpe->ch[ch]);
                 s->coder->search_for_quantizers(avctx, s, &cpe->ch[ch], s->lambda);
             }
-            cpe->common_window = 0;
             if (chans > 1
                 && wi[0].window_type[0] == wi[1].window_type[0]
                 && wi[0].window_shape   == wi[1].window_shape) {
@@ -612,23 +693,73 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
                     }
                 }
             }
+            for (ch = 0; ch < chans; ch++) { /* TNS and PNS */
+                sce = &cpe->ch[ch];
+                s->cur_channel = start_ch + ch;
+                if (s->options.tns && s->coder->search_for_tns)
+                    s->coder->search_for_tns(s, sce);
+                if (s->options.tns && s->coder->apply_tns_filt)
+                    s->coder->apply_tns_filt(s, sce);
+                if (sce->tns.present)
+                    tns_mode = 1;
+                if (s->options.pns && s->coder->search_for_pns)
+                    s->coder->search_for_pns(s, avctx, sce);
+            }
             s->cur_channel = start_ch;
-            if (s->options.stereo_mode && cpe->common_window) {
-                if (s->options.stereo_mode > 0) {
-                    IndividualChannelStream *ics = &cpe->ch[0].ics;
-                    for (w = 0; w < ics->num_windows; w += ics->group_len[w])
-                        for (g = 0;  g < ics->num_swb; g++)
-                            cpe->ms_mask[w*16+g] = 1;
-                } else if (s->coder->search_for_ms) {
-                    s->coder->search_for_ms(s, cpe, s->lambda);
+            if (s->options.intensity_stereo) { /* Intensity Stereo */
+                if (s->coder->search_for_is)
+                    s->coder->search_for_is(s, avctx, cpe);
+                if (cpe->is_mode) is_mode = 1;
+                apply_intensity_stereo(cpe);
+            }
+            if (s->options.pred) { /* Prediction */
+                for (ch = 0; ch < chans; ch++) {
+                    sce = &cpe->ch[ch];
+                    s->cur_channel = start_ch + ch;
+                    if (s->options.pred && s->coder->search_for_pred)
+                        s->coder->search_for_pred(s, sce);
+                    if (cpe->ch[ch].ics.predictor_present) pred_mode = 1;
                 }
+                if (s->coder->adjust_common_pred)
+                    s->coder->adjust_common_pred(s, cpe);
+                for (ch = 0; ch < chans; ch++) {
+                    sce = &cpe->ch[ch];
+                    s->cur_channel = start_ch + ch;
+                    if (s->options.pred && s->coder->apply_main_pred)
+                        s->coder->apply_main_pred(s, sce);
+                }
+                s->cur_channel = start_ch;
+            }
+            if (s->options.mid_side) { /* Mid/Side stereo */
+                if (s->options.mid_side == -1 && s->coder->search_for_ms)
+                    s->coder->search_for_ms(s, cpe);
+                else if (cpe->common_window)
+                    memset(cpe->ms_mask, 1, sizeof(cpe->ms_mask));
+                apply_mid_side_stereo(cpe);
             }
             adjust_frame_information(cpe, chans);
+            if (s->options.ltp) { /* LTP */
+                for (ch = 0; ch < chans; ch++) {
+                    sce = &cpe->ch[ch];
+                    s->cur_channel = start_ch + ch;
+                    if (s->coder->search_for_ltp)
+                        s->coder->search_for_ltp(s, sce, cpe->common_window);
+                    if (sce->ics.ltp.present) pred_mode = 1;
+                }
+                s->cur_channel = start_ch;
+                if (s->coder->adjust_common_ltp)
+                    s->coder->adjust_common_ltp(s, cpe);
+            }
             if (chans == 2) {
                 put_bits(&s->pb, 1, cpe->common_window);
                 if (cpe->common_window) {
                     put_ics_info(s, &cpe->ch[0].ics);
+                    if (s->coder->encode_main_pred)
+                        s->coder->encode_main_pred(s, &cpe->ch[0]);
+                    if (s->coder->encode_ltp_info)
+                        s->coder->encode_ltp_info(s, &cpe->ch[0], 1);
                     encode_ms_info(&s->pb, cpe);
+                    if (cpe->ms_mode) ms_mode = 1;
                 }
             }
             for (ch = 0; ch < chans; ch++) {
@@ -638,31 +769,77 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
             start_ch += chans;
         }
 
-        frame_bits = put_bits_count(&s->pb);
-        if (frame_bits <= 6144 * s->channels - 3) {
-            s->psy.bitres.bits = frame_bits / s->channels;
+        if (avctx->flags & CODEC_FLAG_QSCALE) {
+            /* When using a constant Q-scale, don't mess with lambda */
             break;
         }
 
-        s->lambda *= avctx->bit_rate * 1024.0f / avctx->sample_rate / frame_bits;
+        /* rate control stuff
+         * allow between the nominal bitrate, and what psy's bit reservoir says to target
+         * but drift towards the nominal bitrate always
+         */
+        frame_bits = put_bits_count(&s->pb);
+        rate_bits = avctx->bit_rate * 1024 / avctx->sample_rate;
+        rate_bits = FFMIN(rate_bits, 6144 * s->channels - 3);
+        too_many_bits = FFMAX(target_bits, rate_bits);
+        too_many_bits = FFMIN(too_many_bits, 6144 * s->channels - 3);
+        too_few_bits = FFMIN(FFMAX(rate_bits - rate_bits/4, target_bits), too_many_bits);
+
+        /* When using ABR, be strict (but only for increasing) */
+        too_few_bits = too_few_bits - too_few_bits/8;
+        too_many_bits = too_many_bits + too_many_bits/2;
+
+        if (   its == 0 /* for steady-state Q-scale tracking */
+            || (its < 5 && (frame_bits < too_few_bits || frame_bits > too_many_bits))
+            || frame_bits >= 6144 * s->channels - 3  )
+        {
+            float ratio = ((float)rate_bits) / frame_bits;
+
+            if (frame_bits >= too_few_bits && frame_bits <= too_many_bits) {
+                /*
+                 * This path is for steady-state Q-scale tracking
+                 * When frame bits fall within the stable range, we still need to adjust
+                 * lambda to maintain it like so in a stable fashion (large jumps in lambda
+                 * create artifacts and should be avoided), but slowly
+                 */
+                ratio = sqrtf(sqrtf(ratio));
+                ratio = av_clipf(ratio, 0.9f, 1.1f);
+            } else {
+                /* Not so fast though */
+                ratio = sqrtf(ratio);
+            }
+            s->lambda = FFMIN(s->lambda * ratio, 65536.f);
 
+            /* Keep iterating if we must reduce and lambda is in the sky */
+            if (ratio > 0.9f && ratio < 1.1f) {
+                break;
+            } else {
+                if (is_mode || ms_mode || tns_mode || pred_mode) {
+                    for (i = 0; i < s->chan_map[0]; i++) {
+                        // Must restore coeffs
+                        chans = tag == TYPE_CPE ? 2 : 1;
+                        cpe = &s->cpe[i];
+                        for (ch = 0; ch < chans; ch++)
+                            memcpy(cpe->ch[ch].coeffs, cpe->ch[ch].pcoeffs, sizeof(cpe->ch[ch].coeffs));
+                    }
+                }
+                its++;
+            }
+        } else {
+            break;
+        }
     } while (1);
 
+    if (s->options.ltp && s->coder->ltp_insert_new_frame)
+        s->coder->ltp_insert_new_frame(s);
+
     put_bits(&s->pb, 3, TYPE_END);
     flush_put_bits(&s->pb);
-    frame_bits = put_bits_count(&s->pb);
-#if FF_API_STAT_BITS
-FF_DISABLE_DEPRECATION_WARNINGS
-    avctx->frame_bits = frame_bits;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
-
-    // rate control stuff
-    if (!(avctx->flags & AV_CODEC_FLAG_QSCALE)) {
-        float ratio = avctx->bit_rate * 1024.0f / avctx->sample_rate / frame_bits;
-        s->lambda *= ratio;
-        s->lambda = FFMIN(s->lambda, 65536.f);
-    }
+
+    s->last_frame_pb_count = put_bits_count(&s->pb);
+
+    s->lambda_sum += s->lambda;
+    s->lambda_count++;
 
     if (!frame)
         s->last_frame++;
@@ -679,13 +856,17 @@ static av_cold int aac_encode_end(AVCodecContext *avctx)
 {
     AACEncContext *s = avctx->priv_data;
 
+    /* Report the mean lambda; NAN when no frame was encoded (avoids 0/0). */
+    av_log(avctx, AV_LOG_INFO, "Qavg: %.3f\n", s->lambda_count ? s->lambda_sum / s->lambda_count : NAN);
     ff_mdct_end(&s->mdct1024);
     ff_mdct_end(&s->mdct128);
     ff_psy_end(&s->psy);
+    ff_lpc_end(&s->lpc); /* counterpart of ff_lpc_init() in aac_encode_init() */
     if (s->psypp)
         ff_psy_preprocess_end(s->psypp);
     av_freep(&s->buffer.samples);
     av_freep(&s->cpe);
+    av_freep(&s->fdsp); /* allocated by avpriv_float_dsp_alloc() in dsp_init() */
     ff_af_queue_close(&s->afq);
     return 0;
 }
@@ -694,7 +875,9 @@ static av_cold int dsp_init(AVCodecContext *avctx, AACEncContext *s)
 {
     int ret = 0;
 
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->fdsp)
+        return AVERROR(ENOMEM);
 
     // window init
     ff_kbd_window_init(ff_aac_kbd_long_1024, 4.0, 1024);
@@ -702,9 +885,9 @@ static av_cold int dsp_init(AVCodecContext *avctx, AACEncContext *s)
     ff_init_ff_sine_windows(10);
     ff_init_ff_sine_windows(7);
 
-    if (ret = ff_mdct_init(&s->mdct1024, 11, 0, 32768.0))
+    if ((ret = ff_mdct_init(&s->mdct1024, 11, 0, 32768.0)) < 0)
         return ret;
-    if (ret = ff_mdct_init(&s->mdct128,   8, 0, 32768.0))
+    if ((ret = ff_mdct_init(&s->mdct128,   8, 0, 32768.0)) < 0)
         return ret;
 
     return 0;
@@ -713,8 +896,8 @@ static av_cold int dsp_init(AVCodecContext *avctx, AACEncContext *s)
 static av_cold int alloc_buffers(AVCodecContext *avctx, AACEncContext *s)
 {
     int ch;
-    FF_ALLOCZ_OR_GOTO(avctx, s->buffer.samples, 3 * 1024 * s->channels * sizeof(s->buffer.samples[0]), alloc_fail);
-    FF_ALLOCZ_OR_GOTO(avctx, s->cpe, sizeof(ChannelElement) * s->chan_map[0], alloc_fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(avctx, s->buffer.samples, s->channels, 3 * 1024 * sizeof(s->buffer.samples[0]), alloc_fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(avctx, s->cpe, s->chan_map[0], sizeof(ChannelElement), alloc_fail);
     FF_ALLOCZ_OR_GOTO(avctx, avctx->extradata, 5 + AV_INPUT_BUFFER_PADDING_SIZE, alloc_fail);
 
     for(ch = 0; ch < s->channels; ch++)
@@ -725,6 +908,11 @@ alloc_fail:
     return AVERROR(ENOMEM);
 }
 
+static av_cold void aac_encode_init_tables(void)
+{
+    ff_aac_tableinit(); /* handed to ff_thread_once() by aac_encode_init(); presumably so the tables are built only once */
+}
+
 static av_cold int aac_encode_init(AVCodecContext *avctx)
 {
     AACEncContext *s = avctx->priv_data;
@@ -733,28 +921,96 @@ static av_cold int aac_encode_init(AVCodecContext *avctx)
     uint8_t grouping[AAC_MAX_CHANNELS];
     int lengths[2];
 
+    /* Constants */
+    s->last_frame_pb_count = 0;
+    avctx->extradata_size = 5;
     avctx->frame_size = 1024;
+    avctx->initial_padding = 1024;
+    s->lambda = avctx->global_quality > 0 ? avctx->global_quality : 120;
 
+    /* Channel map and unspecified bitrate guessing */
+    s->channels = avctx->channels;
+    ERROR_IF(s->channels > AAC_MAX_CHANNELS || s->channels == 7,
+             "Unsupported number of channels: %d\n", s->channels);
+    s->chan_map = aac_chan_configs[s->channels-1];
+    if (!avctx->bit_rate) {
+        for (i = 1; i <= s->chan_map[0]; i++) {
+            avctx->bit_rate += s->chan_map[i] == TYPE_CPE ? 128000 : /* Pair */
+                               s->chan_map[i] == TYPE_LFE ? 16000  : /* LFE  */
+                                                            69000  ; /* SCE  */
+        }
+    }
+
+    /* Samplerate */
     for (i = 0; i < 16; i++)
         if (avctx->sample_rate == avpriv_mpeg4audio_sample_rates[i])
             break;
-
-    s->channels = avctx->channels;
-
-    ERROR_IF(i == 16,
+    s->samplerate_index = i;
+    ERROR_IF(s->samplerate_index == 16 ||
+             s->samplerate_index >= ff_aac_swb_size_1024_len ||
+             s->samplerate_index >= ff_aac_swb_size_128_len,
              "Unsupported sample rate %d\n", avctx->sample_rate);
-    ERROR_IF(s->channels > AAC_MAX_CHANNELS,
-             "Unsupported number of channels: %d\n", s->channels);
-    ERROR_IF(avctx->profile != FF_PROFILE_UNKNOWN && avctx->profile != FF_PROFILE_AAC_LOW,
-             "Unsupported profile %d\n", avctx->profile);
-    ERROR_IF(1024.0 * avctx->bit_rate / avctx->sample_rate > 6144 * s->channels,
-             "Too many bits %f > %d per frame requested\n",
+
+    /* Bitrate limiting */
+    WARN_IF(1024.0 * avctx->bit_rate / avctx->sample_rate > 6144 * s->channels,
+             "Too many bits %f > %d per frame requested, clamping to max\n",
              1024.0 * avctx->bit_rate / avctx->sample_rate,
              6144 * s->channels);
+    avctx->bit_rate = (int64_t)FFMIN(6144 * s->channels / 1024.0 * avctx->sample_rate,
+                                     avctx->bit_rate);
+
+    /* Profile and option setting */
+    avctx->profile = avctx->profile == FF_PROFILE_UNKNOWN ? FF_PROFILE_AAC_LOW :
+                     avctx->profile;
+    for (i = 0; i < FF_ARRAY_ELEMS(aacenc_profiles); i++)
+        if (avctx->profile == aacenc_profiles[i])
+            break;
+    if (avctx->profile == FF_PROFILE_MPEG2_AAC_LOW) {
+        avctx->profile = FF_PROFILE_AAC_LOW;
+        ERROR_IF(s->options.pred,
+                 "Main prediction unavailable in the \"mpeg2_aac_low\" profile\n");
+        ERROR_IF(s->options.ltp,
+                 "LTP prediction unavailable in the \"mpeg2_aac_low\" profile\n");
+        WARN_IF(s->options.pns,
+                "PNS unavailable in the \"mpeg2_aac_low\" profile, turning off\n");
+        s->options.pns = 0;
+    } else if (avctx->profile == FF_PROFILE_AAC_LTP) {
+        s->options.ltp = 1;
+        ERROR_IF(s->options.pred,
+                 "Main prediction unavailable in the \"aac_ltp\" profile\n");
+    } else if (avctx->profile == FF_PROFILE_AAC_MAIN) {
+        s->options.pred = 1;
+        ERROR_IF(s->options.ltp,
+                 "LTP prediction unavailable in the \"aac_main\" profile\n");
+    } else if (s->options.ltp) {
+        avctx->profile = FF_PROFILE_AAC_LTP;
+        WARN_IF(1,
+                "Chainging profile to \"aac_ltp\"\n");
+        ERROR_IF(s->options.pred,
+                 "Main prediction unavailable in the \"aac_ltp\" profile\n");
+    } else if (s->options.pred) {
+        avctx->profile = FF_PROFILE_AAC_MAIN;
+        WARN_IF(1,
+                "Chainging profile to \"aac_main\"\n");
+        ERROR_IF(s->options.ltp,
+                 "LTP prediction unavailable in the \"aac_main\" profile\n");
+    }
+    s->profile = avctx->profile;
+
+    /* Coder limitations */
+    s->coder = &ff_aac_coders[s->options.coder];
+    if (s->options.coder != AAC_CODER_TWOLOOP) {
+        ERROR_IF(avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL,
+                 "Coders other than twoloop require -strict -2 and some may be removed in the future\n");
+        s->options.intensity_stereo = 0;
+        s->options.pns = 0;
+    }
+    ERROR_IF(s->options.ltp && avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL,
+             "The LPT profile requires experimental compliance, add -strict -2 to enable!\n");
 
-    s->samplerate_index = i;
-
-    s->chan_map = aac_chan_configs[s->channels-1];
+    /* M/S introduces horrible artifacts with multichannel files, this is temporary */
+    if (s->channels > 3)
+        s->options.mid_side = 0;
 
     if ((ret = dsp_init(avctx, s)) < 0)
         goto fail;
@@ -762,29 +1018,27 @@ static av_cold int aac_encode_init(AVCodecContext *avctx)
     if ((ret = alloc_buffers(avctx, s)) < 0)
         goto fail;
 
-    avctx->extradata_size = 5;
     put_audio_specific_config(avctx);
 
-    sizes[0]   = swb_size_1024[i];
-    sizes[1]   = swb_size_128[i];
-    lengths[0] = ff_aac_num_swb_1024[i];
-    lengths[1] = ff_aac_num_swb_128[i];
+    sizes[0]   = ff_aac_swb_size_1024[s->samplerate_index];
+    sizes[1]   = ff_aac_swb_size_128[s->samplerate_index];
+    lengths[0] = ff_aac_num_swb_1024[s->samplerate_index];
+    lengths[1] = ff_aac_num_swb_128[s->samplerate_index];
     for (i = 0; i < s->chan_map[0]; i++)
         grouping[i] = s->chan_map[i + 1] == TYPE_CPE;
     if ((ret = ff_psy_init(&s->psy, avctx, 2, sizes, lengths,
                            s->chan_map[0], grouping)) < 0)
         goto fail;
     s->psypp = ff_psy_preprocess_init(avctx);
-    s->coder = &ff_aac_coders[2];
+    ff_lpc_init(&s->lpc, 2*avctx->frame_size, TNS_MAX_ORDER, FF_LPC_TYPE_LEVINSON);
+    av_lfg_init(&s->lfg, 0x72adca55);
 
-    s->lambda = avctx->global_quality ? avctx->global_quality : 120;
+    if (HAVE_MIPSDSP)
+        ff_aac_coder_init_mips(s);
 
-    ff_aac_tableinit();
+    if ((ret = ff_thread_once(&aac_table_init, &aac_encode_init_tables)) != 0)
+        return AVERROR_UNKNOWN;
 
-    for (i = 0; i < 428; i++)
-        ff_aac_pow34sf_tab[i] = sqrt(ff_aac_pow2sf_tab[i] * sqrt(ff_aac_pow2sf_tab[i]));
-
-    avctx->initial_padding = 1024;
     ff_af_queue_init(avctx, &s->afq);
 
     return 0;
@@ -795,10 +1049,16 @@ fail:
 
 #define AACENC_FLAGS AV_OPT_FLAG_ENCODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM
 static const AVOption aacenc_options[] = {
-    {"stereo_mode", "Stereo coding method", offsetof(AACEncContext, options.stereo_mode), AV_OPT_TYPE_INT, {.i64 = 0}, -1, 1, AACENC_FLAGS, "stereo_mode"},
-        {"auto",     "Selected by the Encoder", 0, AV_OPT_TYPE_CONST, {.i64 = -1 }, INT_MIN, INT_MAX, AACENC_FLAGS, "stereo_mode"},
-        {"ms_off",   "Disable Mid/Side coding", 0, AV_OPT_TYPE_CONST, {.i64 =  0 }, INT_MIN, INT_MAX, AACENC_FLAGS, "stereo_mode"},
-        {"ms_force", "Force Mid/Side for the whole frame if possible", 0, AV_OPT_TYPE_CONST, {.i64 =  1 }, INT_MIN, INT_MAX, AACENC_FLAGS, "stereo_mode"},
+    {"aac_coder", "Coding algorithm", offsetof(AACEncContext, options.coder), AV_OPT_TYPE_INT, {.i64 = AAC_CODER_TWOLOOP}, 0, AAC_CODER_NB-1, AACENC_FLAGS, "coder"}, /* value indexes ff_aac_coders[] in aac_encode_init() */
+        {"anmr",     "ANMR method",               0, AV_OPT_TYPE_CONST, {.i64 = AAC_CODER_ANMR},    INT_MIN, INT_MAX, AACENC_FLAGS, "coder"},
+        {"twoloop",  "Two loop searching method", 0, AV_OPT_TYPE_CONST, {.i64 = AAC_CODER_TWOLOOP}, INT_MIN, INT_MAX, AACENC_FLAGS, "coder"},
+        {"fast",     "Constant quantizer",        0, AV_OPT_TYPE_CONST, {.i64 = AAC_CODER_FAST},    INT_MIN, INT_MAX, AACENC_FLAGS, "coder"},
+    {"aac_ms", "Force M/S stereo coding", offsetof(AACEncContext, options.mid_side), AV_OPT_TYPE_BOOL, {.i64 = -1}, -1, 1, AACENC_FLAGS}, /* default -1: aac_encode_frame() calls the coder's search_for_ms instead of forcing M/S */
+    {"aac_is", "Intensity stereo coding", offsetof(AACEncContext, options.intensity_stereo), AV_OPT_TYPE_BOOL, {.i64 = 1}, -1, 1, AACENC_FLAGS},
+    {"aac_pns", "Perceptual noise substitution", offsetof(AACEncContext, options.pns), AV_OPT_TYPE_BOOL, {.i64 = 1}, -1, 1, AACENC_FLAGS},
+    {"aac_tns", "Temporal noise shaping", offsetof(AACEncContext, options.tns), AV_OPT_TYPE_BOOL, {.i64 = 1}, -1, 1, AACENC_FLAGS},
+    {"aac_ltp", "Long term prediction", offsetof(AACEncContext, options.ltp), AV_OPT_TYPE_BOOL, {.i64 = 0}, -1, 1, AACENC_FLAGS},
+    {"aac_pred", "AAC-Main prediction", offsetof(AACEncContext, options.pred), AV_OPT_TYPE_BOOL, {.i64 = 0}, -1, 1, AACENC_FLAGS},
     {NULL}
 };
 
@@ -809,6 +1069,11 @@ static const AVClass aacenc_class = {
     LIBAVUTIL_VERSION_INT,
 };
 
+static const AVCodecDefault aac_encode_defaults[] = {
+    { "b", "0" }, /* presumably the bit rate option; aac_encode_init() guesses a rate per channel element when bit_rate is 0 */
+    { NULL }
+};
+
 AVCodec ff_aac_encoder = {
     .name           = "aac",
     .long_name      = NULL_IF_CONFIG_SMALL("AAC (Advanced Audio Coding)"),
@@ -818,8 +1083,10 @@ AVCodec ff_aac_encoder = {
     .init           = aac_encode_init,
     .encode2        = aac_encode_frame,
     .close          = aac_encode_end,
-    .capabilities   = AV_CODEC_CAP_SMALL_LAST_FRAME | AV_CODEC_CAP_DELAY |
-                      AV_CODEC_CAP_EXPERIMENTAL,
+    .defaults       = aac_encode_defaults,
+    .supported_samplerates = mpeg4audio_sample_rates,
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
+    .capabilities   = AV_CODEC_CAP_SMALL_LAST_FRAME | AV_CODEC_CAP_DELAY,
     .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_FLTP,
                                                      AV_SAMPLE_FMT_NONE },
     .priv_class     = &aacenc_class,
diff --git a/libavcodec/aacenc.h b/libavcodec/aacenc.h
index dec445c..63e7893 100644
--- a/libavcodec/aacenc.h
+++ b/libavcodec/aacenc.h
@@ -2,20 +2,20 @@
  * AAC encoder
  * Copyright (C) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,6 +23,7 @@
 #define AVCODEC_AACENC_H
 
 #include "libavutil/float_dsp.h"
+#include "libavutil/lfg.h"
 #include "avcodec.h"
 #include "put_bits.h"
 
@@ -30,8 +31,24 @@
 #include "audio_frame_queue.h"
 #include "psymodel.h"
 
+#include "lpc.h"
+
+typedef enum AACCoder {
+    AAC_CODER_ANMR = 0,  /* "anmr" option value: ANMR method */
+    AAC_CODER_TWOLOOP,   /* "twoloop" option value: two loop searching method (the option default) */
+    AAC_CODER_FAST,      /* "fast" option value: constant quantizer */
+
+    AAC_CODER_NB,        /* count of coders; the "aac_coder" option is limited to AAC_CODER_NB-1 */
+}AACCoder;
+
 typedef struct AACEncOptions {
-    int stereo_mode;
+    int coder;             /* "aac_coder": index into ff_aac_coders[] */
+    int pns;               /* "aac_pns": perceptual noise substitution */
+    int tns;               /* "aac_tns": temporal noise shaping */
+    int ltp;               /* "aac_ltp": long term prediction */
+    int pred;              /* "aac_pred": AAC-Main prediction */
+    int mid_side;          /* "aac_ms": M/S stereo; -1 lets the coder's search_for_ms decide */
+    int intensity_stereo;  /* "aac_is": intensity stereo coding */
 } AACEncOptions;
 
 struct AACEncContext;
@@ -41,13 +58,38 @@ typedef struct AACCoefficientsEncoder {
                                   SingleChannelElement *sce, const float lambda);
     void (*encode_window_bands_info)(struct AACEncContext *s, SingleChannelElement *sce,
                                      int win, int group_len, const float lambda);
-    void (*quantize_and_encode_band)(struct AACEncContext *s, PutBitContext *pb, const float *in, int size,
-                                     int scale_idx, int cb, const float lambda);
-    void (*search_for_ms)(struct AACEncContext *s, ChannelElement *cpe, const float lambda);
+    void (*quantize_and_encode_band)(struct AACEncContext *s, PutBitContext *pb, const float *in, float *out, int size,
+                                     int scale_idx, int cb, const float lambda, int rtz);
+    void (*encode_tns_info)(struct AACEncContext *s, SingleChannelElement *sce);
+    void (*encode_ltp_info)(struct AACEncContext *s, SingleChannelElement *sce, int common_window);
+    void (*encode_main_pred)(struct AACEncContext *s, SingleChannelElement *sce);
+    void (*adjust_common_pred)(struct AACEncContext *s, ChannelElement *cpe);
+    void (*adjust_common_ltp)(struct AACEncContext *s, ChannelElement *cpe);
+    void (*apply_main_pred)(struct AACEncContext *s, SingleChannelElement *sce);
+    void (*apply_tns_filt)(struct AACEncContext *s, SingleChannelElement *sce);
+    void (*update_ltp)(struct AACEncContext *s, SingleChannelElement *sce);
+    void (*ltp_insert_new_frame)(struct AACEncContext *s);
+    void (*set_special_band_scalefactors)(struct AACEncContext *s, SingleChannelElement *sce);
+    void (*search_for_pns)(struct AACEncContext *s, AVCodecContext *avctx, SingleChannelElement *sce);
+    void (*mark_pns)(struct AACEncContext *s, AVCodecContext *avctx, SingleChannelElement *sce);
+    void (*search_for_tns)(struct AACEncContext *s, SingleChannelElement *sce);
+    void (*search_for_ltp)(struct AACEncContext *s, SingleChannelElement *sce, int common_window);
+    void (*search_for_ms)(struct AACEncContext *s, ChannelElement *cpe);
+    void (*search_for_is)(struct AACEncContext *s, AVCodecContext *avctx, ChannelElement *cpe);
+    void (*search_for_pred)(struct AACEncContext *s, SingleChannelElement *sce);
 } AACCoefficientsEncoder;
 
 extern AACCoefficientsEncoder ff_aac_coders[];
 
+typedef struct AACQuantizeBandCostCacheEntry { /* element type of AACEncContext.quantize_band_cost_cache */
+    float rd;
+    float energy;
+    int bits;
+    char cb;
+    char rtz;
+    uint16_t generation; /* presumably compared with quantize_band_cost_cache_generation to spot stale entries - TODO confirm */
+} AACQuantizeBandCostCacheEntry;
+
 /**
  * AAC encoder context
  */
@@ -57,9 +99,12 @@ typedef struct AACEncContext {
     PutBitContext pb;
     FFTContext mdct1024;                         ///< long (1024 samples) frame transform context
     FFTContext mdct128;                          ///< short (128 samples) frame transform context
-    AVFloatDSPContext fdsp;
-    float *planar_samples[6];                    ///< saved preprocessed input
+    AVFloatDSPContext *fdsp;
+    AVLFG lfg;                                   ///< PRNG needed for PNS
+    float *planar_samples[8];                    ///< saved preprocessed input
 
+    int profile;                                 ///< copied from avctx
+    LPCContext lpc;                              ///< used by TNS
     int samplerate_index;                        ///< MPEG-4 samplerate index
     int channels;                                ///< channel count
     const uint8_t *chan_map;                     ///< channel configuration map
@@ -68,18 +113,29 @@ typedef struct AACEncContext {
     FFPsyContext psy;
     struct FFPsyPreprocessContext* psypp;
     AACCoefficientsEncoder *coder;
-    int cur_channel;
+    int cur_channel;                             ///< current channel for coder context
     int last_frame;
+    int random_state;
     float lambda;
+    int last_frame_pb_count;                     ///< number of bits for the previous frame
+    float lambda_sum;                            ///< sum(lambda), for Qvg reporting
+    int lambda_count;                            ///< count(lambda), for Qvg reporting
+    enum RawDataBlockType cur_type;              ///< channel group type cur_channel belongs to
+
     AudioFrameQueue afq;
     DECLARE_ALIGNED(16, int,   qcoefs)[96];      ///< quantized coefficients
     DECLARE_ALIGNED(32, float, scoefs)[1024];    ///< scaled coefficients
 
+    uint16_t quantize_band_cost_cache_generation;
+    AACQuantizeBandCostCacheEntry quantize_band_cost_cache[256][128]; ///< memoization area for quantize_band_cost
+
     struct {
         float *samples;
     } buffer;
 } AACEncContext;
 
-extern float ff_aac_pow34sf_tab[428];
+void ff_aac_coder_init_mips(AACEncContext *c);
+void ff_quantize_band_cost_cache_init(struct AACEncContext *s);
+
 
 #endif /* AVCODEC_AACENC_H */
diff --git a/libavcodec/aacenc_is.c b/libavcodec/aacenc_is.c
new file mode 100644
index 0000000..473897b
--- /dev/null
+++ b/libavcodec/aacenc_is.c
@@ -0,0 +1,158 @@
+/*
+ * AAC encoder intensity stereo
+ * Copyright (C) 2015 Rostislav Pehlivanov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder Intensity Stereo
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#include "aacenc.h"
+#include "aacenc_utils.h"
+#include "aacenc_is.h"
+#include "aacenc_quantization.h"
+
+struct AACISError ff_aac_is_encoding_err(AACEncContext *s, ChannelElement *cpe,
+                                         int start, int w, int g, float ener0,
+                                         float ener1, float ener01,
+                                         int use_pcoeffs, int phase)
+{   /* Rates band g of the group at window w coded as separate L/R (dist1) versus intensity stereo (dist2). */
+    int i, w2;
+    SingleChannelElement *sce0 = &cpe->ch[0];
+    SingleChannelElement *sce1 = &cpe->ch[1];
+    float *L = use_pcoeffs ? sce0->pcoeffs : sce0->coeffs;
+    float *R = use_pcoeffs ? sce1->pcoeffs : sce1->coeffs;
+    float *L34 = &s->scoefs[256*0], *R34 = &s->scoefs[256*1]; /* scratch areas inside s->scoefs, 256 floats apart */
+    float *IS  = &s->scoefs[256*2], *I34 = &s->scoefs[256*3];
+    float dist1 = 0.0f, dist2 = 0.0f;
+    struct AACISError is_error = {0};
+
+    if (ener01 <= 0 || ener0 <= 0) { /* both are used as divisors below */
+        is_error.pass = 0;
+        return is_error;
+    }
+
+    for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) { /* every window of the group */
+        FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
+        FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
+        int is_band_type, is_sf_idx = FFMAX(1, sce0->sf_idx[w*16+g]-4);
+        float e01_34 = phase*pos_pow34(ener1/ener0);
+        float maxval, dist_spec_err = 0.0f;
+        float minthr = FFMIN(band0->threshold, band1->threshold); /* the IS candidate is weighted by the lower threshold */
+        for (i = 0; i < sce0->ics.swb_sizes[g]; i++) /* IS = (L + phase*R) scaled by sqrt(ener0/ener01) */
+            IS[i] = (L[start+(w+w2)*128+i] + phase*R[start+(w+w2)*128+i])*sqrt(ener0/ener01);
+        abs_pow34_v(L34, &L[start+(w+w2)*128], sce0->ics.swb_sizes[g]);
+        abs_pow34_v(R34, &R[start+(w+w2)*128], sce0->ics.swb_sizes[g]);
+        abs_pow34_v(I34, IS,                   sce0->ics.swb_sizes[g]);
+        maxval = find_max_val(1, sce0->ics.swb_sizes[g], I34);
+        is_band_type = find_min_book(maxval, is_sf_idx);
+        dist1 += quantize_band_cost(s, &L[start + (w+w2)*128], L34,
+                                    sce0->ics.swb_sizes[g],
+                                    sce0->sf_idx[w*16+g],
+                                    sce0->band_type[w*16+g],
+                                    s->lambda / band0->threshold, INFINITY, NULL, NULL, 0); /* L with its current sf_idx/band_type */
+        dist1 += quantize_band_cost(s, &R[start + (w+w2)*128], R34,
+                                    sce1->ics.swb_sizes[g],
+                                    sce1->sf_idx[w*16+g],
+                                    sce1->band_type[w*16+g],
+                                    s->lambda / band1->threshold, INFINITY, NULL, NULL, 0); /* R with its current sf_idx/band_type */
+        dist2 += quantize_band_cost(s, IS, I34, sce0->ics.swb_sizes[g],
+                                    is_sf_idx, is_band_type,
+                                    s->lambda / minthr, INFINITY, NULL, NULL, 0); /* the single IS signal */
+        for (i = 0; i < sce0->ics.swb_sizes[g]; i++) { /* extra penalty: squared distance of L34/R34 from the IS spectrum */
+            dist_spec_err += (L34[i] - I34[i])*(L34[i] - I34[i]);
+            dist_spec_err += (R34[i] - I34[i]*e01_34)*(R34[i] - I34[i]*e01_34);
+        }
+        dist_spec_err *= s->lambda / minthr;
+        dist2 += dist_spec_err;
+    }
+
+    is_error.pass = dist2 <= dist1; /* a tie counts as a pass */
+    is_error.phase = phase;
+    is_error.error = dist2 - dist1; /* signed: negative when the IS candidate is cheaper */
+    is_error.dist1 = dist1;
+    is_error.dist2 = dist2;
+    is_error.ener01 = ener01;
+
+    return is_error;
+}
+
+void ff_aac_search_for_is(AACEncContext *s, AVCodecContext *avctx, ChannelElement *cpe)
+{   /* Decides band by band whether ch[1] of a channel pair is switched to an intensity band type. */
+    SingleChannelElement *sce0 = &cpe->ch[0];
+    SingleChannelElement *sce1 = &cpe->ch[1];
+    int start = 0, count = 0, w, w2, g, i, prev_sf1 = -1, prev_bt = -1, prev_is = 0;
+    const float freq_mult = avctx->sample_rate/(1024.0f/sce0->ics.num_windows)/2.0f; /* scales a coefficient offset for the INT_STEREO_LOW_LIMIT test */
+    uint8_t nextband1[128];
+
+    if (!cpe->common_window) /* nothing is searched unless the pair shares a window */
+        return;
+
+    /** Scout out next nonzero bands */
+    ff_init_nextband_map(sce1, nextband1);
+
+    for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
+        start = 0;
+        for (g = 0;  g < sce0->ics.num_swb; g++) {
+            if (start*freq_mult > INT_STEREO_LOW_LIMIT*(s->lambda/170.0f) &&
+                cpe->ch[0].band_type[w*16+g] != NOISE_BT && !cpe->ch[0].zeroes[w*16+g] &&
+                cpe->ch[1].band_type[w*16+g] != NOISE_BT && !cpe->ch[1].zeroes[w*16+g] &&
+                ff_sfdelta_can_remove_band(sce1, nextband1, prev_sf1, w*16+g)) {
+                float ener0 = 0.0f, ener1 = 0.0f, ener01 = 0.0f, ener01p = 0.0f;
+                struct AACISError ph_err1, ph_err2, *best;
+                for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
+                    for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
+                        float coef0 = sce0->coeffs[start+(w+w2)*128+i];
+                        float coef1 = sce1->coeffs[start+(w+w2)*128+i];
+                        ener0  += coef0*coef0;
+                        ener1  += coef1*coef1;
+                        ener01 += (coef0 + coef1)*(coef0 + coef1);   /* energy of the sum, used with phase +1 */
+                        ener01p += (coef0 - coef1)*(coef0 - coef1);  /* energy of the difference, used with phase -1 */
+                    }
+                }
+                ph_err1 = ff_aac_is_encoding_err(s, cpe, start, w, g,
+                                                 ener0, ener1, ener01p, 0, -1);
+                ph_err2 = ff_aac_is_encoding_err(s, cpe, start, w, g,
+                                                 ener0, ener1, ener01, 0, +1);
+                best = (ph_err1.pass && ph_err1.error < ph_err2.error) ? &ph_err1 : &ph_err2; /* phase -1 only if it passes with the lower error */
+                if (best->pass) {
+                    cpe->is_mask[w*16+g] = 1;
+                    cpe->ms_mask[w*16+g] = 0;
+                    cpe->ch[0].is_ener[w*16+g] = sqrt(ener0 / best->ener01);
+                    cpe->ch[1].is_ener[w*16+g] = ener0/ener1;
+                    cpe->ch[1].band_type[w*16+g] = (best->phase > 0) ? INTENSITY_BT : INTENSITY_BT2;
+                    if (prev_is && prev_bt != cpe->ch[1].band_type[w*16+g]) {
+                        /** Flip M/S mask and pick the other CB, since it encodes more efficiently */
+                        cpe->ms_mask[w*16+g] = 1;
+                        cpe->ch[1].band_type[w*16+g] = (best->phase > 0) ? INTENSITY_BT2 : INTENSITY_BT;
+                    }
+                    prev_bt = cpe->ch[1].band_type[w*16+g];
+                    count++;
+                }
+            }
+            if (!sce1->zeroes[w*16+g] && sce1->band_type[w*16+g] < RESERVED_BT) /* remember the last sf_idx of a non-zero band below RESERVED_BT */
+                prev_sf1 = sce1->sf_idx[w*16+g];
+            prev_is = cpe->is_mask[w*16+g];
+            start += sce0->ics.swb_sizes[g];
+        }
+    }
+    cpe->is_mode = !!count; /* set when at least one band was switched */
+}
diff --git a/libavcodec/aacenc_is.h b/libavcodec/aacenc_is.h
new file mode 100644
index 0000000..269fd1a
--- /dev/null
+++ b/libavcodec/aacenc_is.h
@@ -0,0 +1,51 @@
+/*
+ * AAC encoder intensity stereo
+ * Copyright (C) 2015 Rostislav Pehlivanov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder Intensity Stereo
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#ifndef AVCODEC_AACENC_IS_H
+#define AVCODEC_AACENC_IS_H
+
+#include "aacenc.h"
+
+/** Frequency in Hz for lower limit of intensity stereo **/
+#define INT_STEREO_LOW_LIMIT 6100
+
+struct AACISError {
+    int pass;    /* 1 if dist2 <= dist1  */
+    int phase;   /* -1 or +1             */
+    float error; /* dist2 - dist1, signed */
+    float dist1; /* From original coeffs */
+    float dist2; /* From IS'd coeffs     */
+    float ener01; /* ener01 argument, copied through */
+};
+
+struct AACISError ff_aac_is_encoding_err(AACEncContext *s, ChannelElement *cpe,
+                                         int start, int w, int g, float ener0,
+                                         float ener1, float ener01,
+                                         int use_pcoeffs, int phase);
+void ff_aac_search_for_is(AACEncContext *s, AVCodecContext *avctx, ChannelElement *cpe);
+
+#endif /* AVCODEC_AACENC_IS_H */
diff --git a/libavcodec/aacenc_ltp.c b/libavcodec/aacenc_ltp.c
new file mode 100644
index 0000000..b9d43b4
--- /dev/null
+++ b/libavcodec/aacenc_ltp.c
@@ -0,0 +1,236 @@
+/*
+ * AAC encoder long term prediction extension
+ * Copyright (C) 2015 Rostislav Pehlivanov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder long term prediction extension
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#include "aacenc_ltp.h"
+#include "aacenc_quantization.h"
+#include "aacenc_utils.h"
+
+/**
+ * Write the LTP side information of one channel into the bitstream (s->pb).
+ */
+void ff_aac_encode_ltp_info(AACEncContext *s, SingleChannelElement *sce,
+                            int common_window)
+{
+    int sfb, nb_sfb = FFMIN(sce->ics.max_sfb, MAX_LTP_LONG_SFB);
+    const LongTermPrediction *ltp = &sce->ics.ltp;
+    if (s->profile != FF_PROFILE_AAC_LTP || !sce->ics.predictor_present)
+        return;
+    if (common_window)
+        put_bits(&s->pb, 1, 0);
+    put_bits(&s->pb, 1, ltp->present);
+    if (!ltp->present)
+        return;
+    put_bits(&s->pb, 11, ltp->lag);
+    put_bits(&s->pb, 3,  ltp->coef_idx);
+    for (sfb = 0; sfb < nb_sfb; sfb++)
+        put_bits(&s->pb, 1, ltp->used[sfb]);
+}
+
+void ff_aac_ltp_insert_new_frame(AACEncContext *s)
+{
+    int elem, c, first_ch = 0;
+    /* Slide every channel's LTP history by one frame and append the new data. */
+    for (elem = 0; elem < s->chan_map[0]; elem++) {
+        ChannelElement *cpe = &s->cpe[elem];
+        const int nb_ch = s->chan_map[elem+1] == TYPE_CPE ? 2 : 1;
+        for (c = 0; c < nb_ch; c++) {
+            SingleChannelElement *sce = &cpe->ch[c];
+            const size_t frame = 1024*sizeof(sce->ltp_state[0]);
+            /* [0,1024) takes over the old [1024,2048) */
+            memcpy(&sce->ltp_state[0],    &sce->ltp_state[1024], frame);
+            /* [1024,2048) takes the input samples starting at offset 2048 */
+            memcpy(&sce->ltp_state[1024], &s->planar_samples[first_ch + c][2048], frame);
+            /* [2048,3072) takes ret_buf */
+            memcpy(&sce->ltp_state[2048], &sce->ret_buf[0], frame);
+            sce->ics.ltp.lag = 0;
+        }
+        first_ch += nb_ch;
+    }
+}
+
+static void get_lag(float *buf, const float *new, LongTermPrediction *ltp)
+{   /* Searches lags 0..2047 for the best normalized correlation between new[] and buf[]. */
+    int i, j, lag = 0, max_corr = 0; /* lag stays 0 when no candidate has a correlation above 0 */
+    float max_ratio = 0.0f;          /* defined even if the search never updates it */
+    for (i = 0; i < 2048; i++) {
+        float corr, s0 = 0.0f, s1 = 0.0f;
+        const int start = FFMAX(0, i - 1024);
+        for (j = start; j < 2048; j++) {
+            const int idx = j - i + 1024;
+            s0 += new[j]*buf[idx];
+            s1 += buf[idx]*buf[idx];
+        }
+        corr = s1 > 0.0f ? s0/sqrt(s1) : 0.0f;
+        if (corr > max_corr) { /* NOTE(review): max_corr is an int, so the stored best value is truncated */
+            max_corr = corr;
+            lag = i;
+            max_ratio = corr/(2048-start);
+        }
+    }
+    ltp->lag = FFMAX(av_clip_uintp2(lag, 11), 0);
+    ltp->coef_idx = quant_array_idx(max_ratio, ltp_coef, 8);
+    ltp->coef = ltp_coef[ltp->coef_idx];
+}
+
+static void generate_samples(float *buf, LongTermPrediction *ltp)
+{
+    int n, count;
+    /* A zero lag means no prediction: flag LTP as absent and leave buf alone. */
+    if (!ltp->lag) {
+        ltp->present = 0;
+        return;
+    }
+    count = ltp->lag < 1024 ? ltp->lag + 1024 : 2048;
+    for (n = 0; n < count; n++)
+        buf[n] = ltp->coef*buf[n + 2048 - ltp->lag];
+    memset(buf + count, 0, (2048 - count)*sizeof(float));
+}
+
+/**
+ * Estimate the LTP lag/gain for the current channel and build its prediction.
+ * @see Patent WO2006070265A1
+ */
+void ff_aac_update_ltp(AACEncContext *s, SingleChannelElement *sce)
+{
+    LongTermPrediction *ltp = &sce->ics.ltp;
+    float *history = sce->ltp_state;
+
+    /* Nothing is touched unless the encoder runs the LTP profile. */
+    if (s->profile == FF_PROFILE_AAC_LTP) {
+        /* Correlate the history with the input starting at sample 1024. */
+        get_lag(history, &s->planar_samples[s->cur_channel][1024], ltp);
+        generate_samples(history, ltp);
+    }
+}
+
+void ff_aac_adjust_common_ltp(AACEncContext *s, ChannelElement *cpe)
+{
+    IndividualChannelStream *ics0 = &cpe->ch[0].ics;
+    IndividualChannelStream *ics1 = &cpe->ch[1].ics;
+    const int nb_sfb = FFMIN(ics0->max_sfb, MAX_LTP_LONG_SFB);
+    int band, shared = 0;
+
+    /* No common window, or an eight-short channel: clear LTP on ch[0] and stop. */
+    if (!cpe->common_window ||
+        ics0->window_sequence[0] == EIGHT_SHORT_SEQUENCE ||
+        ics1->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        ics0->ltp.present = 0;
+        return;
+    }
+
+    /* Keep a band on ch[0] only when the used flags of both channels sum to 2. */
+    for (band = 0; band < nb_sfb; band++) {
+        if (ics0->ltp.used[band] + ics1->ltp.used[band] == 2)
+            shared++;
+        else
+            ics0->ltp.used[band] = 0;
+    }
+
+    ics0->ltp.present = ics0->predictor_present = !!shared;
+}
+
+/**
+ * Mark the sfbs where coding the LTP residual is cheaper, then keep LTP only if it pays for its side info.
+ */
+void ff_aac_search_for_ltp(AACEncContext *s, SingleChannelElement *sce,
+                           int common_window)
+{
+    int w, g, w2, i, start = 0, count = 0;
+    int saved_bits = -(15 + FFMIN(sce->ics.max_sfb, MAX_LTP_LONG_SFB)); /* side info: 1+11+3 bits plus one flag per sfb */
+    float *C34 = &s->scoefs[128*0], *PCD = &s->scoefs[128*1];
+    float *PCD34 = &s->scoefs[128*2];
+    const int max_ltp = FFMIN(sce->ics.max_sfb, MAX_LTP_LONG_SFB);
+
+    if (sce->ics.window_sequence[0] == EIGHT_SHORT_SEQUENCE) { /* eight-short frame: wipe the LTP state if a lag was set */
+        if (sce->ics.ltp.lag) {
+            memset(&sce->ltp_state[0], 0, 3072*sizeof(sce->ltp_state[0]));
+            memset(&sce->ics.ltp, 0, sizeof(LongTermPrediction));
+        }
+        return;
+    }
+
+    if (!sce->ics.ltp.lag || s->lambda > 120.0f)
+        return;
+
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        start = 0;
+        for (g = 0;  g < sce->ics.num_swb; g++) {
+            int bits1 = 0, bits2 = 0;
+            float dist1 = 0.0f, dist2 = 0.0f;
+            if (w*16+g >= max_ltp) { /* ff_aac_encode_ltp_info() only writes flags 0..max_ltp-1 */
+                start += sce->ics.swb_sizes[g];
+                continue;
+            }
+            for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                int bits_tmp1, bits_tmp2;
+                FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
+                for (i = 0; i < sce->ics.swb_sizes[g]; i++) /* PCD = coefficients minus their LTP prediction */
+                    PCD[i] = sce->coeffs[start+(w+w2)*128+i] - sce->lcoeffs[start+(w+w2)*128+i];
+                abs_pow34_v(C34,  &sce->coeffs[start+(w+w2)*128],  sce->ics.swb_sizes[g]);
+                abs_pow34_v(PCD34, PCD, sce->ics.swb_sizes[g]);
+                dist1 += quantize_band_cost(s, &sce->coeffs[start+(w+w2)*128], C34, sce->ics.swb_sizes[g],
+                                            sce->sf_idx[(w+w2)*16+g], sce->band_type[(w+w2)*16+g],
+                                            s->lambda/band->threshold, INFINITY, &bits_tmp1, NULL, 0);
+                dist2 += quantize_band_cost(s, PCD, PCD34, sce->ics.swb_sizes[g],
+                                            sce->sf_idx[(w+w2)*16+g],
+                                            sce->band_type[(w+w2)*16+g],
+                                            s->lambda/band->threshold, INFINITY, &bits_tmp2, NULL, 0);
+                bits1 += bits_tmp1;
+                bits2 += bits_tmp2;
+            }
+            if (dist2 < dist1 && bits2 < bits1) { /* the residual must win on both distortion and bits */
+                for (w2 = 0; w2 < sce->ics.group_len[w]; w2++)
+                    for (i = 0; i < sce->ics.swb_sizes[g]; i++)
+                        sce->coeffs[start+(w+w2)*128+i] -= sce->lcoeffs[start+(w+w2)*128+i];
+                sce->ics.ltp.used[w*16+g] = 1;
+                saved_bits += bits1 - bits2;
+                count++;
+            }
+            start += sce->ics.swb_sizes[g];
+        }
+    }
+
+    sce->ics.ltp.present = !!count && (saved_bits >= 0);
+    sce->ics.predictor_present = !!sce->ics.ltp.present;
+
+    /* Reset any marked sfbs */
+    if (!sce->ics.ltp.present && !!count) {
+        for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+            start = 0;
+            for (g = 0;  g < sce->ics.num_swb; g++) {
+                if (sce->ics.ltp.used[w*16+g]) { /* add the prediction back to restore the original coefficients */
+                    for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                        for (i = 0; i < sce->ics.swb_sizes[g]; i++) {
+                            sce->coeffs[start+(w+w2)*128+i] += sce->lcoeffs[start+(w+w2)*128+i];
+                        }
+                    }
+                }
+                start += sce->ics.swb_sizes[g];
+            }
+        }
+    }
+}
diff --git a/libavcodec/aacenc_ltp.h b/libavcodec/aacenc_ltp.h
new file mode 100644
index 0000000..7276878
--- /dev/null
+++ b/libavcodec/aacenc_ltp.h
@@ -0,0 +1,41 @@
+/*
+ * AAC encoder long term prediction extension
+ * Copyright (C) 2015 Rostislav Pehlivanov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder long term prediction extension
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#ifndef AVCODEC_AACENC_LTP_H
+#define AVCODEC_AACENC_LTP_H
+
+#include "aacenc.h"
+
+void ff_aac_encode_ltp_info(AACEncContext *s, SingleChannelElement *sce,
+                            int common_window);
+void ff_aac_update_ltp(AACEncContext *s, SingleChannelElement *sce);
+void ff_aac_adjust_common_ltp(AACEncContext *s, ChannelElement *cpe);
+void ff_aac_ltp_insert_new_frame(AACEncContext *s);
+void ff_aac_search_for_ltp(AACEncContext *s, SingleChannelElement *sce,
+                           int common_window);
+
+#endif /* AVCODEC_AACENC_LTP_H */
diff --git a/libavcodec/aacenc_pred.c b/libavcodec/aacenc_pred.c
new file mode 100644
index 0000000..e77a3de
--- /dev/null
+++ b/libavcodec/aacenc_pred.c
@@ -0,0 +1,347 @@
+/*
+ * AAC encoder main-type prediction
+ * Copyright (C) 2015 Rostislav Pehlivanov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder main-type prediction
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#include "aactab.h"
+#include "aacenc_pred.h"
+#include "aacenc_utils.h"
+#include "aacenc_is.h"            /* <- Needed for common window distortions */
+#include "aacenc_quantization.h"
+
+#define RESTORE_PRED(sce, sfb) \
+        if (sce->ics.prediction_used[sfb]) {\
+            sce->ics.prediction_used[sfb] = 0;\
+            sce->band_type[sfb] = sce->band_alt[sfb];\
+        }
+
+static inline float flt16_round(float pf)
+{
+    union av_intfloat32 bits = { .f = pf };
+    /* Add half of the dropped low 16-bit field, then clear those bits. */
+    bits.i = (bits.i + 0x00008000U) & 0xFFFF0000U;
+    return bits.f;
+}
+
+static inline float flt16_even(float pf)
+{
+    union av_intfloat32 tmp; /* NOTE(review): '>>' binds tighter than '&' in C, so the last addend */
+    tmp.f = pf;              /* below is (tmp.i & 1), not bit 16 of tmp.i; confirm intent before editing */
+    tmp.i = (tmp.i + 0x00007FFFU + (tmp.i & 0x00010000U >> 16)) & 0xFFFF0000U;
+    return tmp.f;
+}
+
+static inline float flt16_trunc(float pf)
+{
+    union av_intfloat32 bits = { .f = pf };
+    /* Keep only the upper 16 bits of the float's bit pattern. */
+    bits.i = bits.i & 0xFFFF0000U;
+    return bits.f;
+}
+
+static inline void predict(PredictorState *ps, float *coef, float *rcoef, int set)
+{   /* Advances one predictor by one step; when set is nonzero *coef is replaced by its residual. */
+    float k2;
+    const float a     = 0.953125; // 61.0 / 64
+    const float alpha = 0.90625;  // 29.0 / 32
+    const float   k1 = ps->k1;
+    const float   r0 = ps->r0,     r1 = ps->r1;
+    const float cor0 = ps->cor0, cor1 = ps->cor1;
+    const float var0 = ps->var0, var1 = ps->var1;
+    const float e0 = *coef - ps->x_est; /* error against the estimate stored in ps */
+    const float e1 = e0 - k1 * r0;
+
+    if (set)
+        *coef = e0; /* the caller's coefficient becomes the prediction residual */
+
+    ps->cor1 = flt16_trunc(alpha * cor1 + r1 * e1);
+    ps->var1 = flt16_trunc(alpha * var1 + 0.5f * (r1 * r1 + e1 * e1));
+    ps->cor0 = flt16_trunc(alpha * cor0 + r0 * e0);
+    ps->var0 = flt16_trunc(alpha * var0 + 0.5f * (r0 * r0 + e0 * e0));
+    ps->r1   = flt16_trunc(a * (r0 - k1 * e0));
+    ps->r0   = flt16_trunc(a * e0);
+
+    /* Prediction for next frame */
+    ps->k1   = ps->var0 > 1 ? ps->cor0 * flt16_even(a / ps->var0) : 0;
+    k2       = ps->var1 > 1 ? ps->cor1 * flt16_even(a / ps->var1) : 0;
+    *rcoef   = ps->x_est = flt16_round(ps->k1*ps->r0 + k2*ps->r1); /* new estimate: kept in ps and returned via rcoef */
+}
+
+static inline void reset_predict_state(PredictorState *ps)
+{
+    /* Everything restarts from zero except the two variance terms,
+     * which restart from one. */
+    ps->var0  = 1.0f;
+    ps->var1  = 1.0f;
+    ps->cor0  = ps->cor1 = 0.0f;
+    ps->r0    = ps->r1   = 0.0f;
+    ps->k1    = 0.0f;
+    ps->x_est = 0.0f;
+}
+
+static inline void reset_all_predictors(PredictorState *ps)
+{
+    PredictorState *const end = ps + MAX_PREDICTORS;
+    for (; ps < end; ps++)
+        reset_predict_state(ps);
+}
+
+static inline void reset_predictor_group(SingleChannelElement *sce, int group_num)
+{
+    /* Group numbers are 1-based; a group owns every 30th predictor from group_num - 1 on. */
+    int idx = group_num - 1;
+    for (; idx < MAX_PREDICTORS; idx += 30)
+        reset_predict_state(&sce->predictor_state[idx]);
+}
+
+void ff_aac_apply_main_pred(AACEncContext *s, SingleChannelElement *sce)
+{
+    IndividualChannelStream *ics = &sce->ics;
+    const int last_sfb = FFMIN(ics->max_sfb, ff_aac_pred_sfb_max[s->samplerate_index]);
+    int band, bin;
+
+    /* Eight-short frames are not predicted: the whole predictor bank is reset instead. */
+    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        reset_all_predictors(sce->predictor_state);
+        return;
+    }
+    for (band = 0; band < last_sfb; band++) {
+        const int set = ics->predictor_present && ics->prediction_used[band];
+        for (bin = ics->swb_offset[band]; bin < ics->swb_offset[band + 1]; bin++)
+            predict(&sce->predictor_state[bin], &sce->coeffs[bin], &sce->prcoeffs[bin], set);
+    }
+    if (ics->predictor_reset_group)
+        reset_predictor_group(sce, ics->predictor_reset_group);
+}
+
+/* Adds inc to counters 1..30 in order, stopping at (and returning) the first one above PRED_RESET_FRAME_MIN; 0 if none */
+static inline int update_counters(IndividualChannelStream *ics, int inc)
+{
+    int group;
+    for (group = 1; group < 31; group++) {
+        const int age = (ics->predictor_reset_count[group] += inc);
+        if (age > PRED_RESET_FRAME_MIN)
+            return group; /* counters after this group are left untouched */
+    }
+    return 0;
+}
+
+void ff_aac_adjust_common_pred(AACEncContext *s, ChannelElement *cpe)
+{   /* Reconciles the per-band prediction flags of the two channels of a pair sharing a long window. */
+    int start, w, w2, g, i, count = 0;
+    SingleChannelElement *sce0 = &cpe->ch[0];
+    SingleChannelElement *sce1 = &cpe->ch[1];
+    const int pmax0 = FFMIN(sce0->ics.max_sfb, ff_aac_pred_sfb_max[s->samplerate_index]);
+    const int pmax1 = FFMIN(sce1->ics.max_sfb, ff_aac_pred_sfb_max[s->samplerate_index]);
+    const int pmax  = FFMIN(pmax0, pmax1);
+
+    if (!cpe->common_window ||
+        sce0->ics.window_sequence[0] == EIGHT_SHORT_SEQUENCE ||
+        sce1->ics.window_sequence[0] == EIGHT_SHORT_SEQUENCE)
+        return;
+
+    for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
+        start = 0;
+        for (g = 0; g < sce0->ics.num_swb; g++) {
+            int sfb = w*16+g;
+            int sum = sce0->ics.prediction_used[sfb] + sce1->ics.prediction_used[sfb];
+            float ener0 = 0.0f, ener1 = 0.0f, ener01 = 0.0f;
+            struct AACISError ph_err1, ph_err2, *erf;
+            if (sfb < PRED_SFB_START || sfb > pmax || sum != 2) { /* NOTE(review): other loops in this file stop at sfb < pmax; confirm '>' vs '>=' */
+                RESTORE_PRED(sce0, sfb);
+                RESTORE_PRED(sce1, sfb);
+                start += sce0->ics.swb_sizes[g];
+                continue;
+            }
+            for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
+                for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
+                    float coef0 = sce0->pcoeffs[start+(w+w2)*128+i];
+                    float coef1 = sce1->pcoeffs[start+(w+w2)*128+i];
+                    ener0  += coef0*coef0;
+                    ener1  += coef1*coef1;
+                    ener01 += (coef0 + coef1)*(coef0 + coef1);
+                }
+            }
+            ph_err1 = ff_aac_is_encoding_err(s, cpe, start, w, g,
+                                             ener0, ener1, ener01, 1, -1); /* NOTE(review): both phases get the same ener01 (sum energy) */
+            ph_err2 = ff_aac_is_encoding_err(s, cpe, start, w, g,
+                                             ener0, ener1, ener01, 1, +1);
+            erf = ph_err1.error < ph_err2.error ? &ph_err1 : &ph_err2; /* the candidate with the lower signed error decides */
+            if (erf->pass) {
+                sce0->ics.prediction_used[sfb] = 1;
+                sce1->ics.prediction_used[sfb] = 1;
+                count++;
+            } else {
+                RESTORE_PRED(sce0, sfb);
+                RESTORE_PRED(sce1, sfb);
+            }
+            start += sce0->ics.swb_sizes[g];
+        }
+    }
+
+    sce1->ics.predictor_present = sce0->ics.predictor_present = !!count; /* both channels get the same flag */
+}
+
+static void update_pred_resets(SingleChannelElement *sce)
+{
+    int i, max_group_id_c = 0, max_frame = 0;
+    IndividualChannelStream *ics = &sce->ics;
+
+    /* Update the counters and immediately update any frame behind schedule */
+    if ((ics->predictor_reset_group = update_counters(&sce->ics, 1)))
+        return;
+
+    /* Nothing overdue: pick the group with the largest counter (first wins ties) */
+    for (i = 1; i < 31; i++) {
+        /* Count-based */
+        if (ics->predictor_reset_count[i] > max_frame) {
+            max_group_id_c = i;
+            max_frame = ics->predictor_reset_count[i];
+        }
+    }
+
+    /* Schedule that group only once its counter exceeds PRED_RESET_MIN */
+    if (max_frame > PRED_RESET_MIN) {
+        ics->predictor_reset_group = max_group_id_c;
+    } else {
+        ics->predictor_reset_group = 0;
+    }
+}
+
+void ff_aac_search_for_pred(AACEncContext *s, SingleChannelElement *sce)
+{   /* Flags the sfbs of a long-window frame where prediction is no worse, then drops it all if it costs more bits. */
+    int sfb, i, count = 0, cost_coeffs = 0, cost_pred = 0;
+    const int pmax = FFMIN(sce->ics.max_sfb, ff_aac_pred_sfb_max[s->samplerate_index]);
+    float *O34  = &s->scoefs[128*0], *P34 = &s->scoefs[128*1];
+    float *SENT = &s->scoefs[128*2], *S34 = &s->scoefs[128*3];
+    float *QERR = &s->scoefs[128*4];
+
+    if (sce->ics.window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        sce->ics.predictor_present = 0;
+        return;
+    }
+
+    if (!sce->ics.predictor_initialized) { /* first pass through here: start from a clean predictor bank */
+        reset_all_predictors(sce->predictor_state);
+        sce->ics.predictor_initialized = 1;
+        memcpy(sce->prcoeffs, sce->coeffs, 1024*sizeof(float));
+        for (i = 1; i < 31; i++)
+            sce->ics.predictor_reset_count[i] = i; /* counters of groups 1..30 start staggered */
+    }
+
+    update_pred_resets(sce);
+    memcpy(sce->band_alt, sce->band_type, sizeof(sce->band_type)); /* band_alt is what RESTORE_PRED copies back */
+
+    for (sfb = PRED_SFB_START; sfb < pmax; sfb++) {
+        int cost1, cost2, cb_p;
+        float dist1, dist2, dist_spec_err = 0.0f;
+        const int cb_n = sce->zeroes[sfb] ? 0 : sce->band_type[sfb];
+        const int cb_min = sce->zeroes[sfb] ? 0 : 1;
+        const int cb_max = sce->zeroes[sfb] ? 0 : RESERVED_BT;
+        const int start_coef = sce->ics.swb_offset[sfb];
+        const int num_coeffs = sce->ics.swb_offset[sfb + 1] - start_coef;
+        const FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[sfb];
+
+        if (start_coef + num_coeffs > MAX_PREDICTORS ||
+            (s->cur_channel && sce->band_type[sfb] >= INTENSITY_BT2) ||
+            sce->band_type[sfb] == NOISE_BT)
+            continue;
+
+        /* Normal coefficients */
+        abs_pow34_v(O34, &sce->coeffs[start_coef], num_coeffs);
+        dist1 = quantize_and_encode_band_cost(s, NULL, &sce->coeffs[start_coef], NULL,
+                                              O34, num_coeffs, sce->sf_idx[sfb],
+                                              cb_n, s->lambda / band->threshold, INFINITY, &cost1, NULL, 0);
+        cost_coeffs += cost1;
+
+        /* Encoded coefficients - needed for #bits, band type and quant. error */
+        for (i = 0; i < num_coeffs; i++)
+            SENT[i] = sce->coeffs[start_coef + i] - sce->prcoeffs[start_coef + i];
+        abs_pow34_v(S34, SENT, num_coeffs);
+        if (cb_n < RESERVED_BT)
+            cb_p = av_clip(find_min_book(find_max_val(1, num_coeffs, S34), sce->sf_idx[sfb]), cb_min, cb_max);
+        else
+            cb_p = cb_n;
+        quantize_and_encode_band_cost(s, NULL, SENT, QERR, S34, num_coeffs,
+                                      sce->sf_idx[sfb], cb_p, s->lambda / band->threshold, INFINITY,
+                                      &cost2, NULL, 0);
+
+        /* Reconstructed coefficients - needed for distortion measurements */
+        for (i = 0; i < num_coeffs; i++)
+            sce->prcoeffs[start_coef + i] += QERR[i] != 0.0f ? (sce->prcoeffs[start_coef + i] - QERR[i]) : 0.0f;
+        abs_pow34_v(P34, &sce->prcoeffs[start_coef], num_coeffs);
+        if (cb_n < RESERVED_BT)
+            cb_p = av_clip(find_min_book(find_max_val(1, num_coeffs, P34), sce->sf_idx[sfb]), cb_min, cb_max);
+        else
+            cb_p = cb_n;
+        dist2 = quantize_and_encode_band_cost(s, NULL, &sce->prcoeffs[start_coef], NULL,
+                                              P34, num_coeffs, sce->sf_idx[sfb],
+                                              cb_p, s->lambda / band->threshold, INFINITY, NULL, NULL, 0);
+        for (i = 0; i < num_coeffs; i++)
+            dist_spec_err += (O34[i] - P34[i])*(O34[i] - P34[i]);
+        dist_spec_err *= s->lambda / band->threshold;
+        dist2 += dist_spec_err;
+
+        if (dist2 <= dist1 && cb_p <= cb_n) { /* prediction is kept only if it is no worse and needs no larger codebook index */
+            cost_pred += cost2;
+            sce->ics.prediction_used[sfb] = 1;
+            sce->band_alt[sfb]  = cb_n;
+            sce->band_type[sfb] = cb_p;
+            count++;
+        } else {
+            cost_pred += cost1;
+            sce->band_alt[sfb] = cb_p;
+        }
+    }
+
+    if (count && cost_coeffs < cost_pred) { /* the unpredicted coefficients cost fewer bits in total: undo every band */
+        count = 0;
+        for (sfb = PRED_SFB_START; sfb < pmax; sfb++)
+            RESTORE_PRED(sce, sfb);
+        memset(&sce->ics.prediction_used, 0, sizeof(sce->ics.prediction_used));
+    }
+
+    sce->ics.predictor_present = !!count;
+}
+
+/**
+ * Write the main-profile prediction side info (reset group and per-band flags).
+ */
+void ff_aac_encode_main_pred(AACEncContext *s, SingleChannelElement *sce)
+{
+    const IndividualChannelStream *ics = &sce->ics;
+    const int nb_bands = FFMIN(ics->max_sfb, ff_aac_pred_sfb_max[s->samplerate_index]);
+    const int reset_group = ics->predictor_reset_group;
+    int band;
+
+    if (s->profile != FF_PROFILE_AAC_MAIN || !ics->predictor_present)
+        return;
+
+    put_bits(&s->pb, 1, !!reset_group);
+    if (reset_group)
+        put_bits(&s->pb, 5, reset_group);
+    for (band = 0; band < nb_bands; band++)
+        put_bits(&s->pb, 1, ics->prediction_used[band]);
+}
diff --git a/libavcodec/aacenc_pred.h b/libavcodec/aacenc_pred.h
new file mode 100644
index 0000000..aa305f4
--- /dev/null
+++ b/libavcodec/aacenc_pred.h
@@ -0,0 +1,47 @@
+/*
+ * AAC encoder main-type prediction
+ * Copyright (C) 2015 Rostislav Pehlivanov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder main-type prediction
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#ifndef AVCODEC_AACENC_PRED_H
+#define AVCODEC_AACENC_PRED_H
+
+#include "aacenc.h"
+
+/* Every predictor group needs to get reset at least once in this many frames */
+#define PRED_RESET_FRAME_MIN 240
+
+/* Any frame with fewer than this many frames since the last reset is ok */
+#define PRED_RESET_MIN 64
+
+/* Raise to filter any low frequency artifacts due to prediction */
+#define PRED_SFB_START 10
+
+void ff_aac_apply_main_pred(AACEncContext *s, SingleChannelElement *sce);
+void ff_aac_adjust_common_pred(AACEncContext *s, ChannelElement *cpe);
+void ff_aac_search_for_pred(AACEncContext *s, SingleChannelElement *sce);
+void ff_aac_encode_main_pred(AACEncContext *s, SingleChannelElement *sce);
+
+#endif /* AVCODEC_AACENC_PRED_H */
diff --git a/libavcodec/aacenc_quantization.h b/libavcodec/aacenc_quantization.h
new file mode 100644
index 0000000..4250407
--- /dev/null
+++ b/libavcodec/aacenc_quantization.h
@@ -0,0 +1,283 @@
+/*
+ * AAC encoder quantizer
+ * Copyright (C) 2015 Rostislav Pehlivanov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder quantizer
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#ifndef AVCODEC_AACENC_QUANTIZATION_H
+#define AVCODEC_AACENC_QUANTIZATION_H
+
+#include "aactab.h"
+#include "aacenc.h"
+#include "aacenctab.h"
+#include "aacenc_utils.h"
+
+/**
+ * Calculate the rate-distortion cost of coding a band with the given codebook.
+ * If pb is set the codes are written to it; if out is set it receives the dequantized values.
+ * @return bits + lambda*distortion; uplim if that total reaches uplim; lambda*sum(in^2) for BT_ZERO/BT_NOISE/BT_STEREO
+ */
+static av_always_inline float quantize_and_encode_band_cost_template(
+                                struct AACEncContext *s,
+                                PutBitContext *pb, const float *in, float *out,
+                                const float *scaled, int size, int scale_idx,
+                                int cb, const float lambda, const float uplim,
+                                int *bits, float *energy, int BT_ZERO, int BT_UNSIGNED,
+                                int BT_PAIR, int BT_ESC, int BT_NOISE, int BT_STEREO,
+                                const float ROUNDING)
+{
+    const int q_idx = POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512;
+    const float Q   = ff_aac_pow2sf_tab [q_idx];
+    const float Q34 = ff_aac_pow34sf_tab[q_idx];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    const float CLIPPED_ESCAPE = 165140.0f*IQ; /* magnitudes at or above this are clamped in the escape path */
+    int i, j;
+    float cost = 0;
+    float qenergy = 0;
+    const int dim = BT_PAIR ? 2 : 4; /* values per codebook vector: pairs or quads */
+    int resbits = 0;
+    int off;
+
+    if (BT_ZERO || BT_NOISE || BT_STEREO) { /* no codes are costed: 0 bits, cost is the input energy times lambda */
+        for (i = 0; i < size; i++)
+            cost += in[i]*in[i];
+        if (bits)
+            *bits = 0;
+        if (energy)
+            *energy = qenergy;
+        if (out) {
+            for (i = 0; i < size; i += dim)
+                for (j = 0; j < dim; j++)
+                    out[i+j] = 0.0f;
+        }
+        return cost * lambda;
+    }
+    if (!scaled) { /* caller gave no |in|^(3/4) values: compute them into s->scoefs */
+        abs_pow34_v(s->scoefs, in, size);
+        scaled = s->scoefs;
+    }
+    quantize_bands(s->qcoefs, in, scaled, size, Q34, !BT_UNSIGNED, aac_cb_maxval[cb], ROUNDING);
+    if (BT_UNSIGNED) {
+        off = 0;
+    } else {
+        off = aac_cb_maxval[cb]; /* shift signed values so the vector index below is non-negative */
+    }
+    for (i = 0; i < size; i += dim) {
+        const float *vec;
+        int *quants = s->qcoefs + i;
+        int curidx = 0;
+        int curbits;
+        float quantized, rd = 0.0f;
+        for (j = 0; j < dim; j++) { /* combine the dim quantized values into one codebook index */
+            curidx *= aac_cb_range[cb];
+            curidx += quants[j] + off;
+        }
+        curbits =  ff_aac_spectral_bits[cb-1][curidx];
+        vec     = &ff_aac_codebook_vectors[cb-1][curidx*dim];
+        if (BT_UNSIGNED) {
+            for (j = 0; j < dim; j++) {
+                float t = fabsf(in[i+j]);
+                float di;
+                if (BT_ESC && vec[j] == 64.0f) { //FIXME: slow
+                    if (t >= CLIPPED_ESCAPE) {
+                        quantized = CLIPPED_ESCAPE;
+                        curbits += 21;
+                    } else {
+                        int c = av_clip_uintp2(quant(t, Q, ROUNDING), 13);
+                        quantized = c*cbrtf(c)*IQ;
+                        curbits += av_log2(c)*2 - 4 + 1;
+                    }
+                } else {
+                    quantized = vec[j]*IQ;
+                }
+                di = t - quantized;
+                if (out)
+                    out[i+j] = in[i+j] >= 0 ? quantized : -quantized;
+                if (vec[j] != 0.0f)
+                    curbits++; /* one sign bit per nonzero component */
+                qenergy += quantized*quantized;
+                rd += di*di;
+            }
+        } else {
+            for (j = 0; j < dim; j++) {
+                quantized = vec[j]*IQ;
+                qenergy += quantized*quantized;
+                if (out)
+                    out[i+j] = quantized;
+                rd += (in[i+j] - quantized)*(in[i+j] - quantized);
+            }
+        }
+        cost    += rd * lambda + curbits;
+        resbits += curbits;
+        if (cost >= uplim) /* NOTE: *bits and *energy are not written on this early exit */
+            return uplim;
+        if (pb) {
+            put_bits(pb, ff_aac_spectral_bits[cb-1][curidx], ff_aac_spectral_codes[cb-1][curidx]);
+            if (BT_UNSIGNED)
+                for (j = 0; j < dim; j++)
+                    if (ff_aac_codebook_vectors[cb-1][curidx*dim+j] != 0.0f)
+                        put_bits(pb, 1, in[i+j] < 0.0f);
+            if (BT_ESC) { /* escape sequences for components whose codebook value is 64 */
+                for (j = 0; j < 2; j++) {
+                    if (ff_aac_codebook_vectors[cb-1][curidx*2+j] == 64.0f) {
+                        int coef = av_clip_uintp2(quant(fabsf(in[i+j]), Q, ROUNDING), 13);
+                        int len = av_log2(coef);
+
+                        put_bits(pb, len - 4 + 1, (1 << (len - 4 + 1)) - 2);
+                        put_sbits(pb, len, coef);
+                    }
+                }
+            }
+        }
+    }
+
+    if (bits)
+        *bits = resbits;
+    if (energy)
+        *energy = qenergy;
+    return cost;
+}
+
+static inline float quantize_and_encode_band_cost_NONE(struct AACEncContext *s, PutBitContext *pb,
+                                                const float *in, float *quant, const float *scaled,
+                                                int size, int scale_idx, int cb,
+                                                const float lambda, const float uplim,
+                                                int *bits, float *energy) {
+    av_assert0(0); /* fills the table slot of the nonexistent codebook 12; must never be called */
+    return 0.0f;
+}
+
+#define QUANTIZE_AND_ENCODE_BAND_COST_FUNC(NAME, BT_ZERO, BT_UNSIGNED, BT_PAIR, BT_ESC, BT_NOISE, BT_STEREO, ROUNDING) \
+static float quantize_and_encode_band_cost_ ## NAME(                                         \
+                                struct AACEncContext *s,                                     \
+                                PutBitContext *pb, const float *in, float *quant,            \
+                                const float *scaled, int size, int scale_idx,                \
+                                int cb, const float lambda, const float uplim,               \
+                                int *bits, float *energy) {                                  \
+    return quantize_and_encode_band_cost_template(                                           \
+                                s, pb, in, quant, scaled, size, scale_idx,                   \
+                                BT_ESC ? ESC_BT : cb, lambda, uplim, bits, energy,           \
+                                BT_ZERO, BT_UNSIGNED, BT_PAIR, BT_ESC, BT_NOISE, BT_STEREO,  \
+                                ROUNDING);                                                   \
+}
+/* One wrapper per codebook family, flags in the macro's parameter order; the ESC variants pass ESC_BT instead of cb */
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(ZERO,  1, 0, 0, 0, 0, 0, ROUND_STANDARD)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(SQUAD, 0, 0, 0, 0, 0, 0, ROUND_STANDARD)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(UQUAD, 0, 1, 0, 0, 0, 0, ROUND_STANDARD)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(SPAIR, 0, 0, 1, 0, 0, 0, ROUND_STANDARD)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(UPAIR, 0, 1, 1, 0, 0, 0, ROUND_STANDARD)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(ESC,   0, 1, 1, 1, 0, 0, ROUND_STANDARD)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(ESC_RTZ, 0, 1, 1, 1, 0, 0, ROUND_TO_ZERO)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(NOISE, 0, 0, 0, 0, 1, 0, ROUND_STANDARD)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(STEREO,0, 0, 0, 0, 0, 1, ROUND_STANDARD)
+
+static float (*const quantize_and_encode_band_cost_arr[])( /* cost function per codebook, indexed by cb */
+                                struct AACEncContext *s,
+                                PutBitContext *pb, const float *in, float *quant,
+                                const float *scaled, int size, int scale_idx,
+                                int cb, const float lambda, const float uplim,
+                                int *bits, float *energy) = {
+    quantize_and_encode_band_cost_ZERO,     /* cb 0 */
+    quantize_and_encode_band_cost_SQUAD,    /* cb 1 */
+    quantize_and_encode_band_cost_SQUAD,    /* cb 2 */
+    quantize_and_encode_band_cost_UQUAD,    /* cb 3 */
+    quantize_and_encode_band_cost_UQUAD,    /* cb 4 */
+    quantize_and_encode_band_cost_SPAIR,    /* cb 5 */
+    quantize_and_encode_band_cost_SPAIR,    /* cb 6 */
+    quantize_and_encode_band_cost_UPAIR,    /* cb 7 */
+    quantize_and_encode_band_cost_UPAIR,    /* cb 8 */
+    quantize_and_encode_band_cost_UPAIR,    /* cb 9 */
+    quantize_and_encode_band_cost_UPAIR,    /* cb 10 */
+    quantize_and_encode_band_cost_ESC,      /* cb 11 */
+    quantize_and_encode_band_cost_NONE,     /* CB 12 doesn't exist */
+    quantize_and_encode_band_cost_NOISE,    /* cb 13 */
+    quantize_and_encode_band_cost_STEREO,   /* cb 14 */
+    quantize_and_encode_band_cost_STEREO,   /* cb 15 */
+};
+
+static float (*const quantize_and_encode_band_cost_rtz_arr[])( /* same layout as the standard table; only cb 11 differs */
+                                struct AACEncContext *s,
+                                PutBitContext *pb, const float *in, float *quant,
+                                const float *scaled, int size, int scale_idx,
+                                int cb, const float lambda, const float uplim,
+                                int *bits, float *energy) = {
+    quantize_and_encode_band_cost_ZERO,     /* cb 0 */
+    quantize_and_encode_band_cost_SQUAD,    /* cb 1 */
+    quantize_and_encode_band_cost_SQUAD,    /* cb 2 */
+    quantize_and_encode_band_cost_UQUAD,    /* cb 3 */
+    quantize_and_encode_band_cost_UQUAD,    /* cb 4 */
+    quantize_and_encode_band_cost_SPAIR,    /* cb 5 */
+    quantize_and_encode_band_cost_SPAIR,    /* cb 6 */
+    quantize_and_encode_band_cost_UPAIR,    /* cb 7 */
+    quantize_and_encode_band_cost_UPAIR,    /* cb 8 */
+    quantize_and_encode_band_cost_UPAIR,    /* cb 9 */
+    quantize_and_encode_band_cost_UPAIR,    /* cb 10 */
+    quantize_and_encode_band_cost_ESC_RTZ,  /* cb 11: escape codebook built with ROUND_TO_ZERO */
+    quantize_and_encode_band_cost_NONE,     /* CB 12 doesn't exist */
+    quantize_and_encode_band_cost_NOISE,    /* cb 13 */
+    quantize_and_encode_band_cost_STEREO,   /* cb 14 */
+    quantize_and_encode_band_cost_STEREO,   /* cb 15 */
+};
+
+#define quantize_and_encode_band_cost(                                  \
+                                s, pb, in, quant, scaled, size, scale_idx, cb, \
+                                lambda, uplim, bits, energy, rtz)               \
+    ((rtz) ? quantize_and_encode_band_cost_rtz_arr : quantize_and_encode_band_cost_arr)[cb]( \
+                                s, pb, in, quant, scaled, size, scale_idx, cb, \
+                                lambda, uplim, bits, energy) /* rtz picks the table; cb is expanded twice (index and argument) */
+
+static inline float quantize_band_cost(struct AACEncContext *s, const float *in,
+                                const float *scaled, int size, int scale_idx,
+                                int cb, const float lambda, const float uplim,
+                                int *bits, float *energy, int rtz)
+{   /* cost query only: the bit writer and the output buffer are passed as NULL */
+    return quantize_and_encode_band_cost(s, NULL, in, NULL, scaled, size, scale_idx,
+                                         cb, lambda, uplim, bits, energy, rtz);
+}
+
+/* Count the bits needed to code a band; the lambda argument is ignored (0.0f is passed on). */
+static inline int quantize_band_cost_bits(struct AACEncContext *s, const float *in,
+                                const float *scaled, int size, int scale_idx,
+                                int cb, const float lambda, const float uplim,
+                                int *bits, float *energy, int rtz)
+{
+    int auxbits = 0; /* defined result even if the callee exits early at uplim without storing a count */
+    quantize_and_encode_band_cost(s, NULL, in, NULL, scaled, size, scale_idx,
+                                         cb, 0.0f, uplim, &auxbits, energy, rtz);
+    if (bits)
+        *bits = auxbits;
+    return auxbits;
+}
+
+static inline void quantize_and_encode_band(struct AACEncContext *s, PutBitContext *pb,
+                                            const float *in, float *out, int size, int scale_idx,
+                                            int cb, const float lambda, int rtz)
+{   /* encode for real: no precomputed scaled input, no cost limit, bits/energy not requested */
+    quantize_and_encode_band_cost(s, pb, in, out, NULL, size, scale_idx, cb, lambda,
+                                  INFINITY, NULL, NULL, rtz);
+}
+
+#include "aacenc_quantization_misc.h"
+
+#endif /* AVCODEC_AACENC_QUANTIZATION_H */
diff --git a/libavcodec/aacenc_quantization_misc.h b/libavcodec/aacenc_quantization_misc.h
new file mode 100644
index 0000000..28676ca
--- /dev/null
+++ b/libavcodec/aacenc_quantization_misc.h
@@ -0,0 +1,53 @@
+/*
+ * AAC encoder quantization
+ * Copyright (C) 2015 Claudio Freire
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder quantization misc reusable function templates
+ * @author Claudio Freire ( klaussfreire gmail com )
+ */
+
+#ifndef AVCODEC_AACENC_QUANTIZATION_MISC_H
+#define AVCODEC_AACENC_QUANTIZATION_MISC_H
+
+static inline float quantize_band_cost_cached(struct AACEncContext *s, int w, int g, const float *in,
+                                const float *scaled, int size, int scale_idx,
+                                int cb, const float lambda, const float uplim,
+                                int *bits, float *energy, int rtz)
+{
+    AACQuantizeBandCostCacheEntry *slot;
+    int hit;
+    av_assert1(scale_idx >= 0 && scale_idx < 256);
+    slot = &s->quantize_band_cost_cache[scale_idx][w*16+g];
+    hit  = slot->generation == s->quantize_band_cost_cache_generation && slot->cb == cb && slot->rtz == rtz;
+    if (!hit) { /* recompute and re-tag the slot; in, size, lambda and uplim are not part of the key */
+        slot->rd = quantize_band_cost(s, in, scaled, size, scale_idx,
+                                      cb, lambda, uplim, &slot->bits, &slot->energy, rtz);
+        slot->generation = s->quantize_band_cost_cache_generation;
+        slot->cb = cb;
+        slot->rtz = rtz;
+    }
+    if (bits)   *bits   = slot->bits;
+    if (energy) *energy = slot->energy;
+    return slot->rd;
+}
+
+#endif /* AVCODEC_AACENC_QUANTIZATION_MISC_H */
diff --git a/libavcodec/aacenc_tns.c b/libavcodec/aacenc_tns.c
new file mode 100644
index 0000000..2ffe1f8
--- /dev/null
+++ b/libavcodec/aacenc_tns.c
@@ -0,0 +1,215 @@
+/*
+ * AAC encoder TNS
+ * Copyright (C) 2015 Rostislav Pehlivanov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder temporal noise shaping
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#include "libavutil/libm.h"
+#include "aacenc.h"
+#include "aacenc_tns.h"
+#include "aactab.h"
+#include "aacenc_utils.h"
+#include "aacenc_quantization.h"
+
+/* Bits per quantized TNS coefficient in long windows; 3 would save a bit at a small quality cost */
+#define TNS_Q_BITS 4
+
+/* Bits per quantized TNS coefficient in short windows */
+#define TNS_Q_BITS_IS8 4
+
+/* Enables compress_coeffs(); we really need the bits we save here elsewhere */
+#define TNS_ENABLE_COEF_COMPRESSION
+
+/* TNS will only be used if the LPC gain is within these margins */
+#define TNS_GAIN_THRESHOLD_LOW      1.4f
+#define TNS_GAIN_THRESHOLD_HIGH     (1.16f*TNS_GAIN_THRESHOLD_LOW) /* parenthesized so it expands as one value */
+
+static inline int compress_coeffs(int *coef, int order, int c_bits)
+{   /* returns 1 and folds the high indices down when no index lies in the middle range */
+    const int mid_lo = c_bits ?  4 : 2;  /* any index in mid_lo..mid_hi blocks compression */
+    const int mid_hi = c_bits ? 11 : 5;
+    const int fold   = c_bits ?  8 : 4;  /* subtracted from indices above mid_hi */
+    int k;
+#ifndef TNS_ENABLE_COEF_COMPRESSION
+    return 0;
+#endif /* TNS_ENABLE_COEF_COMPRESSION */
+    for (k = 0; k < order; k++)
+        if (!(coef[k] < mid_lo || coef[k] > mid_hi))
+            return 0;
+    for (k = 0; k < order; k++)
+        coef[k] = coef[k] > mid_hi ? coef[k] - fold : coef[k];
+    return 1;
+}
+
+/**
+ * Encode TNS data.
+ * compress_coeffs() is tried for every filter and rewrites tns->coef_idx in place;
+ * when it reports success each coefficient is written with one bit less.
+ */
+void ff_aac_encode_tns_info(AACEncContext *s, SingleChannelElement *sce)
+{
+    TemporalNoiseShaping *tns = &sce->tns;
+    int i, w, filt, coef_compress = 0, coef_len;
+    const int is8 = sce->ics.window_sequence[0] == EIGHT_SHORT_SEQUENCE;
+    const int c_bits = is8 ? TNS_Q_BITS_IS8 == 4 : TNS_Q_BITS == 4; /* 1 = 4 bit coefficients, 0 = 3 bit */
+
+    if (!sce->tns.present)
+        return;
+
+    for (i = 0; i < sce->ics.num_windows; i++) {
+        put_bits(&s->pb, 2 - is8, sce->tns.n_filt[i]); /* filter count: 1 bit for short windows, else 2 */
+        if (!tns->n_filt[i])
+            continue;
+        put_bits(&s->pb, 1, c_bits); /* coefficient resolution flag, once per window */
+        for (filt = 0; filt < tns->n_filt[i]; filt++) {
+            put_bits(&s->pb, 6 - 2 * is8, tns->length[i][filt]); /* 4 bits short, 6 bits long */
+            put_bits(&s->pb, 5 - 2 * is8, tns->order[i][filt]);  /* 3 bits short, 5 bits long */
+            if (!tns->order[i][filt])
+                continue;
+            put_bits(&s->pb, 1, tns->direction[i][filt]);
+            coef_compress = compress_coeffs(tns->coef_idx[i][filt],
+                                            tns->order[i][filt], c_bits);
+            put_bits(&s->pb, 1, coef_compress);
+            coef_len = c_bits + 3 - coef_compress; /* 3 or 4 bits, minus one when compressed */
+            for (w = 0; w < tns->order[i][filt]; w++)
+                put_bits(&s->pb, coef_len, tns->coef_idx[i][filt][w]);
+        }
+    }
+}
+
+/* Apply the TNS filters of every window: sce->coeffs is updated in place from sce->pcoeffs */
+void ff_aac_apply_tns(AACEncContext *s, SingleChannelElement *sce)
+{
+    TemporalNoiseShaping *tns = &sce->tns;
+    IndividualChannelStream *ics = &sce->ics;
+    int w, filt, m, i, top, order, bottom, start, end, size, inc;
+    const int mmm = FFMIN(ics->tns_max_bands, ics->max_sfb); /* highest band index a filter may touch */
+    float lpc[TNS_MAX_ORDER];
+
+    for (w = 0; w < ics->num_windows; w++) {
+        bottom = ics->num_swb; /* filters are laid out from the top band downwards */
+        for (filt = 0; filt < tns->n_filt[w]; filt++) {
+            top    = bottom;
+            bottom = FFMAX(0, top - tns->length[w][filt]);
+            order  = tns->order[w][filt];
+            if (order == 0)
+                continue;
+
+            // tns_decode_coef
+            compute_lpc_coefs(tns->coef[w][filt], order, lpc, 0, 0, 0);
+
+            start = ics->swb_offset[FFMIN(bottom, mmm)];
+            end   = ics->swb_offset[FFMIN(   top, mmm)];
+            if ((size = end - start) <= 0)
+                continue;
+            if (tns->direction[w][filt]) { /* run from the upper end of the region downwards */
+                inc = -1;
+                start = end - 1;
+            } else {
+                inc = 1;
+            }
+            start += w * 128; /* each window's coefficients start 128 entries after the previous one's */
+
+            /* AR filter */
+            for (m = 0; m < size; m++, start += inc) {
+                for (i = 1; i <= FFMIN(m, order); i++) { /* only taps that stay inside the filtered region */
+                    sce->coeffs[start] += lpc[i-1]*sce->pcoeffs[start - i*inc];
+                }
+            }
+        }
+    }
+}
+
+/*
+ * Snap coef[] to the nearest table entries: idx gets the indices, lpc the values (c_bits: 1 = 4 bit, 0 = 3 bit)
+ */
+static inline void quantize_coefs(double *coef, int *idx, float *lpc, int order,
+                                  int c_bits)
+{
+    const float *table = tns_tmp2_map[c_bits];
+    int n, entries = c_bits ? 16 : 8;
+    for (n = 0; n < order; n++) {
+        idx[n] = quant_array_idx(coef[n], table, entries);
+        lpc[n] = table[idx[n]];
+    }
+}
+
+/*
+ * Per window: run LPC over the TNS band range and enable TNS if the gain is within the thresholds.
+ * Coefficient width comes from TNS_Q_BITS / TNS_Q_BITS_IS8 (c_bits = 1 means 4 bit). */
+void ff_aac_search_for_tns(AACEncContext *s, SingleChannelElement *sce)
+{
+    TemporalNoiseShaping *tns = &sce->tns;
+    int w, g, count = 0;
+    double gain, coefs[MAX_LPC_ORDER];
+    const int mmm = FFMIN(sce->ics.tns_max_bands, sce->ics.max_sfb);
+    const int is8 = sce->ics.window_sequence[0] == EIGHT_SHORT_SEQUENCE;
+    const int c_bits = is8 ? TNS_Q_BITS_IS8 == 4 : TNS_Q_BITS == 4;
+    const int sfb_start = av_clip(tns_min_sfb[is8][s->samplerate_index], 0, mmm);
+    const int sfb_end   = av_clip(sce->ics.num_swb, 0, mmm);
+    const int order = is8 ? 7 : s->profile == FF_PROFILE_AAC_LOW ? 12 : TNS_MAX_ORDER;
+    const int slant = sce->ics.window_sequence[0] == LONG_STOP_SEQUENCE  ? 1 :
+                      sce->ics.window_sequence[0] == LONG_START_SEQUENCE ? 0 : 2;
+    const int sfb_len = sfb_end - sfb_start;
+    const int coef_len = sce->ics.swb_offset[sfb_end] - sce->ics.swb_offset[sfb_start];
+
+    if (coef_len <= 0 || sfb_len <= 0) { /* empty band range: TNS is switched off */
+        sce->tns.present = 0;
+        return;
+    }
+
+    for (w = 0; w < sce->ics.num_windows; w++) {
+        float en[2] = {0.0f, 0.0f}; /* band energy of the lower [0] and upper [1] half of the range */
+        int oc_start = 0, os_start = 0;
+        int coef_start = sce->ics.swb_offset[sfb_start];
+
+        for (g = sfb_start; g < sce->ics.num_swb && g <= sfb_end; g++) {
+            FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[w*16+g];
+            if (g > sfb_start + (sfb_len/2))
+                en[1] += band->energy;
+            else
+                en[0] += band->energy;
+        }
+
+        /* LPC */
+        gain = ff_lpc_calc_ref_coefs_f(&s->lpc, &sce->coeffs[w*128 + coef_start],
+                                       coef_len, order, coefs);
+
+        if (!order || !isfinite(gain) || gain < TNS_GAIN_THRESHOLD_LOW || gain > TNS_GAIN_THRESHOLD_HIGH)
+            continue;
+
+        tns->n_filt[w] = is8 ? 1 : order != TNS_MAX_ORDER ? 2 : 3;
+        for (g = 0; g < tns->n_filt[w]; g++) { /* en[] has two entries: every filter past the first uses en[1] */
+            tns->direction[w][g] = slant != 2 ? slant : en[!!g] < en[!g];
+            tns->order[w][g] = g < tns->n_filt[w] ? order/tns->n_filt[w] : order - oc_start; /* NOTE(review): condition always true here and on the next line; remainder branch is dead */
+            tns->length[w][g] = g < tns->n_filt[w] ? sfb_len/tns->n_filt[w] : sfb_len - os_start;
+            quantize_coefs(&coefs[oc_start], tns->coef_idx[w][g], tns->coef[w][g],
+                            tns->order[w][g], c_bits);
+            oc_start += tns->order[w][g];
+            os_start += tns->length[w][g];
+        }
+        count++;
+    }
+    sce->tns.present = !!count; /* present if at least one window got filters */
+}
diff --git a/libavcodec/aacenc_tns.h b/libavcodec/aacenc_tns.h
new file mode 100644
index 0000000..466738d
--- /dev/null
+++ b/libavcodec/aacenc_tns.h
@@ -0,0 +1,37 @@
+/*
+ * AAC encoder TNS
+ * Copyright (C) 2015 Rostislav Pehlivanov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder temporal noise shaping
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#ifndef AVCODEC_AACENC_TNS_H
+#define AVCODEC_AACENC_TNS_H
+
+#include "aacenc.h"
+
+void ff_aac_encode_tns_info(AACEncContext *s, SingleChannelElement *sce);
+void ff_aac_apply_tns(AACEncContext *s, SingleChannelElement *sce);
+void ff_aac_search_for_tns(AACEncContext *s, SingleChannelElement *sce);
+
+#endif /* AVCODEC_AACENC_TNS_H */
diff --git a/libavcodec/aacenc_utils.h b/libavcodec/aacenc_utils.h
new file mode 100644
index 0000000..bb1dcb4
--- /dev/null
+++ b/libavcodec/aacenc_utils.h
@@ -0,0 +1,266 @@
+/*
+ * AAC encoder utilities
+ * Copyright (C) 2015 Rostislav Pehlivanov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder utilities
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#ifndef AVCODEC_AACENC_UTILS_H
+#define AVCODEC_AACENC_UTILS_H
+
+#include "libavutil/ffmath.h"
+#include "aac.h"
+#include "aacenctab.h"
+#include "aactab.h"
+
+#define ROUND_STANDARD 0.4054f /* offset added before truncation when quantizing */
+#define ROUND_TO_ZERO 0.1054f  /* smaller offset, so more values truncate downwards */
+#define C_QUANT 0.4054f        /* offset used by find_min_book(); same value as ROUND_STANDARD */
+
+static inline void abs_pow34_v(float *out, const float *in, const int size)
+{
+    int k = 0;                          /* out[k] = |in[k]|^(3/4) for k < size */
+    while (k < size) {
+        const float mag = fabsf(in[k]);
+        out[k++] = sqrtf(mag * sqrtf(mag));
+    }
+}
+
+static inline float pos_pow34(float a)
+{
+    return sqrtf(a * sqrtf(a)); /* a^(3/4); no fabsf here, so a is expected to be non-negative */
+}
+
+/**
+ * Quantize one coefficient magnitude: returns (coef * Q)^(3/4) + rounding, truncated to int.
+ * @see 3GPP TS26.403 5.6.2 "Scalefactor determination"
+ */
+static inline int quant(float coef, const float Q, const float rounding)
+{
+    const float scaled = coef * Q;
+    const float root   = sqrtf(scaled);
+    return (int)(sqrtf(scaled * root) + rounding);
+}
+
+static inline void quantize_bands(int *out, const float *in, const float *scaled,
+                                  int size, float Q34, int is_signed, int maxval,
+                                  const float rounding)
+{
+    const float limit = (float)maxval; /* magnitudes are capped at maxval */
+    int n;
+    for (n = 0; n < size; n++) {
+        const float qc    = scaled[n] * Q34;
+        const float level = qc + rounding;
+        const int   mag   = (int)FFMIN(level, limit);
+        /* the sign is taken from in[], and only for signed codebooks */
+        out[n] = (is_signed && in[n] < 0.0f) ? -mag : mag;
+    }
+}
+
+static inline float find_max_val(int group_len, int swb_size, const float *scaled)
+{
+    float peak = 0.0f;              /* the result is never below 0 */
+    int win, k;
+    for (win = 0; win < group_len; win++) {
+        const int base = win*128;   /* windows are 128 floats apart */
+        for (k = 0; k < swb_size; k++)
+            peak = FFMAX(peak, scaled[base+k]);
+    }
+    return peak;
+}
+
+static inline int find_min_book(float maxval, int sf)
+{
+    /* Codebook looked up from maxval quantized at scalefactor sf;
+     * 11 when the quantized value is past the end of aac_maxval_cb. */
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - sf + SCALE_ONE_POS - SCALE_DIV_512];
+    const int qmaxval = maxval * Q34 + C_QUANT;
+
+    if (qmaxval >= (FF_ARRAY_ELEMS(aac_maxval_cb)))
+        return 11;
+    return aac_maxval_cb[qmaxval];
+}
+
+static inline float find_form_factor(int group_len, int swb_size, float thresh,
+                                     const float *scaled, float nzslope) { /* energy-weighted average of a per-window shape term; 1.0 if no window exceeds thresh */
+    const float iswb_size = 1.0f / swb_size;
+    const float iswb_sizem1 = 1.0f / (swb_size - 1);
+    const float ethresh = thresh;
+    float form = 0.0f, weight = 0.0f;
+    int w2, i;
+    for (w2 = 0; w2 < group_len; w2++) {
+        float e = 0.0f, e2 = 0.0f, var = 0.0f, maxval = 0.0f;
+        float nzl = 0;
+        for (i = 0; i < swb_size; i++) {
+            float s = fabsf(scaled[w2*128+i]);
+            maxval = FFMAX(maxval, s);
+            e += s;
+            e2 += s *= s; /* s is squared in place, so the comparisons below use s^2 */
+            /* We really don't want a hard non-zero-line count, since
+             * even below-threshold lines do add up towards band spectral power.
+             * So, fall steeply towards zero, but smoothly
+             */
+            if (s >= ethresh) {
+                nzl += 1.0f;
+            } else {
+                if (nzslope == 2.f)
+                    nzl += (s / ethresh) * (s / ethresh);
+                else
+                    nzl += ff_fast_powf(s / ethresh, nzslope);
+            }
+        }
+        if (e2 > thresh) { /* only windows whose summed squares exceed thresh contribute */
+            float frm;
+            e *= iswb_size; /* mean magnitude */
+
+            /** compute variance */
+            for (i = 0; i < swb_size; i++) {
+                float d = fabsf(scaled[w2*128+i]) - e;
+                var += d*d;
+            }
+            var = sqrtf(var * iswb_sizem1); /* standard deviation with an n-1 divisor */
+
+            e2 *= iswb_size; /* mean of the squares */
+            frm = e / FFMIN(e+4*var,maxval);
+            form += e2 * sqrtf(frm) / FFMAX(0.5f,nzl);
+            weight += e2;
+        }
+    }
+    if (weight > 0) {
+        return form / weight;
+    } else {
+        return 1.0f;
+    }
+}
+
+/** Return the minimum scalefactor where the quantized coef does not clip (result goes through av_clip_uint8()). */
+static inline uint8_t coef2minsf(float coef)
+{
+    return av_clip_uint8(log2f(coef)*4 - 69 + SCALE_ONE_POS - SCALE_DIV_512);
+}
+
+/** Return the maximum scalefactor where the quantized coef is not zero (result goes through av_clip_uint8()). */
+static inline uint8_t coef2maxsf(float coef)
+{
+    return av_clip_uint8(log2f(coef)*4 +  6 + SCALE_ONE_POS - SCALE_DIV_512);
+}
+
+/*
+ * Return the index of the entry in arr[0..num-1] closest to val (the first one on ties, 0 if num <= 0).
+ */
+static inline int quant_array_idx(const float val, const float *arr, const int num)
+{
+    float best_err = INFINITY;
+    int best = 0, k = 0;
+    while (k < num) {
+        const float diff = val - arr[k];
+        if (diff * diff < best_err) {
+            best_err = diff * diff;
+            best = k;
+        }
+        k++;
+    }
+    return best;
+}
+
+/**
+ * approximates exp10f(-3.0f*(0.5f + 0.5f * cosf(FFMIN(b,15.5f) / 15.5f)))
+ */
+static av_always_inline float bval2bmax(float b)
+{
+    return 0.001f + 0.0035f * (b*b*b) / (15.5f*15.5f*15.5f); /* cubic in b: 0.001 at b = 0, 0.0045 at b = 15.5 */
+}
+
+/*
+ * Build the nextband chain used by the SF delta constraint helpers.
+ * nextband must hold 128 entries. Every entry starts out pointing at itself;
+ * each nonzero band w*16+g with a type below RESERVED_BT (w = first window of
+ * a group) is then linked from the previous such band; the last points at itself.
+ */
+static inline void ff_init_nextband_map(const SingleChannelElement *sce, uint8_t *nextband)
+{
+    unsigned char last = 0;
+    int win, sfb, idx;
+    for (idx = 0; idx < 128; idx++)
+        nextband[idx] = idx;
+    for (win = 0; win < sce->ics.num_windows; win += sce->ics.group_len[win]) {
+        for (sfb = 0; sfb < sce->ics.num_swb; sfb++) {
+            idx = win*16 + sfb;
+            if (sce->zeroes[idx] || sce->band_type[idx] >= RESERVED_BT)
+                continue;
+            nextband[last] = idx;
+            last = idx;
+        }
+    }
+    nextband[last] = last; /* terminate */
+}
+
+/*
+ * Unlink band from the chain by giving prevband band's successor (as if the map were rebuilt with band zeroed).
+ */
+static inline void ff_nextband_remove(uint8_t *nextband, int prevband, int band)
+{
+    const uint8_t successor = nextband[band];
+    nextband[prevband] = successor;
+}
+
+/*
+ * Tells whether band can be dropped without breaking the SF delta limit, i.e. whether the next band's
+ * scalefactor is within SCALE_MAX_DIFF of prev_sf (previous nonzero, nonspecial band; negative = none, gives 0).
+ */
+static inline int ff_sfdelta_can_remove_band(const SingleChannelElement *sce,
+    const uint8_t *nextband, int prev_sf, int band)
+{
+    int next_sf;
+    if (prev_sf < 0)
+        return 0;
+    next_sf = sce->sf_idx[nextband[band]];
+    return next_sf >= prev_sf - SCALE_MAX_DIFF && next_sf <= prev_sf + SCALE_MAX_DIFF;
+}
+
+/*
+ * Tells whether the band's scalefactor can become new_sf without breaking the
+ * SF delta limit: new_sf must be within SCALE_MAX_DIFF of prev_sf (previous
+ * nonzero, nonspecial band in encoding order) and of the next band's scalefactor.
+ */
+static inline int ff_sfdelta_can_replace(const SingleChannelElement *sce,
+    const uint8_t *nextband, int prev_sf, int new_sf, int band)
+{
+    int next_sf;
+    if (new_sf < prev_sf - SCALE_MAX_DIFF || new_sf > prev_sf + SCALE_MAX_DIFF)
+        return 0;
+    next_sf = sce->sf_idx[nextband[band]];
+    return next_sf >= new_sf - SCALE_MAX_DIFF && next_sf <= new_sf + SCALE_MAX_DIFF;
+}
+
+#define ERROR_IF(cond, ...) /* log an error and return AVERROR(EINVAL) from the calling function */ \
+    if (cond) { \
+        av_log(avctx, AV_LOG_ERROR, __VA_ARGS__); \
+        return AVERROR(EINVAL); \
+    }
+/* NOTE(review): both macros need `avctx` in scope and expand to a bare `if` block, not do { } while (0): take care inside unbraced if/else */
+#define WARN_IF(cond, ...) /* log a warning and carry on */ \
+    if (cond) { \
+        av_log(avctx, AV_LOG_WARNING, __VA_ARGS__); \
+    }
+
+#endif /* AVCODEC_AACENC_UTILS_H */
diff --git a/libavcodec/aacenctab.c b/libavcodec/aacenctab.c
new file mode 100644
index 0000000..f3d70fb
--- /dev/null
+++ b/libavcodec/aacenctab.c
@@ -0,0 +1,108 @@
+/*
+ * AAC encoder data
+ * Copyright (c) 2015 Rostislav Pehlivanov ( atomnuker gmail com )
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "aacenctab.h"
+
+static const uint8_t swb_size_128_96[] = {
+    4, 4, 4, 4, 4, 4, 8, 8, 8, 16, 28, 36
+};
+
+static const uint8_t swb_size_128_64[] = {
+    4, 4, 4, 4, 4, 4, 8, 8, 8, 16, 28, 36
+};
+
+static const uint8_t swb_size_128_48[] = {
+    4, 4, 4, 4, 4, 8, 8, 8, 12, 12, 12, 16, 16, 16
+};
+
+static const uint8_t swb_size_128_24[] = {
+    4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 12, 12, 16, 16, 20
+};
+
+static const uint8_t swb_size_128_16[] = {
+    4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 12, 12, 16, 20, 20
+};
+
+static const uint8_t swb_size_128_8[] = {
+    4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 12, 16, 20, 20
+};
+
+static const uint8_t swb_size_1024_96[] = {
+    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8,
+    12, 12, 12, 12, 12, 16, 16, 24, 28, 36, 44,
+    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+};
+
+static const uint8_t swb_size_1024_64[] = {
+    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8,
+    12, 12, 12, 16, 16, 16, 20, 24, 24, 28, 36,
+    40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40
+};
+
+static const uint8_t swb_size_1024_48[] = {
+    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8,
+    12, 12, 12, 12, 16, 16, 20, 20, 24, 24, 28, 28,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+    96
+};
+
+static const uint8_t swb_size_1024_32[] = {
+    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8,
+    12, 12, 12, 12, 16, 16, 20, 20, 24, 24, 28, 28,
+    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+};
+
+static const uint8_t swb_size_1024_24[] = {
+    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+    12, 12, 12, 12, 16, 16, 16, 20, 20, 24, 24, 28, 28,
+    32, 36, 36, 40, 44, 48, 52, 52, 64, 64, 64, 64, 64
+};
+
+static const uint8_t swb_size_1024_16[] = {
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 16, 16, 16, 20, 20, 20, 24, 24, 28, 28,
+    32, 36, 40, 40, 44, 48, 52, 56, 60, 64, 64, 64
+};
+
+static const uint8_t swb_size_1024_8[] = {
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    16, 16, 16, 16, 16, 16, 16, 20, 20, 20, 20, 24, 24, 24, 28, 28,
+    32, 36, 36, 40, 44, 48, 52, 56, 60, 64, 80
+};
+
+const uint8_t *ff_aac_swb_size_128[] = { /* 13 rows of band-size tables; presumably indexed by sample-rate index - confirm */
+    swb_size_128_96, swb_size_128_96, swb_size_128_64,
+    swb_size_128_48, swb_size_128_48, swb_size_128_48, /* third entry reuses the _48 table */
+    swb_size_128_24, swb_size_128_24, swb_size_128_16,
+    swb_size_128_16, swb_size_128_16, swb_size_128_8,
+    swb_size_128_8
+};
+
+const uint8_t *ff_aac_swb_size_1024[] = { /* 13 rows, same layout as ff_aac_swb_size_128 */
+    swb_size_1024_96, swb_size_1024_96, swb_size_1024_64,
+    swb_size_1024_48, swb_size_1024_48, swb_size_1024_32, /* third entry has its own _32 table here */
+    swb_size_1024_24, swb_size_1024_24, swb_size_1024_16,
+    swb_size_1024_16, swb_size_1024_16, swb_size_1024_8,
+    swb_size_1024_8
+};
+
+const int ff_aac_swb_size_128_len  = FF_ARRAY_ELEMS(ff_aac_swb_size_128);  /* row count of ff_aac_swb_size_128 */
+const int ff_aac_swb_size_1024_len = FF_ARRAY_ELEMS(ff_aac_swb_size_1024); /* row count of ff_aac_swb_size_1024 */
diff --git a/libavcodec/aacenctab.h b/libavcodec/aacenctab.h
new file mode 100644
index 0000000..5fc9411
--- /dev/null
+++ b/libavcodec/aacenctab.h
@@ -0,0 +1,128 @@
+/*
+ * AAC encoder data
+ * Copyright (c) 2015 Rostislav Pehlivanov ( atomnuker gmail com )
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder data
+ * @author Rostislav Pehlivanov ( atomnuker gmail com )
+ */
+
+#ifndef AVCODEC_AACENCTAB_H
+#define AVCODEC_AACENCTAB_H
+
+#include "aac.h"
+
+/** Total number of usable codebooks **/
+#define CB_TOT 12
+
+/** Total number of codebooks, including special ones **/
+#define CB_TOT_ALL 15
+
+#define AAC_MAX_CHANNELS 8
+
+extern const uint8_t *ff_aac_swb_size_1024[];
+extern const int      ff_aac_swb_size_1024_len;
+extern const uint8_t *ff_aac_swb_size_128[];
+extern const int      ff_aac_swb_size_128_len;
+
+/** default channel configurations */
+static const uint8_t aac_chan_configs[AAC_MAX_CHANNELS][6] = {
+    {1, TYPE_SCE},                                         // 1 channel  - single channel element
+    {1, TYPE_CPE},                                         // 2 channels - channel pair
+    {2, TYPE_SCE, TYPE_CPE},                               // 3 channels - center + stereo
+    {3, TYPE_SCE, TYPE_CPE, TYPE_SCE},                     // 4 channels - front center + stereo + back center
+    {3, TYPE_SCE, TYPE_CPE, TYPE_CPE},                     // 5 channels - front center + stereo + back stereo
+    {4, TYPE_SCE, TYPE_CPE, TYPE_CPE, TYPE_LFE},           // 6 channels - front center + stereo + back stereo + LFE
+    {0},                                                   // 7 channels - invalid without PCE
+    {5, TYPE_SCE, TYPE_CPE, TYPE_CPE, TYPE_CPE, TYPE_LFE}, // 8 channels - front center + front stereo + side stereo + back stereo + LFE
+};
+
+/**
+ * Table to remap channels from libavcodec's default order to AAC order.
+ */
+static const uint8_t aac_chan_maps[AAC_MAX_CHANNELS][AAC_MAX_CHANNELS] = {
+    { 0 },
+    { 0, 1 },
+    { 2, 0, 1 },
+    { 2, 0, 1, 3 },
+    { 2, 0, 1, 3, 4 },
+    { 2, 0, 1, 4, 5, 3 },
+    { 0 },
+    { 2, 0, 1, 6, 7, 4, 5, 3 },
+};
+
+/* duplicated from avpriv_mpeg4audio_sample_rates to avoid shared build
+ * failures; 13 rates are listed, the remaining 3 of the 16 slots are 0 */
+static const int mpeg4audio_sample_rates[16] = {
+    96000, 88200, 64000, 48000, 44100, 32000,
+    24000, 22050, 16000, 12000, 11025, 8000, 7350
+};
+
+/** bits needed to code codebook run value for long windows */
+static const uint8_t run_value_bits_long[64] = {
+     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15
+};
+
+/** bits needed to code codebook run value for short windows */
+static const uint8_t run_value_bits_short[16] = {
+    3, 3, 3, 3, 3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 9
+};
+
+/* TNS starting SFBs for long and short windows */
+static const uint8_t tns_min_sfb_short[16] = {
+    2, 2, 2, 3, 3, 4, 6, 6, 8, 10, 10, 12, 12, 12, 12, 12
+};
+
+static const uint8_t tns_min_sfb_long[16] = {
+    12, 13, 15, 16, 17, 20, 25, 26, 24, 28, 30, 31, 31, 31, 31, 31
+};
+
+static const uint8_t * const tns_min_sfb[2] = {
+    tns_min_sfb_long, tns_min_sfb_short
+};
+
+static const uint8_t * const run_value_bits[2] = {
+    run_value_bits_long, run_value_bits_short
+};
+
+/** Map to convert values from BandCodingPath index to a codebook index **/
+static const uint8_t aac_cb_out_map[CB_TOT_ALL]  = {0,1,2,3,4,5,6,7,8,9,10,11,13,14,15};
+/** Inverse map to convert from codebooks to BandCodingPath indices **/
+static const uint8_t aac_cb_in_map[CB_TOT_ALL+1] = {0,1,2,3,4,5,6,7,8,9,10,11,0,12,13,14};
+
+static const uint8_t aac_cb_range [12] = {0, 3, 3, 3, 3, 9, 9, 8, 8, 13, 13, 17};
+static const uint8_t aac_cb_maxval[12] = {0, 1, 1, 2, 2, 4, 4, 7, 7, 12, 12, 16};
+
+static const unsigned char aac_maxval_cb[] = {
+    0, 1, 3, 5, 5, 7, 7, 7, 9, 9, 9, 9, 9, 11
+};
+
+static const int aacenc_profiles[] = {
+    FF_PROFILE_AAC_MAIN,
+    FF_PROFILE_AAC_LOW,
+    FF_PROFILE_AAC_LTP,
+    FF_PROFILE_MPEG2_AAC_LOW,
+};
+
+#endif /* AVCODEC_AACENCTAB_H */
diff --git a/libavcodec/aacps.c b/libavcodec/aacps.c
index df069c3..ccc79ff 100644
--- a/libavcodec/aacps.c
+++ b/libavcodec/aacps.c
@@ -2,31 +2,38 @@
  * MPEG-4 Parametric Stereo decoding functions
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Note: Rounding-to-nearest used unless otherwise stated
+ *
  */
 
 #include <stdint.h>
 #include "libavutil/common.h"
-#include "libavutil/internal.h"
 #include "libavutil/mathematics.h"
 #include "avcodec.h"
 #include "get_bits.h"
 #include "aacps.h"
+#if USE_FIXED
+#include "aacps_fixed_tablegen.h"
+#else
+#include "libavutil/internal.h"
 #include "aacps_tablegen.h"
+#endif /* USE_FIXED */
 #include "aacpsdata.c"
 
 #define PS_BASELINE 0  ///< Operate in Baseline PS mode
@@ -148,7 +155,7 @@ static void ipdopd_reset(int8_t *ipd_hist, int8_t *opd_hist)
     }
 }
 
-int ff_ps_read_data(AVCodecContext *avctx, GetBitContext *gb_host, PSContext *ps, int bits_left)
+int AAC_RENAME(ff_ps_read_data)(AVCodecContext *avctx, GetBitContext *gb_host, PSContext *ps, int bits_left)
 {
     int e;
     int bit_count_start = get_bits_count(gb_host);
@@ -236,6 +243,7 @@ int ff_ps_read_data(AVCodecContext *avctx, GetBitContext *gb_host, PSContext *ps
     if (!ps->num_env || ps->border_position[ps->num_env] < numQMFSlots - 1) {
         //Create a fake envelope
         int source = ps->num_env ? ps->num_env - 1 : ps->num_env_old - 1;
+        int b;
         if (source >= 0 && source != ps->num_env) {
             if (ps->enable_iid) {
                 memcpy(ps->iid_par+ps->num_env, ps->iid_par+source, sizeof(ps->iid_par[0]));
@@ -248,6 +256,22 @@ int ff_ps_read_data(AVCodecContext *avctx, GetBitContext *gb_host, PSContext *ps
                 memcpy(ps->opd_par+ps->num_env, ps->opd_par+source, sizeof(ps->opd_par[0]));
             }
         }
+        if (ps->enable_iid){ /* range-check the IID indices of the envelope at num_env */
+            for (b = 0; b < ps->nr_iid_par; b++) {
+                if (FFABS(ps->iid_par[ps->num_env][b]) > 7 + 8 * ps->iid_quant) {
+                    av_log(avctx, AV_LOG_ERROR, "iid_par invalid\n");
+                    goto err;
+                }
+            }
+        }
+        if (ps->enable_icc){ /* range-check the ICC indices of the envelope at num_env */
+            for (b = 0; b < ps->nr_icc_par; b++) { /* ICC loop uses the ICC parameter count */
+                if (ps->icc_par[ps->num_env][b] > 7U) {
+                    av_log(avctx, AV_LOG_ERROR, "icc_par invalid\n");
+                    goto err;
+                }
+            }
+        }
         ps->num_env++;
         ps->border_position[ps->num_env] = numQMFSlots - 1;
     }
@@ -285,35 +309,41 @@ err:
 
 /** Split one subband into 2 subsubbands with a symmetric real filter.
  * The filter must have its non-center even coefficients equal to zero. */
-static void hybrid2_re(float (*in)[2], float (*out)[32][2], const float filter[8], int len, int reverse)
+static void hybrid2_re(INTFLOAT (*in)[2], INTFLOAT (*out)[32][2], const INTFLOAT filter[8], int len, int reverse)
 {
     int i, j;
     for (i = 0; i < len; i++, in++) {
-        float re_in = filter[6] * in[6][0];          //real inphase
-        float re_op = 0.0f;                          //real out of phase
-        float im_in = filter[6] * in[6][1];          //imag inphase
-        float im_op = 0.0f;                          //imag out of phase
+        INT64FLOAT re_in = AAC_MUL31(filter[6], in[6][0]); //real inphase
+        INT64FLOAT re_op = 0.0f;                          //real out of phase
+        INT64FLOAT im_in = AAC_MUL31(filter[6], in[6][1]); //imag inphase
+        INT64FLOAT im_op = 0.0f;                          //imag out of phase
         for (j = 0; j < 6; j += 2) {
-            re_op += filter[j+1] * (in[j+1][0] + in[12-j-1][0]);
-            im_op += filter[j+1] * (in[j+1][1] + in[12-j-1][1]);
+            re_op += (INT64FLOAT)filter[j+1] * (in[j+1][0] + (INT64FLOAT)in[12-j-1][0]); //sum taken in INT64FLOAT so the
+            im_op += (INT64FLOAT)filter[j+1] * (in[j+1][1] + (INT64FLOAT)in[12-j-1][1]); //fixed-point add cannot overflow
         }
-        out[ reverse][i][0] = re_in + re_op;
-        out[ reverse][i][1] = im_in + im_op;
-        out[!reverse][i][0] = re_in - re_op;
-        out[!reverse][i][1] = im_in - im_op;
+
+#if USE_FIXED
+        re_op = (re_op + 0x40000000) >> 31; //round to nearest, then drop the 31 fractional bits
+        im_op = (im_op + 0x40000000) >> 31;
+#endif /* USE_FIXED */
+
+        out[ reverse][i][0] = (INTFLOAT)(re_in + re_op);
+        out[ reverse][i][1] = (INTFLOAT)(im_in + im_op);
+        out[!reverse][i][0] = (INTFLOAT)(re_in - re_op);
+        out[!reverse][i][1] = (INTFLOAT)(im_in - im_op);
     }
 }
 
 /** Split one subband into 6 subsubbands with a complex filter */
-static void hybrid6_cx(PSDSPContext *dsp, float (*in)[2], float (*out)[32][2],
-                       TABLE_CONST float (*filter)[8][2], int len)
+static void hybrid6_cx(PSDSPContext *dsp, INTFLOAT (*in)[2], INTFLOAT (*out)[32][2],
+                       TABLE_CONST INTFLOAT (*filter)[8][2], int len)
 {
     int i;
     int N = 8;
-    LOCAL_ALIGNED_16(float, temp, [8], [2]);
+    LOCAL_ALIGNED_16(INTFLOAT, temp, [8], [2]);
 
     for (i = 0; i < len; i++, in++) {
-        dsp->hybrid_analysis(temp, in, (const float (*)[8][2]) filter, 1, N);
+        dsp->hybrid_analysis(temp, in, (const INTFLOAT (*)[8][2]) filter, 1, N);
         out[0][i][0] = temp[6][0];
         out[0][i][1] = temp[6][1];
         out[1][i][0] = temp[7][0];
@@ -330,18 +360,18 @@ static void hybrid6_cx(PSDSPContext *dsp, float (*in)[2], float (*out)[32][2],
 }
 
 static void hybrid4_8_12_cx(PSDSPContext *dsp,
-                            float (*in)[2], float (*out)[32][2],
-                            TABLE_CONST float (*filter)[8][2], int N, int len)
+                            INTFLOAT (*in)[2], INTFLOAT (*out)[32][2],
+                            TABLE_CONST INTFLOAT (*filter)[8][2], int N, int len)
 {
     int i;
 
     for (i = 0; i < len; i++, in++) {
-        dsp->hybrid_analysis(out[0] + i, in, (const float (*)[8][2]) filter, 32, N);
+        dsp->hybrid_analysis(out[0] + i, in, (const INTFLOAT (*)[8][2]) filter, 32, N);
     }
 }
 
-static void hybrid_analysis(PSDSPContext *dsp, float out[91][32][2],
-                            float in[5][44][2], float L[2][38][64],
+static void hybrid_analysis(PSDSPContext *dsp, INTFLOAT out[91][32][2],
+                            INTFLOAT in[5][44][2], INTFLOAT L[2][38][64],
                             int is34, int len)
 {
     int i, j;
@@ -370,8 +400,8 @@ static void hybrid_analysis(PSDSPContext *dsp, float out[91][32][2],
     }
 }
 
-static void hybrid_synthesis(PSDSPContext *dsp, float out[2][38][64],
-                             float in[91][32][2], int is34, int len)
+static void hybrid_synthesis(PSDSPContext *dsp, INTFLOAT out[2][38][64],
+                             INTFLOAT in[91][32][2], int is34, int len)
 {
     int i, n;
     if (is34) {
@@ -412,9 +442,10 @@ static void hybrid_synthesis(PSDSPContext *dsp, float out[2][38][64],
 }
 
 /// All-pass filter decay slope
-#define DECAY_SLOPE      0.05f
+#define DECAY_SLOPE      Q30(0.05f)
 /// Number of frequency bands that can be addressed by the parameter index, b(k)
 static const int   NR_PAR_BANDS[]      = { 20, 34 };
+static const int   NR_IPDOPD_BANDS[]   = { 11, 17 };
 /// Number of frequency bands that can be addressed by the sub subband index, k
 static const int   NR_BANDS[]          = { 71, 91 };
 /// Start frequency band for the all-pass filter decay slope
@@ -465,28 +496,43 @@ static void map_idx_34_to_20(int8_t *par_mapped, const int8_t *par, int full)
     }
 }
 
-static void map_val_34_to_20(float par[PS_MAX_NR_IIDICC])
+static void map_val_34_to_20(INTFLOAT par[PS_MAX_NR_IIDICC])
 {
+#if USE_FIXED /* (a + b/2) * 1431655765 >> 31 ~= (2a + b) / 3; sums are widened to int64_t before adding */
+    par[ 0] = (int)((((int64_t)par[ 0] + (par[ 1]>>1)) * 1431655765 + \
+                      0x40000000) >> 31);
+    par[ 1] = (int)((((int64_t)(par[ 1]>>1) + par[ 2]) * 1431655765 + \
+                      0x40000000) >> 31);
+    par[ 2] = (int)((((int64_t)par[ 3] + (par[ 4]>>1)) * 1431655765 + \
+                      0x40000000) >> 31);
+    par[ 3] = (int)((((int64_t)(par[ 4]>>1) + par[ 5]) * 1431655765 + \
+                      0x40000000) >> 31);
+#else
     par[ 0] = (2*par[ 0] +   par[ 1]) * 0.33333333f;
     par[ 1] = (  par[ 1] + 2*par[ 2]) * 0.33333333f;
     par[ 2] = (2*par[ 3] +   par[ 4]) * 0.33333333f;
     par[ 3] = (  par[ 4] + 2*par[ 5]) * 0.33333333f;
-    par[ 4] = (  par[ 6] +   par[ 7]) * 0.5f;
-    par[ 5] = (  par[ 8] +   par[ 9]) * 0.5f;
+#endif /* USE_FIXED */
+    par[ 4] = AAC_HALF_SUM(par[ 6], par[ 7]);
+    par[ 5] = AAC_HALF_SUM(par[ 8], par[ 9]);
     par[ 6] =    par[10];
     par[ 7] =    par[11];
-    par[ 8] = (  par[12] +   par[13]) * 0.5f;
-    par[ 9] = (  par[14] +   par[15]) * 0.5f;
+    par[ 8] = AAC_HALF_SUM(par[12], par[13]);
+    par[ 9] = AAC_HALF_SUM(par[14], par[15]);
     par[10] =    par[16];
     par[11] =    par[17];
     par[12] =    par[18];
     par[13] =    par[19];
-    par[14] = (  par[20] +   par[21]) * 0.5f;
-    par[15] = (  par[22] +   par[23]) * 0.5f;
-    par[16] = (  par[24] +   par[25]) * 0.5f;
-    par[17] = (  par[26] +   par[27]) * 0.5f;
+    par[14] = AAC_HALF_SUM(par[20], par[21]);
+    par[15] = AAC_HALF_SUM(par[22], par[23]);
+    par[16] = AAC_HALF_SUM(par[24], par[25]);
+    par[17] = AAC_HALF_SUM(par[26], par[27]);
+#if USE_FIXED /* each term is rounded and quartered before the four are added */
+    par[18] = (((par[28]+2)>>2) + ((par[29]+2)>>2) + ((par[30]+2)>>2) + ((par[31]+2)>>2));
+#else
     par[18] = (  par[28] +   par[29] +   par[30] +   par[31]) * 0.25f;
-    par[19] = (  par[32] +   par[33]) * 0.5f;
+#endif /* USE_FIXED */
+    par[19] = AAC_HALF_SUM(par[32], par[33]);
 }
 
 static void map_idx_10_to_34(int8_t *par_mapped, const int8_t *par, int full)
@@ -571,7 +617,7 @@ static void map_idx_20_to_34(int8_t *par_mapped, const int8_t *par, int full)
     par_mapped[ 0] =  par[ 0];
 }
 
-static void map_val_20_to_34(float par[PS_MAX_NR_IIDICC])
+static void map_val_20_to_34(INTFLOAT par[PS_MAX_NR_IIDICC])
 {
     par[33] =  par[19];
     par[32] =  par[19];
@@ -602,28 +648,29 @@ static void map_val_20_to_34(float par[PS_MAX_NR_IIDICC])
     par[ 7] =  par[ 4];
     par[ 6] =  par[ 4];
     par[ 5] =  par[ 3];
-    par[ 4] = (par[ 2] + par[ 3]) * 0.5f;
+    par[ 4] = AAC_HALF_SUM(par[ 2], par[ 3]);
     par[ 3] =  par[ 2];
     par[ 2] =  par[ 1];
-    par[ 1] = (par[ 0] + par[ 1]) * 0.5f;
-    par[ 0] =  par[ 0];
+    par[ 1] = AAC_HALF_SUM(par[ 0], par[ 1]);
 }
 
-static void decorrelation(PSContext *ps, float (*out)[32][2], const float (*s)[32][2], int is34)
+static void decorrelation(PSContext *ps, INTFLOAT (*out)[32][2], const INTFLOAT (*s)[32][2], int is34)
 {
-    LOCAL_ALIGNED_16(float, power, [34], [PS_QMF_TIME_SLOTS]);
-    LOCAL_ALIGNED_16(float, transient_gain, [34], [PS_QMF_TIME_SLOTS]);
-    float *peak_decay_nrg = ps->peak_decay_nrg;
-    float *power_smooth = ps->power_smooth;
-    float *peak_decay_diff_smooth = ps->peak_decay_diff_smooth;
-    float (*delay)[PS_QMF_TIME_SLOTS + PS_MAX_DELAY][2] = ps->delay;
-    float (*ap_delay)[PS_AP_LINKS][PS_QMF_TIME_SLOTS + PS_MAX_AP_DELAY][2] = ps->ap_delay;
-    const int8_t *k_to_i = is34 ? k_to_i_34 : k_to_i_20;
-    const float peak_decay_factor = 0.76592833836465f;
+    LOCAL_ALIGNED_16(INTFLOAT, power, [34], [PS_QMF_TIME_SLOTS]);
+    LOCAL_ALIGNED_16(INTFLOAT, transient_gain, [34], [PS_QMF_TIME_SLOTS]);
+    INTFLOAT *peak_decay_nrg = ps->peak_decay_nrg;
+    INTFLOAT *power_smooth = ps->power_smooth;
+    INTFLOAT *peak_decay_diff_smooth = ps->peak_decay_diff_smooth;
+    INTFLOAT (*delay)[PS_QMF_TIME_SLOTS + PS_MAX_DELAY][2] = ps->delay;
+    INTFLOAT (*ap_delay)[PS_AP_LINKS][PS_QMF_TIME_SLOTS + PS_MAX_AP_DELAY][2] = ps->ap_delay;
+#if !USE_FIXED
     const float transient_impact  = 1.5f;
     const float a_smooth          = 0.25f; ///< Smoothing coefficient
+#endif /* USE_FIXED */
+    const int8_t *k_to_i = is34 ? k_to_i_34 : k_to_i_20;
     int i, k, m, n;
     int n0 = 0, nL = 32;
+    const INTFLOAT peak_decay_factor = Q31(0.76592833836465f);
 
     memset(power, 0, 34 * sizeof(*power));
 
@@ -641,6 +688,33 @@ static void decorrelation(PSContext *ps, float (*out)[32][2], const float (*s)[3
     }
 
     //Transient detection
+#if USE_FIXED
+    for (i = 0; i < NR_PAR_BANDS[is34]; i++) {
+        for (n = n0; n < nL; n++) {
+            int decayed_peak;
+            int denom;
+
+            decayed_peak = (int)(((int64_t)peak_decay_factor * \
+                                           peak_decay_nrg[i] + 0x40000000) >> 31);
+            peak_decay_nrg[i] = FFMAX(decayed_peak, power[i][n]);
+            power_smooth[i] += (power[i][n] - power_smooth[i] + 2) >> 2;
+            peak_decay_diff_smooth[i] += (peak_decay_nrg[i] - power[i][n] - \
+                                          peak_decay_diff_smooth[i] + 2) >> 2;
+            denom = peak_decay_diff_smooth[i] + (peak_decay_diff_smooth[i] >> 1);
+            if (denom > power_smooth[i]) {
+              int p = power_smooth[i];
+              while (denom < 0x40000000) {
+                denom <<= 1;
+                p <<= 1;
+              }
+              transient_gain[i][n] = p / (denom >> 16);
+            }
+            else {
+              transient_gain[i][n] = 1 << 16;
+            }
+        }
+    }
+#else
     for (i = 0; i < NR_PAR_BANDS[is34]; i++) {
         for (n = n0; n < nL; n++) {
             float decayed_peak = peak_decay_factor * peak_decay_nrg[i];
@@ -654,6 +728,7 @@ static void decorrelation(PSContext *ps, float (*out)[32][2], const float (*s)[3
         }
     }
 
+#endif /* USE_FIXED */
     //Decorrelation and transient reduction
     //                         PS_AP_LINKS - 1
     //                               -----
@@ -664,8 +739,22 @@ static void decorrelation(PSContext *ps, float (*out)[32][2], const float (*s)[3
     //d[k][z] (out) = transient_gain_mapped[k][z] * H[k][z] * s[k][z]
     for (k = 0; k < NR_ALLPASS_BANDS[is34]; k++) {
         int b = k_to_i[k];
+#if USE_FIXED
+        int g_decay_slope;
+
+        if (k - DECAY_CUTOFF[is34] <= 0) {
+          g_decay_slope = 1 << 30;
+        }
+        else if (k - DECAY_CUTOFF[is34] >= 20) {
+          g_decay_slope = 0;
+        }
+        else {
+          g_decay_slope = (1 << 30) - DECAY_SLOPE * (k - DECAY_CUTOFF[is34]);
+        }
+#else
         float g_decay_slope = 1.f - DECAY_SLOPE * (k - DECAY_CUTOFF[is34]);
         g_decay_slope = av_clipf(g_decay_slope, 0.f, 1.f);
+#endif /* USE_FIXED */
         memcpy(delay[k], delay[k]+nL, PS_MAX_DELAY*sizeof(delay[k][0]));
         memcpy(delay[k]+PS_MAX_DELAY, s[k], numQMFSlots*sizeof(delay[k][0]));
         for (m = 0; m < PS_AP_LINKS; m++) {
@@ -673,7 +762,7 @@ static void decorrelation(PSContext *ps, float (*out)[32][2], const float (*s)[3
         }
         ps->dsp.decorrelate(out[k], delay[k] + PS_MAX_DELAY - 2, ap_delay[k],
                             phi_fract[is34][k],
-                            (const float (*)[2]) Q_fract_allpass[is34][k],
+                            (const INTFLOAT (*)[2]) Q_fract_allpass[is34][k],
                             transient_gain[b], g_decay_slope, nL - n0);
     }
     for (; k < SHORT_DELAY_BAND[is34]; k++) {
@@ -732,14 +821,14 @@ static void remap20(int8_t (**p_par_mapped)[PS_MAX_NR_IIDICC],
     }
 }
 
-static void stereo_processing(PSContext *ps, float (*l)[32][2], float (*r)[32][2], int is34)
+static void stereo_processing(PSContext *ps, INTFLOAT (*l)[32][2], INTFLOAT (*r)[32][2], int is34)
 {
     int e, b, k;
 
-    float (*H11)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H11;
-    float (*H12)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H12;
-    float (*H21)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H21;
-    float (*H22)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H22;
+    INTFLOAT (*H11)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H11;
+    INTFLOAT (*H12)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H12;
+    INTFLOAT (*H21)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H21;
+    INTFLOAT (*H22)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H22;
     int8_t *opd_hist = ps->opd_hist;
     int8_t *ipd_hist = ps->ipd_hist;
     int8_t iid_mapped_buf[PS_MAX_NUM_ENV][PS_MAX_NR_IIDICC];
@@ -751,7 +840,7 @@ static void stereo_processing(PSContext *ps, float (*l)[32][2], float (*r)[32][2
     int8_t (*ipd_mapped)[PS_MAX_NR_IIDICC] = ipd_mapped_buf;
     int8_t (*opd_mapped)[PS_MAX_NR_IIDICC] = opd_mapped_buf;
     const int8_t *k_to_i = is34 ? k_to_i_34 : k_to_i_20;
-    TABLE_CONST float (*H_LUT)[8][4] = (PS_BASELINE || ps->icc_mode < 3) ? HA : HB;
+    TABLE_CONST INTFLOAT (*H_LUT)[8][4] = (PS_BASELINE || ps->icc_mode < 3) ? HA : HB;
 
     //Remapping
     if (ps->num_env_old) {
@@ -806,35 +895,36 @@ static void stereo_processing(PSContext *ps, float (*l)[32][2], float (*r)[32][2
     //Mixing
     for (e = 0; e < ps->num_env; e++) {
         for (b = 0; b < NR_PAR_BANDS[is34]; b++) {
-            float h11, h12, h21, h22;
+            INTFLOAT h11, h12, h21, h22;
             h11 = H_LUT[iid_mapped[e][b] + 7 + 23 * ps->iid_quant][icc_mapped[e][b]][0];
             h12 = H_LUT[iid_mapped[e][b] + 7 + 23 * ps->iid_quant][icc_mapped[e][b]][1];
             h21 = H_LUT[iid_mapped[e][b] + 7 + 23 * ps->iid_quant][icc_mapped[e][b]][2];
             h22 = H_LUT[iid_mapped[e][b] + 7 + 23 * ps->iid_quant][icc_mapped[e][b]][3];
-            if (!PS_BASELINE && ps->enable_ipdopd && b < ps->nr_ipdopd_par) {
+
+            if (!PS_BASELINE && ps->enable_ipdopd && b < NR_IPDOPD_BANDS[is34]) {
                 //The spec say says to only run this smoother when enable_ipdopd
                 //is set but the reference decoder appears to run it constantly
-                float h11i, h12i, h21i, h22i;
-                float ipd_adj_re, ipd_adj_im;
+                INTFLOAT h11i, h12i, h21i, h22i;
+                INTFLOAT ipd_adj_re, ipd_adj_im;
                 int opd_idx = opd_hist[b] * 8 + opd_mapped[e][b];
                 int ipd_idx = ipd_hist[b] * 8 + ipd_mapped[e][b];
-                float opd_re = pd_re_smooth[opd_idx];
-                float opd_im = pd_im_smooth[opd_idx];
-                float ipd_re = pd_re_smooth[ipd_idx];
-                float ipd_im = pd_im_smooth[ipd_idx];
+                INTFLOAT opd_re = pd_re_smooth[opd_idx];
+                INTFLOAT opd_im = pd_im_smooth[opd_idx];
+                INTFLOAT ipd_re = pd_re_smooth[ipd_idx];
+                INTFLOAT ipd_im = pd_im_smooth[ipd_idx];
                 opd_hist[b] = opd_idx & 0x3F;
                 ipd_hist[b] = ipd_idx & 0x3F;
 
-                ipd_adj_re = opd_re*ipd_re + opd_im*ipd_im;
-                ipd_adj_im = opd_im*ipd_re - opd_re*ipd_im;
-                h11i = h11 * opd_im;
-                h11  = h11 * opd_re;
-                h12i = h12 * ipd_adj_im;
-                h12  = h12 * ipd_adj_re;
-                h21i = h21 * opd_im;
-                h21  = h21 * opd_re;
-                h22i = h22 * ipd_adj_im;
-                h22  = h22 * ipd_adj_re;
+                ipd_adj_re = AAC_MADD30(opd_re, ipd_re, opd_im, ipd_im);
+                ipd_adj_im = AAC_MSUB30(opd_im, ipd_re, opd_re, ipd_im);
+                h11i = AAC_MUL30(h11,  opd_im);
+                h11  = AAC_MUL30(h11,  opd_re);
+                h12i = AAC_MUL30(h12,  ipd_adj_im);
+                h12  = AAC_MUL30(h12,  ipd_adj_re);
+                h21i = AAC_MUL30(h21,  opd_im);
+                h21  = AAC_MUL30(h21,  opd_re);
+                h22i = AAC_MUL30(h22,  ipd_adj_im);
+                h22  = AAC_MUL30(h22,  ipd_adj_re);
                 H11[1][e+1][b] = h11i;
                 H12[1][e+1][b] = h12i;
                 H21[1][e+1][b] = h21i;
@@ -846,11 +936,14 @@ static void stereo_processing(PSContext *ps, float (*l)[32][2], float (*r)[32][2
             H22[0][e+1][b] = h22;
         }
         for (k = 0; k < NR_BANDS[is34]; k++) {
-            float h[2][4];
-            float h_step[2][4];
+            LOCAL_ALIGNED_16(INTFLOAT, h, [2], [4]);
+            LOCAL_ALIGNED_16(INTFLOAT, h_step, [2], [4]);
             int start = ps->border_position[e];
             int stop  = ps->border_position[e+1];
-            float width = 1.f / (stop - start);
+            INTFLOAT width = Q30(1.f) / ((stop - start) ? (stop - start) : 1);
+#if USE_FIXED
+            width <<= 1;
+#endif
             b = k_to_i[k];
             h[0][0] = H11[0][e][b];
             h[0][1] = H12[0][e][b];
@@ -871,15 +964,15 @@ static void stereo_processing(PSContext *ps, float (*l)[32][2], float (*r)[32][2
             }
             }
             //Interpolation
-            h_step[0][0] = (H11[0][e+1][b] - h[0][0]) * width;
-            h_step[0][1] = (H12[0][e+1][b] - h[0][1]) * width;
-            h_step[0][2] = (H21[0][e+1][b] - h[0][2]) * width;
-            h_step[0][3] = (H22[0][e+1][b] - h[0][3]) * width;
+            h_step[0][0] = AAC_MSUB31_V3(H11[0][e+1][b], h[0][0], width);
+            h_step[0][1] = AAC_MSUB31_V3(H12[0][e+1][b], h[0][1], width);
+            h_step[0][2] = AAC_MSUB31_V3(H21[0][e+1][b], h[0][2], width);
+            h_step[0][3] = AAC_MSUB31_V3(H22[0][e+1][b], h[0][3], width);
             if (!PS_BASELINE && ps->enable_ipdopd) {
-                h_step[1][0] = (H11[1][e+1][b] - h[1][0]) * width;
-                h_step[1][1] = (H12[1][e+1][b] - h[1][1]) * width;
-                h_step[1][2] = (H21[1][e+1][b] - h[1][2]) * width;
-                h_step[1][3] = (H22[1][e+1][b] - h[1][3]) * width;
+                h_step[1][0] = AAC_MSUB31_V3(H11[1][e+1][b], h[1][0], width);
+                h_step[1][1] = AAC_MSUB31_V3(H12[1][e+1][b], h[1][1], width);
+                h_step[1][2] = AAC_MSUB31_V3(H21[1][e+1][b], h[1][2], width);
+                h_step[1][3] = AAC_MSUB31_V3(H22[1][e+1][b], h[1][3], width);
             }
             ps->dsp.stereo_interpolate[!PS_BASELINE && ps->enable_ipdopd](
                 l[k] + start + 1, r[k] + start + 1,
@@ -888,10 +981,10 @@ static void stereo_processing(PSContext *ps, float (*l)[32][2], float (*r)[32][2
     }
 }
 
-int ff_ps_apply(AVCodecContext *avctx, PSContext *ps, float L[2][38][64], float R[2][38][64], int top)
+int AAC_RENAME(ff_ps_apply)(AVCodecContext *avctx, PSContext *ps, INTFLOAT L[2][38][64], INTFLOAT R[2][38][64], int top)
 {
-    LOCAL_ALIGNED_16(float, Lbuf, [91], [32][2]);
-    LOCAL_ALIGNED_16(float, Rbuf, [91], [32][2]);
+    INTFLOAT (*Lbuf)[32][2] = ps->Lbuf;
+    INTFLOAT (*Rbuf)[32][2] = ps->Rbuf;
     const int len = 32;
     int is34 = ps->is34bands;
 
@@ -901,7 +994,7 @@ int ff_ps_apply(AVCodecContext *avctx, PSContext *ps, float L[2][38][64], float
         memset(ps->ap_delay + top, 0, (NR_ALLPASS_BANDS[is34] - top)*sizeof(ps->ap_delay[0]));
 
     hybrid_analysis(&ps->dsp, Lbuf, ps->in_buf, L, is34, len);
-    decorrelation(ps, Rbuf, (const float (*)[32][2]) Lbuf, is34);
+    decorrelation(ps, Rbuf, (const INTFLOAT (*)[32][2]) Lbuf, is34);
     stereo_processing(ps, Lbuf, Rbuf, is34);
     hybrid_synthesis(&ps->dsp, L, Lbuf, is34, len);
     hybrid_synthesis(&ps->dsp, R, Rbuf, is34, len);
@@ -918,7 +1011,7 @@ int ff_ps_apply(AVCodecContext *avctx, PSContext *ps, float L[2][38][64], float
 #define PS_VLC_ROW(name) \
     { name ## _codes, name ## _bits, sizeof(name ## _codes), sizeof(name ## _codes[0]) }
 
-av_cold void ff_ps_init(void) {
+av_cold void AAC_RENAME(ff_ps_init)(void) {
     // Syntax initialization
     static const struct {
         const void *ps_codes, *ps_bits;
@@ -950,7 +1043,7 @@ av_cold void ff_ps_init(void) {
     ps_tableinit();
 }
 
-av_cold void ff_ps_ctx_init(PSContext *ps)
+av_cold void AAC_RENAME(ff_ps_ctx_init)(PSContext *ps)
 {
-    ff_psdsp_init(&ps->dsp);
+    AAC_RENAME(ff_psdsp_init)(&ps->dsp);
 }
diff --git a/libavcodec/aacps.h b/libavcodec/aacps.h
index e8a195a..61edce3 100644
--- a/libavcodec/aacps.h
+++ b/libavcodec/aacps.h
@@ -2,25 +2,25 @@
  * MPEG-4 Parametric Stereo definitions and declarations
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVCODEC_PS_H
-#define AVCODEC_PS_H
+#ifndef AVCODEC_AACPS_H
+#define AVCODEC_AACPS_H
 
 #include <stdint.h>
 
@@ -61,24 +61,26 @@ typedef struct PSContext {
     int    is34bands;
     int    is34bands_old;
 
-    DECLARE_ALIGNED(16, float, in_buf)[5][44][2];
-    DECLARE_ALIGNED(16, float, delay)[PS_MAX_SSB][PS_QMF_TIME_SLOTS + PS_MAX_DELAY][2];
-    DECLARE_ALIGNED(16, float, ap_delay)[PS_MAX_AP_BANDS][PS_AP_LINKS][PS_QMF_TIME_SLOTS + PS_MAX_AP_DELAY][2];
-    DECLARE_ALIGNED(16, float, peak_decay_nrg)[34];
-    DECLARE_ALIGNED(16, float, power_smooth)[34];
-    DECLARE_ALIGNED(16, float, peak_decay_diff_smooth)[34];
-    DECLARE_ALIGNED(16, float, H11)[2][PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC];
-    DECLARE_ALIGNED(16, float, H12)[2][PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC];
-    DECLARE_ALIGNED(16, float, H21)[2][PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC];
-    DECLARE_ALIGNED(16, float, H22)[2][PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC];
+    DECLARE_ALIGNED(16, INTFLOAT, in_buf)[5][44][2];
+    DECLARE_ALIGNED(16, INTFLOAT, delay)[PS_MAX_SSB][PS_QMF_TIME_SLOTS + PS_MAX_DELAY][2];
+    DECLARE_ALIGNED(16, INTFLOAT, ap_delay)[PS_MAX_AP_BANDS][PS_AP_LINKS][PS_QMF_TIME_SLOTS + PS_MAX_AP_DELAY][2];
+    DECLARE_ALIGNED(16, INTFLOAT, peak_decay_nrg)[34];
+    DECLARE_ALIGNED(16, INTFLOAT, power_smooth)[34];
+    DECLARE_ALIGNED(16, INTFLOAT, peak_decay_diff_smooth)[34];
+    DECLARE_ALIGNED(16, INTFLOAT, H11)[2][PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC];
+    DECLARE_ALIGNED(16, INTFLOAT, H12)[2][PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC];
+    DECLARE_ALIGNED(16, INTFLOAT, H21)[2][PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC];
+    DECLARE_ALIGNED(16, INTFLOAT, H22)[2][PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC];
+    DECLARE_ALIGNED(16, INTFLOAT, Lbuf)[91][32][2];
+    DECLARE_ALIGNED(16, INTFLOAT, Rbuf)[91][32][2];
     int8_t opd_hist[PS_MAX_NR_IIDICC];
     int8_t ipd_hist[PS_MAX_NR_IIDICC];
     PSDSPContext dsp;
 } PSContext;
 
-void ff_ps_init(void);
-void ff_ps_ctx_init(PSContext *ps);
-int ff_ps_read_data(AVCodecContext *avctx, GetBitContext *gb, PSContext *ps, int bits_left);
-int ff_ps_apply(AVCodecContext *avctx, PSContext *ps, float L[2][38][64], float R[2][38][64], int top);
+void AAC_RENAME(ff_ps_init)(void);
+void AAC_RENAME(ff_ps_ctx_init)(PSContext *ps);
+int AAC_RENAME(ff_ps_read_data)(AVCodecContext *avctx, GetBitContext *gb, PSContext *ps, int bits_left);
+int AAC_RENAME(ff_ps_apply)(AVCodecContext *avctx, PSContext *ps, INTFLOAT L[2][38][64], INTFLOAT R[2][38][64], int top);
 
-#endif /* AVCODEC_PS_H */
+#endif /* AVCODEC_AACPS_H */
diff --git a/libavcodec/x86/audiodsp.h b/libavcodec/aacps_fixed.c
index 321056b..46af213 100644
--- a/libavcodec/x86/audiodsp.h
+++ b/libavcodec/aacps_fixed.c
@@ -1,25 +1,24 @@
 /*
- * This file is part of Libav.
+ * MPEG-4 Parametric Stereo decoding functions
+ * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVCODEC_X86_AUDIODSP_H
-#define AVCODEC_X86_AUDIODSP_H
-
-void ff_vector_clipf_sse(float *dst, const float *src,
-                         float min, float max, int len);
+#define USE_FIXED 1
 
-#endif /* AVCODEC_X86_AUDIODSP_H */
+#include "aacps.c"
diff --git a/libavcodec/aac_tablegen.c b/libavcodec/aacps_fixed_tablegen.c
index b2c6c95..9e30699 100644
--- a/libavcodec/aac_tablegen.c
+++ b/libavcodec/aacps_fixed_tablegen.c
@@ -1,37 +1,24 @@
 /*
- * Generate a header file for hardcoded AAC tables
+ * Generate a header file for hardcoded Parametric Stereo tables
  *
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <stdlib.h>
-#define CONFIG_HARDCODED_TABLES 0
-#include "aac_tablegen.h"
-#include "tableprint.h"
-
-int main(void)
-{
-    ff_aac_tableinit();
-
-    write_fileheader();
-
-    WRITE_ARRAY("const", float, ff_aac_pow2sf_tab);
-
-    return 0;
-}
+#define USE_FIXED 1
+#include "aacps_tablegen_template.c"
diff --git a/libavcodec/aacps_fixed_tablegen.h b/libavcodec/aacps_fixed_tablegen.h
new file mode 100644
index 0000000..8b82deb
--- /dev/null
+++ b/libavcodec/aacps_fixed_tablegen.h
@@ -0,0 +1,403 @@
+/*
+ * Header file for hardcoded Parametric Stereo tables
+ *
+ * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Note: Rounding-to-nearest used unless otherwise stated
+ *
+ */
+
+#ifndef AVCODEC_AACPS_FIXED_TABLEGEN_H
+#define AVCODEC_AACPS_FIXED_TABLEGEN_H
+
+#include <math.h>
+#include <stdint.h>
+
+#if CONFIG_HARDCODED_TABLES
+#define ps_tableinit()
+#define TABLE_CONST const
+#include "libavcodec/aacps_fixed_tables.h"
+#else
+#include "libavutil/common.h"
+#include "libavutil/mathematics.h"
+#include "libavutil/mem.h"
+
+#include "aac_defines.h"
+#include "libavutil/softfloat.h"
+#define NR_ALLPASS_BANDS20 30
+#define NR_ALLPASS_BANDS34 50
+#define PS_AP_LINKS 3
+#define TABLE_CONST
+static int pd_re_smooth[8*8*8];
+static int pd_im_smooth[8*8*8];
+static int HA[46][8][4];
+static int HB[46][8][4];
+static DECLARE_ALIGNED(16, int, f20_0_8) [ 8][8][2];
+static DECLARE_ALIGNED(16, int, f34_0_12)[12][8][2];
+static DECLARE_ALIGNED(16, int, f34_1_8) [ 8][8][2];
+static DECLARE_ALIGNED(16, int, f34_2_4) [ 4][8][2];
+static TABLE_CONST DECLARE_ALIGNED(16, int, Q_fract_allpass)[2][50][3][2];
+static DECLARE_ALIGNED(16, int, phi_fract)[2][50][2];
+
+static const int g0_Q8[] = {
+    Q31(0.00746082949812f), Q31(0.02270420949825f), Q31(0.04546865930473f), Q31(0.07266113929591f),
+    Q31(0.09885108575264f), Q31(0.11793710567217f), Q31(0.125f)
+};
+
+static const int g0_Q12[] = {
+    Q31(0.04081179924692f), Q31(0.03812810994926f), Q31(0.05144908135699f), Q31(0.06399831151592f),
+    Q31(0.07428313801106f), Q31(0.08100347892914f), Q31(0.08333333333333f)
+};
+
+static const int g1_Q8[] = {
+    Q31(0.01565675600122f), Q31(0.03752716391991f), Q31(0.05417891378782f), Q31(0.08417044116767f),
+    Q31(0.10307344158036f), Q31(0.12222452249753f), Q31(0.125f)
+};
+
+static const int g2_Q4[] = {
+    Q31(-0.05908211155639f), Q31(-0.04871498374946f), Q31(0.0f),   Q31(0.07778723915851f),
+    Q31( 0.16486303567403f), Q31( 0.23279856662996f), Q31(0.25f)
+};
+
+static const int sintbl_4[4]   = {           0,  1073741824,           0, -1073741824 }; /* sin(2*pi*k/4)  * 2^30, k = 0..3 */
+static const int costbl_4[4]   = {  1073741824,           0, -1073741824,           0 }; /* cos(2*pi*k/4)  * 2^30, k = 0..3 */
+static const int sintbl_8[8]   = {           0,   759250125,  1073741824,   759250125,
+                                             0,  -759250125, -1073741824,  -759250125 }; /* sin(2*pi*k/8)  * 2^30, k = 0..7 */
+static const int costbl_8[8]   = {  1073741824,   759250125,           0,  -759250125,
+                                   -1073741824,  -759250125,           0,   759250125 }; /* cos(2*pi*k/8)  * 2^30, k = 0..7 */
+static const int sintbl_12[12] = {           0,   536870912,   929887697,  1073741824,
+                                     929887697,   536870912,           0,  -536870912,
+                                    -929887697, -1073741824,  -929887697,  -536870912 }; /* sin(2*pi*k/12) * 2^30, k = 0..11 */
+static const int costbl_12[12] = {  1073741824,   929887697,   536870912,           0,
+                                    -536870912,  -929887697, -1073741824,  -929887697,
+                                    -536870912,           0,   536870912,   929887697 }; /* cos(2*pi*k/12) * 2^30, k = 0..11 */
+
+static void make_filters_from_proto(int (*filter)[8][2], const int *proto, int bands)
+{
+    /* Fill filter[q][n][0..1] for q < bands, n < 7 with proto[n] times a unit rotation (2^30 scaled); filter[q][7] is not written here. */
+    const int *sinptr, *cosptr;
+    int s, c, sinhalf, coshalf;
+    int q, n;
+    /* Pick the sin/cos table for this band count; sinhalf/coshalf are sin/cos(pi/bands) * 2^30, i.e. half of one table step. */
+    if (bands == 4) {
+        sinptr = sintbl_4;
+        cosptr = costbl_4;
+        sinhalf = 759250125;
+        coshalf = 759250125;
+    } else if (bands == 8) {
+        sinptr = sintbl_8;
+        cosptr = costbl_8;
+        sinhalf = 410903207;
+        coshalf = 992008094;
+    } else { /* any other value uses the 12-entry tables; presumably callers only pass 12 here -- TODO confirm */
+        sinptr = sintbl_12;
+        cosptr = costbl_12;
+        sinhalf = 277904834;
+        coshalf = 1037154959;
+    }
+    /* The angle is (q + 1/2) * (n - 6) table steps: the integer part indexes the table, the odd-n half step is applied below. */
+    for (q = 0; q < bands; q++) {
+        for (n = 0; n < 7; n++) {
+            int theta = (q*(n-6) + (n>>1) - 3) % bands;
+
+            if (theta < 0) /* C's % keeps the sign of the dividend, so wrap negative remainders into [0, bands) */
+                theta += bands;
+            s = sinptr[theta];
+            c = cosptr[theta];
+
+            if (n & 1) { /* odd tap: rotate (c, s) by the extra half step using the angle-addition formulas */
+                theta = (int)(((int64_t)c * coshalf - (int64_t)s * sinhalf + 0x20000000) >> 30); /* theta reused as a temporary for the new cosine */
+                s = (int)(((int64_t)s * coshalf + (int64_t)c * sinhalf + 0x20000000) >> 30);
+                c = theta;
+            }
+            filter[q][n][0] = (int)(((int64_t)proto[n] * c + 0x20000000) >> 30); /* 64-bit product, + 2^29 then >> 30: rounded 2^30-scaled multiply */
+            filter[q][n][1] = -(int)(((int64_t)proto[n] * s + 0x20000000) >> 30); /* negation is applied after rounding */
+        }
+    }
+}
+
+static void ps_tableinit(void)
+{
+    static const int ipdopd_sin[] = { Q30(0), Q30(M_SQRT1_2), Q30(1), Q30( M_SQRT1_2), Q30( 0), Q30(-M_SQRT1_2), Q30(-1), Q30(-M_SQRT1_2) };
+    static const int ipdopd_cos[] = { Q30(1), Q30(M_SQRT1_2), Q30(0), Q30(-M_SQRT1_2), Q30(-1), Q30(-M_SQRT1_2), Q30( 0), Q30( M_SQRT1_2) };
+    int pd0, pd1, pd2;
+    int idx;
+
+    static const int alpha_tab[] =
+    {
+      Q30(1.5146213770f/M_PI), Q30(1.5181334019f/M_PI), Q30(1.5234849453f/M_PI), Q30(1.5369486809f/M_PI), Q30(1.5500687361f/M_PI), Q30(1.5679757595f/M_PI),
+      Q30(1.4455626011f/M_PI), Q30(1.4531552792f/M_PI), Q30(1.4648091793f/M_PI), Q30(1.4945238829f/M_PI), Q30(1.5239057541f/M_PI), Q30(1.5644006729f/M_PI),
+      Q30(1.3738563061f/M_PI), Q30(1.3851221800f/M_PI), Q30(1.4026404619f/M_PI), Q30(1.4484288692f/M_PI), Q30(1.4949874878f/M_PI), Q30(1.5604078770f/M_PI),
+      Q30(1.2645189762f/M_PI), Q30(1.2796478271f/M_PI), Q30(1.3038636446f/M_PI), Q30(1.3710125685f/M_PI), Q30(1.4443849325f/M_PI), Q30(1.5532352924f/M_PI),
+      Q30(1.1507037878f/M_PI), Q30(1.1669205427f/M_PI), Q30(1.1938756704f/M_PI), Q30(1.2754167318f/M_PI), Q30(1.3761177063f/M_PI), Q30(1.5429240465f/M_PI),
+      Q30(1.0079245567f/M_PI), Q30(1.0208238363f/M_PI), Q30(1.0433073044f/M_PI), Q30(1.1208510399f/M_PI), Q30(1.2424604893f/M_PI), Q30(1.5185726881f/M_PI),
+      Q30(0.8995233774f/M_PI), Q30(0.9069069624f/M_PI), Q30(0.9201194048f/M_PI), Q30(0.9698365927f/M_PI), Q30(1.0671583414f/M_PI), Q30(1.4647934437f/M_PI),
+      Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI),
+      Q30(0.6712729335f/M_PI), Q30(0.6638893485f/M_PI), Q30(0.6506769061f/M_PI), Q30(0.6009597182f/M_PI), Q30(0.5036380291f/M_PI), Q30(0.1060028747f/M_PI),
+      Q30(0.5628717542f/M_PI), Q30(0.5499725342f/M_PI), Q30(0.5274890065f/M_PI), Q30(0.4499453008f/M_PI), Q30(0.3283358216f/M_PI), Q30(0.0522236861f/M_PI),
+      Q30(0.4200925827f/M_PI), Q30(0.4038758278f/M_PI), Q30(0.3769206405f/M_PI), Q30(0.2953795493f/M_PI), Q30(0.1946786791f/M_PI), Q30(0.0278722942f/M_PI),
+      Q30(0.3062773645f/M_PI), Q30(0.2911485136f/M_PI), Q30(0.2669326365f/M_PI), Q30(0.1997837722f/M_PI), Q30(0.1264114529f/M_PI), Q30(0.0175609849f/M_PI),
+      Q30(0.1969399750f/M_PI), Q30(0.1856741160f/M_PI), Q30(0.1681558639f/M_PI), Q30(0.1223674342f/M_PI), Q30(0.0758088827f/M_PI), Q30(0.0103884479f/M_PI),
+      Q30(0.1252337098f/M_PI), Q30(0.1176410317f/M_PI), Q30(0.1059871912f/M_PI), Q30(0.0762724727f/M_PI), Q30(0.0468905345f/M_PI), Q30(0.0063956482f/M_PI),
+      Q30(0.0561749674f/M_PI), Q30(0.0526629239f/M_PI), Q30(0.0473113805f/M_PI), Q30(0.0338476151f/M_PI), Q30(0.0207276177f/M_PI), Q30(0.0028205961f/M_PI),
+      Q30(1.5676341057f/M_PI), Q30(1.5678333044f/M_PI), Q30(1.5681363344f/M_PI), Q30(1.5688960552f/M_PI), Q30(1.5696337223f/M_PI), Q30(1.5706381798f/M_PI),
+      Q30(1.5651730299f/M_PI), Q30(1.5655272007f/M_PI), Q30(1.5660660267f/M_PI), Q30(1.5674170256f/M_PI), Q30(1.5687289238f/M_PI), Q30(1.5705151558f/M_PI),
+      Q30(1.5607966185f/M_PI), Q30(1.5614265203f/M_PI), Q30(1.5623844862f/M_PI), Q30(1.5647867918f/M_PI), Q30(1.5671195984f/M_PI), Q30(1.5702962875f/M_PI),
+      Q30(1.5530153513f/M_PI), Q30(1.5541347265f/M_PI), Q30(1.5558375120f/M_PI), Q30(1.5601085424f/M_PI), Q30(1.5642569065f/M_PI), Q30(1.5699069500f/M_PI),
+      Q30(1.5391840935f/M_PI), Q30(1.5411708355f/M_PI), Q30(1.5441943407f/M_PI), Q30(1.5517836809f/M_PI), Q30(1.5591609478f/M_PI), Q30(1.5692136288f/M_PI),
+      Q30(1.5146213770f/M_PI), Q30(1.5181334019f/M_PI), Q30(1.5234849453f/M_PI), Q30(1.5369486809f/M_PI), Q30(1.5500687361f/M_PI), Q30(1.5679757595f/M_PI),
+      Q30(1.4915299416f/M_PI), Q30(1.4964480400f/M_PI), Q30(1.5039558411f/M_PI), Q30(1.5229074955f/M_PI), Q30(1.5414420366f/M_PI), Q30(1.5667995214f/M_PI),
+      Q30(1.4590617418f/M_PI), Q30(1.4658898115f/M_PI), Q30(1.4763505459f/M_PI), Q30(1.5029321909f/M_PI), Q30(1.5291173458f/M_PI), Q30(1.5651149750f/M_PI),
+      Q30(1.4136143923f/M_PI), Q30(1.4229322672f/M_PI), Q30(1.4373078346f/M_PI), Q30(1.4743183851f/M_PI), Q30(1.5113102198f/M_PI), Q30(1.5626684427f/M_PI),
+      Q30(1.3505556583f/M_PI), Q30(1.3628427982f/M_PI), Q30(1.3820509911f/M_PI), Q30(1.4327841997f/M_PI), Q30(1.4850014448f/M_PI), Q30(1.5590143204f/M_PI),
+      Q30(1.2645189762f/M_PI), Q30(1.2796478271f/M_PI), Q30(1.3038636446f/M_PI), Q30(1.3710125685f/M_PI), Q30(1.4443849325f/M_PI), Q30(1.5532352924f/M_PI),
+      Q30(1.1919227839f/M_PI), Q30(1.2081253529f/M_PI), Q30(1.2346779108f/M_PI), Q30(1.3123005629f/M_PI), Q30(1.4034168720f/M_PI), Q30(1.5471596718f/M_PI),
+      Q30(1.1061993837f/M_PI), Q30(1.1219338179f/M_PI), Q30(1.1484941244f/M_PI), Q30(1.2320860624f/M_PI), Q30(1.3421301842f/M_PI), Q30(1.5373806953f/M_PI),
+      Q30(1.0079245567f/M_PI), Q30(1.0208238363f/M_PI), Q30(1.0433073044f/M_PI), Q30(1.1208510399f/M_PI), Q30(1.2424604893f/M_PI), Q30(1.5185726881f/M_PI),
+      Q30(0.8995233774f/M_PI), Q30(0.9069069624f/M_PI), Q30(0.9201194048f/M_PI), Q30(0.9698365927f/M_PI), Q30(1.0671583414f/M_PI), Q30(1.4647934437f/M_PI),
+      Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI), Q30(0.7853981853f/M_PI),
+      Q30(0.6712729335f/M_PI), Q30(0.6638893485f/M_PI), Q30(0.6506769061f/M_PI), Q30(0.6009597182f/M_PI), Q30(0.5036380291f/M_PI), Q30(0.1060028747f/M_PI),
+      Q30(0.5628717542f/M_PI), Q30(0.5499725342f/M_PI), Q30(0.5274890065f/M_PI), Q30(0.4499453008f/M_PI), Q30(0.3283358216f/M_PI), Q30(0.0522236861f/M_PI),
+      Q30(0.4645969570f/M_PI), Q30(0.4488625824f/M_PI), Q30(0.4223022461f/M_PI), Q30(0.3387103081f/M_PI), Q30(0.2286661267f/M_PI), Q30(0.0334156826f/M_PI),
+      Q30(0.3788735867f/M_PI), Q30(0.3626709878f/M_PI), Q30(0.3361184299f/M_PI), Q30(0.2584958076f/M_PI), Q30(0.1673794836f/M_PI), Q30(0.0236366931f/M_PI),
+      Q30(0.3062773645f/M_PI), Q30(0.2911485136f/M_PI), Q30(0.2669326365f/M_PI), Q30(0.1997837722f/M_PI), Q30(0.1264114529f/M_PI), Q30(0.0175609849f/M_PI),
+      Q30(0.2202406377f/M_PI), Q30(0.2079535723f/M_PI), Q30(0.1887452900f/M_PI), Q30(0.1380121708f/M_PI), Q30(0.0857949182f/M_PI), Q30(0.0117820343f/M_PI),
+      Q30(0.1571819335f/M_PI), Q30(0.1478640437f/M_PI), Q30(0.1334884763f/M_PI), Q30(0.0964778885f/M_PI), Q30(0.0594860613f/M_PI), Q30(0.0081279324f/M_PI),
+      Q30(0.1117345318f/M_PI), Q30(0.1049065739f/M_PI), Q30(0.0944457650f/M_PI), Q30(0.0678641573f/M_PI), Q30(0.0416790098f/M_PI), Q30(0.0056813755f/M_PI),
+      Q30(0.0792663917f/M_PI), Q30(0.0743482932f/M_PI), Q30(0.0668405443f/M_PI), Q30(0.0478888862f/M_PI), Q30(0.0293543357f/M_PI), Q30(0.0039967746f/M_PI),
+      Q30(0.0561749674f/M_PI), Q30(0.0526629239f/M_PI), Q30(0.0473113805f/M_PI), Q30(0.0338476151f/M_PI), Q30(0.0207276177f/M_PI), Q30(0.0028205961f/M_PI),
+      Q30(0.0316122435f/M_PI), Q30(0.0296254847f/M_PI), Q30(0.0266019460f/M_PI), Q30(0.0190126132f/M_PI), Q30(0.0116353342f/M_PI), Q30(0.0015827164f/M_PI),
+      Q30(0.0177809205f/M_PI), Q30(0.0166615788f/M_PI), Q30(0.0149587989f/M_PI), Q30(0.0106877899f/M_PI), Q30(0.0065393616f/M_PI), Q30(0.0008894200f/M_PI),
+      Q30(0.0099996664f/M_PI), Q30(0.0093698399f/M_PI), Q30(0.0084118480f/M_PI), Q30(0.0060095116f/M_PI), Q30(0.0036767013f/M_PI), Q30(0.0005000498f/M_PI),
+      Q30(0.0056233541f/M_PI), Q30(0.0052691097f/M_PI), Q30(0.0047303112f/M_PI), Q30(0.0033792770f/M_PI), Q30(0.0020674451f/M_PI), Q30(0.0002811795f/M_PI),
+      Q30(0.0031622672f/M_PI), Q30(0.0029630491f/M_PI), Q30(0.0026600463f/M_PI), Q30(0.0019002859f/M_PI), Q30(0.0011625893f/M_PI), Q30(0.0001581155f/M_PI)
+    };
+
+    static const int gamma_tab[] =
+    {
+      Q30(0.0000000000f/M_PI), Q30(0.0195873566f/M_PI), Q30(0.0303316917f/M_PI), Q30(0.0448668823f/M_PI), Q30(0.0522258915f/M_PI), Q30(0.0561044961f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0433459543f/M_PI), Q30(0.0672172382f/M_PI), Q30(0.0997167900f/M_PI), Q30(0.1162951663f/M_PI), Q30(0.1250736862f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0672341362f/M_PI), Q30(0.1045235619f/M_PI), Q30(0.1558904350f/M_PI), Q30(0.1824723780f/M_PI), Q30(0.1966800541f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1011129096f/M_PI), Q30(0.1580764502f/M_PI), Q30(0.2387557179f/M_PI), Q30(0.2820728719f/M_PI), Q30(0.3058380187f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1315985769f/M_PI), Q30(0.2072522491f/M_PI), Q30(0.3188187480f/M_PI), Q30(0.3825501204f/M_PI), Q30(0.4193951190f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1603866369f/M_PI), Q30(0.2549437582f/M_PI), Q30(0.4029446840f/M_PI), Q30(0.4980689585f/M_PI), Q30(0.5615641475f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1736015975f/M_PI), Q30(0.2773745656f/M_PI), Q30(0.4461984038f/M_PI), Q30(0.5666890144f/M_PI), Q30(0.6686112881f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1784276664f/M_PI), Q30(0.2856673002f/M_PI), Q30(0.4630723596f/M_PI), Q30(0.5971632004f/M_PI), Q30(0.7603877187f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1736015975f/M_PI), Q30(0.2773745656f/M_PI), Q30(0.4461984038f/M_PI), Q30(0.5666890144f/M_PI), Q30(0.6686112881f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1603866369f/M_PI), Q30(0.2549437582f/M_PI), Q30(0.4029446840f/M_PI), Q30(0.4980689585f/M_PI), Q30(0.5615641475f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1315985769f/M_PI), Q30(0.2072522491f/M_PI), Q30(0.3188187480f/M_PI), Q30(0.3825501204f/M_PI), Q30(0.4193951190f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1011129096f/M_PI), Q30(0.1580764502f/M_PI), Q30(0.2387557179f/M_PI), Q30(0.2820728719f/M_PI), Q30(0.3058380187f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0672341362f/M_PI), Q30(0.1045235619f/M_PI), Q30(0.1558904350f/M_PI), Q30(0.1824723780f/M_PI), Q30(0.1966800541f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0433459543f/M_PI), Q30(0.0672172382f/M_PI), Q30(0.0997167900f/M_PI), Q30(0.1162951663f/M_PI), Q30(0.1250736862f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0195873566f/M_PI), Q30(0.0303316917f/M_PI), Q30(0.0448668823f/M_PI), Q30(0.0522258915f/M_PI), Q30(0.0561044961f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0011053939f/M_PI), Q30(0.0017089852f/M_PI), Q30(0.0025254129f/M_PI), Q30(0.0029398468f/M_PI), Q30(0.0031597170f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0019607407f/M_PI), Q30(0.0030395309f/M_PI), Q30(0.0044951206f/M_PI), Q30(0.0052305623f/M_PI), Q30(0.0056152637f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0034913034f/M_PI), Q30(0.0054070661f/M_PI), Q30(0.0079917293f/M_PI), Q30(0.0092999367f/M_PI), Q30(0.0099875759f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0062100487f/M_PI), Q30(0.0096135242f/M_PI), Q30(0.0142110568f/M_PI), Q30(0.0165348612f/M_PI), Q30(0.0177587029f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0110366223f/M_PI), Q30(0.0170863140f/M_PI), Q30(0.0252620988f/M_PI), Q30(0.0293955617f/M_PI), Q30(0.0315726399f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0195873566f/M_PI), Q30(0.0303316917f/M_PI), Q30(0.0448668823f/M_PI), Q30(0.0522258915f/M_PI), Q30(0.0561044961f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0275881495f/M_PI), Q30(0.0427365713f/M_PI), Q30(0.0632618815f/M_PI), Q30(0.0736731067f/M_PI), Q30(0.0791663304f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0387469754f/M_PI), Q30(0.0600636788f/M_PI), Q30(0.0890387669f/M_PI), Q30(0.1037906483f/M_PI), Q30(0.1115923747f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0541138873f/M_PI), Q30(0.0839984417f/M_PI), Q30(0.1248718798f/M_PI), Q30(0.1458375156f/M_PI), Q30(0.1569785923f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0747506917f/M_PI), Q30(0.1163287833f/M_PI), Q30(0.1738867164f/M_PI), Q30(0.2038587779f/M_PI), Q30(0.2199459076f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1011129096f/M_PI), Q30(0.1580764502f/M_PI), Q30(0.2387557179f/M_PI), Q30(0.2820728719f/M_PI), Q30(0.3058380187f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1212290376f/M_PI), Q30(0.1903949380f/M_PI), Q30(0.2907958031f/M_PI), Q30(0.3466993868f/M_PI), Q30(0.3782821596f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1418247074f/M_PI), Q30(0.2240308374f/M_PI), Q30(0.3474813402f/M_PI), Q30(0.4202919006f/M_PI), Q30(0.4637607038f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1603866369f/M_PI), Q30(0.2549437582f/M_PI), Q30(0.4029446840f/M_PI), Q30(0.4980689585f/M_PI), Q30(0.5615641475f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1736015975f/M_PI), Q30(0.2773745656f/M_PI), Q30(0.4461984038f/M_PI), Q30(0.5666890144f/M_PI), Q30(0.6686112881f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1784276664f/M_PI), Q30(0.2856673002f/M_PI), Q30(0.4630723596f/M_PI), Q30(0.5971632004f/M_PI), Q30(0.7603877187f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1736015975f/M_PI), Q30(0.2773745656f/M_PI), Q30(0.4461984038f/M_PI), Q30(0.5666890144f/M_PI), Q30(0.6686112881f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1603866369f/M_PI), Q30(0.2549437582f/M_PI), Q30(0.4029446840f/M_PI), Q30(0.4980689585f/M_PI), Q30(0.5615641475f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1418247074f/M_PI), Q30(0.2240308374f/M_PI), Q30(0.3474813402f/M_PI), Q30(0.4202919006f/M_PI), Q30(0.4637607038f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1212290376f/M_PI), Q30(0.1903949380f/M_PI), Q30(0.2907958031f/M_PI), Q30(0.3466993868f/M_PI), Q30(0.3782821596f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.1011129096f/M_PI), Q30(0.1580764502f/M_PI), Q30(0.2387557179f/M_PI), Q30(0.2820728719f/M_PI), Q30(0.3058380187f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0747506917f/M_PI), Q30(0.1163287833f/M_PI), Q30(0.1738867164f/M_PI), Q30(0.2038587779f/M_PI), Q30(0.2199459076f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0541138873f/M_PI), Q30(0.0839984417f/M_PI), Q30(0.1248718798f/M_PI), Q30(0.1458375156f/M_PI), Q30(0.1569785923f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0387469754f/M_PI), Q30(0.0600636788f/M_PI), Q30(0.0890387669f/M_PI), Q30(0.1037906483f/M_PI), Q30(0.1115923747f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0275881495f/M_PI), Q30(0.0427365713f/M_PI), Q30(0.0632618815f/M_PI), Q30(0.0736731067f/M_PI), Q30(0.0791663304f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0195873566f/M_PI), Q30(0.0303316917f/M_PI), Q30(0.0448668823f/M_PI), Q30(0.0522258915f/M_PI), Q30(0.0561044961f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0110366223f/M_PI), Q30(0.0170863140f/M_PI), Q30(0.0252620988f/M_PI), Q30(0.0293955617f/M_PI), Q30(0.0315726399f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0062100487f/M_PI), Q30(0.0096135242f/M_PI), Q30(0.0142110568f/M_PI), Q30(0.0165348612f/M_PI), Q30(0.0177587029f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0034913034f/M_PI), Q30(0.0054070661f/M_PI), Q30(0.0079917293f/M_PI), Q30(0.0092999367f/M_PI), Q30(0.0099875759f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0019607407f/M_PI), Q30(0.0030395309f/M_PI), Q30(0.0044951206f/M_PI), Q30(0.0052305623f/M_PI), Q30(0.0056152637f/M_PI),
+      Q30(0.0000000000f/M_PI), Q30(0.0011053939f/M_PI), Q30(0.0017089852f/M_PI), Q30(0.0025254129f/M_PI), Q30(0.0029398468f/M_PI), Q30(0.0031597170f/M_PI)
+    };
+
+    static const int iid_par_dequant_c1[] = {
+        //iid_par_dequant_default
+        Q30(1.41198278375959f), Q30(1.40313815268360f), Q30(1.38687670404960f), Q30(1.34839972492648f),
+        Q30(1.29124937110028f), Q30(1.19603741667993f), Q30(1.10737240362323f), Q30(1),
+        Q30(0.87961716655242f), Q30(0.75464859232732f), Q30(0.57677990744575f), Q30(0.42640143271122f),
+        Q30(0.27671828230984f), Q30(0.17664462766713f), Q30(0.07940162697653f),
+        //iid_par_dequant_fine
+        Q30(1.41420649135832f), Q30(1.41419120222364f), Q30(1.41414285699784f), Q30(1.41399000859438f),
+        Q30(1.41350698548044f), Q30(1.41198278375959f), Q30(1.40977302262355f), Q30(1.40539479488545f),
+        Q30(1.39677960498402f), Q30(1.38005309967827f), Q30(1.34839972492648f), Q30(1.31392017367631f),
+        Q30(1.26431008149654f), Q30(1.19603741667993f), Q30(1.10737240362323f), Q30(1),
+        Q30(0.87961716655242f), Q30(0.75464859232732f), Q30(0.63365607219232f), Q30(0.52308104267543f),
+        Q30(0.42640143271122f), Q30(0.30895540465965f), Q30(0.22137464873077f), Q30(0.15768788954414f),
+        Q30(0.11198225164225f), Q30(0.07940162697653f), Q30(0.04469901562677f), Q30(0.02514469318284f),
+        Q30(0.01414142856998f), Q30(0.00795258154731f), Q30(0.00447211359449f),
+    };
+
+    static const int acos_icc_invq[] = {
+        Q31(0), Q31(0.178427635f/M_PI), Q31(0.28566733f/M_PI), Q31(0.46307236f/M_PI), Q31(0.59716315f/M_PI), Q31(0.78539816f/M_PI), Q31(1.10030855f/M_PI), Q31(1.57079633f/M_PI)
+    };
+    int iid, icc;
+
+    int k, m;
+    static const int8_t f_center_20[] = {
+        -3, -1, 1, 3, 5, 7, 10, 14, 18, 22,
+    };
+    static const int32_t f_center_34[] = {
+      Q31(  2/768.0),Q31(  6/768.0),Q31(10/768.0),Q31(14/768.0),Q31( 18/768.0),Q31( 22/768.0),Q31( 26/768.0),Q31(30/768.0),
+      Q31( 34/768.0),Q31(-10/768.0),Q31(-6/768.0),Q31(-2/768.0),Q31( 51/768.0),Q31( 57/768.0),Q31( 15/768.0),Q31(21/768.0),
+      Q31( 27/768.0),Q31( 33/768.0),Q31(39/768.0),Q31(45/768.0),Q31( 54/768.0),Q31( 66/768.0),Q31( 78/768.0),Q31(42/768.0),
+      Q31(102/768.0),Q31( 66/768.0),Q31(78/768.0),Q31(90/768.0),Q31(102/768.0),Q31(114/768.0),Q31(126/768.0),Q31(90/768.0)
+    };
+    static const int fractional_delay_links[] = { Q31(0.43f), Q31(0.75f), Q31(0.347f) };
+    const int fractional_delay_gain = Q31(0.39f);
+
+    for (pd0 = 0; pd0 < 8; pd0++) {
+        int pd0_re = (ipdopd_cos[pd0]+2)>>2;
+        int pd0_im = (ipdopd_sin[pd0]+2)>>2;
+        for (pd1 = 0; pd1 < 8; pd1++) {
+            int pd1_re = ipdopd_cos[pd1] >> 1;
+            int pd1_im = ipdopd_sin[pd1] >> 1;
+            for (pd2 = 0; pd2 < 8; pd2++) {
+                int shift, round;
+                int pd2_re = ipdopd_cos[pd2];
+                int pd2_im = ipdopd_sin[pd2];
+                int re_smooth = pd0_re + pd1_re + pd2_re;
+                int im_smooth = pd0_im + pd1_im + pd2_im;
+
+                SoftFloat pd_mag = av_int2sf(((ipdopd_cos[(pd0-pd1)&7]+8)>>4) + ((ipdopd_cos[(pd0-pd2)&7]+4)>>3) +
+                                               ((ipdopd_cos[(pd1-pd2)&7]+2)>>2) + 0x15000000, 28);
+                pd_mag = av_div_sf(FLOAT_1, av_sqrt_sf(pd_mag));
+                shift = 30 - pd_mag.exp;
+                round = 1 << (shift-1);
+                pd_re_smooth[pd0*64+pd1*8+pd2] = (int)(((int64_t)re_smooth * pd_mag.mant + round) >> shift);
+                pd_im_smooth[pd0*64+pd1*8+pd2] = (int)(((int64_t)im_smooth * pd_mag.mant + round) >> shift);
+            }
+        }
+    }
+
+    idx = 0;
+    for (iid = 0; iid < 46; iid++) {
+        int c1, c2;
+
+        c1 = iid_par_dequant_c1[iid];
+        if (iid < 15)
+          c2 = iid_par_dequant_c1[14-iid];
+        else
+          c2 = iid_par_dequant_c1[60-iid];
+
+        for (icc = 0; icc < 8; icc++) {
+            /*if (PS_BASELINE || ps->icc_mode < 3)*/{
+                int alpha, beta;
+                int ca, sa, cb, sb;
+
+                alpha = acos_icc_invq[icc];
+                beta = (int)(((int64_t)alpha * 1518500250 + 0x40000000) >> 31);
+                alpha >>= 1;
+                beta = (int)(((int64_t)beta * (c1 - c2) + 0x40000000) >> 31);
+                av_sincos_sf(beta + alpha, &sa, &ca);
+                av_sincos_sf(beta - alpha, &sb, &cb);
+
+                HA[iid][icc][0] = (int)(((int64_t)c2 * ca + 0x20000000) >> 30);
+                HA[iid][icc][1] = (int)(((int64_t)c1 * cb + 0x20000000) >> 30);
+                HA[iid][icc][2] = (int)(((int64_t)c2 * sa + 0x20000000) >> 30);
+                HA[iid][icc][3] = (int)(((int64_t)c1 * sb + 0x20000000) >> 30);
+            } /* else */ {
+                int alpha_int, gamma_int;
+                int alpha_c_int, alpha_s_int, gamma_c_int, gamma_s_int;
+
+                alpha_int = alpha_tab[idx];
+                gamma_int = gamma_tab[idx];
+
+                av_sincos_sf(alpha_int, &alpha_s_int, &alpha_c_int);
+                av_sincos_sf(gamma_int, &gamma_s_int, &gamma_c_int);
+
+                alpha_c_int = (int)(((int64_t)alpha_c_int * 1518500250 + 0x20000000) >> 30);
+                alpha_s_int = (int)(((int64_t)alpha_s_int * 1518500250 + 0x20000000) >> 30);
+
+                HB[iid][icc][0] = (int)(((int64_t)alpha_c_int * gamma_c_int + 0x20000000) >> 30);
+                HB[iid][icc][1] = (int)(((int64_t)alpha_s_int * gamma_c_int + 0x20000000) >> 30);
+                HB[iid][icc][2] = -(int)(((int64_t)alpha_s_int * gamma_s_int + 0x20000000) >> 30);
+                HB[iid][icc][3] = (int)(((int64_t)alpha_c_int * gamma_s_int + 0x20000000) >> 30);
+            }
+
+            if (icc < 5 || icc > 6)
+              idx++;
+        }
+    }
+
+    for (k = 0; k < NR_ALLPASS_BANDS20; k++) {
+        int theta;
+        int64_t f_center;
+        int c, s;
+
+        if (k < FF_ARRAY_ELEMS(f_center_20))
+          f_center = f_center_20[k];
+        else
+          f_center = (k << 3) - 52;
+
+        for (m = 0; m < PS_AP_LINKS; m++) {
+            theta = (int)(((int64_t)fractional_delay_links[m] * f_center + 8) >> 4);
+            av_sincos_sf(-theta, &s, &c);
+            Q_fract_allpass[0][k][m][0] = c;
+            Q_fract_allpass[0][k][m][1] = s;
+        }
+
+        theta = (int)(((int64_t)fractional_delay_gain * f_center + 8) >> 4);
+        av_sincos_sf(-theta, &s, &c);
+        phi_fract[0][k][0] = c;
+        phi_fract[0][k][1] = s;
+    }
+
+    for (k = 0; k < NR_ALLPASS_BANDS34; k++) {
+        int theta, f_center;
+        int c, s;
+
+        if (k < FF_ARRAY_ELEMS(f_center_34))
+            f_center = f_center_34[k];
+        else
+            f_center = ((int64_t)k << 26) - (53 << 25);
+
+        for (m = 0; m < PS_AP_LINKS; m++) {
+            theta = (int)(((int64_t)fractional_delay_links[m] * f_center + 0x10000000) >> 27);
+            av_sincos_sf(-theta, &s, &c);
+            Q_fract_allpass[1][k][m][0] = c;
+            Q_fract_allpass[1][k][m][1] = s;
+        }
+
+        theta = (int)(((int64_t)fractional_delay_gain * f_center + 0x10000000) >> 27);
+        av_sincos_sf(-theta, &s, &c);
+        phi_fract[1][k][0] = c;
+        phi_fract[1][k][1] = s;
+    }
+
+    make_filters_from_proto(f20_0_8,  g0_Q8,   8);
+    make_filters_from_proto(f34_0_12, g0_Q12, 12);
+    make_filters_from_proto(f34_1_8,  g1_Q8,   8);
+    make_filters_from_proto(f34_2_4,  g2_Q4,   4);
+}
+#endif /* CONFIG_HARDCODED_TABLES */
+
+#endif /* AVCODEC_AACPS_FIXED_TABLEGEN_H */
diff --git a/libavcodec/aacps_float.c b/libavcodec/aacps_float.c
new file mode 100644
index 0000000..73259c1
--- /dev/null
+++ b/libavcodec/aacps_float.c
@@ -0,0 +1,24 @@
+/*
+ * MPEG-4 Parametric Stereo decoding functions
+ * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define USE_FIXED 0
+
+#include "aacps.c"
diff --git a/libavcodec/aacps_tablegen.c b/libavcodec/aacps_tablegen.c
index 537b6ba..26a6752 100644
--- a/libavcodec/aacps_tablegen.c
+++ b/libavcodec/aacps_tablegen.c
@@ -3,91 +3,22 @@
  *
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <stdlib.h>
-#define CONFIG_HARDCODED_TABLES 0
-#include "aacps_tablegen.h"
-#include "tableprint.h"
-
-void write_float_3d_array (const void *p, int b, int c, int d)
-{
-    int i;
-    const float *f = p;
-    for (i = 0; i < b; i++) {
-        printf("{\n");
-        write_float_2d_array(f, c, d);
-        printf("},\n");
-        f += c * d;
-    }
-}
-
-void write_float_4d_array (const void *p, int a, int b, int c, int d)
-{
-    int i;
-    const float *f = p;
-    for (i = 0; i < a; i++) {
-        printf("{\n");
-        write_float_3d_array(f, b, c, d);
-        printf("},\n");
-        f += b * c * d;
-    }
-}
-
-int main(void)
-{
-    ps_tableinit();
-
-    write_fileheader();
-
-    printf("static const float pd_re_smooth[8*8*8] = {\n");
-    write_float_array(pd_re_smooth, 8*8*8);
-    printf("};\n");
-    printf("static const float pd_im_smooth[8*8*8] = {\n");
-    write_float_array(pd_im_smooth, 8*8*8);
-    printf("};\n");
-
-    printf("static const float HA[46][8][4] = {\n");
-    write_float_3d_array(HA, 46, 8, 4);
-    printf("};\n");
-    printf("static const float HB[46][8][4] = {\n");
-    write_float_3d_array(HB, 46, 8, 4);
-    printf("};\n");
-
-    printf("static const DECLARE_ALIGNED(16, float, f20_0_8)[8][8][2] = {\n");
-    write_float_3d_array(f20_0_8, 8, 8, 2);
-    printf("};\n");
-    printf("static const DECLARE_ALIGNED(16, float, f34_0_12)[12][8][2] = {\n");
-    write_float_3d_array(f34_0_12, 12, 8, 2);
-    printf("};\n");
-    printf("static const DECLARE_ALIGNED(16, float, f34_1_8)[8][8][2] = {\n");
-    write_float_3d_array(f34_1_8, 8, 8, 2);
-    printf("};\n");
-    printf("static const DECLARE_ALIGNED(16, float, f34_2_4)[4][8][2] = {\n");
-    write_float_3d_array(f34_2_4, 4, 8, 2);
-    printf("};\n");
-
-    printf("static TABLE_CONST DECLARE_ALIGNED(16, float, Q_fract_allpass)[2][50][3][2] = {\n");
-    write_float_4d_array(Q_fract_allpass, 2, 50, 3, 2);
-    printf("};\n");
-    printf("static const DECLARE_ALIGNED(16, float, phi_fract)[2][50][2] = {\n");
-    write_float_3d_array(phi_fract, 2, 50, 2);
-    printf("};\n");
-
-    return 0;
-}
+#define USE_FIXED 0
+#include "aacps_tablegen_template.c"
diff --git a/libavcodec/aacps_tablegen.h b/libavcodec/aacps_tablegen.h
index a53f9fa..0ac4f68 100644
--- a/libavcodec/aacps_tablegen.h
+++ b/libavcodec/aacps_tablegen.h
@@ -3,25 +3,25 @@
  *
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AACPS_TABLEGEN_H
-#define AACPS_TABLEGEN_H
+#ifndef AVCODEC_AACPS_TABLEGEN_H
+#define AVCODEC_AACPS_TABLEGEN_H
 
 #include <math.h>
 #include <stdint.h>
@@ -70,7 +70,7 @@ static const float g2_Q4[] = {
      0.16486303567403f,  0.23279856662996f, 0.25f
 };
 
-static void make_filters_from_proto(float (*filter)[8][2], const float *proto, int bands)
+static av_cold void make_filters_from_proto(float (*filter)[8][2], const float *proto, int bands)
 {
     int q, n;
     for (q = 0; q < bands; q++) {
@@ -82,7 +82,7 @@ static void make_filters_from_proto(float (*filter)[8][2], const float *proto, i
     }
 }
 
-static void ps_tableinit(void)
+static av_cold void ps_tableinit(void)
 {
     static const float ipdopd_sin[] = { 0, M_SQRT1_2, 1,  M_SQRT1_2,  0, -M_SQRT1_2, -1, -M_SQRT1_2 };
     static const float ipdopd_cos[] = { 1, M_SQRT1_2, 0, -M_SQRT1_2, -1, -M_SQRT1_2,  0,  M_SQRT1_2 };
@@ -136,7 +136,7 @@ static void ps_tableinit(void)
                 float pd2_im = ipdopd_sin[pd2];
                 float re_smooth = 0.25f * pd0_re + 0.5f * pd1_re + pd2_re;
                 float im_smooth = 0.25f * pd0_im + 0.5f * pd1_im + pd2_im;
-                float pd_mag = 1 / sqrt(im_smooth * im_smooth + re_smooth * re_smooth);
+                float pd_mag = 1 / hypot(im_smooth, re_smooth);
                 pd_re_smooth[pd0*64+pd1*8+pd2] = re_smooth * pd_mag;
                 pd_im_smooth[pd0*64+pd1*8+pd2] = im_smooth * pd_mag;
             }
@@ -214,4 +214,4 @@ static void ps_tableinit(void)
 }
 #endif /* CONFIG_HARDCODED_TABLES */
 
-#endif /* AACPS_TABLEGEN_H */
+#endif /* AVCODEC_AACPS_TABLEGEN_H */
diff --git a/libavcodec/aacps_tablegen_template.c b/libavcodec/aacps_tablegen_template.c
new file mode 100644
index 0000000..341bd44
--- /dev/null
+++ b/libavcodec/aacps_tablegen_template.c
@@ -0,0 +1,107 @@
+/*
+ * Generate a header file for hardcoded Parametric Stereo tables
+ *
+ * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdlib.h>
+#define CONFIG_HARDCODED_TABLES 0
+#include "aac_defines.h"
+
+#if USE_FIXED
+#define TYPE_NAME "int32_t"
+typedef int32_t INT32FLOAT;
+#define ARRAY_RENAME(x) write_int32_t_ ## x
+#define ARRAY_URENAME(x) write_uint32_t_ ## x
+#include "aacps_fixed_tablegen.h"
+#else
+#define TYPE_NAME "float"
+typedef float INT32FLOAT;
+#define ARRAY_RENAME(x) write_float_ ## x
+#define ARRAY_URENAME(x) write_float_ ## x
+#include "aacps_tablegen.h"
+#endif /* USE_FIXED */
+#include "tableprint.h"
+
+void ARRAY_RENAME(3d_array) (const void *p, int b, int c, int d)
+{
+    /* Print b brace-wrapped groups of c*d entries, read in order from table p. */
+    const INT32FLOAT *plane = p;
+    for (; b > 0; b--) {
+        printf("{\n");
+        ARRAY_URENAME(2d_array)(plane, c, d);
+        printf("},\n");
+        plane += c * d;
+    }
+}
+
+void ARRAY_RENAME(4d_array) (const void *p, int a, int b, int c, int d)
+{
+    /* Print a brace-wrapped groups of b*c*d entries via ARRAY_RENAME(3d_array). */
+    const INT32FLOAT *cube = p;
+    for (; a > 0; a--) {
+        printf("{\n");
+        ARRAY_RENAME(3d_array)(cube, b, c, d);
+        printf("},\n");
+        cube += b * c * d;
+    }
+}
+
+int main(void)
+{
+    static const char type_name[] = TYPE_NAME;
+    ps_tableinit();
+    write_fileheader();
+    /* Print each table as C source: declaration line, contents, then "};". */
+    printf("static const %s pd_re_smooth[8*8*8] = {\n", type_name);
+    ARRAY_RENAME(array)(pd_re_smooth, 8*8*8);
+    fputs("};\n", stdout);
+    printf("static const %s pd_im_smooth[8*8*8] = {\n", type_name);
+    ARRAY_RENAME(array)(pd_im_smooth, 8*8*8);
+    fputs("};\n", stdout);
+
+    printf("static const %s HA[46][8][4] = {\n", type_name);
+    ARRAY_RENAME(3d_array)(HA, 46, 8, 4);
+    fputs("};\n", stdout);
+    printf("static const %s HB[46][8][4] = {\n", type_name);
+    ARRAY_RENAME(3d_array)(HB, 46, 8, 4);
+    fputs("};\n", stdout);
+
+    printf("static const DECLARE_ALIGNED(16, %s, f20_0_8)[8][8][2] = {\n", type_name);
+    ARRAY_RENAME(3d_array)(f20_0_8, 8, 8, 2);
+    fputs("};\n", stdout);
+    printf("static const DECLARE_ALIGNED(16, %s, f34_0_12)[12][8][2] = {\n", type_name);
+    ARRAY_RENAME(3d_array)(f34_0_12, 12, 8, 2);
+    fputs("};\n", stdout);
+    printf("static const DECLARE_ALIGNED(16, %s, f34_1_8)[8][8][2] = {\n", type_name);
+    ARRAY_RENAME(3d_array)(f34_1_8, 8, 8, 2);
+    fputs("};\n", stdout);
+    printf("static const DECLARE_ALIGNED(16, %s, f34_2_4)[4][8][2] = {\n", type_name);
+    ARRAY_RENAME(3d_array)(f34_2_4, 4, 8, 2);
+    fputs("};\n", stdout);
+
+    printf("static const DECLARE_ALIGNED(16, %s, Q_fract_allpass)[2][50][3][2] = {\n", type_name);
+    ARRAY_RENAME(4d_array)(Q_fract_allpass, 2, 50, 3, 2);
+    fputs("};\n", stdout);
+    printf("static const DECLARE_ALIGNED(16, %s, phi_fract)[2][50][2] = {\n", type_name);
+    ARRAY_RENAME(3d_array)(phi_fract, 2, 50, 2);
+    fputs("};\n", stdout);
+
+    return 0;
+}
diff --git a/libavcodec/aacpsdata.c b/libavcodec/aacpsdata.c
index 675bd8e..5c1a1b0 100644
--- a/libavcodec/aacpsdata.c
+++ b/libavcodec/aacpsdata.c
@@ -2,20 +2,20 @@
  * MPEG-4 Parametric Stereo data tables
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -157,7 +157,7 @@ static const int8_t k_to_i_34[] = {
     33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33
 };
 
-static const float g1_Q2[] = {
-    0.0f,  0.01899487526049f, 0.0f, -0.07293139167538f,
-    0.0f,  0.30596630545168f, 0.5f
+static const INTFLOAT g1_Q2[] = {
+    Q31(0.0f),  Q31(0.01899487526049f), Q31(0.0f), Q31(-0.07293139167538f),
+    Q31(0.0f),  Q31(0.30596630545168f), Q31(0.5f)
 };
diff --git a/libavcodec/aacpsdsp.c b/libavcodec/aacpsdsp.c
deleted file mode 100644
index 88e731f..0000000
--- a/libavcodec/aacpsdsp.c
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/attributes.h"
-#include "aacpsdsp.h"
-
-static void ps_add_squares_c(float *dst, const float (*src)[2], int n)
-{
-    int i;
-    for (i = 0; i < n; i++)
-        dst[i] += src[i][0] * src[i][0] + src[i][1] * src[i][1];
-}
-
-static void ps_mul_pair_single_c(float (*dst)[2], float (*src0)[2], float *src1,
-                                 int n)
-{
-    int i;
-    for (i = 0; i < n; i++) {
-        dst[i][0] = src0[i][0] * src1[i];
-        dst[i][1] = src0[i][1] * src1[i];
-    }
-}
-
-static void ps_hybrid_analysis_c(float (*out)[2], float (*in)[2],
-                                 const float (*filter)[8][2],
-                                 int stride, int n)
-{
-    int i, j;
-
-    for (i = 0; i < n; i++) {
-        float sum_re = filter[i][6][0] * in[6][0];
-        float sum_im = filter[i][6][0] * in[6][1];
-
-        for (j = 0; j < 6; j++) {
-            float in0_re = in[j][0];
-            float in0_im = in[j][1];
-            float in1_re = in[12-j][0];
-            float in1_im = in[12-j][1];
-            sum_re += filter[i][j][0] * (in0_re + in1_re) -
-                      filter[i][j][1] * (in0_im - in1_im);
-            sum_im += filter[i][j][0] * (in0_im + in1_im) +
-                      filter[i][j][1] * (in0_re - in1_re);
-        }
-        out[i * stride][0] = sum_re;
-        out[i * stride][1] = sum_im;
-    }
-}
-
-static void ps_hybrid_analysis_ileave_c(float (*out)[32][2], float L[2][38][64],
-                                        int i, int len)
-{
-    int j;
-
-    for (; i < 64; i++) {
-        for (j = 0; j < len; j++) {
-            out[i][j][0] = L[0][j][i];
-            out[i][j][1] = L[1][j][i];
-        }
-    }
-}
-
-static void ps_hybrid_synthesis_deint_c(float out[2][38][64],
-                                        float (*in)[32][2],
-                                        int i, int len)
-{
-    int n;
-
-    for (; i < 64; i++) {
-        for (n = 0; n < len; n++) {
-            out[0][n][i] = in[i][n][0];
-            out[1][n][i] = in[i][n][1];
-        }
-    }
-}
-
-static void ps_decorrelate_c(float (*out)[2], float (*delay)[2],
-                             float (*ap_delay)[PS_QMF_TIME_SLOTS + PS_MAX_AP_DELAY][2],
-                             const float phi_fract[2], const float (*Q_fract)[2],
-                             const float *transient_gain,
-                             float g_decay_slope,
-                             int len)
-{
-    static const float a[] = { 0.65143905753106f,
-                               0.56471812200776f,
-                               0.48954165955695f };
-    float ag[PS_AP_LINKS];
-    int m, n;
-
-    for (m = 0; m < PS_AP_LINKS; m++)
-        ag[m] = a[m] * g_decay_slope;
-
-    for (n = 0; n < len; n++) {
-        float in_re = delay[n][0] * phi_fract[0] - delay[n][1] * phi_fract[1];
-        float in_im = delay[n][0] * phi_fract[1] + delay[n][1] * phi_fract[0];
-        for (m = 0; m < PS_AP_LINKS; m++) {
-            float a_re                = ag[m] * in_re;
-            float a_im                = ag[m] * in_im;
-            float link_delay_re       = ap_delay[m][n+2-m][0];
-            float link_delay_im       = ap_delay[m][n+2-m][1];
-            float fractional_delay_re = Q_fract[m][0];
-            float fractional_delay_im = Q_fract[m][1];
-            float apd_re = in_re;
-            float apd_im = in_im;
-            in_re = link_delay_re * fractional_delay_re -
-                    link_delay_im * fractional_delay_im - a_re;
-            in_im = link_delay_re * fractional_delay_im +
-                    link_delay_im * fractional_delay_re - a_im;
-            ap_delay[m][n+5][0] = apd_re + ag[m] * in_re;
-            ap_delay[m][n+5][1] = apd_im + ag[m] * in_im;
-        }
-        out[n][0] = transient_gain[n] * in_re;
-        out[n][1] = transient_gain[n] * in_im;
-    }
-}
-
-static void ps_stereo_interpolate_c(float (*l)[2], float (*r)[2],
-                                    float h[2][4], float h_step[2][4],
-                                    int len)
-{
-    float h0 = h[0][0];
-    float h1 = h[0][1];
-    float h2 = h[0][2];
-    float h3 = h[0][3];
-    float hs0 = h_step[0][0];
-    float hs1 = h_step[0][1];
-    float hs2 = h_step[0][2];
-    float hs3 = h_step[0][3];
-    int n;
-
-    for (n = 0; n < len; n++) {
-        //l is s, r is d
-        float l_re = l[n][0];
-        float l_im = l[n][1];
-        float r_re = r[n][0];
-        float r_im = r[n][1];
-        h0 += hs0;
-        h1 += hs1;
-        h2 += hs2;
-        h3 += hs3;
-        l[n][0] = h0 * l_re + h2 * r_re;
-        l[n][1] = h0 * l_im + h2 * r_im;
-        r[n][0] = h1 * l_re + h3 * r_re;
-        r[n][1] = h1 * l_im + h3 * r_im;
-    }
-}
-
-static void ps_stereo_interpolate_ipdopd_c(float (*l)[2], float (*r)[2],
-                                           float h[2][4], float h_step[2][4],
-                                           int len)
-{
-    float h00  = h[0][0],      h10  = h[1][0];
-    float h01  = h[0][1],      h11  = h[1][1];
-    float h02  = h[0][2],      h12  = h[1][2];
-    float h03  = h[0][3],      h13  = h[1][3];
-    float hs00 = h_step[0][0], hs10 = h_step[1][0];
-    float hs01 = h_step[0][1], hs11 = h_step[1][1];
-    float hs02 = h_step[0][2], hs12 = h_step[1][2];
-    float hs03 = h_step[0][3], hs13 = h_step[1][3];
-    int n;
-
-    for (n = 0; n < len; n++) {
-        //l is s, r is d
-        float l_re = l[n][0];
-        float l_im = l[n][1];
-        float r_re = r[n][0];
-        float r_im = r[n][1];
-        h00 += hs00;
-        h01 += hs01;
-        h02 += hs02;
-        h03 += hs03;
-        h10 += hs10;
-        h11 += hs11;
-        h12 += hs12;
-        h13 += hs13;
-
-        l[n][0] = h00 * l_re + h02 * r_re - h10 * l_im - h12 * r_im;
-        l[n][1] = h00 * l_im + h02 * r_im + h10 * l_re + h12 * r_re;
-        r[n][0] = h01 * l_re + h03 * r_re - h11 * l_im - h13 * r_im;
-        r[n][1] = h01 * l_im + h03 * r_im + h11 * l_re + h13 * r_re;
-    }
-}
-
-av_cold void ff_psdsp_init(PSDSPContext *s)
-{
-    s->add_squares            = ps_add_squares_c;
-    s->mul_pair_single        = ps_mul_pair_single_c;
-    s->hybrid_analysis        = ps_hybrid_analysis_c;
-    s->hybrid_analysis_ileave = ps_hybrid_analysis_ileave_c;
-    s->hybrid_synthesis_deint = ps_hybrid_synthesis_deint_c;
-    s->decorrelate            = ps_decorrelate_c;
-    s->stereo_interpolate[0]  = ps_stereo_interpolate_c;
-    s->stereo_interpolate[1]  = ps_stereo_interpolate_ipdopd_c;
-
-    if (ARCH_ARM)
-        ff_psdsp_init_arm(s);
-}
diff --git a/libavcodec/aacpsdsp.h b/libavcodec/aacpsdsp.h
index dc380b1..ad9bbb8 100644
--- a/libavcodec/aacpsdsp.h
+++ b/libavcodec/aacpsdsp.h
@@ -1,53 +1,57 @@
 /*
  * Copyright (c) 2012 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef LIBAVCODEC_AACPSDSP_H
-#define LIBAVCODEC_AACPSDSP_H
+#ifndef AVCODEC_AACPSDSP_H
+#define AVCODEC_AACPSDSP_H
+
+#include "aac_defines.h"
 
 #define PS_QMF_TIME_SLOTS 32
 #define PS_AP_LINKS 3
 #define PS_MAX_AP_DELAY 5
 
 typedef struct PSDSPContext {
-    void (*add_squares)(float *dst, const float (*src)[2], int n);
-    void (*mul_pair_single)(float (*dst)[2], float (*src0)[2], float *src1,
+    void (*add_squares)(INTFLOAT *dst, const INTFLOAT (*src)[2], int n);
+    void (*mul_pair_single)(INTFLOAT (*dst)[2], INTFLOAT (*src0)[2], INTFLOAT *src1,
                             int n);
-    void (*hybrid_analysis)(float (*out)[2], float (*in)[2],
-                            const float (*filter)[8][2],
+    void (*hybrid_analysis)(INTFLOAT (*out)[2], INTFLOAT (*in)[2],
+                            const INTFLOAT (*filter)[8][2],
                             int stride, int n);
-    void (*hybrid_analysis_ileave)(float (*out)[32][2], float L[2][38][64],
+    void (*hybrid_analysis_ileave)(INTFLOAT (*out)[32][2], INTFLOAT L[2][38][64],
                                    int i, int len);
-    void (*hybrid_synthesis_deint)(float out[2][38][64], float (*in)[32][2],
+    void (*hybrid_synthesis_deint)(INTFLOAT out[2][38][64], INTFLOAT (*in)[32][2],
                                    int i, int len);
-    void (*decorrelate)(float (*out)[2], float (*delay)[2],
-                        float (*ap_delay)[PS_QMF_TIME_SLOTS+PS_MAX_AP_DELAY][2],
-                        const float phi_fract[2], const float (*Q_fract)[2],
-                        const float *transient_gain,
-                        float g_decay_slope,
+    void (*decorrelate)(INTFLOAT (*out)[2], INTFLOAT (*delay)[2],
+                        INTFLOAT (*ap_delay)[PS_QMF_TIME_SLOTS+PS_MAX_AP_DELAY][2],
+                        const INTFLOAT phi_fract[2], const INTFLOAT (*Q_fract)[2],
+                        const INTFLOAT *transient_gain,
+                        INTFLOAT g_decay_slope,
                         int len);
-    void (*stereo_interpolate[2])(float (*l)[2], float (*r)[2],
-                                  float h[2][4], float h_step[2][4],
+    void (*stereo_interpolate[2])(INTFLOAT (*l)[2], INTFLOAT (*r)[2],
+                                  INTFLOAT h[2][4], INTFLOAT h_step[2][4],
                                   int len);
 } PSDSPContext;
 
-void ff_psdsp_init(PSDSPContext *s);
+void AAC_RENAME(ff_psdsp_init)(PSDSPContext *s);
 void ff_psdsp_init_arm(PSDSPContext *s);
+void ff_psdsp_init_mips(PSDSPContext *s);
+void ff_psdsp_init_x86(PSDSPContext *s);
 
-#endif /* LIBAVCODEC_AACPSDSP_H */
+#endif /* AVCODEC_AACPSDSP_H */
diff --git a/libavcodec/aacpsdsp_fixed.c b/libavcodec/aacpsdsp_fixed.c
new file mode 100644
index 0000000..2413295
--- /dev/null
+++ b/libavcodec/aacpsdsp_fixed.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define USE_FIXED 1
+
+#include "aacpsdsp_template.c"
diff --git a/libavcodec/aacpsdsp_float.c b/libavcodec/aacpsdsp_float.c
new file mode 100644
index 0000000..99aa650
--- /dev/null
+++ b/libavcodec/aacpsdsp_float.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define USE_FIXED 0
+
+#include "aacpsdsp_template.c"
diff --git a/libavcodec/aacpsdsp_template.c b/libavcodec/aacpsdsp_template.c
new file mode 100644
index 0000000..3049ce8
--- /dev/null
+++ b/libavcodec/aacpsdsp_template.c
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Note: Rounding-to-nearest used unless otherwise stated
+ *
+ */
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "aacpsdsp.h"
+
+static void ps_add_squares_c(INTFLOAT *dst, const INTFLOAT (*src)[2], int n)
+{ /* Accumulates into dst[i], for each of the n two-element entries of src, the AAC_MADD28 of the entry's components with themselves. */
+    int i;
+    for (i = 0; i < n; i++)
+        dst[i] += AAC_MADD28(src[i][0], src[i][0], src[i][1], src[i][1]); /* presumably re*re + im*im; macro is defined outside this file - confirm */
+}
+
+static void ps_mul_pair_single_c(INTFLOAT (*dst)[2], INTFLOAT (*src0)[2], INTFLOAT *src1,
+                                 int n)
+{ /* Multiplies both components of each of the n pairs in src0 by the matching scalar src1[i] and stores the result in dst. */
+    int i;
+    for (i = 0; i < n; i++) {
+        dst[i][0] = AAC_MUL16(src0[i][0], src1[i]); /* first component times scalar; AAC_MUL16 is defined elsewhere */
+        dst[i][1] = AAC_MUL16(src0[i][1], src1[i]); /* second component uses the same scalar */
+    }
+}
+
+static void ps_hybrid_analysis_c(INTFLOAT (*out)[2], INTFLOAT (*in)[2],
+                                 const INTFLOAT (*filter)[8][2],
+                                 int stride, int n)
+{ /* Applies n complex filters to the 13 complex samples in[0..12]; result i is written to out[i * stride]. */
+    int i, j;
+
+    for (i = 0; i < n; i++) {
+        INT64FLOAT sum_re = (INT64FLOAT)filter[i][6][0] * in[6][0]; /* centre tap: only component [0] of filter[i][6] is used */
+        INT64FLOAT sum_im = (INT64FLOAT)filter[i][6][0] * in[6][1];
+
+        for (j = 0; j < 6; j++) { /* taps 0..5, each paired with its mirror sample in[12-j] */
+            INTFLOAT in0_re = in[j][0];
+            INTFLOAT in0_im = in[j][1];
+            INTFLOAT in1_re = in[12-j][0];
+            INTFLOAT in1_im = in[12-j][1];
+            sum_re += (INT64FLOAT)filter[i][j][0] * (in0_re + in1_re) - /* [0] coefficient weights the sum of the mirrored pair */
+                      (INT64FLOAT)filter[i][j][1] * (in0_im - in1_im);  /* [1] coefficient weights the difference of the pair */
+            sum_im += (INT64FLOAT)filter[i][j][0] * (in0_im + in1_im) +
+                      (INT64FLOAT)filter[i][j][1] * (in0_re - in1_re);
+        }
+#if USE_FIXED
+        out[i * stride][0] = (int)((sum_re + 0x40000000) >> 31); /* add half of 2^31 before the shift: round to nearest */
+        out[i * stride][1] = (int)((sum_im + 0x40000000) >> 31);
+#else
+        out[i * stride][0] = sum_re; /* non-fixed build stores the accumulator without rescaling */
+        out[i * stride][1] = sum_im;
+#endif /* USE_FIXED */
+    }
+}
+static void ps_hybrid_analysis_ileave_c(INTFLOAT (*out)[32][2], INTFLOAT L[2][38][64],
+                                      int i, int len)
+{ /* Copies the two planes L[0] and L[1] into interleaved pairs out[i][j][0..1], transposing the j and i indices. */
+    int j;
+
+    for (; i < 64; i++) { /* starts at the caller-supplied index i and runs to the last of the 64 columns */
+        for (j = 0; j < len; j++) {
+            out[i][j][0] = L[0][j][i]; /* plane 0 -> component 0 (presumably the real part - confirm) */
+            out[i][j][1] = L[1][j][i]; /* plane 1 -> component 1 (presumably the imaginary part - confirm) */
+        }
+    }
+}
+
+static void ps_hybrid_synthesis_deint_c(INTFLOAT out[2][38][64],
+                                      INTFLOAT (*in)[32][2],
+                                      int i, int len)
+{ /* Splits interleaved pairs in[i][n][0..1] into the two planes out[0] and out[1], transposing the i and n indices. */
+    int n;
+
+    for (; i < 64; i++) { /* starts at the caller-supplied index i and runs to the last of the 64 columns */
+        for (n = 0; n < len; n++) {
+            out[0][n][i] = in[i][n][0]; /* component 0 -> plane 0 */
+            out[1][n][i] = in[i][n][1]; /* component 1 -> plane 1 */
+        }
+    }
+}
+
+static void ps_decorrelate_c(INTFLOAT (*out)[2], INTFLOAT (*delay)[2],
+                             INTFLOAT (*ap_delay)[PS_QMF_TIME_SLOTS + PS_MAX_AP_DELAY][2],
+                             const INTFLOAT phi_fract[2], const INTFLOAT (*Q_fract)[2],
+                             const INTFLOAT *transient_gain,
+                             INTFLOAT g_decay_slope,
+                             int len)
+{ /* Runs len samples of delay[] through PS_AP_LINKS chained links that update ap_delay[], then scales by transient_gain[] into out[]. */
+    static const INTFLOAT a[] = { Q31(0.65143905753106f), /* per-link base coefficients (3 entries; assumes PS_AP_LINKS <= 3 - confirm) */
+                               Q31(0.56471812200776f),
+                               Q31(0.48954165955695f) };
+    INTFLOAT ag[PS_AP_LINKS];
+    int m, n;
+
+    for (m = 0; m < PS_AP_LINKS; m++)
+        ag[m] = AAC_MUL30(a[m], g_decay_slope); /* link coefficient scaled by the caller's decay slope */
+
+    for (n = 0; n < len; n++) {
+        INTFLOAT in_re = AAC_MSUB30(delay[n][0], phi_fract[0], delay[n][1], phi_fract[1]); /* presumably complex product delay[n] * phi_fract (MSUB = a*b - c*d) - confirm macro */
+        INTFLOAT in_im = AAC_MADD30(delay[n][0], phi_fract[1], delay[n][1], phi_fract[0]);
+        for (m = 0; m < PS_AP_LINKS; m++) { /* the output of link m becomes the input of link m+1 */
+            INTFLOAT a_re                = AAC_MUL31(ag[m], in_re);
+            INTFLOAT a_im                = AAC_MUL31(ag[m], in_im);
+            INTFLOAT link_delay_re       = ap_delay[m][n+2-m][0]; /* history read index is n+2-m, i.e. it moves back one slot per link */
+            INTFLOAT link_delay_im       = ap_delay[m][n+2-m][1];
+            INTFLOAT fractional_delay_re = Q_fract[m][0];
+            INTFLOAT fractional_delay_im = Q_fract[m][1];
+            INTFLOAT apd_re = in_re; /* keep the link input; it is needed for the state update below */
+            INTFLOAT apd_im = in_im;
+            in_re = AAC_MSUB30(link_delay_re, fractional_delay_re, /* delayed sample combined with Q_fract[m] (same product form as above) */
+                    link_delay_im, fractional_delay_im);
+            in_re -= a_re; /* subtract the scaled link input */
+            in_im = AAC_MADD30(link_delay_re, fractional_delay_im,
+                    link_delay_im, fractional_delay_re);
+            in_im -= a_im;
+            ap_delay[m][n+5][0] = apd_re + AAC_MUL31(ag[m], in_re); /* new state is stored at index n+5: link input plus scaled link output */
+            ap_delay[m][n+5][1] = apd_im + AAC_MUL31(ag[m], in_im);
+        }
+        out[n][0] = AAC_MUL16(transient_gain[n], in_re); /* output of the last link, scaled by the per-sample gain */
+        out[n][1] = AAC_MUL16(transient_gain[n], in_im);
+    }
+}
+
+static void ps_stereo_interpolate_c(INTFLOAT (*l)[2], INTFLOAT (*r)[2],
+                                    INTFLOAT h[2][4], INTFLOAT h_step[2][4],
+                                    int len)
+{ /* Mixes l and r in place over len samples with four coefficients that advance by h_step each sample; only row 0 of h and h_step is read. */
+    INTFLOAT h0 = h[0][0];
+    INTFLOAT h1 = h[0][1];
+    INTFLOAT h2 = h[0][2];
+    INTFLOAT h3 = h[0][3];
+    INTFLOAT hs0 = h_step[0][0];
+    INTFLOAT hs1 = h_step[0][1];
+    INTFLOAT hs2 = h_step[0][2];
+    INTFLOAT hs3 = h_step[0][3];
+    int n;
+
+    for (n = 0; n < len; n++) {
+        //l is s, r is d
+        INTFLOAT l_re = l[n][0]; /* snapshot the inputs first: l[n] and r[n] are overwritten below */
+        INTFLOAT l_im = l[n][1];
+        INTFLOAT r_re = r[n][0];
+        INTFLOAT r_im = r[n][1];
+        h0 += hs0; /* coefficients are stepped before use, so sample 0 already sees h + h_step */
+        h1 += hs1;
+        h2 += hs2;
+        h3 += hs3;
+        l[n][0] = AAC_MADD30(h0,  l_re,  h2, r_re); /* new l uses h0 and h2 (presumably h0*l + h2*r - confirm macro) */
+        l[n][1] = AAC_MADD30(h0,  l_im,  h2,  r_im);
+        r[n][0] = AAC_MADD30(h1,  l_re,  h3,  r_re); /* new r uses h1 and h3 with the same inputs */
+        r[n][1] = AAC_MADD30(h1,  l_im,  h3,  r_im);
+    }
+}
+
+static void ps_stereo_interpolate_ipdopd_c(INTFLOAT (*l)[2], INTFLOAT (*r)[2],
+                                           INTFLOAT h[2][4], INTFLOAT h_step[2][4],
+                                           int len)
+{ /* Like the row-0-only variant, but reads both rows of h and h_step: eight stepped coefficients mix l and r in place. */
+    INTFLOAT h00  = h[0][0],      h10  = h[1][0]; /* hXY: X is the row of h, Y is the column */
+    INTFLOAT h01  = h[0][1],      h11  = h[1][1];
+    INTFLOAT h02  = h[0][2],      h12  = h[1][2];
+    INTFLOAT h03  = h[0][3],      h13  = h[1][3];
+    INTFLOAT hs00 = h_step[0][0], hs10 = h_step[1][0]; /* per-sample increments with the same row/column naming */
+    INTFLOAT hs01 = h_step[0][1], hs11 = h_step[1][1];
+    INTFLOAT hs02 = h_step[0][2], hs12 = h_step[1][2];
+    INTFLOAT hs03 = h_step[0][3], hs13 = h_step[1][3];
+    int n;
+
+    for (n = 0; n < len; n++) {
+        //l is s, r is d
+        INTFLOAT l_re = l[n][0]; /* snapshot the inputs first: l[n] and r[n] are overwritten below */
+        INTFLOAT l_im = l[n][1];
+        INTFLOAT r_re = r[n][0];
+        INTFLOAT r_im = r[n][1];
+        h00 += hs00; /* all eight coefficients are stepped before use */
+        h01 += hs01;
+        h02 += hs02;
+        h03 += hs03;
+        h10 += hs10;
+        h11 += hs11;
+        h12 += hs12;
+        h13 += hs13;
+
+        l[n][0] = AAC_MSUB30_V8(h00, l_re, h02, r_re, h10, l_im, h12, r_im); /* row-0 terms use same-part inputs, row-1 terms use the opposite part; macro semantics defined elsewhere */
+        l[n][1] = AAC_MADD30_V8(h00, l_im, h02, r_im, h10, l_re, h12, r_re);
+        r[n][0] = AAC_MSUB30_V8(h01, l_re, h03, r_re, h11, l_im, h13, r_im); /* r uses columns 1 and 3 where l uses columns 0 and 2 */
+        r[n][1] = AAC_MADD30_V8(h01, l_im, h03, r_im, h11, l_re, h13, r_re);
+    }
+}
+
+av_cold void AAC_RENAME(ff_psdsp_init)(PSDSPContext *s)
+{ /* Fills the function-pointer table *s with the C implementations from this file, then lets arch-specific init run in non-fixed builds. */
+    s->add_squares            = ps_add_squares_c;
+    s->mul_pair_single        = ps_mul_pair_single_c;
+    s->hybrid_analysis        = ps_hybrid_analysis_c;
+    s->hybrid_analysis_ileave = ps_hybrid_analysis_ileave_c;
+    s->hybrid_synthesis_deint = ps_hybrid_synthesis_deint_c;
+    s->decorrelate            = ps_decorrelate_c;
+    s->stereo_interpolate[0]  = ps_stereo_interpolate_c;        /* slot 0: variant that reads only row 0 of h */
+    s->stereo_interpolate[1]  = ps_stereo_interpolate_ipdopd_c; /* slot 1: variant that reads both rows of h */
+
+#if !USE_FIXED
+    if (ARCH_ARM) /* arch hooks are only called when USE_FIXED is 0; presumably they replace some of the pointers set above */
+        ff_psdsp_init_arm(s);
+    if (ARCH_MIPS)
+        ff_psdsp_init_mips(s);
+    if (ARCH_X86)
+        ff_psdsp_init_x86(s);
+#endif /* !USE_FIXED */
+}
diff --git a/libavcodec/aacpsy.c b/libavcodec/aacpsy.c
index 6cfae6b..a5fec73 100644
--- a/libavcodec/aacpsy.c
+++ b/libavcodec/aacpsy.c
@@ -2,20 +2,20 @@
  * AAC encoder psychoacoustic model
  * Copyright (C) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,8 @@
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/ffmath.h"
+
 #include "avcodec.h"
 #include "aactab.h"
 #include "psymodel.h"
@@ -78,6 +80,8 @@
 #define PSY_3GPP_AH_THR_LONG    0.5f
 #define PSY_3GPP_AH_THR_SHORT   0.63f
 
+#define PSY_PE_FORGET_SLOPE  511
+
 enum {
     PSY_3GPP_AH_NONE,
     PSY_3GPP_AH_INACTIVE,
@@ -85,6 +89,7 @@ enum {
 };
 
 #define PSY_3GPP_BITS_TO_PE(bits) ((bits) * 1.18f)
+#define PSY_3GPP_PE_TO_BITS(bits) ((bits) / 1.18f)
 
 /* LAME psy model constants */
 #define PSY_LAME_FIR_LEN 21         ///< LAME psy model FIR order
@@ -155,6 +160,7 @@ typedef struct AacPsyContext{
     } pe;
     AacPsyCoeffs psy_coef[2][64];
     AacPsyChannel *ch;
+    float global_quality; ///< normalized global quality taken from avctx
 }AacPsyContext;
 
 /**
@@ -216,6 +222,10 @@ static const float psy_fir_coeffs[] = {
     -5.52212e-17 * 2, -0.313819 * 2
 };
 
+#if ARCH_MIPS
+#   include "mips/aacpsy_mips.h"
+#endif /* ARCH_MIPS */
+
 /**
  * Calculate the ABR attack threshold from the above LAME psymodel table.
  */
@@ -293,17 +303,24 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
     float bark;
     int i, j, g, start;
     float prev, minscale, minath, minsnr, pe_min;
-    const int chan_bitrate = ctx->avctx->bit_rate / ctx->avctx->channels;
-    const int bandwidth    = ctx->avctx->cutoff ? ctx->avctx->cutoff : ctx->avctx->sample_rate / 2;
+    int chan_bitrate = ctx->avctx->bit_rate / ((ctx->avctx->flags & CODEC_FLAG_QSCALE) ? 2.0f : ctx->avctx->channels);
+
+    const int bandwidth    = ctx->cutoff ? ctx->cutoff : AAC_CUTOFF(ctx->avctx);
     const float num_bark   = calc_bark((float)bandwidth);
 
     ctx->model_priv_data = av_mallocz(sizeof(AacPsyContext));
     if (!ctx->model_priv_data)
         return AVERROR(ENOMEM);
     pctx = (AacPsyContext*) ctx->model_priv_data;
+    pctx->global_quality = (ctx->avctx->global_quality ? ctx->avctx->global_quality : 120) * 0.01f;
+
+    if (ctx->avctx->flags & CODEC_FLAG_QSCALE) {
+        /* Use the target average bitrate to compute spread parameters */
+        chan_bitrate = (int)(chan_bitrate / 120.0 * (ctx->avctx->global_quality ? ctx->avctx->global_quality : 120));
+    }
 
     pctx->chan_bitrate = chan_bitrate;
-    pctx->frame_bits   = chan_bitrate * AAC_BLOCK_SIZE_LONG / ctx->avctx->sample_rate;
+    pctx->frame_bits   = FFMIN(2560, chan_bitrate * AAC_BLOCK_SIZE_LONG / ctx->avctx->sample_rate);
     pctx->pe.min       =  8.0f * AAC_BLOCK_SIZE_LONG * bandwidth / (ctx->avctx->sample_rate * 2.0f);
     pctx->pe.max       = 12.0f * AAC_BLOCK_SIZE_LONG * bandwidth / (ctx->avctx->sample_rate * 2.0f);
     ctx->bitres.size   = 6144 - pctx->frame_bits;
@@ -332,12 +349,12 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
         for (g = 0; g < ctx->num_bands[j] - 1; g++) {
             AacPsyCoeffs *coeff = &coeffs[g];
             float bark_width = coeffs[g+1].barks - coeffs->barks;
-            coeff->spread_low[0] = pow(10.0, -bark_width * PSY_3GPP_THR_SPREAD_LOW);
-            coeff->spread_hi [0] = pow(10.0, -bark_width * PSY_3GPP_THR_SPREAD_HI);
-            coeff->spread_low[1] = pow(10.0, -bark_width * en_spread_low);
-            coeff->spread_hi [1] = pow(10.0, -bark_width * en_spread_hi);
+            coeff->spread_low[0] = ff_exp10(-bark_width * PSY_3GPP_THR_SPREAD_LOW);
+            coeff->spread_hi [0] = ff_exp10(-bark_width * PSY_3GPP_THR_SPREAD_HI);
+            coeff->spread_low[1] = ff_exp10(-bark_width * en_spread_low);
+            coeff->spread_hi [1] = ff_exp10(-bark_width * en_spread_hi);
             pe_min = bark_pe * bark_width;
-            minsnr = pow(2.0f, pe_min / band_sizes[g]) - 1.5f;
+            minsnr = exp2(pe_min / band_sizes[g]) - 1.5f;
             coeff->min_snr = av_clipf(1.0f / minsnr, PSY_SNR_25DB, PSY_SNR_1DB);
         }
         start = 0;
@@ -350,9 +367,9 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
         }
     }
 
-    pctx->ch = av_mallocz(sizeof(AacPsyChannel) * ctx->avctx->channels);
+    pctx->ch = av_mallocz_array(ctx->avctx->channels, sizeof(AacPsyChannel));
     if (!pctx->ch) {
-        av_freep(&pctx);
+        av_freep(&ctx->model_priv_data);
         return AVERROR(ENOMEM);
     }
 
@@ -391,7 +408,7 @@ static av_unused FFPsyWindowInfo psy_3gpp_window(FFPsyContext *ctx,
                                                  int channel, int prev_type)
 {
     int i, j;
-    int br               = ctx->avctx->bit_rate / ctx->avctx->channels;
+    int br               = ((AacPsyContext*)ctx->model_priv_data)->chan_bitrate;
     int attack_ratio     = br <= 16000 ? 18 : 10;
     AacPsyContext *pctx = (AacPsyContext*) ctx->model_priv_data;
     AacPsyChannel *pch  = &pctx->ch[channel];
@@ -480,7 +497,7 @@ static int calc_bit_demand(AacPsyContext *ctx, float pe, int bits, int size,
     const float bitspend_add   = short_window ? PSY_3GPP_SPEND_ADD_S   : PSY_3GPP_SPEND_ADD_L;
     const float clip_low       = short_window ? PSY_3GPP_CLIP_LO_S     : PSY_3GPP_CLIP_LO_L;
     const float clip_high      = short_window ? PSY_3GPP_CLIP_HI_S     : PSY_3GPP_CLIP_HI_L;
-    float clipped_pe, bit_save, bit_spend, bit_factor, fill_level;
+    float clipped_pe, bit_save, bit_spend, bit_factor, fill_level, forgetful_min_pe;
 
     ctx->fill_level += ctx->frame_bits - bits;
     ctx->fill_level  = av_clip(ctx->fill_level, 0, size);
@@ -497,11 +514,21 @@ static int calc_bit_demand(AacPsyContext *ctx, float pe, int bits, int size,
      * Hopefully below is correct.
      */
     bit_factor = 1.0f - bit_save + ((bit_spend - bit_save) / (ctx->pe.max - ctx->pe.min)) * (clipped_pe - ctx->pe.min);
-    /* NOTE: The reference encoder attempts to center pe max/min around the current pe. */
+    /* NOTE: The reference encoder attempts to center pe max/min around the current pe.
+     * Here we do that by slowly forgetting pe.min when pe stays in a range that makes
+     * it unlikely (i.e. above the mean)
+     */
     ctx->pe.max = FFMAX(pe, ctx->pe.max);
-    ctx->pe.min = FFMIN(pe, ctx->pe.min);
+    forgetful_min_pe = ((ctx->pe.min * PSY_PE_FORGET_SLOPE)
+        + FFMAX(ctx->pe.min, pe * (pe / ctx->pe.max))) / (PSY_PE_FORGET_SLOPE + 1);
+    ctx->pe.min = FFMIN(pe, forgetful_min_pe);
 
-    return FFMIN(ctx->frame_bits * bit_factor, ctx->frame_bits + size - bits);
+    /* NOTE: allocate a minimum of 1/8th average frame bits, to avoid
+     *   reservoir starvation from producing zero-bit frames
+     */
+    return FFMIN(
+        ctx->frame_bits * bit_factor,
+        FFMAX(ctx->frame_bits + size - bits, ctx->frame_bits / 8));
 }
 
 static float calc_pe_3gpp(AacPsyBand *band)
@@ -532,8 +559,11 @@ static float calc_reduction_3gpp(float a, float desired_pe, float pe,
 {
     float thr_avg, reduction;
 
-    thr_avg   = powf(2.0f, (a - pe) / (4.0f * active_lines));
-    reduction = powf(2.0f, (a - desired_pe) / (4.0f * active_lines)) - thr_avg;
+    if(active_lines == 0.0)
+        return 0;
+
+    thr_avg   = exp2f((a - pe) / (4.0f * active_lines));
+    reduction = exp2f((a - desired_pe) / (4.0f * active_lines)) - thr_avg;
 
     return FFMAX(reduction, 0.0f);
 }
@@ -544,8 +574,10 @@ static float calc_reduced_thr_3gpp(AacPsyBand *band, float min_snr,
     float thr = band->thr;
 
     if (band->energy > thr) {
-        thr = powf(thr, 0.25f) + reduction;
-        thr = powf(thr, 4.0f);
+        thr = sqrtf(thr);
+        thr = sqrtf(thr) + reduction;
+        thr *= thr;
+        thr *= thr;
 
         /* This deviates from the 3GPP spec to match the reference encoder.
          * It performs min(thr_reduced, max(thr, energy/min_snr)) only for bands
@@ -561,6 +593,56 @@ static float calc_reduced_thr_3gpp(AacPsyBand *band, float min_snr,
     return thr;
 }
 
+#ifndef calc_thr_3gpp /* compiled only when no macro of this name exists (presumably an arch-specific replacement - confirm) */
+static void calc_thr_3gpp(const FFPsyWindowInfo *wi, const int num_bands, AacPsyChannel *pch,
+                          const uint8_t *band_sizes, const float *coefs, const int cutoff)
+{ /* Fills energy, thr and nz_lines of every band in pch->band[] from coefs, one group of 16 band slots per window. */
+    int i, w, g;
+    int start = 0, wstart = 0; /* start indexes coefs across all windows; wstart restarts at 0 for each window */
+    for (w = 0; w < wi->num_windows*16; w += 16) {
+        wstart = 0;
+        for (g = 0; g < num_bands; g++) {
+            AacPsyBand *band = &pch->band[w+g];
+
+            float form_factor = 0.0f;
+            float Temp;
+            band->energy = 0.0f;
+            if (wstart < cutoff) { /* bands that begin at or beyond cutoff keep zero energy */
+                for (i = 0; i < band_sizes[g]; i++) {
+                    band->energy += coefs[start+i] * coefs[start+i]; /* sum of squared coefficients */
+                    form_factor  += sqrtf(fabs(coefs[start+i]));     /* sum of sqrt(|coef|) */
+                }
+            }
+            Temp = band->energy > 0 ? sqrtf((float)band_sizes[g] / band->energy) : 0; /* the guard avoids dividing by a zero energy */
+            band->thr      = band->energy * 0.001258925f; /* fixed fraction of the energy (presumably 10^-2.9, about -29 dB - confirm) */
+            band->nz_lines = form_factor * sqrtf(Temp);   /* form_factor * (band_size / energy)^(1/4); 0 for a silent band */
+
+            start += band_sizes[g];
+            wstart += band_sizes[g];
+        }
+    }
+}
+#endif /* calc_thr_3gpp */
+
+#ifndef psy_hp_filter /* compiled only when no macro of this name exists (presumably an arch-specific replacement - confirm) */
+static void psy_hp_filter(const float *firbuf, float *hpfsmpl, const float *psy_fir_coeffs)
+{ /* Writes AAC_BLOCK_SIZE_LONG filtered samples to hpfsmpl; output i is built from firbuf[i .. i + PSY_LAME_FIR_LEN]. */
+    int i, j;
+    for (i = 0; i < AAC_BLOCK_SIZE_LONG; i++) {
+        float sum1, sum2;
+        sum1 = firbuf[i + (PSY_LAME_FIR_LEN - 1) / 2]; /* centre sample enters with an implicit weight of 1 */
+        sum2 = 0.0;
+        for (j = 0; j < ((PSY_LAME_FIR_LEN - 1) / 2) - 1; j += 2) { /* two taps per iteration: even index into sum1, odd index into sum2 */
+            sum1 += psy_fir_coeffs[j] * (firbuf[i + j] + firbuf[i + PSY_LAME_FIR_LEN - j]); /* one coefficient weights a pair of mirrored samples */
+            sum2 += psy_fir_coeffs[j + 1] * (firbuf[i + j + 1] + firbuf[i + PSY_LAME_FIR_LEN - j - 1]);
+        }
+        /* NOTE: The LAME psymodel expects its input in the range -32768 to 32768.
+         *       Tuning this for normalized floats would be difficult. */
+        hpfsmpl[i] = (sum1 + sum2) * 32768.0f;
+    }
+}
+#endif /* psy_hp_filter */
+
 /**
  * Calculate band thresholds as suggested in 3GPP TS26.403
  */
@@ -569,33 +651,20 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
 {
     AacPsyContext *pctx = (AacPsyContext*) ctx->model_priv_data;
     AacPsyChannel *pch  = &pctx->ch[channel];
-    int start = 0;
     int i, w, g;
-    float desired_bits, desired_pe, delta_pe, reduction, spread_en[128] = {0};
+    float desired_bits, desired_pe, delta_pe, reduction= NAN, spread_en[128] = {0};
     float a = 0.0f, active_lines = 0.0f, norm_fac = 0.0f;
     float pe = pctx->chan_bitrate > 32000 ? 0.0f : FFMAX(50.0f, 100.0f - pctx->chan_bitrate * 100.0f / 32000.0f);
     const int      num_bands   = ctx->num_bands[wi->num_windows == 8];
     const uint8_t *band_sizes  = ctx->bands[wi->num_windows == 8];
     AacPsyCoeffs  *coeffs      = pctx->psy_coef[wi->num_windows == 8];
     const float avoid_hole_thr = wi->num_windows == 8 ? PSY_3GPP_AH_THR_SHORT : PSY_3GPP_AH_THR_LONG;
+    const int bandwidth        = ctx->cutoff ? ctx->cutoff : AAC_CUTOFF(ctx->avctx);
+    const int cutoff           = bandwidth * 2048 / wi->num_windows / ctx->avctx->sample_rate;
 
     //calculate energies, initial thresholds and related values - 5.4.2 "Threshold Calculation"
-    for (w = 0; w < wi->num_windows*16; w += 16) {
-        for (g = 0; g < num_bands; g++) {
-            AacPsyBand *band = &pch->band[w+g];
+    calc_thr_3gpp(wi, num_bands, pch, band_sizes, coefs, cutoff);
 
-            float form_factor = 0.0f;
-            band->energy = 0.0f;
-            for (i = 0; i < band_sizes[g]; i++) {
-                band->energy += coefs[start+i] * coefs[start+i];
-                form_factor  += sqrtf(fabs(coefs[start+i]));
-            }
-            band->thr      = band->energy * 0.001258925f;
-            band->nz_lines = form_factor / powf(band->energy / band_sizes[g], 0.25f);
-
-            start += band_sizes[g];
-        }
-    }
     //modify thresholds and energies - spread, threshold in quiet, pre-echo control
     for (w = 0; w < wi->num_windows*16; w += 16) {
         AacPsyBand *bands = &pch->band[w];
@@ -616,7 +685,7 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
 
             band->thr_quiet = band->thr = FFMAX(band->thr, coeffs[g].ath);
             //5.4.2.5 "Pre-echo control"
-            if (!(wi->window_type[0] == LONG_STOP_SEQUENCE || (wi->window_type[1] == LONG_START_SEQUENCE && !w)))
+            if (!(wi->window_type[0] == LONG_STOP_SEQUENCE || (!w && wi->window_type[1] == LONG_START_SEQUENCE)))
                 band->thr = FFMAX(PSY_3GPP_RPEMIN*band->thr, FFMIN(band->thr,
                                   PSY_3GPP_RPELEV*pch->prev_band[w+g].thr_quiet));
 
@@ -635,16 +704,36 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
 
     /* 5.6.1.3.2 "Calculation of the desired perceptual entropy" */
     ctx->ch[channel].entropy = pe;
-    desired_bits = calc_bit_demand(pctx, pe, ctx->bitres.bits, ctx->bitres.size, wi->num_windows == 8);
-    desired_pe = PSY_3GPP_BITS_TO_PE(desired_bits);
-    /* NOTE: PE correction is kept simple. During initial testing it had very
-     *       little effect on the final bitrate. Probably a good idea to come
-     *       back and do more testing later.
-     */
-    if (ctx->bitres.bits > 0)
-        desired_pe *= av_clipf(pctx->pe.previous / PSY_3GPP_BITS_TO_PE(ctx->bitres.bits),
-                               0.85f, 1.15f);
+    if (ctx->avctx->flags & CODEC_FLAG_QSCALE) {
+        /* (2.5 * 120) achieves almost transparent rate, and we want to give
+         * ample room downwards, so we make that equivalent to QSCALE=2.4
+         */
+        desired_pe = pe * (ctx->avctx->global_quality ? ctx->avctx->global_quality : 120) / (2 * 2.5f * 120.0f);
+        desired_bits = FFMIN(2560, PSY_3GPP_PE_TO_BITS(desired_pe));
+        desired_pe = PSY_3GPP_BITS_TO_PE(desired_bits); // reflect clipping
+
+        /* PE slope smoothing */
+        if (ctx->bitres.bits > 0) {
+            desired_bits = FFMIN(2560, PSY_3GPP_PE_TO_BITS(desired_pe));
+            desired_pe = PSY_3GPP_BITS_TO_PE(desired_bits); // reflect clipping
+        }
+
+        pctx->pe.max = FFMAX(pe, pctx->pe.max);
+        pctx->pe.min = FFMIN(pe, pctx->pe.min);
+    } else {
+        desired_bits = calc_bit_demand(pctx, pe, ctx->bitres.bits, ctx->bitres.size, wi->num_windows == 8);
+        desired_pe = PSY_3GPP_BITS_TO_PE(desired_bits);
+
+        /* NOTE: PE correction is kept simple. During initial testing it had very
+         *       little effect on the final bitrate. Probably a good idea to come
+         *       back and do more testing later.
+         */
+        if (ctx->bitres.bits > 0)
+            desired_pe *= av_clipf(pctx->pe.previous / PSY_3GPP_BITS_TO_PE(ctx->bitres.bits),
+                                   0.85f, 1.15f);
+    }
     pctx->pe.previous = PSY_3GPP_BITS_TO_PE(desired_bits);
+    ctx->bitres.alloc = desired_bits;
 
     if (desired_pe < pe) {
         /* 5.6.1.3.4 "First Estimation of the reduction value" */
@@ -681,7 +770,7 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
             }
             desired_pe_no_ah = FFMAX(desired_pe - (pe - pe_no_ah), 0.0f);
             if (active_lines > 0.0f)
-                reduction += calc_reduction_3gpp(a, desired_pe_no_ah, pe_no_ah, active_lines);
+                reduction = calc_reduction_3gpp(a, desired_pe_no_ah, pe_no_ah, active_lines);
 
             pe = 0.0f;
             for (w = 0; w < wi->num_windows*16; w += 16) {
@@ -691,7 +780,10 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
                     if (active_lines > 0.0f)
                         band->thr = calc_reduced_thr_3gpp(band, coeffs[g].min_snr, reduction);
                     pe += calc_pe_3gpp(band);
-                    band->norm_fac = band->active_lines / band->thr;
+                    if (band->thr > 0.0f)
+                        band->norm_fac = band->active_lines / band->thr;
+                    else
+                        band->norm_fac = 0.0f;
                     norm_fac += band->norm_fac;
                 }
             }
@@ -711,7 +803,7 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
                         float delta_sfb_pe = band->norm_fac * norm_fac * delta_pe;
                         float thr = band->thr;
 
-                        thr *= powf(2.0f, delta_sfb_pe / band->active_lines);
+                        thr *= exp2f(delta_sfb_pe / band->active_lines);
                         if (thr > coeffs[g].min_snr * band->energy && band->avoid_holes == PSY_3GPP_AH_INACTIVE)
                             thr = FFMAX(band->thr, coeffs[g].min_snr * band->energy);
                         band->thr = thr;
@@ -742,6 +834,8 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
 
             psy_band->threshold = band->thr;
             psy_band->energy    = band->energy;
+            psy_band->spread    = band->active_lines * 2.0f / band_sizes[g];
+            psy_band->bits      = PSY_3GPP_PE_TO_BITS(band->pe);
         }
     }
 
@@ -801,21 +895,10 @@ static FFPsyWindowInfo psy_lame_window(FFPsyContext *ctx, const float *audio,
         float energy_subshort[(AAC_NUM_BLOCKS_SHORT + 1) * PSY_LAME_NUM_SUBBLOCKS];
         float energy_short[AAC_NUM_BLOCKS_SHORT + 1] = { 0 };
         const float *firbuf = la + (AAC_BLOCK_SIZE_SHORT/4 - PSY_LAME_FIR_LEN);
-        int j, att_sum = 0;
+        int att_sum = 0;
 
         /* LAME comment: apply high pass filter of fs/4 */
-        for (i = 0; i < AAC_BLOCK_SIZE_LONG; i++) {
-            float sum1, sum2;
-            sum1 = firbuf[i + (PSY_LAME_FIR_LEN - 1) / 2];
-            sum2 = 0.0;
-            for (j = 0; j < ((PSY_LAME_FIR_LEN - 1) / 2) - 1; j += 2) {
-                sum1 += psy_fir_coeffs[j] * (firbuf[i + j] + firbuf[i + PSY_LAME_FIR_LEN - j]);
-                sum2 += psy_fir_coeffs[j + 1] * (firbuf[i + j + 1] + firbuf[i + PSY_LAME_FIR_LEN - j - 1]);
-            }
-            /* NOTE: The LAME psymodel expects its input in the range -32768 to
-             * 32768. Tuning this for normalized floats would be difficult. */
-            hpfsmpl[i] = (sum1 + sum2) * 32768.0f;
-        }
+        psy_hp_filter(firbuf, hpfsmpl, psy_fir_coeffs);
 
         /* Calculate the energies of each sub-shortblock */
         for (i = 0; i < PSY_LAME_NUM_SUBBLOCKS; i++) {
@@ -893,12 +976,14 @@ static FFPsyWindowInfo psy_lame_window(FFPsyContext *ctx, const float *audio,
 
     wi.window_type[1] = prev_type;
     if (wi.window_type[0] != EIGHT_SHORT_SEQUENCE) {
+
         wi.num_windows  = 1;
         wi.grouping[0]  = 1;
         if (wi.window_type[0] == LONG_START_SEQUENCE)
             wi.window_shape = 0;
         else
             wi.window_shape = 1;
+
     } else {
         int lastgrp = 0;
 
diff --git a/libavcodec/aacsbr.c b/libavcodec/aacsbr.c
index 3e3432c..15956e3 100644
--- a/libavcodec/aacsbr.c
+++ b/libavcodec/aacsbr.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2008-2009 Robert Swain ( rob opendot cl )
  * Copyright (c) 2009-2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,263 +25,31 @@
  * AAC Spectral Band Replication decoding functions
  * @author Robert Swain ( rob opendot cl )
  */
+#define USE_FIXED 0
 
 #include "aac.h"
 #include "sbr.h"
 #include "aacsbr.h"
 #include "aacsbrdata.h"
+#include "aacsbr_tablegen.h"
 #include "fft.h"
+#include "internal.h"
 #include "aacps.h"
 #include "sbrdsp.h"
 #include "libavutil/internal.h"
 #include "libavutil/libm.h"
+#include "libavutil/avassert.h"
 
 #include <stdint.h>
 #include <float.h>
+#include <math.h>
 
-#define ENVELOPE_ADJUSTMENT_OFFSET 2
-#define NOISE_FLOOR_OFFSET 6.0f
-
-/**
- * SBR VLC tables
- */
-enum {
-    T_HUFFMAN_ENV_1_5DB,
-    F_HUFFMAN_ENV_1_5DB,
-    T_HUFFMAN_ENV_BAL_1_5DB,
-    F_HUFFMAN_ENV_BAL_1_5DB,
-    T_HUFFMAN_ENV_3_0DB,
-    F_HUFFMAN_ENV_3_0DB,
-    T_HUFFMAN_ENV_BAL_3_0DB,
-    F_HUFFMAN_ENV_BAL_3_0DB,
-    T_HUFFMAN_NOISE_3_0DB,
-    T_HUFFMAN_NOISE_BAL_3_0DB,
-};
-
-/**
- * bs_frame_class - frame class of current SBR frame (14496-3 sp04 p98)
- */
-enum {
-    FIXFIX,
-    FIXVAR,
-    VARFIX,
-    VARVAR,
-};
-
-enum {
-    EXTENSION_ID_PS = 2,
-};
+#if ARCH_MIPS
+#include "mips/aacsbr_mips.h"
+#endif /* ARCH_MIPS */
 
 static VLC vlc_sbr[10];
-static const int8_t vlc_sbr_lav[10] =
-    { 60, 60, 24, 24, 31, 31, 12, 12, 31, 12 };
-
-#define SBR_INIT_VLC_STATIC(num, size) \
-    INIT_VLC_STATIC(&vlc_sbr[num], 9, sbr_tmp[num].table_size / sbr_tmp[num].elem_size,     \
-                    sbr_tmp[num].sbr_bits ,                      1,                      1, \
-                    sbr_tmp[num].sbr_codes, sbr_tmp[num].elem_size, sbr_tmp[num].elem_size, \
-                    size)
-
-#define SBR_VLC_ROW(name) \
-    { name ## _codes, name ## _bits, sizeof(name ## _codes), sizeof(name ## _codes[0]) }
-
-av_cold void ff_aac_sbr_init(void)
-{
-    int n;
-    static const struct {
-        const void *sbr_codes, *sbr_bits;
-        const unsigned int table_size, elem_size;
-    } sbr_tmp[] = {
-        SBR_VLC_ROW(t_huffman_env_1_5dB),
-        SBR_VLC_ROW(f_huffman_env_1_5dB),
-        SBR_VLC_ROW(t_huffman_env_bal_1_5dB),
-        SBR_VLC_ROW(f_huffman_env_bal_1_5dB),
-        SBR_VLC_ROW(t_huffman_env_3_0dB),
-        SBR_VLC_ROW(f_huffman_env_3_0dB),
-        SBR_VLC_ROW(t_huffman_env_bal_3_0dB),
-        SBR_VLC_ROW(f_huffman_env_bal_3_0dB),
-        SBR_VLC_ROW(t_huffman_noise_3_0dB),
-        SBR_VLC_ROW(t_huffman_noise_bal_3_0dB),
-    };
-
-    // SBR VLC table initialization
-    SBR_INIT_VLC_STATIC(0, 1098);
-    SBR_INIT_VLC_STATIC(1, 1092);
-    SBR_INIT_VLC_STATIC(2, 768);
-    SBR_INIT_VLC_STATIC(3, 1026);
-    SBR_INIT_VLC_STATIC(4, 1058);
-    SBR_INIT_VLC_STATIC(5, 1052);
-    SBR_INIT_VLC_STATIC(6, 544);
-    SBR_INIT_VLC_STATIC(7, 544);
-    SBR_INIT_VLC_STATIC(8, 592);
-    SBR_INIT_VLC_STATIC(9, 512);
-
-    for (n = 1; n < 320; n++)
-        sbr_qmf_window_us[320 + n] = sbr_qmf_window_us[320 - n];
-    sbr_qmf_window_us[384] = -sbr_qmf_window_us[384];
-    sbr_qmf_window_us[512] = -sbr_qmf_window_us[512];
-
-    for (n = 0; n < 320; n++)
-        sbr_qmf_window_ds[n] = sbr_qmf_window_us[2*n];
-
-    ff_ps_init();
-}
-
-/** Places SBR in pure upsampling mode. */
-static void sbr_turnoff(SpectralBandReplication *sbr) {
-    sbr->start = 0;
-    // Init defaults used in pure upsampling mode
-    sbr->kx[1] = 32; //Typo in spec, kx' inits to 32
-    sbr->m[1] = 0;
-    // Reset values for first SBR header
-    sbr->data[0].e_a[1] = sbr->data[1].e_a[1] = -1;
-    memset(&sbr->spectrum_params, -1, sizeof(SpectrumParameters));
-}
-
-av_cold void ff_aac_sbr_ctx_init(AACContext *ac, SpectralBandReplication *sbr)
-{
-    sbr->kx[0] = sbr->kx[1];
-    sbr_turnoff(sbr);
-    sbr->data[0].synthesis_filterbank_samples_offset = SBR_SYNTHESIS_BUF_SIZE - (1280 - 128);
-    sbr->data[1].synthesis_filterbank_samples_offset = SBR_SYNTHESIS_BUF_SIZE - (1280 - 128);
-    /* SBR requires samples to be scaled to +/-32768.0 to work correctly.
-     * mdct scale factors are adjusted to scale up from +/-1.0 at analysis
-     * and scale back down at synthesis. */
-    ff_mdct_init(&sbr->mdct,     7, 1, 1.0 / (64 * 32768.0));
-    ff_mdct_init(&sbr->mdct_ana, 7, 1, -2.0 * 32768.0);
-    ff_ps_ctx_init(&sbr->ps);
-    ff_sbrdsp_init(&sbr->dsp);
-}
-
-av_cold void ff_aac_sbr_ctx_close(SpectralBandReplication *sbr)
-{
-    ff_mdct_end(&sbr->mdct);
-    ff_mdct_end(&sbr->mdct_ana);
-}
-
-static int qsort_comparison_function_int16(const void *a, const void *b)
-{
-    return *(const int16_t *)a - *(const int16_t *)b;
-}
-
-static inline int in_table_int16(const int16_t *table, int last_el, int16_t needle)
-{
-    int i;
-    for (i = 0; i <= last_el; i++)
-        if (table[i] == needle)
-            return 1;
-    return 0;
-}
-
-/// Limiter Frequency Band Table (14496-3 sp04 p198)
-static void sbr_make_f_tablelim(SpectralBandReplication *sbr)
-{
-    int k;
-    if (sbr->bs_limiter_bands > 0) {
-        static const float bands_warped[3] = { 1.32715174233856803909f,   //2^(0.49/1.2)
-                                               1.18509277094158210129f,   //2^(0.49/2)
-                                               1.11987160404675912501f }; //2^(0.49/3)
-        const float lim_bands_per_octave_warped = bands_warped[sbr->bs_limiter_bands - 1];
-        int16_t patch_borders[7];
-        uint16_t *in = sbr->f_tablelim + 1, *out = sbr->f_tablelim;
-
-        patch_borders[0] = sbr->kx[1];
-        for (k = 1; k <= sbr->num_patches; k++)
-            patch_borders[k] = patch_borders[k-1] + sbr->patch_num_subbands[k-1];
-
-        memcpy(sbr->f_tablelim, sbr->f_tablelow,
-               (sbr->n[0] + 1) * sizeof(sbr->f_tablelow[0]));
-        if (sbr->num_patches > 1)
-            memcpy(sbr->f_tablelim + sbr->n[0] + 1, patch_borders + 1,
-                   (sbr->num_patches - 1) * sizeof(patch_borders[0]));
-
-        qsort(sbr->f_tablelim, sbr->num_patches + sbr->n[0],
-              sizeof(sbr->f_tablelim[0]),
-              qsort_comparison_function_int16);
-
-        sbr->n_lim = sbr->n[0] + sbr->num_patches - 1;
-        while (out < sbr->f_tablelim + sbr->n_lim) {
-            if (*in >= *out * lim_bands_per_octave_warped) {
-                *++out = *in++;
-            } else if (*in == *out ||
-                !in_table_int16(patch_borders, sbr->num_patches, *in)) {
-                in++;
-                sbr->n_lim--;
-            } else if (!in_table_int16(patch_borders, sbr->num_patches, *out)) {
-                *out = *in++;
-                sbr->n_lim--;
-            } else {
-                *++out = *in++;
-            }
-        }
-    } else {
-        sbr->f_tablelim[0] = sbr->f_tablelow[0];
-        sbr->f_tablelim[1] = sbr->f_tablelow[sbr->n[0]];
-        sbr->n_lim = 1;
-    }
-}
-
-static unsigned int read_sbr_header(SpectralBandReplication *sbr, GetBitContext *gb)
-{
-    unsigned int cnt = get_bits_count(gb);
-    uint8_t bs_header_extra_1;
-    uint8_t bs_header_extra_2;
-    int old_bs_limiter_bands = sbr->bs_limiter_bands;
-    SpectrumParameters old_spectrum_params;
-
-    sbr->start = 1;
-
-    // Save last spectrum parameters variables to compare to new ones
-    memcpy(&old_spectrum_params, &sbr->spectrum_params, sizeof(SpectrumParameters));
-
-    sbr->bs_amp_res_header              = get_bits1(gb);
-    sbr->spectrum_params.bs_start_freq  = get_bits(gb, 4);
-    sbr->spectrum_params.bs_stop_freq   = get_bits(gb, 4);
-    sbr->spectrum_params.bs_xover_band  = get_bits(gb, 3);
-                                          skip_bits(gb, 2); // bs_reserved
-
-    bs_header_extra_1 = get_bits1(gb);
-    bs_header_extra_2 = get_bits1(gb);
-
-    if (bs_header_extra_1) {
-        sbr->spectrum_params.bs_freq_scale  = get_bits(gb, 2);
-        sbr->spectrum_params.bs_alter_scale = get_bits1(gb);
-        sbr->spectrum_params.bs_noise_bands = get_bits(gb, 2);
-    } else {
-        sbr->spectrum_params.bs_freq_scale  = 2;
-        sbr->spectrum_params.bs_alter_scale = 1;
-        sbr->spectrum_params.bs_noise_bands = 2;
-    }
-
-    // Check if spectrum parameters changed
-    if (memcmp(&old_spectrum_params, &sbr->spectrum_params, sizeof(SpectrumParameters)))
-        sbr->reset = 1;
-
-    if (bs_header_extra_2) {
-        sbr->bs_limiter_bands  = get_bits(gb, 2);
-        sbr->bs_limiter_gains  = get_bits(gb, 2);
-        sbr->bs_interpol_freq  = get_bits1(gb);
-        sbr->bs_smoothing_mode = get_bits1(gb);
-    } else {
-        sbr->bs_limiter_bands  = 2;
-        sbr->bs_limiter_gains  = 2;
-        sbr->bs_interpol_freq  = 1;
-        sbr->bs_smoothing_mode = 1;
-    }
-
-    if (sbr->bs_limiter_bands != old_bs_limiter_bands && !sbr->reset)
-        sbr_make_f_tablelim(sbr);
-
-    return get_bits_count(gb) - cnt;
-}
-
-static int array_min_int16(const int16_t *array, int nel)
-{
-    int i, min = array[0];
-    for (i = 1; i < nel; i++)
-        min = FFMIN(array[i], min);
-    return min;
-}
+static void aacsbr_func_ptr_init(AACSBRContext *c);
 
 static void make_bands(int16_t* bands, int start, int stop, int num_bands)
 {
@@ -301,928 +69,70 @@ static void make_bands(int16_t* bands, int start, int stop, int num_bands)
     bands[num_bands-1] = stop - previous;
 }
 
-static int check_n_master(AVCodecContext *avctx, int n_master, int bs_xover_band)
-{
-    // Requirements (14496-3 sp04 p205)
-    if (n_master <= 0) {
-        av_log(avctx, AV_LOG_ERROR, "Invalid n_master: %d\n", n_master);
-        return -1;
-    }
-    if (bs_xover_band >= n_master) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Invalid bitstream, crossover band index beyond array bounds: %d\n",
-               bs_xover_band);
-        return -1;
-    }
-    return 0;
-}
-
-/// Master Frequency Band Table (14496-3 sp04 p194)
-static int sbr_make_f_master(AACContext *ac, SpectralBandReplication *sbr,
-                             SpectrumParameters *spectrum)
-{
-    unsigned int temp, max_qmf_subbands = 0;
-    unsigned int start_min, stop_min;
-    int k;
-    const int8_t *sbr_offset_ptr;
-    int16_t stop_dk[13];
-
-    if (sbr->sample_rate < 32000) {
-        temp = 3000;
-    } else if (sbr->sample_rate < 64000) {
-        temp = 4000;
-    } else
-        temp = 5000;
-
-    start_min = ((temp << 7) + (sbr->sample_rate >> 1)) / sbr->sample_rate;
-    stop_min  = ((temp << 8) + (sbr->sample_rate >> 1)) / sbr->sample_rate;
-
-    switch (sbr->sample_rate) {
-    case 16000:
-        sbr_offset_ptr = sbr_offset[0];
-        break;
-    case 22050:
-        sbr_offset_ptr = sbr_offset[1];
-        break;
-    case 24000:
-        sbr_offset_ptr = sbr_offset[2];
-        break;
-    case 32000:
-        sbr_offset_ptr = sbr_offset[3];
-        break;
-    case 44100: case 48000: case 64000:
-        sbr_offset_ptr = sbr_offset[4];
-        break;
-    case 88200: case 96000: case 128000: case 176400: case 192000:
-        sbr_offset_ptr = sbr_offset[5];
-        break;
-    default:
-        av_log(ac->avctx, AV_LOG_ERROR,
-               "Unsupported sample rate for SBR: %d\n", sbr->sample_rate);
-        return -1;
-    }
-
-    sbr->k[0] = start_min + sbr_offset_ptr[spectrum->bs_start_freq];
-
-    if (spectrum->bs_stop_freq < 14) {
-        sbr->k[2] = stop_min;
-        make_bands(stop_dk, stop_min, 64, 13);
-        qsort(stop_dk, 13, sizeof(stop_dk[0]), qsort_comparison_function_int16);
-        for (k = 0; k < spectrum->bs_stop_freq; k++)
-            sbr->k[2] += stop_dk[k];
-    } else if (spectrum->bs_stop_freq == 14) {
-        sbr->k[2] = 2*sbr->k[0];
-    } else if (spectrum->bs_stop_freq == 15) {
-        sbr->k[2] = 3*sbr->k[0];
-    } else {
-        av_log(ac->avctx, AV_LOG_ERROR,
-               "Invalid bs_stop_freq: %d\n", spectrum->bs_stop_freq);
-        return -1;
-    }
-    sbr->k[2] = FFMIN(64, sbr->k[2]);
-
-    // Requirements (14496-3 sp04 p205)
-    if (sbr->sample_rate <= 32000) {
-        max_qmf_subbands = 48;
-    } else if (sbr->sample_rate == 44100) {
-        max_qmf_subbands = 35;
-    } else if (sbr->sample_rate >= 48000)
-        max_qmf_subbands = 32;
-
-    if (sbr->k[2] - sbr->k[0] > max_qmf_subbands) {
-        av_log(ac->avctx, AV_LOG_ERROR,
-               "Invalid bitstream, too many QMF subbands: %d\n", sbr->k[2] - sbr->k[0]);
-        return -1;
-    }
-
-    if (!spectrum->bs_freq_scale) {
-        int dk, k2diff;
-
-        dk = spectrum->bs_alter_scale + 1;
-        sbr->n_master = ((sbr->k[2] - sbr->k[0] + (dk&2)) >> dk) << 1;
-        if (check_n_master(ac->avctx, sbr->n_master, sbr->spectrum_params.bs_xover_band))
-            return -1;
-
-        for (k = 1; k <= sbr->n_master; k++)
-            sbr->f_master[k] = dk;
-
-        k2diff = sbr->k[2] - sbr->k[0] - sbr->n_master * dk;
-        if (k2diff < 0) {
-            sbr->f_master[1]--;
-            sbr->f_master[2]-= (k2diff < -1);
-        } else if (k2diff) {
-            sbr->f_master[sbr->n_master]++;
-        }
-
-        sbr->f_master[0] = sbr->k[0];
-        for (k = 1; k <= sbr->n_master; k++)
-            sbr->f_master[k] += sbr->f_master[k - 1];
-
-    } else {
-        int half_bands = 7 - spectrum->bs_freq_scale;      // bs_freq_scale  = {1,2,3}
-        int two_regions, num_bands_0;
-        int vdk0_max, vdk1_min;
-        int16_t vk0[49];
-
-        if (49 * sbr->k[2] > 110 * sbr->k[0]) {
-            two_regions = 1;
-            sbr->k[1] = 2 * sbr->k[0];
-        } else {
-            two_regions = 0;
-            sbr->k[1] = sbr->k[2];
-        }
-
-        num_bands_0 = lrintf(half_bands * log2f(sbr->k[1] / (float)sbr->k[0])) * 2;
-
-        if (num_bands_0 <= 0) { // Requirements (14496-3 sp04 p205)
-            av_log(ac->avctx, AV_LOG_ERROR, "Invalid num_bands_0: %d\n", num_bands_0);
-            return -1;
-        }
-
-        vk0[0] = 0;
-
-        make_bands(vk0+1, sbr->k[0], sbr->k[1], num_bands_0);
-
-        qsort(vk0 + 1, num_bands_0, sizeof(vk0[1]), qsort_comparison_function_int16);
-        vdk0_max = vk0[num_bands_0];
-
-        vk0[0] = sbr->k[0];
-        for (k = 1; k <= num_bands_0; k++) {
-            if (vk0[k] <= 0) { // Requirements (14496-3 sp04 p205)
-                av_log(ac->avctx, AV_LOG_ERROR, "Invalid vDk0[%d]: %d\n", k, vk0[k]);
-                return -1;
-            }
-            vk0[k] += vk0[k-1];
-        }
-
-        if (two_regions) {
-            int16_t vk1[49];
-            float invwarp = spectrum->bs_alter_scale ? 0.76923076923076923077f
-                                                     : 1.0f; // bs_alter_scale = {0,1}
-            int num_bands_1 = lrintf(half_bands * invwarp *
-                                     log2f(sbr->k[2] / (float)sbr->k[1])) * 2;
-
-            make_bands(vk1+1, sbr->k[1], sbr->k[2], num_bands_1);
-
-            vdk1_min = array_min_int16(vk1 + 1, num_bands_1);
-
-            if (vdk1_min < vdk0_max) {
-                int change;
-                qsort(vk1 + 1, num_bands_1, sizeof(vk1[1]), qsort_comparison_function_int16);
-                change = FFMIN(vdk0_max - vk1[1], (vk1[num_bands_1] - vk1[1]) >> 1);
-                vk1[1]           += change;
-                vk1[num_bands_1] -= change;
-            }
-
-            qsort(vk1 + 1, num_bands_1, sizeof(vk1[1]), qsort_comparison_function_int16);
-
-            vk1[0] = sbr->k[1];
-            for (k = 1; k <= num_bands_1; k++) {
-                if (vk1[k] <= 0) { // Requirements (14496-3 sp04 p205)
-                    av_log(ac->avctx, AV_LOG_ERROR, "Invalid vDk1[%d]: %d\n", k, vk1[k]);
-                    return -1;
-                }
-                vk1[k] += vk1[k-1];
-            }
-
-            sbr->n_master = num_bands_0 + num_bands_1;
-            if (check_n_master(ac->avctx, sbr->n_master, sbr->spectrum_params.bs_xover_band))
-                return -1;
-            memcpy(&sbr->f_master[0],               vk0,
-                   (num_bands_0 + 1) * sizeof(sbr->f_master[0]));
-            memcpy(&sbr->f_master[num_bands_0 + 1], vk1 + 1,
-                    num_bands_1      * sizeof(sbr->f_master[0]));
-
-        } else {
-            sbr->n_master = num_bands_0;
-            if (check_n_master(ac->avctx, sbr->n_master, sbr->spectrum_params.bs_xover_band))
-                return -1;
-            memcpy(sbr->f_master, vk0, (num_bands_0 + 1) * sizeof(sbr->f_master[0]));
-        }
-    }
-
-    return 0;
-}
-
-/// High Frequency Generation - Patch Construction (14496-3 sp04 p216 fig. 4.46)
-static int sbr_hf_calc_npatches(AACContext *ac, SpectralBandReplication *sbr)
-{
-    int i, k, sb = 0;
-    int msb = sbr->k[0];
-    int usb = sbr->kx[1];
-    int goal_sb = ((1000 << 11) + (sbr->sample_rate >> 1)) / sbr->sample_rate;
-
-    sbr->num_patches = 0;
-
-    if (goal_sb < sbr->kx[1] + sbr->m[1]) {
-        for (k = 0; sbr->f_master[k] < goal_sb; k++) ;
-    } else
-        k = sbr->n_master;
-
-    do {
-        int odd = 0;
-        for (i = k; i == k || sb > (sbr->k[0] - 1 + msb - odd); i--) {
-            sb = sbr->f_master[i];
-            odd = (sb + sbr->k[0]) & 1;
-        }
-
-        // Requirements (14496-3 sp04 p205) sets the maximum number of patches to 5.
-        // After this check the final number of patches can still be six which is
-        // illegal however the Coding Technologies decoder check stream has a final
-        // count of 6 patches
-        if (sbr->num_patches > 5) {
-            av_log(ac->avctx, AV_LOG_ERROR, "Too many patches: %d\n", sbr->num_patches);
-            return -1;
-        }
-
-        sbr->patch_num_subbands[sbr->num_patches]  = FFMAX(sb - usb, 0);
-        sbr->patch_start_subband[sbr->num_patches] = sbr->k[0] - odd - sbr->patch_num_subbands[sbr->num_patches];
-
-        if (sbr->patch_num_subbands[sbr->num_patches] > 0) {
-            usb = sb;
-            msb = sb;
-            sbr->num_patches++;
-        } else
-            msb = sbr->kx[1];
-
-        if (sbr->f_master[k] - sb < 3)
-            k = sbr->n_master;
-    } while (sb != sbr->kx[1] + sbr->m[1]);
-
-    if (sbr->num_patches > 1 &&
-        sbr->patch_num_subbands[sbr->num_patches - 1] < 3)
-        sbr->num_patches--;
-
-    return 0;
-}
-
-/// Derived Frequency Band Tables (14496-3 sp04 p197)
-static int sbr_make_f_derived(AACContext *ac, SpectralBandReplication *sbr)
-{
-    int k, temp;
-
-    sbr->n[1] = sbr->n_master - sbr->spectrum_params.bs_xover_band;
-    sbr->n[0] = (sbr->n[1] + 1) >> 1;
-
-    memcpy(sbr->f_tablehigh, &sbr->f_master[sbr->spectrum_params.bs_xover_band],
-           (sbr->n[1] + 1) * sizeof(sbr->f_master[0]));
-    sbr->m[1] = sbr->f_tablehigh[sbr->n[1]] - sbr->f_tablehigh[0];
-    sbr->kx[1] = sbr->f_tablehigh[0];
-
-    // Requirements (14496-3 sp04 p205)
-    if (sbr->kx[1] + sbr->m[1] > 64) {
-        av_log(ac->avctx, AV_LOG_ERROR,
-               "Stop frequency border too high: %d\n", sbr->kx[1] + sbr->m[1]);
-        return -1;
-    }
-    if (sbr->kx[1] > 32) {
-        av_log(ac->avctx, AV_LOG_ERROR, "Start frequency border too high: %d\n", sbr->kx[1]);
-        return -1;
-    }
-
-    sbr->f_tablelow[0] = sbr->f_tablehigh[0];
-    temp = sbr->n[1] & 1;
-    for (k = 1; k <= sbr->n[0]; k++)
-        sbr->f_tablelow[k] = sbr->f_tablehigh[2 * k - temp];
-
-    sbr->n_q = FFMAX(1, lrintf(sbr->spectrum_params.bs_noise_bands *
-                               log2f(sbr->k[2] / (float)sbr->kx[1]))); // 0 <= bs_noise_bands <= 3
-    if (sbr->n_q > 5) {
-        av_log(ac->avctx, AV_LOG_ERROR, "Too many noise floor scale factors: %d\n", sbr->n_q);
-        return -1;
-    }
-
-    sbr->f_tablenoise[0] = sbr->f_tablelow[0];
-    temp = 0;
-    for (k = 1; k <= sbr->n_q; k++) {
-        temp += (sbr->n[0] - temp) / (sbr->n_q + 1 - k);
-        sbr->f_tablenoise[k] = sbr->f_tablelow[temp];
-    }
-
-    if (sbr_hf_calc_npatches(ac, sbr) < 0)
-        return -1;
-
-    sbr_make_f_tablelim(sbr);
-
-    sbr->data[0].f_indexnoise = 0;
-    sbr->data[1].f_indexnoise = 0;
-
-    return 0;
-}
-
-static av_always_inline void get_bits1_vector(GetBitContext *gb, uint8_t *vec,
-                                              int elements)
-{
-    int i;
-    for (i = 0; i < elements; i++) {
-        vec[i] = get_bits1(gb);
-    }
-}
-
-/** ceil(log2(index+1)) */
-static const int8_t ceil_log2[] = {
-    0, 1, 2, 2, 3, 3,
-};
-
-static int read_sbr_grid(AACContext *ac, SpectralBandReplication *sbr,
-                         GetBitContext *gb, SBRData *ch_data)
-{
-    int i;
-    int bs_pointer = 0;
-    // frameLengthFlag ? 15 : 16; 960 sample length frames unsupported; this value is numTimeSlots
-    int abs_bord_trail = 16;
-    int num_rel_lead, num_rel_trail;
-    unsigned bs_num_env_old = ch_data->bs_num_env;
-
-    ch_data->bs_freq_res[0] = ch_data->bs_freq_res[ch_data->bs_num_env];
-    ch_data->bs_amp_res = sbr->bs_amp_res_header;
-    ch_data->t_env_num_env_old = ch_data->t_env[bs_num_env_old];
-
-    switch (ch_data->bs_frame_class = get_bits(gb, 2)) {
-    case FIXFIX:
-        ch_data->bs_num_env                 = 1 << get_bits(gb, 2);
-        num_rel_lead                        = ch_data->bs_num_env - 1;
-        if (ch_data->bs_num_env == 1)
-            ch_data->bs_amp_res = 0;
-
-        if (ch_data->bs_num_env > 4) {
-            av_log(ac->avctx, AV_LOG_ERROR,
-                   "Invalid bitstream, too many SBR envelopes in FIXFIX type SBR frame: %d\n",
-                   ch_data->bs_num_env);
-            return -1;
-        }
-
-        ch_data->t_env[0]                   = 0;
-        ch_data->t_env[ch_data->bs_num_env] = abs_bord_trail;
-
-        abs_bord_trail = (abs_bord_trail + (ch_data->bs_num_env >> 1)) /
-                   ch_data->bs_num_env;
-        for (i = 0; i < num_rel_lead; i++)
-            ch_data->t_env[i + 1] = ch_data->t_env[i] + abs_bord_trail;
-
-        ch_data->bs_freq_res[1] = get_bits1(gb);
-        for (i = 1; i < ch_data->bs_num_env; i++)
-            ch_data->bs_freq_res[i + 1] = ch_data->bs_freq_res[1];
-        break;
-    case FIXVAR:
-        abs_bord_trail                     += get_bits(gb, 2);
-        num_rel_trail                       = get_bits(gb, 2);
-        ch_data->bs_num_env                 = num_rel_trail + 1;
-        ch_data->t_env[0]                   = 0;
-        ch_data->t_env[ch_data->bs_num_env] = abs_bord_trail;
-
-        for (i = 0; i < num_rel_trail; i++)
-            ch_data->t_env[ch_data->bs_num_env - 1 - i] =
-                ch_data->t_env[ch_data->bs_num_env - i] - 2 * get_bits(gb, 2) - 2;
-
-        bs_pointer = get_bits(gb, ceil_log2[ch_data->bs_num_env]);
-
-        for (i = 0; i < ch_data->bs_num_env; i++)
-            ch_data->bs_freq_res[ch_data->bs_num_env - i] = get_bits1(gb);
-        break;
-    case VARFIX:
-        ch_data->t_env[0]                   = get_bits(gb, 2);
-        num_rel_lead                        = get_bits(gb, 2);
-        ch_data->bs_num_env                 = num_rel_lead + 1;
-        ch_data->t_env[ch_data->bs_num_env] = abs_bord_trail;
-
-        for (i = 0; i < num_rel_lead; i++)
-            ch_data->t_env[i + 1] = ch_data->t_env[i] + 2 * get_bits(gb, 2) + 2;
-
-        bs_pointer = get_bits(gb, ceil_log2[ch_data->bs_num_env]);
-
-        get_bits1_vector(gb, ch_data->bs_freq_res + 1, ch_data->bs_num_env);
-        break;
-    case VARVAR:
-        ch_data->t_env[0]                   = get_bits(gb, 2);
-        abs_bord_trail                     += get_bits(gb, 2);
-        num_rel_lead                        = get_bits(gb, 2);
-        num_rel_trail                       = get_bits(gb, 2);
-        ch_data->bs_num_env                 = num_rel_lead + num_rel_trail + 1;
-
-        if (ch_data->bs_num_env > 5) {
-            av_log(ac->avctx, AV_LOG_ERROR,
-                   "Invalid bitstream, too many SBR envelopes in VARVAR type SBR frame: %d\n",
-                   ch_data->bs_num_env);
-            return -1;
-        }
-
-        ch_data->t_env[ch_data->bs_num_env] = abs_bord_trail;
-
-        for (i = 0; i < num_rel_lead; i++)
-            ch_data->t_env[i + 1] = ch_data->t_env[i] + 2 * get_bits(gb, 2) + 2;
-        for (i = 0; i < num_rel_trail; i++)
-            ch_data->t_env[ch_data->bs_num_env - 1 - i] =
-                ch_data->t_env[ch_data->bs_num_env - i] - 2 * get_bits(gb, 2) - 2;
-
-        bs_pointer = get_bits(gb, ceil_log2[ch_data->bs_num_env]);
-
-        get_bits1_vector(gb, ch_data->bs_freq_res + 1, ch_data->bs_num_env);
-        break;
-    }
-
-    if (bs_pointer < 0 || bs_pointer > ch_data->bs_num_env + 1) {
-        av_log(ac->avctx, AV_LOG_ERROR,
-               "Invalid bitstream, bs_pointer points to a middle noise border outside the time borders table: %d\n",
-               bs_pointer);
-        return -1;
-    }
-
-    for (i = 1; i <= ch_data->bs_num_env; i++) {
-        if (ch_data->t_env[i-1] > ch_data->t_env[i]) {
-            av_log(ac->avctx, AV_LOG_ERROR, "Non monotone time borders\n");
-            return -1;
-        }
-    }
-
-    ch_data->bs_num_noise = (ch_data->bs_num_env > 1) + 1;
-
-    ch_data->t_q[0]                     = ch_data->t_env[0];
-    ch_data->t_q[ch_data->bs_num_noise] = ch_data->t_env[ch_data->bs_num_env];
-    if (ch_data->bs_num_noise > 1) {
-        int idx;
-        if (ch_data->bs_frame_class == FIXFIX) {
-            idx = ch_data->bs_num_env >> 1;
-        } else if (ch_data->bs_frame_class & 1) { // FIXVAR or VARVAR
-            idx = ch_data->bs_num_env - FFMAX(bs_pointer - 1, 1);
-        } else { // VARFIX
-            if (!bs_pointer)
-                idx = 1;
-            else if (bs_pointer == 1)
-                idx = ch_data->bs_num_env - 1;
-            else // bs_pointer > 1
-                idx = bs_pointer - 1;
-        }
-        ch_data->t_q[1] = ch_data->t_env[idx];
-    }
-
-    ch_data->e_a[0] = -(ch_data->e_a[1] != bs_num_env_old); // l_APrev
-    ch_data->e_a[1] = -1;
-    if ((ch_data->bs_frame_class & 1) && bs_pointer) { // FIXVAR or VARVAR and bs_pointer != 0
-        ch_data->e_a[1] = ch_data->bs_num_env + 1 - bs_pointer;
-    } else if ((ch_data->bs_frame_class == 2) && (bs_pointer > 1)) // VARFIX and bs_pointer > 1
-        ch_data->e_a[1] = bs_pointer - 1;
-
-    return 0;
-}
-
-static void copy_sbr_grid(SBRData *dst, const SBRData *src) {
-    //These variables are saved from the previous frame rather than copied
-    dst->bs_freq_res[0]    = dst->bs_freq_res[dst->bs_num_env];
-    dst->t_env_num_env_old = dst->t_env[dst->bs_num_env];
-    dst->e_a[0]            = -(dst->e_a[1] != dst->bs_num_env);
-
-    //These variables are read from the bitstream and therefore copied
-    memcpy(dst->bs_freq_res+1, src->bs_freq_res+1, sizeof(dst->bs_freq_res)-sizeof(*dst->bs_freq_res));
-    memcpy(dst->t_env,         src->t_env,         sizeof(dst->t_env));
-    memcpy(dst->t_q,           src->t_q,           sizeof(dst->t_q));
-    dst->bs_num_env        = src->bs_num_env;
-    dst->bs_amp_res        = src->bs_amp_res;
-    dst->bs_num_noise      = src->bs_num_noise;
-    dst->bs_frame_class    = src->bs_frame_class;
-    dst->e_a[1]            = src->e_a[1];
-}
-
-/// Read how the envelope and noise floor data is delta coded
-static void read_sbr_dtdf(SpectralBandReplication *sbr, GetBitContext *gb,
-                          SBRData *ch_data)
-{
-    get_bits1_vector(gb, ch_data->bs_df_env,   ch_data->bs_num_env);
-    get_bits1_vector(gb, ch_data->bs_df_noise, ch_data->bs_num_noise);
-}
-
-/// Read inverse filtering data
-static void read_sbr_invf(SpectralBandReplication *sbr, GetBitContext *gb,
-                          SBRData *ch_data)
-{
-    int i;
-
-    memcpy(ch_data->bs_invf_mode[1], ch_data->bs_invf_mode[0], 5 * sizeof(uint8_t));
-    for (i = 0; i < sbr->n_q; i++)
-        ch_data->bs_invf_mode[0][i] = get_bits(gb, 2);
-}
-
-static void read_sbr_envelope(SpectralBandReplication *sbr, GetBitContext *gb,
-                              SBRData *ch_data, int ch)
-{
-    int bits;
-    int i, j, k;
-    VLC_TYPE (*t_huff)[2], (*f_huff)[2];
-    int t_lav, f_lav;
-    const int delta = (ch == 1 && sbr->bs_coupling == 1) + 1;
-    const int odd = sbr->n[1] & 1;
-
-    if (sbr->bs_coupling && ch) {
-        if (ch_data->bs_amp_res) {
-            bits   = 5;
-            t_huff = vlc_sbr[T_HUFFMAN_ENV_BAL_3_0DB].table;
-            t_lav  = vlc_sbr_lav[T_HUFFMAN_ENV_BAL_3_0DB];
-            f_huff = vlc_sbr[F_HUFFMAN_ENV_BAL_3_0DB].table;
-            f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_BAL_3_0DB];
-        } else {
-            bits   = 6;
-            t_huff = vlc_sbr[T_HUFFMAN_ENV_BAL_1_5DB].table;
-            t_lav  = vlc_sbr_lav[T_HUFFMAN_ENV_BAL_1_5DB];
-            f_huff = vlc_sbr[F_HUFFMAN_ENV_BAL_1_5DB].table;
-            f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_BAL_1_5DB];
-        }
-    } else {
-        if (ch_data->bs_amp_res) {
-            bits   = 6;
-            t_huff = vlc_sbr[T_HUFFMAN_ENV_3_0DB].table;
-            t_lav  = vlc_sbr_lav[T_HUFFMAN_ENV_3_0DB];
-            f_huff = vlc_sbr[F_HUFFMAN_ENV_3_0DB].table;
-            f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_3_0DB];
-        } else {
-            bits   = 7;
-            t_huff = vlc_sbr[T_HUFFMAN_ENV_1_5DB].table;
-            t_lav  = vlc_sbr_lav[T_HUFFMAN_ENV_1_5DB];
-            f_huff = vlc_sbr[F_HUFFMAN_ENV_1_5DB].table;
-            f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_1_5DB];
-        }
-    }
-
-    for (i = 0; i < ch_data->bs_num_env; i++) {
-        if (ch_data->bs_df_env[i]) {
-            // bs_freq_res[0] == bs_freq_res[bs_num_env] from prev frame
-            if (ch_data->bs_freq_res[i + 1] == ch_data->bs_freq_res[i]) {
-                for (j = 0; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++)
-                    ch_data->env_facs[i + 1][j] = ch_data->env_facs[i][j] + delta * (get_vlc2(gb, t_huff, 9, 3) - t_lav);
-            } else if (ch_data->bs_freq_res[i + 1]) {
-                for (j = 0; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++) {
-                    k = (j + odd) >> 1; // find k such that f_tablelow[k] <= f_tablehigh[j] < f_tablelow[k + 1]
-                    ch_data->env_facs[i + 1][j] = ch_data->env_facs[i][k] + delta * (get_vlc2(gb, t_huff, 9, 3) - t_lav);
-                }
-            } else {
-                for (j = 0; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++) {
-                    k = j ? 2*j - odd : 0; // find k such that f_tablehigh[k] == f_tablelow[j]
-                    ch_data->env_facs[i + 1][j] = ch_data->env_facs[i][k] + delta * (get_vlc2(gb, t_huff, 9, 3) - t_lav);
-                }
-            }
-        } else {
-            ch_data->env_facs[i + 1][0] = delta * get_bits(gb, bits); // bs_env_start_value_balance
-            for (j = 1; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++)
-                ch_data->env_facs[i + 1][j] = ch_data->env_facs[i + 1][j - 1] + delta * (get_vlc2(gb, f_huff, 9, 3) - f_lav);
-        }
-    }
-
-    //assign 0th elements of env_facs from last elements
-    memcpy(ch_data->env_facs[0], ch_data->env_facs[ch_data->bs_num_env],
-           sizeof(ch_data->env_facs[0]));
-}
-
-static void read_sbr_noise(SpectralBandReplication *sbr, GetBitContext *gb,
-                           SBRData *ch_data, int ch)
-{
-    int i, j;
-    VLC_TYPE (*t_huff)[2], (*f_huff)[2];
-    int t_lav, f_lav;
-    int delta = (ch == 1 && sbr->bs_coupling == 1) + 1;
-
-    if (sbr->bs_coupling && ch) {
-        t_huff = vlc_sbr[T_HUFFMAN_NOISE_BAL_3_0DB].table;
-        t_lav  = vlc_sbr_lav[T_HUFFMAN_NOISE_BAL_3_0DB];
-        f_huff = vlc_sbr[F_HUFFMAN_ENV_BAL_3_0DB].table;
-        f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_BAL_3_0DB];
-    } else {
-        t_huff = vlc_sbr[T_HUFFMAN_NOISE_3_0DB].table;
-        t_lav  = vlc_sbr_lav[T_HUFFMAN_NOISE_3_0DB];
-        f_huff = vlc_sbr[F_HUFFMAN_ENV_3_0DB].table;
-        f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_3_0DB];
-    }
-
-    for (i = 0; i < ch_data->bs_num_noise; i++) {
-        if (ch_data->bs_df_noise[i]) {
-            for (j = 0; j < sbr->n_q; j++)
-                ch_data->noise_facs[i + 1][j] = ch_data->noise_facs[i][j] + delta * (get_vlc2(gb, t_huff, 9, 2) - t_lav);
-        } else {
-            ch_data->noise_facs[i + 1][0] = delta * get_bits(gb, 5); // bs_noise_start_value_balance or bs_noise_start_value_level
-            for (j = 1; j < sbr->n_q; j++)
-                ch_data->noise_facs[i + 1][j] = ch_data->noise_facs[i + 1][j - 1] + delta * (get_vlc2(gb, f_huff, 9, 3) - f_lav);
-        }
-    }
-
-    //assign 0th elements of noise_facs from last elements
-    memcpy(ch_data->noise_facs[0], ch_data->noise_facs[ch_data->bs_num_noise],
-           sizeof(ch_data->noise_facs[0]));
-}
-
-static void read_sbr_extension(AACContext *ac, SpectralBandReplication *sbr,
-                               GetBitContext *gb,
-                               int bs_extension_id, int *num_bits_left)
-{
-    switch (bs_extension_id) {
-    case EXTENSION_ID_PS:
-        if (!ac->oc[1].m4ac.ps) {
-            av_log(ac->avctx, AV_LOG_ERROR, "Parametric Stereo signaled to be not-present but was found in the bitstream.\n");
-            skip_bits_long(gb, *num_bits_left); // bs_fill_bits
-            *num_bits_left = 0;
-        } else {
-#if 1
-            *num_bits_left -= ff_ps_read_data(ac->avctx, gb, &sbr->ps, *num_bits_left);
-            ac->avctx->profile = FF_PROFILE_AAC_HE_V2;
-#else
-            avpriv_report_missing_feature(ac->avctx, "Parametric Stereo");
-            skip_bits_long(gb, *num_bits_left); // bs_fill_bits
-            *num_bits_left = 0;
-#endif
-        }
-        break;
-    default:
-        // some files contain 0-padding
-        if (bs_extension_id || *num_bits_left > 16 || show_bits(gb, *num_bits_left))
-            avpriv_request_sample(ac->avctx, "Reserved SBR extensions");
-        skip_bits_long(gb, *num_bits_left); // bs_fill_bits
-        *num_bits_left = 0;
-        break;
-    }
-}
-
-static int read_sbr_single_channel_element(AACContext *ac,
-                                            SpectralBandReplication *sbr,
-                                            GetBitContext *gb)
-{
-    if (get_bits1(gb)) // bs_data_extra
-        skip_bits(gb, 4); // bs_reserved
-
-    if (read_sbr_grid(ac, sbr, gb, &sbr->data[0]))
-        return -1;
-    read_sbr_dtdf(sbr, gb, &sbr->data[0]);
-    read_sbr_invf(sbr, gb, &sbr->data[0]);
-    read_sbr_envelope(sbr, gb, &sbr->data[0], 0);
-    read_sbr_noise(sbr, gb, &sbr->data[0], 0);
-
-    if ((sbr->data[0].bs_add_harmonic_flag = get_bits1(gb)))
-        get_bits1_vector(gb, sbr->data[0].bs_add_harmonic, sbr->n[1]);
-
-    return 0;
-}
-
-static int read_sbr_channel_pair_element(AACContext *ac,
-                                          SpectralBandReplication *sbr,
-                                          GetBitContext *gb)
-{
-    if (get_bits1(gb))    // bs_data_extra
-        skip_bits(gb, 8); // bs_reserved
-
-    if ((sbr->bs_coupling = get_bits1(gb))) {
-        if (read_sbr_grid(ac, sbr, gb, &sbr->data[0]))
-            return -1;
-        copy_sbr_grid(&sbr->data[1], &sbr->data[0]);
-        read_sbr_dtdf(sbr, gb, &sbr->data[0]);
-        read_sbr_dtdf(sbr, gb, &sbr->data[1]);
-        read_sbr_invf(sbr, gb, &sbr->data[0]);
-        memcpy(sbr->data[1].bs_invf_mode[1], sbr->data[1].bs_invf_mode[0], sizeof(sbr->data[1].bs_invf_mode[0]));
-        memcpy(sbr->data[1].bs_invf_mode[0], sbr->data[0].bs_invf_mode[0], sizeof(sbr->data[1].bs_invf_mode[0]));
-        read_sbr_envelope(sbr, gb, &sbr->data[0], 0);
-        read_sbr_noise(sbr, gb, &sbr->data[0], 0);
-        read_sbr_envelope(sbr, gb, &sbr->data[1], 1);
-        read_sbr_noise(sbr, gb, &sbr->data[1], 1);
-    } else {
-        if (read_sbr_grid(ac, sbr, gb, &sbr->data[0]) ||
-            read_sbr_grid(ac, sbr, gb, &sbr->data[1]))
-            return -1;
-        read_sbr_dtdf(sbr, gb, &sbr->data[0]);
-        read_sbr_dtdf(sbr, gb, &sbr->data[1]);
-        read_sbr_invf(sbr, gb, &sbr->data[0]);
-        read_sbr_invf(sbr, gb, &sbr->data[1]);
-        read_sbr_envelope(sbr, gb, &sbr->data[0], 0);
-        read_sbr_envelope(sbr, gb, &sbr->data[1], 1);
-        read_sbr_noise(sbr, gb, &sbr->data[0], 0);
-        read_sbr_noise(sbr, gb, &sbr->data[1], 1);
-    }
-
-    if ((sbr->data[0].bs_add_harmonic_flag = get_bits1(gb)))
-        get_bits1_vector(gb, sbr->data[0].bs_add_harmonic, sbr->n[1]);
-    if ((sbr->data[1].bs_add_harmonic_flag = get_bits1(gb)))
-        get_bits1_vector(gb, sbr->data[1].bs_add_harmonic, sbr->n[1]);
-
-    return 0;
-}
-
-static unsigned int read_sbr_data(AACContext *ac, SpectralBandReplication *sbr,
-                                  GetBitContext *gb, int id_aac)
-{
-    unsigned int cnt = get_bits_count(gb);
-
-    if (id_aac == TYPE_SCE || id_aac == TYPE_CCE) {
-        if (read_sbr_single_channel_element(ac, sbr, gb)) {
-            sbr_turnoff(sbr);
-            return get_bits_count(gb) - cnt;
-        }
-    } else if (id_aac == TYPE_CPE) {
-        if (read_sbr_channel_pair_element(ac, sbr, gb)) {
-            sbr_turnoff(sbr);
-            return get_bits_count(gb) - cnt;
-        }
-    } else {
-        av_log(ac->avctx, AV_LOG_ERROR,
-            "Invalid bitstream - cannot apply SBR to element type %d\n", id_aac);
-        sbr_turnoff(sbr);
-        return get_bits_count(gb) - cnt;
-    }
-    if (get_bits1(gb)) { // bs_extended_data
-        int num_bits_left = get_bits(gb, 4); // bs_extension_size
-        if (num_bits_left == 15)
-            num_bits_left += get_bits(gb, 8); // bs_esc_count
-
-        num_bits_left <<= 3;
-        while (num_bits_left > 7) {
-            num_bits_left -= 2;
-            read_sbr_extension(ac, sbr, gb, get_bits(gb, 2), &num_bits_left); // bs_extension_id
-        }
-        if (num_bits_left < 0) {
-            av_log(ac->avctx, AV_LOG_ERROR, "SBR Extension over read.\n");
-        }
-        if (num_bits_left > 0)
-            skip_bits(gb, num_bits_left);
-    }
-
-    return get_bits_count(gb) - cnt;
-}
-
-static void sbr_reset(AACContext *ac, SpectralBandReplication *sbr)
-{
-    int err;
-    err = sbr_make_f_master(ac, sbr, &sbr->spectrum_params);
-    if (err >= 0)
-        err = sbr_make_f_derived(ac, sbr);
-    if (err < 0) {
-        av_log(ac->avctx, AV_LOG_ERROR,
-               "SBR reset failed. Switching SBR to pure upsampling mode.\n");
-        sbr_turnoff(sbr);
-    }
-}
-
-/**
- * Decode Spectral Band Replication extension data; reference: table 4.55.
- *
- * @param   crc flag indicating the presence of CRC checksum
- * @param   cnt length of TYPE_FIL syntactic element in bytes
- *
- * @return  Returns number of bytes consumed from the TYPE_FIL element.
- */
-int ff_decode_sbr_extension(AACContext *ac, SpectralBandReplication *sbr,
-                            GetBitContext *gb_host, int crc, int cnt, int id_aac)
-{
-    unsigned int num_sbr_bits = 0, num_align_bits;
-    unsigned bytes_read;
-    GetBitContext gbc = *gb_host, *gb = &gbc;
-    skip_bits_long(gb_host, cnt*8 - 4);
-
-    sbr->reset = 0;
-
-    if (!sbr->sample_rate)
-        sbr->sample_rate = 2 * ac->oc[1].m4ac.sample_rate; //TODO use the nominal sample rate for arbitrary sample rate support
-    if (!ac->oc[1].m4ac.ext_sample_rate)
-        ac->oc[1].m4ac.ext_sample_rate = 2 * ac->oc[1].m4ac.sample_rate;
-
-    if (crc) {
-        skip_bits(gb, 10); // bs_sbr_crc_bits; TODO - implement CRC check
-        num_sbr_bits += 10;
-    }
-
-    //Save some state from the previous frame.
-    sbr->kx[0] = sbr->kx[1];
-    sbr->m[0] = sbr->m[1];
-    sbr->kx_and_m_pushed = 1;
-
-    num_sbr_bits++;
-    if (get_bits1(gb)) // bs_header_flag
-        num_sbr_bits += read_sbr_header(sbr, gb);
-
-    if (sbr->reset)
-        sbr_reset(ac, sbr);
-
-    if (sbr->start)
-        num_sbr_bits  += read_sbr_data(ac, sbr, gb, id_aac);
-
-    num_align_bits = ((cnt << 3) - 4 - num_sbr_bits) & 7;
-    bytes_read = ((num_sbr_bits + num_align_bits + 4) >> 3);
-
-    if (bytes_read > cnt) {
-        av_log(ac->avctx, AV_LOG_ERROR,
-               "Expected to read %d SBR bytes actually read %d.\n", cnt, bytes_read);
-    }
-    return cnt;
-}
-
 /// Dequantization and stereo decoding (14496-3 sp04 p203)
 static void sbr_dequant(SpectralBandReplication *sbr, int id_aac)
 {
     int k, e;
     int ch;
-
+    static const double exp2_tab[2] = {1, M_SQRT2}; // {2^0, 2^0.5}: factor selected by the low bit of a half-step exponent
     if (id_aac == TYPE_CPE && sbr->bs_coupling) {
-        float alpha      = sbr->data[0].bs_amp_res ?  1.0f :  0.5f;
-        float pan_offset = sbr->data[0].bs_amp_res ? 12.0f : 24.0f;
+        int pan_offset = sbr->data[0].bs_amp_res ? 12 : 24;
         for (e = 1; e <= sbr->data[0].bs_num_env; e++) {
             for (k = 0; k < sbr->n[sbr->data[0].bs_freq_res[e]]; k++) {
-                float temp1 = exp2f(sbr->data[0].env_facs[e][k] * alpha + 7.0f);
-                float temp2 = exp2f((pan_offset - sbr->data[1].env_facs[e][k]) * alpha);
-                float fac   = temp1 / (1.0f + temp2);
+                float temp1, temp2, fac;
+                if (sbr->data[0].bs_amp_res) { // whole-step exponents: quantized values are used as-is
+                    temp1 = ff_exp2fi(sbr->data[0].env_facs_q[e][k] + 7);
+                    temp2 = ff_exp2fi(pan_offset - sbr->data[1].env_facs_q[e][k]);
+                }
+                else { // half-step exponents: 2^(q/2) built as ff_exp2fi(q >> 1) * exp2_tab[q & 1] (assumes ff_exp2fi(n) == 2^n)
+                    temp1 = ff_exp2fi((sbr->data[0].env_facs_q[e][k]>>1) + 7) *
+                            exp2_tab[sbr->data[0].env_facs_q[e][k] & 1];
+                    temp2 = ff_exp2fi((pan_offset - sbr->data[1].env_facs_q[e][k])>>1) *
+                            exp2_tab[(pan_offset - sbr->data[1].env_facs_q[e][k]) & 1];
+                }
+                if (temp1 > 1E20) { // implausibly large scalefactor: log it and fall back to 1
+                    av_log(NULL, AV_LOG_ERROR, "envelope scalefactor overflow in dequant\n");
+                    temp1 = 1;
+                }
+                fac   = temp1 / (1.0f + temp2); // channel 0 gets fac, channel 1 gets fac * temp2 (stored below)
                 sbr->data[0].env_facs[e][k] = fac;
                 sbr->data[1].env_facs[e][k] = fac * temp2;
             }
         }
         for (e = 1; e <= sbr->data[0].bs_num_noise; e++) {
             for (k = 0; k < sbr->n_q; k++) {
-                float temp1 = exp2f(NOISE_FLOOR_OFFSET - sbr->data[0].noise_facs[e][k] + 1);
-                float temp2 = exp2f(12 - sbr->data[1].noise_facs[e][k]);
-                float fac   = temp1 / (1.0f + temp2);
+                float temp1 = ff_exp2fi(NOISE_FLOOR_OFFSET - sbr->data[0].noise_facs_q[e][k] + 1);
+                float temp2 = ff_exp2fi(12 - sbr->data[1].noise_facs_q[e][k]);
+                float fac;
+                av_assert0(temp1 <= 1E20); // NOTE(review): asserts on a value derived from noise_facs_q - confirm the reader bounds it
+                fac = temp1 / (1.0f + temp2);
                 sbr->data[0].noise_facs[e][k] = fac;
                 sbr->data[1].noise_facs[e][k] = fac * temp2;
             }
         }
     } else { // SCE or one non-coupled CPE
         for (ch = 0; ch < (id_aac == TYPE_CPE) + 1; ch++) {
-            float alpha = sbr->data[ch].bs_amp_res ? 1.0f : 0.5f;
             for (e = 1; e <= sbr->data[ch].bs_num_env; e++)
-                for (k = 0; k < sbr->n[sbr->data[ch].bs_freq_res[e]]; k++)
-                    sbr->data[ch].env_facs[e][k] =
-                        exp2f(alpha * sbr->data[ch].env_facs[e][k] + 6.0f);
+                for (k = 0; k < sbr->n[sbr->data[ch].bs_freq_res[e]]; k++){
+                    if (sbr->data[ch].bs_amp_res) // whole-step exponent
+                        sbr->data[ch].env_facs[e][k] = ff_exp2fi(sbr->data[ch].env_facs_q[e][k] + 6);
+                    else // half-step exponent: integer part via ff_exp2fi, odd half via exp2_tab
+                        sbr->data[ch].env_facs[e][k] = ff_exp2fi((sbr->data[ch].env_facs_q[e][k]>>1) + 6)
+                                                       * exp2_tab[sbr->data[ch].env_facs_q[e][k] & 1];
+                    if (sbr->data[ch].env_facs[e][k] > 1E20) { // same overflow fallback as the coupled path
+                        av_log(NULL, AV_LOG_ERROR, "envelope scalefactor overflow in dequant\n");
+                        sbr->data[ch].env_facs[e][k] = 1;
+                    }
+                }
+
             for (e = 1; e <= sbr->data[ch].bs_num_noise; e++)
                 for (k = 0; k < sbr->n_q; k++)
                     sbr->data[ch].noise_facs[e][k] =
-                        exp2f(NOISE_FLOOR_OFFSET - sbr->data[ch].noise_facs[e][k]);
+                        ff_exp2fi(NOISE_FLOOR_OFFSET - sbr->data[ch].noise_facs_q[e][k]); // reads the integer _q array, writes the float array
         }
     }
 }
 
-/**
- * Analysis QMF Bank (14496-3 sp04 p206)
- *
- * @param   x       pointer to the beginning of the first sample window
- * @param   W       array of complex-valued samples split into subbands
- */
-static void sbr_qmf_analysis(AVFloatDSPContext *dsp, FFTContext *mdct,
-                             SBRDSPContext *sbrdsp, const float *in, float *x,
-                             float z[320], float W[2][32][32][2], int buf_idx)
-{
-    int i;
-    memcpy(x    , x+1024, (320-32)*sizeof(x[0]));
-    memcpy(x+288, in,         1024*sizeof(x[0]));
-    for (i = 0; i < 32; i++) { // numTimeSlots*RATE = 16*2 as 960 sample frames
-                               // are not supported
-        dsp->vector_fmul_reverse(z, sbr_qmf_window_ds, x, 320);
-        sbrdsp->sum64x5(z);
-        sbrdsp->qmf_pre_shuffle(z);
-        mdct->imdct_half(mdct, z, z+64);
-        sbrdsp->qmf_post_shuffle(W[buf_idx][i], z);
-        x += 32;
-    }
-}
-
-/**
- * Synthesis QMF Bank (14496-3 sp04 p206) and Downsampled Synthesis QMF Bank
- * (14496-3 sp04 p206)
- */
-static void sbr_qmf_synthesis(FFTContext *mdct,
-                              SBRDSPContext *sbrdsp, AVFloatDSPContext *dsp,
-                              float *out, float X[2][38][64],
-                              float mdct_buf[2][64],
-                              float *v0, int *v_off, const unsigned int div)
-{
-    int i, n;
-    const float *sbr_qmf_window = div ? sbr_qmf_window_ds : sbr_qmf_window_us;
-    const int step = 128 >> div;
-    float *v;
-    for (i = 0; i < 32; i++) {
-        if (*v_off < step) {
-            int saved_samples = (1280 - 128) >> div;
-            memcpy(&v0[SBR_SYNTHESIS_BUF_SIZE - saved_samples], v0, saved_samples * sizeof(float));
-            *v_off = SBR_SYNTHESIS_BUF_SIZE - saved_samples - step;
-        } else {
-            *v_off -= step;
-        }
-        v = v0 + *v_off;
-        if (div) {
-            for (n = 0; n < 32; n++) {
-                X[0][i][   n] = -X[0][i][n];
-                X[0][i][32+n] =  X[1][i][31-n];
-            }
-            mdct->imdct_half(mdct, mdct_buf[0], X[0][i]);
-            sbrdsp->qmf_deint_neg(v, mdct_buf[0]);
-        } else {
-            sbrdsp->neg_odd_64(X[1][i]);
-            mdct->imdct_half(mdct, mdct_buf[0], X[0][i]);
-            mdct->imdct_half(mdct, mdct_buf[1], X[1][i]);
-            sbrdsp->qmf_deint_bfly(v, mdct_buf[1], mdct_buf[0]);
-        }
-        dsp->vector_fmul    (out, v                , sbr_qmf_window                       , 64 >> div);
-        dsp->vector_fmul_add(out, v + ( 192 >> div), sbr_qmf_window + ( 64 >> div), out   , 64 >> div);
-        dsp->vector_fmul_add(out, v + ( 256 >> div), sbr_qmf_window + (128 >> div), out   , 64 >> div);
-        dsp->vector_fmul_add(out, v + ( 448 >> div), sbr_qmf_window + (192 >> div), out   , 64 >> div);
-        dsp->vector_fmul_add(out, v + ( 512 >> div), sbr_qmf_window + (256 >> div), out   , 64 >> div);
-        dsp->vector_fmul_add(out, v + ( 704 >> div), sbr_qmf_window + (320 >> div), out   , 64 >> div);
-        dsp->vector_fmul_add(out, v + ( 768 >> div), sbr_qmf_window + (384 >> div), out   , 64 >> div);
-        dsp->vector_fmul_add(out, v + ( 960 >> div), sbr_qmf_window + (448 >> div), out   , 64 >> div);
-        dsp->vector_fmul_add(out, v + (1024 >> div), sbr_qmf_window + (512 >> div), out   , 64 >> div);
-        dsp->vector_fmul_add(out, v + (1216 >> div), sbr_qmf_window + (576 >> div), out   , 64 >> div);
-        out += 64 >> div;
-    }
-}
-
 /** High Frequency Generation (14496-3 sp04 p214+) and Inverse Filtering
  * (14496-3 sp04 p214)
  * Warning: This routine does not seem numerically stable.
@@ -1302,203 +212,6 @@ static void sbr_chirp(SpectralBandReplication *sbr, SBRData *ch_data)
     }
 }
 
-/// Generate the subband filtered lowband
-static int sbr_lf_gen(AACContext *ac, SpectralBandReplication *sbr,
-                      float X_low[32][40][2], const float W[2][32][32][2],
-                      int buf_idx)
-{
-    int i, k;
-    const int t_HFGen = 8;
-    const int i_f = 32;
-    memset(X_low, 0, 32*sizeof(*X_low));
-    for (k = 0; k < sbr->kx[1]; k++) {
-        for (i = t_HFGen; i < i_f + t_HFGen; i++) {
-            X_low[k][i][0] = W[buf_idx][i - t_HFGen][k][0];
-            X_low[k][i][1] = W[buf_idx][i - t_HFGen][k][1];
-        }
-    }
-    buf_idx = 1-buf_idx;
-    for (k = 0; k < sbr->kx[0]; k++) {
-        for (i = 0; i < t_HFGen; i++) {
-            X_low[k][i][0] = W[buf_idx][i + i_f - t_HFGen][k][0];
-            X_low[k][i][1] = W[buf_idx][i + i_f - t_HFGen][k][1];
-        }
-    }
-    return 0;
-}
-
-/// High Frequency Generator (14496-3 sp04 p215)
-static int sbr_hf_gen(AACContext *ac, SpectralBandReplication *sbr,
-                      float X_high[64][40][2], const float X_low[32][40][2],
-                      const float (*alpha0)[2], const float (*alpha1)[2],
-                      const float bw_array[5], const uint8_t *t_env,
-                      int bs_num_env)
-{
-    int j, x;
-    int g = 0;
-    int k = sbr->kx[1];
-    for (j = 0; j < sbr->num_patches; j++) {
-        for (x = 0; x < sbr->patch_num_subbands[j]; x++, k++) {
-            const int p = sbr->patch_start_subband[j] + x;
-            while (g <= sbr->n_q && k >= sbr->f_tablenoise[g])
-                g++;
-            g--;
-
-            if (g < 0) {
-                av_log(ac->avctx, AV_LOG_ERROR,
-                       "ERROR : no subband found for frequency %d\n", k);
-                return -1;
-            }
-
-            sbr->dsp.hf_gen(X_high[k] + ENVELOPE_ADJUSTMENT_OFFSET,
-                            X_low[p]  + ENVELOPE_ADJUSTMENT_OFFSET,
-                            alpha0[p], alpha1[p], bw_array[g],
-                            2 * t_env[0], 2 * t_env[bs_num_env]);
-        }
-    }
-    if (k < sbr->m[1] + sbr->kx[1])
-        memset(X_high + k, 0, (sbr->m[1] + sbr->kx[1] - k) * sizeof(*X_high));
-
-    return 0;
-}
-
-/// Generate the subband filtered lowband
-static int sbr_x_gen(SpectralBandReplication *sbr, float X[2][38][64],
-                     const float Y0[38][64][2], const float Y1[38][64][2],
-                     const float X_low[32][40][2], int ch)
-{
-    int k, i;
-    const int i_f = 32;
-    const int i_Temp = FFMAX(2*sbr->data[ch].t_env_num_env_old - i_f, 0);
-    memset(X, 0, 2*sizeof(*X));
-    for (k = 0; k < sbr->kx[0]; k++) {
-        for (i = 0; i < i_Temp; i++) {
-            X[0][i][k] = X_low[k][i + ENVELOPE_ADJUSTMENT_OFFSET][0];
-            X[1][i][k] = X_low[k][i + ENVELOPE_ADJUSTMENT_OFFSET][1];
-        }
-    }
-    for (; k < sbr->kx[0] + sbr->m[0]; k++) {
-        for (i = 0; i < i_Temp; i++) {
-            X[0][i][k] = Y0[i + i_f][k][0];
-            X[1][i][k] = Y0[i + i_f][k][1];
-        }
-    }
-
-    for (k = 0; k < sbr->kx[1]; k++) {
-        for (i = i_Temp; i < 38; i++) {
-            X[0][i][k] = X_low[k][i + ENVELOPE_ADJUSTMENT_OFFSET][0];
-            X[1][i][k] = X_low[k][i + ENVELOPE_ADJUSTMENT_OFFSET][1];
-        }
-    }
-    for (; k < sbr->kx[1] + sbr->m[1]; k++) {
-        for (i = i_Temp; i < i_f; i++) {
-            X[0][i][k] = Y1[i][k][0];
-            X[1][i][k] = Y1[i][k][1];
-        }
-    }
-    return 0;
-}
-
-/** High Frequency Adjustment (14496-3 sp04 p217) and Mapping
- * (14496-3 sp04 p217)
- */
-static int sbr_mapping(AACContext *ac, SpectralBandReplication *sbr,
-                        SBRData *ch_data, int e_a[2])
-{
-    int e, i, m;
-
-    memset(ch_data->s_indexmapped[1], 0, 7*sizeof(ch_data->s_indexmapped[1]));
-    for (e = 0; e < ch_data->bs_num_env; e++) {
-        const unsigned int ilim = sbr->n[ch_data->bs_freq_res[e + 1]];
-        uint16_t *table = ch_data->bs_freq_res[e + 1] ? sbr->f_tablehigh : sbr->f_tablelow;
-        int k;
-
-        if (sbr->kx[1] != table[0]) {
-            av_log(ac->avctx, AV_LOG_ERROR, "kx != f_table{high,low}[0]. "
-                   "Derived frequency tables were not regenerated.\n");
-            sbr_turnoff(sbr);
-            return AVERROR_BUG;
-        }
-        for (i = 0; i < ilim; i++)
-            for (m = table[i]; m < table[i + 1]; m++)
-                sbr->e_origmapped[e][m - sbr->kx[1]] = ch_data->env_facs[e+1][i];
-
-        // ch_data->bs_num_noise > 1 => 2 noise floors
-        k = (ch_data->bs_num_noise > 1) && (ch_data->t_env[e] >= ch_data->t_q[1]);
-        for (i = 0; i < sbr->n_q; i++)
-            for (m = sbr->f_tablenoise[i]; m < sbr->f_tablenoise[i + 1]; m++)
-                sbr->q_mapped[e][m - sbr->kx[1]] = ch_data->noise_facs[k+1][i];
-
-        for (i = 0; i < sbr->n[1]; i++) {
-            if (ch_data->bs_add_harmonic_flag) {
-                const unsigned int m_midpoint =
-                    (sbr->f_tablehigh[i] + sbr->f_tablehigh[i + 1]) >> 1;
-
-                ch_data->s_indexmapped[e + 1][m_midpoint - sbr->kx[1]] = ch_data->bs_add_harmonic[i] *
-                    (e >= e_a[1] || (ch_data->s_indexmapped[0][m_midpoint - sbr->kx[1]] == 1));
-            }
-        }
-
-        for (i = 0; i < ilim; i++) {
-            int additional_sinusoid_present = 0;
-            for (m = table[i]; m < table[i + 1]; m++) {
-                if (ch_data->s_indexmapped[e + 1][m - sbr->kx[1]]) {
-                    additional_sinusoid_present = 1;
-                    break;
-                }
-            }
-            memset(&sbr->s_mapped[e][table[i] - sbr->kx[1]], additional_sinusoid_present,
-                   (table[i + 1] - table[i]) * sizeof(sbr->s_mapped[e][0]));
-        }
-    }
-
-    memcpy(ch_data->s_indexmapped[0], ch_data->s_indexmapped[ch_data->bs_num_env], sizeof(ch_data->s_indexmapped[0]));
-    return 0;
-}
-
-/// Estimation of current envelope (14496-3 sp04 p218)
-static void sbr_env_estimate(float (*e_curr)[48], float X_high[64][40][2],
-                             SpectralBandReplication *sbr, SBRData *ch_data)
-{
-    int e, m;
-    int kx1 = sbr->kx[1];
-
-    if (sbr->bs_interpol_freq) {
-        for (e = 0; e < ch_data->bs_num_env; e++) {
-            const float recip_env_size = 0.5f / (ch_data->t_env[e + 1] - ch_data->t_env[e]);
-            int ilb = ch_data->t_env[e]     * 2 + ENVELOPE_ADJUSTMENT_OFFSET;
-            int iub = ch_data->t_env[e + 1] * 2 + ENVELOPE_ADJUSTMENT_OFFSET;
-
-            for (m = 0; m < sbr->m[1]; m++) {
-                float sum = sbr->dsp.sum_square(X_high[m+kx1] + ilb, iub - ilb);
-                e_curr[e][m] = sum * recip_env_size;
-            }
-        }
-    } else {
-        int k, p;
-
-        for (e = 0; e < ch_data->bs_num_env; e++) {
-            const int env_size = 2 * (ch_data->t_env[e + 1] - ch_data->t_env[e]);
-            int ilb = ch_data->t_env[e]     * 2 + ENVELOPE_ADJUSTMENT_OFFSET;
-            int iub = ch_data->t_env[e + 1] * 2 + ENVELOPE_ADJUSTMENT_OFFSET;
-            const uint16_t *table = ch_data->bs_freq_res[e + 1] ? sbr->f_tablehigh : sbr->f_tablelow;
-
-            for (p = 0; p < sbr->n[ch_data->bs_freq_res[e + 1]]; p++) {
-                float sum = 0.0f;
-                const int den = env_size * (table[p + 1] - table[p]);
-
-                for (k = table[p]; k < table[p + 1]; k++) {
-                    sum += sbr->dsp.sum_square(X_high[k] + ilb, iub - ilb);
-                }
-                sum /= den;
-                for (k = table[p]; k < table[p + 1]; k++) {
-                    e_curr[e][k - kx1] = sum;
-                }
-            }
-        }
-    }
-}
-
 /**
  * Calculation of levels of additional HF signal components (14496-3 sp04 p219)
  * and Calculation of gain (14496-3 sp04 p219)
@@ -1575,10 +288,6 @@ static void sbr_hf_assemble(float Y1[38][64][2],
         0.11516383427084,
         0.03183050093751,
     };
-    static const int8_t phi[2][4] = {
-        {  1,  0, -1,  0}, // real
-        {  0,  1,  0, -1}, // imaginary
-    };
     float (*g_temp)[48] = ch_data->g_temp, (*q_temp)[48] = ch_data->q_temp;
     int indexnoise = ch_data->f_indexnoise;
     int indexsine  = ch_data->f_indexsine;
@@ -1608,7 +317,6 @@ static void sbr_hf_assemble(float Y1[38][64][2],
 
     for (e = 0; e < ch_data->bs_num_env; e++) {
         for (i = 2 * ch_data->t_env[e]; i < 2 * ch_data->t_env[e + 1]; i++) {
-            int phi_sign = (1 - 2*(kx & 1));
             LOCAL_ALIGNED_16(float, g_filt_tab, [48]);
             LOCAL_ALIGNED_16(float, q_filt_tab, [48]);
             float *g_filt, *q_filt;
@@ -1638,13 +346,17 @@ static void sbr_hf_assemble(float Y1[38][64][2],
                                                    q_filt, indexnoise,
                                                    kx, m_max);
             } else {
-                for (m = 0; m < m_max; m++) {
-                    Y1[i][m + kx][0] +=
-                        sbr->s_m[e][m] * phi[0][indexsine];
-                    Y1[i][m + kx][1] +=
-                        sbr->s_m[e][m] * (phi[1][indexsine] * phi_sign);
-                    phi_sign = -phi_sign;
+                int idx = indexsine&1;
+                int A = (1-((indexsine+(kx & 1))&2));
+                int B = (A^(-idx)) + idx;
+                float *out = &Y1[i][kx][idx];
+                float *in  = sbr->s_m[e];
+                for (m = 0; m+1 < m_max; m+=2) {
+                    out[2*m  ] += in[m  ] * A;
+                    out[2*m+2] += in[m+1] * B;
                 }
+                if(m_max&1)
+                    out[2*m  ] += in[m  ] * A;
             }
             indexnoise = (indexnoise + m_max) & 0x1ff;
             indexsine = (indexsine + 1) & 3;
@@ -1654,81 +366,4 @@ static void sbr_hf_assemble(float Y1[38][64][2],
     ch_data->f_indexsine  = indexsine;
 }
 
-void ff_sbr_apply(AACContext *ac, SpectralBandReplication *sbr, int id_aac,
-                  float* L, float* R)
-{
-    int downsampled = ac->oc[1].m4ac.ext_sample_rate < sbr->sample_rate;
-    int ch;
-    int nch = (id_aac == TYPE_CPE) ? 2 : 1;
-    int err;
-
-    if (!sbr->kx_and_m_pushed) {
-        sbr->kx[0] = sbr->kx[1];
-        sbr->m[0] = sbr->m[1];
-    } else {
-        sbr->kx_and_m_pushed = 0;
-    }
-
-    if (sbr->start) {
-        sbr_dequant(sbr, id_aac);
-    }
-    for (ch = 0; ch < nch; ch++) {
-        /* decode channel */
-        sbr_qmf_analysis(&ac->fdsp, &sbr->mdct_ana, &sbr->dsp, ch ? R : L, sbr->data[ch].analysis_filterbank_samples,
-                         (float*)sbr->qmf_filter_scratch,
-                         sbr->data[ch].W, sbr->data[ch].Ypos);
-        sbr_lf_gen(ac, sbr, sbr->X_low,
-                   (const float (*)[32][32][2]) sbr->data[ch].W,
-                   sbr->data[ch].Ypos);
-        sbr->data[ch].Ypos ^= 1;
-        if (sbr->start) {
-            sbr_hf_inverse_filter(&sbr->dsp, sbr->alpha0, sbr->alpha1,
-                                  (const float (*)[40][2]) sbr->X_low, sbr->k[0]);
-            sbr_chirp(sbr, &sbr->data[ch]);
-            sbr_hf_gen(ac, sbr, sbr->X_high,
-                       (const float (*)[40][2]) sbr->X_low,
-                       (const float (*)[2]) sbr->alpha0,
-                       (const float (*)[2]) sbr->alpha1,
-                       sbr->data[ch].bw_array, sbr->data[ch].t_env,
-                       sbr->data[ch].bs_num_env);
-
-            // hf_adj
-            err = sbr_mapping(ac, sbr, &sbr->data[ch], sbr->data[ch].e_a);
-            if (!err) {
-                sbr_env_estimate(sbr->e_curr, sbr->X_high, sbr, &sbr->data[ch]);
-                sbr_gain_calc(ac, sbr, &sbr->data[ch], sbr->data[ch].e_a);
-                sbr_hf_assemble(sbr->data[ch].Y[sbr->data[ch].Ypos],
-                                (const float (*)[40][2]) sbr->X_high,
-                                sbr, &sbr->data[ch],
-                                sbr->data[ch].e_a);
-            }
-        }
-
-        /* synthesis */
-        sbr_x_gen(sbr, sbr->X[ch],
-                  (const float (*)[64][2]) sbr->data[ch].Y[1-sbr->data[ch].Ypos],
-                  (const float (*)[64][2]) sbr->data[ch].Y[  sbr->data[ch].Ypos],
-                  (const float (*)[40][2]) sbr->X_low, ch);
-    }
-
-    if (ac->oc[1].m4ac.ps == 1) {
-        if (sbr->ps.start) {
-            ff_ps_apply(ac->avctx, &sbr->ps, sbr->X[0], sbr->X[1], sbr->kx[1] + sbr->m[1]);
-        } else {
-            memcpy(sbr->X[1], sbr->X[0], sizeof(sbr->X[0]));
-        }
-        nch = 2;
-    }
-
-    sbr_qmf_synthesis(&sbr->mdct, &sbr->dsp, &ac->fdsp,
-                      L, sbr->X[0], sbr->qmf_filter_scratch,
-                      sbr->data[0].synthesis_filterbank_samples,
-                      &sbr->data[0].synthesis_filterbank_samples_offset,
-                      downsampled);
-    if (nch == 2)
-        sbr_qmf_synthesis(&sbr->mdct, &sbr->dsp, &ac->fdsp,
-                          R, sbr->X[1], sbr->qmf_filter_scratch,
-                          sbr->data[1].synthesis_filterbank_samples,
-                          &sbr->data[1].synthesis_filterbank_samples_offset,
-                          downsampled);
-}
+#include "aacsbr_template.c"
diff --git a/libavcodec/aacsbr.h b/libavcodec/aacsbr.h
index 9bc5e29..88c4d8a 100644
--- a/libavcodec/aacsbr.h
+++ b/libavcodec/aacsbr.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2008-2009 Robert Swain ( rob opendot cl )
  * Copyright (c) 2010      Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,17 +33,64 @@
 #include "aac.h"
 #include "sbr.h"
 
+#define ENVELOPE_ADJUSTMENT_OFFSET 2 // added to the time index when addressing X_low / X_high
+#define NOISE_FLOOR_OFFSET 6 // sbr_dequant computes the noise floor from (NOISE_FLOOR_OFFSET - quantized value)
+
+/**
+ * SBR VLC tables
+ */
+enum {
+    T_HUFFMAN_ENV_1_5DB,
+    F_HUFFMAN_ENV_1_5DB,
+    T_HUFFMAN_ENV_BAL_1_5DB,
+    F_HUFFMAN_ENV_BAL_1_5DB,
+    T_HUFFMAN_ENV_3_0DB,
+    F_HUFFMAN_ENV_3_0DB,
+    T_HUFFMAN_ENV_BAL_3_0DB,
+    F_HUFFMAN_ENV_BAL_3_0DB,
+    T_HUFFMAN_NOISE_3_0DB,
+    T_HUFFMAN_NOISE_BAL_3_0DB,
+};
+
+/**
+ * bs_frame_class - frame class of current SBR frame (14496-3 sp04 p98)
+ */
+enum {
+    FIXFIX,
+    FIXVAR,
+    VARFIX,
+    VARVAR,
+};
+
+enum { // bs_extension_id values handled by the SBR extension reader
+    EXTENSION_ID_PS = 2, // extension payload is Parametric Stereo data
+};
+
+static const int8_t vlc_sbr_lav[10] =
+    { 60, 60, 24, 24, 31, 31, 12, 12, 31, 12 }; // per-table offset subtracted from the decoded VLC symbol; indexed by the SBR VLC table enum
+
+#define SBR_INIT_VLC_STATIC(num, size) \
+    INIT_VLC_STATIC(&vlc_sbr[num], 9, sbr_tmp[num].table_size / sbr_tmp[num].elem_size,     \
+                    sbr_tmp[num].sbr_bits ,                      1,                      1, \
+                    sbr_tmp[num].sbr_codes, sbr_tmp[num].elem_size, sbr_tmp[num].elem_size, \
+                    size) // initializes vlc_sbr[num] from sbr_tmp[num]; code count is table_size / elem_size
+
+#define SBR_VLC_ROW(name) \
+    { name ## _codes, name ## _bits, sizeof(name ## _codes), sizeof(name ## _codes[0]) } // codes array, bits array, byte size of codes, byte size of one code
+
 /** Initialize SBR. */
-void ff_aac_sbr_init(void);
+void AAC_RENAME(ff_aac_sbr_init)(void);
 /** Initialize one SBR context. */
-void ff_aac_sbr_ctx_init(AACContext *ac, SpectralBandReplication *sbr);
+void AAC_RENAME(ff_aac_sbr_ctx_init)(AACContext *ac, SpectralBandReplication *sbr);
 /** Close one SBR context. */
-void ff_aac_sbr_ctx_close(SpectralBandReplication *sbr);
+void AAC_RENAME(ff_aac_sbr_ctx_close)(SpectralBandReplication *sbr);
 /** Decode one SBR element. */
-int ff_decode_sbr_extension(AACContext *ac, SpectralBandReplication *sbr,
+int AAC_RENAME(ff_decode_sbr_extension)(AACContext *ac, SpectralBandReplication *sbr,
                             GetBitContext *gb, int crc, int cnt, int id_aac);
 /** Apply one SBR element to one AAC element. */
-void ff_sbr_apply(AACContext *ac, SpectralBandReplication *sbr, int id_aac,
-                  float* L, float *R);
+void AAC_RENAME(ff_sbr_apply)(AACContext *ac, SpectralBandReplication *sbr, int id_aac,
+                  INTFLOAT* L, INTFLOAT *R);
+
+void ff_aacsbr_func_ptr_init_mips(AACSBRContext *c);
 
 #endif /* AVCODEC_AACSBR_H */
diff --git a/libavcodec/aacsbr_fixed.c b/libavcodec/aacsbr_fixed.c
new file mode 100644
index 0000000..b26314a
--- /dev/null
+++ b/libavcodec/aacsbr_fixed.c
@@ -0,0 +1,594 @@
+/*
+ * Copyright (c) 2013
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * AAC Spectral Band Replication decoding functions (fixed-point)
+ * Copyright (c) 2008-2009 Robert Swain ( rob opendot cl )
+ * Copyright (c) 2009-2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC Spectral Band Replication decoding functions (fixed-point)
+ * Note: Rounding-to-nearest used unless otherwise stated
+ * @author Robert Swain ( rob opendot cl )
+ * @author Stanislav Ocovaj ( stanislav.ocovaj imgtec com )
+ */
+#define USE_FIXED 1
+
+#include "aac.h"
+#include "sbr.h"
+#include "aacsbr.h"
+#include "aacsbrdata.h"
+#include "aacsbr_fixed_tablegen.h"
+#include "fft.h"
+#include "aacps.h"
+#include "sbrdsp.h"
+#include "libavutil/internal.h"
+#include "libavutil/libm.h"
+#include "libavutil/avassert.h"
+
+#include <stdint.h>
+#include <float.h>
+#include <math.h>
+
+static VLC vlc_sbr[10];                             // slots addressed as vlc_sbr[num] by SBR_INIT_VLC_STATIC
+static void aacsbr_func_ptr_init(AACSBRContext *c); // forward declaration; no definition in this part of the file
+static const int CONST_LN2       = Q31(0.6931471806/256);  // ln(2)/256
+static const int CONST_RECIP_LN2 = Q31(0.7213475204);      // 0.5/ln(2)
+static const int CONST_076923    = Q31(0.76923076923076923077f); // 10/13, i.e. 1/1.3
+
+/* Series coefficients 1/2 .. 1/11 consumed in order by fixed_log(). */
+static const int fixed_log_table[10] =
+{
+    Q31(1.0/2), Q31(1.0/3), Q31(1.0/4), Q31(1.0/5), Q31(1.0/6),
+    Q31(1.0/7), Q31(1.0/8), Q31(1.0/9), Q31(1.0/10), Q31(1.0/11)
+};
+
+/** Evaluate x - x^2/2 + x^3/3 - ... + x^11/11 (the ln(1+x) series) using products rounded back by 31 bits. */
+static int fixed_log(int x)
+{
+    int n, acc = x, power = x;
+
+    for (n = 0; n < 10; n++) {
+        int term;
+        power = (int)(((int64_t)power * x + 0x40000000) >> 31);
+        term  = (int)(((int64_t)power * fixed_log_table[n] + 0x40000000) >> 31);
+        if (n & 1)
+            acc += term;
+        else
+            acc -= term;
+    }
+
+    return acc;
+}
+
+/* Reciprocal factorials 1/2! .. 1/8! consumed in order by fixed_exp(). */
+static const int fixed_exp_table[7] =
+{
+    Q31(1.0/2), Q31(1.0/6), Q31(1.0/24), Q31(1.0/120),
+    Q31(1.0/720), Q31(1.0/5040), Q31(1.0/40320)
+};
+
+/** Evaluate 1 + x + x^2/2! + ... + x^8/8! (the exp(x) series).
+ *  x and the result carry 23 fractional bits: 0x800000 stands for 1.0. */
+static int fixed_exp(int x)
+{
+    int k, power = x, sum = 0x800000 + x;
+
+    for (k = 0; k < 7; k++) {
+        power = (int)(((int64_t)power * x + 0x400000) >> 23);
+        sum  += (int)(((int64_t)power * fixed_exp_table[k] + 0x40000000) >> 31);
+    }
+
+    return sum;
+}
+/** Fill bands[0..num_bands-1] with widths whose edges grow from start by the ratio (stop/start)^(1/num_bands). */
+static void make_bands(int16_t* bands, int start, int stop, int num_bands)
+{
+    int k, previous, present;
+    int base, prod, nz = 0;
+    /* NOTE(review): start == 0 divides by zero and stop <= 0 never satisfies the loop exit; callers must rule both out. */
+    base = (stop << 23) / start;           // stop/start with 23 fractional bits
+    while (base < 0x40000000){             // double until the value is at least 2^30; nz counts the doublings
+        base <<= 1;
+        nz++;
+    }
+    base = fixed_log(base - 0x80000000);   // series argument is the normalised value minus 2^31
+    base = (((base + 0x80) >> 8) + (8-nz)*CONST_LN2) / num_bands; // round to 23 fractional bits, add (8-nz)*ln(2), split evenly
+    base = fixed_exp(base);                // per-band ratio, 23 fractional bits
+
+    previous = start;
+    prod = start << 23;                    // running band edge, 23 fractional bits
+
+    for (k = 0; k < num_bands-1; k++) {
+        prod = (int)(((int64_t)prod * base + 0x400000) >> 23); // next edge = edge * ratio, rounded
+        present = (prod + 0x400000) >> 23; // edge rounded to an integer
+        bands[k] = present - previous;
+        previous = present;
+    }
+    bands[num_bands-1] = stop - previous;  // last width is the remainder, so all widths sum to stop - start
+}
+
+/// Dequantization and stereo decoding (14496-3 sp04 p203): converts env_facs_q / noise_facs_q into SoftFloat env_facs / noise_facs.
+static void sbr_dequant(SpectralBandReplication *sbr, int id_aac)
+{
+    int k, e;
+    int ch;
+
+    if (id_aac == TYPE_CPE && sbr->bs_coupling) { // coupled pair: each output combines data[0] and data[1]
+        int alpha      = sbr->data[0].bs_amp_res ?  2 :  1;
+        int pan_offset = sbr->data[0].bs_amp_res ? 12 : 24;
+        for (e = 1; e <= sbr->data[0].bs_num_env; e++) {
+            for (k = 0; k < sbr->n[sbr->data[0].bs_freq_res[e]]; k++) {
+                SoftFloat temp1, temp2, fac;
+                /* exp is first formed in half steps; an odd value selects the sqrt(2) mantissa before halving */
+                temp1.exp = sbr->data[0].env_facs_q[e][k] * alpha + 14;
+                if (temp1.exp & 1)
+                  temp1.mant = 759250125; // sqrt(2) * 2^29, rounded
+                else
+                  temp1.mant = 0x20000000; // 2^29
+                temp1.exp = (temp1.exp >> 1) + 1;
+                if (temp1.exp > 66) { // temp1 > 1E20
+                    av_log(NULL, AV_LOG_ERROR, "envelope scalefactor overflow in dequant\n");
+                    temp1 = FLOAT_1; // replace the out-of-range value after logging
+                }
+                /* temp2 is built the same way from pan_offset minus the data[1] value */
+                temp2.exp = (pan_offset - sbr->data[1].env_facs_q[e][k]) * alpha;
+                if (temp2.exp & 1)
+                  temp2.mant = 759250125;
+                else
+                  temp2.mant = 0x20000000;
+                temp2.exp = (temp2.exp >> 1) + 1;
+                fac   = av_div_sf(temp1, av_add_sf(FLOAT_1, temp2)); // temp1 / (FLOAT_1 + temp2)
+                sbr->data[0].env_facs[e][k] = fac;
+                sbr->data[1].env_facs[e][k] = av_mul_sf(fac, temp2); // fac * temp2
+            }
+        }
+        for (e = 1; e <= sbr->data[0].bs_num_noise; e++) {
+            for (k = 0; k < sbr->n_q; k++) {
+                SoftFloat temp1, temp2, fac;
+
+                temp1.exp = NOISE_FLOOR_OFFSET - \
+                    sbr->data[0].noise_facs_q[e][k] + 2;
+                temp1.mant = 0x20000000;
+                av_assert0(temp1.exp <= 66); // unlike the envelope path, an oversized exponent aborts here
+                temp2.exp = 12 - sbr->data[1].noise_facs_q[e][k] + 1;
+                temp2.mant = 0x20000000;
+                fac   = av_div_sf(temp1, av_add_sf(FLOAT_1, temp2)); // same split as for the envelopes
+                sbr->data[0].noise_facs[e][k] = fac;
+                sbr->data[1].noise_facs[e][k] = av_mul_sf(fac, temp2);
+            }
+        }
+    } else { // SCE or one non-coupled CPE
+        for (ch = 0; ch < (id_aac == TYPE_CPE) + 1; ch++) { // two channels for a CPE, otherwise one
+            int alpha = sbr->data[ch].bs_amp_res ? 2 : 1;
+            for (e = 1; e <= sbr->data[ch].bs_num_env; e++)
+                for (k = 0; k < sbr->n[sbr->data[ch].bs_freq_res[e]]; k++){
+                    SoftFloat temp1;
+                    /* same half-step exponent scheme as the coupled branch, with offset 12 instead of 14 */
+                    temp1.exp = alpha * sbr->data[ch].env_facs_q[e][k] + 12;
+                    if (temp1.exp & 1)
+                        temp1.mant = 759250125;
+                    else
+                        temp1.mant = 0x20000000;
+                    temp1.exp = (temp1.exp >> 1) + 1;
+                    if (temp1.exp > 66) { // temp1 > 1E20
+                        av_log(NULL, AV_LOG_ERROR, "envelope scalefactor overflow in dequant\n");
+                        temp1 = FLOAT_1;
+                    }
+                    sbr->data[ch].env_facs[e][k] = temp1;
+                }
+            for (e = 1; e <= sbr->data[ch].bs_num_noise; e++)
+                for (k = 0; k < sbr->n_q; k++){
+                    sbr->data[ch].noise_facs[e][k].exp = NOISE_FLOOR_OFFSET - \
+                        sbr->data[ch].noise_facs_q[e][k] + 1;
+                    sbr->data[ch].noise_facs[e][k].mant = 0x20000000; // no range check on this exponent
+                }
+        }
+    }
+}
+
+/**
+ * Convert one SoftFloat coefficient to the fixed-point scale stored in
+ * alpha0/alpha1 by sbr_hf_inverse_filter().
+ *
+ * Exponents of 3 or more saturate to 0x7fffffff whatever the mantissa sign.
+ * Exponents of -30 or less are too small to represent and give 0, which also
+ * keeps every shift count below between 0 and 30 so no shift is undefined.
+ * Otherwise the result is mant * 2^(exp - 1), rounded when that is a right
+ * shift.
+ *
+ * @param a coefficient to convert
+ * @return the converted value
+ */
+static int sbr_alpha_to_fixed(SoftFloat a)
+{
+    int shift;
+
+    if (a.exp >= 3)
+        return 0x7fffffff;
+    if (a.exp <= -30)
+        return 0;
+
+    /* shift > 0: rounded right shift; shift <= 0: multiply by 2^-shift */
+    shift = 1 - a.exp;
+    if (shift <= 0)
+        return a.mant * (1 << -shift);
+
+    return (a.mant + (1 << (shift - 1))) >> shift;
+}
+
+/** High Frequency Generation (14496-3 sp04 p214+) and Inverse Filtering
+ * (14496-3 sp04 p214)
+ * Warning: This routine does not seem numerically stable.
+ *
+ * For each k below k0 this autocorrelates X_low[k], derives the complex
+ * coefficients a1 = a10 + i*a11 and a0 = a00 + i*a01 in SoftFloat arithmetic
+ * and stores them in fixed point as alpha1[k] and alpha0[k].
+ *
+ * @param dsp    provides the autocorrelate() routine
+ * @param alpha0 output: alpha0[k][0] real part, alpha0[k][1] imaginary part
+ * @param alpha1 output: alpha1[k][0] real part, alpha1[k][1] imaginary part
+ * @param X_low  input, one row per k
+ * @param k0     number of rows to process
+ */
+static void sbr_hf_inverse_filter(SBRDSPContext *dsp,
+                                  int (*alpha0)[2], int (*alpha1)[2],
+                                  const int X_low[32][40][2], int k0)
+{
+    int k;
+    int mag;
+
+    for (k = 0; k < k0; k++) {
+        SoftFloat phi[3][2][2];
+        SoftFloat a00, a01, a10, a11;
+        SoftFloat dk;
+
+        dsp->autocorrelate(X_low[k], phi);
+
+        /* dk = phi[2][1][0] * phi[1][0][0] - (phi[1][1][0]^2 + phi[1][1][1]^2) * FLOAT_0999999 */
+        dk = av_sub_sf(av_mul_sf(phi[2][1][0], phi[1][0][0]),
+             av_mul_sf(av_add_sf(av_mul_sf(phi[1][1][0], phi[1][1][0]),
+             av_mul_sf(phi[1][1][1], phi[1][1][1])), FLOAT_0999999));
+
+        /* a10/a11 divide by dk, so a zero dk yields zero coefficients instead */
+        if (!dk.mant) {
+            a10 = FLOAT_0;
+            a11 = FLOAT_0;
+        } else {
+            SoftFloat temp_real, temp_im;
+            temp_real = av_sub_sf(av_sub_sf(av_mul_sf(phi[0][0][0], phi[1][1][0]),
+                                            av_mul_sf(phi[0][0][1], phi[1][1][1])),
+                                  av_mul_sf(phi[0][1][0], phi[1][0][0]));
+            temp_im   = av_sub_sf(av_add_sf(av_mul_sf(phi[0][0][0], phi[1][1][1]),
+                                            av_mul_sf(phi[0][0][1], phi[1][1][0])),
+                                  av_mul_sf(phi[0][1][1], phi[1][0][0]));
+
+            a10 = av_div_sf(temp_real, dk);
+            a11 = av_div_sf(temp_im,   dk);
+        }
+
+        /* a00/a01 divide by phi[1][0][0]; the same zero guard applies */
+        if (!phi[1][0][0].mant) {
+            a00 = FLOAT_0;
+            a01 = FLOAT_0;
+        } else {
+            SoftFloat temp_real, temp_im;
+            temp_real = av_add_sf(phi[0][0][0],
+                                  av_add_sf(av_mul_sf(a10, phi[1][1][0]),
+                                            av_mul_sf(a11, phi[1][1][1])));
+            temp_im   = av_add_sf(phi[0][0][1],
+                                  av_sub_sf(av_mul_sf(a11, phi[1][1][0]),
+                                            av_mul_sf(a10, phi[1][1][1])));
+
+            temp_real.mant = -temp_real.mant;
+            temp_im.mant   = -temp_im.mant;
+            a00 = av_div_sf(temp_real, phi[1][0][0]);
+            a01 = av_div_sf(temp_im,   phi[1][0][0]);
+        }
+
+        /* Convert to fixed point; sbr_alpha_to_fixed() saturates large
+         * exponents and flushes tiny ones so that no shift is out of range. */
+        alpha0[k][0] = sbr_alpha_to_fixed(a00);
+        alpha0[k][1] = sbr_alpha_to_fixed(a01);
+        alpha1[k][0] = sbr_alpha_to_fixed(a10);
+        alpha1[k][1] = sbr_alpha_to_fixed(a11);
+
+        /*
+         * mag = ((re >> 1)^2 + (im >> 1)^2) >> 31, rounded. Once it reaches
+         * 0x20000000 for alpha1, all four coefficients are cleared.
+         */
+        mag = (int)(((int64_t)(alpha1[k][0]>>1) * (alpha1[k][0]>>1) +
+                     (int64_t)(alpha1[k][1]>>1) * (alpha1[k][1]>>1) +
+                     0x40000000) >> 31);
+        if (mag >= 0x20000000){
+            alpha1[k][0] = 0;
+            alpha1[k][1] = 0;
+            alpha0[k][0] = 0;
+            alpha0[k][1] = 0;
+        }
+
+        /* the same test applied to alpha0 */
+        mag = (int)(((int64_t)(alpha0[k][0]>>1) * (alpha0[k][0]>>1) +
+                     (int64_t)(alpha0[k][1]>>1) * (alpha0[k][1]>>1) +
+                     0x40000000) >> 31);
+        if (mag >= 0x20000000){
+            alpha1[k][0] = 0;
+            alpha1[k][1] = 0;
+            alpha0[k][0] = 0;
+            alpha0[k][1] = 0;
+        }
+    }
+}
+
+/// Chirp Factors (14496-3 sp04 p214): moves every bw_array[] entry towards the target chosen by bs_invf_mode.
+static void sbr_chirp(SpectralBandReplication *sbr, SBRData *ch_data)
+{
+    // 0, 0.75, 0.90 and 0.98 scaled by 2^31
+    static const int bw_tab[] = { 0, 1610612736, 1932735283, 2104533975 };
+    int band;
+
+    for (band = 0; band < sbr->n_q; band++) {
+        const int mode_sum = ch_data->bs_invf_mode[0][band] + ch_data->bs_invf_mode[1][band];
+        // 1288490189 is 0.6 scaled by 2^31
+        int target = mode_sum == 1 ? 1288490189 : bw_tab[ch_data->bs_invf_mode[0][band]];
+        // weights scaled by 2^31: 0.75/0.25 when the target is below the stored value, else 0.90625/0.09375
+        const int falling  = target < ch_data->bw_array[band];
+        const int w_target = falling ? 1610612736 : 1946157056;
+        const int w_stored = falling ? 0x20000000 : 201326592;
+        int64_t mix;
+
+        // rounded weighted average of the target and the stored value
+        mix  = (int64_t)target * w_target;
+        mix += (int64_t)ch_data->bw_array[band] * w_stored;
+        target = (int)((mix + 0x40000000) >> 31);
+
+        // results below 0x2000000 (2^25) are stored as 0
+        ch_data->bw_array[band] = target < 0x2000000 ? 0 : target;
+    }
+}
+
+/**
+ * Calculation of levels of additional HF signal components (14496-3 sp04 p219)
+ * and Calculation of gain (14496-3 sp04 p219); fills sbr->gain, sbr->q_m and sbr->s_m. ac is not referenced here.
+ */
+static void sbr_gain_calc(AACContext *ac, SpectralBandReplication *sbr,
+                          SBRData *ch_data, const int e_a[2])
+{
+    int e, k, m;
+    // max gain limits : -3dB, 0dB, 3dB, inf dB (limiter off)
+    static const SoftFloat limgain[4] = { { 760155524,  0 }, { 0x20000000,  1 },
+                                            { 758351638,  1 }, { 625000000, 34 } };
+
+    for (e = 0; e < ch_data->bs_num_env; e++) {
+        int delta = !((e == e_a[1]) || (e == e_a[0])); // 1 unless e equals one of the two entries of e_a
+        for (k = 0; k < sbr->n_lim; k++) { // one pass per range f_tablelim[k] .. f_tablelim[k + 1], offset by kx[1]
+            SoftFloat gain_boost, gain_max;
+            SoftFloat sum[2];
+            sum[0] = sum[1] = FLOAT_0;
+            for (m = sbr->f_tablelim[k] - sbr->kx[1]; m < sbr->f_tablelim[k + 1] - sbr->kx[1]; m++) { // step 1: q_m, s_m and gain
+                const SoftFloat temp = av_div_sf(sbr->e_origmapped[e][m],
+                                            av_add_sf(FLOAT_1, sbr->q_mapped[e][m]));
+                sbr->q_m[e][m] = av_sqrt_sf(av_mul_sf(temp, sbr->q_mapped[e][m]));
+                sbr->s_m[e][m] = av_sqrt_sf(av_mul_sf(temp, av_int2sf(ch_data->s_indexmapped[e + 1][m], 0)));
+                if (!sbr->s_mapped[e][m]) {
+                    if (delta) {
+                      sbr->gain[e][m] = av_sqrt_sf(av_div_sf(sbr->e_origmapped[e][m],
+                                            av_mul_sf(av_add_sf(FLOAT_1, sbr->e_curr[e][m]),
+                                            av_add_sf(FLOAT_1, sbr->q_mapped[e][m]))));
+                    } else {
+                      sbr->gain[e][m] = av_sqrt_sf(av_div_sf(sbr->e_origmapped[e][m],
+                                            av_add_sf(FLOAT_1, sbr->e_curr[e][m])));
+                    }
+                } else {
+                    sbr->gain[e][m] = av_sqrt_sf(
+                                        av_div_sf(
+                                            av_mul_sf(sbr->e_origmapped[e][m], sbr->q_mapped[e][m]),
+                                            av_mul_sf(
+                                                av_add_sf(FLOAT_1, sbr->e_curr[e][m]),
+                                                av_add_sf(FLOAT_1, sbr->q_mapped[e][m]))));
+                }
+            }
+            for (m = sbr->f_tablelim[k] - sbr->kx[1]; m < sbr->f_tablelim[k + 1] - sbr->kx[1]; m++) { // step 2: sums of e_origmapped and e_curr
+                sum[0] = av_add_sf(sum[0], sbr->e_origmapped[e][m]);
+                sum[1] = av_add_sf(sum[1], sbr->e_curr[e][m]);
+            }
+            gain_max = av_mul_sf(limgain[sbr->bs_limiter_gains],
+                            av_sqrt_sf(
+                                av_div_sf(
+                                    av_add_sf(FLOAT_EPSILON, sum[0]),
+                                    av_add_sf(FLOAT_EPSILON, sum[1]))));
+            if (av_gt_sf(gain_max, FLOAT_100000)) // gain_max is capped at FLOAT_100000
+              gain_max = FLOAT_100000;
+            for (m = sbr->f_tablelim[k] - sbr->kx[1]; m < sbr->f_tablelim[k + 1] - sbr->kx[1]; m++) { // step 3: limit q_m and gain
+                SoftFloat q_m_max = av_div_sf(
+                                        av_mul_sf(sbr->q_m[e][m], gain_max),
+                                        sbr->gain[e][m]);
+                if (av_gt_sf(sbr->q_m[e][m], q_m_max))
+                  sbr->q_m[e][m] = q_m_max;
+                if (av_gt_sf(sbr->gain[e][m], gain_max))
+                  sbr->gain[e][m] = gain_max;
+            }
+            sum[0] = sum[1] = FLOAT_0;
+            for (m = sbr->f_tablelim[k] - sbr->kx[1]; m < sbr->f_tablelim[k + 1] - sbr->kx[1]; m++) { // step 4: sums after limiting
+                sum[0] = av_add_sf(sum[0], sbr->e_origmapped[e][m]);
+                sum[1] = av_add_sf(sum[1],
+                            av_mul_sf(
+                                av_mul_sf(sbr->e_curr[e][m],
+                                          sbr->gain[e][m]),
+                                sbr->gain[e][m]));
+                sum[1] = av_add_sf(sum[1],
+                            av_mul_sf(sbr->s_m[e][m], sbr->s_m[e][m]));
+                if (delta && !sbr->s_m[e][m].mant) // q_m^2 is added only when delta is set and s_m has a zero mantissa
+                  sum[1] = av_add_sf(sum[1],
+                                av_mul_sf(sbr->q_m[e][m], sbr->q_m[e][m]));
+            }
+            gain_boost = av_sqrt_sf(
+                            av_div_sf(
+                                av_add_sf(FLOAT_EPSILON, sum[0]),
+                                av_add_sf(FLOAT_EPSILON, sum[1])));
+            if (av_gt_sf(gain_boost, FLOAT_1584893192)) // gain_boost is capped at FLOAT_1584893192
+              gain_boost = FLOAT_1584893192;
+
+            for (m = sbr->f_tablelim[k] - sbr->kx[1]; m < sbr->f_tablelim[k + 1] - sbr->kx[1]; m++) { // step 5: apply gain_boost to all three
+                sbr->gain[e][m] = av_mul_sf(sbr->gain[e][m], gain_boost);
+                sbr->q_m[e][m]  = av_mul_sf(sbr->q_m[e][m], gain_boost);
+                sbr->s_m[e][m]  = av_mul_sf(sbr->s_m[e][m], gain_boost);
+            }
+        }
+    }
+}
+
+/** Add sign * s, scaled by 2^-(22 - s.exp) and rounded to nearest, to *out. Shift counts of 32 or more
+ *  would add nothing and are skipped; counts below 1 overflow the output, so they are logged and skipped. */
+static void sbr_add_sine(int *out, SoftFloat s, int sign)
+{
+    const int shift = 22 - s.exp;
+
+    if (shift < 1)
+        av_log(NULL, AV_LOG_ERROR, "Overflow in sbr_hf_assemble, shift=%d\n", shift);
+    else if (shift < 32)
+        *out += (s.mant * sign + (1 << (shift - 1))) >> shift;
+}
+
+/// Assembling HF Signals (14496-3 sp04 p220): writes Y1 from X_high using the gain, q_m and s_m tables of sbr.
+static void sbr_hf_assemble(int Y1[38][64][2],
+                            const int X_high[64][40][2],
+                            SpectralBandReplication *sbr, SBRData *ch_data,
+                            const int e_a[2])
+{
+    int e, i, j, m;
+    const int h_SL = 4 * !sbr->bs_smoothing_mode; // 4 smoothing taps of history, or 0 when bs_smoothing_mode is set
+    const int kx = sbr->kx[1];
+    const int m_max = sbr->m[1];
+    static const SoftFloat h_smooth[5] = {
+      { 715827883, -1 },
+      { 647472402, -1 },
+      { 937030863, -2 },
+      { 989249804, -3 },
+      { 546843842, -4 },
+    };
+    SoftFloat (*g_temp)[48] = ch_data->g_temp, (*q_temp)[48] = ch_data->q_temp;
+    int indexnoise = ch_data->f_indexnoise;
+    int indexsine  = ch_data->f_indexsine;
+
+    if (sbr->reset) { // after a reset the history rows are seeded from envelope 0
+        for (i = 0; i < h_SL; i++) {
+            memcpy(g_temp[i + 2*ch_data->t_env[0]], sbr->gain[0], m_max * sizeof(sbr->gain[0][0]));
+            memcpy(q_temp[i + 2*ch_data->t_env[0]], sbr->q_m[0],  m_max * sizeof(sbr->q_m[0][0]));
+        }
+    } else if (h_SL) { // otherwise the four rows at the old end position are carried over
+        for (i = 0; i < 4; i++) {
+            memcpy(g_temp[i + 2 * ch_data->t_env[0]],
+                   g_temp[i + 2 * ch_data->t_env_num_env_old],
+                   sizeof(g_temp[0]));
+            memcpy(q_temp[i + 2 * ch_data->t_env[0]],
+                   q_temp[i + 2 * ch_data->t_env_num_env_old],
+                   sizeof(q_temp[0]));
+        }
+    }
+
+    for (e = 0; e < ch_data->bs_num_env; e++) { // copy each envelope's gain and q_m into its rows
+        for (i = 2 * ch_data->t_env[e]; i < 2 * ch_data->t_env[e + 1]; i++) {
+            memcpy(g_temp[h_SL + i], sbr->gain[e], m_max * sizeof(sbr->gain[0][0]));
+            memcpy(q_temp[h_SL + i], sbr->q_m[e],  m_max * sizeof(sbr->q_m[0][0]));
+        }
+    }
+
+    for (e = 0; e < ch_data->bs_num_env; e++) {
+        for (i = 2 * ch_data->t_env[e]; i < 2 * ch_data->t_env[e + 1]; i++) {
+            SoftFloat g_filt_tab[48];
+            SoftFloat q_filt_tab[48];
+            SoftFloat *g_filt, *q_filt;
+
+            if (h_SL && e != e_a[0] && e != e_a[1]) { // smoothing: weight the last h_SL + 1 rows with h_smooth
+                g_filt = g_filt_tab;
+                q_filt = q_filt_tab;
+                for (m = 0; m < m_max; m++) {
+                    const int idx1 = i + h_SL;
+                    g_filt[m].mant = g_filt[m].exp = 0;
+                    q_filt[m].mant = q_filt[m].exp = 0;
+                    for (j = 0; j <= h_SL; j++) {
+                        g_filt[m] = av_add_sf(g_filt[m],
+                                        av_mul_sf(g_temp[idx1 - j][m],
+                                            h_smooth[j]));
+                        q_filt[m] = av_add_sf(q_filt[m],
+                                        av_mul_sf(q_temp[idx1 - j][m],
+                                            h_smooth[j]));
+                    }
+                }
+            } else {
+                g_filt = g_temp[i + h_SL];
+                q_filt = q_temp[i];
+            }
+
+            sbr->dsp.hf_g_filt(Y1[i] + kx, X_high + kx, g_filt, m_max,
+                               i + ENVELOPE_ADJUSTMENT_OFFSET);
+
+            if (e != e_a[0] && e != e_a[1]) {
+                sbr->dsp.hf_apply_noise[indexsine](Y1[i] + kx, sbr->s_m[e],
+                                                   q_filt, indexnoise,
+                                                   kx, m_max);
+            } else {
+                int idx = indexsine&1;
+                int A = (1-((indexsine+(kx & 1))&2));
+                int B = (A^(-idx)) + idx;
+                int *out = &Y1[i][kx][idx];
+                SoftFloat *in  = sbr->s_m[e];
+                for (m = 0; m+1 < m_max; m+=2) { // pairs use sign A then sign B; every shift is range-checked
+                    sbr_add_sine(&out[2*m  ], in[m  ], A);
+                    sbr_add_sine(&out[2*m+2], in[m+1], B);
+                }
+                if (m_max & 1) // odd count: one sample with sign A remains
+                    sbr_add_sine(&out[2*m], in[m], A);
+            }
+            indexnoise = (indexnoise + m_max) & 0x1ff;
+            indexsine = (indexsine + 1) & 3;
+        }
+    }
+    ch_data->f_indexnoise = indexnoise;
+    ch_data->f_indexsine  = indexsine;
+}
+
+#include "aacsbr_template.c"
diff --git a/libavcodec/aacsbr_fixed_tablegen.h b/libavcodec/aacsbr_fixed_tablegen.h
new file mode 100644
index 0000000..3fcf020
--- /dev/null
+++ b/libavcodec/aacsbr_fixed_tablegen.h
@@ -0,0 +1,28 @@
+/*
+ * Header file for hardcoded AAC SBR windows
+ *
+ * Copyright (c) 2014 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AACSBR_FIXED_TABLEGEN_H
+#define AVCODEC_AACSBR_FIXED_TABLEGEN_H
+
+#include "aacsbr_tablegen_common.h"
+
+#endif /* AVCODEC_AACSBR_FIXED_TABLEGEN_H */
diff --git a/libavcodec/aacsbr_tablegen.h b/libavcodec/aacsbr_tablegen.h
new file mode 100644
index 0000000..242a963
--- /dev/null
+++ b/libavcodec/aacsbr_tablegen.h
@@ -0,0 +1,28 @@
+/*
+ * Header file for hardcoded AAC SBR windows
+ *
+ * Copyright (c) 2014 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AACSBR_TABLEGEN_H
+#define AVCODEC_AACSBR_TABLEGEN_H
+
+#include "aacsbr_tablegen_common.h"
+
+#endif /* AVCODEC_AACSBR_TABLEGEN_H */
diff --git a/libavcodec/aacsbr_tablegen_common.h b/libavcodec/aacsbr_tablegen_common.h
new file mode 100644
index 0000000..8c8f6ef
--- /dev/null
+++ b/libavcodec/aacsbr_tablegen_common.h
@@ -0,0 +1,126 @@
+/*
+ * Header file for hardcoded AAC SBR windows
+ *
+ * Copyright (c) 2014 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AACSBR_TABLEGEN_COMMON_H
+#define AVCODEC_AACSBR_TABLEGEN_COMMON_H
+#include "aac_defines.h"
+#include "libavutil/mem.h"
+
+/// window coefficients for analysis/synthesis QMF banks
+static DECLARE_ALIGNED(32, INTFLOAT, sbr_qmf_window_ds)[320];
+static DECLARE_ALIGNED(32, INTFLOAT, sbr_qmf_window_us)[640] = {
+    Q31( 0.0000000000f), Q31(-0.0005525286f), Q31(-0.0005617692f), Q31(-0.0004947518f),
+    Q31(-0.0004875227f), Q31(-0.0004893791f), Q31(-0.0005040714f), Q31(-0.0005226564f),
+    Q31(-0.0005466565f), Q31(-0.0005677802f), Q31(-0.0005870930f), Q31(-0.0006132747f),
+    Q31(-0.0006312493f), Q31(-0.0006540333f), Q31(-0.0006777690f), Q31(-0.0006941614f),
+    Q31(-0.0007157736f), Q31(-0.0007255043f), Q31(-0.0007440941f), Q31(-0.0007490598f),
+    Q31(-0.0007681371f), Q31(-0.0007724848f), Q31(-0.0007834332f), Q31(-0.0007779869f),
+    Q31(-0.0007803664f), Q31(-0.0007801449f), Q31(-0.0007757977f), Q31(-0.0007630793f),
+    Q31(-0.0007530001f), Q31(-0.0007319357f), Q31(-0.0007215391f), Q31(-0.0006917937f),
+    Q31(-0.0006650415f), Q31(-0.0006341594f), Q31(-0.0005946118f), Q31(-0.0005564576f),
+    Q31(-0.0005145572f), Q31(-0.0004606325f), Q31(-0.0004095121f), Q31(-0.0003501175f),
+    Q31(-0.0002896981f), Q31(-0.0002098337f), Q31(-0.0001446380f), Q31(-0.0000617334f),
+    Q31( 0.0000134949f), Q31( 0.0001094383f), Q31( 0.0002043017f), Q31( 0.0002949531f),
+    Q31( 0.0004026540f), Q31( 0.0005107388f), Q31( 0.0006239376f), Q31( 0.0007458025f),
+    Q31( 0.0008608443f), Q31( 0.0009885988f), Q31( 0.0011250155f), Q31( 0.0012577884f),
+    Q31( 0.0013902494f), Q31( 0.0015443219f), Q31( 0.0016868083f), Q31( 0.0018348265f),
+    Q31( 0.0019841140f), Q31( 0.0021461583f), Q31( 0.0023017254f), Q31( 0.0024625616f),
+    Q31( 0.0026201758f), Q31( 0.0027870464f), Q31( 0.0029469447f), Q31( 0.0031125420f),
+    Q31( 0.0032739613f), Q31( 0.0034418874f), Q31( 0.0036008268f), Q31( 0.0037603922f),
+    Q31( 0.0039207432f), Q31( 0.0040819753f), Q31( 0.0042264269f), Q31( 0.0043730719f),
+    Q31( 0.0045209852f), Q31( 0.0046606460f), Q31( 0.0047932560f), Q31( 0.0049137603f),
+    Q31( 0.0050393022f), Q31( 0.0051407353f), Q31( 0.0052461166f), Q31( 0.0053471681f),
+    Q31( 0.0054196775f), Q31( 0.0054876040f), Q31( 0.0055475714f), Q31( 0.0055938023f),
+    Q31( 0.0056220643f), Q31( 0.0056455196f), Q31( 0.0056389199f), Q31( 0.0056266114f),
+    Q31( 0.0055917128f), Q31( 0.0055404363f), Q31( 0.0054753783f), Q31( 0.0053838975f),
+    Q31( 0.0052715758f), Q31( 0.0051382275f), Q31( 0.0049839687f), Q31( 0.0048109469f),
+    Q31( 0.0046039530f), Q31( 0.0043801861f), Q31( 0.0041251642f), Q31( 0.0038456408f),
+    Q31( 0.0035401246f), Q31( 0.0032091885f), Q31( 0.0028446757f), Q31( 0.0024508540f),
+    Q31( 0.0020274176f), Q31( 0.0015784682f), Q31( 0.0010902329f), Q31( 0.0005832264f),
+    Q31( 0.0000276045f), Q31(-0.0005464280f), Q31(-0.0011568135f), Q31(-0.0018039472f),
+    Q31(-0.0024826723f), Q31(-0.0031933778f), Q31(-0.0039401124f), Q31(-0.0047222596f),
+    Q31(-0.0055337211f), Q31(-0.0063792293f), Q31(-0.0072615816f), Q31(-0.0081798233f),
+    Q31(-0.0091325329f), Q31(-0.0101150215f), Q31(-0.0111315548f), Q31(-0.0121849995f),
+    Q31( 0.0132718220f), Q31( 0.0143904666f), Q31( 0.0155405553f), Q31( 0.0167324712f),
+    Q31( 0.0179433381f), Q31( 0.0191872431f), Q31( 0.0204531793f), Q31( 0.0217467550f),
+    Q31( 0.0230680169f), Q31( 0.0244160992f), Q31( 0.0257875847f), Q31( 0.0271859429f),
+    Q31( 0.0286072173f), Q31( 0.0300502657f), Q31( 0.0315017608f), Q31( 0.0329754081f),
+    Q31( 0.0344620948f), Q31( 0.0359697560f), Q31( 0.0374812850f), Q31( 0.0390053679f),
+    Q31( 0.0405349170f), Q31( 0.0420649094f), Q31( 0.0436097542f), Q31( 0.0451488405f),
+    Q31( 0.0466843027f), Q31( 0.0482165720f), Q31( 0.0497385755f), Q31( 0.0512556155f),
+    Q31( 0.0527630746f), Q31( 0.0542452768f), Q31( 0.0557173648f), Q31( 0.0571616450f),
+    Q31( 0.0585915683f), Q31( 0.0599837480f), Q31( 0.0613455171f), Q31( 0.0626857808f),
+    Q31( 0.0639715898f), Q31( 0.0652247106f), Q31( 0.0664367512f), Q31( 0.0676075985f),
+    Q31( 0.0687043828f), Q31( 0.0697630244f), Q31( 0.0707628710f), Q31( 0.0717002673f),
+    Q31( 0.0725682583f), Q31( 0.0733620255f), Q31( 0.0741003642f), Q31( 0.0747452558f),
+    Q31( 0.0753137336f), Q31( 0.0758008358f), Q31( 0.0761992479f), Q31( 0.0764992170f),
+    Q31( 0.0767093490f), Q31( 0.0768173975f), Q31( 0.0768230011f), Q31( 0.0767204924f),
+    Q31( 0.0765050718f), Q31( 0.0761748321f), Q31( 0.0757305756f), Q31( 0.0751576255f),
+    Q31( 0.0744664394f), Q31( 0.0736406005f), Q31( 0.0726774642f), Q31( 0.0715826364f),
+    Q31( 0.0703533073f), Q31( 0.0689664013f), Q31( 0.0674525021f), Q31( 0.0657690668f),
+    Q31( 0.0639444805f), Q31( 0.0619602779f), Q31( 0.0598166570f), Q31( 0.0575152691f),
+    Q31( 0.0550460034f), Q31( 0.0524093821f), Q31( 0.0495978676f), Q31( 0.0466303305f),
+    Q31( 0.0434768782f), Q31( 0.0401458278f), Q31( 0.0366418116f), Q31( 0.0329583930f),
+    Q31( 0.0290824006f), Q31( 0.0250307561f), Q31( 0.0207997072f), Q31( 0.0163701258f),
+    Q31( 0.0117623832f), Q31( 0.0069636862f), Q31( 0.0019765601f), Q31(-0.0032086896f),
+    Q31(-0.0085711749f), Q31(-0.0141288827f), Q31(-0.0198834129f), Q31(-0.0258227288f),
+    Q31(-0.0319531274f), Q31(-0.0382776572f), Q31(-0.0447806821f), Q31(-0.0514804176f),
+    Q31(-0.0583705326f), Q31(-0.0654409853f), Q31(-0.0726943300f), Q31(-0.0801372934f),
+    Q31(-0.0877547536f), Q31(-0.0955533352f), Q31(-0.1035329531f), Q31(-0.1116826931f),
+    Q31(-0.1200077984f), Q31(-0.1285002850f), Q31(-0.1371551761f), Q31(-0.1459766491f),
+    Q31(-0.1549607071f), Q31(-0.1640958855f), Q31(-0.1733808172f), Q31(-0.1828172548f),
+    Q31(-0.1923966745f), Q31(-0.2021250176f), Q31(-0.2119735853f), Q31(-0.2219652696f),
+    Q31(-0.2320690870f), Q31(-0.2423016884f), Q31(-0.2526480309f), Q31(-0.2631053299f),
+    Q31(-0.2736634040f), Q31(-0.2843214189f), Q31(-0.2950716717f), Q31(-0.3059098575f),
+    Q31(-0.3168278913f), Q31(-0.3278113727f), Q31(-0.3388722693f), Q31(-0.3499914122f),
+    Q31( 0.3611589903f), Q31( 0.3723795546f), Q31( 0.3836350013f), Q31( 0.3949211761f),
+    Q31( 0.4062317676f), Q31( 0.4175696896f), Q31( 0.4289119920f), Q31( 0.4402553754f),
+    Q31( 0.4515996535f), Q31( 0.4629308085f), Q31( 0.4742453214f), Q31( 0.4855253091f),
+    Q31( 0.4967708254f), Q31( 0.5079817500f), Q31( 0.5191234970f), Q31( 0.5302240895f),
+    Q31( 0.5412553448f), Q31( 0.5522051258f), Q31( 0.5630789140f), Q31( 0.5738524131f),
+    Q31( 0.5845403235f), Q31( 0.5951123086f), Q31( 0.6055783538f), Q31( 0.6159109932f),
+    Q31( 0.6261242695f), Q31( 0.6361980107f), Q31( 0.6461269695f), Q31( 0.6559016302f),
+    Q31( 0.6655139880f), Q31( 0.6749663190f), Q31( 0.6842353293f), Q31( 0.6933282376f),
+    Q31( 0.7022388719f), Q31( 0.7109410426f), Q31( 0.7194462634f), Q31( 0.7277448900f),
+    Q31( 0.7358211758f), Q31( 0.7436827863f), Q31( 0.7513137456f), Q31( 0.7587080760f),
+    Q31( 0.7658674865f), Q31( 0.7727780881f), Q31( 0.7794287519f), Q31( 0.7858353120f),
+    Q31( 0.7919735841f), Q31( 0.7978466413f), Q31( 0.8034485751f), Q31( 0.8087695004f),
+    Q31( 0.8138191270f), Q31( 0.8185776004f), Q31( 0.8230419890f), Q31( 0.8272275347f),
+    Q31( 0.8311038457f), Q31( 0.8346937361f), Q31( 0.8379717337f), Q31( 0.8409541392f),
+    Q31( 0.8436238281f), Q31( 0.8459818469f), Q31( 0.8480315777f), Q31( 0.8497805198f),
+    Q31( 0.8511971524f), Q31( 0.8523047035f), Q31( 0.8531020949f), Q31( 0.8535720573f),
+    Q31( 0.8537385600f),
+};
+
+/// Mirror sbr_qmf_window_us around index 320, negate entries 384 and 512, then copy every second entry into sbr_qmf_window_ds.
+static av_cold void aacsbr_tableinit(void)
+{
+    int lo, hi, k;
+    for (lo = 319, hi = 321; hi < 640; lo--, hi++)
+        sbr_qmf_window_us[hi] = sbr_qmf_window_us[lo];
+    sbr_qmf_window_us[512] = -sbr_qmf_window_us[512];
+    sbr_qmf_window_us[384] = -sbr_qmf_window_us[384];
+    for (k = 0; k < 640; k += 2)
+        sbr_qmf_window_ds[k >> 1] = sbr_qmf_window_us[k];
+}
+
+#endif /* AVCODEC_AACSBR_TABLEGEN_COMMON_H */
diff --git a/libavcodec/aacsbr_template.c b/libavcodec/aacsbr_template.c
new file mode 100644
index 0000000..5110542
--- /dev/null
+++ b/libavcodec/aacsbr_template.c
@@ -0,0 +1,1571 @@
+/*
+ * AAC Spectral Band Replication decoding functions
+ * Copyright (c) 2008-2009 Robert Swain ( rob opendot cl )
+ * Copyright (c) 2009-2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * Fixed point code
+ * Copyright (c) 2013
+ *      MIPS Technologies, Inc., California.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC Spectral Band Replication decoding functions
+ * @author Robert Swain ( rob opendot cl )
+ * @author Stanislav Ocovaj ( stanislav.ocovaj@imgtec.com )
+ * @author Zoran Basaric ( zoran.basaric@imgtec.com )
+ */
+
+#include "libavutil/qsort.h"
+
+av_cold void AAC_RENAME(ff_aac_sbr_init)(void)
+{ /* Runs SBR_INIT_VLC_STATIC for rows 0..9 of sbr_tmp, then calls aacsbr_tableinit() and ff_ps_init(). */
+    static const struct {
+        const void *sbr_codes, *sbr_bits;
+        const unsigned int table_size, elem_size;
+    } sbr_tmp[] = { // one row per Huffman table; SBR_VLC_ROW is defined elsewhere (presumably fills all four fields -- confirm)
+        SBR_VLC_ROW(t_huffman_env_1_5dB),
+        SBR_VLC_ROW(f_huffman_env_1_5dB),
+        SBR_VLC_ROW(t_huffman_env_bal_1_5dB),
+        SBR_VLC_ROW(f_huffman_env_bal_1_5dB),
+        SBR_VLC_ROW(t_huffman_env_3_0dB),
+        SBR_VLC_ROW(f_huffman_env_3_0dB),
+        SBR_VLC_ROW(t_huffman_env_bal_3_0dB),
+        SBR_VLC_ROW(f_huffman_env_bal_3_0dB),
+        SBR_VLC_ROW(t_huffman_noise_3_0dB),
+        SBR_VLC_ROW(t_huffman_noise_bal_3_0dB),
+    };
+
+    // SBR VLC table initialization; first argument is the sbr_tmp row. NOTE(review): second argument looks like a static table size -- confirm against the macro
+    SBR_INIT_VLC_STATIC(0, 1098);
+    SBR_INIT_VLC_STATIC(1, 1092);
+    SBR_INIT_VLC_STATIC(2, 768);
+    SBR_INIT_VLC_STATIC(3, 1026);
+    SBR_INIT_VLC_STATIC(4, 1058);
+    SBR_INIT_VLC_STATIC(5, 1052);
+    SBR_INIT_VLC_STATIC(6, 544);
+    SBR_INIT_VLC_STATIC(7, 544);
+    SBR_INIT_VLC_STATIC(8, 592);
+    SBR_INIT_VLC_STATIC(9, 512);
+
+    aacsbr_tableinit(); // completes the sbr_qmf_window_us / sbr_qmf_window_ds tables
+
+    AAC_RENAME(ff_ps_init)();
+}
+
+/** Places SBR in pure upsampling mode: clears start and ready_for_dequant and resets the header-derived state. */
+static void sbr_turnoff(SpectralBandReplication *sbr) {
+    sbr->start = 0;
+    sbr->ready_for_dequant = 0;
+    // Init defaults used in pure upsampling mode
+    sbr->kx[1] = 32; //Typo in spec, kx' inits to 32
+    sbr->m[1] = 0;
+    // Reset values for first SBR header
+    sbr->data[0].e_a[1] = sbr->data[1].e_a[1] = -1;
+    memset(&sbr->spectrum_params, -1, sizeof(SpectrumParameters)); // every byte becomes 0xFF; presumably so the memcmp() in read_sbr_header() reports a change for the next header
+}
+
+av_cold void AAC_RENAME(ff_aac_sbr_ctx_init)(AACContext *ac, SpectralBandReplication *sbr)
+{ /* Per-context setup: resets SBR state via sbr_turnoff(), sets both channels' synthesis buffer offsets and initialises the MDCT, PS and DSP sub-contexts. */
+    if(sbr->mdct.mdct_bits) // NOTE(review): non-zero mdct_bits presumably means this context was already initialised -- confirm
+        return;
+    sbr->kx[0] = sbr->kx[1]; // copied before sbr_turnoff() overwrites kx[1] with 32
+    sbr_turnoff(sbr);
+    sbr->data[0].synthesis_filterbank_samples_offset = SBR_SYNTHESIS_BUF_SIZE - (1280 - 128);
+    sbr->data[1].synthesis_filterbank_samples_offset = SBR_SYNTHESIS_BUF_SIZE - (1280 - 128);
+    /* SBR requires samples to be scaled to +/-32768.0 to work correctly.
+     * mdct scale factors are adjusted to scale up from +/-1.0 at analysis
+     * and scale back down at synthesis. */
+    AAC_RENAME_32(ff_mdct_init)(&sbr->mdct,     7, 1, 1.0 / (64 * 32768.0)); // NOTE(review): the results of both ff_mdct_init calls are not checked here
+    AAC_RENAME_32(ff_mdct_init)(&sbr->mdct_ana, 7, 1, -2.0 * 32768.0);
+    AAC_RENAME(ff_ps_ctx_init)(&sbr->ps);
+    AAC_RENAME(ff_sbrdsp_init)(&sbr->dsp);
+    aacsbr_func_ptr_init(&sbr->c);
+}
+
+av_cold void AAC_RENAME(ff_aac_sbr_ctx_close)(SpectralBandReplication *sbr)
+{ /* Calls ff_mdct_end on the two MDCT contexts that ff_aac_sbr_ctx_init() set up (mdct and mdct_ana). */
+    AAC_RENAME_32(ff_mdct_end)(&sbr->mdct);
+    AAC_RENAME_32(ff_mdct_end)(&sbr->mdct_ana);
+}
+
+static int qsort_comparison_function_int16(const void *a, const void *b)
+{ /* Three-way comparison of two int16_t values (negative, zero or positive); passed as the comparator to AV_QSORT in this file. */
+    return *(const int16_t *)a - *(const int16_t *)b; // both operands are promoted to int before the subtraction
+}
+
+static inline int in_table_int16(const int16_t *table, int last_el, int16_t needle)
+{ /* Linear search: returns 1 if needle equals any of table[0..last_el], otherwise 0. */
+    int i;
+    for (i = 0; i <= last_el; i++) // last_el is the index of the last element (inclusive bound), not a count
+        if (table[i] == needle)
+            return 1;
+    return 0;
+}
+
+/// Limiter Frequency Band Table (14496-3 sp04 p198): rebuilds sbr->f_tablelim and sbr->n_lim from f_tablelow and the patch borders.
+static void sbr_make_f_tablelim(SpectralBandReplication *sbr)
+{
+    int k;
+    if (sbr->bs_limiter_bands > 0) { // otherwise (else branch below) a single limiter band spans the whole low-resolution table
+        static const INTFLOAT bands_warped[3] = { Q23(1.32715174233856803909f),   //2^(0.49/1.2)
+                                               Q23(1.18509277094158210129f),   //2^(0.49/2)
+                                               Q23(1.11987160404675912501f) }; //2^(0.49/3)
+        const INTFLOAT lim_bands_per_octave_warped = bands_warped[sbr->bs_limiter_bands - 1];
+        int16_t patch_borders[7]; // entries 0..num_patches are written below
+        uint16_t *in = sbr->f_tablelim + 1, *out = sbr->f_tablelim; // out: last border kept so far, in: next candidate
+
+        patch_borders[0] = sbr->kx[1]; // first border is kx[1]; each further border adds the width of one patch
+        for (k = 1; k <= sbr->num_patches; k++)
+            patch_borders[k] = patch_borders[k-1] + sbr->patch_num_subbands[k-1];
+
+        memcpy(sbr->f_tablelim, sbr->f_tablelow,
+               (sbr->n[0] + 1) * sizeof(sbr->f_tablelow[0]));
+        if (sbr->num_patches > 1) // append only the interior patch borders (indices 1..num_patches-1)
+            memcpy(sbr->f_tablelim + sbr->n[0] + 1, patch_borders + 1,
+                   (sbr->num_patches - 1) * sizeof(patch_borders[0]));
+
+        AV_QSORT(sbr->f_tablelim, sbr->num_patches + sbr->n[0],
+              uint16_t,
+              qsort_comparison_function_int16);
+
+        sbr->n_lim = sbr->n[0] + sbr->num_patches - 1; // upper bound on the band count; decremented for every border dropped below
+        while (out < sbr->f_tablelim + sbr->n_lim) {
+#if USE_FIXED
+            if ((*in << 23) >= *out * lim_bands_per_octave_warped) {
+#else
+            if (*in >= *out * lim_bands_per_octave_warped) {
+#endif /* USE_FIXED */
+                *++out = *in++; // candidate is at least the warp factor times the last kept border: keep it
+            } else if (*in == *out ||
+                !in_table_int16(patch_borders, sbr->num_patches, *in)) {
+                in++; // too close, and a duplicate or not a patch border: drop the candidate
+                sbr->n_lim--;
+            } else if (!in_table_int16(patch_borders, sbr->num_patches, *out)) {
+                *out = *in++; // candidate is a patch border but the kept one is not: replace the kept border
+                sbr->n_lim--;
+            } else {
+                *++out = *in++; // both are patch borders: keep both even though they are close
+            }
+        }
+    } else {
+        sbr->f_tablelim[0] = sbr->f_tablelow[0];
+        sbr->f_tablelim[1] = sbr->f_tablelow[sbr->n[0]];
+        sbr->n_lim = 1;
+    }
+}
+
+static unsigned int read_sbr_header(SpectralBandReplication *sbr, GetBitContext *gb)
+{ /* Parses an SBR header from gb into sbr and returns the number of bits consumed; sets sbr->reset when the spectrum parameters changed. */
+    unsigned int cnt = get_bits_count(gb); // bit position on entry, used for the return value
+    uint8_t bs_header_extra_1;
+    uint8_t bs_header_extra_2;
+    int old_bs_limiter_bands = sbr->bs_limiter_bands;
+    SpectrumParameters old_spectrum_params;
+
+    sbr->start = 1;
+    sbr->ready_for_dequant = 0;
+
+    // Save last spectrum parameters variables to compare to new ones
+    memcpy(&old_spectrum_params, &sbr->spectrum_params, sizeof(SpectrumParameters));
+
+    sbr->bs_amp_res_header              = get_bits1(gb);
+    sbr->spectrum_params.bs_start_freq  = get_bits(gb, 4);
+    sbr->spectrum_params.bs_stop_freq   = get_bits(gb, 4);
+    sbr->spectrum_params.bs_xover_band  = get_bits(gb, 3);
+                                          skip_bits(gb, 2); // bs_reserved
+
+    bs_header_extra_1 = get_bits1(gb); // the two flags select between explicit values and the defaults in the else branches below
+    bs_header_extra_2 = get_bits1(gb);
+
+    if (bs_header_extra_1) {
+        sbr->spectrum_params.bs_freq_scale  = get_bits(gb, 2);
+        sbr->spectrum_params.bs_alter_scale = get_bits1(gb);
+        sbr->spectrum_params.bs_noise_bands = get_bits(gb, 2);
+    } else {
+        sbr->spectrum_params.bs_freq_scale  = 2;
+        sbr->spectrum_params.bs_alter_scale = 1;
+        sbr->spectrum_params.bs_noise_bands = 2;
+    }
+
+    // Check if spectrum parameters changed (byte-wise comparison of the whole struct)
+    if (memcmp(&old_spectrum_params, &sbr->spectrum_params, sizeof(SpectrumParameters)))
+        sbr->reset = 1;
+
+    if (bs_header_extra_2) {
+        sbr->bs_limiter_bands  = get_bits(gb, 2);
+        sbr->bs_limiter_gains  = get_bits(gb, 2);
+        sbr->bs_interpol_freq  = get_bits1(gb);
+        sbr->bs_smoothing_mode = get_bits1(gb);
+    } else {
+        sbr->bs_limiter_bands  = 2;
+        sbr->bs_limiter_gains  = 2;
+        sbr->bs_interpol_freq  = 1;
+        sbr->bs_smoothing_mode = 1;
+    }
+
+    if (sbr->bs_limiter_bands != old_bs_limiter_bands && !sbr->reset) // with reset set the table is presumably rebuilt later by the caller -- confirm
+        sbr_make_f_tablelim(sbr);
+
+    return get_bits_count(gb) - cnt;
+}
+
+static int array_min_int16(const int16_t *array, int nel)
+{ /* Returns the smallest of array[0..nel-1]; array[0] is read unconditionally, so at least one element must be readable. */
+    int i, min = array[0];
+    for (i = 1; i < nel; i++)
+        min = FFMIN(array[i], min);
+    return min;
+}
+
+static int check_n_master(AVCodecContext *avctx, int n_master, int bs_xover_band)
+{ /* Validates n_master: returns 0 when 0 <= bs_xover_band < n_master and n_master > 0 hold for the checks below, else logs and returns -1. */
+    // Requirements (14496-3 sp04 p205)
+    if (n_master <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid n_master: %d\n", n_master);
+        return -1;
+    }
+    if (bs_xover_band >= n_master) { // bs_xover_band is used as an index into f_master by sbr_make_f_derived()
+        av_log(avctx, AV_LOG_ERROR,
+               "Invalid bitstream, crossover band index beyond array bounds: %d\n",
+               bs_xover_band);
+        return -1;
+    }
+    return 0;
+}
+
+/// Master Frequency Band Table (14496-3 sp04 p194): derives sbr->k[], sbr->n_master and sbr->f_master[] from *spectrum; returns 0 on success, -1 on invalid or unsupported input.
+static int sbr_make_f_master(AACContext *ac, SpectralBandReplication *sbr,
+                             SpectrumParameters *spectrum)
+{
+    unsigned int temp, max_qmf_subbands = 0;
+    unsigned int start_min, stop_min;
+    int k;
+    const int8_t *sbr_offset_ptr;
+    int16_t stop_dk[13];
+
+    if (sbr->sample_rate < 32000) { // temp feeds the start_min / stop_min computation below
+        temp = 3000;
+    } else if (sbr->sample_rate < 64000) {
+        temp = 4000;
+    } else
+        temp = 5000;
+
+    switch (sbr->sample_rate) { // pick the sbr_offset row for this rate; any other rate is rejected
+    case 16000:
+        sbr_offset_ptr = sbr_offset[0];
+        break;
+    case 22050:
+        sbr_offset_ptr = sbr_offset[1];
+        break;
+    case 24000:
+        sbr_offset_ptr = sbr_offset[2];
+        break;
+    case 32000:
+        sbr_offset_ptr = sbr_offset[3];
+        break;
+    case 44100: case 48000: case 64000:
+        sbr_offset_ptr = sbr_offset[4];
+        break;
+    case 88200: case 96000: case 128000: case 176400: case 192000:
+        sbr_offset_ptr = sbr_offset[5];
+        break;
+    default:
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "Unsupported sample rate for SBR: %d\n", sbr->sample_rate);
+        return -1;
+    }
+
+    start_min = ((temp << 7) + (sbr->sample_rate >> 1)) / sbr->sample_rate; // temp * 128 / sample_rate, rounded to nearest
+    stop_min  = ((temp << 8) + (sbr->sample_rate >> 1)) / sbr->sample_rate; // temp * 256 / sample_rate, rounded to nearest
+
+    sbr->k[0] = start_min + sbr_offset_ptr[spectrum->bs_start_freq];
+
+    if (spectrum->bs_stop_freq < 14) { // k[2] = stop_min plus the bs_stop_freq smallest of the 13 band widths from make_bands()
+        sbr->k[2] = stop_min;
+        make_bands(stop_dk, stop_min, 64, 13);
+        AV_QSORT(stop_dk, 13, int16_t, qsort_comparison_function_int16);
+        for (k = 0; k < spectrum->bs_stop_freq; k++)
+            sbr->k[2] += stop_dk[k];
+    } else if (spectrum->bs_stop_freq == 14) {
+        sbr->k[2] = 2*sbr->k[0];
+    } else if (spectrum->bs_stop_freq == 15) {
+        sbr->k[2] = 3*sbr->k[0];
+    } else {
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "Invalid bs_stop_freq: %d\n", spectrum->bs_stop_freq);
+        return -1;
+    }
+    sbr->k[2] = FFMIN(64, sbr->k[2]); // clamp the stop band to 64
+
+    // Requirements (14496-3 sp04 p205)
+    if (sbr->sample_rate <= 32000) {
+        max_qmf_subbands = 48;
+    } else if (sbr->sample_rate == 44100) {
+        max_qmf_subbands = 35;
+    } else if (sbr->sample_rate >= 48000)
+        max_qmf_subbands = 32;
+    else
+        av_assert0(0); // not reachable for the sample rates accepted by the switch above
+
+    if (sbr->k[2] - sbr->k[0] > max_qmf_subbands) {
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "Invalid bitstream, too many QMF subbands: %d\n", sbr->k[2] - sbr->k[0]);
+        return -1;
+    }
+
+    if (!spectrum->bs_freq_scale) { // bs_freq_scale == 0: bands of (nearly) constant width dk
+        int dk, k2diff;
+
+        dk = spectrum->bs_alter_scale + 1;
+        sbr->n_master = ((sbr->k[2] - sbr->k[0] + (dk&2)) >> dk) << 1;
+        if (check_n_master(ac->avctx, sbr->n_master, sbr->spectrum_params.bs_xover_band))
+            return -1;
+
+        for (k = 1; k <= sbr->n_master; k++) // f_master[1..n_master] temporarily hold band widths
+            sbr->f_master[k] = dk;
+
+        k2diff = sbr->k[2] - sbr->k[0] - sbr->n_master * dk; // remainder not covered by n_master bands of width dk
+        if (k2diff < 0) {
+            sbr->f_master[1]--;
+            sbr->f_master[2]-= (k2diff < -1);
+        } else if (k2diff) {
+            sbr->f_master[sbr->n_master]++;
+        }
+
+        sbr->f_master[0] = sbr->k[0];
+        for (k = 1; k <= sbr->n_master; k++) // prefix sum turns the widths into band borders
+            sbr->f_master[k] += sbr->f_master[k - 1];
+
+    } else {
+        int half_bands = 7 - spectrum->bs_freq_scale;      // bs_freq_scale  = {1,2,3}
+        int two_regions, num_bands_0;
+        int vdk0_max, vdk1_min;
+        int16_t vk0[49];
+#if USE_FIXED
+        int tmp, nz = 0;
+#endif /* USE_FIXED */
+
+        if (49 * sbr->k[2] > 110 * sbr->k[0]) { // integer form of k[2] / k[0] > 110 / 49: split into two regions at k[1] = 2 * k[0]
+            two_regions = 1;
+            sbr->k[1] = 2 * sbr->k[0];
+        } else {
+            two_regions = 0;
+            sbr->k[1] = sbr->k[2];
+        }
+
+#if USE_FIXED
+        tmp = (sbr->k[1] << 23) / sbr->k[0]; // fixed-point counterpart of the lrintf()/log2f() expression in the #else branch
+        while (tmp < 0x40000000) {
+          tmp <<= 1;
+          nz++;
+        }
+        tmp = fixed_log(tmp - 0x80000000);
+        tmp = (int)(((int64_t)tmp * CONST_RECIP_LN2 + 0x20000000) >> 30);
+        tmp = (((tmp + 0x80) >> 8) + ((8 - nz) << 23)) * half_bands;
+        num_bands_0 = ((tmp + 0x400000) >> 23) * 2;
+#else
+        num_bands_0 = lrintf(half_bands * log2f(sbr->k[1] / (float)sbr->k[0])) * 2;
+#endif /* USE_FIXED */
+
+        if (num_bands_0 <= 0) { // Requirements (14496-3 sp04 p205)
+            av_log(ac->avctx, AV_LOG_ERROR, "Invalid num_bands_0: %d\n", num_bands_0);
+            return -1;
+        }
+
+        vk0[0] = 0;
+
+        make_bands(vk0+1, sbr->k[0], sbr->k[1], num_bands_0); // make_bands() is defined elsewhere; its outputs are used as band widths below
+
+        AV_QSORT(vk0 + 1, num_bands_0, int16_t, qsort_comparison_function_int16);
+        vdk0_max = vk0[num_bands_0]; // largest width of region 0 (last element after the ascending sort)
+
+        vk0[0] = sbr->k[0];
+        for (k = 1; k <= num_bands_0; k++) { // validate each width, then accumulate into borders
+            if (vk0[k] <= 0) { // Requirements (14496-3 sp04 p205)
+                av_log(ac->avctx, AV_LOG_ERROR, "Invalid vDk0[%d]: %d\n", k, vk0[k]);
+                return -1;
+            }
+            vk0[k] += vk0[k-1];
+        }
+
+        if (two_regions) {
+            int16_t vk1[49];
+#if USE_FIXED
+            int num_bands_1;
+
+            tmp = (sbr->k[2] << 23) / sbr->k[1]; // same fixed-point log2 scheme as for num_bands_0, here for k[2] / k[1]
+            nz = 0;
+            while (tmp < 0x40000000) {
+              tmp <<= 1;
+              nz++;
+            }
+            tmp = fixed_log(tmp - 0x80000000);
+            tmp = (int)(((int64_t)tmp * CONST_RECIP_LN2 + 0x20000000) >> 30);
+            tmp = (((tmp + 0x80) >> 8) + ((8 - nz) << 23)) * half_bands;
+            if (spectrum->bs_alter_scale) // counterpart of the invwarp factor in the #else branch
+                tmp = (int)(((int64_t)tmp * CONST_076923 + 0x40000000) >> 31);
+            num_bands_1 = ((tmp + 0x400000) >> 23) * 2;
+#else
+            float invwarp = spectrum->bs_alter_scale ? 0.76923076923076923077f
+                                                     : 1.0f; // bs_alter_scale = {0,1}
+            int num_bands_1 = lrintf(half_bands * invwarp *
+                                     log2f(sbr->k[2] / (float)sbr->k[1])) * 2;
+#endif /* USE_FIXED */
+            make_bands(vk1+1, sbr->k[1], sbr->k[2], num_bands_1);
+
+            vdk1_min = array_min_int16(vk1 + 1, num_bands_1);
+
+            if (vdk1_min < vdk0_max) { // region 1 has a band narrower than the widest band of region 0: move width from its widest to its narrowest band
+                int change;
+                AV_QSORT(vk1 + 1, num_bands_1, int16_t, qsort_comparison_function_int16);
+                change = FFMIN(vdk0_max - vk1[1], (vk1[num_bands_1] - vk1[1]) >> 1);
+                vk1[1]           += change;
+                vk1[num_bands_1] -= change;
+            }
+
+            AV_QSORT(vk1 + 1, num_bands_1, int16_t, qsort_comparison_function_int16);
+
+            vk1[0] = sbr->k[1];
+            for (k = 1; k <= num_bands_1; k++) { // validate each width, then accumulate into borders
+                if (vk1[k] <= 0) { // Requirements (14496-3 sp04 p205)
+                    av_log(ac->avctx, AV_LOG_ERROR, "Invalid vDk1[%d]: %d\n", k, vk1[k]);
+                    return -1;
+                }
+                vk1[k] += vk1[k-1];
+            }
+
+            sbr->n_master = num_bands_0 + num_bands_1;
+            if (check_n_master(ac->avctx, sbr->n_master, sbr->spectrum_params.bs_xover_band))
+                return -1;
+            memcpy(&sbr->f_master[0],               vk0,
+                   (num_bands_0 + 1) * sizeof(sbr->f_master[0]));
+            memcpy(&sbr->f_master[num_bands_0 + 1], vk1 + 1,
+                    num_bands_1      * sizeof(sbr->f_master[0]));
+
+        } else {
+            sbr->n_master = num_bands_0;
+            if (check_n_master(ac->avctx, sbr->n_master, sbr->spectrum_params.bs_xover_band))
+                return -1;
+            memcpy(sbr->f_master, vk0, (num_bands_0 + 1) * sizeof(sbr->f_master[0]));
+        }
+    }
+
+    return 0;
+}
+
+/// High Frequency Generation - Patch Construction (14496-3 sp04 p216 fig. 4.46): fills num_patches, patch_num_subbands[] and patch_start_subband[]; returns 0 or a negative value on failure.
+static int sbr_hf_calc_npatches(AACContext *ac, SpectralBandReplication *sbr)
+{
+    int i, k, last_k = -1, last_msb = -1, sb = 0;
+    int msb = sbr->k[0];
+    int usb = sbr->kx[1];
+    int goal_sb = ((1000 << 11) + (sbr->sample_rate >> 1)) / sbr->sample_rate; // 2048000 / sample_rate, rounded to nearest
+
+    sbr->num_patches = 0;
+
+    if (goal_sb < sbr->kx[1] + sbr->m[1]) { // start k at the first f_master entry that is >= goal_sb
+        for (k = 0; sbr->f_master[k] < goal_sb; k++) ;
+    } else
+        k = sbr->n_master;
+
+    do {
+        int odd = 0;
+        if (k == last_k && msb == last_msb) { // neither k nor msb changed since the previous pass: bail out instead of looping forever
+            av_log(ac->avctx, AV_LOG_ERROR, "patch construction failed\n");
+            return AVERROR_INVALIDDATA;
+        }
+        last_k = k;
+        last_msb = msb;
+        for (i = k; i == k || sb > (sbr->k[0] - 1 + msb - odd); i--) { // runs at least once (i == k), then walks down f_master until sb is small enough
+            sb = sbr->f_master[i];
+            odd = (sb + sbr->k[0]) & 1;
+        }
+
+        // Requirements (14496-3 sp04 p205) sets the maximum number of patches to 5.
+        // After this check the final number of patches can still be six which is
+        // illegal however the Coding Technologies decoder check stream has a final
+        // count of 6 patches
+        if (sbr->num_patches > 5) {
+            av_log(ac->avctx, AV_LOG_ERROR, "Too many patches: %d\n", sbr->num_patches);
+            return -1;
+        }
+
+        sbr->patch_num_subbands[sbr->num_patches]  = FFMAX(sb - usb, 0);
+        sbr->patch_start_subband[sbr->num_patches] = sbr->k[0] - odd - sbr->patch_num_subbands[sbr->num_patches];
+
+        if (sbr->patch_num_subbands[sbr->num_patches] > 0) { // only a non-empty patch is counted
+            usb = sb;
+            msb = sb;
+            sbr->num_patches++;
+        } else
+            msb = sbr->kx[1];
+
+        if (sbr->f_master[k] - sb < 3)
+            k = sbr->n_master;
+    } while (sb != sbr->kx[1] + sbr->m[1]); // stop once the patches reach kx[1] + m[1]
+
+    if (sbr->num_patches > 1 &&
+        sbr->patch_num_subbands[sbr->num_patches - 1] < 3) // drop a final patch narrower than 3 subbands unless it is the only one
+        sbr->num_patches--;
+
+    return 0;
+}
+
+/// Derived Frequency Band Tables (14496-3 sp04 p197): builds f_tablehigh, f_tablelow, f_tablenoise, the patches and f_tablelim from f_master; returns 0 or -1.
+static int sbr_make_f_derived(AACContext *ac, SpectralBandReplication *sbr)
+{
+    int k, temp;
+#if USE_FIXED
+    int nz = 0;
+#endif /* USE_FIXED */
+
+    sbr->n[1] = sbr->n_master - sbr->spectrum_params.bs_xover_band; // number of high-resolution bands
+    sbr->n[0] = (sbr->n[1] + 1) >> 1; // number of low-resolution bands: half of n[1], rounded up
+
+    memcpy(sbr->f_tablehigh, &sbr->f_master[sbr->spectrum_params.bs_xover_band],
+           (sbr->n[1] + 1) * sizeof(sbr->f_master[0]));
+    sbr->m[1] = sbr->f_tablehigh[sbr->n[1]] - sbr->f_tablehigh[0]; // width covered by f_tablehigh
+    sbr->kx[1] = sbr->f_tablehigh[0]; // first border of f_tablehigh
+
+    // Requirements (14496-3 sp04 p205)
+    if (sbr->kx[1] + sbr->m[1] > 64) {
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "Stop frequency border too high: %d\n", sbr->kx[1] + sbr->m[1]);
+        return -1;
+    }
+    if (sbr->kx[1] > 32) {
+        av_log(ac->avctx, AV_LOG_ERROR, "Start frequency border too high: %d\n", sbr->kx[1]);
+        return -1;
+    }
+
+    sbr->f_tablelow[0] = sbr->f_tablehigh[0];
+    temp = sbr->n[1] & 1; // 1 when n[1] is odd; shifts which f_tablehigh borders are picked
+    for (k = 1; k <= sbr->n[0]; k++)
+        sbr->f_tablelow[k] = sbr->f_tablehigh[2 * k - temp];
+#if USE_FIXED
+    temp = (sbr->k[2] << 23) / sbr->kx[1]; // fixed-point counterpart of the lrintf()/log2f() expression in the #else branch
+    while (temp < 0x40000000) {
+        temp <<= 1;
+        nz++;
+    }
+    temp = fixed_log(temp - 0x80000000);
+    temp = (int)(((int64_t)temp * CONST_RECIP_LN2 + 0x20000000) >> 30);
+    temp = (((temp + 0x80) >> 8) + ((8 - nz) << 23)) * sbr->spectrum_params.bs_noise_bands;
+
+    sbr->n_q = (temp + 0x400000) >> 23;
+    if (sbr->n_q < 1)
+        sbr->n_q = 1;
+#else
+    sbr->n_q = FFMAX(1, lrintf(sbr->spectrum_params.bs_noise_bands *
+                               log2f(sbr->k[2] / (float)sbr->kx[1]))); // 0 <= bs_noise_bands <= 3
+#endif /* USE_FIXED */
+
+    if (sbr->n_q > 5) { // read_sbr_invf() copies exactly 5 bs_invf_mode entries, so n_q must not exceed 5
+        av_log(ac->avctx, AV_LOG_ERROR, "Too many noise floor scale factors: %d\n", sbr->n_q);
+        return -1;
+    }
+
+    sbr->f_tablenoise[0] = sbr->f_tablelow[0];
+    temp = 0; // reused as the running index into f_tablelow
+    for (k = 1; k <= sbr->n_q; k++) { // spread the n_q noise borders over the n[0] low-resolution bands
+        temp += (sbr->n[0] - temp) / (sbr->n_q + 1 - k);
+        sbr->f_tablenoise[k] = sbr->f_tablelow[temp];
+    }
+
+    if (sbr_hf_calc_npatches(ac, sbr) < 0)
+        return -1;
+
+    sbr_make_f_tablelim(sbr); // needs the patch data computed just above
+
+    sbr->data[0].f_indexnoise = 0;
+    sbr->data[1].f_indexnoise = 0;
+
+    return 0;
+}
+
+static av_always_inline void get_bits1_vector(GetBitContext *gb, uint8_t *vec,
+                                              int elements)
+{ /* Reads `elements` single bits from gb and stores one bit per byte in vec[0..elements-1]. */
+    int i;
+    for (i = 0; i < elements; i++) {
+        vec[i] = get_bits1(gb);
+    }
+}
+
+/** ceil(log2(index+1)) for index 0..5; read_sbr_grid() indexes it with bs_num_env to get the bit width of bs_pointer. */
+static const int8_t ceil_log2[] = {
+    0, 1, 2, 2, 3, 3,
+};
+
+static int read_sbr_grid(AACContext *ac, SpectralBandReplication *sbr,
+                         GetBitContext *gb, SBRData *ch_data)
+{ /* Parses one channel's SBR time/frequency grid from gb into ch_data; returns 0 on success, -1 on an invalid bitstream. */
+    int i;
+    int bs_pointer = 0;
+    // frameLengthFlag ? 15 : 16; 960 sample length frames unsupported; this value is numTimeSlots
+    int abs_bord_trail = 16;
+    int num_rel_lead, num_rel_trail;
+    unsigned bs_num_env_old = ch_data->bs_num_env; // envelope count of the previous frame; restored if the new count is rejected
+
+    ch_data->bs_freq_res[0] = ch_data->bs_freq_res[ch_data->bs_num_env]; // the stored count is used as an array index here and two lines below
+    ch_data->bs_amp_res = sbr->bs_amp_res_header;
+    ch_data->t_env_num_env_old = ch_data->t_env[bs_num_env_old];
+
+    switch (ch_data->bs_frame_class = get_bits(gb, 2)) {
+    case FIXFIX:
+        ch_data->bs_num_env                 = 1 << get_bits(gb, 2); // 1, 2, 4 or 8; 8 is rejected below
+        num_rel_lead                        = ch_data->bs_num_env - 1;
+        if (ch_data->bs_num_env == 1)
+            ch_data->bs_amp_res = 0;
+
+        if (ch_data->bs_num_env > 4) {
+            av_log(ac->avctx, AV_LOG_ERROR,
+                   "Invalid bitstream, too many SBR envelopes in FIXFIX type SBR frame: %d\n", ch_data->bs_num_env);
+            ch_data->bs_num_env = bs_num_env_old; // do not leave the rejected count in the persistent state: the next call indexes arrays with it
+            return -1;
+        }
+
+        ch_data->t_env[0]                   = 0;
+        ch_data->t_env[ch_data->bs_num_env] = abs_bord_trail;
+
+        abs_bord_trail = (abs_bord_trail + (ch_data->bs_num_env >> 1)) /
+                   ch_data->bs_num_env; // reused as the rounded, uniform envelope length
+        for (i = 0; i < num_rel_lead; i++)
+            ch_data->t_env[i + 1] = ch_data->t_env[i] + abs_bord_trail;
+
+        ch_data->bs_freq_res[1] = get_bits1(gb); // one flag, replicated to every envelope
+        for (i = 1; i < ch_data->bs_num_env; i++)
+            ch_data->bs_freq_res[i + 1] = ch_data->bs_freq_res[1];
+        break;
+    case FIXVAR:
+        abs_bord_trail                     += get_bits(gb, 2);
+        num_rel_trail                       = get_bits(gb, 2);
+        ch_data->bs_num_env                 = num_rel_trail + 1; // at most 4, so no range check is needed
+        ch_data->t_env[0]                   = 0;
+        ch_data->t_env[ch_data->bs_num_env] = abs_bord_trail;
+
+        for (i = 0; i < num_rel_trail; i++) // borders are built backwards from the trailing border
+            ch_data->t_env[ch_data->bs_num_env - 1 - i] =
+                ch_data->t_env[ch_data->bs_num_env - i] - 2 * get_bits(gb, 2) - 2;
+
+        bs_pointer = get_bits(gb, ceil_log2[ch_data->bs_num_env]);
+
+        for (i = 0; i < ch_data->bs_num_env; i++) // flags are stored in reverse order for this class
+            ch_data->bs_freq_res[ch_data->bs_num_env - i] = get_bits1(gb);
+        break;
+    case VARFIX:
+        ch_data->t_env[0]                   = get_bits(gb, 2);
+        num_rel_lead                        = get_bits(gb, 2);
+        ch_data->bs_num_env                 = num_rel_lead + 1; // at most 4, so no range check is needed
+        ch_data->t_env[ch_data->bs_num_env] = abs_bord_trail;
+
+        for (i = 0; i < num_rel_lead; i++) // borders are built forwards from the leading border
+            ch_data->t_env[i + 1] = ch_data->t_env[i] + 2 * get_bits(gb, 2) + 2;
+
+        bs_pointer = get_bits(gb, ceil_log2[ch_data->bs_num_env]);
+
+        get_bits1_vector(gb, ch_data->bs_freq_res + 1, ch_data->bs_num_env);
+        break;
+    case VARVAR:
+        ch_data->t_env[0]                   = get_bits(gb, 2);
+        abs_bord_trail                     += get_bits(gb, 2);
+        num_rel_lead                        = get_bits(gb, 2);
+        num_rel_trail                       = get_bits(gb, 2);
+        ch_data->bs_num_env                 = num_rel_lead + num_rel_trail + 1; // up to 7; more than 5 is rejected below
+
+        if (ch_data->bs_num_env > 5) {
+            av_log(ac->avctx, AV_LOG_ERROR,
+                   "Invalid bitstream, too many SBR envelopes in VARVAR type SBR frame: %d\n", ch_data->bs_num_env);
+            ch_data->bs_num_env = bs_num_env_old; // do not leave the rejected count in the persistent state: the next call indexes arrays with it
+            return -1;
+        }
+
+        ch_data->t_env[ch_data->bs_num_env] = abs_bord_trail;
+
+        for (i = 0; i < num_rel_lead; i++)
+            ch_data->t_env[i + 1] = ch_data->t_env[i] + 2 * get_bits(gb, 2) + 2;
+        for (i = 0; i < num_rel_trail; i++)
+            ch_data->t_env[ch_data->bs_num_env - 1 - i] =
+                ch_data->t_env[ch_data->bs_num_env - i] - 2 * get_bits(gb, 2) - 2;
+
+        bs_pointer = get_bits(gb, ceil_log2[ch_data->bs_num_env]);
+
+        get_bits1_vector(gb, ch_data->bs_freq_res + 1, ch_data->bs_num_env);
+        break;
+    }
+
+    av_assert0(bs_pointer >= 0); // from here on bs_num_env is at most 5, so the error returns below need no restore
+    if (bs_pointer > ch_data->bs_num_env + 1) {
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "Invalid bitstream, bs_pointer points to a middle noise border outside the time borders table: %d\n",
+               bs_pointer);
+        return -1;
+    }
+
+    for (i = 1; i <= ch_data->bs_num_env; i++) {
+        if (ch_data->t_env[i-1] >= ch_data->t_env[i]) {
+            av_log(ac->avctx, AV_LOG_ERROR, "Not strictly monotone time borders\n");
+            return -1;
+        }
+    }
+
+    ch_data->bs_num_noise = (ch_data->bs_num_env > 1) + 1; // 1 noise floor for a single envelope, otherwise 2
+
+    ch_data->t_q[0]                     = ch_data->t_env[0];
+    ch_data->t_q[ch_data->bs_num_noise] = ch_data->t_env[ch_data->bs_num_env];
+    if (ch_data->bs_num_noise > 1) { // choose which envelope border becomes the middle noise border
+        int idx;
+        if (ch_data->bs_frame_class == FIXFIX) {
+            idx = ch_data->bs_num_env >> 1;
+        } else if (ch_data->bs_frame_class & 1) { // FIXVAR or VARVAR
+            idx = ch_data->bs_num_env - FFMAX(bs_pointer - 1, 1);
+        } else { // VARFIX
+            if (!bs_pointer)
+                idx = 1;
+            else if (bs_pointer == 1)
+                idx = ch_data->bs_num_env - 1;
+            else // bs_pointer > 1
+                idx = bs_pointer - 1;
+        }
+        ch_data->t_q[1] = ch_data->t_env[idx];
+    }
+
+    ch_data->e_a[0] = -(ch_data->e_a[1] != bs_num_env_old); // l_APrev
+    ch_data->e_a[1] = -1;
+    if ((ch_data->bs_frame_class & 1) && bs_pointer) { // FIXVAR or VARVAR and bs_pointer != 0
+        ch_data->e_a[1] = ch_data->bs_num_env + 1 - bs_pointer;
+    } else if ((ch_data->bs_frame_class == 2) && (bs_pointer > 1)) // VARFIX and bs_pointer > 1
+        ch_data->e_a[1] = bs_pointer - 1;
+
+    return 0;
+}
+
+static void copy_sbr_grid(SBRData *dst, const SBRData *src) { // gives dst the same grid as src while carrying dst's own previous-frame values over
+    //These variables are saved from the previous frame rather than copied (they are read from dst before dst->bs_num_env is overwritten below)
+    dst->bs_freq_res[0]    = dst->bs_freq_res[dst->bs_num_env];
+    dst->t_env_num_env_old = dst->t_env[dst->bs_num_env];
+    dst->e_a[0]            = -(dst->e_a[1] != dst->bs_num_env);
+
+    //These variables are read from the bitstream and therefore copied (bs_freq_res[0] is skipped because it was just set above)
+    memcpy(dst->bs_freq_res+1, src->bs_freq_res+1, sizeof(dst->bs_freq_res)-sizeof(*dst->bs_freq_res));
+    memcpy(dst->t_env,         src->t_env,         sizeof(dst->t_env));
+    memcpy(dst->t_q,           src->t_q,           sizeof(dst->t_q));
+    dst->bs_num_env        = src->bs_num_env;
+    dst->bs_amp_res        = src->bs_amp_res;
+    dst->bs_num_noise      = src->bs_num_noise;
+    dst->bs_frame_class    = src->bs_frame_class;
+    dst->e_a[1]            = src->e_a[1];
+}
+
+/// Read how the envelope and noise floor data is delta coded: one flag bit per envelope, then one per noise floor (sbr is not used here)
+static void read_sbr_dtdf(SpectralBandReplication *sbr, GetBitContext *gb,
+                          SBRData *ch_data)
+{
+    get_bits1_vector(gb, ch_data->bs_df_env,   ch_data->bs_num_env);
+    get_bits1_vector(gb, ch_data->bs_df_noise, ch_data->bs_num_noise);
+}
+
+/// Read inverse filtering data: saves the current modes as row 1, then reads sbr->n_q new 2-bit modes into row 0
+static void read_sbr_invf(SpectralBandReplication *sbr, GetBitContext *gb,
+                          SBRData *ch_data)
+{
+    int i;
+
+    memcpy(ch_data->bs_invf_mode[1], ch_data->bs_invf_mode[0], 5 * sizeof(uint8_t)); // always copies 5 entries, regardless of n_q
+    for (i = 0; i < sbr->n_q; i++)
+        ch_data->bs_invf_mode[0][i] = get_bits(gb, 2);
+}
+
+static int read_sbr_envelope(AACContext *ac, SpectralBandReplication *sbr, GetBitContext *gb,
+                              SBRData *ch_data, int ch)
+{ /* Decodes the envelope scale factors of all envelopes of channel ch into ch_data->env_facs_q[1..bs_num_env]; returns 0 or AVERROR_INVALIDDATA. */
+    int bits;
+    int i, j, k;
+    VLC_TYPE (*t_huff)[2], (*f_huff)[2];
+    int t_lav, f_lav;
+    const int delta = (ch == 1 && sbr->bs_coupling == 1) + 1; // 2 for the second channel of a coupled pair, otherwise 1
+    const int odd = sbr->n[1] & 1;
+
+    if (sbr->bs_coupling && ch) { // pick the time/frequency Huffman tables, their offsets (lav) and the start-value width in bits
+        if (ch_data->bs_amp_res) {
+            bits   = 5;
+            t_huff = vlc_sbr[T_HUFFMAN_ENV_BAL_3_0DB].table;
+            t_lav  = vlc_sbr_lav[T_HUFFMAN_ENV_BAL_3_0DB];
+            f_huff = vlc_sbr[F_HUFFMAN_ENV_BAL_3_0DB].table;
+            f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_BAL_3_0DB];
+        } else {
+            bits   = 6;
+            t_huff = vlc_sbr[T_HUFFMAN_ENV_BAL_1_5DB].table;
+            t_lav  = vlc_sbr_lav[T_HUFFMAN_ENV_BAL_1_5DB];
+            f_huff = vlc_sbr[F_HUFFMAN_ENV_BAL_1_5DB].table;
+            f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_BAL_1_5DB];
+        }
+    } else {
+        if (ch_data->bs_amp_res) {
+            bits   = 6;
+            t_huff = vlc_sbr[T_HUFFMAN_ENV_3_0DB].table;
+            t_lav  = vlc_sbr_lav[T_HUFFMAN_ENV_3_0DB];
+            f_huff = vlc_sbr[F_HUFFMAN_ENV_3_0DB].table;
+            f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_3_0DB];
+        } else {
+            bits   = 7;
+            t_huff = vlc_sbr[T_HUFFMAN_ENV_1_5DB].table;
+            t_lav  = vlc_sbr_lav[T_HUFFMAN_ENV_1_5DB];
+            f_huff = vlc_sbr[F_HUFFMAN_ENV_1_5DB].table;
+            f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_1_5DB];
+        }
+    }
+
+    for (i = 0; i < ch_data->bs_num_env; i++) {
+        if (ch_data->bs_df_env[i]) { // flag set: each value is coded relative to the previous envelope (row i)
+            // bs_freq_res[0] == bs_freq_res[bs_num_env] from prev frame
+            if (ch_data->bs_freq_res[i + 1] == ch_data->bs_freq_res[i]) { // same resolution: bands map one to one
+                for (j = 0; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++) {
+                    ch_data->env_facs_q[i + 1][j] = ch_data->env_facs_q[i][j] + delta * (get_vlc2(gb, t_huff, 9, 3) - t_lav);
+                    if (ch_data->env_facs_q[i + 1][j] > 127U) { // unsigned comparison against 127; element type is declared elsewhere
+                        av_log(ac->avctx, AV_LOG_ERROR, "env_facs_q %d is invalid\n", ch_data->env_facs_q[i + 1][j]);
+                        return AVERROR_INVALIDDATA;
+                    }
+                }
+            } else if (ch_data->bs_freq_res[i + 1]) { // previous envelope was low resolution, this one is high resolution
+                for (j = 0; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++) {
+                    k = (j + odd) >> 1; // find k such that f_tablelow[k] <= f_tablehigh[j] < f_tablelow[k + 1]
+                    ch_data->env_facs_q[i + 1][j] = ch_data->env_facs_q[i][k] + delta * (get_vlc2(gb, t_huff, 9, 3) - t_lav);
+                    if (ch_data->env_facs_q[i + 1][j] > 127U) {
+                        av_log(ac->avctx, AV_LOG_ERROR, "env_facs_q %d is invalid\n", ch_data->env_facs_q[i + 1][j]);
+                        return AVERROR_INVALIDDATA;
+                    }
+                }
+            } else { // previous envelope was high resolution, this one is low resolution
+                for (j = 0; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++) {
+                    k = j ? 2*j - odd : 0; // find k such that f_tablehigh[k] == f_tablelow[j]
+                    ch_data->env_facs_q[i + 1][j] = ch_data->env_facs_q[i][k] + delta * (get_vlc2(gb, t_huff, 9, 3) - t_lav);
+                    if (ch_data->env_facs_q[i + 1][j] > 127U) {
+                        av_log(ac->avctx, AV_LOG_ERROR, "env_facs_q %d is invalid\n", ch_data->env_facs_q[i + 1][j]);
+                        return AVERROR_INVALIDDATA;
+                    }
+                }
+            }
+        } else { // flag clear: raw start value, then each band is coded relative to the band below it
+            ch_data->env_facs_q[i + 1][0] = delta * get_bits(gb, bits); // bs_env_start_value_balance
+            for (j = 1; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++) {
+                ch_data->env_facs_q[i + 1][j] = ch_data->env_facs_q[i + 1][j - 1] + delta * (get_vlc2(gb, f_huff, 9, 3) - f_lav);
+                if (ch_data->env_facs_q[i + 1][j] > 127U) {
+                    av_log(ac->avctx, AV_LOG_ERROR, "env_facs_q %d is invalid\n", ch_data->env_facs_q[i + 1][j]);
+                    return AVERROR_INVALIDDATA;
+                }
+            }
+        }
+    }
+
+    //assign 0th elements of env_facs_q from last elements, so the next frame can code relative to this frame's last envelope
+    memcpy(ch_data->env_facs_q[0], ch_data->env_facs_q[ch_data->bs_num_env],
+           sizeof(ch_data->env_facs_q[0]));
+
+    return 0;
+}
+
+static int read_sbr_noise(AACContext *ac, SpectralBandReplication *sbr, GetBitContext *gb,
+                           SBRData *ch_data, int ch)
+{   // Parses the noise floor factors of one channel into ch_data->noise_facs_q; returns 0, or AVERROR_INVALIDDATA when a value exceeds 30.
+    int i, j;
+    VLC_TYPE (*t_huff)[2], (*f_huff)[2]; // VLC tables for the bs_df_noise-set path (t_) and the bs_df_noise-clear path (f_)
+    int t_lav, f_lav;                    // offsets subtracted from the decoded VLC index of each table
+    int delta = (ch == 1 && sbr->bs_coupling == 1) + 1; // scale factor: 2 for the second channel of a coupled pair, 1 otherwise
+
+    if (sbr->bs_coupling && ch) { // second channel of a coupled pair: balance (BAL) tables
+        t_huff = vlc_sbr[T_HUFFMAN_NOISE_BAL_3_0DB].table;
+        t_lav  = vlc_sbr_lav[T_HUFFMAN_NOISE_BAL_3_0DB];
+        f_huff = vlc_sbr[F_HUFFMAN_ENV_BAL_3_0DB].table; // note: the f_ table is an ENV table, not a NOISE one
+        f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_BAL_3_0DB];
+    } else { // every other case: non-balance tables
+        t_huff = vlc_sbr[T_HUFFMAN_NOISE_3_0DB].table;
+        t_lav  = vlc_sbr_lav[T_HUFFMAN_NOISE_3_0DB];
+        f_huff = vlc_sbr[F_HUFFMAN_ENV_3_0DB].table; // note: the f_ table is an ENV table, not a NOISE one
+        f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_3_0DB];
+    }
+
+    for (i = 0; i < ch_data->bs_num_noise; i++) { // noise floor i is stored in row i + 1
+        if (ch_data->bs_df_noise[i]) { // each band is coded as a delta against the same band of the previous row
+            for (j = 0; j < sbr->n_q; j++) {
+                ch_data->noise_facs_q[i + 1][j] = ch_data->noise_facs_q[i][j] + delta * (get_vlc2(gb, t_huff, 9, 2) - t_lav);
+                if (ch_data->noise_facs_q[i + 1][j] > 30U) { // the stored value must not exceed 30
+                    av_log(ac->avctx, AV_LOG_ERROR, "noise_facs_q %d is invalid\n", ch_data->noise_facs_q[i + 1][j]);
+                    return AVERROR_INVALIDDATA;
+                }
+            }
+        } else { // band 0 is a raw 5-bit value, every later band is a delta against the band before it
+            ch_data->noise_facs_q[i + 1][0] = delta * get_bits(gb, 5); // bs_noise_start_value_balance or bs_noise_start_value_level
+            for (j = 1; j < sbr->n_q; j++) {
+                ch_data->noise_facs_q[i + 1][j] = ch_data->noise_facs_q[i + 1][j - 1] + delta * (get_vlc2(gb, f_huff, 9, 3) - f_lav);
+                if (ch_data->noise_facs_q[i + 1][j] > 30U) { // the stored value must not exceed 30
+                    av_log(ac->avctx, AV_LOG_ERROR, "noise_facs_q %d is invalid\n", ch_data->noise_facs_q[i + 1][j]);
+                    return AVERROR_INVALIDDATA;
+                }
+            }
+        }
+    }
+
+    // copy the last noise floor into row 0; row 0 is what the i == 0 iteration above uses as predecessor
+    memcpy(ch_data->noise_facs_q[0], ch_data->noise_facs_q[ch_data->bs_num_noise],
+           sizeof(ch_data->noise_facs_q[0]));
+    return 0;
+}
+
+static void read_sbr_extension(AACContext *ac, SpectralBandReplication *sbr,
+                               GetBitContext *gb,
+                               int bs_extension_id, int *num_bits_left)
+{   // Handles one SBR extension payload selected by bs_extension_id and updates *num_bits_left (0 when the rest is skipped).
+    switch (bs_extension_id) {
+    case EXTENSION_ID_PS:
+        if (!ac->oc[1].m4ac.ps) { // PS payload found although m4ac.ps is unset: log it and discard the remaining bits
+            av_log(ac->avctx, AV_LOG_ERROR, "Parametric Stereo signaled to be not-present but was found in the bitstream.\n");
+            skip_bits_long(gb, *num_bits_left); // bs_fill_bits
+            *num_bits_left = 0;
+        } else {
+#if 1
+            *num_bits_left -= AAC_RENAME(ff_ps_read_data)(ac->avctx, gb, &sbr->ps, *num_bits_left); // subtract the returned value (presumably bits consumed -- confirm against ff_ps_read_data)
+            ac->avctx->profile = FF_PROFILE_AAC_HE_V2; // a PS payload was read: report the HE-AAC v2 profile
+#else
+            avpriv_report_missing_feature(ac->avctx, "Parametric Stereo");
+            skip_bits_long(gb, *num_bits_left); // bs_fill_bits
+            *num_bits_left = 0;
+#endif
+        }
+        break;
+    default:
+        // some files contain 0-padding: stay silent only for id 0 with at most 16 remaining bits that are all zero
+        if (bs_extension_id || *num_bits_left > 16 || show_bits(gb, *num_bits_left))
+            avpriv_request_sample(ac->avctx, "Reserved SBR extensions");
+        skip_bits_long(gb, *num_bits_left); // bs_fill_bits
+        *num_bits_left = 0;
+        break;
+    }
+}
+
+static int read_sbr_single_channel_element(AACContext *ac,
+                                            SpectralBandReplication *sbr,
+                                            GetBitContext *gb)
+{   // Reads the SBR payload of a single channel into sbr->data[0]; returns 0 on success, a negative value on error.
+    int ret;
+
+    if (get_bits1(gb)) // bs_data_extra
+        skip_bits(gb, 4); // bs_reserved
+
+    if (read_sbr_grid(ac, sbr, gb, &sbr->data[0])) // any non-zero result is reported as -1
+        return -1;
+    read_sbr_dtdf(sbr, gb, &sbr->data[0]);
+    read_sbr_invf(sbr, gb, &sbr->data[0]);
+    if((ret = read_sbr_envelope(ac, sbr, gb, &sbr->data[0], 0)) < 0) // envelope/noise errors are passed through unchanged
+        return ret;
+    if((ret = read_sbr_noise(ac, sbr, gb, &sbr->data[0], 0)) < 0)
+        return ret;
+
+    if ((sbr->data[0].bs_add_harmonic_flag = get_bits1(gb))) // flag set: one bit per entry follows, sbr->n[1] entries
+        get_bits1_vector(gb, sbr->data[0].bs_add_harmonic, sbr->n[1]);
+
+    return 0;
+}
+
+static int read_sbr_channel_pair_element(AACContext *ac,
+                                          SpectralBandReplication *sbr,
+                                          GetBitContext *gb)
+{   // Reads the SBR payload of a channel pair into sbr->data[0] and sbr->data[1]; returns 0 on success, a negative value on error.
+    int ret;
+
+    if (get_bits1(gb))    // bs_data_extra
+        skip_bits(gb, 8); // bs_reserved
+
+    if ((sbr->bs_coupling = get_bits1(gb))) { // coupled: grid and inverse-filtering modes are read once and shared
+        if (read_sbr_grid(ac, sbr, gb, &sbr->data[0]))
+            return -1;
+        copy_sbr_grid(&sbr->data[1], &sbr->data[0]); // channel 1 reuses the grid of channel 0
+        read_sbr_dtdf(sbr, gb, &sbr->data[0]);
+        read_sbr_dtdf(sbr, gb, &sbr->data[1]);
+        read_sbr_invf(sbr, gb, &sbr->data[0]);
+        memcpy(sbr->data[1].bs_invf_mode[1], sbr->data[1].bs_invf_mode[0], sizeof(sbr->data[1].bs_invf_mode[0])); // channel 1: move its current modes (row 0) to row 1 ...
+        memcpy(sbr->data[1].bs_invf_mode[0], sbr->data[0].bs_invf_mode[0], sizeof(sbr->data[1].bs_invf_mode[0])); // ... then take row 0 from channel 0
+        if((ret = read_sbr_envelope(ac, sbr, gb, &sbr->data[0], 0)) < 0) // coupled read order: env 0, noise 0, env 1, noise 1
+            return ret;
+        if((ret = read_sbr_noise(ac, sbr, gb, &sbr->data[0], 0)) < 0)
+            return ret;
+        if((ret = read_sbr_envelope(ac, sbr, gb, &sbr->data[1], 1)) < 0)
+            return ret;
+        if((ret = read_sbr_noise(ac, sbr, gb, &sbr->data[1], 1)) < 0)
+            return ret;
+    } else { // not coupled: each channel has its own grid and inverse-filtering modes
+        if (read_sbr_grid(ac, sbr, gb, &sbr->data[0]) ||
+            read_sbr_grid(ac, sbr, gb, &sbr->data[1]))
+            return -1;
+        read_sbr_dtdf(sbr, gb, &sbr->data[0]);
+        read_sbr_dtdf(sbr, gb, &sbr->data[1]);
+        read_sbr_invf(sbr, gb, &sbr->data[0]);
+        read_sbr_invf(sbr, gb, &sbr->data[1]);
+        if((ret = read_sbr_envelope(ac, sbr, gb, &sbr->data[0], 0)) < 0) // uncoupled read order: env 0, env 1, noise 0, noise 1
+            return ret;
+        if((ret = read_sbr_envelope(ac, sbr, gb, &sbr->data[1], 1)) < 0)
+            return ret;
+        if((ret = read_sbr_noise(ac, sbr, gb, &sbr->data[0], 0)) < 0)
+            return ret;
+        if((ret = read_sbr_noise(ac, sbr, gb, &sbr->data[1], 1)) < 0)
+            return ret;
+    }
+
+    if ((sbr->data[0].bs_add_harmonic_flag = get_bits1(gb))) // per channel: flag, then sbr->n[1] one-bit entries if set
+        get_bits1_vector(gb, sbr->data[0].bs_add_harmonic, sbr->n[1]);
+    if ((sbr->data[1].bs_add_harmonic_flag = get_bits1(gb)))
+        get_bits1_vector(gb, sbr->data[1].bs_add_harmonic, sbr->n[1]);
+
+    return 0;
+}
+
+static unsigned int read_sbr_data(AACContext *ac, SpectralBandReplication *sbr,
+                                  GetBitContext *gb, int id_aac)
+{   // Reads the SBR data for element type id_aac plus optional extended data; returns the number of bits consumed from gb.
+    unsigned int cnt = get_bits_count(gb); // bit position on entry; every return below reports the distance from it
+
+    sbr->id_aac = id_aac;
+    sbr->ready_for_dequant = 1; // set unconditionally here; ff_sbr_apply checks and clears it around sbr_dequant
+
+    if (id_aac == TYPE_SCE || id_aac == TYPE_CCE) {
+        if (read_sbr_single_channel_element(ac, sbr, gb)) { // on failure: turn SBR off, report the bits read so far
+            sbr_turnoff(sbr);
+            return get_bits_count(gb) - cnt;
+        }
+    } else if (id_aac == TYPE_CPE) {
+        if (read_sbr_channel_pair_element(ac, sbr, gb)) { // on failure: turn SBR off, report the bits read so far
+            sbr_turnoff(sbr);
+            return get_bits_count(gb) - cnt;
+        }
+    } else { // any other element type is rejected
+        av_log(ac->avctx, AV_LOG_ERROR,
+            "Invalid bitstream - cannot apply SBR to element type %d\n", id_aac);
+        sbr_turnoff(sbr);
+        return get_bits_count(gb) - cnt;
+    }
+    if (get_bits1(gb)) { // bs_extended_data
+        int num_bits_left = get_bits(gb, 4); // bs_extension_size
+        if (num_bits_left == 15) // 15 is the escape value: an 8-bit count is added on top
+            num_bits_left += get_bits(gb, 8); // bs_esc_count
+
+        num_bits_left <<= 3; // the size is multiplied by 8 and counted in bits from here on
+        while (num_bits_left > 7) { // each extension starts with a 2-bit id, accounted for before dispatching
+            num_bits_left -= 2;
+            read_sbr_extension(ac, sbr, gb, get_bits(gb, 2), &num_bits_left); // bs_extension_id
+        }
+        if (num_bits_left < 0) { // an extension consumed more than the declared size; only logged
+            av_log(ac->avctx, AV_LOG_ERROR, "SBR Extension over read.\n");
+        }
+        if (num_bits_left > 0) // fewer than 8 bits remain at this point: skip them
+            skip_bits(gb, num_bits_left);
+    }
+
+    return get_bits_count(gb) - cnt;
+}
+
+static void sbr_reset(AACContext *ac, SpectralBandReplication *sbr)
+{   // Rebuilds the master and derived frequency tables; if either step fails, SBR is turned off.
+    int err;
+    err = sbr_make_f_master(ac, sbr, &sbr->spectrum_params);
+    if (err >= 0) // the derived tables are only built when the master table succeeded
+        err = sbr_make_f_derived(ac, sbr);
+    if (err < 0) { // failure of either step
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "SBR reset failed. Switching SBR to pure upsampling mode.\n");
+        sbr_turnoff(sbr);
+    }
+}
+
+/**
+ * Decode Spectral Band Replication extension data; reference: table 4.55.
+ * Parsing runs on a private copy of gb_host; gb_host itself is skipped over the element up front.
+ * @param   crc flag indicating the presence of CRC checksum
+ * @param   cnt length of TYPE_FIL syntactic element in bytes
+ * @param   id_aac AAC element type the SBR data belongs to (passed on to read_sbr_data)
+ * @return  cnt; the whole element is always reported as consumed, an over-read is only logged.
+ */
+int AAC_RENAME(ff_decode_sbr_extension)(AACContext *ac, SpectralBandReplication *sbr,
+                            GetBitContext *gb_host, int crc, int cnt, int id_aac)
+{
+    unsigned int num_sbr_bits = 0, num_align_bits;
+    unsigned bytes_read;
+    GetBitContext gbc = *gb_host, *gb = &gbc; // all reads below go through the copy and do not move gb_host
+    skip_bits_long(gb_host, cnt*8 - 4); // NOTE(review): the -4 presumably accounts for bits the caller already consumed -- confirm
+
+    sbr->reset = 0;
+
+    if (!sbr->sample_rate) // default the SBR rate to twice the core sample rate when unset
+        sbr->sample_rate = 2 * ac->oc[1].m4ac.sample_rate; //TODO use the nominal sample rate for arbitrary sample rate support
+    if (!ac->oc[1].m4ac.ext_sample_rate) // same default for the extension sample rate
+        ac->oc[1].m4ac.ext_sample_rate = 2 * ac->oc[1].m4ac.sample_rate;
+
+    if (crc) {
+        skip_bits(gb, 10); // bs_sbr_crc_bits; TODO - implement CRC check
+        num_sbr_bits += 10;
+    }
+
+    //Save some state from the previous frame.
+    sbr->kx[0] = sbr->kx[1];
+    sbr->m[0] = sbr->m[1];
+    sbr->kx_and_m_pushed = 1; // tells ff_sbr_apply that kx/m were already shifted for this frame
+
+    num_sbr_bits++; // accounts for the header flag bit read on the next line
+    if (get_bits1(gb)) // bs_header_flag
+        num_sbr_bits += read_sbr_header(sbr, gb);
+
+    if (sbr->reset) // cleared above, so it can only be set again by the header read
+        sbr_reset(ac, sbr);
+
+    if (sbr->start) // payload data is only parsed when sbr->start is set
+        num_sbr_bits  += read_sbr_data(ac, sbr, gb, id_aac);
+
+    num_align_bits = ((cnt << 3) - 4 - num_sbr_bits) & 7; // leftover bits modulo 8
+    bytes_read = ((num_sbr_bits + num_align_bits + 4) >> 3); // the same 4 bits as in the skip above are added back
+
+    if (bytes_read > cnt) { // parsed past the element; bytes_read is unsigned, hence %u
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "Expected to read %d SBR bytes actually read %u.\n", cnt, bytes_read);
+    }
+    return cnt;
+}
+
+/**
+ * Analysis QMF Bank (14496-3 sp04 p206)
+ *
+ * @param   x       pointer to the beginning of the first sample window
+ * @param   W       array of complex-valued samples split into subbands
+ */
+#ifndef sbr_qmf_analysis
+#if USE_FIXED
+static void sbr_qmf_analysis(AVFixedDSPContext *dsp, FFTContext *mdct,
+#else
+static void sbr_qmf_analysis(AVFloatDSPContext *dsp, FFTContext *mdct,
+#endif /* USE_FIXED */
+                             SBRDSPContext *sbrdsp, const INTFLOAT *in, INTFLOAT *x,
+                             INTFLOAT z[320], INTFLOAT W[2][32][32][2], int buf_idx)
+{   // in: 1024 new samples; z: 320-entry scratch; output is written to W[buf_idx]
+    int i;
+#if USE_FIXED
+    int j;
+#endif
+    memcpy(x    , x+1024, (320-32)*sizeof(x[0])); // move the 288 samples at x[1024..1311] to the front as history
+    memcpy(x+288, in,         1024*sizeof(x[0])); // append the 1024 new input samples after the history
+    for (i = 0; i < 32; i++) { // numTimeSlots*RATE = 16*2 as 960 sample frames
+                               // are not supported
+        dsp->vector_fmul_reverse(z, sbr_qmf_window_ds, x, 320); // window the current 320-sample span into z
+        sbrdsp->sum64x5(z);
+        sbrdsp->qmf_pre_shuffle(z);
+#if USE_FIXED
+        for (j = 64; j < 128; j++) { // fixed point only: clamp z[64..127] to +/-(1<<24), with a warning
+            if (z[j] > 1<<24) {
+                av_log(NULL, AV_LOG_WARNING,
+                       "sbr_qmf_analysis: value %09d too large, setting to %09d\n",
+                       z[j], 1<<24);
+                z[j] = 1<<24;
+            } else if (z[j] < -(1<<24)) {
+                av_log(NULL, AV_LOG_WARNING,
+                       "sbr_qmf_analysis: value %09d too small, setting to %09d\n",
+                       z[j], -(1<<24));
+                z[j] = -(1<<24);
+            }
+        }
+#endif
+        mdct->imdct_half(mdct, z, z+64); // transform z[64..] with the result written to the start of z
+        sbrdsp->qmf_post_shuffle(W[buf_idx][i], z); // store slot i of the selected W buffer
+        x += 32; // the window advances by 32 input samples per slot
+    }
+}
+#endif
+
+/**
+ * Synthesis QMF Bank (14496-3 sp04 p206) and Downsampled Synthesis QMF Bank
+ * (14496-3 sp04 p206); div != 0 selects the downsampled variant (half the step, window span and output per slot).
+ */
+#ifndef sbr_qmf_synthesis
+static void sbr_qmf_synthesis(FFTContext *mdct,
+#if USE_FIXED
+                              SBRDSPContext *sbrdsp, AVFixedDSPContext *dsp,
+#else
+                              SBRDSPContext *sbrdsp, AVFloatDSPContext *dsp,
+#endif /* USE_FIXED */
+                              INTFLOAT *out, INTFLOAT X[2][38][64],
+                              INTFLOAT mdct_buf[2][64],
+                              INTFLOAT *v0, int *v_off, const unsigned int div)
+{   // v0/*v_off: synthesis history buffer and current offset into it, both kept up to date for the next call
+    int i, n;
+    const INTFLOAT *sbr_qmf_window = div ? sbr_qmf_window_ds : sbr_qmf_window_us;
+    const int step = 128 >> div; // samples by which the history offset moves per slot (128, or 64 when div is 1)
+    INTFLOAT *v;
+    for (i = 0; i < 32; i++) { // one iteration per slot; only the first 32 of the 38 rows of X are used
+        if (*v_off < step) { // no room left below the offset: copy the saved samples to the top and restart from there
+            int saved_samples = (1280 - 128) >> div;
+            memcpy(&v0[SBR_SYNTHESIS_BUF_SIZE - saved_samples], v0, saved_samples * sizeof(INTFLOAT));
+            *v_off = SBR_SYNTHESIS_BUF_SIZE - saved_samples - step;
+        } else { // otherwise just move the offset down by one step
+            *v_off -= step;
+        }
+        v = v0 + *v_off; // new samples for this slot are written at v
+        if (div) { // downsampled: pack 32 negated X[0] values and 32 reversed X[1] values into X[0][i], one transform
+            for (n = 0; n < 32; n++) {
+                X[0][i][   n] = -X[0][i][n];
+                X[0][i][32+n] =  X[1][i][31-n];
+            }
+            mdct->imdct_half(mdct, mdct_buf[0], X[0][i]);
+            sbrdsp->qmf_deint_neg(v, mdct_buf[0]);
+        } else { // full rate: transform X[0][i] and X[1][i] separately, then combine both results into v
+            sbrdsp->neg_odd_64(X[1][i]);
+            mdct->imdct_half(mdct, mdct_buf[0], X[0][i]);
+            mdct->imdct_half(mdct, mdct_buf[1], X[1][i]);
+            sbrdsp->qmf_deint_bfly(v, mdct_buf[1], mdct_buf[0]);
+        }
+        dsp->vector_fmul    (out, v                , sbr_qmf_window                       , 64 >> div); // first of ten windowed segments initialises out
+        dsp->vector_fmul_add(out, v + ( 192 >> div), sbr_qmf_window + ( 64 >> div), out   , 64 >> div); // the other nine are accumulated onto out
+        dsp->vector_fmul_add(out, v + ( 256 >> div), sbr_qmf_window + (128 >> div), out   , 64 >> div);
+        dsp->vector_fmul_add(out, v + ( 448 >> div), sbr_qmf_window + (192 >> div), out   , 64 >> div);
+        dsp->vector_fmul_add(out, v + ( 512 >> div), sbr_qmf_window + (256 >> div), out   , 64 >> div);
+        dsp->vector_fmul_add(out, v + ( 704 >> div), sbr_qmf_window + (320 >> div), out   , 64 >> div);
+        dsp->vector_fmul_add(out, v + ( 768 >> div), sbr_qmf_window + (384 >> div), out   , 64 >> div);
+        dsp->vector_fmul_add(out, v + ( 960 >> div), sbr_qmf_window + (448 >> div), out   , 64 >> div);
+        dsp->vector_fmul_add(out, v + (1024 >> div), sbr_qmf_window + (512 >> div), out   , 64 >> div);
+        dsp->vector_fmul_add(out, v + (1216 >> div), sbr_qmf_window + (576 >> div), out   , 64 >> div);
+        out += 64 >> div; // 64 output samples per slot, or 32 when div is 1
+    }
+}
+#endif
+
+/// Generate the subband filtered lowband: X_low = last 8 slots of the other W buffer followed by the 32 slots of W[buf_idx]
+static int sbr_lf_gen(AACContext *ac, SpectralBandReplication *sbr,
+                      INTFLOAT X_low[32][40][2], const INTFLOAT W[2][32][32][2],
+                      int buf_idx)
+{   // always returns 0; ac is not used in this implementation
+    int i, k;
+    const int t_HFGen = 8; // number of leading X_low slots taken from the other W buffer
+    const int i_f = 32;    // number of slots per W buffer
+    memset(X_low, 0, 32*sizeof(*X_low)); // clear all 32 subbands; only the first kx[1] / kx[0] are filled below
+    for (k = 0; k < sbr->kx[1]; k++) { // W[buf_idx] slots 0..31 go to X_low slots 8..39
+        for (i = t_HFGen; i < i_f + t_HFGen; i++) {
+            X_low[k][i][0] = W[buf_idx][i - t_HFGen][k][0];
+            X_low[k][i][1] = W[buf_idx][i - t_HFGen][k][1];
+        }
+    }
+    buf_idx = 1-buf_idx; // switch to the other W buffer
+    for (k = 0; k < sbr->kx[0]; k++) { // its slots 24..31 go to X_low slots 0..7
+        for (i = 0; i < t_HFGen; i++) {
+            X_low[k][i][0] = W[buf_idx][i + i_f - t_HFGen][k][0];
+            X_low[k][i][1] = W[buf_idx][i + i_f - t_HFGen][k][1];
+        }
+    }
+    return 0;
+}
+
+/// High Frequency Generator (14496-3 sp04 p215); returns 0, or -1 when a target subband lies below f_tablenoise[0]
+static int sbr_hf_gen(AACContext *ac, SpectralBandReplication *sbr,
+                      INTFLOAT X_high[64][40][2], const INTFLOAT X_low[32][40][2],
+                      const INTFLOAT (*alpha0)[2], const INTFLOAT (*alpha1)[2],
+                      const INTFLOAT bw_array[5], const uint8_t *t_env,
+                      int bs_num_env)
+{
+    int j, x;
+    int g = 0;           // index into f_tablenoise / bw_array; carried over between subbands, never reset
+    int k = sbr->kx[1];  // target subband in X_high; keeps increasing across all patches
+    for (j = 0; j < sbr->num_patches; j++) {
+        for (x = 0; x < sbr->patch_num_subbands[j]; x++, k++) {
+            const int p = sbr->patch_start_subband[j] + x; // source subband in X_low
+            while (g <= sbr->n_q && k >= sbr->f_tablenoise[g]) // advance past every table entry that is <= k ...
+                g++;
+            g--; // ... then step back to the last entry that is <= k
+
+            if (g < 0) { // k is below f_tablenoise[0]
+                av_log(ac->avctx, AV_LOG_ERROR,
+                       "ERROR : no subband found for frequency %d\n", k);
+                return -1;
+            }
+
+            sbr->dsp.hf_gen(X_high[k] + ENVELOPE_ADJUSTMENT_OFFSET,
+                            X_low[p]  + ENVELOPE_ADJUSTMENT_OFFSET,
+                            alpha0[p], alpha1[p], bw_array[g],
+                            2 * t_env[0], 2 * t_env[bs_num_env]); // range runs from the first to the last envelope border, doubled
+        }
+    }
+    if (k < sbr->m[1] + sbr->kx[1]) // zero the subbands up to kx[1] + m[1] that no patch covered
+        memset(X_high + k, 0, (sbr->m[1] + sbr->kx[1] - k) * sizeof(*X_high));
+
+    return 0;
+}
+
+/// Assemble the subband matrix X: subbands below kx come from X_low, the next m subbands come from Y0/Y1
+static int sbr_x_gen(SpectralBandReplication *sbr, INTFLOAT X[2][38][64],
+                     const INTFLOAT Y0[38][64][2], const INTFLOAT Y1[38][64][2],
+                     const INTFLOAT X_low[32][40][2], int ch)
+{   // X[0] receives component [0] and X[1] component [1] of each source sample; always returns 0
+    int k, i;
+    const int i_f = 32;
+    const int i_Temp = FFMAX(2*sbr->data[ch].t_env_num_env_old - i_f, 0); // slots below i_Temp use kx[0]/m[0] and Y0, the rest use kx[1]/m[1] and Y1
+    memset(X, 0, 2*sizeof(*X)); // clear both planes first; subbands not written below stay zero
+    for (k = 0; k < sbr->kx[0]; k++) { // slots [0, i_Temp): subbands below kx[0] from X_low
+        for (i = 0; i < i_Temp; i++) {
+            X[0][i][k] = X_low[k][i + ENVELOPE_ADJUSTMENT_OFFSET][0];
+            X[1][i][k] = X_low[k][i + ENVELOPE_ADJUSTMENT_OFFSET][1];
+        }
+    }
+    for (; k < sbr->kx[0] + sbr->m[0]; k++) { // slots [0, i_Temp): the next m[0] subbands from Y0, read at slot i + 32
+        for (i = 0; i < i_Temp; i++) {
+            X[0][i][k] = Y0[i + i_f][k][0];
+            X[1][i][k] = Y0[i + i_f][k][1];
+        }
+    }
+
+    for (k = 0; k < sbr->kx[1]; k++) { // slots [i_Temp, 38): subbands below kx[1] from X_low
+        for (i = i_Temp; i < 38; i++) {
+            X[0][i][k] = X_low[k][i + ENVELOPE_ADJUSTMENT_OFFSET][0];
+            X[1][i][k] = X_low[k][i + ENVELOPE_ADJUSTMENT_OFFSET][1];
+        }
+    }
+    for (; k < sbr->kx[1] + sbr->m[1]; k++) { // slots [i_Temp, 32): the next m[1] subbands from Y1
+        for (i = i_Temp; i < i_f; i++) {
+            X[0][i][k] = Y1[i][k][0];
+            X[1][i][k] = Y1[i][k][1];
+        }
+    }
+    return 0;
+}
+
+/** High Frequency Adjustment (14496-3 sp04 p217) and Mapping
+ * (14496-3 sp04 p217); returns 0, or AVERROR_BUG (after turning SBR off) when kx[1] != table[0]
+ */
+static int sbr_mapping(AACContext *ac, SpectralBandReplication *sbr,
+                        SBRData *ch_data, int e_a[2])
+{   // expands per-band envelope, noise and sinusoid data to per-subband arrays indexed relative to kx[1]
+    int e, i, m;
+
+    memset(ch_data->s_indexmapped[1], 0, 7*sizeof(ch_data->s_indexmapped[1])); // clear 7 rows starting at row 1; row 0 is kept
+    for (e = 0; e < ch_data->bs_num_env; e++) {
+        const unsigned int ilim = sbr->n[ch_data->bs_freq_res[e + 1]]; // number of bands at this envelope's frequency resolution
+        uint16_t *table = ch_data->bs_freq_res[e + 1] ? sbr->f_tablehigh : sbr->f_tablelow; // band borders for that resolution
+        int k;
+
+        if (sbr->kx[1] != table[0]) { // the tables do not start at kx[1]: give up
+            av_log(ac->avctx, AV_LOG_ERROR, "kx != f_table{high,low}[0]. "
+                   "Derived frequency tables were not regenerated.\n");
+            sbr_turnoff(sbr);
+            return AVERROR_BUG;
+        }
+        for (i = 0; i < ilim; i++) // every subband m of band i gets that band's envelope factor
+            for (m = table[i]; m < table[i + 1]; m++)
+                sbr->e_origmapped[e][m - sbr->kx[1]] = ch_data->env_facs[e+1][i];
+
+        // ch_data->bs_num_noise > 1 => 2 noise floors
+        k = (ch_data->bs_num_noise > 1) && (ch_data->t_env[e] >= ch_data->t_q[1]); // 1 selects the second noise floor, 0 the first
+        for (i = 0; i < sbr->n_q; i++) // every subband m of noise band i gets that band's noise factor
+            for (m = sbr->f_tablenoise[i]; m < sbr->f_tablenoise[i + 1]; m++)
+                sbr->q_mapped[e][m - sbr->kx[1]] = ch_data->noise_facs[k+1][i];
+
+        for (i = 0; i < sbr->n[1]; i++) { // sinusoid flags are placed at the midpoint of each f_tablehigh band
+            if (ch_data->bs_add_harmonic_flag) {
+                const unsigned int m_midpoint =
+                    (sbr->f_tablehigh[i] + sbr->f_tablehigh[i + 1]) >> 1;
+
+                ch_data->s_indexmapped[e + 1][m_midpoint - sbr->kx[1]] = ch_data->bs_add_harmonic[i] *
+                    (e >= e_a[1] || (ch_data->s_indexmapped[0][m_midpoint - sbr->kx[1]] == 1)); // needs e >= e_a[1] or the flag already set in row 0
+            }
+        }
+
+        for (i = 0; i < ilim; i++) { // s_mapped: 1 for a whole band if any of its subbands carries a sinusoid
+            int additional_sinusoid_present = 0;
+            for (m = table[i]; m < table[i + 1]; m++) {
+                if (ch_data->s_indexmapped[e + 1][m - sbr->kx[1]]) {
+                    additional_sinusoid_present = 1;
+                    break;
+                }
+            }
+            memset(&sbr->s_mapped[e][table[i] - sbr->kx[1]], additional_sinusoid_present,
+                   (table[i + 1] - table[i]) * sizeof(sbr->s_mapped[e][0]));
+        }
+    }
+
+    memcpy(ch_data->s_indexmapped[0], ch_data->s_indexmapped[ch_data->bs_num_env], sizeof(ch_data->s_indexmapped[0])); // keep the last envelope's flags in row 0 for the test above
+    return 0;
+}
+
+/// Estimation of current envelope (14496-3 sp04 p218): fills e_curr[e][m], m relative to kx[1], from X_high
+static void sbr_env_estimate(AAC_FLOAT (*e_curr)[48], INTFLOAT X_high[64][40][2],
+                             SpectralBandReplication *sbr, SBRData *ch_data)
+{
+    int e, m;
+    int kx1 = sbr->kx[1];
+
+    if (sbr->bs_interpol_freq) { // one estimate per subband
+        for (e = 0; e < ch_data->bs_num_env; e++) {
+#if USE_FIXED
+            const SoftFloat recip_env_size = av_int2sf(0x20000000 / (ch_data->t_env[e + 1] - ch_data->t_env[e]), 30);
+#else
+            const float recip_env_size = 0.5f / (ch_data->t_env[e + 1] - ch_data->t_env[e]); // 1 / (2 * envelope length), i.e. 1 / (iub - ilb)
+#endif /* USE_FIXED */
+            int ilb = ch_data->t_env[e]     * 2 + ENVELOPE_ADJUSTMENT_OFFSET; // first slot of envelope e in X_high
+            int iub = ch_data->t_env[e + 1] * 2 + ENVELOPE_ADJUSTMENT_OFFSET; // one past its last slot
+
+            for (m = 0; m < sbr->m[1]; m++) { // sum_square result over the envelope, scaled by recip_env_size
+                AAC_FLOAT sum = sbr->dsp.sum_square(X_high[m+kx1] + ilb, iub - ilb);
+#if USE_FIXED
+                e_curr[e][m] = av_mul_sf(sum, recip_env_size);
+#else
+                e_curr[e][m] = sum * recip_env_size;
+#endif /* USE_FIXED */
+            }
+        }
+    } else { // one estimate per frequency band, shared by all subbands of that band
+        int k, p;
+
+        for (e = 0; e < ch_data->bs_num_env; e++) {
+            const int env_size = 2 * (ch_data->t_env[e + 1] - ch_data->t_env[e]); // envelope length in slots (iub - ilb)
+            int ilb = ch_data->t_env[e]     * 2 + ENVELOPE_ADJUSTMENT_OFFSET;
+            int iub = ch_data->t_env[e + 1] * 2 + ENVELOPE_ADJUSTMENT_OFFSET;
+            const uint16_t *table = ch_data->bs_freq_res[e + 1] ? sbr->f_tablehigh : sbr->f_tablelow; // band borders for this envelope's resolution
+
+            for (p = 0; p < sbr->n[ch_data->bs_freq_res[e + 1]]; p++) {
+#if USE_FIXED
+                SoftFloat sum = FLOAT_0;
+                const SoftFloat den = av_int2sf(0x20000000 / (env_size * (table[p + 1] - table[p])), 29);
+                for (k = table[p]; k < table[p + 1]; k++) {
+                    sum = av_add_sf(sum, sbr->dsp.sum_square(X_high[k] + ilb, iub - ilb));
+                }
+                sum = av_mul_sf(sum, den);
+#else
+                float sum = 0.0f;
+                const int den = env_size * (table[p + 1] - table[p]); // slots times subbands in band p
+
+                for (k = table[p]; k < table[p + 1]; k++) { // accumulate over every subband of band p
+                    sum += sbr->dsp.sum_square(X_high[k] + ilb, iub - ilb);
+                }
+                sum /= den;
+#endif /* USE_FIXED */
+                for (k = table[p]; k < table[p + 1]; k++) { // write the band result to each of its subbands
+                    e_curr[e][k - kx1] = sum;
+                }
+            }
+        }
+    }
+}
+
+void AAC_RENAME(ff_sbr_apply)(AACContext *ac, SpectralBandReplication *sbr, int id_aac,
+                  INTFLOAT* L, INTFLOAT* R)
+{   // Runs analysis, optional SBR processing and synthesis; L (and R when two channels are produced) are both input and output.
+    int downsampled = ac->oc[1].m4ac.ext_sample_rate < sbr->sample_rate; // selects the downsampled synthesis filterbank
+    int ch;
+    int nch = (id_aac == TYPE_CPE) ? 2 : 1; // channels to analyse: 2 for a channel pair, 1 otherwise
+    int err;
+
+    if (id_aac != sbr->id_aac) { // element type differs from the one stored in sbr->id_aac: turn SBR off
+        av_log(ac->avctx, id_aac == TYPE_LFE ? AV_LOG_VERBOSE : AV_LOG_WARNING,
+            "element type mismatch %d != %d\n", id_aac, sbr->id_aac);
+        sbr_turnoff(sbr);
+    }
+
+    if (sbr->start && !sbr->ready_for_dequant) { // nothing new to dequantize: turn SBR off
+        av_log(ac->avctx, AV_LOG_ERROR,
+               "No quantized data read for sbr_dequant.\n");
+        sbr_turnoff(sbr);
+    }
+
+    if (!sbr->kx_and_m_pushed) { // kx[1]/m[1] were not yet copied to kx[0]/m[0]: do it here
+        sbr->kx[0] = sbr->kx[1];
+        sbr->m[0] = sbr->m[1];
+    } else { // already copied elsewhere: just consume the flag
+        sbr->kx_and_m_pushed = 0;
+    }
+
+    if (sbr->start) {
+        sbr_dequant(sbr, id_aac);
+        sbr->ready_for_dequant = 0; // the quantized data has been used
+    }
+    for (ch = 0; ch < nch; ch++) {
+        /* decode channel */
+        sbr_qmf_analysis(ac->fdsp, &sbr->mdct_ana, &sbr->dsp, ch ? R : L, sbr->data[ch].analysis_filterbank_samples,
+                         (INTFLOAT*)sbr->qmf_filter_scratch,
+                         sbr->data[ch].W, sbr->data[ch].Ypos);
+        sbr->c.sbr_lf_gen(ac, sbr, sbr->X_low,
+                          (const INTFLOAT (*)[32][32][2]) sbr->data[ch].W,
+                          sbr->data[ch].Ypos);
+        sbr->data[ch].Ypos ^= 1; // toggle the buffer index; Y[Ypos] is written below, Y[1-Ypos] is the other buffer
+        if (sbr->start) { // high-band generation and adjustment only run while SBR is active
+            sbr->c.sbr_hf_inverse_filter(&sbr->dsp, sbr->alpha0, sbr->alpha1,
+                                         (const INTFLOAT (*)[40][2]) sbr->X_low, sbr->k[0]);
+            sbr_chirp(sbr, &sbr->data[ch]);
+            av_assert0(sbr->data[ch].bs_num_env > 0);
+            sbr_hf_gen(ac, sbr, sbr->X_high, // note: the return value is not checked
+                       (const INTFLOAT (*)[40][2]) sbr->X_low,
+                       (const INTFLOAT (*)[2]) sbr->alpha0,
+                       (const INTFLOAT (*)[2]) sbr->alpha1,
+                       sbr->data[ch].bw_array, sbr->data[ch].t_env,
+                       sbr->data[ch].bs_num_env);
+
+            // hf_adj
+            err = sbr_mapping(ac, sbr, &sbr->data[ch], sbr->data[ch].e_a);
+            if (!err) { // the adjustment steps are skipped when mapping failed
+                sbr_env_estimate(sbr->e_curr, sbr->X_high, sbr, &sbr->data[ch]);
+                sbr_gain_calc(ac, sbr, &sbr->data[ch], sbr->data[ch].e_a);
+                sbr->c.sbr_hf_assemble(sbr->data[ch].Y[sbr->data[ch].Ypos],
+                                (const INTFLOAT (*)[40][2]) sbr->X_high,
+                                sbr, &sbr->data[ch],
+                                sbr->data[ch].e_a);
+            }
+        }
+
+        /* synthesis */
+        sbr->c.sbr_x_gen(sbr, sbr->X[ch],
+                  (const INTFLOAT (*)[64][2]) sbr->data[ch].Y[1-sbr->data[ch].Ypos],
+                  (const INTFLOAT (*)[64][2]) sbr->data[ch].Y[  sbr->data[ch].Ypos],
+                  (const INTFLOAT (*)[40][2]) sbr->X_low, ch);
+    }
+
+    if (ac->oc[1].m4ac.ps == 1) { // Parametric Stereo configured: two channels are always synthesized
+        if (sbr->ps.start) {
+            AAC_RENAME(ff_ps_apply)(ac->avctx, &sbr->ps, sbr->X[0], sbr->X[1], sbr->kx[1] + sbr->m[1]);
+        } else { // PS not started: the second channel is a copy of the first
+            memcpy(sbr->X[1], sbr->X[0], sizeof(sbr->X[0]));
+        }
+        nch = 2;
+    }
+
+    sbr_qmf_synthesis(&sbr->mdct, &sbr->dsp, ac->fdsp,
+                      L, sbr->X[0], sbr->qmf_filter_scratch,
+                      sbr->data[0].synthesis_filterbank_samples,
+                      &sbr->data[0].synthesis_filterbank_samples_offset,
+                      downsampled);
+    if (nch == 2) // second output only for channel pairs or Parametric Stereo
+        sbr_qmf_synthesis(&sbr->mdct, &sbr->dsp, ac->fdsp,
+                          R, sbr->X[1], sbr->qmf_filter_scratch,
+                          sbr->data[1].synthesis_filterbank_samples,
+                          &sbr->data[1].synthesis_filterbank_samples_offset,
+                          downsampled);
+}
+
+static void aacsbr_func_ptr_init(AACSBRContext *c)
+{   // Installs the implementations defined in this file into the function-pointer table c.
+    c->sbr_lf_gen            = sbr_lf_gen;
+    c->sbr_hf_assemble       = sbr_hf_assemble;
+    c->sbr_x_gen             = sbr_x_gen;
+    c->sbr_hf_inverse_filter = sbr_hf_inverse_filter;
+
+#if !USE_FIXED
+    if(ARCH_MIPS) // non-fixed builds on MIPS: call the MIPS init routine on the same table
+        ff_aacsbr_func_ptr_init_mips(c);
+#endif
+}
diff --git a/libavcodec/aacsbrdata.h b/libavcodec/aacsbrdata.h
index f309059..4ff8fae 100644
--- a/libavcodec/aacsbrdata.h
+++ b/libavcodec/aacsbrdata.h
@@ -2,20 +2,20 @@
  * AAC Spectral Band Replication decoding data
  * Copyright (c) 2008-2009 Robert Swain ( rob opendot cl )
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,6 +30,7 @@
 
 #include <stdint.h>
 #include "libavutil/mem.h"
+#include "aac_defines.h"
 
 ///< Huffman tables for SBR
 
@@ -266,351 +267,269 @@ static const int8_t sbr_offset[6][16] = {
     {-2, -1,  0,  1,  2,  3,  4,  5,  6,  7,  9, 11, 13, 16, 20, 24}, // 64000 Hz <  fs_sbr
 };
 
-///< window coefficients for analysis/synthesis QMF banks
-static DECLARE_ALIGNED(32, float, sbr_qmf_window_ds)[320];
-static DECLARE_ALIGNED(32, float, sbr_qmf_window_us)[640] = {
-     0.0000000000, -0.0005525286, -0.0005617692, -0.0004947518,
-    -0.0004875227, -0.0004893791, -0.0005040714, -0.0005226564,
-    -0.0005466565, -0.0005677802, -0.0005870930, -0.0006132747,
-    -0.0006312493, -0.0006540333, -0.0006777690, -0.0006941614,
-    -0.0007157736, -0.0007255043, -0.0007440941, -0.0007490598,
-    -0.0007681371, -0.0007724848, -0.0007834332, -0.0007779869,
-    -0.0007803664, -0.0007801449, -0.0007757977, -0.0007630793,
-    -0.0007530001, -0.0007319357, -0.0007215391, -0.0006917937,
-    -0.0006650415, -0.0006341594, -0.0005946118, -0.0005564576,
-    -0.0005145572, -0.0004606325, -0.0004095121, -0.0003501175,
-    -0.0002896981, -0.0002098337, -0.0001446380, -0.0000617334,
-     0.0000134949,  0.0001094383,  0.0002043017,  0.0002949531,
-     0.0004026540,  0.0005107388,  0.0006239376,  0.0007458025,
-     0.0008608443,  0.0009885988,  0.0011250155,  0.0012577884,
-     0.0013902494,  0.0015443219,  0.0016868083,  0.0018348265,
-     0.0019841140,  0.0021461583,  0.0023017254,  0.0024625616,
-     0.0026201758,  0.0027870464,  0.0029469447,  0.0031125420,
-     0.0032739613,  0.0034418874,  0.0036008268,  0.0037603922,
-     0.0039207432,  0.0040819753,  0.0042264269,  0.0043730719,
-     0.0045209852,  0.0046606460,  0.0047932560,  0.0049137603,
-     0.0050393022,  0.0051407353,  0.0052461166,  0.0053471681,
-     0.0054196775,  0.0054876040,  0.0055475714,  0.0055938023,
-     0.0056220643,  0.0056455196,  0.0056389199,  0.0056266114,
-     0.0055917128,  0.0055404363,  0.0054753783,  0.0053838975,
-     0.0052715758,  0.0051382275,  0.0049839687,  0.0048109469,
-     0.0046039530,  0.0043801861,  0.0041251642,  0.0038456408,
-     0.0035401246,  0.0032091885,  0.0028446757,  0.0024508540,
-     0.0020274176,  0.0015784682,  0.0010902329,  0.0005832264,
-     0.0000276045, -0.0005464280, -0.0011568135, -0.0018039472,
-    -0.0024826723, -0.0031933778, -0.0039401124, -0.0047222596,
-    -0.0055337211, -0.0063792293, -0.0072615816, -0.0081798233,
-    -0.0091325329, -0.0101150215, -0.0111315548, -0.0121849995,
-     0.0132718220,  0.0143904666,  0.0155405553,  0.0167324712,
-     0.0179433381,  0.0191872431,  0.0204531793,  0.0217467550,
-     0.0230680169,  0.0244160992,  0.0257875847,  0.0271859429,
-     0.0286072173,  0.0300502657,  0.0315017608,  0.0329754081,
-     0.0344620948,  0.0359697560,  0.0374812850,  0.0390053679,
-     0.0405349170,  0.0420649094,  0.0436097542,  0.0451488405,
-     0.0466843027,  0.0482165720,  0.0497385755,  0.0512556155,
-     0.0527630746,  0.0542452768,  0.0557173648,  0.0571616450,
-     0.0585915683,  0.0599837480,  0.0613455171,  0.0626857808,
-     0.0639715898,  0.0652247106,  0.0664367512,  0.0676075985,
-     0.0687043828,  0.0697630244,  0.0707628710,  0.0717002673,
-     0.0725682583,  0.0733620255,  0.0741003642,  0.0747452558,
-     0.0753137336,  0.0758008358,  0.0761992479,  0.0764992170,
-     0.0767093490,  0.0768173975,  0.0768230011,  0.0767204924,
-     0.0765050718,  0.0761748321,  0.0757305756,  0.0751576255,
-     0.0744664394,  0.0736406005,  0.0726774642,  0.0715826364,
-     0.0703533073,  0.0689664013,  0.0674525021,  0.0657690668,
-     0.0639444805,  0.0619602779,  0.0598166570,  0.0575152691,
-     0.0550460034,  0.0524093821,  0.0495978676,  0.0466303305,
-     0.0434768782,  0.0401458278,  0.0366418116,  0.0329583930,
-     0.0290824006,  0.0250307561,  0.0207997072,  0.0163701258,
-     0.0117623832,  0.0069636862,  0.0019765601, -0.0032086896,
-    -0.0085711749, -0.0141288827, -0.0198834129, -0.0258227288,
-    -0.0319531274, -0.0382776572, -0.0447806821, -0.0514804176,
-    -0.0583705326, -0.0654409853, -0.0726943300, -0.0801372934,
-    -0.0877547536, -0.0955533352, -0.1035329531, -0.1116826931,
-    -0.1200077984, -0.1285002850, -0.1371551761, -0.1459766491,
-    -0.1549607071, -0.1640958855, -0.1733808172, -0.1828172548,
-    -0.1923966745, -0.2021250176, -0.2119735853, -0.2219652696,
-    -0.2320690870, -0.2423016884, -0.2526480309, -0.2631053299,
-    -0.2736634040, -0.2843214189, -0.2950716717, -0.3059098575,
-    -0.3168278913, -0.3278113727, -0.3388722693, -0.3499914122,
-     0.3611589903,  0.3723795546,  0.3836350013,  0.3949211761,
-     0.4062317676,  0.4175696896,  0.4289119920,  0.4402553754,
-     0.4515996535,  0.4629308085,  0.4742453214,  0.4855253091,
-     0.4967708254,  0.5079817500,  0.5191234970,  0.5302240895,
-     0.5412553448,  0.5522051258,  0.5630789140,  0.5738524131,
-     0.5845403235,  0.5951123086,  0.6055783538,  0.6159109932,
-     0.6261242695,  0.6361980107,  0.6461269695,  0.6559016302,
-     0.6655139880,  0.6749663190,  0.6842353293,  0.6933282376,
-     0.7022388719,  0.7109410426,  0.7194462634,  0.7277448900,
-     0.7358211758,  0.7436827863,  0.7513137456,  0.7587080760,
-     0.7658674865,  0.7727780881,  0.7794287519,  0.7858353120,
-     0.7919735841,  0.7978466413,  0.8034485751,  0.8087695004,
-     0.8138191270,  0.8185776004,  0.8230419890,  0.8272275347,
-     0.8311038457,  0.8346937361,  0.8379717337,  0.8409541392,
-     0.8436238281,  0.8459818469,  0.8480315777,  0.8497805198,
-     0.8511971524,  0.8523047035,  0.8531020949,  0.8535720573,
-     0.8537385600,
-};
-
-/* First two entries repeated at end to simplify SIMD implementations. */
-const DECLARE_ALIGNED(16, float, ff_sbr_noise_table)[][2] = {
-{-0.99948153278296, -0.59483417516607}, { 0.97113454393991, -0.67528515225647},
-{ 0.14130051758487, -0.95090983575689}, {-0.47005496701697, -0.37340549728647},
-{ 0.80705063769351,  0.29653668284408}, {-0.38981478896926,  0.89572605717087},
-{-0.01053049862020, -0.66959058036166}, {-0.91266367957293, -0.11522938140034},
-{ 0.54840422910309,  0.75221367176302}, { 0.40009252867955, -0.98929400334421},
-{-0.99867974711855, -0.88147068645358}, {-0.95531076805040,  0.90908757154593},
-{-0.45725933317144, -0.56716323646760}, {-0.72929675029275, -0.98008272727324},
-{ 0.75622801399036,  0.20950329995549}, { 0.07069442601050, -0.78247898470706},
-{ 0.74496252926055, -0.91169004445807}, {-0.96440182703856, -0.94739918296622},
-{ 0.30424629369539, -0.49438267012479}, { 0.66565033746925,  0.64652935542491},
-{ 0.91697008020594,  0.17514097332009}, {-0.70774918760427,  0.52548653416543},
-{-0.70051415345560, -0.45340028808763}, {-0.99496513054797, -0.90071908066973},
-{ 0.98164490790123, -0.77463155528697}, {-0.54671580548181, -0.02570928536004},
-{-0.01689629065389,  0.00287506445732}, {-0.86110349531986,  0.42548583726477},
-{-0.98892980586032, -0.87881132267556}, { 0.51756627678691,  0.66926784710139},
-{-0.99635026409640, -0.58107730574765}, {-0.99969370862163,  0.98369989360250},
-{ 0.55266258627194,  0.59449057465591}, { 0.34581177741673,  0.94879421061866},
-{ 0.62664209577999, -0.74402970906471}, {-0.77149701404973, -0.33883658042801},
-{-0.91592244254432,  0.03687901376713}, {-0.76285492357887, -0.91371867919124},
-{ 0.79788337195331, -0.93180971199849}, { 0.54473080610200, -0.11919206037186},
-{-0.85639281671058,  0.42429854760451}, {-0.92882402971423,  0.27871809078609},
-{-0.11708371046774, -0.99800843444966}, { 0.21356749817493, -0.90716295627033},
-{-0.76191692573909,  0.99768118356265}, { 0.98111043100884, -0.95854459734407},
-{-0.85913269895572,  0.95766566168880}, {-0.93307242253692,  0.49431757696466},
-{ 0.30485754879632, -0.70540034357529}, { 0.85289650925190,  0.46766131791044},
-{ 0.91328082618125, -0.99839597361769}, {-0.05890199924154,  0.70741827819497},
-{ 0.28398686150148,  0.34633555702188}, { 0.95258164539612, -0.54893416026939},
-{-0.78566324168507, -0.75568541079691}, {-0.95789495447877, -0.20423194696966},
-{ 0.82411158711197,  0.96654618432562}, {-0.65185446735885, -0.88734990773289},
-{-0.93643603134666,  0.99870790442385}, { 0.91427159529618, -0.98290505544444},
-{-0.70395684036886,  0.58796798221039}, { 0.00563771969365,  0.61768196727244},
-{ 0.89065051931895,  0.52783352697585}, {-0.68683707712762,  0.80806944710339},
-{ 0.72165342518718, -0.69259857349564}, {-0.62928247730667,  0.13627037407335},
-{ 0.29938434065514, -0.46051329682246}, {-0.91781958879280, -0.74012716684186},
-{ 0.99298717043688,  0.40816610075661}, { 0.82368298622748, -0.74036047190173},
-{-0.98512833386833, -0.99972330709594}, {-0.95915368242257, -0.99237800466040},
-{-0.21411126572790, -0.93424819052545}, {-0.68821476106884, -0.26892306315457},
-{ 0.91851997982317,  0.09358228901785}, {-0.96062769559127,  0.36099095133739},
-{ 0.51646184922287, -0.71373332873917}, { 0.61130721139669,  0.46950141175917},
-{ 0.47336129371299, -0.27333178296162}, { 0.90998308703519,  0.96715662938132},
-{ 0.44844799194357,  0.99211574628306}, { 0.66614891079092,  0.96590176169121},
-{ 0.74922239129237, -0.89879858826087}, {-0.99571588506485,  0.52785521494349},
-{ 0.97401082477563, -0.16855870075190}, { 0.72683747733879, -0.48060774432251},
-{ 0.95432193457128,  0.68849603408441}, {-0.72962208425191, -0.76608443420917},
-{-0.85359479233537,  0.88738125901579}, {-0.81412430338535, -0.97480768049637},
-{-0.87930772356786,  0.74748307690436}, {-0.71573331064977, -0.98570608178923},
-{ 0.83524300028228,  0.83702537075163}, {-0.48086065601423, -0.98848504923531},
-{ 0.97139128574778,  0.80093621198236}, { 0.51992825347895,  0.80247631400510},
-{-0.00848591195325, -0.76670128000486}, {-0.70294374303036,  0.55359910445577},
-{-0.95894428168140, -0.43265504344783}, { 0.97079252950321,  0.09325857238682},
-{-0.92404293670797,  0.85507704027855}, {-0.69506469500450,  0.98633412625459},
-{ 0.26559203620024,  0.73314307966524}, { 0.28038443336943,  0.14537913654427},
-{-0.74138124825523,  0.99310339807762}, {-0.01752795995444, -0.82616635284178},
-{-0.55126773094930, -0.98898543862153}, { 0.97960898850996, -0.94021446752851},
-{-0.99196309146936,  0.67019017358456}, {-0.67684928085260,  0.12631491649378},
-{ 0.09140039465500, -0.20537731453108}, {-0.71658965751996, -0.97788200391224},
-{ 0.81014640078925,  0.53722648362443}, { 0.40616991671205, -0.26469008598449},
-{-0.67680188682972,  0.94502052337695}, { 0.86849774348749, -0.18333598647899},
-{-0.99500381284851, -0.02634122068550}, { 0.84329189340667,  0.10406957462213},
-{-0.09215968531446,  0.69540012101253}, { 0.99956173327206, -0.12358542001404},
-{-0.79732779473535, -0.91582524736159}, { 0.96349973642406,  0.96640458041000},
-{-0.79942778496547,  0.64323902822857}, {-0.11566039853896,  0.28587846253726},
-{-0.39922954514662,  0.94129601616966}, { 0.99089197565987, -0.92062625581587},
-{ 0.28631285179909, -0.91035047143603}, {-0.83302725605608, -0.67330410892084},
-{ 0.95404443402072,  0.49162765398743}, {-0.06449863579434,  0.03250560813135},
-{-0.99575054486311,  0.42389784469507}, {-0.65501142790847,  0.82546114655624},
-{-0.81254441908887, -0.51627234660629}, {-0.99646369485481,  0.84490533520752},
-{ 0.00287840603348,  0.64768261158166}, { 0.70176989408455, -0.20453028573322},
-{ 0.96361882270190,  0.40706967140989}, {-0.68883758192426,  0.91338958840772},
-{-0.34875585502238,  0.71472290693300}, { 0.91980081243087,  0.66507455644919},
-{-0.99009048343881,  0.85868021604848}, { 0.68865791458395,  0.55660316809678},
-{-0.99484402129368, -0.20052559254934}, { 0.94214511408023, -0.99696425367461},
-{-0.67414626793544,  0.49548221180078}, {-0.47339353684664, -0.85904328834047},
-{ 0.14323651387360, -0.94145598222488}, {-0.29268293575672,  0.05759224927952},
-{ 0.43793861458754, -0.78904969892724}, {-0.36345126374441,  0.64874435357162},
-{-0.08750604656825,  0.97686944362527}, {-0.96495267812511, -0.53960305946511},
-{ 0.55526940659947,  0.78891523734774}, { 0.73538215752630,  0.96452072373404},
-{-0.30889773919437, -0.80664389776860}, { 0.03574995626194, -0.97325616900959},
-{ 0.98720684660488,  0.48409133691962}, {-0.81689296271203, -0.90827703628298},
-{ 0.67866860118215,  0.81284503870856}, {-0.15808569732583,  0.85279555024382},
-{ 0.80723395114371, -0.24717418514605}, { 0.47788757329038, -0.46333147839295},
-{ 0.96367554763201,  0.38486749303242}, {-0.99143875716818, -0.24945277239809},
-{ 0.83081876925833, -0.94780851414763}, {-0.58753191905341,  0.01290772389163},
-{ 0.95538108220960, -0.85557052096538}, {-0.96490920476211, -0.64020970923102},
-{-0.97327101028521,  0.12378128133110}, { 0.91400366022124,  0.57972471346930},
-{-0.99925837363824,  0.71084847864067}, {-0.86875903507313, -0.20291699203564},
-{-0.26240034795124, -0.68264554369108}, {-0.24664412953388, -0.87642273115183},
-{ 0.02416275806869,  0.27192914288905}, { 0.82068619590515, -0.85087787994476},
-{ 0.88547373760759, -0.89636802901469}, {-0.18173078152226, -0.26152145156800},
-{ 0.09355476558534,  0.54845123045604}, {-0.54668414224090,  0.95980774020221},
-{ 0.37050990604091, -0.59910140383171}, {-0.70373594262891,  0.91227665827081},
-{-0.34600785879594, -0.99441426144200}, {-0.68774481731008, -0.30238837956299},
-{-0.26843291251234,  0.83115668004362}, { 0.49072334613242, -0.45359708737775},
-{ 0.38975993093975,  0.95515358099121}, {-0.97757125224150,  0.05305894580606},
-{-0.17325552859616, -0.92770672250494}, { 0.99948035025744,  0.58285545563426},
-{-0.64946246527458,  0.68645507104960}, {-0.12016920576437, -0.57147322153312},
-{-0.58947456517751, -0.34847132454388}, {-0.41815140454465,  0.16276422358861},
-{ 0.99885650204884,  0.11136095490444}, {-0.56649614128386, -0.90494866361587},
-{ 0.94138021032330,  0.35281916733018}, {-0.75725076534641,  0.53650549640587},
-{ 0.20541973692630, -0.94435144369918}, { 0.99980371023351,  0.79835913565599},
-{ 0.29078277605775,  0.35393777921520}, {-0.62858772103030,  0.38765693387102},
-{ 0.43440904467688, -0.98546330463232}, {-0.98298583762390,  0.21021524625209},
-{ 0.19513029146934, -0.94239832251867}, {-0.95476662400101,  0.98364554179143},
-{ 0.93379635304810, -0.70881994583682}, {-0.85235410573336, -0.08342347966410},
-{-0.86425093011245, -0.45795025029466}, { 0.38879779059045,  0.97274429344593},
-{ 0.92045124735495, -0.62433652524220}, { 0.89162532251878,  0.54950955570563},
-{-0.36834336949252,  0.96458298020975}, { 0.93891760988045, -0.89968353740388},
-{ 0.99267657565094, -0.03757034316958}, {-0.94063471614176,  0.41332338538963},
-{ 0.99740224117019, -0.16830494996370}, {-0.35899413170555, -0.46633226649613},
-{ 0.05237237274947, -0.25640361602661}, { 0.36703583957424, -0.38653265641875},
-{ 0.91653180367913, -0.30587628726597}, { 0.69000803499316,  0.90952171386132},
-{-0.38658751133527,  0.99501571208985}, {-0.29250814029851,  0.37444994344615},
-{-0.60182204677608,  0.86779651036123}, {-0.97418588163217,  0.96468523666475},
-{ 0.88461574003963,  0.57508405276414}, { 0.05198933055162,  0.21269661669964},
-{-0.53499621979720,  0.97241553731237}, {-0.49429560226497,  0.98183865291903},
-{-0.98935142339139, -0.40249159006933}, {-0.98081380091130, -0.72856895534041},
-{-0.27338148835532,  0.99950922447209}, { 0.06310802338302, -0.54539587529618},
-{-0.20461677199539, -0.14209977628489}, { 0.66223843141647,  0.72528579940326},
-{-0.84764345483665,  0.02372316801261}, {-0.89039863483811,  0.88866581484602},
-{ 0.95903308477986,  0.76744927173873}, { 0.73504123909879, -0.03747203173192},
-{-0.31744434966056, -0.36834111883652}, {-0.34110827591623,  0.40211222807691},
-{ 0.47803883714199, -0.39423219786288}, { 0.98299195879514,  0.01989791390047},
-{-0.30963073129751, -0.18076720599336}, { 0.99992588229018, -0.26281872094289},
-{-0.93149731080767, -0.98313162570490}, { 0.99923472302773, -0.80142993767554},
-{-0.26024169633417, -0.75999759855752}, {-0.35712514743563,  0.19298963768574},
-{-0.99899084509530,  0.74645156992493}, { 0.86557171579452,  0.55593866696299},
-{ 0.33408042438752,  0.86185953874709}, { 0.99010736374716,  0.04602397576623},
-{-0.66694269691195, -0.91643611810148}, { 0.64016792079480,  0.15649530836856},
-{ 0.99570534804836,  0.45844586038111}, {-0.63431466947340,  0.21079116459234},
-{-0.07706847005931, -0.89581437101329}, { 0.98590090577724,  0.88241721133981},
-{ 0.80099335254678, -0.36851896710853}, { 0.78368131392666,  0.45506999802597},
-{ 0.08707806671691,  0.80938994918745}, {-0.86811883080712,  0.39347308654705},
-{-0.39466529740375, -0.66809432114456}, { 0.97875325649683, -0.72467840967746},
-{-0.95038560288864,  0.89563219587625}, { 0.17005239424212,  0.54683053962658},
-{-0.76910792026848, -0.96226617549298}, { 0.99743281016846,  0.42697157037567},
-{ 0.95437383549973,  0.97002324109952}, { 0.99578905365569, -0.54106826257356},
-{ 0.28058259829990, -0.85361420634036}, { 0.85256524470573, -0.64567607735589},
-{-0.50608540105128, -0.65846015480300}, {-0.97210735183243, -0.23095213067791},
-{ 0.95424048234441, -0.99240147091219}, {-0.96926570524023,  0.73775654896574},
-{ 0.30872163214726,  0.41514960556126}, {-0.24523839572639,  0.63206633394807},
-{-0.33813265086024, -0.38661779441897}, {-0.05826828420146, -0.06940774188029},
-{-0.22898461455054,  0.97054853316316}, {-0.18509915019881,  0.47565762892084},
-{-0.10488238045009, -0.87769947402394}, {-0.71886586182037,  0.78030982480538},
-{ 0.99793873738654,  0.90041310491497}, { 0.57563307626120, -0.91034337352097},
-{ 0.28909646383717,  0.96307783970534}, { 0.42188998312520,  0.48148651230437},
-{ 0.93335049681047, -0.43537023883588}, {-0.97087374418267,  0.86636445711364},
-{ 0.36722871286923,  0.65291654172961}, {-0.81093025665696,  0.08778370229363},
-{-0.26240603062237, -0.92774095379098}, { 0.83996497984604,  0.55839849139647},
-{-0.99909615720225, -0.96024605713970}, { 0.74649464155061,  0.12144893606462},
-{-0.74774595569805, -0.26898062008959}, { 0.95781667469567, -0.79047927052628},
-{ 0.95472308713099, -0.08588776019550}, { 0.48708332746299,  0.99999041579432},
-{ 0.46332038247497,  0.10964126185063}, {-0.76497004940162,  0.89210929242238},
-{ 0.57397389364339,  0.35289703373760}, { 0.75374316974495,  0.96705214651335},
-{-0.59174397685714, -0.89405370422752}, { 0.75087906691890, -0.29612672982396},
-{-0.98607857336230,  0.25034911730023}, {-0.40761056640505, -0.90045573444695},
-{ 0.66929266740477,  0.98629493401748}, {-0.97463695257310, -0.00190223301301},
-{ 0.90145509409859,  0.99781390365446}, {-0.87259289048043,  0.99233587353666},
-{-0.91529461447692, -0.15698707534206}, {-0.03305738840705, -0.37205262859764},
-{ 0.07223051368337, -0.88805001733626}, { 0.99498012188353,  0.97094358113387},
-{-0.74904939500519,  0.99985483641521}, { 0.04585228574211,  0.99812337444082},
-{-0.89054954257993, -0.31791913188064}, {-0.83782144651251,  0.97637632547466},
-{ 0.33454804933804, -0.86231516800408}, {-0.99707579362824,  0.93237990079441},
-{-0.22827527843994,  0.18874759397997}, { 0.67248046289143, -0.03646211390569},
-{-0.05146538187944, -0.92599700120679}, { 0.99947295749905,  0.93625229707912},
-{ 0.66951124390363,  0.98905825623893}, {-0.99602956559179, -0.44654715757688},
-{ 0.82104905483590,  0.99540741724928}, { 0.99186510988782,  0.72023001312947},
-{-0.65284592392918,  0.52186723253637}, { 0.93885443798188, -0.74895312615259},
-{ 0.96735248738388,  0.90891816978629}, {-0.22225968841114,  0.57124029781228},
-{-0.44132783753414, -0.92688840659280}, {-0.85694974219574,  0.88844532719844},
-{ 0.91783042091762, -0.46356892383970}, { 0.72556974415690, -0.99899555770747},
-{-0.99711581834508,  0.58211560180426}, { 0.77638976371966,  0.94321834873819},
-{ 0.07717324253925,  0.58638399856595}, {-0.56049829194163,  0.82522301569036},
-{ 0.98398893639988,  0.39467440420569}, { 0.47546946844938,  0.68613044836811},
-{ 0.65675089314631,  0.18331637134880}, { 0.03273375457980, -0.74933109564108},
-{-0.38684144784738,  0.51337349030406}, {-0.97346267944545, -0.96549364384098},
-{-0.53282156061942, -0.91423265091354}, { 0.99817310731176,  0.61133572482148},
-{-0.50254500772635, -0.88829338134294}, { 0.01995873238855,  0.85223515096765},
-{ 0.99930381973804,  0.94578896296649}, { 0.82907767600783, -0.06323442598128},
-{-0.58660709669728,  0.96840773806582}, {-0.17573736667267, -0.48166920859485},
-{ 0.83434292401346, -0.13023450646997}, { 0.05946491307025,  0.20511047074866},
-{ 0.81505484574602, -0.94685947861369}, {-0.44976380954860,  0.40894572671545},
-{-0.89746474625671,  0.99846578838537}, { 0.39677256130792, -0.74854668609359},
-{-0.07588948563079,  0.74096214084170}, { 0.76343198951445,  0.41746629422634},
-{-0.74490104699626,  0.94725911744610}, { 0.64880119792759,  0.41336660830571},
-{ 0.62319537462542, -0.93098313552599}, { 0.42215817594807, -0.07712787385208},
-{ 0.02704554141885, -0.05417518053666}, { 0.80001773566818,  0.91542195141039},
-{-0.79351832348816, -0.36208897989136}, { 0.63872359151636,  0.08128252493444},
-{ 0.52890520960295,  0.60048872455592}, { 0.74238552914587,  0.04491915291044},
-{ 0.99096131449250, -0.19451182854402}, {-0.80412329643109, -0.88513818199457},
-{-0.64612616129736,  0.72198674804544}, { 0.11657770663191, -0.83662833815041},
-{-0.95053182488101, -0.96939905138082}, {-0.62228872928622,  0.82767262846661},
-{ 0.03004475787316, -0.99738896333384}, {-0.97987214341034,  0.36526129686425},
-{-0.99986980746200, -0.36021610299715}, { 0.89110648599879, -0.97894250343044},
-{ 0.10407960510582,  0.77357793811619}, { 0.95964737821728, -0.35435818285502},
-{ 0.50843233159162,  0.96107691266205}, { 0.17006334670615, -0.76854025314829},
-{ 0.25872675063360,  0.99893303933816}, {-0.01115998681937,  0.98496019742444},
-{-0.79598702973261,  0.97138411318894}, {-0.99264708948101, -0.99542822402536},
-{-0.99829663752818,  0.01877138824311}, {-0.70801016548184,  0.33680685948117},
-{-0.70467057786826,  0.93272777501857}, { 0.99846021905254, -0.98725746254433},
-{-0.63364968534650, -0.16473594423746}, {-0.16258217500792, -0.95939125400802},
-{-0.43645594360633, -0.94805030113284}, {-0.99848471702976,  0.96245166923809},
-{-0.16796458968998, -0.98987511890470}, {-0.87979225745213, -0.71725725041680},
-{ 0.44183099021786, -0.93568974498761}, { 0.93310180125532, -0.99913308068246},
-{-0.93941931782002, -0.56409379640356}, {-0.88590003188677,  0.47624600491382},
-{ 0.99971463703691, -0.83889954253462}, {-0.75376385639978,  0.00814643438625},
-{ 0.93887685615875, -0.11284528204636}, { 0.85126435782309,  0.52349251543547},
-{ 0.39701421446381,  0.81779634174316}, {-0.37024464187437, -0.87071656222959},
-{-0.36024828242896,  0.34655735648287}, {-0.93388812549209, -0.84476541096429},
-{-0.65298804552119, -0.18439575450921}, { 0.11960319006843,  0.99899346780168},
-{ 0.94292565553160,  0.83163906518293}, { 0.75081145286948, -0.35533223142265},
-{ 0.56721979748394, -0.24076836414499}, { 0.46857766746029, -0.30140233457198},
-{ 0.97312313923635, -0.99548191630031}, {-0.38299976567017,  0.98516909715427},
-{ 0.41025800019463,  0.02116736935734}, { 0.09638062008048,  0.04411984381457},
-{-0.85283249275397,  0.91475563922421}, { 0.88866808958124, -0.99735267083226},
-{-0.48202429536989, -0.96805608884164}, { 0.27572582416567,  0.58634753335832},
-{-0.65889129659168,  0.58835634138583}, { 0.98838086953732,  0.99994349600236},
-{-0.20651349620689,  0.54593044066355}, {-0.62126416356920, -0.59893681700392},
-{ 0.20320105410437, -0.86879180355289}, {-0.97790548600584,  0.96290806999242},
-{ 0.11112534735126,  0.21484763313301}, {-0.41368337314182,  0.28216837680365},
-{ 0.24133038992960,  0.51294362630238}, {-0.66393410674885, -0.08249679629081},
-{-0.53697829178752, -0.97649903936228}, {-0.97224737889348,  0.22081333579837},
-{ 0.87392477144549, -0.12796173740361}, { 0.19050361015753,  0.01602615387195},
-{-0.46353441212724, -0.95249041539006}, {-0.07064096339021, -0.94479803205886},
-{-0.92444085484466, -0.10457590187436}, {-0.83822593578728, -0.01695043208885},
-{ 0.75214681811150, -0.99955681042665}, {-0.42102998829339,  0.99720941999394},
-{-0.72094786237696, -0.35008961934255}, { 0.78843311019251,  0.52851398958271},
-{ 0.97394027897442, -0.26695944086561}, { 0.99206463477946, -0.57010120849429},
-{ 0.76789609461795, -0.76519356730966}, {-0.82002421836409, -0.73530179553767},
-{ 0.81924990025724,  0.99698425250579}, {-0.26719850873357,  0.68903369776193},
-{-0.43311260380975,  0.85321815947490}, { 0.99194979673836,  0.91876249766422},
-{-0.80692001248487, -0.32627540663214}, { 0.43080003649976, -0.21919095636638},
-{ 0.67709491937357, -0.95478075822906}, { 0.56151770568316, -0.70693811747778},
-{ 0.10831862810749, -0.08628837174592}, { 0.91229417540436, -0.65987351408410},
-{-0.48972893932274,  0.56289246362686}, {-0.89033658689697, -0.71656563987082},
-{ 0.65269447475094,  0.65916004833932}, { 0.67439478141121, -0.81684380846796},
-{-0.47770832416973, -0.16789556203025}, {-0.99715979260878, -0.93565784007648},
-{-0.90889593602546,  0.62034397054380}, {-0.06618622548177, -0.23812217221359},
-{ 0.99430266919728,  0.18812555317553}, { 0.97686402381843, -0.28664534366620},
-{ 0.94813650221268, -0.97506640027128}, {-0.95434497492853, -0.79607978501983},
-{-0.49104783137150,  0.32895214359663}, { 0.99881175120751,  0.88993983831354},
-{ 0.50449166760303, -0.85995072408434}, { 0.47162891065108, -0.18680204049569},
-{-0.62081581361840,  0.75000676218956}, {-0.43867015250812,  0.99998069244322},
-{ 0.98630563232075, -0.53578899600662}, {-0.61510362277374, -0.89515019899997},
-{-0.03841517601843, -0.69888815681179}, {-0.30102157304644, -0.07667808922205},
-{ 0.41881284182683,  0.02188098922282}, {-0.86135454941237,  0.98947480909359},
-{ 0.67226861393788, -0.13494389011014}, {-0.70737398842068, -0.76547349325992},
-{ 0.94044946687963,  0.09026201157416}, {-0.82386352534327,  0.08924768823676},
-{-0.32070666698656,  0.50143421908753}, { 0.57593163224487, -0.98966422921509},
-{-0.36326018419965,  0.07440243123228}, { 0.99979044674350, -0.14130287347405},
-{-0.92366023326932, -0.97979298068180}, {-0.44607178518598, -0.54233252016394},
-{ 0.44226800932956,  0.71326756742752}, { 0.03671907158312,  0.63606389366675},
-{ 0.52175424682195, -0.85396826735705}, {-0.94701139690956, -0.01826348194255},
-{-0.98759606946049,  0.82288714303073}, { 0.87434794743625,  0.89399495655433},
-{-0.93412041758744,  0.41374052024363}, { 0.96063943315511,  0.93116709541280},
-{ 0.97534253457837,  0.86150930812689}, { 0.99642466504163,  0.70190043427512},
-{-0.94705089665984, -0.29580042814306}, { 0.91599807087376, -0.98147830385781},
-{-0.99948153278296, -0.59483417516607}, { 0.97113454393991, -0.67528515225647},
+/* First eight entries repeated at end to simplify SIMD implementations. */
+const DECLARE_ALIGNED(16, INTFLOAT, AAC_RENAME(ff_sbr_noise_table))[][2] = {
+{Q31(-0.99948153278296f), Q31(-0.59483417516607f)}, {Q31( 0.97113454393991f), Q31(-0.67528515225647f)},
+{Q31( 0.14130051758487f), Q31(-0.95090983575689f)}, {Q31(-0.47005496701697f), Q31(-0.37340549728647f)},
+{Q31( 0.80705063769351f), Q31( 0.29653668284408f)}, {Q31(-0.38981478896926f), Q31( 0.89572605717087f)},
+{Q31(-0.01053049862020f), Q31(-0.66959058036166f)}, {Q31(-0.91266367957293f), Q31(-0.11522938140034f)},
+{Q31( 0.54840422910309f), Q31( 0.75221367176302f)}, {Q31( 0.40009252867955f), Q31(-0.98929400334421f)},
+{Q31(-0.99867974711855f), Q31(-0.88147068645358f)}, {Q31(-0.95531076805040f), Q31( 0.90908757154593f)},
+{Q31(-0.45725933317144f), Q31(-0.56716323646760f)}, {Q31(-0.72929675029275f), Q31(-0.98008272727324f)},
+{Q31( 0.75622801399036f), Q31( 0.20950329995549f)}, {Q31( 0.07069442601050f), Q31(-0.78247898470706f)},
+{Q31( 0.74496252926055f), Q31(-0.91169004445807f)}, {Q31(-0.96440182703856f), Q31(-0.94739918296622f)},
+{Q31( 0.30424629369539f), Q31(-0.49438267012479f)}, {Q31( 0.66565033746925f), Q31( 0.64652935542491f)},
+{Q31( 0.91697008020594f), Q31( 0.17514097332009f)}, {Q31(-0.70774918760427f), Q31( 0.52548653416543f)},
+{Q31(-0.70051415345560f), Q31(-0.45340028808763f)}, {Q31(-0.99496513054797f), Q31(-0.90071908066973f)},
+{Q31( 0.98164490790123f), Q31(-0.77463155528697f)}, {Q31(-0.54671580548181f), Q31(-0.02570928536004f)},
+{Q31(-0.01689629065389f), Q31( 0.00287506445732f)}, {Q31(-0.86110349531986f), Q31( 0.42548583726477f)},
+{Q31(-0.98892980586032f), Q31(-0.87881132267556f)}, {Q31( 0.51756627678691f), Q31( 0.66926784710139f)},
+{Q31(-0.99635026409640f), Q31(-0.58107730574765f)}, {Q31(-0.99969370862163f), Q31( 0.98369989360250f)},
+{Q31( 0.55266258627194f), Q31( 0.59449057465591f)}, {Q31( 0.34581177741673f), Q31( 0.94879421061866f)},
+{Q31( 0.62664209577999f), Q31(-0.74402970906471f)}, {Q31(-0.77149701404973f), Q31(-0.33883658042801f)},
+{Q31(-0.91592244254432f), Q31( 0.03687901376713f)}, {Q31(-0.76285492357887f), Q31(-0.91371867919124f)},
+{Q31( 0.79788337195331f), Q31(-0.93180971199849f)}, {Q31( 0.54473080610200f), Q31(-0.11919206037186f)},
+{Q31(-0.85639281671058f), Q31( 0.42429854760451f)}, {Q31(-0.92882402971423f), Q31( 0.27871809078609f)},
+{Q31(-0.11708371046774f), Q31(-0.99800843444966f)}, {Q31( 0.21356749817493f), Q31(-0.90716295627033f)},
+{Q31(-0.76191692573909f), Q31( 0.99768118356265f)}, {Q31( 0.98111043100884f), Q31(-0.95854459734407f)},
+{Q31(-0.85913269895572f), Q31( 0.95766566168880f)}, {Q31(-0.93307242253692f), Q31( 0.49431757696466f)},
+{Q31( 0.30485754879632f), Q31(-0.70540034357529f)}, {Q31( 0.85289650925190f), Q31( 0.46766131791044f)},
+{Q31( 0.91328082618125f), Q31(-0.99839597361769f)}, {Q31(-0.05890199924154f), Q31( 0.70741827819497f)},
+{Q31( 0.28398686150148f), Q31( 0.34633555702188f)}, {Q31( 0.95258164539612f), Q31(-0.54893416026939f)},
+{Q31(-0.78566324168507f), Q31(-0.75568541079691f)}, {Q31(-0.95789495447877f), Q31(-0.20423194696966f)},
+{Q31( 0.82411158711197f), Q31( 0.96654618432562f)}, {Q31(-0.65185446735885f), Q31(-0.88734990773289f)},
+{Q31(-0.93643603134666f), Q31( 0.99870790442385f)}, {Q31( 0.91427159529618f), Q31(-0.98290505544444f)},
+{Q31(-0.70395684036886f), Q31( 0.58796798221039f)}, {Q31( 0.00563771969365f), Q31( 0.61768196727244f)},
+{Q31( 0.89065051931895f), Q31( 0.52783352697585f)}, {Q31(-0.68683707712762f), Q31( 0.80806944710339f)},
+{Q31( 0.72165342518718f), Q31(-0.69259857349564f)}, {Q31(-0.62928247730667f), Q31( 0.13627037407335f)},
+{Q31( 0.29938434065514f), Q31(-0.46051329682246f)}, {Q31(-0.91781958879280f), Q31(-0.74012716684186f)},
+{Q31( 0.99298717043688f), Q31( 0.40816610075661f)}, {Q31( 0.82368298622748f), Q31(-0.74036047190173f)},
+{Q31(-0.98512833386833f), Q31(-0.99972330709594f)}, {Q31(-0.95915368242257f), Q31(-0.99237800466040f)},
+{Q31(-0.21411126572790f), Q31(-0.93424819052545f)}, {Q31(-0.68821476106884f), Q31(-0.26892306315457f)},
+{Q31( 0.91851997982317f), Q31( 0.09358228901785f)}, {Q31(-0.96062769559127f), Q31( 0.36099095133739f)},
+{Q31( 0.51646184922287f), Q31(-0.71373332873917f)}, {Q31( 0.61130721139669f), Q31( 0.46950141175917f)},
+{Q31( 0.47336129371299f), Q31(-0.27333178296162f)}, {Q31( 0.90998308703519f), Q31( 0.96715662938132f)},
+{Q31( 0.44844799194357f), Q31( 0.99211574628306f)}, {Q31( 0.66614891079092f), Q31( 0.96590176169121f)},
+{Q31( 0.74922239129237f), Q31(-0.89879858826087f)}, {Q31(-0.99571588506485f), Q31( 0.52785521494349f)},
+{Q31( 0.97401082477563f), Q31(-0.16855870075190f)}, {Q31( 0.72683747733879f), Q31(-0.48060774432251f)},
+{Q31( 0.95432193457128f), Q31( 0.68849603408441f)}, {Q31(-0.72962208425191f), Q31(-0.76608443420917f)},
+{Q31(-0.85359479233537f), Q31( 0.88738125901579f)}, {Q31(-0.81412430338535f), Q31(-0.97480768049637f)},
+{Q31(-0.87930772356786f), Q31( 0.74748307690436f)}, {Q31(-0.71573331064977f), Q31(-0.98570608178923f)},
+{Q31( 0.83524300028228f), Q31( 0.83702537075163f)}, {Q31(-0.48086065601423f), Q31(-0.98848504923531f)},
+{Q31( 0.97139128574778f), Q31( 0.80093621198236f)}, {Q31( 0.51992825347895f), Q31( 0.80247631400510f)},
+{Q31(-0.00848591195325f), Q31(-0.76670128000486f)}, {Q31(-0.70294374303036f), Q31( 0.55359910445577f)},
+{Q31(-0.95894428168140f), Q31(-0.43265504344783f)}, {Q31( 0.97079252950321f), Q31( 0.09325857238682f)},
+{Q31(-0.92404293670797f), Q31( 0.85507704027855f)}, {Q31(-0.69506469500450f), Q31( 0.98633412625459f)},
+{Q31( 0.26559203620024f), Q31( 0.73314307966524f)}, {Q31( 0.28038443336943f), Q31( 0.14537913654427f)},
+{Q31(-0.74138124825523f), Q31( 0.99310339807762f)}, {Q31(-0.01752795995444f), Q31(-0.82616635284178f)},
+{Q31(-0.55126773094930f), Q31(-0.98898543862153f)}, {Q31( 0.97960898850996f), Q31(-0.94021446752851f)},
+{Q31(-0.99196309146936f), Q31( 0.67019017358456f)}, {Q31(-0.67684928085260f), Q31( 0.12631491649378f)},
+{Q31( 0.09140039465500f), Q31(-0.20537731453108f)}, {Q31(-0.71658965751996f), Q31(-0.97788200391224f)},
+{Q31( 0.81014640078925f), Q31( 0.53722648362443f)}, {Q31( 0.40616991671205f), Q31(-0.26469008598449f)},
+{Q31(-0.67680188682972f), Q31( 0.94502052337695f)}, {Q31( 0.86849774348749f), Q31(-0.18333598647899f)},
+{Q31(-0.99500381284851f), Q31(-0.02634122068550f)}, {Q31( 0.84329189340667f), Q31( 0.10406957462213f)},
+{Q31(-0.09215968531446f), Q31( 0.69540012101253f)}, {Q31( 0.99956173327206f), Q31(-0.12358542001404f)},
+{Q31(-0.79732779473535f), Q31(-0.91582524736159f)}, {Q31( 0.96349973642406f), Q31( 0.96640458041000f)},
+{Q31(-0.79942778496547f), Q31( 0.64323902822857f)}, {Q31(-0.11566039853896f), Q31( 0.28587846253726f)},
+{Q31(-0.39922954514662f), Q31( 0.94129601616966f)}, {Q31( 0.99089197565987f), Q31(-0.92062625581587f)},
+{Q31( 0.28631285179909f), Q31(-0.91035047143603f)}, {Q31(-0.83302725605608f), Q31(-0.67330410892084f)},
+{Q31( 0.95404443402072f), Q31( 0.49162765398743f)}, {Q31(-0.06449863579434f), Q31( 0.03250560813135f)},
+{Q31(-0.99575054486311f), Q31( 0.42389784469507f)}, {Q31(-0.65501142790847f), Q31( 0.82546114655624f)},
+{Q31(-0.81254441908887f), Q31(-0.51627234660629f)}, {Q31(-0.99646369485481f), Q31( 0.84490533520752f)},
+{Q31( 0.00287840603348f), Q31( 0.64768261158166f)}, {Q31( 0.70176989408455f), Q31(-0.20453028573322f)},
+{Q31( 0.96361882270190f), Q31( 0.40706967140989f)}, {Q31(-0.68883758192426f), Q31( 0.91338958840772f)},
+{Q31(-0.34875585502238f), Q31( 0.71472290693300f)}, {Q31( 0.91980081243087f), Q31( 0.66507455644919f)},
+{Q31(-0.99009048343881f), Q31( 0.85868021604848f)}, {Q31( 0.68865791458395f), Q31( 0.55660316809678f)},
+{Q31(-0.99484402129368f), Q31(-0.20052559254934f)}, {Q31( 0.94214511408023f), Q31(-0.99696425367461f)},
+{Q31(-0.67414626793544f), Q31( 0.49548221180078f)}, {Q31(-0.47339353684664f), Q31(-0.85904328834047f)},
+{Q31( 0.14323651387360f), Q31(-0.94145598222488f)}, {Q31(-0.29268293575672f), Q31( 0.05759224927952f)},
+{Q31( 0.43793861458754f), Q31(-0.78904969892724f)}, {Q31(-0.36345126374441f), Q31( 0.64874435357162f)},
+{Q31(-0.08750604656825f), Q31( 0.97686944362527f)}, {Q31(-0.96495267812511f), Q31(-0.53960305946511f)},
+{Q31( 0.55526940659947f), Q31( 0.78891523734774f)}, {Q31( 0.73538215752630f), Q31( 0.96452072373404f)},
+{Q31(-0.30889773919437f), Q31(-0.80664389776860f)}, {Q31( 0.03574995626194f), Q31(-0.97325616900959f)},
+{Q31( 0.98720684660488f), Q31( 0.48409133691962f)}, {Q31(-0.81689296271203f), Q31(-0.90827703628298f)},
+{Q31( 0.67866860118215f), Q31( 0.81284503870856f)}, {Q31(-0.15808569732583f), Q31( 0.85279555024382f)},
+{Q31( 0.80723395114371f), Q31(-0.24717418514605f)}, {Q31( 0.47788757329038f), Q31(-0.46333147839295f)},
+{Q31( 0.96367554763201f), Q31( 0.38486749303242f)}, {Q31(-0.99143875716818f), Q31(-0.24945277239809f)},
+{Q31( 0.83081876925833f), Q31(-0.94780851414763f)}, {Q31(-0.58753191905341f), Q31( 0.01290772389163f)},
+{Q31( 0.95538108220960f), Q31(-0.85557052096538f)}, {Q31(-0.96490920476211f), Q31(-0.64020970923102f)},
+{Q31(-0.97327101028521f), Q31( 0.12378128133110f)}, {Q31( 0.91400366022124f), Q31( 0.57972471346930f)},
+{Q31(-0.99925837363824f), Q31( 0.71084847864067f)}, {Q31(-0.86875903507313f), Q31(-0.20291699203564f)},
+{Q31(-0.26240034795124f), Q31(-0.68264554369108f)}, {Q31(-0.24664412953388f), Q31(-0.87642273115183f)},
+{Q31( 0.02416275806869f), Q31( 0.27192914288905f)}, {Q31( 0.82068619590515f), Q31(-0.85087787994476f)},
+{Q31( 0.88547373760759f), Q31(-0.89636802901469f)}, {Q31(-0.18173078152226f), Q31(-0.26152145156800f)},
+{Q31( 0.09355476558534f), Q31( 0.54845123045604f)}, {Q31(-0.54668414224090f), Q31( 0.95980774020221f)},
+{Q31( 0.37050990604091f), Q31(-0.59910140383171f)}, {Q31(-0.70373594262891f), Q31( 0.91227665827081f)},
+{Q31(-0.34600785879594f), Q31(-0.99441426144200f)}, {Q31(-0.68774481731008f), Q31(-0.30238837956299f)},
+{Q31(-0.26843291251234f), Q31( 0.83115668004362f)}, {Q31( 0.49072334613242f), Q31(-0.45359708737775f)},
+{Q31( 0.38975993093975f), Q31( 0.95515358099121f)}, {Q31(-0.97757125224150f), Q31( 0.05305894580606f)},
+{Q31(-0.17325552859616f), Q31(-0.92770672250494f)}, {Q31( 0.99948035025744f), Q31( 0.58285545563426f)},
+{Q31(-0.64946246527458f), Q31( 0.68645507104960f)}, {Q31(-0.12016920576437f), Q31(-0.57147322153312f)},
+{Q31(-0.58947456517751f), Q31(-0.34847132454388f)}, {Q31(-0.41815140454465f), Q31( 0.16276422358861f)},
+{Q31( 0.99885650204884f), Q31( 0.11136095490444f)}, {Q31(-0.56649614128386f), Q31(-0.90494866361587f)},
+{Q31( 0.94138021032330f), Q31( 0.35281916733018f)}, {Q31(-0.75725076534641f), Q31( 0.53650549640587f)},
+{Q31( 0.20541973692630f), Q31(-0.94435144369918f)}, {Q31( 0.99980371023351f), Q31( 0.79835913565599f)},
+{Q31( 0.29078277605775f), Q31( 0.35393777921520f)}, {Q31(-0.62858772103030f), Q31( 0.38765693387102f)},
+{Q31( 0.43440904467688f), Q31(-0.98546330463232f)}, {Q31(-0.98298583762390f), Q31( 0.21021524625209f)},
+{Q31( 0.19513029146934f), Q31(-0.94239832251867f)}, {Q31(-0.95476662400101f), Q31( 0.98364554179143f)},
+{Q31( 0.93379635304810f), Q31(-0.70881994583682f)}, {Q31(-0.85235410573336f), Q31(-0.08342347966410f)},
+{Q31(-0.86425093011245f), Q31(-0.45795025029466f)}, {Q31( 0.38879779059045f), Q31( 0.97274429344593f)},
+{Q31( 0.92045124735495f), Q31(-0.62433652524220f)}, {Q31( 0.89162532251878f), Q31( 0.54950955570563f)},
+{Q31(-0.36834336949252f), Q31( 0.96458298020975f)}, {Q31( 0.93891760988045f), Q31(-0.89968353740388f)},
+{Q31( 0.99267657565094f), Q31(-0.03757034316958f)}, {Q31(-0.94063471614176f), Q31( 0.41332338538963f)},
+{Q31( 0.99740224117019f), Q31(-0.16830494996370f)}, {Q31(-0.35899413170555f), Q31(-0.46633226649613f)},
+{Q31( 0.05237237274947f), Q31(-0.25640361602661f)}, {Q31( 0.36703583957424f), Q31(-0.38653265641875f)},
+{Q31( 0.91653180367913f), Q31(-0.30587628726597f)}, {Q31( 0.69000803499316f), Q31( 0.90952171386132f)},
+{Q31(-0.38658751133527f), Q31( 0.99501571208985f)}, {Q31(-0.29250814029851f), Q31( 0.37444994344615f)},
+{Q31(-0.60182204677608f), Q31( 0.86779651036123f)}, {Q31(-0.97418588163217f), Q31( 0.96468523666475f)},
+{Q31( 0.88461574003963f), Q31( 0.57508405276414f)}, {Q31( 0.05198933055162f), Q31( 0.21269661669964f)},
+{Q31(-0.53499621979720f), Q31( 0.97241553731237f)}, {Q31(-0.49429560226497f), Q31( 0.98183865291903f)},
+{Q31(-0.98935142339139f), Q31(-0.40249159006933f)}, {Q31(-0.98081380091130f), Q31(-0.72856895534041f)},
+{Q31(-0.27338148835532f), Q31( 0.99950922447209f)}, {Q31( 0.06310802338302f), Q31(-0.54539587529618f)},
+{Q31(-0.20461677199539f), Q31(-0.14209977628489f)}, {Q31( 0.66223843141647f), Q31( 0.72528579940326f)},
+{Q31(-0.84764345483665f), Q31( 0.02372316801261f)}, {Q31(-0.89039863483811f), Q31( 0.88866581484602f)},
+{Q31( 0.95903308477986f), Q31( 0.76744927173873f)}, {Q31( 0.73504123909879f), Q31(-0.03747203173192f)},
+{Q31(-0.31744434966056f), Q31(-0.36834111883652f)}, {Q31(-0.34110827591623f), Q31( 0.40211222807691f)},
+{Q31( 0.47803883714199f), Q31(-0.39423219786288f)}, {Q31( 0.98299195879514f), Q31( 0.01989791390047f)},
+{Q31(-0.30963073129751f), Q31(-0.18076720599336f)}, {Q31( 0.99992588229018f), Q31(-0.26281872094289f)},
+{Q31(-0.93149731080767f), Q31(-0.98313162570490f)}, {Q31( 0.99923472302773f), Q31(-0.80142993767554f)},
+{Q31(-0.26024169633417f), Q31(-0.75999759855752f)}, {Q31(-0.35712514743563f), Q31( 0.19298963768574f)},
+{Q31(-0.99899084509530f), Q31( 0.74645156992493f)}, {Q31( 0.86557171579452f), Q31( 0.55593866696299f)},
+{Q31( 0.33408042438752f), Q31( 0.86185953874709f)}, {Q31( 0.99010736374716f), Q31( 0.04602397576623f)},
+{Q31(-0.66694269691195f), Q31(-0.91643611810148f)}, {Q31( 0.64016792079480f), Q31( 0.15649530836856f)},
+{Q31( 0.99570534804836f), Q31( 0.45844586038111f)}, {Q31(-0.63431466947340f), Q31( 0.21079116459234f)},
+{Q31(-0.07706847005931f), Q31(-0.89581437101329f)}, {Q31( 0.98590090577724f), Q31( 0.88241721133981f)},
+{Q31( 0.80099335254678f), Q31(-0.36851896710853f)}, {Q31( 0.78368131392666f), Q31( 0.45506999802597f)},
+{Q31( 0.08707806671691f), Q31( 0.80938994918745f)}, {Q31(-0.86811883080712f), Q31( 0.39347308654705f)},
+{Q31(-0.39466529740375f), Q31(-0.66809432114456f)}, {Q31( 0.97875325649683f), Q31(-0.72467840967746f)},
+{Q31(-0.95038560288864f), Q31( 0.89563219587625f)}, {Q31( 0.17005239424212f), Q31( 0.54683053962658f)},
+{Q31(-0.76910792026848f), Q31(-0.96226617549298f)}, {Q31( 0.99743281016846f), Q31( 0.42697157037567f)},
+{Q31( 0.95437383549973f), Q31( 0.97002324109952f)}, {Q31( 0.99578905365569f), Q31(-0.54106826257356f)},
+{Q31( 0.28058259829990f), Q31(-0.85361420634036f)}, {Q31( 0.85256524470573f), Q31(-0.64567607735589f)},
+{Q31(-0.50608540105128f), Q31(-0.65846015480300f)}, {Q31(-0.97210735183243f), Q31(-0.23095213067791f)},
+{Q31( 0.95424048234441f), Q31(-0.99240147091219f)}, {Q31(-0.96926570524023f), Q31( 0.73775654896574f)},
+{Q31( 0.30872163214726f), Q31( 0.41514960556126f)}, {Q31(-0.24523839572639f), Q31( 0.63206633394807f)},
+{Q31(-0.33813265086024f), Q31(-0.38661779441897f)}, {Q31(-0.05826828420146f), Q31(-0.06940774188029f)},
+{Q31(-0.22898461455054f), Q31( 0.97054853316316f)}, {Q31(-0.18509915019881f), Q31( 0.47565762892084f)},
+{Q31(-0.10488238045009f), Q31(-0.87769947402394f)}, {Q31(-0.71886586182037f), Q31( 0.78030982480538f)},
+{Q31( 0.99793873738654f), Q31( 0.90041310491497f)}, {Q31( 0.57563307626120f), Q31(-0.91034337352097f)},
+{Q31( 0.28909646383717f), Q31( 0.96307783970534f)}, {Q31( 0.42188998312520f), Q31( 0.48148651230437f)},
+{Q31( 0.93335049681047f), Q31(-0.43537023883588f)}, {Q31(-0.97087374418267f), Q31( 0.86636445711364f)},
+{Q31( 0.36722871286923f), Q31( 0.65291654172961f)}, {Q31(-0.81093025665696f), Q31( 0.08778370229363f)},
+{Q31(-0.26240603062237f), Q31(-0.92774095379098f)}, {Q31( 0.83996497984604f), Q31( 0.55839849139647f)},
+{Q31(-0.99909615720225f), Q31(-0.96024605713970f)}, {Q31( 0.74649464155061f), Q31( 0.12144893606462f)},
+{Q31(-0.74774595569805f), Q31(-0.26898062008959f)}, {Q31( 0.95781667469567f), Q31(-0.79047927052628f)},
+{Q31( 0.95472308713099f), Q31(-0.08588776019550f)}, {Q31( 0.48708332746299f), Q31( 0.99999041579432f)},
+{Q31( 0.46332038247497f), Q31( 0.10964126185063f)}, {Q31(-0.76497004940162f), Q31( 0.89210929242238f)},
+{Q31( 0.57397389364339f), Q31( 0.35289703373760f)}, {Q31( 0.75374316974495f), Q31( 0.96705214651335f)},
+{Q31(-0.59174397685714f), Q31(-0.89405370422752f)}, {Q31( 0.75087906691890f), Q31(-0.29612672982396f)},
+{Q31(-0.98607857336230f), Q31( 0.25034911730023f)}, {Q31(-0.40761056640505f), Q31(-0.90045573444695f)},
+{Q31( 0.66929266740477f), Q31( 0.98629493401748f)}, {Q31(-0.97463695257310f), Q31(-0.00190223301301f)},
+{Q31( 0.90145509409859f), Q31( 0.99781390365446f)}, {Q31(-0.87259289048043f), Q31( 0.99233587353666f)},
+{Q31(-0.91529461447692f), Q31(-0.15698707534206f)}, {Q31(-0.03305738840705f), Q31(-0.37205262859764f)},
+{Q31( 0.07223051368337f), Q31(-0.88805001733626f)}, {Q31( 0.99498012188353f), Q31( 0.97094358113387f)},
+{Q31(-0.74904939500519f), Q31( 0.99985483641521f)}, {Q31( 0.04585228574211f), Q31( 0.99812337444082f)},
+{Q31(-0.89054954257993f), Q31(-0.31791913188064f)}, {Q31(-0.83782144651251f), Q31( 0.97637632547466f)},
+{Q31( 0.33454804933804f), Q31(-0.86231516800408f)}, {Q31(-0.99707579362824f), Q31( 0.93237990079441f)},
+{Q31(-0.22827527843994f), Q31( 0.18874759397997f)}, {Q31( 0.67248046289143f), Q31(-0.03646211390569f)},
+{Q31(-0.05146538187944f), Q31(-0.92599700120679f)}, {Q31( 0.99947295749905f), Q31( 0.93625229707912f)},
+{Q31( 0.66951124390363f), Q31( 0.98905825623893f)}, {Q31(-0.99602956559179f), Q31(-0.44654715757688f)},
+{Q31( 0.82104905483590f), Q31( 0.99540741724928f)}, {Q31( 0.99186510988782f), Q31( 0.72023001312947f)},
+{Q31(-0.65284592392918f), Q31( 0.52186723253637f)}, {Q31( 0.93885443798188f), Q31(-0.74895312615259f)},
+{Q31( 0.96735248738388f), Q31( 0.90891816978629f)}, {Q31(-0.22225968841114f), Q31( 0.57124029781228f)},
+{Q31(-0.44132783753414f), Q31(-0.92688840659280f)}, {Q31(-0.85694974219574f), Q31( 0.88844532719844f)},
+{Q31( 0.91783042091762f), Q31(-0.46356892383970f)}, {Q31( 0.72556974415690f), Q31(-0.99899555770747f)},
+{Q31(-0.99711581834508f), Q31( 0.58211560180426f)}, {Q31( 0.77638976371966f), Q31( 0.94321834873819f)},
+{Q31( 0.07717324253925f), Q31( 0.58638399856595f)}, {Q31(-0.56049829194163f), Q31( 0.82522301569036f)},
+{Q31( 0.98398893639988f), Q31( 0.39467440420569f)}, {Q31( 0.47546946844938f), Q31( 0.68613044836811f)},
+{Q31( 0.65675089314631f), Q31( 0.18331637134880f)}, {Q31( 0.03273375457980f), Q31(-0.74933109564108f)},
+{Q31(-0.38684144784738f), Q31( 0.51337349030406f)}, {Q31(-0.97346267944545f), Q31(-0.96549364384098f)},
+{Q31(-0.53282156061942f), Q31(-0.91423265091354f)}, {Q31( 0.99817310731176f), Q31( 0.61133572482148f)},
+{Q31(-0.50254500772635f), Q31(-0.88829338134294f)}, {Q31( 0.01995873238855f), Q31( 0.85223515096765f)},
+{Q31( 0.99930381973804f), Q31( 0.94578896296649f)}, {Q31( 0.82907767600783f), Q31(-0.06323442598128f)},
+{Q31(-0.58660709669728f), Q31( 0.96840773806582f)}, {Q31(-0.17573736667267f), Q31(-0.48166920859485f)},
+{Q31( 0.83434292401346f), Q31(-0.13023450646997f)}, {Q31( 0.05946491307025f), Q31( 0.20511047074866f)},
+{Q31( 0.81505484574602f), Q31(-0.94685947861369f)}, {Q31(-0.44976380954860f), Q31( 0.40894572671545f)},
+{Q31(-0.89746474625671f), Q31( 0.99846578838537f)}, {Q31( 0.39677256130792f), Q31(-0.74854668609359f)},
+{Q31(-0.07588948563079f), Q31( 0.74096214084170f)}, {Q31( 0.76343198951445f), Q31( 0.41746629422634f)},
+{Q31(-0.74490104699626f), Q31( 0.94725911744610f)}, {Q31( 0.64880119792759f), Q31( 0.41336660830571f)},
+{Q31( 0.62319537462542f), Q31(-0.93098313552599f)}, {Q31( 0.42215817594807f), Q31(-0.07712787385208f)},
+{Q31( 0.02704554141885f), Q31(-0.05417518053666f)}, {Q31( 0.80001773566818f), Q31( 0.91542195141039f)},
+{Q31(-0.79351832348816f), Q31(-0.36208897989136f)}, {Q31( 0.63872359151636f), Q31( 0.08128252493444f)},
+{Q31( 0.52890520960295f), Q31( 0.60048872455592f)}, {Q31( 0.74238552914587f), Q31( 0.04491915291044f)},
+{Q31( 0.99096131449250f), Q31(-0.19451182854402f)}, {Q31(-0.80412329643109f), Q31(-0.88513818199457f)},
+{Q31(-0.64612616129736f), Q31( 0.72198674804544f)}, {Q31( 0.11657770663191f), Q31(-0.83662833815041f)},
+{Q31(-0.95053182488101f), Q31(-0.96939905138082f)}, {Q31(-0.62228872928622f), Q31( 0.82767262846661f)},
+{Q31( 0.03004475787316f), Q31(-0.99738896333384f)}, {Q31(-0.97987214341034f), Q31( 0.36526129686425f)},
+{Q31(-0.99986980746200f), Q31(-0.36021610299715f)}, {Q31( 0.89110648599879f), Q31(-0.97894250343044f)},
+{Q31( 0.10407960510582f), Q31( 0.77357793811619f)}, {Q31( 0.95964737821728f), Q31(-0.35435818285502f)},
+{Q31( 0.50843233159162f), Q31( 0.96107691266205f)}, {Q31( 0.17006334670615f), Q31(-0.76854025314829f)},
+{Q31( 0.25872675063360f), Q31( 0.99893303933816f)}, {Q31(-0.01115998681937f), Q31( 0.98496019742444f)},
+{Q31(-0.79598702973261f), Q31( 0.97138411318894f)}, {Q31(-0.99264708948101f), Q31(-0.99542822402536f)},
+{Q31(-0.99829663752818f), Q31( 0.01877138824311f)}, {Q31(-0.70801016548184f), Q31( 0.33680685948117f)},
+{Q31(-0.70467057786826f), Q31( 0.93272777501857f)}, {Q31( 0.99846021905254f), Q31(-0.98725746254433f)},
+{Q31(-0.63364968534650f), Q31(-0.16473594423746f)}, {Q31(-0.16258217500792f), Q31(-0.95939125400802f)},
+{Q31(-0.43645594360633f), Q31(-0.94805030113284f)}, {Q31(-0.99848471702976f), Q31( 0.96245166923809f)},
+{Q31(-0.16796458968998f), Q31(-0.98987511890470f)}, {Q31(-0.87979225745213f), Q31(-0.71725725041680f)},
+{Q31( 0.44183099021786f), Q31(-0.93568974498761f)}, {Q31( 0.93310180125532f), Q31(-0.99913308068246f)},
+{Q31(-0.93941931782002f), Q31(-0.56409379640356f)}, {Q31(-0.88590003188677f), Q31( 0.47624600491382f)},
+{Q31( 0.99971463703691f), Q31(-0.83889954253462f)}, {Q31(-0.75376385639978f), Q31( 0.00814643438625f)},
+{Q31( 0.93887685615875f), Q31(-0.11284528204636f)}, {Q31( 0.85126435782309f), Q31( 0.52349251543547f)},
+{Q31( 0.39701421446381f), Q31( 0.81779634174316f)}, {Q31(-0.37024464187437f), Q31(-0.87071656222959f)},
+{Q31(-0.36024828242896f), Q31( 0.34655735648287f)}, {Q31(-0.93388812549209f), Q31(-0.84476541096429f)},
+{Q31(-0.65298804552119f), Q31(-0.18439575450921f)}, {Q31( 0.11960319006843f), Q31( 0.99899346780168f)},
+{Q31( 0.94292565553160f), Q31( 0.83163906518293f)}, {Q31( 0.75081145286948f), Q31(-0.35533223142265f)},
+{Q31( 0.56721979748394f), Q31(-0.24076836414499f)}, {Q31( 0.46857766746029f), Q31(-0.30140233457198f)},
+{Q31( 0.97312313923635f), Q31(-0.99548191630031f)}, {Q31(-0.38299976567017f), Q31( 0.98516909715427f)},
+{Q31( 0.41025800019463f), Q31( 0.02116736935734f)}, {Q31( 0.09638062008048f), Q31( 0.04411984381457f)},
+{Q31(-0.85283249275397f), Q31( 0.91475563922421f)}, {Q31( 0.88866808958124f), Q31(-0.99735267083226f)},
+{Q31(-0.48202429536989f), Q31(-0.96805608884164f)}, {Q31( 0.27572582416567f), Q31( 0.58634753335832f)},
+{Q31(-0.65889129659168f), Q31( 0.58835634138583f)}, {Q31( 0.98838086953732f), Q31( 0.99994349600236f)},
+{Q31(-0.20651349620689f), Q31( 0.54593044066355f)}, {Q31(-0.62126416356920f), Q31(-0.59893681700392f)},
+{Q31( 0.20320105410437f), Q31(-0.86879180355289f)}, {Q31(-0.97790548600584f), Q31( 0.96290806999242f)},
+{Q31( 0.11112534735126f), Q31( 0.21484763313301f)}, {Q31(-0.41368337314182f), Q31( 0.28216837680365f)},
+{Q31( 0.24133038992960f), Q31( 0.51294362630238f)}, {Q31(-0.66393410674885f), Q31(-0.08249679629081f)},
+{Q31(-0.53697829178752f), Q31(-0.97649903936228f)}, {Q31(-0.97224737889348f), Q31( 0.22081333579837f)},
+{Q31( 0.87392477144549f), Q31(-0.12796173740361f)}, {Q31( 0.19050361015753f), Q31( 0.01602615387195f)},
+{Q31(-0.46353441212724f), Q31(-0.95249041539006f)}, {Q31(-0.07064096339021f), Q31(-0.94479803205886f)},
+{Q31(-0.92444085484466f), Q31(-0.10457590187436f)}, {Q31(-0.83822593578728f), Q31(-0.01695043208885f)},
+{Q31( 0.75214681811150f), Q31(-0.99955681042665f)}, {Q31(-0.42102998829339f), Q31( 0.99720941999394f)},
+{Q31(-0.72094786237696f), Q31(-0.35008961934255f)}, {Q31( 0.78843311019251f), Q31( 0.52851398958271f)},
+{Q31( 0.97394027897442f), Q31(-0.26695944086561f)}, {Q31( 0.99206463477946f), Q31(-0.57010120849429f)},
+{Q31( 0.76789609461795f), Q31(-0.76519356730966f)}, {Q31(-0.82002421836409f), Q31(-0.73530179553767f)},
+{Q31( 0.81924990025724f), Q31( 0.99698425250579f)}, {Q31(-0.26719850873357f), Q31( 0.68903369776193f)},
+{Q31(-0.43311260380975f), Q31( 0.85321815947490f)}, {Q31( 0.99194979673836f), Q31( 0.91876249766422f)},
+{Q31(-0.80692001248487f), Q31(-0.32627540663214f)}, {Q31( 0.43080003649976f), Q31(-0.21919095636638f)},
+{Q31( 0.67709491937357f), Q31(-0.95478075822906f)}, {Q31( 0.56151770568316f), Q31(-0.70693811747778f)},
+{Q31( 0.10831862810749f), Q31(-0.08628837174592f)}, {Q31( 0.91229417540436f), Q31(-0.65987351408410f)},
+{Q31(-0.48972893932274f), Q31( 0.56289246362686f)}, {Q31(-0.89033658689697f), Q31(-0.71656563987082f)},
+{Q31( 0.65269447475094f), Q31( 0.65916004833932f)}, {Q31( 0.67439478141121f), Q31(-0.81684380846796f)},
+{Q31(-0.47770832416973f), Q31(-0.16789556203025f)}, {Q31(-0.99715979260878f), Q31(-0.93565784007648f)},
+{Q31(-0.90889593602546f), Q31( 0.62034397054380f)}, {Q31(-0.06618622548177f), Q31(-0.23812217221359f)},
+{Q31( 0.99430266919728f), Q31( 0.18812555317553f)}, {Q31( 0.97686402381843f), Q31(-0.28664534366620f)},
+{Q31( 0.94813650221268f), Q31(-0.97506640027128f)}, {Q31(-0.95434497492853f), Q31(-0.79607978501983f)},
+{Q31(-0.49104783137150f), Q31( 0.32895214359663f)}, {Q31( 0.99881175120751f), Q31( 0.88993983831354f)},
+{Q31( 0.50449166760303f), Q31(-0.85995072408434f)}, {Q31( 0.47162891065108f), Q31(-0.18680204049569f)},
+{Q31(-0.62081581361840f), Q31( 0.75000676218956f)}, {Q31(-0.43867015250812f), Q31( 0.99998069244322f)},
+{Q31( 0.98630563232075f), Q31(-0.53578899600662f)}, {Q31(-0.61510362277374f), Q31(-0.89515019899997f)},
+{Q31(-0.03841517601843f), Q31(-0.69888815681179f)}, {Q31(-0.30102157304644f), Q31(-0.07667808922205f)},
+{Q31( 0.41881284182683f), Q31( 0.02188098922282f)}, {Q31(-0.86135454941237f), Q31( 0.98947480909359f)},
+{Q31( 0.67226861393788f), Q31(-0.13494389011014f)}, {Q31(-0.70737398842068f), Q31(-0.76547349325992f)},
+{Q31( 0.94044946687963f), Q31( 0.09026201157416f)}, {Q31(-0.82386352534327f), Q31( 0.08924768823676f)},
+{Q31(-0.32070666698656f), Q31( 0.50143421908753f)}, {Q31( 0.57593163224487f), Q31(-0.98966422921509f)},
+{Q31(-0.36326018419965f), Q31( 0.07440243123228f)}, {Q31( 0.99979044674350f), Q31(-0.14130287347405f)},
+{Q31(-0.92366023326932f), Q31(-0.97979298068180f)}, {Q31(-0.44607178518598f), Q31(-0.54233252016394f)},
+{Q31( 0.44226800932956f), Q31( 0.71326756742752f)}, {Q31( 0.03671907158312f), Q31( 0.63606389366675f)},
+{Q31( 0.52175424682195f), Q31(-0.85396826735705f)}, {Q31(-0.94701139690956f), Q31(-0.01826348194255f)},
+{Q31(-0.98759606946049f), Q31( 0.82288714303073f)}, {Q31( 0.87434794743625f), Q31( 0.89399495655433f)},
+{Q31(-0.93412041758744f), Q31( 0.41374052024363f)}, {Q31( 0.96063943315511f), Q31( 0.93116709541280f)},
+{Q31( 0.97534253457837f), Q31( 0.86150930812689f)}, {Q31( 0.99642466504163f), Q31( 0.70190043427512f)},
+{Q31(-0.94705089665984f), Q31(-0.29580042814306f)}, {Q31( 0.91599807087376f), Q31(-0.98147830385781f)},
+// Start of duplicated table
+{Q31(-0.99948153278296f), Q31(-0.59483417516607f)}, {Q31( 0.97113454393991f), Q31(-0.67528515225647f)},
+{Q31( 0.14130051758487f), Q31(-0.95090983575689f)}, {Q31(-0.47005496701697f), Q31(-0.37340549728647f)},
+{Q31( 0.80705063769351f), Q31( 0.29653668284408f)}, {Q31(-0.38981478896926f), Q31( 0.89572605717087f)},
+{Q31(-0.01053049862020f), Q31(-0.66959058036166f)}, {Q31(-0.91266367957293f), Q31(-0.11522938140034f)},
 };
 
 #endif /* AVCODEC_AACSBRDATA_H */
diff --git a/libavcodec/aactab.c b/libavcodec/aactab.c
index 9f1e8af..77d8732 100644
--- a/libavcodec/aactab.c
+++ b/libavcodec/aactab.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2005-2006 Oded Shimon ( ods15 ods15 dyndns org )
  * Copyright (c) 2006-2007 Maxim Gavrilov ( maxim.gavrilov gmail com )
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,12 +29,16 @@
 
 #include "libavutil/mem.h"
 #include "aac.h"
-#include "aac_tablegen.h"
 
 #include <stdint.h>
 
+float ff_aac_pow2sf_tab[428];
+float ff_aac_pow34sf_tab[428];
+
 DECLARE_ALIGNED(32, float,  ff_aac_kbd_long_1024)[1024];
 DECLARE_ALIGNED(32, float,  ff_aac_kbd_short_128)[128];
+DECLARE_ALIGNED(32, int,    ff_aac_kbd_long_1024_fixed)[1024];
+DECLARE_ALIGNED(32, int,    ff_aac_kbd_short_128_fixed)[128];
 
 const uint8_t ff_aac_num_swb_1024[] = {
     41, 41, 47, 49, 49, 51, 47, 47, 43, 43, 43, 40, 40
@@ -1767,6 +1771,490 @@ const DECLARE_ALIGNED(32, float, ff_aac_eld_window_512)[1920] = {
     -0.00111144, -0.00109764, -0.00108377, -0.00106989,
 };
 
+/* Q30 representation of ff_aac_eld_window_512 table */
+const DECLARE_ALIGNED(32, int, ff_aac_eld_window_512_fixed)[1920] = {
+    0x003783ba, 0x005d04f4, 0x008ae226, 0x00c02021,
+    0x00fb1804, 0x013a30a8, 0x017be9e6, 0x01bf296c,
+    0x02033204, 0x0247502c, 0x028adab0, 0x02cd9568,
+    0x030fa980, 0x03513dc0, 0x03927274, 0x03d363e0,
+    0x04142e40, 0x0454edc0, 0x0495bd48, 0x04d6a060,
+    0x051786d8, 0x05586548, 0x059935e8, 0x05d9feb0,
+    0x061acea0, 0x065bb680, 0x069cc800, 0x06de13f0,
+    0x071fa748, 0x07618b80, 0x07a3c7a8, 0x07e66da0,
+    0x082999d0, 0x086d6590, 0x08b1e640, 0x08f72850,
+    0x093d3120, 0x09840550, 0x09cba880, 0x0a1415f0,
+    0x0a5d41b0, 0x0aa720d0, 0x0af1a9a0, 0x0b3cce70,
+    0x0b887ec0, 0x0bd4ac10, 0x0c214a70, 0x0c6e5130,
+    0x0cbbba50, 0x0d098130, 0x0d57a240, 0x0da61a60,
+    0x0df4e620, 0x0e4401d0, 0x0e9369f0, 0x0ee31de0,
+    0x0f332000, 0x0f837180, 0x0fd412a0, 0x10250260,
+    0x10763f20, 0x10c7c660, 0x11199560, 0x116baa00,
+    0x11be0400, 0x1210a1c0, 0x12638180, 0x12b69ee0,
+    0x1309f3e0, 0x135d7ac0, 0x13b12dc0, 0x1404ffa0,
+    0x1458dd40, 0x14acb720, 0x15008120, 0x15543260,
+    0x15a7c460, 0x15fb3160, 0x164e7520, 0x16a193c0,
+    0x16f49740, 0x17478720, 0x179a6720, 0x17ed3720,
+    0x183ff460, 0x18929c20, 0x18e52b00, 0x19379c00,
+    0x1989e900, 0x19dc0ca0, 0x1a2e0280, 0x1a7fc400,
+    0x1ad14a00, 0x1b228ec0, 0x1b738ea0, 0x1bc44540,
+    0x1c14ada0, 0x1c64c380, 0x1cb48440, 0x1d03f420,
+    0x1d531c00, 0x1da20160, 0x1df0a660, 0x1e3f0860,
+    0x1e8d2340, 0x1edaf340, 0x1f2875e0, 0x1f75a700,
+    0x1fc281e0, 0x200f0380, 0x205b2ac0, 0x20a6f980,
+    0x20f27200, 0x213d9600, 0x21886580, 0x21d2e040,
+    0x221d0640, 0x2266d6c0, 0x22b05180, 0x22f97580,
+    0x23424280, 0x238ab880, 0x23d2d780, 0x241aa040,
+    0x246213c0, 0x24a93300, 0x24efff80, 0x25367b40,
+    0x256f68c0, 0x25b53580, 0x25faa580, 0x263fb940,
+    0x26847080, 0x26c8cbc0, 0x270ccb00, 0x27506e40,
+    0x2793b600, 0x27d6a200, 0x281932c0, 0x285b6880,
+    0x289d4400, 0x28dec5c0, 0x291feec0, 0x2960bf80,
+    0x29a137c0, 0x29e15800, 0x2a212000, 0x2a609080,
+    0x2a9fa980, 0x2ade6b40, 0x2b1cd600, 0x2b5aea00,
+    0x2b98a740, 0x2bd60d80, 0x2c131cc0, 0x2c4fd500,
+    0x2c8c3600, 0x2cc83f00, 0x2d03f040, 0x2d3f48c0,
+    0x2d7a48c0, 0x2db4ef40, 0x2def3c40, 0x2e292ec0,
+    0x2e62c700, 0x2e9c0400, 0x2ed4e580, 0x2f0d6ac0,
+    0x2f4592c0, 0x2f7d5c80, 0x2fb4c6c0, 0x2febd140,
+    0x30227b40, 0x3058c400, 0x308eab40, 0x30c43040,
+    0x30f95100, 0x312e0d00, 0x31626240, 0x31965040,
+    0x31c9d5c0, 0x31fcf240, 0x322fa480, 0x3261ec00,
+    0x3293c7c0, 0x32c53680, 0x32f63780, 0x3326c9c0,
+    0x3356ec00, 0x33869d00, 0x33b5db80, 0x33e4a700,
+    0x3412fdc0, 0x3440df40, 0x346e4a80, 0x349b3e40,
+    0x34c7ba00, 0x34f3bd80, 0x351f47c0, 0x354a5840,
+    0x3574ee40, 0x359f0900, 0x35c8a840, 0x35f1cb80,
+    0x361a71c0, 0x36429a80, 0x366a4580, 0x36917280,
+    0x36b82100, 0x36de5180, 0x37040340, 0x372936c0,
+    0x374dec40, 0x37722340, 0x3795dc40, 0x37b91780,
+    0x37dbd600, 0x37fe18c0, 0x381fe080, 0x38412e00,
+    0x38620280, 0x38825f40, 0x38a24540, 0x38c1b680,
+    0x38e0b5c0, 0x38ff4540, 0x391d6800, 0x393b20c0,
+    0x39587280, 0x39755fc0, 0x3991eb80, 0x39ae1a80,
+    0x39c9f280, 0x39e57980, 0x3a00b600, 0x3a1bae00,
+    0x3a366800, 0x3a50e9c0, 0x3a6b3a40, 0x3a8560c0,
+    0x3a9f6640, 0x3ab95400, 0x3ad332c0, 0x3aed0680,
+    0x3b06cf80, 0x3b208d40, 0x3b3a3e80, 0x3b53cb80,
+    0x3b6d0780, 0x3b85c380, 0x3b9dd0c0, 0x3bb4eb40,
+    0x3bcabac0, 0x3bdee680, 0x3bf11680, 0x3c011440,
+    0x3c179ac0, 0x3c1c4f00, 0x3c21aa40, 0x3c278880,
+    0x3c2dba80, 0x3c341140, 0x3c3a5e80, 0x3c409100,
+    0x3c46b480, 0x3c4cd5c0, 0x3c530180, 0x3c593cc0,
+    0x3c5f84c0, 0x3c65d640, 0x3c6c2e40, 0x3c728b40,
+    0x3c78ee80, 0x3c7f5840, 0x3c85c940, 0x3c8c4240,
+    0x3c92c380, 0x3c994cc0, 0x3c9fde40, 0x3ca67880,
+    0x3cad1ac0, 0x3cb3c540, 0x3cba7800, 0x3cc132c0,
+    0x3cc7f640, 0x3ccec280, 0x3cd59800, 0x3cdc76c0,
+    0x3ce35e80, 0x3cea4f00, 0x3cf147c0, 0x3cf84900,
+    0x3cff5340, 0x3d0666c0, 0x3d0d8400, 0x3d14ab40,
+    0x3d1bdc00, 0x3d2315c0, 0x3d2a5880, 0x3d31a440,
+    0x3d38f900, 0x3d405780, 0x3d47c040, 0x3d4f3300,
+    0x3d56af40, 0x3d5e3500, 0x3d65c380, 0x3d6d5ac0,
+    0x3d74fb40, 0x3d7ca540, 0x3d845900, 0x3d8c1680,
+    0x3d93dd00, 0x3d9bac80, 0x3da38400, 0x3dab6400,
+    0x3db34c80, 0x3dbb3dc0, 0x3dc33840, 0x3dcb3bc0,
+    0x3dd347c0, 0x3ddb5bc0, 0x3de37780, 0x3deb9b00,
+    0x3df3c600, 0x3dfbf940, 0x3e0434c0, 0x3e0c7840,
+    0x3e14c3c0, 0x3e1d1640, 0x3e256f80, 0x3e2dcf40,
+    0x3e363580, 0x3e3ea300, 0x3e4717c0, 0x3e4f9380,
+    0x3e581600, 0x3e609e40, 0x3e692c40, 0x3e71bf80,
+    0x3e7a5840, 0x3e82f740, 0x3e8b9c40, 0x3e944700,
+    0x3e9cf780, 0x3ea5ad00, 0x3eae66c0, 0x3eb72500,
+    0x3ebfe780, 0x3ec8af00, 0x3ed17b80, 0x3eda4d00,
+    0x3ee32340, 0x3eebfd40, 0x3ef4dac0, 0x3efdbbc0,
+    0x3f06a040, 0x3f0f88c0, 0x3f187540, 0x3f216600,
+    0x3f2a5a80, 0x3f335200, 0x3f3c4c40, 0x3f454940,
+    0x3f4e4940, 0x3f574c80, 0x3f605340, 0x3f695dc0,
+    0x3f726b40, 0x3f7b7b40, 0x3f848dc0, 0x3f8da240,
+    0x3f96b940, 0x3f9fd300, 0x3fa8f040, 0x3fb21080,
+    0x3fbb33c0, 0x3fc459c0, 0x3fcd81c0, 0x3fd6abc0,
+    0x3fdfd780, 0x3fe90480, 0x3ff23280, 0x3ffb6100,
+    0x40049f80, 0x400dd080, 0x40170400, 0x40203880,
+    0x40296f00, 0x4032a600, 0x403bde00, 0x40451680,
+    0x404e4f00, 0x40578700, 0x4060be80, 0x4069f500,
+    0x40732b80, 0x407c6280, 0x40859980, 0x408ed100,
+    0x40980800, 0x40a13f00, 0x40aa7500, 0x40b3a980,
+    0x40bcdd80, 0x40c61180, 0x40cf4500, 0x40d87800,
+    0x40e1ab00, 0x40eadc80, 0x40f40c80, 0x40fd3a80,
+    0x41066700, 0x410f9300, 0x4118bd80, 0x4121e700,
+    0x412b0f80, 0x41343580, 0x413d5880, 0x41467980,
+    0x414f9780, 0x4158b380, 0x4161cd80, 0x416ae580,
+    0x4173fb00, 0x417d0d00, 0x41861b80, 0x418f2600,
+    0x41982c80, 0x41a12f80, 0x41aa3000, 0x41b32c80,
+    0x41bc2580, 0x41c51a00, 0x41ce0900, 0x41d6f300,
+    0x41dfd800, 0x41e8b880, 0x41f19400, 0x41fa6b80,
+    0x42033d00, 0x420c0900, 0x4214cf00, 0x421d8e00,
+    0x42264680, 0x422ef980, 0x4237a680, 0x42404d80,
+    0x4248ee00, 0x42518780, 0x425a1a00, 0x4262a480,
+    0x426b2800, 0x4273a400, 0x427c1980, 0x42848880,
+    0x428cef80, 0x42954f00, 0x429da680, 0x42a5f500,
+    0x42ae3b80, 0x42b67a00, 0x42beb100, 0x42c6e080,
+    0x42cf0780, 0x42d72680, 0x42df3c00, 0x42e74880,
+    0x42ef4c80, 0x42f74880, 0x42ff3c80, 0x43072880,
+    0x430f0c80, 0x4316e800, 0x431eba00, 0x43268380,
+    0x432e4480, 0x4335fd00, 0x433dae80, 0x43455800,
+    0x434cfa00, 0x43549400, 0x435c2500, 0x4363ad80,
+    0x436b2e00, 0x4372a700, 0x437a1800, 0x43818200,
+    0x4388e400, 0x43903f00, 0x43979200, 0x439edd00,
+    0x43a62080, 0x43ad5c80, 0x43b49180, 0x43bbbf80,
+    0x43c2e800, 0x43ca0b00, 0x43d12980, 0x43d84280,
+    0x43df5200, 0x43e65500, 0x43ed4800, 0x43f43080,
+    0x43fb1c80, 0x44021b80, 0x44093a00, 0x44106480,
+    0x44176700, 0x441e0c00, 0x44241e00, 0x44297380,
+    0x4425dc00, 0x44240180, 0x441ff300, 0x4419e300,
+    0x44123f80, 0x44097500, 0x43ffe900, 0x43f5e700,
+    0x43eb9f00, 0x43e13f00, 0x43d6f200, 0x43ccbd80,
+    0x43c28400, 0x43b82780, 0x43ad8b00, 0x43a29c80,
+    0x43975180, 0x438ba080, 0x437f8180, 0x4372fd00,
+    0x43662b00, 0x43592480, 0x434c0000, 0x433ecd00,
+    0x43319180, 0x43245300, 0x43171700, 0x4309da80,
+    0x42fc9300, 0x42ef3500, 0x42e1b600, 0x42d40280,
+    0x42c60000, 0x42b79300, 0x42a8a180, 0x42991a00,
+    0x4288f200, 0x42782100, 0x42669e00, 0x42546880,
+    0x42418800, 0x422e0480, 0x4219e500, 0x42053680,
+    0x41f00980, 0x41da7080, 0x41c47b00, 0x41ae3600,
+    0x4197ab80, 0x4180e400, 0x4169e780, 0x4152bb00,
+    0x413b5e80, 0x4123d180, 0x410c1480, 0x40f42100,
+    0x40dbed00, 0x40c36c80, 0x40aa9600, 0x40915f80,
+    0x4077c100, 0x405db280, 0x40432c80, 0x40282580,
+    0x400c9280, 0x3ff068c0, 0x3fd39dc0, 0x3fb62bc0,
+    0x3f981200, 0x3f795080, 0x3f59e780, 0x3f39ebc0,
+    0x3f198680, 0x3ef8e100, 0x3ed82440, 0x3eb76c80,
+    0x3e96c940, 0x3e764900, 0x3e55f980, 0x3e35cb00,
+    0x3e1590c0, 0x3df51cc0, 0x3dd44200, 0x3db2e640,
+    0x3d910200, 0x3d6e8e40, 0x3d4b8480, 0x3d27e600,
+    0x3d03bc00, 0x3cdf0fc0, 0x3cb9eb80, 0x3c946240,
+    0x3c6e9180, 0x3c489700, 0x3c229000, 0x3bfc95c0,
+    0x3bd6bd00, 0x3bb11a80, 0x3b8bc180, 0x3b669bc0,
+    0x3b416a00, 0x3b1beb80, 0x3af5e140, 0x3acf3300,
+    0x3aa7ef80, 0x3a802780, 0x3a57eb80, 0x3a2f5880,
+    0x3a069640, 0x39ddcd40, 0x39b524c0, 0x398ca540,
+    0x39643800, 0x393bc540, 0x39133580, 0x38ea7ac0,
+    0x38c19040, 0x389871c0, 0x386f1b40, 0x38458e00,
+    0x381bd000, 0x37f1e780, 0x37c7db00, 0x379db080,
+    0x37736e80, 0x37491b00, 0x371ebcc0, 0x36f45980,
+    0x36c96600, 0x369ed300, 0x36740380, 0x3648ffc0,
+    0x361dcf40, 0x35f27a00, 0x35c70780, 0x359b7f80,
+    0x356fe9c0, 0x35444dc0, 0x3518b280, 0x34ed1940,
+    0x34c17c00, 0x3495d4c0, 0x346a1d40, 0x343e4300,
+    0x34122840, 0x33e5ae00, 0x33b8b780, 0x338b4dc0,
+    0x335d9f00, 0x332fdc00, 0x33023440, 0x32d4cc40,
+    0x32a7bc80, 0x327b1d40, 0x324f04c0, 0x32235280,
+    0x31f7b100, 0x31cbc7c0, 0x319f4140, 0x3171fb40,
+    0x31440840, 0x31157d00, 0x30e66e80, 0x30b6fc40,
+    0x30875080, 0x30579600, 0x3027f700, 0x2ff89140,
+    0x2fc976c0, 0x2f9ab880, 0x2f6c6780, 0x2f3e8780,
+    0x2f111000, 0x2ee3f800, 0x2eb73480, 0x2e8a9840,
+    0x2e5dd340, 0x2e3093c0, 0x2e028ac0, 0x2dd39680,
+    0x2da3c480, 0x2d732380, 0x2d41c400, 0x2d0fd300,
+    0x2cdd9ac0, 0x2cab6640, 0x2c797f00, 0x2c480d40,
+    0x2c171700, 0x2be6a0c0, 0x2bb6ae80, 0x2b8739c0,
+    0x2b583200, 0x2b298600, 0x2afb2400, 0x2accfa40,
+    0x2a9ef500, 0x2a710100, 0x2a430ac0, 0x2a14f9c0,
+    0x29e6b0c0, 0x29b81240, 0x29890140, 0x29596900,
+    0x29293e00, 0x28f87500, 0x28c70340, 0x2894efc0,
+    0x28625140, 0x282f4040, 0x27fbd5c0, 0x27c83540,
+    0x27948ec0, 0x27611240, 0x272def80, 0x26fb4cc0,
+    0x26c94780, 0x2697fcc0, 0x26678880, 0x2637f740,
+    0x26094540, 0x25db6dc0, 0x25ae6b40, 0x25821680,
+    0x255627c0, 0x252a55c0, 0x24fe5680, 0x24d1db40,
+    0x24a48fc0, 0x24761f40, 0x244637c0, 0x2414c900,
+    0x23e20240, 0x23ae1740, 0x23793bc0, 0x2343cc00,
+    0x230e4ac0, 0x22d93c80, 0x22a52400, 0x22725180,
+    0x2240e480, 0x2210f9c0, 0x21e2ab40, 0x21b5c7c0,
+    0x2189d2c0, 0x215e4d40, 0x2132b900, 0x2106ba80,
+    0x20da1940, 0x20ac9d80, 0x207e11c0, 0x204e77c0,
+    0x201e0880, 0x1fecfea0, 0x1fbb94e0, 0x1f8a0500,
+    0x1f59d340, 0x1f27ac20, 0x1ef67c60, 0x1ec64e40,
+    0x1e96fdc0, 0x1e686400, 0x1e3a5a00, 0x1e0cae80,
+    0x1ddf25e0, 0x1db18460, 0x1d839020, 0x1d5536e0,
+    0x1d268e80, 0x1cf7ae60, 0x1cc8aea0, 0x1c99af00,
+    0x1c6ad820, 0x1c3c5280, 0x1c0e4500, 0x1be0ab60,
+    0x1bb35620, 0x1b861400, 0x1b58b480, 0x1b2b1a00,
+    0x1afd39c0, 0x1acf09a0, 0x1aa080c0, 0x1a71b020,
+    0x1a42c2a0, 0x1a13e420, 0x19e53fc0, 0x19b6eb00,
+    0x1988e620, 0x195b3060, 0x192dc8a0, 0x1900a8a0,
+    0x18d3c4e0, 0x18a711e0, 0x187a83e0, 0x184e10e0,
+    0x1821b060, 0x17f55a00, 0x17c90580, 0x179cb100,
+    0x177060a0, 0x17441880, 0x1717dd20, 0x16ebb080,
+    0x16bf9260, 0x169382e0, 0x166781c0, 0x163b8f80,
+    0x160fade0, 0x15e3de40, 0x15b82220, 0x158c7ae0,
+    0x1560ea80, 0x15357240, 0x150a1400, 0x14ded020,
+    0x14b3a640, 0x148895a0, 0x145d9dc0, 0x1432bde0,
+    0x1407f540, 0x13dd4380, 0x13b2a860, 0x13882460,
+    0x135db880, 0x133365a0, 0x13092cc0, 0x12df0e60,
+    0x12b50aa0, 0x128b2120, 0x12615200, 0x12379da0,
+    0x120e04c0, 0x11e48820, 0x11bb2860, 0x1191e600,
+    0x1168c080, 0x113fb7a0, 0x1116cb40, 0x10edfba0,
+    0x10c54a00, 0x109cb7a0, 0x10744560, 0x104bf420,
+    0x1023c3e0, 0x0ffbb500, 0x0fd3c790, 0x0fabfbe0,
+    0x0f845290, 0x0f5ccc40, 0x0f356970, 0x0f0e2a60,
+    0x0ee70eb0, 0x0ec01610, 0x0e994040, 0x0e728d50,
+    0x0e4bfdf0, 0x0e2592c0, 0x0dff4c70, 0x0dd92af0,
+    0x0db32da0, 0x0d8d53e0, 0x0d679cf0, 0x0d420880,
+    0x0d1c9680, 0x0cf74700, 0x0cd219f0, 0x0cad0eb0,
+    0x0c882450, 0x0c6359a0, 0x0c3ead90, 0x0c1a1f80,
+    0x0bf5af40, 0x0bd15cf0, 0x0bad2870, 0x0b891440,
+    0x0b652530, 0x0b416020, 0x0b1dca30, 0x0afa6810,
+    0x0ad73ee0, 0x0ab45370, 0x0a91aac0, 0x0a6f49b0,
+    0x0a4da7f0, 0x0a2c7e20, 0x0a0ba310, 0x09eb1220,
+    0x09cac6e0, 0x09aabc70, 0x098aee40, 0x096b57a0,
+    0x094bf400, 0x092cbea0, 0x090db2e0, 0x08eecef0,
+    0x08d01360, 0x08b18110, 0x089318b0, 0x0874db00,
+    0x0856c880, 0x0838e1b0, 0x081b2730, 0x07fd99a8,
+    0x07e03a28, 0x07c309a8, 0x07a60910, 0x07893918,
+    0x076c99d0, 0x07502b90, 0x0733ee70, 0x0717e2f8,
+    0x06fc09b8, 0x06e06378, 0x06c4f0b8, 0x06a9b1c8,
+    0x068ea6a0, 0x0673cf18, 0x06592b18, 0x063ebad0,
+    0x06247ed0, 0x060a7780, 0x05f0a570, 0x05d708b8,
+    0x05bda128, 0x05a46e80, 0x058b7078, 0x0572a740,
+    0x055a1330, 0x0541b4d8, 0x05298c98, 0x05119a88,
+    0x04f9de50, 0x04e257a0, 0x04cb0630, 0x04b3ea00,
+    0x049d0378, 0x04865308, 0x046fd918, 0x045995a8,
+    0x04438860, 0x042db0d0, 0x04180ea0, 0x0402a1d0,
+    0x03ed6abc, 0x03d869b8, 0x03c39f28, 0x03af0af0,
+    0x039aaca0, 0x038683b4, 0x03728fc0, 0x035ed0b0,
+    0x034b46c4, 0x0337f254, 0x0324d3a0, 0x0311eab0,
+    0x02ff370c, 0x02ecb85c, 0x02da6e34, 0x02c858a8,
+    0x02b67820, 0x02a4cd28, 0x02935820, 0x02821920,
+    0x02710fac, 0x02603b54, 0x024f9bb4, 0x023f308c,
+    0x022ef9e8, 0x021ef7c8, 0x020f2a40, 0x01ff908e,
+    0x01f02974, 0x01e0f38a, 0x01d1ed94, 0x01c316d6,
+    0x01b46f5e, 0x01a5f720, 0x0197ae28, 0x018994ea,
+    0x017bac54, 0x016df546, 0x016070ae, 0x01532078,
+    0x01460760, 0x01392834, 0x012c85a4, 0x01201f7a,
+    0x0113f27c, 0x0107fb6c, 0x00fc36fd, 0x00f0a2d5,
+    0x00e53d51, 0x00da050f, 0x00cef88c, 0x00c41869,
+    0x00b9671f, 0x00aee754, 0x00a49b80, 0x009a8384,
+    0x00909ca6, 0x0086e400, 0x007d56e3, 0x0073f48e,
+    0x006abe70, 0x0061b5de, 0x0058dc65, 0x005033b4,
+    0x0047be30, 0x003f7e30, 0x00377619, 0x002fa4d4,
+    0x002805ee, 0x002094cb, 0x00194cb8, 0x00122856,
+    0x000b215c, 0x00043148, 0xfffd51f0, 0xfff683a0,
+    0xffefcd4d, 0xffe9362f, 0xffe2c57d, 0xffdc855c,
+    0xffd682c4, 0xffd0cad4, 0xffcb6a2c, 0xffc663bc,
+    0xffc1b06f, 0xffbd48e1, 0xffb92570, 0xffb53a54,
+    0xffb1779c, 0xffadcd38, 0xffaa2b42, 0xffa68855,
+    0xffa2e141, 0xff9f332c, 0xff9b7b9c, 0xff97bf2e,
+    0xff9409e2, 0xff9067e2, 0xff8ce556, 0xff898bf0,
+    0xff866306, 0xff8371d0, 0xff80bf63, 0xff7e4eba,
+    0xff7c1eaa, 0xff7a2e04, 0xff787b47, 0xff770280,
+    0xff75bd06, 0xff74a3f7, 0xff73b0b2, 0xff72dd02,
+    0xff72237e, 0xff717ebe, 0xff70e94c, 0xff705f59,
+    0xff6fde6a, 0xff6f6426, 0xff6eee40, 0xff6e7d0b,
+    0xff6e1359, 0xff6db403, 0xff6d61f8, 0xff6d2054,
+    0xff6cf267, 0xff6cdb76, 0xff6cdebb, 0xff6cff47,
+    0xff6d3fc9, 0xff6da306, 0xff6e2b82, 0xff6eda13,
+    0xff6fad6d, 0xff70a463, 0xff71bd9d, 0xff72f662,
+    0xff744a80, 0xff75b5c4, 0xff773409, 0xff78c0a6,
+    0xff7a5693, 0xff7bf0dc, 0xff7d8abb, 0xff7f2301,
+    0xff80bc08, 0xff825854, 0xff83fa56, 0xff85a55c,
+    0xff875d22, 0xff892598, 0xff8b025d, 0xff8cf53c,
+    0xff8efdf4, 0xff911c48, 0xff934fc9, 0xff959675,
+    0xff97ec86, 0xff9a4e35, 0xff9cb7d2, 0xff9f26cc,
+    0xffa199ce, 0xffa40f74, 0xffa6867c, 0xffa8feb2,
+    0xffab78e0, 0xffadf5c7, 0xffb07640, 0xffb2fba0,
+    0xffb587a2, 0xffb81bfb, 0xffbaba46, 0xffbd6236,
+    0xffc011a8, 0xffc2c679, 0xffc57e84, 0xffc83894,
+    0xffcaf41a, 0xffcdb0b8, 0xffd06e17, 0xffd32bf7,
+    0xffd5ea38, 0xffd8a8c3, 0xffdb6764, 0xffde25fb,
+    0xffe0e471, 0xffe3a2b2, 0xffe66087, 0xffe91da6,
+    0xffebd978, 0xffee9351, 0xfff14ab0, 0xfff3fef6,
+    0xfff6af94, 0xfff95c0c, 0xfffc03c7, 0xfffea659,
+    0x00015885, 0x0003f2e9, 0x00068a73, 0x00091e8d,
+    0x000bae7f, 0x000e39bf, 0x0010bf96, 0x00133f78,
+    0x0015b8c4, 0x00182ae4, 0x001a9558, 0x001cf7b2,
+    0x001f51e0, 0x0021a3b4, 0x0023ed25, 0x00262df2,
+    0x002865c5, 0x002a9469, 0x002cb967, 0x002ed4aa,
+    0x0030e607, 0x0032ed88, 0x0034eb2f, 0x0036de23,
+    0x0038c503, 0x003a9e4c, 0x003c68a6, 0x003e23dd,
+    0x003fd0db, 0x00417083, 0x0043038b, 0x00448adf,
+    0x00460740, 0x0047799c, 0x0048e2b2, 0x004a42af,
+    0x004b98fb, 0x004ce50b, 0x004e2654, 0x004f5b5d,
+    0x005081c3, 0x00519716, 0x00529920, 0x005386d0,
+    0x0054603f, 0x00552581, 0x0055d6cc, 0x00567558,
+    0x0057033c, 0x005782b4, 0x0057f5b6, 0x00585e46,
+    0x0058be68, 0x005917ff, 0x00596ce4, 0x0059bcc0,
+    0x005a053a, 0x005a43ee, 0x005a76ae, 0x005a9b37,
+    0x005aaf38, 0x005ab07a, 0x005a9cef, 0x005a7349,
+    0x005a3328, 0x0059dc0a, 0x00596db0, 0x0058e8e5,
+    0x00584f98, 0x0057a3c0, 0x0056e738, 0x00561bec,
+    0x005543df, 0x0054610b, 0x0053753e, 0x0052824e,
+    0x005189f6, 0x00508dec, 0x004f8fc0, 0x004e8fd0,
+    0x004d8d26, 0x004c86d7, 0x004b7c0a, 0x004a6b33,
+    0x00495239, 0x00482f0e, 0x0046ffc4, 0x0045c201,
+    0x00447337, 0x004310cc, 0x00419871, 0x004008e4,
+    0x003e6231, 0x003ca460, 0x003acf8a, 0x0038e57a,
+    0x0036e981, 0x0034defa, 0x0032c94b, 0x0030acc6,
+    0x002e8eb4, 0x002c7452, 0x002a62aa, 0x00285bbf,
+    0x00265eda, 0x00246b24, 0x00227f9c, 0x002098e7,
+    0x001eb13b, 0x001cc2ef, 0x001ac899, 0x0018be3d,
+    0x0016a198, 0x00147065, 0x00122897, 0x000fcbc5,
+    0x000d5f03, 0x000ae77a, 0x00086a52, 0x0005eb92,
+    0x00036e4a, 0x0000f57e, 0xfffe8414, 0xfffc1a78,
+    0xfff9b6bb, 0xfff756d9, 0xfff4f8d0, 0xfff29add,
+    0xfff03b87, 0xffedd94c, 0xffeb7295, 0xffe9072b,
+    0xffe6981a, 0xffe4265b, 0xffe1b30e, 0xffdf3f2b,
+    0xffdccb9e, 0xffda5993, 0xffd7ea0c, 0xffd57d60,
+    0xffd31302, 0xffd0aa27, 0xffce4243, 0xffcbdb40,
+    0xffc97595, 0xffc711a2, 0xffc4af9d, 0xffc24fa6,
+    0xffbff1de, 0xffbd9699, 0xffbb3e44, 0xffb8e8d5,
+    0xffb695f4, 0xffb44522, 0xffb1f627, 0xffafa8f0,
+    0xffad5d91, 0xffab140a, 0xffa8cc1c, 0xffa68590,
+    0xffa44066, 0xffa1fca0, 0xff9fba30, 0xff9d7902,
+    0xff9b3916, 0xff98fa6d, 0xff96bd06, 0xff9480b6,
+    0xff924532, 0xff900a24, 0xff8dcf41, 0xff8b9433,
+    0xff895884, 0xff871bd3, 0xff84dd8a, 0xff829d34,
+    0xff805a43, 0xff7e142d, 0xff7bca71, 0xff797c83,
+    0xff7729e3, 0xff74d204, 0xff727451, 0xff70101e,
+    0xff6da493, 0xff6b30d1, 0xff68b3f4, 0xff662d31,
+    0xff639bd1, 0xff60ff09, 0xff5e562c, 0xff5ba3e0,
+    0xff58ee39, 0xff563c22, 0xff5394f3, 0xff50fd1e,
+    0xff4e7599, 0xff4bff32, 0xff499ad4, 0xff47490a,
+    0xff450a36, 0xff42deb7, 0xff40c6cf, 0xff3ec2be,
+    0xff3cd299, 0xff3af681, 0xff392e6a, 0xff377a4a,
+    0xff35d9f7, 0xff344d44, 0xff32d3e8, 0xff316d96,
+    0xff3019d9, 0xff2ed83a, 0xff2da82f, 0xff2c88bf,
+    0xff2b78b4, 0xff2a76cc, 0xff298184, 0xff289890,
+    0xff27bc7d, 0xff26ee21, 0xff262e28, 0xff257cdc,
+    0xff24d9f4, 0xff244524, 0xff23be15, 0xff234488,
+    0xff22d852, 0xff227947, 0xff22273d, 0xff21e1d2,
+    0xff21a871, 0xff217a79, 0xff215748, 0xff213eca,
+    0xff21319e, 0xff21305c, 0xff213baf, 0xff2153c2,
+    0xff21782b, 0xff21a892, 0xff21e477, 0xff222bda,
+    0xff227f26, 0xff22debd, 0xff234b09, 0xff23c394,
+    0xff24471d, 0xff24d42b, 0xff25695c, 0xff260538,
+    0xff26a652, 0xff274b28, 0xff27f22d, 0xff2899d2,
+    0xff295975, 0xff29f2ad, 0xff2a96d7, 0xff2b45f4,
+    0xff2bffe3, 0xff2cc4ba, 0xff2d9458, 0xff2e6ede,
+    0xff2f544c, 0xff3044b7, 0xff314034, 0xff3246fa,
+    0xff33591e, 0xff3476e0, 0xff35a060, 0xff36d534,
+    0xff38148f, 0xff395daf, 0xff3aafd4, 0xff3c0ac8,
+    0xff3d6ed6, 0xff3edc54, 0xff405382, 0xff41d3f5,
+    0xff435ccc, 0xff44ed0f, 0xff4683d3, 0xff482080,
+    0xff49c297, 0xff4b69ab, 0xff4d1547, 0xff4ec4f5,
+    0xff50781d, 0xff522e20, 0xff53e692, 0xff55a15d,
+    0xff575f17, 0xff592022, 0xff5ae4de, 0xff5cacb4,
+    0xff5e75e2, 0xff603ee5, 0xff62062f, 0xff63caab,
+    0xff658b55, 0xff67476d, 0xff68fe11, 0xff6aaea0,
+    0xff6c5899, 0xff6dfb86, 0xff6f96e7, 0xff712a65,
+    0xff72b59f, 0xff74382b, 0xff75b1d3, 0xff772276,
+    0xff788a20, 0xff79e8e5, 0xff7b3ef0, 0xff7c8c98,
+    0xff7dd249, 0xff7f108c, 0xff804804, 0xff817d0e,
+    0xff82b74a, 0xff83fde6, 0xff855762, 0xff86c622,
+    0xff884904, 0xff89ded1, 0xff8b8646, 0xff8d3e4c,
+    0xff8f05cc, 0xff90dbc6, 0xff92bf2a, 0xff94af04,
+    0xff96aa26, 0xff98af9a, 0xff9abe48, 0xff9cd543,
+    0xff9ef3c1, 0xffa118ea, 0xffa343fd, 0xffa57423,
+    0xffa7a890, 0xffa9e084, 0xffac1b31, 0xffae5802,
+    0xffb09680, 0xffb2d621, 0xffb51678, 0xffb75704,
+    0xffb99726, 0xffbbd645, 0xffbe13d7, 0xffc04f26,
+    0xffc2879a, 0xffc4bc72, 0xffc6ed24, 0xffc918e3,
+    0xffcb3eb8, 0xffcd5dcc, 0xffcf7549, 0xffd184d8,
+    0xffd38c8f, 0xffd58ca4, 0xffd7854d, 0xffd97694,
+    0xffdb606e, 0xffdd42d1, 0xffdf1da8, 0xffe0f09b,
+    0xffe2bb00, 0xffe47c41, 0xffe633c6, 0xffe7e150,
+    0xffe98534, 0xffeb1fb4, 0xffecb10e, 0xffee3944,
+    0xffefb7e9, 0xfff12cbe, 0xfff29762, 0xfff3f789,
+    0xfff54cbe, 0xfff69695, 0xfff7d4b8, 0xfff90748,
+    0xfffa2ee5, 0xfffb4c3c, 0xfffc6003, 0xfffd6af0,
+    0xfffe6dda, 0xffff69b8, 0x00005f4b, 0x00014e7f,
+    0x00023646, 0x000315b4, 0x0003ebd3, 0x0004b74a,
+    0x00057677, 0x000627e2, 0x0006ca09, 0x00075ce1,
+    0x0007e196, 0x00085955, 0x0008c556, 0x00092751,
+    0x00098153, 0x0009d581, 0x000a25be, 0x000a732b,
+    0x000abe1f, 0x000b06e4, 0x000b4db1, 0x000b91fa,
+    0x000bd266, 0x000c0da0, 0x000c426e, 0x000c6ffb,
+    0x000c95b0, 0x000cb2f7, 0x000cc76e, 0x000cd317,
+    0x000cd647, 0x000cd17f, 0x000cc52b, 0x000cb1ea,
+    0x000c98c0, 0x000c7a62, 0x000c57c7, 0x000c3187,
+    0x000c0862, 0x000bdcd8, 0x000baf81, 0x000b80c7,
+    0x000b50ec, 0x000b202f, 0x000aeec6, 0x000abcb2,
+    0x000a89d2, 0x000a5605, 0x000a2116, 0x0009eafb,
+    0x0009b37d, 0x00097a9d, 0x00094030, 0x00090440,
+    0x0008c6b9, 0x000887ae, 0x0008470c, 0x00080512,
+    0x0007c1f6, 0x00077df9, 0x0007395a, 0x0006f45b,
+    0x0006af67, 0x00066abe, 0x000626b6, 0x0005e38f,
+    0x0005a1a0, 0x0005611e, 0x00052234, 0x0004e502,
+    0x0004a95d, 0x00046f46, 0x00043691, 0x0003ff33,
+    0x0003c90d, 0x0003941f, 0x00036047, 0x00032d9c,
+    0x0002fc1e, 0x0002cbed, 0x00029d1e, 0x00026fbc,
+    0x000243f2, 0x000219d6, 0x0001f17d, 0x0001caf1,
+    0x0001a63e, 0x00018363, 0x00016256, 0x00014316,
+    0x0001258f, 0x000109cb, 0x0000efaa, 0x0000d720,
+    0x0000c03a, 0x0000aacb, 0x000096de, 0x0000846a,
+    0x0000736d, 0x000063d3, 0x000055a6, 0x000048d0,
+    0x00003d47, 0x000032f6, 0x000029dc, 0x000021d9,
+    0x00001ae3, 0x000014ee, 0x00000fdb, 0x00000ba9,
+    0x00000839, 0x00000589, 0x00000370, 0x000001ee,
+    0x000000d7, 0x00000036, 0xffffffe0, 0xffffffc0,
+    0xffffffd5, 0xfffffff5, 0x0000000b, 0x0000000b,
+    0x0000000b, 0x0000000b, 0xfffffff5, 0xffffffd5,
+    0xffffffca, 0xffffffe0, 0x00000036, 0x000000d7,
+    0x000001ce, 0x0000033b, 0x00000529, 0x000007ad,
+    0x00000ac8, 0x00000e99, 0x00001316, 0x0000185e,
+    0x00001e7e, 0x00002575, 0x00002d4c, 0x0000361b,
+    0x00003fd6, 0x00004a93, 0x00005647, 0x00006312,
+    0x000070de, 0x00007fad, 0x00008f87, 0x0000a064,
+    0x0000b242, 0x0000c52d, 0x0000d919, 0x0000ee12,
+    0x0001040c, 0x00011b13, 0x0001331b, 0x00014c30,
+    0x0001663c, 0x0001814a, 0x00019d4f, 0x0001ba35,
+    0x0001d7e7, 0x0001f645, 0x00021544, 0x000234c3,
+    0x000254b9, 0x00027505, 0x000295a7, 0x0002b67e,
+    0x0002d7a1, 0x0002f904, 0x00031ab2, 0x00033ca0,
+    0x00035ee5, 0x0003818a, 0x0003a485, 0x0003c7e1,
+    0x0003eb72, 0x00040f0e, 0x0004329f, 0x000455e6,
+    0x000478c0, 0x00049aef, 0x0004bc52, 0x0004dca9,
+    0x0004fbde, 0x000519c5, 0x00053635, 0x0005512d,
+    0x00056aae, 0x000582a1, 0x00059927, 0x0005ae40,
+    0x0005c1f6, 0x0005d455, 0x0005e572, 0x0005f56d,
+    0x00060446, 0x0006121e, 0x00061f09, 0x00062b08,
+    0x00063605, 0x00063feb, 0x00064899, 0x00064ff0,
+    0x000655a5, 0x00065996, 0x00065b6f, 0x00065af8,
+    0x000657e9, 0x000651d4, 0x00064884, 0x00063bae,
+    0x00062b33, 0x00061706, 0x0005fefd, 0x0005e344,
+    0x0005c404, 0x0005a195, 0x00057c41, 0x00055473,
+    0x00052ac2, 0x0004ffc4, 0x0004d410, 0x0004a7e5,
+    0x00047b4f, 0x00044e39, 0x00042096, 0x0003f208,
+    0x0003c1e1, 0x00038f77, 0x00035a12, 0x00032127,
+    0x0002e476, 0x0002a389, 0x00025e29, 0x0002146d,
+    0x0001c700, 0x00017682, 0x000123a1, 0x0000cefd,
+    0x000078f7, 0x0000221a, 0xffffcad1, 0xffff7332,
+    0xffff1b1e, 0xfffec253, 0xfffe6891, 0xfffe0da2,
+    0xfffdb15c, 0xfffd5393, 0xfffcf412, 0xfffc92e3,
+    0xfffc3032, 0xfffbcc29, 0xfffb6714, 0xfffb0113,
+    0xfffa9a5b, 0xfffa3337, 0xfff9cbd4, 0xfff96450,
+    0xfff8fcac, 0xfff894dc, 0xfff82cd8, 0xfff7c4a8,
+    0xfff75c6d, 0xfff6f45e, 0xfff68c84, 0xfff62500,
+    0xfff5bde8, 0xfff5575a, 0xfff4f179, 0xfff48c64,
+    0xfff42810, 0xfff3c488, 0xfff361d7, 0xfff30008,
+    0xfff29f3a, 0xfff23f78, 0xfff1e0d8, 0xfff1835b,
+    0xfff1272a, 0xfff0cc46, 0xfff072cf, 0xfff01ad0,
+    0xffefc469, 0xffef6fa4, 0xffef1ca3, 0xffeecb7a,
+    0xffee7c1f, 0xffee2eb2, 0xffede33d, 0xffed99c1,
+    0xffed5249, 0xffed0cde, 0xffecc98d, 0xffec8849,
+    0xffec4934, 0xffec0c38, 0xffebd175, 0xffeb98eb,
+    0xffeb62a4, 0xffeb2ead, 0xffeafd19, 0xffeacdea,
+    0xffeaa129, 0xffea76cc, 0xffea4ef4, 0xffea299f,
+    0xffea06e5, 0xffe9e6ce, 0xffe9c97d, 0xffe9aebb,
+    0xffe99651, 0xffe97fd6, 0xffe96ad3, 0xffe95711,
+    0xffe9447d, 0xffe93315, 0xffe922ce, 0xffe913a0,
+    0xffe90588, 0xffe8f887, 0xffe8ec93, 0xffe8e1c1,
+    0xffe8d806, 0xffe8cf77, 0xffe8c816, 0xffe8c1eb,
+    0xffe8bd03, 0xffe8b967, 0xffe8b72e, 0xffe8b64d,
+    0xffe8b6d8, 0xffe8b8dc, 0xffe8bc6c, 0xffe8c18a,
+    0xffe8c840, 0xffe8d0a4, 0xffe8daca, 0xffe8e69e,
+    0xffe8f42a, 0xffe9035a, 0xffe9142b, 0xffe926a0,
+    0xffe93ab7, 0xffe95066, 0xffe967b8, 0xffe980ad,
+    0xffe99b3a, 0xffe9b754, 0xffe9d511, 0xffe9f45b,
+    0xffea1532, 0xffea3797, 0xffea5b89, 0xffea8108,
+    0xffeaa7ff, 0xffead079, 0xffeafa55, 0xffeb259e,
+    0xffeb5254, 0xffeb8061, 0xffebafdc, 0xffebe0ae,
+    0xffec12ce, 0xffec462f, 0xffec7add, 0xffecb0a3,
+    0xffece774, 0xffed1f32, 0xffed57a7, 0xffed90b2,
+    0xffedca48, 0xffee042a, 0xffee3e57, 0xffee788e,
+};
+
 const DECLARE_ALIGNED(32, float, ff_aac_eld_window_480)[1800] = {
      0.00101191,  0.00440397,  0.00718669,  0.01072130,
      0.01459757,  0.01875954,  0.02308987,  0.02751541,
@@ -2219,3 +2707,456 @@ const DECLARE_ALIGNED(32, float, ff_aac_eld_window_480)[1800] = {
     -0.00115988, -0.00114605, -0.00113200, -0.00111778,
     -0.00110343, -0.00108898, -0.00107448, -0.00105995,
 };
+
+const DECLARE_ALIGNED(32, int, ff_aac_eld_window_480_fixed)[1800] = {
+    0x00109442, 0x00482797, 0x0075bf2a, 0x00afa864,
+    0x00ef2aa5, 0x01335b36, 0x017a4df0, 0x01c2cffe,
+    0x020bfb4c, 0x0254fd74, 0x029d557c, 0x02e50574,
+    0x032c41a8, 0x03732c08, 0x03b9cb88, 0x040032e8,
+    0x044686f0, 0x048cd578, 0x04d30738, 0x05190500,
+    0x055ec210, 0x05a44750, 0x05e9aeb8, 0x062f0c80,
+    0x067477a0, 0x06ba1ac0, 0x07001998, 0x074680e0,
+    0x078d5ec0, 0x07d4d038, 0x081cf8f0, 0x0865f8b0,
+    0x08afe0e0, 0x08fab150, 0x09466cd0, 0x09931910,
+    0x09e0adb0, 0x0a2f1640, 0x0a7e43f0, 0x0ace2960,
+    0x0b1eb180, 0x0b6fc4b0, 0x0bc15050, 0x0c134710,
+    0x0c65a420, 0x0cb86340, 0x0d0b7df0, 0x0d5ef450,
+    0x0db2cb60, 0x0e070180, 0x0e5b91f0, 0x0eb07f20,
+    0x0f05d0a0, 0x0f5b8920, 0x0fb1a950, 0x10082e40,
+    0x105f1400, 0x10b65820, 0x110df780, 0x1165f120,
+    0x11be43e0, 0x1216eea0, 0x126feac0, 0x12c92b00,
+    0x1322a620, 0x137c55c0, 0x13d61ae0, 0x142fc940,
+    0x148949e0, 0x14e28da0, 0x153b9a80, 0x15947640,
+    0x15ed1840, 0x16458660, 0x169deb20, 0x16f663c0,
+    0x174ef8c0, 0x17a7a120, 0x180041c0, 0x1858d000,
+    0x18b14940, 0x1909a140, 0x1961c820, 0x19b9b620,
+    0x1a116480, 0x1a68c1a0, 0x1abfbd00, 0x1b164f60,
+    0x1b6c7580, 0x1bc23120, 0x1c1780e0, 0x1c6c5d00,
+    0x1cc0dbe0, 0x1d1532a0, 0x1d697660, 0x1dbdac20,
+    0x1e11b280, 0x1e655b80, 0x1eb89e80, 0x1f0b7720,
+    0x1f5dd680, 0x1fafaec0, 0x2000fb00, 0x2051c340,
+    0x20a22ac0, 0x20f24580, 0x214213c0, 0x21919140,
+    0x21e0b300, 0x222f7580, 0x227dd900, 0x22cbd880,
+    0x23196ec0, 0x23669b00, 0x23b35d80, 0x23ffb6c0,
+    0x244ba7c0, 0x249731c0, 0x24e25700, 0x252d1940,
+    0x2594ae40, 0x25deea40, 0x2628bd00, 0x26722680,
+    0x26bb2740, 0x2703bf40, 0x274beec0, 0x2793b600,
+    0x27db1500, 0x28220c00, 0x28689b80, 0x28aec4c0,
+    0x28f48800, 0x2939e680, 0x297ee080, 0x29c37600,
+    0x2a07a740, 0x2a4b74c0, 0x2a8ede80, 0x2ad1e500,
+    0x2b148880, 0x2b56c940, 0x2b98a740, 0x2bda2240,
+    0x2c1b3a80, 0x2c5bef80, 0x2c9c4100, 0x2cdc2e80,
+    0x2d1bb800, 0x2d5adc80, 0x2d999b80, 0x2dd7f500,
+    0x2e15e800, 0x2e537400, 0x2e9098c0, 0x2ecd5540,
+    0x2f09a900, 0x2f4592c0, 0x2f811140, 0x2fbc2340,
+    0x2ff6c7c0, 0x3030fe80, 0x306ac6c0, 0x30a41f80,
+    0x30dd07c0, 0x31157dc0, 0x314d7fc0, 0x31850c80,
+    0x31bc22c0, 0x31f2c1c0, 0x3228e840, 0x325e9540,
+    0x3293c7c0, 0x32c87e40, 0x32fcb800, 0x33307340,
+    0x3363aec0, 0x33966940, 0x33c8a140, 0x33fa5580,
+    0x342b84c0, 0x345c2dc0, 0x348c4f80, 0x34bbe900,
+    0x34eaf9c0, 0x35198080, 0x35477d00, 0x3574ee40,
+    0x35a1d340, 0x35ce2bc0, 0x35f9f6c0, 0x36253380,
+    0x364fe180, 0x367a0040, 0x36a38f80, 0x36cc8ec0,
+    0x36f4fe80, 0x371cde80, 0x37442e80, 0x376aef00,
+    0x37912000, 0x37b6c200, 0x37dbd600, 0x38005d00,
+    0x38245840, 0x3847c880, 0x386aaf80, 0x388d0e80,
+    0x38aee700, 0x38d03bc0, 0x38f11000, 0x39116700,
+    0x39314440, 0x3950ab00, 0x396f9e80, 0x398e22c0,
+    0x39ac3c40, 0x39c9f280, 0x39e74cc0, 0x3a045280,
+    0x3a210b40, 0x3a3d7ec0, 0x3a59b480, 0x3a75b480,
+    0x3a918900, 0x3aad3cc0, 0x3ac8db00, 0x3ae46bc0,
+    0x3afff080, 0x3b1b6840, 0x3b36d2c0, 0x3b521980,
+    0x3b6d0780, 0x3b876400, 0x3ba0f4c0, 0x3bb96740,
+    0x3bd03dc0, 0x3be56580, 0x3bf6dec0, 0x3c0c6140,
+    0x3c15a9c0, 0x3c1a5780, 0x3c1fd0c0, 0x3c25edc0,
+    0x3c2c78c0, 0x3c333880, 0x3c39f3c0, 0x3c409100,
+    0x3c471d00, 0x3c4da780, 0x3c543f40, 0x3c5ae880,
+    0x3c619f00, 0x3c685f00, 0x3c6f25c0, 0x3c75f280,
+    0x3c7cc6c0, 0x3c83a2c0, 0x3c8a87c0, 0x3c9175c0,
+    0x3c986d00, 0x3c9f6e00, 0x3ca67880, 0x3cad8c40,
+    0x3cb4a980, 0x3cbbd000, 0x3cc2ffc0, 0x3cca3940,
+    0x3cd17d40, 0x3cd8cb80, 0x3ce02480, 0x3ce78740,
+    0x3ceef3c0, 0x3cf66a00, 0x3cfdea00, 0x3d0574c0,
+    0x3d0d0a40, 0x3d14ab40, 0x3d1c5700, 0x3d240d00,
+    0x3d2bcd40, 0x3d3397c0, 0x3d3b6cc0, 0x3d434d00,
+    0x3d4b38c0, 0x3d532fc0, 0x3d5b3180, 0x3d633dc0,
+    0x3d6b53c0, 0x3d737400, 0x3d7b9f00, 0x3d83d540,
+    0x3d8c1680, 0x3d946200, 0x3d9cb780, 0x3da51680,
+    0x3dad7f00, 0x3db5f140, 0x3dbe6dc0, 0x3dc6f480,
+    0x3dcf8540, 0x3dd81fc0, 0x3de0c300, 0x3de96ec0,
+    0x3df22340, 0x3dfae0c0, 0x3e03a800, 0x3e0c7840,
+    0x3e155180, 0x3e1e32c0, 0x3e271bc0, 0x3e300c00,
+    0x3e390400, 0x3e420400, 0x3e4b0c40, 0x3e541c80,
+    0x3e5d33c0, 0x3e6651c0, 0x3e6f7580, 0x3e789fc0,
+    0x3e81d080, 0x3e8b0880, 0x3e944700, 0x3e9d8c00,
+    0x3ea6d680, 0x3eb02600, 0x3eb97a80, 0x3ec2d400,
+    0x3ecc3340, 0x3ed59880, 0x3edf0300, 0x3ee87280,
+    0x3ef1e600, 0x3efb5d40, 0x3f04d880, 0x3f0e5840,
+    0x3f17dcc0, 0x3f216600, 0x3f2af340, 0x3f348440,
+    0x3f3e1840, 0x3f47af40, 0x3f514a00, 0x3f5ae840,
+    0x3f648b00, 0x3f6e3140, 0x3f77db00, 0x3f818740,
+    0x3f8b3600, 0x3f94e780, 0x3f9e9c40, 0x3fa85480,
+    0x3fb21080, 0x3fbbcfc0, 0x3fc59200, 0x3fcf56c0,
+    0x3fd91dc0, 0x3fe2e640, 0x3fecb040, 0x3ff67b40,
+    0x40098600, 0x40135580, 0x401d2700, 0x4026fa00,
+    0x4030ce80, 0x403aa380, 0x40447900, 0x404e4f00,
+    0x40582400, 0x4061f900, 0x406bcd00, 0x4075a080,
+    0x407f7480, 0x40894900, 0x40931e00, 0x409cf280,
+    0x40a6c600, 0x40b09800, 0x40ba6980, 0x40c43a80,
+    0x40ce0b00, 0x40d7db00, 0x40e1ab00, 0x40eb7980,
+    0x40f54600, 0x40ff1080, 0x4108d980, 0x4112a100,
+    0x411c6800, 0x41262d80, 0x412ff080, 0x4139b180,
+    0x41436e80, 0x414d2980, 0x4156e100, 0x41609700,
+    0x416a4a80, 0x4173fb00, 0x417da800, 0x41875000,
+    0x4190f400, 0x419a9400, 0x41a43000, 0x41adc880,
+    0x41b75d00, 0x41c0ec80, 0x41ca7700, 0x41d3fb00,
+    0x41dd7980, 0x41e6f280, 0x41f06600, 0x41f9d480,
+    0x42033d00, 0x420c9f00, 0x4215f980, 0x421f4d00,
+    0x42289900, 0x4231de80, 0x423b1d00, 0x42445500,
+    0x424d8500, 0x4256ad00, 0x425fcc80, 0x4268e380,
+    0x4271f200, 0x427af900, 0x4283f880, 0x428cef80,
+    0x4295de00, 0x429ec280, 0x42a79d80, 0x42b06f00,
+    0x42b93800, 0x42c1f800, 0x42caaf80, 0x42d35d80,
+    0x42dc0100, 0x42e49b00, 0x42ed2a80, 0x42f5b080,
+    0x42fe2d80, 0x4306a180, 0x430f0c80, 0x43176d80,
+    0x431fc480, 0x43281100, 0x43305400, 0x43388e80,
+    0x4340c000, 0x4348e900, 0x43510900, 0x43591f00,
+    0x43612b80, 0x43692f00, 0x43712900, 0x43791a80,
+    0x43810380, 0x4388e400, 0x4390bc00, 0x43988b00,
+    0x43a05180, 0x43a80f00, 0x43afc480, 0x43b77180,
+    0x43bf1780, 0x43c6b700, 0x43ce5100, 0x43d5e580,
+    0x43dd7100, 0x43e4ef80, 0x43ec5b80, 0x43f3ba80,
+    0x43fb1c80, 0x44029400, 0x440a2e80, 0x4411d080,
+    0x44193800, 0x44202480, 0x44265880, 0x442ba780,
+    0x442d8680, 0x4428a500, 0x44241380, 0x441ccb00,
+    0x44140100, 0x440a1200, 0x43ff7280, 0x43f46980,
+    0x43e93200, 0x43ddff00, 0x43d2dc80, 0x43c7ac00,
+    0x43bc4900, 0x43b09400, 0x43a47d80, 0x4397fd80,
+    0x438b0780, 0x437d9b80, 0x436fd380, 0x4361cd80,
+    0x4353a800, 0x43457500, 0x43373c80, 0x43290500,
+    0x431ad400, 0x430ca280, 0x42fe6000, 0x42f00080,
+    0x42e17380, 0x42d29e00, 0x42c35d80, 0x42b39200,
+    0x42a32080, 0x4291fc00, 0x42801900, 0x426d6d80,
+    0x4259f680, 0x4245bd00, 0x4230ca80, 0x421b2900,
+    0x4204e800, 0x41ee1d00, 0x41d6dd80, 0x41bf3c80,
+    0x41a74680, 0x418f0680, 0x41768800, 0x415dd100,
+    0x4144e400, 0x412bbf80, 0x41126400, 0x40f8cc00,
+    0x40deea00, 0x40c4b100, 0x40aa1400, 0x408f0800,
+    0x40738380, 0x40577d80, 0x403aeb80, 0x401dc180,
+    0x3ffff240, 0x3fe170c0, 0x3fc232c0, 0x3fa23680,
+    0x3f817c40, 0x3f6002c0, 0x3f3ddec0, 0x3f1b4180,
+    0x3ef85d40, 0x3ed56340, 0x3eb27240, 0x3e8f9c40,
+    0x3e6cf400, 0x3e4a81c0, 0x3e282140, 0x3e059980,
+    0x3de2b280, 0x3dbf4100, 0x3d9b3640, 0x3d768b00,
+    0x3d513640, 0x3d2b3840, 0x3d049b80, 0x3cdd6b40,
+    0x3cb5b400, 0x3c8d8f40, 0x3c652080, 0x3c3c8c40,
+    0x3c13f480, 0x3beb7580, 0x3bc327c0, 0x3b9b2680,
+    0x3b737000, 0x3b4bc580, 0x3b23d740, 0x3afb5640,
+    0x3ad21c40, 0x3aa83780, 0x3a7dbc40, 0x3a52bf80,
+    0x3a276600, 0x39fbe0c0, 0x39d06140, 0x39a50ec0,
+    0x3979e300, 0x394ebf40, 0x392386c0, 0x38f82280,
+    0x38cc89c0, 0x38a0b7c0, 0x3874a740, 0x38485840,
+    0x381bd1c0, 0x37ef1b40, 0x37c23cc0, 0x37953dc0,
+    0x376825c0, 0x373afc80, 0x370dc980, 0x36e09440,
+    0x36b41dc0, 0x36862100, 0x3657e480, 0x36297240,
+    0x35fad380, 0x35cc1200, 0x359d36c0, 0x356e4b40,
+    0x353f5880, 0x35106780, 0x34e17780, 0x34b28240,
+    0x34838040, 0x345466c0, 0x34251940, 0x33f57280,
+    0x33c54bc0, 0x33949840, 0x33638380, 0x33324980,
+    0x33012500, 0x32d04480, 0x329fc7c0, 0x326fcbc0,
+    0x324068c0, 0x32116fc0, 0x31e27600, 0x31b30fc0,
+    0x3182e300, 0x3151e240, 0x312029c0, 0x30edd080,
+    0x30baf700, 0x3087cd00, 0x30548600, 0x30215680,
+    0x2fee65c0, 0x2fbbca40, 0x2f899980, 0x2f57e6c0,
+    0x2f26b540, 0x2ef5f980, 0x2ec5aa00, 0x2e95afc0,
+    0x2e65c180, 0x2e357b40, 0x2e047840, 0x2dd27380,
+    0x2d9f6c40, 0x2d6b7780, 0x2d36a6c0, 0x2d012940,
+    0x2ccb5680, 0x2c958a00, 0x2c601b80, 0x2c2b3640,
+    0x2bf6dfc0, 0x2bc31ec0, 0x2b8ff500, 0x2b5d5540,
+    0x2b2b2a00, 0x2af95e80, 0x2ac7dd80, 0x2a968f80,
+    0x2a655d40, 0x2a342f00, 0x2a02e8c0, 0x29d16700,
+    0x299f8640, 0x296d2380, 0x293a2740, 0x29068400,
+    0x28d22b40, 0x289d1540, 0x28675280, 0x28310180,
+    0x27fa3f00, 0x27c32f80, 0x278c08c0, 0x275505c0,
+    0x271e60c0, 0x26e84b00, 0x26b2e880, 0x267e5cc0,
+    0x264ac940, 0x26183a40, 0x25e6aa80, 0x25b615c0,
+    0x25866b80, 0x25576b40, 0x2528ba00, 0x24f9ffc0,
+    0x24cadfc0, 0x249af540, 0x2469da80, 0x24372780,
+    0x2402b800, 0x23ccbfc0, 0x23957cc0, 0x235d3140,
+    0x23245200, 0x22eb8000, 0x22b35cc0, 0x227c7940,
+    0x22471d40, 0x22136840, 0x21e18240, 0x21b15d80,
+    0x21827dc0, 0x21544600, 0x21261b00, 0x20f78600,
+    0x20c83e00, 0x20980000, 0x20668e00, 0x2033f300,
+    0x20007400, 0x1fcc64e0, 0x1f97d120, 0x1f642320,
+    0x1f2f49e0, 0x1efaa840, 0x1ec73580, 0x1e94d880,
+    0x1e636120, 0x1e32a160, 0x1e025ba0, 0x1dd24300,
+    0x1da20e60, 0x1d717940, 0x1d407560, 0x1d0f2040,
+    0x1cdd95c0, 0x1cabf500, 0x1c7a6940, 0x1c492340,
+    0x1c185680, 0x1be818c0, 0x1bb83f60, 0x1b888d20,
+    0x1b58c640, 0x1b28c240, 0x1af871e0, 0x1ac7c960,
+    0x1a96bf00, 0x1a656b60, 0x1a340360, 0x1a02bd20,
+    0x19d1c6c0, 0x19a12f40, 0x1970f480, 0x19411640,
+    0x19119000, 0x18e255a0, 0x18b358a0, 0x18848b20,
+    0x1855e040, 0x18274e00, 0x17f8c9e0, 0x17ca4a80,
+    0x179bce40, 0x176d5a60, 0x173ef400, 0x17109fe0,
+    0x16e25f60, 0x16b43240, 0x16861880, 0x16581220,
+    0x162a20c0, 0x15fc4620, 0x15ce8420, 0x15a0dca0,
+    0x157351c0, 0x1545e580, 0x151899a0, 0x14eb6ec0,
+    0x14be63a0, 0x14917a00, 0x14649ae0, 0x14377060,
+    0x1409d0c0, 0x13dbbb20, 0x13ad58e0, 0x137f0160,
+    0x1350cc80, 0x1322b8c0, 0x12f4ca60, 0x12c704e0,
+    0x129968a0, 0x126bf5c0, 0x123eade0, 0x12119300,
+    0x11e4a660, 0x11b7e860, 0x118b5940, 0x115ef8a0,
+    0x1132c600, 0x1106c1a0, 0x10daecc0, 0x10af4900,
+    0x1083d7a0, 0x10589c00, 0x102d9a00, 0x1002d1e0,
+    0x0fd842c0, 0x0fadde80, 0x0f839a50, 0x0f597700,
+    0x0f2f76e0, 0x0f05a170, 0x0edbf9c0, 0x0eb27f30,
+    0x0e8930d0, 0x0e600d70, 0x0e371550, 0x0e0e4950,
+    0x0de5ab50, 0x0dbd3d20, 0x0d94fe10, 0x0d6cecb0,
+    0x0d450220, 0x0d1d38f0, 0x0cf59130, 0x0cce0c30,
+    0x0ca6af10, 0x0c7f7b80, 0x0c587010, 0x0c318960,
+    0x0c0ac200, 0x0be418d0, 0x0bbd8da0, 0x0b9724e0,
+    0x0b70e6c0, 0x0b4ad970, 0x0b2502f0, 0x0aff6930,
+    0x0ada1250, 0x0ab50430, 0x0a9044d0, 0x0a6bda30,
+    0x0a3bedf0, 0x0a18be40, 0x09f5e530, 0x09d35cf0,
+    0x09b11ff0, 0x098f2890, 0x096d7120, 0x094bf400,
+    0x092aab80, 0x09099240, 0x08e8a620, 0x08c7e850,
+    0x08a75990, 0x0886fae0, 0x0866ccf0, 0x0846d070,
+    0x08270610, 0x08076e70, 0x07e80ac8, 0x07c8dc60,
+    0x07a9e440, 0x078b2348, 0x076c99d0, 0x074e4818,
+    0x07302e50, 0x07124d18, 0x06f4a530, 0x06d73778,
+    0x06ba0488, 0x069d0c88, 0x06804f68, 0x0663cce0,
+    0x06478528, 0x062b78a0, 0x060fa7e8, 0x05f413b8,
+    0x05d8bc38, 0x05bda128, 0x05a2c258, 0x05881f60,
+    0x056db888, 0x05538e60, 0x0539a170, 0x051ff218,
+    0x05068040, 0x04ed4b90, 0x04d45398, 0x04bb9820,
+    0x04a31988, 0x048ad860, 0x0472d528, 0x045b0ff0,
+    0x04438860, 0x042c3de8, 0x04153040, 0x03fe5f4c,
+    0x03e7cb98, 0x03d17580, 0x03bb5d64, 0x03a582e8,
+    0x038fe588, 0x037a8494, 0x03655fcc, 0x03507768,
+    0x033bcbb4, 0x03275d28, 0x03132bc0, 0x02ff370c,
+    0x02eb7e94, 0x02d801e8, 0x02c4c11c, 0x02b1bcbc,
+    0x029ef578, 0x028c6ba8, 0x027a1f20, 0x02680f54,
+    0x02563bac, 0x0244a3c8, 0x023347a0, 0x02222730,
+    0x0211429c, 0x02009938, 0x01f02974, 0x01dff1ae,
+    0x01cff058, 0x01c024c8, 0x01b08ef4, 0x01a12eda,
+    0x019204b0, 0x01831138, 0x01745588, 0x0165d2c2,
+    0x01578a96, 0x01497ffc, 0x013bb670, 0x012e3160,
+    0x0120f146, 0x0113f27c, 0x0107310c, 0x00faa909,
+    0x00ee57a1, 0x00e23b09, 0x00d6515b, 0x00ca9977,
+    0x00bf1509, 0x00b3c74d, 0x00a8b388, 0x009ddb3d,
+    0x00933bf2, 0x0088d22c, 0x007e9a70, 0x0074935a,
+    0x006abe70, 0x00611d5c, 0x0057b1f8, 0x004e7e73,
+    0x0045859b, 0x003cca96, 0x00344f32, 0x002c1074,
+    0x00240873, 0x001c31ba, 0x0014863f, 0x000cfe8b,
+    0x00059307, 0xfffe3b9a, 0xfff6f718, 0xffefcd4d,
+    0xffe8c6f4, 0xffe1ed10, 0xffdb4c57, 0xffd4f484,
+    0xffcef5dc, 0xffc95d0c, 0xffc4284e, 0xffbf4e14,
+    0xffbac5ae, 0xffb68360, 0xffb27548, 0xffae87be,
+    0xffaaa733, 0xffa6c67e, 0xffa2e141, 0xff9ef40c,
+    0xff9afc25, 0xff970058, 0xff930f7c, 0xff8f3857,
+    0xff8b8900, 0xff880bfe, 0xff84c9ea, 0xff81cbbd,
+    0xff7f17ad, 0xff7cadc6, 0xff7a8c4e, 0xff78b1cd,
+    0xff7719f3, 0xff75bd06, 0xff7492a4, 0xff7392bf,
+    0xff72b600, 0xff71f5c6, 0xff714b72, 0xff70b0ed,
+    0xff702232, 0xff6f9c90, 0xff6f1cee, 0xff6ea21f,
+    0xff6e2e9c, 0xff6dc617, 0xff6d6c09, 0xff6d2425,
+    0xff6cf267, 0xff6cdaca, 0xff6ce155, 0xff6d0983,
+    0xff6d56bb, 0xff6dcc4c, 0xff6e6cd0, 0xff6f3832,
+    0xff702cc4, 0xff71492e, 0xff728ae2, 0xff73ed63,
+    0xff756b7c, 0xff77001c, 0xff78a5d9, 0xff7a5693,
+    0xff7c0c40, 0xff7dc141, 0xff7f74aa, 0xff81298b,
+    0xff82e2de, 0xff84a3de, 0xff8670bd, 0xff884e42,
+    0xff8a410c, 0xff8c4c7f, 0xff8e70fc, 0xff90ae18,
+    0xff93037e, 0xff956f12, 0xff97ec86, 0xff9a7724,
+    0xff9d0a9d, 0xff9fa3ea, 0xffa2417e, 0xffa4e1ac,
+    0xffa78332, 0xffaa265a, 0xffaccc26, 0xffaf758e,
+    0xffb223d4, 0xffb4d906, 0xffb79726, 0xffba604e,
+    0xffbd349e, 0xffc011a8, 0xffc2f4d2, 0xffc5db82,
+    0xffc8c45f, 0xffcbaed5, 0xffce9a6d, 0xffd186c6,
+    0xffd473aa, 0xffd760e5, 0xffda4e55, 0xffdd3bd0,
+    0xffe0292b, 0xffe31645, 0xffe602ff, 0xffe8eef7,
+    0xffebd978, 0xffeec1bf, 0xfff1a72c, 0xfff488fe,
+    0xfff76689, 0xfffa3f2c, 0xfffd1245, 0xffffdf33,
+    0x000020ac, 0x0002e66f, 0x0005a937, 0x00086839,
+    0x000b22b3, 0x000dd7da, 0x001086ec, 0x00132f3c,
+    0x0015d001, 0x00186897, 0x001af849, 0x001d7eb6,
+    0x001ffbbe, 0x00226f41, 0x0024d8e8, 0x00273874,
+    0x00298d82, 0x002bd7aa, 0x002e16d4, 0x00304af6,
+    0x00327406, 0x00349203, 0x0036a416, 0x0038a893,
+    0x003a9da0, 0x003c8170, 0x003e53b8, 0x0040159a,
+    0x0041c816, 0x00436c92, 0x0045042c, 0x00468ff2,
+    0x00481106, 0x004987fe, 0x004af466, 0x004c5599,
+    0x004daae4, 0x004ef28c, 0x005029c4, 0x00514d9a,
+    0x00525b57, 0x005351f7, 0x00543190, 0x0054fa43,
+    0x0055ac2f, 0x00564938, 0x0056d3f7, 0x00574f3c,
+    0x0057bdd7, 0x00582260, 0x00587f28, 0x0058d6b1,
+    0x0059293c, 0x0059741a, 0x0059b472, 0x0059e73c,
+    0x005a0976, 0x005a1870, 0x005a116e, 0x0059f224,
+    0x0059b964, 0x005966ce, 0x0058f9e2, 0x005872e8,
+    0x0057d407, 0x00571f82, 0x005657b0, 0x00557ecd,
+    0x00549731, 0x0053a34b, 0x0052a56a, 0x00519fc6,
+    0x00509482, 0x004f85a4, 0x004e74ee, 0x004d6214,
+    0x004c4bd3, 0x004b314c, 0x004a1110, 0x0048e8c8,
+    0x0047b5f7, 0x00467626, 0x00452690, 0x0043c405,
+    0x00424b7f, 0x0040ba04, 0x003f0e53, 0x003d488b,
+    0x003b688c, 0x00396eb6, 0x00375dfb, 0x00353aaa,
+    0x003308ac, 0x0030ccb1, 0x002e8cf1, 0x002c4fd5,
+    0x002a1be8, 0x0027f486, 0x0025d90d, 0x0023c852,
+    0x0021c13b, 0x001fbf23, 0x001dbafc, 0x001badc6,
+    0x00199136, 0x00176150, 0x00151b86, 0x0012bcd1,
+    0x001044d1, 0x000db8d0, 0x000b1f43, 0x00087e89,
+    0x0005dbe2, 0x00033b1e, 0x00009fee, 0xfffe0d82,
+    0xfffb83cf, 0xfff90047, 0xfff6805a, 0xfff4019a,
+    0xfff18203, 0xffeeffb2, 0xffec78ba, 0xffe9ec4d,
+    0xffe75b4e, 0xffe4c71f, 0xffe23138, 0xffdf9ae6,
+    0xffdd0574, 0xffda723c, 0xffd7e24a, 0xffd55567,
+    0xffd2cabe, 0xffd04161, 0xffcdb890, 0xffcb306a,
+    0xffc8a95c, 0xffc62406, 0xffc3a140, 0xffc12188,
+    0xffbea542, 0xffbc2cc2, 0xffb9b7d2, 0xffb745f2,
+    0xffb4d6ac, 0xffb268fe, 0xffaffc72, 0xffad90e8,
+    0xffab263e, 0xffa8bcb8, 0xffa6547e, 0xffa3ed7b,
+    0xffa187ba, 0xff9f2351, 0xff9cc055, 0xff9a5ebc,
+    0xff97fe84, 0xff959f84, 0xff934146, 0xff90e37d,
+    0xff8e858a, 0xff8c26c0, 0xff89c69e, 0xff876483,
+    0xff84ffe4, 0xff82982b, 0xff802cb6, 0xff7dbccf,
+    0xff7b47b4, 0xff78ccd0, 0xff764b6c, 0xff73c2db,
+    0xff713227, 0xff6e9864, 0xff6bf470, 0xff694553,
+    0xff668a0d, 0xff63c1a6, 0xff60ec34, 0xff5e0e9e,
+    0xff5b30d3, 0xff585b8c, 0xff5595c9, 0xff52e1da,
+    0xff5040a0, 0xff4db31c, 0xff4b3a3b, 0xff48d67e,
+    0xff468850, 0xff445011, 0xff422ded, 0xff4021f9,
+    0xff3e2c56, 0xff3c4cf8, 0xff3a83df, 0xff38d0ec,
+    0xff3733c9, 0xff35ac14, 0xff343963, 0xff32db09,
+    0xff319066, 0xff305898, 0xff2f323d, 0xff2e1bb2,
+    0xff2d1369, 0xff2c18f8, 0xff2b2d2a, 0xff2a50e1,
+    0xff2984f4, 0xff28c978, 0xff281e01, 0xff278245,
+    0xff26f5c3, 0xff26785a, 0xff2609bf, 0xff25a9c8,
+    0xff255814, 0xff2513f6, 0xff24dcc4, 0xff24b1a6,
+    0xff2492b1, 0xff248093, 0xff247c0b, 0xff2485c6,
+    0xff249daf, 0xff24c359, 0xff24f639, 0xff253605,
+    0xff258312, 0xff25ddd5, 0xff2646e7, 0xff26be25,
+    0xff274264, 0xff27d1f6, 0xff286b19, 0xff290c13,
+    0xff29b30d, 0xff2a5e38, 0xff2b0bbd, 0xff2bb9a2,
+    0xff29a9d2, 0xff2a53dc, 0xff2b0a5a, 0xff2bcd43,
+    0xff2c9c76, 0xff2d7808, 0xff2e5ffa, 0xff2f544c,
+    0xff305528, 0xff316299, 0xff327ce0, 0xff33a432,
+    0xff34d8ba, 0xff361a8e, 0xff3768f8, 0xff38c2f5,
+    0xff3a2784, 0xff3b9623, 0xff3d0ef4, 0xff3e9277,
+    0xff4020ed, 0xff41ba14, 0xff435ccc, 0xff4507fd,
+    0xff46ba84, 0xff4873ac, 0xff4a32ea, 0xff4bf7bb,
+    0xff4dc17f, 0xff4f8fa0, 0xff516167, 0xff53361d,
+    0xff550d79, 0xff56e7ee, 0xff58c5ff, 0xff5aa84d,
+    0xff5c8e41, 0xff5e75e2, 0xff605d4d, 0xff6242b6,
+    0xff6424b8, 0xff66023d, 0xff67da44, 0xff69abd6,
+    0xff6b7646, 0xff6d38e8, 0xff6ef348, 0xff70a4ce,
+    0xff724d0f, 0xff73eb95, 0xff757fff, 0xff770a2d,
+    0xff788a20, 0xff79fff6, 0xff7b6be7, 0xff7cce52,
+    0xff7e27e4, 0xff7f78fc, 0xff80c38a, 0xff820e98,
+    0xff836378, 0xff84caaa, 0xff864990, 0xff87dff4,
+    0xff898c30, 0xff8b4cda, 0xff8d207a, 0xff8f05cc,
+    0xff90fb9b, 0xff930098, 0xff95138e, 0xff97332d,
+    0xff995e2a, 0xff9b934e, 0xff9dd18c, 0xffa017e3,
+    0xffa26550, 0xffa4b8e7, 0xffa711a8, 0xffa96eae,
+    0xffabcefc, 0xffae31cc, 0xffb09680, 0xffb2fc82,
+    0xffb5635a, 0xffb7ca52, 0xffba30a8, 0xffbc95a8,
+    0xffbef8a4, 0xffc158d0, 0xffc3b557, 0xffc60d6b,
+    0xffc86041, 0xffcaacb7, 0xffccf1cb, 0xffcf2e5c,
+    0xffd161e8, 0xffd38c8f, 0xffd5ae88, 0xffd7c808,
+    0xffd9d925, 0xffdbe1c8, 0xffdde1f3, 0xffdfd964,
+    0xffe1c79b, 0xffe3abcc, 0xffe5852a, 0xffe75341,
+    0xffe9162f, 0xffeace55, 0xffec7c15, 0xffee1f63,
+    0xffefb7e9, 0xfff1453d, 0xfff2c6fd, 0xfff43ca8,
+    0xfff5a5d4, 0xfff701ea, 0xfff850b4, 0xfff99288,
+    0xfffac853, 0xfffbf2d5, 0xfffd12e6, 0xfffe2991,
+    0xffff37e4, 0x00003eea, 0x00013ec4, 0x00023646,
+    0x0003244d, 0x00040797, 0x0004de8c, 0x0005a734,
+    0x00065fab, 0x0007068f, 0x00079c82, 0x000822fa,
+    0x00089b70, 0x000907a6, 0x00096a01, 0x0009c506,
+    0x000a1b37, 0x000a6e18, 0x000abe1f, 0x000b0bac,
+    0x000b5701, 0x000b9f3b, 0x000be2c2, 0x000c1fff,
+    0x000c5599, 0x000c829a, 0x000ca661, 0x000cc058,
+    0x000cd028, 0x000cd63d, 0x000cd317, 0x000cc739,
+    0x000cb36d, 0x000c98c0, 0x000c7833, 0x000c52df,
+    0x000c2984, 0x000bfcf9, 0x000bcdea, 0x000b9cf7,
+    0x000b6a97, 0x000b3700, 0x000b029d, 0x000acd79,
+    0x000a977e, 0x000a6076, 0x000a2838, 0x0009eea1,
+    0x0009b37d, 0x000976c2, 0x0009384e, 0x0008f816,
+    0x0008b612, 0x0008724a, 0x00082cd5, 0x0007e5e8,
+    0x00079dce, 0x000754de, 0x00070b62, 0x0006c1c6,
+    0x0006786a, 0x00062fba, 0x0005e801, 0x0005a1a0,
+    0x00055ce1, 0x000519fb, 0x0004d8f8, 0x000499b8,
+    0x00045c30, 0x00042040, 0x0003e5c8, 0x0003acb3,
+    0x000374df, 0x00033e59, 0x00030934, 0x0002d57d,
+    0x0002a348, 0x000272b6, 0x000243f2, 0x00021711,
+    0x0001ec3e, 0x0001c37a, 0x00019cc3, 0x00017830,
+    0x000155a0, 0x00013514, 0x0001168b, 0x0000f9e6,
+    0x0000df23, 0x0000c62e, 0x0000aef2, 0x00009978,
+    0x000085a1, 0x0000736d, 0x000062dc, 0x000053d8,
+    0x0000466c, 0x00003a62, 0x00002fd1, 0x00002681,
+    0x00001e73, 0x00001792, 0x000011c9, 0x00000cf6,
+    0x0000091a, 0x000005ff, 0x000003b1, 0x00000203,
+    0x000000d7, 0x0000002b, 0xffffffd5, 0xffffffc0,
+    0xffffffd5, 0x00000000, 0x00000015, 0x00000000,
+    0x00000000, 0x00000015, 0x00000000, 0xffffffd5,
+    0xffffffca, 0xffffffd5, 0x0000002b, 0x000000cc,
+    0x000001e3, 0x0000037b, 0x0000059f, 0x0000086e,
+    0x00000bf4, 0x0000103b, 0x00001564, 0x00001b6e,
+    0x0000226f, 0x00002a68, 0x00003377, 0x00003d93,
+    0x000048c5, 0x00005525, 0x000062a6, 0x00007155,
+    0x0000812f, 0x00009237, 0x0000a455, 0x0000b7ab,
+    0x0000cc18, 0x0000e1bd, 0x0000f878, 0x0001106c,
+    0x00012981, 0x000143c2, 0x00015f30, 0x00017bb6,
+    0x00019948, 0x0001b7e6, 0x0001d771, 0x0001f7bc,
+    0x000218b4, 0x00023a42, 0x00025c3b, 0x00027ea0,
+    0x0002a150, 0x0002c440, 0x0002e771, 0x00030aed,
+    0x00032eb4, 0x000352db, 0x00037759, 0x00039c4c,
+    0x0003c1ac, 0x0003e74b, 0x00040d00, 0x0004329f,
+    0x000457de, 0x00047c9c, 0x0004a083, 0x0004c35e,
+    0x0004e502, 0x00050543, 0x000523ec, 0x000540e7,
+    0x00055c2b, 0x000575c0, 0x00058da9, 0x0005a3e4,
+    0x0005b886, 0x0005cbb1, 0x0005dd65, 0x0005edcb,
+    0x0005fcfa, 0x00060afc, 0x00061808, 0x000623fc,
+    0x00062ec3, 0x00063849, 0x0006404b, 0x000646ac,
+    0x00064b13, 0x00064d37, 0x00064cd6, 0x0006497b,
+    0x000642c5, 0x0006385e, 0x000629f0, 0x00061766,
+    0x000600a0, 0x0005e57d, 0x0005c63e, 0x0005a322,
+    0x00057c97, 0x00055306, 0x00052711, 0x0004f96f,
+    0x0004caeb, 0x00049bfc, 0x00046c96, 0x00043cbb,
+    0x00040c3f, 0x0003daab, 0x0003a734, 0x000370f9,
+    0x0003372d, 0x0002f944, 0x0002b6d4, 0x00026f71,
+    0x000222fb, 0x0001d212, 0x00017d84, 0x00012630,
+    0x0000ccda, 0x00007200, 0x0000163b, 0xffffba15,
+    0xffff5da3, 0xffff0091, 0xfffea293, 0xfffe4367,
+    0xfffde2da, 0xfffd809f, 0xfffd1c81, 0xfffcb66a,
+    0xfffc4e90, 0xfffbe53e, 0xfffb7aa0, 0xfffb0f0a,
+    0xfffaa2c9, 0xfffa3612, 0xfff9c92f, 0xfff95c2d,
+    0xfff8eef4, 0xfff8817c, 0xfff813c3, 0xfff7a5d4,
+    0xfff737e5, 0xfff6ca17, 0xfff65c9e, 0xfff5efbc,
+    0xfff58390, 0xfff51830, 0xfff4adbc, 0xfff44435,
+    0xfff3db9a, 0xfff373d6, 0xfff30cfd, 0xfff2a71c,
+    0xfff24248, 0xfff1de9f, 0xfff17c44, 0xfff11b56,
+    0xfff0bbea, 0xfff05e17, 0xfff00206, 0xffefa7d9,
+    0xffef4f99, 0xffeef95d, 0xffeea53a, 0xffee533a,
+    0xffee035e, 0xffedb5b0, 0xffed6a3c, 0xffed20f5,
+    0xffecd9fe, 0xffec9555, 0xffec5305, 0xffec1319,
+    0xffebd591, 0xffeb9a83, 0xffeb61f9, 0xffeb2bfe,
+    0xffeaf89c, 0xffeac7ea, 0xffea99d2, 0xffea6e7e,
+    0xffea45ef, 0xffea203a, 0xffe9fda0, 0xffe9decc,
+    0xffe9c3de, 0xffe9ac56, 0xffe99789, 0xffe9845e,
+    0xffe97295, 0xffe96219, 0xffe952ea, 0xffe944f3,
+    0xffe93833, 0xffe92c9f, 0xffe92238, 0xffe918fe,
+    0xffe910fb, 0xffe90a3a, 0xffe904c6, 0xffe900a0,
+    0xffe8fddb, 0xffe8fc83, 0xffe8fca4, 0xffe8fe3c,
+    0xffe9016c, 0xffe9061e, 0xffe90c74, 0xffe9146c,
+    0xffe91e11, 0xffe929a5, 0xffe93731, 0xffe946c0,
+    0xffe95833, 0xffe96b7e, 0xffe98082, 0xffe9975e,
+    0xffe9affd, 0xffe9ca5e, 0xffe9e68e, 0xffea0481,
+    0xffea242b, 0xffea458e, 0xffea6894, 0xffea8d52,
+    0xffeab3c8, 0xffeadc0c, 0xffeb05fe, 0xffeb31a7,
+    0xffeb5ede, 0xffeb8da2, 0xffebbdf4, 0xffebefbd,
+    0xffec231f, 0xffec5802, 0xffec8e5e, 0xffecc61c,
+    0xffecff1c, 0xffed391e, 0xffed740c, 0xffedafb1,
+    0xffedebe1, 0xffee287d, 0xffee654e, 0xffeea23f,
+};
diff --git a/libavcodec/aactab.h b/libavcodec/aactab.h
index fc26db8..a0d44a2 100644
--- a/libavcodec/aactab.h
+++ b/libavcodec/aactab.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2005-2006 Oded Shimon ( ods15 ods15 dyndns org )
  * Copyright (c) 2006-2007 Maxim Gavrilov ( maxim.gavrilov gmail com )
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,22 +32,117 @@
 
 #include "libavutil/mem.h"
 #include "aac.h"
-#include "aac_tablegen_decl.h"
 
 #include <stdint.h>
 
 /* NOTE:
- * Tables in this file are used by the AAC decoder and will be used by the AAC
- * encoder.
+ * Tables in this file are shared by the AAC decoders and encoder
  */
 
+extern float ff_aac_pow2sf_tab[428];
+extern float ff_aac_pow34sf_tab[428];
+
+static inline void ff_aac_tableinit(void)
+{
+    int i;
+    /* Fills all 428 entries of ff_aac_pow2sf_tab[] and ff_aac_pow34sf_tab[].
+     * exp2_lut[k] below holds 2^(k/16) for 0 <= k <= 15. */
+    static const float exp2_lut[] = {
+        1.00000000000000000000,
+        1.04427378242741384032,
+        1.09050773266525765921,
+        1.13878863475669165370,
+        1.18920711500272106672,
+        1.24185781207348404859,
+        1.29683955465100966593,
+        1.35425554693689272830,
+        1.41421356237309504880,
+        1.47682614593949931139,
+        1.54221082540794082361,
+        1.61049033194925430818,
+        1.68179283050742908606,
+        1.75625216037329948311,
+        1.83400808640934246349,
+        1.91520656139714729387,
+    };
+    float t1 = 8.8817841970012523233890533447265625e-16; // 2^(-50), running power-of-two factor for ff_aac_pow2sf_tab
+    float t2 = 3.63797880709171295166015625e-12; // 2^(-38), running power-of-two factor for ff_aac_pow34sf_tab
+    int t1_inc_cur, t2_inc_cur; // exp2_lut indices (sixteenths of an exponent) for the current i
+    int t1_inc_prev = 0; // exp2_lut index used on the previous iteration
+    int t2_inc_prev = 8; // equals t2_inc_cur at i == 0, so t2 is not doubled on the first pass
+
+    for (i = 0; i < 428; i++) {
+        t1_inc_cur = 4 * (i % 4); // cycles 0, 4, 8, 12: a factor of 2^(1/4) per entry
+        t2_inc_cur = (8 + 3*i) % 16; // advances by 3 (mod 16): a factor of 2^(3/16) per entry
+        if (t1_inc_cur < t1_inc_prev) // index wrapped around: carry into the power-of-two factor
+            t1 *= 2;
+        if (t2_inc_cur < t2_inc_prev) // same carry for the 3/4-power table
+            t2 *= 2;
+        // A much more efficient and accurate way of doing:
+        // ff_aac_pow2sf_tab[i] = pow(2, (i - POW_SF2_ZERO) / 4.0);
+        // ff_aac_pow34sf_tab[i] = pow(ff_aac_pow2sf_tab[i], 3.0/4.0);
+        ff_aac_pow2sf_tab[i] = t1 * exp2_lut[t1_inc_cur];
+        ff_aac_pow34sf_tab[i] = t2 * exp2_lut[t2_inc_cur];
+        t1_inc_prev = t1_inc_cur; // remembered so the next iteration can detect a wrap
+        t2_inc_prev = t2_inc_cur;
+    }
+}
+
+/* @name ltp_coef
+ * Table of the 8 LTP coefficients; each literal is wrapped in Q30() (defined elsewhere).
+ */
+static const INTFLOAT ltp_coef[8] = {
+    Q30(0.570829), Q30(0.696616), Q30(0.813004), Q30(0.911304),
+    Q30(0.984900), Q30(1.067894), Q30(1.194601), Q30(1.369533),
+};
+
+/* @name tns_tmp2_map
+ * Tables of the tmp2[] arrays of LPC coefficients used for TNS.
+ * The suffix _M_N[] indicates the values of coef_compress and coef_res
+ * respectively.
+ * @{
+ */
+static const INTFLOAT tns_tmp2_map_1_3[4] = {
+    Q31(0.00000000), Q31(-0.43388373),  Q31(0.64278758),  Q31(0.34202015),
+};
+
+static const INTFLOAT tns_tmp2_map_0_3[8] = {
+    Q31(0.00000000), Q31(-0.43388373), Q31(-0.78183150), Q31(-0.97492790),
+    Q31(0.98480773), Q31( 0.86602539), Q31( 0.64278758), Q31( 0.34202015),
+};
+
+static const INTFLOAT tns_tmp2_map_1_4[8] = {
+    Q31(0.00000000), Q31(-0.20791170), Q31(-0.40673664), Q31(-0.58778524),
+    Q31(0.67369562), Q31( 0.52643216), Q31( 0.36124167), Q31( 0.18374951),
+};
+
+static const INTFLOAT tns_tmp2_map_0_4[16] = {
+    Q31( 0.00000000), Q31(-0.20791170), Q31(-0.40673664), Q31(-0.58778524),
+    Q31(-0.74314481), Q31(-0.86602539), Q31(-0.95105654), Q31(-0.99452192),
+    Q31( 0.99573416), Q31( 0.96182561), Q31( 0.89516330), Q31( 0.79801720),
+    Q31( 0.67369562), Q31( 0.52643216), Q31( 0.36124167), Q31( 0.18374951),
+};
+
+static const INTFLOAT * const tns_tmp2_map[4] = {
+    tns_tmp2_map_0_3, // [0]: coef_compress 0, coef_res 3 (8 entries)
+    tns_tmp2_map_0_4, // [1]: coef_compress 0, coef_res 4 (16 entries)
+    tns_tmp2_map_1_3, // [2]: coef_compress 1, coef_res 3 (4 entries)
+    tns_tmp2_map_1_4  // [3]: coef_compress 1, coef_res 4 (8 entries)
+};
+// @}
+
 /* @name window coefficients
  * @{
  */
 DECLARE_ALIGNED(32, extern float,  ff_aac_kbd_long_1024)[1024];
 DECLARE_ALIGNED(32, extern float,  ff_aac_kbd_short_128)[128];
+DECLARE_ALIGNED(32, extern int,    ff_aac_kbd_long_1024_fixed)[1024];
+DECLARE_ALIGNED(32, extern int,    ff_aac_kbd_long_512_fixed)[512];
+DECLARE_ALIGNED(32, extern int,    ff_aac_kbd_short_128_fixed)[128];
 const DECLARE_ALIGNED(32, extern float, ff_aac_eld_window_512)[1920];
+const DECLARE_ALIGNED(32, extern int,   ff_aac_eld_window_512_fixed)[1920];
 const DECLARE_ALIGNED(32, extern float, ff_aac_eld_window_480)[1800];
+const DECLARE_ALIGNED(32, extern int,   ff_aac_eld_window_480_fixed)[1800];
 // @}
 
 /* @name number of scalefactor window bands for long and short transform windows respectively
diff --git a/libavcodec/aandcttab.c b/libavcodec/aandcttab.c
index 0c5b573..97013d2 100644
--- a/libavcodec/aandcttab.c
+++ b/libavcodec/aandcttab.c
@@ -1,24 +1,24 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
- * AAN (Arai Agui Aakajima) (I)DCT tables
+ * AAN (Arai, Agui and Nakajima) (I)DCT tables
  */
 
 #include <stdint.h>
diff --git a/libavcodec/aandcttab.h b/libavcodec/aandcttab.h
index daccb7b..b0a2f44 100644
--- a/libavcodec/aandcttab.h
+++ b/libavcodec/aandcttab.h
@@ -1,24 +1,24 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
- * AAN (Arai Agui Nakajima) (I)DCT tables
+ * AAN (Arai, Agui and Nakajima) (I)DCT tables
  */
 
 #ifndef AVCODEC_AANDCTTAB_H
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 764bedc..36d9d34 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -7,13 +7,12 @@ OBJS-$(CONFIG_H264PRED)                 += aarch64/h264pred_init.o
 OBJS-$(CONFIG_H264QPEL)                 += aarch64/h264qpel_init_aarch64.o
 OBJS-$(CONFIG_HPELDSP)                  += aarch64/hpeldsp_init_aarch64.o
 OBJS-$(CONFIG_IMDCT15)                  += aarch64/imdct15_init.o
-OBJS-$(CONFIG_MDCT)                     += aarch64/mdct_init.o
 OBJS-$(CONFIG_MPEGAUDIODSP)             += aarch64/mpegaudiodsp_init.o
 OBJS-$(CONFIG_NEON_CLOBBER_TEST)        += aarch64/neontest.o
 OBJS-$(CONFIG_VIDEODSP)                 += aarch64/videodsp_init.o
 
 # decoders/encoders
-OBJS-$(CONFIG_DCA_DECODER)              += aarch64/dcadsp_init.o
+OBJS-$(CONFIG_DCA_DECODER)              += aarch64/synth_filter_init.o
 OBJS-$(CONFIG_RV40_DECODER)             += aarch64/rv40dsp_init_aarch64.o
 OBJS-$(CONFIG_VC1_DECODER)              += aarch64/vc1dsp_init_aarch64.o
 OBJS-$(CONFIG_VORBIS_DECODER)           += aarch64/vorbisdsp_init.o
@@ -40,6 +39,5 @@ NEON-OBJS-$(CONFIG_MDCT)                += aarch64/mdct_neon.o
 NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
 
 # decoders/encoders
-NEON-OBJS-$(CONFIG_DCA_DECODER)         += aarch64/dcadsp_neon.o               \
-                                           aarch64/synth_filter_neon.o
+NEON-OBJS-$(CONFIG_DCA_DECODER)         += aarch64/synth_filter_neon.o
 NEON-OBJS-$(CONFIG_VORBIS_DECODER)      += aarch64/vorbisdsp_neon.o
diff --git a/libavcodec/aarch64/asm-offsets.h b/libavcodec/aarch64/asm-offsets.h
index 60e32dd..e05c5ad 100644
--- a/libavcodec/aarch64/asm-offsets.h
+++ b/libavcodec/aarch64/asm-offsets.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/cabac.h b/libavcodec/aarch64/cabac.h
index e12953e..6b9b77e 100644
--- a/libavcodec/aarch64/cabac.h
+++ b/libavcodec/aarch64/cabac.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/dcadsp_neon.S b/libavcodec/aarch64/dcadsp_neon.S
deleted file mode 100644
index 4cd3328..0000000
--- a/libavcodec/aarch64/dcadsp_neon.S
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
- * Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/aarch64/asm.S"
-
-function ff_dca_lfe_fir0_neon, export=1
-        mov             x3,  #32                // decifactor
-        sub             x1,  x1,  #7*4
-        add             x4,  x0,  #2*32*4 - 16  // out2
-        mov             x7,  #-16
-
-        ld1             {v0.4s,v1.4s}, [x1]
-        // reverse [-num_coeffs + 1, 0]
-        ext             v3.16b, v0.16b, v0.16b, #8
-        ext             v2.16b, v1.16b, v1.16b, #8
-        rev64           v3.4s,  v3.4s
-        rev64           v2.4s,  v2.4s
-1:
-        ld1             {v4.4s,v5.4s}, [x2], #32
-        ld1             {v6.4s,v7.4s}, [x2], #32
-        subs            x3,  x3,  #4
-        fmul            v16.4s, v2.4s,  v4.4s
-        fmul            v23.4s, v0.4s,  v4.4s
-        fmul            v17.4s, v2.4s,  v6.4s
-        fmul            v22.4s, v0.4s,  v6.4s
-
-        fmla            v16.4s, v3.4s,  v5.4s
-        fmla            v23.4s, v1.4s,  v5.4s
-        ld1             {v4.4s,v5.4s}, [x2], #32
-        fmla            v17.4s, v3.4s,  v7.4s
-        fmla            v22.4s, v1.4s,  v7.4s
-        ld1             {v6.4s,v7.4s}, [x2], #32
-        fmul            v18.4s, v2.4s,  v4.4s
-        fmul            v21.4s, v0.4s,  v4.4s
-        fmul            v19.4s, v2.4s,  v6.4s
-        fmul            v20.4s, v0.4s,  v6.4s
-
-        fmla            v18.4s, v3.4s,  v5.4s
-        fmla            v21.4s, v1.4s,  v5.4s
-        fmla            v19.4s, v3.4s,  v7.4s
-        fmla            v20.4s, v1.4s,  v7.4s
-
-        faddp           v16.4s, v16.4s, v17.4s
-        faddp           v18.4s, v18.4s, v19.4s
-        faddp           v20.4s, v20.4s, v21.4s
-        faddp           v22.4s, v22.4s, v23.4s
-        faddp           v16.4s, v16.4s, v18.4s
-        faddp           v20.4s, v20.4s, v22.4s
-
-        st1             {v16.4s}, [x0], #16
-        st1             {v20.4s}, [x4], x7
-        b.gt            1b
-
-        ret
-endfunc
-
-function ff_dca_lfe_fir1_neon, export=1
-        mov             x3,  #64                // decifactor
-        sub             x1,  x1,  #3*4
-        add             x4,  x0,  #2*64*4 - 16  // out2
-        mov             x7,  #-16
-
-        ld1             {v0.4s}, [x1]
-        // reverse [-num_coeffs + 1, 0]
-        ext             v1.16b, v0.16b, v0.16b, #8
-        rev64           v1.4s,  v1.4s
-
-1:
-        ld1             {v4.4s,v5.4s}, [x2], #32
-        ld1             {v6.4s,v7.4s}, [x2], #32
-        subs            x3,  x3,  #4
-        fmul            v16.4s, v1.4s,  v4.4s
-        fmul            v23.4s, v0.4s,  v4.4s
-        fmul            v17.4s, v1.4s,  v5.4s
-        fmul            v22.4s, v0.4s,  v5.4s
-        fmul            v18.4s, v1.4s,  v6.4s
-        fmul            v21.4s, v0.4s,  v6.4s
-        fmul            v19.4s, v1.4s,  v7.4s
-        fmul            v20.4s, v0.4s,  v7.4s
-        faddp           v16.4s, v16.4s, v17.4s
-        faddp           v18.4s, v18.4s, v19.4s
-        faddp           v20.4s, v20.4s, v21.4s
-        faddp           v22.4s, v22.4s, v23.4s
-        faddp           v16.4s, v16.4s, v18.4s
-        faddp           v20.4s, v20.4s, v22.4s
-        st1             {v16.4s}, [x0], #16
-        st1             {v20.4s}, [x4], x7
-        b.gt            1b
-
-        ret
-endfunc
diff --git a/libavcodec/aarch64/fft_init_aarch64.c b/libavcodec/aarch64/fft_init_aarch64.c
index 9cc57d3..db28520 100644
--- a/libavcodec/aarch64/fft_init_aarch64.c
+++ b/libavcodec/aarch64/fft_init_aarch64.c
@@ -1,23 +1,25 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "config.h"
+
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/aarch64/cpu.h"
@@ -27,6 +29,10 @@
 void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
 
+void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+
 av_cold void ff_fft_init_aarch64(FFTContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -34,5 +40,11 @@ av_cold void ff_fft_init_aarch64(FFTContext *s)
     if (have_neon(cpu_flags)) {
         s->fft_permute  = ff_fft_permute_neon;
         s->fft_calc     = ff_fft_calc_neon;
+#if CONFIG_MDCT /* install the NEON (I)MDCT entry points only when MDCT is configured */
+        s->imdct_calc   = ff_imdct_calc_neon;
+        s->imdct_half   = ff_imdct_half_neon;
+        s->mdct_calc    = ff_mdct_calc_neon;
+        s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
+#endif /* CONFIG_MDCT */
     }
 }
diff --git a/libavcodec/aarch64/fft_neon.S b/libavcodec/aarch64/fft_neon.S
index e205e23..862039f 100644
--- a/libavcodec/aarch64/fft_neon.S
+++ b/libavcodec/aarch64/fft_neon.S
@@ -8,20 +8,20 @@
  * This algorithm (though not any of the implementation details) is
  * based on libdjbfft by D. J. Bernstein.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/fmtconvert_init.c b/libavcodec/aarch64/fmtconvert_init.c
index 0a55a1b..210e74b 100644
--- a/libavcodec/aarch64/fmtconvert_init.c
+++ b/libavcodec/aarch64/fmtconvert_init.c
@@ -1,20 +1,20 @@
 /*
  * ARM optimized Format Conversion Utils
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/fmtconvert_neon.S b/libavcodec/aarch64/fmtconvert_neon.S
index 3b33c87..2161c3a 100644
--- a/libavcodec/aarch64/fmtconvert_neon.S
+++ b/libavcodec/aarch64/fmtconvert_neon.S
@@ -3,20 +3,20 @@
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2015 Janne Grunau  <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/h264chroma_init_aarch64.c b/libavcodec/aarch64/h264chroma_init_aarch64.c
index c7679ab..2af62be 100644
--- a/libavcodec/aarch64/h264chroma_init_aarch64.c
+++ b/libavcodec/aarch64/h264chroma_init_aarch64.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised H.264 chroma functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/h264cmc_neon.S b/libavcodec/aarch64/h264cmc_neon.S
index d1025c7..486079f 100644
--- a/libavcodec/aarch64/h264cmc_neon.S
+++ b/libavcodec/aarch64/h264cmc_neon.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
index b106f11..e0f378f 100644
--- a/libavcodec/aarch64/h264dsp_init_aarch64.c
+++ b/libavcodec/aarch64/h264dsp_init_aarch64.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -78,6 +78,7 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
         c->h264_v_loop_filter_luma   = ff_h264_v_loop_filter_luma_neon;
         c->h264_h_loop_filter_luma   = ff_h264_h_loop_filter_luma_neon;
         c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
+        if (chroma_format_idc <= 1)
         c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
 
         c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S
index 9b4610a..4ec35f2 100644
--- a/libavcodec/aarch64/h264dsp_neon.S
+++ b/libavcodec/aarch64/h264dsp_neon.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/h264idct_neon.S b/libavcodec/aarch64/h264idct_neon.S
index 5395e14..fa414f7 100644
--- a/libavcodec/aarch64/h264idct_neon.S
+++ b/libavcodec/aarch64/h264idct_neon.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/h264pred_init.c b/libavcodec/aarch64/h264pred_init.c
index 8f912cb..b144376 100644
--- a/libavcodec/aarch64/h264pred_init.c
+++ b/libavcodec/aarch64/h264pred_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S
index a38a27f..213b40b 100644
--- a/libavcodec/aarch64/h264pred_neon.S
+++ b/libavcodec/aarch64/h264pred_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/h264qpel_init_aarch64.c b/libavcodec/aarch64/h264qpel_init_aarch64.c
index 74088b2..77f41d9 100644
--- a/libavcodec/aarch64/h264qpel_init_aarch64.c
+++ b/libavcodec/aarch64/h264qpel_init_aarch64.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised DSP functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/h264qpel_neon.S b/libavcodec/aarch64/h264qpel_neon.S
index 731dc06..d27cfac 100644
--- a/libavcodec/aarch64/h264qpel_neon.S
+++ b/libavcodec/aarch64/h264qpel_neon.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/hpeldsp_init_aarch64.c b/libavcodec/aarch64/hpeldsp_init_aarch64.c
index 6bc4c09..144ae2b 100644
--- a/libavcodec/aarch64/hpeldsp_init_aarch64.c
+++ b/libavcodec/aarch64/hpeldsp_init_aarch64.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised DSP functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/hpeldsp_neon.S b/libavcodec/aarch64/hpeldsp_neon.S
index 2978290..a491c17 100644
--- a/libavcodec/aarch64/hpeldsp_neon.S
+++ b/libavcodec/aarch64/hpeldsp_neon.S
@@ -3,20 +3,20 @@
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/imdct15_init.c b/libavcodec/aarch64/imdct15_init.c
index 38018f2..58af9f0 100644
--- a/libavcodec/aarch64/imdct15_init.c
+++ b/libavcodec/aarch64/imdct15_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/imdct15_neon.S b/libavcodec/aarch64/imdct15_neon.S
index d99edf4..97e1442 100644
--- a/libavcodec/aarch64/imdct15_neon.S
+++ b/libavcodec/aarch64/imdct15_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/mdct_init.c b/libavcodec/aarch64/mdct_init.c
deleted file mode 100644
index 816111a..0000000
--- a/libavcodec/aarch64/mdct_init.c
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/aarch64/cpu.h"
-
-#include "libavcodec/fft.h"
-
-void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-
-av_cold void ff_mdct_init_aarch64(FFTContext *s)
-{
-    int cpu_flags = av_get_cpu_flags();
-
-    if (have_neon(cpu_flags)) {
-        s->imdct_calc   = ff_imdct_calc_neon;
-        s->imdct_half   = ff_imdct_half_neon;
-        s->mdct_calc    = ff_mdct_calc_neon;
-        s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
-    }
-}
diff --git a/libavcodec/aarch64/mdct_neon.S b/libavcodec/aarch64/mdct_neon.S
index bccd832..1fd199c 100644
--- a/libavcodec/aarch64/mdct_neon.S
+++ b/libavcodec/aarch64/mdct_neon.S
@@ -3,20 +3,20 @@
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/mpegaudiodsp_init.c b/libavcodec/aarch64/mpegaudiodsp_init.c
index a8b2baf..b945146 100644
--- a/libavcodec/aarch64/mpegaudiodsp_init.c
+++ b/libavcodec/aarch64/mpegaudiodsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/mpegaudiodsp_neon.S b/libavcodec/aarch64/mpegaudiodsp_neon.S
index c1edc64..97d2e1f 100644
--- a/libavcodec/aarch64/mpegaudiodsp_neon.S
+++ b/libavcodec/aarch64/mpegaudiodsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/neon.S b/libavcodec/aarch64/neon.S
index 377009e..0fddbec 100644
--- a/libavcodec/aarch64/neon.S
+++ b/libavcodec/aarch64/neon.S
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/neontest.c b/libavcodec/aarch64/neontest.c
index 0414829..6e41f37 100644
--- a/libavcodec/aarch64/neontest.c
+++ b/libavcodec/aarch64/neontest.c
@@ -2,20 +2,20 @@
  * check NEON registers for clobbers
  * Copyright (c) 2013 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/rv40dsp_init_aarch64.c b/libavcodec/aarch64/rv40dsp_init_aarch64.c
index 0bb404f..764bc1e 100644
--- a/libavcodec/aarch64/rv40dsp_init_aarch64.c
+++ b/libavcodec/aarch64/rv40dsp_init_aarch64.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/dcadsp_init.c b/libavcodec/aarch64/synth_filter_init.c
index d3430d0..767b011 100644
--- a/libavcodec/aarch64/dcadsp_init.c
+++ b/libavcodec/aarch64/synth_filter_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,8 +23,8 @@
 #include "libavutil/aarch64/cpu.h"
 #include "libavutil/attributes.h"
 #include "libavutil/internal.h"
-#include "libavcodec/dcadsp.h"
 #include "libavcodec/fft.h"
+#include "libavcodec/synth_filter.h"
 
 #include "asm-offsets.h"
 
@@ -32,25 +32,12 @@
 AV_CHECK_OFFSET(FFTContext, imdct_half, IMDCT_HALF);
 #endif
 
-void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs);
-void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs);
-
 void ff_synth_filter_float_neon(FFTContext *imdct,
                                 float *synth_buf_ptr, int *synth_buf_offset,
                                 float synth_buf2[32], const float window[512],
                                 float out[32], const float in[32],
                                 float scale);
 
-av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s)
-{
-    int cpu_flags = av_get_cpu_flags();
-
-    if (have_neon(cpu_flags)) {
-        s->lfe_fir[0] = ff_dca_lfe_fir0_neon;
-        s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
-    }
-}
-
 av_cold void ff_synth_filter_init_aarch64(SynthFilterContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
diff --git a/libavcodec/aarch64/synth_filter_neon.S b/libavcodec/aarch64/synth_filter_neon.S
index 9551bff..65551cb 100644
--- a/libavcodec/aarch64/synth_filter_neon.S
+++ b/libavcodec/aarch64/synth_filter_neon.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c b/libavcodec/aarch64/vc1dsp_init_aarch64.c
index 11cd81e..e59e55e 100644
--- a/libavcodec/aarch64/vc1dsp_init_aarch64.c
+++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/videodsp.S b/libavcodec/aarch64/videodsp.S
index 7ce5a7d..24067cc 100644
--- a/libavcodec/aarch64/videodsp.S
+++ b/libavcodec/aarch64/videodsp.S
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/videodsp_init.c b/libavcodec/aarch64/videodsp_init.c
index 59b697d..6f667a6 100644
--- a/libavcodec/aarch64/videodsp_init.c
+++ b/libavcodec/aarch64/videodsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/vorbisdsp_init.c b/libavcodec/aarch64/vorbisdsp_init.c
index 3559b54..c796f95 100644
--- a/libavcodec/aarch64/vorbisdsp_init.c
+++ b/libavcodec/aarch64/vorbisdsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/vorbisdsp_neon.S b/libavcodec/aarch64/vorbisdsp_neon.S
index 11f71f1..e76feeb 100644
--- a/libavcodec/aarch64/vorbisdsp_neon.S
+++ b/libavcodec/aarch64/vorbisdsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aasc.c b/libavcodec/aasc.c
index e65ea39..fd63aba 100644
--- a/libavcodec/aasc.c
+++ b/libavcodec/aasc.c
@@ -2,20 +2,20 @@
  * Autodesk RLE Decoder
  * Copyright (C) 2005 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,15 +36,39 @@ typedef struct AascContext {
     AVCodecContext *avctx;
     GetByteContext gb;
     AVFrame *frame;
+
+    uint32_t palette[AVPALETTE_COUNT];
+    int palette_size;
 } AascContext;
 
 static av_cold int aasc_decode_init(AVCodecContext *avctx)
 {
     AascContext *s = avctx->priv_data;
+    uint8_t *ptr;
+    int i;
 
     s->avctx = avctx;
-
-    avctx->pix_fmt = AV_PIX_FMT_BGR24;
+    switch (avctx->bits_per_coded_sample) {
+    case 8:
+        avctx->pix_fmt = AV_PIX_FMT_PAL8;
+
+        ptr = avctx->extradata;
+        s->palette_size = FFMIN(avctx->extradata_size, AVPALETTE_SIZE);
+        for (i = 0; i < s->palette_size / 4; i++) {
+            s->palette[i] = 0xFFU << 24 | AV_RL32(ptr);
+            ptr += 4;
+        }
+        break;
+    case 16:
+        avctx->pix_fmt = AV_PIX_FMT_RGB555LE;
+        break;
+    case 24:
+        avctx->pix_fmt = AV_PIX_FMT_BGR24;
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unsupported bit depth: %d\n", avctx->bits_per_coded_sample);
+        return -1;
+    }
 
     s->frame = av_frame_alloc();
     if (!s->frame)
@@ -60,27 +84,35 @@ static int aasc_decode_frame(AVCodecContext *avctx,
     const uint8_t *buf = avpkt->data;
     int buf_size       = avpkt->size;
     AascContext *s     = avctx->priv_data;
-    int compr, i, stride, ret;
+    int compr, i, stride, psize, ret;
 
-    if (buf_size < 4)
+    if (buf_size < 4) {
+        av_log(avctx, AV_LOG_ERROR, "frame too short\n");
         return AVERROR_INVALIDDATA;
+    }
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
     compr     = AV_RL32(buf);
     buf      += 4;
     buf_size -= 4;
+    psize = avctx->bits_per_coded_sample / 8;
+    switch (avctx->codec_tag) {
+    case MKTAG('A', 'A', 'S', '4'):
+        bytestream2_init(&s->gb, buf - 4, buf_size + 4);
+        ff_msrle_decode(avctx, s->frame, 8, &s->gb);
+        break;
+    case MKTAG('A', 'A', 'S', 'C'):
     switch (compr) {
     case 0:
-        stride = (avctx->width * 3 + 3) & ~3;
+        stride = (avctx->width * psize + psize) & ~psize;
         if (buf_size < stride * avctx->height)
             return AVERROR_INVALIDDATA;
         for (i = avctx->height - 1; i >= 0; i--) {
-            memcpy(s->frame->data[0] + i * s->frame->linesize[0], buf, avctx->width * 3);
+            memcpy(s->frame->data[0] + i * s->frame->linesize[0], buf, avctx->width * psize);
             buf += stride;
+            buf_size -= stride;
         }
         break;
     case 1:
@@ -91,6 +123,14 @@ static int aasc_decode_frame(AVCodecContext *avctx,
         av_log(avctx, AV_LOG_ERROR, "Unknown compression type %d\n", compr);
         return AVERROR_INVALIDDATA;
     }
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unknown FourCC: %X\n", avctx->codec_tag);
+        return -1;
+    }
+
+    if (avctx->pix_fmt == AV_PIX_FMT_PAL8)
+        memcpy(s->frame->data[1], s->palette, s->palette_size);
 
     *got_frame = 1;
     if ((ret = av_frame_ref(data, s->frame)) < 0)
diff --git a/libavcodec/ac3.c b/libavcodec/ac3.c
index 99e5b50..1d4eaa5 100644
--- a/libavcodec/ac3.c
+++ b/libavcodec/ac3.c
@@ -2,20 +2,20 @@
  * Common code between the AC-3 encoder and decoder
  * Copyright (c) 2000 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,8 +39,6 @@ const uint8_t ff_ac3_band_start_tab[AC3_CRITICAL_BANDS+1] = {
      79,  85, 97, 109, 121, 133, 157, 181, 205, 229, 253
 };
 
-#if CONFIG_HARDCODED_TABLES
-
 /**
  * Map each frequency coefficient bin to the critical band that contains it.
  */
@@ -69,10 +67,6 @@ const uint8_t ff_ac3_bin_to_band_tab[253] = {
     49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49
 };
 
-#else /* CONFIG_HARDCODED_TABLES */
-uint8_t ff_ac3_bin_to_band_tab[253];
-#endif
-
 static inline int calc_lowcomp1(int a, int b0, int b1, int c)
 {
     if ((b0 + 256) == b1) {
@@ -131,6 +125,9 @@ int ff_ac3_bit_alloc_calc_mask(AC3BitAllocParameters *s, int16_t *band_psd,
     int band_start, band_end, begin, end1;
     int lowcomp, fastleak, slowleak;
 
+    if (end <= 0)
+        return AVERROR_INVALIDDATA;
+
     /* excitation function */
     band_start = ff_ac3_bin_to_band_tab[start];
     band_end   = ff_ac3_bin_to_band_tab[end-1] + 1;
@@ -200,9 +197,9 @@ int ff_ac3_bit_alloc_calc_mask(AC3BitAllocParameters *s, int16_t *band_psd,
             if (band >= AC3_CRITICAL_BANDS || dba_lengths[seg] > AC3_CRITICAL_BANDS-band)
                 return -1;
             if (dba_values[seg] >= 4) {
-                delta = (dba_values[seg] - 3) << 7;
+                delta = (dba_values[seg] - 3) * 128;
             } else {
-                delta = (dba_values[seg] - 4) << 7;
+                delta = (dba_values[seg] - 4) * 128;
             }
             for (i = 0; i < dba_lengths[seg]; i++) {
                 mask[band++] += delta;
@@ -211,21 +208,3 @@ int ff_ac3_bit_alloc_calc_mask(AC3BitAllocParameters *s, int16_t *band_psd,
     }
     return 0;
 }
-
-/**
- * Initialize some tables.
- * note: This function must remain thread safe because it is called by the
- *       AVParser init code.
- */
-av_cold void ff_ac3_common_init(void)
-{
-#if !CONFIG_HARDCODED_TABLES
-    /* compute ff_ac3_bin_to_band_tab from ff_ac3_band_start_tab */
-    int bin = 0, band;
-    for (band = 0; band < AC3_CRITICAL_BANDS; band++) {
-        int band_end = ff_ac3_band_start_tab[band+1];
-        while (bin < band_end)
-            ff_ac3_bin_to_band_tab[bin++] = band;
-    }
-#endif /* !CONFIG_HARDCODED_TABLES */
-}
diff --git a/libavcodec/ac3.h b/libavcodec/ac3.h
index f2cb6c3..747f2f5 100644
--- a/libavcodec/ac3.h
+++ b/libavcodec/ac3.h
@@ -2,20 +2,20 @@
  * Common code between the AC-3 encoder and decoder
  * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,6 +39,8 @@
 #define AC3_CRITICAL_BANDS 50
 #define AC3_MAX_CPL_BANDS  18
 
+#include "libavutil/opt.h"
+#include "avcodec.h"
 #include "ac3tab.h"
 
 /* exponent encoding strategy */
@@ -49,11 +51,59 @@
 #define EXP_D25   2
 #define EXP_D45   3
 
+#ifndef USE_FIXED
+#define USE_FIXED 0
+#endif
+/* USE_FIXED defaults to 0 (float build) unless the includer defines it. */
+#if USE_FIXED
+/* Fixed-point build: INTFLOAT is int and constants are pre-scaled integers. */
+#define FFT_FLOAT 0
+/* NOTE(review): FIXR scales by 0, so it yields 0 for any finite argument. */
+#define FIXR(a)                 ((int)((a) * 0 + 0.5))
+#define FIXR12(a)               ((int)((a) * 4096 + 0.5))
+#define FIXR15(a)               ((int)((a) * 32768 + 0.5))
+#define ROUND15(x)              (((x) + 16384) >> 15)
+
+#define AC3_RENAME(x)           x ## _fixed
+#define AC3_NORM(norm)          ((1<<24)/(norm))
+#define AC3_MUL(a,b)            ((((int64_t) (a)) * (b))>>12)
+#define AC3_RANGE(x)            ((x)|(((x)&128)<<1))
+#define AC3_HEAVY_RANGE(x)      ((x)<<1)
+#define AC3_DYNAMIC_RANGE(x)    (x)
+#define AC3_SPX_BLEND(x)        (x)
+#define AC3_DYNAMIC_RANGE1      0
+
+typedef int                     INTFLOAT;
+typedef int16_t                 SHORTFLOAT;
+
+#else /* USE_FIXED */
+/* Float build: INTFLOAT is float and the FIXR* helpers are plain casts. */
+#define FIXR(x)                 ((float)(x))
+#define FIXR12(x)               ((float)(x))
+#define FIXR15(x)               ((float)(x))
+#define ROUND15(x)              (x)
+/* The *_RANGE macros below rely on tables and a variable `s` at the use site. */
+#define AC3_RENAME(x)           x
+#define AC3_NORM(norm)          (1.0f/(norm))
+#define AC3_MUL(a,b)            ((a) * (b))
+#define AC3_RANGE(x)            (dynamic_range_tab[(x)])
+#define AC3_HEAVY_RANGE(x)      (heavy_dynamic_range_tab[(x)])
+#define AC3_DYNAMIC_RANGE(x)    (powf((x), s->drc_scale))
+#define AC3_SPX_BLEND(x)        ((x) * (1.0f/32))
+#define AC3_DYNAMIC_RANGE1      1.0f
+
+typedef float                   INTFLOAT;
+typedef float                   SHORTFLOAT;
+
+#endif /* USE_FIXED */
+/* Scale x by 1/sqrt(2): 2^15-scaled and rounded when USE_FIXED, else float. */
+#define AC3_LEVEL(x)            ROUND15((x) * FIXR15(M_SQRT1_2))
+
 /* pre-defined gain values */
-#define LEVEL_PLUS_3DB          1.4142135623730950
+#define LEVEL_PLUS_3DB          M_SQRT2
 #define LEVEL_PLUS_1POINT5DB    1.1892071150027209
 #define LEVEL_MINUS_1POINT5DB   0.8408964152537145
-#define LEVEL_MINUS_3DB         0.7071067811865476
+#define LEVEL_MINUS_3DB         M_SQRT1_2
 #define LEVEL_MINUS_4POINT5DB   0.5946035575013605
 #define LEVEL_MINUS_6DB         0.5000000000000000
 #define LEVEL_MINUS_9DB         0.3535533905932738
diff --git a/libavcodec/ac3_parser.c b/libavcodec/ac3_parser.c
index 9704848..83dd90f 100644
--- a/libavcodec/ac3_parser.c
+++ b/libavcodec/ac3_parser.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,9 +47,16 @@ static const uint8_t center_levels[4] = { 4, 5, 6, 5 };
 static const uint8_t surround_levels[4] = { 4, 6, 7, 6 };
 
 
-int avpriv_ac3_parse_header(GetBitContext *gbc, AC3HeaderInfo *hdr)
+int avpriv_ac3_parse_header(GetBitContext *gbc, AC3HeaderInfo **phdr)
 {
     int frame_size_code;
+    AC3HeaderInfo *hdr;
+
+    if (!*phdr)
+        *phdr = av_mallocz(sizeof(AC3HeaderInfo));
+    if (!*phdr)
+        return AVERROR(ENOMEM);
+    hdr = *phdr;
 
     memset(hdr, 0, sizeof(*hdr));
 
@@ -133,8 +140,8 @@ int avpriv_ac3_parse_header(GetBitContext *gbc, AC3HeaderInfo *hdr)
         hdr->channel_mode = get_bits(gbc, 3);
         hdr->lfe_on = get_bits1(gbc);
 
-        hdr->bit_rate = (uint32_t)(8.0 * hdr->frame_size * hdr->sample_rate /
-                        (hdr->num_blocks * 256.0));
+        hdr->bit_rate = 8LL * hdr->frame_size * hdr->sample_rate /
+                        (hdr->num_blocks * 256);
         hdr->channels = ff_ac3_channels_tab[hdr->channel_mode] + hdr->lfe_on;
     }
     hdr->channel_layout = avpriv_ac3_channel_layout_tab[hdr->channel_mode];
@@ -152,11 +159,11 @@ static int ac3_sync(uint64_t state, AACAC3ParseContext *hdr_info,
         uint64_t u64;
         uint8_t  u8[8 + AV_INPUT_BUFFER_PADDING_SIZE];
     } tmp = { av_be2ne64(state) };
-    AC3HeaderInfo hdr;
+    AC3HeaderInfo hdr, *phdr = &hdr;
     GetBitContext gbc;
 
     init_get_bits(&gbc, tmp.u8+8-AC3_HEADER_SIZE, 54);
-    err = avpriv_ac3_parse_header(&gbc, &hdr);
+    err = avpriv_ac3_parse_header(&gbc, &phdr);
 
     if(err < 0)
         return 0;
diff --git a/libavcodec/ac3_parser.h b/libavcodec/ac3_parser.h
index 9322550..dc5d035 100644
--- a/libavcodec/ac3_parser.h
+++ b/libavcodec/ac3_parser.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,11 +31,12 @@
  * Parse the header up to the lfeon element, which is the first 52 or 54 bits
  * depending on the audio coding mode.
  * @param[in]  gbc BitContext containing the first 54 bits of the frame.
- * @param[out] hdr Pointer to struct where header info is written.
+ * @param[out] hdr Pointer to a pointer to the struct where header info is
+ *                 written; the struct is allocated if *hdr is NULL.
  * @return Returns 0 on success, -1 if there is a sync word mismatch,
  * -2 if the bsid (version) element is invalid, -3 if the fscod (sample rate)
  * element is invalid, or -4 if the frmsizecod (bit rate) element is invalid.
  */
-int avpriv_ac3_parse_header(GetBitContext *gbc, AC3HeaderInfo *hdr);
+int avpriv_ac3_parse_header(GetBitContext *gbc, AC3HeaderInfo **hdr);
 
 #endif /* AVCODEC_AC3_PARSER_H */
diff --git a/libavcodec/ac3dec.c b/libavcodec/ac3dec.c
index 9b08638..fac189b 100644
--- a/libavcodec/ac3dec.c
+++ b/libavcodec/ac3dec.c
@@ -7,20 +7,20 @@
  * Copyright (c) 2007-2008 Bartlomiej Wolowiec <bartek.wolowiec@gmail.com>
  * Copyright (c) 2007 Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -65,6 +65,7 @@ static const uint8_t quantization_tab[16] = {
 
 /** dynamic range table. converts codes to scale factors. */
 static float dynamic_range_tab[256];
+static float heavy_dynamic_range_tab[256];
 
 /** Adjustments in dB gain */
 static const float gain_levels[9] = {
@@ -111,7 +112,7 @@ static const uint8_t ac3_default_coeffs[8][5][2] = {
 static inline int
 symmetric_dequant(int code, int levels)
 {
-    return ((code - (levels >> 1)) << 24) / levels;
+    return ((code - (levels >> 1)) * (1 << 24)) / levels;
 }
 
 /*
@@ -164,6 +165,14 @@ static av_cold void ac3_tables_init(void)
         int v = (i >> 5) - ((i >> 7) << 3) - 5;
         dynamic_range_tab[i] = powf(2.0f, v) * ((i & 0x1F) | 0x20);
     }
+
+    /* generate the heavy compression (compr) dynamic range table
+       reference: Section 7.7.2 Heavy Compression */
+    for (i = 0; i < 256; i++) {
+        int v = (i >> 4) - ((i >> 7) << 4) - 4;
+        heavy_dynamic_range_tab[i] = powf(2.0f, v) * ((i & 0xF) | 0x10);
+    }
+
 }
 
 /**
@@ -176,18 +185,26 @@ static av_cold int ac3_decode_init(AVCodecContext *avctx)
 
     s->avctx = avctx;
 
-    ff_ac3_common_init();
     ac3_tables_init();
     ff_mdct_init(&s->imdct_256, 8, 1, 1.0);
     ff_mdct_init(&s->imdct_512, 9, 1, 1.0);
-    ff_kbd_window_init(s->window, 5.0, 256);
+    AC3_RENAME(ff_kbd_window_init)(s->window, 5.0, 256);
     ff_bswapdsp_init(&s->bdsp);
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
-    ff_ac3dsp_init(&s->ac3dsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+
+#if (USE_FIXED)
+    s->fdsp = avpriv_alloc_fixed_dsp(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+#else
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
     ff_fmt_convert_init(&s->fmt_conv, avctx);
+#endif
+
+    ff_ac3dsp_init(&s->ac3dsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
     av_lfg_init(&s->dith_state, 0);
 
-    avctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
+    if (USE_FIXED)
+        avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
+    else
+        avctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
 
     /* allow downmixing to stereo or mono */
     if (avctx->channels > 1 &&
@@ -219,9 +236,19 @@ static int ac3_parse_header(AC3DecodeContext *s)
     /* read the rest of the bsi. read twice for dual mono mode. */
     i = !s->channel_mode;
     do {
-        skip_bits(gbc, 5); // skip dialog normalization
-        if (get_bits1(gbc))
-            skip_bits(gbc, 8); //skip compression
+        s->dialog_normalization[(!s->channel_mode)-i] = -get_bits(gbc, 5);
+        if (s->dialog_normalization[(!s->channel_mode)-i] == 0) {
+            s->dialog_normalization[(!s->channel_mode)-i] = -31;
+        }
+        if (s->target_level != 0) {
+            s->level_gain[(!s->channel_mode)-i] = powf(2.0f,
+                (float)(s->target_level -
+                s->dialog_normalization[(!s->channel_mode)-i])/6.0f);
+        }
+        if (s->compression_exists[(!s->channel_mode)-i] = get_bits1(gbc)) {
+            s->heavy_dynamic_range[(!s->channel_mode)-i] =
+                AC3_HEAVY_RANGE(get_bits(gbc, 8));
+        }
         if (get_bits1(gbc))
             skip_bits(gbc, 8); //skip language code
         if (get_bits1(gbc))
@@ -267,10 +294,10 @@ static int ac3_parse_header(AC3DecodeContext *s)
  */
 static int parse_frame_header(AC3DecodeContext *s)
 {
-    AC3HeaderInfo hdr;
+    AC3HeaderInfo hdr, *phdr=&hdr;
     int err;
 
-    err = avpriv_ac3_parse_header(&s->gbc, &hdr);
+    err = avpriv_ac3_parse_header(&s->gbc, &phdr);
     if (err)
         return err;
 
@@ -338,40 +365,45 @@ static void set_downmix_coeffs(AC3DecodeContext *s)
     float cmix = gain_levels[s->  center_mix_level];
     float smix = gain_levels[s->surround_mix_level];
     float norm0, norm1;
+    float downmix_coeffs[AC3_MAX_CHANNELS][2];
 
     for (i = 0; i < s->fbw_channels; i++) {
-        s->downmix_coeffs[i][0] = gain_levels[ac3_default_coeffs[s->channel_mode][i][0]];
-        s->downmix_coeffs[i][1] = gain_levels[ac3_default_coeffs[s->channel_mode][i][1]];
+        downmix_coeffs[i][0] = gain_levels[ac3_default_coeffs[s->channel_mode][i][0]];
+        downmix_coeffs[i][1] = gain_levels[ac3_default_coeffs[s->channel_mode][i][1]];
     }
     if (s->channel_mode > 1 && s->channel_mode & 1) {
-        s->downmix_coeffs[1][0] = s->downmix_coeffs[1][1] = cmix;
+        downmix_coeffs[1][0] = downmix_coeffs[1][1] = cmix;
     }
     if (s->channel_mode == AC3_CHMODE_2F1R || s->channel_mode == AC3_CHMODE_3F1R) {
         int nf = s->channel_mode - 2;
-        s->downmix_coeffs[nf][0] = s->downmix_coeffs[nf][1] = smix * LEVEL_MINUS_3DB;
+        downmix_coeffs[nf][0] = downmix_coeffs[nf][1] = smix * LEVEL_MINUS_3DB;
     }
     if (s->channel_mode == AC3_CHMODE_2F2R || s->channel_mode == AC3_CHMODE_3F2R) {
         int nf = s->channel_mode - 4;
-        s->downmix_coeffs[nf][0] = s->downmix_coeffs[nf+1][1] = smix;
+        downmix_coeffs[nf][0] = downmix_coeffs[nf+1][1] = smix;
     }
 
     /* renormalize */
     norm0 = norm1 = 0.0;
     for (i = 0; i < s->fbw_channels; i++) {
-        norm0 += s->downmix_coeffs[i][0];
-        norm1 += s->downmix_coeffs[i][1];
+        norm0 += downmix_coeffs[i][0];
+        norm1 += downmix_coeffs[i][1];
     }
     norm0 = 1.0f / norm0;
     norm1 = 1.0f / norm1;
     for (i = 0; i < s->fbw_channels; i++) {
-        s->downmix_coeffs[i][0] *= norm0;
-        s->downmix_coeffs[i][1] *= norm1;
+        downmix_coeffs[i][0] *= norm0;
+        downmix_coeffs[i][1] *= norm1;
     }
 
     if (s->output_mode == AC3_CHMODE_MONO) {
         for (i = 0; i < s->fbw_channels; i++)
-            s->downmix_coeffs[i][0] = (s->downmix_coeffs[i][0] +
-                                       s->downmix_coeffs[i][1]) * LEVEL_MINUS_3DB;
+            downmix_coeffs[i][0] = (downmix_coeffs[i][0] +
+                                    downmix_coeffs[i][1]) * LEVEL_MINUS_3DB;
+    }
+    for (i = 0; i < s->fbw_channels; i++) {
+        s->downmix_coeffs[i][0] = FIXR12(downmix_coeffs[i][0]);
+        s->downmix_coeffs[i][1] = FIXR12(downmix_coeffs[i][1]);
     }
 }
 
@@ -379,7 +411,8 @@ static void set_downmix_coeffs(AC3DecodeContext *s)
  * Decode the grouped exponents according to exponent strategy.
  * reference: Section 7.1.3 Exponent Decoding
  */
-static int decode_exponents(GetBitContext *gbc, int exp_strategy, int ngrps,
+static int decode_exponents(AC3DecodeContext *s,
+                            GetBitContext *gbc, int exp_strategy, int ngrps,
                             uint8_t absexp, int8_t *dexps)
 {
     int i, j, grp, group_size;
@@ -399,8 +432,10 @@ static int decode_exponents(GetBitContext *gbc, int exp_strategy, int ngrps,
     prevexp = absexp;
     for (i = 0, j = 0; i < ngrps * 3; i++) {
         prevexp += dexp[i] - 2;
-        if (prevexp > 24U)
+        if (prevexp > 24U) {
+            av_log(s->avctx, AV_LOG_ERROR, "exponent %d is out-of-range\n", prevexp);
             return -1;
+        }
         switch (group_size) {
         case 4: dexps[j++] = prevexp;
                 dexps[j++] = prevexp;
@@ -429,7 +464,7 @@ static void calc_transform_coeffs_cpl(AC3DecodeContext *s)
                 int cpl_coord = s->cpl_coords[ch][band] << 5;
                 for (bin = band_start; bin < band_end; bin++) {
                     s->fixed_coeffs[ch][bin] =
-                        MULH(s->fixed_coeffs[CPL_CH][bin] << 4, cpl_coord);
+                        MULH(s->fixed_coeffs[CPL_CH][bin] * (1 << 4), cpl_coord);
                 }
                 if (ch == 2 && s->phase_flags[band]) {
                     for (bin = band_start; bin < band_end; bin++)
@@ -475,7 +510,7 @@ static void ac3_decode_transform_coeffs_ch(AC3DecodeContext *s, int ch_index, ma
         case 0:
             /* random noise with approximate range of -0.707 to 0.707 */
             if (dither)
-                mantissa = (av_lfg_get(&s->dith_state) / 362) - 5932275;
+                mantissa = (((av_lfg_get(&s->dith_state)>>8)*181)>>8) - 5931008;
             else
                 mantissa = 0;
             break;
@@ -522,8 +557,11 @@ static void ac3_decode_transform_coeffs_ch(AC3DecodeContext *s, int ch_index, ma
             break;
         default: /* 6 to 15 */
             /* Shift mantissa and sign-extend it. */
-            mantissa = get_sbits(gbc, quantization_tab[bap]);
-            mantissa <<= 24 - quantization_tab[bap];
+            if (bap > 15) {
+                av_log(s->avctx, AV_LOG_ERROR, "bap %d is invalid in plain AC-3\n", bap);
+                bap = 15;
+            }
+            mantissa = (unsigned)get_sbits(gbc, quantization_tab[bap]) << (24 - quantization_tab[bap]);
             break;
         }
         coeffs[freq] = mantissa >> exps[freq];
@@ -557,7 +595,7 @@ static void decode_transform_coeffs_ch(AC3DecodeContext *s, int blk, int ch,
         /* if AHT is used, mantissas for all blocks are encoded in the first
            block of the frame. */
         int bin;
-        if (!blk && CONFIG_EAC3_DECODER)
+        if (CONFIG_EAC3_DECODER && !blk)
             ff_eac3_decode_transform_coeffs_aht_ch(s, ch);
         for (bin = s->start_freq[ch]; bin < s->end_freq[ch]; bin++) {
             s->fixed_coeffs[ch][bin] = s->pre_mantissa[ch][bin][blk] >> s->dexps[ch][bin];
@@ -635,20 +673,30 @@ static inline void do_imdct(AC3DecodeContext *s, int channels)
     for (ch = 1; ch <= channels; ch++) {
         if (s->block_switch[ch]) {
             int i;
-            float *x = s->tmp_output + 128;
+            FFTSample *x = s->tmp_output + 128;
             for (i = 0; i < 128; i++)
                 x[i] = s->transform_coeffs[ch][2 * i];
             s->imdct_256.imdct_half(&s->imdct_256, s->tmp_output, x);
-            s->fdsp.vector_fmul_window(s->outptr[ch - 1], s->delay[ch - 1],
+#if USE_FIXED
+            s->fdsp->vector_fmul_window_scaled(s->outptr[ch - 1], s->delay[ch - 1],
+                                       s->tmp_output, s->window, 128, 8);
+#else
+            s->fdsp->vector_fmul_window(s->outptr[ch - 1], s->delay[ch - 1],
                                        s->tmp_output, s->window, 128);
+#endif
             for (i = 0; i < 128; i++)
                 x[i] = s->transform_coeffs[ch][2 * i + 1];
             s->imdct_256.imdct_half(&s->imdct_256, s->delay[ch - 1], x);
         } else {
             s->imdct_512.imdct_half(&s->imdct_512, s->tmp_output, s->transform_coeffs[ch]);
-            s->fdsp.vector_fmul_window(s->outptr[ch - 1], s->delay[ch - 1],
+#if USE_FIXED
+            s->fdsp->vector_fmul_window_scaled(s->outptr[ch - 1], s->delay[ch - 1],
+                                       s->tmp_output, s->window, 128, 8);
+#else
+            s->fdsp->vector_fmul_window(s->outptr[ch - 1], s->delay[ch - 1],
                                        s->tmp_output, s->window, 128);
-            memcpy(s->delay[ch - 1], s->tmp_output + 128, 128 * sizeof(float));
+#endif
+            memcpy(s->delay[ch - 1], s->tmp_output + 128, 128 * sizeof(FFTSample));
         }
     }
 }
@@ -783,13 +831,14 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
         if (get_bits1(gbc)) {
             /* Allow asymmetric application of DRC when drc_scale > 1.
                Amplification of quiet sounds is enhanced */
-            float range = dynamic_range_tab[get_bits(gbc, 8)];
-            if (range > 1.0 || s->drc_scale <= 1.0)
-                s->dynamic_range[i] = powf(range, s->drc_scale);
+            int range_bits = get_bits(gbc, 8);
+            INTFLOAT range = AC3_RANGE(range_bits);
+            if (range_bits <= 127 || s->drc_scale <= 1.0)
+                s->dynamic_range[i] = AC3_DYNAMIC_RANGE(range);
             else
                 s->dynamic_range[i] = range;
         } else if (blk == 0) {
-            s->dynamic_range[i] = 1.0f;
+            s->dynamic_range[i] = AC3_DYNAMIC_RANGE1;
         }
     } while (i--);
 
@@ -815,6 +864,9 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
             if (start_subband > 7)
                 start_subband += start_subband - 7;
             end_subband    = get_bits(gbc, 3) + 5;
+#if USE_FIXED
+            s->spx_dst_end_freq = end_freq_inv_tab[end_subband-5];
+#endif
             if (end_subband   > 7)
                 end_subband   += end_subband   - 7;
             dst_start_freq = dst_start_freq * 12 + 25;
@@ -835,18 +887,21 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
 
             s->spx_dst_start_freq = dst_start_freq;
             s->spx_src_start_freq = src_start_freq;
-            s->spx_dst_end_freq   = dst_end_freq;
+            if (!USE_FIXED)
+                s->spx_dst_end_freq   = dst_end_freq;
 
             decode_band_structure(gbc, blk, s->eac3, 0,
                                   start_subband, end_subband,
                                   ff_eac3_default_spx_band_struct,
                                   &s->num_spx_bands,
                                   s->spx_band_sizes);
-        } else {
-            for (ch = 1; ch <= fbw_channels; ch++) {
-                s->channel_uses_spx[ch] = 0;
-                s->first_spx_coords[ch] = 1;
-            }
+        }
+    }
+    if (!s->eac3 || !s->spx_in_use) {
+        s->spx_in_use = 0;
+        for (ch = 1; ch <= fbw_channels; ch++) {
+            s->channel_uses_spx[ch] = 0;
+            s->first_spx_coords[ch] = 1;
         }
     }
 
@@ -855,26 +910,47 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
         for (ch = 1; ch <= fbw_channels; ch++) {
             if (s->channel_uses_spx[ch]) {
                 if (s->first_spx_coords[ch] || get_bits1(gbc)) {
-                    float spx_blend;
+                    INTFLOAT spx_blend;
                     int bin, master_spx_coord;
 
                     s->first_spx_coords[ch] = 0;
-                    spx_blend = get_bits(gbc, 5) * (1.0f/32);
+                    spx_blend = AC3_SPX_BLEND(get_bits(gbc, 5));
                     master_spx_coord = get_bits(gbc, 2) * 3;
 
                     bin = s->spx_src_start_freq;
                     for (bnd = 0; bnd < s->num_spx_bands; bnd++) {
-                        int bandsize;
+                        int bandsize = s->spx_band_sizes[bnd];
                         int spx_coord_exp, spx_coord_mant;
-                        float nratio, sblend, nblend, spx_coord;
+                        INTFLOAT nratio, sblend, nblend;
+#if USE_FIXED
+                        /* calculate blending factors */
+                        int64_t accu = ((bin << 23) + (bandsize << 22))
+                                     * (int64_t)s->spx_dst_end_freq;
+                        nratio = (int)(accu >> 32);
+                        nratio -= spx_blend << 18;
+
+                        if (nratio < 0) {
+                            nblend = 0;
+                            sblend = 0x800000;
+                        } else if (nratio > 0x7fffff) {
+                            nblend = 14529495; // sqrt(3) in FP.23
+                            sblend = 0;
+                        } else {
+                            nblend = fixed_sqrt(nratio, 23);
+                            accu = (int64_t)nblend * 1859775393;
+                            nblend = (int)((accu + (1<<29)) >> 30);
+                            sblend = fixed_sqrt(0x800000 - nratio, 23);
+                        }
+#else
+                        float spx_coord;
 
                         /* calculate blending factors */
-                        bandsize = s->spx_band_sizes[bnd];
                         nratio = ((float)((bin + (bandsize >> 1))) / s->spx_dst_end_freq) - spx_blend;
                         nratio = av_clipf(nratio, 0.0f, 1.0f);
                         nblend = sqrtf(3.0f * nratio); // noise is scaled by sqrt(3)
                                                        // to give unity variance
                         sblend = sqrtf(1.0f - nratio);
+#endif
                         bin += bandsize;
 
                         /* decode spx coordinates */
@@ -883,11 +959,18 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
                         if (spx_coord_exp == 15) spx_coord_mant <<= 1;
                         else                     spx_coord_mant += 4;
                         spx_coord_mant <<= (25 - spx_coord_exp - master_spx_coord);
-                        spx_coord = spx_coord_mant * (1.0f / (1 << 23));
 
                         /* multiply noise and signal blending factors by spx coordinate */
+#if USE_FIXED
+                        accu = (int64_t)nblend * spx_coord_mant;
+                        s->spx_noise_blend[ch][bnd]  = (int)((accu + (1<<22)) >> 23);
+                        accu = (int64_t)sblend * spx_coord_mant;
+                        s->spx_signal_blend[ch][bnd] = (int)((accu + (1<<22)) >> 23);
+#else
+                        spx_coord = spx_coord_mant * (1.0f / (1 << 23));
                         s->spx_noise_blend [ch][bnd] = nblend * spx_coord;
                         s->spx_signal_blend[ch][bnd] = sblend * spx_coord;
+#endif
                     }
                 }
             } else {
@@ -1063,10 +1146,9 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
     for (ch = !cpl_in_use; ch <= s->channels; ch++) {
         if (s->exp_strategy[blk][ch] != EXP_REUSE) {
             s->dexps[ch][0] = get_bits(gbc, 4) << !ch;
-            if (decode_exponents(gbc, s->exp_strategy[blk][ch],
+            if (decode_exponents(s, gbc, s->exp_strategy[blk][ch],
                                  s->num_exp_groups[ch], s->dexps[ch][0],
                                  &s->dexps[ch][s->start_freq[ch]+!!ch])) {
-                av_log(s->avctx, AV_LOG_ERROR, "exponent out-of-range\n");
                 return AVERROR_INVALIDDATA;
             }
             if (ch != CPL_CH && ch != s->lfe_ch)
@@ -1244,18 +1326,28 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
 
     /* apply scaling to coefficients (headroom, dynrng) */
     for (ch = 1; ch <= s->channels; ch++) {
-        float gain = 1.0 / 4194304.0f;
-        if (s->channel_mode == AC3_CHMODE_DUALMONO) {
-            gain *= s->dynamic_range[2 - ch];
-        } else {
-            gain *= s->dynamic_range[0];
-        }
+        int audio_channel = 0;
+        INTFLOAT gain;
+        if (s->channel_mode == AC3_CHMODE_DUALMONO)
+            audio_channel = 2-ch;
+        if (s->heavy_compression && s->compression_exists[audio_channel])
+            gain = s->heavy_dynamic_range[audio_channel];
+        else
+            gain = s->dynamic_range[audio_channel];
+
+#if USE_FIXED
+        scale_coefs(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256);
+#else
+        if (s->target_level != 0)
+          gain = gain * s->level_gain[audio_channel];
+        gain *= 1.0 / 4194304.0f;
         s->fmt_conv.int32_to_float_fmul_scalar(s->transform_coeffs[ch],
                                                s->fixed_coeffs[ch], gain, 256);
+#endif
     }
 
     /* apply spectral extension to high frequency bins */
-    if (s->spx_in_use && CONFIG_EAC3_DECODER) {
+    if (CONFIG_EAC3_DECODER && s->spx_in_use) {
         ff_eac3_apply_spectral_extension(s);
     }
 
@@ -1276,19 +1368,24 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
         do_imdct(s, s->channels);
 
         if (downmix_output) {
+#if USE_FIXED
+            ac3_downmix_c_fixed16(s->outptr, s->downmix_coeffs,
+                              s->out_channels, s->fbw_channels, 256);
+#else
             s->ac3dsp.downmix(s->outptr, s->downmix_coeffs,
                               s->out_channels, s->fbw_channels, 256);
+#endif
         }
     } else {
         if (downmix_output) {
-            s->ac3dsp.downmix(s->xcfptr + 1, s->downmix_coeffs,
-                              s->out_channels, s->fbw_channels, 256);
+            s->ac3dsp.AC3_RENAME(downmix)(s->xcfptr + 1, s->downmix_coeffs,
+                                          s->out_channels, s->fbw_channels, 256);
         }
 
         if (downmix_output && !s->downmixed) {
             s->downmixed = 1;
-            s->ac3dsp.downmix(s->dlyptr, s->downmix_coeffs, s->out_channels,
-                              s->fbw_channels, 128);
+            s->ac3dsp.AC3_RENAME(downmix)(s->dlyptr, s->downmix_coeffs,
+                                          s->out_channels, s->fbw_channels, 128);
         }
 
         do_imdct(s, s->out_channels);
@@ -1309,7 +1406,7 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data,
     AC3DecodeContext *s = avctx->priv_data;
     int blk, ch, err, ret;
     const uint8_t *channel_map;
-    const float *output[AC3_MAX_CHANNELS];
+    const SHORTFLOAT *output[AC3_MAX_CHANNELS];
     enum AVMatrixEncoding matrix_encoding;
     AVDownmixInfo *downmix_info;
 
@@ -1324,7 +1421,8 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data,
         memcpy(s->input_buffer, buf, FFMIN(buf_size, AC3_FRAME_BUFFER_SIZE));
     buf = s->input_buffer;
     /* initialize the GetBitContext with the start of valid AC-3 Frame */
-    init_get_bits(&s->gbc, buf, buf_size * 8);
+    if ((ret = init_get_bits8(&s->gbc, buf, buf_size)) < 0)
+        return ret;
 
     /* parse the syncinfo */
     err = parse_frame_header(s);
@@ -1368,7 +1466,7 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data,
         if (s->frame_size > buf_size) {
             av_log(avctx, AV_LOG_ERROR, "incomplete frame\n");
             err = AAC_AC3_PARSE_ERROR_FRAME_SIZE;
-        } else if (avctx->err_recognition & AV_EF_CRCCHECK) {
+        } else if (avctx->err_recognition & (AV_EF_CRCCHECK|AV_EF_CAREFUL)) {
             /* check for crc mismatch */
             if (av_crc(av_crc_get_table(AV_CRC_16_ANSI), 0, &buf[2],
                        s->frame_size - 2)) {
@@ -1402,6 +1500,10 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data,
             s->output_mode  = AC3_CHMODE_STEREO;
         }
 
+        s->loro_center_mix_level   = gain_levels[s->  center_mix_level];
+        s->loro_surround_mix_level = gain_levels[s->surround_mix_level];
+        s->ltrt_center_mix_level   = LEVEL_MINUS_3DB;
+        s->ltrt_surround_mix_level = LEVEL_MINUS_3DB;
         /* set downmixing coefficients if needed */
         if (s->channels != s->out_channels && !((s->output_mode & AC3_OUTPUT_LFEON) &&
                 s->fbw_channels == s->out_channels)) {
@@ -1423,19 +1525,18 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = s->num_blocks * AC3_BLOCK_SIZE;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     /* decode the audio blocks */
     channel_map = ff_ac3_dec_channel_map[s->output_mode & ~AC3_OUTPUT_LFEON][s->lfe_on];
+    for (ch = 0; ch < AC3_MAX_CHANNELS; ch++) {
+        output[ch] = s->output[ch];
+        s->outptr[ch] = s->output[ch];
+    }
     for (ch = 0; ch < s->channels; ch++) {
         if (ch < s->out_channels)
-            s->outptr[channel_map[ch]] = (float *)frame->data[ch];
-        else
-            s->outptr[ch] = s->output[ch];
-        output[ch] = s->output[ch];
+            s->outptr[channel_map[ch]] = (SHORTFLOAT *)frame->data[ch];
     }
     for (blk = 0; blk < s->num_blocks; blk++) {
         if (!err && decode_audio_block(s, blk)) {
@@ -1444,16 +1545,20 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data,
         }
         if (err)
             for (ch = 0; ch < s->out_channels; ch++)
-                memcpy(s->outptr[channel_map[ch]], output[ch], sizeof(**output) * AC3_BLOCK_SIZE);
+                memcpy(((SHORTFLOAT*)frame->data[ch]) + AC3_BLOCK_SIZE*blk, output[ch], AC3_BLOCK_SIZE*sizeof(SHORTFLOAT));
         for (ch = 0; ch < s->out_channels; ch++)
             output[ch] = s->outptr[channel_map[ch]];
-        for (ch = 0; ch < s->out_channels; ch++)
-            s->outptr[ch] += AC3_BLOCK_SIZE;
+        for (ch = 0; ch < s->out_channels; ch++) {
+            if (!ch || channel_map[ch])
+                s->outptr[channel_map[ch]] += AC3_BLOCK_SIZE;
+        }
     }
 
+    av_frame_set_decode_error_flags(frame, err ? FF_DECODE_ERROR_INVALID_BITSTREAM : 0);
+
     /* keep last block for error concealment in next frame */
     for (ch = 0; ch < s->out_channels; ch++)
-        memcpy(s->output[ch], output[ch], sizeof(**output) * AC3_BLOCK_SIZE);
+        memcpy(s->output[ch], output[ch], AC3_BLOCK_SIZE*sizeof(SHORTFLOAT));
 
     /*
      * AVMatrixEncoding
@@ -1524,59 +1629,10 @@ static av_cold int ac3_decode_end(AVCodecContext *avctx)
     AC3DecodeContext *s = avctx->priv_data;
     ff_mdct_end(&s->imdct_512);
     ff_mdct_end(&s->imdct_256);
+    av_freep(&s->fdsp);
 
     return 0;
 }
 
 #define OFFSET(x) offsetof(AC3DecodeContext, x)
 #define PAR (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM)
-static const AVOption options[] = {
-    { "drc_scale", "percentage of dynamic range compression to apply", OFFSET(drc_scale), AV_OPT_TYPE_FLOAT, {.dbl = 1.0}, 0.0, 6.0, PAR },
-    { NULL},
-};
-
-static const AVClass ac3_decoder_class = {
-    .class_name = "AC3 decoder",
-    .item_name  = av_default_item_name,
-    .option     = options,
-    .version    = LIBAVUTIL_VERSION_INT,
-};
-
-AVCodec ff_ac3_decoder = {
-    .name           = "ac3",
-    .long_name      = NULL_IF_CONFIG_SMALL("ATSC A/52A (AC-3)"),
-    .type           = AVMEDIA_TYPE_AUDIO,
-    .id             = AV_CODEC_ID_AC3,
-    .priv_data_size = sizeof (AC3DecodeContext),
-    .init           = ac3_decode_init,
-    .close          = ac3_decode_end,
-    .decode         = ac3_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
-    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
-                                                      AV_SAMPLE_FMT_NONE },
-    .priv_class     = &ac3_decoder_class,
-};
-
-#if CONFIG_EAC3_DECODER
-static const AVClass eac3_decoder_class = {
-    .class_name = "E-AC3 decoder",
-    .item_name  = av_default_item_name,
-    .option     = options,
-    .version    = LIBAVUTIL_VERSION_INT,
-};
-
-AVCodec ff_eac3_decoder = {
-    .name           = "eac3",
-    .long_name      = NULL_IF_CONFIG_SMALL("ATSC A/52B (AC-3, E-AC-3)"),
-    .type           = AVMEDIA_TYPE_AUDIO,
-    .id             = AV_CODEC_ID_EAC3,
-    .priv_data_size = sizeof (AC3DecodeContext),
-    .init           = ac3_decode_init,
-    .close          = ac3_decode_end,
-    .decode         = ac3_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
-    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
-                                                      AV_SAMPLE_FMT_NONE },
-    .priv_class     = &eac3_decoder_class,
-};
-#endif
diff --git a/libavcodec/ac3dec.h b/libavcodec/ac3dec.h
index 4c5359c..c2b867e 100644
--- a/libavcodec/ac3dec.h
+++ b/libavcodec/ac3dec.h
@@ -2,20 +2,20 @@
  * Common code between the AC-3 and E-AC-3 decoders
  * Copyright (c) 2007 Bartlomiej Wolowiec <bartek.wolowiec@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -51,6 +51,7 @@
 #define AVCODEC_AC3DEC_H
 
 #include "libavutil/float_dsp.h"
+#include "libavutil/fixed_dsp.h"
 #include "libavutil/lfg.h"
 #include "ac3.h"
 #include "ac3dsp.h"
@@ -83,6 +84,9 @@ typedef struct AC3DecodeContext {
     int bitstream_mode;                     ///< bitstream mode                         (bsmod)
     int channel_mode;                       ///< channel mode                           (acmod)
     int lfe_on;                             ///< lfe channel in use
+    int dialog_normalization[2];            ///< dialog level in dBFS                   (dialnorm)
+    int compression_exists[2];              ///< compression field is valid for frame   (compre)
+    int compression_gain[2];                ///< gain to apply for heavy compression    (compr)
     int channel_map;                        ///< custom channel map
     int preferred_downmix;                  ///< Preferred 2-channel downmix mode       (dmixmod)
     int center_mix_level;                   ///< Center mix level index
@@ -99,6 +103,14 @@ typedef struct AC3DecodeContext {
     int dolby_headphone_mode;               ///< dolby headphone mode                   (dheadphonmod)
 ///@}
 
+    int preferred_stereo_downmix;
+    float ltrt_center_mix_level;
+    float ltrt_surround_mix_level;
+    float loro_center_mix_level;
+    float loro_surround_mix_level;
+    int target_level;                       ///< target level in dBFS
+    float level_gain[2];
+
 ///@name Frame syntax parameters
     int snr_offset_strategy;                ///< SNR offset strategy                    (snroffststr)
     int block_switch_syntax;                ///< block switch syntax enabled            (blkswe)
@@ -134,8 +146,8 @@ typedef struct AC3DecodeContext {
     int num_spx_bands;                          ///< number of spx bands                    (nspxbnds)
     uint8_t spx_band_sizes[SPX_MAX_BANDS];      ///< number of bins in each spx band
     uint8_t first_spx_coords[AC3_MAX_CHANNELS]; ///< first spx coordinates states           (firstspxcos)
-    float spx_noise_blend[AC3_MAX_CHANNELS][SPX_MAX_BANDS]; ///< spx noise blending factor  (nblendfact)
-    float spx_signal_blend[AC3_MAX_CHANNELS][SPX_MAX_BANDS];///< spx signal blending factor (sblendfact)
+    INTFLOAT spx_noise_blend[AC3_MAX_CHANNELS][SPX_MAX_BANDS]; ///< spx noise blending factor  (nblendfact)
+    INTFLOAT spx_signal_blend[AC3_MAX_CHANNELS][SPX_MAX_BANDS];///< spx signal blending factor (sblendfact)
 ///@}
 
 ///@name Adaptive hybrid transform
@@ -147,15 +159,17 @@ typedef struct AC3DecodeContext {
     int fbw_channels;                           ///< number of full-bandwidth channels
     int channels;                               ///< number of total channels
     int lfe_ch;                                 ///< index of LFE channel
-    float downmix_coeffs[AC3_MAX_CHANNELS][2];  ///< stereo downmix coefficients
+    SHORTFLOAT downmix_coeffs[AC3_MAX_CHANNELS][2];  ///< stereo downmix coefficients
     int downmixed;                              ///< indicates if coeffs are currently downmixed
     int output_mode;                            ///< output channel configuration
     int out_channels;                           ///< number of output channels
 ///@}
 
 ///@name Dynamic range
-    float dynamic_range[2];                 ///< dynamic range
-    float drc_scale;                        ///< percentage of dynamic range compression to be applied
+    INTFLOAT dynamic_range[2];                 ///< dynamic range
+    INTFLOAT drc_scale;                        ///< percentage of dynamic range compression to be applied
+    int heavy_compression;                     ///< apply heavy compression
+    INTFLOAT heavy_dynamic_range[2];           ///< heavy dynamic range compression
 ///@}
 
 ///@name Bandwidth
@@ -203,22 +217,26 @@ typedef struct AC3DecodeContext {
 
 ///@name Optimization
     BswapDSPContext bdsp;
-    AVFloatDSPContext fdsp;
+#if USE_FIXED
+    AVFixedDSPContext *fdsp;
+#else
+    AVFloatDSPContext *fdsp;
+#endif
     AC3DSPContext ac3dsp;
     FmtConvertContext fmt_conv;             ///< optimized conversion functions
 ///@}
 
-    float *outptr[AC3_MAX_CHANNELS];
-    float *xcfptr[AC3_MAX_CHANNELS];
-    float *dlyptr[AC3_MAX_CHANNELS];
+    SHORTFLOAT *outptr[AC3_MAX_CHANNELS];
+    INTFLOAT *xcfptr[AC3_MAX_CHANNELS];
+    INTFLOAT *dlyptr[AC3_MAX_CHANNELS];
 
 ///@name Aligned arrays
-    DECLARE_ALIGNED(16, int32_t, fixed_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS];     ///< fixed-point transform coefficients
-    DECLARE_ALIGNED(32, float, transform_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS];   ///< transform coefficients
-    DECLARE_ALIGNED(32, float, delay)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE];             ///< delay - added to the next block
-    DECLARE_ALIGNED(32, float, window)[AC3_BLOCK_SIZE];                              ///< window coefficients
-    DECLARE_ALIGNED(32, float, tmp_output)[AC3_BLOCK_SIZE];                          ///< temporary storage for output before windowing
-    DECLARE_ALIGNED(32, float, output)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE];            ///< output after imdct transform and windowing
+    DECLARE_ALIGNED(16, int,   fixed_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS];       ///< fixed-point transform coefficients
+    DECLARE_ALIGNED(32, INTFLOAT, transform_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS];   ///< transform coefficients
+    DECLARE_ALIGNED(32, INTFLOAT, delay)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE];             ///< delay - added to the next block
+    DECLARE_ALIGNED(32, INTFLOAT, window)[AC3_BLOCK_SIZE];                              ///< window coefficients
+    DECLARE_ALIGNED(32, INTFLOAT, tmp_output)[AC3_BLOCK_SIZE];                          ///< temporary storage for output before windowing
+    DECLARE_ALIGNED(32, SHORTFLOAT, output)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE];            ///< output after imdct transform and windowing
     DECLARE_ALIGNED(32, uint8_t, input_buffer)[AC3_FRAME_BUFFER_SIZE + AV_INPUT_BUFFER_PADDING_SIZE]; ///< temp buffer to prevent overread
 ///@}
 } AC3DecodeContext;
@@ -227,19 +245,19 @@ typedef struct AC3DecodeContext {
  * Parse the E-AC-3 frame header.
  * This parses both the bit stream info and audio frame header.
  */
-int ff_eac3_parse_header(AC3DecodeContext *s);
+static int ff_eac3_parse_header(AC3DecodeContext *s);
 
 /**
  * Decode mantissas in a single channel for the entire frame.
  * This is used when AHT mode is enabled.
  */
-void ff_eac3_decode_transform_coeffs_aht_ch(AC3DecodeContext *s, int ch);
+static void ff_eac3_decode_transform_coeffs_aht_ch(AC3DecodeContext *s, int ch);
 
 /**
  * Apply spectral extension to each channel by copying lower frequency
  * coefficients to higher frequency bins and applying side information to
  * approximate the original high frequency signal.
  */
-void ff_eac3_apply_spectral_extension(AC3DecodeContext *s);
+static void ff_eac3_apply_spectral_extension(AC3DecodeContext *s);
 
 #endif /* AVCODEC_AC3DEC_H */
diff --git a/libavcodec/ac3dec_data.c b/libavcodec/ac3dec_data.c
index 272a963..d0a9b1e 100644
--- a/libavcodec/ac3dec_data.c
+++ b/libavcodec/ac3dec_data.c
@@ -2,20 +2,20 @@
  * AC-3 and E-AC-3 decoder tables
  * Copyright (c) 2007 Bartlomiej Wolowiec <bartek.wolowiec@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ac3dec_data.h b/libavcodec/ac3dec_data.h
index c0a584e..975b52e 100644
--- a/libavcodec/ac3dec_data.h
+++ b/libavcodec/ac3dec_data.h
@@ -2,20 +2,20 @@
  * AC-3 and E-AC-3 decoder tables
  * Copyright (c) 2007 Bartlomiej Wolowiec <bartek.wolowiec@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ac3dec_fixed.c b/libavcodec/ac3dec_fixed.c
new file mode 100644
index 0000000..6416da4
--- /dev/null
+++ b/libavcodec/ac3dec_fixed.c
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Stanislav Ocovaj (socovaj@mips.com)
+ *
+ * AC3 fixed-point decoder for MIPS platforms
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define FFT_FLOAT 0
+#define USE_FIXED 1
+#define FFT_FIXED_32 1
+#include "ac3dec.h"
+/* NOTE(review): not referenced anywhere in this file's own text; presumably
+ * consumed by the eac3dec.c / ac3dec.c sources included further down - confirm. */
+static const int end_freq_inv_tab[8] =
+{
+    50529027, 44278013, 39403370, 32292987, 27356480, 23729101, 20951060, 18755316
+};
+
+/**
+ * Decode the shift amount carried by a packed dynamic-range gain word.
+ *
+ * Bits 8..5 of dynrng form a 4-bit two's-complement exponent; the value
+ * returned is 4 minus that exponent, i.e. somewhere in -3..12.
+ *
+ * The field is isolated with a mask and sign-extended by hand.  The earlier
+ * "(dynrng << 23) >> 28" idiom shifted into (or past) the sign bit of a
+ * signed int, which is undefined behaviour in C.
+ */
+static int scale_coefs_shift(int dynrng)
+{
+    int exponent = ((unsigned)dynrng >> 5) & 0xf;
+
+    if (exponent & 0x8)         /* bit 3 is the sign of the 4-bit field */
+        exponent -= 16;
+    return 4 - exponent;
+}
+
+/**
+ * Scale fixed-point coefficients by a packed dynamic-range gain word.
+ *
+ * Every coefficient is multiplied by (dynrng & 0x1f) + 32.  When the shift
+ * amount decoded from bits 8..5 is positive the product is rounded half up
+ * and shifted right by it; otherwise the product is shifted left by its
+ * magnitude.  For example dynrng == 0 multiplies by 32 and shifts right by
+ * 4, which doubles every coefficient.
+ *
+ * Products are formed in unsigned arithmetic so that overflow wraps and no
+ * negative value is ever left-shifted; both are undefined for signed
+ * operands.  The right shift is applied to the value converted back to
+ * int32_t and relies on the compiler shifting negative values
+ * arithmetically, exactly as the previous code did.
+ *
+ * Coefficients are handled in groups of eight: len is effectively rounded up
+ * to a multiple of 8, and both buffers must be at least that long.
+ *
+ * @param dst    destination coefficients
+ * @param src    source coefficients
+ * @param dynrng packed gain word; only bits 8..0 are used
+ * @param len    number of coefficients to scale
+ */
+static void scale_coefs (
+    int32_t *dst,
+    const int32_t *src,
+    int dynrng,
+    int len)
+{
+    const uint32_t mul = (dynrng & 0x1f) + 0x20;
+    int shift = scale_coefs_shift(dynrng);
+    int i, j;
+
+    if (shift > 0) {
+        const uint32_t half = 1u << (shift - 1);
+
+        /* rounding right shift: add half of the divisor, then shift */
+        for (i = 0; i < len; i += 8) {
+            for (j = 0; j < 8; j++) {
+                const uint32_t prod = src[i + j] * mul + half;
+
+                dst[i + j] = (int32_t)prod >> shift;
+            }
+        }
+    } else {
+        shift = -shift;
+
+        /* shift is 0..3 here, so this is a plain (wrapping) left shift */
+        for (i = 0; i < len; i += 8) {
+            for (j = 0; j < 8; j++) {
+                const uint32_t prod = src[i + j] * mul;
+
+                dst[i + j] = (int32_t)(prod << shift);
+            }
+        }
+    }
+}
+
+/**
+ * In-place fixed-point downmix of per-channel 16-bit samples to stereo or
+ * mono.  Each output is the sum over in_ch of sample * matrix coefficient,
+ * rounded and shifted right by 12.  Any out_ch other than 1 or 2 is a no-op.
+ * The 32-bit sample variant used by the DSP context lives in ac3dsp.c.
+ */
+static void ac3_downmix_c_fixed16(int16_t **samples, int16_t (*matrix)[2],
+                                  int out_ch, int in_ch, int len)
+{
+    const int stereo = out_ch == 2;
+    int pos, ch;
+
+    if (!stereo && out_ch != 1)
+        return;
+
+    for (pos = 0; pos < len; pos++) {
+        int left = 0, right = 0;
+
+        for (ch = 0; ch < in_ch; ch++) {
+            left += samples[ch][pos] * matrix[ch][0];
+            if (stereo)
+                right += samples[ch][pos] * matrix[ch][1];
+        }
+        samples[0][pos] = (left + 2048) >> 12;
+        if (stereo)
+            samples[1][pos] = (right + 2048) >> 12;
+    }
+}
+
+#include "eac3dec.c"
+#include "ac3dec.c"
+/* NOTE(review): drc_scale is declared INTFLOAT in ac3dec.h but exposed as AV_OPT_TYPE_FLOAT; confirm for USE_FIXED. */
+static const AVOption options[] = {
+    { "drc_scale", "percentage of dynamic range compression to apply", OFFSET(drc_scale), AV_OPT_TYPE_FLOAT, {.dbl = 1.0}, 0.0, 6.0, PAR },
+    { "heavy_compr", "enable heavy dynamic range compression", OFFSET(heavy_compression), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, PAR },
+    { NULL},
+};
+/* AVClass publishing the options table above under the name "Fixed-Point AC-3 Decoder". */
+static const AVClass ac3_decoder_class = {
+    .class_name = "Fixed-Point AC-3 Decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+/* "ac3_fixed" codec entry: AV_CODEC_ID_AC3 audio decoder advertising AV_SAMPLE_FMT_S16P output. */
+AVCodec ff_ac3_fixed_decoder = {
+    .name           = "ac3_fixed",
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_AC3,
+    .priv_data_size = sizeof (AC3DecodeContext),
+    .init           = ac3_decode_init,
+    .close          = ac3_decode_end,
+    .decode         = ac3_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .long_name      = NULL_IF_CONFIG_SMALL("ATSC A/52A (AC-3)"),
+    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S16P,
+                                                      AV_SAMPLE_FMT_NONE },
+    .priv_class     = &ac3_decoder_class,
+};
diff --git a/libavcodec/ac3dec_float.c b/libavcodec/ac3dec_float.c
new file mode 100644
index 0000000..0a5319a
--- /dev/null
+++ b/libavcodec/ac3dec_float.c
@@ -0,0 +1,92 @@
+/*
+ * AC-3 Audio Decoder
+ * This code was developed as part of Google Summer of Code 2006.
+ * E-AC-3 support was added as part of Google Summer of Code 2007.
+ *
+ * Copyright (c) 2006 Kartikey Mahendra BHATT (bhattkm at gmail dot com)
+ * Copyright (c) 2007-2008 Bartlomiej Wolowiec <bartek.wolowiec@gmail.com>
+ * Copyright (c) 2007 Justin Ruggles <justin.ruggles@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * Floating-point build of the decoder: pulls in the shared ac3dec.c/eac3dec.c.
+ */
+#include "ac3dec.h"
+#include "eac3dec.c"
+#include "ac3dec.c"
+/* Option table referenced by both AVClass definitions in this file. */
+static const AVOption options[] = {
+    { "drc_scale", "percentage of dynamic range compression to apply", OFFSET(drc_scale), AV_OPT_TYPE_FLOAT, {.dbl = 1.0}, 0.0, 6.0, PAR },
+    { "heavy_compr", "enable heavy dynamic range compression", OFFSET(heavy_compression), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, PAR },
+    { "target_level", "target level in -dBFS (0 not applied)", OFFSET(target_level), AV_OPT_TYPE_INT, {.i64 = 0 }, -31, 0, PAR },
+    /* Downmix overrides: each defaults to -1 / -1.0 and, unlike the entries above, has flags 0. */
+{"dmix_mode", "Preferred Stereo Downmix Mode", OFFSET(preferred_stereo_downmix), AV_OPT_TYPE_INT, {.i64 = -1 }, -1, 2, 0, "dmix_mode"},
+{"ltrt_cmixlev",   "Lt/Rt Center Mix Level",   OFFSET(ltrt_center_mix_level),    AV_OPT_TYPE_FLOAT, {.dbl = -1.0 }, -1.0, 2.0, 0},
+{"ltrt_surmixlev", "Lt/Rt Surround Mix Level", OFFSET(ltrt_surround_mix_level),  AV_OPT_TYPE_FLOAT, {.dbl = -1.0 }, -1.0, 2.0, 0},
+{"loro_cmixlev",   "Lo/Ro Center Mix Level",   OFFSET(loro_center_mix_level),    AV_OPT_TYPE_FLOAT, {.dbl = -1.0 }, -1.0, 2.0, 0},
+{"loro_surmixlev", "Lo/Ro Surround Mix Level", OFFSET(loro_surround_mix_level),  AV_OPT_TYPE_FLOAT, {.dbl = -1.0 }, -1.0, 2.0, 0},
+
+    { NULL},
+};
+/* AVClass publishing the options table above under the name "AC3 decoder". */
+static const AVClass ac3_decoder_class = {
+    .class_name = "AC3 decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+/* "ac3" codec entry: AV_CODEC_ID_AC3 audio decoder advertising AV_SAMPLE_FMT_FLTP output. */
+AVCodec ff_ac3_decoder = {
+    .name           = "ac3",
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_AC3,
+    .priv_data_size = sizeof (AC3DecodeContext),
+    .init           = ac3_decode_init,
+    .close          = ac3_decode_end,
+    .decode         = ac3_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .long_name      = NULL_IF_CONFIG_SMALL("ATSC A/52A (AC-3)"),
+    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
+                                                      AV_SAMPLE_FMT_NONE },
+    .priv_class     = &ac3_decoder_class,
+};
+/* E-AC-3 class and codec entry; compiled only when CONFIG_EAC3_DECODER is non-zero. */
+#if CONFIG_EAC3_DECODER
+static const AVClass eac3_decoder_class = {
+    .class_name = "E-AC3 decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+/* "eac3" codec entry: same callbacks and sample format as the "ac3" entry, with AV_CODEC_ID_EAC3. */
+AVCodec ff_eac3_decoder = {
+    .name           = "eac3",
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_EAC3,
+    .priv_data_size = sizeof (AC3DecodeContext),
+    .init           = ac3_decode_init,
+    .close          = ac3_decode_end,
+    .decode         = ac3_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .long_name      = NULL_IF_CONFIG_SMALL("ATSC A/52B (AC-3, E-AC-3)"),
+    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
+                                                      AV_SAMPLE_FMT_NONE },
+    .priv_class     = &eac3_decoder_class,
+};
+#endif
diff --git a/libavcodec/ac3dsp.c b/libavcodec/ac3dsp.c
index 38c35b1..9902f90 100644
--- a/libavcodec/ac3dsp.c
+++ b/libavcodec/ac3dsp.c
@@ -2,20 +2,20 @@
  * AC-3 DSP functions
  * Copyright (c) 2011 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -171,6 +171,48 @@ static void ac3_extract_exponents_c(uint8_t *exp, int32_t *coef, int nb_coefs)
     }
 }
 
+/**
+ * Zero sum[0..3], then for each of the len entries hand MAC64() the squares
+ * of coef0, coef1, their sum and their difference, in that slot order
+ * (MAC64 is defined elsewhere; presumably a 64-bit multiply-accumulate).
+ */
+static void ac3_sum_square_butterfly_int32_c(int64_t sum[4],
+                                             const int32_t *coef0,
+                                             const int32_t *coef1,
+                                             int len)
+{
+    sum[0] = sum[1] = sum[2] = sum[3] = 0;
+    while (len-- > 0) {
+        int left = *coef0++, right = *coef1++;
+        int mid = left + right, side = left - right;
+        MAC64(sum[0], left, left);
+        MAC64(sum[1], right, right);
+        MAC64(sum[2], mid, mid);
+        MAC64(sum[3], side, side);
+    }
+}
+
+/**
+ * Zero sum[0..3], then accumulate over len entries the squares of coef0
+ * (sum[0]), coef1 (sum[1]), coef0 + coef1 (sum[2]) and coef0 - coef1
+ * (sum[3]).  A len of zero or less leaves all four sums at zero.
+ */
+static void ac3_sum_square_butterfly_float_c(float sum[4],
+                                             const float *coef0,
+                                             const float *coef1,
+                                             int len)
+{
+    sum[0] = sum[1] = sum[2] = sum[3] = 0;
+    while (len-- > 0) {
+        float left = *coef0++, right = *coef1++;
+        float mid = left + right, side = left - right;
+        sum[0] += left * left;
+        sum[1] += right * right;
+        sum[2] += mid * mid;
+        sum[3] += side * side;
+    }
+}
+
 static void ac3_downmix_c(float **samples, float (*matrix)[2],
                           int out_ch, int in_ch, int len)
 {
@@ -196,6 +238,31 @@ static void ac3_downmix_c(float **samples, float (*matrix)[2],
     }
 }
 
+/**
+ * In-place downmix of per-channel int32 samples to stereo or mono; sums are
+ * kept in 64 bits, rounded, and shifted right by 12.  Other out_ch: no-op.
+ */
+static void ac3_downmix_c_fixed(int32_t **samples, int16_t (*matrix)[2],
+                                int out_ch, int in_ch, int len)
+{
+    const int stereo = out_ch == 2;
+    int pos, ch;
+
+    if (!stereo && out_ch != 1)
+        return;
+    for (pos = 0; pos < len; pos++) {
+        int64_t left = 0, right = 0;
+        for (ch = 0; ch < in_ch; ch++) {
+            left += (int64_t)samples[ch][pos] * matrix[ch][0];
+            if (stereo)
+                right += (int64_t)samples[ch][pos] * matrix[ch][1];
+        }
+        samples[0][pos] = (left + 2048) >> 12;
+        if (stereo)
+            samples[1][pos] = (right + 2048) >> 12;
+    }
+}
+
 static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                  const int16_t *window, unsigned int len)
 {
@@ -220,11 +287,16 @@ av_cold void ff_ac3dsp_init(AC3DSPContext *c, int bit_exact)
     c->update_bap_counts = ac3_update_bap_counts_c;
     c->compute_mantissa_size = ac3_compute_mantissa_size_c;
     c->extract_exponents = ac3_extract_exponents_c;
+    c->sum_square_butterfly_int32 = ac3_sum_square_butterfly_int32_c;
+    c->sum_square_butterfly_float = ac3_sum_square_butterfly_float_c;
     c->downmix = ac3_downmix_c;
+    c->downmix_fixed = ac3_downmix_c_fixed;
     c->apply_window_int16 = apply_window_int16_c;
 
     if (ARCH_ARM)
         ff_ac3dsp_init_arm(c, bit_exact);
     if (ARCH_X86)
         ff_ac3dsp_init_x86(c, bit_exact);
+    if (ARCH_MIPS)
+        ff_ac3dsp_init_mips(c, bit_exact);
 }
diff --git a/libavcodec/ac3dsp.h b/libavcodec/ac3dsp.h
index 6ca0c5b..ed98c8c 100644
--- a/libavcodec/ac3dsp.h
+++ b/libavcodec/ac3dsp.h
@@ -2,20 +2,20 @@
  * AC-3 DSP functions
  * Copyright (c) 2011 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -126,9 +126,18 @@ typedef struct AC3DSPContext {
 
     void (*extract_exponents)(uint8_t *exp, int32_t *coef, int nb_coefs);
 
+    void (*sum_square_butterfly_int32)(int64_t sum[4], const int32_t *coef0,
+                                       const int32_t *coef1, int len);
+
+    void (*sum_square_butterfly_float)(float sum[4], const float *coef0,
+                                       const float *coef1, int len);
+
     void (*downmix)(float **samples, float (*matrix)[2], int out_ch,
                     int in_ch, int len);
 
+    void (*downmix_fixed)(int32_t **samples, int16_t (*matrix)[2], int out_ch,
+                          int in_ch, int len);
+
     /**
      * Apply symmetric window in 16-bit fixed-point.
      * @param output destination array
@@ -147,5 +156,6 @@ typedef struct AC3DSPContext {
 void ff_ac3dsp_init    (AC3DSPContext *c, int bit_exact);
 void ff_ac3dsp_init_arm(AC3DSPContext *c, int bit_exact);
 void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact);
+void ff_ac3dsp_init_mips(AC3DSPContext *c, int bit_exact);
 
 #endif /* AVCODEC_AC3DSP_H */
diff --git a/libavcodec/ac3enc.c b/libavcodec/ac3enc.c
index da141e1..636ca72 100644
--- a/libavcodec/ac3enc.c
+++ b/libavcodec/ac3enc.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2010 Justin Ruggles <justin.ruggles@gmail.com>
  * Copyright (c) 2006-2010 Prakash Punnoor <prakash@punnoor.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,13 +36,13 @@
 #include "libavutil/internal.h"
 #include "libavutil/opt.h"
 #include "avcodec.h"
+#include "internal.h"
 #include "me_cmp.h"
 #include "put_bits.h"
 #include "audiodsp.h"
 #include "ac3dsp.h"
 #include "ac3.h"
 #include "fft.h"
-#include "internal.h"
 #include "ac3enc.h"
 #include "eac3enc.h"
 
@@ -274,7 +274,7 @@ void ff_ac3_apply_rematrixing(AC3EncodeContext *s)
     int nb_coefs;
     int blk, bnd, i;
     int start, end;
-    uint8_t *flags;
+    uint8_t *flags = NULL;
 
     if (!s->rematrixing_enabled)
         return;
@@ -1183,7 +1183,7 @@ static inline int asym_quant(int c, int e, int qbits)
 {
     int m;
 
-    c = (((c << e) >> (24 - qbits)) + 1) >> 1;
+    c = (((c * (1<<e)) >> (24 - qbits)) + 1) >> 1;
     m = (1 << (qbits-1));
     if (c >= m)
         c = m - 1;
@@ -1211,14 +1211,11 @@ static void quantize_mantissas_blk_ch(AC3Mant *s, int32_t *fixed_coef,
     int i;
 
     for (i = start_freq; i < end_freq; i++) {
-        int v;
         int c = fixed_coef[i];
         int e = exp[i];
-        int b = bap[i];
-        switch (b) {
-        case 0:
-            v = 0;
-            break;
+        int v = bap[i];
+        if (v)
+        switch (v) {
         case 1:
             v = sym_quant(c, e, 3);
             switch (s->mant1_cnt) {
@@ -1287,7 +1284,7 @@ static void quantize_mantissas_blk_ch(AC3Mant *s, int32_t *fixed_coef,
             v = asym_quant(c, e, 16);
             break;
         default:
-            v = asym_quant(c, e, b - 1);
+            v = asym_quant(c, e, v - 1);
             break;
         }
         qmant[i] = v;
@@ -1387,7 +1384,7 @@ static void ac3_output_frame_header(AC3EncodeContext *s)
  */
 static void output_audio_block(AC3EncodeContext *s, int blk)
 {
-    int ch, i, baie, bnd, got_cpl, ch0;
+    int ch, i, baie, bnd, got_cpl, av_uninit(ch0);
     AC3Block *block = &s->blocks[blk];
 
     /* block switching */
@@ -2023,6 +2020,7 @@ av_cold int ff_ac3_encode_close(AVCodecContext *avctx)
     AC3EncodeContext *s = avctx->priv_data;
 
     av_freep(&s->windowed_samples);
+    if (s->planar_samples)
     for (ch = 0; ch < s->channels; ch++)
         av_freep(&s->planar_samples[ch]);
     av_freep(&s->planar_samples);
@@ -2038,6 +2036,7 @@ av_cold int ff_ac3_encode_close(AVCodecContext *avctx)
     av_freep(&s->qmant_buffer);
     av_freep(&s->cpl_coord_exp_buffer);
     av_freep(&s->cpl_coord_mant_buffer);
+    av_freep(&s->fdsp);
     for (blk = 0; blk < s->num_blocks; blk++) {
         AC3Block *block = &s->blocks[blk];
         av_freep(&block->mdct_coef);
@@ -2154,8 +2153,9 @@ static av_cold int validate_options(AC3EncodeContext *s)
 
     /* validate bit rate */
     if (s->eac3) {
-        int max_br, min_br, wpf, min_br_dist, min_br_code;
+        int max_br, min_br, wpf, min_br_code;
         int num_blks_code, num_blocks, frame_samples;
+        long long min_br_dist;
 
         /* calculate min/max bitrate */
         /* TODO: More testing with 3 and 2 blocks. All E-AC-3 samples I've
@@ -2185,9 +2185,9 @@ static av_cold int validate_options(AC3EncodeContext *s)
            this is needed for lookup tables for bandwidth and coupling
            parameter selection */
         min_br_code = -1;
-        min_br_dist = INT_MAX;
+        min_br_dist = INT64_MAX;
         for (i = 0; i < 19; i++) {
-            int br_dist = abs(ff_ac3_bitrate_tab[i] * 1000 - avctx->bit_rate);
+            long long br_dist = llabs(ff_ac3_bitrate_tab[i] * 1000 - avctx->bit_rate);
             if (br_dist < min_br_dist) {
                 min_br_dist = br_dist;
                 min_br_code = i;
@@ -2200,10 +2200,11 @@ static av_cold int validate_options(AC3EncodeContext *s)
             wpf--;
         s->frame_size_min = 2 * wpf;
     } else {
-        int best_br = 0, best_code = 0, best_diff = INT_MAX;
+        int best_br = 0, best_code = 0;
+        long long best_diff = INT64_MAX;
         for (i = 0; i < 19; i++) {
             int br   = (ff_ac3_bitrate_tab[i] >> s->bit_alloc.sr_shift) * 1000;
-            int diff = abs(br - avctx->bit_rate);
+            long long diff = llabs(br - avctx->bit_rate);
             if (diff < best_diff) {
                 best_br   = br;
                 best_code = i;
@@ -2251,7 +2252,7 @@ static av_cold int validate_options(AC3EncodeContext *s)
  */
 static av_cold void set_bandwidth(AC3EncodeContext *s)
 {
-    int blk, ch, cpl_start;
+    int blk, ch, av_uninit(cpl_start);
 
     if (s->cutoff) {
         /* calculate bandwidth based on user-specified cutoff frequency */
@@ -2330,50 +2331,50 @@ static av_cold int allocate_buffers(AC3EncodeContext *s)
     if (s->allocate_sample_buffers(s))
         goto alloc_fail;
 
-    FF_ALLOC_OR_GOTO(avctx, s->bap_buffer, total_coefs *
+    FF_ALLOC_ARRAY_OR_GOTO(avctx, s->bap_buffer, total_coefs,
                      sizeof(*s->bap_buffer), alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->bap1_buffer, total_coefs *
+    FF_ALLOC_ARRAY_OR_GOTO(avctx, s->bap1_buffer, total_coefs,
                      sizeof(*s->bap1_buffer), alloc_fail);
-    FF_ALLOCZ_OR_GOTO(avctx, s->mdct_coef_buffer, total_coefs *
+    FF_ALLOCZ_ARRAY_OR_GOTO(avctx, s->mdct_coef_buffer, total_coefs,
                       sizeof(*s->mdct_coef_buffer), alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->exp_buffer, total_coefs *
+    FF_ALLOC_ARRAY_OR_GOTO(avctx, s->exp_buffer, total_coefs,
                      sizeof(*s->exp_buffer), alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->grouped_exp_buffer, channel_blocks * 128 *
+    FF_ALLOC_ARRAY_OR_GOTO(avctx, s->grouped_exp_buffer, channel_blocks, 128 *
                      sizeof(*s->grouped_exp_buffer), alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->psd_buffer, total_coefs *
+    FF_ALLOC_ARRAY_OR_GOTO(avctx, s->psd_buffer, total_coefs,
                      sizeof(*s->psd_buffer), alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->band_psd_buffer, channel_blocks * 64 *
+    FF_ALLOC_ARRAY_OR_GOTO(avctx, s->band_psd_buffer, channel_blocks, 64 *
                      sizeof(*s->band_psd_buffer), alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->mask_buffer, channel_blocks * 64 *
+    FF_ALLOC_ARRAY_OR_GOTO(avctx, s->mask_buffer, channel_blocks, 64 *
                      sizeof(*s->mask_buffer), alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->qmant_buffer, total_coefs *
+    FF_ALLOC_ARRAY_OR_GOTO(avctx, s->qmant_buffer, total_coefs,
                      sizeof(*s->qmant_buffer), alloc_fail);
     if (s->cpl_enabled) {
-        FF_ALLOC_OR_GOTO(avctx, s->cpl_coord_exp_buffer, channel_blocks * 16 *
+        FF_ALLOC_ARRAY_OR_GOTO(avctx, s->cpl_coord_exp_buffer, channel_blocks, 16 *
                          sizeof(*s->cpl_coord_exp_buffer), alloc_fail);
-        FF_ALLOC_OR_GOTO(avctx, s->cpl_coord_mant_buffer, channel_blocks * 16 *
+        FF_ALLOC_ARRAY_OR_GOTO(avctx, s->cpl_coord_mant_buffer, channel_blocks, 16 *
                          sizeof(*s->cpl_coord_mant_buffer), alloc_fail);
     }
     for (blk = 0; blk < s->num_blocks; blk++) {
         AC3Block *block = &s->blocks[blk];
-        FF_ALLOCZ_OR_GOTO(avctx, block->mdct_coef, channels * sizeof(*block->mdct_coef),
+        FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->mdct_coef, channels, sizeof(*block->mdct_coef),
                           alloc_fail);
-        FF_ALLOCZ_OR_GOTO(avctx, block->exp, channels * sizeof(*block->exp),
+        FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->exp, channels, sizeof(*block->exp),
                           alloc_fail);
-        FF_ALLOCZ_OR_GOTO(avctx, block->grouped_exp, channels * sizeof(*block->grouped_exp),
+        FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->grouped_exp, channels, sizeof(*block->grouped_exp),
                           alloc_fail);
-        FF_ALLOCZ_OR_GOTO(avctx, block->psd, channels * sizeof(*block->psd),
+        FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->psd, channels, sizeof(*block->psd),
                           alloc_fail);
-        FF_ALLOCZ_OR_GOTO(avctx, block->band_psd, channels * sizeof(*block->band_psd),
+        FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->band_psd, channels, sizeof(*block->band_psd),
                           alloc_fail);
-        FF_ALLOCZ_OR_GOTO(avctx, block->mask, channels * sizeof(*block->mask),
+        FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->mask, channels, sizeof(*block->mask),
                           alloc_fail);
-        FF_ALLOCZ_OR_GOTO(avctx, block->qmant, channels * sizeof(*block->qmant),
+        FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->qmant, channels, sizeof(*block->qmant),
                           alloc_fail);
         if (s->cpl_enabled) {
-            FF_ALLOCZ_OR_GOTO(avctx, block->cpl_coord_exp, channels * sizeof(*block->cpl_coord_exp),
+            FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->cpl_coord_exp, channels, sizeof(*block->cpl_coord_exp),
                               alloc_fail);
-            FF_ALLOCZ_OR_GOTO(avctx, block->cpl_coord_mant, channels * sizeof(*block->cpl_coord_mant),
+            FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->cpl_coord_mant, channels, sizeof(*block->cpl_coord_mant),
                               alloc_fail);
         }
 
@@ -2396,11 +2397,11 @@ static av_cold int allocate_buffers(AC3EncodeContext *s)
     }
 
     if (!s->fixed_point) {
-        FF_ALLOCZ_OR_GOTO(avctx, s->fixed_coef_buffer, total_coefs *
+        FF_ALLOCZ_ARRAY_OR_GOTO(avctx, s->fixed_coef_buffer, total_coefs,
                           sizeof(*s->fixed_coef_buffer), alloc_fail);
         for (blk = 0; blk < s->num_blocks; blk++) {
             AC3Block *block = &s->blocks[blk];
-            FF_ALLOCZ_OR_GOTO(avctx, block->fixed_coef, channels *
+            FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->fixed_coef, channels,
                               sizeof(*block->fixed_coef), alloc_fail);
             for (ch = 0; ch < channels; ch++)
                 block->fixed_coef[ch] = &s->fixed_coef_buffer[AC3_MAX_COEFS * (s->num_blocks * ch + blk)];
@@ -2408,7 +2409,7 @@ static av_cold int allocate_buffers(AC3EncodeContext *s)
     } else {
         for (blk = 0; blk < s->num_blocks; blk++) {
             AC3Block *block = &s->blocks[blk];
-            FF_ALLOCZ_OR_GOTO(avctx, block->fixed_coef, channels *
+            FF_ALLOCZ_ARRAY_OR_GOTO(avctx, block->fixed_coef, channels,
                               sizeof(*block->fixed_coef), alloc_fail);
             for (ch = 0; ch < channels; ch++)
                 block->fixed_coef[ch] = (int32_t *)block->mdct_coef[ch];
@@ -2430,8 +2431,6 @@ av_cold int ff_ac3_encode_init(AVCodecContext *avctx)
 
     s->eac3 = avctx->codec_id == AV_CODEC_ID_EAC3;
 
-    ff_ac3_common_init();
-
     ret = validate_options(s);
     if (ret)
         return ret;
diff --git a/libavcodec/ac3enc.h b/libavcodec/ac3enc.h
index 76b6d7f..a2442d0 100644
--- a/libavcodec/ac3enc.h
+++ b/libavcodec/ac3enc.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2000 Fabrice Bellard
  * Copyright (c) 2006-2010 Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -165,7 +165,7 @@ typedef struct AC3EncodeContext {
     AVCodecContext *avctx;                  ///< parent AVCodecContext
     PutBitContext pb;                       ///< bitstream writer context
     AudioDSPContext adsp;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     MECmpContext mecc;
     AC3DSPContext ac3dsp;                   ///< AC-3 optimized functions
     FFTContext mdct;                        ///< FFT context for MDCT calculation
diff --git a/libavcodec/ac3enc_fixed.c b/libavcodec/ac3enc_fixed.c
index 2bb82ef..9d39026 100644
--- a/libavcodec/ac3enc_fixed.c
+++ b/libavcodec/ac3enc_fixed.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2010 Justin Ruggles <justin.ruggles@gmail.com>
  * Copyright (c) 2006-2010 Prakash Punnoor <prakash@punnoor.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,8 +35,13 @@
 
 #define AC3ENC_TYPE AC3ENC_TYPE_AC3_FIXED
 #include "ac3enc_opts_template.c"
-static const AVClass ac3enc_class = { "Fixed-Point AC-3 Encoder", av_default_item_name,
-                                      ac3_options, LIBAVUTIL_VERSION_INT };
+
+static const AVClass ac3enc_class = {
+    .class_name = "Fixed-Point AC-3 Encoder",
+    .item_name  = av_default_item_name,
+    .option     = ac3_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
 
 #include "ac3enc_template.c"
 
@@ -97,6 +102,12 @@ static void scale_coefficients(AC3EncodeContext *s)
     }
 }
 
+static void sum_square_butterfly(AC3EncodeContext *s, int64_t sum[4],
+                                 const int32_t *coef0, const int32_t *coef1,
+                                 int len)
+{
+    s->ac3dsp.sum_square_butterfly_int32(sum, coef0, coef1, len);
+}
 
 /*
  * Clip MDCT coefficients to allowable range.
diff --git a/libavcodec/ac3enc_float.c b/libavcodec/ac3enc_float.c
index 822f431..6c91f45 100644
--- a/libavcodec/ac3enc_float.c
+++ b/libavcodec/ac3enc_float.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2010 Justin Ruggles <justin.ruggles@gmail.com>
  * Copyright (c) 2006-2010 Prakash Punnoor <prakash@punnoor.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,8 +36,12 @@
 
 #define AC3ENC_TYPE AC3ENC_TYPE_AC3
 #include "ac3enc_opts_template.c"
-static const AVClass ac3enc_class = { "AC-3 Encoder", av_default_item_name,
-                                      ac3_options, LIBAVUTIL_VERSION_INT };
+static const AVClass ac3enc_class = {
+    .class_name = "AC-3 Encoder",
+    .item_name  = av_default_item_name,
+    .option     = ac3_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
 
 #include "ac3enc_template.c"
 
@@ -68,7 +72,7 @@ av_cold int ff_ac3_float_mdct_init(AC3EncodeContext *s)
     n  = 1 << 9;
     n2 = n >> 1;
 
-    window = av_malloc(n * sizeof(*window));
+    window = av_malloc_array(n, sizeof(*window));
     if (!window) {
         av_log(s->avctx, AV_LOG_ERROR, "Cannot allocate memory.\n");
         return AVERROR(ENOMEM);
@@ -104,6 +108,12 @@ static void scale_coefficients(AC3EncodeContext *s)
                                chan_size * (s->channels + cpl));
 }
 
+static void sum_square_butterfly(AC3EncodeContext *s, float sum[4],
+                                 const float *coef0, const float *coef1,
+                                 int len)
+{
+    s->ac3dsp.sum_square_butterfly_float(sum, coef0, coef1, len);
+}
 
 /*
  * Clip MDCT coefficients to allowable range.
@@ -129,7 +139,9 @@ static CoefType calc_cpl_coord(CoefSumType energy_ch, CoefSumType energy_cpl)
 av_cold int ff_ac3_float_encode_init(AVCodecContext *avctx)
 {
     AC3EncodeContext *s = avctx->priv_data;
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->fdsp)
+        return AVERROR(ENOMEM);
     return ff_ac3_encode_init(avctx);
 }
 
diff --git a/libavcodec/ac3enc_opts_template.c b/libavcodec/ac3enc_opts_template.c
index a08c70d..57b65a7 100644
--- a/libavcodec/ac3enc_opts_template.c
+++ b/libavcodec/ac3enc_opts_template.c
@@ -2,20 +2,20 @@
  * AC-3 encoder options
  * Copyright (c) 2011 Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,7 +25,7 @@
 
 static const AVOption ac3_options[] = {
 /* Metadata Options */
-{"per_frame_metadata", "Allow Changing Metadata Per-Frame", OFFSET(allow_per_frame_metadata), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, 1, AC3ENC_PARAM},
+{"per_frame_metadata", "Allow Changing Metadata Per-Frame", OFFSET(allow_per_frame_metadata), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AC3ENC_PARAM},
 #if AC3ENC_TYPE != AC3ENC_TYPE_EAC3
 /* AC-3 downmix levels */
 {"center_mixlev", "Center Mix Level", OFFSET(center_mix_level), AV_OPT_TYPE_FLOAT, {.dbl = LEVEL_MINUS_4POINT5DB }, 0.0, 1.0, AC3ENC_PARAM},
@@ -68,7 +68,7 @@ static const AVOption ac3_options[] = {
     {"standard", "Standard (default)", 0, AV_OPT_TYPE_CONST, {.i64 = AC3ENC_OPT_ADCONV_STANDARD }, INT_MIN, INT_MAX, AC3ENC_PARAM, "ad_conv_type"},
     {"hdcd",     "HDCD",               0, AV_OPT_TYPE_CONST, {.i64 = AC3ENC_OPT_ADCONV_HDCD     }, INT_MIN, INT_MAX, AC3ENC_PARAM, "ad_conv_type"},
 /* Other Encoding Options */
-{"stereo_rematrixing", "Stereo Rematrixing", OFFSET(stereo_rematrixing), AV_OPT_TYPE_INT, {.i64 = AC3ENC_OPT_ON }, AC3ENC_OPT_OFF, AC3ENC_OPT_ON, AC3ENC_PARAM},
+{"stereo_rematrixing", "Stereo Rematrixing", OFFSET(stereo_rematrixing), AV_OPT_TYPE_BOOL, {.i64 = 1 }, 0, 1, AC3ENC_PARAM},
 {"channel_coupling",   "Channel Coupling",   OFFSET(channel_coupling),   AV_OPT_TYPE_INT, {.i64 = AC3ENC_OPT_AUTO }, AC3ENC_OPT_AUTO, AC3ENC_OPT_ON, AC3ENC_PARAM, "channel_coupling"},
     {"auto", "Selected by the Encoder", 0, AV_OPT_TYPE_CONST, {.i64 = AC3ENC_OPT_AUTO }, INT_MIN, INT_MAX, AC3ENC_PARAM, "channel_coupling"},
 {"cpl_start_band", "Coupling Start Band", OFFSET(cpl_start), AV_OPT_TYPE_INT, {.i64 = AC3ENC_OPT_AUTO }, AC3ENC_OPT_AUTO, 15, AC3ENC_PARAM, "cpl_start_band"},
diff --git a/libavcodec/ac3enc_template.c b/libavcodec/ac3enc_template.c
index 8febf85..9dec9ae 100644
--- a/libavcodec/ac3enc_template.c
+++ b/libavcodec/ac3enc_template.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2011 Justin Ruggles <justin.ruggles@gmail.com>
  * Copyright (c) 2006-2010 Prakash Punnoor <prakash@punnoor.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,6 +47,9 @@ static void clip_coefficients(AudioDSPContext *adsp, CoefType *coef,
 
 static CoefType calc_cpl_coord(CoefSumType energy_ch, CoefSumType energy_cpl);
 
+static void sum_square_butterfly(AC3EncodeContext *s, CoefSumType sum[4],
+                                 const CoefType *coef0, const CoefType *coef1,
+                                 int len);
 
 int AC3_NAME(allocate_sample_buffers)(AC3EncodeContext *s)
 {
@@ -54,7 +57,7 @@ int AC3_NAME(allocate_sample_buffers)(AC3EncodeContext *s)
 
     FF_ALLOC_OR_GOTO(s->avctx, s->windowed_samples, AC3_WINDOW_SIZE *
                      sizeof(*s->windowed_samples), alloc_fail);
-    FF_ALLOC_OR_GOTO(s->avctx, s->planar_samples, s->channels * sizeof(*s->planar_samples),
+    FF_ALLOC_ARRAY_OR_GOTO(s->avctx, s->planar_samples, s->channels, sizeof(*s->planar_samples),
                      alloc_fail);
     for (ch = 0; ch < s->channels; ch++) {
         FF_ALLOCZ_OR_GOTO(s->avctx, s->planar_samples[ch],
@@ -70,7 +73,7 @@ alloc_fail:
 
 /*
  * Copy input samples.
- * Channels are reordered from Libav's default order to AC-3 order.
+ * Channels are reordered from FFmpeg's default order to AC-3 order.
  */
 static void copy_input_samples(AC3EncodeContext *s, SampleType **samples)
 {
@@ -105,7 +108,7 @@ static void apply_mdct(AC3EncodeContext *s)
             const SampleType *input_samples = &s->planar_samples[ch][blk * AC3_BLOCK_SIZE];
 
 #if CONFIG_AC3ENC_FLOAT
-            s->fdsp.vector_fmul(s->windowed_samples, input_samples,
+            s->fdsp->vector_fmul(s->windowed_samples, input_samples,
                                 s->mdct_window, AC3_WINDOW_SIZE);
 #else
             s->ac3dsp.apply_window_int16(s->windowed_samples, input_samples,
@@ -133,7 +136,7 @@ static void apply_channel_coupling(AC3EncodeContext *s)
 #else
     int32_t (*fixed_cpl_coords)[AC3_MAX_CHANNELS][16] = cpl_coords;
 #endif
-    int blk, ch, bnd, i, j;
+    int av_uninit(blk), ch, bnd, i, j;
     CoefSumType energy[AC3_MAX_BLOCKS][AC3_MAX_CHANNELS][16] = {{{0}}};
     int cpl_start, num_cpl_coefs;
 
@@ -260,7 +263,7 @@ static void apply_channel_coupling(AC3EncodeContext *s)
                 energy_cpl = energy[blk][CPL_CH][bnd];
                 energy_ch = energy[blk][ch][bnd];
                 blk1 = blk+1;
-                while (!s->blocks[blk1].new_cpl_coords[ch] && blk1 < s->num_blocks) {
+                while (blk1 < s->num_blocks && !s->blocks[blk1].new_cpl_coords[ch]) {
                     if (s->blocks[blk1].cpl_in_use) {
                         energy_cpl += energy[blk1][CPL_CH][bnd];
                         energy_ch += energy[blk1][ch][bnd];
@@ -336,8 +339,8 @@ static void apply_channel_coupling(AC3EncodeContext *s)
 static void compute_rematrixing_strategy(AC3EncodeContext *s)
 {
     int nb_coefs;
-    int blk, bnd, i;
-    AC3Block *block, *block0;
+    int blk, bnd;
+    AC3Block *block, *block0 = NULL;
 
     if (s->channel_mode != AC3_CHMODE_STEREO)
         return;
@@ -361,20 +364,12 @@ static void compute_rematrixing_strategy(AC3EncodeContext *s)
         }
 
         for (bnd = 0; bnd < block->num_rematrixing_bands; bnd++) {
-            /* calculate calculate sum of squared coeffs for one band in one block */
+            /* calculate sum of squared coeffs for one band in one block */
             int start = ff_ac3_rematrix_band_tab[bnd];
             int end   = FFMIN(nb_coefs, ff_ac3_rematrix_band_tab[bnd+1]);
-            CoefSumType sum[4] = {0,};
-            for (i = start; i < end; i++) {
-                CoefType lt = block->mdct_coef[1][i];
-                CoefType rt = block->mdct_coef[2][i];
-                CoefType md = lt + rt;
-                CoefType sd = lt - rt;
-                MAC_COEF(sum[0], lt, lt);
-                MAC_COEF(sum[1], rt, rt);
-                MAC_COEF(sum[2], md, md);
-                MAC_COEF(sum[3], sd, sd);
-            }
+            CoefSumType sum[4];
+            sum_square_butterfly(s, sum, block->mdct_coef[1] + start,
+                                 block->mdct_coef[2] + start, end - start);
 
             /* compare sums to determine if rematrixing will be used for this band */
             if (FFMIN(sum[2], sum[3]) < FFMIN(sum[0], sum[1]))
@@ -443,10 +438,8 @@ int AC3_NAME(encode_frame)(AVCodecContext *avctx, AVPacket *avpkt,
 
     ff_ac3_quantize_mantissas(s);
 
-    if ((ret = ff_alloc_packet(avpkt, s->frame_size))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, s->frame_size, 0)) < 0)
         return ret;
-    }
     ff_ac3_output_frame(s, avpkt->data);
 
     if (frame->pts != AV_NOPTS_VALUE)
diff --git a/libavcodec/ac3tab.c b/libavcodec/ac3tab.c
index 3cd07f9..d62d8bf 100644
--- a/libavcodec/ac3tab.c
+++ b/libavcodec/ac3tab.c
@@ -2,20 +2,20 @@
  * AC-3 tables
  * copyright (c) 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -116,7 +116,7 @@ const uint8_t ff_ac3_enc_channel_map[8][2][6] = {
 };
 
 /**
- * Table to remap channels from from AC-3 order to SMPTE order.
+ * Table to remap channels from AC-3 order to SMPTE order.
  * [channel_mode][lfe][ch]
  */
 const uint8_t ff_ac3_dec_channel_map[8][2][6] = {
diff --git a/libavcodec/ac3tab.h b/libavcodec/ac3tab.h
index 83edec5..f529fc8 100644
--- a/libavcodec/ac3tab.h
+++ b/libavcodec/ac3tab.h
@@ -2,20 +2,20 @@
  * AC-3 tables
  * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,12 +27,6 @@
 #include "libavutil/internal.h"
 #include "ac3.h"
 
-#if CONFIG_HARDCODED_TABLES
-#   define HCONST const
-#else
-#   define HCONST
-#endif
-
 extern const uint16_t ff_ac3_frame_size_tab[38][3];
 extern const uint8_t  ff_ac3_channels_tab[8];
 extern av_export const uint16_t avpriv_ac3_channel_layout_tab[8];
@@ -54,7 +48,7 @@ extern const int16_t  ff_ac3_floor_tab[8];
 extern const uint16_t ff_ac3_fast_gain_tab[8];
 extern const uint16_t ff_eac3_default_chmap[8];
 extern const uint8_t  ff_ac3_band_start_tab[AC3_CRITICAL_BANDS+1];
-extern HCONST uint8_t ff_ac3_bin_to_band_tab[253];
+extern const uint8_t  ff_ac3_bin_to_band_tab[253];
 
 /** Custom channel map locations bitmask
  *  Other channels described in documentation:
diff --git a/libavcodec/acelp_filters.c b/libavcodec/acelp_filters.c
index 93bec65..35aa863 100644
--- a/libavcodec/acelp_filters.c
+++ b/libavcodec/acelp_filters.c
@@ -3,25 +3,26 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <inttypes.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "avcodec.h"
 #include "acelp_filters.h"
@@ -46,7 +47,7 @@ void ff_acelp_interpolate(int16_t* out, const int16_t* in,
 {
     int n, i;
 
-    assert(frac_pos >= 0 && frac_pos < precision);
+    av_assert1(frac_pos >= 0 && frac_pos < precision);
 
     for (n = 0; n < length; n++) {
         int idx = 0;
@@ -69,7 +70,7 @@ void ff_acelp_interpolate(int16_t* out, const int16_t* in,
             v += in[n - i] * filter_coeffs[idx - frac_pos];
         }
         if (av_clip_int16(v >> 15) != (v >> 15))
-            av_log(NULL, AV_LOG_WARNING, "overflow that would need cliping in ff_acelp_interpolate()\n");
+            av_log(NULL, AV_LOG_WARNING, "overflow that would need clipping in ff_acelp_interpolate()\n");
         out[n] = v >> 15;
     }
 }
@@ -143,3 +144,12 @@ void ff_tilt_compensation(float *mem, float tilt, float *samples, int size)
     samples[0] -= tilt * *mem;
     *mem = new_tilt_mem;
 }
+
+void ff_acelp_filter_init(ACELPFContext *c)
+{
+    c->acelp_interpolatef                      = ff_acelp_interpolatef;
+    c->acelp_apply_order_2_transfer_function   = ff_acelp_apply_order_2_transfer_function;
+
+    if(HAVE_MIPSFPU)
+        ff_acelp_filter_init_mips(c);
+}
diff --git a/libavcodec/acelp_filters.h b/libavcodec/acelp_filters.h
index 2be4c24..fe86cb2 100644
--- a/libavcodec/acelp_filters.h
+++ b/libavcodec/acelp_filters.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,39 @@
 
 #include <stdint.h>
 
+typedef struct ACELPFContext {
+    /**
+     * Floating point version of ff_acelp_interpolate()
+     */
+    void (*acelp_interpolatef)(float *out, const float *in,
+                            const float *filter_coeffs, int precision,
+                            int frac_pos, int filter_length, int length);
+
+    /**
+     * Apply an order 2 rational transfer function in-place.
+     *
+     * @param out output buffer for filtered speech samples
+     * @param in input buffer containing speech data (may be the same as out)
+     * @param zero_coeffs z^-1 and z^-2 coefficients of the numerator
+     * @param pole_coeffs z^-1 and z^-2 coefficients of the denominator
+     * @param gain scale factor for final output
+     * @param mem intermediate values used by filter (should be 0 initially)
+     * @param n number of samples (should be a multiple of eight)
+     */
+    void (*acelp_apply_order_2_transfer_function)(float *out, const float *in,
+                                                  const float zero_coeffs[2],
+                                                  const float pole_coeffs[2],
+                                                  float gain,
+                                                  float mem[2], int n);
+
+}ACELPFContext;
+
+/**
+ * Initialize ACELPFContext.
+ */
+void ff_acelp_filter_init(ACELPFContext *c);
+void ff_acelp_filter_init_mips(ACELPFContext *c);
+
 /**
  * low-pass Finite Impulse Response filter coefficients.
  *
@@ -75,7 +108,7 @@ void ff_acelp_interpolatef(float *out, const float *in,
  *
  * The filter has a cut-off frequency of 1/80 of the sampling freq
  *
- * @note Two items before the top of the out buffer must contain two items from the
+ * @note Two items before the top of the in buffer must contain two items from the
  *       tail of the previous subframe.
  *
  * @remark It is safe to pass the same array in in and out parameters.
diff --git a/libavcodec/acelp_pitch_delay.c b/libavcodec/acelp_pitch_delay.c
index 1965772..8ec1ba3 100644
--- a/libavcodec/acelp_pitch_delay.c
+++ b/libavcodec/acelp_pitch_delay.c
@@ -3,24 +3,25 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/common.h"
+#include "libavutil/ffmath.h"
 #include "libavutil/float_dsp.h"
 #include "libavutil/mathematics.h"
 #include "avcodec.h"
@@ -107,9 +108,20 @@ int16_t ff_acelp_decode_gain_code(
     for(i=0; i<ma_pred_order; i++)
         mr_energy += quant_energy[i] * ma_prediction_coeff[i];
 
+#ifdef G729_BITEXACT
+    mr_energy += (((-6165LL * ff_log2(dsp->scalarproduct_int16(fc_v, fc_v, subframe_size, 0))) >> 3) & ~0x3ff);
+
+    mr_energy = (5439 * (mr_energy >> 15)) >> 8;           // (0.15) = (0.15) * (7.23)
+
+    return bidir_sal(
+               ((ff_exp2(mr_energy & 0x7fff) + 16) >> 5) * (gain_corr_factor >> 1),
+               (mr_energy >> 15) - 25
+           );
+#else
     mr_energy = gain_corr_factor * exp(M_LN10 / (20 << 23) * mr_energy) /
                 sqrt(adsp->scalarproduct_int16(fc_v, fc_v, subframe_size));
     return mr_energy >> 12;
+#endif
 }
 
 float ff_amr_set_fixed_gain(float fixed_gain_factor, float fixed_mean_energy,
@@ -120,7 +132,7 @@ float ff_amr_set_fixed_gain(float fixed_gain_factor, float fixed_mean_energy,
     // ^g_c = ^gamma_gc * 100.05 (predicted dB + mean dB - dB of fixed vector)
     // Note 10^(0.05 * -10log(average x2)) = 1/sqrt((average x2)).
     float val = fixed_gain_factor *
-        exp2f(M_LOG2_10 * 0.05 *
+        ff_exp10(0.05 *
               (avpriv_scalarproduct_float_c(pred_table, prediction_error, 4) +
                energy_mean)) /
         sqrtf(fixed_mean_energy);
diff --git a/libavcodec/acelp_pitch_delay.h b/libavcodec/acelp_pitch_delay.h
index 7b5b33d..2aade2f 100644
--- a/libavcodec/acelp_pitch_delay.h
+++ b/libavcodec/acelp_pitch_delay.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/acelp_vectors.c b/libavcodec/acelp_vectors.c
index 0c660ac..798217d 100644
--- a/libavcodec/acelp_vectors.c
+++ b/libavcodec/acelp_vectors.c
@@ -3,25 +3,26 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <inttypes.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/float_dsp.h"
 #include "avcodec.h"
@@ -50,6 +51,26 @@ const uint8_t ff_fc_2pulses_9bits_track1_gray[16] =
   28, 26,
 };
 
+const uint8_t ff_fc_2pulses_9bits_track2_gray[32] =
+{
+  0,  2,
+  5,  4,
+  12, 10,
+  7,  9,
+  25, 24,
+  20, 22,
+  14, 15,
+  19, 17,
+  36, 31,
+  21, 26,
+  1,  6,
+  16, 11,
+  27, 29,
+  32, 30,
+  39, 37,
+  34, 35,
+};
+
 const uint8_t ff_fc_4pulses_8bits_tracks_13[16] =
 {
   0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75,
@@ -219,11 +240,13 @@ void ff_set_fixed_vector(float *out, const AMRFixed *in, float scale, int size)
         int x   = in->x[i], repeats = !((in->no_repeat_mask >> i) & 1);
         float y = in->y[i] * scale;
 
-        do {
-            out[x] += y;
-            y *= in->pitch_fac;
-            x += in->pitch_lag;
-        } while (x < size && repeats);
+        if (in->pitch_lag > 0)
+            av_assert0(x < size); /* the if governs only this assert, not the loop */
+        do {
+            out[x] += y;
+            y *= in->pitch_fac;
+            x += in->pitch_lag;
+        } while (x < size && repeats);
     }
 }
 
@@ -234,9 +257,18 @@ void ff_clear_fixed_vector(float *out, const AMRFixed *in, int size)
     for (i=0; i < in->n; i++) {
         int x  = in->x[i], repeats = !((in->no_repeat_mask >> i) & 1);
 
-        do {
-            out[x] = 0.0;
-            x += in->pitch_lag;
-        } while (x < size && repeats);
+        if (in->pitch_lag > 0)
+            do {
+                out[x] = 0.0;
+                x += in->pitch_lag;
+            } while (x < size && repeats);
     }
 }
+
+void ff_acelp_vectors_init(ACELPVContext *c)
+{
+    c->weighted_vector_sumf   = ff_weighted_vector_sumf;
+    /* When HAVE_MIPSFPU is non-zero, hand the context to the MIPS init hook as well. */
+    if(HAVE_MIPSFPU)
+        ff_acelp_vectors_init_mips(c);
+}
diff --git a/libavcodec/acelp_vectors.h b/libavcodec/acelp_vectors.h
index d6226bf..fae834d 100644
--- a/libavcodec/acelp_vectors.h
+++ b/libavcodec/acelp_vectors.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,30 @@
 
 #include <stdint.h>
 
+typedef struct ACELPVContext {
+    /**
+     * float implementation of weighted sum of two vectors.
+     * @param[out] out result of addition
+     * @param in_a first vector
+     * @param in_b second vector
+     * @param weight_coeff_a first vector weight coefficient
+     * @param weight_coeff_b second vector weight coefficient
+     * @param length vectors length (should be a multiple of two)
+     *
+     * @note It is safe to pass the same buffer for out and in_a or in_b.
+     */
+    void (*weighted_vector_sumf)(float *out, const float *in_a, const float *in_b,
+                                 float weight_coeff_a, float weight_coeff_b,
+                                 int length);
+
+}ACELPVContext;
+
+/**
+ * Initialize ACELPVContext.
+ */
+void ff_acelp_vectors_init(ACELPVContext *c);
+void ff_acelp_vectors_init_mips(ACELPVContext *c);
+
 /** Sparse representation for the algebraic codebook (fixed) vector */
 typedef struct AMRFixed {
     int      n;
@@ -82,6 +106,37 @@ extern const uint8_t ff_fc_2pulses_9bits_track1[16];
 extern const uint8_t ff_fc_2pulses_9bits_track1_gray[16];
 
 /**
+ * Track|Pulse|        Positions
+ * -----------------------------------------
+ *  2   | 1   | 0, 7, 14, 20, 27, 34,  1, 21
+ *      |     | 2, 9, 15, 22, 29, 35,  6, 26
+ *      |     | 4,10, 17, 24, 30, 37, 11, 31
+ *      |     | 5,12, 19, 25, 32, 39, 16, 36
+ * -----------------------------------------
+ *
+ * @remark Track in the table should be read top-to-bottom, left-to-right.
+ *
+ * @note (EE.1) This table (from the reference code) does not comply with
+ *              the specification.
+ *              The specification contains the following table:
+ *
+ * Track|Pulse|        Positions
+ * -----------------------------------------
+ *  2   | 1   | 0, 5, 10, 15, 20, 25, 30, 35
+ *      |     | 1, 6, 11, 16, 21, 26, 31, 36
+ *      |     | 2, 7, 12, 17, 22, 27, 32, 37
+ *      |     | 4, 9, 14, 19, 24, 29, 34, 39
+ *
+ * -----------------------------------------
+ *
+ * @note (EE.2) Reference G.729D code also uses gray decoding for each
+ *              pulse index before looking up the value in the table.
+ *
+ * Used in G.729 @@6.4k (with gray coding)
+ */
+extern const uint8_t ff_fc_2pulses_9bits_track2_gray[32];
+
+/**
  * b60 hamming windowed sinc function coefficients
  */
 extern const float ff_b60_sinc[61];
diff --git a/libavcodec/adpcm.c b/libavcodec/adpcm.c
index 3ab16dd..46c63a2 100644
--- a/libavcodec/adpcm.c
+++ b/libavcodec/adpcm.c
@@ -13,25 +13,24 @@
  * MAXIS EA ADPCM decoder by Robert Marston (rmarston@gmail.com)
  * THP ADPCM decoder by Marco Gerards (mgerards@xs4all.nl)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include "avcodec.h"
 #include "get_bits.h"
-#include "put_bits.h"
 #include "bytestream.h"
 #include "adpcm.h"
 #include "adpcm_data.h"
@@ -85,8 +84,9 @@ static const int swf_index_tables[4][16] = {
 /* end of tables */
 
 typedef struct ADPCMDecodeContext {
-    ADPCMChannelStatus status[6];
+    ADPCMChannelStatus status[14];
     int vqa_version;                /**< VQA version. Used for ADPCM_IMA_WS */
+    int has_status;
 } ADPCMDecodeContext;
 
 static av_cold int adpcm_decode_init(AVCodecContext * avctx)
@@ -96,15 +96,29 @@ static av_cold int adpcm_decode_init(AVCodecContext * avctx)
     unsigned int max_channels = 2;
 
     switch(avctx->codec->id) {
+    case AV_CODEC_ID_ADPCM_DTK:
     case AV_CODEC_ID_ADPCM_EA:
         min_channels = 2;
         break;
+    case AV_CODEC_ID_ADPCM_AFC:
     case AV_CODEC_ID_ADPCM_EA_R1:
     case AV_CODEC_ID_ADPCM_EA_R2:
     case AV_CODEC_ID_ADPCM_EA_R3:
     case AV_CODEC_ID_ADPCM_EA_XAS:
         max_channels = 6;
         break;
+    case AV_CODEC_ID_ADPCM_MTAF:
+        min_channels = 2;
+        max_channels = 8;
+        break;
+    case AV_CODEC_ID_ADPCM_PSX:
+        max_channels = 8;
+        break;
+    case AV_CODEC_ID_ADPCM_IMA_DAT4:
+    case AV_CODEC_ID_ADPCM_THP:
+    case AV_CODEC_ID_ADPCM_THP_LE:
+        max_channels = 14;
+        break;
     }
     if (avctx->channels < min_channels || avctx->channels > max_channels) {
         av_log(avctx, AV_LOG_ERROR, "Invalid number of channels\n");
@@ -116,10 +130,8 @@ static av_cold int adpcm_decode_init(AVCodecContext * avctx)
         c->status[0].step = c->status[1].step = 511;
         break;
     case AV_CODEC_ID_ADPCM_IMA_WAV:
-        if (avctx->bits_per_coded_sample != 4) {
-            av_log(avctx, AV_LOG_ERROR, "Only 4-bit ADPCM IMA WAV files are supported\n");
-            return -1;
-        }
+        if (avctx->bits_per_coded_sample < 2 || avctx->bits_per_coded_sample > 5)
+            return AVERROR_INVALIDDATA;
         break;
     case AV_CODEC_ID_ADPCM_IMA_APC:
         if (avctx->extradata && avctx->extradata_size >= 8) {
@@ -136,6 +148,8 @@ static av_cold int adpcm_decode_init(AVCodecContext * avctx)
     }
 
     switch(avctx->codec->id) {
+        case AV_CODEC_ID_ADPCM_AICA:
+        case AV_CODEC_ID_ADPCM_IMA_DAT4:
         case AV_CODEC_ID_ADPCM_IMA_QT:
         case AV_CODEC_ID_ADPCM_IMA_WAV:
         case AV_CODEC_ID_ADPCM_4XM:
@@ -145,6 +159,11 @@ static av_cold int adpcm_decode_init(AVCodecContext * avctx)
         case AV_CODEC_ID_ADPCM_EA_R3:
         case AV_CODEC_ID_ADPCM_EA_XAS:
         case AV_CODEC_ID_ADPCM_THP:
+        case AV_CODEC_ID_ADPCM_THP_LE:
+        case AV_CODEC_ID_ADPCM_AFC:
+        case AV_CODEC_ID_ADPCM_DTK:
+        case AV_CODEC_ID_ADPCM_PSX:
+        case AV_CODEC_ID_ADPCM_MTAF:
             avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
             break;
         case AV_CODEC_ID_ADPCM_IMA_WS:
@@ -158,7 +177,7 @@ static av_cold int adpcm_decode_init(AVCodecContext * avctx)
     return 0;
 }
 
-static inline short adpcm_ima_expand_nibble(ADPCMChannelStatus *c, char nibble, int shift)
+static inline int16_t adpcm_ima_expand_nibble(ADPCMChannelStatus *c, int8_t nibble, int shift)
 {
     int step_index;
     int predictor;
@@ -181,7 +200,30 @@ static inline short adpcm_ima_expand_nibble(ADPCMChannelStatus *c, char nibble,
     c->predictor = av_clip_int16(predictor);
     c->step_index = step_index;
 
-    return (short)c->predictor;
+    return (int16_t)c->predictor;
+}
+
+static inline int16_t adpcm_ima_wav_expand_nibble(ADPCMChannelStatus *c, GetBitContext *gb, int bps)
+{
+    int nibble, step_index, predictor, sign, delta, diff, step, shift;
+    /* Decode one bps-bit code read from gb; updates c->predictor / c->step_index and returns the predictor. */
+    shift = bps - 1;                        /* bit position of the sign bit within the code */
+    nibble = get_bits_le(gb, bps);
+    step = ff_adpcm_step_table[c->step_index];
+    step_index = c->step_index + ff_adpcm_index_tables[bps - 2][nibble];
+    step_index = av_clip(step_index, 0, 88);
+
+    sign = nibble & (1 << shift);
+    delta = av_mod_uintp2(nibble, shift);   /* code bits below the sign bit */
+    diff = ((2 * delta + 1) * step) >> shift;
+    predictor = c->predictor;
+    if (sign) predictor -= diff;
+    else predictor += diff;
+
+    c->predictor = av_clip_int16(predictor);
+    c->step_index = step_index;
+
+    return (int16_t)c->predictor;
 }
 
 static inline int adpcm_ima_qt_expand_nibble(ADPCMChannelStatus *c, int nibble, int shift)
@@ -210,7 +252,7 @@ static inline int adpcm_ima_qt_expand_nibble(ADPCMChannelStatus *c, int nibble,
     return c->predictor;
 }
 
-static inline short adpcm_ms_expand_nibble(ADPCMChannelStatus *c, int nibble)
+static inline int16_t adpcm_ms_expand_nibble(ADPCMChannelStatus *c, int nibble)
 {
     int predictor;
 
@@ -221,11 +263,36 @@ static inline short adpcm_ms_expand_nibble(ADPCMChannelStatus *c, int nibble)
     c->sample1 = av_clip_int16(predictor);
     c->idelta = (ff_adpcm_AdaptationTable[(int)nibble] * c->idelta) >> 8;
     if (c->idelta < 16) c->idelta = 16;
+    if (c->idelta > INT_MAX/768) {
+        av_log(NULL, AV_LOG_WARNING, "idelta overflow\n");
+        c->idelta = INT_MAX/768;
+    }
 
     return c->sample1;
 }
 
-static inline short adpcm_ct_expand_nibble(ADPCMChannelStatus *c, char nibble)
+static inline int16_t adpcm_ima_oki_expand_nibble(ADPCMChannelStatus *c, int nibble)
+{
+    int step_index, predictor, sign, delta, diff, step;
+    /* Decode one 4-bit code (bit 3 = sign, bits 0-2 = magnitude) and update the channel state in c. */
+    step = ff_adpcm_oki_step_table[c->step_index];
+    step_index = c->step_index + ff_adpcm_index_table[(unsigned)nibble];
+    step_index = av_clip(step_index, 0, 48);
+
+    sign = nibble & 8;
+    delta = nibble & 7;
+    diff = ((2 * delta + 1) * step) >> 3;
+    predictor = c->predictor;
+    if (sign) predictor -= diff;
+    else predictor += diff;
+
+    c->predictor = av_clip_intp2(predictor, 11);
+    c->step_index = step_index;
+
+    return c->predictor * 16; /* multiply rather than << 4: the predictor may be negative */
+}
+
+static inline int16_t adpcm_ct_expand_nibble(ADPCMChannelStatus *c, int8_t nibble)
 {
     int sign, delta, diff;
     int new_step;
@@ -243,10 +310,10 @@ static inline short adpcm_ct_expand_nibble(ADPCMChannelStatus *c, char nibble)
     new_step = (ff_adpcm_AdaptationTable[nibble & 7] * c->step) >> 8;
     c->step = av_clip(new_step, 511, 32767);
 
-    return (short)c->predictor;
+    return (int16_t)c->predictor;
 }
 
-static inline short adpcm_sbpro_expand_nibble(ADPCMChannelStatus *c, char nibble, int size, int shift)
+static inline int16_t adpcm_sbpro_expand_nibble(ADPCMChannelStatus *c, int8_t nibble, int size, int shift)
 {
     int sign, delta, diff;
 
@@ -263,10 +330,10 @@ static inline short adpcm_sbpro_expand_nibble(ADPCMChannelStatus *c, char nibble
     else if (delta == 0 && c->step > 0)
         c->step--;
 
-    return (short) c->predictor;
+    return (int16_t) c->predictor;
 }
 
-static inline short adpcm_yamaha_expand_nibble(ADPCMChannelStatus *c, unsigned char nibble)
+static inline int16_t adpcm_yamaha_expand_nibble(ADPCMChannelStatus *c, uint8_t nibble)
 {
     if(!c->step) {
         c->predictor = 0;
@@ -280,6 +347,15 @@ static inline short adpcm_yamaha_expand_nibble(ADPCMChannelStatus *c, unsigned c
     return c->predictor;
 }
 
+static inline int16_t adpcm_mtaf_expand_nibble(ADPCMChannelStatus *c, uint8_t nibble)
+{
+    c->predictor += ff_adpcm_mtaf_stepsize[c->step][nibble]; /* delta selected by current step and code */
+    c->predictor = av_clip_int16(c->predictor);
+    c->step += ff_adpcm_index_table[nibble];                 /* adapt the step for the next code */
+    c->step = av_clip_uintp2(c->step, 5);                    /* presumably bounds step to 0..31 for the table lookup above */
+    return c->predictor;
+}
+
 static int xa_decode(AVCodecContext *avctx, int16_t *out0, int16_t *out1,
                      const uint8_t *in, ADPCMChannelStatus *left,
                      ADPCMChannelStatus *right, int channels, int sample_offset)
@@ -298,11 +374,9 @@ static int xa_decode(AVCodecContext *avctx, int16_t *out0, int16_t *out1,
     for(i=0;i<4;i++) {
         shift  = 12 - (in[4+i*2] & 15);
         filter = in[4+i*2] >> 4;
-        if (filter > 4) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Invalid XA-ADPCM filter %d (max. allowed is 4)\n",
-                   filter);
-            return AVERROR_INVALIDDATA;
+        if (filter >= FF_ARRAY_ELEMS(xa_adpcm_table)) {
+            avpriv_request_sample(avctx, "unknown XA-ADPCM filter %d", filter);
+            filter=0;
         }
         f0 = xa_adpcm_table[filter][0];
         f1 = xa_adpcm_table[filter][1];
@@ -329,12 +403,11 @@ static int xa_decode(AVCodecContext *avctx, int16_t *out0, int16_t *out1,
 
         shift  = 12 - (in[5+i*2] & 15);
         filter = in[5+i*2] >> 4;
-        if (filter > 4) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Invalid XA-ADPCM filter %d (max. allowed is 4)\n",
-                   filter);
-            return AVERROR_INVALIDDATA;
+        if (filter >= FF_ARRAY_ELEMS(xa_adpcm_table)) {
+            avpriv_request_sample(avctx, "unknown XA-ADPCM filter %d", filter);
+            filter=0;
         }
+
         f0 = xa_adpcm_table[filter][0];
         f1 = xa_adpcm_table[filter][1];
 
@@ -393,7 +466,7 @@ static void adpcm_swf_decode(AVCodecContext *avctx, const uint8_t *buf, int buf_
                 // similar to IMA adpcm
                 int delta = get_bits(&gb, nb_bits);
                 int step = ff_adpcm_step_table[c->status[i].step_index];
-                long vpdiff = 0; // vpdiff = (delta+0.5)*step/4
+                int vpdiff = 0; // vpdiff = (delta+0.5)*step/4
                 int k = k0;
 
                 do {
@@ -428,9 +501,11 @@ static void adpcm_swf_decode(AVCodecContext *avctx, const uint8_t *buf, int buf_
  * @param[out] coded_samples set to the number of samples as coded in the
  *                           packet, or 0 if the codec does not encode the
  *                           number of samples in each frame.
+ * @param[out] approx_nb_samples set to non-zero if the number of samples
+ *                               returned is an approximation.
  */
 static int get_nb_samples(AVCodecContext *avctx, GetByteContext *gb,
-                          int buf_size, int *coded_samples)
+                          int buf_size, int *coded_samples, int *approx_nb_samples)
 {
     ADPCMDecodeContext *s = avctx->priv_data;
     int nb_samples        = 0;
@@ -439,6 +514,10 @@ static int get_nb_samples(AVCodecContext *avctx, GetByteContext *gb,
     int header_size;
 
     *coded_samples = 0;
+    *approx_nb_samples = 0;
+
+    if(ch <= 0)
+        return 0;
 
     switch (avctx->codec->id) {
     /* constant, only check buf_size */
@@ -456,8 +535,10 @@ static int get_nb_samples(AVCodecContext *avctx, GetByteContext *gb,
     case AV_CODEC_ID_ADPCM_CT:
     case AV_CODEC_ID_ADPCM_IMA_APC:
     case AV_CODEC_ID_ADPCM_IMA_EA_SEAD:
+    case AV_CODEC_ID_ADPCM_IMA_OKI:
     case AV_CODEC_ID_ADPCM_IMA_WS:
     case AV_CODEC_ID_ADPCM_YAMAHA:
+    case AV_CODEC_ID_ADPCM_AICA:
         nb_samples = buf_size * 2 / ch;
         break;
     }
@@ -468,9 +549,10 @@ static int get_nb_samples(AVCodecContext *avctx, GetByteContext *gb,
     header_size = 0;
     switch (avctx->codec->id) {
         case AV_CODEC_ID_ADPCM_4XM:
+        case AV_CODEC_ID_ADPCM_IMA_DAT4:
         case AV_CODEC_ID_ADPCM_IMA_ISS:     header_size = 4 * ch;      break;
         case AV_CODEC_ID_ADPCM_IMA_AMV:     header_size = 8;           break;
-        case AV_CODEC_ID_ADPCM_IMA_SMJPEG:  header_size = 4;           break;
+        case AV_CODEC_ID_ADPCM_IMA_SMJPEG:  header_size = 4 * ch;      break;
     }
     if (header_size > 0)
         return (buf_size - header_size) * 2 / ch;
@@ -514,6 +596,7 @@ static int get_nb_samples(AVCodecContext *avctx, GetByteContext *gb,
         *coded_samples -= *coded_samples % 28;
         nb_samples      = (buf_size - header_size) * 2 / ch;
         nb_samples     -= nb_samples % 28;
+        *approx_nb_samples = 1;
         break;
     case AV_CODEC_ID_ADPCM_IMA_DK3:
         if (avctx->block_align > 0)
@@ -523,17 +606,35 @@ static int get_nb_samples(AVCodecContext *avctx, GetByteContext *gb,
     case AV_CODEC_ID_ADPCM_IMA_DK4:
         if (avctx->block_align > 0)
             buf_size = FFMIN(buf_size, avctx->block_align);
+        if (buf_size < 4 * ch)
+            return AVERROR_INVALIDDATA;
         nb_samples = 1 + (buf_size - 4 * ch) * 2 / ch;
         break;
+    case AV_CODEC_ID_ADPCM_IMA_RAD:
+        if (avctx->block_align > 0)
+            buf_size = FFMIN(buf_size, avctx->block_align);
+        nb_samples = (buf_size - 4 * ch) * 2 / ch;
+        break;
     case AV_CODEC_ID_ADPCM_IMA_WAV:
+    {
+        int bsize = ff_adpcm_ima_block_sizes[avctx->bits_per_coded_sample - 2];
+        int bsamples = ff_adpcm_ima_block_samples[avctx->bits_per_coded_sample - 2];
         if (avctx->block_align > 0)
             buf_size = FFMIN(buf_size, avctx->block_align);
-        nb_samples = 1 + (buf_size - 4 * ch) / (4 * ch) * 8;
+        if (buf_size < 4 * ch)
+            return AVERROR_INVALIDDATA;
+        nb_samples = 1 + (buf_size - 4 * ch) / (bsize * ch) * bsamples;
         break;
+    }
     case AV_CODEC_ID_ADPCM_MS:
         if (avctx->block_align > 0)
             buf_size = FFMIN(buf_size, avctx->block_align);
-        nb_samples = 2 + (buf_size - 7 * ch) * 2 / ch;
+        nb_samples = (buf_size - 6 * ch) * 2 / ch;
+        break;
+    case AV_CODEC_ID_ADPCM_MTAF:
+        if (avctx->block_align > 0)
+            buf_size = FFMIN(buf_size, avctx->block_align);
+        nb_samples = (buf_size - 16 * (ch / 2)) * 2 / ch;
         break;
     case AV_CODEC_ID_ADPCM_SBPRO_2:
     case AV_CODEC_ID_ADPCM_SBPRO_3:
@@ -546,6 +647,8 @@ static int get_nb_samples(AVCodecContext *avctx, GetByteContext *gb,
         case AV_CODEC_ID_ADPCM_SBPRO_4: samples_per_byte = 2; break;
         }
         if (!s->status[0].step_index) {
+            if (buf_size < ch)
+                return AVERROR_INVALIDDATA;
             nb_samples++;
             buf_size -= ch;
         }
@@ -566,15 +669,33 @@ static int get_nb_samples(AVCodecContext *avctx, GetByteContext *gb,
         break;
     }
     case AV_CODEC_ID_ADPCM_THP:
+    case AV_CODEC_ID_ADPCM_THP_LE:
+        if (avctx->extradata) {
+            nb_samples = buf_size * 14 / (8 * ch);
+            break;
+        }
         has_coded_samples = 1;
         bytestream2_skip(gb, 4); // channel size
-        *coded_samples  = bytestream2_get_be32(gb);
-        *coded_samples -= *coded_samples % 14;
-        nb_samples      = (buf_size - 80) / (8 * ch) * 14;
+        *coded_samples  = (avctx->codec->id == AV_CODEC_ID_ADPCM_THP_LE) ?
+                          bytestream2_get_le32(gb) :
+                          bytestream2_get_be32(gb);
+        buf_size       -= 8 + 36 * ch;
+        buf_size       /= ch;
+        nb_samples      = buf_size / 8 * 14;
+        if (buf_size % 8 > 1)
+            nb_samples     += (buf_size % 8 - 1) * 2;
+        *approx_nb_samples = 1;
+        break;
+    case AV_CODEC_ID_ADPCM_AFC:
+        nb_samples = buf_size / (9 * ch) * 16;
         break;
     case AV_CODEC_ID_ADPCM_XA:
         nb_samples = (buf_size / 128) * 224 / ch;
         break;
+    case AV_CODEC_ID_ADPCM_DTK:
+    case AV_CODEC_ID_ADPCM_PSX:
+        nb_samples = buf_size / (16 * ch) * 28;
+        break;
     }
 
     /* validate coded sample count */
@@ -593,15 +714,15 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
     ADPCMDecodeContext *c = avctx->priv_data;
     ADPCMChannelStatus *cs;
     int n, m, channel, i;
-    short *samples;
+    int16_t *samples;
     int16_t **samples_p;
     int st; /* stereo */
     int count1, count2;
-    int nb_samples, coded_samples, ret;
+    int nb_samples, coded_samples, approx_nb_samples, ret;
     GetByteContext gb;
 
     bytestream2_init(&gb, buf, buf_size);
-    nb_samples = get_nb_samples(avctx, &gb, buf_size, &coded_samples);
+    nb_samples = get_nb_samples(avctx, &gb, buf_size, &coded_samples, &approx_nb_samples);
     if (nb_samples <= 0) {
         av_log(avctx, AV_LOG_ERROR, "invalid number of samples in packet\n");
         return AVERROR_INVALIDDATA;
@@ -609,17 +730,15 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = nb_samples;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
-    samples = (short *)frame->data[0];
+    samples = (int16_t *)frame->data[0];
     samples_p = (int16_t **)frame->extended_data;
 
     /* use coded_samples when applicable */
     /* it is always <= nb_samples, so the output buffer will be large enough */
     if (coded_samples) {
-        if (coded_samples != nb_samples)
+        if (!approx_nb_samples && coded_samples != nb_samples)
             av_log(avctx, AV_LOG_WARNING, "mismatch in coded sample count\n");
         frame->nb_samples = nb_samples = coded_samples;
     }
@@ -681,6 +800,33 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
             }
         }
 
+        if (avctx->bits_per_coded_sample != 4) {
+            int samples_per_block = ff_adpcm_ima_block_samples[avctx->bits_per_coded_sample - 2];
+            int block_size = ff_adpcm_ima_block_sizes[avctx->bits_per_coded_sample - 2];
+            uint8_t temp[20] = { 0 };
+            GetBitContext g;
+
+            for (n = 0; n < (nb_samples - 1) / samples_per_block; n++) {
+                for (i = 0; i < avctx->channels; i++) {
+                    int j;
+
+                    cs = &c->status[i];
+                    samples = &samples_p[i][1 + n * samples_per_block];
+                    for (j = 0; j < block_size; j++) {
+                        temp[j] = buf[4 * avctx->channels + block_size * n * avctx->channels +
+                                        (j % 4) + (j / 4) * (avctx->channels * 4) + i * 4];
+                    }
+                    ret = init_get_bits8(&g, (const uint8_t *)&temp, block_size);
+                    if (ret < 0)
+                        return ret;
+                    for (m = 0; m < samples_per_block; m++) {
+                        samples[m] = adpcm_ima_wav_expand_nibble(cs, &g,
+                                          avctx->bits_per_coded_sample);
+                    }
+                }
+            }
+            bytestream2_skip(&gb, avctx->block_align - avctx->channels * 4);
+        } else {
         for (n = 0; n < (nb_samples - 1) / 8; n++) {
             for (i = 0; i < avctx->channels; i++) {
                 cs = &c->status[i];
@@ -692,6 +838,7 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
                 }
             }
         }
+        }
         break;
     case AV_CODEC_ID_ADPCM_4XM:
         for (i = 0; i < avctx->channels; i++)
@@ -759,6 +906,27 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
         }
         break;
     }
+    case AV_CODEC_ID_ADPCM_MTAF:
+        for (channel = 0; channel < avctx->channels; channel+=2) {
+            bytestream2_skipu(&gb, 4);
+            c->status[channel    ].step      = bytestream2_get_le16u(&gb);
+            c->status[channel + 1].step      = bytestream2_get_le16u(&gb);
+            c->status[channel    ].predictor = sign_extend(bytestream2_get_le16u(&gb), 16);
+            bytestream2_skipu(&gb, 2);
+            c->status[channel + 1].predictor = sign_extend(bytestream2_get_le16u(&gb), 16);
+            bytestream2_skipu(&gb, 2);
+            for (n = 0; n < nb_samples; n+=2) {
+                int v = bytestream2_get_byteu(&gb);
+                samples_p[channel][n    ] = adpcm_mtaf_expand_nibble(&c->status[channel], v & 0x0F);
+                samples_p[channel][n + 1] = adpcm_mtaf_expand_nibble(&c->status[channel], v >> 4  );
+            }
+            for (n = 0; n < nb_samples; n+=2) {
+                int v = bytestream2_get_byteu(&gb);
+                samples_p[channel + 1][n    ] = adpcm_mtaf_expand_nibble(&c->status[channel + 1], v & 0x0F);
+                samples_p[channel + 1][n + 1] = adpcm_mtaf_expand_nibble(&c->status[channel + 1], v >> 4  );
+            }
+        }
+        break;
     case AV_CODEC_ID_ADPCM_IMA_DK4:
         for (channel = 0; channel < avctx->channels; channel++) {
             cs = &c->status[channel];
@@ -770,7 +938,7 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
                 return AVERROR_INVALIDDATA;
             }
         }
-        for (n = (nb_samples >> (1 - st)) - 1; n > 0; n--) {
+        for (n = (nb_samples - 1) >> (1 - st); n > 0; n--) {
             int v = bytestream2_get_byteu(&gb);
             *samples++ = adpcm_ima_expand_nibble(&c->status[0 ], v >> 4  , 3);
             *samples++ = adpcm_ima_expand_nibble(&c->status[st], v & 0x0F, 3);
@@ -835,6 +1003,9 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
             *samples++ = c->status[0].predictor + c->status[1].predictor;
             *samples++ = c->status[0].predictor - c->status[1].predictor;
         }
+
+        if ((bytestream2_tell(&gb) & 1))
+            bytestream2_skip(&gb, 1);
         break;
     }
     case AV_CODEC_ID_ADPCM_IMA_ISS:
@@ -864,6 +1035,18 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
             *samples++ = adpcm_ima_expand_nibble(&c->status[st], v2, 3);
         }
         break;
+    case AV_CODEC_ID_ADPCM_IMA_DAT4:
+        for (channel = 0; channel < avctx->channels; channel++) {
+            cs = &c->status[channel];
+            samples = samples_p[channel];
+            bytestream2_skip(&gb, 4);
+            for (n = 0; n < nb_samples; n += 2) {
+                int v = bytestream2_get_byteu(&gb);
+                *samples++ = adpcm_ima_expand_nibble(cs, v >> 4  , 3);
+                *samples++ = adpcm_ima_expand_nibble(cs, v & 0x0F, 3);
+            }
+        }
+        break;
     case AV_CODEC_ID_ADPCM_IMA_APC:
         while (bytestream2_get_bytes_left(&gb) > 0) {
             int v = bytestream2_get_byteu(&gb);
@@ -871,6 +1054,38 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
             *samples++ = adpcm_ima_expand_nibble(&c->status[st], v & 0x0F, 3);
         }
         break;
+    case AV_CODEC_ID_ADPCM_IMA_OKI:
+        while (bytestream2_get_bytes_left(&gb) > 0) {
+            int v = bytestream2_get_byteu(&gb);
+            *samples++ = adpcm_ima_oki_expand_nibble(&c->status[0],  v >> 4  );
+            *samples++ = adpcm_ima_oki_expand_nibble(&c->status[st], v & 0x0F);
+        }
+        break;
+    case AV_CODEC_ID_ADPCM_IMA_RAD:
+        for (channel = 0; channel < avctx->channels; channel++) {
+            cs = &c->status[channel];
+            cs->step_index = sign_extend(bytestream2_get_le16u(&gb), 16);
+            cs->predictor  = sign_extend(bytestream2_get_le16u(&gb), 16);
+            if (cs->step_index > 88u){
+                av_log(avctx, AV_LOG_ERROR, "ERROR: step_index[%d] = %i\n",
+                       channel, cs->step_index);
+                return AVERROR_INVALIDDATA;
+            }
+        }
+        for (n = 0; n < nb_samples / 2; n++) {
+            int byte[2];
+
+            byte[0] = bytestream2_get_byteu(&gb);
+            if (st)
+                byte[1] = bytestream2_get_byteu(&gb);
+            for(channel = 0; channel < avctx->channels; channel++) {
+                *samples++ = adpcm_ima_expand_nibble(&c->status[channel], byte[channel] & 0x0F, 3);
+            }
+            for(channel = 0; channel < avctx->channels; channel++) {
+                *samples++ = adpcm_ima_expand_nibble(&c->status[channel], byte[channel] >> 4  , 3);
+            }
+        }
+        break;
     case AV_CODEC_ID_ADPCM_IMA_WS:
         if (c->vqa_version == 3) {
             for (channel = 0; channel < avctx->channels; channel++) {
@@ -946,6 +1161,9 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
         /* Each EA ADPCM frame has a 12-byte header followed by 30-byte pieces,
            each coding 28 stereo samples. */
 
+        if(avctx->channels != 2)
+            return AVERROR_INVALIDDATA;
+
         current_left_sample   = sign_extend(bytestream2_get_le16u(&gb), 16);
         previous_left_sample  = sign_extend(bytestream2_get_le16u(&gb), 16);
         current_right_sample  = sign_extend(bytestream2_get_le16u(&gb), 16);
@@ -1131,16 +1349,9 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
         }
         break;
     case AV_CODEC_ID_ADPCM_IMA_AMV:
-    case AV_CODEC_ID_ADPCM_IMA_SMJPEG:
-        if (avctx->codec->id == AV_CODEC_ID_ADPCM_IMA_AMV) {
-            c->status[0].predictor = sign_extend(bytestream2_get_le16u(&gb), 16);
-            c->status[0].step_index = bytestream2_get_le16u(&gb);
-            bytestream2_skipu(&gb, 4);
-        } else {
-            c->status[0].predictor = sign_extend(bytestream2_get_be16u(&gb), 16);
-            c->status[0].step_index = bytestream2_get_byteu(&gb);
-            bytestream2_skipu(&gb, 1);
-        }
+        c->status[0].predictor = sign_extend(bytestream2_get_le16u(&gb), 16);
+        c->status[0].step_index = bytestream2_get_byteu(&gb);
+        bytestream2_skipu(&gb, 5);
         if (c->status[0].step_index > 88u) {
             av_log(avctx, AV_LOG_ERROR, "ERROR: step_index = %i\n",
                    c->status[0].step_index);
@@ -1148,18 +1359,29 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
         }
 
         for (n = nb_samples >> (1 - st); n > 0; n--) {
-            int hi, lo, v = bytestream2_get_byteu(&gb);
+            int v = bytestream2_get_byteu(&gb);
 
-            if (avctx->codec->id == AV_CODEC_ID_ADPCM_IMA_AMV) {
-                hi = v & 0x0F;
-                lo = v >> 4;
-            } else {
-                lo = v & 0x0F;
-                hi = v >> 4;
+            *samples++ = adpcm_ima_expand_nibble(&c->status[0], v >> 4, 3);
+            *samples++ = adpcm_ima_expand_nibble(&c->status[0], v & 0xf, 3);
+        }
+        break;
+    case AV_CODEC_ID_ADPCM_IMA_SMJPEG:
+        for (i = 0; i < avctx->channels; i++) {
+            c->status[i].predictor = sign_extend(bytestream2_get_be16u(&gb), 16);
+            c->status[i].step_index = bytestream2_get_byteu(&gb);
+            bytestream2_skipu(&gb, 1);
+            if (c->status[i].step_index > 88u) {
+                av_log(avctx, AV_LOG_ERROR, "ERROR: step_index = %i\n",
+                       c->status[i].step_index);
+                return AVERROR_INVALIDDATA;
             }
+        }
+
+        for (n = nb_samples >> (1 - st); n > 0; n--) {
+            int v = bytestream2_get_byteu(&gb);
 
-            *samples++ = adpcm_ima_expand_nibble(&c->status[0], lo, 3);
-            *samples++ = adpcm_ima_expand_nibble(&c->status[0], hi, 3);
+            *samples++ = adpcm_ima_qt_expand_nibble(&c->status[0 ], v >> 4, 3);
+            *samples++ = adpcm_ima_qt_expand_nibble(&c->status[st], v & 0xf, 3);
         }
         break;
     case AV_CODEC_ID_ADPCM_CT:
@@ -1189,7 +1411,7 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
                                                        byte & 0x0F, 4, 0);
             }
         } else if (avctx->codec->id == AV_CODEC_ID_ADPCM_SBPRO_3) {
-            for (n = nb_samples / 3; n > 0; n--) {
+            for (n = (nb_samples<<st) / 3; n > 0; n--) {
                 int byte = bytestream2_get_byteu(&gb);
                 *samples++ = adpcm_sbpro_expand_nibble(&c->status[0],
                                                         byte >> 5        , 3, 0);
@@ -1223,26 +1445,119 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
             *samples++ = adpcm_yamaha_expand_nibble(&c->status[st], v >> 4  );
         }
         break;
+    case AV_CODEC_ID_ADPCM_AICA:
+        if (!c->has_status) {
+            for (channel = 0; channel < avctx->channels; channel++)
+                c->status[channel].step = 0;
+            c->has_status = 1;
+        }
+        for (channel = 0; channel < avctx->channels; channel++) {
+            samples = samples_p[channel];
+            for (n = nb_samples >> 1; n > 0; n--) {
+                int v = bytestream2_get_byteu(&gb);
+                *samples++ = adpcm_yamaha_expand_nibble(&c->status[channel], v & 0x0F);
+                *samples++ = adpcm_yamaha_expand_nibble(&c->status[channel], v >> 4  );
+            }
+        }
+        break;
+    case AV_CODEC_ID_ADPCM_AFC:
+    {
+        int samples_per_block;
+        int blocks;
+
+        if (avctx->extradata && avctx->extradata_size == 1 && avctx->extradata[0]) {
+            samples_per_block = avctx->extradata[0] / 16;
+            blocks = nb_samples / avctx->extradata[0];
+        } else {
+            samples_per_block = nb_samples / 16;
+            blocks = 1;
+        }
+
+        for (m = 0; m < blocks; m++) {
+        for (channel = 0; channel < avctx->channels; channel++) {
+            int prev1 = c->status[channel].sample1;
+            int prev2 = c->status[channel].sample2;
+
+            samples = samples_p[channel] + m * 16;
+            /* Read in every sample for this channel.  */
+            for (i = 0; i < samples_per_block; i++) {
+                int byte = bytestream2_get_byteu(&gb);
+                int scale = 1 << (byte >> 4);
+                int index = byte & 0xf;
+                int factor1 = ff_adpcm_afc_coeffs[0][index];
+                int factor2 = ff_adpcm_afc_coeffs[1][index];
+
+                /* Decode 16 samples.  */
+                for (n = 0; n < 16; n++) {
+                    int32_t sampledat;
+
+                    if (n & 1) {
+                        sampledat = sign_extend(byte, 4);
+                    } else {
+                        byte = bytestream2_get_byteu(&gb);
+                        sampledat = sign_extend(byte >> 4, 4);
+                    }
+
+                    sampledat = ((prev1 * factor1 + prev2 * factor2) +
+                                 ((sampledat * scale) << 11)) >> 11;
+                    *samples = av_clip_int16(sampledat);
+                    prev2 = prev1;
+                    prev1 = *samples++;
+                }
+            }
+
+            c->status[channel].sample1 = prev1;
+            c->status[channel].sample2 = prev2;
+        }
+        }
+        bytestream2_seek(&gb, 0, SEEK_END);
+        break;
+    }
     case AV_CODEC_ID_ADPCM_THP:
+    case AV_CODEC_ID_ADPCM_THP_LE:
     {
-        int table[2][16];
-        int prev[2][2];
+        int table[14][16];
         int ch;
 
-        for (i = 0; i < 2; i++)
-            for (n = 0; n < 16; n++)
-                table[i][n] = sign_extend(bytestream2_get_be16u(&gb), 16);
+#define THP_GET16(g) \
+    sign_extend( \
+        avctx->codec->id == AV_CODEC_ID_ADPCM_THP_LE ? \
+        bytestream2_get_le16u(&(g)) : \
+        bytestream2_get_be16u(&(g)), 16)
+
+        if (avctx->extradata) {
+            GetByteContext tb;
+            if (avctx->extradata_size < 32 * avctx->channels) {
+                av_log(avctx, AV_LOG_ERROR, "Missing coeff table\n");
+                return AVERROR_INVALIDDATA;
+            }
 
-        /* Initialize the previous sample.  */
-        for (i = 0; i < 2; i++)
-            for (n = 0; n < 2; n++)
-                prev[i][n] = sign_extend(bytestream2_get_be16u(&gb), 16);
+            bytestream2_init(&tb, avctx->extradata, avctx->extradata_size);
+            for (i = 0; i < avctx->channels; i++)
+                for (n = 0; n < 16; n++)
+                    table[i][n] = THP_GET16(tb);
+        } else {
+            for (i = 0; i < avctx->channels; i++)
+                for (n = 0; n < 16; n++)
+                    table[i][n] = THP_GET16(gb);
+
+            if (!c->has_status) {
+                /* Initialize the previous sample.  */
+                for (i = 0; i < avctx->channels; i++) {
+                    c->status[i].sample1 = THP_GET16(gb);
+                    c->status[i].sample2 = THP_GET16(gb);
+                }
+                c->has_status = 1;
+            } else {
+                bytestream2_skip(&gb, avctx->channels * 4);
+            }
+        }
 
-        for (ch = 0; ch <= st; ch++) {
+        for (ch = 0; ch < avctx->channels; ch++) {
             samples = samples_p[ch];
 
             /* Read in every sample for this channel.  */
-            for (i = 0; i < nb_samples / 14; i++) {
+            for (i = 0; i < (nb_samples + 13) / 14; i++) {
                 int byte = bytestream2_get_byteu(&gb);
                 int index = (byte >> 4) & 7;
                 unsigned int exp = byte & 0x0F;
@@ -1250,7 +1565,7 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
                 int factor2 = table[ch][index * 2 + 1];
 
                 /* Decode 14 samples.  */
-                for (n = 0; n < 14; n++) {
+                for (n = 0; n < 14 && (i * 14 + n < nb_samples); n++) {
                     int32_t sampledat;
 
                     if (n & 1) {
@@ -1260,30 +1575,131 @@ static int adpcm_decode_frame(AVCodecContext *avctx, void *data,
                         sampledat = sign_extend(byte >> 4, 4);
                     }
 
-                    sampledat = ((prev[ch][0]*factor1
-                                + prev[ch][1]*factor2) >> 11) + (sampledat << exp);
+                    sampledat = ((c->status[ch].sample1 * factor1
+                                + c->status[ch].sample2 * factor2) >> 11) + (sampledat << exp);
                     *samples = av_clip_int16(sampledat);
-                    prev[ch][1] = prev[ch][0];
-                    prev[ch][0] = *samples++;
+                    c->status[ch].sample2 = c->status[ch].sample1;
+                    c->status[ch].sample1 = *samples++;
                 }
             }
         }
         break;
     }
+    case AV_CODEC_ID_ADPCM_DTK:
+        for (channel = 0; channel < avctx->channels; channel++) {
+            samples = samples_p[channel];
+
+            /* Read in every sample for this channel.  */
+            for (i = 0; i < nb_samples / 28; i++) {
+                int byte, header;
+                if (channel)
+                    bytestream2_skipu(&gb, 1);
+                header = bytestream2_get_byteu(&gb);
+                bytestream2_skipu(&gb, 3 - channel);
+
+                /* Decode 28 samples.  */
+                for (n = 0; n < 28; n++) {
+                    int32_t sampledat, prev;
+
+                    switch (header >> 4) {
+                    case 1:
+                        prev = (c->status[channel].sample1 * 0x3c);
+                        break;
+                    case 2:
+                        prev = (c->status[channel].sample1 * 0x73) - (c->status[channel].sample2 * 0x34);
+                        break;
+                    case 3:
+                        prev = (c->status[channel].sample1 * 0x62) - (c->status[channel].sample2 * 0x37);
+                        break;
+                    default:
+                        prev = 0;
+                    }
+
+                    prev = av_clip_intp2((prev + 0x20) >> 6, 21);
+
+                    byte = bytestream2_get_byteu(&gb);
+                    if (!channel)
+                        sampledat = sign_extend(byte, 4);
+                    else
+                        sampledat = sign_extend(byte >> 4, 4);
+
+                    sampledat = (((sampledat << 12) >> (header & 0xf)) << 6) + prev;
+                    *samples++ = av_clip_int16(sampledat >> 6);
+                    c->status[channel].sample2 = c->status[channel].sample1;
+                    c->status[channel].sample1 = sampledat;
+                }
+            }
+            if (!channel)
+                bytestream2_seek(&gb, 0, SEEK_SET);
+        }
+        break;
+    case AV_CODEC_ID_ADPCM_PSX:
+        for (channel = 0; channel < avctx->channels; channel++) {
+            samples = samples_p[channel];
+
+            /* Read in every sample for this channel.  */
+            for (i = 0; i < nb_samples / 28; i++) {
+                int filter, shift, flag, byte;
+
+                filter = bytestream2_get_byteu(&gb);
+                shift  = filter & 0xf;
+                filter = filter >> 4;
+                if (filter >= FF_ARRAY_ELEMS(xa_adpcm_table))
+                    return AVERROR_INVALIDDATA;
+                flag   = bytestream2_get_byteu(&gb);
+
+                /* Decode 28 samples.  */
+                for (n = 0; n < 28; n++) {
+                    int sample = 0, scale;
+
+                    if (flag < 0x07) {
+                        if (n & 1) {
+                            scale = sign_extend(byte >> 4, 4);
+                        } else {
+                            byte  = bytestream2_get_byteu(&gb);
+                            scale = sign_extend(byte, 4);
+                        }
+
+                        scale  = scale << 12;
+                        sample = (int)((scale >> shift) + (c->status[channel].sample1 * xa_adpcm_table[filter][0] + c->status[channel].sample2 * xa_adpcm_table[filter][1]) / 64);
+                    }
+                    *samples++ = av_clip_int16(sample);
+                    c->status[channel].sample2 = c->status[channel].sample1;
+                    c->status[channel].sample1 = sample;
+                }
+            }
+        }
+        break;
 
     default:
         return -1;
     }
 
+    if (avpkt->size && bytestream2_tell(&gb) == 0) {
+        av_log(avctx, AV_LOG_ERROR, "Nothing consumed\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     *got_frame_ptr = 1;
 
+    if (avpkt->size < bytestream2_tell(&gb)) {
+        av_log(avctx, AV_LOG_ERROR, "Overread of %d < %d\n", avpkt->size, bytestream2_tell(&gb));
+        return avpkt->size;
+    }
+
     return bytestream2_tell(&gb);
 }
 
+static void adpcm_flush(AVCodecContext *avctx) /* installed as the .flush callback by ADPCM_DECODER */
+{
+    ADPCMDecodeContext *c = avctx->priv_data;
+    c->has_status = 0; /* AICA and extradata-less THP re-seed their channel status on the next packet when this is 0 */
+}
+
 
 static const enum AVSampleFormat sample_fmts_s16[]  = { AV_SAMPLE_FMT_S16,
                                                         AV_SAMPLE_FMT_NONE };
-static const enum AVSampleFormat sample_fmts_s16p[] = { AV_SAMPLE_FMT_S16,
+static const enum AVSampleFormat sample_fmts_s16p[] = { AV_SAMPLE_FMT_S16P,
                                                         AV_SAMPLE_FMT_NONE };
 static const enum AVSampleFormat sample_fmts_both[] = { AV_SAMPLE_FMT_S16,
                                                         AV_SAMPLE_FMT_S16P,
@@ -1298,13 +1714,17 @@ AVCodec ff_ ## name_ ## _decoder = {                        \
     .priv_data_size = sizeof(ADPCMDecodeContext),           \
     .init           = adpcm_decode_init,                    \
     .decode         = adpcm_decode_frame,                   \
+    .flush          = adpcm_flush,                          \
     .capabilities   = AV_CODEC_CAP_DR1,                     \
     .sample_fmts    = sample_fmts_,                         \
 }
 
 /* Note: Do not forget to add new entries to the Makefile as well. */
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_4XM,         sample_fmts_s16p, adpcm_4xm,         "ADPCM 4X Movie");
+ADPCM_DECODER(AV_CODEC_ID_ADPCM_AFC,         sample_fmts_s16p, adpcm_afc,         "ADPCM Nintendo Gamecube AFC");
+ADPCM_DECODER(AV_CODEC_ID_ADPCM_AICA,        sample_fmts_s16p, adpcm_aica,        "ADPCM Yamaha AICA");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_CT,          sample_fmts_s16,  adpcm_ct,          "ADPCM Creative Technology");
+ADPCM_DECODER(AV_CODEC_ID_ADPCM_DTK,         sample_fmts_s16p, adpcm_dtk,         "ADPCM Nintendo Gamecube DTK");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_EA,          sample_fmts_s16,  adpcm_ea,          "ADPCM Electronic Arts");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_EA_MAXIS_XA, sample_fmts_s16,  adpcm_ea_maxis_xa, "ADPCM Electronic Arts Maxis CDROM XA");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_EA_R1,       sample_fmts_s16p, adpcm_ea_r1,       "ADPCM Electronic Arts R1");
@@ -1313,20 +1733,26 @@ ADPCM_DECODER(AV_CODEC_ID_ADPCM_EA_R3,       sample_fmts_s16p, adpcm_ea_r3,
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_EA_XAS,      sample_fmts_s16p, adpcm_ea_xas,      "ADPCM Electronic Arts XAS");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_AMV,     sample_fmts_s16,  adpcm_ima_amv,     "ADPCM IMA AMV");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_APC,     sample_fmts_s16,  adpcm_ima_apc,     "ADPCM IMA CRYO APC");
+ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_DAT4,    sample_fmts_s16,  adpcm_ima_dat4,    "ADPCM IMA Eurocom DAT4"); /* NOTE(review): the DAT4 decode loop writes samples_p[channel] per channel; confirm s16 (not s16p) is intended here */
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_DK3,     sample_fmts_s16,  adpcm_ima_dk3,     "ADPCM IMA Duck DK3");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_DK4,     sample_fmts_s16,  adpcm_ima_dk4,     "ADPCM IMA Duck DK4");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_EA_EACS, sample_fmts_s16,  adpcm_ima_ea_eacs, "ADPCM IMA Electronic Arts EACS");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_EA_SEAD, sample_fmts_s16,  adpcm_ima_ea_sead, "ADPCM IMA Electronic Arts SEAD");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_ISS,     sample_fmts_s16,  adpcm_ima_iss,     "ADPCM IMA Funcom ISS");
+ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_OKI,     sample_fmts_s16,  adpcm_ima_oki,     "ADPCM IMA Dialogic OKI");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_QT,      sample_fmts_s16p, adpcm_ima_qt,      "ADPCM IMA QuickTime");
+ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_RAD,     sample_fmts_s16,  adpcm_ima_rad,     "ADPCM IMA Radical");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_SMJPEG,  sample_fmts_s16,  adpcm_ima_smjpeg,  "ADPCM IMA Loki SDL MJPEG");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_WAV,     sample_fmts_s16p, adpcm_ima_wav,     "ADPCM IMA WAV");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_IMA_WS,      sample_fmts_both, adpcm_ima_ws,      "ADPCM IMA Westwood");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_MS,          sample_fmts_s16,  adpcm_ms,          "ADPCM Microsoft");
+ADPCM_DECODER(AV_CODEC_ID_ADPCM_MTAF,        sample_fmts_s16p, adpcm_mtaf,        "ADPCM MTAF");
+ADPCM_DECODER(AV_CODEC_ID_ADPCM_PSX,         sample_fmts_s16p, adpcm_psx,         "ADPCM Playstation");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_SBPRO_2,     sample_fmts_s16,  adpcm_sbpro_2,     "ADPCM Sound Blaster Pro 2-bit");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_SBPRO_3,     sample_fmts_s16,  adpcm_sbpro_3,     "ADPCM Sound Blaster Pro 2.6-bit");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_SBPRO_4,     sample_fmts_s16,  adpcm_sbpro_4,     "ADPCM Sound Blaster Pro 4-bit");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_SWF,         sample_fmts_s16,  adpcm_swf,         "ADPCM Shockwave Flash");
-ADPCM_DECODER(AV_CODEC_ID_ADPCM_THP,         sample_fmts_s16p, adpcm_thp,         "ADPCM Nintendo Gamecube THP");
+ADPCM_DECODER(AV_CODEC_ID_ADPCM_THP_LE,      sample_fmts_s16p, adpcm_thp_le,      "ADPCM Nintendo THP (little-endian)");
+ADPCM_DECODER(AV_CODEC_ID_ADPCM_THP,         sample_fmts_s16p, adpcm_thp,         "ADPCM Nintendo THP");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_XA,          sample_fmts_s16p, adpcm_xa,          "ADPCM CDROM XA");
 ADPCM_DECODER(AV_CODEC_ID_ADPCM_YAMAHA,      sample_fmts_s16,  adpcm_yamaha,      "ADPCM Yamaha");
diff --git a/libavcodec/adpcm.h b/libavcodec/adpcm.h
index 11be5a9..580db7d 100644
--- a/libavcodec/adpcm.h
+++ b/libavcodec/adpcm.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2001-2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,8 +38,8 @@ typedef struct ADPCMChannelStatus {
     int prev_sample;
 
     /* MS version */
-    int16_t sample1;
-    int16_t sample2;
+    int sample1;
+    int sample2;
     int coeff1;
     int coeff2;
     int idelta;
diff --git a/libavcodec/adpcm_data.c b/libavcodec/adpcm_data.c
index e40abc5..52271be 100644
--- a/libavcodec/adpcm_data.c
+++ b/libavcodec/adpcm_data.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2001-2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,12 +27,33 @@
 
 /* ff_adpcm_step_table[] and ff_adpcm_index_table[] are from the ADPCM
    reference source */
-/* This is the index table: */
+static const int8_t adpcm_index_table2[4] = { /* slot 0 of ff_adpcm_index_tables; second half repeats the first */
+    -1,  2,
+    -1,  2,
+};
+
+static const int8_t adpcm_index_table3[8] = { /* slot 1 of ff_adpcm_index_tables; second half repeats the first */
+    -1, -1,  1,  2,
+    -1, -1,  1,  2,
+};
+
 const int8_t ff_adpcm_index_table[16] = {
     -1, -1, -1, -1, 2, 4, 6, 8,
     -1, -1, -1, -1, 2, 4, 6, 8,
 };
 
+static const int8_t adpcm_index_table5[32] = { /* slot 3 of ff_adpcm_index_tables; second half repeats the first */
+    -1, -1, -1, -1, -1, -1, -1, -1, 1, 2, 4, 6, 8, 10, 13, 16,
+    -1, -1, -1, -1, -1, -1, -1, -1, 1, 2, 4, 6, 8, 10, 13, 16,
+};
+
+const int8_t * const ff_adpcm_index_tables[4] = { /* slot k points at the table with 2^(k+2) entries; presumably k = bits per sample - 2 -- TODO confirm against callers */
+    &adpcm_index_table2[0],   /*  4 entries */
+    &adpcm_index_table3[0],   /*  8 entries */
+    &ff_adpcm_index_table[0], /* 16 entries, also exported on its own */
+    &adpcm_index_table5[0],   /* 32 entries */
+};
+
 /**
  * This is the step table. Note that many programs use slight deviations from
  * this table, but such deviations are negligible:
@@ -49,6 +70,14 @@ const int16_t ff_adpcm_step_table[89] = {
     15289, 16818, 18500, 20350, 22385, 24623, 27086, 29794, 32767
 };
 
+const int16_t ff_adpcm_oki_step_table[49] = { /* 49 ascending step sizes, 16..1552; presumably indexed by the OKI decoder's step_index -- confirm */
+     16,  17,  19,  21,   23,   25,   28,   31,   34,  37,
+     41,  45,  50,  55,   60,   66,   73,   80,   88,  97,
+    107, 118, 130, 143,  157,  173,  190,  209,  230, 253,
+    279, 307, 337, 371,  408,  449,  494,  544,  598, 658,
+    724, 796, 876, 963, 1060, 1166, 1282, 1411, 1552
+};
+
 /* These are for MS-ADPCM */
 /* ff_adpcm_AdaptationTable[], ff_adpcm_AdaptCoeff1[], and
    ff_adpcm_AdaptCoeff2[] are from libsndfile */
@@ -76,3 +105,75 @@ const int8_t ff_adpcm_yamaha_difflookup[] = {
      1,  3,  5,  7,  9,  11,  13,  15,
     -1, -3, -5, -7, -9, -11, -13, -15
 };
+
+const int16_t ff_adpcm_afc_coeffs[2][16] = { /* AFC predictor weights, column = low nibble of the block header byte */
+    { 0, 2048, 0, 1024, 4096, 3584, 3072, 4608, 4200, 4800, 5120, 2048, 1024, 64512, 64512, 63488 }, /* row 0: multiplies the most recent sample */
+    { 0, 0, 2048, 1024, 63488, 64000, 64512, 62976, 63288, 63236, 62464, 63488, 64512, 1024, 0, 0 } /* row 1: multiplies the sample before that */
+}; /* NOTE(review): initialisers above 32767 do not fit int16_t; the stored value is implementation-defined (C11 6.3.1.3), e.g. 64512 -> -1024 on wrap -- confirm this is relied on */
+
+const int16_t ff_adpcm_mtaf_stepsize[32][16] = { /* 32 rows of 16 deltas; in each row entries 8..15 are the negation of entries 0..7. Presumably row = step, column = 4-bit code -- confirm in the MTAF nibble expander */
+    {     1,     5,     9,    13,    16,    20,    24,    28,
+         -1,    -5,    -9,   -13,   -16,   -20,   -24,   -28, },
+    {     2,     6,    11,    15,    20,    24,    29,    33,
+         -2,    -6,   -11,   -15,   -20,   -24,   -29,   -33, },
+    {     2,     7,    13,    18,    23,    28,    34,    39,
+         -2,    -7,   -13,   -18,   -23,   -28,   -34,   -39, },
+    {     3,     9,    15,    21,    28,    34,    40,    46,
+         -3,    -9,   -15,   -21,   -28,   -34,   -40,   -46, },
+    {     3,    11,    18,    26,    33,    41,    48,    56,
+         -3,   -11,   -18,   -26,   -33,   -41,   -48,   -56, },
+    {     4,    13,    22,    31,    40,    49,    58,    67,
+         -4,   -13,   -22,   -31,   -40,   -49,   -58,   -67, },
+    {     5,    16,    26,    37,    48,    59,    69,    80,
+         -5,   -16,   -26,   -37,   -48,   -59,   -69,   -80, },
+    {     6,    19,    31,    44,    57,    70,    82,    95,
+         -6,   -19,   -31,   -44,   -57,   -70,   -82,   -95, },
+    {     7,    22,    38,    53,    68,    83,    99,   114,
+         -7,   -22,   -38,   -53,   -68,   -83,   -99,  -114, },
+    {     9,    27,    45,    63,    81,    99,   117,   135,
+         -9,   -27,   -45,   -63,   -81,   -99,  -117,  -135, },
+    {    10,    32,    53,    75,    96,   118,   139,   161,
+        -10,   -32,   -53,   -75,   -96,  -118,  -139,  -161, },
+    {    12,    38,    64,    90,   115,   141,   167,   193,
+        -12,   -38,   -64,   -90,  -115,  -141,  -167,  -193, },
+    {    15,    45,    76,   106,   137,   167,   198,   228,
+        -15,   -45,   -76,  -106,  -137,  -167,  -198,  -228, },
+    {    18,    54,    91,   127,   164,   200,   237,   273,
+        -18,   -54,   -91,  -127,  -164,  -200,  -237,  -273, },
+    {    21,    65,   108,   152,   195,   239,   282,   326,
+        -21,   -65,  -108,  -152,  -195,  -239,  -282,  -326, },
+    {    25,    77,   129,   181,   232,   284,   336,   388,
+        -25,   -77,  -129,  -181,  -232,  -284,  -336,  -388, },
+    {    30,    92,   153,   215,   276,   338,   399,   461,
+        -30,   -92,  -153,  -215,  -276,  -338,  -399,  -461, },
+    {    36,   109,   183,   256,   329,   402,   476,   549,
+        -36,  -109,  -183,  -256,  -329,  -402,  -476,  -549, },
+    {    43,   130,   218,   305,   392,   479,   567,   654,
+        -43,  -130,  -218,  -305,  -392,  -479,  -567,  -654, },
+    {    52,   156,   260,   364,   468,   572,   676,   780,
+        -52,  -156,  -260,  -364,  -468,  -572,  -676,  -780, },
+    {    62,   186,   310,   434,   558,   682,   806,   930,
+        -62,  -186,  -310,  -434,  -558,  -682,  -806,  -930, },
+    {    73,   221,   368,   516,   663,   811,   958,  1106,
+        -73,  -221,  -368,  -516,  -663,  -811,  -958, -1106, },
+    {    87,   263,   439,   615,   790,   966,  1142,  1318,
+        -87,  -263,  -439,  -615,  -790,  -966, -1142, -1318, },
+    {   104,   314,   523,   733,   942,  1152,  1361,  1571,
+       -104,  -314,  -523,  -733,  -942, -1152, -1361, -1571, },
+    {   124,   374,   623,   873,  1122,  1372,  1621,  1871,
+       -124,  -374,  -623,  -873, -1122, -1372, -1621, -1871, },
+    {   148,   445,   743,  1040,  1337,  1634,  1932,  2229,
+       -148,  -445,  -743, -1040, -1337, -1634, -1932, -2229, },
+    {   177,   531,   885,  1239,  1593,  1947,  2301,  2655,
+       -177,  -531,  -885, -1239, -1593, -1947, -2301, -2655, },
+    {   210,   632,  1053,  1475,  1896,  2318,  2739,  3161,
+       -210,  -632, -1053, -1475, -1896, -2318, -2739, -3161, },
+    {   251,   753,  1255,  1757,  2260,  2762,  3264,  3766,
+       -251,  -753, -1255, -1757, -2260, -2762, -3264, -3766, },
+    {   299,   897,  1495,  2093,  2692,  3290,  3888,  4486,
+       -299,  -897, -1495, -2093, -2692, -3290, -3888, -4486, },
+    {   356,  1068,  1781,  2493,  3206,  3918,  4631,  5343,
+       -356, -1068, -1781, -2493, -3206, -3918, -4631, -5343, },
+    {   424,  1273,  2121,  2970,  3819,  4668,  5516,  6365,
+       -424, -1273, -2121, -2970, -3819, -4668, -5516, -6365, },
+};
diff --git a/libavcodec/adpcm_data.h b/libavcodec/adpcm_data.h
index cecd156..5a68713 100644
--- a/libavcodec/adpcm_data.h
+++ b/libavcodec/adpcm_data.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2001-2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,12 +28,19 @@
 
 #include <stdint.h>
 
+static const uint8_t ff_adpcm_ima_block_sizes[4]   = {  4, 12, 4, 20 };
+static const uint8_t ff_adpcm_ima_block_samples[4] = { 16, 32, 8, 32 };
+
+extern const int8_t * const ff_adpcm_index_tables[4];
 extern const int8_t  ff_adpcm_index_table[16];
 extern const int16_t ff_adpcm_step_table[89];
+extern const int16_t ff_adpcm_oki_step_table[49];
 extern const int16_t ff_adpcm_AdaptationTable[];
 extern const uint8_t ff_adpcm_AdaptCoeff1[];
 extern const int8_t  ff_adpcm_AdaptCoeff2[];
 extern const int16_t ff_adpcm_yamaha_indexscale[];
 extern const int8_t  ff_adpcm_yamaha_difflookup[];
+extern const int16_t ff_adpcm_afc_coeffs[2][16];
+extern const int16_t ff_adpcm_mtaf_stepsize[32][16];
 
 #endif /* AVCODEC_ADPCM_DATA_H */
diff --git a/libavcodec/adpcmenc.c b/libavcodec/adpcmenc.c
index 0757624..36974fd 100644
--- a/libavcodec/adpcmenc.c
+++ b/libavcodec/adpcmenc.c
@@ -5,20 +5,20 @@
  * fringe ADPCM codecs (e.g., DK3, DK4, Westwood)
  *   by Mike Melanson (melanson@pcisys.net)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -58,6 +58,8 @@ typedef struct ADPCMEncodeContext {
 
 #define FREEZE_INTERVAL 128
 
+static av_cold int adpcm_encode_close(AVCodecContext *avctx);
+
 static av_cold int adpcm_encode_init(AVCodecContext *avctx)
 {
     ADPCMEncodeContext *s = avctx->priv_data;
@@ -99,6 +101,7 @@ static av_cold int adpcm_encode_init(AVCodecContext *avctx)
         /* seems frame_size isn't taken into account...
            have to buffer the samples :-( */
         avctx->block_align = BLKSIZE;
+        avctx->bits_per_coded_sample = 4;
         break;
     case AV_CODEC_ID_ADPCM_IMA_QT:
         avctx->frame_size  = 64;
@@ -107,8 +110,8 @@ static av_cold int adpcm_encode_init(AVCodecContext *avctx)
     case AV_CODEC_ID_ADPCM_MS:
         /* each 16 bits sample gives one nibble
            and we have 7 bytes per channel overhead */
-        avctx->frame_size = (BLKSIZE - 7 * avctx->channels) * 2 /
-                             avctx->channels + 2;
+        avctx->frame_size = (BLKSIZE - 7 * avctx->channels) * 2 / avctx->channels + 2;
+        avctx->bits_per_coded_sample = 4;
         avctx->block_align    = BLKSIZE;
         if (!(avctx->extradata = av_malloc(32 + AV_INPUT_BUFFER_PADDING_SIZE)))
             goto error;
@@ -143,10 +146,7 @@ static av_cold int adpcm_encode_init(AVCodecContext *avctx)
 
     return 0;
 error:
-    av_freep(&s->paths);
-    av_freep(&s->node_buf);
-    av_freep(&s->nodep_buf);
-    av_freep(&s->trellis_hash);
+    adpcm_encode_close(avctx);
     return ret;
 }
 
@@ -179,24 +179,27 @@ static inline uint8_t adpcm_ima_qt_compress_sample(ADPCMChannelStatus *c,
                                                    int16_t sample)
 {
     int delta  = sample - c->prev_sample;
-    int mask, step = ff_adpcm_step_table[c->step_index];
-    int diff   = step >> 3;
-    int nibble = 0;
+    int diff, step = ff_adpcm_step_table[c->step_index];
+    int nibble = 8*(delta < 0);
 
-    if (delta < 0) {
-        nibble = 8;
-        delta  = -delta;
-    }
+    delta= abs(delta);
+    diff = delta + (step >> 3);
 
-    for (mask = 4; mask;) {
-        if (delta >= step) {
-            nibble |= mask;
-            delta  -= step;
-            diff   += step;
-        }
-        step >>= 1;
-        mask >>= 1;
+    if (delta >= step) {
+        nibble |= 4;
+        delta  -= step;
+    }
+    step >>= 1;
+    if (delta >= step) {
+        nibble |= 2;
+        delta  -= step;
     }
+    step >>= 1;
+    if (delta >= step) {
+        nibble |= 1;
+        delta  -= step;
+    }
+    diff -= delta;
 
     if (nibble & 8)
         c->prev_sample -= diff;
@@ -224,7 +227,7 @@ static inline uint8_t adpcm_ms_compress_sample(ADPCMChannelStatus *c,
         bias = -c->idelta / 2;
 
     nibble = (nibble + bias) / c->idelta;
-    nibble = av_clip(nibble, -8, 7) & 0x0F;
+    nibble = av_clip_intp2(nibble, 3) & 0x0F;
 
     predictor += ((nibble & 0x08) ? (nibble - 0x10) : nibble) * c->idelta;
 
@@ -329,7 +332,7 @@ static void adpcm_compress_trellis(AVCodecContext *avctx,
                     uint8_t *h;\
                     dec_sample = av_clip_int16(dec_sample);\
                     d = sample - dec_sample;\
-                    ssd = nodes[j]->ssd + d*d;\
+                    ssd = nodes[j]->ssd + d*(unsigned)d;\
                     /* Check for wraparound, skip such samples completely. \
                      * Note, changing ssd to a 64 bit variable would be \
                      * simpler, avoiding this check, but it's slower on \
@@ -364,7 +367,7 @@ static void adpcm_compress_trellis(AVCodecContext *avctx,
                     *h = generation;\
                     u  = nodes_next[pos];\
                     if (!u) {\
-                        assert(pathn < FREEZE_INTERVAL << avctx->trellis);\
+                        av_assert1(pathn < FREEZE_INTERVAL << avctx->trellis);\
                         u = t++;\
                         nodes_next[pos] = u;\
                         u->path = pathn++;\
@@ -483,10 +486,8 @@ static int adpcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         pkt_size = (2 + avctx->channels * (22 + 4 * (frame->nb_samples - 1)) + 7) / 8;
     else
         pkt_size = avctx->block_align;
-    if ((ret = ff_alloc_packet(avpkt, pkt_size))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, pkt_size, 0)) < 0)
         return ret;
-    }
     dst = avpkt->data;
 
     switch(avctx->codec->id) {
@@ -508,7 +509,7 @@ static int adpcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
 
         /* stereo: 4 bytes (8 samples) for left, 4 bytes for right */
         if (avctx->trellis > 0) {
-            FF_ALLOC_OR_GOTO(avctx, buf, avctx->channels * blocks * 8, error);
+            FF_ALLOC_ARRAY_OR_GOTO(avctx, buf, avctx->channels, blocks * 8, error);
             for (ch = 0; ch < avctx->channels; ch++) {
                 adpcm_compress_trellis(avctx, &samples_p[ch][1],
                                        buf + ch * blocks * 8, &c->status[ch],
@@ -540,7 +541,7 @@ static int adpcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     case AV_CODEC_ID_ADPCM_IMA_QT:
     {
         PutBitContext pb;
-        init_put_bits(&pb, dst, pkt_size * 8);
+        init_put_bits(&pb, dst, pkt_size);
 
         for (ch = 0; ch < avctx->channels; ch++) {
             ADPCMChannelStatus *status = &c->status[ch];
@@ -570,7 +571,7 @@ static int adpcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     case AV_CODEC_ID_ADPCM_SWF:
     {
         PutBitContext pb;
-        init_put_bits(&pb, dst, pkt_size * 8);
+        init_put_bits(&pb, dst, pkt_size);
 
         n = frame->nb_samples - 1;
 
@@ -580,7 +581,7 @@ static int adpcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         // init the encoder state
         for (i = 0; i < avctx->channels; i++) {
             // clip step so it fits 6 bits
-            c->status[i].step_index = av_clip(c->status[i].step_index, 0, 63);
+            c->status[i].step_index = av_clip_uintp2(c->status[i].step_index, 6);
             put_sbits(&pb, 16, samples[i]);
             put_bits(&pb, 6, c->status[i].step_index);
             c->status[i].prev_sample = samples[i];
diff --git a/libavcodec/adx.c b/libavcodec/adx.c
index d941d7b..cd88b16 100644
--- a/libavcodec/adx.c
+++ b/libavcodec/adx.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011  Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/adx.h b/libavcodec/adx.h
index 9ae84dc..08f749a 100644
--- a/libavcodec/adx.h
+++ b/libavcodec/adx.h
@@ -2,20 +2,20 @@
  * ADX ADPCM codecs
  * Copyright (c) 2001,2003 BERO
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/adx_parser.c b/libavcodec/adx_parser.c
index 706e242..1fa718f 100644
--- a/libavcodec/adx_parser.c
+++ b/libavcodec/adx_parser.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011  Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/adxdec.c b/libavcodec/adxdec.c
index dc587b2..32cc0f0 100644
--- a/libavcodec/adxdec.c
+++ b/libavcodec/adxdec.c
@@ -2,20 +2,20 @@
  * ADX ADPCM codecs
  * Copyright (c) 2001,2003 BERO
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -101,6 +101,7 @@ static int adx_decode_frame(AVCodecContext *avctx, void *data,
     int16_t **samples;
     int samples_offset;
     const uint8_t *buf  = avpkt->data;
+    const uint8_t *buf_end = buf + avpkt->size;
     int num_blocks, ch, ret;
 
     if (c->eof) {
@@ -141,16 +142,14 @@ static int adx_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = num_blocks * BLOCK_SAMPLES;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples = (int16_t **)frame->extended_data;
     samples_offset = 0;
 
     while (num_blocks--) {
         for (ch = 0; ch < c->channels; ch++) {
-            if (adx_decode(c, samples[ch], samples_offset, buf, ch)) {
+            if (buf_end - buf < BLOCK_SIZE || adx_decode(c, samples[ch], samples_offset, buf, ch)) {
                 c->eof = 1;
                 buf = avpkt->data + avpkt->size;
                 break;
@@ -158,9 +157,11 @@ static int adx_decode_frame(AVCodecContext *avctx, void *data,
             buf_size -= BLOCK_SIZE;
             buf      += BLOCK_SIZE;
         }
-        samples_offset += BLOCK_SAMPLES;
+        if (!c->eof)
+            samples_offset += BLOCK_SAMPLES;
     }
 
+    frame->nb_samples = samples_offset;
     *got_frame_ptr = 1;
 
     return buf - avpkt->data;
diff --git a/libavcodec/adxenc.c b/libavcodec/adxenc.c
index e730811..f1ba591 100644
--- a/libavcodec/adxenc.c
+++ b/libavcodec/adxenc.c
@@ -2,20 +2,20 @@
  * ADX ADPCM codecs
  * Copyright (c) 2001,2003 BERO
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,14 +43,12 @@ static void adx_encode(ADXContext *c, uint8_t *adx, const int16_t *wav,
     int s0, s1, s2, d;
     int max = 0;
     int min = 0;
-    int data[BLOCK_SAMPLES];
 
     s1 = prev->s1;
     s2 = prev->s2;
     for (i = 0, j = 0; j < 32; i += channels, j++) {
         s0 = wav[i];
         d = ((s0 << COEFF_BITS) - c->coeff[0] * s1 - c->coeff[1] * s2) >> COEFF_BITS;
-        data[j] = d;
         if (max < d)
             max = d;
         if (min > d)
@@ -58,10 +56,10 @@ static void adx_encode(ADXContext *c, uint8_t *adx, const int16_t *wav,
         s2 = s1;
         s1 = s0;
     }
-    prev->s1 = s1;
-    prev->s2 = s2;
 
     if (max == 0 && min == 0) {
+        prev->s1 = s1;
+        prev->s2 = s2;
         memset(adx, 0, BLOCK_SIZE);
         return;
     }
@@ -77,8 +75,23 @@ static void adx_encode(ADXContext *c, uint8_t *adx, const int16_t *wav,
     AV_WB16(adx, scale);
 
     init_put_bits(&pb, adx + 2, 16);
-    for (i = 0; i < BLOCK_SAMPLES; i++)
-        put_sbits(&pb, 4, av_clip(data[i] / scale, -8, 7));
+
+    s1 = prev->s1;
+    s2 = prev->s2;
+    for (i = 0, j = 0; j < 32; i += channels, j++) {
+        d = ((wav[i] << COEFF_BITS) - c->coeff[0] * s1 - c->coeff[1] * s2) >> COEFF_BITS;
+
+        d = av_clip_intp2(ROUNDED_DIV(d, scale), 3);
+
+        put_sbits(&pb, 4, d);
+
+        s0 = ((d << COEFF_BITS) * scale + c->coeff[0] * s1 + c->coeff[1] * s2) >> COEFF_BITS;
+        s2 = s1;
+        s1 = s0;
+    }
+    prev->s1 = s1;
+    prev->s2 = s2;
+
     flush_put_bits(&pb);
 }
 
@@ -133,10 +146,8 @@ static int adx_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     int ch, out_size, ret;
 
     out_size = BLOCK_SIZE * avctx->channels + !c->header_parsed * HEADER_SIZE;
-    if ((ret = ff_alloc_packet(avpkt, out_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, out_size, 0)) < 0)
         return ret;
-    }
     dst = avpkt->data;
 
     if (!c->header_parsed) {
diff --git a/libavcodec/aic.c b/libavcodec/aic.c
index 9ea27a7..ff8e392 100644
--- a/libavcodec/aic.c
+++ b/libavcodec/aic.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2013 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,7 @@
 #include "get_bits.h"
 #include "golomb.h"
 #include "idctdsp.h"
+#include "thread.h"
 #include "unary.h"
 
 #define AIC_HDR_SIZE    24
@@ -132,7 +133,7 @@ static const uint8_t aic_c_ext_scan[192] = {
     177, 184, 176, 169, 162, 161, 168, 160,
 };
 
-static const uint8_t *aic_scan[NUM_BANDS] = {
+static const uint8_t * const aic_scan[NUM_BANDS] = {
     aic_y_scan, aic_c_scan, aic_y_ext_scan, aic_c_ext_scan
 };
 
@@ -152,6 +153,7 @@ typedef struct AICContext {
     int16_t        *data_ptr[NUM_BANDS];
 
     DECLARE_ALIGNED(16, int16_t, block)[64];
+    DECLARE_ALIGNED(16, uint8_t, quant_matrix)[64];
 } AICContext;
 
 static int aic_decode_header(AICContext *ctx, const uint8_t *src, int size)
@@ -203,7 +205,8 @@ static int aic_decode_coeffs(GetBitContext *gb, int16_t *dst,
     int has_skips, coeff_type, coeff_bits, skip_type, skip_bits;
     const int num_coeffs = aic_num_band_coeffs[band];
     const uint8_t *scan = aic_scan[band | force_chroma];
-    int mb, idx, val;
+    int mb, idx;
+    unsigned val;
 
     has_skips  = get_bits1(gb);
     coeff_type = get_bits1(gb);
@@ -217,14 +220,14 @@ static int aic_decode_coeffs(GetBitContext *gb, int16_t *dst,
             idx = -1;
             do {
                 GET_CODE(val, skip_type, skip_bits);
-                if (val < 0)
+                if (val >= 0x10000)
                     return AVERROR_INVALIDDATA;
                 idx += val + 1;
                 if (idx >= num_coeffs)
                     break;
                 GET_CODE(val, coeff_type, coeff_bits);
                 val++;
-                if (val >= 0x10000 || val < 0)
+                if (val >= 0x10000)
                     return AVERROR_INVALIDDATA;
                 dst[scan[idx]] = val;
             } while (idx < num_coeffs - 1);
@@ -234,7 +237,7 @@ static int aic_decode_coeffs(GetBitContext *gb, int16_t *dst,
         for (mb = 0; mb < slice_width; mb++) {
             for (idx = 0; idx < num_coeffs; idx++) {
                 GET_CODE(val, coeff_type, coeff_bits);
-                if (val >= 0x10000 || val < 0)
+                if (val >= 0x10000)
                     return AVERROR_INVALIDDATA;
                 dst[scan[idx]] = val;
             }
@@ -286,7 +289,7 @@ static void recombine_block_il(int16_t *dst, const uint8_t *scan,
     }
 }
 
-static void unquant_block(int16_t *block, int q)
+static void unquant_block(int16_t *block, int q, uint8_t *quant_matrix)
 {
     int i;
 
@@ -294,7 +297,7 @@ static void unquant_block(int16_t *block, int q)
         int val  = (uint16_t)block[i];
         int sign = val & 1;
 
-        block[i] = (((val >> 1) ^ -sign) * q * aic_quant_matrix[i] >> 4)
+        block[i] = (((val >> 1) ^ -sign) * q * quant_matrix[i] >> 4)
                    + sign;
     }
 }
@@ -335,7 +338,7 @@ static int aic_decode_slice(AICContext *ctx, int mb_x, int mb_y,
             else
                 recombine_block_il(ctx->block, ctx->scantable.permutated,
                                    &base_y, &ext_y, blk);
-            unquant_block(ctx->block, ctx->quant);
+            unquant_block(ctx->block, ctx->quant, ctx->quant_matrix);
             ctx->idsp.idct(ctx->block);
 
             if (!ctx->interlaced) {
@@ -352,7 +355,7 @@ static int aic_decode_slice(AICContext *ctx, int mb_x, int mb_y,
         for (blk = 0; blk < 2; blk++) {
             recombine_block(ctx->block, ctx->scantable.permutated,
                             &base_c, &ext_c);
-            unquant_block(ctx->block, ctx->quant);
+            unquant_block(ctx->block, ctx->quant, ctx->quant_matrix);
             ctx->idsp.idct(ctx->block);
             ctx->idsp.put_signed_pixels_clamped(ctx->block, C[blk],
                                                 ctx->frame->linesize[blk + 1]);
@@ -373,6 +376,7 @@ static int aic_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     uint32_t off;
     int x, y, ret;
     int slice_size;
+    ThreadFrame frame = { .f = data };
 
     ctx->frame            = data;
     ctx->frame->pict_type = AV_PICTURE_TYPE_I;
@@ -391,7 +395,7 @@ static int aic_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return ret;
     }
 
-    if ((ret = ff_get_buffer(avctx, ctx->frame, 0)) < 0)
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
         return ret;
 
     bytestream2_init(&gb, buf + AIC_HDR_SIZE,
@@ -437,6 +441,8 @@ static av_cold int aic_decode_init(AVCodecContext *avctx)
     for (i = 0; i < 64; i++)
         scan[i] = i;
     ff_init_scantable(ctx->idsp.idct_permutation, &ctx->scantable, scan);
+    for (i = 0; i < 64; i++)
+        ctx->quant_matrix[ctx->idsp.idct_permutation[i]] = aic_quant_matrix[i];
 
     ctx->mb_width  = FFALIGN(avctx->width,  16) >> 4;
     ctx->mb_height = FFALIGN(avctx->height, 16) >> 4;
@@ -451,7 +457,7 @@ static av_cold int aic_decode_init(AVCodecContext *avctx)
         }
     }
 
-    ctx->slice_data = av_malloc(ctx->slice_width * AIC_BAND_COEFFS
+    ctx->slice_data = av_malloc_array(ctx->slice_width, AIC_BAND_COEFFS
                                 * sizeof(*ctx->slice_data));
     if (!ctx->slice_data) {
         av_log(avctx, AV_LOG_ERROR, "Error allocating slice buffer\n");
@@ -484,5 +490,6 @@ AVCodec ff_aic_decoder = {
     .init           = aic_decode_init,
     .close          = aic_decode_close,
     .decode         = aic_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(aic_decode_init),
 };
diff --git a/libavcodec/alac.c b/libavcodec/alac.c
index 1f24e1b..b9c3400 100644
--- a/libavcodec/alac.c
+++ b/libavcodec/alac.c
@@ -2,20 +2,20 @@
  * ALAC (Apple Lossless Audio Codec) decoder
  * Copyright (c) 2005 David Hammerton
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,8 +36,8 @@
  *  8 bits  compatible version   (0)
  *  8 bits  sample size
  *  8 bits  history mult         (40)
- *  8 bits  initial history      (14)
- *  8 bits  rice param limit     (10)
+ *  8 bits  initial history      (10)
+ *  8 bits  rice param limit     (14)
  *  8 bits  channels
  * 16 bits  maxRun               (255)
  * 32 bits  max coded frame size (0 means unknown)
@@ -48,17 +48,21 @@
 #include <inttypes.h>
 
 #include "libavutil/channel_layout.h"
+#include "libavutil/opt.h"
 #include "avcodec.h"
 #include "get_bits.h"
 #include "bytestream.h"
 #include "internal.h"
+#include "thread.h"
 #include "unary.h"
 #include "mathops.h"
 #include "alac_data.h"
+#include "alacdsp.h"
 
 #define ALAC_EXTRADATA_SIZE 36
 
 typedef struct ALACContext {
+    AVClass *class;
     AVCodecContext *avctx;
     GetBitContext gb;
     int channels;
@@ -75,6 +79,11 @@ typedef struct ALACContext {
 
     int extra_bits;     /**< number of extra bits beyond 16-bit */
     int nb_samples;     /**< number of samples in the current frame */
+
+    int direct_output;
+    int extra_bit_bug;
+
+    ALACDSPContext dsp;
 } ALACContext;
 
 static inline unsigned int decode_scalar(GetBitContext *gb, int k, int bps)
@@ -99,7 +108,7 @@ static inline unsigned int decode_scalar(GetBitContext *gb, int k, int bps)
     return x;
 }
 
-static void rice_decompress(ALACContext *alac, int32_t *output_buffer,
+static int rice_decompress(ALACContext *alac, int32_t *output_buffer,
                             int nb_samples, int bps, int rice_history_mult)
 {
     int i;
@@ -110,6 +119,9 @@ static void rice_decompress(ALACContext *alac, int32_t *output_buffer,
         int k;
         unsigned int x;
 
+        if(get_bits_left(&alac->gb) <= 0)
+            return -1;
+
         /* calculate rice param and decode next value */
         k = av_log2((history >> 9) + 3);
         k = FFMIN(k, alac->rice_limit);
@@ -150,6 +162,7 @@ static void rice_decompress(ALACContext *alac, int32_t *output_buffer,
             history = 0;
         }
     }
+    return 0;
 }
 
 static inline int sign_only(int v)
@@ -186,7 +199,7 @@ static void lpc_prediction(int32_t *error_buffer, int32_t *buffer_out,
     }
 
     /* read warm-up samples */
-    for (i = 1; i <= lpc_order; i++)
+    for (i = 1; i <= lpc_order && i < nb_samples; i++)
         buffer_out[i] = sign_extend(buffer_out[i - 1] + error_buffer[i], bps);
 
     /* NOTE: 4 and 8 are very common cases that could be optimized. */
@@ -220,35 +233,6 @@ static void lpc_prediction(int32_t *error_buffer, int32_t *buffer_out,
     }
 }
 
-static void decorrelate_stereo(int32_t *buffer[2], int nb_samples,
-                               int decorr_shift, int decorr_left_weight)
-{
-    int i;
-
-    for (i = 0; i < nb_samples; i++) {
-        int32_t a, b;
-
-        a = buffer[0][i];
-        b = buffer[1][i];
-
-        a -= (b * decorr_left_weight) >> decorr_shift;
-        b += a;
-
-        buffer[0][i] = b;
-        buffer[1][i] = a;
-    }
-}
-
-static void append_extra_bits(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
-                              int extra_bits, int channels, int nb_samples)
-{
-    int i, ch;
-
-    for (ch = 0; ch < channels; ch++)
-        for (i = 0; i < nb_samples; i++)
-            buffer[ch][i] = (buffer[ch][i] << extra_bits) | extra_bits_buffer[ch][i];
-}
-
 static int decode_element(AVCodecContext *avctx, AVFrame *frame, int ch_index,
                           int channels)
 {
@@ -265,7 +249,7 @@ static int decode_element(AVCodecContext *avctx, AVFrame *frame, int ch_index,
 
     alac->extra_bits = get_bits(&alac->gb, 2) << 3;
     bps = alac->sample_size - alac->extra_bits + channels - 1;
-    if (bps > 32) {
+    if (bps > 32U) {
         av_log(avctx, AV_LOG_ERROR, "bps is unsupported: %d\n", bps);
         return AVERROR_PATCHWELCOME;
     }
@@ -283,19 +267,18 @@ static int decode_element(AVCodecContext *avctx, AVFrame *frame, int ch_index,
         return AVERROR_INVALIDDATA;
     }
     if (!alac->nb_samples) {
+        ThreadFrame tframe = { .f = frame };
         /* get output buffer */
         frame->nb_samples = output_samples;
-        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        if ((ret = ff_thread_get_buffer(avctx, &tframe, 0)) < 0)
             return ret;
-        }
     } else if (output_samples != alac->nb_samples) {
         av_log(avctx, AV_LOG_ERROR, "sample count mismatch: %"PRIu32" != %d\n",
                output_samples, alac->nb_samples);
         return AVERROR_INVALIDDATA;
     }
     alac->nb_samples = output_samples;
-    if (alac->sample_size > 16) {
+    if (alac->direct_output) {
         for (ch = 0; ch < channels; ch++)
             alac->output_samples_buffer[ch] = (int32_t *)frame->extended_data[ch_index + ch];
     }
@@ -332,14 +315,18 @@ static int decode_element(AVCodecContext *avctx, AVFrame *frame, int ch_index,
 
         if (alac->extra_bits) {
             for (i = 0; i < alac->nb_samples; i++) {
+                if(get_bits_left(&alac->gb) <= 0)
+                    return -1;
                 for (ch = 0; ch < channels; ch++)
                     alac->extra_bits_buffer[ch][i] = get_bits(&alac->gb, alac->extra_bits);
             }
         }
         for (ch = 0; ch < channels; ch++) {
-            rice_decompress(alac, alac->predict_error_buffer[ch],
+            int ret=rice_decompress(alac, alac->predict_error_buffer[ch],
                             alac->nb_samples, bps,
                             rice_history_mult[ch] * alac->rice_history_mult / 4);
+            if(ret<0)
+                return ret;
 
             /* adaptive FIR filter */
             if (prediction_type[ch] == 15) {
@@ -364,6 +351,8 @@ static int decode_element(AVCodecContext *avctx, AVFrame *frame, int ch_index,
     } else {
         /* not compressed, easy case */
         for (i = 0; i < alac->nb_samples; i++) {
+            if(get_bits_left(&alac->gb) <= 0)
+                return -1;
             for (ch = 0; ch < channels; ch++) {
                 alac->output_samples_buffer[ch][i] =
                          get_sbits_long(&alac->gb, alac->sample_size);
@@ -374,14 +363,24 @@ static int decode_element(AVCodecContext *avctx, AVFrame *frame, int ch_index,
         decorr_left_weight = 0;
     }
 
-    if (channels == 2 && decorr_left_weight) {
-        decorrelate_stereo(alac->output_samples_buffer, alac->nb_samples,
-                           decorr_shift, decorr_left_weight);
-    }
+    if (channels == 2) {
+        if (alac->extra_bits && alac->extra_bit_bug) {
+            alac->dsp.append_extra_bits[1](alac->output_samples_buffer, alac->extra_bits_buffer,
+                                           alac->extra_bits, channels, alac->nb_samples);
+        }
 
-    if (alac->extra_bits) {
-        append_extra_bits(alac->output_samples_buffer, alac->extra_bits_buffer,
-                          alac->extra_bits, channels, alac->nb_samples);
+        if (decorr_left_weight) {
+            alac->dsp.decorrelate_stereo(alac->output_samples_buffer, alac->nb_samples,
+                                         decorr_shift, decorr_left_weight);
+        }
+
+        if (alac->extra_bits && !alac->extra_bit_bug) {
+            alac->dsp.append_extra_bits[1](alac->output_samples_buffer, alac->extra_bits_buffer,
+                                           alac->extra_bits, channels, alac->nb_samples);
+        }
+    } else if (alac->extra_bits) {
+        alac->dsp.append_extra_bits[0](alac->output_samples_buffer, alac->extra_bits_buffer,
+                                       alac->extra_bits, channels, alac->nb_samples);
     }
 
     switch(alac->sample_size) {
@@ -392,6 +391,12 @@ static int decode_element(AVCodecContext *avctx, AVFrame *frame, int ch_index,
                 *outbuffer++ = alac->output_samples_buffer[ch][i];
         }}
         break;
+    case 20: {
+        for (ch = 0; ch < channels; ch++) {
+            for (i = 0; i < alac->nb_samples; i++)
+                alac->output_samples_buffer[ch][i] <<= 12;
+        }}
+        break;
     case 24: {
         for (ch = 0; ch < channels; ch++) {
             for (i = 0; i < alac->nb_samples; i++)
@@ -412,7 +417,8 @@ static int alac_decode_frame(AVCodecContext *avctx, void *data,
     int channels;
     int ch, ret, got_end;
 
-    init_get_bits(&alac->gb, avpkt->data, avpkt->size * 8);
+    if ((ret = init_get_bits8(&alac->gb, avpkt->data, avpkt->size)) < 0)
+        return ret;
 
     got_end = 0;
     alac->nb_samples = 0;
@@ -424,7 +430,7 @@ static int alac_decode_frame(AVCodecContext *avctx, void *data,
             break;
         }
         if (element > TYPE_CPE && element != TYPE_LFE) {
-            av_log(avctx, AV_LOG_ERROR, "syntax element unsupported: %d", element);
+            av_log(avctx, AV_LOG_ERROR, "syntax element unsupported: %d\n", element);
             return AVERROR_PATCHWELCOME;
         }
 
@@ -453,7 +459,10 @@ static int alac_decode_frame(AVCodecContext *avctx, void *data,
                avpkt->size * 8 - get_bits_count(&alac->gb));
     }
 
-    *got_frame_ptr = 1;
+    if (alac->channels == ch)
+        *got_frame_ptr = 1;
+    else
+        av_log(avctx, AV_LOG_WARNING, "Failed to decode all channels\n");
 
     return avpkt->size;
 }
@@ -465,7 +474,7 @@ static av_cold int alac_decode_close(AVCodecContext *avctx)
     int ch;
     for (ch = 0; ch < FFMIN(alac->channels, 2); ch++) {
         av_freep(&alac->predict_error_buffer[ch]);
-        if (alac->sample_size == 16)
+        if (!alac->direct_output)
             av_freep(&alac->output_samples_buffer[ch]);
         av_freep(&alac->extra_bits_buffer[ch]);
     }
@@ -478,17 +487,24 @@ static int allocate_buffers(ALACContext *alac)
     int ch;
     int buf_size = alac->max_samples_per_frame * sizeof(int32_t);
 
+    for (ch = 0; ch < 2; ch++) {
+        alac->predict_error_buffer[ch]  = NULL;
+        alac->output_samples_buffer[ch] = NULL;
+        alac->extra_bits_buffer[ch]     = NULL;
+    }
+
     for (ch = 0; ch < FFMIN(alac->channels, 2); ch++) {
         FF_ALLOC_OR_GOTO(alac->avctx, alac->predict_error_buffer[ch],
                          buf_size, buf_alloc_fail);
 
-        if (alac->sample_size == 16) {
+        alac->direct_output = alac->sample_size > 16;
+        if (!alac->direct_output) {
             FF_ALLOC_OR_GOTO(alac->avctx, alac->output_samples_buffer[ch],
-                             buf_size, buf_alloc_fail);
+                             buf_size + AV_INPUT_BUFFER_PADDING_SIZE, buf_alloc_fail);
         }
 
         FF_ALLOC_OR_GOTO(alac->avctx, alac->extra_bits_buffer[ch],
-                         buf_size, buf_alloc_fail);
+                         buf_size + AV_INPUT_BUFFER_PADDING_SIZE, buf_alloc_fail);
     }
     return 0;
 buf_alloc_fail:
@@ -535,17 +551,18 @@ static av_cold int alac_decode_init(AVCodecContext * avctx)
 
     /* initialize from the extradata */
     if (alac->avctx->extradata_size < ALAC_EXTRADATA_SIZE) {
-        av_log(avctx, AV_LOG_ERROR, "alac: extradata is too small\n");
+        av_log(avctx, AV_LOG_ERROR, "extradata is too small\n");
         return AVERROR_INVALIDDATA;
     }
     if (alac_set_info(alac)) {
-        av_log(avctx, AV_LOG_ERROR, "alac: set_info failed\n");
+        av_log(avctx, AV_LOG_ERROR, "set_info failed\n");
         return -1;
     }
 
     switch (alac->sample_size) {
     case 16: avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
              break;
+    case 20:
     case 24:
     case 32: avctx->sample_fmt = AV_SAMPLE_FMT_S32P;
              break;
@@ -563,7 +580,7 @@ static av_cold int alac_decode_init(AVCodecContext * avctx)
         else
             avctx->channels = alac->channels;
     }
-    if (avctx->channels > ALAC_MAX_CHANNELS) {
+    if (avctx->channels > ALAC_MAX_CHANNELS || avctx->channels <= 0 ) {
         av_log(avctx, AV_LOG_ERROR, "Unsupported channel count: %d\n",
                avctx->channels);
         return AVERROR_PATCHWELCOME;
@@ -575,9 +592,34 @@ static av_cold int alac_decode_init(AVCodecContext * avctx)
         return ret;
     }
 
+    ff_alacdsp_init(&alac->dsp);
+
     return 0;
 }
 
+#if HAVE_THREADS /* init_thread_copy() is only compiled when threading support is enabled */
+static int init_thread_copy(AVCodecContext *avctx)
+{
+    ALACContext *alac = avctx->priv_data;
+    alac->avctx = avctx;            /* make the private context refer to this avctx */
+    return allocate_buffers(alac);  /* result of allocate_buffers() is returned unchanged */
+}
+#endif
+
+static const AVOption options[] = { /* private options; referenced by alac_class.option */
+    { "extra_bits_bug", "Force non-standard decoding process",
+      offsetof(ALACContext, extra_bit_bug), AV_OPT_TYPE_BOOL, { .i64 = 0 }, /* stored in extra_bit_bug, default 0 */
+      0, 1, AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_DECODING_PARAM }, /* when set, stereo extra bits are appended before decorrelation */
+    { NULL }, /* terminating entry */
+};
+
+static const AVClass alac_class = { /* AVClass installed as the decoder's .priv_class */
+    .class_name = "alac",
+    .item_name  = av_default_item_name,
+    .option     = options, /* the option table defined above (extra_bits_bug) */
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_alac_decoder = {
     .name           = "alac",
     .long_name      = NULL_IF_CONFIG_SMALL("ALAC (Apple Lossless Audio Codec)"),
@@ -587,5 +629,7 @@ AVCodec ff_alac_decoder = {
     .init           = alac_decode_init,
     .close          = alac_decode_close,
     .decode         = alac_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(init_thread_copy),
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
+    .priv_class     = &alac_class
 };
diff --git a/libavcodec/alac_data.c b/libavcodec/alac_data.c
index 9e13119..0bcb06c 100644
--- a/libavcodec/alac_data.c
+++ b/libavcodec/alac_data.c
@@ -1,20 +1,20 @@
 /*
  * ALAC encoder and decoder common data
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/alac_data.h b/libavcodec/alac_data.h
index ebb1f33..650d6dc 100644
--- a/libavcodec/alac_data.h
+++ b/libavcodec/alac_data.h
@@ -1,20 +1,20 @@
 /*
  * ALAC encoder and decoder common data
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/alacdsp.c b/libavcodec/alacdsp.c
new file mode 100644
index 0000000..ecbaedb
--- /dev/null
+++ b/libavcodec/alacdsp.c
@@ -0,0 +1,63 @@
+/*
+ * ALAC (Apple Lossless Audio Codec) decoder
+ * Copyright (c) 2005 David Hammerton
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "alacdsp.h"
+#include "config.h"
+
+/* In-place stereo step: a = buf0 - ((buf1 * weight) >> shift); b = buf1 + a;
+ * then buf0 = b and buf1 = a, for each of the nb_samples sample pairs. */
+static void decorrelate_stereo(int32_t *buffer[2], int nb_samples,
+                               int decorr_shift, int decorr_left_weight)
+{
+    int i;
+    for (i = 0; i < nb_samples; i++) {
+        /* Unsigned arithmetic: wraps instead of signed-overflow UB. */
+        uint32_t a = buffer[0][i];
+        uint32_t b = buffer[1][i];
+
+        /* Cast back to int before shifting so it is a signed shift, as before. */
+        a -= (int)(b * decorr_left_weight) >> decorr_shift;
+        b += a;
+        buffer[0][i] = b;
+        buffer[1][i] = a;
+    }
+}
+
+/* Shift each sample left by extra_bits and OR in extra_bits_buffer[ch][i]; unsigned shift avoids UB on negatives. */
+static void append_extra_bits(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
+                              int extra_bits, int channels, int nb_samples)
+{
+    int i, ch;
+    for (ch = 0; ch < channels; ch++)
+        for (i = 0; i < nb_samples; i++)
+            buffer[ch][i] = ((unsigned)buffer[ch][i] << extra_bits) | extra_bits_buffer[ch][i];
+}
+
+/* Fill *c with the C routines above; calls ff_alacdsp_init_x86() when ARCH_X86 is set. */
+av_cold void ff_alacdsp_init(ALACDSPContext *c)
+{
+    c->append_extra_bits[1] = append_extra_bits;
+    c->append_extra_bits[0] = append_extra_bits;
+    c->decorrelate_stereo   = decorrelate_stereo;
+    if (ARCH_X86)
+        ff_alacdsp_init_x86(c);
+}
diff --git a/libavcodec/alacdsp.h b/libavcodec/alacdsp.h
new file mode 100644
index 0000000..f8b56dd
--- /dev/null
+++ b/libavcodec/alacdsp.h
@@ -0,0 +1,34 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ALACDSP_H
+#define AVCODEC_ALACDSP_H
+
+#include <stdint.h>
+
+typedef struct ALACDSPContext { /* function pointers filled in by ff_alacdsp_init() */
+    void (*decorrelate_stereo)(int32_t *buffer[2], int nb_samples,
+                               int decorr_shift, int decorr_left_weight); /* operates on buffer[0]/buffer[1] in place */
+    void (*append_extra_bits[2])(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
+                                 int extra_bits, int channels, int nb_samples); /* decoder calls [1] when channels == 2, [0] otherwise */
+} ALACDSPContext;
+
+void ff_alacdsp_init(ALACDSPContext *c);
+void ff_alacdsp_init_x86(ALACDSPContext *c);
+
+#endif /* AVCODEC_ALACDSP_H */
diff --git a/libavcodec/alacenc.c b/libavcodec/alacenc.c
index d921fa1..9ac35f1 100644
--- a/libavcodec/alacenc.c
+++ b/libavcodec/alacenc.c
@@ -2,20 +2,20 @@
  * ALAC audio encoder
  * Copyright (c) 2008  Jaikrishnan Menon <realityman@gmx.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -70,7 +70,7 @@ typedef struct AlacEncodeContext {
     int write_sample_size;
     int extra_bits;
     int32_t sample_buf[2][DEFAULT_FRAME_SIZE];
-    int32_t predictor_buf[DEFAULT_FRAME_SIZE];
+    int32_t predictor_buf[2][DEFAULT_FRAME_SIZE];
     int interlacing_shift;
     int interlacing_leftweight;
     PutBitContext pbctx;
@@ -256,13 +256,14 @@ static void alac_linear_predictor(AlacEncodeContext *s, int ch)
 {
     int i;
     AlacLPCContext lpc = s->lpc[ch];
+    int32_t *residual = s->predictor_buf[ch];
 
     if (lpc.lpc_order == 31) {
-        s->predictor_buf[0] = s->sample_buf[ch][0];
+        residual[0] = s->sample_buf[ch][0];
 
         for (i = 1; i < s->frame_size; i++) {
-            s->predictor_buf[i] = s->sample_buf[ch][i    ] -
-                                  s->sample_buf[ch][i - 1];
+            residual[i] = s->sample_buf[ch][i    ] -
+                          s->sample_buf[ch][i - 1];
         }
 
         return;
@@ -272,12 +273,11 @@ static void alac_linear_predictor(AlacEncodeContext *s, int ch)
 
     if (lpc.lpc_order > 0) {
         int32_t *samples  = s->sample_buf[ch];
-        int32_t *residual = s->predictor_buf;
 
         // generate warm-up samples
         residual[0] = samples[0];
         for (i = 1; i <= lpc.lpc_order; i++)
-            residual[i] = samples[i] - samples[i-1];
+            residual[i] = sign_extend(samples[i] - samples[i-1], s->write_sample_size);
 
         // perform lpc on remaining samples
         for (i = lpc.lpc_order + 1; i < s->frame_size; i++) {
@@ -316,11 +316,11 @@ static void alac_linear_predictor(AlacEncodeContext *s, int ch)
     }
 }
 
-static void alac_entropy_coder(AlacEncodeContext *s)
+static void alac_entropy_coder(AlacEncodeContext *s, int ch)
 {
     unsigned int history = s->rc.initial_history;
     int sign_modifier = 0, i, k;
-    int32_t *samples = s->predictor_buf;
+    int32_t *samples = s->predictor_buf[ch];
 
     for (i = 0; i < s->frame_size;) {
         int x;
@@ -397,6 +397,19 @@ static void write_element(AlacEncodeContext *s,
         init_sample_buffers(s, channels, samples);
         write_element_header(s, element, instance);
 
+        // extract extra bits if needed
+        if (s->extra_bits) {
+            uint32_t mask = (1 << s->extra_bits) - 1;
+            for (j = 0; j < channels; j++) {
+                int32_t *extra = s->predictor_buf[j];
+                int32_t *smp   = s->sample_buf[j];
+                for (i = 0; i < s->frame_size; i++) {
+                    extra[i] = smp[i] & mask;
+                    smp[i] >>= s->extra_bits;
+                }
+            }
+        }
+
         if (channels == 2)
             alac_stereo_decorrelation(s);
         else
@@ -419,11 +432,9 @@ static void write_element(AlacEncodeContext *s,
 
         // write extra bits if needed
         if (s->extra_bits) {
-            uint32_t mask = (1 << s->extra_bits) - 1;
             for (i = 0; i < s->frame_size; i++) {
                 for (j = 0; j < channels; j++) {
-                    put_bits(pb, s->extra_bits, s->sample_buf[j][i] & mask);
-                    s->sample_buf[j][i] >>= s->extra_bits;
+                    put_bits(pb, s->extra_bits, s->predictor_buf[j][i]);
                 }
             }
         }
@@ -435,10 +446,11 @@ static void write_element(AlacEncodeContext *s,
             // TODO: determine when this will actually help. for now it's not used.
             if (prediction_type == 15) {
                 // 2nd pass 1st order filter
+                int32_t *residual = s->predictor_buf[i];
                 for (j = s->frame_size - 1; j > 0; j--)
-                    s->predictor_buf[j] -= s->predictor_buf[j - 1];
+                    residual[j] -= residual[j - 1];
             }
-            alac_entropy_coder(s);
+            alac_entropy_coder(s, i);
         }
     }
 }
@@ -611,10 +623,8 @@ static int alac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     else
         max_frame_size = s->max_coded_frame_size;
 
-    if ((ret = ff_alloc_packet(avpkt, 2 * max_frame_size))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, 2 * max_frame_size, 0)) < 0)
         return ret;
-    }
 
     /* use verbatim mode for compression_level 0 */
     if (s->compression_level) {
diff --git a/libavcodec/aliaspixdec.c b/libavcodec/aliaspixdec.c
index 8c18924..087b18f 100644
--- a/libavcodec/aliaspixdec.c
+++ b/libavcodec/aliaspixdec.c
@@ -2,20 +2,20 @@
  * Alias PIX image decoder
  * Copyright (C) 2014 Vittorio Giovara <vittorio.giovara@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aliaspixenc.c b/libavcodec/aliaspixenc.c
index 63016af..a9ba00c 100644
--- a/libavcodec/aliaspixenc.c
+++ b/libavcodec/aliaspixenc.c
@@ -2,20 +2,20 @@
  * Alias PIX image encoder
  * Copyright (C) 2014 Vittorio Giovara <vittorio.giovara@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -61,7 +61,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
     }
 
     length = ALIAS_HEADER_SIZE + 4 * width * height; // max possible
-    if ((ret = ff_alloc_packet(pkt, length)) < 0) {
+    if ((ret = ff_alloc_packet2(avctx, pkt, length, ALIAS_HEADER_SIZE + height*2)) < 0) {
         av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n", length);
         return ret;
     }
diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
index 2b11ef6..246f08b 100644
--- a/libavcodec/allcodecs.c
+++ b/libavcodec/allcodecs.c
@@ -2,20 +2,20 @@
  * Provide registration of all codecs, parsers and bitstream filters for libavcodec.
  * Copyright (c) 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -68,6 +68,8 @@ void avcodec_register_all(void)
 
     /* hardware accelerators */
     REGISTER_HWACCEL(H263_VAAPI,        h263_vaapi);
+    REGISTER_HWACCEL(H263_VIDEOTOOLBOX, h263_videotoolbox);
+    REGISTER_HWACCEL(H264_CUVID,        h264_cuvid);
     REGISTER_HWACCEL(H264_D3D11VA,      h264_d3d11va);
     REGISTER_HWACCEL(H264_DXVA2,        h264_dxva2);
     REGISTER_HWACCEL(H264_MMAL,         h264_mmal);
@@ -76,24 +78,40 @@ void avcodec_register_all(void)
     REGISTER_HWACCEL(H264_VDA,          h264_vda);
     REGISTER_HWACCEL(H264_VDA_OLD,      h264_vda_old);
     REGISTER_HWACCEL(H264_VDPAU,        h264_vdpau);
+    REGISTER_HWACCEL(H264_VIDEOTOOLBOX, h264_videotoolbox);
+    REGISTER_HWACCEL(HEVC_CUVID,        hevc_cuvid);
     REGISTER_HWACCEL(HEVC_D3D11VA,      hevc_d3d11va);
     REGISTER_HWACCEL(HEVC_DXVA2,        hevc_dxva2);
     REGISTER_HWACCEL(HEVC_QSV,          hevc_qsv);
+    REGISTER_HWACCEL(HEVC_VAAPI,        hevc_vaapi);
     REGISTER_HWACCEL(HEVC_VDPAU,        hevc_vdpau);
+    REGISTER_HWACCEL(MPEG1_XVMC,        mpeg1_xvmc);
     REGISTER_HWACCEL(MPEG1_VDPAU,       mpeg1_vdpau);
+    REGISTER_HWACCEL(MPEG1_VIDEOTOOLBOX, mpeg1_videotoolbox);
+    REGISTER_HWACCEL(MPEG2_XVMC,        mpeg2_xvmc);
     REGISTER_HWACCEL(MPEG2_D3D11VA,     mpeg2_d3d11va);
     REGISTER_HWACCEL(MPEG2_DXVA2,       mpeg2_dxva2);
     REGISTER_HWACCEL(MPEG2_MMAL,        mpeg2_mmal);
     REGISTER_HWACCEL(MPEG2_QSV,         mpeg2_qsv);
     REGISTER_HWACCEL(MPEG2_VAAPI,       mpeg2_vaapi);
     REGISTER_HWACCEL(MPEG2_VDPAU,       mpeg2_vdpau);
+    REGISTER_HWACCEL(MPEG2_VIDEOTOOLBOX, mpeg2_videotoolbox);
+    REGISTER_HWACCEL(MPEG4_MMAL,        mpeg4_mmal);
     REGISTER_HWACCEL(MPEG4_VAAPI,       mpeg4_vaapi);
     REGISTER_HWACCEL(MPEG4_VDPAU,       mpeg4_vdpau);
+    REGISTER_HWACCEL(MPEG4_VIDEOTOOLBOX, mpeg4_videotoolbox);
+    REGISTER_HWACCEL(VC1_CUVID,         vc1_cuvid);
     REGISTER_HWACCEL(VC1_D3D11VA,       vc1_d3d11va);
     REGISTER_HWACCEL(VC1_DXVA2,         vc1_dxva2);
     REGISTER_HWACCEL(VC1_VAAPI,         vc1_vaapi);
     REGISTER_HWACCEL(VC1_VDPAU,         vc1_vdpau);
     REGISTER_HWACCEL(VC1_MMAL,          vc1_mmal);
+    REGISTER_HWACCEL(VC1_QSV,           vc1_qsv);
+    REGISTER_HWACCEL(VP8_CUVID,         vp8_cuvid);
+    REGISTER_HWACCEL(VP9_CUVID,         vp9_cuvid);
+    REGISTER_HWACCEL(VP9_D3D11VA,       vp9_d3d11va);
+    REGISTER_HWACCEL(VP9_DXVA2,         vp9_dxva2);
+    REGISTER_HWACCEL(VP9_VAAPI,         vp9_vaapi);
     REGISTER_HWACCEL(WMV3_D3D11VA,      wmv3_d3d11va);
     REGISTER_HWACCEL(WMV3_DXVA2,        wmv3_dxva2);
     REGISTER_HWACCEL(WMV3_VAAPI,        wmv3_vaapi);
@@ -105,14 +123,19 @@ void avcodec_register_all(void)
     REGISTER_DECODER(AASC,              aasc);
     REGISTER_DECODER(AIC,               aic);
     REGISTER_ENCDEC (ALIAS_PIX,         alias_pix);
-    REGISTER_DECODER(AMV,               amv);
+    REGISTER_ENCDEC (AMV,               amv);
     REGISTER_DECODER(ANM,               anm);
     REGISTER_DECODER(ANSI,              ansi);
+    REGISTER_ENCDEC (APNG,              apng);
     REGISTER_ENCDEC (ASV1,              asv1);
     REGISTER_ENCDEC (ASV2,              asv2);
     REGISTER_DECODER(AURA,              aura);
     REGISTER_DECODER(AURA2,             aura2);
+    REGISTER_ENCDEC (AVRP,              avrp);
+    REGISTER_DECODER(AVRN,              avrn);
     REGISTER_DECODER(AVS,               avs);
+    REGISTER_ENCDEC (AVUI,              avui);
+    REGISTER_ENCDEC (AYUV,              ayuv);
     REGISTER_DECODER(BETHSOFTVID,       bethsoftvid);
     REGISTER_DECODER(BFI,               bfi);
     REGISTER_DECODER(BINK,              bink);
@@ -123,17 +146,21 @@ void avcodec_register_all(void)
     REGISTER_DECODER(CAVS,              cavs);
     REGISTER_DECODER(CDGRAPHICS,        cdgraphics);
     REGISTER_DECODER(CDXL,              cdxl);
-    REGISTER_DECODER(CINEPAK,           cinepak);
+    REGISTER_DECODER(CFHD,              cfhd);
+    REGISTER_ENCDEC (CINEPAK,           cinepak);
     REGISTER_ENCDEC (CLJR,              cljr);
     REGISTER_DECODER(CLLC,              cllc);
     REGISTER_ENCDEC (COMFORTNOISE,      comfortnoise);
+    REGISTER_DECODER(CPIA,              cpia);
     REGISTER_DECODER(CSCD,              cscd);
     REGISTER_DECODER(CYUV,              cyuv);
     REGISTER_DECODER(DDS,               dds);
     REGISTER_DECODER(DFA,               dfa);
+    REGISTER_DECODER(DIRAC,             dirac);
     REGISTER_ENCDEC (DNXHD,             dnxhd);
     REGISTER_ENCDEC (DPX,               dpx);
     REGISTER_DECODER(DSICINVIDEO,       dsicinvideo);
+    REGISTER_DECODER(DVAUDIO,           dvaudio);
     REGISTER_ENCDEC (DVVIDEO,           dvvideo);
     REGISTER_DECODER(DXA,               dxa);
     REGISTER_DECODER(DXTORY,            dxtory);
@@ -153,7 +180,7 @@ void avcodec_register_all(void)
     REGISTER_ENCDEC (FFVHUFF,           ffvhuff);
     REGISTER_DECODER(FIC,               fic);
     REGISTER_ENCDEC (FLASHSV,           flashsv);
-    REGISTER_DECODER(FLASHSV2,          flashsv2);
+    REGISTER_ENCDEC (FLASHSV2,          flashsv2);
     REGISTER_DECODER(FLIC,              flic);
     REGISTER_ENCDEC (FLV,               flv);
     REGISTER_DECODER(FOURXM,            fourxm);
@@ -164,10 +191,16 @@ void avcodec_register_all(void)
     REGISTER_ENCDEC (H261,              h261);
     REGISTER_ENCDEC (H263,              h263);
     REGISTER_DECODER(H263I,             h263i);
-    REGISTER_ENCODER(H263P,             h263p);
+    REGISTER_ENCDEC (H263P,             h263p);
     REGISTER_DECODER(H264,              h264);
+    REGISTER_DECODER(H264_CRYSTALHD,    h264_crystalhd);
+    REGISTER_DECODER(H264_MEDIACODEC,   h264_mediacodec);
     REGISTER_DECODER(H264_MMAL,         h264_mmal);
     REGISTER_DECODER(H264_QSV,          h264_qsv);
+    REGISTER_DECODER(H264_VDA,          h264_vda);
+#if FF_API_VDPAU
+    REGISTER_DECODER(H264_VDPAU,        h264_vdpau);
+#endif
     REGISTER_ENCDEC (HAP,               hap);
     REGISTER_DECODER(HEVC,              hevc);
     REGISTER_DECODER(HEVC_QSV,          hevc_qsv);
@@ -176,14 +209,13 @@ void avcodec_register_all(void)
     REGISTER_DECODER(HQX,               hqx);
     REGISTER_ENCDEC (HUFFYUV,           huffyuv);
     REGISTER_DECODER(IDCIN,             idcin);
-    REGISTER_DECODER(IFF_BYTERUN1,      iff_byterun1);
     REGISTER_DECODER(IFF_ILBM,          iff_ilbm);
     REGISTER_DECODER(INDEO2,            indeo2);
     REGISTER_DECODER(INDEO3,            indeo3);
     REGISTER_DECODER(INDEO4,            indeo4);
     REGISTER_DECODER(INDEO5,            indeo5);
     REGISTER_DECODER(INTERPLAY_VIDEO,   interplay_video);
-    REGISTER_DECODER(JPEG2000,          jpeg2000);
+    REGISTER_ENCDEC (JPEG2000,          jpeg2000);
     REGISTER_ENCDEC (JPEGLS,            jpegls);
     REGISTER_DECODER(JV,                jv);
     REGISTER_DECODER(KGV1,              kgv1);
@@ -191,6 +223,8 @@ void avcodec_register_all(void)
     REGISTER_DECODER(LAGARITH,          lagarith);
     REGISTER_ENCODER(LJPEG,             ljpeg);
     REGISTER_DECODER(LOCO,              loco);
+    REGISTER_DECODER(M101,              m101);
+    REGISTER_DECODER(MAGICYUV,          magicyuv);
     REGISTER_DECODER(MDEC,              mdec);
     REGISTER_DECODER(MIMIC,             mimic);
     REGISTER_ENCDEC (MJPEG,             mjpeg);
@@ -202,17 +236,29 @@ void avcodec_register_all(void)
 #endif /* FF_API_XVMC */
     REGISTER_ENCDEC (MPEG1VIDEO,        mpeg1video);
     REGISTER_ENCDEC (MPEG2VIDEO,        mpeg2video);
+    REGISTER_ENCDEC (MPEG4,             mpeg4);
+    REGISTER_DECODER(MPEG4_CRYSTALHD,   mpeg4_crystalhd);
+    REGISTER_DECODER(MPEG4_MMAL,        mpeg4_mmal);
+#if FF_API_VDPAU
+    REGISTER_DECODER(MPEG4_VDPAU,       mpeg4_vdpau);
+#endif
+    REGISTER_DECODER(MPEGVIDEO,         mpegvideo);
+#if FF_API_VDPAU
+    REGISTER_DECODER(MPEG_VDPAU,        mpeg_vdpau);
+    REGISTER_DECODER(MPEG1_VDPAU,       mpeg1_vdpau);
+#endif
     REGISTER_DECODER(MPEG2_MMAL,        mpeg2_mmal);
+    REGISTER_DECODER(MPEG2_CRYSTALHD,   mpeg2_crystalhd);
     REGISTER_DECODER(MPEG2_QSV,         mpeg2_qsv);
-    REGISTER_ENCDEC (MPEG4,             mpeg4);
     REGISTER_DECODER(MSA1,              msa1);
+    REGISTER_DECODER(MSMPEG4_CRYSTALHD, msmpeg4_crystalhd);
     REGISTER_DECODER(MSMPEG4V1,         msmpeg4v1);
     REGISTER_ENCDEC (MSMPEG4V2,         msmpeg4v2);
     REGISTER_ENCDEC (MSMPEG4V3,         msmpeg4v3);
     REGISTER_DECODER(MSRLE,             msrle);
     REGISTER_DECODER(MSS1,              mss1);
     REGISTER_DECODER(MSS2,              mss2);
-    REGISTER_DECODER(MSVIDEO1,          msvideo1);
+    REGISTER_ENCDEC (MSVIDEO1,          msvideo1);
     REGISTER_DECODER(MSZH,              mszh);
     REGISTER_DECODER(MTS2,              mts2);
     REGISTER_DECODER(MVC1,              mvc1);
@@ -229,12 +275,15 @@ void avcodec_register_all(void)
     REGISTER_ENCDEC (PNG,               png);
     REGISTER_ENCDEC (PPM,               ppm);
     REGISTER_ENCDEC (PRORES,            prores);
+    REGISTER_ENCODER(PRORES_AW,         prores_aw);
+    REGISTER_ENCODER(PRORES_KS,         prores_ks);
+    REGISTER_DECODER(PRORES_LGPL,       prores_lgpl);
     REGISTER_DECODER(PTX,               ptx);
     REGISTER_DECODER(QDRAW,             qdraw);
     REGISTER_DECODER(QPEG,              qpeg);
     REGISTER_ENCDEC (QTRLE,             qtrle);
-    REGISTER_DECODER(R10K,              r10k);
-    REGISTER_DECODER(R210,              r210);
+    REGISTER_ENCDEC (R10K,              r10k);
+    REGISTER_ENCDEC (R210,              r210);
     REGISTER_ENCDEC (RAWVIDEO,          rawvideo);
     REGISTER_DECODER(RL2,               rl2);
     REGISTER_ENCDEC (ROQ,               roq);
@@ -244,18 +293,23 @@ void avcodec_register_all(void)
     REGISTER_ENCDEC (RV20,              rv20);
     REGISTER_DECODER(RV30,              rv30);
     REGISTER_DECODER(RV40,              rv40);
-    REGISTER_DECODER(S302M,             s302m);
+    REGISTER_ENCDEC (S302M,             s302m);
     REGISTER_DECODER(SANM,              sanm);
     REGISTER_DECODER(SCREENPRESSO,      screenpresso);
+    REGISTER_DECODER(SDX2_DPCM,         sdx2_dpcm);
     REGISTER_ENCDEC (SGI,               sgi);
     REGISTER_DECODER(SGIRLE,            sgirle);
+    REGISTER_DECODER(SHEERVIDEO,        sheervideo);
     REGISTER_DECODER(SMACKER,           smacker);
     REGISTER_DECODER(SMC,               smc);
+    REGISTER_DECODER(SMVJPEG,           smvjpeg);
+    REGISTER_ENCDEC (SNOW,              snow);
     REGISTER_DECODER(SP5X,              sp5x);
     REGISTER_ENCDEC (SUNRAST,           sunrast);
     REGISTER_ENCDEC (SVQ1,              svq1);
     REGISTER_DECODER(SVQ3,              svq3);
     REGISTER_ENCDEC (TARGA,             targa);
+    REGISTER_DECODER(TARGA_Y216,        targa_y216);
     REGISTER_DECODER(TDSC,              tdsc);
     REGISTER_DECODER(THEORA,            theora);
     REGISTER_DECODER(THP,               thp);
@@ -264,6 +318,7 @@ void avcodec_register_all(void)
     REGISTER_DECODER(TMV,               tmv);
     REGISTER_DECODER(TRUEMOTION1,       truemotion1);
     REGISTER_DECODER(TRUEMOTION2,       truemotion2);
+    REGISTER_DECODER(TRUEMOTION2RT,     truemotion2rt);
     REGISTER_DECODER(TSCC,              tscc);
     REGISTER_DECODER(TSCC2,             tscc2);
     REGISTER_DECODER(TXD,               txd);
@@ -271,12 +326,20 @@ void avcodec_register_all(void)
     REGISTER_ENCDEC (UTVIDEO,           utvideo);
     REGISTER_ENCDEC (V210,              v210);
     REGISTER_DECODER(V210X,             v210x);
+    REGISTER_ENCDEC (V308,              v308);
+    REGISTER_ENCDEC (V408,              v408);
     REGISTER_ENCDEC (V410,              v410);
     REGISTER_DECODER(VB,                vb);
     REGISTER_DECODER(VBLE,              vble);
     REGISTER_DECODER(VC1,               vc1);
+    REGISTER_DECODER(VC1_CRYSTALHD,     vc1_crystalhd);
+#if FF_API_VDPAU
+    REGISTER_DECODER(VC1_VDPAU,         vc1_vdpau);
+#endif
     REGISTER_DECODER(VC1IMAGE,          vc1image);
     REGISTER_DECODER(VC1_MMAL,          vc1_mmal);
+    REGISTER_DECODER(VC1_QSV,           vc1_qsv);
+    REGISTER_ENCODER(VC2,               vc2);
     REGISTER_DECODER(VCR1,              vcr1);
     REGISTER_DECODER(VMDVIDEO,          vmdvideo);
     REGISTER_DECODER(VMNC,              vmnc);
@@ -294,23 +357,32 @@ void avcodec_register_all(void)
     REGISTER_ENCDEC (WMV1,              wmv1);
     REGISTER_ENCDEC (WMV2,              wmv2);
     REGISTER_DECODER(WMV3,              wmv3);
+    REGISTER_DECODER(WMV3_CRYSTALHD,    wmv3_crystalhd);
+#if FF_API_VDPAU
+    REGISTER_DECODER(WMV3_VDPAU,        wmv3_vdpau);
+#endif
     REGISTER_DECODER(WMV3IMAGE,         wmv3image);
     REGISTER_DECODER(WNV1,              wnv1);
     REGISTER_DECODER(XAN_WC3,           xan_wc3);
     REGISTER_DECODER(XAN_WC4,           xan_wc4);
     REGISTER_ENCDEC (XBM,               xbm);
+    REGISTER_ENCDEC (XFACE,             xface);
     REGISTER_DECODER(XL,                xl);
     REGISTER_ENCDEC (XWD,               xwd);
+    REGISTER_ENCDEC (Y41P,              y41p);
     REGISTER_DECODER(YOP,               yop);
+    REGISTER_ENCDEC (YUV4,              yuv4);
+    REGISTER_DECODER(ZERO12V,           zero12v);
     REGISTER_DECODER(ZEROCODEC,         zerocodec);
     REGISTER_ENCDEC (ZLIB,              zlib);
     REGISTER_ENCDEC (ZMBV,              zmbv);
 
     /* audio codecs */
     REGISTER_ENCDEC (AAC,               aac);
+    REGISTER_DECODER(AAC_FIXED,         aac_fixed);
     REGISTER_DECODER(AAC_LATM,          aac_latm);
     REGISTER_ENCDEC (AC3,               ac3);
-    REGISTER_ENCODER(AC3_FIXED,         ac3_fixed);
+    REGISTER_ENCDEC (AC3_FIXED,         ac3_fixed);
     REGISTER_ENCDEC (ALAC,              alac);
     REGISTER_DECODER(ALS,               als);
     REGISTER_DECODER(AMRNB,             amrnb);
@@ -323,16 +395,25 @@ void avcodec_register_all(void)
     REGISTER_DECODER(BINKAUDIO_RDFT,    binkaudio_rdft);
     REGISTER_DECODER(BMV_AUDIO,         bmv_audio);
     REGISTER_DECODER(COOK,              cook);
-    REGISTER_DECODER(DCA,               dca);
+    REGISTER_ENCDEC (DCA,               dca);
+    REGISTER_DECODER(DSD_LSBF,          dsd_lsbf);
+    REGISTER_DECODER(DSD_MSBF,          dsd_msbf);
+    REGISTER_DECODER(DSD_LSBF_PLANAR,   dsd_lsbf_planar);
+    REGISTER_DECODER(DSD_MSBF_PLANAR,   dsd_msbf_planar);
     REGISTER_DECODER(DSICINAUDIO,       dsicinaudio);
     REGISTER_DECODER(DSS_SP,            dss_sp);
+    REGISTER_DECODER(DST,               dst);
     REGISTER_ENCDEC (EAC3,              eac3);
+    REGISTER_DECODER(EVRC,              evrc);
+    REGISTER_DECODER(FFWAVESYNTH,       ffwavesynth);
     REGISTER_ENCDEC (FLAC,              flac);
     REGISTER_ENCDEC (G723_1,            g723_1);
+    REGISTER_DECODER(G729,              g729);
     REGISTER_DECODER(GSM,               gsm);
     REGISTER_DECODER(GSM_MS,            gsm_ms);
     REGISTER_DECODER(IAC,               iac);
     REGISTER_DECODER(IMC,               imc);
+    REGISTER_DECODER(INTERPLAY_ACM,     interplay_acm);
     REGISTER_DECODER(MACE3,             mace3);
     REGISTER_DECODER(MACE6,             mace6);
     REGISTER_DECODER(METASOUND,         metasound);
@@ -341,6 +422,7 @@ void avcodec_register_all(void)
     REGISTER_DECODER(MP1FLOAT,          mp1float);
     REGISTER_ENCDEC (MP2,               mp2);
     REGISTER_DECODER(MP2FLOAT,          mp2float);
+    REGISTER_ENCODER(MP2FIXED,          mp2fixed);
     REGISTER_DECODER(MP3,               mp3);
     REGISTER_DECODER(MP3FLOAT,          mp3float);
     REGISTER_DECODER(MP3ADU,            mp3adu);
@@ -361,20 +443,24 @@ void avcodec_register_all(void)
     REGISTER_DECODER(SHORTEN,           shorten);
     REGISTER_DECODER(SIPR,              sipr);
     REGISTER_DECODER(SMACKAUD,          smackaud);
+    REGISTER_ENCDEC (SONIC,             sonic);
+    REGISTER_ENCODER(SONIC_LS,          sonic_ls);
     REGISTER_DECODER(TAK,               tak);
     REGISTER_DECODER(TRUEHD,            truehd);
     REGISTER_DECODER(TRUESPEECH,        truespeech);
-    REGISTER_DECODER(TTA,               tta);
+    REGISTER_ENCDEC (TTA,               tta);
     REGISTER_DECODER(TWINVQ,            twinvq);
     REGISTER_DECODER(VMDAUDIO,          vmdaudio);
     REGISTER_ENCDEC (VORBIS,            vorbis);
-    REGISTER_DECODER(WAVPACK,           wavpack);
+    REGISTER_ENCDEC (WAVPACK,           wavpack);
     REGISTER_DECODER(WMALOSSLESS,       wmalossless);
     REGISTER_DECODER(WMAPRO,            wmapro);
     REGISTER_ENCDEC (WMAV1,             wmav1);
     REGISTER_ENCDEC (WMAV2,             wmav2);
     REGISTER_DECODER(WMAVOICE,          wmavoice);
     REGISTER_DECODER(WS_SND1,           ws_snd1);
+    REGISTER_DECODER(XMA1,              xma1);
+    REGISTER_DECODER(XMA2,              xma2);
 
     /* PCM codecs */
     REGISTER_ENCDEC (PCM_ALAW,          pcm_alaw);
@@ -387,18 +473,18 @@ void avcodec_register_all(void)
     REGISTER_DECODER(PCM_LXF,           pcm_lxf);
     REGISTER_ENCDEC (PCM_MULAW,         pcm_mulaw);
     REGISTER_ENCDEC (PCM_S8,            pcm_s8);
-    REGISTER_DECODER(PCM_S8_PLANAR,     pcm_s8_planar);
+    REGISTER_ENCDEC (PCM_S8_PLANAR,     pcm_s8_planar);
     REGISTER_ENCDEC (PCM_S16BE,         pcm_s16be);
-    REGISTER_DECODER(PCM_S16BE_PLANAR,  pcm_s16be_planar);
+    REGISTER_ENCDEC (PCM_S16BE_PLANAR,  pcm_s16be_planar);
     REGISTER_ENCDEC (PCM_S16LE,         pcm_s16le);
-    REGISTER_DECODER(PCM_S16LE_PLANAR,  pcm_s16le_planar);
+    REGISTER_ENCDEC (PCM_S16LE_PLANAR,  pcm_s16le_planar);
     REGISTER_ENCDEC (PCM_S24BE,         pcm_s24be);
     REGISTER_ENCDEC (PCM_S24DAUD,       pcm_s24daud);
     REGISTER_ENCDEC (PCM_S24LE,         pcm_s24le);
-    REGISTER_DECODER(PCM_S24LE_PLANAR,  pcm_s24le_planar);
+    REGISTER_ENCDEC (PCM_S24LE_PLANAR,  pcm_s24le_planar);
     REGISTER_ENCDEC (PCM_S32BE,         pcm_s32be);
     REGISTER_ENCDEC (PCM_S32LE,         pcm_s32le);
-    REGISTER_DECODER(PCM_S32LE_PLANAR,  pcm_s32le_planar);
+    REGISTER_ENCDEC (PCM_S32LE_PLANAR,  pcm_s32le_planar);
     REGISTER_ENCDEC (PCM_U8,            pcm_u8);
     REGISTER_ENCDEC (PCM_U16BE,         pcm_u16be);
     REGISTER_ENCDEC (PCM_U16LE,         pcm_u16le);
@@ -406,7 +492,7 @@ void avcodec_register_all(void)
     REGISTER_ENCDEC (PCM_U24LE,         pcm_u24le);
     REGISTER_ENCDEC (PCM_U32BE,         pcm_u32be);
     REGISTER_ENCDEC (PCM_U32LE,         pcm_u32le);
-    REGISTER_DECODER(PCM_ZORK ,         pcm_zork);
+    REGISTER_DECODER(PCM_ZORK,          pcm_zork);
 
     /* DPCM codecs */
     REGISTER_DECODER(INTERPLAY_DPCM,    interplay_dpcm);
@@ -417,7 +503,10 @@ void avcodec_register_all(void)
     /* ADPCM codecs */
     REGISTER_DECODER(ADPCM_4XM,         adpcm_4xm);
     REGISTER_ENCDEC (ADPCM_ADX,         adpcm_adx);
+    REGISTER_DECODER(ADPCM_AFC,         adpcm_afc);
+    REGISTER_DECODER(ADPCM_AICA,        adpcm_aica);
     REGISTER_DECODER(ADPCM_CT,          adpcm_ct);
+    REGISTER_DECODER(ADPCM_DTK,         adpcm_dtk);
     REGISTER_DECODER(ADPCM_EA,          adpcm_ea);
     REGISTER_DECODER(ADPCM_EA_MAXIS_XA, adpcm_ea_maxis_xa);
     REGISTER_DECODER(ADPCM_EA_R1,       adpcm_ea_r1);
@@ -426,37 +515,75 @@ void avcodec_register_all(void)
     REGISTER_DECODER(ADPCM_EA_XAS,      adpcm_ea_xas);
     REGISTER_ENCDEC (ADPCM_G722,        adpcm_g722);
     REGISTER_ENCDEC (ADPCM_G726,        adpcm_g726);
+    REGISTER_DECODER(ADPCM_G726LE,      adpcm_g726le);
     REGISTER_DECODER(ADPCM_IMA_AMV,     adpcm_ima_amv);
     REGISTER_DECODER(ADPCM_IMA_APC,     adpcm_ima_apc);
+    REGISTER_DECODER(ADPCM_IMA_DAT4,    adpcm_ima_dat4);
     REGISTER_DECODER(ADPCM_IMA_DK3,     adpcm_ima_dk3);
     REGISTER_DECODER(ADPCM_IMA_DK4,     adpcm_ima_dk4);
     REGISTER_DECODER(ADPCM_IMA_EA_EACS, adpcm_ima_ea_eacs);
     REGISTER_DECODER(ADPCM_IMA_EA_SEAD, adpcm_ima_ea_sead);
     REGISTER_DECODER(ADPCM_IMA_ISS,     adpcm_ima_iss);
+    REGISTER_DECODER(ADPCM_IMA_OKI,     adpcm_ima_oki);
     REGISTER_ENCDEC (ADPCM_IMA_QT,      adpcm_ima_qt);
+    REGISTER_DECODER(ADPCM_IMA_RAD,     adpcm_ima_rad);
     REGISTER_DECODER(ADPCM_IMA_SMJPEG,  adpcm_ima_smjpeg);
     REGISTER_ENCDEC (ADPCM_IMA_WAV,     adpcm_ima_wav);
     REGISTER_DECODER(ADPCM_IMA_WS,      adpcm_ima_ws);
     REGISTER_ENCDEC (ADPCM_MS,          adpcm_ms);
+    REGISTER_DECODER(ADPCM_MTAF,        adpcm_mtaf);
+    REGISTER_DECODER(ADPCM_PSX,         adpcm_psx);
     REGISTER_DECODER(ADPCM_SBPRO_2,     adpcm_sbpro_2);
     REGISTER_DECODER(ADPCM_SBPRO_3,     adpcm_sbpro_3);
     REGISTER_DECODER(ADPCM_SBPRO_4,     adpcm_sbpro_4);
     REGISTER_ENCDEC (ADPCM_SWF,         adpcm_swf);
     REGISTER_DECODER(ADPCM_THP,         adpcm_thp);
+    REGISTER_DECODER(ADPCM_THP_LE,      adpcm_thp_le);
     REGISTER_DECODER(ADPCM_VIMA,        adpcm_vima);
     REGISTER_DECODER(ADPCM_XA,          adpcm_xa);
     REGISTER_ENCDEC (ADPCM_YAMAHA,      adpcm_yamaha);
 
     /* subtitles */
+    REGISTER_ENCDEC (SSA,               ssa);
     REGISTER_ENCDEC (ASS,               ass);
+    REGISTER_DECODER(CCAPTION,          ccaption);
     REGISTER_ENCDEC (DVBSUB,            dvbsub);
     REGISTER_ENCDEC (DVDSUB,            dvdsub);
+    REGISTER_DECODER(JACOSUB,           jacosub);
+    REGISTER_DECODER(MICRODVD,          microdvd);
+    REGISTER_ENCDEC (MOVTEXT,           movtext);
+    REGISTER_DECODER(MPL2,              mpl2);
     REGISTER_DECODER(PGSSUB,            pgssub);
-    REGISTER_DECODER(SRT,               srt);
+    REGISTER_DECODER(PJS,               pjs);
+    REGISTER_DECODER(REALTEXT,          realtext);
+    REGISTER_DECODER(SAMI,              sami);
+    REGISTER_ENCDEC (SRT,               srt);
+    REGISTER_DECODER(STL,               stl);
+    REGISTER_ENCDEC (SUBRIP,            subrip);
+    REGISTER_DECODER(SUBVIEWER,         subviewer);
+    REGISTER_DECODER(SUBVIEWER1,        subviewer1);
+    REGISTER_ENCDEC (TEXT,              text);
+    REGISTER_DECODER(VPLAYER,           vplayer);
+    REGISTER_ENCDEC (WEBVTT,            webvtt);
     REGISTER_ENCDEC (XSUB,              xsub);
 
     /* external libraries */
-    REGISTER_DECODER(LIBDCADEC,         libdcadec)
+    REGISTER_ENCDEC (AAC_AT,            aac_at);
+    REGISTER_DECODER(AC3_AT,            ac3_at);
+    REGISTER_DECODER(ADPCM_IMA_QT_AT,   adpcm_ima_qt_at);
+    REGISTER_ENCDEC (ALAC_AT,           alac_at);
+    REGISTER_DECODER(AMR_NB_AT,         amr_nb_at);
+    REGISTER_DECODER(EAC3_AT,           eac3_at);
+    REGISTER_DECODER(GSM_MS_AT,         gsm_ms_at);
+    REGISTER_ENCDEC (ILBC_AT,           ilbc_at);
+    REGISTER_DECODER(MP1_AT,            mp1_at);
+    REGISTER_DECODER(MP2_AT,            mp2_at);
+    REGISTER_DECODER(MP3_AT,            mp3_at);
+    REGISTER_ENCDEC (PCM_ALAW_AT,       pcm_alaw_at);
+    REGISTER_ENCDEC (PCM_MULAW_AT,      pcm_mulaw_at);
+    REGISTER_DECODER(QDMC_AT,           qdmc_at);
+    REGISTER_DECODER(QDM2_AT,           qdm2_at);
+    REGISTER_DECODER(LIBCELT,           libcelt);
     REGISTER_ENCODER(LIBFAAC,           libfaac);
     REGISTER_ENCDEC (LIBFDK_AAC,        libfdk_aac);
     REGISTER_ENCDEC (LIBGSM,            libgsm);
@@ -468,40 +595,51 @@ void avcodec_register_all(void)
     REGISTER_ENCDEC (LIBOPENJPEG,       libopenjpeg);
     REGISTER_ENCDEC (LIBOPUS,           libopus);
     REGISTER_ENCDEC (LIBSCHROEDINGER,   libschroedinger);
+    REGISTER_ENCODER(LIBSHINE,          libshine);
     REGISTER_ENCDEC (LIBSPEEX,          libspeex);
     REGISTER_ENCODER(LIBTHEORA,         libtheora);
     REGISTER_ENCODER(LIBTWOLAME,        libtwolame);
-    REGISTER_ENCODER(LIBVO_AACENC,      libvo_aacenc);
+    REGISTER_ENCDEC (LIBUTVIDEO,        libutvideo);
     REGISTER_ENCODER(LIBVO_AMRWBENC,    libvo_amrwbenc);
-    REGISTER_ENCODER(LIBVORBIS,         libvorbis);
+    REGISTER_ENCDEC (LIBVORBIS,         libvorbis);
     REGISTER_ENCDEC (LIBVPX_VP8,        libvpx_vp8);
     REGISTER_ENCDEC (LIBVPX_VP9,        libvpx_vp9);
     REGISTER_ENCODER(LIBWAVPACK,        libwavpack);
+    REGISTER_ENCODER(LIBWEBP_ANIM,      libwebp_anim);  /* preferred over libwebp */
     REGISTER_ENCODER(LIBWEBP,           libwebp);
     REGISTER_ENCODER(LIBX262,           libx262);
     REGISTER_ENCODER(LIBX264,           libx264);
+    REGISTER_ENCODER(LIBX264RGB,        libx264rgb);
     REGISTER_ENCODER(LIBX265,           libx265);
     REGISTER_ENCODER(LIBXAVS,           libxavs);
     REGISTER_ENCODER(LIBXVID,           libxvid);
+    REGISTER_DECODER(LIBZVBI_TELETEXT,  libzvbi_teletext);
+
+    /* text */
+    REGISTER_DECODER(BINTEXT,           bintext);
+    REGISTER_DECODER(XBIN,              xbin);
+    REGISTER_DECODER(IDF,               idf);
 
     /* external libraries, that shouldn't be used by default if one of the
      * above is available */
     REGISTER_ENCODER(LIBOPENH264,       libopenh264);
-    REGISTER_ENCODER(H264_NVENC,        h264_nvenc);
-    REGISTER_ENCODER(H264_OMX,          h264_omx);
+    REGISTER_DECODER(H264_CUVID,        h264_cuvid);
     REGISTER_ENCODER(H264_QSV,          h264_qsv);
     REGISTER_ENCODER(H264_VAAPI,        h264_vaapi);
-    REGISTER_ENCODER(LIBKVAZAAR,        libkvazaar);
-    REGISTER_ENCODER(HEVC_NVENC,        hevc_nvenc);
+    REGISTER_ENCODER(H264_VIDEOTOOLBOX, h264_videotoolbox);
+    REGISTER_ENCODER(NVENC,             nvenc);
+    REGISTER_ENCODER(H264_OMX,          h264_omx);
+    REGISTER_ENCODER(NVENC_H264,        nvenc_h264);
+    REGISTER_ENCODER(NVENC_HEVC,        nvenc_hevc);
+    REGISTER_DECODER(HEVC_CUVID,        hevc_cuvid);
     REGISTER_ENCODER(HEVC_QSV,          hevc_qsv);
     REGISTER_ENCODER(HEVC_VAAPI,        hevc_vaapi);
+    REGISTER_ENCODER(LIBKVAZAAR,        libkvazaar);
     REGISTER_ENCODER(MJPEG_VAAPI,       mjpeg_vaapi);
     REGISTER_ENCODER(MPEG2_QSV,         mpeg2_qsv);
-    REGISTER_ENCODER(MPEG4_OMX,         mpeg4_omx);
-#if FF_API_NVENC_OLD_NAME
-    REGISTER_ENCODER(NVENC_H264,        nvenc_h264);
-    REGISTER_ENCODER(NVENC_HEVC,        nvenc_hevc);
-#endif
+    REGISTER_DECODER(VC1_CUVID,         vc1_cuvid);
+    REGISTER_DECODER(VP8_CUVID,         vp8_cuvid);
+    REGISTER_DECODER(VP9_CUVID,         vp9_cuvid);
 
     /* parsers */
     REGISTER_PARSER(AAC,                aac);
@@ -515,9 +653,12 @@ void avcodec_register_all(void)
     REGISTER_PARSER(DIRAC,              dirac);
     REGISTER_PARSER(DNXHD,              dnxhd);
     REGISTER_PARSER(DPX,                dpx);
+    REGISTER_PARSER(DVAUDIO,            dvaudio);
     REGISTER_PARSER(DVBSUB,             dvbsub);
     REGISTER_PARSER(DVDSUB,             dvdsub);
+    REGISTER_PARSER(DVD_NAV,            dvd_nav);
     REGISTER_PARSER(FLAC,               flac);
+    REGISTER_PARSER(G729,               g729);
     REGISTER_PARSER(GSM,                gsm);
     REGISTER_PARSER(H261,               h261);
     REGISTER_PARSER(H263,               h263);
@@ -538,4 +679,5 @@ void avcodec_register_all(void)
     REGISTER_PARSER(VORBIS,             vorbis);
     REGISTER_PARSER(VP3,                vp3);
     REGISTER_PARSER(VP8,                vp8);
+    REGISTER_PARSER(VP9,                vp9);
 }
diff --git a/libavcodec/alpha/Makefile b/libavcodec/alpha/Makefile
new file mode 100644
index 0000000..796d976
--- /dev/null
+++ b/libavcodec/alpha/Makefile
@@ -0,0 +1,10 @@
+OBJS-$(CONFIG_BLOCKDSP)                 += alpha/blockdsp_alpha.o
+OBJS-$(CONFIG_ME_CMP)                   += alpha/me_cmp_alpha.o         \
+                                           alpha/me_cmp_mvi_asm.o
+OBJS-$(CONFIG_HPELDSP)                  += alpha/hpeldsp_alpha.o        \
+                                           alpha/hpeldsp_alpha_asm.o
+OBJS-$(CONFIG_IDCTDSP)                  += alpha/idctdsp_alpha.o        \
+                                           alpha/idctdsp_alpha_asm.o    \
+                                           alpha/simple_idct_alpha.o
+OBJS-$(CONFIG_MPEGVIDEO)                += alpha/mpegvideo_alpha.o
+OBJS-$(CONFIG_PIXBLOCKDSP)              += alpha/pixblockdsp_alpha.o
diff --git a/libavcodec/alpha/asm.h b/libavcodec/alpha/asm.h
new file mode 100644
index 0000000..827721e
--- /dev/null
+++ b/libavcodec/alpha/asm.h
@@ -0,0 +1,186 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ALPHA_ASM_H
+#define AVCODEC_ALPHA_ASM_H
+
+#include <inttypes.h>
+
+#include "libavutil/common.h"
+
+#if AV_GCC_VERSION_AT_LEAST(2,96)
+# define likely(x)      __builtin_expect((x) != 0, 1)   /* branch hint: (x) != 0 is expected to be true */
+# define unlikely(x)    __builtin_expect((x) != 0, 0)   /* branch hint: (x) != 0 is expected to be false */
+#else
+# define likely(x)      (x)     /* no hint: passes x through unchanged (not normalised to 0/1 as above) */
+# define unlikely(x)    (x)     /* no hint: passes x through unchanged */
+#endif
+
+#define AMASK_BWX (1 << 0)     /* single-bit flag; presumably a feature bit for amask() -- confirm against the Alpha manual */
+#define AMASK_FIX (1 << 1)     /* bit 1 */
+#define AMASK_CIX (1 << 2)     /* bit 2 */
+#define AMASK_MVI (1 << 8)     /* bit 8 (note the gap after bit 2) */
+
+static inline uint64_t BYTE_VEC(uint64_t x)    /* replicate an 8-bit value into all eight bytes of a 64-bit word */
+{  /* x is expected to fit in 8 bits; wider inputs get their overlapping bits OR-ed together */
+    x |= x <<  8;   /* value now in bytes 0-1 */
+    x |= x << 16;   /* value now in bytes 0-3 */
+    x |= x << 32;   /* value now in bytes 0-7 */
+    return x;
+}
+static inline uint64_t WORD_VEC(uint64_t x)    /* replicate a 16-bit value into all four 16-bit lanes of a 64-bit word */
+{  /* x is expected to fit in 16 bits; wider inputs get their overlapping bits OR-ed together */
+    x |= x << 16;   /* value now in lanes 0-1 */
+    x |= x << 32;   /* value now in lanes 0-3 */
+    return x;
+}
+
+#define sextw(x) ((int16_t) (x))   /* narrow x to int16_t; presumably used to sign-extend the low 16 bits -- confirm */
+
+#ifdef __GNUC__
+#define ldq(p) /* read the object at p as one uint64_t via a union cast */ \
+    (((const union {                                            \
+        uint64_t __l;                                           \
+        __typeof__(*(p)) __s[sizeof (uint64_t) / sizeof *(p)];  \
+    } *) (p))->__l)
+#define ldl(p) /* read the object at p as one int32_t via a union cast */  \
+    (((const union {                                            \
+        int32_t __l;                                            \
+        __typeof__(*(p)) __s[sizeof (int32_t) / sizeof *(p)];   \
+    } *) (p))->__l)
+#define stq(l, p) /* store l as one uint64_t at p via a union cast */    \
+    do {                                                                \
+        (((union {                                                      \
+            uint64_t __l;                                               \
+            __typeof__(*(p)) __s[sizeof (uint64_t) / sizeof *(p)];      \
+        } *) (p))->__l) = l;                                            \
+    } while (0)
+#define stl(l, p) /* store l as one int32_t at p via a union cast */     \
+    do {                                                                \
+        (((union {                                                      \
+            int32_t __l;                                                \
+            __typeof__(*(p)) __s[sizeof (int32_t) / sizeof *(p)];       \
+        } *) (p))->__l) = l;                                            \
+    } while (0)
+struct unaligned_long { uint64_t l; } __attribute__((packed));   /* packed wrapper around one uint64_t, used by uldq() */
+#define ldq_u(p)        (*(const uint64_t *) (((uint64_t) (p)) & ~7ul))   /* load the 8-byte-aligned quadword that contains address p */
+#define uldq(a)         (((const struct unaligned_long *) (a))->l)   /* 64-bit load from a through the packed struct */
+
+#if AV_GCC_VERSION_AT_LEAST(3,3)
+#define prefetch(p)     __builtin_prefetch((p), 0, 1)
+#define prefetch_en(p)  __builtin_prefetch((p), 0, 0)
+#define prefetch_m(p)   __builtin_prefetch((p), 1, 1)
+#define prefetch_men(p) __builtin_prefetch((p), 1, 0)
+#define cmpbge          __builtin_alpha_cmpbge
+/* Cast the second operand to uint64_t to avoid warnings.  */
+#define extql(a, b)     __builtin_alpha_extql(a, (uint64_t) (b))
+#define extwl(a, b)     __builtin_alpha_extwl(a, (uint64_t) (b))
+#define extqh(a, b)     __builtin_alpha_extqh(a, (uint64_t) (b))
+#define zap             __builtin_alpha_zap
+#define zapnot          __builtin_alpha_zapnot
+#define amask           __builtin_alpha_amask
+#define implver         __builtin_alpha_implver
+#define rpcc            __builtin_alpha_rpcc
+#else
+#define prefetch(p)     __asm__ volatile("ldl $31,%0"  : : "m"(*(const char *) (p)) : "memory")
+#define prefetch_en(p)  __asm__ volatile("ldq $31,%0"  : : "m"(*(const char *) (p)) : "memory")
+#define prefetch_m(p)   __asm__ volatile("lds $f31,%0" : : "m"(*(const char *) (p)) : "memory")
+#define prefetch_men(p) __asm__ volatile("ldt $f31,%0" : : "m"(*(const char *) (p)) : "memory")
+#define cmpbge(a, b) ({ uint64_t __r; __asm__ ("cmpbge  %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
+#define extql(a, b)  ({ uint64_t __r; __asm__ ("extql   %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
+#define extwl(a, b)  ({ uint64_t __r; __asm__ ("extwl   %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
+#define extqh(a, b)  ({ uint64_t __r; __asm__ ("extqh   %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
+#define zap(a, b)    ({ uint64_t __r; __asm__ ("zap     %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
+#define zapnot(a, b) ({ uint64_t __r; __asm__ ("zapnot  %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
+#define amask(a)     ({ uint64_t __r; __asm__ ("amask   %1,%0"      : "=r" (__r) : "rI"  (a));           __r; })
+#define implver()    ({ uint64_t __r; __asm__ ("implver %0"         : "=r" (__r));                       __r; })
+#define rpcc()       ({ uint64_t __r; __asm__ volatile ("rpcc %0"   : "=r" (__r));                       __r; })
+#endif
+#define wh64(p) __asm__ volatile("wh64 (%0)" : : "r"(p) : "memory")
+
+#if AV_GCC_VERSION_AT_LEAST(3,3) && defined(__alpha_max__)
+#define minub8  __builtin_alpha_minub8
+#define minsb8  __builtin_alpha_minsb8
+#define minuw4  __builtin_alpha_minuw4
+#define minsw4  __builtin_alpha_minsw4
+#define maxub8  __builtin_alpha_maxub8
+#define maxsb8  __builtin_alpha_maxsb8
+#define maxuw4  __builtin_alpha_maxuw4
+#define maxsw4  __builtin_alpha_maxsw4
+#define perr    __builtin_alpha_perr
+#define pklb    __builtin_alpha_pklb
+#define pkwb    __builtin_alpha_pkwb
+#define unpkbl  __builtin_alpha_unpkbl
+#define unpkbw  __builtin_alpha_unpkbw
+#else
+#define minub8(a, b) ({ uint64_t __r; __asm__ (".arch ev6; minub8  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define minsb8(a, b) ({ uint64_t __r; __asm__ (".arch ev6; minsb8  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define minuw4(a, b) ({ uint64_t __r; __asm__ (".arch ev6; minuw4  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define minsw4(a, b) ({ uint64_t __r; __asm__ (".arch ev6; minsw4  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define maxub8(a, b) ({ uint64_t __r; __asm__ (".arch ev6; maxub8  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define maxsb8(a, b) ({ uint64_t __r; __asm__ (".arch ev6; maxsb8  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define maxuw4(a, b) ({ uint64_t __r; __asm__ (".arch ev6; maxuw4  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define maxsw4(a, b) ({ uint64_t __r; __asm__ (".arch ev6; maxsw4  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define perr(a, b)   ({ uint64_t __r; __asm__ (".arch ev6; perr    %r1,%r2,%0" : "=r" (__r) : "%rJ" (a), "rJ" (b)); __r; })
+#define pklb(a)      ({ uint64_t __r; __asm__ (".arch ev6; pklb    %r1,%0"     : "=r" (__r) : "rJ"  (a));           __r; })
+#define pkwb(a)      ({ uint64_t __r; __asm__ (".arch ev6; pkwb    %r1,%0"     : "=r" (__r) : "rJ"  (a));           __r; })
+#define unpkbl(a)    ({ uint64_t __r; __asm__ (".arch ev6; unpkbl  %r1,%0"     : "=r" (__r) : "rJ"  (a));           __r; })
+#define unpkbw(a)    ({ uint64_t __r; __asm__ (".arch ev6; unpkbw  %r1,%0"     : "=r" (__r) : "rJ"  (a));           __r; })
+#endif
+
+#elif defined(__DECC)           /* Digital/Compaq/hp "ccc" compiler */
+
+#include <c_asm.h>
+#define ldq(p) (*(const uint64_t *) (p))
+#define ldl(p) (*(const int32_t *)  (p))
+#define stq(l, p) do { *(uint64_t *) (p) = (l); } while (0)
+#define stl(l, p) do { *(int32_t *)  (p) = (l); } while (0)
+#define ldq_u(a)     asm ("ldq_u   %v0,0(%a0)", a)
+#define uldq(a)      (*(const __unaligned uint64_t *) (a))
+#define cmpbge(a, b) asm ("cmpbge  %a0,%a1,%v0", a, b)
+#define extql(a, b)  asm ("extql   %a0,%a1,%v0", a, b)
+#define extwl(a, b)  asm ("extwl   %a0,%a1,%v0", a, b)
+#define extqh(a, b)  asm ("extqh   %a0,%a1,%v0", a, b)
+#define zap(a, b)    asm ("zap     %a0,%a1,%v0", a, b)
+#define zapnot(a, b) asm ("zapnot  %a0,%a1,%v0", a, b)
+#define amask(a)     asm ("amask   %a0,%v0", a)
+#define implver()    asm ("implver %v0")
+#define rpcc()       asm ("rpcc           %v0")
+#define minub8(a, b) asm ("minub8  %a0,%a1,%v0", a, b)
+#define minsb8(a, b) asm ("minsb8  %a0,%a1,%v0", a, b)
+#define minuw4(a, b) asm ("minuw4  %a0,%a1,%v0", a, b)
+#define minsw4(a, b) asm ("minsw4  %a0,%a1,%v0", a, b)
+#define maxub8(a, b) asm ("maxub8  %a0,%a1,%v0", a, b)
+#define maxsb8(a, b) asm ("maxsb8  %a0,%a1,%v0", a, b)
+#define maxuw4(a, b) asm ("maxuw4  %a0,%a1,%v0", a, b)
+#define maxsw4(a, b) asm ("maxsw4  %a0,%a1,%v0", a, b)
+#define perr(a, b)   asm ("perr    %a0,%a1,%v0", a, b)
+#define pklb(a)      asm ("pklb    %a0,%v0", a)
+#define pkwb(a)      asm ("pkwb    %a0,%v0", a)
+#define unpkbl(a)    asm ("unpkbl  %a0,%v0", a)
+#define unpkbw(a)    asm ("unpkbw  %a0,%v0", a)
+#define wh64(a)      asm ("wh64    %a0", a)
+
+#else
+#error "Unknown compiler!"
+#endif
+
+#endif /* AVCODEC_ALPHA_ASM_H */
diff --git a/libavcodec/alpha/blockdsp_alpha.c b/libavcodec/alpha/blockdsp_alpha.c
new file mode 100644
index 0000000..c6f0964
--- /dev/null
+++ b/libavcodec/alpha/blockdsp_alpha.c
@@ -0,0 +1,49 @@
+/*
+ * Alpha optimised block operations
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavcodec/blockdsp.h"
+#include "asm.h"
+
+static void clear_blocks_axp(int16_t *blocks) {    /* zero 6 * 64 int16_t values starting at blocks */
+    uint64_t *p = (uint64_t *) blocks;     /* cleared as 64-bit words; presumably blocks is 8-byte aligned -- verify against callers */
+    int n = sizeof(int16_t) * 6 * 64;      /* bytes still to clear (768) */
+
+    do {
+        p[0] = 0;                          /* eight 64-bit stores = 64 bytes per iteration */
+        p[1] = 0;
+        p[2] = 0;
+        p[3] = 0;
+        p[4] = 0;
+        p[5] = 0;
+        p[6] = 0;
+        p[7] = 0;
+        p += 8;
+        n -= 8 * 8;                        /* 768 is a multiple of 64, so n lands exactly on 0 */
+    } while (n);
+}
+
+av_cold void ff_blockdsp_init_alpha(BlockDSPContext *c)    /* install the Alpha implementation into the DSP context c */
+{
+    c->clear_blocks = clear_blocks_axp;    /* only clear_blocks is overridden here */
+}
diff --git a/libavcodec/alpha/hpeldsp_alpha.c b/libavcodec/alpha/hpeldsp_alpha.c
new file mode 100644
index 0000000..8d54807
--- /dev/null
+++ b/libavcodec/alpha/hpeldsp_alpha.c
@@ -0,0 +1,213 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavcodec/hpeldsp.h"
+#include "hpeldsp_alpha.h"
+#include "asm.h"
+
+static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
+{   /* (a + b) >> 1 per byte, rounding down; assumes BYTE_VEC() replicates its byte */
+    return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
+}
+
+static inline uint64_t avg2(uint64_t a, uint64_t b)
+{   /* (a + b + 1) >> 1 per byte, rounding up; assumes BYTE_VEC() replicates its byte */
+    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
+}
+
+#if 0
+/* The XY2 routines basically utilize this scheme, but reuse parts in
+   each iteration.  */
+static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
+{
+    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
+    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
+                    + (l2 & BYTE_VEC(0x03))
+                    + (l3 & BYTE_VEC(0x03))
+                    + (l4 & BYTE_VEC(0x03))
+                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
+    return r1 + r2;
+}
+#endif
+
+#define OP(LOAD, STORE)                         \
+    do { /* one quadword (8 bytes) per row */   \
+        STORE(LOAD(pixels), block);             \
+        pixels += line_size;                    \
+        block += line_size;                     \
+    } while (--h) /* h >= 1 on entry */
+
+#define OP_X2(LOAD, STORE)                                      \
+    do {                                                        \
+        uint64_t pix1, pix2;                                    \
+        /* pix2: pix1 moved down one byte, pixels[8] on top */  \
+        pix1 = LOAD(pixels);                                    \
+        pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);        \
+        STORE(AVG2(pix1, pix2), block);                         \
+        pixels += line_size;                                    \
+        block += line_size;                                     \
+    } while (--h)
+
+#define OP_Y2(LOAD, STORE)                      \
+    do {                                        \
+        uint64_t pix = LOAD(pixels);            \
+        do {                                    \
+            uint64_t next_pix;                  \
+            /* next row, kept for next pass */  \
+            pixels += line_size;                \
+            next_pix = LOAD(pixels);            \
+            STORE(AVG2(pix, next_pix), block);  \
+            block += line_size;                 \
+            pix = next_pix;                     \
+        } while (--h);                          \
+    } while (0)
+
+#define OP_XY2(LOAD, STORE)                                                 \
+    do {                                                                    \
+        uint64_t pix1 = LOAD(pixels);                                       \
+        uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);           \
+        uint64_t pix_l = (pix1 & BYTE_VEC(0x03))                            \
+                       + (pix2 & BYTE_VEC(0x03));                           \
+        uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2)                    \
+                       + ((pix2 & ~BYTE_VEC(0x03)) >> 2);                   \
+        /* _l: sums of the BYTE_VEC(0x03) bits; _h: sums of the rest >> 2 */\
+        do {                                                                \
+            uint64_t npix1, npix2;                                          \
+            uint64_t npix_l, npix_h;                                        \
+            uint64_t avg;                                                   \
+            /* same split for the following row */                          \
+            pixels += line_size;                                            \
+            npix1 = LOAD(pixels);                                           \
+            npix2 = npix1 >> 8 | ((uint64_t) pixels[8] << 56);              \
+            npix_l = (npix1 & BYTE_VEC(0x03))                               \
+                   + (npix2 & BYTE_VEC(0x03));                              \
+            npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2)                       \
+                   + ((npix2 & ~BYTE_VEC(0x03)) >> 2);                      \
+            avg = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \
+                + pix_h + npix_h;                                           \
+            STORE(avg, block);                                              \
+            /* carry this row over as the previous one */                   \
+            block += line_size;                                             \
+            pix_l = npix_l;                                                 \
+            pix_h = npix_h;                                                 \
+        } while (--h);                                                      \
+    } while (0)
+
+#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE)                                \
+static void OPNAME ## _pixels ## SUFF ## _axp                               \
+        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
+         ptrdiff_t line_size, int h)                                        \
+{                                                                           \
+    if ((size_t) pixels & 0x7) { /* source not 8-byte aligned */            \
+        OPKIND(uldq, STORE);                                                \
+    } else {                                                                \
+        OPKIND(ldq, STORE);                                                 \
+    }                                                                       \
+}                                                                           \
+/* 16 wide: the 8-wide routine on each 8-byte half */                       \
+static void OPNAME ## _pixels16 ## SUFF ## _axp                             \
+        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
+         ptrdiff_t line_size, int h)                                        \
+{                                                                           \
+    OPNAME ## _pixels ## SUFF ## _axp(block,     pixels,     line_size, h); \
+    OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \
+}
+
+#define PIXOP(OPNAME, STORE)                    \
+    MAKE_OP(OPNAME, ,     OP,     STORE)        \
+    MAKE_OP(OPNAME, _x2,  OP_X2,  STORE)        \
+    MAKE_OP(OPNAME, _y2,  OP_Y2,  STORE)        \
+    MAKE_OP(OPNAME, _xy2, OP_XY2, STORE)
+
+/* Rounding primitives: put_* stores the value, avg_* averages it into block. */
+#define AVG2 avg2
+#define AVG4 avg4
+#define AVG4_ROUNDER BYTE_VEC(0x02)
+#define STORE(l, b) stq(l, b)
+PIXOP(put, STORE)       /* expands to complete function definitions: no ';' */
+
+#undef STORE
+#define STORE(l, b) stq(AVG2(l, ldq(b)), b)  /* no ';' here: call sites supply it */
+PIXOP(avg, STORE)
+
+/* Not rounding primitives: same four variants built on avg2_no_rnd.  */
+#undef AVG2
+#undef AVG4
+#undef AVG4_ROUNDER
+#undef STORE
+#define AVG2 avg2_no_rnd
+#define AVG4 avg4_no_rnd
+#define AVG4_ROUNDER BYTE_VEC(0x01)
+#define STORE(l, b) stq(l, b)
+PIXOP(put_no_rnd, STORE)
+
+#undef STORE
+#define STORE(l, b) stq(AVG2(l, ldq(b)), b)  /* no ';' here: call sites supply it */
+PIXOP(avg_no_rnd, STORE)
+
+static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels,
+                                 ptrdiff_t line_size, int h)
+{   /* 16-byte-wide variant: the 8-wide assembly routine on each half */
+    put_pixels_axp_asm(block,     pixels,     line_size, h);
+    put_pixels_axp_asm(block + 8, pixels + 8, line_size, h);
+}
+
+av_cold void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags)
+{   /* [0][n]: 16-wide routines; n = 0 plain, 1 _x2, 2 _y2, 3 _xy2.  flags is not read. */
+    c->put_pixels_tab[0][0] = put_pixels16_axp_asm;
+    c->put_pixels_tab[0][1] = put_pixels16_x2_axp;
+    c->put_pixels_tab[0][2] = put_pixels16_y2_axp;
+    c->put_pixels_tab[0][3] = put_pixels16_xy2_axp;
+    /* the plain no_rnd entry reuses the put copy routine */
+    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm;
+    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp;
+    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp;
+    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp;
+
+    c->avg_pixels_tab[0][0] = avg_pixels16_axp;
+    c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp;
+    c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp;
+    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp;
+    /* avg_no_rnd_pixels_tab takes a single index here and only gets 16-wide routines */
+    c->avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels16_axp;
+    c->avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels16_x2_axp;
+    c->avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels16_y2_axp;
+    c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_axp;
+    /* [1][n]: the 8-wide counterparts, same order of n */
+    c->put_pixels_tab[1][0] = put_pixels_axp_asm;
+    c->put_pixels_tab[1][1] = put_pixels_x2_axp;
+    c->put_pixels_tab[1][2] = put_pixels_y2_axp;
+    c->put_pixels_tab[1][3] = put_pixels_xy2_axp;
+
+    c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm;
+    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp;
+    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp;
+    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp;
+
+    c->avg_pixels_tab[1][0] = avg_pixels_axp;
+    c->avg_pixels_tab[1][1] = avg_pixels_x2_axp;
+    c->avg_pixels_tab[1][2] = avg_pixels_y2_axp;
+    c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp;
+}
diff --git a/libavcodec/alpha/hpeldsp_alpha.h b/libavcodec/alpha/hpeldsp_alpha.h
new file mode 100644
index 0000000..985182c
--- /dev/null
+++ b/libavcodec/alpha/hpeldsp_alpha.h
@@ -0,0 +1,28 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ALPHA_HPELDSP_ALPHA_H
+#define AVCODEC_ALPHA_HPELDSP_ALPHA_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int h);
+
+#endif /* AVCODEC_ALPHA_HPELDSP_ALPHA_H */
diff --git a/libavcodec/alpha/hpeldsp_alpha_asm.S b/libavcodec/alpha/hpeldsp_alpha_asm.S
new file mode 100644
index 0000000..df386c4
--- /dev/null
+++ b/libavcodec/alpha/hpeldsp_alpha_asm.S
@@ -0,0 +1,125 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*
+ * These functions are scheduled for pca56. They should work
+ * reasonably on ev6, though.
+ */
+
+#include "regdef.h"
+
+
+        .set noat
+        .set noreorder
+        .arch pca56
+        .text
+
+/************************************************************************
+ * void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
+ *                         ptrdiff_t line_size, int h)
+ * a0 = block, a1 = pixels, a2 = line_size, a3 = h; four rows per pass. */
+        .align 6
+        .globl put_pixels_axp_asm
+        .ent put_pixels_axp_asm
+put_pixels_axp_asm:
+        .frame sp, 0, ra
+        .prologue 0
+
+        and     a1, 7, t0       # t0 = pixels & 7
+        beq     t0, $aligned    # zero: source is quadword aligned
+
+        .align 4
+$unaligned:
+        ldq_u   t0, 0(a1)       # row 0: both quadwords overlapping the 8 source bytes
+        ldq_u   t1, 8(a1)
+        addq    a1, a2, a1      # pixels += line_size
+        nop
+
+        ldq_u   t2, 0(a1)       # row 1
+        ldq_u   t3, 8(a1)
+        addq    a1, a2, a1
+        nop
+
+        ldq_u   t4, 0(a1)       # row 2
+        ldq_u   t5, 8(a1)
+        addq    a1, a2, a1
+        nop
+
+        ldq_u   t6, 0(a1)       # row 3
+        ldq_u   t7, 8(a1)
+        extql   t0, a1, t0      # extract shifts come from the low bits of a1
+        addq    a1, a2, a1
+        # NOTE(review): a1 has moved past row 0 by now, so the shifts seem to need line_size % 8 == 0 - confirm
+        extqh   t1, a1, t1
+        addq    a0, a2, t8      # t8/t9/ta = destination rows 1/2/3
+        extql   t2, a1, t2
+        addq    t8, a2, t9
+
+        extqh   t3, a1, t3
+        addq    t9, a2, ta
+        extql   t4, a1, t4
+        or      t0, t1, t0      # merge the two extracted parts of row 0
+
+        extqh   t5, a1, t5
+        or      t2, t3, t2
+        extql   t6, a1, t6
+        or      t4, t5, t4
+
+        extqh   t7, a1, t7
+        or      t6, t7, t6
+        stq     t0, 0(a0)
+        stq     t2, 0(t8)
+
+        stq     t4, 0(t9)
+        subq    a3, 4, a3       # h -= 4
+        stq     t6, 0(ta)
+        addq    ta, a2, a0      # block += 4 * line_size
+
+        bne     a3, $unaligned  # until h == 0, so h must be a positive multiple of 4
+        ret
+
+        .align 4
+$aligned:
+        ldq     t0, 0(a1)       # rows 0-3: one aligned quadword each
+        addq    a1, a2, a1
+        ldq     t1, 0(a1)
+        addq    a1, a2, a1
+
+        ldq     t2, 0(a1)
+        addq    a1, a2, a1
+        ldq     t3, 0(a1)
+
+        addq    a0, a2, t4      # t4/t5/t6 = destination rows 1/2/3
+        addq    a1, a2, a1
+        addq    t4, a2, t5
+        subq    a3, 4, a3       # h -= 4
+
+        stq     t0, 0(a0)
+        addq    t5, a2, t6
+        stq     t1, 0(t4)
+        addq    t6, a2, a0      # block += 4 * line_size
+
+        stq     t2, 0(t5)
+        stq     t3, 0(t6)
+
+        bne     a3, $aligned    # until h == 0, so h must be a positive multiple of 4
+        ret
+        .end put_pixels_axp_asm
diff --git a/libavcodec/alpha/idctdsp_alpha.c b/libavcodec/alpha/idctdsp_alpha.c
new file mode 100644
index 0000000..1923ebb
--- /dev/null
+++ b/libavcodec/alpha/idctdsp_alpha.c
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavcodec/idctdsp.h"
+#include "idctdsp_alpha.h"
+#include "asm.h"
+
+void put_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels,
+                                ptrdiff_t line_size);
+void add_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels,
+                                ptrdiff_t line_size);
+
+void (*put_pixels_clamped_axp_p)(const int16_t *block, uint8_t *pixels,
+                                 ptrdiff_t line_size);
+void (*add_pixels_clamped_axp_p)(const int16_t *block, uint8_t *pixels,
+                                 ptrdiff_t line_size);
+
+#if 0
+/* These functions were the base for the optimized assembler routines,
+   and remain here for documentation purposes.  */
+static void put_pixels_clamped_mvi(const int16_t *block, uint8_t *pixels,
+                                   ptrdiff_t line_size)
+{
+    int i = 8;
+    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
+
+    do {
+        uint64_t shorts0, shorts1;
+
+        shorts0 = ldq(block);
+        shorts0 = maxsw4(shorts0, 0);
+        shorts0 = minsw4(shorts0, clampmask);
+        stl(pkwb(shorts0), pixels);
+
+        shorts1 = ldq(block + 4);
+        shorts1 = maxsw4(shorts1, 0);
+        shorts1 = minsw4(shorts1, clampmask);
+        stl(pkwb(shorts1), pixels + 4);
+
+        pixels += line_size;
+        block += 8;
+    } while (--i);
+}
+
+void add_pixels_clamped_mvi(const int16_t *block, uint8_t *pixels,
+                            ptrdiff_t line_size)
+{
+    int h = 8;
+    /* Keep this function a leaf function by generating the constants
+       manually (mainly for the hack value ;-).  */
+    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
+    uint64_t signmask  = zap(-1, 0x33);
+    signmask ^= signmask >> 1;  /* 0x8000800080008000 */
+
+    do {
+        uint64_t shorts0, pix0, signs0;
+        uint64_t shorts1, pix1, signs1;
+
+        shorts0 = ldq(block);
+        shorts1 = ldq(block + 4);
+
+        pix0    = unpkbw(ldl(pixels));
+        /* Signed subword add (MMX paddw).  */
+        signs0  = shorts0 & signmask;
+        shorts0 &= ~signmask;
+        shorts0 += pix0;
+        shorts0 ^= signs0;
+        /* Clamp. */
+        shorts0 = maxsw4(shorts0, 0);
+        shorts0 = minsw4(shorts0, clampmask);
+
+        /* Next 4.  */
+        pix1    = unpkbw(ldl(pixels + 4));
+        signs1  = shorts1 & signmask;
+        shorts1 &= ~signmask;
+        shorts1 += pix1;
+        shorts1 ^= signs1;
+        shorts1 = maxsw4(shorts1, 0);
+        shorts1 = minsw4(shorts1, clampmask);
+
+        stl(pkwb(shorts0), pixels);
+        stl(pkwb(shorts1), pixels + 4);
+
+        pixels += line_size;
+        block += 8;
+    } while (--h);
+}
+#endif
+
+av_cold void ff_idctdsp_init_alpha(IDCTDSPContext *c, AVCodecContext *avctx,
+                                   unsigned high_bit_depth)
+{
+    /* amask() leaves a feature bit clear when the CPU has it: 0 means MVI. */
+    if (!amask(AMASK_MVI)) {
+        c->put_pixels_clamped = put_pixels_clamped_mvi_asm;
+        c->add_pixels_clamped = add_pixels_clamped_mvi_asm;
+    }
+    /* Export whichever clamp routines c now holds through the global pointers. */
+    put_pixels_clamped_axp_p = c->put_pixels_clamped;
+    add_pixels_clamped_axp_p = c->add_pixels_clamped;
+
+    if (high_bit_depth || avctx->lowres ||
+        (avctx->idct_algo != FF_IDCT_AUTO &&
+         avctx->idct_algo != FF_IDCT_SIMPLEALPHA))
+        return;             /* leave the IDCT entries of c as they are */
+    c->idct_put = ff_simple_idct_put_axp;
+    c->idct_add = ff_simple_idct_add_axp;
+    c->idct     = ff_simple_idct_axp;
+}
diff --git a/libavcodec/alpha/idctdsp_alpha.h b/libavcodec/alpha/idctdsp_alpha.h
new file mode 100644
index 0000000..bf98495
--- /dev/null
+++ b/libavcodec/alpha/idctdsp_alpha.h
@@ -0,0 +1,34 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ALPHA_IDCTDSP_ALPHA_H
+#define AVCODEC_ALPHA_IDCTDSP_ALPHA_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+extern void (*put_pixels_clamped_axp_p)(const int16_t *block, uint8_t *pixels,
+                                        ptrdiff_t line_size);
+extern void (*add_pixels_clamped_axp_p)(const int16_t *block, uint8_t *pixels,
+                                        ptrdiff_t line_size);
+
+void ff_simple_idct_axp(int16_t *block);
+void ff_simple_idct_put_axp(uint8_t *dest, int line_size, int16_t *block);
+void ff_simple_idct_add_axp(uint8_t *dest, int line_size, int16_t *block);
+
+#endif /* AVCODEC_ALPHA_IDCTDSP_ALPHA_H */
diff --git a/libavcodec/alpha/idctdsp_alpha_asm.S b/libavcodec/alpha/idctdsp_alpha_asm.S
new file mode 100644
index 0000000..f545df9
--- /dev/null
+++ b/libavcodec/alpha/idctdsp_alpha_asm.S
@@ -0,0 +1,167 @@
+/*
+ * Alpha optimized IDCT-related routines
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*
+ * These functions are scheduled for pca56. They should work
+ * reasonably on ev6, though.
+ */
+
+#include "regdef.h"
+
+        .set noat
+        .set noreorder
+        .arch pca56
+        .text
+
+/************************************************************************
+ * void put_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels,
+ *                                 ptrdiff_t line_size)
+ * Clamps 8 rows of 8 coefficients to 0..255 and stores them as bytes. */
+        .align 6
+        .globl put_pixels_clamped_mvi_asm
+        .ent put_pixels_clamped_mvi_asm
+put_pixels_clamped_mvi_asm:
+        .frame sp, 0, ra
+        .prologue 0
+
+        lda     t8, -1
+        lda     t9, 8           # loop counter
+        zap     t8, 0xaa, t8    # 00ff00ff00ff00ff
+
+        .align 4
+1:      ldq     t0,  0(a0)      # 16 coefficients = two rows of eight
+        ldq     t1,  8(a0)
+        ldq     t2, 16(a0)
+        ldq     t3, 24(a0)
+
+        maxsw4  t0, zero, t0    # lower clamp at 0
+        subq    t9, 2, t9       # two rows per pass
+        maxsw4  t1, zero, t1
+        lda     a0, 32(a0)      # block += 16
+
+        maxsw4  t2, zero, t2
+        addq    a1, a2, ta      # ta = second row of this pass
+        maxsw4  t3, zero, t3
+        minsw4  t0, t8, t0      # upper clamp at 0x00ff per word
+
+        minsw4  t1, t8, t1
+        minsw4  t2, t8, t2
+        minsw4  t3, t8, t3
+        pkwb    t0, t0          # pack the four words into four bytes
+
+        pkwb    t1, t1
+        pkwb    t2, t2
+        pkwb    t3, t3
+        stl     t0, 0(a1)
+
+        stl     t1, 4(a1)
+        addq    ta, a2, a1      # pixels += 2 * line_size
+        stl     t2, 0(ta)
+        stl     t3, 4(ta)
+
+        bne     t9, 1b
+        ret
+        .end put_pixels_clamped_mvi_asm
+
+/************************************************************************
+ * void add_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels,
+ *                                 ptrdiff_t line_size)
+ * Adds 8 rows of 8 coefficients to the pixels in place, clamped to 0..255. */
+        .align 6
+        .globl add_pixels_clamped_mvi_asm
+        .ent add_pixels_clamped_mvi_asm
+add_pixels_clamped_mvi_asm:
+        .frame sp, 0, ra
+        .prologue 0
+
+        lda     t1, -1
+        lda     th, 8           # loop counter (rows left)
+        zap     t1, 0x33, tg
+        nop
+
+        srl     tg, 1, t0
+        xor     tg, t0, tg      # 0x8000800080008000
+        zap     t1, 0xaa, tf    # 0x00ff00ff00ff00ff
+
+        .align 4
+1:      ldl     t1, 0(a1)       # pix0 (try to hit cache line soon)
+        ldl     t4, 4(a1)       # pix1
+        addq    a1, a2, te      # pixels += line_size
+        ldq     t0, 0(a0)       # shorts0
+
+        ldl     t7, 0(te)       # pix2 (try to hit cache line soon)
+        ldl     ta, 4(te)       # pix3
+        ldq     t3, 8(a0)       # shorts1
+        ldq     t6, 16(a0)      # shorts2
+
+        ldq     t9, 24(a0)      # shorts3
+        unpkbw  t1, t1          # 0 0 (quarter/op no.)
+        and     t0, tg, t2      # 0 1
+        unpkbw  t4, t4          # 1 0
+
+        bic     t0, tg, t0      # 0 2
+        unpkbw  t7, t7          # 2 0
+        and     t3, tg, t5      # 1 1
+        addq    t0, t1, t0      # 0 3
+
+        xor     t0, t2, t0      # 0 4
+        unpkbw  ta, ta          # 3 0
+        and     t6, tg, t8      # 2 1
+        maxsw4  t0, zero, t0    # 0 5
+
+        bic     t3, tg, t3      # 1 2
+        bic     t6, tg, t6      # 2 2
+        minsw4  t0, tf, t0      # 0 6
+        addq    t3, t4, t3      # 1 3
+
+        pkwb    t0, t0          # 0 7
+        xor     t3, t5, t3      # 1 4
+        maxsw4  t3, zero, t3    # 1 5
+        addq    t6, t7, t6      # 2 3
+
+        xor     t6, t8, t6      # 2 4
+        and     t9, tg, tb      # 3 1
+        minsw4  t3, tf, t3      # 1 6
+        bic     t9, tg, t9      # 3 2
+
+        maxsw4  t6, zero, t6    # 2 5
+        addq    t9, ta, t9      # 3 3
+        stl     t0, 0(a1)       # 0 8
+        minsw4  t6, tf, t6      # 2 6
+
+        xor     t9, tb, t9      # 3 4
+        maxsw4  t9, zero, t9    # 3 5
+        lda     a0, 32(a0)      # block += 16;
+        pkwb    t3, t3          # 1 7
+
+        minsw4  t9, tf, t9      # 3 6
+        subq    th, 2, th       # two rows per pass
+        pkwb    t6, t6          # 2 7
+        pkwb    t9, t9          # 3 7
+
+        stl     t3, 4(a1)       # 1 8
+        addq    te, a2, a1      # pixels += line_size
+        stl     t6, 0(te)       # 2 8
+        stl     t9, 4(te)       # 3 8
+
+        bne     th, 1b          # until all 8 rows are done
+        ret
+        .end add_pixels_clamped_mvi_asm
diff --git a/libavcodec/alpha/me_cmp_alpha.c b/libavcodec/alpha/me_cmp_alpha.c
new file mode 100644
index 0000000..8f36019
--- /dev/null
+++ b/libavcodec/alpha/me_cmp_alpha.c
@@ -0,0 +1,317 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavcodec/me_cmp.h"
+#include "asm.h"
+
+int pix_abs16x16_mvi_asm(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h);
+
+static inline uint64_t avg2(uint64_t a, uint64_t b)
+{   /* (a + b + 1) >> 1 per byte, rounding up; assumes BYTE_VEC() replicates its byte */
+    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
+}
+
+static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
+{
+    /* (l1 + l2 + l3 + l4 + 2) >> 2 per byte, assuming BYTE_VEC() replicates
+     * its byte: the upper six and lower two bits are summed separately. */
+    const uint64_t lo_mask = BYTE_VEC(0x03);
+    const uint64_t hi_sum  = ((l1 & ~lo_mask) >> 2) + ((l2 & ~lo_mask) >> 2)
+                           + ((l3 & ~lo_mask) >> 2) + ((l4 & ~lo_mask) >> 2);
+    const uint64_t lo_sum  = (l1 & lo_mask) + (l2 & lo_mask)
+                           + (l3 & lo_mask) + (l4 & lo_mask);
+    const uint64_t carry   = ((lo_sum + BYTE_VEC(0x02)) >> 2) & lo_mask;
+
+    return hi_sum + carry;
+}
+
+/* Sum perr() over h rows of one quadword (8 bytes) taken from pix1 and pix2.
+ * pix1 is always read with ldq(); pix2 may sit at any byte offset.
+ * v is unused; h must be at least 1. */
+static int pix_abs8x8_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
+{
+    int sum = 0;
+
+    if (!((size_t) pix2 & 0x7)) {
+        /* pix2 is 8-byte aligned: plain quadword loads on both sides */
+        do {
+            uint64_t cur = ldq(pix1);
+            uint64_t ref = ldq(pix2);
+
+            sum  += perr(cur, ref);
+            pix1 += line_size;
+            pix2 += line_size;
+        } while (--h);
+        return sum;
+    }
+
+    /* uldq() works only when pix2 is actually unaligned */
+    do {                        /* do 8 pixel a time */
+        uint64_t cur = ldq(pix1);
+        uint64_t ref = uldq(pix2);
+
+        sum  += perr(cur, ref);
+        pix1 += line_size;
+        pix2 += line_size;
+    } while (--h);
+    return sum;
+}
+
+#if 0                           /* now done in assembly */
+int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
+{
+    int result = 0;
+    int h = 16;
+
+    if ((size_t) pix2 & 0x7) {
+        /* works only when pix2 is actually unaligned */
+        do {                    /* do 16 pixel a time */
+            uint64_t p1_l, p1_r, p2_l, p2_r;
+            uint64_t t;
+
+            p1_l  = ldq(pix1);
+            p1_r  = ldq(pix1 + 8);
+            t     = ldq_u(pix2 + 8);
+            p2_l  = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
+            p2_r  = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);
+            pix1 += line_size;
+            pix2 += line_size;
+
+            result += perr(p1_l, p2_l)
+                    + perr(p1_r, p2_r);
+        } while (--h);
+    } else {
+        do {
+            uint64_t p1_l, p1_r, p2_l, p2_r;
+
+            p1_l = ldq(pix1);
+            p1_r = ldq(pix1 + 8);
+            p2_l = ldq(pix2);
+            p2_r = ldq(pix2 + 8);
+            pix1 += line_size;
+            pix2 += line_size;
+
+            result += perr(p1_l, p2_l)
+                    + perr(p1_r, p2_r);
+        } while (--h);
+    }
+
+    return result;
+}
+#endif
+
+/* Accumulate perr() between a 16-byte-wide block at pix1 and the avg2() of
+ * each pix2 row with the same row taken one byte further on, over h rows.
+ * pix1 is read with ldq(); the code path is picked by the low three address
+ * bits of pix2.  v is unused; h must be at least 1. */
+static int pix_abs16x16_x2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
+{
+    const uint64_t disalign = (size_t) pix2 & 0x7;
+    int sum = 0;
+
+    if (disalign == 0) {
+        /* pix2 is aligned: plain loads, the shifted row is assembled by hand */
+        do {
+            uint64_t cur_l = ldq(pix1);
+            uint64_t cur_r = ldq(pix1 + 8);
+
+            uint64_t ref_l = ldq(pix2);
+            uint64_t ref_r = ldq(pix2 + 8);
+            /* shift down one byte and pull the following byte in on top */
+            uint64_t nxt_l = (ref_l >> 8) | ((uint64_t) ref_r << 56);
+            uint64_t nxt_r = (ref_r >> 8) | ((uint64_t) pix2[16] << 56);
+
+            sum  += perr(cur_l, avg2(ref_l, nxt_l))
+                  + perr(cur_r, avg2(ref_r, nxt_r));
+            pix1 += line_size;
+            pix2 += line_size;
+        } while (--h);
+    } else if (disalign == 7) {
+        /* |.......l|lllllllr|rrrrrrr*|
+           disalign + 1 would be 8, which gets treated as 0 by extqh, so
+           the one-byte-on row is taken directly from the next aligned
+           quadword.  At least it is a bit faster that way :)  */
+        do {
+            uint64_t cur_l = ldq(pix1);
+            uint64_t cur_r = ldq(pix1 + 8);
+
+            uint64_t lo    = ldq_u(pix2);
+            uint64_t mid   = ldq_u(pix2 + 8);
+            uint64_t hi    = ldq_u(pix2 + 16);
+            uint64_t ref_l = extql(lo,  disalign) | extqh(mid, disalign);
+            uint64_t ref_r = extql(mid, disalign) | extqh(hi,  disalign);
+
+            sum  += perr(cur_l, avg2(ref_l, mid))
+                  + perr(cur_r, avg2(ref_r, hi));
+            pix1 += line_size;
+            pix2 += line_size;
+        } while (--h);
+    } else {
+        /* general case: the rows at byte offsets disalign and disalign + 1
+           are both extracted from the same three aligned quadwords */
+        const uint64_t disalign1 = disalign + 1;
+
+        do {
+            uint64_t cur_l = ldq(pix1);
+            uint64_t cur_r = ldq(pix1 + 8);
+
+            uint64_t lo    = ldq_u(pix2);
+            uint64_t mid   = ldq_u(pix2 + 8);
+            uint64_t hi    = ldq_u(pix2 + 16);
+            uint64_t ref_l = extql(lo,  disalign)  | extqh(mid, disalign);
+            uint64_t nxt_l = extql(lo,  disalign1) | extqh(mid, disalign1);
+            uint64_t ref_r = extql(mid, disalign)  | extqh(hi,  disalign);
+            uint64_t nxt_r = extql(mid, disalign1) | extqh(hi,  disalign1);
+
+            sum  += perr(cur_l, avg2(ref_l, nxt_l))
+                  + perr(cur_r, avg2(ref_r, nxt_r));
+            pix1 += line_size;
+            pix2 += line_size;
+        } while (--h);
+    }
+
+    return sum;
+}
+
+static int pix_abs16x16_y2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
+{   /* Sums perr() of each 16-byte pix1 row against avg2() of two vertically adjacent pix2 rows; v is unused. */
+    int result = 0;
+
+    if ((size_t) pix2 & 0x7) { /* pix2 not 8-byte aligned (tested once, on the first row) */
+        uint64_t t, p2_l, p2_r;
+        t     = ldq_u(pix2 + 8); /* middle quadword, shared by the left and right halves */
+        p2_l  = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
+        p2_r  = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);
+
+        do { /* runs h times, so h must be >= 1; rows 0..h of pix2 are read */
+            uint64_t p1_l, p1_r, np2_l, np2_r;
+            uint64_t t; /* shadows the outer t */
+
+            p1_l  = ldq(pix1);
+            p1_r  = ldq(pix1 + 8);
+            pix2 += line_size;
+            t     = ldq_u(pix2 + 8);
+            np2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
+            np2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);
+
+            result += perr(p1_l, avg2(p2_l, np2_l))
+                    + perr(p1_r, avg2(p2_r, np2_r));
+
+            pix1 += line_size;
+            p2_l  = np2_l; /* the row just loaded becomes the upper row of the next iteration */
+            p2_r  = np2_r;
+
+        } while (--h);
+    } else { /* aligned start: plain ldq loads; presumably line_size is a multiple of 8 -- confirm */
+        uint64_t p2_l, p2_r;
+        p2_l = ldq(pix2);
+        p2_r = ldq(pix2 + 8);
+        do { /* runs h times, so h must be >= 1; rows 0..h of pix2 are read */
+            uint64_t p1_l, p1_r, np2_l, np2_r;
+
+            p1_l = ldq(pix1);
+            p1_r = ldq(pix1 + 8);
+            pix2 += line_size;
+            np2_l = ldq(pix2);
+            np2_r = ldq(pix2 + 8);
+
+            result += perr(p1_l, avg2(p2_l, np2_l))
+                    + perr(p1_r, avg2(p2_r, np2_r));
+
+            pix1 += line_size;
+            p2_l  = np2_l; /* the row just loaded becomes the upper row of the next iteration */
+            p2_r  = np2_r;
+        } while (--h);
+    }
+    return result;
+}
+
+static int pix_abs16x16_xy2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
+{   /* Sums perr() of each 16-byte pix1 row against avg4() of four neighbouring pix2 bytes; v is unused. */
+    int result = 0;
+
+    uint64_t p1_l, p1_r;
+    uint64_t p2_l, p2_r, p2_x; /* left 8 bytes, right 8 bytes, and byte 16 of the row held in bits 56..63 */
+
+    p1_l = ldq(pix1);
+    p1_r = ldq(pix1 + 8);
+
+    if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
+        p2_l = uldq(pix2);
+        p2_r = uldq(pix2 + 8);
+        p2_x = (uint64_t) pix2[16] << 56;
+    } else {
+        p2_l = ldq(pix2);
+        p2_r = ldq(pix2 + 8);
+        p2_x = ldq(pix2 + 16) << 56; /* full quadword load; only its lowest byte survives the shift */
+    }
+
+    do { /* runs h times, so h must be >= 1 */
+        uint64_t np1_l, np1_r;
+        uint64_t np2_l, np2_r, np2_x;
+
+        pix1 += line_size;
+        pix2 += line_size;
+
+        np1_l = ldq(pix1); /* next pix1 row is fetched before the current one is used, so row h is read too */
+        np1_r = ldq(pix1 + 8);
+
+        if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
+            np2_l = uldq(pix2);
+            np2_r = uldq(pix2 + 8);
+            np2_x = (uint64_t) pix2[16] << 56;
+        } else {
+            np2_l = ldq(pix2);
+            np2_r = ldq(pix2 + 8);
+            np2_x = ldq(pix2 + 16) << 56;
+        }
+
+        result += perr(p1_l, /* (x >> 8) | (next << 56): drop the lowest byte, pull in the lowest byte of the next quadword */
+                       avg4( p2_l, ( p2_l >> 8) | ((uint64_t)  p2_r << 56),
+                            np2_l, (np2_l >> 8) | ((uint64_t) np2_r << 56)))
+                + perr(p1_r,
+                       avg4( p2_r, ( p2_r >> 8) | ((uint64_t)  p2_x),
+                            np2_r, (np2_r >> 8) | ((uint64_t) np2_x)));
+
+        p1_l = np1_l; /* rotate: the freshly loaded rows become the current rows */
+        p1_r = np1_r;
+        p2_l = np2_l;
+        p2_r = np2_r;
+        p2_x = np2_x;
+    } while (--h);
+
+    return result;
+}
+
+av_cold void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx)
+{   /* Installs the MVI comparison routines into c; avctx is not used. */
+    /* amask clears all bits that correspond to present features.  */
+    if (amask(AMASK_MVI) == 0) { /* nothing is installed when the MVI bit is still set */
+        c->sad[0]           = pix_abs16x16_mvi_asm;
+        c->sad[1]           = pix_abs8x8_mvi;
+        c->pix_abs[0][0]    = pix_abs16x16_mvi_asm; /* same routine as sad[0] */
+        c->pix_abs[1][0]    = pix_abs8x8_mvi;       /* same routine as sad[1]; no other pix_abs[1][] entry is set here */
+        c->pix_abs[0][1]    = pix_abs16x16_x2_mvi;
+        c->pix_abs[0][2]    = pix_abs16x16_y2_mvi;
+        c->pix_abs[0][3]    = pix_abs16x16_xy2_mvi;
+    }
+}
diff --git a/libavcodec/alpha/me_cmp_mvi_asm.S b/libavcodec/alpha/me_cmp_mvi_asm.S
new file mode 100644
index 0000000..2399085
--- /dev/null
+++ b/libavcodec/alpha/me_cmp_mvi_asm.S
@@ -0,0 +1,179 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "regdef.h"
+
+/* Some nicer register names.  */
+#define ta t10
+#define tb t11
+#define tc t12
+#define td AT
+/* Danger: these overlap with the argument list and the return value */
+#define te a5
+#define tf a4
+#define tg a3
+#define th v0
+
+        .set noat
+        .set noreorder
+        .arch pca56
+        .text
+
+/*****************************************************************************
+ * int pix_abs16x16_mvi_asm(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
+ *
+ * This code is written with a pca56 in mind. For ev6, one should
+ * really take the increased latency of 3 cycles for MVI instructions
+ * into account.
+ *
+ * It is important to keep the loading and first use of a register as
+ * far apart as possible, because if a register is accessed before it
+ * has been fetched from memory, the CPU will stall.
+ */
+        .align 4
+        .globl pix_abs16x16_mvi_asm
+        .ent pix_abs16x16_mvi_asm
+pix_abs16x16_mvi_asm:
+        .frame sp, 0, ra, 0
+        .prologue 0
+
+        and     a2, 7, t0       # t0 = pix2 & 7
+        clr     v0              # result = 0
+        beq     t0, $aligned    # pix2 is 8-byte aligned
+        .align 4
+$unaligned:
+        /* Registers:
+           line 0:
+           t0:  left_u -> left lo -> left
+           t1:  mid
+           t2:  right_u -> right hi -> right
+           t3:  ref left
+           t4:  ref right
+           line 1:
+           t5:  left_u -> left lo -> left
+           t6:  mid
+           t7:  right_u -> right hi -> right
+           t8:  ref left
+           t9:  ref right
+           temp:
+           ta:  left hi
+           tb:  right lo
+           tc:  error left
+           td:  error right  */
+
+        /* load line 0 */
+        ldq_u   t0, 0(a2)       # left_u
+        ldq_u   t1, 8(a2)       # mid
+        ldq_u   t2, 16(a2)      # right_u
+        ldq     t3, 0(a1)       # ref left
+        ldq     t4, 8(a1)       # ref right
+        addq    a1, a3, a1      # pix1
+        addq    a2, a3, a2      # pix2
+        /* load line 1 */
+        ldq_u   t5, 0(a2)       # left_u
+        ldq_u   t6, 8(a2)       # mid
+        ldq_u   t7, 16(a2)      # right_u
+        ldq     t8, 0(a1)       # ref left
+        ldq     t9, 8(a1)       # ref right
+        addq    a1, a3, a1      # pix1
+        addq    a2, a3, a2      # pix2
+        /* calc line 0 (a2 has already advanced two lines; presumably only its low 3 bits matter to extql/extqh -- confirm) */
+        extql   t0, a2, t0      # left lo
+        extqh   t1, a2, ta      # left hi
+        extql   t1, a2, tb      # right lo
+        or      t0, ta, t0      # left
+        extqh   t2, a2, t2      # right hi
+        perr    t3, t0, tc      # error left
+        or      t2, tb, t2      # right
+        perr    t4, t2, td      # error right
+        addq    v0, tc, v0      # add error left
+        addq    v0, td, v0      # add error right
+        /* calc line 1 */
+        extql   t5, a2, t5      # left lo
+        extqh   t6, a2, ta      # left hi
+        extql   t6, a2, tb      # right lo
+        or      t5, ta, t5      # left
+        extqh   t7, a2, t7      # right hi
+        perr    t8, t5, tc      # error left
+        or      t7, tb, t7      # right
+        perr    t9, t7, td      # error right
+        addq    v0, tc, v0      # add error left
+        addq    v0, td, v0      # add error right
+        /* loop */
+        subq    a4,  2, a4      # h -= 2
+        bne     a4, $unaligned  # repeat until h == 0 (h must be a multiple of 2)
+        ret
+
+        .align 4
+$aligned:
+        /* load line 0 */
+        ldq     t0, 0(a2)       # left
+        ldq     t1, 8(a2)       # right
+        addq    a2, a3, a2      # pix2
+        ldq     t2, 0(a1)       # ref left
+        ldq     t3, 8(a1)       # ref right
+        addq    a1, a3, a1      # pix1
+        /* load line 1 */
+        ldq     t4, 0(a2)       # left
+        ldq     t5, 8(a2)       # right
+        addq    a2, a3, a2      # pix2
+        ldq     t6, 0(a1)       # ref left
+        ldq     t7, 8(a1)       # ref right
+        addq    a1, a3, a1      # pix1
+        /* load line 2 */
+        ldq     t8, 0(a2)       # left
+        ldq     t9, 8(a2)       # right
+        addq    a2, a3, a2      # pix2
+        ldq     ta, 0(a1)       # ref left
+        ldq     tb, 8(a1)       # ref right
+        addq    a1, a3, a1      # pix1
+        /* load line 3 */
+        ldq     tc, 0(a2)       # left
+        ldq     td, 8(a2)       # right
+        addq    a2, a3, a2      # pix2
+        ldq     te, 0(a1)       # ref left
+        ldq     a0, 8(a1)       # ref right (a0, the first argument register, is reused as scratch)
+        /* calc line 0 */
+        perr    t0, t2, t0      # error left
+        addq    a1, a3, a1      # pix1
+        perr    t1, t3, t1      # error right
+        addq    v0, t0, v0      # add error left
+        /* calc line 1 */
+        perr    t4, t6, t0      # error left
+        addq    v0, t1, v0      # add error right
+        perr    t5, t7, t1      # error right
+        addq    v0, t0, v0      # add error left
+        /* calc line 2 */
+        perr    t8, ta, t0      # error left
+        addq    v0, t1, v0      # add error right
+        perr    t9, tb, t1      # error right
+        addq    v0, t0, v0      # add error left
+        /* calc line 3 */
+        perr    tc, te, t0      # error left
+        addq    v0, t1, v0      # add error right
+        perr    td, a0, t1      # error right
+        addq    v0, t0, v0      # add error left
+        addq    v0, t1, v0      # add error right
+        /* loop */
+        subq    a4,  4, a4      # h -= 4
+        bne     a4, $aligned    # repeat until h == 0 (h must be a multiple of 4)
+        ret
+        .end pix_abs16x16_mvi_asm
diff --git a/libavcodec/alpha/mpegvideo_alpha.c b/libavcodec/alpha/mpegvideo_alpha.c
new file mode 100644
index 0000000..126fe26
--- /dev/null
+++ b/libavcodec/alpha/mpegvideo_alpha.c
@@ -0,0 +1,110 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavcodec/mpegvideo.h"
+#include "asm.h"
+
+static void dct_unquantize_h263_axp(int16_t *block, int n_coeffs,
+                                    uint64_t qscale, uint64_t qadd)
+{   /* In-place dequantisation of block[0..n_coeffs], four int16_t coefficients per 64-bit word. */
+    uint64_t qmul = qscale << 1;
+    uint64_t correction = WORD_VEC(qmul * 255 >> 8);
+    int i;
+
+    qadd = WORD_VEC(qadd); /* presumably replicates qadd into all four 16-bit lanes -- confirm in asm.h */
+
+    for(i = 0; i <= n_coeffs; block += 4, i += 4) { /* inclusive bound; the last group may extend up to 3 entries past n_coeffs */
+        uint64_t levels, negmask, zeros, add, sub;
+
+        levels = ldq(block);
+        if (levels == 0) /* all four coefficients are zero: leave them untouched */
+            continue;
+
+#ifdef __alpha_max__
+        /* I don't think the speed difference justifies runtime
+           detection.  */
+        negmask = maxsw4(levels, -1); /* negative -> ffff (-1) */
+        negmask = minsw4(negmask, 0); /* positive -> 0000 (0) */
+#else
+        negmask = cmpbge(WORD_VEC(0x7fff), levels);
+        negmask &= (negmask >> 1) | (1 << 7);
+        negmask = zap(-1, negmask);
+#endif
+
+        zeros = cmpbge(0, levels);
+        zeros &= zeros >> 1;
+        /* zeros |= zeros << 1 is not needed since qadd <= 255, so
+           zapping the lower byte suffices.  */
+
+        levels *= qmul; /* a single 64-bit multiply scales all four packed coefficients */
+        levels -= correction & (negmask << 16); /* NOTE(review): presumably undoes the spill of a negative lane into the lane above -- confirm */
+
+        add = qadd & ~negmask; /* qadd for the non-negative lanes */
+        sub = qadd &  negmask; /* qadd for the negative lanes */
+        /* Set qadd to 0 for levels == 0.  */
+        add = zap(add, zeros);
+        levels += add;
+        levels -= sub;
+
+        stq(levels, block);
+    }
+}
+
+static void dct_unquantize_h263_intra_axp(MpegEncContext *s, int16_t *block,
+                                    int n, int qscale)
+{   /* Intra variant: block[0] is handled separately here, the rest goes through dct_unquantize_h263_axp(). */
+    int n_coeffs;
+    uint64_t qadd;
+    int16_t block0 = block[0]; /* saved because the vector routine below also rewrites block[0] */
+
+    if (!s->h263_aic) {
+        if (n < 4) /* block indices below 4 use y_dc_scale, the others c_dc_scale */
+            block0 *= s->y_dc_scale;
+        else
+            block0 *= s->c_dc_scale;
+        qadd = (qscale - 1) | 1; /* forces the value to be odd */
+    } else {
+        qadd = 0; /* with h263_aic set, block[0] is left unscaled and no offset is added */
+    }
+
+    if(s->ac_pred)
+        n_coeffs = 63; /* process the whole 64-entry block */
+    else
+        n_coeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
+
+    dct_unquantize_h263_axp(block, n_coeffs, qscale, qadd);
+
+    block[0] = block0; /* overwrite whatever the vector routine produced for block[0] */
+}
+
+static void dct_unquantize_h263_inter_axp(MpegEncContext *s, int16_t *block,
+                                    int n, int qscale)
+{   /* Inter variant: every coefficient, block[0] included, goes through the vector routine. */
+    int n_coeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
+    dct_unquantize_h263_axp(block, n_coeffs, qscale, (qscale - 1) | 1); /* qadd is forced odd */
+}
+
+av_cold void ff_mpv_common_init_axp(MpegEncContext *s)
+{   /* Unconditionally points both H.263 unquantize hooks of s at the Alpha versions in this file. */
+    s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_axp;
+    s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_axp;
+}
diff --git a/libavcodec/alpha/pixblockdsp_alpha.c b/libavcodec/alpha/pixblockdsp_alpha.c
new file mode 100644
index 0000000..866b762
--- /dev/null
+++ b/libavcodec/alpha/pixblockdsp_alpha.c
@@ -0,0 +1,78 @@
+/*
+ * SIMD-optimized pixel operations
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavcodec/pixblockdsp.h"
+#include "asm.h"
+
+static void get_pixels_mvi(int16_t *restrict block,
+                           const uint8_t *restrict pixels, ptrdiff_t line_size)
+{   /* Copies an 8x8 block of bytes from pixels (row stride line_size) into 64 consecutive int16_t at block. */
+    int h = 8; /* fixed row count */
+
+    do {
+        uint64_t p;
+
+        p = ldq(pixels); /* one row: 8 bytes in a single aligned load */
+        stq(unpkbw(p),       block);     /* presumably widens the low four bytes to 16 bits each -- confirm in asm.h */
+        stq(unpkbw(p >> 32), block + 4); /* same for the high four bytes */
+
+        pixels += line_size;
+        block += 8;
+    } while (--h);
+}
+
+static void diff_pixels_mvi(int16_t *block, const uint8_t *s1, const uint8_t *s2,
+                            int stride) { /* Writes 8 rows of 8 int16_t derived from s1 - s2 to block; s1 and s2 share stride. */
+    int h = 8; /* fixed row count */
+    uint64_t mask = 0x4040;
+
+    mask |= mask << 16;
+    mask |= mask << 32; /* mask is now 0x4040404040404040 */
+    do {
+        uint64_t x, y, c, d, a;
+        uint64_t signs;
+
+        x = ldq(s1);
+        y = ldq(s2);
+        c = cmpbge(x, y); /* presumably one flag bit per byte where x >= y -- confirm in asm.h */
+        d = x - y;        /* plain 64-bit subtract, so borrows cross byte boundaries */
+        a = zap(mask, c);       /* We use 0x4040404040404040 here...  */
+        d += 4 * a;             /* ...so we can use s4addq here.      */
+        signs = zap(-1, c); /* NOTE(review): looks like 0xff in each byte where x < y -- confirm */
+
+        stq(unpkbw(d)       | (unpkbw(signs)       << 8), block);     /* difference byte low, signs byte high in each int16_t */
+        stq(unpkbw(d >> 32) | (unpkbw(signs >> 32) << 8), block + 4);
+
+        s1 += stride;
+        s2 += stride;
+        block += 8;
+    } while (--h);
+}
+
+av_cold void ff_pixblockdsp_init_alpha(PixblockDSPContext *c, AVCodecContext *avctx,
+                                       unsigned high_bit_depth)
+{   /* Installs the MVI pixel-block routines into c; avctx is not used. */
+    if (amask(AMASK_MVI) == 0) { /* same MVI test as in ff_me_cmp_init_alpha() */
+        if (!high_bit_depth) /* get_pixels_mvi reads uint8_t samples, so it is skipped for high bit depth */
+            c->get_pixels = get_pixels_mvi;
+        c->diff_pixels = diff_pixels_mvi; /* installed regardless of high_bit_depth */
+    }
+}
diff --git a/libavcodec/alpha/regdef.h b/libavcodec/alpha/regdef.h
new file mode 100644
index 0000000..f05577a
--- /dev/null
+++ b/libavcodec/alpha/regdef.h
@@ -0,0 +1,77 @@
+/*
+ * Alpha optimized DSP utils
+ * copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* Some BSDs don't seem to have regdef.h... sigh  */
+#ifndef AVCODEC_ALPHA_REGDEF_H
+#define AVCODEC_ALPHA_REGDEF_H
+
+#define v0      $0      /* function return value */
+
+#define t0      $1      /* temporary registers (caller-saved) */
+#define t1      $2
+#define t2      $3
+#define t3      $4
+#define t4      $5
+#define t5      $6
+#define t6      $7
+#define t7      $8
+
+#define s0      $9      /* saved-registers (callee-saved registers) */
+#define s1      $10
+#define s2      $11
+#define s3      $12
+#define s4      $13
+#define s5      $14
+#define s6      $15
+#define fp      s6      /* frame-pointer (s6 in frame-less procedures) */
+
+#define a0      $16     /* argument registers (caller-saved) */
+#define a1      $17
+#define a2      $18
+#define a3      $19
+#define a4      $20
+#define a5      $21
+
+#define t8      $22     /* more temps (caller-saved) */
+#define t9      $23
+#define t10     $24
+#define t11     $25
+#define ra      $26     /* return address register */
+#define t12     $27
+
+#define pv      t12     /* procedure-variable register */
+#define AT      $at     /* assembler temporary */
+#define gp      $29     /* global pointer */
+#define sp      $30     /* stack pointer */
+#define zero    $31     /* reads as zero, writes are noops */
+
+/* Some nicer register names (me_cmp_mvi_asm.S defines the same aliases again).  */
+#define ta t10
+#define tb t11
+#define tc t12          /* same register as pv */
+#define td AT
+/* Danger: these overlap with the argument list and the return value */
+#define te a5           /* sixth argument register */
+#define tf a4           /* fifth argument register */
+#define tg a3           /* fourth argument register */
+#define th v0           /* return value register */
+
+#endif /* AVCODEC_ALPHA_REGDEF_H */
diff --git a/libavcodec/alpha/simple_idct_alpha.c b/libavcodec/alpha/simple_idct_alpha.c
new file mode 100644
index 0000000..04be0ce
--- /dev/null
+++ b/libavcodec/alpha/simple_idct_alpha.c
@@ -0,0 +1,303 @@
+/*
+ * Simple IDCT (Alpha optimized)
+ *
+ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * based upon some outcommented C code from mpeg2dec (idct_mmx.c
+ * written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>)
+ *
+ * Alpha optimizations by Måns Rullgård <mans@mansr.com>
+ *                     and Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "idctdsp_alpha.h"
+#include "asm.h"
+
+// Wi = cos(i * M_PI / 16) * sqrt(2) * (1 << 14), rounded to an integer
+// W4 is actually exactly 16384, but using 16383 works around
+// accumulating rounding errors for some encoders
+#define W1 22725
+#define W2 21407
+#define W3 19266
+#define W4 16383
+#define W5 12873
+#define W6  8867
+#define W7  4520
+#define ROW_SHIFT 11 /* right shift applied to the results of the row pass */
+#define COL_SHIFT 20 /* right shift applied to the results of the column pass */
+
+/* 0: all entries 0, 1: only first entry nonzero, 2: otherwise  */
+static inline int idct_row(int16_t *row)
+{   /* In-place row pass over row[0..7]; the return value classifies the INPUT row as listed above. */
+    int a0, a1, a2, a3, b0, b1, b2, b3, t;
+    uint64_t l, r, t2;
+    l = ldq(row);     /* row[0..3] packed into one quadword */
+    r = ldq(row + 4); /* row[4..7] */
+
+    if (l == 0 && r == 0) /* nothing to do: the row stays all zero */
+        return 0;
+
+    a0 = W4 * sextw(l) + (1 << (ROW_SHIFT - 1)); /* row[0] term plus the rounding bias for >> ROW_SHIFT */
+
+    if (((l & ~0xffffUL) | r) == 0) { /* row[1..7] are all zero */
+        a0 >>= ROW_SHIFT;
+        t2 = (uint16_t) a0;
+        t2 |= t2 << 16;
+        t2 |= t2 << 32; /* the same 16-bit value in all four lanes */
+
+        stq(t2, row);
+        stq(t2, row + 4);
+        return 1;
+    }
+
+    a1 = a0; /* even part: a0..a3 all start from the row[0] term */
+    a2 = a0;
+    a3 = a0;
+
+    t = extwl(l, 4);            /* row[2] */
+    if (t != 0) {
+        t = sextw(t);
+        a0 += W2 * t;
+        a1 += W6 * t;
+        a2 -= W6 * t;
+        a3 -= W2 * t;
+    }
+
+    t = extwl(r, 0);            /* row[4] */
+    if (t != 0) {
+        t = sextw(t);
+        a0 += W4 * t;
+        a1 -= W4 * t;
+        a2 -= W4 * t;
+        a3 += W4 * t;
+    }
+
+    t = extwl(r, 4);            /* row[6] */
+    if (t != 0) {
+        t = sextw(t);
+        a0 += W6 * t;
+        a1 -= W2 * t;
+        a2 += W2 * t;
+        a3 -= W6 * t;
+    }
+
+    t = extwl(l, 2);            /* row[1] */
+    if (t != 0) {
+        t = sextw(t);
+        b0 = W1 * t;
+        b1 = W3 * t;
+        b2 = W5 * t;
+        b3 = W7 * t;
+    } else { /* odd part b0..b3 starts from zero when row[1] is zero */
+        b0 = 0;
+        b1 = 0;
+        b2 = 0;
+        b3 = 0;
+    }
+
+    t = extwl(l, 6);            /* row[3] */
+    if (t) {
+        t = sextw(t);
+        b0 += W3 * t;
+        b1 -= W7 * t;
+        b2 -= W1 * t;
+        b3 -= W5 * t;
+    }
+
+
+    t = extwl(r, 2);            /* row[5] */
+    if (t) {
+        t = sextw(t);
+        b0 += W5 * t;
+        b1 -= W1 * t;
+        b2 += W7 * t;
+        b3 += W3 * t;
+    }
+
+    t = extwl(r, 6);            /* row[7] */
+    if (t) {
+        t = sextw(t);
+        b0 += W7 * t;
+        b1 -= W5 * t;
+        b2 += W3 * t;
+        b3 -= W1 * t;
+    }
+
+    row[0] = (a0 + b0) >> ROW_SHIFT; /* outputs k and 7-k are the sum and difference of the same a/b pair */
+    row[1] = (a1 + b1) >> ROW_SHIFT;
+    row[2] = (a2 + b2) >> ROW_SHIFT;
+    row[3] = (a3 + b3) >> ROW_SHIFT;
+    row[4] = (a3 - b3) >> ROW_SHIFT;
+    row[5] = (a2 - b2) >> ROW_SHIFT;
+    row[6] = (a1 - b1) >> ROW_SHIFT;
+    row[7] = (a0 - b0) >> ROW_SHIFT;
+
+    return 2;
+}
+
+static inline void idct_col(int16_t *col)
+{   /* In-place column pass over col[0], col[8], ..., col[56] (stride of 8 int16_t). */
+    int a0, a1, a2, a3, b0, b1, b2, b3;
+
+    col[0] += (1 << (COL_SHIFT - 1)) / W4; /* adds 32: multiplied by W4 below it serves as the rounding bias */
+
+    a0 = W4 * col[8 * 0]; /* even part: all four start from the biased col[0] term */
+    a1 = W4 * col[8 * 0];
+    a2 = W4 * col[8 * 0];
+    a3 = W4 * col[8 * 0];
+
+    if (col[8 * 2]) { /* zero inputs are skipped throughout to save the multiplies */
+        a0 += W2 * col[8 * 2];
+        a1 += W6 * col[8 * 2];
+        a2 -= W6 * col[8 * 2];
+        a3 -= W2 * col[8 * 2];
+    }
+
+    if (col[8 * 4]) {
+        a0 += W4 * col[8 * 4];
+        a1 -= W4 * col[8 * 4];
+        a2 -= W4 * col[8 * 4];
+        a3 += W4 * col[8 * 4];
+    }
+
+    if (col[8 * 6]) {
+        a0 += W6 * col[8 * 6];
+        a1 -= W2 * col[8 * 6];
+        a2 += W2 * col[8 * 6];
+        a3 -= W6 * col[8 * 6];
+    }
+
+    if (col[8 * 1]) {
+        b0 = W1 * col[8 * 1];
+        b1 = W3 * col[8 * 1];
+        b2 = W5 * col[8 * 1];
+        b3 = W7 * col[8 * 1];
+    } else { /* odd part b0..b3 starts from zero when col[8] is zero */
+        b0 = 0;
+        b1 = 0;
+        b2 = 0;
+        b3 = 0;
+    }
+
+    if (col[8 * 3]) {
+        b0 += W3 * col[8 * 3];
+        b1 -= W7 * col[8 * 3];
+        b2 -= W1 * col[8 * 3];
+        b3 -= W5 * col[8 * 3];
+    }
+
+    if (col[8 * 5]) {
+        b0 += W5 * col[8 * 5];
+        b1 -= W1 * col[8 * 5];
+        b2 += W7 * col[8 * 5];
+        b3 += W3 * col[8 * 5];
+    }
+
+    if (col[8 * 7]) {
+        b0 += W7 * col[8 * 7];
+        b1 -= W5 * col[8 * 7];
+        b2 += W3 * col[8 * 7];
+        b3 -= W1 * col[8 * 7];
+    }
+
+    col[8 * 0] = (a0 + b0) >> COL_SHIFT; /* outputs k and 7-k are the sum and difference of the same a/b pair */
+    col[8 * 7] = (a0 - b0) >> COL_SHIFT;
+    col[8 * 1] = (a1 + b1) >> COL_SHIFT;
+    col[8 * 6] = (a1 - b1) >> COL_SHIFT;
+    col[8 * 2] = (a2 + b2) >> COL_SHIFT;
+    col[8 * 5] = (a2 - b2) >> COL_SHIFT;
+    col[8 * 3] = (a3 + b3) >> COL_SHIFT;
+    col[8 * 4] = (a3 - b3) >> COL_SHIFT;
+}
+
+/* If all rows but the first one are zero after row transformation,
+   all rows will be identical after column transformation.  */
+static inline void idct_col2(int16_t *col)
+{   /* col points at the start of the 8x8 block: row 0 is transformed, then copied into rows 1..7. */
+    int i;
+    uint64_t l, r;
+
+    for (i = 0; i < 8; ++i) { /* each entry of row 0 is the only nonzero input of its column */
+        int a0 = col[i] + (1 << (COL_SHIFT - 1)) / W4; /* same +32 bias as in idct_col() */
+
+        a0 *= W4;
+        col[i] = a0 >> COL_SHIFT;
+    }
+
+    l = ldq(col + 0 * 4); r = ldq(col + 1 * 4); /* finished row 0 as two quadwords */
+    stq(l, col +  2 * 4); stq(r, col +  3 * 4); /* row 1 */
+    stq(l, col +  4 * 4); stq(r, col +  5 * 4); /* row 2 */
+    stq(l, col +  6 * 4); stq(r, col +  7 * 4); /* row 3 */
+    stq(l, col +  8 * 4); stq(r, col +  9 * 4); /* row 4 */
+    stq(l, col + 10 * 4); stq(r, col + 11 * 4); /* row 5 */
+    stq(l, col + 12 * 4); stq(r, col + 13 * 4); /* row 6 */
+    stq(l, col + 14 * 4); stq(r, col + 15 * 4); /* row 7 */
+}
+
+void ff_simple_idct_axp(int16_t *block)
+{   /* In-place 8x8 inverse DCT of block[0..63]: eight row passes, then a column pass picked by row sparseness. */
+
+    int i;
+    int rowsZero = 1;           /* all rows except row 0 zero */
+    int rowsConstant = 1;       /* all rows consist of a constant value */
+
+    for (i = 0; i < 8; i++) {
+        int sparseness = idct_row(block + 8 * i); /* 0, 1 or 2, see idct_row() */
+
+        if (i > 0 && sparseness > 0)
+            rowsZero = 0;
+        if (sparseness == 2)
+            rowsConstant = 0;
+    }
+
+    if (rowsZero) {
+        idct_col2(block); /* transform row 0 only and replicate it */
+    } else if (rowsConstant) {
+        idct_col(block); /* every row is constant, so transforming column 0 is enough */
+        for (i = 0; i < 8; i += 2) { /* two rows per iteration */
+            uint64_t v = (uint16_t) block[0]; /* column-0 value of the first row of the pair */
+            uint64_t w = (uint16_t) block[8]; /* column-0 value of the second row of the pair */
+
+            v |= v << 16;
+            w |= w << 16;
+            v |= v << 32;
+            w |= w << 32;
+            stq(v, block + 0 * 4); /* fill both rows with their replicated value */
+            stq(v, block + 1 * 4);
+            stq(w, block + 2 * 4);
+            stq(w, block + 3 * 4);
+            block += 4 * 4; /* advance by two rows (16 coefficients) */
+        }
+    } else {
+        for (i = 0; i < 8; i++) /* general case: full column pass on each of the 8 columns */
+            idct_col(block + i);
+    }
+}
+
+void ff_simple_idct_put_axp(uint8_t *dest, int line_size, int16_t *block)
+{   /* Transforms block in place, then hands it to put_pixels_clamped_axp_p together with dest and line_size. */
+    ff_simple_idct_axp(block);
+    put_pixels_clamped_axp_p(block, dest, line_size); /* declared outside this file; presumably stores the clamped result -- confirm */
+}
+
+void ff_simple_idct_add_axp(uint8_t *dest, int line_size, int16_t *block)
+{   /* Transforms block in place, then hands it to add_pixels_clamped_axp_p together with dest and line_size. */
+    ff_simple_idct_axp(block);
+    add_pixels_clamped_axp_p(block, dest, line_size); /* declared outside this file; presumably adds the result to dest with clamping -- confirm */
+}
diff --git a/libavcodec/alsdec.c b/libavcodec/alsdec.c
index f356a70..a7e58a2 100644
--- a/libavcodec/alsdec.c
+++ b/libavcodec/alsdec.c
@@ -1,28 +1,28 @@
 /*
  * MPEG-4 ALS decoder
- * Copyright (c) 2009 Thilo Borgmann <thilo.borgmann _at_ googlemail.com>
+ * Copyright (c) 2009 Thilo Borgmann <thilo.borgmann _at_ mail.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
  * MPEG-4 ALS decoder
- * @author Thilo Borgmann <thilo.borgmann _at_ googlemail.com>
+ * @author Thilo Borgmann <thilo.borgmann _at_ mail.de>
  */
 
 #include <inttypes.h>
@@ -199,6 +199,7 @@ typedef struct ALSDecContext {
     unsigned int cur_frame_length;  ///< length of the current frame to decode
     unsigned int frame_id;          ///< the frame ID / number of the current frame
     unsigned int js_switch;         ///< if true, joint-stereo decoding is enforced
+    unsigned int cs_switch;         ///< if true, channel rearrangement is done
     unsigned int num_blocks;        ///< number of blocks used in the current frame
     unsigned int s_max;             ///< maximum Rice parameter allowed in entropy coding
     uint8_t *bgmc_lut;              ///< pointer at lookup tables used for BGMC
@@ -281,12 +282,14 @@ static av_cold int read_specific_config(ALSDecContext *ctx)
     GetBitContext gb;
     uint64_t ht_size;
     int i, config_offset;
-    MPEG4AudioConfig m4ac;
+    MPEG4AudioConfig m4ac = {0};
     ALSSpecificConfig *sconf = &ctx->sconf;
     AVCodecContext *avctx    = ctx->avctx;
     uint32_t als_id, header_size, trailer_size;
+    int ret;
 
-    init_get_bits(&gb, avctx->extradata, avctx->extradata_size * 8);
+    if ((ret = init_get_bits8(&gb, avctx->extradata, avctx->extradata_size)) < 0)
+        return ret;
 
     config_offset = avpriv_mpeg4audio_get_config(&m4ac, avctx->extradata,
                                                  avctx->extradata_size * 8, 1);
@@ -349,16 +352,28 @@ static av_cold int read_specific_config(ALSDecContext *ctx)
         if (get_bits_left(&gb) < bits_needed)
             return AVERROR_INVALIDDATA;
 
-        if (!(sconf->chan_pos = av_malloc(avctx->channels * sizeof(*sconf->chan_pos))))
+        if (!(sconf->chan_pos = av_malloc_array(avctx->channels, sizeof(*sconf->chan_pos))))
             return AVERROR(ENOMEM);
 
-        for (i = 0; i < avctx->channels; i++)
-            sconf->chan_pos[i] = get_bits(&gb, chan_pos_bits);
+        ctx->cs_switch = 1;
+
+        for (i = 0; i < avctx->channels; i++) {
+            sconf->chan_pos[i] = -1;
+        }
+
+        for (i = 0; i < avctx->channels; i++) {
+            int idx;
+
+            idx = get_bits(&gb, chan_pos_bits);
+            if (idx >= avctx->channels || sconf->chan_pos[idx] != -1) {
+                av_log(avctx, AV_LOG_WARNING, "Invalid channel reordering.\n");
+                ctx->cs_switch = 0;
+                break;
+            }
+            sconf->chan_pos[idx] = i;
+        }
 
         align_get_bits(&gb);
-        // TODO: use this to actually do channel sorting
-    } else {
-        sconf->chan_sort = 0;
     }
 
 
@@ -392,7 +407,7 @@ static av_cold int read_specific_config(ALSDecContext *ctx)
         if (get_bits_left(&gb) < 32)
             return AVERROR_INVALIDDATA;
 
-        if (avctx->err_recognition & AV_EF_CRCCHECK) {
+        if (avctx->err_recognition & (AV_EF_CRCCHECK|AV_EF_CAREFUL)) {
             ctx->crc_table = av_crc_get_table(AV_CRC_32_IEEE_LE);
             ctx->crc       = 0xFFFFFFFF;
             ctx->crc_org   = ~get_bits_long(&gb, 32);
@@ -428,7 +443,6 @@ static int check_specific_config(ALSDecContext *ctx)
 
     MISSING_ERR(sconf->floating,  "Floating point decoding",     AVERROR_PATCHWELCOME);
     MISSING_ERR(sconf->rlslms,    "Adaptive RLS-LMS prediction", AVERROR_PATCHWELCOME);
-    MISSING_ERR(sconf->chan_sort, "Channel sorting",             0);
 
     return error;
 }
@@ -551,12 +565,15 @@ static void get_block_sizes(ALSDecContext *ctx, unsigned int *div_blocks,
 
 /** Read the block data for a constant block
  */
-static void read_const_block_data(ALSDecContext *ctx, ALSBlockData *bd)
+static int read_const_block_data(ALSDecContext *ctx, ALSBlockData *bd)
 {
     ALSSpecificConfig *sconf = &ctx->sconf;
     AVCodecContext *avctx    = ctx->avctx;
     GetBitContext *gb        = &ctx->gb;
 
+    if (bd->block_length <= 0)
+        return AVERROR_INVALIDDATA;
+
     *bd->raw_samples = 0;
     *bd->const_block = get_bits1(gb);    // 1 = constant value, 0 = zero block (silence)
     bd->js_blocks    = get_bits1(gb);
@@ -571,6 +588,8 @@ static void read_const_block_data(ALSDecContext *ctx, ALSBlockData *bd)
 
     // ensure constant block decoding by reusing this field
     *bd->const_block = 1;
+
+    return 0;
 }
 
 
@@ -669,13 +688,17 @@ static int read_var_block_data(ALSDecContext *ctx, ALSBlockData *bd)
             *bd->opt_order       = get_bits(gb, opt_order_length);
             if (*bd->opt_order > sconf->max_order) {
                 *bd->opt_order = sconf->max_order;
-                av_log(avctx, AV_LOG_ERROR, "Predictor order too large!\n");
+                av_log(avctx, AV_LOG_ERROR, "Predictor order too large.\n");
                 return AVERROR_INVALIDDATA;
             }
         } else {
             *bd->opt_order = sconf->max_order;
         }
-
+        if (*bd->opt_order > bd->block_length) {
+            *bd->opt_order = bd->block_length;
+            av_log(avctx, AV_LOG_ERROR, "Predictor order too large.\n");
+            return AVERROR_INVALIDDATA;
+        }
         opt_order = *bd->opt_order;
 
         if (opt_order) {
@@ -706,7 +729,7 @@ static int read_var_block_data(ALSDecContext *ctx, ALSBlockData *bd)
                     quant_cof[k] = decode_rice(gb, rice_param) + offset;
                     if (quant_cof[k] < -64 || quant_cof[k] > 63) {
                         av_log(avctx, AV_LOG_ERROR,
-                               "quant_cof %"PRIu32" is out of range\n",
+                               "quant_cof %"PRId32" is out of range.\n",
                                quant_cof[k]);
                         return AVERROR_INVALIDDATA;
                     }
@@ -742,8 +765,13 @@ static int read_var_block_data(ALSDecContext *ctx, ALSBlockData *bd)
             bd->ltp_gain[0]   = decode_rice(gb, 1) << 3;
             bd->ltp_gain[1]   = decode_rice(gb, 2) << 3;
 
-            r                 = get_unary(gb, 0, 3);
+            r                 = get_unary(gb, 0, 4);
             c                 = get_bits(gb, 2);
+            if (r >= 4) {
+                av_log(avctx, AV_LOG_ERROR, "r overflow\n");
+                return AVERROR_INVALIDDATA;
+            }
+
             bd->ltp_gain[2]   = ltp_gain_values[r][c];
 
             bd->ltp_gain[3]   = decode_rice(gb, 2) << 3;
@@ -844,9 +872,6 @@ static int read_var_block_data(ALSDecContext *ctx, ALSBlockData *bd)
                 *current_res++ = decode_rice(gb, s[sb]);
      }
 
-    if (!sconf->mc_coding || ctx->js_switch)
-        align_get_bits(gb);
-
     return 0;
 }
 
@@ -964,17 +989,21 @@ static int decode_var_block_data(ALSDecContext *ctx, ALSBlockData *bd)
  */
 static int read_block(ALSDecContext *ctx, ALSBlockData *bd)
 {
-    int ret = 0;
+    int ret;
     GetBitContext *gb        = &ctx->gb;
+    ALSSpecificConfig *sconf = &ctx->sconf;
 
     *bd->shift_lsbs = 0;
     // read block type flag and read the samples accordingly
     if (get_bits1(gb)) {
         ret = read_var_block_data(ctx, bd);
     } else {
-        read_const_block_data(ctx, bd);
+        ret = read_const_block_data(ctx, bd);
     }
 
+    if (!sconf->mc_coding || ctx->js_switch)
+        align_get_bits(gb);
+
     return ret;
 }
 
@@ -1026,8 +1055,8 @@ static void zero_remaining(unsigned int b, unsigned int b_max,
 {
     unsigned int count = 0;
 
-    for (; b < b_max; b++)
-        count += div_blocks[b];
+    while (b < b_max)
+        count += div_blocks[b++];
 
     if (count)
         memset(buf, 0, sizeof(*buf) * count);
@@ -1132,7 +1161,7 @@ static int decode_blocks(ALSDecContext *ctx, unsigned int ra_frame,
         // reconstruct joint-stereo blocks
         if (bd[0].js_blocks) {
             if (bd[1].js_blocks)
-                av_log(ctx->avctx, AV_LOG_WARNING, "Invalid channel pair!\n");
+                av_log(ctx->avctx, AV_LOG_WARNING, "Invalid channel pair.\n");
 
             for (s = 0; s < div_blocks[b]; s++)
                 bd[0].raw_samples[s] = bd[1].raw_samples[s] - bd[0].raw_samples[s];
@@ -1180,7 +1209,7 @@ static int read_channel_data(ALSDecContext *ctx, ALSChannelData *cd, int c)
         current->master_channel = get_bits_long(gb, av_ceil_log2(channels));
 
         if (current->master_channel >= channels) {
-            av_log(ctx->avctx, AV_LOG_ERROR, "Invalid master channel!\n");
+            av_log(ctx->avctx, AV_LOG_ERROR, "Invalid master channel.\n");
             return AVERROR_INVALIDDATA;
         }
 
@@ -1205,7 +1234,7 @@ static int read_channel_data(ALSDecContext *ctx, ALSChannelData *cd, int c)
     }
 
     if (entries == channels) {
-        av_log(ctx->avctx, AV_LOG_ERROR, "Damaged channel data!\n");
+        av_log(ctx->avctx, AV_LOG_ERROR, "Damaged channel data.\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -1238,7 +1267,7 @@ static int revert_channel_correlation(ALSDecContext *ctx, ALSBlockData *bd,
     }
 
     if (dep == channels) {
-        av_log(ctx->avctx, AV_LOG_WARNING, "Invalid channel correlation!\n");
+        av_log(ctx->avctx, AV_LOG_WARNING, "Invalid channel correlation.\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -1253,21 +1282,31 @@ static int revert_channel_correlation(ALSDecContext *ctx, ALSBlockData *bd,
     bd->quant_cof   = ctx->quant_cof[c];
     bd->raw_samples = ctx->raw_samples[c] + offset;
 
-    dep = 0;
-    while (!ch[dep].stop_flag) {
+    for (dep = 0; !ch[dep].stop_flag; dep++) {
         ptrdiff_t smp;
         ptrdiff_t begin = 1;
         ptrdiff_t end   = bd->block_length - 1;
         int64_t y;
         int32_t *master = ctx->raw_samples[ch[dep].master_channel] + offset;
 
+        if (ch[dep].master_channel == c)
+            continue;
+
         if (ch[dep].time_diff_flag) {
             int t = ch[dep].time_diff_index;
 
             if (ch[dep].time_diff_sign) {
                 t      = -t;
+                if (begin < t) {
+                    av_log(ctx->avctx, AV_LOG_ERROR, "begin %"PTRDIFF_SPECIFIER" smaller than time diff index %d.\n", begin, t);
+                    return AVERROR_INVALIDDATA;
+                }
                 begin -= t;
             } else {
+                if (end < t) {
+                    av_log(ctx->avctx, AV_LOG_ERROR, "end %"PTRDIFF_SPECIFIER" smaller than time diff index %d.\n", end, t);
+                    return AVERROR_INVALIDDATA;
+                }
                 end   -= t;
             }
 
@@ -1311,8 +1350,6 @@ static int revert_channel_correlation(ALSDecContext *ctx, ALSBlockData *bd,
                 bd->raw_samples[smp] += y >> 7;
             }
         }
-
-        dep++;
     }
 
     return 0;
@@ -1387,7 +1424,7 @@ static int read_frame_data(ALSDecContext *ctx, unsigned int ra_frame)
 
         for (c = 0; c < avctx->channels; c++)
             if (ctx->chan_data[c] < ctx->chan_data_buffer) {
-                av_log(ctx->avctx, AV_LOG_ERROR, "Invalid channel data!\n");
+                av_log(ctx->avctx, AV_LOG_ERROR, "Invalid channel data.\n");
                 return AVERROR_INVALIDDATA;
             }
 
@@ -1443,6 +1480,7 @@ static int read_frame_data(ALSDecContext *ctx, unsigned int ra_frame)
                 bd.lpc_cof     = ctx->lpc_cof[c];
                 bd.quant_cof   = ctx->quant_cof[c];
                 bd.raw_samples = ctx->raw_samples[c] + offset;
+
                 if ((ret = decode_block(ctx, &bd)) < 0)
                     return ret;
             }
@@ -1461,6 +1499,11 @@ static int read_frame_data(ALSDecContext *ctx, unsigned int ra_frame)
 
     // TODO: read_diff_float_data
 
+    if (get_bits_left(gb) < 0) {
+        av_log(ctx->avctx, AV_LOG_ERROR, "Overread %d\n", -get_bits_left(gb));
+        return AVERROR_INVALIDDATA;
+    }
+
     return 0;
 }
 
@@ -1478,7 +1521,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame_ptr,
     int invalid_frame, ret;
     unsigned int c, sample, ra_frame, bytes_read, shift;
 
-    init_get_bits(&ctx->gb, buffer, buffer_size * 8);
+    if ((ret = init_get_bits8(&ctx->gb, buffer, buffer_size)) < 0)
+        return ret;
 
     // In the case that the distance between random access frames is set to zero
     // (sconf->ra_distance == 0) no frame is treated as a random access frame.
@@ -1502,19 +1546,23 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame_ptr,
 
     /* get output buffer */
     frame->nb_samples = ctx->cur_frame_length;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     // transform decoded frame into output format
-    #define INTERLEAVE_OUTPUT(bps)                                 \
-    {                                                              \
-        int##bps##_t *dest = (int##bps##_t*)frame->data[0];        \
-        shift = bps - ctx->avctx->bits_per_raw_sample;             \
-        for (sample = 0; sample < ctx->cur_frame_length; sample++) \
-            for (c = 0; c < avctx->channels; c++)                  \
-                *dest++ = ctx->raw_samples[c][sample] << shift;    \
+    #define INTERLEAVE_OUTPUT(bps)                                                   \
+    {                                                                                \
+        int##bps##_t *dest = (int##bps##_t*)frame->data[0];                          \
+        shift = bps - ctx->avctx->bits_per_raw_sample;                               \
+        if (!ctx->cs_switch) {                                                       \
+            for (sample = 0; sample < ctx->cur_frame_length; sample++)               \
+                for (c = 0; c < avctx->channels; c++)                                \
+                    *dest++ = ctx->raw_samples[c][sample] << shift;                  \
+        } else {                                                                     \
+            for (sample = 0; sample < ctx->cur_frame_length; sample++)               \
+                for (c = 0; c < avctx->channels; c++)                                \
+                    *dest++ = ctx->raw_samples[sconf->chan_pos[c]][sample] << shift; \
+        }                                                                            \
     }
 
     if (ctx->avctx->bits_per_raw_sample <= 16) {
@@ -1524,7 +1572,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame_ptr,
     }
 
     // update CRC
-    if (sconf->crc_enabled && (avctx->err_recognition & AV_EF_CRCCHECK)) {
+    if (sconf->crc_enabled && (avctx->err_recognition & (AV_EF_CRCCHECK|AV_EF_CAREFUL))) {
         int swap = HAVE_BIGENDIAN != sconf->msb_first;
 
         if (ctx->avctx->bits_per_raw_sample == 24) {
@@ -1681,14 +1729,14 @@ static av_cold int decode_init(AVCodecContext *avctx)
     // allocate quantized parcor coefficient buffer
     num_buffers = sconf->mc_coding ? avctx->channels : 1;
 
-    ctx->quant_cof        = av_malloc(sizeof(*ctx->quant_cof) * num_buffers);
-    ctx->lpc_cof          = av_malloc(sizeof(*ctx->lpc_cof)   * num_buffers);
-    ctx->quant_cof_buffer = av_malloc(sizeof(*ctx->quant_cof_buffer) *
-                                      num_buffers * sconf->max_order);
-    ctx->lpc_cof_buffer   = av_malloc(sizeof(*ctx->lpc_cof_buffer) *
-                                      num_buffers * sconf->max_order);
-    ctx->lpc_cof_reversed_buffer = av_malloc(sizeof(*ctx->lpc_cof_buffer) *
-                                             sconf->max_order);
+    ctx->quant_cof        = av_malloc_array(num_buffers, sizeof(*ctx->quant_cof));
+    ctx->lpc_cof          = av_malloc_array(num_buffers, sizeof(*ctx->lpc_cof));
+    ctx->quant_cof_buffer = av_malloc_array(num_buffers * sconf->max_order,
+                                            sizeof(*ctx->quant_cof_buffer));
+    ctx->lpc_cof_buffer   = av_malloc_array(num_buffers * sconf->max_order,
+                                            sizeof(*ctx->lpc_cof_buffer));
+    ctx->lpc_cof_reversed_buffer = av_malloc_array(sconf->max_order,
+                                                   sizeof(*ctx->lpc_cof_buffer));
 
     if (!ctx->quant_cof              || !ctx->lpc_cof        ||
         !ctx->quant_cof_buffer       || !ctx->lpc_cof_buffer ||
@@ -1705,15 +1753,14 @@ static av_cold int decode_init(AVCodecContext *avctx)
     }
 
     // allocate and assign lag and gain data buffer for ltp mode
-    ctx->const_block     = av_malloc (sizeof(*ctx->const_block) * num_buffers);
-    ctx->shift_lsbs      = av_malloc (sizeof(*ctx->shift_lsbs)  * num_buffers);
-    ctx->opt_order       = av_malloc (sizeof(*ctx->opt_order)   * num_buffers);
-    ctx->store_prev_samples = av_malloc(sizeof(*ctx->store_prev_samples) * num_buffers);
-    ctx->use_ltp         = av_mallocz(sizeof(*ctx->use_ltp)  * num_buffers);
-    ctx->ltp_lag         = av_malloc (sizeof(*ctx->ltp_lag)  * num_buffers);
-    ctx->ltp_gain        = av_malloc (sizeof(*ctx->ltp_gain) * num_buffers);
-    ctx->ltp_gain_buffer = av_malloc (sizeof(*ctx->ltp_gain_buffer) *
-                                      num_buffers * 5);
+    ctx->const_block     = av_malloc_array(num_buffers, sizeof(*ctx->const_block));
+    ctx->shift_lsbs      = av_malloc_array(num_buffers, sizeof(*ctx->shift_lsbs));
+    ctx->opt_order       = av_malloc_array(num_buffers, sizeof(*ctx->opt_order));
+    ctx->store_prev_samples = av_malloc_array(num_buffers, sizeof(*ctx->store_prev_samples));
+    ctx->use_ltp         = av_mallocz_array(num_buffers, sizeof(*ctx->use_ltp));
+    ctx->ltp_lag         = av_malloc_array(num_buffers, sizeof(*ctx->ltp_lag));
+    ctx->ltp_gain        = av_malloc_array(num_buffers, sizeof(*ctx->ltp_gain));
+    ctx->ltp_gain_buffer = av_malloc_array(num_buffers * 5, sizeof(*ctx->ltp_gain_buffer));
 
     if (!ctx->const_block || !ctx->shift_lsbs ||
         !ctx->opt_order || !ctx->store_prev_samples ||
@@ -1729,12 +1776,12 @@ static av_cold int decode_init(AVCodecContext *avctx)
 
     // allocate and assign channel data buffer for mcc mode
     if (sconf->mc_coding) {
-        ctx->chan_data_buffer  = av_malloc(sizeof(*ctx->chan_data_buffer) *
-                                           num_buffers * num_buffers);
-        ctx->chan_data         = av_malloc(sizeof(*ctx->chan_data) *
-                                           num_buffers);
-        ctx->reverted_channels = av_malloc(sizeof(*ctx->reverted_channels) *
-                                           num_buffers);
+        ctx->chan_data_buffer  = av_mallocz_array(num_buffers * num_buffers,
+                                                 sizeof(*ctx->chan_data_buffer));
+        ctx->chan_data         = av_mallocz_array(num_buffers,
+                                                 sizeof(*ctx->chan_data));
+        ctx->reverted_channels = av_malloc_array(num_buffers,
+                                                 sizeof(*ctx->reverted_channels));
 
         if (!ctx->chan_data_buffer || !ctx->chan_data || !ctx->reverted_channels) {
             av_log(avctx, AV_LOG_ERROR, "Allocating buffer memory failed.\n");
@@ -1752,9 +1799,9 @@ static av_cold int decode_init(AVCodecContext *avctx)
 
     channel_size      = sconf->frame_length + sconf->max_order;
 
-    ctx->prev_raw_samples = av_malloc (sizeof(*ctx->prev_raw_samples) * sconf->max_order);
-    ctx->raw_buffer       = av_mallocz(sizeof(*ctx->     raw_buffer)  * avctx->channels * channel_size);
-    ctx->raw_samples      = av_malloc (sizeof(*ctx->     raw_samples) * avctx->channels);
+    ctx->prev_raw_samples = av_malloc_array(sconf->max_order, sizeof(*ctx->prev_raw_samples));
+    ctx->raw_buffer       = av_mallocz_array(avctx->channels * channel_size, sizeof(*ctx->raw_buffer));
+    ctx->raw_samples      = av_malloc_array(avctx->channels, sizeof(*ctx->raw_samples));
 
     // allocate previous raw sample buffer
     if (!ctx->prev_raw_samples || !ctx->raw_buffer|| !ctx->raw_samples) {
@@ -1770,11 +1817,11 @@ static av_cold int decode_init(AVCodecContext *avctx)
 
     // allocate crc buffer
     if (HAVE_BIGENDIAN != sconf->msb_first && sconf->crc_enabled &&
-        (avctx->err_recognition & AV_EF_CRCCHECK)) {
-        ctx->crc_buffer = av_malloc(sizeof(*ctx->crc_buffer) *
-                                    ctx->cur_frame_length *
-                                    avctx->channels *
-                                    av_get_bytes_per_sample(avctx->sample_fmt));
+        (avctx->err_recognition & (AV_EF_CRCCHECK|AV_EF_CAREFUL))) {
+        ctx->crc_buffer = av_malloc_array(ctx->cur_frame_length *
+                                          avctx->channels *
+                                          av_get_bytes_per_sample(avctx->sample_fmt),
+                                          sizeof(*ctx->crc_buffer));
         if (!ctx->crc_buffer) {
             av_log(avctx, AV_LOG_ERROR, "Allocating buffer memory failed.\n");
             ret = AVERROR(ENOMEM);
diff --git a/libavcodec/amr.h b/libavcodec/amr.h
index 676c963..727f8c3 100644
--- a/libavcodec/amr.h
+++ b/libavcodec/amr.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010 Marcelo Galvao Povoa
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,9 +28,9 @@
 #include "avcodec.h"
 
 #ifdef AMR_USE_16BIT_TABLES
-#define R_TABLE_TYPE uint16_t
+typedef uint16_t R_TABLE_TYPE;
 #else
-#define R_TABLE_TYPE uint8_t
+typedef uint8_t R_TABLE_TYPE;
 #endif
 
 /**
diff --git a/libavcodec/amrnbdata.h b/libavcodec/amrnbdata.h
index b7d1b89..435fd99 100644
--- a/libavcodec/amrnbdata.h
+++ b/libavcodec/amrnbdata.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2006-2007 Robert Swain
  * Copyright (c) 2009 Colin McQuillan
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -1655,10 +1655,10 @@ static const float ir_filter_medium[AMR_SUBFRAME_SIZE] = {
  0.016998,  0.023804, -0.041779,  0.025696,  0.019989,
 };
 
-static const float *ir_filters_lookup[2]           = {
+static const float * const ir_filters_lookup[2]           = {
     ir_filter_strong,           ir_filter_medium
 };
-static const float *ir_filters_lookup_MODE_7k95[2] = {
+static const float * const ir_filters_lookup_MODE_7k95[2] = {
     ir_filter_strong_MODE_7k95, ir_filter_medium
 };
 
diff --git a/libavcodec/amrnbdec.c b/libavcodec/amrnbdec.c
index e2b5641..ea299ac 100644
--- a/libavcodec/amrnbdec.c
+++ b/libavcodec/amrnbdec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2006-2007 Robert Swain
  * Copyright (c) 2009 Colin McQuillan
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,6 +47,8 @@
 #include "libavutil/float_dsp.h"
 #include "avcodec.h"
 #include "libavutil/common.h"
+#include "libavutil/avassert.h"
+#include "celp_math.h"
 #include "celp_filters.h"
 #include "acelp_filters.h"
 #include "acelp_vectors.h"
@@ -84,7 +86,7 @@
 /** Maximum sharpening factor
  *
  * The specification says 0.8, which should be 13107, but the reference C code
- * uses 13017 instead. (Amusingly the same applies to SHARP_MAX in bitexact G.729.)
+ * uses 13017 instead. (Amusingly the same applies to SHARP_MAX in g729dec.c.)
  */
 #define SHARP_MAX 0.79449462890625
 
@@ -136,6 +138,11 @@ typedef struct AMRContext {
 
     float samples_in[LP_FILTER_ORDER + AMR_SUBFRAME_SIZE]; ///< floating point samples
 
+    ACELPFContext                     acelpf_ctx; ///< context for filters for ACELP-based codecs
+    ACELPVContext                     acelpv_ctx; ///< context for vector operations for ACELP-based codecs
+    CELPFContext                       celpf_ctx; ///< context for filters for CELP-based codecs
+    CELPMContext                       celpm_ctx; ///< context for fixed point math operations
+
 } AMRContext;
 
 /** Double version of ff_weighted_vector_sumf() */
@@ -162,7 +169,8 @@ static av_cold int amrnb_decode_init(AVCodecContext *avctx)
 
     avctx->channels       = 1;
     avctx->channel_layout = AV_CH_LAYOUT_MONO;
-    avctx->sample_rate    = 8000;
+    if (!avctx->sample_rate)
+        avctx->sample_rate = 8000;
     avctx->sample_fmt     = AV_SAMPLE_FMT_FLT;
 
     // p->excitation always points to the same position in p->excitation_buf
@@ -176,6 +184,11 @@ static av_cold int amrnb_decode_init(AVCodecContext *avctx)
     for (i = 0; i < 4; i++)
         p->prediction_error[i] = MIN_ENERGY;
 
+    ff_acelp_filter_init(&p->acelpf_ctx);
+    ff_acelp_vectors_init(&p->acelpv_ctx);
+    ff_celp_filter_init(&p->celpf_ctx);
+    ff_celp_math_init(&p->celpm_ctx);
+
     return 0;
 }
 
@@ -219,15 +232,16 @@ static enum Mode unpack_bitstream(AMRContext *p, const uint8_t *buf,
  * Interpolate the LSF vector (used for fixed gain smoothing).
  * The interpolation is done over all four subframes even in MODE_12k2.
  *
+ * @param[in]     ctx       ACELP vector context providing weighted_vector_sumf()
  * @param[in,out] lsf_q     LSFs in [0,1] for each subframe
  * @param[in]     lsf_new   New LSFs in [0,1] for subframe 4
  */
-static void interpolate_lsf(float lsf_q[4][LP_FILTER_ORDER], float *lsf_new)
+static void interpolate_lsf(ACELPVContext *ctx, float lsf_q[4][LP_FILTER_ORDER], float *lsf_new)
 {
     int i;
 
     for (i = 0; i < 4; i++)
-        ff_weighted_vector_sumf(lsf_q[i], lsf_q[3], lsf_new,
+        ctx->weighted_vector_sumf(lsf_q[i], lsf_q[3], lsf_new,
                                 0.25 * (3 - i), 0.25 * (i + 1),
                                 LP_FILTER_ORDER);
 }
@@ -271,7 +285,7 @@ static void lsf2lsp_for_mode12k2(AMRContext *p, double lsp[LP_FILTER_ORDER],
     ff_set_min_dist_lsf(lsf_q, MIN_LSF_SPACING, LP_FILTER_ORDER);
 
     if (update)
-        interpolate_lsf(p->lsf_q, lsf_q);
+        interpolate_lsf(&p->acelpv_ctx, p->lsf_q, lsf_q);
 
     ff_acelp_lsf2lspd(lsp, lsf_q, LP_FILTER_ORDER);
 }
@@ -334,7 +348,7 @@ static void lsf2lsp_3(AMRContext *p)
     ff_set_min_dist_lsf(lsf_q, MIN_LSF_SPACING, LP_FILTER_ORDER);
 
     // store data for computing the next frame's LSFs
-    interpolate_lsf(p->lsf_q, lsf_q);
+    interpolate_lsf(&p->acelpv_ctx, p->lsf_q, lsf_q);
     memcpy(p->prev_lsf_r, lsf_r, LP_FILTER_ORDER * sizeof(*lsf_r));
 
     ff_acelp_lsf2lspd(p->lsp[3], lsf_q, LP_FILTER_ORDER);
@@ -385,22 +399,23 @@ static void decode_pitch_vector(AMRContext *p,
         decode_pitch_lag_1_6(&pitch_lag_int, &pitch_lag_frac,
                              amr_subframe->p_lag, p->pitch_lag_int,
                              subframe);
-    } else
+    } else {
         ff_decode_pitch_lag(&pitch_lag_int, &pitch_lag_frac,
                             amr_subframe->p_lag,
                             p->pitch_lag_int, subframe,
                             mode != MODE_4k75 && mode != MODE_5k15,
                             mode <= MODE_6k7 ? 4 : (mode == MODE_7k95 ? 5 : 6));
+        pitch_lag_frac *= 2;
+    }
 
     p->pitch_lag_int = pitch_lag_int; // store previous lag in a uint8_t
 
-    pitch_lag_frac <<= (p->cur_frame_mode != MODE_12k2);
-
     pitch_lag_int += pitch_lag_frac > 0;
 
     /* Calculate the pitch vector by interpolating the past excitation at the
        pitch lag using a b60 hamming windowed sinc function.   */
-    ff_acelp_interpolatef(p->excitation, p->excitation + 1 - pitch_lag_int,
+    p->acelpf_ctx.acelp_interpolatef(p->excitation,
+                          p->excitation + 1 - pitch_lag_int,
                           ff_b60_sinc, 6,
                           pitch_lag_frac + 6 - 6*(pitch_lag_frac > 0),
                           10, AMR_SUBFRAME_SIZE);
@@ -484,7 +499,7 @@ static void decode_8_pulses_31bits(const int16_t *fixed_index,
 static void decode_fixed_sparse(AMRFixed *fixed_sparse, const uint16_t *pulses,
                                 const enum Mode mode, const int subframe)
 {
-    assert(MODE_4k75 <= mode && mode <= MODE_12k2);
+    av_assert1(MODE_4k75 <= (signed)mode && mode <= MODE_12k2);
 
     if (mode == MODE_12k2) {
         ff_decode_10_pulses_35bits(pulses, fixed_sparse, gray_decode, 5, 3);
@@ -785,12 +800,12 @@ static int synthesis(AMRContext *p, float *lpc,
         for (i = 0; i < AMR_SUBFRAME_SIZE; i++)
             p->pitch_vector[i] *= 0.25;
 
-    ff_weighted_vector_sumf(excitation, p->pitch_vector, fixed_vector,
+    p->acelpv_ctx.weighted_vector_sumf(excitation, p->pitch_vector, fixed_vector,
                             p->pitch_gain[4], fixed_gain, AMR_SUBFRAME_SIZE);
 
     // emphasize pitch vector contribution
     if (p->pitch_gain[4] > 0.5 && !overflow) {
-        float energy = avpriv_scalarproduct_float_c(excitation, excitation,
+        float energy = p->celpm_ctx.dot_productf(excitation, excitation,
                                                     AMR_SUBFRAME_SIZE);
         float pitch_factor =
             p->pitch_gain[4] *
@@ -805,7 +820,8 @@ static int synthesis(AMRContext *p, float *lpc,
                                                 AMR_SUBFRAME_SIZE);
     }
 
-    ff_celp_lp_synthesis_filterf(samples, lpc, excitation, AMR_SUBFRAME_SIZE,
+    p->celpf_ctx.celp_lp_synthesis_filterf(samples, lpc, excitation,
+                                 AMR_SUBFRAME_SIZE,
                                  LP_FILTER_ORDER);
 
     // detect overflow
@@ -851,10 +867,11 @@ static void update_state(AMRContext *p)
 /**
  * Get the tilt factor of a formant filter from its transfer function
  *
+ * @param p     AMR decoder context (its celpf_ctx and celpm_ctx routines are used)
  * @param lpc_n LP_FILTER_ORDER coefficients of the numerator
  * @param lpc_d LP_FILTER_ORDER coefficients of the denominator
  */
-static float tilt_factor(float *lpc_n, float *lpc_d)
+static float tilt_factor(AMRContext *p, float *lpc_n, float *lpc_d)
 {
     float rh0, rh1; // autocorrelation at lag 0 and 1
 
@@ -864,11 +881,12 @@ static float tilt_factor(float *lpc_n, float *lpc_d)
 
     hf[0] = 1.0;
     memcpy(hf + 1, lpc_n, sizeof(float) * LP_FILTER_ORDER);
-    ff_celp_lp_synthesis_filterf(hf, lpc_d, hf, AMR_TILT_RESPONSE,
+    p->celpf_ctx.celp_lp_synthesis_filterf(hf, lpc_d, hf,
+                                 AMR_TILT_RESPONSE,
                                  LP_FILTER_ORDER);
 
-    rh0 = avpriv_scalarproduct_float_c(hf, hf,     AMR_TILT_RESPONSE);
-    rh1 = avpriv_scalarproduct_float_c(hf, hf + 1, AMR_TILT_RESPONSE - 1);
+    rh0 = p->celpm_ctx.dot_productf(hf, hf,     AMR_TILT_RESPONSE);
+    rh1 = p->celpm_ctx.dot_productf(hf, hf + 1, AMR_TILT_RESPONSE - 1);
 
     // The spec only specifies this check for 12.2 and 10.2 kbit/s
     // modes. But in the ref source the tilt is always non-negative.
@@ -888,7 +906,7 @@ static void postfilter(AMRContext *p, float *lpc, float *buf_out)
     int i;
     float *samples          = p->samples_in + LP_FILTER_ORDER; // Start of input
 
-    float speech_gain       = avpriv_scalarproduct_float_c(samples, samples,
+    float speech_gain       = p->celpm_ctx.dot_productf(samples, samples,
                                                            AMR_SUBFRAME_SIZE);
 
     float pole_out[AMR_SUBFRAME_SIZE + LP_FILTER_ORDER];  // Output of pole filter
@@ -909,16 +927,16 @@ static void postfilter(AMRContext *p, float *lpc, float *buf_out)
     }
 
     memcpy(pole_out, p->postfilter_mem, sizeof(float) * LP_FILTER_ORDER);
-    ff_celp_lp_synthesis_filterf(pole_out + LP_FILTER_ORDER, lpc_d, samples,
+    p->celpf_ctx.celp_lp_synthesis_filterf(pole_out + LP_FILTER_ORDER, lpc_d, samples,
                                  AMR_SUBFRAME_SIZE, LP_FILTER_ORDER);
     memcpy(p->postfilter_mem, pole_out + AMR_SUBFRAME_SIZE,
            sizeof(float) * LP_FILTER_ORDER);
 
-    ff_celp_lp_zero_synthesis_filterf(buf_out, lpc_n,
+    p->celpf_ctx.celp_lp_zero_synthesis_filterf(buf_out, lpc_n,
                                       pole_out + LP_FILTER_ORDER,
                                       AMR_SUBFRAME_SIZE, LP_FILTER_ORDER);
 
-    ff_tilt_compensation(&p->tilt_mem, tilt_factor(lpc_n, lpc_d), buf_out,
+    ff_tilt_compensation(&p->tilt_mem, tilt_factor(p, lpc_n, lpc_d), buf_out,
                          AMR_SUBFRAME_SIZE);
 
     ff_adaptive_gain_control(buf_out, buf_out, speech_gain, AMR_SUBFRAME_SIZE,
@@ -945,10 +963,8 @@ static int amrnb_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = AMR_BLOCK_SIZE;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     buf_out = (float *)frame->data[0];
 
     p->cur_frame_mode = unpack_bitstream(p, buf, buf_size);
@@ -957,7 +973,8 @@ static int amrnb_decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR_INVALIDDATA;
     }
     if (p->cur_frame_mode == MODE_DTX) {
-        avpriv_request_sample(avctx, "dtx mode");
+        avpriv_report_missing_feature(avctx, "dtx mode");
+        av_log(avctx, AV_LOG_INFO, "Note: libopencore_amrnb supports dtx\n");
         return AVERROR_PATCHWELCOME;
     }
 
@@ -995,7 +1012,7 @@ static int amrnb_decode_frame(AVCodecContext *avctx, void *data,
 
         p->fixed_gain[4] =
             ff_amr_set_fixed_gain(fixed_gain_factor,
-                                  avpriv_scalarproduct_float_c(p->fixed_vector,
+                       p->celpm_ctx.dot_productf(p->fixed_vector,
                                                                p->fixed_vector,
                                                                AMR_SUBFRAME_SIZE) /
                                   AMR_SUBFRAME_SIZE,
@@ -1041,7 +1058,8 @@ static int amrnb_decode_frame(AVCodecContext *avctx, void *data,
         update_state(p);
     }
 
-    ff_acelp_apply_order_2_transfer_function(buf_out, buf_out, highpass_zeros,
+    p->acelpf_ctx.acelp_apply_order_2_transfer_function(buf_out,
+                                             buf_out, highpass_zeros,
                                              highpass_poles,
                                              highpass_gain * AMR_SAMPLE_SCALE,
                                              p->high_pass_mem, AMR_BLOCK_SIZE);
@@ -1052,7 +1070,7 @@ static int amrnb_decode_frame(AVCodecContext *avctx, void *data,
      * for fixed_gain_smooth.
      * The specification has an incorrect formula: the reference decoder uses
      * qbar(n-1) rather than qbar(n) in section 6.1(4) equation 71. */
-    ff_weighted_vector_sumf(p->lsf_avg, p->lsf_avg, p->lsf_q[3],
+    p->acelpv_ctx.weighted_vector_sumf(p->lsf_avg, p->lsf_avg, p->lsf_q[3],
                             0.84, 0.16, LP_FILTER_ORDER);
 
     *got_frame_ptr = 1;
diff --git a/libavcodec/amrwbdata.h b/libavcodec/amrwbdata.h
index c0078b3..e0152a6 100644
--- a/libavcodec/amrwbdata.h
+++ b/libavcodec/amrwbdata.h
@@ -2,20 +2,20 @@
  * AMR wideband data and definitions
  * Copyright (c) 2010 Marcelo Galvao Povoa
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -1805,7 +1805,7 @@ static const float ir_filter_mid[64] = {
     -7.501221e-02,  2.920532e-02,  1.660156e-02,  7.751465e-02
 };
 
-static const float *ir_filters_lookup[2] = {
+static const float * const ir_filters_lookup[2] = {
     ir_filter_str, ir_filter_mid
 };
 
diff --git a/libavcodec/amrwbdec.c b/libavcodec/amrwbdec.c
index f1fbcc0..7d0c135 100644
--- a/libavcodec/amrwbdec.c
+++ b/libavcodec/amrwbdec.c
@@ -2,20 +2,20 @@
  * AMR wideband decoder
  * Copyright (c) 2010 Marcelo Galvao Povoa
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,7 @@
 #include "avcodec.h"
 #include "lsp.h"
 #include "celp_filters.h"
+#include "celp_math.h"
 #include "acelp_filters.h"
 #include "acelp_vectors.h"
 #include "acelp_pitch_delay.h"
@@ -41,6 +42,7 @@
 #include "amr.h"
 
 #include "amrwbdata.h"
+#include "mips/amrwbdec_mips.h"
 
 typedef struct AMRWBContext {
     AMRWBFrame                             frame; ///< AMRWB parameters decoded from bitstream
@@ -84,6 +86,11 @@ typedef struct AMRWBContext {
 
     AVLFG                                   prng; ///< random number generator for white noise excitation
     uint8_t                          first_frame; ///< flag active during decoding of the first frame
+    ACELPFContext                     acelpf_ctx; ///< context for filters for ACELP-based codecs
+    ACELPVContext                     acelpv_ctx; ///< context for vector operations for ACELP-based codecs
+    CELPFContext                       celpf_ctx; ///< context for filters for CELP-based codecs
+    CELPMContext                       celpm_ctx; ///< context for fixed point math operations
+
 } AMRWBContext;
 
 static av_cold int amrwb_decode_init(AVCodecContext *avctx)
@@ -98,7 +105,8 @@ static av_cold int amrwb_decode_init(AVCodecContext *avctx)
 
     avctx->channels       = 1;
     avctx->channel_layout = AV_CH_LAYOUT_MONO;
-    avctx->sample_rate    = 16000;
+    if (!avctx->sample_rate)
+        avctx->sample_rate = 16000;
     avctx->sample_fmt     = AV_SAMPLE_FMT_FLT;
 
     av_lfg_init(&ctx->prng, 1);
@@ -112,6 +120,11 @@ static av_cold int amrwb_decode_init(AVCodecContext *avctx)
     for (i = 0; i < 4; i++)
         ctx->prediction_error[i] = MIN_ENERGY;
 
+    ff_acelp_filter_init(&ctx->acelpf_ctx);
+    ff_acelp_vectors_init(&ctx->acelpv_ctx);
+    ff_celp_filter_init(&ctx->celpf_ctx);
+    ff_celp_math_init(&ctx->celpm_ctx);
+
     return 0;
 }
 
@@ -323,7 +336,8 @@ static void decode_pitch_vector(AMRWBContext *ctx,
 
     /* Calculate the pitch vector by interpolating the past excitation at the
        pitch lag using a hamming windowed sinc function */
-    ff_acelp_interpolatef(exc, exc + 1 - pitch_lag_int,
+    ctx->acelpf_ctx.acelp_interpolatef(exc,
+                          exc + 1 - pitch_lag_int,
                           ac_inter, 4,
                           pitch_lag_frac + (pitch_lag_frac > 0 ? 0 : 4),
                           LP_ORDER, AMRWB_SFR_SIZE + 1);
@@ -341,7 +355,7 @@ static void decode_pitch_vector(AMRWBContext *ctx,
 }
 
 /** Get x bits in the index interval [lsb,lsb+len-1] inclusive */
-#define BIT_STR(x,lsb,len) (((x) >> (lsb)) & ((1 << (len)) - 1))
+#define BIT_STR(x,lsb,len) av_mod_uintp2((x) >> (lsb), (len))
 
 /** Get the bit at specified position */
 #define BIT_POS(x, p) (((x) >> (p)) & 1)
@@ -582,16 +596,18 @@ static void pitch_sharpening(AMRWBContext *ctx, float *fixed_vector)
  *
  * @param[in] p_vector, f_vector   Pitch and fixed excitation vectors
  * @param[in] p_gain, f_gain       Pitch and fixed gains
+ * @param[in] ctx                  The context
  */
 // XXX: There is something wrong with the precision here! The magnitudes
 // of the energies are not correct. Please check the reference code carefully
 static float voice_factor(float *p_vector, float p_gain,
-                          float *f_vector, float f_gain)
+                          float *f_vector, float f_gain,
+                          CELPMContext *ctx)
 {
-    double p_ener = (double) avpriv_scalarproduct_float_c(p_vector, p_vector,
+    double p_ener = (double) ctx->dot_productf(p_vector, p_vector,
                                                           AMRWB_SFR_SIZE) *
                     p_gain * p_gain;
-    double f_ener = (double) avpriv_scalarproduct_float_c(f_vector, f_vector,
+    double f_ener = (double) ctx->dot_productf(f_vector, f_vector,
                                                           AMRWB_SFR_SIZE) *
                     f_gain * f_gain;
 
@@ -755,13 +771,13 @@ static void synthesis(AMRWBContext *ctx, float *lpc, float *excitation,
                       float fixed_gain, const float *fixed_vector,
                       float *samples)
 {
-    ff_weighted_vector_sumf(excitation, ctx->pitch_vector, fixed_vector,
+    ctx->acelpv_ctx.weighted_vector_sumf(excitation, ctx->pitch_vector, fixed_vector,
                             ctx->pitch_gain[0], fixed_gain, AMRWB_SFR_SIZE);
 
     /* emphasize pitch vector contribution in low bitrate modes */
     if (ctx->pitch_gain[0] > 0.5 && ctx->fr_cur_mode <= MODE_8k85) {
         int i;
-        float energy = avpriv_scalarproduct_float_c(excitation, excitation,
+        float energy = ctx->celpm_ctx.dot_productf(excitation, excitation,
                                                     AMRWB_SFR_SIZE);
 
         // XXX: Weird part in both ref code and spec. A unknown parameter
@@ -775,7 +791,7 @@ static void synthesis(AMRWBContext *ctx, float *lpc, float *excitation,
                                                 energy, AMRWB_SFR_SIZE);
     }
 
-    ff_celp_lp_synthesis_filterf(samples, lpc, excitation,
+    ctx->celpf_ctx.celp_lp_synthesis_filterf(samples, lpc, excitation,
                                  AMRWB_SFR_SIZE, LP_ORDER);
 }
 
@@ -807,8 +823,9 @@ static void de_emphasis(float *out, float *in, float m, float mem[1])
  * @param[out] out                 Buffer for interpolated signal
  * @param[in]  in                  Current signal data (length 0.8*o_size)
  * @param[in]  o_size              Output signal length
+ * @param[in] ctx                  The context
  */
-static void upsample_5_4(float *out, const float *in, int o_size)
+static void upsample_5_4(float *out, const float *in, int o_size, CELPMContext *ctx)
 {
     const float *in0 = in - UPS_FIR_SIZE + 1;
     int i, j, k;
@@ -821,7 +838,7 @@ static void upsample_5_4(float *out, const float *in, int o_size)
         i++;
 
         for (k = 1; k < 5; k++) {
-            out[i] = avpriv_scalarproduct_float_c(in0 + int_part,
+            out[i] = ctx->dot_productf(in0 + int_part,
                                                   upsample_fir[4 - frac_part],
                                                   UPS_MEM_SIZE);
             int_part++;
@@ -849,8 +866,8 @@ static float find_hb_gain(AMRWBContext *ctx, const float *synth,
     if (ctx->fr_cur_mode == MODE_23k85)
         return qua_hb_gain[hb_idx] * (1.0f / (1 << 14));
 
-    tilt = avpriv_scalarproduct_float_c(synth, synth + 1, AMRWB_SFR_SIZE - 1) /
-           avpriv_scalarproduct_float_c(synth, synth, AMRWB_SFR_SIZE);
+    tilt = ctx->celpm_ctx.dot_productf(synth, synth + 1, AMRWB_SFR_SIZE - 1) /
+           ctx->celpm_ctx.dot_productf(synth, synth, AMRWB_SFR_SIZE);
 
     /* return gain bounded by [0.1, 1.0] */
     return av_clipf((1.0 - FFMAX(0.0, tilt)) * (1.25 - 0.25 * wsp), 0.1, 1.0);
@@ -869,7 +886,7 @@ static void scaled_hb_excitation(AMRWBContext *ctx, float *hb_exc,
                                  const float *synth_exc, float hb_gain)
 {
     int i;
-    float energy = avpriv_scalarproduct_float_c(synth_exc, synth_exc,
+    float energy = ctx->celpm_ctx.dot_productf(synth_exc, synth_exc,
                                                 AMRWB_SFR_SIZE);
 
     /* Generate a white-noise excitation */
@@ -1000,7 +1017,7 @@ static void hb_synthesis(AMRWBContext *ctx, int subframe, float *samples,
         float e_isf[LP_ORDER_16k]; // ISF vector for extrapolation
         double e_isp[LP_ORDER_16k];
 
-        ff_weighted_vector_sumf(e_isf, isf_past, isf, isfp_inter[subframe],
+        ctx->acelpv_ctx.weighted_vector_sumf(e_isf, isf_past, isf, isfp_inter[subframe],
                                 1.0 - isfp_inter[subframe], LP_ORDER);
 
         extrapolate_isf(e_isf);
@@ -1014,7 +1031,7 @@ static void hb_synthesis(AMRWBContext *ctx, int subframe, float *samples,
         lpc_weighting(hb_lpc, ctx->lp_coef[subframe], 0.6, LP_ORDER);
     }
 
-    ff_celp_lp_synthesis_filterf(samples, hb_lpc, exc, AMRWB_SFR_SIZE_16k,
+    ctx->celpf_ctx.celp_lp_synthesis_filterf(samples, hb_lpc, exc, AMRWB_SFR_SIZE_16k,
                                  (mode == MODE_6k60) ? LP_ORDER_16k : LP_ORDER);
 }
 
@@ -1029,6 +1046,8 @@ static void hb_synthesis(AMRWBContext *ctx, int subframe, float *samples,
  *
  * @remark It is safe to pass the same array in in and out parameters
  */
+
+#ifndef hb_fir_filter
 static void hb_fir_filter(float *out, const float fir_coef[HB_FIR_SIZE + 1],
                           float mem[HB_FIR_SIZE], const float *in)
 {
@@ -1046,6 +1065,7 @@ static void hb_fir_filter(float *out, const float fir_coef[HB_FIR_SIZE + 1],
 
     memcpy(mem, data + AMRWB_SFR_SIZE_16k, HB_FIR_SIZE * sizeof(float));
 }
+#endif /* hb_fir_filter */
 
 /**
  * Update context state before the next subframe.
@@ -1089,10 +1109,8 @@ static int amrwb_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = 4 * AMRWB_SFR_SIZE_16k;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     buf_out = (float *)frame->data[0];
 
     header_size      = decode_mime_header(ctx, buf);
@@ -1163,7 +1181,7 @@ static int amrwb_decode_frame(AVCodecContext *avctx, void *data,
 
         ctx->fixed_gain[0] =
             ff_amr_set_fixed_gain(fixed_gain_factor,
-                                  avpriv_scalarproduct_float_c(ctx->fixed_vector,
+                                  ctx->celpm_ctx.dot_productf(ctx->fixed_vector,
                                                                ctx->fixed_vector,
                                                                AMRWB_SFR_SIZE) /
                                   AMRWB_SFR_SIZE,
@@ -1172,7 +1190,8 @@ static int amrwb_decode_frame(AVCodecContext *avctx, void *data,
 
         /* Calculate voice factor and store tilt for next subframe */
         voice_fac      = voice_factor(ctx->pitch_vector, ctx->pitch_gain[0],
-                                      ctx->fixed_vector, ctx->fixed_gain[0]);
+                                      ctx->fixed_vector, ctx->fixed_gain[0],
+                                      &ctx->celpm_ctx);
         ctx->tilt_coef = voice_fac * 0.25 + 0.25;
 
         /* Construct current excitation */
@@ -1198,15 +1217,15 @@ static int amrwb_decode_frame(AVCodecContext *avctx, void *data,
         de_emphasis(&ctx->samples_up[UPS_MEM_SIZE],
                     &ctx->samples_az[LP_ORDER], PREEMPH_FAC, ctx->demph_mem);
 
-        ff_acelp_apply_order_2_transfer_function(&ctx->samples_up[UPS_MEM_SIZE],
+        ctx->acelpf_ctx.acelp_apply_order_2_transfer_function(&ctx->samples_up[UPS_MEM_SIZE],
             &ctx->samples_up[UPS_MEM_SIZE], hpf_zeros, hpf_31_poles,
             hpf_31_gain, ctx->hpf_31_mem, AMRWB_SFR_SIZE);
 
         upsample_5_4(sub_buf, &ctx->samples_up[UPS_FIR_SIZE],
-                     AMRWB_SFR_SIZE_16k);
+                     AMRWB_SFR_SIZE_16k, &ctx->celpm_ctx);
 
         /* High frequency band (6.4 - 7.0 kHz) generation part */
-        ff_acelp_apply_order_2_transfer_function(hb_samples,
+        ctx->acelpf_ctx.acelp_apply_order_2_transfer_function(hb_samples,
             &ctx->samples_up[UPS_MEM_SIZE], hpf_zeros, hpf_400_poles,
             hpf_400_gain, ctx->hpf_400_mem, AMRWB_SFR_SIZE);
 
diff --git a/libavcodec/anm.c b/libavcodec/anm.c
index b70d220..29d59fb 100644
--- a/libavcodec/anm.c
+++ b/libavcodec/anm.c
@@ -2,20 +2,20 @@
  * Deluxe Paint Animation decoder
  * Copyright (c) 2009 Peter Ross
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,8 +47,10 @@ static av_cold int decode_init(AVCodecContext *avctx)
         return AVERROR(ENOMEM);
 
     bytestream2_init(&s->gb, avctx->extradata, avctx->extradata_size);
-    if (bytestream2_get_bytes_left(&s->gb) < 16 * 8 + 4 * 256)
+    if (bytestream2_get_bytes_left(&s->gb) < 16 * 8 + 4 * 256) {
+        av_frame_free(&s->frame);
         return AVERROR_INVALIDDATA;
+    }
 
     bytestream2_skipu(&s->gb, 16 * 8);
     for (i = 0; i < 256; i++)
@@ -117,10 +119,8 @@ static int decode_frame(AVCodecContext *avctx,
     uint8_t *dst, *dst_end;
     int count, ret;
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0){
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
     dst     = s->frame->data[0];
     dst_end = s->frame->data[0] + s->frame->linesize[0]*avctx->height;
 
@@ -128,11 +128,11 @@ static int decode_frame(AVCodecContext *avctx,
 
     if (bytestream2_get_byte(&s->gb) != 0x42) {
         avpriv_request_sample(avctx, "Unknown record type");
-        return buf_size;
+        return AVERROR_INVALIDDATA;
     }
     if (bytestream2_get_byte(&s->gb)) {
         avpriv_request_sample(avctx, "Padding bytes");
-        return buf_size;
+        return AVERROR_PATCHWELCOME;
     }
     bytestream2_skip(&s->gb, 2);
 
diff --git a/libavcodec/ansi.c b/libavcodec/ansi.c
index 65e2e16..4808ea7 100644
--- a/libavcodec/ansi.c
+++ b/libavcodec/ansi.c
@@ -2,20 +2,20 @@
  * ASCII/ANSI art decoder
  * Copyright (c) 2010 Peter Ross <pross@xvid.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
 #include "libavutil/common.h"
 #include "libavutil/frame.h"
 #include "libavutil/lfg.h"
+#include "libavutil/xga_font_data.h"
 #include "avcodec.h"
 #include "cga_data.h"
 #include "internal.h"
@@ -60,6 +61,7 @@ typedef struct AnsiContext {
     int attributes;       /**< attribute flags */
     int fg;               /**< foreground color */
     int bg;               /**< background color */
+    int first_frame;
 
     /* ansi parser state machine */
     enum {
@@ -83,7 +85,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
         return AVERROR(ENOMEM);
 
     /* defaults */
-    s->font        = ff_vga16_font;
+    s->font        = avpriv_vga16_font;
     s->font_height = 16;
     s->fg          = DEFAULT_FG_COLOR;
     s->bg          = DEFAULT_BG_COLOR;
@@ -96,12 +98,27 @@ static av_cold int decode_init(AVCodecContext *avctx)
     return 0;
 }
 
+static void set_palette(uint32_t *pal) /* Fill pal with 256 entries: 16 copied from ff_cga_palette, a 6x6x6 colour cube, then 24 grays. */
+{
+    int r, g, b;
+    memcpy(pal, ff_cga_palette, 16 * 4); /* entries 0..15: 16 four-byte values taken from ff_cga_palette */
+    pal += 16; /* continue writing at entry 16 */
+#define COLOR(x) ((x) * 40 + 55) /* cube level 0..5 -> 55, 95, ..., 255 */
+    for (r = 0; r < 6; r++)
+        for (g = 0; g < 6; g++)
+            for (b = 0; b < 6; b++) /* 6*6*6 = 216 entries, b varying fastest */
+                *pal++ = 0xFF000000 | (COLOR(r) << 16) | (COLOR(g) << 8) | COLOR(b);
+#define GRAY(x) ((x) * 10 + 8) /* gray step 0..23 -> 8, 18, ..., 238 */
+    for (g = 0; g < 24; g++) /* final 24 entries: the same value in each of the three low bytes */
+        *pal++ = 0xFF000000 | (GRAY(g) << 16) | (GRAY(g) << 8) | GRAY(g);
+}
+
 static void hscroll(AVCodecContext *avctx)
 {
     AnsiContext *s = avctx->priv_data;
     int i;
 
-    if (s->y < avctx->height - s->font_height) {
+    if (s->y <= avctx->height - 2*s->font_height) {
         s->y += s->font_height;
         return;
     }
@@ -154,7 +171,7 @@ static void draw_char(AVCodecContext *avctx, int c)
     ff_draw_pc_font(s->frame->data[0] + s->y * s->frame->linesize[0] + s->x,
                     s->frame->linesize[0], s->font, s->font_height, c, fg, bg);
     s->x += FONT_WIDTH;
-    if (s->x >= avctx->width) {
+    if (s->x > avctx->width - FONT_WIDTH) {
         s->x = 0;
         hscroll(avctx);
     }
@@ -168,8 +185,8 @@ static int execute_code(AVCodecContext * avctx, int c)
 {
     AnsiContext *s = avctx->priv_data;
     int ret, i;
-    int width = 0;
-    int height = 0;
+    int width  = avctx->width;
+    int height = avctx->height;
 
     switch(c) {
     case 'A': //Cursor Up
@@ -195,19 +212,19 @@ static int execute_code(AVCodecContext * avctx, int c)
             s->args[0] = DEFAULT_SCREEN_MODE;
         switch(s->args[0]) {
         case 0: case 1: case 4: case 5: case 13: case 19: //320x200 (25 rows)
-            s->font = ff_cga_font;
+            s->font = avpriv_cga_font;
             s->font_height = 8;
             width  = 40<<3;
             height = 25<<3;
             break;
         case 2: case 3: //640x400 (25 rows)
-            s->font = ff_vga16_font;
+            s->font = avpriv_vga16_font;
             s->font_height = 16;
             width  = 80<<3;
             height = 25<<4;
             break;
         case 6: case 14: //640x200 (25 rows)
-            s->font = ff_cga_font;
+            s->font = avpriv_cga_font;
             s->font_height = 8;
             width  = 80<<3;
             height = 25<<3;
@@ -215,13 +232,13 @@ static int execute_code(AVCodecContext * avctx, int c)
         case 7: //set line wrapping
             break;
         case 15: case 16: //640x350 (43 rows)
-            s->font = ff_cga_font;
+            s->font = avpriv_cga_font;
             s->font_height = 8;
             width  = 80<<3;
             height = 43<<3;
             break;
         case 17: case 18: //640x480 (60 rows)
-            s->font = ff_cga_font;
+            s->font = avpriv_cga_font;
             s->font_height = 8;
             width  = 80<<3;
             height = 60<<4;
@@ -229,20 +246,19 @@ static int execute_code(AVCodecContext * avctx, int c)
         default:
             avpriv_request_sample(avctx, "Unsupported screen mode");
         }
-        if (width != 0 && height != 0 &&
-            (width != avctx->width || height != avctx->height)) {
+        s->x = av_clip(s->x, 0, width  - FONT_WIDTH);
+        s->y = av_clip(s->y, 0, height - s->font_height);
+        if (width != avctx->width || height != avctx->height) {
             av_frame_unref(s->frame);
             ret = ff_set_dimensions(avctx, width, height);
             if (ret < 0)
                 return ret;
-            ret = ff_get_buffer(avctx, s->frame, AV_GET_BUFFER_FLAG_REF);
-            if (ret < 0) {
-                av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+            if ((ret = ff_get_buffer(avctx, s->frame,
+                                     AV_GET_BUFFER_FLAG_REF)) < 0)
                 return ret;
-            }
             s->frame->pict_type           = AV_PICTURE_TYPE_I;
             s->frame->palette_has_changed = 1;
-            memcpy(s->frame->data[1], ff_cga_palette, 16 * 4);
+            set_palette((uint32_t *)s->frame->data[1]);
             erase_screen(avctx);
         } else if (c == 'l') {
             erase_screen(avctx);
@@ -290,12 +306,20 @@ static int execute_code(AVCodecContext * avctx, int c)
                 s->bg = DEFAULT_BG_COLOR;
             } else if (m == 1 || m == 2 || m == 4 || m == 5 || m == 7 || m == 8) {
                 s->attributes |= 1 << (m - 1);
-            } else if (m >= 30 && m <= 38) {
+            } else if (m >= 30 && m <= 37) {
                 s->fg = ansi_to_cga[m - 30];
+            } else if (m == 38 && i + 2 < FFMIN(s->nb_args, MAX_NB_ARGS) && s->args[i + 1] == 5 && s->args[i + 2] < 256) {
+                int index = s->args[i + 2];
+                s->fg = index < 16 ? ansi_to_cga[index] : index;
+                i += 2;
             } else if (m == 39) {
                 s->fg = ansi_to_cga[DEFAULT_FG_COLOR];
             } else if (m >= 40 && m <= 47) {
                 s->bg = ansi_to_cga[m - 40];
+            } else if (m == 48 && i + 2 < FFMIN(s->nb_args, MAX_NB_ARGS) && s->args[i + 1] == 5 && s->args[i + 2] < 256) {
+                int index = s->args[i + 2];
+                s->bg = index < 16 ? ansi_to_cga[index] : index;
+                i += 2;
             } else if (m == 49) {
                 s->fg = ansi_to_cga[DEFAULT_BG_COLOR];
             } else {
@@ -319,6 +343,8 @@ static int execute_code(AVCodecContext * avctx, int c)
         avpriv_request_sample(avctx, "Unknown escape code");
         break;
     }
+    s->x = av_clip(s->x, 0, avctx->width  - FONT_WIDTH);
+    s->y = av_clip(s->y, 0, avctx->height - s->font_height);
     return 0;
 }
 
@@ -332,19 +358,21 @@ static int decode_frame(AVCodecContext *avctx,
     const uint8_t *buf_end   = buf+buf_size;
     int ret, i, count;
 
-    ret = ff_reget_buffer(avctx, s->frame);
-    if (ret < 0){
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
     if (!avctx->frame_number) {
-        memset(s->frame->data[0], 0, avctx->height * FFABS(s->frame->linesize[0]));
+        for (i=0; i<avctx->height; i++)
+            memset(s->frame->data[0]+ i*s->frame->linesize[0], 0, avctx->width);
         memset(s->frame->data[1], 0, AVPALETTE_SIZE);
     }
 
     s->frame->pict_type           = AV_PICTURE_TYPE_I;
     s->frame->palette_has_changed = 1;
-    memcpy(s->frame->data[1], ff_cga_palette, 16 * 4);
+    set_palette((uint32_t *)s->frame->data[1]);
+    if (!s->first_frame) {
+        erase_screen(avctx);
+        s->first_frame = 1;
+    }
 
     while(buf < buf_end) {
         switch(s->state) {
@@ -383,7 +411,7 @@ static int decode_frame(AVCodecContext *avctx,
             if (buf[0] == '[') {
                 s->state   = STATE_CODE;
                 s->nb_args = 0;
-                s->args[0] = 0;
+                s->args[0] = -1;
             } else {
                 s->state = STATE_NORMAL;
                 draw_char(avctx, 0x1B);
@@ -394,8 +422,8 @@ static int decode_frame(AVCodecContext *avctx,
             switch(buf[0]) {
             case '0': case '1': case '2': case '3': case '4':
             case '5': case '6': case '7': case '8': case '9':
-                if (s->nb_args < MAX_NB_ARGS)
-                    s->args[s->nb_args] = s->args[s->nb_args] * 10 + buf[0] - '0';
+                if (s->nb_args < MAX_NB_ARGS && s->args[s->nb_args] < 6553)
+                    s->args[s->nb_args] = FFMAX(s->args[s->nb_args], 0) * 10 + buf[0] - '0';
                 break;
             case ';':
                 s->nb_args++;
@@ -411,7 +439,7 @@ static int decode_frame(AVCodecContext *avctx,
             default:
                 if (s->nb_args > MAX_NB_ARGS)
                     av_log(avctx, AV_LOG_WARNING, "args overflow (%i)\n", s->nb_args);
-                if (s->nb_args < MAX_NB_ARGS && s->args[s->nb_args])
+                if (s->nb_args < MAX_NB_ARGS && s->args[s->nb_args] >= 0)
                     s->nb_args++;
                 if ((ret = execute_code(avctx, buf[0])) < 0)
                     return ret;
diff --git a/libavcodec/apedec.c b/libavcodec/apedec.c
index 2f64488..b99598b 100644
--- a/libavcodec/apedec.c
+++ b/libavcodec/apedec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Benjamin Zores <ben@geexbox.org>
  *  based upon libdemac from Dave Chapman.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,7 +25,7 @@
 #include "libavutil/avassert.h"
 #include "libavutil/channel_layout.h"
 #include "libavutil/opt.h"
-#include "apedsp.h"
+#include "lossless_audiodsp.h"
 #include "avcodec.h"
 #include "bswapdsp.h"
 #include "bytestream.h"
@@ -137,7 +137,7 @@ typedef struct APEContext {
     AVClass *class;                          ///< class for AVOptions
     AVCodecContext *avctx;
     BswapDSPContext bdsp;
-    APEDSPContext adsp;
+    LLAudDSPContext adsp;
     int channels;
     int samples;                             ///< samples left to decode in current frame
     int bps;
@@ -212,19 +212,6 @@ static av_cold int ape_decode_close(AVCodecContext *avctx)
     return 0;
 }
 
-static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
-                                              const int16_t *v3,
-                                              int order, int mul)
-{
-    int res = 0;
-
-    while (order--) {
-        res   += *v1 * *v2++;
-        *v1++ += mul * *v3++;
-    }
-    return res;
-}
-
 static av_cold int ape_decode_init(AVCodecContext *avctx)
 {
     APEContext *s = avctx->priv_data;
@@ -260,9 +247,10 @@ static av_cold int ape_decode_init(AVCodecContext *avctx)
     s->compression_level = AV_RL16(avctx->extradata + 2);
     s->flags             = AV_RL16(avctx->extradata + 4);
 
-    av_log(avctx, AV_LOG_DEBUG, "Compression Level: %d - Flags: %d\n",
+    av_log(avctx, AV_LOG_VERBOSE, "Compression Level: %d - Flags: %d\n",
            s->compression_level, s->flags);
     if (s->compression_level % 1000 || s->compression_level > COMPRESSION_LEVEL_INSANE ||
+        !s->compression_level ||
         (s->fileversion < 3930 && s->compression_level == COMPRESSION_LEVEL_INSANE)) {
         av_log(avctx, AV_LOG_ERROR, "Incorrect compression level %d\n",
                s->compression_level);
@@ -305,16 +293,8 @@ static av_cold int ape_decode_init(AVCodecContext *avctx)
         s->predictor_decode_stereo = predictor_decode_stereo_3950;
     }
 
-    s->adsp.scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
-
-    if (ARCH_ARM)
-        ff_apedsp_init_arm(&s->adsp);
-    if (ARCH_PPC)
-        ff_apedsp_init_ppc(&s->adsp);
-    if (ARCH_X86)
-        ff_apedsp_init_x86(&s->adsp);
-
     ff_bswapdsp_init(&s->bdsp);
+    ff_llauddsp_init(&s->adsp);
     avctx->channel_layout = (avctx->channels==2) ? AV_CH_LAYOUT_STEREO : AV_CH_LAYOUT_MONO;
 
     return 0;
@@ -512,9 +492,12 @@ static inline int ape_decode_value_3860(APEContext *ctx, GetBitContext *gb,
 
     if (!rice->k)
         x = overflow;
-    else
+    else if(rice->k <= MIN_CACHE_BITS) {
         x = (overflow << rice->k) + get_bits(gb, rice->k);
-
+    } else {
+        av_log(ctx->avctx, AV_LOG_ERROR, "Too many bits: %d\n", rice->k);
+        return AVERROR_INVALIDDATA;
+    }
     rice->ksum += x - (rice->ksum + 8 >> 4);
     if (rice->ksum < (rice->k ? 1 << (rice->k + 4) : 0))
         rice->k--;
@@ -522,10 +505,7 @@ static inline int ape_decode_value_3860(APEContext *ctx, GetBitContext *gb,
         rice->k++;
 
     /* Convert to signed */
-    if (x & 1)
-        return (x >> 1) + 1;
-    else
-        return -(x >> 1);
+    return ((x >> 1) ^ ((x & 1) - 1)) + 1;
 }
 
 static inline int ape_decode_value_3900(APEContext *ctx, APERice *rice)
@@ -541,9 +521,13 @@ static inline int ape_decode_value_3900(APEContext *ctx, APERice *rice)
     } else
         tmpk = (rice->k < 1) ? 0 : rice->k - 1;
 
-    if (tmpk <= 16 || ctx->fileversion < 3910)
+    if (tmpk <= 16 || ctx->fileversion < 3910) {
+        if (tmpk > 23) {
+            av_log(ctx->avctx, AV_LOG_ERROR, "Too many bits: %d\n", tmpk);
+            return AVERROR_INVALIDDATA;
+        }
         x = range_decode_bits(ctx, tmpk);
-    else if (tmpk <= 32) {
+    } else if (tmpk <= 31) {
         x = range_decode_bits(ctx, 16);
         x |= (range_decode_bits(ctx, tmpk - 16) << 16);
     } else {
@@ -555,10 +539,7 @@ static inline int ape_decode_value_3900(APEContext *ctx, APERice *rice)
     update_rice(rice, x);
 
     /* Convert to signed */
-    if (x & 1)
-        return (x >> 1) + 1;
-    else
-        return -(x >> 1);
+    return ((x >> 1) ^ ((x & 1) - 1)) + 1;
 }
 
 static inline int ape_decode_value_3990(APEContext *ctx, APERice *rice)
@@ -601,10 +582,7 @@ static inline int ape_decode_value_3990(APEContext *ctx, APERice *rice)
     update_rice(rice, x);
 
     /* Convert to signed */
-    if (x & 1)
-        return (x >> 1) + 1;
-    else
-        return -(x >> 1);
+    return ((x >> 1) ^ ((x & 1) - 1)) + 1;
 }
 
 static void decode_array_0000(APEContext *ctx, GetBitContext *gb,
@@ -619,10 +597,14 @@ static void decode_array_0000(APEContext *ctx, GetBitContext *gb,
         rice->ksum += out[i];
     }
     rice->k = av_log2(rice->ksum / 10) + 1;
+    if (rice->k >= 24)
+        return;
     for (; i < FFMIN(blockstodecode, 64); i++) {
         out[i] = get_rice_ook(&ctx->gb, rice->k);
         rice->ksum += out[i];
         rice->k = av_log2(rice->ksum / ((i + 1) * 2)) + 1;
+        if (rice->k >= 24)
+            return;
     }
     ksummax = 1 << rice->k + 7;
     ksummin = rice->k ? (1 << rice->k + 6) : 0;
@@ -643,12 +625,8 @@ static void decode_array_0000(APEContext *ctx, GetBitContext *gb,
         }
     }
 
-    for (i = 0; i < blockstodecode; i++) {
-        if (out[i] & 1)
-            out[i] = (out[i] >> 1) + 1;
-        else
-            out[i] = -(out[i] >> 1);
-    }
+    for (i = 0; i < blockstodecode; i++)
+        out[i] = ((out[i] >> 1) ^ ((out[i] & 1) - 1)) + 1;
 }
 
 static void entropy_decode_mono_0000(APEContext *ctx, int blockstodecode)
@@ -908,11 +886,14 @@ static av_always_inline int filter_3800(APEPredictor *p,
     return p->filterA[filter];
 }
 
-static void long_filter_high_3800(int32_t *buffer, int order, int shift,
-                                  int32_t *coeffs, int32_t *delay, int length)
+static void long_filter_high_3800(int32_t *buffer, int order, int shift, int length)
 {
     int i, j;
     int32_t dotprod, sign;
+    int32_t coeffs[256], delay[256];
+
+    if (order >= length)
+        return;
 
     memset(coeffs, 0, order * sizeof(*coeffs));
     for (i = 0; i < order; i++)
@@ -922,7 +903,7 @@ static void long_filter_high_3800(int32_t *buffer, int order, int shift,
         sign = APESIGN(buffer[i]);
         for (j = 0; j < order; j++) {
             dotprod += delay[j] * coeffs[j];
-            coeffs[j] -= (((delay[j] >> 30) & 2) - 1) * sign;
+            coeffs[j] += ((delay[j] >> 31) | 1) * sign;
         }
         buffer[i] -= dotprod >> shift;
         for (j = 0; j < order - 1; j++)
@@ -942,7 +923,7 @@ static void long_filter_ehigh_3830(int32_t *buffer, int length)
         sign = APESIGN(buffer[i]);
         for (j = 7; j >= 0; j--) {
             dotprod += delay[j] * coeffs[j];
-            coeffs[j] -= (((delay[j] >> 30) & 2) - 1) * sign;
+            coeffs[j] += ((delay[j] >> 31) | 1) * sign;
         }
         for (j = 7; j > 0; j--)
             delay[j] = delay[j - 1];
@@ -956,13 +937,12 @@ static void predictor_decode_stereo_3800(APEContext *ctx, int count)
     APEPredictor *p = &ctx->predictor;
     int32_t *decoded0 = ctx->decoded[0];
     int32_t *decoded1 = ctx->decoded[1];
-    int32_t coeffs[256], delay[256];
     int start = 4, shift = 10;
 
     if (ctx->compression_level == COMPRESSION_LEVEL_HIGH) {
         start = 16;
-        long_filter_high_3800(decoded0, 16, 9, coeffs, delay, count);
-        long_filter_high_3800(decoded1, 16, 9, coeffs, delay, count);
+        long_filter_high_3800(decoded0, 16, 9, count);
+        long_filter_high_3800(decoded1, 16, 9, count);
     } else if (ctx->compression_level == COMPRESSION_LEVEL_EXTRA_HIGH) {
         int order = 128, shift2 = 11;
 
@@ -974,8 +954,8 @@ static void predictor_decode_stereo_3800(APEContext *ctx, int count)
             long_filter_ehigh_3830(decoded1 + order, count - order);
         }
         start = order;
-        long_filter_high_3800(decoded0, order, shift2, coeffs, delay, count);
-        long_filter_high_3800(decoded1, order, shift2, coeffs, delay, count);
+        long_filter_high_3800(decoded0, order, shift2, count);
+        long_filter_high_3800(decoded1, order, shift2, count);
     }
 
     while (count--) {
@@ -1011,12 +991,11 @@ static void predictor_decode_mono_3800(APEContext *ctx, int count)
 {
     APEPredictor *p = &ctx->predictor;
     int32_t *decoded0 = ctx->decoded[0];
-    int32_t coeffs[256], delay[256];
     int start = 4, shift = 10;
 
     if (ctx->compression_level == COMPRESSION_LEVEL_HIGH) {
         start = 16;
-        long_filter_high_3800(decoded0, 16, 9, coeffs, delay, count);
+        long_filter_high_3800(decoded0, 16, 9, count);
     } else if (ctx->compression_level == COMPRESSION_LEVEL_EXTRA_HIGH) {
         int order = 128, shift2 = 11;
 
@@ -1027,7 +1006,7 @@ static void predictor_decode_mono_3800(APEContext *ctx, int count)
             long_filter_ehigh_3830(decoded0 + order, count - order);
         }
         start = order;
-        long_filter_high_3800(decoded0, order, shift2, coeffs, delay, count);
+        long_filter_high_3800(decoded0, order, shift2, count);
     }
 
     while (count--) {
@@ -1401,7 +1380,7 @@ static void ape_unpack_stereo(APEContext *ctx, int count)
     int32_t *decoded0 = ctx->decoded[0];
     int32_t *decoded1 = ctx->decoded[1];
 
-    if (ctx->frameflags & APE_FRAMECODE_STEREO_SILENCE) {
+    if ((ctx->frameflags & APE_FRAMECODE_STEREO_SILENCE) == APE_FRAMECODE_STEREO_SILENCE) {
         /* We are pure silence, so we're done. */
         av_log(ctx->avctx, AV_LOG_DEBUG, "pure silence stereo\n");
         return;
@@ -1457,7 +1436,7 @@ static int ape_decode_frame(AVCodecContext *avctx, void *data,
         }
         if (s->fileversion < 3950) // previous versions overread two bytes
             buf_size += 2;
-        av_fast_malloc(&s->data, &s->data_size, buf_size);
+        av_fast_padded_malloc(&s->data, &s->data_size, buf_size);
         if (!s->data)
             return AVERROR(ENOMEM);
         s->bdsp.bswap_buf((uint32_t *) s->data, (const uint32_t *) buf,
@@ -1480,7 +1459,8 @@ static int ape_decode_frame(AVCodecContext *avctx, void *data,
             }
             s->ptr += offset;
         } else {
-            init_get_bits(&s->gb, s->ptr, (s->data_end - s->ptr) * 8);
+            if ((ret = init_get_bits8(&s->gb, s->ptr, s->data_end - s->ptr)) < 0)
+                return ret;
             if (s->fileversion > 3800)
                 skip_bits_long(&s->gb, offset * 8);
             else
@@ -1492,14 +1472,13 @@ static int ape_decode_frame(AVCodecContext *avctx, void *data,
                    nblocks);
             return AVERROR_INVALIDDATA;
         }
-        s->samples = nblocks;
 
         /* Initialize the frame decoder */
         if (init_frame_decoder(s) < 0) {
             av_log(avctx, AV_LOG_ERROR, "Error reading frame header\n");
             return AVERROR_INVALIDDATA;
         }
-
+        s->samples = nblocks;
     }
 
     if (!s->data) {
@@ -1524,10 +1503,8 @@ static int ape_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = blockstodecode;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     s->error=0;
 
@@ -1571,7 +1548,7 @@ static int ape_decode_frame(AVCodecContext *avctx, void *data,
 
     *got_frame_ptr = 1;
 
-    return (s->samples == 0) ? avpkt->size : 0;
+    return !s->samples ? avpkt->size : 0;
 }
 
 static void ape_flush(AVCodecContext *avctx)
diff --git a/libavcodec/apng.h b/libavcodec/apng.h
new file mode 100644
index 0000000..41249e0
--- /dev/null
+++ b/libavcodec/apng.h
@@ -0,0 +1,41 @@
+/*
+ * APNG common header
+ * Copyright (c) 2014 Benoit Fouet
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * APNG common header
+ */
+
+#ifndef AVCODEC_APNG_H
+#define AVCODEC_APNG_H
+
+enum { /* frame disposal modes; NOTE(review): semantics below are taken from the APNG spec (fcTL dispose_op), confirm against users of this header */
+    APNG_DISPOSE_OP_NONE       = 0, ///< leave the output buffer as is before rendering the next frame
+    APNG_DISPOSE_OP_BACKGROUND = 1, ///< clear the frame's region to fully transparent black
+    APNG_DISPOSE_OP_PREVIOUS   = 2, ///< restore the frame's region to its previous contents
+};
+
+enum { /* frame blending modes; NOTE(review): semantics below are taken from the APNG spec (fcTL blend_op), confirm against users of this header */
+    APNG_BLEND_OP_SOURCE = 0, ///< frame pixels, alpha included, overwrite the output region
+    APNG_BLEND_OP_OVER   = 1, ///< frame is alpha-composited over the existing output region
+};
+
+#endif /* AVCODEC_APNG_H */
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index bd4dd4e..a4ceca7 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -21,8 +21,7 @@ OBJS-$(CONFIG_IDCTDSP)                 += arm/idctdsp_init_arm.o        \
                                           arm/idctdsp_arm.o             \
                                           arm/jrevdct_arm.o             \
                                           arm/simple_idct_arm.o
-OBJS-$(CONFIG_MDCT)                    += arm/mdct_init_arm.o           \
-                                          arm/mdct_fixed_init_arm.o
+OBJS-$(CONFIG_LLAUDDSP)                += arm/lossless_audiodsp_init_arm.o
 OBJS-$(CONFIG_ME_CMP)                  += arm/me_cmp_init_arm.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += arm/mpegaudiodsp_init_arm.o
 OBJS-$(CONFIG_MPEGVIDEO)               += arm/mpegvideo_arm.o
@@ -39,8 +38,8 @@ OBJS-$(CONFIG_VP8DSP)                  += arm/vp8dsp_init_arm.o
 # decoders/encoders
 OBJS-$(CONFIG_AAC_DECODER)             += arm/aacpsdsp_init_arm.o       \
                                           arm/sbrdsp_init_arm.o
-OBJS-$(CONFIG_APE_DECODER)             += arm/apedsp_init_arm.o
-OBJS-$(CONFIG_DCA_DECODER)             += arm/dcadsp_init_arm.o
+OBJS-$(CONFIG_DCA_DECODER)             += arm/synth_filter_init_arm.o
+OBJS-$(CONFIG_HEVC_DECODER)            += arm/hevcdsp_init_arm.o
 OBJS-$(CONFIG_MLP_DECODER)             += arm/mlpdsp_init_arm.o
 OBJS-$(CONFIG_RV40_DECODER)            += arm/rv40dsp_init_arm.o
 OBJS-$(CONFIG_VORBIS_DECODER)          += arm/vorbisdsp_init_arm.o
@@ -89,8 +88,7 @@ VFP-OBJS-$(CONFIG_FMTCONVERT)          += arm/fmtconvert_vfp.o
 VFP-OBJS-$(CONFIG_MDCT)                += arm/mdct_vfp.o
 
 # decoders/encoders
-VFP-OBJS-$(CONFIG_DCA_DECODER)         += arm/dcadsp_vfp.o              \
-                                          arm/synth_filter_vfp.o
+VFP-OBJS-$(CONFIG_DCA_DECODER)         += arm/synth_filter_vfp.o
 
 
 # NEON optimizations
@@ -130,9 +128,12 @@ NEON-OBJS-$(CONFIG_VP8DSP)             += arm/vp8dsp_init_neon.o        \
 # decoders/encoders
 NEON-OBJS-$(CONFIG_AAC_DECODER)        += arm/aacpsdsp_neon.o           \
                                           arm/sbrdsp_neon.o
-NEON-OBJS-$(CONFIG_APE_DECODER)        += arm/apedsp_neon.o
-NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/dcadsp_neon.o             \
-                                          arm/synth_filter_neon.o
+NEON-OBJS-$(CONFIG_LLAUDDSP)           += arm/lossless_audiodsp_neon.o
+NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/synth_filter_neon.o
+NEON-OBJS-$(CONFIG_HEVC_DECODER)       += arm/hevcdsp_init_neon.o       \
+                                          arm/hevcdsp_deblock_neon.o    \
+                                          arm/hevcdsp_idct_neon.o       \
+                                          arm/hevcdsp_qpel_neon.o
 NEON-OBJS-$(CONFIG_RV30_DECODER)       += arm/rv34dsp_neon.o
 NEON-OBJS-$(CONFIG_RV40_DECODER)       += arm/rv34dsp_neon.o            \
                                           arm/rv40dsp_neon.o
diff --git a/libavcodec/arm/aac.h b/libavcodec/arm/aac.h
index 4f143cb..cafa881 100644
--- a/libavcodec/arm/aac.h
+++ b/libavcodec/arm/aac.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/aacpsdsp_init_arm.c b/libavcodec/arm/aacpsdsp_init_arm.c
index 6326376..e04787c 100644
--- a/libavcodec/arm/aacpsdsp_init_arm.c
+++ b/libavcodec/arm/aacpsdsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/aacpsdsp_neon.S b/libavcodec/arm/aacpsdsp_neon.S
index fb00900..a93bbfe 100644
--- a/libavcodec/arm/aacpsdsp_neon.S
+++ b/libavcodec/arm/aacpsdsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/ac3dsp_arm.S b/libavcodec/arm/ac3dsp_arm.S
index ed8eb37..1aea190 100644
--- a/libavcodec/arm/ac3dsp_arm.S
+++ b/libavcodec/arm/ac3dsp_arm.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/ac3dsp_armv6.S b/libavcodec/arm/ac3dsp_armv6.S
index 2028d0b..1d2563d 100644
--- a/libavcodec/arm/ac3dsp_armv6.S
+++ b/libavcodec/arm/ac3dsp_armv6.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/ac3dsp_init_arm.c b/libavcodec/arm/ac3dsp_init_arm.c
index a48353a..a3c32ff 100644
--- a/libavcodec/arm/ac3dsp_init_arm.c
+++ b/libavcodec/arm/ac3dsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,6 +33,14 @@ void ff_float_to_fixed24_neon(int32_t *dst, const float *src, unsigned int len);
 void ff_ac3_extract_exponents_neon(uint8_t *exp, int32_t *coef, int nb_coefs);
 void ff_apply_window_int16_neon(int16_t *dst, const int16_t *src,
                                 const int16_t *window, unsigned n);
+void ff_ac3_sum_square_butterfly_int32_neon(int64_t sum[4],
+                                            const int32_t *coef0,
+                                            const int32_t *coef1,
+                                            int len); /* NEON routine from ac3dsp_neon.S; stores four 64-bit sums into sum[] */
+void ff_ac3_sum_square_butterfly_float_neon(float sum[4],
+                                            const float *coef0,
+                                            const float *coef1,
+                                            int len); /* NEON routine from ac3dsp_neon.S; stores four float sums into sum[] */
 
 void ff_ac3_bit_alloc_calc_bap_armv6(int16_t *mask, int16_t *psd,
                                      int start, int end,
@@ -59,5 +67,7 @@ av_cold void ff_ac3dsp_init_arm(AC3DSPContext *c, int bit_exact)
         c->float_to_fixed24      = ff_float_to_fixed24_neon;
         c->extract_exponents     = ff_ac3_extract_exponents_neon;
         c->apply_window_int16    = ff_apply_window_int16_neon;
+        c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_neon;
+        c->sum_square_butterfly_float = ff_ac3_sum_square_butterfly_float_neon;
     }
 }
diff --git a/libavcodec/arm/ac3dsp_neon.S b/libavcodec/arm/ac3dsp_neon.S
index f97b190..89d0ae8 100644
--- a/libavcodec/arm/ac3dsp_neon.S
+++ b/libavcodec/arm/ac3dsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -131,3 +131,47 @@ function ff_apply_window_int16_neon, export=1
 
         pop             {r4,pc}
 endfunc
+
+function ff_ac3_sum_square_butterfly_int32_neon, export=1
+        vmov.i64        q0,  #0                 @ q0: 64-bit sums of squares of the [r1] stream
+        vmov.i64        q1,  #0                 @ q1: 64-bit sums of squares of the [r2] stream
+        vmov.i64        q2,  #0                 @ q2: 64-bit sums of squares of (a + b)
+        vmov.i64        q3,  #0                 @ q3: 64-bit sums of squares of (a - b)
+1:
+        vld1.32         {d16},    [r1]!         @ a = two 32-bit values from [r1], r1 advanced (presumably coef0 of the C prototype)
+        vld1.32         {d17},    [r2]!         @ b = two 32-bit values from [r2], r2 advanced (presumably coef1)
+        vadd.s32        d18, d16, d17           @ d18 = a + b (32-bit lanes)
+        vsub.s32        d19, d16, d17           @ d19 = a - b (32-bit lanes)
+        vmlal.s32       q0,  d16, d16           @ q0 += a * a, widened to 64 bits per lane
+        vmlal.s32       q1,  d17, d17           @ q1 += b * b
+        vmlal.s32       q2,  d18, d18           @ q2 += (a + b)^2
+        vmlal.s32       q3,  d19, d19           @ q3 += (a - b)^2
+        subs            r3,  r3,  #2            @ two elements consumed per pass (r3 presumably = len)
+        bgt             1b                      @ repeat while r3 > 0; the body always runs at least once
+        vadd.s64        d0,  d0,  d1            @ d0 = sum of both lanes of q0
+        vadd.s64        d1,  d2,  d3            @ d1 = sum of both lanes of q1
+        vadd.s64        d2,  d4,  d5            @ d2 = sum of both lanes of q2
+        vadd.s64        d3,  d6,  d7            @ d3 = sum of both lanes of q3
+        vst1.64         {q0-q1},  [r0]          @ store the four 64-bit totals (d0..d3) to [r0]
+        bx              lr
+endfunc
+
+function ff_ac3_sum_square_butterfly_float_neon, export=1
+        vmov.f32        q0,  #0.0               @ d0/d1: per-lane sums of squares of the [r1] and [r2] streams
+        vmov.f32        q1,  #0.0               @ d2/d3: per-lane sums of squares of (a + b) and (a - b)
+1:
+        vld1.32         {d16},    [r1]!         @ a = two floats from [r1], r1 advanced (presumably coef0 of the C prototype)
+        vld1.32         {d17},    [r2]!         @ b = two floats from [r2], r2 advanced (presumably coef1)
+        vadd.f32        d18, d16, d17           @ d18 = a + b
+        vsub.f32        d19, d16, d17           @ d19 = a - b
+        vmla.f32        d0,  d16, d16           @ d0 += a * a
+        vmla.f32        d1,  d17, d17           @ d1 += b * b
+        vmla.f32        d2,  d18, d18           @ d2 += (a + b)^2
+        vmla.f32        d3,  d19, d19           @ d3 += (a - b)^2
+        subs            r3,  r3,  #2            @ two elements consumed per pass (r3 presumably = len)
+        bgt             1b                      @ repeat while r3 > 0; the body always runs at least once
+        vpadd.f32       d0,  d0,  d1            @ d0 = { d0[0] + d0[1], d1[0] + d1[1] }
+        vpadd.f32       d1,  d2,  d3            @ d1 = { d2[0] + d2[1], d3[0] + d3[1] }
+        vst1.32         {q0},     [r0]          @ store the four float totals (d0, d1) to [r0]
+        bx              lr
+endfunc
diff --git a/libavcodec/arm/asm-offsets.h b/libavcodec/arm/asm-offsets.h
index 0ea2f04..a2174b0 100644
--- a/libavcodec/arm/asm-offsets.h
+++ b/libavcodec/arm/asm-offsets.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/audiodsp_arm.h b/libavcodec/arm/audiodsp_arm.h
index e97e804..213660d 100644
--- a/libavcodec/arm/audiodsp_arm.h
+++ b/libavcodec/arm/audiodsp_arm.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/audiodsp_init_arm.c b/libavcodec/arm/audiodsp_init_arm.c
index ea9ec3c..74aa52a 100644
--- a/libavcodec/arm/audiodsp_init_arm.c
+++ b/libavcodec/arm/audiodsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * ARM optimized audio functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/audiodsp_init_neon.c b/libavcodec/arm/audiodsp_init_neon.c
index af53272..f7bd162 100644
--- a/libavcodec/arm/audiodsp_init_neon.c
+++ b/libavcodec/arm/audiodsp_init_neon.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised audio functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/audiodsp_neon.S b/libavcodec/arm/audiodsp_neon.S
index dfb998d..ab32cef 100644
--- a/libavcodec/arm/audiodsp_neon.S
+++ b/libavcodec/arm/audiodsp_neon.S
@@ -2,20 +2,20 @@
  * ARM NEON optimised audio functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/blockdsp_arm.h b/libavcodec/arm/blockdsp_arm.h
index 6d9c2c3..59ebeb8 100644
--- a/libavcodec/arm/blockdsp_arm.h
+++ b/libavcodec/arm/blockdsp_arm.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -21,6 +21,6 @@
 
 #include "libavcodec/blockdsp.h"
 
-void ff_blockdsp_init_neon(BlockDSPContext *c, unsigned high_bit_depth);
+void ff_blockdsp_init_neon(BlockDSPContext *c);
 
 #endif /* AVCODEC_ARM_BLOCKDSP_ARM_H */
diff --git a/libavcodec/arm/blockdsp_init_arm.c b/libavcodec/arm/blockdsp_init_arm.c
index a0c0367..2080d52 100644
--- a/libavcodec/arm/blockdsp_init_arm.c
+++ b/libavcodec/arm/blockdsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * ARM optimized block operations
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,10 +24,10 @@
 #include "libavcodec/blockdsp.h"
 #include "blockdsp_arm.h"
 
-av_cold void ff_blockdsp_init_arm(BlockDSPContext *c, unsigned high_bit_depth)
+av_cold void ff_blockdsp_init_arm(BlockDSPContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
 
     if (have_neon(cpu_flags))
-        ff_blockdsp_init_neon(c, high_bit_depth);
+        ff_blockdsp_init_neon(c);
 }
diff --git a/libavcodec/arm/blockdsp_init_neon.c b/libavcodec/arm/blockdsp_init_neon.c
index 5081cf0..87c0d6d 100644
--- a/libavcodec/arm/blockdsp_init_neon.c
+++ b/libavcodec/arm/blockdsp_init_neon.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised block operations
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,10 +28,8 @@
 void ff_clear_block_neon(int16_t *block);
 void ff_clear_blocks_neon(int16_t *blocks);
 
-av_cold void ff_blockdsp_init_neon(BlockDSPContext *c, unsigned high_bit_depth)
+av_cold void ff_blockdsp_init_neon(BlockDSPContext *c)
 {
-    if (!high_bit_depth) {
-        c->clear_block  = ff_clear_block_neon;
-        c->clear_blocks = ff_clear_blocks_neon;
-    }
+      c->clear_block  = ff_clear_block_neon;
+      c->clear_blocks = ff_clear_blocks_neon;
 }
diff --git a/libavcodec/arm/blockdsp_neon.S b/libavcodec/arm/blockdsp_neon.S
index 98df2c6..9fc63cb 100644
--- a/libavcodec/arm/blockdsp_neon.S
+++ b/libavcodec/arm/blockdsp_neon.S
@@ -2,20 +2,20 @@
  * ARM NEON optimised block functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h
index 6ff5f1a..fdbf86b 100644
--- a/libavcodec/arm/cabac.h
+++ b/libavcodec/arm/cabac.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -59,12 +59,18 @@ static av_always_inline int get_cabac_inline_arm(CABACContext *c,
         "tst        %[r_c]        , %[r_c]                      \n\t"
         "bne        2f                                          \n\t"
         "ldr        %[r_c]        , [%[c], %[byte]]             \n\t"
+#if UNCHECKED_BITSTREAM_READER
+        "ldrh       %[tmp]        , [%[r_c]]                    \n\t"
+        "add        %[r_c]        , %[r_c]      , #2            \n\t"
+        "str        %[r_c]        , [%[c], %[byte]]             \n\t"
+#else
         "ldr        %[r_b]        , [%[c], %[end]]              \n\t"
         "ldrh       %[tmp]        , [%[r_c]]                    \n\t"
         "cmp        %[r_c]        , %[r_b]                      \n\t"
         "itt        lt                                          \n\t"
         "addlt      %[r_c]        , %[r_c]      , #2            \n\t"
         "strlt      %[r_c]        , [%[c], %[byte]]             \n\t"
+#endif
         "sub        %[r_c]        , %[low]      , #1            \n\t"
         "add        %[r_b]        , %[tables]   , %[norm_off]   \n\t"
         "eor        %[r_c]        , %[low]      , %[r_c]        \n\t"
diff --git a/libavcodec/arm/dca.h b/libavcodec/arm/dca.h
index 4aed576..ae4b730 100644
--- a/libavcodec/arm/dca.h
+++ b/libavcodec/arm/dca.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,10 +24,9 @@
 #include <stdint.h>
 
 #include "config.h"
-#include "libavcodec/dcadsp.h"
 #include "libavcodec/mathops.h"
 
-#if HAVE_ARMV6_INLINE && AV_GCC_VERSION_AT_LEAST(4,4)
+#if HAVE_ARMV6_INLINE && AV_GCC_VERSION_AT_LEAST(4,4) && !CONFIG_THUMB
 
 #define decode_blockcodes decode_blockcodes
 static inline int decode_blockcodes(int code1, int code2, int levels,
@@ -35,46 +34,44 @@ static inline int decode_blockcodes(int code1, int code2, int levels,
 {
     int32_t v0, v1, v2, v3, v4, v5;
 
-    __asm__ ("smmul   %8,  %14, %18           \n"
-             "smmul   %11, %15, %18           \n"
-             "smlabb  %14, %8,  %17, %14      \n"
-             "smlabb  %15, %11, %17, %15      \n"
-             "smmul   %9,  %8,  %18           \n"
-             "smmul   %12, %11, %18           \n"
-             "sub     %14, %14, %16, lsr #1   \n"
-             "sub     %15, %15, %16, lsr #1   \n"
-             "smlabb  %8,  %9,  %17, %8       \n"
-             "smlabb  %11, %12, %17, %11      \n"
-             "smmul   %10, %9,  %18           \n"
-             "smmul   %13, %12, %18           \n"
-             "str     %14, %0                 \n"
-             "str     %15, %4                 \n"
-             "sub     %8,  %8,  %16, lsr #1   \n"
-             "sub     %11, %11, %16, lsr #1   \n"
-             "smlabb  %9,  %10, %17, %9       \n"
-             "smlabb  %12, %13, %17, %12      \n"
-             "smmul   %14, %10, %18           \n"
-             "smmul   %15, %13, %18           \n"
-             "str     %8,  %1                 \n"
-             "str     %11, %5                 \n"
-             "sub     %9,  %9,  %16, lsr #1   \n"
-             "sub     %12, %12, %16, lsr #1   \n"
-             "smlabb  %10, %14, %17, %10      \n"
-             "smlabb  %13, %15, %17, %13      \n"
-             "str     %9,  %2                 \n"
-             "str     %12, %6                 \n"
-             "sub     %10, %10, %16, lsr #1   \n"
-             "sub     %13, %13, %16, lsr #1   \n"
-             "str     %10, %3                 \n"
-             "str     %13, %7                 \n"
-             : "=m"(values[0]), "=m"(values[1]),
-               "=m"(values[2]), "=m"(values[3]),
-               "=m"(values[4]), "=m"(values[5]),
-               "=m"(values[6]), "=m"(values[7]),
-               "=&r"(v0), "=&r"(v1), "=&r"(v2),
+    __asm__ ("smmul   %0,  %6,  %10           \n"
+             "smmul   %3,  %7,  %10           \n"
+             "smlabb  %6,  %0,  %9,  %6       \n"
+             "smlabb  %7,  %3,  %9,  %7       \n"
+             "smmul   %1,  %0,  %10           \n"
+             "smmul   %4,  %3,  %10           \n"
+             "sub     %6,  %6,  %8,  lsr #1   \n"
+             "sub     %7,  %7,  %8,  lsr #1   \n"
+             "smlabb  %0,  %1,  %9,  %0       \n"
+             "smlabb  %3,  %4,  %9,  %3       \n"
+             "smmul   %2,  %1,  %10           \n"
+             "smmul   %5,  %4,  %10           \n"
+             "str     %6,  [%11, #0]          \n"
+             "str     %7,  [%11, #16]         \n"
+             "sub     %0,  %0,  %8,  lsr #1   \n"
+             "sub     %3,  %3,  %8,  lsr #1   \n"
+             "smlabb  %1,  %2,  %9,  %1       \n"
+             "smlabb  %4,  %5,  %9,  %4       \n"
+             "smmul   %6,  %2,  %10           \n"
+             "smmul   %7,  %5,  %10           \n"
+             "str     %0,  [%11, #4]          \n"
+             "str     %3,  [%11, #20]         \n"
+             "sub     %1,  %1,  %8,  lsr #1   \n"
+             "sub     %4,  %4,  %8,  lsr #1   \n"
+             "smlabb  %2,  %6,  %9,  %2       \n"
+             "smlabb  %5,  %7,  %9,  %5       \n"
+             "str     %1,  [%11, #8]          \n"
+             "str     %4,  [%11, #24]         \n"
+             "sub     %2,  %2,  %8,  lsr #1   \n"
+             "sub     %5,  %5,  %8,  lsr #1   \n"
+             "str     %2,  [%11, #12]         \n"
+             "str     %5,  [%11, #28]         \n"
+             : "=&r"(v0), "=&r"(v1), "=&r"(v2),
                "=&r"(v3), "=&r"(v4), "=&r"(v5),
                "+&r"(code1), "+&r"(code2)
-             : "r"(levels - 1), "r"(-levels), "r"(ff_inverse[levels]));
+             : "r"(levels - 1), "r"(-levels),
+               "r"(ff_inverse[levels]), "r"(values)
+             : "memory");
 
     return code1 | code2;
 }
diff --git a/libavcodec/arm/dcadsp_neon.S b/libavcodec/arm/dcadsp_neon.S
deleted file mode 100644
index 735c4c2..0000000
--- a/libavcodec/arm/dcadsp_neon.S
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-function ff_dca_lfe_fir0_neon, export=1
-        push            {r4-r6,lr}
-        mov             r3,  #32                @ decifactor
-        mov             r6,  #256/32
-        b               dca_lfe_fir
-endfunc
-
-function ff_dca_lfe_fir1_neon, export=1
-        push            {r4-r6,lr}
-        mov             r3,  #64                @ decifactor
-        mov             r6,  #256/64
-dca_lfe_fir:
-        add             r4,  r0,  r3,  lsl #2   @ out2
-        add             r5,  r2,  #256*4-16     @ cf1
-        sub             r1,  r1,  #12
-        mov             lr,  #-16
-1:
-        vmov.f32        q2,  #0.0               @ v0
-        vmov.f32        q3,  #0.0               @ v1
-        mov             r12, r6
-2:
-        vld1.32         {q8},     [r2,:128]!    @ cf0
-        vld1.32         {q9},     [r5,:128], lr @ cf1
-        vld1.32         {q1},     [r1], lr      @ in
-        subs            r12, r12, #4
-        vrev64.32       q10, q8
-        vmla.f32        q3,  q1,  q9
-        vmla.f32        d4,  d2,  d21
-        vmla.f32        d5,  d3,  d20
-        bne             2b
-
-        add             r1,  r1,  r6,  lsl #2
-        subs            r3,  r3,  #1
-        vadd.f32        d4,  d4,  d5
-        vadd.f32        d6,  d6,  d7
-        vpadd.f32       d5,  d4,  d6
-        vst1.32         {d5[0]},  [r0,:32]!
-        vst1.32         {d5[1]},  [r4,:32]!
-        bne             1b
-
-        pop             {r4-r6,pc}
-endfunc
diff --git a/libavcodec/arm/dcadsp_vfp.S b/libavcodec/arm/dcadsp_vfp.S
deleted file mode 100644
index c9114d4..0000000
--- a/libavcodec/arm/dcadsp_vfp.S
+++ /dev/null
@@ -1,476 +0,0 @@
-/*
- * Copyright (c) 2013 RISC OS Open Ltd
- * Author: Ben Avison <bavison@riscosopen.org>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-POUT          .req    a1
-PIN           .req    a2
-PCOEF         .req    a3
-OLDFPSCR      .req    a4
-COUNTER       .req    ip
-
-IN0           .req    s4
-IN1           .req    s5
-IN2           .req    s6
-IN3           .req    s7
-IN4           .req    s0
-IN5           .req    s1
-IN6           .req    s2
-IN7           .req    s3
-COEF0         .req    s8   @ coefficient elements
-COEF1         .req    s9
-COEF2         .req    s10
-COEF3         .req    s11
-COEF4         .req    s12
-COEF5         .req    s13
-COEF6         .req    s14
-COEF7         .req    s15
-ACCUM0        .req    s16  @ double-buffered multiply-accumulate results
-ACCUM4        .req    s20
-POST0         .req    s24  @ do long-latency post-multiply in this vector in parallel
-POST1         .req    s25
-POST2         .req    s26
-POST3         .req    s27
-
-
-.macro inner_loop  decifactor, dir, tail, head
- .ifc "\dir","up"
-  .set X, 0
-  .set Y, 4
- .else
-  .set X, 4*JMAX*4 - 4
-  .set Y, -4
- .endif
- .ifnc "\head",""
-        vldr    COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]
-        vldr    COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
-        vldr    COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
-        vldr    COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
- .endif
- .ifnc "\tail",""
-        vadd.f  POST0, ACCUM0, ACCUM4   @ vector operation
- .endif
- .ifnc "\head",""
-        vmul.f  ACCUM0, COEF0, IN0      @ vector = vector * scalar
-        vldr    COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
-        vldr    COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
-        vldr    COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
- .endif
- .ifnc "\head",""
-        vldr    COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
-   .ifc "\tail",""
-        vmul.f  ACCUM4, COEF4, IN1      @ vector operation
-   .endif
-        vldr    COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
-        vldr    COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
-   .ifnc "\tail",""
-        vmul.f  ACCUM4, COEF4, IN1      @ vector operation
-   .endif
-        vldr    COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
-        vldr    COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
- .endif
- .ifnc "\tail",""
-        vstmia  POUT!, {POST0-POST3}
- .endif
- .ifnc "\head",""
-        vmla.f  ACCUM0, COEF0, IN2      @ vector = vector * scalar
-        vldr    COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
-        vldr    COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
-        vldr    COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
-        vldr    COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
-        vmla.f  ACCUM4, COEF4, IN3      @ vector = vector * scalar
-  .if \decifactor == 32
-        vldr    COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]
-        vldr    COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
-        vldr    COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
-        vldr    COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
-        vmla.f  ACCUM0, COEF0, IN4      @ vector = vector * scalar
-        vldr    COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
-        vldr    COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
-        vldr    COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
-        vldr    COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
-        vmla.f  ACCUM4, COEF4, IN5      @ vector = vector * scalar
-        vldr    COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
-        vldr    COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
-        vldr    COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
-        vldr    COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
-        vmla.f  ACCUM0, COEF0, IN6      @ vector = vector * scalar
-        vldr    COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
-        vldr    COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
-        vldr    COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
-        vldr    COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
-        vmla.f  ACCUM4, COEF4, IN7      @ vector = vector * scalar
-  .endif
- .endif
-.endm
-
-.macro dca_lfe_fir  decifactor
-function ff_dca_lfe_fir\decifactor\()_vfp, export=1
-        fmrx    OLDFPSCR, FPSCR
-        ldr     ip, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
-        fmxr    FPSCR, ip
-        vldr    IN0, [PIN, #-0*4]
-        vldr    IN1, [PIN, #-1*4]
-        vldr    IN2, [PIN, #-2*4]
-        vldr    IN3, [PIN, #-3*4]
- .if \decifactor == 32
-  .set JMAX, 8
-        vpush   {s16-s31}
-        vldr    IN4, [PIN, #-4*4]
-        vldr    IN5, [PIN, #-5*4]
-        vldr    IN6, [PIN, #-6*4]
-        vldr    IN7, [PIN, #-7*4]
- .else
-  .set JMAX, 4
-        vpush   {s16-s27}
- .endif
-
-        mov     COUNTER, #\decifactor/4 - 1
-        inner_loop  \decifactor, up,, head
-1:      add     PCOEF, PCOEF, #4*JMAX*4
-        subs    COUNTER, COUNTER, #1
-        inner_loop  \decifactor, up, tail, head
-        bne     1b
-        inner_loop  \decifactor, up, tail
-
-        mov     COUNTER, #\decifactor/4 - 1
-        inner_loop  \decifactor, down,, head
-1:      sub     PCOEF, PCOEF, #4*JMAX*4
-        subs    COUNTER, COUNTER, #1
-        inner_loop  \decifactor, down, tail, head
-        bne     1b
-        inner_loop  \decifactor, down, tail
-
- .if \decifactor == 32
-        vpop    {s16-s31}
- .else
-        vpop    {s16-s27}
- .endif
-        fmxr    FPSCR, OLDFPSCR
-        bx      lr
-endfunc
-.endm
-
-        dca_lfe_fir  64
- .ltorg
-        dca_lfe_fir  32
-
-        .unreq  POUT
-        .unreq  PIN
-        .unreq  PCOEF
-        .unreq  OLDFPSCR
-        .unreq  COUNTER
-
-        .unreq  IN0
-        .unreq  IN1
-        .unreq  IN2
-        .unreq  IN3
-        .unreq  IN4
-        .unreq  IN5
-        .unreq  IN6
-        .unreq  IN7
-        .unreq  COEF0
-        .unreq  COEF1
-        .unreq  COEF2
-        .unreq  COEF3
-        .unreq  COEF4
-        .unreq  COEF5
-        .unreq  COEF6
-        .unreq  COEF7
-        .unreq  ACCUM0
-        .unreq  ACCUM4
-        .unreq  POST0
-        .unreq  POST1
-        .unreq  POST2
-        .unreq  POST3
-
-
-IN      .req    a1
-SBACT   .req    a2
-OLDFPSCR .req   a3
-IMDCT   .req    a4
-WINDOW  .req    v1
-OUT     .req    v2
-BUF     .req    v3
-SCALEINT .req   v4 @ only used in softfp case
-COUNT   .req    v5
-
-SCALE   .req    s0
-
-/* Stack layout differs in softfp and hardfp cases:
- *
- * hardfp
- *      fp -> 6 arg words saved by caller
- *            a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
- *            s16-s23 on entry
- *            align 16
- *     buf -> 8*32*4 bytes buffer
- *            s0 on entry
- *      sp -> 3 arg words for callee
- *
- * softfp
- *      fp -> 7 arg words saved by caller
- *            a4,v1-v5,fp,lr on entry
- *            s16-s23 on entry
- *            align 16
- *     buf -> 8*32*4 bytes buffer
- *      sp -> 4 arg words for callee
- */
-
-/* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
- *                                 SynthFilterContext *synth, FFTContext *imdct,
- *                                 float (*synth_buf_ptr)[512],
- *                                 int *synth_buf_offset, float (*synth_buf2)[32],
- *                                 const float (*window)[512], float *samples_out,
- *                                 float (*raXin)[32], float scale);
- */
-function ff_dca_qmf_32_subbands_vfp, export=1
-VFP     push    {a3-a4,v1-v3,v5,fp,lr}
-NOVFP   push    {a4,v1-v5,fp,lr}
-        add     fp, sp, #8*4
-        vpush   {s16-s23}
-        @ The buffer pointed at by raXin isn't big enough for us to do a
-        @ complete matrix transposition as we want to, so allocate an
-        @ alternative buffer from the stack. Align to 4 words for speed.
-        sub     BUF, sp, #8*32*4
-        bic     BUF, BUF, #15
-        mov     sp, BUF
-        ldr     lr, =0x03330000     @ RunFast mode, short vectors of length 4, stride 2
-        fmrx    OLDFPSCR, FPSCR
-        fmxr    FPSCR, lr
-        @ COUNT is used to count down 2 things at once:
-        @ bits 0-4 are the number of word pairs remaining in the output row
-        @ bits 5-31 are the number of words to copy (with possible negation)
-        @   from the source matrix before we start zeroing the remainder
-        mov     COUNT, #(-4 << 5) + 16
-        adds    COUNT, COUNT, SBACT, lsl #5
-        bmi     2f
-1:
-        vldr    s8,  [IN, #(0*8+0)*4]
-        vldr    s10, [IN, #(0*8+1)*4]
-        vldr    s12, [IN, #(0*8+2)*4]
-        vldr    s14, [IN, #(0*8+3)*4]
-        vldr    s16, [IN, #(0*8+4)*4]
-        vldr    s18, [IN, #(0*8+5)*4]
-        vldr    s20, [IN, #(0*8+6)*4]
-        vldr    s22, [IN, #(0*8+7)*4]
-        vneg.f  s8, s8
-        vldr    s9,  [IN, #(1*8+0)*4]
-        vldr    s11, [IN, #(1*8+1)*4]
-        vldr    s13, [IN, #(1*8+2)*4]
-        vldr    s15, [IN, #(1*8+3)*4]
-        vneg.f  s16, s16
-        vldr    s17, [IN, #(1*8+4)*4]
-        vldr    s19, [IN, #(1*8+5)*4]
-        vldr    s21, [IN, #(1*8+6)*4]
-        vldr    s23, [IN, #(1*8+7)*4]
-        vstr    d4,  [BUF, #(0*32+0)*4]
-        vstr    d5,  [BUF, #(1*32+0)*4]
-        vstr    d6,  [BUF, #(2*32+0)*4]
-        vstr    d7,  [BUF, #(3*32+0)*4]
-        vstr    d8,  [BUF, #(4*32+0)*4]
-        vstr    d9,  [BUF, #(5*32+0)*4]
-        vstr    d10, [BUF, #(6*32+0)*4]
-        vstr    d11, [BUF, #(7*32+0)*4]
-        vldr    s9,  [IN, #(3*8+0)*4]
-        vldr    s11, [IN, #(3*8+1)*4]
-        vldr    s13, [IN, #(3*8+2)*4]
-        vldr    s15, [IN, #(3*8+3)*4]
-        vldr    s17, [IN, #(3*8+4)*4]
-        vldr    s19, [IN, #(3*8+5)*4]
-        vldr    s21, [IN, #(3*8+6)*4]
-        vldr    s23, [IN, #(3*8+7)*4]
-        vneg.f  s9, s9
-        vldr    s8,  [IN, #(2*8+0)*4]
-        vldr    s10, [IN, #(2*8+1)*4]
-        vldr    s12, [IN, #(2*8+2)*4]
-        vldr    s14, [IN, #(2*8+3)*4]
-        vneg.f  s17, s17
-        vldr    s16, [IN, #(2*8+4)*4]
-        vldr    s18, [IN, #(2*8+5)*4]
-        vldr    s20, [IN, #(2*8+6)*4]
-        vldr    s22, [IN, #(2*8+7)*4]
-        vstr    d4,  [BUF, #(0*32+2)*4]
-        vstr    d5,  [BUF, #(1*32+2)*4]
-        vstr    d6,  [BUF, #(2*32+2)*4]
-        vstr    d7,  [BUF, #(3*32+2)*4]
-        vstr    d8,  [BUF, #(4*32+2)*4]
-        vstr    d9,  [BUF, #(5*32+2)*4]
-        vstr    d10, [BUF, #(6*32+2)*4]
-        vstr    d11, [BUF, #(7*32+2)*4]
-        add     IN, IN, #4*8*4
-        add     BUF, BUF, #4*4
-        subs    COUNT, COUNT, #(4 << 5) + 2
-        bpl     1b
-2:      @ Now deal with trailing < 4 samples
-        adds    COUNT, COUNT, #3 << 5
-        bmi     4f  @ sb_act was a multiple of 4
-        bics    lr, COUNT, #0x1F
-        bne     3f
-        @ sb_act was n*4+1
-        vldr    s8,  [IN, #(0*8+0)*4]
-        vldr    s10, [IN, #(0*8+1)*4]
-        vldr    s12, [IN, #(0*8+2)*4]
-        vldr    s14, [IN, #(0*8+3)*4]
-        vldr    s16, [IN, #(0*8+4)*4]
-        vldr    s18, [IN, #(0*8+5)*4]
-        vldr    s20, [IN, #(0*8+6)*4]
-        vldr    s22, [IN, #(0*8+7)*4]
-        vneg.f  s8, s8
-        vldr    s9,  zero
-        vldr    s11, zero
-        vldr    s13, zero
-        vldr    s15, zero
-        vneg.f  s16, s16
-        vldr    s17, zero
-        vldr    s19, zero
-        vldr    s21, zero
-        vldr    s23, zero
-        vstr    d4,  [BUF, #(0*32+0)*4]
-        vstr    d5,  [BUF, #(1*32+0)*4]
-        vstr    d6,  [BUF, #(2*32+0)*4]
-        vstr    d7,  [BUF, #(3*32+0)*4]
-        vstr    d8,  [BUF, #(4*32+0)*4]
-        vstr    d9,  [BUF, #(5*32+0)*4]
-        vstr    d10, [BUF, #(6*32+0)*4]
-        vstr    d11, [BUF, #(7*32+0)*4]
-        add     BUF, BUF, #2*4
-        sub     COUNT, COUNT, #1
-        b       4f
-3:      @ sb_act was n*4+2 or n*4+3, so do the first 2
-        vldr    s8,  [IN, #(0*8+0)*4]
-        vldr    s10, [IN, #(0*8+1)*4]
-        vldr    s12, [IN, #(0*8+2)*4]
-        vldr    s14, [IN, #(0*8+3)*4]
-        vldr    s16, [IN, #(0*8+4)*4]
-        vldr    s18, [IN, #(0*8+5)*4]
-        vldr    s20, [IN, #(0*8+6)*4]
-        vldr    s22, [IN, #(0*8+7)*4]
-        vneg.f  s8, s8
-        vldr    s9,  [IN, #(1*8+0)*4]
-        vldr    s11, [IN, #(1*8+1)*4]
-        vldr    s13, [IN, #(1*8+2)*4]
-        vldr    s15, [IN, #(1*8+3)*4]
-        vneg.f  s16, s16
-        vldr    s17, [IN, #(1*8+4)*4]
-        vldr    s19, [IN, #(1*8+5)*4]
-        vldr    s21, [IN, #(1*8+6)*4]
-        vldr    s23, [IN, #(1*8+7)*4]
-        vstr    d4,  [BUF, #(0*32+0)*4]
-        vstr    d5,  [BUF, #(1*32+0)*4]
-        vstr    d6,  [BUF, #(2*32+0)*4]
-        vstr    d7,  [BUF, #(3*32+0)*4]
-        vstr    d8,  [BUF, #(4*32+0)*4]
-        vstr    d9,  [BUF, #(5*32+0)*4]
-        vstr    d10, [BUF, #(6*32+0)*4]
-        vstr    d11, [BUF, #(7*32+0)*4]
-        add     BUF, BUF, #2*4
-        sub     COUNT, COUNT, #(2 << 5) + 1
-        bics    lr, COUNT, #0x1F
-        bne     4f
-        @ sb_act was n*4+3
-        vldr    s8,  [IN, #(2*8+0)*4]
-        vldr    s10, [IN, #(2*8+1)*4]
-        vldr    s12, [IN, #(2*8+2)*4]
-        vldr    s14, [IN, #(2*8+3)*4]
-        vldr    s16, [IN, #(2*8+4)*4]
-        vldr    s18, [IN, #(2*8+5)*4]
-        vldr    s20, [IN, #(2*8+6)*4]
-        vldr    s22, [IN, #(2*8+7)*4]
-        vldr    s9,  zero
-        vldr    s11, zero
-        vldr    s13, zero
-        vldr    s15, zero
-        vldr    s17, zero
-        vldr    s19, zero
-        vldr    s21, zero
-        vldr    s23, zero
-        vstr    d4,  [BUF, #(0*32+0)*4]
-        vstr    d5,  [BUF, #(1*32+0)*4]
-        vstr    d6,  [BUF, #(2*32+0)*4]
-        vstr    d7,  [BUF, #(3*32+0)*4]
-        vstr    d8,  [BUF, #(4*32+0)*4]
-        vstr    d9,  [BUF, #(5*32+0)*4]
-        vstr    d10, [BUF, #(6*32+0)*4]
-        vstr    d11, [BUF, #(7*32+0)*4]
-        add     BUF, BUF, #2*4
-        sub     COUNT, COUNT, #1
-4:      @ Now fill the remainder with 0
-        vldr    s8, zero
-        vldr    s9, zero
-        ands    COUNT, COUNT, #0x1F
-        beq     6f
-5:      vstr    d4, [BUF, #(0*32+0)*4]
-        vstr    d4, [BUF, #(1*32+0)*4]
-        vstr    d4, [BUF, #(2*32+0)*4]
-        vstr    d4, [BUF, #(3*32+0)*4]
-        vstr    d4, [BUF, #(4*32+0)*4]
-        vstr    d4, [BUF, #(5*32+0)*4]
-        vstr    d4, [BUF, #(6*32+0)*4]
-        vstr    d4, [BUF, #(7*32+0)*4]
-        add     BUF, BUF, #2*4
-        subs    COUNT, COUNT, #1
-        bne     5b
-6:
-        fmxr    FPSCR, OLDFPSCR
-        ldr     WINDOW, [fp, #3*4]
-        ldr     OUT, [fp, #4*4]
-        sub     BUF, BUF, #32*4
-NOVFP   ldr     SCALEINT, [fp, #6*4]
-        mov     COUNT, #8
-VFP     vpush   {SCALE}
-VFP     sub     sp, sp, #3*4
-NOVFP   sub     sp, sp, #4*4
-7:
-VFP     ldr     a1, [fp, #-7*4]     @ imdct
-NOVFP   ldr     a1, [fp, #-8*4]
-        ldmia   fp, {a2-a4}
-VFP     stmia   sp, {WINDOW, OUT, BUF}
-NOVFP   stmia   sp, {WINDOW, OUT, BUF, SCALEINT}
-VFP     vldr    SCALE, [sp, #3*4]
-        bl      X(ff_synth_filter_float_vfp)
-        add     OUT, OUT, #32*4
-        add     BUF, BUF, #32*4
-        subs    COUNT, COUNT, #1
-        bne     7b
-
-A       sub     sp, fp, #(8+8)*4
-T       sub     fp, fp, #(8+8)*4
-T       mov     sp, fp
-        vpop    {s16-s23}
-VFP     pop     {a3-a4,v1-v3,v5,fp,pc}
-NOVFP   pop     {a4,v1-v5,fp,pc}
-endfunc
-
-        .unreq  IN
-        .unreq  SBACT
-        .unreq  OLDFPSCR
-        .unreq  IMDCT
-        .unreq  WINDOW
-        .unreq  OUT
-        .unreq  BUF
-        .unreq  SCALEINT
-        .unreq  COUNT
-
-        .unreq  SCALE
-
-        .align 2
-zero:   .word   0
diff --git a/libavcodec/arm/dct-test.c b/libavcodec/arm/dct-test.c
index 70e5c1c..f9076b3 100644
--- a/libavcodec/arm/dct-test.c
+++ b/libavcodec/arm/dct-test.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/fft_fixed_init_arm.c b/libavcodec/arm/fft_fixed_init_arm.c
index 5132b09..11226d6 100644
--- a/libavcodec/arm/fft_fixed_init_arm.c
+++ b/libavcodec/arm/fft_fixed_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,8 @@
 #include "libavcodec/fft.h"
 
 void ff_fft_fixed_calc_neon(FFTContext *s, FFTComplex *z);
+void ff_mdct_fixed_calc_neon(FFTContext *s, FFTSample *o, const FFTSample *i);
+void ff_mdct_fixed_calcw_neon(FFTContext *s, FFTDouble *o, const FFTSample *i);
 
 av_cold void ff_fft_fixed_init_arm(FFTContext *s)
 {
@@ -33,6 +35,16 @@ av_cold void ff_fft_fixed_init_arm(FFTContext *s)
 
     if (have_neon(cpu_flags)) {
         s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
+#if CONFIG_FFT
         s->fft_calc        = ff_fft_fixed_calc_neon;
+#endif
+
+#if CONFIG_MDCT
+        if (!s->inverse && s->nbits >= 3) {
+            s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
+            s->mdct_calc        = ff_mdct_fixed_calc_neon;
+            s->mdct_calcw       = ff_mdct_fixed_calcw_neon;
+        }
+#endif
     }
 }
diff --git a/libavcodec/arm/fft_fixed_neon.S b/libavcodec/arm/fft_fixed_neon.S
index c70a189..2651607 100644
--- a/libavcodec/arm/fft_fixed_neon.S
+++ b/libavcodec/arm/fft_fixed_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/fft_init_arm.c b/libavcodec/arm/fft_init_arm.c
index 4d047ea..331bd65 100644
--- a/libavcodec/arm/fft_init_arm.c
+++ b/libavcodec/arm/fft_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,16 +29,33 @@ void ff_fft_calc_vfp(FFTContext *s, FFTComplex *z);
 void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
 
+void ff_imdct_half_vfp(FFTContext *s, FFTSample *output, const FFTSample *input);
+
+void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+
 av_cold void ff_fft_init_arm(FFTContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
 
     if (have_vfp_vm(cpu_flags)) {
         s->fft_calc     = ff_fft_calc_vfp;
+#if CONFIG_MDCT
+        s->imdct_half   = ff_imdct_half_vfp;
+#endif
     }
 
     if (have_neon(cpu_flags)) {
+#if CONFIG_FFT
         s->fft_permute  = ff_fft_permute_neon;
         s->fft_calc     = ff_fft_calc_neon;
+#endif
+#if CONFIG_MDCT
+        s->imdct_calc   = ff_imdct_calc_neon;
+        s->imdct_half   = ff_imdct_half_neon;
+        s->mdct_calc    = ff_mdct_calc_neon;
+        s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
+#endif
     }
 }
diff --git a/libavcodec/arm/fft_neon.S b/libavcodec/arm/fft_neon.S
index b161015..48f8dfc 100644
--- a/libavcodec/arm/fft_neon.S
+++ b/libavcodec/arm/fft_neon.S
@@ -7,20 +7,20 @@
  * This algorithm (though not any of the implementation details) is
  * based on libdjbfft by D. J. Bernstein.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/fft_vfp.S b/libavcodec/arm/fft_vfp.S
index c2801fa..ac60132 100644
--- a/libavcodec/arm/fft_vfp.S
+++ b/libavcodec/arm/fft_vfp.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2013 RISC OS Open Ltd
  * Author: Ben Avison <bavison@riscosopen.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/flacdsp_arm.S b/libavcodec/arm/flacdsp_arm.S
index d4441da..f8861c5 100644
--- a/libavcodec/arm/flacdsp_arm.S
+++ b/libavcodec/arm/flacdsp_arm.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/flacdsp_init_arm.c b/libavcodec/arm/flacdsp_init_arm.c
index 0530cf7..564e3dc 100644
--- a/libavcodec/arm/flacdsp_init_arm.c
+++ b/libavcodec/arm/flacdsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,9 +24,9 @@
 void ff_flac_lpc_16_arm(int32_t *samples, const int coeffs[32], int order,
                         int qlevel, int len);
 
-av_cold void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt,
+av_cold void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt, int channels,
                                  int bps)
 {
-    if (bps <= 16)
-        c->lpc = ff_flac_lpc_16_arm;
+    if (CONFIG_FLAC_DECODER)
+        c->lpc16 = ff_flac_lpc_16_arm;
 }
diff --git a/libavcodec/arm/fmtconvert_init_arm.c b/libavcodec/arm/fmtconvert_init_arm.c
index 11396e8..a734dec 100644
--- a/libavcodec/arm/fmtconvert_init_arm.c
+++ b/libavcodec/arm/fmtconvert_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * ARM optimized Format Conversion Utils
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/fmtconvert_neon.S b/libavcodec/arm/fmtconvert_neon.S
index 5d48e3d..738953e 100644
--- a/libavcodec/arm/fmtconvert_neon.S
+++ b/libavcodec/arm/fmtconvert_neon.S
@@ -3,20 +3,20 @@
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2015 Janne Grunau  <janne-libav@jannau.net>b
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/fmtconvert_vfp.S b/libavcodec/arm/fmtconvert_vfp.S
index 4e43f42..b14af45 100644
--- a/libavcodec/arm/fmtconvert_vfp.S
+++ b/libavcodec/arm/fmtconvert_vfp.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2013 RISC OS Open Ltd <bavison@riscosopen.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/g722dsp_init_arm.c b/libavcodec/arm/g722dsp_init_arm.c
index 5edf619..c0e5d8b 100644
--- a/libavcodec/arm/g722dsp_init_arm.c
+++ b/libavcodec/arm/g722dsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2015 Peter Meerwald <pmeerw@pmeerw.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/g722dsp_neon.S b/libavcodec/arm/g722dsp_neon.S
index 5fa3c27..757e53f 100644
--- a/libavcodec/arm/g722dsp_neon.S
+++ b/libavcodec/arm/g722dsp_neon.S
@@ -2,20 +2,20 @@
  * ARM NEON optimised DSP functions for G722 coding
  * Copyright (c) 2015 Peter Meerwald <pmeerw@pmeerw.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/h264chroma_init_arm.c b/libavcodec/arm/h264chroma_init_arm.c
index 6f36553..13f7e0d 100644
--- a/libavcodec/arm/h264chroma_init_arm.c
+++ b/libavcodec/arm/h264chroma_init_arm.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised H.264 chroma functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/h264cmc_neon.S b/libavcodec/arm/h264cmc_neon.S
index ee7011b..77ed3c0 100644
--- a/libavcodec/arm/h264cmc_neon.S
+++ b/libavcodec/arm/h264cmc_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c
index 7afd350..90144d0 100644
--- a/libavcodec/arm/h264dsp_init_arm.c
+++ b/libavcodec/arm/h264dsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -72,11 +72,14 @@ void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
 static av_cold void h264dsp_init_neon(H264DSPContext *c, const int bit_depth,
                                       const int chroma_format_idc)
 {
+#if HAVE_NEON
     if (bit_depth == 8) {
         c->h264_v_loop_filter_luma   = ff_h264_v_loop_filter_luma_neon;
         c->h264_h_loop_filter_luma   = ff_h264_h_loop_filter_luma_neon;
         c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
+        if(chroma_format_idc == 1){
         c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
+        }
 
         c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
         c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
@@ -96,6 +99,7 @@ static av_cold void h264dsp_init_neon(H264DSPContext *c, const int bit_depth,
         c->h264_idct8_dc_add    = ff_h264_idct8_dc_add_neon;
         c->h264_idct8_add4      = ff_h264_idct8_add4_neon;
     }
+#endif // HAVE_NEON
 }
 
 av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth,
@@ -103,8 +107,10 @@ av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth,
 {
     int cpu_flags = av_get_cpu_flags();
 
+#if HAVE_ARMV6
     if (have_setend(cpu_flags))
         c->startcode_find_candidate = ff_startcode_find_candidate_armv6;
+#endif
     if (have_neon(cpu_flags))
         h264dsp_init_neon(c, bit_depth, chroma_format_idc);
 }
diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S
index 5e75565..274a547 100644
--- a/libavcodec/arm/h264dsp_neon.S
+++ b/libavcodec/arm/h264dsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/h264idct_neon.S b/libavcodec/arm/h264idct_neon.S
index f588f3e..4f68bdb 100644
--- a/libavcodec/arm/h264idct_neon.S
+++ b/libavcodec/arm/h264idct_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/h264pred_init_arm.c b/libavcodec/arm/h264pred_init_arm.c
index a445d4d..cc324d7 100644
--- a/libavcodec/arm/h264pred_init_arm.c
+++ b/libavcodec/arm/h264pred_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,6 +49,7 @@ static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
                                         const int bit_depth,
                                         const int chroma_format_idc)
 {
+#if HAVE_NEON
     const int high_depth = bit_depth > 8;
 
     if (high_depth)
@@ -81,6 +82,7 @@ static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
     if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
         codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
         h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_neon;
+#endif // HAVE_NEON
 }
 
 av_cold void ff_h264_pred_init_arm(H264PredContext *h, int codec_id,
diff --git a/libavcodec/arm/h264pred_neon.S b/libavcodec/arm/h264pred_neon.S
index 332f94b..4dc47ba 100644
--- a/libavcodec/arm/h264pred_neon.S
+++ b/libavcodec/arm/h264pred_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/h264qpel_init_arm.c b/libavcodec/arm/h264qpel_init_arm.c
index 01615b5..71237be 100644
--- a/libavcodec/arm/h264qpel_init_arm.c
+++ b/libavcodec/arm/h264qpel_init_arm.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised DSP functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/h264qpel_neon.S b/libavcodec/arm/h264qpel_neon.S
index 6c51250..21336c6 100644
--- a/libavcodec/arm/h264qpel_neon.S
+++ b/libavcodec/arm/h264qpel_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/hevcdsp_arm.h b/libavcodec/arm/hevcdsp_arm.h
new file mode 100644
index 0000000..7735df9
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_arm.h
@@ -0,0 +1,26 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_HEVCDSP_ARM_H
+#define AVCODEC_ARM_HEVCDSP_ARM_H
+
+#include "libavcodec/hevcdsp.h"
+/* Called by ff_hevcdsp_init_arm() when have_neon() accepts the CPU flags. */
+void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth);
+
+#endif /* AVCODEC_ARM_HEVCDSP_ARM_H */
diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S
new file mode 100644
index 0000000..166bddb
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_deblock_neon.S
@@ -0,0 +1,385 @@
+/*
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+.macro hevc_loop_filter_chroma_start
+        ldr      r12, [r2]  // r12 = first 32-bit word at r2 (_tc[0] in the C prototype)
+        ldr      r3, [r2, #4]  // r3 = second word (_tc[1]); the incoming r3 argument is overwritten
+        add      r2, r3, r12
+        cmp      r2, #0
+        it       eq
+        bxeq     lr  // both words sum to zero: return before touching any pixel
+.endm
+
+.macro hevc_loop_filter_chroma_body
+        vsubl.u8  q3, d4, d2  // q3 = d4 - d2, widened to 16 bit
+        vsubl.u8  q11, d18, d19  // q11 = d18 - d19, widened to 16 bit
+        vshl.i16  q3, #2
+        vadd.i16  q11, q3  // q11 = 4 * (d4 - d2) + (d18 - d19)
+        vdup.16   d0, r12  // low half of q0 = first tc word
+        vdup.16   d1, r3  // high half of q0 = second tc word
+        vrshr.s16 q11, q11, #3  // rounding shift: (q11 + 4) >> 3
+        vneg.s16  q12, q0
+        vmovl.u8  q2, d4
+        vmin.s16  q11, q11, q0
+        vmax.s16  q11, q11, q12  // delta clamped to [-tc, tc]
+        vaddw.u8  q1, q11, d2  // q1 = d2 + delta
+        vsub.i16  q2, q11  // q2 = d4 - delta
+        vqmovun.s16 d2, q1  // saturate both results back to unsigned bytes
+        vqmovun.s16 d4, q2
+.endm
+
+.macro hevc_loop_filter_luma_start
+        ldr     r12, [r3]  // r12 = first 32-bit word at r3 (_tc[0] in the C prototype)
+        ldr      r3, [r3, #4]  // r3 = second word (_tc[1])
+        lsl      r3, #16
+        orr      r3, r12  // combine both words so a single compare tests them together
+        cmp      r3, #0
+        it       eq
+        bxeq     lr  // nothing to filter: return before any register is pushed
+        lsr      r3, #16  // r3 = second word again; assumes the first word fits in 16 bits -- TODO confirm
+.endm
+
+.macro hevc_loop_filter_luma_body
+        vmovl.u8  q8, d16  // widen the eight byte vectors d16..d30 to 16-bit lanes in q8..q15
+        vmovl.u8  q9, d18
+        vmovl.u8  q10, d20
+        vmovl.u8  q11, d22
+        vmovl.u8  q12, d24
+        vmovl.u8  q13, d26
+        vmovl.u8  q14, d28
+        vmovl.u8  q15, d30
+
+        vadd.i16   q7, q9, q11
+        vadd.i16   q6, q14, q12
+        vsub.i16   q7, q10
+        vsub.i16   q6, q13
+        vabd.s16   q7, q7, q10  // q7 = |q9 - 2 * q10 + q11|
+        vabd.s16   q6, q6, q13  // q6 = |q14 - 2 * q13 + q12|
+
+
+        vdup.16    q0, r2  // broadcast r2 (_beta in the C prototype)
+        vmov       q4, q7
+        vmov       q5, q6
+        vdup.16    d4, r12  // d4 = first tc word, loaded by hevc_loop_filter_luma_start
+        vtrn.16    q7, q4
+        vtrn.16    q6, q5
+
+        vshl.u64   q7, #32
+        vshr.u64   q4, #32
+        vshl.u64   q6, #32
+        vshr.u64   q5, #32
+        vshr.u64   q7, #32
+        vshr.u64   q6, #32
+        vshl.u64   q5, #32
+        vshl.u64   q4, #32
+        vorr       q6, q5
+        vorr       q7, q4
+        vdup.16    d5, r3  // d5 = second tc word
+        vadd.i16   q5, q7, q6
+
+        vmov       q4, q5
+        vmov       q3, q5
+        vtrn.32    q3, q4
+
+        vadd.i16   q4, q3
+
+        vshl.s16   q5, q5, #1
+        vcgt.s16   q3, q0, q4  // per-lane mask: broadcast r2 value > q4
+
+        vmovn.i16  d6, q3  // d6 is the low half of q3, so this rewrites part of q3
+        vshr.s16   q1, q0, #2
+        vmovn.i16  d6, q3  // intentional second narrowing of the already narrowed mask; do not remove
+        vcgt.s16   q5, q1, q5
+        vmov       r7, s12  // s12 = low 32 bits of d6
+        cmp        r7, #0
+        beq        bypasswrite  // mask empty: jump to the shared label defined in ff_hevc_h_loop_filter_luma_neon (restore and return, no stores)
+
+        vpadd.i32  d0, d14, d12
+        vpadd.i32  d1, d15, d13
+        vmov       q4, q2
+        vshl.s16   q2, #2
+        vshr.s16   q1, q1, #1
+        vrhadd.s16 q2, q4
+
+        vabd.s16   q7, q8, q11
+        vaba.s16   q7, q15, q12
+
+        vmovn.i32  d0, q0
+        vmov       r5, r6, s0, s1  // r5, r6 are consumed by the vdup.16 d2/d3 lines after weakfilter
+        vcgt.s16   q6, q1, q7
+        vand       q5, q5, q6
+        vabd.s16   q7, q11, q12
+        vcgt.s16   q6, q2, q7
+        vand       q5, q5, q6
+
+        vmov       q2, q5
+        vtrn.s16   q5, q2
+        vshr.u64   q2, #32
+        vshl.u64   q5, #32
+        vshl.u64   q2, #32
+        vshr.u64   q5, #32
+        vorr       q5, q2
+
+        vmov       q2, q5
+        vshl.i16   q7, q4, #1
+        vtrn.32    q2, q5
+        vand       q5, q2
+        vneg.s16   q6, q7
+        vmovn.i16  d4, q5
+        vmovn.i16  d4, q2  // d4 is the low half of q2: same two-step narrowing as for d6 above
+        vmov       r8, s8  // s8 = low 32 bits of d4
+
+        and        r9, r8, r7
+        cmp        r9, #0
+        beq        weakfilter_\@  // r8 & r7 == 0: skip the block below (presumably the strong filter -- TODO confirm)
+
+        vadd.i16  q2, q11, q12
+        vadd.i16  q4, q9, q8
+        vadd.i16  q1, q2, q10
+        vdup.16   d10, r9  // q5 (d10/d11) becomes the select mask for the vbit inserts below
+        vadd.i16  q0, q1, q9
+        vshl.i16  q4, #1
+        lsr        r9, #16
+        vadd.i16  q1, q0
+        vrshr.s16 q3, q0, #2
+        vadd.i16  q1, q13
+        vadd.i16  q4, q0
+        vsub.i16  q3, q10
+        vrshr.s16 q1, #3
+        vrshr.s16 q4, #3
+        vmax.s16  q3, q6
+        vsub.i16  q1, q11
+        vsub.i16  q4, q9
+        vmin.s16  q3, q7
+        vmax.s16  q4, q6
+        vmax.s16  q1, q6
+        vadd.i16  q3, q10
+        vmin.s16  q4, q7
+        vmin.s16  q1, q7
+        vdup.16   d11, r9
+        vadd.i16  q4, q9
+        vadd.i16  q1, q11
+        vbit      q9, q4, q5
+        vadd.i16  q4, q2, q13
+        vbit      q11, q1, q5
+        vadd.i16  q0, q4, q14
+        vadd.i16  q2, q15, q14
+        vadd.i16  q4, q0
+
+        vshl.i16  q2, #1
+        vadd.i16  q4, q10
+        vbit      q10, q3, q5
+        vrshr.s16 q4, #3
+        vadd.i16  q2, q0
+        vrshr.s16 q3, q0, #2
+        vsub.i16  q4, q12
+        vrshr.s16 q2, #3
+        vsub.i16  q3, q13
+        vmax.s16  q4, q6
+        vsub.i16  q2, q14
+        vmax.s16  q3, q6
+        vmin.s16  q4, q7
+        vmax.s16  q2, q6
+        vmin.s16  q3, q7
+        vadd.i16  q4, q12
+        vmin.s16  q2, q7
+        vadd.i16  q3, q13
+        vbit      q12, q4, q5
+        vadd.i16  q2, q14
+        vbit      q13, q3, q5
+        vbit      q14, q2, q5
+
+weakfilter_\@:
+        mvn       r8, r8
+        and       r9, r8, r7  // r9 = ~r8 & r7
+        cmp       r9, #0
+        beq       ready_\@  // no bits left: go straight to the narrowing at ready
+
+        vdup.16    q4, r2
+
+        vdup.16   d10, r9
+        lsr       r9, #16
+        vmov       q1, q4
+        vdup.16   d11, r9
+        vshr.s16   q1, #1
+        vsub.i16  q2, q12, q11
+        vadd.i16   q4, q1
+        vshl.s16  q0, q2, #3
+        vshr.s16   q4, #3
+        vadd.i16  q2, q0
+        vsub.i16  q0, q13, q10
+        vsub.i16  q2, q0
+        vshl.i16  q0, q0, #1
+        vsub.i16  q2, q0
+        vshl.s16  q1, q7, 2
+        vrshr.s16 q2, q2, #4
+        vadd.i16  q1, q7
+        vabs.s16  q3, q2
+        vshr.s16  q6, q6, #1
+        vcgt.s16  q1, q1, q3
+        vand      q5, q1
+        vshr.s16  q7, q7, #1
+        vmax.s16  q2, q2, q6
+        vmin.s16  q2, q2, q7
+
+        vshr.s16  q7, q7, #1
+        vrhadd.s16 q3, q9, q11
+        vneg.s16  q6, q7
+        vsub.s16  q3, q10
+        vdup.16   d2, r5
+        vhadd.s16 q3, q2
+        vdup.16   d3, r6
+        vmax.s16  q3, q3, q6
+        vcgt.s16  q1, q4, q1
+        vmin.s16  q3, q3, q7
+        vand      q1, q5
+        vadd.i16  q3, q10
+        lsr       r5, #16
+        lsr       r6, #16
+        vbit      q10, q3, q1
+
+        vrhadd.s16 q3, q14, q12
+        vdup.16   d2, r5
+        vsub.s16  q3, q13
+        vdup.16   d3, r6
+        vhsub.s16 q3, q2
+        vcgt.s16  q1, q4, q1
+        vmax.s16  q3, q3, q6
+        vand      q1, q5
+        vmin.s16  q3, q3, q7
+        vadd.i16  q3, q13
+        vbit      q13, q3, q1
+        vadd.i16  q0, q11, q2
+        vsub.i16  q4, q12, q2
+        vbit      q11, q0, q5
+        vbit      q12, q4, q5
+
+ready_\@:
+        vqmovun.s16 d16, q8  // saturate q8..q15 back to unsigned bytes in d16..d30
+        vqmovun.s16 d18, q9
+        vqmovun.s16 d20, q10
+        vqmovun.s16 d22, q11
+        vqmovun.s16 d24, q12
+        vqmovun.s16 d26, q13
+        vqmovun.s16 d28, q14
+        vqmovun.s16 d30, q15
+.endm
+
+function ff_hevc_v_loop_filter_luma_neon, export=1
+        hevc_loop_filter_luma_start  // may return early; r12/r3 hold the two tc words afterwards
+        push     {r5-r11}
+        vpush    {d8-d15}
+        sub      r0, #4  // start 4 bytes before the address passed in r0
+        vld1.8   {d16}, [r0], r1  // load 8 rows of 8 bytes, stepping by r1 (_stride in the C prototype)
+        vld1.8   {d18}, [r0], r1
+        vld1.8   {d20}, [r0], r1
+        vld1.8   {d22}, [r0], r1
+        vld1.8   {d24}, [r0], r1
+        vld1.8   {d26}, [r0], r1
+        vld1.8   {d28}, [r0], r1
+        vld1.8   {d30}, [r0], r1
+        sub      r0, r0, r1, lsl #3  // rewind r0 by the 8 rows just read
+        transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30
+        hevc_loop_filter_luma_body  // may branch to bypasswrite, which lives in ff_hevc_h_loop_filter_luma_neon
+        transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30
+        vst1.8   {d16}, [r0], r1  // write all 8 rows back
+        vst1.8   {d18}, [r0], r1
+        vst1.8   {d20}, [r0], r1
+        vst1.8   {d22}, [r0], r1
+        vst1.8   {d24}, [r0], r1
+        vst1.8   {d26}, [r0], r1
+        vst1.8   {d28}, [r0], r1
+        vst1.8   {d30}, [r0]
+        vpop     {d8-d15}
+        pop      {r5-r11}
+        bx lr
+endfunc
+
+function ff_hevc_h_loop_filter_luma_neon, export=1
+        hevc_loop_filter_luma_start  // may return early; r12/r3 hold the two tc words afterwards
+        push     {r5-r11}
+        vpush    {d8-d15}
+        sub      r0, r0, r1, lsl #2  // start 4 rows before the address passed in r0
+        vld1.8  {d16}, [r0], r1  // load 8 rows of 8 bytes; no transpose is done in this variant
+        vld1.8  {d18}, [r0], r1
+        vld1.8  {d20}, [r0], r1
+        vld1.8  {d22}, [r0], r1
+        vld1.8  {d24}, [r0], r1
+        vld1.8  {d26}, [r0], r1
+        vld1.8  {d28}, [r0], r1
+        vld1.8  {d30}, [r0], r1
+        sub        r0, r0, r1, lsl #3
+        add        r0, r1  // r0 = second loaded row: the first and last rows (d16, d30) are never written back
+        hevc_loop_filter_luma_body
+        vst1.8   {d18}, [r0], r1  // store the six inner rows
+        vst1.8   {d20}, [r0], r1
+        vst1.8   {d22}, [r0], r1
+        vst1.8   {d24}, [r0], r1
+        vst1.8   {d26}, [r0], r1
+        vst1.8   {d28}, [r0]
+bypasswrite:
+        vpop     {d8-d15}  // shared exit: hevc_loop_filter_luma_body branches here from both luma functions
+        pop      {r5-r11}
+        bx lr
+endfunc
+
+function ff_hevc_v_loop_filter_chroma_neon, export=1
+        hevc_loop_filter_chroma_start  // may return early; r12/r3 hold the two tc words afterwards
+        sub      r0, #4  // start 4 bytes before the address passed in r0
+        vld1.8   {d16}, [r0], r1  // load 8 rows of 8 bytes, stepping by r1
+        vld1.8   {d17}, [r0], r1
+        vld1.8   {d18}, [r0], r1
+        vld1.8   {d2},  [r0], r1
+        vld1.8   {d4},  [r0], r1
+        vld1.8   {d19}, [r0], r1
+        vld1.8   {d20}, [r0], r1
+        vld1.8   {d21}, [r0], r1
+        sub      r0, r0, r1, lsl #3  // rewind r0 by the 8 rows just read
+        transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21
+        hevc_loop_filter_chroma_body  // reads d18, d2, d4, d19 and rewrites d2 and d4
+        transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21
+        vst1.8   {d16}, [r0], r1  // write all 8 rows back
+        vst1.8   {d17}, [r0], r1
+        vst1.8   {d18}, [r0], r1
+        vst1.8   {d2},  [r0], r1
+        vst1.8   {d4},  [r0], r1
+        vst1.8   {d19}, [r0], r1
+        vst1.8   {d20}, [r0], r1
+        vst1.8   {d21}, [r0]
+        bx       lr
+endfunc
+
+function ff_hevc_h_loop_filter_chroma_neon, export=1
+        hevc_loop_filter_chroma_start  // may return early; r12/r3 hold the two tc words afterwards
+        sub      r0, r0, r1, lsl #1  // start 2 rows before the address passed in r0
+        vld1.8   {d18}, [r0], r1  // load 4 rows of 8 bytes
+        vld1.8   {d2}, [r0], r1
+        vld1.8   {d4}, [r0], r1
+        vld1.8   {d19}, [r0]
+        sub      r0, r0, r1, lsl #1  // r0 = row that was loaded into d2
+        hevc_loop_filter_chroma_body
+        vst1.8   {d2}, [r0], r1  // only the two middle rows are written back
+        vst1.8   {d4}, [r0]
+        bx       lr
+endfunc
diff --git a/libavcodec/arm/hevcdsp_idct_neon.S b/libavcodec/arm/hevcdsp_idct_neon.S
new file mode 100644
index 0000000..13d540e
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_idct_neon.S
@@ -0,0 +1,465 @@
+/*
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+function ff_hevc_idct_4x4_dc_neon_8, export=1
+        ldrsh       r1, [r0]  // r1 = first coefficient, sign-extended
+        ldr         r2, =0x20
+        add         r1, #1
+        asr         r1, #1
+        add         r1, r2
+        asr         r1, #6  // r1 = (((coeffs[0] + 1) >> 1) + 32) >> 6
+        vdup.16     q0, r1
+        vdup.16     q1, r1
+        vst1.16     {q0, q1}, [r0]  // overwrite 16 int16 values (32 bytes) with that constant
+        bx lr
+endfunc
+
+function ff_hevc_idct_8x8_dc_neon_8, export=1
+        ldrsh       r1, [r0]  // r1 = first coefficient, sign-extended
+        ldr         r2, =0x20
+        add         r1, #1
+        asr         r1, #1
+        add         r1, r2
+        asr         r1, #6  // r1 = (((coeffs[0] + 1) >> 1) + 32) >> 6
+        vdup.16     q8, r1
+        vdup.16     q9, r1
+        vmov.16     q10, q8
+        vmov.16     q11, q8
+        vmov.16     q12, q8
+        vmov.16     q13, q8
+        vmov.16     q14, q8
+        vmov.16     q15, q8
+        vstm        r0, {q8-q15}  // overwrite 64 int16 values (128 bytes) with that constant
+        bx lr
+endfunc
+
+function ff_hevc_idct_16x16_dc_neon_8, export=1
+        ldrsh       r1, [r0]  // r1 = first coefficient, sign-extended
+        ldr         r2, =0x20
+        add         r1, #1
+        asr         r1, #1
+        add         r1, r2
+        asr         r1, #6  // r1 = (((coeffs[0] + 1) >> 1) + 32) >> 6
+        vdup.16     q8, r1
+        vdup.16     q9, r1
+        vmov.16     q10, q8
+        vmov.16     q11, q8
+        vmov.16     q12, q8
+        vmov.16     q13, q8
+        vmov.16     q14, q8
+        vmov.16     q15, q8
+        vstm        r0!, {q8-q15}  // four stores of 128 bytes each = 256 int16 values
+        vstm        r0!, {q8-q15}
+        vstm        r0!, {q8-q15}
+        vstm        r0, {q8-q15}
+        bx lr
+endfunc
+
+function ff_hevc_idct_32x32_dc_neon_8, export=1
+        ldrsh       r1, [r0]  // r1 = first coefficient, sign-extended
+        ldr         r2, =0x20
+        add         r1, #1
+        asr         r1, #1
+        add         r1, r2
+        asr         r1, #6  // r1 = (((coeffs[0] + 1) >> 1) + 32) >> 6
+        mov         r3, #16  // loop count: 16 stores of 128 bytes = 1024 int16 values
+        vdup.16     q8, r1
+        vdup.16     q9, r1
+        vmov.16     q10, q8
+        vmov.16     q11, q8
+        vmov.16     q12, q8
+        vmov.16     q13, q8
+        vmov.16     q14, q8
+        vmov.16     q15, q8
+1:      subs        r3, #1  // flags set here are the ones tested by bne below
+        vstm        r0!, {q8-q15}
+        bne         1b
+        bx lr
+endfunc
+
+function ff_hevc_transform_add_4x4_neon_8, export=1
+        vldm        r1, {q0-q1}  // 16 int16 values from r1 (coeffs in the C prototype)
+        vld1.32     d4[0], [r0], r2  // 4 rows of 4 bytes from r0 (_dst), stepping by r2 (stride)
+        vld1.32     d4[1], [r0], r2
+        vld1.32     d5[0], [r0], r2
+        vld1.32     d5[1], [r0], r2
+        sub         r0, r0, r2, lsl #2  // rewind r0 to the first row
+        vmovl.u8    q8, d4
+        vmovl.u8    q9, d5
+        vqadd.s16   q0, q0, q8  // saturating 16-bit add of the widened bytes and the int16 values
+        vqadd.s16   q1, q1, q9
+        vqmovun.s16 d0, q0  // saturate to unsigned bytes
+        vqmovun.s16 d1, q1
+        vst1.32     d0[0], [r0], r2
+        vst1.32     d0[1], [r0], r2
+        vst1.32     d1[0], [r0], r2
+        vst1.32     d1[1], [r0], r2
+        bx          lr
+endfunc
+
+function ff_hevc_transform_add_8x8_neon_8, export=1
+        mov         r3,   #8  // 8 iterations, one row each; registers are used as in the 4x4 variant (r0 bytes, r1 int16 values, r2 row step)
+1:      subs        r3,   #1  // flags set here are the ones tested by bne below
+        vld1.16     {q0}, [r1]!  // 8 int16 values, r1 advances
+        vld1.8      d16,  [r0]  // 8 bytes of the current row
+        vmovl.u8    q8,   d16
+        vqadd.s16   q0,   q8
+        vqmovun.s16 d0,   q0  // saturate the sums to unsigned bytes
+        vst1.32     d0,   [r0], r2  // store the row and step r0 by r2
+        bne         1b
+        bx          lr
+endfunc
+
+function ff_hevc_transform_add_16x16_neon_8, export=1
+        mov         r3,   #16  // 16 iterations, one 16-byte row each
+1:      subs        r3,   #1  // flags set here are the ones tested by bne below
+        vld1.16     {q0, q1}, [r1]!  // 16 int16 values, r1 advances
+        vld1.8      {q8},  [r0]  // 16 bytes of the current row
+        vmovl.u8    q9,  d16
+        vmovl.u8    q10, d17
+        vqadd.s16   q0,  q9
+        vqadd.s16   q1,  q10
+        vqmovun.s16 d0,  q0  // saturate the sums to unsigned bytes
+        vqmovun.s16 d1,  q1
+        vst1.8      {q0},   [r0], r2  // store the row and step r0 by r2
+        bne         1b
+        bx          lr
+endfunc
+
+function ff_hevc_transform_add_32x32_neon_8, export=1
+        mov         r3,   #32  // 32 iterations, one 32-byte row each
+1:      subs        r3,   #1  // flags set here are the ones tested by bne below
+        vldm        r1!, {q0-q3}  // 32 int16 values, r1 advances
+        vld1.8      {q8, q9},  [r0]  // 32 bytes of the current row
+        vmovl.u8    q10, d16
+        vmovl.u8    q11, d17
+        vmovl.u8    q12, d18
+        vmovl.u8    q13, d19
+        vqadd.s16   q0,  q10
+        vqadd.s16   q1,  q11
+        vqadd.s16   q2,  q12
+        vqadd.s16   q3,  q13
+        vqmovun.s16 d0,  q0  // saturate the sums to unsigned bytes
+        vqmovun.s16 d1,  q1
+        vqmovun.s16 d2,  q2
+        vqmovun.s16 d3,  q3
+        vst1.8     {q0, q1},   [r0], r2  // store the row and step r0 by r2
+        bne         1b
+        bx          lr
+endfunc
+
+.macro  transpose_16b_8x8   r0, r1, r2, r3, r4, r5, r6, r7
+        vtrn.64         \r0, \r4  // presumably an 8x8 transpose of 16-bit elements in eight q registers -- TODO confirm
+        vtrn.64         \r1, \r5  // NOTE(review): no invocation of this macro is visible in this part of the patch
+        vtrn.64         \r2, \r6
+        vtrn.64         \r3, \r7
+        vtrn.32         \r0, \r2
+        vtrn.32         \r1, \r3
+        vtrn.32         \r4, \r6
+        vtrn.32         \r5, \r7
+        vtrn.16         \r0, \r1
+        vtrn.16         \r2, \r3
+        vtrn.16         \r4, \r5
+        vtrn.16         \r6, \r7
+.endm
+
+// transposes a 4x4 block of 16-bit elements held in four registers, in place
+// (ff_hevc_transform_8x8_neon_8 invokes it with d registers, 4 elements each)
+.macro transpose_16b_4x4    r0, r1, r2, r3
+        vtrn.32         \r0, \r2
+        vtrn.32         \r1, \r3
+        vtrn.16         \r0, \r1
+        vtrn.16         \r2, \r3
+.endm
+
+/* expects 74, 29, 55 in the 32-bit lanes d0[0], d0[1], d1[0]; uses q2 and q4 - q9 for temp values */
+/* TODO: reorder */
+.macro tr4_luma_shift r0, r1, r2, r3, shift
+        vaddl.s16   q5, \r0, \r2    // c0 = src0 + src2
+        vaddl.s16   q2, \r2, \r3    // c1 = src2 + src3
+        vsubl.s16   q4, \r0, \r3    // c2 = src0 - src3
+        vmull.s16   q6, \r1, d0[0]  // c3 = 74 * src1
+
+        vaddl.s16   q7, \r0, \r3    // src0 + src3
+        vsubw.s16   q7, q7, \r2     // src0 - src2 + src3
+        vmul.s32    q7, q7, d0[0]   // dst2 = 74 * (src0 - src2 + src3)
+
+        vmul.s32    q8, q5, d0[1]   // 29 * c0
+        vmul.s32    q9, q2, d1[0]   // 55 * c1
+        vadd.s32    q8, q9          // 29 * c0 + 55 * c1
+        vadd.s32    q8, q6          // dst0 = 29 * c0 + 55 * c1 + c3
+
+        vmul.s32    q2, q2, d0[1]   // 29 * c1
+        vmul.s32    q9, q4, d1[0]   // 55 * c2
+        vsub.s32    q9, q2          // 55 * c2 - 29 * c1
+        vadd.s32    q9, q6          // dst1 = 55 * c2 - 29 * c1 + c3
+
+        vmul.s32    q5, q5, d1[0]   // 55 * c0
+        vmul.s32    q4, q4, d0[1]   // 29 * c2
+        vadd.s32    q5, q4          // 55 * c0 + 29 * c2
+        vsub.s32    q5, q6          // dst3 = 55 * c0 + 29 * c2 - c3
+
+        vqrshrn.s32   \r0, q8, \shift  // dst0..dst3: rounding shift right, saturated to 16 bit
+        vqrshrn.s32   \r1, q9, \shift
+        vqrshrn.s32   \r2, q7, \shift
+        vqrshrn.s32   \r3, q5, \shift
+.endm
+
+/* expects 83, 36 in the 16-bit lanes d0[0], d0[1]; leaves 32-bit results in q3 - q6 and clobbers q2 */
+.macro tr4 r0, r1, r2, r3
+        vmull.s16  q4, \r1, d0[0]   // 83 * src1
+        vmull.s16  q6, \r1, d0[1]   // 36 * src1
+        vshll.s16  q2, \r0, #6   // 64 * src0
+        vshll.s16  q3, \r2, #6   // 64 * src2
+        vadd.s32   q5, q2, q3    // 64 * (src0 + src2)     e0
+        vsub.s32   q2, q2, q3    // 64 * (src0 - src2)     e1
+        vmlal.s16  q4, \r3, d0[1]   // 83 * src1 + 36 * src3  o0
+        vmlsl.s16  q6, \r3, d0[0]   // 36 * src1 - 83 * src3  o1
+
+        vsub.s32   q3, q5, q4    // e0 - o0
+        vadd.s32   q4, q5, q4    // e0 + o0
+        vadd.s32   q5, q2, q6    // e1 + o1
+        vsub.s32   q6, q2, q6    // e1 - o1
+.endm
+
+.macro tr4_shift r0, r1, r2, r3, shift
+        vmull.s16  q4, \r1, d0[0]   // 83 * src1
+        vmull.s16  q6, \r1, d0[1]   // 36 * src1
+        vshll.s16  q2, \r0, #6   // 64 * src0
+        vshll.s16  q3, \r2, #6   // 64 * src2
+        vadd.s32   q5, q2, q3    // 64 * (src0 + src2)     e0
+        vsub.s32   q2, q2, q3    // 64 * (src0 - src2)     e1
+        vmlal.s16  q4, \r3, d0[1]   // 83 * src1 + 36 * src3  o0
+        vmlsl.s16  q6, \r3, d0[0]   // 36 * src1 - 83 * src3  o1
+
+        vsub.s32   q3, q5, q4    // e0 - o0
+        vadd.s32   q4, q5, q4    // e0 + o0
+        vadd.s32   q5, q2, q6    // e1 + o1
+        vsub.s32   q6, q2, q6    // e1 - o1
+
+        vqrshrn.s32   \r0, q4, \shift  // dst0 = e0 + o0, rounding shift, saturated to 16 bit
+        vqrshrn.s32   \r1, q5, \shift  // dst1 = e1 + o1
+        vqrshrn.s32   \r2, q6, \shift  // dst2 = e1 - o1
+        vqrshrn.s32   \r3, q3, \shift  // dst3 = e0 - o0
+.endm
+
+function ff_hevc_transform_4x4_neon_8, export=1
+        vpush       {d8-d15}  // tr4_shift clobbers q4 - q6
+        vld1.16     {q14, q15}, [r0]  // coeffs
+        ldr         r3, =0x00240053 // 36 and 83
+        vmov.32     d0[0], r3  // 16-bit lanes: d0[0] = 83, d0[1] = 36; r1 (col_limit in the C prototype) is not read
+
+        tr4_shift d28, d29, d30, d31, #7
+
+        vtrn.16     d28, d29  // these three vtrn transpose the 4x4 block between the two passes
+        vtrn.16     d30, d31
+        vtrn.32     q14, q15
+
+        tr4_shift d28, d29, d30, d31, #12
+
+        vtrn.16     d28, d29  // transpose back
+        vtrn.16     d30, d31
+        vtrn.32     q14, q15
+
+        vst1.16     {q14, q15}, [r0]  // result overwrites the 16 input coefficients
+        vpop        {d8-d15}
+        bx lr
+endfunc
+
+function ff_hevc_transform_luma_4x4_neon_8, export=1
+        vpush       {d8-d15}  // tr4_luma_shift clobbers q4 - q7
+        vld1.16     {q14, q15}, [r0]  // coeffs
+        ldr         r3, =0x4a  // 74
+        vmov.32     d0[0], r3  // the three constants are placed in 32-bit lanes, as tr4_luma_shift expects
+        ldr         r3, =0x1d  // 29
+        vmov.32     d0[1], r3
+        ldr         r3, =0x37  // 55
+        vmov.32     d1[0], r3
+
+        tr4_luma_shift d28, d29, d30, d31, #7
+
+        vtrn.16     d28, d29  // these three vtrn transpose the 4x4 block between the two passes
+        vtrn.16     d30, d31
+        vtrn.32     q14, q15
+
+        tr4_luma_shift d28, d29, d30, d31, #12
+
+        vtrn.16     d28, d29  // transpose back
+        vtrn.16     d30, d31
+        vtrn.32     q14, q15
+        vst1.16     {q14, q15}, [r0]  // result overwrites the 16 input coefficients
+        vpop        {d8-d15}
+        bx lr
+endfunc
+
+.macro tr8_begin in0, in1, in2, in3
+        vmull.s16  q7, \in0, d1[1]   // o_8[0] = 89 * src1
+        vmull.s16  q8, \in0, d1[0]   // o_8[1] = 75 * src1
+        vmull.s16  q9, \in0, d1[3]   // o_8[2] = 50 * src1
+        vmull.s16  q10, \in0, d1[2]  // o_8[3] = 18 * src1
+
+        vmlal.s16  q7, \in1, d1[0]   // 75 * src3
+        vmlsl.s16  q8, \in1, d1[2]   //-18 * src3
+        vmlsl.s16  q9, \in1, d1[1]   //-89 * src3
+        vmlsl.s16  q10, \in1, d1[3]  //-50 * src3
+
+        vmlal.s16  q7, \in2, d1[3]   // 50 * src5
+        vmlsl.s16  q8, \in2, d1[1]   //-89 * src5
+        vmlal.s16  q9, \in2, d1[2]   // 18 * src5
+        vmlal.s16  q10, \in2, d1[0]  // 75 * src5
+
+        vmlal.s16  q7, \in3, d1[2]   // 18 * src7
+        vmlsl.s16  q8, \in3, d1[3]   //-50 * src7
+        vmlal.s16  q9, \in3, d1[0]   // 75 * src7
+        vmlsl.s16  q10, \in3, d1[1]  //-89 * src7
+.endm
+
+.macro tr8_end shift
+        vadd.s32   q1, q4, q7   //  e_8[0] + o_8[0], dst[0]
+        vsub.s32   q4, q4, q7   //  e_8[0] - o_8[0], dst[7]
+
+        vadd.s32   q2, q5, q8   // e_8[1] + o_8[1], dst[1]
+        vsub.s32   q5, q5, q8   // e_8[1] - o_8[1], dst[6]
+
+        vadd.s32   q11, q6, q9  // e_8[2] + o_8[2], dst[2]
+        vsub.s32    q6, q6, q9  // e_8[2] - o_8[2], dst[5]
+
+        vadd.s32   q12, q3, q10 // e_8[3] + o_8[3], dst[3]
+        vsub.s32   q3, q3, q10  // e_8[3] - o_8[3], dst[4]
+        vqrshrn.s32   d2, q1, \shift  // d2 = dst[0]; d2..d9 end up holding dst[0]..dst[7] in order
+        vqrshrn.s32   d3, q2, \shift  // d3 = dst[1]
+        vqrshrn.s32   d4, q11, \shift  // d4 = dst[2]
+        vqrshrn.s32   d5, q12, \shift  // d5 = dst[3]
+        vqrshrn.s32   d6, q3, \shift  // d6 = dst[4]
+        vqrshrn.s32   d7, q6, \shift  // d7 = dst[5]
+        vqrshrn.s32   d9, q4, \shift  // d9 = dst[7]; narrowed before d8 because writing d8 overwrites the low half of q4
+        vqrshrn.s32   d8, q5, \shift  // d8 = dst[6]
+.endm
+
+function ff_hevc_transform_8x8_neon_8, export=1
+        push   {r4-r8}  // NOTE(review): of these only r5 is written below
+        vpush {d8-d15}
+        mov    r5, #16  // row step in bytes: 8 int16 values per row
+
+        adr       r3, tr4f
+        vld1.16   {d0, d1}, [r3]  // d0 = the two tr4f words, d1 = the two tr8f words that follow them
+
+        // left half
+        vld1.16 {d24}, [r0], r5  // first 4 values of each of the 8 rows
+        vld1.16 {d25}, [r0], r5
+        vld1.16 {d26}, [r0], r5
+        vld1.16 {d27}, [r0], r5
+        vld1.16 {d28}, [r0], r5
+        vld1.16 {d29}, [r0], r5
+        vld1.16 {d30}, [r0], r5
+        vld1.16 {d31}, [r0], r5
+        sub      r0, #128  // rewind the 8 * 16 bytes just stepped over
+        tr8_begin d25, d27, d29, d31  // odd rows 1, 3, 5, 7
+        tr4       d24, d26, d28, d30  // even rows 0, 2, 4, 6
+        tr8_end   #7
+        vst1.16 {d2}, [r0], r5
+        vst1.16 {d3}, [r0], r5
+        vst1.16 {d4}, [r0], r5
+        vst1.16 {d5}, [r0], r5
+        vst1.16 {d6}, [r0], r5
+        vst1.16 {d7}, [r0], r5
+        vst1.16 {d8}, [r0], r5
+        vst1.16 {d9}, [r0], r5
+        sub      r0, #128
+        //skip right half if col_limit in r1 is less than 4
+        cmp      r1, #4
+        blt      1f
+        //right half
+        add      r0, #8  // last 4 values of each row
+        vld1.16 {d24}, [r0], r5
+        vld1.16 {d25}, [r0], r5
+        vld1.16 {d26}, [r0], r5
+        vld1.16 {d27}, [r0], r5
+        vld1.16 {d28}, [r0], r5
+        vld1.16 {d29}, [r0], r5
+        vld1.16 {d30}, [r0], r5
+        vld1.16 {d31}, [r0], r5
+        sub      r0, #128
+        tr8_begin d25, d27, d29, d31
+        tr4       d24, d26, d28, d30
+        tr8_end   #7
+        vst1.16 {d2}, [r0], r5
+        vst1.16 {d3}, [r0], r5
+        vst1.16 {d4}, [r0], r5
+        vst1.16 {d5}, [r0], r5
+        vst1.16 {d6}, [r0], r5
+        vst1.16 {d7}, [r0], r5
+        vst1.16 {d8}, [r0], r5
+        vst1.16 {d9}, [r0], r5
+        sub      r0, #136  // 128 for the rows plus the 8 added for the right half: r0 is back at the start
+1:
+        // top half
+        vldm r0, {q12-q15} // coeffs
+        transpose_16b_4x4 d24, d26, d28, d30
+        transpose_16b_4x4 d25, d27, d29, d31
+        tr8_begin d26, d30, d27, d31
+        tr4 d24, d28, d25, d29
+        tr8_end #12
+        transpose_16b_4x4 d2, d3, d4, d5
+        transpose_16b_4x4 d6, d7, d8, d9
+        vswp     d7, d5  // reorder d2..d9 before the contiguous store below
+        vswp     d7, d8
+        vswp     d3, d6
+        vswp     d6, d4
+        vstm r0!, {q1-q4}  // writes 64 bytes and advances r0 to the bottom half
+
+        // bottom half
+        vldm r0, {q12-q15} // coeffs
+        transpose_16b_4x4 d24, d26, d28, d30
+        transpose_16b_4x4 d25, d27, d29, d31
+        tr8_begin d26, d30, d27, d31
+        tr4 d24, d28, d25, d29
+        tr8_end #12
+        transpose_16b_4x4 d2, d3, d4, d5
+        transpose_16b_4x4 d6, d7, d8, d9
+        vswp     d7, d5
+        vswp     d7, d8
+        vswp     d3, d6
+        vswp     d6, d4
+        // the two vst1.16 below are used in place of: vstm r0, {q1-q4}
+        vst1.16 {q1-q2}, [r0]
+        add     r0, #32
+        vst1.16 {q3-q4}, [r0]
+        sub     r0, #32
+        vpop {d8-d15}
+        pop {r4-r8}
+        bx lr
+endfunc
+
+.align 4
+tr4f:
+.word 0x00240053  // d0[1] = 36, d0[0] = 83 (16-bit lanes after vld1.16 {d0, d1}, little-endian halfword order)
+.word 0x00000000
+tr8f:
+.word 0x0059004b  // d1[1] = 89, d1[0] = 75
+.word 0x00320012  // d1[3] = 50, d1[2] = 18
+tr16:
+.word 0x005a0057  // 90, d2[0] = 87 (no load of tr16 is visible in this part of the patch; lane names unverified)
+.word 0x00500046  // 80, d2[2] = 70
+.word 0x0039002b  // 57, 43 (presumably d3[1], d3[0] if tr16 is loaded into d2/d3 -- TODO confirm)
+.word 0x00190009  // 25, 9 (presumably d3[3], d3[2] -- TODO confirm)
diff --git a/libavcodec/arm/hevcdsp_init_arm.c b/libavcodec/arm/hevcdsp_init_arm.c
new file mode 100644
index 0000000..adcc454
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_init_arm.c
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/hevcdsp.h"
+#include "hevcdsp_arm.h"
+
+av_cold void ff_hevcdsp_init_arm(HEVCDSPContext *c, const int bit_depth)
+{
+    int cpu_flags = av_get_cpu_flags();   /* CPU feature flags queried at call time */
+
+    if (have_neon(cpu_flags))             /* hand c and bit_depth to the NEON initialiser only when NEON is reported */
+        ff_hevcdsp_init_neon(c, bit_depth);
+}
diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
new file mode 100644
index 0000000..5591807
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_init_neon.c
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/hevcdsp.h"
+#include "hevcdsp_arm.h"
+
+void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); /* the four loop filters are installed into c->hevc_{v,h}_loop_filter_{luma,chroma} below */
+void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit); /* installed as c->idct[0] */
+void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit); /* installed as c->idct[1] */
+void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs); /* c->idct_dc[0..3]: 4x4, 8x8, 16x16, 32x32 in that order */
+void ff_hevc_idct_8x8_dc_neon_8(int16_t *coeffs);
+void ff_hevc_idct_16x16_dc_neon_8(int16_t *coeffs);
+void ff_hevc_idct_32x32_dc_neon_8(int16_t *coeffs);
+void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs); /* installed as c->idct_4x4_luma */
+void ff_hevc_transform_add_4x4_neon_8(uint8_t *_dst, int16_t *coeffs,
+                                      ptrdiff_t stride); /* c->transform_add[0..3]: same size order as idct_dc */
+void ff_hevc_transform_add_8x8_neon_8(uint8_t *_dst, int16_t *coeffs,
+                                      ptrdiff_t stride);
+void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
+                                      ptrdiff_t stride);
+void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
+                                      ptrdiff_t stride); /* none of these routines is defined in this file */
+
+#define PUT_PIXELS(name) /* declares one put_pixels routine; the caller supplies the ';' */ \
+    void name(int16_t *dst, uint8_t *src, \
+                                ptrdiff_t srcstride, int height, \
+                                intptr_t mx, intptr_t my, int width)
+PUT_PIXELS(ff_hevc_put_pixels_w2_neon_8); /* one routine per block width; installed as c->put_hevc_qpel[0..9][0][0] */
+PUT_PIXELS(ff_hevc_put_pixels_w4_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w6_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w8_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w12_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w16_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w24_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8);
+#undef PUT_PIXELS /* helper macro is local to this declaration list */
+
+static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                   int height, int width); /* indexed [my][mx]; filled in by ff_hevcdsp_init_neon(), [0][0] stays NULL */
+static void (*put_hevc_qpel_uw_neon[4][4])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                   int width, int height, int16_t* src2, ptrdiff_t src2stride); /* note: width precedes height here, unlike the table above */
+void ff_hevc_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+                                   int height, intptr_t mx, intptr_t my, int width); /* dispatches through put_hevc_qpel_neon */
+void ff_hevc_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                   int height, intptr_t mx, intptr_t my, int width); /* dispatches through put_hevc_qpel_uw_neon with src2 = NULL */
+void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                       int16_t *src2,
+                                       int height, intptr_t mx, intptr_t my, int width); /* dispatches through put_hevc_qpel_uw_neon with the caller's src2 */
+#define QPEL_FUNC(name) /* prototype matching the put_hevc_qpel_neon table entries */ \
+    void name(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, \
+                                   int height, int width)
+
+QPEL_FUNC(ff_hevc_put_qpel_v1_neon_8); /* vN: stored at put_hevc_qpel_neon[N][0] */
+QPEL_FUNC(ff_hevc_put_qpel_v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_v3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1_neon_8); /* hM: stored at put_hevc_qpel_neon[0][M] */
+QPEL_FUNC(ff_hevc_put_qpel_h2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1v1_neon_8); /* hMvN: stored at put_hevc_qpel_neon[N][M] */
+QPEL_FUNC(ff_hevc_put_qpel_h1v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1v3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2v3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3v3_neon_8);
+#undef QPEL_FUNC /* helper macro is local to this declaration list */
+
+#define QPEL_FUNC_UW_PIX(name) /* no trailing ';' here: each use below supplies it, as with PUT_PIXELS and QPEL_FUNC */ \
+    void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, \
+                                   int height, intptr_t mx, intptr_t my, int width)
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w4_neon_8); /* installed as c->put_hevc_qpel_uni[x][0][0] for x = 1, 3, 5..9 */
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w8_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w16_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w24_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w32_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w48_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w64_neon_8);
+#undef QPEL_FUNC_UW_PIX /* helper macro is local to this declaration list */
+
+#define QPEL_FUNC_UW(name) /* no trailing ';' here: each use below supplies it, as with PUT_PIXELS and QPEL_FUNC */ \
+    void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, \
+                                   int width, int height, int16_t* src2, ptrdiff_t src2stride)
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_pixels_neon_8); /* declared only; not referenced elsewhere in this file */
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v1_neon_8); /* uw_vN: stored at put_hevc_qpel_uw_neon[N][0] */
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1_neon_8); /* uw_hM: stored at put_hevc_qpel_uw_neon[0][M] */
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v1_neon_8); /* uw_hMvN: stored at put_hevc_qpel_uw_neon[N][M] */
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v3_neon_8);
+#undef QPEL_FUNC_UW /* helper macro is local to this declaration list */
+
+void ff_hevc_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+                                   int height, intptr_t mx, intptr_t my, int width)
+{
+    /* Table is indexed [my][mx]; the destination stride is always MAX_PB_SIZE. */
+    (*put_hevc_qpel_neon[my][mx])(dst, MAX_PB_SIZE, src, srcstride, height, width);
+}
+
+void ff_hevc_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                   int height, intptr_t mx, intptr_t my, int width)
+{
+    /* Table is indexed [my][mx]; uni-prediction passes no second source (NULL, stride 0). */
+    (*put_hevc_qpel_uw_neon[my][mx])(dst, dststride, src, srcstride, width, height, NULL, 0);
+}
+
+void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                       int16_t *src2, int height, intptr_t mx, intptr_t my, int width)
+{
+    /* Table is indexed [my][mx]; src2 is forwarded with a fixed stride of MAX_PB_SIZE. */
+    (*put_hevc_qpel_uw_neon[my][mx])(dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE);
+}
+
+av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
+{
+    if (bit_depth == 8) { /* only 8-bit routines are installed; any other depth leaves c untouched */
+        int x;
+        c->hevc_v_loop_filter_luma     = ff_hevc_v_loop_filter_luma_neon;
+        c->hevc_h_loop_filter_luma     = ff_hevc_h_loop_filter_luma_neon;
+        c->hevc_v_loop_filter_chroma   = ff_hevc_v_loop_filter_chroma_neon;
+        c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_neon;
+        c->idct[0]                     = ff_hevc_transform_4x4_neon_8; /* idct[2] and idct[3] are not overridden */
+        c->idct[1]                     = ff_hevc_transform_8x8_neon_8;
+        c->idct_dc[0]                  = ff_hevc_idct_4x4_dc_neon_8;
+        c->idct_dc[1]                  = ff_hevc_idct_8x8_dc_neon_8;
+        c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_neon_8;
+        c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_neon_8;
+        c->transform_add[0]            = ff_hevc_transform_add_4x4_neon_8;
+        c->transform_add[1]            = ff_hevc_transform_add_8x8_neon_8;
+        c->transform_add[2]            = ff_hevc_transform_add_16x16_neon_8;
+        c->transform_add[3]            = ff_hevc_transform_add_32x32_neon_8;
+        c->idct_4x4_luma               = ff_hevc_transform_luma_4x4_neon_8;
+        put_hevc_qpel_neon[1][0]       = ff_hevc_put_qpel_v1_neon_8; /* file-local table, indexed [my][mx]; [0][0] is never filled in */
+        put_hevc_qpel_neon[2][0]       = ff_hevc_put_qpel_v2_neon_8;
+        put_hevc_qpel_neon[3][0]       = ff_hevc_put_qpel_v3_neon_8;
+        put_hevc_qpel_neon[0][1]       = ff_hevc_put_qpel_h1_neon_8;
+        put_hevc_qpel_neon[0][2]       = ff_hevc_put_qpel_h2_neon_8;
+        put_hevc_qpel_neon[0][3]       = ff_hevc_put_qpel_h3_neon_8;
+        put_hevc_qpel_neon[1][1]       = ff_hevc_put_qpel_h1v1_neon_8;
+        put_hevc_qpel_neon[1][2]       = ff_hevc_put_qpel_h2v1_neon_8;
+        put_hevc_qpel_neon[1][3]       = ff_hevc_put_qpel_h3v1_neon_8;
+        put_hevc_qpel_neon[2][1]       = ff_hevc_put_qpel_h1v2_neon_8;
+        put_hevc_qpel_neon[2][2]       = ff_hevc_put_qpel_h2v2_neon_8;
+        put_hevc_qpel_neon[2][3]       = ff_hevc_put_qpel_h3v2_neon_8;
+        put_hevc_qpel_neon[3][1]       = ff_hevc_put_qpel_h1v3_neon_8;
+        put_hevc_qpel_neon[3][2]       = ff_hevc_put_qpel_h2v3_neon_8;
+        put_hevc_qpel_neon[3][3]       = ff_hevc_put_qpel_h3v3_neon_8;
+        put_hevc_qpel_uw_neon[1][0]      = ff_hevc_put_qpel_uw_v1_neon_8; /* same [my][mx] layout; shared by the uni and bi wrappers */
+        put_hevc_qpel_uw_neon[2][0]      = ff_hevc_put_qpel_uw_v2_neon_8;
+        put_hevc_qpel_uw_neon[3][0]      = ff_hevc_put_qpel_uw_v3_neon_8;
+        put_hevc_qpel_uw_neon[0][1]      = ff_hevc_put_qpel_uw_h1_neon_8;
+        put_hevc_qpel_uw_neon[0][2]      = ff_hevc_put_qpel_uw_h2_neon_8;
+        put_hevc_qpel_uw_neon[0][3]      = ff_hevc_put_qpel_uw_h3_neon_8;
+        put_hevc_qpel_uw_neon[1][1]      = ff_hevc_put_qpel_uw_h1v1_neon_8;
+        put_hevc_qpel_uw_neon[1][2]      = ff_hevc_put_qpel_uw_h2v1_neon_8;
+        put_hevc_qpel_uw_neon[1][3]      = ff_hevc_put_qpel_uw_h3v1_neon_8;
+        put_hevc_qpel_uw_neon[2][1]      = ff_hevc_put_qpel_uw_h1v2_neon_8;
+        put_hevc_qpel_uw_neon[2][2]      = ff_hevc_put_qpel_uw_h2v2_neon_8;
+        put_hevc_qpel_uw_neon[2][3]      = ff_hevc_put_qpel_uw_h3v2_neon_8;
+        put_hevc_qpel_uw_neon[3][1]      = ff_hevc_put_qpel_uw_h1v3_neon_8;
+        put_hevc_qpel_uw_neon[3][2]      = ff_hevc_put_qpel_uw_h2v3_neon_8;
+        put_hevc_qpel_uw_neon[3][3]      = ff_hevc_put_qpel_uw_h3v3_neon_8;
+        for (x = 0; x < 10; x++) { /* x = width index: 2, 4, 6, 8, 12, 16, 24, 32, 48, 64 (see the w* names below) */
+            c->put_hevc_qpel[x][1][0]         = ff_hevc_put_qpel_neon_wrapper; /* NOTE(review): the asm column loops step by 8 and special-case exactly 4, so widths 2 and 6 (x = 0, 2) would not terminate; presumably never requested for qpel - confirm */
+            c->put_hevc_qpel[x][0][1]         = ff_hevc_put_qpel_neon_wrapper;
+            c->put_hevc_qpel[x][1][1]         = ff_hevc_put_qpel_neon_wrapper;
+            c->put_hevc_qpel_uni[x][1][0]     = ff_hevc_put_qpel_uni_neon_wrapper;
+            c->put_hevc_qpel_uni[x][0][1]     = ff_hevc_put_qpel_uni_neon_wrapper;
+            c->put_hevc_qpel_uni[x][1][1]     = ff_hevc_put_qpel_uni_neon_wrapper;
+            c->put_hevc_qpel_bi[x][1][0]      = ff_hevc_put_qpel_bi_neon_wrapper;
+            c->put_hevc_qpel_bi[x][0][1]      = ff_hevc_put_qpel_bi_neon_wrapper;
+            c->put_hevc_qpel_bi[x][1][1]      = ff_hevc_put_qpel_bi_neon_wrapper;
+        }
+        c->put_hevc_qpel[0][0][0]  = ff_hevc_put_pixels_w2_neon_8; /* [0][0] slots bypass the wrappers and use per-width put_pixels routines */
+        c->put_hevc_qpel[1][0][0]  = ff_hevc_put_pixels_w4_neon_8;
+        c->put_hevc_qpel[2][0][0]  = ff_hevc_put_pixels_w6_neon_8;
+        c->put_hevc_qpel[3][0][0]  = ff_hevc_put_pixels_w8_neon_8;
+        c->put_hevc_qpel[4][0][0]  = ff_hevc_put_pixels_w12_neon_8;
+        c->put_hevc_qpel[5][0][0]  = ff_hevc_put_pixels_w16_neon_8;
+        c->put_hevc_qpel[6][0][0]  = ff_hevc_put_pixels_w24_neon_8;
+        c->put_hevc_qpel[7][0][0]  = ff_hevc_put_pixels_w32_neon_8;
+        c->put_hevc_qpel[8][0][0]  = ff_hevc_put_pixels_w48_neon_8;
+        c->put_hevc_qpel[9][0][0]  = ff_hevc_put_pixels_w64_neon_8;
+
+        c->put_hevc_qpel_uni[1][0][0]  = ff_hevc_put_qpel_uw_pixels_w4_neon_8; /* uni [0][0]: nothing is installed for x = 0, 2, 4 (w2, w6, w12) */
+        c->put_hevc_qpel_uni[3][0][0]  = ff_hevc_put_qpel_uw_pixels_w8_neon_8;
+        c->put_hevc_qpel_uni[5][0][0]  = ff_hevc_put_qpel_uw_pixels_w16_neon_8;
+        c->put_hevc_qpel_uni[6][0][0]  = ff_hevc_put_qpel_uw_pixels_w24_neon_8;
+        c->put_hevc_qpel_uni[7][0][0]  = ff_hevc_put_qpel_uw_pixels_w32_neon_8;
+        c->put_hevc_qpel_uni[8][0][0]  = ff_hevc_put_qpel_uw_pixels_w48_neon_8;
+        c->put_hevc_qpel_uni[9][0][0]  = ff_hevc_put_qpel_uw_pixels_w64_neon_8;
+    }
+}
diff --git a/libavcodec/arm/hevcdsp_qpel_neon.S b/libavcodec/arm/hevcdsp_qpel_neon.S
new file mode 100644
index 0000000..86f92cf
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_qpel_neon.S
@@ -0,0 +1,999 @@
+/*
+ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+#define MAX_PB_SIZE #64
+
+.macro regshuffle_d8              // slide the 8-register window d16..d23 down by one register
+    vmov d16, d17                 // the old d16 (oldest row) is dropped
+    vmov d17, d18
+    vmov d18, d19
+    vmov d19, d20
+    vmov d20, d21
+    vmov d21, d22
+    vmov d22, d23                 // d23 is left unchanged; callers reload it with the next row
+.endm
+
+.macro regshuffle_q8              // slide the 8-register window q0..q7 down by one register
+    vmov q0, q1                   // the old q0 (oldest row) is dropped
+    vmov q1, q2
+    vmov q2, q3
+    vmov q3, q4
+    vmov q4, q5
+    vmov q5, q6
+    vmov q6, q7                   // q7 is left unchanged; callers refill it with the next filtered row
+.endm
+
+.macro vextin8                              // load one 16-byte row and build 8 byte-shifted copies in d16..d23
+        pld       [r2]
+        vld1.8    {q11}, [r2], r3           // q11 = d22:d23 = 16 bytes at r2; then r2 += r3
+        vext.8    d16, d22, d23, #1         // d16 = bytes 1..8 (byte 0 of the load is never used)
+        vext.8    d17, d22, d23, #2         // d17 = bytes 2..9
+        vext.8    d18, d22, d23, #3
+        vext.8    d19, d22, d23, #4
+        vext.8    d20, d22, d23, #5
+        vext.8    d21, d22, d23, #6
+        vext.8    d22, d22, d23, #7         // d22 = bytes 7..14; d23 keeps bytes 8..15 as loaded
+.endm
+
+.macro loadin8                              // load 8 consecutive rows of 8 bytes into d16..d23
+        pld       [r2]
+        vld1.8    {d16}, [r2], r3           // each load post-increments r2 by r3 (one row)
+        pld       [r2]
+        vld1.8    {d17}, [r2], r3
+        pld       [r2]
+        vld1.8    {d18}, [r2], r3
+        pld       [r2]
+        vld1.8    {d19}, [r2], r3
+        pld       [r2]
+        vld1.8    {d20}, [r2], r3
+        pld       [r2]
+        vld1.8    {d21}, [r2], r3
+        pld       [r2]
+        vld1.8    {d22}, [r2], r3
+        pld       [r2]
+        vld1.8    {d23}, [r2], r3           // on exit r2 points at the row after the 8 loaded ones
+.endm
+
+.macro qpel_filter_1_32b          // in: q0..q6 = rows a..g (s16, q7 unused); out: q8; clobbers q9-q15
+        vmov.i16   d16, #58       // suffix 0/1 in the comments = low/high d half of each q row
+        vmov.i16   d17, #10
+        vmull.s16   q9, d6, d16   // 58 * d0
+        vmull.s16  q10, d7, d16   // 58 * d1
+        vmov.i16   d16, #17
+        vmull.s16  q11, d4, d17   // 10 * c0
+        vmull.s16  q12, d5, d17   // 10 * c1
+        vmov.i16   d17, #5
+        vmull.s16  q13, d8, d16   // 17 * e0
+        vmull.s16  q14, d9, d16   // 17 * e1
+        vmull.s16  q15, d10, d17  //  5 * f0
+        vmull.s16   q8, d11, d17  //  5 * f1
+        vsub.s32    q9, q11       // 58 * d0 - 10 * c0
+        vsub.s32   q10, q12       // 58 * d1 - 10 * c1
+        vshll.s16  q11, d2, #2    // 4 * b0
+        vshll.s16  q12, d3, #2    // 4 * b1
+        vadd.s32    q9, q13       // 58 * d0 - 10 * c0 + 17 * e0
+        vadd.s32   q10, q14       // 58 * d1 - 10 * c1 + 17 * e1
+        vsubl.s16  q13, d12, d0   // g0 - a0
+        vsubl.s16  q14, d13, d1   // g1 - a1
+        vadd.s32    q9, q11       // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0
+        vadd.s32   q10, q12       // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1
+        vsub.s32   q13, q15       // g0 - a0 - 5 * f0
+        vsub.s32   q14, q8        // g1 - a1 - 5 * f1
+        vadd.s32    q9, q13       // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0
+        vadd.s32   q10, q14       // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1
+        vqshrn.s32  d16, q9, #6   // q8 low half  = sum >> 6, saturated to s16
+        vqshrn.s32  d17, q10, #6  // q8 high half = sum >> 6, saturated to s16
+.endm
+
+// input  q0 - q7 = rows a..h (s16); suffix 0/1 below = low/high d half of each row
+// output q8 = (-a + 4b - 11c + 40d + 40e - 11f + 4g - h) >> 6, saturated to s16; clobbers q9-q15
+.macro qpel_filter_2_32b
+        vmov.i32   q8, #11
+        vaddl.s16   q9, d6, d8   // d0 + e0
+        vaddl.s16  q10, d7, d9   // d1 + e1
+        vaddl.s16  q11, d4, d10  // c0 + f0
+        vaddl.s16  q12, d5, d11  // c1 + f1
+        vmul.s32   q11, q8       // 11 * (c0 + f0)
+        vmul.s32   q12, q8       // 11 * (c1 + f1)
+        vmov.i32   q8, #40
+        vaddl.s16  q15, d2, d12  // b0 + g0
+        vmul.s32    q9, q8       // 40 * (d0 + e0)
+        vmul.s32   q10, q8       // 40 * (d1 + e1)
+        vaddl.s16   q8, d3, d13  // b1 + g1
+        vaddl.s16  q13, d0, d14  // a0 + h0
+        vaddl.s16  q14, d1, d15  // a1 + h1
+        vshl.s32   q15, #2       // 4*(b0+g0)
+        vshl.s32    q8, #2       // 4*(b1+g1)
+        vadd.s32   q11, q13      // 11 * (c0 + f0) + a0 + h0
+        vadd.s32   q12, q14      // 11 * (c1 + f1) + a1 + h1
+        vadd.s32   q9, q15       // 40 * (d0 + e0) + 4*(b0+g0)
+        vadd.s32   q10, q8       // 40 * (d1 + e1) + 4*(b1+g1)
+        vsub.s32   q9, q11       // 40 * (d0 + e0) + 4*(b0+g0) - (11 * (c0 + f0) + a0 + h0)
+        vsub.s32   q10, q12      // 40 * (d1 + e1) + 4*(b1+g1) - (11 * (c1 + f1) + a1 + h1)
+        vqshrn.s32  d16, q9, #6  // q8 low half  = sum >> 6, saturated to s16
+        vqshrn.s32  d17, q10, #6 // q8 high half = sum >> 6, saturated to s16
+.endm
+
+.macro qpel_filter_3_32b          // in: q1..q7 = rows b..h (s16, q0 unused); out: q8; clobbers q9-q15
+        vmov.i16   d16, #58       // mirror image of qpel_filter_1_32b: taps run b..h instead of g..a
+        vmov.i16   d17, #10
+        vmull.s16   q9, d8, d16   // 58 * e0
+        vmull.s16  q10, d9, d16   // 58 * e1
+        vmov.i16   d16, #17
+        vmull.s16  q11, d10, d17  // 10 * f0
+        vmull.s16  q12, d11, d17  // 10 * f1
+        vmov.i16   d17, #5
+        vmull.s16  q13, d6, d16   // 17 * d0
+        vmull.s16  q14, d7, d16   // 17 * d1
+        vmull.s16  q15, d4, d17   //  5 * c0
+        vmull.s16   q8, d5, d17   //  5 * c1
+        vsub.s32    q9, q11       // 58 * e0 - 10 * f0
+        vsub.s32   q10, q12       // 58 * e1 - 10 * f1
+        vshll.s16  q11, d12, #2   // 4 * g0
+        vshll.s16  q12, d13, #2   // 4 * g1
+        vadd.s32    q9, q13       // 58 * e0 - 10 * f0 + 17 * d0
+        vadd.s32   q10, q14       // 58 * e1 - 10 * f1 + 17 * d1
+        vsubl.s16  q13, d2, d14   // b0 - h0
+        vsubl.s16  q14, d3, d15   // b1 - h1
+        vadd.s32    q9, q11       // 58 * e0 - 10 * f0 + 17 * d0 + 4 * g0
+        vadd.s32   q10, q12       // 58 * e1 - 10 * f1 + 17 * d1 + 4 * g1
+        vsub.s32   q13, q15       // b0 - h0 - 5 * c0
+        vsub.s32   q14, q8        // b1 - h1 - 5 * c1
+        vadd.s32    q9, q13       // 58 * e0 - 10 * f0 + 17 * d0 + 4 * g0 + b0 - h0 - 5 * c0
+        vadd.s32   q10, q14       // 58 * e1 - 10 * f1 + 17 * d1 + 4 * g1 + b1 - h1 - 5 * c1
+        vqshrn.s32  d16, q9, #6   // q8 low half  = sum >> 6, saturated to s16
+        vqshrn.s32  d17, q10, #6  // q8 high half = sum >> 6, saturated to s16
+.endm
+
+.macro qpel_filter_1 out=q7
+        vmov.u8    d24, #58       // in: d16..d22 = a..g (u8, d23 unused); result in \out (16-bit); clobbers q12-q15
+        vmov.u8    d25, #10
+        vshll.u8   q13, d20, #4   // 16*e
+        vshll.u8   q14, d21, #2   // 4*f
+        vmull.u8  \out, d19, d24  // 58*d
+        vaddw.u8   q13, q13, d20  // 17*e
+        vmull.u8   q15, d18, d25  // 10*c
+        vaddw.u8   q14, q14, d21  // 5*f
+        vsubl.u8   q12, d22, d16  // g - a
+        vadd.u16  \out, q13       // 58d + 17e
+        vshll.u8   q13, d17, #2   // 4*b
+        vadd.u16   q15, q14       // 10*c + 5*f
+        vadd.s16   q13, q12       // - a + 4*b + g
+        vsub.s16  \out, q15       // -10*c + 58*d + 17*e -5*f
+        vadd.s16  \out, q13       // -a + 4*b -10*c + 58*d + 17*e -5*f + g (no shift applied here)
+.endm
+
+.macro qpel_filter_2 out=q7
+        vmov.i16   q12, #10        // in: d16..d23 = a..h (u8); result in \out (16-bit); clobbers q12-q15
+        vmov.i16   q14, #11
+        vaddl.u8   q13, d19, d20   // d + e
+        vaddl.u8   q15, d18, d21   // c + f
+        vmul.u16   q13, q12        // 10 * (d+e)
+        vmul.u16   q15, q14        // 11 * ( c + f)
+        vaddl.u8  \out, d17, d22   // b + g
+        vaddl.u8   q12, d16, d23   // a + h
+        vadd.u16  \out, q13        // b + 10 * (d + e) + g
+        vadd.s16   q12, q15        // a + h + 11 * (c + f)
+        vshl.u16  \out, #2         // 4 * (b + 10 * (d + e) + g)
+        vsub.s16  \out, q12        // 4*(b + g) + 40*(d + e) - 11*(c + f) - (a + h) (no shift applied here)
+.endm
+
+.macro qpel_filter_3 out=q7
+        vmov.u8    d24, #58         // in: d17..d23 = b..h (u8, d16 unused); result in \out (16-bit); clobbers q12-q15
+        vmov.u8    d25, #10
+        vshll.u8   q13, d19, #4     // 16*d
+        vshll.u8   q14, d18, #2     // 4*c
+        vmull.u8  \out, d20, d24    // 58*e
+        vaddw.u8   q13, q13, d19    // 17*d
+        vmull.u8   q15, d21, d25    // 10*f
+        vaddw.u8   q14, q14, d18    // 5*c
+        vsubl.u8   q12, d17, d23    // b - h
+        vadd.u16  \out, q13         // 58e + 17d
+        vshll.u8   q13, d22, #2     // 4*g
+        vadd.u16   q15, q14         // 10*f + 5*c
+        vadd.s16   q13, q12         // b + 4*g - h
+        vsub.s16  \out, q15         // -5*c + 17*d + 58*e - 10*f
+        vadd.s16  \out, q13         // b - 5*c + 17*d + 58*e - 10*f + 4*g - h (no shift applied here)
+.endm
+
+.macro  hevc_put_qpel_vX_neon_8 filter     // vertical-only pass: r0 = dst (16-bit), r1 = dst stride, r2 = src, r3 = src stride
+        push   {r4, r5, r6, r7}
+        ldr    r4, [sp, #16] // height
+        ldr    r5, [sp, #20] // width
+        vpush {d8-d15}
+        sub       r2, r2, r3, lsl #1       // src -= 2 * srcstride
+        sub       r2, r3                   // src -= srcstride: start 3 rows above the block
+        mov       r12, r4                  // keep height for the following column strips
+        mov       r6, r0                   // r6 = dst base of the current strip
+        mov       r7, r2                   // r7 = src base of the current strip
+        lsl       r1, #1                   // dst stride doubled: dst holds 16-bit samples
+0:      loadin8                            // prime d16..d23 with the first 8 rows of this strip
+        cmp       r5, #4
+        beq       4f                       // exactly 4 columns left: narrow path
+8:      subs r4, #1                        // 8-column path, one output row per iteration
+        \filter
+        vst1.16    {q7}, [r0], r1
+        regshuffle_d8
+        vld1.8    {d23}, [r2], r3          // NOTE(review): also runs after the last output row, reading one row past the 8-tap window - confirm padding
+        bne 8b                             // tests the flags from the subs at the loop head
+        subs  r5, #8                       // 8 columns finished
+        beq       99f
+        mov r4, r12                        // restart the row counter
+        add r6, #16                        // next strip: 8 samples * 2 bytes in dst
+        mov r0, r6
+        add r7, #8                         // next strip: 8 bytes in src
+        mov r2, r7
+        b     0b
+4:      subs r4, #1                        // 4-column path: same filter, only the low half of q7 is stored
+        \filter
+        vst1.16    d14, [r0], r1
+        regshuffle_d8
+        vld1.32    {d23[0]}, [r2], r3      // only 4 bytes of the next row are fetched
+        bne 4b
+99:     vpop {d8-d15}
+        pop {r4, r5, r6, r7}
+        bx lr
+.endm
+
+.macro  hevc_put_qpel_uw_vX_neon_8 filter  // vertical-only pass with 8-bit output: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride
+        push   {r4-r10}
+        ldr    r5, [sp, #28] // width
+        ldr    r4, [sp, #32] // height
+        ldr    r8, [sp, #36] // src2
+        ldr    r9, [sp, #40] // src2stride
+        vpush {d8-d15}
+        sub       r2, r2, r3, lsl #1       // src -= 2 * srcstride
+        sub       r2, r3                   // src -= srcstride: start 3 rows above the block
+        mov       r12, r4                  // keep height for the following column strips
+        mov       r6, r0                   // r6 = dst base of the current strip
+        mov       r7, r2                   // r7 = src base of the current strip
+        cmp       r8, #0
+        bne       .Lbi\@                   // non-NULL src2: take the path that adds src2
+0:      loadin8                            // src2 == NULL path: prime d16..d23 with 8 rows
+        cmp       r5, #4
+        beq       4f
+8:      subs r4, #1
+        \filter
+        vqrshrun.s16   d0, q7, #6          // round, shift by 6, saturate to u8
+        vst1.8    d0, [r0], r1
+        regshuffle_d8
+        vld1.8    {d23}, [r2], r3          // next row; flags from the subs above are still valid for the bne
+        bne 8b
+        subs  r5, #8                       // 8 columns finished
+        beq       99f
+        mov r4, r12
+        add r6, #8                         // next strip: 8 bytes in dst
+        mov r0, r6
+        add r7, #8                         // next strip: 8 bytes in src
+        mov r2, r7
+        b     0b
+4:      subs r4, #1                        // 4-column path
+        \filter
+        vqrshrun.s16   d0, q7, #6
+        vst1.32    d0[0], [r0], r1         // store only 4 output bytes
+        regshuffle_d8
+        vld1.32    {d23[0]}, [r2], r3
+        bne 4b
+        b   99f
+.Lbi\@: lsl       r9, #1                   // src2 stride doubled: src2 holds 16-bit samples
+        mov       r10, r8                  // r10 = src2 base of the current strip
+0:      loadin8
+        cmp       r5, #4
+        beq       4f
+8:      subs r4, #1
+        \filter
+        vld1.16        {q0}, [r8], r9      // 8 samples of src2, then advance one src2 row
+        vqadd.s16      q0, q7              // saturating add of src2 and the filtered row
+        vqrshrun.s16   d0, q0, #7          // round, shift by 7, saturate to u8
+        vst1.8         d0, [r0], r1
+        regshuffle_d8
+        vld1.8    {d23}, [r2], r3
+        bne 8b
+        subs  r5, #8
+        beq       99f
+        mov r4, r12
+        add r6, #8
+        mov r0, r6
+        add r10, #16                       // next strip: 8 samples * 2 bytes in src2
+        mov r8, r10
+        add r7, #8
+        mov r2, r7
+        b     0b
+4:      subs r4, #1
+        \filter
+        vld1.16      d0, [r8], r9          // 4 samples of src2
+        vqadd.s16    d0, d14
+        vqrshrun.s16 d0, q0, #7            // d1 is not meaningful here; only the low 4 result bytes are stored
+        vst1.32      d0[0], [r0], r1
+        regshuffle_d8
+        vld1.32    {d23[0]}, [r2], r3
+        bne 4b
+99:     vpop {d8-d15}
+        pop {r4-r10}
+        bx lr
+.endm
+
+function ff_hevc_put_qpel_v1_neon_8, export=1      // vertical pass, 16-bit output, qpel_filter_1
+        hevc_put_qpel_vX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_v2_neon_8, export=1      // vertical pass, 16-bit output, qpel_filter_2
+        hevc_put_qpel_vX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_v3_neon_8, export=1      // vertical pass, 16-bit output, qpel_filter_3
+        hevc_put_qpel_vX_neon_8 qpel_filter_3
+endfunc
+
+
+function ff_hevc_put_qpel_uw_v1_neon_8, export=1   // vertical pass, 8-bit output (optional src2), qpel_filter_1
+        hevc_put_qpel_uw_vX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_uw_v2_neon_8, export=1   // vertical pass, 8-bit output (optional src2), qpel_filter_2
+        hevc_put_qpel_uw_vX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_uw_v3_neon_8, export=1   // vertical pass, 8-bit output (optional src2), qpel_filter_3
+        hevc_put_qpel_uw_vX_neon_8 qpel_filter_3
+endfunc
+
+.macro hevc_put_qpel_hX_neon_8 filter      // horizontal-only pass: r0 = dst (16-bit), r1 = dst stride, r2 = src, r3 = src stride
+        push     {r4, r5, r6, r7}
+        ldr    r4, [sp, #16] // height
+        ldr    r5, [sp, #20] // width
+
+        vpush    {d8-d15}
+        sub       r2, #4                   // vextin8 skips byte 0 of each load, so the first tap is at src - 3
+        lsl       r1, #1                   // dst stride doubled: dst holds 16-bit samples
+        mov      r12, r4                   // keep height for the following column strips
+        mov       r6, r0                   // r6 = dst base of the current strip
+        mov       r7, r2                   // r7 = src base of the current strip
+        cmp       r5, #4
+        beq       4f                       // exactly 4 columns: narrow path
+8:      subs      r4, #1                   // 8-column path, one row per iteration
+        vextin8
+        \filter
+        vst1.16   {q7}, [r0], r1
+        bne       8b                       // tests the flags from the subs at the loop head
+        subs      r5, #8                   // 8 columns finished
+        beq      99f
+        mov       r4, r12                  // restart the row counter
+        add       r6, #16                  // next strip: 8 samples * 2 bytes in dst
+        mov       r0, r6
+        add       r7, #8                   // next strip: 8 bytes in src
+        mov       r2, r7
+        cmp       r5, #4
+        bne       8b                       // more than 4 columns left: another 8-wide strip
+4:      subs      r4, #1                   // 4-column path: still filters 8 lanes, stores the low half of q7
+        vextin8
+        \filter
+        vst1.16  d14, [r0], r1
+        bne       4b
+99:     vpop     {d8-d15}
+        pop      {r4, r5, r6, r7}
+        bx lr
+.endm
+
+.macro hevc_put_qpel_uw_hX_neon_8 filter   // horizontal-only pass with 8-bit output: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride
+        push     {r4-r10}
+        ldr       r5, [sp, #28] // width
+        ldr       r4, [sp, #32] // height
+        ldr       r8, [sp, #36] // src2
+        ldr       r9, [sp, #40] // src2stride
+        vpush    {d8-d15}
+        sub       r2, #4                   // vextin8 skips byte 0 of each load, so the first tap is at src - 3
+        mov      r12, r4                   // keep height for the following column strips
+        mov       r6, r0                   // r6 = dst base of the current strip
+        mov       r7, r2                   // r7 = src base of the current strip
+        cmp       r8, #0
+        bne       .Lbi\@                   // non-NULL src2: take the path that adds src2
+        cmp       r5, #4
+        beq       4f
+8:      subs      r4, #1                   // src2 == NULL path, 8 columns per row
+        vextin8
+        \filter
+        vqrshrun.s16   d0, q7, #6          // round, shift by 6, saturate to u8
+        vst1.8    d0, [r0], r1
+        bne       8b
+        subs      r5, #8                   // 8 columns finished
+        beq      99f
+        mov       r4, r12
+        add       r6, #8                   // next strip: 8 bytes in dst
+        mov       r0, r6
+        add       r7, #8                   // next strip: 8 bytes in src
+        mov       r2, r7
+        cmp       r5, #4
+        bne       8b
+4:      subs      r4, #1                   // 4-column path
+        vextin8
+        \filter
+        vqrshrun.s16   d0, q7, #6
+        vst1.32  d0[0], [r0], r1           // store only 4 output bytes
+        bne       4b
+        b         99f
+.Lbi\@:
+        lsl       r9, #1                   // src2 stride doubled: src2 holds 16-bit samples
+        cmp       r5, #4
+        beq       4f                       // r10 is not needed when the block is only 4 wide
+        mov       r10, r8                  // r10 = src2 base of the current strip
+8:      subs      r4, #1
+        vextin8
+        \filter
+        vld1.16        {q0}, [r8], r9      // 8 samples of src2, then advance one src2 row
+        vqadd.s16      q0, q7              // saturating add of src2 and the filtered row
+        vqrshrun.s16   d0, q0, #7          // round, shift by 7, saturate to u8
+        vst1.8         d0, [r0], r1
+        bne       8b
+        subs      r5, #8
+        beq      99f
+        mov       r4, r12
+        add       r6, #8
+        add       r10, #16                 // next strip: 8 samples * 2 bytes in src2
+        mov       r8, r10
+        mov       r0, r6
+        add       r7, #8
+        mov       r2, r7
+        cmp       r5, #4
+        bne       8b
+4:      subs      r4, #1
+        vextin8
+        \filter
+        vld1.16      d0, [r8], r9          // 4 samples of src2
+        vqadd.s16    d0, d14
+        vqrshrun.s16 d0, q0, #7            // d1 is not meaningful here; only the low 4 result bytes are stored
+        vst1.32      d0[0], [r0], r1
+        bne       4b
+99:     vpop     {d8-d15}
+        pop      {r4-r10}
+        bx lr
+.endm
+
+function ff_hevc_put_qpel_h1_neon_8, export=1      // horizontal pass, 16-bit output, qpel_filter_1
+        hevc_put_qpel_hX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_h2_neon_8, export=1      // horizontal pass, 16-bit output, qpel_filter_2
+        hevc_put_qpel_hX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_h3_neon_8, export=1      // horizontal pass, 16-bit output, qpel_filter_3
+        hevc_put_qpel_hX_neon_8 qpel_filter_3
+endfunc
+
+
+function ff_hevc_put_qpel_uw_h1_neon_8, export=1   // horizontal pass, 8-bit output (optional src2), qpel_filter_1
+        hevc_put_qpel_uw_hX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_uw_h2_neon_8, export=1   // horizontal pass, 8-bit output (optional src2), qpel_filter_2
+        hevc_put_qpel_uw_hX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_uw_h3_neon_8, export=1   // horizontal pass, 8-bit output (optional src2), qpel_filter_3
+        hevc_put_qpel_uw_hX_neon_8 qpel_filter_3
+endfunc
+
+.macro hevc_put_qpel_hXvY_neon_8 filterh filterv   // horizontal then vertical pass: r0 = dst (16-bit), r1 = dst stride, r2 = src, r3 = src stride
+        push   {r4, r5, r6, r7}
+        ldr    r4, [sp, #16] // height
+        ldr    r5, [sp, #20] // width
+
+        vpush {d8-d15}
+        sub       r2, #4                   // vextin8 skips byte 0 of each load, so the first tap is at src - 3
+        sub       r2, r2, r3, lsl #1
+        sub       r2, r3  // extra_before 3
+        lsl       r1, #1                   // dst stride doubled: dst holds 16-bit samples
+        mov       r12, r4                  // keep height for the following column strips
+        mov       r6, r0                   // r6 = dst base of the current strip
+        mov       r7, r2                   // r7 = src base of the current strip
+0:      vextin8                            // prime q0..q7 with 8 horizontally filtered rows
+        \filterh q0
+        vextin8
+        \filterh q1
+        vextin8
+        \filterh q2
+        vextin8
+        \filterh q3
+        vextin8
+        \filterh q4
+        vextin8
+        \filterh q5
+        vextin8
+        \filterh q6
+        vextin8
+        \filterh q7
+        cmp r5, #4
+        beq 4f
+8:      subs  r4, #1                       // 8-column path, one output row per iteration
+        \filterv
+        vst1.16    {q8}, [r0], r1
+        regshuffle_q8
+        vextin8                            // NOTE(review): also runs after the last output row, reading one row past the window - confirm padding
+        \filterh q7
+        bne 8b                             // tests the flags from the subs at the loop head
+        subs  r5, #8                       // 8 columns finished
+        beq 99f
+        mov r4, r12
+        add r6, #16                        // next strip: 8 samples * 2 bytes in dst
+        mov r0, r6
+        add r7, #8                         // next strip: 8 bytes in src
+        mov r2, r7
+        b 0b
+4:      subs  r4, #1                       // 4-column path: only the low half of q8 is stored
+        \filterv
+        vst1.16    d16, [r0], r1
+        regshuffle_q8
+        vextin8
+        \filterh q7
+        bne 4b
+99:     vpop {d8-d15}
+        pop {r4, r5, r6, r7}
+        bx lr
+.endm
+
+.macro hevc_put_qpel_uw_hXvY_neon_8 filterh filterv   // horizontal then vertical pass with 8-bit output: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride
+        push     {r4-r10}
+        ldr       r5, [sp, #28] // width
+        ldr       r4, [sp, #32] // height
+        ldr       r8, [sp, #36] // src2
+        ldr       r9, [sp, #40] // src2stride
+        vpush {d8-d15}
+        sub       r2, #4                   // vextin8 skips byte 0 of each load, so the first tap is at src - 3
+        sub       r2, r2, r3, lsl #1
+        sub       r2, r3  // extra_before 3
+        mov       r12, r4                  // keep height for the following column strips
+        mov       r6, r0                   // r6 = dst base of the current strip
+        mov       r7, r2                   // r7 = src base of the current strip
+        cmp       r8, #0
+        bne       .Lbi\@                   // non-NULL src2: take the path that adds src2
+0:      vextin8                            // src2 == NULL path: prime q0..q7 with 8 horizontally filtered rows
+        \filterh q0
+        vextin8
+        \filterh q1
+        vextin8
+        \filterh q2
+        vextin8
+        \filterh q3
+        vextin8
+        \filterh q4
+        vextin8
+        \filterh q5
+        vextin8
+        \filterh q6
+        vextin8
+        \filterh q7
+        cmp r5, #4
+        beq 4f
+8:      subs  r4, #1
+        \filterv
+        vqrshrun.s16   d0, q8, #6          // round, shift by 6, saturate to u8; d0 is free because regshuffle_q8 overwrites q0 next
+        vst1.8    d0, [r0], r1
+        regshuffle_q8
+        vextin8
+        \filterh q7
+        bne 8b                             // tests the flags from the subs at the loop head
+        subs  r5, #8                       // 8 columns finished
+        beq 99f
+        mov r4, r12
+        add r6, #8                         // next strip: 8 bytes in dst
+        mov r0, r6
+        add r7, #8                         // next strip: 8 bytes in src
+        mov r2, r7
+        b 0b
+4:      subs  r4, #1                       // 4-column path
+        \filterv
+        vqrshrun.s16   d0, q8, #6
+        vst1.32        d0[0], [r0], r1     // store only 4 output bytes
+        regshuffle_q8
+        vextin8
+        \filterh q7
+        bne 4b
+        b   99f
+.Lbi\@: lsl      r9, #1                    // src2 stride doubled: src2 holds 16-bit samples
+        mov      r10, r8                   // r10 = src2 base of the current strip
+0:      vextin8
+        \filterh q0
+        vextin8
+        \filterh q1
+        vextin8
+        \filterh q2
+        vextin8
+        \filterh q3
+        vextin8
+        \filterh q4
+        vextin8
+        \filterh q5
+        vextin8
+        \filterh q6
+        vextin8
+        \filterh q7
+        cmp r5, #4
+        beq 4f
+8:      subs  r4, #1
+        \filterv
+        vld1.16        {q0}, [r8], r9      // 8 samples of src2; q0 is free because regshuffle_q8 overwrites it next
+        vqadd.s16      q0, q8              // saturating add of src2 and the filtered row
+        vqrshrun.s16   d0, q0, #7          // round, shift by 7, saturate to u8
+        vst1.8         d0, [r0], r1
+        regshuffle_q8
+        vextin8
+        \filterh q7
+        bne 8b
+        subs  r5, #8
+        beq 99f
+        mov r4, r12
+        add r6, #8
+        mov r0, r6
+        add r10, #16                       // next strip: 8 samples * 2 bytes in src2
+        mov r8, r10
+        add r7, #8
+        mov r2, r7
+        b 0b
+4:      subs  r4, #1
+        \filterv
+        vld1.16      d0, [r8], r9          // 4 samples of src2
+        vqadd.s16    d0, d16
+        vqrshrun.s16 d0, q0, #7            // only the low 4 result bytes are stored
+        vst1.32      d0[0], [r0], r1
+        regshuffle_q8
+        vextin8
+        \filterh q7
+        bne 4b
+99:     vpop {d8-d15}
+        pop {r4-r10}
+        bx lr
+.endm
+
+
+function ff_hevc_put_qpel_h1v1_neon_8, export=1 // hXvY family: each entry point only expands hevc_put_qpel_hXvY_neon_8 with a filter pair
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b // 1st arg index follows the h number, 2nd (_32b) arg index follows the v number
+endfunc
+
+function ff_hevc_put_qpel_h2v1_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b // h index 2, v index 1
+endfunc
+
+function ff_hevc_put_qpel_h3v1_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b // h index 3, v index 1
+endfunc
+
+function ff_hevc_put_qpel_h1v2_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b // h index 1, v index 2
+endfunc
+
+function ff_hevc_put_qpel_h2v2_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b // h index 2, v index 2
+endfunc
+
+function ff_hevc_put_qpel_h3v2_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b // h index 3, v index 2
+endfunc
+
+function ff_hevc_put_qpel_h1v3_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b // h index 1, v index 3
+endfunc
+
+function ff_hevc_put_qpel_h2v3_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b // h index 2, v index 3
+endfunc
+
+function ff_hevc_put_qpel_h3v3_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b // h index 3, v index 3
+endfunc
+
+
+function ff_hevc_put_qpel_uw_h1v1_neon_8, export=1 // uw hXvY family: each entry point only expands hevc_put_qpel_uw_hXvY_neon_8 with a filter pair
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b // 1st arg index follows the h number, 2nd (_32b) arg index follows the v number; presumably filterh / filterv of the macro - confirm against its header
+endfunc
+
+function ff_hevc_put_qpel_uw_h2v1_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b // h index 2, v index 1
+endfunc
+
+function ff_hevc_put_qpel_uw_h3v1_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b // h index 3, v index 1
+endfunc
+
+function ff_hevc_put_qpel_uw_h1v2_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b // h index 1, v index 2
+endfunc
+
+function ff_hevc_put_qpel_uw_h2v2_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b // h index 2, v index 2
+endfunc
+
+function ff_hevc_put_qpel_uw_h3v2_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b // h index 3, v index 2
+endfunc
+
+function ff_hevc_put_qpel_uw_h1v3_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b // h index 1, v index 3
+endfunc
+
+function ff_hevc_put_qpel_uw_h2v3_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b // h index 2, v index 3
+endfunc
+
+function ff_hevc_put_qpel_uw_h3v3_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b // h index 3, v index 3
+endfunc
+
+.macro init_put_pixels // shared prologue expanded at the top of every ff_hevc_put_pixels_w* function
+        pld    [r1] // prefetch at r1 (the functions using this macro read their source rows through r1)
+        pld    [r1, r2] // prefetch at r1 + r2 (r2 is the per-row source step in those functions)
+        mov    r12, MAX_PB_SIZE // MAX_PB_SIZE is defined outside this chunk
+        lsl    r12, #1 // r12 = MAX_PB_SIZE * 2; the users advance the destination pointer r0 by r12 bytes per row
+.endm
+
+function ff_hevc_put_pixels_w2_neon_8, export=1 // per row: widen 2 source bytes to 16 bit (<< 6) and merge them into dst; r0 = dst, r1 = src, r2 = src step, r3 = row count
+        init_put_pixels // prefetch and set r12 = MAX_PB_SIZE * 2 (dst step in bytes)
+        vmov.u8      d5, #255 // d5 = all ones
+        vshr.u64     d5, #32 // logical shift leaves only the low 32 bits set: bit-select mask covering two 16-bit outputs
+0:      subs r3, #1 // one row per iteration; the flags set here are consumed by the bne at the end
+        vld1.32     {d0[0]}, [r1], r2 // load 4 source bytes into lane 0 (only the first 2 end up stored), then r1 += r2
+        pld [r1] // prefetch the next source row
+        vld1.32     d6, [r0] // read the 8 bytes currently at dst
+        vshll.u8    q0, d0, #6 // zero-extend each byte to 16 bit and shift left by 6
+        vbit        d6, d0, d5 // replace only the masked low 32 bits of d6 with the new samples
+        vst1.32     d6, [r0], r12 // write all 8 bytes back (upper 4 are the values just read), then r0 += r12
+        bne 0b // repeat until r3 reaches zero
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w4_neon_8, export=1 // per row: widen 4 source bytes to 16 bit (<< 6); r0 = dst, r1 = src, r2 = src step, r3 = row count
+        init_put_pixels // prefetch and set r12 = MAX_PB_SIZE * 2 (dst step in bytes)
+0:      subs r3, #2 // two rows per iteration; loop ends only when r3 hits exactly zero, so an even row count is assumed
+        vld1.32   {d0[0]}, [r1], r2 // row 0: 4 bytes into lane 0, r1 += r2
+        vld1.32   {d0[1]}, [r1], r2 // row 1: 4 bytes into lane 1, r1 += r2
+        pld       [r1] // prefetch the next two source rows
+        pld       [r1, r2]
+        vshll.u8   q0, d0, #6 // widen all 8 bytes: d0 = row 0 samples, d1 = row 1 samples
+        vst1.64   {d0}, [r0], r12 // store row 0 (8 bytes), r0 += r12
+        vst1.64   {d1}, [r0], r12 // store row 1 (8 bytes), r0 += r12
+        bne 0b // flags still come from the subs above
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w6_neon_8, export=1 // per row: widen 6 source bytes to 16 bit (<< 6) and merge them into dst; r0 = dst, r1 = src, r2 = src step, r3 = row count
+        init_put_pixels // prefetch and set r12 = MAX_PB_SIZE * 2 (dst step in bytes)
+        vmov.u8      q10, #255 // q10 = all ones
+        vshr.u64     d21, #32 // clear the top 32 bits of the upper half: mask now covers the low 12 bytes (six 16-bit outputs)
+0:      subs r3, #1 // one row per iteration; flags consumed by the bne at the end
+        vld1.16     {d0}, [r1], r2 // load 8 source bytes (only the first 6 end up stored), r1 += r2
+        pld [r1] // prefetch the next source row
+        vshll.u8    q0, d0, #6 // zero-extend to 16 bit and shift left by 6
+        vld1.8      {q12}, [r0] // read the 16 bytes currently at dst
+        vbit        q12, q0, q10 // replace only the masked low 12 bytes with the new samples
+        vst1.8      {q12}, [r0], r12 // write 16 bytes back (last 4 are the values just read), r0 += r12
+        bne 0b // repeat until r3 reaches zero
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w8_neon_8, export=1 // per row: widen 8 source bytes to 16 bit (<< 6); r0 = dst, r1 = src, r2 = src step, r3 = row count
+        init_put_pixels // prefetch and set r12 = MAX_PB_SIZE * 2 (dst step in bytes)
+0:      subs r3, #2 // two rows per iteration; loop ends only when r3 hits exactly zero, so an even row count is assumed
+        vld1.8   {d0}, [r1], r2 // row 0: 8 bytes, r1 += r2
+        vld1.8   {d2}, [r1], r2 // row 1: 8 bytes, r1 += r2
+        pld        [r1] // prefetch the next two source rows
+        pld        [r1, r2]
+        vshll.u8   q0, d0, #6 // row 0 widened (overwrites d0/d1; d2 is untouched)
+        vshll.u8   q1, d2, #6 // row 1 widened
+        vst1.16   {q0}, [r0], r12 // store row 0 (16 bytes), r0 += r12
+        vst1.16   {q1}, [r0], r12 // store row 1 (16 bytes), r0 += r12
+        bne 0b // flags still come from the subs above
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w12_neon_8, export=1 // per row: widen 12 source bytes to 16 bit (<< 6); r0 = dst, r1 = src, r2 = src step, r3 = row count
+        init_put_pixels // prefetch and set r12 = MAX_PB_SIZE * 2 (dst step in bytes)
+0:      subs r3, #2 // two rows per iteration; loop ends only when r3 hits exactly zero, so an even row count is assumed
+        vld1.64    {d0}, [r1] // row 0, bytes 0-7 (no pointer update)
+        add       r1, #8 // step to byte 8 of the row
+        vld1.32   {d1[0]}, [r1], r2 // row 0, bytes 8-11 into lane 0, then r1 += r2
+        sub       r1, #8 // undo the +8: r1 is now the start of row 1
+        vld1.64    {d2}, [r1] // row 1, bytes 0-7
+        add       r1, #8
+        vld1.32   {d1[1]}, [r1], r2 // row 1, bytes 8-11 into lane 1, then r1 += r2
+        sub       r1, #8 // r1 is now the start of the row after these two
+        pld       [r1] // prefetch the next two source rows
+        pld       [r1, r2]
+        vshll.u8  q8, d0, #6 // row 0 samples 0-7
+        vshll.u8  q9, d1, #6 // d18 = row 0 samples 8-11, d19 = row 1 samples 8-11
+        vshll.u8  q10, d2, #6 // row 1 samples 0-7
+        vmov      d22, d19 // move row 1 tail next to d20/d21 so each row is a consecutive register triple
+        vst1.64   {d16, d17, d18}, [r0], r12 // store row 0 (24 bytes), r0 += r12
+        vst1.64   {d20, d21, d22}, [r0], r12 // store row 1 (24 bytes), r0 += r12
+        bne 0b // flags still come from the subs above; add/sub without s suffix do not change them
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w16_neon_8, export=1 // per row: widen 16 source bytes to 16 bit (<< 6); r0 = dst, r1 = src, r2 = src step, r3 = row count
+        init_put_pixels // prefetch and set r12 = MAX_PB_SIZE * 2 (dst step in bytes)
+0:      subs r3, #2 // two rows per iteration; loop ends only when r3 hits exactly zero, so an even row count is assumed
+        vld1.8   {q0}, [r1], r2 // row 0: 16 bytes, r1 += r2
+        vld1.8   {q1}, [r1], r2 // row 1: 16 bytes, r1 += r2
+        pld       [r1] // prefetch the next two source rows
+        pld       [r1, r2]
+        vshll.u8  q8, d0, #6 // row 0 samples 0-7
+        vshll.u8  q9, d1, #6 // row 0 samples 8-15
+        vshll.u8  q10, d2, #6 // row 1 samples 0-7
+        vshll.u8  q11, d3, #6 // row 1 samples 8-15
+        vst1.8    {q8, q9}, [r0], r12 // store row 0 (32 bytes), r0 += r12
+        vst1.8    {q10, q11}, [r0], r12 // store row 1 (32 bytes), r0 += r12
+        bne 0b // flags still come from the subs above
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w24_neon_8, export=1 // per row: widen 24 source bytes to 16 bit (<< 6); r0 = dst, r1 = src, r2 = src step, r3 = row count
+        init_put_pixels // prefetch and set r12 = MAX_PB_SIZE * 2 (dst step in bytes)
+0:      subs r3, #1 // one row per iteration; flags consumed by the bne at the end
+        vld1.8   {d0, d1, d2}, [r1], r2 // load 24 source bytes, r1 += r2
+        pld       [r1] // prefetch the next source row
+        vshll.u8  q10, d0, #6 // samples 0-7
+        vshll.u8  q11, d1, #6 // samples 8-15
+        vshll.u8  q12, d2, #6 // samples 16-23
+        vstm     r0, {q10, q11, q12} // store 48 bytes; no writeback, r0 is advanced separately
+        add      r0, r12 // next dst row; plain add leaves the flags from subs intact
+        bne 0b // repeat until r3 reaches zero
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w32_neon_8, export=1 // per row: widen 32 source bytes to 16 bit (<< 6); r0 = dst, r1 = src, r2 = src step, r3 = row count
+        init_put_pixels // prefetch and set r12 = MAX_PB_SIZE * 2 (dst step in bytes)
+0:      subs r3, #1 // one row per iteration; flags consumed by the bne at the end
+        vld1.8 {q0, q1}, [r1], r2 // load 32 source bytes, r1 += r2
+        pld       [r1] // prefetch the next source row
+        vshll.u8  q8, d0, #6 // samples 0-7
+        vshll.u8  q9, d1, #6 // samples 8-15
+        vshll.u8  q10, d2, #6 // samples 16-23
+        vshll.u8  q11, d3, #6 // samples 24-31
+        vstm    r0, {q8, q9, q10, q11} // store 64 bytes; no writeback, r0 is advanced separately
+        add     r0, r12 // next dst row; plain add leaves the flags from subs intact
+        bne 0b // repeat until r3 reaches zero
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w48_neon_8, export=1 // per row: widen 48 source bytes to 16 bit (<< 6); r0 = dst, r1 = src, r2 = src step, r3 = row count
+        init_put_pixels // prefetch and set r12 = MAX_PB_SIZE * 2 (dst step in bytes)
+0:      subs r3, #1 // one row per iteration; flags consumed by the bne at the end
+        vld1.8    {q0, q1}, [r1] // bytes 0-31 of the row (no pointer update)
+        add r1, #32 // step to byte 32 of the row
+        vld1.8    {q2}, [r1], r2 // bytes 32-47, then r1 += r2
+        sub r1, #32 // undo the +32: r1 is now the start of the next row
+        pld       [r1] // prefetch the next source row
+        vshll.u8  q8, d0, #6 // samples 0-7
+        vshll.u8  q9, d1, #6 // samples 8-15
+        vshll.u8  q10, d2, #6 // samples 16-23
+        vshll.u8  q11, d3, #6 // samples 24-31
+        vshll.u8  q12, d4, #6 // samples 32-39
+        vshll.u8  q13, d5, #6 // samples 40-47
+        vstm r0, {q8, q9, q10, q11, q12, q13} // store 96 bytes; no writeback, r0 is advanced separately
+        add  r0, r12 // next dst row; plain add leaves the flags from subs intact
+        bne 0b // repeat until r3 reaches zero
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w64_neon_8, export=1 // per row: widen 64 source bytes to 16 bit (<< 6); r0 = dst, r1 = src, r2 = src step, r3 = row count
+        init_put_pixels // prefetch and set r12 = MAX_PB_SIZE * 2 (dst step in bytes)
+0:      subs r3, #1 // one row per iteration; flags consumed by the bne at the end
+        vld1.8    {q0, q1}, [r1] // bytes 0-31 of the row (no pointer update)
+        add      r1, #32 // step to byte 32 of the row
+        vld1.8    {q2, q3}, [r1], r2 // bytes 32-63, then r1 += r2
+        sub      r1, #32 // undo the +32: r1 is now the start of the next row
+        pld       [r1] // prefetch the next source row
+        vshll.u8  q8, d0, #6 // samples 0-7
+        vshll.u8  q9, d1, #6 // samples 8-15
+        vshll.u8  q10, d2, #6 // samples 16-23
+        vshll.u8  q11, d3, #6 // samples 24-31
+        vshll.u8  q12, d4, #6 // samples 32-39
+        vshll.u8  q13, d5, #6 // samples 40-47
+        vshll.u8  q14, d6, #6 // samples 48-55
+        vshll.u8  q15, d7, #6 // samples 56-63
+        vstm    r0, {q8, q9, q10, q11, q12, q13, q14, q15} // store 128 bytes; no writeback, r0 is advanced separately
+        add r0, r12 // next dst row; plain add leaves the flags from subs intact
+        bne 0b // repeat until r3 reaches zero
+        bx lr
+endfunc
+
+function ff_hevc_put_qpel_uw_pixels_neon_8, export=1 // 8 bytes per row: plain copy when src2 == 0, otherwise combine with the 16-bit src2 row; r0 = dst, r1 = dst step, r2 = src, r3 = src step
+        push   {r4-r9} // 6 registers = 24 bytes, so the stack arguments start at sp + 24
+        ldr    r5, [sp, #24] // width (loaded but not read again in this function: always 8 bytes per row)
+        ldr    r4, [sp, #28] // height
+        ldr    r8, [sp, #32] // src2
+        ldr    r9, [sp, #36] // src2stride; NOTE(review): used unscaled below, while the .Lbi path of the hXvY macro does lsl r9, #1 - confirm callers pass bytes here
+        vpush {d8-d15} // NOTE(review): only d0-d3 are written below, so this save/restore looks unnecessary - confirm before removing
+        cmp    r8, #0 // src2 == NULL selects the copy-only loop
+        bne    2f
+1:      subs r4, #1 // copy loop, one row per iteration
+        vld1.8     {d0}, [r2], r3 // load 8 source bytes, r2 += r3
+        vst1.8      d0, [r0], r1 // store them unchanged, r0 += r1
+        bne 1b // repeat until r4 reaches zero
+        vpop {d8-d15}
+        pop   {r4-r9}
+        bx lr
+2:      subs  r4, #1 // src2 loop, one row per iteration
+        vld1.8         {d0}, [r2], r3 // load 8 source bytes, r2 += r3
+        vld1.16        {q1}, [r8], r9 // load eight 16-bit values from src2, r8 += r9
+        vshll.u8       q0, d0, #6 // widen source to 16 bit and shift left by 6
+        vqadd.s16      q0, q1 // saturating signed add with the src2 row
+        vqrshrun.s16   d0, q0, #7 // rounding shift right by 7, narrowed with unsigned saturation to 8 bit
+        vst1.8      d0, [r0], r1 // store 8 result bytes, r0 += r1
+        bne 2b // repeat until r4 reaches zero
+        vpop {d8-d15}
+        pop   {r4-r9}
+        bx lr
+endfunc
+
+.macro put_qpel_uw_pixels width, regs, regs2, regs3, regs4 // emits a row-copy function for one block width; each regs argument is the register list holding one row
+function ff_hevc_put_qpel_uw_pixels_w\width\()_neon_8, export=1 // r0 = dst, r1 = dst step, r2 = src, r3 = src step
+        ldr    r12, [sp] // height
+1:      subs   r12, #4 // four rows per iteration; loop ends only when r12 hits exactly zero, so height is assumed to be a multiple of 4
+        vld1.32     {\regs}  , [r2], r3 // load rows 0-3, r2 += r3 after each
+        vld1.32     {\regs2} , [r2], r3
+        vld1.32     {\regs3} , [r2], r3
+        vld1.32     {\regs4} , [r2], r3
+        vst1.32     {\regs}  , [r0], r1 // store rows 0-3 unchanged, r0 += r1 after each
+        vst1.32     {\regs2} , [r0], r1
+        vst1.32     {\regs3} , [r0], r1
+        vst1.32     {\regs4} , [r0], r1
+        bne 1b // flags still come from the subs above
+        bx lr
+endfunc
+.endm
+
+.macro put_qpel_uw_pixels_m width, regs, regs2, regs3, regs4 // emits a row-copy function where each row is moved in two pieces: regs+regs2 hold one row, regs3+regs4 the next
+function ff_hevc_put_qpel_uw_pixels_w\width\()_neon_8, export=1 // r0 = dst, r1 = dst step, r2 = src, r3 = src step
+        push   {r4-r5} // 8 bytes pushed, so the first stack argument is now at sp + 8
+        ldr    r12, [sp, #8] // height
+1:      subs r12, #2 // two rows per iteration; loop ends only when r12 hits exactly zero, so an even height is assumed
+        mov      r4, r2 // remember the start of source row 0
+        vld1.32   {\regs} , [r2]! // first piece of row 0; writeback advances r2 past the bytes just read
+        vld1.32   {\regs2} , [r2] // second piece of row 0
+        add      r2, r4, r3 // r2 = start of row 0 + src step = start of row 1
+        mov      r4, r2 // remember the start of source row 1
+        vld1.32   {\regs3} , [r2]! // first piece of row 1
+        vld1.32   {\regs4} , [r2] // second piece of row 1
+        add      r2, r4, r3 // r2 = start of the row after these two
+        mov      r5, r0 // remember the start of dst row 0
+        vst1.32   {\regs} , [r0]! // store first piece of row 0; writeback advances r0
+        vst1.32   {\regs2} , [r0] // store second piece of row 0
+        add      r0, r5, r1 // r0 = start of dst row 1
+        mov      r5, r0 // remember the start of dst row 1
+        vst1.32   {\regs3} , [r0]! // store first piece of row 1
+        vst1.32   {\regs4} , [r0] // store second piece of row 1
+        add      r0, r5, r1 // r0 = start of the dst row after these two
+        bne 1b // flags still come from the subs above; mov/add without s suffix do not change them
+        pop   {r4-r5}
+        bx lr
+endfunc
+.endm
+
+put_qpel_uw_pixels    4, d0[0], d0[1], d1[0], d1[1] // w4: one 32-bit lane (4 bytes) per row
+put_qpel_uw_pixels    8, d0,    d1,    d2,    d3 // w8: 8 bytes per row
+put_qpel_uw_pixels_m 12, d0,    d1[0], d2,    d3[0] // w12: 8 + 4 bytes per row
+put_qpel_uw_pixels   16, q0,    q1,    q2,    q3 // w16: 16 bytes per row
+put_qpel_uw_pixels   24, d0-d2, d3-d5, d16-d18, d19-d21 // w24: 24 bytes per row; register lists skip d8-d15, presumably because the generated functions do not save them
+put_qpel_uw_pixels   32, q0-q1, q2-q3, q8-q9, q10-q11 // w32: 32 bytes per row; q4-q7 (d8-d15) skipped likewise
+put_qpel_uw_pixels_m 48, q0-q1, q2,    q8-q9, q10 // w48: 32 + 16 bytes per row
+put_qpel_uw_pixels_m 64, q0-q1, q2-q3, q8-q9, q10-q11 // w64: 32 + 32 bytes per row
diff --git a/libavcodec/arm/hpeldsp_arm.S b/libavcodec/arm/hpeldsp_arm.S
index 0f8092e..219f793 100644
--- a/libavcodec/arm/hpeldsp_arm.S
+++ b/libavcodec/arm/hpeldsp_arm.S
@@ -2,20 +2,20 @@
 @ ARMv4-optimized halfpel functions
 @ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
 @
-@ This file is part of Libav.
+@ This file is part of FFmpeg.
 @
-@ Libav is free software; you can redistribute it and/or
+@ FFmpeg is free software; you can redistribute it and/or
 @ modify it under the terms of the GNU Lesser General Public
 @ License as published by the Free Software Foundation; either
 @ version 2.1 of the License, or (at your option) any later version.
 @
-@ Libav is distributed in the hope that it will be useful,
+@ FFmpeg is distributed in the hope that it will be useful,
 @ but WITHOUT ANY WARRANTY; without even the implied warranty of
 @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 @ Lesser General Public License for more details.
 @
 @ You should have received a copy of the GNU Lesser General Public
-@ License along with Libav; if not, write to the Free Software
+@ License along with FFmpeg; if not, write to the Free Software
 @ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 @
 
diff --git a/libavcodec/arm/hpeldsp_arm.h b/libavcodec/arm/hpeldsp_arm.h
index a864152..5f3c774 100644
--- a/libavcodec/arm/hpeldsp_arm.h
+++ b/libavcodec/arm/hpeldsp_arm.h
@@ -1,18 +1,20 @@
 /*
- * This file is part of Libav.
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/hpeldsp_armv6.S b/libavcodec/arm/hpeldsp_armv6.S
index f1abc32..a8bd459 100644
--- a/libavcodec/arm/hpeldsp_armv6.S
+++ b/libavcodec/arm/hpeldsp_armv6.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/hpeldsp_init_arm.c b/libavcodec/arm/hpeldsp_init_arm.c
index 6390660..1977b13 100644
--- a/libavcodec/arm/hpeldsp_init_arm.c
+++ b/libavcodec/arm/hpeldsp_init_arm.c
@@ -2,20 +2,20 @@
  * ARM-optimized halfpel functions
  * Copyright (c) 2001 Lionel Ulmer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/hpeldsp_init_armv6.c b/libavcodec/arm/hpeldsp_init_armv6.c
index 67a500d..967a8e0 100644
--- a/libavcodec/arm/hpeldsp_init_armv6.c
+++ b/libavcodec/arm/hpeldsp_init_armv6.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/hpeldsp_init_neon.c b/libavcodec/arm/hpeldsp_init_neon.c
index 76d4eaf..d9feadd 100644
--- a/libavcodec/arm/hpeldsp_init_neon.c
+++ b/libavcodec/arm/hpeldsp_init_neon.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised DSP functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/hpeldsp_neon.S b/libavcodec/arm/hpeldsp_neon.S
index 90bc3cb..cf4a6cf 100644
--- a/libavcodec/arm/hpeldsp_neon.S
+++ b/libavcodec/arm/hpeldsp_neon.S
@@ -2,20 +2,20 @@
  * ARM NEON optimised DSP functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/idct.h b/libavcodec/arm/idct.h
index 168d64b..39cef3a 100644
--- a/libavcodec/arm/idct.h
+++ b/libavcodec/arm/idct.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/idctdsp_arm.S b/libavcodec/arm/idctdsp_arm.S
index 34f467e..057eff9 100644
--- a/libavcodec/arm/idctdsp_arm.S
+++ b/libavcodec/arm/idctdsp_arm.S
@@ -2,27 +2,27 @@
 @ ARMv4-optimized IDCT functions
 @ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
 @
-@ This file is part of Libav.
+@ This file is part of FFmpeg.
 @
-@ Libav is free software; you can redistribute it and/or
+@ FFmpeg is free software; you can redistribute it and/or
 @ modify it under the terms of the GNU Lesser General Public
 @ License as published by the Free Software Foundation; either
 @ version 2.1 of the License, or (at your option) any later version.
 @
-@ Libav is distributed in the hope that it will be useful,
+@ FFmpeg is distributed in the hope that it will be useful,
 @ but WITHOUT ANY WARRANTY; without even the implied warranty of
 @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 @ Lesser General Public License for more details.
 @
 @ You should have received a copy of the GNU Lesser General Public
-@ License along with Libav; if not, write to the Free Software
+@ License along with FFmpeg; if not, write to the Free Software
 @ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 @
 
 #include "config.h"
 #include "libavutil/arm/asm.S"
 
-@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, int stride)
+@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, ptrdiff_t stride)
 function ff_add_pixels_clamped_arm, export=1, align=5
         push            {r4-r10}
         mov             r10, #8
diff --git a/libavcodec/arm/idctdsp_arm.h b/libavcodec/arm/idctdsp_arm.h
index 9012b82..d7bc5cd 100644
--- a/libavcodec/arm/idctdsp_arm.h
+++ b/libavcodec/arm/idctdsp_arm.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/idctdsp_armv6.S b/libavcodec/arm/idctdsp_armv6.S
index c180d73..a6e77d6 100644
--- a/libavcodec/arm/idctdsp_armv6.S
+++ b/libavcodec/arm/idctdsp_armv6.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/idctdsp_init_arm.c b/libavcodec/arm/idctdsp_init_arm.c
index 8207c31..0068e3f 100644
--- a/libavcodec/arm/idctdsp_init_arm.c
+++ b/libavcodec/arm/idctdsp_init_arm.c
@@ -2,20 +2,20 @@
  * ARM-optimized IDCT functions
  * Copyright (c) 2001 Lionel Ulmer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,7 +30,7 @@
 #include "idctdsp_arm.h"
 
 void ff_add_pixels_clamped_arm(const int16_t *block, uint8_t *dest,
-                               int line_size);
+                               ptrdiff_t line_size);
 
 /* XXX: those functions should be suppressed ASAP when all IDCTs are
  * converted */
@@ -63,8 +63,8 @@ av_cold void ff_idctdsp_init_arm(IDCTDSPContext *c, AVCodecContext *avctx,
 {
     int cpu_flags = av_get_cpu_flags();
 
-    if (!high_bit_depth) {
-        if (avctx->idct_algo == FF_IDCT_AUTO ||
+    if (!avctx->lowres && !high_bit_depth) {
+        if ((avctx->idct_algo == FF_IDCT_AUTO && !(avctx->flags & AV_CODEC_FLAG_BITEXACT)) ||
             avctx->idct_algo == FF_IDCT_ARM) {
             c->idct_put  = j_rev_dct_arm_put;
             c->idct_add  = j_rev_dct_arm_add;
diff --git a/libavcodec/arm/idctdsp_init_armv5te.c b/libavcodec/arm/idctdsp_init_armv5te.c
index 251165d..3d881e1 100644
--- a/libavcodec/arm/idctdsp_init_armv5te.c
+++ b/libavcodec/arm/idctdsp_init_armv5te.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,8 +29,9 @@
 av_cold void ff_idctdsp_init_armv5te(IDCTDSPContext *c, AVCodecContext *avctx,
                                      unsigned high_bit_depth)
 {
-    if (!high_bit_depth &&
+    if (!avctx->lowres && !high_bit_depth &&
         (avctx->idct_algo == FF_IDCT_AUTO ||
+         avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
          avctx->idct_algo == FF_IDCT_SIMPLEARMV5TE)) {
         c->idct_put  = ff_simple_idct_put_armv5te;
         c->idct_add  = ff_simple_idct_add_armv5te;
diff --git a/libavcodec/arm/idctdsp_init_armv6.c b/libavcodec/arm/idctdsp_init_armv6.c
index 8f0c49b..edf3070 100644
--- a/libavcodec/arm/idctdsp_init_armv6.c
+++ b/libavcodec/arm/idctdsp_init_armv6.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,13 +27,13 @@
 #include "idctdsp_arm.h"
 
 void ff_add_pixels_clamped_armv6(const int16_t *block, uint8_t *pixels,
-                                 int line_size);
+                                 ptrdiff_t line_size);
 
 av_cold void ff_idctdsp_init_armv6(IDCTDSPContext *c, AVCodecContext *avctx,
                                    unsigned high_bit_depth)
 {
-    if (!high_bit_depth) {
-        if (avctx->idct_algo == FF_IDCT_AUTO ||
+    if (!avctx->lowres && !high_bit_depth) {
+        if ((avctx->idct_algo == FF_IDCT_AUTO && !(avctx->flags & AV_CODEC_FLAG_BITEXACT)) ||
             avctx->idct_algo == FF_IDCT_SIMPLEARMV6) {
             c->idct_put  = ff_simple_idct_put_armv6;
             c->idct_add  = ff_simple_idct_add_armv6;
diff --git a/libavcodec/arm/idctdsp_init_neon.c b/libavcodec/arm/idctdsp_init_neon.c
index c94f7b6..b70c5b0 100644
--- a/libavcodec/arm/idctdsp_init_neon.c
+++ b/libavcodec/arm/idctdsp_init_neon.c
@@ -2,20 +2,20 @@
  * ARM-NEON-optimized IDCT functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,15 +27,16 @@
 #include "idct.h"
 #include "idctdsp_arm.h"
 
-void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, int);
-void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, int);
-void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, int);
+void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
+void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
+void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
 
 av_cold void ff_idctdsp_init_neon(IDCTDSPContext *c, AVCodecContext *avctx,
                                   unsigned high_bit_depth)
 {
-    if (!high_bit_depth) {
+    if (!avctx->lowres && !high_bit_depth) {
         if (avctx->idct_algo == FF_IDCT_AUTO ||
+            avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
             avctx->idct_algo == FF_IDCT_SIMPLENEON) {
             c->idct_put  = ff_simple_idct_put_neon;
             c->idct_add  = ff_simple_idct_add_neon;
diff --git a/libavcodec/arm/idctdsp_neon.S b/libavcodec/arm/idctdsp_neon.S
index 7095879..1911a33 100644
--- a/libavcodec/arm/idctdsp_neon.S
+++ b/libavcodec/arm/idctdsp_neon.S
@@ -2,20 +2,20 @@
  * ARM-NEON-optimized IDCT functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/int_neon.S b/libavcodec/arm/int_neon.S
index 42f3739..72c4c77 100644
--- a/libavcodec/arm/int_neon.S
+++ b/libavcodec/arm/int_neon.S
@@ -1,21 +1,21 @@
 /*
  * ARM NEON optimised integer operations
- * Copyright (c) 2009 Kostya Shishkov
+ * Copyright (c) 2009 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,7 +35,7 @@ function ff_scalarproduct_int16_neon, export=1
         vmlal.s16       q2,  d18,  d22
         vmlal.s16       q3,  d19,  d23
         subs            r2,  r2,   #16
-        bne             1b
+        bgt             1b
 
         vpadd.s32       d16, d0,   d1
         vpadd.s32       d17, d2,   d3
@@ -48,3 +48,4 @@ function ff_scalarproduct_int16_neon, export=1
         vmov.32         r0,  d3[0]
         bx              lr
 endfunc
+
diff --git a/libavcodec/arm/apedsp_init_arm.c b/libavcodec/arm/lossless_audiodsp_init_arm.c
index 47ea034..981a39a 100644
--- a/libavcodec/arm/apedsp_init_arm.c
+++ b/libavcodec/arm/lossless_audiodsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,12 +23,12 @@
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/arm/cpu.h"
-#include "libavcodec/apedsp.h"
+#include "libavcodec/lossless_audiodsp.h"
 
 int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3, int len, int mul);
 
-av_cold void ff_apedsp_init_arm(APEDSPContext *c)
+av_cold void ff_llauddsp_init_arm(LLAudDSPContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
 
diff --git a/libavcodec/arm/apedsp_neon.S b/libavcodec/arm/lossless_audiodsp_neon.S
index 7cfbf43..ba7c45f 100644
--- a/libavcodec/arm/apedsp_neon.S
+++ b/libavcodec/arm/lossless_audiodsp_neon.S
@@ -2,20 +2,20 @@
  * ARM NEON optimised integer operations
  * Copyright (c) 2009 Kostya Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,7 +47,7 @@ function ff_scalarproduct_and_madd_int16_neon, export=1
         vst1.16         {q10},     [r12,:128]!
         subs            r3,  r3,   #16
         vst1.16         {q13},     [r12,:128]!
-        bne             1b
+        bgt             1b
 
         vpadd.s32       d16, d0,   d1
         vpadd.s32       d17, d2,   d3
diff --git a/libavcodec/arm/mathops.h b/libavcodec/arm/mathops.h
index 45ac67d..dc57c55 100644
--- a/libavcodec/arm/mathops.h
+++ b/libavcodec/arm/mathops.h
@@ -2,20 +2,20 @@
  * simple math operations
  * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mdct_fixed_init_arm.c b/libavcodec/arm/mdct_fixed_init_arm.c
deleted file mode 100644
index 606c80c..0000000
--- a/libavcodec/arm/mdct_fixed_init_arm.c
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/arm/cpu.h"
-
-#define FFT_FLOAT 0
-#include "libavcodec/fft.h"
-
-void ff_mdct_fixed_calc_neon(FFTContext *s, FFTSample *o, const FFTSample *i);
-void ff_mdct_fixed_calcw_neon(FFTContext *s, FFTDouble *o, const FFTSample *i);
-
-av_cold void ff_mdct_fixed_init_arm(FFTContext *s)
-{
-    int cpu_flags = av_get_cpu_flags();
-
-    if (have_neon(cpu_flags)) {
-        if (!s->inverse && s->nbits >= 3) {
-            s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
-            s->mdct_calc        = ff_mdct_fixed_calc_neon;
-            s->mdct_calcw       = ff_mdct_fixed_calcw_neon;
-        }
-    }
-}
diff --git a/libavcodec/arm/mdct_fixed_neon.S b/libavcodec/arm/mdct_fixed_neon.S
index c77be59..365c5e7 100644
--- a/libavcodec/arm/mdct_fixed_neon.S
+++ b/libavcodec/arm/mdct_fixed_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mdct_init_arm.c b/libavcodec/arm/mdct_init_arm.c
deleted file mode 100644
index 24678dd..0000000
--- a/libavcodec/arm/mdct_init_arm.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/arm/cpu.h"
-
-#include "libavcodec/fft.h"
-
-void ff_imdct_half_vfp(FFTContext *s, FFTSample *output, const FFTSample *input);
-
-void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-
-av_cold void ff_mdct_init_arm(FFTContext *s)
-{
-    int cpu_flags = av_get_cpu_flags();
-
-    if (have_vfp_vm(cpu_flags)) {
-        s->imdct_half   = ff_imdct_half_vfp;
-    }
-
-    if (have_neon(cpu_flags)) {
-        s->imdct_calc   = ff_imdct_calc_neon;
-        s->imdct_half   = ff_imdct_half_neon;
-        s->mdct_calc    = ff_mdct_calc_neon;
-        s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
-    }
-}
diff --git a/libavcodec/arm/mdct_neon.S b/libavcodec/arm/mdct_neon.S
index bfe259c..a6952fa 100644
--- a/libavcodec/arm/mdct_neon.S
+++ b/libavcodec/arm/mdct_neon.S
@@ -2,20 +2,20 @@
  * ARM NEON optimised MDCT
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mdct_vfp.S b/libavcodec/arm/mdct_vfp.S
index f3fe668..43f6d14 100644
--- a/libavcodec/arm/mdct_vfp.S
+++ b/libavcodec/arm/mdct_vfp.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2013 RISC OS Open Ltd
  * Author: Ben Avison <bavison@riscosopen.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/me_cmp_armv6.S b/libavcodec/arm/me_cmp_armv6.S
index 436e20d..fa5a823 100644
--- a/libavcodec/arm/me_cmp_armv6.S
+++ b/libavcodec/arm/me_cmp_armv6.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/me_cmp_init_arm.c b/libavcodec/arm/me_cmp_init_arm.c
index 4d73f3e..03870a2 100644
--- a/libavcodec/arm/me_cmp_init_arm.c
+++ b/libavcodec/arm/me_cmp_init_arm.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mlpdsp_armv5te.S b/libavcodec/arm/mlpdsp_armv5te.S
index 4272dae..4f9aa48 100644
--- a/libavcodec/arm/mlpdsp_armv5te.S
+++ b/libavcodec/arm/mlpdsp_armv5te.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2014 RISC OS Open Ltd
  * Author: Ben Avison <bavison@riscosopen.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mlpdsp_armv6.S b/libavcodec/arm/mlpdsp_armv6.S
index de9db46..b7ecf6c 100644
--- a/libavcodec/arm/mlpdsp_armv6.S
+++ b/libavcodec/arm/mlpdsp_armv6.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2014 RISC OS Open Ltd
  * Author: Ben Avison <bavison@riscosopen.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mlpdsp_init_arm.c b/libavcodec/arm/mlpdsp_init_arm.c
index 4cdd10c..34a5f61 100644
--- a/libavcodec/arm/mlpdsp_init_arm.c
+++ b/libavcodec/arm/mlpdsp_init_arm.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2014 RISC OS Open Ltd
  * Author: Ben Avison <bavison@riscosopen.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mpegaudiodsp_fixed_armv6.S b/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
index 49bd0bc..977abb6 100644
--- a/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
+++ b/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mpegaudiodsp_init_arm.c b/libavcodec/arm/mpegaudiodsp_init_arm.c
index e73aee6..98e0c8a 100644
--- a/libavcodec/arm/mpegaudiodsp_init_arm.c
+++ b/libavcodec/arm/mpegaudiodsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mpegvideo_arm.c b/libavcodec/arm/mpegvideo_arm.c
index 34e9cf1..918be16 100644
--- a/libavcodec/arm/mpegvideo_arm.c
+++ b/libavcodec/arm/mpegvideo_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2002 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mpegvideo_arm.h b/libavcodec/arm/mpegvideo_arm.h
index 17e3a5b..709ae6b 100644
--- a/libavcodec/arm/mpegvideo_arm.h
+++ b/libavcodec/arm/mpegvideo_arm.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mpegvideo_armv5te.c b/libavcodec/arm/mpegvideo_armv5te.c
index 4bb7b6e..e20bb4c 100644
--- a/libavcodec/arm/mpegvideo_armv5te.c
+++ b/libavcodec/arm/mpegvideo_armv5te.c
@@ -2,24 +2,25 @@
  * Optimization of some functions from mpegvideo.c for armv5te
  * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/mpegvideo.h"
 #include "mpegvideo_arm.h"
@@ -55,7 +56,7 @@ static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s,
     int level, qmul, qadd;
     int nCoeffs;
 
-    assert(s->block_last_index[n]>=0);
+    av_assert2(s->block_last_index[n]>=0);
 
     qmul = qscale << 1;
 
@@ -84,7 +85,7 @@ static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s,
     int qmul, qadd;
     int nCoeffs;
 
-    assert(s->block_last_index[n]>=0);
+    av_assert2(s->block_last_index[n]>=0);
 
     qadd = (qscale - 1) | 1;
     qmul = qscale << 1;
diff --git a/libavcodec/arm/mpegvideo_armv5te_s.S b/libavcodec/arm/mpegvideo_armv5te_s.S
index 4426e15..8687d6b 100644
--- a/libavcodec/arm/mpegvideo_armv5te_s.S
+++ b/libavcodec/arm/mpegvideo_armv5te_s.S
@@ -2,20 +2,20 @@
  * Optimization of some functions from mpegvideo.c for armv5te
  * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mpegvideo_neon.S b/libavcodec/arm/mpegvideo_neon.S
index 3e1f7b5..1889d7a 100644
--- a/libavcodec/arm/mpegvideo_neon.S
+++ b/libavcodec/arm/mpegvideo_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mpegvideoencdsp_armv6.S b/libavcodec/arm/mpegvideoencdsp_armv6.S
index 99db501..ab0dad7 100644
--- a/libavcodec/arm/mpegvideoencdsp_armv6.S
+++ b/libavcodec/arm/mpegvideoencdsp_armv6.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/mpegvideoencdsp_init_arm.c b/libavcodec/arm/mpegvideoencdsp_init_arm.c
index ab9ba3e..4bfe835 100644
--- a/libavcodec/arm/mpegvideoencdsp_init_arm.c
+++ b/libavcodec/arm/mpegvideoencdsp_init_arm.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/neon.S b/libavcodec/arm/neon.S
index 716a607..787bc4b 100644
--- a/libavcodec/arm/neon.S
+++ b/libavcodec/arm/neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/neontest.c b/libavcodec/arm/neontest.c
index b77bcd7..a3b5b8e 100644
--- a/libavcodec/arm/neontest.c
+++ b/libavcodec/arm/neontest.c
@@ -2,20 +2,20 @@
  * check NEON registers for clobbers
  * Copyright (c) 2013 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/pixblockdsp_armv6.S b/libavcodec/arm/pixblockdsp_armv6.S
index 4c925a4..b10ea78 100644
--- a/libavcodec/arm/pixblockdsp_armv6.S
+++ b/libavcodec/arm/pixblockdsp_armv6.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/pixblockdsp_init_arm.c b/libavcodec/arm/pixblockdsp_init_arm.c
index f20769b..76d7509 100644
--- a/libavcodec/arm/pixblockdsp_init_arm.c
+++ b/libavcodec/arm/pixblockdsp_init_arm.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,7 +24,7 @@
 #include "libavcodec/avcodec.h"
 #include "libavcodec/pixblockdsp.h"
 
-void ff_get_pixels_armv6(int16_t *block, const uint8_t *pixels, int stride);
+void ff_get_pixels_armv6(int16_t *block, const uint8_t *pixels, ptrdiff_t stride);
 void ff_diff_pixels_armv6(int16_t *block, const uint8_t *s1,
                           const uint8_t *s2, int stride);
 
diff --git a/libavcodec/arm/rdft_init_arm.c b/libavcodec/arm/rdft_init_arm.c
index 2858ba9..1c5d8be 100644
--- a/libavcodec/arm/rdft_init_arm.c
+++ b/libavcodec/arm/rdft_init_arm.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/rdft_neon.S b/libavcodec/arm/rdft_neon.S
index 7d01d53..781d976 100644
--- a/libavcodec/arm/rdft_neon.S
+++ b/libavcodec/arm/rdft_neon.S
@@ -2,20 +2,20 @@
  * ARM NEON optimised RDFT
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/rv34dsp_init_arm.c b/libavcodec/arm/rv34dsp_init_arm.c
index 5ce787b..8bfe90b 100644
--- a/libavcodec/arm/rv34dsp_init_arm.c
+++ b/libavcodec/arm/rv34dsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/rv34dsp_neon.S b/libavcodec/arm/rv34dsp_neon.S
index a29123f..3d4a83d 100644
--- a/libavcodec/arm/rv34dsp_neon.S
+++ b/libavcodec/arm/rv34dsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/rv40dsp_init_arm.c b/libavcodec/arm/rv40dsp_init_arm.c
index df3e461..c24854d 100644
--- a/libavcodec/arm/rv40dsp_init_arm.c
+++ b/libavcodec/arm/rv40dsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/rv40dsp_neon.S b/libavcodec/arm/rv40dsp_neon.S
index 6bd45eb..099f88c 100644
--- a/libavcodec/arm/rv40dsp_neon.S
+++ b/libavcodec/arm/rv40dsp_neon.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/sbrdsp_init_arm.c b/libavcodec/arm/sbrdsp_init_arm.c
index 4da7967..4fb69f9 100644
--- a/libavcodec/arm/sbrdsp_init_arm.c
+++ b/libavcodec/arm/sbrdsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/sbrdsp_neon.S b/libavcodec/arm/sbrdsp_neon.S
index 610397f..e66abd6 100644
--- a/libavcodec/arm/sbrdsp_neon.S
+++ b/libavcodec/arm/sbrdsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/simple_idct_arm.S b/libavcodec/arm/simple_idct_arm.S
index a651927..42d79ab 100644
--- a/libavcodec/arm/simple_idct_arm.S
+++ b/libavcodec/arm/simple_idct_arm.S
@@ -4,22 +4,22 @@
  * Author: Frederic Boulay <dilb@handhelds.org>
  *
  * The function defined in this file is derived from the simple_idct function
- * from the libavcodec library part of the Libav project.
+ * from the libavcodec library part of the FFmpeg project.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/simple_idct_armv5te.S b/libavcodec/arm/simple_idct_armv5te.S
index bf509ee..d1f10b7 100644
--- a/libavcodec/arm/simple_idct_armv5te.S
+++ b/libavcodec/arm/simple_idct_armv5te.S
@@ -4,20 +4,20 @@
  * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2006 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/simple_idct_armv6.S b/libavcodec/arm/simple_idct_armv6.S
index 6072346..79cf5d4 100644
--- a/libavcodec/arm/simple_idct_armv6.S
+++ b/libavcodec/arm/simple_idct_armv6.S
@@ -4,20 +4,20 @@
  * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2007 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/simple_idct_neon.S b/libavcodec/arm/simple_idct_neon.S
index a1cde8d..c3e573c 100644
--- a/libavcodec/arm/simple_idct_neon.S
+++ b/libavcodec/arm/simple_idct_neon.S
@@ -6,20 +6,20 @@
  * Based on Simple IDCT
  * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/startcode.h b/libavcodec/arm/startcode.h
index d7996c1..cf25d9d 100644
--- a/libavcodec/arm/startcode.h
+++ b/libavcodec/arm/startcode.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/startcode_armv6.S b/libavcodec/arm/startcode_armv6.S
index 64078b2..a46f009 100644
--- a/libavcodec/arm/startcode_armv6.S
+++ b/libavcodec/arm/startcode_armv6.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2013 RISC OS Open Ltd
  * Author: Ben Avison <bavison@riscosopen.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/dcadsp_init_arm.c b/libavcodec/arm/synth_filter_init_arm.c
index bf0d9b4..ea0ce14 100644
--- a/libavcodec/arm/dcadsp_init_arm.c
+++ b/libavcodec/arm/synth_filter_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -22,20 +22,9 @@
 
 #include "libavutil/arm/cpu.h"
 #include "libavutil/attributes.h"
-#include "libavcodec/dcadsp.h"
-
-void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs);
-void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs);
-
-void ff_dca_lfe_fir32_vfp(float *out, const float *in, const float *coefs);
-void ff_dca_lfe_fir64_vfp(float *out, const float *in, const float *coefs);
-
-void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
-                                SynthFilterContext *synth, FFTContext *imdct,
-                                float synth_buf_ptr[512],
-                                int *synth_buf_offset, float synth_buf2[32],
-                                const float window[512], float *samples_out,
-                                float raXin[32], float scale);
+#include "libavutil/internal.h"
+#include "libavcodec/fft.h"
+#include "libavcodec/synth_filter.h"
 
 void ff_synth_filter_float_vfp(FFTContext *imdct,
                                float *synth_buf_ptr, int *synth_buf_offset,
@@ -49,21 +38,6 @@ void ff_synth_filter_float_neon(FFTContext *imdct,
                                 float out[32], const float in[32],
                                 float scale);
 
-av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
-{
-    int cpu_flags = av_get_cpu_flags();
-
-    if (have_vfp_vm(cpu_flags)) {
-        s->lfe_fir[0]      = ff_dca_lfe_fir32_vfp;
-        s->lfe_fir[1]      = ff_dca_lfe_fir64_vfp;
-        s->qmf_32_subbands = ff_dca_qmf_32_subbands_vfp;
-    }
-    if (have_neon(cpu_flags)) {
-        s->lfe_fir[0] = ff_dca_lfe_fir0_neon;
-        s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
-    }
-}
-
 av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
diff --git a/libavcodec/arm/synth_filter_neon.S b/libavcodec/arm/synth_filter_neon.S
index 62bb667..5417be7 100644
--- a/libavcodec/arm/synth_filter_neon.S
+++ b/libavcodec/arm/synth_filter_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/synth_filter_vfp.S b/libavcodec/arm/synth_filter_vfp.S
index 5d79e50..596734c 100644
--- a/libavcodec/arm/synth_filter_vfp.S
+++ b/libavcodec/arm/synth_filter_vfp.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2013 RISC OS Open Ltd
  * Author: Ben Avison <bavison@riscosopen.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vc1dsp.h b/libavcodec/arm/vc1dsp.h
index 30f059f..cd01ac5 100644
--- a/libavcodec/arm/vc1dsp.h
+++ b/libavcodec/arm/vc1dsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vc1dsp_init_arm.c b/libavcodec/arm/vc1dsp_init_arm.c
index a6a97c8..5f2c759 100644
--- a/libavcodec/arm/vc1dsp_init_arm.c
+++ b/libavcodec/arm/vc1dsp_init_arm.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,8 +28,10 @@ av_cold void ff_vc1dsp_init_arm(VC1DSPContext *dsp)
 {
     int cpu_flags = av_get_cpu_flags();
 
+#if HAVE_ARMV6 /* the ARMv6 start-code hook is compiled in only when this build flag is nonzero */
     if (have_setend(cpu_flags))
         dsp->startcode_find_candidate = ff_startcode_find_candidate_armv6;
+#endif /* HAVE_ARMV6 */
     if (have_neon(cpu_flags))
         ff_vc1dsp_init_neon(dsp);
 }
diff --git a/libavcodec/arm/vc1dsp_init_neon.c b/libavcodec/arm/vc1dsp_init_neon.c
index 9ded7a2..bb873e6 100644
--- a/libavcodec/arm/vc1dsp_init_neon.c
+++ b/libavcodec/arm/vc1dsp_init_neon.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,40 +37,38 @@ void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, int linesize, int16_t *block);
 void ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int rnd);
 
-void ff_put_vc1_mspel_mc10_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc20_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc30_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-
-void ff_put_vc1_mspel_mc01_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc02_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc03_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-
-void ff_put_vc1_mspel_mc11_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc12_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc13_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-
-void ff_put_vc1_mspel_mc21_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc22_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc23_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-
-void ff_put_vc1_mspel_mc31_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc32_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc33_neon(uint8_t *dst, const uint8_t *src,
-                                ptrdiff_t stride, int rnd);
+#define DECL_PUT(X, Y) /* declare the mc<X><Y> NEON routine and define its "_16" wrapper */ \
+void ff_put_vc1_mspel_mc##X##Y##_neon(uint8_t *dst, const uint8_t *src, \
+                                      ptrdiff_t stride, int rnd); \
+static void ff_put_vc1_mspel_mc##X##Y##_16_neon(uint8_t *dst, const uint8_t *src, \
+                                                ptrdiff_t stride, int rnd) \
+{ \
+    const ptrdiff_t lower = 8 * stride; /* offset of the second pair of calls: 8 strides on */ \
+    ff_put_vc1_mspel_mc##X##Y##_neon(dst,             src,             stride, rnd); \
+    ff_put_vc1_mspel_mc##X##Y##_neon(dst + 8,         src + 8,         stride, rnd); \
+    ff_put_vc1_mspel_mc##X##Y##_neon(dst + lower,     src + lower,     stride, rnd); \
+    ff_put_vc1_mspel_mc##X##Y##_neon(dst + lower + 8, src + lower + 8, stride, rnd); \
+}
+
+DECL_PUT(1, 0)
+DECL_PUT(2, 0)
+DECL_PUT(3, 0)
+
+DECL_PUT(0, 1)
+DECL_PUT(0, 2)
+DECL_PUT(0, 3)
+
+DECL_PUT(1, 1)
+DECL_PUT(1, 2)
+DECL_PUT(1, 3)
+
+DECL_PUT(2, 1)
+DECL_PUT(2, 2)
+DECL_PUT(2, 3)
+
+DECL_PUT(3, 1)
+DECL_PUT(3, 2)
+DECL_PUT(3, 3)
 
 void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride, int h,
                                 int x, int y);
@@ -81,6 +79,10 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride, int h,
 void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride, int h,
                                 int x, int y);
 
+#define FN_ASSIGN(X, Y) do { /* slot [0] <- "_16" wrapper, slot [1] <- plain mc<X><Y> routine; do/while(0) keeps `FN_ASSIGN(x, y);` a single statement */ \
+    dsp->put_vc1_mspel_pixels_tab[0][(X) + 4 * (Y)] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \
+    dsp->put_vc1_mspel_pixels_tab[1][(X) + 4 * (Y)] = ff_put_vc1_mspel_mc##X##Y##_neon; \
+} while (0)
 av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
 {
     dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_neon;
@@ -92,23 +94,26 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
     dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon;
     dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
 
-    dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_pixels8x8_neon;
+    dsp->put_vc1_mspel_pixels_tab[1][ 0] = ff_put_pixels8x8_neon;
     if (HAVE_AS_DN_DIRECTIVE) {
-    dsp->put_vc1_mspel_pixels_tab[ 1] = ff_put_vc1_mspel_mc10_neon;
-    dsp->put_vc1_mspel_pixels_tab[ 2] = ff_put_vc1_mspel_mc20_neon;
-    dsp->put_vc1_mspel_pixels_tab[ 3] = ff_put_vc1_mspel_mc30_neon;
-    dsp->put_vc1_mspel_pixels_tab[ 4] = ff_put_vc1_mspel_mc01_neon;
-    dsp->put_vc1_mspel_pixels_tab[ 5] = ff_put_vc1_mspel_mc11_neon;
-    dsp->put_vc1_mspel_pixels_tab[ 6] = ff_put_vc1_mspel_mc21_neon;
-    dsp->put_vc1_mspel_pixels_tab[ 7] = ff_put_vc1_mspel_mc31_neon;
-    dsp->put_vc1_mspel_pixels_tab[ 8] = ff_put_vc1_mspel_mc02_neon;
-    dsp->put_vc1_mspel_pixels_tab[ 9] = ff_put_vc1_mspel_mc12_neon;
-    dsp->put_vc1_mspel_pixels_tab[10] = ff_put_vc1_mspel_mc22_neon;
-    dsp->put_vc1_mspel_pixels_tab[11] = ff_put_vc1_mspel_mc32_neon;
-    dsp->put_vc1_mspel_pixels_tab[12] = ff_put_vc1_mspel_mc03_neon;
-    dsp->put_vc1_mspel_pixels_tab[13] = ff_put_vc1_mspel_mc13_neon;
-    dsp->put_vc1_mspel_pixels_tab[14] = ff_put_vc1_mspel_mc23_neon;
-    dsp->put_vc1_mspel_pixels_tab[15] = ff_put_vc1_mspel_mc33_neon;
+    FN_ASSIGN(1, 0);
+    FN_ASSIGN(2, 0);
+    FN_ASSIGN(3, 0);
+
+    FN_ASSIGN(0, 1);
+    FN_ASSIGN(1, 1);
+    FN_ASSIGN(2, 1);
+    FN_ASSIGN(3, 1);
+
+    FN_ASSIGN(0, 2);
+    FN_ASSIGN(1, 2);
+    FN_ASSIGN(2, 2);
+    FN_ASSIGN(3, 2);
+
+    FN_ASSIGN(0, 3);
+    FN_ASSIGN(1, 3);
+    FN_ASSIGN(2, 3);
+    FN_ASSIGN(3, 3);
     }
 
     dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon;
diff --git a/libavcodec/arm/vc1dsp_neon.S b/libavcodec/arm/vc1dsp_neon.S
index fa87ede..c4f4db9 100644
--- a/libavcodec/arm/vc1dsp_neon.S
+++ b/libavcodec/arm/vc1dsp_neon.S
@@ -4,20 +4,20 @@
  * Copyright (c) 2010 Rob Clark <rob@ti.com>
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/videodsp_arm.h b/libavcodec/arm/videodsp_arm.h
index a708759..112cbb8 100644
--- a/libavcodec/arm/videodsp_arm.h
+++ b/libavcodec/arm/videodsp_arm.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/videodsp_armv5te.S b/libavcodec/arm/videodsp_armv5te.S
index 0510019..aff1161 100644
--- a/libavcodec/arm/videodsp_armv5te.S
+++ b/libavcodec/arm/videodsp_armv5te.S
@@ -2,20 +2,20 @@
 @ ARMv5te-optimized core video DSP functions
 @ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
 @
-@ This file is part of Libav.
+@ This file is part of FFmpeg.
 @
-@ Libav is free software; you can redistribute it and/or
+@ FFmpeg is free software; you can redistribute it and/or
 @ modify it under the terms of the GNU Lesser General Public
 @ License as published by the Free Software Foundation; either
 @ version 2.1 of the License, or (at your option) any later version.
 @
-@ Libav is distributed in the hope that it will be useful,
+@ FFmpeg is distributed in the hope that it will be useful,
 @ but WITHOUT ANY WARRANTY; without even the implied warranty of
 @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 @ Lesser General Public License for more details.
 @
 @ You should have received a copy of the GNU Lesser General Public
-@ License along with Libav; if not, write to the Free Software
+@ License along with FFmpeg; if not, write to the Free Software
 @ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 @
 
diff --git a/libavcodec/arm/videodsp_init_arm.c b/libavcodec/arm/videodsp_init_arm.c
index 20c6e4a..a89abb2 100644
--- a/libavcodec/arm/videodsp_init_arm.c
+++ b/libavcodec/arm/videodsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2012 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/videodsp_init_armv5te.c b/libavcodec/arm/videodsp_init_armv5te.c
index 832191f..1ea1f34 100644
--- a/libavcodec/arm/videodsp_init_armv5te.c
+++ b/libavcodec/arm/videodsp_init_armv5te.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2012 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,5 +27,7 @@ void ff_prefetch_arm(uint8_t *mem, ptrdiff_t stride, int h);
 
 av_cold void ff_videodsp_init_armv5te(VideoDSPContext *ctx, int bpc)
 {
+#if HAVE_ARMV5TE_EXTERNAL /* ff_prefetch_arm is referenced only when this build flag is nonzero */
     ctx->prefetch = ff_prefetch_arm;
+#endif /* HAVE_ARMV5TE_EXTERNAL */
 }
diff --git a/libavcodec/arm/vorbisdsp_init_arm.c b/libavcodec/arm/vorbisdsp_init_arm.c
index 853ba2d..f4b3d80 100644
--- a/libavcodec/arm/vorbisdsp_init_arm.c
+++ b/libavcodec/arm/vorbisdsp_init_arm.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised DSP functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vorbisdsp_neon.S b/libavcodec/arm/vorbisdsp_neon.S
index 7df876c..79ce54f 100644
--- a/libavcodec/arm/vorbisdsp_neon.S
+++ b/libavcodec/arm/vorbisdsp_neon.S
@@ -2,20 +2,20 @@
  * ARM NEON optimised DSP functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp3dsp_init_arm.c b/libavcodec/arm/vp3dsp_init_arm.c
index 11e1f1c..d924636 100644
--- a/libavcodec/arm/vp3dsp_init_arm.c
+++ b/libavcodec/arm/vp3dsp_init_arm.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp3dsp_neon.S b/libavcodec/arm/vp3dsp_neon.S
index 58bd97d..2942d48 100644
--- a/libavcodec/arm/vp3dsp_neon.S
+++ b/libavcodec/arm/vp3dsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 David Conrad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp56_arith.h b/libavcodec/arm/vp56_arith.h
index 6bc9456..feb1247 100644
--- a/libavcodec/arm/vp56_arith.h
+++ b/libavcodec/arm/vp56_arith.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp6dsp_init_arm.c b/libavcodec/arm/vp6dsp_init_arm.c
index 4ec41ed..ed68321 100644
--- a/libavcodec/arm/vp6dsp_init_arm.c
+++ b/libavcodec/arm/vp6dsp_init_arm.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp6dsp_neon.S b/libavcodec/arm/vp6dsp_neon.S
index 10b4d0f..03dd28d 100644
--- a/libavcodec/arm/vp6dsp_neon.S
+++ b/libavcodec/arm/vp6dsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp8.h b/libavcodec/arm/vp8.h
index 93b2788..965342d 100644
--- a/libavcodec/arm/vp8.h
+++ b/libavcodec/arm/vp8.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp8_armv6.S b/libavcodec/arm/vp8_armv6.S
index 3863dc3..e7d25a4 100644
--- a/libavcodec/arm/vp8_armv6.S
+++ b/libavcodec/arm/vp8_armv6.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2010 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp8dsp.h b/libavcodec/arm/vp8dsp.h
index 0d55e0f..7281d0b 100644
--- a/libavcodec/arm/vp8dsp.h
+++ b/libavcodec/arm/vp8dsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp8dsp_armv6.S b/libavcodec/arm/vp8dsp_armv6.S
index 03100cd..1adcbbd 100644
--- a/libavcodec/arm/vp8dsp_armv6.S
+++ b/libavcodec/arm/vp8dsp_armv6.S
@@ -5,20 +5,20 @@
  * Copyright (c) 2010 Rob Clark <rob@ti.com>
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  *
  * This code was partially ported from libvpx, which uses this license:
diff --git a/libavcodec/arm/vp8dsp_init_arm.c b/libavcodec/arm/vp8dsp_init_arm.c
index aa77dba..8b80176 100644
--- a/libavcodec/arm/vp8dsp_init_arm.c
+++ b/libavcodec/arm/vp8dsp_init_arm.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp8dsp_init_armv6.c b/libavcodec/arm/vp8dsp_init_armv6.c
index febe4e7..a5bcd73 100644
--- a/libavcodec/arm/vp8dsp_init_armv6.c
+++ b/libavcodec/arm/vp8dsp_init_armv6.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp8dsp_init_neon.c b/libavcodec/arm/vp8dsp_init_neon.c
index 2b6c775..53f1f23 100644
--- a/libavcodec/arm/vp8dsp_init_neon.c
+++ b/libavcodec/arm/vp8dsp_init_neon.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/arm/vp8dsp_neon.S b/libavcodec/arm/vp8dsp_neon.S
index 02236a4..2e87fb0 100644
--- a/libavcodec/arm/vp8dsp_neon.S
+++ b/libavcodec/arm/vp8dsp_neon.S
@@ -4,20 +4,20 @@
  * Copyright (c) 2010 Rob Clark <rob@ti.com>
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ass.c b/libavcodec/ass.c
index def11f0..b4f081c 100644
--- a/libavcodec/ass.c
+++ b/libavcodec/ass.c
@@ -2,65 +2,75 @@
  * SSA/ASS common functions
  * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "avcodec.h"
 #include "ass.h"
+#include "libavutil/avassert.h"
 #include "libavutil/avstring.h"
+#include "libavutil/bprint.h"
 #include "libavutil/common.h"
 
-/**
- * Generate a suitable AVCodecContext.subtitle_header for SUBTITLE_ASS.
- *
- * @param avctx pointer to the AVCodecContext
- * @param font name of the default font face to use
- * @param font_size default font size to use
- * @param color default text color to use (ABGR)
- * @param back_color default background color to use (ABGR)
- * @param bold 1 for bold text, 0 for normal text
- * @param italic 1 for italic text, 0 for normal text
- * @param underline 1 for underline text, 0 for normal text
- * @param alignment position of the text (left, center, top...), defined after
- *                  the layout of the numpad (1-3 sub, 4-6 mid, 7-9 top)
- * @return >= 0 on success otherwise an error code <0
- */
-static int ass_subtitle_header(AVCodecContext *avctx,
-                               const char *font, int font_size,
-                               int color, int back_color,
-                               int bold, int italic, int underline,
-                               int alignment)
+int ff_ass_subtitle_header(AVCodecContext *avctx,
+                           const char *font, int font_size,
+                           int color, int back_color,
+                           int bold, int italic, int underline,
+                           int border_style, int alignment)
 {
-    char header[512];
-
-    snprintf(header, sizeof(header),
+    avctx->subtitle_header = av_asprintf(
              "[Script Info]\r\n"
+             "; Script generated by FFmpeg/Lavc%s\r\n"
              "ScriptType: v4.00+\r\n"
+             "PlayResX: %d\r\n"
+             "PlayResY: %d\r\n"
              "\r\n"
              "[V4+ Styles]\r\n"
-             "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, AlphaLevel, Encoding\r\n"
-             "Style: Default,%s,%d,&H%x,&H%x,&H%x,&H%x,%d,%d,%d,1,1,0,%d,10,10,10,0,0\r\n"
+
+             /* ASSv4 header */
+             "Format: Name, "
+             "Fontname, Fontsize, "
+             "PrimaryColour, SecondaryColour, OutlineColour, BackColour, "
+             "Bold, Italic, Underline, StrikeOut, "
+             "ScaleX, ScaleY, "
+             "Spacing, Angle, "
+             "BorderStyle, Outline, Shadow, "
+             "Alignment, MarginL, MarginR, MarginV, "
+             "Encoding\r\n"
+
+             "Style: "
+             "Default,"             /* Name */
+             "%s,%d,"               /* Font{name,size} */
+             "&H%x,&H%x,&H%x,&H%x," /* {Primary,Secondary,Outline,Back}Colour */
+             "%d,%d,%d,0,"          /* Bold, Italic, Underline, StrikeOut */
+             "100,100,"             /* Scale{X,Y} */
+             "0,0,"                 /* Spacing, Angle */
+             "%d,1,0,"              /* BorderStyle, Outline, Shadow */
+             "%d,10,10,10,"         /* Alignment, Margin[LRV] */
+             "0\r\n"                /* Encoding */
+
              "\r\n"
              "[Events]\r\n"
-             "Format: Layer, Start, End, Text\r\n",
+             "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\r\n",
+             !(avctx->flags & AV_CODEC_FLAG_BITEXACT) ? AV_STRINGIFY(LIBAVCODEC_VERSION) : "",
+             ASS_DEFAULT_PLAYRESX, ASS_DEFAULT_PLAYRESY,
              font, font_size, color, color, back_color, back_color,
-             -bold, -italic, -underline, alignment);
+             -bold, -italic, -underline, border_style, alignment);
 
-    avctx->subtitle_header = av_strdup(header);
     if (!avctx->subtitle_header)
         return AVERROR(ENOMEM);
     avctx->subtitle_header_size = strlen(avctx->subtitle_header);
@@ -69,57 +79,89 @@ static int ass_subtitle_header(AVCodecContext *avctx,
 
 int ff_ass_subtitle_header_default(AVCodecContext *avctx)
 {
-    return ass_subtitle_header(avctx, ASS_DEFAULT_FONT,
+    return ff_ass_subtitle_header(avctx, ASS_DEFAULT_FONT,
                                ASS_DEFAULT_FONT_SIZE,
                                ASS_DEFAULT_COLOR,
                                ASS_DEFAULT_BACK_COLOR,
                                ASS_DEFAULT_BOLD,
                                ASS_DEFAULT_ITALIC,
                                ASS_DEFAULT_UNDERLINE,
+                               ASS_DEFAULT_BORDERSTYLE,
                                ASS_DEFAULT_ALIGNMENT);
 }
 
-void ff_ass_init(AVSubtitle *sub)
-{
-    memset(sub, 0, sizeof(*sub));
-}
-
-static int ts_to_string(char *str, int strlen, int ts)
+char *ff_ass_get_dialog(int readorder, int layer, const char *style,
+                        const char *speaker, const char *text)
 {
-    int h, m, s;
-    h = ts/360000;  ts -= 360000*h;
-    m = ts/  6000;  ts -=   6000*m;
-    s = ts/   100;  ts -=    100*s;
-    return snprintf(str, strlen, "%d:%02d:%02d.%02d", h, m, s, ts);
+    return av_asprintf("%d,%d,%s,%s,0,0,0,,%s",
+                       readorder, layer, style ? style : "Default",
+                       speaker ? speaker : "", text);
 }
 
 int ff_ass_add_rect(AVSubtitle *sub, const char *dialog,
-                    int ts_start, int ts_end, int raw)
+                    int readorder, int layer, const char *style,
+                    const char *speaker)
 {
-    int len = 0, dlen, duration = ts_end - ts_start;
-    char s_start[16], s_end[16], header[48] = {0};
     AVSubtitleRect **rects;
 
-    if (!raw) {
-        ts_to_string(s_start, sizeof(s_start), ts_start);
-        ts_to_string(s_end,   sizeof(s_end),   ts_end  );
-        len = snprintf(header, sizeof(header), "Dialogue: 0,%s,%s,",
-                       s_start, s_end);
-    }
-
-    dlen = strcspn(dialog, "\n");
-    dlen += dialog[dlen] == '\n';
-
-    rects = av_realloc(sub->rects, (sub->num_rects+1) * sizeof(*sub->rects));
+    rects = av_realloc_array(sub->rects, (sub->num_rects+1), sizeof(*sub->rects));
     if (!rects)
         return AVERROR(ENOMEM);
     sub->rects = rects;
-    sub->end_display_time = FFMAX(sub->end_display_time, 10 * duration);
     rects[sub->num_rects]       = av_mallocz(sizeof(*rects[0]));
+    if (!rects[sub->num_rects])
+        return AVERROR(ENOMEM);
     rects[sub->num_rects]->type = SUBTITLE_ASS;
-    rects[sub->num_rects]->ass  = av_malloc(len + dlen + 1);
-    strcpy (rects[sub->num_rects]->ass      , header);
-    av_strlcpy(rects[sub->num_rects]->ass + len, dialog, dlen + 1);
+    rects[sub->num_rects]->ass = ff_ass_get_dialog(readorder, layer, style, speaker, dialog);
+    if (!rects[sub->num_rects]->ass) {
+        av_freep(&rects[sub->num_rects]); /* not yet counted in num_rects */
+        return AVERROR(ENOMEM);
+    }
     sub->num_rects++;
-    return dlen;
+    return 0;
+}
+
+/* Reset the ReadOrder counter unless AV_CODEC_FLAG2_RO_FLUSH_NOOP is set. */
+void ff_ass_decoder_flush(AVCodecContext *avctx)
+{
+    if (!(avctx->flags2 & AV_CODEC_FLAG2_RO_FLUSH_NOOP))
+        ((FFASSDecoderContext *)avctx->priv_data)->readorder = 0;
+}
+
+void ff_ass_bprint_text_event(AVBPrint *buf, const char *p, int size,
+                             const char *linebreaks, int keep_ass_markup)
+{
+    const char *p_end = p + size;
+
+    for (; p < p_end && *p; p++) {
+
+        /* forced custom line breaks, not accounted as "normal" EOL */
+        if (linebreaks && strchr(linebreaks, *p)) {
+            av_bprintf(buf, "\\N");
+
+        /* standard ASS escaping so random characters don't get mis-interpreted
+         * as ASS */
+        } else if (!keep_ass_markup && strchr("{}\\", *p)) {
+            av_bprintf(buf, "\\%c", *p);
+
+        /* some packets might end abruptly (no \0 at the end, like for example
+         * in some cases of demuxing from a classic video container), some
+         * might be terminated with \n or \r\n which we have to remove (for
+         * consistency with those who haven't), and we also have to deal with
+         * evil cases such as \r at the end of the buffer (and no \0 terminated
+         * character) */
+        } else if (p[0] == '\n') {
+            /* some stuff left so we can insert a line break */
+            if (p < p_end - 1)
+                av_bprintf(buf, "\\N");
+        } else if (p[0] == '\r' && p < p_end - 1 && p[1] == '\n') {
+            /* \r followed by a \n, we can skip it. We don't insert the \N yet
+             * because we don't know if it is followed by more text */
+            continue;
+
+        /* finally, a sane character */
+        } else {
+            av_bprint_chars(buf, *p, 1);
+        }
+    }
 }
diff --git a/libavcodec/ass.h b/libavcodec/ass.h
index 1302a04..314b43b 100644
--- a/libavcodec/ass.h
+++ b/libavcodec/ass.h
@@ -2,20 +2,20 @@
  * SSA/ASS common functions
  * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,6 +23,10 @@
 #define AVCODEC_ASS_H
 
 #include "avcodec.h"
+#include "libavutil/bprint.h"
+
+#define ASS_DEFAULT_PLAYRESX 384
+#define ASS_DEFAULT_PLAYRESY 288
 
 /**
  * @name Default values for ASS style
@@ -36,8 +40,34 @@
 #define ASS_DEFAULT_ITALIC      0
 #define ASS_DEFAULT_UNDERLINE   0
 #define ASS_DEFAULT_ALIGNMENT   2
+#define ASS_DEFAULT_BORDERSTYLE 1
 /** @} */
 
+typedef struct FFASSDecoderContext {
+    int readorder;
+} FFASSDecoderContext;
+
+/**
+ * Generate a suitable AVCodecContext.subtitle_header for SUBTITLE_ASS.
+ *
+ * @param avctx pointer to the AVCodecContext
+ * @param font name of the default font face to use
+ * @param font_size default font size to use
+ * @param color default text color to use (ABGR)
+ * @param back_color default background color to use (ABGR)
+ * @param bold 1 for bold text, 0 for normal text
+ * @param italic 1 for italic text, 0 for normal text
+ * @param underline 1 for underline text, 0 for normal text
+ * @param border_style value for the BorderStyle field of the Default style
+ * @param alignment numpad-style text position (1-3 sub, 4-6 mid, 7-9 top)
+ * @return >= 0 on success otherwise an error code <0
+ */
+int ff_ass_subtitle_header(AVCodecContext *avctx,
+                           const char *font, int font_size,
+                           int color, int back_color,
+                           int bold, int italic, int underline,
+                           int border_style, int alignment);
+
 /**
  * Generate a suitable AVCodecContext.subtitle_header for SUBTITLE_ASS
  * with default style.
@@ -48,29 +78,34 @@
 int ff_ass_subtitle_header_default(AVCodecContext *avctx);
 
 /**
- * Initialize an AVSubtitle structure for use with ff_ass_add_rect().
- *
- * @param sub pointer to the AVSubtitle
+ * Craft an ASS dialog string.
  */
-void ff_ass_init(AVSubtitle *sub);
+char *ff_ass_get_dialog(int readorder, int layer, const char *style,
+                        const char *speaker, const char *text);
 
 /**
- * Add an ASS dialog line to an AVSubtitle as a new AVSubtitleRect.
- *
- * @param sub pointer to the AVSubtitle
- * @param dialog ASS dialog to add to sub
- * @param ts_start start timestamp for this dialog (in 1/100 second unit)
- * @param ts_end end timestamp for this dialog (in 1/100 second unit)
- * @param raw when set to 1, it indicates that dialog contains a whole ASS
- *                           dialog line which should be copied as is.
- *            when set to 0, it indicates that dialog contains only the Text
- *                           part of the ASS dialog line, the rest of the line
- *                           will be generated.
- * @return number of characters read from dialog. It can be less than the whole
- *         length of dialog, if dialog contains several lines of text.
- *         A negative value indicates an error.
+ * Add an ASS dialog to a subtitle.
  */
 int ff_ass_add_rect(AVSubtitle *sub, const char *dialog,
-                    int ts_start, int ts_end, int raw);
+                    int readorder, int layer, const char *style,
+                    const char *speaker);
+
+/**
+ * Helper to flush a text subtitles decoder making use of the
+ * FFASSDecoderContext.
+ */
+void ff_ass_decoder_flush(AVCodecContext *avctx);
 
+/**
+ * Escape a text subtitle using ASS syntax into an AVBPrint buffer.
+ * Newline characters will be escaped to \N.
+ *
+ * @param buf pointer to an initialized AVBPrint buffer
+ * @param p source text
+ * @param size size of the source text
+ * @param linebreaks additional newline chars, which will be escaped to \N
+ * @param keep_ass_markup braces and backslash will not be escaped if set
+ */
+void ff_ass_bprint_text_event(AVBPrint *buf, const char *p, int size,
+                             const char *linebreaks, int keep_ass_markup);
 #endif /* AVCODEC_ASS_H */
diff --git a/libavcodec/ass_split.c b/libavcodec/ass_split.c
new file mode 100644
index 0000000..beaba7e
--- /dev/null
+++ b/libavcodec/ass_split.c
@@ -0,0 +1,580 @@
+/*
+ * SSA/ASS splitting functions
+ * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "ass_split.h"
+
+typedef enum {
+    ASS_STR,
+    ASS_INT,
+    ASS_FLT,
+    ASS_COLOR,
+    ASS_TIMESTAMP,
+    ASS_ALGN,
+} ASSFieldType;
+
+typedef struct {
+    const char *name;
+    int type;
+    int offset;
+} ASSFields;
+
+typedef struct {
+    const char *section;
+    const char *format_header;
+    const char *fields_header;
+    int         size;
+    int         offset;
+    int         offset_count;
+    ASSFields   fields[24];
+} ASSSection;
+
+static const ASSSection ass_sections[] = {
+    { .section       = "Script Info",
+      .offset        = offsetof(ASS, script_info),
+      .fields = {{"ScriptType", ASS_STR, offsetof(ASSScriptInfo, script_type)},
+                 {"Collisions", ASS_STR, offsetof(ASSScriptInfo, collisions) },
+                 {"PlayResX",   ASS_INT, offsetof(ASSScriptInfo, play_res_x) },
+                 {"PlayResY",   ASS_INT, offsetof(ASSScriptInfo, play_res_y) },
+                 {"Timer",      ASS_FLT, offsetof(ASSScriptInfo, timer)      },
+                 {0},
+        }
+    },
+    { .section       = "V4+ Styles",
+      .format_header = "Format",
+      .fields_header = "Style",
+      .size          = sizeof(ASSStyle),
+      .offset        = offsetof(ASS, styles),
+      .offset_count  = offsetof(ASS, styles_count),
+      .fields = {{"Name",            ASS_STR,   offsetof(ASSStyle, name)           },
+                 {"Fontname",        ASS_STR,   offsetof(ASSStyle, font_name)      },
+                 {"Fontsize",        ASS_INT,   offsetof(ASSStyle, font_size)      },
+                 {"PrimaryColour",   ASS_COLOR, offsetof(ASSStyle, primary_color)  },
+                 {"SecondaryColour", ASS_COLOR, offsetof(ASSStyle, secondary_color)},
+                 {"OutlineColour",   ASS_COLOR, offsetof(ASSStyle, outline_color)  },
+                 {"BackColour",      ASS_COLOR, offsetof(ASSStyle, back_color)     },
+                 {"Bold",            ASS_INT,   offsetof(ASSStyle, bold)           },
+                 {"Italic",          ASS_INT,   offsetof(ASSStyle, italic)         },
+                 {"Underline",       ASS_INT,   offsetof(ASSStyle, underline)      },
+                 {"StrikeOut",       ASS_INT,   offsetof(ASSStyle, strikeout)      },
+                 {"ScaleX",          ASS_FLT,   offsetof(ASSStyle, scalex)         },
+                 {"ScaleY",          ASS_FLT,   offsetof(ASSStyle, scaley)         },
+                 {"Spacing",         ASS_FLT,   offsetof(ASSStyle, spacing)        },
+                 {"Angle",           ASS_FLT,   offsetof(ASSStyle, angle)          },
+                 {"BorderStyle",     ASS_INT,   offsetof(ASSStyle, border_style)   },
+                 {"Outline",         ASS_FLT,   offsetof(ASSStyle, outline)        },
+                 {"Shadow",          ASS_FLT,   offsetof(ASSStyle, shadow)         },
+                 {"Alignment",       ASS_INT,   offsetof(ASSStyle, alignment)      },
+                 {"MarginL",         ASS_INT,   offsetof(ASSStyle, margin_l)       },
+                 {"MarginR",         ASS_INT,   offsetof(ASSStyle, margin_r)       },
+                 {"MarginV",         ASS_INT,   offsetof(ASSStyle, margin_v)       },
+                 {"Encoding",        ASS_INT,   offsetof(ASSStyle, encoding)       },
+                 {0},
+        }
+    },
+    { .section       = "V4 Styles",
+      .format_header = "Format",
+      .fields_header = "Style",
+      .size          = sizeof(ASSStyle),
+      .offset        = offsetof(ASS, styles),
+      .offset_count  = offsetof(ASS, styles_count),
+      .fields = {{"Name",            ASS_STR,   offsetof(ASSStyle, name)           },
+                 {"Fontname",        ASS_STR,   offsetof(ASSStyle, font_name)      },
+                 {"Fontsize",        ASS_INT,   offsetof(ASSStyle, font_size)      },
+                 {"PrimaryColour",   ASS_COLOR, offsetof(ASSStyle, primary_color)  },
+                 {"SecondaryColour", ASS_COLOR, offsetof(ASSStyle, secondary_color)},
+                 {"TertiaryColour",  ASS_COLOR, offsetof(ASSStyle, outline_color)  },
+                 {"BackColour",      ASS_COLOR, offsetof(ASSStyle, back_color)     },
+                 {"Bold",            ASS_INT,   offsetof(ASSStyle, bold)           },
+                 {"Italic",          ASS_INT,   offsetof(ASSStyle, italic)         },
+                 {"BorderStyle",     ASS_INT,   offsetof(ASSStyle, border_style)   },
+                 {"Outline",         ASS_FLT,   offsetof(ASSStyle, outline)        },
+                 {"Shadow",          ASS_FLT,   offsetof(ASSStyle, shadow)         },
+                 {"Alignment",       ASS_ALGN,  offsetof(ASSStyle, alignment)      },
+                 {"MarginL",         ASS_INT,   offsetof(ASSStyle, margin_l)       },
+                 {"MarginR",         ASS_INT,   offsetof(ASSStyle, margin_r)       },
+                 {"MarginV",         ASS_INT,   offsetof(ASSStyle, margin_v)       },
+                 {"AlphaLevel",      ASS_INT,   offsetof(ASSStyle, alpha_level)    },
+                 {"Encoding",        ASS_INT,   offsetof(ASSStyle, encoding)       },
+                 {0},
+        }
+    },
+    { .section       = "Events",
+      .format_header = "Format",
+      .fields_header = "Dialogue",
+      .size          = sizeof(ASSDialog),
+      .offset        = offsetof(ASS, dialogs),
+      .offset_count  = offsetof(ASS, dialogs_count),
+      .fields = {{"Layer",   ASS_INT,        offsetof(ASSDialog, layer)   },
+                 {"Start",   ASS_TIMESTAMP,  offsetof(ASSDialog, start)   },
+                 {"End",     ASS_TIMESTAMP,  offsetof(ASSDialog, end)     },
+                 {"Style",   ASS_STR,        offsetof(ASSDialog, style)   },
+                 {"Name",    ASS_STR,        offsetof(ASSDialog, name)    },
+                 {"MarginL", ASS_INT,        offsetof(ASSDialog, margin_l)},
+                 {"MarginR", ASS_INT,        offsetof(ASSDialog, margin_r)},
+                 {"MarginV", ASS_INT,        offsetof(ASSDialog, margin_v)},
+                 {"Effect",  ASS_STR,        offsetof(ASSDialog, effect)  },
+                 {"Text",    ASS_STR,        offsetof(ASSDialog, text)    },
+                 {0},
+        }
+    },
+};
+
+
+typedef int (*ASSConvertFunc)(void *dest, const char *buf, int len);
+
+/* ASS_STR: store a NUL-terminated copy of buf[0..len) in *(char **)dest. */
+static int convert_str(void *dest, const char *buf, int len)
+{
+    char **out = dest, *copy = av_malloc(len + 1);
+    if (!copy)
+        return 1;               /* non-zero: allocation failed, dest kept */
+    memcpy(copy, buf, len);
+    copy[len] = 0;
+    av_free(*out);              /* release the previous value, if any */
+    *out = copy;
+    return 0;
+}
+static int convert_int(void *dest, const char *buf, int len)
+{
+    return sscanf(buf, "%d", (int *)dest) == 1;
+}
+static int convert_flt(void *dest, const char *buf, int len)
+{
+    return sscanf(buf, "%f", (float *)dest) == 1;
+}
+static int convert_color(void *dest, const char *buf, int len)
+{
+    return sscanf(buf, "&H%8x", (int *)dest) == 1 ||
+           sscanf(buf, "%d",    (int *)dest) == 1;
+}
+static int convert_timestamp(void *dest, const char *buf, int len)
+{
+    int c, h, m, s, cs;
+    if ((c = sscanf(buf, "%d:%02d:%02d.%02d", &h, &m, &s, &cs)) == 4)
+        *(int *)dest = 360000*h + 6000*m + 100*s + cs;
+    return c == 4;
+}
+static int convert_alignment(void *dest, const char *buf, int len)
+{
+    int a;
+    if (sscanf(buf, "%d", &a) == 1) {
+        /* convert V4 Style alignment to V4+ Style */
+        *(int *)dest = a + ((a&4) >> 1) - 5*!!(a&8);
+        return 1;
+    }
+    return 0;
+}
+
+static const ASSConvertFunc convert_func[] = {
+    [ASS_STR]       = convert_str,
+    [ASS_INT]       = convert_int,
+    [ASS_FLT]       = convert_flt,
+    [ASS_COLOR]     = convert_color,
+    [ASS_TIMESTAMP] = convert_timestamp,
+    [ASS_ALGN]      = convert_alignment,
+};
+
+
+struct ASSSplitContext {
+    ASS ass;
+    int current_section;
+    int field_number[FF_ARRAY_ELEMS(ass_sections)];
+    int *field_order[FF_ARRAY_ELEMS(ass_sections)];
+};
+
+
+static uint8_t *realloc_section_array(ASSSplitContext *ctx)
+{
+    const ASSSection *section = &ass_sections[ctx->current_section];
+    int *count = (int *)((uint8_t *)&ctx->ass + section->offset_count);
+    void **section_ptr = (void **)((uint8_t *)&ctx->ass + section->offset);
+    uint8_t *tmp = av_realloc_array(*section_ptr, (*count+1), section->size);
+    if (!tmp)
+        return NULL;            /* array and count are left unchanged */
+    *section_ptr = tmp;
+    tmp += *count * section->size;  /* address of the newly added element */
+    memset(tmp, 0, section->size);  /* new element starts out zeroed */
+    (*count)++;
+    return tmp;
+}
+
+static inline int is_eol(char buf)
+{
+    return !buf || buf == '\n' || buf == '\r'; /* NUL also ends a line */
+}
+
+/* Advance past any run of leading ' ' characters. */
+static inline const char *skip_space(const char *buf)
+{
+    const size_t nb_spaces = strspn(buf, " ");
+    return buf + nb_spaces;
+}
+
+/* Identity field order for a section without a Format line; field count -> *number. */
+static int *get_default_field_orders(const ASSSection *section, int *number)
+{
+    int i, *order = av_malloc_array(FF_ARRAY_ELEMS(section->fields), sizeof(*order));
+    if (!order)
+        return NULL;
+    for (i = 0; section->fields[i].name; i++)
+        order[i] = i;
+    *number = i;
+    while (i < FF_ARRAY_ELEMS(section->fields))
+        order[i++] = -1; /* mark unused slots; without i++ this never ends */
+    return order;
+}
+
+/* Parse the current section's lines; returns where it stopped, NULL on OOM. */
+static const char *ass_split_section(ASSSplitContext *ctx, const char *buf)
+{
+    const ASSSection *section = &ass_sections[ctx->current_section];
+    int *number = &ctx->field_number[ctx->current_section];
+    int *order = ctx->field_order[ctx->current_section];
+    int *tmp, i, len;
+    while (buf && *buf) {
+        if (buf[0] == '[') {
+            ctx->current_section = -1;
+            break;
+        }
+        if (buf[0] == ';' || (buf[0] == '!' && buf[1] == ':')) {
+            /* skip comments */
+        } else if (section->format_header && !order) {
+            len = strlen(section->format_header);
+            if (strncmp(buf, section->format_header, len) || buf[len] != ':')
+                goto next_line;
+            buf += len + 1;
+            while (!is_eol(*buf)) {
+                buf = skip_space(buf);
+                len = strcspn(buf, ", \r\n");
+                if (!(tmp = av_realloc_array(order, (*number + 1), sizeof(*order))))
+                    return NULL;
+                order = tmp;
+                order[*number] = -1;
+                for (i=0; section->fields[i].name; i++)
+                    if (!strncmp(buf, section->fields[i].name, len)) {
+                        order[*number] = i;
+                        break;
+                    }
+                (*number)++;
+                buf = skip_space(buf + len + (buf[len] == ','));
+            }
+            ctx->field_order[ctx->current_section] = order;
+        } else if (section->fields_header) {
+            len = strlen(section->fields_header);
+            if (!strncmp(buf, section->fields_header, len) && buf[len] == ':') {
+                uint8_t *ptr, *struct_ptr = realloc_section_array(ctx);
+                if (!struct_ptr)  return NULL;
+
+                /* No format header line found so far, assume default */
+                if (!order) {
+                    order = get_default_field_orders(section, number);
+                    if (!order)
+                        return NULL;
+                    ctx->field_order[ctx->current_section] = order;
+                }
+
+                buf += len + 1;
+                for (i=0; !is_eol(*buf) && i < *number; i++) {
+                    int last = i == *number - 1;
+                    buf = skip_space(buf);
+                    len = strcspn(buf, last ? "\r\n" : ",\r\n");
+                    if (order[i] >= 0) {
+                        ASSFieldType type = section->fields[order[i]].type;
+                        ptr = struct_ptr + section->fields[order[i]].offset;
+                        convert_func[type](ptr, buf, len);
+                    }
+                    buf += len;
+                    if (!last && *buf) buf++;
+                    buf = skip_space(buf);
+                }
+            }
+        } else {
+            len = strcspn(buf, ":\r\n");
+            if (buf[len] == ':') {
+                for (i=0; section->fields[i].name; i++)
+                    if (!strncmp(buf, section->fields[i].name, len)) {
+                        ASSFieldType type = section->fields[i].type;
+                        uint8_t *ptr = (uint8_t *)&ctx->ass + section->offset;
+                        ptr += section->fields[i].offset;
+                        buf = skip_space(buf + len + 1);
+                        convert_func[type](ptr, buf, strcspn(buf, "\r\n"));
+                        break;
+                    }
+            }
+        }
+next_line:
+        buf += strcspn(buf, "\n");
+        buf += !!*buf;
+    }
+    return buf;
+}
+
+static int ass_split(ASSSplitContext *ctx, const char *buf)
+{
+    char c, section[16];
+    int i;
+
+    if (ctx->current_section >= 0)
+        buf = ass_split_section(ctx, buf);
+
+    while (buf && *buf) {
+        if (sscanf(buf, "[%15[0-9A-Za-z+ ]]%c", section, &c) == 2) {
+            buf += strcspn(buf, "\n");
+            buf += !!*buf;
+            for (i=0; i<FF_ARRAY_ELEMS(ass_sections); i++)
+                if (!strcmp(section, ass_sections[i].section)) {
+                    ctx->current_section = i;
+                    buf = ass_split_section(ctx, buf);
+                }
+        } else {
+            buf += strcspn(buf, "\n");
+            buf += !!*buf;
+        }
+    }
+    return buf ? 0 : AVERROR_INVALIDDATA;
+}
+
+ASSSplitContext *ff_ass_split(const char *buf)
+{
+    ASSSplitContext *split = av_mallocz(sizeof(*split));
+    if (split) {
+        split->current_section = -1; /* not inside any section yet */
+        if (ass_split(split, buf) < 0) {
+            ff_ass_split_free(split);
+            split = NULL;
+        }
+    }
+    return split;
+}
+
+static void free_section(ASSSplitContext *ctx, const ASSSection *section)
+{
+    uint8_t *ptr = (uint8_t *)&ctx->ass + section->offset;
+    int i, j, *count, c = 1;
+
+    if (section->format_header) {
+        ptr   = *(void **)ptr;
+        count = (int *)((uint8_t *)&ctx->ass + section->offset_count);
+    } else
+        count = &c;
+
+    if (ptr)
+        for (i=0; i<*count; i++, ptr += section->size)
+            for (j=0; section->fields[j].name; j++) {
+                const ASSFields *field = &section->fields[j];
+                if (field->type == ASS_STR)
+                    av_freep(ptr + field->offset);
+            }
+    *count = 0;
+
+    if (section->format_header)
+        av_freep((uint8_t *)&ctx->ass + section->offset);
+}
+
+ASSDialog *ff_ass_split_dialog(ASSSplitContext *ctx, const char *buf,
+                               int cache, int *number)
+{
+    ASSDialog *dialog = NULL;
+    int i, count;
+    if (!cache)
+        for (i=0; i<FF_ARRAY_ELEMS(ass_sections); i++)
+            if (!strcmp(ass_sections[i].section, "Events")) {
+                free_section(ctx, &ass_sections[i]);
+                break;
+            }
+    count = ctx->ass.dialogs_count;
+    if (ass_split(ctx, buf) == 0)
+        dialog = ctx->ass.dialogs + count;
+    if (number)
+        *number = ctx->ass.dialogs_count - count;
+    return dialog;
+}
+
+void ff_ass_free_dialog(ASSDialog **dialogp)
+{
+    ASSDialog *d = *dialogp;
+    if (d) {
+        av_freep(&d->text);
+        av_freep(&d->effect);
+        av_freep(&d->name);
+        av_freep(&d->style);
+        av_freep(dialogp);      /* frees the struct and NULLs *dialogp */
+    }
+}
+
+ASSDialog *ff_ass_split_dialog2(ASSSplitContext *ctx, const char *buf) /* ctx is not referenced in this body */
+{
+    int i;
+    static const ASSFields fields[] = { /* listed in the order they are read from buf */
+        {"ReadOrder", ASS_INT, offsetof(ASSDialog, readorder)},
+        {"Layer",     ASS_INT, offsetof(ASSDialog, layer)    },
+        {"Style",     ASS_STR, offsetof(ASSDialog, style)    },
+        {"Name",      ASS_STR, offsetof(ASSDialog, name)     },
+        {"MarginL",   ASS_INT, offsetof(ASSDialog, margin_l) },
+        {"MarginR",   ASS_INT, offsetof(ASSDialog, margin_r) },
+        {"MarginV",   ASS_INT, offsetof(ASSDialog, margin_v) },
+        {"Effect",    ASS_STR, offsetof(ASSDialog, effect)   },
+        {"Text",      ASS_STR, offsetof(ASSDialog, text)     },
+    };
+
+    ASSDialog *dialog = av_mallocz(sizeof(*dialog)); /* caller frees with ff_ass_free_dialog() */
+    if (!dialog)
+        return NULL;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(fields); i++) {
+        size_t len;
+        const int last = i == FF_ARRAY_ELEMS(fields) - 1; /* true only for "Text" */
+        const ASSFieldType type = fields[i].type;
+        uint8_t *ptr = (uint8_t *)dialog + fields[i].offset; /* address of the member to fill */
+        buf = skip_space(buf);
+        len = last ? strlen(buf) : strcspn(buf, ","); /* last field takes the rest of buf, commas included */
+        if (len >= INT_MAX) { /* NOTE(review): presumably convert_func takes an int length - confirm */
+            ff_ass_free_dialog(&dialog);
+            return NULL;
+        }
+        convert_func[type](ptr, buf, len); /* NOTE(review): result is not checked here; convert_func is defined outside this view */
+        buf += len;
+        if (*buf) buf++; /* step over the ',' unless the string has ended */
+    }
+    return dialog;
+}
+
+void ff_ass_split_free(ASSSplitContext *ctx)
+{
+    if (ctx) { /* a NULL ctx is accepted and ignored */
+        int i;
+        for (i=0; i<FF_ARRAY_ELEMS(ass_sections); i++) {
+            free_section(ctx, &ass_sections[i]); /* data stored for section i */
+            av_freep(&(ctx->field_order[i])); /* field_order entry belonging to section i */
+        }
+        av_free(ctx); /* finally the context itself */
+    }
+}
+
+
+int ff_ass_split_override_codes(const ASSCodesCallbacks *callbacks, void *priv,
+                                const char *buf)
+{
+    const char *text = NULL; /* start of the pending plain-text run, NULL when none is pending */
+    char new_line[2];
+    int text_len = 0; /* number of bytes in the pending run */
+
+    while (buf && *buf) {
+        if (text && callbacks->text && /* flush pending text before a line break or an override block */
+            (sscanf(buf, "\\%1[nN]", new_line) == 1 ||
+             !strncmp(buf, "{\\", 2))) {
+            callbacks->text(priv, text, text_len);
+            text = NULL;
+        }
+        if (sscanf(buf, "\\%1[nN]", new_line) == 1) { /* backslash followed by n or N */
+            if (callbacks->new_line)
+                callbacks->new_line(priv, new_line[0] == 'N'); /* second argument is 1 for 'N', 0 for 'n' */
+            buf += 2; /* skip the backslash and the letter */
+        } else if (!strncmp(buf, "{\\", 2)) { /* start of an override block */
+            buf++; /* skip '{' */
+            while (*buf == '\\') { /* one iteration per override code inside the braces */
+                char style[2], c[2], sep[2], c_num[2] = "0", tmp[128] = {0};
+                unsigned int color = 0xFFFFFFFF;
+                int len, size = -1, an = -1, alpha = -1;
+                int x1, y1, x2, y2, t1 = -1, t2 = -1;
+                if (sscanf(buf, "\\%1[bisu]%1[01\\}]%n", style, c, &len) > 1) {
+                    int close = c[0] == '0' ? 1 : c[0] == '1' ? 0 : -1; /* -1: c holds the separator, no digit was given */
+                    len += close != -1; /* c was a digit: advance one more so buf lands on the char after it */
+                    if (callbacks->style)
+                        callbacks->style(priv, style[0], close);
+                } else if (sscanf(buf, "\\c%1[\\}]%n", sep, &len) > 0 ||
+                           sscanf(buf, "\\c&H%X&%1[\\}]%n", &color, sep, &len) > 1 ||
+                           sscanf(buf, "\\%1[1234]c%1[\\}]%n", c_num, sep, &len) > 1 ||
+                           sscanf(buf, "\\%1[1234]c&H%X&%1[\\}]%n", c_num, &color, sep, &len) > 2) {
+                    if (callbacks->color)
+                        callbacks->color(priv, color, c_num[0] - '0'); /* id is 0 when no 1-4 prefix was parsed */
+                } else if (sscanf(buf, "\\alpha%1[\\}]%n", sep, &len) > 0 ||
+                           sscanf(buf, "\\alpha&H%2X&%1[\\}]%n", &alpha, sep, &len) > 1 ||
+                           sscanf(buf, "\\%1[1234]a%1[\\}]%n", c_num, sep, &len) > 1 ||
+                           sscanf(buf, "\\%1[1234]a&H%2X&%1[\\}]%n", c_num, &alpha, sep, &len) > 2) {
+                    if (callbacks->alpha)
+                        callbacks->alpha(priv, alpha, c_num[0] - '0'); /* alpha stays -1 when no value was parsed */
+                } else if (sscanf(buf, "\\fn%1[\\}]%n", sep, &len) > 0 ||
+                           sscanf(buf, "\\fn%127[^\\}]%1[\\}]%n", tmp, sep, &len) > 1) {
+                    if (callbacks->font_name)
+                        callbacks->font_name(priv, tmp[0] ? tmp : NULL); /* NULL when no name was parsed */
+                } else if (sscanf(buf, "\\fs%1[\\}]%n", sep, &len) > 0 ||
+                           sscanf(buf, "\\fs%u%1[\\}]%n", &size, sep, &len) > 1) {
+                    if (callbacks->font_size)
+                        callbacks->font_size(priv, size); /* size stays -1 when no number was parsed */
+                } else if (sscanf(buf, "\\a%1[\\}]%n", sep, &len) > 0 ||
+                           sscanf(buf, "\\a%2u%1[\\}]%n", &an, sep, &len) > 1 ||
+                           sscanf(buf, "\\an%1[\\}]%n", sep, &len) > 0 ||
+                           sscanf(buf, "\\an%1u%1[\\}]%n", &an, sep, &len) > 1) {
+                    if (an != -1 && buf[2] != 'n') /* value came from the "a" form, not "an" */
+                        an = (an&3) + (an&4 ? 6 : an&8 ? 3 : 0); /* remap it to the numbering used by the "an" form */
+                    if (callbacks->alignment)
+                        callbacks->alignment(priv, an);
+                } else if (sscanf(buf, "\\r%1[\\}]%n", sep, &len) > 0 ||
+                           sscanf(buf, "\\r%127[^\\}]%1[\\}]%n", tmp, sep, &len) > 1) {
+                    if (callbacks->cancel_overrides)
+                        callbacks->cancel_overrides(priv, tmp); /* tmp is "" when no style name was parsed */
+                } else if (sscanf(buf, "\\move(%d,%d,%d,%d)%1[\\}]%n", &x1, &y1, &x2, &y2, sep, &len) > 4 ||
+                           sscanf(buf, "\\move(%d,%d,%d,%d,%d,%d)%1[\\}]%n", &x1, &y1, &x2, &y2, &t1, &t2, sep, &len) > 6) {
+                    if (callbacks->move)
+                        callbacks->move(priv, x1, y1, x2, y2, t1, t2); /* t1/t2 stay -1 in the four-argument form */
+                } else if (sscanf(buf, "\\pos(%d,%d)%1[\\}]%n", &x1, &y1, sep, &len) > 2) {
+                    if (callbacks->move)
+                        callbacks->move(priv, x1, y1, x1, y1, -1, -1); /* reported as a move with identical start and end */
+                } else if (sscanf(buf, "\\org(%d,%d)%1[\\}]%n", &x1, &y1, sep, &len) > 2) {
+                    if (callbacks->origin)
+                        callbacks->origin(priv, x1, y1);
+                } else {
+                    len = strcspn(buf+1, "\\}") + 2;  /* skip unknown code */
+                }
+                buf += len - 1; /* leave buf on the character that ended this code */
+            }
+            if (*buf++ != '}') /* the override block must end with '}' */
+                return AVERROR_INVALIDDATA;
+        } else {
+            if (!text) { /* first byte of a new plain-text run */
+                text = buf;
+                text_len = 1;
+            } else
+                text_len++;
+            buf++;
+        }
+    }
+    if (text && callbacks->text) /* flush text still pending at end of input */
+        callbacks->text(priv, text, text_len);
+    if (callbacks->end)
+        callbacks->end(priv);
+    return 0;
+}
+
+ASSStyle *ff_ass_style_get(ASSSplitContext *ctx, const char *style)
+{
+    ASS *ass = &ctx->ass;
+    int i;
+
+    if (!style || !*style) /* NULL or empty name falls back to "Default" */
+        style = "Default";
+    for (i=0; i<ass->styles_count; i++)
+        if (ass->styles[i].name && !strcmp(ass->styles[i].name, style)) /* exact strcmp match; unnamed styles are skipped */
+            return ass->styles + i;
+    return NULL; /* no style with that name */
+}
diff --git a/libavcodec/ass_split.h b/libavcodec/ass_split.h
new file mode 100644
index 0000000..30ce772
--- /dev/null
+++ b/libavcodec/ass_split.h
@@ -0,0 +1,207 @@
+/*
+ * SSA/ASS splitting functions
+ * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ASS_SPLIT_H
+#define AVCODEC_ASS_SPLIT_H
+
+/**
+ * fields extracted from the [Script Info] section
+ */
+typedef struct {
+    char *script_type;    /**< SSA script format version (e.g. v4.00) */
+    char *collisions;     /**< how subtitles are moved to prevent collisions */
+    int   play_res_x;     /**< video width that ASS coords are referring to */
+    int   play_res_y;     /**< video height that ASS coords are referring to */
+    float timer;          /**< time multiplier to apply to SSA clock (in %) */
+} ASSScriptInfo;
+
+/**
+ * fields extracted from the [V4(+) Styles] section
+ */
+typedef struct {
+    char *name;           /**< name of the style (case sensitive) */
+    char *font_name;      /**< font face (case sensitive) */
+    int   font_size;      /**< font height */
+    int   primary_color;  /**< color that a subtitle will normally appear in */
+    int   secondary_color;
+    int   outline_color;  /**< color for outline in ASS, called tertiary in SSA */
+    int   back_color;     /**< color of the subtitle outline or shadow */
+    int   bold;           /**< whether text is bold (1) or not (0) */
+    int   italic;         /**< whether text is italic (1) or not (0) */
+    int   underline;      /**< whether text is underlined (1) or not (0) */
+    int   strikeout;
+    float scalex;
+    float scaley;
+    float spacing;
+    float angle;
+    int   border_style;
+    float outline;
+    float shadow;
+    int   alignment;      /**< position of the text (left, center, top...),
+                               defined after the layout of the numpad
+                               (1-3 sub, 4-6 mid, 7-9 top) */
+    int   margin_l;
+    int   margin_r;
+    int   margin_v;
+    int   alpha_level;
+    int   encoding;
+} ASSStyle;
+
+/**
+ * fields extracted from the [Events] section
+ */
+typedef struct {
+    int   readorder; /**< "ReadOrder" field as parsed by ff_ass_split_dialog2() */
+    int   layer;    /**< higher numbered layers are drawn over lower numbered */
+    int   start;    /**< start time of the dialog in centiseconds */
+    int   end;      /**< end time of the dialog in centiseconds */
+    char *style;    /**< name of the ASSStyle to use with this dialog */
+    char *name;     /**< "Name" field of the dialog line */
+    int   margin_l; /**< "MarginL" field of the dialog line */
+    int   margin_r; /**< "MarginR" field of the dialog line */
+    int   margin_v; /**< "MarginV" field of the dialog line */
+    char *effect;   /**< "Effect" field of the dialog line */
+    char *text;     /**< actual text which will be displayed as a subtitle,
+                         can include style override control codes (see
+                         ff_ass_split_override_codes()) */
+} ASSDialog;
+
+/**
+ * structure containing the whole split ASS data
+ */
+typedef struct {
+    ASSScriptInfo script_info;   /**< general information about the SSA script */
+    ASSStyle     *styles;        /**< array of split out styles */
+    int           styles_count;  /**< number of ASSStyle in the styles array */
+    ASSDialog    *dialogs;       /**< array of split out dialogs */
+    int           dialogs_count; /**< number of ASSDialog in the dialogs array */
+} ASS;
+
+/**
+ * This struct can be cast to ASS to access the split data.
+ */
+typedef struct ASSSplitContext ASSSplitContext;
+
+/**
+ * Split a full ASS file or an ASS header from a string buffer and store
+ * the split structure in a newly allocated context.
+ *
+ * @param buf String containing the ASS formatted data.
+ * @return Newly allocated struct containing split data.
+ */
+ASSSplitContext *ff_ass_split(const char *buf);
+
+/**
+ * Split one or several ASS "Dialogue" lines from a string buffer and store
+ * them in an already initialized context.
+ *
+ * @param ctx Context previously initialized by ff_ass_split().
+ * @param buf String containing the ASS "Dialogue" lines.
+ * @param cache Set to 1 to keep all the previously split ASSDialog in
+ *              the context, or set to 0 to free all the previously split
+ *              ASSDialog.
+ * @param number If not NULL, the pointed integer will be set to the number
+ *               of split ASSDialog.
+ * @return Pointer to the first split ASSDialog.
+ */
+ASSDialog *ff_ass_split_dialog(ASSSplitContext *ctx, const char *buf,
+                               int cache, int *number);
+
+/**
+ * Free a dialogue obtained from ff_ass_split_dialog2().
+ */
+void ff_ass_free_dialog(ASSDialog **dialogp);
+
+/**
+ * Split one ASS Dialogue line from a string buffer.
+ *
+ * @param ctx Context previously initialized by ff_ass_split().
+ * @param buf String containing the ASS "Dialogue" line.
+ * @return Pointer to the split ASSDialog. Must be freed with ff_ass_free_dialog()
+ */
+ASSDialog *ff_ass_split_dialog2(ASSSplitContext *ctx, const char *buf);
+
+/**
+ * Free all the memory allocated for an ASSSplitContext.
+ *
+ * @param ctx Context previously initialized by ff_ass_split().
+ */
+void ff_ass_split_free(ASSSplitContext *ctx);
+
+
+/**
+ * Set of callback functions corresponding to each override code that can
+ * be encountered in a "Dialogue" Text field.
+ */
+typedef struct {
+    /**
+     * @defgroup ass_styles    ASS styles
+     * @{
+     */
+    void (*text)(void *priv, const char *text, int len); /**< text is not NUL-terminated at len */
+    void (*new_line)(void *priv, int forced); /**< forced is 1 for the uppercase N code, else 0 */
+    void (*style)(void *priv, char style, int close); /**< style is b, i, s or u; close is 1 for a 0 suffix, 0 for a 1 suffix, -1 for none */
+    void (*color)(void *priv, unsigned int /* color */, unsigned int color_id);
+    void (*alpha)(void *priv, int alpha, int alpha_id); /**< alpha is -1 when the code carried no value */
+    void (*font_name)(void *priv, const char *name); /**< name is NULL when the code carried no name */
+    void (*font_size)(void *priv, int size); /**< size is -1 when the code carried no number */
+    void (*alignment)(void *priv, int alignment); /**< alignment is -1 when the code carried no number */
+    void (*cancel_overrides)(void *priv, const char *style); /**< style is "" when the code carried no name */
+    /** @} */
+
+    /**
+     * @defgroup ass_functions    ASS functions
+     * @{
+     */
+    void (*move)(void *priv, int x1, int y1, int x2, int y2, int t1, int t2); /**< also used for pos, with x2=x1, y2=y1, t1=t2=-1 */
+    void (*origin)(void *priv, int x, int y);
+    /** @} */
+
+    /**
+     * @defgroup ass_end    end of Dialogue Event
+     * @{
+     */
+    void (*end)(void *priv); /**< called once, after the whole buffer has been processed */
+    /** @} */
+} ASSCodesCallbacks;
+
+/**
+ * Split override codes out of an ASS "Dialogue" Text field.
+ *
+ * @param callbacks Set of callback functions called for each override code
+ *                  encountered.
+ * @param priv Opaque pointer passed to the callback functions.
+ * @param buf The ASS "Dialogue" Text field to split.
+ * @return >= 0 on success otherwise an error code <0
+ */
+int ff_ass_split_override_codes(const ASSCodesCallbacks *callbacks, void *priv,
+                                const char *buf);
+
+/**
+ * Find an ASSStyle structure by its name.
+ *
+ * @param ctx Context previously initialized by ff_ass_split().
+ * @param style name of the style to search for.
+ * @return the ASSStyle corresponding to style, or NULL if style can't be found
+ */
+ASSStyle *ff_ass_style_get(ASSSplitContext *ctx, const char *style);
+
+#endif /* AVCODEC_ASS_SPLIT_H */
diff --git a/libavcodec/assdec.c b/libavcodec/assdec.c
index 48fe32e..3178f29 100644
--- a/libavcodec/assdec.c
+++ b/libavcodec/assdec.c
@@ -2,20 +2,20 @@
  * SSA/ASS decoder
  * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,10 +28,11 @@
 
 static av_cold int ass_decode_init(AVCodecContext *avctx)
 {
-    avctx->subtitle_header = av_malloc(avctx->extradata_size);
+    avctx->subtitle_header = av_malloc(avctx->extradata_size + 1);
     if (!avctx->subtitle_header)
         return AVERROR(ENOMEM);
     memcpy(avctx->subtitle_header, avctx->extradata, avctx->extradata_size);
+    avctx->subtitle_header[avctx->extradata_size] = 0;
     avctx->subtitle_header_size = avctx->extradata_size;
     return 0;
 }
@@ -39,28 +40,44 @@ static av_cold int ass_decode_init(AVCodecContext *avctx)
 static int ass_decode_frame(AVCodecContext *avctx, void *data, int *got_sub_ptr,
                             AVPacket *avpkt)
 {
-    const char *ptr = avpkt->data;
-    int len, size = avpkt->size;
+    AVSubtitle *sub = data;
 
-    ff_ass_init(data);
+    if (avpkt->size <= 0)
+        return avpkt->size;
 
-    while (size > 0) {
-        len = ff_ass_add_rect(data, ptr, 0, 0/* FIXME: duration */, 1);
-        if (len < 0)
-            return len;
-        ptr  += len;
-        size -= len;
-    }
-
-    *got_sub_ptr = avpkt->size > 0;
+    sub->rects = av_malloc(sizeof(*sub->rects));
+    if (!sub->rects)
+        return AVERROR(ENOMEM);
+    sub->rects[0] = av_mallocz(sizeof(*sub->rects[0]));
+    if (!sub->rects[0])
+        return AVERROR(ENOMEM);
+    sub->num_rects = 1;
+    sub->rects[0]->type = SUBTITLE_ASS;
+    sub->rects[0]->ass  = av_strdup(avpkt->data);
+    if (!sub->rects[0]->ass)
+        return AVERROR(ENOMEM);
+    *got_sub_ptr = 1;
     return avpkt->size;
 }
 
+#if CONFIG_SSA_DECODER
+AVCodec ff_ssa_decoder = {
+    .name         = "ssa",
+    .long_name    = NULL_IF_CONFIG_SMALL("ASS (Advanced SubStation Alpha) subtitle"),
+    .type         = AVMEDIA_TYPE_SUBTITLE,
+    .id           = AV_CODEC_ID_ASS,
+    .init         = ass_decode_init,
+    .decode       = ass_decode_frame,
+};
+#endif
+
+#if CONFIG_ASS_DECODER
 AVCodec ff_ass_decoder = {
     .name         = "ass",
-    .long_name    = NULL_IF_CONFIG_SMALL("SSA (SubStation Alpha) subtitle"),
+    .long_name    = NULL_IF_CONFIG_SMALL("ASS (Advanced SubStation Alpha) subtitle"),
     .type         = AVMEDIA_TYPE_SUBTITLE,
-    .id           = AV_CODEC_ID_SSA,
+    .id           = AV_CODEC_ID_ASS,
     .init         = ass_decode_init,
     .decode       = ass_decode_frame,
 };
+#endif
diff --git a/libavcodec/assenc.c b/libavcodec/assenc.c
index caf266e..dc4f0ff 100644
--- a/libavcodec/assenc.c
+++ b/libavcodec/assenc.c
@@ -2,37 +2,43 @@
  * SSA/ASS encoder
  * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <string.h>
 
 #include "avcodec.h"
+#include "ass.h"
 #include "libavutil/avstring.h"
 #include "libavutil/internal.h"
 #include "libavutil/mem.h"
 
+typedef struct {
+    int id; ///< current event id, ReadOrder field
+} ASSEncodeContext;
+
 static av_cold int ass_encode_init(AVCodecContext *avctx)
 {
-    avctx->extradata = av_malloc(avctx->subtitle_header_size);
+    avctx->extradata = av_malloc(avctx->subtitle_header_size + 1);
     if (!avctx->extradata)
         return AVERROR(ENOMEM);
     memcpy(avctx->extradata, avctx->subtitle_header, avctx->subtitle_header_size);
     avctx->extradata_size = avctx->subtitle_header_size;
+    avctx->extradata[avctx->extradata_size] = 0;
     return 0;
 }
 
@@ -40,15 +46,50 @@ static int ass_encode_frame(AVCodecContext *avctx,
                             unsigned char *buf, int bufsize,
                             const AVSubtitle *sub)
 {
+    ASSEncodeContext *s = avctx->priv_data;
     int i, len, total_len = 0;
 
     for (i=0; i<sub->num_rects; i++) {
+        char ass_line[2048];
+        const char *ass = sub->rects[i]->ass;
+        long int layer;
+        char *p;
+
         if (sub->rects[i]->type != SUBTITLE_ASS) {
             av_log(avctx, AV_LOG_ERROR, "Only SUBTITLE_ASS type supported.\n");
             return -1;
         }
 
-        len = av_strlcpy(buf+total_len, sub->rects[i]->ass, bufsize-total_len);
+#if FF_API_ASS_TIMING
+        if (!strncmp(ass, "Dialogue: ", 10)) {
+            if (i > 0) {
+                av_log(avctx, AV_LOG_ERROR, "ASS encoder supports only one "
+                       "ASS rectangle field.\n");
+                return AVERROR_INVALIDDATA;
+            }
+
+            ass += 10; // skip "Dialogue: "
+            /* parse Layer field. If it's a Marked field, the content
+             * will be "Marked=N" instead of the layer num, so we will
+             * have layer=0, which is fine. */
+            layer = strtol(ass, &p, 10);
+
+#define SKIP_ENTRY(ptr) do {        \
+    char *sep = strchr(ptr, ',');   \
+    if (sep)                        \
+        ptr = sep + 1;              \
+} while (0)
+
+            SKIP_ENTRY(p); // skip layer or marked
+            SKIP_ENTRY(p); // skip start timestamp
+            SKIP_ENTRY(p); // skip end timestamp
+            snprintf(ass_line, sizeof(ass_line), "%d,%ld,%s", ++s->id, layer, p);
+            ass_line[strcspn(ass_line, "\r\n")] = 0;
+            ass = ass_line;
+        }
+#endif
+
+        len = av_strlcpy(buf+total_len, ass, bufsize-total_len);
 
         if (len > bufsize-total_len-1) {
             av_log(avctx, AV_LOG_ERROR, "Buffer too small for ASS event.\n");
@@ -61,11 +102,26 @@ static int ass_encode_frame(AVCodecContext *avctx,
     return total_len;
 }
 
+#if CONFIG_SSA_ENCODER
+AVCodec ff_ssa_encoder = {
+    .name         = "ssa",
+    .long_name    = NULL_IF_CONFIG_SMALL("ASS (Advanced SubStation Alpha) subtitle"),
+    .type         = AVMEDIA_TYPE_SUBTITLE,
+    .id           = AV_CODEC_ID_ASS,
+    .init         = ass_encode_init,
+    .encode_sub   = ass_encode_frame,
+    .priv_data_size = sizeof(ASSEncodeContext),
+};
+#endif
+
+#if CONFIG_ASS_ENCODER
 AVCodec ff_ass_encoder = {
     .name         = "ass",
-    .long_name    = NULL_IF_CONFIG_SMALL("SSA (SubStation Alpha) subtitle"),
+    .long_name    = NULL_IF_CONFIG_SMALL("ASS (Advanced SubStation Alpha) subtitle"),
     .type         = AVMEDIA_TYPE_SUBTITLE,
-    .id           = AV_CODEC_ID_SSA,
+    .id           = AV_CODEC_ID_ASS,
     .init         = ass_encode_init,
     .encode_sub   = ass_encode_frame,
+    .priv_data_size = sizeof(ASSEncodeContext),
 };
+#endif
diff --git a/libavcodec/asv.c b/libavcodec/asv.c
index b9e93f7..14fdf73 100644
--- a/libavcodec/asv.c
+++ b/libavcodec/asv.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/asv.h b/libavcodec/asv.h
index 18f7a95..e2cdc81 100644
--- a/libavcodec/asv.h
+++ b/libavcodec/asv.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/asvdec.c b/libavcodec/asvdec.c
index f17f064..036d56e 100644
--- a/libavcodec/asvdec.c
+++ b/libavcodec/asvdec.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,7 +29,6 @@
 #include "asv.h"
 #include "avcodec.h"
 #include "blockdsp.h"
-#include "put_bits.h"
 #include "idctdsp.h"
 #include "internal.h"
 #include "mathops.h"
@@ -211,10 +210,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     AVFrame *const p = data;
     int mb_x, mb_y, ret;
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
     p->pict_type = AV_PICTURE_TYPE_I;
     p->key_frame = 1;
 
@@ -277,8 +274,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
     int i;
 
     if (avctx->extradata_size < 1) {
-        av_log(avctx, AV_LOG_ERROR, "No extradata provided\n");
-        return AVERROR_INVALIDDATA;
+        av_log(avctx, AV_LOG_WARNING, "No extradata provided\n");
     }
 
     ff_asv_common_init(avctx);
@@ -288,8 +284,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
     ff_init_scantable(a->idsp.idct_permutation, &a->scantable, ff_asv_scantab);
     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
 
-    a->inv_qscale = avctx->extradata[0];
-    if (a->inv_qscale == 0) {
+    if (avctx->extradata_size < 1 || (a->inv_qscale = avctx->extradata[0]) == 0) {
         av_log(avctx, AV_LOG_ERROR, "illegal qscale 0\n");
         if (avctx->codec_id == AV_CODEC_ID_ASV1)
             a->inv_qscale = 6;
@@ -317,6 +312,7 @@ static av_cold int decode_end(AVCodecContext *avctx)
     return 0;
 }
 
+#if CONFIG_ASV1_DECODER
 AVCodec ff_asv1_decoder = {
     .name           = "asv1",
     .long_name      = NULL_IF_CONFIG_SMALL("ASUS V1"),
@@ -328,7 +324,9 @@ AVCodec ff_asv1_decoder = {
     .decode         = decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
+#endif
 
+#if CONFIG_ASV2_DECODER
 AVCodec ff_asv2_decoder = {
     .name           = "asv2",
     .long_name      = NULL_IF_CONFIG_SMALL("ASUS V2"),
@@ -340,3 +338,4 @@ AVCodec ff_asv2_decoder = {
     .decode         = decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
+#endif
diff --git a/libavcodec/asvenc.c b/libavcodec/asvenc.c
index ac7c317..ec98a0c 100644
--- a/libavcodec/asvenc.c
+++ b/libavcodec/asvenc.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,8 +26,10 @@
 #include "libavutil/attributes.h"
 #include "libavutil/mem.h"
 
+#include "aandcttab.h"
 #include "asv.h"
 #include "avcodec.h"
+#include "dct.h"
 #include "fdctdsp.h"
 #include "internal.h"
 #include "mathops.h"
@@ -50,7 +52,7 @@ static inline void asv1_put_level(PutBitContext *pb, int level)
     }
 }
 
-static inline void asv2_put_level(PutBitContext *pb, int level)
+static inline void asv2_put_level(ASV1Context *a, PutBitContext *pb, int level)
 {
     unsigned int index = level + 31;
 
@@ -58,6 +60,10 @@ static inline void asv2_put_level(PutBitContext *pb, int level)
         put_bits(pb, ff_asv2_level_tab[index][1], ff_asv2_level_tab[index][0]);
     } else {
         put_bits(pb, ff_asv2_level_tab[31][1], ff_asv2_level_tab[31][0]);
+        if (level < -128 || level > 127) {
+            av_log(a->avctx, AV_LOG_WARNING, "Cliping level %d, increase qscale\n", level);
+            level = av_clip_int8(level);
+        }
         asv2_put_bits(pb, 8, level & 0xFF);
     }
 }
@@ -108,7 +114,7 @@ static inline void asv1_encode_block(ASV1Context *a, int16_t block[64])
     put_bits(&a->pb, ff_asv_ccp_tab[16][1], ff_asv_ccp_tab[16][0]);
 }
 
-static inline int asv2_encode_block(ASV1Context *a, int16_t block[64])
+static inline void asv2_encode_block(ASV1Context *a, int16_t block[64])
 {
     int i;
     int count = 0;
@@ -142,8 +148,7 @@ static inline int asv2_encode_block(ASV1Context *a, int16_t block[64])
                                  a->q_intra_matrix[index + 9] + (1 << 15)) >> 16))
             ccp |= 1;
 
-        if (!i && ccp >= 8)
-            return AVERROR_BUG;
+        av_assert2(i || ccp < 8);
         if (i)
             put_bits(&a->pb, ff_asv_ac_ccp_tab[ccp][1], ff_asv_ac_ccp_tab[ccp][0]);
         else
@@ -151,24 +156,22 @@ static inline int asv2_encode_block(ASV1Context *a, int16_t block[64])
 
         if (ccp) {
             if (ccp & 8)
-                asv2_put_level(&a->pb, block[index + 0]);
+                asv2_put_level(a, &a->pb, block[index + 0]);
             if (ccp & 4)
-                asv2_put_level(&a->pb, block[index + 8]);
+                asv2_put_level(a, &a->pb, block[index + 8]);
             if (ccp & 2)
-                asv2_put_level(&a->pb, block[index + 1]);
+                asv2_put_level(a, &a->pb, block[index + 1]);
             if (ccp & 1)
-                asv2_put_level(&a->pb, block[index + 9]);
+                asv2_put_level(a, &a->pb, block[index + 9]);
         }
     }
-
-    return 0;
 }
 
 #define MAX_MB_SIZE (30 * 16 * 16 * 3 / 2 / 8)
 
 static inline int encode_mb(ASV1Context *a, int16_t block[6][64])
 {
-    int i, ret;
+    int i;
 
     if (a->pb.buf_end - a->pb.buf - (put_bits_count(&a->pb) >> 3) < MAX_MB_SIZE) {
         av_log(a->avctx, AV_LOG_ERROR, "encoded frame too large\n");
@@ -180,9 +183,7 @@ static inline int encode_mb(ASV1Context *a, int16_t block[6][64])
             asv1_encode_block(a, block[i]);
     } else {
         for (i = 0; i < 6; i++) {
-            ret = asv2_encode_block(a, block[i]);
-            if (ret < 0)
-                return ret;
+            asv2_encode_block(a, block[i]);
         }
     }
     return 0;
@@ -221,13 +222,52 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     int size, ret;
     int mb_x, mb_y;
 
-    if (!pkt->data &&
-        (ret = av_new_packet(pkt, a->mb_height * a->mb_width * MAX_MB_SIZE +
-                             AV_INPUT_BUFFER_MIN_SIZE)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if (pict->width % 16 || pict->height % 16) {
+        AVFrame *clone = av_frame_alloc();
+        int i;
+
+        if (!clone)
+            return AVERROR(ENOMEM);
+        clone->format = pict->format;
+        clone->width  = FFALIGN(pict->width, 16);
+        clone->height = FFALIGN(pict->height, 16);
+        ret = av_frame_get_buffer(clone, 32);
+        if (ret < 0) {
+            av_frame_free(&clone);
+            return ret;
+        }
+
+        ret = av_frame_copy(clone, pict);
+        if (ret < 0) {
+            av_frame_free(&clone);
+            return ret;
+        }
+
+        for (i = 0; i<3; i++) {
+            int x, y;
+            int w  = AV_CEIL_RSHIFT(pict->width, !!i);
+            int h  = AV_CEIL_RSHIFT(pict->height, !!i);
+            int w2 = AV_CEIL_RSHIFT(clone->width, !!i);
+            int h2 = AV_CEIL_RSHIFT(clone->height, !!i);
+            for (y=0; y<h; y++)
+                for (x=w; x<w2; x++)
+                    clone->data[i][x + y*clone->linesize[i]] =
+                        clone->data[i][w - 1 + y*clone->linesize[i]];
+            for (y=h; y<h2; y++)
+                for (x=0; x<w2; x++)
+                    clone->data[i][x + y*clone->linesize[i]] =
+                        clone->data[i][x + (h-1)*clone->linesize[i]];
+        }
+        ret = encode_frame(avctx, pkt, clone, got_packet);
+
+        av_frame_free(&clone);
         return ret;
     }
 
+    if ((ret = ff_alloc_packet2(avctx, pkt, a->mb_height * a->mb_width * MAX_MB_SIZE +
+                                AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
+        return ret;
+
     init_put_bits(&a->pb, pkt->data, pkt->size);
 
     for (mb_y = 0; mb_y < a->mb_height2; mb_y++) {
@@ -282,18 +322,11 @@ static av_cold int encode_init(AVCodecContext *avctx)
     int i;
     const int scale = avctx->codec_id == AV_CODEC_ID_ASV1 ? 1 : 2;
 
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-    avctx->coded_frame->key_frame = 1;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
-
     ff_asv_common_init(avctx);
     ff_fdctdsp_init(&a->fdsp, avctx);
     ff_pixblockdsp_init(&a->pdsp, avctx);
 
-    if (avctx->global_quality == 0)
+    if (avctx->global_quality <= 0)
         avctx->global_quality = 4 * FF_QUALITY_SCALE;
 
     a->inv_qscale = (32 * scale * FF_QUALITY_SCALE +
@@ -307,8 +340,13 @@ FF_ENABLE_DEPRECATION_WARNINGS
     ((uint32_t *) avctx->extradata)[1] = av_le2ne32(AV_RL32("ASUS"));
 
     for (i = 0; i < 64; i++) {
-        int q = 32 * scale * ff_mpeg1_default_intra_matrix[i];
-        a->q_intra_matrix[i] = ((a->inv_qscale << 16) + q / 2) / q;
+        if (a->fdsp.fdct == ff_fdct_ifast) {
+            int q = 32LL * scale * ff_mpeg1_default_intra_matrix[i] * ff_aanscales[i];
+            a->q_intra_matrix[i] = (((int64_t)a->inv_qscale << 30) + q / 2) / q;
+        } else {
+            int q = 32 * scale * ff_mpeg1_default_intra_matrix[i];
+            a->q_intra_matrix[i] = ((a->inv_qscale << 16) + q / 2) / q;
+        }
     }
 
     return 0;
diff --git a/libavcodec/atrac.c b/libavcodec/atrac.c
index f36db9e..12e8997 100644
--- a/libavcodec/atrac.c
+++ b/libavcodec/atrac.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2013 Maxim Poliakovski
  * Copyright (c) 2006-2008 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -124,7 +124,8 @@ void ff_atrac_gain_compensation(AtracGCContext *gctx, float *in, float *prev,
     memcpy(prev, &in[num_samples], num_samples * sizeof(float));
 }
 
-void ff_atrac_iqmf (float *inlo, float *inhi, unsigned int nIn, float *pOut, float *delayBuf, float *temp)
+void ff_atrac_iqmf(float *inlo, float *inhi, unsigned int nIn, float *pOut,
+                   float *delayBuf, float *temp)
 {
     int   i, j;
     float   *p1, *p3;
diff --git a/libavcodec/atrac.h b/libavcodec/atrac.h
index 8909323..05208bb 100644
--- a/libavcodec/atrac.h
+++ b/libavcodec/atrac.h
@@ -4,20 +4,20 @@
  * Copyright (c) 2009-2013 Maxim Poliakovski
  * Copyright (c) 2009 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -91,6 +91,7 @@ void ff_atrac_gain_compensation(AtracGCContext *gctx, float *in, float *prev,
  * @param delayBuf  delayBuf buffer
  * @param temp      temp buffer
  */
-void ff_atrac_iqmf (float *inlo, float *inhi, unsigned int nIn, float *pOut, float *delayBuf, float *temp);
+void ff_atrac_iqmf(float *inlo, float *inhi, unsigned int nIn, float *pOut,
+                   float *delayBuf, float *temp);
 
 #endif /* AVCODEC_ATRAC_H */
diff --git a/libavcodec/atrac1.c b/libavcodec/atrac1.c
index e938976..a8c8c91 100644
--- a/libavcodec/atrac1.c
+++ b/libavcodec/atrac1.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2009 Maxim Poliakovski
  * Copyright (c) 2009 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -65,7 +65,7 @@ typedef struct AT1SUCtx {
     DECLARE_ALIGNED(32, float, spec2)[AT1_SU_SAMPLES];     ///< mdct buffer
     DECLARE_ALIGNED(32, float, fst_qmf_delay)[46];         ///< delay line for the 1st stacked QMF filter
     DECLARE_ALIGNED(32, float, snd_qmf_delay)[46];         ///< delay line for the 2nd stacked QMF filter
-    DECLARE_ALIGNED(32, float, last_qmf_delay)[256+23];    ///< delay line for the last stacked QMF filter
+    DECLARE_ALIGNED(32, float, last_qmf_delay)[256+39];    ///< delay line for the last stacked QMF filter
 } AT1SUCtx;
 
 /**
@@ -80,7 +80,7 @@ typedef struct AT1Ctx {
     DECLARE_ALIGNED(32, float, high)[512];
     float*              bands[3];
     FFTContext          mdct_ctx[3];
-    AVFloatDSPContext   fdsp;
+    AVFloatDSPContext   *fdsp;
 } AT1Ctx;
 
 /** size of the transform in samples in the long mode for each QMF band */
@@ -140,7 +140,7 @@ static int at1_imdct_block(AT1SUCtx* su, AT1Ctx *q)
             at1_imdct(q, &q->spec[pos], &su->spectrum[0][ref_pos + start_pos], nbits, band_num);
 
             /* overlap and window */
-            q->fdsp.vector_fmul_window(&q->bands[band_num][start_pos], prev_buf,
+            q->fdsp->vector_fmul_window(&q->bands[band_num][start_pos], prev_buf,
                                        &su->spectrum[0][ref_pos + start_pos], ff_sine_32, 16);
 
             prev_buf = &su->spectrum[0][ref_pos+start_pos + 16];
@@ -242,7 +242,7 @@ static int at1_unpack_dequant(GetBitContext* gb, AT1SUCtx* su,
                      */
                     spec[pos+i] = get_sbits(gb, word_len) * scale_factor * max_quant;
                 }
-            } else { /* word_len = 0 -> empty BFU, zero all specs in the emty BFU */
+            } else { /* word_len = 0 -> empty BFU, zero all specs in the empty BFU */
                 memset(&spec[pos], 0, num_specs * sizeof(float));
             }
         }
@@ -260,9 +260,9 @@ static void at1_subband_synthesis(AT1Ctx *q, AT1SUCtx* su, float *pOut)
     /* combine low and middle bands */
     ff_atrac_iqmf(q->bands[0], q->bands[1], 128, temp, su->fst_qmf_delay, iqmf_temp);
 
-    /* delay the signal of the high band by 23 samples */
-    memcpy( su->last_qmf_delay,    &su->last_qmf_delay[256], sizeof(float) *  23);
-    memcpy(&su->last_qmf_delay[23], q->bands[2],             sizeof(float) * 256);
+    /* delay the signal of the high band by 39 samples */
+    memcpy( su->last_qmf_delay,    &su->last_qmf_delay[256], sizeof(float) *  39);
+    memcpy(&su->last_qmf_delay[39], q->bands[2],             sizeof(float) * 256);
 
     /* combine (low + middle) and high bands */
     ff_atrac_iqmf(temp, su->last_qmf_delay, 256, pOut, su->snd_qmf_delay, iqmf_temp);
@@ -287,10 +287,8 @@ static int atrac1_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = AT1_SU_SAMPLES;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     for (ch = 0; ch < avctx->channels; ch++) {
         AT1SUCtx* su = &q->SUs[ch];
@@ -326,6 +324,8 @@ static av_cold int atrac1_decode_end(AVCodecContext * avctx)
     ff_mdct_end(&q->mdct_ctx[1]);
     ff_mdct_end(&q->mdct_ctx[2]);
 
+    av_freep(&q->fdsp);
+
     return 0;
 }
 
@@ -343,6 +343,11 @@ static av_cold int atrac1_decode_init(AVCodecContext *avctx)
         return AVERROR(EINVAL);
     }
 
+    if (avctx->block_align <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "Unsupported block align.\n");
+        return AVERROR_PATCHWELCOME;
+    }
+
     /* Init the mdct transforms */
     if ((ret = ff_mdct_init(&q->mdct_ctx[0], 6, 1, -1.0/ (1 << 15))) ||
         (ret = ff_mdct_init(&q->mdct_ctx[1], 8, 1, -1.0/ (1 << 15))) ||
@@ -356,7 +361,13 @@ static av_cold int atrac1_decode_init(AVCodecContext *avctx)
 
     ff_atrac_generate_tables();
 
-    avpriv_float_dsp_init(&q->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    /* fdsp is now heap-allocated and dereferenced by the decoder, so a
+     * failed allocation must abort init; atrac1_decode_end() releases it. */
+    q->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!q->fdsp) {
+        atrac1_decode_end(avctx);
+        return AVERROR(ENOMEM);
+    }
 
     q->bands[0] = q->low;
     q->bands[1] = q->mid;
diff --git a/libavcodec/atrac1data.h b/libavcodec/atrac1data.h
index 539867b..62c218b 100644
--- a/libavcodec/atrac1data.h
+++ b/libavcodec/atrac1data.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2009 Maxim Poliakovski
  * Copyright (c) 2009 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/atrac3.c b/libavcodec/atrac3.c
index 2e1fd3c..256990b 100644
--- a/libavcodec/atrac3.c
+++ b/libavcodec/atrac3.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2006-2008 Maxim Poliakovski
  * Copyright (c) 2006-2008 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,6 +38,7 @@
 
 #include "libavutil/attributes.h"
 #include "libavutil/float_dsp.h"
+#include "libavutil/libm.h"
 #include "avcodec.h"
 #include "bytestream.h"
 #include "fft.h"
@@ -104,9 +105,9 @@ typedef struct ATRAC3Context {
     int scrambled_stream;
     //@}
 
-    AtracGCContext  gainc_ctx;
-    FFTContext mdct_ctx;
-    AVFloatDSPContext fdsp;
+    AtracGCContext    gainc_ctx;
+    FFTContext        mdct_ctx;
+    AVFloatDSPContext *fdsp;
 } ATRAC3Context;
 
 static DECLARE_ALIGNED(32, float, mdct_window)[MDCT_SIZE];
@@ -139,7 +140,7 @@ static void imlt(ATRAC3Context *q, float *input, float *output, int odd_band)
     q->mdct_ctx.imdct_calc(&q->mdct_ctx, output, input);
 
     /* Perform windowing on the output. */
-    q->fdsp.vector_fmul(output, output, mdct_window, MDCT_SIZE);
+    q->fdsp->vector_fmul(output, output, mdct_window, MDCT_SIZE);
 }
 
 /*
@@ -187,8 +188,9 @@ static av_cold int atrac3_decode_close(AVCodecContext *avctx)
 {
     ATRAC3Context *q = avctx->priv_data;
 
-    av_free(q->units);
-    av_free(q->decoded_bytes_buffer);
+    av_freep(&q->units);
+    av_freep(&q->decoded_bytes_buffer);
+    av_freep(&q->fdsp);
 
     ff_mdct_end(&q->mdct_ctx);
 
@@ -407,17 +409,17 @@ static int decode_tonal_components(GetBitContext *gb,
 static int decode_gain_control(GetBitContext *gb, GainBlock *block,
                                int num_bands)
 {
-    int i, j;
+    int b, j;
     int *level, *loc;
 
     AtracGainInfo *gain = block->g_block;
 
-    for (i = 0; i <= num_bands; i++) {
-        gain[i].num_points    = get_bits(gb, 3);
-        level                 = gain[i].lev_code;
-        loc                   = gain[i].loc_code;
+    for (b = 0; b <= num_bands; b++) {
+        gain[b].num_points = get_bits(gb, 3);
+        level              = gain[b].lev_code;
+        loc                = gain[b].loc_code;
 
-        for (j = 0; j < gain[i].num_points; j++) {
+        for (j = 0; j < gain[b].num_points; j++) {
             level[j] = get_bits(gb, 4);
             loc[j]   = get_bits(gb, 5);
             if (j && loc[j] <= loc[j - 1])
@@ -426,8 +428,8 @@ static int decode_gain_control(GetBitContext *gb, GainBlock *block,
     }
 
     /* Clear the unused blocks. */
-    for (; i < 4 ; i++)
-        gain[i].num_points = 0;
+    for (; b < 4 ; b++)
+        gain[b].num_points = 0;
 
     return 0;
 }
@@ -518,7 +520,7 @@ static void reverse_matrixing(float *su1, float *su2, int *prev_code,
             }
             break;
         default:
-            assert(0);
+            av_assert1(0);
         }
     }
 }
@@ -673,7 +675,7 @@ static int decode_frame(AVCodecContext *avctx, const uint8_t *databuf,
 
 
         /* set the bitstream reader at the start of the second Sound Unit*/
-        init_get_bits(&q->gb, ptr1, (avctx->block_align - i) * 8);
+        init_get_bits8(&q->gb, ptr1, q->decoded_bytes_buffer + avctx->block_align - ptr1);
 
         /* Fill the Weighting coeffs delay buffer */
         memmove(q->weighting_delay, &q->weighting_delay[2],
@@ -747,10 +749,8 @@ static int atrac3_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = SAMPLES_PER_FRAME;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     /* Check if we need to descramble and what buffer to pass on. */
     if (q->scrambled_stream) {
@@ -762,7 +762,7 @@ static int atrac3_decode_frame(AVCodecContext *avctx, void *data,
 
     ret = decode_frame(avctx, databuf, (float **)frame->extended_data);
     if (ret) {
-        av_log(NULL, AV_LOG_ERROR, "Frame decoding error!\n");
+        av_log(avctx, AV_LOG_ERROR, "Frame decoding error!\n");
         return ret;
     }
 
@@ -771,7 +771,7 @@ static int atrac3_decode_frame(AVCodecContext *avctx, void *data,
     return avctx->block_align;
 }
 
-static av_cold void atrac3_init_static_data(AVCodec *codec)
+static av_cold void atrac3_init_static_data(void)
 {
     int i;
 
@@ -791,6 +791,7 @@ static av_cold void atrac3_init_static_data(AVCodec *codec)
 
 static av_cold int atrac3_decode_init(AVCodecContext *avctx)
 {
+    static int static_init_done;
     int i, ret;
     int version, delay, samples_per_frame, frame_factor;
     const uint8_t *edata_ptr = avctx->extradata;
@@ -801,6 +802,10 @@ static av_cold int atrac3_decode_init(AVCodecContext *avctx)
         return AVERROR(EINVAL);
     }
 
+    if (!static_init_done)
+        atrac3_init_static_data();
+    static_init_done = 1;
+
     /* Take care of the codec-specific extradata. */
     if (avctx->extradata_size == 14) {
         /* Parse the extradata, WAV format */
@@ -829,7 +834,7 @@ static av_cold int atrac3_decode_init(AVCodecContext *avctx)
                    avctx->channels, frame_factor);
             return AVERROR_INVALIDDATA;
         }
-    } else if (avctx->extradata_size == 10) {
+    } else if (avctx->extradata_size == 12 || avctx->extradata_size == 10) {
         /* Parse the extradata, RM format. */
         version                = bytestream_get_be32(&edata_ptr);
         samples_per_frame      = bytestream_get_be16(&edata_ptr);
@@ -838,7 +843,7 @@ static av_cold int atrac3_decode_init(AVCodecContext *avctx)
         q->scrambled_stream    = 1;
 
     } else {
-        av_log(NULL, AV_LOG_ERROR, "Unknown extradata size %d.\n",
+        av_log(avctx, AV_LOG_ERROR, "Unknown extradata size %d.\n",
                avctx->extradata_size);
         return AVERROR(EINVAL);
     }
@@ -866,8 +871,10 @@ static av_cold int atrac3_decode_init(AVCodecContext *avctx)
     if (q->coding_mode == STEREO)
         av_log(avctx, AV_LOG_DEBUG, "Normal stereo detected.\n");
     else if (q->coding_mode == JOINT_STEREO) {
-        if (avctx->channels != 2)
+        if (avctx->channels != 2) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid coding mode\n");
             return AVERROR_INVALIDDATA;
+        }
         av_log(avctx, AV_LOG_DEBUG, "Joint stereo detected.\n");
     } else {
         av_log(avctx, AV_LOG_ERROR, "Unknown channel coding mode %x!\n",
@@ -907,10 +914,10 @@ static av_cold int atrac3_decode_init(AVCodecContext *avctx)
     }
 
     ff_atrac_init_gain_compensation(&q->gainc_ctx, 4, 3);
-    avpriv_float_dsp_init(&q->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    q->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
 
-    q->units = av_mallocz(sizeof(*q->units) * avctx->channels);
-    if (!q->units) {
+    q->units = av_mallocz_array(avctx->channels, sizeof(*q->units));
+    if (!q->units || !q->fdsp) {
         atrac3_decode_close(avctx);
         return AVERROR(ENOMEM);
     }
@@ -925,7 +932,6 @@ AVCodec ff_atrac3_decoder = {
     .id               = AV_CODEC_ID_ATRAC3,
     .priv_data_size   = sizeof(ATRAC3Context),
     .init             = atrac3_decode_init,
-    .init_static_data = atrac3_init_static_data,
     .close            = atrac3_decode_close,
     .decode           = atrac3_decode_frame,
     .capabilities     = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DR1,
diff --git a/libavcodec/atrac3data.h b/libavcodec/atrac3data.h
index 4f5c122..5d91274 100644
--- a/libavcodec/atrac3data.h
+++ b/libavcodec/atrac3data.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2006-2007 Maxim Poliakovski
  * Copyright (c) 2006-2007 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/atrac3plus.c b/libavcodec/atrac3plus.c
index 076fb84..46e0bea 100644
--- a/libavcodec/atrac3plus.c
+++ b/libavcodec/atrac3plus.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2013 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -77,56 +77,56 @@ static av_cold void build_canonical_huff(const uint8_t *cb, const uint8_t *xlat,
     *tab_offset += 1 << max_len;
 }
 
-av_cold void ff_atrac3p_init_vlcs(AVCodec *codec)
+av_cold void ff_atrac3p_init_vlcs(void)
 {
     int i, wl_vlc_offs, ct_vlc_offs, sf_vlc_offs, tab_offset;
 
-    static int wl_nb_bits[4]  = { 2, 3, 5, 5 };
-    static int wl_nb_codes[4] = { 3, 5, 8, 8 };
-    static const uint8_t *wl_bits[4] = {
+    static const int wl_nb_bits[4]  = { 2, 3, 5, 5 };
+    static const int wl_nb_codes[4] = { 3, 5, 8, 8 };
+    static const uint8_t * const wl_bits[4] = {
         atrac3p_wl_huff_bits1, atrac3p_wl_huff_bits2,
         atrac3p_wl_huff_bits3, atrac3p_wl_huff_bits4
     };
-    static const uint8_t *wl_codes[4] = {
+    static const uint8_t * const wl_codes[4] = {
         atrac3p_wl_huff_code1, atrac3p_wl_huff_code2,
         atrac3p_wl_huff_code3, atrac3p_wl_huff_code4
     };
-    static const uint8_t *wl_xlats[4] = {
+    static const uint8_t * const wl_xlats[4] = {
         atrac3p_wl_huff_xlat1, atrac3p_wl_huff_xlat2, NULL, NULL
     };
 
-    static int ct_nb_bits[4]  = { 3, 4, 4, 4 };
-    static int ct_nb_codes[4] = { 4, 8, 8, 8 };
-    static const uint8_t *ct_bits[4]  = {
+    static const int ct_nb_bits[4]  = { 3, 4, 4, 4 };
+    static const int ct_nb_codes[4] = { 4, 8, 8, 8 };
+    static const uint8_t * const ct_bits[4]  = {
         atrac3p_ct_huff_bits1, atrac3p_ct_huff_bits2,
         atrac3p_ct_huff_bits2, atrac3p_ct_huff_bits3
     };
-    static const uint8_t *ct_codes[4] = {
+    static const uint8_t * const ct_codes[4] = {
         atrac3p_ct_huff_code1, atrac3p_ct_huff_code2,
         atrac3p_ct_huff_code2, atrac3p_ct_huff_code3
     };
-    static const uint8_t *ct_xlats[4] = {
+    static const uint8_t * const ct_xlats[4] = {
         NULL, NULL, atrac3p_ct_huff_xlat1, NULL
     };
 
-    static int sf_nb_bits[8]  = {  9,  9,  9,  9,  6,  6,  7,  7 };
-    static int sf_nb_codes[8] = { 64, 64, 64, 64, 16, 16, 16, 16 };
-    static const uint8_t  *sf_bits[8]  = {
+    static const  int sf_nb_bits[8]  = {  9,  9,  9,  9,  6,  6,  7,  7 };
+    static const  int sf_nb_codes[8] = { 64, 64, 64, 64, 16, 16, 16, 16 };
+    static const uint8_t  * const sf_bits[8]  = {
         atrac3p_sf_huff_bits1, atrac3p_sf_huff_bits1, atrac3p_sf_huff_bits2,
         atrac3p_sf_huff_bits3, atrac3p_sf_huff_bits4, atrac3p_sf_huff_bits4,
         atrac3p_sf_huff_bits5, atrac3p_sf_huff_bits6
     };
-    static const uint16_t *sf_codes[8] = {
+    static const uint16_t * const sf_codes[8] = {
         atrac3p_sf_huff_code1, atrac3p_sf_huff_code1, atrac3p_sf_huff_code2,
         atrac3p_sf_huff_code3, atrac3p_sf_huff_code4, atrac3p_sf_huff_code4,
         atrac3p_sf_huff_code5, atrac3p_sf_huff_code6
     };
-    static const uint8_t  *sf_xlats[8] = {
+    static const uint8_t  * const sf_xlats[8] = {
         atrac3p_sf_huff_xlat1, atrac3p_sf_huff_xlat2, NULL, NULL,
         atrac3p_sf_huff_xlat4, atrac3p_sf_huff_xlat5, NULL, NULL
     };
 
-    static const uint8_t *gain_cbs[11] = {
+    static const uint8_t * const gain_cbs[11] = {
         atrac3p_huff_gain_npoints1_cb, atrac3p_huff_gain_npoints1_cb,
         atrac3p_huff_gain_lev1_cb, atrac3p_huff_gain_lev2_cb,
         atrac3p_huff_gain_lev3_cb, atrac3p_huff_gain_lev4_cb,
@@ -134,7 +134,7 @@ av_cold void ff_atrac3p_init_vlcs(AVCodec *codec)
         atrac3p_huff_gain_loc4_cb, atrac3p_huff_gain_loc2_cb,
         atrac3p_huff_gain_loc5_cb
     };
-    static const uint8_t *gain_xlats[11] = {
+    static const uint8_t * const gain_xlats[11] = {
         NULL, atrac3p_huff_gain_npoints2_xlat, atrac3p_huff_gain_lev1_xlat,
         atrac3p_huff_gain_lev2_xlat, atrac3p_huff_gain_lev3_xlat,
         atrac3p_huff_gain_lev4_xlat, atrac3p_huff_gain_loc3_xlat,
@@ -142,13 +142,13 @@ av_cold void ff_atrac3p_init_vlcs(AVCodec *codec)
         atrac3p_huff_gain_loc2_xlat, atrac3p_huff_gain_loc5_xlat
     };
 
-    static const uint8_t *tone_cbs[7] = {
+    static const uint8_t * const tone_cbs[7] = {
         atrac3p_huff_tonebands_cb,  atrac3p_huff_numwavs1_cb,
         atrac3p_huff_numwavs2_cb,   atrac3p_huff_wav_ampsf1_cb,
         atrac3p_huff_wav_ampsf2_cb, atrac3p_huff_wav_ampsf3_cb,
         atrac3p_huff_freq_cb
     };
-    static const uint8_t *tone_xlats[7] = {
+    static const uint8_t * const tone_xlats[7] = {
         NULL, NULL, atrac3p_huff_numwavs2_xlat, atrac3p_huff_wav_ampsf1_xlat,
         atrac3p_huff_wav_ampsf2_xlat, atrac3p_huff_wav_ampsf3_xlat,
         atrac3p_huff_freq_xlat
@@ -817,7 +817,7 @@ static void decode_qu_spectra(GetBitContext *gb, const Atrac3pSpecCodeTab *tab,
     int num_coeffs = tab->num_coeffs;
     int bits       = tab->bits;
     int is_signed  = tab->is_signed;
-    unsigned val, mask = (1 << bits) - 1;
+    unsigned val;
 
     for (pos = 0; pos < num_specs;) {
         if (group_size == 1 || get_bits1(gb)) {
@@ -825,7 +825,7 @@ static void decode_qu_spectra(GetBitContext *gb, const Atrac3pSpecCodeTab *tab,
                 val = get_vlc2(gb, vlc_tab->table, vlc_tab->bits, 1);
 
                 for (i = 0; i < num_coeffs; i++) {
-                    cf = val & mask;
+                    cf = av_mod_uintp2(val, bits);
                     if (is_signed)
                         cf = sign_extend(cf, bits);
                     else if (cf && get_bits1(gb))
@@ -1721,11 +1721,7 @@ static int decode_tones_info(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
     if (num_channels == 2) {
         get_subband_flags(gb, ctx->waves_info->tone_sharing, ctx->waves_info->num_tone_bands);
         get_subband_flags(gb, ctx->waves_info->tone_master,  ctx->waves_info->num_tone_bands);
-        if (get_subband_flags(gb, ctx->waves_info->phase_shift,
-                              ctx->waves_info->num_tone_bands)) {
-            avpriv_report_missing_feature(avctx, "GHA Phase shifting");
-            return AVERROR_PATCHWELCOME;
-        }
+        get_subband_flags(gb, ctx->waves_info->invert_phase, ctx->waves_info->num_tone_bands);
     }
 
     ctx->waves_info->tones_index = 0;
diff --git a/libavcodec/atrac3plus.h b/libavcodec/atrac3plus.h
index e56c444..a33c38a 100644
--- a/libavcodec/atrac3plus.h
+++ b/libavcodec/atrac3plus.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2013 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -122,7 +122,7 @@ typedef struct Atrac3pWaveSynthParams {
     int num_tone_bands;                     ///< number of PQF bands with tones
     uint8_t tone_sharing[ATRAC3P_SUBBANDS]; ///< 1 - subband-wise tone sharing flags
     uint8_t tone_master[ATRAC3P_SUBBANDS];  ///< 1 - subband-wise tone channel swapping
-    uint8_t phase_shift[ATRAC3P_SUBBANDS];  ///< 1 - subband-wise 180° phase shifting
+    uint8_t invert_phase[ATRAC3P_SUBBANDS]; ///< 1 - subband-wise phase inversion
     int tones_index;                        ///< total sum of tones in this unit
     Atrac3pWaveParam waves[48];
 } Atrac3pWaveSynthParams;
@@ -155,10 +155,8 @@ typedef struct Atrac3pChanUnitCtx {
 
 /**
  * Initialize VLC tables for bitstream parsing.
- *
- * @param[in]   codec    ptr to the AVCodec
  */
-void ff_atrac3p_init_vlcs(AVCodec *codec);
+void ff_atrac3p_init_vlcs(void);
 
 /**
  * Decode bitstream data of a channel unit.
@@ -169,8 +167,8 @@ void ff_atrac3p_init_vlcs(AVCodec *codec);
  * @param[in]     avctx         ptr to the AVCodecContext
  * @return result code: 0 = OK, otherwise - error code
  */
-int  ff_atrac3p_decode_channel_unit(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
-                                    int num_channels, AVCodecContext *avctx);
+int ff_atrac3p_decode_channel_unit(GetBitContext *gb, Atrac3pChanUnitCtx *ctx,
+                                   int num_channels, AVCodecContext *avctx);
 
 /**
  * Initialize IMDCT transform.
diff --git a/libavcodec/atrac3plus_data.h b/libavcodec/atrac3plus_data.h
index 5026a59..2a107ee 100644
--- a/libavcodec/atrac3plus_data.h
+++ b/libavcodec/atrac3plus_data.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2013 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/atrac3plusdec.c b/libavcodec/atrac3plusdec.c
index 4a742c1..ec2b1ad 100644
--- a/libavcodec/atrac3plusdec.c
+++ b/libavcodec/atrac3plusdec.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2013 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,7 +47,7 @@
 
 typedef struct ATRAC3PContext {
     GetBitContext gb;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
 
     DECLARE_ALIGNED(32, float, samples)[2][ATRAC3P_FRAME_SAMPLES];  ///< quantized MDCT spectrum
     DECLARE_ALIGNED(32, float, mdct_buf)[2][ATRAC3P_FRAME_SAMPLES]; ///< output of the IMDCT
@@ -67,7 +67,13 @@ typedef struct ATRAC3PContext {
 
 static av_cold int atrac3p_decode_close(AVCodecContext *avctx)
 {
-    av_free(((ATRAC3PContext *)(avctx->priv_data))->ch_units);
+    ATRAC3PContext *ctx = avctx->priv_data;
+
+    av_freep(&ctx->ch_units);
+    av_freep(&ctx->fdsp);
+
+    ff_mdct_end(&ctx->mdct_ctx);
+    ff_mdct_end(&ctx->ipqf_dct_ctx);
 
     return 0;
 }
@@ -148,7 +154,7 @@ static av_cold int atrac3p_decode_init(AVCodecContext *avctx)
         return AVERROR(EINVAL);
     }
 
-    avpriv_float_dsp_init(&ctx->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    ff_atrac3p_init_vlcs();
 
     /* initialize IPQF */
     ff_mdct_init(&ctx->ipqf_dct_ctx, 5, 1, 32.0 / 32768.0);
@@ -164,9 +170,10 @@ static av_cold int atrac3p_decode_init(AVCodecContext *avctx)
 
     ctx->my_channel_layout = avctx->channel_layout;
 
-    ctx->ch_units = av_mallocz(sizeof(*ctx->ch_units) *
-                               ctx->num_channel_blocks);
-    if (!ctx->ch_units) {
+    ctx->ch_units = av_mallocz_array(ctx->num_channel_blocks, sizeof(*ctx->ch_units));
+    ctx->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+
+    if (!ctx->ch_units || !ctx->fdsp) {
         atrac3p_decode_close(avctx);
         return AVERROR(ENOMEM);
     }
@@ -263,7 +270,7 @@ static void reconstruct_frame(ATRAC3PContext *ctx, Atrac3pChanUnitCtx *ch_unit,
     for (ch = 0; ch < num_channels; ch++) {
         for (sb = 0; sb < ch_unit->num_subbands; sb++) {
             /* inverse transform and windowing */
-            ff_atrac3p_imdct(&ctx->fdsp, &ctx->mdct_ctx,
+            ff_atrac3p_imdct(ctx->fdsp, &ctx->mdct_ctx,
                              &ctx->samples[ch][sb * ATRAC3P_SUBBAND_SAMPLES],
                              &ctx->mdct_buf[ch][sb * ATRAC3P_SUBBAND_SAMPLES],
                              (ch_unit->channels[ch].wnd_shape_prev[sb] << 1) +
@@ -297,7 +304,7 @@ static void reconstruct_frame(ATRAC3PContext *ctx, Atrac3pChanUnitCtx *ch_unit,
             for (sb = 0; sb < ch_unit->num_subbands; sb++)
                 if (ch_unit->channels[ch].tones_info[sb].num_wavs ||
                     ch_unit->channels[ch].tones_info_prev[sb].num_wavs) {
-                    ff_atrac3p_generate_tones(ch_unit, &ctx->fdsp, ch, sb,
+                    ff_atrac3p_generate_tones(ch_unit, ctx->fdsp, ch, sb,
                                               &ctx->time_buf[ch][sb * 128]);
                 }
         }
@@ -329,10 +336,8 @@ static int atrac3p_decode_frame(AVCodecContext *avctx, void *data,
     float **samples_p = (float **)frame->extended_data;
 
     frame->nb_samples = ATRAC3P_FRAME_SAMPLES;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     if ((ret = init_get_bits8(&ctx->gb, avpkt->data, avpkt->size)) < 0)
         return ret;
@@ -379,18 +384,17 @@ static int atrac3p_decode_frame(AVCodecContext *avctx, void *data,
 
     *got_frame_ptr = 1;
 
-    return avctx->block_align;
+    return FFMIN(avctx->block_align, avpkt->size);
 }
 
 AVCodec ff_atrac3p_decoder = {
-    .name             = "atrac3plus",
-    .long_name        = NULL_IF_CONFIG_SMALL("ATRAC3+ (Adaptive TRansform Acoustic Coding 3+)"),
-    .type             = AVMEDIA_TYPE_AUDIO,
-    .id               = AV_CODEC_ID_ATRAC3P,
-    .capabilities     = AV_CODEC_CAP_DR1,
-    .priv_data_size   = sizeof(ATRAC3PContext),
-    .init             = atrac3p_decode_init,
-    .init_static_data = ff_atrac3p_init_vlcs,
-    .close            = atrac3p_decode_close,
-    .decode           = atrac3p_decode_frame,
+    .name           = "atrac3plus",
+    .long_name      = NULL_IF_CONFIG_SMALL("ATRAC3+ (Adaptive TRansform Acoustic Coding 3+)"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_ATRAC3P,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .priv_data_size = sizeof(ATRAC3PContext),
+    .init           = atrac3p_decode_init,
+    .close          = atrac3p_decode_close,
+    .decode         = atrac3p_decode_frame,
 };
diff --git a/libavcodec/atrac3plusdsp.c b/libavcodec/atrac3plusdsp.c
index 468f098..d089588 100644
--- a/libavcodec/atrac3plusdsp.c
+++ b/libavcodec/atrac3plusdsp.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2013 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,7 @@
 #include <math.h>
 
 #include "libavutil/float_dsp.h"
+#include "libavutil/libm.h"
 #include "avcodec.h"
 #include "sinewin.h"
 #include "fft.h"
@@ -107,7 +108,7 @@ av_cold void ff_atrac3p_init_wave_synth(void)
 
     /* generate amplitude scalefactors table */
     for (i = 0; i < 64; i++)
-        amp_sf_tab[i] = pow(2.0f, ((double)i - 3) / 4.0f);
+        amp_sf_tab[i] = exp2f((i - 3) / 4.0f);
 }
 
 /**
@@ -116,14 +117,16 @@ av_cold void ff_atrac3p_init_wave_synth(void)
  *  @param[in]    synth_param   ptr to common synthesis parameters
  *  @param[in]    waves_info    parameters for each sine wave
  *  @param[in]    envelope      envelope data for all waves in a group
- *  @param[in]    phase_shift   flag indicates 180° phase shift
+ *  @param[in]    fdsp          ptr to floating-point DSP context
+ *  @param[in]    invert_phase  flag indicating 180° phase shift
  *  @param[in]    reg_offset    region offset for trimming envelope data
  *  @param[out]   out           receives sythesized data
  */
 static void waves_synth(Atrac3pWaveSynthParams *synth_param,
                         Atrac3pWavesData *waves_info,
                         Atrac3pWaveEnvelope *envelope,
-                        int phase_shift, int reg_offset, float *out)
+                        AVFloatDSPContext *fdsp,
+                        int invert_phase, int reg_offset, float *out)
 {
     int i, wn, inc, pos;
     double amp;
@@ -146,6 +149,10 @@ static void waves_synth(Atrac3pWaveSynthParams *synth_param,
         }
     }
 
+    /* invert phase if requested */
+    if (invert_phase)
+        fdsp->vector_fmul_scalar(out, out, -1.0f, 128);
+
     /* fade in with steep Hann window if requested */
     if (envelope->has_start_point) {
         pos = (envelope->start_pos << 2) - reg_offset;
@@ -216,12 +223,12 @@ void ff_atrac3p_generate_tones(Atrac3pChanUnitCtx *ch_unit, AVFloatDSPContext *f
     /* synthesize waves for both overlapping regions */
     if (tones_now->num_wavs && reg1_env_nonzero)
         waves_synth(ch_unit->waves_info_prev, tones_now, &tones_now->curr_env,
-                    ch_unit->waves_info_prev->phase_shift[sb] & ch_num,
+                    fdsp, ch_unit->waves_info_prev->invert_phase[sb] & ch_num,
                     128, wavreg1);
 
     if (tones_next->num_wavs && reg2_env_nonzero)
-        waves_synth(ch_unit->waves_info, tones_next, &tones_next->curr_env,
-                    ch_unit->waves_info->phase_shift[sb] & ch_num, 0, wavreg2);
+        waves_synth(ch_unit->waves_info, tones_next, &tones_next->curr_env, fdsp,
+                    ch_unit->waves_info->invert_phase[sb] & ch_num, 0, wavreg2);
 
     /* Hann windowing for non-faded wave signals */
     if (tones_now->num_wavs && tones_next->num_wavs &&
@@ -599,8 +606,8 @@ void ff_atrac3p_ipqf(FFTContext *dct_ctx, Atrac3pIPQFChannelCtx *hist,
                      const float *in, float *out)
 {
     int i, s, sb, t, pos_now, pos_next;
-    DECLARE_ALIGNED(32, float, idct_in)[ATRAC3P_SUBBANDS];
-    DECLARE_ALIGNED(32, float, idct_out)[ATRAC3P_SUBBANDS];
+    LOCAL_ALIGNED(32, float, idct_in, [ATRAC3P_SUBBANDS]);
+    LOCAL_ALIGNED(32, float, idct_out, [ATRAC3P_SUBBANDS]);
 
     memset(out, 0, ATRAC3P_FRAME_SAMPLES * sizeof(*out));
 
diff --git a/libavcodec/audio_frame_queue.c b/libavcodec/audio_frame_queue.c
index c4ca02b..f2ccd69 100644
--- a/libavcodec/audio_frame_queue.c
+++ b/libavcodec/audio_frame_queue.c
@@ -2,110 +2,72 @@
  * Audio Frame Queue
  * Copyright (c) 2012 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
 #include "libavutil/common.h"
-#include "libavutil/mathematics.h"
-#include "internal.h"
 #include "audio_frame_queue.h"
+#include "internal.h"
+#include "libavutil/avassert.h"
 
 av_cold void ff_af_queue_init(AVCodecContext *avctx, AudioFrameQueue *afq)
 {
-    afq->avctx             = avctx;
-    afq->next_pts          = AV_NOPTS_VALUE;
+    afq->avctx = avctx;
     afq->remaining_delay   = avctx->initial_padding;
     afq->remaining_samples = avctx->initial_padding;
-    afq->frame_queue       = NULL;
-}
-
-static void delete_next_frame(AudioFrameQueue *afq)
-{
-    AudioFrame *f = afq->frame_queue;
-    if (f) {
-        afq->frame_queue = f->next;
-        f->next = NULL;
-        av_freep(&f);
-    }
+    afq->frame_count       = 0;
 }
 
 void ff_af_queue_close(AudioFrameQueue *afq)
 {
-    /* remove/free any remaining frames */
-    while (afq->frame_queue)
-        delete_next_frame(afq);
+    if(afq->frame_count)
+        av_log(afq->avctx, AV_LOG_WARNING, "%d frames left in the queue on closing\n", afq->frame_count);
+    av_freep(&afq->frames);
     memset(afq, 0, sizeof(*afq));
 }
 
-#ifdef DEBUG
-static void af_queue_log_state(AudioFrameQueue *afq)
-{
-    AudioFrame *f;
-    ff_dlog(afq->avctx, "remaining delay   = %d\n", afq->remaining_delay);
-    ff_dlog(afq->avctx, "remaining samples = %d\n", afq->remaining_samples);
-    ff_dlog(afq->avctx, "frames:\n");
-    f = afq->frame_queue;
-    while (f) {
-        ff_dlog(afq->avctx, "  [ pts=%9"PRId64" duration=%d ]\n",
-                f->pts, f->duration);
-        f = f->next;
-    }
-}
-#endif /* DEBUG */
-
 int ff_af_queue_add(AudioFrameQueue *afq, const AVFrame *f)
 {
-    AudioFrame *new_frame;
-    AudioFrame *queue_end = afq->frame_queue;
-
-    /* find the end of the queue */
-    while (queue_end && queue_end->next)
-        queue_end = queue_end->next;
-
-    /* allocate new frame queue entry */
-    if (!(new_frame = av_malloc(sizeof(*new_frame))))
+    AudioFrame *new = av_fast_realloc(afq->frames, &afq->frame_alloc, sizeof(*afq->frames)*(afq->frame_count+1));
+    if(!new)
         return AVERROR(ENOMEM);
+    afq->frames = new;
+    new += afq->frame_count;
 
     /* get frame parameters */
-    new_frame->next = NULL;
-    new_frame->duration = f->nb_samples;
+    new->duration = f->nb_samples;
+    new->duration += afq->remaining_delay;
     if (f->pts != AV_NOPTS_VALUE) {
-        new_frame->pts = av_rescale_q(f->pts,
+        new->pts = av_rescale_q(f->pts,
                                       afq->avctx->time_base,
                                       (AVRational){ 1, afq->avctx->sample_rate });
-        afq->next_pts = new_frame->pts + new_frame->duration;
+        new->pts -= afq->remaining_delay;
+        if(afq->frame_count && new[-1].pts >= new->pts)
+            av_log(afq->avctx, AV_LOG_WARNING, "Queue input is backward in time\n");
     } else {
-        new_frame->pts = AV_NOPTS_VALUE;
-        afq->next_pts  = AV_NOPTS_VALUE;
+        new->pts = AV_NOPTS_VALUE;
     }
-
-    /* add new frame to the end of the queue */
-    if (!queue_end)
-        afq->frame_queue = new_frame;
-    else
-        queue_end->next = new_frame;
+    afq->remaining_delay = 0;
 
     /* add frame sample count */
     afq->remaining_samples += f->nb_samples;
 
-#ifdef DEBUG
-    af_queue_log_state(afq);
-#endif
+    afq->frame_count++;
 
     return 0;
 }
@@ -115,50 +77,37 @@ void ff_af_queue_remove(AudioFrameQueue *afq, int nb_samples, int64_t *pts,
 {
     int64_t out_pts = AV_NOPTS_VALUE;
     int removed_samples = 0;
+    int i;
 
-#ifdef DEBUG
-    af_queue_log_state(afq);
-#endif
-
-    /* get output pts from the next frame or generated pts */
-    if (afq->frame_queue) {
-        if (afq->frame_queue->pts != AV_NOPTS_VALUE)
-            out_pts = afq->frame_queue->pts - afq->remaining_delay;
-    } else {
-        if (afq->next_pts != AV_NOPTS_VALUE)
-            out_pts = afq->next_pts - afq->remaining_delay;
+    if (afq->frame_count || afq->frame_alloc) {
+        if (afq->frames->pts != AV_NOPTS_VALUE)
+            out_pts = afq->frames->pts;
     }
-    if (pts) {
-        if (out_pts != AV_NOPTS_VALUE)
-            *pts = ff_samples_to_time_base(afq->avctx, out_pts);
-        else
-            *pts = AV_NOPTS_VALUE;
-    }
-
-    /* if the delay is larger than the packet duration, we use up delay samples
-       for the output packet and leave all frames in the queue */
-    if (afq->remaining_delay >= nb_samples) {
-        removed_samples      += nb_samples;
-        afq->remaining_delay -= nb_samples;
-    }
-    /* remove frames from the queue until we have enough to cover the
-       requested number of samples or until the queue is empty */
-    while (removed_samples < nb_samples && afq->frame_queue) {
-        removed_samples += afq->frame_queue->duration;
-        delete_next_frame(afq);
+    if(!afq->frame_count)
+        av_log(afq->avctx, AV_LOG_WARNING, "Trying to remove %d samples, but the queue is empty\n", nb_samples);
+    if (pts)
+        *pts = ff_samples_to_time_base(afq->avctx, out_pts);
+
+    for(i=0; nb_samples && i<afq->frame_count; i++){
+        int n= FFMIN(afq->frames[i].duration, nb_samples);
+        afq->frames[i].duration -= n;
+        nb_samples              -= n;
+        removed_samples         += n;
+        if(afq->frames[i].pts != AV_NOPTS_VALUE)
+            afq->frames[i].pts      += n;
     }
     afq->remaining_samples -= removed_samples;
-
-    /* if there are no frames left and we have room for more samples, use
-       any remaining delay samples */
-    if (removed_samples < nb_samples && afq->remaining_samples > 0) {
-        int add_samples = FFMIN(afq->remaining_samples,
-                                nb_samples - removed_samples);
-        removed_samples        += add_samples;
-        afq->remaining_samples -= add_samples;
+    i -= i && afq->frames[i-1].duration;
+    memmove(afq->frames, afq->frames + i, sizeof(*afq->frames) * (afq->frame_count - i));
+    afq->frame_count -= i;
+
+    if(nb_samples){
+        av_assert0(!afq->frame_count);
+        av_assert0(afq->remaining_samples == afq->remaining_delay);
+        if(afq->frames && afq->frames[0].pts != AV_NOPTS_VALUE)
+            afq->frames[0].pts += nb_samples;
+        av_log(afq->avctx, AV_LOG_DEBUG, "Trying to remove %d more samples than there are in the queue\n", nb_samples);
     }
-    if (removed_samples > nb_samples)
-        av_log(afq->avctx, AV_LOG_WARNING, "frame_size is too large\n");
     if (duration)
         *duration = ff_samples_to_time_base(afq->avctx, removed_samples);
 }
diff --git a/libavcodec/audio_frame_queue.h b/libavcodec/audio_frame_queue.h
index 1250ec2..d8076ea 100644
--- a/libavcodec/audio_frame_queue.h
+++ b/libavcodec/audio_frame_queue.h
@@ -2,20 +2,20 @@
  * Audio Frame Queue
  * Copyright (c) 2012 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,15 +27,15 @@
 typedef struct AudioFrame {
     int64_t pts;
     int duration;
-    struct AudioFrame *next;
 } AudioFrame;
 
 typedef struct AudioFrameQueue {
     AVCodecContext *avctx;
-    int64_t next_pts;
     int remaining_delay;
     int remaining_samples;
-    AudioFrame *frame_queue;
+    AudioFrame *frames;
+    unsigned frame_count;
+    unsigned frame_alloc;
 } AudioFrameQueue;
 
 /**
diff --git a/libavcodec/audioconvert.c b/libavcodec/audioconvert.c
new file mode 100644
index 0000000..5e46fae
--- /dev/null
+++ b/libavcodec/audioconvert.c
@@ -0,0 +1,120 @@
+/*
+ * audio conversion
+ * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * audio conversion
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#include "libavutil/avstring.h"
+#include "libavutil/common.h"
+#include "libavutil/libm.h"
+#include "libavutil/samplefmt.h"
+#include "avcodec.h"
+#include "audioconvert.h"
+
+#if FF_API_AUDIO_CONVERT
+
+struct AVAudioConvert {
+    int in_channels, out_channels; ///< channel counts; av_audio_convert_alloc() only accepts equal values
+    int fmt_pair;                  ///< out_fmt + AV_SAMPLE_FMT_NB*in_fmt, selects the conversion in av_audio_convert()
+};
+
+AVAudioConvert *av_audio_convert_alloc(enum AVSampleFormat out_fmt, int out_channels,
+                                       enum AVSampleFormat in_fmt, int in_channels,
+                                       const float *matrix, int flags)
+{
+    AVAudioConvert *conv = NULL;
+
+    /* FIXME: channel remixing is not supported (matrix and flags are unused),
+     * so a context is only created when both channel counts are equal. */
+    if (in_channels == out_channels && (conv = av_malloc(sizeof(*conv)))) {
+        conv->in_channels  = in_channels;
+        conv->out_channels = out_channels;
+        conv->fmt_pair     = out_fmt + AV_SAMPLE_FMT_NB*in_fmt;
+    }
+    return conv;
+}
+
+void av_audio_convert_free(AVAudioConvert *ctx)
+{
+    av_free(ctx); /* the context is the single av_malloc() block made by av_audio_convert_alloc() */
+}
+
+int av_audio_convert(AVAudioConvert *ctx,
+                           void * const out[6], const int out_stride[6],
+                     const void * const  in[6], const int  in_stride[6], int len)
+{
+    int ch;
+    /* Convert len samples per channel; returns 0, or -1 if the format pair is unknown. */
+    //FIXME optimize common cases
+
+    for(ch=0; ch<ctx->out_channels; ch++){
+        const int is=  in_stride[ch];
+        const int os= out_stride[ch];
+        const uint8_t *pi=  in[ch];
+        uint8_t *po= out[ch];
+        uint8_t *end= po ? po + os*len : NULL; /* no pointer arithmetic on a NULL buffer */
+        if(!out[ch])
+            continue;
+        /* Pre-tested loop: nothing is written when len is 0 (do/while stored one sample). */
+#define CONV(ofmt, otype, ifmt, expr)\
+if(ctx->fmt_pair == ofmt + AV_SAMPLE_FMT_NB*ifmt){\
+    while(po < end){\
+        *(otype*)po = expr; pi += is; po += os;\
+    }\
+}
+        /* Widening uses unsigned shifts or a multiply: left-shifting a negative int is undefined. */
+//FIXME put things below under ifdefs so we do not waste space for cases no codec will need
+//FIXME rounding ?
+
+             CONV(AV_SAMPLE_FMT_U8 , uint8_t, AV_SAMPLE_FMT_U8 ,  *(const uint8_t*)pi)
+        else CONV(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_U8 , (*(const uint8_t*)pi - 0x80U)<<8)
+        else CONV(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_U8 , (*(const uint8_t*)pi - 0x80U)<<24)
+        else CONV(AV_SAMPLE_FMT_FLT, float  , AV_SAMPLE_FMT_U8 , (*(const uint8_t*)pi - 0x80)*(1.0 / (1<<7)))
+        else CONV(AV_SAMPLE_FMT_DBL, double , AV_SAMPLE_FMT_U8 , (*(const uint8_t*)pi - 0x80)*(1.0 / (1<<7)))
+        else CONV(AV_SAMPLE_FMT_U8 , uint8_t, AV_SAMPLE_FMT_S16, (*(const int16_t*)pi>>8) + 0x80)
+        else CONV(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_S16,  *(const int16_t*)pi)
+        else CONV(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_S16,  *(const int16_t*)pi*(1<<16))
+        else CONV(AV_SAMPLE_FMT_FLT, float  , AV_SAMPLE_FMT_S16,  *(const int16_t*)pi*(1.0 / (1<<15)))
+        else CONV(AV_SAMPLE_FMT_DBL, double , AV_SAMPLE_FMT_S16,  *(const int16_t*)pi*(1.0 / (1<<15)))
+        else CONV(AV_SAMPLE_FMT_U8 , uint8_t, AV_SAMPLE_FMT_S32, (*(const int32_t*)pi>>24) + 0x80)
+        else CONV(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_S32,  *(const int32_t*)pi>>16)
+        else CONV(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_S32,  *(const int32_t*)pi)
+        else CONV(AV_SAMPLE_FMT_FLT, float  , AV_SAMPLE_FMT_S32,  *(const int32_t*)pi*(1.0 / (1U<<31)))
+        else CONV(AV_SAMPLE_FMT_DBL, double , AV_SAMPLE_FMT_S32,  *(const int32_t*)pi*(1.0 / (1U<<31)))
+        else CONV(AV_SAMPLE_FMT_U8 , uint8_t, AV_SAMPLE_FMT_FLT, av_clip_uint8(  lrintf(*(const float*)pi * (1<<7)) + 0x80))
+        else CONV(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_FLT, av_clip_int16(  lrintf(*(const float*)pi * (1<<15))))
+        else CONV(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_FLT, av_clipl_int32(llrintf(*(const float*)pi * (1U<<31))))
+        else CONV(AV_SAMPLE_FMT_FLT, float  , AV_SAMPLE_FMT_FLT, *(const float*)pi)
+        else CONV(AV_SAMPLE_FMT_DBL, double , AV_SAMPLE_FMT_FLT, *(const float*)pi)
+        else CONV(AV_SAMPLE_FMT_U8 , uint8_t, AV_SAMPLE_FMT_DBL, av_clip_uint8(  lrint(*(const double*)pi * (1<<7)) + 0x80))
+        else CONV(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_DBL, av_clip_int16(  lrint(*(const double*)pi * (1<<15))))
+        else CONV(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_DBL, av_clipl_int32(llrint(*(const double*)pi * (1U<<31))))
+        else CONV(AV_SAMPLE_FMT_FLT, float  , AV_SAMPLE_FMT_DBL, *(const double*)pi)
+        else CONV(AV_SAMPLE_FMT_DBL, double , AV_SAMPLE_FMT_DBL, *(const double*)pi)
+        else return -1;
+    }
+    return 0;
+}
+
+#endif /* FF_API_AUDIO_CONVERT */
diff --git a/libavcodec/audioconvert.h b/libavcodec/audioconvert.h
new file mode 100644
index 0000000..996c3f3
--- /dev/null
+++ b/libavcodec/audioconvert.h
@@ -0,0 +1,86 @@
+/*
+ * audio conversion
+ * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2008 Peter Ross
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AUDIOCONVERT_H
+#define AVCODEC_AUDIOCONVERT_H
+
+#include "version.h"
+
+/**
+ * @file
+ * Audio format conversion routines
+ * This interface is deprecated and will be dropped in a future
+ * version. You should use the libswresample library instead.
+ */
+
+#if FF_API_AUDIO_CONVERT
+
+#include "libavutil/cpu.h"
+#include "avcodec.h"
+#include "libavutil/channel_layout.h"
+
+struct AVAudioConvert;
+typedef struct AVAudioConvert AVAudioConvert;
+
+/**
+ * Create an audio sample format converter context
+ * @param out_fmt Output sample format
+ * @param out_channels Number of output channels
+ * @param in_fmt Input sample format
+ * @param in_channels Number of input channels
+ * @param[in] matrix Channel mixing matrix (of dimension in_channels*out_channels). Set to NULL to ignore.
+ * @param flags See AV_CPU_FLAG_xx
+ * @return NULL on error
+ * @deprecated See libswresample
+ */
+
+attribute_deprecated
+AVAudioConvert *av_audio_convert_alloc(enum AVSampleFormat out_fmt, int out_channels,
+                                       enum AVSampleFormat in_fmt, int in_channels,
+                                       const float *matrix, int flags);
+
+/**
+ * Free audio sample format converter context
+ * @deprecated See libswresample
+ */
+
+attribute_deprecated
+void av_audio_convert_free(AVAudioConvert *ctx);
+
+/**
+ * Convert between audio sample formats
+ * @param[out] out array of output buffers, one per channel. Set an entry to NULL to skip processing of that channel.
+ * @param[in] out_stride distance between consecutive output samples (measured in bytes)
+ * @param[in] in array of input buffers for each channel
+ * @param[in] in_stride distance between consecutive input samples (measured in bytes)
+ * @param len number of samples to convert in each channel
+ * @deprecated See libswresample
+ */
+
+attribute_deprecated
+int av_audio_convert(AVAudioConvert *ctx,
+                           void * const out[6], const int out_stride[6],
+                     const void * const  in[6], const int  in_stride[6], int len);
+
+#endif /* FF_API_AUDIO_CONVERT */
+
+#endif /* AVCODEC_AUDIOCONVERT_H */
diff --git a/libavcodec/audiodsp.c b/libavcodec/audiodsp.c
index f7e6167..85b5a74 100644
--- a/libavcodec/audiodsp.c
+++ b/libavcodec/audiodsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/audiodsp.h b/libavcodec/audiodsp.h
index 58205a1..b55bf85 100644
--- a/libavcodec/audiodsp.h
+++ b/libavcodec/audiodsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/audiotoolboxdec.c b/libavcodec/audiotoolboxdec.c
new file mode 100644
index 0000000..1097668
--- /dev/null
+++ b/libavcodec/audiotoolboxdec.c
@@ -0,0 +1,625 @@
+/*
+ * Audio Toolbox system codecs
+ *
+ * copyright (c) 2016 Rodger Combs
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <AudioToolbox/AudioToolbox.h>
+
+#include "config.h"
+#include "avcodec.h"
+#include "ac3_parser.h"
+#include "bytestream.h"
+#include "internal.h"
+#include "mpegaudiodecheader.h"
+#include "libavutil/avassert.h"
+#include "libavutil/opt.h"
+#include "libavutil/log.h"
+
+#ifndef __MAC_10_11
+#define kAudioFormatEnhancedAC3 'ec-3'
+#endif
+
+typedef struct ATDecodeContext {
+    AVClass *av_class;
+
+    AudioConverterRef converter;            // created by ffat_create_decoder(); ffat_decode() creates it lazily while unset
+    AudioStreamPacketDescription pkt_desc;  // description handed to the converter by ffat_decode_callback()
+    AVPacket in_pkt;                        // packet whose data the callback last handed to the converter
+    AVPacket new_in_pkt;                    // packet queued by ffat_decode() for the next callback
+    AVBSFContext *bsf;                      // aac_adtstoasc filter, allocated on the first AAC packet matching the 0xfff0 check
+    char *decoded_data;                     // converter output buffer: frame_size * channels interleaved samples
+    int channel_map[64];                    // output channel c is copied from converter channel channel_map[c]
+
+    uint8_t *extradata;                     // borrowed: avctx->extradata or the bitstream filter's output extradata
+    int extradata_size;
+
+    int64_t last_pts;                       // pts remembered from an earlier packet; AV_NOPTS_VALUE after creation
+    int eof;                                // set once ffat_decode() receives an empty packet
+} ATDecodeContext;
+
+/*
+ * Translate a libavcodec codec ID into the AudioToolbox format ID that
+ * describes the compressed input stream.
+ * The profile argument is accepted but not consulted by this lookup.
+ * A codec ID without a table entry trips av_assert0(); the return 0
+ * after the assertion is kept for the case where it does not abort.
+ */
+static UInt32 ffat_get_format_id(enum AVCodecID codec, int profile)
+{
+    static const struct {
+        enum AVCodecID id;
+        UInt32 format;
+    } format_table[] = {
+        { AV_CODEC_ID_AAC,          kAudioFormatMPEG4AAC      },
+        { AV_CODEC_ID_AC3,          kAudioFormatAC3           },
+        { AV_CODEC_ID_ADPCM_IMA_QT, kAudioFormatAppleIMA4     },
+        { AV_CODEC_ID_ALAC,         kAudioFormatAppleLossless },
+        { AV_CODEC_ID_AMR_NB,       kAudioFormatAMR           },
+        { AV_CODEC_ID_EAC3,         kAudioFormatEnhancedAC3   },
+        { AV_CODEC_ID_GSM_MS,       kAudioFormatMicrosoftGSM  },
+        { AV_CODEC_ID_ILBC,         kAudioFormatiLBC          },
+        { AV_CODEC_ID_MP1,          kAudioFormatMPEGLayer1    },
+        { AV_CODEC_ID_MP2,          kAudioFormatMPEGLayer2    },
+        { AV_CODEC_ID_MP3,          kAudioFormatMPEGLayer3    },
+        { AV_CODEC_ID_PCM_ALAW,     kAudioFormatALaw          },
+        { AV_CODEC_ID_PCM_MULAW,    kAudioFormatULaw          },
+        { AV_CODEC_ID_QDMC,         kAudioFormatQDesign       },
+        { AV_CODEC_ID_QDM2,         kAudioFormatQDesign2      },
+    };
+    size_t idx;
+
+    for (idx = 0; idx < sizeof(format_table) / sizeof(format_table[0]); idx++)
+        if (format_table[idx].id == codec)
+            return format_table[idx].format;
+
+    av_assert0(!"Invalid codec ID!");
+    return 0;
+}
+
+static int ffat_get_channel_id(AudioChannelLabel label)
+{ // Returns the index ffat_update_ctx() uses as a bit position in the layout mask, or -1 without a mapping.
+    if (label == 0)
+        return -1;
+    else if (label <= kAudioChannelLabel_LFEScreen) // upper bounds are tested in ascending label order,
+        return label - 1;                           // so each branch only sees labels above the previous bound
+    else if (label <= kAudioChannelLabel_RightSurround)
+        return label + 4;
+    else if (label <= kAudioChannelLabel_CenterSurround)
+        return label + 1;
+    else if (label <= kAudioChannelLabel_RightSurroundDirect)
+        return label + 23;
+    else if (label <= kAudioChannelLabel_TopBackRight)
+        return label - 1;
+    else if (label < kAudioChannelLabel_RearSurroundLeft) // labels between TopBackRight and RearSurroundLeft
+        return -1;                                        // have no mapping
+    else if (label <= kAudioChannelLabel_RearSurroundRight)
+        return label - 29;
+    else if (label <= kAudioChannelLabel_RightWide)
+        return label - 4;
+    else if (label == kAudioChannelLabel_LFE2) // single labels: index taken from the AV_CH_* bit itself
+        return ff_ctzll(AV_CH_LOW_FREQUENCY_2);
+    else if (label == kAudioChannelLabel_Mono)
+        return ff_ctzll(AV_CH_FRONT_CENTER);
+    else
+        return -1;
+}
+
+static int ffat_compare_channel_descriptions(const void* a, const void* b)
+{
+    int id_a = ffat_get_channel_id(((const AudioChannelDescription *)a)->mChannelLabel);
+    int id_b = ffat_get_channel_id(((const AudioChannelDescription *)b)->mChannelLabel);
+    return id_a - id_b; /* qsort() comparator: ascending ffat_get_channel_id() of the labels */
+}
+
+/* Convert layout to the channel-descriptions form; takes ownership of layout, NULL on allocation failure. */
+static AudioChannelLayout *ffat_convert_layout(AudioChannelLayout *layout, UInt32* size)
+{
+    AudioChannelLayoutTag tag = layout->mChannelLayoutTag;
+    AudioChannelLayout *new_layout;
+    if (tag == kAudioChannelLayoutTag_UseChannelDescriptions)
+        return layout; // already in the wanted form: handed back untouched
+    else if (tag == kAudioChannelLayoutTag_UseChannelBitmap)
+        AudioFormatGetPropertyInfo(kAudioFormatProperty_ChannelLayoutForBitmap,
+                                   sizeof(UInt32), &layout->mChannelBitmap, size);
+    else
+        AudioFormatGetPropertyInfo(kAudioFormatProperty_ChannelLayoutForTag,
+                                   sizeof(AudioChannelLayoutTag), &tag, size);
+    if (!(new_layout = av_mallocz(*size))) { // zeroed: a failed query below leaves no descriptions, not heap garbage
+        av_free(layout);
+        return NULL;
+    }
+    if (tag == kAudioChannelLayoutTag_UseChannelBitmap)
+        AudioFormatGetProperty(kAudioFormatProperty_ChannelLayoutForBitmap,
+                               sizeof(UInt32), &layout->mChannelBitmap, size, new_layout);
+    else
+        AudioFormatGetProperty(kAudioFormatProperty_ChannelLayoutForTag,
+                               sizeof(AudioChannelLayoutTag), &tag, size, new_layout);
+    new_layout->mChannelLayoutTag = kAudioChannelLayoutTag_UseChannelDescriptions;
+    av_free(layout); // the input layout is consumed on every path that does not return it
+    return new_layout;
+}
+
+/* Sync avctx (sample rate, channels, layout, frame size) and at->channel_map with the converter. */
+static int ffat_update_ctx(AVCodecContext *avctx)
+{
+    ATDecodeContext *at = avctx->priv_data;
+    AudioStreamBasicDescription format;
+    UInt32 size = sizeof(format);
+    if (!AudioConverterGetProperty(at->converter,
+                                   kAudioConverterCurrentInputStreamDescription,
+                                   &size, &format)) {
+        if (format.mSampleRate)
+            avctx->sample_rate = format.mSampleRate;
+        avctx->channels = format.mChannelsPerFrame;
+        avctx->channel_layout = av_get_default_channel_layout(avctx->channels);
+        avctx->frame_size = format.mFramesPerPacket;
+    }
+
+    if (!AudioConverterGetProperty(at->converter,
+                                   kAudioConverterCurrentOutputStreamDescription,
+                                   &size, &format)) {
+        format.mSampleRate = avctx->sample_rate;
+        format.mChannelsPerFrame = avctx->channels;
+        AudioConverterSetProperty(at->converter,
+                                  kAudioConverterCurrentOutputStreamDescription,
+                                  size, &format);
+    }
+
+    if (!AudioConverterGetPropertyInfo(at->converter, kAudioConverterOutputChannelLayout,
+                                       &size, NULL) && size) {
+        AudioChannelLayout *layout = av_malloc(size);
+        uint64_t layout_mask = 0;
+        int i;
+        if (!layout)
+            return AVERROR(ENOMEM);
+        if (AudioConverterGetProperty(at->converter, kAudioConverterOutputChannelLayout,
+                                      &size, layout))
+            goto done; // never parse a buffer the converter did not fill
+        if (!(layout = ffat_convert_layout(layout, &size)))
+            return AVERROR(ENOMEM);
+        for (i = 0; i < layout->mNumberChannelDescriptions; i++) {
+            int id = ffat_get_channel_id(layout->mChannelDescriptions[i].mChannelLabel);
+            if (id < 0 || id >= 64 || (layout_mask & (UINT64_C(1) << id)))
+                goto done; // unmapped, out-of-range or duplicate channel: keep the default layout and map
+            layout_mask |= UINT64_C(1) << id; // 64-bit shift: `1 << id` is undefined from id == 31 on
+            layout->mChannelDescriptions[i].mChannelFlags = i; // Abusing flags as index
+        }
+        avctx->channel_layout = layout_mask;
+        qsort(layout->mChannelDescriptions, layout->mNumberChannelDescriptions,
+              sizeof(AudioChannelDescription), &ffat_compare_channel_descriptions);
+        for (i = 0; i < layout->mNumberChannelDescriptions; i++)
+            at->channel_map[i] = layout->mChannelDescriptions[i].mChannelFlags;
+done:
+        av_free(layout);
+    }
+
+    if (!avctx->frame_size)
+        avctx->frame_size = 2048; // fallback when the converter reported no frames per packet
+
+    return 0;
+}
+
+static void put_descr(PutByteContext *pb, int tag, unsigned int size)
+{
+    int shift; /* writes the tag, then size as four 7-bit groups, most significant group first */
+    bytestream2_put_byte(pb, tag);
+    for (shift = 21; shift >= 7; shift -= 7)
+        bytestream2_put_byte(pb, (size >> shift) | 0x80); /* bit 7 set on the first three groups */
+    bytestream2_put_byte(pb, size & 0x7F);                /* final group: bit 7 clear */
+}
+
+/* Return the decoder magic cookie and store its byte length in *cookie_size.
+ * AAC: a freshly allocated ES descriptor wrapping at->extradata (NULL on allocation
+ * failure); other codecs: at->extradata itself, which callers must not free. */
+static uint8_t* ffat_get_magic_cookie(AVCodecContext *avctx, UInt32 *cookie_size)
+{
+    ATDecodeContext *at = avctx->priv_data;
+    if (avctx->codec_id == AV_CODEC_ID_AAC) {
+        uint8_t *extradata; // byte buffer typed to match this function's return type
+        PutByteContext pb;
+        *cookie_size = 5 + 3 + 5+13 + 5+at->extradata_size;
+        if (!(extradata = av_malloc(*cookie_size)))
+            return NULL;
+
+        bytestream2_init_writer(&pb, extradata, *cookie_size);
+
+        // ES descriptor
+        put_descr(&pb, 0x03, 3 + 5+13 + 5+at->extradata_size);
+        bytestream2_put_be16(&pb, 0);
+        bytestream2_put_byte(&pb, 0x00); // flags (= no flags)
+
+        // DecoderConfig descriptor
+        put_descr(&pb, 0x04, 13 + 5+at->extradata_size);
+
+        // Object type indication
+        bytestream2_put_byte(&pb, 0x40);
+        bytestream2_put_byte(&pb, 0x15); // flags (= Audiostream)
+        bytestream2_put_be24(&pb, 0); // Buffersize DB
+        bytestream2_put_be32(&pb, 0); // maxbitrate
+        bytestream2_put_be32(&pb, 0); // avgbitrate
+
+        // DecoderSpecific info descriptor
+        put_descr(&pb, 0x05, at->extradata_size);
+        bytestream2_put_buffer(&pb, at->extradata, at->extradata_size);
+        return extradata;
+    } else {
+        *cookie_size = at->extradata_size;
+        return at->extradata;
+    }
+}
+
+/* Non-zero when extradata is present and the codec is one that gets it as a magic cookie. */
+static av_cold int ffat_usable_extradata(AVCodecContext *avctx)
+{
+    ATDecodeContext *at = avctx->priv_data;
+    if (!at->extradata_size)
+        return 0;
+    return avctx->codec_id == AV_CODEC_ID_AAC  || avctx->codec_id == AV_CODEC_ID_ALAC ||
+           avctx->codec_id == AV_CODEC_ID_QDMC || avctx->codec_id == AV_CODEC_ID_QDM2;
+}
+
+/* Pass the magic cookie, when this codec uses one, to the converter. A rejected
+ * cookie is only logged; the sole error returned is AVERROR(ENOMEM). */
+static int ffat_set_extradata(AVCodecContext *avctx)
+{
+    ATDecodeContext *at = avctx->priv_data;
+    UInt32 cookie_len;
+    uint8_t *magic;
+    OSStatus err;
+
+    if (!ffat_usable_extradata(avctx))
+        return 0;
+    if (!(magic = ffat_get_magic_cookie(avctx, &cookie_len)))
+        return AVERROR(ENOMEM);
+    err = AudioConverterSetProperty(at->converter, kAudioConverterDecompressionMagicCookie,
+                                    cookie_len, magic);
+    if (err)
+        av_log(avctx, AV_LOG_WARNING, "AudioToolbox cookie error: %i\n", (int)err);
+    if (magic != at->extradata) /* a cookie distinct from at->extradata was allocated for us */
+        av_free(magic);
+    return 0;
+}
+
+static av_cold int ffat_create_decoder(AVCodecContext *avctx, AVPacket *pkt)
+{ // Creates at->converter and the output buffer; pkt may be NULL and is only used to parse stream headers.
+    ATDecodeContext *at = avctx->priv_data;
+    OSStatus status;
+    int i;
+
+    enum AVSampleFormat sample_fmt = (avctx->bits_per_raw_sample == 32) ? // S32 only for 32-bit sources,
+                                     AV_SAMPLE_FMT_S32 : AV_SAMPLE_FMT_S16; // S16 otherwise
+
+    AudioStreamBasicDescription in_format = {
+        .mFormatID = ffat_get_format_id(avctx->codec_id, avctx->profile),
+        .mBytesPerPacket = (avctx->codec_id == AV_CODEC_ID_ILBC) ? avctx->block_align : 0,
+    };
+    AudioStreamBasicDescription out_format = {
+        .mFormatID = kAudioFormatLinearPCM,
+        .mFormatFlags = kAudioFormatFlagIsSignedInteger | kAudioFormatFlagIsPacked,
+        .mFramesPerPacket = 1,
+        .mBitsPerChannel = av_get_bytes_per_sample(sample_fmt) * 8,
+    };
+
+    avctx->sample_fmt = sample_fmt;
+
+    if (ffat_usable_extradata(avctx)) { // preferred: let AudioToolbox fill in_format from the magic cookie
+        UInt32 format_size = sizeof(in_format);
+        UInt32 cookie_size;
+        uint8_t *cookie = ffat_get_magic_cookie(avctx, &cookie_size);
+        if (!cookie)
+            return AVERROR(ENOMEM);
+        status = AudioFormatGetProperty(kAudioFormatProperty_FormatInfo,
+                                        cookie_size, cookie, &format_size, &in_format);
+        if (cookie != at->extradata) // free only a cookie that is not at->extradata itself
+            av_free(cookie);
+        if (status != 0) {
+            av_log(avctx, AV_LOG_ERROR, "AudioToolbox header-parse error: %i\n", (int)status);
+            return AVERROR_UNKNOWN;
+        }
+#if CONFIG_MP1_AT_DECODER || CONFIG_MP2_AT_DECODER || CONFIG_MP3_AT_DECODER
+    } else if (pkt && pkt->size >= 4 && // MPEG audio: parameters come from the first 4 packet bytes
+               (avctx->codec_id == AV_CODEC_ID_MP1 ||
+                avctx->codec_id == AV_CODEC_ID_MP2 ||
+                avctx->codec_id == AV_CODEC_ID_MP3)) {
+        enum AVCodecID codec_id;
+        int bit_rate;
+        if (ff_mpa_decode_header(AV_RB32(pkt->data), &avctx->sample_rate,
+                                 &in_format.mChannelsPerFrame, &avctx->frame_size,
+                                 &bit_rate, &codec_id) < 0)
+            return AVERROR_INVALIDDATA;
+        avctx->bit_rate = bit_rate;
+        in_format.mSampleRate = avctx->sample_rate;
+#endif
+#if CONFIG_AC3_AT_DECODER || CONFIG_EAC3_AT_DECODER
+    } else if (pkt && pkt->size >= 7 && // (E-)AC-3: parameters come from the parsed packet header
+               (avctx->codec_id == AV_CODEC_ID_AC3 ||
+                avctx->codec_id == AV_CODEC_ID_EAC3)) {
+        AC3HeaderInfo hdr, *phdr = &hdr;
+        GetBitContext gbc;
+        init_get_bits(&gbc, pkt->data, pkt->size);
+        if (avpriv_ac3_parse_header(&gbc, &phdr) < 0)
+            return AVERROR_INVALIDDATA;
+        in_format.mSampleRate = hdr.sample_rate;
+        in_format.mChannelsPerFrame = hdr.channels;
+        avctx->frame_size = hdr.num_blocks * 256;
+        avctx->bit_rate = hdr.bit_rate;
+#endif
+    } else { // last resort: caller-supplied parameters, defaulting to 44100 Hz and one channel
+        in_format.mSampleRate = avctx->sample_rate ? avctx->sample_rate : 44100;
+        in_format.mChannelsPerFrame = avctx->channels ? avctx->channels : 1;
+    }
+
+    avctx->sample_rate = out_format.mSampleRate = in_format.mSampleRate;
+    avctx->channels = out_format.mChannelsPerFrame = in_format.mChannelsPerFrame;
+
+    if (avctx->codec_id == AV_CODEC_ID_ADPCM_IMA_QT)
+        in_format.mFramesPerPacket = 64;
+
+    status = AudioConverterNew(&in_format, &out_format, &at->converter);
+
+    if (status != 0) {
+        av_log(avctx, AV_LOG_ERROR, "AudioToolbox init error: %i\n", (int)status);
+        return AVERROR_UNKNOWN;
+    }
+
+    if ((status = ffat_set_extradata(avctx)) < 0)
+        return status;
+
+    for (i = 0; i < (sizeof(at->channel_map) / sizeof(at->channel_map[0])); i++)
+        at->channel_map[i] = i; // identity mapping until ffat_update_ctx() derives a real one
+
+    ffat_update_ctx(avctx); // NOTE(review): the return value (AVERROR(ENOMEM) on allocation failure) is ignored
+
+    if(!(at->decoded_data = av_malloc(av_get_bytes_per_sample(avctx->sample_fmt) // room for one frame of
+                                      * avctx->frame_size * avctx->channels)))   // interleaved output samples
+        return AVERROR(ENOMEM);
+
+    at->last_pts = AV_NOPTS_VALUE;
+
+    return 0;
+}
+
+static av_cold int ffat_init_decoder(AVCodecContext *avctx)
+{
+    ATDecodeContext *at = avctx->priv_data;
+    at->extradata_size = avctx->extradata_size;
+    at->extradata      = avctx->extradata;
+    /* Without channels + sample rate or usable extradata, converter creation
+     * is left to ffat_decode(), which passes the packet it received. */
+    if (!(avctx->channels && avctx->sample_rate) && !ffat_usable_extradata(avctx))
+        return 0;
+    return ffat_create_decoder(avctx, NULL);
+}
+
+static OSStatus ffat_decode_callback(AudioConverterRef converter, UInt32 *nb_packets,
+                                     AudioBufferList *data,
+                                     AudioStreamPacketDescription **packets,
+                                     void *inctx)
+{ // Input callback passed to AudioConverterFillComplexBuffer(): hands over at most one queued packet per call.
+    AVCodecContext *avctx = inctx;
+    ATDecodeContext *at = avctx->priv_data;
+
+    if (at->eof) { // after the empty packet: report zero packets with status 0
+        *nb_packets = 0;
+        if (packets) {
+            *packets = &at->pkt_desc;
+            at->pkt_desc.mDataByteSize = 0;
+        }
+        return 0;
+    }
+
+    av_packet_unref(&at->in_pkt);                     // drop the packet handed over by the previous call
+    av_packet_move_ref(&at->in_pkt, &at->new_in_pkt); // promote the queued packet; in_pkt stays referenced after return
+
+    if (!at->in_pkt.data) { // nothing queued
+        *nb_packets = 0;
+        return 1; // ffat_decode() accepts status 1 from the converter as non-fatal
+    }
+
+    data->mNumberBuffers              = 1;
+    data->mBuffers[0].mNumberChannels = 0;
+    data->mBuffers[0].mDataByteSize   = at->in_pkt.size;
+    data->mBuffers[0].mData           = at->in_pkt.data;
+    *nb_packets = 1;
+
+    if (packets) {
+        *packets = &at->pkt_desc;
+        at->pkt_desc.mDataByteSize = at->in_pkt.size;
+    }
+
+    return 0;
+}
+
+#define COPY_SAMPLES(type) \
+    type *in_ptr = (type*)at->decoded_data; \
+    type *end_ptr = in_ptr + frame->nb_samples * avctx->channels; \
+    type *out_ptr = (type*)frame->data[0]; \
+    for (; in_ptr < end_ptr; in_ptr += avctx->channels, out_ptr += avctx->channels) { \
+        int c; \
+        for (c = 0; c < avctx->channels; c++) \
+            out_ptr[c] = in_ptr[at->channel_map[c]]; \
+    }
+
+static void ffat_copy_samples(AVCodecContext *avctx, AVFrame *frame)
+{
+    ATDecodeContext *at = avctx->priv_data;
+    if (avctx->sample_fmt == AV_SAMPLE_FMT_S32) {
+        COPY_SAMPLES(int32_t);
+    } else {
+        COPY_SAMPLES(int16_t);
+    }
+}
+
+static int ffat_decode(AVCodecContext *avctx, void *data,
+                       int *got_frame_ptr, AVPacket *avpkt)
+{ // Queues avpkt for ffat_decode_callback() and asks the converter for up to frame_size samples.
+    ATDecodeContext *at = avctx->priv_data;
+    AVFrame *frame = data;
+    int pkt_size = avpkt->size;
+    AVPacket filtered_packet = {0};
+    OSStatus ret;
+    AudioBufferList out_buffers;
+
+    if (avctx->codec_id == AV_CODEC_ID_AAC && avpkt->size > 2 &&
+        (AV_RB16(avpkt->data) & 0xfff0) == 0xfff0) { // top 12 bits set: run the packet through aac_adtstoasc
+        AVPacket filter_pkt = {0};
+        if (!at->bsf) { // the filter is created on first use
+            const AVBitStreamFilter *bsf = av_bsf_get_by_name("aac_adtstoasc");
+            if(!bsf)
+                return AVERROR_BSF_NOT_FOUND;
+            if ((ret = av_bsf_alloc(bsf, &at->bsf)))
+                return ret;
+            if (((ret = avcodec_parameters_from_context(at->bsf->par_in, avctx)) < 0) ||
+                ((ret = av_bsf_init(at->bsf)) < 0)) {
+                av_bsf_free(&at->bsf);
+                return ret;
+            }
+        }
+
+        if ((ret = av_packet_ref(&filter_pkt, avpkt)) < 0)
+            return ret;
+
+        if ((ret = av_bsf_send_packet(at->bsf, &filter_pkt)) < 0) {
+            av_packet_unref(&filter_pkt);
+            return ret;
+        }
+
+        if ((ret = av_bsf_receive_packet(at->bsf, &filtered_packet)) < 0)
+            return ret;
+
+        at->extradata = at->bsf->par_out->extradata; // the magic cookie is built from the filter's extradata
+        at->extradata_size = at->bsf->par_out->extradata_size;
+
+        avpkt = &filtered_packet; // from here on the filtered packet is the input
+    }
+
+    if (!at->converter) { // not created at init time: create it now, with the packet available for header parsing
+        if ((ret = ffat_create_decoder(avctx, avpkt)) < 0) {
+            av_packet_unref(&filtered_packet);
+            return ret;
+        }
+    }
+
+    out_buffers = (AudioBufferList){
+        .mNumberBuffers = 1,
+        .mBuffers = {
+            {
+                .mNumberChannels = avctx->channels,
+                .mDataByteSize = av_get_bytes_per_sample(avctx->sample_fmt) * avctx->frame_size
+                                 * avctx->channels,
+            }
+        }
+    };
+
+    av_packet_unref(&at->new_in_pkt);
+
+    if (avpkt->size) {
+        if (filtered_packet.data) {
+            at->new_in_pkt = filtered_packet; // struct copy: the filtered packet's reference moves to new_in_pkt
+        } else if ((ret = av_packet_ref(&at->new_in_pkt, avpkt)) < 0) {
+            return ret;
+        }
+    } else {
+        at->eof = 1; // empty packet: the callback reports end of input from now on
+    }
+
+    frame->sample_rate = avctx->sample_rate;
+
+    frame->nb_samples = avctx->frame_size; // passed by address below and re-read afterwards
+
+    out_buffers.mBuffers[0].mData = at->decoded_data;
+
+    ret = AudioConverterFillComplexBuffer(at->converter, ffat_decode_callback, avctx,
+                                          &frame->nb_samples, &out_buffers, NULL);
+    if ((!ret || ret == 1) && frame->nb_samples) { // 1 is the callback's own "no input queued" status
+        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+            return ret;
+        ffat_copy_samples(avctx, frame);
+        *got_frame_ptr = 1;
+        if (at->last_pts != AV_NOPTS_VALUE) { // output carries the remembered pts, not this packet's
+            frame->pkt_pts = at->last_pts;
+            at->last_pts = avpkt->pts;
+        }
+    } else if (ret && ret != 1) {
+        av_log(avctx, AV_LOG_WARNING, "Decode error: %i\n", ret);
+    } else { // no samples produced: remember this packet's pts for a later frame
+        at->last_pts = avpkt->pts;
+    }
+
+    return pkt_size; // the size of the packet as passed in is always reported as consumed
+}
+
+static av_cold void ffat_decode_flush(AVCodecContext *avctx)
+{
+    ATDecodeContext *ctx = avctx->priv_data;
+    AudioConverterReset(ctx->converter); /* reset the converter before releasing its input packets */
+    av_packet_unref(&ctx->in_pkt);       /* packet last handed to the converter */
+    av_packet_unref(&ctx->new_in_pkt);   /* packet queued but not yet consumed */
+}
+
+static av_cold int ffat_close_decoder(AVCodecContext *avctx)
+{
+    ATDecodeContext *ctx = avctx->priv_data;
+    AudioConverterDispose(ctx->converter); /* the converter goes first, then what it was fed from */
+    av_packet_unref(&ctx->in_pkt);
+    av_packet_unref(&ctx->new_in_pkt);
+    av_free(ctx->decoded_data);
+    av_bsf_free(&ctx->bsf);
+    return 0;
+}
+
+#define FFAT_DEC_CLASS(NAME) /* defines AVClass ffat_<NAME>_dec_class, named "at_<NAME>_dec" */ \
+    static const AVClass ffat_##NAME##_dec_class = { \
+        .class_name = "at_" #NAME "_dec", \
+        .version    = LIBAVUTIL_VERSION_INT, \
+    };
+
+#define FFAT_DEC(NAME, ID) /* defines AVCodec ff_<NAME>_at_decoder, named "<NAME>_at", for codec ID */ \
+    FFAT_DEC_CLASS(NAME) \
+    AVCodec ff_##NAME##_at_decoder = { \
+        .name           = #NAME "_at", \
+        .long_name      = NULL_IF_CONFIG_SMALL(#NAME " (AudioToolbox)"), \
+        .type           = AVMEDIA_TYPE_AUDIO, \
+        .id             = ID, \
+        .priv_data_size = sizeof(ATDecodeContext), \
+        .init           = ffat_init_decoder, /* every instance shares the same ffat_* callbacks */ \
+        .close          = ffat_close_decoder, \
+        .decode         = ffat_decode, \
+        .flush          = ffat_decode_flush, \
+        .priv_class     = &ffat_##NAME##_dec_class, \
+        .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY, \
+        .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE, \
+    };
+
+FFAT_DEC(aac,          AV_CODEC_ID_AAC) // one AVCodec per codec ID handled by the decoder's ffat_get_format_id()
+FFAT_DEC(ac3,          AV_CODEC_ID_AC3)
+FFAT_DEC(adpcm_ima_qt, AV_CODEC_ID_ADPCM_IMA_QT)
+FFAT_DEC(alac,         AV_CODEC_ID_ALAC)
+FFAT_DEC(amr_nb,       AV_CODEC_ID_AMR_NB)
+FFAT_DEC(eac3,         AV_CODEC_ID_EAC3)
+FFAT_DEC(gsm_ms,       AV_CODEC_ID_GSM_MS)
+FFAT_DEC(ilbc,         AV_CODEC_ID_ILBC)
+FFAT_DEC(mp1,          AV_CODEC_ID_MP1)
+FFAT_DEC(mp2,          AV_CODEC_ID_MP2)
+FFAT_DEC(mp3,          AV_CODEC_ID_MP3)
+FFAT_DEC(pcm_alaw,     AV_CODEC_ID_PCM_ALAW)
+FFAT_DEC(pcm_mulaw,    AV_CODEC_ID_PCM_MULAW)
+FFAT_DEC(qdmc,         AV_CODEC_ID_QDMC)
+FFAT_DEC(qdm2,         AV_CODEC_ID_QDM2)
diff --git a/libavcodec/audiotoolboxenc.c b/libavcodec/audiotoolboxenc.c
new file mode 100644
index 0000000..c47fbd1
--- /dev/null
+++ b/libavcodec/audiotoolboxenc.c
@@ -0,0 +1,645 @@
+/*
+ * Audio Toolbox system codecs
+ *
+ * copyright (c) 2016 Rodger Combs
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <AudioToolbox/AudioToolbox.h>
+
+#define FF_BUFQUEUE_SIZE 256
+#include "libavfilter/bufferqueue.h"
+
+#include "config.h"
+#include "audio_frame_queue.h"
+#include "avcodec.h"
+#include "bytestream.h"
+#include "internal.h"
+#include "libavformat/isom.h"
+#include "libavutil/avassert.h"
+#include "libavutil/opt.h"
+#include "libavutil/log.h"
+
+typedef struct ATDecodeContext { // NOTE(review): used as the encoder's priv_data despite the "Decode" name
+    AVClass *av_class;
+    int mode;
+    int quality;
+
+    AudioConverterRef converter;          // created by AudioConverterNew() in ffat_init_encoder()
+    struct FFBufQueue frame_queue;
+    struct FFBufQueue used_frame_queue;
+
+    unsigned pkt_size;                    // maximum output packet size from the converter; 1024 * 50 when it is 0
+    AudioFrameQueue afq;
+    int eof;
+    int frame_size;                       // avctx->frame_size as captured in ffat_update_ctx() before the PCM x1024 scaling
+} ATDecodeContext;
+
+/*
+ * Map a codec ID (and, for AAC only, the requested profile) to the
+ * AudioToolbox format ID of the encoder's output. AAC profiles without
+ * a dedicated entry, FF_PROFILE_AAC_LOW included, select plain MPEG-4 AAC.
+ * Any other unlisted codec ID trips av_assert0().
+ */
+static UInt32 ffat_get_format_id(enum AVCodecID codec, int profile)
+{
+    if (codec == AV_CODEC_ID_AAC) {
+        if (profile == FF_PROFILE_AAC_HE)
+            return kAudioFormatMPEG4AAC_HE;
+        if (profile == FF_PROFILE_AAC_HE_V2)
+            return kAudioFormatMPEG4AAC_HE_V2;
+        if (profile == FF_PROFILE_AAC_LD)
+            return kAudioFormatMPEG4AAC_LD;
+        if (profile == FF_PROFILE_AAC_ELD)
+            return kAudioFormatMPEG4AAC_ELD;
+        return kAudioFormatMPEG4AAC;
+    }
+
+    /* The remaining codecs ignore the profile. */
+    switch (codec) {
+    case AV_CODEC_ID_ADPCM_IMA_QT: return kAudioFormatAppleIMA4;
+    case AV_CODEC_ID_ALAC:         return kAudioFormatAppleLossless;
+    case AV_CODEC_ID_ILBC:         return kAudioFormatiLBC;
+    case AV_CODEC_ID_PCM_ALAW:     return kAudioFormatALaw;
+    case AV_CODEC_ID_PCM_MULAW:    return kAudioFormatULaw;
+    default:
+        av_assert0(!"Invalid codec ID!");
+        return 0;
+    }
+}
+
+/*
+ * Refresh at->pkt_size, avctx->initial_padding, avctx->frame_size and (iLBC only)
+ * avctx->block_align from the converter, then scale the PCM a-law/mu-law sizes by 1024.
+ */
+static void ffat_update_ctx(AVCodecContext *avctx)
+{
+    ATDecodeContext *at = avctx->priv_data;
+    AudioStreamBasicDescription desc;
+    AudioConverterPrimeInfo priming;
+    UInt32 prop_size;
+    int is_pcm = avctx->codec_id == AV_CODEC_ID_PCM_MULAW ||
+                 avctx->codec_id == AV_CODEC_ID_PCM_ALAW;
+
+    prop_size = sizeof(unsigned);
+    AudioConverterGetProperty(at->converter, kAudioConverterPropertyMaximumOutputPacketSize,
+                              &prop_size, &at->pkt_size);
+    if (!at->pkt_size) /* fallback when no maximum packet size is available */
+        at->pkt_size = 1024 * 50;
+
+    prop_size = sizeof(priming);
+    if (!AudioConverterGetProperty(at->converter, kAudioConverterPrimeInfo,
+                                   &prop_size, &priming))
+        avctx->initial_padding = priming.leadingFrames;
+
+    prop_size = sizeof(desc);
+    if (!AudioConverterGetProperty(at->converter, kAudioConverterCurrentOutputStreamDescription,
+                                   &prop_size, &desc)) {
+        if (desc.mFramesPerPacket)
+            avctx->frame_size = desc.mFramesPerPacket;
+        if (avctx->codec_id == AV_CODEC_ID_ILBC && desc.mBytesPerPacket)
+            avctx->block_align = desc.mBytesPerPacket;
+    }
+
+    at->frame_size = avctx->frame_size; /* captured before the PCM scaling below */
+    if (is_pcm) {
+        at->pkt_size      *= 1024;
+        avctx->frame_size *= 1024;
+    }
+}
+
+/* Read one descriptor header: the tag byte goes to *tag, the return value is the size
+ * assembled from up to four following bytes of 7 bits each (bit 7 = more bytes follow). */
+static int read_descr(GetByteContext *gb, int *tag)
+{
+    int size = 0, left = 4, byte;
+
+    *tag = bytestream2_get_byte(gb);
+    do {
+        byte = bytestream2_get_byte(gb);
+        size = (size << 7) | (byte & 0x7f);
+    } while ((byte & 0x80) && --left);
+    return size;
+}
+
+/* Choose the iLBC mode (20 or 30): block_align 38 / 50 decides directly,
+ * otherwise bit rates above 14000 select 20 and everything else 30. */
+static int get_ilbc_mode(AVCodecContext *avctx)
+{
+    switch (avctx->block_align) {
+    case 38: return 20;
+    case 50: return 30;
+    default: break;
+    }
+    return avctx->bit_rate > 14000 ? 20 : 30;
+}
+
+/* Map bit index `channel` of an FFmpeg channel mask to an AudioChannelLabel; -1 if there is none. */
+static av_cold int get_channel_label(int channel)
+{
+    uint64_t map = UINT64_C(1) << channel; // 64-bit shift: `1 << channel` is undefined from bit 31 on
+    if (map <= AV_CH_LOW_FREQUENCY)
+        return channel + 1;
+    else if (map <= AV_CH_BACK_RIGHT)
+        return channel + 29;
+    else if (map <= AV_CH_BACK_CENTER)
+        return channel - 1;
+    else if (map <= AV_CH_SIDE_RIGHT)
+        return channel - 4;
+    else if (map <= AV_CH_TOP_BACK_RIGHT)
+        return channel + 1;
+    else if (map <= AV_CH_STEREO_RIGHT) // bits above TOP_BACK_RIGHT up to STEREO_RIGHT have no label
+        return -1;
+    else if (map <= AV_CH_WIDE_RIGHT)
+        return channel + 4;
+    else if (map <= AV_CH_SURROUND_DIRECT_RIGHT)
+        return channel - 23;
+    else if (map == AV_CH_LOW_FREQUENCY_2)
+        return kAudioChannelLabel_LFE2;
+    return -1;
+}
+
+/* Fill layout with one channel description per set bit of in_layout (lowest bit first), `count` in total. */
+static int remap_layout(AudioChannelLayout *layout, uint64_t in_layout, int count)
+{
+    int i, c = 0, label;
+    layout->mChannelLayoutTag = kAudioChannelLayoutTag_UseChannelDescriptions;
+    layout->mNumberChannelDescriptions = count;
+    for (i = 0; i < count; i++) {
+        // Bound check first, then a 64-bit shift: `1 << c` is undefined from c == 31 on.
+        while (c < 64 && !(in_layout & (UINT64_C(1) << c)))
+            c++;
+        if (c >= 64)
+            return AVERROR(EINVAL); // in_layout has fewer than count bits set
+        label = get_channel_label(c);
+        layout->mChannelDescriptions[i].mChannelLabel = label;
+        if (label < 0)
+            return AVERROR(EINVAL); // this channel has no AudioToolbox label
+        c++;
+    }
+    return 0;
+}
+
+/*
+ * Look up the AudioToolbox channel layout tag that ffat_init_encoder()
+ * applies to AAC output for the given FFmpeg channel layout mask.
+ * Returns 0 when the mask has no entry in the table below.
+ */
+static int get_aac_tag(uint64_t in_layout)
+{
+    static const struct {
+        uint64_t mask;
+        int tag;
+    } aac_tags[] = {
+        { AV_CH_LAYOUT_MONO,              kAudioChannelLayoutTag_Mono             },
+        { AV_CH_LAYOUT_STEREO,            kAudioChannelLayoutTag_Stereo           },
+        { AV_CH_LAYOUT_QUAD,              kAudioChannelLayoutTag_AAC_Quadraphonic },
+        { AV_CH_LAYOUT_OCTAGONAL,         kAudioChannelLayoutTag_AAC_Octagonal    },
+        { AV_CH_LAYOUT_SURROUND,          kAudioChannelLayoutTag_AAC_3_0          },
+        { AV_CH_LAYOUT_4POINT0,           kAudioChannelLayoutTag_AAC_4_0          },
+        { AV_CH_LAYOUT_5POINT0,           kAudioChannelLayoutTag_AAC_5_0          },
+        { AV_CH_LAYOUT_5POINT1,           kAudioChannelLayoutTag_AAC_5_1          },
+        { AV_CH_LAYOUT_6POINT0,           kAudioChannelLayoutTag_AAC_6_0          },
+        { AV_CH_LAYOUT_6POINT1,           kAudioChannelLayoutTag_AAC_6_1          },
+        { AV_CH_LAYOUT_7POINT0,           kAudioChannelLayoutTag_AAC_7_0          },
+        { AV_CH_LAYOUT_7POINT1_WIDE_BACK, kAudioChannelLayoutTag_AAC_7_1          },
+        { AV_CH_LAYOUT_7POINT1,           kAudioChannelLayoutTag_MPEG_7_1_C       },
+    };
+    size_t n;
+
+    for (n = 0; n < sizeof(aac_tags) / sizeof(aac_tags[0]); n++)
+        if (aac_tags[n].mask == in_layout)
+            return aac_tags[n].tag;
+
+    return 0;
+}
+
+static av_cold int ffat_init_encoder(AVCodecContext *avctx)
+{
+    ATDecodeContext *at = avctx->priv_data;
+    OSStatus status;
+    /* Encoder init: create the PCM -> codec AudioConverter, then configure it. */
+    AudioStreamBasicDescription in_format = {
+        .mSampleRate = avctx->sample_rate,
+        .mFormatID = kAudioFormatLinearPCM,
+        .mFormatFlags = ((avctx->sample_fmt == AV_SAMPLE_FMT_FLT ||
+                          avctx->sample_fmt == AV_SAMPLE_FMT_DBL) ? kAudioFormatFlagIsFloat
+                        : avctx->sample_fmt == AV_SAMPLE_FMT_U8 ? 0 /* U8: neither float nor signed */
+                        : kAudioFormatFlagIsSignedInteger)
+                        | kAudioFormatFlagIsPacked,
+        .mBytesPerPacket = av_get_bytes_per_sample(avctx->sample_fmt) * avctx->channels,
+        .mFramesPerPacket = 1,
+        .mBytesPerFrame = av_get_bytes_per_sample(avctx->sample_fmt) * avctx->channels,
+        .mChannelsPerFrame = avctx->channels,
+        .mBitsPerChannel = av_get_bytes_per_sample(avctx->sample_fmt) * 8,
+    };
+    AudioStreamBasicDescription out_format = { /* fields not listed start as zero */
+        .mSampleRate = avctx->sample_rate,
+        .mFormatID = ffat_get_format_id(avctx->codec_id, avctx->profile),
+        .mChannelsPerFrame = in_format.mChannelsPerFrame,
+    };
+    UInt32 layout_size = sizeof(AudioChannelLayout) +
+                         sizeof(AudioChannelDescription) * avctx->channels;
+    AudioChannelLayout *channel_layout = av_malloc(layout_size);
+
+    if (!channel_layout)
+        return AVERROR(ENOMEM);
+    /* iLBC: mode is presumably the frame length in ms (20 -> 38-byte packets, else 50). */
+    if (avctx->codec_id == AV_CODEC_ID_ILBC) {
+        int mode = get_ilbc_mode(avctx);
+        out_format.mFramesPerPacket  = 8000 * mode / 1000;
+        out_format.mBytesPerPacket   = (mode == 20 ? 38 : 50);
+    }
+
+    status = AudioConverterNew(&in_format, &out_format, &at->converter);
+
+    if (status != 0) {
+        av_log(avctx, AV_LOG_ERROR, "AudioToolbox init error: %i\n", (int)status);
+        av_free(channel_layout);
+        return AVERROR_UNKNOWN;
+    }
+    /* No layout supplied: fall back to the default one for this channel count. */
+    if (!avctx->channel_layout)
+        avctx->channel_layout = av_get_default_channel_layout(avctx->channels);
+
+    if ((status = remap_layout(channel_layout, avctx->channel_layout, avctx->channels)) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid channel layout\n");
+        av_free(channel_layout);
+        return status;
+    }
+
+    if (AudioConverterSetProperty(at->converter, kAudioConverterInputChannelLayout,
+                                  layout_size, channel_layout)) {
+        av_log(avctx, AV_LOG_ERROR, "Unsupported input channel layout\n");
+        av_free(channel_layout);
+        return AVERROR(EINVAL);
+    }
+    if (avctx->codec_id == AV_CODEC_ID_AAC) {
+        int tag = get_aac_tag(avctx->channel_layout);
+        if (tag) { /* known AAC layout: use the tag instead of per-channel descriptions */
+            channel_layout->mChannelLayoutTag = tag;
+            channel_layout->mNumberChannelDescriptions = 0;
+        }
+    }
+    if (AudioConverterSetProperty(at->converter, kAudioConverterOutputChannelLayout,
+                                  layout_size, channel_layout)) {
+        av_log(avctx, AV_LOG_ERROR, "Unsupported output channel layout\n");
+        av_free(channel_layout);
+        return AVERROR(EINVAL);
+    }
+    av_free(channel_layout);
+    /* Return values of the property setters below are not checked. */
+    if (avctx->bits_per_raw_sample)
+        AudioConverterSetProperty(at->converter,
+                                  kAudioConverterPropertyBitDepthHint,
+                                  sizeof(avctx->bits_per_raw_sample),
+                                  &avctx->bits_per_raw_sample);
+
+#if !TARGET_OS_IPHONE
+    if (at->mode == -1) /* -1 is the "auto" option value */
+        at->mode = (avctx->flags & AV_CODEC_FLAG_QSCALE) ?
+                   kAudioCodecBitRateControlMode_Variable :
+                   kAudioCodecBitRateControlMode_Constant;
+
+    AudioConverterSetProperty(at->converter, kAudioCodecPropertyBitRateControlMode,
+                              sizeof(at->mode), &at->mode);
+    /* VBR: global_quality / FF_QP2LAMBDA is clamped to 0..14, then mapped via 127 - q * 9. */
+    if (at->mode == kAudioCodecBitRateControlMode_Variable) {
+        int q = avctx->global_quality / FF_QP2LAMBDA;
+        if (q < 0 || q > 14) {
+            av_log(avctx, AV_LOG_WARNING,
+                   "VBR quality %d out of range, should be 0-14\n", q);
+            q = av_clip(q, 0, 14);
+        }
+        q = 127 - q * 9;
+        AudioConverterSetProperty(at->converter, kAudioCodecPropertySoundQualityForVBR,
+                                  sizeof(q), &q);
+    } else
+#endif
+    if (avctx->bit_rate > 0) { /* doubles as the "else" of the VBR test when that code is compiled */
+        UInt32 rate = avctx->bit_rate;
+        UInt32 size;
+        status = AudioConverterGetPropertyInfo(at->converter,
+                                               kAudioConverterApplicableEncodeBitRates,
+                                               &size, NULL);
+        if (!status && size) {
+            UInt32 new_rate = rate;
+            int count;
+            int i;
+            AudioValueRange *ranges = av_malloc(size);
+            if (!ranges)
+                return AVERROR(ENOMEM);
+            AudioConverterGetProperty(at->converter,
+                                      kAudioConverterApplicableEncodeBitRates,
+                                      &size, ranges);
+            count = size / sizeof(AudioValueRange);
+            for (i = 0; i < count; i++) { /* snap rate to an allowed range; presumably sorted ascending */
+                AudioValueRange *range = &ranges[i];
+                if (rate >= range->mMinimum && rate <= range->mMaximum) {
+                    new_rate = rate;
+                    break;
+                } else if (rate > range->mMaximum) {
+                    new_rate = range->mMaximum;
+                } else {
+                    new_rate = range->mMinimum;
+                    break;
+                }
+            }
+            if (new_rate != rate) {
+                av_log(avctx, AV_LOG_WARNING,
+                       "Bitrate %u not allowed; changing to %u\n", rate, new_rate);
+                rate = new_rate;
+            }
+            av_free(ranges);
+        }
+        AudioConverterSetProperty(at->converter, kAudioConverterEncodeBitRate,
+                                  sizeof(rate), &rate);
+    }
+    /* Map the quality option (0..2 per the option table) to 96, 64 or 32. */
+    at->quality = 96 - at->quality * 32;
+    AudioConverterSetProperty(at->converter, kAudioConverterCodecQuality,
+                              sizeof(at->quality), &at->quality);
+    /* Use the converter's magic cookie, if it reports one, as extradata. */
+    if (!AudioConverterGetPropertyInfo(at->converter, kAudioConverterCompressionMagicCookie,
+                                       &avctx->extradata_size, NULL) &&
+        avctx->extradata_size) {
+        int extradata_size = avctx->extradata_size;
+        uint8_t *extradata;
+        if (!(avctx->extradata = av_mallocz(avctx->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE)))
+            return AVERROR(ENOMEM);
+        if (avctx->codec_id == AV_CODEC_ID_ALAC) { /* cookie is stored after a 12-byte prefix */
+            avctx->extradata_size = 0x24;
+            AV_WB32(avctx->extradata,     0x24);
+            AV_WB32(avctx->extradata + 4, MKBETAG('a','l','a','c'));
+            extradata = avctx->extradata + 12;
+            avctx->extradata_size = 0x24;
+        } else {
+            extradata = avctx->extradata;
+        }
+        status = AudioConverterGetProperty(at->converter,
+                                           kAudioConverterCompressionMagicCookie,
+                                           &extradata_size, extradata);
+        if (status != 0) {
+            av_log(avctx, AV_LOG_ERROR, "AudioToolbox cookie error: %i\n", (int)status);
+            return AVERROR_UNKNOWN;
+        } else if (avctx->codec_id == AV_CODEC_ID_AAC) { /* keep only the MP4DecSpecificDescrTag payload */
+            GetByteContext gb;
+            int tag, len;
+            bytestream2_init(&gb, extradata, extradata_size);
+            do {
+                len = read_descr(&gb, &tag);
+                if (tag == MP4DecConfigDescrTag) {
+                    bytestream2_skip(&gb, 13);
+                    len = read_descr(&gb, &tag);
+                    if (tag == MP4DecSpecificDescrTag) {
+                        len = FFMIN(gb.buffer_end - gb.buffer, len);
+                        memmove(extradata, gb.buffer, len);
+                        avctx->extradata_size = len;
+                        break;
+                    }
+                } else if (tag == MP4ESDescrTag) {
+                    int flags;
+                    bytestream2_skip(&gb, 2);
+                    flags = bytestream2_get_byte(&gb);
+                    if (flags & 0x80) //streamDependenceFlag
+                        bytestream2_skip(&gb, 2);
+                    if (flags & 0x40) //URL_Flag
+                        bytestream2_skip(&gb, bytestream2_get_byte(&gb));
+                    if (flags & 0x20) //OCRstreamFlag
+                        bytestream2_skip(&gb, 2);
+                }
+            } while (bytestream2_get_bytes_left(&gb));
+        } else if (avctx->codec_id != AV_CODEC_ID_ALAC) {
+            avctx->extradata_size = extradata_size;
+        }
+    }
+
+    ffat_update_ctx(avctx);
+    /* Optional VBR packet size limit derived from rc_max_rate. */
+#if !TARGET_OS_IPHONE && defined(__MAC_10_9)
+    if (at->mode == kAudioCodecBitRateControlMode_Variable && avctx->rc_max_rate) {
+        UInt32 max_size = avctx->rc_max_rate * avctx->frame_size / avctx->sample_rate;
+        if (max_size)
+            AudioConverterSetProperty(at->converter, kAudioCodecPropertyPacketSizeLimitForVBR,
+                                      sizeof(max_size), &max_size);
+    }
+#endif
+
+    ff_af_queue_init(avctx, &at->afq);
+
+    return 0;
+}
+
+static OSStatus ffat_encode_callback(AudioConverterRef converter, UInt32 *nb_packets,
+                                     AudioBufferList *data,
+                                     AudioStreamPacketDescription **packets,
+                                     void *inctx)
+{
+    AVCodecContext *avctx = inctx;
+    ATDecodeContext *at = avctx->priv_data;
+    AudioBuffer *buf = &data->mBuffers[0];
+    AVFrame *in;
+
+    /*
+     * Input callback of AudioConverterFillComplexBuffer(): one queued frame per
+     * call; an empty queue yields 0 packets and 0 (EOF flagged) or 1 (not yet).
+     */
+    if (!at->frame_queue.available) {
+        *nb_packets = 0;
+        return at->eof ? 0 : 1;
+    }
+
+    in = ff_bufqueue_get(&at->frame_queue);
+
+    data->mNumberBuffers = 1;
+    buf->mNumberChannels = avctx->channels;
+    buf->mDataByteSize   = in->nb_samples *
+                           av_get_bytes_per_sample(avctx->sample_fmt) *
+                           avctx->channels;
+    buf->mData           = in->data[0];
+    if (*nb_packets > in->nb_samples)
+        *nb_packets = in->nb_samples;
+
+    /* Parked until ffat_encode() discards it after the converter call. */
+    ff_bufqueue_add(avctx, &at->used_frame_queue, in);
+    return 0;
+}
+
+static int ffat_encode(AVCodecContext *avctx, AVPacket *avpkt,
+                       const AVFrame *frame, int *got_packet_ptr)
+{
+    ATDecodeContext *at = avctx->priv_data;
+    OSStatus ret;
+
+    AudioBufferList out_buffers = {
+        .mNumberBuffers = 1,
+        .mBuffers = {
+            {
+                .mNumberChannels = avctx->channels,
+                .mDataByteSize = at->pkt_size,
+            }
+        }
+    };
+    AudioStreamPacketDescription out_pkt_desc = {0};
+    /* Queue a reference to the input (a NULL frame flags EOF), then run the converter. */
+    if (frame) {
+        AVFrame *in_frame;
+
+        if (ff_bufqueue_is_full(&at->frame_queue)) {
+            /*
+             * The frame queue is significantly larger than needed in practice,
+             * but no clear way to determine the minimum number of samples to
+             * get output from AudioConverterFillComplexBuffer().
+             */
+            av_log(avctx, AV_LOG_ERROR, "Bug: frame queue is too small.\n");
+            return AVERROR_BUG;
+        }
+
+        if ((ret = ff_af_queue_add(&at->afq, frame)) < 0)
+            return ret;
+
+        in_frame = av_frame_clone(frame);
+        if (!in_frame)
+            return AVERROR(ENOMEM);
+        ff_bufqueue_add(avctx, &at->frame_queue, in_frame);
+    } else {
+        at->eof = 1;
+    }
+
+    if ((ret = ff_alloc_packet2(avctx, avpkt, at->pkt_size, 0)) < 0)
+        return ret;
+
+    out_buffers.mBuffers[0].mData = avpkt->data;
+
+    /* In: packets requested from the converter; out: packets it produced. */
+    *got_packet_ptr = avctx->frame_size / at->frame_size;
+    ret = AudioConverterFillComplexBuffer(at->converter, ffat_encode_callback, avctx,
+                                          got_packet_ptr, &out_buffers,
+                                          (avctx->frame_size > at->frame_size) ? NULL : &out_pkt_desc);
+
+    ff_bufqueue_discard_all(&at->used_frame_queue);
+
+    if (ret && ret != 1) { /* 1 only means the callback had no input queued */
+        av_log(avctx, AV_LOG_ERROR, "Encode error: %i\n", (int)ret);
+        *got_packet_ptr = 0; /* never hand out the unwritten packet buffer */
+        return AVERROR_EXTERNAL;
+    } else if (*got_packet_ptr) {
+        avpkt->size = out_buffers.mBuffers[0].mDataByteSize;
+        ff_af_queue_remove(&at->afq, out_pkt_desc.mVariableFramesInPacket ?
+                                     out_pkt_desc.mVariableFramesInPacket :
+                                     avctx->frame_size,
+                           &avpkt->pts,
+                           &avpkt->duration);
+    }
+
+    return 0; /* *got_packet_ptr tells the caller whether avpkt was filled */
+}
+
+static av_cold void ffat_encode_flush(AVCodecContext *avctx)
+{
+    ATDecodeContext *enc = avctx->priv_data;
+    AudioConverterReset(enc->converter);             /* reset the converter itself */
+    ff_bufqueue_discard_all(&enc->frame_queue);      /* input frames not yet consumed */
+    ff_bufqueue_discard_all(&enc->used_frame_queue); /* frames already given to the converter */
+}
+
+static av_cold int ffat_close_encoder(AVCodecContext *avctx)
+{
+    ATDecodeContext *enc = avctx->priv_data;
+    AudioConverterDispose(enc->converter);           /* converter created at init */
+    ff_bufqueue_discard_all(&enc->frame_queue);      /* input frames not yet consumed */
+    ff_bufqueue_discard_all(&enc->used_frame_queue); /* frames already given to the converter */
+    ff_af_queue_close(&enc->afq);
+    return 0;
+}
+
+static const AVProfile aac_profiles[] = { /* passed as PROFILES to FFAT_ENC(aac, ...) */
+    { FF_PROFILE_AAC_LOW,   "LC"       },
+    { FF_PROFILE_AAC_HE,    "HE-AAC"   },
+    { FF_PROFILE_AAC_HE_V2, "HE-AACv2" },
+    { FF_PROFILE_AAC_LD,    "LD"       },
+    { FF_PROFILE_AAC_ELD,   "ELD"      },
+    { FF_PROFILE_UNKNOWN }, /* list terminator */
+};
+
+#define AE (AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_ENCODING_PARAM) /* flags of every option below */
+static const AVOption options[] = {
+#if !TARGET_OS_IPHONE /* the rate control mode option is compiled out otherwise */
+    {"aac_at_mode", "ratecontrol mode", offsetof(ATDecodeContext, mode), AV_OPT_TYPE_INT, {.i64 = -1}, -1, kAudioCodecBitRateControlMode_Variable, AE, "mode"},
+        {"auto", "VBR if global quality is given; CBR otherwise", 0, AV_OPT_TYPE_CONST, {.i64 = -1}, INT_MIN, INT_MAX, AE, "mode"},
+        {"cbr",  "constant bitrate", 0, AV_OPT_TYPE_CONST, {.i64 = kAudioCodecBitRateControlMode_Constant}, INT_MIN, INT_MAX, AE, "mode"},
+        {"abr",  "long-term average bitrate", 0, AV_OPT_TYPE_CONST, {.i64 = kAudioCodecBitRateControlMode_LongTermAverage}, INT_MIN, INT_MAX, AE, "mode"},
+        {"cvbr", "constrained variable bitrate", 0, AV_OPT_TYPE_CONST, {.i64 = kAudioCodecBitRateControlMode_VariableConstrained}, INT_MIN, INT_MAX, AE, "mode"},
+        {"vbr" , "variable bitrate", 0, AV_OPT_TYPE_CONST, {.i64 = kAudioCodecBitRateControlMode_Variable}, INT_MIN, INT_MAX, AE, "mode"},
+#endif
+    {"aac_at_quality", "quality vs speed control", offsetof(ATDecodeContext, quality), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 2, AE}, /* remapped in ffat_init_encoder() */
+    { NULL },
+};
+
+#define FFAT_ENC_CLASS(NAME) /* emits the AVClass ffat_<NAME>_enc_class */ \
+    static const AVClass ffat_##NAME##_enc_class = { \
+        .class_name = "at_" #NAME "_enc", \
+        .item_name  = av_default_item_name, \
+        .option     = options, /* one option table shared by every encoder */ \
+        .version    = LIBAVUTIL_VERSION_INT, \
+    };
+
+#define FFAT_ENC(NAME, ID, PROFILES, ...) /* defines ff_<NAME>_at_encoder plus its AVClass */ \
+    FFAT_ENC_CLASS(NAME) \
+    AVCodec ff_##NAME##_at_encoder = { \
+        .name           = #NAME "_at", \
+        .long_name      = NULL_IF_CONFIG_SMALL(#NAME " (AudioToolbox)"), \
+        .type           = AVMEDIA_TYPE_AUDIO, \
+        .id             = ID, \
+        .priv_data_size = sizeof(ATDecodeContext), \
+        .init           = ffat_init_encoder, \
+        .close          = ffat_close_encoder, \
+        .encode2        = ffat_encode, \
+        .flush          = ffat_encode_flush, \
+        .priv_class     = &ffat_##NAME##_enc_class, \
+        .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY __VA_ARGS__, /* varargs are pasted verbatim here */ \
+        .sample_fmts    = (const enum AVSampleFormat[]) { \
+            AV_SAMPLE_FMT_S16, \
+            AV_SAMPLE_FMT_U8,  AV_SAMPLE_FMT_NONE \
+        }, \
+        .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE, \
+        .profiles       = PROFILES, \
+    };
+
+static const uint64_t aac_at_channel_layouts[] = { /* .channel_layouts of the aac_at encoder */
+    AV_CH_LAYOUT_MONO,
+    AV_CH_LAYOUT_STEREO,
+    AV_CH_LAYOUT_SURROUND,
+    AV_CH_LAYOUT_4POINT0,
+    AV_CH_LAYOUT_5POINT0,
+    AV_CH_LAYOUT_5POINT1,
+    AV_CH_LAYOUT_6POINT0,
+    AV_CH_LAYOUT_6POINT1,
+    AV_CH_LAYOUT_7POINT0,
+    AV_CH_LAYOUT_7POINT1_WIDE_BACK,
+    AV_CH_LAYOUT_QUAD,
+    AV_CH_LAYOUT_OCTAGONAL,
+    0, /* list terminator */
+};
+
+FFAT_ENC(aac,          AV_CODEC_ID_AAC,          aac_profiles, , .channel_layouts = aac_at_channel_layouts) /* empty first vararg: no extra capability flags */
+//FFAT_ENC(adpcm_ima_qt, AV_CODEC_ID_ADPCM_IMA_QT, NULL)
+FFAT_ENC(alac,         AV_CODEC_ID_ALAC,         NULL, | AV_CODEC_CAP_VARIABLE_FRAME_SIZE | AV_CODEC_CAP_LOSSLESS) /* vararg ORs in extra capability flags */
+FFAT_ENC(ilbc,         AV_CODEC_ID_ILBC,         NULL)
+FFAT_ENC(pcm_alaw,     AV_CODEC_ID_PCM_ALAW,     NULL)
+FFAT_ENC(pcm_mulaw,    AV_CODEC_ID_PCM_MULAW,    NULL)
diff --git a/libavcodec/aura.c b/libavcodec/aura.c
index a1ef6f8..5f84d95 100644
--- a/libavcodec/aura.c
+++ b/libavcodec/aura.c
@@ -1,20 +1,20 @@
 /*
  * Aura 2 decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -59,10 +59,8 @@ static int aura_decode_frame(AVCodecContext *avctx,
     /* pixel data starts 48 bytes in, after 3x16-byte tables */
     buf += 48;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     Y = frame->data[0];
     U = frame->data[1];
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 8e97c97..682aa35 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,6 +33,7 @@
 #include "libavutil/avutil.h"
 #include "libavutil/buffer.h"
 #include "libavutil/cpu.h"
+#include "libavutil/channel_layout.h"
 #include "libavutil/dict.h"
 #include "libavutil/frame.h"
 #include "libavutil/log.h"
@@ -41,11 +42,6 @@
 
 #include "version.h"
 
-#if FF_API_FAST_MALLOC
-// to provide fast_*alloc
-#include "libavutil/mem.h"
-#endif
-
 /**
  * @defgroup libavc Encoding/Decoding Library
  * @{
@@ -158,7 +154,7 @@
  * - The new API does not handle subtitles yet.
  *
  * Mixing new and old function calls on the same AVCodecContext is not allowed,
- * and will result in arbitrary behavior.
+ * and will result in undefined behavior.
  *
  * Some codecs might require using the new API; using the old API will return
  * an error when calling it.
@@ -184,8 +180,8 @@
  * details.
  *
  * If you add a codec ID to this list, add it so that
- * 1. no value of a existing codec ID changes (that would break ABI),
- * 2. it is as close as possible to similar codecs.
+ * 1. no value of an existing codec ID changes (that would break ABI),
+ * 2. it is as close as possible to similar codecs
  *
  * After adding new codec IDs, do not forget to add an entry to the codec
  * descriptor list and bump libavcodec minor version.
@@ -333,7 +329,7 @@ enum AVCodecID {
     AV_CODEC_ID_ANM,
     AV_CODEC_ID_BINKVIDEO,
     AV_CODEC_ID_IFF_ILBM,
-    AV_CODEC_ID_IFF_BYTERUN1,
+#define AV_CODEC_ID_IFF_BYTERUN1 AV_CODEC_ID_IFF_ILBM
     AV_CODEC_ID_KGV1,
     AV_CODEC_ID_YOP,
     AV_CODEC_ID_VP8,
@@ -371,6 +367,7 @@ enum AVCodecID {
     AV_CODEC_ID_WEBP,
     AV_CODEC_ID_HNM4_VIDEO,
     AV_CODEC_ID_HEVC,
+#define AV_CODEC_ID_H265 AV_CODEC_ID_HEVC
     AV_CODEC_ID_FIC,
     AV_CODEC_ID_ALIAS_PIX,
     AV_CODEC_ID_BRENDER_PIX,
@@ -390,6 +387,28 @@ enum AVCodecID {
     AV_CODEC_ID_SCREENPRESSO,
     AV_CODEC_ID_RSCC,
 
+    AV_CODEC_ID_Y41P = 0x8000,
+    AV_CODEC_ID_AVRP,
+    AV_CODEC_ID_012V,
+    AV_CODEC_ID_AVUI,
+    AV_CODEC_ID_AYUV,
+    AV_CODEC_ID_TARGA_Y216,
+    AV_CODEC_ID_V308,
+    AV_CODEC_ID_V408,
+    AV_CODEC_ID_YUV4,
+    AV_CODEC_ID_AVRN,
+    AV_CODEC_ID_CPIA,
+    AV_CODEC_ID_XFACE,
+    AV_CODEC_ID_SNOW,
+    AV_CODEC_ID_SMVJPEG,
+    AV_CODEC_ID_APNG,
+    AV_CODEC_ID_DAALA,
+    AV_CODEC_ID_CFHD,
+    AV_CODEC_ID_TRUEMOTION2RT,
+    AV_CODEC_ID_M101,
+    AV_CODEC_ID_MAGICYUV,
+    AV_CODEC_ID_SHEERVIDEO,
+
     /* various PCM "codecs" */
     AV_CODEC_ID_FIRST_AUDIO = 0x10000,     ///< A dummy id pointing at the start of audio codecs
     AV_CODEC_ID_PCM_S16LE = 0x10000,
@@ -423,6 +442,9 @@ enum AVCodecID {
     AV_CODEC_ID_PCM_S24LE_PLANAR,
     AV_CODEC_ID_PCM_S32LE_PLANAR,
     AV_CODEC_ID_PCM_S16BE_PLANAR,
+    /* New PCM "codecs" should be added right below this line, starting with
+     * an explicit value of, for example, 0x10800.
+     */
 
     /* various ADPCM codecs */
     AV_CODEC_ID_ADPCM_IMA_QT = 0x11000,
@@ -456,6 +478,20 @@ enum AVCodecID {
     AV_CODEC_ID_ADPCM_G722,
     AV_CODEC_ID_ADPCM_IMA_APC,
     AV_CODEC_ID_ADPCM_VIMA,
+#if FF_API_VIMA_DECODER
+    AV_CODEC_ID_VIMA = AV_CODEC_ID_ADPCM_VIMA,
+#endif
+
+    AV_CODEC_ID_ADPCM_AFC = 0x11800,
+    AV_CODEC_ID_ADPCM_IMA_OKI,
+    AV_CODEC_ID_ADPCM_DTK,
+    AV_CODEC_ID_ADPCM_IMA_RAD,
+    AV_CODEC_ID_ADPCM_G726LE,
+    AV_CODEC_ID_ADPCM_THP_LE,
+    AV_CODEC_ID_ADPCM_PSX,
+    AV_CODEC_ID_ADPCM_AICA,
+    AV_CODEC_ID_ADPCM_IMA_DAT4,
+    AV_CODEC_ID_ADPCM_MTAF,
 
     /* AMR */
     AV_CODEC_ID_AMR_NB = 0x12000,
@@ -471,6 +507,8 @@ enum AVCodecID {
     AV_CODEC_ID_XAN_DPCM,
     AV_CODEC_ID_SOL_DPCM,
 
+    AV_CODEC_ID_SDX2_DPCM = 0x14800,
+
     /* audio codecs */
     AV_CODEC_ID_MP2 = 0x15000,
     AV_CODEC_ID_MP3, ///< preferred ID for decoding MPEG audio layer 1, 2 or 3
@@ -543,6 +581,21 @@ enum AVCodecID {
     AV_CODEC_ID_ON2AVC,
     AV_CODEC_ID_DSS_SP,
 
+    AV_CODEC_ID_FFWAVESYNTH = 0x15800,
+    AV_CODEC_ID_SONIC,
+    AV_CODEC_ID_SONIC_LS,
+    AV_CODEC_ID_EVRC,
+    AV_CODEC_ID_SMV,
+    AV_CODEC_ID_DSD_LSBF,
+    AV_CODEC_ID_DSD_MSBF,
+    AV_CODEC_ID_DSD_LSBF_PLANAR,
+    AV_CODEC_ID_DSD_MSBF_PLANAR,
+    AV_CODEC_ID_4GV,
+    AV_CODEC_ID_INTERPLAY_ACM,
+    AV_CODEC_ID_XMA1,
+    AV_CODEC_ID_XMA2,
+    AV_CODEC_ID_DST,
+
     /* subtitle codecs */
     AV_CODEC_ID_FIRST_SUBTITLE = 0x17000,          ///< A dummy ID pointing at the start of subtitle codecs.
     AV_CODEC_ID_DVD_SUBTITLE = 0x17000,
@@ -555,10 +608,36 @@ enum AVCodecID {
     AV_CODEC_ID_DVB_TELETEXT,
     AV_CODEC_ID_SRT,
 
+    AV_CODEC_ID_MICRODVD   = 0x17800,
+    AV_CODEC_ID_EIA_608,
+    AV_CODEC_ID_JACOSUB,
+    AV_CODEC_ID_SAMI,
+    AV_CODEC_ID_REALTEXT,
+    AV_CODEC_ID_STL,
+    AV_CODEC_ID_SUBVIEWER1,
+    AV_CODEC_ID_SUBVIEWER,
+    AV_CODEC_ID_SUBRIP,
+    AV_CODEC_ID_WEBVTT,
+    AV_CODEC_ID_MPL2,
+    AV_CODEC_ID_VPLAYER,
+    AV_CODEC_ID_PJS,
+    AV_CODEC_ID_ASS,
+    AV_CODEC_ID_HDMV_TEXT_SUBTITLE,
+
     /* other specific kind of codecs (generally used for attachments) */
     AV_CODEC_ID_FIRST_UNKNOWN = 0x18000,           ///< A dummy ID pointing at the start of various fake codecs.
     AV_CODEC_ID_TTF = 0x18000,
 
+    AV_CODEC_ID_BINTEXT    = 0x18800,
+    AV_CODEC_ID_XBIN,
+    AV_CODEC_ID_IDF,
+    AV_CODEC_ID_OTF,
+    AV_CODEC_ID_SMPTE_KLV,
+    AV_CODEC_ID_DVD_NAV,
+    AV_CODEC_ID_TIMED_ID3,
+    AV_CODEC_ID_BIN_DATA,
+
+
     AV_CODEC_ID_PROBE = 0x19000, ///< codec_id is not known (like AV_CODEC_ID_NONE) but lavf should attempt to identify it
 
     AV_CODEC_ID_MPEG2TS = 0x20000, /**< _FAKE_ codec to indicate a raw MPEG-2 TS
@@ -592,6 +671,12 @@ typedef struct AVCodecDescriptor {
      */
     int             props;
     /**
+     * MIME type(s) associated with the codec.
+     * May be NULL; if not, a NULL-terminated array of MIME types.
+     * The first item is always non-NULL and is the preferred MIME type.
+     */
+    const char *const *mime_types;
+    /**
      * If non-NULL, an array of profiles recognized for this codec.
      * Terminated with FF_PROFILE_UNKNOWN.
      */
@@ -623,6 +708,16 @@ typedef struct AVCodecDescriptor {
  * equal.
  */
 #define AV_CODEC_PROP_REORDER       (1 << 3)
+/**
+ * Subtitle codec is bitmap based.
+ * Decoded AVSubtitle data can be read from the AVSubtitleRect->pict field.
+ */
+#define AV_CODEC_PROP_BITMAP_SUB    (1 << 16)
+/**
+ * Subtitle codec is text based.
+ * Decoded AVSubtitle data can be read from the AVSubtitleRect->ass field.
+ */
+#define AV_CODEC_PROP_TEXT_SUB      (1 << 17)
 
 /**
  * @ingroup lavc_decoding
@@ -632,7 +727,7 @@ typedef struct AVCodecDescriptor {
  * Note: If the first 23 bits of the additional bytes are not 0, then damaged
  * MPEG bitstreams could cause overread and segfault.
  */
-#define AV_INPUT_BUFFER_PADDING_SIZE 8
+#define AV_INPUT_BUFFER_PADDING_SIZE 32
 
 /**
  * @ingroup lavc_encoding
@@ -645,7 +740,7 @@ typedef struct AVCodecDescriptor {
 /**
  * @deprecated use AV_INPUT_BUFFER_PADDING_SIZE instead
  */
-#define FF_INPUT_BUFFER_PADDING_SIZE 8
+#define FF_INPUT_BUFFER_PADDING_SIZE 32
 
 /**
  * @deprecated use AV_INPUT_BUFFER_MIN_SIZE instead
@@ -669,6 +764,7 @@ enum Motion_Est_ID {
     ME_HEX,         ///< hexagon based search
     ME_UMH,         ///< uneven multi-hexagon search
     ME_TESA,        ///< transformed exhaustive search algorithm
+    ME_ITER=50,     ///< iterative search
 };
 #endif
 
@@ -682,6 +778,7 @@ enum AVDiscard{
     AVDISCARD_DEFAULT =  0, ///< discard useless packets like 0 size packets in avi
     AVDISCARD_NONREF  =  8, ///< discard all non reference
     AVDISCARD_BIDIR   = 16, ///< discard all bidirectional frames
+    AVDISCARD_NONINTRA= 24, ///< discard all non intra frames
     AVDISCARD_NONKEY  = 32, ///< discard all frames except keyframes
     AVDISCARD_ALL     = 48, ///< discard all
 };
@@ -792,7 +889,7 @@ typedef struct RcOverride{
  * interlaced motion estimation
  */
 #define AV_CODEC_FLAG_INTERLACED_ME   (1 << 29)
-#define AV_CODEC_FLAG_CLOSED_GOP      (1 << 31)
+#define AV_CODEC_FLAG_CLOSED_GOP      (1U << 31)
 
 /**
  * Allow non spec compliant speedup tricks.
@@ -806,6 +903,12 @@ typedef struct RcOverride{
  * Place global headers at every keyframe instead of in extradata.
  */
 #define AV_CODEC_FLAG2_LOCAL_HEADER   (1 <<  3)
+
+/**
+ * Timecode is in drop frame format. Deprecated.
+ */
+#define AV_CODEC_FLAG2_DROP_FRAME_TIMECODE (1 << 13)
+
 /**
  * Input bitstream might be truncated at a packet boundaries
  * instead of only at frame boundaries.
@@ -816,6 +919,23 @@ typedef struct RcOverride{
  */
 #define AV_CODEC_FLAG2_IGNORE_CROP    (1 << 16)
 
+/**
+ * Show all frames before the first keyframe
+ */
+#define AV_CODEC_FLAG2_SHOW_ALL       (1 << 22)
+/**
+ * Export motion vectors through frame side data
+ */
+#define AV_CODEC_FLAG2_EXPORT_MVS     (1 << 28)
+/**
+ * Do not skip samples and export skip information as frame side data
+ */
+#define AV_CODEC_FLAG2_SKIP_MANUAL    (1 << 29)
+/**
+ * Do not reset ASS ReadOrder field on flush (subtitles decoding)
+ */
+#define AV_CODEC_FLAG2_RO_FLUSH_NOOP  (1 << 30)
+
 /* Unsupported options :
  *              Syntax Arithmetic coding (SAC)
  *              Reference Picture Selection
@@ -863,6 +983,14 @@ typedef struct RcOverride{
  * This can be used to prevent truncation of the last audio samples.
  */
 #define AV_CODEC_CAP_SMALL_LAST_FRAME    (1 <<  6)
+
+#if FF_API_CAP_VDPAU
+/**
+ * Codec can export data for HW decoding (VDPAU).
+ */
+#define AV_CODEC_CAP_HWACCEL_VDPAU       (1 <<  7)
+#endif
+
 /**
  * Codec can output multiple frames per AVPacket
  * Normally demuxers return one frame at a time, demuxers which do not do
@@ -904,17 +1032,26 @@ typedef struct RcOverride{
  * Audio encoder supports receiving a different number of samples in each call.
  */
 #define AV_CODEC_CAP_VARIABLE_FRAME_SIZE (1 << 16)
+/**
+ * Codec is intra only.
+ */
+#define AV_CODEC_CAP_INTRA_ONLY       0x40000000
+/**
+ * Codec is lossless.
+ */
+#define AV_CODEC_CAP_LOSSLESS         0x80000000
+
 
 #if FF_API_WITHOUT_PREFIX
 /**
  * Allow decoders to produce frames with data planes that are not aligned
  * to CPU requirements (e.g. due to cropping).
  */
-#define CODEC_FLAG_UNALIGNED 0x0001
-#define CODEC_FLAG_QSCALE 0x0002  ///< Use fixed qscale.
-#define CODEC_FLAG_4MV    0x0004  ///< 4 MV per MB allowed / advanced prediction for H.263.
-#define CODEC_FLAG_OUTPUT_CORRUPT 0x0008 ///< Output even those frames that might be corrupted
-#define CODEC_FLAG_QPEL   0x0010  ///< Use qpel MC.
+#define CODEC_FLAG_UNALIGNED AV_CODEC_FLAG_UNALIGNED
+#define CODEC_FLAG_QSCALE AV_CODEC_FLAG_QSCALE
+#define CODEC_FLAG_4MV    AV_CODEC_FLAG_4MV
+#define CODEC_FLAG_OUTPUT_CORRUPT AV_CODEC_FLAG_OUTPUT_CORRUPT
+#define CODEC_FLAG_QPEL   AV_CODEC_FLAG_QPEL
 #if FF_API_GMC
 /**
  * @deprecated use the "gmc" private option of the libxvid encoder
@@ -935,9 +1072,9 @@ typedef struct RcOverride{
  */
 #define CODEC_FLAG_INPUT_PRESERVED 0x0100
 #endif
-#define CODEC_FLAG_PASS1           0x0200   ///< Use internal 2pass ratecontrol in first pass mode.
-#define CODEC_FLAG_PASS2           0x0400   ///< Use internal 2pass ratecontrol in second pass mode.
-#define CODEC_FLAG_GRAY            0x2000   ///< Only decode/encode grayscale.
+#define CODEC_FLAG_PASS1           AV_CODEC_FLAG_PASS1
+#define CODEC_FLAG_PASS2           AV_CODEC_FLAG_PASS2
+#define CODEC_FLAG_GRAY            AV_CODEC_FLAG_GRAY
 #if FF_API_EMU_EDGE
 /**
  * @deprecated edges are not used/required anymore. I.e. this flag is now always
@@ -945,9 +1082,9 @@ typedef struct RcOverride{
  */
 #define CODEC_FLAG_EMU_EDGE        0x4000
 #endif
-#define CODEC_FLAG_PSNR            0x8000   ///< error[?] variables will be set during encoding.
-#define CODEC_FLAG_TRUNCATED       0x00010000 /** Input bitstream might be truncated at a random
-                                                  location instead of only at frame boundaries. */
+#define CODEC_FLAG_PSNR            AV_CODEC_FLAG_PSNR
+#define CODEC_FLAG_TRUNCATED       AV_CODEC_FLAG_TRUNCATED
+
 #if FF_API_NORMALIZE_AQP
 /**
  * @deprecated use the flag "naq" in the "mpv_flags" private option of the
@@ -955,21 +1092,24 @@ typedef struct RcOverride{
  */
 #define CODEC_FLAG_NORMALIZE_AQP  0x00020000
 #endif
-#define CODEC_FLAG_INTERLACED_DCT 0x00040000 ///< Use interlaced DCT.
-#define CODEC_FLAG_LOW_DELAY      0x00080000 ///< Force low delay.
-#define CODEC_FLAG_GLOBAL_HEADER  0x00400000 ///< Place global headers in extradata instead of every keyframe.
-#define CODEC_FLAG_BITEXACT       0x00800000 ///< Use only bitexact stuff (except (I)DCT).
-/* Fx : Flag for H.263+ extra options */
-#define CODEC_FLAG_AC_PRED        0x01000000 ///< H.263 advanced intra coding / MPEG-4 AC prediction
-#define CODEC_FLAG_LOOP_FILTER    0x00000800 ///< loop filter
-#define CODEC_FLAG_INTERLACED_ME  0x20000000 ///< interlaced motion estimation
-#define CODEC_FLAG_CLOSED_GOP     0x80000000
-#define CODEC_FLAG2_FAST          0x00000001 ///< Allow non spec compliant speedup tricks.
-#define CODEC_FLAG2_NO_OUTPUT     0x00000004 ///< Skip bitstream encoding.
-#define CODEC_FLAG2_LOCAL_HEADER  0x00000008 ///< Place global headers at every keyframe instead of in extradata.
-#define CODEC_FLAG2_IGNORE_CROP   0x00010000 ///< Discard cropping information from SPS.
-
-#define CODEC_FLAG2_CHUNKS        0x00008000 ///< Input bitstream might be truncated at a packet boundaries instead of only at frame boundaries.
+#define CODEC_FLAG_INTERLACED_DCT AV_CODEC_FLAG_INTERLACED_DCT
+#define CODEC_FLAG_LOW_DELAY      AV_CODEC_FLAG_LOW_DELAY
+#define CODEC_FLAG_GLOBAL_HEADER  AV_CODEC_FLAG_GLOBAL_HEADER
+#define CODEC_FLAG_BITEXACT       AV_CODEC_FLAG_BITEXACT
+#define CODEC_FLAG_AC_PRED        AV_CODEC_FLAG_AC_PRED
+#define CODEC_FLAG_LOOP_FILTER    AV_CODEC_FLAG_LOOP_FILTER
+#define CODEC_FLAG_INTERLACED_ME  AV_CODEC_FLAG_INTERLACED_ME
+#define CODEC_FLAG_CLOSED_GOP     AV_CODEC_FLAG_CLOSED_GOP
+#define CODEC_FLAG2_FAST          AV_CODEC_FLAG2_FAST
+#define CODEC_FLAG2_NO_OUTPUT     AV_CODEC_FLAG2_NO_OUTPUT
+#define CODEC_FLAG2_LOCAL_HEADER  AV_CODEC_FLAG2_LOCAL_HEADER
+#define CODEC_FLAG2_DROP_FRAME_TIMECODE AV_CODEC_FLAG2_DROP_FRAME_TIMECODE
+#define CODEC_FLAG2_IGNORE_CROP   AV_CODEC_FLAG2_IGNORE_CROP
+
+#define CODEC_FLAG2_CHUNKS        AV_CODEC_FLAG2_CHUNKS
+#define CODEC_FLAG2_SHOW_ALL      AV_CODEC_FLAG2_SHOW_ALL
+#define CODEC_FLAG2_EXPORT_MVS    AV_CODEC_FLAG2_EXPORT_MVS
+#define CODEC_FLAG2_SKIP_MANUAL   AV_CODEC_FLAG2_SKIP_MANUAL
 
 /* Unsupported options :
  *              Syntax Arithmetic coding (SAC)
@@ -978,16 +1118,22 @@ typedef struct RcOverride{
 /* /Fx */
 /* codec capabilities */
 
-#define CODEC_CAP_DRAW_HORIZ_BAND 0x0001 ///< Decoder can use draw_horiz_band callback.
+#define CODEC_CAP_DRAW_HORIZ_BAND AV_CODEC_CAP_DRAW_HORIZ_BAND ///< Decoder can use draw_horiz_band callback.
 /**
  * Codec uses get_buffer() for allocating buffers and supports custom allocators.
  * If not set, it might not use get_buffer() at all or use operations that
  * assume the buffer was allocated by avcodec_default_get_buffer.
  */
-#define CODEC_CAP_DR1             0x0002
-#define CODEC_CAP_TRUNCATED       0x0008
+#define CODEC_CAP_DR1             AV_CODEC_CAP_DR1
+#define CODEC_CAP_TRUNCATED       AV_CODEC_CAP_TRUNCATED
 #if FF_API_XVMC
-/* Codec can export data for HW decoding (XvMC). */
+/* Codec can export data for HW decoding. This flag indicates that
+ * the codec would call get_format() with a list that might contain HW accelerated
+ * pixel formats (XvMC, VDPAU, VAAPI, etc). The application can pick any of them
+ * including raw image format.
+ * The application can use the passed context to determine bitstream version,
+ * chroma format, resolution etc.
+ */
 #define CODEC_CAP_HWACCEL         0x0010
 #endif /* FF_API_XVMC */
 /**
@@ -1013,17 +1159,17 @@ typedef struct RcOverride{
  *       each output packet. If this flag is not set, the pts and duration will
  *       be determined by libavcodec from the input frame.
  */
-#define CODEC_CAP_DELAY           0x0020
+#define CODEC_CAP_DELAY           AV_CODEC_CAP_DELAY
 /**
  * Codec can be fed a final frame with a smaller size.
  * This can be used to prevent truncation of the last audio samples.
  */
-#define CODEC_CAP_SMALL_LAST_FRAME 0x0040
+#define CODEC_CAP_SMALL_LAST_FRAME AV_CODEC_CAP_SMALL_LAST_FRAME
 #if FF_API_CAP_VDPAU
 /**
  * Codec can export data for HW decoding (VDPAU).
  */
-#define CODEC_CAP_HWACCEL_VDPAU    0x0080
+#define CODEC_CAP_HWACCEL_VDPAU    AV_CODEC_CAP_HWACCEL_VDPAU
 #endif
 /**
  * Codec can output multiple frames per AVPacket
@@ -1036,16 +1182,16 @@ typedef struct RcOverride{
  * prohibiting stream copy in many cases thus it should only be considered
  * as a last resort.
  */
-#define CODEC_CAP_SUBFRAMES        0x0100
+#define CODEC_CAP_SUBFRAMES        AV_CODEC_CAP_SUBFRAMES
 /**
  * Codec is experimental and is thus avoided in favor of non experimental
  * encoders
  */
-#define CODEC_CAP_EXPERIMENTAL     0x0200
+#define CODEC_CAP_EXPERIMENTAL     AV_CODEC_CAP_EXPERIMENTAL
 /**
  * Codec should fill in channel configuration and samplerate instead of container
  */
-#define CODEC_CAP_CHANNEL_CONF     0x0400
+#define CODEC_CAP_CHANNEL_CONF     AV_CODEC_CAP_CHANNEL_CONF
 #if FF_API_NEG_LINESIZES
 /**
  * @deprecated no codecs use this capability
@@ -1055,23 +1201,37 @@ typedef struct RcOverride{
 /**
  * Codec supports frame-level multithreading.
  */
-#define CODEC_CAP_FRAME_THREADS    0x1000
+#define CODEC_CAP_FRAME_THREADS    AV_CODEC_CAP_FRAME_THREADS
 /**
  * Codec supports slice-based (or partition-based) multithreading.
  */
-#define CODEC_CAP_SLICE_THREADS    0x2000
+#define CODEC_CAP_SLICE_THREADS    AV_CODEC_CAP_SLICE_THREADS
 /**
  * Codec supports changed parameters at any point.
  */
-#define CODEC_CAP_PARAM_CHANGE     0x4000
+#define CODEC_CAP_PARAM_CHANGE     AV_CODEC_CAP_PARAM_CHANGE
 /**
  * Codec supports avctx->thread_count == 0 (auto).
  */
-#define CODEC_CAP_AUTO_THREADS     0x8000
+#define CODEC_CAP_AUTO_THREADS     AV_CODEC_CAP_AUTO_THREADS
 /**
  * Audio encoder supports receiving a different number of samples in each call.
  */
-#define CODEC_CAP_VARIABLE_FRAME_SIZE 0x10000
+#define CODEC_CAP_VARIABLE_FRAME_SIZE AV_CODEC_CAP_VARIABLE_FRAME_SIZE
+/**
+ * Codec is intra only.
+ */
+#define CODEC_CAP_INTRA_ONLY       AV_CODEC_CAP_INTRA_ONLY
+/**
+ * Codec is lossless.
+ */
+#define CODEC_CAP_LOSSLESS         AV_CODEC_CAP_LOSSLESS
+
+/**
+ * HWAccel is experimental and is thus avoided in favor of non experimental
+ * codecs
+ */
+#define HWACCEL_CODEC_CAP_EXPERIMENTAL     0x0200
 #endif /* FF_API_WITHOUT_PREFIX */
 
 #if FF_API_MB_TYPE
@@ -1255,11 +1415,16 @@ enum AVPacketSideDataType {
     AV_PKT_DATA_AUDIO_SERVICE_TYPE,
 
     /**
-     * This side data contains an integer value representing the quality
-     * factor of the compressed frame. Allowed range is between 1 (good)
-     * and FF_LAMBDA_MAX (bad).
+     * This side data contains quality related information from the encoder.
+     * @code
+     * u32le quality factor of the compressed frame. Allowed range is between 1 (good) and FF_LAMBDA_MAX (bad).
+     * u8    picture type
+     * u8    error count
+     * u16   reserved
+     * u64le[error count] sum of squared differences between encoder in and output
+     * @endcode
      */
-    AV_PKT_DATA_QUALITY_FACTOR,
+    AV_PKT_DATA_QUALITY_STATS,
 
     /**
      * This side data contains an integer value representing the stream index
@@ -1273,8 +1438,87 @@ enum AVPacketSideDataType {
      * This side data corresponds to the AVCPBProperties struct.
      */
     AV_PKT_DATA_CPB_PROPERTIES,
+
+    /**
+     * Recommends skipping the specified number of samples
+     * @code
+     * u32le number of samples to skip from start of this packet
+     * u32le number of samples to skip from end of this packet
+     * u8    reason for start skip
+     * u8    reason for end   skip (0=padding silence, 1=convergence)
+     * @endcode
+     */
+    AV_PKT_DATA_SKIP_SAMPLES=70,
+
+    /**
+     * An AV_PKT_DATA_JP_DUALMONO side data packet indicates that
+     * the packet may contain "dual mono" audio specific to Japanese DTV
+     * and if it is true, recommends only the selected channel to be used.
+     * @code
+     * u8    selected channels (0=main/left, 1=sub/right, 2=both)
+     * @endcode
+     */
+    AV_PKT_DATA_JP_DUALMONO,
+
+    /**
+     * A list of zero terminated key/value strings. There is no end marker for
+     * the list, so it is required to rely on the side data size to stop.
+     */
+    AV_PKT_DATA_STRINGS_METADATA,
+
+    /**
+     * Subtitle event position
+     * @code
+     * u32le x1
+     * u32le y1
+     * u32le x2
+     * u32le y2
+     * @endcode
+     */
+    AV_PKT_DATA_SUBTITLE_POSITION,
+
+    /**
+     * Data found in BlockAdditional element of matroska container. There is
+     * no end marker for the data, so it is required to rely on the side data
+     * size to recognize the end. 8 byte id (as found in BlockAddId) followed
+     * by data.
+     */
+    AV_PKT_DATA_MATROSKA_BLOCKADDITIONAL,
+
+    /**
+     * The optional first identifier line of a WebVTT cue.
+     */
+    AV_PKT_DATA_WEBVTT_IDENTIFIER,
+
+    /**
+     * The optional settings (rendering instructions) that immediately
+     * follow the timestamp specifier of a WebVTT cue.
+     */
+    AV_PKT_DATA_WEBVTT_SETTINGS,
+
+    /**
+     * A list of zero terminated key/value strings. There is no end marker for
+     * the list, so it is required to rely on the side data size to stop. This
+     * side data includes updated metadata which appeared in the stream.
+     */
+    AV_PKT_DATA_METADATA_UPDATE,
+
+    /**
+     * MPEGTS stream ID, this is required to pass the stream ID
+     * information from the demuxer to the corresponding muxer.
+     */
+    AV_PKT_DATA_MPEGTS_STREAM_ID,
+
+    /**
+     * Mastering display metadata (based on SMPTE-2086:2014). This metadata
+     * should be associated with a video stream and contains data in the form
+     * of the AVMasteringDisplayMetadata struct.
+     */
+    AV_PKT_DATA_MASTERING_DISPLAY_METADATA
 };
 
+#define AV_PKT_DATA_QUALITY_FACTOR AV_PKT_DATA_QUALITY_STATS //DEPRECATED
+
 typedef struct AVPacketSideData {
     uint8_t *data;
     int      size;
@@ -1291,7 +1535,7 @@ typedef struct AVPacketSideData {
  * packets, with no compressed data, containing only side data
  * (e.g. to update some stream parameters at the end of encoding).
  *
- * AVPacket is one of the few structs in Libav, whose size is a part of public
+ * AVPacket is one of the few structs in FFmpeg, whose size is a part of public
  * ABI. Thus it may be allocated on stack and no new fields can be added to it
  * without libavcodec and libavformat major bump.
  *
@@ -1393,6 +1637,12 @@ enum AVFieldOrder {
  * New fields can be added to the end with minor version bumps.
  * Removal, reordering and changes to existing fields require a major
  * version bump.
+ * Please use AVOptions (av_opt* / av_set/get*()) to access these fields from user
+ * applications.
+ * The name string for AVOptions options matches the associated command line
+ * parameter name and can be found in libavcodec/options_table.h
+ * The AVOption/command line parameter names differ in some cases from the C
+ * structure field names for historic reasons or brevity.
  * sizeof(AVCodecContext) must not be used outside libav*.
  */
 typedef struct AVCodecContext {
@@ -1457,9 +1707,10 @@ typedef struct AVCodecContext {
     /**
      * the average bitrate
      * - encoding: Set by user; unused for constant quantizer encoding.
-     * - decoding: Set by libavcodec. 0 or some bitrate if this info is available in the stream.
+     * - decoding: Set by user, may be overwritten by libavcodec
+     *             if this info is available in the stream
      */
-    int bit_rate;
+    int64_t bit_rate;
 
     /**
      * number of bits the bitstream is allowed to diverge from the reference.
@@ -1517,6 +1768,16 @@ typedef struct AVCodecContext {
      * of which frame timestamps are represented. For fixed-fps content,
      * timebase should be 1/framerate and timestamp increments should be
      * identically 1.
+     * This is often, but not always, the inverse of the frame rate or field rate
+     * for video. 1/time_base is not the average frame rate if the frame rate is not
+     * constant.
+     *
+     * Like containers, elementary streams also can store timestamps, 1/time_base
+     * is the unit in which these timestamps are specified.
+     * As an example of such a codec time base, see ISO/IEC 14496-2:2001(E)
+     * vop_time_increment_resolution and fixed_vop_rate
+     * (fixed_vop_rate == 0 implies that it is different from the framerate)
+     *
      * - encoding: MUST be set by user.
      * - decoding: the use of this field for decoding is deprecated.
      *             Use framerate instead.
@@ -1535,6 +1796,11 @@ typedef struct AVCodecContext {
     /**
      * Codec delay.
      *
+     * Encoding: Number of frames of delay there will be from the encoder input to
+     *           the decoder output. (we assume the decoder matches the spec)
+     * Decoding: Number of frames of delay in addition to what a standard decoder
+     *           as specified in the spec would produce.
+     *
      * Video:
      *   Number of frames the decoded output will be delayed relative to the
      *   encoded input.
@@ -1570,7 +1836,7 @@ typedef struct AVCodecContext {
 
     /**
      * Bitstream width / height, may be different from width/height e.g. when
-     * the decoded frame is cropped before being output.
+     * the decoded frame is cropped before being output or lowres is enabled.
      *
      * @note Those field may not match the value of the last
      * AVFrame output by avcodec_receive_frame() due frame
@@ -1827,6 +2093,8 @@ typedef struct AVCodecContext {
 #define FF_CMP_VSAD   8
 #define FF_CMP_VSSE   9
 #define FF_CMP_NSSE   10
+#define FF_CMP_W53    11
+#define FF_CMP_W97    12
 #define FF_CMP_DCTMAX 13
 #define FF_CMP_DCT264 14
 #define FF_CMP_CHROMA 256
@@ -1929,7 +2197,7 @@ typedef struct AVCodecContext {
      * XVideo Motion Acceleration
      * - encoding: forbidden
      * - decoding: set by decoder
-     * @deprecated XvMC support is slated for removal.
+     * @deprecated XvMC doesn't need it anymore.
      */
     attribute_deprecated int xvmc_acceleration;
 #endif /* FF_API_XVMC */
@@ -1985,7 +2253,7 @@ typedef struct AVCodecContext {
     /**
      * precision of the intra DC coefficient - 8
      * - encoding: Set by user.
-     * - decoding: unused
+     * - decoding: Set by libavcodec
      */
     int intra_dc_precision;
 
@@ -2133,7 +2401,7 @@ typedef struct AVCodecContext {
 
     /** Field order
      * - encoding: set by libavcodec
-     * - decoding: Set by libavcodec
+     * - decoding: Set by user.
      */
     enum AVFieldOrder field_order;
 
@@ -2187,7 +2455,7 @@ typedef struct AVCodecContext {
     /**
      * Audio channel layout.
      * - encoding: set by user.
-     * - decoding: set by libavcodec.
+     * - decoding: set by user, may be overwritten by libavcodec.
      */
     uint64_t channel_layout;
 
@@ -2206,9 +2474,10 @@ typedef struct AVCodecContext {
     enum AVAudioServiceType audio_service_type;
 
     /**
-     * Used to request a sample format from the decoder.
-     * - encoding: unused.
+     * desired sample format
+     * - encoding: Not used.
      * - decoding: Set by user.
+     * Decoder will decode to this format if it can.
      */
     enum AVSampleFormat request_sample_fmt;
 
@@ -2266,6 +2535,8 @@ typedef struct AVCodecContext {
      * avcodec_align_dimensions2() should be used to find the required width and
      * height, as they normally need to be rounded up to the next multiple of 16.
      *
+     * Some decoders do not support linesizes changing between frames.
+     *
      * If frame multithreading is used and thread_safe_callbacks is set,
      * this callback may be called from a different thread, but not from more
      * than one at once. Does not need to be reentrant.
@@ -2371,16 +2642,16 @@ typedef struct AVCodecContext {
     /**
      * maximum bitrate
      * - encoding: Set by user.
-     * - decoding: unused
+     * - decoding: Set by user, may be overwritten by libavcodec.
      */
-    int rc_max_rate;
+    int64_t rc_max_rate;
 
     /**
      * minimum bitrate
      * - encoding: Set by user.
      * - decoding: unused
      */
-    int rc_min_rate;
+    int64_t rc_min_rate;
 
 #if FF_API_MPV_OPT
     /**
@@ -2606,6 +2877,7 @@ typedef struct AVCodecContext {
     int error_concealment;
 #define FF_EC_GUESS_MVS   1
 #define FF_EC_DEBLOCK     2
+#define FF_EC_FAVOR_INTER 256
 
     /**
      * debug
@@ -2634,17 +2906,21 @@ typedef struct AVCodecContext {
 #define FF_DEBUG_MMCO        0x00000800
 #define FF_DEBUG_BUGS        0x00001000
 #if FF_API_DEBUG_MV
-#define FF_DEBUG_VIS_QP      0x00002000
-#define FF_DEBUG_VIS_MB_TYPE 0x00004000
+#define FF_DEBUG_VIS_QP      0x00002000 ///< only access through AVOptions from outside libavcodec
+#define FF_DEBUG_VIS_MB_TYPE 0x00004000 ///< only access through AVOptions from outside libavcodec
 #endif
 #define FF_DEBUG_BUFFERS     0x00008000
 #define FF_DEBUG_THREADS     0x00010000
+#define FF_DEBUG_GREEN_MD    0x00800000
+#define FF_DEBUG_NOMC        0x01000000
 
 #if FF_API_DEBUG_MV
     /**
-     * @deprecated this option does not have any effect
+     * debug
+     * Code outside libavcodec should access this field using AVOptions
+     * - encoding: Set by user.
+     * - decoding: Set by user.
      */
-    attribute_deprecated
     int debug_mv;
 #define FF_DEBUG_VIS_MV_P_FOR  0x00000001 // visualize forward predicted MVs of P-frames
 #define FF_DEBUG_VIS_MV_B_FOR  0x00000002 // visualize forward predicted MVs of B-frames
@@ -2665,9 +2941,15 @@ typedef struct AVCodecContext {
  * decoder returning an error.
  */
 #define AV_EF_CRCCHECK  (1<<0)
-#define AV_EF_BITSTREAM (1<<1)
-#define AV_EF_BUFFER    (1<<2)
-#define AV_EF_EXPLODE   (1<<3)
+#define AV_EF_BITSTREAM (1<<1)          ///< detect bitstream specification deviations
+#define AV_EF_BUFFER    (1<<2)          ///< detect improper bitstream length
+#define AV_EF_EXPLODE   (1<<3)          ///< abort decoding on minor error detection
+
+#define AV_EF_IGNORE_ERR (1<<15)        ///< ignore errors and continue
+#define AV_EF_CAREFUL    (1<<16)        ///< consider things that violate the spec, are fast to calculate and have not been seen in the wild as errors
+#define AV_EF_COMPLIANT  (1<<17)        ///< consider all spec non compliances as errors
+#define AV_EF_AGGRESSIVE (1<<18)        ///< consider things that a sane encoder should not do as an error
+
 
     /**
      * opaque 64-bit number (generally a PTS) that will be reordered and
@@ -2688,8 +2970,8 @@ typedef struct AVCodecContext {
      * Hardware accelerator context.
      * For some hardware accelerators, a global context needs to be
      * provided by the user. In that case, this holds display-dependent
-     * data Libav cannot instantiate itself. Please refer to the
-     * Libav HW accelerator documentation to know how to fill this
+     * data FFmpeg cannot instantiate itself. Please refer to the
+     * FFmpeg HW accelerator documentation to know how to fill this
      * is. e.g. for VA API, this is a struct vaapi_context.
      * - encoding: unused
      * - decoding: Set by user
@@ -2749,6 +3031,7 @@ typedef struct AVCodecContext {
 #if FF_API_ARCH_ALPHA
 #define FF_IDCT_SIMPLEALPHA   23
 #endif
+#define FF_IDCT_SIMPLEAUTO    128
 
     /**
      * bits per sample/pixel from the demuxer (needed for huffyuv).
@@ -2769,10 +3052,10 @@ typedef struct AVCodecContext {
      * low resolution decoding, 1-> 1/2 size, 2->1/4 size
      * - encoding: unused
      * - decoding: Set by user.
-     *
-     * @deprecated use decoder private options instead
+     * Code outside libavcodec should access this field using:
+     * av_codec_{get,set}_lowres(avctx)
      */
-    attribute_deprecated int lowres;
+     int lowres;
 #endif
 
 #if FF_API_CODED_FRAME
@@ -2948,6 +3231,7 @@ typedef struct AVCodecContext {
 #define FF_PROFILE_HEVC_MAIN                        1
 #define FF_PROFILE_HEVC_MAIN_10                     2
 #define FF_PROFILE_HEVC_MAIN_STILL_PICTURE          3
+#define FF_PROFILE_HEVC_REXT                        4
 
     /**
      * level
@@ -2958,18 +3242,21 @@ typedef struct AVCodecContext {
 #define FF_LEVEL_UNKNOWN -99
 
     /**
+     * Skip loop filtering for selected frames.
      * - encoding: unused
      * - decoding: Set by user.
      */
     enum AVDiscard skip_loop_filter;
 
     /**
+     * Skip IDCT/dequantization for selected frames.
      * - encoding: unused
      * - decoding: Set by user.
      */
     enum AVDiscard skip_idct;
 
     /**
+     * Skip decoding for selected frames.
      * - encoding: unused
      * - decoding: Set by user.
      */
@@ -3040,7 +3327,7 @@ typedef struct AVCodecContext {
      */
     int initial_padding;
 
-    /*
+    /**
      * - decoding: For codecs that store a framerate value in the compressed
      *             bitstream, the decoder may export it here. { 0, 1} when
      *             unknown.
@@ -3057,6 +3344,134 @@ typedef struct AVCodecContext {
     enum AVPixelFormat sw_pix_fmt;
 
     /**
+     * Timebase in which pkt_dts/pts and AVPacket.dts/pts are.
+     * Code outside libavcodec should access this field using:
+     * av_codec_{get,set}_pkt_timebase(avctx)
+     * - encoding: unused.
+     * - decoding: set by user.
+     */
+    AVRational pkt_timebase;
+
+    /**
+     * AVCodecDescriptor
+     * Code outside libavcodec should access this field using:
+     * av_codec_{get,set}_codec_descriptor(avctx)
+     * - encoding: unused.
+     * - decoding: set by libavcodec.
+     */
+    const AVCodecDescriptor *codec_descriptor;
+
+#if !FF_API_LOWRES
+    /**
+     * low resolution decoding, 1-> 1/2 size, 2->1/4 size
+     * - encoding: unused
+     * - decoding: Set by user.
+     * Code outside libavcodec should access this field using:
+     * av_codec_{get,set}_lowres(avctx)
+     */
+     int lowres;
+#endif
+
+    /**
+     * Current statistics for PTS correction.
+     * - decoding: maintained and used by libavcodec, not intended to be used by user apps
+     * - encoding: unused
+     */
+    int64_t pts_correction_num_faulty_pts; /// Number of incorrect PTS values so far
+    int64_t pts_correction_num_faulty_dts; /// Number of incorrect DTS values so far
+    int64_t pts_correction_last_pts;       /// PTS of the last frame
+    int64_t pts_correction_last_dts;       /// DTS of the last frame
+
+    /**
+     * Character encoding of the input subtitles file.
+     * - decoding: set by user
+     * - encoding: unused
+     */
+    char *sub_charenc;
+
+    /**
+     * Subtitles character encoding mode. Formats or codecs might be adjusting
+     * this setting (if they are doing the conversion themselves for instance).
+     * - decoding: set by libavcodec
+     * - encoding: unused
+     */
+    int sub_charenc_mode;
+#define FF_SUB_CHARENC_MODE_DO_NOTHING  -1  ///< do nothing (demuxer outputs a stream supposed to be already in UTF-8, or the codec is bitmap for instance)
+#define FF_SUB_CHARENC_MODE_AUTOMATIC    0  ///< libavcodec will select the mode itself
+#define FF_SUB_CHARENC_MODE_PRE_DECODER  1  ///< the AVPacket data needs to be recoded to UTF-8 before being fed to the decoder, requires iconv
+
+    /**
+     * Skip processing alpha if supported by codec.
+     * Note that if the format uses pre-multiplied alpha (common with VP6,
+     * and recommended due to better video quality/compression)
+     * the image will look as if alpha-blended onto a black background.
+     * However for formats that do not use pre-multiplied alpha
+     * there might be serious artefacts (though e.g. libswscale currently
+     * assumes pre-multiplied alpha anyway).
+     * Code outside libavcodec should access this field using AVOptions
+     *
+     * - decoding: set by user
+     * - encoding: unused
+     */
+    int skip_alpha;
+
+    /**
+     * Number of samples to skip after a discontinuity
+     * - decoding: unused
+     * - encoding: set by libavcodec
+     */
+    int seek_preroll;
+
+#if !FF_API_DEBUG_MV
+    /**
+     * debug motion vectors
+     * Code outside libavcodec should access this field using AVOptions
+     * - encoding: Set by user.
+     * - decoding: Set by user.
+     */
+    int debug_mv;
+#define FF_DEBUG_VIS_MV_P_FOR  0x00000001 //visualize forward predicted MVs of P frames
+#define FF_DEBUG_VIS_MV_B_FOR  0x00000002 //visualize forward predicted MVs of B frames
+#define FF_DEBUG_VIS_MV_B_BACK 0x00000004 //visualize backward predicted MVs of B frames
+#endif
+
+    /**
+     * custom intra quantization matrix
+     * Code outside libavcodec should access this field using av_codec_{get,set}_chroma_intra_matrix()
+     * - encoding: Set by user, can be NULL.
+     * - decoding: unused.
+     */
+    uint16_t *chroma_intra_matrix;
+
+    /**
+     * dump format separator.
+     * can be ", " or "\n      " or anything else
+     * Code outside libavcodec should access this field using AVOptions
+     * (NO direct access).
+     * - encoding: Set by user.
+     * - decoding: Set by user.
+     */
+    uint8_t *dump_separator;
+
+    /**
+     * ',' separated list of allowed decoders.
+     * If NULL then all are allowed
+     * - encoding: unused
+     * - decoding: set by user through AVOptions (NO direct access)
+     */
+    char *codec_whitelist;
+
+    /**
+     * Properties of the stream that gets decoded
+     * To be accessed through av_codec_get_codec_properties() (NO direct access)
+     * - encoding: unused
+     * - decoding: set by libavcodec
+     */
+    unsigned properties;
+#define FF_CODEC_PROPERTY_LOSSLESS        0x00000001
+#define FF_CODEC_PROPERTY_CLOSED_CAPTIONS 0x00000002
+
+    /**
      * Additional data associated with the entire coded stream.
      *
      * - decoding: unused
@@ -3077,8 +3492,37 @@ typedef struct AVCodecContext {
      * afterwards owned and managed by libavcodec.
      */
     AVBufferRef *hw_frames_ctx;
+
+    /**
+     * Control the form of AVSubtitle.rects[N]->ass
+     * - decoding: set by user
+     * - encoding: unused
+     */
+    int sub_text_format;
+#define FF_SUB_TEXT_FMT_ASS              0
+#if FF_API_ASS_TIMING
+#define FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS 1
+#endif
+
 } AVCodecContext;
 
+AVRational av_codec_get_pkt_timebase         (const AVCodecContext *avctx);
+void       av_codec_set_pkt_timebase         (AVCodecContext *avctx, AVRational val);
+
+const AVCodecDescriptor *av_codec_get_codec_descriptor(const AVCodecContext *avctx);
+void                     av_codec_set_codec_descriptor(AVCodecContext *avctx, const AVCodecDescriptor *desc);
+
+unsigned av_codec_get_codec_properties(const AVCodecContext *avctx);
+
+int  av_codec_get_lowres(const AVCodecContext *avctx);
+void av_codec_set_lowres(AVCodecContext *avctx, int val);
+
+int  av_codec_get_seek_preroll(const AVCodecContext *avctx);
+void av_codec_set_seek_preroll(AVCodecContext *avctx, int val);
+
+uint16_t *av_codec_get_chroma_intra_matrix(const AVCodecContext *avctx);
+void av_codec_set_chroma_intra_matrix(AVCodecContext *avctx, uint16_t *val);
+
 /**
  * AVProfile.
  */
@@ -3119,9 +3563,7 @@ typedef struct AVCodec {
     const int *supported_samplerates;       ///< array of supported audio samplerates, or NULL if unknown, array is terminated by 0
     const enum AVSampleFormat *sample_fmts; ///< array of supported sample formats, or NULL if unknown, array is terminated by -1
     const uint64_t *channel_layouts;         ///< array of support channel layouts, or NULL if unknown. array is terminated by 0
-#if FF_API_LOWRES
-    attribute_deprecated uint8_t max_lowres; ///< maximum value for lowres supported by the decoder
-#endif
+    uint8_t max_lowres;                     ///< maximum value for lowres supported by the decoder, no direct access, use av_codec_get_max_lowres()
     const AVClass *priv_class;              ///< AVClass for the private context
     const AVProfile *profiles;              ///< array of recognized profiles, or NULL if unknown, array is terminated by {FF_PROFILE_UNKNOWN}
 
@@ -3208,6 +3650,10 @@ typedef struct AVCodec {
     int caps_internal;
 } AVCodec;
 
+int av_codec_get_max_lowres(const AVCodec *codec);
+
+struct MpegEncContext;
+
 /**
  * @defgroup lavc_hwaccel AVHWAccel
  * @{
@@ -3243,7 +3689,7 @@ typedef struct AVHWAccel {
 
     /**
      * Hardware accelerated codec capabilities.
-     * see FF_HWACCEL_CODEC_CAP_*
+     * see HWACCEL_CODEC_CAP_*
      */
     int capabilities;
 
@@ -3282,6 +3728,7 @@ typedef struct AVHWAccel {
      *
      * Meaningful slice information (codec specific) is guaranteed to
      * be parsed at this point. This function is mandatory.
+     * The only exception is XvMC, that works on MB level.
      *
      * @param avctx the codec context
      * @param buf the slice data buffer base
@@ -3311,6 +3758,17 @@ typedef struct AVHWAccel {
     int frame_priv_data_size;
 
     /**
+     * Called for every Macroblock in a slice.
+     *
+     * XvMC uses it to replace the ff_mpv_decode_mb().
+     * Instead of decoding to raw picture, MB parameters are
+     * stored in an array provided by the video driver.
+     *
+     * @param s the mpeg context
+     */
+    void (*decode_mb)(struct MpegEncContext *s);
+
+    /**
      * Initialize the hwaccel private data.
      *
      * This will be called from ff_get_format(), after hwaccel and
@@ -3338,6 +3796,9 @@ typedef struct AVHWAccel {
  * Hardware acceleration should be used for decoding even if the codec level
  * used is unknown or higher than the maximum supported level reported by the
  * hardware driver.
+ *
+ * It's generally a good idea to pass this flag unless you have a specific
+ * reason not to, as hardware tends to under-report supported levels.
  */
 #define AV_HWACCEL_FLAG_IGNORE_LEVEL (1 << 0)
 
@@ -3360,13 +3821,15 @@ typedef struct AVHWAccel {
  */
 
 /**
- * four components are given, that's all.
- * the last component is alpha
- * @deprecated Use the imgutils functions
+ * Picture data structure.
+ *
+ * Up to four components can be stored into it, the last component is
+ * alpha.
+ * @deprecated use AVFrame or imgutils functions instead
  */
 typedef struct AVPicture {
     attribute_deprecated
-    uint8_t *data[AV_NUM_DATA_POINTERS];
+    uint8_t *data[AV_NUM_DATA_POINTERS];    ///< pointers to the image data planes
     attribute_deprecated
     int linesize[AV_NUM_DATA_POINTERS];     ///< number of bytes per line
 } AVPicture;
@@ -3376,9 +3839,6 @@ typedef struct AVPicture {
  */
 #endif
 
-#define AVPALETTE_SIZE 1024
-#define AVPALETTE_COUNT 256
-
 enum AVSubtitleType {
     SUBTITLE_NONE,
 
@@ -3430,6 +3890,7 @@ typedef struct AVSubtitleRect {
      * struct.
      */
     char *ass;
+
     int flags;
 } AVSubtitleRect;
 
@@ -3486,11 +3947,35 @@ typedef struct AVCodecParameters {
     /**
      * The average bitrate of the encoded data (in bits per second).
      */
-    int bit_rate;
+    int64_t bit_rate;
 
+    /**
+     * The number of bits per sample in the codewords.
+     *
+     * This is basically the bitrate per sample. It is mandatory for a bunch of
+     * formats to actually decode them. It's the number of bits for one sample in
+     * the actual coded bitstream.
+     *
+     * This could be for example 4 for ADPCM
+     * For PCM formats this matches bits_per_raw_sample
+     * Can be 0
+     */
     int bits_per_coded_sample;
 
     /**
+     * This is the number of valid bits in each output sample. If the
+     * sample format has more bits, the least significant bits are additional
+     * padding bits, which are always 0. Use right shifts to reduce the sample
+     * to its actual size. For example, audio formats with 24 bit samples will
+     * have bits_per_raw_sample set to 24, and format set to AV_SAMPLE_FMT_S32.
+     * To get the original sample use "(int32_t)sample >> 8".
+     *
+     * For ADPCM this might be 12 or 16 or similar
+     * Can be 0
+     */
+    int bits_per_raw_sample;
+
+    /**
      * Codec-specific bitstream restrictions that the stream conforms to.
      */
     int profile;
@@ -3526,6 +4011,11 @@ typedef struct AVCodecParameters {
     enum AVChromaLocation              chroma_location;
 
     /**
+     * Video only. Number of delayed frames.
+     */
+    int video_delay;
+
+    /**
      * Audio only. The channel layout bitmask. May be 0 if the channel layout is
      * unknown or unspecified, otherwise the number of bits set must be equal to
      * the channels field.
@@ -3546,6 +4036,10 @@ typedef struct AVCodecParameters {
      * Corresponds to nBlockAlign in WAVEFORMATEX.
      */
     int      block_align;
+    /**
+     * Audio only. Audio frame size, if known. Required by some formats to be static.
+     */
+    int      frame_size;
 
     /**
      * Audio only. The amount of padding (in samples) inserted by the encoder at
@@ -3561,6 +4055,10 @@ typedef struct AVCodecParameters {
      * audio without any trailing padding.
      */
     int trailing_padding;
+    /**
+     * Audio only. Number of samples to skip after a discontinuity.
+     */
+    int seek_preroll;
 } AVCodecParameters;
 
 /**
@@ -3649,13 +4147,29 @@ int avcodec_get_context_defaults3(AVCodecContext *s, const AVCodec *codec);
 const AVClass *avcodec_get_class(void);
 
 /**
+ * Get the AVClass for AVFrame. It can be used in combination with
+ * AV_OPT_SEARCH_FAKE_OBJ for examining options.
+ *
+ * @see av_opt_find().
+ */
+const AVClass *avcodec_get_frame_class(void);
+
+/**
+ * Get the AVClass for AVSubtitleRect. It can be used in combination with
+ * AV_OPT_SEARCH_FAKE_OBJ for examining options.
+ *
+ * @see av_opt_find().
+ */
+const AVClass *avcodec_get_subtitle_rect_class(void);
+
+/**
  * Copy the settings of the source AVCodecContext into the destination
  * AVCodecContext. The resulting destination codec context will be
  * unopened, i.e. you are required to call avcodec_open2() before you
  * can use this AVCodecContext to decode/encode video/audio data.
  *
  * @param dest target codec context, should be initialized with
- *             avcodec_alloc_context3(), but otherwise uninitialized
+ *             avcodec_alloc_context3(NULL), but otherwise uninitialized
  * @param src source codec context
  * @return AVERROR() on error (e.g. memory allocation error), 0 on success
  */
@@ -3866,6 +4380,20 @@ int av_packet_from_data(AVPacket *pkt, uint8_t *data, int size);
 attribute_deprecated
 int av_dup_packet(AVPacket *pkt);
 /**
+ * Copy packet, including contents
+ *
+ * @return 0 on success, negative AVERROR on fail
+ */
+int av_copy_packet(AVPacket *dst, const AVPacket *src);
+
+/**
+ * Copy packet side data
+ *
+ * @return 0 on success, negative AVERROR on fail
+ */
+int av_copy_packet_side_data(AVPacket *dst, const AVPacket *src);
+
+/**
  * Free a packet.
  *
  * @deprecated Use av_packet_unref
@@ -3924,6 +4452,31 @@ int av_packet_shrink_side_data(AVPacket *pkt, enum AVPacketSideDataType type,
 uint8_t* av_packet_get_side_data(AVPacket *pkt, enum AVPacketSideDataType type,
                                  int *size);
 
+int av_packet_merge_side_data(AVPacket *pkt);
+
+int av_packet_split_side_data(AVPacket *pkt);
+
+const char *av_packet_side_data_name(enum AVPacketSideDataType type);
+
+/**
+ * Pack a dictionary for use in side_data.
+ *
+ * @param dict The dictionary to pack.
+ * @param size pointer to store the size of the returned data
+ * @return pointer to data if successful, NULL otherwise
+ */
+uint8_t *av_packet_pack_dictionary(AVDictionary *dict, int *size);
+/**
+ * Unpack a dictionary from side_data.
+ *
+ * @param data data from side_data
+ * @param size size of the data
+ * @param dict the metadata storage dictionary
+ * @return 0 on success, < 0 on failure
+ */
+int av_packet_unpack_dictionary(const uint8_t *data, int size, AVDictionary **dict);
+
+
 /**
  * Convenience function to free all the side data stored.
  * All the other fields stay untouched.
@@ -3948,7 +4501,7 @@ void av_packet_free_side_data(AVPacket *pkt);
  *
  * @return 0 on success, a negative AVERROR on error.
  */
-int av_packet_ref(AVPacket *dst, AVPacket *src);
+int av_packet_ref(AVPacket *dst, const AVPacket *src);
 
 /**
  * Wipe the packet.
@@ -4063,6 +4616,28 @@ void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height,
                                int linesize_align[AV_NUM_DATA_POINTERS]);
 
 /**
+ * Converts AVChromaLocation to swscale x/y chroma position.
+ *
+ * The positions represent the chroma (0,0) position in a coordinate system
+ * with luma (0,0) representing the origin and luma (1,1) representing 256,256.
+ *
+ * @param xpos  horizontal chroma sample position
+ * @param ypos  vertical   chroma sample position
+ */
+int avcodec_enum_to_chroma_pos(int *xpos, int *ypos, enum AVChromaLocation pos);
+
+/**
+ * Converts swscale x/y chroma position to AVChromaLocation.
+ *
+ * The positions represent the chroma (0,0) position in a coordinate system
+ * with luma (0,0) representing the origin and luma (1,1) representing 256,256.
+ *
+ * @param xpos  horizontal chroma sample position
+ * @param ypos  vertical   chroma sample position
+ */
+enum AVChromaLocation avcodec_chroma_pos_to_enum(int xpos, int ypos);
+
+/**
  * Decode the audio frame of size avpkt->size from avpkt->data into frame.
  *
  * Some decoders may support multiple frames in a single AVPacket. Such
@@ -4117,7 +4692,7 @@ void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height,
  */
 attribute_deprecated
 int avcodec_decode_audio4(AVCodecContext *avctx, AVFrame *frame,
-                          int *got_frame_ptr, AVPacket *avpkt);
+                          int *got_frame_ptr, const AVPacket *avpkt);
 
 /**
  * Decode the video frame of size avpkt->size from avpkt->data into picture.
@@ -4167,7 +4742,7 @@ int avcodec_decode_audio4(AVCodecContext *avctx, AVFrame *frame,
 attribute_deprecated
 int avcodec_decode_video2(AVCodecContext *avctx, AVFrame *picture,
                          int *got_picture_ptr,
-                         AVPacket *avpkt);
+                         const AVPacket *avpkt);
 
 /**
  * Decode a subtitle message.
@@ -4179,12 +4754,20 @@ int avcodec_decode_video2(AVCodecContext *avctx, AVFrame *picture,
  * and reusing a get_buffer written for video codecs would probably perform badly
  * due to a potentially very different allocation pattern.
  *
+ * Some decoders (those marked with CODEC_CAP_DELAY) have a delay between input
+ * and output. This means that for some packets they will not immediately
+ * produce decoded output and need to be flushed at the end of decoding to get
+ * all the decoded data. Flushing is done by calling this function with packets
+ * with avpkt->data set to NULL and avpkt->size set to 0 until it stops
+ * returning subtitles. It is safe to flush even those decoders that are not
+ * marked with CODEC_CAP_DELAY; in that case no subtitles will be returned.
+ *
  * @note The AVCodecContext MUST have been opened with @ref avcodec_open2()
  * before packets may be fed to the decoder.
  *
  * @param avctx the codec context
- * @param[out] sub The AVSubtitle in which the decoded subtitle will be stored, must be
-                   freed with avsubtitle_free if *got_sub_ptr is set.
+ * @param[out] sub The preallocated AVSubtitle in which the decoded subtitle will be stored,
+ *                 must be freed with avsubtitle_free if *got_sub_ptr is set.
  * @param[in,out] got_sub_ptr Zero if no subtitle could be decompressed, otherwise, it is nonzero.
  * @param[in] avpkt The input AVPacket containing the input buffer.
  */
@@ -4365,6 +4948,7 @@ typedef struct AVCodecParserContext {
 #define PARSER_FLAG_ONCE                      0x0002
 /// Set if the parser has a valid file offset
 #define PARSER_FLAG_FETCHED_OFFSET            0x0004
+#define PARSER_FLAG_USE_CODEC_TS              0x1000
 
     int64_t offset;      ///< byte offset from starting packet start
     int64_t cur_frame_end[AV_PARSER_PTS_NB];
@@ -4550,7 +5134,7 @@ int av_parser_parse2(AVCodecParserContext *s,
 
 /**
  * @return 0 if the output buffer is a subset of the input, 1 if it is allocated and must be freed
- * @deprecated use AVBitstreamFilter
+ * @deprecated use AVBitStreamFilter
  */
 int av_parser_change(AVCodecParserContext *s,
                      AVCodecContext *avctx,
@@ -4597,11 +5181,12 @@ AVCodec *avcodec_find_encoder_by_name(const char *name);
  *                  The user can supply an output buffer by setting
  *                  avpkt->data and avpkt->size prior to calling the
  *                  function, but if the size of the user-provided data is not
- *                  large enough, encoding will fail. All other AVPacket fields
- *                  will be reset by the encoder using av_init_packet(). If
- *                  avpkt->data is NULL, the encoder will allocate it.
- *                  The encoder will set avpkt->size to the size of the
- *                  output packet.
+ *                  large enough, encoding will fail. If avpkt->data and
+ *                  avpkt->size are set, avpkt->destruct must also be set. All
+ *                  other AVPacket fields will be reset by the encoder using
+ *                  av_init_packet(). If avpkt->data is NULL, the encoder will
+ *                  allocate it. The encoder will set avpkt->size to the size
+ *                  of the output packet.
  *
  *                  If this function fails or produces no output, avpkt will be
  *                  freed using av_packet_unref().
@@ -4668,6 +5253,103 @@ int avcodec_encode_subtitle(AVCodecContext *avctx, uint8_t *buf, int buf_size,
  * @}
  */
 
+#if FF_API_AVCODEC_RESAMPLE
+/**
+ * @defgroup lavc_resample Audio resampling
+ * @ingroup libavc
+ * @deprecated use libswresample instead
+ *
+ * @{
+ */
+struct ReSampleContext;
+struct AVResampleContext;
+
+typedef struct ReSampleContext ReSampleContext;
+
+/**
+ *  Initialize audio resampling context.
+ *
+ * @param output_channels  number of output channels
+ * @param input_channels   number of input channels
+ * @param output_rate      output sample rate
+ * @param input_rate       input sample rate
+ * @param sample_fmt_out   requested output sample format
+ * @param sample_fmt_in    input sample format
+ * @param filter_length    length of each FIR filter in the filterbank relative to the cutoff frequency
+ * @param log2_phase_count log2 of the number of entries in the polyphase filterbank
+ * @param linear           if 1 then the used FIR filter will be linearly interpolated
+                           between the 2 closest, if 0 the closest will be used
+ * @param cutoff           cutoff frequency, 1.0 corresponds to half the output sampling rate
+ * @return allocated ReSampleContext, NULL if error occurred
+ */
+attribute_deprecated
+ReSampleContext *av_audio_resample_init(int output_channels, int input_channels,
+                                        int output_rate, int input_rate,
+                                        enum AVSampleFormat sample_fmt_out,
+                                        enum AVSampleFormat sample_fmt_in,
+                                        int filter_length, int log2_phase_count,
+                                        int linear, double cutoff);
+
+attribute_deprecated
+int audio_resample(ReSampleContext *s, short *output, short *input, int nb_samples);
+
+/**
+ * Free resample context.
+ *
+ * @param s a non-NULL pointer to a resample context previously
+ *          created with av_audio_resample_init()
+ */
+attribute_deprecated
+void audio_resample_close(ReSampleContext *s);
+
+
+/**
+ * Initialize an audio resampler.
+ * Note, if either rate is not an integer then simply scale both rates up so they are.
+ * @param filter_length length of each FIR filter in the filterbank relative to the cutoff freq
+ * @param log2_phase_count log2 of the number of entries in the polyphase filterbank
+ * @param linear If 1 then the used FIR filter will be linearly interpolated
+                 between the 2 closest, if 0 the closest will be used
+ * @param cutoff cutoff frequency, 1.0 corresponds to half the output sampling rate
+ */
+attribute_deprecated
+struct AVResampleContext *av_resample_init(int out_rate, int in_rate, int filter_length, int log2_phase_count, int linear, double cutoff);
+
+/**
+ * Resample an array of samples using a previously configured context.
+ * @param src an array of unconsumed samples
+ * @param consumed the number of samples of src which have been consumed is returned here
+ * @param src_size the number of unconsumed samples available
+ * @param dst_size the amount of space in samples available in dst
+ * @param update_ctx If this is 0 then the context will not be modified, that way several channels can be resampled with the same context.
+ * @return the number of samples written in dst or -1 if an error occurred
+ */
+attribute_deprecated
+int av_resample(struct AVResampleContext *c, short *dst, short *src, int *consumed, int src_size, int dst_size, int update_ctx);
+
+
+/**
+ * Compensate samplerate/timestamp drift. The compensation is done by changing
+ * the resampler parameters, so no audible clicks or similar distortions occur
+ * @param compensation_distance distance in output samples over which the compensation should be performed
+ * @param sample_delta number of samples by which the output should be reduced
+ *
+ * example: av_resample_compensate(c, 10, 500)
+ * here instead of 510 samples only 500 samples would be output
+ *
+ * note, due to rounding the actual compensation might be slightly different,
+ * especially if the compensation_distance is large and the in_rate used during init is small
+ */
+attribute_deprecated
+void av_resample_compensate(struct AVResampleContext *c, int sample_delta, int compensation_distance);
+attribute_deprecated
+void av_resample_close(struct AVResampleContext *c);
+
+/**
+ * @}
+ */
+#endif
+
 #if FF_API_AVPICTURE
 /**
  * @addtogroup lavc_picture
@@ -4690,14 +5372,14 @@ void avpicture_free(AVPicture *picture);
  * @deprecated use av_image_fill_arrays() instead.
  */
 attribute_deprecated
-int avpicture_fill(AVPicture *picture, uint8_t *ptr,
+int avpicture_fill(AVPicture *picture, const uint8_t *ptr,
                    enum AVPixelFormat pix_fmt, int width, int height);
 
 /**
  * @deprecated use av_image_copy_to_buffer() instead.
  */
 attribute_deprecated
-int avpicture_layout(const AVPicture* src, enum AVPixelFormat pix_fmt,
+int avpicture_layout(const AVPicture *src, enum AVPixelFormat pix_fmt,
                      int width, int height,
                      unsigned char *dest, int dest_size);
 
@@ -4750,10 +5432,21 @@ int av_picture_pad(AVPicture *dst, const AVPicture *src, int height, int width,
  */
 
 /**
- * @deprecated Use av_pix_fmt_get_chroma_sub_sample
+ * Utility function to access log2_chroma_w and log2_chroma_h from
+ * the pixel format AVPixFmtDescriptor.
+ *
+ * This function asserts that pix_fmt is valid. See av_pix_fmt_get_chroma_sub_sample
+ * for one that returns a failure code and continues in case of invalid
+ * pix_fmts.
+ *
+ * @param[in]  pix_fmt the pixel format
+ * @param[out] h_shift store log2_chroma_w
+ * @param[out] v_shift store log2_chroma_h
+ *
+ * @see av_pix_fmt_get_chroma_sub_sample
  */
 
-void attribute_deprecated avcodec_get_chroma_sub_sample(enum AVPixelFormat pix_fmt, int *h_shift, int *v_shift);
+void avcodec_get_chroma_sub_sample(enum AVPixelFormat pix_fmt, int *h_shift, int *v_shift);
 
 /**
  * Return a value representing the fourCC code associated to the
@@ -4762,29 +5455,8 @@ void attribute_deprecated avcodec_get_chroma_sub_sample(enum AVPixelFormat pix_f
  */
 unsigned int avcodec_pix_fmt_to_codec_tag(enum AVPixelFormat pix_fmt);
 
-#define FF_LOSS_RESOLUTION  0x0001 /**< loss due to resolution change */
-#define FF_LOSS_DEPTH       0x0002 /**< loss due to color depth change */
-#define FF_LOSS_COLORSPACE  0x0004 /**< loss due to color space conversion */
-#define FF_LOSS_ALPHA       0x0008 /**< loss of alpha bits */
-#define FF_LOSS_COLORQUANT  0x0010 /**< loss due to color quantization */
-#define FF_LOSS_CHROMA      0x0020 /**< loss of chroma (e.g. RGB to gray conversion) */
-
-/**
- * Compute what kind of losses will occur when converting from one specific
- * pixel format to another.
- * When converting from one pixel format to another, information loss may occur.
- * For example, when converting from RGB24 to GRAY, the color information will
- * be lost. Similarly, other losses occur when converting from some formats to
- * other formats. These losses can involve loss of chroma, but also loss of
- * resolution, loss of color depth, loss due to the color space conversion, loss
- * of the alpha bits or loss due to color quantization.
- * avcodec_get_fix_fmt_loss() informs you about the various types of losses
- * which will occur when converting from one pixel format to another.
- *
- * @param[in] dst_pix_fmt destination pixel format
- * @param[in] src_pix_fmt source pixel format
- * @param[in] has_alpha Whether the source pixel format alpha channel is used.
- * @return Combination of flags informing you what kind of losses will occur.
+/**
+ * @deprecated see av_get_pix_fmt_loss()
  */
 int avcodec_get_pix_fmt_loss(enum AVPixelFormat dst_pix_fmt, enum AVPixelFormat src_pix_fmt,
                              int has_alpha);
@@ -4794,7 +5466,7 @@ int avcodec_get_pix_fmt_loss(enum AVPixelFormat dst_pix_fmt, enum AVPixelFormat
  * format.  When converting from one pixel format to another, information loss
  * may occur.  For example, when converting from RGB24 to GRAY, the color
  * information will be lost. Similarly, other losses occur when converting from
- * some formats to other formats. avcodec_find_best_pix_fmt2() searches which of
+ * some formats to other formats. avcodec_find_best_pix_fmt_of_list() searches which of
  * the given pixel formats should be used to suffer the least amount of loss.
  * The pixel formats from which it chooses one, are determined by the
  * pix_fmt_list parameter.
@@ -4806,9 +5478,26 @@ int avcodec_get_pix_fmt_loss(enum AVPixelFormat dst_pix_fmt, enum AVPixelFormat
  * @param[out] loss_ptr Combination of flags informing you what kind of losses will occur.
  * @return The best pixel format to convert to or -1 if none was found.
  */
-enum AVPixelFormat avcodec_find_best_pix_fmt2(enum AVPixelFormat *pix_fmt_list,
+enum AVPixelFormat avcodec_find_best_pix_fmt_of_list(const enum AVPixelFormat *pix_fmt_list,
+                                            enum AVPixelFormat src_pix_fmt,
+                                            int has_alpha, int *loss_ptr);
+
+/**
+ * @deprecated see av_find_best_pix_fmt_of_2()
+ */
+enum AVPixelFormat avcodec_find_best_pix_fmt_of_2(enum AVPixelFormat dst_pix_fmt1, enum AVPixelFormat dst_pix_fmt2,
+                                            enum AVPixelFormat src_pix_fmt, int has_alpha, int *loss_ptr);
+
+attribute_deprecated
+#if AV_HAVE_INCOMPATIBLE_LIBAV_ABI
+enum AVPixelFormat avcodec_find_best_pix_fmt2(const enum AVPixelFormat *pix_fmt_list,
                                               enum AVPixelFormat src_pix_fmt,
                                               int has_alpha, int *loss_ptr);
+#else
+enum AVPixelFormat avcodec_find_best_pix_fmt2(enum AVPixelFormat dst_pix_fmt1, enum AVPixelFormat dst_pix_fmt2,
+                                            enum AVPixelFormat src_pix_fmt, int has_alpha, int *loss_ptr);
+#endif
+
 
 enum AVPixelFormat avcodec_default_get_format(struct AVCodecContext *s, const enum AVPixelFormat * fmt);
 
@@ -4864,7 +5553,12 @@ int avcodec_default_execute2(AVCodecContext *c, int (*func)(AVCodecContext *c2,
 //FIXME func typedef
 
 /**
- * Fill audio frame data and linesize.
+ * Fill AVFrame audio data and linesize pointers.
+ *
+ * The buffer buf must be a preallocated buffer with a size big enough
+ * to contain the specified samples amount. The filled AVFrame data
+ * pointers will point to this buffer.
+ *
  * AVFrame extended_data channel pointers are allocated if necessary for
  * planar audio.
  *
@@ -4877,7 +5571,9 @@ int avcodec_default_execute2(AVCodecContext *c, int (*func)(AVCodecContext *c2,
  * @param buf         buffer to use for frame data
  * @param buf_size    size of buffer
  * @param align       plane size sample alignment (0 = default)
- * @return            0 on success, negative error code on failure
+ * @return            >=0 on success, negative error code on failure
+ * @todo return the size in bytes required to store the samples in
+ * case of success, at the next libavutil bump
  */
 int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
                              enum AVSampleFormat sample_fmt, const uint8_t *buf,
@@ -4903,6 +5599,14 @@ void avcodec_flush_buffers(AVCodecContext *avctx);
 int av_get_bits_per_sample(enum AVCodecID codec_id);
 
 /**
+ * Return the PCM codec associated with a sample format.
+ * @param be  endianness, 0 for little, 1 for big,
+ *            -1 (or anything else) for native
+ * @return  AV_CODEC_ID_PCM_* or AV_CODEC_ID_NONE
+ */
+enum AVCodecID av_get_pcm_codec(enum AVSampleFormat fmt, int be);
+
+/**
  * Return codec bits per sample.
  * Only return non-zero if the bits per sample is exactly correct, not an
  * approximation.
@@ -4934,6 +5638,11 @@ typedef struct AVBitStreamFilterContext {
     struct AVBitStreamFilter *filter;
     AVCodecParserContext *parser;
     struct AVBitStreamFilterContext *next;
+    /**
+     * Internal default arguments, used if NULL is passed to av_bitstream_filter_filter().
+     * Not for access by library users.
+     */
+    char *args;
 } AVBitStreamFilterContext;
 #endif
 
@@ -5032,20 +5741,82 @@ typedef struct AVBitStreamFilter {
 
 #if FF_API_OLD_BSF
 /**
- * @deprecated the old bitstream filtering API (using AVBitStreamFilterContext)
- * is deprecated. Use the new bitstream filtering API (using AVBSFContext).
+ * Register a bitstream filter.
+ *
+ * The filter will be accessible to the application code through
+ * av_bitstream_filter_next() or can be directly initialized with
+ * av_bitstream_filter_init().
+ *
+ * @see avcodec_register_all()
  */
 attribute_deprecated
 void av_register_bitstream_filter(AVBitStreamFilter *bsf);
+
+/**
+ * Create and initialize a bitstream filter context given a bitstream
+ * filter name.
+ *
+ * The returned context must be freed with av_bitstream_filter_close().
+ *
+ * @param name    the name of the bitstream filter
+ * @return a bitstream filter context if a matching filter was found
+ * and successfully initialized, NULL otherwise
+ */
 attribute_deprecated
 AVBitStreamFilterContext *av_bitstream_filter_init(const char *name);
+
+/**
+ * Filter bitstream.
+ *
+ * This function filters the buffer buf with size buf_size, and places the
+ * filtered buffer in the buffer pointed to by poutbuf.
+ *
+ * The output buffer must be freed by the caller.
+ *
+ * @param bsfc            bitstream filter context created by av_bitstream_filter_init()
+ * @param avctx           AVCodecContext accessed by the filter, may be NULL.
+ *                        If specified, this must point to the encoder context of the
+ *                        output stream the packet is sent to.
+ * @param args            arguments which specify the filter configuration, may be NULL
+ * @param poutbuf         pointer which is updated to point to the filtered buffer
+ * @param poutbuf_size    pointer which is updated to the filtered buffer size in bytes
+ * @param buf             buffer containing the data to filter
+ * @param buf_size        size in bytes of buf
+ * @param keyframe        set to non-zero if the buffer to filter corresponds to a key-frame packet data
+ * @return >= 0 in case of success, or a negative error code in case of failure
+ *
+ * If the return value is positive, an output buffer is allocated and
+ * is available in *poutbuf, and is distinct from the input buffer.
+ *
+ * If the return value is 0, the output buffer is not allocated and
+ * should be considered identical to the input buffer, or in case
+ * *poutbuf was set it points to the input buffer (not necessarily to
+ * its starting address). A special case is if *poutbuf was set to NULL and
+ * *poutbuf_size was set to 0, which indicates the packet should be dropped.
+ */
 attribute_deprecated
 int av_bitstream_filter_filter(AVBitStreamFilterContext *bsfc,
                                AVCodecContext *avctx, const char *args,
                                uint8_t **poutbuf, int *poutbuf_size,
                                const uint8_t *buf, int buf_size, int keyframe);
+
+/**
+ * Release bitstream filter context.
+ *
+ * @param bsf the bitstream filter context created with
+ * av_bitstream_filter_init(), can be NULL
+ */
 attribute_deprecated
 void av_bitstream_filter_close(AVBitStreamFilterContext *bsf);
+
+/**
+ * If f is NULL, return the first registered bitstream filter,
+ * if f is non-NULL, return the next registered bitstream filter
+ * after f, or NULL if f is the last one.
+ *
+ * This function can be used to iterate over all registered bitstream
+ * filters.
+ */
 attribute_deprecated
 AVBitStreamFilter *av_bitstream_filter_next(const AVBitStreamFilter *f);
 #endif
@@ -5147,14 +5918,21 @@ const AVClass *av_bsf_get_class(void);
 /* memory */
 
 /**
- * Allocate a buffer with padding, reusing the given one if large enough.
- *
  * Same behaviour av_fast_malloc but the buffer has additional
- * AV_INPUT_PADDING_SIZE at the end which will always memset to 0.
+ * AV_INPUT_BUFFER_PADDING_SIZE at the end which will always be 0.
+ *
+ * In addition the whole buffer will initially and after resizes
+ * be 0-initialized so that no uninitialized data will ever appear.
  */
 void av_fast_padded_malloc(void *ptr, unsigned int *size, size_t min_size);
 
 /**
+ * Same behaviour as av_fast_padded_malloc(), except that the buffer will
+ * always be 0-initialized after the call.
+ */
+void av_fast_padded_mallocz(void *ptr, unsigned int *size, size_t min_size);
+
+/**
  * Encode extradata length to a buffer. Used by xiph codecs.
  *
  * @param s buffer to write to; must be at least (v/255+1) bytes long
@@ -5166,7 +5944,7 @@ unsigned int av_xiphlacing(unsigned char *s, unsigned int v);
 #if FF_API_MISSING_SAMPLE
 /**
  * Log a generic warning message about a missing feature. This function is
- * intended to be used internally by Libav (libavcodec, libavformat, etc.)
+ * intended to be used internally by FFmpeg (libavcodec, libavformat, etc.)
  * only, and would normally not be used by applications.
  * @param[in] avc a pointer to an arbitrary struct of which the first field is
  * a pointer to an AVClass struct
@@ -5182,7 +5960,7 @@ void av_log_missing_feature(void *avc, const char *feature, int want_sample);
 
 /**
  * Log a generic warning message asking for a sample. This function is
- * intended to be used internally by Libav (libavcodec, libavformat, etc.)
+ * intended to be used internally by FFmpeg (libavcodec, libavformat, etc.)
  * only, and would normally not be used by applications.
  * @param[in] avc a pointer to an arbitrary struct of which the first field is
  * a pointer to an AVClass struct
@@ -5247,6 +6025,12 @@ int av_lockmgr_register(int (*cb)(void **mutex, enum AVLockOp op));
 enum AVMediaType avcodec_get_type(enum AVCodecID codec_id);
 
 /**
+ * Get the name of a codec.
+ * @return  a static string identifying the codec; never NULL
+ */
+const char *avcodec_get_name(enum AVCodecID id);
+
+/**
  * @return a positive value if s is open (i.e. avcodec_open2() was called on it
  * with no corresponding avcodec_close()), 0 otherwise.
  */
diff --git a/libavcodec/avcodecres.rc b/libavcodec/avcodecres.rc
new file mode 100644
index 0000000..4b69686
--- /dev/null
+++ b/libavcodec/avcodecres.rc
@@ -0,0 +1,55 @@
+/*
+ * Windows resource file for libavcodec
+ *
+ * Copyright (C) 2012 James Almer
+ * Copyright (C) 2013 Tiancheng "Timothy" Gu
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <windows.h>
+#include "libavcodec/version.h"
+#include "libavutil/ffversion.h"
+#include "config.h"
+
+1 VERSIONINFO
+FILEVERSION     LIBAVCODEC_VERSION_MAJOR, LIBAVCODEC_VERSION_MINOR, LIBAVCODEC_VERSION_MICRO, 0
+PRODUCTVERSION  LIBAVCODEC_VERSION_MAJOR, LIBAVCODEC_VERSION_MINOR, LIBAVCODEC_VERSION_MICRO, 0
+FILEFLAGSMASK   VS_FFI_FILEFLAGSMASK
+FILEOS          VOS_NT_WINDOWS32
+FILETYPE        VFT_DLL
+{
+    BLOCK "StringFileInfo"
+    {
+        BLOCK "040904B0"
+        {
+            VALUE "CompanyName",      "FFmpeg Project"
+            VALUE "FileDescription",  "FFmpeg codec library"
+            VALUE "FileVersion",      AV_STRINGIFY(LIBAVCODEC_VERSION)
+            VALUE "InternalName",     "libavcodec"
+            VALUE "LegalCopyright",   "Copyright (C) 2000-" AV_STRINGIFY(CONFIG_THIS_YEAR) " FFmpeg Project"
+            VALUE "OriginalFilename", "avcodec" BUILDSUF "-" AV_STRINGIFY(LIBAVCODEC_VERSION_MAJOR) SLIBSUF
+            VALUE "ProductName",      "FFmpeg"
+            VALUE "ProductVersion",   FFMPEG_VERSION
+        }
+    }
+
+    BLOCK "VarFileInfo"
+    {
+        VALUE "Translation", 0x0409, 0x04B0
+    }
+}
diff --git a/libavcodec/avdct.c b/libavcodec/avdct.c
new file mode 100644
index 0000000..80aca88
--- /dev/null
+++ b/libavcodec/avdct.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2014 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "idctdsp.h"
+#include "fdctdsp.h"
+#include "pixblockdsp.h"
+#include "avdct.h"
+
+#define OFFSET(x) offsetof(AVDCT,x)
+#define DEFAULT 0 //should be NAN but it does not work as it is not a constant in glibc as required by ANSI/ISO C
+//these names are too long to be readable
+#define V AV_OPT_FLAG_VIDEO_PARAM
+#define A AV_OPT_FLAG_AUDIO_PARAM
+#define E AV_OPT_FLAG_ENCODING_PARAM
+#define D AV_OPT_FLAG_DECODING_PARAM
+
+static const AVOption avdct_options[] = { /* AVOptions of AVDCT; the last string ("dct"/"idct") ties each named constant to its option */
+{"dct", "DCT algorithm", OFFSET(dct_algo), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX, V|E, "dct"},
+{"auto", "autoselect a good one", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_AUTO }, INT_MIN, INT_MAX, V|E, "dct"},
+{"fastint", "fast integer (experimental / for debugging)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_FASTINT }, INT_MIN, INT_MAX, V|E, "dct"},
+{"int", "accurate integer", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_INT }, INT_MIN, INT_MAX, V|E, "dct"},
+{"mmx", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_MMX }, INT_MIN, INT_MAX, V|E, "dct"},
+{"altivec", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_ALTIVEC }, INT_MIN, INT_MAX, V|E, "dct"},
+{"faan", "floating point AAN DCT (experimental / for debugging)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_FAAN }, INT_MIN, INT_MAX, V|E, "dct"},
+
+{"idct", "select IDCT implementation", OFFSET(idct_algo), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX, V|E|D, "idct"},
+{"auto", "autoselect a good one", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_AUTO }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"int", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_INT }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"simple", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLE }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"simplemmx", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEMMX }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"arm", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_ARM }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"altivec", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_ALTIVEC }, INT_MIN, INT_MAX, V|E|D, "idct"},
+#if FF_API_ARCH_SH4
+{"sh4", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SH4 }, INT_MIN, INT_MAX, V|E|D, "idct"},
+#endif
+{"simplearm", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEARM }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"simplearmv5te", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEARMV5TE }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"simplearmv6", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEARMV6 }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"simpleneon", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLENEON }, INT_MIN, INT_MAX, V|E|D, "idct"},
+#if FF_API_ARCH_ALPHA
+{"simplealpha", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEALPHA }, INT_MIN, INT_MAX, V|E|D, "idct"},
+#endif
+#if FF_API_UNUSED_MEMBERS
+{"ipp", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_IPP }, INT_MIN, INT_MAX, V|E|D, "idct"},
+#endif
+{"xvid", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_XVID }, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"xvidmmx", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_XVID }, INT_MIN, INT_MAX, V|E|D, "idct"}, /* same value as "xvid" */
+{"faani", "floating point AAN IDCT (experimental / for debugging)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_FAAN }, INT_MIN, INT_MAX, V|D|E, "idct"},
+{"simpleauto", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEAUTO }, INT_MIN, INT_MAX, V|E|D, "idct"},
+
+{"bits_per_sample", "", OFFSET(bits_per_sample), AV_OPT_TYPE_INT, {.i64 = 8 }, 0, 14, 0,}, /* default 8, accepted range 0..14, no flags */
+{NULL}, /* table terminator */
+};
+
+static const AVClass avdct_class = { /* AVClass attached to every AVDCT by avcodec_dct_alloc() */
+    .class_name              = "AVDCT",
+    .option                  = avdct_options, /* option table defined above */
+    .version                 = LIBAVUTIL_VERSION_INT,
+};
+
+const AVClass *avcodec_dct_get_class(void)
+{
+    return &avdct_class; /* the file-scope AVClass that carries avdct_options */
+}
+
+AVDCT *avcodec_dct_alloc(void)
+{
+    AVDCT *ctx = av_mallocz(sizeof(*ctx));
+
+    if (ctx) {
+        /* the class pointer has to be in place before defaults are applied */
+        ctx->av_class = &avdct_class;
+        av_opt_set_defaults(ctx);
+    }
+
+    return ctx; /* NULL when the allocation failed */
+}
+
+int avcodec_dct_init(AVDCT *dsp) /* fills dsp's function pointers / permutation table; returns 0 or AVERROR(ENOMEM) */
+{
+    AVCodecContext *avctx = avcodec_alloc_context3(NULL); /* scratch context, only used to drive the ff_*dsp_init() calls */
+
+    if (!avctx)
+        return AVERROR(ENOMEM);
+
+    avctx->idct_algo = dsp->idct_algo; /* hand the settings stored in dsp over to the scratch context */
+    avctx->dct_algo  = dsp->dct_algo;
+    avctx->bits_per_raw_sample = dsp->bits_per_sample;
+
+#define COPY(src, name) memcpy(&dsp->name, &src.name, sizeof(dsp->name)) /* copy the same-named member from a DSP context into dsp */
+
+#if CONFIG_IDCTDSP
+    {
+        IDCTDSPContext idsp;
+        ff_idctdsp_init(&idsp, avctx);
+        COPY(idsp, idct);
+        COPY(idsp, idct_permutation);
+    }
+#endif
+
+#if CONFIG_FDCTDSP
+    {
+        FDCTDSPContext fdsp;
+        ff_fdctdsp_init(&fdsp, avctx);
+        COPY(fdsp, fdct);
+    }
+#endif
+
+#if CONFIG_PIXBLOCKDSP
+    {
+        PixblockDSPContext pdsp;
+        ff_pixblockdsp_init(&pdsp, avctx);
+        COPY(pdsp, get_pixels);
+    }
+#endif
+
+    avcodec_close(avctx); /* the scratch context is no longer needed once the members are copied */
+    av_free(avctx);
+
+    return 0;
+}
diff --git a/libavcodec/avdct.h b/libavcodec/avdct.h
new file mode 100644
index 0000000..272422e
--- /dev/null
+++ b/libavcodec/avdct.h
@@ -0,0 +1,84 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AVDCT_H
+#define AVCODEC_AVDCT_H
+
+#include "libavutil/opt.h"
+
+/**
+ * AVDCT context.
+ * @note function pointers can be NULL if the specific features have been
+ *       disabled at build time.
+ */
+typedef struct AVDCT {
+    const AVClass *av_class; /* set by avcodec_dct_alloc() to the AVDCT option class */
+
+    void (*idct)(int16_t *block /* align 16 */); /* copied from the IDCT DSP context by avcodec_dct_init() */
+
+    /**
+     * IDCT input permutation.
+     * Several optimized IDCTs need a permuted input (relative to the
+     * normal order of the reference IDCT).
+     * This permutation must be performed before the idct_put/add.
+     * Note, normally this can be merged with the zigzag/alternate scan<br>
+     * An example to avoid confusion:
+     * - (->decode coeffs -> zigzag reorder -> dequant -> reference IDCT -> ...)
+     * - (x -> reference DCT -> reference IDCT -> x)
+     * - (x -> reference DCT -> simple_mmx_perm = idct_permutation
+     *    -> simple_idct_mmx -> x)
+     * - (-> decode coeffs -> zigzag reorder -> simple_mmx_perm -> dequant
+     *    -> simple_idct_mmx -> ...)
+     */
+    uint8_t idct_permutation[64];
+
+    void (*fdct)(int16_t *block /* align 16 */); /* copied from the FDCT DSP context by avcodec_dct_init() */
+
+
+    /**
+     * DCT algorithm.
+     * must use AVOptions to set this field.
+     */
+    int dct_algo;
+
+    /**
+     * IDCT algorithm.
+     * must use AVOptions to set this field.
+     */
+    int idct_algo;
+
+    void (*get_pixels)(int16_t *block /* align 16 */,
+                       const uint8_t *pixels /* align 8 */,
+                       ptrdiff_t line_size); /* copied from the pixblock DSP context by avcodec_dct_init() */
+
+    int bits_per_sample; /* "bits_per_sample" AVOption (default 8); forwarded as bits_per_raw_sample in avcodec_dct_init() */
+} AVDCT;
+
+/**
+ * Allocates an AVDCT context.
+ * This needs to be initialized with avcodec_dct_init() after optionally
+ * configuring it with AVOptions.
+ *
+ * To free it use av_free()
+ */
+AVDCT *avcodec_dct_alloc(void);
+int avcodec_dct_init(AVDCT *);
+
+const AVClass *avcodec_dct_get_class(void);
+
+#endif /* AVCODEC_AVDCT_H */
diff --git a/libavcodec/avfft-test.c b/libavcodec/avfft-test.c
new file mode 100644
index 0000000..c2d4cc4
--- /dev/null
+++ b/libavcodec/avfft-test.c
@@ -0,0 +1,53 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/mem.h"
+#include "avfft.h"
+
+int main(int argc, char **argv)
+{
+    int n;
+#define LEN 1024
+    FFTSample *ref  = av_malloc_array(LEN, sizeof(*ref));
+    FFTSample *data = av_malloc_array(LEN, sizeof(*data));
+    RDFTContext *fwd = av_rdft_init(10, DFT_R2C);  /* 1 << 10 == LEN */
+    RDFTContext *inv = av_rdft_init(10, IDFT_C2R);
+
+    if (!(ref && data && fwd && inv))
+        return 2;
+    for (n = 0; n < LEN; n++)
+        ref[n] = data[n] = n*456 + 123 + n*n; /* same deterministic ramp in both buffers */
+
+    av_rdft_calc(fwd, data); /* forward, then inverse, both in place */
+    av_rdft_calc(inv, data);
+
+    /* once rescaled by 2/LEN, the round trip must match the reference within 1 */
+    for (n = 0; n < LEN; n++)
+        if (fabs(ref[n] - data[n]/LEN*2) > 1) {
+            fprintf(stderr, "Failed at %d (%f %f)\n", n, ref[n], data[n]/LEN*2);
+            return 1;
+        }
+
+    av_rdft_end(fwd);
+    av_rdft_end(inv);
+    av_free(data);
+    av_free(ref);
+
+    return 0;
+}
diff --git a/libavcodec/avfft.c b/libavcodec/avfft.c
index 513f57e..2200f37 100644
--- a/libavcodec/avfft.c
+++ b/libavcodec/avfft.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 
 FFTContext *av_fft_init(int nbits, int inverse)
 {
-    FFTContext *s = av_malloc(sizeof(*s));
+    FFTContext *s = av_mallocz(sizeof(*s));
 
     if (s && ff_fft_init(s, nbits, inverse))
         av_freep(&s);
diff --git a/libavcodec/avfft.h b/libavcodec/avfft.h
index e2e727d..0c0f9b8 100644
--- a/libavcodec/avfft.h
+++ b/libavcodec/avfft.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/avpacket.c b/libavcodec/avpacket.c
index 59fc6f2..8988ca2 100644
--- a/libavcodec/avpacket.c
+++ b/libavcodec/avpacket.c
@@ -2,20 +2,20 @@
  * AVPacket functions for libavcodec
  * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,8 @@
 #include "libavutil/mathematics.h"
 #include "libavutil/mem.h"
 #include "avcodec.h"
+#include "bytestream.h"
+#include "internal.h"
 
 void av_init_packet(AVPacket *pkt)
 {
@@ -108,24 +110,38 @@ int av_grow_packet(AVPacket *pkt, int grow_by)
 {
     int new_size;
     av_assert0((unsigned)pkt->size <= INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE);
-    if (!pkt->size)
-        return av_new_packet(pkt, grow_by);
     if ((unsigned)grow_by >
         INT_MAX - (pkt->size + AV_INPUT_BUFFER_PADDING_SIZE))
         return -1;
 
     new_size = pkt->size + grow_by + AV_INPUT_BUFFER_PADDING_SIZE;
     if (pkt->buf) {
-        int ret = av_buffer_realloc(&pkt->buf, new_size);
-        if (ret < 0)
-            return ret;
+        size_t data_offset;
+        uint8_t *old_data = pkt->data;
+        if (pkt->data == NULL) {
+            data_offset = 0;
+            pkt->data = pkt->buf->data;
+        } else {
+            data_offset = pkt->data - pkt->buf->data;
+            if (data_offset > INT_MAX - new_size)
+                return -1;
+        }
+
+        if (new_size + data_offset > pkt->buf->size) {
+            int ret = av_buffer_realloc(&pkt->buf, new_size + data_offset);
+            if (ret < 0) {
+                pkt->data = old_data;
+                return ret;
+            }
+            pkt->data = pkt->buf->data + data_offset;
+        }
     } else {
         pkt->buf = av_buffer_alloc(new_size);
         if (!pkt->buf)
             return AVERROR(ENOMEM);
-        memcpy(pkt->buf->data, pkt->data, FFMIN(pkt->size, pkt->size + grow_by));
+        memcpy(pkt->buf->data, pkt->data, pkt->size);
+        pkt->data = pkt->buf->data;
     }
-    pkt->data  = pkt->buf->data;
     pkt->size += grow_by;
     memset(pkt->data + pkt->size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
 
@@ -177,32 +193,50 @@ do {                                         \
         dst = data;                                                     \
     } while (0)
 
-int av_dup_packet(AVPacket *pkt)
+/* Makes duplicates of data, side_data, but does not copy any other fields */
+static int copy_packet_data(AVPacket *pkt, const AVPacket *src, int dup)
 {
-    AVPacket tmp_pkt;
-
-    if (!pkt->buf && pkt->data) {
-        tmp_pkt = *pkt;
-
-        pkt->data      = NULL;
-        pkt->side_data = NULL;
-        DUP_DATA(pkt->data, tmp_pkt.data, pkt->size, 1, ALLOC_BUF);
+    pkt->data      = NULL;
+    pkt->side_data = NULL;
+    if (pkt->buf) {
+        AVBufferRef *ref = av_buffer_ref(src->buf);
+        if (!ref)
+            return AVERROR(ENOMEM);
+        pkt->buf  = ref;
+        pkt->data = ref->data;
+    } else {
+        DUP_DATA(pkt->data, src->data, pkt->size, 1, ALLOC_BUF);
+    }
+    if (pkt->side_data_elems && dup)
+        pkt->side_data = src->side_data;
+    if (pkt->side_data_elems && !dup) {
+        return av_copy_packet_side_data(pkt, src);
+    }
+    return 0;
 
-        if (pkt->side_data_elems) {
-            int i;
+failed_alloc:
+    av_packet_unref(pkt);
+    return AVERROR(ENOMEM);
+}
 
-            DUP_DATA(pkt->side_data, tmp_pkt.side_data,
-                     pkt->side_data_elems * sizeof(*pkt->side_data), 0, ALLOC_MALLOC);
+int av_copy_packet_side_data(AVPacket *pkt, const AVPacket *src)
+{
+    if (src->side_data_elems) {
+        int i;
+        DUP_DATA(pkt->side_data, src->side_data,
+                src->side_data_elems * sizeof(*src->side_data), 0, ALLOC_MALLOC);
+        if (src != pkt) {
             memset(pkt->side_data, 0,
-                   pkt->side_data_elems * sizeof(*pkt->side_data));
-            for (i = 0; i < pkt->side_data_elems; i++) {
-                DUP_DATA(pkt->side_data[i].data, tmp_pkt.side_data[i].data,
-                         tmp_pkt.side_data[i].size, 1, ALLOC_MALLOC);
-                pkt->side_data[i].size = tmp_pkt.side_data[i].size;
-                pkt->side_data[i].type = tmp_pkt.side_data[i].type;
-            }
+                   src->side_data_elems * sizeof(*src->side_data));
+        }
+        for (i = 0; i < src->side_data_elems; i++) {
+            DUP_DATA(pkt->side_data[i].data, src->side_data[i].data,
+                    src->side_data[i].size, 1, ALLOC_MALLOC);
+            pkt->side_data[i].size = src->side_data[i].size;
+            pkt->side_data[i].type = src->side_data[i].type;
         }
     }
+    pkt->side_data_elems = src->side_data_elems;
     return 0;
 
 failed_alloc:
@@ -212,11 +246,28 @@ failed_alloc:
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
+int av_dup_packet(AVPacket *pkt)
+{
+    AVPacket snapshot;
+
+    /* packets that already have a buf, or carry no data, are left untouched */
+    if (pkt->buf || !pkt->data)
+        return 0;
+    snapshot = *pkt;
+    return copy_packet_data(pkt, &snapshot, 1);
+}
+
+int av_copy_packet(AVPacket *dst, const AVPacket *src) /* returns whatever copy_packet_data() returns */
+{
+    *dst = *src; /* plain field copy; data and side_data pointers are redone by copy_packet_data() */
+    return copy_packet_data(dst, src, 0); /* dup == 0: side data goes through av_copy_packet_side_data() */
+}
+
 void av_packet_free_side_data(AVPacket *pkt)
 {
     int i;
     for (i = 0; i < pkt->side_data_elems; i++)
-        av_free(pkt->side_data[i].data);
+        av_freep(&pkt->side_data[i].data);
     av_freep(&pkt->side_data);
     pkt->side_data_elems = 0;
 }
@@ -267,7 +318,7 @@ uint8_t *av_packet_new_side_data(AVPacket *pkt, enum AVPacketSideDataType type,
 
     if ((unsigned)size > INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE)
         return NULL;
-    data = av_malloc(size + AV_INPUT_BUFFER_PADDING_SIZE);
+    data = av_mallocz(size + AV_INPUT_BUFFER_PADDING_SIZE);
     if (!data)
         return NULL;
 
@@ -295,6 +346,172 @@ uint8_t *av_packet_get_side_data(AVPacket *pkt, enum AVPacketSideDataType type,
     return NULL;
 }
 
+const char *av_packet_side_data_name(enum AVPacketSideDataType type)
+{
+    switch(type) { /* maps a side data type to a human-readable name */
+    case AV_PKT_DATA_PALETTE:                    return "Palette";
+    case AV_PKT_DATA_NEW_EXTRADATA:              return "New Extradata";
+    case AV_PKT_DATA_PARAM_CHANGE:               return "Param Change";
+    case AV_PKT_DATA_H263_MB_INFO:               return "H263 MB Info";
+    case AV_PKT_DATA_REPLAYGAIN:                 return "Replay Gain";
+    case AV_PKT_DATA_DISPLAYMATRIX:              return "Display Matrix";
+    case AV_PKT_DATA_STEREO3D:                   return "Stereo 3D";
+    case AV_PKT_DATA_AUDIO_SERVICE_TYPE:         return "Audio Service Type";
+    case AV_PKT_DATA_SKIP_SAMPLES:               return "Skip Samples";
+    case AV_PKT_DATA_JP_DUALMONO:                return "JP Dual Mono";
+    case AV_PKT_DATA_STRINGS_METADATA:           return "Strings Metadata";
+    case AV_PKT_DATA_SUBTITLE_POSITION:          return "Subtitle Position";
+    case AV_PKT_DATA_MATROSKA_BLOCKADDITIONAL:   return "Matroska BlockAdditional";
+    case AV_PKT_DATA_WEBVTT_IDENTIFIER:          return "WebVTT ID";
+    case AV_PKT_DATA_WEBVTT_SETTINGS:            return "WebVTT Settings";
+    case AV_PKT_DATA_METADATA_UPDATE:            return "Metadata Update";
+    case AV_PKT_DATA_MPEGTS_STREAM_ID:           return "MPEGTS Stream ID";
+    case AV_PKT_DATA_MASTERING_DISPLAY_METADATA: return "Mastering display metadata";
+    }
+    return NULL; /* NOTE(review): types without a case get NULL, e.g. AV_PKT_DATA_QUALITY_STATS used by ff_side_data_set_encoder_stats() - confirm that is intended */
+}
+
+#define FF_MERGE_MARKER 0x8c4d9d108e25e9feULL
+
+int av_packet_merge_side_data(AVPacket *pkt){ /* folds all side data into the payload; returns 1 if merged, 0 if there was none, <0 on error */
+    if(pkt->side_data_elems){
+        AVBufferRef *buf;
+        int i;
+        uint8_t *p;
+        uint64_t size= pkt->size + 8LL + AV_INPUT_BUFFER_PADDING_SIZE; /* payload + 8-byte marker + padding */
+        AVPacket old= *pkt;
+        for (i=0; i<old.side_data_elems; i++) {
+            size += old.side_data[i].size + 5LL; /* element bytes + 4-byte size + 1-byte type */
+        }
+        if (size > INT_MAX)
+            return AVERROR(EINVAL);
+        buf = av_buffer_alloc(size);
+        if (!buf)
+            return AVERROR(ENOMEM);
+        pkt->buf = buf;
+        pkt->data = p = buf->data;
+        pkt->size = size - AV_INPUT_BUFFER_PADDING_SIZE;
+        bytestream_put_buffer(&p, old.data, old.size); /* original payload comes first */
+        for (i=old.side_data_elems-1; i>=0; i--) { /* elements are appended last to first */
+            bytestream_put_buffer(&p, old.side_data[i].data, old.side_data[i].size);
+            bytestream_put_be32(&p, old.side_data[i].size);
+            *p++ = old.side_data[i].type | ((i==old.side_data_elems-1)*128); /* bit 7 flags the first record written */
+        }
+        bytestream_put_be64(&p, FF_MERGE_MARKER); /* trailer that av_packet_split_side_data() looks for */
+        av_assert0(p-pkt->data == pkt->size);
+        memset(p, 0, AV_INPUT_BUFFER_PADDING_SIZE);
+        av_packet_unref(&old); /* releases the old buffer and side data through the saved copy */
+        pkt->side_data_elems = 0;
+        pkt->side_data = NULL;
+        return 1;
+    }
+    return 0;
+}
+
+int av_packet_split_side_data(AVPacket *pkt){ /* undoes av_packet_merge_side_data(); returns 1 if split, 0 if nothing valid to split, <0 on error */
+    if (!pkt->side_data_elems && pkt->size >12 && AV_RB64(pkt->data + pkt->size - 8) == FF_MERGE_MARKER){
+        int i;
+        unsigned int size;
+        uint8_t *p;
+
+        p = pkt->data + pkt->size - 8 - 5; /* size+type trailer of the record right before the marker */
+        for (i=1; ; i++){ /* pass 1: validate the chain of records and count them */
+            size = AV_RB32(p);
+            if (size>INT_MAX - 5 || p - pkt->data < size)
+                return 0;
+            if (p[4]&128) /* bit 7 of the type byte ends the chain */
+                break;
+            if (p - pkt->data < size + 5)
+                return 0;
+            p-= size+5;
+        }
+
+        pkt->side_data = av_malloc_array(i, sizeof(*pkt->side_data));
+        if (!pkt->side_data)
+            return AVERROR(ENOMEM);
+
+        p= pkt->data + pkt->size - 8 - 5;
+        for (i=0; ; i++){ /* pass 2: copy each record into its own padded buffer */
+            size= AV_RB32(p);
+            av_assert0(size<=INT_MAX - 5 && p - pkt->data >= size);
+            pkt->side_data[i].data = av_mallocz(size + AV_INPUT_BUFFER_PADDING_SIZE);
+            pkt->side_data[i].size = size;
+            pkt->side_data[i].type = p[4]&127;
+            pkt->side_data_elems   = i+1; /* count entries as they are filled: on the failure below, freeing the packet's side data releases everything instead of leaking it */
+            if (!pkt->side_data[i].data)
+                return AVERROR(ENOMEM); /* packet is only partly split here; the caller is expected to discard it */
+            memcpy(pkt->side_data[i].data, p-size, size);
+            pkt->size -= size + 5;
+            if(p[4]&128)
+                break;
+            p-= size+5;
+        }
+        pkt->size -= 8; /* drop the marker */
+        return 1;
+    }
+    return 0;
+}
+
+uint8_t *av_packet_pack_dictionary(AVDictionary *dict, int *size)
+{
+    AVDictionaryEntry *entry = NULL;
+    uint8_t *packed = NULL;
+    *size = 0;
+
+    if (!dict)
+        return NULL;
+
+    /* an empty key combined with AV_DICT_IGNORE_SUFFIX visits every entry */
+    while ((entry = av_dict_get(dict, "", entry, AV_DICT_IGNORE_SUFFIX))) {
+        const size_t key_bytes = strlen(entry->key) + 1;   /* incl. NUL */
+        const size_t val_bytes = strlen(entry->value) + 1; /* incl. NUL */
+        const size_t total     = *size + key_bytes + val_bytes;
+        uint8_t *const grown   = av_realloc(packed, total);
+
+        if (!grown)
+            goto fail;
+        packed = grown;
+        if (total > INT_MAX) /* the result length has to fit the int *size */
+            goto fail;
+
+        memcpy(packed + *size, entry->key, key_bytes);
+        memcpy(packed + *size + key_bytes, entry->value, val_bytes);
+        *size = total;
+    }
+
+    return packed;
+
+fail:
+    av_freep(&packed);
+    *size = 0;
+    return NULL;
+}
+
+int av_packet_unpack_dictionary(const uint8_t *data, int size, AVDictionary **dict)
+{ /* parses a "key\0value\0..." blob into *dict; returns 0 (also when there is nothing to do) or a negative error code */
+    const uint8_t *end;
+    int ret = 0;
+
+    if (!dict || !data || !size) /* nothing to unpack */
+        return ret;
+    if (size < 0 || data[size - 1]) /* the final NUL is what keeps the strlen() calls below inside the buffer */
+        return AVERROR_INVALIDDATA;
+    end = data + size; /* formed only once data and size are known to be valid */
+    while (data < end) {
+        const uint8_t *key = data;
+        const uint8_t *val = data + strlen(key) + 1;
+
+        if (val >= end) /* key without a value */
+            return AVERROR_INVALIDDATA;
+        ret = av_dict_set(dict, key, val, 0);
+        if (ret < 0)
+            break;
+        data = val + strlen(val) + 1;
+    }
+
+    return ret;
+}
+
 int av_packet_shrink_side_data(AVPacket *pkt, enum AVPacketSideDataType type,
                                int size)
 {
@@ -352,7 +569,7 @@ void av_packet_unref(AVPacket *pkt)
     pkt->size = 0;
 }
 
-int av_packet_ref(AVPacket *dst, AVPacket *src)
+int av_packet_ref(AVPacket *dst, const AVPacket *src)
 {
     int ret;
 
@@ -365,16 +582,18 @@ int av_packet_ref(AVPacket *dst, AVPacket *src)
         if (ret < 0)
             goto fail;
         memcpy(dst->buf->data, src->data, src->size);
+        dst->data = dst->buf->data;
     } else {
         dst->buf = av_buffer_ref(src->buf);
         if (!dst->buf) {
             ret = AVERROR(ENOMEM);
             goto fail;
         }
+        dst->data = src->data;
     }
 
     dst->size = src->size;
-    dst->data = dst->buf->data;
+
     return 0;
 fail:
     av_packet_free_side_data(dst);
@@ -417,3 +636,28 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 }
+
+int ff_side_data_set_encoder_stats(AVPacket *pkt, int quality, int64_t *error, int error_count, int pict_type)
+{
+    /* bytes written: quality (LE32) at 0, pict_type at 4, error_count at 5, errors (LE64 each) from 8; 6-7 are not touched here */
+    const int needed = 4+4+8*error_count;
+    uint8_t *stats;
+    int stats_size;
+    int n;
+
+    stats = av_packet_get_side_data(pkt, AV_PKT_DATA_QUALITY_STATS, &stats_size);
+    if (!stats) {
+        stats_size = needed;
+        stats      = av_packet_new_side_data(pkt, AV_PKT_DATA_QUALITY_STATS, stats_size);
+    }
+
+    if (!stats || stats_size < needed) /* allocation failed, or an existing entry is too small */
+        return AVERROR(ENOMEM);
+
+    AV_WL32(stats, quality);
+    stats[4] = pict_type;
+    stats[5] = error_count;
+    for (n = 0; n < error_count; n++)
+        AV_WL64(stats + 8 + 8*n, error[n]);
+    return 0;
+}
diff --git a/libavcodec/avpicture.c b/libavcodec/avpicture.c
index 786d740..56435f4 100644
--- a/libavcodec/avpicture.c
+++ b/libavcodec/avpicture.c
@@ -2,20 +2,20 @@
  * AVPicture management routines
  * Copyright (c) 2001, 2002, 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,19 +34,18 @@
 
 #if FF_API_AVPICTURE
 FF_DISABLE_DEPRECATION_WARNINGS
-int avpicture_fill(AVPicture *picture, uint8_t *ptr,
+int avpicture_fill(AVPicture *picture, const uint8_t *ptr,
                    enum AVPixelFormat pix_fmt, int width, int height)
 {
     return av_image_fill_arrays(picture->data, picture->linesize,
                                 ptr, pix_fmt, width, height, 1);
 }
 
-int avpicture_layout(const AVPicture* src, enum AVPixelFormat pix_fmt,
-                     int width, int height,
+int avpicture_layout(const AVPicture* src, enum AVPixelFormat pix_fmt, int width, int height,
                      unsigned char *dest, int dest_size)
 {
     return av_image_copy_to_buffer(dest, dest_size,
-                                   src->data, src->linesize,
+                                   (const uint8_t * const*)src->data, src->linesize,
                                    pix_fmt, width, height, 1);
 }
 
@@ -70,13 +69,13 @@ int avpicture_alloc(AVPicture *picture,
 
 void avpicture_free(AVPicture *picture)
 {
-    av_free(picture->data[0]);
+    av_freep(&picture->data[0]);
 }
 
 void av_picture_copy(AVPicture *dst, const AVPicture *src,
                      enum AVPixelFormat pix_fmt, int width, int height)
 {
-    av_image_copy(dst->data, dst->linesize, src->data,
+    av_image_copy(dst->data, dst->linesize, (const uint8_t **)src->data,
                   src->linesize, pix_fmt, width, height);
 }
 FF_ENABLE_DEPRECATION_WARNINGS
diff --git a/libavcodec/avr32/mathops.h b/libavcodec/avr32/mathops.h
index 528b7ad..85f42b5 100644
--- a/libavcodec/avr32/mathops.h
+++ b/libavcodec/avr32/mathops.h
@@ -2,20 +2,20 @@
  * Simple math operations
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/avrndec.c b/libavcodec/avrndec.c
new file mode 100644
index 0000000..695c491
--- /dev/null
+++ b/libavcodec/avrndec.c
@@ -0,0 +1,173 @@
+/*
+ * AVRn decoder
+ * Copyright (c) 2012 Michael Niedermayer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+#include "mjpeg.h"
+#include "mjpegdec.h"
+#include "libavutil/imgutils.h"
+
+typedef struct { // private decoder state, reached through avctx->priv_data
+    AVCodecContext *mjpeg_avctx; // nested MJPEG decoder context; only allocated by init() when is_mjpeg is set
+    int is_mjpeg; // nonzero unless extradata holds "1:1" at byte offset 28 (computed in init())
+    int interlace; // raw mode: set when extradata holds "1:1(" at offset extradata[4] + 4; FIXME use frame.interlaced_frame
+    int tff; // raw interlaced mode: when nonzero the first stored row of each pair is written to output row y + 1
+} AVRnContext;
+
+static av_cold int init(AVCodecContext *avctx) // .init callback: pick MJPEG-wrapper or raw UYVY422 mode from extradata; 0 or negative AVERROR
+{
+    AVRnContext *a = avctx->priv_data;
+    int ret;
+
+    // "1:1" at extradata offset 28 selects raw video ("Resolution 1:1" for Avid AVI Codec); anything else is MJPEG
+    a->is_mjpeg = avctx->extradata_size < 31 || memcmp(&avctx->extradata[28], "1:1", 3);
+
+    if(!a->is_mjpeg && avctx->lowres) {
+        av_log(avctx, AV_LOG_ERROR, "lowres is not possible with rawvideo\n");
+        return AVERROR(EINVAL);
+    }
+
+    if(a->is_mjpeg) { // wrapper mode: open a private MJPEG decoder and return its open result
+        AVCodec *codec = avcodec_find_decoder(AV_CODEC_ID_MJPEG);
+        AVDictionary *thread_opt = NULL;
+        if (!codec) {
+            av_log(avctx, AV_LOG_ERROR, "MJPEG codec not found\n");
+            return AVERROR_DECODER_NOT_FOUND;
+        }
+
+        if (!(a->mjpeg_avctx = avcodec_alloc_context3(codec)))
+            return AVERROR(ENOMEM); // allocation failed; the field assignments below would dereference NULL
+        av_dict_set(&thread_opt, "threads", "1", 0); // Is this needed ?
+        a->mjpeg_avctx->refcounted_frames = 1;
+        a->mjpeg_avctx->flags = avctx->flags; // copy the settings the nested decoder should share with the outer context
+        a->mjpeg_avctx->idct_algo = avctx->idct_algo;
+        a->mjpeg_avctx->lowres = avctx->lowres;
+        a->mjpeg_avctx->width = avctx->width;
+        a->mjpeg_avctx->height = avctx->height;
+
+        if ((ret = ff_codec_open2_recursive(a->mjpeg_avctx, codec, &thread_opt)) < 0) {
+            av_log(avctx, AV_LOG_ERROR, "MJPEG codec failed to open\n");
+        }
+        av_dict_free(&thread_opt); // released on both the success and the failure path
+
+        return ret;
+    }
+
+    if ((ret = av_image_check_size(avctx->width, avctx->height, 0, avctx)) < 0)
+        return ret;
+
+    avctx->pix_fmt = AV_PIX_FMT_UYVY422; // raw mode output format
+
+    if(avctx->extradata_size >= 9 && avctx->extradata[4]+28 < avctx->extradata_size) { // bound covers the read at ndx + 24 below
+        int ndx = avctx->extradata[4] + 4;
+        a->interlace = !memcmp(avctx->extradata + ndx, "1:1(", 4);
+        if(a->interlace) {
+            a->tff = avctx->extradata[ndx + 24] == 1;
+        }
+    }
+
+    return 0;
+}
+
+static av_cold int end(AVCodecContext *avctx) // .close callback: tear down the nested MJPEG context; always returns 0
+{
+    AVRnContext *a = avctx->priv_data;
+
+    avcodec_close(a->mjpeg_avctx); // mjpeg_avctx is only assigned in init()'s MJPEG branch
+    av_freep(&a->mjpeg_avctx); // free the context and reset the pointer
+
+    return 0;
+}
+
+static int decode_frame(AVCodecContext *avctx, void *data, // .decode callback; data is used as the output AVFrame
+                        int *got_frame, AVPacket *avpkt)
+{
+    AVRnContext *a = avctx->priv_data;
+    AVFrame *p = data;
+    const uint8_t *buf = avpkt->data;
+    int buf_size       = avpkt->size;
+    int y, ret, true_height;
+
+    if(a->is_mjpeg) { // wrapper mode: the nested MJPEG decoder does the decoding and its result is returned
+        ret = avcodec_decode_video2(a->mjpeg_avctx, data, got_frame, avpkt);
+
+        if (ret >= 0 && *got_frame && avctx->width <= p->width && avctx->height <= p->height) {
+            int shift = p->height - avctx->height; // rows the decoded picture has beyond avctx->height
+            int subsample_h, subsample_v;
+
+            av_pix_fmt_get_chroma_sub_sample(p->format, &subsample_h, &subsample_v);
+
+            p->data[0] += p->linesize[0] * shift; // advance past the surplus rows at the top of plane 0
+            if (p->data[2]) {
+                p->data[1] += p->linesize[1] * (shift>>subsample_v); // same for planes 1 and 2, scaled by vertical subsampling
+                p->data[2] += p->linesize[2] * (shift>>subsample_v);
+            }
+
+            p->width  = avctx->width; // the returned frame reports the avctx dimensions
+            p->height = avctx->height;
+        }
+        avctx->pix_fmt = a->mjpeg_avctx->pix_fmt; // mirror the format reported by the nested decoder
+        return ret;
+    }
+
+    true_height    = buf_size / (2*avctx->width); // whole rows of 2*width bytes contained in the packet
+
+    if(buf_size < 2*avctx->width * avctx->height) {
+        av_log(avctx, AV_LOG_ERROR, "packet too small\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
+        return ret;
+    p->pict_type= AV_PICTURE_TYPE_I;
+    p->key_frame= 1;
+
+    if(a->interlace) {
+        buf += (true_height - avctx->height)*avctx->width; // NOTE(review): half the byte offset of the progressive branch, presumably one field's share of the surplus rows - confirm
+        for(y = 0; y < avctx->height-1; y+=2) { // fills rows y+tff and y+!tff; the second source lies width*true_height + 4 bytes further into the packet
+            memcpy(p->data[0] + (y+ a->tff)*p->linesize[0], buf                             , 2*avctx->width);
+            memcpy(p->data[0] + (y+!a->tff)*p->linesize[0], buf + avctx->width*true_height+4, 2*avctx->width);
+            buf += 2*avctx->width;
+        }
+    } else {
+        buf += (true_height - avctx->height)*avctx->width*2; // skip the surplus rows at the start of the packet
+        for(y = 0; y < avctx->height; y++) {
+            memcpy(p->data[0] + y*p->linesize[0], buf, 2*avctx->width);
+            buf += 2*avctx->width;
+        }
+    }
+
+    *got_frame      = 1;
+    return buf_size; // raw mode reports the whole packet as consumed
+}
+
+AVCodec ff_avrn_decoder = { // codec descriptor wiring the AVRn callbacks defined above
+    .name           = "avrn",
+    .long_name      = NULL_IF_CONFIG_SMALL("Avid AVI Codec"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_AVRN,
+    .priv_data_size = sizeof(AVRnContext),
+    .init           = init,
+    .close          = end,
+    .decode         = decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .max_lowres     = 3, // init() forwards lowres to the MJPEG context and rejects it in raw mode
+};
diff --git a/libavcodec/avs.c b/libavcodec/avs.c
index 0d127f8..345d628 100644
--- a/libavcodec/avs.c
+++ b/libavcodec/avs.c
@@ -2,20 +2,20 @@
  * AVS video decoder.
  * Copyright (c) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -57,12 +57,10 @@ avs_decode_frame(AVCodecContext * avctx,
     int i, j, x, y, stride, ret, vect_w = 3, vect_h = 3;
     AvsVideoSubType sub_type;
     AvsBlockType type;
-    GetBitContext change_map;
+    GetBitContext change_map = {0}; //init to silence warning
 
-    if ((ret = ff_reget_buffer(avctx, p)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, p)) < 0)
         return ret;
-    }
     p->pict_type = AV_PICTURE_TYPE_P;
     p->key_frame = 0;
 
@@ -84,8 +82,10 @@ avs_decode_frame(AVCodecContext * avctx,
         if (first >= 256 || last > 256 || buf_end - buf < 4 + 4 + 3 * (last - first))
             return AVERROR_INVALIDDATA;
         buf += 4;
-        for (i=first; i<last; i++, buf+=3)
+        for (i=first; i<last; i++, buf+=3) {
             pal[i] = (buf[0] << 18) | (buf[1] << 10) | (buf[2] << 2);
+            pal[i] |= 0xFFU << 24 | (pal[i] >> 6) & 0x30303;
+        }
 
         sub_type = buf[0];
         type = buf[1];
diff --git a/libavcodec/avuidec.c b/libavcodec/avuidec.c
new file mode 100644
index 0000000..5117844
--- /dev/null
+++ b/libavcodec/avuidec.c
@@ -0,0 +1,130 @@
+/*
+ * AVID Meridien decoder
+ *
+ * Copyright (c) 2012 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+#include "libavutil/intreadwrite.h"
+
+static av_cold int avui_decode_init(AVCodecContext *avctx) /* .init callback: only fixes the output format; always returns 0 */
+{
+    avctx->pix_fmt = AV_PIX_FMT_YUVA422P; /* avui_decode_frame writes four planes: data[0..3] as y, u, v, a */
+    return 0;
+}
+
+static int avui_decode_frame(AVCodecContext *avctx, void *data, /* .decode callback; data is used as the output AVFrame */
+                             int *got_frame, AVPacket *avpkt)
+{
+    int ret;
+    AVFrame *pic = data;
+    const uint8_t *src = avpkt->data, *extradata = avctx->extradata;
+    const uint8_t *srca; /* read cursor for the alpha bytes; only dereferenced when transparent is set */
+    uint8_t *y, *u, *v, *a;
+    int transparent, interlaced = 1, skip, opaque_length, i, j, k; /* interlaced stays 1 unless an APRG atom says otherwise */
+    uint32_t extradata_size = avctx->extradata_size;
+
+    while (extradata_size >= 24) { /* walk size-prefixed atoms looking for "APRGAPRG0001" */
+        uint32_t atom_size = AV_RB32(extradata);
+        if (!memcmp(&extradata[4], "APRGAPRG0001", 12)) {
+            interlaced = extradata[19] != 1; /* byte 19 of the atom: 1 selects progressive, anything else interlaced */
+            break;
+        }
+        if (atom_size && atom_size <= extradata_size) {
+            extradata      += atom_size;
+            extradata_size -= atom_size;
+        } else { /* zero or oversized atom size: stop scanning and keep the default */
+            break;
+        }
+    }
+    if (avctx->height == 486) { /* skip scales the width * skip byte offsets applied below */
+        skip = 10;
+    } else {
+        skip = 16;
+    }
+    opaque_length = 2 * avctx->width * (avctx->height + skip) + 4 * interlaced; /* minimum packet size accepted */
+    if (avpkt->size < opaque_length) {
+        av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
+        return AVERROR(EINVAL);
+    }
+    transparent = avctx->bits_per_coded_sample == 32 &&
+                  avpkt->size >= opaque_length * 2 + 4; /* alpha is read only when the packet is large enough to hold it */
+    srca = src + opaque_length + 5;
+
+    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
+        return ret;
+
+    pic->key_frame = 1;
+    pic->pict_type = AV_PICTURE_TYPE_I;
+
+    if (!interlaced) { /* progressive: an extra width * skip bytes up front, on top of the per-pass skip in the loop */
+        src  += avctx->width * skip;
+        srca += avctx->width * skip;
+    }
+
+    for (i = 0; i < interlaced + 1; i++) { /* one pass per field: two when interlaced, one otherwise */
+        src  += avctx->width * skip;
+        srca += avctx->width * skip;
+        if (interlaced && avctx->height == 486) { /* 486-line interlaced: first stored field starts at row 1, second at row 0 */
+            y = pic->data[0] + (1 - i) * pic->linesize[0];
+            u = pic->data[1] + (1 - i) * pic->linesize[1];
+            v = pic->data[2] + (1 - i) * pic->linesize[2];
+            a = pic->data[3] + (1 - i) * pic->linesize[3];
+        } else { /* otherwise pass i starts at row i */
+            y = pic->data[0] + i * pic->linesize[0];
+            u = pic->data[1] + i * pic->linesize[1];
+            v = pic->data[2] + i * pic->linesize[2];
+            a = pic->data[3] + i * pic->linesize[3];
+        }
+
+        for (j = 0; j < avctx->height >> interlaced; j++) {
+            for (k = 0; k < avctx->width >> 1; k++) { /* input order per pixel pair: U, Y0, V, Y1 */
+                u[    k    ] = *src++;
+                y[2 * k    ] = *src++;
+                a[2 * k    ] = 0xFF - (transparent ? *srca++ : 0); /* output alpha is 0xFF minus the stored byte; 0xFF when none is read */
+                srca++;
+                v[    k    ] = *src++;
+                y[2 * k + 1] = *src++;
+                a[2 * k + 1] = 0xFF - (transparent ? *srca++ : 0);
+                srca++;
+            }
+
+            y += (interlaced + 1) * pic->linesize[0]; /* step two output rows per line when interlaced */
+            u += (interlaced + 1) * pic->linesize[1];
+            v += (interlaced + 1) * pic->linesize[2];
+            a += (interlaced + 1) * pic->linesize[3];
+        }
+        src  += 4; /* four bytes are skipped after each pass */
+        srca += 4;
+    }
+    *got_frame       = 1;
+
+    return avpkt->size; /* the whole packet is reported as consumed */
+}
+
+AVCodec ff_avui_decoder = { /* codec descriptor; no private context or close callback is registered */
+    .name         = "avui",
+    .long_name    = NULL_IF_CONFIG_SMALL("Avid Meridien Uncompressed"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_AVUI,
+    .init         = avui_decode_init,
+    .decode       = avui_decode_frame,
+    .capabilities = AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/avuienc.c b/libavcodec/avuienc.c
new file mode 100644
index 0000000..b219906
--- /dev/null
+++ b/libavcodec/avuienc.c
@@ -0,0 +1,103 @@
+/*
+ * AVID Meridien encoder
+ *
+ * Copyright (c) 2012 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+#include "libavutil/intreadwrite.h"
+
+static av_cold int avui_encode_init(AVCodecContext *avctx) /* .init callback: validate the size and build the 144-byte extradata */
+{
+    if (avctx->width != 720 || avctx->height != 486 && avctx->height != 576) { /* && binds tighter than ||: width must be 720 and height 486 or 576 */
+        av_log(avctx, AV_LOG_ERROR, "Only 720x486 and 720x576 are supported.\n");
+        return AVERROR(EINVAL);
+    }
+    if (!(avctx->extradata = av_mallocz(144 + AV_INPUT_BUFFER_PADDING_SIZE)))
+        return AVERROR(ENOMEM);
+    avctx->extradata_size = 144;
+    memcpy(avctx->extradata, "\0\0\0\x18""APRGAPRG0001", 16); /* literal is split so the 'A' is not parsed as part of the \x18 escape */
+    if (avctx->field_order > AV_FIELD_PROGRESSIVE) { /* byte 19: 2 when the field order is above PROGRESSIVE, 1 otherwise */
+        avctx->extradata[19] = 2;
+    } else {
+        avctx->extradata[19] = 1;
+    }
+    memcpy(avctx->extradata + 24, "\0\0\0\x78""ARESARES0001""\0\0\0\x98", 20); /* second atom starts at offset 24 (0x18) */
+    AV_WB32(avctx->extradata + 44, avctx->width); /* big-endian width at offset 44, height at 48 */
+    AV_WB32(avctx->extradata + 48, avctx->height);
+    memcpy(avctx->extradata + 52, "\0\0\0\x1\0\0\0\x20\0\0\0\x2", 12); /* remaining bytes up to 144 stay zero from av_mallocz */
+
+
+    return 0;
+}
+
+static int avui_encode_frame(AVCodecContext *avctx, AVPacket *pkt, /* .encode2 callback: pack one picture into pkt; 0 or negative AVERROR */
+                             const AVFrame *pic, int *got_packet)
+{
+    uint8_t *dst;
+    int i, j, skip, ret, size, interlaced;
+
+    interlaced = avctx->field_order > AV_FIELD_PROGRESSIVE;
+
+    if (avctx->height == 486) { /* skip scales the zeroed width * skip regions written below */
+        skip = 10;
+    } else {
+        skip = 16;
+    }
+    size = 2 * avctx->width * (avctx->height + skip) + 8 * interlaced; /* total packet size; interlaced adds 8 bytes */
+    if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
+        return ret;
+    dst = pkt->data;
+    if (!interlaced) { /* progressive: an extra zeroed width * skip bytes before the single pass */
+        memset(dst, 0, avctx->width * skip);
+        dst += avctx->width * skip;
+    }
+
+    for (i = 0; i <= interlaced; i++) { /* one pass per field: two when interlaced, one otherwise */
+        uint8_t *src;
+        if (interlaced && avctx->height == 486) { /* 486-line interlaced: row 1 is stored first, then row 0 */
+            src = pic->data[0] + (1 - i) * pic->linesize[0];
+        } else {
+            src = pic->data[0] + i * pic->linesize[0];
+        }
+        memset(dst, 0, avctx->width * skip + 4 * i); /* zero gap before each field; 4 bytes longer before the second */
+        dst += avctx->width * skip + 4 * i;
+        for (j = 0; j < avctx->height; j += interlaced + 1) {
+            memcpy(dst, src, avctx->width * 2); /* one row of packed 4:2:2 is 2 * width bytes */
+            src += (interlaced + 1) * pic->linesize[0];
+            dst += avctx->width * 2;
+        }
+    }
+    memset(dst, 0, pkt->data + size - dst); /* interlaced packets end 4 bytes past the last row: zero them instead of emitting uninitialized memory (length is 0 when progressive) */
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+    return 0;
+}
+
+AVCodec ff_avui_encoder = { /* codec descriptor; no private context or close callback is registered */
+    .name         = "avui",
+    .long_name    = NULL_IF_CONFIG_SMALL("Avid Meridien Uncompressed"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_AVUI,
+    .init         = avui_encode_init,
+    .encode2      = avui_encode_frame,
+    .capabilities = AV_CODEC_CAP_EXPERIMENTAL | AV_CODEC_CAP_INTRA_ONLY,
+    .pix_fmts     = (const enum AVPixelFormat[]){ AV_PIX_FMT_UYVY422, AV_PIX_FMT_NONE }, /* UYVY422 is the only accepted input format */
+};
diff --git a/libavcodec/bethsoftvideo.c b/libavcodec/bethsoftvideo.c
index 11e2cfa..97b745d 100644
--- a/libavcodec/bethsoftvideo.c
+++ b/libavcodec/bethsoftvideo.c
@@ -2,20 +2,20 @@
  * Bethesda VID video decoder
  * Copyright (C) 2007 Nicholas Tung
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -59,7 +59,8 @@ static int set_palette(BethsoftvidContext *ctx)
         return AVERROR_INVALIDDATA;
 
     for(a = 0; a < 256; a++){
-        palette[a] = bytestream2_get_be24u(&ctx->g) * 4;
+        palette[a] = 0xFFU << 24 | bytestream2_get_be24u(&ctx->g) * 4;
+        palette[a] |= palette[a] >> 6 & 0x30303;
     }
     ctx->frame->palette_has_changed = 1;
     return 0;
@@ -78,10 +79,8 @@ static int bethsoftvid_decode_frame(AVCodecContext *avctx,
     int code, ret;
     int yoffset;
 
-    if ((ret = ff_reget_buffer(avctx, vid->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, vid->frame)) < 0)
         return ret;
-    }
     wrap_to_next_line = vid->frame->linesize[0] - avctx->width;
 
     if (avpkt->side_data_elems > 0 &&
diff --git a/libavcodec/bethsoftvideo.h b/libavcodec/bethsoftvideo.h
index 5cbbdfd..d5b5d0a 100644
--- a/libavcodec/bethsoftvideo.h
+++ b/libavcodec/bethsoftvideo.h
@@ -2,20 +2,20 @@
  * Bethesda VID video decoder
  * Copyright (C) 2007 Nicholas Tung
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/bfi.c b/libavcodec/bfi.c
index 8335e9d..6727629 100644
--- a/libavcodec/bfi.c
+++ b/libavcodec/bfi.c
@@ -2,20 +2,20 @@
  * Brute Force & Ignorance (BFI) video decoder
  * Copyright (c) 2008 Sisir Koppaka
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,6 +34,7 @@
 typedef struct BFIContext {
     AVCodecContext *avctx;
     uint8_t *dst;
+    uint32_t pal[256];
 } BFIContext;
 
 static av_cold int bfi_decode_init(AVCodecContext *avctx)
@@ -41,6 +42,8 @@ static av_cold int bfi_decode_init(AVCodecContext *avctx)
     BFIContext *bfi = avctx->priv_data;
     avctx->pix_fmt  = AV_PIX_FMT_PAL8;
     bfi->dst        = av_mallocz(avctx->width * avctx->height);
+    if (!bfi->dst)
+        return AVERROR(ENOMEM);
     return 0;
 }
 
@@ -57,10 +60,8 @@ static int bfi_decode_frame(AVCodecContext *avctx, void *data,
     uint32_t *pal;
     int i, j, ret, height = avctx->height;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     bytestream2_init(&g, avpkt->data, buf_size);
 
@@ -76,16 +77,19 @@ static int bfi_decode_frame(AVCodecContext *avctx, void *data,
         pal = (uint32_t *)frame->data[1];
         for (i = 0; i < avctx->extradata_size / 3; i++) {
             int shift = 16;
-            *pal = 0;
+            *pal = 0xFFU << 24;
             for (j = 0; j < 3; j++, shift -= 8)
                 *pal += ((avctx->extradata[i * 3 + j] << 2) |
                          (avctx->extradata[i * 3 + j] >> 4)) << shift;
             pal++;
         }
+        memcpy(bfi->pal, frame->data[1], sizeof(bfi->pal));
         frame->palette_has_changed = 1;
     } else {
         frame->pict_type = AV_PICTURE_TYPE_P;
         frame->key_frame = 0;
+        frame->palette_has_changed = 0;
+        memcpy(frame->data[1], bfi->pal, sizeof(bfi->pal));
     }
 
     bytestream2_skip(&g, 4); // Unpacked size, not required.
@@ -167,7 +171,7 @@ static int bfi_decode_frame(AVCodecContext *avctx, void *data,
 static av_cold int bfi_decode_close(AVCodecContext *avctx)
 {
     BFIContext *bfi = avctx->priv_data;
-    av_free(bfi->dst);
+    av_freep(&bfi->dst);
     return 0;
 }
 
diff --git a/libavcodec/bfin/README b/libavcodec/bfin/README
new file mode 100644
index 0000000..afb3461
--- /dev/null
+++ b/libavcodec/bfin/README
@@ -0,0 +1,6 @@
+BFIN optimizations have been removed in
+commit 880e2aa23645ed9871c66ee1cbd00f93c72d2d73
+The last revision with the optimizations is fa4e17c14035ebf43130fb369e1728cdd98d0b72
+
+If you want to maintain these (or other) BFIN optimizations in ffmpeg, then please
+contact ffmpeg-devel@ffmpeg.org
diff --git a/libavcodec/bgmc.c b/libavcodec/bgmc.c
index ad8baae..1a6817b 100644
--- a/libavcodec/bgmc.c
+++ b/libavcodec/bgmc.c
@@ -1,28 +1,28 @@
 /*
  * Block Gilbert-Moore decoder
- * Copyright (c) 2010 Thilo Borgmann <thilo.borgmann _at_ googlemail.com>
+ * Copyright (c) 2010 Thilo Borgmann <thilo.borgmann _at_ mail.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
  * Block Gilbert-Moore decoder as used by MPEG-4 ALS
- * @author Thilo Borgmann <thilo.borgmann _at_ googlemail.com>
+ * @author Thilo Borgmann <thilo.borgmann _at_ mail.de>
  */
 
 #include "libavutil/attributes.h"
diff --git a/libavcodec/bgmc.h b/libavcodec/bgmc.h
index 3d5b490..4893736 100644
--- a/libavcodec/bgmc.h
+++ b/libavcodec/bgmc.h
@@ -1,28 +1,28 @@
 /*
  * Block Gilbert-Moore decoder
- * Copyright (c) 2010 Thilo Borgmann <thilo.borgmann _at_ googlemail.com>
+ * Copyright (c) 2010 Thilo Borgmann <thilo.borgmann _at_ mail.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
  * Block Gilbert-Moore decoder header
- * @author Thilo Borgmann <thilo.borgmann _at_ googlemail.com>
+ * @author Thilo Borgmann <thilo.borgmann _at_ mail.de>
  */
 
 
diff --git a/libavcodec/bink.c b/libavcodec/bink.c
index 94d1598..dffcdee 100644
--- a/libavcodec/bink.c
+++ b/libavcodec/bink.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2009 Konstantin Shishkov
  * Copyright (C) 2011 Peter Ross <pross@xvid.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -120,6 +120,7 @@ typedef struct BinkContext {
     int            version;              ///< internal Bink file version
     int            has_alpha;
     int            swap_planes;
+    unsigned       frame_num;
 
     Bundle         bundle[BINKB_NB_SRC]; ///< bundles for decoding all data types
     Tree           col_high[16];         ///< trees for decoding high nibble in "colours" data type
@@ -143,7 +144,7 @@ enum BlockTypes {
 };
 
 /**
- * Initialize length length in all bundles.
+ * Initialize length in all bundles.
  *
  * @param c     decoder context
  * @param width plane width
@@ -174,7 +175,7 @@ static void init_lengths(BinkContext *c, int width, int bw)
  *
  * @param c decoder context
  */
-static av_cold void init_bundles(BinkContext *c)
+static av_cold int init_bundles(BinkContext *c)
 {
     int bw, bh, blocks;
     int i;
@@ -184,9 +185,13 @@ static av_cold void init_bundles(BinkContext *c)
     blocks = bw * bh;
 
     for (i = 0; i < BINKB_NB_SRC; i++) {
-        c->bundle[i].data = av_malloc(blocks * 64);
+        c->bundle[i].data = av_mallocz(blocks * 64);
+        if (!c->bundle[i].data)
+            return AVERROR(ENOMEM);
         c->bundle[i].data_end = c->bundle[i].data + blocks * 64;
     }
+
+    return 0;
 }
 
 /**
@@ -679,11 +684,12 @@ static int read_dct_coeffs(GetBitContext *gb, int32_t block[64], const uint8_t *
         quant_idx = get_bits(gb, 4);
     } else {
         quant_idx = q;
+        if (quant_idx > 15U) {
+            av_log(NULL, AV_LOG_ERROR, "quant_index %d out of range\n", quant_idx);
+            return AVERROR_INVALIDDATA;
+        }
     }
 
-    if (quant_idx >= 16)
-        return AVERROR_INVALIDDATA;
-
     quant = quant_matrices[quant_idx];
 
     block[0] = (block[0] * quant[0]) >> 11;
@@ -866,7 +872,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
                 memset(dctblock, 0, sizeof(*dctblock) * 64);
                 dctblock[0] = binkb_get_value(c, BINKB_SRC_INTRA_DC);
                 qp = binkb_get_value(c, BINKB_SRC_INTRA_Q);
-                read_dct_coeffs(gb, dctblock, bink_scan, binkb_intra_quant, qp);
+                read_dct_coeffs(gb, dctblock, bink_scan, (const int32_t (*)[64])binkb_intra_quant, qp);
                 c->binkdsp.idct_put(dst, stride, dctblock);
                 break;
             case 3:
@@ -899,7 +905,7 @@ static int binkb_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
                 memset(dctblock, 0, sizeof(*dctblock) * 64);
                 dctblock[0] = binkb_get_value(c, BINKB_SRC_INTER_DC);
                 qp = binkb_get_value(c, BINKB_SRC_INTER_Q);
-                read_dct_coeffs(gb, dctblock, bink_scan, binkb_inter_quant, qp);
+                read_dct_coeffs(gb, dctblock, bink_scan, (const int32_t (*)[64])binkb_inter_quant, qp);
                 c->binkdsp.idct_add(dst, stride, dctblock);
                 break;
             case 5:
@@ -1184,15 +1190,11 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     int bits_count = pkt->size << 3;
 
     if (c->version > 'b') {
-        if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
             return ret;
-        }
     } else {
-        if ((ret = ff_reget_buffer(avctx, c->last)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+        if ((ret = ff_reget_buffer(avctx, c->last)) < 0)
             return ret;
-        }
         if ((ret = av_frame_ref(frame, c->last)) < 0)
             return ret;
     }
@@ -1207,6 +1209,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     if (c->version >= 'i')
         skip_bits_long(&gb, 32);
 
+    c->frame_num++;
+
     for (plane = 0; plane < 3; plane++) {
         plane_idx = (!plane || !c->swap_planes) ? plane : (plane ^ 3);
 
@@ -1215,7 +1219,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
                 return ret;
         } else {
             if ((ret = binkb_decode_plane(c, frame, &gb, plane_idx,
-                                          !avctx->frame_number, !!plane)) < 0)
+                                          c->frame_num == 1, !!plane)) < 0)
                 return ret;
         }
         if (get_bits_count(&gb) >= bits_count)
@@ -1241,41 +1245,28 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
 static av_cold void binkb_calc_quant(void)
 {
     uint8_t inv_bink_scan[64];
-    double s[64];
+    static const int s[64]={
+        1073741824,1489322693,1402911301,1262586814,1073741824, 843633538, 581104888, 296244703,
+        1489322693,2065749918,1945893874,1751258219,1489322693,1170153332, 806015634, 410903207,
+        1402911301,1945893874,1832991949,1649649171,1402911301,1102260336, 759250125, 387062357,
+        1262586814,1751258219,1649649171,1484645031,1262586814, 992008094, 683307060, 348346918,
+        1073741824,1489322693,1402911301,1262586814,1073741824, 843633538, 581104888, 296244703,
+         843633538,1170153332,1102260336, 992008094, 843633538, 662838617, 456571181, 232757969,
+         581104888, 806015634, 759250125, 683307060, 581104888, 456571181, 314491699, 160326478,
+         296244703, 410903207, 387062357, 348346918, 296244703, 232757969, 160326478,  81733730,
+    };
     int i, j;
-
-    for (j = 0; j < 8; j++) {
-        for (i = 0; i < 8; i++) {
-            if (j && j != 4)
-               if (i && i != 4)
-                   s[j*8 + i] = cos(j * M_PI/16.0) * cos(i * M_PI/16.0) * 2.0;
-               else
-                   s[j*8 + i] = cos(j * M_PI/16.0) * sqrt(2.0);
-            else
-               if (i && i != 4)
-                   s[j*8 + i] = cos(i * M_PI/16.0) * sqrt(2.0);
-               else
-                   s[j*8 + i] = 1.0;
-        }
-    }
-
+#define C (1LL<<30)
     for (i = 0; i < 64; i++)
         inv_bink_scan[bink_scan[i]] = i;
 
     for (j = 0; j < 16; j++) {
         for (i = 0; i < 64; i++) {
             int k = inv_bink_scan[i];
-            if (s[i] == 1.0) {
-                binkb_intra_quant[j][k] = (1L << 12) * binkb_intra_seed[i] *
-                                          binkb_num[j]/binkb_den[j];
-                binkb_inter_quant[j][k] = (1L << 12) * binkb_inter_seed[i] *
-                                          binkb_num[j]/binkb_den[j];
-            } else {
-                binkb_intra_quant[j][k] = (1L << 12) * binkb_intra_seed[i] * s[i] *
-                                          binkb_num[j]/(double)binkb_den[j];
-                binkb_inter_quant[j][k] = (1L << 12) * binkb_inter_seed[i] * s[i] *
-                                          binkb_num[j]/(double)binkb_den[j];
-            }
+            binkb_intra_quant[j][k] = binkb_intra_seed[i] * (int64_t)s[i] *
+                                        binkb_num[j]/(binkb_den[j] * (C>>12));
+            binkb_inter_quant[j][k] = binkb_inter_seed[i] * (int64_t)s[i] *
+                                        binkb_num[j]/(binkb_den[j] * (C>>12));
         }
     }
 }
@@ -1321,7 +1312,10 @@ static av_cold int decode_init(AVCodecContext *avctx)
     ff_hpeldsp_init(&c->hdsp, avctx->flags);
     ff_binkdsp_init(&c->binkdsp);
 
-    init_bundles(c);
+    if ((ret = init_bundles(c)) < 0) {
+        free_bundles(c);
+        return ret;
+    }
 
     if (c->version == 'b') {
         if (!binkb_initialised) {
@@ -1343,6 +1337,13 @@ static av_cold int decode_end(AVCodecContext *avctx)
     return 0;
 }
 
+static void flush(AVCodecContext *avctx) /* installed as .flush of ff_bink_decoder */
+{
+    BinkContext * const c = avctx->priv_data;
+
+    c->frame_num = 0; /* decode_frame() increments this before decoding, so the next frame is number 1 again */
+}
+
 AVCodec ff_bink_decoder = {
     .name           = "binkvideo",
     .long_name      = NULL_IF_CONFIG_SMALL("Bink video"),
@@ -1352,5 +1353,6 @@ AVCodec ff_bink_decoder = {
     .init           = decode_init,
     .close          = decode_end,
     .decode         = decode_frame,
+    .flush          = flush,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
diff --git a/libavcodec/binkaudio.c b/libavcodec/binkaudio.c
index 71ad344..5cc2331 100644
--- a/libavcodec/binkaudio.c
+++ b/libavcodec/binkaudio.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2007-2011 Peter Ross (pross@xvid.org)
  * Copyright (c) 2009 Daniel Verkamp (daniel@drv.nu)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -81,14 +81,14 @@ static av_cold int decode_init(AVCodecContext *avctx)
         frame_len_bits = 11;
     }
 
-    if (avctx->channels > MAX_CHANNELS) {
-        av_log(avctx, AV_LOG_ERROR, "too many channels: %d\n", avctx->channels);
-        return -1;
+    if (avctx->channels < 1 || avctx->channels > MAX_CHANNELS) {
+        av_log(avctx, AV_LOG_ERROR, "invalid number of channels: %d\n", avctx->channels);
+        return AVERROR_INVALIDDATA;
     }
     avctx->channel_layout = avctx->channels == 1 ? AV_CH_LAYOUT_MONO :
                                                    AV_CH_LAYOUT_STEREO;
 
-    s->version_b = avctx->extradata && avctx->extradata[3] == 'b';
+    s->version_b = avctx->extradata_size >= 4 && avctx->extradata[3] == 'b';
 
     if (avctx->codec->id == AV_CODEC_ID_BINKAUDIO_RDFT) {
         // audio is already interleaved for the RDFT format variant
@@ -305,9 +305,11 @@ static int decode_frame(AVCodecContext *avctx, void *data,
         buf = av_realloc(s->packet_buffer, avpkt->size + AV_INPUT_BUFFER_PADDING_SIZE);
         if (!buf)
             return AVERROR(ENOMEM);
+        memset(buf + avpkt->size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
         s->packet_buffer = buf;
         memcpy(s->packet_buffer, avpkt->data, avpkt->size);
-        init_get_bits(gb, s->packet_buffer, avpkt->size * 8);
+        if ((ret = init_get_bits8(gb, s->packet_buffer, avpkt->size)) < 0)
+            return ret;
         consumed = avpkt->size;
 
         /* skip reported size */
@@ -316,10 +318,8 @@ static int decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = s->frame_len;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     if (decode_block(s, (float **)frame->extended_data,
                      avctx->codec->id == AV_CODEC_ID_BINKAUDIO_DCT)) {
diff --git a/libavcodec/binkdata.h b/libavcodec/binkdata.h
index 3da6b7e..57619be 100644
--- a/libavcodec/binkdata.h
+++ b/libavcodec/binkdata.h
@@ -2,20 +2,20 @@
  * Bink video decoder
  * Copyright (C) 2009 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/binkdsp.c b/libavcodec/binkdsp.c
index 0dfe12c..9d70e23 100644
--- a/libavcodec/binkdsp.c
+++ b/libavcodec/binkdsp.c
@@ -2,20 +2,20 @@
  * Bink DSP routines
  * Copyright (c) 2009 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -129,7 +129,7 @@ static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align
     }
 }
 
-static void add_pixels8_c(uint8_t *restrict pixels, int16_t *block,
+static void add_pixels8_c(uint8_t *av_restrict pixels, int16_t *block,
                           int line_size)
 {
     int i;
diff --git a/libavcodec/binkdsp.h b/libavcodec/binkdsp.h
index 418afb9..f319d1f 100644
--- a/libavcodec/binkdsp.h
+++ b/libavcodec/binkdsp.h
@@ -2,20 +2,20 @@
  * Bink DSP routines
  * Copyright (c) 2009 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/bintext.c b/libavcodec/bintext.c
new file mode 100644
index 0000000..90bbe67
--- /dev/null
+++ b/libavcodec/bintext.c
@@ -0,0 +1,258 @@
+/*
+ * Binary text decoder
+ * eXtended BINary text (XBIN) decoder
+ * iCEDraw File decoder
+ * Copyright (c) 2010 Peter Ross (pross@xvid.org)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Binary text decoder
+ * eXtended BINary text (XBIN) decoder
+ * iCEDraw File decoder
+ */
+
+#include "libavutil/intreadwrite.h"
+#include "libavutil/xga_font_data.h"
+#include "avcodec.h"
+#include "cga_data.h"
+#include "bintext.h"
+#include "internal.h"
+
+typedef struct XbinContext { // private state used by the bintext, xbin and idf decoders
+    AVFrame *frame;          // allocated in decode_init(), re-acquired for every packet with ff_reget_buffer()
+    int palette[16];         // 16 colours, each OR-ed with 0xFF000000; copied to frame->data[1] per packet
+    int flags;               // BINTEXT_* bits taken from extradata[1]; 0 when there is no extradata
+    int font_height;         // glyph height in rows: extradata[0], or 8 when there is no extradata
+    const uint8_t *font;     // glyph data: points into extradata (BINTEXT_FONT) or at a built-in CGA/VGA16 table
+    int x, y;                // pixel position in the frame where the next character is drawn
+} XbinContext;
+
+static av_cold int decode_init(AVCodecContext *avctx) // shared .init of the three decoders in this file
+{
+    XbinContext *s = avctx->priv_data;
+    uint8_t *p;
+    int i;
+
+    avctx->pix_fmt = AV_PIX_FMT_PAL8;
+    p = avctx->extradata;
+    if (p) { // NOTE(review): the two header bytes below are read before extradata_size is validated
+        s->font_height = p[0]; // extradata layout: [0] font height, [1] BINTEXT_* flags,
+        s->flags = p[1];       // then an optional 3*16-byte palette, then optional font data
+        p += 2;
+        if(avctx->extradata_size < 2 + (!!(s->flags & BINTEXT_PALETTE))*3*16
+                                     + (!!(s->flags & BINTEXT_FONT))*s->font_height*256) {
+            av_log(avctx, AV_LOG_ERROR, "not enough extradata\n");
+            return AVERROR_INVALIDDATA;
+        }
+    } else {
+        s->font_height = 8; // no extradata: 8-row font, no flags (so default palette and built-in font)
+        s->flags = 0;
+    }
+
+    if ((s->flags & BINTEXT_PALETTE)) {
+        for (i = 0; i < 16; i++) {
+            s->palette[i] = 0xFF000000 | (AV_RB24(p) << 2) | ((AV_RB24(p) >> 4) & 0x30303); // presumably widens 6-bit components to 8 bits -- TODO confirm
+            p += 3;
+        }
+    } else {
+        for (i = 0; i < 16; i++)
+            s->palette[i] = 0xFF000000 | ff_cga_palette[i];
+    }
+
+    if ((s->flags & BINTEXT_FONT)) {
+        s->font = p; // font data sits in extradata after the header and, if present, the palette
+    } else {
+        switch(s->font_height) {
+        default:
+            av_log(avctx, AV_LOG_WARNING, "font height %i not supported\n", s->font_height);
+            s->font_height = 8; // fall through to the 8-row CGA font
+        case 8:
+            s->font = avpriv_cga_font;
+            break;
+        case 16:
+            s->font = avpriv_vga16_font;
+            break;
+        }
+    }
+
+    s->frame = av_frame_alloc(); // released again in decode_end()
+    if (!s->frame)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+#define DEFAULT_BG_COLOR 0
+av_unused static void hscroll(AVCodecContext *avctx) // not called anywhere in this file; moves rows, not columns, despite the name
+{
+    XbinContext *s = avctx->priv_data;
+    if (s->y < avctx->height - s->font_height) {
+        s->y += s->font_height; // still room below: just move the cursor down one text row
+    } else {
+        memmove(s->frame->data[0], s->frame->data[0] + s->font_height*s->frame->linesize[0], // shift the picture up by one text row
+            (avctx->height - s->font_height)*s->frame->linesize[0]);
+        memset(s->frame->data[0] + (avctx->height - s->font_height)*s->frame->linesize[0], // fill the bottom text row with the background index
+            DEFAULT_BG_COLOR, s->font_height * s->frame->linesize[0]);
+    }
+}
+
+#define FONT_WIDTH 8
+
+/**
+ * Draw character c with attribute byte a at the cursor (s->x, s->y), then advance the cursor by one cell, wrapping to the next text row.
+ */
+static void draw_char(AVCodecContext *avctx, int c, int a)
+{
+    XbinContext *s = avctx->priv_data;
+    if (s->y > avctx->height - s->font_height)
+        return; // no complete text row left at the cursor: the character is dropped
+    ff_draw_pc_font(s->frame->data[0] + s->y * s->frame->linesize[0] + s->x,
+                    s->frame->linesize[0], s->font, s->font_height, c,
+                    a & 0x0F, a >> 4); // low / high nibble of the attribute; presumably foreground / background -- confirm against ff_draw_pc_font()
+    s->x += FONT_WIDTH;
+    if (s->x > avctx->width - FONT_WIDTH) { // next cell would not fit: wrap to the start of the next text row
+        s->x = 0;
+        s->y += s->font_height;
+    }
+}
+
+static int decode_frame(AVCodecContext *avctx, // shared .decode of the bintext, xbin and idf decoders
+                            void *data, int *got_frame,
+                            AVPacket *avpkt)
+{
+    XbinContext *s = avctx->priv_data;
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
+    const uint8_t *buf_end = buf+buf_size;
+    int ret;
+
+    s->x = s->y = 0; // every packet starts drawing at the top-left corner
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
+        return ret;
+    s->frame->pict_type           = AV_PICTURE_TYPE_I;
+    s->frame->palette_has_changed = 1;
+    memcpy(s->frame->data[1], s->palette, 16 * 4); // 16 entries of 4 bytes go into the palette plane
+
+    if (avctx->codec_id == AV_CODEC_ID_XBIN) {
+        while (buf + 2 < buf_end) {
+            int i,c,a;
+            int type  = *buf >> 6;         // run header: top 2 bits select the run type
+            int count = (*buf & 0x3F) + 1; // low 6 bits hold the run length minus one
+            buf++;
+            switch (type) {
+            case 0: //no compression
+                for (i = 0; i < count && buf + 1 < buf_end; i++) {
+                    draw_char(avctx, buf[0], buf[1]);
+                    buf += 2;
+                }
+                break;
+            case 1: //character compression
+                c = *buf++;
+                for (i = 0; i < count && buf < buf_end; i++)
+                    draw_char(avctx, c, *buf++);
+                break;
+            case 2: //attribute compression
+                a = *buf++;
+                for (i = 0; i < count && buf < buf_end; i++)
+                    draw_char(avctx, *buf++, a);
+                break;
+            case 3: //character/attribute compression
+                c = *buf++;
+                a = *buf++;
+                for (i = 0; i < count && buf < buf_end; i++)
+                    draw_char(avctx, c, a);
+                break;
+            }
+        }
+    } else if (avctx->codec_id == AV_CODEC_ID_IDF) {
+        while (buf + 2 < buf_end) {
+            if (AV_RL16(buf) == 1) { // a little-endian 16-bit value of 1 introduces a 6-byte run record
+               int i;
+               if (buf + 6 > buf_end)
+                   break;
+               for (i = 0; i < buf[2]; i++) // buf[2] is the repeat count; buf[4] and buf[5] are the c and a arguments
+                   draw_char(avctx, buf[4], buf[5]);
+               buf += 6;
+            } else { // anything else is a plain (c, a) byte pair
+               draw_char(avctx, buf[0], buf[1]);
+               buf += 2;
+            }
+        }
+    } else { // remaining decoder (AV_CODEC_ID_BINTEXT): the packet is a flat list of (c, a) byte pairs
+        while (buf + 1 < buf_end) {
+            draw_char(avctx, buf[0], buf[1]);
+            buf += 2;
+        }
+    }
+
+    if ((ret = av_frame_ref(data, s->frame)) < 0) // output is a new reference to the decoder's own frame
+        return ret;
+    *got_frame      = 1;
+    return buf_size; // the whole packet is reported as consumed
+}
+
+static av_cold int decode_end(AVCodecContext *avctx) // shared .close of the three decoders in this file
+{
+    XbinContext *s = avctx->priv_data;
+
+    av_frame_free(&s->frame); // frees the frame allocated in decode_init()
+
+    return 0;
+}
+
+#if CONFIG_BINTEXT_DECODER
+AVCodec ff_bintext_decoder = {
+    .name           = "bintext",
+    .long_name      = NULL_IF_CONFIG_SMALL("Binary text"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_BINTEXT,
+    .priv_data_size = sizeof(XbinContext),
+    .init           = decode_init,
+    .close          = decode_end,
+    .decode         = decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+};
+#endif /* CONFIG_BINTEXT_DECODER */
+#if CONFIG_XBIN_DECODER
+AVCodec ff_xbin_decoder = {
+    .name           = "xbin",
+    .long_name      = NULL_IF_CONFIG_SMALL("eXtended BINary text"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_XBIN,
+    .priv_data_size = sizeof(XbinContext),
+    .init           = decode_init,
+    .close          = decode_end,
+    .decode         = decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+};
+#endif /* CONFIG_XBIN_DECODER */
+#if CONFIG_IDF_DECODER
+AVCodec ff_idf_decoder = {
+    .name           = "idf",
+    .long_name      = NULL_IF_CONFIG_SMALL("iCEDraw text"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_IDF,
+    .priv_data_size = sizeof(XbinContext),
+    .init           = decode_init,
+    .close          = decode_end,
+    .decode         = decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+};
+#endif /* CONFIG_IDF_DECODER */
diff --git a/libavcodec/bintext.h b/libavcodec/bintext.h
new file mode 100644
index 0000000..21428ba
--- /dev/null
+++ b/libavcodec/bintext.h
@@ -0,0 +1,37 @@
+/*
+ * Binary text decoder
+ * Copyright (c) 2010 Peter Ross (pross@xvid.org)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Binary text decoder
+ */
+
+#ifndef AVCODEC_BINTEXT_H
+#define AVCODEC_BINTEXT_H
+
+/* flag values passed between avformat and avcodec;
+ * while these are identical to the XBIN flags, they are also used
+ * for the BINTEXT and IDF decoders.
+ */
+#define BINTEXT_PALETTE  0x1 /* extradata holds a 3*16-byte palette after its 2 header bytes */
+#define BINTEXT_FONT     0x2 /* extradata holds font_height*256 bytes of font data (after the palette, if any) */
+
+#endif /* AVCODEC_BINTEXT_H */
diff --git a/libavcodec/bit_depth_template.c b/libavcodec/bit_depth_template.c
index 27e658b..8018489 100644
--- a/libavcodec/bit_depth_template.c
+++ b/libavcodec/bit_depth_template.c
@@ -1,23 +1,24 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "mathops.h"
 #include "rnd_avg.h"
+#include "libavutil/intreadwrite.h"
 
 #ifndef BIT_DEPTH
 #define BIT_DEPTH 8
@@ -71,7 +72,7 @@
 #   define pixel4 uint32_t
 #   define dctcoef int16_t
 
-#   define INIT_CLIP const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
+#   define INIT_CLIP
 #   define no_rnd_avg_pixel4 no_rnd_avg32
 #   define    rnd_avg_pixel4    rnd_avg32
 #   define AV_RN2P  AV_RN16
@@ -83,7 +84,7 @@
 #   define PIXEL_SPLAT_X4(x) ((x)*0x01010101U)
 
 #   define av_clip_pixel(a) av_clip_uint8(a)
-#   define CLIP(a) cm[a]
+#   define CLIP(a) av_clip_uint8(a)
 #endif
 
 #define FUNC3(a, b, c)  a ## _ ## b ## c
diff --git a/libavcodec/bitstream.c b/libavcodec/bitstream.c
index a0d2a98..9785ef7 100644
--- a/libavcodec/bitstream.c
+++ b/libavcodec/bitstream.c
@@ -6,20 +6,20 @@
  *
  * alternative bitstream reader & writer by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,9 @@
  * bitstream api.
  */
 
+#include "libavutil/atomic.h"
+#include "libavutil/avassert.h"
+#include "libavutil/qsort.h"
 #include "avcodec.h"
 #include "internal.h"
 #include "mathops.h"
@@ -68,6 +71,8 @@ void avpriv_copy_bits(PutBitContext *pb, const uint8_t *src, int length)
     if (length == 0)
         return;
 
+    av_assert0(length <= put_bits_left(pb));
+
     if (CONFIG_SMALL || words < 16 || put_bits_count(pb) & 7) {
         for (i = 0; i < words; i++)
             put_bits(pb, 16, AV_RB16(src + 2 * i));
@@ -107,17 +112,16 @@ static int alloc_table(VLC *vlc, int size, int use_static)
 
     vlc->table_size += size;
     if (vlc->table_size > vlc->table_allocated) {
-        int err;
         if (use_static)
-            return AVERROR_BUG;
+            abort(); // cannot do anything, init_vlc() is used with too little memory
         vlc->table_allocated += (1 << vlc->bits);
-        if ((err = av_reallocp(&vlc->table,
-                               sizeof(VLC_TYPE) * 2 *
-                               vlc->table_allocated)) < 0) {
+        vlc->table = av_realloc_f(vlc->table, vlc->table_allocated, sizeof(VLC_TYPE) * 2);
+        if (!vlc->table) {
             vlc->table_allocated = 0;
             vlc->table_size = 0;
-            return err;
+            return AVERROR(ENOMEM);
         }
+        memset(vlc->table + vlc->table_allocated - (1 << vlc->bits), 0, sizeof(VLC_TYPE) * 2 << vlc->bits);
     }
     return index;
 }
@@ -163,19 +167,16 @@ static int build_table(VLC *vlc, int table_nb_bits, int nb_codes,
     int table_size, table_index, index, code_prefix, symbol, subtable_bits;
     int i, j, k, n, nb, inc;
     uint32_t code;
-    VLC_TYPE (*table)[2];
+    volatile VLC_TYPE (* volatile table)[2]; // the double volatile is needed to prevent an internal compiler error in gcc 4.2
 
     table_size = 1 << table_nb_bits;
+    if (table_nb_bits > 30)
+       return -1;
     table_index = alloc_table(vlc, table_size, flags & INIT_VLC_USE_NEW_STATIC);
     ff_dlog(NULL, "new table index=%d size=%d\n", table_index, table_size);
     if (table_index < 0)
         return table_index;
-    table = &vlc->table[table_index];
-
-    for (i = 0; i < table_size; i++) {
-        table[i][1] = 0; //bits
-        table[i][0] = -1; //codes
-    }
+    table = (volatile VLC_TYPE (*)[2])&vlc->table[table_index];
 
     /* first pass: map codes and compute auxiliary table sizes */
     for (i = 0; i < nb_codes; i++) {
@@ -193,8 +194,9 @@ static int build_table(VLC *vlc, int table_nb_bits, int nb_codes,
                 inc = 1 << n;
             }
             for (k = 0; k < nb; k++) {
+                int bits = table[j][1];
                 ff_dlog(NULL, "%4x: code=%d n=%d\n", j, i, n);
-                if (table[j][1] /*bits*/ != 0) {
+                if (bits != 0 && bits != n) {
                     av_log(NULL, AV_LOG_ERROR, "incorrect codes\n");
                     return AVERROR_INVALIDDATA;
                 }
@@ -229,11 +231,17 @@ static int build_table(VLC *vlc, int table_nb_bits, int nb_codes,
             if (index < 0)
                 return index;
             /* note: realloc has been done, so reload tables */
-            table = &vlc->table[table_index];
+            table = (volatile VLC_TYPE (*)[2])&vlc->table[table_index];
             table[j][0] = index; //code
             i = k-1;
         }
     }
+
+    for (i = 0; i < table_size; i++) {
+        if (table[i][1] == 0) //bits
+            table[i][0] = -1; //codes
+    }
+
     return table_index;
 }
 
@@ -258,13 +266,13 @@ static int build_table(VLC *vlc, int table_nb_bits, int nb_codes,
    'xxx_size' : gives the number of bytes of each entry of the 'bits'
    or 'codes' tables.
 
-   'wrap' and 'size' allows to use any memory configuration and types
+   'wrap' and 'size' make it possible to use any memory configuration and types
    (byte/word/long) to store the 'bits', 'codes', and 'symbols' tables.
 
    'use_static' should be set to 1 for tables, which should be freed
    with av_free_static(), 0 if ff_free_vlc() will be used.
 */
-int ff_init_vlc_sparse(VLC *vlc, int nb_bits, int nb_codes,
+int ff_init_vlc_sparse(VLC *vlc_arg, int nb_bits, int nb_codes,
                        const void *bits, int bits_wrap, int bits_size,
                        const void *codes, int codes_wrap, int codes_size,
                        const void *symbols, int symbols_wrap, int symbols_size,
@@ -272,61 +280,79 @@ int ff_init_vlc_sparse(VLC *vlc, int nb_bits, int nb_codes,
 {
     VLCcode *buf;
     int i, j, ret;
+    VLCcode localbuf[1500]; // the maximum currently needed is 1296 by rv34
+    VLC localvlc, *vlc;
 
+    vlc = vlc_arg;
     vlc->bits = nb_bits;
     if (flags & INIT_VLC_USE_NEW_STATIC) {
-        if (vlc->table_size && vlc->table_size == vlc->table_allocated) {
-            return 0;
-        } else if (vlc->table_size) {
-            return AVERROR_BUG;
-        }
+        av_assert0(nb_codes + 1 <= FF_ARRAY_ELEMS(localbuf));
+        buf = localbuf;
+        localvlc = *vlc_arg;
+        vlc = &localvlc;
+        vlc->table_size = 0;
     } else {
         vlc->table           = NULL;
         vlc->table_allocated = 0;
         vlc->table_size      = 0;
-    }
 
-    ff_dlog(NULL, "build table nb_codes=%d\n", nb_codes);
+        buf = av_malloc_array((nb_codes + 1), sizeof(VLCcode));
+        if (!buf)
+            return AVERROR(ENOMEM);
+    }
 
-    buf = av_malloc((nb_codes + 1) * sizeof(VLCcode));
-    if (!buf)
-        return AVERROR(ENOMEM);
 
-    assert(symbols_size <= 2 || !symbols);
+    av_assert0(symbols_size <= 2 || !symbols);
     j = 0;
-#define COPY(condition)                                                     \
+#define COPY(condition)\
     for (i = 0; i < nb_codes; i++) {                                        \
         GET_DATA(buf[j].bits, bits, i, bits_wrap, bits_size);               \
         if (!(condition))                                                   \
             continue;                                                       \
+        if (buf[j].bits > 3*nb_bits || buf[j].bits>32) {                    \
+            av_log(NULL, AV_LOG_ERROR, "Too long VLC (%d) in init_vlc\n", buf[j].bits);\
+            if (!(flags & INIT_VLC_USE_NEW_STATIC))                         \
+                av_free(buf);                                               \
+            return -1;                                                      \
+        }                                                                   \
         GET_DATA(buf[j].code, codes, i, codes_wrap, codes_size);            \
+        if (buf[j].code >= (1LL<<buf[j].bits)) {                            \
+            av_log(NULL, AV_LOG_ERROR, "Invalid code %x for %d in init_vlc\n", buf[j].code, i);\
+            if (!(flags & INIT_VLC_USE_NEW_STATIC))                         \
+                av_free(buf);                                               \
+            return -1;                                                      \
+        }                                                                   \
         if (flags & INIT_VLC_LE)                                            \
             buf[j].code = bitswap_32(buf[j].code);                          \
         else                                                                \
             buf[j].code <<= 32 - buf[j].bits;                               \
         if (symbols)                                                        \
             GET_DATA(buf[j].symbol, symbols, i, symbols_wrap, symbols_size) \
-            else                                                            \
-                buf[j].symbol = i;                                          \
+        else                                                                \
+            buf[j].symbol = i;                                              \
         j++;                                                                \
     }
     COPY(buf[j].bits > nb_bits);
     // qsort is the slowest part of init_vlc, and could probably be improved or avoided
-    qsort(buf, j, sizeof(VLCcode), compare_vlcspec);
+    AV_QSORT(buf, j, struct VLCcode, compare_vlcspec);
     COPY(buf[j].bits && buf[j].bits <= nb_bits);
     nb_codes = j;
 
     ret = build_table(vlc, nb_bits, nb_codes, buf, flags);
 
-    av_free(buf);
-    if (ret < 0) {
-        av_freep(&vlc->table);
-        return ret;
+    if (flags & INIT_VLC_USE_NEW_STATIC) {
+        if(vlc->table_size != vlc->table_allocated)
+            av_log(NULL, AV_LOG_ERROR, "needed %d had %d\n", vlc->table_size, vlc->table_allocated);
+
+        av_assert0(ret >= 0);
+        *vlc_arg = *vlc;
+    } else {
+        av_free(buf);
+        if (ret < 0) {
+            av_freep(&vlc->table);
+            return ret;
+        }
     }
-    if ((flags & INIT_VLC_USE_NEW_STATIC) &&
-        vlc->table_size != vlc->table_allocated)
-        av_log(NULL, AV_LOG_ERROR, "needed %d had %d\n",
-               vlc->table_size, vlc->table_allocated);
     return 0;
 }
 
diff --git a/libavcodec/bitstream_filter.c b/libavcodec/bitstream_filter.c
index ab608a9..02878e3 100644
--- a/libavcodec/bitstream_filter.c
+++ b/libavcodec/bitstream_filter.c
@@ -1,29 +1,30 @@
 /*
  * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <string.h>
 
 #include "avcodec.h"
-
+#include "libavutil/atomic.h"
 #include "libavutil/internal.h"
 #include "libavutil/mem.h"
+#include "libavutil/opt.h"
 
 #if FF_API_OLD_BSF
 FF_DISABLE_DEPRECATION_WARNINGS
@@ -45,6 +46,7 @@ void av_register_bitstream_filter(AVBitStreamFilter *bsf)
 
 typedef struct BSFCompatContext {
     AVBSFContext *ctx;
+    int extradata_updated;
 } BSFCompatContext;
 
 AVBitStreamFilterContext *av_bitstream_filter_init(const char *name)
@@ -81,7 +83,12 @@ fail:
 
 void av_bitstream_filter_close(AVBitStreamFilterContext *bsfc)
 {
-    BSFCompatContext *priv = bsfc->priv_data;
+    BSFCompatContext *priv;
+
+    if (!bsfc)
+        return;
+
+    priv = bsfc->priv_data;
 
     av_bsf_free(&priv->ctx);
     av_freep(&bsfc->priv_data);
@@ -108,20 +115,19 @@ int av_bitstream_filter_filter(AVBitStreamFilterContext *bsfc,
 
         priv->ctx->time_base_in = avctx->time_base;
 
+        if (bsfc->args && bsfc->filter->priv_class) {
+            const AVOption *opt = av_opt_next(priv->ctx->priv_data, NULL);
+            const char * shorthand[2] = {NULL};
+
+            if (opt)
+                shorthand[0] = opt->name;
+
+            ret = av_opt_set_from_string(priv->ctx->priv_data, bsfc->args, shorthand, "=", ":");
+        }
+
         ret = av_bsf_init(priv->ctx);
         if (ret < 0)
             return ret;
-
-        if (priv->ctx->par_out->extradata_size) {
-            av_freep(&avctx->extradata);
-            avctx->extradata_size = 0;
-            avctx->extradata = av_mallocz(priv->ctx->par_out->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE);
-            if (!avctx->extradata)
-                return AVERROR(ENOMEM);
-            memcpy(avctx->extradata, priv->ctx->par_out->extradata,
-                   priv->ctx->par_out->extradata_size);
-            avctx->extradata_size = priv->ctx->par_out->extradata_size;
-        }
     }
 
     pkt.data = buf;
@@ -157,6 +163,21 @@ int av_bitstream_filter_filter(AVBitStreamFilterContext *bsfc,
         av_packet_unref(&pkt);
     }
 
+    if (!priv->extradata_updated) {
+        /* update extradata in avctx from the output codec parameters */
+        if (priv->ctx->par_out->extradata_size && (!args || !strstr(args, "private_spspps_buf"))) {
+            av_freep(&avctx->extradata);
+            avctx->extradata_size = 0;
+            avctx->extradata = av_mallocz(priv->ctx->par_out->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE);
+            if (!avctx->extradata)
+                return AVERROR(ENOMEM);
+            memcpy(avctx->extradata, priv->ctx->par_out->extradata, priv->ctx->par_out->extradata_size);
+            avctx->extradata_size = priv->ctx->par_out->extradata_size;
+        }
+
+        priv->extradata_updated = 1;
+    }
+
     return 1;
 }
 FF_ENABLE_DEPRECATION_WARNINGS
diff --git a/libavcodec/bitstream_filters.c b/libavcodec/bitstream_filters.c
index 3b4026c..840bb43 100644
--- a/libavcodec/bitstream_filters.c
+++ b/libavcodec/bitstream_filters.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,15 +27,19 @@
 extern const AVBitStreamFilter ff_aac_adtstoasc_bsf;
 extern const AVBitStreamFilter ff_chomp_bsf;
 extern const AVBitStreamFilter ff_dump_extradata_bsf;
+extern const AVBitStreamFilter ff_dca_core_bsf;
 extern const AVBitStreamFilter ff_h264_mp4toannexb_bsf;
 extern const AVBitStreamFilter ff_hevc_mp4toannexb_bsf;
 extern const AVBitStreamFilter ff_imx_dump_header_bsf;
 extern const AVBitStreamFilter ff_mjpeg2jpeg_bsf;
 extern const AVBitStreamFilter ff_mjpega_dump_header_bsf;
+extern const AVBitStreamFilter ff_mp3_header_decompress_bsf;
+extern const AVBitStreamFilter ff_mpeg4_unpack_bframes_bsf;
 extern const AVBitStreamFilter ff_mov2textsub_bsf;
-extern const AVBitStreamFilter ff_text2movsub_bsf;
 extern const AVBitStreamFilter ff_noise_bsf;
 extern const AVBitStreamFilter ff_remove_extradata_bsf;
+extern const AVBitStreamFilter ff_text2movsub_bsf;
+extern const AVBitStreamFilter ff_vp9_superframe_bsf;
 
 #include "libavcodec/bsf_list.c"
 
diff --git a/libavcodec/blockdsp.c b/libavcodec/blockdsp.c
index e3d2ca1..a5c527a 100644
--- a/libavcodec/blockdsp.c
+++ b/libavcodec/blockdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,12 +25,12 @@
 #include "blockdsp.h"
 #include "version.h"
 
-static void clear_block_8_c(int16_t *block)
+static void clear_block_c(int16_t *block)
 {
     memset(block, 0, sizeof(int16_t) * 64);
 }
 
-static void clear_blocks_8_c(int16_t *blocks)
+static void clear_blocks_c(int16_t *blocks)
 {
     memset(blocks, 0, sizeof(int16_t) * 6 * 64);
 }
@@ -57,22 +57,20 @@ static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 
 av_cold void ff_blockdsp_init(BlockDSPContext *c, AVCodecContext *avctx)
 {
-    const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
-
-    c->clear_block  = clear_block_8_c;
-    c->clear_blocks = clear_blocks_8_c;
+    c->clear_block  = clear_block_c;
+    c->clear_blocks = clear_blocks_c;
 
     c->fill_block_tab[0] = fill_block16_c;
     c->fill_block_tab[1] = fill_block8_c;
 
+    if (ARCH_ALPHA)
+        ff_blockdsp_init_alpha(c);
     if (ARCH_ARM)
-        ff_blockdsp_init_arm(c, high_bit_depth);
+        ff_blockdsp_init_arm(c);
     if (ARCH_PPC)
-        ff_blockdsp_init_ppc(c, high_bit_depth);
+        ff_blockdsp_init_ppc(c);
     if (ARCH_X86)
-#if FF_API_XVMC
-        ff_blockdsp_init_x86(c, high_bit_depth, avctx);
-#else
-        ff_blockdsp_init_x86(c, high_bit_depth);
-#endif /* FF_API_XVMC */
+        ff_blockdsp_init_x86(c, avctx);
+    if (ARCH_MIPS)
+        ff_blockdsp_init_mips(c);
 }
diff --git a/libavcodec/blockdsp.h b/libavcodec/blockdsp.h
index 32c671c..95e1d0f 100644
--- a/libavcodec/blockdsp.h
+++ b/libavcodec/blockdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,13 +40,10 @@ typedef struct BlockDSPContext {
 
 void ff_blockdsp_init(BlockDSPContext *c, AVCodecContext *avctx);
 
-void ff_blockdsp_init_arm(BlockDSPContext *c, unsigned high_bit_depth);
-void ff_blockdsp_init_ppc(BlockDSPContext *c, unsigned high_bit_depth);
-#if FF_API_XVMC
-void ff_blockdsp_init_x86(BlockDSPContext *c, unsigned high_bit_depth,
-                          AVCodecContext *avctx);
-#else
-void ff_blockdsp_init_x86(BlockDSPContext *c, unsigned high_bit_depth);
-#endif /* FF_API_XVMC */
+void ff_blockdsp_init_alpha(BlockDSPContext *c);
+void ff_blockdsp_init_arm(BlockDSPContext *c);
+void ff_blockdsp_init_ppc(BlockDSPContext *c);
+void ff_blockdsp_init_x86(BlockDSPContext *c, AVCodecContext *avctx);
+void ff_blockdsp_init_mips(BlockDSPContext *c);
 
 #endif /* AVCODEC_BLOCKDSP_H */
diff --git a/libavcodec/bmp.c b/libavcodec/bmp.c
index 648fa68..fa1d6a5 100644
--- a/libavcodec/bmp.c
+++ b/libavcodec/bmp.c
@@ -2,20 +2,20 @@
  * BMP image format decoder
  * Copyright (c) 2005 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,7 +40,8 @@ static int bmp_decode_frame(AVCodecContext *avctx,
     BiCompression comp;
     unsigned int ihsize;
     int i, j, n, linesize, ret;
-    uint32_t rgb[3];
+    uint32_t rgb[3] = {0};
+    uint32_t alpha = 0;
     uint8_t *ptr;
     int dsize;
     const uint8_t *buf0 = buf;
@@ -69,7 +70,7 @@ static int bmp_decode_frame(AVCodecContext *avctx,
 
     hsize  = bytestream_get_le32(&buf); /* header size */
     ihsize = bytestream_get_le32(&buf); /* more header size */
-    if (ihsize + 14 > hsize) {
+    if (ihsize + 14LL > hsize) {
         av_log(avctx, AV_LOG_ERROR, "invalid header size %u\n", hsize);
         return AVERROR_INVALIDDATA;
     }
@@ -86,7 +87,8 @@ static int bmp_decode_frame(AVCodecContext *avctx,
     }
 
     switch (ihsize) {
-    case  40: // windib v3
+    case  40: // windib
+    case  56: // windib v3
     case  64: // OS/2 v2
     case 108: // windib v4
     case 124: // windib v5
@@ -110,7 +112,7 @@ static int bmp_decode_frame(AVCodecContext *avctx,
 
     depth = bytestream_get_le16(&buf);
 
-    if (ihsize == 40)
+    if (ihsize >= 40)
         comp = bytestream_get_le32(&buf);
     else
         comp = BMP_RGB;
@@ -126,6 +128,8 @@ static int bmp_decode_frame(AVCodecContext *avctx,
         rgb[0] = bytestream_get_le32(&buf);
         rgb[1] = bytestream_get_le32(&buf);
         rgb[2] = bytestream_get_le32(&buf);
+        if (ihsize > 40)
+        alpha = bytestream_get_le32(&buf);
     }
 
     avctx->width  = width;
@@ -136,21 +140,21 @@ static int bmp_decode_frame(AVCodecContext *avctx,
     switch (depth) {
     case 32:
         if (comp == BMP_BITFIELDS) {
-            rgb[0] = (rgb[0] >> 15) & 3;
-            rgb[1] = (rgb[1] >> 15) & 3;
-            rgb[2] = (rgb[2] >> 15) & 3;
-
-            if (rgb[0] + rgb[1] + rgb[2] != 3 ||
-                rgb[0] == rgb[1] || rgb[0] == rgb[2] || rgb[1] == rgb[2]) {
-                break;
+            if (rgb[0] == 0xFF000000 && rgb[1] == 0x00FF0000 && rgb[2] == 0x0000FF00)
+                avctx->pix_fmt = alpha ? AV_PIX_FMT_ABGR : AV_PIX_FMT_0BGR;
+            else if (rgb[0] == 0x00FF0000 && rgb[1] == 0x0000FF00 && rgb[2] == 0x000000FF)
+                avctx->pix_fmt = alpha ? AV_PIX_FMT_BGRA : AV_PIX_FMT_BGR0;
+            else if (rgb[0] == 0x0000FF00 && rgb[1] == 0x00FF0000 && rgb[2] == 0xFF000000)
+                avctx->pix_fmt = alpha ? AV_PIX_FMT_ARGB : AV_PIX_FMT_0RGB;
+            else if (rgb[0] == 0x000000FF && rgb[1] == 0x0000FF00 && rgb[2] == 0x00FF0000)
+                avctx->pix_fmt = alpha ? AV_PIX_FMT_RGBA : AV_PIX_FMT_RGB0;
+            else {
+                av_log(avctx, AV_LOG_ERROR, "Unknown bitfields %0X %0X %0X\n", rgb[0], rgb[1], rgb[2]);
+                return AVERROR(EINVAL);
             }
         } else {
-            rgb[0] = 2;
-            rgb[1] = 1;
-            rgb[2] = 0;
+            avctx->pix_fmt = AV_PIX_FMT_BGRA;
         }
-
-        avctx->pix_fmt = AV_PIX_FMT_BGR24;
         break;
     case 24:
         avctx->pix_fmt = AV_PIX_FMT_BGR24;
@@ -199,10 +203,8 @@ static int bmp_decode_frame(AVCodecContext *avctx,
         return AVERROR_INVALIDDATA;
     }
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
     p->pict_type = AV_PICTURE_TYPE_I;
     p->key_frame = 1;
 
@@ -210,12 +212,16 @@ static int bmp_decode_frame(AVCodecContext *avctx,
     dsize = buf_size - hsize;
 
     /* Line size in file multiple of 4 */
-    n = ((avctx->width * depth) / 8 + 3) & ~3;
+    n = ((avctx->width * depth + 31) / 8) & ~3;
 
     if (n * avctx->height > dsize && comp != BMP_RLE4 && comp != BMP_RLE8) {
-        av_log(avctx, AV_LOG_ERROR, "not enough data (%d < %d)\n",
-               dsize, n * avctx->height);
-        return AVERROR_INVALIDDATA;
+        n = (avctx->width * depth + 7) / 8;
+        if (n * avctx->height > dsize) {
+            av_log(avctx, AV_LOG_ERROR, "not enough data (%d < %d)\n",
+                   dsize, n * avctx->height);
+            return AVERROR_INVALIDDATA;
+        }
+        av_log(avctx, AV_LOG_ERROR, "data size too small, assuming missing line alignment\n");
     }
 
     // RLE may skip decoding some picture areas, so blank picture before decoding
@@ -246,20 +252,26 @@ static int bmp_decode_frame(AVCodecContext *avctx,
             } else if (t) {
                 colors = t;
             }
+        } else {
+            colors = FFMIN(256, (hsize-ihsize-14) / 3);
         }
         buf = buf0 + 14 + ihsize; //palette location
         // OS/2 bitmap, 3 bytes per palette entry
         if ((hsize-ihsize-14) < (colors << 2)) {
+            if ((hsize-ihsize-14) < colors * 3) {
+                av_log(avctx, AV_LOG_ERROR, "palette doesn't fit in packet\n");
+                return AVERROR_INVALIDDATA;
+            }
             for (i = 0; i < colors; i++)
-                ((uint32_t*)p->data[1])[i] = bytestream_get_le24(&buf);
+                ((uint32_t*)p->data[1])[i] = (0xFFU<<24) | bytestream_get_le24(&buf);
         } else {
             for (i = 0; i < colors; i++)
-                ((uint32_t*)p->data[1])[i] = bytestream_get_le32(&buf);
+                ((uint32_t*)p->data[1])[i] = 0xFFU << 24 | bytestream_get_le32(&buf);
         }
         buf = buf0 + hsize;
     }
     if (comp == BMP_RLE4 || comp == BMP_RLE8) {
-        if (height < 0) {
+        if (comp == BMP_RLE8 && height < 0) {
             p->data[0]    +=  p->linesize[0] * (avctx->height - 1);
             p->linesize[0] = -p->linesize[0];
         }
@@ -290,6 +302,7 @@ static int bmp_decode_frame(AVCodecContext *avctx,
             break;
         case 8:
         case 24:
+        case 32:
             for (i = 0; i < avctx->height; i++) {
                 memcpy(ptr, buf, n);
                 buf += n;
@@ -319,28 +332,25 @@ static int bmp_decode_frame(AVCodecContext *avctx,
                 ptr += linesize;
             }
             break;
-        case 32:
-            for (i = 0; i < avctx->height; i++) {
-                const uint8_t *src = buf;
-                uint8_t *dst       = ptr;
-
-                for (j = 0; j < avctx->width; j++) {
-                    dst[0] = src[rgb[2]];
-                    dst[1] = src[rgb[1]];
-                    dst[2] = src[rgb[0]];
-                    dst += 3;
-                    src += 4;
-                }
-
-                buf += n;
-                ptr += linesize;
-            }
-            break;
         default:
             av_log(avctx, AV_LOG_ERROR, "BMP decoder is broken\n");
             return AVERROR_INVALIDDATA;
         }
     }
+    if (avctx->pix_fmt == AV_PIX_FMT_BGRA) {
+        for (i = 0; i < avctx->height; i++) {
+            int j;
+            uint8_t *ptr = p->data[0] + p->linesize[0]*i + 3;
+            for (j = 0; j < avctx->width; j++) {
+                if (ptr[4*j])
+                    break;
+            }
+            if (j < avctx->width)
+                break;
+        }
+        if (i == avctx->height)
+            avctx->pix_fmt = p->format = AV_PIX_FMT_BGR0;
+    }
 
     *got_frame = 1;
 
diff --git a/libavcodec/bmp.h b/libavcodec/bmp.h
index a472f59..fb21090 100644
--- a/libavcodec/bmp.h
+++ b/libavcodec/bmp.h
@@ -2,20 +2,20 @@
  * internals for BMP codecs
  * Copyright (c) 2005 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/bmp_parser.c b/libavcodec/bmp_parser.c
index b85dd8b..cd65f02 100644
--- a/libavcodec/bmp_parser.c
+++ b/libavcodec/bmp_parser.c
@@ -2,20 +2,20 @@
  * BMP parser
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -45,21 +45,37 @@ static int bmp_parse(AVCodecParserContext *s, AVCodecContext *avctx,
     int i = 0;
 
     *poutbuf_size = 0;
-    if (buf_size == 0)
-        return 0;
 
-    if (!bpc->pc.frame_start_found) {
+restart:
+    if (bpc->pc.frame_start_found <= 2+4+4) {
         for (; i < buf_size; i++) {
             state = (state << 8) | buf[i];
-            if ((state >> 48) == (('B' << 8) | 'M')) {
-                bpc->fsize = av_bswap32(state >> 16);
-                bpc->pc.frame_start_found = 1;
-                if (bpc->fsize > buf_size - i + 7)
-                    bpc->remaining_size = bpc->fsize - buf_size + i - 7;
-                else
-                    next = bpc->fsize + i - 7;
-                break;
-            }
+            if (bpc->pc.frame_start_found == 0) {
+                if ((state >> 48) == (('B' << 8) | 'M')) {
+                    bpc->fsize = av_bswap32(state >> 16);
+                    if (bpc->fsize > 17)
+                        bpc->pc.frame_start_found = 1;
+                }
+            } else if (bpc->pc.frame_start_found == 2+4+4) {
+//                 unsigned hsize = av_bswap32(state>>32);
+                unsigned ihsize = av_bswap32(state);
+                if (ihsize < 12 || ihsize > 200) {
+                    bpc->pc.frame_start_found = 0;
+                    continue;
+                }
+                bpc->pc.frame_start_found++;
+                bpc->remaining_size = bpc->fsize + i - 17;
+
+                if (bpc->pc.index + i > 17) {
+                    next = i - 17;
+                    state = 0;
+                    break;
+                } else {
+                    bpc->pc.state64 = 0;
+                    goto restart;
+                }
+            } else if (bpc->pc.frame_start_found)
+                bpc->pc.frame_start_found++;
         }
         bpc->pc.state64 = state;
     } else {
@@ -68,7 +84,9 @@ static int bmp_parse(AVCodecParserContext *s, AVCodecContext *avctx,
             bpc->remaining_size -= i;
             if (bpc->remaining_size)
                 goto flush;
-            next = i;
+
+            bpc->pc.frame_start_found = 0;
+            goto restart;
         }
     }
 
@@ -76,7 +94,10 @@ flush:
     if (ff_combine_frame(&bpc->pc, next, &buf, &buf_size) < 0)
         return buf_size;
 
-    bpc->pc.frame_start_found = 0;
+    if (next != END_NOT_FOUND && next < 0)
+        bpc->pc.frame_start_found = FFMAX(bpc->pc.frame_start_found - i - 1, 0);
+    else
+        bpc->pc.frame_start_found = 0;
 
     *poutbuf      = buf;
     *poutbuf_size = buf_size;
diff --git a/libavcodec/bmpenc.c b/libavcodec/bmpenc.c
index 915c396..e829d68 100644
--- a/libavcodec/bmpenc.c
+++ b/libavcodec/bmpenc.c
@@ -3,24 +3,25 @@
  * Copyright (c) 2006, 2007 Michel Bardiaux
  * Copyright (c) 2009 Daniel Verkamp <daniel at drv.nu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/imgutils.h"
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "bytestream.h"
 #include "bmp.h"
@@ -32,6 +33,9 @@ static const uint32_t rgb444_masks[]  = { 0x0F00, 0x00F0, 0x000F };
 
 static av_cold int bmp_encode_init(AVCodecContext *avctx){
     switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_BGRA:
+        avctx->bits_per_coded_sample = 32;
+        break;
     case AV_PIX_FMT_BGR24:
         avctx->bits_per_coded_sample = 24;
         break;
@@ -53,7 +57,7 @@ static av_cold int bmp_encode_init(AVCodecContext *avctx){
         break;
     default:
         av_log(avctx, AV_LOG_INFO, "unsupported pixel format\n");
-        return -1;
+        return AVERROR(EINVAL);
     }
 
     return 0;
@@ -65,6 +69,7 @@ static int bmp_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     const AVFrame * const p = pict;
     int n_bytes_image, n_bytes_per_row, n_bytes, i, n, hsize, ret;
     const uint32_t *pal = NULL;
+    uint32_t palette256[256];
     int pad_bytes_per_row, pal_entries = 0, compression = BMP_RGB;
     int bit_count = avctx->bits_per_coded_sample;
     uint8_t *ptr, *buf;
@@ -91,7 +96,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
     case AV_PIX_FMT_RGB4_BYTE:
     case AV_PIX_FMT_BGR4_BYTE:
     case AV_PIX_FMT_GRAY8:
-        avpriv_set_systematic_pal2((uint32_t*)p->data[1], avctx->pix_fmt);
+        av_assert1(bit_count == 8);
+        avpriv_set_systematic_pal2(palette256, avctx->pix_fmt);
+        pal = palette256;
+        break;
     case AV_PIX_FMT_PAL8:
         pal = (uint32_t *)p->data[1];
         break;
@@ -110,10 +118,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
 #define SIZE_BITMAPINFOHEADER 40
     hsize = SIZE_BITMAPFILEHEADER + SIZE_BITMAPINFOHEADER + (pal_entries << 2);
     n_bytes = n_bytes_image + hsize;
-    if ((ret = ff_alloc_packet(pkt, n_bytes)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, n_bytes, 0)) < 0)
         return ret;
-    }
     buf = pkt->data;
     bytestream_put_byte(&buf, 'B');                   // BITMAPFILEHEADER.bfType
     bytestream_put_byte(&buf, 'M');                   // do.
@@ -165,8 +171,8 @@ AVCodec ff_bmp_encoder = {
     .init           = bmp_encode_init,
     .encode2        = bmp_encode_frame,
     .pix_fmts       = (const enum AVPixelFormat[]){
-        AV_PIX_FMT_BGR24,
-        AV_PIX_FMT_RGB555, AV_PIX_FMT_RGB444, AV_PIX_FMT_RGB565,
+        AV_PIX_FMT_BGRA, AV_PIX_FMT_BGR24,
+        AV_PIX_FMT_RGB565, AV_PIX_FMT_RGB555, AV_PIX_FMT_RGB444,
         AV_PIX_FMT_RGB8, AV_PIX_FMT_BGR8, AV_PIX_FMT_RGB4_BYTE, AV_PIX_FMT_BGR4_BYTE, AV_PIX_FMT_GRAY8, AV_PIX_FMT_PAL8,
         AV_PIX_FMT_MONOBLACK,
         AV_PIX_FMT_NONE
diff --git a/libavcodec/bmvaudio.c b/libavcodec/bmvaudio.c
index 8b4bd78..b1587ab 100644
--- a/libavcodec/bmvaudio.c
+++ b/libavcodec/bmvaudio.c
@@ -2,20 +2,20 @@
  * Discworld II BMV audio decoder
  * Copyright (c) 2011 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -58,10 +58,8 @@ static int bmv_aud_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = total_blocks * 32;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     output_samples = (int16_t *)frame->data[0];
 
     for (blocks = 0; blocks < total_blocks; blocks++) {
diff --git a/libavcodec/bmvvideo.c b/libavcodec/bmvvideo.c
index f4b8f29..97f850d 100644
--- a/libavcodec/bmvvideo.c
+++ b/libavcodec/bmvvideo.c
@@ -2,23 +2,24 @@
  * Discworld II BMV video decoder
  * Copyright (c) 2011 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 
 #include "avcodec.h"
@@ -50,7 +51,7 @@ typedef struct BMVDecContext {
     const uint8_t *stream;
 } BMVDecContext;
 
-#define NEXT_BYTE(v) v = forward ? v + 1 : v - 1;
+#define NEXT_BYTE(v) (v) = forward ? (v) + 1 : (v) - 1;
 
 static int decode_bmv_frame(const uint8_t *source, int src_len, uint8_t *frame, int frame_off)
 {
@@ -100,6 +101,8 @@ static int decode_bmv_frame(const uint8_t *source, int src_len, uint8_t *frame,
         }
         if (!(val & 0xC)) {
             for (;;) {
+                if(shift>22)
+                    return -1;
                 if (!read_two_nibbles) {
                     if (src < source || src >= source_end)
                         return AVERROR_INVALIDDATA;
@@ -133,6 +136,7 @@ static int decode_bmv_frame(const uint8_t *source, int src_len, uint8_t *frame,
         }
         advance_mode = val & 1;
         len = (val >> 1) - 1;
+        av_assert0(len>0);
         mode += 1 + advance_mode;
         if (mode >= 4)
             mode -= 3;
@@ -185,8 +189,6 @@ static int decode_bmv_frame(const uint8_t *source, int src_len, uint8_t *frame,
                 memset(dst, val, len);
             }
             break;
-        default:
-            break;
         }
         if (dst == dst_end)
             return 0;
@@ -227,7 +229,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             return AVERROR_INVALIDDATA;
         }
         for (i = 0; i < 256; i++)
-            c->pal[i] = bytestream_get_be24(&c->stream);
+            c->pal[i] = 0xFFU << 24 | bytestream_get_be24(&c->stream);
     }
     if (type & BMV_SCROLL) {
         if (c->stream - pkt->data > pkt->size - 2) {
@@ -241,10 +243,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         scr_off = 0;
     }
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     if (decode_bmv_frame(c->stream, pkt->size - (c->stream - pkt->data), c->frame, scr_off)) {
         av_log(avctx, AV_LOG_ERROR, "Error decoding frame data\n");
@@ -276,6 +276,11 @@ static av_cold int decode_init(AVCodecContext *avctx)
     c->avctx = avctx;
     avctx->pix_fmt = AV_PIX_FMT_PAL8;
 
+    if (avctx->width != SCREEN_WIDE || avctx->height != SCREEN_HIGH) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid dimension %dx%d\n", avctx->width, avctx->height);
+        return AVERROR_INVALIDDATA;
+    }
+
     c->frame = c->frame_base + 640;
 
     return 0;
diff --git a/libavcodec/brenderpix.c b/libavcodec/brenderpix.c
index a4b4c87..0556858 100644
--- a/libavcodec/brenderpix.c
+++ b/libavcodec/brenderpix.c
@@ -2,20 +2,20 @@
  * BRender PIX (.pix) image decoder
  * Copyright (c) 2012 Aleksi Nurmi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -134,7 +134,7 @@ static int pix_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 {
     AVFrame *frame = data;
 
-    int ret, i, j;
+    int ret, i;
     GetByteContext gb;
 
     unsigned int bytes_pp;
@@ -142,6 +142,7 @@ static int pix_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     unsigned int chunk_type;
     unsigned int data_len;
     unsigned int bytes_per_scanline;
+    unsigned int bytes_left;
     PixHeader hdr;
 
     bytestream2_init(&gb, avpkt->data, avpkt->size);
@@ -168,7 +169,7 @@ static int pix_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     ret = pix_decode_header(&hdr, &gb);
     if (ret < 0) {
         av_log(avctx, AV_LOG_ERROR, "Invalid header length.\n");
-        return AVERROR_INVALIDDATA;
+        return ret;
     }
     switch (hdr.format) {
     case 3:
@@ -187,7 +188,10 @@ static int pix_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         avctx->pix_fmt = AV_PIX_FMT_RGB24;
         bytes_pp = 3;
         break;
-    case 7: // XRGB
+    case 7:
+        avctx->pix_fmt = AV_PIX_FMT_0RGB;
+        bytes_pp = 4;
+        break;
     case 8: // ARGB
         avctx->pix_fmt = AV_PIX_FMT_ARGB;
         bytes_pp = 4;
@@ -219,22 +223,21 @@ static int pix_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         ret = pix_decode_header(&palhdr, &gb);
         if (ret < 0) {
             av_log(avctx, AV_LOG_ERROR, "Invalid palette header length.\n");
-            return AVERROR_INVALIDDATA;
+            return ret;
         }
         if (palhdr.format != 7)
             avpriv_request_sample(avctx, "Palette not in RGB format");
 
         chunk_type = bytestream2_get_be32(&gb);
         data_len = bytestream2_get_be32(&gb);
-        if (chunk_type != IMAGE_DATA_CHUNK ||
-            bytestream2_get_bytes_left(&gb) < data_len) {
+        bytestream2_skip(&gb, 8);
+        if (chunk_type != IMAGE_DATA_CHUNK || data_len != 1032 ||
+            bytestream2_get_bytes_left(&gb) < 1032) {
             av_log(avctx, AV_LOG_ERROR, "Invalid palette data.\n");
             return AVERROR_INVALIDDATA;
         }
-
         // palette data is surrounded by 8 null bytes (both top and bottom)
-        bytestream2_skip(&gb, 8);
-        // convert to machine endian format (ARGB)
+        // convert 0RGB to machine endian format (ARGB32)
         for (i = 0; i < 256; ++i)
             *pal_out++ = (0xFFU << 24) | bytestream2_get_be32u(&gb);
         bytestream2_skip(&gb, 8);
@@ -259,9 +262,10 @@ static int pix_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     // read the image data to the buffer
     bytes_per_scanline = bytes_pp * hdr.width;
-    if (chunk_type != IMAGE_DATA_CHUNK ||
-        data_len < bytes_per_scanline * hdr.height ||
-        bytestream2_get_bytes_left(&gb) < data_len) {
+    bytes_left = bytestream2_get_bytes_left(&gb);
+
+    if (chunk_type != IMAGE_DATA_CHUNK || data_len != bytes_left ||
+        bytes_left / bytes_per_scanline < hdr.height) {
         av_log(avctx, AV_LOG_ERROR, "Invalid image data.\n");
         return AVERROR_INVALIDDATA;
     }
@@ -271,12 +275,6 @@ static int pix_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                         bytes_per_scanline,
                         bytes_per_scanline, hdr.height);
 
-    // make alpha opaque for XRGB
-    if (hdr.format == 7)
-        for (j = 0; j < frame->height; j++)
-            for (i = 0; i < frame->linesize[0]; i += 4)
-                frame->data[0][j * frame->linesize[0] + i] = 0xFF;
-
     frame->pict_type = AV_PICTURE_TYPE_I;
     frame->key_frame = 1;
     *got_frame = 1;
diff --git a/libavcodec/bsf.c b/libavcodec/bsf.c
index 284c7c8..88b7f29 100644
--- a/libavcodec/bsf.c
+++ b/libavcodec/bsf.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/bsf.h b/libavcodec/bsf.h
index cf35fc8..3435df5 100644
--- a/libavcodec/bsf.h
+++ b/libavcodec/bsf.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/bswapdsp.c b/libavcodec/bswapdsp.c
index 6700cfd..a6e1ec0 100644
--- a/libavcodec/bswapdsp.c
+++ b/libavcodec/bswapdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/bswapdsp.h b/libavcodec/bswapdsp.h
index fd10a88..4d19092 100644
--- a/libavcodec/bswapdsp.h
+++ b/libavcodec/bswapdsp.h
@@ -1,23 +1,23 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVCODEC_BSWAP_BUF_H
-#define AVCODEC_BSWAP_BUF_H
+#ifndef AVCODEC_BSWAPDSP_H
+#define AVCODEC_BSWAPDSP_H
 
 #include <stdint.h>
 
@@ -29,4 +29,4 @@ typedef struct BswapDSPContext {
 void ff_bswapdsp_init(BswapDSPContext *c);
 void ff_bswapdsp_init_x86(BswapDSPContext *c);
 
-#endif /* AVCODEC_BSWAP_BUF_H */
+#endif /* AVCODEC_BSWAPDSP_H */
diff --git a/libavcodec/bytestream.h b/libavcodec/bytestream.h
index cb3573b..7c05ea6 100644
--- a/libavcodec/bytestream.h
+++ b/libavcodec/bytestream.h
@@ -3,20 +3,20 @@
  * copyright (c) 2006 Baptiste Coudurier <baptiste.coudurier@free.fr>
  * Copyright (c) 2012 Aneesh Dogra (lionaneesh) <lionaneesh@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
 #include <stdint.h>
 #include <string.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
 
@@ -133,6 +134,7 @@ static av_always_inline void bytestream2_init(GetByteContext *g,
                                               const uint8_t *buf,
                                               int buf_size)
 {
+    av_assert0(buf_size >= 0);
     g->buffer       = buf;
     g->buffer_start = buf;
     g->buffer_end   = buf + buf_size;
@@ -142,6 +144,7 @@ static av_always_inline void bytestream2_init_writer(PutByteContext *p,
                                                      uint8_t *buf,
                                                      int buf_size)
 {
+    av_assert0(buf_size >= 0);
     p->buffer       = buf;
     p->buffer_start = buf;
     p->buffer_end   = buf + buf_size;
diff --git a/libavcodec/c93.c b/libavcodec/c93.c
index 18df958..fd026ac 100644
--- a/libavcodec/c93.c
+++ b/libavcodec/c93.c
@@ -2,20 +2,20 @@
  * Interplay C93 video decoder
  * Copyright (c) 2007 Anssi Hannula <anssi.hannula@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -133,12 +133,13 @@ static int decode_frame(AVCodecContext *avctx, void *data,
     uint8_t *out;
     int stride, ret, i, x, y, b, bt = 0;
 
+    if ((ret = ff_set_dimensions(avctx, WIDTH, HEIGHT)) < 0)
+        return ret;
+
     c93->currentpic ^= 1;
 
-    if ((ret = ff_reget_buffer(avctx, newpic)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, newpic)) < 0)
         return ret;
-    }
 
     stride = newpic->linesize[0];
 
@@ -176,7 +177,14 @@ static int decode_frame(AVCodecContext *avctx, void *data,
             case C93_4X4_FROM_PREV:
                 for (j = 0; j < 8; j += 4) {
                     for (i = 0; i < 8; i += 4) {
-                        offset = bytestream2_get_le16(&gb);
+                        int offset = bytestream2_get_le16(&gb);
+                        int from_x = offset % WIDTH;
+                        int from_y = offset / WIDTH;
+                        if (block_type == C93_4X4_FROM_CURR && from_y == y+j &&
+                            (FFABS(from_x - x-i) < 4 || FFABS(from_x - x-i) > WIDTH-4)) {
+                            avpriv_request_sample(avctx, "block overlap %d %d %d %d", from_x, x+i, from_y, y+j);
+                            return AVERROR_INVALIDDATA;
+                        }
                         if ((ret = copy_block(avctx, &out[j*stride+i],
                                               copy_from, offset, 4, stride)) < 0)
                             return ret;
@@ -236,7 +244,7 @@ static int decode_frame(AVCodecContext *avctx, void *data,
     if (b & C93_HAS_PALETTE) {
         uint32_t *palette = (uint32_t *) newpic->data[1];
         for (i = 0; i < 256; i++) {
-            palette[i] = bytestream2_get_be24(&gb);
+            palette[i] = 0xFFU << 24 | bytestream2_get_be24(&gb);
         }
         newpic->palette_has_changed = 1;
     } else {
diff --git a/libavcodec/cabac-test.c b/libavcodec/cabac-test.c
new file mode 100644
index 0000000..47f31e9
--- /dev/null
+++ b/libavcodec/cabac-test.c
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "cabac.c"
+
+#define SIZE 10240
+
+#include "libavutil/lfg.h"
+#include "avcodec.h"
+
+static inline void put_cabac_bit(CABACContext *c, int b){ // write bit b, then flush every deferred bit
+    put_bits(&c->pb, 1, b);
+    for(;c->outstanding_count; c->outstanding_count--){ // one inverted bit (1-b) per outstanding count; leaves the count at 0
+        put_bits(&c->pb, 1, 1-b);
+    }
+}
+
+static inline void renorm_cabac_encoder(CABACContext *c){ // double range and low until range >= 0x100, emitting or deferring one bit per step
+    while(c->range < 0x100){
+        //FIXME optimize
+        if(c->low<0x100){ // low below 0x100: emit a 0 bit
+            put_cabac_bit(c, 0);
+        }else if(c->low<0x200){ // low in [0x100, 0x200): only count the bit; put_cabac_bit writes it later as the inverse of the next emitted bit
+            c->outstanding_count++;
+            c->low -= 0x100;
+        }else{ // low >= 0x200: emit a 1 bit
+            put_cabac_bit(c, 1);
+            c->low -= 0x200;
+        }
+
+        c->range+= c->range; // range *= 2
+        c->low += c->low; // low *= 2
+    }
+}
+
+static void put_cabac(CABACContext *c, uint8_t * const state, int bit){ // encode one bin, reading and updating the context byte *state
+    int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + *state]; // table lookup indexed by bits 6-7 of range and the current state
+
+    if(bit == ((*state)&1)){ // bit equals the low bit of the state: only shrink range
+        c->range -= RangeLPS;
+        *state    = ff_h264_mlps_state[128 + *state];
+    }else{ // bit differs from the low bit of the state: advance low and set range to RangeLPS
+        c->low += c->range - RangeLPS;
+        c->range = RangeLPS;
+        *state= ff_h264_mlps_state[127 - *state];
+    }
+
+    renorm_cabac_encoder(c);
+}
+
+/**
+ * @param bit bin to encode without a context state: 0 -> write zero bit, !=0 write one bit
+ */
+static void put_cabac_bypass(CABACContext *c, int bit){
+    c->low += c->low; // low *= 2; range is not modified by this function
+
+    if(bit){
+        c->low += c->range;
+    }
+//FIXME optimize
+    if(c->low<0x200){ // same three-way emit/defer/emit decision as renorm_cabac_encoder, with doubled thresholds
+        put_cabac_bit(c, 0);
+    }else if(c->low<0x400){
+        c->outstanding_count++;
+        c->low -= 0x200;
+    }else{
+        put_cabac_bit(c, 1);
+        c->low -= 0x400;
+    }
+}
+
+/**
+ * Encode the terminate bin; when bit is nonzero also write the final bits and flush the PutBitContext.
+ * @return the number of bytes written
+ */
+static int put_cabac_terminate(CABACContext *c, int bit){
+    c->range -= 2;
+
+    if(!bit){ // bit == 0: only renormalize, nothing is flushed
+        renorm_cabac_encoder(c);
+    }else{
+        c->low += c->range;
+        c->range= 2;
+
+        renorm_cabac_encoder(c);
+
+        av_assert0(c->low <= 0x1FF);
+        put_cabac_bit(c, c->low>>9); // NOTE(review): with low <= 0x1FF asserted above, low>>9 is always 0 -- confirm this is intended
+        put_bits(&c->pb, 2, ((c->low>>7)&3)|1);
+
+        flush_put_bits(&c->pb); //FIXME FIXME FIXME XXX wrong
+    }
+
+    return (put_bits_count(&c->pb)+7)>>3; // bits written so far, rounded up to whole bytes
+}
+
+int main(void){ // round-trip self-test: encode SIZE bypass bins and SIZE context-coded bins, decode them back and compare
+    CABACContext c;
+    uint8_t b[9*SIZE]; // encoded bitstream buffer
+    uint8_t r[9*SIZE]; // reference input; only bit 0 of each of the first SIZE entries is coded
+    int i, ret = 0;
+    uint8_t state[10]= {0}; // only state[0] is used, since the array is passed as the state pointer
+    AVLFG prng;
+
+    av_lfg_init(&prng, 1); // fixed seed 1
+    ff_init_cabac_encoder(&c, b, SIZE);
+
+    for(i=0; i<SIZE; i++){
+        if(2*i<SIZE) r[i] = av_lfg_get(&prng) % 7; // first half: pseudo-random values
+        else         r[i] = (i>>8)&1; // second half: value flips every 256 entries
+    }
+
+    for(i=0; i<SIZE; i++){
+        put_cabac_bypass(&c, r[i]&1);
+    }
+
+    for(i=0; i<SIZE; i++){
+        put_cabac(&c, state, r[i]&1);
+    }
+
+    i= put_cabac_terminate(&c, 1); // i = number of bytes produced by the encoder
+    b[i++] = av_lfg_get(&prng); // store two pseudo-random bytes directly after the encoded data
+    b[i  ] = av_lfg_get(&prng);
+
+    ff_init_cabac_decoder(&c, b, SIZE);
+
+    memset(state, 0, sizeof(state)); // decoder starts from the same all-zero state the encoder started from
+
+    for(i=0; i<SIZE; i++){
+        if( (r[i]&1) != get_cabac_bypass(&c) ) {
+            av_log(NULL, AV_LOG_ERROR, "CABAC bypass failure at %d\n", i);
+            ret = 1;
+        }
+    }
+
+    for(i=0; i<SIZE; i++){
+        if( (r[i]&1) != get_cabac_noinline(&c, state) ) {
+            av_log(NULL, AV_LOG_ERROR, "CABAC failure at %d\n", i);
+            ret = 1;
+        }
+    }
+    if(!get_cabac_terminate(&c)) { // a zero return here is treated as a missing terminate bin
+        av_log(NULL, AV_LOG_ERROR, "where's the Terminator?\n");
+        ret = 1;
+    }
+
+    return ret; // 0 when every check passed, 1 if any mismatch was logged
+}
diff --git a/libavcodec/cabac.c b/libavcodec/cabac.c
index 5c59003..c0abe83 100644
--- a/libavcodec/cabac.c
+++ b/libavcodec/cabac.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
 #include <string.h>
 
 #include "libavutil/common.h"
+#include "libavutil/timer.h"
 
 #include "cabac.h"
 #include "cabac_functions.h"
@@ -160,7 +161,20 @@ const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63] = {
 /**
  * @param buf_size size of buf in bits
  */
-void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size){
+void ff_init_cabac_encoder(CABACContext *c, uint8_t *buf, int buf_size){
+    init_put_bits(&c->pb, buf, buf_size);
+
+    c->low= 0;
+    c->range= 0x1FE;
+    c->outstanding_count= 0;
+    c->pb.bit_left++; //avoids firstBitFlag
+}
+
+/**
+ *
+ * @param buf_size size of buf in bytes
+ */
+int ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size){
     c->bytestream_start=
     c->bytestream= buf;
     c->bytestream_end= buf + buf_size;
@@ -168,9 +182,21 @@ void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size){
 #if CABAC_BITS == 16
     c->low =  (*c->bytestream++)<<18;
     c->low+=  (*c->bytestream++)<<10;
+    // Keep our fetches on a 2-byte boundary as this should avoid ever having to
+    // do unaligned loads if the compiler (or asm) optimises the double byte
+    // load into a single instruction
+    if(((uintptr_t)c->bytestream & 1) == 0) {
+        c->low += (1 << 9);
+    }
+    else {
+        c->low += ((*c->bytestream++) << 2) + 2;
+    }
 #else
     c->low =  (*c->bytestream++)<<10;
-#endif
     c->low+= ((*c->bytestream++)<<2) + 2;
+#endif
     c->range= 0x1FE;
+    if ((c->range<<(CABAC_BITS+1)) < c->low)
+        return AVERROR_INVALIDDATA;
+    return 0;
 }
diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h
index 40eefed..1bf1c62 100644
--- a/libavcodec/cabac.h
+++ b/libavcodec/cabac.h
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,11 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63];
 typedef struct CABACContext{
     int low;
     int range;
+    int outstanding_count;
     const uint8_t *bytestream_start;
     const uint8_t *bytestream;
     const uint8_t *bytestream_end;
+    PutBitContext pb;
 }CABACContext;
 
-void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size);
+void ff_init_cabac_encoder(CABACContext *c, uint8_t *buf, int buf_size);
+int ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size);
 
 #endif /* AVCODEC_CABAC_H */
diff --git a/libavcodec/cabac_functions.h b/libavcodec/cabac_functions.h
index beb5016..fe72a82 100644
--- a/libavcodec/cabac_functions.h
+++ b/libavcodec/cabac_functions.h
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,10 @@
 #include "cabac.h"
 #include "config.h"
 
+#ifndef UNCHECKED_BITSTREAM_READER
+#define UNCHECKED_BITSTREAM_READER !CONFIG_SAFE_BITSTREAM_READER
+#endif
+
 #if ARCH_AARCH64
 #   include "aarch64/cabac.h"
 #endif
@@ -47,6 +51,7 @@ static const uint8_t * const ff_h264_lps_range = ff_h264_cabac_tables + H264_LPS
 static const uint8_t * const ff_h264_mlps_state = ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET;
 static const uint8_t * const ff_h264_last_coeff_flag_offset_8x8 = ff_h264_cabac_tables + H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET;
 
+#if !defined(get_cabac_bypass) || !defined(get_cabac_terminate)
 static void refill(CABACContext *c){
 #if CABAC_BITS == 16
         c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1);
@@ -54,10 +59,14 @@ static void refill(CABACContext *c){
         c->low+= c->bytestream[0]<<1;
 #endif
     c->low -= CABAC_MASK;
+#if !UNCHECKED_BITSTREAM_READER
     if (c->bytestream < c->bytestream_end)
+#endif
         c->bytestream += CABAC_BITS / 8;
 }
+#endif
 
+#ifndef get_cabac_terminate
 static inline void renorm_cabac_decoder_once(CABACContext *c){
     int shift= (uint32_t)(c->range - 0x100)>>31;
     c->range<<= shift;
@@ -65,13 +74,18 @@ static inline void renorm_cabac_decoder_once(CABACContext *c){
     if(!(c->low & CABAC_MASK))
         refill(c);
 }
+#endif
 
 #ifndef get_cabac_inline
 static void refill2(CABACContext *c){
-    int i, x;
-
+    int i;
+    unsigned x;
+#if !HAVE_FAST_CLZ
     x= c->low ^ (c->low-1);
     i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS-1)];
+#else
+    i = ff_ctz(c->low) - CABAC_BITS;
+#endif
 
     x= -CABAC_MASK;
 
@@ -82,10 +96,14 @@ static void refill2(CABACContext *c){
 #endif
 
     c->low += x<<i;
+#if !UNCHECKED_BITSTREAM_READER
     if (c->bytestream < c->bytestream_end)
+#endif
         c->bytestream += CABAC_BITS/8;
 }
+#endif
 
+#ifndef get_cabac_inline
 static av_always_inline int get_cabac_inline(CABACContext *c, uint8_t * const state){
     int s = *state;
     int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + s];
@@ -156,6 +174,7 @@ static av_always_inline int get_cabac_bypass_sign(CABACContext *c, int val){
 /**
  * @return the number of bytes read or 0 if no end
  */
+#ifndef get_cabac_terminate
 static int av_unused get_cabac_terminate(CABACContext *c){
     c->range -= 2;
     if(c->low < c->range<<(CABAC_BITS+1)){
@@ -165,11 +184,13 @@ static int av_unused get_cabac_terminate(CABACContext *c){
         return c->bytestream - c->bytestream_start;
     }
 }
+#endif
 
 /**
  * Skip @p n bytes and reset the decoder.
  * @return the address of the first skipped byte or NULL if there's less than @p n bytes left
  */
+#ifndef skip_bytes
 static av_unused const uint8_t* skip_bytes(CABACContext *c, int n) {
     const uint8_t *ptr = c->bytestream;
 
@@ -181,9 +202,11 @@ static av_unused const uint8_t* skip_bytes(CABACContext *c, int n) {
 #endif
     if ((int) (c->bytestream_end - ptr) < n)
         return NULL;
-    ff_init_cabac_decoder(c, ptr + n, c->bytestream_end - ptr - n);
+    if (ff_init_cabac_decoder(c, ptr + n, c->bytestream_end - ptr - n) < 0)
+        return NULL;
 
     return ptr;
 }
+#endif
 
 #endif /* AVCODEC_CABAC_FUNCTIONS_H */
diff --git a/libavcodec/canopus.c b/libavcodec/canopus.c
index 729e7ef..ea6cc64 100644
--- a/libavcodec/canopus.c
+++ b/libavcodec/canopus.c
@@ -2,20 +2,20 @@
  * Canopus common routines
  * Copyright (c) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/canopus.h b/libavcodec/canopus.h
index 9e5702d..faa1e8d 100644
--- a/libavcodec/canopus.h
+++ b/libavcodec/canopus.h
@@ -2,20 +2,20 @@
  * Canopus common routines
  * Copyright (c) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/cavs.c b/libavcodec/cavs.c
index 3050b9a..10e118e 100644
--- a/libavcodec/cavs.c
+++ b/libavcodec/cavs.c
@@ -2,20 +2,20 @@
  * Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
  * Copyright (c) 2006  Stefan Gehrer <stefan.gehrer@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -75,15 +75,16 @@ static inline int get_bs(cavs_vector *mvP, cavs_vector *mvQ, int b)
 {
     if ((mvP->ref == REF_INTRA) || (mvQ->ref == REF_INTRA))
         return 2;
-    if ((abs(mvP->x - mvQ->x) >= 4) || (abs(mvP->y - mvQ->y) >= 4))
+    if((abs(mvP->x - mvQ->x) >= 4) ||
+       (abs(mvP->y - mvQ->y) >= 4) ||
+       (mvP->ref != mvQ->ref))
         return 1;
     if (b) {
         mvP += MV_BWD_OFFS;
         mvQ += MV_BWD_OFFS;
-        if ((abs(mvP->x - mvQ->x) >= 4) || (abs(mvP->y - mvQ->y) >= 4))
-            return 1;
-    } else {
-        if (mvP->ref != mvQ->ref)
+        if((abs(mvP->x - mvQ->x) >= 4) ||
+           (abs(mvP->y - mvQ->y) >= 4) ||
+           (mvP->ref != mvQ->ref))
             return 1;
     }
     return 0;
@@ -148,6 +149,8 @@ void ff_cavs_filter(AVSContext *h, enum cavs_mb mb_type)
                 qp_avg = (h->qp + h->left_qp + 1) >> 1;
                 SET_PARAMS;
                 h->cdsp.cavs_filter_lv(h->cy, h->l_stride, alpha, beta, tc, bs[0], bs[1]);
+                qp_avg = (ff_cavs_chroma_qp[h->qp] + ff_cavs_chroma_qp[h->left_qp] + 1) >> 1;
+                SET_PARAMS;
                 h->cdsp.cavs_filter_cv(h->cu, h->c_stride, alpha, beta, tc, bs[0], bs[1]);
                 h->cdsp.cavs_filter_cv(h->cv, h->c_stride, alpha, beta, tc, bs[0], bs[1]);
             }
@@ -160,6 +163,8 @@ void ff_cavs_filter(AVSContext *h, enum cavs_mb mb_type)
                 qp_avg = (h->qp + h->top_qp[h->mbx] + 1) >> 1;
                 SET_PARAMS;
                 h->cdsp.cavs_filter_lh(h->cy, h->l_stride, alpha, beta, tc, bs[4], bs[5]);
+                qp_avg = (ff_cavs_chroma_qp[h->qp] + ff_cavs_chroma_qp[h->top_qp[h->mbx]] + 1) >> 1;
+                SET_PARAMS;
                 h->cdsp.cavs_filter_ch(h->cu, h->c_stride, alpha, beta, tc, bs[4], bs[5]);
                 h->cdsp.cavs_filter_ch(h->cv, h->c_stride, alpha, beta, tc, bs[4], bs[5]);
             }
@@ -233,9 +238,14 @@ void ff_cavs_load_intra_pred_chroma(AVSContext *h)
     /* extend borders by one pixel */
     h->left_border_u[9]              = h->left_border_u[8];
     h->left_border_v[9]              = h->left_border_v[8];
-    h->top_border_u[h->mbx * 10 + 9] = h->top_border_u[h->mbx * 10 + 8];
-    h->top_border_v[h->mbx * 10 + 9] = h->top_border_v[h->mbx * 10 + 8];
-    if (h->mbx && h->mby) {
+    if(h->flags & C_AVAIL) {
+        h->top_border_u[h->mbx*10 + 9] = h->top_border_u[h->mbx*10 + 11];
+        h->top_border_v[h->mbx*10 + 9] = h->top_border_v[h->mbx*10 + 11];
+    } else {
+        h->top_border_u[h->mbx * 10 + 9] = h->top_border_u[h->mbx * 10 + 8];
+        h->top_border_v[h->mbx * 10 + 9] = h->top_border_v[h->mbx * 10 + 8];
+    }
+    if((h->flags & A_AVAIL) && (h->flags & B_AVAIL)) {
         h->top_border_u[h->mbx * 10] = h->left_border_u[0] = h->topleft_border_u;
         h->top_border_v[h->mbx * 10] = h->left_border_v[0] = h->topleft_border_v;
     } else {
@@ -527,7 +537,7 @@ void ff_cavs_inter(AVSContext *h, enum cavs_mb mb_type)
 static inline void scale_mv(AVSContext *h, int *d_x, int *d_y,
                             cavs_vector *src, int distp)
 {
-    int den = h->scale_den[src->ref];
+    int den = h->scale_den[FFMAX(src->ref, 0)];
 
     *d_x = (src->x * distp * den + 256 + FF_SIGNBIT(src->x)) >> 9;
     *d_y = (src->y * distp * den + 256 + FF_SIGNBIT(src->y)) >> 9;
@@ -574,7 +584,7 @@ void ff_cavs_mv(AVSContext *h, enum cavs_mv_loc nP, enum cavs_mv_loc nC,
 
     mvP->ref  = ref;
     mvP->dist = h->dist[mvP->ref];
-    if (mvC->ref == NOT_AVAIL)
+    if (mvC->ref == NOT_AVAIL || (nP == MV_FWD_X3) || (nP == MV_BWD_X3 ))
         mvC = &h->mv[nP - 5];  // set to top-left (mvD)
     if (mode == MV_PRED_PSKIP &&
         (mvA->ref == NOT_AVAIL ||
@@ -704,7 +714,7 @@ int ff_cavs_next_mb(AVSContext *h)
  *
  ****************************************************************************/
 
-void ff_cavs_init_pic(AVSContext *h)
+int ff_cavs_init_pic(AVSContext *h)
 {
     int i;
 
@@ -725,6 +735,8 @@ void ff_cavs_init_pic(AVSContext *h)
     h->luma_scan[3]   = 8 * h->l_stride + 8;
     h->mbx            = h->mby = h->mbidx = 0;
     h->flags          = 0;
+
+    return 0;
 }
 
 /*****************************************************************************
@@ -738,22 +750,39 @@ void ff_cavs_init_pic(AVSContext *h)
  * this data has to be stored for one complete row of macroblocks
  * and this storage space is allocated here
  */
-void ff_cavs_init_top_lines(AVSContext *h)
+int ff_cavs_init_top_lines(AVSContext *h)
 {
     /* alloc top line of predictors */
     h->top_qp       = av_mallocz(h->mb_width);
-    h->top_mv[0]    = av_mallocz((h->mb_width * 2 + 1) * sizeof(cavs_vector));
-    h->top_mv[1]    = av_mallocz((h->mb_width * 2 + 1) * sizeof(cavs_vector));
-    h->top_pred_Y   = av_mallocz(h->mb_width * 2 * sizeof(*h->top_pred_Y));
-    h->top_border_y = av_mallocz((h->mb_width + 1) * 16);
-    h->top_border_u = av_mallocz(h->mb_width * 10);
-    h->top_border_v = av_mallocz(h->mb_width * 10);
+    h->top_mv[0]    = av_mallocz_array(h->mb_width * 2 + 1,  sizeof(cavs_vector));
+    h->top_mv[1]    = av_mallocz_array(h->mb_width * 2 + 1,  sizeof(cavs_vector));
+    h->top_pred_Y   = av_mallocz_array(h->mb_width * 2,  sizeof(*h->top_pred_Y));
+    h->top_border_y = av_mallocz_array(h->mb_width + 1,  16);
+    h->top_border_u = av_mallocz_array(h->mb_width,  10);
+    h->top_border_v = av_mallocz_array(h->mb_width,  10);
 
     /* alloc space for co-located MVs and types */
-    h->col_mv        = av_mallocz(h->mb_width * h->mb_height * 4 *
-                                  sizeof(cavs_vector));
+    h->col_mv        = av_mallocz_array(h->mb_width * h->mb_height,
+                                        4 * sizeof(cavs_vector));
     h->col_type_base = av_mallocz(h->mb_width * h->mb_height);
     h->block         = av_mallocz(64 * sizeof(int16_t));
+
+    if (!h->top_qp || !h->top_mv[0] || !h->top_mv[1] || !h->top_pred_Y ||
+        !h->top_border_y || !h->top_border_u || !h->top_border_v ||
+        !h->col_mv || !h->col_type_base || !h->block) {
+        av_freep(&h->top_qp);
+        av_freep(&h->top_mv[0]);
+        av_freep(&h->top_mv[1]);
+        av_freep(&h->top_pred_Y);
+        av_freep(&h->top_border_y);
+        av_freep(&h->top_border_u);
+        av_freep(&h->top_border_v);
+        av_freep(&h->col_mv);
+        av_freep(&h->col_type_base);
+        av_freep(&h->block);
+        return AVERROR(ENOMEM);
+    }
+    return 0;
 }
 
 av_cold int ff_cavs_init(AVCodecContext *avctx)
@@ -810,16 +839,16 @@ av_cold int ff_cavs_end(AVCodecContext *avctx)
     av_frame_free(&h->DPB[0].f);
     av_frame_free(&h->DPB[1].f);
 
-    av_free(h->top_qp);
-    av_free(h->top_mv[0]);
-    av_free(h->top_mv[1]);
-    av_free(h->top_pred_Y);
-    av_free(h->top_border_y);
-    av_free(h->top_border_u);
-    av_free(h->top_border_v);
-    av_free(h->col_mv);
-    av_free(h->col_type_base);
-    av_free(h->block);
+    av_freep(&h->top_qp);
+    av_freep(&h->top_mv[0]);
+    av_freep(&h->top_mv[1]);
+    av_freep(&h->top_pred_Y);
+    av_freep(&h->top_border_y);
+    av_freep(&h->top_border_u);
+    av_freep(&h->top_border_v);
+    av_freep(&h->col_mv);
+    av_freep(&h->col_type_base);
+    av_freep(&h->block);
     av_freep(&h->edge_emu_buffer);
     return 0;
 }
diff --git a/libavcodec/cavs.h b/libavcodec/cavs.h
index cfae055..fb9df15 100644
--- a/libavcodec/cavs.h
+++ b/libavcodec/cavs.h
@@ -2,20 +2,20 @@
  * Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
  * Copyright (c) 2006  Stefan Gehrer <stefan.gehrer@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -216,6 +216,7 @@ typedef struct AVSContext {
     int luma_scan[4];
     int qp;
     int qp_fixed;
+    int pic_qp_fixed;
     int cbp;
     ScanTable scantable;
 
@@ -241,6 +242,7 @@ typedef struct AVSContext {
     int16_t *block;
 } AVSContext;
 
+extern const uint8_t     ff_cavs_chroma_qp[64];
 extern const uint8_t     ff_cavs_partition_flags[30];
 extern const cavs_vector ff_cavs_intra_mv;
 extern const cavs_vector ff_cavs_dir_mv;
@@ -269,8 +271,8 @@ void ff_cavs_mv(AVSContext *h, enum cavs_mv_loc nP, enum cavs_mv_loc nC,
                 enum cavs_mv_pred mode, enum cavs_block size, int ref);
 void ff_cavs_init_mb(AVSContext *h);
 int  ff_cavs_next_mb(AVSContext *h);
-void ff_cavs_init_pic(AVSContext *h);
-void ff_cavs_init_top_lines(AVSContext *h);
+int ff_cavs_init_pic(AVSContext *h);
+int ff_cavs_init_top_lines(AVSContext *h);
 int ff_cavs_init(AVCodecContext *avctx);
 int ff_cavs_end (AVCodecContext *avctx);
 
diff --git a/libavcodec/cavs_parser.c b/libavcodec/cavs_parser.c
index 84f647c..6067a39 100644
--- a/libavcodec/cavs_parser.c
+++ b/libavcodec/cavs_parser.c
@@ -2,20 +2,20 @@
  * Chinese AVS video (AVS1-P2, JiZhun profile) parser.
  * Copyright (c) 2006  Stefan Gehrer <stefan.gehrer@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/cavsdata.c b/libavcodec/cavsdata.c
index 4e4a131..2835a4b 100644
--- a/libavcodec/cavsdata.c
+++ b/libavcodec/cavsdata.c
@@ -2,20 +2,20 @@
  * Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
  * Copyright (c) 2006  Stefan Gehrer <stefan.gehrer@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -54,6 +54,13 @@ const uint8_t ff_cavs_partition_flags[30] = {
                       SPLITH|SPLITV, //B_8X8 = 29
 };
 
+const uint8_t ff_cavs_chroma_qp[64] = {
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 42, 43, 43, 44, 44,
+  45, 45, 46, 46, 47, 47, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51
+};
+
 /** mark block as "no prediction from this direction"
     e.g. forward motion vector in BWD partition */
 const cavs_vector ff_cavs_dir_mv   = {0,0,1,REF_DIR};
diff --git a/libavcodec/cavsdec.c b/libavcodec/cavsdec.c
index a455a34..70ac6f8 100644
--- a/libavcodec/cavsdec.c
+++ b/libavcodec/cavsdec.c
@@ -2,20 +2,20 @@
  * Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
  * Copyright (c) 2006  Stefan Gehrer <stefan.gehrer@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,7 @@
  * @author Stefan Gehrer <stefan.gehrer@gmx.de>
  */
 
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "get_bits.h"
 #include "golomb.h"
@@ -50,13 +51,6 @@ static const uint8_t cbp_tab[64][2] = {
 
 static const uint8_t scan3x3[4] = { 4, 5, 7, 8 };
 
-static const uint8_t cavs_chroma_qp[64] = {
-   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 42, 43, 43, 44, 44,
-  45, 45, 46, 46, 47, 47, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51
-};
-
 static const uint8_t dequant_shift[64] = {
   14, 14, 14, 14, 14, 14, 14, 14,
   13, 13, 13, 13, 13, 13, 13, 13,
@@ -508,11 +502,15 @@ static inline void mv_pred_sym(AVSContext *h, cavs_vector *src,
 /** kth-order exponential golomb code */
 static inline int get_ue_code(GetBitContext *gb, int order)
 {
+    unsigned ret = get_ue_golomb(gb);
+    if (ret >= ((1U<<31)>>order)) {
+        av_log(NULL, AV_LOG_ERROR, "get_ue_code: value too larger\n");
+        return AVERROR_INVALIDDATA;
+    }
     if (order) {
-        int ret = get_ue_golomb(gb) << order;
-        return ret + get_bits(gb, order);
+        return (ret<<order) + get_bits(gb, order);
     }
-    return get_ue_golomb(gb);
+    return ret;
 }
 
 static inline int dequant(AVSContext *h, int16_t *level_buf, uint8_t *run_buf,
@@ -549,29 +547,37 @@ static int decode_residual_block(AVSContext *h, GetBitContext *gb,
                                  const struct dec_2dvlc *r, int esc_golomb_order,
                                  int qp, uint8_t *dst, int stride)
 {
-    int i, level_code, esc_code, level, run, mask, ret;
+    int i, esc_code, level, mask, ret;
+    unsigned int level_code, run;
     int16_t level_buf[65];
     uint8_t run_buf[65];
     int16_t *block = h->block;
 
-    for (i = 0;i < 65; i++) {
+    for (i = 0; i < 65; i++) {
         level_code = get_ue_code(gb, r->golomb_order);
         if (level_code >= ESCAPE_CODE) {
             run      = ((level_code - ESCAPE_CODE) >> 1) + 1;
+            if(run > 64) {
+                av_log(h->avctx, AV_LOG_ERROR, "run %d is too large\n", run);
+                return AVERROR_INVALIDDATA;
+            }
             esc_code = get_ue_code(gb, esc_golomb_order);
+            if (esc_code < 0 || esc_code > 32767) {
+                av_log(h->avctx, AV_LOG_ERROR, "esc_code invalid\n");
+                return AVERROR_INVALIDDATA;
+            }
+
             level    = esc_code + (run > r->max_run ? 1 : r->level_add[run]);
             while (level > r->inc_limit)
                 r++;
             mask  = -(level_code & 1);
             level = (level ^ mask) - mask;
-        } else if (level_code >= 0) {
+        } else {
             level = r->rltab[level_code][0];
             if (!level) //end of block signal
                 break;
             run = r->rltab[level_code][1];
             r  += r->rltab[level_code][2];
-        } else {
-            break;
         }
         level_buf[i] = level;
         run_buf[i]   = run;
@@ -589,10 +595,10 @@ static inline void decode_residual_chroma(AVSContext *h)
 {
     if (h->cbp & (1 << 4))
         decode_residual_block(h, &h->gb, chroma_dec, 0,
-                              cavs_chroma_qp[h->qp], h->cu, h->c_stride);
+                              ff_cavs_chroma_qp[h->qp], h->cu, h->c_stride);
     if (h->cbp & (1 << 5))
         decode_residual_block(h, &h->gb, chroma_dec, 0,
-                              cavs_chroma_qp[h->qp], h->cv, h->c_stride);
+                              ff_cavs_chroma_qp[h->qp], h->cv, h->c_stride);
 }
 
 static inline int decode_residual_inter(AVSContext *h)
@@ -601,7 +607,7 @@ static inline int decode_residual_inter(AVSContext *h)
 
     /* get coded block pattern */
     int cbp = get_ue_golomb(&h->gb);
-    if (cbp > 63 || cbp < 0) {
+    if (cbp > 63U) {
         av_log(h->avctx, AV_LOG_ERROR, "illegal inter cbp %d\n", cbp);
         return AVERROR_INVALIDDATA;
     }
@@ -672,7 +678,7 @@ static int decode_mb_i(AVSContext *h, int cbp_code)
     /* get coded block pattern */
     if (h->cur.f->pict_type == AV_PICTURE_TYPE_I)
         cbp_code = get_ue_golomb(gb);
-    if (cbp_code > 63 || cbp_code < 0) {
+    if (cbp_code > 63U) {
         av_log(h->avctx, AV_LOG_ERROR, "illegal intra cbp\n");
         return AVERROR_INVALIDDATA;
     }
@@ -759,7 +765,7 @@ static void decode_mb_p(AVSContext *h, enum cavs_mb mb_type)
     h->col_type_base[h->mbidx] = mb_type;
 }
 
-static void decode_mb_b(AVSContext *h, enum cavs_mb mb_type)
+static int decode_mb_b(AVSContext *h, enum cavs_mb mb_type)
 {
     int block;
     enum cavs_sub_mb sub_type[4];
@@ -796,6 +802,8 @@ static void decode_mb_b(AVSContext *h, enum cavs_mb mb_type)
         ff_cavs_mv(h, MV_BWD_X0, MV_BWD_C2, MV_PRED_MEDIAN, BLK_16X16, 0);
         break;
     case B_8X8:
+#define TMP_UNUSED_INX  7
+        flags = 0;
         for (block = 0; block < 4; block++)
             sub_type[block] = get_bits(&h->gb, 2);
         for (block = 0; block < 4; block++) {
@@ -803,11 +811,30 @@ static void decode_mb_b(AVSContext *h, enum cavs_mb mb_type)
             case B_SUB_DIRECT:
                 if (!h->col_type_base[h->mbidx]) {
                     /* intra MB at co-location, do in-plane prediction */
-                    ff_cavs_mv(h, mv_scan[block], mv_scan[block] - 3,
-                               MV_PRED_BSKIP, BLK_8X8, 1);
-                    ff_cavs_mv(h, mv_scan[block] + MV_BWD_OFFS,
-                               mv_scan[block] - 3 + MV_BWD_OFFS,
-                               MV_PRED_BSKIP, BLK_8X8, 0);
+                    if(flags==0) {
+                        // if the col-MB is an intra MB, the current block size is 16x16.
+                        // AVS standard section 9.9.1
+                        if(block>0){
+                            h->mv[TMP_UNUSED_INX              ] = h->mv[MV_FWD_X0              ];
+                            h->mv[TMP_UNUSED_INX + MV_BWD_OFFS] = h->mv[MV_FWD_X0 + MV_BWD_OFFS];
+                        }
+                        ff_cavs_mv(h, MV_FWD_X0, MV_FWD_C2,
+                                   MV_PRED_BSKIP, BLK_8X8, 1);
+                        ff_cavs_mv(h, MV_FWD_X0+MV_BWD_OFFS,
+                                   MV_FWD_C2+MV_BWD_OFFS,
+                                   MV_PRED_BSKIP, BLK_8X8, 0);
+                        if(block>0) {
+                            flags = mv_scan[block];
+                            h->mv[flags              ] = h->mv[MV_FWD_X0              ];
+                            h->mv[flags + MV_BWD_OFFS] = h->mv[MV_FWD_X0 + MV_BWD_OFFS];
+                            h->mv[MV_FWD_X0              ] = h->mv[TMP_UNUSED_INX              ];
+                            h->mv[MV_FWD_X0 + MV_BWD_OFFS] = h->mv[TMP_UNUSED_INX + MV_BWD_OFFS];
+                        } else
+                            flags = MV_FWD_X0;
+                    } else {
+                        h->mv[mv_scan[block]              ] = h->mv[flags              ];
+                        h->mv[mv_scan[block] + MV_BWD_OFFS] = h->mv[flags + MV_BWD_OFFS];
+                    }
                 } else
                     mv_pred_direct(h, &h->mv[mv_scan[block]],
                                    &h->col_mv[h->mbidx * 4 + block]);
@@ -823,6 +850,7 @@ static void decode_mb_b(AVSContext *h, enum cavs_mb mb_type)
                 break;
             }
         }
+#undef TMP_UNUSED_INX
         for (block = 0; block < 4; block++) {
             if (sub_type[block] == B_SUB_BWD)
                 ff_cavs_mv(h, mv_scan[block] + MV_BWD_OFFS,
@@ -831,7 +859,11 @@ static void decode_mb_b(AVSContext *h, enum cavs_mb mb_type)
         }
         break;
     default:
-        assert((mb_type > B_SYM_16X16) && (mb_type < B_8X8));
+        if (mb_type <= B_SYM_16X16) {
+            av_log(h->avctx, AV_LOG_ERROR, "Invalid mb_type %d in B frame\n", mb_type);
+            return AVERROR_INVALIDDATA;
+        }
+        av_assert2(mb_type < B_8X8);
         flags = ff_cavs_partition_flags[mb_type];
         if (mb_type & 1) { /* 16x8 macroblock types */
             if (flags & FWD0)
@@ -866,6 +898,8 @@ static void decode_mb_b(AVSContext *h, enum cavs_mb mb_type)
     if (mb_type != B_SKIP)
         decode_residual_inter(h);
     ff_cavs_filter(h, mb_type);
+
+    return 0;
 }
 
 /*****************************************************************************
@@ -878,12 +912,18 @@ static inline int decode_slice_header(AVSContext *h, GetBitContext *gb)
 {
     if (h->stc > 0xAF)
         av_log(h->avctx, AV_LOG_ERROR, "unexpected start code 0x%02x\n", h->stc);
+
+    if (h->stc >= h->mb_height) {
+        av_log(h->avctx, AV_LOG_ERROR, "stc 0x%02x is too large\n", h->stc);
+        return AVERROR_INVALIDDATA;
+    }
+
     h->mby   = h->stc;
     h->mbidx = h->mby * h->mb_width;
 
     /* mark top macroblocks as unavailable */
     h->flags &= ~(B_AVAIL | C_AVAIL);
-    if ((h->mby == 0) && (!h->qp_fixed)) {
+    if (!h->pic_qp_fixed) {
         h->qp_fixed = get_bits1(gb);
         h->qp       = get_bits(gb, 6);
     }
@@ -976,16 +1016,17 @@ static int decode_pic(AVSContext *h)
             return AVERROR(ENOMEM);
     }
 
-    ff_cavs_init_pic(h);
+    if ((ret = ff_cavs_init_pic(h)) < 0)
+        return ret;
     h->cur.poc = get_bits(&h->gb, 8) * 2;
 
     /* get temporal distances and MV scaling factors */
     if (h->cur.f->pict_type != AV_PICTURE_TYPE_B) {
-        h->dist[0] = (h->cur.poc - h->DPB[0].poc  + 512) % 512;
+        h->dist[0] = (h->cur.poc - h->DPB[0].poc) & 511;
     } else {
-        h->dist[0] = (h->DPB[0].poc  - h->cur.poc + 512) % 512;
+        h->dist[0] = (h->DPB[0].poc  - h->cur.poc) & 511;
     }
-    h->dist[1] = (h->cur.poc - h->DPB[1].poc  + 512) % 512;
+    h->dist[1] = (h->cur.poc - h->DPB[1].poc) & 511;
     h->scale_den[0] = h->dist[0] ? 512/h->dist[0] : 0;
     h->scale_den[1] = h->dist[1] ? 512/h->dist[1] : 0;
     if (h->cur.f->pict_type == AV_PICTURE_TYPE_B) {
@@ -1005,6 +1046,7 @@ static int decode_pic(AVSContext *h)
         skip_bits1(&h->gb);     //advanced_pred_mode_disable
     skip_bits1(&h->gb);        //top_field_first
     skip_bits1(&h->gb);        //repeat_first_field
+    h->pic_qp_fixed =
     h->qp_fixed = get_bits1(&h->gb);
     h->qp       = get_bits(&h->gb, 6);
     if (h->cur.f->pict_type == AV_PICTURE_TYPE_I) {
@@ -1080,6 +1122,7 @@ static int decode_seq_header(AVSContext *h)
 {
     int frame_rate_code;
     int width, height;
+    int ret;
 
     h->profile = get_bits(&h->gb, 8);
     h->level   = get_bits(&h->gb, 8);
@@ -1092,24 +1135,36 @@ static int decode_seq_header(AVSContext *h)
                                       "Width/height changing in CAVS");
         return AVERROR_PATCHWELCOME;
     }
-    h->width  = width;
-    h->height = height;
-
+    if (width <= 0 || height <= 0) {
+        av_log(h->avctx, AV_LOG_ERROR, "Dimensions invalid\n");
+        return AVERROR_INVALIDDATA;
+    }
     skip_bits(&h->gb, 2); //chroma format
     skip_bits(&h->gb, 3); //sample_precision
     h->aspect_ratio = get_bits(&h->gb, 4);
     frame_rate_code = get_bits(&h->gb, 4);
+    if (frame_rate_code == 0 || frame_rate_code > 13) {
+        av_log(h->avctx, AV_LOG_WARNING,
+               "frame_rate_code %d is invalid\n", frame_rate_code);
+        frame_rate_code = 1;
+    }
+
     skip_bits(&h->gb, 18); //bit_rate_lower
     skip_bits1(&h->gb);    //marker_bit
     skip_bits(&h->gb, 12); //bit_rate_upper
     h->low_delay =  get_bits1(&h->gb);
+
+    ret = ff_set_dimensions(h->avctx, width, height);
+    if (ret < 0)
+        return ret;
+
+    h->width  = width;
+    h->height = height;
     h->mb_width  = (h->width  + 15) >> 4;
     h->mb_height = (h->height + 15) >> 4;
     h->avctx->framerate = ff_mpeg12_frame_rate_tab[frame_rate_code];
-    h->avctx->width  = h->width;
-    h->avctx->height = h->height;
     if (!h->top_qp)
-        ff_cavs_init_top_lines(h);
+        return ff_cavs_init_top_lines(h);
     return 0;
 }
 
@@ -1138,12 +1193,17 @@ static int cavs_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return 0;
     }
 
+    h->stc = 0;
+
     buf_ptr = buf;
     buf_end = buf + buf_size;
     for(;;) {
         buf_ptr = avpriv_find_start_code(buf_ptr, buf_end, &stc);
-        if ((stc & 0xFFFFFE00) || buf_ptr == buf_end)
+        if ((stc & 0xFFFFFE00) || buf_ptr == buf_end) {
+            if (!h->stc)
+                av_log(h->avctx, AV_LOG_WARNING, "no frame decoded\n");
             return FFMAX(0, buf_ptr - buf);
+        }
         input_size = (buf_end - buf_ptr) * 8;
         switch (stc) {
         case CAVS_START_CODE:
@@ -1166,8 +1226,8 @@ static int cavs_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                 break;
             *got_frame = 1;
             if (h->cur.f->pict_type != AV_PICTURE_TYPE_B) {
-                if (h->DPB[1].f->data[0]) {
-                    if ((ret = av_frame_ref(data, h->DPB[1].f)) < 0)
+                if (h->DPB[!h->low_delay].f->data[0]) {
+                    if ((ret = av_frame_ref(data, h->DPB[!h->low_delay].f)) < 0)
                         return ret;
                 } else {
                     *got_frame = 0;
diff --git a/libavcodec/cavsdsp.c b/libavcodec/cavsdsp.c
index cc78989..91f6d73 100644
--- a/libavcodec/cavsdsp.c
+++ b/libavcodec/cavsdsp.c
@@ -5,20 +5,20 @@
  *
  * Copyright (c) 2006  Stefan Gehrer <stefan.gehrer@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/cavsdsp.h b/libavcodec/cavsdsp.h
index 248afd5..847f5c4 100644
--- a/libavcodec/cavsdsp.h
+++ b/libavcodec/cavsdsp.h
@@ -2,20 +2,20 @@
  * Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
  * Copyright (c) 2006  Stefan Gehrer <stefan.gehrer@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/cbrt_data.c b/libavcodec/cbrt_data.c
new file mode 100644
index 0000000..d2e36cd
--- /dev/null
+++ b/libavcodec/cbrt_data.c
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2016 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "cbrt_data.h"
+
+#include "libavutil/libm.h"
+
+#if CONFIG_HARDCODED_TABLES
+#include "libavcodec/cbrt_tables.h"
+#else
+#include "cbrt_tablegen.h"
+#endif
diff --git a/libavcodec/cbrt_data.h b/libavcodec/cbrt_data.h
new file mode 100644
index 0000000..232f74f
--- /dev/null
+++ b/libavcodec/cbrt_data.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_CBRT_DATA_H
+#define AVCODEC_CBRT_DATA_H
+
+#include <stdint.h>
+
+#if CONFIG_HARDCODED_TABLES /* tables are const data here, so the init calls expand to nothing */
+#define ff_cbrt_tableinit_fixed()
+#define ff_cbrt_tableinit()
+extern const uint32_t ff_cbrt_tab[1 << 13];
+extern const uint32_t ff_cbrt_tab_fixed[1 << 13];
+#else /* tables are writable and the init functions are real calls */
+void ff_cbrt_tableinit(void);
+void ff_cbrt_tableinit_fixed(void);
+extern uint32_t ff_cbrt_tab[1 << 13];
+extern uint32_t ff_cbrt_tab_fixed[1 << 13];
+#endif /* CONFIG_HARDCODED_TABLES */
+
+#endif /* AVCODEC_CBRT_DATA_H */
diff --git a/libavcodec/cbrt_data_fixed.c b/libavcodec/cbrt_data_fixed.c
new file mode 100644
index 0000000..d661b25
--- /dev/null
+++ b/libavcodec/cbrt_data_fixed.c
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2016 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "cbrt_data.h"
+
+#if CONFIG_HARDCODED_TABLES
+#include "libavcodec/cbrt_fixed_tables.h"
+#else
+#define USE_FIXED 1
+#include "cbrt_tablegen.h"
+#endif
diff --git a/libavcodec/cbrt_fixed_tablegen.c b/libavcodec/cbrt_fixed_tablegen.c
new file mode 100644
index 0000000..24d2fbb
--- /dev/null
+++ b/libavcodec/cbrt_fixed_tablegen.c
@@ -0,0 +1,24 @@
+/*
+ * Generate a header file for hardcoded AAC cube-root table
+ *
+ * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define USE_FIXED 1
+#include "cbrt_tablegen_template.c"
diff --git a/libavcodec/cbrt_tablegen.c b/libavcodec/cbrt_tablegen.c
index e92c0f1..8c2235e 100644
--- a/libavcodec/cbrt_tablegen.c
+++ b/libavcodec/cbrt_tablegen.c
@@ -3,35 +3,22 @@
  *
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <stdlib.h>
-#define CONFIG_HARDCODED_TABLES 0
-#include "cbrt_tablegen.h"
-#include "tableprint.h"
-
-int main(void)
-{
-    cbrt_tableinit();
-
-    write_fileheader();
-
-    WRITE_ARRAY("static const", uint32_t, cbrt_tab);
-
-    return 0;
-}
+#define USE_FIXED 0
+#include "cbrt_tablegen_template.c"
diff --git a/libavcodec/cbrt_tablegen.h b/libavcodec/cbrt_tablegen.h
index 60d900a..9af18d8 100644
--- a/libavcodec/cbrt_tablegen.h
+++ b/libavcodec/cbrt_tablegen.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,28 +25,49 @@
 
 #include <stdint.h>
 #include <math.h>
+#include "libavutil/attributes.h"
+#include "libavutil/intfloat.h"
+#include "libavcodec/aac_defines.h"
 
-#if CONFIG_HARDCODED_TABLES
-#define cbrt_tableinit()
-#include "libavcodec/cbrt_tables.h"
+#if USE_FIXED
+#define CBRT(x) lrint((x) * 8192)
 #else
-static uint32_t cbrt_tab[1 << 13];
+#define CBRT(x) av_float2int((float)(x))
+#endif
 
-static void cbrt_tableinit(void)
+uint32_t AAC_RENAME(ff_cbrt_tab)[1 << 13];
+
+av_cold void AAC_RENAME(ff_cbrt_tableinit)(void)
 {
-    if (!cbrt_tab[(1<<13) - 1]) {
-        int i;
-        /* cbrtf() isn't available on all systems, so we use powf(). */
-        for (i = 0; i < 1<<13; i++) {
-            union {
-                float f;
-                uint32_t i;
-            } f;
-            f.f = powf(i, 1.0 / 3.0) * i;
-            cbrt_tab[i] = f.i;
+    static double cbrt_tab_dbl[1 << 13]; /* double-precision scratch table; entry 0 keeps its static zero value */
+    if (!AAC_RENAME(ff_cbrt_tab)[(1<<13) - 1]) { /* last entry still zero: the table has not been filled yet */
+        int i, j, k;
+        double cbrt_val;
+
+        for (i = 1; i < 1<<13; i++)
+            cbrt_tab_dbl[i] = 1;
+
+        /* an entry still equal to 1 marks i as prime; multiply i * cbrt(i) into every multiple of each power of i, so non-squarefree numbers get the factor once per power */
+        for (i = 2; i < 90; i++) {
+            if (cbrt_tab_dbl[i] == 1) {
+                cbrt_val = i * cbrt(i);
+                for (k = i; k < 1<<13; k *= i)
+                    for (j = k; j < 1<<13; j += k)
+                        cbrt_tab_dbl[j] *= cbrt_val;
+            }
         }
+
+        for (i = 91; i <= 8191; i+= 2) { /* remaining odd candidates; 91 * 91 > 8191, so no higher power fits in the table */
+            if (cbrt_tab_dbl[i] == 1) {
+                cbrt_val = i * cbrt(i);
+                for (j = i; j < 1<<13; j += i)
+                    cbrt_tab_dbl[j] *= cbrt_val;
+            }
+        }
+
+        for (i = 0; i < 1<<13; i++) /* convert each product to the stored form selected by CBRT() */
+            AAC_RENAME(ff_cbrt_tab)[i] = CBRT(cbrt_tab_dbl[i]);
     }
 }
-#endif /* CONFIG_HARDCODED_TABLES */
 
 #endif /* AVCODEC_CBRT_TABLEGEN_H */
diff --git a/libavcodec/cbrt_tablegen_template.c b/libavcodec/cbrt_tablegen_template.c
new file mode 100644
index 0000000..21ed2a6
--- /dev/null
+++ b/libavcodec/cbrt_tablegen_template.c
@@ -0,0 +1,42 @@
+/*
+ * Generate a header file for hardcoded AAC cube-root table
+ *
+ * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdlib.h>
+#define CONFIG_HARDCODED_TABLES 0
+#include "libavutil/tablegen.h"
+#include "cbrt_tablegen.h"
+#include "tableprint.h"
+
+int main(void)
+{
+    AAC_RENAME(ff_cbrt_tableinit)(); /* compute the table that is written out below */
+
+    write_fileheader(); /* presumably emits the preamble of the generated header (declared in tableprint.h, not visible here) */
+
+#if USE_FIXED /* the including .c file sets USE_FIXED to pick which table is dumped */
+    WRITE_ARRAY("const", uint32_t, ff_cbrt_tab_fixed);
+#else
+    WRITE_ARRAY("const", uint32_t, ff_cbrt_tab);
+#endif
+
+    return 0;
+}
diff --git a/libavcodec/ccaption_dec.c b/libavcodec/ccaption_dec.c
new file mode 100644
index 0000000..3b15149
--- /dev/null
+++ b/libavcodec/ccaption_dec.c
@@ -0,0 +1,818 @@
+/*
+ * Closed Caption Decoding
+ * Copyright (c) 2015 Anshul Maheshwari
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "ass.h"
+#include "libavutil/opt.h"
+
+#define SCREEN_ROWS 15
+#define SCREEN_COLUMNS 32
+
+#define SET_FLAG(var, val)   ( (var) |=   ( 1 << (val)) ) /* set bit number val in var */
+#define UNSET_FLAG(var, val) ( (var) &=  ~( 1 << (val)) ) /* clear bit number val in var */
+#define CHECK_FLAG(var, val) ( (var) &    ( 1 << (val)) ) /* non-zero when bit number val is set in var */
+
+static const AVRational ms_tb = {1, 1000}; /* the rational 1/1000; by its name a millisecond time base (its use is outside this chunk) */
+
+/*
+ * TODO list
+ * 1) handle font and color completely
+ */
+enum cc_mode { /* caption mode; selects the screen returned by get_writing_screen() */
+    CCMODE_POPON,   /* writes go to the inactive screen */
+    CCMODE_PAINTON, /* writes go to the active screen */
+    CCMODE_ROLLUP,  /* writes go to the active screen; also the mode set at init and flush */
+    CCMODE_TEXT,    /* writes go to the active screen; roll_up() returns without doing anything */
+};
+
+enum cc_color_code { /* colour codes; the first seven appear in column 0 of pac2_attribs */
+    CCCOL_WHITE,
+    CCCOL_GREEN,
+    CCCOL_BLUE,
+    CCCOL_CYAN,
+    CCCOL_RED,
+    CCCOL_YELLOW,
+    CCCOL_MAGENTA,
+    CCCOL_USERDEFINED,
+    CCCOL_BLACK,
+    CCCOL_TRANSPARENT,
+};
+
+enum cc_font { /* per-cell text style; capture_screen() turns changes of style into ASS override tags */
+    CCFONT_REGULAR,            /* no tag emitted */
+    CCFONT_ITALICS,            /* italic tag on/off */
+    CCFONT_UNDERLINED,         /* underline tag on/off */
+    CCFONT_UNDERLINED_ITALICS, /* both underline and italic tags */
+};
+
+enum cc_charset { /* character set of a cell; used as the first index into charset_overrides */
+    CCSET_BASIC_AMERICAN,                    /* default; write_char() resets the cursor charset to this after every write */
+    CCSET_SPECIAL_AMERICAN,                  /* selected by handle_char() when hi == 0x11 */
+    CCSET_EXTENDED_SPANISH_FRENCH_MISC,      /* selected by handle_char() when hi == 0x12 */
+    CCSET_EXTENDED_PORTUGUESE_GERMAN_DANISH, /* selected by handle_char() when hi == 0x13 */
+};
+
+static const char *const charset_overrides[4][128] = /* [cc_charset][7-bit code] -> replacement string; NULL means "emit the code as a plain char" (see capture_screen()) */
+{
+    [CCSET_BASIC_AMERICAN] = { /* codes of the basic set that do not map to themselves */
+        [0x27] = "\u2019",
+        [0x2a] = "\u00e1",
+        [0x5c] = "\u00e9",
+        [0x5e] = "\u00ed",
+        [0x5f] = "\u00f3",
+        [0x60] = "\u00fa",
+        [0x7b] = "\u00e7",
+        [0x7c] = "\u00f7",
+        [0x7d] = "\u00d1",
+        [0x7e] = "\u00f1",
+        [0x7f] = "\u2588"
+    },
+    [CCSET_SPECIAL_AMERICAN] = { /* codes 0x30..0x3f */
+        [0x30] = "\u00ae",
+        [0x31] = "\u00b0",
+        [0x32] = "\u00bd",
+        [0x33] = "\u00bf",
+        [0x34] = "\u2122",
+        [0x35] = "\u00a2",
+        [0x36] = "\u00a3",
+        [0x37] = "\u266a",
+        [0x38] = "\u00e0",
+        [0x39] = "\u00A0",
+        [0x3a] = "\u00e8",
+        [0x3b] = "\u00e2",
+        [0x3c] = "\u00ea",
+        [0x3d] = "\u00ee",
+        [0x3e] = "\u00f4",
+        [0x3f] = "\u00fb",
+    },
+    [CCSET_EXTENDED_SPANISH_FRENCH_MISC] = { /* codes 0x20..0x3f */
+        [0x20] = "\u00c1",
+        [0x21] = "\u00c9",
+        [0x22] = "\u00d3",
+        [0x23] = "\u00da",
+        [0x24] = "\u00dc",
+        [0x25] = "\u00fc",
+        [0x26] = "\u00b4",
+        [0x27] = "\u00a1",
+        [0x28] = "*",
+        [0x29] = "\u2018",
+        [0x2a] = "-",
+        [0x2b] = "\u00a9",
+        [0x2c] = "\u2120",
+        [0x2d] = "\u00b7",
+        [0x2e] = "\u201c",
+        [0x2f] = "\u201d",
+        [0x30] = "\u00c0",
+        [0x31] = "\u00c2",
+        [0x32] = "\u00c7",
+        [0x33] = "\u00c8",
+        [0x34] = "\u00ca",
+        [0x35] = "\u00cb",
+        [0x36] = "\u00eb",
+        [0x37] = "\u00ce",
+        [0x38] = "\u00cf",
+        [0x39] = "\u00ef",
+        [0x3a] = "\u00d4",
+        [0x3b] = "\u00d9",
+        [0x3c] = "\u00f9",
+        [0x3d] = "\u00db",
+        [0x3e] = "\u00ab",
+        [0x3f] = "\u00bb",
+    },
+    [CCSET_EXTENDED_PORTUGUESE_GERMAN_DANISH] = { /* codes 0x20..0x3f */
+        [0x20] = "\u00c3",
+        [0x21] = "\u00e3",
+        [0x22] = "\u00cd",
+        [0x23] = "\u00cc",
+        [0x24] = "\u00ec",
+        [0x25] = "\u00d2",
+        [0x26] = "\u00f2",
+        [0x27] = "\u00d5",
+        [0x28] = "\u00f5",
+        [0x29] = "{",
+        [0x2a] = "}",
+        [0x2b] = "\\",
+        [0x2c] = "^",
+        [0x2d] = "_",
+        [0x2e] = "|",
+        [0x2f] = "~",
+        [0x30] = "\u00c4",
+        [0x31] = "\u00e4",
+        [0x32] = "\u00d6",
+        [0x33] = "\u00f6",
+        [0x34] = "\u00df",
+        [0x35] = "\u00a5",
+        [0x36] = "\u00a4",
+        [0x37] = "\u00a6",
+        [0x38] = "\u00c5",
+        [0x39] = "\u00e5",
+        [0x3a] = "\u00d8",
+        [0x3b] = "\u00f8",
+        [0x3c] = "\u250c",
+        [0x3d] = "\u2510",
+        [0x3e] = "\u2514",
+        [0x3f] = "\u2518",
+    },
+};
+
+static const unsigned char pac2_attribs[32][3] = // Color, font, indent (number of spaces handle_pac() writes at the start of the row)
+{   /* indexed with lo & 0x1f by handle_pac() and with lo - 0x20 by handle_textattr() */
+    { CCCOL_WHITE,   CCFONT_REGULAR,            0 },  // 0x40 || 0x60
+    { CCCOL_WHITE,   CCFONT_UNDERLINED,         0 },  // 0x41 || 0x61
+    { CCCOL_GREEN,   CCFONT_REGULAR,            0 },  // 0x42 || 0x62
+    { CCCOL_GREEN,   CCFONT_UNDERLINED,         0 },  // 0x43 || 0x63
+    { CCCOL_BLUE,    CCFONT_REGULAR,            0 },  // 0x44 || 0x64
+    { CCCOL_BLUE,    CCFONT_UNDERLINED,         0 },  // 0x45 || 0x65
+    { CCCOL_CYAN,    CCFONT_REGULAR,            0 },  // 0x46 || 0x66
+    { CCCOL_CYAN,    CCFONT_UNDERLINED,         0 },  // 0x47 || 0x67
+    { CCCOL_RED,     CCFONT_REGULAR,            0 },  // 0x48 || 0x68
+    { CCCOL_RED,     CCFONT_UNDERLINED,         0 },  // 0x49 || 0x69
+    { CCCOL_YELLOW,  CCFONT_REGULAR,            0 },  // 0x4a || 0x6a
+    { CCCOL_YELLOW,  CCFONT_UNDERLINED,         0 },  // 0x4b || 0x6b
+    { CCCOL_MAGENTA, CCFONT_REGULAR,            0 },  // 0x4c || 0x6c
+    { CCCOL_MAGENTA, CCFONT_UNDERLINED,         0 },  // 0x4d || 0x6d
+    { CCCOL_WHITE,   CCFONT_ITALICS,            0 },  // 0x4e || 0x6e
+    { CCCOL_WHITE,   CCFONT_UNDERLINED_ITALICS, 0 },  // 0x4f || 0x6f
+    { CCCOL_WHITE,   CCFONT_REGULAR,            0 },  // 0x50 || 0x70
+    { CCCOL_WHITE,   CCFONT_UNDERLINED,         0 },  // 0x51 || 0x71
+    { CCCOL_WHITE,   CCFONT_REGULAR,            4 },  // 0x52 || 0x72
+    { CCCOL_WHITE,   CCFONT_UNDERLINED,         4 },  // 0x53 || 0x73
+    { CCCOL_WHITE,   CCFONT_REGULAR,            8 },  // 0x54 || 0x74
+    { CCCOL_WHITE,   CCFONT_UNDERLINED,         8 },  // 0x55 || 0x75
+    { CCCOL_WHITE,   CCFONT_REGULAR,           12 },  // 0x56 || 0x76
+    { CCCOL_WHITE,   CCFONT_UNDERLINED,        12 },  // 0x57 || 0x77
+    { CCCOL_WHITE,   CCFONT_REGULAR,           16 },  // 0x58 || 0x78
+    { CCCOL_WHITE,   CCFONT_UNDERLINED,        16 },  // 0x59 || 0x79
+    { CCCOL_WHITE,   CCFONT_REGULAR,           20 },  // 0x5a || 0x7a
+    { CCCOL_WHITE,   CCFONT_UNDERLINED,        20 },  // 0x5b || 0x7b
+    { CCCOL_WHITE,   CCFONT_REGULAR,           24 },  // 0x5c || 0x7c
+    { CCCOL_WHITE,   CCFONT_UNDERLINED,        24 },  // 0x5d || 0x7d
+    { CCCOL_WHITE,   CCFONT_REGULAR,           28 },  // 0x5e || 0x7e
+    { CCCOL_WHITE,   CCFONT_UNDERLINED,        28 }   // 0x5f || 0x7f
+    /* total 32 entries */
+};
+
+struct Screen { /* one caption screen: per-cell character, charset, colour and font, plus a bitmask of the rows in use */
+    /* Columns: +1 for the NUL terminator of each row string. Rows: +1 spare row, because roll_up() reads row i_row + 1, which is index SCREEN_ROWS when the cursor is on the last row. */
+    uint8_t characters[SCREEN_ROWS+1][SCREEN_COLUMNS+1];
+    uint8_t charsets[SCREEN_ROWS+1][SCREEN_COLUMNS+1];
+    uint8_t colors[SCREEN_ROWS+1][SCREEN_COLUMNS+1];
+    uint8_t fonts[SCREEN_ROWS+1][SCREEN_COLUMNS+1];
+    /*
+     * Bitmask of used rows; if a bit is not set, the
+     * corresponding row is not used.
+     * for setting row 1  use row | (1 << 0)
+     * for setting row 15 use row | (1 << 14)
+     */
+    int16_t row_used;
+};
+
+typedef struct CCaptionSubContext { /* private decoder state, reached through avctx->priv_data */
+    AVClass *class;
+    int real_time;            /* non-zero: captions are reaped as soon as they change; zero: buffered until the screen is wiped (see handle_edm()/handle_eoc()) */
+    struct Screen screen[2];  /* two screens; which one is written depends on mode (see get_writing_screen()) */
+    int active_screen;        /* index (0 or 1) of the screen that capture_screen() renders */
+    uint8_t cursor_row;       /* 0-based row that write_char() writes into */
+    uint8_t cursor_column;    /* 0-based column of the next write */
+    uint8_t cursor_color;     /* set by handle_pac()/handle_textattr() from pac2_attribs */
+    uint8_t cursor_font;      /* enum cc_font stored with each written cell */
+    uint8_t cursor_charset;   /* enum cc_charset stored with the next written cell */
+    AVBPrint buffer;          /* text produced by capture_screen() */
+    int buffer_changed;       /* set to 1 by capture_screen() */
+    int rollup;               /* upper bound on the rows kept by roll_up() */
+    enum cc_mode mode;
+    int64_t start_time;       /* reap_screen() copies the previous startv_time here */
+    /* visible screen time */
+    int64_t startv_time;      /* set to pts by reap_screen() */
+    int64_t end_time;         /* set to pts by reap_screen() */
+    int screen_touched;       /* set by handle_char() when the mode is not pop-on */
+    int64_t last_real_time;
+    char prev_cmd[2];         /* last byte pair seen; process_cc608() drops an identical repeat */
+    /* buffer to store pkt data */
+    AVBufferRef *pktbuf;
+    int readorder;            /* reset by flush_decoder() unless AV_CODEC_FLAG2_RO_FLUSH_NOOP is set */
+} CCaptionSubContext;
+
+
+static av_cold int init_decoder(AVCodecContext *avctx)
+{
+    CCaptionSubContext *const s = avctx->priv_data;
+    int err;
+
+    /* Start out in roll-up mode with two rows kept. */
+    s->rollup = 2;
+    s->mode   = CCMODE_ROLLUP;
+    av_bprint_init(&s->buffer, 0, AV_BPRINT_SIZE_UNLIMITED);
+
+    /* ASS header: "Monospace" font and the ASS_DEFAULT_* values for
+     * everything except the literal 3 passed before the alignment. */
+    err = ff_ass_subtitle_header(avctx, "Monospace",
+                                 ASS_DEFAULT_FONT_SIZE,
+                                 ASS_DEFAULT_COLOR,
+                                 ASS_DEFAULT_BACK_COLOR,
+                                 ASS_DEFAULT_BOLD,
+                                 ASS_DEFAULT_ITALIC,
+                                 ASS_DEFAULT_UNDERLINE,
+                                 3,
+                                 ASS_DEFAULT_ALIGNMENT);
+    if (err < 0)
+        return err;
+
+    /* 128-byte buffer used to store packet data. */
+    s->pktbuf = av_buffer_alloc(128);
+    return s->pktbuf ? err : AVERROR(ENOMEM);
+}
+
+static av_cold int close_decoder(AVCodecContext *avctx)
+{
+    CCaptionSubContext *const s = avctx->priv_data;
+    av_buffer_unref(&s->pktbuf);           /* drop the packet buffer reference */
+    av_bprint_finalize(&s->buffer, NULL);  /* release the text buffer; no result string is requested */
+    return 0;
+}
+
+static void flush_decoder(AVCodecContext *avctx)
+{
+    CCaptionSubContext *const s = avctx->priv_data;
+    int n;
+    /* Both screens become empty and screen 0 is the active one again. */
+    for (n = 0; n < 2; n++)
+        s->screen[n].row_used = 0;
+    s->active_screen = 0;
+    /* Same defaults as init_decoder(): roll-up mode keeping 2 rows. */
+    s->rollup = 2;
+    s->mode   = CCMODE_ROLLUP;
+    /* Cursor goes back to row 0, column 0 with zeroed attributes. */
+    s->cursor_row = s->cursor_column = 0;
+    s->cursor_color = s->cursor_font = s->cursor_charset = 0;
+    /* Forget the previous byte pair and any pending output state. */
+    s->prev_cmd[0] = s->prev_cmd[1] = 0;
+    s->screen_touched = s->buffer_changed = 0;
+    s->last_real_time = 0;
+    av_bprint_clear(&s->buffer);
+    if (!(avctx->flags2 & AV_CODEC_FLAG2_RO_FLUSH_NOOP))
+        s->readorder = 0; /* the counter is kept across a flush when the flag is set */
+}
+
+/**
+ * Store ch at the cursor (ctx->cursor_row, ctx->cursor_column) of screen together with the cursor font and charset; a non-zero ch advances the column, ch == 0 terminates the row in place. @return 0, or AVERROR_INVALIDDATA when the row is full.
+ */
+static int write_char(CCaptionSubContext *ctx, struct Screen *screen, char ch)
+{
+    uint8_t col = ctx->cursor_column;
+    char *row = screen->characters[ctx->cursor_row];
+    char *font = screen->fonts[ctx->cursor_row];
+    char *charset = screen->charsets[ctx->cursor_row]; /* note: screen->colors is not written by this function */
+
+    if (col < SCREEN_COLUMNS) {
+        row[col] = ch;
+        font[col] = ctx->cursor_font;
+        charset[col] = ctx->cursor_charset;
+        ctx->cursor_charset = CCSET_BASIC_AMERICAN; /* a non-basic charset applies to one written cell only */
+        if (ch) ctx->cursor_column++;
+        return 0;
+    }
+    /* We have extra space at end only for null character */
+    else if (col == SCREEN_COLUMNS && ch == 0) {
+        row[col] = ch;
+        return 0;
+    }
+    else {
+        av_log(ctx, AV_LOG_WARNING, "Data Ignored since exceeding screen width\n");
+        return AVERROR_INVALIDDATA;
+    }
+}
+
+/**
+ * Validate one 3-byte cc_data triplet; on success the parity bit of bytes 1 and 2 is cleared in place.
+ * For cc_type 0 or 1: if byte 1 fails the parity check it is replaced by 0x7F
+ * and the pair is still processed; if byte 2 fails, AVERROR_INVALIDDATA is
+ * returned so the caller can drop this pair and go on with the next one.
+ * @return 0 to process the pair, AVERROR_INVALIDDATA if cc_valid is clear or byte 2 fails parity, AVERROR_PATCHWELCOME for padding and cc_type 2/3 pairs
+ */
+static int validate_cc_data_pair(uint8_t *cc_data_pair)
+{
+    uint8_t cc_valid = (*cc_data_pair & 4) >>2; /* bit 2 of byte 0 */
+    uint8_t cc_type = *cc_data_pair & 3;        /* bits 0-1 of byte 0 */
+
+    if (!cc_valid)
+        return AVERROR_INVALIDDATA;
+
+    // if EIA-608 data then verify parity.
+    if (cc_type==0 || cc_type==1) {
+        if (!av_parity(cc_data_pair[2])) {
+            return AVERROR_INVALIDDATA;
+        }
+        if (!av_parity(cc_data_pair[1])) {
+            cc_data_pair[1]=0x7F;
+        }
+    }
+
+    //Skip non-data
+    if ((cc_data_pair[0] == 0xFA || cc_data_pair[0] == 0xFC || cc_data_pair[0] == 0xFD)
+         && (cc_data_pair[1] & 0x7F) == 0 && (cc_data_pair[2] & 0x7F) == 0)
+        return AVERROR_PATCHWELCOME;
+
+    //skip 708 data
+    if (cc_type == 3 || cc_type == 2)
+        return AVERROR_PATCHWELCOME;
+
+    /* remove parity bit */
+    cc_data_pair[1] &= 0x7F;
+    cc_data_pair[2] &= 0x7F;
+
+    return 0;
+}
+
+static struct Screen *get_writing_screen(CCaptionSubContext *ctx)
+{
+    /* Pop-on captions are composed on the inactive screen. */
+    if (ctx->mode == CCMODE_POPON)
+        return &ctx->screen[!ctx->active_screen];
+
+    /* Paint-on, roll-up and text modes write to the active screen. */
+    if (ctx->mode == CCMODE_PAINTON ||
+        ctx->mode == CCMODE_ROLLUP  ||
+        ctx->mode == CCMODE_TEXT)
+        return &ctx->screen[ctx->active_screen];
+
+    /* Any other mode value has no screen. */
+    return NULL;
+}
+
+static void roll_up(CCaptionSubContext *ctx)
+{   /* Shift the kept rows of the writing screen up by one and mark the cursor row unused; does nothing in text mode. */
+    struct Screen *screen;
+    int i, keep_lines;
+
+    if (ctx->mode == CCMODE_TEXT)
+        return;
+
+    screen = get_writing_screen(ctx);
+
+    /* cursor_row is 0-based, hence the +1: no more rows can be kept
+     * than exist between the top of the screen and the cursor row.
+     */
+    keep_lines = FFMIN(ctx->cursor_row + 1, ctx->rollup);
+    /* Mark every row outside the kept window (the keep_lines rows ending at cursor_row) as unused. */
+    for (i = 0; i < SCREEN_ROWS; i++) {
+        if (i > ctx->cursor_row - keep_lines && i <= ctx->cursor_row)
+            continue;
+        UNSET_FLAG(screen->row_used, i);
+    }
+    /* Copy row i_row + 1 over row i_row. i_row + 1 can equal SCREEN_ROWS when the cursor is on the last row, so the Screen arrays must provide a spare row for this read to stay in bounds. */
+    for (i = 0; i < keep_lines && screen->row_used; i++) {
+        const int i_row = ctx->cursor_row - keep_lines + i + 1;
+
+        memcpy(screen->characters[i_row], screen->characters[i_row+1], SCREEN_COLUMNS);
+        memcpy(screen->colors[i_row], screen->colors[i_row+1], SCREEN_COLUMNS);
+        memcpy(screen->fonts[i_row], screen->fonts[i_row+1], SCREEN_COLUMNS);
+        memcpy(screen->charsets[i_row], screen->charsets[i_row+1], SCREEN_COLUMNS);
+        if (CHECK_FLAG(screen->row_used, i_row + 1))
+            SET_FLAG(screen->row_used, i_row);
+    }
+    /* The cursor row is now free for the next line of text. */
+    UNSET_FLAG(screen->row_used, ctx->cursor_row);
+}
+
+static int capture_screen(CCaptionSubContext *ctx)
+{   /* Render the active screen into ctx->buffer as text with ASS style tags. Returns 0, or AVERROR(ENOMEM) if the buffer is incomplete. */
+    int i;
+    struct Screen *screen = ctx->screen + ctx->active_screen;
+    enum cc_font prev_font = CCFONT_REGULAR; /* carried across rows; not reset per row */
+    av_bprint_clear(&ctx->buffer);
+    /* Visit used rows from top to bottom; the loop is skipped entirely when no row is used. */
+    for (i = 0; screen->row_used && i < SCREEN_ROWS; i++)
+    {
+        if (CHECK_FLAG(screen->row_used, i)) {
+            const char *row = screen->characters[i];
+            const char *font = screen->fonts[i];
+            const char *charset = screen->charsets[i];
+            const char *override;
+            int j = 0;
+
+            /* skip leading spaces that belong to the basic charset */
+            while (row[j] == ' ' && charset[j] == CCSET_BASIC_AMERICAN)
+                j++;
+            /* Emit the remaining cells up to the NUL terminator or the screen width. */
+            for (; j < SCREEN_COLUMNS; j++) {
+                const char *e_tag = "", *s_tag = "";
+
+                if (row[j] == 0)
+                    break;
+                /* On a font change: e_tag closes the previous style, s_tag opens the new one. */
+                if (prev_font != font[j]) {
+                    switch (prev_font) {
+                    case CCFONT_ITALICS:
+                        e_tag = "{\\i0}";
+                        break;
+                    case CCFONT_UNDERLINED:
+                        e_tag = "{\\u0}";
+                        break;
+                    case CCFONT_UNDERLINED_ITALICS:
+                        e_tag = "{\\u0}{\\i0}";
+                        break;
+                    }
+                    switch (font[j]) {
+                    case CCFONT_ITALICS:
+                        s_tag = "{\\i1}";
+                        break;
+                    case CCFONT_UNDERLINED:
+                        s_tag = "{\\u1}";
+                        break;
+                    case CCFONT_UNDERLINED_ITALICS:
+                        s_tag = "{\\u1}{\\i1}";
+                        break;
+                    }
+                }
+                prev_font = font[j];
+                override = charset_overrides[(int)charset[j]][(int)row[j]]; /* NULL when the code has no replacement string */
+                if (override) {
+                    av_bprintf(&ctx->buffer, "%s%s%s", e_tag, s_tag, override);
+                } else {
+                    av_bprintf(&ctx->buffer, "%s%s%c", e_tag, s_tag, row[j]);
+                }
+            }
+            av_bprintf(&ctx->buffer, "\\N"); /* two characters, backslash and N, end each row */
+        }
+    }
+    if (!av_bprint_is_complete(&ctx->buffer))
+        return AVERROR(ENOMEM);
+    if (screen->row_used && ctx->buffer.len >= 2) { /* drop the two-character row separator after the last row */
+        ctx->buffer.len -= 2;
+        ctx->buffer.str[ctx->buffer.len] = 0;
+    }
+    ctx->buffer_changed = 1;
+    return 0;
+}
+
+static int reap_screen(CCaptionSubContext *ctx, int64_t pts)
+{
+    ctx->end_time    = pts;               /* the caption being emitted ends at pts */
+    ctx->start_time  = ctx->startv_time;  /* and starts at the pts of the previous reap */
+    ctx->startv_time = pts;               /* remembered as the start for the next reap */
+    return capture_screen(ctx);
+}
+
+static void handle_textattr(CCaptionSubContext *ctx, uint8_t hi, uint8_t lo)
+{   /* Text attribute code: take colour and font from pac2_attribs[lo - 0x20], then write one space at the cursor. hi is unused. */
+    int i = lo - 0x20;
+    struct Screen *screen = get_writing_screen(ctx);
+
+    if (i < 0 || i >= 32) /* pac2_attribs has 32 rows; a lo below 0x20 would index before the table */
+        return;
+    /* Column 2 of the table (indent) is not used for this code. */
+    ctx->cursor_color = pac2_attribs[i][0];
+    ctx->cursor_font = pac2_attribs[i][1];
+
+    SET_FLAG(screen->row_used, ctx->cursor_row);
+    write_char(ctx, screen, ' ');
+}
+
+static void handle_pac(CCaptionSubContext *ctx, uint8_t hi, uint8_t lo)
+{   /* Move the cursor to the start of the row encoded in (hi, lo), load colour/font from pac2_attribs and write the indent spaces. */
+    static const int8_t row_map[] = {
+        11, -1, 1, 2, 3, 4, 12, 13, 14, 15, 5, 6, 7, 8, 9, 10
+    };
+    const int index = ( (hi<<1) & 0x0e) | ( (lo>>5) & 0x01 ); /* bits 0-2 of hi followed by bit 5 of lo: 0..15 */
+    struct Screen *screen = get_writing_screen(ctx);
+    int indent, i;
+    /* row_map holds 1-based row numbers; the -1 entry marks an index with no row. */
+    if (row_map[index] <= 0) {
+        av_log(ctx, AV_LOG_DEBUG, "Invalid pac index encountered\n");
+        return;
+    }
+    /* The low five bits of lo select the attribute row. */
+    lo &= 0x1f;
+
+    ctx->cursor_row = row_map[index] - 1;
+    ctx->cursor_color =  pac2_attribs[lo][0];
+    ctx->cursor_font = pac2_attribs[lo][1];
+    ctx->cursor_charset = CCSET_BASIC_AMERICAN;
+    ctx->cursor_column = 0;
+    indent = pac2_attribs[lo][2];
+    for (i = 0; i < indent; i++) {
+        write_char(ctx, screen, ' ');
+    }
+}
+
+/**
+ * Erase the active screen, emitting a caption update around the wipe.
+ * @param pts timestamp handed to reap_screen() for the caption timing
+ */
+static void handle_edm(CCaptionSubContext *ctx, int64_t pts)
+{
+    struct Screen *const active = &ctx->screen[ctx->active_screen];
+
+    if (ctx->real_time) {
+        /* Realtime: wipe first, then emit the now-empty screen so the
+         * previous caption does not stay on the screen. */
+        active->row_used = 0;
+        reap_screen(ctx, pts);
+    } else {
+        /* Buffered: capture what has accumulated, then wipe. */
+        reap_screen(ctx, pts);
+        active->row_used = 0;
+    }
+}
+
+static void handle_eoc(CCaptionSubContext *ctx, int64_t pts)
+{
+    const int realtime = ctx->real_time;
+    /* Buffered mode: emit what has been on screen since the previous
+     * EOC (handle_edm() also wipes it) before the screens are swapped. */
+    if (!realtime)
+        handle_edm(ctx, pts);
+
+    ctx->cursor_column = 0;
+    ctx->active_screen = !ctx->active_screen;
+
+    /* Realtime mode: show the freshly activated screen right away. */
+    if (realtime)
+        reap_screen(ctx, pts);
+}
+
+static void handle_delete_end_of_row(CCaptionSubContext *ctx, char hi, char lo)
+{
+    /* A NUL written at the cursor ends the row there; hi and lo are unused. */
+    write_char(ctx, get_writing_screen(ctx), 0);
+}
+
+static void handle_char(CCaptionSubContext *ctx, char hi, char lo, int64_t pts)
+{   /* Write the character pair (hi, lo) at the cursor of the writing screen and NUL-terminate the row. pts is not used here. */
+    struct Screen *screen = get_writing_screen(ctx);
+
+    SET_FLAG(screen->row_used, ctx->cursor_row);
+    /* hi of 0x11..0x13 selects the charset for lo; any other hi is itself a basic character. */
+    switch (hi) {
+      case 0x11:
+        ctx->cursor_charset = CCSET_SPECIAL_AMERICAN;
+        break;
+      case 0x12:
+        if (ctx->cursor_column > 0)
+            ctx->cursor_column -= 1; /* step back so lo overwrites the previously written cell */
+        ctx->cursor_charset = CCSET_EXTENDED_SPANISH_FRENCH_MISC;
+        break;
+      case 0x13:
+        if (ctx->cursor_column > 0)
+            ctx->cursor_column -= 1; /* step back so lo overwrites the previously written cell */
+        ctx->cursor_charset = CCSET_EXTENDED_PORTUGUESE_GERMAN_DANISH;
+        break;
+      default:
+        ctx->cursor_charset = CCSET_BASIC_AMERICAN;
+        write_char(ctx, screen, hi);
+        break;
+    }
+    /* A zero lo means there is no second character. */
+    if (lo) {
+        write_char(ctx, screen, lo);
+    }
+    write_char(ctx, screen, 0); /* terminator at the cursor; the column does not advance */
+    /* Pop-on text goes to the inactive screen, so it does not count as touching the display. */
+    if (ctx->mode != CCMODE_POPON)
+        ctx->screen_touched = 1;
+
+    if (lo)
+       ff_dlog(ctx, "(%c,%c)\n", hi, lo);
+    else
+       ff_dlog(ctx, "(%c)\n", hi);
+}
+/* Route one byte pair (hi, lo) to the PAC, text-attribute, control-command or character handlers. */
+static void process_cc608(CCaptionSubContext *ctx, int64_t pts, uint8_t hi, uint8_t lo)
+{
+    if (hi == ctx->prev_cmd[0] && lo == ctx->prev_cmd[1]) {
+        /* ignore redundant command: same pair as the one remembered in prev_cmd */
+        return;
+    }
+
+    /* set prev command */
+    ctx->prev_cmd[0] = hi;
+    ctx->prev_cmd[1] = lo;
+
+    if ( (hi == 0x10 && (lo >= 0x40 && lo <= 0x5f)) ||
+       ( (hi >= 0x11 && hi <= 0x17) && (lo >= 0x40 && lo <= 0x7f) ) ) {
+        handle_pac(ctx, hi, lo);
+    } else if ( ( hi == 0x11 && lo >= 0x20 && lo <= 0x2f ) ||
+                ( hi == 0x17 && lo >= 0x2e && lo <= 0x2f) ) {
+        handle_textattr(ctx, hi, lo);
+    } else if (hi == 0x14 || hi == 0x15 || hi == 0x1c) {
+        switch (lo) {
+        case 0x20:
+            /* resume caption loading */
+            ctx->mode = CCMODE_POPON;
+            break;
+        case 0x24:
+            handle_delete_end_of_row(ctx, hi, lo);
+            break;
+        case 0x25:
+        case 0x26:
+        case 0x27:
+            ctx->rollup = lo - 0x23; /* lo 0x25..0x27 gives rollup 2..4 */
+            ctx->mode = CCMODE_ROLLUP;
+            break;
+        case 0x29:
+            /* resume direct captioning */
+            ctx->mode = CCMODE_PAINTON;
+            break;
+        case 0x2b:
+            /* resume text display */
+            ctx->mode = CCMODE_TEXT;
+            break;
+        case 0x2c:
+            /* erase display memory */
+            handle_edm(ctx, pts);
+            break;
+        case 0x2d:
+            /* carriage return */
+            ff_dlog(ctx, "carriage return\n");
+            if (!ctx->real_time)
+                reap_screen(ctx, pts);
+            roll_up(ctx);
+            ctx->cursor_column = 0;
+            break;
+        case 0x2e:
+            /* erase buffered (non displayed) memory */
+            // Only in realtime mode. In buffered mode, we re-use the inactive screen
+            // for our own buffering.
+            if (ctx->real_time) {
+                struct Screen *screen = ctx->screen + !ctx->active_screen;
+                screen->row_used = 0;
+            }
+            break;
+        case 0x2f:
+            /* end of caption */
+            ff_dlog(ctx, "handle_eoc\n");
+            handle_eoc(ctx, pts);
+            break;
+        default:
+            ff_dlog(ctx, "Unknown command 0x%hhx 0x%hhx\n", hi, lo);
+            break;
+        }
+    } else if (hi >= 0x11 && hi <= 0x13) {
+        /* Special characters */
+        handle_char(ctx, hi, lo, pts);
+    } else if (hi >= 0x20) {
+        /* Standard characters (always in pairs) */
+        handle_char(ctx, hi, lo, pts);
+        ctx->prev_cmd[0] = ctx->prev_cmd[1] = 0; /* so an identical following pair is not dropped as redundant */
+    } else {
+        /* Ignoring all other non data code */
+        ff_dlog(ctx, "Unknown command 0x%hhx 0x%hhx\n", hi, lo);
+    }
+}
+
+/* Decode one packet of 3-byte cc_data triplets; adds an ASS rect to the
+ * AVSubtitle in data whenever the caption buffer changed and sets *got_sub. */
+static int decode(AVCodecContext *avctx, void *data, int *got_sub, AVPacket *avpkt)
+{
+    CCaptionSubContext *ctx = avctx->priv_data;
+    AVSubtitle *sub = data;
+    const int64_t start_time = sub->pts;
+    uint8_t *bptr = NULL;
+    int len = avpkt->size;
+    int i, ret = 0;
+
+    if (ctx->pktbuf->size < len) {
+        ret = av_buffer_realloc(&ctx->pktbuf, len);
+        if (ret < 0) {
+            av_log(ctx, AV_LOG_WARNING, "Insufficient Memory of %d truncated to %d\n", len, ctx->pktbuf->size);
+            len = ctx->pktbuf->size;
+            ret = 0;
+        }
+    }
+    memcpy(ctx->pktbuf->data, avpkt->data, len);
+    bptr = ctx->pktbuf->data;
+
+    /* Only walk complete triplets: a trailing partial one (packet size not a
+     * multiple of 3, or truncation above) must not be read past the copied data. */
+    for (i = 0; i < len - 2; i += 3) {
+        uint8_t cc_type = *(bptr + i) & 3;
+        if (validate_cc_data_pair(bptr + i))
+            continue;
+        /* ignoring data field 1 */
+        if (cc_type == 1)
+            continue;
+        process_cc608(ctx, start_time, *(bptr + i + 1) & 0x7f, *(bptr + i + 2) & 0x7f);
+
+        if (!ctx->buffer_changed)
+            continue;
+        ctx->buffer_changed = 0;
+
+        if (*ctx->buffer.str || ctx->real_time) {
+            ff_dlog(ctx, "cdp writing data (%s)\n",ctx->buffer.str);
+            ret = ff_ass_add_rect(sub, ctx->buffer.str, ctx->readorder++, 0, NULL, NULL);
+            if (ret < 0)
+                return ret;
+            sub->pts = ctx->start_time;
+            if (!ctx->real_time)
+                sub->end_display_time = av_rescale_q(ctx->end_time - ctx->start_time,
+                                                     AV_TIME_BASE_Q, ms_tb);
+            else
+                sub->end_display_time = -1;
+            ctx->last_real_time = sub->pts;
+            ctx->screen_touched = 0;
+        }
+    }
+
+    /* Real-time mode: emit a touched screen once more than 200 ms_tb units (presumably ms) have passed. */
+    if (ctx->real_time && ctx->screen_touched &&
+        sub->pts > ctx->last_real_time + av_rescale_q(200, ms_tb, AV_TIME_BASE_Q)) {
+        ctx->last_real_time = sub->pts;
+        ctx->screen_touched = 0;
+        capture_screen(ctx);
+        ctx->buffer_changed = 0;
+
+        ret = ff_ass_add_rect(sub, ctx->buffer.str, ctx->readorder++, 0, NULL, NULL);
+        if (ret < 0)
+            return ret;
+        sub->end_display_time = -1;
+    }
+
+    *got_sub = sub->num_rects > 0;
+    return ret;
+}
+
+#define OFFSET(x) offsetof(CCaptionSubContext, x)
+#define SD (AV_OPT_FLAG_SUBTITLE_PARAM | AV_OPT_FLAG_DECODING_PARAM) /* parenthesized so it always expands as one operand */
+static const AVOption options[] = {
+    { "real_time", "emit subtitle events as they are decoded for real-time display", OFFSET(real_time), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, SD },
+    {NULL}
+};
+
+static const AVClass ccaption_dec_class = { /* AVClass of the private context; points at options[] */
+    .class_name = "Closed caption Decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_ccaption_decoder = { /* subtitle decoder entry for AV_CODEC_ID_EIA_608 */
+    .name           = "cc_dec",
+    .long_name      = NULL_IF_CONFIG_SMALL("Closed Caption (EIA-608 / CEA-708) Decoder"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_EIA_608,
+    .priv_data_size = sizeof(CCaptionSubContext),
+    .init           = init_decoder,
+    .close          = close_decoder,
+    .flush          = flush_decoder,
+    .decode         = decode,
+    .priv_class     = &ccaption_dec_class,
+};
diff --git a/libavcodec/cdgraphics.c b/libavcodec/cdgraphics.c
index 3b68f45..87ad5e7 100644
--- a/libavcodec/cdgraphics.c
+++ b/libavcodec/cdgraphics.c
@@ -2,20 +2,20 @@
  * CD Graphics Video Decoder
  * Copyright (c) 2009 Michael Tison
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,6 +49,7 @@
 #define CDG_INST_TILE_BLOCK        6
 #define CDG_INST_SCROLL_PRESET    20
 #define CDG_INST_SCROLL_COPY      24
+#define CDG_INST_TRANSPARENT_COL  28
 #define CDG_INST_LOAD_PAL_LO      30
 #define CDG_INST_LOAD_PAL_HIGH    31
 #define CDG_INST_TILE_BLOCK_XOR   38
@@ -67,6 +68,7 @@ typedef struct CDGraphicsContext {
     AVFrame *frame;
     int hscroll;
     int vscroll;
+    int transparency;
 } CDGraphicsContext;
 
 static av_cold int cdg_decode_init(AVCodecContext *avctx)
@@ -76,6 +78,7 @@ static av_cold int cdg_decode_init(AVCodecContext *avctx)
     cc->frame = av_frame_alloc();
     if (!cc->frame)
         return AVERROR(ENOMEM);
+    cc->transparency = -1;
 
     avctx->width   = CDG_FULL_WIDTH;
     avctx->height  = CDG_FULL_HEIGHT;
@@ -119,7 +122,9 @@ static void cdg_load_palette(CDGraphicsContext *cc, uint8_t *data, int low)
         r = ((color >> 8) & 0x000F) * 17;
         g = ((color >> 4) & 0x000F) * 17;
         b = ((color     ) & 0x000F) * 17;
-        palette[i + array_offset] = r << 16 | g << 8 | b;
+        palette[i + array_offset] = 0xFFU << 24 | r << 16 | g << 8 | b;
+        if (cc->transparency >= 0)
+            palette[cc->transparency] &= 0xFFFFFF;
     }
     cc->frame->palette_has_changed = 1;
 }
@@ -265,20 +270,27 @@ static int cdg_decode_frame(AVCodecContext *avctx,
     int buf_size       = avpkt->size;
     int ret;
     uint8_t command, inst;
-    uint8_t cdg_data[CDG_DATA_SIZE];
+    uint8_t cdg_data[CDG_DATA_SIZE] = {0};
     AVFrame *frame = data;
     CDGraphicsContext *cc = avctx->priv_data;
 
-    bytestream2_init(&gb, avpkt->data, avpkt->size);
+    if (buf_size < CDG_MINIMUM_PKT_SIZE) {
+        av_log(avctx, AV_LOG_ERROR, "buffer too small for decoder\n");
+        return AVERROR(EINVAL);
+    }
+    if (buf_size > CDG_HEADER_SIZE + CDG_DATA_SIZE) {
+        av_log(avctx, AV_LOG_ERROR, "buffer too big for decoder\n");
+        return AVERROR(EINVAL);
+    }
 
+    bytestream2_init(&gb, avpkt->data, avpkt->size);
 
-    ret = ff_reget_buffer(avctx, cc->frame);
-    if (ret) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, cc->frame)) < 0)
         return ret;
-    }
-    if (!avctx->frame_number)
+    if (!avctx->frame_number) {
         memset(cc->frame->data[0], 0, cc->frame->linesize[0] * avctx->height);
+        memset(cc->frame->data[1], 0, AVPALETTE_SIZE);
+    }
 
     command = bytestream2_get_byte(&gb);
     inst    = bytestream2_get_byte(&gb);
@@ -325,11 +337,8 @@ static int cdg_decode_frame(AVCodecContext *avctx,
                 return AVERROR(EINVAL);
             }
 
-            ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF);
-            if (ret) {
-                av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+            if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
                 return ret;
-            }
 
             cdg_scroll(cc, cdg_data, frame, inst == CDG_INST_SCROLL_COPY);
             av_frame_unref(cc->frame);
@@ -337,6 +346,9 @@ static int cdg_decode_frame(AVCodecContext *avctx,
             if (ret < 0)
                 return ret;
             break;
+        case CDG_INST_TRANSPARENT_COL:
+            cc->transparency = cdg_data[0] & 0xF;
+            break;
         default:
             break;
         }
diff --git a/libavcodec/cdxl.c b/libavcodec/cdxl.c
index 99e96eb..c8d66b5 100644
--- a/libavcodec/cdxl.c
+++ b/libavcodec/cdxl.c
@@ -2,32 +2,41 @@
  * CDXL video decoder
  * Copyright (c) 2011-2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+/**
+ * @file
+ * Commodore CDXL video decoder
+ * @author Paul B Mahol
+ */
+
+#define UNCHECKED_BITSTREAM_READER 1
+
 #include "libavutil/intreadwrite.h"
 #include "libavutil/imgutils.h"
 #include "avcodec.h"
+#include "bytestream.h"
 #include "get_bits.h"
 #include "internal.h"
 
 #define BIT_PLANAR   0x00
-#define BYTE_PLANAR  0x20
-#define CHUNKY       0x40
+#define CHUNKY       0x20
+#define BYTE_PLANAR  0x40
 #define BIT_LINE     0x80
 #define BYTE_LINE    0xC0
 
@@ -63,7 +72,7 @@ static void import_palette(CDXLVideoContext *c, uint32_t *new_palette)
         unsigned r   = ((rgb >> 8) & 0xF) * 0x11;
         unsigned g   = ((rgb >> 4) & 0xF) * 0x11;
         unsigned b   =  (rgb       & 0xF) * 0x11;
-        AV_WN32(&new_palette[i], (r << 16) | (g << 8) | b);
+        AV_WN32(&new_palette[i], (0xFFU << 24) | (r << 16) | (g << 8) | b);
     }
 }
 
@@ -72,7 +81,8 @@ static void bitplanar2chunky(CDXLVideoContext *c, int linesize, uint8_t *out)
     GetBitContext gb;
     int x, y, plane;
 
-    init_get_bits(&gb, c->video, c->video_size * 8);
+    if (init_get_bits8(&gb, c->video, c->video_size) < 0)
+        return;
     for (plane = 0; plane < c->bpp; plane++) {
         for (y = 0; y < c->avctx->height; y++) {
             for (x = 0; x < c->avctx->width; x++)
@@ -87,7 +97,8 @@ static void bitline2chunky(CDXLVideoContext *c, int linesize, uint8_t *out)
     GetBitContext  gb;
     int x, y, plane;
 
-    init_get_bits(&gb, c->video, c->video_size * 8);
+    if (init_get_bits8(&gb, c->video, c->video_size) < 0)
+        return;
     for (y = 0; y < c->avctx->height; y++) {
         for (plane = 0; plane < c->bpp; plane++) {
             for (x = 0; x < c->avctx->width; x++)
@@ -97,6 +108,17 @@ static void bitline2chunky(CDXLVideoContext *c, int linesize, uint8_t *out)
     }
 }
 
+/* Copy height rows of width * 3 bytes from c->video into out, linesize bytes apart. */
+static void chunky2chunky(CDXLVideoContext *c, int linesize, uint8_t *out)
+{
+    GetByteContext src;
+    int row = 0;
+
+    bytestream2_init(&src, c->video, c->video_size);
+    for (; row < c->avctx->height; row++, out += linesize)
+        bytestream2_get_buffer(&src, out, c->avctx->width * 3);
+}
+
 static void import_format(CDXLVideoContext *c, int linesize, uint8_t *out)
 {
     memset(out, 0, linesize * c->avctx->height);
@@ -108,6 +130,9 @@ static void import_format(CDXLVideoContext *c, int linesize, uint8_t *out)
     case BIT_LINE:
         bitline2chunky(c, linesize, out);
         break;
+    case CHUNKY:
+        chunky2chunky(c, linesize, out);
+        break;
     }
 }
 
@@ -115,10 +140,16 @@ static void cdxl_decode_rgb(CDXLVideoContext *c, AVFrame *frame)
 {
     uint32_t *new_palette = (uint32_t *)frame->data[1];
 
+    memset(frame->data[1], 0, AVPALETTE_SIZE);
     import_palette(c, new_palette);
     import_format(c, frame->linesize[0], frame->data[0]);
 }
 
+static void cdxl_decode_raw(CDXLVideoContext *c, AVFrame *frame)
+{
+    import_format(c, frame->linesize[0], frame->data[0]); /* pixel plane only; frame->data[1] is not written here */
+}
+
 static void cdxl_decode_ham6(CDXLVideoContext *c, AVFrame *frame)
 {
     AVCodecContext *avctx = c->avctx;
@@ -231,7 +262,7 @@ static int cdxl_decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR_INVALIDDATA;
     if (c->bpp < 1)
         return AVERROR_INVALIDDATA;
-    if (c->format != BIT_PLANAR && c->format != BIT_LINE) {
+    if (c->format != BIT_PLANAR && c->format != BIT_LINE && c->format != CHUNKY) {
         avpriv_request_sample(avctx, "Pixel format 0x%0x", c->format);
         return AVERROR_PATCHWELCOME;
     }
@@ -239,7 +270,10 @@ static int cdxl_decode_frame(AVCodecContext *avctx, void *data,
     if ((ret = ff_set_dimensions(avctx, w, h)) < 0)
         return ret;
 
-    aligned_width = FFALIGN(c->avctx->width, 16);
+    if (c->format == CHUNKY)
+        aligned_width = avctx->width;
+    else
+        aligned_width = FFALIGN(c->avctx->width, 16);
     c->padded_bits  = aligned_width - c->avctx->width;
     if (c->video_size < aligned_width * avctx->height * c->bpp / 8)
         return AVERROR_INVALIDDATA;
@@ -249,16 +283,17 @@ static int cdxl_decode_frame(AVCodecContext *avctx, void *data,
         if (c->palette_size != (1 << (c->bpp - 1)))
             return AVERROR_INVALIDDATA;
         avctx->pix_fmt = AV_PIX_FMT_BGR24;
+    } else if (!encoding && c->bpp == 24 && c->format == CHUNKY &&
+               !c->palette_size) {
+        avctx->pix_fmt = AV_PIX_FMT_RGB24;
     } else {
-        avpriv_request_sample(avctx, "Encoding %d and bpp %d",
-                              encoding, c->bpp);
+        avpriv_request_sample(avctx, "Encoding %d, bpp %d and format 0x%x",
+                              encoding, c->bpp, c->format);
         return AVERROR_PATCHWELCOME;
     }
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
     p->pict_type = AV_PICTURE_TYPE_I;
 
     if (encoding) {
@@ -270,8 +305,10 @@ static int cdxl_decode_frame(AVCodecContext *avctx, void *data,
             cdxl_decode_ham8(c, p);
         else
             cdxl_decode_ham6(c, p);
-    } else {
+    } else if (avctx->pix_fmt == AV_PIX_FMT_PAL8) {
         cdxl_decode_rgb(c, p);
+    } else {
+        cdxl_decode_raw(c, p);
     }
     *got_frame = 1;
 
@@ -282,7 +319,7 @@ static av_cold int cdxl_decode_end(AVCodecContext *avctx)
 {
     CDXLVideoContext *c = avctx->priv_data;
 
-    av_free(c->new_video);
+    av_freep(&c->new_video);
 
     return 0;
 }
diff --git a/libavcodec/celp_filters.c b/libavcodec/celp_filters.c
index 61474f5..a81fd88 100644
--- a/libavcodec/celp_filters.c
+++ b/libavcodec/celp_filters.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@
 
 #include "avcodec.h"
 #include "celp_filters.h"
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 
 void ff_celp_convolve_circ(int16_t* fc_out, const int16_t* fc_in,
@@ -104,6 +105,8 @@ void ff_celp_lp_synthesis_filterf(float *out, const float *filter_coeffs,
     c -= filter_coeffs[1] * filter_coeffs[0];
     c -= filter_coeffs[0] * b;
 
+    av_assert2((filter_length&1)==0 && filter_length>=4);
+
     old_out0 = out[-4];
     old_out1 = out[-3];
     old_out2 = out[-2];
@@ -205,3 +208,12 @@ void ff_celp_lp_zero_synthesis_filterf(float *out, const float *filter_coeffs,
             out[n] += filter_coeffs[i-1] * in[n-i];
     }
 }
+
+/* Install the generic C filters, then call the MIPS init when HAVE_MIPSFPU is set. */
+void ff_celp_filter_init(CELPFContext *c)
+{
+    c->celp_lp_zero_synthesis_filterf   = ff_celp_lp_zero_synthesis_filterf;
+    c->celp_lp_synthesis_filterf        = ff_celp_lp_synthesis_filterf;
+    if (HAVE_MIPSFPU)
+        ff_celp_filter_init_mips(c);
+}
diff --git a/libavcodec/celp_filters.h b/libavcodec/celp_filters.h
index c328258..f644ec3 100644
--- a/libavcodec/celp_filters.h
+++ b/libavcodec/celp_filters.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,55 @@
 
 #include <stdint.h>
 
+typedef struct CELPFContext { /* function pointers for the float LP synthesis filters */
+    /**
+     * LP synthesis filter.
+     * @param[out] out pointer to output buffer
+     *        - the array out[-filter_length, -1] must
+     *        contain the previous result of this filter
+     * @param filter_coeffs filter coefficients.
+     * @param in input signal
+     * @param buffer_length amount of data to process
+     * @param filter_length filter length (10 for 10th order LP filter). Must be
+     *                      at least 4 and even.
+     *
+     * @note Output buffer must contain filter_length samples of past
+     *       speech data before pointer.
+     *
+     * Routine applies 1/A(z) filter to given speech data.
+     */
+    void (*celp_lp_synthesis_filterf)(float *out, const float *filter_coeffs,
+                                      const float *in, int buffer_length,
+                                      int filter_length);
+
+    /**
+     * LP zero synthesis filter.
+     * @param[out] out pointer to output buffer
+     * @param filter_coeffs filter coefficients.
+     * @param in input signal
+     *        - the array in[-filter_length, -1] must
+     *        contain the previous input of this filter
+     * @param buffer_length amount of data to process (should be a multiple of eight)
+     * @param filter_length filter length (10 for 10th order LP filter;
+     *                                      should be a multiple of two)
+     *
+     * @note Input buffer must contain filter_length samples of past
+     *       input data before pointer.
+     *
+     * Routine applies A(z) filter to given speech data.
+     */
+    void (*celp_lp_zero_synthesis_filterf)(float *out, const float *filter_coeffs,
+                                           const float *in, int buffer_length,
+                                           int filter_length);
+
+}CELPFContext;
+
+/**
+ * Initialize CELPFContext.
+ */
+void ff_celp_filter_init(CELPFContext *c);
+void ff_celp_filter_init_mips(CELPFContext *c);
+
 /**
  * Circularly convolve fixed vector with a phase dispersion impulse
  *        response filter (D.6.2 of G.729 and 6.1.5 of AMR).
diff --git a/libavcodec/celp_math.c b/libavcodec/celp_math.c
index 8a788f5..a96b1ae 100644
--- a/libavcodec/celp_math.c
+++ b/libavcodec/celp_math.c
@@ -3,31 +3,30 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <inttypes.h>
 #include <limits.h>
-#include <assert.h>
 
+#include "libavutil/avassert.h"
 #include "avcodec.h"
-#include "celp_math.h"
 #include "mathops.h"
-
+#include "celp_math.h"
 #include "libavutil/common.h"
 
 static const uint16_t exp2a[]=
@@ -50,7 +49,7 @@ int ff_exp2(uint16_t power)
 {
     unsigned int result= exp2a[power>>10] + 0x10000;
 
-    assert(power <= 0x7fff);
+    av_assert2(power <= 0x7fff);
 
     result= (result<<3) + ((result*exp2b[(power>>5)&31])>>17);
     return result + ((result*(power&31)*89)>>22);
@@ -63,10 +62,17 @@ int ff_exp2(uint16_t power)
  */
 static const uint16_t tab_log2[33] =
 {
+#ifdef G729_BITEXACT
+      0,   1455,   2866,   4236,   5568,   6863,   8124,   9352,
+  10549,  11716,  12855,  13967,  15054,  16117,  17156,  18172,
+  19167,  20142,  21097,  22033,  22951,  23852,  24735,  25603,
+  26455,  27291,  28113,  28922,  29716,  30497,  31266,  32023,  32767,
+#else
       4,   1459,   2870,   4240,   5572,   6867,   8127,   9355,
   10552,  11719,  12858,  13971,  15057,  16120,  17158,  18175,
   19170,  20145,  21100,  22036,  22954,  23854,  24738,  25605,
   26457,  27294,  28116,  28924,  29719,  30500,  31269,  32025,  32769,
+#endif
 };
 
 int ff_log2_q15(uint32_t value)
@@ -99,3 +105,22 @@ int64_t ff_dot_product(const int16_t *a, const int16_t *b, int length)
 
     return sum;
 }
+
+/* Sum of a[k] * b[k] for k in [0, length), accumulated in float in index order. */
+float ff_dot_productf(const float* a, const float* b, int length)
+{
+    float acc = 0;
+
+    for (; length > 0; length--)
+        acc += *a++ * *b++;
+
+    return acc;
+}
+
+/* Set the generic C dot product, then call the MIPS init when HAVE_MIPSFPU is set. */
+void ff_celp_math_init(CELPMContext *c)
+{
+    c->dot_productf = ff_dot_productf;
+    if (HAVE_MIPSFPU)
+        ff_celp_math_init_mips(c);
+}
diff --git a/libavcodec/celp_math.h b/libavcodec/celp_math.h
index 9cebdfe..18888a4 100644
--- a/libavcodec/celp_math.h
+++ b/libavcodec/celp_math.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,25 @@
 
 #include <stdint.h>
 
+typedef struct CELPMContext { /* function pointers for float CELP math helpers */
+    /**
+     * Return the dot product of two float arrays.
+     * @param a first input data array
+     * @param b second input data array
+     * @param length number of elements read from each array
+     *
+     * @return dot product = sum of elementwise products
+     */
+    float (*dot_productf)(const float* a, const float* b, int length);
+
+}CELPMContext;
+
+/**
+ * Initialize CELPMContext.
+ */
+void ff_celp_math_init(CELPMContext *c);
+void ff_celp_math_init_mips(CELPMContext *c);
+
 /**
  * fixed-point implementation of exp2(x) in [0; 1] domain.
  * @param power argument to exp2, 0 <= power <= 0x7fff
@@ -65,4 +84,14 @@ static inline int bidir_sal(int value, int offset)
     else           return value <<  offset;
 }
 
+/**
+ * Return the dot product.
+ * @param a input data array
+ * @param b input data array
+ * @param length number of elements
+ *
+ * @return dot product = sum of elementwise products
+ */
+float ff_dot_productf(const float* a, const float* b, int length);
+
 #endif /* AVCODEC_CELP_MATH_H */
diff --git a/libavcodec/cfhd.c b/libavcodec/cfhd.c
new file mode 100644
index 0000000..74facd4
--- /dev/null
+++ b/libavcodec/cfhd.c
@@ -0,0 +1,797 @@
+/*
+ * Copyright (c) 2015-2016 Kieran Kunhya <kieran@kunhya.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * CFHD Video Decoder
+ */
+
+#include "libavutil/buffer.h"
+#include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/opt.h"
+
+#include "avcodec.h"
+#include "internal.h"
+#include "bytestream.h"
+#include "thread.h"
+#include "cfhd.h"
+
+#define SUBBAND_COUNT 10
+
+static av_cold int cfhd_decode_init(AVCodecContext *avctx)
+{
+    /* Link the private context back to avctx, then hand off to ff_cfhd_init_vlcs(). */
+    CFHDContext *ctx = avctx->priv_data;
+
+    ctx->avctx = avctx;
+    avctx->bits_per_raw_sample = 10;
+    return ff_cfhd_init_vlcs(ctx);
+}
+
+static void init_plane_defaults(CFHDContext *s)
+{
+    /* Zero the level and both subband indices of the parsing state. */
+    s->level              = 0;
+    s->subband_num        = s->subband_num_actual = 0;
+}
+
+static void init_frame_defaults(CFHDContext *s)
+{
+    /* Defaults applied at the start of every cfhd_decode() call. */
+    init_plane_defaults(s);
+    s->coded_width = s->coded_height = 0;
+    s->channel_num       = 0;
+    s->codebook          = 0;
+    s->quantisation      = 1;
+    s->pshift            = 1;
+    s->wavelet_depth     = 3;
+    s->channel_cnt       = 4;
+    s->bpc               = 10;
+    s->subband_cnt       = 10;
+    s->lowpass_precision = 16;
+}
+
+/* TODO: merge with VLC tables or use LUT */
+static inline int dequant_and_decompand(int level, int quantisation)
+{
+    const int64_t mag = abs(level); /* result: (mag + 768 * mag^3 / 255^3) * sign * quantisation */
+    return (mag + ((768 * mag * mag * mag) / (255 * 255 * 255))) * FFSIGN(level) * quantisation;
+}
+
+static inline void filter(int16_t *output, ptrdiff_t out_stride, int16_t *low, ptrdiff_t low_stride,
+                          int16_t *high, ptrdiff_t high_stride, int len, uint8_t clip)
+{
+    /* Each index n turns one low/high pair into two outputs; both ends use one-sided taps. */
+    int16_t pred;
+    int n;
+
+    for (n = 0; n < len; n++) {
+        int16_t *even = &output[(2*n+0)*out_stride];
+        int16_t *odd  = &output[(2*n+1)*out_stride];
+        if (n == 0) {
+            pred  = (11*low[0*low_stride] - 4*low[1*low_stride] + low[2*low_stride] + 4) >> 3;
+            *even = (pred + high[0*high_stride]) >> 1;
+            if (clip)
+                *even = av_clip_uintp2_c(*even, clip);
+            pred  = ( 5*low[0*low_stride] + 4*low[1*low_stride] - low[2*low_stride] + 4) >> 3;
+            *odd  = (pred - high[0*high_stride]) >> 1;
+            if (clip)
+                *odd = av_clip_uintp2_c(*odd, clip);
+        } else if (n == len-1) {
+            pred  = ( 5*low[n*low_stride] + 4*low[(n-1)*low_stride] - low[(n-2)*low_stride] + 4) >> 3;
+            *even = (pred + high[n*high_stride]) >> 1;
+            if (clip)
+                *even = av_clip_uintp2_c(*even, clip);
+            pred  = (11*low[n*low_stride] - 4*low[(n-1)*low_stride] + low[(n-2)*low_stride] + 4) >> 3;
+            *odd  = (pred - high[n*high_stride]) >> 1;
+            if (clip)
+                *odd = av_clip_uintp2_c(*odd, clip);
+        } else {
+            pred  = (low[(n-1)*low_stride] - low[(n+1)*low_stride] + 4) >> 3;
+            *even = (pred + low[n*low_stride] + high[n*high_stride]) >> 1;
+            if (clip)
+                *even = av_clip_uintp2_c(*even, clip);
+            pred  = (low[(n+1)*low_stride] - low[(n-1)*low_stride] + 4) >> 3;
+            *odd  = (pred + low[n*low_stride] - high[n*high_stride]) >> 1;
+            if (clip)
+                *odd = av_clip_uintp2_c(*odd, clip);
+        }
+    }
+}
+
+static void horiz_filter(int16_t *output, int16_t *low, int16_t *high, int width)
+{
+    filter(output, 1, low, 1, high, 1, width, 0); /* unit strides on all three arrays, clip = 0 (no clipping) */
+}
+
+static void horiz_filter_clip(int16_t *output, int16_t *low, int16_t *high, int width, uint8_t clip)
+{
+    filter(output, 1, low, 1, high, 1, width, clip); /* unit strides; clip is forwarded as the av_clip_uintp2_c() bit count */
+}
+
+static void vert_filter(int16_t *output, int out_stride, int16_t *low, int low_stride,
+                        int16_t *high, int high_stride, int len)
+{
+    filter(output, out_stride, low, low_stride, high, high_stride, len, 0); /* caller-supplied strides, clip = 0 (no clipping) */
+}
+
+static void free_buffers(AVCodecContext *avctx)
+{
+    CFHDContext *s = avctx->priv_data;
+    int i, j;
+
+    for (i = 0; i < 4; i++) {
+        av_freep(&s->plane[i].idwt_buf);
+        av_freep(&s->plane[i].idwt_tmp);
+        /* subband[] and l_h[] point into the buffers freed above: clear every
+         * entry (subband[] has 10 slots, not 9) so none is left dangling. */
+        for (j = 0; j < FF_ARRAY_ELEMS(s->plane[i].subband); j++)
+            s->plane[i].subband[j] = NULL;
+        for (j = 0; j < FF_ARRAY_ELEMS(s->plane[i].l_h); j++)
+            s->plane[i].l_h[j] = NULL;
+    }
+    s->a_height = 0; /* zeroed allocated size is what cfhd_decode() tests before using buffers */
+    s->a_width  = 0;
+}
+
+static int alloc_buffers(AVCodecContext *avctx)
+{
+    CFHDContext *s = avctx->priv_data;
+    int i, j, k, ret, planes;
+    /* Allocate per-plane IDWT buffers for coded_width x coded_height; returns 0 or a negative error. */
+    if ((ret = ff_set_dimensions(avctx, s->coded_width, s->coded_height)) < 0)
+        return ret;
+    avctx->pix_fmt = s->coded_format;
+
+    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_x_shift, &s->chroma_y_shift);
+    planes = av_pix_fmt_count_planes(avctx->pix_fmt);
+
+    for (i = 0; i < planes; i++) {
+        int width = i ? avctx->width >> s->chroma_x_shift : avctx->width; /* planes other than 0 use the chroma shifts */
+        int height = i ? avctx->height >> s->chroma_y_shift : avctx->height;
+        int stride = FFALIGN(width / 8, 8) * 8; /* width / 8 aligned up to 8, scaled back by 8 */
+        int w8, h8, w4, h4, w2, h2;
+        height = FFALIGN(height / 8, 2) * 8; /* height / 8 aligned up to 2, scaled back by 8 */
+        s->plane[i].width = width;
+        s->plane[i].height = height;
+        s->plane[i].stride = stride;
+
+        w8 = FFALIGN(s->plane[i].width / 8, 8); /* smallest band size; each following pair doubles it */
+        h8 = FFALIGN(s->plane[i].height / 8, 2);
+        w4 = w8 * 2;
+        h4 = h8 * 2;
+        w2 = w4 * 2;
+        h2 = h4 * 2;
+
+        s->plane[i].idwt_buf = av_mallocz_array(height * stride, sizeof(*s->plane[i].idwt_buf));
+        s->plane[i].idwt_tmp = av_malloc_array(height * stride, sizeof(*s->plane[i].idwt_tmp));
+        if (!s->plane[i].idwt_buf || !s->plane[i].idwt_tmp) {
+            return AVERROR(ENOMEM); /* nothing is released here; the caller runs free_buffers() */
+        }
+
+        s->plane[i].subband[0] = s->plane[i].idwt_buf; /* all ten subband pointers are offsets into idwt_buf */
+        s->plane[i].subband[1] = s->plane[i].idwt_buf + 2 * w8 * h8;
+        s->plane[i].subband[2] = s->plane[i].idwt_buf + 1 * w8 * h8;
+        s->plane[i].subband[3] = s->plane[i].idwt_buf + 3 * w8 * h8;
+        s->plane[i].subband[4] = s->plane[i].idwt_buf + 2 * w4 * h4;
+        s->plane[i].subband[5] = s->plane[i].idwt_buf + 1 * w4 * h4;
+        s->plane[i].subband[6] = s->plane[i].idwt_buf + 3 * w4 * h4;
+        s->plane[i].subband[7] = s->plane[i].idwt_buf + 2 * w2 * h2;
+        s->plane[i].subband[8] = s->plane[i].idwt_buf + 1 * w2 * h2;
+        s->plane[i].subband[9] = s->plane[i].idwt_buf + 3 * w2 * h2;
+
+        for (j = 0; j < DWT_LEVELS; j++) {
+            for(k = 0; k < 4; k++) {
+                s->plane[i].band[j][k].a_width  = w8 << j; /* allocated band size doubles per level */
+                s->plane[i].band[j][k].a_height = h8 << j;
+            }
+        }
+
+        /* ll2 and ll1 commented out because they are done in-place */
+        s->plane[i].l_h[0] = s->plane[i].idwt_tmp; /* l_h[0], [3] and [6] all start at the base of idwt_tmp */
+        s->plane[i].l_h[1] = s->plane[i].idwt_tmp + 2 * w8 * h8;
+        //s->plane[i].l_h[2] = ll2;
+        s->plane[i].l_h[3] = s->plane[i].idwt_tmp;
+        s->plane[i].l_h[4] = s->plane[i].idwt_tmp + 2 * w4 * h4;
+        //s->plane[i].l_h[5] = ll1;
+        s->plane[i].l_h[6] = s->plane[i].idwt_tmp;
+        s->plane[i].l_h[7] = s->plane[i].idwt_tmp + 2 * w2 * h2;
+    }
+
+    s->a_height = s->coded_height; /* remember what was allocated so cfhd_decode() can detect a change */
+    s->a_width  = s->coded_width;
+    s->a_format = s->coded_format;
+
+    return 0;
+}
+
+static int cfhd_decode(AVCodecContext *avctx, void *data, int *got_frame,
+                       AVPacket *avpkt)
+{
+    CFHDContext *s = avctx->priv_data;
+    GetByteContext gb;
+    ThreadFrame frame = { .f = data };
+    AVFrame *pic = data;
+    int ret = 0, i, j, planes, plane, got_buffer = 0;
+    int16_t *coeff_data;
+    /* Parse tag/value pairs, read the bands, rebuild each plane; returns avpkt->size or a negative error. */
+    s->coded_format = AV_PIX_FMT_YUV422P10;
+    init_frame_defaults(s);
+    planes = av_pix_fmt_count_planes(s->coded_format);
+
+    bytestream2_init(&gb, avpkt->data, avpkt->size);
+
+    while (bytestream2_get_bytes_left(&gb) > 4) {
+        /* Bit weird but implement the tag parsing as the spec says */
+        uint16_t tagu   = bytestream2_get_be16(&gb);
+        int16_t tag     = (int16_t)tagu;
+        int8_t tag8     = (int8_t)(tagu >> 8);
+        uint16_t abstag = abs(tag);
+        int8_t abs_tag8 = abs(tag8);
+        uint16_t data   = bytestream2_get_be16(&gb);
+        if (abs_tag8 >= 0x60 && abs_tag8 <= 0x6f) {
+            av_log(avctx, AV_LOG_DEBUG, "large len %x\n", ((tagu & 0xff) << 16) | data);
+        } else if (tag == 20) {
+            av_log(avctx, AV_LOG_DEBUG, "Width %"PRIu16"\n", data);
+            s->coded_width = data;
+        } else if (tag == 21) {
+            av_log(avctx, AV_LOG_DEBUG, "Height %"PRIu16"\n", data);
+            s->coded_height = data;
+        } else if (tag == 101) {
+            av_log(avctx, AV_LOG_DEBUG, "Bits per component: %"PRIu16"\n", data);
+            s->bpc = data;
+        } else if (tag == 12) {
+            av_log(avctx, AV_LOG_DEBUG, "Channel Count: %"PRIu16"\n", data);
+            s->channel_cnt = data;
+            if (data > 4) {
+                av_log(avctx, AV_LOG_ERROR, "Channel Count of %"PRIu16" is unsupported\n", data);
+                ret = AVERROR_PATCHWELCOME;
+                break;
+            }
+        } else if (tag == 14) {
+            av_log(avctx, AV_LOG_DEBUG, "Subband Count: %"PRIu16"\n", data);
+            if (data != SUBBAND_COUNT) {
+                av_log(avctx, AV_LOG_ERROR, "Subband Count of %"PRIu16" is unsupported\n", data);
+                ret = AVERROR_PATCHWELCOME;
+                break;
+            }
+        } else if (tag == 62) {
+            s->channel_num = data;
+            av_log(avctx, AV_LOG_DEBUG, "Channel number %"PRIu16"\n", data);
+            if (s->channel_num >= planes) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid channel number\n");
+                ret = AVERROR(EINVAL);
+                break;
+            }
+            init_plane_defaults(s);
+        } else if (tag == 48) {
+            if (s->subband_num != 0 && data == 1)  // hack
+                s->level++;
+            av_log(avctx, AV_LOG_DEBUG, "Subband number %"PRIu16"\n", data);
+            s->subband_num = data;
+            if (s->level >= DWT_LEVELS) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid level\n");
+                ret = AVERROR(EINVAL);
+                break;
+            }
+            if (s->subband_num > 3) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid subband number\n");
+                ret = AVERROR(EINVAL);
+                break;
+            }
+        } else if (tag == 51) {
+            av_log(avctx, AV_LOG_DEBUG, "Subband number actual %"PRIu16"\n", data);
+            s->subband_num_actual = data;
+            if (s->subband_num_actual >= 10) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid subband number actual\n");
+                ret = AVERROR(EINVAL);
+                break;
+            }
+        } else if (tag == 35)
+            av_log(avctx, AV_LOG_DEBUG, "Lowpass precision bits: %"PRIu16"\n", data);
+        else if (tag == 53) {
+            s->quantisation = data;
+            av_log(avctx, AV_LOG_DEBUG, "Quantisation: %"PRIu16"\n", data);
+        } else if (tag == 109) {
+            s->prescale_shift[0] = (data >> 0) & 0x7;
+            s->prescale_shift[1] = (data >> 3) & 0x7;
+            s->prescale_shift[2] = (data >> 6) & 0x7;
+            av_log(avctx, AV_LOG_DEBUG, "Prescale shift (VC-5): %x\n", data);
+        } else if (tag == 27) {
+            s->plane[s->channel_num].band[0][0].width  = data;
+            s->plane[s->channel_num].band[0][0].stride = data;
+            av_log(avctx, AV_LOG_DEBUG, "Lowpass width %"PRIu16"\n", data);
+            if (data < 2 || data > s->plane[s->channel_num].band[0][0].a_width) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid lowpass width\n");
+                ret = AVERROR(EINVAL);
+                break;
+            }
+        } else if (tag == 28) {
+            s->plane[s->channel_num].band[0][0].height = data;
+            av_log(avctx, AV_LOG_DEBUG, "Lowpass height %"PRIu16"\n", data);
+            if (data < 2 || data > s->plane[s->channel_num].band[0][0].a_height) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid lowpass height\n");
+                ret = AVERROR(EINVAL);
+                break;
+            }
+        } else if (tag == 1)
+            av_log(avctx, AV_LOG_DEBUG, "Sample type? %"PRIu16"\n", data);
+        else if (tag == 10) {
+            if (data != 0) {
+                avpriv_report_missing_feature(avctx, "Transform type of %"PRIu16, data);
+                ret = AVERROR_PATCHWELCOME;
+                break;
+            }
+            av_log(avctx, AV_LOG_DEBUG, "Transform-type? %"PRIu16"\n", data);
+        } else if (abstag >= 0x4000 && abstag <= 0x40ff) {
+            av_log(avctx, AV_LOG_DEBUG, "Small chunk length %d %s\n", data * 4, tag < 0 ? "optional" : "required");
+            bytestream2_skip(&gb, data * 4); /* checked skip: the chunk length comes from the stream */
+        } else if (tag == 23) {
+            av_log(avctx, AV_LOG_DEBUG, "Skip frame\n");
+            avpriv_report_missing_feature(avctx, "Skip frame");
+            ret = AVERROR_PATCHWELCOME;
+            break;
+        } else if (tag == 2) {
+            av_log(avctx, AV_LOG_DEBUG, "tag=2 header - skipping %i tag/value pairs\n", data);
+            if (data > bytestream2_get_bytes_left(&gb) / 4) {
+                av_log(avctx, AV_LOG_ERROR, "too many tag/value pairs (%d)\n", data);
+                ret = AVERROR_INVALIDDATA;
+                break;
+            }
+            for (i = 0; i < data; i++) {
+                uint16_t tag2 = bytestream2_get_be16(&gb);
+                uint16_t val2 = bytestream2_get_be16(&gb);
+                av_log(avctx, AV_LOG_DEBUG, "Tag/Value = %x %x\n", tag2, val2);
+            }
+        } else if (tag == 41) {
+            s->plane[s->channel_num].band[s->level][s->subband_num].width  = data;
+            s->plane[s->channel_num].band[s->level][s->subband_num].stride = FFALIGN(data, 8);
+            av_log(avctx, AV_LOG_DEBUG, "Highpass width %i channel %i level %i subband %i\n", data, s->channel_num, s->level, s->subband_num);
+            if (data < 2) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid highpass width\n");
+                ret = AVERROR(EINVAL);
+                break;
+            }
+        } else if (tag == 42) {
+            s->plane[s->channel_num].band[s->level][s->subband_num].height = data;
+            av_log(avctx, AV_LOG_DEBUG, "Highpass height %i\n", data);
+            if (data < 2) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid highpass height\n");
+                ret = AVERROR(EINVAL);
+                break;
+            }
+        } else if (tag == 49) {
+            s->plane[s->channel_num].band[s->level][s->subband_num].width  = data;
+            s->plane[s->channel_num].band[s->level][s->subband_num].stride = FFALIGN(data, 8);
+            av_log(avctx, AV_LOG_DEBUG, "Highpass width2 %i\n", data);
+            if (data < 2) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid highpass width2\n");
+                ret = AVERROR(EINVAL);
+                break;
+            }
+        } else if (tag == 50) {
+            s->plane[s->channel_num].band[s->level][s->subband_num].height = data;
+            av_log(avctx, AV_LOG_DEBUG, "Highpass height2 %i\n", data);
+            if (data < 2) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid highpass height2\n");
+                ret = AVERROR(EINVAL);
+                break;
+            }
+        } else if (tag == 71) {
+            s->codebook = data;
+            av_log(avctx, AV_LOG_DEBUG, "Codebook %i\n", s->codebook);
+        } else if (tag == 72) {
+            s->codebook = data;
+            av_log(avctx, AV_LOG_DEBUG, "Other codebook? %i\n", s->codebook);
+        } else if (tag == 70) {
+            av_log(avctx, AV_LOG_DEBUG, "Subsampling or bit-depth flag? %i\n", data);
+            s->bpc = data;
+            if (!(s->bpc == 10 || s->bpc == 12)) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid bits per channel\n");
+                ret = AVERROR(EINVAL);
+                break;
+            }
+        } else if (tag == 84) {
+            av_log(avctx, AV_LOG_DEBUG, "Sample format? %i\n", data);
+            if (data == 1)
+                s->coded_format = AV_PIX_FMT_YUV422P10;
+            else if (data == 3)
+                s->coded_format = AV_PIX_FMT_GBRP12;
+            else if (data == 4)
+                s->coded_format = AV_PIX_FMT_GBRAP12;
+            else {
+                avpriv_report_missing_feature(avctx, "Sample format of %"PRIu16, data);
+                ret = AVERROR_PATCHWELCOME;
+                break;
+            }
+            planes = av_pix_fmt_count_planes(s->coded_format);
+        } else
+            av_log(avctx, AV_LOG_DEBUG,  "Unknown tag %i data %x\n", tag, data);
+
+        /* Some kind of end of header tag */
+        if (tag == 4 && data == 0x1a4a && s->coded_width && s->coded_height &&
+            s->coded_format != AV_PIX_FMT_NONE) {
+            if (s->a_width != s->coded_width || s->a_height != s->coded_height ||
+                s->a_format != s->coded_format) {
+                free_buffers(avctx);
+                if ((ret = alloc_buffers(avctx)) < 0) {
+                    free_buffers(avctx);
+                    return ret;
+                }
+            }
+            ret = ff_set_dimensions(avctx, s->coded_width, s->coded_height);
+            if (ret < 0)
+                return ret;
+            frame.f->width =
+            frame.f->height = 0;
+
+            if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
+                return ret;
+
+            s->coded_width = 0;
+            s->coded_height = 0;
+            s->coded_format = AV_PIX_FMT_NONE;
+            got_buffer = 1;
+        }
+        coeff_data = s->plane[s->channel_num].subband[s->subband_num_actual];
+
+        /* Lowpass coefficients */
+        if (tag == 4 && data == 0xf0f && s->a_width && s->a_height) {
+            int lowpass_height = s->plane[s->channel_num].band[0][0].height;
+            int lowpass_width  = s->plane[s->channel_num].band[0][0].width;
+            int lowpass_a_height = s->plane[s->channel_num].band[0][0].a_height;
+            int lowpass_a_width  = s->plane[s->channel_num].band[0][0].a_width;
+
+            if (!got_buffer) {
+                av_log(avctx, AV_LOG_ERROR, "No end of header tag found\n");
+                ret = AVERROR(EINVAL);
+                goto end;
+            }
+
+            if (lowpass_height > lowpass_a_height || lowpass_width > lowpass_a_width ||
+                lowpass_width * lowpass_height * sizeof(int16_t) > bytestream2_get_bytes_left(&gb)) {
+                av_log(avctx, AV_LOG_ERROR, "Too many lowpass coefficients\n");
+                ret = AVERROR(EINVAL);
+                goto end;
+            }
+
+            av_log(avctx, AV_LOG_DEBUG, "Start of lowpass coeffs component %d height:%d, width:%d\n", s->channel_num, lowpass_height, lowpass_width);
+            for (i = 0; i < lowpass_height; i++) {
+                for (j = 0; j < lowpass_width; j++)
+                    coeff_data[j] = bytestream2_get_be16u(&gb);
+
+                coeff_data += lowpass_width;
+            }
+
+            /* Align to mod-4 position to continue reading tags */
+            bytestream2_seek(&gb, bytestream2_tell(&gb) & 3, SEEK_CUR);
+            /* Copy last line of coefficients if odd height. NOTE(review): coeff_data was
+             * already advanced row by row in the loop above - confirm these offsets. */
+            if (lowpass_height & 1) {
+                memcpy(&coeff_data[lowpass_height * lowpass_width],
+                       &coeff_data[(lowpass_height - 1) * lowpass_width],
+                       lowpass_width * sizeof(*coeff_data));
+            }
+
+            av_log(avctx, AV_LOG_DEBUG, "Lowpass coefficients %d\n", lowpass_width * lowpass_height);
+        }
+
+        if (tag == 55 && s->subband_num_actual != 255 && s->a_width && s->a_height) {
+            int highpass_height = s->plane[s->channel_num].band[s->level][s->subband_num].height;
+            int highpass_width  = s->plane[s->channel_num].band[s->level][s->subband_num].width;
+            int highpass_a_width = s->plane[s->channel_num].band[s->level][s->subband_num].a_width;
+            int highpass_a_height = s->plane[s->channel_num].band[s->level][s->subband_num].a_height;
+            int highpass_stride = s->plane[s->channel_num].band[s->level][s->subband_num].stride;
+            int expected = highpass_height * highpass_stride;
+            int a_expected = highpass_a_height * highpass_a_width;
+            int level, run, coeff;
+            int count = 0, bytes;
+
+            if (!got_buffer) {
+                av_log(avctx, AV_LOG_ERROR, "No end of header tag found\n");
+                ret = AVERROR(EINVAL);
+                goto end;
+            }
+
+            if (highpass_height > highpass_a_height || highpass_width > highpass_a_width || a_expected < expected) {
+                av_log(avctx, AV_LOG_ERROR, "Too many highpass coefficents\n");
+                ret = AVERROR(EINVAL);
+                goto end;
+            }
+
+            av_log(avctx, AV_LOG_DEBUG, "Start subband coeffs plane %i level %i codebook %i expected %i\n", s->channel_num, s->level, s->codebook, expected);
+
+            init_get_bits(&s->gb, gb.buffer, bytestream2_get_bytes_left(&gb) * 8);
+            {
+                OPEN_READER(re, &s->gb);
+                if (!s->codebook) {
+                    while (1) {
+                        UPDATE_CACHE(re, &s->gb);
+                        GET_RL_VLC(level, run, re, &s->gb, s->table_9_rl_vlc,
+                                   VLC_BITS, 3, 1);
+
+                        /* escape */
+                        if (level == 64)
+                            break;
+
+                        count += run;
+
+                        if (count > expected)
+                            break;
+
+                        coeff = dequant_and_decompand(level, s->quantisation);
+                        for (i = 0; i < run; i++)
+                            *coeff_data++ = coeff;
+                    }
+                } else {
+                    while (1) {
+                        UPDATE_CACHE(re, &s->gb);
+                        GET_RL_VLC(level, run, re, &s->gb, s->table_18_rl_vlc,
+                                   VLC_BITS, 3, 1);
+
+                        /* escape */
+                        if (level == 255 && run == 2)
+                            break;
+
+                        count += run;
+
+                        if (count > expected)
+                            break;
+
+                        coeff = dequant_and_decompand(level, s->quantisation);
+                        for (i = 0; i < run; i++)
+                            *coeff_data++ = coeff;
+                    }
+                }
+                CLOSE_READER(re, &s->gb);
+            }
+
+            if (count > expected) {
+                av_log(avctx, AV_LOG_ERROR, "Escape codeword not found, probably corrupt data\n");
+                ret = AVERROR(EINVAL);
+                goto end;
+            }
+
+            bytes = FFALIGN(FF_CEIL_RSHIFT(get_bits_count(&s->gb), 3), 4);
+            if (bytes > bytestream2_get_bytes_left(&gb)) {
+                av_log(avctx, AV_LOG_ERROR, "Bitstream overread error\n");
+                ret = AVERROR(EINVAL);
+                goto end;
+            } else
+                bytestream2_seek(&gb, bytes, SEEK_CUR);
+
+            av_log(avctx, AV_LOG_DEBUG, "End subband coeffs %i extra %i\n", count, count - expected);
+            s->codebook = 0;
+            /* Copy last line of coefficients if odd height. NOTE(review): coeff_data was
+             * already advanced by the run-length loops above - confirm these offsets. */
+            if (highpass_height & 1) {
+                memcpy(&coeff_data[highpass_height * highpass_stride],
+                       &coeff_data[(highpass_height - 1) * highpass_stride],
+                       highpass_stride * sizeof(*coeff_data));
+            }
+        }
+    }
+
+    if (!s->a_width || !s->a_height || s->a_format == AV_PIX_FMT_NONE ||
+        s->coded_width || s->coded_height || s->coded_format != AV_PIX_FMT_NONE) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid dimensions\n");
+        ret = AVERROR(EINVAL);
+        goto end;
+    }
+
+    if (!got_buffer) {
+        av_log(avctx, AV_LOG_ERROR, "No end of header tag found\n");
+        ret = AVERROR(EINVAL);
+        goto end;
+    }
+
+    planes = av_pix_fmt_count_planes(avctx->pix_fmt);
+    for (plane = 0; plane < planes && !ret; plane++) {
+        /* level 1 */
+        int lowpass_height  = s->plane[plane].band[0][0].height;
+        int lowpass_width   = s->plane[plane].band[0][0].width;
+        int highpass_stride = s->plane[plane].band[0][1].stride;
+        int act_plane = plane == 1 ? 2 : plane == 2 ? 1 : plane;
+        int16_t *low, *high, *output, *dst;
+
+        if (lowpass_height > s->plane[plane].band[0][0].a_height || lowpass_width > s->plane[plane].band[0][0].a_width ||
+            !highpass_stride || s->plane[plane].band[0][1].width > s->plane[plane].band[0][1].a_width) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid plane dimensions\n");
+            ret = AVERROR(EINVAL);
+            goto end;
+        }
+
+        av_log(avctx, AV_LOG_DEBUG, "Decoding level 1 plane %i %i %i %i\n", plane, lowpass_height, lowpass_width, highpass_stride);
+
+        low    = s->plane[plane].subband[0];
+        high   = s->plane[plane].subband[2];
+        output = s->plane[plane].l_h[0];
+        for (i = 0; i < lowpass_width; i++) {
+            vert_filter(output, lowpass_width, low, lowpass_width, high, highpass_stride, lowpass_height);
+            low++;
+            high++;
+            output++;
+        }
+
+        low    = s->plane[plane].subband[1];
+        high   = s->plane[plane].subband[3];
+        output = s->plane[plane].l_h[1];
+
+        for (i = 0; i < lowpass_width; i++) {
+            // note the stride of "low" is highpass_stride
+            vert_filter(output, lowpass_width, low, highpass_stride, high, highpass_stride, lowpass_height);
+            low++;
+            high++;
+            output++;
+        }
+
+        low    = s->plane[plane].l_h[0];
+        high   = s->plane[plane].l_h[1];
+        output = s->plane[plane].subband[0];
+        for (i = 0; i < lowpass_height * 2; i++) {
+            horiz_filter(output, low, high, lowpass_width);
+            low    += lowpass_width;
+            high   += lowpass_width;
+            output += lowpass_width * 2;
+        }
+        if (s->bpc == 12) {
+            output = s->plane[plane].subband[0];
+            for (i = 0; i < lowpass_height * 2; i++) {
+                for (j = 0; j < lowpass_width * 2; j++)
+                    output[j] <<= 2;
+
+                output += lowpass_width * 2;
+            }
+        }
+
+        /* level 2 */
+        lowpass_height  = s->plane[plane].band[1][1].height;
+        lowpass_width   = s->plane[plane].band[1][1].width;
+        highpass_stride = s->plane[plane].band[1][1].stride;
+
+        if (lowpass_height > s->plane[plane].band[1][1].a_height || lowpass_width > s->plane[plane].band[1][1].a_width ||
+            !highpass_stride || s->plane[plane].band[1][1].width > s->plane[plane].band[1][1].a_width) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid plane dimensions\n");
+            ret = AVERROR(EINVAL);
+            goto end;
+        }
+
+        av_log(avctx, AV_LOG_DEBUG, "Level 2 plane %i %i %i %i\n", plane, lowpass_height, lowpass_width, highpass_stride);
+
+        low    = s->plane[plane].subband[0];
+        high   = s->plane[plane].subband[5];
+        output = s->plane[plane].l_h[3];
+        for (i = 0; i < lowpass_width; i++) {
+            vert_filter(output, lowpass_width, low, lowpass_width, high, highpass_stride, lowpass_height);
+            low++;
+            high++;
+            output++;
+        }
+
+        low    = s->plane[plane].subband[4];
+        high   = s->plane[plane].subband[6];
+        output = s->plane[plane].l_h[4];
+        for (i = 0; i < lowpass_width; i++) {
+            vert_filter(output, lowpass_width, low, highpass_stride, high, highpass_stride, lowpass_height);
+            low++;
+            high++;
+            output++;
+        }
+
+        low    = s->plane[plane].l_h[3];
+        high   = s->plane[plane].l_h[4];
+        output = s->plane[plane].subband[0];
+        for (i = 0; i < lowpass_height * 2; i++) {
+            horiz_filter(output, low, high, lowpass_width);
+            low    += lowpass_width;
+            high   += lowpass_width;
+            output += lowpass_width * 2;
+        }
+
+        output = s->plane[plane].subband[0];
+        for (i = 0; i < lowpass_height * 2; i++) {
+            for (j = 0; j < lowpass_width * 2; j++)
+                output[j] <<= 2;
+
+            output += lowpass_width * 2;
+        }
+
+        /* level 3 */
+        lowpass_height  = s->plane[plane].band[2][1].height;
+        lowpass_width   = s->plane[plane].band[2][1].width;
+        highpass_stride = s->plane[plane].band[2][1].stride;
+
+        if (lowpass_height > s->plane[plane].band[2][1].a_height || lowpass_width > s->plane[plane].band[2][1].a_width ||
+            !highpass_stride || s->plane[plane].band[2][1].width > s->plane[plane].band[2][1].a_width) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid plane dimensions\n");
+            ret = AVERROR(EINVAL);
+            goto end;
+        }
+
+        av_log(avctx, AV_LOG_DEBUG, "Level 3 plane %i %i %i %i\n", plane, lowpass_height, lowpass_width, highpass_stride);
+
+        low    = s->plane[plane].subband[0];
+        high   = s->plane[plane].subband[8];
+        output = s->plane[plane].l_h[6];
+        for (i = 0; i < lowpass_width; i++) {
+            vert_filter(output, lowpass_width, low, lowpass_width, high, highpass_stride, lowpass_height);
+            low++;
+            high++;
+            output++;
+        }
+
+        low    = s->plane[plane].subband[7];
+        high   = s->plane[plane].subband[9];
+        output = s->plane[plane].l_h[7];
+        for (i = 0; i < lowpass_width; i++) {
+            vert_filter(output, lowpass_width, low, highpass_stride, high, highpass_stride, lowpass_height);
+            low++;
+            high++;
+            output++;
+        }
+
+        dst = (int16_t *)pic->data[act_plane];
+        low  = s->plane[plane].l_h[6];
+        high = s->plane[plane].l_h[7];
+        for (i = 0; i < lowpass_height * 2; i++) {
+            horiz_filter_clip(dst, low, high, lowpass_width, s->bpc);
+            low  += lowpass_width;
+            high += lowpass_width;
+            dst  += pic->linesize[act_plane] / 2;
+        }
+    }
+
+
+end:
+    if (ret < 0)
+        return ret;
+
+    *got_frame = 1;
+    return avpkt->size;
+}
+
+static av_cold int cfhd_close_decoder(AVCodecContext *avctx)
+{
+    CFHDContext *ctx = avctx->priv_data;
+
+    free_buffers(avctx);
+    /* Contexts flagged is_copy skip the VLC teardown; all others free both tables. */
+    if (avctx->internal->is_copy)
+        return 0;
+
+    ff_free_vlc(&ctx->vlc_9);
+    ff_free_vlc(&ctx->vlc_18);
+    return 0;
+}
+
+AVCodec ff_cfhd_decoder = {
+    .name           = "cfhd",
+    .long_name      = NULL_IF_CONFIG_SMALL("Cineform HD"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_CFHD,
+    .priv_data_size = sizeof(CFHDContext),
+    .init           = cfhd_decode_init,   /* sets bits_per_raw_sample and calls ff_cfhd_init_vlcs() */
+    .close          = cfhd_close_decoder, /* releases the plane buffers and, unless is_copy, the VLCs */
+    .decode         = cfhd_decode,        /* one packet in, at most one frame out */
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE | FF_CODEC_CAP_INIT_CLEANUP,
+};
diff --git a/libavcodec/cfhd.h b/libavcodec/cfhd.h
new file mode 100644
index 0000000..67a0e4c
--- /dev/null
+++ b/libavcodec/cfhd.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2015 Kieran Kunhya
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_CFHD_H
+#define AVCODEC_CFHD_H
+
+#include <stdint.h>
+
+#include "libavutil/avassert.h"
+
+#include "avcodec.h"
+#include "get_bits.h"
+
+#define VLC_BITS 9 /* nb_bits argument given to init_vlc() for both codebooks */
+#define NB_VLC_TABLE_9 (71+3) /* entry count of every table_9_vlc_* array in cfhddata.c */
+#define NB_VLC_TABLE_18 (263+1) /* entry count of every table_18_vlc_* array in cfhddata.c */
+
+typedef struct CFHD_RL_VLC_ELEM { /* one run/level table entry, filled by ff_cfhd_init_vlcs() */
+    int16_t level; /* signed level; holds the raw VLC table code instead when len < 0 */
+    int8_t len;    /* copied from VLC.table[i][1]; negative means more bits are needed */
+    uint16_t run;  /* run paired with level; set to 0 when len < 0 */
+} CFHD_RL_VLC_ELEM;
+
+#define DWT_LEVELS 3 /* first dimension of Plane.band[][] */
+
+typedef struct SubBand { /* element type of Plane.band[DWT_LEVELS][4] */
+    int level;
+    int orientation;
+    int stride;
+    int a_width;  /* NOTE(review): a_* presumably an allocated/aligned size - confirm in cfhd.c */
+    int width;
+    int a_height;
+    int height;
+    int pshift;
+    int quant;
+    uint8_t *ibuf;
+} SubBand;
+
+typedef struct Plane { /* per-plane decoder state; CFHDContext holds four of these */
+    int width;
+    int height;
+    ptrdiff_t stride;
+
+    int16_t *idwt_buf;
+    int16_t *idwt_tmp;
+
+    /* TODO: merge this into SubBand structure */
+    int16_t *subband[10]; /* coefficient buffers read by vert_filter() in cfhd_decode(), e.g. [7] and [9] */
+    int16_t *l_h[8];      /* filter intermediates, e.g. [7] is written by vert_filter() and read by horiz_filter_clip() */
+
+    SubBand band[DWT_LEVELS][4]; /* NOTE(review): presumably indexed [level][orientation] - confirm */
+} Plane;
+
+typedef struct CFHDContext { /* decoder private context (AVCodec.priv_data_size is sizeof this) */
+    AVCodecContext *avctx;
+
+    CFHD_RL_VLC_ELEM table_9_rl_vlc[2088]; /* filled by ff_cfhd_init_vlcs(), one element per vlc_9.table entry */
+    VLC vlc_9;                             /* built by init_vlc() in ff_cfhd_init_vlcs(); freed in cfhd_close_decoder() */
+
+    CFHD_RL_VLC_ELEM table_18_rl_vlc[4572]; /* as above for vlc_18; 4572 is asserted against vlc_18.table_size */
+    VLC vlc_18;
+
+    GetBitContext gb;
+
+    int chroma_x_shift;
+    int chroma_y_shift;
+
+    int coded_width;
+    int coded_height;
+    int coded_format;
+
+    int a_width;
+    int a_height;
+    int a_format;
+
+    int bpc; /* passed to horiz_filter_clip() as its last argument in cfhd_decode() */
+    int channel_cnt;
+    int subband_cnt;
+    int channel_num;
+    uint8_t lowpass_precision;
+    uint16_t quantisation;
+    int wavelet_depth;
+    int pshift;
+
+    int codebook;
+    int subband_num;
+    int level;
+    int subband_num_actual;
+
+    uint8_t prescale_shift[3];
+    Plane plane[4]; /* indexed by plane number in cfhd_decode() */
+
+} CFHDContext;
+
+int ff_cfhd_init_vlcs(CFHDContext *s); /* builds s->vlc_9/vlc_18 and both *_rl_vlc tables; negative on init_vlc() failure */
+
+#endif /* AVCODEC_CFHD_H */
diff --git a/libavcodec/cfhddata.c b/libavcodec/cfhddata.c
new file mode 100644
index 0000000..9330d34
--- /dev/null
+++ b/libavcodec/cfhddata.c
@@ -0,0 +1,323 @@
+/*
+ * Copyright (c) 2015 Kieran Kunhya
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "stdint.h"
+#include "cfhd.h"
+
+/* Special table-9 codewords (the last three table_9_vlc_bits/_len entries); not sure what they all mean */
+#define TABLE_9_BAND_END1 0x1C7859E
+#define TABLE_9_BAND_END_LEN1 25
+#define TABLE_9_BAND_END2 0x38F0B3F
+#define TABLE_9_BAND_END_LEN2 26
+#define TABLE_9_BAND_END3 0x38F0B3E
+#define TABLE_9_BAND_END_LEN3 26
+
+static const uint8_t table_9_vlc_len[NB_VLC_TABLE_9] = { /* codeword lengths in bits, one per table_9_vlc_bits entry */
+     1,    2,    4,    5,    5,    5,    6,    6,
+     6,    7,    7,    8,    8,    8,    8,    9,
+     9,    9,    9,    9,   10,   10,   11,   11,
+    11,   11,   12,   12,   12,   12,   13,   13,
+    13,   14,   14,   14,   14,   14,   14,   15,
+    15,   15,   15,   16,   16,   16,   16,   17,
+    17,   17,   17,   17,   18,   18,   18,   19,
+    19,   19,   20,   20,   20,   20,   20,   22,
+    23,   23,   23,   23,   24,   24,   24,   25,
+    26,   26,
+};
+
+static const uint32_t table_9_vlc_bits[NB_VLC_TABLE_9] = { /* codeword bit patterns before ff_cfhd_init_vlcs() appends a sign bit */
+            0,       0x2,       0xc,      0x1a,      0x1d,      0x1e,      0x39,      0x3e,
+         0x37,      0x7e,      0x6c,      0xe2,      0xfe,      0xdb,      0xe0,     0x1c3,
+        0x1c6,     0x1ff,     0x1fe,     0x1b5,     0x369,     0x385,     0x71d,     0x6d0,
+        0x708,     0x71f,     0xe3d,     0xe39,     0xe13,     0xe12,    0x1c71,    0x1b45,
+       0x1b47,    0x3689,    0x38f2,    0x38e1,    0x38e0,    0x38f1,    0x3688,    0x6d1b,
+       0x71e0,    0x6d19,    0x71e7,    0xe3cd,    0xda35,    0xda30,    0xe3c3,   0x1b469,
+      0x1b462,   0x1c798,   0x1b463,   0x1c799,   0x38f08,   0x38f09,   0x38f0a,   0x6d1a0,
+      0x6d1a3,   0x6d1a1,   0xda345,   0xda344,   0xe3c2d,   0xe3c2f,   0xe3c2e,  0x38f0b2,
+     0x71e160,  0x71e162,  0x71e166,  0x71e161,  0xe3c2ce,  0xe3c2c6,  0xe3c2c7, 0x1C7859E,
+    0x38F0B3F, 0x38F0B3E,
+};
+
+static const uint16_t table_9_vlc_run[NB_VLC_TABLE_9] = { /* run per codeword; ends up in CFHD_RL_VLC_ELEM.run */
+    1,    1,    1,    1,   12,    1,   32,  160,
+    1,    1,    1,  320,    1,    1,   80,  120,
+    1,    1,  100,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1
+};
+
+static const uint8_t table_9_vlc_level[NB_VLC_TABLE_9] = { /* unsigned level per codeword; 0 entries get no negative twin */
+     0,    1,    2,    3,    0,    4,    0,    0,
+     5,    7,    6,    0,    9,    8,    0,    0,
+    11,   12,    0,   10,   13,   14,   17,   15,
+    16,   18,   22,   21,   20,   19,   25,   23,
+    24,   27,   31,   29,   28,   30,   26,   33,
+    34,   32,   35,   39,   37,   36,   38,   42,
+    40,   43,   41,   44,   45,   46,   47,   48,
+    50,   49,   52,   51,   53,   55,   54,   56,
+    57,   59,   60,   58,   61,   62,   63,   64,
+    64,   64,
+};
+
+static const uint32_t table_18_vlc_bits[NB_VLC_TABLE_18] = { /* codeword bit patterns before ff_cfhd_init_vlcs() appends a sign bit */
+            0,       0x2,       0x7,      0x19,      0x30,      0x36,      0x6f,      0x63,
+         0x69,      0x6b,      0xd1,      0xd4,      0xdc,     0x189,     0x18a,     0x1a0,
+        0x1ab,     0x377,     0x310,     0x316,     0x343,     0x354,     0x375,     0x623,
+        0x684,     0x685,     0x6ab,     0x6ec,     0xddb,     0xc5c,     0xc5e,     0xc44,
+        0xd55,     0xdd1,     0xdd3,    0x1bb5,    0x188b,    0x18bb,    0x18bf,    0x1aa8,
+       0x1ba0,    0x1ba5,    0x1ba4,    0x3115,    0x3175,    0x317d,    0x3553,    0x3768,
+       0x6e87,    0x6ed3,    0x62e8,    0x62f8,    0x6228,    0x6aa4,    0x6e85,    0xc453,
+       0xc5d3,    0xc5f3,    0xdda4,    0xdd08,    0xdd0c,   0x1bb4b,   0x1bb4a,   0x18ba5,
+      0x18be5,   0x1aa95,   0x1aa97,   0x188a4,   0x1ba13,   0x31748,   0x317c8,   0x35528,
+      0x3552c,   0x37424,   0x37434,   0x37436,   0x62294,   0x62e92,   0x62f92,   0x6aa52,
+      0x6aa5a,   0x6e86a,   0x6e86e,   0x6e84a,   0xc452a,   0xc5d27,   0xc5f26,   0xd54a6,
+      0xd54b6,   0xdd096,   0xdd0d6,   0xdd0de,  0x188a56,  0x18ba4d,  0x18be4e,  0x18be4f,
+     0x1aa96e,  0x1ba12e,  0x1ba12f,  0x1ba1af,  0x1ba1bf,  0x37435d,  0x37437d,  0x317498,
+     0x35529c,  0x35529d,  0x3552de,  0x3552df,  0x62e933,  0x62295d,  0x6aa53d,  0x6aa53f,
+     0x6aa53e,  0x6e86b9,  0x6e86f8,  0xd54a79,  0xc5d265,  0xc452b8,  0xdd0d71,  0xd54a78,
+     0xdd0d70,  0xdd0df2,  0xdd0df3, 0x188a5f6, 0x188a5f5, 0x188a5f4, 0x188a5f3, 0x188a5f2,
+    0x188a5f1, 0x188a5f0, 0x188a5ef, 0x188a5ee, 0x188a5ed, 0x188a5aa, 0x188a5e3, 0x188a5df,
+    0x188a589, 0x188a5dd, 0x188a578, 0x188a5e0, 0x188a588, 0x188a5d6, 0x188a5db, 0x188a5e1,
+    0x188a587, 0x188a59a, 0x188a5c4, 0x188a5ec, 0x188a586, 0x188a573, 0x188a59c, 0x188a5c8,
+    0x188a5fb, 0x188a5a1, 0x188a5eb, 0x188a5a8, 0x188a584, 0x188a5d2, 0x188a599, 0x188a598,
+    0x188a583, 0x18ba4c9, 0x188a5d0, 0x188a594, 0x188a582, 0x188a5cb, 0x188a5d8, 0x188a5e7,
+    0x188a581, 0x188a5ea, 0x188a5a9, 0x188a5a6, 0x188a580, 0x188a5a0, 0x188a59d, 0x188a5c3,
+    0x188a57f, 0x188a5c0, 0x188a5de, 0x188a5d4, 0x188a57e, 0x188a5c2, 0x188a592, 0x188a5cd,
+    0x188a57d, 0x188a5a3, 0x188a5e8, 0x188a5a2, 0x188a57c, 0x188a58e, 0x188a5b3, 0x188a5b2,
+    0x188a5b1, 0x188a5b0, 0x188a5af, 0x188a5ae, 0x188a5ad, 0x188a5ac, 0x188a5ab, 0x188a5da,
+    0x188a5e4, 0x188a5e5, 0x188a5d9, 0x188a5b5, 0x188a5bc, 0x188a5bd, 0x188a5e9, 0x188a5cc,
+    0x188a585, 0x188a5d3, 0x188a5e2, 0x188a595, 0x188a596, 0x188a5b8, 0x188a590, 0x188a5c9,
+    0x188a5a4, 0x188a5e6, 0x188a5a5, 0x188a5ce, 0x188a5bf, 0x188a572, 0x188a59b, 0x188a5be,
+    0x188a5c7, 0x188a5ca, 0x188a5d5, 0x188a57b, 0x188a58d, 0x188a58c, 0x188a58b, 0x188a58a,
+    0x18ba4c8, 0x188a5c5, 0x188a5fa, 0x188a5bb, 0x188a5c1, 0x188a5cf, 0x188a5b9, 0x188a5b6,
+    0x188a597, 0x188a5fe, 0x188a5d7, 0x188a5ba, 0x188a591, 0x188a5c6, 0x188a5dc, 0x188a57a,
+    0x188a59f, 0x188a5f9, 0x188a5b4, 0x188a5a7, 0x188a58f, 0x188a5fd, 0x188a5b7, 0x188a593,
+    0x188a59e, 0x188a5f8, 0x188a5ff, 0x188a5fc, 0x188a579, 0x188a5f7, 0x3114ba2, 0x3114ba3,
+};
+
+static const uint8_t table_18_vlc_len[NB_VLC_TABLE_18] = { /* codeword lengths in bits, one per table_18_vlc_bits entry */
+     1,  2,  3,  5,  6,  6,  7,  7,
+     7,  7,  8,  8,  8,  9,  9,  9,
+     9, 10, 10, 10, 10, 10, 10, 11,
+    11, 11, 11, 11, 12, 12, 12, 12,
+    12, 12, 12, 13, 13, 13, 13, 13,
+    13, 13, 13, 14, 14, 14, 14, 14,
+    15, 15, 15, 15, 15, 15, 15, 16,
+    16, 16, 16, 16, 16, 17, 17, 17,
+    17, 17, 17, 17, 17, 18, 18, 18,
+    18, 18, 18, 18, 19, 19, 19, 19,
+    19, 19, 19, 19, 20, 20, 20, 20,
+    20, 20, 20, 20, 21, 21, 21, 21,
+    21, 21, 21, 21, 21, 22, 22, 22,
+    22, 22, 22, 22, 23, 23, 23, 23,
+    23, 23, 23, 24, 24, 24, 24, 24,
+    24, 24, 24, 25, 25, 25, 25, 25,
+    25, 25, 25, 25, 25, 25, 25, 25,
+    25, 25, 25, 25, 25, 25, 25, 25,
+    25, 25, 25, 25, 25, 25, 25, 25,
+    25, 25, 25, 25, 25, 25, 25, 25,
+    25, 25, 25, 25, 25, 25, 25, 25,
+    25, 25, 25, 25, 25, 25, 25, 25,
+    25, 25, 25, 25, 25, 25, 25, 25,
+    25, 25, 25, 25, 25, 25, 25, 25,
+    25, 25, 25, 25, 25, 25, 25, 25,
+    25, 25, 25, 25, 25, 25, 25, 25,
+    25, 25, 25, 25, 25, 25, 25, 25,
+    25, 25, 25, 25, 25, 25, 25, 25,
+    25, 25, 25, 25, 25, 25, 25, 25,
+    25, 25, 25, 25, 25, 25, 25, 25,
+    25, 25, 25, 25, 25, 25, 25, 25,
+    25, 25, 25, 25, 25, 25, 25, 25,
+    25, 25, 25, 25, 25, 25, 26, 26,
+};
+
+static const uint16_t table_18_vlc_run[NB_VLC_TABLE_18] = { /* run per codeword; ends up in CFHD_RL_VLC_ELEM.run */
+    1,    1,    1,    1,    1,    1,    1,    1,
+   12,    1,   20,    1,    1,    1,   32,    1,
+    1,    1,    1,    1,   60,    1,    1,    1,
+    1,  100,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,  180,    1,
+    1,  320,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    1,
+    1,    1,    1,    1,    1,    1,    1,    2,
+};
+
+static const uint8_t table_18_vlc_level[NB_VLC_TABLE_18] = { /* unsigned level per codeword; 0 entries get no negative twin */
+      0,    1,    2,    3,    4,    5,    8,    6,
+      0,    7,    0,    9,   10,   11,    0,   12,
+     13,   18,   14,   15,    0,   16,   17,   19,
+     20,    0,   21,   22,   29,   24,   25,   23,
+     26,   27,   28,   35,   30,   31,    0,   32,
+     33,    0,   34,   36,   37,   38,   39,   40,
+     46,   47,   42,   43,   41,   44,   45,   48,
+     49,   50,   53,   51,   52,   61,   60,   55,
+     56,   57,   58,   54,   59,   62,   63,   64,
+     65,   66,   67,   68,   69,   70,   71,   72,
+     73,   75,   76,   74,   77,   78,   79,   80,
+     81,   82,   83,   84,   85,   86,   87,   88,
+     89,   90,   91,   92,   93,   99,  100,   94,
+     95,   96,   97,   98,  102,  101,  103,  105,
+    104,  106,  107,  111,  109,  108,  113,  110,
+    112,  114,  115,  225,  189,  188,  203,  202,
+    197,  207,  169,  223,  159,  235,  152,  192,
+    179,  201,  172,  149,  178,  120,  219,  150,
+    127,  211,  125,  158,  247,  238,  163,  228,
+    183,  217,  168,  122,  128,  249,  187,  186,
+    136,  181,  255,  230,  135,  233,  222,  145,
+    134,  167,  248,  209,  243,  216,  164,  140,
+    157,  239,  191,  251,  156,  139,  242,  133,
+    162,  213,  165,  212,  227,  198,  236,  234,
+    117,  215,  124,  123,  254,  253,  148,  218,
+    146,  147,  224,  143,  184,  185,  166,  132,
+    129,  250,  151,  119,  193,  176,  245,  229,
+    206,  144,  208,  137,  241,  237,  190,  240,
+    131,  232,  252,  171,  205,  204,  118,  214,
+    180,  126,  182,  175,  141,  138,  177,  153,
+    194,  160,  121,  174,  246,  130,  200,  170,
+    221,  196,  142,  210,  199,  155,  154,  244,
+    220,  195,  161,  231,  173,  226,  116,  255,
+};
+
+av_cold int ff_cfhd_init_vlcs(CFHDContext *s)
+{
+    int i, j, ret = 0;
+    uint32_t new_cfhd_vlc_bits[NB_VLC_TABLE_18 * 2];  /* scratch arrays, reused for both tables; */
+    uint8_t  new_cfhd_vlc_len[NB_VLC_TABLE_18 * 2];   /* x2 because each source code can expand  */
+    uint16_t new_cfhd_vlc_run[NB_VLC_TABLE_18 * 2];   /* into a positive and a negative entry    */
+    int16_t  new_cfhd_vlc_level[NB_VLC_TABLE_18 * 2];
+
+    /** Similar to dv.c, generate signed VLC tables; returns a negative init_vlc() error, else its last result **/
+    /* Table 9 */
+    for (i = 0, j = 0; i < NB_VLC_TABLE_9; i++, j++) {
+        new_cfhd_vlc_bits[j]  = table_9_vlc_bits[i];
+        new_cfhd_vlc_len[j]   = table_9_vlc_len[i];
+        new_cfhd_vlc_run[j]   = table_9_vlc_run[i];
+        new_cfhd_vlc_level[j] = table_9_vlc_level[i];
+
+        /* Zero levels and the table's final (escape) codeword get no sign bit; every other code is doubled */
+        if (table_9_vlc_level[i] &&
+            new_cfhd_vlc_bits[j] != table_9_vlc_bits[NB_VLC_TABLE_9-1]) {
+            new_cfhd_vlc_bits[j] <<= 1; /* positive variant: appended sign bit 0 */
+            new_cfhd_vlc_len[j]++;
+            j++;
+            new_cfhd_vlc_bits[j]  = (table_9_vlc_bits[i] << 1) | 1; /* negative variant: appended sign bit 1 */
+            new_cfhd_vlc_len[j]   =  table_9_vlc_len[i] + 1;
+            new_cfhd_vlc_run[j]   =  table_9_vlc_run[i];
+            new_cfhd_vlc_level[j] = -table_9_vlc_level[i];
+        }
+    }
+
+    ret = init_vlc(&s->vlc_9, VLC_BITS, j, new_cfhd_vlc_len,
+                   1, 1, new_cfhd_vlc_bits, 4, 4, 0);
+    if (ret < 0)
+        return ret;
+    av_assert0(s->vlc_9.table_size <= (int)(sizeof(s->table_9_rl_vlc) / sizeof(s->table_9_rl_vlc[0]))); /* guards the fill loop */
+    for (i = 0; i < s->vlc_9.table_size; i++) {
+        int code = s->vlc_9.table[i][0];
+        int len  = s->vlc_9.table[i][1];
+        int level, run;
+
+        if (len < 0) { // more bits needed
+            run   = 0;
+            level = code;
+        } else {
+            run   = new_cfhd_vlc_run[code];
+            level = new_cfhd_vlc_level[code];
+        }
+        s->table_9_rl_vlc[i].len   = len;
+        s->table_9_rl_vlc[i].level = level;
+        s->table_9_rl_vlc[i].run   = run;
+    }
+
+    /* Table 18 */
+    for (i = 0, j = 0; i < NB_VLC_TABLE_18; i++, j++) {
+        new_cfhd_vlc_bits[j]  = table_18_vlc_bits[i];
+        new_cfhd_vlc_len[j]   = table_18_vlc_len[i];
+        new_cfhd_vlc_run[j]   = table_18_vlc_run[i];
+        new_cfhd_vlc_level[j] = table_18_vlc_level[i];
+
+        /* Zero levels and the table's final (escape) codeword get no sign bit; every other code is doubled */
+        if (table_18_vlc_level[i] &&
+            new_cfhd_vlc_bits[j] != table_18_vlc_bits[NB_VLC_TABLE_18-1]) {
+            new_cfhd_vlc_bits[j] <<= 1; /* positive variant: appended sign bit 0 */
+            new_cfhd_vlc_len[j]++;
+            j++;
+            new_cfhd_vlc_bits[j]  = (table_18_vlc_bits[i] << 1) | 1; /* negative variant: appended sign bit 1 */
+            new_cfhd_vlc_len[j]   =  table_18_vlc_len[i] + 1;
+            new_cfhd_vlc_run[j]   =  table_18_vlc_run[i];
+            new_cfhd_vlc_level[j] = -table_18_vlc_level[i];
+        }
+    }
+
+    ret = init_vlc(&s->vlc_18, VLC_BITS, j, new_cfhd_vlc_len,
+                   1, 1, new_cfhd_vlc_bits, 4, 4, 0);
+    if (ret < 0)
+        return ret;
+    av_assert0(s->vlc_18.table_size == 4572); /* must equal the table_18_rl_vlc capacity */
+
+    for (i = 0; i < s->vlc_18.table_size; i++) {
+        int code = s->vlc_18.table[i][0];
+        int len  = s->vlc_18.table[i][1];
+        int level, run;
+
+        if (len < 0) { // more bits needed
+            run   = 0;
+            level = code;
+        } else {
+            run   = new_cfhd_vlc_run[code];
+            level = new_cfhd_vlc_level[code];
+        }
+        s->table_18_rl_vlc[i].len   = len;
+        s->table_18_rl_vlc[i].level = level;
+        s->table_18_rl_vlc[i].run   = run;
+    }
+
+    return ret;
+}
diff --git a/libavcodec/cga_data.c b/libavcodec/cga_data.c
index 2c63ff2..023a86b 100644
--- a/libavcodec/cga_data.c
+++ b/libavcodec/cga_data.c
@@ -1,435 +1,46 @@
 /*
  * CGA/EGA/VGA ROM data
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
  * CGA/EGA/VGA ROM data
+ * @note fonts are in libavutil/xga_font_data.[ch]
  */
 
 #include <stdint.h>
 #include "cga_data.h"
 
-const uint8_t ff_cga_font[2048] = {
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0x81, 0xa5, 0x81, 0xbd, 0x99, 0x81, 0x7e,
- 0x7e, 0xff, 0xdb, 0xff, 0xc3, 0xe7, 0xff, 0x7e, 0x6c, 0xfe, 0xfe, 0xfe, 0x7c, 0x38, 0x10, 0x00,
- 0x10, 0x38, 0x7c, 0xfe, 0x7c, 0x38, 0x10, 0x00, 0x38, 0x7c, 0x38, 0xfe, 0xfe, 0x7c, 0x38, 0x7c,
- 0x10, 0x10, 0x38, 0x7c, 0xfe, 0x7c, 0x38, 0x7c, 0x00, 0x00, 0x18, 0x3c, 0x3c, 0x18, 0x00, 0x00,
- 0xff, 0xff, 0xe7, 0xc3, 0xc3, 0xe7, 0xff, 0xff, 0x00, 0x3c, 0x66, 0x42, 0x42, 0x66, 0x3c, 0x00,
- 0xff, 0xc3, 0x99, 0xbd, 0xbd, 0x99, 0xc3, 0xff, 0x0f, 0x07, 0x0f, 0x7d, 0xcc, 0xcc, 0xcc, 0x78,
- 0x3c, 0x66, 0x66, 0x66, 0x3c, 0x18, 0x7e, 0x18, 0x3f, 0x33, 0x3f, 0x30, 0x30, 0x70, 0xf0, 0xe0,
- 0x7f, 0x63, 0x7f, 0x63, 0x63, 0x67, 0xe6, 0xc0, 0x99, 0x5a, 0x3c, 0xe7, 0xe7, 0x3c, 0x5a, 0x99,
- 0x80, 0xe0, 0xf8, 0xfe, 0xf8, 0xe0, 0x80, 0x00, 0x02, 0x0e, 0x3e, 0xfe, 0x3e, 0x0e, 0x02, 0x00,
- 0x18, 0x3c, 0x7e, 0x18, 0x18, 0x7e, 0x3c, 0x18, 0x66, 0x66, 0x66, 0x66, 0x66, 0x00, 0x66, 0x00,
- 0x7f, 0xdb, 0xdb, 0x7b, 0x1b, 0x1b, 0x1b, 0x00, 0x3e, 0x63, 0x38, 0x6c, 0x6c, 0x38, 0xcc, 0x78,
- 0x00, 0x00, 0x00, 0x00, 0x7e, 0x7e, 0x7e, 0x00, 0x18, 0x3c, 0x7e, 0x18, 0x7e, 0x3c, 0x18, 0xff,
- 0x18, 0x3c, 0x7e, 0x18, 0x18, 0x18, 0x18, 0x00, 0x18, 0x18, 0x18, 0x18, 0x7e, 0x3c, 0x18, 0x00,
- 0x00, 0x18, 0x0c, 0xfe, 0x0c, 0x18, 0x00, 0x00, 0x00, 0x30, 0x60, 0xfe, 0x60, 0x30, 0x00, 0x00,
- 0x00, 0x00, 0xc0, 0xc0, 0xc0, 0xfe, 0x00, 0x00, 0x00, 0x24, 0x66, 0xff, 0x66, 0x24, 0x00, 0x00,
- 0x00, 0x18, 0x3c, 0x7e, 0xff, 0xff, 0x00, 0x00, 0x00, 0xff, 0xff, 0x7e, 0x3c, 0x18, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x78, 0x78, 0x30, 0x30, 0x00, 0x30, 0x00,
- 0x6c, 0x6c, 0x6c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x6c, 0x6c, 0xfe, 0x6c, 0xfe, 0x6c, 0x6c, 0x00,
- 0x30, 0x7c, 0xc0, 0x78, 0x0c, 0xf8, 0x30, 0x00, 0x00, 0xc6, 0xcc, 0x18, 0x30, 0x66, 0xc6, 0x00,
- 0x38, 0x6c, 0x38, 0x76, 0xdc, 0xcc, 0x76, 0x00, 0x60, 0x60, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x18, 0x30, 0x60, 0x60, 0x60, 0x30, 0x18, 0x00, 0x60, 0x30, 0x18, 0x18, 0x18, 0x30, 0x60, 0x00,
- 0x00, 0x66, 0x3c, 0xff, 0x3c, 0x66, 0x00, 0x00, 0x00, 0x30, 0x30, 0xfc, 0x30, 0x30, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x30, 0x60, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x30, 0x00, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 0x80, 0x00,
- 0x7c, 0xc6, 0xce, 0xde, 0xf6, 0xe6, 0x7c, 0x00, 0x30, 0x70, 0x30, 0x30, 0x30, 0x30, 0xfc, 0x00,
- 0x78, 0xcc, 0x0c, 0x38, 0x60, 0xcc, 0xfc, 0x00, 0x78, 0xcc, 0x0c, 0x38, 0x0c, 0xcc, 0x78, 0x00,
- 0x1c, 0x3c, 0x6c, 0xcc, 0xfe, 0x0c, 0x1e, 0x00, 0xfc, 0xc0, 0xf8, 0x0c, 0x0c, 0xcc, 0x78, 0x00,
- 0x38, 0x60, 0xc0, 0xf8, 0xcc, 0xcc, 0x78, 0x00, 0xfc, 0xcc, 0x0c, 0x18, 0x30, 0x30, 0x30, 0x00,
- 0x78, 0xcc, 0xcc, 0x78, 0xcc, 0xcc, 0x78, 0x00, 0x78, 0xcc, 0xcc, 0x7c, 0x0c, 0x18, 0x70, 0x00,
- 0x00, 0x30, 0x30, 0x00, 0x00, 0x30, 0x30, 0x00, 0x00, 0x30, 0x30, 0x00, 0x00, 0x30, 0x30, 0x60,
- 0x18, 0x30, 0x60, 0xc0, 0x60, 0x30, 0x18, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, 0xfc, 0x00, 0x00,
- 0x60, 0x30, 0x18, 0x0c, 0x18, 0x30, 0x60, 0x00, 0x78, 0xcc, 0x0c, 0x18, 0x30, 0x00, 0x30, 0x00,
- 0x7c, 0xc6, 0xde, 0xde, 0xde, 0xc0, 0x78, 0x00, 0x30, 0x78, 0xcc, 0xcc, 0xfc, 0xcc, 0xcc, 0x00,
- 0xfc, 0x66, 0x66, 0x7c, 0x66, 0x66, 0xfc, 0x00, 0x3c, 0x66, 0xc0, 0xc0, 0xc0, 0x66, 0x3c, 0x00,
- 0xf8, 0x6c, 0x66, 0x66, 0x66, 0x6c, 0xf8, 0x00, 0xfe, 0x62, 0x68, 0x78, 0x68, 0x62, 0xfe, 0x00,
- 0xfe, 0x62, 0x68, 0x78, 0x68, 0x60, 0xf0, 0x00, 0x3c, 0x66, 0xc0, 0xc0, 0xce, 0x66, 0x3e, 0x00,
- 0xcc, 0xcc, 0xcc, 0xfc, 0xcc, 0xcc, 0xcc, 0x00, 0x78, 0x30, 0x30, 0x30, 0x30, 0x30, 0x78, 0x00,
- 0x1e, 0x0c, 0x0c, 0x0c, 0xcc, 0xcc, 0x78, 0x00, 0xe6, 0x66, 0x6c, 0x78, 0x6c, 0x66, 0xe6, 0x00,
- 0xf0, 0x60, 0x60, 0x60, 0x62, 0x66, 0xfe, 0x00, 0xc6, 0xee, 0xfe, 0xfe, 0xd6, 0xc6, 0xc6, 0x00,
- 0xc6, 0xe6, 0xf6, 0xde, 0xce, 0xc6, 0xc6, 0x00, 0x38, 0x6c, 0xc6, 0xc6, 0xc6, 0x6c, 0x38, 0x00,
- 0xfc, 0x66, 0x66, 0x7c, 0x60, 0x60, 0xf0, 0x00, 0x78, 0xcc, 0xcc, 0xcc, 0xdc, 0x78, 0x1c, 0x00,
- 0xfc, 0x66, 0x66, 0x7c, 0x6c, 0x66, 0xe6, 0x00, 0x78, 0xcc, 0xe0, 0x70, 0x1c, 0xcc, 0x78, 0x00,
- 0xfc, 0xb4, 0x30, 0x30, 0x30, 0x30, 0x78, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xfc, 0x00,
- 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x78, 0x30, 0x00, 0xc6, 0xc6, 0xc6, 0xd6, 0xfe, 0xee, 0xc6, 0x00,
- 0xc6, 0xc6, 0x6c, 0x38, 0x38, 0x6c, 0xc6, 0x00, 0xcc, 0xcc, 0xcc, 0x78, 0x30, 0x30, 0x78, 0x00,
- 0xfe, 0xc6, 0x8c, 0x18, 0x32, 0x66, 0xfe, 0x00, 0x78, 0x60, 0x60, 0x60, 0x60, 0x60, 0x78, 0x00,
- 0xc0, 0x60, 0x30, 0x18, 0x0c, 0x06, 0x02, 0x00, 0x78, 0x18, 0x18, 0x18, 0x18, 0x18, 0x78, 0x00,
- 0x10, 0x38, 0x6c, 0xc6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff,
- 0x30, 0x30, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0x76, 0x00,
- 0xe0, 0x60, 0x60, 0x7c, 0x66, 0x66, 0xdc, 0x00, 0x00, 0x00, 0x78, 0xcc, 0xc0, 0xcc, 0x78, 0x00,
- 0x1c, 0x0c, 0x0c, 0x7c, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x78, 0xcc, 0xfc, 0xc0, 0x78, 0x00,
- 0x38, 0x6c, 0x60, 0xf0, 0x60, 0x60, 0xf0, 0x00, 0x00, 0x00, 0x76, 0xcc, 0xcc, 0x7c, 0x0c, 0xf8,
- 0xe0, 0x60, 0x6c, 0x76, 0x66, 0x66, 0xe6, 0x00, 0x30, 0x00, 0x70, 0x30, 0x30, 0x30, 0x78, 0x00,
- 0x0c, 0x00, 0x0c, 0x0c, 0x0c, 0xcc, 0xcc, 0x78, 0xe0, 0x60, 0x66, 0x6c, 0x78, 0x6c, 0xe6, 0x00,
- 0x70, 0x30, 0x30, 0x30, 0x30, 0x30, 0x78, 0x00, 0x00, 0x00, 0xcc, 0xfe, 0xfe, 0xd6, 0xc6, 0x00,
- 0x00, 0x00, 0xf8, 0xcc, 0xcc, 0xcc, 0xcc, 0x00, 0x00, 0x00, 0x78, 0xcc, 0xcc, 0xcc, 0x78, 0x00,
- 0x00, 0x00, 0xdc, 0x66, 0x66, 0x7c, 0x60, 0xf0, 0x00, 0x00, 0x76, 0xcc, 0xcc, 0x7c, 0x0c, 0x1e,
- 0x00, 0x00, 0xdc, 0x76, 0x66, 0x60, 0xf0, 0x00, 0x00, 0x00, 0x7c, 0xc0, 0x78, 0x0c, 0xf8, 0x00,
- 0x10, 0x30, 0x7c, 0x30, 0x30, 0x34, 0x18, 0x00, 0x00, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0x76, 0x00,
- 0x00, 0x00, 0xcc, 0xcc, 0xcc, 0x78, 0x30, 0x00, 0x00, 0x00, 0xc6, 0xd6, 0xfe, 0xfe, 0x6c, 0x00,
- 0x00, 0x00, 0xc6, 0x6c, 0x38, 0x6c, 0xc6, 0x00, 0x00, 0x00, 0xcc, 0xcc, 0xcc, 0x7c, 0x0c, 0xf8,
- 0x00, 0x00, 0xfc, 0x98, 0x30, 0x64, 0xfc, 0x00, 0x1c, 0x30, 0x30, 0xe0, 0x30, 0x30, 0x1c, 0x00,
- 0x18, 0x18, 0x18, 0x00, 0x18, 0x18, 0x18, 0x00, 0xe0, 0x30, 0x30, 0x1c, 0x30, 0x30, 0xe0, 0x00,
- 0x76, 0xdc, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x6c, 0xc6, 0xc6, 0xfe, 0x00,
- 0x78, 0xcc, 0xc0, 0xcc, 0x78, 0x18, 0x0c, 0x78, 0x00, 0xcc, 0x00, 0xcc, 0xcc, 0xcc, 0x7e, 0x00,
- 0x1c, 0x00, 0x78, 0xcc, 0xfc, 0xc0, 0x78, 0x00, 0x7e, 0xc3, 0x3c, 0x06, 0x3e, 0x66, 0x3f, 0x00,
- 0xcc, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0x7e, 0x00, 0xe0, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0x7e, 0x00,
- 0x30, 0x30, 0x78, 0x0c, 0x7c, 0xcc, 0x7e, 0x00, 0x00, 0x00, 0x78, 0xc0, 0xc0, 0x78, 0x0c, 0x38,
- 0x7e, 0xc3, 0x3c, 0x66, 0x7e, 0x60, 0x3c, 0x00, 0xcc, 0x00, 0x78, 0xcc, 0xfc, 0xc0, 0x78, 0x00,
- 0xe0, 0x00, 0x78, 0xcc, 0xfc, 0xc0, 0x78, 0x00, 0xcc, 0x00, 0x70, 0x30, 0x30, 0x30, 0x78, 0x00,
- 0x7c, 0xc6, 0x38, 0x18, 0x18, 0x18, 0x3c, 0x00, 0xe0, 0x00, 0x70, 0x30, 0x30, 0x30, 0x78, 0x00,
- 0xc6, 0x38, 0x6c, 0xc6, 0xfe, 0xc6, 0xc6, 0x00, 0x30, 0x30, 0x00, 0x78, 0xcc, 0xfc, 0xcc, 0x00,
- 0x1c, 0x00, 0xfc, 0x60, 0x78, 0x60, 0xfc, 0x00, 0x00, 0x00, 0x7f, 0x0c, 0x7f, 0xcc, 0x7f, 0x00,
- 0x3e, 0x6c, 0xcc, 0xfe, 0xcc, 0xcc, 0xce, 0x00, 0x78, 0xcc, 0x00, 0x78, 0xcc, 0xcc, 0x78, 0x00,
- 0x00, 0xcc, 0x00, 0x78, 0xcc, 0xcc, 0x78, 0x00, 0x00, 0xe0, 0x00, 0x78, 0xcc, 0xcc, 0x78, 0x00,
- 0x78, 0xcc, 0x00, 0xcc, 0xcc, 0xcc, 0x7e, 0x00, 0x00, 0xe0, 0x00, 0xcc, 0xcc, 0xcc, 0x7e, 0x00,
- 0x00, 0xcc, 0x00, 0xcc, 0xcc, 0x7c, 0x0c, 0xf8, 0xc3, 0x18, 0x3c, 0x66, 0x66, 0x3c, 0x18, 0x00,
- 0xcc, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0x78, 0x00, 0x18, 0x18, 0x7e, 0xc0, 0xc0, 0x7e, 0x18, 0x18,
- 0x38, 0x6c, 0x64, 0xf0, 0x60, 0xe6, 0xfc, 0x00, 0xcc, 0xcc, 0x78, 0xfc, 0x30, 0xfc, 0x30, 0x30,
- 0xf8, 0xcc, 0xcc, 0xfa, 0xc6, 0xcf, 0xc6, 0xc7, 0x0e, 0x1b, 0x18, 0x3c, 0x18, 0x18, 0xd8, 0x70,
- 0x1c, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0x7e, 0x00, 0x38, 0x00, 0x70, 0x30, 0x30, 0x30, 0x78, 0x00,
- 0x00, 0x1c, 0x00, 0x78, 0xcc, 0xcc, 0x78, 0x00, 0x00, 0x1c, 0x00, 0xcc, 0xcc, 0xcc, 0x7e, 0x00,
- 0x00, 0xf8, 0x00, 0xf8, 0xcc, 0xcc, 0xcc, 0x00, 0xfc, 0x00, 0xcc, 0xec, 0xfc, 0xdc, 0xcc, 0x00,
- 0x3c, 0x6c, 0x6c, 0x3e, 0x00, 0x7e, 0x00, 0x00, 0x38, 0x6c, 0x6c, 0x38, 0x00, 0x7c, 0x00, 0x00,
- 0x30, 0x00, 0x30, 0x60, 0xc0, 0xcc, 0x78, 0x00, 0x00, 0x00, 0x00, 0xfc, 0xc0, 0xc0, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0xfc, 0x0c, 0x0c, 0x00, 0x00, 0xc3, 0xc6, 0xcc, 0xde, 0x33, 0x66, 0xcc, 0x0f,
- 0xc3, 0xc6, 0xcc, 0xdb, 0x37, 0x6f, 0xcf, 0x03, 0x18, 0x18, 0x00, 0x18, 0x18, 0x18, 0x18, 0x00,
- 0x00, 0x33, 0x66, 0xcc, 0x66, 0x33, 0x00, 0x00, 0x00, 0xcc, 0x66, 0x33, 0x66, 0xcc, 0x00, 0x00,
- 0x22, 0x88, 0x22, 0x88, 0x22, 0x88, 0x22, 0x88, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa,
- 0xdb, 0x77, 0xdb, 0xee, 0xdb, 0x77, 0xdb, 0xee, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x18, 0x18, 0xf8, 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8, 0x18, 0xf8, 0x18, 0x18, 0x18,
- 0x36, 0x36, 0x36, 0x36, 0xf6, 0x36, 0x36, 0x36, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x36, 0x36, 0x36,
- 0x00, 0x00, 0xf8, 0x18, 0xf8, 0x18, 0x18, 0x18, 0x36, 0x36, 0xf6, 0x06, 0xf6, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x00, 0x00, 0xfe, 0x06, 0xf6, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0xf6, 0x06, 0xfe, 0x00, 0x00, 0x00, 0x36, 0x36, 0x36, 0x36, 0xfe, 0x00, 0x00, 0x00,
- 0x18, 0x18, 0xf8, 0x18, 0xf8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x18, 0x18, 0x1f, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x18, 0xff, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0xff, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x1f, 0x18, 0x18, 0x18,
- 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x18, 0xff, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x1f, 0x18, 0x1f, 0x18, 0x18, 0x18, 0x36, 0x36, 0x36, 0x36, 0x37, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x37, 0x30, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0x30, 0x37, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0xf7, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0xf7, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x37, 0x30, 0x37, 0x36, 0x36, 0x36, 0x00, 0x00, 0xff, 0x00, 0xff, 0x00, 0x00, 0x00,
- 0x36, 0x36, 0xf7, 0x00, 0xf7, 0x36, 0x36, 0x36, 0x18, 0x18, 0xff, 0x00, 0xff, 0x00, 0x00, 0x00,
- 0x36, 0x36, 0x36, 0x36, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0xff, 0x18, 0x18, 0x18,
- 0x00, 0x00, 0x00, 0x00, 0xff, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x3f, 0x00, 0x00, 0x00,
- 0x18, 0x18, 0x1f, 0x18, 0x1f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0x18, 0x1f, 0x18, 0x18, 0x18,
- 0x00, 0x00, 0x00, 0x00, 0x3f, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0xff, 0x36, 0x36, 0x36,
- 0x18, 0x18, 0xff, 0x18, 0xff, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x1f, 0x18, 0x18, 0x18, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
- 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x76, 0xdc, 0xc8, 0xdc, 0x76, 0x00, 0x00, 0x78, 0xcc, 0xf8, 0xcc, 0xf8, 0xc0, 0xc0,
- 0x00, 0xfc, 0xcc, 0xc0, 0xc0, 0xc0, 0xc0, 0x00, 0x00, 0xfe, 0x6c, 0x6c, 0x6c, 0x6c, 0x6c, 0x00,
- 0xfc, 0xcc, 0x60, 0x30, 0x60, 0xcc, 0xfc, 0x00, 0x00, 0x00, 0x7e, 0xd8, 0xd8, 0xd8, 0x70, 0x00,
- 0x00, 0x66, 0x66, 0x66, 0x66, 0x7c, 0x60, 0xc0, 0x00, 0x76, 0xdc, 0x18, 0x18, 0x18, 0x18, 0x00,
- 0xfc, 0x30, 0x78, 0xcc, 0xcc, 0x78, 0x30, 0xfc, 0x38, 0x6c, 0xc6, 0xfe, 0xc6, 0x6c, 0x38, 0x00,
- 0x38, 0x6c, 0xc6, 0xc6, 0x6c, 0x6c, 0xee, 0x00, 0x1c, 0x30, 0x18, 0x7c, 0xcc, 0xcc, 0x78, 0x00,
- 0x00, 0x00, 0x7e, 0xdb, 0xdb, 0x7e, 0x00, 0x00, 0x06, 0x0c, 0x7e, 0xdb, 0xdb, 0x7e, 0x60, 0xc0,
- 0x38, 0x60, 0xc0, 0xf8, 0xc0, 0x60, 0x38, 0x00, 0x78, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x00,
- 0x00, 0xfc, 0x00, 0xfc, 0x00, 0xfc, 0x00, 0x00, 0x30, 0x30, 0xfc, 0x30, 0x30, 0x00, 0xfc, 0x00,
- 0x60, 0x30, 0x18, 0x30, 0x60, 0x00, 0xfc, 0x00, 0x18, 0x30, 0x60, 0x30, 0x18, 0x00, 0xfc, 0x00,
- 0x0e, 0x1b, 0x1b, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xd8, 0xd8, 0x70,
- 0x30, 0x30, 0x00, 0xfc, 0x00, 0x30, 0x30, 0x00, 0x00, 0x76, 0xdc, 0x00, 0x76, 0xdc, 0x00, 0x00,
- 0x38, 0x6c, 0x6c, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0f, 0x0c, 0x0c, 0x0c, 0xec, 0x6c, 0x3c, 0x1c,
- 0x78, 0x6c, 0x6c, 0x6c, 0x6c, 0x00, 0x00, 0x00, 0x70, 0x18, 0x30, 0x60, 0x78, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3c, 0x3c, 0x3c, 0x3c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-};
-
-const uint8_t ff_vga16_font[4096] = {
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7e, 0x81, 0xa5, 0x81, 0x81, 0xbd, 0x99, 0x81, 0x81, 0x7e, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7e, 0xff, 0xdb, 0xff, 0xff, 0xc3, 0xe7, 0xff, 0xff, 0x7e, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x6c, 0xfe, 0xfe, 0xfe, 0xfe, 0x7c, 0x38, 0x10, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x7c, 0xfe, 0x7c, 0x38, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x18, 0x3c, 0x3c, 0xe7, 0xe7, 0xe7, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x18, 0x3c, 0x7e, 0xff, 0xff, 0x7e, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x3c, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xe7, 0xc3, 0xc3, 0xe7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, 0x66, 0x42, 0x42, 0x66, 0x3c, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xc3, 0x99, 0xbd, 0xbd, 0x99, 0xc3, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0x00, 0x00, 0x1e, 0x0e, 0x1a, 0x32, 0x78, 0xcc, 0xcc, 0xcc, 0xcc, 0x78, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3c, 0x66, 0x66, 0x66, 0x66, 0x3c, 0x18, 0x7e, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3f, 0x33, 0x3f, 0x30, 0x30, 0x30, 0x30, 0x70, 0xf0, 0xe0, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7f, 0x63, 0x7f, 0x63, 0x63, 0x63, 0x63, 0x67, 0xe7, 0xe6, 0xc0, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x18, 0x18, 0xdb, 0x3c, 0xe7, 0x3c, 0xdb, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfe, 0xf8, 0xf0, 0xe0, 0xc0, 0x80, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x02, 0x06, 0x0e, 0x1e, 0x3e, 0xfe, 0x3e, 0x1e, 0x0e, 0x06, 0x02, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x18, 0x3c, 0x7e, 0x18, 0x18, 0x18, 0x7e, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x00, 0x66, 0x66, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7f, 0xdb, 0xdb, 0xdb, 0x7b, 0x1b, 0x1b, 0x1b, 0x1b, 0x1b, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x7c, 0xc6, 0x60, 0x38, 0x6c, 0xc6, 0xc6, 0x6c, 0x38, 0x0c, 0xc6, 0x7c, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0xfe, 0xfe, 0xfe, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x18, 0x3c, 0x7e, 0x18, 0x18, 0x18, 0x7e, 0x3c, 0x18, 0x7e, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x18, 0x3c, 0x7e, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x7e, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x0c, 0xfe, 0x0c, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x60, 0xfe, 0x60, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, 0xc0, 0xc0, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x24, 0x66, 0xff, 0x66, 0x24, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x38, 0x7c, 0x7c, 0xfe, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0xfe, 0xfe, 0x7c, 0x7c, 0x38, 0x38, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x18, 0x3c, 0x3c, 0x3c, 0x18, 0x18, 0x18, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x66, 0x66, 0x66, 0x24, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x6c, 0x6c, 0xfe, 0x6c, 0x6c, 0x6c, 0xfe, 0x6c, 0x6c, 0x00, 0x00, 0x00, 0x00,
- 0x18, 0x18, 0x7c, 0xc6, 0xc2, 0xc0, 0x7c, 0x06, 0x06, 0x86, 0xc6, 0x7c, 0x18, 0x18, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0xc2, 0xc6, 0x0c, 0x18, 0x30, 0x60, 0xc6, 0x86, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x38, 0x6c, 0x6c, 0x38, 0x76, 0xdc, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x30, 0x30, 0x30, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x0c, 0x18, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x18, 0x0c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x30, 0x18, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x18, 0x30, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x66, 0x3c, 0xff, 0x3c, 0x66, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x7e, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x30, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x02, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 0x80, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3c, 0x66, 0xc3, 0xc3, 0xdb, 0xdb, 0xc3, 0xc3, 0x66, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x18, 0x38, 0x78, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x7e, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7c, 0xc6, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 0xc6, 0xfe, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7c, 0xc6, 0x06, 0x06, 0x3c, 0x06, 0x06, 0x06, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x0c, 0x1c, 0x3c, 0x6c, 0xcc, 0xfe, 0x0c, 0x0c, 0x0c, 0x1e, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xfe, 0xc0, 0xc0, 0xc0, 0xfc, 0x06, 0x06, 0x06, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x38, 0x60, 0xc0, 0xc0, 0xfc, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xfe, 0xc6, 0x06, 0x06, 0x0c, 0x18, 0x30, 0x30, 0x30, 0x30, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0x7e, 0x06, 0x06, 0x06, 0x0c, 0x78, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x18, 0x18, 0x30, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x06, 0x0c, 0x18, 0x30, 0x60, 0x30, 0x18, 0x0c, 0x06, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x60, 0x30, 0x18, 0x0c, 0x06, 0x0c, 0x18, 0x30, 0x60, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0x0c, 0x18, 0x18, 0x18, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xde, 0xde, 0xde, 0xdc, 0xc0, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x10, 0x38, 0x6c, 0xc6, 0xc6, 0xfe, 0xc6, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xfc, 0x66, 0x66, 0x66, 0x7c, 0x66, 0x66, 0x66, 0x66, 0xfc, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3c, 0x66, 0xc2, 0xc0, 0xc0, 0xc0, 0xc0, 0xc2, 0x66, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xf8, 0x6c, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x6c, 0xf8, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xfe, 0x66, 0x62, 0x68, 0x78, 0x68, 0x60, 0x62, 0x66, 0xfe, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xfe, 0x66, 0x62, 0x68, 0x78, 0x68, 0x60, 0x60, 0x60, 0xf0, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3c, 0x66, 0xc2, 0xc0, 0xc0, 0xde, 0xc6, 0xc6, 0x66, 0x3a, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc6, 0xc6, 0xc6, 0xc6, 0xfe, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3c, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x1e, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0xcc, 0xcc, 0xcc, 0x78, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xe6, 0x66, 0x66, 0x6c, 0x78, 0x78, 0x6c, 0x66, 0x66, 0xe6, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xf0, 0x60, 0x60, 0x60, 0x60, 0x60, 0x60, 0x62, 0x66, 0xfe, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc3, 0xe7, 0xff, 0xff, 0xdb, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc6, 0xe6, 0xf6, 0xfe, 0xde, 0xce, 0xc6, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xfc, 0x66, 0x66, 0x66, 0x7c, 0x60, 0x60, 0x60, 0x60, 0xf0, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xd6, 0xde, 0x7c, 0x0c, 0x0e, 0x00, 0x00,
- 0x00, 0x00, 0xfc, 0x66, 0x66, 0x66, 0x7c, 0x6c, 0x66, 0x66, 0x66, 0xe6, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0x60, 0x38, 0x0c, 0x06, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xff, 0xdb, 0x99, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0x66, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0xdb, 0xdb, 0xff, 0x66, 0x66, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc3, 0xc3, 0x66, 0x3c, 0x18, 0x18, 0x3c, 0x66, 0xc3, 0xc3, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc3, 0xc3, 0xc3, 0x66, 0x3c, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xff, 0xc3, 0x86, 0x0c, 0x18, 0x30, 0x60, 0xc1, 0xc3, 0xff, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3c, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x80, 0xc0, 0xe0, 0x70, 0x38, 0x1c, 0x0e, 0x06, 0x02, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x10, 0x38, 0x6c, 0xc6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00,
- 0x30, 0x30, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xe0, 0x60, 0x60, 0x78, 0x6c, 0x66, 0x66, 0x66, 0x66, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, 0xc0, 0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x1c, 0x0c, 0x0c, 0x3c, 0x6c, 0xcc, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, 0xfe, 0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x38, 0x6c, 0x64, 0x60, 0xf0, 0x60, 0x60, 0x60, 0x60, 0xf0, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x76, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x7c, 0x0c, 0xcc, 0x78, 0x00,
- 0x00, 0x00, 0xe0, 0x60, 0x60, 0x6c, 0x76, 0x66, 0x66, 0x66, 0x66, 0xe6, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x18, 0x18, 0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x06, 0x06, 0x00, 0x0e, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x66, 0x66, 0x3c, 0x00,
- 0x00, 0x00, 0xe0, 0x60, 0x60, 0x66, 0x6c, 0x78, 0x78, 0x6c, 0x66, 0xe6, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xe6, 0xff, 0xdb, 0xdb, 0xdb, 0xdb, 0xdb, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xdc, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xdc, 0x66, 0x66, 0x66, 0x66, 0x66, 0x7c, 0x60, 0x60, 0xf0, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x76, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x7c, 0x0c, 0x0c, 0x1e, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xdc, 0x76, 0x66, 0x60, 0x60, 0x60, 0xf0, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, 0x60, 0x38, 0x0c, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x10, 0x30, 0x30, 0xfc, 0x30, 0x30, 0x30, 0x30, 0x36, 0x1c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0xc3, 0xc3, 0xc3, 0x66, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0xc3, 0xc3, 0xdb, 0xdb, 0xff, 0x66, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0x66, 0x3c, 0x18, 0x3c, 0x66, 0xc3, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7e, 0x06, 0x0c, 0xf8, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0xcc, 0x18, 0x30, 0x60, 0xc6, 0xfe, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x0e, 0x18, 0x18, 0x18, 0x70, 0x18, 0x18, 0x18, 0x18, 0x0e, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x18, 0x18, 0x18, 0x18, 0x00, 0x18, 0x18, 0x18, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x70, 0x18, 0x18, 0x18, 0x0e, 0x18, 0x18, 0x18, 0x18, 0x70, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x76, 0xdc, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x6c, 0xc6, 0xc6, 0xc6, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3c, 0x66, 0xc2, 0xc0, 0xc0, 0xc0, 0xc2, 0x66, 0x3c, 0x0c, 0x06, 0x7c, 0x00, 0x00,
- 0x00, 0x00, 0xcc, 0x00, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x0c, 0x18, 0x30, 0x00, 0x7c, 0xc6, 0xfe, 0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x10, 0x38, 0x6c, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xcc, 0x00, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x60, 0x30, 0x18, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x38, 0x6c, 0x38, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x3c, 0x66, 0x60, 0x60, 0x66, 0x3c, 0x0c, 0x06, 0x3c, 0x00, 0x00, 0x00,
- 0x00, 0x10, 0x38, 0x6c, 0x00, 0x7c, 0xc6, 0xfe, 0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc6, 0x00, 0x00, 0x7c, 0xc6, 0xfe, 0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x60, 0x30, 0x18, 0x00, 0x7c, 0xc6, 0xfe, 0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x66, 0x00, 0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x18, 0x3c, 0x66, 0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x60, 0x30, 0x18, 0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0xc6, 0x00, 0x10, 0x38, 0x6c, 0xc6, 0xc6, 0xfe, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00,
- 0x38, 0x6c, 0x38, 0x00, 0x38, 0x6c, 0xc6, 0xc6, 0xfe, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00,
- 0x18, 0x30, 0x60, 0x00, 0xfe, 0x66, 0x60, 0x7c, 0x60, 0x60, 0x66, 0xfe, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x6e, 0x3b, 0x1b, 0x7e, 0xd8, 0xdc, 0x77, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x3e, 0x6c, 0xcc, 0xcc, 0xfe, 0xcc, 0xcc, 0xcc, 0xcc, 0xce, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x10, 0x38, 0x6c, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc6, 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x60, 0x30, 0x18, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x30, 0x78, 0xcc, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x60, 0x30, 0x18, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc6, 0x00, 0x00, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7e, 0x06, 0x0c, 0x78, 0x00,
- 0x00, 0xc6, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0xc6, 0x00, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x18, 0x18, 0x7e, 0xc3, 0xc0, 0xc0, 0xc0, 0xc3, 0x7e, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x38, 0x6c, 0x64, 0x60, 0xf0, 0x60, 0x60, 0x60, 0x60, 0xe6, 0xfc, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xc3, 0x66, 0x3c, 0x18, 0xff, 0x18, 0xff, 0x18, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0xfc, 0x66, 0x66, 0x7c, 0x62, 0x66, 0x6f, 0x66, 0x66, 0x66, 0xf3, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x0e, 0x1b, 0x18, 0x18, 0x18, 0x7e, 0x18, 0x18, 0x18, 0x18, 0x18, 0xd8, 0x70, 0x00, 0x00,
- 0x00, 0x18, 0x30, 0x60, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x0c, 0x18, 0x30, 0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x18, 0x30, 0x60, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x18, 0x30, 0x60, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x76, 0xdc, 0x00, 0xdc, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x00, 0x00, 0x00, 0x00,
- 0x76, 0xdc, 0x00, 0xc6, 0xe6, 0xf6, 0xfe, 0xde, 0xce, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x3c, 0x6c, 0x6c, 0x3e, 0x00, 0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x38, 0x6c, 0x6c, 0x38, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x30, 0x30, 0x00, 0x30, 0x30, 0x60, 0xc0, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0xc0, 0xc0, 0xc0, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x06, 0x06, 0x06, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0xc0, 0xc0, 0xc2, 0xc6, 0xcc, 0x18, 0x30, 0x60, 0xce, 0x9b, 0x06, 0x0c, 0x1f, 0x00, 0x00,
- 0x00, 0xc0, 0xc0, 0xc2, 0xc6, 0xcc, 0x18, 0x30, 0x66, 0xce, 0x96, 0x3e, 0x06, 0x06, 0x00, 0x00,
- 0x00, 0x00, 0x18, 0x18, 0x00, 0x18, 0x18, 0x18, 0x3c, 0x3c, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x36, 0x6c, 0xd8, 0x6c, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xd8, 0x6c, 0x36, 0x6c, 0xd8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x11, 0x44, 0x11, 0x44, 0x11, 0x44, 0x11, 0x44, 0x11, 0x44, 0x11, 0x44, 0x11, 0x44, 0x11, 0x44,
- 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa,
- 0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8, 0x18, 0xf8, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0xf6, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0x18, 0xf8, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0xf6, 0x06, 0xf6, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x06, 0xf6, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0xf6, 0x06, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8, 0x18, 0xf8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x1f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x1f, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xff, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x1f, 0x18, 0x1f, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x37, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x37, 0x30, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0x30, 0x37, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0xf7, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0xf7, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x37, 0x30, 0x37, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0xf7, 0x00, 0xf7, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0xff, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0xff, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x1f, 0x18, 0x1f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0x18, 0x1f, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0xff, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0xff, 0x18, 0xff, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
- 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x76, 0xdc, 0xd8, 0xd8, 0xd8, 0xdc, 0x76, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x78, 0xcc, 0xcc, 0xcc, 0xd8, 0xcc, 0xc6, 0xc6, 0xc6, 0xcc, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0xfe, 0xc6, 0xc6, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0xfe, 0x6c, 0x6c, 0x6c, 0x6c, 0x6c, 0x6c, 0x6c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0xfe, 0xc6, 0x60, 0x30, 0x18, 0x30, 0x60, 0xc6, 0xfe, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0xd8, 0xd8, 0xd8, 0xd8, 0xd8, 0x70, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x66, 0x66, 0x66, 0x66, 0x66, 0x7c, 0x60, 0x60, 0xc0, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x76, 0xdc, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x7e, 0x18, 0x3c, 0x66, 0x66, 0x66, 0x3c, 0x18, 0x7e, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x38, 0x6c, 0xc6, 0xc6, 0xfe, 0xc6, 0xc6, 0x6c, 0x38, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x38, 0x6c, 0xc6, 0xc6, 0xc6, 0x6c, 0x6c, 0x6c, 0x6c, 0xee, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x1e, 0x30, 0x18, 0x0c, 0x3e, 0x66, 0x66, 0x66, 0x66, 0x3c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0xdb, 0xdb, 0xdb, 0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x03, 0x06, 0x7e, 0xdb, 0xdb, 0xf3, 0x7e, 0x60, 0xc0, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x1c, 0x30, 0x60, 0x60, 0x7c, 0x60, 0x60, 0x60, 0x30, 0x1c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0xfe, 0x00, 0x00, 0xfe, 0x00, 0x00, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x7e, 0x18, 0x18, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x30, 0x18, 0x0c, 0x06, 0x0c, 0x18, 0x30, 0x00, 0x7e, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x0c, 0x18, 0x30, 0x60, 0x30, 0x18, 0x0c, 0x00, 0x7e, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x0e, 0x1b, 0x1b, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xd8, 0xd8, 0xd8, 0x70, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x7e, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x76, 0xdc, 0x00, 0x76, 0xdc, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x38, 0x6c, 0x6c, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x0f, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0xec, 0x6c, 0x6c, 0x3c, 0x1c, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0xd8, 0x6c, 0x6c, 0x6c, 0x6c, 0x6c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x70, 0xd8, 0x30, 0x60, 0xc8, 0xf8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
-};
-
 const uint32_t ff_cga_palette[16] = {
-    0x000000, 0x0000AA, 0x00AA00, 0x00AAAA, 0xAA0000, 0xAA00AA, 0xAA5500, 0xAAAAAA,
-    0x555555, 0x5555FF, 0x55FF55, 0x55FFFF, 0xFF5555, 0xFF55FF, 0xFFFF55, 0xFFFFFF,
+    0xFF000000, 0xFF0000AA, 0xFF00AA00, 0xFF00AAAA, 0xFFAA0000, 0xFFAA00AA, 0xFFAA5500, 0xFFAAAAAA,
+    0xFF555555, 0xFF5555FF, 0xFF55FF55, 0xFF55FFFF, 0xFFFF5555, 0xFFFF55FF, 0xFFFFFF55, 0xFFFFFFFF,
 };
 
 const uint32_t ff_ega_palette[64] = {
-    0x000000, 0x0000AA, 0x00AA00, 0x00AAAA, 0xAA0000, 0xAA00AA, 0xAAAA00, 0xAAAAAA,
-    0x000055, 0x0000FF, 0x00AA55, 0x00AAFF, 0xAA0055, 0xAA00FF, 0xAAAA55, 0xAAAAFF,
-    0x005500, 0x0055AA, 0x00FF00, 0x00FFAA, 0xAA5500, 0xAA55AA, 0xAAFF00, 0xAAFFAA,
-    0x005555, 0x0055FF, 0x00FF55, 0x00FFFF, 0xAA5555, 0xAA55FF, 0xAAFF55, 0xAAFFFF,
-    0x550000, 0x5500AA, 0x55AA00, 0x55AAAA, 0xFF0000, 0xFF00AA, 0xFFAA00, 0xFFAAAA,
-    0x550055, 0x5500FF, 0x55AA55, 0x55AAFF, 0xFF0055, 0xFF00FF, 0xFFAA55, 0xFFAAFF,
-    0x555500, 0x5555AA, 0x55FF00, 0x55FFAA, 0xFF5500, 0xFF55AA, 0xFFFF00, 0xFFFFAA,
-    0x555555, 0x5555FF, 0x55FF55, 0x55FFFF, 0xFF5555, 0xFF55FF, 0xFFFF55, 0xFFFFFF
+    0xFF000000, 0xFF0000AA, 0xFF00AA00, 0xFF00AAAA, 0xFFAA0000, 0xFFAA00AA, 0xFFAAAA00, 0xFFAAAAAA,
+    0xFF000055, 0xFF0000FF, 0xFF00AA55, 0xFF00AAFF, 0xFFAA0055, 0xFFAA00FF, 0xFFAAAA55, 0xFFAAAAFF,
+    0xFF005500, 0xFF0055AA, 0xFF00FF00, 0xFF00FFAA, 0xFFAA5500, 0xFFAA55AA, 0xFFAAFF00, 0xFFAAFFAA,
+    0xFF005555, 0xFF0055FF, 0xFF00FF55, 0xFF00FFFF, 0xFFAA5555, 0xFFAA55FF, 0xFFAAFF55, 0xFFAAFFFF,
+    0xFF550000, 0xFF5500AA, 0xFF55AA00, 0xFF55AAAA, 0xFFFF0000, 0xFFFF00AA, 0xFFFFAA00, 0xFFFFAAAA,
+    0xFF550055, 0xFF5500FF, 0xFF55AA55, 0xFF55AAFF, 0xFFFF0055, 0xFFFF00FF, 0xFFFFAA55, 0xFFFFAAFF,
+    0xFF555500, 0xFF5555AA, 0xFF55FF00, 0xFF55FFAA, 0xFFFF5500, 0xFFFF55AA, 0xFFFFFF00, 0xFFFFFFAA,
+    0xFF555555, 0xFF5555FF, 0xFF55FF55, 0xFF55FFFF, 0xFFFF5555, 0xFFFF55FF, 0xFFFFFF55, 0xFFFFFFFF
 };
 
 void ff_draw_pc_font(uint8_t *dst, int linesize, const uint8_t *font, int font_height, int ch, int fg, int bg)
diff --git a/libavcodec/cga_data.h b/libavcodec/cga_data.h
index 2149cfd..3f5281a 100644
--- a/libavcodec/cga_data.h
+++ b/libavcodec/cga_data.h
@@ -1,26 +1,27 @@
 /*
  * CGA/EGA/VGA ROM data
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
  * CGA/EGA/VGA ROM data
+ * @note fonts are in libavutil/xga_font_data.[ch]
  */
 
 #ifndef AVCODEC_CGA_DATA_H
@@ -28,8 +29,6 @@
 
 #include <stdint.h>
 
-extern const uint8_t ff_cga_font[2048];
-extern const uint8_t ff_vga16_font[4096];
 extern const uint32_t ff_cga_palette[16];
 extern const uint32_t ff_ega_palette[64];
 
diff --git a/libavcodec/chomp_bsf.c b/libavcodec/chomp_bsf.c
index 2e76113..cc94380 100644
--- a/libavcodec/chomp_bsf.c
+++ b/libavcodec/chomp_bsf.c
@@ -2,20 +2,20 @@
  * Chomp bitstream filter
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/cinepak.c b/libavcodec/cinepak.c
index 611ffe5..a2190d7 100644
--- a/libavcodec/cinepak.c
+++ b/libavcodec/cinepak.c
@@ -2,20 +2,20 @@
  * Cinepak Video Decoder
  * Copyright (C) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,9 @@
  *   http://www.csse.monash.edu.au/~timf/
  * @see For more information on the quirky data inside Sega FILM/CPK files, visit:
  *   http://wiki.multimedia.cx/index.php?title=Sega_FILM
+ *
+ * Cinepak colorspace support (c) 2013 Rl, Aetey Global Technologies AB
+ * @author Cinepak colorspace, Rl, Aetey Global Technologies AB
  */
 
 #include <stdio.h>
@@ -40,10 +43,7 @@
 #include "internal.h"
 
 
-typedef struct cvid_codebook {
-    uint8_t  y0, y1, y2, y3;
-    uint8_t  u, v;
-} cvid_codebook;
+typedef uint8_t cvid_codebook[12];
 
 #define MAX_STRIPS      32
 
@@ -79,12 +79,14 @@ static void cinepak_decode_codebook (cvid_codebook *codebook,
     const uint8_t *eod = (data + size);
     uint32_t flag, mask;
     int      i, n;
+    uint8_t *p;
 
     /* check if this chunk contains 4- or 6-element vectors */
     n    = (chunk_id & 0x04) ? 4 : 6;
     flag = 0;
     mask = 0;
 
+    p = codebook[0];
     for (i=0; i < 256; i++) {
         if ((chunk_id & 0x01) && !(mask >>= 1)) {
             if ((data + 4) > eod)
@@ -96,28 +98,33 @@ static void cinepak_decode_codebook (cvid_codebook *codebook,
         }
 
         if (!(chunk_id & 0x01) || (flag & mask)) {
+            int k, kk;
+
             if ((data + n) > eod)
                 break;
 
+            for (k = 0; k < 4; ++k) {
+                int r = *data++;
+                for (kk = 0; kk < 3; ++kk)
+                    *p++ = r;
+            }
             if (n == 6) {
-                codebook[i].y0 = *data++;
-                codebook[i].y1 = *data++;
-                codebook[i].y2 = *data++;
-                codebook[i].y3 = *data++;
-                codebook[i].u  = 128 + *data++;
-                codebook[i].v  = 128 + *data++;
-            } else {
-                /* this codebook type indicates either greyscale or
-                 * palettized video; if palettized, U & V components will
-                 * not be used so it is safe to set them to 128 for the
-                 * benefit of greyscale rendering in YUV420P */
-                codebook[i].y0 = *data++;
-                codebook[i].y1 = *data++;
-                codebook[i].y2 = *data++;
-                codebook[i].y3 = *data++;
-                codebook[i].u  = 128;
-                codebook[i].v  = 128;
+                int r, g, b, u, v;
+                u = *(int8_t *)data++;
+                v = *(int8_t *)data++;
+                p -= 12;
+                for(k=0; k<4; ++k) {
+                    r = *p++ + v*2;
+                    g = *p++ - (u/2) - v;
+                    b = *p   + u*2;
+                    p -= 2;
+                    *p++ = av_clip_uint8(r);
+                    *p++ = av_clip_uint8(g);
+                    *p++ = av_clip_uint8(b);
+                }
             }
+        } else {
+            p += 12;
         }
     }
 }
@@ -127,25 +134,31 @@ static int cinepak_decode_vectors (CinepakContext *s, cvid_strip *strip,
 {
     const uint8_t   *eod = (data + size);
     uint32_t         flag, mask;
-    cvid_codebook   *codebook;
-    unsigned int     x, y;
-    uint32_t         iy[4];
-    uint32_t         iu[2];
-    uint32_t         iv[2];
+    uint8_t         *cb0, *cb1, *cb2, *cb3;
+    int             x, y;
+    char            *ip0, *ip1, *ip2, *ip3;
 
     flag = 0;
     mask = 0;
 
     for (y=strip->y1; y < strip->y2; y+=4) {
 
-        iy[0] = strip->x1 + (y * s->frame->linesize[0]);
-        iy[1] = iy[0] + s->frame->linesize[0];
-        iy[2] = iy[1] + s->frame->linesize[0];
-        iy[3] = iy[2] + s->frame->linesize[0];
-        iu[0] = (strip->x1/2) + ((y/2) * s->frame->linesize[1]);
-        iu[1] = iu[0] + s->frame->linesize[1];
-        iv[0] = (strip->x1/2) + ((y/2) * s->frame->linesize[2]);
-        iv[1] = iv[0] + s->frame->linesize[2];
+/* take care of y dimension not being multiple of 4, such streams exist */
+        ip0 = ip1 = ip2 = ip3 = s->frame->data[0] +
+          (s->palette_video?strip->x1:strip->x1*3) + (y * s->frame->linesize[0]);
+        if(s->avctx->height - y > 1) {
+            ip1 = ip0 + s->frame->linesize[0];
+            if(s->avctx->height - y > 2) {
+                ip2 = ip1 + s->frame->linesize[0];
+                if(s->avctx->height - y > 3) {
+                    ip3 = ip2 + s->frame->linesize[0];
+                }
+            }
+        }
+/* to get the correct picture for not-multiple-of-4 cases let us fill
+ * each block from the bottom up, thus possibly overwriting the top line
+ * more than once but ending with the correct data in place
+ * (instead of in-loop checking) */
 
         for (x=strip->x1; x < strip->x2; x+=4) {
             if ((chunk_id & 0x01) && !(mask >>= 1)) {
@@ -168,97 +181,82 @@ static int cinepak_decode_vectors (CinepakContext *s, cvid_strip *strip,
                 }
 
                 if ((chunk_id & 0x02) || (~flag & mask)) {
+                    uint8_t *p;
                     if (data >= eod)
                         return AVERROR_INVALIDDATA;
 
-                    codebook = &strip->v1_codebook[*data++];
-                    s->frame->data[0][iy[0] + 0] = codebook->y0;
-                    s->frame->data[0][iy[0] + 1] = codebook->y0;
-                    s->frame->data[0][iy[1] + 0] = codebook->y0;
-                    s->frame->data[0][iy[1] + 1] = codebook->y0;
-                    if (!s->palette_video) {
-                        s->frame->data[1][iu[0]] = codebook->u;
-                        s->frame->data[2][iv[0]] = codebook->v;
-                    }
-
-                    s->frame->data[0][iy[0] + 2] = codebook->y1;
-                    s->frame->data[0][iy[0] + 3] = codebook->y1;
-                    s->frame->data[0][iy[1] + 2] = codebook->y1;
-                    s->frame->data[0][iy[1] + 3] = codebook->y1;
-                    if (!s->palette_video) {
-                        s->frame->data[1][iu[0] + 1] = codebook->u;
-                        s->frame->data[2][iv[0] + 1] = codebook->v;
-                    }
-
-                    s->frame->data[0][iy[2] + 0] = codebook->y2;
-                    s->frame->data[0][iy[2] + 1] = codebook->y2;
-                    s->frame->data[0][iy[3] + 0] = codebook->y2;
-                    s->frame->data[0][iy[3] + 1] = codebook->y2;
-                    if (!s->palette_video) {
-                        s->frame->data[1][iu[1]] = codebook->u;
-                        s->frame->data[2][iv[1]] = codebook->v;
-                    }
-
-                    s->frame->data[0][iy[2] + 2] = codebook->y3;
-                    s->frame->data[0][iy[2] + 3] = codebook->y3;
-                    s->frame->data[0][iy[3] + 2] = codebook->y3;
-                    s->frame->data[0][iy[3] + 3] = codebook->y3;
-                    if (!s->palette_video) {
-                        s->frame->data[1][iu[1] + 1] = codebook->u;
-                        s->frame->data[2][iv[1] + 1] = codebook->v;
+                    p = strip->v1_codebook[*data++];
+                    if (s->palette_video) {
+                        ip3[0] = ip3[1] = ip2[0] = ip2[1] = p[6];
+                        ip3[2] = ip3[3] = ip2[2] = ip2[3] = p[9];
+                        ip1[0] = ip1[1] = ip0[0] = ip0[1] = p[0];
+                        ip1[2] = ip1[3] = ip0[2] = ip0[3] = p[3];
+                    } else {
+                        p += 6;
+                        memcpy(ip3 + 0, p, 3); memcpy(ip3 + 3, p, 3);
+                        memcpy(ip2 + 0, p, 3); memcpy(ip2 + 3, p, 3);
+                        p += 3; /* ... + 9 */
+                        memcpy(ip3 + 6, p, 3); memcpy(ip3 + 9, p, 3);
+                        memcpy(ip2 + 6, p, 3); memcpy(ip2 + 9, p, 3);
+                        p -= 9; /* ... + 0 */
+                        memcpy(ip1 + 0, p, 3); memcpy(ip1 + 3, p, 3);
+                        memcpy(ip0 + 0, p, 3); memcpy(ip0 + 3, p, 3);
+                        p += 3; /* ... + 3 */
+                        memcpy(ip1 + 6, p, 3); memcpy(ip1 + 9, p, 3);
+                        memcpy(ip0 + 6, p, 3); memcpy(ip0 + 9, p, 3);
                     }
 
                 } else if (flag & mask) {
                     if ((data + 4) > eod)
                         return AVERROR_INVALIDDATA;
 
-                    codebook = &strip->v4_codebook[*data++];
-                    s->frame->data[0][iy[0] + 0] = codebook->y0;
-                    s->frame->data[0][iy[0] + 1] = codebook->y1;
-                    s->frame->data[0][iy[1] + 0] = codebook->y2;
-                    s->frame->data[0][iy[1] + 1] = codebook->y3;
-                    if (!s->palette_video) {
-                        s->frame->data[1][iu[0]] = codebook->u;
-                        s->frame->data[2][iv[0]] = codebook->v;
-                    }
-
-                    codebook = &strip->v4_codebook[*data++];
-                    s->frame->data[0][iy[0] + 2] = codebook->y0;
-                    s->frame->data[0][iy[0] + 3] = codebook->y1;
-                    s->frame->data[0][iy[1] + 2] = codebook->y2;
-                    s->frame->data[0][iy[1] + 3] = codebook->y3;
-                    if (!s->palette_video) {
-                        s->frame->data[1][iu[0] + 1] = codebook->u;
-                        s->frame->data[2][iv[0] + 1] = codebook->v;
-                    }
-
-                    codebook = &strip->v4_codebook[*data++];
-                    s->frame->data[0][iy[2] + 0] = codebook->y0;
-                    s->frame->data[0][iy[2] + 1] = codebook->y1;
-                    s->frame->data[0][iy[3] + 0] = codebook->y2;
-                    s->frame->data[0][iy[3] + 1] = codebook->y3;
-                    if (!s->palette_video) {
-                        s->frame->data[1][iu[1]] = codebook->u;
-                        s->frame->data[2][iv[1]] = codebook->v;
-                    }
-
-                    codebook = &strip->v4_codebook[*data++];
-                    s->frame->data[0][iy[2] + 2] = codebook->y0;
-                    s->frame->data[0][iy[2] + 3] = codebook->y1;
-                    s->frame->data[0][iy[3] + 2] = codebook->y2;
-                    s->frame->data[0][iy[3] + 3] = codebook->y3;
-                    if (!s->palette_video) {
-                        s->frame->data[1][iu[1] + 1] = codebook->u;
-                        s->frame->data[2][iv[1] + 1] = codebook->v;
+                    cb0 = strip->v4_codebook[*data++];
+                    cb1 = strip->v4_codebook[*data++];
+                    cb2 = strip->v4_codebook[*data++];
+                    cb3 = strip->v4_codebook[*data++];
+                    if (s->palette_video) {
+                        uint8_t *p;
+                        p = ip3;
+                        *p++ = cb2[6];
+                        *p++ = cb2[9];
+                        *p++ = cb3[6];
+                        *p   = cb3[9];
+                        p = ip2;
+                        *p++ = cb2[0];
+                        *p++ = cb2[3];
+                        *p++ = cb3[0];
+                        *p   = cb3[3];
+                        p = ip1;
+                        *p++ = cb0[6];
+                        *p++ = cb0[9];
+                        *p++ = cb1[6];
+                        *p   = cb1[9];
+                        p = ip0;
+                        *p++ = cb0[0];
+                        *p++ = cb0[3];
+                        *p++ = cb1[0];
+                        *p   = cb1[3];
+                    } else {
+                        memcpy(ip3 + 0, cb2 + 6, 6);
+                        memcpy(ip3 + 6, cb3 + 6, 6);
+                        memcpy(ip2 + 0, cb2 + 0, 6);
+                        memcpy(ip2 + 6, cb3 + 0, 6);
+                        memcpy(ip1 + 0, cb0 + 6, 6);
+                        memcpy(ip1 + 6, cb1 + 6, 6);
+                        memcpy(ip0 + 0, cb0 + 0, 6);
+                        memcpy(ip0 + 6, cb1 + 0, 6);
                     }
 
                 }
             }
 
-            iy[0] += 4;  iy[1] += 4;
-            iy[2] += 4;  iy[3] += 4;
-            iu[0] += 2;  iu[1] += 2;
-            iv[0] += 2;  iv[1] += 2;
+            if (s->palette_video) {
+                ip0 += 4;  ip1 += 4;
+                ip2 += 4;  ip3 += 4;
+            } else {
+                ip0 += 12;  ip1 += 12;
+                ip2 += 12;  ip3 += 12;
+            }
         }
     }
 
@@ -362,15 +360,23 @@ static int cinepak_decode (CinepakContext *s)
 
     num_strips = FFMIN(num_strips, MAX_STRIPS);
 
+    s->frame->key_frame = 0;
+
     for (i=0; i < num_strips; i++) {
         if ((s->data + 12) > eod)
             return AVERROR_INVALIDDATA;
 
         s->strips[i].id = s->data[0];
-        s->strips[i].y1 = y0;
-        s->strips[i].x1 = 0;
-        s->strips[i].y2 = y0 + AV_RB16 (&s->data[8]);
-        s->strips[i].x2 = s->avctx->width;
+/* zero y1 means "relative to the previous stripe" */
+        if (!(s->strips[i].y1 = AV_RB16 (&s->data[4])))
+            s->strips[i].y2 = (s->strips[i].y1 = y0) + AV_RB16 (&s->data[8]);
+        else
+            s->strips[i].y2 = AV_RB16 (&s->data[8]);
+        s->strips[i].x1 = AV_RB16 (&s->data[6]);
+        s->strips[i].x2 = AV_RB16 (&s->data[10]);
+
+        if (s->strips[i].id == 0x10)
+            s->frame->key_frame = 1;
 
         strip_size = AV_RB24 (&s->data[1]) - 12;
         if (strip_size < 0)
@@ -403,12 +409,13 @@ static av_cold int cinepak_decode_init(AVCodecContext *avctx)
     s->avctx = avctx;
     s->width = (avctx->width + 3) & ~3;
     s->height = (avctx->height + 3) & ~3;
+
     s->sega_film_skip_bytes = -1;  /* uninitialized state */
 
     // check for paletted data
     if (avctx->bits_per_coded_sample != 8) {
         s->palette_video = 0;
-        avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+        avctx->pix_fmt = AV_PIX_FMT_RGB24;
     } else {
         s->palette_video = 1;
         avctx->pix_fmt = AV_PIX_FMT_PAL8;
@@ -432,10 +439,8 @@ static int cinepak_decode_frame(AVCodecContext *avctx,
     s->data = buf;
     s->size = buf_size;
 
-    if ((ret = ff_reget_buffer(avctx, s->frame))) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
     if (s->palette_video) {
         const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, NULL);
@@ -445,7 +450,9 @@ static int cinepak_decode_frame(AVCodecContext *avctx,
         }
     }
 
-    cinepak_decode(s);
+    if ((ret = cinepak_decode(s)) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "cinepak_decode failed\n");
+    }
 
     if (s->palette_video)
         memcpy (s->frame->data[1], s->pal, AVPALETTE_SIZE);
diff --git a/libavcodec/cinepakenc.c b/libavcodec/cinepakenc.c
new file mode 100644
index 0000000..06b06da
--- /dev/null
+++ b/libavcodec/cinepakenc.c
@@ -0,0 +1,1390 @@
+/*
+ * Cinepak encoder (c) 2011 Tomas Härdin
+ * http://titan.codemill.se/~tomhar/cinepakenc.patch
+ *
+ * Fixes and improvements, vintage decoders compatibility
+ *  (c) 2013, 2014 Rl, Aetey Global Technologies AB
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+ * TODO:
+ * - optimize: color space conversion, ...
+ * - implement options to set the min/max number of strips?
+ * MAYBE:
+ * - "optimally" split the frame into several non-regular areas
+ *   using a separate codebook pair for each area and approximating
+ *   the area by several rectangular strips (generally not full width ones)
+ *   (use quadtree splitting? a simple fixed-granularity grid?)
+ *
+ *
+ * version 2014-01-23 Rl
+ * - added option handling for flexibility
+ *
+ * version 2014-01-21 Rl
+ * - believe it or not, now we get even smaller files, with better quality
+ *   (which means I missed an optimization earlier :)
+ *
+ * version 2014-01-20 Rl
+ * - made the encoder compatible with vintage decoders
+ *   and added some yet unused code for possible future
+ *   incremental codebook updates
+ * - fixed a small memory leak
+ *
+ * version 2013-04-28 Rl
+ * - bugfixed codebook optimization logic
+ *
+ * version 2013-02-14 Rl
+ * "Valentine's Day" version:
+ * - made strip division more robust
+ * - minimized bruteforcing the number of strips,
+ *   (costs some R/D but speeds up compression a lot), the heuristic
+ *   assumption is that score as a function of the number of strips has
+ *   one wide minimum which moves slowly, of course not fully true
+ * - simplified codebook generation,
+ *   the old code was meant for other optimizations than we actually do
+ * - optimized the codebook generation / error estimation for MODE_MC
+ *
+ * version 2013-02-12 Rl
+ * - separated codebook training sets, avoided the transfer of wasted bytes,
+ *   which yields both better quality and smaller files
+ * - now using the correct colorspace (TODO: move conversion to libswscale)
+ *
+ * version 2013-02-08 Rl
+ * - fixes/optimization in multistrip encoding and codebook size choice,
+ *   quality/bitrate is now better than that of the binary proprietary encoder
+ */
+
+#include "libavutil/intreadwrite.h"
+#include "avcodec.h"
+#include "libavutil/lfg.h"
+#include "elbg.h"
+#include "internal.h"
+
+#include "libavutil/avassert.h"
+#include "libavutil/opt.h"
+
+#define CVID_HEADER_SIZE 10
+#define STRIP_HEADER_SIZE 12
+#define CHUNK_HEADER_SIZE 4
+
+#define MB_SIZE 4           //4x4 MBs
+#define MB_AREA (MB_SIZE*MB_SIZE)
+
+#define VECTOR_MAX 6        //six or four entries per vector depending on format
+#define CODEBOOK_MAX 256    //size of a codebook
+
+#define MAX_STRIPS  32      //Note: having fewer choices regarding the number of strips speeds up encoding (obviously)
+#define MIN_STRIPS  1       //Note: having more strips speeds up encoding the frame (this is less obvious)
+// MAX_STRIPS limits the maximum quality you can reach
+//            when you want high quality on high resolutions,
+// MIN_STRIPS limits the minimum efficiently encodable bit rate
+//            on low resolutions
+// the numbers are only used for brute force optimization for the first frame,
+// for the following frames they are adaptively readjusted
+// NOTE the decoder in ffmpeg has its own arbitrary limitation on the number
+// of strips, currently 32
+
+typedef enum { // coding mode of a strip (strip_info.mode); bit costs as used by calculate_mode_score()
+    MODE_V1_ONLY = 0, // every MB is coded as V1: one byte per MB
+    MODE_V1_V4,       // each MB is V1 or V4: 9 or 33 bits per MB
+    MODE_MC,          // each MB is skipped, V1 or V4: 1, 10 or 34 bits per MB
+
+    MODE_COUNT,       // == 3, the number of modes listed above
+} CinepakMode;
+
+typedef enum { // how one MB is coded; stored in mb_info.best_encoding
+    ENC_V1,   // scored with mb_info.v1_error (single V1 codebook index)
+    ENC_V4,   // scored with mb_info.v4_error (four V4 codebook indices)
+    ENC_SKIP, // scored with mb_info.skip_error; only assigned in the MODE_MC case of calculate_mode_score()
+
+    ENC_UNCERTAIN // NOTE(review): not referenced in the code visible here; presumably "not decided yet" -- confirm
+} mb_encoding;
+
+typedef struct { // per-macroblock candidate encodings and their errors (one element of CinepakEncContext.mb)
+    int v1_vector;                  //index into v1 codebook
+    int v1_error;                   //error when using V1 encoding
+    int v4_vector[4];               //indices into v4 codebooks
+    int v4_error;                   //error when using V4 encoding
+    int skip_error;                 //error when block is skipped (aka copied from last frame)
+    mb_encoding best_encoding;      //last result from calculate_mode_score()
+} mb_info;
+
+typedef struct { // codebooks, their fill levels and the coding mode that are scored/written together
+    int v1_codebook[CODEBOOK_MAX*VECTOR_MAX]; // room for 256 entries of up to 6 components; encode_codebook() reads them packed at a stride of 4 or 6
+    int v4_codebook[CODEBOOK_MAX*VECTOR_MAX]; // same capacity as v1_codebook
+    int v1_size; // number of V1 entries; 0 adds no V1 codebook chunk cost in calculate_mode_score()
+    int v4_size; // number of V4 entries; 0 adds no V4 codebook chunk cost in calculate_mode_score()
+    CinepakMode mode; // selects the per-MB bit costs in calculate_mode_score()
+} strip_info;
+
+typedef struct { // private encoder state, reached through avctx->priv_data
+    const AVClass *class;
+    AVCodecContext *avctx;
+    unsigned char *pict_bufs[4], *strip_buf, *frame_buf; // pict_bufs[0..2] back last/best/scratch frames, [3] the input frame (RGB24 only)
+    AVFrame *last_frame;
+    AVFrame *best_frame;
+    AVFrame *scratch_frame;
+    AVFrame *input_frame;    // only allocated when avctx->pix_fmt is AV_PIX_FMT_RGB24
+    enum AVPixelFormat pix_fmt; // copy of avctx->pix_fmt taken in cinepak_encode_init()
+    int w, h;                // copies of avctx->width / avctx->height
+    int frame_buf_size;      // allocated size of frame_buf in bytes
+    int curframe, keyint;    // curframe starts at 0, keyint is initialised from avctx->keyint_min
+    AVLFG randctx;           // seeded with the constant 1 in cinepak_encode_init()
+    uint64_t lambda;         // weight of one bit in score = FF_LAMBDA_SCALE * error + lambda * bits
+    int *codebook_input;     // allocated in cinepak_encode_init(): (6 or 4) ints per group of 4 pixels
+    int *codebook_closest;   // allocated in cinepak_encode_init(): one int per group of 4 pixels
+    mb_info *mb;                                //MB RD state
+    int min_strips;          //the current limit
+    int max_strips;          //the current limit
+#ifdef CINEPAKENC_DEBUG
+    mb_info *best_mb;                           //TODO: remove. only used for printing stats
+    int num_v1_mode, num_v4_mode, num_mc_mode;
+    int num_v1_encs, num_v4_encs, num_skips;
+#endif
+// options (filled in through the AVOption table `options`)
+    int max_extra_cb_iterations;  // option "max_extra_cb_iterations"
+    int skip_empty_cb;            // option "skip_empty_cb"
+    int min_min_strips;           // option "min_strips"; initial value of min_strips
+    int max_max_strips;           // option "max_strips"; initial value of max_strips
+    int strip_number_delta_range; // option "strip_number_adaptivity"
+} CinepakEncContext;
+
+#define OFFSET(x) offsetof(CinepakEncContext, x)
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption options[] = { // note: "max_strips"/"min_strips" store into max_max_strips/min_min_strips, "strip_number_adaptivity" into strip_number_delta_range
+    { "max_extra_cb_iterations", "Max extra codebook recalculation passes, more is better and slower", OFFSET(max_extra_cb_iterations), AV_OPT_TYPE_INT, { .i64 = 2 }, 0, INT_MAX, VE },
+    { "skip_empty_cb", "Avoid wasting bytes, ignore vintage MacOS decoder", OFFSET(skip_empty_cb), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
+    { "max_strips", "Limit strips/frame, vintage compatible is 1..3, otherwise the more the better", OFFSET(max_max_strips), AV_OPT_TYPE_INT, { .i64 = 3 }, MIN_STRIPS, MAX_STRIPS, VE },
+    { "min_strips", "Enforce min strips/frame, more is worse and faster, must be <= max_strips", OFFSET(min_min_strips), AV_OPT_TYPE_INT, { .i64 = MIN_STRIPS }, MIN_STRIPS, MAX_STRIPS, VE },
+    { "strip_number_adaptivity", "How fast the strip number adapts, more is slightly better, much slower", OFFSET(strip_number_delta_range), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, MAX_STRIPS-MIN_STRIPS, VE },
+    { NULL }, // terminating entry
+};
+
+static const AVClass cinepak_class = { // AVClass that attaches the options[] table to the encoder context
+    .class_name = "cinepak",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+static av_cold int cinepak_encode_init(AVCodecContext *avctx)
+{
+    CinepakEncContext *s = avctx->priv_data;
+    int x, mb_count, strip_buf_size, frame_buf_size;
+    // Validates size/options, allocates all work buffers and wires up the frames; returns 0, AVERROR(EINVAL) or AVERROR(ENOMEM).
+    if (avctx->width & 3 || avctx->height & 3) {
+        av_log(avctx, AV_LOG_ERROR, "width and height must be multiples of four (got %ix%i)\n",
+                avctx->width, avctx->height);
+        return AVERROR(EINVAL);
+    }
+
+    if (s->min_min_strips > s->max_max_strips) {
+        av_log(avctx, AV_LOG_ERROR, "minimal number of strips can not exceed maximal (got %i and %i)\n",
+                s->min_min_strips, s->max_max_strips);
+        return AVERROR(EINVAL);
+    }
+
+    if (!(s->last_frame = av_frame_alloc()))
+        return AVERROR(ENOMEM); // nothing has been allocated yet, so no cleanup is needed
+    if (!(s->best_frame = av_frame_alloc()))
+        goto enomem;
+    if (!(s->scratch_frame = av_frame_alloc()))
+        goto enomem;
+    if (avctx->pix_fmt == AV_PIX_FMT_RGB24) // the extra input frame exists only for RGB24 input
+        if (!(s->input_frame = av_frame_alloc()))
+            goto enomem;
+
+    if (!(s->codebook_input = av_malloc(sizeof(int) * (avctx->pix_fmt == AV_PIX_FMT_RGB24 ? 6 : 4) * (avctx->width * avctx->height) >> 2))) // >> applies to the whole product: (6 or 4) ints per 4 pixels
+        goto enomem;
+
+    if (!(s->codebook_closest = av_malloc(sizeof(int) * (avctx->width * avctx->height) >> 2))) // one int per 4 pixels
+        goto enomem;
+
+    for(x = 0; x < (avctx->pix_fmt == AV_PIX_FMT_RGB24 ? 4 : 3); x++) // 4 picture buffers for RGB24 (incl. input), 3 otherwise
+        if(!(s->pict_bufs[x] = av_malloc((avctx->pix_fmt == AV_PIX_FMT_RGB24 ? 6 : 4) * (avctx->width * avctx->height) >> 2))) // 1.5 or 1 bytes per pixel
+            goto enomem;
+
+    mb_count = avctx->width * avctx->height / MB_AREA;
+
+    //the largest possible chunk is 0x31 with all MBs encoded in V4 mode
+    //and full codebooks being replaced in INTER mode,
+    // which is 34 bits per MB
+    //and 2*256 extra flag bits per strip
+    strip_buf_size = STRIP_HEADER_SIZE + 3 * CHUNK_HEADER_SIZE + 2 * VECTOR_MAX * CODEBOOK_MAX + 4 * (mb_count + (mb_count + 15) / 16) + (2 * CODEBOOK_MAX)/8;
+
+    frame_buf_size = CVID_HEADER_SIZE + s->max_max_strips * strip_buf_size; // room for the frame header plus max_max_strips worst-case strips
+
+    if (!(s->strip_buf = av_malloc(strip_buf_size)))
+        goto enomem;
+
+    if (!(s->frame_buf = av_malloc(frame_buf_size)))
+        goto enomem;
+
+    if (!(s->mb = av_malloc_array(mb_count, sizeof(mb_info))))
+        goto enomem;
+
+#ifdef CINEPAKENC_DEBUG
+    if (!(s->best_mb = av_malloc_array(mb_count, sizeof(mb_info))))
+        goto enomem;
+#endif
+
+    av_lfg_init(&s->randctx, 1); // constant seed
+    s->avctx = avctx;
+    s->w = avctx->width;
+    s->h = avctx->height;
+    s->frame_buf_size = frame_buf_size;
+    s->curframe = 0;
+    s->keyint = avctx->keyint_min;
+    s->pix_fmt = avctx->pix_fmt;
+
+    //set up AVFrames: plane 0 of each work frame points into its pict_bufs[] slot, one byte per pixel
+    s->last_frame->data[0]        = s->pict_bufs[0];
+    s->last_frame->linesize[0]    = s->w;
+    s->best_frame->data[0]        = s->pict_bufs[1];
+    s->best_frame->linesize[0]    = s->w;
+    s->scratch_frame->data[0]     = s->pict_bufs[2];
+    s->scratch_frame->linesize[0] = s->w;
+
+    if (s->pix_fmt == AV_PIX_FMT_RGB24) { // three planes per buffer: w*h bytes, then two planes of (w*h)/4 bytes with linesize w/2
+        s->last_frame->data[1]        = s->last_frame->data[0] + s->w * s->h;
+        s->last_frame->data[2]        = s->last_frame->data[1] + ((s->w * s->h) >> 2);
+        s->last_frame->linesize[1]    = s->last_frame->linesize[2] = s->w >> 1;
+
+        s->best_frame->data[1]        = s->best_frame->data[0] + s->w * s->h;
+        s->best_frame->data[2]        = s->best_frame->data[1] + ((s->w * s->h) >> 2);
+        s->best_frame->linesize[1]    = s->best_frame->linesize[2] = s->w >> 1;
+
+        s->scratch_frame->data[1]     = s->scratch_frame->data[0] + s->w * s->h;
+        s->scratch_frame->data[2]     = s->scratch_frame->data[1] + ((s->w * s->h) >> 2);
+        s->scratch_frame->linesize[1] = s->scratch_frame->linesize[2] = s->w >> 1;
+
+        s->input_frame->data[0]       = s->pict_bufs[3];
+        s->input_frame->linesize[0]   = s->w;
+        s->input_frame->data[1]       = s->input_frame->data[0] + s->w * s->h;
+        s->input_frame->data[2]       = s->input_frame->data[1] + ((s->w * s->h) >> 2);
+        s->input_frame->linesize[1]   = s->input_frame->linesize[2] = s->w >> 1;
+    }
+
+    s->min_strips = s->min_min_strips; // the current limits start at the user-configured bounds
+    s->max_strips = s->max_max_strips;
+
+#ifdef CINEPAKENC_DEBUG
+    s->num_v1_mode = s->num_v4_mode = s->num_mc_mode = s->num_v1_encs = s->num_v4_encs = s->num_skips = 0;
+#endif
+
+    return 0;
+
+enomem: // common failure path: release everything that may have been allocated above
+    av_frame_free(&s->last_frame);
+    av_frame_free(&s->best_frame);
+    av_frame_free(&s->scratch_frame);
+    if (avctx->pix_fmt == AV_PIX_FMT_RGB24)
+        av_frame_free(&s->input_frame);
+    av_freep(&s->codebook_input);
+    av_freep(&s->codebook_closest);
+    av_freep(&s->strip_buf);
+    av_freep(&s->frame_buf);
+    av_freep(&s->mb);
+#ifdef CINEPAKENC_DEBUG
+    av_freep(&s->best_mb);
+#endif
+
+    for(x = 0; x < (avctx->pix_fmt == AV_PIX_FMT_RGB24 ? 4 : 3); x++) // same buffer count as the allocation loop
+        av_freep(&s->pict_bufs[x]);
+
+    return AVERROR(ENOMEM);
+}
+
+static int64_t calculate_mode_score(CinepakEncContext *s, int h, strip_info *info, int report, int *training_set_v1_shrunk, int *training_set_v4_shrunk
+#ifdef CINEPAK_REPORT_SERR
+, int64_t *serr // out: sum of the per-MB errors of the encodings that were scored
+#endif
+)
+{
+    //Returns the score of the first s->w*h/MB_AREA entries of s->mb under info->mode: score = FF_LAMBDA_SCALE * error + lambda * bits
+    int x;
+    int entry_size = s->pix_fmt == AV_PIX_FMT_RGB24 ? 6 : 4;
+    int mb_count = s->w * h / MB_AREA;
+    mb_info *mb;
+    int64_t score1, score2, score3;
+    int64_t ret = s->lambda * ((info->v1_size ? CHUNK_HEADER_SIZE + info->v1_size * entry_size : 0) + // V1 codebook chunk, only if non-empty
+                   (info->v4_size ? CHUNK_HEADER_SIZE + info->v4_size * entry_size : 0) +              // V4 codebook chunk, only if non-empty
+                   CHUNK_HEADER_SIZE) << 3;                                                            // third chunk header; << 3 turns bytes into bits
+
+    //av_log(s->avctx, AV_LOG_INFO, "sizes %3i %3i -> %9"PRId64" score mb_count %i", info->v1_size, info->v4_size, ret, mb_count);
+
+#ifdef CINEPAK_REPORT_SERR
+    *serr = 0;
+#endif
+
+    switch(info->mode) {
+    case MODE_V1_ONLY:
+        //one byte per MB
+        ret += s->lambda * 8 * mb_count;
+
+// while calculating we assume all blocks are ENC_V1
+        for(x = 0; x < mb_count; x++) {
+            mb = &s->mb[x];
+            ret += FF_LAMBDA_SCALE * mb->v1_error;
+#ifdef CINEPAK_REPORT_SERR
+            *serr += mb->v1_error;
+#endif
+// this function is never called for report in MODE_V1_ONLY
+//            if(!report)
+            mb->best_encoding = ENC_V1;
+        }
+
+        break;
+    case MODE_V1_V4:
+        //9 or 33 bits per MB
+        if(report) { // report: keep each MB's existing best_encoding and only re-sum the score
+// no moves between the corresponding training sets are allowed
+            *training_set_v1_shrunk = *training_set_v4_shrunk = 0;
+            for(x = 0; x < mb_count; x++) {
+                int mberr; // only consumed when CINEPAK_REPORT_SERR is defined
+                mb = &s->mb[x];
+                if(mb->best_encoding == ENC_V1)
+                    score1 = s->lambda * 9  + FF_LAMBDA_SCALE * (mberr=mb->v1_error);
+                else
+                    score1 = s->lambda * 33 + FF_LAMBDA_SCALE * (mberr=mb->v4_error);
+                ret += score1;
+#ifdef CINEPAK_REPORT_SERR
+                *serr += mberr;
+#endif
+            }
+        } else { // find best mode per block
+            for(x = 0; x < mb_count; x++) {
+                mb = &s->mb[x];
+                score1 = s->lambda * 9  + FF_LAMBDA_SCALE * mb->v1_error;
+                score2 = s->lambda * 33 + FF_LAMBDA_SCALE * mb->v4_error;
+
+                if(score1 <= score2) { // ties go to the cheaper V1 encoding
+                    ret += score1;
+#ifdef CINEPAK_REPORT_SERR
+                    *serr += mb->v1_error;
+#endif
+                    mb->best_encoding = ENC_V1;
+                } else {
+                    ret += score2;
+#ifdef CINEPAK_REPORT_SERR
+                    *serr += mb->v4_error;
+#endif
+                    mb->best_encoding = ENC_V4;
+                }
+            }
+        }
+
+        break;
+    case MODE_MC:
+        //1, 10 or 34 bits per MB
+        if(report) { // report: V1/V4 blocks may only be demoted to ENC_SKIP; the demotions are counted
+            int v1_shrunk = 0, v4_shrunk = 0;
+            for(x = 0; x < mb_count; x++) {
+                mb = &s->mb[x];
+// it is OK to move blocks to ENC_SKIP here
+// but not to any codebook encoding!
+                score1 = s->lambda * 1  + FF_LAMBDA_SCALE * mb->skip_error;
+                if(mb->best_encoding == ENC_SKIP) {
+                    ret += score1;
+#ifdef CINEPAK_REPORT_SERR
+                    *serr += mb->skip_error;
+#endif
+                } else if(mb->best_encoding == ENC_V1) {
+                    if((score2=s->lambda * 10 + FF_LAMBDA_SCALE * mb->v1_error) >= score1) { // skipping is at least as good
+                        mb->best_encoding = ENC_SKIP;
+                        ++v1_shrunk;
+                        ret += score1;
+#ifdef CINEPAK_REPORT_SERR
+                        *serr += mb->skip_error;
+#endif
+                    } else {
+                        ret += score2;
+#ifdef CINEPAK_REPORT_SERR
+                        *serr += mb->v1_error;
+#endif
+                    }
+                } else { // any other encoding is scored as V4
+                    if((score3=s->lambda * 34 + FF_LAMBDA_SCALE * mb->v4_error) >= score1) { // skipping is at least as good
+                        mb->best_encoding = ENC_SKIP;
+                        ++v4_shrunk;
+                        ret += score1;
+#ifdef CINEPAK_REPORT_SERR
+                        *serr += mb->skip_error;
+#endif
+                    } else {
+                        ret += score3;
+#ifdef CINEPAK_REPORT_SERR
+                        *serr += mb->v4_error;
+#endif
+                    }
+                }
+            }
+            *training_set_v1_shrunk = v1_shrunk; // number of V1 blocks turned into skips
+            *training_set_v4_shrunk = v4_shrunk; // number of V4 blocks turned into skips
+        } else { // find best mode per block
+            for(x = 0; x < mb_count; x++) {
+                mb = &s->mb[x];
+                score1 = s->lambda * 1  + FF_LAMBDA_SCALE * mb->skip_error;
+                score2 = s->lambda * 10 + FF_LAMBDA_SCALE * mb->v1_error;
+                score3 = s->lambda * 34 + FF_LAMBDA_SCALE * mb->v4_error;
+
+                if(score1 <= score2 && score1 <= score3) { // ties prefer skip, then V1, then V4
+                    ret += score1;
+#ifdef CINEPAK_REPORT_SERR
+                    *serr += mb->skip_error;
+#endif
+                    mb->best_encoding = ENC_SKIP;
+                } else if(score2 <= score3) {
+                    ret += score2;
+#ifdef CINEPAK_REPORT_SERR
+                    *serr += mb->v1_error;
+#endif
+                    mb->best_encoding = ENC_V1;
+                } else {
+                    ret += score3;
+#ifdef CINEPAK_REPORT_SERR
+                    *serr += mb->v4_error;
+#endif
+                    mb->best_encoding = ENC_V4;
+                }
+            }
+        }
+
+        break;
+    }
+
+    return ret; // for any other mode value only the chunk overhead computed above is returned
+}
+
+static int write_chunk_header(unsigned char *buf, int chunk_type, int chunk_size)
+{   /* 1 type byte, then a 24-bit size field that counts this header too */
+    AV_WB24(buf + 1, CHUNK_HEADER_SIZE + chunk_size);
+    *buf = chunk_type;
+    return CHUNK_HEADER_SIZE;
+}
+
+// Writes one codebook chunk (header + `size` entries) to buf and returns the byte count;
+// RGB24 input: chunk_type_yuv, 6 bytes/entry; otherwise chunk_type_gray, 4 bytes/entry.
+static int encode_codebook(CinepakEncContext *s, int *codebook, int size, int chunk_type_yuv, int chunk_type_gray, unsigned char *buf)
+{
+    int x, y, ret, entry_size = s->pix_fmt == AV_PIX_FMT_RGB24 ? 6 : 4;
+    int incremental_codebook_replacement_mode = 0; // hardcoded here,
+                // the compiler should notice that this is a constant -- rl
+
+    ret = write_chunk_header(buf,
+          (s->pix_fmt == AV_PIX_FMT_RGB24 ? chunk_type_yuv : chunk_type_gray)
+           + (incremental_codebook_replacement_mode?1:0),
+          entry_size * size
+           + (incremental_codebook_replacement_mode?(size+31)/32*4:0) );
+
+// we encode in "intra" mode; the "dead" incremental branch is kept for reference (it would
+// give "kind of" motion compensation, especially with 1 strip/frame, but is not useful as-is) -- rl
+    if(incremental_codebook_replacement_mode) {
+        // one flag bit per entry, MSB first, in a 32-bit word written before each group of 32;
+        // unsigned: 0x80000000 does not fit in int and >> of a negative int is implementation-defined
+        uint32_t flags = 0;
+        int flagsind = ret;
+        for(x = 0; x < size; x++) {
+            if(flags == 0) {
+                flagsind = ret;
+                ret += 4;
+                flags = 0x80000000;
+            } else
+                flags = ((flags>>1) | 0x80000000);
+            for(y = 0; y < entry_size; y++)
+                buf[ret++] = codebook[y + x*entry_size] ^ (y >= 4 ? 0x80 : 0);
+            if(flags == 0xffffffff) {
+                AV_WB32(&buf[flagsind], flags);
+                flags = 0;
+            }
+        }
+        if(flags)
+            AV_WB32(&buf[flagsind], flags);
+    } else
+        for(x = 0; x < size; x++)
+            for(y = 0; y < entry_size; y++)
+                buf[ret++] = codebook[y + x*entry_size] ^ (y >= 4 ? 0x80 : 0); // components 4 and 5 get their top bit flipped
+
+    return ret;
+}
+
+// Points out_data/out_linesize at the sub picture of in_data that starts at pixel (x,y); no pixel data is copied.
+static void get_sub_picture(CinepakEncContext *s, int x, int y,
+                            uint8_t * in_data[4], int  in_linesize[4],
+                            uint8_t *out_data[4], int out_linesize[4])
+{
+    out_data[0] = in_data[0] + x + y * in_linesize[0]; // plane 0 is addressed at full resolution
+    out_linesize[0] = in_linesize[0];
+
+    if(s->pix_fmt == AV_PIX_FMT_RGB24) { // only in this case are planes 1 and 2 set up
+        out_data[1] = in_data[1] + (x >> 1) + (y >> 1) * in_linesize[1]; // planes 1 and 2 use halved coordinates
+        out_linesize[1] = in_linesize[1];
+
+        out_data[2] = in_data[2] + (x >> 1) + (y >> 1) * in_linesize[2];
+        out_linesize[2] = in_linesize[2];
+    }
+}
+
+// Expands entry v1_vector of info->v1_codebook into the 4x4 MB pointed to by data.
+static void decode_v1_vector(CinepakEncContext *s, uint8_t *data[4],
+                             int linesize[4], int v1_vector, strip_info *info)
+{
+    int entry_size = s->pix_fmt == AV_PIX_FMT_RGB24 ? 6 : 4; // components per codebook entry
+
+    data[0][0] = // component 0 fills the top-left 2x2 of plane 0
+            data[0][1] =
+            data[0][    linesize[0]] =
+            data[0][1+  linesize[0]] = info->v1_codebook[v1_vector*entry_size];
+
+    data[0][2] = // component 1 fills the top-right 2x2
+            data[0][3] =
+            data[0][2+  linesize[0]] =
+            data[0][3+  linesize[0]] = info->v1_codebook[v1_vector*entry_size+1];
+
+    data[0][2*linesize[0]] = // component 2 fills the bottom-left 2x2
+            data[0][1+2*linesize[0]] =
+            data[0][  3*linesize[0]] =
+            data[0][1+3*linesize[0]] = info->v1_codebook[v1_vector*entry_size+2];
+
+    data[0][2+2*linesize[0]] = // component 3 fills the bottom-right 2x2
+            data[0][3+2*linesize[0]] =
+            data[0][2+3*linesize[0]] =
+            data[0][3+3*linesize[0]] = info->v1_codebook[v1_vector*entry_size+3];
+
+    if(s->pix_fmt == AV_PIX_FMT_RGB24) { // 6-component entries also carry one value each for planes 1 and 2
+        data[1][0] = // component 4 fills the whole 2x2 block of plane 1
+            data[1][1] =
+            data[1][    linesize[1]] =
+            data[1][1+  linesize[1]] = info->v1_codebook[v1_vector*entry_size+4];
+
+        data[2][0] = // component 5 fills the whole 2x2 block of plane 2
+            data[2][1] =
+            data[2][    linesize[2]] =
+            data[2][1+  linesize[2]] = info->v1_codebook[v1_vector*entry_size+5];
+    }
+}
+
+// Expands the four info->v4_codebook entries indexed by v4_vector[0..3] into the 4x4 MB pointed to by data, one entry per 2x2 quadrant.
+static void decode_v4_vector(CinepakEncContext *s, uint8_t *data[4],
+                             int linesize[4], int *v4_vector, strip_info *info)
+{
+    int i, x, y, entry_size = s->pix_fmt == AV_PIX_FMT_RGB24 ? 6 : 4; // components per codebook entry
+
+    for(i = y = 0; y < 4; y += 2) { // i walks the quadrants: top-left, top-right, bottom-left, bottom-right
+        for(x = 0; x < 4; x += 2, i++) {
+            data[0][x   +     y*linesize[0]] = info->v4_codebook[v4_vector[i]*entry_size];   // components 0..3 are the quadrant's 2x2 plane-0 samples
+            data[0][x+1 +     y*linesize[0]] = info->v4_codebook[v4_vector[i]*entry_size+1];
+            data[0][x   + (y+1)*linesize[0]] = info->v4_codebook[v4_vector[i]*entry_size+2];
+            data[0][x+1 + (y+1)*linesize[0]] = info->v4_codebook[v4_vector[i]*entry_size+3];
+
+            if(s->pix_fmt == AV_PIX_FMT_RGB24) { // components 4 and 5: one sample each in planes 1 and 2
+                data[1][(x>>1) + (y>>1)*linesize[1]] = info->v4_codebook[v4_vector[i]*entry_size+4];
+                data[2][(x>>1) + (y>>1)*linesize[2]] = info->v4_codebook[v4_vector[i]*entry_size+5];
+            }
+        }
+    }
+}
+
+static void copy_mb(CinepakEncContext *s,                  // copies one macroblock from b to a
+                    uint8_t *a_data[4], int a_linesize[4], // destination planes and strides
+                    uint8_t *b_data[4], int b_linesize[4]) // source planes and strides
+{
+    int y, p;
+
+    for(y = 0; y < MB_SIZE; y++) { // plane 0: MB_SIZE rows of MB_SIZE bytes
+        memcpy(a_data[0]+y*a_linesize[0], b_data[0]+y*b_linesize[0],
+               MB_SIZE);
+    }
+
+    if(s->pix_fmt == AV_PIX_FMT_RGB24) { // planes 1 and 2 are copied only in this case
+        for(p = 1; p <= 2; p++) {
+            for(y = 0; y < MB_SIZE/2; y++) { // half as many rows, half as wide
+                memcpy(a_data[p] + y*a_linesize[p],
+                       b_data[p] + y*b_linesize[p],
+                       MB_SIZE/2);
+            }
+        }
+    }
+}
+
+static int encode_mode(CinepakEncContext *s, int h,                       // h: height of the strip being encoded
+                       uint8_t *scratch_data[4], int scratch_linesize[4], // receives the reconstructed strip
+                       uint8_t *last_data[4], int last_linesize[4],       // source of skipped MBs in MODE_MC
+                       strip_info *info, unsigned char *buf)              // buf receives the encoded chunks
+{ // Writes the codebook chunks and the vector chunk for info->mode into buf; returns the number of bytes written.
+    int x, y, z, bits, temp_size, header_ofs, ret = 0, mb_count = s->w * h / MB_AREA;
+    int needs_extra_bit, should_write_temp;
+    uint32_t flags; // unsigned: bit 31 is set by shifting, which is undefined behaviour on a signed int
+    unsigned char temp[64]; //32/2 = 16 V4 blocks at 4 B each -> 64 B
+    mb_info *mb;
+    uint8_t *sub_scratch_data[4] = {0}, *sub_last_data[4] = {0};
+    int sub_scratch_linesize[4] = {0}, sub_last_linesize[4] = {0};
+
+    //encode codebooks
+////// MacOS vintage decoder compatibility dictates the presence of
+////// the codebook chunk even when the codebook is empty - pretty dumb...
+////// and also the certain order of the codebook chunks -- rl
+    if(info->v4_size || !s->skip_empty_cb)
+        ret += encode_codebook(s, info->v4_codebook, info->v4_size, 0x20, 0x24, buf + ret);
+
+    if(info->v1_size || !s->skip_empty_cb)
+        ret += encode_codebook(s, info->v1_codebook, info->v1_size, 0x22, 0x26, buf + ret);
+
+    //update scratch picture
+    for(z = y = 0; y < h; y += MB_SIZE) {
+        for(x = 0; x < s->w; x += MB_SIZE, z++) {
+            mb = &s->mb[z];
+
+            get_sub_picture(s, x, y, scratch_data, scratch_linesize,
+                            sub_scratch_data, sub_scratch_linesize);
+
+            if(info->mode == MODE_MC && mb->best_encoding == ENC_SKIP) { // skipped MB: reuse the pixels of the last frame
+                get_sub_picture(s, x, y,
+                                last_data, last_linesize,
+                                sub_last_data, sub_last_linesize);
+                copy_mb(s, sub_scratch_data, sub_scratch_linesize,
+                        sub_last_data, sub_last_linesize);
+            } else if(info->mode == MODE_V1_ONLY || mb->best_encoding == ENC_V1)
+                decode_v1_vector(s, sub_scratch_data, sub_scratch_linesize,
+                                 mb->v1_vector, info);
+            else
+                decode_v4_vector(s, sub_scratch_data, sub_scratch_linesize,
+                                 mb->v4_vector, info);
+        }
+    }
+
+    switch(info->mode) {
+    case MODE_V1_ONLY: // chunk 0x32: one V1 index byte per MB, no flag words
+        ret += write_chunk_header(buf + ret, 0x32, mb_count);
+
+        for(x = 0; x < mb_count; x++)
+            buf[ret++] = s->mb[x].v1_vector;
+
+        break;
+    case MODE_V1_V4: // chunk 0x30: per group of up to 32 MBs, a 32-bit flag word followed by the vectors
+        //remember header position
+        header_ofs = ret;
+        ret += CHUNK_HEADER_SIZE;
+
+        for(x = 0; x < mb_count; x += 32) {
+            flags = 0;
+            for(y = x; y < FFMIN(x+32, mb_count); y++)
+                if(s->mb[y].best_encoding == ENC_V4)
+                    flags |= 1U << (31 - y + x); // MSB first; 1U keeps the shift into bit 31 well defined
+
+            AV_WB32(&buf[ret], flags);
+            ret += 4;
+
+            for(y = x; y < FFMIN(x+32, mb_count); y++) {
+                mb = &s->mb[y];
+
+                if(mb->best_encoding == ENC_V1)
+                    buf[ret++] = mb->v1_vector;
+                else
+                    for(z = 0; z < 4; z++)
+                        buf[ret++] = mb->v4_vector[z];
+            }
+        }
+
+        write_chunk_header(buf + header_ofs, 0x30, ret - header_ofs - CHUNK_HEADER_SIZE); // size is known only now
+
+        break;
+    case MODE_MC: // chunk 0x31: 1 flag bit per skipped MB, 2 bits (coded, V4?) per coded MB
+        //remember header position
+        header_ofs = ret;
+        ret += CHUNK_HEADER_SIZE;
+        flags = bits = temp_size = 0;
+
+        for(x = 0; x < mb_count; x++) {
+            mb = &s->mb[x];
+            flags |= (uint32_t)(mb->best_encoding != ENC_SKIP) << (31 - bits++); // "coded" bit
+            needs_extra_bit = 0;
+            should_write_temp = 0;
+
+            if(mb->best_encoding != ENC_SKIP) {
+                if(bits < 32)
+                    flags |= (uint32_t)(mb->best_encoding == ENC_V4) << (31 - bits++); // "is V4" bit
+                else
+                    needs_extra_bit = 1; // flag word is full: the V4 bit goes into the next word
+            }
+
+            if(bits == 32) { // flag word complete: emit it, then the vectors buffered in temp
+                AV_WB32(&buf[ret], flags);
+                ret += 4;
+                flags = bits = 0;
+
+                if(mb->best_encoding == ENC_SKIP || needs_extra_bit) {
+                    memcpy(&buf[ret], temp, temp_size);
+                    ret += temp_size;
+                    temp_size = 0;
+                } else
+                    should_write_temp = 1; // this MB's vector belongs to the word just written; flush after buffering it
+            }
+
+            if(needs_extra_bit) {
+                flags = (uint32_t)(mb->best_encoding == ENC_V4) << 31; // deferred V4 bit opens the new flag word
+                bits = 1;
+            }
+
+            if(mb->best_encoding == ENC_V1)
+                temp[temp_size++] = mb->v1_vector;
+            else if(mb->best_encoding == ENC_V4)
+                for(z = 0; z < 4; z++)
+                    temp[temp_size++] = mb->v4_vector[z];
+
+            if(should_write_temp) {
+                memcpy(&buf[ret], temp, temp_size);
+                ret += temp_size;
+                temp_size = 0;
+            }
+        }
+
+        if(bits > 0) { // partially filled last flag word and its pending vectors
+            AV_WB32(&buf[ret], flags);
+            ret += 4;
+            memcpy(&buf[ret], temp, temp_size);
+            ret += temp_size;
+        }
+
+        write_chunk_header(buf + header_ofs, 0x31, ret - header_ofs - CHUNK_HEADER_SIZE); // size is known only now
+
+        break;
+    }
+
+    return ret;
+}
+
+// Returns the sum of squared sample differences between the MBs at a and b (plane 0, plus planes 1 and 2 for RGB24 input).
+static int compute_mb_distortion(CinepakEncContext *s,
+                                 uint8_t *a_data[4], int a_linesize[4],
+                                 uint8_t *b_data[4], int b_linesize[4])
+{
+    int x, y, p, d, ret = 0;
+
+    for(y = 0; y < MB_SIZE; y++) { // plane 0: MB_SIZE x MB_SIZE samples
+        for(x = 0; x < MB_SIZE; x++) {
+            d = a_data[0][x + y*a_linesize[0]] - b_data[0][x + y*b_linesize[0]];
+            ret += d*d;
+        }
+    }
+
+    if(s->pix_fmt == AV_PIX_FMT_RGB24) { // planes 1 and 2 contribute only in this case
+        for(p = 1; p <= 2; p++) {
+            for(y = 0; y < MB_SIZE/2; y++) { // MB_SIZE/2 x MB_SIZE/2 samples per plane
+                for(x = 0; x < MB_SIZE/2; x++) {
+                    d = a_data[p][x + y*a_linesize[p]] - b_data[p][x + y*b_linesize[p]];
+                    ret += d*d;
+                }
+            }
+        }
+    }
+
+    return ret;
+}
+
+// Builds the V1 (v1mode != 0) or V4 codebook for a strip, assigns vectors and errors to its MBs; returns the possibly reduced codebook size (0 if there was nothing to train on).
+#define CERTAIN(x) ((x)!=ENC_UNCERTAIN)
+static int quantize(CinepakEncContext *s, int h,
+                    uint8_t *data[4], int linesize[4],
+                    int v1mode, strip_info *info,
+                    mb_encoding encoding) // ENC_UNCERTAIN trains on every MB, otherwise only on MBs whose best_encoding matches
+{
+    int x, y, i, j, k, x2, y2, x3, y3, plane, shift, mbn;
+    int entry_size = s->pix_fmt == AV_PIX_FMT_RGB24 ? 6 : 4;
+    int *codebook = v1mode ? info->v1_codebook : info->v4_codebook;
+    int size = v1mode ? info->v1_size : info->v4_size; // requested codebook size, read from info
+    int64_t total_error = 0; // accumulated below, only referenced by the commented-out log line
+    uint8_t vq_pict_buf[(MB_AREA*3)/2]; // one MB: MB_AREA bytes for plane 0 plus MB_AREA/4 each for planes 1 and 2
+    uint8_t *sub_data    [4], *vq_data    [4];
+    int      sub_linesize[4],  vq_linesize[4];
+
+    for(mbn = i = y = 0; y < h; y += MB_SIZE) { // pass 1: gather training vectors into s->codebook_input; i counts vectors
+        for(x = 0; x < s->w; x += MB_SIZE, ++mbn) {
+            int *base;
+
+            if(CERTAIN(encoding)) {
+// for training, use only the blocks known to be encoded this way
+               if(s->mb[mbn].best_encoding != encoding) continue;
+            }
+
+            base = s->codebook_input + i*entry_size;
+            if(v1mode) {
+                //subsample
+                for(j = y2 = 0; y2 < entry_size; y2 += 2) { // y2 = 0,2 address plane 0; y2 = 4 (6-component entries only) addresses planes 1 and 2
+                    for(x2 = 0; x2 < 4; x2 += 2, j++) {
+                        plane = y2 < 4 ? 0 : 1 + (x2 >> 1);
+                        shift = y2 < 4 ? 0 : 1;
+                        x3 = shift ? 0 : x2;
+                        y3 = shift ? 0 : y2;
+                        base[j] = (data[plane][((x+x3) >> shift) +      ((y+y3) >> shift)      * linesize[plane]] + // average of a 2x2 neighbourhood
+                                   data[plane][((x+x3) >> shift) + 1 +  ((y+y3) >> shift)      * linesize[plane]] +
+                                   data[plane][((x+x3) >> shift) +     (((y+y3) >> shift) + 1) * linesize[plane]] +
+                                   data[plane][((x+x3) >> shift) + 1 + (((y+y3) >> shift) + 1) * linesize[plane]]) >> 2;
+                    }
+                }
+            } else {
+                //copy
+                for(j = y2 = 0; y2 < MB_SIZE; y2 += 2) { // one training vector per 2x2 quadrant of the MB
+                    for(x2 = 0; x2 < MB_SIZE; x2 += 2) {
+                        for(k = 0; k < entry_size; k++, j++) {
+                            plane = k >= 4 ? k - 3 : 0; // k = 4 -> plane 1, k = 5 -> plane 2
+
+                            if(k >= 4) {
+                                x3 = (x+x2) >> 1;
+                                y3 = (y+y2) >> 1;
+                            } else {
+                                x3 = x + x2 + (k & 1);
+                                y3 = y + y2 + (k >> 1);
+                            }
+
+                            base[j] = data[plane][x3 + y3*linesize[plane]];
+                        }
+                    }
+                }
+            }
+            i += v1mode ? 1 : 4; // one vector per MB in V1 mode, four in V4 mode
+        }
+    }
+//    if(i < mbn*(v1mode ? 1 : 4)) {
+//        av_log(s->avctx, AV_LOG_INFO, "reducing training set for %s from %i to %i (encoding %i)\n", v1mode?"v1":"v4", mbn*(v1mode ? 1 : 4), i, encoding);
+//    }
+
+    if(i == 0) // empty training set, nothing to do
+        return 0;
+    if(i < size) { // never ask for more codebook entries than there are training vectors
+        //av_log(s->avctx, (CERTAIN(encoding) ? AV_LOG_ERROR : AV_LOG_INFO), "WOULD WASTE: %s cbsize %i bigger than training set size %i (encoding %i)\n", v1mode?"v1":"v4", size, i, encoding);
+        size = i;
+    }
+
+    avpriv_init_elbg(s->codebook_input, entry_size, i, codebook, size, 1, s->codebook_closest, &s->randctx); // NOTE(review): external ELBG helpers; presumably they fill codebook and s->codebook_closest -- confirm
+    avpriv_do_elbg(s->codebook_input, entry_size, i, codebook, size, 1, s->codebook_closest, &s->randctx);
+
+    //setup vq_data, which contains a single MB
+    vq_data[0] = vq_pict_buf;
+    vq_linesize[0] = MB_SIZE;
+    vq_data[1] = &vq_pict_buf[MB_AREA];
+    vq_data[2] = vq_data[1] + (MB_AREA >> 2);
+    vq_linesize[1] = vq_linesize[2] = MB_SIZE >> 1;
+
+    //copy indices
+    for(i = j = y = 0; y < h; y += MB_SIZE) { // pass 2: same MB order and filter as pass 1, so i indexes the matching training vector
+        for(x = 0; x < s->w; x += MB_SIZE, j++) {
+            mb_info *mb = &s->mb[j];
+// skip uninteresting blocks if we know their preferred encoding
+            if(CERTAIN(encoding) && mb->best_encoding != encoding)
+                continue;
+
+            //point sub_data to current MB
+            get_sub_picture(s, x, y, data, linesize, sub_data, sub_linesize);
+
+            if(v1mode) {
+                mb->v1_vector = s->codebook_closest[i];
+
+                //fill in vq_data with V1 data
+                decode_v1_vector(s, vq_data, vq_linesize, mb->v1_vector, info);
+
+                mb->v1_error = compute_mb_distortion(s, sub_data, sub_linesize,
+                                                     vq_data, vq_linesize);
+                total_error += mb->v1_error;
+            } else {
+                for(k = 0; k < 4; k++)
+                    mb->v4_vector[k] = s->codebook_closest[i+k];
+
+                //fill in vq_data with V4 data
+                decode_v4_vector(s, vq_data, vq_linesize, mb->v4_vector, info);
+
+                mb->v4_error = compute_mb_distortion(s, sub_data, sub_linesize,
+                                                     vq_data, vq_linesize);
+                total_error += mb->v4_error;
+            }
+            i += v1mode ? 1 : 4;
+        }
+    }
+// check that we did it right in the beginning of the function
+    av_assert0(i >= size); // training set is no smaller than the codebook
+
+    //av_log(s->avctx, AV_LOG_INFO, "isv1 %i size= %i i= %i error %"PRId64"\n", v1mode, size, i, total_error);
+
+    return size;
+}
+
+static void calculate_skip_errors(CinepakEncContext *s, int h,                  // fills s->mb[i].skip_error for every MB of the strip
+                                  uint8_t *last_data[4], int last_linesize[4], // strip of the previous frame
+                                  uint8_t *data[4], int linesize[4],           // strip of the current frame
+                                  strip_info *info)                            // not used in this function
+{
+    int x, y, i;
+    uint8_t *sub_last_data    [4], *sub_pict_data    [4];
+    int      sub_last_linesize[4],  sub_pict_linesize[4];
+
+    for(i = y = 0; y < h; y += MB_SIZE) { // i is the MB index, rows first
+        for(x = 0; x < s->w; x += MB_SIZE, i++) {
+            get_sub_picture(s, x, y, last_data,     last_linesize,
+                                 sub_last_data, sub_last_linesize);
+            get_sub_picture(s, x, y,      data,          linesize,
+                                 sub_pict_data, sub_pict_linesize);
+
+            s->mb[i].skip_error = compute_mb_distortion(s, // squared error if this MB were simply copied from the last frame
+                                            sub_last_data, sub_last_linesize,
+                                            sub_pict_data, sub_pict_linesize);
+        }
+    }
+}
+
+static void write_strip_header(CinepakEncContext *s, int y, int h, int keyframe, unsigned char *buf, int strip_size)
+{ // Fills bytes 0..11 of buf with the strip header; strip_size excludes the header. y is used only in the commented-out variants.
+// actually we are exclusively using intra strip coding (how much can we win
+// otherwise? how to choose which part of a codebook to update?),
+// keyframes are different only because we disallow ENC_SKIP on them -- rl
+// (besides, the logic here used to be inverted: )
+//    buf[0] = keyframe ? 0x11: 0x10;
+    buf[0] = keyframe ? 0x10: 0x11; // strip id byte
+    AV_WB24(&buf[1], strip_size + STRIP_HEADER_SIZE); // stored size includes the header itself
+//    AV_WB16(&buf[4], y); /* using absolute y values works -- rl */
+    AV_WB16(&buf[4], 0); /* using relative values works as well -- rl */
+    AV_WB16(&buf[6], 0);
+//    AV_WB16(&buf[8], y+h); /* using absolute y values works -- rl */
+    AV_WB16(&buf[8], h); /* using relative values works as well -- rl */
+    AV_WB16(&buf[10], s->w);
+    //av_log(s->avctx, AV_LOG_INFO, "write_strip_header() %x keyframe=%d\n", buf[0], keyframe);
+}
+
+static int rd_strip(CinepakEncContext *s, int y, int h, int keyframe,
+                    uint8_t *last_data[4], int last_linesize[4],
+                    uint8_t *data[4], int linesize[4],
+                    uint8_t *scratch_data[4], int scratch_linesize[4],
+                    unsigned char *buf, int64_t *best_score
+#ifdef CINEPAK_REPORT_SERR
+, int64_t *best_serr
+#endif
+)
+{ // Tries codebook sizes and modes for one strip, keeps the lowest-score encoding, copies it (with strip header) to buf; returns its size in bytes.
+    int64_t score = 0;
+#ifdef CINEPAK_REPORT_SERR
+    int64_t serr;
+#endif
+    int best_size = 0; // 0 means "nothing encoded yet"
+    strip_info info;
+// for codebook optimization:
+    int v1enough, v1_size, v4enough, v4_size;
+    int new_v1_size, new_v4_size;
+    int v1shrunk, v4shrunk;
+
+    if(!keyframe) // skip errors are only needed when skipping is allowed (MODE_MC)
+        calculate_skip_errors(s, h, last_data, last_linesize, data, linesize,
+                              &info);
+
+    //try some powers of 4 for the size of the codebooks
+    //constrain the v4 codebook to be no bigger than the v1 one,
+    //(and no less than v1_size/4)
+    //thus making v1 preferable and possibly losing small details? should be ok
+#define SMALLEST_CODEBOOK 1
+    for(v1enough = 0, v1_size = SMALLEST_CODEBOOK; v1_size <= CODEBOOK_MAX && !v1enough; v1_size <<= 2) {
+        for(v4enough = 0, v4_size = 0; v4_size <= v1_size && !v4enough; v4_size = v4_size ? v4_size << 2 : v1_size >= SMALLEST_CODEBOOK << 2 ? v1_size >> 2 : SMALLEST_CODEBOOK) { // v4_size runs 0, then v1_size/4 (at least SMALLEST_CODEBOOK), then x4 each step
+            //try all modes
+            for(CinepakMode mode = 0; mode < MODE_COUNT; mode++) {
+                //don't allow MODE_MC in intra frames
+                if(keyframe && mode == MODE_MC)
+                    continue;
+
+                if(mode == MODE_V1_ONLY) {
+                    info.v1_size = v1_size;
+// the size may shrink even before optimizations if the input is short:
+                    info.v1_size = quantize(s, h, data, linesize, 1,
+                                            &info, ENC_UNCERTAIN);
+                    if(info.v1_size < v1_size)
+// too few eligible blocks, no sense in trying bigger sizes
+                        v1enough = 1;
+
+                    info.v4_size = 0;
+                } else { // mode != MODE_V1_ONLY
+                    // if v4 codebook is empty then only allow V1-only mode
+                    if(!v4_size)
+                        continue;
+
+                    if(mode == MODE_V1_V4) {
+                        info.v4_size = v4_size;
+                        info.v4_size = quantize(s, h, data, linesize, 0,
+                                                &info, ENC_UNCERTAIN);
+                        if(info.v4_size < v4_size)
+// too few eligible blocks, no sense in trying bigger sizes
+                            v4enough = 1;
+                    }
+                }
+
+                info.mode = mode;
+// choose the best encoding per block, based on current experience
+                score = calculate_mode_score(s, h, &info, 0,
+                                             &v1shrunk, &v4shrunk
+#ifdef CINEPAK_REPORT_SERR
+, &serr
+#endif
+);
+
+                if(mode != MODE_V1_ONLY){
+                    int extra_iterations_limit = s->max_extra_cb_iterations;
+// recompute the codebooks, omitting the extra blocks
+// we assume we _may_ come here with more blocks to encode than before
+                    info.v1_size = v1_size;
+                    new_v1_size = quantize(s, h, data, linesize, 1, &info, ENC_V1); // retrain on the MBs currently marked ENC_V1 only
+                    if(new_v1_size < info.v1_size){
+                        //av_log(s->avctx, AV_LOG_INFO, "mode %i, %3i, %3i: cut v1 codebook to %i entries\n", mode, v1_size, v4_size, new_v1_size);
+                        info.v1_size = new_v1_size;
+                    }
+// we assume we _may_ come here with more blocks to encode than before
+                    info.v4_size = v4_size;
+                    new_v4_size = quantize(s, h, data, linesize, 0, &info, ENC_V4); // retrain on the MBs currently marked ENC_V4 only
+                    if(new_v4_size < info.v4_size) {
+                        //av_log(s->avctx, AV_LOG_INFO, "mode %i, %3i, %3i: cut v4 codebook to %i entries at first iteration\n", mode, v1_size, v4_size, new_v4_size);
+                        info.v4_size = new_v4_size;
+                    }
+// calculate the resulting score
+// (do not move blocks to codebook encodings now, as some blocks may have
+// got bigger errors despite a smaller training set - but we do not
+// ever grow the training sets back)
+                    for(;;) {
+                        score = calculate_mode_score(s, h, &info, 1,
+                                                     &v1shrunk, &v4shrunk
+#ifdef CINEPAK_REPORT_SERR
+, &serr
+#endif
+);
+// do we have a reason to reiterate? if so, have we reached the limit?
+                        if((!v1shrunk && !v4shrunk) || !extra_iterations_limit--) break;
+// recompute the codebooks, omitting the extra blocks
+                        if(v1shrunk) {
+                            info.v1_size = v1_size;
+                            new_v1_size = quantize(s, h, data, linesize, 1, &info, ENC_V1);
+                            if(new_v1_size < info.v1_size){
+                                //av_log(s->avctx, AV_LOG_INFO, "mode %i, %3i, %3i: cut v1 codebook to %i entries\n", mode, v1_size, v4_size, new_v1_size);
+                                info.v1_size = new_v1_size;
+                            }
+                        }
+                        if(v4shrunk) {
+                            info.v4_size = v4_size;
+                            new_v4_size = quantize(s, h, data, linesize, 0, &info, ENC_V4);
+                            if(new_v4_size < info.v4_size) {
+                                //av_log(s->avctx, AV_LOG_INFO, "mode %i, %3i, %3i: cut v4 codebook to %i entries\n", mode, v1_size, v4_size, new_v4_size);
+                                info.v4_size = new_v4_size;
+                            }
+                        }
+                    }
+                }
+
+                //av_log(s->avctx, AV_LOG_INFO, "%3i %3i score = %"PRId64"\n", v1_size, v4_size, score);
+
+                if(best_size == 0 || score < *best_score) { // first candidate, or strictly better than the best so far
+
+                    *best_score = score;
+#ifdef CINEPAK_REPORT_SERR
+                    *best_serr = serr;
+#endif
+                    best_size = encode_mode(s, h, // encode right away into s->strip_buf, leaving room for the strip header
+                                            scratch_data, scratch_linesize,
+                                            last_data, last_linesize, &info,
+                                            s->strip_buf + STRIP_HEADER_SIZE);
+
+                    //av_log(s->avctx, AV_LOG_INFO, "mode %i, %3i, %3i: %18"PRId64" %i B", mode, info.v1_size, info.v4_size, score, best_size);
+                    //av_log(s->avctx, AV_LOG_INFO, "\n");
+#ifdef CINEPAK_REPORT_SERR
+                    av_log(s->avctx, AV_LOG_INFO, "mode %i, %3i, %3i: %18"PRId64" %i B\n", mode, v1_size, v4_size, serr, best_size);
+#endif
+
+#ifdef CINEPAKENC_DEBUG
+                    //save MB encoding choices
+                    memcpy(s->best_mb, s->mb, mb_count*sizeof(mb_info));
+#endif
+
+                    //memcpy(strip_temp + STRIP_HEADER_SIZE, strip_temp, best_size);
+                    write_strip_header(s, y, h, keyframe, s->strip_buf, best_size);
+
+                }
+            }
+        }
+    }
+
+#ifdef CINEPAKENC_DEBUG
+    //gather stats. this will only work properly if MAX_STRIPS == 1
+    // NOTE(review): best_info (and mb_count above) are not declared in this function, so this section looks unbuildable as-is -- confirm
+    if(best_info.mode == MODE_V1_ONLY) {
+        s->num_v1_mode++;
+        s->num_v1_encs += s->w*h/MB_AREA;
+    } else {
+        if(best_info.mode == MODE_V1_V4)
+            s->num_v4_mode++;
+        else
+            s->num_mc_mode++;
+
+        int x;
+        for(x = 0; x < s->w*h/MB_AREA; x++)
+            if(s->best_mb[x].best_encoding == ENC_V1)
+                s->num_v1_encs++;
+            else if(s->best_mb[x].best_encoding == ENC_V4)
+                s->num_v4_encs++;
+            else
+                s->num_skips++;
+    }
+#endif
+
+    best_size += STRIP_HEADER_SIZE; // the returned size covers header plus payload
+    memcpy(buf, s->strip_buf, best_size);
+
+    return best_size;
+}
+
+static int write_cvid_header(CinepakEncContext *s, unsigned char *buf, int num_strips, int data_size, int isakeyframe)
+{ // Writes the frame header fields to buf[0..9]; data_size excludes the header. Returns CVID_HEADER_SIZE.
+    buf[0] = isakeyframe ? 0 : 1; // flag byte: 0 on keyframes, 1 otherwise
+    AV_WB24(&buf[1], data_size + CVID_HEADER_SIZE); // stored size includes the header itself
+    AV_WB16(&buf[4], s->w);
+    AV_WB16(&buf[6], s->h);
+    AV_WB16(&buf[8], num_strips);
+
+    return CVID_HEADER_SIZE;
+}
+
+static int rd_frame(CinepakEncContext *s, const AVFrame *frame,
+                    int isakeyframe, unsigned char *buf, int buf_size)
+{
+    int num_strips, strip, i, y, nexty, size, temp_size;
+    uint8_t *last_data    [4], *data    [4], *scratch_data    [4];
+    int      last_linesize[4],  linesize[4],  scratch_linesize[4];
+    int64_t best_score = 0, score, score_temp;
+#ifdef CINEPAK_REPORT_SERR
+    int64_t best_serr = 0, serr, serr_temp;
+#endif
+
+    int best_nstrips = -1, best_size = -1; // mark as uninitialzed
+
+    if(s->pix_fmt == AV_PIX_FMT_RGB24) {
+        int x;
+// build a copy of the given frame in the correct colorspace
+        for(y = 0; y < s->h; y += 2) {
+            for(x = 0; x < s->w; x += 2) {
+                uint8_t *ir[2]; int32_t r, g, b, rr, gg, bb;
+                ir[0] = frame->data[0] + x*3 + y*frame->linesize[0];
+                ir[1] = ir[0] + frame->linesize[0];
+                get_sub_picture(s, x, y,
+                                s->input_frame->data, s->input_frame->linesize,
+                                scratch_data, scratch_linesize);
+                r = g = b = 0;
+                for(i=0; i<4; ++i) {
+                    int i1, i2;
+                    i1 = (i&1); i2 = (i>=2);
+                    rr = ir[i2][i1*3+0];
+                    gg = ir[i2][i1*3+1];
+                    bb = ir[i2][i1*3+2];
+                    r += rr; g += gg; b += bb;
+// using fixed point arithmetic for portable repeatability, scaling by 2^23
+// "Y"
+//                    rr = 0.2857*rr + 0.5714*gg + 0.1429*bb;
+                    rr = (2396625*rr + 4793251*gg + 1198732*bb) >> 23;
+                    if(      rr <   0) rr =   0;
+                    else if (rr > 255) rr = 255;
+                    scratch_data[0][i1 + i2*scratch_linesize[0]] = rr;
+                }
+// let us scale down as late as possible
+//                r /= 4; g /= 4; b /= 4;
+// "U"
+//                rr = -0.1429*r - 0.2857*g + 0.4286*b;
+                rr = (-299683*r - 599156*g + 898839*b) >> 23;
+                if(      rr < -128) rr = -128;
+                else if (rr >  127) rr =  127;
+                scratch_data[1][0] = rr + 128; // quantize needs unsigned
+// "V"
+//                rr = 0.3571*r - 0.2857*g - 0.0714*b;
+                rr = (748893*r - 599156*g - 149737*b) >> 23;
+                if(      rr < -128) rr = -128;
+                else if (rr >  127) rr =  127;
+                scratch_data[2][0] = rr + 128; // quantize needs unsigned
+            }
+        }
+    }
+
+    //would be nice but quite certainly incompatible with vintage players:
+    // support encoding zero strips (meaning skip the whole frame)
+    for(num_strips = s->min_strips; num_strips <= s->max_strips && num_strips <= s->h / MB_SIZE; num_strips++) {
+        score = 0;
+        size = 0;
+#ifdef CINEPAK_REPORT_SERR
+        serr = 0;
+#endif
+
+        for(y = 0, strip = 1; y < s->h; strip++, y = nexty) {
+            int strip_height;
+
+            nexty = strip * s->h / num_strips; // <= s->h
+            //make nexty the next multiple of 4 if not already there
+            if(nexty & 3)
+                nexty += 4 - (nexty & 3);
+
+            strip_height = nexty - y;
+            if(strip_height <= 0) { // can this ever happen?
+                av_log(s->avctx, AV_LOG_INFO, "skipping zero height strip %i of %i\n", strip, num_strips);
+                continue;
+            }
+
+            if(s->pix_fmt == AV_PIX_FMT_RGB24)
+                get_sub_picture(s, 0, y,
+                                s->input_frame->data, s->input_frame->linesize,
+                                data, linesize);
+            else
+                get_sub_picture(s, 0, y,
+                                (uint8_t **)frame->data, (int*)frame->linesize,
+                                data, linesize);
+            get_sub_picture(s, 0, y,
+                            s->last_frame->data, s->last_frame->linesize,
+                            last_data, last_linesize);
+            get_sub_picture(s, 0, y,
+                            s->scratch_frame->data, s->scratch_frame->linesize,
+                            scratch_data, scratch_linesize);
+
+            if((temp_size = rd_strip(s, y, strip_height, isakeyframe,
+                                     last_data, last_linesize, data, linesize,
+                                     scratch_data, scratch_linesize,
+                                     s->frame_buf + size + CVID_HEADER_SIZE, &score_temp
+#ifdef CINEPAK_REPORT_SERR
+, &serr_temp
+#endif
+)) < 0)
+                return temp_size;
+
+            score += score_temp;
+#ifdef CINEPAK_REPORT_SERR
+            serr += serr_temp;
+#endif
+            size += temp_size;
+            //av_log(s->avctx, AV_LOG_INFO, "strip %d, isakeyframe=%d", strip, isakeyframe);
+            //av_log(s->avctx, AV_LOG_INFO, "\n");
+        }
+
+        if(best_score == 0 || score < best_score) {
+            best_score = score;
+#ifdef CINEPAK_REPORT_SERR
+            best_serr = serr;
+#endif
+            best_size = size + write_cvid_header(s, s->frame_buf, num_strips, size, isakeyframe);
+            //av_log(s->avctx, AV_LOG_INFO, "best number of strips so far: %2i, %12"PRId64", %i B\n", num_strips, score, best_size);
+#ifdef CINEPAK_REPORT_SERR
+            av_log(s->avctx, AV_LOG_INFO, "best number of strips so far: %2i, %12"PRId64", %i B\n", num_strips, serr, best_size);
+#endif
+
+            FFSWAP(AVFrame *, s->best_frame, s->scratch_frame);
+            memcpy(buf, s->frame_buf, best_size);
+            best_nstrips = num_strips;
+        }
+// avoid trying too many strip numbers without a real reason
+// (this makes the processing of the very first frame faster)
+        if(num_strips - best_nstrips > 4)
+            break;
+    }
+
+    av_assert0(best_nstrips >= 0 && best_size >= 0);
+
+// Let the number of strips adapt slowly to changes in the content. Compared
+// to brute-forcing the full range on every frame, this occasionally costs
+// some rate/distortion performance but makes encoding up to several times faster.
+    if(!s->strip_number_delta_range) {
+        if(best_nstrips == s->max_strips) { // let us try to step up
+            s->max_strips = best_nstrips + 1;
+            if(s->max_strips >= s->max_max_strips)
+                s->max_strips = s->max_max_strips;
+        } else { // try to step down
+            s->max_strips = best_nstrips;
+        }
+        s->min_strips = s->max_strips - 1;
+        if(s->min_strips < s->min_min_strips)
+            s->min_strips = s->min_min_strips;
+    } else {
+        s->max_strips = best_nstrips + s->strip_number_delta_range;
+        if(s->max_strips >= s->max_max_strips)
+            s->max_strips = s->max_max_strips;
+        s->min_strips = best_nstrips - s->strip_number_delta_range;
+        if(s->min_strips < s->min_min_strips)
+            s->min_strips = s->min_min_strips;
+    }
+
+    return best_size;
+}
+
+/* Encode one frame into pkt; returns 0 on success or a negative error code. */
+static int cinepak_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                                const AVFrame *frame, int *got_packet)
+{
+    CinepakEncContext *s = avctx->priv_data;
+    int ret;
+
+    s->lambda = frame->quality ? frame->quality - 1 : 2 * FF_LAMBDA_SCALE;
+    if ((ret = ff_alloc_packet2(avctx, pkt, s->frame_buf_size, 0)) < 0)
+        return ret;
+    /* rd_frame() returns the coded size, or a negative value on failure */
+    ret = rd_frame(s, frame, (s->curframe == 0), pkt->data, s->frame_buf_size);
+    if (ret < 0)
+        return ret; // never publish a negative size or swap frames on error
+    pkt->size = ret;
+    if (s->curframe == 0) // the frame handed to rd_frame() as a keyframe
+        pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+    FFSWAP(AVFrame *, s->last_frame, s->best_frame); // best_frame becomes last_frame
+    if (++s->curframe >= s->keyint) // restart the counter every keyint frames
+        s->curframe = 0;
+    return 0;
+}
+
+/* Close callback: releases the frames and buffers referenced by the context. */
+static av_cold int cinepak_encode_end(AVCodecContext *avctx)
+{
+    CinepakEncContext *s = avctx->priv_data;
+    const int is_rgb     = avctx->pix_fmt == AV_PIX_FMT_RGB24;
+    const int nb_bufs    = is_rgb ? 4 : 3; // how many pict_bufs entries get freed
+    int i;
+
+    av_frame_free(&s->last_frame);
+    av_frame_free(&s->best_frame);
+    av_frame_free(&s->scratch_frame);
+    if (is_rgb) // input_frame is only released for RGB24 input
+        av_frame_free(&s->input_frame);
+
+    av_freep(&s->codebook_input);
+    av_freep(&s->codebook_closest);
+    av_freep(&s->strip_buf);
+    av_freep(&s->frame_buf);
+    av_freep(&s->mb);
+    for (i = 0; i < nb_bufs; i++)
+        av_freep(&s->pict_bufs[i]);
+#ifdef CINEPAKENC_DEBUG
+    av_freep(&s->best_mb);
+    av_log(avctx, AV_LOG_INFO, "strip coding stats: %i V1 mode, %i V4 mode, %i MC mode (%i V1 encs, %i V4 encs, %i skips)\n",
+        s->num_v1_mode, s->num_v4_mode, s->num_mc_mode, s->num_v1_encs, s->num_v4_encs, s->num_skips);
+#endif
+
+    return 0;
+}
+
+AVCodec ff_cinepak_encoder = { // codec table entry for the Cinepak encoder
+    .name           = "cinepak",
+    .long_name      = NULL_IF_CONFIG_SMALL("Cinepak / CVID"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_CINEPAK,
+    .priv_data_size = sizeof(CinepakEncContext),
+    .priv_class     = &cinepak_class,
+    .init           = cinepak_encode_init,
+    .encode2        = cinepak_encode_frame,
+    .close          = cinepak_encode_end,
+    .pix_fmts       = (const enum AVPixelFormat[]){AV_PIX_FMT_RGB24, AV_PIX_FMT_GRAY8, AV_PIX_FMT_NONE},
+};
diff --git a/libavcodec/cljrdec.c b/libavcodec/cljrdec.c
index 33d8023..4b187f8 100644
--- a/libavcodec/cljrdec.c
+++ b/libavcodec/cljrdec.c
@@ -2,20 +2,20 @@
  * Cirrus Logic AccuPak (CLJR) decoder
  * Copyright (c) 2003 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,16 +43,14 @@ static int decode_frame(AVCodecContext *avctx,
         return AVERROR_INVALIDDATA;
     }
 
-    if (buf_size < avctx->height * avctx->width) {
+    if (buf_size / avctx->height < avctx->width) {
         av_log(avctx, AV_LOG_ERROR,
                "Resolution larger than buffer size. Invalid header?\n");
         return AVERROR_INVALIDDATA;
     }
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
     p->pict_type = AV_PICTURE_TYPE_I;
     p->key_frame = 1;
 
@@ -63,10 +61,10 @@ static int decode_frame(AVCodecContext *avctx,
         uint8_t *cb   = &p->data[1][y * p->linesize[1]];
         uint8_t *cr   = &p->data[2][y * p->linesize[2]];
         for (x = 0; x < avctx->width; x += 4) {
-            luma[3] = get_bits(&gb, 5) << 3;
-            luma[2] = get_bits(&gb, 5) << 3;
-            luma[1] = get_bits(&gb, 5) << 3;
-            luma[0] = get_bits(&gb, 5) << 3;
+            luma[3] = (get_bits(&gb, 5)*33) >> 2;
+            luma[2] = (get_bits(&gb, 5)*33) >> 2;
+            luma[1] = (get_bits(&gb, 5)*33) >> 2;
+            luma[0] = (get_bits(&gb, 5)*33) >> 2;
             luma += 4;
             *(cb++) = get_bits(&gb, 6) << 2;
             *(cr++) = get_bits(&gb, 6) << 2;
@@ -93,3 +91,4 @@ AVCodec ff_cljr_decoder = {
     .decode         = decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
+
diff --git a/libavcodec/cljrenc.c b/libavcodec/cljrenc.c
index 0687e30..a371825 100644
--- a/libavcodec/cljrenc.c
+++ b/libavcodec/cljrenc.c
@@ -2,20 +2,20 @@
  * Cirrus Logic AccuPak (CLJR) encoder
  * Copyright (c) 2003 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,28 +25,39 @@
  */
 
 #include "libavutil/common.h"
+#include "libavutil/opt.h"
 
 #include "avcodec.h"
 #include "internal.h"
 #include "put_bits.h"
 
+typedef struct CLJRContext { // private encoder state (see .priv_data_size)
+    AVClass        *avclass;     // presumably set by libavcodec from .priv_class -- confirm
+    int             dither_type; // "dither_type" option: 0 fixed constant, 1 LCG sequence, 2 ordered 2x2 table
+} CLJRContext;
+
 static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                         const AVFrame *p, int *got_packet)
 {
+    CLJRContext *a = avctx->priv_data;
     PutBitContext pb;
     int x, y, ret;
+    uint32_t dither= avctx->frame_number;
+    static const uint32_t ordered_dither[2][2] =
+    {
+        { 0x10400000, 0x104F0000 },
+        { 0xCB2A0000, 0xCB250000 },
+    };
 
-    if ((ret = ff_alloc_packet(pkt, 32*avctx->height*avctx->width/4)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
-        return ret;
+    if (avctx->width%4 && avctx->strict_std_compliance > FF_COMPLIANCE_UNOFFICIAL) {
+         av_log(avctx, AV_LOG_ERROR,
+                "Widths which are not a multiple of 4 might fail with some decoders, "
+                "use vstrict=-1 / -strict -1 to use %d anyway.\n", avctx->width);
+         return AVERROR_EXPERIMENTAL;
     }
 
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-    avctx->coded_frame->key_frame = 1;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
+    if ((ret = ff_alloc_packet2(avctx, pkt, 32*avctx->height*avctx->width/4, 0)) < 0)
+        return ret;
 
     init_put_bits(&pb, pkt->data, pkt->size);
 
@@ -54,14 +65,25 @@ FF_ENABLE_DEPRECATION_WARNINGS
         uint8_t *luma = &p->data[0][y * p->linesize[0]];
         uint8_t *cb   = &p->data[1][y * p->linesize[1]];
         uint8_t *cr   = &p->data[2][y * p->linesize[2]];
+        uint8_t luma_tmp[4];
         for (x = 0; x < avctx->width; x += 4) {
-            put_bits(&pb, 5, luma[3] >> 3);
-            put_bits(&pb, 5, luma[2] >> 3);
-            put_bits(&pb, 5, luma[1] >> 3);
-            put_bits(&pb, 5, luma[0] >> 3);
+            switch (a->dither_type) {
+            case 0: dither = 0x492A0000;                       break;
+            case 1: dither = dither * 1664525 + 1013904223;    break;
+            case 2: dither = ordered_dither[ y&1 ][ (x>>2)&1 ];break;
+            }
+            if (x+3 >= avctx->width) {
+                memset(luma_tmp, 0, sizeof(luma_tmp));
+                memcpy(luma_tmp, luma, avctx->width - x);
+                luma = luma_tmp;
+            }
+            put_bits(&pb, 5, (249*(luma[3] +  (dither>>29)   )) >> 11);
+            put_bits(&pb, 5, (249*(luma[2] + ((dither>>26)&7))) >> 11);
+            put_bits(&pb, 5, (249*(luma[1] + ((dither>>23)&7))) >> 11);
+            put_bits(&pb, 5, (249*(luma[0] + ((dither>>20)&7))) >> 11);
             luma += 4;
-            put_bits(&pb, 6, *(cb++) >> 2);
-            put_bits(&pb, 6, *(cr++) >> 2);
+            put_bits(&pb, 6, (253*(*(cb++) + ((dither>>18)&3))) >> 10);
+            put_bits(&pb, 6, (253*(*(cr++) + ((dither>>16)&3))) >> 10);
         }
     }
 
@@ -73,12 +95,28 @@ FF_ENABLE_DEPRECATION_WARNINGS
     return 0;
 }
 
+#define OFFSET(x) offsetof(CLJRContext, x) // byte offset of field x in the private context
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM // flags shared by the options below
+static const AVOption options[] = { // private options of the CLJR encoder
+    { "dither_type",   "Dither type",   OFFSET(dither_type),        AV_OPT_TYPE_INT, { .i64=1 }, 0, 2, VE}, // default 1, range 0..2
+    { NULL }, // table terminator
+};
+
+static const AVClass cljr_class = { // referenced by ff_cljr_encoder.priv_class
+    .class_name = "cljr encoder",
+    .item_name  = av_default_item_name,
+    .option     = options, // the private option table defined in this file
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_cljr_encoder = {
     .name           = "cljr",
     .long_name      = NULL_IF_CONFIG_SMALL("Cirrus Logic AccuPak"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_CLJR,
+    .priv_data_size = sizeof(CLJRContext),
     .encode2        = encode_frame,
     .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_YUV411P,
                                                    AV_PIX_FMT_NONE },
+    .priv_class     = &cljr_class,
 };
diff --git a/libavcodec/cllc.c b/libavcodec/cllc.c
index cdbed74..1c6902a 100644
--- a/libavcodec/cllc.c
+++ b/libavcodec/cllc.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2012-2013 Derek Buitenhuis
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -398,7 +398,8 @@ static int cllc_decode_frame(AVCodecContext *avctx, void *data,
     ctx->bdsp.bswap16_buf((uint16_t *) ctx->swapped_buf, (uint16_t *) src,
                           data_size / 2);
 
-    init_get_bits(&gb, ctx->swapped_buf, data_size * 8);
+    if ((ret = init_get_bits8(&gb, ctx->swapped_buf, data_size)) < 0)
+        return ret;
 
     /*
      * Read in coding type. The types are as follows:
@@ -416,11 +417,8 @@ static int cllc_decode_frame(AVCodecContext *avctx, void *data,
         avctx->pix_fmt             = AV_PIX_FMT_YUV422P;
         avctx->bits_per_raw_sample = 8;
 
-        ret = ff_get_buffer(avctx, pic, 0);
-        if (ret < 0) {
-            av_log(avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
+        if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
             return ret;
-        }
 
         ret = decode_yuv_frame(ctx, &gb, pic);
         if (ret < 0)
@@ -432,11 +430,8 @@ static int cllc_decode_frame(AVCodecContext *avctx, void *data,
         avctx->pix_fmt             = AV_PIX_FMT_RGB24;
         avctx->bits_per_raw_sample = 8;
 
-        ret = ff_get_buffer(avctx, pic, 0);
-        if (ret < 0) {
-            av_log(avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
+        if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
             return ret;
-        }
 
         ret = decode_rgb24_frame(ctx, &gb, pic);
         if (ret < 0)
@@ -447,11 +442,8 @@ static int cllc_decode_frame(AVCodecContext *avctx, void *data,
         avctx->pix_fmt             = AV_PIX_FMT_ARGB;
         avctx->bits_per_raw_sample = 8;
 
-        ret = ff_get_buffer(avctx, pic, 0);
-        if (ret < 0) {
-            av_log(avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
+        if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
             return ret;
-        }
 
         ret = decode_argb_frame(ctx, &gb, pic);
         if (ret < 0)
diff --git a/libavcodec/cngdec.c b/libavcodec/cngdec.c
index 482ef94..34f8814 100644
--- a/libavcodec/cngdec.c
+++ b/libavcodec/cngdec.c
@@ -2,26 +2,27 @@
  * RFC 3389 comfort noise generator
  * Copyright (c) 2012 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <math.h>
 
 #include "libavutil/common.h"
+#include "libavutil/ffmath.h"
 #include "avcodec.h"
 #include "celp_filters.h"
 #include "internal.h"
@@ -41,11 +42,11 @@ typedef struct CNGContext {
 static av_cold int cng_decode_close(AVCodecContext *avctx)
 {
     CNGContext *p = avctx->priv_data;
-    av_free(p->refl_coef);
-    av_free(p->target_refl_coef);
-    av_free(p->lpc_coef);
-    av_free(p->filter_out);
-    av_free(p->excitation);
+    av_freep(&p->refl_coef);
+    av_freep(&p->target_refl_coef);
+    av_freep(&p->lpc_coef);
+    av_freep(&p->filter_out);
+    av_freep(&p->excitation);
     return 0;
 }
 
@@ -59,12 +60,12 @@ static av_cold int cng_decode_init(AVCodecContext *avctx)
 
     p->order            = 12;
     avctx->frame_size   = 640;
-    p->refl_coef        = av_mallocz(p->order * sizeof(*p->refl_coef));
-    p->target_refl_coef = av_mallocz(p->order * sizeof(*p->target_refl_coef));
-    p->lpc_coef         = av_mallocz(p->order * sizeof(*p->lpc_coef));
-    p->filter_out       = av_mallocz((avctx->frame_size + p->order) *
+    p->refl_coef        = av_mallocz_array(p->order, sizeof(*p->refl_coef));
+    p->target_refl_coef = av_mallocz_array(p->order, sizeof(*p->target_refl_coef));
+    p->lpc_coef         = av_mallocz_array(p->order, sizeof(*p->lpc_coef));
+    p->filter_out       = av_mallocz_array(avctx->frame_size + p->order,
                                      sizeof(*p->filter_out));
-    p->excitation       = av_mallocz(avctx->frame_size * sizeof(*p->excitation));
+    p->excitation       = av_mallocz_array(avctx->frame_size, sizeof(*p->excitation));
     if (!p->refl_coef || !p->target_refl_coef || !p->lpc_coef ||
         !p->filter_out || !p->excitation) {
         cng_decode_close(avctx);
@@ -112,7 +113,7 @@ static int cng_decode_frame(AVCodecContext *avctx, void *data,
 
     if (avpkt->size) {
         int dbov = -avpkt->data[0];
-        p->target_energy = 1081109975 * pow(10, dbov / 10.0) * 0.75;
+        p->target_energy = 1081109975 * ff_exp10(dbov / 10.0) * 0.75;
         memset(p->target_refl_coef, 0, p->order * sizeof(*p->target_refl_coef));
         for (i = 0; i < FFMIN(avpkt->size - 1, p->order); i++) {
             p->target_refl_coef[i] = (avpkt->data[1 + i] - 127) / 128.0;
@@ -142,10 +143,8 @@ static int cng_decode_frame(AVCodecContext *avctx, void *data,
                                  p->excitation, avctx->frame_size, p->order);
 
     frame->nb_samples = avctx->frame_size;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     buf_out = (int16_t *)frame->data[0];
     for (i = 0; i < avctx->frame_size; i++)
         buf_out[i] = p->filter_out[i + p->order];
diff --git a/libavcodec/cngenc.c b/libavcodec/cngenc.c
index 98f3c4e..302c703 100644
--- a/libavcodec/cngenc.c
+++ b/libavcodec/cngenc.c
@@ -2,20 +2,20 @@
  * RFC 3389 comfort noise generator
  * Copyright (c) 2012 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -56,8 +56,8 @@ static av_cold int cng_encode_init(AVCodecContext *avctx)
     p->order = 10;
     if ((ret = ff_lpc_init(&p->lpc, avctx->frame_size, p->order, FF_LPC_TYPE_LEVINSON)) < 0)
         return ret;
-    p->samples32 = av_malloc(avctx->frame_size * sizeof(*p->samples32));
-    p->ref_coef = av_malloc(p->order * sizeof(*p->ref_coef));
+    p->samples32 = av_malloc_array(avctx->frame_size, sizeof(*p->samples32));
+    p->ref_coef = av_malloc_array(p->order, sizeof(*p->ref_coef));
     if (!p->samples32 || !p->ref_coef) {
         cng_encode_close(avctx);
         return AVERROR(ENOMEM);
@@ -75,7 +75,7 @@ static int cng_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     int qdbov;
     int16_t *samples = (int16_t*) frame->data[0];
 
-    if ((ret = ff_alloc_packet(avpkt, 1 + p->order))) {
+    if ((ret = ff_alloc_packet2(avctx, avpkt, 1 + p->order, 1 + p->order))) {
         av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
         return ret;
     }
@@ -87,7 +87,7 @@ static int cng_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     energy /= frame->nb_samples;
     if (energy > 0) {
         double dbov = 10 * log10(energy / 1081109975);
-        qdbov = av_clip(-floor(dbov), 0, 127);
+        qdbov = av_clip_uintp2(-floor(dbov), 7);
     } else {
         qdbov = 127;
     }
@@ -97,7 +97,7 @@ static int cng_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         avpkt->data[1 + i] = p->ref_coef[i] * 127 + 127;
 
     *got_packet_ptr = 1;
-    avpkt->size = 1 + p->order;
+    av_assert1(avpkt->size == 1 + p->order);
 
     return 0;
 }
diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
index 7fd2cc6..575a6e5 100644
--- a/libavcodec/codec_desc.c
+++ b/libavcodec/codec_desc.c
@@ -1,18 +1,21 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * This table was generated from the long and short names of AVCodecs
+ * please see the respective codec sources for authorship
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +27,8 @@
 #include "profiles.h"
 #include "version.h"
 
+#define MT(...) (const char *const[]){ __VA_ARGS__, NULL } /* NULL-terminated string list for .mime_types */
+
 static const AVCodecDescriptor codec_descriptors[] = {
     /* video codecs */
     {
@@ -84,6 +89,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .name      = "mjpeg",
         .long_name = NULL_IF_CONFIG_SMALL("Motion JPEG"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
+        .mime_types= MT("image/jpeg"),
     },
     {
         .id        = AV_CODEC_ID_MJPEGB,
@@ -403,6 +409,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_LOSSLESS,
     },
     {
+        .id        = AV_CODEC_ID_SNOW,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "snow",
+        .long_name = NULL_IF_CONFIG_SMALL("Snow"),
+        .props     = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_LOSSLESS,
+    },
+    {
         .id        = AV_CODEC_ID_TSCC,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "tscc",
@@ -522,6 +535,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .name      = "bmp",
         .long_name = NULL_IF_CONFIG_SMALL("BMP (Windows and OS/2 bitmap)"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/x-ms-bmp"),
     },
     {
         .id        = AV_CODEC_ID_CSCD,
@@ -593,6 +607,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .long_name = NULL_IF_CONFIG_SMALL("JPEG 2000"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY |
                      AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/jp2"),
         .profiles  = NULL_IF_CONFIG_SMALL(ff_jpeg2000_profiles),
     },
     {
@@ -729,6 +744,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_LOSSY,
     },
     {
+        .id        = AV_CODEC_ID_DAALA,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "daala",
+        .long_name = NULL_IF_CONFIG_SMALL("Daala"),
+        .props     = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_LOSSLESS,
+    },
+    {
         .id        = AV_CODEC_ID_DIRAC,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "dirac",
@@ -865,14 +887,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .id        = AV_CODEC_ID_IFF_ILBM,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "iff_ilbm",
-        .long_name = NULL_IF_CONFIG_SMALL("IFF ILBM"),
-        .props     = AV_CODEC_PROP_LOSSY,
-    },
-    {
-        .id        = AV_CODEC_ID_IFF_BYTERUN1,
-        .type      = AVMEDIA_TYPE_VIDEO,
-        .name      = "iff_byterun1",
-        .long_name = NULL_IF_CONFIG_SMALL("IFF ByteRun1"),
+        .long_name = NULL_IF_CONFIG_SMALL("IFF ACBM/ANIM/DEEP/ILBM/PBM/RGB8/RGBN"),
         .props     = AV_CODEC_PROP_LOSSY,
     },
     {
@@ -902,6 +917,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .name      = "vp9",
         .long_name = NULL_IF_CONFIG_SMALL("Google VP9"),
         .props     = AV_CODEC_PROP_LOSSY,
+        .profiles  = NULL_IF_CONFIG_SMALL(ff_vp9_profiles),
     },
     {
         .id        = AV_CODEC_ID_PICTOR,
@@ -932,6 +948,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
+        .id        = AV_CODEC_ID_M101,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "m101",
+        .long_name = NULL_IF_CONFIG_SMALL("Matrox Uncompressed SD"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+    },
+    {
         .id        = AV_CODEC_ID_MVC1,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "mvc1",
@@ -1020,7 +1043,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "cdxl",
         .long_name = NULL_IF_CONFIG_SMALL("Commodore CDXL video"),
-        .props     = AV_CODEC_PROP_LOSSY,
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
     },
     {
         .id        = AV_CODEC_ID_ZEROCODEC,
@@ -1079,6 +1102,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
     },
     {
+        .id        = AV_CODEC_ID_Y41P,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "y41p",
+        .long_name = NULL_IF_CONFIG_SMALL("Uncompressed YUV 4:1:1 12-bit"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY,
+    },
+    {
         .id        = AV_CODEC_ID_ESCAPE130,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "escape130",
@@ -1086,6 +1116,88 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_LOSSY,
     },
     {
+        .id        = AV_CODEC_ID_AVRP,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "avrp",
+        .long_name = NULL_IF_CONFIG_SMALL("Avid 1:1 10-bit RGB Packer"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY,
+    },
+    {
+        .id        = AV_CODEC_ID_012V,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "012v",
+        .long_name = NULL_IF_CONFIG_SMALL("Uncompressed 4:2:2 10-bit"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY,
+    },
+    {
+        .id        = AV_CODEC_ID_AVUI,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "avui",
+        .long_name = NULL_IF_CONFIG_SMALL("Avid Meridien Uncompressed"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY,
+    },
+    {
+        .id        = AV_CODEC_ID_AYUV,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "ayuv",
+        .long_name = NULL_IF_CONFIG_SMALL("Uncompressed packed MS 4:4:4:4"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY,
+    },
+    {
+        .id        = AV_CODEC_ID_TARGA_Y216,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "targa_y216",
+        .long_name = NULL_IF_CONFIG_SMALL("Pinnacle TARGA CineWave YUV16"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY,
+    },
+    {
+        .id        = AV_CODEC_ID_V308,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "v308",
+        .long_name = NULL_IF_CONFIG_SMALL("Uncompressed packed 4:4:4"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY,
+    },
+    {
+        .id        = AV_CODEC_ID_V408,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "v408",
+        .long_name = NULL_IF_CONFIG_SMALL("Uncompressed packed QT 4:4:4:4"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY,
+    },
+    {
+        .id        = AV_CODEC_ID_YUV4,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "yuv4",
+        .long_name = NULL_IF_CONFIG_SMALL("Uncompressed packed 4:2:0"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY,
+    },
+    {
+        .id        = AV_CODEC_ID_AVRN,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "avrn",
+        .long_name = NULL_IF_CONFIG_SMALL("Avid AVI Codec"),
+    },
+    {
+        .id        = AV_CODEC_ID_CPIA,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "cpia",
+        .long_name = NULL_IF_CONFIG_SMALL("CPiA video format"),
+    },
+    {
+        .id        = AV_CODEC_ID_XFACE,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "xface",
+        .long_name = NULL_IF_CONFIG_SMALL("X-face image"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_SMVJPEG,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "smvjpeg",
+        .long_name = NULL_IF_CONFIG_SMALL("Sigmatel Motion Video"),
+    },
+
+    {
         .id        = AV_CODEC_ID_G2M,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "g2m",
@@ -1103,7 +1215,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .id        = AV_CODEC_ID_HEVC,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "hevc",
-        .long_name = NULL_IF_CONFIG_SMALL("HEVC (High Efficiency Video Coding)"),
+        .long_name = NULL_IF_CONFIG_SMALL("H.265 / HEVC (High Efficiency Video Coding)"),
         .props     = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_REORDER,
         .profiles  = NULL_IF_CONFIG_SMALL(ff_hevc_profiles),
     },
@@ -1132,7 +1244,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .id        = AV_CODEC_ID_SANM,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "sanm",
-        .long_name = NULL_IF_CONFIG_SMALL("LucasArts SANM video"),
+        .long_name = NULL_IF_CONFIG_SMALL("LucasArts SANM/SMUSH video"),
         .props     = AV_CODEC_PROP_LOSSY,
     },
     {
@@ -1226,7 +1338,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .id        = AV_CODEC_ID_DPX,
         .type      = AVMEDIA_TYPE_VIDEO,
         .name      = "dpx",
-        .long_name = NULL_IF_CONFIG_SMALL("DPX image"),
+        .long_name = NULL_IF_CONFIG_SMALL("DPX (Digital Picture Exchange) image"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
     {
@@ -1243,6 +1355,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .name      = "gif",
         .long_name = NULL_IF_CONFIG_SMALL("GIF (Graphics Interchange Format)"),
         .props     = AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/gif"),
     },
     {
         .id        = AV_CODEC_ID_JPEGLS,
@@ -1265,6 +1378,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .name      = "pam",
         .long_name = NULL_IF_CONFIG_SMALL("PAM (Portable AnyMap) image"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/x-portable-pixmap"),
     },
     {
         .id        = AV_CODEC_ID_PBM,
@@ -1279,6 +1393,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .name      = "pcx",
         .long_name = NULL_IF_CONFIG_SMALL("PC Paintbrush PCX image"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/x-pcx"),
     },
     {
         .id        = AV_CODEC_ID_PGM,
@@ -1300,6 +1415,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .name      = "png",
         .long_name = NULL_IF_CONFIG_SMALL("PNG (Portable Network Graphics) image"),
         .props     = AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/png"),
     },
     {
         .id        = AV_CODEC_ID_PPM,
@@ -1342,6 +1458,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .name      = "targa",
         .long_name = NULL_IF_CONFIG_SMALL("Truevision Targa image"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/x-targa", "image/x-tga"),
     },
     {
         .id        = AV_CODEC_ID_TDSC,
@@ -1356,6 +1473,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .name      = "tiff",
         .long_name = NULL_IF_CONFIG_SMALL("TIFF image"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/tiff"),
     },
     {
         .id        = AV_CODEC_ID_TXD,
@@ -1378,6 +1496,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .long_name = NULL_IF_CONFIG_SMALL("WebP"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY |
                      AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/webp"),
     },
     {
         .id        = AV_CODEC_ID_WMV3IMAGE,
@@ -1399,6 +1518,43 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .name      = "xwd",
         .long_name = NULL_IF_CONFIG_SMALL("XWD (X Window Dump) image"),
         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/x-xwindowdump"),
+    },
+    {
+        .id        = AV_CODEC_ID_APNG,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "apng",
+        .long_name = NULL_IF_CONFIG_SMALL("APNG (Animated Portable Network Graphics) image"),
+        .props     = AV_CODEC_PROP_LOSSLESS,
+        .mime_types= MT("image/png"),
+    },
+    {
+        .id        = AV_CODEC_ID_CFHD,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "cfhd",
+        .long_name = NULL_IF_CONFIG_SMALL("Cineform HD"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_TRUEMOTION2RT,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "truemotion2rt",
+        .long_name = NULL_IF_CONFIG_SMALL("Duck TrueMotion 2.0 Real Time"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_MAGICYUV,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "magicyuv",
+        .long_name = NULL_IF_CONFIG_SMALL("MagicYUV Lossless Video"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+    },
+    {
+        .id        = AV_CODEC_ID_SHEERVIDEO,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "sheervideo",
+        .long_name = NULL_IF_CONFIG_SMALL("BitJazz SheerVideo"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
     },
 
     /* various PCM "codecs" */
@@ -1448,13 +1604,15 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .id        = AV_CODEC_ID_PCM_MULAW,
         .type      = AVMEDIA_TYPE_AUDIO,
         .name      = "pcm_mulaw",
-        .long_name = NULL_IF_CONFIG_SMALL("PCM mu-law"),
+        .long_name = NULL_IF_CONFIG_SMALL("PCM mu-law / G.711 mu-law"),
+        .props     = AV_CODEC_PROP_LOSSY,
     },
     {
         .id        = AV_CODEC_ID_PCM_ALAW,
         .type      = AVMEDIA_TYPE_AUDIO,
         .name      = "pcm_alaw",
-        .long_name = NULL_IF_CONFIG_SMALL("PCM A-law"),
+        .long_name = NULL_IF_CONFIG_SMALL("PCM A-law / G.711 A-law"),
+        .props     = AV_CODEC_PROP_LOSSY,
     },
     {
         .id        = AV_CODEC_ID_PCM_S32LE,
@@ -1537,7 +1695,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .id        = AV_CODEC_ID_PCM_S16LE_PLANAR,
         .type      = AVMEDIA_TYPE_AUDIO,
         .name      = "pcm_s16le_planar",
-        .long_name = NULL_IF_CONFIG_SMALL("PCM 16-bit little-endian planar"),
+        .long_name = NULL_IF_CONFIG_SMALL("PCM signed 16-bit little-endian planar"),
         .props     = AV_CODEC_PROP_LOSSLESS,
     },
     {
@@ -1608,7 +1766,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .type      = AVMEDIA_TYPE_AUDIO,
         .name      = "s302m",
         .long_name = NULL_IF_CONFIG_SMALL("SMPTE 302M"),
-        .props     = AV_CODEC_PROP_LOSSY,
+        .props     = AV_CODEC_PROP_LOSSLESS,
     },
     {
         .id        = AV_CODEC_ID_PCM_S8_PLANAR,
@@ -1749,7 +1907,14 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .id        = AV_CODEC_ID_ADPCM_THP,
         .type      = AVMEDIA_TYPE_AUDIO,
         .name      = "adpcm_thp",
-        .long_name = NULL_IF_CONFIG_SMALL("ADPCM Nintendo Gamecube THP"),
+        .long_name = NULL_IF_CONFIG_SMALL("ADPCM Nintendo THP"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_ADPCM_THP_LE,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "adpcm_thp_le",
+        .long_name = NULL_IF_CONFIG_SMALL("ADPCM Nintendo THP (Little-Endian)"),
         .props     = AV_CODEC_PROP_LOSSY,
     },
     {
@@ -1830,12 +1995,68 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_LOSSY,
     },
     {
+        .id        = AV_CODEC_ID_ADPCM_AFC,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "adpcm_afc",
+        .long_name = NULL_IF_CONFIG_SMALL("ADPCM Nintendo Gamecube AFC"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_ADPCM_IMA_OKI,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "adpcm_ima_oki",
+        .long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA Dialogic OKI"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_ADPCM_DTK,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "adpcm_dtk",
+        .long_name = NULL_IF_CONFIG_SMALL("ADPCM Nintendo Gamecube DTK"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_ADPCM_IMA_RAD,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "adpcm_ima_rad",
+        .long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA Radical"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_ADPCM_G726LE,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "adpcm_g726le",
+        .long_name = NULL_IF_CONFIG_SMALL("G.726 ADPCM little-endian"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
         .id        = AV_CODEC_ID_ADPCM_VIMA,
         .type      = AVMEDIA_TYPE_AUDIO,
         .name      = "adpcm_vima",
         .long_name = NULL_IF_CONFIG_SMALL("LucasArts VIMA audio"),
         .props     = AV_CODEC_PROP_LOSSY,
     },
+    {
+        .id        = AV_CODEC_ID_ADPCM_PSX,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "adpcm_psx",
+        .long_name = NULL_IF_CONFIG_SMALL("ADPCM Playstation"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_ADPCM_AICA,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "adpcm_aica",
+        .long_name = NULL_IF_CONFIG_SMALL("ADPCM Yamaha AICA"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_ADPCM_IMA_DAT4,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "adpcm_ima_dat4",
+        .long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA Eurocom DAT4"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
 
     /* AMR */
     {
@@ -1898,6 +2119,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .long_name = NULL_IF_CONFIG_SMALL("DPCM Sol"),
         .props     = AV_CODEC_PROP_LOSSY,
     },
+    {
+        .id        = AV_CODEC_ID_SDX2_DPCM,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "sdx2_dpcm",
+        .long_name = NULL_IF_CONFIG_SMALL("DPCM Squareroot-Delta-Exact"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
 
     /* audio codecs */
     {
@@ -2260,6 +2488,7 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .name      = "aac_latm",
         .long_name = NULL_IF_CONFIG_SMALL("AAC LATM (Advanced Audio Coding LATM syntax)"),
         .props     = AV_CODEC_PROP_LOSSY,
+        .profiles  = NULL_IF_CONFIG_SMALL(ff_aac_profiles),
     },
     {
         .id        = AV_CODEC_ID_QDMC,
@@ -2339,6 +2568,24 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .props     = AV_CODEC_PROP_LOSSY,
     },
     {
+        .id        = AV_CODEC_ID_FFWAVESYNTH,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "wavesynth",
+        .long_name = NULL_IF_CONFIG_SMALL("Wave synthesis pseudo-codec"),
+    },
+    {
+        .id        = AV_CODEC_ID_SONIC,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "sonic",
+        .long_name = NULL_IF_CONFIG_SMALL("Sonic"),
+    },
+    {
+        .id        = AV_CODEC_ID_SONIC_LS,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "sonicls",
+        .long_name = NULL_IF_CONFIG_SMALL("Sonic lossless"),
+    },
+    {
         .id        = AV_CODEC_ID_OPUS,
         .type      = AVMEDIA_TYPE_AUDIO,
         .name      = "opus",
@@ -2380,6 +2627,90 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .long_name = NULL_IF_CONFIG_SMALL("On2 Audio for Video Codec"),
         .props     = AV_CODEC_PROP_LOSSY,
     },
+    {
+        .id        = AV_CODEC_ID_EVRC,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "evrc",
+        .long_name = NULL_IF_CONFIG_SMALL("EVRC (Enhanced Variable Rate Codec)"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_SMV,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "smv",
+        .long_name = NULL_IF_CONFIG_SMALL("SMV (Selectable Mode Vocoder)"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_4GV,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "4gv",
+        .long_name = NULL_IF_CONFIG_SMALL("4GV (Fourth Generation Vocoder)"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_DSD_LSBF,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "dsd_lsbf",
+        .long_name = NULL_IF_CONFIG_SMALL("DSD (Direct Stream Digital), least significant bit first"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_DSD_MSBF,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "dsd_msbf",
+        .long_name = NULL_IF_CONFIG_SMALL("DSD (Direct Stream Digital), most significant bit first"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_DSD_LSBF_PLANAR,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "dsd_lsbf_planar",
+        .long_name = NULL_IF_CONFIG_SMALL("DSD (Direct Stream Digital), least significant bit first, planar"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_DSD_MSBF_PLANAR,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "dsd_msbf_planar",
+        .long_name = NULL_IF_CONFIG_SMALL("DSD (Direct Stream Digital), most significant bit first, planar"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_INTERPLAY_ACM,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "interplayacm",
+        .long_name = NULL_IF_CONFIG_SMALL("Interplay ACM"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_XMA1,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "xma1",
+        .long_name = NULL_IF_CONFIG_SMALL("Xbox Media Audio 1"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_XMA2,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "xma2",
+        .long_name = NULL_IF_CONFIG_SMALL("Xbox Media Audio 2"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
+    {
+        .id        = AV_CODEC_ID_DST,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "dst",
+        .long_name = NULL_IF_CONFIG_SMALL("DST (Direct Stream Transfer)"),
+        .props     = AV_CODEC_PROP_LOSSLESS,
+    },
+    {
+        .id        = AV_CODEC_ID_ADPCM_MTAF,
+        .type      = AVMEDIA_TYPE_AUDIO,
+        .name      = "adpcm_mtaf",
+        .long_name = NULL_IF_CONFIG_SMALL("ADPCM MTAF"),
+        .props     = AV_CODEC_PROP_LOSSY,
+    },
 
     /* subtitle codecs */
     {
@@ -2387,42 +2718,56 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .type      = AVMEDIA_TYPE_SUBTITLE,
         .name      = "dvd_subtitle",
         .long_name = NULL_IF_CONFIG_SMALL("DVD subtitles"),
+        .props     = AV_CODEC_PROP_BITMAP_SUB,
     },
     {
         .id        = AV_CODEC_ID_DVB_SUBTITLE,
         .type      = AVMEDIA_TYPE_SUBTITLE,
         .name      = "dvb_subtitle",
         .long_name = NULL_IF_CONFIG_SMALL("DVB subtitles"),
+        .props     = AV_CODEC_PROP_BITMAP_SUB,
     },
     {
         .id        = AV_CODEC_ID_TEXT,
         .type      = AVMEDIA_TYPE_SUBTITLE,
         .name      = "text",
         .long_name = NULL_IF_CONFIG_SMALL("raw UTF-8 text"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
     },
     {
         .id        = AV_CODEC_ID_XSUB,
         .type      = AVMEDIA_TYPE_SUBTITLE,
         .name      = "xsub",
         .long_name = NULL_IF_CONFIG_SMALL("XSUB"),
+        .props     = AV_CODEC_PROP_BITMAP_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_ASS,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "ass",
+        .long_name = NULL_IF_CONFIG_SMALL("ASS (Advanced SSA) subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
     },
     {
         .id        = AV_CODEC_ID_SSA,
         .type      = AVMEDIA_TYPE_SUBTITLE,
         .name      = "ssa",
-        .long_name = NULL_IF_CONFIG_SMALL("SSA (SubStation Alpha) / ASS (Advanced SSA) subtitle"),
+        .long_name = NULL_IF_CONFIG_SMALL("SSA (SubStation Alpha) subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
     },
     {
         .id        = AV_CODEC_ID_MOV_TEXT,
         .type      = AVMEDIA_TYPE_SUBTITLE,
         .name      = "mov_text",
         .long_name = NULL_IF_CONFIG_SMALL("MOV text"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
     },
     {
         .id        = AV_CODEC_ID_HDMV_PGS_SUBTITLE,
         .type      = AVMEDIA_TYPE_SUBTITLE,
         .name      = "hdmv_pgs_subtitle",
         .long_name = NULL_IF_CONFIG_SMALL("HDMV Presentation Graphic Stream subtitles"),
+        .props     = AV_CODEC_PROP_BITMAP_SUB,
     },
     {
         .id        = AV_CODEC_ID_DVB_TELETEXT,
@@ -2434,8 +2779,171 @@ static const AVCodecDescriptor codec_descriptors[] = {
         .id        = AV_CODEC_ID_SRT,
         .type      = AVMEDIA_TYPE_SUBTITLE,
         .name      = "srt",
-        .long_name = NULL_IF_CONFIG_SMALL("SubRip Text"),
+        .long_name = NULL_IF_CONFIG_SMALL("SubRip subtitle with embedded timing"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_SUBRIP,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "subrip",
+        .long_name = NULL_IF_CONFIG_SMALL("SubRip subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_MICRODVD,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "microdvd",
+        .long_name = NULL_IF_CONFIG_SMALL("MicroDVD subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_MPL2,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "mpl2",
+        .long_name = NULL_IF_CONFIG_SMALL("MPL2 subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_EIA_608,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "eia_608",
+        .long_name = NULL_IF_CONFIG_SMALL("EIA-608 closed captions"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_JACOSUB,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "jacosub",
+        .long_name = NULL_IF_CONFIG_SMALL("JACOsub subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_PJS,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "pjs",
+        .long_name = NULL_IF_CONFIG_SMALL("PJS (Phoenix Japanimation Society) subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
     },
+    {
+        .id        = AV_CODEC_ID_SAMI,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "sami",
+        .long_name = NULL_IF_CONFIG_SMALL("SAMI subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_REALTEXT,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "realtext",
+        .long_name = NULL_IF_CONFIG_SMALL("RealText subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_STL,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "stl",
+        .long_name = NULL_IF_CONFIG_SMALL("Spruce subtitle format"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_SUBVIEWER1,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "subviewer1",
+        .long_name = NULL_IF_CONFIG_SMALL("SubViewer v1 subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_SUBVIEWER,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "subviewer",
+        .long_name = NULL_IF_CONFIG_SMALL("SubViewer subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_VPLAYER,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "vplayer",
+        .long_name = NULL_IF_CONFIG_SMALL("VPlayer subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_WEBVTT,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "webvtt",
+        .long_name = NULL_IF_CONFIG_SMALL("WebVTT subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+    {
+        .id        = AV_CODEC_ID_HDMV_TEXT_SUBTITLE,
+        .type      = AVMEDIA_TYPE_SUBTITLE,
+        .name      = "hdmv_text_subtitle",
+        .long_name = NULL_IF_CONFIG_SMALL("HDMV Text subtitle"),
+        .props     = AV_CODEC_PROP_TEXT_SUB,
+    },
+
+    /* other kinds of codecs and pseudo-codecs */
+    {
+        .id        = AV_CODEC_ID_TTF,
+        .type      = AVMEDIA_TYPE_DATA,
+        .name      = "ttf",
+        .long_name = NULL_IF_CONFIG_SMALL("TrueType font"),
+        .mime_types= MT("application/x-truetype-font", "application/x-font"),
+    },
+    {
+        .id        = AV_CODEC_ID_BINTEXT,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "bintext",
+        .long_name = NULL_IF_CONFIG_SMALL("Binary text"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY,
+    },
+    {
+        .id        = AV_CODEC_ID_XBIN,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "xbin",
+        .long_name = NULL_IF_CONFIG_SMALL("eXtended BINary text"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY,
+    },
+    {
+        .id        = AV_CODEC_ID_IDF,
+        .type      = AVMEDIA_TYPE_VIDEO,
+        .name      = "idf",
+        .long_name = NULL_IF_CONFIG_SMALL("iCEDraw text"),
+        .props     = AV_CODEC_PROP_INTRA_ONLY,
+    },
+    {
+        .id        = AV_CODEC_ID_OTF,
+        .type      = AVMEDIA_TYPE_DATA,
+        .name      = "otf",
+        .long_name = NULL_IF_CONFIG_SMALL("OpenType font"),
+        .mime_types= MT("application/vnd.ms-opentype"),
+    },
+    {
+        .id        = AV_CODEC_ID_SMPTE_KLV,
+        .type      = AVMEDIA_TYPE_DATA,
+        .name      = "klv",
+        .long_name = NULL_IF_CONFIG_SMALL("SMPTE 336M Key-Length-Value (KLV) metadata"),
+    },
+    {
+        .id        = AV_CODEC_ID_DVD_NAV,
+        .type      = AVMEDIA_TYPE_DATA,
+        .name      = "dvd_nav_packet",
+        .long_name = NULL_IF_CONFIG_SMALL("DVD Nav packet"),
+    },
+    {
+        .id        = AV_CODEC_ID_TIMED_ID3,
+        .type      = AVMEDIA_TYPE_DATA,
+        .name      = "timed_id3",
+        .long_name = NULL_IF_CONFIG_SMALL("timed ID3 metadata"),
+    },
+    {
+        .id        = AV_CODEC_ID_BIN_DATA,
+        .type      = AVMEDIA_TYPE_DATA,
+        .name      = "bin_data",
+        .long_name = NULL_IF_CONFIG_SMALL("binary data"),
+        .mime_types= MT("application/octet-stream"),
+    },
+
+    /* deprecated codec ids */
 };
 
 const AVCodecDescriptor *avcodec_descriptor_get(enum AVCodecID id)
diff --git a/libavcodec/cook.c b/libavcodec/cook.c
index f487db6..794a475 100644
--- a/libavcodec/cook.c
+++ b/libavcodec/cook.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Sascha Sommer
  * Copyright (c) 2005 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -52,6 +52,7 @@
 #include "fft.h"
 #include "internal.h"
 #include "sinewin.h"
+#include "unary.h"
 
 #include "cookdata.h"
 
@@ -165,10 +166,17 @@ static float rootpow2tab[127];
 /* table generator */
 static av_cold void init_pow2table(void)
 {
+    /* fast way of computing 2^i and 2^(0.5*i) for -63 <= i < 64 */
     int i;
+    static const float exp2_tab[2] = {1, M_SQRT2};
+    float exp2_val = powf(2, -63);
+    float root_val = powf(2, -32);
     for (i = -63; i < 64; i++) {
-        pow2tab[63 + i] = pow(2, i);
-        rootpow2tab[63 + i] = sqrt(pow(2, i));
+        if (!(i & 1))
+            root_val *= 2;
+        pow2tab[63 + i] = exp2_val;
+        rootpow2tab[63 + i] = root_val * exp2_tab[i & 1];
+        exp2_val *= 2;
     }
 }
 
@@ -219,7 +227,7 @@ static av_cold int init_cook_mlt(COOKContext *q)
     int j, ret;
     int mlt_size = q->samples_per_channel;
 
-    if ((q->mlt_window = av_malloc(mlt_size * sizeof(*q->mlt_window))) == 0)
+    if ((q->mlt_window = av_malloc_array(mlt_size, sizeof(*q->mlt_window))) == 0)
         return AVERROR(ENOMEM);
 
     /* Initialize the MLT window: simple sine window. */
@@ -229,7 +237,7 @@ static av_cold int init_cook_mlt(COOKContext *q)
 
     /* Initialize the MDCT. */
     if ((ret = ff_mdct_init(&q->mdct_ctx, av_log2(mlt_size) + 1, 1, 1.0 / 32768.0))) {
-        av_free(q->mlt_window);
+        av_freep(&q->mlt_window);
         return ret;
     }
     av_log(q->avctx, AV_LOG_DEBUG, "MDCT initialized, order = %d.\n",
@@ -303,8 +311,8 @@ static av_cold int cook_decode_close(AVCodecContext *avctx)
     av_log(avctx, AV_LOG_DEBUG, "Deallocating memory.\n");
 
     /* Free allocated memory buffers. */
-    av_free(q->mlt_window);
-    av_free(q->decoded_bytes_buffer);
+    av_freep(&q->mlt_window);
+    av_freep(&q->decoded_bytes_buffer);
 
     /* Free the transform. */
     ff_mdct_end(&q->mdct_ctx);
@@ -332,11 +340,7 @@ static void decode_gain_info(GetBitContext *gb, int *gaininfo)
 {
     int i, n;
 
-    while (get_bits1(gb)) {
-        /* NOTHING */
-    }
-
-    n = get_bits_count(gb) - 1;     // amount of elements*2 to update
+    n = get_unary(gb, 0, get_bits_left(gb));     // amount of elements*2 to update
 
     i = 0;
     while (n--) {
@@ -397,7 +401,7 @@ static int decode_envelope(COOKContext *q, COOKSubpacket *p,
  * @param category              pointer to the category array
  * @param category_index        pointer to the category_index array
  */
-static void categorize(COOKContext *q, COOKSubpacket *p, int *quant_index_table,
+static void categorize(COOKContext *q, COOKSubpacket *p, const int *quant_index_table,
                        int *category, int *category_index)
 {
     int exp_idx, bias, tmpbias1, tmpbias2, bits_left, num_bits, index, v, i, j;
@@ -421,7 +425,7 @@ static void categorize(COOKContext *q, COOKSubpacket *p, int *quant_index_table,
         num_bits = 0;
         index    = 0;
         for (j = p->total_subbands; j > 0; j--) {
-            exp_idx = av_clip((i - quant_index_table[index] + bias) / 2, 0, 7);
+            exp_idx = av_clip_uintp2((i - quant_index_table[index] + bias) / 2, 3);
             index++;
             num_bits += expbits_tab[exp_idx];
         }
@@ -432,7 +436,7 @@ static void categorize(COOKContext *q, COOKSubpacket *p, int *quant_index_table,
     /* Calculate total number of bits. */
     num_bits = 0;
     for (i = 0; i < p->total_subbands; i++) {
-        exp_idx = av_clip((bias - quant_index_table[i]) / 2, 0, 7);
+        exp_idx = av_clip_uintp2((bias - quant_index_table[i]) / 2, 3);
         num_bits += expbits_tab[exp_idx];
         exp_index1[i] = exp_idx;
         exp_index2[i] = exp_idx;
@@ -630,13 +634,17 @@ static int mono_decode(COOKContext *q, COOKSubpacket *p, float *mlt_buffer)
     int category_index[128] = { 0 };
     int category[128]       = { 0 };
     int quant_index_table[102];
-    int res;
+    int res, i;
 
     if ((res = decode_envelope(q, p, quant_index_table)) < 0)
         return res;
     q->num_vectors = get_bits(&q->gb, p->log2_numvector_size);
     categorize(q, p, quant_index_table, category, category_index);
     expand_category(q, category, category_index);
+    for (i=0; i<p->total_subbands; i++) {
+        if (category[i] > 7)
+            return AVERROR_INVALIDDATA;
+    }
     decode_vectors(q, p, category, quant_index_table, mlt_buffer);
 
     return 0;
@@ -736,7 +744,7 @@ static void imlt_gain(COOKContext *q, float *inbuffer,
  * @param q                 pointer to the COOKContext
  * @param decouple_tab      decoupling array
  */
-static void decouple_info(COOKContext *q, COOKSubpacket *p, int *decouple_tab)
+static int decouple_info(COOKContext *q, COOKSubpacket *p, int *decouple_tab)
 {
     int i;
     int vlc    = get_bits1(&q->gb);
@@ -745,7 +753,7 @@ static void decouple_info(COOKContext *q, COOKSubpacket *p, int *decouple_tab)
     int length = end - start + 1;
 
     if (start > end)
-        return;
+        return 0;
 
     if (vlc)
         for (i = 0; i < length; i++)
@@ -753,11 +761,18 @@ static void decouple_info(COOKContext *q, COOKSubpacket *p, int *decouple_tab)
                                                p->channel_coupling.table,
                                                p->channel_coupling.bits, 2);
     else
-        for (i = 0; i < length; i++)
-            decouple_tab[start + i] = get_bits(&q->gb, p->js_vlc_bits);
+        for (i = 0; i < length; i++) {
+            int v = get_bits(&q->gb, p->js_vlc_bits);
+            if (v == (1<<p->js_vlc_bits)-1) {
+                av_log(q->avctx, AV_LOG_ERROR, "decouple value too large\n");
+                return AVERROR_INVALIDDATA;
+            }
+            decouple_tab[start + i] = v;
+        }
+    return 0;
 }
 
-/*
+/**
  * function decouples a pair of signals from a single signal via multiplication.
  *
  * @param q                 pointer to the COOKContext
@@ -805,10 +820,10 @@ static int joint_decode(COOKContext *q, COOKSubpacket *p,
     /* Make sure the buffers are zeroed out. */
     memset(mlt_buffer_left,  0, 1024 * sizeof(*mlt_buffer_left));
     memset(mlt_buffer_right, 0, 1024 * sizeof(*mlt_buffer_right));
-    decouple_info(q, p, decouple_tab);
+    if ((res = decouple_info(q, p, decouple_tab)) < 0)
+        return res;
     if ((res = mono_decode(q, p, decode_buffer)) < 0)
         return res;
-
     /* The two channels are stored interleaved in decode_buffer. */
     for (i = 0; i < p->js_subband_start; i++) {
         for (j = 0; j < SUBBAND_SIZE; j++) {
@@ -927,7 +942,7 @@ static int decode_subpacket(COOKContext *q, COOKSubpacket *p,
                           p->mono_previous_buffer1,
                           outbuffer ? outbuffer[p->ch_idx] : NULL);
 
-    if (p->num_channels == 2)
+    if (p->num_channels == 2) {
         if (p->joint_stereo)
             mlt_compensate_output(q, q->decode_buffer_2, &p->gains1,
                                   p->mono_previous_buffer2,
@@ -936,6 +951,7 @@ static int decode_subpacket(COOKContext *q, COOKSubpacket *p,
             mlt_compensate_output(q, q->decode_buffer_2, &p->gains2,
                                   p->mono_previous_buffer2,
                                   outbuffer ? outbuffer[p->ch_idx + 1] : NULL);
+    }
 
     return 0;
 }
@@ -959,10 +975,8 @@ static int cook_decode_frame(AVCodecContext *avctx, void *data,
     /* get output buffer */
     if (q->discarded_packets >= 2) {
         frame->nb_samples = q->samples_per_channel;
-        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
             return ret;
-        }
         samples = (float **)frame->extended_data;
     }
 
@@ -1009,7 +1023,6 @@ static int cook_decode_frame(AVCodecContext *avctx, void *data,
     return avctx->block_align;
 }
 
-#ifdef DEBUG
 static void dump_cook_context(COOKContext *q)
 {
     //int i=0;
@@ -1022,7 +1035,7 @@ static void dump_cook_context(COOKContext *q)
     }
     ff_dlog(q->avctx, "COOKContext\n");
     PRINT("nb_channels", q->avctx->channels);
-    PRINT("bit_rate", q->avctx->bit_rate);
+    PRINT("bit_rate", (int)q->avctx->bit_rate);
     PRINT("sample_rate", q->avctx->sample_rate);
     PRINT("samples_per_channel", q->subpacket[0].samples_per_channel);
     PRINT("subbands", q->subpacket[0].subbands);
@@ -1031,7 +1044,6 @@ static void dump_cook_context(COOKContext *q)
     PRINT("numvector_size", q->subpacket[0].numvector_size);
     PRINT("total_subbands", q->subpacket[0].total_subbands);
 }
-#endif
 
 /**
  * Cook initialization
@@ -1046,7 +1058,7 @@ static av_cold int cook_decode_init(AVCodecContext *avctx)
     int extradata_size = avctx->extradata_size;
     int s = 0;
     unsigned int channel_mask = 0;
-    int samples_per_frame;
+    int samples_per_frame = 0;
     int ret;
     q->avctx = avctx;
 
@@ -1080,6 +1092,11 @@ static av_cold int cook_decode_init(AVCodecContext *avctx)
         if (extradata_size >= 8) {
             bytestream_get_be32(&edata_ptr);    // Unknown unused
             q->subpacket[s].js_subband_start = bytestream_get_be16(&edata_ptr);
+            if (q->subpacket[s].js_subband_start >= 51) {
+                av_log(avctx, AV_LOG_ERROR, "js_subband_start %d is too large\n", q->subpacket[s].js_subband_start);
+                return AVERROR_INVALIDDATA;
+            }
+
             q->subpacket[s].js_vlc_bits = bytestream_get_be16(&edata_ptr);
             extradata_size -= 8;
         }
@@ -1187,15 +1204,24 @@ static av_cold int cook_decode_init(AVCodecContext *avctx)
             avpriv_request_sample(avctx, "subbands > 50");
             return AVERROR_PATCHWELCOME;
         }
+        if (q->subpacket[s].subbands == 0) {
+            avpriv_request_sample(avctx, "subbands = 0");
+            return AVERROR_PATCHWELCOME;
+        }
         q->subpacket[s].gains1.now      = q->subpacket[s].gain_1;
         q->subpacket[s].gains1.previous = q->subpacket[s].gain_2;
         q->subpacket[s].gains2.now      = q->subpacket[s].gain_3;
         q->subpacket[s].gains2.previous = q->subpacket[s].gain_4;
 
+        if (q->num_subpackets + q->subpacket[s].num_channels > q->avctx->channels) {
+            av_log(avctx, AV_LOG_ERROR, "Too many subpackets %d for channels %d\n", q->num_subpackets, q->avctx->channels);
+            return AVERROR_INVALIDDATA;
+        }
+
         q->num_subpackets++;
         s++;
-        if (s > MAX_SUBPACKETS) {
-            avpriv_request_sample(avctx, "subpackets > %d", MAX_SUBPACKETS);
+        if (s > FFMIN(MAX_SUBPACKETS, avctx->block_align)) {
+            avpriv_request_sample(avctx, "subpackets > %d", FFMIN(MAX_SUBPACKETS, avctx->block_align));
             return AVERROR_PATCHWELCOME;
         }
     }
@@ -1248,9 +1274,9 @@ static av_cold int cook_decode_init(AVCodecContext *avctx)
     else
         avctx->channel_layout = (avctx->channels == 2) ? AV_CH_LAYOUT_STEREO : AV_CH_LAYOUT_MONO;
 
-#ifdef DEBUG
+
     dump_cook_context(q);
-#endif
+
     return 0;
 }
 
diff --git a/libavcodec/cook_parser.c b/libavcodec/cook_parser.c
index f140e90..6dbbfd8 100644
--- a/libavcodec/cook_parser.c
+++ b/libavcodec/cook_parser.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,11 +40,12 @@ static int cook_parse(AVCodecParserContext *s1, AVCodecContext *avctx,
 {
     CookParseContext *s = s1->priv_data;
 
-    if (s->duration)
-        s1->duration = s->duration;
-    else if (avctx->extradata && avctx->extradata_size >= 8 && avctx->channels)
+    if (!s->duration &&
+                avctx->extradata && avctx->extradata_size >= 8 && avctx->channels)
         s->duration = AV_RB16(avctx->extradata + 4) / avctx->channels;
 
+    s1->duration = s->duration;
+
     /* always return the full packet. this parser isn't doing any splitting or
        combining, only setting packet duration */
     *poutbuf      = buf;
diff --git a/libavcodec/cookdata.h b/libavcodec/cookdata.h
index 714ba1e..dcdb912 100644
--- a/libavcodec/cookdata.h
+++ b/libavcodec/cookdata.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Sascha Sommer
  * Copyright (c) 2005 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/copy_block.h b/libavcodec/copy_block.h
index 10718cc..9ed451f 100644
--- a/libavcodec/copy_block.h
+++ b/libavcodec/copy_block.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,6 +23,17 @@
 
 #include "libavutil/intreadwrite.h"
 
+static inline void copy_block2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
+{   /* Copy h rows from src to dst: one AV_COPY16U per row, then step each pointer by its own stride. */
+    int i;
+    for(i=0; i<h; i++)
+    {
+        AV_COPY16U(dst, src);   /* NOTE(review): presumably a 16-bit, alignment-tolerant copy, going by the macro name -- confirm in intreadwrite.h */
+        dst+=dstStride;
+        src+=srcStride;
+    }
+}
+
 static inline void copy_block4(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
 {
     int i;
diff --git a/libavcodec/cos_tablegen.c b/libavcodec/cos_tablegen.c
index 92b8295..7206aad 100644
--- a/libavcodec/cos_tablegen.c
+++ b/libavcodec/cos_tablegen.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,7 +24,9 @@
 #include <string.h>
 #include <math.h>
 
-#define BITS 16
+#include "libavutil/mathematics.h"
+
+#define BITS 17
 #define FLOATFMT "%.18e"
 #define FIXEDFMT "%6d"
 
diff --git a/libavcodec/cpia.c b/libavcodec/cpia.c
new file mode 100644
index 0000000..07cdd50
--- /dev/null
+++ b/libavcodec/cpia.c
@@ -0,0 +1,233 @@
+/*
+ * CPiA video decoder.
+ * Copyright (c) 2010 Hans de Goede <hdegoede@redhat.com>
+ *
+ * This decoder is based on the LGPL code available at
+ * https://v4l4j.googlecode.com/svn/v4l4j/trunk/libvideo/libv4lconvert/cpia1.c
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "get_bits.h"
+#include "internal.h"
+
+
+#define FRAME_HEADER_SIZE 64
+#define MAGIC_0         0x19    /**< First header byte */
+#define MAGIC_1         0x68    /**< Second header byte */
+#define SUBSAMPLE_420      0
+#define SUBSAMPLE_422      1
+#define YUVORDER_YUYV      0
+#define YUVORDER_UYVY      1
+#define NOT_COMPRESSED     0
+#define COMPRESSED         1
+#define NO_DECIMATION      0
+#define DECIMATION_ENAB    1
+#define EOL             0xfd    /**< End Of Line marker */
+#define EOI             0xff    /**< End Of Image marker */
+
+
+typedef struct {   /* decoder private state, reached through avctx->priv_data */
+    AVFrame *frame;   /* allocated in cpia_decode_init(), freed in cpia_decode_end(); reused by every cpia_decode_frame() call */
+} CpiaContext;
+
+/* Decode one packet into the persistent cpia->frame (skip codes leave earlier pixel data untouched) and hand a reference to it back through data; returns avpkt->size or a negative AVERROR. */
+static int cpia_decode_frame(AVCodecContext *avctx,
+                             void *data, int *got_frame, AVPacket* avpkt)
+{
+    CpiaContext* const cpia = avctx->priv_data;
+    int i,j,ret;
+
+    uint8_t* const header = avpkt->data;
+    uint8_t* src;
+    int src_size;
+    uint16_t linelength;
+    uint8_t skip;
+
+    AVFrame *frame = cpia->frame;
+    uint8_t *y, *u, *v, *y_end, *u_end, *v_end;
+
+    // Check header (the size test comes first, so the header[] reads below stay inside the packet)
+    if ( avpkt->size < FRAME_HEADER_SIZE
+      || header[0] != MAGIC_0 || header[1] != MAGIC_1
+      || (header[17] != SUBSAMPLE_420 && header[17] != SUBSAMPLE_422)
+      || (header[18] != YUVORDER_YUYV && header[18] != YUVORDER_UYVY)
+      || (header[28] != NOT_COMPRESSED && header[28] != COMPRESSED)
+      || (header[29] != NO_DECIMATION && header[29] != DECIMATION_ENAB)
+    ) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid header!\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // currently unsupported properties
+    if (header[17] == SUBSAMPLE_422) {
+        avpriv_report_missing_feature(avctx, "4:2:2 subsampling");
+        return AVERROR_PATCHWELCOME;
+    }
+    if (header[18] == YUVORDER_UYVY) {
+        avpriv_report_missing_feature(avctx, "YUV byte order UYVY");
+        return AVERROR_PATCHWELCOME;
+    }
+    if (header[29] == DECIMATION_ENAB) {
+        avpriv_report_missing_feature(avctx, "Decimation");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    src = header + FRAME_HEADER_SIZE;
+    src_size = avpkt->size - FRAME_HEADER_SIZE;
+
+    if (header[28] == NOT_COMPRESSED) {
+        frame->pict_type = AV_PICTURE_TYPE_I;
+        frame->key_frame = 1;
+    } else {
+        frame->pict_type = AV_PICTURE_TYPE_P;
+        frame->key_frame = 0;
+    }
+
+    // Get buffer filled with previous frame
+    if ((ret = ff_reget_buffer(avctx, frame)) < 0)
+        return ret;
+
+    /* Each line is stored as a 2-byte little-endian length followed by that many bytes, the last being EOL. */
+    for ( i = 0;
+          i < frame->height;
+          i++, src += linelength, src_size -= linelength + 2   /* line payload plus its length prefix */
+    ) {
+        // Read line length, two byte little endian (only if both bytes are really there)
+        linelength = src_size >= 2 ? AV_RL16(src) : 0;
+        src += 2;
+        // src_size still counts the two length bytes consumed above
+        if (src_size < linelength + 2) {
+            av_frame_set_decode_error_flags(frame, FF_DECODE_ERROR_INVALID_BITSTREAM);
+            av_log(avctx, AV_LOG_WARNING, "Frame ended unexpectedly!\n");
+            break;
+        }
+        if (src[linelength - 1] != EOL) {
+            av_frame_set_decode_error_flags(frame, FF_DECODE_ERROR_INVALID_BITSTREAM);
+            av_log(avctx, AV_LOG_WARNING, "Wrong line length %d or line not terminated properly (found 0x%02x)!\n", linelength, src[linelength - 1]);
+            break;
+        }
+
+        /* Update the data pointers. Y data is on every line.
+         * U and V data on every second line
+         */
+        y = &frame->data[0][i * frame->linesize[0]];
+        u = &frame->data[1][(i >> 1) * frame->linesize[1]];
+        v = &frame->data[2][(i >> 1) * frame->linesize[2]];
+        y_end = y + frame->linesize[0] - 1;
+        u_end = u + frame->linesize[1] - 1;
+        v_end = v + frame->linesize[2] - 1;
+
+        if ((i & 1) && header[17] == SUBSAMPLE_420) {
+            /* We are on an odd line and 420 subsample is used.
+             * On this line only Y values are specified, one per pixel.
+             */
+            for (j = 0; j < linelength - 1; j++) {
+                if (y > y_end) {
+                    av_frame_set_decode_error_flags(frame, FF_DECODE_ERROR_INVALID_BITSTREAM);
+                    av_log(avctx, AV_LOG_WARNING, "Decoded data exceeded linesize!\n");
+                    break;
+                }
+                if ((src[j] & 1) && header[28] == COMPRESSED) {
+                    /* It seems that odd lines are always uncompressed, but
+                     * we do it according to specification anyways.
+                     */
+                    skip = src[j] >> 1;
+                    y += skip;
+                } else {
+                    *(y++) = src[j];
+                }
+            }
+        } else if (header[17] == SUBSAMPLE_420) {
+            /* We are on an even line and 420 subsample is used.
+             * On this line each pair of pixels is described by four bytes.
+             */
+            for (j = 0; j < linelength - 4; ) {
+                if (y + 1 > y_end || u > u_end || v > v_end) {
+                    av_frame_set_decode_error_flags(frame, FF_DECODE_ERROR_INVALID_BITSTREAM);
+                    av_log(avctx, AV_LOG_WARNING, "Decoded data exceeded linesize!\n");
+                    break;
+                }
+                if ((src[j] & 1) && header[28] == COMPRESSED) {
+                    // Skip amount of pixels and move forward one byte
+                    skip = src[j] >> 1;
+                    y += skip;
+                    u += skip >> 1;
+                    v += skip >> 1;
+                    j++;
+                } else {
+                    // Set image data as specified and move forward 4 bytes
+                    *(y++) = src[j];
+                    *(u++) = src[j+1];
+                    *(y++) = src[j+2];
+                    *(v++) = src[j+3];
+                    j += 4;
+                }
+            }
+        }
+    }
+
+    if ((ret = av_frame_ref(data, cpia->frame)) < 0)
+        return ret;
+    *got_frame = 1;   /* only report a frame once the reference was actually taken */
+
+    return avpkt->size;
+}
+
+static av_cold int cpia_decode_init(AVCodecContext *avctx)
+{   /* Codec init: set the output pixel format, replace a 1/1000000 time base, allocate the persistent frame. Returns 0 or AVERROR(ENOMEM). */
+    CpiaContext *s = avctx->priv_data;
+
+    // output pixel format
+    avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+
+    /* The default timebase set by the v4l2 demuxer leads to probing which is buggy.
+     * Set some reasonable time_base to skip this.
+     */
+    if (avctx->time_base.num == 1 && avctx->time_base.den == 1000000) {
+        avctx->time_base.num = 1;
+        avctx->time_base.den = 60;
+    }
+
+    s->frame = av_frame_alloc();   /* released again in cpia_decode_end() */
+    if (!s->frame)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+static av_cold int cpia_decode_end(AVCodecContext *avctx)
+{   /* Codec close: free the frame allocated by cpia_decode_init(). Always returns 0. */
+    CpiaContext *s = avctx->priv_data;
+
+    av_frame_free(&s->frame);
+
+    return 0;
+}
+
+AVCodec ff_cpia_decoder = {   /* codec descriptor tying the CPiA init/close/decode callbacks together */
+    .name           = "cpia",
+    .long_name      = NULL_IF_CONFIG_SMALL("CPiA video format"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_CPIA,
+    .priv_data_size = sizeof(CpiaContext),   /* size of the state the callbacks read through avctx->priv_data */
+    .init           = cpia_decode_init,
+    .close          = cpia_decode_end,
+    .decode         = cpia_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/crystalhd.c b/libavcodec/crystalhd.c
new file mode 100644
index 0000000..3cb32a8
--- /dev/null
+++ b/libavcodec/crystalhd.c
@@ -0,0 +1,1226 @@
+/*
+ * - CrystalHD decoder module -
+ *
+ * Copyright(C) 2010,2011 Philip Langdale <ffmpeg.philipl@overt.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*
+ * - Principles of Operation -
+ *
+ * The CrystalHD decoder operates at the bitstream level - which is an even
+ * higher level than the decoding hardware you typically see in modern GPUs.
+ * This means it has a very simple interface, in principle. You feed demuxed
+ * packets in one end and get decoded picture (fields/frames) out the other.
+ *
+ * Of course, nothing is ever that simple. Due, at the very least, to b-frame
+ * dependencies in the supported formats, the hardware has a delay between
+ * when a packet goes in, and when a picture comes out. Furthermore, this delay
+ * is not just a function of time, but also one of the dependency on additional
+ * frames being fed into the decoder to satisfy the b-frame dependencies.
+ *
+ * As such, a pipeline will build up that is roughly equivalent to the required
+ * DPB for the file being played. If that was all it took, things would still
+ * be simple - so, of course, it isn't.
+ *
+ * The hardware has a way of indicating that a picture is ready to be copied out,
+ * but this is unreliable - and sometimes the attempt will still fail so, based
+ * on testing, the code will wait until 3 pictures are ready before starting
+ * to copy out - and this has the effect of extending the pipeline.
+ *
+ * Finally, while it is tempting to say that once the decoder starts outputting
+ * frames, the software should never fail to return a frame from a decode(),
+ * this is a hard assertion to make, because the stream may switch between
+ * differently encoded content (number of b-frames, interlacing, etc) which
+ * might require a longer pipeline than before. If that happened, you could
+ * deadlock trying to retrieve a frame that can't be decoded without feeding
+ * in additional packets.
+ *
+ * As such, the code will return in the event that a picture cannot be copied
+ * out, leading to an increase in the length of the pipeline. This in turn,
+ * means we have to be sensitive to the time it takes to decode a picture;
+ * We do not want to give up just because the hardware needed a little more
+ * time to prepare the picture! For this reason, there are delays included
+ * in the decode() path that ensure that, under normal conditions, the hardware
+ * will only fail to return a frame if it really needs additional packets to
+ * complete the decoding.
+ *
+ * Finally, to be explicit, we do not want the pipeline to grow without bound
+ * for two reasons: 1) The hardware can only buffer a finite number of packets,
+ * and 2) The client application may not be able to cope with arbitrarily long
+ * delays in the video path relative to the audio path. For example, MPlayer
+ * can only handle a 20 picture delay (although this is arbitrary, and needs
+ * to be extended to fully support the CrystalHD where the delay could be up
+ * to 32 pictures - consider PAFF H.264 content with 16 b-frames).
+ */
+
+/*****************************************************************************
+ * Includes
+ ****************************************************************************/
+
+#define _XOPEN_SOURCE 600
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <libcrystalhd/bc_dts_types.h>
+#include <libcrystalhd/bc_dts_defs.h>
+#include <libcrystalhd/libcrystalhd_if.h>
+
+#include "avcodec.h"
+#include "h264.h"
+#include "internal.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/opt.h"
+
+#if HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+/** Timeout parameter passed to DtsProcOutput() in us */
+#define OUTPUT_PROC_TIMEOUT 50
+/** Step between fake timestamps passed to hardware in units of 100ns */
+#define TIMESTAMP_UNIT 100000
+/** Initial value in us of the wait in decode() */
+#define BASE_WAIT 10000
+/** Increment in us to adjust wait in decode() */
+#define WAIT_UNIT 1000
+
+
+/*****************************************************************************
+ * Module private data
+ ****************************************************************************/
+
+typedef enum {   /* NOTE(review): status codes whose producers/consumers are outside this excerpt; meanings are only suggested by the names -- confirm at the use sites */
+    RET_ERROR           = -1,
+    RET_OK              = 0,
+    RET_COPY_AGAIN      = 1,
+    RET_SKIP_NEXT_COPY  = 2,
+    RET_COPY_NEXT_FIELD = 3,
+} CopyRet;
+
+typedef struct OpaqueList {   /* singly linked list node, appended by opaque_list_push() and unlinked by opaque_list_pop() */
+    struct OpaqueList *next;
+    uint64_t fake_timestamp;     /* lookup key: TIMESTAMP_UNIT for the first node of a list, previous tail + TIMESTAMP_UNIT afterwards */
+    uint64_t reordered_opaque;   /* value supplied by the caller of opaque_list_push() */
+    uint8_t pic_type;            /* value supplied by the caller of opaque_list_push() */
+} OpaqueList;
+
+typedef struct {   /* decoder private state, reached through avctx->priv_data */
+    AVClass *av_class;
+    AVCodecContext *avctx;   /* back-pointer set in init(); used as the av_log() context */
+    AVFrame *pic;            /* allocated in init(), unreferenced in flush(), freed in uninit() */
+    HANDLE dev;              /* device handle handed to the Dts* library calls */
+
+    uint8_t *orig_extradata;        /* copy of avctx->extradata taken in init() for AVC1 input; restored in uninit() */
+    uint32_t orig_extradata_size;
+
+    AVBitStreamFilterContext *bsfc; /* "h264_mp4toannexb" filter created in init() for AVC1 input */
+    AVCodecParserContext *parser;
+
+    uint8_t is_70012;
+    uint8_t *sps_pps_buf;
+    uint32_t sps_pps_size;
+    uint8_t is_nal;                 /* set in init() when extradata is non-empty and its first byte is 1 */
+    uint8_t output_ready;
+    uint8_t need_second_field;
+    uint8_t skip_next_output;
+    uint64_t decode_wait;           /* starts at BASE_WAIT (init() and flush()) */
+
+    uint64_t last_picture;          /* init() and flush() assign -1, i.e. the all-ones value for this unsigned type */
+
+    OpaqueList *head;               /* first node of the OpaqueList */
+    OpaqueList *tail;               /* last node of the OpaqueList */
+
+    /* Options */
+    uint32_t sWidth;                /* target of the "crystalhd_downscale_width" option */
+    uint8_t bframe_bug;
+} CHDContext;
+
+static const AVOption options[] = {   /* user-settable options; the single entry stores into CHDContext.sWidth */
+    { "crystalhd_downscale_width",
+      "Turn on downscaling to the specified width",
+      offsetof(CHDContext, sWidth),
+      AV_OPT_TYPE_INT, {.i64 = 0}, 0, UINT32_MAX,   /* default 0, accepted range 0..UINT32_MAX */
+      AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM, },
+    { NULL, },   /* terminator */
+};
+
+
+/*****************************************************************************
+ * Helper functions
+ ****************************************************************************/
+
+static inline BC_MEDIA_SUBTYPE id2subtype(CHDContext *priv, enum AVCodecID id)
+{   /* Map an AVCodecID to the matching BC_MSUBTYPE_* value; ids not listed give BC_MSUBTYPE_INVALID. */
+    switch (id) {
+    case AV_CODEC_ID_MPEG4:
+        return BC_MSUBTYPE_DIVX;
+    case AV_CODEC_ID_MSMPEG4V3:
+        return BC_MSUBTYPE_DIVX311;
+    case AV_CODEC_ID_MPEG2VIDEO:
+        return BC_MSUBTYPE_MPEG2VIDEO;
+    case AV_CODEC_ID_VC1:
+        return BC_MSUBTYPE_VC1;
+    case AV_CODEC_ID_WMV3:
+        return BC_MSUBTYPE_WMV3;
+    case AV_CODEC_ID_H264:
+        return priv->is_nal ? BC_MSUBTYPE_AVC1 : BC_MSUBTYPE_H264;   /* the only case that consults priv */
+    default:
+        return BC_MSUBTYPE_INVALID;
+    }
+}
+
+static inline void print_frame_info(CHDContext *priv, BC_DTS_PROC_OUT *output)
+{   /* Log the buffer sizes and PicInfo fields of *output, one av_log() line each, at AV_LOG_VERBOSE. Nothing is modified. */
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tYBuffSz: %u\n", output->YbuffSz);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tYBuffDoneSz: %u\n",
+           output->YBuffDoneSz);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tUVBuffDoneSz: %u\n",
+           output->UVBuffDoneSz);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tTimestamp: %"PRIu64"\n",
+           output->PicInfo.timeStamp);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tPicture Number: %u\n",
+           output->PicInfo.picture_number);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tWidth: %u\n",
+           output->PicInfo.width);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tHeight: %u\n",
+           output->PicInfo.height);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tChroma: 0x%03x\n",
+           output->PicInfo.chroma_format);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tPulldown: %u\n",
+           output->PicInfo.pulldown);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tFlags: 0x%08x\n",
+           output->PicInfo.flags);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tFrame Rate/Res: %u\n",
+           output->PicInfo.frame_rate);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tAspect Ratio: %u\n",
+           output->PicInfo.aspect_ratio);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tColor Primaries: %u\n",
+           output->PicInfo.colour_primaries);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tMetaData: %u\n",
+           output->PicInfo.picture_meta_payload);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tSession Number: %u\n",
+           output->PicInfo.sess_num);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tycom: %u\n",
+           output->PicInfo.ycom);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tCustom Aspect: %u\n",
+           output->PicInfo.custom_aspect_ratio_width_height);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tFrames to Drop: %u\n",
+           output->PicInfo.n_drop);
+    av_log(priv->avctx, AV_LOG_VERBOSE, "\tH264 Valid Fields: 0x%08x\n",
+           output->PicInfo.other.h264.valid);
+}
+
+
+/*****************************************************************************
+ * OpaqueList functions
+ ****************************************************************************/
+
+static uint64_t opaque_list_push(CHDContext *priv, uint64_t reordered_opaque,
+                                 uint8_t pic_type)
+{   /* Append a node carrying reordered_opaque/pic_type and return its fake timestamp, or 0 if the allocation fails. */
+    OpaqueList *newNode = av_mallocz(sizeof (OpaqueList));
+    if (!newNode) {
+        av_log(priv->avctx, AV_LOG_ERROR,
+               "Unable to allocate new node in OpaqueList.\n");
+        return 0;
+    }
+    if (!priv->head) {
+        newNode->fake_timestamp = TIMESTAMP_UNIT;   /* numbering restarts whenever the list is empty */
+        priv->head              = newNode;
+    } else {
+        newNode->fake_timestamp = priv->tail->fake_timestamp + TIMESTAMP_UNIT;   /* one step after the current tail */
+        priv->tail->next        = newNode;
+    }
+    priv->tail = newNode;
+    newNode->reordered_opaque = reordered_opaque;
+    newNode->pic_type = pic_type;
+
+    return newNode->fake_timestamp;
+}
+
+/*
+ * Unlink and return the node whose fake_timestamp matches, or NULL (after
+ * logging) if the list is empty or no node matches. Nodes are pushed in
+ * decode order but looked up in presentation order, so the match may sit
+ * anywhere in the list, not only at the head.
+ * The returned node has next == NULL and must be freed by the caller.
+ */
+static OpaqueList *opaque_list_pop(CHDContext *priv, uint64_t fake_timestamp)
+{
+    OpaqueList *node = priv->head;
+
+    if (!priv->head) {
+        av_log(priv->avctx, AV_LOG_ERROR,
+               "CrystalHD: Attempted to query non-existent timestamps.\n");
+        return NULL;
+    }
+
+    /*
+     * The first element is special-cased because we have to manipulate
+     * the head pointer rather than the previous element in the list.
+     */
+    if (priv->head->fake_timestamp == fake_timestamp) {
+        priv->head = node->next;
+        /* Zero or one node left: tail follows head (NULL when now empty). */
+        if (!priv->head || !priv->head->next)
+            priv->tail = priv->head;
+
+        node->next = NULL;
+        return node;
+    }
+
+    /*
+     * The list is processed at arm's length so that we have the
+     * previous element available to rewrite its next pointer.
+     */
+    while (node->next) {
+        OpaqueList *current = node->next;
+        if (current->fake_timestamp == fake_timestamp) {
+            node->next = current->next;
+
+            if (!node->next)
+               priv->tail = node;
+
+            current->next = NULL;
+            return current;
+        } else {
+            node = current;
+        }
+    }
+
+    av_log(priv->avctx, AV_LOG_VERBOSE,
+           "CrystalHD: Couldn't match fake_timestamp.\n");
+    return NULL;
+}
+
+
+/*****************************************************************************
+ * Video decoder API function definitions
+ ****************************************************************************/
+
+static void flush(AVCodecContext *avctx)
+{   /* Reset the decode state (same last_picture/decode_wait values that init() assigns), drop the held picture and flush the device input. */
+    CHDContext *priv = avctx->priv_data;
+
+    avctx->has_b_frames     = 0;
+    priv->last_picture      = -1;
+    priv->output_ready      = 0;
+    priv->need_second_field = 0;
+    priv->skip_next_output  = 0;
+    priv->decode_wait       = BASE_WAIT;
+
+    av_frame_unref (priv->pic);
+
+    /* Flush mode 4 flushes all software and hardware buffers. */
+    DtsFlushInput(priv->dev, 4);
+}
+
+
+static av_cold int uninit(AVCodecContext *avctx)
+{   /* Codec close: stop and close the decoder and device, restore the saved extradata, then free the parser, BSF, SPS/PPS buffer, frame and any queued OpaqueList nodes. Always returns 0. */
+    CHDContext *priv = avctx->priv_data;
+    HANDLE device;
+
+    device = priv->dev;
+    DtsStopDecoder(device);
+    DtsCloseDecoder(device);
+    DtsDeviceClose(device);
+
+    /*
+     * Restore original extradata, so that if the decoder is
+     * reinitialised, the bitstream detection and filtering
+     * will work as expected.
+     */
+    if (priv->orig_extradata) {
+        av_free(avctx->extradata);
+        avctx->extradata = priv->orig_extradata;   /* ownership of the backup moves back to avctx */
+        avctx->extradata_size = priv->orig_extradata_size;
+        priv->orig_extradata = NULL;
+        priv->orig_extradata_size = 0;
+    }
+
+    av_parser_close(priv->parser);
+    if (priv->bsfc) {
+        av_bitstream_filter_close(priv->bsfc);
+    }
+
+    av_freep(&priv->sps_pps_buf);
+
+    av_frame_free (&priv->pic);
+
+    if (priv->head) {   /* NOTE(review): nodes are freed but priv->head/priv->tail are left pointing at them */
+       OpaqueList *node = priv->head;
+       while (node) {
+          OpaqueList *next = node->next;
+          av_free(node);
+          node = next;
+       }
+    }
+
+    return 0;
+}
+
+
+/*
+ * Open the CrystalHD device and set it up for avctx's codec (YUYV422 output).
+ * Returns 0 on success or a negative value; on failure everything acquired
+ * so far is released through uninit().
+ */
+static av_cold int init(AVCodecContext *avctx)
+{
+    CHDContext* priv;
+    int err = -1; /* generic failure unless a more specific code is known */
+    BC_INFO_CRYSTAL version;
+    BC_INPUT_FORMAT format = {
+        .FGTEnable   = FALSE,
+        .Progressive = TRUE,
+        .OptFlags    = 0x80000000 | vdecFrameRate59_94 | 0x40,
+        .width       = avctx->width,
+        .height      = avctx->height,
+    };
+    BC_MEDIA_SUBTYPE subtype;
+    uint32_t mode = DTS_PLAYBACK_MODE |
+                    DTS_LOAD_FILE_PLAY_FW |
+                    DTS_SKIP_TX_CHK_CPB |
+                    DTS_PLAYBACK_DROP_RPT_MODE |
+                    DTS_SINGLE_THREADED_MODE |
+                    DTS_DFLT_RESOLUTION(vdecRESOLUTION_1080p23_976);
+
+    av_log(avctx, AV_LOG_VERBOSE, "CrystalHD Init for %s\n",
+           avctx->codec->name);
+    avctx->pix_fmt = AV_PIX_FMT_YUYV422;
+
+    /* Initialize the library */
+    priv               = avctx->priv_data;
+    priv->avctx        = avctx;
+    priv->is_nal       = avctx->extradata_size > 0 && *(avctx->extradata) == 1;
+    priv->last_picture = -1;
+    priv->decode_wait  = BASE_WAIT;
+    priv->pic          = av_frame_alloc();
+    if (!priv->pic)
+        return AVERROR(ENOMEM); /* nothing else has been acquired yet */
+
+    subtype = id2subtype(priv, avctx->codec->id);
+    switch (subtype) {
+    case BC_MSUBTYPE_AVC1:
+        {
+            uint8_t *dummy_p;
+            int dummy_int;
+
+            /* Back up the extradata so it can be restored at close time. */
+            priv->orig_extradata = av_malloc(avctx->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE);
+            if (!priv->orig_extradata) {
+                av_log(avctx, AV_LOG_ERROR,
+                       "Failed to allocate copy of extradata\n");
+                err = AVERROR(ENOMEM);
+                goto fail;
+            }
+            priv->orig_extradata_size = avctx->extradata_size;
+            memcpy(priv->orig_extradata, avctx->extradata, avctx->extradata_size);
+
+            priv->bsfc = av_bitstream_filter_init("h264_mp4toannexb");
+            if (!priv->bsfc) {
+                av_log(avctx, AV_LOG_ERROR,
+                       "Cannot open the h264_mp4toannexb BSF!\n");
+                err = AVERROR_BSF_NOT_FOUND;
+                goto fail;
+            }
+            av_bitstream_filter_filter(priv->bsfc, avctx, NULL, &dummy_p,
+                                       &dummy_int, NULL, 0, 0);
+        }
+        subtype = BC_MSUBTYPE_H264;
+        // Fall-through
+    case BC_MSUBTYPE_H264:
+        format.startCodeSz = 4;
+        // Fall-through
+    case BC_MSUBTYPE_VC1:
+    case BC_MSUBTYPE_WVC1:
+    case BC_MSUBTYPE_WMV3:
+    case BC_MSUBTYPE_WMVA:
+    case BC_MSUBTYPE_MPEG2VIDEO:
+    case BC_MSUBTYPE_DIVX:
+    case BC_MSUBTYPE_DIVX311:
+        format.pMetaData  = avctx->extradata;
+        format.metaDataSz = avctx->extradata_size;
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "CrystalHD: Unknown codec name\n");
+        err = AVERROR(EINVAL);
+        goto fail;
+    }
+    format.mSubtype = subtype;
+
+    if (priv->sWidth) {
+        format.bEnableScaling = 1;
+        format.ScalingParams.sWidth = priv->sWidth;
+    }
+
+    /* Get a decoder instance: initialize the Link and Decoder devices */
+    av_log(avctx, AV_LOG_VERBOSE, "CrystalHD: starting up\n");
+    if (DtsDeviceOpen(&priv->dev, mode) != BC_STS_SUCCESS) {
+        av_log(avctx, AV_LOG_VERBOSE, "CrystalHD: DtsDeviceOpen failed\n");
+        goto fail;
+    }
+
+    if (DtsCrystalHDVersion(priv->dev, &version) != BC_STS_SUCCESS) {
+        av_log(avctx, AV_LOG_VERBOSE, "CrystalHD: DtsCrystalHDVersion failed\n");
+        goto fail;
+    }
+    priv->is_70012 = version.device == 0;
+
+    if (priv->is_70012 &&
+        (subtype == BC_MSUBTYPE_DIVX || subtype == BC_MSUBTYPE_DIVX311)) {
+        av_log(avctx, AV_LOG_VERBOSE,
+               "CrystalHD: BCM70012 doesn't support MPEG4-ASP/DivX/Xvid\n");
+        goto fail;
+    }
+
+    if (DtsSetInputFormat(priv->dev, &format) != BC_STS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "CrystalHD: SetInputFormat failed\n");
+        goto fail;
+    }
+
+    if (DtsOpenDecoder(priv->dev, BC_STREAM_TYPE_ES) != BC_STS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "CrystalHD: DtsOpenDecoder failed\n");
+        goto fail;
+    }
+
+    if (DtsSetColorSpace(priv->dev, OUTPUT_MODE422_YUY2) != BC_STS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "CrystalHD: DtsSetColorSpace failed\n");
+        goto fail;
+    }
+    if (DtsStartDecoder(priv->dev) != BC_STS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "CrystalHD: DtsStartDecoder failed\n");
+        goto fail;
+    }
+    if (DtsStartCapture(priv->dev) != BC_STS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "CrystalHD: DtsStartCapture failed\n");
+        goto fail;
+    }
+
+    if (avctx->codec->id == AV_CODEC_ID_H264) {
+        priv->parser = av_parser_init(avctx->codec->id);
+        /* The parser is optional; only configure it when it was opened. */
+        if (priv->parser)
+            priv->parser->flags = PARSER_FLAG_COMPLETE_FRAMES;
+        else
+            av_log(avctx, AV_LOG_WARNING,
+                   "Cannot open the h.264 parser! Interlaced h.264 content "
+                   "will not be detected reliably.\n");
+    }
+    av_log(avctx, AV_LOG_VERBOSE, "CrystalHD: Init complete.\n");
+
+    return 0;
+
+ fail:
+    uninit(avctx);
+    return err;
+}
+
+
+/*
+ * Copy the picture (or single field) described by output into priv->pic and,
+ * once the frame is complete, give the caller a reference to it through data.
+ *
+ * *got_frame is set to 1 only after data really holds a frame. The result
+ * tells the caller how to go on: RET_OK, RET_ERROR, RET_COPY_NEXT_FIELD
+ * (fetch the second field right away) or RET_SKIP_NEXT_COPY.
+ */
+static inline CopyRet copy_frame(AVCodecContext *avctx,
+                                 BC_DTS_PROC_OUT *output,
+                                 void *data, int *got_frame)
+{
+    BC_STATUS ret;
+    BC_DTS_STATUS decoder_status = { 0, };
+    uint8_t trust_interlaced;
+    uint8_t interlaced;
+
+    CHDContext *priv = avctx->priv_data;
+    int64_t pkt_pts  = AV_NOPTS_VALUE;
+    uint8_t pic_type = 0;
+
+    uint8_t bottom_field = (output->PicInfo.flags & VDEC_FLAG_BOTTOMFIELD) ==
+                           VDEC_FLAG_BOTTOMFIELD;
+    uint8_t bottom_first = !!(output->PicInfo.flags & VDEC_FLAG_BOTTOM_FIRST);
+
+    int width    = output->PicInfo.width;
+    int height   = output->PicInfo.height;
+    int bwidth;
+    uint8_t *src = output->Ybuff;
+    int sStride;
+    uint8_t *dst;
+    int dStride;
+
+    if (output->PicInfo.timeStamp != 0) {
+        OpaqueList *node = opaque_list_pop(priv, output->PicInfo.timeStamp);
+        if (node) {
+            pkt_pts = node->reordered_opaque;
+            pic_type = node->pic_type;
+            av_free(node);
+        } else {
+            /*
+             * A timestamp cannot be popped when a second field is being
+             * returned: both fields share one timestamp and the first one
+             * already popped it. To keep subsequent calculations simple,
+             * use a FIELD value - it doesn't matter which, BOTTOM was chosen.
+             */
+            pic_type = PICT_BOTTOM_FIELD;
+        }
+        av_log(avctx, AV_LOG_VERBOSE, "output \"pts\": %"PRIu64"\n",
+               output->PicInfo.timeStamp);
+        av_log(avctx, AV_LOG_VERBOSE, "output picture type %d\n",
+               pic_type);
+    }
+
+    ret = DtsGetDriverStatus(priv->dev, &decoder_status);
+    if (ret != BC_STS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR,
+               "CrystalHD: GetDriverStatus failed: %u\n", ret);
+        return RET_ERROR;
+    }
+
+    /*
+     * For most content we can trust the interlaced flag returned by the
+     * hardware, but not always. The flag is trusted when any of these holds:
+     *
+     * 1) It's not h.264 content
+     * 2) The UNKNOWN_SRC flag is not set
+     * 3) We know we're expecting a second field
+     * 4) The hardware reports this picture and the next picture
+     *    have the same picture number.
+     *
+     * Interlaced content can still fail this check if the hardware hasn't
+     * decoded the next picture or if the stream is corrupt (in either case
+     * 0 is returned for the next picture number).
+     */
+    trust_interlaced = avctx->codec->id != AV_CODEC_ID_H264 ||
+                       !(output->PicInfo.flags & VDEC_FLAG_UNKNOWN_SRC) ||
+                       priv->need_second_field ||
+                       (decoder_status.picNumFlags & ~0x40000000) ==
+                       output->PicInfo.picture_number;
+
+    /*
+     * After a false negative for trust_interlaced on the first field, the
+     * second field shows up here with the previous picture's number. The frame
+     * cannot be recovered, so drop the field to keep the output count right.
+     */
+    if (output->PicInfo.picture_number == priv->last_picture && !priv->need_second_field) {
+        av_log(avctx, AV_LOG_WARNING,
+               "Incorrectly guessed progressive frame. Discarding second field\n");
+        /* Returning without providing a picture. */
+        return RET_OK;
+    }
+
+    interlaced = (output->PicInfo.flags & VDEC_FLAG_INTERLACED_SRC) &&
+                 trust_interlaced;
+
+    if (!trust_interlaced && (decoder_status.picNumFlags & ~0x40000000) == 0) {
+        av_log(avctx, AV_LOG_VERBOSE,
+               "Next picture number unknown. Assuming progressive frame.\n");
+    }
+
+    av_log(avctx, AV_LOG_VERBOSE, "Interlaced state: %d | trust_interlaced %d\n",
+           interlaced, trust_interlaced);
+
+    if (priv->pic->data[0] && !priv->need_second_field)
+        av_frame_unref(priv->pic);
+
+    priv->need_second_field = interlaced && !priv->need_second_field;
+
+    if (!priv->pic->data[0]) {
+        if (ff_get_buffer(avctx, priv->pic, AV_GET_BUFFER_FLAG_REF) < 0)
+            return RET_ERROR;
+    }
+
+    bwidth = av_image_get_linesize(avctx->pix_fmt, width, 0);
+    if (priv->is_70012) {
+        int pStride;
+
+        if (width <= 720)
+            pStride = 720;
+        else if (width <= 1280)
+            pStride = 1280;
+        else pStride = 1920;
+        sStride = av_image_get_linesize(avctx->pix_fmt, pStride, 0);
+    } else {
+        sStride = bwidth;
+    }
+
+    dStride = priv->pic->linesize[0];
+    dst     = priv->pic->data[0];
+
+    av_log(priv->avctx, AV_LOG_VERBOSE, "CrystalHD: Copying out frame\n");
+
+    if (interlaced) {
+        int dY = 0;
+        int sY = 0;
+
+        height /= 2;
+        if (bottom_field) {
+            av_log(priv->avctx, AV_LOG_VERBOSE, "Interlaced: bottom field\n");
+            dY = 1;
+        } else {
+            av_log(priv->avctx, AV_LOG_VERBOSE, "Interlaced: top field\n");
+            dY = 0;
+        }
+
+        for (sY = 0; sY < height; dY++, sY++) {
+            memcpy(&(dst[dY * dStride]), &(src[sY * sStride]), bwidth);
+            dY++;
+        }
+    } else {
+        av_image_copy_plane(dst, dStride, src, sStride, bwidth, height);
+    }
+
+    priv->pic->interlaced_frame = interlaced;
+    if (interlaced)
+        priv->pic->top_field_first = !bottom_first;
+
+    priv->pic->pkt_pts = pkt_pts;
+
+    if (!priv->need_second_field) {
+        /* Report failure in CopyRet terms and never flag an unset frame. */
+        if (av_frame_ref(data, priv->pic) < 0)
+            return RET_ERROR;
+        *got_frame = 1;
+    }
+
+    /*
+     * Two types of PAFF content have been observed. One form causes the
+     * hardware to return a field pair and the other individual fields, even
+     * though the input is always individual fields. In the first case we must
+     * skip copying on the next decode() call to maintain pipeline length.
+     */
+    if (!interlaced && (output->PicInfo.flags & VDEC_FLAG_UNKNOWN_SRC) &&
+        (pic_type == PICT_TOP_FIELD || pic_type == PICT_BOTTOM_FIELD)) {
+        av_log(priv->avctx, AV_LOG_VERBOSE, "Fieldpair from two packets.\n");
+        return RET_SKIP_NEXT_COPY;
+    }
+
+    /*
+     * The logic here is purely based on empirical testing with samples.
+     * A missing second field comes either from a second input packet, in
+     * which case we return and wait for the next call, or from the same
+     * field-pair packet as the current field, in which case we ask the
+     * decoder for it immediately.
+     * Testing has shown that we are dealing with the fieldpair -> two fields
+     * case if the VDEC_FLAG_UNKNOWN_SRC is not set or if the input picture
+     * type was PICT_FRAME (in this second case, the flag might still be set)
+     */
+    return priv->need_second_field &&
+           (!(output->PicInfo.flags & VDEC_FLAG_UNKNOWN_SRC) ||
+            pic_type == PICT_FRAME) ?
+           RET_COPY_NEXT_FIELD : RET_OK;
+}
+
+
+/*
+ * Ask the driver for one decoded picture and pass it to copy_frame().
+ * *got_frame is reset to 0 first. Returns RET_COPY_AGAIN when the caller
+ * should simply retry (format change, busy, duplicate dropped), RET_ERROR
+ * when the driver call fails, otherwise copy_frame()'s result or RET_OK.
+ */
+static inline CopyRet receive_frame(AVCodecContext *avctx,
+                                    void *data, int *got_frame)
+{
+    BC_STATUS ret;
+    BC_DTS_PROC_OUT output = {
+        .PicInfo.width  = avctx->width,
+        .PicInfo.height = avctx->height,
+    };
+    CHDContext *priv = avctx->priv_data;
+    HANDLE dev       = priv->dev;
+
+    *got_frame = 0;
+
+    // Request decoded data from the driver
+    ret = DtsProcOutputNoCopy(dev, OUTPUT_PROC_TIMEOUT, &output);
+    if (ret == BC_STS_FMT_CHANGE) {
+        av_log(avctx, AV_LOG_VERBOSE, "CrystalHD: Initial format change\n");
+        avctx->width  = output.PicInfo.width;
+        avctx->height = output.PicInfo.height;
+        switch ( output.PicInfo.aspect_ratio ) {
+        case vdecAspectRatioSquare:
+            avctx->sample_aspect_ratio = (AVRational) {  1,  1};
+            break;
+        case vdecAspectRatio12_11:
+            avctx->sample_aspect_ratio = (AVRational) { 12, 11};
+            break;
+        case vdecAspectRatio10_11:
+            avctx->sample_aspect_ratio = (AVRational) { 10, 11};
+            break;
+        case vdecAspectRatio16_11:
+            avctx->sample_aspect_ratio = (AVRational) { 16, 11};
+            break;
+        case vdecAspectRatio40_33:
+            avctx->sample_aspect_ratio = (AVRational) { 40, 33};
+            break;
+        case vdecAspectRatio24_11:
+            avctx->sample_aspect_ratio = (AVRational) { 24, 11};
+            break;
+        case vdecAspectRatio20_11:
+            avctx->sample_aspect_ratio = (AVRational) { 20, 11};
+            break;
+        case vdecAspectRatio32_11:
+            avctx->sample_aspect_ratio = (AVRational) { 32, 11};
+            break;
+        case vdecAspectRatio80_33:
+            avctx->sample_aspect_ratio = (AVRational) { 80, 33};
+            break;
+        case vdecAspectRatio18_11:
+            avctx->sample_aspect_ratio = (AVRational) { 18, 11};
+            break;
+        case vdecAspectRatio15_11:
+            avctx->sample_aspect_ratio = (AVRational) { 15, 11};
+            break;
+        case vdecAspectRatio64_33:
+            avctx->sample_aspect_ratio = (AVRational) { 64, 33};
+            break;
+        case vdecAspectRatio160_99:
+            avctx->sample_aspect_ratio = (AVRational) {160, 99};
+            break;
+        case vdecAspectRatio4_3:
+            avctx->sample_aspect_ratio = (AVRational) {  4,  3};
+            break;
+        case vdecAspectRatio16_9:
+            avctx->sample_aspect_ratio = (AVRational) { 16,  9};
+            break;
+        case vdecAspectRatio221_1:
+            avctx->sample_aspect_ratio = (AVRational) {221,  1};
+            break;
+        }
+        return RET_COPY_AGAIN;
+    } else if (ret == BC_STS_SUCCESS) {
+        int copy_ret = -1;
+        if (output.PoutFlags & BC_POUT_FLAGS_PIB_VALID) {
+            if (priv->last_picture == -1) {
+                /* Init to one less, so that the incrementing code
+                 * doesn't need to be special-cased. */
+                priv->last_picture = output.PicInfo.picture_number - 1;
+            }
+
+            if (avctx->codec->id == AV_CODEC_ID_MPEG4 &&
+                output.PicInfo.timeStamp == 0 && priv->bframe_bug) {
+                av_log(avctx, AV_LOG_VERBOSE,
+                       "CrystalHD: Not returning packed frame twice.\n");
+                priv->last_picture++;
+                DtsReleaseOutputBuffs(dev, NULL, FALSE);
+                return RET_COPY_AGAIN;
+            }
+
+            print_frame_info(priv, &output);
+
+            if (priv->last_picture + 1 < output.PicInfo.picture_number) {
+                av_log(avctx, AV_LOG_WARNING,
+                       "CrystalHD: Picture Number discontinuity\n");
+                /*
+                 * Have we lost frames? If so, we need to shrink the
+                 * pipeline length appropriately.
+                 * XXX: I have no idea what the semantics of this situation
+                 * are so I don't even know if we've lost frames or which
+                 * ones.
+                 * In any case, only warn the first time.
+                 */
+                priv->last_picture = output.PicInfo.picture_number - 1;
+            }
+
+            copy_ret = copy_frame(avctx, &output, data, got_frame);
+            if (*got_frame > 0) {
+                avctx->has_b_frames--;
+                priv->last_picture++;
+                av_log(avctx, AV_LOG_VERBOSE, "CrystalHD: Pipeline length: %d\n",
+                       avctx->has_b_frames);
+            }
+        } else {
+            /* An invalid frame has been consumed. */
+            av_log(avctx, AV_LOG_ERROR, "CrystalHD: ProcOutput succeeded with "
+                                        "invalid PIB\n");
+            avctx->has_b_frames--;
+            copy_ret = RET_OK;
+        }
+        DtsReleaseOutputBuffs(dev, NULL, FALSE);
+
+        return copy_ret;
+    } else if (ret == BC_STS_BUSY) {
+        return RET_COPY_AGAIN;
+    } else {
+        av_log(avctx, AV_LOG_ERROR, "CrystalHD: ProcOutput failed %d\n", ret);
+        return RET_ERROR;
+    }
+}
+
+
+/*
+ * Submit avpkt to the hardware and, once the pipeline is primed, try to fetch
+ * one decoded frame into data (*got_frame reports whether one was produced).
+ * Returns the length of the data handed to the hardware (0 when the packet
+ * was empty or the input buffer was full) or a negative value on error.
+ */
+static int decode(AVCodecContext *avctx, void *data, int *got_frame, AVPacket *avpkt)
+{
+    BC_STATUS ret;
+    BC_DTS_STATUS decoder_status = { 0, };
+    CopyRet rec_ret;
+    CHDContext *priv   = avctx->priv_data;
+    HANDLE dev         = priv->dev;
+    uint8_t *in_data   = avpkt->data;
+    int len            = avpkt->size;
+    int free_data      = 0;
+    uint8_t pic_type   = 0;
+
+    av_log(avctx, AV_LOG_VERBOSE, "CrystalHD: decode_frame\n");
+
+    if (avpkt->size == 7 && !priv->bframe_bug) {
+        /* The use of a drop frame triggers the bug */
+        av_log(avctx, AV_LOG_INFO,
+               "CrystalHD: Enabling work-around for packed b-frame bug\n");
+        priv->bframe_bug = 1;
+    } else if (avpkt->size == 8 && priv->bframe_bug) {
+        /* Delay frames don't trigger the bug */
+        av_log(avctx, AV_LOG_INFO,
+               "CrystalHD: Disabling work-around for packed b-frame bug\n");
+        priv->bframe_bug = 0;
+    }
+
+    if (len) {
+        int32_t tx_free = (int32_t)DtsTxFreeSize(dev);
+
+        if (priv->parser) {
+            int ret = 0;
+
+            if (priv->bsfc) {
+                ret = av_bitstream_filter_filter(priv->bsfc, avctx, NULL,
+                                                 &in_data, &len,
+                                                 avpkt->data, len, 0);
+            }
+            free_data = ret > 0;
+
+            if (ret >= 0) {
+                uint8_t *pout;
+                int psize;
+                int index;
+                H264Context *h = priv->parser->priv_data;
+
+                index = av_parser_parse2(priv->parser, avctx, &pout, &psize,
+                                         in_data, len, avctx->internal->pkt->pts,
+                                         avctx->internal->pkt->dts, 0);
+                if (index < 0) {
+                    av_log(avctx, AV_LOG_WARNING,
+                           "CrystalHD: Failed to parse h.264 packet to "
+                           "detect interlacing.\n");
+                } else if (index != len) {
+                    av_log(avctx, AV_LOG_WARNING,
+                           "CrystalHD: Failed to parse h.264 packet "
+                           "completely. Interlaced frames may be "
+                           "incorrectly detected.\n");
+                } else {
+                    av_log(avctx, AV_LOG_VERBOSE,
+                           "CrystalHD: parser picture type %d\n",
+                           h->picture_structure);
+                    pic_type = h->picture_structure;
+                }
+            } else {
+                av_log(avctx, AV_LOG_WARNING,
+                       "CrystalHD: mp4toannexb filter failed to filter "
+                       "packet. Interlaced frames may be incorrectly "
+                       "detected.\n");
+            }
+        }
+
+        if (len < tx_free - 1024) {
+            /*
+             * Despite being notionally opaque, either libcrystalhd or
+             * the hardware itself will mangle pts values that are too
+             * small or too large. The docs claim it should be in units
+             * of 100ns. Given that we're nominally dealing with a black
+             * box on both sides, any transform we do has no guarantee of
+             * avoiding mangling so we need to build a mapping to values
+             * we know will not be mangled.
+             */
+            uint64_t pts = opaque_list_push(priv, avctx->internal->pkt->pts, pic_type);
+            if (!pts) {
+                if (free_data) {
+                    av_freep(&in_data);
+                }
+                return AVERROR(ENOMEM);
+            }
+            av_log(priv->avctx, AV_LOG_VERBOSE,
+                   "input \"pts\": %"PRIu64"\n", pts);
+            ret = DtsProcInput(dev, in_data, len, pts, 0);
+            if (free_data) {
+                av_freep(&in_data);
+            }
+            if (ret == BC_STS_BUSY) {
+                av_log(avctx, AV_LOG_WARNING,
+                       "CrystalHD: ProcInput returned busy\n");
+                usleep(BASE_WAIT);
+                return AVERROR(EBUSY);
+            } else if (ret != BC_STS_SUCCESS) {
+                av_log(avctx, AV_LOG_ERROR,
+                       "CrystalHD: ProcInput failed: %u\n", ret);
+                return -1;
+            }
+            avctx->has_b_frames++;
+        } else {
+            av_log(avctx, AV_LOG_WARNING, "CrystalHD: Input buffer full\n");
+            len = 0; // We didn't consume any bytes.
+        }
+    } else {
+        av_log(avctx, AV_LOG_INFO, "CrystalHD: No more input data\n");
+    }
+
+    if (priv->skip_next_output) {
+        av_log(avctx, AV_LOG_VERBOSE, "CrystalHD: Skipping next output.\n");
+        priv->skip_next_output = 0;
+        avctx->has_b_frames--;
+        return len;
+    }
+
+    ret = DtsGetDriverStatus(dev, &decoder_status);
+    if (ret != BC_STS_SUCCESS) {
+        av_log(avctx, AV_LOG_ERROR, "CrystalHD: GetDriverStatus failed\n");
+        return -1;
+    }
+
+    /*
+     * No frames ready. Don't try to extract.
+     *
+     * Empirical testing shows that ReadyListCount can be a damn lie,
+     * and ProcOut still fails when count > 0. The same testing showed
+     * that two more iterations were needed before ProcOutput would
+     * succeed.
+     */
+    if (priv->output_ready < 2) {
+        if (decoder_status.ReadyListCount != 0)
+            priv->output_ready++;
+        usleep(BASE_WAIT);
+        av_log(avctx, AV_LOG_INFO, "CrystalHD: Filling pipeline.\n");
+        return len;
+    } else if (decoder_status.ReadyListCount == 0) {
+        /*
+         * After the pipeline is established, if we encounter a lack of frames
+         * that probably means we're not giving the hardware enough time to
+         * decode them, so start increasing the wait time at the end of a
+         * decode call.
+         */
+        usleep(BASE_WAIT);
+        priv->decode_wait += WAIT_UNIT;
+        av_log(avctx, AV_LOG_INFO, "CrystalHD: No frames ready. Returning\n");
+        return len;
+    }
+
+    do {
+        rec_ret = receive_frame(avctx, data, got_frame);
+        if (rec_ret == RET_OK && *got_frame == 0) {
+            /*
+             * This case is for when the encoded fields are stored
+             * separately and we get a separate avpkt for each one. To keep
+             * the pipeline stable, we should return nothing and wait for
+             * the next time round to grab the second field.
+             * H.264 PAFF is an example of this.
+             */
+            av_log(avctx, AV_LOG_VERBOSE, "Returning after first field.\n");
+            avctx->has_b_frames--;
+        } else if (rec_ret == RET_COPY_NEXT_FIELD) {
+            /*
+             * This case is for when the encoded fields are stored in a
+             * single avpkt but the hardware returns them separately. Unless
+             * we grab the second field before returning, we'll slip another
+             * frame in the pipeline and if that happens a lot, we're sunk.
+             * So we have to get that second field now.
+             * Interlaced mpeg2 and vc1 are examples of this.
+             */
+            av_log(avctx, AV_LOG_VERBOSE, "Trying to get second field.\n");
+            while (1) { /* NOTE(review): never exits if no field ever arrives */
+                usleep(priv->decode_wait);
+                ret = DtsGetDriverStatus(dev, &decoder_status);
+                if (ret == BC_STS_SUCCESS &&
+                    decoder_status.ReadyListCount > 0) {
+                    rec_ret = receive_frame(avctx, data, got_frame);
+                    if ((rec_ret == RET_OK && *got_frame > 0) ||
+                        rec_ret == RET_ERROR)
+                        break;
+                }
+            }
+            av_log(avctx, AV_LOG_VERBOSE, "CrystalHD: Got second field.\n");
+        } else if (rec_ret == RET_SKIP_NEXT_COPY) {
+            /* Two input packets got turned into a field pair. Gawd. */
+            av_log(avctx, AV_LOG_VERBOSE,
+                   "Don't output on next decode call.\n");
+            priv->skip_next_output = 1;
+        }
+        /*
+         * If rec_ret == RET_COPY_AGAIN, that means that either we just handled
+         * a FMT_CHANGE event and need to go around again for the actual frame,
+         * we got a busy status and need to try again, or we're dealing with
+         * packed b-frames, where the hardware strangely returns the packed
+         * p-frame twice. We choose to keep the second copy as it carries the
+         * valid pts.
+         */
+    } while (rec_ret == RET_COPY_AGAIN);
+    usleep(priv->decode_wait);
+    return len;
+}
+
+
+#if CONFIG_H264_CRYSTALHD_DECODER
+static AVClass h264_class = {
+    .class_name = "h264_crystalhd",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_h264_crystalhd_decoder = {
+    .name           = "h264_crystalhd",
+    .long_name      = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10 (CrystalHD acceleration)"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_H264,
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1,
+    .pix_fmts       = (const enum AVPixelFormat[]){AV_PIX_FMT_YUYV422, AV_PIX_FMT_NONE},
+    .priv_class     = &h264_class,
+    .priv_data_size = sizeof(CHDContext),
+    .init           = init,
+    .decode         = decode,
+    .flush          = flush,
+    .close          = uninit,
+};
+#endif /* CONFIG_H264_CRYSTALHD_DECODER */
+
+#if CONFIG_MPEG2_CRYSTALHD_DECODER
+static AVClass mpeg2_class = {
+    .class_name = "mpeg2_crystalhd",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_mpeg2_crystalhd_decoder = {
+    .name           = "mpeg2_crystalhd",
+    .long_name      = NULL_IF_CONFIG_SMALL("MPEG-2 Video (CrystalHD acceleration)"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MPEG2VIDEO,
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1,
+    .pix_fmts       = (const enum AVPixelFormat[]){AV_PIX_FMT_YUYV422, AV_PIX_FMT_NONE},
+    .priv_class     = &mpeg2_class,
+    .priv_data_size = sizeof(CHDContext),
+    .init           = init,
+    .decode         = decode,
+    .flush          = flush,
+    .close          = uninit,
+};
+#endif /* CONFIG_MPEG2_CRYSTALHD_DECODER */
+
+#if CONFIG_MPEG4_CRYSTALHD_DECODER
+static AVClass mpeg4_class = {
+    .class_name = "mpeg4_crystalhd",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_mpeg4_crystalhd_decoder = {
+    .name           = "mpeg4_crystalhd",
+    .long_name      = NULL_IF_CONFIG_SMALL("MPEG-4 Part 2 (CrystalHD acceleration)"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MPEG4,
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1,
+    .pix_fmts       = (const enum AVPixelFormat[]){AV_PIX_FMT_YUYV422, AV_PIX_FMT_NONE},
+    .priv_class     = &mpeg4_class,
+    .priv_data_size = sizeof(CHDContext),
+    .init           = init,
+    .decode         = decode,
+    .flush          = flush,
+    .close          = uninit,
+};
+#endif /* CONFIG_MPEG4_CRYSTALHD_DECODER */
+
+#if CONFIG_MSMPEG4_CRYSTALHD_DECODER
+static AVClass msmpeg4_class = {
+    .class_name = "msmpeg4_crystalhd",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_msmpeg4_crystalhd_decoder = {
+    .name           = "msmpeg4_crystalhd",
+    .long_name      = NULL_IF_CONFIG_SMALL("MPEG-4 Part 2 Microsoft variant version 3 (CrystalHD acceleration)"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MSMPEG4V3,
+    .capabilities   = AV_CODEC_CAP_EXPERIMENTAL | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1,
+    .pix_fmts       = (const enum AVPixelFormat[]){AV_PIX_FMT_YUYV422, AV_PIX_FMT_NONE},
+    .priv_class     = &msmpeg4_class,
+    .priv_data_size = sizeof(CHDContext),
+    .init           = init,
+    .decode         = decode,
+    .flush          = flush,
+    .close          = uninit,
+};
+#endif /* CONFIG_MSMPEG4_CRYSTALHD_DECODER */
+
+#if CONFIG_VC1_CRYSTALHD_DECODER
+static AVClass vc1_class = {
+    .class_name = "vc1_crystalhd",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_vc1_crystalhd_decoder = {
+    .name           = "vc1_crystalhd",
+    .long_name      = NULL_IF_CONFIG_SMALL("SMPTE VC-1 (CrystalHD acceleration)"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_VC1,
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1,
+    .pix_fmts       = (const enum AVPixelFormat[]){AV_PIX_FMT_YUYV422, AV_PIX_FMT_NONE},
+    .priv_class     = &vc1_class,
+    .priv_data_size = sizeof(CHDContext),
+    .init           = init,
+    .decode         = decode,
+    .flush          = flush,
+    .close          = uninit,
+};
+#endif /* CONFIG_VC1_CRYSTALHD_DECODER */
+
+#if CONFIG_WMV3_CRYSTALHD_DECODER
+static AVClass wmv3_class = {
+    .class_name = "wmv3_crystalhd",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_wmv3_crystalhd_decoder = {
+    .name           = "wmv3_crystalhd",
+    .long_name      = NULL_IF_CONFIG_SMALL("Windows Media Video 9 (CrystalHD acceleration)"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_WMV3,
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1,
+    .pix_fmts       = (const enum AVPixelFormat[]){AV_PIX_FMT_YUYV422, AV_PIX_FMT_NONE},
+    .priv_class     = &wmv3_class,
+    .priv_data_size = sizeof(CHDContext),
+    .init           = init,
+    .decode         = decode,
+    .flush          = flush,
+    .close          = uninit,
+};
+#endif /* CONFIG_WMV3_CRYSTALHD_DECODER */
diff --git a/libavcodec/cscd.c b/libavcodec/cscd.c
index 0cb375b..9e1dec9 100644
--- a/libavcodec/cscd.c
+++ b/libavcodec/cscd.c
@@ -2,20 +2,20 @@
  * CamStudio decoder
  * Copyright (c) 2006 Reimar Doeffinger
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include <stdio.h>
@@ -31,14 +31,15 @@
 #include "libavutil/lzo.h"
 
 typedef struct CamStudioContext {
+    AVFrame *pic;
     int linelen, height, bpp;
     unsigned int decomp_size;
     unsigned char* decomp_buf;
 } CamStudioContext;
 
-static void copy_frame_default(AVFrame *f, const uint8_t *src, int src_stride,
+static void copy_frame_default(AVFrame *f, const uint8_t *src,
                                int linelen, int height) {
-    int i;
+    int i, src_stride = FFALIGN(linelen, 4);
     uint8_t *dst = f->data[0];
     dst += (height - 1) * f->linesize[0];
     for (i = height; i; i--) {
@@ -48,9 +49,9 @@ static void copy_frame_default(AVFrame *f, const uint8_t *src, int src_stride,
     }
 }
 
-static void add_frame_default(AVFrame *f, const uint8_t *src, int src_stride,
+static void add_frame_default(AVFrame *f, const uint8_t *src,
                               int linelen, int height) {
-    int i, j;
+    int i, j, src_stride = FFALIGN(linelen, 4);
     uint8_t *dst = f->data[0];
     dst += (height - 1) * f->linesize[0];
     for (i = height; i; i--) {
@@ -61,87 +62,11 @@ static void add_frame_default(AVFrame *f, const uint8_t *src, int src_stride,
     }
 }
 
-#if !HAVE_BIGENDIAN
-#define copy_frame_16(f, s, l, h) copy_frame_default(f, s, l, l, h)
-#define copy_frame_32(f, s, l, h) copy_frame_default(f, s, l, l, h)
-#define add_frame_16(f, s, l, h) add_frame_default(f, s, l, l, h)
-#define add_frame_32(f, s, l, h) add_frame_default(f, s, l, l, h)
-#else
-static void copy_frame_16(AVFrame *f, const uint8_t *src,
-                          int linelen, int height) {
-    int i, j;
-    uint8_t *dst = f->data[0];
-    dst += (height - 1) * f->linesize[0];
-    for (i = height; i; i--) {
-        for (j = linelen / 2; j; j--) {
-          dst[0] = src[1];
-          dst[1] = src[0];
-          src += 2;
-          dst += 2;
-        }
-        dst -= f->linesize[0] + linelen;
-    }
-}
-
-static void copy_frame_32(AVFrame *f, const uint8_t *src,
-                          int linelen, int height) {
-    int i, j;
-    uint8_t *dst = f->data[0];
-    dst += (height - 1) * f->linesize[0];
-    for (i = height; i; i--) {
-        for (j = linelen / 4; j; j--) {
-          dst[0] = src[3];
-          dst[1] = src[2];
-          dst[2] = src[1];
-          dst[3] = src[0];
-          src += 4;
-          dst += 4;
-        }
-        dst -= f->linesize[0] + linelen;
-    }
-}
-
-static void add_frame_16(AVFrame *f, const uint8_t *src,
-                         int linelen, int height) {
-    int i, j;
-    uint8_t *dst = f->data[0];
-    dst += (height - 1) * f->linesize[0];
-    for (i = height; i; i--) {
-        for (j = linelen / 2; j; j--) {
-          dst[0] += src[1];
-          dst[1] += src[0];
-          src += 2;
-          dst += 2;
-        }
-        dst -= f->linesize[0] + linelen;
-    }
-}
-
-static void add_frame_32(AVFrame *f, const uint8_t *src,
-                         int linelen, int height) {
-    int i, j;
-    uint8_t *dst = f->data[0];
-    dst += (height - 1) * f->linesize[0];
-    for (i = height; i; i--) {
-        for (j = linelen / 4; j; j--) {
-          dst[0] += src[3];
-          dst[1] += src[2];
-          dst[2] += src[1];
-          dst[3] += src[0];
-          src += 4;
-          dst += 4;
-        }
-        dst -= f->linesize[0] + linelen;
-    }
-}
-#endif
-
 static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                         AVPacket *avpkt) {
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     CamStudioContext *c = avctx->priv_data;
-    AVFrame *picture = data;
     int ret;
 
     if (buf_size < 2) {
@@ -149,10 +74,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return AVERROR_INVALIDDATA;
     }
 
-    if ((ret = ff_get_buffer(avctx, picture, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, c->pic)) < 0)
         return ret;
-    }
 
     // decompress data
     switch ((buf[0] >> 1) & 7) {
@@ -180,36 +103,21 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     // flip upside down, add difference frame
     if (buf[0] & 1) { // keyframe
-        picture->pict_type = AV_PICTURE_TYPE_I;
-        picture->key_frame = 1;
-        switch (c->bpp) {
-          case 16:
-              copy_frame_16(picture, c->decomp_buf, c->linelen, c->height);
-              break;
-          case 32:
-              copy_frame_32(picture, c->decomp_buf, c->linelen, c->height);
-              break;
-          default:
-              copy_frame_default(picture, c->decomp_buf, FFALIGN(c->linelen, 4),
+        c->pic->pict_type = AV_PICTURE_TYPE_I;
+        c->pic->key_frame = 1;
+              copy_frame_default(c->pic, c->decomp_buf,
                                  c->linelen, c->height);
-        }
     } else {
-        picture->pict_type = AV_PICTURE_TYPE_P;
-        picture->key_frame = 0;
-        switch (c->bpp) {
-          case 16:
-              add_frame_16(picture, c->decomp_buf, c->linelen, c->height);
-              break;
-          case 32:
-              add_frame_32(picture, c->decomp_buf, c->linelen, c->height);
-              break;
-          default:
-              add_frame_default(picture, c->decomp_buf, FFALIGN(c->linelen, 4),
+        c->pic->pict_type = AV_PICTURE_TYPE_P;
+        c->pic->key_frame = 0;
+              add_frame_default(c->pic, c->decomp_buf,
                                 c->linelen, c->height);
-        }
     }
 
     *got_frame = 1;
+    if ((ret = av_frame_ref(data, c->pic)) < 0)
+        return ret;
+
     return buf_size;
 }
 
@@ -217,9 +125,9 @@ static av_cold int decode_init(AVCodecContext *avctx) {
     CamStudioContext *c = avctx->priv_data;
     int stride;
     switch (avctx->bits_per_coded_sample) {
-        case 16: avctx->pix_fmt = AV_PIX_FMT_RGB555; break;
+        case 16: avctx->pix_fmt = AV_PIX_FMT_RGB555LE; break;
         case 24: avctx->pix_fmt = AV_PIX_FMT_BGR24; break;
-        case 32: avctx->pix_fmt = AV_PIX_FMT_RGB32; break;
+        case 32: avctx->pix_fmt = AV_PIX_FMT_BGR0; break;
         default:
             av_log(avctx, AV_LOG_ERROR,
                    "CamStudio codec error: invalid depth %i bpp\n",
@@ -229,21 +137,23 @@ static av_cold int decode_init(AVCodecContext *avctx) {
     c->bpp = avctx->bits_per_coded_sample;
     c->linelen = avctx->width * avctx->bits_per_coded_sample / 8;
     c->height = avctx->height;
-    stride = c->linelen;
-    if (avctx->bits_per_coded_sample == 24)
-        stride = FFALIGN(stride, 4);
+    stride = FFALIGN(c->linelen, 4);
     c->decomp_size = c->height * stride;
     c->decomp_buf = av_malloc(c->decomp_size + AV_LZO_OUTPUT_PADDING);
     if (!c->decomp_buf) {
         av_log(avctx, AV_LOG_ERROR, "Can't allocate decompression buffer.\n");
         return AVERROR(ENOMEM);
     }
+    c->pic = av_frame_alloc();
+    if (!c->pic)
+        return AVERROR(ENOMEM);
     return 0;
 }
 
 static av_cold int decode_end(AVCodecContext *avctx) {
     CamStudioContext *c = avctx->priv_data;
     av_freep(&c->decomp_buf);
+    av_frame_free(&c->pic); /* frame allocated with av_frame_alloc() in decode_init() */
     return 0;
 }
 
diff --git a/libavcodec/cuvid.c b/libavcodec/cuvid.c
new file mode 100644
index 0000000..1da0e87
--- /dev/null
+++ b/libavcodec/cuvid.c
@@ -0,0 +1,698 @@
+/*
+ * Nvidia CUVID decoder
+ * Copyright (c) 2016 Timo Rothenpieler <timo@rothenpieler.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/buffer.h"
+#include "libavutil/mathematics.h"
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_cuda.h"
+#include "libavutil/fifo.h"
+#include "libavutil/log.h"
+
+#include "avcodec.h"
+#include "internal.h"
+
+#include <nvcuvid.h>
+
+#define MAX_FRAME_COUNT 20
+
+typedef struct CuvidContext
+{
+    CUvideodecoder cudecoder; /* created in the sequence callback, destroyed in cuvid_decode_end() */
+    CUvideoparser cuparser;   /* created in cuvid_decode_init(), fed by cuvid_decode_frame() */
+
+    AVBufferRef *hwdevice;    /* reference to the CUDA AVHWDeviceContext */
+    AVBufferRef *hwframe;     /* reference to the AVHWFramesContext used for output frames */
+
+    AVBSFContext *bsf;        /* mp4toannexb filter, only set up for H.264 and HEVC */
+
+    AVFifoBuffer *frame_queue; /* CUVIDPARSERDISPINFO entries written by the display callback */
+
+    int internal_error;       /* AVERROR code recorded by the parser callbacks, 0 if none */
+
+    cudaVideoCodec codec_type;           /* codec the decoder was created for */
+    cudaVideoChromaFormat chroma_format; /* chroma format the decoder was created for */
+} CuvidContext;
+
+/* Trace a CUDA call; on failure log its name/description and return AVERROR_EXTERNAL. */
+static int check_cu(AVCodecContext *avctx, CUresult err, const char *func)
+{
+    const char *name, *desc;
+
+    av_log(avctx, AV_LOG_TRACE, "Calling %s\n", func);
+
+    if (err != CUDA_SUCCESS) {
+        cuGetErrorName(err, &name);
+        cuGetErrorString(err, &desc);
+
+        av_log(avctx, AV_LOG_ERROR, "%s failed", func);
+        if (name && desc)
+            av_log(avctx, AV_LOG_ERROR, " -> %s: %s", name, desc);
+        av_log(avctx, AV_LOG_ERROR, "\n");
+
+        return AVERROR_EXTERNAL;
+    }
+    return 0;
+}
+
+#define CHECK_CU(x) check_cu(avctx, (x), #x)
+
+static int CUDAAPI cuvid_handle_video_sequence(void *opaque, CUVIDEOFORMAT* format)
+{
+    AVCodecContext *avctx = opaque;
+    CuvidContext *ctx = avctx->priv_data;
+    AVHWFramesContext *hwframe_ctx = (AVHWFramesContext*)ctx->hwframe->data;
+    CUVIDDECODECREATEINFO cuinfo;
+    /* Sequence callback: syncs avctx with the stream, creates the decoder once; returns 1, or 0 on error. */
+    av_log(avctx, AV_LOG_TRACE, "pfnSequenceCallback\n");
+
+    ctx->internal_error = 0;
+
+    avctx->width = format->display_area.right;
+    avctx->height = format->display_area.bottom;
+
+    ff_set_sar(avctx, av_div_q(
+        (AVRational){ format->display_aspect_ratio.x, format->display_aspect_ratio.y },
+        (AVRational){ avctx->width, avctx->height }));
+
+    if (!format->progressive_sequence)
+        avctx->flags |= AV_CODEC_FLAG_INTERLACED_DCT;
+    else
+        avctx->flags &= ~AV_CODEC_FLAG_INTERLACED_DCT;
+
+    if (format->video_signal_description.video_full_range_flag)
+        avctx->color_range = AVCOL_RANGE_JPEG;
+    else
+        avctx->color_range = AVCOL_RANGE_MPEG;
+
+    avctx->color_primaries = format->video_signal_description.color_primaries;
+    avctx->color_trc = format->video_signal_description.transfer_characteristics;
+    avctx->colorspace = format->video_signal_description.matrix_coefficients;
+
+    if (format->bitrate)
+        avctx->bit_rate = format->bitrate;
+
+    if (format->frame_rate.numerator && format->frame_rate.denominator) {
+        avctx->framerate.num = format->frame_rate.numerator;
+        avctx->framerate.den = format->frame_rate.denominator;
+    }
+    /* A decoder exists and coded size, chroma format and codec are unchanged: nothing more to do. */
+    if (ctx->cudecoder
+            && avctx->coded_width == format->coded_width
+            && avctx->coded_height == format->coded_height
+            && ctx->chroma_format == format->chroma_format
+            && ctx->codec_type == format->codec)
+        return 1;
+    /* A decoder exists but one of those parameters changed: rejected rather than re-created. */
+    if (ctx->cudecoder) {
+        av_log(avctx, AV_LOG_ERROR, "re-initializing decoder is not supported\n");
+        ctx->internal_error = AVERROR(EINVAL);
+        return 0;
+    }
+    /* The frames context must not have a pool yet; it is initialized at the end of this function. */
+    if (hwframe_ctx->pool) {
+        av_log(avctx, AV_LOG_ERROR, "AVHWFramesContext is already initialized\n");
+        ctx->internal_error = AVERROR(EINVAL);
+        return 0;
+    }
+
+    avctx->coded_width = format->coded_width;
+    avctx->coded_height = format->coded_height;
+
+    ctx->chroma_format = format->chroma_format;
+
+    memset(&cuinfo, 0, sizeof(cuinfo));
+
+    cuinfo.CodecType = ctx->codec_type = format->codec;
+    cuinfo.ChromaFormat = format->chroma_format;
+    cuinfo.OutputFormat = cudaVideoSurfaceFormat_NV12;
+
+    cuinfo.ulWidth = avctx->coded_width;
+    cuinfo.ulHeight = avctx->coded_height;
+    cuinfo.ulTargetWidth = cuinfo.ulWidth;
+    cuinfo.ulTargetHeight = cuinfo.ulHeight;
+
+    cuinfo.target_rect.left = 0;
+    cuinfo.target_rect.top = 0;
+    cuinfo.target_rect.right = cuinfo.ulWidth;
+    cuinfo.target_rect.bottom = cuinfo.ulHeight;
+
+    cuinfo.ulNumDecodeSurfaces = MAX_FRAME_COUNT;
+    cuinfo.ulNumOutputSurfaces = 1;
+    cuinfo.ulCreationFlags = cudaVideoCreate_PreferCUVID;
+
+    cuinfo.DeinterlaceMode = cudaVideoDeinterlaceMode_Weave;
+
+    ctx->internal_error = CHECK_CU(cuvidCreateDecoder(&ctx->cudecoder, &cuinfo));
+    if (ctx->internal_error < 0)
+        return 0;
+    /* Frame pool: CUDA frames holding NV12, sized to the coded dimensions rounded up to 32. */
+    hwframe_ctx->format = AV_PIX_FMT_CUDA;
+    hwframe_ctx->sw_format = AV_PIX_FMT_NV12;
+    hwframe_ctx->width = FFALIGN(avctx->coded_width, 32);
+    hwframe_ctx->height = FFALIGN(avctx->coded_height, 32);
+
+    if ((ctx->internal_error = av_hwframe_ctx_init(ctx->hwframe)) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "av_hwframe_ctx_init failed\n");
+        return 0;
+    }
+
+    return 1;
+}
+
+/* Parser decode callback: submits one picture; returns 1 on success, 0 on failure. */
+static int CUDAAPI cuvid_handle_picture_decode(void *opaque, CUVIDPICPARAMS* picparams)
+{
+    AVCodecContext *avctx = opaque;
+    CuvidContext *ctx = avctx->priv_data;
+    int status;
+
+    av_log(avctx, AV_LOG_TRACE, "pfnDecodePicture\n");
+
+    status = CHECK_CU(cuvidDecodePicture(ctx->cudecoder, picparams));
+    ctx->internal_error = status;
+    return status >= 0;
+}
+
+static int CUDAAPI cuvid_handle_picture_display(void *opaque, CUVIDPARSERDISPINFO* dispinfo)
+{
+    AVCodecContext *avctx = opaque;
+    CuvidContext *ctx = avctx->priv_data;
+    /* Parser display callback: queues *dispinfo for cuvid_decode_frame() and always returns 1. */
+    av_log(avctx, AV_LOG_TRACE, "pfnDisplayPicture\n");
+
+    ctx->internal_error = 0;
+    /* NOTE(review): no free-space check before this write; confirm the queue cannot fill up. */
+    av_fifo_generic_write(ctx->frame_queue, dispinfo, sizeof(CUVIDPARSERDISPINFO), NULL);
+
+    return 1;
+}
+
+/* Pass one packet (an empty one signals end of stream) to the CUVID parser and
+ * output at most one frame queued by the display callback (*got_frame = 1).
+ * Returns a non-negative value on success or a negative AVERROR code. */
+static int cuvid_decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPacket *avpkt)
+{
+    CuvidContext *ctx = avctx->priv_data;
+    AVHWDeviceContext *device_ctx = (AVHWDeviceContext*)ctx->hwdevice->data;
+    AVCUDADeviceContext *device_hwctx = device_ctx->hwctx;
+    CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx;
+    AVFrame *frame = data;
+    CUVIDSOURCEDATAPACKET cupkt;
+    AVPacket filter_packet = { 0 };
+    AVPacket filtered_packet = { 0 };
+    CUdeviceptr mapped_frame = 0;
+    int ret = 0, eret = 0, pop_ret;
+
+    if (ctx->bsf && avpkt->size) {
+        if ((ret = av_packet_ref(&filter_packet, avpkt)) < 0) {
+            av_log(avctx, AV_LOG_ERROR, "av_packet_ref failed\n");
+            return ret;
+        }
+
+        if ((ret = av_bsf_send_packet(ctx->bsf, &filter_packet)) < 0) {
+            av_log(avctx, AV_LOG_ERROR, "av_bsf_send_packet failed\n");
+            av_packet_unref(&filter_packet);
+            return ret;
+        }
+
+        if ((ret = av_bsf_receive_packet(ctx->bsf, &filtered_packet)) < 0) {
+            av_log(avctx, AV_LOG_ERROR, "av_bsf_receive_packet failed\n");
+            return ret;
+        }
+
+        avpkt = &filtered_packet;
+    }
+
+    ret = CHECK_CU(cuCtxPushCurrent(cuda_ctx));
+    if (ret < 0) {
+        av_packet_unref(&filtered_packet);
+        return ret;
+    }
+
+    memset(&cupkt, 0, sizeof(cupkt));
+    if (avpkt->size) {
+        cupkt.payload_size = avpkt->size;
+        cupkt.payload = avpkt->data;
+        if (avpkt->pts != AV_NOPTS_VALUE) {
+            cupkt.flags = CUVID_PKT_TIMESTAMP;
+            cupkt.timestamp = av_rescale_q(avpkt->pts, avctx->time_base, (AVRational){1, 10000000});
+        }
+    } else {
+        cupkt.flags = CUVID_PKT_ENDOFSTREAM;
+    }
+
+    ret = CHECK_CU(cuvidParseVideoData(ctx->cuparser, &cupkt));
+    av_packet_unref(&filtered_packet);
+    if (ret < 0) {
+        if (ctx->internal_error)
+            ret = ctx->internal_error;
+        goto error;
+    }
+
+    if (av_fifo_size(ctx->frame_queue)) {
+        CUVIDPARSERDISPINFO dispinfo;
+        CUVIDPROCPARAMS params;
+        unsigned int pitch = 0;
+        int offset = 0, i;
+
+        av_fifo_generic_read(ctx->frame_queue, &dispinfo, sizeof(CUVIDPARSERDISPINFO), NULL);
+        memset(&params, 0, sizeof(params));
+        params.progressive_frame = dispinfo.progressive_frame;
+        params.second_field = 0;
+        params.top_field_first = dispinfo.top_field_first;
+
+        ret = CHECK_CU(cuvidMapVideoFrame(ctx->cudecoder, dispinfo.picture_index, &mapped_frame, &pitch, &params));
+        if (ret < 0)
+            goto error;
+
+        if (avctx->pix_fmt == AV_PIX_FMT_CUDA) {
+            ret = av_hwframe_get_buffer(ctx->hwframe, frame, 0);
+            if (ret < 0) {
+                av_log(avctx, AV_LOG_ERROR, "av_hwframe_get_buffer failed\n");
+                goto error;
+            }
+
+            ret = ff_decode_frame_props(avctx, frame);
+            if (ret < 0) {
+                av_log(avctx, AV_LOG_ERROR, "ff_decode_frame_props failed\n");
+                goto error;
+            }
+
+            for (i = 0; i < 2; i++) {
+                CUDA_MEMCPY2D cpy = {
+                    .srcMemoryType = CU_MEMORYTYPE_DEVICE,
+                    .dstMemoryType = CU_MEMORYTYPE_DEVICE,
+                    .srcDevice     = mapped_frame,
+                    .dstDevice     = (CUdeviceptr)frame->data[i],
+                    .srcPitch      = pitch,
+                    .dstPitch      = frame->linesize[i],
+                    .srcY          = offset,
+                    .WidthInBytes  = FFMIN(pitch, frame->linesize[i]),
+                    .Height        = avctx->coded_height >> (i ? 1 : 0),
+                };
+
+                ret = CHECK_CU(cuMemcpy2D(&cpy));
+                if (ret < 0)
+                    goto error;
+
+                offset += avctx->coded_height;
+            }
+        } else if (avctx->pix_fmt == AV_PIX_FMT_NV12) {
+            AVFrame *tmp_frame = av_frame_alloc();
+            if (!tmp_frame) {
+                av_log(avctx, AV_LOG_ERROR, "av_frame_alloc failed\n");
+                ret = AVERROR(ENOMEM);
+                goto error;
+            }
+
+            tmp_frame->format        = AV_PIX_FMT_CUDA;
+            tmp_frame->hw_frames_ctx = av_buffer_ref(ctx->hwframe);
+            tmp_frame->data[0]       = (uint8_t*)mapped_frame;
+            tmp_frame->linesize[0]   = pitch;
+            tmp_frame->data[1]       = (uint8_t*)(mapped_frame + avctx->coded_height * pitch);
+            tmp_frame->linesize[1]   = pitch;
+            tmp_frame->width         = avctx->width;
+            tmp_frame->height        = avctx->height;
+
+            /* The transfer below needs the frames context, so a failed ref is fatal. */
+            if (!tmp_frame->hw_frames_ctx) {
+                av_log(avctx, AV_LOG_ERROR, "av_buffer_ref failed\n");
+                av_frame_free(&tmp_frame);
+                ret = AVERROR(ENOMEM);
+                goto error;
+            }
+
+            ret = ff_get_buffer(avctx, frame, 0);
+            if (ret < 0) {
+                av_log(avctx, AV_LOG_ERROR, "ff_get_buffer failed\n");
+                av_frame_free(&tmp_frame);
+                goto error;
+            }
+
+            ret = av_hwframe_transfer_data(frame, tmp_frame, 0);
+            av_frame_free(&tmp_frame);
+            if (ret) {
+                av_log(avctx, AV_LOG_ERROR, "av_hwframe_transfer_data failed\n");
+                goto error;
+            }
+        } else {
+            ret = AVERROR_BUG;
+            goto error;
+        }
+
+        frame->width = avctx->width;
+        frame->height = avctx->height;
+        frame->pts = av_rescale_q(dispinfo.timestamp, (AVRational){1, 10000000}, avctx->time_base);
+
+        /* CUVIDs opaque reordering breaks the internal pkt logic.
+         * So set pkt_pts and clear all the other pkt_ fields. */
+        frame->pkt_pts = frame->pts;
+        av_frame_set_pkt_pos(frame, -1);
+        av_frame_set_pkt_duration(frame, 0);
+        av_frame_set_pkt_size(frame, -1);
+
+        frame->interlaced_frame = !dispinfo.progressive_frame;
+        if (!dispinfo.progressive_frame)
+            frame->top_field_first = dispinfo.top_field_first;
+
+        *got_frame = 1;
+    } else {
+        *got_frame = 0;
+    }
+
+error:
+    if (mapped_frame)
+        eret = CHECK_CU(cuvidUnmapVideoFrame(ctx->cudecoder, mapped_frame));
+
+    /* Always pop the context, but do not let a successful pop hide an unmap failure. */
+    if ((pop_ret = CHECK_CU(cuCtxPopCurrent(&dummy))) < 0)
+        eret = pop_ret;
+
+    return eret < 0 ? eret : ret;
+}
+
+/* Release everything cuvid_decode_init() and the parser callbacks set up; always returns 0. */
+static av_cold int cuvid_decode_end(AVCodecContext *avctx)
+{
+    CuvidContext *priv = avctx->priv_data;
+
+    av_fifo_freep(&priv->frame_queue);
+    if (priv->bsf) {
+        av_bsf_free(&priv->bsf);
+    }
+    if (priv->cuparser) {
+        cuvidDestroyVideoParser(priv->cuparser);
+    }
+    if (priv->cudecoder) {
+        cuvidDestroyDecoder(priv->cudecoder);
+    }
+    /* Drop the frames context reference before the device reference. */
+    av_buffer_unref(&priv->hwframe);
+    av_buffer_unref(&priv->hwdevice);
+    return 0;
+}
+
+/* AVHWDeviceContext free callback: destroys the CUDA context stored in the device hwctx. */
+static void cuvid_ctx_free(AVHWDeviceContext *ctx)
+{
+    cuCtxDestroy(((AVCUDADeviceContext *)ctx->hwctx)->cuda_ctx);
+}
+
+/*
+ * Check that a decoder for cuparseinfo->CodecType can be created by creating
+ * and immediately destroying a 1280x720 4:2:0 NV12 test decoder.
+ * Returns 0 on success or the negative error from the failing CUDA call.
+ */
+static int cuvid_test_dummy_decoder(AVCodecContext *avctx, CUVIDPARSERPARAMS *cuparseinfo)
+{
+    CUVIDDECODECREATEINFO cuinfo;
+    CUvideodecoder cudec = 0;
+    int status;
+
+    memset(&cuinfo, 0, sizeof(cuinfo));
+
+    /* Fixed test geometry: source size, target size and target rect all match. */
+    cuinfo.ulWidth  = cuinfo.ulTargetWidth  = 1280;
+    cuinfo.ulHeight = cuinfo.ulTargetHeight = 720;
+    cuinfo.target_rect.left   = 0;
+    cuinfo.target_rect.top    = 0;
+    cuinfo.target_rect.right  = cuinfo.ulWidth;
+    cuinfo.target_rect.bottom = cuinfo.ulHeight;
+
+    cuinfo.CodecType       = cuparseinfo->CodecType;
+    cuinfo.ChromaFormat    = cudaVideoChromaFormat_420;
+    cuinfo.OutputFormat    = cudaVideoSurfaceFormat_NV12;
+    cuinfo.DeinterlaceMode = cudaVideoDeinterlaceMode_Weave;
+
+    cuinfo.ulNumDecodeSurfaces = MAX_FRAME_COUNT;
+    cuinfo.ulNumOutputSurfaces = 1;
+    cuinfo.ulCreationFlags     = cudaVideoCreate_PreferCUVID;
+
+    status = CHECK_CU(cuvidCreateDecoder(&cudec, &cuinfo));
+    if (status < 0)
+        return status;
+
+    /* Only a negative result is reported; anything else counts as success. */
+    status = CHECK_CU(cuvidDestroyDecoder(cudec));
+    return status < 0 ? status : 0;
+}
+
+/* Set up the CUDA device/frames contexts, the optional mp4toannexb filter and the
+ * CUVID parser. Once anything is allocated, failures unwind via cuvid_decode_end().
+ * Returns 0 on success or a negative AVERROR code. */
+static av_cold int cuvid_decode_init(AVCodecContext *avctx)
+{
+    CuvidContext *ctx = avctx->priv_data;
+    AVCUDADeviceContext *device_hwctx;
+    AVHWDeviceContext *device_ctx;
+    AVHWFramesContext *hwframe_ctx;
+    CUVIDPARSERPARAMS cuparseinfo;
+    CUVIDEOFORMATEX cuparse_ext;
+    CUVIDSOURCEDATAPACKET seq_pkt;
+    CUdevice device;
+    CUcontext dummy, cuda_ctx = NULL;
+    const AVBitStreamFilter *bsf;
+    int ret = 0, ctx_pushed = 0;
+    enum AVPixelFormat pix_fmts[3] = { AV_PIX_FMT_CUDA, AV_PIX_FMT_NV12, AV_PIX_FMT_NONE };
+
+    ret = ff_get_format(avctx, pix_fmts);
+    if (ret < 0) {
+        av_log(avctx, AV_LOG_ERROR, "ff_get_format failed: %d\n", ret);
+        return ret;
+    }
+    avctx->pix_fmt = ret;
+
+    ctx->frame_queue = av_fifo_alloc(MAX_FRAME_COUNT * sizeof(CUVIDPARSERDISPINFO));
+    if (!ctx->frame_queue) {
+        ret = AVERROR(ENOMEM);
+        goto error;
+    }
+
+    if (avctx->hw_frames_ctx) {
+        ctx->hwframe = av_buffer_ref(avctx->hw_frames_ctx);
+        if (!ctx->hwframe) {
+            ret = AVERROR(ENOMEM);
+            goto error;
+        }
+
+        hwframe_ctx = (AVHWFramesContext*)ctx->hwframe->data;
+
+        ctx->hwdevice = av_buffer_ref(hwframe_ctx->device_ref);
+        if (!ctx->hwdevice) {
+            ret = AVERROR(ENOMEM);
+            goto error;
+        }
+
+        device_ctx = hwframe_ctx->device_ctx;
+        device_hwctx = device_ctx->hwctx;
+        cuda_ctx = device_hwctx->cuda_ctx;
+    } else {
+        ctx->hwdevice = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_CUDA);
+        if (!ctx->hwdevice) {
+            av_log(avctx, AV_LOG_ERROR, "Error allocating hwdevice\n");
+            ret = AVERROR(ENOMEM);
+            goto error;
+        }
+
+        ret = CHECK_CU(cuInit(0));
+        if (ret < 0)
+            goto error;
+
+        ret = CHECK_CU(cuDeviceGet(&device, 0));
+        if (ret < 0)
+            goto error;
+
+        ret = CHECK_CU(cuCtxCreate(&cuda_ctx, CU_CTX_SCHED_BLOCKING_SYNC, device));
+        if (ret < 0)
+            goto error;
+
+        device_ctx = (AVHWDeviceContext*)ctx->hwdevice->data;
+        device_ctx->free = cuvid_ctx_free;
+
+        device_hwctx = device_ctx->hwctx;
+        device_hwctx->cuda_ctx = cuda_ctx;
+
+        ret = CHECK_CU(cuCtxPopCurrent(&dummy));
+        if (ret < 0)
+            goto error;
+
+        ret = av_hwdevice_ctx_init(ctx->hwdevice);
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_ERROR, "av_hwdevice_ctx_init failed\n");
+            goto error;
+        }
+
+        ctx->hwframe = av_hwframe_ctx_alloc(ctx->hwdevice);
+        if (!ctx->hwframe) {
+            av_log(avctx, AV_LOG_ERROR, "av_hwframe_ctx_alloc failed\n");
+            ret = AVERROR(ENOMEM);
+            goto error;
+        }
+    }
+
+    memset(&cuparseinfo, 0, sizeof(cuparseinfo));
+    memset(&cuparse_ext, 0, sizeof(cuparse_ext));
+    memset(&seq_pkt, 0, sizeof(seq_pkt));
+    cuparseinfo.pExtVideoInfo = &cuparse_ext;
+
+    switch (avctx->codec->id) {
+#if CONFIG_H264_CUVID_DECODER
+    case AV_CODEC_ID_H264:
+        cuparseinfo.CodecType = cudaVideoCodec_H264;
+        break;
+#endif
+#if CONFIG_HEVC_CUVID_DECODER
+    case AV_CODEC_ID_HEVC:
+        cuparseinfo.CodecType = cudaVideoCodec_HEVC;
+        break;
+#endif
+#if CONFIG_VP8_CUVID_DECODER
+    case AV_CODEC_ID_VP8:
+        cuparseinfo.CodecType = cudaVideoCodec_VP8;
+        break;
+#endif
+#if CONFIG_VP9_CUVID_DECODER
+    case AV_CODEC_ID_VP9:
+        cuparseinfo.CodecType = cudaVideoCodec_VP9;
+        break;
+#endif
+#if CONFIG_VC1_CUVID_DECODER
+    case AV_CODEC_ID_VC1:
+        cuparseinfo.CodecType = cudaVideoCodec_VC1;
+        break;
+#endif
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Invalid CUVID codec!\n");
+        ret = AVERROR_BUG; /* leave through the cleanup path like every other failure */
+        goto error;
+    }
+
+    if (avctx->codec->id == AV_CODEC_ID_H264 || avctx->codec->id == AV_CODEC_ID_HEVC) {
+        bsf = av_bsf_get_by_name(avctx->codec->id == AV_CODEC_ID_H264 ? "h264_mp4toannexb"
+                                                                      : "hevc_mp4toannexb");
+
+        if (!bsf) {
+            ret = AVERROR_BSF_NOT_FOUND;
+            goto error;
+        }
+        if ((ret = av_bsf_alloc(bsf, &ctx->bsf)) < 0)
+            goto error;
+        if (((ret = avcodec_parameters_from_context(ctx->bsf->par_in, avctx)) < 0) || ((ret = av_bsf_init(ctx->bsf)) < 0)) {
+            av_bsf_free(&ctx->bsf);
+            goto error;
+        }
+
+        /* Never announce more sequence header bytes than are actually copied into the buffer. */
+        cuparse_ext.format.seqhdr_data_length = FFMIN(sizeof(cuparse_ext.raw_seqhdr_data), ctx->bsf->par_out->extradata_size);
+        memcpy(cuparse_ext.raw_seqhdr_data,
+               ctx->bsf->par_out->extradata,
+               cuparse_ext.format.seqhdr_data_length);
+    } else if (avctx->extradata_size > 0) {
+        cuparse_ext.format.seqhdr_data_length = FFMIN(sizeof(cuparse_ext.raw_seqhdr_data), avctx->extradata_size);
+        memcpy(cuparse_ext.raw_seqhdr_data,
+               avctx->extradata,
+               cuparse_ext.format.seqhdr_data_length);
+    }
+
+    cuparseinfo.ulMaxNumDecodeSurfaces = MAX_FRAME_COUNT;
+    cuparseinfo.ulMaxDisplayDelay = 4;
+    cuparseinfo.pUserData = avctx;
+    cuparseinfo.pfnSequenceCallback = cuvid_handle_video_sequence;
+    cuparseinfo.pfnDecodePicture = cuvid_handle_picture_decode;
+    cuparseinfo.pfnDisplayPicture = cuvid_handle_picture_display;
+
+    ret = CHECK_CU(cuCtxPushCurrent(cuda_ctx));
+    if (ret < 0)
+        goto error;
+    ctx_pushed = 1;
+
+    ret = cuvid_test_dummy_decoder(avctx, &cuparseinfo);
+    if (ret < 0)
+        goto error;
+
+    ret = CHECK_CU(cuvidCreateVideoParser(&ctx->cuparser, &cuparseinfo));
+    if (ret < 0)
+        goto error;
+
+    seq_pkt.payload = cuparse_ext.raw_seqhdr_data;
+    seq_pkt.payload_size = cuparse_ext.format.seqhdr_data_length;
+    if (seq_pkt.payload && seq_pkt.payload_size) {
+        ret = CHECK_CU(cuvidParseVideoData(ctx->cuparser, &seq_pkt));
+        if (ret < 0)
+            goto error;
+    }
+
+    ctx_pushed = 0; /* the pop below is the only attempt, whether or not it succeeds */
+    ret = CHECK_CU(cuCtxPopCurrent(&dummy));
+    if (ret < 0)
+        goto error;
+
+    return 0;
+
+error:
+    /* Do not leave the context pushed on this thread when failing after the push. */
+    if (ctx_pushed)
+        CHECK_CU(cuCtxPopCurrent(&dummy));
+    cuvid_decode_end(avctx);
+    return ret;
+}
+
+#define DEFINE_CUVID_CODEC(x, X) \
+    AVHWAccel ff_##x##_cuvid_hwaccel = { \
+        .name           = #x "_cuvid", \
+        .type           = AVMEDIA_TYPE_VIDEO, \
+        .id             = AV_CODEC_ID_##X, \
+        .pix_fmt        = AV_PIX_FMT_CUDA, \
+    }; \
+    AVCodec ff_##x##_cuvid_decoder = { \
+        .name           = #x "_cuvid", \
+        .long_name      = NULL_IF_CONFIG_SMALL("Nvidia CUVID " #X " decoder"), \
+        .type           = AVMEDIA_TYPE_VIDEO, \
+        .id             = AV_CODEC_ID_##X, \
+        .priv_data_size = sizeof(CuvidContext), \
+        .init           = cuvid_decode_init, \
+        .close          = cuvid_decode_end, \
+        .decode         = cuvid_decode_frame, \
+        .capabilities   = AV_CODEC_CAP_DELAY, \
+        .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_CUDA, \
+                                                        AV_PIX_FMT_NV12, \
+                                                        AV_PIX_FMT_NONE }, \
+    };
+/* DEFINE_CUVID_CODEC(x, X) emits ff_<x>_cuvid_hwaccel and ff_<x>_cuvid_decoder; one use per enabled codec. */
+#if CONFIG_HEVC_CUVID_DECODER
+DEFINE_CUVID_CODEC(hevc, HEVC)
+#endif
+
+#if CONFIG_H264_CUVID_DECODER
+DEFINE_CUVID_CODEC(h264, H264)
+#endif
+
+#if CONFIG_VP8_CUVID_DECODER
+DEFINE_CUVID_CODEC(vp8, VP8)
+#endif
+
+#if CONFIG_VP9_CUVID_DECODER
+DEFINE_CUVID_CODEC(vp9, VP9)
+#endif
+
+#if CONFIG_VC1_CUVID_DECODER
+DEFINE_CUVID_CODEC(vc1, VC1)
+#endif
diff --git a/libavcodec/cyuv.c b/libavcodec/cyuv.c
index 86f7aac..8e6749b 100644
--- a/libavcodec/cyuv.c
+++ b/libavcodec/cyuv.c
@@ -6,20 +6,20 @@
  *
  * Copyright (C) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -52,7 +52,6 @@ static av_cold int cyuv_decode_init(AVCodecContext *avctx)
     if (s->width & 0x3)
         return AVERROR_INVALIDDATA;
     s->height = avctx->height;
-    avctx->pix_fmt = AV_PIX_FMT_YUV411P;
 
     return 0;
 }
@@ -82,6 +81,7 @@ static int cyuv_decode_frame(AVCodecContext *avctx,
     int stream_ptr;
     unsigned char cur_byte;
     int pixel_groups;
+    int rawsize = s->height * FFALIGN(s->width,2) * 2;
     int ret;
 
     if (avctx->codec_id == AV_CODEC_ID_AURA) {
@@ -92,7 +92,11 @@ static int cyuv_decode_frame(AVCodecContext *avctx,
      * followed by (height) lines each with 3 bytes to represent groups
      * of 4 pixels. Thus, the total size of the buffer ought to be:
      *    (3 * 16) + height * (width * 3 / 4) */
-    if (buf_size != 48 + s->height * (s->width * 3 / 4)) {
+    if (buf_size == 48 + s->height * (s->width * 3 / 4)) {
+        avctx->pix_fmt = AV_PIX_FMT_YUV411P;
+    } else if(buf_size == rawsize ) {
+        avctx->pix_fmt = AV_PIX_FMT_UYVY422;
+    } else {
         av_log(avctx, AV_LOG_ERROR, "got a buffer with %d bytes when %d were expected\n",
                buf_size, 48 + s->height * (s->width * 3 / 4));
         return AVERROR_INVALIDDATA;
@@ -101,15 +105,22 @@ static int cyuv_decode_frame(AVCodecContext *avctx,
     /* pixel data starts 48 bytes in, after 3x16-byte tables */
     stream_ptr = 48;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     y_plane = frame->data[0];
     u_plane = frame->data[1];
     v_plane = frame->data[2];
 
+    if (buf_size == rawsize) {
+        int linesize = FFALIGN(s->width,2) * 2;
+        y_plane += frame->linesize[0] * s->height;
+        for (stream_ptr = 0; stream_ptr < rawsize; stream_ptr += linesize) {
+            y_plane -= frame->linesize[0];
+            memcpy(y_plane, buf+stream_ptr, linesize);
+        }
+    } else {
+
     /* iterate through each line in the height */
     for (y_ptr = 0, u_ptr = 0, v_ptr = 0;
          y_ptr < (s->height * frame->linesize[0]);
@@ -157,6 +168,7 @@ static int cyuv_decode_frame(AVCodecContext *avctx,
 
         }
     }
+    }
 
     *got_frame = 1;
 
diff --git a/libavcodec/d3d11va.c b/libavcodec/d3d11va.c
index 946de06..9967f32 100644
--- a/libavcodec/d3d11va.c
+++ b/libavcodec/d3d11va.c
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2015 Steve Lhomme
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/d3d11va.h b/libavcodec/d3d11va.h
index 9264ec6..6816b6c 100644
--- a/libavcodec/d3d11va.h
+++ b/libavcodec/d3d11va.h
@@ -4,20 +4,20 @@
  * copyright (c) 2009 Laurent Aimar
  * copyright (c) 2015 Steve Lhomme
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -50,7 +50,7 @@
 
 /**
  * This structure is used to provides the necessary configurations and data
- * to the Direct3D11 Libav HWAccel implementation.
+ * to the Direct3D11 FFmpeg HWAccel implementation.
  *
  * The application must make it available as AVCodecContext.hwaccel_context.
  *
@@ -88,7 +88,7 @@ typedef struct AVD3D11VAContext {
     uint64_t workaround;
 
     /**
-     * Private to the Libav AVHWAccel implementation
+     * Private to the FFmpeg AVHWAccel implementation
      */
     unsigned report_id;
 
diff --git a/libavcodec/dca.c b/libavcodec/dca.c
index c5daf07..58f340e 100644
--- a/libavcodec/dca.c
+++ b/libavcodec/dca.c
@@ -1,20 +1,24 @@
 /*
  * DCA compatible decoder data
+ * Copyright (C) 2004 Gildas Bazin
+ * Copyright (C) 2004 Benjamin Zores
+ * Copyright (C) 2006 Benjamin Larsson
+ * Copyright (C) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,7 +36,16 @@ const uint32_t avpriv_dca_sample_rates[16] = {
     12000, 24000, 48000, 96000, 192000
 };
 
-int ff_dca_convert_bitstream(const uint8_t *src, int src_size, uint8_t *dst,
+const uint32_t ff_dca_sampling_freqs[16] = {
+      8000,  16000, 32000, 64000, 128000, 22050,  44100,  88200,
+    176400, 352800, 12000, 24000,  48000, 96000, 192000, 384000,
+};
+
+const uint8_t ff_dca_freq_ranges[16] = {
+    0, 1, 2, 3, 4, 1, 2, 3, 4, 4, 0, 1, 2, 3, 4, 4
+};
+
+int avpriv_dca_convert_bitstream(const uint8_t *src, int src_size, uint8_t *dst,
                              int max_size)
 {
     uint32_t mrk;
@@ -45,6 +58,7 @@ int ff_dca_convert_bitstream(const uint8_t *src, int src_size, uint8_t *dst,
     mrk = AV_RB32(src);
     switch (mrk) {
     case DCA_SYNCWORD_CORE_BE:
+    case DCA_SYNCWORD_SUBSTREAM:
         memcpy(dst, src, src_size);
         return src_size;
     case DCA_SYNCWORD_CORE_LE:
diff --git a/libavcodec/dca.h b/libavcodec/dca.h
index 787a9c7..bd96bc9 100644
--- a/libavcodec/dca.h
+++ b/libavcodec/dca.h
@@ -4,21 +4,22 @@
  * Copyright (C) 2004 Benjamin Zores
  * Copyright (C) 2006 Benjamin Larsson
  * Copyright (C) 2007 Konstantin Shishkov
+ * Copyright (C) 2016 foo86
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,279 +28,139 @@
 
 #include <stdint.h>
 
-#include "libavutil/float_dsp.h"
+#include "libavutil/common.h"
 #include "libavutil/internal.h"
+#include "libavutil/intreadwrite.h"
+
+enum DCASpeaker {
+    DCA_SPEAKER_C,    DCA_SPEAKER_L,    DCA_SPEAKER_R,    DCA_SPEAKER_Ls,
+    DCA_SPEAKER_Rs,   DCA_SPEAKER_LFE1, DCA_SPEAKER_Cs,   DCA_SPEAKER_Lsr,
+    DCA_SPEAKER_Rsr,  DCA_SPEAKER_Lss,  DCA_SPEAKER_Rss,  DCA_SPEAKER_Lc,
+    DCA_SPEAKER_Rc,   DCA_SPEAKER_Lh,   DCA_SPEAKER_Ch,   DCA_SPEAKER_Rh,
+    DCA_SPEAKER_LFE2, DCA_SPEAKER_Lw,   DCA_SPEAKER_Rw,   DCA_SPEAKER_Oh,
+    DCA_SPEAKER_Lhs,  DCA_SPEAKER_Rhs,  DCA_SPEAKER_Chr,  DCA_SPEAKER_Lhr,
+    DCA_SPEAKER_Rhr,  DCA_SPEAKER_Cl,   DCA_SPEAKER_Ll,   DCA_SPEAKER_Rl,
+    DCA_SPEAKER_RSV1, DCA_SPEAKER_RSV2, DCA_SPEAKER_RSV3, DCA_SPEAKER_RSV4,
+
+    DCA_SPEAKER_COUNT
+};
 
-#include "avcodec.h"
-#include "dcadsp.h"
-#include "fmtconvert.h"
-#include "get_bits.h"
-
-#define DCA_PRIM_CHANNELS_MAX  (7)
-#define DCA_ABITS_MAX         (32)      /* Should be 28 */
-#define DCA_SUBSUBFRAMES_MAX   (4)
-#define DCA_SUBFRAMES_MAX     (16)
-#define DCA_BLOCKS_MAX        (16)
-#define DCA_LFE_MAX            (3)
-
-#define DCA_PRIM_CHANNELS_MAX  (7)
-#define DCA_ABITS_MAX         (32)      /* Should be 28 */
-#define DCA_SUBSUBFRAMES_MAX   (4)
-#define DCA_SUBFRAMES_MAX     (16)
-#define DCA_BLOCKS_MAX        (16)
-#define DCA_LFE_MAX            (3)
-#define DCA_XLL_FBANDS_MAX     (4)
-#define DCA_XLL_SEGMENTS_MAX  (16)
-#define DCA_XLL_CHSETS_MAX    (16)
-#define DCA_XLL_CHANNELS_MAX  (16)
-#define DCA_XLL_AORDER_MAX    (15)
-
-/* Arbitrary limit; not sure what the maximum really is, but much larger. */
-#define DCA_XLL_DMIX_NCOEFFS_MAX (18)
+enum DCASpeakerMask {
+    DCA_SPEAKER_MASK_C     = 0x00000001,
+    DCA_SPEAKER_MASK_L     = 0x00000002,
+    DCA_SPEAKER_MASK_R     = 0x00000004,
+    DCA_SPEAKER_MASK_Ls    = 0x00000008,
+    DCA_SPEAKER_MASK_Rs    = 0x00000010,
+    DCA_SPEAKER_MASK_LFE1  = 0x00000020,
+    DCA_SPEAKER_MASK_Cs    = 0x00000040,
+    DCA_SPEAKER_MASK_Lsr   = 0x00000080,
+    DCA_SPEAKER_MASK_Rsr   = 0x00000100,
+    DCA_SPEAKER_MASK_Lss   = 0x00000200,
+    DCA_SPEAKER_MASK_Rss   = 0x00000400,
+    DCA_SPEAKER_MASK_Lc    = 0x00000800,
+    DCA_SPEAKER_MASK_Rc    = 0x00001000,
+    DCA_SPEAKER_MASK_Lh    = 0x00002000,
+    DCA_SPEAKER_MASK_Ch    = 0x00004000,
+    DCA_SPEAKER_MASK_Rh    = 0x00008000,
+    DCA_SPEAKER_MASK_LFE2  = 0x00010000,
+    DCA_SPEAKER_MASK_Lw    = 0x00020000,
+    DCA_SPEAKER_MASK_Rw    = 0x00040000,
+    DCA_SPEAKER_MASK_Oh    = 0x00080000,
+    DCA_SPEAKER_MASK_Lhs   = 0x00100000,
+    DCA_SPEAKER_MASK_Rhs   = 0x00200000,
+    DCA_SPEAKER_MASK_Chr   = 0x00400000,
+    DCA_SPEAKER_MASK_Lhr   = 0x00800000,
+    DCA_SPEAKER_MASK_Rhr   = 0x01000000,
+    DCA_SPEAKER_MASK_Cl    = 0x02000000,
+    DCA_SPEAKER_MASK_Ll    = 0x04000000,
+    DCA_SPEAKER_MASK_Rl    = 0x08000000,
+};
 
-#define DCA_MAX_FRAME_SIZE       16384
-#define DCA_MAX_EXSS_HEADER_SIZE  4096
+#define DCA_SPEAKER_LAYOUT_MONO         (DCA_SPEAKER_MASK_C)
+#define DCA_SPEAKER_LAYOUT_STEREO       (DCA_SPEAKER_MASK_L | DCA_SPEAKER_MASK_R)
+#define DCA_SPEAKER_LAYOUT_2POINT1      (DCA_SPEAKER_LAYOUT_STEREO | DCA_SPEAKER_MASK_LFE1)
+#define DCA_SPEAKER_LAYOUT_3_0          (DCA_SPEAKER_LAYOUT_STEREO | DCA_SPEAKER_MASK_C)
+#define DCA_SPEAKER_LAYOUT_2_1          (DCA_SPEAKER_LAYOUT_STEREO | DCA_SPEAKER_MASK_Cs)
+#define DCA_SPEAKER_LAYOUT_3_1          (DCA_SPEAKER_LAYOUT_3_0 | DCA_SPEAKER_MASK_Cs)
+#define DCA_SPEAKER_LAYOUT_2_2          (DCA_SPEAKER_LAYOUT_STEREO | DCA_SPEAKER_MASK_Ls | DCA_SPEAKER_MASK_Rs)
+#define DCA_SPEAKER_LAYOUT_5POINT0      (DCA_SPEAKER_LAYOUT_3_0 | DCA_SPEAKER_MASK_Ls | DCA_SPEAKER_MASK_Rs)
+#define DCA_SPEAKER_LAYOUT_5POINT1      (DCA_SPEAKER_LAYOUT_5POINT0 | DCA_SPEAKER_MASK_LFE1)
+#define DCA_SPEAKER_LAYOUT_7POINT0_WIDE (DCA_SPEAKER_LAYOUT_5POINT0 | DCA_SPEAKER_MASK_Lw | DCA_SPEAKER_MASK_Rw)
+#define DCA_SPEAKER_LAYOUT_7POINT1_WIDE (DCA_SPEAKER_LAYOUT_7POINT0_WIDE | DCA_SPEAKER_MASK_LFE1)
+
+#define DCA_HAS_STEREO(mask) \
+    ((mask & DCA_SPEAKER_LAYOUT_STEREO) == DCA_SPEAKER_LAYOUT_STEREO)
+
+enum DCASpeakerPair {
+    DCA_SPEAKER_PAIR_C      = 0x0001,
+    DCA_SPEAKER_PAIR_LR     = 0x0002,
+    DCA_SPEAKER_PAIR_LsRs   = 0x0004,
+    DCA_SPEAKER_PAIR_LFE1   = 0x0008,
+    DCA_SPEAKER_PAIR_Cs     = 0x0010,
+    DCA_SPEAKER_PAIR_LhRh   = 0x0020,
+    DCA_SPEAKER_PAIR_LsrRsr = 0x0040,
+    DCA_SPEAKER_PAIR_Ch     = 0x0080,
+    DCA_SPEAKER_PAIR_Oh     = 0x0100,
+    DCA_SPEAKER_PAIR_LcRc   = 0x0200,
+    DCA_SPEAKER_PAIR_LwRw   = 0x0400,
+    DCA_SPEAKER_PAIR_LssRss = 0x0800,
+    DCA_SPEAKER_PAIR_LFE2   = 0x1000,
+    DCA_SPEAKER_PAIR_LhsRhs = 0x2000,
+    DCA_SPEAKER_PAIR_Chr    = 0x4000,
+    DCA_SPEAKER_PAIR_LhrRhr = 0x8000
+};
 
-#define DCA_BUFFER_PADDING_SIZE   1024
+/**
+ * Return number of individual channels in DCASpeakerPair mask
+ */
+static inline int ff_dca_count_chs_for_mask(unsigned int mask)
+{
+    return av_popcount((mask & 0xffff) | ((mask & 0xae66) << 16));
+}
+
+enum DCARepresentationType {
+    DCA_REPR_TYPE_LtRt = 2,
+    DCA_REPR_TYPE_LhRh = 3
+};
 
 enum DCAExtensionMask {
-    DCA_EXT_CORE       = 0x001, ///< core in core substream
-    DCA_EXT_XXCH       = 0x002, ///< XXCh channels extension in core substream
-    DCA_EXT_X96        = 0x004, ///< 96/24 extension in core substream
-    DCA_EXT_XCH        = 0x008, ///< XCh channel extension in core substream
-    DCA_EXT_EXSS_CORE  = 0x010, ///< core in ExSS (extension substream)
-    DCA_EXT_EXSS_XBR   = 0x020, ///< extended bitrate extension in ExSS
-    DCA_EXT_EXSS_XXCH  = 0x040, ///< XXCh channels extension in ExSS
-    DCA_EXT_EXSS_X96   = 0x080, ///< 96/24 extension in ExSS
-    DCA_EXT_EXSS_LBR   = 0x100, ///< low bitrate component in ExSS
-    DCA_EXT_EXSS_XLL   = 0x200, ///< lossless extension in ExSS
+    DCA_CSS_CORE   = 0x001,
+    DCA_CSS_XXCH   = 0x002,
+    DCA_CSS_X96    = 0x004,
+    DCA_CSS_XCH    = 0x008,
+    DCA_CSS_MASK   = 0x00f,
+    DCA_EXSS_CORE  = 0x010,
+    DCA_EXSS_XBR   = 0x020,
+    DCA_EXSS_XXCH  = 0x040,
+    DCA_EXSS_X96   = 0x080,
+    DCA_EXSS_LBR   = 0x100,
+    DCA_EXSS_XLL   = 0x200,
+    DCA_EXSS_RSV1  = 0x400,
+    DCA_EXSS_RSV2  = 0x800,
+    DCA_EXSS_MASK  = 0xff0,
 };
 
-typedef struct XllChSetSubHeader {
-    int channels;               ///< number of channels in channel set, at most 16
-    int residual_encode;        ///< residual channel encoding
-    int bit_resolution;         ///< input sample bit-width
-    int bit_width;              ///< original input sample bit-width
-    int sampling_frequency;     ///< sampling frequency
-    int samp_freq_interp;       ///< sampling frequency interpolation multiplier
-    int replacement_set;        ///< replacement channel set group
-    int active_replace_set;     ///< current channel set is active channel set
-    int primary_ch_set;
-    int downmix_coeff_code_embedded;
-    int downmix_embedded;
-    int downmix_type;
-    int hier_chset;             ///< hierarchical channel set
-    int downmix_ncoeffs;
-    int downmix_coeffs[DCA_XLL_DMIX_NCOEFFS_MAX];
-    int ch_mask_enabled;
-    int ch_mask;
-    int mapping_coeffs_present;
-    int num_freq_bands;
-
-    /* m_nOrigChanOrder */
-    uint8_t orig_chan_order[DCA_XLL_FBANDS_MAX][DCA_XLL_CHANNELS_MAX];
-    uint8_t orig_chan_order_inv[DCA_XLL_FBANDS_MAX][DCA_XLL_CHANNELS_MAX];
-    /* Coefficients for channel pairs (at most 8), m_anPWChPairsCoeffs */
-    int8_t pw_ch_pairs_coeffs[DCA_XLL_FBANDS_MAX][DCA_XLL_CHANNELS_MAX/2];
-    /* m_nCurrHighestLPCOrder */
-    uint8_t adapt_order_max[DCA_XLL_FBANDS_MAX];
-    /* m_pnAdaptPredOrder */
-    uint8_t adapt_order[DCA_XLL_FBANDS_MAX][DCA_XLL_CHANNELS_MAX];
-    /* m_pnFixedPredOrder */
-    uint8_t fixed_order[DCA_XLL_FBANDS_MAX][DCA_XLL_CHANNELS_MAX];
-    /* m_pnLPCReflCoeffsQInd, unsigned version */
-    uint8_t lpc_refl_coeffs_q_ind[DCA_XLL_FBANDS_MAX]
-                                 [DCA_XLL_CHANNELS_MAX][DCA_XLL_AORDER_MAX];
-
-    int lsb_fsize[DCA_XLL_FBANDS_MAX];
-    int8_t scalable_lsbs[DCA_XLL_FBANDS_MAX][DCA_XLL_CHANNELS_MAX];
-    int8_t bit_width_adj_per_ch[DCA_XLL_FBANDS_MAX][DCA_XLL_CHANNELS_MAX];
-} XllChSetSubHeader;
-
-typedef struct XllNavi {
-    GetBitContext gb;  // Context for parsing the data segments
-    unsigned band_size[DCA_XLL_FBANDS_MAX];
-    unsigned segment_size[DCA_XLL_FBANDS_MAX][DCA_XLL_SEGMENTS_MAX];
-    unsigned chset_size[DCA_XLL_FBANDS_MAX][DCA_XLL_SEGMENTS_MAX][DCA_XLL_CHSETS_MAX];
-} XllNavi;
-
-typedef struct QMF64_table {
-    float dct4_coeff[32][32];
-    float dct2_coeff[32][32];
-    float rcos[32];
-    float rsin[32];
-} QMF64_table;
-
-/* Primary audio coding header */
-typedef struct DCAAudioHeader {
-    int subband_activity[DCA_PRIM_CHANNELS_MAX];    ///< subband activity count
-    int vq_start_subband[DCA_PRIM_CHANNELS_MAX];    ///< high frequency vq start subband
-    int joint_intensity[DCA_PRIM_CHANNELS_MAX];     ///< joint intensity coding index
-    int transient_huffman[DCA_PRIM_CHANNELS_MAX];   ///< transient mode code book
-    int scalefactor_huffman[DCA_PRIM_CHANNELS_MAX]; ///< scale factor code book
-    int bitalloc_huffman[DCA_PRIM_CHANNELS_MAX];    ///< bit allocation quantizer select
-    int quant_index_huffman[DCA_PRIM_CHANNELS_MAX][DCA_ABITS_MAX];  ///< quantization index codebook select
-    uint32_t scalefactor_adj[DCA_PRIM_CHANNELS_MAX][DCA_ABITS_MAX]; ///< scale factor adjustment
-
-    int subframes;              ///< number of subframes
-    int total_channels;         ///< number of channels including extensions
-    int prim_channels;          ///< number of primary audio channels
-} DCAAudioHeader;
-
-typedef struct DCAChan {
-    DECLARE_ALIGNED(32, int32_t, subband_samples)[DCA_BLOCKS_MAX][DCA_SUBBANDS][SAMPLES_PER_SUBBAND];
-
-    /* Subband samples history (for ADPCM) */
-    DECLARE_ALIGNED(32, int32_t, subband_samples_hist)[DCA_SUBBANDS][4];
-    int hist_index;
-
-    /* Half size is sufficient for core decoding, but for 96 kHz data
-     * we need QMF with 64 subbands and 1024 samples. */
-    DECLARE_ALIGNED(32, float, subband_fir_hist)[1024];
-    DECLARE_ALIGNED(32, float, subband_fir_noidea)[64];
-
-    /* Primary audio coding side information */
-    int prediction_mode[DCA_SUBBANDS];    ///< prediction mode (ADPCM used or not)
-    int prediction_vq[DCA_SUBBANDS];      ///< prediction VQ coefs
-    int bitalloc[DCA_SUBBANDS];           ///< bit allocation index
-    int transition_mode[DCA_SUBBANDS];    ///< transition mode (transients)
-    int32_t scale_factor[DCA_SUBBANDS][2];///< scale factors (2 if transient)
-    int joint_huff;                       ///< joint subband scale factors codebook
-    int joint_scale_factor[DCA_SUBBANDS]; ///< joint subband scale factors
-
-    int32_t  high_freq_vq[DCA_SUBBANDS];  ///< VQ encoded high frequency subbands
-} DCAChan;
-
-
-typedef struct DCAContext {
-    AVClass *class;             ///< class for AVOptions
-    AVCodecContext *avctx;
-    /* Frame header */
-    int frame_type;             ///< type of the current frame
-    int samples_deficit;        ///< deficit sample count
-    int crc_present;            ///< crc is present in the bitstream
-    int sample_blocks;          ///< number of PCM sample blocks
-    int frame_size;             ///< primary frame byte size
-    int amode;                  ///< audio channels arrangement
-    int sample_rate;            ///< audio sampling rate
-    int bit_rate;               ///< transmission bit rate
-    int bit_rate_index;         ///< transmission bit rate index
+enum DCADownMixType {
+    DCA_DMIX_TYPE_1_0,
+    DCA_DMIX_TYPE_LoRo,
+    DCA_DMIX_TYPE_LtRt,
+    DCA_DMIX_TYPE_3_0,
+    DCA_DMIX_TYPE_2_1,
+    DCA_DMIX_TYPE_2_2,
+    DCA_DMIX_TYPE_3_1,
 
-    int dynrange;               ///< embedded dynamic range flag
-    int timestamp;              ///< embedded time stamp flag
-    int aux_data;               ///< auxiliary data flag
-    int hdcd;                   ///< source material is mastered in HDCD
-    int ext_descr;              ///< extension audio descriptor flag
-    int ext_coding;             ///< extended coding flag
-    int aspf;                   ///< audio sync word insertion flag
-    int lfe;                    ///< low frequency effects flag
-    int predictor_history;      ///< predictor history flag
-    int header_crc;             ///< header crc check bytes
-    int multirate_inter;        ///< multirate interpolator switch
-    int version;                ///< encoder software revision
-    int copy_history;           ///< copy history
-    int source_pcm_res;         ///< source pcm resolution
-    int front_sum;              ///< front sum/difference flag
-    int surround_sum;           ///< surround sum/difference flag
-    int dialog_norm;            ///< dialog normalisation parameter
-
-    /* Primary audio coding header */
-    DCAAudioHeader audio_header;
-
-    /* Primary audio coding side information */
-    int subsubframes[DCA_SUBFRAMES_MAX];                         ///< number of subsubframes
-    int partial_samples[DCA_SUBFRAMES_MAX];                      ///< partial subsubframe samples count
-    float downmix_coef[DCA_PRIM_CHANNELS_MAX + 1][2];            ///< stereo downmix coefficients
-    int dynrange_coef;                                           ///< dynamic range coefficient
-
-    /* Core substream's embedded downmix coefficients (cf. ETSI TS 102 114 V1.4.1)
-     * Input:  primary audio channels (incl. LFE if present)
-     * Output: downmix audio channels (up to 4, no LFE) */
-    uint8_t  core_downmix;                                       ///< embedded downmix coefficients available
-    uint8_t  core_downmix_amode;                                 ///< audio channel arrangement of embedded downmix
-    uint16_t core_downmix_codes[DCA_PRIM_CHANNELS_MAX + 1][4];   ///< embedded downmix coefficients (9-bit codes)
-
-
-    float lfe_data[2 * DCA_LFE_MAX * (DCA_BLOCKS_MAX + 4)];      ///< Low frequency effect data
-    int lfe_scale_factor;
-
-    /* Subband samples history (for ADPCM) */
-    DECLARE_ALIGNED(32, float, raXin)[32];
-
-    DCAChan dca_chan[DCA_PRIM_CHANNELS_MAX];
-
-    int output;                 ///< type of output
-
-    float *samples_chanptr[DCA_PRIM_CHANNELS_MAX + 1];
-    float *extra_channels[DCA_PRIM_CHANNELS_MAX + 1];
-    uint8_t *extra_channels_buffer;
-    unsigned int extra_channels_buffer_size;
-
-    uint8_t dca_buffer[DCA_MAX_FRAME_SIZE + DCA_MAX_EXSS_HEADER_SIZE + DCA_BUFFER_PADDING_SIZE];
-    int dca_buffer_size;        ///< how much data is in the dca_buffer
-
-    const int8_t *channel_order_tab;  ///< channel reordering table, lfe and non lfe
-    GetBitContext gb;
-    /* Current position in DCA frame */
-    int current_subframe;
-    int current_subsubframe;
-
-    int core_ext_mask;          ///< present extensions in the core substream
-    int exss_ext_mask;          ///< Non-core extensions
-
-    /* XCh extension information */
-    int xch_present;            ///< XCh extension present and valid
-    int xch_base_channel;       ///< index of first (only) channel containing XCH data
-    int xch_disable;            ///< whether the XCh extension should be decoded or not
-
-    /* XLL extension information */
-    int xll_disable;
-    int xll_nch_sets;           ///< number of channel sets per frame
-    int xll_channels;           ///< total number of channels (in all channel sets)
-    int xll_residual_channels;  ///< number of residual channels
-    int xll_segments;           ///< number of segments per frame
-    int xll_log_smpl_in_seg;    ///< supposedly this is "nBits4SamplLoci"
-    int xll_smpl_in_seg;        ///< samples in segment per one frequency band for the first channel set
-    int xll_bits4seg_size;      ///< number of bits used to read segment size
-    int xll_banddata_crc;       ///< presence of CRC16 within each frequency band
-    int xll_scalable_lsb;
-    int xll_bits4ch_mask;       ///< channel position mask
-    int xll_fixed_lsb_width;
-    XllChSetSubHeader xll_chsets[DCA_XLL_CHSETS_MAX];
-    XllNavi xll_navi;
-    int *xll_sample_buf;
-    unsigned int xll_sample_buf_size;
-
-    /* ExSS header parser */
-    int static_fields;          ///< static fields present
-    int mix_metadata;           ///< mixing metadata present
-    int num_mix_configs;        ///< number of mix out configurations
-    int mix_config_num_ch[4];   ///< number of channels in each mix out configuration
-
-    int profile;
-    int one2one_map_chtospkr;
-
-    int debug_flag;             ///< used for suppressing repeated error messages output
-    AVFloatDSPContext fdsp;
-    FFTContext imdct;
-    SynthFilterContext synth;
-    DCADSPContext dcadsp;
-    QMF64_table *qmf64_table;
-    FmtConvertContext fmt_conv;
-} DCAContext;
+    DCA_DMIX_TYPE_COUNT
+};
 
 extern av_export const uint32_t avpriv_dca_sample_rates[16];
 
+extern const uint32_t ff_dca_sampling_freqs[16];
+extern const uint8_t ff_dca_freq_ranges[16];
+
 /**
  * Convert bitstream to one representation based on sync marker
  */
-int ff_dca_convert_bitstream(const uint8_t *src, int src_size, uint8_t *dst,
-                             int max_size);
-
-void ff_dca_exss_parse_header(DCAContext *s);
-
-int ff_dca_xll_decode_header(DCAContext *s);
-int ff_dca_xll_decode_navi(DCAContext *s, int asset_end);
-int ff_dca_xll_decode_audio(DCAContext *s, AVFrame *frame);
+int avpriv_dca_convert_bitstream(const uint8_t *src, int src_size, uint8_t *dst,
+                                 int max_size);
 
 #endif /* AVCODEC_DCA_H */
diff --git a/libavcodec/dca_core.c b/libavcodec/dca_core.c
new file mode 100644
index 0000000..46825ed
--- /dev/null
+++ b/libavcodec/dca_core.c
@@ -0,0 +1,2550 @@
+/*
+ * Copyright (C) 2016 foo86
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "dcadec.h"
+#include "dcadata.h"
+#include "dcahuff.h"
+#include "dcamath.h"
+#include "dca_syncwords.h"
+
+#if ARCH_ARM
+#include "arm/dca.h"
+#endif
+
+enum HeaderType {
+    HEADER_CORE,
+    HEADER_XCH,
+    HEADER_XXCH
+};
+
+enum AudioMode {
+    AMODE_MONO,             // Mode 0: A (mono)
+    AMODE_MONO_DUAL,        // Mode 1: A + B (dual mono)
+    AMODE_STEREO,           // Mode 2: L + R (stereo)
+    AMODE_STEREO_SUMDIFF,   // Mode 3: (L+R) + (L-R) (sum-diff)
+    AMODE_STEREO_TOTAL,     // Mode 4: LT + RT (left and right total)
+    AMODE_3F,               // Mode 5: C + L + R
+    AMODE_2F1R,             // Mode 6: L + R + S
+    AMODE_3F1R,             // Mode 7: C + L + R + S
+    AMODE_2F2R,             // Mode 8: L + R + SL + SR
+    AMODE_3F2R,             // Mode 9: C + L + R + SL + SR
+
+    AMODE_COUNT
+};
+
+enum ExtAudioType {
+    EXT_AUDIO_XCH   = 0,
+    EXT_AUDIO_X96   = 2,
+    EXT_AUDIO_XXCH  = 6
+};
+
+enum LFEFlag {
+    LFE_FLAG_NONE,
+    LFE_FLAG_128,
+    LFE_FLAG_64,
+    LFE_FLAG_INVALID
+};
+
+static const int8_t prm_ch_to_spkr_map[AMODE_COUNT][5] = {
+    { DCA_SPEAKER_C,            -1,             -1,             -1,             -1 },
+    { DCA_SPEAKER_L, DCA_SPEAKER_R,             -1,             -1,             -1 },
+    { DCA_SPEAKER_L, DCA_SPEAKER_R,             -1,             -1,             -1 },
+    { DCA_SPEAKER_L, DCA_SPEAKER_R,             -1,             -1,             -1 },
+    { DCA_SPEAKER_L, DCA_SPEAKER_R,             -1,             -1,             -1 },
+    { DCA_SPEAKER_C, DCA_SPEAKER_L, DCA_SPEAKER_R ,             -1,             -1 },
+    { DCA_SPEAKER_L, DCA_SPEAKER_R, DCA_SPEAKER_Cs,             -1,             -1 },
+    { DCA_SPEAKER_C, DCA_SPEAKER_L, DCA_SPEAKER_R , DCA_SPEAKER_Cs,             -1 },
+    { DCA_SPEAKER_L, DCA_SPEAKER_R, DCA_SPEAKER_Ls, DCA_SPEAKER_Rs,             -1 },
+    { DCA_SPEAKER_C, DCA_SPEAKER_L, DCA_SPEAKER_R,  DCA_SPEAKER_Ls, DCA_SPEAKER_Rs }
+};
+
+static const uint8_t audio_mode_ch_mask[AMODE_COUNT] = {
+    DCA_SPEAKER_LAYOUT_MONO,
+    DCA_SPEAKER_LAYOUT_STEREO,
+    DCA_SPEAKER_LAYOUT_STEREO,
+    DCA_SPEAKER_LAYOUT_STEREO,
+    DCA_SPEAKER_LAYOUT_STEREO,
+    DCA_SPEAKER_LAYOUT_3_0,
+    DCA_SPEAKER_LAYOUT_2_1,
+    DCA_SPEAKER_LAYOUT_3_1,
+    DCA_SPEAKER_LAYOUT_2_2,
+    DCA_SPEAKER_LAYOUT_5POINT0
+};
+
+static const uint8_t block_code_nbits[7] = {
+    7, 10, 12, 13, 15, 17, 19
+};
+
+static const uint8_t quant_index_sel_nbits[DCA_CODE_BOOKS] = {
+    1, 2, 2, 2, 2, 3, 3, 3, 3, 3
+};
+
+static const uint8_t quant_index_group_size[DCA_CODE_BOOKS] = {
+    1, 3, 3, 3, 3, 7, 7, 7, 7, 7
+};
+
+static int dca_get_vlc(GetBitContext *s, DCAVLC *v, int i)
+{
+    return get_vlc2(s, v->vlc[i].table, v->vlc[i].bits, v->max_depth) + v->offset;
+}
+
+static void get_array(GetBitContext *s, int32_t *array, int size, int n)
+{
+    int i;
+
+    for (i = 0; i < size; i++)
+        array[i] = get_sbits(s, n);
+}
+
+// 5.3.1 - Bit stream header
+static int parse_frame_header(DCACoreDecoder *s)
+{
+    int normal_frame, pcmr_index;
+
+    // Frame type
+    normal_frame = get_bits1(&s->gb);
+
+    // Deficit sample count
+    if (get_bits(&s->gb, 5) != DCA_PCMBLOCK_SAMPLES - 1) {
+        av_log(s->avctx, AV_LOG_ERROR, "Deficit samples are not supported\n");
+        return normal_frame ? AVERROR_INVALIDDATA : AVERROR_PATCHWELCOME;
+    }
+
+    // CRC present flag
+    s->crc_present = get_bits1(&s->gb);
+
+    // Number of PCM sample blocks
+    s->npcmblocks = get_bits(&s->gb, 7) + 1;
+    if (s->npcmblocks & (DCA_SUBBAND_SAMPLES - 1)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Unsupported number of PCM sample blocks (%d)\n", s->npcmblocks);
+        return (s->npcmblocks < 6 || normal_frame) ? AVERROR_INVALIDDATA : AVERROR_PATCHWELCOME;
+    }
+
+    // Primary frame byte size
+    s->frame_size = get_bits(&s->gb, 14) + 1;
+    if (s->frame_size < 96) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid core frame size (%d bytes)\n", s->frame_size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Audio channel arrangement
+    s->audio_mode = get_bits(&s->gb, 6);
+    if (s->audio_mode >= AMODE_COUNT) {
+        av_log(s->avctx, AV_LOG_ERROR, "Unsupported audio channel arrangement (%d)\n", s->audio_mode);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    // Core audio sampling frequency
+    s->sample_rate = avpriv_dca_sample_rates[get_bits(&s->gb, 4)];
+    if (!s->sample_rate) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid core audio sampling frequency\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Transmission bit rate
+    s->bit_rate = ff_dca_bit_rates[get_bits(&s->gb, 5)];
+
+    // Reserved field
+    skip_bits1(&s->gb);
+
+    // Embedded dynamic range flag
+    s->drc_present = get_bits1(&s->gb);
+
+    // Embedded time stamp flag
+    s->ts_present = get_bits1(&s->gb);
+
+    // Auxiliary data flag
+    s->aux_present = get_bits1(&s->gb);
+
+    // HDCD mastering flag
+    skip_bits1(&s->gb);
+
+    // Extension audio descriptor flag
+    s->ext_audio_type = get_bits(&s->gb, 3);
+
+    // Extended coding flag
+    s->ext_audio_present = get_bits1(&s->gb);
+
+    // Audio sync word insertion flag
+    s->sync_ssf = get_bits1(&s->gb);
+
+    // Low frequency effects flag
+    s->lfe_present = get_bits(&s->gb, 2);
+    if (s->lfe_present == LFE_FLAG_INVALID) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid low frequency effects flag\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Predictor history flag switch
+    s->predictor_history = get_bits1(&s->gb);
+
+    // Header CRC check bytes
+    if (s->crc_present)
+        skip_bits(&s->gb, 16);
+
+    // Multirate interpolator switch
+    s->filter_perfect = get_bits1(&s->gb);
+
+    // Encoder software revision
+    skip_bits(&s->gb, 4);
+
+    // Copy history
+    skip_bits(&s->gb, 2);
+
+    // Source PCM resolution
+    s->source_pcm_res = ff_dca_bits_per_sample[pcmr_index = get_bits(&s->gb, 3)];
+    if (!s->source_pcm_res) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid source PCM resolution\n");
+        return AVERROR_INVALIDDATA;
+    }
+    s->es_format = pcmr_index & 1;
+
+    // Front sum/difference flag
+    s->sumdiff_front = get_bits1(&s->gb);
+
+    // Surround sum/difference flag
+    s->sumdiff_surround = get_bits1(&s->gb);
+
+    // Dialog normalization / unspecified
+    skip_bits(&s->gb, 4);
+
+    return 0;
+}
+
+// 5.3.2 - Primary audio coding header. Parses the coding parameters of the core, XCH or XXCH header (selected by header) for channels xch_base..s->nchannels-1. Returns 0, or AVERROR_INVALIDDATA / AVERROR_PATCHWELCOME on failure.
+static int parse_coding_header(DCACoreDecoder *s, enum HeaderType header, int xch_base)
+{
+    int n, ch, nchannels, header_size = 0, header_pos = get_bits_count(&s->gb); // header_pos: bit position at entry, reference point for the XXCH CRC check and final seek
+    unsigned int mask, index;
+
+    if (get_bits_left(&s->gb) < 0) // the bit reader has already run past the end of its input
+        return AVERROR_INVALIDDATA;
+
+    switch (header) {
+    case HEADER_CORE:
+        // Number of subframes
+        s->nsubframes = get_bits(&s->gb, 4) + 1;
+
+        // Number of primary audio channels (must agree with the audio channel arrangement)
+        s->nchannels = get_bits(&s->gb, 3) + 1;
+        if (s->nchannels != ff_dca_channels[s->audio_mode]) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid number of primary audio channels (%d) for audio channel arrangement (%d)\n", s->nchannels, s->audio_mode);
+            return AVERROR_INVALIDDATA;
+        }
+        av_assert1(s->nchannels <= DCA_CHANNELS - 2);
+
+        s->ch_mask = audio_mode_ch_mask[s->audio_mode];
+
+        // Add LFE channel if present
+        if (s->lfe_present)
+            s->ch_mask |= DCA_SPEAKER_MASK_LFE1;
+        break;
+
+    case HEADER_XCH:
+        s->nchannels = ff_dca_channels[s->audio_mode] + 1; // XCH: one channel (Cs) on top of the core channels, no header fields of its own read here
+        av_assert1(s->nchannels <= DCA_CHANNELS - 1);
+        s->ch_mask |= DCA_SPEAKER_MASK_Cs;
+        break;
+
+    case HEADER_XXCH:
+        // Channel set header length (in bytes, see the * 8 where it is used)
+        header_size = get_bits(&s->gb, 7) + 1;
+
+        // Check CRC between header_pos and header_pos + header_size * 8, if the frame header announced one
+        if (s->xxch_crc_present
+            && ff_dca_check_crc(s->avctx, &s->gb, header_pos, header_pos + header_size * 8)) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid XXCH channel set header checksum\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        // Number of channels in a channel set
+        nchannels = get_bits(&s->gb, 3) + 1;
+        if (nchannels > DCA_XXCH_CHANNELS_MAX) {
+            avpriv_request_sample(s->avctx, "%d XXCH channels", nchannels);
+            return AVERROR_PATCHWELCOME;
+        }
+        s->nchannels = ff_dca_channels[s->audio_mode] + nchannels;
+        av_assert1(s->nchannels <= DCA_CHANNELS);
+
+        // Loudspeaker layout mask (read without its low DCA_SPEAKER_Cs bits, which the shift below restores as zeros)
+        mask = get_bits_long(&s->gb, s->xxch_mask_nbits - DCA_SPEAKER_Cs);
+        s->xxch_spkr_mask = mask << DCA_SPEAKER_Cs;
+
+        if (av_popcount(s->xxch_spkr_mask) != nchannels) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid XXCH speaker layout mask (%#x)\n", s->xxch_spkr_mask);
+            return AVERROR_INVALIDDATA;
+        }
+
+        if (s->xxch_core_mask & s->xxch_spkr_mask) {
+            av_log(s->avctx, AV_LOG_ERROR, "XXCH speaker layout mask (%#x) overlaps with core (%#x)\n", s->xxch_spkr_mask, s->xxch_core_mask);
+            return AVERROR_INVALIDDATA;
+        }
+
+        // Combine core and XXCH masks together
+        s->ch_mask = s->xxch_core_mask | s->xxch_spkr_mask;
+
+        // Downmix coefficients present in stream
+        if (get_bits1(&s->gb)) {
+            int *coeff_ptr = s->xxch_dmix_coeff;
+
+            // Downmix already performed by encoder
+            s->xxch_dmix_embedded = get_bits1(&s->gb);
+
+            // Downmix scale factor (index is unsigned: a code that is too small wraps around and fails the same range check)
+            index = get_bits(&s->gb, 6) * 4 - FF_DCA_DMIXTABLE_OFFSET - 3;
+            if (index >= FF_DCA_INV_DMIXTABLE_SIZE) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid XXCH downmix scale index (%d)\n", index);
+                return AVERROR_INVALIDDATA;
+            }
+            s->xxch_dmix_scale_inv = ff_dca_inv_dmixtable[index];
+
+            // Downmix channel mapping mask (one per XXCH channel, may only name speakers of the core mask)
+            for (ch = 0; ch < nchannels; ch++) {
+                mask = get_bits_long(&s->gb, s->xxch_mask_nbits);
+                if ((mask & s->xxch_core_mask) != mask) {
+                    av_log(s->avctx, AV_LOG_ERROR, "Invalid XXCH downmix channel mapping mask (%#x)\n", mask);
+                    return AVERROR_INVALIDDATA;
+                }
+                s->xxch_dmix_mask[ch] = mask;
+            }
+
+            // Downmix coefficients (one 7-bit code per bit set in each mapping mask, stored consecutively)
+            for (ch = 0; ch < nchannels; ch++) {
+                for (n = 0; n < s->xxch_mask_nbits; n++) {
+                    if (s->xxch_dmix_mask[ch] & (1U << n)) {
+                        int code = get_bits(&s->gb, 7);
+                        int sign = (code >> 6) - 1; // 0 when bit 6 is set, -1 when clear; (x ^ sign) - sign below negates x for -1
+                        if (code &= 63) {
+                            index = code * 4 - 3;
+                            if (index >= FF_DCA_DMIXTABLE_SIZE) {
+                                av_log(s->avctx, AV_LOG_ERROR, "Invalid XXCH downmix coefficient index (%d)\n", index);
+                                return AVERROR_INVALIDDATA;
+                            }
+                            *coeff_ptr++ = (ff_dca_dmixtable[index] ^ sign) - sign;
+                        } else {
+                            *coeff_ptr++ = 0; // a zero magnitude code yields a zero coefficient
+                        }
+                    }
+                }
+            }
+        } else {
+            s->xxch_dmix_embedded = 0;
+        }
+
+        break;
+    }
+
+    // Subband activity count (2..DCA_SUBBANDS)
+    for (ch = xch_base; ch < s->nchannels; ch++) {
+        s->nsubbands[ch] = get_bits(&s->gb, 5) + 2;
+        if (s->nsubbands[ch] > DCA_SUBBANDS) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid subband activity count\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    // High frequency VQ start subband (1..32, stored without a range check against nsubbands)
+    for (ch = xch_base; ch < s->nchannels; ch++)
+        s->subband_vq_start[ch] = get_bits(&s->gb, 5) + 1;
+
+    // Joint intensity coding index (0 = none, otherwise source channel + 1; nonzero XXCH values are rebased by xch_base - 1)
+    for (ch = xch_base; ch < s->nchannels; ch++) {
+        if ((n = get_bits(&s->gb, 3)) && header == HEADER_XXCH)
+            n += xch_base - 1;
+        if (n > s->nchannels) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid joint intensity coding index\n");
+            return AVERROR_INVALIDDATA;
+        }
+        s->joint_intensity_index[ch] = n;
+    }
+
+    // Transient mode code book
+    for (ch = xch_base; ch < s->nchannels; ch++)
+        s->transition_mode_sel[ch] = get_bits(&s->gb, 2);
+
+    // Scale factor code book (value 7 is rejected)
+    for (ch = xch_base; ch < s->nchannels; ch++) {
+        s->scale_factor_sel[ch] = get_bits(&s->gb, 3);
+        if (s->scale_factor_sel[ch] == 7) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid scale factor code book\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    // Bit allocation quantizer select (value 7 is rejected)
+    for (ch = xch_base; ch < s->nchannels; ch++) {
+        s->bit_allocation_sel[ch] = get_bits(&s->gb, 3);
+        if (s->bit_allocation_sel[ch] == 7) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid bit allocation quantizer select\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    // Quantization index codebook select (transmitted code book by code book, not channel by channel)
+    for (n = 0; n < DCA_CODE_BOOKS; n++)
+        for (ch = xch_base; ch < s->nchannels; ch++)
+            s->quant_index_sel[ch][n] = get_bits(&s->gb, quant_index_sel_nbits[n]);
+
+    // Scale factor adjustment index (only read when the selection is below quant_index_group_size[n]; otherwise the stored value is left as is)
+    for (n = 0; n < DCA_CODE_BOOKS; n++)
+        for (ch = xch_base; ch < s->nchannels; ch++)
+            if (s->quant_index_sel[ch][n] < quant_index_group_size[n])
+                s->scale_factor_adj[ch][n] = ff_dca_scale_factor_adj[get_bits(&s->gb, 2)];
+
+    if (header == HEADER_XXCH) {
+        // Move to header_pos + header_size * 8, which passes over:
+        //   reserved bits and byte alignment
+        //   CRC16 of channel set header
+        if (ff_dca_seek_bits(&s->gb, header_pos + header_size * 8)) {
+            av_log(s->avctx, AV_LOG_ERROR, "Read past end of XXCH channel set header\n");
+            return AVERROR_INVALIDDATA;
+        }
+    } else {
+        // Audio header CRC check word (skipped, not verified)
+        if (s->crc_present)
+            skip_bits(&s->gb, 16);
+    }
+
+    return 0;
+}
+
+static inline int parse_scale(DCACoreDecoder *s, int *scale_index, int sel)
+{
+    // Code books above 5 use the quant7 root square table, all others quant6
+    const int use_quant7 = sel > 5;
+    const uint32_t *table = use_quant7 ? ff_dca_scale_factor_quant7
+                                       : ff_dca_scale_factor_quant6;
+    unsigned int nb_entries = use_quant7 ? FF_ARRAY_ELEMS(ff_dca_scale_factor_quant7)
+                                         : FF_ARRAY_ELEMS(ff_dca_scale_factor_quant6);
+    unsigned int idx;
+
+    if (sel >= 5) {
+        // Fixed-length code: the index itself is transmitted
+        *scale_index = get_bits(&s->gb, sel + 1);
+    } else {
+        // Huffman code: only the difference to the running index is transmitted
+        *scale_index += dca_get_vlc(&s->gb, &ff_dca_vlc_scale_factor, sel);
+    }
+
+    // Viewed as unsigned, a negative index fails the range check as well
+    idx = *scale_index;
+    if (idx >= nb_entries) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid scale factor index\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Look up scale factor from the root square table
+    return table[idx];
+}
+
+static inline int parse_joint_scale(DCACoreDecoder *s, int sel)
+{
+    const unsigned int nb_entries = FF_ARRAY_ELEMS(ff_dca_joint_scale_factors);
+    int idx;
+
+    // The joint scale index is always coded as an absolute value, be it by
+    // Huffman code (sel < 5) or by a fixed-length code of sel + 1 bits
+    idx = sel < 5 ? dca_get_vlc(&s->gb, &ff_dca_vlc_scale_factor, sel)
+                  : (int)get_bits(&s->gb, sel + 1);
+
+    // The table is addressed with a bias of 64
+    idx += 64;
+
+    // Reject indices outside the table; the unsigned compare covers negatives
+    if ((unsigned int)idx >= nb_entries) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid joint scale factor index\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    return ff_dca_joint_scale_factors[idx];
+}
+
+// 5.4.1 - Primary audio coding side information. Reads, for subframe sf and channels xch_base..s->nchannels-1, the prediction modes, bit allocation, transition modes and scale factors. Returns 0 or a negative AVERROR code.
+static int parse_subframe_header(DCACoreDecoder *s, int sf,
+                                 enum HeaderType header, int xch_base)
+{
+    int ch, band, ret;
+
+    if (get_bits_left(&s->gb) < 0) // the bit reader has already run past the end of its input
+        return AVERROR_INVALIDDATA;
+
+    if (header == HEADER_CORE) {
+        // Subsubframe count (read for the core only; other header types use the value already stored for sf)
+        s->nsubsubframes[sf] = get_bits(&s->gb, 2) + 1;
+
+        // Partial subsubframe sample count
+        skip_bits(&s->gb, 3);
+    }
+
+    // Prediction mode (one flag per active subband)
+    for (ch = xch_base; ch < s->nchannels; ch++)
+        for (band = 0; band < s->nsubbands[ch]; band++)
+            s->prediction_mode[ch][band] = get_bits1(&s->gb);
+
+    // Prediction coefficients VQ address (12 bits, only for subbands with prediction enabled)
+    for (ch = xch_base; ch < s->nchannels; ch++)
+        for (band = 0; band < s->nsubbands[ch]; band++)
+            if (s->prediction_mode[ch][band])
+                s->prediction_vq_index[ch][band] = get_bits(&s->gb, 12);
+
+    // Bit allocation index (for the subbands below the high frequency VQ start)
+    for (ch = xch_base; ch < s->nchannels; ch++) {
+        int sel = s->bit_allocation_sel[ch];
+
+        for (band = 0; band < s->subband_vq_start[ch]; band++) {
+            int abits;
+
+            if (sel < 5)
+                abits = dca_get_vlc(&s->gb, &ff_dca_vlc_bit_allocation, sel);
+            else
+                abits = get_bits(&s->gb, sel - 1); // sel is 5 or 6 here (7 was rejected in the coding header): 4 or 5 bit fixed-length code
+
+            if (abits > DCA_ABITS_MAX) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid bit allocation index\n");
+                return AVERROR_INVALIDDATA;
+            }
+
+            s->bit_allocation[ch][band] = abits;
+        }
+    }
+
+    // Transition mode
+    for (ch = xch_base; ch < s->nchannels; ch++) {
+        // Clear transition mode for all subbands
+        memset(s->transition_mode[sf][ch], 0, sizeof(s->transition_mode[0][0]));
+
+        // Transient possible only if more than one subsubframe
+        if (s->nsubsubframes[sf] > 1) {
+            int sel = s->transition_mode_sel[ch];
+            for (band = 0; band < s->subband_vq_start[ch]; band++)
+                if (s->bit_allocation[ch][band])
+                    s->transition_mode[sf][ch][band] = dca_get_vlc(&s->gb, &ff_dca_vlc_transition_mode, sel);
+        }
+    }
+
+    // Scale factors
+    for (ch = xch_base; ch < s->nchannels; ch++) {
+        int sel = s->scale_factor_sel[ch];
+        int scale_index = 0; // running index: parse_scale() adds Huffman-coded differences to it from band to band
+
+        // Extract scales for subbands up to VQ (a second scale is read for subbands with a nonzero transition mode)
+        for (band = 0; band < s->subband_vq_start[ch]; band++) {
+            if (s->bit_allocation[ch][band]) {
+                if ((ret = parse_scale(s, &scale_index, sel)) < 0)
+                    return ret;
+                s->scale_factors[ch][band][0] = ret;
+                if (s->transition_mode[sf][ch][band]) {
+                    if ((ret = parse_scale(s, &scale_index, sel)) < 0)
+                        return ret;
+                    s->scale_factors[ch][band][1] = ret;
+                }
+            } else {
+                s->scale_factors[ch][band][0] = 0; // no bits allocated: nothing is read for this subband
+            }
+        }
+
+        // High frequency VQ subbands (always one scale each)
+        for (band = s->subband_vq_start[ch]; band < s->nsubbands[ch]; band++) {
+            if ((ret = parse_scale(s, &scale_index, sel)) < 0)
+                return ret;
+            s->scale_factors[ch][band][0] = ret;
+        }
+    }
+
+    // Joint subband codebook select (only for channels with a nonzero joint intensity index; value 7 is rejected)
+    for (ch = xch_base; ch < s->nchannels; ch++) {
+        if (s->joint_intensity_index[ch]) {
+            s->joint_scale_sel[ch] = get_bits(&s->gb, 3);
+            if (s->joint_scale_sel[ch] == 7) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid joint scale factor code book\n");
+                return AVERROR_INVALIDDATA;
+            }
+        }
+    }
+
+    // Scale factors for joint subband coding (only for the subbands the source channel has beyond this channel's own)
+    for (ch = xch_base; ch < s->nchannels; ch++) {
+        int src_ch = s->joint_intensity_index[ch] - 1; // the stored index is source channel + 1, so -1 here means no joint coding
+        if (src_ch >= 0) {
+            int sel = s->joint_scale_sel[ch];
+            for (band = s->nsubbands[ch]; band < s->nsubbands[src_ch]; band++) {
+                if ((ret = parse_joint_scale(s, sel)) < 0)
+                    return ret;
+                s->joint_scale_factors[ch][band] = ret;
+            }
+        }
+    }
+
+    // Dynamic range coefficient (core header only; skipped, not applied)
+    if (s->drc_present && header == HEADER_CORE)
+        skip_bits(&s->gb, 8);
+
+    // Side information CRC check word (skipped, not verified)
+    if (s->crc_present)
+        skip_bits(&s->gb, 16);
+
+    return 0;
+}
+
+#ifndef decode_blockcodes
+static inline int decode_blockcodes(int code1, int code2, int levels, int32_t *audio)
+{
+    // Each code packs half a block as base-`levels` digits; a nonzero remainder is returned as failure
+    int bias = (levels - 1) / 2, i = 0;
+
+    while (i < DCA_SUBBAND_SAMPLES / 2) {
+        int quot = FASTDIV(code1, levels);
+        audio[i++] = code1 - quot * levels - bias;
+        code1 = quot;
+    }
+    while (i < DCA_SUBBAND_SAMPLES) {
+        int quot = FASTDIV(code2, levels);
+        audio[i++] = code2 - quot * levels - bias;
+        code2 = quot;
+    }
+
+    return code1 | code2;
+}
+#endif
+
+static inline int parse_block_codes(DCACoreDecoder *s, int32_t *audio, int abits)
+{
+    // Extract block code indices from the bit stream; both codes of a block
+    // have the same width, which depends only on abits
+    const int nbits  = block_code_nbits[abits - 1];
+    const int levels = ff_dca_quant_levels[abits];
+    int first  = get_bits(&s->gb, nbits);
+    int second = get_bits(&s->gb, nbits);
+
+    // Unpack the samples; a nonzero result is treated as a decoding failure
+    if (!decode_blockcodes(first, second, levels, audio))
+        return 0;
+    av_log(s->avctx, AV_LOG_ERROR, "Failed to decode block code(s)\n");
+    return AVERROR_INVALIDDATA;
+}
+
+static inline int parse_huffman_codes(DCACoreDecoder *s, int32_t *audio, int abits, int sel)
+{
+    int32_t *const end = audio + DCA_SUBBAND_SAMPLES;
+
+    // One Huffman code per sample, from the code book group selected by abits
+    while (audio < end)
+        *audio++ = dca_get_vlc(&s->gb, &ff_dca_vlc_quant_index[abits - 1], sel);
+
+    return 1; // positive, unlike the other sample parsers, which return 0 on success
+}
+
+static inline int extract_audio(DCACoreDecoder *s, int32_t *audio, int abits, int ch)
+{
+    int has_code_book, sel;
+    av_assert1(abits >= 0 && abits <= DCA_ABITS_MAX);
+
+    // No bits allocated: the block is all zeros and nothing is read
+    if (!abits) {
+        memset(audio, 0, DCA_SUBBAND_SAMPLES * sizeof(*audio));
+        return 0;
+    }
+
+    has_code_book = abits <= DCA_CODE_BOOKS;
+    sel = has_code_book ? s->quant_index_sel[ch][abits - 1] : 0;
+
+    // Huffman codes, when the selected code book lies within the group for abits
+    if (has_code_book && sel < quant_index_group_size[abits - 1])
+        return parse_huffman_codes(s, audio, abits, sel);
+
+    // Block codes
+    if (has_code_book && abits <= 7)
+        return parse_block_codes(s, audio, abits);
+
+    // No further encoding
+    get_array(&s->gb, audio, DCA_SUBBAND_SAMPLES, abits - 3);
+    return 0;
+}
+
+static inline void dequantize(int32_t *output, const int32_t *input,
+                              int32_t step_size, int32_t scale, int residual)
+{
+    // Combined multiplier: quantizer step size times scale factor
+    int64_t gain = (int64_t)step_size * scale;
+    int i, bits = 22;
+
+    // Limit scale factor resolution to 22 bits; whatever is shifted out of the
+    // multiplier here is made up for by normalizing with fewer bits below
+    if (gain > (1 << 23)) {
+        int shift = av_log2(gain >> 23) + 1;
+        gain >>= shift;
+        bits  -= shift;
+    }
+
+    // Scale the samples; a residual is accumulated onto the existing output,
+    // anything else overwrites it
+    for (i = 0; i < DCA_SUBBAND_SAMPLES; i++) {
+        int32_t sample = clip23(norm__(input[i] * gain, bits));
+        output[i] = residual ? output[i] + sample : sample;
+    }
+}
+
+static inline void inverse_adpcm(int32_t **subband_samples,
+                                 const int16_t *vq_index,
+                                 const int8_t *prediction_mode,
+                                 int sb_start, int sb_end,
+                                 int ofs, int len)
+{
+    int band, n, tap;
+
+    for (band = sb_start; band < sb_end; band++) {
+        if (prediction_mode[band]) { // only subbands flagged in prediction_mode are touched
+            const int16_t *coeff = ff_dca_adpcm_vb[vq_index[band]];
+            int32_t *cur = subband_samples[band] + ofs;
+            for (n = 0; n < len; n++, cur++) {
+                int64_t pred = 0; // prediction from the DCA_ADPCM_COEFFS samples preceding cur, nearest first
+                for (tap = 1; tap <= DCA_ADPCM_COEFFS; tap++)
+                    pred += (int64_t)cur[-tap] * coeff[tap - 1];
+                *cur = clip23(*cur + clip23(norm13(pred)));
+            }
+        }
+    }
+}
+
+// 5.5 - Primary audio data arrays. Decodes the subband samples (and, for the core, LFE samples) of subframe sf for channels xch_base..s->nchannels-1; *sub_pos and *lfe_pos are running write offsets that are advanced as samples are stored. Returns 0 or a negative AVERROR code.
+static int parse_subframe_audio(DCACoreDecoder *s, int sf, enum HeaderType header,
+                                int xch_base, int *sub_pos, int *lfe_pos)
+{
+    int32_t audio[16], scale; // audio[] is scratch space shared by the LFE samples and the per-subband sample blocks
+    int n, ssf, ofs, ch, band;
+
+    // Check number of subband samples in this subframe
+    int nsamples = s->nsubsubframes[sf] * DCA_SUBBAND_SAMPLES;
+    if (*sub_pos + nsamples > s->npcmblocks) {
+        av_log(s->avctx, AV_LOG_ERROR, "Subband sample buffer overflow\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (get_bits_left(&s->gb) < 0) // the bit reader has already run past the end of its input
+        return AVERROR_INVALIDDATA;
+
+    // VQ encoded subbands
+    for (ch = xch_base; ch < s->nchannels; ch++) {
+        int32_t vq_index[DCA_SUBBANDS]; // only the entries from subband_vq_start[ch] up to nsubbands[ch] are filled in
+
+        for (band = s->subband_vq_start[ch]; band < s->nsubbands[ch]; band++)
+            // Extract the VQ address from the bit stream
+            vq_index[band] = get_bits(&s->gb, 10);
+
+        if (s->subband_vq_start[ch] < s->nsubbands[ch]) {
+            s->dcadsp->decode_hf(s->subband_samples[ch], vq_index,
+                                 ff_dca_high_freq_vq, s->scale_factors[ch],
+                                 s->subband_vq_start[ch], s->nsubbands[ch],
+                                 *sub_pos, nsamples);
+        }
+    }
+
+    // Low frequency effect data (core header only)
+    if (s->lfe_present && header == HEADER_CORE) {
+        unsigned int index;
+
+        // Determine number of LFE samples in this subframe
+        int nlfesamples = 2 * s->lfe_present * s->nsubsubframes[sf];
+        av_assert1((unsigned int)nlfesamples <= FF_ARRAY_ELEMS(audio));
+
+        // Extract LFE samples from the bit stream
+        get_array(&s->gb, audio, nlfesamples, 8);
+
+        // Extract scale factor index from the bit stream
+        index = get_bits(&s->gb, 8);
+        if (index >= FF_ARRAY_ELEMS(ff_dca_scale_factor_quant7)) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid LFE scale factor index\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        // Look up the 7-bit root square quantization table
+        scale = ff_dca_scale_factor_quant7[index];
+
+        // Account for quantizer step size which is 0.035
+        scale = mul23(4697620 /* 0.035 * (1 << 27) */, scale);
+
+        // Scale and take the LFE samples
+        for (n = 0, ofs = *lfe_pos; n < nlfesamples; n++, ofs++)
+            s->lfe_samples[ofs] = clip23(audio[n] * scale >> 4);
+
+        // Advance LFE sample pointer for the next subframe
+        *lfe_pos = ofs;
+    }
+
+    // Audio data (subsubframe by subsubframe, DCA_SUBBAND_SAMPLES samples per subband each)
+    for (ssf = 0, ofs = *sub_pos; ssf < s->nsubsubframes[sf]; ssf++) {
+        for (ch = xch_base; ch < s->nchannels; ch++) {
+            if (get_bits_left(&s->gb) < 0)
+                return AVERROR_INVALIDDATA;
+
+            // Not high frequency VQ subbands
+            for (band = 0; band < s->subband_vq_start[ch]; band++) {
+                int ret, trans_ssf, abits = s->bit_allocation[ch][band];
+                int32_t step_size;
+
+                // Extract bits from the bit stream
+                if ((ret = extract_audio(s, audio, abits, ch)) < 0)
+                    return ret;
+
+                // Select quantization step size table and look up
+                // quantization step size
+                if (s->bit_rate == 3)
+                    step_size = ff_dca_lossless_quant[abits];
+                else
+                    step_size = ff_dca_lossy_quant[abits];
+
+                // Identify transient location
+                trans_ssf = s->transition_mode[sf][ch][band];
+
+                // Determine proper scale factor (the second one applies from subsubframe trans_ssf onwards)
+                if (trans_ssf == 0 || ssf < trans_ssf)
+                    scale = s->scale_factors[ch][band][0];
+                else
+                    scale = s->scale_factors[ch][band][1];
+
+                // Adjust scale factor when SEL indicates Huffman code (extract_audio() returns > 0 only in that case)
+                if (ret > 0) {
+                    int64_t adj = s->scale_factor_adj[ch][abits - 1];
+                    scale = clip23(adj * scale >> 22);
+                }
+
+                dequantize(s->subband_samples[ch][band] + ofs,
+                           audio, step_size, scale, 0);
+            }
+        }
+
+        // DSYNC: 0xffff marker expected after the last subsubframe, or after every one when sync_ssf is set
+        if ((ssf == s->nsubsubframes[sf] - 1 || s->sync_ssf) && get_bits(&s->gb, 16) != 0xffff) {
+            av_log(s->avctx, AV_LOG_ERROR, "DSYNC check failed\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        ofs += DCA_SUBBAND_SAMPLES;
+    }
+
+    // Inverse ADPCM (applied over the whole subframe once all its subsubframes are dequantized)
+    for (ch = xch_base; ch < s->nchannels; ch++) {
+        inverse_adpcm(s->subband_samples[ch], s->prediction_vq_index[ch],
+                      s->prediction_mode[ch], 0, s->nsubbands[ch],
+                      *sub_pos, nsamples);
+    }
+
+    // Joint subband coding (only for channels with a nonzero joint intensity index)
+    for (ch = xch_base; ch < s->nchannels; ch++) {
+        int src_ch = s->joint_intensity_index[ch] - 1;
+        if (src_ch >= 0) {
+            s->dcadsp->decode_joint(s->subband_samples[ch], s->subband_samples[src_ch],
+                                    s->joint_scale_factors[ch], s->nsubbands[ch],
+                                    s->nsubbands[src_ch], *sub_pos, nsamples);
+        }
+    }
+
+    // Advance subband sample pointer for the next subframe
+    *sub_pos = ofs;
+    return 0;
+}
+
+static void erase_adpcm_history(DCACoreDecoder *s)
+{
+    int i;
+
+    // Zero the 128 bits of ADPCM history that start DCA_ADPCM_COEFFS samples
+    // ahead of each subband buffer, so nothing of the previous frame remains
+    for (i = 0; i < DCA_CHANNELS * DCA_SUBBANDS; i++) {
+        int32_t *history = s->subband_samples[i / DCA_SUBBANDS][i % DCA_SUBBANDS];
+        AV_ZERO128(history - DCA_ADPCM_COEFFS);
+    }
+    emms_c();
+}
+
+static int alloc_sample_buffer(DCACoreDecoder *s)
+{
+    const int per_band  = DCA_ADPCM_COEFFS + s->npcmblocks; // history plus data samples of one subband
+    const int all_bands = per_band * DCA_CHANNELS * DCA_SUBBANDS;
+    const int lfe_count = DCA_LFE_HISTORY + s->npcmblocks / 2;
+    const unsigned int old_size = s->subband_size;
+    int i;
+
+    // Reallocate subband sample buffer
+    av_fast_mallocz(&s->subband_buffer, &s->subband_size,
+                    (all_bands + lfe_count) * sizeof(int32_t));
+    if (!s->subband_buffer)
+        return AVERROR(ENOMEM);
+
+    // The pointers into the buffer are rebuilt only when its size has changed
+    if (old_size != s->subband_size) {
+        for (i = 0; i < DCA_CHANNELS * DCA_SUBBANDS; i++)
+            s->subband_samples[i / DCA_SUBBANDS][i % DCA_SUBBANDS] =
+                s->subband_buffer + i * per_band + DCA_ADPCM_COEFFS;
+        s->lfe_samples = s->subband_buffer + all_bands;
+    }
+
+    if (!s->predictor_history)
+        erase_adpcm_history(s);
+
+    return 0;
+}
+
+static int parse_frame_data(DCACoreDecoder *s, enum HeaderType header, int xch_base)
+{
+    int sub_pos = 0, lfe_pos = DCA_LFE_HISTORY;
+    int sf, ch, band, ret;
+
+    // Coding header first, then side information and audio of every subframe
+    ret = parse_coding_header(s, header, xch_base);
+    if (ret < 0)
+        return ret;
+
+    for (sf = 0; sf < s->nsubframes; sf++) {
+        ret = parse_subframe_header(s, sf, header, xch_base);
+        if (ret >= 0)
+            ret = parse_subframe_audio(s, sf, header, xch_base, &sub_pos, &lfe_pos);
+        if (ret < 0)
+            return ret;
+    }
+
+    for (ch = xch_base; ch < s->nchannels; ch++) {
+        // A jointly coded channel is active up to its source channel's subband count
+        const int joint = s->joint_intensity_index[ch];
+        int nactive = s->nsubbands[ch];
+        if (joint && s->nsubbands[joint - 1] > nactive)
+            nactive = s->nsubbands[joint - 1];
+
+        for (band = 0; band < DCA_SUBBANDS; band++) {
+            int32_t *samples = s->subband_samples[ch][band] - DCA_ADPCM_COEFFS;
+            if (band < nactive) // active: the end of this frame becomes the ADPCM history
+                AV_COPY128(samples, samples + s->npcmblocks);
+            else                // inactive: clear history and samples alike
+                memset(samples, 0, (DCA_ADPCM_COEFFS + s->npcmblocks) * sizeof(int32_t));
+        }
+    }
+
+    emms_c();
+    return 0;
+}
+
+static int parse_xch_frame(DCACoreDecoder *s)
+{
+    int ret;
+
+    // XCH contributes the Cs channel, so the layout must not contain it yet
+    if (s->ch_mask & DCA_SPEAKER_MASK_Cs) {
+        av_log(s->avctx, AV_LOG_ERROR, "XCH with Cs speaker already present\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    ret = parse_frame_data(s, HEADER_XCH, s->nchannels); // the new channel goes after the current ones
+    if (ret < 0)
+        return ret;
+
+    // Seek to the end of core frame, don't trust XCH frame size
+    if (!ff_dca_seek_bits(&s->gb, s->frame_size * 8))
+        return 0;
+    av_log(s->avctx, AV_LOG_ERROR, "Read past end of XCH frame\n");
+    return AVERROR_INVALIDDATA;
+}
+
+static int parse_xxch_frame(DCACoreDecoder *s) // Parses the XXCH frame header, then channel set 0 on top of the current channels; returns 0 or a negative AVERROR code
+{
+    int xxch_nchsets, xxch_frame_size;
+    int ret, mask, header_size, header_pos = get_bits_count(&s->gb); // header_pos: bit position of the sync word, reference point for the CRC check and seeks
+
+    // XXCH sync word
+    if (get_bits_long(&s->gb, 32) != DCA_SYNCWORD_XXCH) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid XXCH sync word\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // XXCH frame header length (in bytes, see the * 8 where it is used)
+    header_size = get_bits(&s->gb, 6) + 1;
+
+    // Check XXCH frame header CRC (the checked range starts after the 32-bit sync word)
+    if (ff_dca_check_crc(s->avctx, &s->gb, header_pos + 32, header_pos + header_size * 8)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid XXCH frame header checksum\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // CRC presence flag for channel set header
+    s->xxch_crc_present = get_bits1(&s->gb);
+
+    // Number of bits for loudspeaker mask (must exceed DCA_SPEAKER_Cs, which the channel set header subtracts from it)
+    s->xxch_mask_nbits = get_bits(&s->gb, 5) + 1;
+    if (s->xxch_mask_nbits <= DCA_SPEAKER_Cs) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid number of bits for XXCH speaker mask (%d)\n", s->xxch_mask_nbits);
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Number of channel sets (only a single channel set is supported)
+    xxch_nchsets = get_bits(&s->gb, 2) + 1;
+    if (xxch_nchsets > 1) {
+        avpriv_request_sample(s->avctx, "%d XXCH channel sets", xxch_nchsets);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    // Channel set 0 data byte size
+    xxch_frame_size = get_bits(&s->gb, 14) + 1;
+
+    // Core loudspeaker activity mask
+    s->xxch_core_mask = get_bits_long(&s->gb, s->xxch_mask_nbits);
+
+    // Validate the core mask against the current channel mask, accepting Ls/Rs signalled as Lss/Rss
+    mask = s->ch_mask;
+
+    if ((mask & DCA_SPEAKER_MASK_Ls) && (s->xxch_core_mask & DCA_SPEAKER_MASK_Lss))
+        mask = (mask & ~DCA_SPEAKER_MASK_Ls) | DCA_SPEAKER_MASK_Lss;
+
+    if ((mask & DCA_SPEAKER_MASK_Rs) && (s->xxch_core_mask & DCA_SPEAKER_MASK_Rss))
+        mask = (mask & ~DCA_SPEAKER_MASK_Rs) | DCA_SPEAKER_MASK_Rss;
+
+    if (mask != s->xxch_core_mask) {
+        av_log(s->avctx, AV_LOG_ERROR, "XXCH core speaker activity mask (%#x) disagrees with core (%#x)\n", s->xxch_core_mask, mask);
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Move to header_pos + header_size * 8, which passes over:
+    //   reserved bits and byte alignment
+    //   CRC16 of XXCH frame header
+    if (ff_dca_seek_bits(&s->gb, header_pos + header_size * 8)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Read past end of XXCH frame header\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Parse XXCH channel set 0 (xch_base = s->nchannels: its channels are placed after the current ones)
+    if ((ret = parse_frame_data(s, HEADER_XXCH, s->nchannels)) < 0)
+        return ret;
+
+    if (ff_dca_seek_bits(&s->gb, header_pos + header_size * 8 + xxch_frame_size * 8)) { // position after the frame header plus the announced channel set size
+        av_log(s->avctx, AV_LOG_ERROR, "Read past end of XXCH channel set\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
+static int parse_xbr_subframe(DCACoreDecoder *s, int xbr_base_ch, int xbr_nchannels,
+                              int *xbr_nsubbands, int xbr_transition_mode, int sf, int *sub_pos)
+{
+    int     xbr_nabits[DCA_CHANNELS];
+    int     xbr_bit_allocation[DCA_CHANNELS][DCA_SUBBANDS];
+    int     xbr_scale_nbits[DCA_CHANNELS];
+    int32_t xbr_scale_factors[DCA_CHANNELS][DCA_SUBBANDS][2];
+    int     ssf, ch, band, ofs;
+
+    // Check number of subband samples in this subframe
+    if (*sub_pos + s->nsubsubframes[sf] * DCA_SUBBAND_SAMPLES > s->npcmblocks) {
+        av_log(s->avctx, AV_LOG_ERROR, "Subband sample buffer overflow\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (get_bits_left(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
+
+    // Number of bits for XBR bit allocation index
+    for (ch = xbr_base_ch; ch < xbr_nchannels; ch++)
+        xbr_nabits[ch] = get_bits(&s->gb, 2) + 2;
+
+    // XBR bit allocation index
+    for (ch = xbr_base_ch; ch < xbr_nchannels; ch++) {
+        for (band = 0; band < xbr_nsubbands[ch]; band++) {
+            xbr_bit_allocation[ch][band] = get_bits(&s->gb, xbr_nabits[ch]);
+            if (xbr_bit_allocation[ch][band] > DCA_ABITS_MAX) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid XBR bit allocation index\n");
+                return AVERROR_INVALIDDATA;
+            }
+        }
+    }
+
+    // Number of bits for scale indices
+    for (ch = xbr_base_ch; ch < xbr_nchannels; ch++) {
+        xbr_scale_nbits[ch] = get_bits(&s->gb, 3);
+        if (!xbr_scale_nbits[ch]) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid number of bits for XBR scale factor index\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    // XBR scale factors
+    for (ch = xbr_base_ch; ch < xbr_nchannels; ch++) {
+        const uint32_t *scale_table;
+        int scale_size;
+
+        // Select the root square table
+        if (s->scale_factor_sel[ch] > 5) {
+            scale_table = ff_dca_scale_factor_quant7;
+            scale_size = FF_ARRAY_ELEMS(ff_dca_scale_factor_quant7);
+        } else {
+            scale_table = ff_dca_scale_factor_quant6;
+            scale_size = FF_ARRAY_ELEMS(ff_dca_scale_factor_quant6);
+        }
+
+        // Parse scale factor indices and look up scale factors from the root
+        // square table
+        for (band = 0; band < xbr_nsubbands[ch]; band++) {
+            if (xbr_bit_allocation[ch][band]) {
+                int scale_index = get_bits(&s->gb, xbr_scale_nbits[ch]);
+                if (scale_index >= scale_size) {
+                    av_log(s->avctx, AV_LOG_ERROR, "Invalid XBR scale factor index\n");
+                    return AVERROR_INVALIDDATA;
+                }
+                xbr_scale_factors[ch][band][0] = scale_table[scale_index];
+                if (xbr_transition_mode && s->transition_mode[sf][ch][band]) {
+                    scale_index = get_bits(&s->gb, xbr_scale_nbits[ch]);
+                    if (scale_index >= scale_size) {
+                        av_log(s->avctx, AV_LOG_ERROR, "Invalid XBR scale factor index\n");
+                        return AVERROR_INVALIDDATA;
+                    }
+                    xbr_scale_factors[ch][band][1] = scale_table[scale_index];
+                }
+            }
+        }
+    }
+
+    // Audio data
+    for (ssf = 0, ofs = *sub_pos; ssf < s->nsubsubframes[sf]; ssf++) {
+        for (ch = xbr_base_ch; ch < xbr_nchannels; ch++) {
+            if (get_bits_left(&s->gb) < 0)
+                return AVERROR_INVALIDDATA;
+
+            for (band = 0; band < xbr_nsubbands[ch]; band++) {
+                int ret, trans_ssf, abits = xbr_bit_allocation[ch][band];
+                int32_t audio[DCA_SUBBAND_SAMPLES], step_size, scale;
+
+                // Extract bits from the bit stream
+                if (abits > 7) {
+                    // No further encoding
+                    get_array(&s->gb, audio, DCA_SUBBAND_SAMPLES, abits - 3);
+                } else if (abits > 0) {
+                    // Block codes
+                    if ((ret = parse_block_codes(s, audio, abits)) < 0)
+                        return ret;
+                } else {
+                    // No bits allocated
+                    continue;
+                }
+
+                // Look up quantization step size
+                step_size = ff_dca_lossless_quant[abits];
+
+                // Identify transient location
+                if (xbr_transition_mode)
+                    trans_ssf = s->transition_mode[sf][ch][band];
+                else
+                    trans_ssf = 0;
+
+                // Determine proper scale factor
+                if (trans_ssf == 0 || ssf < trans_ssf)
+                    scale = xbr_scale_factors[ch][band][0];
+                else
+                    scale = xbr_scale_factors[ch][band][1];
+
+                dequantize(s->subband_samples[ch][band] + ofs,
+                           audio, step_size, scale, 1);
+            }
+        }
+
+        // DSYNC
+        if ((ssf == s->nsubsubframes[sf] - 1 || s->sync_ssf) && get_bits(&s->gb, 16) != 0xffff) {
+            av_log(s->avctx, AV_LOG_ERROR, "XBR-DSYNC check failed\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        ofs += DCA_SUBBAND_SAMPLES;
+    }
+
+    // Advance subband sample pointer for the next subframe
+    *sub_pos = ofs;
+    return 0;
+}
+
+// Parse one XBR extension frame starting at the current position of s->gb.
+// The frame header is read and its checksum verified, then every channel set
+// whose channels fit within s->nchannels is decoded subframe by subframe with
+// parse_xbr_subframe(); the data of any other channel set is skipped over.
+// Returns 0 on success or a negative error code on failure.
+static int parse_xbr_frame(DCACoreDecoder *s)
+{
+    int     xbr_frame_size[DCA_EXSS_CHSETS_MAX];
+    int     xbr_nchannels[DCA_EXSS_CHSETS_MAX];
+    int     xbr_nsubbands[DCA_EXSS_CHSETS_MAX * DCA_EXSS_CHANNELS_MAX];
+    int     xbr_nchsets, xbr_transition_mode, xbr_band_nbits, xbr_base_ch;
+    // header_pos is the bit position of the sync word. header_size is counted
+    // in bytes, hence the multiplication by 8 wherever it is used below.
+    int     i, ch1, ch2, ret, header_size, header_pos = get_bits_count(&s->gb);
+
+    // XBR sync word
+    if (get_bits_long(&s->gb, 32) != DCA_SYNCWORD_XBR) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid XBR sync word\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // XBR frame header length
+    header_size = get_bits(&s->gb, 6) + 1;
+
+    // Check XBR frame header CRC (the checked range starts right after the
+    // 32-bit sync word and ends at the end of the header)
+    if (ff_dca_check_crc(s->avctx, &s->gb, header_pos + 32, header_pos + header_size * 8)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid XBR frame header checksum\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Number of channel sets (1 to 4)
+    xbr_nchsets = get_bits(&s->gb, 2) + 1;
+
+    // Channel set data byte size
+    for (i = 0; i < xbr_nchsets; i++)
+        xbr_frame_size[i] = get_bits(&s->gb, 14) + 1;
+
+    // Transition mode flag
+    xbr_transition_mode = get_bits1(&s->gb);
+
+    // Channel set headers. ch1 counts channels within the current channel
+    // set, ch2 counts channels across all channel sets and is the index into
+    // xbr_nsubbands.
+    for (i = 0, ch2 = 0; i < xbr_nchsets; i++) {
+        xbr_nchannels[i] = get_bits(&s->gb, 3) + 1;
+        xbr_band_nbits = get_bits(&s->gb, 2) + 5;
+        for (ch1 = 0; ch1 < xbr_nchannels[i]; ch1++, ch2++) {
+            xbr_nsubbands[ch2] = get_bits(&s->gb, xbr_band_nbits) + 1;
+            if (xbr_nsubbands[ch2] > DCA_SUBBANDS) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid number of active XBR subbands (%d)\n", xbr_nsubbands[ch2]);
+                return AVERROR_INVALIDDATA;
+            }
+        }
+    }
+
+    // Reserved
+    // Byte align
+    // CRC16 of XBR frame header
+    // (none of the above is read; the reader simply jumps to the header end)
+    if (ff_dca_seek_bits(&s->gb, header_pos + header_size * 8)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Read past end of XBR frame header\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Channel set data
+    for (i = 0, xbr_base_ch = 0; i < xbr_nchsets; i++) {
+        // From here on header_pos marks the start of this channel set's data
+        header_pos = get_bits_count(&s->gb);
+
+        // Decode the channel set only if all of its channels fit within
+        // s->nchannels; otherwise its data is skipped by the seek below
+        if (xbr_base_ch + xbr_nchannels[i] <= s->nchannels) {
+            int sf, sub_pos;
+
+            // sub_pos is the running sample offset, advanced by each subframe
+            for (sf = 0, sub_pos = 0; sf < s->nsubframes; sf++) {
+                if ((ret = parse_xbr_subframe(s, xbr_base_ch,
+                                              xbr_base_ch + xbr_nchannels[i],
+                                              xbr_nsubbands, xbr_transition_mode,
+                                              sf, &sub_pos)) < 0)
+                    return ret;
+            }
+        }
+
+        // The base channel advances even when the channel set was skipped
+        xbr_base_ch += xbr_nchannels[i];
+
+        // Jump to the end of this channel set as given by its byte size
+        if (ff_dca_seek_bits(&s->gb, header_pos + xbr_frame_size[i] * 8)) {
+            av_log(s->avctx, AV_LOG_ERROR, "Read past end of XBR channel set\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    return 0;
+}
+
+// Linear congruential generator derived from the ISO/IEC 9899 example.
+// Advances the generator state kept in s->x96_rand by one step and returns
+// a pseudorandom integer in range [-2^30, 2^30 - 1].
+static int rand_x96(DCACoreDecoder *s)
+{
+    // Step the state: multiply, then add the increment
+    s->x96_rand *= 1103515245U;
+    s->x96_rand += 12345U;
+
+    // Keep the low 31 bits and re-centre them around zero
+    return (s->x96_rand & 0x7fffffff) - 0x40000000;
+}
+
+// Decode the X96 subband samples of subframe sf for channels
+// [xch_base, s->x96_nchannels) and subbands [s->x96_subband_start,
+// s->nsubbands[ch]). Samples are written to s->x96_subband_samples starting
+// at sample offset *sub_pos, which is advanced past this subframe on success.
+// Returns 0 on success or a negative error code on failure.
+static int parse_x96_subframe_audio(DCACoreDecoder *s, int sf, int xch_base, int *sub_pos)
+{
+    int n, ssf, ch, band, ofs;
+
+    // Check number of subband samples in this subframe
+    int nsamples = s->nsubsubframes[sf] * DCA_SUBBAND_SAMPLES;
+    if (*sub_pos + nsamples > s->npcmblocks) {
+        av_log(s->avctx, AV_LOG_ERROR, "Subband sample buffer overflow\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (get_bits_left(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
+
+    // VQ encoded or unallocated subbands. These are handled for the whole
+    // subframe at once, before the per-subsubframe audio data is read.
+    for (ch = xch_base; ch < s->x96_nchannels; ch++) {
+        for (band = s->x96_subband_start; band < s->nsubbands[ch]; band++) {
+            // Get the sample pointer and scale factor
+            int32_t *samples = s->x96_subband_samples[ch][band] + *sub_pos;
+            int32_t scale    = s->scale_factors[ch][band >> 1][band & 1];
+
+            // Allocation values other than 0 and 1 are decoded in the audio
+            // data loop below, hence no default case
+            switch (s->bit_allocation[ch][band]) {
+            case 0: // No bits allocated for subband
+                // scale <= 1: fill with zeros instead of generating noise
+                if (scale <= 1)
+                    memset(samples, 0, nsamples * sizeof(int32_t));
+                else for (n = 0; n < nsamples; n++)
+                    // Generate scaled random samples
+                    samples[n] = mul31(rand_x96(s), scale);
+                break;
+
+            case 1: // VQ encoded subband
+                // One VQ address is read per 16 samples (rounded up)
+                // NOTE(review): the (nsubsubframes + 1) / 2 count matches
+                // that only if DCA_SUBBAND_SAMPLES == 8 -- confirm
+                for (ssf = 0; ssf < (s->nsubsubframes[sf] + 1) / 2; ssf++) {
+                    // Extract the VQ address from the bit stream and look up
+                    // the VQ code book for up to 16 subband samples
+                    const int8_t *vq_samples = ff_dca_high_freq_vq[get_bits(&s->gb, 10)];
+                    // Scale and take the samples. The last vector is cut
+                    // short when fewer than 16 samples remain. Note that +
+                    // binds tighter than >>, so this is a rounding shift:
+                    // (vq * scale + 8) >> 4.
+                    for (n = 0; n < FFMIN(nsamples - ssf * 16, 16); n++)
+                        *samples++ = clip23(vq_samples[n] * scale + (1 << 3) >> 4);
+                }
+                break;
+            }
+        }
+    }
+
+    // Audio data, one block of DCA_SUBBAND_SAMPLES samples per subsubframe
+    for (ssf = 0, ofs = *sub_pos; ssf < s->nsubsubframes[sf]; ssf++) {
+        for (ch = xch_base; ch < s->x96_nchannels; ch++) {
+            if (get_bits_left(&s->gb) < 0)
+                return AVERROR_INVALIDDATA;
+
+            for (band = s->x96_subband_start; band < s->nsubbands[ch]; band++) {
+                int ret, abits = s->bit_allocation[ch][band] - 1;
+                int32_t audio[DCA_SUBBAND_SAMPLES], step_size, scale;
+
+                // Skip unallocated (0) and VQ encoded (1) subbands, which
+                // were already handled above
+                if (abits < 1)
+                    continue;
+
+                // Extract bits from the bit stream
+                if ((ret = extract_audio(s, audio, abits, ch)) < 0)
+                    return ret;
+
+                // Select quantization step size table and look up quantization
+                // step size (bit rate code 3 selects the lossless table)
+                if (s->bit_rate == 3)
+                    step_size = ff_dca_lossless_quant[abits];
+                else
+                    step_size = ff_dca_lossy_quant[abits];
+
+                // Get the scale factor
+                scale = s->scale_factors[ch][band >> 1][band & 1];
+
+                dequantize(s->x96_subband_samples[ch][band] + ofs,
+                           audio, step_size, scale, 0);
+            }
+        }
+
+        // DSYNC: a 16-bit word that must read 0xffff. It is checked after the
+        // last subsubframe, or after every subsubframe when s->sync_ssf is set.
+        if ((ssf == s->nsubsubframes[sf] - 1 || s->sync_ssf) && get_bits(&s->gb, 16) != 0xffff) {
+            av_log(s->avctx, AV_LOG_ERROR, "X96-DSYNC check failed\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        ofs += DCA_SUBBAND_SAMPLES;
+    }
+
+    // Inverse ADPCM over the samples of this subframe
+    for (ch = xch_base; ch < s->x96_nchannels; ch++) {
+        inverse_adpcm(s->x96_subband_samples[ch], s->prediction_vq_index[ch],
+                      s->prediction_mode[ch], s->x96_subband_start, s->nsubbands[ch],
+                      *sub_pos, nsamples);
+    }
+
+    // Joint subband coding. joint_intensity_index is 1-based; 0 (src_ch < 0)
+    // means the channel does not use joint coding.
+    for (ch = xch_base; ch < s->x96_nchannels; ch++) {
+        int src_ch = s->joint_intensity_index[ch] - 1;
+        if (src_ch >= 0) {
+            s->dcadsp->decode_joint(s->x96_subband_samples[ch], s->x96_subband_samples[src_ch],
+                                    s->joint_scale_factors[ch], s->nsubbands[ch],
+                                    s->nsubbands[src_ch], *sub_pos, nsamples);
+        }
+    }
+
+    // Advance subband sample pointer for the next subframe
+    *sub_pos = ofs;
+    return 0;
+}
+
+// Clear the ADPCM history of every X96 subband of every channel. The history
+// is the 128 bits stored DCA_ADPCM_COEFFS samples in front of each subband
+// sample buffer. The caller decides when this is needed (it is used when the
+// predictor history switch is disabled).
+static void erase_x96_adpcm_history(DCACoreDecoder *s)
+{
+    int c, b;
+
+    for (c = 0; c < DCA_CHANNELS; c++) {
+        for (b = 0; b < DCA_SUBBANDS_X96; b++) {
+            int32_t *history = s->x96_subband_samples[c][b] - DCA_ADPCM_COEFFS;
+            AV_ZERO128(history);
+        }
+    }
+
+    emms_c();
+}
+
+// Make sure the X96 subband sample buffer is large enough for the current
+// frame. For every channel and X96 subband the buffer holds DCA_ADPCM_COEFFS
+// history samples followed by s->npcmblocks samples. The per-subband pointers
+// in s->x96_subband_samples are recomputed only when the allocated size has
+// changed, and the ADPCM history is erased unless s->predictor_history is set.
+// Returns 0 on success or AVERROR(ENOMEM) if the allocation fails.
+static int alloc_x96_sample_buffer(DCACoreDecoder *s)
+{
+    unsigned int old_size = s->x96_subband_size;
+    int band_len  = DCA_ADPCM_COEFFS + s->npcmblocks;
+    int total_len = band_len * DCA_CHANNELS * DCA_SUBBANDS_X96;
+    int ch, band, ofs;
+
+    // Grow (and zero) the buffer if needed
+    av_fast_mallocz(&s->x96_subband_buffer, &s->x96_subband_size,
+                    total_len * sizeof(int32_t));
+    if (!s->x96_subband_buffer)
+        return AVERROR(ENOMEM);
+
+    // Lay out the subbands back to back; each pointer skips its own history
+    if (old_size != s->x96_subband_size) {
+        ofs = DCA_ADPCM_COEFFS;
+        for (ch = 0; ch < DCA_CHANNELS; ch++) {
+            for (band = 0; band < DCA_SUBBANDS_X96; band++) {
+                s->x96_subband_samples[ch][band] = s->x96_subband_buffer + ofs;
+                ofs += band_len;
+            }
+        }
+    }
+
+    if (!s->predictor_history)
+        erase_x96_adpcm_history(s);
+
+    return 0;
+}
+
+// Parse the side information of one X96 subframe for channels
+// [xch_base, s->x96_nchannels) and subbands [s->x96_subband_start,
+// s->nsubbands[ch]): prediction modes and prediction VQ addresses, bit
+// allocation indices, scale factors and joint subband scale factors.
+// Everything parsed is stored in the corresponding arrays of s.
+// Returns 0 on success or a negative error code on failure.
+static int parse_x96_subframe_header(DCACoreDecoder *s, int xch_base)
+{
+    int ch, band, ret;
+
+    if (get_bits_left(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
+
+    // Prediction mode (one flag per subband)
+    for (ch = xch_base; ch < s->x96_nchannels; ch++)
+        for (band = s->x96_subband_start; band < s->nsubbands[ch]; band++)
+            s->prediction_mode[ch][band] = get_bits1(&s->gb);
+
+    // Prediction coefficients VQ address, present only for subbands whose
+    // prediction mode flag is set
+    for (ch = xch_base; ch < s->x96_nchannels; ch++)
+        for (band = s->x96_subband_start; band < s->nsubbands[ch]; band++)
+            if (s->prediction_mode[ch][band])
+                s->prediction_vq_index[ch][band] = get_bits(&s->gb, 12);
+
+    // Bit allocation index
+    for (ch = xch_base; ch < s->x96_nchannels; ch++) {
+        int sel = s->bit_allocation_sel[ch];
+        // abits is kept across bands so that coded differences accumulate
+        int abits = 0;
+
+        for (band = s->x96_subband_start; band < s->nsubbands[ch]; band++) {
+            // If Huffman code was used, the difference of abits was encoded
+            if (sel < 7)
+                abits += dca_get_vlc(&s->gb, &ff_dca_vlc_quant_index[5 + 2 * s->x96_high_res], sel);
+            else
+                abits = get_bits(&s->gb, 3 + s->x96_high_res);
+
+            // Valid range is 0..7, or 0..15 when x96_high_res is 1
+            if (abits < 0 || abits > 7 + 8 * s->x96_high_res) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid X96 bit allocation index\n");
+                return AVERROR_INVALIDDATA;
+            }
+
+            s->bit_allocation[ch][band] = abits;
+        }
+    }
+
+    // Scale factors
+    for (ch = xch_base; ch < s->x96_nchannels; ch++) {
+        int sel = s->scale_factor_sel[ch];
+        // Passed by pointer to parse_scale() and carried over between bands
+        int scale_index = 0;
+
+        // Extract scales for subbands which are transmitted even for
+        // unallocated subbands
+        for (band = s->x96_subband_start; band < s->nsubbands[ch]; band++) {
+            if ((ret = parse_scale(s, &scale_index, sel)) < 0)
+                return ret;
+            // Two consecutive bands share one row of scale_factors:
+            // band >> 1 selects the row and band & 1 the slot within it
+            s->scale_factors[ch][band >> 1][band & 1] = ret;
+        }
+    }
+
+    // Joint subband codebook select (only for channels that use joint coding)
+    for (ch = xch_base; ch < s->x96_nchannels; ch++) {
+        if (s->joint_intensity_index[ch]) {
+            s->joint_scale_sel[ch] = get_bits(&s->gb, 3);
+            if (s->joint_scale_sel[ch] == 7) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid X96 joint scale factor code book\n");
+                return AVERROR_INVALIDDATA;
+            }
+        }
+    }
+
+    // Scale factors for joint subband coding. joint_intensity_index is
+    // 1-based, 0 meaning no joint coding. Only the bands above this channel's
+    // own range, up to the source channel's range, carry a joint scale factor.
+    for (ch = xch_base; ch < s->x96_nchannels; ch++) {
+        int src_ch = s->joint_intensity_index[ch] - 1;
+        if (src_ch >= 0) {
+            int sel = s->joint_scale_sel[ch];
+            for (band = s->nsubbands[ch]; band < s->nsubbands[src_ch]; band++) {
+                if ((ret = parse_joint_scale(s, sel)) < 0)
+                    return ret;
+                s->joint_scale_factors[ch][band] = ret;
+            }
+        }
+    }
+
+    // Side information CRC check word (skipped, not verified)
+    if (s->crc_present)
+        skip_bits(&s->gb, 16);
+
+    return 0;
+}
+
+// Parse the X96 coding header for channels [xch_base, s->x96_nchannels).
+// When exss is non-zero the header is a channel set header that starts with
+// its own length and is CRC checked if s->x96_crc_present is set; parsing
+// then ends with a seek to the end of the header. When exss is zero there is
+// no length field and only a trailing 16-bit CRC word is skipped if
+// s->crc_present is set.
+// Returns 0 on success or a negative error code on failure.
+static int parse_x96_coding_header(DCACoreDecoder *s, int exss, int xch_base)
+{
+    // header_size is in bytes and stays 0 unless exss is set
+    int n, ch, header_size = 0, header_pos = get_bits_count(&s->gb);
+
+    if (get_bits_left(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
+
+    if (exss) {
+        // Channel set header length
+        header_size = get_bits(&s->gb, 7) + 1;
+
+        // Check CRC
+        if (s->x96_crc_present
+            && ff_dca_check_crc(s->avctx, &s->gb, header_pos, header_pos + header_size * 8)) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid X96 channel set header checksum\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    // High resolution flag
+    s->x96_high_res = get_bits1(&s->gb);
+
+    // First encoded subband: transmitted (at most 27) for revisions below 8,
+    // fixed to DCA_SUBBANDS otherwise
+    if (s->x96_rev_no < 8) {
+        s->x96_subband_start = get_bits(&s->gb, 5);
+        if (s->x96_subband_start > 27) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid X96 subband start index (%d)\n", s->x96_subband_start);
+            return AVERROR_INVALIDDATA;
+        }
+    } else {
+        s->x96_subband_start = DCA_SUBBANDS;
+    }
+
+    // Subband activity count (must be at least DCA_SUBBANDS)
+    for (ch = xch_base; ch < s->x96_nchannels; ch++) {
+        s->nsubbands[ch] = get_bits(&s->gb, 6) + 1;
+        if (s->nsubbands[ch] < DCA_SUBBANDS) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid X96 subband activity count (%d)\n", s->nsubbands[ch]);
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    // Joint intensity coding index. 0 is stored as is; a non-zero index is
+    // offset by xch_base - 1 when xch_base is non-zero. The result is 1-based
+    // and therefore may equal, but not exceed, s->x96_nchannels.
+    for (ch = xch_base; ch < s->x96_nchannels; ch++) {
+        if ((n = get_bits(&s->gb, 3)) && xch_base)
+            n += xch_base - 1;
+        if (n > s->x96_nchannels) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid X96 joint intensity coding index\n");
+            return AVERROR_INVALIDDATA;
+        }
+        s->joint_intensity_index[ch] = n;
+    }
+
+    // Scale factor code book (values 6 and 7 are rejected)
+    for (ch = xch_base; ch < s->x96_nchannels; ch++) {
+        s->scale_factor_sel[ch] = get_bits(&s->gb, 3);
+        if (s->scale_factor_sel[ch] >= 6) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid X96 scale factor code book\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    // Bit allocation quantizer select
+    for (ch = xch_base; ch < s->x96_nchannels; ch++)
+        s->bit_allocation_sel[ch] = get_bits(&s->gb, 3);
+
+    // Quantization index codebook select: 6 entries per channel, or 10 when
+    // the high resolution flag is set. The field width depends on the entry.
+    for (n = 0; n < 6 + 4 * s->x96_high_res; n++)
+        for (ch = xch_base; ch < s->x96_nchannels; ch++)
+            s->quant_index_sel[ch][n] = get_bits(&s->gb, quant_index_sel_nbits[n]);
+
+    if (exss) {
+        // Reserved
+        // Byte align
+        // CRC16 of channel set header
+        // (none of the above is read; the reader jumps to the header end)
+        if (ff_dca_seek_bits(&s->gb, header_pos + header_size * 8)) {
+            av_log(s->avctx, AV_LOG_ERROR, "Read past end of X96 channel set header\n");
+            return AVERROR_INVALIDDATA;
+        }
+    } else {
+        // Header CRC word (skipped, not verified)
+        if (s->crc_present)
+            skip_bits(&s->gb, 16);
+    }
+
+    return 0;
+}
+
+// Parse the X96 coding header followed by all subframes for channels
+// [xch_base, s->x96_nchannels). exss is passed on to the coding header
+// parser. Once all subframes are decoded, the ADPCM history in front of each
+// active subband buffer is refreshed from the end of the decoded samples and
+// every inactive subband is cleared, history included.
+// Returns 0 on success or a negative error code on failure.
+static int parse_x96_frame_data(DCACoreDecoder *s, int exss, int xch_base)
+{
+    int sf, ch, band, err;
+    int sub_pos = 0;
+
+    err = parse_x96_coding_header(s, exss, xch_base);
+    if (err < 0)
+        return err;
+
+    // sub_pos is the running sample offset, advanced by the audio parser
+    for (sf = 0; sf < s->nsubframes; sf++) {
+        err = parse_x96_subframe_header(s, xch_base);
+        if (err < 0)
+            return err;
+
+        err = parse_x96_subframe_audio(s, sf, xch_base, &sub_pos);
+        if (err < 0)
+            return err;
+    }
+
+    for (ch = xch_base; ch < s->x96_nchannels; ch++) {
+        // Number of active subbands: the channel's own count, extended to
+        // that of the joint coding source channel if there is one
+        int active = s->nsubbands[ch];
+        if (s->joint_intensity_index[ch]) {
+            int src_active = s->nsubbands[s->joint_intensity_index[ch] - 1];
+            if (!(active > src_active))
+                active = src_active;
+        }
+
+        for (band = 0; band < DCA_SUBBANDS_X96; band++) {
+            // Start of the history area that precedes the subband samples
+            int32_t *hist = s->x96_subband_samples[ch][band] - DCA_ADPCM_COEFFS;
+
+            if (band < s->x96_subband_start || band >= active) {
+                // Inactive subband: clear history and samples alike
+                memset(hist, 0, (DCA_ADPCM_COEFFS + s->npcmblocks) * sizeof(int32_t));
+            } else {
+                // Active subband: keep the tail of this frame as history
+                AV_COPY128(hist, hist + s->npcmblocks);
+            }
+        }
+    }
+
+    emms_c();
+
+    return 0;
+}
+
+// Parse an X96 extension located inside the core frame, starting at the
+// current position of s->gb. The extension covers all s->nchannels channels
+// and has no channel set header CRC. On success the reader is left at the
+// end of the core frame.
+// Returns 0 on success or a negative error code on failure.
+static int parse_x96_frame(DCACoreDecoder *s)
+{
+    int err;
+
+    // Revision number, only 1 through 8 are accepted
+    s->x96_rev_no = get_bits(&s->gb, 4);
+    if (!(s->x96_rev_no >= 1 && s->x96_rev_no <= 8)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid X96 revision (%d)\n", s->x96_rev_no);
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->x96_nchannels = s->nchannels;
+    s->x96_crc_present = 0;
+
+    err = alloc_x96_sample_buffer(s);
+    if (err < 0)
+        return err;
+
+    err = parse_x96_frame_data(s, 0, 0);
+    if (err < 0)
+        return err;
+
+    // Seek to the end of core frame
+    if (ff_dca_seek_bits(&s->gb, s->frame_size * 8) != 0) {
+        av_log(s->avctx, AV_LOG_ERROR, "Read past end of X96 frame\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
+// Parse an X96 extension frame that has its own frame header (sync word,
+// checksummed header with revision, channel set sizes and channel counts),
+// starting at the current position of s->gb. Every channel set whose
+// channels fit within s->nchannels is decoded with parse_x96_frame_data();
+// the data of any other channel set is skipped over. s->x96_nchannels ends
+// up as the number of channels covered by the last decoded channel set, or 0
+// if none was decoded.
+// Returns 0 on success or a negative error code on failure.
+static int parse_x96_frame_exss(DCACoreDecoder *s)
+{
+    int     x96_frame_size[DCA_EXSS_CHSETS_MAX];
+    int     x96_nchannels[DCA_EXSS_CHSETS_MAX];
+    int     x96_nchsets, x96_base_ch;
+    // header_pos is the bit position of the sync word. header_size is counted
+    // in bytes, hence the multiplication by 8 wherever it is used below.
+    int     i, ret, header_size, header_pos = get_bits_count(&s->gb);
+
+    // X96 sync word
+    if (get_bits_long(&s->gb, 32) != DCA_SYNCWORD_X96) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid X96 sync word\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // X96 frame header length
+    header_size = get_bits(&s->gb, 6) + 1;
+
+    // Check X96 frame header CRC (the checked range starts right after the
+    // 32-bit sync word and ends at the end of the header)
+    if (ff_dca_check_crc(s->avctx, &s->gb, header_pos + 32, header_pos + header_size * 8)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid X96 frame header checksum\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Revision number, only 1 through 8 are accepted
+    s->x96_rev_no = get_bits(&s->gb, 4);
+    if (s->x96_rev_no < 1 || s->x96_rev_no > 8) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid X96 revision (%d)\n", s->x96_rev_no);
+        return AVERROR_INVALIDDATA;
+    }
+
+    // CRC presence flag for channel set header
+    s->x96_crc_present = get_bits1(&s->gb);
+
+    // Number of channel sets (1 to 4)
+    x96_nchsets = get_bits(&s->gb, 2) + 1;
+
+    // Channel set data byte size
+    for (i = 0; i < x96_nchsets; i++)
+        x96_frame_size[i] = get_bits(&s->gb, 12) + 1;
+
+    // Number of channels in channel set
+    for (i = 0; i < x96_nchsets; i++)
+        x96_nchannels[i] = get_bits(&s->gb, 3) + 1;
+
+    // Reserved
+    // Byte align
+    // CRC16 of X96 frame header
+    // (none of the above is read; the reader simply jumps to the header end)
+    if (ff_dca_seek_bits(&s->gb, header_pos + header_size * 8)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Read past end of X96 frame header\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if ((ret = alloc_x96_sample_buffer(s)) < 0)
+        return ret;
+
+    // Channel set data
+    s->x96_nchannels = 0;
+    for (i = 0, x96_base_ch = 0; i < x96_nchsets; i++) {
+        // From here on header_pos marks the start of this channel set's data
+        header_pos = get_bits_count(&s->gb);
+
+        // Decode the channel set only if all of its channels fit within
+        // s->nchannels; otherwise its data is skipped by the seek below
+        if (x96_base_ch + x96_nchannels[i] <= s->nchannels) {
+            s->x96_nchannels = x96_base_ch + x96_nchannels[i];
+            if ((ret = parse_x96_frame_data(s, 1, x96_base_ch)) < 0)
+                return ret;
+        }
+
+        // The base channel advances even when the channel set was skipped
+        x96_base_ch += x96_nchannels[i];
+
+        // Jump to the end of this channel set as given by its byte size
+        if (ff_dca_seek_bits(&s->gb, header_pos + x96_frame_size[i] * 8)) {
+            av_log(s->avctx, AV_LOG_ERROR, "Read past end of X96 channel set\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    return 0;
+}
+
+// Parse the auxiliary data block at the current position of s->gb: skip the
+// byte count, align to 4 bytes, check the sync word, skip the optional time
+// stamp and read the optional dynamic downmix coefficients for the primary
+// channel set into s->prim_dmix_type / s->prim_dmix_coeff. The block is
+// finally validated with its CRC.
+// s->prim_dmix_embedded is set from the bit stream before validation is
+// complete, so it may be left set when an error is returned.
+// Returns 0 on success or AVERROR_INVALIDDATA on failure.
+static int parse_aux_data(DCACoreDecoder *s)
+{
+    int aux_pos;
+
+    if (get_bits_left(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
+
+    // Auxiliary data byte count (can't be trusted)
+    skip_bits(&s->gb, 6);
+
+    // 4-byte align
+    skip_bits_long(&s->gb, -get_bits_count(&s->gb) & 31);
+
+    // Auxiliary data sync word
+    if (get_bits_long(&s->gb, 32) != DCA_SYNCWORD_REV1AUX) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid auxiliary data sync word\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // The CRC checked below covers everything after the sync word
+    aux_pos = get_bits_count(&s->gb);
+
+    // Auxiliary decode time stamp flag
+    if (get_bits1(&s->gb))
+        skip_bits_long(&s->gb, 47);
+
+    // Auxiliary dynamic downmix flag. The assignment is kept separate from
+    // the test so that it cannot be mistaken for a comparison.
+    s->prim_dmix_embedded = get_bits1(&s->gb);
+    if (s->prim_dmix_embedded) {
+        int i, m, n;
+
+        // Auxiliary primary channel downmix type
+        s->prim_dmix_type = get_bits(&s->gb, 3);
+        if (s->prim_dmix_type >= DCA_DMIX_TYPE_COUNT) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid primary channel set downmix type\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        // Size of downmix coefficients matrix: m is looked up from the
+        // downmix type, n is the channel count of the audio mode plus one
+        // if LFE is present
+        m = ff_dca_dmix_primary_nch[s->prim_dmix_type];
+        n = ff_dca_channels[s->audio_mode] + !!s->lfe_present;
+
+        // Dynamic downmix code coefficients
+        for (i = 0; i < m * n; i++) {
+            int code = get_bits(&s->gb, 9);
+            // Bit 8 selects the sign: sign is 0 when it is set and -1 when
+            // it is clear
+            int sign = (code >> 8) - 1;
+            unsigned int index = code & 0xff;
+            if (index >= FF_DCA_DMIXTABLE_SIZE) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid downmix coefficient index\n");
+                return AVERROR_INVALIDDATA;
+            }
+            // (x ^ sign) - sign negates x when sign is -1 and leaves it
+            // unchanged when sign is 0
+            s->prim_dmix_coeff[i] = (ff_dca_dmixtable[index] ^ sign) - sign;
+        }
+    }
+
+    // Byte align
+    skip_bits(&s->gb, -get_bits_count(&s->gb) & 7);
+
+    // CRC16 of auxiliary data
+    skip_bits(&s->gb, 16);
+
+    // Check CRC
+    if (ff_dca_check_crc(s->avctx, &s->gb, aux_pos, get_bits_count(&s->gb))) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid auxiliary data checksum\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
+// Parse the optional information that follows the core audio data: skip the
+// time code stamp, parse the auxiliary data and, unless decoding is limited
+// to the core, locate the core extension announced by s->ext_audio_type
+// (XCH, X96 or XXCH) by searching for its sync word. The position found is
+// stored in s->xch_pos, s->x96_pos or s->xxch_pos respectively.
+// Failures are only reported to the caller when AV_EF_EXPLODE is set in
+// err_recognition; otherwise the function still returns 0.
+static int parse_optional_info(DCACoreDecoder *s)
+{
+    DCAContext *dca = s->avctx->priv_data;
+    // ret stays negative when there is no auxiliary data, so the embedded
+    // downmix flag is cleared below in that case as well
+    int ret = -1;
+
+    // Time code stamp
+    if (s->ts_present)
+        skip_bits_long(&s->gb, 32);
+
+    // Auxiliary data
+    if (s->aux_present && (ret = parse_aux_data(s)) < 0
+        && (s->avctx->err_recognition & AV_EF_EXPLODE))
+        return ret;
+
+    // Auxiliary data missing or invalid: there is no usable embedded downmix
+    if (ret < 0)
+        s->prim_dmix_embedded = 0;
+
+    // Core extensions
+    if (s->ext_audio_present && !dca->core_only) {
+        // Both positions are in units of 32-bit words. sync_pos starts at the
+        // last whole word inside both the core frame and the input buffer,
+        // last_pos is the word containing the current read position.
+        int sync_pos = FFMIN(s->frame_size / 4, s->gb.size_in_bits / 32) - 1;
+        int last_pos = get_bits_count(&s->gb) / 32;
+        int size, dist;
+
+        // Search for extension sync words aligned on 4-byte boundary. Search
+        // must be done backwards from the end of core frame to work around
+        // sync word aliasing issues.
+        switch (s->ext_audio_type) {
+        case EXT_AUDIO_XCH:
+            // The extension is not searched for when a channel layout was
+            // requested
+            if (dca->request_channel_layout)
+                break;
+
+            // The distance between XCH sync word and end of the core frame
+            // must be equal to XCH frame size. Off by one error is allowed for
+            // compatibility with legacy bitstreams. Minimum XCH frame size is
+            // 96 bytes. AMODE and PCHS are further checked to reduce
+            // probability of alias sync detection.
+            for (; sync_pos >= last_pos; sync_pos--) {
+                if (AV_RB32(s->gb.buffer + sync_pos * 4) == DCA_SYNCWORD_XCH) {
+                    // Position the reader right after the sync word
+                    s->gb.index = (sync_pos + 1) * 32;
+                    size = get_bits(&s->gb, 10) + 1;
+                    dist = s->frame_size - sync_pos * 4;
+                    if (size >= 96
+                        && (size == dist || size - 1 == dist)
+                        && get_bits(&s->gb, 7) == 0x08) {
+                        // Bit position right after the fields checked above
+                        s->xch_pos = get_bits_count(&s->gb);
+                        break;
+                    }
+                }
+            }
+
+            if (!s->xch_pos) {
+                av_log(s->avctx, AV_LOG_ERROR, "XCH sync word not found\n");
+                if (s->avctx->err_recognition & AV_EF_EXPLODE)
+                    return AVERROR_INVALIDDATA;
+            }
+            break;
+
+        case EXT_AUDIO_X96:
+            // The distance between X96 sync word and end of the core frame
+            // must be equal to X96 frame size. Minimum X96 frame size is 96
+            // bytes.
+            for (; sync_pos >= last_pos; sync_pos--) {
+                if (AV_RB32(s->gb.buffer + sync_pos * 4) == DCA_SYNCWORD_X96) {
+                    // Position the reader right after the sync word
+                    s->gb.index = (sync_pos + 1) * 32;
+                    size = get_bits(&s->gb, 12) + 1;
+                    dist = s->frame_size - sync_pos * 4;
+                    if (size >= 96 && size == dist) {
+                        // Bit position right after the frame size field
+                        s->x96_pos = get_bits_count(&s->gb);
+                        break;
+                    }
+                }
+            }
+
+            if (!s->x96_pos) {
+                av_log(s->avctx, AV_LOG_ERROR, "X96 sync word not found\n");
+                if (s->avctx->err_recognition & AV_EF_EXPLODE)
+                    return AVERROR_INVALIDDATA;
+            }
+            break;
+
+        case EXT_AUDIO_XXCH:
+            // The extension is not searched for when a channel layout was
+            // requested
+            if (dca->request_channel_layout)
+                break;
+
+            // XXCH frame header CRC must be valid. Minimum XXCH frame header
+            // size is 11 bytes.
+            for (; sync_pos >= last_pos; sync_pos--) {
+                if (AV_RB32(s->gb.buffer + sync_pos * 4) == DCA_SYNCWORD_XXCH) {
+                    // Position the reader right after the sync word
+                    s->gb.index = (sync_pos + 1) * 32;
+                    size = get_bits(&s->gb, 6) + 1;
+                    // Bytes available in the input buffer from the sync word
+                    // on (not limited to the core frame)
+                    dist = s->gb.size_in_bits / 8 - sync_pos * 4;
+                    // The CRC runs over the size - 4 bytes that follow the
+                    // sync word and must come out as zero
+                    if (size >= 11 && size <= dist &&
+                        !av_crc(dca->crctab, 0xffff, s->gb.buffer +
+                                (sync_pos + 1) * 4, size - 4)) {
+                        // Unlike XCH and X96, the stored position is that of
+                        // the sync word itself
+                        s->xxch_pos = sync_pos * 32;
+                        break;
+                    }
+                }
+            }
+
+            if (!s->xxch_pos) {
+                av_log(s->avctx, AV_LOG_ERROR, "XXCH sync word not found\n");
+                if (s->avctx->err_recognition & AV_EF_EXPLODE)
+                    return AVERROR_INVALIDDATA;
+            }
+            break;
+        }
+    }
+
+    return 0;
+}
+
+// Parse a core frame of size bytes stored at data: frame header, audio data
+// and optional information, in that order. The extension mask and the
+// extension positions found by an earlier call are reset first.
+// Returns 0 on success or a negative error code on failure. Failing to seek
+// to the end of the frame is only fatal when AV_EF_EXPLODE is set.
+int ff_dca_core_parse(DCACoreDecoder *s, uint8_t *data, int size)
+{
+    int err;
+
+    // Forget extensions found in the previous frame
+    s->x96_pos  = 0;
+    s->xxch_pos = 0;
+    s->xch_pos  = 0;
+    s->ext_audio_mask = 0;
+
+    err = init_get_bits8(&s->gb, data, size);
+    if (err < 0)
+        return err;
+
+    // The first 32 bits are skipped before the frame header is parsed
+    skip_bits_long(&s->gb, 32);
+
+    err = parse_frame_header(s);
+    if (err < 0)
+        return err;
+
+    err = alloc_sample_buffer(s);
+    if (err < 0)
+        return err;
+
+    err = parse_frame_data(s, HEADER_CORE, 0);
+    if (err < 0)
+        return err;
+
+    err = parse_optional_info(s);
+    if (err < 0)
+        return err;
+
+    // Workaround for DTS in WAV: tolerate a frame size that exceeds the
+    // available data by up to 3 bytes
+    if (s->frame_size > size && s->frame_size < size + 4)
+        s->frame_size = size;
+
+    if (ff_dca_seek_bits(&s->gb, s->frame_size * 8) != 0) {
+        av_log(s->avctx, AV_LOG_ERROR, "Read past end of core frame\n");
+        if (s->avctx->err_recognition & AV_EF_EXPLODE)
+            return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
+// Parse the extensions of an already parsed core frame. Extensions are taken
+// from the extension substream asset when its extension mask announces them
+// (data is the base address the asset offsets refer to), otherwise from the
+// positions found inside the core frame. asset may be NULL, in which case
+// only extensions inside the core frame are considered. Every extension that
+// parses successfully is recorded in s->ext_audio_mask.
+// Returns 0 on success or a negative error code on failure. A failing
+// extension is only fatal when AV_EF_EXPLODE is set, or when X96 parsing
+// fails with AVERROR(ENOMEM).
+int ff_dca_core_parse_exss(DCACoreDecoder *s, uint8_t *data, DCAExssAsset *asset)
+{
+    AVCodecContext *avctx = s->avctx;
+    DCAContext *dca = avctx->priv_data;
+    // Copy of the core frame reader, restored before parsing X96 in core
+    GetBitContext core_gb = s->gb;
+    int explode = avctx->err_recognition & AV_EF_EXPLODE;
+    int exss_mask = 0;
+    int ret = 0, ext = 0;
+
+    if (asset)
+        exss_mask = asset->extension_mask;
+
+    // Parse (X)XCH unless downmixing
+    if (!dca->request_channel_layout) {
+        if (exss_mask & DCA_EXSS_XXCH) {
+            ret = init_get_bits8(&s->gb, data + asset->xxch_offset, asset->xxch_size);
+            if (ret < 0)
+                return ret;
+            ext = DCA_EXSS_XXCH;
+            ret = parse_xxch_frame(s);
+        } else if (s->xxch_pos) {
+            s->gb.index = s->xxch_pos;
+            ext = DCA_CSS_XXCH;
+            ret = parse_xxch_frame(s);
+        } else if (s->xch_pos) {
+            s->gb.index = s->xch_pos;
+            ext = DCA_CSS_XCH;
+            ret = parse_xch_frame(s);
+        }
+
+        if (ret >= 0) {
+            s->ext_audio_mask |= ext;
+        } else if (explode) {
+            return ret;
+        } else {
+            // Revert to primary channel set in case (X)XCH parsing fails
+            s->nchannels = ff_dca_channels[s->audio_mode];
+            s->ch_mask = audio_mode_ch_mask[s->audio_mode];
+            if (s->lfe_present)
+                s->ch_mask |= DCA_SPEAKER_MASK_LFE1;
+        }
+    }
+
+    // Parse XBR
+    if (exss_mask & DCA_EXSS_XBR) {
+        ret = init_get_bits8(&s->gb, data + asset->xbr_offset, asset->xbr_size);
+        if (ret < 0)
+            return ret;
+
+        ret = parse_xbr_frame(s);
+        if (ret >= 0)
+            s->ext_audio_mask |= DCA_EXSS_XBR;
+        else if (explode)
+            return ret;
+    }
+
+    // Parse X96 unless decoding XLL
+    if (dca->packet & DCA_PACKET_XLL)
+        return 0;
+
+    if (exss_mask & DCA_EXSS_X96) {
+        ret = init_get_bits8(&s->gb, data + asset->x96_offset, asset->x96_size);
+        if (ret < 0)
+            return ret;
+
+        ret = parse_x96_frame_exss(s);
+        if (ret >= 0)
+            s->ext_audio_mask |= DCA_EXSS_X96;
+        else if (ret == AVERROR(ENOMEM) || explode)
+            return ret;
+    } else if (s->x96_pos) {
+        // The reader may have been re-initialized above, so go back to the
+        // core frame before seeking to the X96 position
+        s->gb = core_gb;
+        s->gb.index = s->x96_pos;
+
+        ret = parse_x96_frame(s);
+        if (ret >= 0)
+            s->ext_audio_mask |= DCA_CSS_X96;
+        else if (ret == AVERROR(ENOMEM) || explode)
+            return ret;
+    }
+
+    return 0;
+}
+
+static int map_prm_ch_to_spkr(DCACoreDecoder *s, int ch)
+{
+    // Translate primary channel index ch into a speaker index, -1 if unmapped
+    const int have_xxch = s->ext_audio_mask & (DCA_CSS_XXCH | DCA_EXSS_XXCH);
+    const int ncore = ff_dca_channels[s->audio_mode];
+    int next = ncore, spkr;
+
+    // Channels below the core count are looked up by audio mode
+    if (ch < ncore) {
+        spkr = prm_ch_to_spkr_map[s->audio_mode][ch];
+        if (!have_xxch)
+            return spkr;
+        // With XXCH the XXCH core mask decides; Lss/Rss may stand in for Ls/Rs
+        if (s->xxch_core_mask & (1U << spkr))
+            return spkr;
+        if (spkr == DCA_SPEAKER_Ls && (s->xxch_core_mask & DCA_SPEAKER_MASK_Lss))
+            return DCA_SPEAKER_Lss;
+        if (spkr == DCA_SPEAKER_Rs && (s->xxch_core_mask & DCA_SPEAKER_MASK_Rss))
+            return DCA_SPEAKER_Rss;
+        return -1;
+    }
+
+    // The XCH channel is the one right after the core channels
+    if (ch == ncore && (s->ext_audio_mask & DCA_CSS_XCH))
+        return DCA_SPEAKER_Cs;
+
+    // XXCH channels follow in ascending order of set speaker mask bits
+    if (have_xxch) {
+        for (spkr = DCA_SPEAKER_Cs; spkr < s->xxch_mask_nbits; spkr++)
+            if ((s->xxch_spkr_mask & (1U << spkr)) && next++ == ch)
+                return spkr;
+    }
+
+    return -1;
+}
+
+static void erase_dsp_history(DCACoreDecoder *s)
+{
+    s->output_history_lfe_float = 0;    // LFE X96 filter history, float and fixed
+    s->output_history_lfe_fixed = 0;
+    memset(&s->dcadsp_data[0], 0, sizeof(s->dcadsp_data));  // per-channel FIR history
+}
+
+static void set_filter_mode(DCACoreDecoder *s, int mode)
+{
+    if (s->filter_mode == mode)
+        return;                 // mode unchanged, nothing to reset
+    erase_dsp_history(s);       // mode changed: drop the DSP history first
+    s->filter_mode = mode;
+}
+
+int ff_dca_core_filter_fixed(DCACoreDecoder *s, int x96_synth)
+{
+    int n, ch, spkr, nsamples, x96_nchannels = 0;
+    const int32_t *filter_coeff;
+    int32_t *ptr;
+    // Fixed point synthesis into s->output_samples[]; returns 0 or a negative AVERROR code.
+    // Externally set x96_synth flag implies that X96 synthesis should be
+    // enabled, yet actual X96 subband data should be discarded. This is a
+    // special case for lossless residual decoder that ignores X96 data if
+    // present.
+    if (!x96_synth && (s->ext_audio_mask & (DCA_CSS_X96 | DCA_EXSS_X96))) {
+        x96_nchannels = s->x96_nchannels;
+        x96_synth = 1;
+    }
+    if (x96_synth < 0)  // negative request: X96 synthesis stays off even if X96 was decoded
+        x96_synth = 0;
+
+    s->output_rate = s->sample_rate << x96_synth;   // rate and sample count double with X96
+    s->npcmsamples = nsamples = (s->npcmblocks * DCA_PCMBLOCK_SAMPLES) << x96_synth;
+
+    // Reallocate PCM output buffer
+    av_fast_malloc(&s->output_buffer, &s->output_size,
+                   nsamples * av_popcount(s->ch_mask) * sizeof(int32_t));
+    if (!s->output_buffer)
+        return AVERROR(ENOMEM);
+
+    ptr = (int32_t *)s->output_buffer;
+    for (spkr = 0; spkr < DCA_SPEAKER_COUNT; spkr++) {
+        if (s->ch_mask & (1U << spkr)) {
+            s->output_samples[spkr] = ptr;
+            ptr += nsamples;    // each active speaker owns nsamples consecutive entries
+        } else {
+            s->output_samples[spkr] = NULL;
+        }
+    }
+
+    // Handle change of filtering mode
+    set_filter_mode(s, x96_synth | DCA_FILTER_MODE_FIXED);
+
+    // Select filter
+    if (x96_synth)
+        filter_coeff = ff_dca_fir_64bands_fixed;
+    else if (s->filter_perfect)
+        filter_coeff = ff_dca_fir_32bands_perfect_fixed;
+    else
+        filter_coeff = ff_dca_fir_32bands_nonperfect_fixed;
+
+    // Filter primary channels
+    for (ch = 0; ch < s->nchannels; ch++) {
+        // Map this primary channel to speaker
+        spkr = map_prm_ch_to_spkr(s, ch);
+        if (spkr < 0)
+            return AVERROR(EINVAL);
+
+        // Filter bank reconstruction
+        s->dcadsp->sub_qmf_fixed[x96_synth](
+            &s->synth,
+            &s->dcadct,
+            s->output_samples[spkr],
+            s->subband_samples[ch],
+            ch < x96_nchannels ? s->x96_subband_samples[ch] : NULL,  // NULL: no X96 data for this channel
+            s->dcadsp_data[ch].u.fix.hist1,
+            &s->dcadsp_data[ch].offset,
+            s->dcadsp_data[ch].u.fix.hist2,
+            filter_coeff,
+            s->npcmblocks);
+    }
+
+    // Filter LFE channel
+    if (s->lfe_present) {
+        int32_t *samples = s->output_samples[DCA_SPEAKER_LFE1];
+        int nlfesamples = s->npcmblocks >> 1;   // presumably the decimated LFE sample count of this frame
+
+        // Check LFF
+        if (s->lfe_present == LFE_FLAG_128) {
+            av_log(s->avctx, AV_LOG_ERROR, "Fixed point mode doesn't support LFF=1\n");
+            return AVERROR(EINVAL);
+        }
+
+        // Offset intermediate buffer for X96
+        if (x96_synth)
+            samples += nsamples / 2;    // upper half holds the input of the X96 LFE filter below
+
+        // Interpolate LFE channel
+        s->dcadsp->lfe_fir_fixed(samples, s->lfe_samples + DCA_LFE_HISTORY,
+                                 ff_dca_lfe_fir_64_fixed, s->npcmblocks);
+
+        if (x96_synth) {
+            // Filter 96 kHz oversampled LFE PCM to attenuate high frequency
+            // (47.6 - 48.0 kHz) components of interpolation image
+            s->dcadsp->lfe_x96_fixed(s->output_samples[DCA_SPEAKER_LFE1],
+                                     samples, &s->output_history_lfe_fixed,
+                                     nsamples / 2);
+
+        }
+
+        // Update LFE history: move DCA_LFE_HISTORY entries at offset nlfesamples to the front
+        for (n = DCA_LFE_HISTORY - 1; n >= 0; n--)
+            s->lfe_samples[n] = s->lfe_samples[nlfesamples + n];
+    }
+
+    return 0;
+}
+
+static int filter_frame_fixed(DCACoreDecoder *s, AVFrame *frame)
+{
+    AVCodecContext *avctx = s->avctx;
+    DCAContext *dca = avctx->priv_data;
+    int i, n, ch, ret, spkr, nsamples;
+    // Fixed point output path: fills frame with S32P samples; returns 0 or a negative AVERROR.
+    // Don't filter twice when falling back from XLL
+    if (!(dca->packet & DCA_PACKET_XLL) && (ret = ff_dca_core_filter_fixed(s, 0)) < 0)
+        return ret;
+
+    avctx->sample_rate = s->output_rate;
+    avctx->sample_fmt = AV_SAMPLE_FMT_S32P;
+    avctx->bits_per_raw_sample = 24;
+
+    frame->nb_samples = nsamples = s->npcmsamples;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
+    // Undo embedded XCH downmix
+    if (s->es_format && (s->ext_audio_mask & DCA_CSS_XCH)
+        && s->audio_mode >= AMODE_2F2R) {
+        s->dcadsp->dmix_sub_xch(s->output_samples[DCA_SPEAKER_Ls],
+                                s->output_samples[DCA_SPEAKER_Rs],
+                                s->output_samples[DCA_SPEAKER_Cs],
+                                nsamples);
+
+    }
+
+    // Undo embedded XXCH downmix
+    if ((s->ext_audio_mask & (DCA_CSS_XXCH | DCA_EXSS_XXCH))
+        && s->xxch_dmix_embedded) {
+        int scale_inv   = s->xxch_dmix_scale_inv;
+        int *coeff_ptr  = s->xxch_dmix_coeff;   // read in order, one entry per set dmix mask bit
+        int xch_base    = ff_dca_channels[s->audio_mode];   // index of the first extension channel
+        av_assert1(s->nchannels - xch_base <= DCA_XXCH_CHANNELS_MAX);
+
+        // Undo embedded core downmix pre-scaling
+        for (spkr = 0; spkr < s->xxch_mask_nbits; spkr++) {
+            if (s->xxch_core_mask & (1U << spkr)) {
+                s->dcadsp->dmix_scale_inv(s->output_samples[spkr],
+                                          scale_inv, nsamples);
+            }
+        }
+
+        // Undo downmix
+        for (ch = xch_base; ch < s->nchannels; ch++) {
+            int src_spkr = map_prm_ch_to_spkr(s, ch);
+            if (src_spkr < 0)
+                return AVERROR(EINVAL);
+            for (spkr = 0; spkr < s->xxch_mask_nbits; spkr++) {
+                if (s->xxch_dmix_mask[ch - xch_base] & (1U << spkr)) {
+                    int coeff = mul16(*coeff_ptr++, scale_inv);
+                    if (coeff) {
+                        s->dcadsp->dmix_sub(s->output_samples[spkr    ],
+                                            s->output_samples[src_spkr],
+                                            coeff, nsamples);
+                    }
+                }
+            }
+        }
+    }
+
+    if (!(s->ext_audio_mask & (DCA_CSS_XXCH | DCA_CSS_XCH | DCA_EXSS_XXCH))) {
+        // Front sum/difference decoding
+        if ((s->sumdiff_front && s->audio_mode > AMODE_MONO)
+            || s->audio_mode == AMODE_STEREO_SUMDIFF) {
+            s->fixed_dsp->butterflies_fixed(s->output_samples[DCA_SPEAKER_L],
+                                            s->output_samples[DCA_SPEAKER_R],
+                                            nsamples);
+        }
+
+        // Surround sum/difference decoding
+        if (s->sumdiff_surround && s->audio_mode >= AMODE_2F2R) {
+            s->fixed_dsp->butterflies_fixed(s->output_samples[DCA_SPEAKER_Ls],
+                                            s->output_samples[DCA_SPEAKER_Rs],
+                                            nsamples);
+        }
+    }
+
+    // Downmix primary channel set to stereo (only when the requested mask differs)
+    if (s->request_mask != s->ch_mask) {
+        ff_dca_downmix_to_stereo_fixed(s->dcadsp,
+                                       s->output_samples,
+                                       s->prim_dmix_coeff,
+                                       nsamples, s->ch_mask);
+    }
+
+    for (i = 0; i < avctx->channels; i++) {
+        int32_t *samples = s->output_samples[s->ch_remap[i]];
+        int32_t *plane = (int32_t *)frame->extended_data[i];
+        for (n = 0; n < nsamples; n++)
+            plane[n] = clip23(samples[n]) * (1 << 8);   // clip (presumably to 24 bits), then scale by 256
+    }
+
+    return 0;
+}
+
+static int filter_frame_float(DCACoreDecoder *s, AVFrame *frame)
+{
+    AVCodecContext *avctx = s->avctx;
+    int x96_nchannels = 0, x96_synth = 0;
+    int i, n, ch, ret, spkr, nsamples, nchannels;
+    float *output_samples[DCA_SPEAKER_COUNT] = { NULL }, *ptr;  // per-speaker output, local to this call
+    const float *filter_coeff;
+    // Float output path: synthesizes into the FLTP planes of frame; returns 0 or a negative AVERROR.
+    if (s->ext_audio_mask & (DCA_CSS_X96 | DCA_EXSS_X96)) {
+        x96_nchannels = s->x96_nchannels;
+        x96_synth = 1;
+    }
+
+    avctx->sample_rate = s->sample_rate << x96_synth;
+    avctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
+    avctx->bits_per_raw_sample = 0;
+
+    frame->nb_samples = nsamples = (s->npcmblocks * DCA_PCMBLOCK_SAMPLES) << x96_synth;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
+    // Build reverse speaker to channel mapping
+    for (i = 0; i < avctx->channels; i++)
+        output_samples[s->ch_remap[i]] = (float *)frame->extended_data[i];
+
+    // Allocate space for extra channels
+    nchannels = av_popcount(s->ch_mask) - avctx->channels;  // decoded speakers without a frame plane
+    if (nchannels > 0) {
+        av_fast_malloc(&s->output_buffer, &s->output_size,
+                       nsamples * nchannels * sizeof(float));
+        if (!s->output_buffer)
+            return AVERROR(ENOMEM);
+
+        ptr = (float *)s->output_buffer;
+        for (spkr = 0; spkr < DCA_SPEAKER_COUNT; spkr++) {
+            if (!(s->ch_mask & (1U << spkr)))
+                continue;
+            if (output_samples[spkr])   // already backed by a frame plane
+                continue;
+            output_samples[spkr] = ptr;
+            ptr += nsamples;
+        }
+    }
+
+    // Handle change of filtering mode
+    set_filter_mode(s, x96_synth);
+
+    // Select filter
+    if (x96_synth)
+        filter_coeff = ff_dca_fir_64bands;
+    else if (s->filter_perfect)
+        filter_coeff = ff_dca_fir_32bands_perfect;
+    else
+        filter_coeff = ff_dca_fir_32bands_nonperfect;
+
+    // Filter primary channels
+    for (ch = 0; ch < s->nchannels; ch++) {
+        // Map this primary channel to speaker
+        spkr = map_prm_ch_to_spkr(s, ch);
+        if (spkr < 0)
+            return AVERROR(EINVAL);
+
+        // Filter bank reconstruction
+        s->dcadsp->sub_qmf_float[x96_synth](
+            &s->synth,
+            &s->imdct[x96_synth],
+            output_samples[spkr],
+            s->subband_samples[ch],
+            ch < x96_nchannels ? s->x96_subband_samples[ch] : NULL,  // NULL: no X96 data for this channel
+            s->dcadsp_data[ch].u.flt.hist1,
+            &s->dcadsp_data[ch].offset,
+            s->dcadsp_data[ch].u.flt.hist2,
+            filter_coeff,
+            s->npcmblocks,
+            1.0f / (1 << (17 - x96_synth)));    // presumably the output scale factor
+    }
+
+    // Filter LFE channel
+    if (s->lfe_present) {
+        int dec_select = (s->lfe_present == LFE_FLAG_128);  // picks the 128 or 64 FIR variant below
+        float *samples = output_samples[DCA_SPEAKER_LFE1];
+        int nlfesamples = s->npcmblocks >> (dec_select + 1);
+
+        // Offset intermediate buffer for X96
+        if (x96_synth)
+            samples += nsamples / 2;    // upper half holds the input of the X96 LFE filter below
+
+        // Select filter
+        if (dec_select)
+            filter_coeff = ff_dca_lfe_fir_128;
+        else
+            filter_coeff = ff_dca_lfe_fir_64;
+
+        // Interpolate LFE channel
+        s->dcadsp->lfe_fir_float[dec_select](
+            samples, s->lfe_samples + DCA_LFE_HISTORY,
+            filter_coeff, s->npcmblocks);
+
+        if (x96_synth) {
+            // Filter 96 kHz oversampled LFE PCM to attenuate high frequency
+            // (47.6 - 48.0 kHz) components of interpolation image
+            s->dcadsp->lfe_x96_float(output_samples[DCA_SPEAKER_LFE1],
+                                     samples, &s->output_history_lfe_float,
+                                     nsamples / 2);
+        }
+
+        // Update LFE history: move DCA_LFE_HISTORY entries at offset nlfesamples to the front
+        for (n = DCA_LFE_HISTORY - 1; n >= 0; n--)
+            s->lfe_samples[n] = s->lfe_samples[nlfesamples + n];
+    }
+
+    // Undo embedded XCH downmix
+    if (s->es_format && (s->ext_audio_mask & DCA_CSS_XCH)
+        && s->audio_mode >= AMODE_2F2R) {
+        s->float_dsp->vector_fmac_scalar(output_samples[DCA_SPEAKER_Ls],
+                                         output_samples[DCA_SPEAKER_Cs],
+                                         -M_SQRT1_2, nsamples);
+        s->float_dsp->vector_fmac_scalar(output_samples[DCA_SPEAKER_Rs],
+                                         output_samples[DCA_SPEAKER_Cs],
+                                         -M_SQRT1_2, nsamples);
+    }
+
+    // Undo embedded XXCH downmix
+    if ((s->ext_audio_mask & (DCA_CSS_XXCH | DCA_EXSS_XXCH))
+        && s->xxch_dmix_embedded) {
+        float scale_inv = s->xxch_dmix_scale_inv * (1.0f / (1 << 16));  // integer factor scaled by 2^-16
+        int *coeff_ptr  = s->xxch_dmix_coeff;   // read in order, one entry per set dmix mask bit
+        int xch_base    = ff_dca_channels[s->audio_mode];   // index of the first extension channel
+        av_assert1(s->nchannels - xch_base <= DCA_XXCH_CHANNELS_MAX);
+
+        // Undo downmix
+        for (ch = xch_base; ch < s->nchannels; ch++) {
+            int src_spkr = map_prm_ch_to_spkr(s, ch);
+            if (src_spkr < 0)
+                return AVERROR(EINVAL);
+            for (spkr = 0; spkr < s->xxch_mask_nbits; spkr++) {
+                if (s->xxch_dmix_mask[ch - xch_base] & (1U << spkr)) {
+                    int coeff = *coeff_ptr++;
+                    if (coeff) {
+                        s->float_dsp->vector_fmac_scalar(output_samples[    spkr],
+                                                         output_samples[src_spkr],
+                                                         coeff * (-1.0f / (1 << 15)),    // negative factor: subtracts
+                                                         nsamples);
+                    }
+                }
+            }
+        }
+
+        // Undo embedded core downmix pre-scaling
+        for (spkr = 0; spkr < s->xxch_mask_nbits; spkr++) {
+            if (s->xxch_core_mask & (1U << spkr)) {
+                s->float_dsp->vector_fmul_scalar(output_samples[spkr],
+                                                 output_samples[spkr],
+                                                 scale_inv, nsamples);
+            }
+        }
+    }
+
+    if (!(s->ext_audio_mask & (DCA_CSS_XXCH | DCA_CSS_XCH | DCA_EXSS_XXCH))) {
+        // Front sum/difference decoding
+        if ((s->sumdiff_front && s->audio_mode > AMODE_MONO)
+            || s->audio_mode == AMODE_STEREO_SUMDIFF) {
+            s->float_dsp->butterflies_float(output_samples[DCA_SPEAKER_L],
+                                            output_samples[DCA_SPEAKER_R],
+                                            nsamples);
+        }
+
+        // Surround sum/difference decoding
+        if (s->sumdiff_surround && s->audio_mode >= AMODE_2F2R) {
+            s->float_dsp->butterflies_float(output_samples[DCA_SPEAKER_Ls],
+                                            output_samples[DCA_SPEAKER_Rs],
+                                            nsamples);
+        }
+    }
+
+    // Downmix primary channel set to stereo (only when the requested mask differs)
+    if (s->request_mask != s->ch_mask) {
+        ff_dca_downmix_to_stereo_float(s->float_dsp, output_samples,
+                                       s->prim_dmix_coeff,
+                                       nsamples, s->ch_mask);
+    }
+
+    return 0;
+}
+
+int ff_dca_core_filter_frame(DCACoreDecoder *s, AVFrame *frame)
+{
+    AVCodecContext *avctx = s->avctx;
+    DCAContext *dca = avctx->priv_data;
+    DCAExssAsset *asset = &dca->exss.assets[0]; // only the first EXSS asset is consulted
+    enum AVMatrixEncoding matrix_encoding;
+    int ret;
+    // Turns the parsed core frame into PCM in frame and updates avctx; returns 0 or a negative AVERROR.
+    // Handle downmixing to stereo request
+    if (dca->request_channel_layout == DCA_SPEAKER_LAYOUT_STEREO
+        && s->audio_mode > AMODE_MONO && s->prim_dmix_embedded
+        && (s->prim_dmix_type == DCA_DMIX_TYPE_LoRo ||
+            s->prim_dmix_type == DCA_DMIX_TYPE_LtRt))
+        s->request_mask = DCA_SPEAKER_LAYOUT_STEREO;
+    else
+        s->request_mask = s->ch_mask;   // no usable downmix: output the decoded layout
+    if (!ff_dca_set_channel_layout(avctx, s->ch_remap, s->request_mask))
+        return AVERROR(EINVAL);
+
+    // Force fixed point mode when falling back from XLL
+    if ((avctx->flags & AV_CODEC_FLAG_BITEXACT) || ((dca->packet & DCA_PACKET_EXSS)
+                                                    && (asset->extension_mask & DCA_EXSS_XLL)))
+        ret = filter_frame_fixed(s, frame);
+    else
+        ret = filter_frame_float(s, frame);
+    if (ret < 0)
+        return ret;
+
+    // Set profile, bit rate, etc
+    if (s->ext_audio_mask & DCA_EXSS_MASK)
+        avctx->profile = FF_PROFILE_DTS_HD_HRA;
+    else if (s->ext_audio_mask & (DCA_CSS_XXCH | DCA_CSS_XCH))
+        avctx->profile = FF_PROFILE_DTS_ES;
+    else if (s->ext_audio_mask & DCA_CSS_X96)
+        avctx->profile = FF_PROFILE_DTS_96_24;
+    else
+        avctx->profile = FF_PROFILE_DTS;
+
+    if (s->bit_rate > 3 && !(s->ext_audio_mask & DCA_EXSS_MASK)) // otherwise the bit rate is reported as 0
+        avctx->bit_rate = s->bit_rate;
+    else
+        avctx->bit_rate = 0;
+
+    if (s->audio_mode == AMODE_STEREO_TOTAL || (s->request_mask != s->ch_mask &&
+                                                s->prim_dmix_type == DCA_DMIX_TYPE_LtRt))
+        matrix_encoding = AV_MATRIX_ENCODING_DOLBY;
+    else
+        matrix_encoding = AV_MATRIX_ENCODING_NONE;
+    if ((ret = ff_side_data_update_matrix_encoding(frame, matrix_encoding)) < 0)
+        return ret;
+
+    return 0;
+}
+
+av_cold void ff_dca_core_flush(DCACoreDecoder *s)
+{
+    // Core ADPCM and LFE history is only touched when the subband buffer exists
+    if (s->subband_buffer != NULL) {
+        erase_adpcm_history(s);
+        memset(s->lfe_samples, 0, sizeof(*s->lfe_samples) * DCA_LFE_HISTORY);
+    }
+    // Same rule for the X96 ADPCM history
+    if (s->x96_subband_buffer != NULL)
+        erase_x96_adpcm_history(s);
+    erase_dsp_history(s);   // filter history is part of the context, always cleared
+}
+
+av_cold int ff_dca_core_init(DCACoreDecoder *s)
+{
+    // One-time setup of DSP contexts and transforms; returns 0, or -1 on failure
+    s->float_dsp = avpriv_float_dsp_alloc(0);
+    if (s->float_dsp == NULL)
+        return -1;
+    s->fixed_dsp = avpriv_alloc_fixed_dsp(0);
+    if (s->fixed_dsp == NULL)
+        return -1;
+    ff_dcadct_init(&s->dcadct);
+    if (ff_mdct_init(&s->imdct[0], 6, 1, 1.0) < 0 ||
+        ff_mdct_init(&s->imdct[1], 7, 1, 1.0) < 0)    // one IMDCT per x96_synth setting
+        return -1;
+    ff_synth_filter_init(&s->synth);
+    s->x96_rand = 1;    // initial seed for X96 random sample generation
+    return 0;
+}
+
+av_cold void ff_dca_core_close(DCACoreDecoder *s)
+{
+    int i;
+
+    // DSP contexts allocated in ff_dca_core_init()
+    av_freep(&s->float_dsp);
+    av_freep(&s->fixed_dsp);
+
+    // Both IMDCT contexts
+    for (i = 0; i < 2; i++)
+        ff_mdct_end(&s->imdct[i]);
+    // Sample buffers; the recorded sizes are reset along with the pointers
+    av_freep(&s->subband_buffer);
+    av_freep(&s->x96_subband_buffer);
+    av_freep(&s->output_buffer);
+    s->subband_size = s->x96_subband_size = s->output_size = 0;
+}
diff --git a/libavcodec/dca_core.h b/libavcodec/dca_core.h
new file mode 100644
index 0000000..e84bdab
--- /dev/null
+++ b/libavcodec/dca_core.h
@@ -0,0 +1,206 @@
+/*
+ * Copyright (C) 2016 foo86
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DCA_CORE_H
+#define AVCODEC_DCA_CORE_H
+
+#include "libavutil/common.h"
+#include "libavutil/float_dsp.h"
+#include "libavutil/fixed_dsp.h"
+#include "libavutil/mem.h"
+
+#include "avcodec.h"
+#include "internal.h"
+#include "get_bits.h"
+#include "dca.h"
+#include "dca_exss.h"
+#include "dcadsp.h"
+#include "dcadct.h"
+#include "dcahuff.h"
+#include "fft.h"
+#include "synth_filter.h"
+
+#define DCA_CHANNELS            7       // first dimension of the per-channel arrays below
+#define DCA_SUBBANDS            32
+#define DCA_SUBBANDS_X96        64
+#define DCA_SUBFRAMES           16
+#define DCA_SUBBAND_SAMPLES     8
+#define DCA_PCMBLOCK_SAMPLES    32      // PCM samples per block: npcmblocks * 32 per frame before X96
+#define DCA_ADPCM_COEFFS        4
+#define DCA_LFE_HISTORY         8       // LFE samples kept at the start of lfe_samples between frames
+#define DCA_ABITS_MAX           26
+
+#define DCA_CORE_CHANNELS_MAX       6
+#define DCA_DMIX_CHANNELS_MAX       4
+#define DCA_XXCH_CHANNELS_MAX       2   // size of xxch_dmix_mask[]
+#define DCA_EXSS_CHANNELS_MAX       8
+#define DCA_EXSS_CHSETS_MAX         4
+
+#define DCA_FILTER_MODE_X96     0x01    // filter_mode bit: X96 synthesis active
+#define DCA_FILTER_MODE_FIXED   0x02    // filter_mode bit: fixed point path in use
+
+typedef struct DCADSPData {
+    union {     // float and fixed history share storage; it is erased when the filter mode changes
+        struct {
+            DECLARE_ALIGNED(32, float, hist1)[1024];
+            DECLARE_ALIGNED(32, float, hist2)[64];
+        } flt;  // used by the float synthesis path
+        struct {
+            DECLARE_ALIGNED(32, int32_t, hist1)[1024];
+            DECLARE_ALIGNED(32, int32_t, hist2)[64];
+        } fix;  // used by the fixed point synthesis path
+    } u;
+    int offset; // passed by pointer to the QMF synthesis, which keeps its position here
+} DCADSPData;
+
+typedef struct DCACoreDecoder {
+    AVCodecContext  *avctx;     ///< Codec context; its priv_data is the DCAContext
+    GetBitContext   gb;         ///< Bit reader used while parsing core and extension data
+
+    // Bit stream header
+    int     crc_present;        ///< CRC present flag
+    int     npcmblocks;         ///< Number of PCM sample blocks
+    int     frame_size;         ///< Primary frame byte size
+    int     audio_mode;         ///< Audio channel arrangement
+    int     sample_rate;        ///< Core audio sampling frequency
+    int     bit_rate;           ///< Transmission bit rate
+    int     drc_present;        ///< Embedded dynamic range flag
+    int     ts_present;         ///< Embedded time stamp flag
+    int     aux_present;        ///< Auxiliary data flag
+    int     ext_audio_type;     ///< Extension audio descriptor flag
+    int     ext_audio_present;  ///< Extended coding flag
+    int     sync_ssf;           ///< Audio sync word insertion flag
+    int     lfe_present;        ///< Low frequency effects flag
+    int     predictor_history;  ///< Predictor history flag switch
+    int     filter_perfect;     ///< Multirate interpolator switch
+    int     source_pcm_res;     ///< Source PCM resolution
+    int     es_format;          ///< Extended surround (ES) mastering flag
+    int     sumdiff_front;      ///< Front sum/difference flag
+    int     sumdiff_surround;   ///< Surround sum/difference flag
+
+    // Primary audio coding header
+    int         nsubframes;     ///< Number of subframes
+    int         nchannels;      ///< Number of primary audio channels (incl. extension channels)
+    int         ch_mask;        ///< Speaker layout mask (incl. LFE and extension channels)
+    int8_t      nsubbands[DCA_CHANNELS];                ///< Subband activity count
+    int8_t      subband_vq_start[DCA_CHANNELS];         ///< High frequency VQ start subband
+    int8_t      joint_intensity_index[DCA_CHANNELS];    ///< Joint intensity coding index
+    int8_t      transition_mode_sel[DCA_CHANNELS];      ///< Transient mode code book
+    int8_t      scale_factor_sel[DCA_CHANNELS];         ///< Scale factor code book
+    int8_t      bit_allocation_sel[DCA_CHANNELS];       ///< Bit allocation quantizer select
+    int8_t      quant_index_sel[DCA_CHANNELS][DCA_CODE_BOOKS];  ///< Quantization index codebook select
+    int32_t     scale_factor_adj[DCA_CHANNELS][DCA_CODE_BOOKS]; ///< Scale factor adjustment
+
+    // Primary audio coding side information
+    int8_t      nsubsubframes[DCA_SUBFRAMES];   ///< Subsubframe count for each subframe
+    int8_t      prediction_mode[DCA_CHANNELS][DCA_SUBBANDS_X96];            ///< Prediction mode
+    int16_t     prediction_vq_index[DCA_CHANNELS][DCA_SUBBANDS_X96];        ///< Prediction coefficients VQ address
+    int8_t      bit_allocation[DCA_CHANNELS][DCA_SUBBANDS_X96];             ///< Bit allocation index
+    int8_t      transition_mode[DCA_SUBFRAMES][DCA_CHANNELS][DCA_SUBBANDS]; ///< Transition mode
+    int32_t     scale_factors[DCA_CHANNELS][DCA_SUBBANDS][2];               ///< Scale factors (2x for transients and X96)
+    int8_t      joint_scale_sel[DCA_CHANNELS];                              ///< Joint subband codebook select
+    int32_t     joint_scale_factors[DCA_CHANNELS][DCA_SUBBANDS_X96];        ///< Scale factors for joint subband coding
+
+    // Auxiliary data
+    int     prim_dmix_embedded; ///< Auxiliary dynamic downmix flag
+    int     prim_dmix_type;     ///< Auxiliary primary channel downmix type
+    int     prim_dmix_coeff[DCA_DMIX_CHANNELS_MAX * DCA_CORE_CHANNELS_MAX]; ///< Dynamic downmix code coefficients
+
+    // Core extensions
+    int     ext_audio_mask;     ///< Bit mask of fully decoded core extensions
+
+    // XCH extension data
+    int     xch_pos;    ///< Bit position of XCH frame in core substream
+
+    // XXCH extension data
+    int     xxch_crc_present;       ///< CRC presence flag for XXCH channel set header
+    int     xxch_mask_nbits;        ///< Number of bits for loudspeaker mask
+    int     xxch_core_mask;         ///< Core loudspeaker activity mask
+    int     xxch_spkr_mask;         ///< Loudspeaker layout mask
+    int     xxch_dmix_embedded;     ///< Downmix already performed by encoder
+    int     xxch_dmix_scale_inv;    ///< Downmix scale factor
+    int     xxch_dmix_mask[DCA_XXCH_CHANNELS_MAX];  ///< Downmix channel mapping mask
+    int     xxch_dmix_coeff[DCA_XXCH_CHANNELS_MAX * DCA_CORE_CHANNELS_MAX];     ///< Downmix coefficients
+    int     xxch_pos;   ///< Bit position of XXCH frame in core substream
+
+    // X96 extension data
+    int     x96_rev_no;         ///< X96 revision number
+    int     x96_crc_present;    ///< CRC presence flag for X96 channel set header
+    int     x96_nchannels;      ///< Number of primary channels in X96 extension
+    int     x96_high_res;       ///< X96 high resolution flag
+    int     x96_subband_start;  ///< First encoded subband in X96 extension
+    int     x96_rand;           ///< Random seed for generating samples for unallocated X96 subbands
+    int     x96_pos;            ///< Bit position of X96 frame in core substream
+
+    // Sample buffers
+    unsigned int    x96_subband_size;       ///< Presumably the allocated size of x96_subband_buffer
+    int32_t         *x96_subband_buffer;    ///< X96 subband sample buffer base
+    int32_t         *x96_subband_samples[DCA_CHANNELS][DCA_SUBBANDS_X96];   ///< X96 subband samples
+
+    unsigned int    subband_size;       ///< Presumably the allocated size of subband_buffer
+    int32_t         *subband_buffer;    ///< Subband sample buffer base
+    int32_t         *subband_samples[DCA_CHANNELS][DCA_SUBBANDS];   ///< Subband samples
+    int32_t         *lfe_samples;    ///< Decimated LFE samples
+
+    // DSP contexts
+    DCADSPData              dcadsp_data[DCA_CHANNELS];    ///< FIR history buffers
+    DCADSPContext           *dcadsp;        ///< DCA DSP routines (QMF synthesis, LFE FIR, downmix)
+    DCADCTContext           dcadct;         ///< Passed to the fixed point QMF synthesis
+    FFTContext              imdct[2];       ///< Float path IMDCT, indexed by the X96 synthesis flag
+    SynthFilterContext      synth;          ///< Passed to both QMF synthesis variants
+    AVFloatDSPContext       *float_dsp;     ///< Allocated in ff_dca_core_init(), freed in ff_dca_core_close()
+    AVFixedDSPContext       *fixed_dsp;     ///< Allocated in ff_dca_core_init(), freed in ff_dca_core_close()
+
+    // PCM output data
+    unsigned int    output_size;                            ///< Size of output_buffer as tracked by av_fast_malloc()
+    void            *output_buffer;                         ///< PCM output buffer base
+    int32_t         *output_samples[DCA_SPEAKER_COUNT];     ///< PCM output for fixed point mode
+    int32_t         output_history_lfe_fixed;               ///< LFE PCM history for X96 filter
+    float           output_history_lfe_float;               ///< LFE PCM history for X96 filter
+
+    int     ch_remap[DCA_SPEAKER_COUNT];   ///< Channel to speaker map
+    int     request_mask;   ///< Requested channel layout (for stereo downmix)
+
+    int     npcmsamples;    ///< Number of PCM samples per channel
+    int     output_rate;    ///< Output sample rate (1x or 2x header rate)
+
+    int     filter_mode;    ///< Previous filtering mode for detecting changes
+} DCACoreDecoder;
+
+static inline int ff_dca_core_map_spkr(DCACoreDecoder *core, int spkr)
+{
+    const int present = core->ch_mask;
+    // A speaker in the core mask maps to itself; Lss/Rss may fall back to Ls/Rs
+    if (present & (1U << spkr))
+        return spkr;
+    if (spkr == DCA_SPEAKER_Lss)
+        return (present & DCA_SPEAKER_MASK_Ls) ? DCA_SPEAKER_Ls : -1;
+    return (spkr == DCA_SPEAKER_Rss && (present & DCA_SPEAKER_MASK_Rs)) ? DCA_SPEAKER_Rs : -1;
+}
+
+int ff_dca_core_parse(DCACoreDecoder *s, uint8_t *data, int size);
+int ff_dca_core_parse_exss(DCACoreDecoder *s, uint8_t *data, DCAExssAsset *asset);
+int ff_dca_core_filter_fixed(DCACoreDecoder *s, int x96_synth);
+int ff_dca_core_filter_frame(DCACoreDecoder *s, AVFrame *frame);
+av_cold void ff_dca_core_flush(DCACoreDecoder *s);
+av_cold int ff_dca_core_init(DCACoreDecoder *s);
+av_cold void ff_dca_core_close(DCACoreDecoder *s);
+
+#endif
diff --git a/libavcodec/dca_core_bsf.c b/libavcodec/dca_core_bsf.c
new file mode 100644
index 0000000..9edc0cf
--- /dev/null
+++ b/libavcodec/dca_core_bsf.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2016 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "bsf.h"
+#include "bytestream.h"
+#include "dca_syncwords.h"
+#include "libavutil/mem.h"
+
+static int dca_core_filter(AVBSFContext *ctx, AVPacket *out)
+{
+    AVPacket *in;
+    GetByteContext gb;
+    uint32_t syncword;
+    int core_size = 0, ret;
+    // Truncates a packet that starts with the big-endian core sync word to the
+    // core frame size; returns 0, or the error from ff_bsf_get_packet().
+    ret = ff_bsf_get_packet(ctx, &in);
+    if (ret < 0)
+        return ret;
+
+    bytestream2_init(&gb, in->data, in->size);
+    syncword = bytestream2_get_be32(&gb);
+    bytestream2_skip(&gb, 1);
+
+    // The checked byte readers return 0 on a short packet, which would be
+    // misread as a 1-byte core frame, so only trust a complete size field.
+    if (syncword == DCA_SYNCWORD_CORE_BE && bytestream2_get_bytes_left(&gb) >= 3)
+        core_size = ((bytestream2_get_be24(&gb) >> 4) & 0x3fff) + 1;
+
+    av_packet_move_ref(out, in);
+    av_packet_free(&in);
+
+    // Packets without a usable core size are passed through unchanged.
+    if (core_size > 0 && core_size <= out->size)
+        out->size = core_size;
+
+    return 0;
+}
+
+static const enum AVCodecID codec_ids[] = {    // referenced by ff_dca_core_bsf.codec_ids
+    AV_CODEC_ID_DTS, AV_CODEC_ID_NONE,          // AV_CODEC_ID_NONE is the last entry
+};
+
+const AVBitStreamFilter ff_dca_core_bsf = {
+    .name      = "dca_core",            // bitstream filter name
+    .filter    = dca_core_filter,       // per-packet callback, truncates to the core frame
+    .codec_ids = codec_ids,             // DTS only
+};
diff --git a/libavcodec/dca_exss.c b/libavcodec/dca_exss.c
index 2895e20..e873088 100644
--- a/libavcodec/dca_exss.c
+++ b/libavcodec/dca_exss.c
@@ -1,368 +1,514 @@
 /*
- * DCA ExSS extension
+ * Copyright (C) 2016 foo86
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "libavutil/common.h"
-#include "libavutil/log.h"
-
-#include "dca.h"
-#include "dca_syncwords.h"
-#include "get_bits.h"
-
-/* extensions that reside in core substream */
-#define DCA_CORE_EXTS (DCA_EXT_XCH | DCA_EXT_XXCH | DCA_EXT_X96)
-
-/* these are unconfirmed but should be mostly correct */
-enum DCAExSSSpeakerMask {
-    DCA_EXSS_FRONT_CENTER          = 0x0001,
-    DCA_EXSS_FRONT_LEFT_RIGHT      = 0x0002,
-    DCA_EXSS_SIDE_REAR_LEFT_RIGHT  = 0x0004,
-    DCA_EXSS_LFE                   = 0x0008,
-    DCA_EXSS_REAR_CENTER           = 0x0010,
-    DCA_EXSS_FRONT_HIGH_LEFT_RIGHT = 0x0020,
-    DCA_EXSS_REAR_LEFT_RIGHT       = 0x0040,
-    DCA_EXSS_FRONT_HIGH_CENTER     = 0x0080,
-    DCA_EXSS_OVERHEAD              = 0x0100,
-    DCA_EXSS_CENTER_LEFT_RIGHT     = 0x0200,
-    DCA_EXSS_WIDE_LEFT_RIGHT       = 0x0400,
-    DCA_EXSS_SIDE_LEFT_RIGHT       = 0x0800,
-    DCA_EXSS_LFE2                  = 0x1000,
-    DCA_EXSS_SIDE_HIGH_LEFT_RIGHT  = 0x2000,
-    DCA_EXSS_REAR_HIGH_CENTER      = 0x4000,
-    DCA_EXSS_REAR_HIGH_LEFT_RIGHT  = 0x8000,
-};
-
-/**
- * Return the number of channels in an ExSS speaker mask (HD)
- */
-static int dca_exss_mask2count(int mask)
+#include "dcadec.h"
+
+static void parse_xll_parameters(DCAExssParser *s, DCAExssAsset *asset) // Read the XLL size/sync fields from s->gb into *asset
 {
-    /* count bits that mean speaker pairs twice */
-    return av_popcount(mask) +
-           av_popcount(mask & (DCA_EXSS_CENTER_LEFT_RIGHT      |
-                               DCA_EXSS_FRONT_LEFT_RIGHT       |
-                               DCA_EXSS_FRONT_HIGH_LEFT_RIGHT  |
-                               DCA_EXSS_WIDE_LEFT_RIGHT        |
-                               DCA_EXSS_SIDE_LEFT_RIGHT        |
-                               DCA_EXSS_SIDE_HIGH_LEFT_RIGHT   |
-                               DCA_EXSS_SIDE_REAR_LEFT_RIGHT   |
-                               DCA_EXSS_REAR_LEFT_RIGHT        |
-                               DCA_EXSS_REAR_HIGH_LEFT_RIGHT));
+    // Size of XLL data in extension substream (coded minus one, exss_size_nbits wide)
+    asset->xll_size = get_bits(&s->gb, s->exss_size_nbits) + 1;
+
+    // XLL sync word present flag (assignment intended: the flag is stored, then tested)
+    if (asset->xll_sync_present = get_bits1(&s->gb)) {
+        int xll_delay_nbits;
+
+        // Peak bit rate smoothing buffer size
+        skip_bits(&s->gb, 4);
+
+        // Number of bits for XLL decoding delay
+        xll_delay_nbits = get_bits(&s->gb, 5) + 1;
+
+        // Initial XLL decoding delay in frames
+        asset->xll_delay_nframes = get_bits_long(&s->gb, xll_delay_nbits);
+
+        // Number of bytes offset to XLL sync
+        asset->xll_sync_offset = get_bits(&s->gb, s->exss_size_nbits);
+    } else { // no sync word: the dependent fields are reset to zero
+        asset->xll_delay_nframes = 0;
+        asset->xll_sync_offset = 0;
+    }
 }
 
-/**
- * Skip mixing coefficients of a single mix out configuration (HD)
- */
-static void dca_exss_skip_mix_coeffs(GetBitContext *gb, int channels, int out_ch)
+static void parse_lbr_parameters(DCAExssParser *s, DCAExssAsset *asset) // Read the LBR size field into *asset and skip the optional sync distance
 {
-    int i;
+    // Size of LBR component in extension substream (coded minus one)
+    asset->lbr_size = get_bits(&s->gb, 14) + 1;
 
-    for (i = 0; i < channels; i++) {
-        int mix_map_mask = get_bits(gb, out_ch);
-        int num_coeffs = av_popcount(mix_map_mask);
-        skip_bits_long(gb, num_coeffs * 6);
-    }
+    // LBR sync word present flag
+    if (get_bits1(&s->gb))
+        // LBR sync distance (read past, not stored)
+        skip_bits(&s->gb, 2);
 }
 
-/**
- * Parse extension substream asset header (HD)
- */
-static int dca_exss_parse_asset_header(DCAContext *s)
+static int parse_descriptor(DCAExssParser *s, DCAExssAsset *asset) // Parse one audio asset descriptor into *asset; returns 0 or AVERROR_INVALIDDATA
 {
-    int header_pos = get_bits_count(&s->gb);
-    int header_size;
-    int channels = 0;
-    int embedded_stereo = 0;
-    int embedded_6ch    = 0;
-    int drc_code_present;
-    int extensions_mask = 0;
-    int i, j;
-
-    if (get_bits_left(&s->gb) < 16)
-        return AVERROR_INVALIDDATA;
+    int i, j, drc_present, descr_size, descr_pos = get_bits_count(&s->gb);
+
+    // Size of audio asset descriptor in bytes (coded minus one)
+    descr_size = get_bits(&s->gb, 9) + 1;
 
-    /* We will parse just enough to get to the extensions bitmask with which
-     * we can set the profile value. */
+    // Audio asset identifier
+    asset->asset_index = get_bits(&s->gb, 3);
 
-    header_size = get_bits(&s->gb, 9) + 1;
-    skip_bits(&s->gb, 3); // asset index
+    //
+    // Per stream static metadata
+    //
 
-    if (s->static_fields) {
+    if (s->static_fields_present) {
+        // Asset type descriptor presence
         if (get_bits1(&s->gb))
-            skip_bits(&s->gb, 4); // asset type descriptor
+            // Asset type descriptor
+            skip_bits(&s->gb, 4);
+
+        // Language descriptor presence
         if (get_bits1(&s->gb))
-            skip_bits_long(&s->gb, 24); // language descriptor
+            // Language descriptor
+            skip_bits(&s->gb, 24);
 
+        // Additional textual information presence
         if (get_bits1(&s->gb)) {
-            /* How can one fit 1024 bytes of text here if the maximum value
-             * for the asset header size field above was 512 bytes? */
-            int text_length = get_bits(&s->gb, 10) + 1;
-            if (get_bits_left(&s->gb) < text_length * 8)
+            // Byte size of additional text info
+            int text_size = get_bits(&s->gb, 10) + 1;
+
+            // Sanity check available size
+            if (get_bits_left(&s->gb) < text_size * 8)
                 return AVERROR_INVALIDDATA;
-            skip_bits_long(&s->gb, text_length * 8); // info text
+
+            // Additional textual information string
+            skip_bits_long(&s->gb, text_size * 8);
         }
 
-        skip_bits(&s->gb, 5); // bit resolution - 1
-        skip_bits(&s->gb, 4); // max sample rate code
-        channels = get_bits(&s->gb, 8) + 1;
+        // PCM bit resolution
+        asset->pcm_bit_res = get_bits(&s->gb, 5) + 1;
 
-        s->one2one_map_chtospkr = get_bits1(&s->gb);
-        if (s->one2one_map_chtospkr) {
-            int spkr_remap_sets;
-            int spkr_mask_size = 16;
-            int num_spkrs[7];
+        // Maximum sample rate
+        asset->max_sample_rate = ff_dca_sampling_freqs[get_bits(&s->gb, 4)];
 
-            if (channels > 2)
-                embedded_stereo = get_bits1(&s->gb);
-            if (channels > 6)
-                embedded_6ch = get_bits1(&s->gb);
+        // Total number of channels
+        asset->nchannels_total = get_bits(&s->gb, 8) + 1;
 
-            if (get_bits1(&s->gb)) {
-                spkr_mask_size = (get_bits(&s->gb, 2) + 1) << 2;
-                skip_bits(&s->gb, spkr_mask_size); // spkr activity mask
-            }
+        // One to one map channel to speakers
+        if (asset->one_to_one_map_ch_to_spkr = get_bits1(&s->gb)) {
+            int spkr_mask_nbits = 0;
+            int spkr_remap_nsets;
+            int nspeakers[8]; // indexed by remapping set; spkr_remap_nsets is read from a 3-bit field
 
-            spkr_remap_sets = get_bits(&s->gb, 3);
+            // Embedded stereo flag
+            asset->embedded_stereo = asset->nchannels_total > 2 && get_bits1(&s->gb);
 
-            for (i = 0; i < spkr_remap_sets; i++) {
-                /* std layout mask for each remap set */
-                num_spkrs[i] = dca_exss_mask2count(get_bits(&s->gb, spkr_mask_size));
+            // Embedded 6 channels flag
+            asset->embedded_6ch = asset->nchannels_total > 6 && get_bits1(&s->gb);
+
+            // Speaker mask enabled flag
+            if (asset->spkr_mask_enabled = get_bits1(&s->gb)) {
+                // Number of bits for speaker activity mask
+                spkr_mask_nbits = (get_bits(&s->gb, 2) + 1) << 2;
+
+                // Loudspeaker activity mask
+                asset->spkr_mask = get_bits(&s->gb, spkr_mask_nbits);
+            }
+
+            // Number of speaker remapping sets (rejected when no mask width was read above)
+            if ((spkr_remap_nsets = get_bits(&s->gb, 3)) && !spkr_mask_nbits) {
+                if (s->avctx)
+                    av_log(s->avctx, AV_LOG_ERROR, "Speaker mask disabled yet there are remapping sets\n");
+                return AVERROR_INVALIDDATA;
             }
 
-            for (i = 0; i < spkr_remap_sets; i++) {
-                int num_dec_ch_remaps = get_bits(&s->gb, 5) + 1;
-                if (get_bits_left(&s->gb) < 0)
-                    return AVERROR_INVALIDDATA;
+            // Standard loudspeaker layout mask
+            for (i = 0; i < spkr_remap_nsets; i++)
+                nspeakers[i] = ff_dca_count_chs_for_mask(get_bits(&s->gb, spkr_mask_nbits));
+
+            for (i = 0; i < spkr_remap_nsets; i++) {
+                // Number of channels to be decoded for speaker remapping
+                int nch_for_remaps = get_bits(&s->gb, 5) + 1;
 
-                for (j = 0; j < num_spkrs[i]; j++) {
-                    int remap_dec_ch_mask = get_bits_long(&s->gb, num_dec_ch_remaps);
-                    int num_dec_ch = av_popcount(remap_dec_ch_mask);
-                    skip_bits_long(&s->gb, num_dec_ch * 5); // remap codes
+                for (j = 0; j < nspeakers[i]; j++) {
+                    // Decoded channels to output speaker mapping mask
+                    int remap_ch_mask = get_bits_long(&s->gb, nch_for_remaps);
+
+                    // Loudspeaker remapping codes
+                    skip_bits_long(&s->gb, av_popcount(remap_ch_mask) * 5);
                 }
             }
         } else {
-            skip_bits(&s->gb, 3); // representation type
+            asset->embedded_stereo = 0;
+            asset->embedded_6ch = 0;
+            asset->spkr_mask_enabled = 0;
+            asset->spkr_mask = 0;
+
+            // Representation type
+            asset->representation_type = get_bits(&s->gb, 3);
         }
     }
 
-    drc_code_present = get_bits1(&s->gb);
-    if (drc_code_present)
-        get_bits(&s->gb, 8); // drc code
+    //
+    // DRC, DNC and mixing metadata
+    //
+
+    // Dynamic range coefficient presence flag
+    drc_present = get_bits1(&s->gb);
 
+    // Code for dynamic range coefficient
+    if (drc_present)
+        skip_bits(&s->gb, 8);
+
+    // Dialog normalization presence flag
     if (get_bits1(&s->gb))
-        skip_bits(&s->gb, 5); // dialog normalization code
+        // Dialog normalization code
+        skip_bits(&s->gb, 5);
 
-    if (drc_code_present && embedded_stereo)
-        get_bits(&s->gb, 8); // drc stereo code
+    // DRC for stereo downmix
+    if (drc_present && asset->embedded_stereo)
+        skip_bits(&s->gb, 8);
 
-    if (s->mix_metadata && get_bits1(&s->gb)) {
-        skip_bits(&s->gb, 1); // external mix
-        skip_bits(&s->gb, 6); // post mix gain code
+    // Mixing metadata presence flag
+    if (s->mix_metadata_enabled && get_bits1(&s->gb)) {
+        int nchannels_dmix;
 
-        if (get_bits(&s->gb, 2) != 3) // mixer drc code
-            skip_bits(&s->gb, 3); // drc limit
-        else
-            skip_bits(&s->gb, 8); // custom drc code
+        // External mixing flag
+        skip_bits1(&s->gb);
+
+        // Post mixing / replacement gain adjustment
+        skip_bits(&s->gb, 6);
 
-        if (get_bits1(&s->gb)) // channel specific scaling
-            for (i = 0; i < s->num_mix_configs; i++)
-                skip_bits_long(&s->gb, s->mix_config_num_ch[i] * 6); // scale codes
+        // DRC prior to mixing
+        if (get_bits(&s->gb, 2) == 3)
+            // Custom code for mixing DRC
+            skip_bits(&s->gb, 8);
         else
-            skip_bits_long(&s->gb, s->num_mix_configs * 6); // scale codes
+            // Limit for mixing DRC
+            skip_bits(&s->gb, 3);
 
-        for (i = 0; i < s->num_mix_configs; i++) {
-            if (get_bits_left(&s->gb) < 0)
+        // Scaling type for channels of main audio
+        // Scaling parameters of main audio
+        if (get_bits1(&s->gb))
+            for (i = 0; i < s->nmixoutconfigs; i++)
+                skip_bits_long(&s->gb, 6 * s->nmixoutchs[i]);
+        else
+            skip_bits_long(&s->gb, 6 * s->nmixoutconfigs);
+
+        nchannels_dmix = asset->nchannels_total; // coefficient rows: all channels plus any embedded 6ch/stereo downmix
+        if (asset->embedded_6ch)
+            nchannels_dmix += 6;
+        if (asset->embedded_stereo)
+            nchannels_dmix += 2;
+
+        for (i = 0; i < s->nmixoutconfigs; i++) {
+            if (!s->nmixoutchs[i]) { // a zero channel count cannot serve as the mask width read below
+                if (s->avctx)
+                    av_log(s->avctx, AV_LOG_ERROR, "Invalid speaker layout mask for mixing configuration\n");
                 return AVERROR_INVALIDDATA;
-            dca_exss_skip_mix_coeffs(&s->gb, channels, s->mix_config_num_ch[i]);
-            if (embedded_6ch)
-                dca_exss_skip_mix_coeffs(&s->gb, 6, s->mix_config_num_ch[i]);
-            if (embedded_stereo)
-                dca_exss_skip_mix_coeffs(&s->gb, 2, s->mix_config_num_ch[i]);
+            }
+            for (j = 0; j < nchannels_dmix; j++) {
+                // Mix output mask
+                int mix_map_mask = get_bits(&s->gb, s->nmixoutchs[i]);
+
+                // Mixing coefficients
+                skip_bits_long(&s->gb, av_popcount(mix_map_mask) * 6);
+            }
         }
     }
 
-    switch (get_bits(&s->gb, 2)) {
-    case 0:
-        extensions_mask = get_bits(&s->gb, 12);
+    //
+    // Decoder navigation data
+    //
+
+    // Coding mode for the asset
+    asset->coding_mode = get_bits(&s->gb, 2);
+
+    // Coding components used in asset
+    switch (asset->coding_mode) {
+    case 0: // Coding mode that may contain multiple coding components
+        asset->extension_mask = get_bits(&s->gb, 12);
+
+        if (asset->extension_mask & DCA_EXSS_CORE) {
+            // Size of core component in extension substream
+            asset->core_size = get_bits(&s->gb, 14) + 1;
+            // Core sync word present flag
+            if (get_bits1(&s->gb))
+                // Core sync distance
+                skip_bits(&s->gb, 2);
+        }
+
+        if (asset->extension_mask & DCA_EXSS_XBR)
+            // Size of XBR extension in extension substream
+            asset->xbr_size = get_bits(&s->gb, 14) + 1;
+
+        if (asset->extension_mask & DCA_EXSS_XXCH)
+            // Size of XXCH extension in extension substream
+            asset->xxch_size = get_bits(&s->gb, 14) + 1;
+
+        if (asset->extension_mask & DCA_EXSS_X96)
+            // Size of X96 extension in extension substream
+            asset->x96_size = get_bits(&s->gb, 12) + 1;
+
+        if (asset->extension_mask & DCA_EXSS_LBR)
+            parse_lbr_parameters(s, asset);
+
+        if (asset->extension_mask & DCA_EXSS_XLL)
+            parse_xll_parameters(s, asset);
+
+        if (asset->extension_mask & DCA_EXSS_RSV1)
+            skip_bits(&s->gb, 16);
+
+        if (asset->extension_mask & DCA_EXSS_RSV2)
+            skip_bits(&s->gb, 16);
         break;
-    case 1:
-        extensions_mask = DCA_EXT_EXSS_XLL;
+
+    case 1: // Loss-less coding mode without CBR component
+        asset->extension_mask = DCA_EXSS_XLL;
+        parse_xll_parameters(s, asset);
         break;
-    case 2:
-        extensions_mask = DCA_EXT_EXSS_LBR;
+
+    case 2: // Low bit rate mode
+        asset->extension_mask = DCA_EXSS_LBR;
+        parse_lbr_parameters(s, asset);
         break;
-    case 3:
-        extensions_mask = 0; /* aux coding */
+
+    case 3: // Auxiliary coding mode
+        asset->extension_mask = 0;
+
+        // Size of auxiliary coded data
+        skip_bits(&s->gb, 14);
+
+        // Auxiliary codec identification
+        skip_bits(&s->gb, 8);
+
+        // Aux sync word present flag
+        if (get_bits1(&s->gb))
+            // Aux sync distance
+            skip_bits(&s->gb, 3);
         break;
     }
 
-    /* not parsed further, we were only interested in the extensions mask */
-
-    if (get_bits_left(&s->gb) < 0)
+    if (asset->extension_mask & DCA_EXSS_XLL)
+        // DTS-HD stream ID
+        asset->hd_stream_id = get_bits(&s->gb, 3);
+
+    // One to one mixing flag
+    // Per channel main audio scaling flag
+    // Main audio scaling codes
+    // Decode asset in secondary decoder flag
+    // Revision 2 DRC metadata
+    // Reserved
+    // Zero pad
+    if (ff_dca_seek_bits(&s->gb, descr_pos + descr_size * 8)) { // jump over the unparsed tail listed above; nonzero is reported as an overrun
+        if (s->avctx)
+            av_log(s->avctx, AV_LOG_ERROR, "Read past end of EXSS asset descriptor\n");
         return AVERROR_INVALIDDATA;
+    }
 
-    if (get_bits_count(&s->gb) - header_pos > header_size * 8) {
-        av_log(s->avctx, AV_LOG_WARNING, "Asset header size mismatch.\n");
-        return AVERROR_INVALIDDATA;
+    return 0;
+}
+
+static int set_exss_offsets(DCAExssAsset *asset) // Assign consecutive offsets to the components present; AVERROR_INVALIDDATA if one does not fit
+{
+    int offs = asset->asset_offset; // running offset of the next component
+    int size = asset->asset_size; // part of the asset not yet claimed by a component
+
+    if (asset->extension_mask & DCA_EXSS_CORE) {
+        asset->core_offset = offs;
+        if (asset->core_size > size)
+            return AVERROR_INVALIDDATA;
+        offs += asset->core_size;
+        size -= asset->core_size;
+    }
+
+    if (asset->extension_mask & DCA_EXSS_XBR) {
+        asset->xbr_offset = offs;
+        if (asset->xbr_size > size)
+            return AVERROR_INVALIDDATA;
+        offs += asset->xbr_size;
+        size -= asset->xbr_size;
+    }
+
+    if (asset->extension_mask & DCA_EXSS_XXCH) {
+        asset->xxch_offset = offs;
+        if (asset->xxch_size > size)
+            return AVERROR_INVALIDDATA;
+        offs += asset->xxch_size;
+        size -= asset->xxch_size;
+    }
+
+    if (asset->extension_mask & DCA_EXSS_X96) {
+        asset->x96_offset = offs;
+        if (asset->x96_size > size)
+            return AVERROR_INVALIDDATA;
+        offs += asset->x96_size;
+        size -= asset->x96_size;
     }
-    skip_bits_long(&s->gb, header_pos + header_size * 8 - get_bits_count(&s->gb));
 
-    if (extensions_mask & DCA_EXT_EXSS_XLL)
-        s->profile = FF_PROFILE_DTS_HD_MA;
-    else if (extensions_mask & (DCA_EXT_EXSS_XBR | DCA_EXT_EXSS_X96 |
-                                DCA_EXT_EXSS_XXCH))
-        s->profile = FF_PROFILE_DTS_HD_HRA;
+    if (asset->extension_mask & DCA_EXSS_LBR) {
+        asset->lbr_offset = offs;
+        if (asset->lbr_size > size)
+            return AVERROR_INVALIDDATA;
+        offs += asset->lbr_size;
+        size -= asset->lbr_size;
+    }
 
-    if (!(extensions_mask & DCA_EXT_CORE))
-        av_log(s->avctx, AV_LOG_WARNING, "DTS core detection mismatch.\n");
-    if ((extensions_mask & DCA_CORE_EXTS) != s->core_ext_mask)
-        av_log(s->avctx, AV_LOG_WARNING,
-               "DTS extensions detection mismatch (%d, %d)\n",
-               extensions_mask & DCA_CORE_EXTS, s->core_ext_mask);
+    if (asset->extension_mask & DCA_EXSS_XLL) {
+        asset->xll_offset = offs;
+        if (asset->xll_size > size)
+            return AVERROR_INVALIDDATA;
+        offs += asset->xll_size;
+        size -= asset->xll_size;
+    }
 
     return 0;
 }
 
-/**
- * Parse extension substream header (HD)
- */
-void ff_dca_exss_parse_header(DCAContext *s)
+int ff_dca_exss_parse(DCAExssParser *s, const uint8_t *data, int size) // Parse the EXSS header in data[0..size) into *s; 0 on success, negative AVERROR otherwise
 {
-    int asset_size[8];
-    int ss_index;
-    int blownup;
-    int num_audiop = 1;
-    int num_assets = 1;
-    int active_ss_mask[8];
-    int i, j;
-    int start_pos;
-    int hdrsize;
-    uint32_t mkr;
-
-    if (get_bits_left(&s->gb) < 52)
-        return;
-
-    start_pos = get_bits_count(&s->gb) - 32;
-
-    skip_bits(&s->gb, 8); // user data
-    ss_index = get_bits(&s->gb, 2);
-
-    blownup = get_bits1(&s->gb);
-    hdrsize = get_bits(&s->gb,  8 + 4 * blownup) + 1; // header_size
-    skip_bits(&s->gb, 16 + 4 * blownup); // hd_size
-
-    s->static_fields = get_bits1(&s->gb);
-    if (s->static_fields) {
-        skip_bits(&s->gb, 2); // reference clock code
-        skip_bits(&s->gb, 3); // frame duration code
+    int i, ret, offset, wide_hdr, header_size;
 
-        if (get_bits1(&s->gb))
-            skip_bits_long(&s->gb, 36); // timestamp
+    if ((ret = init_get_bits8(&s->gb, data, size)) < 0)
+        return ret;
+
+    // Extension substream sync word
+    skip_bits_long(&s->gb, 32);
+
+    // User defined bits
+    skip_bits(&s->gb, 8);
 
-        /* a single stream can contain multiple audio assets that can be
-         * combined to form multiple audio presentations */
+    // Extension substream index
+    s->exss_index = get_bits(&s->gb, 2);
 
-        num_audiop = get_bits(&s->gb, 3) + 1;
-        if (num_audiop > 1) {
-            avpriv_request_sample(s->avctx,
-                                  "Multiple DTS-HD audio presentations");
-            /* ignore such streams for now */
-            return;
+    // Flag indicating short or long header size
+    wide_hdr = get_bits1(&s->gb);
+
+    // Extension substream header length
+    header_size = get_bits(&s->gb, 8 + 4 * wide_hdr) + 1;
+
+    // Check CRC (skipped when no codec context is attached)
+    if (s->avctx && ff_dca_check_crc(s->avctx, &s->gb, 32 + 8, header_size * 8)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid EXSS header checksum\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->exss_size_nbits = 16 + 4 * wide_hdr;
+
+    // Number of bytes of extension substream
+    s->exss_size = get_bits(&s->gb, s->exss_size_nbits) + 1;
+    if (s->exss_size > size) {
+        if (s->avctx)
+            av_log(s->avctx, AV_LOG_ERROR, "Packet too short for EXSS frame\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Per stream static fields presence flag
+    if (s->static_fields_present = get_bits1(&s->gb)) {
+        int active_exss_mask[8]; // one entry per audio presentation
+
+        // Reference clock code
+        skip_bits(&s->gb, 2);
+
+        // Extension substream frame duration
+        skip_bits(&s->gb, 3);
+
+        // Timecode presence flag
+        if (get_bits1(&s->gb))
+            // Timecode data
+            skip_bits_long(&s->gb, 36);
+
+        // Number of defined audio presentations
+        s->npresents = get_bits(&s->gb, 3) + 1;
+        if (s->npresents > 1) {
+            if (s->avctx)
+                avpriv_request_sample(s->avctx, "%d audio presentations", s->npresents);
+            return AVERROR_PATCHWELCOME;
         }
 
-        num_assets = get_bits(&s->gb, 3) + 1;
-        if (num_assets > 1) {
-            avpriv_request_sample(s->avctx, "Multiple DTS-HD audio assets");
-            /* ignore such streams for now */
-            return;
+        // Number of audio assets in extension substream
+        s->nassets = get_bits(&s->gb, 3) + 1;
+        if (s->nassets > 1) {
+            if (s->avctx)
+                avpriv_request_sample(s->avctx, "%d audio assets", s->nassets);
+            return AVERROR_PATCHWELCOME;
         }
 
-        for (i = 0; i < num_audiop; i++)
-            active_ss_mask[i] = get_bits(&s->gb, ss_index + 1);
+        // Active extension substream mask for audio presentation
+        for (i = 0; i < s->npresents; i++)
+            active_exss_mask[i] = get_bits(&s->gb, s->exss_index + 1);
 
-        for (i = 0; i < num_audiop; i++)
-            for (j = 0; j <= ss_index; j++)
-                if (active_ss_mask[i] & (1 << j))
-                    skip_bits(&s->gb, 8); // active asset mask
+        // Active audio asset mask
+        for (i = 0; i < s->npresents; i++)
+            skip_bits_long(&s->gb, av_popcount(active_exss_mask[i]) * 8);
 
-        s->mix_metadata = get_bits1(&s->gb);
-        if (s->mix_metadata) {
-            int mix_out_mask_size;
+        // Mixing metadata enable flag
+        if (s->mix_metadata_enabled = get_bits1(&s->gb)) {
+            int spkr_mask_nbits;
 
-            skip_bits(&s->gb, 2); // adjustment level
-            mix_out_mask_size  = (get_bits(&s->gb, 2) + 1) << 2;
-            s->num_mix_configs =  get_bits(&s->gb, 2) + 1;
+            // Mixing metadata adjustment level
+            skip_bits(&s->gb, 2);
 
-            for (i = 0; i < s->num_mix_configs; i++) {
-                int mix_out_mask        = get_bits(&s->gb, mix_out_mask_size);
-                s->mix_config_num_ch[i] = dca_exss_mask2count(mix_out_mask);
-            }
-        }
-    }
+            // Number of bits for mixer output speaker activity mask
+            spkr_mask_nbits = (get_bits(&s->gb, 2) + 1) << 2;
 
-    for (i = 0; i < num_assets; i++)
-        asset_size[i] = get_bits_long(&s->gb, 16 + 4 * blownup) + 1;
+            // Number of mixing configurations
+            s->nmixoutconfigs = get_bits(&s->gb, 2) + 1;
 
-    for (i = 0; i < num_assets; i++) {
-        if (dca_exss_parse_asset_header(s))
-            return;
+            // Speaker layout mask for mixer output channels
+            for (i = 0; i < s->nmixoutconfigs; i++)
+                s->nmixoutchs[i] = ff_dca_count_chs_for_mask(get_bits(&s->gb, spkr_mask_nbits));
+        }
+    } else { // no static fields: a single presentation and a single asset are assumed
+        s->npresents = 1;
+        s->nassets = 1;
     }
 
-    if (num_assets > 0) {
-        j = get_bits_count(&s->gb);
-        if (start_pos + hdrsize * 8 > j)
-            skip_bits_long(&s->gb, start_pos + hdrsize * 8 - j);
-
-        for (i = 0; i < num_assets; i++) {
-            int end_pos;
-            start_pos = get_bits_count(&s->gb);
-            end_pos   = start_pos + asset_size[i] * 8;
-            mkr       = get_bits_long(&s->gb, 32);
-
-            /* parse extensions that we know about */
-            switch (mkr) {
-            case DCA_SYNCWORD_XLL:
-                if (s->xll_disable) {
-                    av_log(s->avctx, AV_LOG_DEBUG,
-                           "DTS-XLL: ignoring XLL extension\n");
-                    break;
-                }
-                av_log(s->avctx, AV_LOG_DEBUG,
-                       "DTS-XLL: decoding XLL extension\n");
-                if (ff_dca_xll_decode_header(s)        == 0 &&
-                    ff_dca_xll_decode_navi(s, end_pos) == 0)
-                    s->exss_ext_mask |= DCA_EXT_EXSS_XLL;
-                break;
-            case DCA_SYNCWORD_XBR:
-            case DCA_SYNCWORD_XXCH:
-            default:
-                av_log(s->avctx, AV_LOG_VERBOSE,
-                       "DTS-ExSS: unknown marker = 0x%08"PRIx32"\n", mkr);
-            }
+    // Size of encoded asset data in bytes; assets are placed back to back after the header
+    offset = header_size;
+    for (i = 0; i < s->nassets; i++) {
+        s->assets[i].asset_offset = offset;
+        s->assets[i].asset_size = get_bits(&s->gb, s->exss_size_nbits) + 1;
+        offset += s->assets[i].asset_size;
+        if (offset > s->exss_size) {
+            if (s->avctx)
+                av_log(s->avctx, AV_LOG_ERROR, "EXSS asset out of bounds\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
 
-            /* skip to end of block */
-            j = get_bits_count(&s->gb);
-            if (j > end_pos)
-                av_log(s->avctx, AV_LOG_ERROR,
-                       "DTS-ExSS: Processed asset too long.\n");
-            if (j < end_pos)
-                skip_bits_long(&s->gb, end_pos - j);
+    // Audio asset descriptor
+    for (i = 0; i < s->nassets; i++) {
+        if ((ret = parse_descriptor(s, &s->assets[i])) < 0)
+            return ret;
+        if ((ret = set_exss_offsets(&s->assets[i])) < 0) {
+            if (s->avctx)
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid extension size in EXSS asset descriptor\n");
+            return ret;
         }
     }
+
+    // Backward compatible core present
+    // Backward compatible core substream index
+    // Backward compatible core asset index
+    // Reserved
+    // Byte align
+    // CRC16 of extension substream header
+    if (ff_dca_seek_bits(&s->gb, header_size * 8)) { // jump over the unparsed fields listed above; nonzero is reported as an overrun
+        if (s->avctx)
+            av_log(s->avctx, AV_LOG_ERROR, "Read past end of EXSS header\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
 }
diff --git a/libavcodec/dca_exss.h b/libavcodec/dca_exss.h
new file mode 100644
index 0000000..208fae1
--- /dev/null
+++ b/libavcodec/dca_exss.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2016 foo86
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DCA_EXSS_H
+#define AVCODEC_DCA_EXSS_H
+
+#include "libavutil/common.h"
+
+#include "avcodec.h"
+#include "get_bits.h"
+
+typedef struct DCAExssAsset { // One audio asset as described by the EXSS header and its asset descriptor
+    int     asset_offset;   ///< Offset to asset data from start of substream, in bytes
+    int     asset_size;     ///< Size of encoded asset data in bytes
+    int     asset_index;    ///< Audio asset identifier
+
+    int     pcm_bit_res;                ///< PCM bit resolution
+    int     max_sample_rate;            ///< Maximum sample rate (looked up in ff_dca_sampling_freqs)
+    int     nchannels_total;            ///< Total number of channels
+    int     one_to_one_map_ch_to_spkr;  ///< One to one channel to speaker mapping flag
+    int     embedded_stereo;            ///< Embedded stereo flag
+    int     embedded_6ch;               ///< Embedded 6 channels flag
+    int     spkr_mask_enabled;          ///< Speaker mask enabled flag
+    int     spkr_mask;                  ///< Loudspeaker activity mask
+    int     representation_type;        ///< Representation type
+
+    int     coding_mode;        ///< Coding mode for the asset
+    int     extension_mask;     ///< Coding components used in asset (DCA_EXSS_* bits)
+
+    int     core_offset;    ///< Offset to core component from start of substream
+    int     core_size;      ///< Size of core component in extension substream
+
+    int     xbr_offset;     ///< Offset to XBR extension from start of substream
+    int     xbr_size;       ///< Size of XBR extension in extension substream
+
+    int     xxch_offset;    ///< Offset to XXCH extension from start of substream
+    int     xxch_size;      ///< Size of XXCH extension in extension substream
+
+    int     x96_offset;     ///< Offset to X96 extension from start of substream
+    int     x96_size;       ///< Size of X96 extension in extension substream
+
+    int     lbr_offset;     ///< Offset to LBR component from start of substream
+    int     lbr_size;       ///< Size of LBR component in extension substream
+
+    int     xll_offset;         ///< Offset to XLL data from start of substream
+    int     xll_size;           ///< Size of XLL data in extension substream
+    int     xll_sync_present;   ///< XLL sync word present flag
+    int     xll_delay_nframes;  ///< Initial XLL decoding delay in frames (0 when no sync word is present)
+    int     xll_sync_offset;    ///< Number of bytes offset to XLL sync (0 when no sync word is present)
+
+    int     hd_stream_id;   ///< DTS-HD stream ID
+} DCAExssAsset;
+
+typedef struct DCAExssParser { // State filled in by ff_dca_exss_parse()
+    AVCodecContext  *avctx; ///< May be NULL: the parser then logs nothing and skips the header CRC check
+    GetBitContext   gb;
+
+    int     exss_index;         ///< Extension substream index
+    int     exss_size_nbits;    ///< Number of bits for extension substream size
+    int     exss_size;          ///< Number of bytes of extension substream
+
+    int     static_fields_present;  ///< Per stream static fields presence flag
+    int     npresents;  ///< Number of defined audio presentations
+    int     nassets;    ///< Number of audio assets in extension substream
+
+    int     mix_metadata_enabled;   ///< Mixing metadata enable flag
+    int     nmixoutconfigs;         ///< Number of mixing configurations (1 to 4)
+    int     nmixoutchs[4];          ///< Channel count derived from the speaker layout mask of each mixing configuration
+
+    DCAExssAsset   assets[1];    ///< Audio asset descriptors (one entry: streams with more assets are rejected)
+} DCAExssParser;
+
+int ff_dca_exss_parse(DCAExssParser *s, const uint8_t *data, int size); // Parses the EXSS header in data[0..size); returns 0 on success, a negative AVERROR code on failure
+
+#endif
diff --git a/libavcodec/dca_lbr.c b/libavcodec/dca_lbr.c
new file mode 100644
index 0000000..342603c
--- /dev/null
+++ b/libavcodec/dca_lbr.c
@@ -0,0 +1,1819 @@
+/*
+ * Copyright (C) 2016 foo86
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BITSTREAM_READER_LE
+
+#include "libavutil/channel_layout.h"
+
+#include "dcadec.h"
+#include "dcadata.h"
+#include "dcahuff.h"
+#include "dca_syncwords.h"
+#include "bytestream.h"
+
+#define AMP_MAX     56
+
+enum LBRHeader {    // LBR header type codes; NOTE(review): usage is not shown near here, confirm at the use site
+    LBR_HEADER_SYNC_ONLY    = 1,
+    LBR_HEADER_DECODER_INIT = 2
+};
+
+enum LBRFlags {     // bits of DCALbrDecoder.flags, loaded from one byte in parse_decoder_init()
+    LBR_FLAG_24_BIT             = 0x01,
+    LBR_FLAG_LFE_PRESENT        = 0x02, // cleared by parse_decoder_init() when the sample rate is not 48000
+    LBR_FLAG_BAND_LIMIT_2_3     = 0x04,
+    LBR_FLAG_BAND_LIMIT_1_2     = 0x08,
+    LBR_FLAG_BAND_LIMIT_1_3     = 0x0c,
+    LBR_FLAG_BAND_LIMIT_1_4     = 0x10,
+    LBR_FLAG_BAND_LIMIT_1_8     = 0x18,
+    LBR_FLAG_BAND_LIMIT_NONE    = 0x14,
+    LBR_FLAG_BAND_LIMIT_MASK    = 0x1c, // selects the band limit field (bits 2..4) holding one of the values above
+    LBR_FLAG_DMIX_STEREO        = 0x20,
+    LBR_FLAG_DMIX_MULTI_CH      = 0x40  // parse_decoder_init() reports this as a missing feature
+};
+
+enum LBRChunkTypes {    // chunk identifiers; parse_tonal_chunk() compares LBRChunk.id against some of them
+    LBR_CHUNK_NULL              = 0x00,
+    LBR_CHUNK_PAD               = 0x01,
+    LBR_CHUNK_FRAME             = 0x04,
+    LBR_CHUNK_FRAME_NO_CSUM     = 0x06,
+    LBR_CHUNK_LFE               = 0x0a,
+    LBR_CHUNK_ECS               = 0x0b,
+    LBR_CHUNK_RESERVED_1        = 0x0c,
+    LBR_CHUNK_RESERVED_2        = 0x0d,
+    LBR_CHUNK_SCF               = 0x0e, // parse_tonal_chunk(): tonal scale factors only
+    LBR_CHUNK_TONAL             = 0x10, // parse_tonal_chunk(): tonal groups only
+    LBR_CHUNK_TONAL_GRP_1       = 0x11,
+    LBR_CHUNK_TONAL_GRP_2       = 0x12,
+    LBR_CHUNK_TONAL_GRP_3       = 0x13,
+    LBR_CHUNK_TONAL_GRP_4       = 0x14,
+    LBR_CHUNK_TONAL_GRP_5       = 0x15,
+    LBR_CHUNK_TONAL_SCF         = 0x16, // parse_tonal_chunk(): scale factors followed by tonal groups
+    LBR_CHUNK_TONAL_SCF_GRP_1   = 0x17,
+    LBR_CHUNK_TONAL_SCF_GRP_2   = 0x18,
+    LBR_CHUNK_TONAL_SCF_GRP_3   = 0x19,
+    LBR_CHUNK_TONAL_SCF_GRP_4   = 0x1a,
+    LBR_CHUNK_TONAL_SCF_GRP_5   = 0x1b,
+    LBR_CHUNK_RES_GRID_LR       = 0x30, // NOTE(review): each X / X_LAST pair looks like an inclusive 16-id range
+    LBR_CHUNK_RES_GRID_LR_LAST  = 0x3f,
+    LBR_CHUNK_RES_GRID_HR       = 0x40,
+    LBR_CHUNK_RES_GRID_HR_LAST  = 0x4f,
+    LBR_CHUNK_RES_TS_1          = 0x50,
+    LBR_CHUNK_RES_TS_1_LAST     = 0x5f,
+    LBR_CHUNK_RES_TS_2          = 0x60,
+    LBR_CHUNK_RES_TS_2_LAST     = 0x6f,
+    LBR_CHUNK_EXTENSION         = 0x7f
+};
+
+typedef struct LBRChunk {
+    int id, len;            // chunk identifier and payload size in bytes
+    const uint8_t *data;    // payload; the parsers hand it to init_get_bits8() together with len
+} LBRChunk;
+
+static const int8_t channel_reorder_nolfe[7][5] = { // one row per layout named on the right
+    { 0, -1, -1, -1, -1 },  // C
+    { 0,  1, -1, -1, -1 },  // LR
+    { 0,  1,  2, -1, -1 },  // LR C
+    { 0,  1, -1, -1, -1 },  // LsRs
+    { 1,  2,  0, -1, -1 },  // LsRs C
+    { 0,  1,  2,  3, -1 },  // LR LsRs
+    { 0,  1,  3,  4,  2 },  // LR LsRs C
+};
+// Same seven layouts; NOTE(review): by its name the variant used when an LFE channel is output - confirm at use site
+static const int8_t channel_reorder_lfe[7][5] = {
+    { 0, -1, -1, -1, -1 },  // C
+    { 0,  1, -1, -1, -1 },  // LR
+    { 0,  1,  2, -1, -1 },  // LR C
+    { 1,  2, -1, -1, -1 },  // LsRs
+    { 2,  3,  0, -1, -1 },  // LsRs C
+    { 0,  1,  3,  4, -1 },  // LR LsRs
+    { 0,  1,  4,  5,  2 },  // LR LsRs C
+};
+// One entry per layout row; NOTE(review): presumably the output position of the LFE channel - verify against the caller
+static const uint8_t lfe_index[7] = {
+    1, 2, 3, 0, 1, 2, 3
+};
+// Equals the number of non-negative entries in the matching channel_reorder_* row
+static const uint8_t channel_counts[7] = {
+    1, 2, 3, 2, 3, 4, 5
+};
+// AV_CH_* masks listed in the same row order as the reorder tables
+static const uint16_t channel_layouts[7] = {
+    AV_CH_LAYOUT_MONO,
+    AV_CH_LAYOUT_STEREO,
+    AV_CH_LAYOUT_SURROUND,
+    AV_CH_SIDE_LEFT | AV_CH_SIDE_RIGHT,
+    AV_CH_FRONT_CENTER | AV_CH_SIDE_LEFT | AV_CH_SIDE_RIGHT,
+    AV_CH_LAYOUT_2_2,
+    AV_CH_LAYOUT_5POINT0
+};
+
+static float    cos_tab[256];   // cos(M_PI * i / 128), filled by init_tables()
+static float    lpc_tab[16];    // indexed by 4-bit LPC codes in convert_lpc(), filled by init_tables()
+// Fill cos_tab and lpc_tab on the first call; later calls return immediately
+static av_cold void init_tables(void)
+{
+    static int initialized;     // NOTE(review): plain flag, no synchronisation is visible in this function
+    int i;
+
+    if (initialized)
+        return;
+
+    for (i = 0; i < 256; i++)
+        cos_tab[i] = cos(M_PI * i / 128);   // one full cosine period spread over the 256 entries
+
+    for (i = 0; i < 16; i++)    // sine of (i - 8) steps; step is M_PI / 17 for i < 8 and M_PI / 15 otherwise
+        lpc_tab[i] = sin((i - 8) * (M_PI / ((i < 8) ? 17 : 15)));
+
+    initialized = 1;
+}
+
+static int parse_lfe_24(DCALbrDecoder *s)
+{
+    int step_max = FF_ARRAY_ELEMS(ff_dca_lfe_step_size_24) - 1;
+    int i, ps, si, code, step_i;
+    float step, value, delta;
+    // Decode 64 LFE samples from 6-bit adaptive delta codes into s->lfe_data; returns 0, or -1 on a bad step index
+    ps = get_bits(&s->gb, 24);
+    si = ps >> 23;  // top bit of the 24-bit start value is its sign
+    // Apply the sign to the 23-bit magnitude and scale it by 1 / 0x7fffff
+    value = (((ps & 0x7fffff) ^ -si) + si) * (1.0f / 0x7fffff);
+
+    step_i = get_bits(&s->gb, 8);   // initial index into the step size table
+    if (step_i > step_max) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid LFE step size index\n");
+        return -1;
+    }
+
+    step = ff_dca_lfe_step_size_24[step_i];
+
+    for (i = 0; i < 64; i++) {
+        code = get_bits(&s->gb, 6);
+        // Bits 4..0 add step, step/2 .. step/16; step/32 is always included
+        delta = step * 0.03125f;
+        if (code & 16)
+            delta += step;
+        if (code & 8)
+            delta += step * 0.5f;
+        if (code & 4)
+            delta += step * 0.25f;
+        if (code & 2)
+            delta += step * 0.125f;
+        if (code & 1)
+            delta += step * 0.0625f;
+        // Bit 5 selects the direction; the running value is clamped to [-3, 3]
+        if (code & 32) {
+            value -= delta;
+            if (value < -3.0f)
+                value = -3.0f;
+        } else {
+            value += delta;
+            if (value > 3.0f)
+                value = 3.0f;
+        }
+        // Adapt the step index from the five magnitude bits and keep it inside the table
+        step_i += ff_dca_lfe_delta_index_24[code & 31];
+        step_i = av_clip(step_i, 0, step_max);
+
+        step = ff_dca_lfe_step_size_24[step_i];
+        s->lfe_data[i] = value * s->lfe_scale;
+    }
+
+    return 0;
+}
+
+static int parse_lfe_16(DCALbrDecoder *s)
+{
+    int step_max = FF_ARRAY_ELEMS(ff_dca_lfe_step_size_16) - 1;
+    int i, ps, si, code, step_i;
+    float step, value, delta;
+    // Decode 64 LFE samples from 4-bit adaptive delta codes into s->lfe_data; returns 0, or -1 on a bad step index
+    ps = get_bits(&s->gb, 16);
+    si = ps >> 15;  // top bit of the 16-bit start value is its sign
+    // Apply the sign to the 15-bit magnitude and scale it by 1 / 0x7fff
+    value = (((ps & 0x7fff) ^ -si) + si) * (1.0f / 0x7fff);
+
+    step_i = get_bits(&s->gb, 8);   // initial index into the step size table
+    if (step_i > step_max) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid LFE step size index\n");
+        return -1;
+    }
+
+    step = ff_dca_lfe_step_size_16[step_i];
+
+    for (i = 0; i < 64; i++) {
+        code = get_bits(&s->gb, 4);
+        // Bits 2..0 add step, step/2, step/4; step/8 is always included
+        delta = step * 0.125f;
+        if (code & 4)
+            delta += step;
+        if (code & 2)
+            delta += step * 0.5f;
+        if (code & 1)
+            delta += step * 0.25f;
+        // Bit 3 selects the direction; the running value is clamped to [-3, 3]
+        if (code & 8) {
+            value -= delta;
+            if (value < -3.0f)
+                value = -3.0f;
+        } else {
+            value += delta;
+            if (value > 3.0f)
+                value = 3.0f;
+        }
+        // Adapt the step index from the three magnitude bits and keep it inside the table
+        step_i += ff_dca_lfe_delta_index_16[code & 7];
+        step_i = av_clip(step_i, 0, step_max);
+
+        step = ff_dca_lfe_step_size_16[step_i];
+        s->lfe_data[i] = value * s->lfe_scale;
+    }
+
+    return 0;
+}
+
+static int parse_lfe_chunk(DCALbrDecoder *s, LBRChunk *chunk)
+{
+    if (!(s->flags & LBR_FLAG_LFE_PRESENT))
+        return 0;   // LFE not enabled: the chunk is ignored
+
+    if (!chunk->len)
+        return 0;   // empty chunk: nothing to decode, not an error
+
+    if (init_get_bits8(&s->gb, chunk->data, chunk->len) < 0)
+        return -1;
+
+    // Determine bit depth from chunk size
+    if (chunk->len >= 52)   // 24 + 8 + 64 * 6 bits = 52 bytes consumed by parse_lfe_24()
+        return parse_lfe_24(s);
+    if (chunk->len >= 35)   // 16 + 8 + 64 * 4 bits = 35 bytes consumed by parse_lfe_16()
+        return parse_lfe_16(s);
+
+    av_log(s->avctx, AV_LOG_ERROR, "LFE chunk too short\n");
+    return -1;
+}
+
+static inline int parse_vlc(GetBitContext *s, VLC *vlc, int max_depth)  // read one value: VLC symbol or escape
+{
+    int v = get_vlc2(s, vlc->table, vlc->bits, max_depth);
+    if (v > 0)
+        return v - 1;   // a positive symbol v stands for the value v - 1
+    // Rare value: a 3-bit field gives the width minus one (1..8 bits) of the raw value that follows
+    return get_bits(s, get_bits(s, 3) + 1);
+}
+
+static int parse_tonal(DCALbrDecoder *s, int group)
+{
+    unsigned int amp[DCA_LBR_CHANNELS_TOTAL];
+    unsigned int phs[DCA_LBR_CHANNELS_TOTAL];
+    unsigned int diff, main_amp, shift;
+    int sf, sf_idx, ch, main_ch, freq;
+    int ch_nbits = av_ceil_log2(s->nchannels_total);
+    // Parse the tones of one tonal group from s->gb into the s->tones ring; returns 0, or -1 on invalid data
+    // Parse subframes for this group; the terminator left in diff advances 1 subframe (0) or 8 subframes (1)
+    for (sf = 0; sf < 1 << group; sf += diff ? 8 : 1) {
+        sf_idx = ((s->framenum << group) + sf) & 31;
+        s->tonal_bounds[group][sf_idx][0] = s->ntones;  // index of the first tone of this subframe
+
+        // Parse tones for this subframe
+        for (freq = 1;; freq++) {
+            if (get_bits_left(&s->gb) < 1) {
+                av_log(s->avctx, AV_LOG_ERROR, "Tonal group chunk too short\n");
+                return -1;
+            }
+
+            diff = parse_vlc(&s->gb, &ff_dca_vlc_tnl_grp[group], 2);
+            if (diff >= FF_ARRAY_ELEMS(ff_dca_fst_amp)) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid tonal frequency diff\n");
+                return -1;
+            }
+            // The VLC value selects a base from ff_dca_fst_amp plus (diff >> 2) extra raw bits
+            diff = get_bitsz(&s->gb, diff >> 2) + ff_dca_fst_amp[diff];
+            if (diff <= 1)
+                break;  // End of subframe
+            // Keep x_freq <= nsubbands * 4 - 6 (was - 5); NOTE(review): assumes synthesis reaches x_freq + 5 - confirm
+            freq += diff - 2;
+            if (freq >> (5 - group) > s->nsubbands * 4 - 6) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid spectral line offset\n");
+                return -1;
+            }
+
+            // Main channel
+            main_ch = get_bitsz(&s->gb, ch_nbits);
+            main_amp = parse_vlc(&s->gb, &ff_dca_vlc_tnl_scf, 2)
+                + s->tonal_scf[ff_dca_freq_to_sb[freq >> (7 - group)]]
+                + s->limited_range - 2;
+            amp[main_ch] = main_amp < AMP_MAX ? main_amp : 0;   // out of range amplitude drops the tone
+            phs[main_ch] = get_bits(&s->gb, 3);
+
+            // Secondary channels: one presence bit each, then amplitude and phase coded relative to the main channel
+            for (ch = 0; ch < s->nchannels_total; ch++) {
+                if (ch == main_ch)
+                    continue;
+                if (get_bits1(&s->gb)) {
+                    amp[ch] = amp[main_ch] - parse_vlc(&s->gb, &ff_dca_vlc_damp, 1);
+                    phs[ch] = phs[main_ch] - parse_vlc(&s->gb, &ff_dca_vlc_dph,  1);
+                } else {
+                    amp[ch] = 0;
+                    phs[ch] = 0;
+                }
+            }
+
+            if (amp[main_ch]) {
+                // Allocate new tone
+                DCALbrTone *t = &s->tones[s->ntones];
+                s->ntones = (s->ntones + 1) & (DCA_LBR_TONES - 1);  // wrap; assumes DCA_LBR_TONES is a power of two
+                // Split freq into a spectral line number and a fractional offset scaled by the group
+                t->x_freq = freq >> (5 - group);
+                t->f_delt = (freq & ((1 << (5 - group)) - 1)) << group;
+                t->ph_rot = 256 - (t->x_freq & 1) * 128 - t->f_delt * 4;
+
+                shift = ff_dca_ph0_shift[(t->x_freq & 3) * 2 + (freq & 1)]
+                    - ((t->ph_rot << (5 - group)) - t->ph_rot);
+                // Only the first s->nchannels channels are stored; unsigned wrap-around in amp[] is zeroed here
+                for (ch = 0; ch < s->nchannels; ch++) {
+                    t->amp[ch] = amp[ch] < AMP_MAX ? amp[ch] : 0;
+                    t->phs[ch] = 128 - phs[ch] * 32 + shift;
+                }
+            }
+        }
+
+        s->tonal_bounds[group][sf_idx][1] = s->ntones;  // index one past the last tone of this subframe
+    }
+
+    return 0;
+}
+
+static int parse_tonal_chunk(DCALbrDecoder *s, LBRChunk *chunk) // returns 0 on success or empty chunk, -1 on error
+{
+    int sb, group;
+
+    if (!chunk->len)
+        return 0;
+
+    if (init_get_bits8(&s->gb, chunk->data, chunk->len) < 0)
+        return -1;
+
+    // Scale factors: six 6-bit values (36 bits), present for LBR_CHUNK_SCF and LBR_CHUNK_TONAL_SCF
+    if (chunk->id == LBR_CHUNK_SCF || chunk->id == LBR_CHUNK_TONAL_SCF) {
+        if (get_bits_left(&s->gb) < 36) {
+            av_log(s->avctx, AV_LOG_ERROR, "Tonal scale factor chunk too short\n");
+            return -1;
+        }
+        for (sb = 0; sb < 6; sb++)
+            s->tonal_scf[sb] = get_bits(&s->gb, 6);
+    }
+
+    // Tonal groups: all five groups follow back to back in the same bit reader
+    if (chunk->id == LBR_CHUNK_TONAL || chunk->id == LBR_CHUNK_TONAL_SCF)
+        for (group = 0; group < 5; group++)
+            if (parse_tonal(s, group) < 0)
+                return -1;
+
+    return 0;
+}
+
+static int parse_tonal_group(DCALbrDecoder *s, LBRChunk *chunk) // parse a chunk holding a single tonal group
+{
+    if (!chunk->len)
+        return 0;
+
+    if (init_get_bits8(&s->gb, chunk->data, chunk->len) < 0)
+        return -1;
+    // chunk->id is used directly as the group index; NOTE(review): caller must have stored 0..4 there - confirm
+    return parse_tonal(s, chunk->id);
+}
+
+/**
+ * Check point to ensure that at least n bits are left. Otherwise skips to the
+ * end of chunk and returns 1 (-1 if the reader has already overrun); 0 if OK.
+ */
+static int ensure_bits(GetBitContext *s, int n)
+{
+    int left = get_bits_left(s);
+    if (left < 0)
+        return -1;  // already read past the end
+    if (left < n) {
+        skip_bits_long(s, left);    // consume the remainder so later reads see an empty chunk
+        return 1;
+    }
+    return 0;
+}
+
+static int parse_scale_factors(DCALbrDecoder *s, uint8_t *scf)  // fills scf[0..7]; 0 on success/truncation, -1 on error
+{
+    int i, sf, prev, next, dist;
+
+    // On truncation the entries of scf not yet written are left as they were
+    if (ensure_bits(&s->gb, 20))
+        return 0;
+
+    // Initial scale factor
+    prev = parse_vlc(&s->gb, &ff_dca_vlc_fst_rsd_amp, 2);
+
+    for (sf = 0; sf < 7; sf += dist) {
+        scf[sf] = prev; // Store previous value
+
+        if (ensure_bits(&s->gb, 20))
+            return 0;
+
+        // Interpolation distance: must not run past index 7
+        dist = parse_vlc(&s->gb, &ff_dca_vlc_rsd_apprx, 1) + 1;
+        if (dist > 7 - sf) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid scale factor distance\n");
+            return -1;
+        }
+
+        if (ensure_bits(&s->gb, 20))
+            return 0;
+
+        // Final interpolation point
+        next = parse_vlc(&s->gb, &ff_dca_vlc_rsd_amp, 2);
+        // Odd codes move up by (code + 1) / 2, even codes move down by code / 2
+        if (next & 1)
+            next = prev + ((next + 1) >> 1);
+        else
+            next = prev - ( next      >> 1);
+
+        // Interpolate: distances 2 and 4 shift the absolute difference, others use integer division
+        switch (dist) {
+        case 2:
+            if (next > prev)
+                scf[sf + 1] = prev + ((next - prev) >> 1);
+            else
+                scf[sf + 1] = prev - ((prev - next) >> 1);
+            break;
+
+        case 4:
+            if (next > prev) {
+                scf[sf + 1] = prev + ( (next - prev)      >> 2);
+                scf[sf + 2] = prev + ( (next - prev)      >> 1);
+                scf[sf + 3] = prev + (((next - prev) * 3) >> 2);
+            } else {
+                scf[sf + 1] = prev - ( (prev - next)      >> 2);
+                scf[sf + 2] = prev - ( (prev - next)      >> 1);
+                scf[sf + 3] = prev - (((prev - next) * 3) >> 2);
+            }
+            break;
+
+        default:
+            for (i = 1; i < dist; i++)
+                scf[sf + i] = prev + (next - prev) * i / dist;
+            break;
+        }
+
+        prev = next;
+    }
+
+    scf[sf] = next; // Store final value
+
+    return 0;
+}
+
+static int parse_st_code(GetBitContext *s, int min_v)   // returns an index into ff_dca_st_coeff
+{
+    unsigned int v = parse_vlc(s, &ff_dca_vlc_st_grid, 2) + min_v;
+    // Odd values map above 16, even values below; unsigned wrap below zero is caught by the range check
+    if (v & 1)
+        v = 16 + (v >> 1);
+    else
+        v = 16 - (v >> 1);
+    // Anything outside the coefficient table falls back to index 16
+    if (v >= FF_ARRAY_ELEMS(ff_dca_st_coeff))
+        v = 16;
+    return v;
+}
+
+static int parse_grid_1_chunk(DCALbrDecoder *s, LBRChunk *chunk, int ch1, int ch2)
+{
+    int ch, sb, sf, nsubbands;
+    // Parses first grid scale factors, third grid averages and, when ch1 != ch2, partial stereo codes; 0 or -1
+    if (!chunk->len)
+        return 0;
+
+    if (init_get_bits8(&s->gb, chunk->data, chunk->len) < 0)
+        return -1;
+
+    // Scale factors from grid subband 2 up; ch2 is read here only below min_mono_subband
+    nsubbands = ff_dca_scf_to_grid_1[s->nsubbands - 1] + 1;
+    for (sb = 2; sb < nsubbands; sb++) {
+        if (parse_scale_factors(s, s->grid_1_scf[ch1][sb]) < 0)
+            return -1;
+        if (ch1 != ch2 && ff_dca_grid_1_to_scf[sb] < s->min_mono_subband
+            && parse_scale_factors(s, s->grid_1_scf[ch2][sb]) < 0)
+            return -1;
+    }
+
+    if (get_bits_left(&s->gb) < 1)
+        return 0;   // Should not happen, but a sample exists that proves otherwise
+
+    // Average values for third grid; ch2 copies ch1 from min_mono_subband upwards
+    for (sb = 0; sb < s->nsubbands - 4; sb++) {
+        s->grid_3_avg[ch1][sb] = parse_vlc(&s->gb, &ff_dca_vlc_avg_g3, 2) - 16;
+        if (ch1 != ch2) {
+            if (sb + 4 < s->min_mono_subband)
+                s->grid_3_avg[ch2][sb] = parse_vlc(&s->gb, &ff_dca_vlc_avg_g3, 2) - 16;
+            else
+                s->grid_3_avg[ch2][sb] = s->grid_3_avg[ch1][sb];
+        }
+    }
+    // The loop above reads unchecked, so an overrun is detected afterwards
+    if (get_bits_left(&s->gb) < 0) {
+        av_log(s->avctx, AV_LOG_ERROR, "First grid chunk too short\n");
+        return -1;
+    }
+
+    // Stereo image for partial mono mode
+    if (ch1 != ch2) {
+        int min_v[2];
+
+        if (ensure_bits(&s->gb, 8))
+            return 0;
+        // One 4-bit offset per channel, added to every stereo code of that channel
+        min_v[0] = get_bits(&s->gb, 4);
+        min_v[1] = get_bits(&s->gb, 4);
+        // One group of four codes (sf 1..4) per four subbands from min_mono_subband up, rounded up
+        nsubbands = (s->nsubbands - s->min_mono_subband + 3) / 4;
+        for (sb = 0; sb < nsubbands; sb++)
+            for (ch = ch1; ch <= ch2; ch++)
+                for (sf = 1; sf <= 4; sf++)
+                    s->part_stereo[ch][sb][sf] = parse_st_code(&s->gb, min_v[ch - ch1]);
+        // Flag the data as present only when it was read without overrunning the chunk
+        if (get_bits_left(&s->gb) >= 0)
+            s->part_stereo_pres |= 1 << ch1;
+    }
+
+    // Low resolution spatial information is not decoded
+
+    return 0;
+}
+
+static int parse_grid_1_sec_ch(DCALbrDecoder *s, int ch2)   // continues on the current s->gb; returns 0 or -1
+{
+    int sb, nsubbands;
+
+    // Scale factors of ch2 for the subbands at or above min_mono_subband
+    nsubbands = ff_dca_scf_to_grid_1[s->nsubbands - 1] + 1;
+    for (sb = 2; sb < nsubbands; sb++) {
+        if (ff_dca_grid_1_to_scf[sb] >= s->min_mono_subband
+            && parse_scale_factors(s, s->grid_1_scf[ch2][sb]) < 0)
+            return -1;
+    }
+
+    // Average values for third grid, same subband selection; stops quietly when the chunk runs out
+    for (sb = 0; sb < s->nsubbands - 4; sb++) {
+        if (sb + 4 >= s->min_mono_subband) {
+            if (ensure_bits(&s->gb, 20))
+                return 0;
+            s->grid_3_avg[ch2][sb] = parse_vlc(&s->gb, &ff_dca_vlc_avg_g3, 2) - 16;
+        }
+    }
+
+    return 0;
+}
+
+static void parse_grid_3(DCALbrDecoder *s, int ch1, int ch2, int sb, int flag)  // third grid scale factors of one subband
+{
+    int i, ch;
+
+    for (ch = ch1; ch <= ch2; ch++) {
+        if ((ch != ch1 && sb + 4 >= s->min_mono_subband) != flag)
+            continue;   // flag picks the pass: 1 handles only non-first channels at or above min_mono_subband
+
+        if (s->grid_3_pres[ch] & (1U << sb))
+            continue;   // Already parsed
+
+        for (i = 0; i < 8; i++) {
+            if (ensure_bits(&s->gb, 20))
+                return; // truncated: the subband is not flagged as parsed
+            s->grid_3_scf[ch][sb][i] = parse_vlc(&s->gb, &ff_dca_vlc_grid_3, 2) - 16;
+        }
+
+        // Flag scale factors for this subband parsed
+        s->grid_3_pres[ch] |= 1U << sb;
+    }
+}
+
+static float lbr_rand(DCALbrDecoder *s, int sb) // next pseudo-random sample scaled for subband sb
+{
+    s->lbr_rand = 1103515245U * s->lbr_rand + 12345U;   // linear congruential update of the decoder's generator state
+    return s->lbr_rand * s->sb_scf[sb];
+}
+
+/**
+ * Parse time samples for one subband, filling truncated samples with randomness
+ */
+static void parse_ch(DCALbrDecoder *s, int ch, int sb, int quant_level, int flag)
+{
+    float *samples = s->time_samples[ch][sb];
+    int i, j, code, nblocks, coding_method;
+
+    if (ensure_bits(&s->gb, 20))
+        return; // Too few bits left
+
+    coding_method = get_bits1(&s->gb);  // only consulted for quant_level 2
+    // Each case leaves in i the number of samples actually decoded
+    switch (quant_level) {
+    case 1: // one bit per sample, eight samples per byte
+        nblocks = FFMIN(get_bits_left(&s->gb) / 8, DCA_LBR_TIME_SAMPLES / 8);
+        for (i = 0; i < nblocks; i++, samples += 8) {
+            code = get_bits(&s->gb, 8);
+            for (j = 0; j < 8; j++)
+                samples[j] = ff_dca_rsd_level_2a[(code >> j) & 1];
+        }
+        i = nblocks * 8;
+        break;
+
+    case 2: // either 1-2 bits per sample, or five samples packed into a byte
+        if (coding_method) {
+            for (i = 0; i < DCA_LBR_TIME_SAMPLES && get_bits_left(&s->gb) >= 2; i++) {
+                if (get_bits1(&s->gb))
+                    samples[i] = ff_dca_rsd_level_2b[get_bits1(&s->gb)];
+                else
+                    samples[i] = 0;
+            }
+        } else {    // NOTE(review): block count rounds up, so the last block may pass DCA_LBR_TIME_SAMPLES - confirm padding
+            nblocks = FFMIN(get_bits_left(&s->gb) / 8, (DCA_LBR_TIME_SAMPLES + 4) / 5);
+            for (i = 0; i < nblocks; i++, samples += 5) {
+                code = ff_dca_rsd_pack_5_in_8[get_bits(&s->gb, 8)];
+                for (j = 0; j < 5; j++)
+                    samples[j] = ff_dca_rsd_level_3[(code >> j * 2) & 3];
+            }
+            i = nblocks * 5;
+        }
+        break;
+
+    case 3: // three samples packed into seven bits; block count rounds up as above
+        nblocks = FFMIN(get_bits_left(&s->gb) / 7, (DCA_LBR_TIME_SAMPLES + 2) / 3);
+        for (i = 0; i < nblocks; i++, samples += 3) {
+            code = get_bits(&s->gb, 7);
+            for (j = 0; j < 3; j++)
+                samples[j] = ff_dca_rsd_level_5[ff_dca_rsd_pack_3_in_7[code][j]];
+        }
+        i = nblocks * 3;
+        break;
+
+    case 4: // one VLC code per sample; stops once fewer than 6 bits are left
+        for (i = 0; i < DCA_LBR_TIME_SAMPLES && get_bits_left(&s->gb) >= 6; i++)
+            samples[i] = ff_dca_rsd_level_8[get_vlc2(&s->gb, ff_dca_vlc_rsd.table, 6, 1)];
+        break;
+
+    case 5: // four bits per sample
+        nblocks = FFMIN(get_bits_left(&s->gb) / 4, DCA_LBR_TIME_SAMPLES);
+        for (i = 0; i < nblocks; i++)
+            samples[i] = ff_dca_rsd_level_16[get_bits(&s->gb, 4)];
+        break;
+
+    default:    // callers are expected to pass 1..5 only
+        av_assert0(0);
+    }
+
+    if (flag && get_bits_left(&s->gb) < 20)
+        return; // Skip incomplete mono subband
+    // Fill whatever was not decoded with scaled pseudo-random values
+    for (; i < DCA_LBR_TIME_SAMPLES; i++)
+        s->time_samples[ch][sb][i] = lbr_rand(s, sb);
+    // Mark this subband as present for the channel
+    s->ch_pres[ch] |= 1U << sb;
+}
+
+static int parse_ts(DCALbrDecoder *s, int ch1, int ch2,
+                    int start_sb, int end_sb, int flag)
+{
+    int sb, sb_g3, sb_reorder, quant_level;
+    // Parse time samples of subbands [start_sb, end_sb) from s->gb; returns 0 (also on truncation) or -1 on bad data
+    for (sb = start_sb; sb < end_sb; sb++) {
+        // Subband number before reordering
+        if (sb < 6) {
+            sb_reorder = sb;    // the first six subbands are never reordered
+        } else if (flag && sb < s->max_mono_subband) {
+            sb_reorder = s->sb_indices[sb]; // reuse the index stored by an earlier pass
+        } else {
+            if (ensure_bits(&s->gb, 28))
+                break;
+            sb_reorder = get_bits(&s->gb, s->limited_range + 3);
+            if (sb_reorder < 6)
+                sb_reorder = 6;
+            s->sb_indices[sb] = sb_reorder;
+        }
+        if (sb_reorder >= s->nsubbands)
+            return -1;
+
+        // Third grid scale factors: per subband below 12, all remaining ones at once when sb reaches 12
+        if (sb == 12) {
+            for (sb_g3 = 0; sb_g3 < s->g3_avg_only_start_sb - 4; sb_g3++)
+                parse_grid_3(s, ch1, ch2, sb_g3, flag);
+        } else if (sb < 12 && sb_reorder >= 4) {
+            parse_grid_3(s, ch1, ch2, sb_reorder - 4, flag);
+        }
+
+        // Secondary channel flags
+        if (ch1 != ch2) {
+            if (ensure_bits(&s->gb, 20))
+                break;
+            if (!flag || sb_reorder >= s->max_mono_subband)
+                s->sec_ch_sbms[ch1 / 2][sb_reorder] = get_bits(&s->gb, 8);
+            if (flag && sb_reorder >= s->min_mono_subband)
+                s->sec_ch_lrms[ch1 / 2][sb_reorder] = get_bits(&s->gb, 8);
+        }
+        // Quantization level is looked up by position sb, not by the reordered index; zero is rejected
+        quant_level = s->quant_levels[ch1 / 2][sb];
+        if (!quant_level)
+            return -1;
+
+        // Time samples for one or both channels
+        if (sb < s->max_mono_subband && sb_reorder >= s->min_mono_subband) {
+            if (!flag)
+                parse_ch(s, ch1, sb_reorder, quant_level, 0);
+            else if (ch1 != ch2)
+                parse_ch(s, ch2, sb_reorder, quant_level, 1);
+        } else {
+            parse_ch(s, ch1, sb_reorder, quant_level, 0);
+            if (ch1 != ch2)
+                parse_ch(s, ch2, sb_reorder, quant_level, 0);
+        }
+    }
+
+    return 0;
+}
+
+/**
+ * Convert from reflection coefficients to direct form coefficients
+ */
+static void convert_lpc(float *coeff, const int *codes) // reads codes[0..7], writes coeff[0..7]
+{
+    int i, j;
+
+    for (i = 0; i < 8; i++) {
+        float rc = lpc_tab[codes[i]];   // each code indexes the 16-entry table built by init_tables()
+        for (j = 0; j < (i + 1) / 2; j++) { // update the coefficients found so far in symmetric pairs
+            float tmp1 = coeff[    j    ];
+            float tmp2 = coeff[i - j - 1];
+            coeff[    j    ] = tmp1 + rc * tmp2;
+            coeff[i - j - 1] = tmp2 + rc * tmp1;
+        }
+        coeff[i] = rc;
+    }
+}
+
+static int parse_lpc(DCALbrDecoder *s, int ch1, int ch2, int start_sb, int end_sb)  // always returns 0
+{
+    int f = s->framenum & 1;    // frame parity selects which coefficient set is overwritten
+    int i, sb, ch, codes[16];
+
+    // First two subbands have two sets of coefficients, third subband has one
+    for (sb = start_sb; sb < end_sb; sb++) {
+        int ncodes = 8 * (1 + (sb < 2));
+        for (ch = ch1; ch <= ch2; ch++) {
+            if (ensure_bits(&s->gb, 4 * ncodes))
+                return 0;   // truncated chunk: remaining coefficients are left unchanged
+            for (i = 0; i < ncodes; i++)
+                codes[i] = get_bits(&s->gb, 4);
+            for (i = 0; i < ncodes / 8; i++)
+                convert_lpc(s->lpc_coeff[f][ch][sb][i], &codes[i * 8]);
+        }
+    }
+
+    return 0;
+}
+
+static int parse_high_res_grid(DCALbrDecoder *s, LBRChunk *chunk, int ch1, int ch2)
+{
+    int quant_levels[DCA_LBR_SUBBANDS];
+    int sb, ch, ol, st, max_sb, profile;
+    // Derives per-subband quantization levels from a profile byte, then parses the first two subbands; 0 or -1
+    if (!chunk->len)
+        return 0;
+
+    if (init_get_bits8(&s->gb, chunk->data, chunk->len) < 0)
+        return -1;
+
+    // Quantizer profile
+    profile = get_bits(&s->gb, 8);
+    // Overall level (bits 3..5)
+    ol = (profile >> 3) & 7;
+    // Steepness (bits 6..7)
+    st = profile >> 6;
+    // Max energy subband (bits 0..2)
+    max_sb = profile & 7;
+
+    // Calculate quantization levels: the figure a falls as f and st grow and rises with ol
+    for (sb = 0; sb < s->nsubbands; sb++) {
+        int f = sb * s->limited_rate / s->nsubbands;
+        int a = 18000 / (12 * f / 1000 + 100 + 40 * st) + 20 * ol;
+        if (a <= 95)
+            quant_levels[sb] = 1;
+        else if (a <= 140)
+            quant_levels[sb] = 2;
+        else if (a <= 180)
+            quant_levels[sb] = 3;
+        else if (a <= 230)
+            quant_levels[sb] = 4;
+        else
+            quant_levels[sb] = 5;
+    }
+
+    // Reorder quantization levels for lower subbands; levels are stored per channel pair (ch1 / 2)
+    for (sb = 0; sb < 8; sb++)
+        s->quant_levels[ch1 / 2][sb] = quant_levels[ff_dca_sb_reorder[max_sb][sb]];
+    for (; sb < s->nsubbands; sb++)
+        s->quant_levels[ch1 / 2][sb] = quant_levels[sb];
+
+    // LPC for the first two subbands
+    if (parse_lpc(s, ch1, ch2, 0, 2) < 0)
+        return -1;
+
+    // Time-samples for the first two subbands of main channel
+    if (parse_ts(s, ch1, ch2, 0, 2, 0) < 0)
+        return -1;
+
+    // First two bands of the first grid
+    for (sb = 0; sb < 2; sb++)
+        for (ch = ch1; ch <= ch2; ch++)
+            if (parse_scale_factors(s, s->grid_1_scf[ch][sb]) < 0)
+                return -1;
+
+    return 0;
+}
+
+static int parse_grid_2(DCALbrDecoder *s, int ch1, int ch2,
+                        int start_sb, int end_sb, int flag)
+{
+    int i, j, sb, ch, nsubbands;
+    // Parse second grid scale factors (64 per channel and subband) for [start_sb, end_sb); always returns 0
+    nsubbands = ff_dca_scf_to_grid_2[s->nsubbands - 1] + 1;
+    if (end_sb > nsubbands)
+        end_sb = nsubbands;
+
+    for (sb = start_sb; sb < end_sb; sb++) {
+        for (ch = ch1; ch <= ch2; ch++) {
+            uint8_t *g2_scf = s->grid_2_scf[ch][sb];
+            // Channels belonging to the other pass are skipped; in the flag == 0 pass they copy ch1 instead
+            if ((ch != ch1 && ff_dca_grid_2_to_scf[sb] >= s->min_mono_subband) != flag) {
+                if (!flag)
+                    memcpy(g2_scf, s->grid_2_scf[ch1][sb], 64);
+                continue;
+            }
+
+            // Scale factors in groups of 8
+            for (i = 0; i < 8; i++, g2_scf += 8) {
+                if (get_bits_left(&s->gb) < 1) {
+                    memset(g2_scf, 0, 64 - i * 8);  // out of data: zero this and all remaining groups
+                    break;
+                }
+                // Presence bit: set means eight coded values follow, clear means the whole group is zero
+                if (get_bits1(&s->gb)) {
+                    for (j = 0; j < 8; j++) {
+                        if (ensure_bits(&s->gb, 20))
+                            break;  // truncated: the rest of this group keeps its previous contents
+                        g2_scf[j] = parse_vlc(&s->gb, &ff_dca_vlc_grid_2, 2);
+                    }
+                } else {
+                    memset(g2_scf, 0, 8);
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int parse_ts1_chunk(DCALbrDecoder *s, LBRChunk *chunk, int ch1, int ch2) // 0 on success or empty chunk, -1 on error
+{
+    if (!chunk->len)
+        return 0;
+    if (init_get_bits8(&s->gb, chunk->data, chunk->len) < 0)
+        return -1;
+    if (parse_lpc(s, ch1, ch2, 2, 3) < 0)           // LPC coefficients of subband 2
+        return -1;
+    if (parse_ts(s, ch1, ch2, 2, 4, 0) < 0)         // time samples of subbands 2..3
+        return -1;
+    if (parse_grid_2(s, ch1, ch2, 0, 1, 0) < 0)     // second grid, subband 0
+        return -1;
+    if (parse_ts(s, ch1, ch2, 4, 6, 0) < 0)         // time samples of subbands 4..5
+        return -1;
+    return 0;
+}
+
+static int parse_ts2_chunk(DCALbrDecoder *s, LBRChunk *chunk, int ch1, int ch2) // 0 on success or empty chunk, -1 on error
+{
+    if (!chunk->len)
+        return 0;
+    if (init_get_bits8(&s->gb, chunk->data, chunk->len) < 0)
+        return -1;
+    if (parse_grid_2(s, ch1, ch2, 1, 3, 0) < 0)     // second grid, subbands 1..2
+        return -1;
+    if (parse_ts(s, ch1, ch2, 6, s->max_mono_subband, 0) < 0)   // first pass (flag 0) from subband 6
+        return -1;
+    if (ch1 != ch2) {   // data of the second channel is only present for a real channel pair
+        if (parse_grid_1_sec_ch(s, ch2) < 0)
+            return -1;
+        if (parse_grid_2(s, ch1, ch2, 0, 3, 1) < 0)
+            return -1;
+    }
+    if (parse_ts(s, ch1, ch2, s->min_mono_subband, s->nsubbands, 1) < 0)    // second pass (flag 1)
+        return -1;
+    return 0;
+}
+
+static int init_sample_rate(DCALbrDecoder *s)   // rebuilds rate dependent state; returns 0, or -1 if MDCT init fails
+{
+    double scale = (-1.0 / (1 << 17)) * sqrt(1 << (2 - s->limited_range));
+    int i, br_per_ch = s->bit_rate_scaled / s->nchannels_total;
+    // Drop the previous transform before creating one for the new frequency range
+    ff_mdct_end(&s->imdct);
+
+    if (ff_mdct_init(&s->imdct, s->freq_range + 6, 1, scale) < 0)
+        return -1;
+    // Take every (1 << (2 - freq_range))-th entry of the long window
+    for (i = 0; i < 32 << s->freq_range; i++)
+        s->window[i] = ff_dca_long_window[i << (2 - s->freq_range)];
+    // Scale for lbr_rand() output: 0.85 below 14000 per channel, rising linearly to 1.0 at 32000
+    if (br_per_ch < 14000)
+        scale = 0.85;
+    else if (br_per_ch < 32000)
+        scale = (br_per_ch - 14000) * (1.0 / 120000) + 0.85;
+    else
+        scale = 1.0;
+
+    scale *= 1.0 / INT_MAX;
+    // Per-subband factors multiplied with the generator state in lbr_rand()
+    for (i = 0; i < s->nsubbands; i++) {
+        if (i < 2)
+            s->sb_scf[i] = 0;   // The first two subbands are always zero
+        else if (i < 5)
+            s->sb_scf[i] = (i - 1) * 0.25 * 0.785 * scale;
+        else
+            s->sb_scf[i] = 0.785 * scale;
+    }
+    // Multiplier applied to every decoded LFE sample in parse_lfe_24() / parse_lfe_16()
+    s->lfe_scale = (16 << s->freq_range) * 0.0000078265894;
+
+    return 0;
+}
+
+static int alloc_sample_buffer(DCALbrDecoder *s)    // returns 0, or -1 when the allocation fails
+{
+    // Reserve space for history and padding
+    int nchsamples = DCA_LBR_TIME_SAMPLES + DCA_LBR_TIME_HISTORY * 2;
+    int nsamples = nchsamples * s->nchannels * s->nsubbands;
+    int ch, sb;
+    float *ptr;
+
+    // Reallocate time sample buffer
+    av_fast_mallocz(&s->ts_buffer, &s->ts_size, nsamples * sizeof(float));
+    if (!s->ts_buffer)
+        return -1;
+    // Each slice pointer starts DCA_LBR_TIME_HISTORY floats into its nchsamples-sized region
+    ptr = s->ts_buffer + DCA_LBR_TIME_HISTORY;
+    for (ch = 0; ch < s->nchannels; ch++) {
+        for (sb = 0; sb < s->nsubbands; sb++) {
+            s->time_samples[ch][sb] = ptr;
+            ptr += nchsamples;
+        }
+    }
+
+    return 0;
+}
+
+static int parse_decoder_init(DCALbrDecoder *s, GetByteContext *gb)
+{
+    int old_rate = s->sample_rate;          // previous configuration, compared below to detect changes
+    int old_band_limit = s->band_limit;
+    int old_nchannels = s->nchannels;
+    int version, bit_rate_hi;
+    unsigned int sr_code;
+    // Parse the decoder initialization header from gb into s; returns 0 or a negative AVERROR code
+    // Sample rate of LBR audio (one byte, used as index into ff_dca_sampling_freqs)
+    sr_code = bytestream2_get_byte(gb);
+    if (sr_code >= FF_ARRAY_ELEMS(ff_dca_sampling_freqs)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid LBR sample rate\n");
+        return AVERROR_INVALIDDATA;
+    }
+    s->sample_rate = ff_dca_sampling_freqs[sr_code];
+    if (s->sample_rate > 48000) {           // rates above 48000 Hz are reported as a missing feature
+        avpriv_report_missing_feature(s->avctx, "%d Hz LBR sample rate", s->sample_rate);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    // LBR speaker mask (16 bits, little endian); at least one of the low three bits must be set
+    s->ch_mask = bytestream2_get_le16(gb);
+    if (!(s->ch_mask & 0x7)) {
+        avpriv_report_missing_feature(s->avctx, "LBR channel mask %#x", s->ch_mask);
+        return AVERROR_PATCHWELCOME;
+    }
+    if ((s->ch_mask & 0xfff0) && !(s->warned & 1)) {    // bits in 0xfff0: report once (warned bit 0), keep decoding
+        avpriv_report_missing_feature(s->avctx, "LBR channel mask %#x", s->ch_mask);
+        s->warned |= 1;
+    }
+
+    // LBR bitstream version (16 bits, little endian); only a high byte of 0x08 is accepted
+    version = bytestream2_get_le16(gb);
+    if ((version & 0xff00) != 0x0800) {
+        avpriv_report_missing_feature(s->avctx, "LBR stream version %#x", version);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    // Flags for LBR decoder initialization
+    s->flags = bytestream2_get_byte(gb);
+    if (s->flags & LBR_FLAG_DMIX_MULTI_CH) {
+        avpriv_report_missing_feature(s->avctx, "LBR multi-channel downmix");
+        return AVERROR_PATCHWELCOME;
+    }
+    if ((s->flags & LBR_FLAG_LFE_PRESENT) && s->sample_rate != 48000) {
+        if (!(s->warned & 2)) {             // report once (warned bit 1)
+            avpriv_report_missing_feature(s->avctx, "%d Hz LFE interpolation", s->sample_rate);
+            s->warned |= 2;
+        }
+        s->flags &= ~LBR_FLAG_LFE_PRESENT;  // continue decoding without the LFE channel
+    }
+
+    // Most significant bit rate nibbles (low nibble: original bit rate, high nibble: scaled bit rate)
+    bit_rate_hi = bytestream2_get_byte(gb);
+
+    // Least significant original bit rate word (low nibble of bit_rate_hi becomes bits 16-19)
+    s->bit_rate_orig = bytestream2_get_le16(gb) | ((bit_rate_hi & 0x0F) << 16);
+
+    // Least significant scaled bit rate word (high nibble of bit_rate_hi becomes bits 16-19)
+    s->bit_rate_scaled = bytestream2_get_le16(gb) | ((bit_rate_hi & 0xF0) << 12);
+
+    // Setup number of fullband channels (DCA_SPEAKER_PAIR_LFE1 masked out; decoded count capped at DCA_LBR_CHANNELS)
+    s->nchannels_total = ff_dca_count_chs_for_mask(s->ch_mask & ~DCA_SPEAKER_PAIR_LFE1);
+    s->nchannels = FFMIN(s->nchannels_total, DCA_LBR_CHANNELS);
+
+    // Setup band limit (stored as the shift count applied to the sample rate below)
+    switch (s->flags & LBR_FLAG_BAND_LIMIT_MASK) {
+    case LBR_FLAG_BAND_LIMIT_NONE:
+        s->band_limit = 0;
+        break;
+    case LBR_FLAG_BAND_LIMIT_1_2:
+        s->band_limit = 1;
+        break;
+    case LBR_FLAG_BAND_LIMIT_1_4:
+        s->band_limit = 2;
+        break;
+    default:
+        avpriv_report_missing_feature(s->avctx, "LBR band limit %#x", s->flags & LBR_FLAG_BAND_LIMIT_MASK);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    // Setup frequency range
+    s->freq_range = ff_dca_freq_ranges[sr_code];
+
+    // Setup resolution profile (0..2) from the original bit rate relative to (nchannels_total + 2)
+    if (s->bit_rate_orig >= 44000 * (s->nchannels_total + 2))
+        s->res_profile = 2;
+    else if (s->bit_rate_orig >= 25000 * (s->nchannels_total + 2))
+        s->res_profile = 1;
+    else
+        s->res_profile = 0;
+
+    // Setup limited sample rate, number of subbands, etc
+    s->limited_rate = s->sample_rate >> s->band_limit;
+    s->limited_range = s->freq_range - s->band_limit;
+    if (s->limited_range < 0) {             // would make the shift in nsubbands below invalid
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid LBR band limit for frequency range\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->nsubbands = 8 << s->limited_range;
+    // The three subband limits below are scaled by nsubbands / (limited_rate / 2) and clamped to nsubbands
+    s->g3_avg_only_start_sb = s->nsubbands * ff_dca_avg_g3_freqs[s->res_profile] / (s->limited_rate / 2);
+    if (s->g3_avg_only_start_sb > s->nsubbands)
+        s->g3_avg_only_start_sb = s->nsubbands;
+
+    s->min_mono_subband = s->nsubbands *  2000 / (s->limited_rate / 2);
+    if (s->min_mono_subband > s->nsubbands)
+        s->min_mono_subband = s->nsubbands;
+
+    s->max_mono_subband = s->nsubbands * 14000 / (s->limited_rate / 2);
+    if (s->max_mono_subband > s->nsubbands)
+        s->max_mono_subband = s->nsubbands;
+
+    // Handle change of sample rate (any init_sample_rate() failure is reported as ENOMEM)
+    if ((old_rate != s->sample_rate || old_band_limit != s->band_limit) && init_sample_rate(s) < 0)
+        return AVERROR(ENOMEM);
+
+    // Setup stereo downmix
+    if (s->flags & LBR_FLAG_DMIX_STEREO) {
+        DCAContext *dca = s->avctx->priv_data;
+        // At least 3 source channels, and room for the extra pair added to nchannels_total below
+        if (s->nchannels_total < 3 || s->nchannels_total > DCA_LBR_CHANNELS_TOTAL - 2) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid number of channels for LBR stereo downmix\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        // This decoder doesn't support ECS chunk (report once via warned bit 2 unless stereo was requested)
+        if (dca->request_channel_layout != DCA_SPEAKER_LAYOUT_STEREO && !(s->warned & 4)) {
+            avpriv_report_missing_feature(s->avctx, "Embedded LBR stereo downmix");
+            s->warned |= 4;
+        }
+
+        // Account for extra downmixed channel pair; only that pair is decoded, without LFE
+        s->nchannels_total += 2;
+        s->nchannels = 2;
+        s->ch_mask = DCA_SPEAKER_PAIR_LR;
+        s->flags &= ~LBR_FLAG_LFE_PRESENT;
+    }
+
+    // Handle change of sample rate or number of channels
+    if (old_rate != s->sample_rate
+        || old_band_limit != s->band_limit
+        || old_nchannels != s->nchannels) {
+        if (alloc_sample_buffer(s) < 0)
+            return AVERROR(ENOMEM);
+        ff_dca_lbr_flush(s);                // reset decoder history for the new configuration
+    }
+
+    return 0;
+}
+
+int ff_dca_lbr_parse(DCALbrDecoder *s, uint8_t *data, DCAExssAsset *asset)
+{
+    struct {                                // position of each chunk type in this frame (len stays 0 if absent)
+        LBRChunk    lfe;
+        LBRChunk    tonal;
+        LBRChunk    tonal_grp[5];
+        LBRChunk    grid1[DCA_LBR_CHANNELS / 2];
+        LBRChunk    hr_grid[DCA_LBR_CHANNELS / 2];
+        LBRChunk    ts1[DCA_LBR_CHANNELS / 2];
+        LBRChunk    ts2[DCA_LBR_CHANNELS / 2];
+    } chunk = { {0} };
+
+    GetByteContext gb;
+
+    int i, ch, sb, sf, ret, group, chunk_id, chunk_len;
+    // Parse the LBR frame at data + asset->lbr_offset (asset->lbr_size bytes); returns 0 or a negative AVERROR code
+    bytestream2_init(&gb, data + asset->lbr_offset, asset->lbr_size);
+
+    // LBR sync word
+    if (bytestream2_get_be32(&gb) != DCA_SYNCWORD_LBR) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid LBR sync word\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // LBR header type
+    switch (bytestream2_get_byte(&gb)) {
+    case LBR_HEADER_SYNC_ONLY:
+        if (!s->sample_rate) {              // a zero sample rate marks a decoder with no valid configuration
+            av_log(s->avctx, AV_LOG_ERROR, "LBR decoder not initialized\n");
+            return AVERROR_INVALIDDATA;
+        }
+        break;
+    case LBR_HEADER_DECODER_INIT:
+        if ((ret = parse_decoder_init(s, &gb)) < 0) {
+            s->sample_rate = 0;             // mark the decoder as uninitialized after a failed init
+            return ret;
+        }
+        break;
+    default:
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid LBR header type\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // LBR frame chunk header (bit 7 of the ID selects a 16-bit big endian length instead of an 8-bit one)
+    chunk_id = bytestream2_get_byte(&gb);
+    chunk_len = (chunk_id & 0x80) ? bytestream2_get_be16(&gb) : bytestream2_get_byte(&gb);
+
+    if (chunk_len > bytestream2_get_bytes_left(&gb)) {
+        chunk_len = bytestream2_get_bytes_left(&gb);    // clamp to the available data
+        av_log(s->avctx, AV_LOG_WARNING, "LBR frame chunk was truncated\n");
+        if (s->avctx->err_recognition & AV_EF_EXPLODE)
+            return AVERROR_INVALIDDATA;
+    }
+
+    bytestream2_init(&gb, gb.buffer, chunk_len);        // restrict the reader to the frame chunk payload
+
+    switch (chunk_id & 0x7f) {
+    case LBR_CHUNK_FRAME:
+        if (s->avctx->err_recognition & (AV_EF_CRCCHECK | AV_EF_CAREFUL)) {
+            int checksum = bytestream2_get_be16(&gb);
+            uint16_t res = chunk_id;        // 16-bit sum of the ID, both length bytes and the bytes after the checksum
+            res += (chunk_len >> 8) & 0xff;
+            res += chunk_len & 0xff;
+            for (i = 0; i < chunk_len - 2; i++)
+                res += gb.buffer[i];
+            if (checksum != res) {
+                av_log(s->avctx, AV_LOG_WARNING, "Invalid LBR checksum\n");
+                if (s->avctx->err_recognition & AV_EF_EXPLODE)
+                    return AVERROR_INVALIDDATA;
+            }
+        } else {
+            bytestream2_skip(&gb, 2);       // checksum present but not verified
+        }
+        break;
+    case LBR_CHUNK_FRAME_NO_CSUM:
+        break;
+    default:
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid LBR frame chunk ID\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Clear current frame (sb_indices is filled with 0xff bytes, everything else with zeros)
+    memset(s->quant_levels, 0, sizeof(s->quant_levels));
+    memset(s->sb_indices, 0xff, sizeof(s->sb_indices));
+    memset(s->sec_ch_sbms, 0, sizeof(s->sec_ch_sbms));
+    memset(s->sec_ch_lrms, 0, sizeof(s->sec_ch_lrms));
+    memset(s->ch_pres, 0, sizeof(s->ch_pres));
+    memset(s->grid_1_scf, 0, sizeof(s->grid_1_scf));
+    memset(s->grid_2_scf, 0, sizeof(s->grid_2_scf));
+    memset(s->grid_3_avg, 0, sizeof(s->grid_3_avg));
+    memset(s->grid_3_scf, 0, sizeof(s->grid_3_scf));
+    memset(s->grid_3_pres, 0, sizeof(s->grid_3_pres));
+    memset(s->tonal_scf, 0, sizeof(s->tonal_scf));
+    memset(s->lfe_data, 0, sizeof(s->lfe_data));
+    s->part_stereo_pres = 0;
+    s->framenum = (s->framenum + 1) & 31;   // frame counter wraps modulo 32
+    // Carry the previous frame's last partial stereo coefficient over; 16 is also the value set by flush
+    for (ch = 0; ch < s->nchannels; ch++) {
+        for (sb = 0; sb < s->nsubbands / 4; sb++) {
+            s->part_stereo[ch][sb][0] = s->part_stereo[ch][sb][4];
+            s->part_stereo[ch][sb][4] = 16;
+        }
+    }
+    // Only the coefficient set of the current frame parity is cleared; the other parity keeps the previous frame's
+    memset(s->lpc_coeff[s->framenum & 1], 0, sizeof(s->lpc_coeff[0]));
+    // Start every tonal subframe of this frame with an empty tone range [ntones, ntones)
+    for (group = 0; group < 5; group++) {
+        for (sf = 0; sf < 1 << group; sf++) {
+            int sf_idx = ((s->framenum << group) + sf) & 31;
+            s->tonal_bounds[group][sf_idx][0] =
+            s->tonal_bounds[group][sf_idx][1] = s->ntones;
+        }
+    }
+
+    // Parse chunk headers (only data pointer and length are recorded here; a repeated ID overwrites the earlier entry)
+    while (bytestream2_get_bytes_left(&gb) > 0) {
+        chunk_id = bytestream2_get_byte(&gb);
+        chunk_len = (chunk_id & 0x80) ? bytestream2_get_be16(&gb) : bytestream2_get_byte(&gb);
+        chunk_id &= 0x7f;
+
+        if (chunk_len > bytestream2_get_bytes_left(&gb)) {
+            chunk_len = bytestream2_get_bytes_left(&gb);
+            av_log(s->avctx, AV_LOG_WARNING, "LBR chunk %#x was truncated\n", chunk_id);
+            if (s->avctx->err_recognition & AV_EF_EXPLODE)
+                return AVERROR_INVALIDDATA;
+        }
+
+        switch (chunk_id) {                 // IDs without a case below are skipped silently
+        case LBR_CHUNK_LFE:
+            chunk.lfe.len  = chunk_len;
+            chunk.lfe.data = gb.buffer;
+            break;
+
+        case LBR_CHUNK_SCF:
+        case LBR_CHUNK_TONAL:
+        case LBR_CHUNK_TONAL_SCF:
+            chunk.tonal.id   = chunk_id;    // the three variants share one slot, told apart by the stored ID
+            chunk.tonal.len  = chunk_len;
+            chunk.tonal.data = gb.buffer;
+            break;
+
+        case LBR_CHUNK_TONAL_GRP_1:
+        case LBR_CHUNK_TONAL_GRP_2:
+        case LBR_CHUNK_TONAL_GRP_3:
+        case LBR_CHUNK_TONAL_GRP_4:
+        case LBR_CHUNK_TONAL_GRP_5:
+            i = LBR_CHUNK_TONAL_GRP_5 - chunk_id;       // slot index is the distance from the GRP_5 ID
+            chunk.tonal_grp[i].id   = i;
+            chunk.tonal_grp[i].len  = chunk_len;
+            chunk.tonal_grp[i].data = gb.buffer;
+            break;
+
+        case LBR_CHUNK_TONAL_SCF_GRP_1:
+        case LBR_CHUNK_TONAL_SCF_GRP_2:
+        case LBR_CHUNK_TONAL_SCF_GRP_3:
+        case LBR_CHUNK_TONAL_SCF_GRP_4:
+        case LBR_CHUNK_TONAL_SCF_GRP_5:
+            i = LBR_CHUNK_TONAL_SCF_GRP_5 - chunk_id;   // same slots as the TONAL_GRP chunks above
+            chunk.tonal_grp[i].id   = i;
+            chunk.tonal_grp[i].len  = chunk_len;
+            chunk.tonal_grp[i].data = gb.buffer;
+            break;
+
+        case LBR_CHUNK_RES_GRID_LR:         // the +1 / +2 IDs select the second and third channel pair
+        case LBR_CHUNK_RES_GRID_LR + 1:
+        case LBR_CHUNK_RES_GRID_LR + 2:
+            i = chunk_id - LBR_CHUNK_RES_GRID_LR;
+            chunk.grid1[i].len  = chunk_len;
+            chunk.grid1[i].data = gb.buffer;
+            break;
+
+        case LBR_CHUNK_RES_GRID_HR:
+        case LBR_CHUNK_RES_GRID_HR + 1:
+        case LBR_CHUNK_RES_GRID_HR + 2:
+            i = chunk_id - LBR_CHUNK_RES_GRID_HR;
+            chunk.hr_grid[i].len  = chunk_len;
+            chunk.hr_grid[i].data = gb.buffer;
+            break;
+
+        case LBR_CHUNK_RES_TS_1:
+        case LBR_CHUNK_RES_TS_1 + 1:
+        case LBR_CHUNK_RES_TS_1 + 2:
+            i = chunk_id - LBR_CHUNK_RES_TS_1;
+            chunk.ts1[i].len  = chunk_len;
+            chunk.ts1[i].data = gb.buffer;
+            break;
+
+        case LBR_CHUNK_RES_TS_2:
+        case LBR_CHUNK_RES_TS_2 + 1:
+        case LBR_CHUNK_RES_TS_2 + 2:
+            i = chunk_id - LBR_CHUNK_RES_TS_2;
+            chunk.ts2[i].len  = chunk_len;
+            chunk.ts2[i].data = gb.buffer;
+            break;
+        }
+
+        bytestream2_skip(&gb, chunk_len);   // advance to the next chunk header
+    }
+
+    // Parse the chunks (results are OR-ed so ret ends up negative if any parser returned a negative value)
+    ret = parse_lfe_chunk(s, &chunk.lfe);
+
+    ret |= parse_tonal_chunk(s, &chunk.tonal);
+
+    for (i = 0; i < 5; i++)
+        ret |= parse_tonal_group(s, &chunk.tonal_grp[i]);
+
+    for (i = 0; i < (s->nchannels + 1) / 2; i++) {
+        int ch1 = i * 2;
+        int ch2 = FFMIN(ch1 + 1, s->nchannels - 1);     // equals ch1 for the last pair of an odd channel count
+
+        if (parse_grid_1_chunk (s, &chunk.grid1  [i], ch1, ch2) < 0 ||
+            parse_high_res_grid(s, &chunk.hr_grid[i], ch1, ch2) < 0) {
+            ret = -1;                       // remember the failure and go on with the next channel pair
+            continue;
+        }
+
+        // TS chunks depend on both grids. TS_2 depends on TS_1.
+        if (!chunk.grid1[i].len || !chunk.hr_grid[i].len || !chunk.ts1[i].len)
+            continue;
+
+        if (parse_ts1_chunk(s, &chunk.ts1[i], ch1, ch2) < 0 ||
+            parse_ts2_chunk(s, &chunk.ts2[i], ch1, ch2) < 0) {
+            ret = -1;
+            continue;
+        }
+    }
+    // Chunk parsing errors are fatal only with AV_EF_EXPLODE; otherwise the frame is accepted as parsed so far
+    if (ret < 0 && (s->avctx->err_recognition & AV_EF_EXPLODE))
+        return AVERROR_INVALIDDATA;
+
+    return 0;
+}
+
+/**
+ * Reconstruct high-frequency resolution grid (high_res_scf) from first and third grids for channels ch1..ch2 inclusive
+ */
+static void decode_grid(DCALbrDecoder *s, int ch1, int ch2)
+{
+    int i, ch, sb;
+
+    for (ch = ch1; ch <= ch2; ch++) {
+        for (sb = 0; sb < s->nsubbands; sb++) {
+            int g1_sb = ff_dca_scf_to_grid_1[sb];   // index of the first of two adjacent grid 1 rows used for sb
+
+            uint8_t *g1_scf_a = s->grid_1_scf[ch][g1_sb    ];
+            uint8_t *g1_scf_b = s->grid_1_scf[ch][g1_sb + 1];
+
+            int w1 = ff_dca_grid_1_weights[g1_sb    ][sb];
+            int w2 = ff_dca_grid_1_weights[g1_sb + 1][sb];
+
+            uint8_t *hr_scf = s->high_res_scf[ch][sb];
+
+            if (sb < 4) {                   // first four subbands: weighted grid 1 values only
+                for (i = 0; i < 8; i++) {
+                    int scf = w1 * g1_scf_a[i] + w2 * g1_scf_b[i];
+                    hr_scf[i] = scf >> 7;
+                }
+            } else {                        // remaining subbands: additionally subtract grid 3 average and scale factor
+                int8_t *g3_scf = s->grid_3_scf[ch][sb - 4];
+                int g3_avg = s->grid_3_avg[ch][sb - 4];
+
+                for (i = 0; i < 8; i++) {
+                    int scf = w1 * g1_scf_a[i] + w2 * g1_scf_b[i];
+                    hr_scf[i] = (scf >> 7) - g3_avg - g3_scf[i];    // stored as uint8_t, so the result is truncated to 8 bits
+                }
+            }
+        }
+    }
+}
+
+/**
+ * Fill unallocated subbands (bit not set in ch_pres) of channels ch1..ch2 with randomness
+ */
+static void random_ts(DCALbrDecoder *s, int ch1, int ch2)
+{
+    int i, j, k, ch, sb;
+
+    for (ch = ch1; ch <= ch2; ch++) {
+        for (sb = 0; sb < s->nsubbands; sb++) {
+            float *samples = s->time_samples[ch][sb];
+
+            if (s->ch_pres[ch] & (1U << sb))
+                continue;   // Skip allocated subband
+
+            if (sb < 2) {
+                // The first two subbands are always zero
+                memset(samples, 0, DCA_LBR_TIME_SAMPLES * sizeof(float));
+            } else if (sb < 10) {           // subbands 2-9: unmodulated output of lbr_rand()
+                for (i = 0; i < DCA_LBR_TIME_SAMPLES; i++)
+                    samples[i] = lbr_rand(s, sb);
+            } else {
+                for (i = 0; i < DCA_LBR_TIME_SAMPLES / 8; i++, samples += 8) {
+                    float accum[8] = { 0 };
+
+                    // Modulate by subbands 2-5 in blocks of 8 (accum is the sum of their absolute sample values)
+                    for (k = 2; k < 6; k++) {
+                        float *other = &s->time_samples[ch][k][i * 8];
+                        for (j = 0; j < 8; j++)
+                            accum[j] += fabs(other[j]);
+                    }
+                    // Envelope is the mean of the four magnitudes (sum * 0.25) plus a 0.5 offset
+                    for (j = 0; j < 8; j++)
+                        samples[j] = (accum[j] * 0.25f + 0.5f) * lbr_rand(s, sb);
+                }
+            }
+        }
+    }
+}
+
+static void predict(float *samples, const float *coeff, int nsamples)
+{
+    int i, j;
+    // In place: subtract an 8-tap prediction from each sample; reads samples[-8..-1], so 8 values of history must precede it
+    for (i = 0; i < nsamples; i++) {
+        float res = 0;
+        for (j = 0; j < 8; j++)
+            res += coeff[j] * samples[i - j - 1];   // uses already updated samples once i > j
+        samples[i] -= res;
+    }
+}
+
+static void synth_lpc(DCALbrDecoder *s, int ch1, int ch2, int sb)
+{
+    int f = s->framenum & 1;                // parity of the current frame selects its lpc_coeff set; f^1 is the other set
+    int ch;
+    // Apply predict() to subband sb of channels ch1..ch2; subbands not flagged in ch_pres are left untouched
+    for (ch = ch1; ch <= ch2; ch++) {
+        float *samples = s->time_samples[ch][sb];
+
+        if (!(s->ch_pres[ch] & (1U << sb)))
+            continue;
+
+        if (sb < 2) {                       // two coefficient sets per frame: 16 + 64 + 48 = 128 samples
+            predict(samples,      s->lpc_coeff[f^1][ch][sb][1],  16);
+            predict(samples + 16, s->lpc_coeff[f  ][ch][sb][0],  64);
+            predict(samples + 80, s->lpc_coeff[f  ][ch][sb][1],  48);
+        } else {                            // one coefficient set per frame: 16 + 112 = 128 samples
+            predict(samples,      s->lpc_coeff[f^1][ch][sb][0],  16);
+            predict(samples + 16, s->lpc_coeff[f  ][ch][sb][0], 112);
+        }
+    }
+}
+
+static void filter_ts(DCALbrDecoder *s, int ch1, int ch2)
+{
+    int i, j, sb, ch;
+    // Per subband of the pair ch1..ch2: apply scale factors, undo stereo coding, then LPC for sb < 3
+    for (sb = 0; sb < s->nsubbands; sb++) {
+        // Scale factors
+        for (ch = ch1; ch <= ch2; ch++) {
+            float *samples = s->time_samples[ch][sb];
+            uint8_t *hr_scf = s->high_res_scf[ch][sb];
+            if (sb < 4) {                   // one scale factor per block of 16 samples
+                for (i = 0; i < DCA_LBR_TIME_SAMPLES / 16; i++, samples += 16) {
+                    unsigned int scf = hr_scf[i];
+                    if (scf > AMP_MAX)
+                        scf = AMP_MAX;
+                    for (j = 0; j < 16; j++)
+                        samples[j] *= ff_dca_quant_amp[scf];
+                }
+            } else {                        // one scale factor per pair of samples, refined by grid 2
+                uint8_t *g2_scf = s->grid_2_scf[ch][ff_dca_scf_to_grid_2[sb]];
+                for (i = 0; i < DCA_LBR_TIME_SAMPLES / 2; i++, samples += 2) {
+                    unsigned int scf = hr_scf[i / 8] - g2_scf[i];   // NOTE(review): a negative difference wraps and is clamped to AMP_MAX below
+                    if (scf > AMP_MAX)
+                        scf = AMP_MAX;
+                    samples[0] *= ff_dca_quant_amp[scf];
+                    samples[1] *= ff_dca_quant_amp[scf];
+                }
+            }
+        }
+
+        // Mid-side stereo (flags are one bit per block of 16 samples)
+        if (ch1 != ch2) {
+            float *samples_l = s->time_samples[ch1][sb];
+            float *samples_r = s->time_samples[ch2][sb];
+            int ch2_pres = s->ch_pres[ch2] & (1U << sb);
+
+            for (i = 0; i < DCA_LBR_TIME_SAMPLES / 16; i++) {
+                int sbms = (s->sec_ch_sbms[ch1 / 2][sb] >> i) & 1;
+                int lrms = (s->sec_ch_lrms[ch1 / 2][sb] >> i) & 1;
+
+                if (sb >= s->min_mono_subband) {
+                    if (lrms && ch2_pres) {         // both channels coded: swap them, negating the new right one if sbms
+                        if (sbms) {
+                            for (j = 0; j < 16; j++) {
+                                float tmp = samples_l[j];
+                                samples_l[j] =  samples_r[j];
+                                samples_r[j] = -tmp;
+                            }
+                        } else {
+                            for (j = 0; j < 16; j++) {
+                                float tmp = samples_l[j];
+                                samples_l[j] =  samples_r[j];
+                                samples_r[j] =  tmp;
+                            }
+                        }
+                    } else if (!ch2_pres) {         // second channel not coded: copy the first, possibly inverted
+                        if (sbms && (s->part_stereo_pres & (1 << ch1))) {
+                            for (j = 0; j < 16; j++)
+                                samples_r[j] = -samples_l[j];
+                        } else {
+                            for (j = 0; j < 16; j++)
+                                samples_r[j] =  samples_l[j];
+                        }
+                    }
+                } else if (sbms && ch2_pres) {      // below min_mono_subband: half sum and half difference of the pair
+                    for (j = 0; j < 16; j++) {
+                        float tmp = samples_l[j];
+                        samples_l[j] = (tmp + samples_r[j]) * 0.5f;
+                        samples_r[j] = (tmp - samples_r[j]) * 0.5f;
+                    }
+                }
+
+                samples_l += 16;
+                samples_r += 16;
+            }
+        }
+
+        // Inverse prediction (first three subbands only)
+        if (sb < 3)
+            synth_lpc(s, ch1, ch2, sb);
+    }
+}
+
+/**
+ * Modulate by interpolated partial stereo coefficients (channels ch1..ch2, subbands from min_mono_subband upwards)
+ */
+static void decode_part_stereo(DCALbrDecoder *s, int ch1, int ch2)
+{
+    int i, ch, sb, sf;
+
+    for (ch = ch1; ch <= ch2; ch++) {
+        for (sb = s->min_mono_subband; sb < s->nsubbands; sb++) {
+            uint8_t *pt_st = s->part_stereo[ch][(sb - s->min_mono_subband) / 4];    // one coefficient row per 4 subbands
+            float *samples = s->time_samples[ch][sb];
+
+            if (s->ch_pres[ch2] & (1U << sb))   // subband is coded in ch2, so it is not modulated
+                continue;
+            // Four segments of 32 samples; each ramps linearly from coefficient sf - 1 to coefficient sf
+            for (sf = 1; sf <= 4; sf++, samples += 32) {
+                float prev = ff_dca_st_coeff[pt_st[sf - 1]];
+                float next = ff_dca_st_coeff[pt_st[sf    ]];
+
+                for (i = 0; i < 32; i++)
+                    samples[i] *= (32 - i) * prev + i * next;   // weights sum to 32; presumably ff_dca_st_coeff accounts for that - TODO confirm
+            }
+        }
+    }
+}
+
+/**
+ * Synthesise tones in the given group for the given tonal subframe (added into values; no-op when synth_idx < 0)
+ */
+static void synth_tones(DCALbrDecoder *s, int ch, float *values,
+                        int group, int group_sf, int synth_idx)
+{
+    int i, start, count;
+
+    if (synth_idx < 0)
+        return;
+    // Tones of this subframe occupy [start, start + count) in the circular tones buffer
+    start =  s->tonal_bounds[group][group_sf][0];
+    count = (s->tonal_bounds[group][group_sf][1] - start) & (DCA_LBR_TONES - 1);
+
+    for (i = 0; i < count; i++) {
+        DCALbrTone *t = &s->tones[(start + i) & (DCA_LBR_TONES - 1)];
+
+        if (t->amp[ch]) {                   // zero amplitude: nothing to add for this channel
+            float amp = ff_dca_synth_env[synth_idx] * ff_dca_quant_amp[t->amp[ch]];
+            float c = amp * cos_tab[(t->phs[ch]     ) & 255];
+            float s = amp * cos_tab[(t->phs[ch] + 64) & 255];   // note: this local shadows the decoder pointer s in this scope
+            const float *cf = ff_dca_corr_cf[t->f_delt];
+            int x_freq = t->x_freq;
+            // For x_freq < 5 the taps that would land below values[0] are added at reflected indices instead
+            switch (x_freq) {
+            case 0:
+                goto p0;
+            case 1:
+                values[3] += cf[0] * -s;
+                values[2] += cf[1] *  c;
+                values[1] += cf[2] *  s;
+                values[0] += cf[3] * -c;
+                goto p1;
+            case 2:
+                values[2] += cf[0] * -s;
+                values[1] += cf[1] *  c;
+                values[0] += cf[2] *  s;
+                goto p2;
+            case 3:
+                values[1] += cf[0] * -s;
+                values[0] += cf[1] *  c;
+                goto p3;
+            case 4:
+                values[0] += cf[0] * -s;
+                goto p4;
+            }
+            // General case: 11 taps centred on x_freq; the labels are entry points for the cases above
+            values[x_freq - 5] += cf[ 0] * -s;
+        p4: values[x_freq - 4] += cf[ 1] *  c;
+        p3: values[x_freq - 3] += cf[ 2] *  s;
+        p2: values[x_freq - 2] += cf[ 3] * -c;
+        p1: values[x_freq - 1] += cf[ 4] * -s;
+        p0: values[x_freq    ] += cf[ 5] *  c;
+            values[x_freq + 1] += cf[ 6] *  s;
+            values[x_freq + 2] += cf[ 7] * -c;
+            values[x_freq + 3] += cf[ 8] * -s;
+            values[x_freq + 4] += cf[ 9] *  c;
+            values[x_freq + 5] += cf[10] *  s;
+        }
+        // Phase advances for every tone, including those with zero amplitude in this channel
+        t->phs[ch] += t->ph_rot;
+    }
+}
+
+/**
+ * Synthesise all tones in all groups for the given residual subframe sf of channel ch, accumulating into values
+ */
+static void base_func_synth(DCALbrDecoder *s, int ch, float *values, int sf)
+{
+    int group;
+
+    // Tonal vs residual shift is 22 subframes
+    for (group = 0; group < 5; group++) {
+        int group_sf = (s->framenum << group) + ((sf - 22) >> (5 - group));     // sf - 22 may be negative here
+        int synth_idx = ((((sf - 22) & 31) << group) & 31) + (1 << group) - 1;
+        // The previous tonal subframe uses index 30 - synth_idx (skipped by synth_tones() when negative), the current one synth_idx
+        synth_tones(s, ch, values, group, (group_sf - 1) & 31, 30 - synth_idx);
+        synth_tones(s, ch, values, group, (group_sf    ) & 31,      synth_idx);
+    }
+}
+
+static void transform_channel(DCALbrDecoder *s, int ch, float *output)
+{
+    LOCAL_ALIGNED_32(float, values, [DCA_LBR_SUBBANDS    ], [4]);
+    LOCAL_ALIGNED_32(float, result, [DCA_LBR_SUBBANDS * 2], [4]);
+    int sf, sb, nsubbands = s->nsubbands, noutsubbands = 8 << s->freq_range;
+    // Convert the time samples of channel ch into output samples and update per-channel history
+    // Clear inactive subbands (rows nsubbands..noutsubbands-1 exist only when a band limit is in effect)
+    if (nsubbands < noutsubbands)
+        memset(values[nsubbands], 0, (noutsubbands - nsubbands) * sizeof(values[0]));
+
+    for (sf = 0; sf < DCA_LBR_TIME_SAMPLES / 4; sf++) {
+        // Hybrid filterbank (consumes 4 time samples per subband starting at offset sf * 4)
+        s->dcadsp->lbr_bank(values, s->time_samples[ch],
+                            ff_dca_bank_coeff, sf * 4, nsubbands);
+        // Add the tonal components for this subframe on top of the filterbank output
+        base_func_synth(s, ch, values[0], sf);
+
+        s->imdct.imdct_calc(&s->imdct, result[0], values[0]);
+
+        // Long window and overlap-add (first half plus history goes to output, second half becomes new history)
+        s->fdsp->vector_fmul_add(output, result[0], s->window,
+                                 s->history[ch], noutsubbands * 4);
+        s->fdsp->vector_fmul_reverse(s->history[ch], result[noutsubbands],
+                                     s->window, noutsubbands * 4);
+        output += noutsubbands * 4;
+    }
+
+    // Update history for LPC and forward MDCT (last DCA_LBR_TIME_HISTORY samples move in front of the buffer)
+    for (sb = 0; sb < nsubbands; sb++) {
+        float *samples = s->time_samples[ch][sb] - DCA_LBR_TIME_HISTORY;
+        memcpy(samples, samples + DCA_LBR_TIME_SAMPLES, DCA_LBR_TIME_HISTORY * sizeof(float));
+    }
+}
+
+int ff_dca_lbr_filter_frame(DCALbrDecoder *s, AVFrame *frame)
+{
+    AVCodecContext *avctx = s->avctx;
+    int i, ret, nchannels, ch_conf = (s->ch_mask & 0x7) - 1;   // index into the channel layout/count/reorder tables
+    const int8_t *reorder;
+    // Export stream parameters to avctx, allocate frame and synthesise all channels; returns 0 or a negative error code
+    avctx->channel_layout = channel_layouts[ch_conf];
+    avctx->channels = nchannels = channel_counts[ch_conf];
+    avctx->sample_rate = s->sample_rate;
+    avctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
+    avctx->bits_per_raw_sample = 0;
+    avctx->profile = FF_PROFILE_DTS_EXPRESS;
+    avctx->bit_rate = s->bit_rate_scaled;
+
+    if (s->flags & LBR_FLAG_LFE_PRESENT) {  // LFE adds one output channel and selects a different reorder table
+        avctx->channel_layout |= AV_CH_LOW_FREQUENCY;
+        avctx->channels++;
+        reorder = channel_reorder_lfe[ch_conf];
+    } else {
+        reorder = channel_reorder_nolfe[ch_conf];
+    }
+
+    frame->nb_samples = 1024 << s->freq_range;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
+    // Filter fullband channels (processed in pairs; ch2 equals ch1 for the last channel of an odd count)
+    for (i = 0; i < (s->nchannels + 1) / 2; i++) {
+        int ch1 = i * 2;
+        int ch2 = FFMIN(ch1 + 1, s->nchannels - 1);
+
+        decode_grid(s, ch1, ch2);
+
+        random_ts(s, ch1, ch2);
+
+        filter_ts(s, ch1, ch2);
+
+        if (ch1 != ch2 && (s->part_stereo_pres & (1 << ch1)))
+            decode_part_stereo(s, ch1, ch2);
+        // Only channels that exist in the output layout (nchannels) are transformed and written
+        if (ch1 < nchannels)
+            transform_channel(s, ch1, (float *)frame->extended_data[reorder[ch1]]);
+
+        if (ch1 != ch2 && ch2 < nchannels)
+            transform_channel(s, ch2, (float *)frame->extended_data[reorder[ch2]]);
+    }
+
+    // Interpolate LFE channel (from the decimated samples in s->lfe_data into its output plane)
+    if (s->flags & LBR_FLAG_LFE_PRESENT) {
+        s->dcadsp->lfe_iir((float *)frame->extended_data[lfe_index[ch_conf]],
+                           s->lfe_data, ff_dca_lfe_iir,
+                           s->lfe_history, 16 << s->freq_range);
+    }
+
+    if ((ret = ff_side_data_update_matrix_encoding(frame, AV_MATRIX_ENCODING_NONE)) < 0)
+        return ret;
+
+    return 0;
+}
+
+av_cold void ff_dca_lbr_flush(DCALbrDecoder *s)
+{
+    int ch, sb;
+    // Reset all inter-frame decoder state; does nothing while the decoder is unconfigured (sample_rate == 0)
+    if (!s->sample_rate)
+        return;
+
+    // Clear history (part_stereo is filled with the value 16, everything else with zeros)
+    memset(s->part_stereo, 16, sizeof(s->part_stereo));
+    memset(s->lpc_coeff, 0, sizeof(s->lpc_coeff));
+    memset(s->history, 0, sizeof(s->history));
+    memset(s->tonal_bounds, 0, sizeof(s->tonal_bounds));
+    memset(s->lfe_history, 0, sizeof(s->lfe_history));
+    s->framenum = 0;
+    s->ntones = 0;
+    // Zero the DCA_LBR_TIME_HISTORY samples stored in front of each subband's time sample buffer
+    for (ch = 0; ch < s->nchannels; ch++) {
+        for (sb = 0; sb < s->nsubbands; sb++) {
+            float *samples = s->time_samples[ch][sb] - DCA_LBR_TIME_HISTORY;
+            memset(samples, 0, DCA_LBR_TIME_HISTORY * sizeof(float));
+        }
+    }
+}
+
+av_cold int ff_dca_lbr_init(DCALbrDecoder *s)
+{
+    init_tables();
+    // One-time setup of the LBR decoder: returns 0 on success or AVERROR(ENOMEM) if the DSP context cannot be allocated
+    if (!(s->fdsp = avpriv_float_dsp_alloc(0)))
+        return AVERROR(ENOMEM);     // proper error code instead of a bare -1, as for other allocation failures in this file
+
+    s->lbr_rand = 1;                // initial seed for subband randomization
+    return 0;
+}
+
+av_cold void ff_dca_lbr_close(DCALbrDecoder *s)
+{
+    s->sample_rate = 0;             // marks the decoder as unconfigured (checked by parse and flush)
+    // Release everything owned by the LBR decoder: time sample buffer, float DSP context and IMDCT
+    av_freep(&s->ts_buffer);
+    s->ts_size = 0;
+
+    av_freep(&s->fdsp);
+    ff_mdct_end(&s->imdct);
+}
diff --git a/libavcodec/dca_lbr.h b/libavcodec/dca_lbr.h
new file mode 100644
index 0000000..e6ca805
--- /dev/null
+++ b/libavcodec/dca_lbr.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (C) 2016 foo86
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DCA_LBR_H
+#define AVCODEC_DCA_LBR_H
+
+#include "libavutil/common.h"
+#include "libavutil/float_dsp.h"
+#include "libavutil/mem.h"
+
+#include "avcodec.h"
+#include "internal.h"
+#include "get_bits.h"
+#include "dca.h"
+#include "dca_exss.h"
+#include "dcadsp.h"
+#include "fft.h"
+
+#define DCA_LBR_CHANNELS        6       ///< Maximum number of fullband channels decoded
+#define DCA_LBR_CHANNELS_TOTAL  32      ///< Upper bound used when validating the total channel count
+#define DCA_LBR_SUBBANDS        32      ///< Maximum number of subbands (sizes the per-subband arrays)
+#define DCA_LBR_TONES           512     ///< Size of the circular tone buffer; used as a mask (size - 1), so a power of two
+
+#define DCA_LBR_TIME_SAMPLES    128     ///< Time samples per subband per frame
+#define DCA_LBR_TIME_HISTORY    8       ///< Samples of history kept in front of each subband's time samples
+
+typedef struct DCALbrTone {
+    uint8_t     x_freq;     ///< Spectral line offset (index of the centre tap in the synthesis buffer)
+    uint8_t     f_delt;     ///< Difference between original and center frequency (selects a row of ff_dca_corr_cf)
+    uint8_t     ph_rot;     ///< Phase rotation (added to each channel's phase after every synthesis pass)
+    uint8_t     pad;        ///< Padding field
+    uint8_t     amp[DCA_LBR_CHANNELS];  ///< Per-channel amplitude (0 means no tone in that channel)
+    uint8_t     phs[DCA_LBR_CHANNELS];  ///< Per-channel phase (8 bits, wraps around; indexes a 256-entry cosine table)
+} DCALbrTone;
+
+typedef struct DCALbrDecoder {
+    AVCodecContext  *avctx;     ///< Codec context (used for logging, error recognition flags and output parameters)
+    GetBitContext   gb;
+
+    int     sample_rate;        ///< Sample rate of LBR audio (0 while the decoder is not configured)
+    int     ch_mask;            ///< LBR speaker mask
+    int     flags;              ///< Flags for LBR decoder initialization
+    int     bit_rate_orig;      ///< Original bit rate
+    int     bit_rate_scaled;    ///< Scaled bit rate
+
+    int     nchannels;          ///< Number of fullband channels to decode
+    int     nchannels_total;    ///< Total number of fullband channels
+    int     freq_range;         ///< Frequency range of LBR audio
+    int     band_limit;         ///< Band limit factor (right shift applied to the sample rate)
+    int     limited_rate;       ///< Band limited sample rate
+    int     limited_range;      ///< Band limited frequency range
+    int     res_profile;        ///< Resolution profile
+    int     nsubbands;          ///< Number of encoded subbands
+    int     g3_avg_only_start_sb;   ///< Subband index where grid 3 scale factors end
+    int     min_mono_subband;   ///< Subband index where mono encoding starts
+    int     max_mono_subband;   ///< Subband index where mono encoding ends
+
+    int     framenum;   ///< Lower 5 bits of current frame number
+    int     lbr_rand;   ///< Seed for subband randomization
+    int     warned;     ///< Flags for warning suppression (1: channel mask, 2: LFE interpolation, 4: stereo downmix)
+
+    uint8_t     quant_levels[DCA_LBR_CHANNELS / 2][DCA_LBR_SUBBANDS];   ///< Quantization levels
+    uint8_t     sb_indices[DCA_LBR_SUBBANDS];   ///< Subband reordering indices
+
+    uint8_t     sec_ch_sbms[DCA_LBR_CHANNELS / 2][DCA_LBR_SUBBANDS];    ///< Right channel inversion or mid/side decoding flags
+    uint8_t     sec_ch_lrms[DCA_LBR_CHANNELS / 2][DCA_LBR_SUBBANDS];    ///< Flags indicating if left/right channel are swapped
+    uint32_t    ch_pres[DCA_LBR_CHANNELS];  ///< Subband allocation flags (one bit per subband)
+
+    uint8_t     grid_1_scf[DCA_LBR_CHANNELS][12][8];    ///< Grid 1 scale factors
+    uint8_t     grid_2_scf[DCA_LBR_CHANNELS][3][64];    ///< Grid 2 scale factors
+
+    int8_t      grid_3_avg[DCA_LBR_CHANNELS][DCA_LBR_SUBBANDS - 4];     ///< Grid 3 average values
+    int8_t      grid_3_scf[DCA_LBR_CHANNELS][DCA_LBR_SUBBANDS - 4][8];  ///< Grid 3 scale factors
+    uint32_t    grid_3_pres[DCA_LBR_CHANNELS];  ///< Grid 3 scale factors presence flags
+
+    uint8_t     high_res_scf[DCA_LBR_CHANNELS][DCA_LBR_SUBBANDS][8];    ///< High-frequency resolution scale factors
+
+    uint8_t     part_stereo[DCA_LBR_CHANNELS][DCA_LBR_SUBBANDS / 4][5]; ///< Partial stereo coefficients ([0] is the previous frame's [4])
+    uint8_t     part_stereo_pres;   ///< Partial stereo coefficients presence flags
+
+    float       lpc_coeff[2][DCA_LBR_CHANNELS][3][2][8];    ///< Predictor coefficients (first index: frame parity)
+
+    float       sb_scf[DCA_LBR_SUBBANDS];   ///< Subband randomization scale factors
+
+    float       *time_samples[DCA_LBR_CHANNELS][DCA_LBR_SUBBANDS]; ///< Time samples (DCA_LBR_TIME_HISTORY samples of history precede each pointer)
+
+    float           *ts_buffer; ///< Time sample buffer base
+    unsigned int    ts_size;    ///< Time sample buffer size
+
+    DECLARE_ALIGNED(32, float, history)[DCA_LBR_CHANNELS][DCA_LBR_SUBBANDS * 4];    ///< IMDCT history
+    DECLARE_ALIGNED(32, float, window)[DCA_LBR_SUBBANDS * 4];   ///< Long window for IMDCT
+
+    DECLARE_ALIGNED(32, float, lfe_data)[64];       ///< Decimated LFE samples
+    DECLARE_ALIGNED(32, float, lfe_history)[5][2];  ///< LFE IIR filter history
+    float lfe_scale;    ///< Scale factor of LFE samples before IIR filter
+
+    uint8_t     tonal_scf[6];           ///< Tonal scale factors
+    uint16_t    tonal_bounds[5][32][2]; ///< Per-group per-subframe start/end positions of tones
+    DCALbrTone  tones[DCA_LBR_TONES];   ///< Circular buffer of tones
+    int         ntones;                 ///< Circular buffer head position
+
+    FFTContext          imdct;      ///< IMDCT context (released by ff_dca_lbr_close())
+    AVFloatDSPContext   *fdsp;      ///< Float DSP functions (allocated by ff_dca_lbr_init(), freed by ff_dca_lbr_close())
+    DCADSPContext       *dcadsp;    ///< DCA DSP functions (lbr_bank, lfe_iir); not allocated by ff_dca_lbr_init()
+} DCALbrDecoder;
+
+int ff_dca_lbr_parse(DCALbrDecoder *s, uint8_t *data, DCAExssAsset *asset);    ///< Parse one LBR frame; 0 or negative AVERROR
+int ff_dca_lbr_filter_frame(DCALbrDecoder *s, AVFrame *frame);                 ///< Synthesise the parsed frame into an AVFrame
+av_cold void ff_dca_lbr_flush(DCALbrDecoder *s);                               ///< Reset inter-frame decoder history
+av_cold int ff_dca_lbr_init(DCALbrDecoder *s);                                 ///< One-time setup; negative on failure
+av_cold void ff_dca_lbr_close(DCALbrDecoder *s);                               ///< Free resources owned by the LBR decoder
+
+#endif
diff --git a/libavcodec/dca_parser.c b/libavcodec/dca_parser.c
index c33cc9a..e5bea33 100644
--- a/libavcodec/dca_parser.c
+++ b/libavcodec/dca_parser.c
@@ -5,24 +5,25 @@
  * Copyright (C) 2006 Benjamin Larsson
  * Copyright (C) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "dca.h"
+#include "dca_exss.h"
 #include "dca_syncwords.h"
 #include "get_bits.h"
 #include "parser.h"
@@ -32,6 +33,9 @@ typedef struct DCAParseContext {
     uint32_t lastmarker;
     int size;
     int framesize;
+    unsigned int startpos;
+    DCAExssParser exss;
+    unsigned int sr_code;
 } DCAParseContext;
 
 #define IS_CORE_MARKER(state) \
@@ -47,6 +51,14 @@ typedef struct DCAParseContext {
 #define CORE_MARKER(state)      ((state >> 16) & 0xFFFFFFFF)
 #define EXSS_MARKER(state)      (state & 0xFFFFFFFF)
 
+#define STATE_LE(state)     ((((state) & 0xFF00FF00) >> 8) | (((state) & 0x00FF00FF) << 8)) /* swap the two bytes of each 16-bit word in the low 32 bits */
+#define STATE_14(state)     ((((state) & 0x3FFF0000) >> 8) | (((state) & 0x00003FFF) >> 6)) /* low 14 bits of the upper word (to bits 8..21), then the top 8 of the lower word's 14 bits */
+
+#define CORE_FRAMESIZE(state)   ((((state) >> 4) & 0x3FFF) + 1) /* 14-bit field at bits 4..17, stored minus one */
+#define EXSS_FRAMESIZE(state)   (((state) & 0x2000000000) ? /* bit 37 picks the 20-bit or the 16-bit size field */ \
+                                 (((state) >>  5) & 0xFFFFF) + 1 : \
+                                 (((state) >> 13) & 0x0FFFF) + 1)
+
 /**
  * Find the end of the current frame in the bitstream.
  * @return the position of the first byte of the next frame, or -1
@@ -54,41 +66,105 @@ typedef struct DCAParseContext {
 static int dca_find_frame_end(DCAParseContext *pc1, const uint8_t *buf,
                               int buf_size)
 {
-    int start_found, i;
+    int start_found, size, i;
     uint64_t state;
     ParseContext *pc = &pc1->pc;
 
     start_found = pc->frame_start_found;
     state       = pc->state64;
+    size        = pc1->size;
 
     i = 0;
     if (!start_found) {
-        for (i = 0; i < buf_size; i++) {
+        for (; i < buf_size; i++) {
+            size++;
             state = (state << 8) | buf[i];
-            if (IS_MARKER(state)) {
-                if (!pc1->lastmarker ||
-                    pc1->lastmarker == CORE_MARKER(state) ||
-                    pc1->lastmarker == DCA_SYNCWORD_SUBSTREAM) {
-                    start_found = 1;
-                    if (IS_EXSS_MARKER(state))
-                        pc1->lastmarker = EXSS_MARKER(state);
-                    else
-                        pc1->lastmarker = CORE_MARKER(state);
-                    i++;
-                    break;
-                }
+
+            if (IS_MARKER(state) &&
+                (!pc1->lastmarker ||
+                  pc1->lastmarker == CORE_MARKER(state) ||
+                  pc1->lastmarker == DCA_SYNCWORD_SUBSTREAM)) {
+                if (!pc1->lastmarker)
+                    pc1->startpos = IS_EXSS_MARKER(state) ? size - 4 : size - 6;
+
+                if (IS_EXSS_MARKER(state))
+                    pc1->lastmarker = EXSS_MARKER(state);
+                else
+                    pc1->lastmarker = CORE_MARKER(state);
+
+                start_found = 1;
+                size        = 0;
+
+                i++;
+                break;
             }
         }
     }
+
     if (start_found) {
         for (; i < buf_size; i++) {
-            pc1->size++;
+            size++;
             state = (state << 8) | buf[i];
+
+            if (start_found == 1) {
+                switch (pc1->lastmarker) {
+                case DCA_SYNCWORD_CORE_BE:
+                    if (size == 2) {
+                        pc1->framesize = CORE_FRAMESIZE(state);
+                        start_found    = 2;
+                    }
+                    break;
+                case DCA_SYNCWORD_CORE_LE:
+                    if (size == 2) {
+                        pc1->framesize = CORE_FRAMESIZE(STATE_LE(state));
+                        start_found    = 4;
+                    }
+                    break;
+                case DCA_SYNCWORD_CORE_14B_BE:
+                    if (size == 4) {
+                        pc1->framesize = CORE_FRAMESIZE(STATE_14(state)) * 8 / 14 * 2;
+                        start_found    = 4;
+                    }
+                    break;
+                case DCA_SYNCWORD_CORE_14B_LE:
+                    if (size == 4) {
+                        pc1->framesize = CORE_FRAMESIZE(STATE_14(STATE_LE(state))) * 8 / 14 * 2;
+                        start_found    = 4;
+                    }
+                    break;
+                case DCA_SYNCWORD_SUBSTREAM:
+                    if (size == 6) {
+                        pc1->framesize = EXSS_FRAMESIZE(state);
+                        start_found    = 4;
+                    }
+                    break;
+                default:
+                    av_assert0(0);
+                }
+                continue;
+            }
+
+            if (start_found == 2 && IS_EXSS_MARKER(state) &&
+                pc1->framesize <= size + 2) {
+                pc1->framesize  = size + 2;
+                start_found     = 3;
+                continue;
+            }
+
+            if (start_found == 3) {
+                if (size == pc1->framesize + 4) {
+                    pc1->framesize += EXSS_FRAMESIZE(state);
+                    start_found     = 4;
+                }
+                continue;
+            }
+
+            if (pc1->framesize > size)
+                continue;
+
             if (IS_MARKER(state) &&
                 (pc1->lastmarker == CORE_MARKER(state) ||
                  pc1->lastmarker == DCA_SYNCWORD_SUBSTREAM)) {
-                if (pc1->framesize > pc1->size)
-                    continue;
                 pc->frame_start_found = 0;
                 pc->state64           = -1;
                 pc1->size             = 0;
@@ -96,8 +172,10 @@ static int dca_find_frame_end(DCAParseContext *pc1, const uint8_t *buf,
             }
         }
     }
+
     pc->frame_start_found = start_found;
     pc->state64           = state;
+    pc1->size             = size;
     return END_NOT_FOUND;
 }
 
@@ -106,20 +184,78 @@ static av_cold int dca_parse_init(AVCodecParserContext *s)
     DCAParseContext *pc1 = s->priv_data;
 
     pc1->lastmarker = 0;
+    pc1->sr_code = -1;
     return 0;
 }
 
-static int dca_parse_params(const uint8_t *buf, int buf_size, int *duration,
-                            int *sample_rate, int *framesize)
+static int dca_parse_params(DCAParseContext *pc1, const uint8_t *buf,
+                            int buf_size, int *duration, int *sample_rate)
 {
     GetBitContext gb;
     uint8_t hdr[12 + AV_INPUT_BUFFER_PADDING_SIZE] = { 0 };
-    int ret, sample_blocks, sr_code;
+    int ret, sample_blocks;
 
     if (buf_size < 12)
         return AVERROR_INVALIDDATA;
 
-    if ((ret = ff_dca_convert_bitstream(buf, 12, hdr, 12)) < 0)
+    if (AV_RB32(buf) == DCA_SYNCWORD_SUBSTREAM) {
+        DCAExssAsset *asset = &pc1->exss.assets[0];
+
+        if ((ret = ff_dca_exss_parse(&pc1->exss, buf, buf_size)) < 0)
+            return ret;
+
+        if (asset->extension_mask & DCA_EXSS_LBR) {
+            if ((ret = init_get_bits8(&gb, buf + asset->lbr_offset, asset->lbr_size)) < 0)
+                return ret;
+
+            if (get_bits_long(&gb, 32) != DCA_SYNCWORD_LBR)
+                return AVERROR_INVALIDDATA;
+
+            switch (get_bits(&gb, 8)) {
+            case 2:
+                pc1->sr_code = get_bits(&gb, 8);
+            case 1:
+                break;
+            default:
+                return AVERROR_INVALIDDATA;
+            }
+
+            if (pc1->sr_code >= FF_ARRAY_ELEMS(ff_dca_sampling_freqs))
+                return AVERROR_INVALIDDATA;
+
+            *sample_rate = ff_dca_sampling_freqs[pc1->sr_code];
+            *duration = 1024 << ff_dca_freq_ranges[pc1->sr_code];
+            return 0;
+        }
+
+        if (asset->extension_mask & DCA_EXSS_XLL) {
+            int nsamples_log2;
+
+            if ((ret = init_get_bits8(&gb, buf + asset->xll_offset, asset->xll_size)) < 0)
+                return ret;
+
+            if (get_bits_long(&gb, 32) != DCA_SYNCWORD_XLL)
+                return AVERROR_INVALIDDATA;
+
+            if (get_bits(&gb, 4))
+                return AVERROR_INVALIDDATA;
+
+            skip_bits(&gb, 8);
+            skip_bits_long(&gb, get_bits(&gb, 5) + 1);
+            skip_bits(&gb, 4);
+            nsamples_log2 = get_bits(&gb, 4) + get_bits(&gb, 4);
+            if (nsamples_log2 > 24)
+                return AVERROR_INVALIDDATA;
+
+            *sample_rate = asset->max_sample_rate;
+            *duration = (1 + (*sample_rate > 96000)) << nsamples_log2;
+            return 0;
+        }
+
+        return AVERROR_INVALIDDATA;
+    }
+
+    if ((ret = avpriv_dca_convert_bitstream(buf, 12, hdr, 12)) < 0)
         return ret;
 
     init_get_bits(&gb, hdr, 96);
@@ -130,13 +266,8 @@ static int dca_parse_params(const uint8_t *buf, int buf_size, int *duration,
         return AVERROR_INVALIDDATA;
     *duration = 256 * (sample_blocks / 8);
 
-    *framesize = get_bits(&gb, 14) + 1;
-    if (*framesize < 95)
-        return AVERROR_INVALIDDATA;
-
-    skip_bits(&gb, 6);
-    sr_code      = get_bits(&gb, 4);
-    *sample_rate = avpriv_dca_sample_rates[sr_code];
+    skip_bits(&gb, 20);
+    *sample_rate = avpriv_dca_sample_rates[get_bits(&gb, 4)];
     if (*sample_rate == 0)
         return AVERROR_INVALIDDATA;
 
@@ -161,12 +292,20 @@ static int dca_parse(AVCodecParserContext *s, AVCodecContext *avctx,
             *poutbuf_size = 0;
             return buf_size;
         }
+
+        /* skip initial padding */
+        if (buf_size  > pc1->startpos) {
+            buf      += pc1->startpos;
+            buf_size -= pc1->startpos;
+        }
+        pc1->startpos = 0;
     }
 
     /* read the duration and sample rate from the frame header */
-    if (!dca_parse_params(buf, buf_size, &duration, &sample_rate, &pc1->framesize)) {
-        s->duration        = duration;
-        avctx->sample_rate = sample_rate;
+    if (!dca_parse_params(pc1, buf, buf_size, &duration, &sample_rate)) {
+        if (!avctx->sample_rate)
+            avctx->sample_rate = sample_rate;
+        s->duration = av_rescale(duration, avctx->sample_rate, sample_rate);
     } else
         s->duration = 0;
 
diff --git a/libavcodec/dca_syncwords.h b/libavcodec/dca_syncwords.h
index 07b60e0..4d2cd5f 100644
--- a/libavcodec/dca_syncwords.h
+++ b/libavcodec/dca_syncwords.h
@@ -1,37 +1,36 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_DCA_SYNCWORDS_H
 #define AVCODEC_DCA_SYNCWORDS_H
 
-enum DCASyncwords {
-    DCA_SYNCWORD_CORE_BE        = 0x7FFE8001U,
-    DCA_SYNCWORD_CORE_LE        = 0xFE7F0180U,
-    DCA_SYNCWORD_CORE_14B_BE    = 0x1FFFE800U,
-    DCA_SYNCWORD_CORE_14B_LE    = 0xFF1F00E8U,
-    DCA_SYNCWORD_XCH            = 0x5A5A5A5AU,
-    DCA_SYNCWORD_XXCH           = 0x47004A03U,
-    DCA_SYNCWORD_X96            = 0x1D95F262U,
-    DCA_SYNCWORD_XBR            = 0x655E315EU,
-    DCA_SYNCWORD_LBR            = 0x0A801921U,
-    DCA_SYNCWORD_XLL            = 0x41A29547U,
-    DCA_SYNCWORD_SUBSTREAM      = 0x64582025U,
-    DCA_SYNCWORD_SUBSTREAM_CORE = 0x02B09261U,
-};
+#define    DCA_SYNCWORD_CORE_BE              0x7FFE8001U
+#define    DCA_SYNCWORD_CORE_LE              0xFE7F0180U    /* CORE_BE with the bytes of each 16-bit word swapped */
+#define    DCA_SYNCWORD_CORE_14B_BE          0x1FFFE800U
+#define    DCA_SYNCWORD_CORE_14B_LE          0xFF1F00E8U    /* CORE_14B_BE with the bytes of each 16-bit word swapped */
+#define    DCA_SYNCWORD_XCH                  0x5A5A5A5AU
+#define    DCA_SYNCWORD_XXCH                 0x47004A03U
+#define    DCA_SYNCWORD_X96                  0x1D95F262U
+#define    DCA_SYNCWORD_XBR                  0x655E315EU
+#define    DCA_SYNCWORD_LBR                  0x0A801921U
+#define    DCA_SYNCWORD_XLL                  0x41A29547U
+#define    DCA_SYNCWORD_SUBSTREAM            0x64582025U
+#define    DCA_SYNCWORD_SUBSTREAM_CORE       0x02B09261U
+#define    DCA_SYNCWORD_REV1AUX              0x9A1105A0U
 
 #endif /* AVCODEC_DCA_SYNCWORDS_H */
diff --git a/libavcodec/dca_xll.c b/libavcodec/dca_xll.c
index 5a558b8..1d616c2 100644
--- a/libavcodec/dca_xll.c
+++ b/libavcodec/dca_xll.c
@@ -1,747 +1,1491 @@
 /*
- * DCA XLL extension
+ * Copyright (C) 2016 foo86
  *
- * Copyright (C) 2012 Paul B Mahol
- * Copyright (C) 2014 Niels Möller
+ * This file is part of FFmpeg.
  *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "libavutil/attributes.h"
-#include "libavutil/common.h"
-#include "libavutil/internal.h"
-
-#include "avcodec.h"
-#include "dca.h"
+#include "dcadec.h"
 #include "dcadata.h"
-#include "get_bits.h"
+#include "dcamath.h"
+#include "dca_syncwords.h"
 #include "unary.h"
 
-/* Sign as bit 0 */
-static inline int get_bits_sm(GetBitContext *s, unsigned n)
+static int get_linear(GetBitContext *gb, int n) /* reads n bits and decodes them as a signed value whose sign is carried in bit 0 */
 {
-    int x = get_bits(s, n);
-    if (x & 1)
-        return -(x >> 1) - 1;
-    else
-        return x >> 1;
-}
-
-/* Return -1 on error. */
-static int32_t get_dmix_coeff(DCAContext *s, int inverse)
-{
-    unsigned code = get_bits(&s->gb, 9);
-    int32_t sign = (int32_t) (code >> 8) - 1;
-    unsigned idx = code & 0xff;
-    int inv_offset = FF_DCA_DMIXTABLE_SIZE -FF_DCA_INV_DMIXTABLE_SIZE;
-    if (idx >= FF_DCA_DMIXTABLE_SIZE) {
-        av_log(s->avctx, AV_LOG_ERROR,
-               "XLL: Invalid channel set downmix code %x\n", code);
-        return -1;
-    } else if (!inverse) {
-        return (ff_dca_dmixtable[idx] ^ sign) - sign;
-    } else if (idx < inv_offset) {
-        av_log(s->avctx, AV_LOG_ERROR,
-               "XLL: Invalid channel set inverse downmix code %x\n", code);
-        return -1;
-    } else {
-        return (ff_dca_inv_dmixtable[idx - inv_offset] ^ sign) - sign;
-    }
+    unsigned int v = get_bits_long(gb, n);
+    return (v >> 1) ^ -(v & 1); /* bit 0 clear: v >> 1; bit 0 set: ~(v >> 1), i.e. -(v >> 1) - 1 */
+}
+
+static int get_rice_un(GetBitContext *gb, int k) /* unsigned code: a get_unary() prefix supplies the high part, k raw bits the low part */
+{
+    unsigned int v = get_unary(gb, 1, get_bits_left(gb)); /* prefix length is capped at the number of bits left in gb */
+    return (v << k) | get_bits_long(gb, k);
 }
 
-static int32_t dca_get_dmix_coeff(DCAContext *s)
+static int get_rice(GetBitContext *gb, int k) /* signed variant of get_rice_un(): bit 0 of the unsigned code carries the sign */
 {
-    return get_dmix_coeff(s, 0);
+    unsigned int v = get_rice_un(gb, k);
+    return (v >> 1) ^ -(v & 1); /* bit 0 clear: v >> 1; bit 0 set: ~(v >> 1), i.e. -(v >> 1) - 1 */
 }
 
-static int32_t dca_get_inv_dmix_coeff(DCAContext *s)
+static void get_array(GetBitContext *gb, int32_t *array, int size, int n) /* fills array[0..size-1] with consecutive n-bit get_bits() values */
 {
-    return get_dmix_coeff(s, 1);
+    int i;
+
+    for (i = 0; i < size; i++)
+        array[i] = get_bits(gb, n);
 }
 
-/* parse XLL header */
-int ff_dca_xll_decode_header(DCAContext *s)
+static void get_linear_array(GetBitContext *gb, int32_t *array, int size, int n) /* fills array[0..size-1] with n-bit get_linear() values */
 {
-    int hdr_pos, hdr_size;
-    av_unused int version, frame_size;
-    int i, chset_index;
+    int i;
 
-    /* get bit position of sync header */
-    hdr_pos    = get_bits_count(&s->gb) - 32;
+    if (n == 0) /* zero-width codes: nothing is read from gb, every element becomes 0 */
+        memset(array, 0, sizeof(*array) * size);
+    else for (i = 0; i < size; i++)
+        array[i] = get_linear(gb, n);
+}
 
-    version    = get_bits(&s->gb, 4) + 1;
-    hdr_size   = get_bits(&s->gb, 8) + 1;
+static void get_rice_array(GetBitContext *gb, int32_t *array, int size, int k) /* fills array[0..size-1] with get_rice() values using parameter k */
+{
+    int i;
 
-    frame_size = get_bits_long(&s->gb, get_bits(&s->gb, 5) + 1) + 1;
+    for (i = 0; i < size; i++)
+        array[i] = get_rice(gb, k);
+}
 
-    s->xll_channels          =
-    s->xll_residual_channels = 0;
-    s->xll_nch_sets          = get_bits(&s->gb, 4) + 1;
-    s->xll_segments          = 1 << get_bits(&s->gb, 4);
-    s->xll_log_smpl_in_seg   = get_bits(&s->gb, 4);
-    s->xll_smpl_in_seg       = 1 << s->xll_log_smpl_in_seg;
-    s->xll_bits4seg_size     = get_bits(&s->gb, 5) + 1;
-    s->xll_banddata_crc      = get_bits(&s->gb, 2);
-    s->xll_scalable_lsb      = get_bits1(&s->gb);
-    s->xll_bits4ch_mask      = get_bits(&s->gb, 5) + 1;
+static int parse_dmix_coeffs(DCAXllDecoder *s, DCAXllChSet *c) // reads downmix scales/coefficients for channel set c from s->gb; returns 0 or AVERROR_INVALIDDATA
+{
+    // Size of downmix coefficient matrix
+    int m = c->primary_chset ? ff_dca_dmix_primary_nch[c->dmix_type] : c->hier_ofs;
+    int i, j, *coeff_ptr = c->dmix_coeff; // coefficients are stored consecutively, c->nchannels per outer iteration
+
+    for (i = 0; i < m; i++) {
+        int code, sign, coeff, scale, scale_inv = 0;
+        unsigned int index;
+
+        // Downmix scale (only for non-primary channel sets)
+        if (!c->primary_chset) {
+            code = get_bits(&s->gb, 9);
+            sign = (code >> 8) - 1; // 0 when bit 8 of the code is set, -1 when it is clear
+            index = (code & 0xff) - FF_DCA_DMIXTABLE_OFFSET; // index is unsigned: a code below the offset wraps around and fails the check below
+            if (index >= FF_DCA_INV_DMIXTABLE_SIZE) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid XLL downmix scale index\n");
+                return AVERROR_INVALIDDATA;
+            }
+            scale = ff_dca_dmixtable[index + FF_DCA_DMIXTABLE_OFFSET];
+            scale_inv = ff_dca_inv_dmixtable[index];
+            c->dmix_scale[i] = (scale ^ sign) - sign; // (x ^ sign) - sign leaves x unchanged for sign == 0 and negates it for sign == -1
+            c->dmix_scale_inv[i] = (scale_inv ^ sign) - sign;
+        }
 
-    if (s->xll_scalable_lsb) {
-        s->xll_fixed_lsb_width = get_bits(&s->gb, 4);
-        if (s->xll_fixed_lsb_width)
-            av_log(s->avctx, AV_LOG_WARNING,
-                   "XLL: fixed lsb width = %d, non-zero not supported.\n",
-                   s->xll_fixed_lsb_width);
+        // Downmix coefficients
+        for (j = 0; j < c->nchannels; j++) {
+            code = get_bits(&s->gb, 9);
+            sign = (code >> 8) - 1; // same sign convention as for the scale code above
+            index = code & 0xff;
+            if (index >= FF_DCA_DMIXTABLE_SIZE) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid XLL downmix coefficient index\n");
+                return AVERROR_INVALIDDATA;
+            }
+            coeff = ff_dca_dmixtable[index];
+            if (!c->primary_chset)
+                // Multiply by |InvDmixScale| to get |UndoDmixScale|
+                coeff = mul16(scale_inv, coeff);
+            *coeff_ptr++ = (coeff ^ sign) - sign;
+        }
     }
-    /* skip to the end of the common header */
-    i = get_bits_count(&s->gb);
-    if (hdr_pos + hdr_size * 8 > i)
-        skip_bits_long(&s->gb, hdr_pos + hdr_size * 8 - i);
 
-    for (chset_index = 0; chset_index < s->xll_nch_sets; chset_index++) {
-        XllChSetSubHeader *chset = &s->xll_chsets[chset_index];
-        hdr_pos  = get_bits_count(&s->gb);
-        hdr_size = get_bits(&s->gb, 10) + 1;
+    return 0;
+}
+
+static int chs_parse_header(DCAXllDecoder *s, DCAXllChSet *c, DCAExssAsset *asset)
+{
+    int i, j, k, ret, band, header_size, header_pos = get_bits_count(&s->gb);
+    DCAXllChSet *p = &s->chset[0];
+    DCAXllBand *b;
 
-        chset->channels           = get_bits(&s->gb, 4) + 1;
-        chset->residual_encode    = get_bits(&s->gb, chset->channels);
-        chset->bit_resolution     = get_bits(&s->gb, 5) + 1;
-        chset->bit_width          = get_bits(&s->gb, 5) + 1;
-        chset->sampling_frequency = ff_dca_sampling_freqs[get_bits(&s->gb, 4)];
-        chset->samp_freq_interp   = get_bits(&s->gb, 2);
-        chset->replacement_set    = get_bits(&s->gb, 2);
-        if (chset->replacement_set)
-            chset->active_replace_set = get_bits(&s->gb, 1);
+    // Size of channel set sub-header
+    header_size = get_bits(&s->gb, 10) + 1;
 
-        if (s->one2one_map_chtospkr) {
-            chset->primary_ch_set              = get_bits(&s->gb, 1);
-            chset->downmix_coeff_code_embedded = get_bits(&s->gb, 1);
-            if (chset->downmix_coeff_code_embedded) {
-                chset->downmix_embedded = get_bits(&s->gb, 1);
-                if (chset->primary_ch_set) {
-                    chset->downmix_type = get_bits(&s->gb, 3);
-                    if (chset->downmix_type > 6) {
-                        av_log(s->avctx, AV_LOG_ERROR,
-                               "XLL: Invalid channel set downmix type\n");
-                        return AVERROR_INVALIDDATA;
-                    }
-                }
+    // Check CRC
+    if (ff_dca_check_crc(s->avctx, &s->gb, header_pos, header_pos + header_size * 8)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid XLL sub-header checksum\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Number of channels in the channel set
+    c->nchannels = get_bits(&s->gb, 4) + 1;
+    if (c->nchannels > DCA_XLL_CHANNELS_MAX) {
+        avpriv_request_sample(s->avctx, "%d XLL channels", c->nchannels);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    // Residual type
+    c->residual_encode = get_bits(&s->gb, c->nchannels);
+
+    // PCM bit resolution
+    c->pcm_bit_res = get_bits(&s->gb, 5) + 1;
+
+    // Storage unit width
+    c->storage_bit_res = get_bits(&s->gb, 5) + 1;
+    if (c->storage_bit_res != 16 && c->storage_bit_res != 24) {
+        avpriv_request_sample(s->avctx, "%d-bit XLL storage resolution", c->storage_bit_res);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    if (c->pcm_bit_res > c->storage_bit_res) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid PCM bit resolution for XLL channel set (%d > %d)\n", c->pcm_bit_res, c->storage_bit_res);
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Original sampling frequency
+    c->freq = ff_dca_sampling_freqs[get_bits(&s->gb, 4)];
+    if (c->freq > 192000) {
+        avpriv_request_sample(s->avctx, "%d Hz XLL sampling frequency", c->freq);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    // Sampling frequency modifier
+    if (get_bits(&s->gb, 2)) {
+        avpriv_request_sample(s->avctx, "XLL sampling frequency modifier");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    // Which replacement set this channel set is member of
+    if (get_bits(&s->gb, 2)) {
+        avpriv_request_sample(s->avctx, "XLL replacement set");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    if (asset->one_to_one_map_ch_to_spkr) {
+        // Primary channel set flag
+        c->primary_chset = get_bits1(&s->gb);
+        if (c->primary_chset != (c == p)) {
+            av_log(s->avctx, AV_LOG_ERROR, "The first (and only) XLL channel set must be primary\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        // Downmix coefficients present in stream
+        c->dmix_coeffs_present = get_bits1(&s->gb);
+
+        // Downmix already performed by encoder
+        c->dmix_embedded = c->dmix_coeffs_present && get_bits1(&s->gb);
+
+        // Downmix type
+        if (c->dmix_coeffs_present && c->primary_chset) {
+            c->dmix_type = get_bits(&s->gb, 3);
+            if (c->dmix_type >= DCA_DMIX_TYPE_COUNT) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid XLL primary channel set downmix type\n");
+                return AVERROR_INVALIDDATA;
             }
-            chset->hier_chset = get_bits(&s->gb, 1);
-
-            if (chset->downmix_coeff_code_embedded) {
-                /* nDownmixCoeffs is specified as N * M. For a primary
-                 * channel set, it appears that N = number of
-                 * channels, and M is the number of downmix channels.
-                 *
-                 * For a non-primary channel set, N is specified as
-                 * number of channels + 1, and M is derived from the
-                 * channel set hierarchy, and at least in simple cases
-                 * M is the number of channels in preceding channel
-                 * sets. */
-                if (chset->primary_ch_set) {
-                    static const char dmix_table[7] = { 1, 2, 2, 3, 3, 4, 4 };
-                    chset->downmix_ncoeffs = chset->channels * dmix_table[chset->downmix_type];
-                } else
-                    chset->downmix_ncoeffs = (chset->channels + 1) * s->xll_channels;
-
-                if (chset->downmix_ncoeffs > DCA_XLL_DMIX_NCOEFFS_MAX) {
-                    avpriv_request_sample(s->avctx,
-                                          "XLL: More than %d downmix coefficients",
-                                          DCA_XLL_DMIX_NCOEFFS_MAX);
-                    return AVERROR_PATCHWELCOME;
-                } else if (chset->primary_ch_set) {
-                    for (i = 0; i < chset->downmix_ncoeffs; i++)
-                        if ((chset->downmix_coeffs[i] = dca_get_dmix_coeff(s)) == -1)
-                            return AVERROR_INVALIDDATA;
-                } else {
-                    unsigned c, r;
-                    for (c = 0, i = 0; c < s->xll_channels; c++, i += chset->channels + 1) {
-                        if ((chset->downmix_coeffs[i] = dca_get_inv_dmix_coeff(s)) == -1)
-                            return AVERROR_INVALIDDATA;
-                        for (r = 1; r <= chset->channels; r++) {
-                            int32_t coeff = dca_get_dmix_coeff(s);
-                            if (coeff == -1)
-                                return AVERROR_INVALIDDATA;
-                            chset->downmix_coeffs[i + r] =
-                                (chset->downmix_coeffs[i] * (int64_t) coeff + (1 << 15)) >> 16;
-                        }
-                    }
+        }
+
+        // Whether the channel set is part of a hierarchy
+        c->hier_chset = get_bits1(&s->gb);
+        if (!c->hier_chset && s->nchsets != 1) {
+            avpriv_request_sample(s->avctx, "XLL channel set outside of hierarchy");
+            return AVERROR_PATCHWELCOME;
+        }
+
+        // Downmix coefficients
+        if (c->dmix_coeffs_present && (ret = parse_dmix_coeffs(s, c)) < 0)
+            return ret;
+
+        // Channel mask enabled
+        if (!get_bits1(&s->gb)) {
+            avpriv_request_sample(s->avctx, "Disabled XLL channel mask");
+            return AVERROR_PATCHWELCOME;
+        }
+
+        // Channel mask for set
+        c->ch_mask = get_bits_long(&s->gb, s->ch_mask_nbits);
+        if (av_popcount(c->ch_mask) != c->nchannels) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid XLL channel mask\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        // Build the channel to speaker map
+        for (i = 0, j = 0; i < s->ch_mask_nbits; i++)
+            if (c->ch_mask & (1U << i))
+                c->ch_remap[j++] = i;
+    } else {
+        // Mapping coeffs present flag
+        if (c->nchannels != 2 || s->nchsets != 1 || get_bits1(&s->gb)) {
+            avpriv_request_sample(s->avctx, "Custom XLL channel to speaker mapping");
+            return AVERROR_PATCHWELCOME;
+        }
+
+        // Setup for LtRt decoding
+        c->primary_chset = 1;
+        c->dmix_coeffs_present = 0;
+        c->dmix_embedded = 0;
+        c->hier_chset = 0;
+        c->ch_mask = DCA_SPEAKER_LAYOUT_STEREO;
+        c->ch_remap[0] = DCA_SPEAKER_L;
+        c->ch_remap[1] = DCA_SPEAKER_R;
+    }
+
+    if (c->freq > 96000) {
+        // Extra frequency bands flag
+        if (get_bits1(&s->gb)) {
+            avpriv_request_sample(s->avctx, "Extra XLL frequency bands");
+            return AVERROR_PATCHWELCOME;
+        }
+        c->nfreqbands = 2;
+    } else {
+        c->nfreqbands = 1;
+    }
+
+    // Set the sampling frequency to that of the first frequency band.
+    // Frequency will be doubled again after bands assembly.
+    c->freq >>= c->nfreqbands - 1;
+
+    // Verify that all channel sets have the same audio characteristics
+    if (c != p && (c->nfreqbands != p->nfreqbands || c->freq != p->freq
+                   || c->pcm_bit_res != p->pcm_bit_res
+                   || c->storage_bit_res != p->storage_bit_res)) {
+        avpriv_request_sample(s->avctx, "Different XLL audio characteristics");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    // Determine number of bits to read bit allocation coding parameter
+    if (c->storage_bit_res > 16)
+        c->nabits = 5;
+    else if (c->storage_bit_res > 8)
+        c->nabits = 4;
+    else
+        c->nabits = 3;
+
+    // Account for embedded downmix and decimator saturation
+    if ((s->nchsets > 1 || c->nfreqbands > 1) && c->nabits < 5)
+        c->nabits++;
+
+    for (band = 0, b = c->bands; band < c->nfreqbands; band++, b++) {
+        // Pairwise channel decorrelation
+        if ((b->decor_enabled = get_bits1(&s->gb)) && c->nchannels > 1) {
+            int ch_nbits = av_ceil_log2(c->nchannels);
+
+            // Original channel order
+            for (i = 0; i < c->nchannels; i++) {
+                b->orig_order[i] = get_bits(&s->gb, ch_nbits);
+                if (b->orig_order[i] >= c->nchannels) {
+                    av_log(s->avctx, AV_LOG_ERROR, "Invalid XLL original channel order\n");
+                    return AVERROR_INVALIDDATA;
                 }
             }
-            chset->ch_mask_enabled = get_bits(&s->gb, 1);
-            if (chset->ch_mask_enabled)
-                chset->ch_mask = get_bits(&s->gb, s->xll_bits4ch_mask);
-            else
-                /* Skip speaker configuration bits */
-                skip_bits_long(&s->gb, 25 * chset->channels);
+
+            // Pairwise channel coefficients
+            for (i = 0; i < c->nchannels / 2; i++)
+                b->decor_coeff[i] = get_bits1(&s->gb) ? get_linear(&s->gb, 7) : 0;
         } else {
-            chset->primary_ch_set              = 1;
-            chset->downmix_coeff_code_embedded = 0;
-            /* Spec: NumChHierChSet = 0, NumDwnMixCodeCoeffs = 0, whatever that means. */
-            chset->mapping_coeffs_present = get_bits(&s->gb, 1);
-            if (chset->mapping_coeffs_present) {
-                avpriv_report_missing_feature(s->avctx, "XLL: mapping coefficients");
-                return AVERROR_PATCHWELCOME;
-            }
+            for (i = 0; i < c->nchannels; i++)
+                b->orig_order[i] = i;
+            for (i = 0; i < c->nchannels / 2; i++)
+                b->decor_coeff[i] = 0;
         }
-        if (chset->sampling_frequency > 96000)
-            chset->num_freq_bands = 2 * (1 + get_bits(&s->gb, 1));
-        else
-            chset->num_freq_bands = 1;
 
-        if (chset->num_freq_bands > 1) {
-            avpriv_report_missing_feature(s->avctx, "XLL: num_freq_bands > 1");
-            return AVERROR_PATCHWELCOME;
+        // Adaptive predictor order
+        b->highest_pred_order = 0;
+        for (i = 0; i < c->nchannels; i++) {
+            b->adapt_pred_order[i] = get_bits(&s->gb, 4);
+            if (b->adapt_pred_order[i] > b->highest_pred_order)
+                b->highest_pred_order = b->adapt_pred_order[i];
+        }
+        if (b->highest_pred_order > s->nsegsamples) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid XLL adaptive predicition order\n");
+            return AVERROR_INVALIDDATA;
         }
 
-        if (get_bits(&s->gb, 1)) { /* pw_ch_decor_enabled */
-            int bits = av_ceil_log2(chset->channels);
-            for (i = 0; i < chset->channels; i++) {
-                unsigned j = get_bits(&s->gb, bits);
-                if (j >= chset->channels) {
-                    av_log(s->avctx, AV_LOG_ERROR,
-                           "Original channel order value %u too large, only %d channels.\n",
-                           j, chset->channels);
+        // Fixed predictor order
+        for (i = 0; i < c->nchannels; i++)
+            b->fixed_pred_order[i] = b->adapt_pred_order[i] ? 0 : get_bits(&s->gb, 2);
+
+        // Adaptive predictor quantized reflection coefficients
+        for (i = 0; i < c->nchannels; i++) {
+            for (j = 0; j < b->adapt_pred_order[i]; j++) {
+                k = get_linear(&s->gb, 8);
+                if (k == -128) {
+                    av_log(s->avctx, AV_LOG_ERROR, "Invalid XLL reflection coefficient index\n");
                     return AVERROR_INVALIDDATA;
                 }
-                chset->orig_chan_order[0][i]     = j;
-                chset->orig_chan_order_inv[0][j] = i;
-            }
-            for (i = 0; i < chset->channels / 2; i++) {
-                if (get_bits(&s->gb, 1)) /* bChPFlag */
-                    chset->pw_ch_pairs_coeffs[0][i] = get_bits_sm(&s->gb, 7);
+                if (k < 0)
+                    b->adapt_refl_coeff[i][j] = -(int)ff_dca_xll_refl_coeff[-k];
                 else
-                    chset->pw_ch_pairs_coeffs[0][i] = 0;
+                    b->adapt_refl_coeff[i][j] =  (int)ff_dca_xll_refl_coeff[ k];
             }
-        } else {
-            for (i = 0; i < chset->channels; i++)
-                chset->orig_chan_order[0][i]     =
-                chset->orig_chan_order_inv[0][i] = i;
-            for (i = 0; i < chset->channels / 2; i++)
-                chset->pw_ch_pairs_coeffs[0][i] = 0;
-        }
-        /* Adaptive prediction order */
-        chset->adapt_order_max[0] = 0;
-        for (i = 0; i < chset->channels; i++) {
-            chset->adapt_order[0][i] = get_bits(&s->gb, 4);
-            if (chset->adapt_order_max[0] < chset->adapt_order[0][i])
-                chset->adapt_order_max[0] = chset->adapt_order[0][i];
-        }
-        /* Fixed prediction order, used in case the adaptive order
-         * above is zero */
-        for (i = 0; i < chset->channels; i++)
-            chset->fixed_order[0][i] =
-                chset->adapt_order[0][i] ? 0 : get_bits(&s->gb, 2);
-
-        for (i = 0; i < chset->channels; i++) {
-            unsigned j;
-            for (j = 0; j < chset->adapt_order[0][i]; j++)
-                chset->lpc_refl_coeffs_q_ind[0][i][j] = get_bits(&s->gb, 8);
-        }
-
-        if (s->xll_scalable_lsb) {
-            chset->lsb_fsize[0] = get_bits(&s->gb, s->xll_bits4seg_size);
-
-            for (i = 0; i < chset->channels; i++)
-                chset->scalable_lsbs[0][i] = get_bits(&s->gb, 4);
-            for (i = 0; i < chset->channels; i++)
-                chset->bit_width_adj_per_ch[0][i] = get_bits(&s->gb, 4);
-        } else {
-            memset(chset->scalable_lsbs[0], 0,
-                   chset->channels * sizeof(chset->scalable_lsbs[0][0]));
-            memset(chset->bit_width_adj_per_ch[0], 0,
-                   chset->channels * sizeof(chset->bit_width_adj_per_ch[0][0]));
         }
 
-        s->xll_channels          += chset->channels;
-        s->xll_residual_channels += chset->channels -
-                                    av_popcount(chset->residual_encode);
+        // Downmix performed by encoder in extension frequency band
+        b->dmix_embedded = c->dmix_embedded && (band == 0 || get_bits1(&s->gb));
+
+        // MSB/LSB split flag in extension frequency band
+        if ((band == 0 && s->scalable_lsbs) || (band != 0 && get_bits1(&s->gb))) {
+            // Size of LSB section in any segment
+            b->lsb_section_size = get_bits_long(&s->gb, s->seg_size_nbits);
+            if (b->lsb_section_size < 0 || b->lsb_section_size > s->frame_size) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid LSB section size\n");
+                return AVERROR_INVALIDDATA;
+            }
 
-        /* FIXME: Parse header data for extra frequency bands. */
+            // Account for optional CRC bytes after LSB section
+            if (b->lsb_section_size && (s->band_crc_present > 2 ||
+                                        (band == 0 && s->band_crc_present > 1)))
+                b->lsb_section_size += 2;
 
-        /* Skip to end of channel set sub header. */
-        i = get_bits_count(&s->gb);
-        if (hdr_pos + 8 * hdr_size < i) {
-            av_log(s->avctx, AV_LOG_ERROR,
-                   "chset header too large, %d bits, should be <= %d bits\n",
-                   i - hdr_pos, 8 * hdr_size);
-            return AVERROR_INVALIDDATA;
+            // Number of bits to represent the samples in LSB part
+            for (i = 0; i < c->nchannels; i++) {
+                b->nscalablelsbs[i] = get_bits(&s->gb, 4);
+                if (b->nscalablelsbs[i] && !b->lsb_section_size) {
+                    av_log(s->avctx, AV_LOG_ERROR, "LSB section missing with non-zero LSB width\n");
+                    return AVERROR_INVALIDDATA;
+                }
+            }
+        } else {
+            b->lsb_section_size = 0;
+            for (i = 0; i < c->nchannels; i++)
+                b->nscalablelsbs[i] = 0;
+        }
+
+        // Scalable resolution flag in extension frequency band
+        if ((band == 0 && s->scalable_lsbs) || (band != 0 && get_bits1(&s->gb))) {
+            // Number of bits discarded by authoring
+            for (i = 0; i < c->nchannels; i++)
+                b->bit_width_adjust[i] = get_bits(&s->gb, 4);
+        } else {
+            for (i = 0; i < c->nchannels; i++)
+                b->bit_width_adjust[i] = 0;
         }
-        if (hdr_pos + 8 * hdr_size > i)
-            skip_bits_long(&s->gb, hdr_pos + 8 * hdr_size - i);
     }
+
+    // Reserved
+    // Byte align
+    // CRC16 of channel set sub-header
+    if (ff_dca_seek_bits(&s->gb, header_pos + header_size * 8)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Read past end of XLL sub-header\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     return 0;
 }
 
-/* parse XLL navigation table */
-int ff_dca_xll_decode_navi(DCAContext *s, int asset_end)
+static int chs_alloc_msb_band_data(DCAXllDecoder *s, DCAXllChSet *c)
 {
-    int nbands, band, chset, seg, data_start;
-
-    /* FIXME: Supports only a single frequency band */
-    nbands = 1;
-
-    for (band = 0; band < nbands; band++) {
-        s->xll_navi.band_size[band] = 0;
-        for (seg = 0; seg < s->xll_segments; seg++) {
-            /* Note: The spec, ETSI TS 102 114 V1.4.1 (2012-09), says
-             * we should read a base value for segment_size from the
-             * stream, before reading the sizes of the channel sets.
-             * But that's apparently incorrect. */
-            s->xll_navi.segment_size[band][seg] = 0;
+    int ndecisamples = c->nfreqbands > 1 ? DCA_XLL_DECI_HISTORY_MAX : 0;
+    int nchsamples = s->nframesamples + ndecisamples;
+    int i, j, nsamples = nchsamples * c->nchannels * c->nfreqbands;
+    int32_t *ptr;
+
+    // Reallocate MSB sample buffer
+    av_fast_malloc(&c->sample_buffer[0], &c->sample_size[0], nsamples * sizeof(int32_t));
+    if (!c->sample_buffer[0])
+        return AVERROR(ENOMEM);
 
-            for (chset = 0; chset < s->xll_nch_sets; chset++)
-                if (band < s->xll_chsets[chset].num_freq_bands) {
-                    s->xll_navi.chset_size[band][seg][chset] =
-                        get_bits(&s->gb, s->xll_bits4seg_size) + 1;
-                    s->xll_navi.segment_size[band][seg] +=
-                        s->xll_navi.chset_size[band][seg][chset];
-                }
-            s->xll_navi.band_size[band] += s->xll_navi.segment_size[band][seg];
+    ptr = c->sample_buffer[0] + ndecisamples;
+    for (i = 0; i < c->nfreqbands; i++) {
+        for (j = 0; j < c->nchannels; j++) {
+            c->bands[i].msb_sample_buffer[j] = ptr;
+            ptr += nchsamples;
         }
     }
-    /* Align to 8 bits and skip 16-bit CRC. */
-    skip_bits_long(&s->gb, 16 + ((-get_bits_count(&s->gb)) & 7));
 
-    data_start = get_bits_count(&s->gb);
-    if (data_start + 8 * s->xll_navi.band_size[0] > asset_end) {
-        av_log(s->avctx, AV_LOG_ERROR,
-               "XLL: Data in NAVI table exceeds containing asset\n"
-               "start: %d (bit), size %u (bytes), end %d (bit), error %u\n",
-               data_start, s->xll_navi.band_size[0], asset_end,
-               data_start + 8 * s->xll_navi.band_size[0] - asset_end);
-        return AVERROR_INVALIDDATA;
-    }
-    init_get_bits(&s->xll_navi.gb, s->gb.buffer + data_start / 8,
-                  8 * s->xll_navi.band_size[0]);
     return 0;
 }
 
-static void dca_xll_inv_adapt_pred(int *samples, int nsamples, unsigned order,
-                                   const int *prev, const uint8_t *q_ind)
-{
-    static const uint16_t table[0x81] = {
-            0,  3070,  5110,  7140,  9156, 11154, 13132, 15085,
-        17010, 18904, 20764, 22588, 24373, 26117, 27818, 29474,
-        31085, 32648, 34164, 35631, 37049, 38418, 39738, 41008,
-        42230, 43404, 44530, 45609, 46642, 47630, 48575, 49477,
-        50337, 51157, 51937, 52681, 53387, 54059, 54697, 55302,
-        55876, 56421, 56937, 57426, 57888, 58326, 58741, 59132,
-        59502, 59852, 60182, 60494, 60789, 61066, 61328, 61576,
-        61809, 62029, 62236, 62431, 62615, 62788, 62951, 63105,
-        63250, 63386, 63514, 63635, 63749, 63855, 63956, 64051,
-        64140, 64224, 64302, 64376, 64446, 64512, 64573, 64631,
-        64686, 64737, 64785, 64830, 64873, 64913, 64950, 64986,
-        65019, 65050, 65079, 65107, 65133, 65157, 65180, 65202,
-        65222, 65241, 65259, 65275, 65291, 65306, 65320, 65333,
-        65345, 65357, 65368, 65378, 65387, 65396, 65405, 65413,
-        65420, 65427, 65434, 65440, 65446, 65451, 65456, 65461,
-        65466, 65470, 65474, 65478, 65481, 65485, 65488, 65491,
-        65535, /* Final value is for the -128 corner case, see below. */
-    };
-    int c[DCA_XLL_AORDER_MAX];
-    int64_t s;
-    unsigned i, j;
-
-    for (i = 0; i < order; i++) {
-        if (q_ind[i] & 1)
-            /* The index value 0xff corresponds to a lookup of entry 0x80 in
-             * the table, and no value is provided in the specification. */
-            c[i] = -table[(q_ind[i] >> 1) + 1];
-        else
-            c[i] = table[q_ind[i] >> 1];
-    }
-    /* The description in the spec is a bit convoluted. We can convert
-     * the reflected values to direct values in place, using a
-     * sequence of reflections operating on two values. */
-    for (i = 1; i < order; i++) {
-        /* i = 1: scale c[0]
-         * i = 2: reflect c[0] <-> c[1]
-         * i = 3: scale c[1], reflect c[0] <-> c[2]
-         * i = 4: reflect c[0] <-> c[3] reflect c[1] <-> c[2]
-         * ... */
-        if (i & 1)
-            c[i / 2] += ((int64_t) c[i] * c[i / 2] + 0x8000) >> 16;
-        for (j = 0; j < i / 2; j++) {
-            int r0 = c[j];
-            int r1 = c[i - j - 1];
-            c[j]         += ((int64_t) c[i] * r1 + 0x8000) >> 16;
-            c[i - j - 1] += ((int64_t) c[i] * r0 + 0x8000) >> 16;
-        }
-    }
-    /* Apply predictor. */
-    /* NOTE: Processing samples in this order means that the
-     * predictor is applied to the newly reconstructed samples. */
-    if (prev) {
-        for (i = 0; i < order; i++) {
-            for (j = s = 0; j < i; j++)
-                s += (int64_t) c[j] * samples[i - 1 - j];
-            for (; j < order; j++)
-                s += (int64_t) c[j] * prev[DCA_XLL_AORDER_MAX + i - 1 - j];
-
-            samples[i] -= av_clip((s + 0x8000) >> 16, -0x1000000, 0xffffff);
-        }
-    }
-    for (i = order; i < nsamples; i++) {
-        for (j = s = 0; j < order; j++)
-            s += (int64_t) c[j] * samples[i - 1 - j];
-
-        /* NOTE: Equations seem to imply addition, while the
-         * pseudocode seems to use subtraction.*/
-        samples[i] -= av_clip((s + 0x8000) >> 16, -0x1000000, 0xffffff);
-    }
-}
-
-int ff_dca_xll_decode_audio(DCAContext *s, AVFrame *frame)
-{
-    /* FIXME: Decodes only the first frequency band. */
-    int seg, chset_i;
-
-    /* Coding parameters for each channel set. */
-    struct coding_params {
-        int seg_type;
-        int rice_code_flag[16];
-        int pancAuxABIT[16];
-        int pancABIT0[16];  /* Not sure what this is */
-        int pancABIT[16];   /* Not sure what this is */
-        int nSamplPart0[16];
-    } param_state[16];
-
-    GetBitContext *gb = &s->xll_navi.gb;
-    int *history;
-
-    /* Layout: First the sample buffer for one segment per channel,
-     * followed by history buffers of DCA_XLL_AORDER_MAX samples for
-     * each channel. */
-    av_fast_malloc(&s->xll_sample_buf, &s->xll_sample_buf_size,
-                   (s->xll_smpl_in_seg + DCA_XLL_AORDER_MAX) *
-                   s->xll_channels * sizeof(*s->xll_sample_buf));
-    if (!s->xll_sample_buf)
+static int chs_alloc_lsb_band_data(DCAXllDecoder *s, DCAXllChSet *c)
+{
+    int i, j, nsamples = 0;
+    int32_t *ptr;
+
+    // Determine number of frequency bands that have MSB/LSB split
+    for (i = 0; i < c->nfreqbands; i++)
+        if (c->bands[i].lsb_section_size)
+            nsamples += s->nframesamples * c->nchannels;
+    if (!nsamples)
+        return 0;
+
+    // Reallocate LSB sample buffer
+    av_fast_malloc(&c->sample_buffer[1], &c->sample_size[1], nsamples * sizeof(int32_t));
+    if (!c->sample_buffer[1])
         return AVERROR(ENOMEM);
 
-    history = s->xll_sample_buf + s->xll_smpl_in_seg * s->xll_channels;
-
-    for (seg = 0; seg < s->xll_segments; seg++) {
-        unsigned in_channel;
-
-        for (chset_i = in_channel = 0; chset_i < s->xll_nch_sets; chset_i++) {
-            /* The spec isn't very explicit, but I think the NAVI sizes are in bytes. */
-            int end_pos = get_bits_count(gb) +
-                          8 * s->xll_navi.chset_size[0][seg][chset_i];
-            int i, j;
-            struct coding_params *params = &param_state[chset_i];
-            /* I think this flag means that we should keep seg_type and
-             * other parameters from the previous segment. */
-            int use_seg_state_code_param;
-            XllChSetSubHeader *chset = &s->xll_chsets[chset_i];
-            if (in_channel >= s->avctx->channels)
-                /* FIXME: Could go directly to next segment */
-                goto next_chset;
-
-            if (s->avctx->sample_rate != chset->sampling_frequency) {
-                av_log(s->avctx, AV_LOG_WARNING,
-                       "XLL: unexpected chset sample rate %d, expected %d\n",
-                       chset->sampling_frequency, s->avctx->sample_rate);
-                goto next_chset;
+    ptr = c->sample_buffer[1];
+    for (i = 0; i < c->nfreqbands; i++) {
+        if (c->bands[i].lsb_section_size) {
+            for (j = 0; j < c->nchannels; j++) {
+                c->bands[i].lsb_sample_buffer[j] = ptr;
+                ptr += s->nframesamples;
             }
-            if (seg != 0)
-                use_seg_state_code_param = get_bits(gb, 1);
+        } else {
+            for (j = 0; j < c->nchannels; j++)
+                c->bands[i].lsb_sample_buffer[j] = NULL;
+        }
+    }
+
+    return 0;
+}
+
+static int chs_parse_band_data(DCAXllDecoder *s, DCAXllChSet *c, int band, int seg, int band_data_end)
+{
+    DCAXllBand *b = &c->bands[band];
+    int i, j, k;
+
+    // Start unpacking MSB portion of the segment
+    if (!(seg && get_bits1(&s->gb))) {
+        // Unpack segment type
+        // 0 - distinct coding parameters for each channel
+        // 1 - common coding parameters for all channels
+        c->seg_common = get_bits1(&s->gb);
+
+        // Determine number of coding parameters encoded in segment
+        k = c->seg_common ? 1 : c->nchannels;
+
+        // Unpack Rice coding parameters
+        for (i = 0; i < k; i++) {
+            // Unpack Rice coding flag
+            // 0 - linear code, 1 - Rice code
+            c->rice_code_flag[i] = get_bits1(&s->gb);
+            // Unpack Hybrid Rice coding flag
+            // 0 - Rice code, 1 - Hybrid Rice code
+            if (!c->seg_common && c->rice_code_flag[i] && get_bits1(&s->gb))
+                // Unpack binary code length for isolated samples
+                c->bitalloc_hybrid_linear[i] = get_bits(&s->gb, c->nabits) + 1;
             else
-                use_seg_state_code_param = 0;
+                // 0 indicates no Hybrid Rice coding
+                c->bitalloc_hybrid_linear[i] = 0;
+        }
 
-            if (!use_seg_state_code_param) {
-                int num_param_sets, i;
-                unsigned bits4ABIT;
+        // Unpack coding parameters
+        for (i = 0; i < k; i++) {
+            if (seg == 0) {
+                // Unpack coding parameter for part A of segment 0
+                c->bitalloc_part_a[i] = get_bits(&s->gb, c->nabits);
 
-                params->seg_type = get_bits(gb, 1);
-                num_param_sets   = params->seg_type ? 1 : chset->channels;
+                // Adjust for the linear code
+                if (!c->rice_code_flag[i] && c->bitalloc_part_a[i])
+                    c->bitalloc_part_a[i]++;
 
-                if (chset->bit_width > 16) {
-                    bits4ABIT = 5;
-                } else {
-                    if (chset->bit_width > 8)
-                        bits4ABIT = 4;
-                    else
-                        bits4ABIT = 3;
-                    if (s->xll_nch_sets > 1)
-                        bits4ABIT++;
+                if (!c->seg_common)
+                    c->nsamples_part_a[i] = b->adapt_pred_order[i];
+                else
+                    c->nsamples_part_a[i] = b->highest_pred_order;
+            } else {
+                c->bitalloc_part_a[i] = 0;
+                c->nsamples_part_a[i] = 0;
+            }
+
+            // Unpack coding parameter for part B of segment
+            c->bitalloc_part_b[i] = get_bits(&s->gb, c->nabits);
+
+            // Adjust for the linear code
+            if (!c->rice_code_flag[i] && c->bitalloc_part_b[i])
+                c->bitalloc_part_b[i]++;
+        }
+    }
+
+    // Unpack entropy codes
+    for (i = 0; i < c->nchannels; i++) {
+        int32_t *part_a, *part_b;
+        int nsamples_part_b;
+
+        // Select index of coding parameters
+        k = c->seg_common ? 0 : i;
+
+        // Slice the segment into parts A and B
+        part_a = b->msb_sample_buffer[i] + seg * s->nsegsamples;
+        part_b = part_a + c->nsamples_part_a[k];
+        nsamples_part_b = s->nsegsamples - c->nsamples_part_a[k];
+
+        if (get_bits_left(&s->gb) < 0)
+            return AVERROR_INVALIDDATA;
+
+        if (!c->rice_code_flag[k]) {
+            // Linear codes
+            // Unpack all residuals of part A of segment 0
+            get_linear_array(&s->gb, part_a, c->nsamples_part_a[k],
+                             c->bitalloc_part_a[k]);
+
+            // Unpack all residuals of part B of segment 0 and others
+            get_linear_array(&s->gb, part_b, nsamples_part_b,
+                             c->bitalloc_part_b[k]);
+        } else {
+            // Rice codes
+            // Unpack all residuals of part A of segment 0
+            get_rice_array(&s->gb, part_a, c->nsamples_part_a[k],
+                           c->bitalloc_part_a[k]);
+
+            if (c->bitalloc_hybrid_linear[k]) {
+                // Hybrid Rice codes
+                // Unpack the number of isolated samples
+                int nisosamples = get_bits(&s->gb, s->nsegsamples_log2);
+
+                // Set all locations to 0
+                memset(part_b, 0, sizeof(*part_b) * nsamples_part_b);
+
+                // Extract the locations of isolated samples and flag by -1
+                for (j = 0; j < nisosamples; j++) {
+                    int loc = get_bits(&s->gb, s->nsegsamples_log2);
+                    if (loc >= nsamples_part_b) {
+                        av_log(s->avctx, AV_LOG_ERROR, "Invalid isolated sample location\n");
+                        return AVERROR_INVALIDDATA;
+                    }
+                    part_b[loc] = -1;
                 }
 
-                for (i = 0; i < num_param_sets; i++) {
-                    params->rice_code_flag[i] = get_bits(gb, 1);
-                    if (!params->seg_type && params->rice_code_flag[i] && get_bits(gb, 1))
-                        params->pancAuxABIT[i] = get_bits(gb, bits4ABIT) + 1;
+                // Unpack all residuals of part B of segment 0 and others
+                for (j = 0; j < nsamples_part_b; j++) {
+                    if (part_b[j])
+                        part_b[j] = get_linear(&s->gb, c->bitalloc_hybrid_linear[k]);
                     else
-                        params->pancAuxABIT[i] = 0;
+                        part_b[j] = get_rice(&s->gb, c->bitalloc_part_b[k]);
                 }
+            } else {
+                // Rice codes
+                // Unpack all residuals of part B of segment 0 and others
+                get_rice_array(&s->gb, part_b, nsamples_part_b, c->bitalloc_part_b[k]);
+            }
+        }
+    }
 
-                for (i = 0; i < num_param_sets; i++) {
-                    if (!seg) {
-                        /* Parameters for part 1 */
-                        params->pancABIT0[i] = get_bits(gb, bits4ABIT);
-                        if (params->rice_code_flag[i] == 0 && params->pancABIT0[i] > 0)
-                            /* For linear code */
-                            params->pancABIT0[i]++;
-
-                        /* NOTE: In the spec, not indexed by band??? */
-                        if (params->seg_type == 0)
-                            params->nSamplPart0[i] = chset->adapt_order[0][i];
-                        else
-                            params->nSamplPart0[i] = chset->adapt_order_max[0];
-                    } else
-                        params->nSamplPart0[i] = 0;
-
-                    /* Parameters for part 2 */
-                    params->pancABIT[i] = get_bits(gb, bits4ABIT);
-                    if (params->rice_code_flag[i] == 0 && params->pancABIT[i] > 0)
-                        /* For linear code */
-                        params->pancABIT[i]++;
-                }
+    // Unpack decimator history for frequency band 1
+    if (seg == 0 && band == 1) {
+        int nbits = get_bits(&s->gb, 5) + 1;
+        for (i = 0; i < c->nchannels; i++)
+            for (j = 1; j < DCA_XLL_DECI_HISTORY_MAX; j++)
+                c->deci_history[i][j] = get_sbits_long(&s->gb, nbits);
+    }
+
+    // Start unpacking LSB portion of the segment
+    if (b->lsb_section_size) {
+        // Skip to the start of LSB portion
+        if (ff_dca_seek_bits(&s->gb, band_data_end - b->lsb_section_size * 8)) {
+            av_log(s->avctx, AV_LOG_ERROR, "Read past end of XLL band data\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        // Unpack all LSB parts of residuals of this segment
+        for (i = 0; i < c->nchannels; i++) {
+            if (b->nscalablelsbs[i]) {
+                get_array(&s->gb,
+                          b->lsb_sample_buffer[i] + seg * s->nsegsamples,
+                          s->nsegsamples, b->nscalablelsbs[i]);
             }
-            for (i = 0; i < chset->channels; i++) {
-                int param_index = params->seg_type ? 0 : i;
-                int part0       = params->nSamplPart0[param_index];
-                int bits        = part0 ? params->pancABIT0[param_index] : 0;
-                int *sample_buf = s->xll_sample_buf +
-                                  (in_channel + i) * s->xll_smpl_in_seg;
-
-                if (!params->rice_code_flag[param_index]) {
-                    /* Linear code */
-                    if (bits)
-                        for (j = 0; j < part0; j++)
-                            sample_buf[j] = get_bits_sm(gb, bits);
-                    else
-                        memset(sample_buf, 0, part0 * sizeof(sample_buf[0]));
+        }
+    }
 
-                    /* Second part */
-                    bits = params->pancABIT[param_index];
-                    if (bits)
-                        for (j = part0; j < s->xll_smpl_in_seg; j++)
-                            sample_buf[j] = get_bits_sm(gb, bits);
-                    else
-                        memset(sample_buf + part0, 0,
-                               (s->xll_smpl_in_seg - part0) * sizeof(sample_buf[0]));
-                } else {
-                    int aux_bits = params->pancAuxABIT[param_index];
-
-                    for (j = 0; j < part0; j++) {
-                        /* FIXME: Is this identical to Golomb code? */
-                        int t = get_unary(gb, 1, 33) << bits;
-                        /* FIXME: Could move this test outside of the loop, for efficiency. */
-                        if (bits)
-                            t |= get_bits(gb, bits);
-                        sample_buf[j] = (t & 1) ? -(t >> 1) - 1 : (t >> 1);
-                    }
+    // Skip to the end of band data
+    if (ff_dca_seek_bits(&s->gb, band_data_end)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Read past end of XLL band data\n");
+        return AVERROR_INVALIDDATA;
+    }
 
-                    /* Second part */
-                    bits = params->pancABIT[param_index];
-
-                    /* Follow the spec's suggestion of using the
-                     * buffer also to store the hybrid-rice flags. */
-                    memset(sample_buf + part0, 0,
-                           (s->xll_smpl_in_seg - part0) * sizeof(sample_buf[0]));
-
-                    if (aux_bits > 0) {
-                        /* For hybrid rice encoding, some samples are linearly
-                         * coded. According to the spec, "nBits4SamplLoci" bits
-                         * are used for each index, but this value is not
-                         * defined. I guess we should use log2(xll_smpl_in_seg)
-                         * bits. */
-                        int count = get_bits(gb, s->xll_log_smpl_in_seg);
-                        av_log(s->avctx, AV_LOG_DEBUG, "aux count %d (bits %d)\n",
-                               count, s->xll_log_smpl_in_seg);
-
-                        for (j = 0; j < count; j++)
-                            sample_buf[get_bits(gb, s->xll_log_smpl_in_seg)] = 1;
-                    }
-                    for (j = part0; j < s->xll_smpl_in_seg; j++) {
-                        if (!sample_buf[j]) {
-                            int t = get_unary(gb, 1, 33);
-                            if (bits)
-                                t = (t << bits) | get_bits(gb, bits);
-                            sample_buf[j] = (t & 1) ? -(t >> 1) - 1 : (t >> 1);
-                        } else
-                            sample_buf[j] = get_bits_sm(gb, aux_bits);
-                    }
+    return 0;
+}
+
+static av_cold void chs_clear_band_data(DCAXllDecoder *s, DCAXllChSet *c, int band, int seg)
+{
+    DCAXllBand *b = &c->bands[band];
+    int i, offset, nsamples;
+    // Zero the sample buffers of one band: the whole frame if seg < 0, else segment seg
+    if (seg < 0) {
+        offset = 0;
+        nsamples = s->nframesamples;    // span every sample of the frame
+    } else {
+        offset = seg * s->nsegsamples;  // start of the requested segment
+        nsamples = s->nsegsamples;      // span exactly one segment
+    }
+
+    for (i = 0; i < c->nchannels; i++) {
+        memset(b->msb_sample_buffer[i] + offset, 0, nsamples * sizeof(int32_t));
+        if (b->lsb_section_size)    // LSB buffer is only touched when the band has an LSB section
+            memset(b->lsb_sample_buffer[i] + offset, 0, nsamples * sizeof(int32_t));
+    }
+
+    if (seg <= 0 && band)   // whole-frame or first-segment clear of a non-zero band
+        memset(c->deci_history, 0, sizeof(c->deci_history));
+
+    if (seg < 0) {          // whole-frame clear also resets the per-channel LSB parameters
+        memset(b->nscalablelsbs, 0, sizeof(b->nscalablelsbs));
+        memset(b->bit_width_adjust, 0, sizeof(b->bit_width_adjust));
+    }
+}
+
+static void chs_filter_band_data(DCAXllDecoder *s, DCAXllChSet *c, int band)
+{
+    DCAXllBand *b = &c->bands[band];
+    int nsamples = s->nframesamples;
+    int i, j, k;
+
+    // Inverse adaptive or fixed prediction
+    for (i = 0; i < c->nchannels; i++) {
+        int32_t *buf = b->msb_sample_buffer[i];
+        int order = b->adapt_pred_order[i];
+        if (order > 0) {
+            int coeff[DCA_XLL_ADAPT_PRED_ORDER_MAX];
+            // Conversion from reflection coefficients to direct form coefficients
+            for (j = 0; j < order; j++) {
+                int rc = b->adapt_refl_coeff[i][j];
+                for (k = 0; k < (j + 1) / 2; k++) {
+                    int tmp1 = coeff[    k    ];
+                    int tmp2 = coeff[j - k - 1];
+                    coeff[    k    ] = tmp1 + mul16(rc, tmp2);
+                    coeff[j - k - 1] = tmp2 + mul16(rc, tmp1);
                 }
+                coeff[j] = rc;
+            }
+            // Inverse adaptive prediction
+            for (j = 0; j < nsamples - order; j++) {
+                int64_t err = 0;
+                for (k = 0; k < order; k++)
+                    err += (int64_t)buf[j + k] * coeff[order - k - 1];
+                buf[j + k] -= clip23(norm16(err));
             }
+        } else {
+            // Inverse fixed coefficient prediction
+            for (j = 0; j < b->fixed_pred_order[i]; j++)
+                for (k = 1; k < nsamples; k++)
+                    buf[k] += buf[k - 1];
+        }
+    }
 
-            for (i = 0; i < chset->channels; i++) {
-                unsigned adapt_order = chset->adapt_order[0][i];
-                int *sample_buf = s->xll_sample_buf +
-                                  (in_channel + i) * s->xll_smpl_in_seg;
-                int *prev = history + (in_channel + i) * DCA_XLL_AORDER_MAX;
-
-                if (!adapt_order) {
-                    unsigned order;
-                    for (order = chset->fixed_order[0][i]; order > 0; order--) {
-                        unsigned j;
-                        for (j = 1; j < s->xll_smpl_in_seg; j++)
-                            sample_buf[j] += sample_buf[j - 1];
-                    }
-                } else
-                    /* Inverse adaptive prediction, in place. */
-                    dca_xll_inv_adapt_pred(sample_buf, s->xll_smpl_in_seg,
-                                           adapt_order, seg ? prev : NULL,
-                                           chset->lpc_refl_coeffs_q_ind[0][i]);
-                memcpy(prev, sample_buf + s->xll_smpl_in_seg - DCA_XLL_AORDER_MAX,
-                       DCA_XLL_AORDER_MAX * sizeof(*prev));
+    // Inverse pairwise channel decorrelation
+    if (b->decor_enabled) {
+        int32_t *tmp[DCA_XLL_CHANNELS_MAX];
+
+        for (i = 0; i < c->nchannels / 2; i++) {
+            int coeff = b->decor_coeff[i];
+            if (coeff) {
+                s->dcadsp->decor(b->msb_sample_buffer[i * 2 + 1],
+                                 b->msb_sample_buffer[i * 2    ],
+                                 coeff, nsamples);
             }
-            for (i = 1; i < chset->channels; i += 2) {
-                int coeff = chset->pw_ch_pairs_coeffs[0][i / 2];
-                if (coeff != 0) {
-                    int *sample_buf = s->xll_sample_buf +
-                                      (in_channel + i) * s->xll_smpl_in_seg;
-                    int *prev = sample_buf - s->xll_smpl_in_seg;
-                    unsigned j;
-                    for (j = 0; j < s->xll_smpl_in_seg; j++)
-                        /* Shift is unspecified, but should apparently be 3. */
-                        sample_buf[j] += ((int64_t) coeff * prev[j] + 4) >> 3;
-                }
+        }
+
+        // Reorder channel pointers to the original order
+        for (i = 0; i < c->nchannels; i++)
+            tmp[i] = b->msb_sample_buffer[i];
+
+        for (i = 0; i < c->nchannels; i++)
+            b->msb_sample_buffer[b->orig_order[i]] = tmp[i];
+    }
+
+    // Map output channel pointers for frequency band 0
+    if (c->nfreqbands == 1)
+        for (i = 0; i < c->nchannels; i++)
+            s->output_samples[c->ch_remap[i]] = b->msb_sample_buffer[i];
+}
+
+static int chs_get_lsb_width(DCAXllDecoder *s, DCAXllChSet *c, int band, int ch)
+{
+    const DCAXllBand *b = &c->bands[band];
+    int nlsbs = b->nscalablelsbs[ch];
+    int adjust = b->bit_width_adjust[ch];
+
+    // A non-zero stream-wide fixed LSB width overrides per-channel values
+    if (s->fixed_lsb_width)
+        return s->fixed_lsb_width;
+    // The adjustment counts one bit less when both values are non-zero
+    if (nlsbs && adjust)
+        return nlsbs + adjust - 1;
+    return nlsbs + adjust;
+}
+
+static void chs_assemble_msbs_lsbs(DCAXllDecoder *s, DCAXllChSet *c, int band)
+{ // Shift the MSB samples of one band up in place and merge in the LSB part
+    DCAXllBand *b = &c->bands[band];
+    int n, ch, nsamples = s->nframesamples;
+
+    for (ch = 0; ch < c->nchannels; ch++) {
+        int shift = chs_get_lsb_width(s, c, band, ch);
+        if (shift) { // zero shift leaves the channel untouched
+            int32_t *msb = b->msb_sample_buffer[ch];
+            if (b->nscalablelsbs[ch]) {
+                int32_t *lsb = b->lsb_sample_buffer[ch];
+                int adj = b->bit_width_adjust[ch];
+                for (n = 0; n < nsamples; n++)
+                    msb[n] = msb[n] * (1 << shift) + (lsb[n] << adj);
+            } else { // no LSB data for this channel: only scale the MSB part
+                for (n = 0; n < nsamples; n++)
+                    msb[n] = msb[n] * (1 << shift);
             }
+        }
+    }
+}
 
-            if (s->xll_scalable_lsb) {
-                int lsb_start = end_pos - 8 * chset->lsb_fsize[0] -
-                                8 * (s->xll_banddata_crc & 2);
-                int done;
-                i = get_bits_count(gb);
-                if (i > lsb_start) {
-                    av_log(s->avctx, AV_LOG_ERROR,
-                           "chset data lsb exceeds NAVI size, end_pos %d, lsb_start %d, pos %d\n",
-                           end_pos, lsb_start, i);
-                    return AVERROR_INVALIDDATA;
-                }
-                if (i < lsb_start)
-                    skip_bits_long(gb, lsb_start - i);
-
-                for (i = done = 0; i < chset->channels; i++) {
-                    int bits = chset->scalable_lsbs[0][i];
-                    if (bits > 0) {
-                        /* The channel reordering is conceptually done
-                         * before adding the lsb:s, so we need to do
-                         * the inverse permutation here. */
-                        unsigned pi = chset->orig_chan_order_inv[0][i];
-                        int *sample_buf = s->xll_sample_buf +
-                                          (in_channel + pi) * s->xll_smpl_in_seg;
-                        int adj = chset->bit_width_adj_per_ch[0][i];
-                        int msb_shift = bits;
-                        unsigned j;
-
-                        if (adj > 0)
-                            msb_shift += adj - 1;
-
-                        for (j = 0; j < s->xll_smpl_in_seg; j++)
-                            sample_buf[j] = (sample_buf[j] << msb_shift) +
-                                            (get_bits(gb, bits) << adj);
-
-                        done += bits * s->xll_smpl_in_seg;
+static int chs_assemble_freq_bands(DCAXllDecoder *s, DCAXllChSet *c)
+{
+    const int nsamples = s->nframesamples;
+    int32_t *out;
+    int ch;
+
+    av_assert1(c->nfreqbands > 1);
+
+    // Grow the assembly buffer to hold 2 * nsamples values per channel
+    av_fast_malloc(&c->sample_buffer[2], &c->sample_size[2],
+                   2 * nsamples * c->nchannels * sizeof(int32_t));
+    out = c->sample_buffer[2];
+    if (!out)
+        return AVERROR(ENOMEM);
+
+    // Combine bands 0 and 1 of every channel into the assembly buffer
+    for (ch = 0; ch < c->nchannels; ch++, out += nsamples * 2) {
+        int32_t *lo = c->bands[0].msb_sample_buffer[ch];
+        int32_t *hi = c->bands[1].msb_sample_buffer[ch];
+
+        // Place saved decimator history right before band 0 samples
+        memcpy(lo - DCA_XLL_DECI_HISTORY_MAX,
+               c->deci_history[ch], sizeof(c->deci_history[0]));
+
+        // Run the band assembly filter
+        s->dcadsp->assemble_freq_bands(out, lo, hi,
+                                       ff_dca_xll_band_coeff,
+                                       nsamples);
+
+        // Point the output channel at its slice of the assembly buffer
+        s->output_samples[c->ch_remap[ch]] = out;
+    }
+
+    return 0;
+}
+
+static int parse_common_header(DCAXllDecoder *s)
+{ // Parse the XLL common header from s->gb; returns 0 or a negative AVERROR code
+    int stream_ver, header_size, frame_size_nbits, nframesegs_log2;
+
+    // XLL extension sync word
+    if (get_bits_long(&s->gb, 32) != DCA_SYNCWORD_XLL) {
+        av_log(s->avctx, AV_LOG_VERBOSE, "Invalid XLL sync word\n");
+        return AVERROR(EAGAIN); // parse_frame_no_pbr() reacts to this code by trying to resync
+    }
+
+    // Version number
+    stream_ver = get_bits(&s->gb, 4) + 1;
+    if (stream_ver > 1) {
+        avpriv_request_sample(s->avctx, "XLL stream version %d", stream_ver);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    // Lossless frame header length
+    header_size = get_bits(&s->gb, 8) + 1;
+
+    // Check CRC
+    if (ff_dca_check_crc(s->avctx, &s->gb, 32, header_size * 8)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid XLL common header checksum\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Number of bits used to read frame size
+    frame_size_nbits = get_bits(&s->gb, 5) + 1;
+
+    // Number of bytes in a lossless frame
+    s->frame_size = get_bits_long(&s->gb, frame_size_nbits);
+    if (s->frame_size < 0 || s->frame_size >= DCA_XLL_PBR_BUFFER_MAX) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid XLL frame size (%d bytes)\n", s->frame_size);
+        return AVERROR_INVALIDDATA;
+    }
+    s->frame_size++; // the value read above is one less than the size kept here
+
+    // Number of channels sets per frame
+    s->nchsets = get_bits(&s->gb, 4) + 1;
+    if (s->nchsets > DCA_XLL_CHSETS_MAX) {
+        avpriv_request_sample(s->avctx, "%d XLL channel sets", s->nchsets);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    // Number of segments per frame
+    nframesegs_log2 = get_bits(&s->gb, 4);
+    s->nframesegs = 1 << nframesegs_log2;
+    if (s->nframesegs > 1024) {
+        av_log(s->avctx, AV_LOG_ERROR, "Too many segments per XLL frame\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Samples in segment per one frequency band for the first channel set
+    // Maximum value is 256 for sampling frequencies <= 48 kHz
+    // Maximum value is 512 for sampling frequencies > 48 kHz
+    s->nsegsamples_log2 = get_bits(&s->gb, 4);
+    if (!s->nsegsamples_log2) {
+        av_log(s->avctx, AV_LOG_ERROR, "Too few samples per XLL segment\n");
+        return AVERROR_INVALIDDATA;
+    }
+    s->nsegsamples = 1 << s->nsegsamples_log2;
+    if (s->nsegsamples > 512) {
+        av_log(s->avctx, AV_LOG_ERROR, "Too many samples per XLL segment\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Samples in frame per one frequency band for the first channel set
+    s->nframesamples_log2 = s->nsegsamples_log2 + nframesegs_log2;
+    s->nframesamples = 1 << s->nframesamples_log2;
+    if (s->nframesamples > 65536) {
+        av_log(s->avctx, AV_LOG_ERROR, "Too many samples per XLL frame\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Number of bits used to read segment size
+    s->seg_size_nbits = get_bits(&s->gb, 5) + 1;
+
+    // Presence of CRC16 within each frequency band
+    // 0 - No CRC16 within band
+    // 1 - CRC16 placed at the end of MSB0
+    // 2 - CRC16 placed at the end of MSB0 and LSB0
+    // 3 - CRC16 placed at the end of MSB0 and LSB0 and other frequency bands
+    s->band_crc_present = get_bits(&s->gb, 2);
+
+    // MSB/LSB split flag
+    s->scalable_lsbs = get_bits1(&s->gb);
+
+    // Channel position mask
+    s->ch_mask_nbits = get_bits(&s->gb, 5) + 1;
+
+    // Fixed LSB width
+    if (s->scalable_lsbs)
+        s->fixed_lsb_width = get_bits(&s->gb, 4);
+    else
+        s->fixed_lsb_width = 0; // field is only read when the split flag is set
+
+    // Reserved
+    // Byte align
+    // Header CRC16 protection
+    if (ff_dca_seek_bits(&s->gb, header_size * 8)) { // move to the end of the header as given by header_size
+        av_log(s->avctx, AV_LOG_ERROR, "Read past end of XLL common header\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
+static int is_hier_dmix_chset(DCAXllChSet *c)
+{   // Non-zero only for a non-primary hierarchical set with embedded downmix
+    return c->hier_chset && c->dmix_embedded && !c->primary_chset;
+}
+
+static DCAXllChSet *find_next_hier_dmix_chset(DCAXllDecoder *s, DCAXllChSet *c)
+{
+    DCAXllChSet *next, *end = &s->chset[s->nchsets];
+    // A non-hierarchical set never has a successor; otherwise scan forward
+    for (next = c + 1; c->hier_chset && next < end; next++)
+        if (is_hier_dmix_chset(next))
+            return next;
+    return NULL;
+}
+
+static void prescale_down_mix(DCAXllChSet *c, DCAXllChSet *o)
+{
+    int n, ch, *coeff = c->dmix_coeff; // coeff walks c->dmix_coeff in storage order
+
+    for (n = 0; n < c->hier_ofs; n++) {
+        const int o_scale = o->dmix_scale[n];
+        const int o_scale_inv = o->dmix_scale_inv[n];
+        c->dmix_scale[n] = mul15(c->dmix_scale[n], o_scale);
+        c->dmix_scale_inv[n] = mul16(c->dmix_scale_inv[n], o_scale_inv);
+        for (ch = 0; ch < c->nchannels; ch++, coeff++) {
+            int tmp = mul16(*coeff, o_scale_inv);
+            *coeff = mul15(tmp, o->dmix_scale[c->hier_ofs + ch]);
+        }
+    }
+}
+
+static int parse_sub_headers(DCAXllDecoder *s, DCAExssAsset *asset)
+{ // Parse all channel set headers and derive per-frame totals; returns 0 or a negative error
+    DCAContext *dca = s->avctx->priv_data;
+    DCAXllChSet *c;
+    int i, ret;
+
+    // Parse channel set headers
+    s->nfreqbands = 0;
+    s->nchannels = 0;
+    s->nreschsets = 0;
+    for (i = 0, c = s->chset; i < s->nchsets; i++, c++) {
+        c->hier_ofs = s->nchannels; // channels of hierarchical sets seen before this one
+        if ((ret = chs_parse_header(s, c, asset)) < 0)
+            return ret;
+        if (c->nfreqbands > s->nfreqbands)
+            s->nfreqbands = c->nfreqbands; // track the largest band count of any set
+        if (c->hier_chset)
+            s->nchannels += c->nchannels;
+        if (c->residual_encode != (1 << c->nchannels) - 1) // not every per-channel bit is set
+            s->nreschsets++;
+    }
+
+    // Pre-scale downmixing coefficients for all non-primary channel sets
+    for (i = s->nchsets - 1, c = &s->chset[i]; i > 0; i--, c--) { // walks from the last set down to index 1
+        if (is_hier_dmix_chset(c)) {
+            DCAXllChSet *o = find_next_hier_dmix_chset(s, c);
+            if (o)
+                prescale_down_mix(c, o);
+        }
+    }
+
+    // Determine number of active channel sets to decode
+    switch (dca->request_channel_layout) {
+    case DCA_SPEAKER_LAYOUT_STEREO:
+        s->nactivechsets = 1;
+        break;
+    case DCA_SPEAKER_LAYOUT_5POINT0:
+    case DCA_SPEAKER_LAYOUT_5POINT1:
+        s->nactivechsets = (s->chset[0].nchannels < 5 && s->nchsets > 1) ? 2 : 1;
+        break;
+    default: // any other request value decodes every channel set
+        s->nactivechsets = s->nchsets;
+        break;
+    }
+
+    return 0;
+}
+
+static int parse_navi_table(DCAXllDecoder *s)
+{ // Read the NAVI table of segment sizes into s->navi; returns 0 or a negative error
+    int chs, seg, band, navi_nb, navi_pos, *navi_ptr;
+    DCAXllChSet *c;
+
+    // Determine size of NAVI table
+    navi_nb = s->nfreqbands * s->nframesegs * s->nchsets;
+    if (navi_nb > 1024) {
+        av_log(s->avctx, AV_LOG_ERROR, "Too many NAVI entries (%d)\n", navi_nb);
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Reallocate NAVI table
+    av_fast_malloc(&s->navi, &s->navi_size, navi_nb * sizeof(*s->navi));
+    if (!s->navi)
+        return AVERROR(ENOMEM);
+
+    // Parse NAVI
+    navi_pos = get_bits_count(&s->gb); // start of the table, used for the CRC check below
+    navi_ptr = s->navi;
+    for (band = 0; band < s->nfreqbands; band++) {
+        for (seg = 0; seg < s->nframesegs; seg++) {
+            for (chs = 0, c = s->chset; chs < s->nchsets; chs++, c++) {
+                int size = 0; // entry stays 0 when the set lacks this band
+                if (c->nfreqbands > band) {
+                    size = get_bits_long(&s->gb, s->seg_size_nbits);
+                    if (size < 0 || size >= s->frame_size) {
+                        av_log(s->avctx, AV_LOG_ERROR, "Invalid NAVI segment size (%d bytes)\n", size);
+                        return AVERROR_INVALIDDATA;
                     }
+                    size++; // the value read above is one less than the size stored
                 }
-                if (done > 8 * chset->lsb_fsize[0]) {
-                    av_log(s->avctx, AV_LOG_ERROR,
-                           "chset lsb exceeds lsb_size\n");
-                    return AVERROR_INVALIDDATA;
-                }
+                *navi_ptr++ = size;
             }
+        }
+    }
+
+    // Byte align
+    // CRC16
+    skip_bits(&s->gb, -get_bits_count(&s->gb) & 7); // bits left until the next multiple of 8
+    skip_bits(&s->gb, 16);
+
+    // Check CRC
+    if (ff_dca_check_crc(s->avctx, &s->gb, navi_pos, get_bits_count(&s->gb))) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid NAVI checksum\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
+static int parse_band_data(DCAXllDecoder *s)
+{ // Walk the NAVI table and parse band data of the active channel sets; returns 0 or a negative error
+    int ret, chs, seg, band, navi_pos, *navi_ptr;
+    DCAXllChSet *c;
+
+    for (chs = 0, c = s->chset; chs < s->nactivechsets; chs++, c++) { // buffers are set up for active sets only
+        if ((ret = chs_alloc_msb_band_data(s, c)) < 0)
+            return ret;
+        if ((ret = chs_alloc_lsb_band_data(s, c)) < 0)
+            return ret;
+    }
 
-            /* Store output. */
-            for (i = 0; i < chset->channels; i++) {
-                int *sample_buf = s->xll_sample_buf +
-                                  (in_channel + i) * s->xll_smpl_in_seg;
-                int shift = 1 - chset->bit_resolution;
-                int out_channel = chset->orig_chan_order[0][i];
-                float *out;
-
-                /* XLL uses the channel order C, L, R, and we want L,
-                 * R, C. FIXME: Generalize. */
-                if (chset->ch_mask_enabled &&
-                    (chset->ch_mask & 7) == 7 && out_channel < 3)
-                    out_channel = out_channel ? out_channel - 1 : 2;
-
-                out_channel += in_channel;
-                if (out_channel >= s->avctx->channels)
-                    continue;
-
-                out  = (float *) frame->extended_data[out_channel];
-                out += seg * s->xll_smpl_in_seg;
-
-                /* NOTE: A one bit means residual encoding is *not* used. */
-                if ((chset->residual_encode >> i) & 1) {
-                    /* Replace channel samples.
-                     * FIXME: Most likely not the right thing to do. */
-                    for (j = 0; j < s->xll_smpl_in_seg; j++)
-                        out[j] = ldexpf(sample_buf[j], shift);
-                } else {
-                    /* Add residual signal to core channel */
-                    for (j = 0; j < s->xll_smpl_in_seg; j++)
-                        out[j] += ldexpf(sample_buf[j], shift);
+    navi_pos = get_bits_count(&s->gb); // bit position where the first segment starts
+    navi_ptr = s->navi;
+    for (band = 0; band < s->nfreqbands; band++) { // same iteration order as parse_navi_table()
+        for (seg = 0; seg < s->nframesegs; seg++) {
+            for (chs = 0, c = s->chset; chs < s->nchsets; chs++, c++) {
+                if (c->nfreqbands > band) {
+                    navi_pos += *navi_ptr * 8; // NAVI sizes are in bytes; navi_pos now marks the end of this entry
+                    if (navi_pos > s->gb.size_in_bits) {
+                        av_log(s->avctx, AV_LOG_ERROR, "Invalid NAVI position\n");
+                        return AVERROR_INVALIDDATA;
+                    }
+                    if (chs < s->nactivechsets && // inactive sets are skipped without parsing
+                        (ret = chs_parse_band_data(s, c, band, seg, navi_pos)) < 0) {
+                        if (s->avctx->err_recognition & AV_EF_EXPLODE)
+                            return ret;
+                        chs_clear_band_data(s, c, band, seg); // otherwise clear the failed segment and carry on
+                    }
+                    s->gb.index = navi_pos; // jump to the end of this entry regardless of bits consumed
                 }
+                navi_ptr++;
             }
+        }
+    }
 
-            if (chset->downmix_coeff_code_embedded &&
-                !chset->primary_ch_set && chset->hier_chset) {
-                /* Undo hierarchical downmix of earlier channels. */
-                unsigned mix_channel;
-                for (mix_channel = 0; mix_channel < in_channel; mix_channel++) {
-                    float *mix_buf;
-                    const int *col;
-                    float coeff;
-                    unsigned row;
-                    /* Similar channel reorder C, L, R vs L, R, C reorder. */
-                    if (chset->ch_mask_enabled &&
-                        (chset->ch_mask & 7) == 7 && mix_channel < 3)
-                        mix_buf = (float *) frame->extended_data[mix_channel ? mix_channel - 1 : 2];
-                    else
-                        mix_buf = (float *) frame->extended_data[mix_channel];
-
-                    mix_buf += seg * s->xll_smpl_in_seg;
-                    col = &chset->downmix_coeffs[mix_channel * (chset->channels + 1)];
-
-                    /* Scale */
-                    coeff = ldexpf(col[0], -16);
-                    for (j = 0; j < s->xll_smpl_in_seg; j++)
-                        mix_buf[j] *= coeff;
-
-                    for (row = 0;
-                         row < chset->channels && in_channel + row < s->avctx->channels;
-                         row++)
-                        if (col[row + 1]) {
-                            const float *new_channel =
-                                (const float *) frame->extended_data[in_channel + row];
-                            new_channel += seg * s->xll_smpl_in_seg;
-                            coeff        = ldexpf(col[row + 1], -15);
-                            for (j = 0; j < s->xll_smpl_in_seg; j++)
-                                mix_buf[j] -= coeff * new_channel[j];
-                        }
+    return 0;
+}
+
+static int parse_frame(DCAXllDecoder *s, uint8_t *data, int size, DCAExssAsset *asset)
+{
+    int ret = init_get_bits8(&s->gb, data, size);
+    // Parse each part of the frame in stream order, stopping at the first error
+    if (ret >= 0)
+        ret = parse_common_header(s);
+    if (ret >= 0)
+        ret = parse_sub_headers(s, asset);
+    if (ret >= 0)
+        ret = parse_navi_table(s);
+    if (ret >= 0)
+        ret = parse_band_data(s);
+    if (ret < 0)
+        return ret;
+    if (ff_dca_seek_bits(&s->gb, s->frame_size * 8)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Read past end of XLL frame\n");
+        return AVERROR_INVALIDDATA;
+    }
+    return ret;
+}
+
+static void clear_pbr(DCAXllDecoder *s)
+{   // Forget buffered PBR data and any pending decoding delay
+    s->pbr_delay = 0;
+    s->pbr_length = 0;
+}
+
+static int copy_to_pbr(DCAXllDecoder *s, uint8_t *data, int size, int delay)
+{
+    if (size > DCA_XLL_PBR_BUFFER_MAX)
+        return AVERROR(ENOSPC);
+    if (!s->pbr_buffer) // allocated on first use, with input padding appended
+        s->pbr_buffer = av_malloc(DCA_XLL_PBR_BUFFER_MAX + AV_INPUT_BUFFER_PADDING_SIZE);
+    if (!s->pbr_buffer)
+        return AVERROR(ENOMEM);
+    memcpy(s->pbr_buffer, data, size); // replaces whatever the buffer held before
+    s->pbr_delay = delay;
+    s->pbr_length = size;
+    return 0;
+}
+
+static int parse_frame_no_pbr(DCAXllDecoder *s, uint8_t *data, int size, DCAExssAsset *asset)
+{ // Parse a frame straight from packet data (PBR buffer empty); returns 0 or a negative error
+    int ret = parse_frame(s, data, size, asset);
+
+    // If XLL packet data didn't start with a sync word, we must have jumped
+    // right into the middle of PBR smoothing period
+    if (ret == AVERROR(EAGAIN) && asset->xll_sync_present && asset->xll_sync_offset < size) {
+        // Skip to the next sync word in this packet
+        data += asset->xll_sync_offset;
+        size -= asset->xll_sync_offset;
+
+        // If decoding delay is set, put the frame into PBR buffer and return
+        // failure code. Higher level decoder is expected to switch to lossy
+        // core decoding or mute its output until decoding delay expires.
+        if (asset->xll_delay_nframes > 0) {
+            if ((ret = copy_to_pbr(s, data, size, asset->xll_delay_nframes)) < 0)
+                return ret;
+            return AVERROR(EAGAIN);
+        }
+
+        // No decoding delay, just parse the frame in place
+        ret = parse_frame(s, data, size, asset);
+    }
+
+    if (ret < 0)
+        return ret;
+
+    if (s->frame_size > size) // parsed frame size exceeds the data that was passed in
+        return AVERROR(EINVAL);
+
+    // If the XLL decoder didn't consume full packet, start PBR smoothing period
+    if (s->frame_size < size)
+        if ((ret = copy_to_pbr(s, data + s->frame_size, size - s->frame_size, 0)) < 0)
+            return ret;
+
+    return 0;
+}
+
+static int parse_frame_pbr(DCAXllDecoder *s, uint8_t *data, int size, DCAExssAsset *asset)
+{ // Append packet data to the PBR buffer and parse one frame from it; returns 0 or a negative error
+    int ret;
+
+    if (size > DCA_XLL_PBR_BUFFER_MAX - s->pbr_length) { // new data would not fit behind what is buffered
+        ret = AVERROR(ENOSPC);
+        goto fail;
+    }
+
+    memcpy(s->pbr_buffer + s->pbr_length, data, size);
+    s->pbr_length += size;
+
+    // Respect decoding delay after synchronization error
+    if (s->pbr_delay > 0 && --s->pbr_delay) // buffered data is kept; parsing starts once the counter hits 0
+        return AVERROR(EAGAIN);
+
+    if ((ret = parse_frame(s, s->pbr_buffer, s->pbr_length, asset)) < 0)
+        goto fail;
+
+    if (s->frame_size > s->pbr_length) { // parsed frame size exceeds the buffered data
+        ret = AVERROR(EINVAL);
+        goto fail;
+    }
+
+    if (s->frame_size == s->pbr_length) {
+        // End of PBR smoothing period
+        clear_pbr(s);
+    } else {
+        s->pbr_length -= s->frame_size;
+        memmove(s->pbr_buffer, s->pbr_buffer + s->frame_size, s->pbr_length); // move the unconsumed tail to the front
+    }
+
+    return 0;
+
+fail:
+    // For now, throw out all PBR state on failure.
+    // Perhaps we can be smarter and try to resync somehow.
+    clear_pbr(s);
+    return ret;
+}
+
+int ff_dca_xll_parse(DCAXllDecoder *s, uint8_t *data, DCAExssAsset *asset)
+{
+    uint8_t *xll_data = data + asset->xll_offset;
+    int xll_size = asset->xll_size;
+
+    // Buffered PBR data is discarded when the HD stream id changes
+    if (s->hd_stream_id != asset->hd_stream_id) {
+        s->hd_stream_id = asset->hd_stream_id;
+        clear_pbr(s);
+    }
+
+    // Go through the PBR buffer for as long as it holds data
+    if (s->pbr_length)
+        return parse_frame_pbr(s, xll_data, xll_size, asset);
+    return parse_frame_no_pbr(s, xll_data, xll_size, asset);
+}
+
+static void undo_down_mix(DCAXllDecoder *s, DCAXllChSet *o, int band)
+{ // Subtract the coefficient-weighted channels of set o from earlier hierarchical sets
+    int i, j, k, nchannels = 0, *coeff_ptr = o->dmix_coeff;
+    DCAXllChSet *c;
+
+    for (i = 0, c = s->chset; i < s->nactivechsets; i++, c++) {
+        if (!c->hier_chset)
+            continue;
+
+        av_assert1(band < c->nfreqbands);
+        for (j = 0; j < c->nchannels; j++) {
+            for (k = 0; k < o->nchannels; k++) {
+                int coeff = *coeff_ptr++; // consumed even when zero, keeping the pointer in step
+                if (coeff) {
+                    s->dcadsp->dmix_sub(c->bands[band].msb_sample_buffer[j],
+                                        o->bands[band].msb_sample_buffer[k],
+                                        coeff, s->nframesamples);
+                    if (band) // for non-zero bands the decimator history gets the same treatment
+                        s->dcadsp->dmix_sub(c->deci_history[j],
+                                            o->deci_history[k],
+                                            coeff, DCA_XLL_DECI_HISTORY_MAX);
                 }
             }
+        }
 
-next_chset:
-            in_channel += chset->channels;
-            /* Skip to next channel set using the NAVI info. */
-            i = get_bits_count(gb);
-            if (i > end_pos) {
-                av_log(s->avctx, AV_LOG_ERROR,
-                       "chset data exceeds NAVI size\n");
-                return AVERROR_INVALIDDATA;
+        nchannels += c->nchannels;
+        if (nchannels >= o->hier_ofs) // stop once as many channels as precede o have been processed
+            break;
+    }
+}
+
+static void scale_down_mix(DCAXllDecoder *s, DCAXllChSet *o, int band)
+{ // Apply the per-channel scales of set o to the channels of earlier hierarchical sets
+    int i, j, nchannels = 0;
+    DCAXllChSet *c;
+
+    for (i = 0, c = s->chset; i < s->nactivechsets; i++, c++) {
+        if (!c->hier_chset)
+            continue;
+
+        av_assert1(band < c->nfreqbands);
+        for (j = 0; j < c->nchannels; j++) {
+            int scale = o->dmix_scale[nchannels++]; // nchannels doubles as the index into o->dmix_scale
+            if (scale != (1 << 15)) { // channels whose scale equals 1 << 15 are left as they are
+                s->dcadsp->dmix_scale(c->bands[band].msb_sample_buffer[j],
+                                      scale, s->nframesamples);
+                if (band) // for non-zero bands the decimator history is scaled as well
+                    s->dcadsp->dmix_scale(c->deci_history[j],
+                                          scale, DCA_XLL_DECI_HISTORY_MAX);
             }
-            if (i < end_pos)
-                skip_bits_long(gb, end_pos - i);
         }
+
+        if (nchannels >= o->hier_ofs) // stop once as many channels as precede o have been processed
+            break;
+    }
+}
+
+// Wipe the band data of a channel set, then clear the residual_encode bit of
+// every channel that ff_dca_core_map_spkr() can map to a core speaker
+static av_cold void force_lossy_output(DCAXllDecoder *s, DCAXllChSet *c)
+{
+    DCAContext *dca = s->avctx->priv_data;
+    int i;
+
+    for (i = 0; i < c->nfreqbands; i++)
+        chs_clear_band_data(s, c, i, -1);
+
+    for (i = 0; i < c->nchannels; i++) {
+        const int bit = 1 << i;
+        // Leave the bit alone if already clear or no core speaker matches
+        if ((c->residual_encode & bit) &&
+            ff_dca_core_map_spkr(&dca->core, c->ch_remap[i]) >= 0)
+            c->residual_encode &= ~bit;
+    }
+}
+
+static int combine_residual_frame(DCAXllDecoder *s, DCAXllChSet *c)
+{ // Add core output samples to the band 0 residual of channel set c; returns 0 or a negative error
+    DCAContext *dca = s->avctx->priv_data;
+    int ch, nsamples = s->nframesamples;
+    DCAXllChSet *o;
+
+    // Verify that core is compatible
+    if (!(dca->packet & DCA_PACKET_CORE)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Residual encoded channels are present without core\n");
+        return AVERROR(EINVAL);
     }
+
+    if (c->freq != dca->core.output_rate) {
+        av_log(s->avctx, AV_LOG_WARNING, "Sample rate mismatch between core (%d Hz) and XLL (%d Hz)\n", dca->core.output_rate, c->freq);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (nsamples != dca->core.npcmsamples) {
+        av_log(s->avctx, AV_LOG_WARNING, "Number of samples per frame mismatch between core (%d) and XLL (%d)\n", dca->core.npcmsamples, nsamples);
+        return AVERROR_INVALIDDATA;
+    }
+
+    // See if this channel set is downmixed and find the next channel set in
+    // hierarchy. If downmixed, undo core pre-scaling before combining with
+    // residual (residual is not scaled).
+    o = find_next_hier_dmix_chset(s, c);
+
+    // Reduce core bit width and combine with residual
+    for (ch = 0; ch < c->nchannels; ch++) {
+        int n, spkr, shift, round;
+        int32_t *src, *dst;
+
+        if (c->residual_encode & (1 << ch)) // bit set: nothing from the core is added to this channel
+            continue;
+
+        // Map this channel to core speaker
+        spkr = ff_dca_core_map_spkr(&dca->core, c->ch_remap[ch]);
+        if (spkr < 0) {
+            av_log(s->avctx, AV_LOG_WARNING, "Residual encoded channel (%d) references unavailable core channel\n", c->ch_remap[ch]);
+            return AVERROR_INVALIDDATA;
+        }
+
+        // Account for LSB width
+        shift = 24 - c->pcm_bit_res + chs_get_lsb_width(s, c, 0, ch);
+        if (shift > 24) {
+            av_log(s->avctx, AV_LOG_WARNING, "Invalid core shift (%d bits)\n", shift);
+            return AVERROR_INVALIDDATA;
+        }
+
+        round = shift > 0 ? 1 << (shift - 1) : 0; // half of the shift step, added before shifting right
+
+        src = dca->core.output_samples[spkr];
+        dst = c->bands[0].msb_sample_buffer[ch];
+        if (o) {
+            // Undo embedded core downmix pre-scaling
+            int scale_inv = o->dmix_scale_inv[c->hier_ofs + ch];
+            for (n = 0; n < nsamples; n++)
+                dst[n] += clip23((mul16(src[n], scale_inv) + round) >> shift);
+        } else {
+            // No downmix scaling
+            for (n = 0; n < nsamples; n++)
+                dst[n] += (src[n] + round) >> shift;
+        }
+    }
+
+    return 0;
+}
+
+int ff_dca_xll_filter_frame(DCAXllDecoder *s, AVFrame *frame)
+{ // Turn parsed XLL band data into output samples written to frame; returns 0 or a negative error
+    AVCodecContext *avctx = s->avctx;
+    DCAContext *dca = avctx->priv_data;
+    DCAExssAsset *asset = &dca->exss.assets[0];
+    DCAXllChSet *p = &s->chset[0], *c; // p is the first channel set; its parameters drive the output format
+    enum AVMatrixEncoding matrix_encoding = AV_MATRIX_ENCODING_NONE;
+    int i, j, k, ret, shift, nsamples, request_mask;
+    int ch_remap[DCA_SPEAKER_COUNT];
+
+    // Force lossy downmixed output during recovery
+    if (dca->packet & DCA_PACKET_RECOVERY) {
+        for (i = 0, c = s->chset; i < s->nchsets; i++, c++) {
+            if (i < s->nactivechsets)
+                force_lossy_output(s, c);
+
+            if (!c->primary_chset)
+                c->dmix_embedded = 0;
+        }
+
+        s->scalable_lsbs = 0;
+        s->fixed_lsb_width = 0;
+    }
+
+    // Filter frequency bands for active channel sets
+    s->output_mask = 0;
+    for (i = 0, c = s->chset; i < s->nactivechsets; i++, c++) {
+        chs_filter_band_data(s, c, 0);
+
+        if (c->residual_encode != (1 << c->nchannels) - 1 // some channel has its residual_encode bit clear
+            && (ret = combine_residual_frame(s, c)) < 0)
+            return ret;
+
+        if (s->scalable_lsbs)
+            chs_assemble_msbs_lsbs(s, c, 0);
+
+        if (c->nfreqbands > 1) {
+            chs_filter_band_data(s, c, 1);
+            chs_assemble_msbs_lsbs(s, c, 1);
+        }
+
+        s->output_mask |= c->ch_mask;
+    }
+
+    // Undo hierarchical downmix and/or apply scaling
+    for (i = 1, c = &s->chset[1]; i < s->nchsets; i++, c++) {
+        if (!is_hier_dmix_chset(c))
+            continue;
+
+        if (i >= s->nactivechsets) { // first inactive downmix set: scale only, then stop
+            for (j = 0; j < c->nfreqbands; j++)
+                if (c->bands[j].dmix_embedded)
+                    scale_down_mix(s, c, j);
+            break;
+        }
+
+        for (j = 0; j < c->nfreqbands; j++)
+            if (c->bands[j].dmix_embedded)
+                undo_down_mix(s, c, j);
+    }
+
+    // Assemble frequency bands for active channel sets
+    if (s->nfreqbands > 1) {
+        for (i = 0; i < s->nactivechsets; i++)
+            if ((ret = chs_assemble_freq_bands(s, &s->chset[i])) < 0)
+                return ret;
+    }
+
+    // Normalize to regular 5.1 layout if downmixing
+    if (dca->request_channel_layout) {
+        if (s->output_mask & DCA_SPEAKER_MASK_Lss) {
+            s->output_samples[DCA_SPEAKER_Ls] = s->output_samples[DCA_SPEAKER_Lss];
+            s->output_mask = (s->output_mask & ~DCA_SPEAKER_MASK_Lss) | DCA_SPEAKER_MASK_Ls;
+        }
+        if (s->output_mask & DCA_SPEAKER_MASK_Rss) {
+            s->output_samples[DCA_SPEAKER_Rs] = s->output_samples[DCA_SPEAKER_Rss];
+            s->output_mask = (s->output_mask & ~DCA_SPEAKER_MASK_Rss) | DCA_SPEAKER_MASK_Rs;
+        }
+    }
+
+    // Handle downmixing to stereo request
+    if (dca->request_channel_layout == DCA_SPEAKER_LAYOUT_STEREO
+        && DCA_HAS_STEREO(s->output_mask) && p->dmix_embedded
+        && (p->dmix_type == DCA_DMIX_TYPE_LoRo ||
+            p->dmix_type == DCA_DMIX_TYPE_LtRt))
+        request_mask = DCA_SPEAKER_LAYOUT_STEREO;
+    else
+        request_mask = s->output_mask; // no stereo downmix: output whatever was decoded
+    if (!ff_dca_set_channel_layout(avctx, ch_remap, request_mask))
+        return AVERROR(EINVAL);
+
+    avctx->sample_rate = p->freq << (s->nfreqbands - 1); // rate doubles when two frequency bands are present
+
+    switch (p->storage_bit_res) {
+    case 16:
+        avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
+        break;
+    case 24:
+        avctx->sample_fmt = AV_SAMPLE_FMT_S32P;
+        break;
+    default: // any other storage resolution is rejected
+        return AVERROR(EINVAL);
+    }
+
+    avctx->bits_per_raw_sample = p->storage_bit_res;
+    avctx->profile = FF_PROFILE_DTS_HD_MA;
+    avctx->bit_rate = 0;
+
+    frame->nb_samples = nsamples = s->nframesamples << (s->nfreqbands - 1);
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
+    // Downmix primary channel set to stereo
+    if (request_mask != s->output_mask) {
+        ff_dca_downmix_to_stereo_fixed(s->dcadsp, s->output_samples,
+                                       p->dmix_coeff, nsamples,
+                                       s->output_mask);
+    }
+
+    shift = p->storage_bit_res - p->pcm_bit_res; // scales samples from PCM to storage resolution
+    for (i = 0; i < avctx->channels; i++) {
+        int32_t *samples = s->output_samples[ch_remap[i]];
+        if (frame->format == AV_SAMPLE_FMT_S16P) {
+            int16_t *plane = (int16_t *)frame->extended_data[i];
+            for (k = 0; k < nsamples; k++)
+                plane[k] = av_clip_int16(samples[k] * (1 << shift));
+        } else {
+            int32_t *plane = (int32_t *)frame->extended_data[i];
+            for (k = 0; k < nsamples; k++)
+                plane[k] = clip23(samples[k] * (1 << shift)) * (1 << 8); // clipped value is stored scaled by 256
+        }
+    }
+
+    if (!asset->one_to_one_map_ch_to_spkr) {
+        if (asset->representation_type == DCA_REPR_TYPE_LtRt)
+            matrix_encoding = AV_MATRIX_ENCODING_DOLBY;
+        else if (asset->representation_type == DCA_REPR_TYPE_LhRh)
+            matrix_encoding = AV_MATRIX_ENCODING_DOLBYHEADPHONE;
+    } else if (request_mask != s->output_mask && p->dmix_type == DCA_DMIX_TYPE_LtRt) { // stereo downmix of LtRt type was applied
+        matrix_encoding = AV_MATRIX_ENCODING_DOLBY;
+    }
+    if ((ret = ff_side_data_update_matrix_encoding(frame, matrix_encoding)) < 0)
+        return ret;
+
     return 0;
 }
+
+av_cold void ff_dca_xll_flush(DCAXllDecoder *s)
+{ // Flush: only the PBR length and delay are reset; no buffer is freed here
+    clear_pbr(s);
+}
+
+av_cold void ff_dca_xll_close(DCAXllDecoder *s)
+{
+    int chs, buf;
+
+    // Release the sample buffers of every channel set slot
+    for (chs = 0; chs < DCA_XLL_CHSETS_MAX; chs++) {
+        DCAXllChSet *c = &s->chset[chs];
+        for (buf = 0; buf < DCA_XLL_SAMPLE_BUFFERS_MAX; buf++) {
+            c->sample_size[buf] = 0;
+            av_freep(&c->sample_buffer[buf]);
+        }
+    }
+    // Release the NAVI table and the PBR buffer, resetting their bookkeeping
+    s->navi_size = 0;
+    av_freep(&s->navi);
+    clear_pbr(s);
+    av_freep(&s->pbr_buffer);
+}
diff --git a/libavcodec/dca_xll.h b/libavcodec/dca_xll.h
new file mode 100644
index 0000000..bc0aa65
--- /dev/null
+++ b/libavcodec/dca_xll.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright (C) 2016 foo86
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DCA_XLL_H
+#define AVCODEC_DCA_XLL_H
+
+#include "libavutil/common.h"
+#include "libavutil/mem.h"
+
+#include "avcodec.h"
+#include "internal.h"
+#include "get_bits.h"
+#include "dca.h"
+#include "dcadsp.h"
+#include "dca_exss.h"
+
+#define DCA_XLL_CHSETS_MAX              3 ///< Size of DCAXllDecoder.chset[]
+#define DCA_XLL_CHANNELS_MAX            8 ///< Dimension of the per-channel arrays in DCAXllBand / DCAXllChSet
+#define DCA_XLL_BANDS_MAX               2 ///< Size of DCAXllChSet.bands[]
+#define DCA_XLL_ADAPT_PRED_ORDER_MAX    16 ///< Second dimension of DCAXllBand.adapt_refl_coeff[][]
+#define DCA_XLL_DECI_HISTORY_MAX        8 ///< Second dimension of DCAXllChSet.deci_history[][]
+#define DCA_XLL_DMIX_SCALES_MAX         ((DCA_XLL_CHSETS_MAX - 1) * DCA_XLL_CHANNELS_MAX) ///< Evaluates to 16 with the limits above
+#define DCA_XLL_DMIX_COEFFS_MAX         (DCA_XLL_DMIX_SCALES_MAX * DCA_XLL_CHANNELS_MAX) ///< Evaluates to 128 with the limits above
+#define DCA_XLL_PBR_BUFFER_MAX          (240 << 10) ///< Evaluates to 245760 (240 * 1024); presumably a byte count -- TODO confirm
+#define DCA_XLL_SAMPLE_BUFFERS_MAX      3 ///< Size of DCAXllChSet.sample_size[] and sample_buffer[]
+
+typedef struct DCAXllBand { // State of one frequency band; DCAXllChSet embeds DCA_XLL_BANDS_MAX of these
+    int     decor_enabled;                          ///< Pairwise channel decorrelation flag
+    int     orig_order[DCA_XLL_CHANNELS_MAX];       ///< Original channel order
+    int     decor_coeff[DCA_XLL_CHANNELS_MAX / 2];  ///< Pairwise channel coefficients (one slot per channel pair)
+
+    int     adapt_pred_order[DCA_XLL_CHANNELS_MAX]; ///< Adaptive predictor order
+    int     highest_pred_order;                     ///< Highest adaptive predictor order
+    int     fixed_pred_order[DCA_XLL_CHANNELS_MAX]; ///< Fixed predictor order
+    int     adapt_refl_coeff[DCA_XLL_CHANNELS_MAX][DCA_XLL_ADAPT_PRED_ORDER_MAX];   ///< Adaptive predictor reflection coefficients
+
+    int     dmix_embedded;  ///< Downmix performed by encoder in frequency band
+
+    int     lsb_section_size;                       ///< Size of LSB section in any segment
+    int     nscalablelsbs[DCA_XLL_CHANNELS_MAX];    ///< Number of bits to represent the samples in LSB part
+    int     bit_width_adjust[DCA_XLL_CHANNELS_MAX]; ///< Number of bits discarded by authoring
+
+    int32_t *msb_sample_buffer[DCA_XLL_CHANNELS_MAX];   ///< MSB sample buffer pointers
+    int32_t *lsb_sample_buffer[DCA_XLL_CHANNELS_MAX];   ///< LSB sample buffer pointers or NULL
+} DCAXllBand;
+
+typedef struct DCAXllChSet { // State of one channel set; DCAXllDecoder embeds DCA_XLL_CHSETS_MAX of these
+    // Channel set header
+    int     nchannels;          ///< Number of channels in the channel set (N)
+    int     residual_encode;    ///< Residual encoding mask (0 - residual, 1 - full channel)
+    int     pcm_bit_res;        ///< PCM bit resolution (variable)
+    int     storage_bit_res;    ///< Storage bit resolution (16 or 24)
+    int     freq;               ///< Original sampling frequency (max. 96000 Hz)
+
+    int     primary_chset;          ///< Primary channel set flag
+    int     dmix_coeffs_present;    ///< Downmix coefficients present in stream
+    int     dmix_embedded;          ///< Downmix already performed by encoder
+    int     dmix_type;              ///< Primary channel set downmix type
+    int     hier_chset;             ///< Whether the channel set is part of a hierarchy
+    int     hier_ofs;               ///< Number of preceding channels in a hierarchy (M)
+    int     dmix_coeff[DCA_XLL_DMIX_COEFFS_MAX];       ///< Downmixing coefficients
+    int     dmix_scale[DCA_XLL_DMIX_SCALES_MAX];       ///< Downmixing scales
+    int     dmix_scale_inv[DCA_XLL_DMIX_SCALES_MAX];   ///< Inverse downmixing scales
+    int     ch_mask;                ///< Channel mask for set
+    int     ch_remap[DCA_XLL_CHANNELS_MAX];    ///< Channel to speaker map
+
+    int     nfreqbands; ///< Number of frequency bands (1 or 2)
+    int     nabits;     ///< Number of bits to read bit allocation coding parameter
+
+    DCAXllBand     bands[DCA_XLL_BANDS_MAX];   ///< Frequency bands
+
+    // Frequency band coding parameters
+    int     seg_common;                                     ///< Segment type
+    int     rice_code_flag[DCA_XLL_CHANNELS_MAX];           ///< Rice coding flag
+    int     bitalloc_hybrid_linear[DCA_XLL_CHANNELS_MAX];   ///< Binary code length for isolated samples
+    int     bitalloc_part_a[DCA_XLL_CHANNELS_MAX];          ///< Coding parameter for part A of segment
+    int     bitalloc_part_b[DCA_XLL_CHANNELS_MAX];          ///< Coding parameter for part B of segment
+    int     nsamples_part_a[DCA_XLL_CHANNELS_MAX];          ///< Number of samples in part A of segment
+
+    // Decimator history
+    DECLARE_ALIGNED(32, int32_t, deci_history)[DCA_XLL_CHANNELS_MAX][DCA_XLL_DECI_HISTORY_MAX]; ///< Decimator history for frequency band 1
+
+    // Sample buffers
+    unsigned int    sample_size[DCA_XLL_SAMPLE_BUFFERS_MAX]; ///< Size bookkeeping for sample_buffer[]; zeroed in ff_dca_xll_close() (units not visible here)
+    int32_t         *sample_buffer[DCA_XLL_SAMPLE_BUFFERS_MAX]; ///< Heap buffers, released with av_freep() in ff_dca_xll_close()
+} DCAXllChSet;
+
+typedef struct DCAXllDecoder { // Decoder context taken by every ff_dca_xll_*() function declared in this header
+    AVCodecContext  *avctx; ///< Codec context pointer; presumably the owning decoder's context -- TODO confirm
+    GetBitContext   gb; ///< Bitstream reader state
+
+    int     frame_size;             ///< Number of bytes in a lossless frame
+    int     nchsets;                ///< Number of channels sets per frame
+    int     nframesegs;             ///< Number of segments per frame
+    int     nsegsamples_log2;       ///< log2(nsegsamples)
+    int     nsegsamples;            ///< Samples in segment per one frequency band
+    int     nframesamples_log2;     ///< log2(nframesamples)
+    int     nframesamples;          ///< Samples in frame per one frequency band
+    int     seg_size_nbits;         ///< Number of bits used to read segment size
+    int     band_crc_present;       ///< Presence of CRC16 within each frequency band
+    int     scalable_lsbs;          ///< MSB/LSB split flag
+    int     ch_mask_nbits;          ///< Number of bits used to read channel mask
+    int     fixed_lsb_width;        ///< Fixed LSB width
+
+    DCAXllChSet    chset[DCA_XLL_CHSETS_MAX]; ///< Channel sets
+
+    int             *navi;          ///< NAVI table
+    unsigned int    navi_size; ///< Size bookkeeping for navi; zeroed when navi is freed in ff_dca_xll_close()
+
+    int     nfreqbands;     ///< Highest number of frequency bands
+    int     nchannels;      ///< Total number of channels in a hierarchy
+    int     nreschsets;     ///< Number of channel sets that have residual encoded channels
+    int     nactivechsets;  ///< Number of active channel sets to decode
+
+    int     hd_stream_id;   ///< Previous DTS-HD stream ID for detecting changes
+
+    uint8_t     *pbr_buffer;        ///< Peak bit rate (PBR) smoothing buffer
+    int         pbr_length;         ///< Length in bytes of data currently buffered
+    int         pbr_delay;          ///< Delay in frames before decoding buffered data
+
+    DCADSPContext   *dcadsp; ///< DSP context; handed to ff_dca_downmix_to_stereo_fixed() in dca_xll.c
+
+    int     output_mask; ///< Mask describing output_samples[]; presumably DCA speaker bits -- TODO confirm
+    int32_t *output_samples[DCA_SPEAKER_COUNT]; ///< Output sample pointers, looked up through ch_remap[] when filling the frame
+} DCAXllDecoder;
+
+int ff_dca_xll_parse(DCAXllDecoder *s, uint8_t *data, DCAExssAsset *asset);
+int ff_dca_xll_filter_frame(DCAXllDecoder *s, AVFrame *frame);
+av_cold void ff_dca_xll_flush(DCAXllDecoder *s); ///< Delegates to clear_pbr() in dca_xll.c
+av_cold void ff_dca_xll_close(DCAXllDecoder *s); ///< Frees the sample buffers, NAVI table and PBR buffer
+
+#endif
diff --git a/libavcodec/dcadata.c b/libavcodec/dcadata.c
index 2369f55..b2e0f6c 100644
--- a/libavcodec/dcadata.c
+++ b/libavcodec/dcadata.c
@@ -3,20 +3,20 @@
  * Copyright (C) 2004 Gildas Bazin
  * Copyright (c) 2006 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@
 
 #include "libavutil/mem.h"
 
+#include "dca.h"
 #include "dcadata.h"
 
 /* Generic tables */
@@ -41,8 +42,12 @@ const uint8_t ff_dca_channels[16] = {
     1, 2, 2, 2, 2, 3, 3, 4, 4, 5, 6, 6, 6, 7, 8, 8
 };
 
-const uint8_t ff_dca_bits_per_sample[7] = {
-    16, 16, 20, 20, 0, 24, 24
+const uint8_t ff_dca_bits_per_sample[8] = {
+    16, 16, 20, 20, 0, 24, 24, 0 /* indices 4 and 7 hold 0; presumably invalid/reserved codes -- TODO confirm against callers */
+};
+
+const uint8_t ff_dca_dmix_primary_nch[8] = {
+    1, 2, 2, 3, 3, 4, 4, 0 /* presumably a channel count per downmix type code, with 0 for the last code -- TODO confirm */
+};
 
 /* ADPCM data */
@@ -4179,6 +4184,37 @@ const uint32_t ff_dca_scale_factor_quant7[128] = {
     5011872, 5688529, 6456542, 7328245, 8317638,       0,       0,       0
 };
 
+const uint32_t ff_dca_joint_scale_factors[129] = { /* 129 increasing entries, 8 per row; valid indices are 0..128 */
+       3288,    3490,    3691,    3909,    4144,    4387,    4647,    4924,
+       5218,    5528,    5855,    6199,    6568,    6963,    7374,    7810,
+       8271,    8758,    9278,    9831,   10410,   11031,   11685,   12373,
+      13103,   13883,   14705,   15578,   16500,   17482,   18514,   19613,
+      20770,   22003,   23312,   24688,   26156,   27699,   29343,   31080,
+      32925,   34871,   36943,   39133,   41448,   43906,   46506,   49258,
+      52177,   55273,   58544,   62017,   65691,   69584,   73711,   78073,
+      82703,   87602,   92795,   98289,  104111,  110285,  116820,  123740,
+     131072,  138840,  147069,  155776,  165012,  174785,  185145,  196117, /* [64] = 131072 = 1 << 17 */
+     207735,  220042,  233086,  246894,  261523,  277017,  293434,  310823,
+     329236,  348748,  369409,  391303,  414490,  439043,  465064,  492621,
+     521805,  552725,  585475,  620170,  656920,  695843,  737073,  780745,
+     827008,  876014,  927923,  982902, 1041144, 1102834, 1168181, 1237404,
+    1310720, 1388382, 1470649, 1557790, 1650098, 1747876, 1851441, 1961147, /* [104] = 10 * [64]; presumably 0.5 dB per step -- TODO confirm */
+    2077355, 2200441, 2330825, 2468935, 2615232, 2770195, 2934335, 3108206,
+    3292378, 3487463, 3694108, 3913000, 4144862, 4390455, 4650611, 4926176,
+    5218066
+};
+
+const uint32_t ff_dca_scale_factor_adj[4] = {
+    4194304, 4718592, 5242880, 6029312 /* = 1.0, 1.125, 1.25, 1.4375 times (1 << 22) */
+};
+
+const uint32_t ff_dca_quant_levels[32] = {
+          1,       3,       5,     7,      9,     13,     17,      25,
+         32,      64,     128,   256,    512,   1024,   2048,    4096, /* indices 8..26 hold 1 << (index - 3) */
+       8192,   16384,   32768, 65536, 131072, 262144, 524288, 1048576,
+    2097152, 4194304, 8388608,     0,      0,      0,      0,       0 /* indices 27..31 are 0 */
+};
+
 /* 20 bits unsigned fractional binary codes */
 const uint32_t ff_dca_lossy_quant[32] = {
          0, 6710886, 4194304, 3355443, 2474639, 2097152, 1761608, 1426063,
@@ -7507,76 +7543,6 @@ DECLARE_ALIGNED(16, const float, ff_dca_lfe_fir_128)[256] = {
 };
 #undef SCALE
 
-
-#define SCALE(c) ((float)(c) / (256.0f * 32768.0f * 8388608.0f))
-DECLARE_ALIGNED(16, const float, ff_dca_lfe_xll_fir_64)[256] = {
-    SCALE(   6103), SCALE(  52170), SCALE(-558064), SCALE(1592440),
-    SCALE(6290049), SCALE(1502534), SCALE(-546669), SCALE(  53047),
-    SCALE(   1930), SCALE(  51089), SCALE(-568920), SCALE(1683709),
-    SCALE(6286575), SCALE(1414057), SCALE(-534782), SCALE(  53729),
-    SCALE(   2228), SCALE(  49794), SCALE(-579194), SCALE(1776276),
-    SCALE(6279634), SCALE(1327070), SCALE(-522445), SCALE(  54228),
-    SCALE(   2552), SCALE(  48275), SCALE(-588839), SCALE(1870070),
-    SCALE(6269231), SCALE(1241632), SCALE(-509702), SCALE(  54550),
-    SCALE(   2904), SCALE(  46523), SCALE(-597808), SCALE(1965017),
-    SCALE(6255380), SCALE(1157798), SCALE(-496595), SCALE(  54708),
-    SCALE(   3287), SCALE(  44529), SCALE(-606054), SCALE(2061044),
-    SCALE(6238099), SCALE(1075621), SCALE(-483164), SCALE(  54710),
-    SCALE(   3704), SCALE(  42282), SCALE(-613529), SCALE(2158071),
-    SCALE(6217408), SCALE( 995149), SCALE(-469451), SCALE(  54566),
-    SCALE(   4152), SCALE(  39774), SCALE(-620186), SCALE(2256019),
-    SCALE(6193332), SCALE( 916430), SCALE(-455494), SCALE(  54285),
-    SCALE(   4631), SCALE(  36995), SCALE(-625976), SCALE(2354805),
-    SCALE(6165900), SCALE( 839507), SCALE(-441330), SCALE(  53876),
-    SCALE(   5139), SCALE(  33937), SCALE(-630850), SCALE(2454343),
-    SCALE(6135146), SCALE( 764419), SCALE(-426998), SCALE(  53348),
-    SCALE(   5682), SCALE(  30591), SCALE(-634759), SCALE(2554547),
-    SCALE(6101107), SCALE( 691203), SCALE(-412531), SCALE(  52711),
-    SCALE(   6264), SCALE(  26948), SCALE(-637655), SCALE(2655326),
-    SCALE(6063824), SCALE( 619894), SCALE(-397966), SCALE(  51972),
-    SCALE(   6886), SCALE(  23001), SCALE(-639488), SCALE(2756591),
-    SCALE(6023343), SCALE( 550521), SCALE(-383335), SCALE(  51140),
-    SCALE(   7531), SCALE(  18741), SCALE(-640210), SCALE(2858248),
-    SCALE(5979711), SCALE( 483113), SCALE(-368671), SCALE(  50224),
-    SCALE(   8230), SCALE(  14162), SCALE(-639772), SCALE(2960201),
-    SCALE(5932981), SCALE( 417692), SCALE(-354003), SCALE(  49231),
-    SCALE(   8959), SCALE(   9257), SCALE(-638125), SCALE(3062355),
-    SCALE(5883210), SCALE( 354281), SCALE(-339362), SCALE(  48168),
-    SCALE(   9727), SCALE(   4018), SCALE(-635222), SCALE(3164612),
-    SCALE(5830457), SCALE( 292897), SCALE(-324777), SCALE(  47044),
-    SCALE(  10535), SCALE(  -1558), SCALE(-631014), SCALE(3266872),
-    SCALE(5774785), SCALE( 233555), SCALE(-310273), SCALE(  45866),
-    SCALE(  11381), SCALE(  -7480), SCALE(-625455), SCALE(3369035),
-    SCALE(5716260), SCALE( 176267), SCALE(-295877), SCALE(  44640),
-    SCALE(  12267), SCALE( -13750), SCALE(-618499), SCALE(3471000),
-    SCALE(5654952), SCALE( 121042), SCALE(-281613), SCALE(  43373),
-    SCALE(  13190), SCALE( -20372), SCALE(-610098), SCALE(3572664),
-    SCALE(5590933), SCALE(  67886), SCALE(-267505), SCALE(  42072),
-    SCALE(  14152), SCALE( -27352), SCALE(-600209), SCALE(3673924),
-    SCALE(5524280), SCALE(  16800), SCALE(-253574), SCALE(  40743),
-    SCALE(  15153), SCALE( -34691), SCALE(-588788), SCALE(3774676),
-    SCALE(5455069), SCALE( -32214), SCALE(-239840), SCALE(  39391),
-    SCALE(  16192), SCALE( -42390), SCALE(-575791), SCALE(3874816),
-    SCALE(5383383), SCALE( -79159), SCALE(-226323), SCALE(  38022),
-    SCALE(  17267), SCALE( -50453), SCALE(-561178), SCALE(3974239),
-    SCALE(5309305), SCALE(-124041), SCALE(-213041), SCALE(  36642),
-    SCALE(  18377), SCALE( -58879), SCALE(-544906), SCALE(4072841),
-    SCALE(5232922), SCALE(-166869), SCALE(-200010), SCALE(  35256),
-    SCALE(  19525), SCALE( -67667), SCALE(-526937), SCALE(4170517),
-    SCALE(5154321), SCALE(-207653), SCALE(-187246), SCALE(  33866),
-    SCALE(  20704), SCALE( -76817), SCALE(-507233), SCALE(4267162),
-    SCALE(5073593), SCALE(-246406), SCALE(-174764), SCALE(  32480),
-    SCALE(  21915), SCALE( -86327), SCALE(-485757), SCALE(4362672),
-    SCALE(4990831), SCALE(-283146), SCALE(-162575), SCALE(  31101),
-    SCALE(  23157), SCALE( -96193), SCALE(-462476), SCALE(4456942),
-    SCALE(4906129), SCALE(-317890), SCALE(-150692), SCALE(  29732),
-    SCALE(  24426), SCALE(-106412), SCALE(-437356), SCALE(4549871),
-    SCALE(4819584), SCALE(-350658), SCALE(-139125), SCALE(  28376),
-    SCALE(  25721), SCALE(-116977), SCALE(-410365), SCALE(4641355),
-    SCALE(4731293), SCALE(-381475), SCALE(-127884), SCALE(  27038),
-};
-#undef SCALE
-
 DECLARE_ALIGNED(16, const float, ff_dca_fir_64bands)[1024] = {
     /* Bank 0 */
     -7.1279389866041690e-8, -7.0950903150874990e-8,
@@ -8101,6 +8067,562 @@ DECLARE_ALIGNED(16, const float, ff_dca_fir_64bands)[1024] = {
      7.0950903150874990e-8,  7.1279389866041690e-8,
 };
 
+DECLARE_ALIGNED(16, const int32_t, ff_dca_fir_32bands_perfect_fixed)[512] = {
+           0,        0,       -3,      -10,
+         -35,     -105,     -218,     -141,
+        -170,     -216,     -239,     -254,
+        -257,     -251,     -235,     -212,
+        -267,     -317,     -362,     -400,
+        -425,     -434,     -427,     -373,
+        -339,     -593,     -321,     -120,
+         -39,      -16,        0,        1,
+           1,        1,       -3,       -1,
+          -6,      -38,      -93,     -496,
+        -723,     -970,    -1235,    -1501,
+       -1753,    -1978,    -2163,    -2295,
+       -2891,    -2915,    -2860,    -2726,
+       -2517,    -2243,    -1915,    -1590,
+       -1192,     -252,     -117,      -22,
+          -6,      -13,       12,       14,
+          32,       25,      469,      942,
+        1403,     1421,     1239,     2838,
+        3539,     4259,     5002,     5716,
+        6365,     6908,     7311,     7545,
+       11680,    12355,    12785,    12951,
+       12841,    12453,    11803,    10864,
+        9762,     7099,     6725,     5954,
+        4284,     2584,      215,      379,
+         557,      701,      -29,     -687,
+       -1578,    -2749,    -4076,    -7933,
+      -10049,   -12133,   -14039,   -15752,
+      -17213,   -18400,   -19291,   -19878,
+       -1444,    -3005,    -4523,    -5927,
+       -7143,    -8093,    -8713,    -8939,
+       -8700,    -9481,    -7515,    -5279,
+       -2725,       61,     5763,     6113,
+        7571,     6735,    17126,    20165,
+       23328,    26775,    30310,    32639,
+       35464,    38064,    40423,    42512,
+       44261,    45632,    46578,    46974,
+      -45572,   -45008,   -43753,   -41661,
+      -38655,   -34660,   -29587,   -23375,
+      -15998,    -7631,     2472,    13757,
+       26188,    39942,    49789,    67293,
+       84699,   101701,   127325,   148404,
+      170391,   193280,   217044,   241451,
+      266537,   292144,   318161,   344417,
+      370786,   397082,   423133,   448757,
+      475085,   499136,   522007,   543516,
+      563424,   581467,   597422,   611005,
+      621975,   630083,   634996,   636457,
+      634311,   628147,   619871,   604524,
+      585077,   561301,   529204,   494129,
+      453552,   407189,   354920,   296502,
+      231916,   161012,    83700,      -86,
+      -90377,  -187193,  -290528,  -400329,
+      516487,   639054,   767835,   902710,
+     1043512,  1190048,  1342100,  1499418,
+     1661729,  1828700,  2000071,  2175433,
+     2354437,  2536630,  2721120,  2908704,
+     3098059,  3288764,  3480801,  3672922,
+     3864970,  4056432,  4246767,  4435454,
+     4621921,  4805642,  4986073,  5162677,
+     5334921,  5502279,  5664239,  5820300,
+     5969913,  6112723,  6248225,  6375985,
+     6495593,  6606663,  6708832,  6801769,
+     6885168,  6958762,  7022294,  7075566,
+     7118382,  7150633,  7172314,  7183082,
+     7183082,  7172314,  7150633,  7118382,
+     7075566,  7022294,  6958762,  6885168,
+     6801769,  6708832,  6606663,  6495593,
+     6375985,  6248225,  6112723,  5969913,
+    -5820300, -5664239, -5502279, -5334921,
+    -5162677, -4986073, -4805642, -4621921,
+    -4435454, -4246767, -4056432, -3864970,
+    -3672922, -3480801, -3288764, -3098059,
+    -2908704, -2721120, -2536630, -2354437,
+    -2175433, -2000071, -1828700, -1661729,
+    -1499418, -1342100, -1190048, -1043512,
+     -902710,  -767835,  -639054,  -516487,
+     -400329,  -290528,  -187193,   -90377,
+         -86,    83700,   161012,   231916,
+      296502,   354920,   407189,   453552,
+      494129,   529204,   561301,   585077,
+      604524,   619871,   628147,   634311,
+      636457,   634996,   630083,   621975,
+      611005,   597422,   581467,   563424,
+      543516,   522007,   499136,   475085,
+     -448757,  -423133,  -397082,  -370786,
+     -344417,  -318161,  -292144,  -266537,
+     -241451,  -217044,  -193280,  -170391,
+     -148404,  -127325,  -101701,   -84699,
+      -67293,   -49789,   -39942,   -26188,
+      -13757,    -2472,     7631,    15998,
+       23375,    29587,    34660,    38655,
+       41661,    43753,    45008,    45572,
+       46974,    46578,    45632,    44261,
+       42512,    40423,    38064,    35464,
+       32639,    30310,    26775,    23328,
+       20165,    17126,     6735,     7571,
+        6113,     5763,       61,    -2725,
+       -5279,    -7515,    -9481,    -8700,
+       -8939,    -8713,    -8093,    -7143,
+       -5927,    -4523,    -3005,    -1444,
+       19878,    19291,    18400,    17213,
+       15752,    14039,    12133,    10049,
+        7933,     4076,     2749,     1578,
+         687,       29,     -701,     -557,
+        -379,     -215,    -2584,    -4284,
+       -5954,    -6725,    -7099,    -9762,
+      -10864,   -11803,   -12453,   -12841,
+      -12951,   -12785,   -12355,   -11680,
+        7545,     7311,     6908,     6365,
+        5716,     5002,     4259,     3539,
+        2838,     1239,     1421,     1403,
+         942,      469,       25,       32,
+          14,       12,      -13,       -6,
+         -22,     -117,     -252,    -1192,
+       -1590,    -1915,    -2243,    -2517,
+       -2726,    -2860,    -2915,    -2891,
+        2295,     2163,     1978,     1753,
+        1501,     1235,      970,      723,
+         496,       93,       38,        6,
+           1,        3,       -1,       -1,
+          -1,        0,       16,       39,
+         120,      321,      593,      339,
+         373,      427,      434,      425,
+         400,      362,      317,      267,
+        -212,     -235,     -251,     -257,
+        -254,     -239,     -216,     -170,
+        -141,     -218,     -105,      -35,
+         -10,       -3,        0,        0
+};
+
+DECLARE_ALIGNED(16, const int32_t, ff_dca_fir_32bands_nonperfect_fixed)[512] = {
+         -53,      -64,      -77,      -91,
+        -107,     -124,     -144,     -165,
+        -189,     -215,     -244,     -277,
+        -313,     -353,     -397,     -447,
+         502,      563,      631,      706,
+         789,      881,      983,     1095,
+        1218,     1354,     1502,     1665,
+        1843,     2036,     2247,     2475,
+        2723,     2990,     3277,     3586,
+        3916,     4270,     4646,     5046,
+        5470,     5918,     6390,     6886,
+        7405,     7947,     8510,     9094,
+        9698,    10319,    10955,    11605,
+       12265,    12933,    13605,    14277,
+       14945,    15604,    16250,    16877,
+       17480,    18051,    18585,    19075,
+       19513,    19891,    20202,    20436,
+       20587,    20643,    20597,    20439,
+       20160,    19749,    19198,    18496,
+       17634,    16603,    15393,    13996,
+      -12403,   -10605,    -8595,    -6366,
+       -3911,    -1225,     1697,     4860,
+        8265,    11916,    15812,    19953,
+       24337,    28961,    33819,    38904,
+       44210,    49725,    55437,    61334,
+       67398,    73614,    79961,    86417,
+       92960,    99563,   106198,   112837,
+      119446,   125994,   132443,   138758,
+      144898,   150823,   156491,   161858,
+      166879,   171507,   175697,   179400,
+      182566,   185149,   187097,   188363,
+      188899,   188654,   187581,   185635,
+      182770,   178943,   174112,   168238,
+      161285,   153218,   144007,   133624,
+      122046,   109254,    95232,    79969,
+       63462,    45709,    26715,     6492,
+       14943,    37567,    61350,    86256,
+      112242,   139258,   167246,   196143,
+      225877,   256368,   287532,   319275,
+      351496,   384088,   416936,   449919,
+      482909,   515770,   548362,   580539,
+      612148,   643030,   673024,   701963,
+      729674,   755985,   780717,   803690,
+      824721,   843628,   860226,   874332,
+      885761,   894330,   899861,   902174,
+      901096,   896456,   888088,   875832,
+      859535,   839050,   814237,   784966,
+      751116,   712574,   669239,   621021,
+      567840,   509632,   446341,   377927,
+      304365,   225641,   141757,    52732,
+      -41403,  -140599,  -244793,  -353905,
+     -467840,  -586486,  -709716,  -837385,
+      969336,  1105393,  1245366,  1389049,
+     1536224,  1686655,  1840096,  1996285,
+     2154949,  2315802,  2478547,  2642877,
+     2808475,  2975015,  3142163,  3309579,
+     3476914,  3643818,  3809934,  3974901,
+     4138360,  4299948,  4459303,  4616064,
+     4769873,  4920374,  5067219,  5210063,
+     5348569,  5482406,  5611255,  5734805,
+     5852757,  5964823,  6070729,  6170216,
+     6263037,  6348961,  6427777,  6499286,
+     6563310,  6619688,  6668279,  6708963,
+     6741632,  6766206,  6782623,  6790843,
+     6790843,  6782623,  6766206,  6741632,
+     6708963,  6668279,  6619688,  6563310,
+     6499286,  6427777,  6348961,  6263037,
+     6170216,  6070729,  5964823,  5852757,
+    -5734805, -5611255, -5482406, -5348569,
+    -5210063, -5067219, -4920374, -4769873,
+    -4616064, -4459303, -4299948, -4138360,
+    -3974901, -3809934, -3643818, -3476914,
+    -3309579, -3142163, -2975015, -2808475,
+    -2642877, -2478547, -2315802, -2154949,
+    -1996285, -1840096, -1686655, -1536224,
+    -1389049, -1245366, -1105393,  -969336,
+     -837385,  -709716,  -586486,  -467840,
+     -353905,  -244793,  -140599,   -41403,
+       52732,   141757,   225641,   304365,
+      377927,   446341,   509632,   567840,
+      621021,   669239,   712574,   751116,
+      784966,   814237,   839050,   859535,
+      875832,   888088,   896456,   901096,
+      902174,   899861,   894330,   885761,
+     -874332,  -860226,  -843628,  -824721,
+     -803690,  -780717,  -755985,  -729674,
+     -701963,  -673024,  -643030,  -612148,
+     -580539,  -548362,  -515770,  -482909,
+     -449919,  -416936,  -384088,  -351496,
+     -319275,  -287532,  -256368,  -225877,
+     -196143,  -167246,  -139258,  -112242,
+      -86256,   -61350,   -37567,   -14943,
+        6492,    26715,    45709,    63462,
+       79969,    95232,   109254,   122046,
+      133624,   144007,   153218,   161285,
+      168238,   174112,   178943,   182770,
+      185635,   187581,   188654,   188899,
+      188363,   187097,   185149,   182566,
+      179400,   175697,   171507,   166879,
+      161858,   156491,   150823,   144898,
+     -138758,  -132443,  -125994,  -119446,
+     -112837,  -106198,   -99563,   -92960,
+      -86417,   -79961,   -73614,   -67398,
+      -61334,   -55437,   -49725,   -44210,
+      -38904,   -33819,   -28961,   -24337,
+      -19953,   -15812,   -11916,    -8265,
+       -4860,    -1697,     1225,     3911,
+        6366,     8595,    10605,    12403,
+       13996,    15393,    16603,    17634,
+       18496,    19198,    19749,    20160,
+       20439,    20597,    20643,    20587,
+       20436,    20202,    19891,    19513,
+       19075,    18585,    18051,    17480,
+       16877,    16250,    15604,    14945,
+       14277,    13605,    12933,    12265,
+       11605,    10955,    10319,     9698,
+       -9094,    -8510,    -7947,    -7405,
+       -6886,    -6390,    -5918,    -5470,
+       -5046,    -4646,    -4270,    -3916,
+       -3586,    -3277,    -2990,    -2723,
+       -2475,    -2247,    -2036,    -1843,
+       -1665,    -1502,    -1354,    -1218,
+       -1095,     -983,     -881,     -789,
+        -706,     -631,     -563,     -502,
+        -447,     -397,     -353,     -313,
+        -277,     -244,     -215,     -189,
+        -165,     -144,     -124,     -107,
+         -91,      -77,      -64,      -53
+};
+
+DECLARE_ALIGNED(16, const int32_t, ff_dca_lfe_fir_64_fixed)[256] = {
+     6103,   52170, -558064, 1592440, 6290049, 1502534, -546669, 53047,
+     1930,   51089, -568920, 1683709, 6286575, 1414057, -534782, 53729,
+     2228,   49794, -579194, 1776276, 6279634, 1327070, -522445, 54228,
+     2552,   48275, -588839, 1870070, 6269231, 1241632, -509702, 54550,
+     2904,   46523, -597808, 1965017, 6255380, 1157798, -496595, 54708,
+     3287,   44529, -606054, 2061044, 6238099, 1075621, -483164, 54710,
+     3704,   42282, -613529, 2158071, 6217408,  995149, -469451, 54566,
+     4152,   39774, -620186, 2256019, 6193332,  916430, -455494, 54285,
+     4631,   36995, -625976, 2354805, 6165900,  839507, -441330, 53876,
+     5139,   33937, -630850, 2454343, 6135146,  764419, -426998, 53348,
+     5682,   30591, -634759, 2554547, 6101107,  691203, -412531, 52711,
+     6264,   26948, -637655, 2655326, 6063824,  619894, -397966, 51972,
+     6886,   23001, -639488, 2756591, 6023343,  550521, -383335, 51140,
+     7531,   18741, -640210, 2858248, 5979711,  483113, -368671, 50224,
+     8230,   14162, -639772, 2960201, 5932981,  417692, -354003, 49231,
+     8959,    9257, -638125, 3062355, 5883210,  354281, -339362, 48168,
+     9727,    4018, -635222, 3164612, 5830457,  292897, -324777, 47044,
+    10535,   -1558, -631014, 3266872, 5774785,  233555, -310273, 45866,
+    11381,   -7480, -625455, 3369035, 5716260,  176267, -295877, 44640,
+    12267,  -13750, -618499, 3471000, 5654952,  121042, -281613, 43373,
+    13190,  -20372, -610098, 3572664, 5590933,   67886, -267505, 42072,
+    14152,  -27352, -600209, 3673924, 5524280,   16800, -253574, 40743,
+    15153,  -34691, -588788, 3774676, 5455069,  -32214, -239840, 39391,
+    16192,  -42390, -575791, 3874816, 5383383,  -79159, -226323, 38022,
+    17267,  -50453, -561178, 3974239, 5309305, -124041, -213041, 36642,
+    18377,  -58879, -544906, 4072841, 5232922, -166869, -200010, 35256,
+    19525,  -67667, -526937, 4170517, 5154321, -207653, -187246, 33866,
+    20704,  -76817, -507233, 4267162, 5073593, -246406, -174764, 32480,
+    21915,  -86327, -485757, 4362672, 4990831, -283146, -162575, 31101,
+    23157,  -96193, -462476, 4456942, 4906129, -317890, -150692, 29732,
+    24426, -106412, -437356, 4549871, 4819584, -350658, -139125, 28376,
+    25721, -116977, -410365, 4641355, 4731293, -381475, -127884, 27038
+};
+
+DECLARE_ALIGNED(16, const int32_t, ff_dca_fir_64bands_fixed)[1024] = {
+         -38,      -38,      -43,      -48,
+         -52,      -57,      -62,      -67,
+         -73,      -79,      -85,      -91,
+         -98,     -105,     -113,     -121,
+        -129,     -138,     -147,     -157,
+        -167,     -178,     -190,     -202,
+        -215,     -228,     -242,     -257,
+        -273,     -289,     -307,     -325,
+         345,      365,      387,      410,
+         433,      459,      485,      513,
+         543,      574,      606,      640,
+         676,      714,      753,      795,
+         839,      884,      932,      983,
+        1035,     1090,     1148,     1208,
+        1271,     1336,     1405,     1476,
+        1550,     1628,     1709,     1793,
+        1880,     1971,     2065,     2163,
+        2265,     2370,     2479,     2592,
+        2709,     2830,     2955,     3084,
+        3217,     3354,     3496,     3642,
+        3791,     3946,     4104,     4267,
+        4433,     4604,     4780,     4959,
+        5143,     5330,     5522,     5717,
+        5916,     6119,     6326,     6536,
+        6749,     6966,     7186,     7408,
+        7633,     7861,     8090,     8322,
+        8556,     8791,     9027,     9264,
+        9501,     9739,     9977,    10214,
+       10450,    10685,    10918,    11149,
+       11377,    11603,    11825,    12042,
+       12255,    12463,    12665,    12861,
+       13050,    13231,    13405,    13569,
+       13724,    13869,    14002,    14125,
+       14235,    14331,    14415,    14483,
+       14536,    14573,    14594,    14596,
+       14580,    14544,    14488,    14412,
+       14313,    14191,    14046,    13877,
+       13682,    13461,    13213,    12937,
+       12632,    12298,    11934,    11538,
+       11111,    10650,    10156,     9628,
+       -9065,    -8466,    -7830,    -7158,
+       -6447,    -5698,    -4910,    -4083,
+       -3215,    -2306,    -1357,     -366,
+         668,     1743,     2861,     4022,
+        5226,     6474,     7764,     9098,
+       10476,    11897,    13361,    14868,
+       16418,    18011,    19645,    21322,
+       23039,    24798,    26596,    28433,
+       30309,    32222,    34172,    36158,
+       38177,    40231,    42315,    44431,
+       46575,    48747,    50945,    53167,
+       55411,    57676,    59959,    62258,
+       64571,    66897,    69231,    71573,
+       73919,    76268,    78615,    80959,
+       83296,    85624,    87939,    90239,
+       92519,    94778,    97011,    99215,
+      101386,   103521,   105616,   107668,
+      109673,   111626,   113524,   115362,
+      117138,   118847,   120484,   122045,
+      123527,   124925,   126234,   127451,
+      128571,   129591,   130504,   131308,
+      131997,   132568,   133016,   133338,
+      133528,   133582,   133495,   133265,
+      132886,   132355,   131668,   130820,
+      129807,   128626,   127274,   125746,
+      124038,   122148,   120071,   117806,
+      115348,   112694,   109843,   106790,
+      103534,   100071,    96401,    92520,
+       88426,    84119,    79597,    74857,
+       69900,    64723,    59327,    53711,
+       47875,    41818,    35542,    29045,
+       22330,    15397,     8247,      881,
+        6697,    14487,    22487,    30692,
+       39101,    47711,    56517,    65516,
+       74704,    84076,    93628,   103355,
+      113251,   123311,   133528,   143897,
+      154410,   165061,   175843,   186747,
+      197766,   208892,   220116,   231429,
+      242822,   254285,   265810,   277384,
+      288999,   300644,   312306,   323976,
+      335641,   347289,   358909,   370488,
+      382013,   393471,   404848,   416133,
+      427310,   438366,   449286,   460057,
+      470663,   481090,   491323,   501347,
+      511147,   520707,   530011,   539044,
+      547790,   556233,   564357,   572146,
+      579584,   586654,   593341,   599627,
+      605498,   610936,   615925,   620449,
+      624491,   628037,   631069,   633571,
+      635529,   636925,   637745,   637972,
+      637593,   636592,   634953,   632662,
+      629705,   626068,   621737,   616698,
+      610938,   604443,   597202,   589202,
+      580431,   570877,   560530,   549378,
+      537411,   524620,   510994,   496525,
+      481205,   465026,   447979,   430058,
+      411256,   391569,   370989,   349514,
+      327137,   303857,   279670,   254573,
+      228564,   201644,   173811,   145065,
+      115408,    84840,    53365,    20984,
+      -12298,   -46478,   -81550,  -117508,
+     -154347,  -192060,  -230638,  -270073,
+     -310356,  -351478,  -393427,  -436192,
+     -479762,  -524124,  -569264,  -615168,
+      661821,   709209,   757314,   806121,
+      855611,   905766,   956569,  1007998,
+     1060035,  1112658,  1165846,  1219578,
+     1273830,  1328580,  1383805,  1439479,
+     1495578,  1552077,  1608950,  1666171,
+     1723714,  1781550,  1839653,  1897995,
+     1956546,  2015279,  2074163,  2133170,
+     2192270,  2251432,  2310626,  2369822,
+     2428988,  2488093,  2547106,  2605996,
+     2664731,  2723279,  2781607,  2839685,
+     2897481,  2954962,  3012096,  3068851,
+     3125195,  3181097,  3236524,  3291445,
+     3345829,  3399643,  3452858,  3505441,
+     3557362,  3608591,  3659098,  3708853,
+     3757825,  3805987,  3853309,  3899763,
+     3945322,  3989957,  4033642,  4076350,
+     4118055,  4158733,  4198357,  4236904,
+     4274351,  4310673,  4345850,  4379859,
+     4412678,  4444289,  4474670,  4503803,
+     4531671,  4558255,  4583539,  4607508,
+     4630146,  4651438,  4671373,  4689936,
+     4707117,  4722905,  4737290,  4750262,
+     4761813,  4771936,  4780625,  4787874,
+     4793679,  4798036,  4800943,  4802396,
+     4802396,  4800943,  4798036,  4793679,
+     4787874,  4780625,  4771936,  4761813,
+     4750262,  4737290,  4722905,  4707117,
+     4689936,  4671373,  4651438,  4630146,
+     4607508,  4583539,  4558255,  4531671,
+     4503803,  4474670,  4444289,  4412678,
+     4379859,  4345850,  4310673,  4274351,
+     4236904,  4198357,  4158733,  4118055,
+    -4076350, -4033642, -3989957, -3945322,
+    -3899763, -3853309, -3805987, -3757825,
+    -3708853, -3659098, -3608591, -3557362,
+    -3505441, -3452858, -3399643, -3345829,
+    -3291445, -3236524, -3181097, -3125195,
+    -3068851, -3012096, -2954962, -2897481,
+    -2839685, -2781607, -2723279, -2664731,
+    -2605996, -2547106, -2488093, -2428988,
+    -2369822, -2310626, -2251432, -2192270,
+    -2133170, -2074163, -2015279, -1956546,
+    -1897995, -1839653, -1781550, -1723714,
+    -1666171, -1608950, -1552077, -1495578,
+    -1439479, -1383805, -1328580, -1273830,
+    -1219578, -1165846, -1112658, -1060035,
+    -1007998,  -956569,  -905766,  -855611,
+     -806121,  -757314,  -709209,  -661821,
+     -615168,  -569264,  -524124,  -479762,
+     -436192,  -393427,  -351478,  -310356,
+     -270073,  -230638,  -192060,  -154347,
+     -117508,   -81550,   -46478,   -12298,
+       20984,    53365,    84840,   115408,
+      145065,   173811,   201644,   228564,
+      254573,   279670,   303857,   327137,
+      349514,   370989,   391569,   411256,
+      430058,   447979,   465026,   481205,
+      496525,   510994,   524620,   537411,
+      549378,   560530,   570877,   580431,
+      589202,   597202,   604443,   610938,
+      616698,   621737,   626068,   629705,
+      632662,   634953,   636592,   637593,
+      637972,   637745,   636925,   635529,
+      633571,   631069,   628037,   624491,
+     -620449,  -615925,  -610936,  -605498,
+     -599627,  -593341,  -586654,  -579584,
+     -572146,  -564357,  -556233,  -547790,
+     -539044,  -530011,  -520707,  -511147,
+     -501347,  -491323,  -481090,  -470663,
+     -460057,  -449286,  -438366,  -427310,
+     -416133,  -404848,  -393471,  -382013,
+     -370488,  -358909,  -347289,  -335641,
+     -323976,  -312306,  -300644,  -288999,
+     -277384,  -265810,  -254285,  -242822,
+     -231429,  -220116,  -208892,  -197766,
+     -186747,  -175843,  -165061,  -154410,
+     -143897,  -133528,  -123311,  -113251,
+     -103355,   -93628,   -84076,   -74704,
+      -65516,   -56517,   -47711,   -39101,
+      -30692,   -22487,   -14487,    -6697,
+         881,     8247,    15397,    22330,
+       29045,    35542,    41818,    47875,
+       53711,    59327,    64723,    69900,
+       74857,    79597,    84119,    88426,
+       92520,    96401,   100071,   103534,
+      106790,   109843,   112694,   115348,
+      117806,   120071,   122148,   124038,
+      125746,   127274,   128626,   129807,
+      130820,   131668,   132355,   132886,
+      133265,   133495,   133582,   133528,
+      133338,   133016,   132568,   131997,
+      131308,   130504,   129591,   128571,
+      127451,   126234,   124925,   123527,
+      122045,   120484,   118847,   117138,
+      115362,   113524,   111626,   109673,
+      107668,   105616,   103521,   101386,
+      -99215,   -97011,   -94778,   -92519,
+      -90239,   -87939,   -85624,   -83296,
+      -80959,   -78615,   -76268,   -73919,
+      -71573,   -69231,   -66897,   -64571,
+      -62258,   -59959,   -57676,   -55411,
+      -53167,   -50945,   -48747,   -46575,
+      -44431,   -42315,   -40231,   -38177,
+      -36158,   -34172,   -32222,   -30309,
+      -28433,   -26596,   -24798,   -23039,
+      -21322,   -19645,   -18011,   -16418,
+      -14868,   -13361,   -11897,   -10476,
+       -9098,    -7764,    -6474,    -5226,
+       -4022,    -2861,    -1743,     -668,
+         366,     1357,     2306,     3215,
+        4083,     4910,     5698,     6447,
+        7158,     7830,     8466,     9065,
+        9628,    10156,    10650,    11111,
+       11538,    11934,    12298,    12632,
+       12937,    13213,    13461,    13682,
+       13877,    14046,    14191,    14313,
+       14412,    14488,    14544,    14580,
+       14596,    14594,    14573,    14536,
+       14483,    14415,    14331,    14235,
+       14125,    14002,    13869,    13724,
+       13569,    13405,    13231,    13050,
+       12861,    12665,    12463,    12255,
+       12042,    11825,    11603,    11377,
+       11149,    10918,    10685,    10450,
+       10214,     9977,     9739,     9501,
+        9264,     9027,     8791,     8556,
+        8322,     8090,     7861,     7633,
+        7408,     7186,     6966,     6749,
+       -6536,    -6326,    -6119,    -5916,
+       -5717,    -5522,    -5330,    -5143,
+       -4959,    -4780,    -4604,    -4433,
+       -4267,    -4104,    -3946,    -3791,
+       -3642,    -3496,    -3354,    -3217,
+       -3084,    -2955,    -2830,    -2709,
+       -2592,    -2479,    -2370,    -2265,
+       -2163,    -2065,    -1971,    -1880,
+       -1793,    -1709,    -1628,    -1550,
+       -1476,    -1405,    -1336,    -1271,
+       -1208,    -1148,    -1090,    -1035,
+        -983,     -932,     -884,     -839,
+        -795,     -753,     -714,     -676,
+        -640,     -606,     -574,     -543,
+        -513,     -485,     -459,     -433,
+        -410,     -387,     -365,     -345,
+        -325,     -307,     -289,     -273,
+        -257,     -242,     -228,     -215,
+        -202,     -190,     -178,     -167,
+        -157,     -147,     -138,     -129,
+        -121,     -113,     -105,      -98,
+         -91,      -85,      -79,      -73,
+         -67,      -62,      -57,      -52,
+         -48,      -43,      -38,      -38
+};
+
 /*
  * D.11 Look-up Table for Downmix Scale Factors
  *
@@ -8176,216 +8698,495 @@ const uint32_t ff_dca_inv_dmixtable[FF_DCA_INV_DMIXTABLE_SIZE] = {
       65536,
 };
 
-const float ff_dca_default_coeffs[10][6][2] = {
-    { { 0.707107, 0.707107 }, { 0.000000, 0.000000 },                                                                                                 }, // A [LFE]
-    { { 1.000000, 0.000000 }, { 0.000000, 1.000000 }, { 0.000000, 0.000000 },                                                                         }, // A + B (dual mono) [LFE]
-    { { 1.000000, 0.000000 }, { 0.000000, 1.000000 }, { 0.000000, 0.000000 },                                                                         }, // L + R (stereo) [LFE]
-    { { 1.000000, 0.000000 }, { 0.000000, 1.000000 }, { 0.000000, 0.000000 },                                                                         }, // (L+R) + (L-R) (sum-difference) [LFE]
-    { { 1.000000, 0.000000 }, { 0.000000, 1.000000 }, { 0.000000, 0.000000 },                                                                         }, // LT + RT (left and right total) [LFE]
-    { { 0.501187, 0.501187 }, { 0.707107, 0.000000 }, { 0.000000, 0.707107 }, { 0.000000, 0.000000 },                                                 }, // C + L + R [LFE]
-    { { 0.707107, 0.000000 }, { 0.000000, 0.707107 }, { 0.501187, 0.501187 }, { 0.000000, 0.000000 },                                                 }, // L + R + S [LFE]
-    { { 0.501187, 0.501187 }, { 0.707107, 0.000000 }, { 0.000000, 0.707107 }, { 0.501187, 0.501187 }, { 0.000000, 0.000000 },                         }, // C + L + R + S [LFE]
-    { { 0.707107, 0.000000 }, { 0.000000, 0.707107 }, { 0.501187, 0.000000 }, { 0.000000, 0.501187 }, { 0.000000, 0.000000 },                         }, // L + R + SL + SR [LFE]
-    { { 0.501187, 0.501187 }, { 0.707107, 0.000000 }, { 0.000000, 0.707107 }, { 0.501187, 0.000000 }, { 0.000000, 0.501187 }, { 0.000000, 0.000000 }, }, // C + L + R + SL + SR [LFE]
+const uint16_t ff_dca_xll_refl_coeff[128] = {
+        0,  3070,  5110,  7140,  9156, 11154, 13132, 15085,
+    17010, 18904, 20764, 22588, 24373, 26117, 27818, 29474,
+    31085, 32648, 34164, 35631, 37049, 38418, 39738, 41008,
+    42230, 43404, 44530, 45609, 46642, 47630, 48575, 49477,
+    50337, 51157, 51937, 52681, 53387, 54059, 54697, 55302,
+    55876, 56421, 56937, 57426, 57888, 58326, 58741, 59132,
+    59502, 59852, 60182, 60494, 60789, 61066, 61328, 61576,
+    61809, 62029, 62236, 62431, 62615, 62788, 62951, 63105,
+    63250, 63386, 63514, 63635, 63749, 63855, 63956, 64051,
+    64140, 64224, 64302, 64376, 64446, 64512, 64573, 64631,
+    64686, 64737, 64785, 64830, 64873, 64913, 64950, 64986,
+    65019, 65050, 65079, 65107, 65133, 65157, 65180, 65202,
+    65222, 65241, 65259, 65275, 65291, 65306, 65320, 65333,
+    65345, 65357, 65368, 65378, 65387, 65396, 65405, 65413,
+    65420, 65427, 65434, 65440, 65446, 65451, 65456, 65461,
+    65466, 65470, 65474, 65478, 65481, 65485, 65488, 65491
 };
 
-const int32_t ff_dca_sampling_freqs[16] = {
-      8000,  16000, 32000, 64000, 128000, 22050,  44100,  88200,
-    176400, 352800, 12000, 24000,  48000, 96000, 192000, 384000,
+const int32_t ff_dca_xll_band_coeff[20] = {
+      868669, -5931642, -1228483,  4194304,
+      -20577,   122631,  -393647,   904476,
+    -1696305,  2825313, -4430736,  6791313,
+       41153,  -245210,   785564, -1788164,
+     3259333, -5074941,  6928550, -8204883
 };
 
-/* downmix coeffs
- *
- * TABLE 9
- * ______________________________________
- * Down-mix coefficients for 8-channel source
- * audio (5 + 3 format)
- * lt
- * cen- rt lt ctr rt
- * lt ter ctr center
- * rt srd srd srd
- * ______________________________________
- * 1 0.71 0.74 1.0 0.71 0.71 0.58 0.58 0.58
- * 2 left 1.0 0.89 0.71 0.46 0.71 0.50
- * rt 0.45 0.71 0.89 1.0 0.50 0.71
- * 3 lt 1.0 0.89 0.71 0.45
- * rt 0.45 0.71 0.89 1.0
- * srd 0.71 0.71 0.71
- * 4 lt 1.0 0.89 0.71 0.45
- * rt 0.45 0.71 0.89 1.0
- * lt srd 1.0 0.71
- * rt srd 0.71 0.71
- * 4 lt 1.0 0.5
- * ctr 0.87 1.0 0.87
- * rt 0.5 1.0
- * srd 0.71 0.71 0.71
- * 5 lt 1.0 0.5
- * ctr 0.87 1.0 0.87
- * rt 0.5 1.0
- * lt srd 1.0 0.71
- * rt srd 0.71 1.0
- * 6 lt 1.0 0.5
- * lt ctr 0.87 0.71
- * rt ctr 0.71 0.87
- * rt 0.5 1.0
- * lt srd 1.0 0.71
- * rt srd 0.71 1.0
- * 6 lt 1.0 0.5
- * ctr 0.86 1.0 0.86
- * rt 0.5 1.0
- * lt srd 1.0
- * ctr srd 1.0
- * rt srd 1.0
- * 7 lt 1.0
- * lt ctr 1.0
- * ctr 1.0
- * rt ctr 1.0
- * rt 1.0
- * lt srd 1.0 0.71
- * rt srd 0.71 1.0
- * 7 lt 1.0 0.5
- * lt ctr 0.87 0.71
- * rt ctr 0.71 0.87
- * rt 0.5 1.0
- * lt srd 1.0
- * ctr srd 1.0
- * rt srd 1.0
- * 8 lt 1.0 0.5
- * lt ctr 0.87 0.71
- * rt ctr 0.71 0.87
- * rt 0.5 1.0
- * lt 1 srd 0.87 0.35
- * lt 2 srd 0.5 0.61
- * rt 2 srd 0.61 0.50
- * rt 2 srd 0.35 0.87
- *
- * Generation of Lt Rt
- *
- * In the case when the playback system has analog or digital surround
- * multi-channel capability, a down matrix from 5, 4, or 3 channel to
- * Lt Rt may be desirable. In the case when the number of decoded audio
- * channels exceeds 5, 4 or 3 respectively a first stage down mix to 5,
- * 4 or 3 chs should be used as described above.
- *
- * The down matrixing equations for 5-channel source audio to a
- * two-channel Lt Rt playback system are given by:
- *
- * Left  = left  + 0.7 * center - 0.7 * (lt surround + rt surround)
- *
- * Right = right + 0.7 * center + 0.7 * (lt surround + rt surround)
- *
- * Embedded mixing to 2-channel
- *
- * One concern arising from the proliferation of multi-channel audio
- * systems is that most home systems presently have only two channel
- * playback capability. To accommodate this a fixed 2-channel down
- * matrix processes is commonly used following the multi-channel
- * decoding stage. However, for music only applications the image
- * quality etc. of the down matrixed signal may not match that of an
- * equivalent stereo recording found on CD.
- *
- * The concept of embedded mixing is to allow the producer to
- * dynamically specify the matrixing coefficients within the audio
- * frame itself. In this way the stereo down mix at the decoder may be
- * better matched to a 2-channel playback environment.
- *
- * CHS*2, 7-bit down mix indexes (MCOEFFS) are transmitted along with
- * the multi-channel audio once in every frame. The indexes are
- * converted to attenuation factors using a 7 bit LUT. The 2-ch down
- * mix equations are as follows,
- *
- * Left Ch  = sum (MCOEFF[n]       * Ch[n]) for n=1, CHS
- *
- * Right Ch = sum (MCOEFF[n + CHS] * Ch[n]) for n=1, CHS
- *
- * where Ch(n) represents the subband samples in the (n)th audio channel.
- */
+const uint16_t ff_dca_avg_g3_freqs[3] = { 16000, 18000, 24000 };
+
+const uint16_t ff_dca_fst_amp[44] = {
+       0,    1,    2,    3,
+       4,    6,    8,   10,
+      12,   16,   20,   24,
+      28,   36,   44,   52,
+      60,   76,   92,  108,
+     124,  156,  188,  220,
+     252,  316,  380,  444,
+     508,  636,  764,  892,
+    1020, 1276, 1532, 1788,
+    2044, 2556, 3068, 3580,
+    4092, 5116, 6140, 7164
+};
+
+const uint8_t ff_dca_freq_to_sb[32] = {
+    0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
+    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5
+};
+
+const int8_t ff_dca_ph0_shift[8] = {
+    -32, +96, -96, +32, +96, -32, +32, -96
+};
+
+const uint8_t ff_dca_grid_1_to_scf[11] = {
+    0, 1, 2, 3, 4, 6, 7, 10, 14, 19, 26
+};
+
+const uint8_t ff_dca_grid_2_to_scf[3] = {
+    4, 10, 18
+};
+
+const uint8_t ff_dca_scf_to_grid_1[32] = {
+    0, 1, 2, 3, 4, 4, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7,
+    7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 10
+};
+
+const uint8_t ff_dca_scf_to_grid_2[32] = {
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
+    1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+};
+
+const uint8_t ff_dca_grid_1_weights[12][32] = {
+    {
+        128,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+    }, {
+          0, 128,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+    }, {
+          0,   0, 128,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+    }, {
+          0,   0,   0, 128,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+    }, {
+          0,   0,   0,   0, 128, 128,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+    }, {
+          0,   0,   0,   0,   0,   0, 128,  85,
+         43,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+    }, {
+          0,   0,   0,   0,   0,   0,   0,  43,
+         85, 128,  96,  64,  32,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+    }, {
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,  32,  64,  96, 128, 102,  77,
+         51,  26,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+    }, {
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,  26,  51,
+         77, 102, 128, 107,  85,  64,  43,  21,
+          0,   0,   0,   0,   0,   0,   0,   0,
+    }, {
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,  21,  43,  64,  85, 107,
+        128, 110,  91,  73,  55,  37,  18,   0,
+    }, {
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,   0,   0,   0,   0,   0,   0,   0,
+          0,  18,  37,  55,  73,  91, 110, 128,
+    }, {
+        0 /* empty */
+    }
+};
+
+const uint8_t ff_dca_sb_reorder[8][8] = {
+    { 0, 1, 2, 3, 4, 5, 6, 7 },
+    { 1, 0, 2, 3, 4, 5, 6, 7 },
+    { 3, 1, 0, 2, 4, 5, 6, 7 },
+    { 1, 2, 3, 0, 4, 5, 6, 7 },
+    { 1, 2, 5, 3, 0, 4, 6, 7 },
+    { 1, 2, 2, 5, 3, 0, 4, 6 },
+    { 1, 2, 2, 6, 5, 3, 0, 4 },
+    { 1, 2, 2, 6, 5, 4, 0, 3 }
+};
+
+const int8_t ff_dca_lfe_delta_index_16[8] = {
+    -4, -3, -2, -1, 2, 4, 6, 8
+};
+
+const int8_t ff_dca_lfe_delta_index_24[32] = {
+    -8, -8, -7, -7, -6, -6, -5, -5, -4, -4, -3, -3, -2, -2, -1, -1,
+     1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8
+};
+
+const uint16_t ff_dca_rsd_pack_5_in_8[256] = {
+    0x0000, 0x0100, 0x0200, 0x0040, 0x0140, 0x0240, 0x0080, 0x0180,
+    0x0280, 0x0010, 0x0110, 0x0210, 0x0050, 0x0150, 0x0250, 0x0090,
+    0x0190, 0x0290, 0x0020, 0x0120, 0x0220, 0x0060, 0x0160, 0x0260,
+    0x00a0, 0x01a0, 0x02a0, 0x0004, 0x0104, 0x0204, 0x0044, 0x0144,
+    0x0244, 0x0084, 0x0184, 0x0284, 0x0014, 0x0114, 0x0214, 0x0054,
+    0x0154, 0x0254, 0x0094, 0x0194, 0x0294, 0x0024, 0x0124, 0x0224,
+    0x0064, 0x0164, 0x0264, 0x00a4, 0x01a4, 0x02a4, 0x0008, 0x0108,
+    0x0208, 0x0048, 0x0148, 0x0248, 0x0088, 0x0188, 0x0288, 0x0018,
+    0x0118, 0x0218, 0x0058, 0x0158, 0x0258, 0x0098, 0x0198, 0x0298,
+    0x0028, 0x0128, 0x0228, 0x0068, 0x0168, 0x0268, 0x00a8, 0x01a8,
+    0x02a8, 0x0001, 0x0101, 0x0201, 0x0041, 0x0141, 0x0241, 0x0081,
+    0x0181, 0x0281, 0x0011, 0x0111, 0x0211, 0x0051, 0x0151, 0x0251,
+    0x0091, 0x0191, 0x0291, 0x0021, 0x0121, 0x0221, 0x0061, 0x0161,
+    0x0261, 0x00a1, 0x01a1, 0x02a1, 0x0005, 0x0105, 0x0205, 0x0045,
+    0x0145, 0x0245, 0x0085, 0x0185, 0x0285, 0x0015, 0x0115, 0x0215,
+    0x0055, 0x0155, 0x0255, 0x0095, 0x0195, 0x0295, 0x0025, 0x0125,
+    0x0225, 0x0065, 0x0165, 0x0265, 0x00a5, 0x01a5, 0x02a5, 0x0009,
+    0x0109, 0x0209, 0x0049, 0x0149, 0x0249, 0x0089, 0x0189, 0x0289,
+    0x0019, 0x0119, 0x0219, 0x0059, 0x0159, 0x0259, 0x0099, 0x0199,
+    0x0299, 0x0029, 0x0129, 0x0229, 0x0069, 0x0169, 0x0269, 0x00a9,
+    0x01a9, 0x02a9, 0x0002, 0x0102, 0x0202, 0x0042, 0x0142, 0x0242,
+    0x0082, 0x0182, 0x0282, 0x0012, 0x0112, 0x0212, 0x0052, 0x0152,
+    0x0252, 0x0092, 0x0192, 0x0292, 0x0022, 0x0122, 0x0222, 0x0062,
+    0x0162, 0x0262, 0x00a2, 0x01a2, 0x02a2, 0x0006, 0x0106, 0x0206,
+    0x0046, 0x0146, 0x0246, 0x0086, 0x0186, 0x0286, 0x0016, 0x0116,
+    0x0216, 0x0056, 0x0156, 0x0256, 0x0096, 0x0196, 0x0296, 0x0026,
+    0x0126, 0x0226, 0x0066, 0x0166, 0x0266, 0x00a6, 0x01a6, 0x02a6,
+    0x000a, 0x010a, 0x020a, 0x004a, 0x014a, 0x024a, 0x008a, 0x018a,
+    0x028a, 0x001a, 0x011a, 0x021a, 0x005a, 0x015a, 0x025a, 0x009a,
+    0x019a, 0x029a, 0x002a, 0x012a, 0x022a, 0x006a, 0x016a, 0x026a,
+    0x00aa, 0x01aa, 0x02aa, 0x0155, 0x0155, 0x0155, 0x0155, 0x0155,
+    0x0155, 0x0155, 0x0155, 0x0155, 0x0155, 0x0155, 0x0155, 0x0155
+};
+
+const uint8_t ff_dca_rsd_pack_3_in_7[128][3] = {
+    { 0, 0, 0 }, { 0, 0, 1 }, { 0, 0, 2 }, { 0, 0, 3 },
+    { 0, 0, 4 }, { 0, 1, 0 }, { 0, 1, 1 }, { 0, 1, 2 },
+    { 0, 1, 3 }, { 0, 1, 4 }, { 0, 2, 0 }, { 0, 2, 1 },
+    { 0, 2, 2 }, { 0, 2, 3 }, { 0, 2, 4 }, { 0, 3, 0 },
+    { 0, 3, 1 }, { 0, 3, 2 }, { 0, 3, 3 }, { 0, 3, 4 },
+    { 0, 4, 0 }, { 0, 4, 1 }, { 0, 4, 2 }, { 0, 4, 3 },
+    { 0, 4, 4 }, { 1, 0, 0 }, { 1, 0, 1 }, { 1, 0, 2 },
+    { 1, 0, 3 }, { 1, 0, 4 }, { 1, 1, 0 }, { 1, 1, 1 },
+    { 1, 1, 2 }, { 1, 1, 3 }, { 1, 1, 4 }, { 1, 2, 0 },
+    { 1, 2, 1 }, { 1, 2, 2 }, { 1, 2, 3 }, { 1, 2, 4 },
+    { 1, 3, 0 }, { 1, 3, 1 }, { 1, 3, 2 }, { 1, 3, 3 },
+    { 1, 3, 4 }, { 1, 4, 0 }, { 1, 4, 1 }, { 1, 4, 2 },
+    { 1, 4, 3 }, { 1, 4, 4 }, { 2, 0, 0 }, { 2, 0, 1 },
+    { 2, 0, 2 }, { 2, 0, 3 }, { 2, 0, 4 }, { 2, 1, 0 },
+    { 2, 1, 1 }, { 2, 1, 2 }, { 2, 1, 3 }, { 2, 1, 4 },
+    { 2, 2, 0 }, { 2, 2, 1 }, { 2, 2, 2 }, { 2, 2, 3 },
+    { 2, 2, 4 }, { 2, 3, 0 }, { 2, 3, 1 }, { 2, 3, 2 },
+    { 2, 3, 3 }, { 2, 3, 4 }, { 2, 4, 0 }, { 2, 4, 1 },
+    { 2, 4, 2 }, { 2, 4, 3 }, { 2, 4, 4 }, { 3, 0, 0 },
+    { 3, 0, 1 }, { 3, 0, 2 }, { 3, 0, 3 }, { 3, 0, 4 },
+    { 3, 1, 0 }, { 3, 1, 1 }, { 3, 1, 2 }, { 3, 1, 3 },
+    { 3, 1, 4 }, { 3, 2, 0 }, { 3, 2, 1 }, { 3, 2, 2 },
+    { 3, 2, 3 }, { 3, 2, 4 }, { 3, 3, 0 }, { 3, 3, 1 },
+    { 3, 3, 2 }, { 3, 3, 3 }, { 3, 3, 4 }, { 3, 4, 0 },
+    { 3, 4, 1 }, { 3, 4, 2 }, { 3, 4, 3 }, { 3, 4, 4 },
+    { 4, 0, 0 }, { 4, 0, 1 }, { 4, 0, 2 }, { 4, 0, 3 },
+    { 4, 0, 4 }, { 4, 1, 0 }, { 4, 1, 1 }, { 4, 1, 2 },
+    { 4, 1, 3 }, { 4, 1, 4 }, { 4, 2, 0 }, { 4, 2, 1 },
+    { 4, 2, 2 }, { 4, 2, 3 }, { 4, 2, 4 }, { 4, 3, 0 },
+    { 4, 3, 1 }, { 4, 3, 2 }, { 4, 3, 3 }, { 4, 3, 4 },
+    { 4, 4, 0 }, { 4, 4, 1 }, { 4, 4, 2 }, { 4, 4, 3 },
+    { 4, 4, 4 }, { 2, 2, 2 }, { 2, 2, 2 }, { 2, 2, 2 }
+};
+
+const float ff_dca_rsd_level_2a[2] = {
+    -0.47, 0.47
+};
+
+const float ff_dca_rsd_level_2b[2] = {
+    -0.645, 0.645
+};
+
+const float ff_dca_rsd_level_3[3] = {
+    -0.645, 0.0, 0.645
+};
+
+const float ff_dca_rsd_level_5[5] = {
+    -0.875, -0.375, 0.0, 0.375, 0.875
+};
+
+const float ff_dca_rsd_level_8[8] = {
+    -1.0, -0.625, -0.291666667, 0.0, 0.25, 0.5, 0.75, 1.0
+};
+
+const float ff_dca_rsd_level_16[16] = {
+    -1.3125, -1.1375, -0.9625, -0.7875,
+    -0.6125, -0.4375, -0.2625, -0.0875,
+     0.0875,  0.2625,  0.4375,  0.6125,
+     0.7875,  0.9625,  1.1375,  1.3125
+};
+
+const float ff_dca_synth_env[32] = {
+    0.00240763666390, 0.00960735979838, 0.02152983213390, 0.03806023374436,
+    0.05903936782582, 0.08426519384873, 0.11349477331863, 0.14644660940673,
+    0.18280335791818, 0.22221488349020, 0.26430163158700, 0.30865828381746,
+    0.35485766137277, 0.40245483899194, 0.45099142983522, 0.5,
+    0.54900857016478, 0.59754516100806, 0.64514233862723, 0.69134171618254,
+    0.73569836841300, 0.77778511650980, 0.81719664208182, 0.85355339059327,
+    0.88650522668137, 0.91573480615127, 0.94096063217418, 0.96193976625564,
+    0.97847016786610, 0.99039264020162, 0.99759236333610, 1.0
+};
+
+const float ff_dca_corr_cf[32][11] = {
+    {-0.01179, 0.04281, 0.46712, 0.46345,-3.94525, 3.94525,
+     -0.46345,-0.46712,-0.04281, 0.01179,-0.00299 },
+    {-0.00929, 0.04882, 0.45252, 0.37972,-3.85446, 4.03189,
+     -0.55069,-0.48040,-0.03599, 0.01445,-0.00229 },
+    {-0.00696, 0.05403, 0.43674, 0.29961,-3.75975, 4.11413,
+     -0.64135,-0.49221,-0.02834, 0.01726,-0.00156 },
+    {-0.00481, 0.05847, 0.41993, 0.22319,-3.66138, 4.19175,
+     -0.73529,-0.50241,-0.01983, 0.02021,-0.00080 },
+    {-0.00284, 0.06216, 0.40224, 0.15053,-3.55963, 4.26452,
+     -0.83239,-0.51085,-0.01047, 0.02328,-0.00003 },
+    {-0.00105, 0.06515, 0.38378, 0.08168,-3.45475, 4.33225,
+     -0.93249,-0.51738,-0.00024, 0.02646, 0.00074 },
+    { 0.00054, 0.06745, 0.36471, 0.01668,-3.34703, 4.39475,
+     -1.03543,-0.52184, 0.01085, 0.02973, 0.00152 },
+    { 0.00195, 0.06912, 0.34515,-0.04445,-3.23676, 4.45185,
+     -1.14105,-0.52410, 0.02280, 0.03306, 0.00228 },
+    { 0.00318, 0.07017, 0.32521,-0.10168,-3.12422, 4.50339,
+     -1.24914,-0.52400, 0.03561, 0.03643, 0.00302 },
+    { 0.00422, 0.07065, 0.30503,-0.15503,-3.00969, 4.54921,
+     -1.35952,-0.52141, 0.04925, 0.03981, 0.00373 },
+    { 0.00508, 0.07061, 0.28471,-0.20450,-2.89348, 4.58919,
+     -1.47197,-0.51618, 0.06370, 0.04319, 0.00440 },
+    { 0.00577, 0.07007, 0.26436,-0.25013,-2.77587, 4.62321,
+     -1.58627,-0.50818, 0.07895, 0.04652, 0.00501 },
+    { 0.00629, 0.06909, 0.24410,-0.29194,-2.65716, 4.65118,
+     -1.70219,-0.49727, 0.09494, 0.04979, 0.00556 },
+    { 0.00666, 0.06769, 0.22400,-0.33000,-2.53764, 4.67302,
+     -1.81949,-0.48335, 0.11166, 0.05295, 0.00604 },
+    { 0.00687, 0.06592, 0.20416,-0.36435,-2.41760, 4.68866,
+     -1.93791,-0.46627, 0.12904, 0.05597, 0.00642 },
+    { 0.00694, 0.06383, 0.18468,-0.39506,-2.29732, 4.69806,
+     -2.05720,-0.44593, 0.14705, 0.05881, 0.00671 },
+    { 0.00689, 0.06144, 0.16561,-0.42223,-2.17710, 4.70120,
+     -2.17710,-0.42223, 0.16561, 0.06144, 0.00689 },
+    { 0.00671, 0.05881, 0.14705,-0.44593,-2.05720, 4.69806,
+     -2.29732,-0.39506, 0.18468, 0.06383, 0.00694 },
+    { 0.00642, 0.05597, 0.12904,-0.46627,-1.93791, 4.68865,
+     -2.41759,-0.36435, 0.20416, 0.06592, 0.00687 },
+    { 0.00604, 0.05295, 0.11166,-0.48334,-1.81949, 4.67301,
+     -2.53763,-0.33000, 0.22400, 0.06769, 0.00666 },
+    { 0.00556, 0.04979, 0.09494,-0.49727,-1.70219, 4.65117,
+     -2.65715,-0.29194, 0.24409, 0.06909, 0.00629 },
+    { 0.00501, 0.04652, 0.07894,-0.50818,-1.58627, 4.62321,
+     -2.77587,-0.25013, 0.26436, 0.07007, 0.00577 },
+    { 0.00440, 0.04319, 0.06370,-0.51618,-1.47197, 4.58919,
+     -2.89348,-0.20450, 0.28471, 0.07061, 0.00508 },
+    { 0.00373, 0.03981, 0.04925,-0.52141,-1.35952, 4.54921,
+     -3.00970,-0.15503, 0.30503, 0.07065, 0.00422 },
+    { 0.00302, 0.03643, 0.03561,-0.52400,-1.24915, 4.50339,
+     -3.12422,-0.10168, 0.32521, 0.07017, 0.00318 },
+    { 0.00228, 0.03306, 0.02280,-0.52410,-1.14105, 4.45186,
+     -3.23677,-0.04445, 0.34515, 0.06912, 0.00195 },
+    { 0.00152, 0.02973, 0.01085,-0.52184,-1.03544, 4.39477,
+     -3.34704, 0.01668, 0.36471, 0.06745, 0.00054 },
+    { 0.00074, 0.02646,-0.00024,-0.51738,-0.93249, 4.33226,
+     -3.45476, 0.08168, 0.38378, 0.06515,-0.00105 },
+    {-0.00003, 0.02328,-0.01047,-0.51085,-0.83239, 4.26452,
+     -3.55963, 0.15053, 0.40224, 0.06216,-0.00284 },
+    {-0.00080, 0.02021,-0.01983,-0.50241,-0.73529, 4.19174,
+     -3.66138, 0.22319, 0.41993, 0.05847,-0.00481 },
+    {-0.00156, 0.01726,-0.02834,-0.49221,-0.64135, 4.11413,
+     -3.75974, 0.29961, 0.43674, 0.05403,-0.00696 },
+    {-0.00229, 0.01445,-0.03599,-0.48040,-0.55069, 4.03188,
+     -3.85445, 0.37972, 0.45251, 0.04882,-0.00929 },
+};
+
+const float ff_dca_quant_amp[57] = {
+    4.88281250E-04, 1.46484375E-03, 2.32267031E-03, 3.28475167E-03,
+    4.64534014E-03, 6.56950334E-03, 9.29068029E-03, 1.31390067E-02,
+    1.85813606E-02, 2.62780134E-02, 3.71627212E-02, 5.25560267E-02,
+    7.43254423E-02, 1.05112053E-01, 1.48650885E-01, 2.10224107E-01,
+    2.97301769E-01, 4.20448214E-01, 5.94603539E-01, 8.40896428E-01,
+    1.18920708E+00, 1.68179286E+00, 2.37841415E+00, 3.36358571E+00,
+    4.75682831E+00, 6.72717142E+00, 9.51365662E+00, 1.34543428E+01,
+    1.90273132E+01, 2.69086857E+01, 3.80546265E+01, 5.38173714E+01,
+    7.61092529E+01, 1.07634743E+02, 1.52218506E+02, 2.15269485E+02,
+    3.04437012E+02, 4.30538971E+02, 6.08874023E+02, 8.61077942E+02,
+    1.21774805E+03, 1.72215588E+03, 2.43549609E+03, 3.44431177E+03,
+    4.87099219E+03, 6.88862354E+03, 9.74198438E+03, 1.37772471E+04,
+    1.94839688E+04, 2.75544941E+04, 3.89679375E+04, 5.51089883E+04,
+    7.79358750E+04, 1.10217977E+05, 1.55871750E+05, 2.20435953E+05,
+    0.00000000E+00,
+};
 
-const int8_t ff_dca_lfe_index[16] = {
-    1, 2, 2, 2, 2, 3, 2, 3, 2, 3, 2, 3, 1, 3, 2, 3
+const float ff_dca_st_coeff[34] = {
+    2.69086857E+01, 2.69086857E+01, 1.34543419E+01, 6.72717142E+00,
+    3.36358571E+00, 1.68179286E+00, 8.40896428E-01, 5.94603479E-01,
+    4.20448214E-01, 2.97301799E-01, 2.10224107E-01, 1.48650900E-01,
+    1.05112098E-01, 7.43253976E-02, 5.25560006E-02, 3.71626988E-02,
+    3.12500000E-02, 2.62780003E-02, 1.85813997E-02, 1.31390002E-02,
+    9.29069985E-03, 6.56950008E-03, 4.64530010E-03, 3.28480010E-03,
+    2.32270011E-03, 1.64240005E-03, 1.16130000E-03, 5.80699998E-04,
+    2.90299999E-04, 1.45200000E-04, 7.25999998E-05, 3.62999999E-05,
+    1.82000003E-05, 0.00000000E+00,
 };
 
-const int8_t ff_dca_channel_reorder_lfe[16][9] = {
-    { 0, -1, -1, -1, -1, -1, -1, -1, -1 },
-    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
-    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
-    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
-    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
-    { 2,  0,  1, -1, -1, -1, -1, -1, -1 },
-    { 0,  1,  3, -1, -1, -1, -1, -1, -1 },
-    { 2,  0,  1,  4, -1, -1, -1, -1, -1 },
-    { 0,  1,  3,  4, -1, -1, -1, -1, -1 },
-    { 2,  0,  1,  4,  5, -1, -1, -1, -1 },
-    { 3,  4,  0,  1,  5,  6, -1, -1, -1 },
-    { 2,  0,  1,  4,  5,  6, -1, -1, -1 },
-    { 0,  6,  4,  5,  2,  3, -1, -1, -1 },
-    { 4,  2,  5,  0,  1,  6,  7, -1, -1 },
-    { 5,  6,  0,  1,  7,  3,  8,  4, -1 },
-    { 4,  2,  5,  0,  1,  6,  8,  7, -1 },
+const float ff_dca_long_window[128] = {
+    0.00000000E+00, 7.42882412E-06, 5.28020973E-05, 1.71007006E-04,
+    3.96653224E-04, 7.63946096E-04, 1.30655791E-03, 2.05750111E-03,
+    3.04900459E-03, 4.31239139E-03, 5.87796280E-03, 7.77488295E-03,
+    1.00310687E-02, 1.26730874E-02, 1.57260559E-02, 1.92135461E-02,
+    2.31574941E-02, 2.75781266E-02, 3.24938744E-02, 3.79213169E-02,
+    4.38751020E-02, 5.03679104E-02, 5.74104004E-02, 6.50111660E-02,
+    7.31767192E-02, 8.19114447E-02, 9.12176073E-02, 1.01095326E-01,
+    1.11542597E-01, 1.22555278E-01, 1.34127125E-01, 1.46249816E-01,
+    1.58912972E-01, 1.72104210E-01, 1.85809180E-01, 2.00011641E-01,
+    2.14693516E-01, 2.29834959E-01, 2.45414421E-01, 2.61408776E-01,
+    2.77793378E-01, 2.94542134E-01, 3.11627686E-01, 3.29021394E-01,
+    3.46693635E-01, 3.64613682E-01, 3.82750064E-01, 4.01070446E-01,
+    4.19541985E-01, 4.38131332E-01, 4.56804723E-01, 4.75528270E-01,
+    4.94267941E-01, 5.12989700E-01, 5.31659782E-01, 5.50244689E-01,
+    5.68711281E-01, 5.87027133E-01, 6.05160415E-01, 6.23080134E-01,
+    6.40756190E-01, 6.58159554E-01, 6.75262392E-01, 6.92038059E-01,
+    7.08461344E-01, 7.24508464E-01, 7.40157187E-01, 7.55386829E-01,
+    7.70178556E-01, 7.84515142E-01, 7.98381269E-01, 8.11763465E-01,
+    8.24650168E-01, 8.37031603E-01, 8.48900259E-01, 8.60250235E-01,
+    8.71077836E-01, 8.81381273E-01, 8.91160548E-01, 9.00417745E-01,
+    9.09156621E-01, 9.17382956E-01, 9.25104082E-01, 9.32328999E-01,
+    9.39068437E-01, 9.45334494E-01, 9.51140642E-01, 9.56501782E-01,
+    9.61433768E-01, 9.65953648E-01, 9.70079303E-01, 9.73829389E-01,
+    9.77223217E-01, 9.80280578E-01, 9.83021557E-01, 9.85466540E-01,
+    9.87635851E-01, 9.89549816E-01, 9.91228402E-01, 9.92691338E-01,
+    9.93957877E-01, 9.95046616E-01, 9.95975435E-01, 9.96761382E-01,
+    9.97420728E-01, 9.97968733E-01, 9.98419642E-01, 9.98786569E-01,
+    9.99081731E-01, 9.99315977E-01, 9.99499321E-01, 9.99640644E-01,
+    9.99747574E-01, 9.99826968E-01, 9.99884665E-01, 9.99925494E-01,
+    9.99953628E-01, 9.99972343E-01, 9.99984324E-01, 9.99991655E-01,
+    9.99995887E-01, 9.99998152E-01, 9.99999285E-01, 9.99999762E-01,
+    9.99999940E-01, 1.00000000E+00, 1.00000000E+00, 1.00000000E+00,
 };
 
-const int8_t ff_dca_channel_reorder_lfe_xch[16][9] = {
-    { 0,  2, -1, -1, -1, -1, -1, -1, -1 },
-    { 0,  1,  3, -1, -1, -1, -1, -1, -1 },
-    { 0,  1,  3, -1, -1, -1, -1, -1, -1 },
-    { 0,  1,  3, -1, -1, -1, -1, -1, -1 },
-    { 0,  1,  3, -1, -1, -1, -1, -1, -1 },
-    { 2,  0,  1,  4, -1, -1, -1, -1, -1 },
-    { 0,  1,  3,  4, -1, -1, -1, -1, -1 },
-    { 2,  0,  1,  4,  5, -1, -1, -1, -1 },
-    { 0,  1,  4,  5,  3, -1, -1, -1, -1 },
-    { 2,  0,  1,  5,  6,  4, -1, -1, -1 },
-    { 3,  4,  0,  1,  6,  7,  5, -1, -1 },
-    { 2,  0,  1,  4,  5,  6,  7, -1, -1 },
-    { 0,  6,  4,  5,  2,  3,  7, -1, -1 },
-    { 4,  2,  5,  0,  1,  7,  8,  6, -1 },
-    { 5,  6,  0,  1,  8,  3,  9,  4,  7 },
-    { 4,  2,  5,  0,  1,  6,  9,  8,  7 },
+const float ff_dca_lfe_step_size_16[101] = {
+    2.1362956633198035E-004, 2.4414807580797754E-004, 2.7466658528397473E-004,
+    2.7466658528397473E-004, 3.0518509475997192E-004, 3.3570360423596911E-004,
+    3.9674062318796350E-004, 4.2725913266396069E-004, 4.5777764213995788E-004,
+    5.1881466109195227E-004, 5.7985168004394665E-004, 6.1037018951994385E-004,
+    6.7140720847193823E-004, 7.6296273689992981E-004, 8.2399975585192419E-004,
+    9.1555528427991577E-004, 1.0071108127079073E-003, 1.0986663411358989E-003,
+    1.2207403790398877E-003, 1.3428144169438765E-003, 1.4648884548478652E-003,
+    1.6174810022278512E-003, 1.7700735496078372E-003, 1.9531846064638203E-003,
+    2.1362956633198035E-003, 2.3499252296517838E-003, 2.5940733054597613E-003,
+    2.8687398907437361E-003, 3.1434064760277108E-003, 3.4485915707876827E-003,
+    3.7842951750236518E-003, 4.1810357982116153E-003, 4.6082949308755760E-003,
+    5.0660725730155339E-003, 5.5543687246314890E-003, 6.1037018951994385E-003,
+    6.7445905941953795E-003, 7.4159978026673177E-003, 8.1484420300912512E-003,
+    8.9419232764671782E-003, 9.8574785607470940E-003, 1.0834070863979004E-002,
+    1.1932737205114903E-002, 1.3122959074678793E-002, 1.4435254982146673E-002,
+    1.5869624927518540E-002, 1.7456587420270394E-002, 1.9196142460402233E-002,
+    2.1118808557390057E-002, 2.3224585711233862E-002, 2.5543992431409649E-002,
+    2.8107547227393413E-002, 3.0915250099185155E-002, 3.4028138065736867E-002,
+    3.7415692617572556E-002, 4.1169469283120215E-002, 4.5258949552903834E-002,
+    4.9806207464827418E-002, 5.4780724509414958E-002, 6.0274056215094456E-002,
+    6.6286202581865905E-002, 7.2908719138157288E-002, 8.0202642902920618E-002,
+    8.8229010895107887E-002, 9.7048860133671075E-002, 1.0675374614703818E-001,
+    1.1743522446363720E-001, 1.2918485061189611E-001, 1.4209418012024294E-001,
+    1.5628528702658162E-001, 1.7191076387829218E-001, 1.8912320322275461E-001,
+    2.0804467909787286E-001, 2.2882778405102694E-001, 2.5171666615802485E-001,
+    2.7689443647572254E-001, 3.0457472457045198E-001, 3.3503219702749720E-001,
+    3.6854152043214211E-001, 4.0537736136967073E-001, 4.4593646046327096E-001,
+    4.9052400280770286E-001, 5.3956724753563035E-001, 5.9352397228919340E-001,
+    6.5288247322000792E-001, 7.1816156498916595E-001, 7.9000213629566329E-001,
+    8.6898403881954400E-001, 9.5590075380718409E-001, 1.0514847254860074E+000,
+    1.1566209906308176E+000, 1.2722861415448470E+000, 1.3995178075502792E+000,
+    1.5394756920072024E+000, 1.6934110538041323E+000, 1.8627582628864405E+000,
+    2.0490432447279274E+000, 2.2539445173497725E+000, 2.4793237098300120E+000,
+    2.7272865993224893E+000, 3.0000000000000000E+000
 };
 
-const int8_t ff_dca_channel_reorder_nolfe[16][9] = {
-    { 0, -1, -1, -1, -1, -1, -1, -1, -1 },
-    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
-    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
-    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
-    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
-    { 2,  0,  1, -1, -1, -1, -1, -1, -1 },
-    { 0,  1,  2, -1, -1, -1, -1, -1, -1 },
-    { 2,  0,  1,  3, -1, -1, -1, -1, -1 },
-    { 0,  1,  2,  3, -1, -1, -1, -1, -1 },
-    { 2,  0,  1,  3,  4, -1, -1, -1, -1 },
-    { 2,  3,  0,  1,  4,  5, -1, -1, -1 },
-    { 2,  0,  1,  3,  4,  5, -1, -1, -1 },
-    { 0,  5,  3,  4,  1,  2, -1, -1, -1 },
-    { 3,  2,  4,  0,  1,  5,  6, -1, -1 },
-    { 4,  5,  0,  1,  6,  2,  7,  3, -1 },
-    { 3,  2,  4,  0,  1,  5,  7,  6, -1 },
+const float ff_dca_lfe_step_size_24[144] = {
+    3.5762791128491298E-006, 3.9339070241340428E-006, 4.4107442391805934E-006,
+    4.7683721504655064E-006, 5.2452093655120570E-006, 5.8412558843202453E-006,
+    6.4373024031284336E-006, 7.0333489219366219E-006, 7.7486047445064479E-006,
+    8.4638605670762738E-006, 9.4175349971693751E-006, 1.0252000123500839E-005,
+    1.1324883857355578E-005, 1.2516976894971954E-005, 1.3709069932588331E-005,
+    1.5139581577727983E-005, 1.6570093222867636E-005, 1.8239023475530564E-005,
+    2.0146372335716766E-005, 2.2053721195902969E-005, 2.4318697967374082E-005,
+    2.6702884042606836E-005, 2.9444698029124504E-005, 3.2305721319403807E-005,
+    3.5643581824729662E-005, 3.9100651633817152E-005, 4.3034558657951193E-005,
+    4.7326093593370149E-005, 5.2094465743835655E-005, 5.7339675109347712E-005,
+    6.3061721689906320E-005, 6.9379814789273121E-005, 7.6293954407448102E-005,
+    8.3923349848192912E-005, 9.2268001111507552E-005, 1.0156632680491529E-004,
+    1.1169911762465449E-004, 1.2290479217824841E-004, 1.3518335046569711E-004,
+    1.4865400179076216E-004, 1.6355516476096688E-004, 1.7988683937631122E-004,
+    1.9788744424431852E-004, 2.1767618866875036E-004, 2.3949149125713007E-004,
+    2.6345256131321922E-004, 2.8979781744454115E-004, 3.1876567825861912E-004,
+    3.5059456236297636E-004, 3.8564209766889782E-004, 4.2426591208766842E-004,
+    4.6670442422681142E-004, 5.1331526199761173E-004, 5.6469447191887759E-004,
+    6.2108047259813216E-004, 6.8318851985794547E-004, 7.5149545091336386E-004,
+    8.2671652158695713E-004, 9.0932856909377204E-004, 1.0002852678639017E-003,
+    1.1003018737199156E-003, 1.2103320610919071E-003, 1.3314487137137310E-003,
+    1.4646055060154803E-003, 1.6109945310347714E-003, 1.7721655097205054E-003,
+    1.9493105351102991E-003, 2.1442177467605765E-003, 2.3586752842277626E-003,
+    2.5945904963720436E-003, 2.8539899413573674E-003, 3.1393770145627278E-003,
+    3.4533743206708813E-003, 3.7987236736683454E-003, 4.1785245154529228E-003,
+    4.5963531251374630E-003, 5.0560242004423382E-003, 5.5617100669992049E-003,
+    6.1178214690472445E-003, 6.7296036159519689E-003, 7.4025401356864135E-003,
+    8.1428299120461841E-003, 8.9571486660419298E-003, 9.8527681652031147E-003,
+    1.0838033060793050E-002, 1.1921884050593860E-002, 1.3114096297513997E-002,
+    1.4425517848195773E-002, 1.5868069633015350E-002, 1.7454864675386508E-002,
+    1.9200327301064409E-002, 2.1120431556753107E-002, 2.3232462791498040E-002,
+    2.5555613703204836E-002, 2.8111222757246822E-002, 3.0922297349250002E-002,
+    3.4014586688826884E-002, 3.7415985753057691E-002, 4.1157608170224208E-002,
+    4.5273428591898514E-002, 4.9800759530157987E-002, 5.4780847404104160E-002,
+    6.0258872539862694E-002, 6.6284783635709721E-002, 7.2913297762071824E-002,
+    8.0204615617348624E-002, 8.8225017574431602E-002, 9.7047578936526643E-002,
+    1.0675228914645780E-001, 1.1742748229831246E-001, 1.2917031397465634E-001,
+    1.4208735729305236E-001, 1.5629603341770570E-001, 1.7192568444319778E-001,
+    1.8911816944100493E-001, 2.0803001022696618E-001, 2.2883310661710579E-001,
+    2.5171640535788598E-001, 2.7688804589367461E-001, 3.0457679087839018E-001,
+    3.3503452957088109E-001, 3.6853794676517804E-001, 4.0539174144169587E-001,
+    4.4593089174400469E-001, 4.9052399283933557E-001, 5.3957635636047796E-001,
+    5.9353406352210802E-001, 6.5288742219059737E-001, 7.1817609288407480E-001,
+    7.8999373793527339E-001, 8.6899314749159184E-001, 9.5589243839889027E-001,
+    1.0514817299225008E+000, 1.1566298194682383E+000, 1.2722928848615747E+000,
+    1.3995221137430804E+000, 1.5394743131964581E+000, 1.6934218041207556E+000,
+    1.8627639845328312E+000, 2.0490403233814627E+000, 2.2539444272451910E+000,
+    2.4793389414952922E+000, 2.7272728356448215E+000, 2.9999998807906962E+000
 };
 
-const int8_t ff_dca_channel_reorder_nolfe_xch[16][9] = {
-    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
-    { 0,  1,  2, -1, -1, -1, -1, -1, -1 },
-    { 0,  1,  2, -1, -1, -1, -1, -1, -1 },
-    { 0,  1,  2, -1, -1, -1, -1, -1, -1 },
-    { 0,  1,  2, -1, -1, -1, -1, -1, -1 },
-    { 2,  0,  1,  3, -1, -1, -1, -1, -1 },
-    { 0,  1,  2,  3, -1, -1, -1, -1, -1 },
-    { 2,  0,  1,  3,  4, -1, -1, -1, -1 },
-    { 0,  1,  3,  4,  2, -1, -1, -1, -1 },
-    { 2,  0,  1,  4,  5,  3, -1, -1, -1 },
-    { 2,  3,  0,  1,  5,  6,  4, -1, -1 },
-    { 2,  0,  1,  3,  4,  5,  6, -1, -1 },
-    { 0,  5,  3,  4,  1,  2,  6, -1, -1 },
-    { 3,  2,  4,  0,  1,  6,  7,  5, -1 },
-    { 4,  5,  0,  1,  7,  2,  8,  3,  6 },
-    { 3,  2,  4,  0,  1,  5,  8,  7,  6 },
+const float ff_dca_bank_coeff[10] = {
+    0.022810893, 0.41799772, 0.90844810, 0.99973983,
+    0.068974845, 0.34675997, 0.29396889, 0.19642374,
+    0.308658270, 0.038060233
 };
 
-const uint16_t ff_dca_vlc_offs[63] = {
-        0,   512,   640,   768,  1282,  1794,  2436,  3080,  3770,  4454,  5364,
-     5372,  5380,  5388,  5392,  5396,  5412,  5420,  5428,  5460,  5492,  5508,
-     5572,  5604,  5668,  5796,  5860,  5892,  6412,  6668,  6796,  7308,  7564,
-     7820,  8076,  8620,  9132,  9388,  9910, 10166, 10680, 11196, 11726, 12240,
-    12752, 13298, 13810, 14326, 14840, 15500, 16022, 16540, 17158, 17678, 18264,
-    18796, 19352, 19926, 20468, 21472, 22398, 23014, 23622,
+const float ff_dca_lfe_iir[5][4] = {
+    { -0.98618466, 1.9861259, 1.0, -1.9840510 },
+    { -0.98883152, 1.9887193, 1.0, -1.9979848 },
+    { -0.99252087, 1.9923381, 1.0, -1.9990897 },
+    { -0.99591690, 1.9956781, 1.0, -1.9993745 },
+    { -0.99872285, 1.9984550, 1.0, -1.9994639 }
 };
diff --git a/libavcodec/dcadata.h b/libavcodec/dcadata.h
index 0a3139e..17aa712 100644
--- a/libavcodec/dcadata.h
+++ b/libavcodec/dcadata.h
@@ -1,20 +1,20 @@
 /*
  * DCA compatible decoder data
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,13 +27,21 @@ extern const uint32_t ff_dca_bit_rates[32];
 
 extern const uint8_t ff_dca_channels[16];
 
-extern const uint8_t ff_dca_bits_per_sample[7];
+extern const uint8_t ff_dca_bits_per_sample[8];
+
+extern const uint8_t ff_dca_dmix_primary_nch[8];
 
 extern const int16_t ff_dca_adpcm_vb[4096][4];
 
 extern const uint32_t ff_dca_scale_factor_quant6[64];
 extern const uint32_t ff_dca_scale_factor_quant7[128];
 
+extern const uint32_t ff_dca_joint_scale_factors[129];
+
+extern const uint32_t ff_dca_scale_factor_adj[4];
+
+extern const uint32_t ff_dca_quant_levels[32];
+
 extern const uint32_t ff_dca_lossy_quant[32];
 
 extern const uint32_t ff_dca_lossless_quant[32];
@@ -45,26 +53,69 @@ extern const float ff_dca_fir_32bands_nonperfect[512];
 
 extern const float ff_dca_lfe_fir_64[256];
 extern const float ff_dca_lfe_fir_128[256];
-extern const float ff_dca_lfe_xll_fir_64[256];
 extern const float ff_dca_fir_64bands[1024];
 
-#define FF_DCA_DMIXTABLE_SIZE      242
-#define FF_DCA_INV_DMIXTABLE_SIZE  201
+extern const int32_t ff_dca_fir_32bands_perfect_fixed[512];
+extern const int32_t ff_dca_fir_32bands_nonperfect_fixed[512];
+extern const int32_t ff_dca_lfe_fir_64_fixed[256];
+extern const int32_t ff_dca_fir_64bands_fixed[1024];
+
+#define FF_DCA_DMIXTABLE_SIZE       242U
+#define FF_DCA_INV_DMIXTABLE_SIZE   201U
+#define FF_DCA_DMIXTABLE_OFFSET     (FF_DCA_DMIXTABLE_SIZE - FF_DCA_INV_DMIXTABLE_SIZE)
 
 extern const uint16_t ff_dca_dmixtable[FF_DCA_DMIXTABLE_SIZE];
 extern const uint32_t ff_dca_inv_dmixtable[FF_DCA_INV_DMIXTABLE_SIZE];
 
-extern const float ff_dca_default_coeffs[10][6][2];
+extern const uint16_t ff_dca_xll_refl_coeff[128];
+
+extern const int32_t ff_dca_xll_band_coeff[20];
+
+extern const uint16_t ff_dca_avg_g3_freqs[3];
+
+extern const uint16_t ff_dca_fst_amp[44];
+
+extern const uint8_t ff_dca_freq_to_sb[32];
+
+extern const int8_t ff_dca_ph0_shift[8];
+
+extern const uint8_t ff_dca_grid_1_to_scf[11];
+extern const uint8_t ff_dca_grid_2_to_scf[3];
+
+extern const uint8_t ff_dca_scf_to_grid_1[32];
+extern const uint8_t ff_dca_scf_to_grid_2[32];
+
+extern const uint8_t ff_dca_grid_1_weights[12][32];
+
+extern const uint8_t ff_dca_sb_reorder[8][8];
+
+extern const int8_t ff_dca_lfe_delta_index_16[8];
+extern const int8_t ff_dca_lfe_delta_index_24[32];
+
+extern const uint16_t ff_dca_rsd_pack_5_in_8[256];
+extern const uint8_t ff_dca_rsd_pack_3_in_7[128][3];
+
+extern const float ff_dca_rsd_level_2a[2];
+extern const float ff_dca_rsd_level_2b[2];
+extern const float ff_dca_rsd_level_3[3];
+extern const float ff_dca_rsd_level_5[5];
+extern const float ff_dca_rsd_level_8[8];
+extern const float ff_dca_rsd_level_16[16];
+
+extern const float ff_dca_synth_env[32];
+
+extern const float ff_dca_corr_cf[32][11];
+
+extern const float ff_dca_quant_amp[57];
 
-extern const int32_t ff_dca_sampling_freqs[16];
+extern const float ff_dca_st_coeff[34];
 
-extern const int8_t ff_dca_lfe_index[16];
+extern const float ff_dca_long_window[128];
 
-extern const int8_t ff_dca_channel_reorder_lfe[16][9];
-extern const int8_t ff_dca_channel_reorder_lfe_xch[16][9];
-extern const int8_t ff_dca_channel_reorder_nolfe[16][9];
-extern const int8_t ff_dca_channel_reorder_nolfe_xch[16][9];
+extern const float ff_dca_lfe_step_size_16[101];
+extern const float ff_dca_lfe_step_size_24[144];
 
-extern const uint16_t ff_dca_vlc_offs[63];
+extern const float ff_dca_bank_coeff[10];
+extern const float ff_dca_lfe_iir[5][4];
 
 #endif /* AVCODEC_DCADATA_H */
diff --git a/libavcodec/dcadct.c b/libavcodec/dcadct.c
new file mode 100644
index 0000000..1082aa8
--- /dev/null
+++ b/libavcodec/dcadct.c
@@ -0,0 +1,362 @@
+/*
+ * Copyright (C) 2016 foo86
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+
+#include "dcadct.h"
+#include "dcamath.h"
+
+/* Adds adjacent pairs: output[n] = input[2n] + input[2n + 1] for 0 <= n < len. */
+static void sum_a(const int *input, int *output, int len)
+{
+    int n = 0;
+    for (; n < len; n++, input += 2)
+        output[n] = input[0] + input[1];
+}
+
+/* output[0] = input[0]; output[n] = input[2n - 1] + input[2n] for 1 <= n < len. */
+static void sum_b(const int *input, int *output, int len)
+{
+    int n = 0;
+    *output = *input;
+    while (++n < len)
+        output[n] = input[2 * n - 1] + input[2 * n];
+}
+
+/* Keeps every second value: output[n] = input[2n] for 0 <= n < len. */
+static void sum_c(const int *input, int *output, int len)
+{
+    int n;
+    for (n = 0; n < len; n++)
+        output[n] = input[n + n];
+}
+
+/* output[0] = input[1]; output[n] = input[2n - 1] + input[2n + 1] for 1 <= n < len. */
+static void sum_d(const int *input, int *output, int len)
+{
+    int n = 0;
+    output[0] = input[1];
+    while (++n < len)
+        output[n] = input[2 * n - 1] + input[2 * n + 1];
+}
+
+/* Applies the fixed 8x8 cos_mod matrix: output[r] = norm23(sum over c of cos_mod[r][c] * input[c]). */
+static void dct_a(const int *input, int *output)
+{
+    static const int cos_mod[8][8] = {
+         { 8348215,  8027397,  7398092,  6484482,  5321677,  3954362,  2435084,   822227 },
+         { 8027397,  5321677,   822227, -3954362, -7398092, -8348215, -6484482, -2435084 },
+         { 7398092,   822227, -6484482, -8027397, -2435084,  5321677,  8348215,  3954362 },
+         { 6484482, -3954362, -8027397,   822227,  8348215,  2435084, -7398092, -5321677 },
+         { 5321677, -7398092, -2435084,  8348215,  -822227, -8027397,  3954362,  6484482 },
+         { 3954362, -8348215,  5321677,  2435084, -8027397,  6484482,   822227, -7398092 },
+         { 2435084, -6484482,  8348215, -7398092,  3954362,   822227, -5321677,  8027397 },
+         {  822227, -2435084,  3954362, -5321677,  6484482, -7398092,  8027397, -8348215 }
+    };
+    int row, col;
+    for (row = 0; row < 8; row++) {
+        const int *coeff = cos_mod[row];
+        int64_t acc = 0; /* the eight products are summed in 64 bits */
+        for (col = 0; col < 8; col++)
+            acc += (int64_t)coeff[col] * input[col];
+        output[row] = norm23(acc);
+    }
+}
+
+/* output[r] = norm23(input[0] * 2^23 + sum over c of cos_mod[r][c] * input[1 + c]), r = 0..7. */
+static void dct_b(const int *input, int *output)
+{
+    static const int cos_mod[8][7] = {
+        {  8227423,  7750063,  6974873,  5931642,  4660461,  3210181,  1636536 },
+        {  6974873,  3210181, -1636536, -5931642, -8227423, -7750063, -4660461 },
+        {  4660461, -3210181, -8227423, -5931642,  1636536,  7750063,  6974873 },
+        {  1636536, -7750063, -4660461,  5931642,  6974873, -3210181, -8227423 },
+        { -1636536, -7750063,  4660461,  5931642, -6974873, -3210181,  8227423 },
+        { -4660461, -3210181,  8227423, -5931642, -1636536,  7750063, -6974873 },
+        { -6974873,  3210181,  1636536, -5931642,  8227423, -7750063,  4660461 },
+        { -8227423,  7750063, -6974873,  5931642, -4660461,  3210181, -1636536 }
+    };
+    int row, col;
+    for (row = 0; row < 8; row++) {
+        const int *coeff = cos_mod[row];
+        int64_t acc = input[0] * (INT64_C(1) << 23); /* first input enters every row with weight 2^23 */
+        for (col = 0; col < 7; col++)
+            acc += (int64_t)coeff[col] * input[col + 1];
+        output[row] = norm23(acc);
+    }
+}
+
+/* 16-value butterfly: sums of input[n], input[n + 8] and mirrored differences, each scaled via mul23(). */
+static void mod_a(const int *input, int *output)
+{
+    static const int cos_mod[16] = {
+          4199362,   4240198,   4323885,   4454708,
+          4639772,   4890013,   5221943,   5660703,
+         -6245623,  -7040975,  -8158494,  -9809974,
+        -12450076, -17261920, -28585092, -85479984
+    };
+    int n;
+
+    for (n = 0; n < 8; n++)
+        output[n] = mul23(cos_mod[n], input[n] + input[n + 8]);
+    /* upper half takes the pairs in reverse order: pair 7 first, pair 0 last */
+    for (n = 0; n < 8; n++)
+        output[8 + n] = mul23(cos_mod[8 + n], input[7 - n] - input[15 - n]);
+}
+
+/* 16-value butterfly that first scales input[8..15] in place, then writes sums and mirrored differences. */
+static void mod_b(int *input, int *output)
+{
+    static const int cos_mod[8] = {
+        4214598,  4383036,  4755871,  5425934,
+        6611520,  8897610, 14448934, 42791536
+    };
+    int n;
+
+    /* destructive on the caller's buffer: the upper half of input is overwritten */
+    for (n = 8; n < 16; n++)
+        input[n] = mul23(cos_mod[n - 8], input[n]);
+
+    for (n = 0; n < 8; n++)
+        output[n] = input[n] + input[n + 8];
+    for (n = 0; n < 8; n++)
+        output[8 + n] = input[7 - n] - input[15 - n];
+}
+
+/* 32-value butterfly: sums of input[n], input[n + 16] and mirrored differences, each scaled via mul23(). */
+static void mod_c(const int *input, int *output)
+{
+    static const int cos_mod[32] = {
+         1048892,  1051425,   1056522,   1064244,
+         1074689,  1087987,   1104313,   1123884,
+         1146975,  1173922,   1205139,   1241133,
+         1282529,  1330095,   1384791,   1447815,
+        -1520688, -1605358,  -1704360,  -1821051,
+        -1959964, -2127368,  -2332183,  -2587535,
+        -2913561, -3342802,  -3931480,  -4785806,
+        -6133390, -8566050, -14253820, -42727120
+    };
+    int n;
+
+    for (n = 0; n < 16; n++)
+        output[n] = mul23(cos_mod[n], input[n] + input[n + 16]);
+    /* upper half takes the pairs in reverse order: pair 15 first, pair 0 last */
+    for (n = 0; n < 16; n++)
+        output[16 + n] = mul23(cos_mod[16 + n], input[15 - n] - input[31 - n]);
+}
+
+/* Replaces each of the first len entries of input with its clip23() value. */
+static void clp_v(int *input, int len)
+{
+    int n = -1;
+    while (++n < len)
+        input[n] = clip23(input[n]);
+}
+/* 32-point fixed-point transform: input[0..31] -> output[0..31]; ff_dcadct_init() stores it in imdct_half[0]. */
+static void imdct_half_32(int32_t *output, const int32_t *input)
+{
+    int buf_a[32], buf_b[32];
+    int i, k, mag, shift, round;
+    /* the summed input magnitude decides whether the stages run on a scaled-down copy */
+    mag = 0;
+    for (i = 0; i < 32; i++)
+        mag += abs(input[i]);
+
+    shift = mag > 0x400000 ? 2 : 0;
+    round = shift > 0 ? 1 << (shift - 1) : 0;
+    /* load the working buffer with a rounded right shift by `shift` bits (plain copy when 0) */
+    for (i = 0; i < 32; i++)
+        buf_a[i] = (input[i] + round) >> shift;
+    /* stages alternate between buf_a and buf_b; clp_v() re-clips with clip23() between them */
+    sum_a(buf_a, buf_b +  0, 16);
+    sum_b(buf_a, buf_b + 16, 16);
+    clp_v(buf_b, 32);
+
+    sum_a(buf_b +  0, buf_a +  0, 8);
+    sum_b(buf_b +  0, buf_a +  8, 8);
+    sum_c(buf_b + 16, buf_a + 16, 8);
+    sum_d(buf_b + 16, buf_a + 24, 8);
+    clp_v(buf_a, 32);
+
+    dct_a(buf_a +  0, buf_b +  0);
+    dct_b(buf_a +  8, buf_b +  8);
+    dct_b(buf_a + 16, buf_b + 16);
+    dct_b(buf_a + 24, buf_b + 24);
+    clp_v(buf_b, 32);
+
+    mod_a(buf_b +  0, buf_a +  0);
+    mod_b(buf_b + 16, buf_a + 16);
+    clp_v(buf_a, 32);
+
+    mod_c(buf_a, buf_b);
+    /* undo the initial down-scaling (multiply by 1 when shift == 0), then clip */
+    for (i = 0; i < 32; i++)
+        buf_b[i] = clip23(buf_b[i] * (1 << shift));
+    /* final butterfly pairs entry i with entry 31 - i: differences first, sums second */
+    for (i = 0, k = 31; i < 16; i++, k--) {
+        output[     i] = clip23(buf_b[i] - buf_b[k]);
+        output[16 + i] = clip23(buf_b[i] + buf_b[k]);
+    }
+}
+
+/* 32-value butterfly of the 64-point path: sums and mirrored differences, each scaled via mul23(). */
+static void mod64_a(const int *input, int *output)
+{
+    static const int cos_mod[32] = {
+          4195568,   4205700,   4226086,    4256977,
+          4298755,   4351949,   4417251,    4495537,
+          4587901,   4695690,   4820557,    4964534,
+          5130115,   5320382,   5539164,    5791261,
+         -6082752,  -6421430,  -6817439,   -7284203,
+         -7839855,  -8509474,  -9328732,  -10350140,
+        -11654242, -13371208, -15725922,  -19143224,
+        -24533560, -34264200, -57015280, -170908480
+    };
+    int n;
+
+    for (n = 0; n < 16; n++)
+        output[n] = mul23(cos_mod[n], input[n] + input[n + 16]);
+    /* upper half takes the pairs in reverse order: pair 15 first, pair 0 last */
+    for (n = 0; n < 16; n++)
+        output[16 + n] = mul23(cos_mod[16 + n], input[15 - n] - input[31 - n]);
+}
+
+/* 32-value butterfly that first scales input[16..31] in place, then writes sums and mirrored differences. */
+static void mod64_b(int *input, int *output)
+{
+    static const int cos_mod[16] = {
+         4199362,  4240198,  4323885,  4454708,
+         4639772,  4890013,  5221943,  5660703,
+         6245623,  7040975,  8158494,  9809974,
+        12450076, 17261920, 28585092, 85479984
+    };
+    int n;
+
+    /* destructive on the caller's buffer: the upper half of input is overwritten */
+    for (n = 16; n < 32; n++)
+        input[n] = mul23(cos_mod[n - 16], input[n]);
+
+    for (n = 0; n < 16; n++)
+        output[n] = input[n] + input[n + 16];
+    for (n = 0; n < 16; n++)
+        output[16 + n] = input[15 - n] - input[31 - n];
+}
+
+/* 64-value butterfly: sums of input[n], input[n + 32] and mirrored differences, each scaled via mul23(). */
+static void mod64_c(const int *input, int *output)
+{
+    static const int cos_mod[64] = {
+          741511,    741958,    742853,    744199,
+          746001,    748262,    750992,    754197,
+          757888,    762077,    766777,    772003,
+          777772,    784105,    791021,    798546,
+          806707,    815532,    825054,    835311,
+          846342,    858193,    870912,    884554,
+          899181,    914860,    931667,    949686,
+          969011,    989747,   1012012,   1035941,
+        -1061684,  -1089412,  -1119320,  -1151629,
+        -1186595,  -1224511,  -1265719,  -1310613,
+        -1359657,  -1413400,  -1472490,  -1537703,
+        -1609974,  -1690442,  -1780506,  -1881904,
+        -1996824,  -2128058,  -2279225,  -2455101,
+        -2662128,  -2909200,  -3208956,  -3579983,
+        -4050785,  -4667404,  -5509372,  -6726913,
+        -8641940, -12091426, -20144284, -60420720
+    };
+    int n;
+
+    for (n = 0; n < 32; n++)
+        output[n] = mul23(cos_mod[n], input[n] + input[n + 32]);
+    /* upper half takes the pairs in reverse order: pair 31 first, pair 0 last */
+    for (n = 0; n < 32; n++)
+        output[32 + n] = mul23(cos_mod[32 + n], input[31 - n] - input[63 - n]);
+}
+/* 64-point fixed-point transform: input[0..63] -> output[0..63]; ff_dcadct_init() stores it in imdct_half[1]. */
+static void imdct_half_64(int32_t *output, const int32_t *input)
+{
+    int buf_a[64], buf_b[64];
+    int i, k, mag, shift, round;
+    /* the summed input magnitude decides whether the stages run on a scaled-down copy */
+    mag = 0;
+    for (i = 0; i < 64; i++)
+        mag += abs(input[i]);
+
+    shift = mag > 0x400000 ? 2 : 0;
+    round = shift > 0 ? 1 << (shift - 1) : 0;
+    /* load the working buffer with a rounded right shift by `shift` bits (plain copy when 0) */
+    for (i = 0; i < 64; i++)
+        buf_a[i] = (input[i] + round) >> shift;
+    /* stages alternate between buf_a and buf_b; clp_v() re-clips with clip23() between them */
+    sum_a(buf_a, buf_b +  0, 32);
+    sum_b(buf_a, buf_b + 32, 32);
+    clp_v(buf_b, 64);
+
+    sum_a(buf_b +  0, buf_a +  0, 16);
+    sum_b(buf_b +  0, buf_a + 16, 16);
+    sum_c(buf_b + 32, buf_a + 32, 16);
+    sum_d(buf_b + 32, buf_a + 48, 16);
+    clp_v(buf_a, 64);
+
+    sum_a(buf_a +  0, buf_b +  0, 8);
+    sum_b(buf_a +  0, buf_b +  8, 8);
+    sum_c(buf_a + 16, buf_b + 16, 8);
+    sum_d(buf_a + 16, buf_b + 24, 8);
+    sum_c(buf_a + 32, buf_b + 32, 8);
+    sum_d(buf_a + 32, buf_b + 40, 8);
+    sum_c(buf_a + 48, buf_b + 48, 8);
+    sum_d(buf_a + 48, buf_b + 56, 8);
+    clp_v(buf_b, 64);
+
+    dct_a(buf_b +  0, buf_a +  0);
+    dct_b(buf_b +  8, buf_a +  8);
+    dct_b(buf_b + 16, buf_a + 16);
+    dct_b(buf_b + 24, buf_a + 24);
+    dct_b(buf_b + 32, buf_a + 32);
+    dct_b(buf_b + 40, buf_a + 40);
+    dct_b(buf_b + 48, buf_a + 48);
+    dct_b(buf_b + 56, buf_a + 56);
+    clp_v(buf_a, 64);
+
+    mod_a(buf_a +  0, buf_b +  0);
+    mod_b(buf_a + 16, buf_b + 16);
+    mod_b(buf_a + 32, buf_b + 32);
+    mod_b(buf_a + 48, buf_b + 48);
+    clp_v(buf_b, 64);
+
+    mod64_a(buf_b +  0, buf_a +  0);
+    mod64_b(buf_b + 32, buf_a + 32);
+    clp_v(buf_a, 64);
+
+    mod64_c(buf_a, buf_b);
+    /* undo the initial down-scaling (multiply by 1 when shift == 0), then clip */
+    for (i = 0; i < 64; i++)
+        buf_b[i] = clip23(buf_b[i] * (1 << shift));
+    /* final butterfly pairs entry i with entry 63 - i: differences first, sums second */
+    for (i = 0, k = 63; i < 32; i++, k--) {
+        output[     i] = clip23(buf_b[i] - buf_b[k]);
+        output[32 + i] = clip23(buf_b[i] + buf_b[k]);
+    }
+}
+/* Fills the function table: slot 0 gets the 32-point routine, slot 1 the 64-point routine. */
+av_cold void ff_dcadct_init(DCADCTContext *c)
+{
+    c->imdct_half[0] = imdct_half_32;
+    c->imdct_half[1] = imdct_half_64;
+}
diff --git a/libavcodec/dcadct.h b/libavcodec/dcadct.h
new file mode 100644
index 0000000..518c9f9
--- /dev/null
+++ b/libavcodec/dcadct.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2016 foo86
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DCADCT_H
+#define AVCODEC_DCADCT_H
+
+#include "libavutil/common.h"
+/* Transform function table; ff_dcadct_init() in dcadct.c sets [0] = imdct_half_32 and [1] = imdct_half_64. */
+typedef struct DCADCTContext {
+    void (*imdct_half[2])(int32_t *output, const int32_t *input);
+} DCADCTContext;
+
+av_cold void ff_dcadct_init(DCADCTContext *c);
+
+#endif
diff --git a/libavcodec/dcadec.c b/libavcodec/dcadec.c
index cd44323..4146a85 100644
--- a/libavcodec/dcadec.c
+++ b/libavcodec/dcadec.c
@@ -1,1601 +1,400 @@
 /*
- * DCA compatible decoder
- * Copyright (C) 2004 Gildas Bazin
- * Copyright (C) 2004 Benjamin Zores
- * Copyright (C) 2006 Benjamin Larsson
- * Copyright (C) 2007 Konstantin Shishkov
- * Copyright (C) 2012 Paul B Mahol
- * Copyright (C) 2014 Niels Möller
+ * Copyright (C) 2016 foo86
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <math.h>
-#include <stddef.h>
-#include <stdio.h>
-
-#include "libavutil/attributes.h"
-#include "libavutil/channel_layout.h"
-#include "libavutil/common.h"
-#include "libavutil/float_dsp.h"
-#include "libavutil/internal.h"
-#include "libavutil/intreadwrite.h"
-#include "libavutil/mathematics.h"
 #include "libavutil/opt.h"
-#include "libavutil/samplefmt.h"
+#include "libavutil/channel_layout.h"
 
-#include "avcodec.h"
-#include "dca.h"
-#include "dca_syncwords.h"
-#include "dcadata.h"
-#include "dcadsp.h"
+#include "dcadec.h"
 #include "dcahuff.h"
-#include "fft.h"
-#include "fmtconvert.h"
-#include "get_bits.h"
-#include "internal.h"
-#include "mathops.h"
+#include "dca_syncwords.h"
 #include "profiles.h"
-#include "put_bits.h"
-#include "synth_filter.h"
-
-#if ARCH_ARM
-#   include "arm/dca.h"
-#endif
-
-enum DCAMode {
-    DCA_MONO = 0,
-    DCA_CHANNEL,
-    DCA_STEREO,
-    DCA_STEREO_SUMDIFF,
-    DCA_STEREO_TOTAL,
-    DCA_3F,
-    DCA_2F1R,
-    DCA_3F1R,
-    DCA_2F2R,
-    DCA_3F2R,
-    DCA_4F2R
-};
-
-/* -1 are reserved or unknown */
-static const int dca_ext_audio_descr_mask[] = {
-    DCA_EXT_XCH,
-    -1,
-    DCA_EXT_X96,
-    DCA_EXT_XCH | DCA_EXT_X96,
-    -1,
-    -1,
-    DCA_EXT_XXCH,
-    -1,
-};
-
-/* Tables for mapping dts channel configurations to libavcodec multichannel api.
- * Some compromises have been made for special configurations. Most configurations
- * are never used so complete accuracy is not needed.
- *
- * L = left, R = right, C = center, S = surround, F = front, R = rear, T = total, OV = overhead.
- * S  -> side, when both rear and back are configured move one of them to the side channel
- * OV -> center back
- * All 2 channel configurations -> AV_CH_LAYOUT_STEREO
- */
-static const uint64_t dca_core_channel_layout[] = {
-    AV_CH_FRONT_CENTER,                                                     ///< 1, A
-    AV_CH_LAYOUT_STEREO,                                                    ///< 2, A + B (dual mono)
-    AV_CH_LAYOUT_STEREO,                                                    ///< 2, L + R (stereo)
-    AV_CH_LAYOUT_STEREO,                                                    ///< 2, (L + R) + (L - R) (sum-difference)
-    AV_CH_LAYOUT_STEREO,                                                    ///< 2, LT + RT (left and right total)
-    AV_CH_LAYOUT_STEREO | AV_CH_FRONT_CENTER,                               ///< 3, C + L + R
-    AV_CH_LAYOUT_STEREO | AV_CH_BACK_CENTER,                                ///< 3, L + R + S
-    AV_CH_LAYOUT_STEREO | AV_CH_FRONT_CENTER | AV_CH_BACK_CENTER,           ///< 4, C + L + R + S
-    AV_CH_LAYOUT_STEREO | AV_CH_SIDE_LEFT | AV_CH_SIDE_RIGHT,               ///< 4, L + R + SL + SR
-
-    AV_CH_LAYOUT_STEREO | AV_CH_FRONT_CENTER | AV_CH_SIDE_LEFT |
-    AV_CH_SIDE_RIGHT,                                                       ///< 5, C + L + R + SL + SR
-
-    AV_CH_LAYOUT_STEREO | AV_CH_SIDE_LEFT | AV_CH_SIDE_RIGHT |
-    AV_CH_FRONT_LEFT_OF_CENTER | AV_CH_FRONT_RIGHT_OF_CENTER,               ///< 6, CL + CR + L + R + SL + SR
-
-    AV_CH_LAYOUT_STEREO | AV_CH_BACK_LEFT | AV_CH_BACK_RIGHT |
-    AV_CH_FRONT_CENTER  | AV_CH_BACK_CENTER,                                ///< 6, C + L + R + LR + RR + OV
-
-    AV_CH_FRONT_CENTER | AV_CH_FRONT_RIGHT_OF_CENTER |
-    AV_CH_FRONT_LEFT_OF_CENTER | AV_CH_BACK_CENTER   |
-    AV_CH_BACK_LEFT | AV_CH_BACK_RIGHT,                                     ///< 6, CF + CR + LF + RF + LR + RR
-
-    AV_CH_FRONT_LEFT_OF_CENTER | AV_CH_FRONT_CENTER   |
-    AV_CH_FRONT_RIGHT_OF_CENTER | AV_CH_LAYOUT_STEREO |
-    AV_CH_SIDE_LEFT | AV_CH_SIDE_RIGHT,                                     ///< 7, CL + C + CR + L + R + SL + SR
-
-    AV_CH_FRONT_LEFT_OF_CENTER | AV_CH_FRONT_RIGHT_OF_CENTER |
-    AV_CH_LAYOUT_STEREO | AV_CH_SIDE_LEFT | AV_CH_SIDE_RIGHT |
-    AV_CH_BACK_LEFT | AV_CH_BACK_RIGHT,                                     ///< 8, CL + CR + L + R + SL1 + SL2 + SR1 + SR2
-
-    AV_CH_FRONT_LEFT_OF_CENTER | AV_CH_FRONT_CENTER   |
-    AV_CH_FRONT_RIGHT_OF_CENTER | AV_CH_LAYOUT_STEREO |
-    AV_CH_SIDE_LEFT | AV_CH_BACK_CENTER | AV_CH_SIDE_RIGHT,                 ///< 8, CL + C + CR + L + R + SL + S + SR
-};
-
-#define DCA_DOLBY                  101           /* FIXME */
-
-#define DCA_CHANNEL_BITS             6
-#define DCA_CHANNEL_MASK          0x3F
-
-#define DCA_LFE                   0x80
 
-#define HEADER_SIZE                 14
+#define MIN_PACKET_SIZE     16
+#define MAX_PACKET_SIZE     0x104000
 
-#define DCA_NSYNCAUX        0x9A1105A0
-
-/** Bit allocation */
-typedef struct BitAlloc {
-    int offset;                 ///< code values offset
-    int maxbits[8];             ///< max bits in VLC
-    int wrap;                   ///< wrap for get_vlc2()
-    VLC vlc[8];                 ///< actual codes
-} BitAlloc;
-
-static BitAlloc dca_bitalloc_index;    ///< indexes for samples VLC select
-static BitAlloc dca_tmode;             ///< transition mode VLCs
-static BitAlloc dca_scalefactor;       ///< scalefactor VLCs
-static BitAlloc dca_smpl_bitalloc[11]; ///< samples VLCs
-
-static av_always_inline int get_bitalloc(GetBitContext *gb, BitAlloc *ba,
-                                         int idx)
+int ff_dca_set_channel_layout(AVCodecContext *avctx, int *ch_remap, int dca_mask) // fills ch_remap[], sets avctx->channel_layout and avctx->channels, returns the channel count
 {
-    return get_vlc2(gb, ba->vlc[idx].table, ba->vlc[idx].bits, ba->wrap) +
-           ba->offset;
-}
-
-static av_cold void dca_init_vlcs(void)
-{
-    static int vlcs_initialized = 0;
-    int i, j, c = 14;
-    static VLC_TYPE dca_table[23622][2];
-
-    if (vlcs_initialized)
-        return;
-
-    dca_bitalloc_index.offset = 1;
-    dca_bitalloc_index.wrap   = 2;
-    for (i = 0; i < 5; i++) {
-        dca_bitalloc_index.vlc[i].table           = &dca_table[ff_dca_vlc_offs[i]];
-        dca_bitalloc_index.vlc[i].table_allocated = ff_dca_vlc_offs[i + 1] - ff_dca_vlc_offs[i];
-        init_vlc(&dca_bitalloc_index.vlc[i], bitalloc_12_vlc_bits[i], 12,
-                 bitalloc_12_bits[i], 1, 1,
-                 bitalloc_12_codes[i], 2, 2, INIT_VLC_USE_NEW_STATIC);
-    }
-    dca_scalefactor.offset = -64;
-    dca_scalefactor.wrap   = 2;
-    for (i = 0; i < 5; i++) {
-        dca_scalefactor.vlc[i].table           = &dca_table[ff_dca_vlc_offs[i + 5]];
-        dca_scalefactor.vlc[i].table_allocated = ff_dca_vlc_offs[i + 6] - ff_dca_vlc_offs[i + 5];
-        init_vlc(&dca_scalefactor.vlc[i], SCALES_VLC_BITS, 129,
-                 scales_bits[i], 1, 1,
-                 scales_codes[i], 2, 2, INIT_VLC_USE_NEW_STATIC);
-    }
-    dca_tmode.offset = 0;
-    dca_tmode.wrap   = 1;
-    for (i = 0; i < 4; i++) {
-        dca_tmode.vlc[i].table           = &dca_table[ff_dca_vlc_offs[i + 10]];
-        dca_tmode.vlc[i].table_allocated = ff_dca_vlc_offs[i + 11] - ff_dca_vlc_offs[i + 10];
-        init_vlc(&dca_tmode.vlc[i], tmode_vlc_bits[i], 4,
-                 tmode_bits[i], 1, 1,
-                 tmode_codes[i], 2, 2, INIT_VLC_USE_NEW_STATIC);
+    static const uint8_t dca2wav_norm[28] = { // DCA speaker index -> bit position in the output layout mask (default table)
+         2,  0, 1, 9, 10,  3,  8,  4,  5,  9, 10, 6, 7, 12,
+        13, 14, 3, 6,  7, 11, 12, 14, 16, 15, 17, 8, 4,  5,
+    };
+
+    static const uint8_t dca2wav_wide[28] = { // same mapping, selected only for the two *_WIDE masks tested below
+         2,  0, 1, 4,  5,  3,  8,  4,  5,  9, 10, 6, 7, 12,
+        13, 14, 3, 9, 10, 11, 12, 14, 16, 15, 17, 8, 4,  5,
+    };
+
+    int dca_ch, wav_ch, nchannels = 0;
+
+    if (avctx->request_channel_layout & AV_CH_LAYOUT_NATIVE) { // native flag set: keep DCA speaker order, no remapping through the tables
+        for (dca_ch = 0; dca_ch < DCA_SPEAKER_COUNT; dca_ch++)
+            if (dca_mask & (1U << dca_ch))
+                ch_remap[nchannels++] = dca_ch;
+        avctx->channel_layout = dca_mask; // the DCA mask itself is stored as the layout in this mode
+    } else {
+        int wav_mask = 0;
+        int wav_map[18]; // output bit position -> DCA speaker index; only entries whose bit is set in wav_mask are written
+        const uint8_t *dca2wav;
+        if (dca_mask == DCA_SPEAKER_LAYOUT_7POINT0_WIDE ||
+            dca_mask == DCA_SPEAKER_LAYOUT_7POINT1_WIDE)
+            dca2wav = dca2wav_wide;
+        else
+            dca2wav = dca2wav_norm;
+        for (dca_ch = 0; dca_ch < 28; dca_ch++) {
+            if (dca_mask & (1 << dca_ch)) {
+                wav_ch = dca2wav[dca_ch];
+                if (!(wav_mask & (1 << wav_ch))) { // first DCA speaker to claim an output bit wins; later ones mapping to the same bit are dropped
+                    wav_map[wav_ch] = dca_ch;
+                    wav_mask |= 1 << wav_ch;
+                }
+            }
+        }
+        for (wav_ch = 0; wav_ch < 18; wav_ch++) // emit remap entries in ascending output-bit order
+            if (wav_mask & (1 << wav_ch))
+                ch_remap[nchannels++] = wav_map[wav_ch];
+        avctx->channel_layout = wav_mask;
     }
 
-    for (i = 0; i < 10; i++)
-        for (j = 0; j < 7; j++) {
-            if (!bitalloc_codes[i][j])
-                break;
-            dca_smpl_bitalloc[i + 1].offset                 = bitalloc_offsets[i];
-            dca_smpl_bitalloc[i + 1].wrap                   = 1 + (j > 4);
-            dca_smpl_bitalloc[i + 1].vlc[j].table           = &dca_table[ff_dca_vlc_offs[c]];
-            dca_smpl_bitalloc[i + 1].vlc[j].table_allocated = ff_dca_vlc_offs[c + 1] - ff_dca_vlc_offs[c];
-
-            init_vlc(&dca_smpl_bitalloc[i + 1].vlc[j], bitalloc_maxbits[i][j],
-                     bitalloc_sizes[i],
-                     bitalloc_bits[i][j], 1, 1,
-                     bitalloc_codes[i][j], 2, 2, INIT_VLC_USE_NEW_STATIC);
-            c++;
-        }
-    vlcs_initialized = 1;
+    avctx->channels = nchannels;
+    return nchannels;
 }
 
-static inline void get_array(GetBitContext *gb, int *dst, int len, int bits)
+void ff_dca_downmix_to_stereo_fixed(DCADSPContext *dcadsp, int32_t **samples,
+                                    int *coeff_l, int nsamples, int ch_mask) // downmixes into samples[DCA_SPEAKER_L] / samples[DCA_SPEAKER_R] through the dcadsp hooks
 {
-    while (len--)
-        *dst++ = get_bits(gb, bits);
-}
+    int pos, spkr, max_spkr = av_log2(ch_mask); // max_spkr bounds the speaker loop below
+    int *coeff_r = coeff_l + av_popcount(ch_mask); // right-channel coefficients start av_popcount(ch_mask) entries after the left-channel ones
 
-static int dca_parse_audio_coding_header(DCAContext *s, int base_channel)
-{
-    int i, j;
-    static const uint8_t adj_table[4] = { 16, 18, 20, 23 };
-    static const int bitlen[11] = { 0, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3 };
-    static const int thr[11]    = { 0, 1, 3, 3, 3, 3, 7, 7, 7, 7, 7 };
+    av_assert0(DCA_HAS_STEREO(ch_mask)); // L and R are accessed unconditionally below
 
-    s->audio_header.total_channels = get_bits(&s->gb, 3) + 1 + base_channel;
-    s->audio_header.prim_channels  = s->audio_header.total_channels;
+    // Scale left and right channels by their own coefficients
+    pos = (ch_mask & DCA_SPEAKER_MASK_C); // masked value is used directly as an index; NOTE(review): assumes DCA_SPEAKER_MASK_C is bit 0 so pos is 0 or 1 — confirm
+    dcadsp->dmix_scale(samples[DCA_SPEAKER_L], coeff_l[pos    ], nsamples);
+    dcadsp->dmix_scale(samples[DCA_SPEAKER_R], coeff_r[pos + 1], nsamples);
 
-    if (s->audio_header.prim_channels > DCA_PRIM_CHANNELS_MAX)
-        s->audio_header.prim_channels = DCA_PRIM_CHANNELS_MAX;
+    // Downmix remaining channels; coeff_l/coeff_r advance once per speaker present in ch_mask
+    for (spkr = 0; spkr <= max_spkr; spkr++) {
+        if (!(ch_mask & (1U << spkr)))
+            continue; // absent speakers consume no coefficient slot
 
-    for (i = base_channel; i < s->audio_header.prim_channels; i++) {
-        s->audio_header.subband_activity[i] = get_bits(&s->gb, 5) + 2;
-        if (s->audio_header.subband_activity[i] > DCA_SUBBANDS)
-            s->audio_header.subband_activity[i] = DCA_SUBBANDS;
-    }
-    for (i = base_channel; i < s->audio_header.prim_channels; i++) {
-        s->audio_header.vq_start_subband[i] = get_bits(&s->gb, 5) + 1;
-        if (s->audio_header.vq_start_subband[i] > DCA_SUBBANDS)
-            s->audio_header.vq_start_subband[i] = DCA_SUBBANDS;
-    }
-    get_array(&s->gb, s->audio_header.joint_intensity + base_channel,
-              s->audio_header.prim_channels - base_channel, 3);
-    get_array(&s->gb, s->audio_header.transient_huffman + base_channel,
-              s->audio_header.prim_channels - base_channel, 2);
-    get_array(&s->gb, s->audio_header.scalefactor_huffman + base_channel,
-              s->audio_header.prim_channels - base_channel, 3);
-    get_array(&s->gb, s->audio_header.bitalloc_huffman + base_channel,
-              s->audio_header.prim_channels - base_channel, 3);
-
-    /* Get codebooks quantization indexes */
-    if (!base_channel)
-        memset(s->audio_header.quant_index_huffman, 0, sizeof(s->audio_header.quant_index_huffman));
-    for (j = 1; j < 11; j++)
-        for (i = base_channel; i < s->audio_header.prim_channels; i++)
-            s->audio_header.quant_index_huffman[i][j] = get_bits(&s->gb, bitlen[j]);
-
-    /* Get scale factor adjustment */
-    for (j = 0; j < 11; j++)
-        for (i = base_channel; i < s->audio_header.prim_channels; i++)
-            s->audio_header.scalefactor_adj[i][j] = 16;
-
-    for (j = 1; j < 11; j++)
-        for (i = base_channel; i < s->audio_header.prim_channels; i++)
-            if (s->audio_header.quant_index_huffman[i][j] < thr[j])
-                s->audio_header.scalefactor_adj[i][j] = adj_table[get_bits(&s->gb, 2)];
-
-    if (s->crc_present) {
-        /* Audio header CRC check */
-        get_bits(&s->gb, 16);
-    }
+        if (*coeff_l && spkr != DCA_SPEAKER_L) // skip zero coefficients; L itself was already scaled above
+            dcadsp->dmix_add(samples[DCA_SPEAKER_L], samples[spkr],
+                             *coeff_l, nsamples);
 
-    s->current_subframe    = 0;
-    s->current_subsubframe = 0;
+        if (*coeff_r && spkr != DCA_SPEAKER_R) // skip zero coefficients; R itself was already scaled above
+            dcadsp->dmix_add(samples[DCA_SPEAKER_R], samples[spkr],
+                             *coeff_r, nsamples);
 
-    return 0;
+        coeff_l++;
+        coeff_r++;
+    }
 }
 
-static int dca_parse_frame_header(DCAContext *s)
+void ff_dca_downmix_to_stereo_float(AVFloatDSPContext *fdsp, float **samples,
+                                    int *coeff_l, int nsamples, int ch_mask) // float variant: downmixes into samples[DCA_SPEAKER_L] / samples[DCA_SPEAKER_R] through fdsp
 {
-    init_get_bits(&s->gb, s->dca_buffer, s->dca_buffer_size * 8);
-
-    /* Sync code */
-    skip_bits_long(&s->gb, 32);
-
-    /* Frame header */
-    s->frame_type        = get_bits(&s->gb, 1);
-    s->samples_deficit   = get_bits(&s->gb, 5) + 1;
-    s->crc_present       = get_bits(&s->gb, 1);
-    s->sample_blocks     = get_bits(&s->gb, 7) + 1;
-    s->frame_size        = get_bits(&s->gb, 14) + 1;
-    if (s->frame_size < 95)
-        return AVERROR_INVALIDDATA;
-    s->amode             = get_bits(&s->gb, 6);
-    s->sample_rate       = avpriv_dca_sample_rates[get_bits(&s->gb, 4)];
-    if (!s->sample_rate)
-        return AVERROR_INVALIDDATA;
-    s->bit_rate_index    = get_bits(&s->gb, 5);
-    s->bit_rate          = ff_dca_bit_rates[s->bit_rate_index];
-    if (!s->bit_rate)
-        return AVERROR_INVALIDDATA;
+    int pos, spkr, max_spkr = av_log2(ch_mask); // max_spkr bounds the speaker loop below
+    int *coeff_r = coeff_l + av_popcount(ch_mask); // right-channel coefficients start av_popcount(ch_mask) entries after the left-channel ones
+    const float scale = 1.0f / (1 << 15); // every integer coefficient is multiplied by 2^-15 before it is applied
 
-    skip_bits1(&s->gb); // always 0 (reserved, cf. ETSI TS 102 114 V1.4.1)
-    s->dynrange          = get_bits(&s->gb, 1);
-    s->timestamp         = get_bits(&s->gb, 1);
-    s->aux_data          = get_bits(&s->gb, 1);
-    s->hdcd              = get_bits(&s->gb, 1);
-    s->ext_descr         = get_bits(&s->gb, 3);
-    s->ext_coding        = get_bits(&s->gb, 1);
-    s->aspf              = get_bits(&s->gb, 1);
-    s->lfe               = get_bits(&s->gb, 2);
-    s->predictor_history = get_bits(&s->gb, 1);
-
-    if (s->lfe > 2) {
-        av_log(s->avctx, AV_LOG_ERROR, "Invalid LFE value: %d\n", s->lfe);
-        return AVERROR_INVALIDDATA;
-    }
+    av_assert0(DCA_HAS_STEREO(ch_mask)); // L and R are accessed unconditionally below
 
-    /* TODO: check CRC */
-    if (s->crc_present)
-        s->header_crc    = get_bits(&s->gb, 16);
+    // Scale left and right channels in place (the same buffer is passed as destination and source)
+    pos = (ch_mask & DCA_SPEAKER_MASK_C); // masked value is used directly as an index; NOTE(review): assumes DCA_SPEAKER_MASK_C is bit 0 so pos is 0 or 1 — confirm
+    fdsp->vector_fmul_scalar(samples[DCA_SPEAKER_L], samples[DCA_SPEAKER_L],
+                             coeff_l[pos    ] * scale, nsamples);
+    fdsp->vector_fmul_scalar(samples[DCA_SPEAKER_R], samples[DCA_SPEAKER_R],
+                             coeff_r[pos + 1] * scale, nsamples);
 
-    s->multirate_inter   = get_bits(&s->gb, 1);
-    s->version           = get_bits(&s->gb, 4);
-    s->copy_history      = get_bits(&s->gb, 2);
-    s->source_pcm_res    = get_bits(&s->gb, 3);
-    s->front_sum         = get_bits(&s->gb, 1);
-    s->surround_sum      = get_bits(&s->gb, 1);
-    s->dialog_norm       = get_bits(&s->gb, 4);
+    // Downmix remaining channels; coeff_l/coeff_r advance once per speaker present in ch_mask
+    for (spkr = 0; spkr <= max_spkr; spkr++) {
+        if (!(ch_mask & (1U << spkr)))
+            continue; // absent speakers consume no coefficient slot
 
-    /* FIXME: channels mixing levels */
-    s->output = s->amode;
-    if (s->lfe)
-        s->output |= DCA_LFE;
+        if (*coeff_l && spkr != DCA_SPEAKER_L) // skip zero coefficients; L itself was already scaled above
+            fdsp->vector_fmac_scalar(samples[DCA_SPEAKER_L], samples[spkr],
+                                     *coeff_l * scale, nsamples);
 
-    /* Primary audio coding header */
-    s->audio_header.subframes = get_bits(&s->gb, 4) + 1;
+        if (*coeff_r && spkr != DCA_SPEAKER_R) // skip zero coefficients; R itself was already scaled above
+            fdsp->vector_fmac_scalar(samples[DCA_SPEAKER_R], samples[spkr],
+                                     *coeff_r * scale, nsamples);
 
-    return dca_parse_audio_coding_header(s, 0);
-}
-
-static inline int get_scale(GetBitContext *gb, int level, int value, int log2range)
-{
-    if (level < 5) {
-        /* huffman encoded */
-        value += get_bitalloc(gb, &dca_scalefactor, level);
-        value  = av_clip(value, 0, (1 << log2range) - 1);
-    } else if (level < 8) {
-        if (level + 1 > log2range) {
-            skip_bits(gb, level + 1 - log2range);
-            value = get_bits(gb, log2range);
-        } else {
-            value = get_bits(gb, level + 1);
-        }
+        coeff_l++;
+        coeff_r++;
     }
-    return value;
 }
 
-static int dca_subframe_header(DCAContext *s, int base_channel, int block_index)
+static int dcadec_decode_frame(AVCodecContext *avctx, void *data,
+                               int *got_frame_ptr, AVPacket *avpkt)
 {
-    /* Primary audio coding side information */
-    int j, k;
-
-    if (get_bits_left(&s->gb) < 0)
+    DCAContext *s = avctx->priv_data;
+    AVFrame *frame = data;
+    uint8_t *input = avpkt->data;
+    int input_size = avpkt->size;
+    int i, ret, prev_packet = s->packet;
+    uint32_t mrk;
+
+    if (input_size < MIN_PACKET_SIZE || input_size > MAX_PACKET_SIZE) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid packet size\n");
         return AVERROR_INVALIDDATA;
-
-    if (!base_channel) {
-        s->subsubframes[s->current_subframe]    = get_bits(&s->gb, 2) + 1;
-        s->partial_samples[s->current_subframe] = get_bits(&s->gb, 3);
-    }
-
-    for (j = base_channel; j < s->audio_header.prim_channels; j++) {
-        for (k = 0; k < s->audio_header.subband_activity[j]; k++)
-            s->dca_chan[j].prediction_mode[k] = get_bits(&s->gb, 1);
     }
 
-    /* Get prediction codebook */
-    for (j = base_channel; j < s->audio_header.prim_channels; j++) {
-        for (k = 0; k < s->audio_header.subband_activity[j]; k++) {
-            if (s->dca_chan[j].prediction_mode[k] > 0) {
-                /* (Prediction coefficient VQ address) */
-                s->dca_chan[j].prediction_vq[k] = get_bits(&s->gb, 12);
-            }
-        }
-    }
+    // Convert input to BE format
+    mrk = AV_RB32(input);
+    if (mrk != DCA_SYNCWORD_CORE_BE && mrk != DCA_SYNCWORD_SUBSTREAM) {
+        av_fast_padded_malloc(&s->buffer, &s->buffer_size, input_size);
+        if (!s->buffer)
+            return AVERROR(ENOMEM);
 
-    /* Bit allocation index */
-    for (j = base_channel; j < s->audio_header.prim_channels; j++) {
-        for (k = 0; k < s->audio_header.vq_start_subband[j]; k++) {
-            if (s->audio_header.bitalloc_huffman[j] == 6)
-                s->dca_chan[j].bitalloc[k] = get_bits(&s->gb, 5);
-            else if (s->audio_header.bitalloc_huffman[j] == 5)
-                s->dca_chan[j].bitalloc[k] = get_bits(&s->gb, 4);
-            else if (s->audio_header.bitalloc_huffman[j] == 7) {
-                av_log(s->avctx, AV_LOG_ERROR,
-                       "Invalid bit allocation index\n");
-                return AVERROR_INVALIDDATA;
-            } else {
-                s->dca_chan[j].bitalloc[k] =
-                    get_bitalloc(&s->gb, &dca_bitalloc_index, s->audio_header.bitalloc_huffman[j]);
-            }
+        for (i = 0, ret = AVERROR_INVALIDDATA; i < input_size - MIN_PACKET_SIZE + 1 && ret < 0; i++)
+            ret = avpriv_dca_convert_bitstream(input + i, input_size - i, s->buffer, s->buffer_size);
 
-            if (s->dca_chan[j].bitalloc[k] > 26) {
-                ff_dlog(s->avctx, "bitalloc index [%i][%i] too big (%i)\n",
-                        j, k, s->dca_chan[j].bitalloc[k]);
-                return AVERROR_INVALIDDATA;
-            }
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Not a valid DCA frame\n");
+            return ret;
         }
-    }
 
-    /* Transition mode */
-    for (j = base_channel; j < s->audio_header.prim_channels; j++) {
-        for (k = 0; k < s->audio_header.subband_activity[j]; k++) {
-            s->dca_chan[j].transition_mode[k] = 0;
-            if (s->subsubframes[s->current_subframe] > 1 &&
-                k < s->audio_header.vq_start_subband[j] && s->dca_chan[j].bitalloc[k] > 0) {
-                s->dca_chan[j].transition_mode[k] =
-                    get_bitalloc(&s->gb, &dca_tmode, s->audio_header.transient_huffman[j]);
-            }
-        }
+        input      = s->buffer;
+        input_size = ret;
     }
 
-    if (get_bits_left(&s->gb) < 0)
-        return AVERROR_INVALIDDATA;
-
-    for (j = base_channel; j < s->audio_header.prim_channels; j++) {
-        const uint32_t *scale_table;
-        int scale_sum, log_size;
-
-        memset(s->dca_chan[j].scale_factor, 0,
-               s->audio_header.subband_activity[j] * sizeof(s->dca_chan[j].scale_factor[0][0]) * 2);
+    s->packet = 0;
 
-        if (s->audio_header.scalefactor_huffman[j] == 6) {
-            scale_table = ff_dca_scale_factor_quant7;
-            log_size    = 7;
-        } else {
-            scale_table = ff_dca_scale_factor_quant6;
-            log_size    = 6;
-        }
+    // Parse backward compatible core sub-stream
+    if (AV_RB32(input) == DCA_SYNCWORD_CORE_BE) {
+        int frame_size;
 
-        /* When huffman coded, only the difference is encoded */
-        scale_sum = 0;
+        if ((ret = ff_dca_core_parse(&s->core, input, input_size)) < 0)
+            return ret;
 
-        for (k = 0; k < s->audio_header.subband_activity[j]; k++) {
-            if (k >= s->audio_header.vq_start_subband[j] || s->dca_chan[j].bitalloc[k] > 0) {
-                scale_sum = get_scale(&s->gb, s->audio_header.scalefactor_huffman[j], scale_sum, log_size);
-                s->dca_chan[j].scale_factor[k][0] = scale_table[scale_sum];
-            }
+        s->packet |= DCA_PACKET_CORE;
 
-            if (k < s->audio_header.vq_start_subband[j] && s->dca_chan[j].transition_mode[k]) {
-                /* Get second scale factor */
-                scale_sum = get_scale(&s->gb, s->audio_header.scalefactor_huffman[j], scale_sum, log_size);
-                s->dca_chan[j].scale_factor[k][1] = scale_table[scale_sum];
-            }
+        // EXXS data must be aligned on 4-byte boundary
+        frame_size = FFALIGN(s->core.frame_size, 4);
+        if (input_size - 4 > frame_size) {
+            input      += frame_size;
+            input_size -= frame_size;
         }
     }
 
-    /* Joint subband scale factor codebook select */
-    for (j = base_channel; j < s->audio_header.prim_channels; j++) {
-        /* Transmitted only if joint subband coding enabled */
-        if (s->audio_header.joint_intensity[j] > 0)
-            s->dca_chan[j].joint_huff = get_bits(&s->gb, 3);
-    }
+    if (!s->core_only) {
+        DCAExssAsset *asset = NULL;
 
-    if (get_bits_left(&s->gb) < 0)
-        return AVERROR_INVALIDDATA;
-
-    /* Scale factors for joint subband coding */
-    for (j = base_channel; j < s->audio_header.prim_channels; j++) {
-        int source_channel;
-
-        /* Transmitted only if joint subband coding enabled */
-        if (s->audio_header.joint_intensity[j] > 0) {
-            int scale = 0;
-            source_channel = s->audio_header.joint_intensity[j] - 1;
-
-            /* When huffman coded, only the difference is encoded
-             * (is this valid as well for joint scales ???) */
-
-            for (k = s->audio_header.subband_activity[j];
-                 k < s->audio_header.subband_activity[source_channel]; k++) {
-                scale = get_scale(&s->gb, s->dca_chan[j].joint_huff, 64 /* bias */, 7);
-                s->dca_chan[j].joint_scale_factor[k] = scale;    /*joint_scale_table[scale]; */
-            }
-
-            if (!(s->debug_flag & 0x02)) {
-                av_log(s->avctx, AV_LOG_DEBUG,
-                       "Joint stereo coding not supported\n");
-                s->debug_flag |= 0x02;
+        // Parse extension sub-stream (EXSS)
+        if (AV_RB32(input) == DCA_SYNCWORD_SUBSTREAM) {
+            if ((ret = ff_dca_exss_parse(&s->exss, input, input_size)) < 0) {
+                if (avctx->err_recognition & AV_EF_EXPLODE)
+                    return ret;
+            } else {
+                s->packet |= DCA_PACKET_EXSS;
+                asset = &s->exss.assets[0];
             }
         }
-    }
 
-    /* Dynamic range coefficient */
-    if (!base_channel && s->dynrange)
-        s->dynrange_coef = get_bits(&s->gb, 8);
-
-    /* Side information CRC check word */
-    if (s->crc_present) {
-        get_bits(&s->gb, 16);
-    }
-
-    /*
-     * Primary audio data arrays
-     */
-
-    /* VQ encoded high frequency subbands */
-    for (j = base_channel; j < s->audio_header.prim_channels; j++)
-        for (k = s->audio_header.vq_start_subband[j]; k < s->audio_header.subband_activity[j]; k++)
-            /* 1 vector -> 32 samples */
-            s->dca_chan[j].high_freq_vq[k] = get_bits(&s->gb, 10);
-
-    /* Low frequency effect data */
-    if (!base_channel && s->lfe) {
-        /* LFE samples */
-        int lfe_samples    = 2 * s->lfe * (4 + block_index);
-        int lfe_end_sample = 2 * s->lfe * (4 + block_index + s->subsubframes[s->current_subframe]);
-        float lfe_scale;
-
-        for (j = lfe_samples; j < lfe_end_sample; j++) {
-            /* Signed 8 bits int */
-            s->lfe_data[j] = get_sbits(&s->gb, 8);
-        }
-
-        /* Scale factor index */
-        skip_bits(&s->gb, 1);
-        s->lfe_scale_factor = ff_dca_scale_factor_quant7[get_bits(&s->gb, 7)];
-
-        /* Quantization step size * scale factor */
-        lfe_scale = 0.035 * s->lfe_scale_factor;
-
-        for (j = lfe_samples; j < lfe_end_sample; j++)
-            s->lfe_data[j] *= lfe_scale;
-    }
-
-    return 0;
-}
-
-static void qmf_32_subbands(DCAContext *s, int chans,
-                            float samples_in[DCA_SUBBANDS][SAMPLES_PER_SUBBAND], float *samples_out,
-                            float scale)
-{
-    const float *prCoeff;
-
-    int sb_act = s->audio_header.subband_activity[chans];
-
-    scale *= sqrt(1 / 8.0);
-
-    /* Select filter */
-    if (!s->multirate_inter)    /* Non-perfect reconstruction */
-        prCoeff = ff_dca_fir_32bands_nonperfect;
-    else                        /* Perfect reconstruction */
-        prCoeff = ff_dca_fir_32bands_perfect;
-
-    s->dcadsp.qmf_32_subbands(samples_in, sb_act, &s->synth, &s->imdct,
-                              s->dca_chan[chans].subband_fir_hist,
-                              &s->dca_chan[chans].hist_index,
-                              s->dca_chan[chans].subband_fir_noidea, prCoeff,
-                              samples_out, s->raXin, scale);
-}
-
-static QMF64_table *qmf64_precompute(void)
-{
-    unsigned i, j;
-    QMF64_table *table = av_malloc(sizeof(*table));
-    if (!table)
-        return NULL;
-
-    for (i = 0; i < 32; i++)
-        for (j = 0; j < 32; j++)
-            table->dct4_coeff[i][j] = cos((2 * i + 1) * (2 * j + 1) * M_PI / 128);
-    for (i = 0; i < 32; i++)
-        for (j = 0; j < 32; j++)
-            table->dct2_coeff[i][j] = cos((2 * i + 1) *      j      * M_PI /  64);
-
-    /* FIXME: Is the factor 0.125 = 1/8 right? */
-    for (i = 0; i < 32; i++)
-        table->rcos[i] =  0.125 / cos((2 * i + 1) * M_PI / 256);
-    for (i = 0; i < 32; i++)
-        table->rsin[i] = -0.125 / sin((2 * i + 1) * M_PI / 256);
-
-    return table;
-}
-
-/* FIXME: Totally unoptimized. Based on the reference code and
- * http://multimedia.cx/mirror/dca-transform.pdf, with guessed tweaks
- * for doubling the size. */
-static void qmf_64_subbands(DCAContext *s, int chans,
-                            float samples_in[DCA_SUBBANDS_X96K][SAMPLES_PER_SUBBAND],
-                            float *samples_out, float scale)
-{
-    float raXin[64];
-    float A[32], B[32];
-    float *raX = s->dca_chan[chans].subband_fir_hist;
-    float *raZ = s->dca_chan[chans].subband_fir_noidea;
-    unsigned i, j, k, subindex;
-
-    for (i = s->audio_header.subband_activity[chans]; i < DCA_SUBBANDS_X96K; i++)
-        raXin[i] = 0.0;
-    for (subindex = 0; subindex < SAMPLES_PER_SUBBAND; subindex++) {
-        for (i = 0; i < s->audio_header.subband_activity[chans]; i++)
-            raXin[i] = samples_in[i][subindex];
-
-        for (k = 0; k < 32; k++) {
-            A[k] = 0.0;
-            for (i = 0; i < 32; i++)
-                A[k] += (raXin[2 * i] + raXin[2 * i + 1]) * s->qmf64_table->dct4_coeff[k][i];
-        }
-        for (k = 0; k < 32; k++) {
-            B[k] = raXin[0] * s->qmf64_table->dct2_coeff[k][0];
-            for (i = 1; i < 32; i++)
-                B[k] += (raXin[2 * i] + raXin[2 * i - 1]) * s->qmf64_table->dct2_coeff[k][i];
-        }
-        for (k = 0; k < 32; k++) {
-            raX[k]      = s->qmf64_table->rcos[k] * (A[k] + B[k]);
-            raX[63 - k] = s->qmf64_table->rsin[k] * (A[k] - B[k]);
-        }
-
-        for (i = 0; i < DCA_SUBBANDS_X96K; i++) {
-            float out = raZ[i];
-            for (j = 0; j < 1024; j += 128)
-                out += ff_dca_fir_64bands[j + i] * (raX[j + i] - raX[j + 63 - i]);
-            *samples_out++ = out * scale;
-        }
-
-        for (i = 0; i < DCA_SUBBANDS_X96K; i++) {
-            float hist = 0.0;
-            for (j = 0; j < 1024; j += 128)
-                hist += ff_dca_fir_64bands[64 + j + i] * (-raX[i + j] - raX[j + 63 - i]);
-
-            raZ[i] = hist;
-        }
-
-        /* FIXME: Make buffer circular, to avoid this move. */
-        memmove(raX + 64, raX, (1024 - 64) * sizeof(*raX));
-    }
-}
-
-static void lfe_interpolation_fir(DCAContext *s, const float *samples_in,
-                                  float *samples_out)
-{
-    /* samples_in: An array holding decimated samples.
-     *   Samples in current subframe starts from samples_in[0],
-     *   while samples_in[-1], samples_in[-2], ..., stores samples
-     *   from last subframe as history.
-     *
-     * samples_out: An array holding interpolated samples
-     */
-
-    int idx;
-    const float *prCoeff;
-    int deciindex;
-
-    /* Select decimation filter */
-    if (s->lfe == 1) {
-        idx     = 1;
-        prCoeff = ff_dca_lfe_fir_128;
-    } else {
-        idx = 0;
-        if (s->exss_ext_mask & DCA_EXT_EXSS_XLL)
-            prCoeff = ff_dca_lfe_xll_fir_64;
-        else
-            prCoeff = ff_dca_lfe_fir_64;
-    }
-    /* Interpolation */
-    for (deciindex = 0; deciindex < 2 * s->lfe; deciindex++) {
-        s->dcadsp.lfe_fir[idx](samples_out, samples_in, prCoeff);
-        samples_in++;
-        samples_out += 2 * 32 * (1 + idx);
-    }
-}
-
-/* downmixing routines */
-#define MIX_REAR1(samples, s1, rs, coef)            \
-    samples[0][i] += samples[s1][i] * coef[rs][0];  \
-    samples[1][i] += samples[s1][i] * coef[rs][1];
-
-#define MIX_REAR2(samples, s1, s2, rs, coef)                                          \
-    samples[0][i] += samples[s1][i] * coef[rs][0] + samples[s2][i] * coef[rs + 1][0]; \
-    samples[1][i] += samples[s1][i] * coef[rs][1] + samples[s2][i] * coef[rs + 1][1];
-
-#define MIX_FRONT3(samples, coef)                                      \
-    t = samples[c][i];                                                 \
-    u = samples[l][i];                                                 \
-    v = samples[r][i];                                                 \
-    samples[0][i] = t * coef[0][0] + u * coef[1][0] + v * coef[2][0];  \
-    samples[1][i] = t * coef[0][1] + u * coef[1][1] + v * coef[2][1];
-
-#define DOWNMIX_TO_STEREO(op1, op2)             \
-    for (i = 0; i < 256; i++) {                 \
-        op1                                     \
-        op2                                     \
-    }
-
-static void dca_downmix(float **samples, int srcfmt, int lfe_present,
-                        float coef[DCA_PRIM_CHANNELS_MAX + 1][2],
-                        const int8_t *channel_mapping)
-{
-    int c, l, r, sl, sr, s;
-    int i;
-    float t, u, v;
-
-    switch (srcfmt) {
-    case DCA_MONO:
-    case DCA_4F2R:
-        av_log(NULL, 0, "Not implemented!\n");
-        break;
-    case DCA_CHANNEL:
-    case DCA_STEREO:
-    case DCA_STEREO_TOTAL:
-    case DCA_STEREO_SUMDIFF:
-        break;
-    case DCA_3F:
-        c = channel_mapping[0];
-        l = channel_mapping[1];
-        r = channel_mapping[2];
-        DOWNMIX_TO_STEREO(MIX_FRONT3(samples, coef), );
-        break;
-    case DCA_2F1R:
-        s = channel_mapping[2];
-        DOWNMIX_TO_STEREO(MIX_REAR1(samples, s, 2, coef), );
-        break;
-    case DCA_3F1R:
-        c = channel_mapping[0];
-        l = channel_mapping[1];
-        r = channel_mapping[2];
-        s = channel_mapping[3];
-        DOWNMIX_TO_STEREO(MIX_FRONT3(samples, coef),
-                          MIX_REAR1(samples, s, 3, coef));
-        break;
-    case DCA_2F2R:
-        sl = channel_mapping[2];
-        sr = channel_mapping[3];
-        DOWNMIX_TO_STEREO(MIX_REAR2(samples, sl, sr, 2, coef), );
-        break;
-    case DCA_3F2R:
-        c  = channel_mapping[0];
-        l  = channel_mapping[1];
-        r  = channel_mapping[2];
-        sl = channel_mapping[3];
-        sr = channel_mapping[4];
-        DOWNMIX_TO_STEREO(MIX_FRONT3(samples, coef),
-                          MIX_REAR2(samples, sl, sr, 3, coef));
-        break;
-    }
-    if (lfe_present) {
-        int lf_buf = ff_dca_lfe_index[srcfmt];
-        int lf_idx =  ff_dca_channels[srcfmt];
-        for (i = 0; i < 256; i++) {
-            samples[0][i] += samples[lf_buf][i] * coef[lf_idx][0];
-            samples[1][i] += samples[lf_buf][i] * coef[lf_idx][1];
-        }
-    }
-}
-
-#ifndef decode_blockcodes
-/* Very compact version of the block code decoder that does not use table
- * look-up but is slightly slower */
-static int decode_blockcode(int code, int levels, int32_t *values)
-{
-    int i;
-    int offset = (levels - 1) >> 1;
-
-    for (i = 0; i < 4; i++) {
-        int div = FASTDIV(code, levels);
-        values[i] = code - offset - div * levels;
-        code      = div;
-    }
-
-    return code;
-}
-
-static int decode_blockcodes(int code1, int code2, int levels, int32_t *values)
-{
-    return decode_blockcode(code1, levels, values) |
-           decode_blockcode(code2, levels, values + 4);
-}
-#endif
-
-static const uint8_t abits_sizes[7]  = { 7, 10, 12, 13, 15, 17, 19 };
-static const uint8_t abits_levels[7] = { 3,  5,  7,  9, 13, 17, 25 };
-
-static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
-{
-    int k, l;
-    int subsubframe = s->current_subsubframe;
-    const uint32_t *quant_step_table;
-
-    /*
-     * Audio data
-     */
-
-    /* Select quantization step size table */
-    if (s->bit_rate_index == 0x1f)
-        quant_step_table = ff_dca_lossless_quant;
-    else
-        quant_step_table = ff_dca_lossy_quant;
-
-    for (k = base_channel; k < s->audio_header.prim_channels; k++) {
-        int32_t (*subband_samples)[8] = s->dca_chan[k].subband_samples[block_index];
-
-        if (get_bits_left(&s->gb) < 0)
-            return AVERROR_INVALIDDATA;
-
-        for (l = 0; l < s->audio_header.vq_start_subband[k]; l++) {
-            int m;
-
-            /* Select the mid-tread linear quantizer */
-            int abits = s->dca_chan[k].bitalloc[l];
-
-            uint32_t quant_step_size = quant_step_table[abits];
-
-            /*
-             * Extract bits from the bit stream
-             */
-            if (!abits)
-                memset(subband_samples[l], 0, SAMPLES_PER_SUBBAND *
-                       sizeof(subband_samples[l][0]));
-            else {
-                uint32_t rscale;
-                /* Deal with transients */
-                int sfi = s->dca_chan[k].transition_mode[l] &&
-                    subsubframe >= s->dca_chan[k].transition_mode[l];
-                /* Determine quantization index code book and its type.
-                   Select quantization index code book */
-                int sel = s->audio_header.quant_index_huffman[k][abits];
-
-                rscale = (s->dca_chan[k].scale_factor[l][sfi] *
-                          s->audio_header.scalefactor_adj[k][sel] + 8) >> 4;
-
-                if (abits >= 11 || !dca_smpl_bitalloc[abits].vlc[sel].table) {
-                    if (abits <= 7) {
-                        /* Block code */
-                        int block_code1, block_code2, size, levels, err;
-
-                        size   = abits_sizes[abits - 1];
-                        levels = abits_levels[abits - 1];
-
-                        block_code1 = get_bits(&s->gb, size);
-                        block_code2 = get_bits(&s->gb, size);
-                        err         = decode_blockcodes(block_code1, block_code2,
-                                                        levels, subband_samples[l]);
-                        if (err) {
-                            av_log(s->avctx, AV_LOG_ERROR,
-                                   "ERROR: block code look-up failed\n");
-                            return AVERROR_INVALIDDATA;
-                        }
-                    } else {
-                        /* no coding */
-                        for (m = 0; m < SAMPLES_PER_SUBBAND; m++)
-                            subband_samples[l][m] = get_sbits(&s->gb, abits - 3);
-                    }
-                } else {
-                    /* Huffman coded */
-                    for (m = 0; m < SAMPLES_PER_SUBBAND; m++)
-                        subband_samples[l][m] = get_bitalloc(&s->gb,
-                                                             &dca_smpl_bitalloc[abits], sel);
-                }
-                s->dcadsp.dequantize(subband_samples[l], quant_step_size, rscale);
+        // Parse XLL component in EXSS
+        if (asset && (asset->extension_mask & DCA_EXSS_XLL)) {
+            if ((ret = ff_dca_xll_parse(&s->xll, input, asset)) < 0) {
+                // Conceal XLL synchronization error
+                if (ret == AVERROR(EAGAIN)
+                    && (prev_packet & DCA_PACKET_XLL)
+                    && (s->packet & DCA_PACKET_CORE))
+                    s->packet |= DCA_PACKET_XLL | DCA_PACKET_RECOVERY;
+                else if (ret == AVERROR(ENOMEM) || (avctx->err_recognition & AV_EF_EXPLODE))
+                    return ret;
+            } else {
+                s->packet |= DCA_PACKET_XLL;
             }
         }
 
-        for (l = 0; l < s->audio_header.vq_start_subband[k]; l++) {
-            int m;
-            /*
-             * Inverse ADPCM if in prediction mode
-             */
-            if (s->dca_chan[k].prediction_mode[l]) {
-                int n;
-                if (s->predictor_history)
-                    subband_samples[l][0] += (ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][0] *
-                                              (int64_t)s->dca_chan[k].subband_samples_hist[l][3] +
-                                              ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][1] *
-                                              (int64_t)s->dca_chan[k].subband_samples_hist[l][2] +
-                                              ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][2] *
-                                              (int64_t)s->dca_chan[k].subband_samples_hist[l][1] +
-                                              ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][3] *
-                                              (int64_t)s->dca_chan[k].subband_samples_hist[l][0]) +
-                                              (1 << 12) >> 13;
-                for (m = 1; m < SAMPLES_PER_SUBBAND; m++) {
-                    int64_t sum = ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][0] *
-                                  (int64_t)subband_samples[l][m - 1];
-                    for (n = 2; n <= 4; n++)
-                        if (m >= n)
-                            sum += ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][n - 1] *
-                                   (int64_t)subband_samples[l][m - n];
-                        else if (s->predictor_history)
-                            sum += ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][n - 1] *
-                                   (int64_t)s->dca_chan[k].subband_samples_hist[l][m - n + 4];
-                    subband_samples[l][m] += (int32_t)(sum + (1 << 12) >> 13);
-                }
-            }
-
-        }
-        /* Backup predictor history for adpcm */
-        for (l = 0; l < DCA_SUBBANDS; l++)
-            AV_COPY128(s->dca_chan[k].subband_samples_hist[l], &subband_samples[l][4]);
-
-
-        /*
-         * Decode VQ encoded high frequencies
-         */
-        if (s->audio_header.subband_activity[k] > s->audio_header.vq_start_subband[k]) {
-            if (!s->debug_flag & 0x01) {
-                av_log(s->avctx, AV_LOG_DEBUG,
-                       "Stream with high frequencies VQ coding\n");
-                s->debug_flag |= 0x01;
+        // Parse LBR component in EXSS
+        if (asset && (asset->extension_mask & DCA_EXSS_LBR)) {
+            if ((ret = ff_dca_lbr_parse(&s->lbr, input, asset)) < 0) {
+                if (ret == AVERROR(ENOMEM) || (avctx->err_recognition & AV_EF_EXPLODE))
+                    return ret;
+            } else {
+                s->packet |= DCA_PACKET_LBR;
             }
-
-            s->dcadsp.decode_hf(subband_samples, s->dca_chan[k].high_freq_vq,
-                                ff_dca_high_freq_vq,
-                                subsubframe * SAMPLES_PER_SUBBAND,
-                                s->dca_chan[k].scale_factor,
-                                s->audio_header.vq_start_subband[k],
-                                s->audio_header.subband_activity[k]);
         }
-    }
 
-    /* Check for DSYNC after subsubframe */
-    if (s->aspf || subsubframe == s->subsubframes[s->current_subframe] - 1) {
-        if (get_bits(&s->gb, 16) != 0xFFFF) {
-            av_log(s->avctx, AV_LOG_ERROR, "Didn't get subframe DSYNC\n");
-            return AVERROR_INVALIDDATA;
-        }
+        // Parse core extensions in EXSS or backward compatible core sub-stream
+        if ((s->packet & DCA_PACKET_CORE)
+            && (ret = ff_dca_core_parse_exss(&s->core, input, asset)) < 0)
+            return ret;
     }
 
-    return 0;
-}
-
-static int dca_filter_channels(DCAContext *s, int block_index, int upsample)
-{
-    int k;
-
-    if (upsample) {
-        LOCAL_ALIGNED(32, float, samples, [DCA_SUBBANDS_X96K], [SAMPLES_PER_SUBBAND]);
+    // Filter the frame
+    if (s->packet & DCA_PACKET_LBR) {
+        if ((ret = ff_dca_lbr_filter_frame(&s->lbr, frame)) < 0)
+            return ret;
+    } else if (s->packet & DCA_PACKET_XLL) {
+        if (s->packet & DCA_PACKET_CORE) {
+            int x96_synth = -1;
+
+            // Enable X96 synthesis if needed
+            if (s->xll.chset[0].freq == 96000 && s->core.sample_rate == 48000)
+                x96_synth = 1;
+
+            if ((ret = ff_dca_core_filter_fixed(&s->core, x96_synth)) < 0)
+                return ret;
+
+            // Force lossy downmixed output on the first core frame filtered.
+            // This prevents audible clicks when seeking and matches what the
+            // reference decoder does when there are multiple channel sets.
+            if (!(prev_packet & DCA_PACKET_RESIDUAL) && s->xll.nreschsets > 0
+                && s->xll.nchsets > 1) {
+                av_log(avctx, AV_LOG_VERBOSE, "Forcing XLL recovery mode\n");
+                s->packet |= DCA_PACKET_RECOVERY;
+            }
 
-        if (!s->qmf64_table) {
-            s->qmf64_table = qmf64_precompute();
-            if (!s->qmf64_table)
-                return AVERROR(ENOMEM);
+            // Set 'residual ok' flag for the next frame
+            s->packet |= DCA_PACKET_RESIDUAL;
         }
 
-        /* 64 subbands QMF */
-        for (k = 0; k < s->audio_header.prim_channels; k++) {
-            int32_t (*subband_samples)[SAMPLES_PER_SUBBAND] =
-                     s->dca_chan[k].subband_samples[block_index];
-
-            s->fmt_conv.int32_to_float(samples[0], subband_samples[0],
-                                       DCA_SUBBANDS_X96K * SAMPLES_PER_SUBBAND);
-
-            if (s->channel_order_tab[k] >= 0)
-                qmf_64_subbands(s, k, samples,
-                                s->samples_chanptr[s->channel_order_tab[k]],
-                                /* Upsampling needs a factor 2 here. */
-                                M_SQRT2 / 32768.0);
+        if ((ret = ff_dca_xll_filter_frame(&s->xll, frame)) < 0) {
+            // Fall back to core unless hard error
+            if (!(s->packet & DCA_PACKET_CORE))
+                return ret;
+            if (ret != AVERROR_INVALIDDATA || (avctx->err_recognition & AV_EF_EXPLODE))
+                return ret;
+            if ((ret = ff_dca_core_filter_frame(&s->core, frame)) < 0)
+                return ret;
         }
+    } else if (s->packet & DCA_PACKET_CORE) {
+        if ((ret = ff_dca_core_filter_frame(&s->core, frame)) < 0)
+            return ret;
+        if (s->core.filter_mode & DCA_FILTER_MODE_FIXED)
+            s->packet |= DCA_PACKET_RESIDUAL;
     } else {
-        /* 32 subbands QMF */
-        LOCAL_ALIGNED(32, float, samples, [DCA_SUBBANDS], [SAMPLES_PER_SUBBAND]);
-
-        for (k = 0; k < s->audio_header.prim_channels; k++) {
-            int32_t (*subband_samples)[SAMPLES_PER_SUBBAND] =
-                     s->dca_chan[k].subband_samples[block_index];
-
-            s->fmt_conv.int32_to_float(samples[0], subband_samples[0],
-                                       DCA_SUBBANDS * SAMPLES_PER_SUBBAND);
-
-            if (s->channel_order_tab[k] >= 0)
-                qmf_32_subbands(s, k, samples,
-                                s->samples_chanptr[s->channel_order_tab[k]],
-                                M_SQRT1_2 / 32768.0);
-        }
-    }
-
-    /* Generate LFE samples for this subsubframe FIXME!!! */
-    if (s->lfe) {
-        float *samples = s->samples_chanptr[ff_dca_lfe_index[s->amode]];
-        lfe_interpolation_fir(s,
-                              s->lfe_data + 2 * s->lfe * (block_index + 4),
-                              samples);
-        if (upsample) {
-            unsigned i;
-            /* Should apply the filter in Table 6-11 when upsampling. For
-             * now, just duplicate. */
-            for (i = 511; i > 0; i--) {
-                samples[2 * i]     =
-                samples[2 * i + 1] = samples[i];
-            }
-            samples[1] = samples[0];
-        }
-    }
-
-    /* FIXME: This downmixing is probably broken with upsample.
-     * Probably totally broken also with XLL in general. */
-    /* Downmixing to Stereo */
-    if (s->audio_header.prim_channels + !!s->lfe > 2 &&
-        s->avctx->request_channel_layout == AV_CH_LAYOUT_STEREO) {
-        dca_downmix(s->samples_chanptr, s->amode, !!s->lfe, s->downmix_coef,
-                    s->channel_order_tab);
-    }
-
-    return 0;
-}
-
-static int dca_subframe_footer(DCAContext *s, int base_channel)
-{
-    int in, out, aux_data_count, aux_data_end, reserved;
-    uint32_t nsyncaux;
-
-    /*
-     * Unpack optional information
-     */
-
-    /* presumably optional information only appears in the core? */
-    if (!base_channel) {
-        if (s->timestamp)
-            skip_bits_long(&s->gb, 32);
-
-        if (s->aux_data) {
-            aux_data_count = get_bits(&s->gb, 6);
-
-            // align (32-bit)
-            skip_bits_long(&s->gb, (-get_bits_count(&s->gb)) & 31);
-
-            aux_data_end = 8 * aux_data_count + get_bits_count(&s->gb);
-
-            if ((nsyncaux = get_bits_long(&s->gb, 32)) != DCA_NSYNCAUX) {
-                av_log(s->avctx, AV_LOG_ERROR, "nSYNCAUX mismatch %#"PRIx32"\n",
-                       nsyncaux);
-                return AVERROR_INVALIDDATA;
-            }
-
-            if (get_bits1(&s->gb)) { // bAUXTimeStampFlag
-                avpriv_request_sample(s->avctx,
-                                      "Auxiliary Decode Time Stamp Flag");
-                // align (4-bit)
-                skip_bits(&s->gb, (-get_bits_count(&s->gb)) & 4);
-                // 44 bits: nMSByte (8), nMarker (4), nLSByte (28), nMarker (4)
-                skip_bits_long(&s->gb, 44);
-            }
-
-            if ((s->core_downmix = get_bits1(&s->gb))) {
-                int am = get_bits(&s->gb, 3);
-                switch (am) {
-                case 0:
-                    s->core_downmix_amode = DCA_MONO;
-                    break;
-                case 1:
-                    s->core_downmix_amode = DCA_STEREO;
-                    break;
-                case 2:
-                    s->core_downmix_amode = DCA_STEREO_TOTAL;
-                    break;
-                case 3:
-                    s->core_downmix_amode = DCA_3F;
-                    break;
-                case 4:
-                    s->core_downmix_amode = DCA_2F1R;
-                    break;
-                case 5:
-                    s->core_downmix_amode = DCA_2F2R;
-                    break;
-                case 6:
-                    s->core_downmix_amode = DCA_3F1R;
-                    break;
-                default:
-                    av_log(s->avctx, AV_LOG_ERROR,
-                           "Invalid mode %d for embedded downmix coefficients\n",
-                           am);
-                    return AVERROR_INVALIDDATA;
-                }
-                for (out = 0; out < ff_dca_channels[s->core_downmix_amode]; out++) {
-                    for (in = 0; in < s->audio_header.prim_channels + !!s->lfe; in++) {
-                        uint16_t tmp = get_bits(&s->gb, 9);
-                        if ((tmp & 0xFF) > 241) {
-                            av_log(s->avctx, AV_LOG_ERROR,
-                                   "Invalid downmix coefficient code %"PRIu16"\n",
-                                   tmp);
-                            return AVERROR_INVALIDDATA;
-                        }
-                        s->core_downmix_codes[in][out] = tmp;
-                    }
-                }
-            }
-
-            align_get_bits(&s->gb); // byte align
-            skip_bits(&s->gb, 16);  // nAUXCRC16
-
-            /*
-             * additional data (reserved, cf. ETSI TS 102 114 V1.4.1)
-             *
-             * Note: don't check for overreads, aux_data_count can't be trusted.
-             */
-            if ((reserved = (aux_data_end - get_bits_count(&s->gb))) > 0) {
-                avpriv_request_sample(s->avctx,
-                                      "Core auxiliary data reserved content");
-                skip_bits_long(&s->gb, reserved);
-            }
-        }
-
-        if (s->crc_present && s->dynrange)
-            get_bits(&s->gb, 16);
-    }
-
-    return 0;
-}
-
-/**
- * Decode a dca frame block
- *
- * @param s     pointer to the DCAContext
- */
-
-static int dca_decode_block(DCAContext *s, int base_channel, int block_index)
-{
-    int ret;
-
-    /* Sanity check */
-    if (s->current_subframe >= s->audio_header.subframes) {
-        av_log(s->avctx, AV_LOG_DEBUG, "check failed: %i>%i",
-               s->current_subframe, s->audio_header.subframes);
+        av_log(avctx, AV_LOG_ERROR, "No valid DCA sub-stream found\n");
+        if (s->core_only)
+            av_log(avctx, AV_LOG_WARNING, "Consider disabling 'core_only' option\n");
         return AVERROR_INVALIDDATA;
     }
 
-    if (!s->current_subsubframe) {
-        /* Read subframe header */
-        if ((ret = dca_subframe_header(s, base_channel, block_index)))
-            return ret;
-    }
-
-    /* Read subsubframe */
-    if ((ret = dca_subsubframe(s, base_channel, block_index)))
-        return ret;
-
-    /* Update state */
-    s->current_subsubframe++;
-    if (s->current_subsubframe >= s->subsubframes[s->current_subframe]) {
-        s->current_subsubframe = 0;
-        s->current_subframe++;
-    }
-    if (s->current_subframe >= s->audio_header.subframes) {
-        /* Read subframe footer */
-        if ((ret = dca_subframe_footer(s, base_channel)))
-            return ret;
-    }
-
-    return 0;
-}
+    *got_frame_ptr = 1;
 
-static float dca_dmix_code(unsigned code)
-{
-    int sign = (code >> 8) - 1;
-    code &= 0xff;
-    return ((ff_dca_dmixtable[code] ^ sign) - sign) * (1.0 / (1U << 15));
+    return avpkt->size;
 }
 
-static int scan_for_extensions(AVCodecContext *avctx)
+static av_cold void dcadec_flush(AVCodecContext *avctx)
 {
     DCAContext *s = avctx->priv_data;
-    int core_ss_end, ret = 0;
-
-    core_ss_end = FFMIN(s->frame_size, s->dca_buffer_size) * 8;
-
-    /* only scan for extensions if ext_descr was unknown or indicated a
-     * supported XCh extension */
-    if (s->core_ext_mask < 0 || s->core_ext_mask & DCA_EXT_XCH) {
-        /* if ext_descr was unknown, clear s->core_ext_mask so that the
-         * extensions scan can fill it up */
-        s->core_ext_mask = FFMAX(s->core_ext_mask, 0);
-
-        /* extensions start at 32-bit boundaries into bitstream */
-        skip_bits_long(&s->gb, (-get_bits_count(&s->gb)) & 31);
-
-        while (core_ss_end - get_bits_count(&s->gb) >= 32) {
-            uint32_t bits = get_bits_long(&s->gb, 32);
-            int i;
-
-            switch (bits) {
-            case DCA_SYNCWORD_XCH: {
-                int ext_amode, xch_fsize;
-
-                s->xch_base_channel = s->audio_header.prim_channels;
-
-                /* validate sync word using XCHFSIZE field */
-                xch_fsize = show_bits(&s->gb, 10);
-                if ((s->frame_size != (get_bits_count(&s->gb) >> 3) - 4 + xch_fsize) &&
-                    (s->frame_size != (get_bits_count(&s->gb) >> 3) - 4 + xch_fsize + 1))
-                    continue;
-
-                /* skip length-to-end-of-frame field for the moment */
-                skip_bits(&s->gb, 10);
-
-                s->core_ext_mask |= DCA_EXT_XCH;
-
-                /* extension amode(number of channels in extension) should be 1 */
-                /* AFAIK XCh is not used for more channels */
-                if ((ext_amode = get_bits(&s->gb, 4)) != 1) {
-                    av_log(avctx, AV_LOG_ERROR,
-                           "XCh extension amode %d not supported!\n",
-                           ext_amode);
-                    continue;
-                }
 
-                /* much like core primary audio coding header */
-                dca_parse_audio_coding_header(s, s->xch_base_channel);
+    ff_dca_core_flush(&s->core);
+    ff_dca_xll_flush(&s->xll);
+    ff_dca_lbr_flush(&s->lbr);
 
-                for (i = 0; i < (s->sample_blocks / 8); i++)
-                    if ((ret = dca_decode_block(s, s->xch_base_channel, i))) {
-                        av_log(avctx, AV_LOG_ERROR, "error decoding XCh extension\n");
-                        continue;
-                    }
-
-                s->xch_present = 1;
-                break;
-            }
-            case DCA_SYNCWORD_XXCH:
-                /* XXCh: extended channels */
-                /* usually found either in core or HD part in DTS-HD HRA streams,
-                 * but not in DTS-ES which contains XCh extensions instead */
-                s->core_ext_mask |= DCA_EXT_XXCH;
-                break;
-
-            case 0x1d95f262: {
-                int fsize96 = show_bits(&s->gb, 12) + 1;
-                if (s->frame_size != (get_bits_count(&s->gb) >> 3) - 4 + fsize96)
-                    continue;
-
-                av_log(avctx, AV_LOG_DEBUG, "X96 extension found at %d bits\n",
-                       get_bits_count(&s->gb));
-                skip_bits(&s->gb, 12);
-                av_log(avctx, AV_LOG_DEBUG, "FSIZE96 = %d bytes\n", fsize96);
-                av_log(avctx, AV_LOG_DEBUG, "REVNO = %d\n", get_bits(&s->gb, 4));
-
-                s->core_ext_mask |= DCA_EXT_X96;
-                break;
-            }
-            }
-
-            skip_bits_long(&s->gb, (-get_bits_count(&s->gb)) & 31);
-        }
-    } else {
-        /* no supported extensions, skip the rest of the core substream */
-        skip_bits_long(&s->gb, core_ss_end - get_bits_count(&s->gb));
-    }
-
-    if (s->core_ext_mask & DCA_EXT_X96)
-        s->profile = FF_PROFILE_DTS_96_24;
-    else if (s->core_ext_mask & (DCA_EXT_XCH | DCA_EXT_XXCH))
-        s->profile = FF_PROFILE_DTS_ES;
-
-    /* check for ExSS (HD part) */
-    if (s->dca_buffer_size - s->frame_size > 32 &&
-        get_bits_long(&s->gb, 32) == DCA_SYNCWORD_SUBSTREAM)
-        ff_dca_exss_parse_header(s);
-
-    return ret;
+    s->packet &= DCA_PACKET_MASK;
 }
 
-static int set_channel_layout(AVCodecContext *avctx, int channels, int num_core_channels)
+static av_cold int dcadec_close(AVCodecContext *avctx)
 {
     DCAContext *s = avctx->priv_data;
-    int i;
-
-    if (s->amode < 16) {
-        avctx->channel_layout = dca_core_channel_layout[s->amode];
-
-        if (s->audio_header.prim_channels + !!s->lfe > 2 &&
-            avctx->request_channel_layout == AV_CH_LAYOUT_STEREO) {
-            /*
-             * Neither the core's auxiliary data nor our default tables contain
-             * downmix coefficients for the additional channel coded in the XCh
-             * extension, so when we're doing a Stereo downmix, don't decode it.
-             */
-            s->xch_disable = 1;
-        }
 
-        if (s->xch_present && !s->xch_disable) {
-            avctx->channel_layout |= AV_CH_BACK_CENTER;
-            if (s->lfe) {
-                avctx->channel_layout |= AV_CH_LOW_FREQUENCY;
-                s->channel_order_tab = ff_dca_channel_reorder_lfe_xch[s->amode];
-            } else {
-                s->channel_order_tab = ff_dca_channel_reorder_nolfe_xch[s->amode];
-            }
-        } else {
-            channels       = num_core_channels + !!s->lfe;
-            s->xch_present = 0; /* disable further xch processing */
-            if (s->lfe) {
-                avctx->channel_layout |= AV_CH_LOW_FREQUENCY;
-                s->channel_order_tab = ff_dca_channel_reorder_lfe[s->amode];
-            } else
-                s->channel_order_tab = ff_dca_channel_reorder_nolfe[s->amode];
-        }
+    ff_dca_core_close(&s->core);
+    ff_dca_xll_close(&s->xll);
+    ff_dca_lbr_close(&s->lbr);
 
-        if (channels > !!s->lfe &&
-            s->channel_order_tab[channels - 1 - !!s->lfe] < 0)
-            return AVERROR_INVALIDDATA;
-
-        if (num_core_channels + !!s->lfe > 2 &&
-            avctx->request_channel_layout == AV_CH_LAYOUT_STEREO) {
-            channels              = 2;
-            s->output             = s->audio_header.prim_channels == 2 ? s->amode : DCA_STEREO;
-            avctx->channel_layout = AV_CH_LAYOUT_STEREO;
-
-            /* Stereo downmix coefficients
-             *
-             * The decoder can only downmix to 2-channel, so we need to ensure
-             * embedded downmix coefficients are actually targeting 2-channel.
-             */
-            if (s->core_downmix && (s->core_downmix_amode == DCA_STEREO ||
-                                    s->core_downmix_amode == DCA_STEREO_TOTAL)) {
-                for (i = 0; i < num_core_channels + !!s->lfe; i++) {
-                    /* Range checked earlier */
-                    s->downmix_coef[i][0] = dca_dmix_code(s->core_downmix_codes[i][0]);
-                    s->downmix_coef[i][1] = dca_dmix_code(s->core_downmix_codes[i][1]);
-                }
-                s->output = s->core_downmix_amode;
-            } else {
-                int am = s->amode & DCA_CHANNEL_MASK;
-                if (am >= FF_ARRAY_ELEMS(ff_dca_default_coeffs)) {
-                    av_log(s->avctx, AV_LOG_ERROR,
-                           "Invalid channel mode %d\n", am);
-                    return AVERROR_INVALIDDATA;
-                }
-                if (num_core_channels + !!s->lfe >
-                    FF_ARRAY_ELEMS(ff_dca_default_coeffs[0])) {
-                    avpriv_request_sample(s->avctx, "Downmixing %d channels",
-                                          s->audio_header.prim_channels + !!s->lfe);
-                    return AVERROR_PATCHWELCOME;
-                }
-                for (i = 0; i < num_core_channels + !!s->lfe; i++) {
-                    s->downmix_coef[i][0] = ff_dca_default_coeffs[am][i][0];
-                    s->downmix_coef[i][1] = ff_dca_default_coeffs[am][i][1];
-                }
-            }
-            ff_dlog(s->avctx, "Stereo downmix coeffs:\n");
-            for (i = 0; i < num_core_channels + !!s->lfe; i++) {
-                ff_dlog(s->avctx, "L, input channel %d = %f\n", i,
-                        s->downmix_coef[i][0]);
-                ff_dlog(s->avctx, "R, input channel %d = %f\n", i,
-                        s->downmix_coef[i][1]);
-            }
-            ff_dlog(s->avctx, "\n");
-        }
-    } else {
-        av_log(avctx, AV_LOG_ERROR, "Nonstandard configuration %d !\n", s->amode);
-        return AVERROR_INVALIDDATA;
-    }
+    av_freep(&s->buffer);
+    s->buffer_size = 0;
 
     return 0;
 }
 
-/**
- * Main frame decoding function
- * FIXME add arguments
- */
-static int dca_decode_frame(AVCodecContext *avctx, void *data,
-                            int *got_frame_ptr, AVPacket *avpkt)
+static av_cold int dcadec_init(AVCodecContext *avctx)
 {
-    AVFrame *frame     = data;
-    const uint8_t *buf = avpkt->data;
-    int buf_size       = avpkt->size;
-
-    int lfe_samples;
-    int num_core_channels = 0;
-    int i, ret;
-    float  **samples_flt;
     DCAContext *s = avctx->priv_data;
-    int channels, full_channels;
-    int upsample = 0;
-
-    s->exss_ext_mask = 0;
-    s->xch_present   = 0;
 
-    s->dca_buffer_size = ff_dca_convert_bitstream(buf, buf_size, s->dca_buffer,
-                                                  DCA_MAX_FRAME_SIZE + DCA_MAX_EXSS_HEADER_SIZE);
-    if (s->dca_buffer_size == AVERROR_INVALIDDATA) {
-        av_log(avctx, AV_LOG_ERROR, "Not a valid DCA frame\n");
-        return AVERROR_INVALIDDATA;
-    }
-
-    if ((ret = dca_parse_frame_header(s)) < 0) {
-        // seems like the frame is corrupt, try with the next one
-        return ret;
-    }
-    // set AVCodec values with parsed data
-    avctx->sample_rate = s->sample_rate;
-    avctx->bit_rate    = s->bit_rate;
-
-    s->profile = FF_PROFILE_DTS;
-
-    for (i = 0; i < (s->sample_blocks / SAMPLES_PER_SUBBAND); i++) {
-        if ((ret = dca_decode_block(s, 0, i))) {
-            av_log(avctx, AV_LOG_ERROR, "error decoding block\n");
-            return ret;
-        }
-    }
-
-    /* record number of core channels incase less than max channels are requested */
-    num_core_channels = s->audio_header.prim_channels;
-
-    if (s->ext_coding)
-        s->core_ext_mask = dca_ext_audio_descr_mask[s->ext_descr];
-    else
-        s->core_ext_mask = 0;
-
-    ret = scan_for_extensions(avctx);
-
-    avctx->profile = s->profile;
-
-    full_channels = channels = s->audio_header.prim_channels + !!s->lfe;
-
-    ret = set_channel_layout(avctx, channels, num_core_channels);
-    if (ret < 0)
-        return ret;
-    avctx->channels = channels;
-
-    /* get output buffer */
-    frame->nb_samples = 256 * (s->sample_blocks / SAMPLES_PER_SUBBAND);
-    if (s->exss_ext_mask & DCA_EXT_EXSS_XLL) {
-        int xll_nb_samples = s->xll_segments * s->xll_smpl_in_seg;
-        /* Check for invalid/unsupported conditions first */
-        if (s->xll_residual_channels > channels) {
-            av_log(s->avctx, AV_LOG_WARNING,
-                   "DCA: too many residual channels (%d, core channels %d). Disabling XLL\n",
-                   s->xll_residual_channels, channels);
-            s->exss_ext_mask &= ~DCA_EXT_EXSS_XLL;
-        } else if (xll_nb_samples != frame->nb_samples &&
-                   2 * frame->nb_samples != xll_nb_samples) {
-            av_log(s->avctx, AV_LOG_WARNING,
-                   "DCA: unsupported upsampling (%d XLL samples, %d core samples). Disabling XLL\n",
-                   xll_nb_samples, frame->nb_samples);
-            s->exss_ext_mask &= ~DCA_EXT_EXSS_XLL;
-        } else {
-            if (2 * frame->nb_samples == xll_nb_samples) {
-                av_log(s->avctx, AV_LOG_INFO,
-                       "XLL: upsampling core channels by a factor of 2\n");
-                upsample = 1;
-
-                frame->nb_samples = xll_nb_samples;
-                // FIXME: Is it good enough to copy from the first channel set?
-                avctx->sample_rate = s->xll_chsets[0].sampling_frequency;
-            }
-            /* If downmixing to stereo, don't decode additional channels.
-             * FIXME: Using the xch_disable flag for this doesn't seem right. */
-            if (!s->xch_disable)
-                avctx->channels += s->xll_channels - s->xll_residual_channels;
-        }
-    }
-
-    /* FIXME: This is an ugly hack, to just revert to the default
-     * layout if we have additional channels. Need to convert the XLL
-     * channel masks to libav channel_layout mask. */
-    if (av_get_channel_layout_nb_channels(avctx->channel_layout) != avctx->channels)
-        avctx->channel_layout = 0;
-
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return ret;
-    }
-    samples_flt = (float **) frame->extended_data;
-
-    /* allocate buffer for extra channels if downmixing */
-    if (avctx->channels < full_channels) {
-        ret = av_samples_get_buffer_size(NULL, full_channels - channels,
-                                         frame->nb_samples,
-                                         avctx->sample_fmt, 0);
-        if (ret < 0)
-            return ret;
-
-        av_fast_malloc(&s->extra_channels_buffer,
-                       &s->extra_channels_buffer_size, ret);
-        if (!s->extra_channels_buffer)
-            return AVERROR(ENOMEM);
-
-        ret = av_samples_fill_arrays((uint8_t **) s->extra_channels, NULL,
-                                     s->extra_channels_buffer,
-                                     full_channels - channels,
-                                     frame->nb_samples, avctx->sample_fmt, 0);
-        if (ret < 0)
-            return ret;
-    }
-
-    /* filter to get final output */
-    for (i = 0; i < (s->sample_blocks / SAMPLES_PER_SUBBAND); i++) {
-        int ch;
-        unsigned block = upsample ? 512 : 256;
-        for (ch = 0; ch < channels; ch++)
-            s->samples_chanptr[ch] = samples_flt[ch] + i * block;
-        for (; ch < full_channels; ch++)
-            s->samples_chanptr[ch] = s->extra_channels[ch - channels] + i * block;
-
-        dca_filter_channels(s, i, upsample);
-
-        /* If this was marked as a DTS-ES stream we need to subtract back- */
-        /* channel from SL & SR to remove matrixed back-channel signal */
-        if ((s->source_pcm_res & 1) && s->xch_present) {
-            float *back_chan = s->samples_chanptr[s->channel_order_tab[s->xch_base_channel]];
-            float *lt_chan   = s->samples_chanptr[s->channel_order_tab[s->xch_base_channel - 2]];
-            float *rt_chan   = s->samples_chanptr[s->channel_order_tab[s->xch_base_channel - 1]];
-            s->fdsp.vector_fmac_scalar(lt_chan, back_chan, -M_SQRT1_2, 256);
-            s->fdsp.vector_fmac_scalar(rt_chan, back_chan, -M_SQRT1_2, 256);
-        }
-    }
-
-    /* update lfe history */
-    lfe_samples = 2 * s->lfe * (s->sample_blocks / SAMPLES_PER_SUBBAND);
-    for (i = 0; i < 2 * s->lfe * 4; i++)
-        s->lfe_data[i] = s->lfe_data[i + lfe_samples];
-
-    if (s->exss_ext_mask & DCA_EXT_EXSS_XLL) {
-        ret = ff_dca_xll_decode_audio(s, frame);
-        if (ret < 0)
-            return ret;
-    }
-    /* AVMatrixEncoding
-     *
-     * DCA_STEREO_TOTAL (Lt/Rt) is equivalent to Dolby Surround */
-    ret = ff_side_data_update_matrix_encoding(frame,
-                                              (s->output & ~DCA_LFE) == DCA_STEREO_TOTAL ?
-                                              AV_MATRIX_ENCODING_DOLBY : AV_MATRIX_ENCODING_NONE);
-    if (ret < 0)
-        return ret;
-
-    *got_frame_ptr = 1;
-
-    return buf_size;
-}
+    s->avctx = avctx;       // Decoder init: hand the codec context to the top-level state...
+    s->core.avctx = avctx;  // ...and to each of the four embedded sub-contexts
+    s->exss.avctx = avctx;
+    s->xll.avctx = avctx;
+    s->lbr.avctx = avctx;
 
-/**
- * DCA initialization
- *
- * @param avctx     pointer to the AVCodecContext
- */
+    ff_dca_init_vlcs();
 
-static av_cold int dca_decode_init(AVCodecContext *avctx)
-{
-    DCAContext *s = avctx->priv_data;
+    if (ff_dca_core_init(&s->core) < 0)  // any negative result is reported as ENOMEM
+        return AVERROR(ENOMEM);
 
-    s->avctx = avctx;
-    dca_init_vlcs();
+    if (ff_dca_lbr_init(&s->lbr) < 0)    // same mapping: negative result -> ENOMEM
+        return AVERROR(ENOMEM);
 
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
-    ff_mdct_init(&s->imdct, 6, 1, 1.0);
-    ff_synth_filter_init(&s->synth);
     ff_dcadsp_init(&s->dcadsp);
-    ff_fmt_convert_init(&s->fmt_conv, avctx);
+    s->core.dcadsp = &s->dcadsp;  // core, XLL and LBR all point at the one DSP table set up above
+    s->xll.dcadsp = &s->dcadsp;
+    s->lbr.dcadsp = &s->dcadsp;
 
-    avctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
+    s->crctab = av_crc_get_table(AV_CRC_16_CCITT);  // table stored in s->crctab for later CRC checks
 
-    /* allow downmixing to stereo */
-    if (avctx->channels > 2 &&
-        avctx->request_channel_layout == AV_CH_LAYOUT_STEREO)
-        avctx->channels = 2;
+    switch (avctx->request_channel_layout & ~AV_CH_LAYOUT_NATIVE) {  // NATIVE bit is masked out before matching
+    case 0:
+        s->request_channel_layout = 0;  // no layout requested
+        break;
+    case AV_CH_LAYOUT_STEREO:
+    case AV_CH_LAYOUT_STEREO_DOWNMIX:   // both stereo requests map to the same DCA layout
+        s->request_channel_layout = DCA_SPEAKER_LAYOUT_STEREO;
+        break;
+    case AV_CH_LAYOUT_5POINT0:
+        s->request_channel_layout = DCA_SPEAKER_LAYOUT_5POINT0;
+        break;
+    case AV_CH_LAYOUT_5POINT1:
+        s->request_channel_layout = DCA_SPEAKER_LAYOUT_5POINT1;
+        break;
+    default:  // any other request: warn only; s->request_channel_layout is not assigned and init still succeeds
+        av_log(avctx, AV_LOG_WARNING, "Invalid request_channel_layout\n");
+        break;
+    }
 
     return 0;
 }
 
-static av_cold int dca_decode_end(AVCodecContext *avctx)
-{
-    DCAContext *s = avctx->priv_data;
-    ff_mdct_end(&s->imdct);
-    av_freep(&s->extra_channels_buffer);
-    av_freep(&s->xll_sample_buf);
-    av_freep(&s->qmf64_table);
-    return 0;
-}
+#define OFFSET(x) offsetof(DCAContext, x)  // byte offset of an option's backing field inside DCAContext
+#define PARAM AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_DECODING_PARAM  // NOTE(review): unparenthesized; only used below as a standalone initializer
 
-static const AVOption options[] = {
-    { "disable_xch", "disable decoding of the XCh extension", offsetof(DCAContext, xch_disable), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM },
-    { "disable_xll", "disable decoding of the XLL extension", offsetof(DCAContext, xll_disable), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM },
-    { NULL },
+static const AVOption dcadec_options[] = {  // private options of the decoder; a single boolean, default 0
+    { "core_only", "Decode core only without extensions", OFFSET(core_only), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, PARAM },
+    { NULL }  // end-of-table sentinel
 };
 
-static const AVClass dca_decoder_class = {
+static const AVClass dcadec_class = {  // AVClass that publishes dcadec_options; referenced by ff_dca_decoder.priv_class
     .class_name = "DCA decoder",
     .item_name  = av_default_item_name,
-    .option     = options,
+    .option     = dcadec_options,  // option table defined just above
     .version    = LIBAVUTIL_VERSION_INT,
+    .category   = AV_CLASS_CATEGORY_DECODER,  // newly set by this change; the previous class left it unset
 };
 
 AVCodec ff_dca_decoder = {
-    .name            = "dca",
-    .long_name       = NULL_IF_CONFIG_SMALL("DCA (DTS Coherent Acoustics)"),
-    .type            = AVMEDIA_TYPE_AUDIO,
-    .id              = AV_CODEC_ID_DTS,
-    .priv_data_size  = sizeof(DCAContext),
-    .init            = dca_decode_init,
-    .decode          = dca_decode_frame,
-    .close           = dca_decode_end,
-    .capabilities    = AV_CODEC_CAP_CHANNEL_CONF | AV_CODEC_CAP_DR1,
-    .sample_fmts     = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
-                                                       AV_SAMPLE_FMT_NONE },
-    .profiles        = NULL_IF_CONFIG_SMALL(ff_dca_profiles),
-    .priv_class      = &dca_decoder_class,
+    .name           = "dca",  // codec name is unchanged by the rewrite
+    .long_name      = NULL_IF_CONFIG_SMALL("DCA (DTS Coherent Acoustics)"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_DTS,
+    .priv_data_size = sizeof(DCAContext),  // avctx->priv_data is read as a DCAContext in dcadec_init()
+    .init           = dcadec_init,
+    .decode         = dcadec_decode_frame,
+    .close          = dcadec_close,
+    .flush          = dcadec_flush,        // new callback; the old decoder registered none
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_CHANNEL_CONF,  // same two flags as before, reordered
+    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S16P, AV_SAMPLE_FMT_S32P,  // S16P and S32P are new
+                                                      AV_SAMPLE_FMT_FLTP, AV_SAMPLE_FMT_NONE }, // list ends with AV_SAMPLE_FMT_NONE
+    .priv_class     = &dcadec_class,
+    .profiles       = NULL_IF_CONFIG_SMALL(ff_dca_profiles),
+    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,  // NOTE(review): presumably makes .close run when dcadec_init fails part-way - confirm in internal.h
 };
diff --git a/libavcodec/dcadec.h b/libavcodec/dcadec.h
new file mode 100644
index 0000000..456f3c4
--- /dev/null
+++ b/libavcodec/dcadec.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (C) 2016 foo86
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DCADEC_H
+#define AVCODEC_DCADEC_H
+
+#include "libavutil/common.h"
+#include "libavutil/crc.h"
+#include "libavutil/float_dsp.h"
+
+#include "avcodec.h"
+#include "get_bits.h"
+#include "dca.h"
+#include "dcadsp.h"
+#include "dca_core.h"
+#include "dca_exss.h"
+#include "dca_xll.h"
+#include "dca_lbr.h"
+
+#define DCA_PACKET_CORE         0x01
+#define DCA_PACKET_EXSS         0x02
+#define DCA_PACKET_XLL          0x04
+#define DCA_PACKET_LBR          0x08
+#define DCA_PACKET_MASK         0x0f    ///< OR of the four packet type bits above
+
+#define DCA_PACKET_RECOVERY     0x10    ///< Sync error recovery flag
+#define DCA_PACKET_RESIDUAL     0x20    ///< Core valid for residual decoding
+
+typedef struct DCAContext {
+    const AVClass   *class;       ///< class for AVOptions
+    AVCodecContext  *avctx;       ///< Owning codec context
+
+    DCACoreDecoder core;  ///< Core decoder context
+    DCAExssParser  exss;  ///< EXSS parser context
+    DCAXllDecoder  xll;   ///< XLL decoder context
+    DCALbrDecoder  lbr;   ///< LBR decoder context
+
+    DCADSPContext   dcadsp;       ///< DSP function table
+
+    const AVCRC     *crctab;      ///< CRC table read by ff_dca_check_crc()
+
+    uint8_t         *buffer;    ///< Packet buffer
+    unsigned int    buffer_size;  ///< Size field paired with buffer
+
+    int     packet; ///< Packet flags
+
+    int     request_channel_layout; ///< Converted from avctx.request_channel_layout
+    int     core_only;              ///< Core only decoding flag
+} DCAContext;
+
+int ff_dca_set_channel_layout(AVCodecContext *avctx, int *ch_remap, int dca_mask);
+
+void ff_dca_downmix_to_stereo_fixed(DCADSPContext *dcadsp, int32_t **samples,
+                                    int *coeff_l, int nsamples, int ch_mask);
+void ff_dca_downmix_to_stereo_float(AVFloatDSPContext *fdsp, float **samples,
+                                    int *coeff_l, int nsamples, int ch_mask);
+
+static inline int ff_dca_check_crc(AVCodecContext *avctx, GetBitContext *s,
+                                   int p1, int p2)  // p1/p2: start/end bit positions in s->buffer; returns 0 if OK or skipped, -1 on failure
+{
+    DCAContext *dca = avctx->priv_data;
+
+    if (!(avctx->err_recognition & (AV_EF_CRCCHECK | AV_EF_CAREFUL)))  // neither flag set: skip the check
+        return 0;
+    if (((p1 | p2) & 7) || p1 < 0 || p2 > s->size_in_bits || p2 - p1 < 16)  // need byte-aligned, in-range positions at least 16 bits apart
+        return -1;
+    if (av_crc(dca->crctab, 0xffff, s->buffer + p1 / 8, (p2 - p1) / 8))  // seed 0xffff; a nonzero result over the byte range is a mismatch
+        return -1;
+    return 0;
+}
+
+static inline int ff_dca_seek_bits(GetBitContext *s, int p)  // move the read position to bit p; 0 on success, -1 if rejected
+{
+    if (p < s->index || p > s->size_in_bits)  // no backward seeks, and never past the end of the buffer
+        return -1;
+    s->index = p;
+    return 0;
+}
+
+#endif
diff --git a/libavcodec/dcadsp.c b/libavcodec/dcadsp.c
index beec200..1cd2e4e 100644
--- a/libavcodec/dcadsp.c
+++ b/libavcodec/dcadsp.c
@@ -1,134 +1,490 @@
 /*
- * Copyright (c) 2004 Gildas Bazin
- * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
+ * Copyright (C) 2016 foo86
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "config.h"
-
-#include "libavutil/attributes.h"
-#include "libavutil/intreadwrite.h"
+#include "libavutil/mem.h"
 
 #include "dcadsp.h"
 #include "dcamath.h"
 
-static void decode_hf_c(int32_t dst[DCA_SUBBANDS][SAMPLES_PER_SUBBAND],
-                        const int32_t vq_num[DCA_SUBBANDS],
-                        const int8_t hf_vq[1024][32], intptr_t vq_offset,
-                        int32_t scale[DCA_SUBBANDS][2],
-                        intptr_t start, intptr_t end)
+static void decode_hf_c(int32_t **dst,
+                        const int32_t *vq_index,
+                        const int8_t hf_vq[1024][32],
+                        int32_t scale_factors[32][2],
+                        ptrdiff_t sb_start, ptrdiff_t sb_end,
+                        ptrdiff_t ofs, ptrdiff_t len)
+{   // For each subband i in [sb_start, sb_end): write len scaled VQ entries to dst[i][ofs .. ofs+len-1]
+    int i, j;
+
+    for (i = sb_start; i < sb_end; i++) {
+        const int8_t *coeff = hf_vq[vq_index[i]];  // codebook row selected for this subband
+        int32_t scale = scale_factors[i][0];       // only element [0] of the pair is used here
+        for (j = 0; j < len; j++)
+            dst[i][j + ofs] = clip23(coeff[j] * scale + (1 << 3) >> 4);  // (product + 8) >> 4, then clip23() (presumably a 24-bit clip - see dcamath.h)
+    }
+}
+
+static void decode_joint_c(int32_t **dst, int32_t **src,
+                           const int32_t *scale_factors,
+                           ptrdiff_t sb_start, ptrdiff_t sb_end,
+                           ptrdiff_t ofs, ptrdiff_t len)
+{   // For each subband i in [sb_start, sb_end): dst[i] = src[i] scaled by scale_factors[i], over [ofs, ofs+len)
+    int i, j;
+
+    for (i = sb_start; i < sb_end; i++) {
+        int32_t scale = scale_factors[i];  // one factor per subband
+        for (j = 0; j < len; j++)
+            dst[i][j + ofs] = clip23(mul17(src[i][j + ofs], scale));  // mul17()/clip23() come from dcamath.h; presumably 17-bit fixed-point multiply - confirm
+    }
+}
+
+static void lfe_fir_float_c(float *pcm_samples, int32_t *lfe_samples,
+                            const float *filter_coeff, ptrdiff_t npcmblocks,
+                            int dec_select)
+{
+    // Select decimation factor
+    int factor = 64 << dec_select;   // outputs per LFE sample: 64 (dec_select 0) or 128 (dec_select 1)
+    int ncoeffs = 8 >> dec_select;   // taps per output: 8 or 4
+    int nlfesamples = npcmblocks >> (dec_select + 1);  // LFE samples consumed by this call
+    int i, j, k;
+
+    for (i = 0; i < nlfesamples; i++) {
+        // One decimated sample generates 64 or 128 interpolated ones
+        for (j = 0; j < factor / 2; j++) {
+            float a = 0;
+            float b = 0;
+
+            for (k = 0; k < ncoeffs; k++) {  // lfe_samples[-k]: reads the current sample and ncoeffs-1 earlier ones
+                a += filter_coeff[      j * ncoeffs + k] * lfe_samples[-k];
+                b += filter_coeff[255 - j * ncoeffs - k] * lfe_samples[-k];  // same taps mirrored from index 255 downwards
+            }
+
+            pcm_samples[             j] = a;  // first half of the output group
+            pcm_samples[factor / 2 + j] = b;  // second half of the output group
+        }
+
+        lfe_samples++;
+        pcm_samples += factor;
+    }
+}
+
+static void lfe_fir0_float_c(float *pcm_samples, int32_t *lfe_samples,
+                             const float *filter_coeff, ptrdiff_t npcmblocks)
+{
+    lfe_fir_float_c(pcm_samples, lfe_samples, filter_coeff, npcmblocks, 0);  // dec_select 0: 64 outputs per LFE sample, 8 taps
+}
+
+static void lfe_fir1_float_c(float *pcm_samples, int32_t *lfe_samples,
+                             const float *filter_coeff, ptrdiff_t npcmblocks)
+{
+    lfe_fir_float_c(pcm_samples, lfe_samples, filter_coeff, npcmblocks, 1);  // dec_select 1: 128 outputs per LFE sample, 4 taps
+}
+
+static void lfe_x96_float_c(float *dst, const float *src,
+                            float *hist, ptrdiff_t len)
+{   // Doubles the sample count: each of the len inputs yields two outputs blended with the previous input
+    float prev = *hist;  // last input sample from the previous call
+    int i;
+
+    for (i = 0; i < len; i++) {
+        float a = 0.25f * src[i] + 0.75f * prev;  // output closer to the previous sample
+        float b = 0.75f * src[i] + 0.25f * prev;  // output closer to the current sample
+        prev = src[i];
+        *dst++ = a;
+        *dst++ = b;
+    }
+
+    *hist = prev;  // carried over to the next call
+}
+
+static void sub_qmf32_float_c(SynthFilterContext *synth,
+                              FFTContext *imdct,
+                              float *pcm_samples,
+                              int32_t **subband_samples_lo,
+                              int32_t **subband_samples_hi,  // not referenced by this 32-band variant
+                              float *hist1, int *offset, float *hist2,
+                              const float *filter_coeff, ptrdiff_t npcmblocks,
+                              float scale)
 {
+    LOCAL_ALIGNED_32(float, input, [32]);  // one input sample per subband for a single filter call
     int i, j;
 
-    for (j = start; j < end; j++) {
-        const int8_t *ptr = &hf_vq[vq_num[j]][vq_offset];
-        for (i = 0; i < 8; i++)
-            dst[j][i] = ptr[i] * scale[j][0] + 8 >> 4;
+    for (j = 0; j < npcmblocks; j++) {
+        // Load in one sample from each subband
+        for (i = 0; i < 32; i++) {
+            if ((i - 1) & 2)  // true when i % 4 is 0 or 3: those subbands are negated
+                input[i] = -subband_samples_lo[i][j];
+            else
+                input[i] =  subband_samples_lo[i][j];
+        }
+
+        // One subband sample generates 32 interpolated ones
+        synth->synth_filter_float(imdct, hist1, offset,
+                                  hist2, filter_coeff,
+                                  pcm_samples, input, scale);
+        pcm_samples += 32;
     }
 }
 
-static inline void dca_lfe_fir(float *out, const float *in, const float *coefs,
-                               int decifactor)
+static void sub_qmf64_float_c(SynthFilterContext *synth,
+                              FFTContext *imdct,
+                              float *pcm_samples,
+                              int32_t **subband_samples_lo,
+                              int32_t **subband_samples_hi,  // may be NULL: inputs 32..63 are then zero
+                              float *hist1, int *offset, float *hist2,
+                              const float *filter_coeff, ptrdiff_t npcmblocks,
+                              float scale)
 {
-    float *out2    = out + 2 * decifactor - 1;
-    int num_coeffs = 256 / decifactor;
-    int j, k;
+    LOCAL_ALIGNED_32(float, input, [64]);  // one input sample per subband for a single filter call
+    int i, j;
 
-    /* One decimated sample generates 2*decifactor interpolated ones */
-    for (k = 0; k < decifactor; k++) {
-        float v0 = 0.0;
-        float v1 = 0.0;
-        for (j = 0; j < num_coeffs; j++, coefs++) {
-            v0 += in[-j]                 * *coefs;
-            v1 += in[j + 1 - num_coeffs] * *coefs;
+    if (!subband_samples_hi)
+        memset(&input[32], 0, sizeof(input[0]) * 32);  // cleared once; the loop below never writes input[32..63] in this case
+
+    for (j = 0; j < npcmblocks; j++) {
+        // Load in one sample from each subband
+        if (subband_samples_hi) {
+            // Full 64 subbands, first 32 are residual coded
+            for (i =  0; i < 32; i++) {
+                if ((i - 1) & 2)  // true when i % 4 is 0 or 3: the summed sample is negated
+                    input[i] = -subband_samples_lo[i][j] - subband_samples_hi[i][j];
+                else
+                    input[i] =  subband_samples_lo[i][j] + subband_samples_hi[i][j];
+            }
+            for (i = 32; i < 64; i++) {
+                if ((i - 1) & 2)  // same sign pattern for the upper 32 subbands
+                    input[i] = -subband_samples_hi[i][j];
+                else
+                    input[i] =  subband_samples_hi[i][j];
+            }
+        } else {
+            // Only first 32 subbands
+            for (i =  0; i < 32; i++) {
+                if ((i - 1) & 2)
+                    input[i] = -subband_samples_lo[i][j];
+                else
+                    input[i] =  subband_samples_lo[i][j];
+            }
         }
-        *out++  = v0;
-        *out2-- = v1;
+
+        // One subband sample generates 64 interpolated ones
+        synth->synth_filter_float_64(imdct, hist1, offset,
+                                     hist2, filter_coeff,
+                                     pcm_samples, input, scale);
+        pcm_samples += 64;
     }
 }
 
-static void dca_qmf_32_subbands(float samples_in[DCA_SUBBANDS][SAMPLES_PER_SUBBAND], int sb_act,
-                                SynthFilterContext *synth, FFTContext *imdct,
-                                float synth_buf_ptr[512],
-                                int *synth_buf_offset, float synth_buf2[32],
-                                const float window[512], float *samples_out,
-                                float raXin[32], float scale)
+static void lfe_fir_fixed_c(int32_t *pcm_samples, int32_t *lfe_samples,
+                            const int32_t *filter_coeff, ptrdiff_t npcmblocks)
 {
+    // Fixed factor (no dec_select here): one LFE sample per two PCM blocks
+    int nlfesamples = npcmblocks >> 1;
+    int i, j, k;
+
+    for (i = 0; i < nlfesamples; i++) {
+        // One decimated sample generates 64 interpolated ones
+        for (j = 0; j < 32; j++) {
+            int64_t a = 0;  // 64-bit accumulators for the products below
+            int64_t b = 0;
+
+            for (k = 0; k < 8; k++) {  // lfe_samples[-k]: reads the current sample and the 7 before it
+                a += (int64_t)filter_coeff[      j * 8 + k] * lfe_samples[-k];
+                b += (int64_t)filter_coeff[255 - j * 8 - k] * lfe_samples[-k];  // same taps mirrored from index 255 downwards
+            }
+
+            pcm_samples[     j] = clip23(norm23(a));  // norm23()/clip23() from dcamath.h; presumably 23-bit rescale then clip - confirm
+            pcm_samples[32 + j] = clip23(norm23(b));
+        }
+
+        lfe_samples++;
+        pcm_samples += 64;
+    }
+}
+
+static void lfe_x96_fixed_c(int32_t *dst, const int32_t *src,
+                            int32_t *hist, ptrdiff_t len)
+{   // Fixed-point counterpart of lfe_x96_float_c: two outputs per input sample
+    int32_t prev = *hist;  // last input sample from the previous call
     int i;
-    int subindex;
-
-    for (i = sb_act; i < 32; i++)
-        raXin[i] = 0.0;
-
-    /* Reconstructed channel sample index */
-    for (subindex = 0; subindex < 8; subindex++) {
-        /* Load in one sample from each subband and clear inactive subbands */
-        for (i = 0; i < sb_act; i++) {
-            unsigned sign = (i - 1) & 2;
-            uint32_t v    = AV_RN32A(&samples_in[i][subindex]) ^ sign << 30;
-            AV_WN32A(&raXin[i], v);
+
+    for (i = 0; i < len; i++) {  // the two weights sum to 1 << 23 (roughly 0.25 and 0.75)
+        int64_t a = INT64_C(2097471) * src[i] + INT64_C(6291137) * prev;
+        int64_t b = INT64_C(6291137) * src[i] + INT64_C(2097471) * prev;
+        prev = src[i];
+        *dst++ = clip23(norm23(a));
+        *dst++ = clip23(norm23(b));
+    }
+
+    *hist = prev;  // carried over to the next call
+}
+
+static void sub_qmf32_fixed_c(SynthFilterContext *synth,
+                              DCADCTContext *imdct,
+                              int32_t *pcm_samples,
+                              int32_t **subband_samples_lo,
+                              int32_t **subband_samples_hi,  // not referenced by this 32-band variant
+                              int32_t *hist1, int *offset, int32_t *hist2,
+                              const int32_t *filter_coeff, ptrdiff_t npcmblocks)
+{
+    LOCAL_ALIGNED_32(int32_t, input, [32]);  // one input sample per subband for a single filter call
+    int i, j;
+
+    for (j = 0; j < npcmblocks; j++) {
+        // Load in one sample from each subband
+        for (i = 0; i < 32; i++)  // copied as-is: unlike the float variant, no sign flip is applied here
+            input[i] = subband_samples_lo[i][j];
+
+        // One subband sample generates 32 interpolated ones
+        synth->synth_filter_fixed(imdct, hist1, offset,
+                                  hist2, filter_coeff,
+                                  pcm_samples, input);
+        pcm_samples += 32;
+    }
+}
+
+static void sub_qmf64_fixed_c(SynthFilterContext *synth,
+                              DCADCTContext *imdct,
+                              int32_t *pcm_samples,
+                              int32_t **subband_samples_lo,
+                              int32_t **subband_samples_hi,  // may be NULL: inputs 32..63 are then zero
+                              int32_t *hist1, int *offset, int32_t *hist2,
+                              const int32_t *filter_coeff, ptrdiff_t npcmblocks)
+{
+    LOCAL_ALIGNED_32(int32_t, input, [64]);  // one input sample per subband for a single filter call
+    int i, j;
+
+    if (!subband_samples_hi)
+        memset(&input[32], 0, sizeof(input[0]) * 32);  // cleared once; the loop below never writes input[32..63] in this case
+
+    for (j = 0; j < npcmblocks; j++) {
+        // Load in one sample from each subband
+        if (subband_samples_hi) {
+            // Full 64 subbands, first 32 are residual coded
+            for (i =  0; i < 32; i++)
+                input[i] = subband_samples_lo[i][j] + subband_samples_hi[i][j];  // plain int32_t addition, no clipping
+            for (i = 32; i < 64; i++)
+                input[i] = subband_samples_hi[i][j];
+        } else {
+            // Only first 32 subbands
+            for (i =  0; i < 32; i++)
+                input[i] = subband_samples_lo[i][j];
         }
 
-        synth->synth_filter_float(imdct, synth_buf_ptr, synth_buf_offset,
-                                  synth_buf2, window, samples_out, raXin,
-                                  scale);
-        samples_out += 32;
+        // One subband sample generates 64 interpolated ones
+        synth->synth_filter_fixed_64(imdct, hist1, offset,
+                                     hist2, filter_coeff,
+                                     pcm_samples, input);
+        pcm_samples += 64;
+    }
+}
+
+static void decor_c(int32_t *dst, const int32_t *src, int coeff, ptrdiff_t len)
+{   // Adds a scaled copy of src to dst, element by element, over len samples
+    int i;
+
+    for (i = 0; i < len; i++)
+        dst[i] += src[i] * coeff + (1 << 2) >> 3;  // parsed as (src * coeff + 4) >> 3: rounds before dropping 3 bits
+}
+
+static void dmix_sub_xch_c(int32_t *dst1, int32_t *dst2,
+                           const int32_t *src, ptrdiff_t len)
+{   // Subtracts the same scaled src sample from both dst1 and dst2, over len samples
+    int i;
+
+    for (i = 0; i < len; i++) {
+        int32_t cs = mul23(src[i], 5931520 /* M_SQRT1_2 * (1 << 23) */);
+        dst1[i] -= cs;  // cs is computed once and reused for both outputs
+        dst2[i] -= cs;
     }
 }
 
-static void dequantize_c(int32_t *samples, uint32_t step_size, uint32_t scale)
+static void dmix_sub_c(int32_t *dst, const int32_t *src, int coeff, ptrdiff_t len)
+{   // dst[i] -= src[i] scaled by coeff, over len samples
+    int i;
+
+    for (i = 0; i < len; i++)
+        dst[i] -= mul15(src[i], coeff);  // mul15() from dcamath.h; presumably a 15-bit fixed-point multiply - confirm
+}
+
+static void dmix_add_c(int32_t *dst, const int32_t *src, int coeff, ptrdiff_t len)
+{   // dst[i] += src[i] scaled by coeff, over len samples (additive twin of dmix_sub_c)
+    int i;
+
+    for (i = 0; i < len; i++)
+        dst[i] += mul15(src[i], coeff);  // same mul15() scaling as dmix_sub_c
+}
+
+static void dmix_scale_c(int32_t *dst, int scale, ptrdiff_t len)
+{   // Scales dst in place by scale, over len samples
+    int i;
+
+    for (i = 0; i < len; i++)
+        dst[i] = mul15(dst[i], scale);  // uses mul15(), like dmix_add_c and dmix_sub_c
+}
+
+static void dmix_scale_inv_c(int32_t *dst, int scale_inv, ptrdiff_t len)
+{   // Scales dst in place by scale_inv, over len samples
+    int i;
+
+    for (i = 0; i < len; i++)
+        dst[i] = mul16(dst[i], scale_inv);  // mul16() here, not the mul15() used by dmix_scale_c
+}
+
+static void filter0(int32_t *dst, const int32_t *src, int32_t coeff, ptrdiff_t len)
+{   // Helper for assemble_freq_bands_c: dst[i] -= src[i] scaled by coeff
+    int i;
+
+    for (i = 0; i < len; i++)
+        dst[i] -= mul22(src[i], coeff);  // differs from filter1 only in using mul22()
+}
+
+static void filter1(int32_t *dst, const int32_t *src, int32_t coeff, ptrdiff_t len)
 {
-    int64_t step = (int64_t)step_size * scale;
-    int shift, i;
-    int32_t step_scale;
+    int i;  // helper for assemble_freq_bands_c: dst[i] -= src[i] scaled by coeff
 
-    if (step > (1 << 23))
-        shift = av_log2(step >> 23) + 1;
-    else
-        shift = 0;
-    step_scale = (int32_t)(step >> shift);
+    for (i = 0; i < len; i++)
+        dst[i] -= mul23(src[i], coeff);  // differs from filter0 only in using mul23()
+}
 
-    for (i = 0; i < SAMPLES_PER_SUBBAND; i++)
-        samples[i] = dca_clip23(dca_norm((int64_t)samples[i] * step_scale, 22 - shift));
+static void assemble_freq_bands_c(int32_t *dst, int32_t *src0, int32_t *src1,
+                                  const int32_t *coeff, ptrdiff_t len)
+{   // Filters src0/src1 against each other in place, then interleaves them into 2 * len dst samples
+    int i;
+
+    filter0(src0, src1, coeff[0], len);  // four alternating passes, each updating one buffer from the other
+    filter0(src1, src0, coeff[1], len);
+    filter0(src0, src1, coeff[2], len);
+    filter0(src1, src0, coeff[3], len);
+
+    for (i = 0; i < 8; i++, src0--) {  // src0 steps back one sample per round, so samples before the initial src0 are read and modified
+        filter1(src0, src1, coeff[i +  4], len);
+        filter1(src1, src0, coeff[i + 12], len);
+        filter1(src0, src1, coeff[i +  4], len);  // first and third pass use the same coefficient
+    }
+
+    for (i = 0; i < len; i++) {  // src0 is now 8 below its initial value; pre-increment starts at initial - 7
+        *dst++ = *src1++;
+        *dst++ = *++src0;
+    }
 }
 
-static void dca_lfe_fir0_c(float *out, const float *in, const float *coefs)
+static void lbr_bank_c(float output[32][4], float **input,
+                       const float *coeff, ptrdiff_t ofs, ptrdiff_t len)
 {
-    dca_lfe_fir(out, in, coefs, 32);
+    float SW0 = coeff[0];  // coeff[0..3]: window weights applied to the 8 input samples
+    float SW1 = coeff[1];
+    float SW2 = coeff[2];
+    float SW3 = coeff[3];
+
+    float C1  = coeff[4];  // coeff[4..7]: weights combining a..d into the 4 outputs
+    float C2  = coeff[5];
+    float C3  = coeff[6];
+    float C4  = coeff[7];
+
+    float AL1 = coeff[8];  // coeff[8..9]: weights used by the aliasing cancellation loop
+    float AL2 = coeff[9];
+
+    int i;
+
+    // Short window and 8 point forward MDCT
+    for (i = 0; i < len; i++) {  // output has 32 rows, so len is presumably <= 32 - confirm with callers
+        float *src = input[i] + ofs;  // reads src[-4] .. src[3] around this position
+
+        float a = src[-4] * SW0 - src[-1] * SW3;
+        float b = src[-3] * SW1 - src[-2] * SW2;
+        float c = src[ 2] * SW1 + src[ 1] * SW2;
+        float d = src[ 3] * SW0 + src[ 0] * SW3;
+
+        output[i][0] = C1 * b - C2 * c + C4 * a - C3 * d;
+        output[i][1] = C1 * d - C2 * a - C4 * b - C3 * c;
+        output[i][2] = C3 * b + C2 * d - C4 * c + C1 * a;
+        output[i][3] = C3 * a - C2 * b + C4 * d - C1 * c;
+    }
+
+    // Aliasing cancellation for high frequencies
+    for (i = 12; i < len - 1; i++) {  // only rows 12 and up; each pass couples row i with row i+1
+        float a = output[i  ][3] * AL1;
+        float b = output[i+1][0] * AL1;
+        output[i  ][3] += b - a;
+        output[i+1][0] -= b + a;
+        a = output[i  ][2] * AL2;
+        b = output[i+1][1] * AL2;
+        output[i  ][2] += b - a;
+        output[i+1][1] -= b + a;
+    }
 }
 
-static void dca_lfe_fir1_c(float *out, const float *in, const float *coefs)
+static void lfe_iir_c(float *output, const float *input,
+                      const float iir[5][4], float hist[5][2],
+                      ptrdiff_t factor)
 {
-    dca_lfe_fir(out, in, coefs, 64);
+    float res, tmp;  // res: value passed from one section to the next; tmp: a section's new history value
+    int i, j, k;
+
+    for (i = 0; i < 64; i++) {  // always consumes 64 inputs and writes 64 * factor outputs
+        res = *input++;
+
+        for (j = 0; j < factor; j++) {  // the input feeds the first output; the remaining factor-1 are fed zero
+            for (k = 0; k < 5; k++) {   // five sections in series, each with two history values
+                tmp = hist[k][0] * iir[k][0] + hist[k][1] * iir[k][1] + res;
+                res = hist[k][0] * iir[k][2] + hist[k][1] * iir[k][3] + tmp;
+
+                hist[k][0] = hist[k][1];  // shift the section's history by one
+                hist[k][1] = tmp;
+            }
+
+            *output++ = res;
+            res = 0;
+        }
+    }
 }
 
 av_cold void ff_dcadsp_init(DCADSPContext *s)
 {
-    s->lfe_fir[0]      = dca_lfe_fir0_c;
-    s->lfe_fir[1]      = dca_lfe_fir1_c;
-    s->qmf_32_subbands = dca_qmf_32_subbands;
-    s->decode_hf       = decode_hf_c;
-    s->dequantize      = dequantize_c;
+    s->decode_hf     = decode_hf_c;     // fill every slot with the C implementation from this file
+    s->decode_joint  = decode_joint_c;
+
+    s->lfe_fir_float[0] = lfe_fir0_float_c;   // index matches dec_select: [0] = 64 outputs per LFE sample
+    s->lfe_fir_float[1] = lfe_fir1_float_c;   // [1] = 128 outputs per LFE sample
+    s->lfe_x96_float    = lfe_x96_float_c;
+    s->sub_qmf_float[0] = sub_qmf32_float_c;  // [0] = 32-subband synthesis
+    s->sub_qmf_float[1] = sub_qmf64_float_c;  // [1] = 64-subband synthesis
+
+    s->lfe_fir_fixed    = lfe_fir_fixed_c;    // single fixed-point LFE FIR, no per-dec_select table
+    s->lfe_x96_fixed    = lfe_x96_fixed_c;
+    s->sub_qmf_fixed[0] = sub_qmf32_fixed_c;  // same [0]/[1] = 32/64-subband split as the float table
+    s->sub_qmf_fixed[1] = sub_qmf64_fixed_c;
+
+    s->decor   = decor_c;
+
+    s->dmix_sub_xch   = dmix_sub_xch_c;
+    s->dmix_sub       = dmix_sub_c;
+    s->dmix_add       = dmix_add_c;
+    s->dmix_scale     = dmix_scale_c;
+    s->dmix_scale_inv = dmix_scale_inv_c;
+
+    s->assemble_freq_bands = assemble_freq_bands_c;
+
+    s->lbr_bank = lbr_bank_c;
+    s->lfe_iir = lfe_iir_c;  // last C default; the x86 init below runs after all of these are set
 
-    if (ARCH_AARCH64)
-        ff_dcadsp_init_aarch64(s);
-    if (ARCH_ARM)
-        ff_dcadsp_init_arm(s);
     if (ARCH_X86)
         ff_dcadsp_init_x86(s);
 }
diff --git a/libavcodec/dcadsp.h b/libavcodec/dcadsp.h
index 9ea89ea..8f2f467 100644
--- a/libavcodec/dcadsp.h
+++ b/libavcodec/dcadsp.h
@@ -1,51 +1,99 @@
 /*
- * This file is part of Libav.
+ * Copyright (C) 2016 foo86
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_DCADSP_H
 #define AVCODEC_DCADSP_H
 
-#include "avfft.h"
+#include "libavutil/common.h"
+
+#include "fft.h"
+#include "dcadct.h"
 #include "synth_filter.h"
 
-#define DCA_SUBBANDS_X96K  64
-#define DCA_SUBBANDS       32
-#define SAMPLES_PER_SUBBAND 8 // number of samples per subband per subsubframe
+typedef struct DCADSPContext {
+    void (*decode_hf)(int32_t **dst,
+                      const int32_t *vq_index,
+                      const int8_t hf_vq[1024][32],
+                      int32_t scale_factors[32][2],
+                      ptrdiff_t sb_start, ptrdiff_t sb_end,
+                      ptrdiff_t ofs, ptrdiff_t len);
 
+    void (*decode_joint)(int32_t **dst, int32_t **src,
+                         const int32_t *scale_factors,
+                         ptrdiff_t sb_start, ptrdiff_t sb_end,
+                         ptrdiff_t ofs, ptrdiff_t len);
 
-typedef struct DCADSPContext {
-    void (*lfe_fir[2])(float *out, const float *in, const float *coefs);
-    void (*qmf_32_subbands)(float samples_in[DCA_SUBBANDS][SAMPLES_PER_SUBBAND], int sb_act,
-                            SynthFilterContext *synth, FFTContext *imdct,
-                            float synth_buf_ptr[512],
-                            int *synth_buf_offset, float synth_buf2[32],
-                            const float window[512], float *samples_out,
-                            float raXin[32], float scale);
-    void (*decode_hf)(int32_t dst[DCA_SUBBANDS][SAMPLES_PER_SUBBAND],
-                      const int32_t vq_num[DCA_SUBBANDS],
-                      const int8_t hf_vq[1024][32], intptr_t vq_offset,
-                      int32_t scale[DCA_SUBBANDS][2],
-                      intptr_t start, intptr_t end);
-    void (*dequantize)(int32_t *samples, uint32_t step_size, uint32_t scale);
+    void (*lfe_fir_float[2])(float *pcm_samples, int32_t *lfe_samples,
+                             const float *filter_coeff, ptrdiff_t npcmblocks);
+
+    void (*lfe_x96_float)(float *dst, const float *src,
+                          float *hist, ptrdiff_t len);
+
+    void (*sub_qmf_float[2])(SynthFilterContext *synth,
+                             FFTContext *imdct,
+                             float *pcm_samples,
+                             int32_t **subband_samples_lo,
+                             int32_t **subband_samples_hi,
+                             float *hist1, int *offset, float *hist2,
+                             const float *filter_coeff, ptrdiff_t npcmblocks,
+                             float scale);
+
+    void (*lfe_fir_fixed)(int32_t *pcm_samples, int32_t *lfe_samples,
+                          const int32_t *filter_coeff, ptrdiff_t npcmblocks);
+
+    void (*lfe_x96_fixed)(int32_t *dst, const int32_t *src,
+                          int32_t *hist, ptrdiff_t len);
+
+    void (*sub_qmf_fixed[2])(SynthFilterContext *synth,
+                             DCADCTContext *imdct,
+                             int32_t *pcm_samples,
+                             int32_t **subband_samples_lo,
+                             int32_t **subband_samples_hi,
+                             int32_t *hist1, int *offset, int32_t *hist2,
+                             const int32_t *filter_coeff, ptrdiff_t npcmblocks);
+
+    void (*decor)(int32_t *dst, const int32_t *src, int coeff, ptrdiff_t len);
+
+    void (*dmix_sub_xch)(int32_t *dst1, int32_t *dst2,
+                         const int32_t *src, ptrdiff_t len);
+
+    void (*dmix_sub)(int32_t *dst, const int32_t *src, int coeff, ptrdiff_t len);
+
+    void (*dmix_add)(int32_t *dst, const int32_t *src, int coeff, ptrdiff_t len);
+
+    void (*dmix_scale)(int32_t *dst, int scale, ptrdiff_t len);
+
+    void (*dmix_scale_inv)(int32_t *dst, int scale_inv, ptrdiff_t len);
+
+    void (*assemble_freq_bands)(int32_t *dst, int32_t *src0, int32_t *src1,
+                                const int32_t *coeff, ptrdiff_t len);
+
+    void (*lbr_bank)(float output[32][4], float **input,
+                     const float *coeff, ptrdiff_t ofs, ptrdiff_t len);
+
+    void (*lfe_iir)(float *output, const float *input,
+                    const float iir[5][4], float hist[5][2],
+                    ptrdiff_t factor);
 } DCADSPContext;
 
-void ff_dcadsp_init(DCADSPContext *s);
-void ff_dcadsp_init_aarch64(DCADSPContext *s);
-void ff_dcadsp_init_arm(DCADSPContext *s);
-void ff_dcadsp_init_x86(DCADSPContext *s);
+av_cold void ff_dcadsp_init(DCADSPContext *s);
+av_cold void ff_dcadsp_init_x86(DCADSPContext *s);
 
-#endif /* AVCODEC_DCADSP_H */
+#endif
diff --git a/libavcodec/dcaenc.c b/libavcodec/dcaenc.c
new file mode 100644
index 0000000..6bb7d29
--- /dev/null
+++ b/libavcodec/dcaenc.c
@@ -0,0 +1,999 @@
+/*
+ * DCA encoder
+ * Copyright (C) 2008-2012 Alexander E. Patrakov
+ *               2010 Benjamin Larsson
+ *               2011 Xiang Wang
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+#include "libavutil/channel_layout.h"
+#include "libavutil/common.h"
+#include "libavutil/ffmath.h"
+#include "avcodec.h"
+#include "dca.h"
+#include "dcadata.h"
+#include "dcaenc.h"
+#include "internal.h"
+#include "mathops.h"
+#include "put_bits.h"
+
+#define MAX_CHANNELS 6 /* size of the channel dimension of the DCAEncContext arrays */
+#define DCA_MAX_FRAME_SIZE 16384 /* bytes; encode_init() rejects frame_bits above this << 3 */
+#define DCA_HEADER_SIZE 13
+#define DCA_LFE_SAMPLES 8 /* LFE values produced per frame by lfe_downsample() */
+
+#define DCAENC_SUBBANDS 32 /* subband dimension of the DCAEncContext arrays */
+#define SUBFRAMES 1 /* written to the primary audio header as SUBFRAMES - 1 */
+#define SUBSUBFRAMES 2 /* also the number of masking curves kept per frame */
+#define SUBBAND_SAMPLES (SUBFRAMES * SUBSUBFRAMES * 8) /* samples per subband per frame */
+#define AUBANDS 25 /* number of gammafilter() bands tabulated in auf[] */
+
+typedef struct DCAEncContext { /* per-encoder state, stored in avctx->priv_data */
+    PutBitContext pb;
+    int frame_size; ///< encoded frame size in bytes, (frame_bits + 7) / 8
+    int frame_bits; ///< bit budget of one frame, derived from bit rate and sample rate
+    int fullband_channels; ///< channel count excluding the LFE channel
+    int channels; ///< copy of avctx->channels
+    int lfe_channel; ///< 1 when the channel count is 3 or 6, else 0
+    int samplerate_index; ///< index of avctx->sample_rate in sample_rates[]
+    int bitrate_index; ///< first index in ff_dca_bit_rates[] not below the bit rate
+    int channel_config; ///< value written as "audio channel arrangement"
+    const int32_t *band_interpolation; ///< selected row of band_interpolation[][]
+    const int32_t *band_spectrum; ///< selected row of band_spectrum[][]
+    int lfe_scale_factor; ///< result of calc_one_scale() for the LFE channel
+    softfloat lfe_quant; ///< quantizer for the LFE channel, set by calc_scales()
+    int32_t lfe_peak_cb; ///< get_cb() of the largest |downsampled_lfe[]| value
+    const int8_t *channel_order_tab;  ///< channel reordering table, lfe and non lfe
+
+    int32_t history[512][MAX_CHANNELS]; /* This is a circular buffer */
+    int32_t subband[SUBBAND_SAMPLES][DCAENC_SUBBANDS][MAX_CHANNELS]; ///< output of subband_transform()
+    int32_t quantized[SUBBAND_SAMPLES][DCAENC_SUBBANDS][MAX_CHANNELS]; ///< output of quantize_all()
+    int32_t peak_cb[DCAENC_SUBBANDS][MAX_CHANNELS]; ///< per band/channel peak from find_peaks()
+    int32_t downsampled_lfe[DCA_LFE_SAMPLES]; ///< output of lfe_downsample()
+    int32_t masking_curve_cb[SUBSUBFRAMES][256]; ///< one curve per subsubframe, see calc_masking()
+    int abits[DCAENC_SUBBANDS][MAX_CHANNELS]; ///< chosen by init_quantization_noise()
+    int scale_factor[DCAENC_SUBBANDS][MAX_CHANNELS]; ///< set by calc_scales()
+    softfloat quant[DCAENC_SUBBANDS][MAX_CHANNELS]; ///< set by calc_scales()
+    int32_t eff_masking_curve_cb[256]; ///< per-bin minimum over masking_curve_cb[]
+    int32_t band_masking_cb[32]; ///< per-band minimum found by update_band_masking()
+    int32_t worst_quantization_noise; ///< noise level kept by assign_bits() as next starting point
+    int32_t worst_noise_ever; ///< largest noise level assign_bits() has settled on
+    int consumed_bits; ///< bit estimate recomputed by init_quantization_noise()
+} DCAEncContext;
+
+static int32_t cos_table[2048]; /* 0x7fffffff * cos(2 * pi * i / 2048); all tables here are filled by encode_init() */
+static int32_t band_interpolation[2][512]; /* [0] perfect, [1] non-perfect 32-band FIR, scaled by 0x1000000000 */
+static int32_t band_spectrum[2][8]; /* 200 * log10 of the FIR response at 8 points; rows as above */
+static int32_t auf[9][AUBANDS][256]; /* [sample rate][band][bin] = 10 * (hom + gammafilter) */
+static int32_t cb_to_add[256]; /* 100 * log10(1 + 10^(-0.01 * i)), used by add_cb() */
+static int32_t cb_to_level[2048]; /* 0x7fffffff * 10^(-0.005 * i), decreasing in i */
+static int32_t lfe_fir_64i[512]; /* fixed-point, mirrored copy of ff_dca_lfe_fir_64 */
+
+/* Transfer function of outer and middle ear: takes f in Hz, returns a gain in dB */
+static double hom(double f)
+{
+    double f1 = f / 1000; /* Hz -> kHz; the expression below is written in kHz */
+
+    return -3.64 * pow(f1, -0.8)
+           + 6.8 * exp(-0.6 * (f1 - 3.4) * (f1 - 3.4))
+           - 6.0 * exp(-0.15 * (f1 - 8.7) * (f1 - 8.7))
+           - 0.0006 * (f1 * f1) * (f1 * f1);
+}
+
+static double gammafilter(int i, double f)
+{ /* Response of band i at frequency f; fc[] and erb[] are defined outside this view. */
+    double h = (f - fc[i]) / erb[i]; /* distance from fc[i] in units of erb[i] */
+
+    h = 1 + h * h;
+    h = 1 / (h * h); /* h is now 1 / (1 + d^2)^2, at most 1 */
+    return 20 * log10(h); /* hence the result is never positive */
+}
+
+static av_cold int encode_init(AVCodecContext *avctx)
+{ /* Validates the stream parameters, sizes the frame and builds the shared tables once. */
+    DCAEncContext *c = avctx->priv_data;
+    uint64_t layout = avctx->channel_layout;
+    int i, min_frame_bits;
+
+    c->fullband_channels = c->channels = avctx->channels;
+    c->lfe_channel = (avctx->channels == 3 || avctx->channels == 6);
+    c->band_interpolation = band_interpolation[1]; /* row 1: non-perfect reconstruction FIR */
+    c->band_spectrum = band_spectrum[1];
+    c->worst_quantization_noise = -2047;
+    c->worst_noise_ever = -2047;
+
+    if (!layout) {
+        av_log(avctx, AV_LOG_WARNING, "No channel layout specified. The "
+                                      "encoder will guess the layout, but it "
+                                      "might be incorrect.\n");
+        layout = av_get_default_channel_layout(avctx->channels);
+    }
+    switch (layout) { /* any other layout is rejected with AVERROR_PATCHWELCOME */
+    case AV_CH_LAYOUT_MONO:         c->channel_config = 0; break;
+    case AV_CH_LAYOUT_STEREO:       c->channel_config = 2; break;
+    case AV_CH_LAYOUT_2_2:          c->channel_config = 8; break;
+    case AV_CH_LAYOUT_5POINT0:      c->channel_config = 9; break;
+    case AV_CH_LAYOUT_5POINT1:      c->channel_config = 9; break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unsupported channel layout!\n");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    if (c->lfe_channel) {
+        c->fullband_channels--; /* the LFE channel is coded separately */
+        c->channel_order_tab = channel_reorder_lfe[c->channel_config];
+    } else {
+        c->channel_order_tab = channel_reorder_nolfe[c->channel_config];
+    }
+
+    for (i = 0; i < 9; i++) { /* look the sample rate up in sample_rates[] */
+        if (sample_rates[i] == avctx->sample_rate)
+            break;
+    }
+    if (i == 9) /* sample rate not in the table */
+        return AVERROR(EINVAL);
+    c->samplerate_index = i;
+
+    if (avctx->bit_rate < 32000 || avctx->bit_rate > 3840000) {
+        av_log(avctx, AV_LOG_ERROR, "Bit rate %"PRId64" not supported.\n", (int64_t)avctx->bit_rate);
+        return AVERROR(EINVAL);
+    }
+    for (i = 0; ff_dca_bit_rates[i] < avctx->bit_rate; i++) /* first table entry >= bit_rate */
+        ;
+    c->bitrate_index = i;
+    c->frame_bits = FFALIGN((avctx->bit_rate * 512 + avctx->sample_rate - 1) / avctx->sample_rate, 32);
+    min_frame_bits = 132 + (493 + 28 * 32) * c->fullband_channels + c->lfe_channel * 72;
+    if (c->frame_bits < min_frame_bits || c->frame_bits > (DCA_MAX_FRAME_SIZE << 3))
+        return AVERROR(EINVAL);
+
+    c->frame_size = (c->frame_bits + 7) / 8; /* bits -> bytes, rounded up */
+
+    avctx->frame_size = 32 * SUBBAND_SAMPLES; /* input samples per channel per frame */
+
+    if (!cos_table[0]) { /* cos_table[0] doubles as the "tables built" flag; NOTE(review): no locking here */
+        int j, k;
+
+        cos_table[0] = 0x7fffffff;
+        cos_table[512] = 0;
+        cos_table[1024] = -cos_table[0];
+        for (i = 1; i < 512; i++) { /* compute one quadrant, fill the others by symmetry */
+            cos_table[i]   = (int32_t)(0x7fffffff * cos(M_PI * i / 1024));
+            cos_table[1024-i] = -cos_table[i];
+            cos_table[1024+i] = -cos_table[i];
+            cos_table[2048-i] = cos_table[i];
+        }
+        for (i = 0; i < 2048; i++) {
+            cb_to_level[i] = (int32_t)(0x7fffffff * ff_exp10(-0.005 * i));
+        }
+
+        for (k = 0; k < 32; k++) { /* second store mirrors the first around the table centre */
+            for (j = 0; j < 8; j++) {
+                lfe_fir_64i[64 * j + k] = (int32_t)(0xffffff800000ULL * ff_dca_lfe_fir_64[8 * k + j]);
+                lfe_fir_64i[64 * (7-j) + (63 - k)] = (int32_t)(0xffffff800000ULL * ff_dca_lfe_fir_64[8 * k + j]);
+            }
+        }
+
+        for (i = 0; i < 512; i++) {
+            band_interpolation[0][i] = (int32_t)(0x1000000000ULL * ff_dca_fir_32bands_perfect[i]);
+            band_interpolation[1][i] = (int32_t)(0x1000000000ULL * ff_dca_fir_32bands_nonperfect[i]);
+        }
+
+        for (i = 0; i < 9; i++) { /* one auf[] plane per supported sample rate */
+            for (j = 0; j < AUBANDS; j++) {
+                for (k = 0; k < 256; k++) {
+                    double freq = sample_rates[i] * (k + 0.5) / 512; /* centre of bin k, in Hz */
+
+                    auf[i][j][k] = (int32_t)(10 * (hom(freq) + gammafilter(j, freq)));
+                }
+            }
+        }
+
+        for (i = 0; i < 256; i++) {
+            double add = 1 + ff_exp10(-0.01 * i);
+            cb_to_add[i] = (int32_t)(100 * log10(add));
+        }
+        for (j = 0; j < 8; j++) { /* row 0: perfect reconstruction FIR */
+            double accum = 0;
+            for (i = 0; i < 512; i++) {
+                double reconst = ff_dca_fir_32bands_perfect[i] * ((i & 64) ? (-1) : 1);
+                accum += reconst * cos(2 * M_PI * (i + 0.5 - 256) * (j + 0.5) / 512);
+            }
+            band_spectrum[0][j] = (int32_t)(200 * log10(accum));
+        }
+        for (j = 0; j < 8; j++) { /* row 1: non-perfect reconstruction FIR */
+            double accum = 0;
+            for (i = 0; i < 512; i++) {
+                double reconst = ff_dca_fir_32bands_nonperfect[i] * ((i & 64) ? (-1) : 1);
+                accum += reconst * cos(2 * M_PI * (i + 0.5 - 256) * (j + 0.5) / 512);
+            }
+            band_spectrum[1][j] = (int32_t)(200 * log10(accum));
+        }
+    }
+    return 0; /* success; failures above return AVERROR(EINVAL) or AVERROR_PATCHWELCOME */
+}
+
+static inline int32_t cos_t(int x)
+{ /* Table cosine; x is a phase in units of 1/2048 of a full turn. */
+    return cos_table[x & 2047]; /* the mask wraps any x, negative included, into the table */
+}
+
+static inline int32_t sin_t(int x)
+{ /* Table sine, same phase units as cos_t(). */
+    return cos_t(x - 512); /* 512 is a quarter of the 2048-entry period */
+}
+
+static inline int32_t half32(int32_t a)
+{ /* Halves a, adding 1 before the shift so the result is rounded rather than truncated. */
+    return (a + 1) >> 1;
+}
+
+static inline int32_t mul32(int32_t a, int32_t b)
+{ /* Returns the upper 32 bits of the 64-bit product a * b, with rounding. */
+    int64_t r = (int64_t)a * b + 0x80000000ULL; /* + 2^31: half of the 2^32 divisor applied below */
+    return r >> 32;
+}
+
+static void subband_transform(DCAEncContext *c, const int32_t *input)
+{ /* Fills c->subband[][][ch] for each fullband channel from c->history plus the interleaved input. */
+    int ch, subs, i, k, j;
+
+    for (ch = 0; ch < c->fullband_channels; ch++) {
+        /* History is copied because it is also needed for PSY */
+        int32_t hist[512];
+        int hist_start = 0; /* index of the oldest sample in hist[] */
+        const int chi = c->channel_order_tab[ch]; /* position of this channel inside an input frame */
+
+        for (i = 0; i < 512; i++)
+            hist[i] = c->history[i][ch];
+
+        for (subs = 0; subs < SUBBAND_SAMPLES; subs++) {
+            int32_t accum[64];
+            int32_t resp;
+            int band;
+
+            /* Calculate the convolutions at once */
+            for (i = 0; i < 64; i++)
+                accum[i] = 0;
+
+            for (k = 0, i = hist_start, j = 0; /* oldest part: hist_start .. 511 */
+                    i < 512; k = (k + 1) & 63, i++, j++)
+                accum[k] += mul32(hist[i], c->band_interpolation[j]);
+            for (i = 0; i < hist_start; k = (k + 1) & 63, i++, j++) /* wrapped part: 0 .. hist_start - 1 */
+                accum[k] += mul32(hist[i], c->band_interpolation[j]);
+
+            for (k = 16; k < 32; k++) /* fold the 64 sums into accum[16..47] */
+                accum[k] = accum[k] - accum[31 - k];
+            for (k = 32; k < 48; k++)
+                accum[k] = accum[k] + accum[95 - k];
+
+            for (band = 0; band < 32; band++) {
+                resp = 0;
+                for (i = 16; i < 48; i++) { /* cosine modulation of the 32 folded sums */
+                    int s = (2 * band + 1) * (2 * (i + 16) + 1);
+                    resp += mul32(accum[i], cos_t(s << 3)) >> 3;
+                }
+
+                c->subband[subs][band][ch] = ((band + 1) & 2) ? -resp : resp; /* negated for bands 1, 2, 5, 6, ... */
+            }
+
+            /* Copy in 32 new samples from input */
+            for (i = 0; i < 32; i++)
+                hist[i + hist_start] = input[(subs * 32 + i) * c->channels + chi];
+            hist_start = (hist_start + 32) & 511; /* the overwritten slots were the oldest ones */
+        }
+    }
+}
+
+static void lfe_downsample(DCAEncContext *c, const int32_t *input)
+{ /* Produces DCA_LFE_SAMPLES filtered values in c->downsampled_lfe, one per 64 input samples. */
+    /* FIXME: make 128x LFE downsampling possible */
+    const int lfech = lfe_index[c->channel_config]; /* LFE position in an input frame; table defined elsewhere */
+    int i, j, lfes;
+    int32_t hist[512];
+    int32_t accum;
+    int hist_start = 0; /* index of the oldest sample in hist[] */
+
+    for (i = 0; i < 512; i++)
+        hist[i] = c->history[i][c->channels - 1]; /* history keeps the LFE in the last channel slot */
+
+    for (lfes = 0; lfes < DCA_LFE_SAMPLES; lfes++) {
+        /* Calculate the convolution */
+        accum = 0;
+
+        for (i = hist_start, j = 0; i < 512; i++, j++) /* oldest part of the circular buffer */
+            accum += mul32(hist[i], lfe_fir_64i[j]);
+        for (i = 0; i < hist_start; i++, j++) /* wrapped part */
+            accum += mul32(hist[i], lfe_fir_64i[j]);
+
+        c->downsampled_lfe[lfes] = accum;
+
+        /* Copy in 64 new samples from input */
+        for (i = 0; i < 64; i++)
+            hist[i + hist_start] = input[(lfes * 64 + i) * c->channels + lfech];
+
+        hist_start = (hist_start + 64) & 511;
+    }
+}
+
+typedef struct { /* 32-bit fixed-point complex value used by fft() */
+    int32_t re; /* real part */
+    int32_t im; /* imaginary part */
+} cplx32;
+
+static void fft(const int32_t in[2 * 256], cplx32 out[256])
+{ /* Fixed-point transform of 512 windowed real samples into 256 complex bins. */
+    cplx32 buf[256], rin[256], rout[256];
+    int i, j, k, l;
+
+    /* do two transforms in parallel */
+    for (i = 0; i < 256; i++) { /* even samples go to .re, odd samples to .im */
+        /* Apply the Hann window */
+        rin[i].re = mul32(in[2 * i], 0x3fffffff - (cos_t(8 * i + 2) >> 1));
+        rin[i].im = mul32(in[2 * i + 1], 0x3fffffff - (cos_t(8 * i + 6) >> 1));
+    }
+    /* pre-rotation */
+    for (i = 0; i < 256; i++) {
+        buf[i].re = mul32(cos_t(4 * i + 2), rin[i].re)
+                  - mul32(sin_t(4 * i + 2), rin[i].im);
+        buf[i].im = mul32(cos_t(4 * i + 2), rin[i].im)
+                  + mul32(sin_t(4 * i + 2), rin[i].re);
+    }
+
+    for (j = 256, l = 1; j != 1; j >>= 1, l <<= 1) { /* 8 passes; j is the current sub-transform size */
+        for (k = 0; k < 256; k += j) {
+            for (i = k; i < k + j / 2; i++) {
+                cplx32 sum, diff;
+                int t = 8 * l * i; /* twiddle phase for cos_t()/sin_t() */
+
+                sum.re = buf[i].re + buf[i + j / 2].re;
+                sum.im = buf[i].im + buf[i + j / 2].im;
+
+                diff.re = buf[i].re - buf[i + j / 2].re;
+                diff.im = buf[i].im - buf[i + j / 2].im;
+
+                buf[i].re = half32(sum.re); /* the sum is halved; the difference is scaled by mul32 below */
+                buf[i].im = half32(sum.im);
+
+                buf[i + j / 2].re = mul32(diff.re, cos_t(t))
+                                  - mul32(diff.im, sin_t(t));
+                buf[i + j / 2].im = mul32(diff.im, cos_t(t))
+                                  + mul32(diff.re, sin_t(t));
+            }
+        }
+    }
+    /* post-rotation */
+    for (i = 0; i < 256; i++) {
+        int b = ff_reverse[i]; /* bit-reversed index; the passes above leave buf[] in that order */
+        rout[i].re = mul32(buf[b].re, cos_t(4 * i))
+                   - mul32(buf[b].im, sin_t(4 * i));
+        rout[i].im = mul32(buf[b].im, cos_t(4 * i))
+                   + mul32(buf[b].re, sin_t(4 * i));
+    }
+    for (i = 0; i < 256; i++) {
+        /* separate the results of the two transforms */
+        cplx32 o1, o2;
+
+        o1.re =  rout[i].re - rout[255 - i].re;
+        o1.im =  rout[i].im + rout[255 - i].im;
+
+        o2.re =  rout[i].im - rout[255 - i].im;
+        o2.im = -rout[i].re - rout[255 - i].re;
+
+        /* combine them into one long transform */
+        out[i].re = mul32( o1.re + o2.re, cos_t(2 * i + 1))
+                  + mul32( o1.im - o2.im, sin_t(2 * i + 1));
+        out[i].im = mul32( o1.im + o2.im, cos_t(2 * i + 1))
+                  + mul32(-o1.re + o2.re, sin_t(2 * i + 1));
+    }
+}
+
+static int32_t get_cb(int32_t in)
+{ /* Maps a magnitude to a log-scale value in [-2047, 0] by searching cb_to_level[]. */
+    int i, res;
+
+    res = 0;
+    if (in < 0)
+        in = -in; /* only the magnitude matters */
+    for (i = 1024; i > 0; i >>= 1) { /* bitwise binary search; cb_to_level[] decreases with index */
+        if (cb_to_level[i + res] >= in)
+            res += i;
+    }
+    return -res; /* res is the largest index whose level is still >= |in| */
+}
+
+static int32_t add_cb(int32_t a, int32_t b)
+{ /* Combines two log-scale values: the larger one plus a correction from cb_to_add[]. */
+    if (a < b)
+        FFSWAP(int32_t, a, b); /* from here on a >= b */
+
+    if (a - b >= 256) /* difference beyond the table: the smaller value is ignored */
+        return a;
+    return a + cb_to_add[a - b];
+}
+
+static void adjust_jnd(int samplerate_index,
+                       const int32_t in[512], int32_t out_cb[256])
+{ /* Analyses 512 samples and accumulates a per-bin threshold into out_cb[] (read and written). */
+    int32_t power[256];
+    cplx32 out[256];
+    int32_t out_cb_unnorm[256];
+    int32_t denom;
+    const int32_t ca_cb = -1114;
+    const int32_t cs_cb = 928;
+    int i, j;
+
+    fft(in, out);
+
+    for (j = 0; j < 256; j++) {
+        power[j] = add_cb(get_cb(out[j].re), get_cb(out[j].im)); /* log-scale level of bin j */
+        out_cb_unnorm[j] = -2047; /* and can only grow */
+    }
+
+    for (i = 0; i < AUBANDS; i++) {
+        denom = ca_cb; /* and can only grow */
+        for (j = 0; j < 256; j++) /* signal level seen through band i, summed over all bins */
+            denom = add_cb(denom, power[j] + auf[samplerate_index][i][j]);
+        for (j = 0; j < 256; j++) /* each bin receives band i's weight relative to that level */
+            out_cb_unnorm[j] = add_cb(out_cb_unnorm[j],
+                    -denom + auf[samplerate_index][i][j]);
+    }
+
+    for (j = 0; j < 256; j++) /* add_cb() merges with the caller's existing curve */
+        out_cb[j] = add_cb(out_cb[j], -out_cb_unnorm[j] - ca_cb - cs_cb);
+}
+
+typedef void (*walk_band_t)(DCAEncContext *c, int band1, int band2, int f,
+                            int32_t spectrum1, int32_t spectrum2, int channel,
+                            int32_t * arg); /* callback invoked once per bin f by walk_band_low/high() */
+
+static void walk_band_low(DCAEncContext *c, int band, int channel,
+                          walk_band_t walk, int32_t *arg)
+{ /* Calls walk() for the bins on the lower side of band, paired with band - 1. */
+    int f;
+
+    if (band == 0) { /* no lower neighbour: bins 0..3 with fixed spectrum values */
+        for (f = 0; f < 4; f++)
+            walk(c, 0, 0, f, 0, -2047, channel, arg);
+    } else {
+        for (f = 0; f < 8; f++) /* bins 8 * band - 4 .. 8 * band + 3 */
+            walk(c, band, band - 1, 8 * band - 4 + f,
+                    c->band_spectrum[7 - f], c->band_spectrum[f], channel, arg);
+    }
+}
+
+static void walk_band_high(DCAEncContext *c, int band, int channel,
+                           walk_band_t walk, int32_t *arg)
+{ /* Calls walk() for the bins on the upper side of band, paired with band + 1. */
+    int f;
+
+    if (band == 31) { /* no upper neighbour: bins 252..255 with fixed spectrum values */
+        for (f = 0; f < 4; f++)
+            walk(c, 31, 31, 256 - 4 + f, 0, -2047, channel, arg);
+    } else {
+        for (f = 0; f < 8; f++) /* bins 8 * band + 4 .. 8 * band + 11 */
+            walk(c, band, band + 1, 8 * band + 4 + f,
+                    c->band_spectrum[f], c->band_spectrum[7 - f], channel, arg);
+    }
+}
+
+static void update_band_masking(DCAEncContext *c, int band1, int band2,
+                                int f, int32_t spectrum1, int32_t spectrum2,
+                                int channel, int32_t * arg)
+{ /* walk_band_t callback; band2, spectrum2, channel and arg are not used here. */
+    int32_t value = c->eff_masking_curve_cb[f] - spectrum1;
+
+    if (value < c->band_masking_cb[band1]) /* keep the minimum over all bins visited for band1 */
+        c->band_masking_cb[band1] = value;
+}
+
+static void calc_masking(DCAEncContext *c, const int32_t *input)
+{ /* Recomputes masking_curve_cb, eff_masking_curve_cb and band_masking_cb for this frame. */
+    int i, k, band, ch, ssf;
+    int32_t data[512];
+
+    for (i = 0; i < 256; i++)
+        for (ssf = 0; ssf < SUBSUBFRAMES; ssf++)
+            c->masking_curve_cb[ssf][i] = -2047; /* starting value; adjust_jnd() accumulates onto it */
+
+    for (ssf = 0; ssf < SUBSUBFRAMES; ssf++)
+        for (ch = 0; ch < c->fullband_channels; ch++) {
+            const int chi = c->channel_order_tab[ch];
+
+            for (i = 0, k = 128 + 256 * ssf; k < 512; i++, k++) /* tail of the stored history */
+                data[i] = c->history[k][ch];
+            for (k -= 512; i < 512; i++, k++) /* k restarts at 0: the rest comes from the new input */
+                data[i] = input[k * c->channels + chi];
+            adjust_jnd(c->samplerate_index, data, c->masking_curve_cb[ssf]);
+        }
+    for (i = 0; i < 256; i++) { /* effective curve = per-bin minimum over the subsubframes */
+        int32_t m = 2048;
+
+        for (ssf = 0; ssf < SUBSUBFRAMES; ssf++)
+            if (c->masking_curve_cb[ssf][i] < m)
+                m = c->masking_curve_cb[ssf][i];
+        c->eff_masking_curve_cb[i] = m;
+    }
+
+    for (band = 0; band < 32; band++) {
+        c->band_masking_cb[band] = 2048; /* upper start value, lowered by update_band_masking() */
+        walk_band_low(c, band, 0, update_band_masking, NULL);
+        walk_band_high(c, band, 0, update_band_masking, NULL);
+    }
+}
+
+static void find_peaks(DCAEncContext *c)
+{ /* Stores get_cb() of the largest magnitude per band/channel, and for the LFE if present. */
+    int band, ch;
+
+    for (band = 0; band < 32; band++)
+        for (ch = 0; ch < c->fullband_channels; ch++) {
+            int sample;
+            int32_t m = 0; /* running maximum of |subband sample| */
+
+            for (sample = 0; sample < SUBBAND_SAMPLES; sample++) {
+                int32_t s = abs(c->subband[sample][band][ch]);
+                if (m < s)
+                    m = s;
+            }
+            c->peak_cb[band][ch] = get_cb(m);
+        }
+
+    if (c->lfe_channel) {
+        int sample;
+        int32_t m = 0; /* running maximum of |downsampled LFE value| */
+
+        for (sample = 0; sample < DCA_LFE_SAMPLES; sample++)
+            if (m < abs(c->downsampled_lfe[sample]))
+                m = abs(c->downsampled_lfe[sample]);
+        c->lfe_peak_cb = get_cb(m);
+    }
+}
+
+static const int snr_fudge = 128; /* noise step used by assign_bits() to bracket its search */
+#define USED_1ABITS 1 /* bit flags returned by init_quantization_noise(): some band got abits == 1 */
+#define USED_NABITS 2 /* some band got an abits value computed from its SNR */
+#define USED_26ABITS 4 /* some band got abits == 26 */
+
+static int init_quantization_noise(DCAEncContext *c, int noise)
+{ /* Picks abits for every band/channel at the given noise level and recomputes consumed_bits. */
+    int ch, band, ret = 0; /* ret collects the USED_* flags */
+
+    c->consumed_bits = 132 + 493 * c->fullband_channels; /* fixed cost before the per-band sample bits */
+    if (c->lfe_channel)
+        c->consumed_bits += 72;
+
+    /* attempt to guess the bit distribution based on the previous frame */
+    for (ch = 0; ch < c->fullband_channels; ch++) {
+        for (band = 0; band < 32; band++) {
+            int snr_cb = c->peak_cb[band][ch] - c->band_masking_cb[band] - noise;
+
+            if (snr_cb >= 1312) {
+                c->abits[band][ch] = 26;
+                ret |= USED_26ABITS;
+            } else if (snr_cb >= 222) {
+                c->abits[band][ch] = 8 + mul32(snr_cb - 222, 69000000); /* 69000000 / 2^32 is about 1 / 62 */
+                ret |= USED_NABITS;
+            } else if (snr_cb >= 0) {
+                c->abits[band][ch] = 2 + mul32(snr_cb, 106000000); /* 106000000 / 2^32 is about 1 / 40.5 */
+                ret |= USED_NABITS;
+            } else {
+                c->abits[band][ch] = 1;
+                ret |= USED_1ABITS;
+            }
+        }
+    }
+
+    for (band = 0; band < 32; band++)
+        for (ch = 0; ch < c->fullband_channels; ch++) {
+            c->consumed_bits += bit_consumption[c->abits[band][ch]]; /* table defined outside this view */
+        }
+
+    return ret;
+}
+
+static void assign_bits(DCAEncContext *c)
+{ /* Searches for the lowest noise level whose bit allocation still fits into frame_bits. */
+    /* Find the bounds where the binary search should work */
+    int low, high, down;
+    int used_abits = 0;
+
+    init_quantization_noise(c, c->worst_quantization_noise); /* start from the level kept from the last call */
+    low = high = c->worst_quantization_noise;
+    if (c->consumed_bits > c->frame_bits) {
+        while (c->consumed_bits > c->frame_bits) { /* too many bits: raise the noise level until it fits */
+            av_assert0(used_abits != USED_1ABITS); /* every band already at abits 1: cannot shrink further */
+            low = high;
+            high += snr_fudge;
+            used_abits = init_quantization_noise(c, high);
+        }
+    } else {
+        while (c->consumed_bits <= c->frame_bits) { /* it fits: lower the noise level until it no longer does */
+            high = low;
+            if (used_abits == USED_26ABITS)
+                goto out; /* The requested bitrate is too high, pad with zeros */
+            low -= snr_fudge;
+            used_abits = init_quantization_noise(c, low);
+        }
+    }
+
+    /* Now do a binary search between low and high to see what fits */
+    for (down = snr_fudge >> 1; down; down >>= 1) {
+        init_quantization_noise(c, high - down);
+        if (c->consumed_bits <= c->frame_bits)
+            high -= down;
+    }
+    init_quantization_noise(c, high); /* leave abits/consumed_bits matching the chosen level */
+out:
+    c->worst_quantization_noise = high;
+    if (high > c->worst_noise_ever)
+        c->worst_noise_ever = high;
+}
+
+static void shift_history(DCAEncContext *c, const int32_t *input)
+{ /* Overwrites c->history with the first 512 interleaved input samples of every channel. */
+    int k, ch;
+
+    for (k = 0; k < 512; k++)
+        for (ch = 0; ch < c->channels; ch++) {
+            const int chi = c->channel_order_tab[ch]; /* source position of channel ch in an input frame */
+
+            c->history[k][ch] = input[k * c->channels + chi];
+        }
+}
+
+static int32_t quantize_value(int32_t value, softfloat quant)
+{ /* Scales value by quant.m via mul32(), then shifts right by quant.e with rounding. */
+    int32_t offset = 1 << (quant.e - 1); /* half of the final divisor 2^e; needs quant.e >= 1 */
+
+    value = mul32(value, quant.m) + offset;
+    value = value >> quant.e;
+    return value;
+}
+
+static int calc_one_scale(int32_t peak_cb, int abits, softfloat *quant)
+{ /* Chooses a scale factor index for the given peak and abits; the quantizer goes to *quant. */
+    int32_t peak;
+    int our_nscale, try_remove;
+    softfloat our_quant;
+
+    av_assert0(peak_cb <= 0); /* together these keep -peak_cb a valid cb_to_level[] index */
+    av_assert0(peak_cb >= -2047);
+
+    our_nscale = 127;
+    peak = cb_to_level[-peak_cb]; /* log-scale peak back to a linear level */
+
+    for (try_remove = 64; try_remove > 0; try_remove >>= 1) { /* binary search downwards from 127 */
+        if (scalefactor_inv[our_nscale - try_remove].e + stepsize_inv[abits].e <= 17)
+            continue; /* shift count would not be positive: skip this candidate */
+        our_quant.m = mul32(scalefactor_inv[our_nscale - try_remove].m, stepsize_inv[abits].m);
+        our_quant.e = scalefactor_inv[our_nscale - try_remove].e + stepsize_inv[abits].e - 17;
+        if ((ff_dca_quant_levels[abits] - 1) / 2 < quantize_value(peak, our_quant))
+            continue; /* peak would exceed the largest quantizer level */
+        our_nscale -= try_remove;
+    }
+
+    if (our_nscale >= 125) /* indices above 124 are never returned */
+        our_nscale = 124;
+
+    quant->m = mul32(scalefactor_inv[our_nscale].m, stepsize_inv[abits].m);
+    quant->e = scalefactor_inv[our_nscale].e + stepsize_inv[abits].e - 17;
+    av_assert0((ff_dca_quant_levels[abits] - 1) / 2 >= quantize_value(peak, *quant));
+
+    return our_nscale;
+}
+
+static void calc_scales(DCAEncContext *c)
+{ /* Runs calc_one_scale() for every band/channel and, if present, for the LFE channel. */
+    int band, ch;
+
+    for (band = 0; band < 32; band++)
+        for (ch = 0; ch < c->fullband_channels; ch++)
+            c->scale_factor[band][ch] = calc_one_scale(c->peak_cb[band][ch],
+                                                       c->abits[band][ch],
+                                                       &c->quant[band][ch]);
+
+    if (c->lfe_channel) /* the LFE always uses abits index 11 */
+        c->lfe_scale_factor = calc_one_scale(c->lfe_peak_cb, 11, &c->lfe_quant);
+}
+
+static void quantize_all(DCAEncContext *c)
+{ /* Quantizes every subband sample with its band/channel quantizer into c->quantized. */
+    int sample, band, ch;
+
+    for (sample = 0; sample < SUBBAND_SAMPLES; sample++)
+        for (band = 0; band < 32; band++)
+            for (ch = 0; ch < c->fullband_channels; ch++)
+                c->quantized[sample][band][ch] = quantize_value(c->subband[sample][band][ch], c->quant[band][ch]);
+}
+
+static void put_frame_header(DCAEncContext *c)
+{ /* Writes the frame header fields to c->pb, in the order listed below. */
+    /* SYNC */
+    put_bits(&c->pb, 16, 0x7ffe);
+    put_bits(&c->pb, 16, 0x8001);
+
+    /* Frame type: normal */
+    put_bits(&c->pb, 1, 1);
+
+    /* Deficit sample count: none */
+    put_bits(&c->pb, 5, 31);
+
+    /* CRC is not present */
+    put_bits(&c->pb, 1, 0);
+
+    /* Number of PCM sample blocks */
+    put_bits(&c->pb, 7, SUBBAND_SAMPLES - 1);
+
+    /* Primary frame byte size */
+    put_bits(&c->pb, 14, c->frame_size - 1);
+
+    /* Audio channel arrangement */
+    put_bits(&c->pb, 6, c->channel_config);
+
+    /* Core audio sampling frequency */
+    put_bits(&c->pb, 4, bitstream_sfreq[c->samplerate_index]);
+
+    /* Transmission bit rate */
+    put_bits(&c->pb, 5, c->bitrate_index);
+
+    /* Embedded down mix: disabled */
+    put_bits(&c->pb, 1, 0);
+
+    /* Embedded dynamic range flag: not present */
+    put_bits(&c->pb, 1, 0);
+
+    /* Embedded time stamp flag: not present */
+    put_bits(&c->pb, 1, 0);
+
+    /* Auxiliary data flag: not present */
+    put_bits(&c->pb, 1, 0);
+
+    /* HDCD source: no */
+    put_bits(&c->pb, 1, 0);
+
+    /* Extension audio ID: N/A */
+    put_bits(&c->pb, 3, 0);
+
+    /* Extended audio data: not present */
+    put_bits(&c->pb, 1, 0);
+
+    /* Audio sync word insertion flag: after each sub-frame */
+    put_bits(&c->pb, 1, 0);
+
+    /* Low frequency effects flag: not present or 64x subsampling */
+    put_bits(&c->pb, 2, c->lfe_channel ? 2 : 0);
+
+    /* Predictor history switch flag: on */
+    put_bits(&c->pb, 1, 1);
+
+    /* No CRC */
+    /* Multirate interpolator switch: non-perfect reconstruction */
+    put_bits(&c->pb, 1, 0);
+
+    /* Encoder software revision: 7 */
+    put_bits(&c->pb, 4, 7);
+
+    /* Copy history: 0 */
+    put_bits(&c->pb, 2, 0);
+
+    /* Source PCM resolution: 16 bits, not DTS ES */
+    put_bits(&c->pb, 3, 0);
+
+    /* Front sum/difference coding: no */
+    put_bits(&c->pb, 1, 0);
+
+    /* Surrounds sum/difference coding: no */
+    put_bits(&c->pb, 1, 0);
+
+    /* Dialog normalization: 0 dB */
+    put_bits(&c->pb, 4, 0);
+}
+
+static void put_primary_audio_header(DCAEncContext *c)
+{ /* Writes the primary audio coding header; per-channel fields are repeated for each fullband channel. */
+    static const int bitlen[11] = { 0, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3 }; /* field width per codebook select */
+    static const int thr[11]    = { 0, 1, 3, 3, 3, 3, 7, 7, 7, 7, 7 }; /* value written: all ones in bitlen bits */
+
+    int ch, i;
+    /* Number of subframes */
+    put_bits(&c->pb, 4, SUBFRAMES - 1);
+
+    /* Number of primary audio channels */
+    put_bits(&c->pb, 3, c->fullband_channels - 1);
+
+    /* Subband activity count */
+    for (ch = 0; ch < c->fullband_channels; ch++)
+        put_bits(&c->pb, 5, DCAENC_SUBBANDS - 2);
+
+    /* High frequency VQ start subband */
+    for (ch = 0; ch < c->fullband_channels; ch++)
+        put_bits(&c->pb, 5, DCAENC_SUBBANDS - 1);
+
+    /* Joint intensity coding index: 0, 0 */
+    for (ch = 0; ch < c->fullband_channels; ch++)
+        put_bits(&c->pb, 3, 0);
+
+    /* Transient mode codebook: A4, A4 (arbitrary) */
+    for (ch = 0; ch < c->fullband_channels; ch++)
+        put_bits(&c->pb, 2, 0);
+
+    /* Scale factor code book: 7 bit linear, 7-bit sqrt table (for each channel) */
+    for (ch = 0; ch < c->fullband_channels; ch++)
+        put_bits(&c->pb, 3, 6);
+
+    /* Bit allocation quantizer select: linear 5-bit */
+    for (ch = 0; ch < c->fullband_channels; ch++)
+        put_bits(&c->pb, 3, 6);
+
+    /* Quantization index codebook select: dummy data
+       to avoid transmission of scale factor adjustment */
+    for (i = 1; i < 11; i++) /* index 0 of bitlen[]/thr[] is not written */
+        for (ch = 0; ch < c->fullband_channels; ch++)
+            put_bits(&c->pb, bitlen[i], thr[i]);
+
+    /* Scale factor adjustment index: not transmitted */
+    /* Audio header CRC check word: not transmitted */
+}
+
+static void put_subframe_samples(DCAEncContext *c, int ss, int band, int ch)
+{
+    const int abits = c->abits[band][ch];
+    int grp, k, n, packed;
+
+    if (abits > 7) {
+        /* Wide allocations: one signed code of bit_consumption[abits] / 16 bits per sample. */
+        for (n = 0; n < 8; n++)
+            put_sbits(&c->pb, bit_consumption[abits] / 16, c->quantized[ss * 8 + n][band][ch]);
+        return;
+    }
+    /* Otherwise fold each run of 4 offset samples into one number, last sample most significant. */
+    for (grp = 0; grp < 8; grp += 4) {
+        for (packed = 0, k = 3; k >= 0; k--) {
+            packed *= ff_dca_quant_levels[abits];
+            packed += c->quantized[ss * 8 + grp + k][band][ch];
+            packed += (ff_dca_quant_levels[abits] - 1) / 2;
+        }
+        put_bits(&c->pb, bit_consumption[abits] / 4, packed);
+    }
+}
+
+static void put_subframe(DCAEncContext *c, int subframe)
+{
+    const int nch = c->fullband_channels;
+    int chan, sb, sub, n;
+
+    /* Subsubframes count, stored minus one (2 bits) */
+    put_bits(&c->pb, 2, SUBSUBFRAMES - 1);
+
+    /* Partial subsubframe sample count: dummy (3 bits) */
+    put_bits(&c->pb, 3, 0);
+
+    /* Prediction mode: a 0 bit (no ADPCM) for every channel and subband */
+    for (n = 0; n < nch * DCAENC_SUBBANDS; n++)
+        put_bits(&c->pb, 1, 0);
+
+    /* Prediction VQ address: not transmitted */
+
+    /* Bit allocation index (5 bits per channel and subband) */
+    for (chan = 0; chan < nch; chan++)
+        for (sb = 0; sb < DCAENC_SUBBANDS; sb++)
+            put_bits(&c->pb, 5, c->abits[sb][chan]);
+
+    if (SUBSUBFRAMES > 1) {
+        /* Transition mode: none, a 0 bit for every channel and subband */
+        for (n = 0; n < nch * DCAENC_SUBBANDS; n++)
+            put_bits(&c->pb, 1, 0); /* codebook A4 */
+    }
+
+    /* Scale factors (7 bits per channel and subband) */
+    for (chan = 0; chan < nch; chan++)
+        for (sb = 0; sb < DCAENC_SUBBANDS; sb++)
+            put_bits(&c->pb, 7, c->scale_factor[sb][chan]);
+
+    /* Not transmitted:
+     *   joint subband scale factor codebook select,
+     *   scale factors for joint subband coding,
+     *   stereo down-mix coefficients, dynamic range coefficient,
+     *   side information CRC check word,
+     *   VQ encoded high frequency subbands. */
+
+    /* LFE data: DCA_LFE_SAMPLES 8-bit samples followed by an 8-bit scale factor */
+    if (c->lfe_channel) {
+        for (n = 0; n < DCA_LFE_SAMPLES; n++)
+            put_bits(&c->pb, 8, quantize_value(c->downsampled_lfe[n], c->lfe_quant) & 0xff);
+        put_bits(&c->pb, 8, c->lfe_scale_factor);
+    }
+
+    /* Audio data, one pass per subsubframe */
+    for (sub = 0; sub < SUBSUBFRAMES; sub++)
+        for (chan = 0; chan < nch; chan++)
+            for (sb = 0; sb < DCAENC_SUBBANDS; sb++)
+                put_subframe_samples(c, sub, sb, chan);
+
+    /* DSYNC */
+    put_bits(&c->pb, 16, 0xffff);
+}
+
+static int encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
+                        const AVFrame *frame, int *got_packet_ptr)
+{
+    DCAEncContext *c = avctx->priv_data;
+    const int32_t *samples;
+    int ret, n;
+
+    ret = ff_alloc_packet2(avctx, avpkt, c->frame_size, 0);
+    if (ret < 0)
+        return ret;
+
+    /* Per-frame processing stages, all fed from frame->data[0] */
+    samples = (const int32_t *)frame->data[0];
+    subband_transform(c, samples);
+    if (c->lfe_channel)
+        lfe_downsample(c, samples);
+    calc_masking(c, samples);
+    find_peaks(c);
+    assign_bits(c);
+    calc_scales(c);
+    quantize_all(c);
+    shift_history(c, samples);
+
+    /* Serialize: frame header, primary audio header, then every subframe */
+    init_put_bits(&c->pb, avpkt->data, avpkt->size);
+    put_frame_header(c);
+    put_primary_audio_header(c);
+    for (n = 0; n < SUBFRAMES; n++)
+        put_subframe(c, n);
+
+    /* Pad with zero bits until 8 * frame_size bits have been written */
+    for (n = 8 * c->frame_size - put_bits_count(&c->pb); n > 0; n--)
+        put_bits(&c->pb, 1, 0);
+    flush_put_bits(&c->pb);
+
+    avpkt->pts      = frame->pts;
+    avpkt->duration = ff_samples_to_time_base(avctx, frame->nb_samples);
+    avpkt->size     = put_bits_count(&c->pb) >> 3;
+    *got_packet_ptr = 1;
+    return 0;
+}
+
+static const AVCodecDefault defaults[] = { /* referenced by ff_dca_encoder.defaults */
+    { "b",          "1411200" }, /* 1411200 == 44100 * 16 * 2; presumably the default bit rate -- confirm */
+    { NULL }, /* terminating entry */
+};
+
+AVCodec ff_dca_encoder = { /* codec descriptor wiring the functions and tables of this encoder together */
+    .name                  = "dca",
+    .long_name             = NULL_IF_CONFIG_SMALL("DCA (DTS Coherent Acoustics)"),
+    .type                  = AVMEDIA_TYPE_AUDIO,
+    .id                    = AV_CODEC_ID_DTS,
+    .priv_data_size        = sizeof(DCAEncContext), /* encode_frame() reads avctx->priv_data as a DCAEncContext */
+    .init                  = encode_init,
+    .encode2               = encode_frame,
+    .capabilities          = AV_CODEC_CAP_EXPERIMENTAL,
+    .sample_fmts           = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S32, /* encode_frame() reads frame->data[0] as int32_t */
+                                                            AV_SAMPLE_FMT_NONE },
+    .supported_samplerates = sample_rates, /* table from dcaenc.h, ends with a 0 entry */
+    .channel_layouts       = (const uint64_t[]) { AV_CH_LAYOUT_MONO,
+                                                  AV_CH_LAYOUT_STEREO,
+                                                  AV_CH_LAYOUT_2_2,
+                                                  AV_CH_LAYOUT_5POINT0,
+                                                  AV_CH_LAYOUT_5POINT1,
+                                                  0 }, /* list ends with a 0 entry */
+    .defaults              = defaults,
+};
diff --git a/libavcodec/dcaenc.h b/libavcodec/dcaenc.h
new file mode 100644
index 0000000..eccfb42
--- /dev/null
+++ b/libavcodec/dcaenc.h
@@ -0,0 +1,148 @@
+/*
+ * DCA encoder tables
+ * Copyright (C) 2008-2012 Alexander E. Patrakov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DCAENC_H
+#define AVCODEC_DCAENC_H
+
+#include <stdint.h>
+
+typedef struct { /* element type of the stepsize_inv[] and scalefactor_inv[] tables below */
+    int32_t m; /* presumably the mantissa -- confirm against the code using this type */
+    int32_t e; /* presumably the exponent/shift that goes with m -- confirm */
+} softfloat;
+
+static const int sample_rates[] = { /* 9 rates followed by a 0 entry; used as ff_dca_encoder.supported_samplerates */
+    8000, 16000, 32000, 11025, 22050, 44100, 12000, 24000, 48000, 0,
+};
+
+static const uint8_t bitstream_sfreq[] = { 1, 2, 3, 6, 7, 8, 11, 12, 13 }; /* indexed by samplerate_index; written as the 4-bit core sampling frequency header field */
+
+/* Auditory filter center frequencies and bandwidths, in Hz.
+ * The last two are made up, because there is no scientific data.
+ */
+static const uint16_t fc[] = { /* center frequencies in Hz, 25 entries (same count as erb[]) */
+    50, 150, 250, 350, 450, 570, 700, 840, 1000, 1170, 1370, 1600, 1850, 2150,
+    2500, 2900, 3400, 4000, 4800, 5800, 7000, 8500, 10500, 13500, 17000
+};
+
+static const uint16_t erb[] = { /* bandwidths in Hz, 25 entries; presumably erb[i] belongs to fc[i] -- confirm */
+    80, 100, 100, 100, 110, 120, 140, 150, 160, 190, 210, 240, 280,
+    320, 380, 450, 550, 700, 900, 1100, 1300, 1800, 2500, 3500, 4500
+};
+
+static const softfloat stepsize_inv[27] = { /* { m, e } pairs; 27 entries like bit_consumption[], so presumably indexed by abits -- confirm */
+    {0, 0}, {1342177360, 21}, {2147483647, 21}, {1342177360, 20},
+    {1819901661, 20}, {2147483647, 20}, {1278263843, 19}, {1579032492, 19},
+    {1412817763, 18}, {1220162327, 17}, {1118482133, 16}, {1917391412, 16},
+    {1766017772, 15}, {1525212826, 14}, {1290553940, 13}, {2097179000, 13},
+    {1677683200, 12}, {1497972244, 11}, {1310893147, 10}, {1165354136, 9},
+    {1748031204, 9}, {1542092044, 8}, {1636178017, 7}, {1636178017, 6},
+    {1636178017, 5}, {1636178017, 4}, {1636178017, 3},
+};
+
+static const softfloat scalefactor_inv[128] = { /* { m, e } pairs; 128 entries, presumably one per 7-bit scale factor index -- confirm */
+    {2147483647, 1}, {2147483647, 1}, {2147483647, 2}, {2147483647, 2},
+    {2147483647, 2}, {2147483647, 2}, {1431655765, 2}, {1431655765, 2},
+    {1431655765, 2}, {2147483647, 3}, {2147483647, 3}, {1717986918, 3},
+    {1431655765, 3}, {1227133513, 3}, {1227133513, 3}, {2147483647, 4},
+    {1717986918, 4}, {1561806289, 4}, {1431655765, 4}, {1227133513, 4},
+    {2147483647, 5}, {1908874353, 5}, {1717986918, 5}, {1493901668, 5},
+    {1321528398, 5}, {1145324612, 5}, {2021161080, 6}, {1808407282, 6},
+    {1561806289, 6}, {1374389534, 6}, {1227133513, 6}, {2147483647, 7},
+    {1908874353, 7}, {1676084798, 7}, {1477838209, 7}, {1296593900, 7},
+    {1145324612, 7}, {2021161080, 8}, {1773405851, 8}, {1561806289, 8},
+    {1374389534, 8}, {1216273924, 8}, {2139127680, 9}, {1882725390, 9},
+    {1660893697, 9}, {1462116526, 9}, {1287484341, 9}, {1135859119, 9},
+    {1999112050, 10}, {1762037865, 10}, {1552982525, 10}, {1367551775, 10},
+    {1205604855, 10}, {2124660150, 11}, {1871509153, 11}, {1648443220, 11},
+    {1452459217, 11}, {1279990253, 11}, {1127704233, 11}, {1987368509, 12},
+    {1750814693, 12}, {1542632939, 12}, {1359099663, 12}, {1197398995, 12},
+    {2109880792, 13}, {1858853132, 13}, {1638006149, 13}, {1443165385, 13},
+    {1271479187, 13}, {1120235993, 13}, {1973767086, 14}, {1739045674, 14},
+    {1532153461, 14}, {1349922194, 14}, {1189384493, 14}, {2095804865, 15},
+    {1846464029, 15}, {1626872524, 15}, {1433347133, 15}, {1262853884, 15},
+    {1112619678, 15}, {1960569045, 16}, {1727349015, 16}, {1521881227, 16},
+    {1340842289, 16}, {1181357555, 16}, {2081669156, 17}, {1834047752, 17},
+    {1615889229, 17}, {1423675973, 17}, {1254322457, 17}, {1105123583, 17},
+    {1947330755, 18}, {1715693602, 18}, {1511607799, 18}, {1331801790, 18},
+    {1173384427, 18}, {2067616532, 19}, {1821667648, 19}, {1604980024, 19},
+    {1414066955, 19}, {1245861410, 19}, {1097665748, 19}, {1934193616, 20},
+    {1704119624, 20}, {1501412075, 20}, {1322817107, 20}, {1165466323, 20},
+    {2053666205, 21}, {1809379407, 21}, {1594151671, 21}, {1404526328, 21},
+    {1237455941, 21}, {1090259329, 21}, {1921143210, 22}, {1692621231, 22},
+    {1491281857, 22}, {1313892269, 22}, {1157603482, 22}, {2039810470, 23},
+    {1797172644, 23}, {1583396912, 23}, {1395050052, 23}, {1229107276, 23},
+    {1082903494, 23}, {1082903494, 23}, {1082903494, 23}, {1082903494, 23},
+};
+
+/* Indexed by the abits (bit allocation) value. Manually derived from
+ * Table B.5: Selection of quantization levels and codebooks
+ * FIXME: will become invalid when Huffman codes are introduced.
+ */
+static const int bit_consumption[27] = { /* dcaenc.c writes [abits] / 4 bits per packed 4-sample group (abits <= 7), else [abits] / 16 bits per sample */
+    -8, 28, 40, 48, 52, 60, 68, 76, 80, 96,
+    112, 128, 144, 160, 176, 192, 208, 224, 240, 256,
+    272, 288, 304, 320, 336, 352, 368,
+};
+
+static const int8_t lfe_index[16] = { /* 16 entries like the channel_reorder_* tables; presumably indexed by channel_config -- confirm */
+    1, 2, 2, 2, 2, 3, 2, 3, 2, 3, 2, 3, 1, 3, 2, 3
+};
+
+static const int8_t channel_reorder_lfe[16][9] = { /* 16 rows of up to 9 channel indices; -1 appears to pad unused slots -- confirm */
+    { 0, -1, -1, -1, -1, -1, -1, -1, -1 },
+    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
+    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
+    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
+    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
+    { 2,  0,  1, -1, -1, -1, -1, -1, -1 },
+    { 0,  1,  3, -1, -1, -1, -1, -1, -1 },
+    { 2,  0,  1,  4, -1, -1, -1, -1, -1 },
+    { 0,  1,  3,  4, -1, -1, -1, -1, -1 },
+    { 2,  0,  1,  4,  5, -1, -1, -1, -1 },
+    { 3,  4,  0,  1,  5,  6, -1, -1, -1 },
+    { 2,  0,  1,  4,  5,  6, -1, -1, -1 },
+    { 0,  6,  4,  5,  2,  3, -1, -1, -1 },
+    { 4,  2,  5,  0,  1,  6,  7, -1, -1 },
+    { 5,  6,  0,  1,  7,  3,  8,  4, -1 },
+    { 4,  2,  5,  0,  1,  6,  8,  7, -1 },
+};
+
+static const int8_t channel_reorder_nolfe[16][9] = { /* same layout as channel_reorder_lfe; presumably the variant for streams without LFE -- confirm */
+    { 0, -1, -1, -1, -1, -1, -1, -1, -1 },
+    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
+    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
+    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
+    { 0,  1, -1, -1, -1, -1, -1, -1, -1 },
+    { 2,  0,  1, -1, -1, -1, -1, -1, -1 },
+    { 0,  1,  2, -1, -1, -1, -1, -1, -1 },
+    { 2,  0,  1,  3, -1, -1, -1, -1, -1 },
+    { 0,  1,  2,  3, -1, -1, -1, -1, -1 },
+    { 2,  0,  1,  3,  4, -1, -1, -1, -1 },
+    { 2,  3,  0,  1,  4,  5, -1, -1, -1 },
+    { 2,  0,  1,  3,  4,  5, -1, -1, -1 },
+    { 0,  5,  3,  4,  1,  2, -1, -1, -1 },
+    { 3,  2,  4,  0,  1,  5,  6, -1, -1 },
+    { 4,  5,  0,  1,  6,  2,  7,  3, -1 },
+    { 3,  2,  4,  0,  1,  5,  7,  6, -1 },
+};
+
+#endif /* AVCODEC_DCAENC_H */
diff --git a/libavcodec/dcahuff.c b/libavcodec/dcahuff.c
new file mode 100644
index 0000000..bea3530
--- /dev/null
+++ b/libavcodec/dcahuff.c
@@ -0,0 +1,1337 @@
+/*
+ * DCA compatible decoder - huffman tables
+ * Copyright (C) 2004 Gildas Bazin
+ * Copyright (C) 2007 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+
+#include "avcodec.h"
+#include "get_bits.h"
+#include "dcahuff.h"
+
+#define TMODE_COUNT 4 /* number of rows in each tmode_* table below */
+static const uint8_t tmode_vlc_bits[TMODE_COUNT] = { 3, 3, 3, 2 }; /* equals the largest value in the matching tmode_bits row */
+static const uint16_t tmode_codes[TMODE_COUNT][4] = { /* 4 code words per row */
+    { 0x0000, 0x0002, 0x0006, 0x0007 },
+    { 0x0002, 0x0006, 0x0007, 0x0000 },
+    { 0x0006, 0x0007, 0x0000, 0x0002 },
+    { 0x0000, 0x0001, 0x0002, 0x0003 }
+};
+
+static const uint8_t tmode_bits[TMODE_COUNT][4] = { /* presumably the bit length of tmode_codes[row][i] -- confirm */
+    { 1, 2, 3, 3 },
+    { 2, 3, 3, 1 },
+    { 3, 3, 1, 2 },
+    { 2, 2, 2, 2 }
+};
+
+#define BITALLOC_12_COUNT    5 /* number of rows in each bitalloc_12_* table below */
+#define BITALLOC_12_VLC_BITS 9
+static const uint8_t bitalloc_12_vlc_bits[BITALLOC_12_COUNT] = { /* one entry per row of the tables below */
+    9, 7, 7, 9, 9
+};
+
+static const uint16_t bitalloc_12_codes[BITALLOC_12_COUNT][12] = { /* 12 code words per row */
+    { 0x0000, 0x0002, 0x0006, 0x000E, 0x001E, 0x003E, 0x00FF, 0x00FE,
+      0x01FB, 0x01FA, 0x01F9, 0x01F8, },
+    { 0x0001, 0x0000, 0x0002, 0x000F, 0x000C, 0x001D, 0x0039, 0x0038,
+      0x0037, 0x0036, 0x0035, 0x0034, },
+    { 0x0000, 0x0007, 0x0005, 0x0004, 0x0002, 0x000D, 0x000C, 0x0006,
+      0x000F, 0x001D, 0x0039, 0x0038, },
+    { 0x0003, 0x0002, 0x0000, 0x0002, 0x0006, 0x000E, 0x001E, 0x003E,
+      0x007E, 0x00FE, 0x01FF, 0x01FE, },
+    { 0x0001, 0x0000, 0x0002, 0x0006, 0x000E, 0x003F, 0x003D, 0x007C,
+      0x0079, 0x0078, 0x00FB, 0x00FA, }
+};
+
+static const uint8_t bitalloc_12_bits[BITALLOC_12_COUNT][12] = { /* presumably the bit length of bitalloc_12_codes[row][i] -- confirm */
+    { 1, 2, 3, 4, 5, 6, 8, 8, 9, 9,  9,  9 },
+    { 1, 2, 3, 5, 5, 6, 7, 7, 7, 7,  7,  7 },
+    { 2, 3, 3, 3, 3, 4, 4, 4, 5, 6,  7,  7 },
+    { 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10 },
+    { 1, 2, 3, 4, 5, 7, 7, 8, 8, 8,  9,  9 }
+};
+
+#define SCALES_COUNT    5
+#define SCALES_VLC_BITS 9
+static const uint16_t scales_codes[SCALES_COUNT][129] = {
+    { 0x3AB0, 0x3AB2, 0x3AB4, 0x3AB6, 0x3AB8, 0x3ABA, 0x3ABC, 0x3ABE,
+      0x3AC0, 0x3AC2, 0x3AC4, 0x3AC6, 0x3AC8, 0x3ACA, 0x3ACC, 0x3ACE,
+      0x3AD0, 0x3AD2, 0x3AD4, 0x3AD6, 0x3AD8, 0x3ADA, 0x3ADC, 0x3ADE,
+      0x3AE0, 0x3AE2, 0x3AE4, 0x3AE6, 0x3AE8, 0x3AEA, 0x3AEC, 0x3AEE,
+      0x3AF0, 0x3AF2, 0x3AF4, 0x3AF6, 0x3AF8, 0x3AFA, 0x3AFC, 0x3AFE,
+      0x0540, 0x0542, 0x0544, 0x0546, 0x0548, 0x054A, 0x054C, 0x054E,
+      0x0558, 0x055E, 0x02AD, 0x0154, 0x0754, 0x03A8, 0x0056, 0x0028,
+      0x00E8, 0x004A, 0x000B, 0x003B, 0x0013, 0x0003, 0x000F, 0x0005,
+      0x0001, 0x0006, 0x0000, 0x0008, 0x001C, 0x0004, 0x0024, 0x004B,
+      0x00E9, 0x0029, 0x0057, 0x03A9, 0x0755, 0x0155, 0x02AE, 0x055F,
+      0x0559, 0x054F, 0x054D, 0x054B, 0x0549, 0x0547, 0x0545, 0x0543,
+      0x0541, 0x3AFF, 0x3AFD, 0x3AFB, 0x3AF9, 0x3AF7, 0x3AF5, 0x3AF3,
+      0x3AF1, 0x3AEF, 0x3AED, 0x3AEB, 0x3AE9, 0x3AE7, 0x3AE5, 0x3AE3,
+      0x3AE1, 0x3ADF, 0x3ADD, 0x3ADB, 0x3AD9, 0x3AD7, 0x3AD5, 0x3AD3,
+      0x3AD1, 0x3ACF, 0x3ACD, 0x3ACB, 0x3AC9, 0x3AC7, 0x3AC5, 0x3AC3,
+      0x3AC1, 0x3ABF, 0x3ABD, 0x3ABB, 0x3AB9, 0x3AB7, 0x3AB5, 0x3AB3,
+      0x3AB1, },
+    { 0x0F60, 0x0F62, 0x0F64, 0x0F66, 0x0F68, 0x0F6A, 0x0F6C, 0x0F6E,
+      0x0F70, 0x0F72, 0x0F74, 0x0F76, 0x0F78, 0x0F7A, 0x0F7C, 0x0F7E,
+      0x0F80, 0x0F82, 0x0F84, 0x0F86, 0x0F88, 0x0F8A, 0x0F8C, 0x0F8E,
+      0x0F90, 0x0F92, 0x0F94, 0x0F96, 0x0F98, 0x0F9A, 0x0F9C, 0x0F9E,
+      0x0FA0, 0x0FA2, 0x0FA4, 0x0FA6, 0x0FA8, 0x0FAA, 0x0FAC, 0x0FAE,
+      0x0FB0, 0x0FB2, 0x0FB4, 0x0FB6, 0x0FB8, 0x0FBA, 0x0FBC, 0x0FBE,
+      0x07A0, 0x07A2, 0x03D2, 0x01EA, 0x00FC, 0x007F, 0x001C, 0x000C,
+      0x0004, 0x0034, 0x0010, 0x001B, 0x0009, 0x000B, 0x000E, 0x0001,
+      0x0003, 0x0002, 0x000F, 0x000C, 0x000A, 0x0000, 0x0011, 0x0035,
+      0x0005, 0x000D, 0x001D, 0x003C, 0x00FD, 0x01EB, 0x03D3, 0x07A3,
+      0x07A1, 0x0FBF, 0x0FBD, 0x0FBB, 0x0FB9, 0x0FB7, 0x0FB5, 0x0FB3,
+      0x0FB1, 0x0FAF, 0x0FAD, 0x0FAB, 0x0FA9, 0x0FA7, 0x0FA5, 0x0FA3,
+      0x0FA1, 0x0F9F, 0x0F9D, 0x0F9B, 0x0F99, 0x0F97, 0x0F95, 0x0F93,
+      0x0F91, 0x0F8F, 0x0F8D, 0x0F8B, 0x0F89, 0x0F87, 0x0F85, 0x0F83,
+      0x0F81, 0x0F7F, 0x0F7D, 0x0F7B, 0x0F79, 0x0F77, 0x0F75, 0x0F73,
+      0x0F71, 0x0F6F, 0x0F6D, 0x0F6B, 0x0F69, 0x0F67, 0x0F65, 0x0F63,
+      0x0F61, },
+    { 0x51D0, 0x51D2, 0x51D4, 0x51D6, 0x51D8, 0x51DA, 0x51DC, 0x51DE,
+      0x51E0, 0x51E2, 0x51E4, 0x51E6, 0x51E8, 0x51EA, 0x51EC, 0x51EE,
+      0x51F0, 0x51F2, 0x51F4, 0x51F6, 0x51F8, 0x51FA, 0x51FC, 0x51FE,
+      0x70C0, 0x70C2, 0x70C4, 0x70C6, 0x70C8, 0x70CA, 0x70CC, 0x70CE,
+      0x70EC, 0x10EA, 0x3868, 0x3877, 0x0876, 0x1C35, 0x0434, 0x0A34,
+      0x0E1B, 0x021B, 0x051B, 0x070F, 0x010F, 0x0380, 0x0080, 0x0140,
+      0x01C1, 0x0041, 0x00A1, 0x00E2, 0x0022, 0x0052, 0x0072, 0x0012,
+      0x002A, 0x003A, 0x000A, 0x0016, 0x001E, 0x0006, 0x000C, 0x0000,
+      0x0004, 0x0001, 0x000D, 0x0007, 0x001F, 0x0017, 0x000B, 0x003B,
+      0x002B, 0x0013, 0x0073, 0x0053, 0x0023, 0x00E3, 0x00A2, 0x0042,
+      0x01C2, 0x0141, 0x0081, 0x0381, 0x028C, 0x010C, 0x051C, 0x021C,
+      0x0E1C, 0x0A35, 0x0435, 0x1C3A, 0x0877, 0x0874, 0x3869, 0x10EB,
+      0x70ED, 0x70CF, 0x70CD, 0x70CB, 0x70C9, 0x70C7, 0x70C5, 0x70C3,
+      0x70C1, 0x51FF, 0x51FD, 0x51FB, 0x51F9, 0x51F7, 0x51F5, 0x51F3,
+      0x51F1, 0x51EF, 0x51ED, 0x51EB, 0x51E9, 0x51E7, 0x51E5, 0x51E3,
+      0x51E1, 0x51DF, 0x51DD, 0x51DB, 0x51D9, 0x51D7, 0x51D5, 0x51D3,
+      0x51D1, },
+    { 0x6F64, 0x6F66, 0x6F68, 0x6F6A, 0x6F6C, 0x6F6E, 0x6F70, 0x6F72,
+      0x6F74, 0x6F76, 0x6F78, 0x6F7A, 0x6F7C, 0x6F7E, 0x6F80, 0x6F82,
+      0x6F84, 0x6F86, 0x6F88, 0x6F8A, 0x6F8C, 0x6F8E, 0x6F90, 0x6F92,
+      0x6F94, 0x6F96, 0x6F98, 0x6F9A, 0x6F9C, 0x6F9E, 0x6FA0, 0x6FA2,
+      0x6FA4, 0x6FA6, 0x6FA8, 0x6FAA, 0x6FAC, 0x6FAE, 0x6FB0, 0x6FB2,
+      0x6FB4, 0x6FB6, 0x17B4, 0x37DC, 0x0BDB, 0x1BEF, 0x05EE, 0x0DF8,
+      0x02F8, 0x06FD, 0x017D, 0x037F, 0x00BF, 0x0040, 0x00C0, 0x0021,
+      0x0061, 0x0011, 0x0031, 0x0009, 0x0019, 0x0006, 0x000E, 0x0004,
+      0x0000, 0x0005, 0x000F, 0x0007, 0x001A, 0x000A, 0x0036, 0x0016,
+      0x006E, 0x002E, 0x00C1, 0x0041, 0x01BC, 0x00BC, 0x037A, 0x017A,
+      0x02F9, 0x0DF9, 0x05EF, 0x05EC, 0x1BD8, 0x37DD, 0x17B5, 0x6FB7,
+      0x6FB5, 0x6FB3, 0x6FB1, 0x6FAF, 0x6FAD, 0x6FAB, 0x6FA9, 0x6FA7,
+      0x6FA5, 0x6FA3, 0x6FA1, 0x6F9F, 0x6F9D, 0x6F9B, 0x6F99, 0x6F97,
+      0x6F95, 0x6F93, 0x6F91, 0x6F8F, 0x6F8D, 0x6F8B, 0x6F89, 0x6F87,
+      0x6F85, 0x6F83, 0x6F81, 0x6F7F, 0x6F7D, 0x6F7B, 0x6F79, 0x6F77,
+      0x6F75, 0x6F73, 0x6F71, 0x6F6F, 0x6F6D, 0x6F6B, 0x6F69, 0x6F67,
+      0x6F65, },
+    { 0xDF54, 0xDF56, 0xDFC8, 0xDFCA, 0xDFCC, 0xDFCE, 0xDFD0, 0xDFD2,
+      0xDFD4, 0xDFD6, 0xDFD8, 0xDFDA, 0xDFDC, 0xDFDE, 0xDFE0, 0xDFE2,
+      0x0FE8, 0x2FEA, 0x6FA8, 0x6FF6, 0x07F5, 0x07F7, 0x37D2, 0x37F9,
+      0x03F8, 0x0BF8, 0x0BFB, 0x1BEB, 0x01FA, 0x05FA, 0x09FA, 0x0DFA,
+      0x0DFF, 0x00FF, 0x02FF, 0x06FB, 0x007C, 0x017C, 0x027C, 0x027F,
+      0x003C, 0x00BC, 0x013C, 0x01BC, 0x001C, 0x005C, 0x009C, 0x00DC,
+      0x000C, 0x002C, 0x004C, 0x006C, 0x0004, 0x0014, 0x0024, 0x0034,
+      0x0000, 0x0008, 0x0010, 0x0018, 0x001E, 0x0002, 0x0006, 0x000A,
+      0x000E, 0x000B, 0x0007, 0x0003, 0x001F, 0x0019, 0x0011, 0x0009,
+      0x0001, 0x0035, 0x0025, 0x0015, 0x0005, 0x006D, 0x004D, 0x002D,
+      0x000D, 0x00DD, 0x009D, 0x005D, 0x001D, 0x01BD, 0x013D, 0x00BD,
+      0x003D, 0x037C, 0x027D, 0x017D, 0x007D, 0x06FC, 0x04FC, 0x02FC,
+      0x00FC, 0x0DFB, 0x09FB, 0x05FB, 0x01FB, 0x1BF8, 0x1BE8, 0x0BF9,
+      0x03F9, 0x37FA, 0x37D3, 0x17F4, 0x07F6, 0x6FF7, 0x6FA9, 0x2FEB,
+      0x0FE9, 0xDFE3, 0xDFE1, 0xDFDF, 0xDFDD, 0xDFDB, 0xDFD9, 0xDFD7,
+      0xDFD5, 0xDFD3, 0xDFD1, 0xDFCF, 0xDFCD, 0xDFCB, 0xDFC9, 0xDF57,
+      0xDF55, }
+};
+
+static const uint8_t scales_bits[SCALES_COUNT][129] = {
+    { 14, 14, 14, 14, 14, 14, 14, 14,
+      14, 14, 14, 14, 14, 14, 14, 14,
+      14, 14, 14, 14, 14, 14, 14, 14,
+      14, 14, 14, 14, 14, 14, 14, 14,
+      14, 14, 14, 14, 14, 14, 14, 14,
+      13, 13, 13, 13, 13, 13, 13, 13,
+      13, 13, 12, 11, 11, 10,  9,  8,
+       8,  7,  6,  6,  5,  4,  4,  3,
+       2,  3,  3,  4,  5,  5,  6,  7,
+       8,  8,  9, 10, 11, 11, 12, 13,
+      13, 13, 13, 13, 13, 13, 13, 13,
+      13, 14, 14, 14, 14, 14, 14, 14,
+      14, 14, 14, 14, 14, 14, 14, 14,
+      14, 14, 14, 14, 14, 14, 14, 14,
+      14, 14, 14, 14, 14, 14, 14, 14,
+      14, 14, 14, 14, 14, 14, 14, 14,
+      14, },
+    { 15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      14, 14, 13, 12, 11, 10,  8,  7,
+       6,  6,  5,  5,  4,  4,  4,  3,
+       3,  3,  4,  4,  4,  4,  5,  6,
+       6,  7,  8,  9, 11, 12, 13, 14,
+      14, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, },
+    { 15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 14, 14, 14, 13, 13, 12, 12,
+      12, 11, 11, 11, 10, 10,  9,  9,
+       9,  8,  8,  8,  7,  7,  7,  6,
+       6,  6,  5,  5,  5,  4,  4,  3,
+       3,  3,  4,  4,  5,  5,  5,  6,
+       6,  6,  7,  7,  7,  8,  8,  8,
+       9,  9,  9, 10, 10, 10, 11, 11,
+      12, 12, 12, 13, 13, 13, 14, 14,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, },
+    { 15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 14, 14, 13, 13, 12, 12,
+      11, 11, 10, 10,  9,  8,  8,  7,
+       7,  6,  6,  5,  5,  4,  4,  3,
+       2,  3,  4,  4,  5,  5,  6,  6,
+       7,  7,  8,  8,  9,  9, 10, 10,
+      11, 12, 12, 12, 13, 14, 14, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, 15, 15, 15, 15, 15, 15, 15,
+      15, },
+    { 16, 16, 16, 16, 16, 16, 16, 16,
+      16, 16, 16, 16, 16, 16, 16, 16,
+      15, 15, 15, 15, 14, 14, 14, 14,
+      13, 13, 13, 13, 12, 12, 12, 12,
+      12, 11, 11, 11, 10, 10, 10, 10,
+       9,  9,  9,  9,  8,  8,  8,  8,
+       7,  7,  7,  7,  6,  6,  6,  6,
+       5,  5,  5,  5,  5,  4,  4,  4,
+       4,  4,  4,  4,  5,  5,  5,  5,
+       5,  6,  6,  6,  6,  7,  7,  7,
+       7,  8,  8,  8,  8,  9,  9,  9,
+       9, 10, 10, 10, 10, 11, 11, 11,
+      11, 12, 12, 12, 12, 13, 13, 13,
+      13, 14, 14, 14, 14, 15, 15, 15,
+      15, 16, 16, 16, 16, 16, 16, 16,
+      16, 16, 16, 16, 16, 16, 16, 16,
+      16,
+    }
+};
+
+static const uint16_t bitalloc_3_codes[3] = {
+    0x0003, 0x0000, 0x0002,
+};
+
+static const uint8_t bitalloc_3_bits[3] = {
+    2,  1,  2,
+};
+
+static const uint16_t bitalloc_5_codes_a[5] = {
+    0x000F, 0x0006, 0x0000, 0x0002, 0x000E,
+};
+
+static const uint16_t bitalloc_5_codes_b[5] = {
+    0x0007, 0x0001, 0x0002, 0x0000, 0x0006,
+};
+
+static const uint16_t bitalloc_5_codes_c[5] = {
+    0x0007, 0x0005, 0x0000, 0x0004, 0x0006,
+};
+
+static const uint8_t bitalloc_5_bits_a[5] = {
+    4,  3,  1,  2,  4,
+};
+
+static const uint8_t bitalloc_5_bits_b[5] = {
+    3,  2,  2,  2,  3,
+};
+
+static const uint8_t bitalloc_5_bits_c[5] = {
+    3,  3,  1,  3,  3,
+};
+
+static const uint16_t bitalloc_7_codes_a[7] = {
+    0x001E, 0x000E, 0x0005, 0x0000, 0x0006, 0x0004, 0x001F,
+};
+
+static const uint16_t bitalloc_7_codes_b[7] = {
+    0x0014, 0x000B, 0x0000, 0x0003, 0x0001, 0x0004, 0x0015,
+};
+
+static const uint16_t bitalloc_7_codes_c[7] = {
+    0x0000, 0x0002, 0x0001, 0x0003, 0x0002, 0x0003, 0x0001,
+};
+
+static const uint8_t bitalloc_7_bits_a[7] = {
+    5,  4,  3,  1,  3,  3,  5,
+};
+
+static const uint8_t bitalloc_7_bits_b[7] = {
+    5,  4,  2,  2,  2,  3,  5,
+};
+
+static const uint8_t bitalloc_7_bits_c[7] = {
+    4,  4,  2,  2,  2,  4,  4,
+};
+
+static const uint16_t bitalloc_9_codes_a[9] = {
+    0x0030, 0x0019, 0x0009, 0x0005, 0x0000, 0x0007, 0x000D, 0x0008,
+    0x0031,
+};
+
+static const uint16_t bitalloc_9_codes_b[9] = {
+    0x0018, 0x001A, 0x0002, 0x0007, 0x0002, 0x0000, 0x0003, 0x001B,
+    0x0019,
+};
+
+static const uint16_t bitalloc_9_codes_c[9] = {
+    0x001C, 0x000F, 0x0002, 0x0007, 0x0002, 0x0000, 0x0006, 0x0006,
+    0x001D,
+};
+
+static const uint8_t bitalloc_9_bits_a[9] = {
+    6,  5,  4,  3,  1,  3,  4,  4,  6,
+};
+
+static const uint8_t bitalloc_9_bits_b[9] = {
+    5,  5,  3,  3,  2,  2,  3,  5,  5,
+};
+
+static const uint8_t bitalloc_9_bits_c[9] = {
+    6,  5,  3,  3,  2,  2,  3,  4,  6,
+};
+
+static const uint16_t bitalloc_13_codes_a[13] = {
+    0x0070, 0x002E, 0x0039, 0x001D, 0x000C, 0x000F, 0x0000, 0x0004,
+    0x000D, 0x000A, 0x0016, 0x002F, 0x0071,
+};
+
+static const uint16_t bitalloc_13_codes_b[13] = {
+    0x0038, 0x0010, 0x001D, 0x0007, 0x000F, 0x0005, 0x0000, 0x0006,
+    0x0002, 0x0009, 0x0006, 0x0011, 0x0039,
+};
+
+static const uint16_t bitalloc_13_codes_c[13] = {
+    0x0004, 0x001A, 0x0003, 0x000E, 0x0000, 0x0003, 0x0005, 0x0004,
+    0x0002, 0x000F, 0x000C, 0x001B, 0x0005,
+};
+
+static const uint8_t bitalloc_13_bits_a[13] = {
+     7,  6,  6,  5,  4,  4,  1,  3,  4,  4,  5,  6,  7,
+};
+
+static const uint8_t bitalloc_13_bits_b[13] = {
+     6,  5,  5,  4,  4,  3,  2,  3,  3,  4,  4,  5,  6,
+};
+
+static const uint8_t bitalloc_13_bits_c[13] = {
+     5,  5,  4,  4,  3,  3,  3,  3,  3,  4,  4,  5,  5,
+};
+
+static const uint16_t bitalloc_17_codes_a[17] = {
+    0x0154, 0x00AB, 0x002B, 0x000B, 0x0003, 0x000A, 0x0001, 0x0006,
+    0x0001, 0x0007, 0x0004, 0x000B, 0x0000, 0x0004, 0x0014, 0x0054,
+    0x0155,
+};
+
+static const uint16_t bitalloc_17_codes_b[17] = {
+    0x007C, 0x003F, 0x0019, 0x000D, 0x001C, 0x0008, 0x000F, 0x0005,
+    0x0000, 0x0006, 0x0002, 0x0009, 0x001D, 0x000E, 0x001E, 0x0018,
+    0x007D,
+};
+
+static const uint16_t bitalloc_17_codes_c[17] = {
+    0x002C, 0x0017, 0x0005, 0x001C, 0x0003, 0x000A, 0x000F, 0x0003,
+    0x0006, 0x0004, 0x0000, 0x000B, 0x0004, 0x001D, 0x000A, 0x0004,
+    0x002D,
+};
+
+static const uint16_t bitalloc_17_codes_d[17] = {
+    0x0100, 0x0102, 0x0082, 0x0042, 0x0022, 0x0012, 0x000A, 0x0006,
+    0x0000, 0x0007, 0x000B, 0x0013, 0x0023, 0x0043, 0x0083, 0x0103,
+    0x0101,
+};
+
+static const uint16_t bitalloc_17_codes_e[17] = {
+    0x00E8, 0x00F6, 0x0075, 0x0034, 0x003B, 0x001B, 0x001F, 0x0004,
+    0x0000, 0x0005, 0x000C, 0x001C, 0x003C, 0x0035, 0x007A, 0x00F7,
+    0x00E9,
+};
+
+static const uint16_t bitalloc_17_codes_f[17] = {
+    0x0004, 0x0003, 0x001E, 0x0001, 0x0001, 0x000E, 0x0001, 0x0004,
+    0x0006, 0x0005, 0x0002, 0x000F, 0x0006, 0x000E, 0x001F, 0x0000,
+    0x0005,
+};
+
+static const uint16_t bitalloc_17_codes_g[17] = {
+    0x0060, 0x007E, 0x0031, 0x0019, 0x000D, 0x0004, 0x0000, 0x0006,
+    0x0002, 0x0007, 0x0001, 0x0005, 0x000E, 0x001E, 0x003E, 0x007F,
+    0x0061,
+};
+
+static const uint8_t bitalloc_17_bits_a[17] = {
+    12, 11,  9,  7,  5,  4,  3,  3,  2,  3,  3,  4,  4,  6,  8, 10,
+    12,
+};
+
+static const uint8_t bitalloc_17_bits_b[17] = {
+    8,  7,  6,  5,  5,  4,  4,  3,  2,  3,  3,  4,  5,  5,  6,  6,
+    8,
+};
+
+static const uint8_t bitalloc_17_bits_c[17] = {
+    7,  6,  5,  5,  4,  4,  4,  3,  3,  3,  3,  4,  4,  5,  5,  5,
+    7,
+};
+
+static const uint8_t bitalloc_17_bits_d[17] = {
+    9,  9,  8,  7,  6,  5,  4,  3,  1,  3,  4,  5,  6,  7,  8,  9,
+    9,
+};
+
+static const uint8_t bitalloc_17_bits_e[17] = {
+    8,  8,  7,  6,  6,  5,  5,  3,  1,  3,  4,  5,  6,  6,  7,  8,
+    8,
+};
+
+static const uint8_t bitalloc_17_bits_f[17] = {
+    8,  7,  6,  5,  4,  4,  3,  3,  3,  3,  3,  4,  4,  5,  6,  6,
+    8,
+};
+
+static const uint8_t bitalloc_17_bits_g[17] = {
+    8,  8,  7,  6,  5,  4,  3,  3,  2,  3,  3,  4,  5,  6,  7,  8,
+    8,
+};
+
+static const uint16_t bitalloc_25_codes_a[25] = {
+    0x2854, 0x142B, 0x050B, 0x0143, 0x00A2, 0x0052, 0x002E, 0x0015,
+    0x0004, 0x000E, 0x0000, 0x0003, 0x0006, 0x0004, 0x0001, 0x000F,
+    0x0005, 0x0016, 0x002F, 0x0053, 0x00A3, 0x00A0, 0x0284, 0x0A14,
+    0x2855,
+};
+
+static const uint16_t bitalloc_25_codes_b[25] = {
+    0x001C, 0x000F, 0x0005, 0x0000, 0x0030, 0x0036, 0x000E, 0x0019,
+    0x0001, 0x0008, 0x000E, 0x0001, 0x0005, 0x0002, 0x000F, 0x0009,
+    0x0006, 0x001A, 0x000F, 0x0037, 0x0031, 0x0001, 0x0006, 0x0004,
+    0x001D,
+};
+
+static const uint16_t bitalloc_25_codes_c[25] = {
+    0x004C, 0x0027, 0x006D, 0x0028, 0x0037, 0x000E, 0x0015, 0x0000,
+    0x0005, 0x0008, 0x000B, 0x000E, 0x0001, 0x000F, 0x000C, 0x0009,
+    0x0006, 0x0001, 0x001A, 0x000F, 0x0008, 0x0029, 0x0012, 0x006C,
+    0x004D,
+};
+
+static const uint16_t bitalloc_25_codes_d[25] = {
+    0x0780, 0x0782, 0x03C2, 0x01E2, 0x00FE, 0x0079, 0x003D, 0x001C,
+    0x000C, 0x0004, 0x0000, 0x0006, 0x0002, 0x0007, 0x0001, 0x0005,
+    0x000D, 0x001D, 0x003E, 0x007E, 0x00FF, 0x01E3, 0x03C3, 0x0783,
+    0x0781,
+};
+
+static const uint16_t bitalloc_25_codes_e[25] = {
+    0x003C, 0x0092, 0x0018, 0x001F, 0x004E, 0x000D, 0x0025, 0x0004,
+    0x0010, 0x0000, 0x000A, 0x0002, 0x0003, 0x0003, 0x000B, 0x0001,
+    0x0011, 0x0005, 0x0026, 0x000E, 0x004F, 0x0048, 0x0019, 0x0093,
+    0x003D,
+};
+
+static const uint16_t bitalloc_25_codes_f[25] = { /* code values, 25-entry book, variant f; bit lengths are in bitalloc_25_bits_f */
+    0x0324, 0x0193, 0x00CE, 0x0065, 0x0024, 0x000C, 0x0013, 0x0004,
+    0x0007, 0x000A, 0x000D, 0x000F, 0x0001, 0x0000, 0x000E, 0x000B,
+    0x0008, 0x0005, 0x0018, 0x000D, 0x0025, 0x0066, 0x00CF, 0x00C8,
+    0x0325,
+};
+
+static const uint16_t bitalloc_25_codes_g[25] = { /* code values, 25-entry book, variant g; bit lengths are in bitalloc_25_bits_g */
+    0x03A8, 0x03AE, 0x01D5, 0x0094, 0x0014, 0x004B, 0x000B, 0x003B,
+    0x0013, 0x0003, 0x000F, 0x0005, 0x0001, 0x0006, 0x0000, 0x0008,
+    0x001C, 0x0004, 0x0024, 0x0074, 0x0015, 0x0095, 0x01D6, 0x03AF,
+    0x03A9,
+};
+
+static const uint8_t bitalloc_25_bits_a[25] = { /* bit lengths for bitalloc_25_codes_a; longest is 14 (bitalloc_maxbits entry: 9) */
+    14, 13, 11,  9,  8,  7,  6,  5,  4,  4,  3,  3,  3,  3,  3,  4,
+     4,  5,  6,  7,  8,  8, 10, 12, 14,
+};
+
+static const uint8_t bitalloc_25_bits_b[25] = { /* bit lengths for bitalloc_25_codes_b; longest is 9 (bitalloc_maxbits entry: 9) */
+    9,  8,  7,  6,  6,  6,  5,  5,  4,  4,  4,  3,  3,  3,  4,  4,
+    4,  5,  5,  6,  6,  6,  7,  7,  9,
+};
+
+static const uint8_t bitalloc_25_bits_c[25] = { /* bit lengths for bitalloc_25_codes_c; longest is 8 (bitalloc_maxbits entry: 8) */
+    8,  7,  7,  6,  6,  5,  5,  4,  4,  4,  4,  4,  3,  4,  4,  4,
+    4,  4,  5,  5,  5,  6,  6,  7,  8,
+};
+
+static const uint8_t bitalloc_25_bits_d[25] = { /* bit lengths for bitalloc_25_codes_d; longest is 12 (bitalloc_maxbits entry: 9) */
+    12, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  3,  2,  3,  3,  4,
+     5,  6,  7,  8,  9, 10, 11, 12, 12,
+};
+
+static const uint8_t bitalloc_25_bits_e[25] = { /* bit lengths for bitalloc_25_codes_e; longest is 8 (bitalloc_maxbits entry: 8) */
+    8,  8,  7,  7,  7,  6,  6,  5,  5,  4,  4,  3,  2,  3,  4,  4,
+    5,  5,  6,  6,  7,  7,  7,  8,  8,
+};
+
+static const uint8_t bitalloc_25_bits_f[25] = { /* bit lengths for bitalloc_25_codes_f; longest is 10 (bitalloc_maxbits entry: 9) */
+    10,  9,  8,  7,  6,  5,  5,  4,  4,  4,  4,  4,  3,  3,  4,  4,
+     4,  4,  5,  5,  6,  7,  8,  8, 10,
+};
+
+static const uint8_t bitalloc_25_bits_g[25] = { /* bit lengths for bitalloc_25_codes_g; longest is 10 (bitalloc_maxbits entry: 9) */
+    10, 10,  9,  8,  7,  7,  6,  6,  5,  4,  4,  3,  2,  3,  3,  4,
+     5,  5,  6,  7,  7,  8,  9, 10, 10,
+};
+
+static const uint16_t bitalloc_33_codes_a[33] = { /* code values, 33-entry book, variant a; bit lengths are in bitalloc_33_bits_a */
+    0x1580, 0x1582, 0x0AC2, 0x0562, 0x02B2, 0x015E, 0x00AD, 0x0054,
+    0x001C, 0x003C, 0x000F, 0x001F, 0x0008, 0x000B, 0x000D, 0x0000,
+    0x0002, 0x0001, 0x000E, 0x000C, 0x0009, 0x0006, 0x0014, 0x003D,
+    0x001D, 0x0055, 0x00AE, 0x015F, 0x02B3, 0x0563, 0x0AC3, 0x1583,
+    0x1581,
+};
+
+static const uint16_t bitalloc_33_codes_b[33] = { /* code values, 33-entry book, variant b; bit lengths are in bitalloc_33_bits_b */
+    0x030C, 0x0187, 0x006D, 0x0028, 0x0037, 0x0066, 0x0015, 0x0031,
+    0x0000, 0x000B, 0x0012, 0x001A, 0x0001, 0x0007, 0x000A, 0x000E,
+    0x0001, 0x000F, 0x000B, 0x0008, 0x0004, 0x001B, 0x0013, 0x000C,
+    0x0001, 0x0032, 0x001A, 0x0067, 0x0060, 0x0029, 0x00C2, 0x006C,
+    0x030D,
+};
+
+static const uint16_t bitalloc_33_codes_c[33] = { /* code values, 33-entry book, variant c; bit lengths are in bitalloc_33_bits_c */
+    0x00CC, 0x0067, 0x0005, 0x0070, 0x0003, 0x001A, 0x0039, 0x003F,
+    0x000A, 0x0012, 0x0018, 0x001D, 0x0001, 0x0003, 0x0007, 0x000A,
+    0x000D, 0x000B, 0x0008, 0x0004, 0x0002, 0x001E, 0x0019, 0x0013,
+    0x000B, 0x0000, 0x003E, 0x001B, 0x0018, 0x0071, 0x0032, 0x0004,
+    0x00CD,
+};
+
+static const uint16_t bitalloc_33_codes_d[33] = { /* code values, 33-entry book, variant d; bit lengths are in bitalloc_33_bits_d */
+    0x3AF8, 0x3AFA, 0x1D7E, 0x0EBC, 0x075C, 0x03AC, 0x01D4, 0x0094,
+    0x0014, 0x004B, 0x000B, 0x003B, 0x0013, 0x0003, 0x000F, 0x0005,
+    0x0001, 0x0006, 0x0000, 0x0008, 0x001C, 0x0004, 0x0024, 0x0074,
+    0x0015, 0x0095, 0x01D5, 0x03AD, 0x075D, 0x0EBD, 0x1D7F, 0x3AFB,
+    0x3AF9,
+};
+
+static const uint16_t bitalloc_33_codes_e[33] = { /* code values, 33-entry book, variant e; bit lengths are in bitalloc_33_bits_e */
+    0x01C8, 0x01E6, 0x0064, 0x00E2, 0x00E5, 0x0030, 0x0033, 0x0073,
+    0x007A, 0x001A, 0x003A, 0x0002, 0x001A, 0x001F, 0x0007, 0x0001,
+    0x0002, 0x0002, 0x000C, 0x0000, 0x001B, 0x0003, 0x003B, 0x001B,
+    0x007B, 0x0078, 0x0070, 0x0031, 0x00F2, 0x00E3, 0x0065, 0x01E7,
+    0x01C9,
+};
+
+static const uint16_t bitalloc_33_codes_f[33] = { /* code values, 33-entry book, variant f; bit lengths are in bitalloc_33_bits_f */
+    0x0724, 0x0393, 0x01CE, 0x00E5, 0x002C, 0x0008, 0x0017, 0x003E,
+    0x0005, 0x0014, 0x001D, 0x0000, 0x0003, 0x0006, 0x0008, 0x000B,
+    0x000D, 0x000C, 0x0009, 0x0007, 0x0004, 0x0001, 0x001E, 0x0015,
+    0x000A, 0x003F, 0x0038, 0x0009, 0x002D, 0x00E6, 0x01CF, 0x01C8,
+    0x0725,
+};
+
+static const uint16_t bitalloc_33_codes_g[33] = { /* code values, 33-entry book, variant g; bit lengths are in bitalloc_33_bits_g */
+    0x0284, 0x0042, 0x0140, 0x0143, 0x003E, 0x00BE, 0x0011, 0x0051,
+    0x0009, 0x0029, 0x0005, 0x0015, 0x0000, 0x0008, 0x000E, 0x0002,
+    0x0006, 0x0003, 0x000F, 0x0009, 0x0001, 0x0016, 0x0006, 0x002E,
+    0x000E, 0x005E, 0x001E, 0x00BF, 0x003F, 0x0020, 0x0141, 0x0043,
+    0x0285,
+};
+
+static const uint8_t bitalloc_33_bits_a[33] = { /* bit lengths for bitalloc_33_codes_a; longest is 13 (bitalloc_maxbits entry: 9) */
+    13, 13, 12, 11, 10,  9,  8,  7,  6,  6,  5,  5,  4,  4,  4,  3,
+     3,  3,  4,  4,  4,  4,  5,  6,  6,  7,  8,  9, 10, 11, 12, 13,
+    13,
+};
+
+static const uint8_t bitalloc_33_bits_b[33] = { /* bit lengths for bitalloc_33_codes_b; longest is 10 (bitalloc_maxbits entry: 9) */
+    10,  9,  8,  7,  7,  7,  6,  6,  5,  5,  5,  5,  4,  4,  4,  4,
+     3,  4,  4,  4,  4,  5,  5,  5,  5,  6,  6,  7,  7,  7,  8,  8,
+    10,
+};
+
+static const uint8_t bitalloc_33_bits_c[33] = { /* bit lengths for bitalloc_33_codes_c; longest is 9 (bitalloc_maxbits entry: 9) */
+    9,  8,  7,  7,  6,  6,  6,  6,  5,  5,  5,  5,  4,  4,  4,  4,
+    4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  6,  6,  6,  7,  7,  7,
+    9,
+};
+
+static const uint8_t bitalloc_33_bits_d[33] = { /* bit lengths for bitalloc_33_codes_d; longest is 14 (bitalloc_maxbits entry: 9) */
+    14, 14, 13, 12, 11, 10,  9,  8,  7,  7,  6,  6,  5,  4,  4,  3,
+     2,  3,  3,  4,  5,  5,  6,  7,  7,  8,  9, 10, 11, 12, 13, 14,
+    14,
+};
+
+static const uint8_t bitalloc_33_bits_e[33] = { /* bit lengths for bitalloc_33_codes_e; longest is 9 (bitalloc_maxbits entry: 9) */
+    9,  9,  8,  8,  8,  7,  7,  7,  7,  6,  6,  5,  5,  5,  4,  3,
+    2,  3,  4,  4,  5,  5,  6,  6,  7,  7,  7,  7,  8,  8,  8,  9,
+    9,
+};
+
+static const uint8_t bitalloc_33_bits_f[33] = { /* bit lengths for bitalloc_33_codes_f; longest is 11 (bitalloc_maxbits entry: 9) */
+    11, 10,  9,  8,  7,  6,  6,  6,  5,  5,  5,  4,  4,  4,  4,  4,
+     4,  4,  4,  4,  4,  4,  5,  5,  5,  6,  6,  6,  7,  8,  9,  9,
+    11,
+};
+
+static const uint8_t bitalloc_33_bits_g[33] = { /* bit lengths for bitalloc_33_codes_g; longest is 10 (bitalloc_maxbits entry: 9) */
+    10,  9,  9,  9,  8,  8,  7,  7,  6,  6,  5,  5,  4,  4,  4,  3,
+     3,  3,  4,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  8,  9,  9,
+    10,
+};
+
+static const uint16_t bitalloc_65_codes_a[65] = { /* code values, 65-entry book, variant a; bit lengths are in bitalloc_65_bits_a */
+    0x9E5C, 0x9E5E, 0x4F2C, 0x2794, 0x13C4, 0x1E44, 0x09E3, 0x0F23,
+    0x04F3, 0x0792, 0x027E, 0x03CE, 0x013D, 0x01E5, 0x009C, 0x00CC,
+    0x0040, 0x0058, 0x0067, 0x001E, 0x0021, 0x002D, 0x003D, 0x0007,
+    0x0011, 0x0014, 0x0017, 0x001A, 0x001C, 0x001F, 0x0001, 0x0004,
+    0x0006, 0x0005, 0x0002, 0x0000, 0x001D, 0x001B, 0x0018, 0x0015,
+    0x0012, 0x000E, 0x0006, 0x0032, 0x0026, 0x001F, 0x0078, 0x0059,
+    0x0041, 0x00CD, 0x009D, 0x01E6, 0x013E, 0x03CF, 0x027F, 0x0793,
+    0x0790, 0x04F0, 0x09E4, 0x1E45, 0x13C5, 0x2795, 0x4F2D, 0x9E5F,
+    0x9E5D,
+};
+
+static const uint16_t bitalloc_65_codes_b[65] = { /* code values, 65-entry book, variant b; bit lengths are in bitalloc_65_bits_b */
+    0x0A8C, 0x0547, 0x01B5, 0x0008, 0x00DB, 0x0152, 0x0005, 0x000B,
+    0x008E, 0x00AE, 0x00E4, 0x0003, 0x0037, 0x0039, 0x0055, 0x006C,
+    0x0073, 0x0003, 0x0015, 0x001D, 0x0028, 0x0030, 0x0037, 0x003E,
+    0x0006, 0x000B, 0x000F, 0x0012, 0x0016, 0x0019, 0x001D, 0x0001,
+    0x0004, 0x0002, 0x001E, 0x001A, 0x0017, 0x0013, 0x0010, 0x000C,
+    0x0007, 0x003F, 0x0038, 0x0031, 0x0029, 0x0022, 0x001A, 0x0014,
+    0x0000, 0x006D, 0x0056, 0x0046, 0x0038, 0x0004, 0x00E5, 0x00AF,
+    0x008F, 0x006C, 0x000A, 0x0153, 0x0150, 0x0009, 0x02A2, 0x01B4,
+    0x0A8D,
+};
+
+static const uint16_t bitalloc_65_codes_c[65] = { /* code values, 65-entry book, variant c; bit lengths are in bitalloc_65_bits_c */
+    0x045C, 0x022F, 0x03F5, 0x01BC, 0x01FB, 0x0059, 0x00D0, 0x00DF,
+    0x000A, 0x002D, 0x002F, 0x0052, 0x0069, 0x0078, 0x007F, 0x000A,
+    0x0010, 0x001C, 0x0023, 0x002A, 0x0035, 0x003A, 0x003D, 0x0000,
+    0x0003, 0x0006, 0x0009, 0x000C, 0x000F, 0x0012, 0x0016, 0x0018,
+    0x001C, 0x0019, 0x0017, 0x0013, 0x0010, 0x000D, 0x000A, 0x0007,
+    0x0004, 0x0001, 0x003E, 0x003B, 0x0036, 0x002B, 0x0028, 0x001D,
+    0x0011, 0x000B, 0x0004, 0x0079, 0x006E, 0x0053, 0x0044, 0x002E,
+    0x000B, 0x00FC, 0x00D1, 0x008A, 0x0058, 0x01BD, 0x0116, 0x03F4,
+    0x045D,
+};
+
+static const uint16_t bitalloc_65_codes_d[65] = { /* code values, 65-entry book, variant d; bit lengths are in bitalloc_65_bits_d */
+    0x70B0, 0x70B2, 0x70B4, 0x2852, 0x385B, 0x142E, 0x1C2E, 0x0A15,
+    0x0E14, 0x0214, 0x0704, 0x0104, 0x010B, 0x0383, 0x0083, 0x0143,
+    0x01C3, 0x0043, 0x00A2, 0x00E2, 0x0022, 0x0052, 0x0072, 0x0012,
+    0x002A, 0x003A, 0x000A, 0x0016, 0x001E, 0x0006, 0x000C, 0x0000,
+    0x0004, 0x0001, 0x000D, 0x0007, 0x001F, 0x0017, 0x000B, 0x003B,
+    0x002B, 0x0013, 0x0073, 0x0053, 0x0023, 0x00E3, 0x00A3, 0x00A0,
+    0x0040, 0x01C0, 0x0084, 0x0384, 0x0284, 0x0105, 0x0705, 0x0215,
+    0x0E15, 0x0A16, 0x1C2F, 0x142F, 0x1428, 0x2853, 0x70B5, 0x70B3,
+    0x70B1,
+};
+
+static const uint16_t bitalloc_65_codes_e[65] = { /* code values, 65-entry book, variant e; bit lengths are in bitalloc_65_bits_e */
+    0x032C, 0x0332, 0x0378, 0x037E, 0x008C, 0x014A, 0x0188, 0x0197,
+    0x019E, 0x01BD, 0x0044, 0x0047, 0x00AA, 0x00C5, 0x00CD, 0x00DC,
+    0x001C, 0x002C, 0x0053, 0x0063, 0x0068, 0x0008, 0x000F, 0x0017,
+    0x002B, 0x0035, 0x0005, 0x0009, 0x0016, 0x001C, 0x0006, 0x000F,
+    0x0004, 0x0000, 0x0007, 0x001D, 0x0017, 0x000A, 0x0006, 0x0036,
+    0x0030, 0x0028, 0x0010, 0x0009, 0x0069, 0x0064, 0x0054, 0x002D,
+    0x001D, 0x00DD, 0x00CE, 0x00CA, 0x00AB, 0x00A4, 0x0045, 0x01BE,
+    0x019F, 0x0198, 0x0189, 0x014B, 0x008D, 0x037F, 0x0379, 0x0333,
+    0x032D,
+};
+
+static const uint16_t bitalloc_65_codes_f[65] = { /* code values, 65-entry book, variant f; bit lengths are in bitalloc_65_bits_f */
+    0x0FE0, 0x0FE2, 0x0FE8, 0x0FEA, 0x0FEC, 0x0FEE, 0x0FF0, 0x0FF2,
+    0x0FF4, 0x2FF2, 0x07F2, 0x07FB, 0x03F6, 0x0BFA, 0x0BFD, 0x01FF,
+    0x05FF, 0x02FC, 0x007C, 0x017C, 0x003C, 0x00BC, 0x001C, 0x005C,
+    0x000C, 0x002C, 0x0004, 0x0014, 0x0000, 0x0008, 0x000E, 0x0002,
+    0x0006, 0x0003, 0x000F, 0x0009, 0x0001, 0x0015, 0x0005, 0x002D,
+    0x000D, 0x005D, 0x001D, 0x00BD, 0x003D, 0x017D, 0x007D, 0x02FD,
+    0x00FC, 0x05FC, 0x01FA, 0x0BFB, 0x03F7, 0x17F8, 0x07F3, 0x2FF3,
+    0x0FF5, 0x0FF3, 0x0FF1, 0x0FEF, 0x0FED, 0x0FEB, 0x0FE9, 0x0FE3,
+    0x0FE1,
+};
+
+static const uint16_t bitalloc_65_codes_g[65] = { /* code values, 65-entry book, variant g; bit lengths are in bitalloc_65_bits_g */
+    0x010C, 0x038A, 0x0608, 0x0786, 0x0084, 0x0087, 0x0302, 0x0305,
+    0x0040, 0x00E0, 0x00E3, 0x0183, 0x001E, 0x005E, 0x009E, 0x00DE,
+    0x00F1, 0x0011, 0x0039, 0x0061, 0x0079, 0x0009, 0x001D, 0x0031,
+    0x003D, 0x0005, 0x000F, 0x0019, 0x001F, 0x0003, 0x0006, 0x000A,
+    0x000E, 0x000B, 0x0008, 0x0004, 0x0000, 0x001A, 0x0012, 0x000A,
+    0x0002, 0x0036, 0x0026, 0x0016, 0x0006, 0x006E, 0x004E, 0x002E,
+    0x000E, 0x00DF, 0x009F, 0x005F, 0x001F, 0x01E0, 0x0180, 0x00E1,
+    0x0041, 0x03C2, 0x0303, 0x01C4, 0x0085, 0x0787, 0x0609, 0x038B,
+    0x010D,
+};
+
+static const uint8_t bitalloc_65_bits_a[65] = { /* bit lengths for bitalloc_65_codes_a; longest is 16 (bitalloc_maxbits entry: 9) */
+    16, 16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10,  9,  9,  8,  8,
+     7,  7,  7,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,  5,  4,  4,
+     4,  4,  4,  4,  5,  5,  5,  5,  5,  5,  5,  6,  6,  6,  7,  7,
+     7,  8,  8,  9,  9, 10, 10, 11, 11, 11, 12, 13, 13, 14, 15, 16,
+    16,
+};
+
+static const uint8_t bitalloc_65_bits_b[65] = { /* bit lengths for bitalloc_65_codes_b; longest is 12 (bitalloc_maxbits entry: 9) */
+    12, 11, 10,  9,  9,  9,  8,  8,  8,  8,  8,  7,  7,  7,  7,  7,
+     7,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,  5,  4,
+     4,  4,  5,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,
+     6,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  9,  9,  9, 10, 10,
+    12,
+};
+
+static const uint8_t bitalloc_65_bits_c[65] = { /* bit lengths for bitalloc_65_codes_c; longest is 11 (bitalloc_maxbits entry: 9) */
+    11, 10, 10,  9,  9,  8,  8,  8,  7,  7,  7,  7,  7,  7,  7,  6,
+     6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,
+     6,  6,  6,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  9,  9, 10,
+    11,
+};
+
+static const uint8_t bitalloc_65_bits_d[65] = { /* bit lengths for bitalloc_65_codes_d; longest is 15 (bitalloc_maxbits entry: 9) */
+    15, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10,  9,  9,
+     9,  8,  8,  8,  7,  7,  7,  6,  6,  6,  5,  5,  5,  4,  4,  3,
+     3,  3,  4,  4,  5,  5,  5,  6,  6,  6,  7,  7,  7,  8,  8,  8,
+     8,  9,  9, 10, 10, 10, 11, 11, 12, 12, 13, 13, 13, 14, 15, 15,
+    15,
+};
+
+static const uint8_t bitalloc_65_bits_e[65] = { /* bit lengths for bitalloc_65_codes_e; longest is 10 (bitalloc_maxbits entry: 9) */
+    10, 10, 10, 10,  9,  9,  9,  9,  9,  9,  8,  8,  8,  8,  8,  8,
+     7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  5,  5,  5,  5,  4,  4,
+     3,  3,  4,  5,  5,  5,  5,  6,  6,  6,  6,  6,  7,  7,  7,  7,
+     7,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9, 10, 10, 10,
+    10,
+};
+
+static const uint8_t bitalloc_65_bits_f[65] = { /* bit lengths for bitalloc_65_codes_f; longest is 14 (bitalloc_maxbits entry: 9) */
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13, 12, 12, 12, 11,
+    11, 10,  9,  9,  8,  8,  7,  7,  6,  6,  5,  5,  4,  4,  4,  3,
+     3,  3,  4,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10,
+    10, 11, 11, 12, 12, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14,
+};
+
+static const uint8_t bitalloc_65_bits_g[65] = { /* bit lengths for bitalloc_65_codes_g; longest is 11 (bitalloc_maxbits entry: 9) */
+    11, 11, 11, 11, 10, 10, 10, 10,  9,  9,  9,  9,  8,  8,  8,  8,
+     8,  7,  7,  7,  7,  6,  6,  6,  6,  5,  5,  5,  5,  4,  4,  4,
+     4,  4,  4,  4,  4,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,
+     7,  8,  8,  8,  8,  9,  9,  9,  9, 10, 10, 10, 10, 11, 11, 11,
+    11,
+};
+
+static const uint16_t bitalloc_129_codes_a[129] = { /* code values, 129-entry book, variant a; bit lengths are in bitalloc_129_bits_a */
+    0x0660, 0x0666, 0x06EC, 0x0722, 0x0760, 0x076E, 0x004C, 0x004E,
+    0x00F4, 0x010A, 0x0148, 0x0156, 0x01D4, 0x01F2, 0x0331, 0x0370,
+    0x0377, 0x0396, 0x03B1, 0x0024, 0x0064, 0x007B, 0x008A, 0x00A5,
+    0x00D4, 0x00EB, 0x00FA, 0x019A, 0x01B9, 0x01C9, 0x01D9, 0x0010,
+    0x0030, 0x0033, 0x0043, 0x0053, 0x006B, 0x007A, 0x00CA, 0x00D2,
+    0x00DE, 0x00E6, 0x00F6, 0x000E, 0x001F, 0x0023, 0x002B, 0x003B,
+    0x003F, 0x0067, 0x0070, 0x0077, 0x0005, 0x000D, 0x0012, 0x001B,
+    0x002C, 0x0035, 0x003A, 0x0004, 0x000B, 0x0017, 0x001F, 0x0009,
+    0x0008, 0x000A, 0x0000, 0x0018, 0x000C, 0x0005, 0x003C, 0x0036,
+    0x002D, 0x001C, 0x0013, 0x000E, 0x0006, 0x007A, 0x0071, 0x0068,
+    0x0064, 0x003C, 0x0034, 0x0028, 0x0020, 0x000F, 0x00F7, 0x00E7,
+    0x00DF, 0x00D3, 0x00CB, 0x007B, 0x0074, 0x0054, 0x0044, 0x003C,
+    0x0031, 0x0011, 0x01DA, 0x01CA, 0x01BA, 0x019B, 0x00FB, 0x00F8,
+    0x00D5, 0x00AA, 0x008B, 0x0084, 0x0065, 0x0025, 0x03B6, 0x0397,
+    0x0390, 0x0371, 0x0332, 0x01F3, 0x01D5, 0x0157, 0x0149, 0x010B,
+    0x00F5, 0x004F, 0x004D, 0x076F, 0x0761, 0x0723, 0x06ED, 0x0667,
+    0x0661,
+};
+
+static const uint16_t bitalloc_129_codes_b[129] = { /* code values, 129-entry book, variant b; bit lengths are in bitalloc_129_bits_b */
+    0x29DC, 0x14EF, 0x0455, 0x0E9C, 0x022B, 0x0489, 0x0740, 0x074F,
+    0x0172, 0x0245, 0x0247, 0x030A, 0x03A1, 0x001C, 0x008B, 0x00D6,
+    0x010C, 0x0148, 0x014F, 0x0186, 0x01D1, 0x0008, 0x000F, 0x0046,
+    0x005D, 0x0078, 0x0087, 0x0096, 0x00A5, 0x00BC, 0x00D8, 0x00DE,
+    0x00F6, 0x0005, 0x0014, 0x0024, 0x002F, 0x003A, 0x003D, 0x0049,
+    0x0050, 0x0058, 0x005F, 0x0066, 0x006D, 0x0075, 0x007C, 0x0004,
+    0x000B, 0x0013, 0x0018, 0x001B, 0x001F, 0x0022, 0x0026, 0x002A,
+    0x002D, 0x0031, 0x0034, 0x0038, 0x003B, 0x003F, 0x0003, 0x0006,
+    0x000A, 0x0007, 0x0004, 0x0000, 0x003C, 0x0039, 0x0035, 0x0032,
+    0x002E, 0x002B, 0x0027, 0x0023, 0x0020, 0x001C, 0x0019, 0x0016,
+    0x0010, 0x0005, 0x007D, 0x007A, 0x006E, 0x0067, 0x0060, 0x0059,
+    0x0051, 0x004A, 0x0042, 0x003B, 0x0034, 0x0025, 0x0015, 0x0006,
+    0x00F7, 0x00DF, 0x00D9, 0x00BD, 0x00A6, 0x0097, 0x0090, 0x0079,
+    0x006A, 0x0047, 0x0044, 0x0009, 0x01D2, 0x0187, 0x0184, 0x0149,
+    0x010D, 0x00D7, 0x00B8, 0x001D, 0x03A6, 0x030B, 0x029C, 0x0246,
+    0x0173, 0x0114, 0x0741, 0x053A, 0x0488, 0x0E9D, 0x0A76, 0x0454,
+    0x29DD,
+};
+
+static const uint16_t bitalloc_129_codes_c[129] = { /* code values, 129-entry book, variant c; bit lengths are in bitalloc_129_bits_c */
+    0x0E5C, 0x072F, 0x001D, 0x0724, 0x000F, 0x010D, 0x0324, 0x0393,
+    0x03E9, 0x0080, 0x0087, 0x00FA, 0x0164, 0x0193, 0x01DE, 0x01F5,
+    0x0010, 0x002A, 0x0041, 0x0064, 0x0073, 0x008E, 0x00A4, 0x00B3,
+    0x00D6, 0x00E5, 0x00F4, 0x00FB, 0x0002, 0x0009, 0x0013, 0x001E,
+    0x0026, 0x002C, 0x0033, 0x003F, 0x0041, 0x004C, 0x0053, 0x005E,
+    0x0065, 0x0070, 0x0073, 0x0078, 0x007B, 0x007E, 0x0002, 0x0005,
+    0x0007, 0x000B, 0x000D, 0x0011, 0x0014, 0x0017, 0x001A, 0x001D,
+    0x0021, 0x0024, 0x0027, 0x002A, 0x002D, 0x0030, 0x0033, 0x0036,
+    0x003A, 0x0037, 0x0034, 0x0031, 0x002E, 0x002B, 0x0028, 0x0025,
+    0x0022, 0x001E, 0x001B, 0x0018, 0x0015, 0x0012, 0x000E, 0x000C,
+    0x0008, 0x0006, 0x0003, 0x007F, 0x007C, 0x0079, 0x0076, 0x0071,
+    0x006A, 0x005F, 0x0058, 0x004D, 0x0046, 0x0040, 0x0038, 0x002D,
+    0x0027, 0x001F, 0x0014, 0x0012, 0x0003, 0x0000, 0x00F5, 0x00EE,
+    0x00D7, 0x00C8, 0x00A5, 0x008F, 0x007C, 0x0065, 0x0042, 0x002B,
+    0x0011, 0x0002, 0x01DF, 0x01C8, 0x0165, 0x00FB, 0x00E4, 0x0081,
+    0x0006, 0x03E8, 0x0325, 0x01CA, 0x010C, 0x0725, 0x0396, 0x001C,
+    0x0E5D,
+};
+
+static const uint16_t bitalloc_129_codes_d[129] = { /* code values, 129-entry book, variant d; bit lengths are in bitalloc_129_bits_d */
+    0xA598, 0xA59A, 0xA59C, 0xA59E, 0xC598, 0xE586, 0x3ACC, 0x52CA,
+    0x62CD, 0x0D48, 0x1D67, 0x2978, 0x3167, 0x3966, 0x06A5, 0x0EBC,
+    0x14BD, 0x1CB1, 0x0350, 0x0353, 0x075F, 0x0A5F, 0x0C5E, 0x0E5E,
+    0x01AE, 0x03AD, 0x052D, 0x062D, 0x072D, 0x00D5, 0x01D4, 0x0294,
+    0x0314, 0x0394, 0x0014, 0x0094, 0x0114, 0x0174, 0x01B4, 0x01F4,
+    0x000B, 0x004B, 0x008B, 0x00BB, 0x00DB, 0x00FB, 0x001B, 0x003B,
+    0x0053, 0x0063, 0x0073, 0x0003, 0x0013, 0x0023, 0x002F, 0x0037,
+    0x003F, 0x0007, 0x000F, 0x0015, 0x0019, 0x001D, 0x0001, 0x0005,
+    0x0009, 0x0006, 0x0002, 0x001E, 0x001A, 0x0016, 0x0010, 0x0008,
+    0x0000, 0x0038, 0x0030, 0x0028, 0x001C, 0x000C, 0x007C, 0x006C,
+    0x005C, 0x0044, 0x0024, 0x0004, 0x00E4, 0x00C4, 0x00A4, 0x0074,
+    0x0034, 0x01F5, 0x01B5, 0x0175, 0x0115, 0x0095, 0x0015, 0x0395,
+    0x0315, 0x0295, 0x01D5, 0x00D6, 0x072E, 0x062E, 0x052E, 0x03AE,
+    0x01AF, 0x0E5F, 0x0C5F, 0x0C58, 0x0A58, 0x0758, 0x0351, 0x1CB2,
+    0x18B2, 0x0EBD, 0x0EB2, 0x3967, 0x3960, 0x2979, 0x2964, 0x0D49,
+    0x72C2, 0x52CB, 0x3ACD, 0xE587, 0xC599, 0xA59F, 0xA59D, 0xA59B,
+    0xA599,
+};
+
+static const uint16_t bitalloc_129_codes_e[129] = { /* code values, 129-entry book, variant e; bit lengths are in bitalloc_129_bits_e */
+    0xA13C, 0xC720, 0xA13F, 0xA13E, 0xA13D, 0xE722, 0x5090, 0x6393,
+    0x7392, 0x2849, 0x31CE, 0x39CE, 0x1425, 0x18E5, 0x1CE5, 0x0844,
+    0x0A1C, 0x0C7C, 0x036C, 0x0423, 0x050F, 0x063F, 0x01B7, 0x0216,
+    0x0285, 0x031D, 0x039D, 0x0109, 0x0140, 0x0180, 0x01C8, 0x01CF,
+    0x007A, 0x008A, 0x00A2, 0x00C1, 0x00E5, 0x0014, 0x0037, 0x0043,
+    0x004E, 0x0056, 0x0061, 0x006C, 0x007C, 0x000B, 0x001C, 0x001F,
+    0x0023, 0x0025, 0x0029, 0x002C, 0x002E, 0x0032, 0x0034, 0x0037,
+    0x003A, 0x003C, 0x003F, 0x0001, 0x0003, 0x0006, 0x0008, 0x000A,
+    0x000C, 0x000B, 0x0009, 0x0007, 0x0004, 0x0002, 0x0000, 0x003D,
+    0x003B, 0x0038, 0x0035, 0x0033, 0x002F, 0x002D, 0x002A, 0x0026,
+    0x0024, 0x0020, 0x001D, 0x001A, 0x007D, 0x006D, 0x0062, 0x0057,
+    0x004F, 0x0044, 0x003C, 0x0015, 0x00E6, 0x00C6, 0x00A3, 0x008B,
+    0x007B, 0x006C, 0x01C9, 0x0181, 0x0141, 0x010A, 0x00DA, 0x031E,
+    0x0286, 0x0217, 0x0210, 0x0738, 0x0638, 0x0508, 0x036D, 0x0C7D,
+    0x0A1D, 0x0845, 0x1CE6, 0x18E6, 0x1426, 0x39CF, 0x31CF, 0x284E,
+    0x7393, 0x7390, 0x5091, 0xE723, 0xC724, 0xC725, 0xC722, 0xC723,
+    0xC721,
+};
+
+static const uint16_t bitalloc_129_codes_f[129] = { /* code values, 129-entry book, variant f; bit lengths are in bitalloc_129_bits_f */
+    0x762C, 0x3B17, 0x1555, 0x0608, 0x0AAB, 0x0FF2, 0x0305, 0x0307,
+    0x0763, 0x0046, 0x010C, 0x01BC, 0x02AB, 0x03B6, 0x03FD, 0x0080,
+    0x0087, 0x00DF, 0x0156, 0x01D9, 0x01F8, 0x01FF, 0x002A, 0x0041,
+    0x0061, 0x0094, 0x00D4, 0x00EA, 0x00F2, 0x00FD, 0x0009, 0x000B,
+    0x001A, 0x0026, 0x0031, 0x0040, 0x004B, 0x006B, 0x0073, 0x0077,
+    0x007A, 0x007C, 0x0000, 0x0002, 0x0006, 0x0008, 0x000B, 0x000E,
+    0x0011, 0x0014, 0x0016, 0x0019, 0x001C, 0x001E, 0x0021, 0x0023,
+    0x0026, 0x0028, 0x002B, 0x002D, 0x002F, 0x0031, 0x0033, 0x0036,
+    0x0038, 0x0037, 0x0034, 0x0032, 0x0030, 0x002E, 0x002C, 0x0029,
+    0x0027, 0x0024, 0x0022, 0x001F, 0x001D, 0x001A, 0x0017, 0x0015,
+    0x0012, 0x000F, 0x000C, 0x0009, 0x0007, 0x0003, 0x0001, 0x007D,
+    0x007B, 0x0078, 0x0074, 0x0072, 0x0054, 0x0041, 0x0036, 0x0027,
+    0x001B, 0x0014, 0x000A, 0x00FE, 0x00F3, 0x00EB, 0x00D5, 0x0095,
+    0x006E, 0x0042, 0x002B, 0x0010, 0x01F9, 0x01DA, 0x0157, 0x0154,
+    0x00C0, 0x0081, 0x0022, 0x03B7, 0x03B0, 0x01BD, 0x010D, 0x0047,
+    0x07F8, 0x0554, 0x0306, 0x0FF3, 0x0EC4, 0x0609, 0x1D8A, 0x1554,
+    0x762D,
+};
+
+static const uint16_t bitalloc_129_codes_g[129] = { /* code values, 129-entry book, variant g; bit lengths are in bitalloc_129_bits_g */
+    0x1E20, 0x1E5E, 0x031C, 0x051A, 0x0718, 0x0916, 0x0B14, 0x0D12,
+    0x0F11, 0x0090, 0x018F, 0x028E, 0x038D, 0x048C, 0x058B, 0x068A,
+    0x0789, 0x0049, 0x00C8, 0x0148, 0x01C7, 0x0247, 0x02C6, 0x0346,
+    0x03C5, 0x0025, 0x0065, 0x00A5, 0x00E4, 0x0124, 0x0164, 0x01A4,
+    0x01E3, 0x0013, 0x0033, 0x0053, 0x0073, 0x0093, 0x00B3, 0x00D3,
+    0x00F3, 0x000A, 0x001A, 0x002A, 0x003A, 0x004A, 0x005A, 0x006A,
+    0x007A, 0x0006, 0x000E, 0x0016, 0x001E, 0x0026, 0x002E, 0x0036,
+    0x003E, 0x0004, 0x0008, 0x000C, 0x0010, 0x0014, 0x0018, 0x001C,
+    0x0000, 0x001D, 0x0019, 0x0015, 0x0011, 0x000D, 0x0009, 0x0005,
+    0x003F, 0x0037, 0x002F, 0x0027, 0x001F, 0x0017, 0x000F, 0x0007,
+    0x007B, 0x006B, 0x005B, 0x004B, 0x003B, 0x002B, 0x001B, 0x000B,
+    0x0008, 0x00F0, 0x00D0, 0x00B0, 0x0090, 0x0070, 0x0050, 0x0030,
+    0x01E4, 0x01A5, 0x0165, 0x0125, 0x00E5, 0x00E2, 0x00A2, 0x0062,
+    0x03CA, 0x0347, 0x02C7, 0x02C4, 0x0244, 0x0149, 0x00C9, 0x00C6,
+    0x0796, 0x068B, 0x0688, 0x048D, 0x048A, 0x028F, 0x028C, 0x0091,
+    0x0F2E, 0x0D13, 0x0B15, 0x0917, 0x0719, 0x051B, 0x031D, 0x1E5F,
+    0x1E21,
+};
+
+static const uint8_t bitalloc_129_bits_a[129] = { /* bit lengths for bitalloc_129_codes_a; longest is 11 (bitalloc_maxbits entry: 9) */
+    11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  8,
+     8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,  7,  7,  7,
+     7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  4,
+     4,  4,  4,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  7,  7,  7,
+     7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+     8,  8,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11,
+    11,
+};
+
+static const uint8_t bitalloc_129_bits_b[129] = { /* bit lengths for bitalloc_129_codes_b; longest is 14 (bitalloc_maxbits entry: 9) */
+    14, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10,  9,  9,  9,
+     9,  9,  9,  9,  9,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+     8,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  6,
+     6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,
+     5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
+     6,  6,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
+     8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,
+     9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 11, 11, 11, 12, 12, 12,
+    14,
+};
+
+static const uint8_t bitalloc_129_bits_c[129] = { /* bit lengths for bitalloc_129_codes_c; longest is 13 (bitalloc_maxbits entry: 9) */
+    13, 12, 11, 11, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,  9,  9,
+     8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,  7,  7,
+     7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  6,  6,
+     6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
+     6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
+     6,  6,  6,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
+     7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+     8,  8,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 11, 11, 11,
+    13,
+};
+
+static const uint8_t bitalloc_129_bits_d[129] = { /* bit lengths for bitalloc_129_codes_d; longest is 16 (bitalloc_maxbits entry: 9) */
+    16, 16, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13,
+    13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10,
+    10, 10,  9,  9,  9,  9,  9,  9,  8,  8,  8,  8,  8,  8,  7,  7,
+     7,  7,  7,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  4,  4,
+     4,  4,  4,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  7,  7,
+     7,  7,  7,  7,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9, 10,
+    10, 10, 10, 10, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 13,
+    13, 13, 13, 14, 14, 14, 14, 14, 15, 15, 15, 16, 16, 16, 16, 16,
+    16,
+};
+
+static const uint8_t bitalloc_129_bits_e[129] = { /* bit lengths for bitalloc_129_codes_e; longest is 16 (bitalloc_maxbits entry: 9) */
+    16, 16, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 13, 13, 13, 12,
+    12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,
+     8,  8,  8,  8,  8,  7,  7,  7,  7,  7,  7,  7,  7,  6,  6,  6,
+     6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,
+     5,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,  6,
+     6,  6,  6,  6,  7,  7,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,
+     8,  8,  9,  9,  9,  9,  9, 10, 10, 10, 10, 11, 11, 11, 11, 12,
+    12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 15, 16, 16, 16, 16, 16,
+    16,
+};
+
+static const uint8_t bitalloc_129_bits_f[129] = { /* bit lengths for bitalloc_129_codes_f; longest is 15 (bitalloc_maxbits entry: 9) */
+    15, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10,  9,
+     9,  9,  9,  9,  9,  9,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,
+     7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,
+     6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
+     6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
+     6,  6,  6,  6,  6,  6,  6,  7,  7,  7,  7,  7,  7,  7,  7,  7,
+     7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,
+     9,  9,  9, 10, 10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13,
+    15,
+};
+
+static const uint8_t bitalloc_129_bits_g[129] = { /* bit lengths for bitalloc_129_codes_g; longest is 13 (bitalloc_maxbits entry: 9) */
+    13, 13, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11,
+    11, 10, 10, 10, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,  9,  9,
+     9,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,  7,  7,  7,  7,  7,
+     7,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,  5,
+     4,  5,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,
+     7,  7,  7,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,
+     9,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10,
+    11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 13,
+    13,
+};
+
+static const uint8_t bitalloc_sizes[DCA_CODE_BOOKS] = { /* entry count of each code book's tables (the N in bitalloc_N_*) */
+    3, 5, 7, 9, 13, 17, 25, 33, 65, 129
+};
+
+static const int8_t bitalloc_offsets[DCA_CODE_BOOKS] = { /* each value equals -(bitalloc_sizes[i] - 1) / 2 */
+    -1, -2, -3, -4, -6, -8, -12, -16, -32, -64
+}; /* NOTE(review): presumably added to a table index to centre it on zero -- confirm at the use site */
+
+static const uint8_t bitalloc_maxbits[DCA_CODE_BOOKS][7] = { /* row = code book, column = table variant (a..g) */
+    { 2 },                   /* 3-entry book: one table; unlisted elements are implicitly 0 */
+    { 4, 3, 3 },             /* 5-entry book: variants a..c */
+    { 5, 5, 4 },             /* 7-entry book: variants a..c */
+    { 6, 5, 6 },             /* 9-entry book: variants a..c */
+    { 7, 6, 5 },             /* 13-entry book: variants a..c */
+    { 9, 8, 7, 9, 8, 8, 8 }, /* 17-entry book: variants a..g */
+    { 9, 9, 8, 9, 8, 9, 9 }, /* 25-entry book */
+    { 9, 9, 9, 9, 9, 9, 9 }, /* 33-entry book */
+    { 9, 9, 9, 9, 9, 9, 9 }, /* 65-entry book */
+    { 9, 9, 9, 9, 9, 9, 9 }  /* 129-entry book */
+}; /* for the 25..129-entry bits tables each value is min(longest bit length, 9); NOTE(review): presumably a lookup-width cap -- confirm */
+
+static const uint16_t *const bitalloc_codes[DCA_CODE_BOOKS][8] = { /* row = code book (same order as bitalloc_sizes) */
+    { bitalloc_3_codes,     NULL }, /* each row lists the book's code tables, then NULL */
+    { bitalloc_5_codes_a,   bitalloc_5_codes_b,   bitalloc_5_codes_c,   NULL },
+    { bitalloc_7_codes_a,   bitalloc_7_codes_b,   bitalloc_7_codes_c,   NULL },
+    { bitalloc_9_codes_a,   bitalloc_9_codes_b,   bitalloc_9_codes_c,   NULL },
+    { bitalloc_13_codes_a,  bitalloc_13_codes_b,  bitalloc_13_codes_c,  NULL },
+    { bitalloc_17_codes_a,  bitalloc_17_codes_b,  bitalloc_17_codes_c,  bitalloc_17_codes_d,
+      bitalloc_17_codes_e,  bitalloc_17_codes_f,  bitalloc_17_codes_g,  NULL },
+    { bitalloc_25_codes_a,  bitalloc_25_codes_b,  bitalloc_25_codes_c,  bitalloc_25_codes_d,
+      bitalloc_25_codes_e,  bitalloc_25_codes_f,  bitalloc_25_codes_g,  NULL },
+    { bitalloc_33_codes_a,  bitalloc_33_codes_b,  bitalloc_33_codes_c,  bitalloc_33_codes_d,
+      bitalloc_33_codes_e,  bitalloc_33_codes_f,  bitalloc_33_codes_g,  NULL },
+    { bitalloc_65_codes_a,  bitalloc_65_codes_b,  bitalloc_65_codes_c,  bitalloc_65_codes_d,
+      bitalloc_65_codes_e,  bitalloc_65_codes_f,  bitalloc_65_codes_g,  NULL },
+    { bitalloc_129_codes_a, bitalloc_129_codes_b, bitalloc_129_codes_c, bitalloc_129_codes_d,
+      bitalloc_129_codes_e, bitalloc_129_codes_f, bitalloc_129_codes_g, NULL }
+}; /* layout mirrors bitalloc_bits: slot [i][j] here pairs with slot [i][j] there */
+
+static const uint8_t *const bitalloc_bits[DCA_CODE_BOOKS][8] = { /* row = code book (same order as bitalloc_sizes) */
+    { bitalloc_3_bits,     NULL }, /* each row lists the book's bit-length tables, then NULL */
+    { bitalloc_5_bits_a,   bitalloc_5_bits_b,   bitalloc_5_bits_c,   NULL },
+    { bitalloc_7_bits_a,   bitalloc_7_bits_b,   bitalloc_7_bits_c,   NULL },
+    { bitalloc_9_bits_a,   bitalloc_9_bits_b,   bitalloc_9_bits_c,   NULL },
+    { bitalloc_13_bits_a,  bitalloc_13_bits_b,  bitalloc_13_bits_c,  NULL },
+    { bitalloc_17_bits_a,  bitalloc_17_bits_b,  bitalloc_17_bits_c,  bitalloc_17_bits_d,
+      bitalloc_17_bits_e,  bitalloc_17_bits_f,  bitalloc_17_bits_g,  NULL },
+    { bitalloc_25_bits_a,  bitalloc_25_bits_b,  bitalloc_25_bits_c,  bitalloc_25_bits_d,
+      bitalloc_25_bits_e,  bitalloc_25_bits_f,  bitalloc_25_bits_g,  NULL },
+    { bitalloc_33_bits_a,  bitalloc_33_bits_b,  bitalloc_33_bits_c,  bitalloc_33_bits_d,
+      bitalloc_33_bits_e,  bitalloc_33_bits_f,  bitalloc_33_bits_g,  NULL },
+    { bitalloc_65_bits_a,  bitalloc_65_bits_b,  bitalloc_65_bits_c,  bitalloc_65_bits_d,
+      bitalloc_65_bits_e,  bitalloc_65_bits_f,  bitalloc_65_bits_g,  NULL },
+    { bitalloc_129_bits_a, bitalloc_129_bits_b, bitalloc_129_bits_c, bitalloc_129_bits_d,
+      bitalloc_129_bits_e, bitalloc_129_bits_f, bitalloc_129_bits_g, NULL }
+}; /* layout mirrors bitalloc_codes: slot [i][j] here pairs with slot [i][j] there */
+
+static const uint16_t tnl_grp_0_codes[37] = { /* 37 code values; tnl_grp_0_bitvals holds 2 * 37 bytes, presumably the matching lengths/symbols */
+    0x0000, 0x0003, 0x0004, 0x0007, 0x0001, 0x0009, 0x000a, 0x000d,
+    0x000e, 0x0006, 0x0012, 0x0005, 0x0015, 0x0016, 0x0022, 0x0025,
+    0x0035, 0x0076, 0x0002, 0x0042, 0x00b6, 0x0036, 0x00c2, 0x0136,
+    0x0182, 0x01c2, 0x03c2, 0x0482, 0x0682, 0x0082, 0x0882, 0x0a82,
+    0x0282, 0x2282, 0x3282, 0x1282, 0x5282,
+};
+
+static const uint16_t tnl_grp_1_codes[34] = { /* 34 code values; tnl_grp_1_bitvals holds 2 * 34 bytes, presumably the matching lengths/symbols */
+    0x0001, 0x0003, 0x0006, 0x0000, 0x0002, 0x0004, 0x0005, 0x0007,
+    0x0008, 0x000f, 0x001a, 0x001c, 0x001d, 0x000a, 0x002c, 0x002d,
+    0x000d, 0x002a, 0x004c, 0x004d, 0x006a, 0x008c, 0x00cd, 0x00ea,
+    0x000c, 0x010c, 0x01ea, 0x020c, 0x030c, 0x07ea, 0x0bea, 0x03ea,
+    0x13ea, 0x33ea,
+};
+
+static const uint16_t tnl_grp_2_codes[31] = { /* 31 code values; tnl_grp_2_bitvals holds 2 * 31 bytes, presumably the matching lengths/symbols */
+    0x0001, 0x0003, 0x0006, 0x0007, 0x0004, 0x0008, 0x000c, 0x0010,
+    0x0012, 0x001a, 0x0022, 0x0000, 0x000a, 0x0020, 0x0040, 0x004a,
+    0x006a, 0x0002, 0x002a, 0x0042, 0x0082, 0x00aa, 0x00e0, 0x0060,
+    0x00c2, 0x01c2, 0x0160, 0x0360, 0x0f60, 0x0760, 0x1760,
+};
+
+static const uint16_t tnl_grp_3_codes[28] = { /* 28 code values; tnl_grp_3_bitvals holds 2 * 28 bytes, presumably the matching lengths/symbols */
+    0x0001, 0x0006, 0x0008, 0x0014, 0x001c, 0x0000, 0x0002, 0x0004,
+    0x000a, 0x000c, 0x0010, 0x0012, 0x001a, 0x0020, 0x002a, 0x002c,
+    0x0032, 0x003a, 0x0022, 0x0030, 0x0062, 0x0064, 0x0070, 0x0024,
+    0x00a4, 0x01a4, 0x03a4, 0x07a4,
+};
+
+static const uint16_t tnl_grp_4_codes[23] = { /* 23 code values; tnl_grp_4_bitvals holds 2 * 23 bytes, presumably the matching lengths/symbols */
+    0x0001, 0x0000, 0x000a, 0x0006, 0x0012, 0x001e, 0x0022, 0x002e,
+    0x0036, 0x003e, 0x0002, 0x0016, 0x0032, 0x004e, 0x0056, 0x000e,
+    0x0042, 0x0072, 0x00c2, 0x00f2, 0x008e, 0x018e, 0x038e,
+};
+
+static const uint16_t tnl_scf_codes[20] = { /* 20 code values; tnl_scf_bitvals holds 2 * 20 bytes, presumably the matching lengths/symbols */
+    0x0000, 0x0001, 0x0002, 0x0005, 0x0006, 0x0007, 0x000b, 0x000c,
+    0x0013, 0x0014, 0x0003, 0x0004, 0x0023, 0x0064, 0x00a4, 0x0024,
+    0x0124, 0x0324, 0x0724, 0x0f24,
+};
+
+static const uint16_t damp_codes[7] = { /* 7 code values; damp_bitvals holds 2 * 7 bytes, presumably the matching lengths/symbols */
+    0x0001, 0x0000, 0x0002, 0x0006, 0x000e, 0x001e, 0x003e,
+};
+
+static const uint16_t dph_codes[9] = { /* 9 code values; dph_bitvals holds 2 * 9 bytes, presumably the matching lengths/symbols */
+    0x0000, 0x0002, 0x0003, 0x0001, 0x0009, 0x000d, 0x0005, 0x0015,
+    0x0035,
+};
+
+static const uint16_t fst_rsd_amp_codes[24] = { /* 24 code values; fst_rsd_amp_bitvals holds 2 * 24 bytes, presumably the matching lengths/symbols */
+    0x0003, 0x0005, 0x0006, 0x0007, 0x0000, 0x0001, 0x0002, 0x0008,
+    0x0009, 0x000a, 0x0014, 0x0004, 0x001a, 0x001c, 0x0024, 0x002c,
+    0x003a, 0x000c, 0x003c, 0x004c, 0x00fc, 0x007c, 0x017c, 0x037c,
+};
+
+static const uint16_t rsd_apprx_codes[6] = { /* 6 code values; rsd_apprx_bitvals holds 2 * 6 bytes, presumably the matching lengths/symbols */
+    0x0000, 0x0001, 0x0003, 0x0007, 0x000f, 0x001f,
+};
+
+static const uint16_t rsd_amp_codes[33] = { /* 33 code values; rsd_amp_bitvals is declared with 2 * 33 bytes, presumably the matching lengths/symbols */
+    0x0001, 0x0000, 0x0002, 0x0003, 0x0004, 0x000e, 0x000f, 0x0016,
+    0x0007, 0x0027, 0x0037, 0x0026, 0x0066, 0x0006, 0x0017, 0x0046,
+    0x0097, 0x00d7, 0x0086, 0x00c6, 0x01c6, 0x0157, 0x0186, 0x0257,
+    0x0357, 0x0057, 0x0786, 0x0386, 0x0b86, 0x0457, 0x0c57, 0x1457,
+    0x1c57,
+};
+
+static const uint16_t avg_g3_codes[18] = { /* 18 code values; NOTE(review): presumably paired with an avg_g3_bitvals table like its siblings -- confirm */
+    0x0001, 0x0002, 0x0003, 0x0000, 0x000c, 0x0014, 0x0018, 0x0004,
+    0x0008, 0x0028, 0x0068, 0x0024, 0x00a4, 0x00e4, 0x0164, 0x0064,
+    0x0264, 0x0664,
+};
+
+static const uint16_t st_grid_codes[22] = { /* 22 code values; NOTE(review): presumably paired with an st_grid_bitvals table like its siblings -- confirm */
+    0x0001, 0x0002, 0x0000, 0x0004, 0x0008, 0x001c, 0x004c, 0x006c,
+    0x000c, 0x002c, 0x008c, 0x00ac, 0x012c, 0x018c, 0x01ac, 0x038c,
+    0x03ac, 0x032c, 0x072c, 0x0f2c, 0x172c, 0x1f2c,
+};
+
+static const uint16_t grid_2_codes[20] = { /* 20 code values; NOTE(review): presumably paired with a grid_2_bitvals table like its siblings -- confirm */
+    0x0000, 0x0002, 0x0003, 0x0001, 0x0005, 0x000d, 0x003d, 0x005d,
+    0x009d, 0x011d, 0x001d, 0x061d, 0x041d, 0x0c1d, 0x0a1d, 0x121d,
+    0x021d, 0x1a1d, 0x221d, 0x3a1d,
+};
+
+static const uint16_t grid_3_codes[13] = { /* 13 code values; NOTE(review): presumably paired with a grid_3_bitvals table like its siblings -- confirm */
+    0x0001, 0x0002, 0x0000, 0x0004, 0x000c, 0x001c, 0x007c, 0x003c,
+    0x01bc, 0x00bc, 0x06bc, 0x02bc, 0x0abc,
+};
+
+static const uint16_t rsd_codes[9] = { /* 9 code values; NOTE(review): presumably paired with an rsd_bitvals table like its siblings -- confirm */
+    0x0001, 0x0003, 0x0000, 0x0002, 0x0006, 0x0004, 0x000c, 0x001c,
+    0x003c,
+};
+
+static const uint8_t tnl_grp_0_bitvals[74] = { /* 74 = 2 * 37 bytes (tnl_grp_0_codes has 37 entries); even positions are non-decreasing */
+     3,  5,  3,  9,  3,  4,  3,  6,  4, 10,  4, 13,  4,  7,  4, 11,
+     4,  8,  5, 12,  5, 14,  6, 15,  6, 18,  6,  1,  6, 17,  6, 16,
+     6, 21,  7, 20,  8, 19,  8, 22,  8, 25,  9, 26,  9, 23,  9,  3,
+     9, 24, 10, 29, 10, 27, 11, 28, 11, 30, 12, 33, 12, 31, 12, 32,
+    14, 34, 14, 37, 14, 36, 15, 35, 15,  0,
+}; /* NOTE(review): presumably (bit length, symbol) pairs, one per code -- confirm at the use site */
+
+static const uint8_t tnl_grp_1_bitvals[68] = { /* 68 = 2 * 34 bytes (tnl_grp_1_codes has 34 entries); even positions are non-decreasing */
+     3,  9,  3,  6,  3,  5,  4,  4,  4,  8,  4, 10,  4,  1,  4, 11,
+     4,  7,  4, 13,  5, 12,  5, 14,  5, 17,  6, 16,  6, 15,  6, 18,
+     7, 20,  7, 19,  7, 21,  8, 25,  8, 23,  8, 22,  8, 24,  9, 26,
+    10,  3, 10, 29, 10, 30, 10, 27, 10, 28, 11, 31, 12, 32, 13, 33,
+    14, 34, 14,  0,
+}; /* NOTE(review): presumably (bit length, symbol) pairs, one per code -- confirm at the use site */
+
+static const uint8_t tnl_grp_2_bitvals[62] = { /* interleaved {code length, symbol} pairs, matched with tnl_grp_2_codes by LBR_INIT_VLC */
+     2,  1,  3,  6,  3,  5,  3,  7,  4,  9,  4,  8,  4,  4,  5, 10,
+     5, 11,  5, 13,  6, 12,  7, 14,  7, 16,  7, 15,  7, 17,  7, 18,
+     7, 19,  8, 22,  8, 20,  8, 21,  8,  3,  8, 24,  8, 25,  9, 23,
+     9, 26,  9, 27, 10, 28, 11, 29, 12, 31, 13, 30, 13,  0,
+};
+
+static const uint8_t tnl_grp_3_bitvals[56] = { /* interleaved {code length, symbol} pairs, matched with tnl_grp_3_codes by LBR_INIT_VLC */
+     1,  1,  3,  6,  4,  5,  5,  9,  5,  4,  6,  8,  6, 14,  6, 10,
+     6, 21,  6, 13,  6,  7,  6,  3,  6, 16,  6,  2,  6, 18,  6, 17,
+     6, 11,  6, 15,  7, 19,  7, 23,  7, 24,  7, 22,  7, 12,  8, 20,
+     9, 25, 10, 26, 11, 27, 11,  0,
+};
+
+static const uint8_t tnl_grp_4_bitvals[46] = { /* interleaved {code length, symbol} pairs, matched with tnl_grp_4_codes by LBR_INIT_VLC */
+     1,  1,  2,  2,  4,  4,  5,  5,  6,  6,  6,  8,  6,  3,  6, 19,
+     6, 20,  6,  9,  7,  7,  7, 11,  7, 13,  7, 17,  7, 10,  8, 12,
+     8, 15,  8, 14,  8, 21,  8, 18,  9, 16, 10, 22, 10,  0,
+};
+
+static const uint8_t tnl_scf_bitvals[40] = { /* interleaved {code length, symbol} pairs, matched with tnl_scf_codes by LBR_INIT_VLC */
+     3,  3,  3,  1,  3,  2,  3,  5,  3,  4,  3,  6,  4,  8,  4,  7,
+     5, 10,  5,  9,  6, 12,  6, 11,  6, 13,  7, 14,  8, 15,  9, 16,
+    10, 17, 11, 18, 12, 19, 12,  0,
+};
+
+static const uint8_t damp_bitvals[14] = { /* interleaved {code length, symbol} pairs, matched with damp_codes by LBR_INIT_VLC */
+     1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  6,  0,
+};
+
+static const uint8_t dph_bitvals[18] = { /* interleaved {code length, symbol} pairs, matched with dph_codes by LBR_INIT_VLC */
+     2,  2,  2,  1,  2,  8,  4,  3,  4,  7,  4,  4,  5,  6,  6,  5,
+     6,  0,
+};
+
+static const uint8_t fst_rsd_amp_bitvals[48] = { /* interleaved {code length, symbol} pairs, matched with fst_rsd_amp_codes by LBR_INIT_VLC */
+     3, 13,  3, 15,  3, 16,  3, 14,  4, 12,  4, 10,  4, 11,  4, 17,
+     4, 18,  5, 19,  5,  9,  6,  1,  6,  7,  6,  6,  6,  8,  6,  5,
+     6,  4,  7, 20,  7,  2,  7,  3,  8, 21,  9, 22, 10, 23, 10,  0,
+};
+
+static const uint8_t rsd_apprx_bitvals[12] = { /* interleaved {code length, symbol} pairs, one per entry of rsd_apprx_codes */
+     1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  5,  0,
+};
+
+static const uint8_t rsd_amp_bitvals[66] = { /* interleaved {code length, symbol} pairs, one per entry of rsd_amp_codes */
+     2,  3,  3,  2,  3,  5,  3,  4,  3,  1,  4,  7,  4,  6,  5,  9,
+     6,  8,  6, 11,  6, 10,  7, 12,  7, 13,  8, 14,  8, 18,  8, 16,
+     8, 15,  8, 22,  9, 20,  9, 24,  9, 17, 10, 28, 10, 26, 10, 21,
+    10, 23, 11, 30, 11, 19, 12, 25, 12, 32, 13, 36, 13, 29, 13, 34,
+    13,  0,
+};
+
+static const uint8_t avg_g3_bitvals[36] = { /* interleaved {code length, symbol} pairs, one per entry of avg_g3_codes */
+     2, 15,  2, 16,  2, 17,  4, 14,  4, 18,  5, 12,  5, 13,  6, 10,
+     6, 11,  7, 19,  7,  9,  8, 20,  8,  8,  8,  7,  9, 21, 10,  6,
+    11, 23, 11,  0,
+};
+
+static const uint8_t st_grid_bitvals[44] = { /* interleaved {code length, symbol} pairs, one per entry of st_grid_codes */
+     1,  6,  2,  1,  4,  4,  4,  8,  4,  3,  5, 10,  7, 12,  7,  5,
+     8, 14,  9, 16,  9,  7,  9, 18, 10, 11, 10,  9, 10, 20, 10, 22,
+    10,  2, 11, 13, 13, 17, 13, 24, 13, 15, 13,  0,
+};
+
+static const uint8_t grid_2_bitvals[40] = { /* interleaved {code length, symbol} pairs, one per entry of grid_2_codes */
+     2,  3,  2,  2,  2,  1,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,
+     8,  9,  9, 10, 11, 11, 11, 12, 12, 13, 12, 17, 13, 15, 13, 18,
+    14, 19, 14, 16, 14, 14, 14,  0,
+};
+
+static const uint8_t grid_3_bitvals[26] = { /* interleaved {code length, symbol} pairs, one per entry of grid_3_codes */
+     1, 17,  2, 16,  3, 18,  4, 15,  5, 19,  6, 14,  7, 20,  8, 13,
+     9, 21, 10, 12, 11, 22, 12, 11, 12,  0,
+};
+
+static const uint8_t rsd_bitvals[18] = { /* interleaved {code length, symbol} pairs, one per entry of rsd_codes */
+     2,  2,  2,  3,  3,  1,  3,  4,  3,  0,  4,  5,  5,  6,  6,  7,
+     6,  4,
+};
+
+static const uint16_t vlc_offs[80] = { /* start row of each VLC inside dca_table, in the order ff_dca_init_vlcs() builds them */
+        0,   512,   640,   768,  1282,  1794,  2436,  3080,  3770,  4454,  5364,
+     5372,  5380,  5388,  5392,  5396,  5412,  5420,  5428,  5460,  5492,  5508,
+     5572,  5604,  5668,  5796,  5860,  5892,  6412,  6668,  6796,  7308,  7564,
+     7820,  8076,  8620,  9132,  9388,  9910, 10166, 10680, 11196, 11726, 12240,
+    12752, 13298, 13810, 14326, 14840, 15500, 16022, 16540, 17158, 17678, 18264,
+    18796, 19352, 19926, 20468, 21472, 22398, 23014, 23622, 24200, 24748, 25276,
+    25792, 26306, 26826, 26890, 26954, 27468, 27500, 28038, 28554, 29086, 29630,
+    30150, 30214 /* 79 explicit entries; element [79] is implicitly 0. 30214 equals the dca_table row count */
+};
+
+DCAVLC  ff_dca_vlc_bit_allocation; /* this and the objects below are filled in by ff_dca_init_vlcs() */
+DCAVLC  ff_dca_vlc_transition_mode;
+DCAVLC  ff_dca_vlc_scale_factor;
+DCAVLC  ff_dca_vlc_quant_index[DCA_CODE_BOOKS];
+
+VLC     ff_dca_vlc_tnl_grp[5]; /* from here on: tables built via LBR_INIT_VLC (INIT_VLC_LE) */
+VLC     ff_dca_vlc_tnl_scf;
+VLC     ff_dca_vlc_damp;
+VLC     ff_dca_vlc_dph;
+VLC     ff_dca_vlc_fst_rsd_amp;
+VLC     ff_dca_vlc_rsd_apprx;
+VLC     ff_dca_vlc_rsd_amp;
+VLC     ff_dca_vlc_avg_g3;
+VLC     ff_dca_vlc_st_grid;
+VLC     ff_dca_vlc_grid_2;
+VLC     ff_dca_vlc_grid_3;
+VLC     ff_dca_vlc_rsd;
+
+av_cold void ff_dca_init_vlcs(void) /* builds every DCA VLC into one static table; repeat calls return at once */
+{
+    static VLC_TYPE dca_table[30214][2]; /* shared backing store, sliced per VLC through vlc_offs[] */
+    static int vlcs_initialized = 0; /* NOTE(review): plain flag, no locking in this function - confirm callers serialize the first call */
+    int i, j, k = 0; /* k: index of the next vlc_offs[] slot, advanced once per *_INIT_VLC use */
+
+    if (vlcs_initialized)
+        return;
+
+#define DCA_INIT_VLC(vlc, a, b, c, d)                                       \
+    do {                                                                    \
+        vlc.table           = &dca_table[vlc_offs[k]];                      \
+        vlc.table_allocated = vlc_offs[k + 1] - vlc_offs[k];                \
+        init_vlc(&vlc, a, b, c, 1, 1, d, 2, 2, INIT_VLC_USE_NEW_STATIC);    \
+        k++;                                                                \
+    } while (0)
+
+    ff_dca_vlc_bit_allocation.offset    = 1;
+    ff_dca_vlc_bit_allocation.max_depth = 2;
+    for (i = 0; i < 5; i++) /* 5 tables, 12 codes each */
+        DCA_INIT_VLC(ff_dca_vlc_bit_allocation.vlc[i], bitalloc_12_vlc_bits[i], 12,
+                     bitalloc_12_bits[i], bitalloc_12_codes[i]);
+
+    ff_dca_vlc_scale_factor.offset    = -64;
+    ff_dca_vlc_scale_factor.max_depth = 2;
+    for (i = 0; i < 5; i++) /* 5 tables, 129 codes each */
+        DCA_INIT_VLC(ff_dca_vlc_scale_factor.vlc[i], SCALES_VLC_BITS, 129,
+                     scales_bits[i], scales_codes[i]);
+
+    ff_dca_vlc_transition_mode.offset    = 0;
+    ff_dca_vlc_transition_mode.max_depth = 1;
+    for (i = 0; i < 4; i++) /* 4 tables, 4 codes each */
+        DCA_INIT_VLC(ff_dca_vlc_transition_mode.vlc[i], tmode_vlc_bits[i], 4,
+                     tmode_bits[i], tmode_codes[i]);
+
+    for (i = 0; i < DCA_CODE_BOOKS; i++) {
+        ff_dca_vlc_quant_index[i].offset    = bitalloc_offsets[i];
+        ff_dca_vlc_quant_index[i].max_depth = 1 + (i > 4); /* 1 for books 0..4, 2 for the rest */
+        for (j = 0; bitalloc_codes[i][j]; j++) /* a book's list ends at its first zero (null) entry */
+            DCA_INIT_VLC(ff_dca_vlc_quant_index[i].vlc[j], bitalloc_maxbits[i][j],
+                         bitalloc_sizes[i], bitalloc_bits[i][j], bitalloc_codes[i][j]);
+    }
+
+#define LBR_INIT_VLC(vlc, tab, nb_bits)                                 \
+    do {                                                                \
+        vlc.table           = &dca_table[vlc_offs[k]];                  \
+        vlc.table_allocated = vlc_offs[k + 1] - vlc_offs[k];            \
+        ff_init_vlc_sparse(&vlc, nb_bits, FF_ARRAY_ELEMS(tab##_codes),  \
+                           &tab##_bitvals[0], 2, 1,                     \
+                           tab##_codes, 2, 2,                           \
+                           &tab##_bitvals[1], 2, 1,                     \
+                           INIT_VLC_LE | INIT_VLC_USE_NEW_STATIC);      \
+        k++;                                                            \
+    } while (0)
+
+    LBR_INIT_VLC(ff_dca_vlc_tnl_grp[0],  tnl_grp_0,   9); /* call order below must stay in step with vlc_offs[], since k only counts up */
+    LBR_INIT_VLC(ff_dca_vlc_tnl_grp[1],  tnl_grp_1,   9);
+    LBR_INIT_VLC(ff_dca_vlc_tnl_grp[2],  tnl_grp_2,   9);
+    LBR_INIT_VLC(ff_dca_vlc_tnl_grp[3],  tnl_grp_3,   9);
+    LBR_INIT_VLC(ff_dca_vlc_tnl_grp[4],  tnl_grp_4,   9);
+    LBR_INIT_VLC(ff_dca_vlc_tnl_scf,     tnl_scf,     9);
+    LBR_INIT_VLC(ff_dca_vlc_damp,        damp,        6);
+    LBR_INIT_VLC(ff_dca_vlc_dph,         dph,         6);
+    LBR_INIT_VLC(ff_dca_vlc_fst_rsd_amp, fst_rsd_amp, 9);
+    LBR_INIT_VLC(ff_dca_vlc_rsd_apprx,   rsd_apprx,   5);
+    LBR_INIT_VLC(ff_dca_vlc_rsd_amp,     rsd_amp,     9);
+    LBR_INIT_VLC(ff_dca_vlc_avg_g3,      avg_g3,      9);
+    LBR_INIT_VLC(ff_dca_vlc_st_grid,     st_grid,     9);
+    LBR_INIT_VLC(ff_dca_vlc_grid_2,      grid_2,      9);
+    LBR_INIT_VLC(ff_dca_vlc_grid_3,      grid_3,      9);
+    LBR_INIT_VLC(ff_dca_vlc_rsd,         rsd,         6);
+
+    vlcs_initialized = 1; /* set only after every table has been built */
+}
diff --git a/libavcodec/dcahuff.h b/libavcodec/dcahuff.h
index 79be493..b1d5735 100644
--- a/libavcodec/dcahuff.h
+++ b/libavcodec/dcahuff.h
@@ -3,1039 +3,57 @@
  * Copyright (C) 2004 Gildas Bazin
  * Copyright (C) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_DCAHUFF_H
 #define AVCODEC_DCAHUFF_H
 
-#include <stdint.h>
-#include <stdlib.h>
-
-#define TMODE_COUNT 4
-static const uint8_t tmode_vlc_bits[TMODE_COUNT] = { 3, 3, 3, 2 };
-static const uint16_t tmode_codes[TMODE_COUNT][4] = {
-    { 0x0000, 0x0002, 0x0006, 0x0007 },
-    { 0x0002, 0x0006, 0x0007, 0x0000 },
-    { 0x0006, 0x0007, 0x0000, 0x0002 },
-    { 0x0000, 0x0001, 0x0002, 0x0003 }
-};
-
-static const uint8_t tmode_bits[TMODE_COUNT][4] = {
-    { 1, 2, 3, 3 },
-    { 2, 3, 3, 1 },
-    { 3, 3, 1, 2 },
-    { 2, 2, 2, 2 }
-};
-
-#define BITALLOC_12_COUNT    5
-#define BITALLOC_12_VLC_BITS 9
-static const uint8_t bitalloc_12_vlc_bits[BITALLOC_12_COUNT] = {
-    9, 7, 7, 9, 9
-};
-
-static const uint16_t bitalloc_12_codes[BITALLOC_12_COUNT][12] = {
-    { 0x0000, 0x0002, 0x0006, 0x000E, 0x001E, 0x003E, 0x00FF, 0x00FE,
-      0x01FB, 0x01FA, 0x01F9, 0x01F8, },
-    { 0x0001, 0x0000, 0x0002, 0x000F, 0x000C, 0x001D, 0x0039, 0x0038,
-      0x0037, 0x0036, 0x0035, 0x0034, },
-    { 0x0000, 0x0007, 0x0005, 0x0004, 0x0002, 0x000D, 0x000C, 0x0006,
-      0x000F, 0x001D, 0x0039, 0x0038, },
-    { 0x0003, 0x0002, 0x0000, 0x0002, 0x0006, 0x000E, 0x001E, 0x003E,
-      0x007E, 0x00FE, 0x01FF, 0x01FE, },
-    { 0x0001, 0x0000, 0x0002, 0x0006, 0x000E, 0x003F, 0x003D, 0x007C,
-      0x0079, 0x0078, 0x00FB, 0x00FA, }
-};
-
-static const uint8_t bitalloc_12_bits[BITALLOC_12_COUNT][12] = {
-    { 1, 2, 3, 4, 5, 6, 8, 8, 9, 9,  9,  9 },
-    { 1, 2, 3, 5, 5, 6, 7, 7, 7, 7,  7,  7 },
-    { 2, 3, 3, 3, 3, 4, 4, 4, 5, 6,  7,  7 },
-    { 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10 },
-    { 1, 2, 3, 4, 5, 7, 7, 8, 8, 8,  9,  9 }
-};
-
-#define SCALES_COUNT    5
-#define SCALES_VLC_BITS 9
-static const uint16_t scales_codes[SCALES_COUNT][129] = {
-    { 0x3AB0, 0x3AB2, 0x3AB4, 0x3AB6, 0x3AB8, 0x3ABA, 0x3ABC, 0x3ABE,
-      0x3AC0, 0x3AC2, 0x3AC4, 0x3AC6, 0x3AC8, 0x3ACA, 0x3ACC, 0x3ACE,
-      0x3AD0, 0x3AD2, 0x3AD4, 0x3AD6, 0x3AD8, 0x3ADA, 0x3ADC, 0x3ADE,
-      0x3AE0, 0x3AE2, 0x3AE4, 0x3AE6, 0x3AE8, 0x3AEA, 0x3AEC, 0x3AEE,
-      0x3AF0, 0x3AF2, 0x3AF4, 0x3AF6, 0x3AF8, 0x3AFA, 0x3AFC, 0x3AFE,
-      0x0540, 0x0542, 0x0544, 0x0546, 0x0548, 0x054A, 0x054C, 0x054E,
-      0x0558, 0x055E, 0x02AD, 0x0154, 0x0754, 0x03A8, 0x0056, 0x0028,
-      0x00E8, 0x004A, 0x000B, 0x003B, 0x0013, 0x0003, 0x000F, 0x0005,
-      0x0001, 0x0006, 0x0000, 0x0008, 0x001C, 0x0004, 0x0024, 0x004B,
-      0x00E9, 0x0029, 0x0057, 0x03A9, 0x0755, 0x0155, 0x02AE, 0x055F,
-      0x0559, 0x054F, 0x054D, 0x054B, 0x0549, 0x0547, 0x0545, 0x0543,
-      0x0541, 0x3AFF, 0x3AFD, 0x3AFB, 0x3AF9, 0x3AF7, 0x3AF5, 0x3AF3,
-      0x3AF1, 0x3AEF, 0x3AED, 0x3AEB, 0x3AE9, 0x3AE7, 0x3AE5, 0x3AE3,
-      0x3AE1, 0x3ADF, 0x3ADD, 0x3ADB, 0x3AD9, 0x3AD7, 0x3AD5, 0x3AD3,
-      0x3AD1, 0x3ACF, 0x3ACD, 0x3ACB, 0x3AC9, 0x3AC7, 0x3AC5, 0x3AC3,
-      0x3AC1, 0x3ABF, 0x3ABD, 0x3ABB, 0x3AB9, 0x3AB7, 0x3AB5, 0x3AB3,
-      0x3AB1, },
-    { 0x0F60, 0x0F62, 0x0F64, 0x0F66, 0x0F68, 0x0F6A, 0x0F6C, 0x0F6E,
-      0x0F70, 0x0F72, 0x0F74, 0x0F76, 0x0F78, 0x0F7A, 0x0F7C, 0x0F7E,
-      0x0F80, 0x0F82, 0x0F84, 0x0F86, 0x0F88, 0x0F8A, 0x0F8C, 0x0F8E,
-      0x0F90, 0x0F92, 0x0F94, 0x0F96, 0x0F98, 0x0F9A, 0x0F9C, 0x0F9E,
-      0x0FA0, 0x0FA2, 0x0FA4, 0x0FA6, 0x0FA8, 0x0FAA, 0x0FAC, 0x0FAE,
-      0x0FB0, 0x0FB2, 0x0FB4, 0x0FB6, 0x0FB8, 0x0FBA, 0x0FBC, 0x0FBE,
-      0x07A0, 0x07A2, 0x03D2, 0x01EA, 0x00FC, 0x007F, 0x001C, 0x000C,
-      0x0004, 0x0034, 0x0010, 0x001B, 0x0009, 0x000B, 0x000E, 0x0001,
-      0x0003, 0x0002, 0x000F, 0x000C, 0x000A, 0x0000, 0x0011, 0x0035,
-      0x0005, 0x000D, 0x001D, 0x003C, 0x00FD, 0x01EB, 0x03D3, 0x07A3,
-      0x07A1, 0x0FBF, 0x0FBD, 0x0FBB, 0x0FB9, 0x0FB7, 0x0FB5, 0x0FB3,
-      0x0FB1, 0x0FAF, 0x0FAD, 0x0FAB, 0x0FA9, 0x0FA7, 0x0FA5, 0x0FA3,
-      0x0FA1, 0x0F9F, 0x0F9D, 0x0F9B, 0x0F99, 0x0F97, 0x0F95, 0x0F93,
-      0x0F91, 0x0F8F, 0x0F8D, 0x0F8B, 0x0F89, 0x0F87, 0x0F85, 0x0F83,
-      0x0F81, 0x0F7F, 0x0F7D, 0x0F7B, 0x0F79, 0x0F77, 0x0F75, 0x0F73,
-      0x0F71, 0x0F6F, 0x0F6D, 0x0F6B, 0x0F69, 0x0F67, 0x0F65, 0x0F63,
-      0x0F61, },
-    { 0x51D0, 0x51D2, 0x51D4, 0x51D6, 0x51D8, 0x51DA, 0x51DC, 0x51DE,
-      0x51E0, 0x51E2, 0x51E4, 0x51E6, 0x51E8, 0x51EA, 0x51EC, 0x51EE,
-      0x51F0, 0x51F2, 0x51F4, 0x51F6, 0x51F8, 0x51FA, 0x51FC, 0x51FE,
-      0x70C0, 0x70C2, 0x70C4, 0x70C6, 0x70C8, 0x70CA, 0x70CC, 0x70CE,
-      0x70EC, 0x10EA, 0x3868, 0x3877, 0x0876, 0x1C35, 0x0434, 0x0A34,
-      0x0E1B, 0x021B, 0x051B, 0x070F, 0x010F, 0x0380, 0x0080, 0x0140,
-      0x01C1, 0x0041, 0x00A1, 0x00E2, 0x0022, 0x0052, 0x0072, 0x0012,
-      0x002A, 0x003A, 0x000A, 0x0016, 0x001E, 0x0006, 0x000C, 0x0000,
-      0x0004, 0x0001, 0x000D, 0x0007, 0x001F, 0x0017, 0x000B, 0x003B,
-      0x002B, 0x0013, 0x0073, 0x0053, 0x0023, 0x00E3, 0x00A2, 0x0042,
-      0x01C2, 0x0141, 0x0081, 0x0381, 0x028C, 0x010C, 0x051C, 0x021C,
-      0x0E1C, 0x0A35, 0x0435, 0x1C3A, 0x0877, 0x0874, 0x3869, 0x10EB,
-      0x70ED, 0x70CF, 0x70CD, 0x70CB, 0x70C9, 0x70C7, 0x70C5, 0x70C3,
-      0x70C1, 0x51FF, 0x51FD, 0x51FB, 0x51F9, 0x51F7, 0x51F5, 0x51F3,
-      0x51F1, 0x51EF, 0x51ED, 0x51EB, 0x51E9, 0x51E7, 0x51E5, 0x51E3,
-      0x51E1, 0x51DF, 0x51DD, 0x51DB, 0x51D9, 0x51D7, 0x51D5, 0x51D3,
-      0x51D1, },
-    { 0x6F64, 0x6F66, 0x6F68, 0x6F6A, 0x6F6C, 0x6F6E, 0x6F70, 0x6F72,
-      0x6F74, 0x6F76, 0x6F78, 0x6F7A, 0x6F7C, 0x6F7E, 0x6F80, 0x6F82,
-      0x6F84, 0x6F86, 0x6F88, 0x6F8A, 0x6F8C, 0x6F8E, 0x6F90, 0x6F92,
-      0x6F94, 0x6F96, 0x6F98, 0x6F9A, 0x6F9C, 0x6F9E, 0x6FA0, 0x6FA2,
-      0x6FA4, 0x6FA6, 0x6FA8, 0x6FAA, 0x6FAC, 0x6FAE, 0x6FB0, 0x6FB2,
-      0x6FB4, 0x6FB6, 0x17B4, 0x37DC, 0x0BDB, 0x1BEF, 0x05EE, 0x0DF8,
-      0x02F8, 0x06FD, 0x017D, 0x037F, 0x00BF, 0x0040, 0x00C0, 0x0021,
-      0x0061, 0x0011, 0x0031, 0x0009, 0x0019, 0x0006, 0x000E, 0x0004,
-      0x0000, 0x0005, 0x000F, 0x0007, 0x001A, 0x000A, 0x0036, 0x0016,
-      0x006E, 0x002E, 0x00C1, 0x0041, 0x01BC, 0x00BC, 0x037A, 0x017A,
-      0x02F9, 0x0DF9, 0x05EF, 0x05EC, 0x1BD8, 0x37DD, 0x17B5, 0x6FB7,
-      0x6FB5, 0x6FB3, 0x6FB1, 0x6FAF, 0x6FAD, 0x6FAB, 0x6FA9, 0x6FA7,
-      0x6FA5, 0x6FA3, 0x6FA1, 0x6F9F, 0x6F9D, 0x6F9B, 0x6F99, 0x6F97,
-      0x6F95, 0x6F93, 0x6F91, 0x6F8F, 0x6F8D, 0x6F8B, 0x6F89, 0x6F87,
-      0x6F85, 0x6F83, 0x6F81, 0x6F7F, 0x6F7D, 0x6F7B, 0x6F79, 0x6F77,
-      0x6F75, 0x6F73, 0x6F71, 0x6F6F, 0x6F6D, 0x6F6B, 0x6F69, 0x6F67,
-      0x6F65, },
-    { 0xDF54, 0xDF56, 0xDFC8, 0xDFCA, 0xDFCC, 0xDFCE, 0xDFD0, 0xDFD2,
-      0xDFD4, 0xDFD6, 0xDFD8, 0xDFDA, 0xDFDC, 0xDFDE, 0xDFE0, 0xDFE2,
-      0x0FE8, 0x2FEA, 0x6FA8, 0x6FF6, 0x07F5, 0x07F7, 0x37D2, 0x37F9,
-      0x03F8, 0x0BF8, 0x0BFB, 0x1BEB, 0x01FA, 0x05FA, 0x09FA, 0x0DFA,
-      0x0DFF, 0x00FF, 0x02FF, 0x06FB, 0x007C, 0x017C, 0x027C, 0x027F,
-      0x003C, 0x00BC, 0x013C, 0x01BC, 0x001C, 0x005C, 0x009C, 0x00DC,
-      0x000C, 0x002C, 0x004C, 0x006C, 0x0004, 0x0014, 0x0024, 0x0034,
-      0x0000, 0x0008, 0x0010, 0x0018, 0x001E, 0x0002, 0x0006, 0x000A,
-      0x000E, 0x000B, 0x0007, 0x0003, 0x001F, 0x0019, 0x0011, 0x0009,
-      0x0001, 0x0035, 0x0025, 0x0015, 0x0005, 0x006D, 0x004D, 0x002D,
-      0x000D, 0x00DD, 0x009D, 0x005D, 0x001D, 0x01BD, 0x013D, 0x00BD,
-      0x003D, 0x037C, 0x027D, 0x017D, 0x007D, 0x06FC, 0x04FC, 0x02FC,
-      0x00FC, 0x0DFB, 0x09FB, 0x05FB, 0x01FB, 0x1BF8, 0x1BE8, 0x0BF9,
-      0x03F9, 0x37FA, 0x37D3, 0x17F4, 0x07F6, 0x6FF7, 0x6FA9, 0x2FEB,
-      0x0FE9, 0xDFE3, 0xDFE1, 0xDFDF, 0xDFDD, 0xDFDB, 0xDFD9, 0xDFD7,
-      0xDFD5, 0xDFD3, 0xDFD1, 0xDFCF, 0xDFCD, 0xDFCB, 0xDFC9, 0xDF57,
-      0xDF55, }
-};
-
-static const uint8_t scales_bits[SCALES_COUNT][129] = {
-    { 14, 14, 14, 14, 14, 14, 14, 14,
-      14, 14, 14, 14, 14, 14, 14, 14,
-      14, 14, 14, 14, 14, 14, 14, 14,
-      14, 14, 14, 14, 14, 14, 14, 14,
-      14, 14, 14, 14, 14, 14, 14, 14,
-      13, 13, 13, 13, 13, 13, 13, 13,
-      13, 13, 12, 11, 11, 10,  9,  8,
-       8,  7,  6,  6,  5,  4,  4,  3,
-       2,  3,  3,  4,  5,  5,  6,  7,
-       8,  8,  9, 10, 11, 11, 12, 13,
-      13, 13, 13, 13, 13, 13, 13, 13,
-      13, 14, 14, 14, 14, 14, 14, 14,
-      14, 14, 14, 14, 14, 14, 14, 14,
-      14, 14, 14, 14, 14, 14, 14, 14,
-      14, 14, 14, 14, 14, 14, 14, 14,
-      14, 14, 14, 14, 14, 14, 14, 14,
-      14, },
-    { 15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      14, 14, 13, 12, 11, 10,  8,  7,
-       6,  6,  5,  5,  4,  4,  4,  3,
-       3,  3,  4,  4,  4,  4,  5,  6,
-       6,  7,  8,  9, 11, 12, 13, 14,
-      14, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, },
-    { 15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 14, 14, 14, 13, 13, 12, 12,
-      12, 11, 11, 11, 10, 10,  9,  9,
-       9,  8,  8,  8,  7,  7,  7,  6,
-       6,  6,  5,  5,  5,  4,  4,  3,
-       3,  3,  4,  4,  5,  5,  5,  6,
-       6,  6,  7,  7,  7,  8,  8,  8,
-       9,  9,  9, 10, 10, 10, 11, 11,
-      12, 12, 12, 13, 13, 13, 14, 14,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, },
-    { 15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 14, 14, 13, 13, 12, 12,
-      11, 11, 10, 10,  9,  8,  8,  7,
-       7,  6,  6,  5,  5,  4,  4,  3,
-       2,  3,  4,  4,  5,  5,  6,  6,
-       7,  7,  8,  8,  9,  9, 10, 10,
-      11, 12, 12, 12, 13, 14, 14, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, 15, 15, 15, 15, 15, 15, 15,
-      15, },
-    { 16, 16, 16, 16, 16, 16, 16, 16,
-      16, 16, 16, 16, 16, 16, 16, 16,
-      15, 15, 15, 15, 14, 14, 14, 14,
-      13, 13, 13, 13, 12, 12, 12, 12,
-      12, 11, 11, 11, 10, 10, 10, 10,
-       9,  9,  9,  9,  8,  8,  8,  8,
-       7,  7,  7,  7,  6,  6,  6,  6,
-       5,  5,  5,  5,  5,  4,  4,  4,
-       4,  4,  4,  4,  5,  5,  5,  5,
-       5,  6,  6,  6,  6,  7,  7,  7,
-       7,  8,  8,  8,  8,  9,  9,  9,
-       9, 10, 10, 10, 10, 11, 11, 11,
-      11, 12, 12, 12, 12, 13, 13, 13,
-      13, 14, 14, 14, 14, 15, 15, 15,
-      15, 16, 16, 16, 16, 16, 16, 16,
-      16, 16, 16, 16, 16, 16, 16, 16,
-      16,
-    }
-};
-
-static const uint16_t bitalloc_3_codes[3] = {
-    0x0003, 0x0000, 0x0002,
-};
-
-static const uint8_t bitalloc_3_bits[3] = {
-    2,  1,  2,
-};
-
-static const uint16_t bitalloc_5_codes_a[5] = {
-    0x000F, 0x0006, 0x0000, 0x0002, 0x000E,
-};
-
-static const uint16_t bitalloc_5_codes_b[5] = {
-    0x0007, 0x0001, 0x0002, 0x0000, 0x0006,
-};
-
-static const uint16_t bitalloc_5_codes_c[5] = {
-    0x0007, 0x0005, 0x0000, 0x0004, 0x0006,
-};
-
-static const uint8_t bitalloc_5_bits_a[5] = {
-    4,  3,  1,  2,  4,
-};
-
-static const uint8_t bitalloc_5_bits_b[5] = {
-    3,  2,  2,  2,  3,
-};
-
-static const uint8_t bitalloc_5_bits_c[5] = {
-    3,  3,  1,  3,  3,
-};
-
-static const uint16_t bitalloc_7_codes_a[7] = {
-    0x001E, 0x000E, 0x0005, 0x0000, 0x0006, 0x0004, 0x001F,
-};
-
-static const uint16_t bitalloc_7_codes_b[7] = {
-    0x0014, 0x000B, 0x0000, 0x0003, 0x0001, 0x0004, 0x0015,
-};
-
-static const uint16_t bitalloc_7_codes_c[7] = {
-    0x0000, 0x0002, 0x0001, 0x0003, 0x0002, 0x0003, 0x0001,
-};
-
-static const uint8_t bitalloc_7_bits_a[7] = {
-    5,  4,  3,  1,  3,  3,  5,
-};
-
-static const uint8_t bitalloc_7_bits_b[7] = {
-    5,  4,  2,  2,  2,  3,  5,
-};
-
-static const uint8_t bitalloc_7_bits_c[7] = {
-    4,  4,  2,  2,  2,  4,  4,
-};
-
-static const uint16_t bitalloc_9_codes_a[9] = {
-    0x0030, 0x0019, 0x0009, 0x0005, 0x0000, 0x0007, 0x000D, 0x0008,
-    0x0031,
-};
-
-static const uint16_t bitalloc_9_codes_b[9] = {
-    0x0018, 0x001A, 0x0002, 0x0007, 0x0002, 0x0000, 0x0003, 0x001B,
-    0x0019,
-};
-
-static const uint16_t bitalloc_9_codes_c[9] = {
-    0x001C, 0x000F, 0x0002, 0x0007, 0x0002, 0x0000, 0x0006, 0x0006,
-    0x001D,
-};
-
-static const uint8_t bitalloc_9_bits_a[9] = {
-    6,  5,  4,  3,  1,  3,  4,  4,  6,
-};
-
-static const uint8_t bitalloc_9_bits_b[9] = {
-    5,  5,  3,  3,  2,  2,  3,  5,  5,
-};
-
-static const uint8_t bitalloc_9_bits_c[9] = {
-    6,  5,  3,  3,  2,  2,  3,  4,  6,
-};
-
-static const uint16_t bitalloc_13_codes_a[13] = {
-    0x0070, 0x002E, 0x0039, 0x001D, 0x000C, 0x000F, 0x0000, 0x0004,
-    0x000D, 0x000A, 0x0016, 0x002F, 0x0071,
-};
-
-static const uint16_t bitalloc_13_codes_b[13] = {
-    0x0038, 0x0010, 0x001D, 0x0007, 0x000F, 0x0005, 0x0000, 0x0006,
-    0x0002, 0x0009, 0x0006, 0x0011, 0x0039,
-};
-
-static const uint16_t bitalloc_13_codes_c[13] = {
-    0x0004, 0x001A, 0x0003, 0x000E, 0x0000, 0x0003, 0x0005, 0x0004,
-    0x0002, 0x000F, 0x000C, 0x001B, 0x0005,
-};
-
-static const uint8_t bitalloc_13_bits_a[13] = {
-     7,  6,  6,  5,  4,  4,  1,  3,  4,  4,  5,  6,  7,
-};
-
-static const uint8_t bitalloc_13_bits_b[13] = {
-     6,  5,  5,  4,  4,  3,  2,  3,  3,  4,  4,  5,  6,
-};
-
-static const uint8_t bitalloc_13_bits_c[13] = {
-     5,  5,  4,  4,  3,  3,  3,  3,  3,  4,  4,  5,  5,
-};
-
-static const uint16_t bitalloc_17_codes_a[17] = {
-    0x0154, 0x00AB, 0x002B, 0x000B, 0x0003, 0x000A, 0x0001, 0x0006,
-    0x0001, 0x0007, 0x0004, 0x000B, 0x0000, 0x0004, 0x0014, 0x0054,
-    0x0155,
-};
-
-static const uint16_t bitalloc_17_codes_b[17] = {
-    0x007C, 0x003F, 0x0019, 0x000D, 0x001C, 0x0008, 0x000F, 0x0005,
-    0x0000, 0x0006, 0x0002, 0x0009, 0x001D, 0x000E, 0x001E, 0x0018,
-    0x007D,
-};
-
-static const uint16_t bitalloc_17_codes_c[17] = {
-    0x002C, 0x0017, 0x0005, 0x001C, 0x0003, 0x000A, 0x000F, 0x0003,
-    0x0006, 0x0004, 0x0000, 0x000B, 0x0004, 0x001D, 0x000A, 0x0004,
-    0x002D,
-};
-
-static const uint16_t bitalloc_17_codes_d[17] = {
-    0x0100, 0x0102, 0x0082, 0x0042, 0x0022, 0x0012, 0x000A, 0x0006,
-    0x0000, 0x0007, 0x000B, 0x0013, 0x0023, 0x0043, 0x0083, 0x0103,
-    0x0101,
-};
-
-static const uint16_t bitalloc_17_codes_e[17] = {
-    0x00E8, 0x00F6, 0x0075, 0x0034, 0x003B, 0x001B, 0x001F, 0x0004,
-    0x0000, 0x0005, 0x000C, 0x001C, 0x003C, 0x0035, 0x007A, 0x00F7,
-    0x00E9,
-};
-
-static const uint16_t bitalloc_17_codes_f[17] = {
-    0x0004, 0x0003, 0x001E, 0x0001, 0x0001, 0x000E, 0x0001, 0x0004,
-    0x0006, 0x0005, 0x0002, 0x000F, 0x0006, 0x000E, 0x001F, 0x0000,
-    0x0005,
-};
-
-static const uint16_t bitalloc_17_codes_g[17] = {
-    0x0060, 0x007E, 0x0031, 0x0019, 0x000D, 0x0004, 0x0000, 0x0006,
-    0x0002, 0x0007, 0x0001, 0x0005, 0x000E, 0x001E, 0x003E, 0x007F,
-    0x0061,
-};
-
-static const uint8_t bitalloc_17_bits_a[17] = {
-    12, 11,  9,  7,  5,  4,  3,  3,  2,  3,  3,  4,  4,  6,  8, 10,
-    12,
-};
-
-static const uint8_t bitalloc_17_bits_b[17] = {
-    8,  7,  6,  5,  5,  4,  4,  3,  2,  3,  3,  4,  5,  5,  6,  6,
-    8,
-};
-
-static const uint8_t bitalloc_17_bits_c[17] = {
-    7,  6,  5,  5,  4,  4,  4,  3,  3,  3,  3,  4,  4,  5,  5,  5,
-    7,
-};
-
-static const uint8_t bitalloc_17_bits_d[17] = {
-    9,  9,  8,  7,  6,  5,  4,  3,  1,  3,  4,  5,  6,  7,  8,  9,
-    9,
-};
-
-static const uint8_t bitalloc_17_bits_e[17] = {
-    8,  8,  7,  6,  6,  5,  5,  3,  1,  3,  4,  5,  6,  6,  7,  8,
-    8,
-};
-
-static const uint8_t bitalloc_17_bits_f[17] = {
-    8,  7,  6,  5,  4,  4,  3,  3,  3,  3,  3,  4,  4,  5,  6,  6,
-    8,
-};
-
-static const uint8_t bitalloc_17_bits_g[17] = {
-    8,  8,  7,  6,  5,  4,  3,  3,  2,  3,  3,  4,  5,  6,  7,  8,
-    8,
-};
-
-static const uint16_t bitalloc_25_codes_a[25] = {
-    0x2854, 0x142B, 0x050B, 0x0143, 0x00A2, 0x0052, 0x002E, 0x0015,
-    0x0004, 0x000E, 0x0000, 0x0003, 0x0006, 0x0004, 0x0001, 0x000F,
-    0x0005, 0x0016, 0x002F, 0x0053, 0x00A3, 0x00A0, 0x0284, 0x0A14,
-    0x2855,
-};
-
-static const uint16_t bitalloc_25_codes_b[25] = {
-    0x001C, 0x000F, 0x0005, 0x0000, 0x0030, 0x0036, 0x000E, 0x0019,
-    0x0001, 0x0008, 0x000E, 0x0001, 0x0005, 0x0002, 0x000F, 0x0009,
-    0x0006, 0x001A, 0x000F, 0x0037, 0x0031, 0x0001, 0x0006, 0x0004,
-    0x001D,
-};
-
-static const uint16_t bitalloc_25_codes_c[25] = {
-    0x004C, 0x0027, 0x006D, 0x0028, 0x0037, 0x000E, 0x0015, 0x0000,
-    0x0005, 0x0008, 0x000B, 0x000E, 0x0001, 0x000F, 0x000C, 0x0009,
-    0x0006, 0x0001, 0x001A, 0x000F, 0x0008, 0x0029, 0x0012, 0x006C,
-    0x004D,
-};
-
-static const uint16_t bitalloc_25_codes_d[25] = {
-    0x0780, 0x0782, 0x03C2, 0x01E2, 0x00FE, 0x0079, 0x003D, 0x001C,
-    0x000C, 0x0004, 0x0000, 0x0006, 0x0002, 0x0007, 0x0001, 0x0005,
-    0x000D, 0x001D, 0x003E, 0x007E, 0x00FF, 0x01E3, 0x03C3, 0x0783,
-    0x0781,
-};
-
-static const uint16_t bitalloc_25_codes_e[25] = {
-    0x003C, 0x0092, 0x0018, 0x001F, 0x004E, 0x000D, 0x0025, 0x0004,
-    0x0010, 0x0000, 0x000A, 0x0002, 0x0003, 0x0003, 0x000B, 0x0001,
-    0x0011, 0x0005, 0x0026, 0x000E, 0x004F, 0x0048, 0x0019, 0x0093,
-    0x003D,
-};
-
-static const uint16_t bitalloc_25_codes_f[25] = {
-    0x0324, 0x0193, 0x00CE, 0x0065, 0x0024, 0x000C, 0x0013, 0x0004,
-    0x0007, 0x000A, 0x000D, 0x000F, 0x0001, 0x0000, 0x000E, 0x000B,
-    0x0008, 0x0005, 0x0018, 0x000D, 0x0025, 0x0066, 0x00CF, 0x00C8,
-    0x0325,
-};
-
-static const uint16_t bitalloc_25_codes_g[25] = {
-    0x03A8, 0x03AE, 0x01D5, 0x0094, 0x0014, 0x004B, 0x000B, 0x003B,
-    0x0013, 0x0003, 0x000F, 0x0005, 0x0001, 0x0006, 0x0000, 0x0008,
-    0x001C, 0x0004, 0x0024, 0x0074, 0x0015, 0x0095, 0x01D6, 0x03AF,
-    0x03A9,
-};
-
-static const uint8_t bitalloc_25_bits_a[25] = {
-    14, 13, 11,  9,  8,  7,  6,  5,  4,  4,  3,  3,  3,  3,  3,  4,
-     4,  5,  6,  7,  8,  8, 10, 12, 14,
-};
-
-static const uint8_t bitalloc_25_bits_b[25] = {
-    9,  8,  7,  6,  6,  6,  5,  5,  4,  4,  4,  3,  3,  3,  4,  4,
-    4,  5,  5,  6,  6,  6,  7,  7,  9,
-};
-
-static const uint8_t bitalloc_25_bits_c[25] = {
-    8,  7,  7,  6,  6,  5,  5,  4,  4,  4,  4,  4,  3,  4,  4,  4,
-    4,  4,  5,  5,  5,  6,  6,  7,  8,
-};
-
-static const uint8_t bitalloc_25_bits_d[25] = {
-    12, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  3,  2,  3,  3,  4,
-     5,  6,  7,  8,  9, 10, 11, 12, 12,
-};
-
-static const uint8_t bitalloc_25_bits_e[25] = {
-    8,  8,  7,  7,  7,  6,  6,  5,  5,  4,  4,  3,  2,  3,  4,  4,
-    5,  5,  6,  6,  7,  7,  7,  8,  8,
-};
-
-static const uint8_t bitalloc_25_bits_f[25] = {
-    10,  9,  8,  7,  6,  5,  5,  4,  4,  4,  4,  4,  3,  3,  4,  4,
-     4,  4,  5,  5,  6,  7,  8,  8, 10,
-};
-
-static const uint8_t bitalloc_25_bits_g[25] = {
-    10, 10,  9,  8,  7,  7,  6,  6,  5,  4,  4,  3,  2,  3,  3,  4,
-     5,  5,  6,  7,  7,  8,  9, 10, 10,
-};
-
-static const uint16_t bitalloc_33_codes_a[33] = {
-    0x1580, 0x1582, 0x0AC2, 0x0562, 0x02B2, 0x015E, 0x00AD, 0x0054,
-    0x001C, 0x003C, 0x000F, 0x001F, 0x0008, 0x000B, 0x000D, 0x0000,
-    0x0002, 0x0001, 0x000E, 0x000C, 0x0009, 0x0006, 0x0014, 0x003D,
-    0x001D, 0x0055, 0x00AE, 0x015F, 0x02B3, 0x0563, 0x0AC3, 0x1583,
-    0x1581,
-};
-
-static const uint16_t bitalloc_33_codes_b[33] = {
-    0x030C, 0x0187, 0x006D, 0x0028, 0x0037, 0x0066, 0x0015, 0x0031,
-    0x0000, 0x000B, 0x0012, 0x001A, 0x0001, 0x0007, 0x000A, 0x000E,
-    0x0001, 0x000F, 0x000B, 0x0008, 0x0004, 0x001B, 0x0013, 0x000C,
-    0x0001, 0x0032, 0x001A, 0x0067, 0x0060, 0x0029, 0x00C2, 0x006C,
-    0x030D,
-};
-
-static const uint16_t bitalloc_33_codes_c[33] = {
-    0x00CC, 0x0067, 0x0005, 0x0070, 0x0003, 0x001A, 0x0039, 0x003F,
-    0x000A, 0x0012, 0x0018, 0x001D, 0x0001, 0x0003, 0x0007, 0x000A,
-    0x000D, 0x000B, 0x0008, 0x0004, 0x0002, 0x001E, 0x0019, 0x0013,
-    0x000B, 0x0000, 0x003E, 0x001B, 0x0018, 0x0071, 0x0032, 0x0004,
-    0x00CD,
-};
-
-static const uint16_t bitalloc_33_codes_d[33] = {
-    0x3AF8, 0x3AFA, 0x1D7E, 0x0EBC, 0x075C, 0x03AC, 0x01D4, 0x0094,
-    0x0014, 0x004B, 0x000B, 0x003B, 0x0013, 0x0003, 0x000F, 0x0005,
-    0x0001, 0x0006, 0x0000, 0x0008, 0x001C, 0x0004, 0x0024, 0x0074,
-    0x0015, 0x0095, 0x01D5, 0x03AD, 0x075D, 0x0EBD, 0x1D7F, 0x3AFB,
-    0x3AF9,
-};
-
-static const uint16_t bitalloc_33_codes_e[33] = {
-    0x01C8, 0x01E6, 0x0064, 0x00E2, 0x00E5, 0x0030, 0x0033, 0x0073,
-    0x007A, 0x001A, 0x003A, 0x0002, 0x001A, 0x001F, 0x0007, 0x0001,
-    0x0002, 0x0002, 0x000C, 0x0000, 0x001B, 0x0003, 0x003B, 0x001B,
-    0x007B, 0x0078, 0x0070, 0x0031, 0x00F2, 0x00E3, 0x0065, 0x01E7,
-    0x01C9,
-};
-
-static const uint16_t bitalloc_33_codes_f[33] = {
-    0x0724, 0x0393, 0x01CE, 0x00E5, 0x002C, 0x0008, 0x0017, 0x003E,
-    0x0005, 0x0014, 0x001D, 0x0000, 0x0003, 0x0006, 0x0008, 0x000B,
-    0x000D, 0x000C, 0x0009, 0x0007, 0x0004, 0x0001, 0x001E, 0x0015,
-    0x000A, 0x003F, 0x0038, 0x0009, 0x002D, 0x00E6, 0x01CF, 0x01C8,
-    0x0725,
-};
-
-static const uint16_t bitalloc_33_codes_g[33] = {
-    0x0284, 0x0042, 0x0140, 0x0143, 0x003E, 0x00BE, 0x0011, 0x0051,
-    0x0009, 0x0029, 0x0005, 0x0015, 0x0000, 0x0008, 0x000E, 0x0002,
-    0x0006, 0x0003, 0x000F, 0x0009, 0x0001, 0x0016, 0x0006, 0x002E,
-    0x000E, 0x005E, 0x001E, 0x00BF, 0x003F, 0x0020, 0x0141, 0x0043,
-    0x0285,
-};
-
-static const uint8_t bitalloc_33_bits_a[33] = {
-    13, 13, 12, 11, 10,  9,  8,  7,  6,  6,  5,  5,  4,  4,  4,  3,
-     3,  3,  4,  4,  4,  4,  5,  6,  6,  7,  8,  9, 10, 11, 12, 13,
-    13,
-};
-
-static const uint8_t bitalloc_33_bits_b[33] = {
-    10,  9,  8,  7,  7,  7,  6,  6,  5,  5,  5,  5,  4,  4,  4,  4,
-     3,  4,  4,  4,  4,  5,  5,  5,  5,  6,  6,  7,  7,  7,  8,  8,
-    10,
-};
-
-static const uint8_t bitalloc_33_bits_c[33] = {
-    9,  8,  7,  7,  6,  6,  6,  6,  5,  5,  5,  5,  4,  4,  4,  4,
-    4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  6,  6,  6,  7,  7,  7,
-    9,
-};
-
-static const uint8_t bitalloc_33_bits_d[33] = {
-    14, 14, 13, 12, 11, 10,  9,  8,  7,  7,  6,  6,  5,  4,  4,  3,
-     2,  3,  3,  4,  5,  5,  6,  7,  7,  8,  9, 10, 11, 12, 13, 14,
-    14,
-};
-
-static const uint8_t bitalloc_33_bits_e[33] = {
-    9,  9,  8,  8,  8,  7,  7,  7,  7,  6,  6,  5,  5,  5,  4,  3,
-    2,  3,  4,  4,  5,  5,  6,  6,  7,  7,  7,  7,  8,  8,  8,  9,
-    9,
-};
-
-static const uint8_t bitalloc_33_bits_f[33] = {
-    11, 10,  9,  8,  7,  6,  6,  6,  5,  5,  5,  4,  4,  4,  4,  4,
-     4,  4,  4,  4,  4,  4,  5,  5,  5,  6,  6,  6,  7,  8,  9,  9,
-    11,
-};
-
-static const uint8_t bitalloc_33_bits_g[33] = {
-    10,  9,  9,  9,  8,  8,  7,  7,  6,  6,  5,  5,  4,  4,  4,  3,
-     3,  3,  4,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  8,  9,  9,
-    10,
-};
-
-static const uint16_t bitalloc_65_codes_a[65] = {
-    0x9E5C, 0x9E5E, 0x4F2C, 0x2794, 0x13C4, 0x1E44, 0x09E3, 0x0F23,
-    0x04F3, 0x0792, 0x027E, 0x03CE, 0x013D, 0x01E5, 0x009C, 0x00CC,
-    0x0040, 0x0058, 0x0067, 0x001E, 0x0021, 0x002D, 0x003D, 0x0007,
-    0x0011, 0x0014, 0x0017, 0x001A, 0x001C, 0x001F, 0x0001, 0x0004,
-    0x0006, 0x0005, 0x0002, 0x0000, 0x001D, 0x001B, 0x0018, 0x0015,
-    0x0012, 0x000E, 0x0006, 0x0032, 0x0026, 0x001F, 0x0078, 0x0059,
-    0x0041, 0x00CD, 0x009D, 0x01E6, 0x013E, 0x03CF, 0x027F, 0x0793,
-    0x0790, 0x04F0, 0x09E4, 0x1E45, 0x13C5, 0x2795, 0x4F2D, 0x9E5F,
-    0x9E5D,
-};
-
-static const uint16_t bitalloc_65_codes_b[65] = {
-    0x0A8C, 0x0547, 0x01B5, 0x0008, 0x00DB, 0x0152, 0x0005, 0x000B,
-    0x008E, 0x00AE, 0x00E4, 0x0003, 0x0037, 0x0039, 0x0055, 0x006C,
-    0x0073, 0x0003, 0x0015, 0x001D, 0x0028, 0x0030, 0x0037, 0x003E,
-    0x0006, 0x000B, 0x000F, 0x0012, 0x0016, 0x0019, 0x001D, 0x0001,
-    0x0004, 0x0002, 0x001E, 0x001A, 0x0017, 0x0013, 0x0010, 0x000C,
-    0x0007, 0x003F, 0x0038, 0x0031, 0x0029, 0x0022, 0x001A, 0x0014,
-    0x0000, 0x006D, 0x0056, 0x0046, 0x0038, 0x0004, 0x00E5, 0x00AF,
-    0x008F, 0x006C, 0x000A, 0x0153, 0x0150, 0x0009, 0x02A2, 0x01B4,
-    0x0A8D,
-};
-
-static const uint16_t bitalloc_65_codes_c[65] = {
-    0x045C, 0x022F, 0x03F5, 0x01BC, 0x01FB, 0x0059, 0x00D0, 0x00DF,
-    0x000A, 0x002D, 0x002F, 0x0052, 0x0069, 0x0078, 0x007F, 0x000A,
-    0x0010, 0x001C, 0x0023, 0x002A, 0x0035, 0x003A, 0x003D, 0x0000,
-    0x0003, 0x0006, 0x0009, 0x000C, 0x000F, 0x0012, 0x0016, 0x0018,
-    0x001C, 0x0019, 0x0017, 0x0013, 0x0010, 0x000D, 0x000A, 0x0007,
-    0x0004, 0x0001, 0x003E, 0x003B, 0x0036, 0x002B, 0x0028, 0x001D,
-    0x0011, 0x000B, 0x0004, 0x0079, 0x006E, 0x0053, 0x0044, 0x002E,
-    0x000B, 0x00FC, 0x00D1, 0x008A, 0x0058, 0x01BD, 0x0116, 0x03F4,
-    0x045D,
-};
-
-static const uint16_t bitalloc_65_codes_d[65] = {
-    0x70B0, 0x70B2, 0x70B4, 0x2852, 0x385B, 0x142E, 0x1C2E, 0x0A15,
-    0x0E14, 0x0214, 0x0704, 0x0104, 0x010B, 0x0383, 0x0083, 0x0143,
-    0x01C3, 0x0043, 0x00A2, 0x00E2, 0x0022, 0x0052, 0x0072, 0x0012,
-    0x002A, 0x003A, 0x000A, 0x0016, 0x001E, 0x0006, 0x000C, 0x0000,
-    0x0004, 0x0001, 0x000D, 0x0007, 0x001F, 0x0017, 0x000B, 0x003B,
-    0x002B, 0x0013, 0x0073, 0x0053, 0x0023, 0x00E3, 0x00A3, 0x00A0,
-    0x0040, 0x01C0, 0x0084, 0x0384, 0x0284, 0x0105, 0x0705, 0x0215,
-    0x0E15, 0x0A16, 0x1C2F, 0x142F, 0x1428, 0x2853, 0x70B5, 0x70B3,
-    0x70B1,
-};
-
-static const uint16_t bitalloc_65_codes_e[65] = {
-    0x032C, 0x0332, 0x0378, 0x037E, 0x008C, 0x014A, 0x0188, 0x0197,
-    0x019E, 0x01BD, 0x0044, 0x0047, 0x00AA, 0x00C5, 0x00CD, 0x00DC,
-    0x001C, 0x002C, 0x0053, 0x0063, 0x0068, 0x0008, 0x000F, 0x0017,
-    0x002B, 0x0035, 0x0005, 0x0009, 0x0016, 0x001C, 0x0006, 0x000F,
-    0x0004, 0x0000, 0x0007, 0x001D, 0x0017, 0x000A, 0x0006, 0x0036,
-    0x0030, 0x0028, 0x0010, 0x0009, 0x0069, 0x0064, 0x0054, 0x002D,
-    0x001D, 0x00DD, 0x00CE, 0x00CA, 0x00AB, 0x00A4, 0x0045, 0x01BE,
-    0x019F, 0x0198, 0x0189, 0x014B, 0x008D, 0x037F, 0x0379, 0x0333,
-    0x032D,
-};
-
-static const uint16_t bitalloc_65_codes_f[65] = {
-    0x0FE0, 0x0FE2, 0x0FE8, 0x0FEA, 0x0FEC, 0x0FEE, 0x0FF0, 0x0FF2,
-    0x0FF4, 0x2FF2, 0x07F2, 0x07FB, 0x03F6, 0x0BFA, 0x0BFD, 0x01FF,
-    0x05FF, 0x02FC, 0x007C, 0x017C, 0x003C, 0x00BC, 0x001C, 0x005C,
-    0x000C, 0x002C, 0x0004, 0x0014, 0x0000, 0x0008, 0x000E, 0x0002,
-    0x0006, 0x0003, 0x000F, 0x0009, 0x0001, 0x0015, 0x0005, 0x002D,
-    0x000D, 0x005D, 0x001D, 0x00BD, 0x003D, 0x017D, 0x007D, 0x02FD,
-    0x00FC, 0x05FC, 0x01FA, 0x0BFB, 0x03F7, 0x17F8, 0x07F3, 0x2FF3,
-    0x0FF5, 0x0FF3, 0x0FF1, 0x0FEF, 0x0FED, 0x0FEB, 0x0FE9, 0x0FE3,
-    0x0FE1,
-};
-
-static const uint16_t bitalloc_65_codes_g[65] = {
-    0x010C, 0x038A, 0x0608, 0x0786, 0x0084, 0x0087, 0x0302, 0x0305,
-    0x0040, 0x00E0, 0x00E3, 0x0183, 0x001E, 0x005E, 0x009E, 0x00DE,
-    0x00F1, 0x0011, 0x0039, 0x0061, 0x0079, 0x0009, 0x001D, 0x0031,
-    0x003D, 0x0005, 0x000F, 0x0019, 0x001F, 0x0003, 0x0006, 0x000A,
-    0x000E, 0x000B, 0x0008, 0x0004, 0x0000, 0x001A, 0x0012, 0x000A,
-    0x0002, 0x0036, 0x0026, 0x0016, 0x0006, 0x006E, 0x004E, 0x002E,
-    0x000E, 0x00DF, 0x009F, 0x005F, 0x001F, 0x01E0, 0x0180, 0x00E1,
-    0x0041, 0x03C2, 0x0303, 0x01C4, 0x0085, 0x0787, 0x0609, 0x038B,
-    0x010D,
-};
-
-static const uint8_t bitalloc_65_bits_a[65] = {
-    16, 16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10,  9,  9,  8,  8,
-     7,  7,  7,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,  5,  4,  4,
-     4,  4,  4,  4,  5,  5,  5,  5,  5,  5,  5,  6,  6,  6,  7,  7,
-     7,  8,  8,  9,  9, 10, 10, 11, 11, 11, 12, 13, 13, 14, 15, 16,
-    16,
-};
-
-static const uint8_t bitalloc_65_bits_b[65] = {
-    12, 11, 10,  9,  9,  9,  8,  8,  8,  8,  8,  7,  7,  7,  7,  7,
-     7,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,  5,  4,
-     4,  4,  5,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,
-     6,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  9,  9,  9, 10, 10,
-    12,
-};
-
-static const uint8_t bitalloc_65_bits_c[65] = {
-    11, 10, 10,  9,  9,  8,  8,  8,  7,  7,  7,  7,  7,  7,  7,  6,
-     6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,  5,  5,  5,
-     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,
-     6,  6,  6,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  9,  9, 10,
-    11,
-};
-
-static const uint8_t bitalloc_65_bits_d[65] = {
-    15, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10,  9,  9,
-     9,  8,  8,  8,  7,  7,  7,  6,  6,  6,  5,  5,  5,  4,  4,  3,
-     3,  3,  4,  4,  5,  5,  5,  6,  6,  6,  7,  7,  7,  8,  8,  8,
-     8,  9,  9, 10, 10, 10, 11, 11, 12, 12, 13, 13, 13, 14, 15, 15,
-    15,
-};
-
-static const uint8_t bitalloc_65_bits_e[65] = {
-    10, 10, 10, 10,  9,  9,  9,  9,  9,  9,  8,  8,  8,  8,  8,  8,
-     7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  5,  5,  5,  5,  4,  4,
-     3,  3,  4,  5,  5,  5,  5,  6,  6,  6,  6,  6,  7,  7,  7,  7,
-     7,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9, 10, 10, 10,
-    10,
-};
-
-static const uint8_t bitalloc_65_bits_f[65] = {
-    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13, 12, 12, 12, 11,
-    11, 10,  9,  9,  8,  8,  7,  7,  6,  6,  5,  5,  4,  4,  4,  3,
-     3,  3,  4,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10,
-    10, 11, 11, 12, 12, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14,
-    14,
-};
-
-static const uint8_t bitalloc_65_bits_g[65] = {
-    11, 11, 11, 11, 10, 10, 10, 10,  9,  9,  9,  9,  8,  8,  8,  8,
-     8,  7,  7,  7,  7,  6,  6,  6,  6,  5,  5,  5,  5,  4,  4,  4,
-     4,  4,  4,  4,  4,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,
-     7,  8,  8,  8,  8,  9,  9,  9,  9, 10, 10, 10, 10, 11, 11, 11,
-    11,
-};
-
-static const uint16_t bitalloc_129_codes_a[129] = {
-    0x0660, 0x0666, 0x06EC, 0x0722, 0x0760, 0x076E, 0x004C, 0x004E,
-    0x00F4, 0x010A, 0x0148, 0x0156, 0x01D4, 0x01F2, 0x0331, 0x0370,
-    0x0377, 0x0396, 0x03B1, 0x0024, 0x0064, 0x007B, 0x008A, 0x00A5,
-    0x00D4, 0x00EB, 0x00FA, 0x019A, 0x01B9, 0x01C9, 0x01D9, 0x0010,
-    0x0030, 0x0033, 0x0043, 0x0053, 0x006B, 0x007A, 0x00CA, 0x00D2,
-    0x00DE, 0x00E6, 0x00F6, 0x000E, 0x001F, 0x0023, 0x002B, 0x003B,
-    0x003F, 0x0067, 0x0070, 0x0077, 0x0005, 0x000D, 0x0012, 0x001B,
-    0x002C, 0x0035, 0x003A, 0x0004, 0x000B, 0x0017, 0x001F, 0x0009,
-    0x0008, 0x000A, 0x0000, 0x0018, 0x000C, 0x0005, 0x003C, 0x0036,
-    0x002D, 0x001C, 0x0013, 0x000E, 0x0006, 0x007A, 0x0071, 0x0068,
-    0x0064, 0x003C, 0x0034, 0x0028, 0x0020, 0x000F, 0x00F7, 0x00E7,
-    0x00DF, 0x00D3, 0x00CB, 0x007B, 0x0074, 0x0054, 0x0044, 0x003C,
-    0x0031, 0x0011, 0x01DA, 0x01CA, 0x01BA, 0x019B, 0x00FB, 0x00F8,
-    0x00D5, 0x00AA, 0x008B, 0x0084, 0x0065, 0x0025, 0x03B6, 0x0397,
-    0x0390, 0x0371, 0x0332, 0x01F3, 0x01D5, 0x0157, 0x0149, 0x010B,
-    0x00F5, 0x004F, 0x004D, 0x076F, 0x0761, 0x0723, 0x06ED, 0x0667,
-    0x0661,
-};
-
-static const uint16_t bitalloc_129_codes_b[129] = {
-    0x29DC, 0x14EF, 0x0455, 0x0E9C, 0x022B, 0x0489, 0x0740, 0x074F,
-    0x0172, 0x0245, 0x0247, 0x030A, 0x03A1, 0x001C, 0x008B, 0x00D6,
-    0x010C, 0x0148, 0x014F, 0x0186, 0x01D1, 0x0008, 0x000F, 0x0046,
-    0x005D, 0x0078, 0x0087, 0x0096, 0x00A5, 0x00BC, 0x00D8, 0x00DE,
-    0x00F6, 0x0005, 0x0014, 0x0024, 0x002F, 0x003A, 0x003D, 0x0049,
-    0x0050, 0x0058, 0x005F, 0x0066, 0x006D, 0x0075, 0x007C, 0x0004,
-    0x000B, 0x0013, 0x0018, 0x001B, 0x001F, 0x0022, 0x0026, 0x002A,
-    0x002D, 0x0031, 0x0034, 0x0038, 0x003B, 0x003F, 0x0003, 0x0006,
-    0x000A, 0x0007, 0x0004, 0x0000, 0x003C, 0x0039, 0x0035, 0x0032,
-    0x002E, 0x002B, 0x0027, 0x0023, 0x0020, 0x001C, 0x0019, 0x0016,
-    0x0010, 0x0005, 0x007D, 0x007A, 0x006E, 0x0067, 0x0060, 0x0059,
-    0x0051, 0x004A, 0x0042, 0x003B, 0x0034, 0x0025, 0x0015, 0x0006,
-    0x00F7, 0x00DF, 0x00D9, 0x00BD, 0x00A6, 0x0097, 0x0090, 0x0079,
-    0x006A, 0x0047, 0x0044, 0x0009, 0x01D2, 0x0187, 0x0184, 0x0149,
-    0x010D, 0x00D7, 0x00B8, 0x001D, 0x03A6, 0x030B, 0x029C, 0x0246,
-    0x0173, 0x0114, 0x0741, 0x053A, 0x0488, 0x0E9D, 0x0A76, 0x0454,
-    0x29DD,
-};
-
-static const uint16_t bitalloc_129_codes_c[129] = {
-    0x0E5C, 0x072F, 0x001D, 0x0724, 0x000F, 0x010D, 0x0324, 0x0393,
-    0x03E9, 0x0080, 0x0087, 0x00FA, 0x0164, 0x0193, 0x01DE, 0x01F5,
-    0x0010, 0x002A, 0x0041, 0x0064, 0x0073, 0x008E, 0x00A4, 0x00B3,
-    0x00D6, 0x00E5, 0x00F4, 0x00FB, 0x0002, 0x0009, 0x0013, 0x001E,
-    0x0026, 0x002C, 0x0033, 0x003F, 0x0041, 0x004C, 0x0053, 0x005E,
-    0x0065, 0x0070, 0x0073, 0x0078, 0x007B, 0x007E, 0x0002, 0x0005,
-    0x0007, 0x000B, 0x000D, 0x0011, 0x0014, 0x0017, 0x001A, 0x001D,
-    0x0021, 0x0024, 0x0027, 0x002A, 0x002D, 0x0030, 0x0033, 0x0036,
-    0x003A, 0x0037, 0x0034, 0x0031, 0x002E, 0x002B, 0x0028, 0x0025,
-    0x0022, 0x001E, 0x001B, 0x0018, 0x0015, 0x0012, 0x000E, 0x000C,
-    0x0008, 0x0006, 0x0003, 0x007F, 0x007C, 0x0079, 0x0076, 0x0071,
-    0x006A, 0x005F, 0x0058, 0x004D, 0x0046, 0x0040, 0x0038, 0x002D,
-    0x0027, 0x001F, 0x0014, 0x0012, 0x0003, 0x0000, 0x00F5, 0x00EE,
-    0x00D7, 0x00C8, 0x00A5, 0x008F, 0x007C, 0x0065, 0x0042, 0x002B,
-    0x0011, 0x0002, 0x01DF, 0x01C8, 0x0165, 0x00FB, 0x00E4, 0x0081,
-    0x0006, 0x03E8, 0x0325, 0x01CA, 0x010C, 0x0725, 0x0396, 0x001C,
-    0x0E5D,
-};
-
-static const uint16_t bitalloc_129_codes_d[129] = {
-    0xA598, 0xA59A, 0xA59C, 0xA59E, 0xC598, 0xE586, 0x3ACC, 0x52CA,
-    0x62CD, 0x0D48, 0x1D67, 0x2978, 0x3167, 0x3966, 0x06A5, 0x0EBC,
-    0x14BD, 0x1CB1, 0x0350, 0x0353, 0x075F, 0x0A5F, 0x0C5E, 0x0E5E,
-    0x01AE, 0x03AD, 0x052D, 0x062D, 0x072D, 0x00D5, 0x01D4, 0x0294,
-    0x0314, 0x0394, 0x0014, 0x0094, 0x0114, 0x0174, 0x01B4, 0x01F4,
-    0x000B, 0x004B, 0x008B, 0x00BB, 0x00DB, 0x00FB, 0x001B, 0x003B,
-    0x0053, 0x0063, 0x0073, 0x0003, 0x0013, 0x0023, 0x002F, 0x0037,
-    0x003F, 0x0007, 0x000F, 0x0015, 0x0019, 0x001D, 0x0001, 0x0005,
-    0x0009, 0x0006, 0x0002, 0x001E, 0x001A, 0x0016, 0x0010, 0x0008,
-    0x0000, 0x0038, 0x0030, 0x0028, 0x001C, 0x000C, 0x007C, 0x006C,
-    0x005C, 0x0044, 0x0024, 0x0004, 0x00E4, 0x00C4, 0x00A4, 0x0074,
-    0x0034, 0x01F5, 0x01B5, 0x0175, 0x0115, 0x0095, 0x0015, 0x0395,
-    0x0315, 0x0295, 0x01D5, 0x00D6, 0x072E, 0x062E, 0x052E, 0x03AE,
-    0x01AF, 0x0E5F, 0x0C5F, 0x0C58, 0x0A58, 0x0758, 0x0351, 0x1CB2,
-    0x18B2, 0x0EBD, 0x0EB2, 0x3967, 0x3960, 0x2979, 0x2964, 0x0D49,
-    0x72C2, 0x52CB, 0x3ACD, 0xE587, 0xC599, 0xA59F, 0xA59D, 0xA59B,
-    0xA599,
-};
-
-static const uint16_t bitalloc_129_codes_e[129] = {
-    0xA13C, 0xC720, 0xA13F, 0xA13E, 0xA13D, 0xE722, 0x5090, 0x6393,
-    0x7392, 0x2849, 0x31CE, 0x39CE, 0x1425, 0x18E5, 0x1CE5, 0x0844,
-    0x0A1C, 0x0C7C, 0x036C, 0x0423, 0x050F, 0x063F, 0x01B7, 0x0216,
-    0x0285, 0x031D, 0x039D, 0x0109, 0x0140, 0x0180, 0x01C8, 0x01CF,
-    0x007A, 0x008A, 0x00A2, 0x00C1, 0x00E5, 0x0014, 0x0037, 0x0043,
-    0x004E, 0x0056, 0x0061, 0x006C, 0x007C, 0x000B, 0x001C, 0x001F,
-    0x0023, 0x0025, 0x0029, 0x002C, 0x002E, 0x0032, 0x0034, 0x0037,
-    0x003A, 0x003C, 0x003F, 0x0001, 0x0003, 0x0006, 0x0008, 0x000A,
-    0x000C, 0x000B, 0x0009, 0x0007, 0x0004, 0x0002, 0x0000, 0x003D,
-    0x003B, 0x0038, 0x0035, 0x0033, 0x002F, 0x002D, 0x002A, 0x0026,
-    0x0024, 0x0020, 0x001D, 0x001A, 0x007D, 0x006D, 0x0062, 0x0057,
-    0x004F, 0x0044, 0x003C, 0x0015, 0x00E6, 0x00C6, 0x00A3, 0x008B,
-    0x007B, 0x006C, 0x01C9, 0x0181, 0x0141, 0x010A, 0x00DA, 0x031E,
-    0x0286, 0x0217, 0x0210, 0x0738, 0x0638, 0x0508, 0x036D, 0x0C7D,
-    0x0A1D, 0x0845, 0x1CE6, 0x18E6, 0x1426, 0x39CF, 0x31CF, 0x284E,
-    0x7393, 0x7390, 0x5091, 0xE723, 0xC724, 0xC725, 0xC722, 0xC723,
-    0xC721,
-};
-
-static const uint16_t bitalloc_129_codes_f[129] = {
-    0x762C, 0x3B17, 0x1555, 0x0608, 0x0AAB, 0x0FF2, 0x0305, 0x0307,
-    0x0763, 0x0046, 0x010C, 0x01BC, 0x02AB, 0x03B6, 0x03FD, 0x0080,
-    0x0087, 0x00DF, 0x0156, 0x01D9, 0x01F8, 0x01FF, 0x002A, 0x0041,
-    0x0061, 0x0094, 0x00D4, 0x00EA, 0x00F2, 0x00FD, 0x0009, 0x000B,
-    0x001A, 0x0026, 0x0031, 0x0040, 0x004B, 0x006B, 0x0073, 0x0077,
-    0x007A, 0x007C, 0x0000, 0x0002, 0x0006, 0x0008, 0x000B, 0x000E,
-    0x0011, 0x0014, 0x0016, 0x0019, 0x001C, 0x001E, 0x0021, 0x0023,
-    0x0026, 0x0028, 0x002B, 0x002D, 0x002F, 0x0031, 0x0033, 0x0036,
-    0x0038, 0x0037, 0x0034, 0x0032, 0x0030, 0x002E, 0x002C, 0x0029,
-    0x0027, 0x0024, 0x0022, 0x001F, 0x001D, 0x001A, 0x0017, 0x0015,
-    0x0012, 0x000F, 0x000C, 0x0009, 0x0007, 0x0003, 0x0001, 0x007D,
-    0x007B, 0x0078, 0x0074, 0x0072, 0x0054, 0x0041, 0x0036, 0x0027,
-    0x001B, 0x0014, 0x000A, 0x00FE, 0x00F3, 0x00EB, 0x00D5, 0x0095,
-    0x006E, 0x0042, 0x002B, 0x0010, 0x01F9, 0x01DA, 0x0157, 0x0154,
-    0x00C0, 0x0081, 0x0022, 0x03B7, 0x03B0, 0x01BD, 0x010D, 0x0047,
-    0x07F8, 0x0554, 0x0306, 0x0FF3, 0x0EC4, 0x0609, 0x1D8A, 0x1554,
-    0x762D,
-};
-
-static const uint16_t bitalloc_129_codes_g[129] = {
-    0x1E20, 0x1E5E, 0x031C, 0x051A, 0x0718, 0x0916, 0x0B14, 0x0D12,
-    0x0F11, 0x0090, 0x018F, 0x028E, 0x038D, 0x048C, 0x058B, 0x068A,
-    0x0789, 0x0049, 0x00C8, 0x0148, 0x01C7, 0x0247, 0x02C6, 0x0346,
-    0x03C5, 0x0025, 0x0065, 0x00A5, 0x00E4, 0x0124, 0x0164, 0x01A4,
-    0x01E3, 0x0013, 0x0033, 0x0053, 0x0073, 0x0093, 0x00B3, 0x00D3,
-    0x00F3, 0x000A, 0x001A, 0x002A, 0x003A, 0x004A, 0x005A, 0x006A,
-    0x007A, 0x0006, 0x000E, 0x0016, 0x001E, 0x0026, 0x002E, 0x0036,
-    0x003E, 0x0004, 0x0008, 0x000C, 0x0010, 0x0014, 0x0018, 0x001C,
-    0x0000, 0x001D, 0x0019, 0x0015, 0x0011, 0x000D, 0x0009, 0x0005,
-    0x003F, 0x0037, 0x002F, 0x0027, 0x001F, 0x0017, 0x000F, 0x0007,
-    0x007B, 0x006B, 0x005B, 0x004B, 0x003B, 0x002B, 0x001B, 0x000B,
-    0x0008, 0x00F0, 0x00D0, 0x00B0, 0x0090, 0x0070, 0x0050, 0x0030,
-    0x01E4, 0x01A5, 0x0165, 0x0125, 0x00E5, 0x00E2, 0x00A2, 0x0062,
-    0x03CA, 0x0347, 0x02C7, 0x02C4, 0x0244, 0x0149, 0x00C9, 0x00C6,
-    0x0796, 0x068B, 0x0688, 0x048D, 0x048A, 0x028F, 0x028C, 0x0091,
-    0x0F2E, 0x0D13, 0x0B15, 0x0917, 0x0719, 0x051B, 0x031D, 0x1E5F,
-    0x1E21,
-};
-
-static const uint8_t bitalloc_129_bits_a[129] = {
-    11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
-    10, 10, 10,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  8,
-     8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,  7,  7,  7,
-     7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  4,
-     4,  4,  4,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  7,  7,  7,
-     7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-     8,  8,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10,
-    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11,
-    11,
-};
-
-static const uint8_t bitalloc_129_bits_b[129] = {
-    14, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10,  9,  9,  9,
-     9,  9,  9,  9,  9,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-     8,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  6,
-     6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,
-     5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
-     6,  6,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
-     8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,
-     9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 11, 11, 11, 12, 12, 12,
-    14,
-};
-
-static const uint8_t bitalloc_129_bits_c[129] = {
-    13, 12, 11, 11, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,  9,  9,
-     8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,  7,  7,
-     7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  6,  6,
-     6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
-     6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
-     6,  6,  6,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
-     7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-     8,  8,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 11, 11, 11,
-    13,
-};
-
-static const uint8_t bitalloc_129_bits_d[129] = {
-    16, 16, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13,
-    13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10,
-    10, 10,  9,  9,  9,  9,  9,  9,  8,  8,  8,  8,  8,  8,  7,  7,
-     7,  7,  7,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  4,  4,
-     4,  4,  4,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  7,  7,
-     7,  7,  7,  7,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9, 10,
-    10, 10, 10, 10, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 13,
-    13, 13, 13, 14, 14, 14, 14, 14, 15, 15, 15, 16, 16, 16, 16, 16,
-    16,
-};
-
-static const uint8_t bitalloc_129_bits_e[129] = {
-    16, 16, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 13, 13, 13, 12,
-    12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,
-     8,  8,  8,  8,  8,  7,  7,  7,  7,  7,  7,  7,  7,  6,  6,  6,
-     6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,
-     5,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,  6,
-     6,  6,  6,  6,  7,  7,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,
-     8,  8,  9,  9,  9,  9,  9, 10, 10, 10, 10, 11, 11, 11, 11, 12,
-    12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 15, 16, 16, 16, 16, 16,
-    16,
-};
-
-static const uint8_t bitalloc_129_bits_f[129] = {
-    15, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10,  9,
-     9,  9,  9,  9,  9,  9,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,
-     7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,
-     6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
-     6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
-     6,  6,  6,  6,  6,  6,  6,  7,  7,  7,  7,  7,  7,  7,  7,  7,
-     7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,
-     9,  9,  9, 10, 10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13,
-    15,
-};
-
-static const uint8_t bitalloc_129_bits_g[129] = {
-    13, 13, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11,
-    11, 10, 10, 10, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,  9,  9,
-     9,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,  7,  7,  7,  7,  7,
-     7,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,  5,
-     4,  5,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,
-     7,  7,  7,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,
-     9,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10,
-    11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 13,
-    13,
-};
-
-static const uint8_t bitalloc_sizes[10] = {
-    3, 5, 7, 9, 13, 17, 25, 33, 65, 129
-};
-
-static const int8_t bitalloc_offsets[10] = {
-    -1, -2, -3, -4, -6, -8, -12, -16, -32, -64
-};
-
-static const uint8_t bitalloc_maxbits[10][7] = {
-    { 2 },
-    { 4, 3, 3 },
-    { 5, 5, 4 },
-    { 6, 5, 6 },
-    { 7, 6, 5 },
-    { 9, 8, 7, 9, 8, 8, 8 },
-    { 9, 9, 8, 9, 8, 9, 9 },
-    { 9, 9, 9, 9, 9, 9, 9 },
-    { 9, 9, 9, 9, 9, 9, 9 },
-    { 9, 9, 9, 9, 9, 9, 9 }
-};
-
-static const uint16_t *const bitalloc_codes[10][8] = {
-    { bitalloc_3_codes,     NULL },
-    { bitalloc_5_codes_a,   bitalloc_5_codes_b,   bitalloc_5_codes_c,   NULL },
-    { bitalloc_7_codes_a,   bitalloc_7_codes_b,   bitalloc_7_codes_c,   NULL },
-    { bitalloc_9_codes_a,   bitalloc_9_codes_b,   bitalloc_9_codes_c,   NULL },
-    { bitalloc_13_codes_a,  bitalloc_13_codes_b,  bitalloc_13_codes_c,  NULL },
-    { bitalloc_17_codes_a,  bitalloc_17_codes_b,  bitalloc_17_codes_c,  bitalloc_17_codes_d,
-      bitalloc_17_codes_e,  bitalloc_17_codes_f,  bitalloc_17_codes_g,  NULL },
-    { bitalloc_25_codes_a,  bitalloc_25_codes_b,  bitalloc_25_codes_c,  bitalloc_25_codes_d,
-      bitalloc_25_codes_e,  bitalloc_25_codes_f,  bitalloc_25_codes_g,  NULL },
-    { bitalloc_33_codes_a,  bitalloc_33_codes_b,  bitalloc_33_codes_c,  bitalloc_33_codes_d,
-      bitalloc_33_codes_e,  bitalloc_33_codes_f,  bitalloc_33_codes_g,  NULL },
-    { bitalloc_65_codes_a,  bitalloc_65_codes_b,  bitalloc_65_codes_c,  bitalloc_65_codes_d,
-      bitalloc_65_codes_e,  bitalloc_65_codes_f,  bitalloc_65_codes_g,  NULL },
-    { bitalloc_129_codes_a, bitalloc_129_codes_b, bitalloc_129_codes_c, bitalloc_129_codes_d,
-      bitalloc_129_codes_e, bitalloc_129_codes_f, bitalloc_129_codes_g, NULL }
-};
-
-static const uint8_t *const bitalloc_bits[10][8] = {
-    { bitalloc_3_bits,     NULL },
-    { bitalloc_5_bits_a,   bitalloc_5_bits_b,   bitalloc_5_bits_c,   NULL },
-    { bitalloc_7_bits_a,   bitalloc_7_bits_b,   bitalloc_7_bits_c,   NULL },
-    { bitalloc_9_bits_a,   bitalloc_9_bits_b,   bitalloc_9_bits_c,   NULL },
-    { bitalloc_13_bits_a,  bitalloc_13_bits_b,  bitalloc_13_bits_c,  NULL },
-    { bitalloc_17_bits_a,  bitalloc_17_bits_b,  bitalloc_17_bits_c,  bitalloc_17_bits_d,
-      bitalloc_17_bits_e,  bitalloc_17_bits_f,  bitalloc_17_bits_g,  NULL },
-    { bitalloc_25_bits_a,  bitalloc_25_bits_b,  bitalloc_25_bits_c,  bitalloc_25_bits_d,
-      bitalloc_25_bits_e,  bitalloc_25_bits_f,  bitalloc_25_bits_g,  NULL },
-    { bitalloc_33_bits_a,  bitalloc_33_bits_b,  bitalloc_33_bits_c,  bitalloc_33_bits_d,
-      bitalloc_33_bits_e,  bitalloc_33_bits_f,  bitalloc_33_bits_g,  NULL },
-    { bitalloc_65_bits_a,  bitalloc_65_bits_b,  bitalloc_65_bits_c,  bitalloc_65_bits_d,
-      bitalloc_65_bits_e,  bitalloc_65_bits_f,  bitalloc_65_bits_g,  NULL },
-    { bitalloc_129_bits_a, bitalloc_129_bits_b, bitalloc_129_bits_c, bitalloc_129_bits_d,
-      bitalloc_129_bits_e, bitalloc_129_bits_f, bitalloc_129_bits_g, NULL }
-};
+#include "libavutil/common.h"
+
+#include "avcodec.h"
+#include "get_bits.h"
+
+#define DCA_CODE_BOOKS      10
+
+typedef struct DCAVLC { /* Seven VLC tables bundled with a value offset and a get_vlc2() depth */
+    int offset;         ///< Code values offset
+    int max_depth;      ///< Parameter for get_vlc2()
+    VLC vlc[7];         ///< Actual codes
+} DCAVLC;
+
+extern DCAVLC   ff_dca_vlc_bit_allocation;
+extern DCAVLC   ff_dca_vlc_transition_mode;
+extern DCAVLC   ff_dca_vlc_scale_factor;
+extern DCAVLC   ff_dca_vlc_quant_index[DCA_CODE_BOOKS];
+
+extern VLC  ff_dca_vlc_tnl_grp[5];
+extern VLC  ff_dca_vlc_tnl_scf;
+extern VLC  ff_dca_vlc_damp;
+extern VLC  ff_dca_vlc_dph;
+extern VLC  ff_dca_vlc_fst_rsd_amp;
+extern VLC  ff_dca_vlc_rsd_apprx;
+extern VLC  ff_dca_vlc_rsd_amp;
+extern VLC  ff_dca_vlc_avg_g3;
+extern VLC  ff_dca_vlc_st_grid;
+extern VLC  ff_dca_vlc_grid_2;
+extern VLC  ff_dca_vlc_grid_3;
+extern VLC  ff_dca_vlc_rsd;
+
+av_cold void ff_dca_init_vlcs(void);
 
 #endif /* AVCODEC_DCAHUFF_H */
diff --git a/libavcodec/dcamath.h b/libavcodec/dcamath.h
index e21eb07..e0d6f4f 100644
--- a/libavcodec/dcamath.h
+++ b/libavcodec/dcamath.h
@@ -1,31 +1,30 @@
 /*
- * This file is part of Libav.
+ * Copyright (C) 2016 foo86
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "libavutil/common.h"
-
+#ifndef AVCODEC_DCAMATH_H
+#define AVCODEC_DCAMATH_H
 
-// clip a signed integer into the (-2^23), (2^23-1) range
-static inline int dca_clip23(int a)
-{
-    return av_clip_intp2(a, 23);
-}
+#include "libavutil/common.h"
+#include "libavutil/intmath.h"
 
-static inline int32_t dca_norm(int64_t a, int bits)
+static inline int32_t norm__(int64_t a, int bits)
 {
     if (bits > 0)
         return (int32_t)((a + (INT64_C(1) << (bits - 1))) >> bits);
@@ -33,10 +32,24 @@ static inline int32_t dca_norm(int64_t a, int bits)
         return (int32_t)a;
 }
 
-static inline int64_t dca_round(int64_t a, int bits)
+static inline int32_t mul__(int32_t a, int32_t b, int bits)
 {
-    if (bits > 0)
-        return (a + (INT64_C(1) << (bits - 1))) & ~((INT64_C(1) << bits) - 1);
-    else
-        return a;
+    return norm__((int64_t)a * b, bits); /* widen a to 64 bits before multiplying; norm__() then adds 1 << (bits - 1) and shifts right by bits when bits > 0 */
 }
+/* norm__() with the shift count fixed: for bits > 0 it adds 1 << (bits - 1), shifts right by bits and truncates to int32_t. */
+static inline int32_t norm13(int64_t a) { return norm__(a, 13); }
+static inline int32_t norm16(int64_t a) { return norm__(a, 16); }
+static inline int32_t norm20(int64_t a) { return norm__(a, 20); }
+static inline int32_t norm21(int64_t a) { return norm__(a, 21); }
+static inline int32_t norm23(int64_t a) { return norm__(a, 23); }
+/* mul__() with the shift count fixed: 64-bit product of a and b, reduced by norm__() with that shift. */
+static inline int32_t mul15(int32_t a, int32_t b) { return mul__(a, b, 15); }
+static inline int32_t mul16(int32_t a, int32_t b) { return mul__(a, b, 16); }
+static inline int32_t mul17(int32_t a, int32_t b) { return mul__(a, b, 17); }
+static inline int32_t mul22(int32_t a, int32_t b) { return mul__(a, b, 22); }
+static inline int32_t mul23(int32_t a, int32_t b) { return mul__(a, b, 23); }
+static inline int32_t mul31(int32_t a, int32_t b) { return mul__(a, b, 31); }
+/* Clip a signed integer into the [-2^23, 2^23 - 1] range. */
+static inline int32_t clip23(int32_t a) { return av_clip_intp2(a, 23); }
+
+#endif
diff --git a/libavcodec/dct-test.c b/libavcodec/dct-test.c
index 262319e..df1b75d 100644
--- a/libavcodec/dct-test.c
+++ b/libavcodec/dct-test.c
@@ -2,20 +2,20 @@
  * (c) 2001 Fabrice Bellard
  *     2007 Marc Hoffman <marc.hoffman@analog.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -65,10 +65,26 @@ static const struct algo fdct_tab[] = {
 #endif /* CONFIG_FAANDCT */
 };
 
+static void ff_prores_idct_wrap(int16_t *dst){ /* adapter so ff_prores_idct() can be listed in idct_tab, whose functions are called with only the block pointer */
+    LOCAL_ALIGNED(16, int16_t, qmat, [64]);
+    int i;
+    /* Fill the 64-entry matrix handed to ff_prores_idct() with the constant 4. */
+    for(i=0; i<64; i++){
+        qmat[i]=4;
+    }
+    ff_prores_idct(dst, qmat);
+    for(i=0; i<64; i++) {
+         dst[i] -= 512; /* NOTE(review): presumably removes an offset applied by ff_prores_idct() so the output is comparable with the reference IDCT -- confirm */
+    }
+}
+
 static const struct algo idct_tab[] = {
     { "REF-DBL",     ff_ref_idct,          FF_IDCT_PERM_NONE },
     { "INT",         ff_j_rev_dct,         FF_IDCT_PERM_LIBMPEG2 },
     { "SIMPLE-C",    ff_simple_idct_8,     FF_IDCT_PERM_NONE },
+    { "SIMPLE-C10",  ff_simple_idct_10,    FF_IDCT_PERM_NONE },
+    { "SIMPLE-C12",  ff_simple_idct_12,    FF_IDCT_PERM_NONE, 0, 1 },
+    { "PR-C",        ff_prores_idct_wrap,  FF_IDCT_PERM_NONE, 0, 1 },
 #if CONFIG_FAANIDCT
     { "FAANI",       ff_faanidct,          FF_IDCT_PERM_NONE },
 #endif /* CONFIG_FAANIDCT */
@@ -96,7 +112,7 @@ static const struct algo idct_tab_arch[] = { { 0 } };
 DECLARE_ALIGNED(16, static int16_t, block)[64];
 DECLARE_ALIGNED(8,  static int16_t, block1)[64];
 
-static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng)
+static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng, int vals)
 {
     int i, j;
 
@@ -105,7 +121,7 @@ static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng)
     switch (test) {
     case 0:
         for (i = 0; i < 64; i++)
-            block[i] = (av_lfg_get(prng) % 512) - 256;
+            block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
         if (is_idct) {
             ff_ref_fdct(block);
             for (i = 0; i < 64; i++)
@@ -114,11 +130,13 @@ static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng)
         break;
     case 1:
         j = av_lfg_get(prng) % 10 + 1;
-        for (i = 0; i < j; i++)
-            block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % 512 - 256;
+        for (i = 0; i < j; i++) {
+            int idx = av_lfg_get(prng) % 64;
+            block[idx] = av_lfg_get(prng) % (2*vals) -vals;
+        }
         break;
     case 2:
-        block[ 0] = av_lfg_get(prng) % 4096 - 2048;
+        block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
         block[63] = (block[0] & 1) ^ 1;
         break;
     }
@@ -143,6 +161,10 @@ static void permute(int16_t dst[64], const int16_t src[64],
         for (i = 0; i < 64; i++)
             dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
         break;
+    case FF_IDCT_PERM_TRANSPOSE:
+        for (i = 0; i < 64; i++)
+            dst[(i>>3) | ((i<<3)&0x38)] = src[i];
+        break;
     default:
         for (i = 0; i < 64; i++)
             dst[i] = src[i];
@@ -150,7 +172,7 @@ static void permute(int16_t dst[64], const int16_t src[64],
     }
 }
 
-static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
+static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
 {
     void (*ref)(int16_t *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
     int it, i, scale;
@@ -160,6 +182,7 @@ static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
     int maxout = 0;
     int blockSumErrMax = 0, blockSumErr;
     AVLFG prng;
+    const int vals=1<<bits;
     double omse, ome;
     int spec_err;
 
@@ -170,7 +193,7 @@ static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
     for (i = 0; i < 64; i++)
         sysErr[i] = 0;
     for (it = 0; it < NB_ITS; it++) {
-        init_block(block1, test, is_idct, &prng);
+        init_block(block1, test, is_idct, &prng, vals);
         permute(block, block1, dct->perm_type);
 
         dct->func(block);
@@ -184,6 +207,9 @@ static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
         }
 
         ref(block1);
+        if (!strcmp(dct->name, "PR-SSE2"))
+            for (i = 0; i < 64; i++)
+                block1[i] = av_clip(block1[i], 4-512, 1019-512);
 
         blockSumErr = 0;
         for (i = 0; i < 64; i++) {
@@ -216,19 +242,22 @@ static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
 
     spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
 
-    printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
+    printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
            is_idct ? "IDCT" : "DCT", dct->name, err_inf,
            omse, ome, (double) sysErrMax / NB_ITS,
            maxout, blockSumErrMax);
 
-    if (spec_err && !dct->nonspec)
+    if (spec_err && !dct->nonspec) {
+        printf("Failed!\n");
         return 1;
+    }
 
     if (!speed)
         return 0;
 
     /* speed test */
-    init_block(block, test, is_idct, &prng);
+
+    init_block(block, test, is_idct, &prng, vals);
     permute(block1, block, dct->perm_type);
 
     ti = av_gettime_relative();
@@ -238,10 +267,10 @@ static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
             memcpy(block, block1, sizeof(block));
             dct->func(block);
         }
+        emms_c();
         it1 += NB_ITS_SPEED;
         ti1 = av_gettime_relative() - ti;
     } while (ti1 < 1000000);
-    emms_c();
 
     printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
            (double) it1 * 1000.0 / (double) ti1);
@@ -366,6 +395,25 @@ static void idct248_error(const char *name,
             if (v > err_max)
                 err_max = v;
         }
+#if 0
+        printf("ref=\n");
+        for(i=0;i<8;i++) {
+            int j;
+            for(j=0;j<8;j++) {
+                printf(" %3d", img_dest1[i*8+j]);
+            }
+            printf("\n");
+        }
+
+        printf("out=\n");
+        for(i=0;i<8;i++) {
+            int j;
+            for(j=0;j<8;j++) {
+                printf(" %3d", img_dest[i*8+j]);
+            }
+            printf("\n");
+        }
+#endif
     }
     printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
 
@@ -380,10 +428,10 @@ static void idct248_error(const char *name,
                 block[i] = block1[i];
             idct248_put(img_dest, 8, block);
         }
+        emms_c();
         it1 += NB_ITS_SPEED;
         ti1 = av_gettime_relative() - ti;
     } while (ti1 < 1000000);
-    emms_c();
 
     printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
            (double) it1 * 1000.0 / (double) ti1);
@@ -391,10 +439,11 @@ static void idct248_error(const char *name,
 
 static void help(void)
 {
-    printf("dct-test [-i] [<test-number>]\n"
+    printf("dct-test [-i] [<test-number>] [<bits>]\n"
            "test-number 0 -> test with random matrixes\n"
            "            1 -> test with random sparse matrixes\n"
            "            2 -> do 3. test from MPEG-4 std\n"
+           "bits        Number of time domain bits to use, 8 is default\n"
            "-i          test IDCT implementations\n"
            "-4          test IDCT248 implementations\n"
            "-t          speed test\n");
@@ -411,6 +460,7 @@ int main(int argc, char **argv)
     int test = 1;
     int speed = 0;
     int err = 0;
+    int bits=8;
 
     ff_ref_dct_init();
 
@@ -437,8 +487,9 @@ int main(int argc, char **argv)
 
     if (optind < argc)
         test = atoi(argv[optind]);
+    if(optind+1 < argc) bits= atoi(argv[optind+1]);
 
-    printf("Libav DCT/IDCT test\n");
+    printf("ffmpeg DCT/IDCT test\n");
 
     if (test_248_dct) {
         idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
@@ -446,20 +497,20 @@ int main(int argc, char **argv)
         const int cpu_flags = av_get_cpu_flags();
         if (test_idct) {
             for (i = 0; i < FF_ARRAY_ELEMS(idct_tab); i++)
-                err |= dct_error(&idct_tab[i], test, test_idct, speed);
+                err |= dct_error(&idct_tab[i], test, test_idct, speed, bits);
 
             for (i = 0; idct_tab_arch[i].name; i++)
                 if (!(~cpu_flags & idct_tab_arch[i].cpu_flag))
-                    err |= dct_error(&idct_tab_arch[i], test, test_idct, speed);
+                    err |= dct_error(&idct_tab_arch[i], test, test_idct, speed, bits);
         }
 #if CONFIG_FDCTDSP
         else {
             for (i = 0; i < FF_ARRAY_ELEMS(fdct_tab); i++)
-                err |= dct_error(&fdct_tab[i], test, test_idct, speed);
+                err |= dct_error(&fdct_tab[i], test, test_idct, speed, bits);
 
             for (i = 0; fdct_tab_arch[i].name; i++)
                 if (!(~cpu_flags & fdct_tab_arch[i].cpu_flag))
-                    err |= dct_error(&fdct_tab_arch[i], test, test_idct, speed);
+                    err |= dct_error(&fdct_tab_arch[i], test, test_idct, speed, bits);
         }
 #endif /* CONFIG_FDCTDSP */
     }
diff --git a/libavcodec/dct.c b/libavcodec/dct.c
index 180477e..cca51ee 100644
--- a/libavcodec/dct.c
+++ b/libavcodec/dct.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  * Copyright (c) 2010 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
@@ -190,12 +190,12 @@ av_cold int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType inverse)
         ff_init_ff_cos_tabs(nbits + 2);
 
         s->costab = ff_cos_tabs[nbits + 2];
-        s->csc2   = av_malloc(n / 2 * sizeof(FFTSample));
+        s->csc2   = av_malloc_array(n / 2, sizeof(FFTSample));
         if (!s->csc2)
             return AVERROR(ENOMEM);
 
         if (ff_rdft_init(&s->rdft, nbits, inverse == DCT_III) < 0) {
-            av_free(s->csc2);
+            av_freep(&s->csc2);
             return -1;
         }
 
@@ -220,5 +220,5 @@ av_cold int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType inverse)
 av_cold void ff_dct_end(DCTContext *s)
 {
     ff_rdft_end(&s->rdft);
-    av_free(s->csc2);
+    av_freep(&s->csc2);
 }
diff --git a/libavcodec/dct.h b/libavcodec/dct.h
index 4a31f54..05297ba 100644
--- a/libavcodec/dct.h
+++ b/libavcodec/dct.h
@@ -4,24 +4,24 @@
  * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
  * Copyright (c) 2010 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-#ifndef AVCODEC_DCT_H
+#if !defined(AVCODEC_DCT_H) && (!defined(FFT_FLOAT) || FFT_FLOAT)
 #define AVCODEC_DCT_H
 
 #include <stdint.h>
@@ -59,6 +59,9 @@ void ff_fdct248_islow_8(int16_t *data);
 void ff_fdct248_islow_10(int16_t *data);
 
 void ff_j_rev_dct(int16_t *data);
+void ff_j_rev_dct4(int16_t *data);
+void ff_j_rev_dct2(int16_t *data);
+void ff_j_rev_dct1(int16_t *data);
 void ff_jref_idct_put(uint8_t *dest, int line_size, int16_t *block);
 void ff_jref_idct_add(uint8_t *dest, int line_size, int16_t *block);
 
diff --git a/libavcodec/dct32.h b/libavcodec/dct32.h
index 8bf6880..61bf223 100644
--- a/libavcodec/dct32.h
+++ b/libavcodec/dct32.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/dct32_fixed.c b/libavcodec/dct32_fixed.c
index 64efe8b..9025d5e 100644
--- a/libavcodec/dct32_fixed.c
+++ b/libavcodec/dct32_fixed.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/dct32_float.c b/libavcodec/dct32_float.c
index ef37ce9..597c9bb 100644
--- a/libavcodec/dct32_float.c
+++ b/libavcodec/dct32_float.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/dct32_template.c b/libavcodec/dct32_template.c
index 272e0db..c70396e 100644
--- a/libavcodec/dct32_template.c
+++ b/libavcodec/dct32_template.c
@@ -2,20 +2,20 @@
  * Template for the Discrete Cosine Transform for 32 samples
  * Copyright (c) 2001, 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -73,7 +73,7 @@
 #define COS3_0 FIXHR(0.54119610014619698439/2)
 #define COS3_1 FIXHR(1.30656296487637652785/4)
 
-#define COS4_0 FIXHR(0.70710678118654752439/2)
+#define COS4_0 FIXHR(M_SQRT1_2/2)
 
 /* butterfly operator */
 #define BF(a, b, c, s)\
diff --git a/libavcodec/dctref.c b/libavcodec/dctref.c
index ae3dec5..851014b 100644
--- a/libavcodec/dctref.c
+++ b/libavcodec/dctref.c
@@ -2,20 +2,20 @@
  * reference discrete cosine transform (double precision)
  * Copyright (C) 2009 Dylan Yudaken
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/dctref.h b/libavcodec/dctref.h
index a93b70d..f6fde88 100644
--- a/libavcodec/dctref.h
+++ b/libavcodec/dctref.h
@@ -2,20 +2,20 @@
  * reference discrete cosine transform (double precision)
  * Copyright (C) 2009 Dylan Yudaken
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/dds.c b/libavcodec/dds.c
index 91e0c24..763371a 100644
--- a/libavcodec/dds.c
+++ b/libavcodec/dds.c
@@ -2,20 +2,20 @@
  * DirectDraw Surface image decoder
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,7 @@
 
 #include <stdint.h>
 
+#include "libavutil/libm.h"
 #include "libavutil/imgutils.h"
 
 #include "avcodec.h"
@@ -45,6 +46,7 @@ enum DDSPostProc {
     DDS_ALPHA_EXP,
     DDS_NORMAL_MAP,
     DDS_RAW_YCOCG,
+    DDS_SWAP_ALPHA,
     DDS_SWIZZLE_A2XY,
     DDS_SWIZZLE_RBXG,
     DDS_SWIZZLE_RGXB,
@@ -360,6 +362,10 @@ static int parse_pixel_format(AVCodecContext *avctx)
         /* 16 bpp */
         else if (bpp == 16 && r == 0xff && g == 0 && b == 0 && a == 0xff00)
             avctx->pix_fmt = AV_PIX_FMT_YA8;
+        else if (bpp == 16 && r == 0xff00 && g == 0 && b == 0 && a == 0xff) {
+            avctx->pix_fmt = AV_PIX_FMT_YA8;
+            ctx->postproc = DDS_SWAP_ALPHA;
+        }
         else if (bpp == 16 && r == 0xffff && g == 0 && b == 0 && a == 0)
             avctx->pix_fmt = AV_PIX_FMT_GRAY16LE;
         else if (bpp == 16 && r == 0x7c00 && g == 0x3e0 && b == 0x1f && a == 0)
@@ -373,9 +379,9 @@ static int parse_pixel_format(AVCodecContext *avctx)
             avctx->pix_fmt = AV_PIX_FMT_BGR24;
         /* 32 bpp */
         else if (bpp == 32 && r == 0xff0000 && g == 0xff00 && b == 0xff && a == 0)
-            avctx->pix_fmt = AV_PIX_FMT_BGRA; // opaque
+            avctx->pix_fmt = AV_PIX_FMT_BGR0; // opaque
         else if (bpp == 32 && r == 0xff && g == 0xff00 && b == 0xff0000 && a == 0)
-            avctx->pix_fmt = AV_PIX_FMT_RGBA; // opaque
+            avctx->pix_fmt = AV_PIX_FMT_RGB0; // opaque
         else if (bpp == 32 && r == 0xff0000 && g == 0xff00 && b == 0xff && a == 0xff000000)
             avctx->pix_fmt = AV_PIX_FMT_BGRA;
         else if (bpp == 32 && r == 0xff && g == 0xff00 && b == 0xff0000 && a == 0xff000000)
@@ -515,7 +521,7 @@ static void run_postproc(AVCodecContext *avctx, AVFrame *frame)
 
             int d = (255 * 255 - x * x - y * y) / 2;
             if (d > 0)
-                z = rint(sqrtf(d));
+                z = lrint(sqrtf(d));
 
             src[0] = x;
             src[1] = y;
@@ -541,6 +547,15 @@ static void run_postproc(AVCodecContext *avctx, AVFrame *frame)
             src[3] = a;
         }
         break;
+    case DDS_SWAP_ALPHA:
+        /* Alpha and Luma are stored swapped. */
+        av_log(avctx, AV_LOG_DEBUG, "Post-processing swapped Luma/Alpha.\n");
+
+        for (i = 0; i < frame->linesize[0] * frame->height; i += 2) {
+            uint8_t *src = frame->data[0] + i;
+            FFSWAP(uint8_t, src[0], src[1]);
+        }
+        break;
     case DDS_SWIZZLE_A2XY:
         /* Swap R and G, often used to restore a standard RGTC2. */
         av_log(avctx, AV_LOG_DEBUG, "Post-processing A2XY swizzle.\n");
@@ -666,17 +681,15 @@ static int dds_decode(AVCodecContext *avctx, void *data,
 
         if (ctx->paletted) {
             int i;
-            uint32_t *p = (uint32_t*) frame->data[1];
-
             /* Use the first 1024 bytes as palette, then copy the rest. */
-            for (i = 0; i < 256; i++) {
-                uint32_t rgba = 0;
-                rgba |= bytestream2_get_byte(gbc) << 16;
-                rgba |= bytestream2_get_byte(gbc) << 8;
-                rgba |= bytestream2_get_byte(gbc) << 0;
-                rgba |= bytestream2_get_byte(gbc) << 24;
-                p[i] = rgba;
-            }
+            bytestream2_get_buffer(gbc, frame->data[1], 256 * 4);
+            for (i = 0; i < 256; i++)
+                AV_WN32(frame->data[1] + i*4,
+                        (frame->data[1][2+i*4]<<0)+
+                        (frame->data[1][1+i*4]<<8)+
+                        (frame->data[1][0+i*4]<<16)+
+                        (frame->data[1][3+i*4]<<24)
+                );
 
             frame->palette_has_changed = 1;
         }
diff --git a/libavcodec/dfa.c b/libavcodec/dfa.c
index 8021193..f45d019 100644
--- a/libavcodec/dfa.c
+++ b/libavcodec/dfa.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2011 Konstantin Shishkov
  * based on work by Vladimir "VAG" Gneushev
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
 #include "bytestream.h"
 #include "internal.h"
 
+#include "libavutil/avassert.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/mem.h"
 
@@ -37,12 +38,13 @@ typedef struct DfaContext {
 static av_cold int dfa_decode_init(AVCodecContext *avctx)
 {
     DfaContext *s = avctx->priv_data;
-    int ret;
 
     avctx->pix_fmt = AV_PIX_FMT_PAL8;
 
-    if ((ret = av_image_check_size(avctx->width, avctx->height, 0, avctx)) < 0)
-        return ret;
+    if (!avctx->width || !avctx->height)
+        return AVERROR_INVALIDDATA;
+
+    av_assert0(av_image_check_size(avctx->width, avctx->height, 0, avctx) >= 0);
 
     s->frame_buf = av_mallocz(avctx->width * avctx->height);
     if (!s->frame_buf)
@@ -70,6 +72,8 @@ static int decode_tsw1(GetByteContext *gb, uint8_t *frame, int width, int height
 
     segments = bytestream2_get_le32(gb);
     offset   = bytestream2_get_le32(gb);
+    if (segments == 0 && offset == frame_end - frame)
+        return 0; // skip frame
     if (frame_end - frame <= offset)
         return AVERROR_INVALIDDATA;
     frame += offset;
@@ -252,6 +256,9 @@ static int decode_wdlt(GetByteContext *gb, uint8_t *frame, int width, int height
             y        += skip_lines;
             segments = bytestream2_get_le16(gb);
         }
+
+        if (frame_end <= frame)
+            return AVERROR_INVALIDDATA;
         if (segments & 0x8000) {
             frame[width - 1] = segments & 0xFF;
             segments = bytestream2_get_le16(gb);
@@ -289,7 +296,7 @@ static int decode_wdlt(GetByteContext *gb, uint8_t *frame, int width, int height
 static int decode_tdlt(GetByteContext *gb, uint8_t *frame, int width, int height)
 {
     const uint8_t *frame_end = frame + width * height;
-    int segments = bytestream2_get_le32(gb);
+    uint32_t segments = bytestream2_get_le32(gb);
     int skip, copy;
 
     while (segments--) {
@@ -338,11 +345,10 @@ static int dfa_decode_frame(AVCodecContext *avctx,
     uint8_t *dst;
     int ret;
     int i, pal_elems;
+    int version = avctx->extradata_size==2 ? AV_RL16(avctx->extradata) : 0;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0))) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     bytestream2_init(&gb, avpkt->data, avpkt->size);
     while (bytestream2_get_bytes_left(&gb) > 0) {
@@ -355,7 +361,7 @@ static int dfa_decode_frame(AVCodecContext *avctx,
             pal_elems = FFMIN(chunk_size / 3, 256);
             for (i = 0; i < pal_elems; i++) {
                 s->pal[i] = bytestream2_get_be24(&gb) << 2;
-                s->pal[i] |= (s->pal[i] >> 6) & 0x333;
+                s->pal[i] |= 0xFFU << 24 | (s->pal[i] >> 6) & 0x30303;
             }
             frame->palette_has_changed = 1;
         } else if (chunk_type <= 9) {
@@ -375,9 +381,17 @@ static int dfa_decode_frame(AVCodecContext *avctx,
     buf = s->frame_buf;
     dst = frame->data[0];
     for (i = 0; i < avctx->height; i++) {
-        memcpy(dst, buf, avctx->width);
+        if(version == 0x100) {
+            int j;
+            for(j = 0; j < avctx->width; j++) {
+                dst[j] = buf[ (i&3)*(avctx->width /4) + (j/4) +
+                             ((j&3)*(avctx->height/4) + (i/4))*avctx->width];
+            }
+        } else {
+            memcpy(dst, buf, avctx->width);
+            buf += avctx->width;
+        }
         dst += frame->linesize[0];
-        buf += avctx->width;
     }
     memcpy(frame->data[1], s->pal, sizeof(s->pal));
 
diff --git a/libavcodec/dirac.c b/libavcodec/dirac.c
index aced2ac..527f015 100644
--- a/libavcodec/dirac.c
+++ b/libavcodec/dirac.c
@@ -1,28 +1,29 @@
 /*
  * Copyright (C) 2007 Marco Gerards <marco@gnu.org>
  * Copyright (C) 2009 David Conrad
+ * Copyright (C) 2011 Jordi Ortiz
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
  * Dirac Decoder
- * @author Marco Gerards <marco@gnu.org>
+ * @author Marco Gerards <marco@gnu.org>, David Conrad, Jordi Ortiz <nenjordi@gmail.com>
  */
 
 #include "libavutil/imgutils.h"
@@ -55,7 +56,7 @@ typedef struct dirac_source_params {
     uint8_t color_spec_index;       ///< index into dirac_color_spec_presets[]
 } dirac_source_params;
 
-// defaults for source parameters
+/* defaults for source parameters */
 static const dirac_source_params dirac_source_parameters_defaults[] = {
     {  640,  480, 2, 0, 0,  1, 1,  640,  480, 0, 0, 1, 0 },
     {  176,  120, 2, 0, 0,  9, 2,  176,  120, 0, 0, 1, 1 },
@@ -130,10 +131,11 @@ static const struct {
     { AVCOL_PRI_BT709,     AVCOL_SPC_BT709,   AVCOL_TRC_UNSPECIFIED /* DCinema */ },
 };
 
-/* [DIRAC_STD] Table 10.2 Supported chroma sampling formats + luma Offset */
-static const enum AVPixelFormat dirac_pix_fmt[2][3] = {
-    { AV_PIX_FMT_YUV444P,  AV_PIX_FMT_YUV422P,  AV_PIX_FMT_YUV420P  },
-    { AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_YUVJ422P, AV_PIX_FMT_YUVJ420P },
+/* [DIRAC_STD] Table 10.2 Supported chroma sampling formats */
+static const enum AVPixelFormat dirac_pix_fmt[][3] = {
+    {AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUV444P12},
+    {AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV422P12},
+    {AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P12},
 };
 
 /* [DIRAC_STD] 10.3 Parse Source Parameters.
@@ -144,6 +146,7 @@ static int parse_source_parameters(AVDiracSeqHeader *dsh, GetBitContext *gb,
     AVRational frame_rate = { 0, 0 };
     unsigned luma_depth = 8, luma_offset = 16;
     int idx;
+    int chroma_x_shift, chroma_y_shift;
 
     /* [DIRAC_STD] 10.3.2 Frame size. frame_size(video_params) */
     /* [DIRAC_STD] custom_dimensions_flag */
@@ -158,7 +161,7 @@ static int parse_source_parameters(AVDiracSeqHeader *dsh, GetBitContext *gb,
     if (get_bits1(gb))
         /* [DIRAC_STD] CHROMA_FORMAT_INDEX */
         dsh->chroma_format = svq3_get_ue_golomb(gb);
-    if (dsh->chroma_format > 2) {
+    if (dsh->chroma_format > 2U) {
         if (log_ctx)
             av_log(log_ctx, AV_LOG_ERROR, "Unknown chroma format %d\n",
                    dsh->chroma_format);
@@ -170,14 +173,14 @@ static int parse_source_parameters(AVDiracSeqHeader *dsh, GetBitContext *gb,
     if (get_bits1(gb))
         /* [DIRAC_STD] SOURCE_SAMPLING */
         dsh->interlaced = svq3_get_ue_golomb(gb);
-    if (dsh->interlaced > 1)
+    if (dsh->interlaced > 1U)
         return AVERROR_INVALIDDATA;
 
     /* [DIRAC_STD] 10.3.5 Frame Rate. frame_rate(video_params) */
     if (get_bits1(gb)) { /* [DIRAC_STD] custom_frame_rate_flag */
         dsh->frame_rate_index = svq3_get_ue_golomb(gb);
 
-        if (dsh->frame_rate_index > 10)
+        if (dsh->frame_rate_index > 10U)
             return AVERROR_INVALIDDATA;
 
         if (!dsh->frame_rate_index) {
@@ -203,7 +206,7 @@ static int parse_source_parameters(AVDiracSeqHeader *dsh, GetBitContext *gb,
         /* [DIRAC_STD] index */
         dsh->aspect_ratio_index = svq3_get_ue_golomb(gb);
 
-        if (dsh->aspect_ratio_index > 6)
+        if (dsh->aspect_ratio_index > 6U)
             return AVERROR_INVALIDDATA;
 
         if (!dsh->aspect_ratio_index) {
@@ -236,10 +239,10 @@ static int parse_source_parameters(AVDiracSeqHeader *dsh, GetBitContext *gb,
         /* [DIRAC_STD] index */
         dsh->pixel_range_index = svq3_get_ue_golomb(gb);
 
-        if (dsh->pixel_range_index > 4)
+        if (dsh->pixel_range_index > 4U)
             return AVERROR_INVALIDDATA;
 
-        // This assumes either fullrange or MPEG levels only
+        /* This assumes either fullrange or MPEG levels only */
         if (!dsh->pixel_range_index) {
             luma_offset = svq3_get_ue_golomb(gb);
             luma_depth  = av_log2(svq3_get_ue_golomb(gb)) + 1;
@@ -257,17 +260,28 @@ static int parse_source_parameters(AVDiracSeqHeader *dsh, GetBitContext *gb,
         dsh->color_range   = pixel_range_presets[idx].color_range;
     }
 
-    if (luma_depth > 8 && log_ctx)
-        av_log(log_ctx, AV_LOG_WARNING, "Bitdepth greater than 8");
+    dsh->bit_depth = luma_depth;
 
-    dsh->pix_fmt = dirac_pix_fmt[!luma_offset][dsh->chroma_format];
+    /* Full range 8 bits uses the same pix_fmts as limited range 8 bits */
+    dsh->pixel_range_index += dsh->pixel_range_index == 1;
+
+    if (dsh->pixel_range_index < 2U)
+        return AVERROR_INVALIDDATA;
+
+    dsh->pix_fmt = dirac_pix_fmt[dsh->chroma_format][dsh->pixel_range_index-2];
+    avcodec_get_chroma_sub_sample(dsh->pix_fmt, &chroma_x_shift, &chroma_y_shift);
+    if ((dsh->width % (1<<chroma_x_shift)) || (dsh->height % (1<<chroma_y_shift))) {
+        if (log_ctx)
+            av_log(log_ctx, AV_LOG_ERROR, "Dimensions must be an integer multiple of the chroma subsampling\n");
+        return AVERROR_INVALIDDATA;
+    }
 
     /* [DIRAC_STD] 10.3.9 Colour specification. colour_spec(video_params) */
     if (get_bits1(gb)) { /* [DIRAC_STD] custom_colour_spec_flag */
         /* [DIRAC_STD] index */
         idx = dsh->color_spec_index = svq3_get_ue_golomb(gb);
 
-        if (dsh->color_spec_index > 4)
+        if (dsh->color_spec_index > 4U)
             return AVERROR_INVALIDDATA;
 
         dsh->color_primaries = dirac_color_presets[idx].color_primaries;
@@ -278,7 +292,7 @@ static int parse_source_parameters(AVDiracSeqHeader *dsh, GetBitContext *gb,
             /* [DIRAC_STD] 10.3.9.1 Colour primaries */
             if (get_bits1(gb)) {
                 idx = svq3_get_ue_golomb(gb);
-                if (idx < 3)
+                if (idx < 3U)
                     dsh->color_primaries = dirac_primaries[idx];
             }
             /* [DIRAC_STD] 10.3.9.2 Colour matrix */
@@ -310,7 +324,6 @@ int av_dirac_parse_sequence_header(AVDiracSeqHeader **pdsh,
 {
     AVDiracSeqHeader *dsh;
     GetBitContext gb;
-    unsigned version_major;
     unsigned video_format, picture_coding_mode;
     int ret;
 
@@ -323,27 +336,25 @@ int av_dirac_parse_sequence_header(AVDiracSeqHeader **pdsh,
         goto fail;
 
     /* [DIRAC_SPEC] 10.1 Parse Parameters. parse_parameters() */
-    version_major  = svq3_get_ue_golomb(&gb);
-    svq3_get_ue_golomb(&gb); /* version_minor */
-    dsh->profile = svq3_get_ue_golomb(&gb);
-    dsh->level   = svq3_get_ue_golomb(&gb);
+    dsh->version.major = svq3_get_ue_golomb(&gb);
+    dsh->version.minor = svq3_get_ue_golomb(&gb);
+    dsh->profile   = svq3_get_ue_golomb(&gb);
+    dsh->level     = svq3_get_ue_golomb(&gb);
     /* [DIRAC_SPEC] sequence_header() -> base_video_format as defined in
      * 10.2 Base Video Format, table 10.1 Dirac predefined video formats */
     video_format   = svq3_get_ue_golomb(&gb);
 
-    if (log_ctx) {
-        if (version_major < 2)
-            av_log(log_ctx, AV_LOG_WARNING, "Stream is old and may not work\n");
-        else if (version_major > 2)
-            av_log(log_ctx, AV_LOG_WARNING, "Stream may have unhandled features\n");
-    }
+    if (dsh->version.major < 2 && log_ctx)
+        av_log(log_ctx, AV_LOG_WARNING, "Stream is old and may not work\n");
+    else if (dsh->version.major > 2 && log_ctx)
+        av_log(log_ctx, AV_LOG_WARNING, "Stream may have unhandled features\n");
 
-    if (video_format > 20) {
+    if (video_format > 20U) {
         ret = AVERROR_INVALIDDATA;
         goto fail;
     }
 
-    // Fill in defaults for the source parameters.
+    /* Fill in defaults for the source parameters. */
     dsh->width              = dirac_source_parameters_defaults[video_format].width;
     dsh->height             = dirac_source_parameters_defaults[video_format].height;
     dsh->chroma_format      = dirac_source_parameters_defaults[video_format].chroma_format;
diff --git a/libavcodec/dirac.h b/libavcodec/dirac.h
index 25cefdb..e6d9d34 100644
--- a/libavcodec/dirac.h
+++ b/libavcodec/dirac.h
@@ -1,21 +1,22 @@
 /*
  * Copyright (C) 2007 Marco Gerards <marco@gnu.org>
  * Copyright (C) 2009 David Conrad
+ * Copyright (C) 2011 Jordi Ortiz
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,10 +27,57 @@
  * @file
  * Interface to Dirac Decoder/Encoder
  * @author Marco Gerards <marco@gnu.org>
+ * @author David Conrad
+ * @author Jordi Ortiz
  */
 
 #include "avcodec.h"
 
+/**
+ * The spec limits the number of wavelet decompositions to 4 for both
+ * level 1 (VC-2) and 128 (long-gop default).
+ * 5 decompositions is the maximum before >16-bit buffers are needed.
+ * Schroedinger allows this for DD 9,7 and 13,7 wavelets only, limiting
+ * the others to 4 decompositions (or 3 for the fidelity filter).
+ *
+ * We use this instead of MAX_DECOMPOSITIONS to save some memory.
+ */
+#define MAX_DWT_LEVELS 5
+
+/**
+ * Parse code values:
+ *
+ * Dirac Specification ->
+ * 9.6.1  Table 9.1
+ *
+ * VC-2 Specification  ->
+ * 10.4.1 Table 10.1
+ */
+
+enum DiracParseCodes {
+    DIRAC_PCODE_SEQ_HEADER      = 0x00,
+    DIRAC_PCODE_END_SEQ         = 0x10,
+    DIRAC_PCODE_AUX             = 0x20,
+    DIRAC_PCODE_PAD             = 0x30,
+    DIRAC_PCODE_PICTURE_CODED   = 0x08,
+    DIRAC_PCODE_PICTURE_RAW     = 0x48,
+    DIRAC_PCODE_PICTURE_LOW_DEL = 0xC8,
+    DIRAC_PCODE_PICTURE_HQ      = 0xE8,
+    DIRAC_PCODE_INTER_NOREF_CO1 = 0x0A,
+    DIRAC_PCODE_INTER_NOREF_CO2 = 0x09,
+    DIRAC_PCODE_INTER_REF_CO1   = 0x0D,
+    DIRAC_PCODE_INTER_REF_CO2   = 0x0E,
+    DIRAC_PCODE_INTRA_REF_CO    = 0x0C,
+    DIRAC_PCODE_INTRA_REF_RAW   = 0x4C,
+    DIRAC_PCODE_INTRA_REF_PICT  = 0xCC,
+    DIRAC_PCODE_MAGIC           = 0x42424344,
+};
+
+typedef struct DiracVersionInfo {
+    int major;
+    int minor;
+} DiracVersionInfo;
+
 typedef struct AVDiracSeqHeader {
     unsigned width;
     unsigned height;
@@ -60,6 +108,9 @@ typedef struct AVDiracSeqHeader {
     enum AVColorPrimaries color_primaries;
     enum AVColorTransferCharacteristic color_trc;
     enum AVColorSpace colorspace;
+
+    DiracVersionInfo version;
+    int bit_depth;
 } AVDiracSeqHeader;
 
 /**
diff --git a/libavcodec/dirac_arith.c b/libavcodec/dirac_arith.c
new file mode 100644
index 0000000..bf91392
--- /dev/null
+++ b/libavcodec/dirac_arith.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (C) 2007 Marco Gerards <marco@gnu.org>
+ * Copyright (C) 2009 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Arithmetic decoder for Dirac
+ * @author Marco Gerards <marco@gnu.org>
+ */
+
+#include "dirac_arith.h"
+
+
+const uint16_t ff_dirac_prob[256] = {
+    0,    2,    5,    8,    11,   15,   20,   24,
+    29,   35,   41,   47,   53,   60,   67,   74,
+    82,   89,   97,   106,  114,  123,  132,  141,
+    150,  160,  170,  180,  190,  201,  211,  222,
+    233,  244,  256,  267,  279,  291,  303,  315,
+    327,  340,  353,  366,  379,  392,  405,  419,
+    433,  447,  461,  475,  489,  504,  518,  533,
+    548,  563,  578,  593,  609,  624,  640,  656,
+    672,  688,  705,  721,  738,  754,  771,  788,
+    805,  822,  840,  857,  875,  892,  910,  928,
+    946,  964,  983,  1001, 1020, 1038, 1057, 1076,
+    1095, 1114, 1133, 1153, 1172, 1192, 1211, 1231,
+    1251, 1271, 1291, 1311, 1332, 1352, 1373, 1393,
+    1414, 1435, 1456, 1477, 1498, 1520, 1541, 1562,
+    1584, 1606, 1628, 1649, 1671, 1694, 1716, 1738,
+    1760, 1783, 1806, 1828, 1851, 1874, 1897, 1920,
+    1935, 1942, 1949, 1955, 1961, 1968, 1974, 1980,
+    1985, 1991, 1996, 2001, 2006, 2011, 2016, 2021,
+    2025, 2029, 2033, 2037, 2040, 2044, 2047, 2050,
+    2053, 2056, 2058, 2061, 2063, 2065, 2066, 2068,
+    2069, 2070, 2071, 2072, 2072, 2072, 2072, 2072,
+    2072, 2071, 2070, 2069, 2068, 2066, 2065, 2063,
+    2060, 2058, 2055, 2052, 2049, 2045, 2042, 2038,
+    2033, 2029, 2024, 2019, 2013, 2008, 2002, 1996,
+    1989, 1982, 1975, 1968, 1960, 1952, 1943, 1934,
+    1925, 1916, 1906, 1896, 1885, 1874, 1863, 1851,
+    1839, 1827, 1814, 1800, 1786, 1772, 1757, 1742,
+    1727, 1710, 1694, 1676, 1659, 1640, 1622, 1602,
+    1582, 1561, 1540, 1518, 1495, 1471, 1447, 1422,
+    1396, 1369, 1341, 1312, 1282, 1251, 1219, 1186,
+    1151, 1114, 1077, 1037, 995,  952,  906,  857,
+    805,  750,  690,  625,  553,  471,  376,  255
+};
+
+const uint8_t ff_dirac_next_ctx[DIRAC_CTX_COUNT] = {
+    [CTX_ZPZN_F1]   = CTX_ZP_F2,
+    [CTX_ZPNN_F1]   = CTX_ZP_F2,
+    [CTX_ZP_F2]     = CTX_ZP_F3,
+    [CTX_ZP_F3]     = CTX_ZP_F4,
+    [CTX_ZP_F4]     = CTX_ZP_F5,
+    [CTX_ZP_F5]     = CTX_ZP_F6,
+    [CTX_ZP_F6]     = CTX_ZP_F6,
+    [CTX_NPZN_F1]   = CTX_NP_F2,
+    [CTX_NPNN_F1]   = CTX_NP_F2,
+    [CTX_NP_F2]     = CTX_NP_F3,
+    [CTX_NP_F3]     = CTX_NP_F4,
+    [CTX_NP_F4]     = CTX_NP_F5,
+    [CTX_NP_F5]     = CTX_NP_F6,
+    [CTX_NP_F6]     = CTX_NP_F6,
+    [CTX_DELTA_Q_F] = CTX_DELTA_Q_F,
+};
+
+int16_t ff_dirac_prob_branchless[256][2];
+
+void ff_dirac_init_arith_decoder(DiracArith *c, GetBitContext *gb, int length)
+{
+    int i;
+    align_get_bits(gb);
+
+    length = FFMIN(length, get_bits_left(gb)/8);
+
+    c->bytestream     = gb->buffer + get_bits_count(gb)/8;
+    c->bytestream_end = c->bytestream + length;
+    skip_bits_long(gb, length*8);
+
+    c->low = 0;
+    for (i = 0; i < 4; i++) {
+        c->low <<= 8;
+        if (c->bytestream < c->bytestream_end)
+            c->low |= *c->bytestream++;
+        else
+            c->low |= 0xff;
+    }
+
+    c->counter = -16;
+    c->range   = 0xffff;
+
+    for (i = 0; i < 256; i++) {
+        ff_dirac_prob_branchless[i][0] =  ff_dirac_prob[255-i];
+        ff_dirac_prob_branchless[i][1] = -ff_dirac_prob[i];
+    }
+
+    for (i = 0; i < DIRAC_CTX_COUNT; i++)
+        c->contexts[i] = 0x8000;
+}
diff --git a/libavcodec/dirac_arith.h b/libavcodec/dirac_arith.h
new file mode 100644
index 0000000..003430a
--- /dev/null
+++ b/libavcodec/dirac_arith.h
@@ -0,0 +1,195 @@
+/*
+ * Copyright (C) 2007 Marco Gerards <marco@gnu.org>
+ * Copyright (C) 2009 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Arithmetic decoder for Dirac
+ * @author Marco Gerards <marco@gnu.org>
+ */
+
+#ifndef AVCODEC_DIRAC_ARITH_H
+#define AVCODEC_DIRAC_ARITH_H
+
+#include "libavutil/x86/asm.h"
+#include "bytestream.h"
+#include "get_bits.h"
+
+enum dirac_arith_contexts {
+    CTX_ZPZN_F1,
+    CTX_ZPNN_F1,
+    CTX_NPZN_F1,
+    CTX_NPNN_F1,
+    CTX_ZP_F2,
+    CTX_ZP_F3,
+    CTX_ZP_F4,
+    CTX_ZP_F5,
+    CTX_ZP_F6,
+    CTX_NP_F2,
+    CTX_NP_F3,
+    CTX_NP_F4,
+    CTX_NP_F5,
+    CTX_NP_F6,
+    CTX_COEFF_DATA,
+    CTX_SIGN_NEG,
+    CTX_SIGN_ZERO,
+    CTX_SIGN_POS,
+    CTX_ZERO_BLOCK,
+    CTX_DELTA_Q_F,
+    CTX_DELTA_Q_DATA,
+    CTX_DELTA_Q_SIGN,
+
+    DIRAC_CTX_COUNT
+};
+
+// Dirac resets the arith decoder between decoding various types of data,
+// so many contexts are never used simultaneously. Thus, we can reduce
+// the number of contexts needed by reusing them.
+#define CTX_SB_F1        CTX_ZP_F5
+#define CTX_SB_DATA      0
+#define CTX_PMODE_REF1   0
+#define CTX_PMODE_REF2   1
+#define CTX_GLOBAL_BLOCK 2
+#define CTX_MV_F1        CTX_ZP_F2
+#define CTX_MV_DATA      0
+#define CTX_DC_F1        CTX_ZP_F5
+#define CTX_DC_DATA      0
+
+typedef struct {
+    unsigned low;
+    uint16_t range;
+    int16_t  counter;
+
+    const uint8_t *bytestream;
+    const uint8_t *bytestream_end;
+
+    uint16_t contexts[DIRAC_CTX_COUNT];
+} DiracArith;
+
+extern const uint8_t ff_dirac_next_ctx[DIRAC_CTX_COUNT];
+extern const uint16_t ff_dirac_prob[256];
+extern int16_t ff_dirac_prob_branchless[256][2];
+
+static inline void renorm(DiracArith *c)
+{
+#if HAVE_FAST_CLZ
+    int shift = 14 - av_log2_16bit(c->range-1) + ((c->range-1)>>15);
+
+    c->low    <<= shift;
+    c->range  <<= shift;
+    c->counter += shift;
+#else
+    while (c->range <= 0x4000) {
+        c->low   <<= 1;
+        c->range <<= 1;
+        c->counter++;
+    }
+#endif
+}
+
+static inline void refill(DiracArith *c)
+{
+    int counter = c->counter;
+
+    if (counter >= 0) {
+        int new = bytestream_get_be16(&c->bytestream);
+
+        // the spec defines overread bits to be 1, and streams rely on this
+        if (c->bytestream > c->bytestream_end) {
+            new |= 0xff;
+            if (c->bytestream > c->bytestream_end+1)
+                new |= 0xff00;
+
+            c->bytestream = c->bytestream_end;
+        }
+
+        c->low += new << counter;
+        counter -= 16;
+    }
+    c->counter = counter;
+}
+
+static inline int dirac_get_arith_bit(DiracArith *c, int ctx)
+{
+    int prob_zero = c->contexts[ctx];
+    int range_times_prob, bit;
+    unsigned low = c->low;
+    int    range = c->range;
+
+    range_times_prob = (c->range * prob_zero) >> 16;
+
+#if ARCH_X86 && HAVE_FAST_CMOV && HAVE_INLINE_ASM && HAVE_6REGS
+    low   -= range_times_prob << 16;
+    range -= range_times_prob;
+    bit = 0;
+    __asm__(
+        "cmpl   %5, %4 \n\t"
+        "setae  %b0    \n\t"
+        "cmovb  %3, %2 \n\t"
+        "cmovb  %5, %1 \n\t"
+        : "+q"(bit), "+r"(range), "+r"(low)
+        : "r"(c->low), "r"(c->low>>16),
+          "r"(range_times_prob)
+    );
+#else
+    bit = (low >> 16) >= range_times_prob;
+    if (bit) {
+        low   -= range_times_prob << 16;
+        range -= range_times_prob;
+    } else {
+        range  = range_times_prob;
+    }
+#endif
+
+    c->contexts[ctx] += ff_dirac_prob_branchless[prob_zero>>8][bit];
+    c->low   = low;
+    c->range = range;
+
+    renorm(c);
+    refill(c);
+    return bit;
+}
+
+static inline int dirac_get_arith_uint(DiracArith *c, int follow_ctx, int data_ctx)
+{
+    int ret = 1;
+    while (!dirac_get_arith_bit(c, follow_ctx)) {
+        if (ret >= 0x40000000) {
+            av_log(NULL, AV_LOG_ERROR, "dirac_get_arith_uint overflow\n");
+            return -1;
+        }
+        ret <<= 1;
+        ret += dirac_get_arith_bit(c, data_ctx);
+        follow_ctx = ff_dirac_next_ctx[follow_ctx];
+    }
+    return ret-1;
+}
+
+static inline int dirac_get_arith_int(DiracArith *c, int follow_ctx, int data_ctx)
+{
+    int ret = dirac_get_arith_uint(c, follow_ctx, data_ctx);
+    if (ret && dirac_get_arith_bit(c, data_ctx+1))
+        ret = -ret;
+    return ret;
+}
+
+void ff_dirac_init_arith_decoder(DiracArith *c, GetBitContext *gb, int length);
+
+#endif /* AVCODEC_DIRAC_ARITH_H */
diff --git a/libavcodec/dirac_dwt.c b/libavcodec/dirac_dwt.c
new file mode 100644
index 0000000..cc08f88
--- /dev/null
+++ b/libavcodec/dirac_dwt.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) 2004-2010 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2008 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
+#include "libavutil/common.h"
+#include "dirac_dwt.h"
+
+#define TEMPLATE_8bit
+#include "dirac_dwt_template.c"
+
+#define TEMPLATE_10bit
+#include "dirac_dwt_template.c"
+
+#define TEMPLATE_12bit
+#include "dirac_dwt_template.c"
+
+/* Bind plane p to d and run the per-depth init; 0 on success, <0 on bad type/depth. */
+int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, enum dwt_type type,
+                         int decomposition_count, int bit_depth)
+{
+    int ret;
+    d->buffer = p->buf;
+    d->width  = p->width;
+    d->height = p->height;
+    d->stride = p->stride;
+    d->temp   = p->tmp;
+    d->decomposition_count = decomposition_count;
+
+    if (bit_depth == 8)
+        ret = ff_spatial_idwt_init_8bit(d, type);
+    else if (bit_depth == 10)
+        ret = ff_spatial_idwt_init_10bit(d, type);
+    else if (bit_depth == 12)
+        ret = ff_spatial_idwt_init_12bit(d, type);
+    else { /* no init routine ran: fail instead of reporting success for d */
+        av_log(NULL, AV_LOG_ERROR, "Unsupported bit depth = %i\n", bit_depth);
+        return AVERROR_INVALIDDATA;
+    }
+    if (ret) {
+        av_log(NULL, AV_LOG_ERROR, "Unknown wavelet type %d\n", type);
+        return AVERROR_INVALIDDATA;
+    }
+    if (ARCH_X86 && bit_depth == 8)
+        ff_spatial_idwt_init_x86(d, type);
+    return 0;
+}
+
+void ff_spatial_idwt_slice2(DWTContext *d, int y)
+{
+    int level, support = d->support;
+
+    for (level = d->decomposition_count-1; level >= 0; level--) {
+        int wl = d->width  >> level;
+        int hl = d->height >> level;
+        int stride_l = d->stride << level;
+
+        while (d->cs[level].y <= FFMIN((y>>level)+support, hl))
+            d->spatial_compose(d, level, wl, hl, stride_l);
+    }
+}
diff --git a/libavcodec/dirac_dwt.h b/libavcodec/dirac_dwt.h
new file mode 100644
index 0000000..4d33865
--- /dev/null
+++ b/libavcodec/dirac_dwt.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (C) 2004-2010 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DIRAC_DWT_H
+#define AVCODEC_DIRAC_DWT_H
+
+#include <stdint.h>
+
+typedef int DWTELEM;
+typedef short IDWTELEM;
+
+#define MAX_DWT_SUPPORT 8
+#define MAX_DECOMPOSITIONS 8
+
+typedef struct DWTCompose {
+    uint8_t *b[MAX_DWT_SUPPORT];
+    int y;
+} DWTCompose;
+
+typedef struct DWTPlane {
+    int width;
+    int height;
+    int stride;
+    uint8_t *buf;
+    uint8_t *buf_base;
+    uint8_t *tmp;
+} DWTPlane;
+
+struct DWTContext;
+
+// Possible prototypes for vertical_compose functions
+typedef void (*vertical_compose_2tap)(uint8_t *b0, uint8_t *b1, int width);
+typedef void (*vertical_compose_3tap)(uint8_t *b0, uint8_t *b1, uint8_t *b2, int width);
+typedef void (*vertical_compose_5tap)(uint8_t *b0, uint8_t *b1, uint8_t *b2, uint8_t *b3, uint8_t *b4, int width);
+typedef void (*vertical_compose_9tap)(uint8_t *dst, uint8_t *b[8], int width);
+
+typedef struct DWTContext {
+    uint8_t *buffer;
+    uint8_t *temp;
+    int width;
+    int height;
+    int stride;
+    int decomposition_count;
+    int support;
+
+    void (*spatial_compose)(struct DWTContext *cs, int level, int width, int height, int stride);
+    void (*vertical_compose_l0)(void);
+    void (*vertical_compose_h0)(void);
+    void (*vertical_compose_l1)(void);
+    void (*vertical_compose_h1)(void);
+    void (*vertical_compose)(void);     ///< one set of lowpass and highpass combined
+    void (*horizontal_compose)(uint8_t *b, uint8_t *tmp, int width);
+
+    DWTCompose cs[MAX_DECOMPOSITIONS];
+} DWTContext;
+
+enum dwt_type {
+    DWT_SNOW_DAUB9_7,
+    DWT_SNOW_LEGALL5_3,
+    DWT_DIRAC_DD9_7,
+    DWT_DIRAC_LEGALL5_3,
+    DWT_DIRAC_DD13_7,
+    DWT_DIRAC_HAAR0,
+    DWT_DIRAC_HAAR1,
+    DWT_DIRAC_FIDELITY,
+    DWT_DIRAC_DAUB9_7,
+    DWT_NUM_TYPES
+};
+
+// returns 0 on success, AVERROR_INVALIDDATA on error, e.g. the dwt_type isn't recognized
+int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, enum dwt_type type,
+                         int decomposition_count, int bit_depth);
+void ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type);
+
+void ff_spatial_idwt_slice2(DWTContext *d, int y);
+
+// shared stuff for simd optimizations
+#define COMPOSE_53iL0(b0, b1, b2)\
+    (b1 - ((b0 + b2 + 2) >> 2))
+
+#define COMPOSE_DIRAC53iH0(b0, b1, b2)\
+    (b1 + ((b0 + b2 + 1) >> 1))
+
+#define COMPOSE_DD97iH0(b0, b1, b2, b3, b4)\
+    (b2 + ((-b0 + 9*b1 + 9*b3 - b4 + 8) >> 4))
+
+#define COMPOSE_DD137iL0(b0, b1, b2, b3, b4)\
+    (b2 - ((-b0 + 9*b1 + 9*b3 - b4 + 16) >> 5))
+
+#define COMPOSE_HAARiL0(b0, b1)\
+    (b0 - ((b1 + 1) >> 1))
+
+#define COMPOSE_HAARiH0(b0, b1)\
+    (b0 + b1)
+
+#define COMPOSE_FIDELITYiL0(b0, b1, b2, b3, b4, b5, b6, b7, b8)\
+    (b4 - ((-8*(b0+b8) + 21*(b1+b7) - 46*(b2+b6) + 161*(b3+b5) + 128) >> 8))
+
+#define COMPOSE_FIDELITYiH0(b0, b1, b2, b3, b4, b5, b6, b7, b8)\
+    (b4 + ((-2*(b0+b8) + 10*(b1+b7) - 25*(b2+b6) + 81*(b3+b5) + 128) >> 8))
+
+#define COMPOSE_DAUB97iL1(b0, b1, b2)\
+    (b1 - ((1817*(b0 + b2) + 2048) >> 12))
+
+#define COMPOSE_DAUB97iH1(b0, b1, b2)\
+    (b1 - (( 113*(b0 + b2) + 64) >> 7))
+
+#define COMPOSE_DAUB97iL0(b0, b1, b2)\
+    (b1 + (( 217*(b0 + b2) + 2048) >> 12))
+
+#define COMPOSE_DAUB97iH0(b0, b1, b2)\
+    (b1 + ((6497*(b0 + b2) + 2048) >> 12))
+
+
+#endif /* AVCODEC_DIRAC_DWT_H */
diff --git a/libavcodec/dirac_dwt_template.c b/libavcodec/dirac_dwt_template.c
new file mode 100644
index 0000000..972c711
--- /dev/null
+++ b/libavcodec/dirac_dwt_template.c
@@ -0,0 +1,608 @@
+/*
+ * Copyright (C) 2004-2010 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2008 David Conrad
+ * Copyright (C) 2015 Open Broadcast Systems Ltd.
+ * Author    (C) 2015 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#if defined(TEMPLATE_8bit)
+
+#    define RENAME(N)   N ## _8bit
+#    define TYPE        int16_t
+#    undef  TEMPLATE_8bit
+
+#elif defined(TEMPLATE_10bit)
+
+#    define RENAME(N)   N ## _10bit
+#    define TYPE        int32_t
+#    undef  TEMPLATE_10bit
+
+#elif defined(TEMPLATE_12bit)
+
+#    define RENAME(N)   N ## _12bit
+#    define TYPE        int32_t
+#    undef  TEMPLATE_12bit
+
+#endif
+
+static void RENAME(vertical_compose53iL0)(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2,
+                                          int width)
+{
+    int i;
+    TYPE *b0 = (TYPE *)_b0;
+    TYPE *b1 = (TYPE *)_b1;
+    TYPE *b2 = (TYPE *)_b2;
+    for (i = 0; i < width; i++)
+        b1[i] -= (b0[i] + b2[i] + 2) >> 2;
+}
+
+static av_always_inline void RENAME(interleave)(TYPE *dst, TYPE *src0, TYPE *src1, int w2,
+                                                int add, int shift)
+{
+    int i;
+    for (i = 0; i < w2; i++) {
+        dst[2*i  ] = (src0[i] + add) >> shift;
+        dst[2*i+1] = (src1[i] + add) >> shift;
+    }
+}
+
+static void RENAME(horizontal_compose_dirac53i)(uint8_t *_b, uint8_t *_temp, int w)
+{
+    int x;
+    const int w2 = w >> 1;
+    TYPE *b     = (TYPE *)_b;
+    TYPE *temp  = (TYPE *)_temp;
+
+    temp[0] = COMPOSE_53iL0(b[w2], b[0], b[w2]);
+    for (x = 1; x < w2; x++) {
+        temp[x     ] = COMPOSE_53iL0     (b[x+w2-1], b[x     ], b[x+w2]);
+        temp[x+w2-1] = COMPOSE_DIRAC53iH0(temp[x-1], b[x+w2-1], temp[x]);
+    }
+    temp[w-1] = COMPOSE_DIRAC53iH0(temp[w2-1], b[w-1], temp[w2-1]);
+
+    RENAME(interleave)(b, temp, temp+w2, w2, 1, 1);
+}
+
+static void RENAME(horizontal_compose_dd97i)(uint8_t *_b, uint8_t *_tmp, int w)
+{
+    int x;
+    const int w2 = w >> 1;
+    TYPE *b   = (TYPE *)_b;
+    TYPE *tmp = (TYPE *)_tmp;
+
+    tmp[0] = COMPOSE_53iL0(b[w2], b[0], b[w2]);
+    for (x = 1; x < w2; x++)
+        tmp[x] = COMPOSE_53iL0(b[x+w2-1], b[x], b[x+w2]);
+
+    // extend the edges
+    tmp[-1]   = tmp[0];
+    tmp[w2+1] = tmp[w2] = tmp[w2-1];
+
+    for (x = 0; x < w2; x++) {
+        b[2*x  ] = (tmp[x] + 1)>>1;
+        b[2*x+1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1)>>1;
+    }
+}
+
+static void RENAME(horizontal_compose_dd137i)(uint8_t *_b, uint8_t *_tmp, int w)
+{
+    const int w2 = w >> 1;
+    int x;
+    TYPE *b   = (TYPE *)_b;
+    TYPE *tmp = (TYPE *)_tmp;
+
+    tmp[0] = COMPOSE_DD137iL0(b[w2], b[w2], b[0], b[w2  ], b[w2+1]);
+    tmp[1] = COMPOSE_DD137iL0(b[w2], b[w2], b[1], b[w2+1], b[w2+2]);
+    for (x = 2; x < w2-1; x++)
+        tmp[x] = COMPOSE_DD137iL0(b[x+w2-2], b[x+w2-1], b[x], b[x+w2], b[x+w2+1]);
+    tmp[w2-1] = COMPOSE_DD137iL0(b[w-3], b[w-2], b[w2-1], b[w-1], b[w-1]);
+
+    // extend the edges
+    tmp[-1]   = tmp[0];
+    tmp[w2+1] = tmp[w2] = tmp[w2-1];
+
+    for (x = 0; x < w2; x++) {
+        b[2*x  ] = (tmp[x] + 1)>>1;
+        b[2*x+1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1)>>1;
+    }
+}
+
+static av_always_inline void RENAME(horizontal_compose_haari)(TYPE *b, TYPE *temp,
+                                                              int w, int shift)
+{
+    const int w2 = w >> 1;
+    int x;
+
+    for (x = 0; x < w2; x++) {
+        temp[x   ] = COMPOSE_HAARiL0(b[x   ], b[x+w2]);
+        temp[x+w2] = COMPOSE_HAARiH0(b[x+w2], temp[x]);
+    }
+
+    RENAME(interleave)(b, temp, temp+w2, w2, shift, shift);
+}
+
+static void RENAME(horizontal_compose_haar0i)(uint8_t *_b, uint8_t *_temp, int w)
+{
+    TYPE *b    = (TYPE *)_b;
+    TYPE *temp = (TYPE *)_temp;
+    RENAME(horizontal_compose_haari)(b, temp, w, 0);
+}
+
+static void RENAME(horizontal_compose_haar1i)(uint8_t *_b, uint8_t *_temp, int w)
+{
+    TYPE *b    = (TYPE *)_b;
+    TYPE *temp = (TYPE *)_temp;
+    RENAME(horizontal_compose_haari)(b, temp, w, 1);
+}
+
+static void RENAME(horizontal_compose_fidelityi)(uint8_t *_b, uint8_t *_tmp, int w)
+{
+    const int w2 = w >> 1;
+    int i, x;
+    TYPE v[8];
+    TYPE *b   = (TYPE *)_b;
+    TYPE *tmp = (TYPE *)_tmp;
+
+    for (x = 0; x < w2; x++) {
+        for (i = 0; i < 8; i++)
+            v[i] = b[av_clip(x-3+i, 0, w2-1)];
+        tmp[x] = COMPOSE_FIDELITYiH0(v[0], v[1], v[2], v[3], b[x+w2], v[4], v[5], v[6], v[7]);
+    }
+
+    for (x = 0; x < w2; x++) {
+        for (i = 0; i < 8; i++)
+            v[i] = tmp[av_clip(x-4+i, 0, w2-1)];
+        tmp[x+w2] = COMPOSE_FIDELITYiL0(v[0], v[1], v[2], v[3], b[x], v[4], v[5], v[6], v[7]);
+    }
+
+    RENAME(interleave)(b, tmp+w2, tmp, w2, 0, 0);
+}
+
+static void RENAME(horizontal_compose_daub97i)(uint8_t *_b, uint8_t *_temp, int w)
+{
+    const int w2 = w >> 1;
+    int x, b0, b1, b2;
+    TYPE *b    = (TYPE *)_b;
+    TYPE *temp = (TYPE *)_temp;
+
+    temp[0] = COMPOSE_DAUB97iL1(b[w2], b[0], b[w2]);
+    for (x = 1; x < w2; x++) {
+        temp[x     ] = COMPOSE_DAUB97iL1(b[x+w2-1], b[x     ], b[x+w2]);
+        temp[x+w2-1] = COMPOSE_DAUB97iH1(temp[x-1], b[x+w2-1], temp[x]);
+    }
+    temp[w-1] = COMPOSE_DAUB97iH1(temp[w2-1], b[w-1], temp[w2-1]);
+
+    // second stage combined with interleave and shift
+    b0 = b2 = COMPOSE_DAUB97iL0(temp[w2], temp[0], temp[w2]);
+    b[0] = (b0 + 1) >> 1;
+    for (x = 1; x < w2; x++) {
+        b2 = COMPOSE_DAUB97iL0(temp[x+w2-1], temp[x     ], temp[x+w2]);
+        b1 = COMPOSE_DAUB97iH0(          b0, temp[x+w2-1], b2        );
+        b[2*x-1] = (b1 + 1) >> 1;
+        b[2*x  ] = (b2 + 1) >> 1;
+        b0 = b2;
+    }
+    b[w-1] = (COMPOSE_DAUB97iH0(b2, temp[w-1], b2) + 1) >> 1;
+}
+
+static void RENAME(vertical_compose_dirac53iH0)(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2,
+                                                int width)
+{
+    int i;
+    TYPE *b0  = (TYPE *)_b0;
+    TYPE *b1 = (TYPE *)_b1;
+    TYPE *b2 = (TYPE *)_b2;
+    for(i=0; i<width; i++){
+        b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]);
+    }
+}
+
+static void RENAME(vertical_compose_dd97iH0)(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2,
+                                             uint8_t *_b3, uint8_t *_b4, int width)
+{
+    int i;
+    TYPE *b0 = (TYPE *)_b0;
+    TYPE *b1 = (TYPE *)_b1;
+    TYPE *b2 = (TYPE *)_b2;
+    TYPE *b3 = (TYPE *)_b3;
+    TYPE *b4 = (TYPE *)_b4;
+    for(i=0; i<width; i++){
+        b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]);
+    }
+}
+
+static void RENAME(vertical_compose_dd137iL0)(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2,
+                                              uint8_t *_b3, uint8_t *_b4, int width)
+{
+    int i;
+    TYPE *b0 = (TYPE *)_b0;
+    TYPE *b1 = (TYPE *)_b1;
+    TYPE *b2 = (TYPE *)_b2;
+    TYPE *b3 = (TYPE *)_b3;
+    TYPE *b4 = (TYPE *)_b4;
+    for(i=0; i<width; i++){
+        b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]);
+    }
+}
+
+static void RENAME(vertical_compose_haar)(uint8_t *_b0, uint8_t *_b1, int width)
+{
+    int i;
+    TYPE *b0 = (TYPE *)_b0;
+    TYPE *b1 = (TYPE *)_b1;
+
+    for (i = 0; i < width; i++) {
+        b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]);
+        b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]);
+    }
+}
+
+static void RENAME(vertical_compose_fidelityiH0)(uint8_t *_dst, uint8_t *_b[8], int width)
+{
+    int i;
+    TYPE *dst = (TYPE *)_dst;
+    TYPE *b0  = (TYPE *)_b[0];
+    TYPE *b1  = (TYPE *)_b[1];
+    TYPE *b2  = (TYPE *)_b[2];
+    TYPE *b3  = (TYPE *)_b[3];
+    TYPE *b4  = (TYPE *)_b[4];
+    TYPE *b5  = (TYPE *)_b[5];
+    TYPE *b6  = (TYPE *)_b[6];
+    TYPE *b7  = (TYPE *)_b[7];
+    for(i=0; i<width; i++){
+        dst[i] = COMPOSE_FIDELITYiH0(b0[i], b1[i], b2[i], b3[i], dst[i], b4[i], b5[i], b6[i], b7[i]);
+    }
+}
+
+static void RENAME(vertical_compose_fidelityiL0)(uint8_t *_dst, uint8_t *_b[8], int width)
+{
+    int i;
+    TYPE *dst = (TYPE *)_dst;
+    TYPE *b0  = (TYPE *)_b[0];
+    TYPE *b1  = (TYPE *)_b[1];
+    TYPE *b2  = (TYPE *)_b[2];
+    TYPE *b3  = (TYPE *)_b[3];
+    TYPE *b4  = (TYPE *)_b[4];
+    TYPE *b5  = (TYPE *)_b[5];
+    TYPE *b6  = (TYPE *)_b[6];
+    TYPE *b7  = (TYPE *)_b[7];
+
+    for(i=0; i<width; i++){
+        dst[i] = COMPOSE_FIDELITYiL0(b0[i], b1[i], b2[i], b3[i], dst[i], b4[i], b5[i], b6[i], b7[i]);
+    }
+}
+
+static void RENAME(vertical_compose_daub97iH0)(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width)
+{
+    int i;
+    TYPE *b0 = (TYPE *)_b0;
+    TYPE *b1 = (TYPE *)_b1;
+    TYPE *b2 = (TYPE *)_b2;
+
+    for(i=0; i<width; i++){
+        b1[i] = COMPOSE_DAUB97iH0(b0[i], b1[i], b2[i]);
+    }
+}
+
+static void RENAME(vertical_compose_daub97iH1)(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width)
+{
+    int i;
+    TYPE *b0 = (TYPE *)_b0; /* rows are accessed as arrays of TYPE samples */
+    TYPE *b1 = (TYPE *)_b1;
+    TYPE *b2 = (TYPE *)_b2;
+    /* Rewrite the middle row b1 in place from itself and rows b0 and b2, `width` samples. */
+    for(i=0; i<width; i++){
+        b1[i] = COMPOSE_DAUB97iH1(b0[i], b1[i], b2[i]);
+    }
+}
+
+static void RENAME(vertical_compose_daub97iL0)(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width)
+{
+    int i;
+    TYPE *b0 = (TYPE *)_b0; /* rows are accessed as arrays of TYPE samples */
+    TYPE *b1 = (TYPE *)_b1;
+    TYPE *b2 = (TYPE *)_b2;
+    /* Rewrite the middle row b1 in place from itself and rows b0 and b2, `width` samples. */
+    for(i=0; i<width; i++){
+        b1[i] = COMPOSE_DAUB97iL0(b0[i], b1[i], b2[i]);
+    }
+}
+
+static void RENAME(vertical_compose_daub97iL1)(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width)
+{
+    int i;
+    TYPE *b0 = (TYPE *)_b0; /* rows are accessed as arrays of TYPE samples */
+    TYPE *b1 = (TYPE *)_b1;
+    TYPE *b2 = (TYPE *)_b2;
+    /* Rewrite the middle row b1 in place from itself and rows b0 and b2, `width` samples. */
+    for(i=0; i<width; i++){
+        b1[i] = COMPOSE_DAUB97iL1(b0[i], b1[i], b2[i]);
+    }
+}
+
+static void RENAME(spatial_compose_dd97i_dy)(DWTContext *d, int level, int width, int height, int stride)
+{
+    vertical_compose_3tap vertical_compose_l0 = (void*)d->vertical_compose_l0;
+    vertical_compose_5tap vertical_compose_h0 = (void*)d->vertical_compose_h0;
+    DWTCompose *cs = d->cs + level;
+    /* One sliced step for `level`: filter around row cs->y, then advance cs->y by two rows. */
+    int i, y = cs->y;
+    uint8_t *b[8]; /* b[0..5] are the rows saved in cs->b; b[6], b[7] are the two new rows */
+    for (i = 0; i < 6; i++)
+        b[i] = cs->b[i];
+    b[6] = d->buffer + av_clip(y+5, 0, height-2)*stride;
+    b[7] = d->buffer + av_clip(y+6, 1, height-1)*stride;
+    /* The unsigned comparisons are false both for negative row numbers and for rows >= height. */
+    if(y+5<(unsigned)height) vertical_compose_l0(      b[5], b[6], b[7],       width);
+    if(y+1<(unsigned)height) vertical_compose_h0(b[0], b[2], b[3], b[4], b[6], width);
+
+    if(y-1<(unsigned)height) d->horizontal_compose(b[0], d->temp, width);
+    if(y+0<(unsigned)height) d->horizontal_compose(b[1], d->temp, width);
+    /* Slide the window down by two rows and store it for the next call. */
+    for (i = 0; i < 6; i++)
+        cs->b[i] = b[i+2];
+    cs->y += 2;
+}
+
+static void RENAME(spatial_compose_dirac53i_dy)(DWTContext *d, int level, int width, int height, int stride)
+{
+    vertical_compose_3tap vertical_compose_l0 = (void*)d->vertical_compose_l0;
+    vertical_compose_3tap vertical_compose_h0 = (void*)d->vertical_compose_h0;
+    DWTCompose *cs = d->cs + level;
+    /* One sliced step for `level`: filter around row cs->y, then advance cs->y by two rows. */
+    int y= cs->y;
+    uint8_t *b[4] = { cs->b[0], cs->b[1] }; /* two rows saved by the previous call (or by the init) */
+    b[2] = d->buffer + avpriv_mirror(y+1, height-1)*stride; /* two new rows; index passed through avpriv_mirror() */
+    b[3] = d->buffer + avpriv_mirror(y+2, height-1)*stride;
+    /* The unsigned comparisons are false both for negative row numbers and for rows >= height. */
+    if(y+1<(unsigned)height) vertical_compose_l0(b[1], b[2], b[3], width);
+    if(y+0<(unsigned)height) vertical_compose_h0(b[0], b[1], b[2], width);
+
+    if(y-1<(unsigned)height) d->horizontal_compose(b[0], d->temp, width);
+    if(y+0<(unsigned)height) d->horizontal_compose(b[1], d->temp, width);
+    /* Keep the two newest rows for the next call. */
+    cs->b[0] = b[2];
+    cs->b[1] = b[3];
+    cs->y += 2;
+}
+
+static void RENAME(spatial_compose_dd137i_dy)(DWTContext *d, int level, int width, int height, int stride)
+{
+    vertical_compose_5tap vertical_compose_l0 = (void*)d->vertical_compose_l0;
+    vertical_compose_5tap vertical_compose_h0 = (void*)d->vertical_compose_h0;
+    DWTCompose *cs = d->cs + level;
+    /* One sliced step for `level`: filter around row cs->y, then advance cs->y by two rows. */
+    int i, y = cs->y;
+    uint8_t *b[10]; /* b[0..7] are the rows saved in cs->b; b[8], b[9] are the two new rows */
+    for (i = 0; i < 8; i++)
+        b[i] = cs->b[i];
+    b[8] = d->buffer + av_clip(y+7, 0, height-2)*stride;
+    b[9] = d->buffer + av_clip(y+8, 1, height-1)*stride;
+    /* The unsigned comparisons are false both for negative row numbers and for rows >= height. */
+    if(y+5<(unsigned)height) vertical_compose_l0(b[3], b[5], b[6], b[7], b[9], width);
+    if(y+1<(unsigned)height) vertical_compose_h0(b[0], b[2], b[3], b[4], b[6], width);
+
+    if(y-1<(unsigned)height) d->horizontal_compose(b[0], d->temp, width);
+    if(y+0<(unsigned)height) d->horizontal_compose(b[1], d->temp, width);
+    /* Slide the window down by two rows and store it for the next call. */
+    for (i = 0; i < 8; i++)
+        cs->b[i] = b[i+2];
+    cs->y += 2;
+}
+
+// Haar assumes that height is even (always true for Dirac): rows y-1 and y are used without any bounds check.
+static void RENAME(spatial_compose_haari_dy)(DWTContext *d, int level, int width, int height, int stride)
+{
+    vertical_compose_2tap vertical_compose = (void*)d->vertical_compose;
+    int y = d->cs[level].y;
+    uint8_t *b0 = d->buffer + (y-1)*stride; /* the row pair handled by this call */
+    uint8_t *b1 = d->buffer + (y  )*stride;
+    /* Vertical step on the pair first, then the horizontal pass on each of the two rows. */
+    vertical_compose(b0, b1, width);
+    d->horizontal_compose(b0, d->temp, width);
+    d->horizontal_compose(b1, d->temp, width);
+
+    d->cs[level].y += 2;
+}
+
+// The fidelity filter is not sliced (its 9-tap support makes that awkward): one call processes every row of the level.
+// Fortunately, this filter isn't used in practice.
+static void RENAME(spatial_compose_fidelity)(DWTContext *d, int level, int width, int height, int stride)
+{
+    vertical_compose_9tap vertical_compose_l0 = (void*)d->vertical_compose_l0;
+    vertical_compose_9tap vertical_compose_h0 = (void*)d->vertical_compose_h0;
+    int i, y;
+    uint8_t *b[8];
+    /* Pass 1: each odd row is filtered from eight even rows y-7 .. y+7, clipped to [0, height-2]. */
+    for (y = 1; y < height; y += 2) {
+        for (i = 0; i < 8; i++)
+            b[i] = d->buffer + av_clip((y-7 + 2*i), 0, height-2)*stride;
+        vertical_compose_h0(d->buffer + y*stride, b, width);
+    }
+    /* Pass 2: each even row is filtered from eight odd rows y-7 .. y+7, clipped to [1, height-1]. */
+    for (y = 0; y < height; y += 2) {
+        for (i = 0; i < 8; i++)
+            b[i] = d->buffer + av_clip((y-7 + 2*i), 1, height-1)*stride;
+        vertical_compose_l0(d->buffer + y*stride, b, width);
+    }
+    /* Pass 3: horizontal compose on every row. */
+    for (y = 0; y < height; y++)
+        d->horizontal_compose(d->buffer + y*stride, d->temp, width);
+
+    d->cs[level].y = height+1; /* presumably signals the caller that no rows of this level remain */
+}
+
+static void RENAME(spatial_compose_daub97i_dy)(DWTContext *d, int level, int width, int height, int stride)
+{
+    vertical_compose_3tap vertical_compose_l0 = (void*)d->vertical_compose_l0;
+    vertical_compose_3tap vertical_compose_h0 = (void*)d->vertical_compose_h0;
+    vertical_compose_3tap vertical_compose_l1 = (void*)d->vertical_compose_l1;
+    vertical_compose_3tap vertical_compose_h1 = (void*)d->vertical_compose_h1;
+    DWTCompose *cs = d->cs + level;
+    /* One sliced step for `level`: four vertical stages around row cs->y, then advance by two rows. */
+    int i, y = cs->y;
+    uint8_t *b[6]; /* b[0..3] are the rows saved in cs->b; b[4], b[5] are the two new rows */
+    for (i = 0; i < 4; i++)
+        b[i] = cs->b[i];
+    b[4] = d->buffer + avpriv_mirror(y+3, height-1)*stride;
+    b[5] = d->buffer + avpriv_mirror(y+4, height-1)*stride;
+    /* The unsigned comparisons are false both for negative row numbers and for rows >= height. */
+    if(y+3<(unsigned)height) vertical_compose_l1(b[3], b[4], b[5], width);
+    if(y+2<(unsigned)height) vertical_compose_h1(b[2], b[3], b[4], width);
+    if(y+1<(unsigned)height) vertical_compose_l0(b[1], b[2], b[3], width);
+    if(y+0<(unsigned)height) vertical_compose_h0(b[0], b[1], b[2], width);
+
+    if(y-1<(unsigned)height) d->horizontal_compose(b[0], d->temp, width);
+    if(y+0<(unsigned)height) d->horizontal_compose(b[1], d->temp, width);
+    /* Slide the window down by two rows and store it for the next call. */
+    for (i = 0; i < 4; i++)
+        cs->b[i] = b[i+2];
+    cs->y += 2;
+}
+
+static void RENAME(spatial_compose97i_init)(DWTCompose *cs, uint8_t *buffer, int height, int stride)
+{
+    cs->b[0] = buffer + avpriv_mirror(-3-1, height-1)*stride; /* b[0..3]: rows y-1 .. y+2 for y = -3, indices passed through avpriv_mirror() */
+    cs->b[1] = buffer + avpriv_mirror(-3  , height-1)*stride;
+    cs->b[2] = buffer + avpriv_mirror(-3+1, height-1)*stride;
+    cs->b[3] = buffer + avpriv_mirror(-3+2, height-1)*stride;
+    cs->y = -3; /* starting row; spatial_compose_daub97i_dy reads it back as y */
+}
+
+static void RENAME(spatial_compose53i_init)(DWTCompose *cs, uint8_t *buffer, int height, int stride)
+{
+    cs->b[0] = buffer + avpriv_mirror(-1-1, height-1)*stride; /* b[0], b[1]: rows y-1 and y for y = -1, indices passed through avpriv_mirror() */
+    cs->b[1] = buffer + avpriv_mirror(-1  , height-1)*stride;
+    cs->y = -1; /* starting row; spatial_compose_dirac53i_dy reads it back as y */
+}
+
+static void RENAME(spatial_compose_dd97i_init)(DWTCompose *cs, uint8_t *buffer, int height, int stride)
+{
+    cs->b[0] = buffer + av_clip(-5-1, 0, height-2)*stride; /* b[0..5]: rows y-1 .. y+4 for y = -5 */
+    cs->b[1] = buffer + av_clip(-5  , 1, height-1)*stride; /* even row numbers clip to [0, height-2], odd ones to [1, height-1] */
+    cs->b[2] = buffer + av_clip(-5+1, 0, height-2)*stride;
+    cs->b[3] = buffer + av_clip(-5+2, 1, height-1)*stride;
+    cs->b[4] = buffer + av_clip(-5+3, 0, height-2)*stride;
+    cs->b[5] = buffer + av_clip(-5+4, 1, height-1)*stride;
+    cs->y = -5; /* starting row; spatial_compose_dd97i_dy reads it back as y */
+}
+
+static void RENAME(spatial_compose_dd137i_init)(DWTCompose *cs, uint8_t *buffer, int height, int stride)
+{
+    cs->b[0] = buffer + av_clip(-5-1, 0, height-2)*stride; /* b[0..7]: rows y-1 .. y+6 for y = -5 */
+    cs->b[1] = buffer + av_clip(-5  , 1, height-1)*stride; /* even row numbers clip to [0, height-2], odd ones to [1, height-1] */
+    cs->b[2] = buffer + av_clip(-5+1, 0, height-2)*stride;
+    cs->b[3] = buffer + av_clip(-5+2, 1, height-1)*stride;
+    cs->b[4] = buffer + av_clip(-5+3, 0, height-2)*stride;
+    cs->b[5] = buffer + av_clip(-5+4, 1, height-1)*stride;
+    cs->b[6] = buffer + av_clip(-5+5, 0, height-2)*stride;
+    cs->b[7] = buffer + av_clip(-5+6, 1, height-1)*stride;
+    cs->y = -5; /* starting row; spatial_compose_dd137i_dy reads it back as y */
+}
+
+static int RENAME(ff_spatial_idwt_init)(DWTContext *d, enum dwt_type type)
+{
+    int level;
+    /* Set up d for wavelet `type`; returns 0, or AVERROR_INVALIDDATA for an unknown type. */
+    d->temp = (uint8_t *)(((TYPE *)d->temp) + 8); /* advance temp by 8 TYPE elements (done even if type is rejected below) */
+    /* Step 1: per-level compose state (saved row pointers and starting row). */
+    for (level = d->decomposition_count - 1; level >= 0; level--){
+        int hl = d->height >> level; /* height and stride passed to the init helper for this level */
+        int stride_l = d->stride << level;
+
+        switch(type){
+            case DWT_DIRAC_DD9_7:
+                RENAME(spatial_compose_dd97i_init)(d->cs+level, d->buffer, hl, stride_l);
+                break;
+            case DWT_DIRAC_LEGALL5_3:
+                RENAME(spatial_compose53i_init)(d->cs+level, d->buffer, hl, stride_l);
+                break;
+            case DWT_DIRAC_DD13_7:
+                RENAME(spatial_compose_dd137i_init)(d->cs+level, d->buffer, hl, stride_l);
+                break;
+            case DWT_DIRAC_HAAR0:
+            case DWT_DIRAC_HAAR1:
+                d->cs[level].y = 1; /* the Haar step works on rows y-1 and y */
+                break;
+            case DWT_DIRAC_DAUB9_7:
+                RENAME(spatial_compose97i_init)(d->cs+level, d->buffer, hl, stride_l);
+                break;
+            default: /* DWT_DIRAC_FIDELITY and any unrecognised type */
+                d->cs[level].y = 0;
+                break;
+        }
+    }
+    /* Step 2: select the compose callbacks and the `support` value for the type. */
+    switch (type) {
+        case DWT_DIRAC_DD9_7:
+            d->spatial_compose = RENAME(spatial_compose_dd97i_dy);
+            d->vertical_compose_l0 = (void*)RENAME(vertical_compose53iL0);
+            d->vertical_compose_h0 = (void*)RENAME(vertical_compose_dd97iH0);
+            d->horizontal_compose = RENAME(horizontal_compose_dd97i);
+            d->support = 7;
+            break;
+        case DWT_DIRAC_LEGALL5_3:
+            d->spatial_compose = RENAME(spatial_compose_dirac53i_dy);
+            d->vertical_compose_l0 = (void*)RENAME(vertical_compose53iL0);
+            d->vertical_compose_h0 = (void*)RENAME(vertical_compose_dirac53iH0);
+            d->horizontal_compose = RENAME(horizontal_compose_dirac53i);
+            d->support = 3;
+            break;
+        case DWT_DIRAC_DD13_7:
+            d->spatial_compose = RENAME(spatial_compose_dd137i_dy);
+            d->vertical_compose_l0 = (void*)RENAME(vertical_compose_dd137iL0);
+            d->vertical_compose_h0 = (void*)RENAME(vertical_compose_dd97iH0); /* same h0 filter as DD9_7 */
+            d->horizontal_compose = RENAME(horizontal_compose_dd137i);
+            d->support = 7;
+            break;
+        case DWT_DIRAC_HAAR0:
+        case DWT_DIRAC_HAAR1:
+            d->spatial_compose = RENAME(spatial_compose_haari_dy);
+            d->vertical_compose = (void*)RENAME(vertical_compose_haar);
+            if (type == DWT_DIRAC_HAAR0) /* the two Haar variants differ only in the horizontal function */
+                d->horizontal_compose = RENAME(horizontal_compose_haar0i);
+            else
+                d->horizontal_compose = RENAME(horizontal_compose_haar1i);
+            d->support = 1;
+            break;
+        case DWT_DIRAC_FIDELITY:
+            d->spatial_compose = RENAME(spatial_compose_fidelity);
+            d->vertical_compose_l0 = (void*)RENAME(vertical_compose_fidelityiL0);
+            d->vertical_compose_h0 = (void*)RENAME(vertical_compose_fidelityiH0);
+            d->horizontal_compose = RENAME(horizontal_compose_fidelityi);
+            d->support = 0; // not really used
+            break;
+        case DWT_DIRAC_DAUB9_7:
+            d->spatial_compose = RENAME(spatial_compose_daub97i_dy);
+            d->vertical_compose_l0 = (void*)RENAME(vertical_compose_daub97iL0);
+            d->vertical_compose_h0 = (void*)RENAME(vertical_compose_daub97iH0);
+            d->vertical_compose_l1 = (void*)RENAME(vertical_compose_daub97iL1);
+            d->vertical_compose_h1 = (void*)RENAME(vertical_compose_daub97iH1);
+            d->horizontal_compose = RENAME(horizontal_compose_daub97i);
+            d->support = 5;
+            break;
+        default:
+            return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
+#undef RENAME
+#undef TYPE
diff --git a/libavcodec/dirac_parser.c b/libavcodec/dirac_parser.c
index 5c9d266..1ade44a 100644
--- a/libavcodec/dirac_parser.c
+++ b/libavcodec/dirac_parser.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2007-2008 Marco Gerards <marco@gnu.org>
  * Copyright (c) 2008 BBC, Anuradha Suraparaju <asuraparaju@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -100,17 +100,36 @@ typedef struct DiracParseUnit {
 static int unpack_parse_unit(DiracParseUnit *pu, DiracParseContext *pc,
                              int offset)
 {
-    uint8_t *start = pc->buffer + offset;
-    uint8_t *end   = pc->buffer + pc->index;
-    if (start < pc->buffer || (start + 13 > end))
+    int i;
+    uint8_t *start; /* unsigned bytes, the pointer type the replaced code used for pc->buffer + offset */
+    static const uint8_t valid_pu_types[] = { /* accepted parse codes; the loop below relies on there being 17 */
+        0x00, 0x10, 0x20, 0x30, 0x08, 0x48, 0xC8, 0xE8, 0x0A, 0x0C, 0x0D, 0x0E,
+        0x4C, 0x09, 0xCC, 0x88, 0xCB
+    };
+    /* Fill *pu from the 13 bytes at pc->buffer + offset; returns 1 if they look like a parse unit header, else 0. */
+    if (offset < 0 || pc->index - 13 < offset)
         return 0;
+    /* From here on offset + 13 <= pc->index, so bytes 4..12 read below lie before pc->index. */
+    start = pc->buffer + offset;
     pu->pu_type = start[4];
 
     pu->next_pu_offset = AV_RB32(start + 5);
     pu->prev_pu_offset = AV_RB32(start + 9);
 
-    if (pu->pu_type == 0x10 && pu->next_pu_offset == 0)
-        pu->next_pu_offset = 13;
+    /* Check for valid parse code */
+    for (i = 0; i < 17; i++)
+        if (valid_pu_types[i] == pu->pu_type)
+            break;
+    if (i == 17) /* loop ran to completion: parse code not in the table */
+        return 0;
+
+    if (pu->pu_type == 0x10 && pu->next_pu_offset == 0x00)
+        pu->next_pu_offset = 13; /* The length of a parse info header */
+
+    /* Check if the parse offsets are somewhat sane */
+    if ((pu->next_pu_offset && pu->next_pu_offset < 13) ||
+        (pu->prev_pu_offset && pu->prev_pu_offset < 13))
+        return 0;
 
     return 1;
 }
@@ -123,7 +142,7 @@ static int dirac_combine_frame(AVCodecParserContext *s, AVCodecContext *avctx,
     DiracParseContext *pc = s->priv_data;
 
     if (pc->overread_index) {
-        memcpy(pc->buffer, pc->buffer + pc->overread_index,
+        memmove(pc->buffer, pc->buffer + pc->overread_index,
                pc->index - pc->overread_index);
         pc->index         -= pc->overread_index;
         pc->overread_index = 0;
@@ -139,6 +158,8 @@ static int dirac_combine_frame(AVCodecParserContext *s, AVCodecContext *avctx,
         void *new_buffer =
             av_fast_realloc(pc->buffer, &pc->buffer_size,
                             pc->index + (*buf_size - pc->sync_offset));
+        if (!new_buffer)
+            return AVERROR(ENOMEM);
         pc->buffer = new_buffer;
         memcpy(pc->buffer + pc->index, (*buf + pc->sync_offset),
                *buf_size - pc->sync_offset);
@@ -149,6 +170,8 @@ static int dirac_combine_frame(AVCodecParserContext *s, AVCodecContext *avctx,
         DiracParseUnit pu1, pu;
         void *new_buffer = av_fast_realloc(pc->buffer, &pc->buffer_size,
                                            pc->index + next);
+        if (!new_buffer)
+            return AVERROR(ENOMEM);
         pc->buffer = new_buffer;
         memcpy(pc->buffer + pc->index, *buf, next);
         pc->index += next;
@@ -161,7 +184,9 @@ static int dirac_combine_frame(AVCodecParserContext *s, AVCodecContext *avctx,
          * we can be pretty sure that we have a valid parse unit */
         if (!unpack_parse_unit(&pu1, pc, pc->index - 13)                     ||
             !unpack_parse_unit(&pu, pc, pc->index - 13 - pu1.prev_pu_offset) ||
-            pu.next_pu_offset != pu1.prev_pu_offset) {
+            pu.next_pu_offset != pu1.prev_pu_offset                          ||
+            pc->index < pc->dirac_unit_size + 13LL + pu1.prev_pu_offset
+        ) {
             pc->index              -= 9;
             *buf_size               = next - 9;
             pc->header_bytes_needed = 9;
@@ -184,7 +209,7 @@ static int dirac_combine_frame(AVCodecParserContext *s, AVCodecContext *avctx,
         }
 
         /* Get the picture number to set the pts and dts*/
-        if (parse_timing_info) {
+        if (parse_timing_info && pu1.prev_pu_offset >= 13) {
             uint8_t *cur_pu = pc->buffer +
                               pc->index - 13 - pu1.prev_pu_offset;
             int pts = AV_RB32(cur_pu + 13);
@@ -245,7 +270,7 @@ static void dirac_parse_close(AVCodecParserContext *s)
     DiracParseContext *pc = s->priv_data;
 
     if (pc->buffer_size > 0)
-        av_free(pc->buffer);
+        av_freep(&pc->buffer);
 }
 
 AVCodecParser ff_dirac_parser = {
diff --git a/libavcodec/diracdec.c b/libavcodec/diracdec.c
new file mode 100644
index 0000000..1d7bb9b
--- /dev/null
+++ b/libavcodec/diracdec.c
@@ -0,0 +1,2221 @@
+/*
+ * Copyright (C) 2007 Marco Gerards <marco@gnu.org>
+ * Copyright (C) 2009 David Conrad
+ * Copyright (C) 2011 Jordi Ortiz
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Dirac Decoder
+ * @author Marco Gerards <marco@gnu.org>, David Conrad, Jordi Ortiz <nenjordi@gmail.com>
+ */
+
+#include "avcodec.h"
+#include "get_bits.h"
+#include "bytestream.h"
+#include "internal.h"
+#include "golomb.h"
+#include "dirac_arith.h"
+#include "mpeg12data.h"
+#include "libavcodec/mpegvideo.h"
+#include "mpegvideoencdsp.h"
+#include "dirac_dwt.h"
+#include "dirac.h"
+#include "diractab.h"
+#include "diracdsp.h"
+#include "videodsp.h"
+
+/**
+ * The spec limits this to 3 for frame coding, but in practice can be as high as 6
+ */
+#define MAX_REFERENCE_FRAMES 8
+#define MAX_DELAY 5         /* limit for main profile for frame coding (TODO: field coding) */
+#define MAX_FRAMES (MAX_REFERENCE_FRAMES + MAX_DELAY + 1)
+#define MAX_QUANT 255        /* max quant for VC-2 */
+#define MAX_BLOCKSIZE 32    /* maximum xblen/yblen we support */
+
+/**
+ * DiracBlock->ref flags, if set then the block does MC from the given ref
+ */
+#define DIRAC_REF_MASK_REF1   1
+#define DIRAC_REF_MASK_REF2   2
+#define DIRAC_REF_MASK_GLOBAL 4
+
+/**
+ * Value of Picture.reference when Picture is not a reference picture, but
+ * is held for delayed output.
+ */
+#define DELAYED_PIC_REF 4
+
+#define CALC_PADDING(size, depth) /* round size up to a multiple of 1 << depth */ \
+    ((((size) + (1 << (depth)) - 1) >> (depth)) << (depth))
+
+#define DIVRNDUP(a, b) (((a) + (b) - 1) / (b))
+
+typedef struct {
+    AVFrame *avframe;       /* allocated in dirac_decode_init(), freed in dirac_decode_end() */
+    int interpolated[3];    /* 1 if hpel[] is valid */
+    uint8_t *hpel[3][4];
+    uint8_t *hpel_base[3][4]; /* entries [..][1..3] are released in free_sequence_buffers() */
+    int reference;          /* DELAYED_PIC_REF when the picture is only held for delayed output */
+} DiracFrame;
+
+typedef struct {
+    union {
+        int16_t mv[2][2];
+        int16_t dc[3];
+    } u; /* anonymous unions aren't in C99 :( */
+    uint8_t ref; /* DIRAC_REF_MASK_* flags: which references the block does MC from */
+} DiracBlock;
+
+typedef struct SubBand {
+    int level;
+    int orientation; /* compared against enum dirac_subband values (subband_hl, subband_lh) */
+    int stride; /* in bytes */
+    int width;
+    int height;
+    int pshift; /* coeff_unpack_arith_* turns stride into an element stride with stride >> (1+pshift) */
+    int quant;
+    uint8_t *ibuf; /* coefficient storage; read as int16_t or int32_t by coeff_unpack_arith_* */
+    struct SubBand *parent; /* when non-NULL, its coefficient at (x>>1, y>>1) feeds the arith context */
+
+    /* for low delay */
+    unsigned length;
+    const uint8_t *coeff_data;
+} SubBand;
+
+typedef struct Plane {
+    DWTPlane idwt; /* buf_base and tmp are allocated in alloc_sequence_buffers() */
+
+    int width;
+    int height;
+    ptrdiff_t stride;
+
+    /* block length */
+    uint8_t xblen;
+    uint8_t yblen;
+    /* block separation (block n+1 starts after this many pixels in block n) */
+    uint8_t xbsep;
+    uint8_t ybsep;
+    /* amount of overspill on each edge (half of the overlap between blocks) */
+    uint8_t xoffset;
+    uint8_t yoffset;
+
+    SubBand band[MAX_DWT_LEVELS][4]; /* presumably indexed [level][orientation] -- TODO confirm */
+} Plane;
+
+typedef struct DiracContext {
+    AVCodecContext *avctx;
+    MpegvideoEncDSPContext mpvencdsp;
+    VideoDSPContext vdsp;
+    DiracDSPContext diracdsp;
+    DiracVersionInfo version;
+    GetBitContext gb;
+    AVDiracSeqHeader seq;
+    int seen_sequence_header;
+    int frame_number;           /* number of the next frame to display       */
+    Plane plane[3];
+    int chroma_x_shift;
+    int chroma_y_shift;
+
+    int bit_depth;              /* bit depth                                 */
+    int pshift;                 /* pixel shift = bit_depth > 8               */
+
+    int zero_res;               /* zero residue flag                         */
+    int is_arith;               /* whether coeffs use arith or golomb coding */
+    int core_syntax;            /* use core syntax only                      */
+    int low_delay;              /* use the low delay syntax                  */
+    int hq_picture;             /* high quality picture, enables low_delay   */
+    int ld_picture;             /* use low delay picture, turns on low_delay */
+    int dc_prediction;          /* has dc prediction                         */
+    int globalmc_flag;          /* use global motion compensation            */
+    int num_refs;               /* number of reference pictures              */
+
+    /* wavelet decoding */
+    unsigned wavelet_depth;     /* depth of the IDWT                         */
+    unsigned wavelet_idx;
+
+    /**
+     * schroedinger older than 1.0.8 doesn't store
+     * quant delta if only one codeblock exists in a band
+     */
+    unsigned old_delta_quant;
+    unsigned codeblock_mode;
+
+    unsigned num_x;              /* number of horizontal slices               */
+    unsigned num_y;              /* number of vertical slices                 */
+
+    struct {
+        unsigned width;
+        unsigned height;
+    } codeblock[MAX_DWT_LEVELS+1];
+
+    struct {
+        AVRational bytes;       /* average bytes per slice                   */
+        uint8_t quant[MAX_DWT_LEVELS][4]; /* [DIRAC_STD] E.1 */
+    } lowdelay;
+
+    struct {
+        unsigned prefix_bytes;
+        uint64_t size_scaler;
+    } highquality;
+
+    struct {
+        int pan_tilt[2];        /* pan/tilt vector                           */
+        int zrs[2][2];          /* zoom/rotate/shear matrix                  */
+        int perspective[2];     /* perspective vector                        */
+        unsigned zrs_exp;
+        unsigned perspective_exp;
+    } globalmc[2];
+
+    /* motion compensation */
+    uint8_t mv_precision;       /* motion vector precision; presumably [DIRAC_STD] MV_PRECISION */
+    int16_t weight[2];          /* [DIRAC_STD] REF1_WT and REF2_WT           */
+    unsigned weight_log2denom;  /* [DIRAC_STD] REFS_WT_PRECISION             */
+
+    int blwidth;                /* number of blocks (horizontally)           */
+    int blheight;               /* number of blocks (vertically)             */
+    int sbwidth;                /* number of superblocks (horizontally)      */
+    int sbheight;               /* number of superblocks (vertically)        */
+
+    uint8_t *sbsplit;
+    DiracBlock *blmotion;
+
+    uint8_t *edge_emu_buffer[4];
+    uint8_t *edge_emu_buffer_base;
+
+    uint16_t *mctmp;            /* buffer holding the MC data multiplied by OBMC weights */
+    uint8_t *mcscratch;
+    int buffer_stride;          /* stride the buffers from alloc_buffers() were sized for; 0 if none */
+
+    DECLARE_ALIGNED(16, uint8_t, obmc_weight)[3][MAX_BLOCKSIZE*MAX_BLOCKSIZE];
+
+    void (*put_pixels_tab[4])(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+    void (*avg_pixels_tab[4])(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+    void (*add_obmc)(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+    dirac_weight_func weight_func;
+    dirac_biweight_func biweight_func;
+
+    DiracFrame *current_picture;
+    DiracFrame *ref_pics[2];
+
+    DiracFrame *ref_frames[MAX_REFERENCE_FRAMES+1];
+    DiracFrame *delay_frames[MAX_DELAY+1];
+    DiracFrame all_frames[MAX_FRAMES]; /* each avframe is allocated in dirac_decode_init() */
+} DiracContext;
+
+enum dirac_subband {
+    subband_ll = 0,
+    subband_hl = 1, /* coeff_unpack_arith_* predicts the sign from the coefficient one row up */
+    subband_lh = 2, /* coeff_unpack_arith_* predicts the sign from the coefficient to the left */
+    subband_hh = 3,
+    subband_nb, /* number of orientations (4) */
+};
+
+/* Division by 3 via multiply and shift (constants from schroedinger); e.g. 0->0, 1->0, 2->1, 3->1, 4->1, 5->2 */
+static inline int divide3(int x)
+{
+    return ((x+1)*21845 + 10922) >> 16;
+}
+
+static DiracFrame *remove_frame(DiracFrame *framelist[], int picnum)
+{
+    DiracFrame *remove_pic = NULL;
+    int i, remove_idx = -1;
+    /* Scan the NULL-terminated list; with several matches the last one wins (no break). */
+    for (i = 0; framelist[i]; i++)
+        if (framelist[i]->avframe->display_picture_number == picnum) {
+            remove_pic = framelist[i];
+            remove_idx = i;
+        }
+    /* Close the gap by shifting later entries, including the NULL terminator, down by one. */
+    if (remove_pic)
+        for (i = remove_idx; framelist[i]; i++)
+            framelist[i] = framelist[i+1];
+
+    return remove_pic; /* the removed frame, or NULL if picnum was not in the list */
+}
+
+static int add_frame(DiracFrame *framelist[], int maxframes, DiracFrame *frame)
+{
+    int i;
+    for (i = 0; i < maxframes; i++) /* store frame in the first NULL slot among the first maxframes */
+        if (!framelist[i]) {
+            framelist[i] = frame;
+            return 0;
+        }
+    return -1; /* no free slot */
+}
+
+static int alloc_sequence_buffers(DiracContext *s)
+{
+    int sbwidth  = DIVRNDUP(s->seq.width,  4);
+    int sbheight = DIVRNDUP(s->seq.height, 4);
+    int i, w, h, top_padding;
+    /* Allocate the per-plane IDWT buffers plus sbsplit/blmotion; returns 0 or AVERROR(ENOMEM). */
+    /* todo: think more about this / use or set Plane here */
+    for (i = 0; i < 3; i++) {
+        int max_xblen = MAX_BLOCKSIZE >> (i ? s->chroma_x_shift : 0);
+        int max_yblen = MAX_BLOCKSIZE >> (i ? s->chroma_y_shift : 0);
+        w = s->seq.width  >> (i ? s->chroma_x_shift : 0);
+        h = s->seq.height >> (i ? s->chroma_y_shift : 0);
+
+        /* we allocate the max we support here since num decompositions can
+         * change from frame to frame. Stride is aligned to 16 for SIMD, and
+         * 1<<MAX_DWT_LEVELS top padding to avoid if(y>0) in arith decoding
+         * MAX_BLOCKSIZE padding for MC: blocks can spill up to half of that
+         * on each side */
+        top_padding = FFMAX(1<<MAX_DWT_LEVELS, max_yblen/2);
+        w = FFALIGN(CALC_PADDING(w, MAX_DWT_LEVELS), 8); /* FIXME: Should this be 16 for SSE??? */
+        h = top_padding + CALC_PADDING(h, MAX_DWT_LEVELS) + max_yblen/2;
+
+        s->plane[i].idwt.buf_base = av_mallocz_array((w+max_xblen), h * (2 << s->pshift));
+        s->plane[i].idwt.tmp      = av_malloc_array((w+16), 2 << s->pshift);
+        if (!s->plane[i].idwt.buf_base || !s->plane[i].idwt.tmp) /* on failure, buffers allocated so far stay in s */
+            return AVERROR(ENOMEM);
+        s->plane[i].idwt.buf      = s->plane[i].idwt.buf_base + (top_padding*w)*(2 << s->pshift); /* only derived once buf_base is known to be non-NULL */
+    }
+
+    /* fixme: allocate using real stride here */
+    s->sbsplit  = av_malloc_array(sbwidth, sbheight);
+    s->blmotion = av_malloc_array(sbwidth, sbheight * 16 * sizeof(*s->blmotion));
+
+    if (!s->sbsplit || !s->blmotion)
+        return AVERROR(ENOMEM);
+    return 0;
+}
+
+static int alloc_buffers(DiracContext *s, int stride)
+{
+    int w = s->seq.width;
+    int h = s->seq.height;
+    /* (Re)allocate the MC scratch buffers for `stride`; returns 0 or AVERROR(ENOMEM). */
+    av_assert0(stride >= w);
+    stride += 64;
+
+    if (s->buffer_stride >= stride) /* existing buffers are already large enough */
+        return 0;
+    s->buffer_stride = 0; /* stays 0 if one of the allocations below fails */
+
+    av_freep(&s->edge_emu_buffer_base);
+    memset(s->edge_emu_buffer, 0, sizeof(s->edge_emu_buffer));
+    av_freep(&s->mctmp);
+    av_freep(&s->mcscratch);
+
+    s->edge_emu_buffer_base = av_malloc_array(stride, MAX_BLOCKSIZE);
+
+    s->mctmp     = av_malloc_array((stride+MAX_BLOCKSIZE), (h+MAX_BLOCKSIZE) * sizeof(*s->mctmp));
+    s->mcscratch = av_malloc_array(stride, MAX_BLOCKSIZE);
+
+    if (!s->edge_emu_buffer_base || !s->mctmp || !s->mcscratch)
+        return AVERROR(ENOMEM);
+
+    s->buffer_stride = stride; /* the padded stride (argument + 64) */
+    return 0;
+}
+
+static void free_sequence_buffers(DiracContext *s)
+{
+    int i, j, k;
+    /* Release all per-sequence state; the AVFrame objects themselves are kept, only unreferenced. */
+    for (i = 0; i < MAX_FRAMES; i++) {
+        if (s->all_frames[i].avframe->data[0]) { /* dereferences avframe unconditionally: it must be non-NULL */
+            av_frame_unref(s->all_frames[i].avframe);
+            memset(s->all_frames[i].interpolated, 0, sizeof(s->all_frames[i].interpolated));
+        }
+
+        for (j = 0; j < 3; j++)
+            for (k = 1; k < 4; k++) /* index 0 is not freed here */
+                av_freep(&s->all_frames[i].hpel_base[j][k]);
+    }
+
+    memset(s->ref_frames, 0, sizeof(s->ref_frames));
+    memset(s->delay_frames, 0, sizeof(s->delay_frames));
+
+    for (i = 0; i < 3; i++) {
+        av_freep(&s->plane[i].idwt.buf_base);
+        av_freep(&s->plane[i].idwt.tmp);
+    }
+
+    s->buffer_stride = 0; /* makes the next alloc_buffers() call reallocate */
+    av_freep(&s->sbsplit);
+    av_freep(&s->blmotion);
+    av_freep(&s->edge_emu_buffer_base);
+
+    av_freep(&s->mctmp);
+    av_freep(&s->mcscratch);
+}
+
+static av_cold int dirac_decode_init(AVCodecContext *avctx)
+{
+    DiracContext *s = avctx->priv_data;
+    int i;
+    /* Initialise the DSP contexts and allocate one AVFrame per all_frames[] slot; 0 or AVERROR(ENOMEM). */
+    s->avctx = avctx;
+    s->frame_number = -1;
+
+    ff_diracdsp_init(&s->diracdsp);
+    ff_mpegvideoencdsp_init(&s->mpvencdsp, avctx);
+    ff_videodsp_init(&s->vdsp, 8);
+
+    for (i = 0; i < MAX_FRAMES; i++) {
+        s->all_frames[i].avframe = av_frame_alloc();
+        if (!s->all_frames[i].avframe) {
+            while (i > 0) /* roll back the frames allocated so far */
+                av_frame_free(&s->all_frames[--i].avframe);
+            return AVERROR(ENOMEM);
+        }
+    }
+
+    return 0;
+}
+
+static void dirac_decode_flush(AVCodecContext *avctx)
+{
+    DiracContext *s = avctx->priv_data;
+    free_sequence_buffers(s); /* drop all buffers and frame references */
+    s->seen_sequence_header = 0; /* back to the same values dirac_decode_init() starts from */
+    s->frame_number = -1;
+}
+
+static av_cold int dirac_decode_end(AVCodecContext *avctx)
+{
+    DiracContext *s = avctx->priv_data;
+    int i;
+    /* Flush first (it still dereferences the AVFrames), then free the AVFrames; always returns 0. */
+    dirac_decode_flush(avctx);
+    for (i = 0; i < MAX_FRAMES; i++)
+        av_frame_free(&s->all_frames[i].avframe);
+
+    return 0;
+}
+
+#define SIGN_CTX(x) (CTX_SIGN_ZERO + ((x) > 0) - ((x) < 0)) /* CTX_SIGN_ZERO plus the sign of x (-1, 0 or +1) */
+
+static inline int coeff_unpack_golomb(GetBitContext *gb, int qfactor, int qoffset)
+{
+    int sign, coeff;
+    uint32_t buf;
+    /* Read one coefficient via the ff_interleaved_*golomb* tables, dequantize it, then apply its sign bit. */
+    OPEN_READER(re, gb);
+    UPDATE_CACHE(re, gb);
+    buf = GET_CACHE(re, gb);
+
+    if (buf & 0x80000000) { /* top cache bit set: the value is 0; one bit is consumed and no sign bit is read */
+        LAST_SKIP_BITS(re,gb,1);
+        CLOSE_READER(re, gb);
+        return 0;
+    }
+
+    if (buf & 0xAA800000) { /* a bit at offset 2, 4, 6 or 8 from the top is set: a single table lookup on the top 8 bits */
+        buf >>= 32 - 8;
+        SKIP_BITS(re, gb, ff_interleaved_golomb_vlc_len[buf]);
+
+        coeff = ff_interleaved_ue_golomb_vlc_code[buf];
+    } else {
+        unsigned ret = 1;
+        /* Longer code: take 8 bits per iteration; a table length of 9 means the code continues. */
+        do {
+            buf >>= 32 - 8;
+            SKIP_BITS(re, gb,
+                           FFMIN(ff_interleaved_golomb_vlc_len[buf], 8));
+
+            if (ff_interleaved_golomb_vlc_len[buf] != 9) {
+                ret <<= (ff_interleaved_golomb_vlc_len[buf] - 1) >> 1;
+                ret  |= ff_interleaved_dirac_golomb_vlc_code[buf];
+                break;
+            }
+            ret = (ret << 4) | ff_interleaved_dirac_golomb_vlc_code[buf];
+            UPDATE_CACHE(re, gb);
+            buf = GET_CACHE(re, gb);
+        } while (ret<0x8000000U && BITS_AVAILABLE(re, gb)); /* stop once ret gets large or BITS_AVAILABLE reports none left */
+
+        coeff = ret - 1;
+    }
+
+    coeff = (coeff * qfactor + qoffset) >> 2; /* dequantize */
+    sign  = SHOW_SBITS(re, gb, 1); /* presumably 0 or -1 -- TODO confirm against get_bits.h */
+    LAST_SKIP_BITS(re, gb, 1);
+    coeff = (coeff ^ sign) - sign; /* negates coeff when sign is -1, leaves it unchanged when 0 */
+
+    CLOSE_READER(re, gb);
+    return coeff;
+}
+
+#define UNPACK_ARITH(n, type) /* defines coeff_unpack_arith_<n>(): decode one coefficient of `type` into *buf */ \
+    static inline void coeff_unpack_arith_##n(DiracArith *c, int qfactor, int qoffset, \
+                                              SubBand *b, type *buf, int x, int y) \
+    { \
+        int coeff, sign, sign_pred = 0, pred_ctx = CTX_ZPZN_F1; \
+        const int mstride = -(b->stride >> (1+b->pshift)); /* negative stride in elements: buf[mstride] is one row up */ \
+        if (b->parent) { \
+            const type *pbuf = (type *)b->parent->ibuf; \
+            const int stride = b->parent->stride >> (1+b->parent->pshift); \
+            pred_ctx += !!pbuf[stride * (y>>1) + (x>>1)] << 1; /* +2 when the parent coefficient is nonzero */ \
+        } \
+        if (b->orientation == subband_hl) \
+            sign_pred = buf[mstride]; \
+        if (x) { \
+            pred_ctx += !(buf[-1] | buf[mstride] | buf[-1 + mstride]); /* +1 when left, up and up-left are all zero */ \
+            if (b->orientation == subband_lh) \
+                sign_pred = buf[-1]; \
+        } else { \
+            pred_ctx += !buf[mstride]; /* first column: only the coefficient above is examined */ \
+        } \
+        coeff = dirac_get_arith_uint(c, pred_ctx, CTX_COEFF_DATA); \
+        if (coeff) { \
+            coeff = (coeff * qfactor + qoffset) >> 2; /* dequantize */ \
+            sign  = dirac_get_arith_bit(c, SIGN_CTX(sign_pred)); \
+            coeff = (coeff ^ -sign) + sign; /* negates coeff when sign is 1 */ \
+        } \
+        *buf = coeff; \
+    } \
+
+UNPACK_ARITH(8, int16_t)
+UNPACK_ARITH(10, int32_t)
+
+/**
+ * Decode the coefficients of one codeblock, i.e. the rectangle
+ * [left, right) x [top, bottom) of subband b.
+ * [DIRAC_STD] 13.4.3.2 Codeblock unpacking loop. codeblock()
+ *
+ * @param gb           bit reader, used when is_arith is 0
+ * @param c            arithmetic decoder, used when is_arith is non-zero
+ * @param blockcnt_one non-zero when no zero-block flag is coded
+ * @param is_arith     non-zero for arithmetic coding, 0 for golomb coding
+ *
+ * On an invalid or unsupported quantiser an error is logged and the block
+ * is left undecoded.
+ */
+static inline void codeblock(DiracContext *s, SubBand *b,
+                             GetBitContext *gb, DiracArith *c,
+                             int left, int right, int top, int bottom,
+                             int blockcnt_one, int is_arith)
+{
+    uint8_t *row;
+    int qfactor, qoffset;
+    int col, line;
+
+    /* check for any coded coefficients in this codeblock */
+    if (!blockcnt_one) {
+        int skipped = is_arith ? dirac_get_arith_bit(c, CTX_ZERO_BLOCK)
+                               : get_bits1(gb);
+        if (skipped)
+            return;
+    }
+
+    /* optional per-codeblock quantiser delta, accumulated into b->quant */
+    if (s->codeblock_mode && !(s->old_delta_quant && blockcnt_one)) {
+        int delta = is_arith ? dirac_get_arith_int(c, CTX_DELTA_Q_F, CTX_DELTA_Q_DATA)
+                             : dirac_get_se_golomb(gb);
+        int quant = b->quant;
+
+        quant += delta;
+        if (quant < 0) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid quant\n");
+            return;
+        }
+        b->quant = quant;
+    }
+
+    if (b->quant > 115) {
+        av_log(s->avctx, AV_LOG_ERROR, "Unsupported quant %d\n", b->quant);
+        b->quant = 0;
+        return;
+    }
+
+    qfactor = ff_dirac_qscale_tab[b->quant];
+    /* TODO: context pointer? */
+    if (s->num_refs)
+        qoffset = ff_dirac_qoffset_inter_tab[b->quant] + 2;
+    else
+        qoffset = ff_dirac_qoffset_intra_tab[b->quant] + 2;
+
+    row = b->ibuf + top * b->stride;
+    if (is_arith) {
+        for (line = top; line < bottom; line++, row += b->stride) {
+            for (col = left; col < right; col++) {
+                if (b->pshift) {
+                    coeff_unpack_arith_10(c, qfactor, qoffset, b, (int32_t*)(row)+col, col, line);
+                } else {
+                    coeff_unpack_arith_8(c, qfactor, qoffset, b, (int16_t*)(row)+col, col, line);
+                }
+            }
+        }
+    } else {
+        for (line = top; line < bottom; line++, row += b->stride) {
+            for (col = left; col < right; col++) {
+                int val = coeff_unpack_golomb(gb, qfactor, qoffset);
+                if (b->pshift) {
+                    AV_WN32(&row[4*col], val);
+                } else {
+                    AV_WN16(&row[2*col], val);
+                }
+            }
+        }
+    }
+}
+
+/**
+ * Dirac Specification ->
+ * 13.3 intra_dc_prediction(band)
+ *
+ * Generates intra_dc_prediction_<n>(), which undoes DC prediction in place
+ * on a band of coefficients of the given type: the first row accumulates
+ * from the left, the first column from above, and every other coefficient
+ * gets divide3() of the sum of its left, top and top-left neighbours added.
+ */
+#define INTRA_DC_PRED(n, type) \
+    static inline void intra_dc_prediction_##n(SubBand *b) \
+    { \
+        const int linesize = b->stride >> (1+b->pshift); /* row pitch in elements */ \
+        type *cur = (type*)b->ibuf; \
+        int col, row; \
+        \
+        for (col = 1; col < b->width; col++) \
+            cur[col] += cur[col-1]; \
+        \
+        for (row = 1; row < b->height; row++) { \
+            cur += linesize; \
+            cur[0] += cur[-linesize]; \
+            \
+            for (col = 1; col < b->width; col++) { \
+                int pred = cur[col - 1] + cur[col - linesize] + cur[col - linesize - 1]; \
+                cur[col] += divide3(pred); \
+            } \
+        } \
+    } \
+
+INTRA_DC_PRED(8, int16_t)
+INTRA_DC_PRED(10, int32_t)
+
+/**
+ * Dirac Specification ->
+ * 13.4.2 Non-skipped subbands.  subband_coeffs()
+ *
+ * Decodes all codeblocks of subband b from b->coeff_data (b->length bytes)
+ * and, for the LL band of a picture without references, applies DC
+ * prediction afterwards. Does nothing when b->length is 0.
+ *
+ * @param is_arith non-zero for arithmetic coding, 0 for golomb coding
+ */
+static av_always_inline void decode_subband_internal(DiracContext *s, SubBand *b, int is_arith)
+{
+    DiracArith c;
+    GetBitContext gb;
+    int cb_x, cb_y;
+    int top = 0;
+    int cb_width  = s->codeblock[b->level + (b->orientation != subband_ll)].width;
+    int cb_height = s->codeblock[b->level + (b->orientation != subband_ll)].height;
+    const int blockcnt_one = (cb_width + cb_height) == 2;
+
+    if (!b->length)
+        return;
+
+    init_get_bits8(&gb, b->coeff_data, b->length);
+    if (is_arith)
+        ff_dirac_init_arith_decoder(&c, &gb, b->length);
+
+    /* split the band into cb_width x cb_height codeblocks */
+    for (cb_y = 0; cb_y < cb_height; cb_y++) {
+        const int bottom = (b->height * (cb_y+1LL)) / cb_height;
+        int left = 0;
+
+        for (cb_x = 0; cb_x < cb_width; cb_x++) {
+            const int right = (b->width * (cb_x+1LL)) / cb_width;
+
+            codeblock(s, b, &gb, &c, left, right, top, bottom, blockcnt_one, is_arith);
+            left = right;
+        }
+        top = bottom;
+    }
+
+    if (b->orientation != subband_ll || s->num_refs != 0)
+        return;
+
+    if (s->pshift)
+        intra_dc_prediction_10(b);
+    else
+        intra_dc_prediction_8(b);
+}
+
+/**
+ * Job callback handed to avctx->execute(): decodes the arithmetic-coded
+ * subband that b points to.
+ *
+ * @return always 0
+ */
+static int decode_subband_arith(AVCodecContext *avctx, void *b)
+{
+    SubBand *band = b;
+
+    decode_subband_internal(avctx->priv_data, band, 1);
+    return 0;
+}
+
+/**
+ * Job callback handed to avctx->execute(): arg points to a SubBand pointer
+ * whose golomb-coded subband is decoded.
+ *
+ * @return always 0
+ */
+static int decode_subband_golomb(AVCodecContext *avctx, void *arg)
+{
+    SubBand *band = *(SubBand **)arg;
+
+    decode_subband_internal(avctx->priv_data, band, 0);
+    return 0;
+}
+
+/**
+ * Dirac Specification ->
+ * [DIRAC_STD] 13.4.1 core_transform_data()
+ *
+ * Parses the length and quantiser header of every subband of component
+ * comp, records where its coefficient data starts, and decodes the
+ * subbands through avctx->execute().
+ */
+static void decode_component(DiracContext *s, int comp)
+{
+    AVCodecContext *avctx = s->avctx;
+    GetBitContext *gb     = &s->gb;
+    SubBand *bands[3*MAX_DWT_LEVELS+1];
+    enum dirac_subband orientation;
+    int num_bands = 0;
+    int level;
+
+    /* Unpack all subbands at all levels. */
+    for (level = 0; level < s->wavelet_depth; level++) {
+        const int first = !!level; /* orientation 0 only exists at level 0 */
+
+        for (orientation = first; orientation < 4; orientation++) {
+            SubBand *b = &s->plane[comp].band[level][orientation];
+
+            bands[num_bands++] = b;
+
+            align_get_bits(gb);
+            /* [DIRAC_STD] 13.4.2 subband() */
+            b->length = svq3_get_ue_golomb(gb);
+            if (!b->length)
+                continue;
+
+            b->quant = svq3_get_ue_golomb(gb);
+            align_get_bits(gb);
+            b->coeff_data = gb->buffer + get_bits_count(gb)/8;
+            /* never trust a length beyond the bytes left in the buffer */
+            b->length = FFMIN(b->length, FFMAX(get_bits_left(gb)/8, 0));
+            skip_bits_long(gb, b->length*8);
+        }
+
+        /* arithmetic coding has inter-level dependencies, so we can only execute one level at a time */
+        if (s->is_arith)
+            avctx->execute(avctx, decode_subband_arith, &s->plane[comp].band[level][first],
+                           NULL, 4-first, sizeof(SubBand));
+    }
+
+    /* golomb coding has no inter-level dependencies, so we can execute all subbands in parallel */
+    if (!s->is_arith)
+        avctx->execute(avctx, decode_subband_golomb, bands, NULL, num_bands, sizeof(SubBand*));
+}
+
+/*
+ * Helper for decode_subband(): reads one golomb-coded coefficient into
+ * column x of the row at buf1 and, if buf2 is non-NULL, a second one into
+ * column x of the row at buf2. Returns from the ENCLOSING function as soon
+ * as the reader position reaches ebits. Relies on qfactor and qoffset being
+ * in scope and declares a local named buf, so each use needs its own block.
+ */
+#define PARSE_VALUES(type, x, gb, ebits, buf1, buf2) \
+    type *buf = (type *)buf1; \
+    buf[x] = coeff_unpack_golomb(gb, qfactor, qoffset); \
+    if (get_bits_count(gb) >= ebits) \
+        return; \
+    if (buf2) { \
+        buf = (type *)buf2; \
+        buf[x] = coeff_unpack_golomb(gb, qfactor, qoffset); \
+        if (get_bits_count(gb) >= ebits) \
+            return; \
+    } \
+
+/**
+ * Decode the part of subband b1 (and of b2, when not NULL) that belongs to
+ * slice (slice_x, slice_y). With two bands, one coefficient is read for b1
+ * and then one for b2 at every position.
+ *
+ * @param quant    quantiser index; values above 115 are rejected with an
+ *                 error message
+ * @param bits_end reader position at which decoding stops; the function
+ *                 then returns without writing the remaining coefficients
+ */
+static void decode_subband(DiracContext *s, GetBitContext *gb, int quant,
+                           int slice_x, int slice_y, int bits_end,
+                           SubBand *b1, SubBand *b2)
+{
+    /* rectangle covered by this slice, derived from b1's dimensions */
+    const int left   = b1->width  * slice_x    / s->num_x;
+    const int right  = b1->width  *(slice_x+1) / s->num_x;
+    const int top    = b1->height * slice_y    / s->num_y;
+    const int bottom = b1->height *(slice_y+1) / s->num_y;
+
+    uint8_t *buf1 = b1->ibuf + top * b1->stride;
+    uint8_t *buf2 = NULL;
+    int qfactor, qoffset;
+    int x, y;
+
+    if (b2)
+        buf2 = b2->ibuf + top * b2->stride;
+
+    if (quant > 115) {
+        av_log(s->avctx, AV_LOG_ERROR, "Unsupported quant %d\n", quant);
+        return;
+    }
+    qfactor = ff_dirac_qscale_tab[quant & 0x7f];
+    qoffset = ff_dirac_qoffset_intra_tab[quant & 0x7f] + 2;
+
+    /* we have to constantly check for overread since the spec explicitly
+       requires this, with the meaning that all remaining coeffs are set to 0 */
+    if (get_bits_count(gb) >= bits_end)
+        return;
+
+    for (y = top; y < bottom; y++) {
+        if (s->pshift) {
+            for (x = left; x < right; x++) {
+                PARSE_VALUES(int32_t, x, gb, bits_end, buf1, buf2);
+            }
+        } else {
+            for (x = left; x < right; x++) {
+                PARSE_VALUES(int16_t, x, gb, bits_end, buf1, buf2);
+            }
+        }
+
+        buf1 += b1->stride;
+        if (buf2)
+            buf2 += b2->stride;
+    }
+}
+
+/* Used by Low Delay and High Quality profiles */
+typedef struct DiracSlice {
+    GetBitContext gb;   /* bit reader starting at the first byte of the slice */
+    int slice_x;        /* horizontal index of the slice */
+    int slice_y;        /* vertical index of the slice */
+    int bytes;          /* size of the slice in bytes */
+} DiracSlice;
+
+
+/**
+ * Dirac Specification ->
+ * 13.5.2 Slices. slice(sx,sy)
+ *
+ * Job callback handed to avctx->execute(): decodes one low-delay slice.
+ * The slice starts with a 7-bit quantiser index and the luma length in
+ * bits, followed by the luma coefficients and then the chroma coefficients
+ * of both chroma planes, read alternately.
+ *
+ * @param arg the DiracSlice to decode
+ * @return always 0
+ */
+static int decode_lowdelay_slice(AVCodecContext *avctx, void *arg)
+{
+    DiracContext *s = avctx->priv_data;
+    DiracSlice *slice = arg;
+    GetBitContext *gb = &slice->gb;
+    enum dirac_subband orientation;
+    int level, quant, chroma_bits, chroma_end;
+
+    int quant_base  = get_bits(gb, 7); /*[DIRAC_STD] qindex */
+    int length_bits = av_log2(8 * slice->bytes)+1;
+    int luma_bits   = get_bits_long(gb, length_bits);
+    /* never let the luma section extend past the data actually available */
+    int luma_end    = get_bits_count(gb) + FFMIN(luma_bits, get_bits_left(gb));
+
+    /* [DIRAC_STD] 13.5.5.2 luma_slice_band */
+    for (level = 0; level < s->wavelet_depth; level++)
+        for (orientation = !!level; orientation < 4; orientation++) {
+            quant = FFMAX(quant_base - s->lowdelay.quant[level][orientation], 0);
+            decode_subband(s, gb, quant, slice->slice_x, slice->slice_y, luma_end,
+                           &s->plane[0].band[level][orientation], NULL);
+        }
+
+    /* Reposition the reader exactly at the end of the luma section, whether
+       the luma decode stopped short of it or read slightly past it. This
+       matches the equivalent step in decode_hq_slice(). */
+    skip_bits_long(gb, luma_end - get_bits_count(gb));
+
+    /* whatever is left of the slice after header and luma belongs to chroma */
+    chroma_bits = 8*slice->bytes - 7 - length_bits - luma_bits;
+    chroma_end  = get_bits_count(gb) + FFMIN(chroma_bits, get_bits_left(gb));
+    /* [DIRAC_STD] 13.5.5.3 chroma_slice_band */
+    for (level = 0; level < s->wavelet_depth; level++)
+        for (orientation = !!level; orientation < 4; orientation++) {
+            quant = FFMAX(quant_base - s->lowdelay.quant[level][orientation], 0);
+            decode_subband(s, gb, quant, slice->slice_x, slice->slice_y, chroma_end,
+                           &s->plane[1].band[level][orientation],
+                           &s->plane[2].band[level][orientation]);
+        }
+
+    return 0;
+}
+
+/**
+ * VC-2 Specification ->
+ * 13.5.3 hq_slice(sx,sy)
+ *
+ * Job callback handed to avctx->execute(): decodes one high-quality slice.
+ * After the prefix bytes and an 8-bit quantiser index, each of the three
+ * planes is coded as a length byte (scaled by size_scaler) followed by
+ * that many bytes of coefficient data.
+ *
+ * @param arg the DiracSlice to decode
+ * @return 0 on success, AVERROR_INVALIDDATA if a plane would end at or
+ *         beyond bit position INT_MAX
+ */
+static int decode_hq_slice(AVCodecContext *avctx, void *arg)
+{
+    DiracContext *s   = avctx->priv_data;
+    DiracSlice *slice = arg;
+    GetBitContext *gb = &slice->gb;
+    uint8_t quants[MAX_DWT_LEVELS][4];
+    int plane, level, orientation, quant_idx;
+
+    skip_bits_long(gb, 8*s->highquality.prefix_bytes);
+    quant_idx = get_bits(gb, 8);
+
+    /* Slice quantization (slice_quantizers() in the specs) */
+    for (level = 0; level < s->wavelet_depth; level++) {
+        for (orientation = !!level; orientation < 4; orientation++) {
+            const int quant = FFMAX(quant_idx - s->lowdelay.quant[level][orientation], 0);
+
+            quants[level][orientation] = quant;
+        }
+    }
+
+    /* Luma + 2 Chroma planes */
+    for (plane = 0; plane < 3; plane++) {
+        const int64_t length   = s->highquality.size_scaler * get_bits(gb, 8);
+        const int64_t bits_end = get_bits_count(gb) + 8 * length;
+
+        if (bits_end >= INT_MAX) {
+            av_log(s->avctx, AV_LOG_ERROR, "end too far away\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        for (level = 0; level < s->wavelet_depth; level++) {
+            for (orientation = !!level; orientation < 4; orientation++) {
+                decode_subband(s, gb, quants[level][orientation],
+                               slice->slice_x, slice->slice_y, bits_end,
+                               &s->plane[plane].band[level][orientation], NULL);
+            }
+        }
+
+        /* jump to the start of the next plane's data */
+        skip_bits_long(gb, bits_end - get_bits_count(gb));
+    }
+
+    return 0;
+}
+
+/**
+ * Dirac Specification ->
+ * 13.5.1 low_delay_transform_data()
+ *
+ * Splits the remaining picture data into num_x * num_y slices, decodes
+ * them through avctx->execute() and finally applies DC prediction to the
+ * lowest band of each plane when s->dc_prediction is set.
+ *
+ * @return 0 on success, AVERROR(ENOMEM) if the slice table cannot be
+ *         allocated, AVERROR_INVALIDDATA if a high-quality slice is too big
+ */
+static int decode_lowdelay(DiracContext *s)
+{
+    AVCodecContext *avctx = s->avctx;
+    int slice_x, slice_y, bufsize;
+    int64_t bytes = 0;
+    const uint8_t *buf;
+    DiracSlice *slices;
+    int slice_num = 0;
+
+    slices = av_mallocz_array(s->num_x, s->num_y * sizeof(DiracSlice));
+    if (!slices)
+        return AVERROR(ENOMEM);
+
+    align_get_bits(&s->gb);
+    /*[DIRAC_STD] 13.5.2 Slices. slice(sx,sy) */
+    buf = s->gb.buffer + get_bits_count(&s->gb)/8;
+    bufsize = get_bits_left(&s->gb); /* remaining data, in bits */
+
+    if (s->hq_picture) {
+        int i;
+
+        for (slice_y = 0; bufsize > 0 && slice_y < s->num_y; slice_y++) {
+            for (slice_x = 0; bufsize > 0 && slice_x < s->num_x; slice_x++) {
+                /* A slice is the prefix bytes, one quantiser byte and, for
+                   each of the three planes, a length byte followed by
+                   length * size_scaler data bytes. */
+                bytes = s->highquality.prefix_bytes + 1;
+                for (i = 0; i < 3; i++) {
+                    /* only read a length byte that lies inside the
+                       remaining data (indices 0 .. bufsize/8 - 1) */
+                    if (bytes < bufsize/8)
+                        bytes += buf[bytes] * s->highquality.size_scaler + 1;
+                }
+                if (bytes >= INT_MAX) {
+                    av_log(s->avctx, AV_LOG_ERROR, "too many bytes\n");
+                    av_free(slices);
+                    return AVERROR_INVALIDDATA;
+                }
+
+                slices[slice_num].bytes   = bytes;
+                slices[slice_num].slice_x = slice_x;
+                slices[slice_num].slice_y = slice_y;
+                init_get_bits(&slices[slice_num].gb, buf, bufsize);
+                slice_num++;
+
+                buf     += bytes;
+                if (bufsize/8 >= bytes)
+                    bufsize -= bytes*8;
+                else
+                    bufsize = 0;
+            }
+        }
+        avctx->execute(avctx, decode_hq_slice, slices, NULL, slice_num,
+                       sizeof(DiracSlice));
+    } else {
+        for (slice_y = 0; bufsize > 0 && slice_y < s->num_y; slice_y++) {
+            for (slice_x = 0; bufsize > 0 && slice_x < s->num_x; slice_x++) {
+                /* slice sizes follow from the rational bytes-per-slice
+                   value: the difference of two consecutive rounded-down
+                   cumulative sizes */
+                bytes = (slice_num+1) * (int64_t)s->lowdelay.bytes.num / s->lowdelay.bytes.den
+                       - slice_num    * (int64_t)s->lowdelay.bytes.num / s->lowdelay.bytes.den;
+                slices[slice_num].bytes   = bytes;
+                slices[slice_num].slice_x = slice_x;
+                slices[slice_num].slice_y = slice_y;
+                init_get_bits(&slices[slice_num].gb, buf, bufsize);
+                slice_num++;
+
+                buf     += bytes;
+                if (bufsize/8 >= bytes)
+                    bufsize -= bytes*8;
+                else
+                    bufsize = 0;
+            }
+        }
+        avctx->execute(avctx, decode_lowdelay_slice, slices, NULL, slice_num,
+                       sizeof(DiracSlice)); /* [DIRAC_STD] 13.5.2 Slices */
+    }
+
+    if (s->dc_prediction) {
+        if (s->pshift) {
+            intra_dc_prediction_10(&s->plane[0].band[0][0]); /* [DIRAC_STD] 13.3 intra_dc_prediction() */
+            intra_dc_prediction_10(&s->plane[1].band[0][0]); /* [DIRAC_STD] 13.3 intra_dc_prediction() */
+            intra_dc_prediction_10(&s->plane[2].band[0][0]); /* [DIRAC_STD] 13.3 intra_dc_prediction() */
+        } else {
+            intra_dc_prediction_8(&s->plane[0].band[0][0]);
+            intra_dc_prediction_8(&s->plane[1].band[0][0]);
+            intra_dc_prediction_8(&s->plane[2].band[0][0]);
+        }
+    }
+    av_free(slices);
+    return 0;
+}
+
+/**
+ * Set up the three planes: their dimensions (chroma planes are shifted by
+ * the chroma shifts), the padded IDWT dimensions and stride, every
+ * subband's size, stride and start address inside the plane's IDWT buffer,
+ * and the block sizes and offsets used for motion compensation.
+ */
+static void init_planes(DiracContext *s)
+{
+    int i, level, orientation;
+
+    for (i = 0; i < 3; i++) {
+        Plane *p = &s->plane[i];
+        int w, h;
+
+        p->width       = s->seq.width  >> (i ? s->chroma_x_shift : 0);
+        p->height      = s->seq.height >> (i ? s->chroma_y_shift : 0);
+        p->idwt.width  = w = CALC_PADDING(p->width , s->wavelet_depth);
+        p->idwt.height = h = CALC_PADDING(p->height, s->wavelet_depth);
+        p->idwt.stride = FFALIGN(p->idwt.width, 8) << (1 + s->pshift);
+
+        /* walk from the finest level down; each level halves the band size */
+        for (level = s->wavelet_depth-1; level >= 0; level--) {
+            w >>= 1;
+            h >>= 1;
+            for (orientation = !!level; orientation < 4; orientation++) {
+                SubBand *b = &p->band[level][orientation];
+
+                b->pshift      = s->pshift;
+                b->level       = level;
+                b->orientation = orientation;
+                b->width       = w;
+                b->height      = h;
+                b->stride      = p->idwt.stride << (s->wavelet_depth - level);
+
+                /* every band points into the plane's IDWT buffer: odd
+                   orientations start w elements to the right, orientations
+                   2 and 3 half a band stride further down */
+                b->ibuf = p->idwt.buf;
+                if (orientation & 1)
+                    b->ibuf += w << (1+b->pshift);
+                if (orientation > 1)
+                    b->ibuf += (b->stride>>1);
+
+                if (level)
+                    b->parent = &p->band[level-1][orientation];
+            }
+        }
+
+        /* chroma block geometry is derived from the luma plane */
+        if (i > 0) {
+            p->xblen = s->plane[0].xblen >> s->chroma_x_shift;
+            p->yblen = s->plane[0].yblen >> s->chroma_y_shift;
+            p->xbsep = s->plane[0].xbsep >> s->chroma_x_shift;
+            p->ybsep = s->plane[0].ybsep >> s->chroma_y_shift;
+        }
+
+        p->xoffset = (p->xblen - p->xbsep)/2;
+        p->yoffset = (p->yblen - p->ybsep)/2;
+    }
+}
+
+/**
+ * Unpack the motion compensation parameters
+ * Dirac Specification ->
+ * 11.2 Picture prediction data. picture_prediction()
+ *
+ * Reads, in bitstream order: the luma block size/separation (custom or one
+ * of four presets), the motion vector precision, the optional global motion
+ * parameters for each reference, the picture prediction mode (only 0 is
+ * accepted) and the optional reference picture weights.
+ *
+ * @return 0 on success, AVERROR_INVALIDDATA for invalid values,
+ *         AVERROR_PATCHWELCOME for block sizes above MAX_BLOCKSIZE
+ */
+static int dirac_unpack_prediction_parameters(DiracContext *s)
+{
+    /* block lengths of presets 1..4; the separation is 4 * index */
+    static const uint8_t default_blen[] = { 4, 12, 16, 24 };
+
+    GetBitContext *gb = &s->gb;
+    unsigned idx, ref;
+
+    align_get_bits(gb);
+    /* [DIRAC_STD] 11.2.2 Block parameters. block_parameters() */
+    /* Luma and Chroma are equal. 11.2.3 */
+    idx = svq3_get_ue_golomb(gb); /* [DIRAC_STD] index */
+
+    if (idx > 4) {
+        av_log(s->avctx, AV_LOG_ERROR, "Block prediction index too high\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    /* index 0 means all four values are coded explicitly */
+    if (idx == 0) {
+        s->plane[0].xblen = svq3_get_ue_golomb(gb);
+        s->plane[0].yblen = svq3_get_ue_golomb(gb);
+        s->plane[0].xbsep = svq3_get_ue_golomb(gb);
+        s->plane[0].ybsep = svq3_get_ue_golomb(gb);
+    } else {
+        /*[DIRAC_STD] preset_block_params(index). Table 11.1 */
+        s->plane[0].xblen = default_blen[idx-1];
+        s->plane[0].yblen = default_blen[idx-1];
+        s->plane[0].xbsep = 4 * idx;
+        s->plane[0].ybsep = 4 * idx;
+    }
+    /*[DIRAC_STD] 11.2.4 motion_data_dimensions()
+      Calculated in function dirac_unpack_block_motion_data */
+
+    /* block lengths must be non-zero and divisible by the chroma subsampling */
+    if (s->plane[0].xblen % (1 << s->chroma_x_shift) != 0 ||
+        s->plane[0].yblen % (1 << s->chroma_y_shift) != 0 ||
+        !s->plane[0].xblen || !s->plane[0].yblen) {
+        av_log(s->avctx, AV_LOG_ERROR,
+               "invalid x/y block length (%d/%d) for x/y chroma shift (%d/%d)\n",
+               s->plane[0].xblen, s->plane[0].yblen, s->chroma_x_shift, s->chroma_y_shift);
+        return AVERROR_INVALIDDATA;
+    }
+    /* the separation must lie in [blen/2, blen] and be non-zero */
+    if (!s->plane[0].xbsep || !s->plane[0].ybsep || s->plane[0].xbsep < s->plane[0].xblen/2 || s->plane[0].ybsep < s->plane[0].yblen/2) {
+        av_log(s->avctx, AV_LOG_ERROR, "Block separation too small\n");
+        return AVERROR_INVALIDDATA;
+    }
+    if (s->plane[0].xbsep > s->plane[0].xblen || s->plane[0].ybsep > s->plane[0].yblen) {
+        av_log(s->avctx, AV_LOG_ERROR, "Block separation greater than size\n");
+        return AVERROR_INVALIDDATA;
+    }
+    if (FFMAX(s->plane[0].xblen, s->plane[0].yblen) > MAX_BLOCKSIZE) {
+        av_log(s->avctx, AV_LOG_ERROR, "Unsupported large block size\n");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    /*[DIRAC_STD] 11.2.5 Motion vector precision. motion_vector_precision()
+      Read motion vector precision */
+    s->mv_precision = svq3_get_ue_golomb(gb);
+    if (s->mv_precision > 3) {
+        av_log(s->avctx, AV_LOG_ERROR, "MV precision finer than eighth-pel\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    /*[DIRAC_STD] 11.2.6 Global motion. global_motion()
+      Read the global motion compensation parameters */
+    s->globalmc_flag = get_bits1(gb);
+    if (s->globalmc_flag) {
+        /* start from all-zero parameters; each group below is optional */
+        memset(s->globalmc, 0, sizeof(s->globalmc));
+        /* [DIRAC_STD] pan_tilt(gparams) */
+        for (ref = 0; ref < s->num_refs; ref++) {
+            if (get_bits1(gb)) {
+                s->globalmc[ref].pan_tilt[0] = dirac_get_se_golomb(gb);
+                s->globalmc[ref].pan_tilt[1] = dirac_get_se_golomb(gb);
+            }
+            /* [DIRAC_STD] zoom_rotate_shear(gparams)
+               zoom/rotation/shear parameters */
+            /* NOTE(review): zrs_exp and perspective_exp are later used as
+               shift counts in global_mv() but are not range-checked here */
+            if (get_bits1(gb)) {
+                s->globalmc[ref].zrs_exp   = svq3_get_ue_golomb(gb);
+                s->globalmc[ref].zrs[0][0] = dirac_get_se_golomb(gb);
+                s->globalmc[ref].zrs[0][1] = dirac_get_se_golomb(gb);
+                s->globalmc[ref].zrs[1][0] = dirac_get_se_golomb(gb);
+                s->globalmc[ref].zrs[1][1] = dirac_get_se_golomb(gb);
+            } else {
+                /* default to the identity matrix */
+                s->globalmc[ref].zrs[0][0] = 1;
+                s->globalmc[ref].zrs[1][1] = 1;
+            }
+            /* [DIRAC_STD] perspective(gparams) */
+            if (get_bits1(gb)) {
+                s->globalmc[ref].perspective_exp = svq3_get_ue_golomb(gb);
+                s->globalmc[ref].perspective[0]  = dirac_get_se_golomb(gb);
+                s->globalmc[ref].perspective[1]  = dirac_get_se_golomb(gb);
+            }
+        }
+    }
+
+    /*[DIRAC_STD] 11.2.7 Picture prediction mode. prediction_mode()
+      Picture prediction mode, not currently used. */
+    if (svq3_get_ue_golomb(gb)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Unknown picture prediction mode\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    /* [DIRAC_STD] 11.2.8 Reference picture weight. reference_picture_weights()
+       just data read, weight calculation will be done later on. */
+    /* defaults used when no custom weights are coded */
+    s->weight_log2denom = 1;
+    s->weight[0]        = 1;
+    s->weight[1]        = 1;
+
+    /* NOTE(review): the coded weight_log2denom is stored without a range
+       check; confirm its users can cope with arbitrary values */
+    if (get_bits1(gb)) {
+        s->weight_log2denom = svq3_get_ue_golomb(gb);
+        s->weight[0] = dirac_get_se_golomb(gb);
+        if (s->num_refs == 2)
+            s->weight[1] = dirac_get_se_golomb(gb);
+    }
+    return 0;
+}
+
+/**
+ * Dirac Specification ->
+ * 11.3 Wavelet transform data. wavelet_transform()
+ *
+ * Reads the wavelet index and depth, then either the codeblock parameters
+ * (core syntax) or the slice grid, slice size parameters and quantisation
+ * matrix (low-delay syntax). Returns early with zero_res set when an inter
+ * picture carries no residual.
+ *
+ * @return 0 on success, AVERROR_INVALIDDATA on invalid values
+ */
+static int dirac_unpack_idwt_params(DiracContext *s)
+{
+    GetBitContext *gb = &s->gb;
+    int i, level;
+    unsigned tmp;
+
+/* read an unsigned golomb value into tmp, fail if cond holds, else store it */
+#define CHECKEDREAD(dst, cond, errmsg) \
+    tmp = svq3_get_ue_golomb(gb); \
+    if (cond) { \
+        av_log(s->avctx, AV_LOG_ERROR, errmsg); \
+        return AVERROR_INVALIDDATA; \
+    }\
+    dst = tmp;
+
+    align_get_bits(gb);
+
+    s->zero_res = s->num_refs ? get_bits1(gb) : 0;
+    if (s->zero_res)
+        return 0;
+
+    /*[DIRAC_STD] 11.3.1 Transform parameters. transform_parameters() */
+    CHECKEDREAD(s->wavelet_idx, tmp > 6, "wavelet_idx is too big\n")
+
+    CHECKEDREAD(s->wavelet_depth, tmp > MAX_DWT_LEVELS || tmp < 1, "invalid number of DWT decompositions\n")
+
+    if (!s->low_delay) {
+        /* Codeblock parameters (core syntax only) */
+        if (get_bits1(gb)) {
+            for (i = 0; i <= s->wavelet_depth; i++) {
+                /* a level cannot have more codeblocks than coefficients */
+                CHECKEDREAD(s->codeblock[i].width , tmp < 1 || tmp > (s->avctx->width  >> (s->wavelet_depth-i)), "codeblock width invalid\n")
+                CHECKEDREAD(s->codeblock[i].height, tmp < 1 || tmp > (s->avctx->height >> (s->wavelet_depth-i)), "codeblock height invalid\n")
+            }
+
+            CHECKEDREAD(s->codeblock_mode, tmp > 1, "unknown codeblock mode\n")
+        }
+        else {
+            for (i = 0; i <= s->wavelet_depth; i++)
+                s->codeblock[i].width = s->codeblock[i].height = 1;
+        }
+    }
+    else {
+        s->num_x        = svq3_get_ue_golomb(gb);
+        s->num_y        = svq3_get_ue_golomb(gb);
+        /* The slice grid is later used as a divisor, as a multiplier of
+           the band dimensions and as an allocation size, so reject empty
+           grids and grids large enough to overflow int arithmetic. */
+        if ((int64_t)s->num_x <= 0 || (int64_t)s->num_y <= 0 ||
+            (uint64_t)s->num_x * s->num_y         > INT_MAX ||
+            (uint64_t)s->num_x * s->avctx->width  > INT_MAX ||
+            (uint64_t)s->num_y * s->avctx->height > INT_MAX) {
+            av_log(s->avctx,AV_LOG_ERROR,"Invalid numx/y\n");
+            s->num_x = s->num_y = 0;
+            return AVERROR_INVALIDDATA;
+        }
+        if (s->ld_picture) {
+            s->lowdelay.bytes.num = svq3_get_ue_golomb(gb);
+            s->lowdelay.bytes.den = svq3_get_ue_golomb(gb);
+            if (s->lowdelay.bytes.den <= 0) {
+                av_log(s->avctx,AV_LOG_ERROR,"Invalid lowdelay.bytes.den\n");
+                return AVERROR_INVALIDDATA;
+            }
+        } else if (s->hq_picture) {
+            s->highquality.prefix_bytes = svq3_get_ue_golomb(gb);
+            s->highquality.size_scaler  = svq3_get_ue_golomb(gb);
+            /* prefix_bytes is multiplied by 8 when skipped as bits */
+            if (s->highquality.prefix_bytes >= INT_MAX / 8) {
+                av_log(s->avctx,AV_LOG_ERROR,"too many prefix bytes\n");
+                return AVERROR_INVALIDDATA;
+            }
+        }
+
+        /* [DIRAC_STD] 11.3.5 Quantisation matrices (low-delay syntax). quant_matrix() */
+        if (get_bits1(gb)) {
+            av_log(s->avctx,AV_LOG_DEBUG,"Low Delay: Has Custom Quantization Matrix!\n");
+            /* custom quantization matrix */
+            s->lowdelay.quant[0][0] = svq3_get_ue_golomb(gb);
+            for (level = 0; level < s->wavelet_depth; level++) {
+                s->lowdelay.quant[level][1] = svq3_get_ue_golomb(gb);
+                s->lowdelay.quant[level][2] = svq3_get_ue_golomb(gb);
+                s->lowdelay.quant[level][3] = svq3_get_ue_golomb(gb);
+            }
+        } else {
+            /* default matrices are only used for depths up to 4 */
+            if (s->wavelet_depth > 4) {
+                av_log(s->avctx,AV_LOG_ERROR,"Mandatory custom low delay matrix missing for depth %d\n", s->wavelet_depth);
+                return AVERROR_INVALIDDATA;
+            }
+            /* default quantization matrix */
+            for (level = 0; level < s->wavelet_depth; level++)
+                for (i = 0; i < 4; i++) {
+                    s->lowdelay.quant[level][i] = ff_dirac_default_qmat[s->wavelet_idx][level][i];
+                    /* haar with no shift differs for different depths */
+                    if (s->wavelet_idx == 3)
+                        s->lowdelay.quant[level][i] += 4*(s->wavelet_depth-1 - level);
+                }
+        }
+    }
+    return 0;
+}
+
+/**
+ * Predict the split mode of the superblock at (x, y) from its left, top
+ * and top-left neighbours.
+ *
+ * @param sbsplit pointer to the entry of the current superblock
+ * @param stride  number of entries per row
+ * @return 0 for the first superblock, the only available neighbour's value
+ *         in the first row or column, otherwise a table-rounded mean of
+ *         the three neighbours
+ */
+static inline int pred_sbsplit(uint8_t *sbsplit, int stride, int x, int y)
+{
+    /* indexed by the sum of three neighbour values */
+    static const uint8_t avgsplit[7] = { 0, 0, 1, 1, 1, 2, 2 };
+    int sum;
+
+    if (x == 0 && y == 0)
+        return 0;
+    if (y == 0)
+        return sbsplit[-1];
+    if (x == 0)
+        return sbsplit[-stride];
+
+    sum = sbsplit[-1] + sbsplit[-stride] + sbsplit[-stride-1];
+    return avgsplit[sum];
+}
+
+/**
+ * Predict the reference bit selected by refmask for the block at (x, y)
+ * from its left, top and top-left neighbours.
+ *
+ * @return 0 for the first block, the masked ref of the only available
+ *         neighbour in the first row or column, otherwise the majority of
+ *         the three neighbours' masked refs
+ */
+static inline int pred_block_mode(DiracBlock *block, int stride, int x, int y, int refmask)
+{
+    int left, top, topleft;
+
+    if (!x && !y)
+        return 0;
+    if (!y)
+        return block[-1].ref & refmask;
+    if (!x)
+        return block[-stride].ref & refmask;
+
+    left    = block[-1].ref & refmask;
+    top     = block[-stride].ref & refmask;
+    topleft = block[-stride-1].ref & refmask;
+
+    /* return the majority */
+    return ((left + top + topleft) >> 1) & refmask;
+}
+
+/**
+ * Predict the three DC values of the block at (x, y) as the rounded mean
+ * of the DC values of those left, top and top-left neighbours whose two
+ * low ref bits are clear. With no such neighbour the prediction is zero.
+ */
+static inline void pred_block_dc(DiracBlock *block, int stride, int x, int y)
+{
+    const DiracBlock *cand[3];
+    int i, k, n = 0;
+
+    memset(block->u.dc, 0, sizeof(block->u.dc));
+
+    /* collect the usable neighbours: left, top, top-left */
+    if (x && !(block[-1].ref & 3))
+        cand[n++] = &block[-1];
+    if (y && !(block[-stride].ref & 3))
+        cand[n++] = &block[-stride];
+    if (x && y && !(block[-1-stride].ref & 3))
+        cand[n++] = &block[-1-stride];
+
+    for (k = 0; k < n; k++)
+        for (i = 0; i < 3; i++)
+            block->u.dc[i] += cand[k]->u.dc[i];
+
+    /* turn the sums into means; a single neighbour is used as is */
+    if (n == 2) {
+        for (i = 0; i < 3; i++)
+            block->u.dc[i] = (block->u.dc[i]+1)>>1;
+    } else if (n == 3) {
+        for (i = 0; i < 3; i++)
+            block->u.dc[i] = divide3(block->u.dc[i]);
+    }
+}
+
+/**
+ * Predict the motion vector for reference ref of the block at (x, y) from
+ * the left, top and top-left neighbours that use the same reference and
+ * are not globally compensated: zero with no such neighbour, a copy with
+ * one, the rounded mean with two and the component-wise median with three.
+ */
+static inline void pred_mv(DiracBlock *block, int stride, int x, int y, int ref)
+{
+    const int refmask = ref+1;
+    const int mask    = refmask | DIRAC_REF_MASK_GLOBAL; /*  exclude gmc blocks */
+    int16_t *pred[3];
+    int i, n = 0;
+
+    if (x && (block[-1].ref & mask) == refmask)
+        pred[n++] = block[-1].u.mv[ref];
+
+    if (y && (block[-stride].ref & mask) == refmask)
+        pred[n++] = block[-stride].u.mv[ref];
+
+    if (x && y && (block[-stride-1].ref & mask) == refmask)
+        pred[n++] = block[-stride-1].u.mv[ref];
+
+    /* handle the x and y components alike */
+    for (i = 0; i < 2; i++) {
+        if (n == 0)
+            block->u.mv[ref][i] = 0;
+        else if (n == 1)
+            block->u.mv[ref][i] = pred[0][i];
+        else if (n == 2)
+            block->u.mv[ref][i] = (pred[0][i] + pred[1][i] + 1) >> 1;
+        else
+            block->u.mv[ref][i] = mid_pred(pred[0][i], pred[1][i], pred[2][i]);
+    }
+}
+
+/**
+ * Compute the motion vector of the block at (x, y) for reference ref from
+ * the global motion parameters (zoom/rotate/shear matrix A, pan/tilt
+ * vector b and perspective vector c) and store it in block->u.mv[ref].
+ *
+ * The parameters come from the bitstream, so all intermediate products are
+ * computed in 64 bits to avoid signed 32-bit overflow.
+ *
+ * NOTE(review): ez and ep are used as shift counts without a range check in
+ * this function; a sum of 63 or more is still undefined.
+ */
+static void global_mv(DiracContext *s, DiracBlock *block, int x, int y, int ref)
+{
+    int ez      = s->globalmc[ref].zrs_exp;
+    int ep      = s->globalmc[ref].perspective_exp;
+    int (*A)[2] = s->globalmc[ref].zrs;
+    int *b      = s->globalmc[ref].pan_tilt;
+    int *c      = s->globalmc[ref].perspective;
+
+    /* the multiplications by m are done unsigned so that they wrap
+       instead of overflowing */
+    int64_t m   = (1LL<<ep) - (c[0]*(int64_t)x + c[1]*(int64_t)y);
+    int64_t mx  = m * (uint64_t)((A[0][0]*(int64_t)x + A[0][1]*(int64_t)y) + (1LL<<ez) * b[0]);
+    int64_t my  = m * (uint64_t)((A[1][0]*(int64_t)x + A[1][1]*(int64_t)y) + (1LL<<ez) * b[1]);
+
+    block->u.mv[ref][0] = (mx + (1LL<<(ez+ep))) >> (ez+ep);
+    block->u.mv[ref][1] = (my + (1LL<<(ez+ep))) >> (ez+ep);
+}
+
+/**
+ * Decode the prediction mode of the block at (x, y) and, depending on it,
+ * either its DC values or its motion vectors.
+ *
+ * @param arith arithmetic decoders: [0] block modes, [1..3] DC values of
+ *              the three components, [4 + 2*i] and [5 + 2*i] the x and y
+ *              motion vector components for reference i
+ */
+static void decode_block_params(DiracContext *s, DiracArith arith[8], DiracBlock *block,
+                                int stride, int x, int y)
+{
+    int comp, ref;
+
+    /* each mode bit is coded as a difference to its prediction */
+    block->ref  = pred_block_mode(block, stride, x, y, DIRAC_REF_MASK_REF1);
+    block->ref ^= dirac_get_arith_bit(arith, CTX_PMODE_REF1);
+
+    if (s->num_refs == 2) {
+        block->ref |= pred_block_mode(block, stride, x, y, DIRAC_REF_MASK_REF2);
+        block->ref ^= dirac_get_arith_bit(arith, CTX_PMODE_REF2) << 1;
+    }
+
+    /* no reference used: the block carries DC values only */
+    if (!block->ref) {
+        pred_block_dc(block, stride, x, y);
+        for (comp = 0; comp < 3; comp++)
+            block->u.dc[comp] += dirac_get_arith_int(&arith[1 + comp], CTX_DC_F1, CTX_DC_DATA);
+        return;
+    }
+
+    if (s->globalmc_flag) {
+        block->ref |= pred_block_mode(block, stride, x, y, DIRAC_REF_MASK_GLOBAL);
+        block->ref ^= dirac_get_arith_bit(arith, CTX_GLOBAL_BLOCK) << 2;
+    }
+
+    for (ref = 0; ref < s->num_refs; ref++) {
+        if (!(block->ref & (ref+1)))
+            continue;
+
+        if (block->ref & DIRAC_REF_MASK_GLOBAL) {
+            /* globally compensated blocks code no vector of their own */
+            global_mv(s, block, x, y, ref);
+            continue;
+        }
+
+        pred_mv(block, stride, x, y, ref);
+        block->u.mv[ref][0] += dirac_get_arith_int(&arith[4 + 2 * ref], CTX_MV_F1, CTX_MV_DATA);
+        block->u.mv[ref][1] += dirac_get_arith_int(&arith[5 + 2 * ref], CTX_MV_F1, CTX_MV_DATA);
+    }
+}
+
+/**
+ * Copies the current block to the other blocks covered by the current
+ * superblock split mode: a size x size square whose top-left entry is block,
+ * inside an array whose rows are stride entries apart.
+ */
+static void propagate_block_data(DiracBlock *block, int stride, int size)
+{
+    int row, col;
+
+    for (row = 0; row < size; row++) {
+        DiracBlock *line = block + row * stride;
+
+        /* on the first row skip column 0, which is the source block itself */
+        for (col = !row; col < size; col++)
+            line[col] = *block;
+    }
+}
+
+/**
+ * Dirac Specification -> 12. Block motion data syntax
+ * Reads the superblock split modes, then the prediction data of every block.
+ */
+static int dirac_unpack_block_motion_data(DiracContext *s)
+{
+    GetBitContext *gb = &s->gb;
+    uint8_t *sbsplit = s->sbsplit;
+    int i, x, y, q, p;
+    DiracArith arith[8];
+
+    align_get_bits(gb);
+
+    /* [DIRAC_STD] 11.2.4 and 12.2.1 Number of blocks and superblocks */
+    s->sbwidth  = DIVRNDUP(s->seq.width,  4*s->plane[0].xbsep);
+    s->sbheight = DIVRNDUP(s->seq.height, 4*s->plane[0].ybsep);
+    s->blwidth  = 4 * s->sbwidth;
+    s->blheight = 4 * s->sbheight;
+
+    /* [DIRAC_STD] 12.3.1 Superblock splitting modes. superblock_split_modes()
+       decode superblock split modes */
+    ff_dirac_init_arith_decoder(arith, gb, svq3_get_ue_golomb(gb));     /* svq3_get_ue_golomb(gb) is the length */
+    for (y = 0; y < s->sbheight; y++) {
+        for (x = 0; x < s->sbwidth; x++) {
+            unsigned int split  = dirac_get_arith_uint(arith, CTX_SB_F1, CTX_SB_DATA);
+            if (split > 2)
+                return AVERROR_INVALIDDATA;
+            sbsplit[x] = (split + pred_sbsplit(sbsplit+x, s->sbwidth, x, y)) % 3; /* stored mode is 0, 1 or 2 */
+        }
+        sbsplit += s->sbwidth;
+    }
+
+    /* setup arith decoding: arith[0] block modes, arith[4..7] MV x/y per reference, arith[1..3] DC */
+    ff_dirac_init_arith_decoder(arith, gb, svq3_get_ue_golomb(gb));
+    for (i = 0; i < s->num_refs; i++) {
+        ff_dirac_init_arith_decoder(arith + 4 + 2 * i, gb, svq3_get_ue_golomb(gb));
+        ff_dirac_init_arith_decoder(arith + 5 + 2 * i, gb, svq3_get_ue_golomb(gb));
+    }
+    for (i = 0; i < 3; i++)
+        ff_dirac_init_arith_decoder(arith+1+i, gb, svq3_get_ue_golomb(gb));
+
+    for (y = 0; y < s->sbheight; y++)
+        for (x = 0; x < s->sbwidth; x++) {
+            int blkcnt = 1 << s->sbsplit[y * s->sbwidth + x]; /* coded blocks per superblock side: 1, 2 or 4 */
+            int step   = 4 >> s->sbsplit[y * s->sbwidth + x]; /* block-grid cells each coded block covers per side */
+
+            for (q = 0; q < blkcnt; q++)
+                for (p = 0; p < blkcnt; p++) {
+                    int bx = 4 * x + p*step;
+                    int by = 4 * y + q*step;
+                    DiracBlock *block = &s->blmotion[by*s->blwidth + bx];
+                    decode_block_params(s, arith, block, s->blwidth, bx, by);
+                    propagate_block_data(block, s->blwidth, step); /* fill the step x step cells it covers */
+                }
+        }
+
+    return 0;
+}
+
+static int weight(int i, int blen, int offset)
+{   /* 8 away from the block ends; a ramp value for samples within 2*offset of either end */
+    int d = i;                          /* offset of the sample from the end it ramps from */
+
+    if (i >= 2*offset) {
+        if (i <= blen-1 - 2*offset)
+            return 8;
+        d = blen-1 - i;
+    }
+    return offset == 1 ? (d ? 5 : 3) : 1 + (6*d + offset - 1) / (2*offset - 1);
+}
+
+static void init_obmc_weight_row(Plane *p, uint8_t *obmc_weight, int stride,
+                                 int left, int right, int wy)
+{   /* Fill one stride-wide row of weights, each scaled by the vertical weight wy */
+    const int flat_end = left ? p->xblen >> 1 : 0; /* left: first half stays flat at 8 */
+    const int ramp_end = p->xblen >> right;        /* right: second half stays flat at 8 */
+    int x = 0;
+
+    for (; x < flat_end; x++) obmc_weight[x] = wy*8;
+    for (; x < ramp_end; x++) obmc_weight[x] = wy*weight(x, p->xblen, p->xoffset);
+    for (; x < p->xblen; x++) obmc_weight[x] = wy*8;
+    /* entries between the block width and the row stride carry no weight */
+    for (; x < stride;   x++) obmc_weight[x] = 0;
+}
+
+static void init_obmc_weight(Plane *p, uint8_t *obmc_weight, int stride,
+                             int left, int right, int top, int bottom)
+{
+    /*
+     * Build a weight table of p->yblen rows, stride entries apart. Rows in
+     * the upper half of a top block and in the lower half of a bottom block
+     * use the vertical weight 8; all other rows use the value of weight().
+     */
+    const int flat_end = top ? p->yblen >> 1 : 0;
+    const int ramp_end = p->yblen >> bottom;
+    int y;
+
+    for (y = 0; y < p->yblen; y++) {
+        int wy = (y < flat_end || y >= ramp_end) ? 8 : weight(y, p->yblen, p->yoffset);
+
+        init_obmc_weight_row(p, obmc_weight + y*stride, stride, left, right, wy);
+    }
+}
+
+/* Rebuild the three weight tables (leftmost, interior, rightmost block) for block row by */
+static void init_obmc_weights(DiracContext *s, Plane *p, int by)
+{
+    const int top    = by == 0;
+    const int bottom = by == s->blheight-1;
+    int i;
+    /* rows 2 to blheight-2 keep the tables built for row 1, the weights don't change */
+    if (!top && !bottom && by != 1)
+        return;
+    for (i = 0; i < 3; i++)
+        init_obmc_weight(p, s->obmc_weight[i], MAX_BLOCKSIZE, i == 0, i == 2, top, bottom);
+}
+
+static const uint8_t epel_weights[4][4][4] = { /* looked up as [my&3][mx&3] in mc_subpel() */
+    {{ 16,  0,  0,  0 }, /* each entry holds four weights that sum to 16 */
+     { 12,  4,  0,  0 },
+     {  8,  8,  0,  0 },
+     {  4, 12,  0,  0 }},
+    {{ 12,  0,  4,  0 },
+     {  9,  3,  3,  1 },
+     {  6,  6,  2,  2 },
+     {  3,  9,  1,  3 }},
+    {{  8,  0,  8,  0 },
+     {  6,  2,  6,  2 },
+     {  4,  4,  4,  4 },
+     {  2,  6,  2,  6 }},
+    {{  4,  0, 12,  0 },
+     {  3,  1,  9,  3 },
+     {  2,  2,  6,  6 },
+     {  1,  3,  3,  9 }}
+};
+
+/**
+ * For block x,y, determine which of the hpel planes to do bilinear
+ * interpolation from and set src[] to the location in each hpel plane
+ * to MC from. For eighth-pel vectors src[4] receives the epel_weights entry.
+ *
+ * @return the index of the put_dirac_pixels_tab function to use
+ *  0 for 1 plane (fpel,hpel), 1 for 2 planes (qpel), 2 for 4 planes (qpel), and 3 for epel
+ */
+static int mc_subpel(DiracContext *s, DiracBlock *block, const uint8_t *src[5],
+                     int x, int y, int ref, int plane)
+{
+    Plane *p = &s->plane[plane];
+    uint8_t **ref_hpel = s->ref_pics[ref]->hpel[plane];
+    int motion_x = block->u.mv[ref][0];
+    int motion_y = block->u.mv[ref][1];
+    int mx, my, i, epel, nplanes = 0;
+
+    if (plane) { /* planes 1 and 2: scale the vector down by the chroma shifts */
+        motion_x >>= s->chroma_x_shift;
+        motion_y >>= s->chroma_y_shift;
+    }
+
+    mx         = motion_x & ~(-1U << s->mv_precision); /* fractional bits of the vector */
+    my         = motion_y & ~(-1U << s->mv_precision);
+    motion_x >>= s->mv_precision;                      /* whole-pixel part */
+    motion_y >>= s->mv_precision;
+    /* normalize subpel coordinates to epel */
+    /* TODO: template this function? */
+    mx      <<= 3 - s->mv_precision;
+    my      <<= 3 - s->mv_precision;
+
+    x += motion_x;
+    y += motion_y;
+    epel = (mx|my)&1; /* lowest eighth-pel bit set in either direction */
+
+    /* hpel position */
+    if (!((mx|my)&3)) {
+        nplanes = 1;
+        src[0] = ref_hpel[(my>>1)+(mx>>2)] + y*p->stride + x;
+    } else {
+        /* qpel or epel */
+        nplanes = 4;
+        for (i = 0; i < 4; i++)
+            src[i] = ref_hpel[i] + y*p->stride + x;
+
+        /* if we're interpolating in the right/bottom halves, adjust the planes as needed
+           we increment x/y because the edge changes for half of the pixels */
+        if (mx > 4) {
+            src[0] += 1;
+            src[2] += 1;
+            x++;
+        }
+        if (my > 4) {
+            src[0] += p->stride;
+            src[1] += p->stride;
+            y++;
+        }
+
+        /* hpel planes are:
+           [0]: F  [1]: H
+           [2]: V  [3]: C */
+        if (!epel) {
+            /* check if we really only need 2 planes since either mx or my is
+               a hpel position. (epel weights of 0 handle this there) */
+            if (!(mx&3)) {
+                /* mx == 0: average [0] and [2]
+                   mx == 4: average [1] and [3] */
+                src[!mx] = src[2 + !!mx];
+                nplanes = 2;
+            } else if (!(my&3)) {
+                src[0] = src[(my>>1)  ];
+                src[1] = src[(my>>1)+1];
+                nplanes = 2;
+            }
+        } else {
+            /* adjust the ordering if needed so the weights work */
+            if (mx > 4) {
+                FFSWAP(const uint8_t *, src[0], src[1]);
+                FFSWAP(const uint8_t *, src[2], src[3]);
+            }
+            if (my > 4) {
+                FFSWAP(const uint8_t *, src[0], src[2]);
+                FFSWAP(const uint8_t *, src[1], src[3]);
+            }
+            src[4] = epel_weights[my&3][mx&3]; /* fifth entry carries the four blend weights */
+        }
+    }
+
+    /* fixme: v/h _edge_pos */
+    if (x + p->xblen > p->width +EDGE_WIDTH/2 ||
+        y + p->yblen > p->height+EDGE_WIDTH/2 ||
+        x < 0 || y < 0) {
+        for (i = 0; i < nplanes; i++) { /* redirect every used plane through its edge emulation buffer */
+            s->vdsp.emulated_edge_mc(s->edge_emu_buffer[i], src[i],
+                                     p->stride, p->stride,
+                                     p->xblen, p->yblen, x, y,
+                                     p->width+EDGE_WIDTH/2, p->height+EDGE_WIDTH/2);
+            src[i] = s->edge_emu_buffer[i];
+        }
+    }
+    return (nplanes>>1) + epel; /* 1 plane -> 0, 2 -> 1, 4 -> 2, 4 with epel -> 3 */
+}
+
+static void add_dc(uint16_t *dst, int dc, int stride,
+                   uint8_t *obmc_weight, int xblen, int yblen)
+{   /* dst[] += (dc + 128) * obmc_weight[] over an xblen x yblen block */
+    const int level = dc + 128;
+    const int cols  = (xblen + 1) & ~1; /* columns are handled in pairs, so an odd xblen rounds up */
+    int row, col;
+
+    for (row = 0; row < yblen; row++) {
+        uint16_t      *out = dst         + row * stride;
+        const uint8_t *w   = obmc_weight + row * MAX_BLOCKSIZE;
+
+        for (col = 0; col < cols; col++)
+            out[col] += level * w[col];
+    }
+}
+
+static void block_mc(DiracContext *s, DiracBlock *block,
+                     uint16_t *mctmp, uint8_t *obmc_weight,
+                     int plane, int dstx, int dsty)
+{   /* Add the OBMC-weighted prediction of one block (DC or motion compensated) to mctmp. */
+    Plane *p = &s->plane[plane];
+    const uint8_t *src[5];
+    int idx;
+
+    switch (block->ref&3) { /* bit 0 selects reference 0, bit 1 selects reference 1 */
+    case 0: /* DC */
+        add_dc(mctmp, block->u.dc[plane], p->stride, obmc_weight, p->xblen, p->yblen);
+        return;
+    case 1:
+    case 2: /* a single reference, index (ref&3)-1 */
+        idx = mc_subpel(s, block, src, dstx, dsty, (block->ref&3)-1, plane);
+        s->put_pixels_tab[idx](s->mcscratch, src, p->stride, p->yblen);
+        if (s->weight_func)
+            s->weight_func(s->mcscratch, p->stride, s->weight_log2denom,
+                           s->weight[0] + s->weight[1], p->yblen);
+        break;
+    case 3: /* both references: reference 0 is written first, reference 1 is blended into it */
+        idx = mc_subpel(s, block, src, dstx, dsty, 0, plane);
+        s->put_pixels_tab[idx](s->mcscratch, src, p->stride, p->yblen);
+        idx = mc_subpel(s, block, src, dstx, dsty, 1, plane);
+        if (s->biweight_func) {
+            /* fixme: +32 is a quick hack */
+            s->put_pixels_tab[idx](s->mcscratch + 32, src, p->stride, p->yblen);
+            s->biweight_func(s->mcscratch, s->mcscratch+32, p->stride, s->weight_log2denom,
+                             s->weight[0], s->weight[1], p->yblen);
+        } else
+            s->avg_pixels_tab[idx](s->mcscratch, src, p->stride, p->yblen);
+        break;
+    }
+    s->add_obmc(mctmp, s->mcscratch, p->stride, obmc_weight, p->yblen); /* not reached for DC blocks */
+}
+
+static void mc_row(DiracContext *s, DiracBlock *block, uint16_t *mctmp, int plane, int dsty)
+{   /* Motion-compensate one row of blocks into mctmp, one block_mc() call per block */
+    const Plane *p = &s->plane[plane];
+    int last = s->blwidth - 1;
+    int x;
+
+    if (last < 1)       /* a first and a last block are always processed */
+        last = 1;
+    for (x = 0; x <= last; x++) {
+        /* weight table 0 for the leftmost block, 2 for the rightmost, 1 in between */
+        uint8_t *weights = s->obmc_weight[x == 0 ? 0 : x == last ? 2 : 1];
+        block_mc(s, block + x, mctmp + x*p->xbsep, weights, plane,
+                 x*p->xbsep - p->xoffset, dsty);
+    }
+}
+
+/*
+ * Select the DSP routines for the current block size. Only xblen is
+ * consulted; width, height and yblen are accepted but not used.
+ */
+static void select_dsp_funcs(DiracContext *s, int width, int height, int xblen, int yblen)
+{
+    /* table index: 0 while xblen <= 8, 1 up to 16, 2 for anything wider */
+    const int idx      = (xblen > 8) + (xblen > 16);
+    const int weighted = s->weight_log2denom > 1 || s->weight[0] != 1 || s->weight[1] != 1;
+
+    memcpy(s->put_pixels_tab, s->diracdsp.put_dirac_pixels_tab[idx], sizeof(s->put_pixels_tab));
+    memcpy(s->avg_pixels_tab, s->diracdsp.avg_dirac_pixels_tab[idx], sizeof(s->avg_pixels_tab));
+    s->add_obmc = s->diracdsp.add_dirac_obmc[idx];
+
+    /* the weighting functions are only installed when the reference weights
+       call for them; block_mc() tests the pointers for NULL */
+    s->weight_func   = weighted ? s->diracdsp.weight_dirac_pixels_tab[idx]   : NULL;
+    s->biweight_func = weighted ? s->diracdsp.biweight_dirac_pixels_tab[idx] : NULL;
+}
+
+static int interpolate_refplane(DiracContext *s, DiracFrame *ref, int plane, int width, int height)
+{   /* Prepare one plane of ref for MC; returns 0, or AVERROR(ENOMEM) if an hpel plane cannot be allocated */
+    /* chroma allocates an edge of 8 when subsampled
+       which for 4:2:2 means an h edge of 16 and v edge of 8
+       just use 8 for everything for the moment */
+    int i, edge = EDGE_WIDTH/2;
+
+    ref->hpel[plane][0] = ref->avframe->data[plane]; /* plane 0 of the hpel set is the decoded picture itself */
+    s->mpvencdsp.draw_edges(ref->hpel[plane][0], ref->avframe->linesize[plane], width, height, edge, edge, EDGE_TOP | EDGE_BOTTOM); /* EDGE_TOP | EDGE_BOTTOM values just copied to make it build, this needs to be ensured */
+
+    /* no need for hpel if we only have fpel vectors */
+    if (!s->mv_precision)
+        return 0;
+
+    for (i = 1; i < 4; i++) {
+        if (!ref->hpel_base[plane][i]) /* allocated on first use, then kept in hpel_base */
+            ref->hpel_base[plane][i] = av_malloc((height+2*edge) * ref->avframe->linesize[plane] + 32);
+        if (!ref->hpel_base[plane][i]) {
+            return AVERROR(ENOMEM);
+        }
+        /* we need to be 16-byte aligned even for chroma */
+        ref->hpel[plane][i] = ref->hpel_base[plane][i] + edge*ref->avframe->linesize[plane] + 16;
+    }
+
+    if (!ref->interpolated[plane]) { /* filter only once per plane and frame */
+        s->diracdsp.dirac_hpel_filter(ref->hpel[plane][1], ref->hpel[plane][2],
+                                      ref->hpel[plane][3], ref->hpel[plane][0],
+                                      ref->avframe->linesize[plane], width, height);
+        s->mpvencdsp.draw_edges(ref->hpel[plane][1], ref->avframe->linesize[plane], width, height, edge, edge, EDGE_TOP | EDGE_BOTTOM);
+        s->mpvencdsp.draw_edges(ref->hpel[plane][2], ref->avframe->linesize[plane], width, height, edge, edge, EDGE_TOP | EDGE_BOTTOM);
+        s->mpvencdsp.draw_edges(ref->hpel[plane][3], ref->avframe->linesize[plane], width, height, edge, edge, EDGE_TOP | EDGE_BOTTOM);
+    }
+    ref->interpolated[plane] = 1; /* cleared again when dirac_decode_frame() releases the frame */
+
+    return 0;
+}
+
+/**
+ * Dirac Specification -> 13.0 Transform data syntax. transform_data()
+ * Reconstructs the three planes of s->current_picture (IDWT, plus OBMC prediction for inter pictures).
+ */
+static int dirac_decode_frame_internal(DiracContext *s)
+{
+    DWTContext d;
+    int y, i, comp, dsty;
+    int ret;
+
+    if (s->low_delay) {
+        /* [DIRAC_STD] 13.5.1 low_delay_transform_data() */
+        for (comp = 0; comp < 3; comp++) {
+            Plane *p = &s->plane[comp];
+            memset(p->idwt.buf, 0, p->idwt.stride * p->idwt.height);
+        }
+        if (!s->zero_res) {
+            if ((ret = decode_lowdelay(s)) < 0)
+                return ret;
+        }
+    }
+
+    for (comp = 0; comp < 3; comp++) {
+        Plane *p       = &s->plane[comp];
+        uint8_t *frame = s->current_picture->avframe->data[comp];
+
+        /* FIXME: small resolutions */
+        for (i = 0; i < 4; i++)
+            s->edge_emu_buffer[i] = s->edge_emu_buffer_base + i*FFALIGN(p->width, 16);
+
+        if (!s->zero_res && !s->low_delay)
+        {
+            memset(p->idwt.buf, 0, p->idwt.stride * p->idwt.height);
+            decode_component(s, comp); /* [DIRAC_STD] 13.4.1 core_transform_data() */
+        }
+        ret = ff_spatial_idwt_init(&d, &p->idwt, s->wavelet_idx+2,
+                                   s->wavelet_depth, s->bit_depth);
+        if (ret < 0)
+            return ret;
+
+        if (!s->num_refs) { /* intra */
+            for (y = 0; y < p->height; y += 16) { /* output in strips of 16 rows */
+                int idx = (s->bit_depth - 8) >> 1; /* 0 for 8 bit, 1 for 10 bit, 2 for 12 bit */
+                ff_spatial_idwt_slice2(&d, y+16); /* decode */
+                s->diracdsp.put_signed_rect_clamped[idx](frame + y*p->stride,
+                                                         p->stride,
+                                                         p->idwt.buf + y*p->idwt.stride,
+                                                         p->idwt.stride, p->width, 16);
+            }
+        } else { /* inter */
+            int rowheight = p->ybsep*p->stride; /* mctmp entries between two consecutive block rows */
+
+            select_dsp_funcs(s, p->width, p->height, p->xblen, p->yblen);
+
+            for (i = 0; i < s->num_refs; i++) {
+                int ret = interpolate_refplane(s, s->ref_pics[i], comp, p->width, p->height); /* NOTE(review): shadows the outer ret */
+                if (ret < 0)
+                    return ret;
+            }
+
+            memset(s->mctmp, 0, 4*p->yoffset*p->stride);
+
+            dsty = -p->yoffset; /* the first block row starts yoffset rows above the picture */
+            for (y = 0; y < s->blheight; y++) {
+                int h     = 0,
+                    start = FFMAX(dsty, 0);
+                uint16_t *mctmp    = s->mctmp + y*rowheight;
+                DiracBlock *blocks = s->blmotion + y*s->blwidth;
+
+                init_obmc_weights(s, p, y);
+
+                if (y == s->blheight-1 || start+p->ybsep > p->height)
+                    h = p->height - start; /* last rows: everything down to the picture bottom */
+                else
+                    h = p->ybsep - (start - dsty);
+                if (h < 0)
+                    break;
+
+                memset(mctmp+2*p->yoffset*p->stride, 0, 2*rowheight);
+                mc_row(s, blocks, mctmp, comp, dsty);
+
+                mctmp += (start - dsty)*p->stride + p->xoffset;
+                ff_spatial_idwt_slice2(&d, start + h); /* decode */
+                /* NOTE: add_rect_clamped hasn't been templated hence the shifts.
+                 * idwt.stride is passed as pixels, not in bytes as in the rest of the decoder */
+                s->diracdsp.add_rect_clamped(frame + start*p->stride, mctmp, p->stride,
+                                             (int16_t*)(p->idwt.buf) + start*(p->idwt.stride >> 1), (p->idwt.stride >> 1), p->width, h);
+
+                dsty += p->ybsep;
+            }
+        }
+    }
+
+
+    return 0;
+}
+
+/* Allocate f with extra room for edges; on return f->data[] points inside the larger buffer */
+static int get_buffer_with_edge(AVCodecContext *avctx, AVFrame *f, int flags)
+{
+    int hshift, vshift, plane, err;
+
+    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &hshift, &vshift);
+    /* request a larger frame, the nominal size is restored further down */
+    f->width  = avctx->width  + 2 * EDGE_WIDTH;
+    f->height = avctx->height + 2 * EDGE_WIDTH + 2;
+    if ((err = ff_get_buffer(avctx, f, flags)) < 0)
+        return err;
+
+    for (plane = 0; f->data[plane]; plane++) {
+        /* planes 1 and 2 use the vertically subsampled edge height */
+        const int is_chroma = plane == 1 || plane == 2;
+        const int rows      = EDGE_WIDTH >> (is_chroma ? vshift : 0);
+        f->data[plane] += rows * f->linesize[plane] + 32;
+    }
+    f->width  = avctx->width;
+    f->height = avctx->height;
+    return 0;
+}
+
+/**
+ * Dirac Specification -> 11.1.1 Picture Header. picture_header()
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+static int dirac_decode_picture_header(DiracContext *s)
+{
+    unsigned retire, picnum;
+    int i, j, ret;
+    int64_t refdist, refnum;
+    GetBitContext *gb = &s->gb;
+
+    /* [DIRAC_STD] 11.1.1 Picture Header. picture_header() PICTURE_NUM */
+    picnum = s->current_picture->avframe->display_picture_number = get_bits_long(gb, 32);
+
+    av_log(s->avctx,AV_LOG_DEBUG,"PICTURE_NUM: %u\n",picnum);
+
+    /* if this is the first keyframe after a sequence header, start our
+       reordering from here */
+    if (s->frame_number < 0)
+        s->frame_number = picnum;
+
+    s->ref_pics[0] = s->ref_pics[1] = NULL;
+    for (i = 0; i < s->num_refs; i++) {
+        refnum = (picnum + dirac_get_se_golomb(gb)) & 0xFFFFFFFF;
+        refdist = INT64_MAX;
+
+        /* find the closest reference to the one we want */
+        /* Jordi: this is needed if the referenced picture hasn't yet arrived */
+        for (j = 0; j < MAX_REFERENCE_FRAMES && refdist; j++)
+            if (s->ref_frames[j]
+                && FFABS(s->ref_frames[j]->avframe->display_picture_number - refnum) < refdist) {
+                s->ref_pics[i] = s->ref_frames[j];
+                refdist = FFABS(s->ref_frames[j]->avframe->display_picture_number - refnum);
+            }
+
+        if (!s->ref_pics[i] || refdist)
+            av_log(s->avctx, AV_LOG_DEBUG, "Reference not found\n");
+
+        /* if there were no references at all, allocate one */
+        if (!s->ref_pics[i])
+            for (j = 0; j < MAX_FRAMES; j++)
+                if (!s->all_frames[j].avframe->data[0]) {
+                    s->ref_pics[i] = &s->all_frames[j];
+                    ret = get_buffer_with_edge(s->avctx, s->ref_pics[i]->avframe, AV_GET_BUFFER_FLAG_REF);
+                    if (ret < 0) /* do not carry on with a reference that has no buffer */
+                        return ret;
+                    break;
+                }
+
+        if (!s->ref_pics[i]) {
+            av_log(s->avctx, AV_LOG_ERROR, "Reference could not be allocated\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    /* retire the reference frames that are not used anymore */
+    if (s->current_picture->reference) {
+        retire = (picnum + dirac_get_se_golomb(gb)) & 0xFFFFFFFF;
+        if (retire != picnum) {
+            DiracFrame *retire_pic = remove_frame(s->ref_frames, retire);
+
+            if (retire_pic)
+                retire_pic->reference &= DELAYED_PIC_REF;
+            else
+                av_log(s->avctx, AV_LOG_DEBUG, "Frame to retire not found\n");
+        }
+
+        /* if reference array is full, remove the oldest as per the spec */
+        while (add_frame(s->ref_frames, MAX_REFERENCE_FRAMES, s->current_picture)) {
+            av_log(s->avctx, AV_LOG_ERROR, "Reference frame overflow\n");
+            remove_frame(s->ref_frames, s->ref_frames[0]->avframe->display_picture_number)->reference &= DELAYED_PIC_REF;
+        }
+    }
+
+    if (s->num_refs) {
+        ret = dirac_unpack_prediction_parameters(s);  /* [DIRAC_STD] 11.2 Picture Prediction Data. picture_prediction() */
+        if (ret < 0)
+            return ret;
+        ret = dirac_unpack_block_motion_data(s);      /* [DIRAC_STD] 12. Block motion data syntax                       */
+        if (ret < 0)
+            return ret;
+    }
+    ret = dirac_unpack_idwt_params(s);                /* [DIRAC_STD] 11.3 Wavelet transform data                        */
+    if (ret < 0)
+        return ret;
+
+    init_planes(s);
+    return 0;
+}
+
+/* Output the queued delayed frame with the lowest picture number, if there is one */
+static int get_delayed_pic(DiracContext *s, AVFrame *picture, int *got_frame)
+{
+    DiracFrame *out = s->delay_frames[0];
+    int i, ret, out_idx = 0;
+
+    /* find frame with lowest picture number */
+    for (i = 1; s->delay_frames[i]; i++)
+        if (s->delay_frames[i]->avframe->display_picture_number < out->avframe->display_picture_number) {
+            out     = s->delay_frames[i];
+            out_idx = i;
+        }
+
+    for (i = out_idx; s->delay_frames[i]; i++) /* close the gap left by the chosen frame */
+        s->delay_frames[i] = s->delay_frames[i+1];
+
+    if (out) {
+        out->reference ^= DELAYED_PIC_REF;
+        if((ret = av_frame_ref(picture, out->avframe)) < 0)
+            return ret;
+        *got_frame = 1; /* only report a frame once picture really holds one */
+    }
+
+    return 0;
+}
+
+/**
+ * Dirac Specification ->
+ * 9.6 Parse Info Header Syntax. parse_info()
+ * 4 byte start code + byte parse code + 4 byte size + 4 byte previous size
+ */
+#define DATA_UNIT_HEADER_SIZE 13
+
+/* [DIRAC_STD] dirac_decode_data_unit makes reference to the while defined in 9.3
+   inside the function parse_sequence(); handles the single data unit of size bytes at buf */
+static int dirac_decode_data_unit(AVCodecContext *avctx, const uint8_t *buf, int size)
+{
+    DiracContext *s   = avctx->priv_data;
+    DiracFrame *pic   = NULL;
+    AVDiracSeqHeader *dsh;
+    int ret, i;
+    uint8_t parse_code;
+    unsigned tmp;
+
+    if (size < DATA_UNIT_HEADER_SIZE)
+        return AVERROR_INVALIDDATA;
+
+    parse_code = buf[4]; /* the byte right after the 4 byte start code */
+
+    init_get_bits(&s->gb, &buf[13], 8*(size - DATA_UNIT_HEADER_SIZE));
+
+    if (parse_code == DIRAC_PCODE_SEQ_HEADER) {
+        if (s->seen_sequence_header) /* only the first header of a sequence is parsed */
+            return 0;
+
+        /* [DIRAC_STD] 10. Sequence header */
+        ret = av_dirac_parse_sequence_header(&dsh, buf + DATA_UNIT_HEADER_SIZE, size - DATA_UNIT_HEADER_SIZE, avctx);
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_ERROR, "error parsing sequence header\n");
+            return ret;
+        }
+
+        ret = ff_set_dimensions(avctx, dsh->width, dsh->height);
+        if (ret < 0) {
+            av_freep(&dsh);
+            return ret;
+        }
+
+        ff_set_sar(avctx, dsh->sample_aspect_ratio);
+        avctx->pix_fmt         = dsh->pix_fmt;
+        avctx->color_range     = dsh->color_range;
+        avctx->color_trc       = dsh->color_trc;
+        avctx->color_primaries = dsh->color_primaries;
+        avctx->colorspace      = dsh->colorspace;
+        avctx->profile         = dsh->profile;
+        avctx->level           = dsh->level;
+        avctx->framerate       = dsh->framerate;
+        s->bit_depth           = dsh->bit_depth;
+        s->version.major       = dsh->version.major;
+        s->version.minor       = dsh->version.minor;
+        s->seq                 = *dsh;
+        av_freep(&dsh);
+
+        s->pshift = s->bit_depth > 8;
+
+        avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_x_shift, &s->chroma_y_shift);
+
+        ret = alloc_sequence_buffers(s);
+        if (ret < 0)
+            return ret;
+
+        s->seen_sequence_header = 1;
+    } else if (parse_code == DIRAC_PCODE_END_SEQ) { /* [DIRAC_STD] End of Sequence */
+        free_sequence_buffers(s);
+        s->seen_sequence_header = 0;
+    } else if (parse_code == DIRAC_PCODE_AUX) {
+        if (buf[13] == 1) {     /* encoder implementation/version */
+            int ver[3];
+            /* versions older than 1.0.8 don't store quant delta for
+               subbands with only one codeblock */
+            if (sscanf(buf+14, "Schroedinger %d.%d.%d", ver, ver+1, ver+2) == 3)
+                if (ver[0] == 1 && ver[1] == 0 && ver[2] <= 7)
+                    s->old_delta_quant = 1;
+        }
+    } else if (parse_code & 0x8) {  /* picture data unit */
+        if (!s->seen_sequence_header) {
+            av_log(avctx, AV_LOG_DEBUG, "Dropping frame without sequence header\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        /* find an unused frame */
+        for (i = 0; i < MAX_FRAMES; i++)
+            if (s->all_frames[i].avframe->data[0] == NULL)
+                pic = &s->all_frames[i];
+        if (!pic) {
+            av_log(avctx, AV_LOG_ERROR, "framelist full\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        av_frame_unref(pic->avframe);
+
+        /* [DIRAC_STD] Defined in 9.6.1 ... */
+        tmp            =  parse_code & 0x03;                   /* [DIRAC_STD] num_refs()      */
+        if (tmp > 2) {
+            av_log(avctx, AV_LOG_ERROR, "num_refs of 3\n");
+            return AVERROR_INVALIDDATA;
+        }
+        s->num_refs      = tmp;
+        s->is_arith      = (parse_code & 0x48) == 0x08;          /* [DIRAC_STD] using_ac()            */
+        s->low_delay     = (parse_code & 0x88) == 0x88;          /* [DIRAC_STD] is_low_delay()        */
+        s->core_syntax   = (parse_code & 0x88) == 0x08;          /* [DIRAC_STD] is_core_syntax()      */
+        s->ld_picture    = (parse_code & 0xF8) == 0xC8;          /* [DIRAC_STD] is_ld_picture()       */
+        s->hq_picture    = (parse_code & 0xF8) == 0xE8;          /* [DIRAC_STD] is_hq_picture()       */
+        s->dc_prediction = (parse_code & 0x28) == 0x08;          /* [DIRAC_STD] using_dc_prediction() */
+        pic->reference   = (parse_code & 0x0C) == 0x0C;          /* [DIRAC_STD] is_reference()        */
+        pic->avframe->key_frame = s->num_refs == 0;              /* [DIRAC_STD] is_intra()            */
+        pic->avframe->pict_type = s->num_refs + 1;               /* Definition of AVPictureType in avutil.h */
+
+        /* VC-2 Low Delay has a different parse code than the Dirac Low Delay */
+        if (s->version.minor == 2 && parse_code == 0x88)
+            s->ld_picture = 1;
+
+        if (s->low_delay && !(s->ld_picture || s->hq_picture) ) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid low delay flag\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        if ((ret = get_buffer_with_edge(avctx, pic->avframe, (parse_code & 0x0C) == 0x0C ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
+            return ret;
+        s->current_picture = pic;
+        s->plane[0].stride = pic->avframe->linesize[0];
+        s->plane[1].stride = pic->avframe->linesize[1];
+        s->plane[2].stride = pic->avframe->linesize[2];
+
+        if (alloc_buffers(s, FFMAX3(FFABS(s->plane[0].stride), FFABS(s->plane[1].stride), FFABS(s->plane[2].stride))) < 0)
+            return AVERROR(ENOMEM);
+
+        /* [DIRAC_STD] 11.1 Picture parse. picture_parse() */
+        ret = dirac_decode_picture_header(s);
+        if (ret < 0)
+            return ret;
+
+        /* [DIRAC_STD] 13.0 Transform data syntax. transform_data() */
+        ret = dirac_decode_frame_internal(s);
+        if (ret < 0)
+            return ret;
+    }
+    return 0;
+}
+
+/* Decode every data unit in pkt and output at most one picture; an empty packet flushes delayed pictures */
+static int dirac_decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPacket *pkt)
+{
+    DiracContext *s     = avctx->priv_data;
+    AVFrame *picture    = data;
+    uint8_t *buf        = pkt->data;
+    int buf_size        = pkt->size;
+    int i, ret, buf_idx = 0;
+    unsigned data_unit_size;
+
+    /* release unused frames */
+    for (i = 0; i < MAX_FRAMES; i++)
+        if (s->all_frames[i].avframe->data[0] && !s->all_frames[i].reference) {
+            av_frame_unref(s->all_frames[i].avframe);
+            memset(s->all_frames[i].interpolated, 0, sizeof(s->all_frames[i].interpolated));
+        }
+
+    s->current_picture = NULL;
+    *got_frame = 0;
+
+    /* end of stream, so flush delayed pics */
+    if (buf_size == 0)
+        return get_delayed_pic(s, (AVFrame *)data, got_frame);
+
+    for (;;) {
+        /*[DIRAC_STD] Here starts the code from parse_info() defined in 9.6
+          [DIRAC_STD] PARSE_INFO_PREFIX = "BBCD" as defined in ISO/IEC 646
+          BBCD start code search */
+        for (; buf_idx + DATA_UNIT_HEADER_SIZE < buf_size; buf_idx++) {
+            if (buf[buf_idx  ] == 'B' && buf[buf_idx+1] == 'B' &&
+                buf[buf_idx+2] == 'C' && buf[buf_idx+3] == 'D')
+                break;
+        }
+        /* BBCD found or end of data */
+        if (buf_idx + DATA_UNIT_HEADER_SIZE >= buf_size)
+            break;
+
+        data_unit_size = AV_RB32(buf+buf_idx+5);
+        if (data_unit_size > buf_size - buf_idx || !data_unit_size) {
+            if(data_unit_size > buf_size - buf_idx)
+                av_log(s->avctx, AV_LOG_ERROR,
+                       "Data unit with size %u is larger than input buffer, discarding\n",
+                       data_unit_size);
+            buf_idx += 4; /* step over this start code and search for the next one */
+            continue;
+        }
+        /* [DIRAC_STD] dirac_decode_data_unit makes reference to the while defined in 9.3 inside the function parse_sequence() */
+        ret = dirac_decode_data_unit(avctx, buf+buf_idx, data_unit_size);
+        if (ret < 0)
+        {
+            av_log(s->avctx, AV_LOG_ERROR,"Error in dirac_decode_data_unit\n");
+            return ret;
+        }
+        buf_idx += data_unit_size;
+    }
+
+    if (!s->current_picture) /* the packet held no picture data unit */
+        return buf_size;
+
+    if (s->current_picture->avframe->display_picture_number > s->frame_number) {
+        DiracFrame *delayed_frame = remove_frame(s->delay_frames, s->frame_number);
+
+        s->current_picture->reference |= DELAYED_PIC_REF;
+
+        if (add_frame(s->delay_frames, MAX_DELAY, s->current_picture)) {
+            int min_num = s->delay_frames[0]->avframe->display_picture_number;
+            /* Too many delayed frames, so we display the frame with the lowest pts */
+            av_log(avctx, AV_LOG_ERROR, "Delay frame overflow\n");
+
+            for (i = 1; s->delay_frames[i]; i++)
+                if (s->delay_frames[i]->avframe->display_picture_number < min_num)
+                    min_num = s->delay_frames[i]->avframe->display_picture_number;
+
+            delayed_frame = remove_frame(s->delay_frames, min_num);
+            add_frame(s->delay_frames, MAX_DELAY, s->current_picture);
+        }
+
+        if (delayed_frame) {
+            delayed_frame->reference ^= DELAYED_PIC_REF;
+            if((ret=av_frame_ref(data, delayed_frame->avframe)) < 0)
+                return ret;
+            *got_frame = 1;
+        }
+    } else if (s->current_picture->avframe->display_picture_number == s->frame_number) {
+        /* The right frame at the right time :-) */
+        if((ret=av_frame_ref(data, s->current_picture->avframe)) < 0)
+            return ret;
+        *got_frame = 1;
+    }
+
+    if (*got_frame) /* the next picture expected for output follows the one just returned */
+        s->frame_number = picture->display_picture_number + 1;
+
+    return buf_idx;
+}
+
+AVCodec ff_dirac_decoder = { /* codec table entry that wires up the decoder callbacks of this file */
+    .name           = "dirac",
+    .long_name      = NULL_IF_CONFIG_SMALL("BBC Dirac VC-2"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_DIRAC,
+    .priv_data_size = sizeof(DiracContext),
+    .init           = dirac_decode_init,
+    .close          = dirac_decode_end,
+    .decode         = dirac_decode_frame, /* an empty packet makes it output the delayed pictures */
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_DR1,
+    .flush          = dirac_decode_flush,
+};
diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c
new file mode 100644
index 0000000..ab8d149
--- /dev/null
+++ b/libavcodec/diracdsp.c
@@ -0,0 +1,226 @@
+/*
+ * Copyright (C) 2009 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "diracdsp.h"
+
+#define FILTER(src, stride) /* symmetric 8-tap filter; taps sum to 32, hence +16 and >> 5 */ \
+    ((21*((src)[ 0*(stride)] + (src)[1*(stride)])               \
+      -7*((src)[-1*(stride)] + (src)[2*(stride)])               \
+      +3*((src)[-2*(stride)] + (src)[3*(stride)])               \
+      -1*((src)[-3*(stride)] + (src)[4*(stride)]) + 16) >> 5)
+
+static void dirac_hpel_filter(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, const uint8_t *src,
+                              int stride, int width, int height)
+{   /* Fills three FILTER()ed planes from src: dsth (along the row), dstv (across rows), dstc (both). */
+    int x, y;
+    /* Reads src rows y-3..y+4 and columns -3..width+4; writes dstv at columns -3..width+4. */
+    for (y = 0; y < height; y++) {
+        for (x = -3; x < width+5; x++) /* wider than the row: the dstc pass reads dstv[x-3..x+4] */
+            dstv[x] = av_clip_uint8(FILTER(src+x, stride));
+        /* dstc: horizontal pass over the vertically filtered row */
+        for (x = 0; x < width; x++)
+            dstc[x] = av_clip_uint8(FILTER(dstv+x, 1));
+        /* dsth: horizontal pass over the source row */
+        for (x = 0; x < width; x++)
+            dsth[x] = av_clip_uint8(FILTER(src+x, 1));
+        /* all four planes advance by the same stride */
+        src  += stride;
+        dsth += stride;
+        dstv += stride;
+        dstc += stride;
+    }
+}
+
+#define PIXOP_BILINEAR(PFX, OP, WIDTH)                                  \
+    static void ff_ ## PFX ## _dirac_pixels ## WIDTH ## _bilinear_c(uint8_t *dst, const uint8_t *src[5], int stride, int h) \
+    {   /* weighted sum of planes src[0..3] using weights src[4][0..3] */ \
+        int x;                                                          \
+        const uint8_t *s0 = src[0];                                     \
+        const uint8_t *s1 = src[1];                                     \
+        const uint8_t *s2 = src[2];                                     \
+        const uint8_t *s3 = src[3];                                     \
+        const uint8_t *w  = src[4];                                     \
+        /* w is never advanced: the same four weights apply to every row */ \
+        while (h--) {                                                   \
+            for (x = 0; x < WIDTH; x++) {                               \
+                OP(dst[x], (s0[x]*w[0] + s1[x]*w[1] + s2[x]*w[2] + s3[x]*w[3] + 8) >> 4); \
+            }                                                           \
+            /* dst and all four source planes share one stride */      \
+            dst += stride;                                              \
+            s0 += stride;                                               \
+            s1 += stride;                                               \
+            s2 += stride;                                               \
+            s3 += stride;                                               \
+        }                                                               \
+    }
+/* OP_PUT stores val; OP_AVG stores the rounded-up mean of val and the previous dst. */
+#define OP_PUT(dst, val) (dst) = (val)
+#define OP_AVG(dst, val) (dst) = (((dst) + (val) + 1)>>1)
+/* Instantiate ff_{put,avg}_dirac_pixels{8,16,32}_bilinear_c. */
+PIXOP_BILINEAR(put, OP_PUT, 8)
+PIXOP_BILINEAR(put, OP_PUT, 16)
+PIXOP_BILINEAR(put, OP_PUT, 32)
+PIXOP_BILINEAR(avg, OP_AVG, 8)
+PIXOP_BILINEAR(avg, OP_AVG, 16)
+PIXOP_BILINEAR(avg, OP_AVG, 32)
+
+#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + (1<<(log2_denom-1))) >> log2_denom)
+#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + (1<<(log2_denom-1))) >> log2_denom)
+/* Emits weight_/biweight_dirac_pixelsW_c for a W-wide, h-row block; log2_denom must be >= 1 (rounding term shifts by log2_denom-1). */
+#define DIRAC_WEIGHT(W)                                                 \
+    static void weight_dirac_pixels ## W ## _c(uint8_t *block, int stride, int log2_denom, \
+                                               int weight, int h) {     \
+        int x; /* in place: block[x] = clip((block[x]*weight + round) >> log2_denom) */ \
+        while (h--) {                                                   \
+            for (x = 0; x < W; x += 2) { /* step matches the 2x unroll: each pixel once, none past W-1 */ \
+                op_scale1(x);                                           \
+                op_scale1(x+1);                                         \
+            }                                                           \
+            block += stride;                                            \
+        }                                                               \
+    }                                                                   \
+    static void biweight_dirac_pixels ## W ## _c(uint8_t *dst, const uint8_t *src, int stride, int log2_denom, \
+                                                 int weightd, int weights, int h) { \
+        int x; /* dst[x] = clip((src[x]*weights + dst[x]*weightd + round) >> log2_denom) */ \
+        while (h--) {                                                   \
+            for (x = 0; x < W; x += 2) { /* step matches the 2x unroll: each pixel once, none past W-1 */ \
+                op_scale2(x);                                           \
+                op_scale2(x+1);                                         \
+            }                                                           \
+            dst += stride;                                              \
+            src += stride;                                              \
+        }                                                               \
+    }
+
+DIRAC_WEIGHT(8)
+DIRAC_WEIGHT(16)
+DIRAC_WEIGHT(32)
+
+#define ADD_OBMC(xblen)                                                 \
+    static void add_obmc ## xblen ## _c(uint16_t *dst, const uint8_t *src, int stride, \
+                                        const uint8_t *obmc_weight, int yblen) \
+    {   /* dst += src * obmc_weight over an xblen x yblen block */      \
+        int x;                                                          \
+        while (yblen--) {                                               \
+            for (x = 0; x < xblen; x += 2) { /* two pixels per step */  \
+                dst[x  ] += src[x  ] * obmc_weight[x  ];                \
+                dst[x+1] += src[x+1] * obmc_weight[x+1];                \
+            }                                                           \
+            dst += stride; /* dst (uint16_t) and src (uint8_t) share one element stride */ \
+            src += stride;                                              \
+            obmc_weight += 32; /* weight rows are 32 entries apart for every xblen */ \
+        }                                                               \
+    }
+
+ADD_OBMC(8)
+ADD_OBMC(16)
+ADD_OBMC(32)
+
+static void put_signed_rect_clamped_8bit_c(uint8_t *dst, int dst_stride, const uint8_t *_src, int src_stride, int width, int height)
+{   /* Stores signed 16-bit samples as 8-bit pixels: add 128, then clip with av_clip_uint8. */
+    int x, y;
+    const int16_t *src = (const int16_t *)_src; /* read-only view; keeps the caller's const */
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x+=4) { /* 4 pixels per step: touches columns up to width rounded up to 4 */
+            dst[x  ] = av_clip_uint8(src[x  ] + 128);
+            dst[x+1] = av_clip_uint8(src[x+1] + 128);
+            dst[x+2] = av_clip_uint8(src[x+2] + 128);
+            dst[x+3] = av_clip_uint8(src[x+3] + 128);
+        }
+        dst += dst_stride;
+        src += src_stride >> 1; /* src_stride is in bytes; convert to int16_t elements */
+    }
+}
+
+#define PUT_SIGNED_RECT_CLAMPED(PX)                                                                     \
+static void put_signed_rect_clamped_ ## PX ## bit_c(uint8_t *_dst, int dst_stride, const uint8_t *_src, \
+                                                  int src_stride, int width, int height)                \
+{   /* Stores signed 32-bit samples as 16-bit words: add 2^(PX-1), then av_clip_uintp2(..., PX). */     \
+    int x, y;                                                                                           \
+    uint16_t *dst = (uint16_t *)_dst;                                                                   \
+    const int32_t *src = (const int32_t *)_src; /* read-only view; keeps the caller's const */          \
+    for (y = 0; y < height; y++) {                                                                      \
+        for (x = 0; x < width; x+=4) { /* 4 pixels per step: touches columns up to width rounded up to 4 */ \
+            dst[x  ] = av_clip_uintp2(src[x  ] + (1 << (PX - 1)), PX);                                  \
+            dst[x+1] = av_clip_uintp2(src[x+1] + (1 << (PX - 1)), PX);                                  \
+            dst[x+2] = av_clip_uintp2(src[x+2] + (1 << (PX - 1)), PX);                                  \
+            dst[x+3] = av_clip_uintp2(src[x+3] + (1 << (PX - 1)), PX);                                  \
+        }                                                                                               \
+        dst += dst_stride >> 1; /* both strides are in bytes; convert to element counts */              \
+        src += src_stride >> 2;                                                                         \
+    }                                                                                                   \
+}
+
+PUT_SIGNED_RECT_CLAMPED(10)
+PUT_SIGNED_RECT_CLAMPED(12)
+
+static void add_rect_clamped_c(uint8_t *dst, const uint16_t *src, int stride,
+                               const int16_t *idwt, int idwt_stride,
+                               int width, int height)
+{
+    int x, y;
+    /* dst = clip(((src + 32) >> 6) + idwt): src is divided by 64 with rounding before idwt is added. */
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x+=2) { /* two pixels per step */
+            dst[x  ] = av_clip_uint8(((src[x  ]+32)>>6) + idwt[x  ]);
+            dst[x+1] = av_clip_uint8(((src[x+1]+32)>>6) + idwt[x+1]);
+        }
+        dst += stride; /* dst and src advance by the same element stride */
+        src += stride;
+        idwt += idwt_stride; /* idwt has its own stride, in int16_t elements */
+    }
+}
+
+#define PIXFUNC(PFX, WIDTH)                                             \
+    c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _c; \
+    c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _l2_c; \
+    c->PFX ## _dirac_pixels_tab[WIDTH>>4][2] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _l4_c; \
+    c->PFX ## _dirac_pixels_tab[WIDTH>>4][3] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _bilinear_c
+/* PIXFUNC fills one table row: WIDTH>>4 maps 8/16/32 to row 0/1/2. It expands to four statements. */
+av_cold void ff_diracdsp_init(DiracDSPContext *c)
+{
+    c->dirac_hpel_filter = dirac_hpel_filter;
+    c->add_rect_clamped = add_rect_clamped_c;
+    c->put_signed_rect_clamped[0] = put_signed_rect_clamped_8bit_c; /* [0]/[1]/[2]: 8-, 10-, 12-bit variants */
+    c->put_signed_rect_clamped[1] = put_signed_rect_clamped_10bit_c;
+    c->put_signed_rect_clamped[2] = put_signed_rect_clamped_12bit_c;
+    /* NOTE(review): DiracDSPContext.put_rect_clamped is not assigned in this function - confirm that is intended. */
+    c->add_dirac_obmc[0] = add_obmc8_c; /* [0]/[1]/[2]: block widths 8, 16, 32 */
+    c->add_dirac_obmc[1] = add_obmc16_c;
+    c->add_dirac_obmc[2] = add_obmc32_c;
+    /* weight/biweight tables use the same 8/16/32 width indexing */
+    c->weight_dirac_pixels_tab[0] = weight_dirac_pixels8_c;
+    c->weight_dirac_pixels_tab[1] = weight_dirac_pixels16_c;
+    c->weight_dirac_pixels_tab[2] = weight_dirac_pixels32_c;
+    c->biweight_dirac_pixels_tab[0] = biweight_dirac_pixels8_c;
+    c->biweight_dirac_pixels_tab[1] = biweight_dirac_pixels16_c;
+    c->biweight_dirac_pixels_tab[2] = biweight_dirac_pixels32_c;
+    /* put_/avg_dirac_pixels_tab rows for widths 8, 16, 32 */
+    PIXFUNC(put, 8);
+    PIXFUNC(put, 16);
+    PIXFUNC(put, 32);
+    PIXFUNC(avg, 8);
+    PIXFUNC(avg, 16);
+    PIXFUNC(avg, 32);
+    /* runs after the C defaults are set; presumably replaces some of them with optimized versions */
+    if (ARCH_X86)
+        ff_diracdsp_init_x86(c);
+}
diff --git a/libavcodec/diracdsp.h b/libavcodec/diracdsp.h
new file mode 100644
index 0000000..25a872d
--- /dev/null
+++ b/libavcodec/diracdsp.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DIRACDSP_H
+#define AVCODEC_DIRACDSP_H
+
+#include <stdint.h>
+
+typedef void (*dirac_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int h);
+typedef void (*dirac_biweight_func)(uint8_t *dst, const uint8_t *src, int stride, int log2_denom, int weightd, int weights, int h);
+
+typedef struct {
+    void (*dirac_hpel_filter)(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, const uint8_t *src, int stride, int width, int height);
+    /**
+     * dirac_pixels_tab[width][subpel]
+     * width is 2 for 32, 1 for 16, 0 for 8
+     * subpel is 0 for fpel and hpel (only need to copy from the first plane in src)
+     *           1 if an average of the first 2 planes is needed (TODO: worth it?)
+     *           2 for general qpel (avg of 4)
+     *           3 for general epel (biweight of 4 using the weights in src[4])
+     * src[0-3] is each of the hpel planes
+     * src[4] is the 1/8 pel weights if needed
+     */
+    void (*put_dirac_pixels_tab[3][4])(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+    void (*avg_dirac_pixels_tab[3][4])(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+
+    void (*put_signed_rect_clamped[3])(uint8_t *dst/*align 16*/, int dst_stride, const uint8_t *src/*align 16*/, int src_stride, int width, int height/*mod 2*/);
+    void (*put_rect_clamped)(uint8_t *dst/*align 16*/, int dst_stride, const uint8_t *src/*align 16*/, int src_stride, int width, int height/*mod 2*/);
+    void (*add_rect_clamped)(uint8_t *dst/*align 16*/, const uint16_t *src/*align 16*/, int stride, const int16_t *idwt/*align 16*/, int idwt_stride, int width, int height/*mod 2*/);
+    void (*add_dirac_obmc[3])(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+
+    dirac_weight_func weight_dirac_pixels_tab[3];
+    dirac_biweight_func biweight_dirac_pixels_tab[3];
+} DiracDSPContext;
+
+#define DECL_DIRAC_PIXOP(PFX, EXT)                                      \
+    void ff_ ## PFX ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h); \
+    void ff_ ## PFX ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h); \
+    void ff_ ## PFX ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+
+DECL_DIRAC_PIXOP(put, c);
+DECL_DIRAC_PIXOP(avg, c);
+DECL_DIRAC_PIXOP(put, l2_c);
+DECL_DIRAC_PIXOP(avg, l2_c);
+DECL_DIRAC_PIXOP(put, l4_c);
+DECL_DIRAC_PIXOP(avg, l4_c);
+
+void ff_diracdsp_init(DiracDSPContext *c);
+void ff_diracdsp_init_x86(DiracDSPContext* c);
+
+#endif /* AVCODEC_DIRACDSP_H */
diff --git a/libavcodec/diractab.c b/libavcodec/diractab.c
new file mode 100644
index 0000000..816b939
--- /dev/null
+++ b/libavcodec/diractab.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (C) 2016 Open Broadcast Systems Ltd.
+ * Author    (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "diractab.h"
+
+const uint8_t ff_dirac_default_qmat[7][4][4] = {
+    { { 5,  3,  3,  0}, { 0,  4,  4,  1}, { 0,  5,  5,  2}, { 0,  6,  6,  3} },
+    { { 4,  2,  2,  0}, { 0,  4,  4,  2}, { 0,  5,  5,  3}, { 0,  7,  7,  5} },
+    { { 5,  3,  3,  0}, { 0,  4,  4,  1}, { 0,  5,  5,  2}, { 0,  6,  6,  3} },
+    { { 8,  4,  4,  0}, { 0,  4,  4,  0}, { 0,  4,  4,  0}, { 0,  4,  4,  0} },
+    { { 8,  4,  4,  0}, { 0,  4,  4,  0}, { 0,  4,  4,  0}, { 0,  4,  4,  0} },
+    { { 0,  4,  4,  8}, { 0,  8,  8, 12}, { 0, 13, 13, 17}, { 0, 17, 17, 21} },
+    { { 3,  1,  1,  0}, { 0,  4,  4,  2}, { 0,  6,  6,  5}, { 0,  9,  9,  7} },
+};
+
+const int32_t ff_dirac_qscale_tab[116] = {
+    4,         5,         6,         7,         8,        10,        11,        13,
+    16,        19,        23,        27,        32,        38,        45,        54,
+    64,        76,        91,       108,       128,       152,       181,       215,
+    256,       304,       362,       431,       512,       609,       724,       861,
+    1024,      1218,      1448,      1722,      2048,      2435,      2896,      3444,
+    4096,      4871,      5793,      6889,      8192,      9742,     11585,     13777,
+    16384,     19484,     23170,     27554,     32768,     38968,     46341,     55109,
+    65536,     77936,     92682,    110218,    131072,    155872,    185364,    220436,
+    262144,    311744,    370728,    440872,    524288,    623487,    741455,    881744,
+    1048576,   1246974,   1482910,   1763488,   2097152,   2493948,   2965821,   3526975,
+    4194304,   4987896,   5931642,   7053950,   8388608,   9975792,  11863283,  14107901,
+    16777216,  19951585,  23726566,  28215802,  33554432,  39903169,  47453133,  56431603,
+    67108864,  79806339,  94906266, 112863206, 134217728, 159612677, 189812531, 225726413,
+    268435456, 319225354, 379625062, 451452825, 536870912, 638450708, 759250125, 902905651,
+    1073741824,1276901417,1518500250,1805811301,/*2147483648,2553802834,3037000500,3611622603,
+    4294967296*/
+};
+
+const int32_t ff_dirac_qoffset_intra_tab[120] = {
+    1,         2,         3,         4,         4,         5,         6,         7,
+    8,        10,        12,        14,        16,        19,        23,        27,
+    32,        38,        46,        54,        64,        76,        91,       108,
+    128,       152,       181,       216,       256,       305,       362,       431,
+    512,       609,       724,       861,      1024,      1218,      1448,      1722,
+    2048,      2436,      2897,      3445,      4096,      4871,      5793,      6889,
+    8192,      9742,     11585,     13777,     16384,     19484,     23171,     27555,
+    32768,     38968,     46341,     55109,     65536,     77936,     92682,    110218,
+    131072,    155872,    185364,    220436,    262144,    311744,    370728,    440872,
+    524288,    623487,    741455,    881744,   1048576,   1246974,   1482911,   1763488,
+    2097152,   2493948,   2965821,   3526975,   4194304,   4987896,   5931642,   7053951,
+    8388608,   9975793,  11863283,  14107901,  16777216,  19951585,  23726567,  28215802,
+    33554432,  39903170,  47453133,  56431603,  67108864,  79806339,  94906266, 112863207,
+    134217728, 159612677, 189812531, 225726413, 268435456, 319225354, 379625063, 451452826,
+    536870912, 638450709, 759250125, 902905651,1073741824,1276901417,1518500250,1805811302,
+    /*2147483648, 2553802834, 3037000500, 3611622603, 4294967296,*/
+};
+
+const int ff_dirac_qoffset_inter_tab[122] = {
+    1,         2,         2,         3,         3,         4,         4,         5,
+    6,         7,         9,        10,        12,        14,        17,        20,
+    24,        29,        34,        41,        48,        57,        68,        81,
+    96,       114,       136,       162,       192,       228,       272,       323,
+    384,       457,       543,       646,       768,       913,      1086,      1292,
+    1536,      1827,      2172,      2583,      3072,      3653,      4344,      5166,
+    6144,      7307,      8689,     10333,     12288,     14613,     17378,     20666,
+    24576,     29226,     34756,     41332,     49152,     58452,     69512,     82664,
+    98304,    116904,    139023,    165327,    196608,    233808,    278046,    330654,
+    393216,    467615,    556091,    661308,    786432,    935231,   1112183,   1322616,
+    1572864,   1870461,   2224366,   2645231,   3145728,   3740922,   4448731,   5290463,
+    6291456,   7481844,   8897462,  10580926,  12582912,  14963688,  17794925,  21161851,
+    25165824,  29927377,  35589850,  42323702,  50331648,  59854754,  71179699,  84647405,
+    100663296, 119709508, 142359398, 169294809, 201326592, 239419016, 284718797, 338589619,
+    402653184, 478838031, 569437594, 677179238, 805306368, 957676063,1138875188,1354358476,
+    1610612736, 1915352125, /*2277750375, 2708716952, 3221225472, 3830704250,*/
+};
diff --git a/libavcodec/diractab.h b/libavcodec/diractab.h
new file mode 100644
index 0000000..cd8b8ac
--- /dev/null
+++ b/libavcodec/diractab.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2016 Open Broadcast Systems Ltd.
+ * Author    (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DIRACTAB_H
+#define AVCODEC_DIRACTAB_H
+
+#include <stdint.h>
+
+/* Tables here are shared between the Dirac/VC-2 decoder and the VC-2 encoder */
+
+/* Default quantization tables for each wavelet transform */
+extern const uint8_t ff_dirac_default_qmat[7][4][4];
+
+/* Scaling factors needed for quantization/dequantization */
+extern const int32_t ff_dirac_qscale_tab[116];
+
+/* Scaling offsets needed for quantization/dequantization, for intra frames */
+extern const int32_t ff_dirac_qoffset_intra_tab[120];
+
+/* Scaling offsets needed for quantization/dequantization, for inter frames */
+extern const int ff_dirac_qoffset_inter_tab[122];
+
+#endif /* AVCODEC_DIRACTAB_H */
diff --git a/libavcodec/dnxhd_parser.c b/libavcodec/dnxhd_parser.c
index 0de3561..033b8ee 100644
--- a/libavcodec/dnxhd_parser.c
+++ b/libavcodec/dnxhd_parser.c
@@ -2,20 +2,20 @@
  * DNxHD/VC-3 parser
  * Copyright (c) 2008 Baptiste Coudurier <baptiste.coudurier@free.fr>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,22 +25,32 @@
  */
 
 #include "parser.h"
+#include "dnxhddata.h"
 
-#define DNXHD_HEADER_PREFIX 0x0000028001
+typedef struct {
+    ParseContext pc;
+    int interlaced;
+    int cur_field; /* first field is 0, second is 1 */
+} DNXHDParserContext;
 
-static int dnxhd_find_frame_end(ParseContext *pc,
+static int dnxhd_find_frame_end(DNXHDParserContext *dctx,
                                 const uint8_t *buf, int buf_size)
 {
+    ParseContext *pc = &dctx->pc;
     uint64_t state = pc->state64;
     int pic_found = pc->frame_start_found;
     int i = 0;
+    int interlaced = dctx->interlaced;
+    int cur_field = dctx->cur_field;
 
     if (!pic_found) {
         for (i = 0; i < buf_size; i++) {
             state = (state << 8) | buf[i];
-            if ((state & 0xffffffffffLL) == DNXHD_HEADER_PREFIX) {
+            if (ff_dnxhd_check_header_prefix(state & 0xffffffffff00LL) != 0) {
                 i++;
                 pic_found = 1;
+                interlaced = (state&2)>>1; /* byte following the 5-byte header prefix */
+                cur_field = state&1;
                 break;
             }
         }
@@ -51,15 +61,25 @@ static int dnxhd_find_frame_end(ParseContext *pc,
             return 0;
         for (; i < buf_size; i++) {
             state = (state << 8) | buf[i];
-            if ((state & 0xffffffffffLL) == DNXHD_HEADER_PREFIX) {
-                pc->frame_start_found = 0;
-                pc->state64 = -1;
-                return i - 4;
+            if (ff_dnxhd_check_header_prefix(state & 0xffffffffff00LL) != 0) {
+                if (!interlaced || dctx->cur_field) {
+                    pc->frame_start_found = 0;
+                    pc->state64 = -1;
+                    dctx->interlaced = interlaced;
+                    dctx->cur_field = 0;
+                    return i - 5;
+                } else {
+                    /* continue, to get the second field */
+                    dctx->interlaced = interlaced = (state&2)>>1;
+                    dctx->cur_field = cur_field = state&1;
+                }
             }
         }
     }
     pc->frame_start_found = pic_found;
     pc->state64 = state;
+    dctx->interlaced = interlaced;
+    dctx->cur_field = cur_field;
     return END_NOT_FOUND;
 }
 
@@ -68,13 +88,14 @@ static int dnxhd_parse(AVCodecParserContext *s,
                        const uint8_t **poutbuf, int *poutbuf_size,
                        const uint8_t *buf, int buf_size)
 {
-    ParseContext *pc = s->priv_data;
+    DNXHDParserContext *dctx = s->priv_data;
+    ParseContext *pc = &dctx->pc;
     int next;
 
     if (s->flags & PARSER_FLAG_COMPLETE_FRAMES) {
         next = buf_size;
     } else {
-        next = dnxhd_find_frame_end(pc, buf, buf_size);
+        next = dnxhd_find_frame_end(dctx, buf, buf_size);
         if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
             *poutbuf      = NULL;
             *poutbuf_size = 0;
@@ -88,7 +109,7 @@ static int dnxhd_parse(AVCodecParserContext *s,
 
 AVCodecParser ff_dnxhd_parser = {
     .codec_ids      = { AV_CODEC_ID_DNXHD },
-    .priv_data_size = sizeof(ParseContext),
+    .priv_data_size = sizeof(DNXHDParserContext),
     .parser_parse   = dnxhd_parse,
     .parser_close   = ff_parse_close,
 };
diff --git a/libavcodec/dnxhddata.c b/libavcodec/dnxhddata.c
index 55272e9..7d935a3 100644
--- a/libavcodec/dnxhddata.c
+++ b/libavcodec/dnxhddata.c
@@ -2,30 +2,31 @@
  * VC3/DNxHD data.
  * Copyright (c) 2007 SmartJog S.A., Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "avcodec.h"
 #include "dnxhddata.h"
 #include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
 
 /* The quantization tables below are in zigzag order! */
 
-/* Used in CID 1235, 1256 */
+/* Used in CID 1235, 1256, 1270 */
 static const uint8_t dnxhd_1235_luma_weight[] = {
      0, 32, 32, 32, 33, 32, 32, 32,
     32, 31, 32, 33, 33, 33, 33, 35,
@@ -49,7 +50,7 @@ static const uint8_t dnxhd_1235_chroma_weight[] = {
     90, 90, 85, 79, 73, 73, 73, 73,
 };
 
-/* Used in CID 1237, 1253, 1259 */
+/* Used in CID 1237, 1253, 1259, 1273, 1274 */
 static const uint8_t dnxhd_1237_luma_weight[] = {
      0,  32,  33,  34, 34, 36, 37, 36,
     36,  37,  38,  38, 38, 39, 41, 44,
@@ -61,7 +62,7 @@ static const uint8_t dnxhd_1237_luma_weight[] = {
     97, 100, 104, 102, 98, 98, 99, 99,
 };
 
-/* Used in CID 1237, 1253, 1259 */
+/* Used in CID 1237, 1253, 1259, 1273, 1274 */
 static const uint8_t dnxhd_1237_chroma_weight[] = {
      0,  32,  36,  39, 39, 38, 39,  41,
     45,  51,  57,  58, 53, 48, 47,  51,
@@ -73,6 +74,7 @@ static const uint8_t dnxhd_1237_chroma_weight[] = {
     97, 100, 104, 102, 98, 98, 99,  99,
 };
 
+/* Used in CID 1238, 1272 */
 static const uint8_t dnxhd_1238_luma_weight[] = {
      0, 32, 32, 33, 34, 33, 33, 33,
     33, 33, 33, 33, 33, 35, 37, 37,
@@ -84,6 +86,7 @@ static const uint8_t dnxhd_1238_luma_weight[] = {
     51, 53, 55, 57, 58, 59, 57, 57,
 };
 
+/* Used in CID 1238, 1272 */
 static const uint8_t dnxhd_1238_chroma_weight[] = {
      0, 32, 35, 35, 35, 34, 34, 35,
     39, 43, 45, 45, 41, 39, 40, 41,
@@ -95,6 +98,7 @@ static const uint8_t dnxhd_1238_chroma_weight[] = {
     82, 77, 80, 86, 84, 82, 82, 82,
 };
 
+/* Used in CID 1241, 1271 */
 static const uint8_t dnxhd_1241_luma_weight[] = {
      0, 32, 33, 34, 34, 35, 36, 37,
     36, 37, 38, 38, 38, 39, 39, 40,
@@ -106,6 +110,7 @@ static const uint8_t dnxhd_1241_luma_weight[] = {
     48, 46, 47, 48, 48, 49, 49, 49,
 };
 
+/* Used in CID 1241, 1271 */
 static const uint8_t dnxhd_1241_chroma_weight[] = {
      0, 32, 36, 38, 37, 37, 40, 41,
     40, 40, 42, 42, 41, 41, 41, 41,
@@ -251,27 +256,27 @@ static const uint8_t dnxhd_1260_chroma_weight[] = {
     56, 56, 53, 53, 53, 54, 58, 58,
 };
 
-/* Used in CID 1235, 1241, 1250, 1256 */
+/* Used in CID 1235, 1236, 1241, 1250, 1256, 1257, 1270, 1271 */
 static const uint8_t dnxhd_1235_dc_codes[14] = {
     10, 62, 11, 12, 13, 0, 1, 2, 3, 4, 14, 30, 126, 127,
 };
 
-/* Used in CID 1235, 1241, 1250, 1256 */
+/* Used in CID 1235, 1236, 1241, 1250, 1256, 1257, 1270, 1271 */
 static const uint8_t dnxhd_1235_dc_bits[14] = {
     4, 6, 4, 4, 4, 3, 3, 3, 3, 3, 4, 5, 7, 7,
 };
 
-/* Used in CID 1237, 1238, 1242, 1243, 1251, 1252, 1253, 1258, 1259, 1260 */
+/* Used in CID 1237, 1238, 1242, 1243, 1251, 1252, 1253, 1258, 1259, 1260, 1272, 1273, 1274 */
 static const uint8_t dnxhd_1237_dc_codes[12] = {
     0, 12, 13, 1, 2, 3, 4, 5, 14, 30, 62, 63,
 };
 
-/* Used in CID 1237, 1238, 1242, 1243, 1251, 1252, 1253, 1258, 1259, 1260 */
+/* Used in CID 1237, 1238, 1242, 1243, 1251, 1252, 1253, 1258, 1259, 1260, 1272, 1273, 1274 */
 static const uint8_t dnxhd_1237_dc_bits[12] = {
     3, 4, 4, 3, 3, 3, 3, 3, 4, 5, 6, 6,
 };
 
-/* Used in CID 1237, 1242, 1253, 1259, 1260 */
+/* Used in CID 1237, 1242, 1253, 1259, 1260, 1273, 1274 */
 static const uint16_t dnxhd_1237_ac_codes[257] = {
         0,     1,     4,     5,    12,    26,    27,    56,
        57,    58,    59,   120,   121,   244,   245,   246,
@@ -308,7 +313,7 @@ static const uint16_t dnxhd_1237_ac_codes[257] = {
     65535,
 };
 
-/* Used in CID 1237, 1242, 1253, 1259, 1260 */
+/* Used in CID 1237, 1242, 1253, 1259, 1260, 1273, 1274 */
 static const uint8_t dnxhd_1237_ac_bits[257] = {
      2,  2,  3,  3,  4,  5,  5,  6,  6,  6,  6,  7,  7,  8,  8,  8,
      8,  8,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 11, 11, 11,
@@ -329,70 +334,44 @@ static const uint8_t dnxhd_1237_ac_bits[257] = {
     16,
 };
 
-/* Used in CID 1237, 1242, 1253, 1259, 1260 */
-static const uint8_t dnxhd_1237_ac_level[257] = {
-     1,  1,  2,  0,  3,  4,  2,  5,  6,  7,  3,  8,  9, 10, 11, 12,
-     4,  5, 13, 14, 15, 16,  6, 17, 18, 19, 20, 21,  7, 22, 23, 24,
-    25, 26, 27,  8,  9, 28, 29, 30, 31, 32, 33, 34, 10, 11, 12, 35,
-    36, 37, 38, 39, 40, 41, 13, 14, 15, 16, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 17, 18, 19, 20, 21, 53, 54, 55, 56, 57, 58,
-    59, 60, 61, 64,  1, 22, 23, 24, 25, 26, 27, 62, 63,  2,  3,  4,
-     5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
-    21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
-    37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
-    53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,
-};
-
-/* Used in CID 1237, 1242, 1253, 1259, 1260 */
-static const uint8_t dnxhd_1237_ac_run_flag[257] = {
-    0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
-    1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
-    0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
-    0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
-};
-
-/* Used in CID 1237, 1242, 1253, 1259, 1260 */
-static const uint8_t dnxhd_1237_ac_index_flag[257] = {
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
-};
-
-/* Used in CID 1238, 1243 */
+/* Used in CID 1237, 1242, 1253, 1259, 1260, 1273, 1274. Byte pair per entry, merged from the former ac_level/ac_run_flag/ac_index_flag tables: 2*level+1 (0 for level 0), then 2*run_flag+index_flag. */
+static const uint8_t dnxhd_1237_ac_info[2*257] = {
+      3, 0,   3, 2,   5, 0,   0, 0,   7, 0,   9, 0,   5, 2,  11, 0,
+     13, 0,  15, 0,   7, 2,  17, 0,  19, 0,  21, 0,  23, 0,  25, 0,
+      9, 2,  11, 2,  27, 0,  29, 0,  31, 0,  33, 0,  13, 2,  35, 0,
+     37, 0,  39, 0,  41, 0,  43, 0,  15, 2,  45, 0,  47, 0,  49, 0,
+     51, 0,  53, 0,  55, 0,  17, 2,  19, 2,  57, 0,  59, 0,  61, 0,
+     63, 0,  65, 0,  67, 0,  69, 0,  21, 2,  23, 2,  25, 2,  71, 0,
+     73, 0,  75, 0,  77, 0,  79, 0,  81, 0,  83, 0,  27, 2,  29, 2,
+     31, 2,  33, 2,  85, 0,  87, 0,  89, 0,  91, 0,  93, 0,  95, 0,
+     97, 0,  99, 0, 101, 0, 103, 0, 105, 0,  35, 2,  37, 2,  39, 2,
+     41, 2,  43, 2, 107, 0, 109, 0, 111, 0, 113, 0, 115, 0, 117, 0,
+    119, 0, 121, 0, 123, 0, 129, 0,   3, 1,  45, 2,  47, 2,  49, 2,
+     51, 2,  53, 2,  55, 2, 125, 0, 127, 0,   5, 1,   7, 1,   9, 1,
+     11, 1,  13, 1,  15, 1,  17, 1,  19, 1,  21, 1,  23, 1,  25, 1,
+     27, 1,  29, 1,  31, 1,  33, 1,  35, 1,  37, 1,  39, 1,  41, 1,
+     43, 1,  45, 1,  47, 1,  49, 1,  51, 1,  53, 1,  55, 1,  57, 1,
+     59, 1,  61, 1,  63, 1,  65, 1,  67, 1,  69, 1,  71, 1,  73, 1,
+     75, 1,  77, 1,  79, 1,  81, 1,  83, 1,  85, 1,  87, 1,  89, 1,
+     91, 1,  93, 1,  95, 1,  97, 1,  99, 1, 101, 1, 103, 1, 105, 1,
+    107, 1, 109, 1, 111, 1, 113, 1, 115, 1, 117, 1, 119, 1, 121, 1,
+    123, 1, 125, 1, 127, 1, 129, 1,  57, 2,  59, 2,  61, 2,  63, 2,
+     65, 2,  67, 2,  69, 2,  71, 2,  73, 2,  75, 2,  77, 2,  79, 2,
+     81, 2,  83, 2,  85, 2,  87, 2,  89, 2,  91, 2,  93, 2,  95, 2,
+     97, 2,  99, 2, 101, 2, 103, 2, 105, 2, 107, 2, 109, 2, 111, 2,
+    113, 2, 115, 2, 117, 2, 119, 2, 121, 2, 123, 2, 125, 2, 127, 2,
+    129, 2,   3, 3,   5, 3,   7, 3,   9, 3,  11, 3,  13, 3,  15, 3,
+     17, 3,  19, 3,  21, 3,  23, 3,  25, 3,  27, 3,  29, 3,  31, 3,
+     33, 3,  35, 3,  37, 3,  39, 3,  41, 3,  43, 3,  45, 3,  47, 3,
+     49, 3,  51, 3,  53, 3,  55, 3,  57, 3,  59, 3,  61, 3,  63, 3,
+     65, 3,  67, 3,  69, 3,  71, 3,  73, 3,  75, 3,  77, 3,  79, 3,
+     81, 3,  83, 3,  85, 3,  87, 3,  89, 3,  91, 3,  93, 3,  95, 3,
+     97, 3,  99, 3, 101, 3, 103, 3, 105, 3, 107, 3, 109, 3, 111, 3,
+    113, 3, 115, 3, 117, 3, 119, 3, 121, 3, 123, 3, 125, 3, 127, 3,
+    129, 3,
+};
+
+/* Used in CID 1238, 1240, 1243, 1272 */
 static const uint16_t dnxhd_1238_ac_codes[257] = {
         0,     1,     4,    10,    11,    24,    25,    26,
        54,    55,    56,    57,   116,   117,   118,   119,
@@ -429,7 +408,7 @@ static const uint16_t dnxhd_1238_ac_codes[257] = {
     65535,
 };
 
-/* Used in CID 1238, 1243 */
+/* Used in CID 1238, 1240, 1243, 1272 */
 static const uint8_t dnxhd_1238_ac_bits[257] = {
      2,  2,  3,  4,  4,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,
      8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10,
@@ -450,70 +429,44 @@ static const uint8_t dnxhd_1238_ac_bits[257] = {
     16,
 };
 
-/* Used in CID 1238, 1243 */
-static const uint8_t dnxhd_1238_ac_level[257] = {
-     1,  1,  2,  3,  0,  4,  5,  2,  6,  7,  8,  3,  9, 10, 11,  4,
-    12, 13, 14, 15, 16,  5, 17, 18, 19, 20, 21, 22,  6,  7, 23, 24,
-    25, 26, 27, 28, 29,  8,  9, 30, 31, 32, 33, 34, 35, 36, 37, 10,
-    11, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 12, 13, 14, 49,
-    50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 15, 16, 17, 18,
-    62, 63, 64,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,
-    14, 15, 16, 19, 20, 21, 22, 23, 24, 17, 18, 19, 20, 21, 22, 23,
-    24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 40, 25,
-    26, 27, 28, 29, 30, 38, 39, 41, 42, 43, 44, 45, 46, 47, 48, 49,
-    50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,
+/* Used in CID 1238, 1240, 1243, 1272. Byte pair per entry, merged from the former ac_level/ac_run_flag/ac_index_flag tables: 2*level+1 (0 for level 0), then 2*run_flag+index_flag. */
+static const uint8_t dnxhd_1238_ac_info[2*257] = {
+      3, 0,   3, 2,   5, 0,   7, 0,   0, 0,   9, 0,  11, 0,   5, 2,
+     13, 0,  15, 0,  17, 0,   7, 2,  19, 0,  21, 0,  23, 0,   9, 2,
+     25, 0,  27, 0,  29, 0,  31, 0,  33, 0,  11, 2,  35, 0,  37, 0,
+     39, 0,  41, 0,  43, 0,  45, 0,  13, 2,  15, 2,  47, 0,  49, 0,
+     51, 0,  53, 0,  55, 0,  57, 0,  59, 0,  17, 2,  19, 2,  61, 0,
+     63, 0,  65, 0,  67, 0,  69, 0,  71, 0,  73, 0,  75, 0,  21, 2,
+     23, 2,  77, 0,  79, 0,  81, 0,  83, 0,  85, 0,  87, 0,  89, 0,
+     91, 0,  93, 0,  95, 0,  97, 0,  25, 2,  27, 2,  29, 2,  99, 0,
+    101, 0, 103, 0, 105, 0, 107, 0, 109, 0, 111, 0, 113, 0, 115, 0,
+    117, 0, 119, 0, 121, 0, 123, 0,  31, 2,  33, 2,  35, 2,  37, 2,
+    125, 0, 127, 0, 129, 0,   3, 1,   5, 1,   7, 1,   9, 1,  11, 1,
+     13, 1,  15, 1,  17, 1,  19, 1,  21, 1,  23, 1,  25, 1,  27, 1,
+     29, 1,  31, 1,  33, 1,  39, 2,  41, 2,  43, 2,  45, 2,  47, 2,
+     49, 2,  35, 1,  37, 1,  39, 1,  41, 1,  43, 1,  45, 1,  47, 1,
+     49, 1,  51, 1,  53, 1,  55, 1,  57, 1,  59, 1,  61, 1,  63, 1,
+     65, 1,  67, 1,  69, 1,  71, 1,  73, 1,  75, 1,  81, 1,  51, 2,
+     53, 2,  55, 2,  57, 2,  59, 2,  61, 2,  77, 1,  79, 1,  83, 1,
+     85, 1,  87, 1,  89, 1,  91, 1,  93, 1,  95, 1,  97, 1,  99, 1,
+    101, 1, 103, 1, 105, 1, 107, 1, 109, 1, 111, 1, 113, 1, 115, 1,
+    117, 1, 119, 1, 121, 1, 123, 1, 125, 1, 127, 1, 129, 1,  63, 2,
+     65, 2,  67, 2,  69, 2,  71, 2,  73, 2,  75, 2,  77, 2,  79, 2,
+     81, 2,  83, 2,  85, 2,  87, 2,  89, 2,  91, 2,  93, 2,  95, 2,
+     97, 2,  99, 2, 101, 2, 103, 2, 105, 2, 107, 2, 109, 2, 111, 2,
+    113, 2, 115, 2, 117, 2, 119, 2, 121, 2, 123, 2, 125, 2, 127, 2,
+    129, 2,   3, 3,   5, 3,   7, 3,   9, 3,  11, 3,  13, 3,  15, 3,
+     17, 3,  19, 3,  21, 3,  23, 3,  25, 3,  27, 3,  29, 3,  31, 3,
+     33, 3,  35, 3,  37, 3,  39, 3,  41, 3,  43, 3,  45, 3,  47, 3,
+     49, 3,  51, 3,  53, 3,  55, 3,  57, 3,  59, 3,  61, 3,  63, 3,
+     65, 3,  67, 3,  69, 3,  71, 3,  73, 3,  75, 3,  77, 3,  79, 3,
+     81, 3,  83, 3,  85, 3,  87, 3,  89, 3,  91, 3,  93, 3,  95, 3,
+     97, 3,  99, 3, 101, 3, 103, 3, 105, 3, 107, 3, 109, 3, 111, 3,
+    113, 3, 115, 3, 117, 3, 119, 3, 121, 3, 123, 3, 125, 3, 127, 3,
+    129, 3,
 }; /* 0 is EOB */
 
-/* Used in CID 1238, 1243 */
-static const uint8_t dnxhd_1238_ac_run_flag[257] = {
-    0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
-    0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
-    0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
-    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
-    1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
-};
-
-/* Used in CID 1238, 1243 */
-static const uint8_t dnxhd_1238_ac_index_flag[257] = {
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
-    0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
-};
-
-/* Used in CID 1235, 1241, 1256 */
+/* Used in CID 1235, 1236, 1241, 1256, 1257, 1270, 1271 */
 static const uint16_t dnxhd_1235_ac_codes[257] = {
         0,     1,     4,    10,    11,    24,    25,    26,
        54,    55,    56,    57,   116,   117,   118,   119,
@@ -550,7 +503,7 @@ static const uint16_t dnxhd_1235_ac_codes[257] = {
     65535,
 };
 
-/* Used in CID 1235, 1241, 1256 */
+/* Used in CID 1235, 1236, 1241, 1256, 1257, 1270, 1271 */
 static const uint8_t dnxhd_1235_ac_bits[257] = {
      2,  2,  3,  4,  4,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,
      8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10,
@@ -571,67 +524,41 @@ static const uint8_t dnxhd_1235_ac_bits[257] = {
     16,
 };
 
-/* Used in CID 1235, 1241, 1256 */
-static const uint8_t dnxhd_1235_ac_level[257] = {
-     1,  1,  2,  3,  0,  4,  5,  2,  6,  7,  8,  3,  9, 10, 11,  4,
-    12, 13, 14, 15, 16,  5, 17, 18, 19, 20, 21,  6,  7, 22, 23, 24,
-    25, 26, 27, 28, 29,  8,  9, 30, 31, 32, 33, 34, 35, 36, 37, 38,
-    10, 11, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 12, 13,
-    14, 15, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,  1,
-    16, 17, 18, 19, 64,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,
-    13, 14, 15, 16, 17, 20, 21, 22, 23, 24, 18, 19, 20, 21, 22, 23,
-    24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
-    40, 41, 42, 25, 26, 27, 28, 29, 30, 31, 32, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,
-};
-
-/* Used in CID 1235, 1241, 1256 */
-static const uint8_t dnxhd_1235_ac_run_flag[257] = {
-    0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
-    0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
-    0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
-    1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
-};
-
-/* Used in CID 1235, 1241, 1256 */
-static const uint8_t dnxhd_1235_ac_index_flag[257] = {
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
-    0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
+/* Used in CID 1235, 1241, 1256, 1270, 1271. Byte pair per entry, merged from the former ac_level/ac_run_flag/ac_index_flag tables: 2*level+1 (0 for level 0), then 2*run_flag+index_flag. */
+static const uint8_t dnxhd_1235_ac_info[2*257] = { /* NOTE(review): the dnxhd_1235_ac_codes/ac_bits comments also list CID 1236 and 1257 - confirm which list is right */
+      3, 0,   3, 2,   5, 0,   7, 0,   0, 0,   9, 0,  11, 0,   5, 2,
+     13, 0,  15, 0,  17, 0,   7, 2,  19, 0,  21, 0,  23, 0,   9, 2,
+     25, 0,  27, 0,  29, 0,  31, 0,  33, 0,  11, 2,  35, 0,  37, 0,
+     39, 0,  41, 0,  43, 0,  13, 2,  15, 2,  45, 0,  47, 0,  49, 0,
+     51, 0,  53, 0,  55, 0,  57, 0,  59, 0,  17, 2,  19, 2,  61, 0,
+     63, 0,  65, 0,  67, 0,  69, 0,  71, 0,  73, 0,  75, 0,  77, 0,
+     21, 2,  23, 2,  79, 0,  81, 0,  83, 0,  85, 0,  87, 0,  89, 0,
+     91, 0,  93, 0,  95, 0,  97, 0,  99, 0, 101, 0,  25, 2,  27, 2,
+     29, 2,  31, 2, 103, 0, 105, 0, 107, 0, 109, 0, 111, 0, 113, 0,
+    115, 0, 117, 0, 119, 0, 121, 0, 123, 0, 125, 0, 127, 0,   3, 1,
+     33, 2,  35, 2,  37, 2,  39, 2, 129, 0,   5, 1,   7, 1,   9, 1,
+     11, 1,  13, 1,  15, 1,  17, 1,  19, 1,  21, 1,  23, 1,  25, 1,
+     27, 1,  29, 1,  31, 1,  33, 1,  35, 1,  41, 2,  43, 2,  45, 2,
+     47, 2,  49, 2,  37, 1,  39, 1,  41, 1,  43, 1,  45, 1,  47, 1,
+     49, 1,  51, 1,  53, 1,  55, 1,  57, 1,  59, 1,  61, 1,  63, 1,
+     65, 1,  67, 1,  69, 1,  71, 1,  73, 1,  75, 1,  77, 1,  79, 1,
+     81, 1,  83, 1,  85, 1,  51, 2,  53, 2,  55, 2,  57, 2,  59, 2,
+     61, 2,  63, 2,  65, 2,  87, 1,  89, 1,  91, 1,  93, 1,  95, 1,
+     97, 1,  99, 1, 101, 1, 103, 1, 105, 1, 107, 1, 109, 1, 111, 1,
+    113, 1, 115, 1, 117, 1, 119, 1, 121, 1, 123, 1, 125, 1, 127, 1,
+    129, 1,  67, 2,  69, 2,  71, 2,  73, 2,  75, 2,  77, 2,  79, 2,
+     81, 2,  83, 2,  85, 2,  87, 2,  89, 2,  91, 2,  93, 2,  95, 2,
+     97, 2,  99, 2, 101, 2, 103, 2, 105, 2, 107, 2, 109, 2, 111, 2,
+    113, 2, 115, 2, 117, 2, 119, 2, 121, 2, 123, 2, 125, 2, 127, 2,
+    129, 2,   3, 3,   5, 3,   7, 3,   9, 3,  11, 3,  13, 3,  15, 3,
+     17, 3,  19, 3,  21, 3,  23, 3,  25, 3,  27, 3,  29, 3,  31, 3,
+     33, 3,  35, 3,  37, 3,  39, 3,  41, 3,  43, 3,  45, 3,  47, 3,
+     49, 3,  51, 3,  53, 3,  55, 3,  57, 3,  59, 3,  61, 3,  63, 3,
+     65, 3,  67, 3,  69, 3,  71, 3,  73, 3,  75, 3,  77, 3,  79, 3,
+     81, 3,  83, 3,  85, 3,  87, 3,  89, 3,  91, 3,  93, 3,  95, 3,
+     97, 3,  99, 3, 101, 3, 103, 3, 105, 3, 107, 3, 109, 3, 111, 3,
+    113, 3, 115, 3, 117, 3, 119, 3, 121, 3, 123, 3, 125, 3, 127, 3,
+    129, 3,
 };
 
 static const uint16_t dnxhd_1250_ac_codes[257] = {
@@ -688,62 +615,41 @@ static const uint8_t dnxhd_1250_ac_bits[257] = {
     16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
     16
 };
-static const uint8_t dnxhd_1250_ac_level[257] = {
-     1,  1,  2,  3,  0,  4,  5,  2,  6,  7,  8,  3,  9, 10, 11,  4,
-    12, 13, 14, 15, 16,  5, 17, 18, 19, 20, 21, 22,  6, 23, 24, 25,
-    26, 27, 28, 29,  7,  8, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
-     9, 10, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 11,
-    12, 13, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,  1,  2,
-     3,  4,  5, 14, 15, 16, 17,  6,  7,  8,  9, 10, 11, 12, 13, 14,
-    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 18, 19, 20, 21,
-    27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
-    43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 55, 56, 22, 23, 24,
-    25, 26, 27, 54, 57, 58, 59, 60, 61, 62, 63, 64, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64
-};
-static const uint8_t dnxhd_1250_ac_run_flag[257] = {
-    0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
-    0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
-    0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
-    1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
-    1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1
-};
-static const uint8_t dnxhd_1250_ac_index_flag[257] = {
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
-    1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
-    0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1
+
+static const uint8_t dnxhd_1250_ac_info[2*257] = { /* byte pair per entry, merged from the former ac_level/ac_run_flag/ac_index_flag tables: 2*level+1 (0 for level 0), then 2*run_flag+index_flag */
+      3, 0,   3, 2,   5, 0,   7, 0,   0, 0,   9, 0,  11, 0,   5, 2,
+     13, 0,  15, 0,  17, 0,   7, 2,  19, 0,  21, 0,  23, 0,   9, 2,
+     25, 0,  27, 0,  29, 0,  31, 0,  33, 0,  11, 2,  35, 0,  37, 0,
+     39, 0,  41, 0,  43, 0,  45, 0,  13, 2,  47, 0,  49, 0,  51, 0,
+     53, 0,  55, 0,  57, 0,  59, 0,  15, 2,  17, 2,  61, 0,  63, 0,
+     65, 0,  67, 0,  69, 0,  71, 0,  73, 0,  75, 0,  77, 0,  79, 0,
+     19, 2,  21, 2,  81, 0,  83, 0,  85, 0,  87, 0,  89, 0,  91, 0,
+     93, 0,  95, 0,  97, 0,  99, 0, 101, 0, 103, 0, 105, 0,  23, 2,
+     25, 2,  27, 2, 107, 0, 109, 0, 111, 0, 113, 0, 115, 0, 117, 0,
+    119, 0, 121, 0, 123, 0, 125, 0, 127, 0, 129, 0,   3, 1,   5, 1,
+      7, 1,   9, 1,  11, 1,  29, 2,  31, 2,  33, 2,  35, 2,  13, 1,
+     15, 1,  17, 1,  19, 1,  21, 1,  23, 1,  25, 1,  27, 1,  29, 1,
+     31, 1,  33, 1,  35, 1,  37, 1,  39, 1,  41, 1,  43, 1,  45, 1,
+     47, 1,  49, 1,  51, 1,  53, 1,  37, 2,  39, 2,  41, 2,  43, 2,
+     55, 1,  57, 1,  59, 1,  61, 1,  63, 1,  65, 1,  67, 1,  69, 1,
+     71, 1,  73, 1,  75, 1,  77, 1,  79, 1,  81, 1,  83, 1,  85, 1,
+     87, 1,  89, 1,  91, 1,  93, 1,  95, 1,  97, 1,  99, 1, 101, 1,
+    103, 1, 105, 1, 107, 1, 111, 1, 113, 1,  45, 2,  47, 2,  49, 2,
+     51, 2,  53, 2,  55, 2, 109, 1, 115, 1, 117, 1, 119, 1, 121, 1,
+    123, 1, 125, 1, 127, 1, 129, 1,  57, 2,  59, 2,  61, 2,  63, 2,
+     65, 2,  67, 2,  69, 2,  71, 2,  73, 2,  75, 2,  77, 2,  79, 2,
+     81, 2,  83, 2,  85, 2,  87, 2,  89, 2,  91, 2,  93, 2,  95, 2,
+     97, 2,  99, 2, 101, 2, 103, 2, 105, 2, 107, 2, 109, 2, 111, 2,
+    113, 2, 115, 2, 117, 2, 119, 2, 121, 2, 123, 2, 125, 2, 127, 2,
+    129, 2,   3, 3,   5, 3,   7, 3,   9, 3,  11, 3,  13, 3,  15, 3,
+     17, 3,  19, 3,  21, 3,  23, 3,  25, 3,  27, 3,  29, 3,  31, 3,
+     33, 3,  35, 3,  37, 3,  39, 3,  41, 3,  43, 3,  45, 3,  47, 3,
+     49, 3,  51, 3,  53, 3,  55, 3,  57, 3,  59, 3,  61, 3,  63, 3,
+     65, 3,  67, 3,  69, 3,  71, 3,  73, 3,  75, 3,  77, 3,  79, 3,
+     81, 3,  83, 3,  85, 3,  87, 3,  89, 3,  91, 3,  93, 3,  95, 3,
+     97, 3,  99, 3, 101, 3, 103, 3, 105, 3, 107, 3, 109, 3, 111, 3,
+    113, 3, 115, 3, 117, 3, 119, 3, 121, 3, 123, 3, 125, 3, 127, 3,
+    129, 3,
 };
 
 static const uint16_t dnxhd_1251_ac_codes[257] = {
@@ -802,64 +708,40 @@ static const uint8_t dnxhd_1251_ac_bits[257] = {
     16,
 };
 
-static const uint8_t dnxhd_1251_ac_level[257] = {
-     1,  1,  2,  3,  0,  4,  5,  2,  6,  7,  8,  3,  9, 10, 11,  4,
-    12, 13, 14, 15, 16,  5, 17, 18, 19, 20, 21,  6, 22, 23, 24, 25,
-    26, 27, 28, 29,  7,  8, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
-    40,  9, 10, 11, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
-    12, 13, 14, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,  1,
-     2,  3,  4,  5,  6,  7,  8, 15, 16, 17,  9, 10, 11, 12, 13, 14,
-    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 18,
-    19, 20, 21, 22, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
-    42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
-    58, 23, 24, 25, 26, 27, 28, 59, 60, 61, 62, 63, 64, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,
-};
-
-static const uint8_t dnxhd_1251_ac_run_flag[257] = {
-    0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
-    0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
-    0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
-    1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
-};
-
-static const uint8_t dnxhd_1251_ac_index_flag[257] = {
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
-    1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
-    0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
+static const uint8_t dnxhd_1251_ac_info[2*257] = { /* byte pair per entry, merged from the former ac_level/ac_run_flag/ac_index_flag tables: 2*level+1 (0 for level 0), then 2*run_flag+index_flag */
+      3, 0,   3, 2,   5, 0,   7, 0,   0, 0,   9, 0,  11, 0,   5, 2,
+     13, 0,  15, 0,  17, 0,   7, 2,  19, 0,  21, 0,  23, 0,   9, 2,
+     25, 0,  27, 0,  29, 0,  31, 0,  33, 0,  11, 2,  35, 0,  37, 0,
+     39, 0,  41, 0,  43, 0,  13, 2,  45, 0,  47, 0,  49, 0,  51, 0,
+     53, 0,  55, 0,  57, 0,  59, 0,  15, 2,  17, 2,  61, 0,  63, 0,
+     65, 0,  67, 0,  69, 0,  71, 0,  73, 0,  75, 0,  77, 0,  79, 0,
+     81, 0,  19, 2,  21, 2,  23, 2,  83, 0,  85, 0,  87, 0,  89, 0,
+     91, 0,  93, 0,  95, 0,  97, 0,  99, 0, 101, 0, 103, 0, 105, 0,
+     25, 2,  27, 2,  29, 2, 107, 0, 109, 0, 111, 0, 113, 0, 115, 0,
+    117, 0, 119, 0, 121, 0, 123, 0, 125, 0, 127, 0, 129, 0,   3, 1,
+      5, 1,   7, 1,   9, 1,  11, 1,  13, 1,  15, 1,  17, 1,  31, 2,
+     33, 2,  35, 2,  19, 1,  21, 1,  23, 1,  25, 1,  27, 1,  29, 1,
+     31, 1,  33, 1,  35, 1,  37, 1,  39, 1,  41, 1,  43, 1,  45, 1,
+     47, 1,  49, 1,  51, 1,  53, 1,  55, 1,  57, 1,  59, 1,  37, 2,
+     39, 2,  41, 2,  43, 2,  45, 2,  61, 1,  63, 1,  65, 1,  67, 1,
+     69, 1,  71, 1,  73, 1,  75, 1,  77, 1,  79, 1,  81, 1,  83, 1,
+     85, 1,  87, 1,  89, 1,  91, 1,  93, 1,  95, 1,  97, 1,  99, 1,
+    101, 1, 103, 1, 105, 1, 107, 1, 109, 1, 111, 1, 113, 1, 115, 1,
+    117, 1,  47, 2,  49, 2,  51, 2,  53, 2,  55, 2,  57, 2, 119, 1,
+    121, 1, 123, 1, 125, 1, 127, 1, 129, 1,  59, 2,  61, 2,  63, 2,
+     65, 2,  67, 2,  69, 2,  71, 2,  73, 2,  75, 2,  77, 2,  79, 2,
+     81, 2,  83, 2,  85, 2,  87, 2,  89, 2,  91, 2,  93, 2,  95, 2,
+     97, 2,  99, 2, 101, 2, 103, 2, 105, 2, 107, 2, 109, 2, 111, 2,
+    113, 2, 115, 2, 117, 2, 119, 2, 121, 2, 123, 2, 125, 2, 127, 2,
+    129, 2,   3, 3,   5, 3,   7, 3,   9, 3,  11, 3,  13, 3,  15, 3,
+     17, 3,  19, 3,  21, 3,  23, 3,  25, 3,  27, 3,  29, 3,  31, 3,
+     33, 3,  35, 3,  37, 3,  39, 3,  41, 3,  43, 3,  45, 3,  47, 3,
+     49, 3,  51, 3,  53, 3,  55, 3,  57, 3,  59, 3,  61, 3,  63, 3,
+     65, 3,  67, 3,  69, 3,  71, 3,  73, 3,  75, 3,  77, 3,  79, 3,
+     81, 3,  83, 3,  85, 3,  87, 3,  89, 3,  91, 3,  93, 3,  95, 3,
+     97, 3,  99, 3, 101, 3, 103, 3, 105, 3, 107, 3, 109, 3, 111, 3,
+    113, 3, 115, 3, 117, 3, 119, 3, 121, 3, 123, 3, 125, 3, 127, 3,
+    129, 3,
 };
 
 /* Used in CID 1252, 1258 */
@@ -921,69 +803,43 @@ static const uint8_t dnxhd_1252_ac_bits[257] = {
 };
 
 /* Used in CID 1252, 1258 */
-static const uint8_t dnxhd_1252_ac_level[257] = {
-     1,  1,  2,  3,  2,  0,  4,  5,  6,  7,  3,  8,  9, 10, 11, 12,
-    13, 14,  4,  5, 15, 16, 17, 18,  6, 19, 20, 21, 22, 23, 24,  7,
-     8, 25, 26, 27, 28, 29, 30, 31, 32,  9, 10, 33, 34, 35, 36, 37,
-    38, 39, 40, 41, 11, 12, 13, 42, 43, 44, 45, 46, 47, 48, 49, 50,
-    51, 52, 53, 14, 15, 16, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,  1,  2,  3, 17, 18, 19, 20,  4,  5,  6,  7,  8,  9, 10, 11,
-    12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 22,
-    23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
-    39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
-    55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 26, 27, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-    64,
-};
-
-/* Used in CID 1252, 1258 */
-static const uint8_t dnxhd_1252_ac_run_flag[257] = {
-    0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
-    0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
-    1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
-};
-
-/* Used in CID 1252, 1258 */
-static const uint8_t dnxhd_1252_ac_index_flag[257] = {
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1,
-};
-
-/* Used in CID 1235, 1238, 1241, 1256 */
+static const uint8_t dnxhd_1252_ac_info[2*257] = { /* byte pair per entry, merged from the former ac_level/ac_run_flag/ac_index_flag tables: 2*level+1 (0 for level 0), then 2*run_flag+index_flag */
+      3, 0,   3, 2,   5, 0,   7, 0,   5, 2,   0, 0,   9, 0,  11, 0,
+     13, 0,  15, 0,   7, 2,  17, 0,  19, 0,  21, 0,  23, 0,  25, 0,
+     27, 0,  29, 0,   9, 2,  11, 2,  31, 0,  33, 0,  35, 0,  37, 0,
+     13, 2,  39, 0,  41, 0,  43, 0,  45, 0,  47, 0,  49, 0,  15, 2,
+     17, 2,  51, 0,  53, 0,  55, 0,  57, 0,  59, 0,  61, 0,  63, 0,
+     65, 0,  19, 2,  21, 2,  67, 0,  69, 0,  71, 0,  73, 0,  75, 0,
+     77, 0,  79, 0,  81, 0,  83, 0,  23, 2,  25, 2,  27, 2,  85, 0,
+     87, 0,  89, 0,  91, 0,  93, 0,  95, 0,  97, 0,  99, 0, 101, 0,
+    103, 0, 105, 0, 107, 0,  29, 2,  31, 2,  33, 2, 109, 0, 111, 0,
+    113, 0, 115, 0, 117, 0, 119, 0, 121, 0, 123, 0, 125, 0, 127, 0,
+    129, 0,   3, 1,   5, 1,   7, 1,  35, 2,  37, 2,  39, 2,  41, 2,
+      9, 1,  11, 1,  13, 1,  15, 1,  17, 1,  19, 1,  21, 1,  23, 1,
+     25, 1,  27, 1,  29, 1,  31, 1,  33, 1,  35, 1,  37, 1,  39, 1,
+     41, 1,  43, 1,  43, 2,  45, 2,  47, 2,  49, 2,  51, 2,  45, 1,
+     47, 1,  49, 1,  51, 1,  53, 1,  55, 1,  57, 1,  59, 1,  61, 1,
+     63, 1,  65, 1,  67, 1,  69, 1,  71, 1,  73, 1,  75, 1,  77, 1,
+     79, 1,  81, 1,  83, 1,  85, 1,  87, 1,  89, 1,  91, 1,  93, 1,
+     95, 1,  97, 1,  99, 1, 101, 1, 103, 1, 105, 1, 107, 1, 109, 1,
+    111, 1, 113, 1, 115, 1, 117, 1, 119, 1, 121, 1, 123, 1, 125, 1,
+    127, 1, 129, 1,  53, 2,  55, 2,  57, 2,  59, 2,  61, 2,  63, 2,
+     65, 2,  67, 2,  69, 2,  71, 2,  73, 2,  75, 2,  77, 2,  79, 2,
+     81, 2,  83, 2,  85, 2,  87, 2,  89, 2,  91, 2,  93, 2,  95, 2,
+     97, 2,  99, 2, 101, 2, 103, 2, 105, 2, 107, 2, 109, 2, 111, 2,
+    113, 2, 115, 2, 117, 2, 119, 2, 121, 2, 123, 2, 125, 2, 127, 2,
+    129, 2,   3, 3,   5, 3,   7, 3,   9, 3,  11, 3,  13, 3,  15, 3,
+     17, 3,  19, 3,  21, 3,  23, 3,  25, 3,  27, 3,  29, 3,  31, 3,
+     33, 3,  35, 3,  37, 3,  39, 3,  41, 3,  43, 3,  45, 3,  47, 3,
+     49, 3,  51, 3,  53, 3,  55, 3,  57, 3,  59, 3,  61, 3,  63, 3,
+     65, 3,  67, 3,  69, 3,  71, 3,  73, 3,  75, 3,  77, 3,  79, 3,
+     81, 3,  83, 3,  85, 3,  87, 3,  89, 3,  91, 3,  93, 3,  95, 3,
+     97, 3,  99, 3, 101, 3, 103, 3, 105, 3, 107, 3, 109, 3, 111, 3,
+    113, 3, 115, 3, 117, 3, 119, 3, 121, 3, 123, 3, 125, 3, 127, 3,
+    129, 3,
+};
+
+/* Used in CID 1235, 1238, 1241, 1243, 1256, 1270, 1271, 1272 */
 static const uint16_t dnxhd_1235_run_codes[62] = {
        0,    4,   10,   11,   24,   25,   26,   27,
       56,   57,   58,   59,  120,  242,  486,  487,
@@ -995,7 +851,7 @@ static const uint16_t dnxhd_1235_run_codes[62] = {
     1018, 1019, 1020, 1021, 1022, 1023,
 };
 
-/* Used in CID 1235, 1238, 1241, 1243, 1256 */
+/* Used in CID 1235, 1238, 1241, 1243, 1256, 1270, 1271, 1272 */
 static const uint8_t dnxhd_1235_run_bits[62] = {
      1,  3,  4,  4,  5,  5,  5,  5,  6,  6,  6,  6,  7,  8,  9,  9,
      9,  9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
@@ -1003,7 +859,7 @@ static const uint8_t dnxhd_1235_run_bits[62] = {
     10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
 };
 
-/* Used in CID 1235, 1241, 1256 */
+/* Used in CID 1235, 1241, 1256, 1270, 1271 */
 static const uint8_t dnxhd_1235_run[62] = {
      1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
     18, 20, 17, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
@@ -1011,7 +867,7 @@ static const uint8_t dnxhd_1235_run[62] = {
     49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
 };
 
-/* Used in CID 1237, 1242, 1253, 1259, 1260 */
+/* Used in CID 1237, 1242, 1253, 1259, 1260, 1273, 1274 */
 static const uint16_t dnxhd_1237_run_codes[62] = {
        0,    4,   10,   11,   24,   25,   26,   54,
       55,   56,   57,   58,  118,  119,  240,  482,
@@ -1023,7 +879,7 @@ static const uint16_t dnxhd_1237_run_codes[62] = {
     1018, 1019, 1020, 1021, 1022, 1023,
 };
 
-/* Used in CID 1237, 1242, 1253, 1259, 1260 */
+/* Used in CID 1237, 1242, 1253, 1259, 1260, 1273, 1274 */
 static const uint8_t dnxhd_1237_run_bits[62] = {
      1,  3,  4,  4,  5,  5,  5,  6,  6,  6,  6,  6,  7,  7,  8,  9,
      9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10,
@@ -1031,7 +887,7 @@ static const uint8_t dnxhd_1237_run_bits[62] = {
     10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
 };
 
-/* Used in CID 1237, 1242, 1253, 1259, 1260 */
+/* Used in CID 1237, 1242, 1253, 1259, 1260, 1273, 1274 */
 static const uint8_t dnxhd_1237_run[62] = {
      1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
     17, 18, 19, 20, 21, 53, 57, 58, 59, 60, 61, 62, 22, 23, 24, 25,
@@ -1039,6 +895,7 @@ static const uint8_t dnxhd_1237_run[62] = {
     42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 54, 55, 56,
 };
 
+/* Used in CID 1238, 1243, 1272 */
 static const uint8_t dnxhd_1238_run[62] = {
      1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
     20, 21, 17, 18, 19, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
@@ -1075,104 +932,150 @@ static const uint8_t dnxhd_1250_run[62] = {
 };
 
 const CIDEntry ff_dnxhd_cid_table[] = {
-    { 1235, 1920, 1080, 0, 917504, 917504, 6, 10,
+    { 1235, 1920, 1080, 917504, 917504,
+      0, 6, 10, 4,
       dnxhd_1235_luma_weight, dnxhd_1235_chroma_weight,
       dnxhd_1235_dc_codes, dnxhd_1235_dc_bits,
-      dnxhd_1235_ac_codes, dnxhd_1235_ac_bits, dnxhd_1235_ac_level,
-      dnxhd_1235_ac_run_flag, dnxhd_1235_ac_index_flag,
+      dnxhd_1235_ac_codes, dnxhd_1235_ac_bits, dnxhd_1235_ac_info,
       dnxhd_1235_run_codes, dnxhd_1235_run_bits, dnxhd_1235_run,
-      { 175, 185, 365, 440 } },
-    { 1237, 1920, 1080, 0, 606208, 606208, 4, 8,
+      { 175, 185, 365, 440 },
+      { { 24000, 1001 }, { 25, 1 }, { 50, 1 }, { 60000, 1001 } } },
+    { 1237, 1920, 1080, 606208, 606208,
+      0, 4, 8, 3,
       dnxhd_1237_luma_weight, dnxhd_1237_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
-      dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_level,
-      dnxhd_1237_ac_run_flag, dnxhd_1237_ac_index_flag,
+      dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_info,
       dnxhd_1237_run_codes, dnxhd_1237_run_bits, dnxhd_1237_run,
-      { 115, 120, 145, 240, 290 } },
-    { 1238, 1920, 1080, 0, 917504, 917504, 4, 8,
+      { 115, 120, 145, 240, 290 },
+      { { 24000, 1001 }, { 25, 1 }, { 30000, 1001 }, { 50, 1 }, { 60000, 1001 } } },
+    { 1238, 1920, 1080, 917504, 917504,
+      0, 4, 8, 4,
       dnxhd_1238_luma_weight, dnxhd_1238_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
-      dnxhd_1238_ac_codes, dnxhd_1238_ac_bits, dnxhd_1238_ac_level,
-      dnxhd_1238_ac_run_flag, dnxhd_1238_ac_index_flag,
+      dnxhd_1238_ac_codes, dnxhd_1238_ac_bits, dnxhd_1238_ac_info,
       dnxhd_1235_run_codes, dnxhd_1235_run_bits, dnxhd_1238_run,
-      { 175, 185, 220, 365, 440 } },
-    { 1241, 1920, 1080, 1, 917504, 458752, 6, 10,
+      { 175, 185, 220, 365, 440 },
+      { { 24000, 1001 }, { 25, 1 }, { 30000, 1001 }, { 50, 1 }, { 60000, 1001 } } },
+    { 1241, 1920, 1080, 917504, 458752,
+      DNXHD_INTERLACED, 6, 10, 4,
       dnxhd_1241_luma_weight, dnxhd_1241_chroma_weight,
       dnxhd_1235_dc_codes, dnxhd_1235_dc_bits,
-      dnxhd_1235_ac_codes, dnxhd_1235_ac_bits, dnxhd_1235_ac_level,
-      dnxhd_1235_ac_run_flag, dnxhd_1235_ac_index_flag,
+      dnxhd_1235_ac_codes, dnxhd_1235_ac_bits, dnxhd_1235_ac_info,
       dnxhd_1235_run_codes, dnxhd_1235_run_bits, dnxhd_1235_run,
-      { 185, 220 } },
-    { 1242, 1920, 1080, 1, 606208, 303104, 4, 8,
+      { 185, 220 },
+      { { 25, 1 }, { 30000, 1001 } } },
+    { 1242, 1920, 1080, 606208, 303104,
+      DNXHD_INTERLACED, 4, 8, 3,
       dnxhd_1242_luma_weight, dnxhd_1242_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
-      dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_level,
-      dnxhd_1237_ac_run_flag, dnxhd_1237_ac_index_flag,
+      dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_info,
       dnxhd_1237_run_codes, dnxhd_1237_run_bits, dnxhd_1237_run,
-      { 120, 145 } },
-    { 1243, 1920, 1080, 1, 917504, 458752, 4, 8,
+      { 120, 145 },
+      { { 25, 1 }, { 30000, 1001 } } },
+    { 1243, 1920, 1080, 917504, 458752,
+      DNXHD_INTERLACED, 4, 8, 4,
       dnxhd_1243_luma_weight, dnxhd_1243_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
-      dnxhd_1238_ac_codes, dnxhd_1238_ac_bits, dnxhd_1238_ac_level,
-      dnxhd_1238_ac_run_flag, dnxhd_1238_ac_index_flag,
+      dnxhd_1238_ac_codes, dnxhd_1238_ac_bits, dnxhd_1238_ac_info,
       dnxhd_1235_run_codes, dnxhd_1235_run_bits, dnxhd_1238_run,
-      { 185, 220 } },
-    { 1250, 1280,  720, 0, 458752, 458752, 6, 10,
+      { 185, 220 },
+      { { 25, 1 }, { 30000, 1001 } } },
+    { 1250, 1280,  720, 458752, 458752,
+      0, 6, 10, 4,
       dnxhd_1250_luma_weight, dnxhd_1250_chroma_weight,
       dnxhd_1235_dc_codes, dnxhd_1235_dc_bits,
-      dnxhd_1250_ac_codes, dnxhd_1250_ac_bits, dnxhd_1250_ac_level,
-      dnxhd_1250_ac_run_flag, dnxhd_1250_ac_index_flag,
+      dnxhd_1250_ac_codes, dnxhd_1250_ac_bits, dnxhd_1250_ac_info,
       dnxhd_1250_run_codes, dnxhd_1250_run_bits, dnxhd_1250_run,
-      { 90, 180, 220 } },
-    { 1251, 1280,  720, 0, 458752, 458752, 4, 8,
+      { 90, 90, 180, 220 },
+      { { 24000, 1001 }, { 25, 1 }, { 50, 1 }, { 60000, 1001 } } },
+    { 1251, 1280,  720, 458752, 458752,
+      0, 4, 8, 4,
       dnxhd_1251_luma_weight, dnxhd_1251_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
-      dnxhd_1251_ac_codes, dnxhd_1251_ac_bits, dnxhd_1251_ac_level,
-      dnxhd_1251_ac_run_flag, dnxhd_1251_ac_index_flag,
+      dnxhd_1251_ac_codes, dnxhd_1251_ac_bits, dnxhd_1251_ac_info,
       dnxhd_1250_run_codes, dnxhd_1250_run_bits, dnxhd_1250_run,
-      { 90, 110, 175, 220 } },
-    { 1252, 1280,  720, 0, 303104, 303104, 4, 8,
+      { 90, 90, 110, 180, 220 },
+      { { 24000, 1001 }, { 25, 1 }, { 30000, 1001 }, { 50, 1 }, { 60000, 1001 } } },
+    { 1252, 1280,  720, 303104, 303104,
+      0, 4, 8, 5,
       dnxhd_1252_luma_weight, dnxhd_1252_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
-      dnxhd_1252_ac_codes, dnxhd_1252_ac_bits, dnxhd_1252_ac_level,
-      dnxhd_1252_ac_run_flag, dnxhd_1252_ac_index_flag,
+      dnxhd_1252_ac_codes, dnxhd_1252_ac_bits, dnxhd_1252_ac_info,
       dnxhd_1250_run_codes, dnxhd_1250_run_bits, dnxhd_1250_run,
-      { 60, 75, 115, 145 } },
-    { 1253, 1920, 1080, 0, 188416, 188416, 4, 8,
+      { 60, 60, 75, 120, 145 },
+      { { 24000, 1001 }, { 25, 1 }, { 30000, 1001 }, { 50, 1 }, { 60000, 1001 } } },
+    { 1253, 1920, 1080, 188416, 188416,
+      0, 4, 8, 3,
       dnxhd_1237_luma_weight, dnxhd_1237_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
-      dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_level,
-      dnxhd_1237_ac_run_flag, dnxhd_1237_ac_index_flag,
+      dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_info,
       dnxhd_1237_run_codes, dnxhd_1237_run_bits, dnxhd_1237_run,
-      { 36, 45, 75, 90 } },
-    { 1256, 1920, 1080, 0, 1835008, 1835008, 6, 10,
-      dnxhd_1235_luma_weight, dnxhd_1235_chroma_weight,
+      { 36, 36, 45, 75, 90 },
+      { { 24000, 1001 }, { 25, 1 }, { 30000, 1001 }, { 50, 1 }, { 60000, 1001 } } },
+    { 1256, 1920, 1080, 1835008, 1835008,
+      DNXHD_444, 6, 10, 4,
+      dnxhd_1235_luma_weight, dnxhd_1235_luma_weight,
       dnxhd_1235_dc_codes, dnxhd_1235_dc_bits,
-      dnxhd_1235_ac_codes, dnxhd_1235_ac_bits, dnxhd_1235_ac_level,
-      dnxhd_1235_ac_run_flag, dnxhd_1235_ac_index_flag,
+      dnxhd_1235_ac_codes, dnxhd_1235_ac_bits, dnxhd_1235_ac_info,
       dnxhd_1235_run_codes, dnxhd_1235_run_bits, dnxhd_1235_run,
-      { 350, 390, 440, 730, 880 } },
-    { 1258, 960, 720, 0, 212992, 212992, 4, 8,
+      { 350, 390, 440, 730, 880 },
+      { { 24000, 1001 }, { 25, 1 }, { 30000, 1001 }, { 50, 1 }, { 60000, 1001 } } },
+    { 1258, 960, 720, 212992, 212992,
+      0, 4, 8, 5,
       dnxhd_1252_luma_weight, dnxhd_1252_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
-      dnxhd_1252_ac_codes, dnxhd_1252_ac_bits, dnxhd_1252_ac_level,
-      dnxhd_1252_ac_run_flag, dnxhd_1252_ac_index_flag,
+      dnxhd_1252_ac_codes, dnxhd_1252_ac_bits, dnxhd_1252_ac_info,
       dnxhd_1250_run_codes, dnxhd_1250_run_bits, dnxhd_1250_run,
       { 42, 60, 75, 115 } },
-    { 1259, 1440, 1080, 0, 417792, 417792, 4, 8,
+    { 1259, 1440, 1080, 417792, 417792,
+      0, 4, 8, 3,
       dnxhd_1237_luma_weight, dnxhd_1237_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
-      dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_level,
-      dnxhd_1237_ac_run_flag, dnxhd_1237_ac_index_flag,
+      dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_info,
       dnxhd_1237_run_codes, dnxhd_1237_run_bits, dnxhd_1237_run,
       { 63, 84, 100, 110 } },
-    { 1260, 1440, 1080, 1, 835584, 417792, 4, 8,
+    { 1260, 1440, 1080, 835584, 417792,
+      DNXHD_INTERLACED | DNXHD_MBAFF, 4, 8, 3,
       dnxhd_1260_luma_weight, dnxhd_1260_chroma_weight,
       dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
-      dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_level,
-      dnxhd_1237_ac_run_flag, dnxhd_1237_ac_index_flag,
+      dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_info,
       dnxhd_1237_run_codes, dnxhd_1237_run_bits, dnxhd_1237_run,
       { 80, 90, 100, 110 } },
+    { 1270, DNXHD_VARIABLE, DNXHD_VARIABLE, DNXHD_VARIABLE, DNXHD_VARIABLE,
+      DNXHD_444, 6, DNXHD_VARIABLE, 4,
+      dnxhd_1235_luma_weight, dnxhd_1235_luma_weight,
+      dnxhd_1235_dc_codes, dnxhd_1235_dc_bits,
+      dnxhd_1235_ac_codes, dnxhd_1235_ac_bits, dnxhd_1235_ac_info,
+      dnxhd_1235_run_codes, dnxhd_1235_run_bits, dnxhd_1235_run,
+      { 0 } },
+    { 1271, DNXHD_VARIABLE, DNXHD_VARIABLE, DNXHD_VARIABLE, DNXHD_VARIABLE,
+      0, 6, DNXHD_VARIABLE, 4,
+      dnxhd_1241_luma_weight, dnxhd_1241_chroma_weight,
+      dnxhd_1235_dc_codes, dnxhd_1235_dc_bits,
+      dnxhd_1235_ac_codes, dnxhd_1235_ac_bits, dnxhd_1235_ac_info,
+      dnxhd_1235_run_codes, dnxhd_1235_run_bits, dnxhd_1235_run,
+      { 0 } },
+    { 1272, DNXHD_VARIABLE, DNXHD_VARIABLE, DNXHD_VARIABLE, DNXHD_VARIABLE,
+      0, 4, 8, 4,
+      dnxhd_1238_luma_weight, dnxhd_1238_chroma_weight,
+      dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
+      dnxhd_1238_ac_codes, dnxhd_1238_ac_bits, dnxhd_1238_ac_info,
+      dnxhd_1235_run_codes, dnxhd_1235_run_bits, dnxhd_1238_run,
+      { 0 } },
+    { 1273, DNXHD_VARIABLE, DNXHD_VARIABLE, DNXHD_VARIABLE, DNXHD_VARIABLE,
+      0, 4, 8, 3,
+      dnxhd_1237_luma_weight, dnxhd_1237_chroma_weight,
+      dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
+      dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_info,
+      dnxhd_1237_run_codes, dnxhd_1237_run_bits, dnxhd_1237_run,
+      { 0 } },
+    { 1274, DNXHD_VARIABLE, DNXHD_VARIABLE, DNXHD_VARIABLE, DNXHD_VARIABLE,
+      0, 4, 8, 3,
+      dnxhd_1237_luma_weight, dnxhd_1237_chroma_weight,
+      dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
+      dnxhd_1237_ac_codes, dnxhd_1237_ac_bits, dnxhd_1237_ac_info,
+      dnxhd_1237_run_codes, dnxhd_1237_run_bits, dnxhd_1237_run,
+      { 0 } },
 };
 
 int ff_dnxhd_get_cid_table(int cid)
@@ -1184,6 +1087,29 @@ int ff_dnxhd_get_cid_table(int cid)
     return -1;
 }
 
+int avpriv_dnxhd_get_frame_size(int cid)
+{
+    /* Negative results from the table lookup are returned unchanged. */
+    const int entry = ff_dnxhd_get_cid_table(cid);
+
+    return entry < 0 ? entry : (int)ff_dnxhd_cid_table[entry].frame_size;
+}
+
+int avpriv_dnxhd_get_interlaced(int cid)
+{
+    /* Negative lookup results are returned unchanged; otherwise yields 0 or 1. */
+    const int entry = ff_dnxhd_get_cid_table(cid);
+
+    return entry < 0 ? entry : !!(ff_dnxhd_cid_table[entry].flags & DNXHD_INTERLACED);
+}
+
+uint64_t avpriv_dnxhd_parse_header_prefix(const uint8_t *buf)
+{
+    /* buf[0..4] land in bits 47..8; the low byte stays 0 like the DNXHD_HEADER_* values. */
+    const uint64_t head = (uint64_t)AV_RB32(buf) << 16;
+    return ff_dnxhd_check_header_prefix(head | (buf[4] << 8));
+}
+
 int ff_dnxhd_find_cid(AVCodecContext *avctx, int bit_depth)
 {
     int i, j;
@@ -1192,10 +1118,16 @@ int ff_dnxhd_find_cid(AVCodecContext *avctx, int bit_depth)
         return 0;
     for (i = 0; i < FF_ARRAY_ELEMS(ff_dnxhd_cid_table); i++) {
         const CIDEntry *cid = &ff_dnxhd_cid_table[i];
+        int interlaced = cid->flags & DNXHD_INTERLACED ? 1 : 0;
         if (cid->width == avctx->width && cid->height == avctx->height &&
-            cid->interlaced == !!(avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT) &&
-            cid->bit_depth == bit_depth) {
-            for (j = 0; j < sizeof(cid->bit_rates); j++) {
+            interlaced == !!(avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT) &&
+            !(cid->flags & DNXHD_444) && cid->bit_depth == bit_depth) {
+            if (avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL &&
+                cid->flags & DNXHD_MBAFF) {
+                av_log(avctx, AV_LOG_WARNING, "Profile selected is experimental\n");
+                continue;
+            }
+            for (j = 0; j < FF_ARRAY_ELEMS(cid->bit_rates); j++) {
                 if (cid->bit_rates[j] == mbs)
                     return cid->cid;
             }
@@ -1204,25 +1136,18 @@ int ff_dnxhd_find_cid(AVCodecContext *avctx, int bit_depth)
     return 0;
 }
 
-void ff_dnxhd_list_cid(AVCodecContext *avctx)
+void ff_dnxhd_print_profiles(AVCodecContext *avctx, int loglevel)
 {
     int i, j;
-
     for (i = 0; i < FF_ARRAY_ELEMS(ff_dnxhd_cid_table); i++) {
         const CIDEntry *cid = &ff_dnxhd_cid_table[i];
-        av_log(avctx, AV_LOG_INFO,
-               "cid %d %ux%u %dbits %s bit rates",
-               cid->cid,
-               cid->width, cid->height,
-               cid->bit_depth,
-               cid->interlaced ? "interlaced " :
-                                 "progressive");
         for (j = 0; j < FF_ARRAY_ELEMS(cid->bit_rates); j++) {
             if (!cid->bit_rates[j])
                 break;
-            av_log(avctx, AV_LOG_INFO, " %dM",
-                   cid->bit_rates[j]);
+            /* width/height are unsigned (%u); DNXHD_444 entries are not 4:2:2. */
+            av_log(avctx, loglevel, "Frame size: %ux%u%c; bitrate: %dMbps; pixel format: %s; framerate: %d/%d\n",
+                   cid->width, cid->height, cid->flags & DNXHD_INTERLACED ? 'i' : 'p', cid->bit_rates[j],
+                   cid->flags & DNXHD_444 ? "yuv444p10, gbrp10" : cid->bit_depth == 10 ? "yuv422p10" : "yuv422p", cid->frame_rates[j].num, cid->frame_rates[j].den);
         }
-        av_log(avctx, AV_LOG_INFO, "\n");
     }
 }
diff --git a/libavcodec/dnxhddata.h b/libavcodec/dnxhddata.h
index cc877b6..3ae4683 100644
--- a/libavcodec/dnxhddata.h
+++ b/libavcodec/dnxhddata.h
@@ -2,20 +2,20 @@
  * VC3/DNxHD decoder.
  * Copyright (c) 2007 SmartJog S.A., Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,29 +24,58 @@
 
 #include <stdint.h>
 #include "avcodec.h"
+#include "libavutil/internal.h"
+
+/** Additional profile info flags */
+#define DNXHD_INTERLACED   (1<<0)
+#define DNXHD_MBAFF        (1<<1)
+#define DNXHD_444          (1<<2)
+
+/** Frame headers, extra 0x00 added to end for parser */
+#define DNXHD_HEADER_INITIAL 0x000002800100
+#define DNXHD_HEADER_444     0x000002800200
+#define DNXHD_HEADER_HR1     0x000002800300
+#define DNXHD_HEADER_HR2     0x0000038C0300
+
+/** Indicate that a CIDEntry value must be read in the bitstream */
+#define DNXHD_VARIABLE 0
 
 typedef struct CIDEntry {
     int cid;
     unsigned int width, height;
-    int interlaced;
     unsigned int frame_size;
     unsigned int coding_unit_size;
+    uint16_t flags; ///< bitmask of DNXHD_INTERLACED, DNXHD_MBAFF and DNXHD_444
     int index_bits;
     int bit_depth;
+    int eob_index;
     const uint8_t *luma_weight, *chroma_weight;
     const uint8_t *dc_codes, *dc_bits;
     const uint16_t *ac_codes;
-    const uint8_t *ac_bits, *ac_level;
-    const uint8_t *ac_run_flag, *ac_index_flag;
+    const uint8_t *ac_bits, *ac_info;
     const uint16_t *run_codes;
     const uint8_t *run_bits, *run;
     int bit_rates[5]; ///< Helper to choose variants, rounded to nearest 5Mb/s
+    AVRational frame_rates[5]; ///< printed with bit_rates[] at the same index; zero-filled when an entry lists none
 } CIDEntry;
 
 extern const CIDEntry ff_dnxhd_cid_table[];
 
 int ff_dnxhd_get_cid_table(int cid);
 int ff_dnxhd_find_cid(AVCodecContext *avctx, int bit_depth);
-void ff_dnxhd_list_cid(AVCodecContext *avctx);
+void ff_dnxhd_print_profiles(AVCodecContext *avctx, int loglevel);
+
+static av_always_inline uint64_t ff_dnxhd_check_header_prefix(uint64_t prefix)
+{
+    /* Echo back a recognised frame header value; anything else maps to 0. */
+    const int known = prefix == DNXHD_HEADER_INITIAL ||
+                      prefix == DNXHD_HEADER_444     ||
+                      prefix == DNXHD_HEADER_HR1     ||
+                      prefix == DNXHD_HEADER_HR2;
+    return known ? prefix : 0;
+}
 
+int avpriv_dnxhd_get_frame_size(int cid);
+int avpriv_dnxhd_get_interlaced(int cid);
+uint64_t avpriv_dnxhd_parse_header_prefix(const uint8_t *buf);
 #endif /* AVCODEC_DNXHDDATA_H */
diff --git a/libavcodec/dnxhddec.c b/libavcodec/dnxhddec.c
index 50747ea..1808080 100644
--- a/libavcodec/dnxhddec.c
+++ b/libavcodec/dnxhddec.c
@@ -2,23 +2,25 @@
  * VC3/DNxHD decoder.
  * Copyright (c) 2007 SmartJog S.A., Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
  * Copyright (c) 2011 MirriAd Ltd
+ * Copyright (c) 2015 Christophe Gisquet
  *
  * 10 bit support added by MirriAd Ltd, Joseph Artsimovich <joseph@mirriad.com>
+ * Slice multithreading and MB interlaced support added by Christophe Gisquet
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,53 +28,83 @@
 #include "libavutil/timer.h"
 #include "avcodec.h"
 #include "blockdsp.h"
+#define  UNCHECKED_BITSTREAM_READER 1
 #include "get_bits.h"
 #include "dnxhddata.h"
 #include "idctdsp.h"
 #include "internal.h"
 #include "thread.h"
 
+typedef struct RowContext { /* decoding state kept outside DNXHDContext; ctx->rows holds avctx->thread_count of these */
+    DECLARE_ALIGNED(16, int16_t, blocks)[12][64]; /* 12 blocks of 64 int16_t values, 16-byte aligned */
+    int luma_scale[64];   /* NOTE(review): presumably scale tables cached for last_qscale - confirm in the block decoders */
+    int chroma_scale[64];
+    GetBitContext gb;     /* bit reader owned by this context */
+    int last_dc[3];
+    int last_qscale;
+    int errors;
+    /** -1:not set yet  0:off=RGB  1:on=YUV  2:variable */
+    int format;
+} RowContext;
+
 typedef struct DNXHDContext {
     AVCodecContext *avctx;
-    GetBitContext gb;
+    RowContext *rows;                   ///< avctx->thread_count entries, allocated at init
     BlockDSPContext bdsp;
-    int cid;                            ///< compression id
+    const uint8_t* buf;
+    int buf_size;
+    int64_t cid;                        ///< compression id, -1 until dnxhd_init_vlc() has loaded a profile
     unsigned int width, height;
+    enum AVPixelFormat pix_fmt;
     unsigned int mb_width, mb_height;
-    uint32_t mb_scan_index[68];         /* max for 1080p */
+    uint32_t mb_scan_index[256];        ///< filled for i < mb_height; mb_height is read from a single header byte
+    int data_offset;                    // End of mb_scan_index, where macroblocks start
     int cur_field;                      ///< current interlaced field
     VLC ac_vlc, dc_vlc, run_vlc;
-    int last_dc[3];
     IDCTDSPContext idsp;
-    DECLARE_ALIGNED(16, int16_t, blocks)[12][64];
     ScanTable scantable;
     const CIDEntry *cid_table;
-    int bit_depth; // 8, 10 or 0 if not initialized at all.
+    int bit_depth; // 8, 10, 12 or 0 if not initialized at all.
     int is_444;
     int mbaff;
-    void (*decode_dct_block)(struct DNXHDContext *ctx, int16_t *block,
-                             int n, int qscale);
+    int act;                            ///< low 3 bits of header byte 0x2C
+    int (*decode_dct_block)(const struct DNXHDContext *ctx,
+                            RowContext *row, int n);
 } DNXHDContext;
 
 #define DNXHD_VLC_BITS 9
 #define DNXHD_DC_VLC_BITS 7
 
-static void dnxhd_decode_dct_block_8(DNXHDContext *ctx, int16_t *block,
-                                     int n, int qscale);
-static void dnxhd_decode_dct_block_10(DNXHDContext *ctx, int16_t *block,
-                                      int n, int qscale);
-static void dnxhd_decode_dct_block_10_444(DNXHDContext *ctx, int16_t *block,
-                                          int n, int qscale);
+static int dnxhd_decode_dct_block_8(const DNXHDContext *ctx,
+                                    RowContext *row, int n);
+static int dnxhd_decode_dct_block_10(const DNXHDContext *ctx,
+                                     RowContext *row, int n);
+static int dnxhd_decode_dct_block_10_444(const DNXHDContext *ctx,
+                                         RowContext *row, int n);
+static int dnxhd_decode_dct_block_12(const DNXHDContext *ctx,
+                                     RowContext *row, int n);
+static int dnxhd_decode_dct_block_12_444(const DNXHDContext *ctx,
+                                         RowContext *row, int n);
 
 static av_cold int dnxhd_decode_init(AVCodecContext *avctx)
 {
     DNXHDContext *ctx = avctx->priv_data;
 
     ctx->avctx = avctx;
+    ctx->cid = -1; /* no profile loaded yet: dnxhd_init_vlc() only rebuilds tables when the CID differs */
+    avctx->colorspace = AVCOL_SPC_BT709;
+    /* Coded dimensions are the configured dimensions aligned to 16 with FFALIGN. */
+    avctx->coded_width  = FFALIGN(avctx->width,  16);
+    avctx->coded_height = FFALIGN(avctx->height, 16);
+    /* One zero-initialised RowContext per thread. */
+    ctx->rows = av_mallocz_array(avctx->thread_count, sizeof(RowContext));
+    if (!ctx->rows)
+        return AVERROR(ENOMEM);
+
     return 0;
 }
 
-static int dnxhd_init_vlc(DNXHDContext *ctx, int cid)
+static int dnxhd_init_vlc(DNXHDContext *ctx, uint32_t cid, int bitdepth)
 {
     if (cid != ctx->cid) {
         int index;
@@ -81,6 +113,16 @@ static int dnxhd_init_vlc(DNXHDContext *ctx, int cid)
             av_log(ctx->avctx, AV_LOG_ERROR, "unsupported cid %d\n", cid);
             return AVERROR(ENOSYS);
         }
+        if (ff_dnxhd_cid_table[index].bit_depth != bitdepth &&
+            ff_dnxhd_cid_table[index].bit_depth != DNXHD_VARIABLE) {
+            av_log(ctx->avctx, AV_LOG_ERROR, "bit depth mismatches %d %d\n", ff_dnxhd_cid_table[index].bit_depth, bitdepth);
+            return AVERROR_INVALIDDATA;
+        }
+        if (bitdepth > 10) {
+            avpriv_request_sample(ctx->avctx, "DNXHR 12-bit");
+            if (ctx->avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL)
+                return AVERROR_PATCHWELCOME;
+        }
         ctx->cid_table = &ff_dnxhd_cid_table[index];
         av_log(ctx->avctx, AV_LOG_VERBOSE, "Profile cid %d.\n", cid);
 
@@ -91,36 +133,47 @@ static int dnxhd_init_vlc(DNXHDContext *ctx, int cid)
         init_vlc(&ctx->ac_vlc, DNXHD_VLC_BITS, 257,
                  ctx->cid_table->ac_bits, 1, 1,
                  ctx->cid_table->ac_codes, 2, 2, 0);
-        init_vlc(&ctx->dc_vlc, DNXHD_DC_VLC_BITS, ctx->bit_depth + 4,
+        init_vlc(&ctx->dc_vlc, DNXHD_DC_VLC_BITS, bitdepth + 4,
                  ctx->cid_table->dc_bits, 1, 1,
                  ctx->cid_table->dc_codes, 1, 1, 0);
         init_vlc(&ctx->run_vlc, DNXHD_VLC_BITS, 62,
                  ctx->cid_table->run_bits, 1, 1,
                  ctx->cid_table->run_codes, 2, 2, 0);
 
-        ff_init_scantable(ctx->idsp.idct_permutation, &ctx->scantable,
-                          ff_zigzag_direct);
         ctx->cid = cid;
     }
     return 0;
 }
 
+static av_cold int dnxhd_decode_init_thread_copy(AVCodecContext *avctx)
+{
+    DNXHDContext *const dec = avctx->priv_data;
+
+    /* Invalidate the cached CID so that dnxhd_init_vlc() loads the VLC
+     * tables for this copy once a CID has been parsed. */
+    dec->cid = -1;
+
+    /* This copy gets its own zeroed RowContext array, one entry per thread. */
+    dec->rows = av_mallocz_array(avctx->thread_count, sizeof(*dec->rows));
+
+    return dec->rows ? 0 : AVERROR(ENOMEM);
+}
+
 static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame,
                                const uint8_t *buf, int buf_size,
                                int first_field)
 {
-    static const uint8_t header_prefix[]    = { 0x00, 0x00, 0x02, 0x80, 0x01 };
-    static const uint8_t header_prefix444[] = { 0x00, 0x00, 0x02, 0x80, 0x02 };
     int i, cid, ret;
-    int old_bit_depth = ctx->bit_depth;
-
+    int old_bit_depth = ctx->bit_depth, bitdepth;
+    uint64_t header_prefix;
     if (buf_size < 0x280) {
-        av_log(ctx->avctx, AV_LOG_ERROR, "buffer too small (%d < 640).\n",
-               buf_size);
+        av_log(ctx->avctx, AV_LOG_ERROR,
+               "buffer too small (%d < 640).\n", buf_size);
         return AVERROR_INVALIDDATA;
     }
 
-    if (memcmp(buf, header_prefix, 5) && memcmp(buf, header_prefix444, 5)) {
+    header_prefix = avpriv_dnxhd_parse_header_prefix(buf);
+    if (header_prefix == 0) {
         av_log(ctx->avctx, AV_LOG_ERROR,
                "unknown header 0x%02X 0x%02X 0x%02X 0x%02X 0x%02X\n",
                buf[0], buf[1], buf[2], buf[3], buf[4]);
@@ -132,52 +185,73 @@ static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame,
         frame->top_field_first  = first_field ^ ctx->cur_field;
         av_log(ctx->avctx, AV_LOG_DEBUG,
                "interlaced %d, cur field %d\n", buf[5] & 3, ctx->cur_field);
+    } else {
+        ctx->cur_field = 0;
     }
-    ctx->mbaff = buf[0x6] & 32;
+    ctx->mbaff = (buf[0x6] >> 5) & 1;
 
     ctx->height = AV_RB16(buf + 0x18);
     ctx->width  = AV_RB16(buf + 0x1a);
 
-    ff_dlog(ctx->avctx, "width %d, height %d\n", ctx->width, ctx->height);
+    switch(buf[0x21] >> 5) {
+    case 1: bitdepth = 8; break;
+    case 2: bitdepth = 10; break;
+    case 3: bitdepth = 12; break;
+    default:
+        av_log(ctx->avctx, AV_LOG_ERROR,
+               "Unknown bitdepth indicator (%d)\n", buf[0x21] >> 5);
+        return AVERROR_INVALIDDATA;
+    }
 
-    if (buf[0x21] == 0x58) { /* 10 bit */
-        ctx->bit_depth = ctx->avctx->bits_per_raw_sample = 10;
+    cid = AV_RB32(buf + 0x28);
+    if ((ret = dnxhd_init_vlc(ctx, cid, bitdepth)) < 0)
+        return ret;
+    if (ctx->mbaff && ctx->cid_table->cid != 1260)
+        av_log(ctx->avctx, AV_LOG_WARNING,
+               "Adaptive MB interlace flag in an unsupported profile.\n");
+
+    ctx->act = buf[0x2C] & 7;
+    if (ctx->act && ctx->cid_table->cid != 1256 && ctx->cid_table->cid != 1270)
+        av_log(ctx->avctx, AV_LOG_WARNING,
+               "Adaptive color transform in an unsupported profile.\n");
 
-        if (buf[0x4] == 0x2) {
+    ctx->is_444 = (buf[0x2C] >> 6) & 1;
+    if (ctx->is_444) {
+        if (bitdepth == 8) {
+            avpriv_request_sample(ctx->avctx, "4:4:4 8 bits\n");
+            return AVERROR_INVALIDDATA;
+        } else if (bitdepth == 10) {
             ctx->decode_dct_block = dnxhd_decode_dct_block_10_444;
-            ctx->avctx->pix_fmt = AV_PIX_FMT_YUV444P10;
-            ctx->is_444 = 1;
+            ctx->pix_fmt = ctx->act ? AV_PIX_FMT_YUV444P10
+                                    : AV_PIX_FMT_GBRP10;
         } else {
-            ctx->decode_dct_block = dnxhd_decode_dct_block_10;
-            ctx->avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
+            ctx->decode_dct_block = dnxhd_decode_dct_block_12_444;
+            ctx->pix_fmt = ctx->act ? AV_PIX_FMT_YUV444P12
+                                    : AV_PIX_FMT_GBRP12;
         }
-    } else if (buf[0x21] == 0x38) { /* 8 bit */
-        ctx->bit_depth = ctx->avctx->bits_per_raw_sample = 8;
-
-        ctx->avctx->pix_fmt = AV_PIX_FMT_YUV422P;
-        ctx->decode_dct_block = dnxhd_decode_dct_block_8;
+    } else if (bitdepth == 12) {
+        ctx->decode_dct_block = dnxhd_decode_dct_block_12;
+        ctx->pix_fmt = AV_PIX_FMT_YUV422P12;
+    } else if (bitdepth == 10) {
+        ctx->decode_dct_block = dnxhd_decode_dct_block_10;
+        ctx->pix_fmt = AV_PIX_FMT_YUV422P10;
     } else {
-        av_log(ctx->avctx, AV_LOG_ERROR, "invalid bit depth value (%d).\n",
-               buf[0x21]);
-        return AVERROR_INVALIDDATA;
+        ctx->decode_dct_block = dnxhd_decode_dct_block_8;
+        ctx->pix_fmt = AV_PIX_FMT_YUV422P;
     }
+
+    ctx->avctx->bits_per_raw_sample = ctx->bit_depth = bitdepth;
     if (ctx->bit_depth != old_bit_depth) {
         ff_blockdsp_init(&ctx->bdsp, ctx->avctx);
         ff_idctdsp_init(&ctx->idsp, ctx->avctx);
+        ff_init_scantable(ctx->idsp.idct_permutation, &ctx->scantable,
+                          ff_zigzag_direct);
     }
 
-    cid = AV_RB32(buf + 0x28);
-    ff_dlog(ctx->avctx, "compression id %d\n", cid);
-
-    if ((ret = dnxhd_init_vlc(ctx, cid)) < 0)
-        return ret;
-    if (ctx->mbaff && ctx->cid_table->cid != 1260)
-        av_log(ctx->avctx, AV_LOG_WARNING,
-               "Adaptive MB interlace flag in an unsupported profile.\n");
-
     // make sure profile size constraints are respected
     // DNx100 allows 1920->1440 and 1280->960 subsampling
-    if (ctx->width != ctx->cid_table->width) {
+    if (ctx->width != ctx->cid_table->width &&
+        ctx->cid_table->width != DNXHD_VARIABLE) {
         av_reduce(&ctx->avctx->sample_aspect_ratio.num,
                   &ctx->avctx->sample_aspect_ratio.den,
                   ctx->width, ctx->cid_table->width, 255);
@@ -190,29 +264,44 @@ static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame,
         return AVERROR_INVALIDDATA;
     }
 
-    ctx->mb_width  = ctx->width >> 4;
+    ctx->mb_width  = (ctx->width + 15)>> 4;
     ctx->mb_height = buf[0x16d];
 
-    ff_dlog(ctx->avctx,
-            "mb width %d, mb height %d\n", ctx->mb_width, ctx->mb_height);
-
     if ((ctx->height + 15) >> 4 == ctx->mb_height && frame->interlaced_frame)
         ctx->height <<= 1;
 
-    if (ctx->mb_height > 68 ||
-        (ctx->mb_height << frame->interlaced_frame) > (ctx->height + 15) >> 4) {
+    av_log(ctx->avctx, AV_LOG_VERBOSE, "%dx%d, 4:%s %d bits, MBAFF=%d ACT=%d\n",
+           ctx->width, ctx->height, ctx->is_444 ? "4:4" : "2:2",
+           ctx->bit_depth, ctx->mbaff, ctx->act);
+
+    // Newer format supports variable mb_scan_index sizes
+    if (header_prefix == DNXHD_HEADER_HR2) {
+        ctx->data_offset = 0x170 + (ctx->mb_height << 2);
+    } else {
+        if (ctx->mb_height > 68 ||
+            (ctx->mb_height << frame->interlaced_frame) > (ctx->height + 15) >> 4) {
+            av_log(ctx->avctx, AV_LOG_ERROR,
+                   "mb height too big: %d\n", ctx->mb_height);
+            return AVERROR_INVALIDDATA;
+        }
+        ctx->data_offset = 0x280;
+    }
+
+    if (buf_size < ctx->data_offset) {
         av_log(ctx->avctx, AV_LOG_ERROR,
-               "mb height too big: %d\n", ctx->mb_height);
+               "buffer too small (%d < %d).\n", buf_size, ctx->data_offset);
         return AVERROR_INVALIDDATA;
     }
 
+    av_assert0((unsigned)ctx->mb_height <= FF_ARRAY_ELEMS(ctx->mb_scan_index));
+
     for (i = 0; i < ctx->mb_height; i++) {
         ctx->mb_scan_index[i] = AV_RB32(buf + 0x170 + (i << 2));
-        ff_dlog(ctx->avctx, "mb scan index %d\n", ctx->mb_scan_index[i]);
-        if (buf_size < ctx->mb_scan_index[i] + 0x280) {
+        ff_dlog(ctx->avctx, "mb scan index %d, pos %d: %u\n", i, 0x170 + (i << 2), ctx->mb_scan_index[i]);
+        if (buf_size - ctx->data_offset < ctx->mb_scan_index[i]) {
             av_log(ctx->avctx, AV_LOG_ERROR,
-                   "invalid mb scan index (%d < %d).\n",
-                   buf_size, ctx->mb_scan_index[i] + 0x280);
+                   "invalid mb scan index (%u vs %u).\n",
+                   ctx->mb_scan_index[i], buf_size - ctx->data_offset);
             return AVERROR_INVALIDDATA;
         }
     }
@@ -220,132 +309,181 @@ static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame,
     return 0;
 }
 
-static av_always_inline void dnxhd_decode_dct_block(DNXHDContext *ctx,
-                                                    int16_t *block, int n,
-                                                    int qscale,
-                                                    int index_bits,
-                                                    int level_bias,
-                                                    int level_shift)
+static av_always_inline int dnxhd_decode_dct_block(const DNXHDContext *ctx, /* returns 0, or -1 if the AC data is damaged */
+                                                   RowContext *row,         /* per-row state: bit reader, last_dc, scale tables, blocks */
+                                                   int n,                   /* index of the block to decode into row->blocks[n] */
+                                                   int index_bits,          /* width of the optional level extension field */
+                                                   int level_bias,          /* added before the final shift, see the test below */
+                                                   int level_shift,         /* right shift applied to the scaled level */
+                                                   int dc_shift)            /* scales the DC differential by 1 << dc_shift */
 {
-    int i, j, index1, index2, len;
+    int i, j, index1, index2, len, flags;
     int level, component, sign;
+    const int *scale; /* luma or chroma table of qscale * weight values */
     const uint8_t *weight_matrix;
-    OPEN_READER(bs, &ctx->gb);
+    const uint8_t *ac_info = ctx->cid_table->ac_info; /* two bytes per AC VLC index: level, then flags */
+    int16_t *block = row->blocks[n];
+    const int eob_index     = ctx->cid_table->eob_index; /* VLC index that terminates the block */
+    int ret = 0;
+    OPEN_READER(bs, &row->gb);
+
+    ctx->bdsp.clear_block(block); /* the caller no longer clears the block; it is done here */
 
     if (!ctx->is_444) {
         if (n & 2) {
             component     = 1 + (n & 1);
+            scale = row->chroma_scale;
             weight_matrix = ctx->cid_table->chroma_weight;
         } else {
             component     = 0;
+            scale = row->luma_scale;
             weight_matrix = ctx->cid_table->luma_weight;
         }
     } else {
         component = (n >> 1) % 3;
         if (component) {
+            scale = row->chroma_scale;
             weight_matrix = ctx->cid_table->chroma_weight;
         } else {
+            scale = row->luma_scale;
             weight_matrix = ctx->cid_table->luma_weight;
         }
     }
 
-    UPDATE_CACHE(bs, &ctx->gb);
-    GET_VLC(len, bs, &ctx->gb, ctx->dc_vlc.table, DNXHD_DC_VLC_BITS, 1);
+    UPDATE_CACHE(bs, &row->gb);
+    GET_VLC(len, bs, &row->gb, ctx->dc_vlc.table, DNXHD_DC_VLC_BITS, 1);
     if (len) {
-        level = GET_CACHE(bs, &ctx->gb);
-        LAST_SKIP_BITS(bs, &ctx->gb, len);
+        level = GET_CACHE(bs, &row->gb);
+        LAST_SKIP_BITS(bs, &row->gb, len);
         sign  = ~level >> 31;
         level = (NEG_USR32(sign ^ level, len) ^ sign) - sign;
-        ctx->last_dc[component] += level;
+        row->last_dc[component] += level * (1 << dc_shift); /* multiply, not <<: level may be negative */
     }
-    block[0] = ctx->last_dc[component];
+    block[0] = row->last_dc[component];
 
-    for (i = 1; ; i++) {
-        UPDATE_CACHE(bs, &ctx->gb);
-        GET_VLC(index1, bs, &ctx->gb, ctx->ac_vlc.table,
-                DNXHD_VLC_BITS, 2);
-        level = ctx->cid_table->ac_level[index1];
-        if (!level) /* EOB */
-            break;
+    i = 0; /* coefficient position; pre-incremented (++i) before each store */
 
-        sign = SHOW_SBITS(bs, &ctx->gb, 1);
-        SKIP_BITS(bs, &ctx->gb, 1);
+    UPDATE_CACHE(bs, &row->gb);
+    GET_VLC(index1, bs, &row->gb, ctx->ac_vlc.table,
+            DNXHD_VLC_BITS, 2);
 
-        if (ctx->cid_table->ac_index_flag[index1]) {
-            level += SHOW_UBITS(bs, &ctx->gb, index_bits) << 6;
-            SKIP_BITS(bs, &ctx->gb, index_bits);
+    while (index1 != eob_index) { /* until the end-of-block code is read */
+        level = ac_info[2*index1+0];
+        flags = ac_info[2*index1+1];
+
+        sign = SHOW_SBITS(bs, &row->gb, 1);
+        SKIP_BITS(bs, &row->gb, 1);
+
+        if (flags & 1) { /* bit 0: index_bits more level bits follow */
+            level += SHOW_UBITS(bs, &row->gb, index_bits) << 7;
+            SKIP_BITS(bs, &row->gb, index_bits);
         }
 
-        if (ctx->cid_table->ac_run_flag[index1]) {
-            UPDATE_CACHE(bs, &ctx->gb);
-            GET_VLC(index2, bs, &ctx->gb, ctx->run_vlc.table,
+        if (flags & 2) { /* bit 1: a run-length VLC follows */
+            UPDATE_CACHE(bs, &row->gb);
+            GET_VLC(index2, bs, &row->gb, ctx->run_vlc.table,
                     DNXHD_VLC_BITS, 2);
             i += ctx->cid_table->run[index2];
         }
 
-        if (i > 63) {
+        if (++i > 63) { /* position beyond index 63: treat the data as corrupt */
             av_log(ctx->avctx, AV_LOG_ERROR, "ac tex damaged %d, %d\n", n, i);
+            ret = -1; /* reported to the caller after the reader is closed */
             break;
         }
 
         j     = ctx->scantable.permutated[i];
-        level = (2 * level + 1) * qscale * weight_matrix[i];
+        level *= scale[i]; /* scale[i] = qscale * weight, precomputed in dnxhd_decode_macroblock */
+        level += scale[i] >> 1;
         if (level_bias < 32 || weight_matrix[i] != level_bias)
-            level += level_bias;
+            level += level_bias; // 1<<(level_shift-1)
         level >>= level_shift;
 
         block[j] = (level ^ sign) - sign;
+
+        UPDATE_CACHE(bs, &row->gb); /* fetch the next AC code for the loop test */
+        GET_VLC(index1, bs, &row->gb, ctx->ac_vlc.table,
+                DNXHD_VLC_BITS, 2);
     }
 
-    CLOSE_READER(bs, &ctx->gb);
+    CLOSE_READER(bs, &row->gb);
+    return ret;
+}
+
+static int dnxhd_decode_dct_block_8(const DNXHDContext *ctx,
+                                    RowContext *row, int n)
+{
+    return dnxhd_decode_dct_block(ctx, row, n, 4, 32, 6, 0); /* index_bits=4, level_bias=32, level_shift=6, dc_shift=0 */
+}
+
+static int dnxhd_decode_dct_block_10(const DNXHDContext *ctx,
+                                     RowContext *row, int n)
+{
+    return dnxhd_decode_dct_block(ctx, row, n, 6, 8, 4, 0); /* index_bits=6, level_bias=8, level_shift=4, dc_shift=0 */
 }
 
-static void dnxhd_decode_dct_block_8(DNXHDContext *ctx, int16_t *block,
-                                     int n, int qscale)
+static int dnxhd_decode_dct_block_10_444(const DNXHDContext *ctx,
+                                         RowContext *row, int n)
 {
-    dnxhd_decode_dct_block(ctx, block, n, qscale, 4, 32, 6);
+    return dnxhd_decode_dct_block(ctx, row, n, 6, 32, 6, 0); /* index_bits=6, level_bias=32, level_shift=6, dc_shift=0 */
 }
 
-static void dnxhd_decode_dct_block_10(DNXHDContext *ctx, int16_t *block,
-                                      int n, int qscale)
+static int dnxhd_decode_dct_block_12(const DNXHDContext *ctx,
+                                     RowContext *row, int n)
 {
-    dnxhd_decode_dct_block(ctx, block, n, qscale, 6, 8, 4);
+    return dnxhd_decode_dct_block(ctx, row, n, 6, 8, 4, 2); /* index_bits=6, level_bias=8, level_shift=4, dc_shift=2 */
 }
 
-static void dnxhd_decode_dct_block_10_444(DNXHDContext *ctx, int16_t *block,
-                                          int n, int qscale)
+static int dnxhd_decode_dct_block_12_444(const DNXHDContext *ctx,
+                                         RowContext *row, int n)
 {
-    dnxhd_decode_dct_block(ctx, block, n, qscale, 6, 32, 6);
+    return dnxhd_decode_dct_block(ctx, row, n, 6, 32, 4, 2); /* index_bits=6, level_bias=32, level_shift=4, dc_shift=2 */
 }
 
-static int dnxhd_decode_macroblock(DNXHDContext *ctx, AVFrame *frame,
-                                   int x, int y)
+static int dnxhd_decode_macroblock(const DNXHDContext *ctx, RowContext *row,
+                                   AVFrame *frame, int x, int y)
 {
-    int shift1 = ctx->bit_depth == 10;
+    int shift1 = ctx->bit_depth >= 10;
     int dct_linesize_luma   = frame->linesize[0];
     int dct_linesize_chroma = frame->linesize[1];
     uint8_t *dest_y, *dest_u, *dest_v;
     int dct_y_offset, dct_x_offset;
-    int qscale, i;
+    int qscale, i, act;
     int interlaced_mb = 0;
 
     if (ctx->mbaff) {
-        interlaced_mb = get_bits1(&ctx->gb);
-        qscale = get_bits(&ctx->gb, 10);
+        interlaced_mb = get_bits1(&row->gb);
+        qscale = get_bits(&row->gb, 10);
     } else {
-        qscale = get_bits(&ctx->gb, 11);
+        qscale = get_bits(&row->gb, 11);
     }
-    skip_bits1(&ctx->gb);
-
-    for (i = 0; i < 8; i++) {
-        ctx->bdsp.clear_block(ctx->blocks[i]);
-        ctx->decode_dct_block(ctx, ctx->blocks[i], i, qscale);
+    act = get_bits1(&row->gb);
+    if (act) {
+        if (!ctx->act) {
+            static int act_warned;
+            if (!act_warned) {
+                act_warned = 1;
+                av_log(ctx->avctx, AV_LOG_ERROR,
+                       "ACT flag set, in violation of frame header.\n");
+            }
+        } else if (row->format == -1) {
+            row->format = act;
+        } else if (row->format != act) {
+            row->format = 2; // Variable
+        }
     }
-    if (ctx->is_444) {
-        for (; i < 12; i++) {
-            ctx->bdsp.clear_block(ctx->blocks[i]);
-            ctx->decode_dct_block(ctx, ctx->blocks[i], i, qscale);
+
+    if (qscale != row->last_qscale) {
+        for (i = 0; i < 64; i++) {
+            row->luma_scale[i]   = qscale * ctx->cid_table->luma_weight[i];
+            row->chroma_scale[i] = qscale * ctx->cid_table->chroma_weight[i];
         }
+        row->last_qscale = qscale;
+    }
+
+    for (i = 0; i < 8 + 4 * ctx->is_444; i++) {
+        if (ctx->decode_dct_block(ctx, row, i) < 0)
+            return AVERROR_INVALIDDATA;
     }
 
     if (frame->interlaced_frame) {
@@ -357,7 +495,7 @@ static int dnxhd_decode_macroblock(DNXHDContext *ctx, AVFrame *frame,
     dest_u = frame->data[1] + ((y * dct_linesize_chroma) << 4) + (x << (3 + shift1 + ctx->is_444));
     dest_v = frame->data[2] + ((y * dct_linesize_chroma) << 4) + (x << (3 + shift1 + ctx->is_444));
 
-    if (ctx->cur_field) {
+    if (frame->interlaced_frame && ctx->cur_field) {
         dest_y += frame->linesize[0];
         dest_u += frame->linesize[1];
         dest_v += frame->linesize[2];
@@ -370,55 +508,62 @@ static int dnxhd_decode_macroblock(DNXHDContext *ctx, AVFrame *frame,
     dct_y_offset = interlaced_mb ? frame->linesize[0] : (dct_linesize_luma << 3);
     dct_x_offset = 8 << shift1;
     if (!ctx->is_444) {
-        ctx->idsp.idct_put(dest_y,                               dct_linesize_luma, ctx->blocks[0]);
-        ctx->idsp.idct_put(dest_y + dct_x_offset,                dct_linesize_luma, ctx->blocks[1]);
-        ctx->idsp.idct_put(dest_y + dct_y_offset,                dct_linesize_luma, ctx->blocks[4]);
-        ctx->idsp.idct_put(dest_y + dct_y_offset + dct_x_offset, dct_linesize_luma, ctx->blocks[5]);
+        ctx->idsp.idct_put(dest_y,                               dct_linesize_luma, row->blocks[0]);
+        ctx->idsp.idct_put(dest_y + dct_x_offset,                dct_linesize_luma, row->blocks[1]);
+        ctx->idsp.idct_put(dest_y + dct_y_offset,                dct_linesize_luma, row->blocks[4]);
+        ctx->idsp.idct_put(dest_y + dct_y_offset + dct_x_offset, dct_linesize_luma, row->blocks[5]);
 
         if (!(ctx->avctx->flags & AV_CODEC_FLAG_GRAY)) {
             dct_y_offset = interlaced_mb ? frame->linesize[1] : (dct_linesize_chroma << 3);
-            ctx->idsp.idct_put(dest_u,                dct_linesize_chroma, ctx->blocks[2]);
-            ctx->idsp.idct_put(dest_v,                dct_linesize_chroma, ctx->blocks[3]);
-            ctx->idsp.idct_put(dest_u + dct_y_offset, dct_linesize_chroma, ctx->blocks[6]);
-            ctx->idsp.idct_put(dest_v + dct_y_offset, dct_linesize_chroma, ctx->blocks[7]);
+            ctx->idsp.idct_put(dest_u,                dct_linesize_chroma, row->blocks[2]);
+            ctx->idsp.idct_put(dest_v,                dct_linesize_chroma, row->blocks[3]);
+            ctx->idsp.idct_put(dest_u + dct_y_offset, dct_linesize_chroma, row->blocks[6]);
+            ctx->idsp.idct_put(dest_v + dct_y_offset, dct_linesize_chroma, row->blocks[7]);
         }
     } else {
-        ctx->idsp.idct_put(dest_y,                               dct_linesize_luma, ctx->blocks[0]);
-        ctx->idsp.idct_put(dest_y + dct_x_offset,                dct_linesize_luma, ctx->blocks[1]);
-        ctx->idsp.idct_put(dest_y + dct_y_offset,                dct_linesize_luma, ctx->blocks[6]);
-        ctx->idsp.idct_put(dest_y + dct_y_offset + dct_x_offset, dct_linesize_luma, ctx->blocks[7]);
+        ctx->idsp.idct_put(dest_y,                               dct_linesize_luma, row->blocks[0]);
+        ctx->idsp.idct_put(dest_y + dct_x_offset,                dct_linesize_luma, row->blocks[1]);
+        ctx->idsp.idct_put(dest_y + dct_y_offset,                dct_linesize_luma, row->blocks[6]);
+        ctx->idsp.idct_put(dest_y + dct_y_offset + dct_x_offset, dct_linesize_luma, row->blocks[7]);
 
         if (!(ctx->avctx->flags & AV_CODEC_FLAG_GRAY)) {
             dct_y_offset = interlaced_mb ? frame->linesize[1] : (dct_linesize_chroma << 3);
-            ctx->idsp.idct_put(dest_u,                               dct_linesize_chroma, ctx->blocks[2]);
-            ctx->idsp.idct_put(dest_u + dct_x_offset,                dct_linesize_chroma, ctx->blocks[3]);
-            ctx->idsp.idct_put(dest_u + dct_y_offset,                dct_linesize_chroma, ctx->blocks[8]);
-            ctx->idsp.idct_put(dest_u + dct_y_offset + dct_x_offset, dct_linesize_chroma, ctx->blocks[9]);
-            ctx->idsp.idct_put(dest_v,                               dct_linesize_chroma, ctx->blocks[4]);
-            ctx->idsp.idct_put(dest_v + dct_x_offset,                dct_linesize_chroma, ctx->blocks[5]);
-            ctx->idsp.idct_put(dest_v + dct_y_offset,                dct_linesize_chroma, ctx->blocks[10]);
-            ctx->idsp.idct_put(dest_v + dct_y_offset + dct_x_offset, dct_linesize_chroma, ctx->blocks[11]);
+            ctx->idsp.idct_put(dest_u,                               dct_linesize_chroma, row->blocks[2]);
+            ctx->idsp.idct_put(dest_u + dct_x_offset,                dct_linesize_chroma, row->blocks[3]);
+            ctx->idsp.idct_put(dest_u + dct_y_offset,                dct_linesize_chroma, row->blocks[8]);
+            ctx->idsp.idct_put(dest_u + dct_y_offset + dct_x_offset, dct_linesize_chroma, row->blocks[9]);
+            ctx->idsp.idct_put(dest_v,                               dct_linesize_chroma, row->blocks[4]);
+            ctx->idsp.idct_put(dest_v + dct_x_offset,                dct_linesize_chroma, row->blocks[5]);
+            ctx->idsp.idct_put(dest_v + dct_y_offset,                dct_linesize_chroma, row->blocks[10]);
+            ctx->idsp.idct_put(dest_v + dct_y_offset + dct_x_offset, dct_linesize_chroma, row->blocks[11]);
         }
     }
 
     return 0;
 }
 
-static int dnxhd_decode_macroblocks(DNXHDContext *ctx, AVFrame *frame,
-                                    const uint8_t *buf, int buf_size)
+static int dnxhd_decode_row(AVCodecContext *avctx, void *data, /* execute2 job: decodes macroblock row rownb into the frame passed as data */
+                            int rownb, int threadnb)
 {
-    int x, y;
-    for (y = 0; y < ctx->mb_height; y++) {
-        ctx->last_dc[0] =
-        ctx->last_dc[1] =
-        ctx->last_dc[2] = 1 << (ctx->bit_depth + 2); // for levels +2^(bitdepth-1)
-        init_get_bits(&ctx->gb, buf + ctx->mb_scan_index[y], (buf_size - ctx->mb_scan_index[y]) << 3);
-        for (x = 0; x < ctx->mb_width; x++) {
-            //START_TIMER;
-            dnxhd_decode_macroblock(ctx, frame, x, y);
-            //STOP_TIMER("decode macroblock");
+    const DNXHDContext *ctx = avctx->priv_data;
+    uint32_t offset = ctx->mb_scan_index[rownb]; /* start of this row within ctx->buf, read from the header scan index table */
+    RowContext *row = ctx->rows + threadnb; /* scratch state selected by the executing thread number */
+    int x;
+
+    row->last_dc[0] =
+    row->last_dc[1] =
+    row->last_dc[2] = 1 << (ctx->bit_depth + 2); // for levels +2^(bitdepth-1)
+    init_get_bits(&row->gb, ctx->buf + offset, (ctx->buf_size - offset) << 3); /* reader runs from the row start to the end of ctx->buf */
+    for (x = 0; x < ctx->mb_width; x++) {
+        //START_TIMER;
+        int ret = dnxhd_decode_macroblock(ctx, row, data, x, rownb);
+        if (ret < 0) {
+            row->errors++; /* summed over all threads in dnxhd_decode_frame */
+            return ret; /* the remaining macroblocks of this row are not decoded */
         }
+        //STOP_TIMER("decode macroblock");
     }
+
     return 0;
 }
 
@@ -428,16 +573,18 @@ static int dnxhd_decode_frame(AVCodecContext *avctx, void *data,
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     DNXHDContext *ctx = avctx->priv_data;
-    ThreadFrame tf;
+    ThreadFrame frame = { .f = data };
+    AVFrame *picture = data;
     int first_field = 1;
-    int ret;
-
-    tf.f = data;
+    int ret, i;
 
     ff_dlog(avctx, "frame size %d\n", buf_size);
 
+    for (i = 0; i < avctx->thread_count; i++)
+        ctx->rows[i].format = -1;
+
 decode_coding_unit:
-    if ((ret = dnxhd_decode_header(ctx, tf.f, buf, buf_size, first_field)) < 0)
+    if ((ret = dnxhd_decode_header(ctx, picture, buf, buf_size, first_field)) < 0)
         return ret;
 
     if ((avctx->width || avctx->height) &&
@@ -446,31 +593,78 @@ decode_coding_unit:
                avctx->width, avctx->height, ctx->width, ctx->height);
         first_field = 1;
     }
+    if (avctx->pix_fmt != AV_PIX_FMT_NONE && avctx->pix_fmt != ctx->pix_fmt) {
+        av_log(avctx, AV_LOG_WARNING, "pix_fmt changed: %s -> %s\n",
+               av_get_pix_fmt_name(avctx->pix_fmt), av_get_pix_fmt_name(ctx->pix_fmt));
+        first_field = 1;
+    }
 
+    avctx->pix_fmt = ctx->pix_fmt;
     ret = ff_set_dimensions(avctx, ctx->width, ctx->height);
     if (ret < 0)
         return ret;
 
     if (first_field) {
-        if ((ret = ff_thread_get_buffer(avctx, &tf, 0)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
             return ret;
-        }
-        tf.f->pict_type = AV_PICTURE_TYPE_I;
-        tf.f->key_frame = 1;
+        picture->pict_type = AV_PICTURE_TYPE_I;
+        picture->key_frame = 1;
     }
 
-    dnxhd_decode_macroblocks(ctx, tf.f, buf + 0x280, buf_size - 0x280);
+    ctx->buf_size = buf_size - ctx->data_offset;
+    ctx->buf = buf + ctx->data_offset;
+    avctx->execute2(avctx, dnxhd_decode_row, picture, NULL, ctx->mb_height);
 
-    if (first_field && tf.f->interlaced_frame) {
+    if (first_field && picture->interlaced_frame) {
         buf      += ctx->cid_table->coding_unit_size;
         buf_size -= ctx->cid_table->coding_unit_size;
         first_field = 0;
         goto decode_coding_unit;
     }
 
+    ret = 0;
+    for (i = 0; i < avctx->thread_count; i++) {
+        ret += ctx->rows[i].errors;
+        ctx->rows[i].errors = 0;
+    }
+
+    if (ctx->act) {
+        static int act_warned;
+        int format = ctx->rows[0].format;
+        for (i = 1; i < avctx->thread_count; i++) {
+            if (ctx->rows[i].format != format &&
+                ctx->rows[i].format != -1 /* not run */) {
+                format = 2;
+                break;
+            }
+        }
+        switch (format) {
+        case -1:
+        case 2:
+            if (!act_warned) {
+                act_warned = 1;
+                av_log(ctx->avctx, AV_LOG_ERROR,
+                       "Unsupported: variable ACT flag.\n");
+            }
+            break;
+        case 0:
+            ctx->pix_fmt = ctx->bit_depth==10
+                         ? AV_PIX_FMT_GBRP10 : AV_PIX_FMT_GBRP12;
+            break;
+        case 1:
+            ctx->pix_fmt = ctx->bit_depth==10
+                         ? AV_PIX_FMT_YUV444P10 : AV_PIX_FMT_YUV444P12;
+            break;
+        }
+    }
+    avctx->pix_fmt = ctx->pix_fmt;
+    if (ret) {
+        av_log(ctx->avctx, AV_LOG_ERROR, "%d lines with errors\n", ret);
+        return AVERROR_INVALIDDATA;
+    }
+
     *got_frame = 1;
-    return buf_size;
+    return avpkt->size;
 }
 
 static av_cold int dnxhd_decode_close(AVCodecContext *avctx)
@@ -480,6 +674,9 @@ static av_cold int dnxhd_decode_close(AVCodecContext *avctx)
     ff_free_vlc(&ctx->ac_vlc);
     ff_free_vlc(&ctx->dc_vlc);
     ff_free_vlc(&ctx->run_vlc);
+
+    av_freep(&ctx->rows);
+
     return 0;
 }
 
@@ -492,5 +689,7 @@ AVCodec ff_dnxhd_decoder = {
     .init           = dnxhd_decode_init,
     .close          = dnxhd_decode_close,
     .decode         = dnxhd_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS |
+                      AV_CODEC_CAP_SLICE_THREADS,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(dnxhd_decode_init_thread_copy),
 };
diff --git a/libavcodec/dnxhdenc.c b/libavcodec/dnxhdenc.c
index f3182c9..aee4323 100644
--- a/libavcodec/dnxhdenc.c
+++ b/libavcodec/dnxhdenc.c
@@ -6,20 +6,20 @@
  * VC-3 encoder funded by the British Broadcasting Corporation
  * 10 bit support added by MirriAd Ltd, Joseph Artsimovich <joseph@mirriad.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -44,21 +44,21 @@
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
     { "nitris_compat", "encode with Avid Nitris compatibility",
-        offsetof(DNXHDEncContext, nitris_compat), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
+        offsetof(DNXHDEncContext, nitris_compat), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
     { "ibias", "intra quant bias",
         offsetof(DNXHDEncContext, intra_quant_bias), AV_OPT_TYPE_INT,
-        { .i64 = FF_DEFAULT_QUANT_BIAS }, INT_MIN, INT_MAX, VE },
+        { .i64 = 0 }, INT_MIN, INT_MAX, VE },
     { NULL }
 };
 
-static const AVClass class = {
-    "dnxhd",
-    av_default_item_name,
-    options,
-    LIBAVUTIL_VERSION_INT
+static const AVClass dnxhd_class = {
+    .class_name = "dnxhd",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
 };
 
-static void dnxhd_8bit_get_pixels_8x4_sym(int16_t *restrict block,
+static void dnxhd_8bit_get_pixels_8x4_sym(int16_t *av_restrict block,
                                           const uint8_t *pixels,
                                           ptrdiff_t line_size)
 {
@@ -82,25 +82,25 @@ static void dnxhd_8bit_get_pixels_8x4_sym(int16_t *restrict block,
 }
 
 static av_always_inline
-void dnxhd_10bit_get_pixels_8x4_sym(int16_t *restrict block,
+void dnxhd_10bit_get_pixels_8x4_sym(int16_t *av_restrict block, /* fills an 8x8 block from 4 source rows, mirrored top to bottom */
                                     const uint8_t *pixels,
                                     ptrdiff_t line_size)
 {
-    int i;
-
-    block += 32;
-
-    for (i = 0; i < 4; i++) {
-        memcpy(block + i * 8, pixels + i * line_size, 8 * sizeof(*block));
-        memcpy(block - (i + 1) * 8, pixels + i * line_size, 8 * sizeof(*block));
-    }
+    memcpy(block + 0 * 8, pixels + 0 * line_size, 8 * sizeof(*block)); /* source row 0 -> block row 0 */
+    memcpy(block + 7 * 8, pixels + 0 * line_size, 8 * sizeof(*block)); /* source row 0 -> block row 7 (mirror) */
+    memcpy(block + 1 * 8, pixels + 1 * line_size, 8 * sizeof(*block)); /* source row 1 -> block row 1 */
+    memcpy(block + 6 * 8, pixels + 1 * line_size, 8 * sizeof(*block)); /* source row 1 -> block row 6 (mirror) */
+    memcpy(block + 2 * 8, pixels + 2 * line_size, 8 * sizeof(*block)); /* source row 2 -> block row 2 */
+    memcpy(block + 5 * 8, pixels + 2 * line_size, 8 * sizeof(*block)); /* source row 2 -> block row 5 (mirror) */
+    memcpy(block + 3 * 8, pixels + 3 * line_size, 8 * sizeof(*block)); /* source row 3 -> block row 3 */
+    memcpy(block + 4 * 8, pixels + 3 * line_size, 8 * sizeof(*block)); /* source row 3 -> block row 4 (mirror) */
 }
 
 static int dnxhd_10bit_dct_quantize(MpegEncContext *ctx, int16_t *block,
                                     int n, int qscale, int *overflow)
 {
     const uint8_t *scantable= ctx->intra_scantable.scantable;
-    const int *qmat = ctx->q_intra_matrix[qscale];
+    const int *qmat = n<4 ? ctx->q_intra_matrix[qscale] : ctx->q_chroma_intra_matrix[qscale];
     int last_non_zero = 0;
     int i;
 
@@ -119,6 +119,11 @@ static int dnxhd_10bit_dct_quantize(MpegEncContext *ctx, int16_t *block,
             last_non_zero = i;
     }
 
+    /* we need this permutation so that we correct the IDCT, we only permute the !=0 elements */
+    if (ctx->idsp.perm_type != FF_IDCT_PERM_NONE)
+        ff_block_permute(block, ctx->idsp.idct_permutation,
+                         scantable, last_non_zero);
+
     return last_non_zero;
 }
 
@@ -127,10 +132,10 @@ static av_cold int dnxhd_init_vlc(DNXHDEncContext *ctx)
     int i, j, level, run;
     int max_level = 1 << (ctx->cid_table->bit_depth + 2);
 
-    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->vlc_codes,
-                      max_level * 4 * sizeof(*ctx->vlc_codes), fail);
-    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->vlc_bits,
-                      max_level * 4 * sizeof(*ctx->vlc_bits), fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->vlc_codes,
+                      max_level, 4 * sizeof(*ctx->vlc_codes), fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->vlc_bits,
+                      max_level, 4 * sizeof(*ctx->vlc_bits), fail);
     FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->run_codes,
                       63 * 2, fail);
     FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->run_bits,
@@ -149,10 +154,10 @@ static av_cold int dnxhd_init_vlc(DNXHDEncContext *ctx)
                 alevel -= offset << 6;
             }
             for (j = 0; j < 257; j++) {
-                if (ctx->cid_table->ac_level[j] == alevel &&
-                    (!offset || (ctx->cid_table->ac_index_flag[j] && offset)) &&
-                    (!run    || (ctx->cid_table->ac_run_flag  [j] && run))) {
-                    assert(!ctx->vlc_codes[index]);
+                if (ctx->cid_table->ac_info[2*j+0] >> 1 == alevel &&
+                    (!offset || (ctx->cid_table->ac_info[2*j+1] & 1) && offset) &&
+                    (!run    || (ctx->cid_table->ac_info[2*j+1] & 2) && run)) {
+                    av_assert1(!ctx->vlc_codes[index]);
                     if (alevel) {
                         ctx->vlc_codes[index] =
                             (ctx->cid_table->ac_codes[j] << 1) | (sign & 1);
@@ -164,7 +169,7 @@ static av_cold int dnxhd_init_vlc(DNXHDEncContext *ctx)
                     break;
                 }
             }
-            assert(!alevel || j < 257);
+            av_assert0(!alevel || j < 257);
             if (offset) {
                 ctx->vlc_codes[index] =
                     (ctx->vlc_codes[index] << ctx->cid_table->index_bits) | offset;
@@ -174,7 +179,7 @@ static av_cold int dnxhd_init_vlc(DNXHDEncContext *ctx)
     }
     for (i = 0; i < 62; i++) {
         int run = ctx->cid_table->run[i];
-        assert(run < 63);
+        av_assert0(run < 63);
         ctx->run_codes[run] = ctx->cid_table->run_codes[i];
         ctx->run_bits[run]  = ctx->cid_table->run_bits[i];
     }
@@ -191,15 +196,15 @@ static av_cold int dnxhd_init_qmat(DNXHDEncContext *ctx, int lbias, int cbias)
     const uint8_t *luma_weight_table   = ctx->cid_table->luma_weight;
     const uint8_t *chroma_weight_table = ctx->cid_table->chroma_weight;
 
-    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_l,
-                      (ctx->m.avctx->qmax + 1) * 64 * sizeof(int), fail);
-    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_c,
-                      (ctx->m.avctx->qmax + 1) * 64 * sizeof(int), fail);
-    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_l16,
-                      (ctx->m.avctx->qmax + 1) * 64 * 2 * sizeof(uint16_t),
+    FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->qmatrix_l,
+                      (ctx->m.avctx->qmax + 1), 64 * sizeof(int), fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->qmatrix_c,
+                      (ctx->m.avctx->qmax + 1), 64 * sizeof(int), fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->qmatrix_l16,
+                      (ctx->m.avctx->qmax + 1), 64 * 2 * sizeof(uint16_t),
                       fail);
-    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_c16,
-                      (ctx->m.avctx->qmax + 1) * 64 * 2 * sizeof(uint16_t),
+    FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->qmatrix_c16,
+                      (ctx->m.avctx->qmax + 1), 64 * 2 * sizeof(uint16_t),
                       fail);
 
     if (ctx->cid_table->bit_depth == 8) {
@@ -232,7 +237,7 @@ static av_cold int dnxhd_init_qmat(DNXHDEncContext *ctx, int lbias, int cbias)
         // 10-bit
         for (qscale = 1; qscale <= ctx->m.avctx->qmax; qscale++) {
             for (i = 1; i < 64; i++) {
-                int j = ctx->m.idsp.idct_permutation[ff_zigzag_direct[i]];
+                int j = ff_zigzag_direct[i];
 
                 /* The quantization formula from the VC-3 standard is:
                  * quantized = sign(block[i]) * floor(abs(block[i]/s) * p /
@@ -253,6 +258,11 @@ static av_cold int dnxhd_init_qmat(DNXHDEncContext *ctx, int lbias, int cbias)
         }
     }
 
+    ctx->m.q_chroma_intra_matrix16 = ctx->qmatrix_c16;
+    ctx->m.q_chroma_intra_matrix   = ctx->qmatrix_c;
+    ctx->m.q_intra_matrix16        = ctx->qmatrix_l16;
+    ctx->m.q_intra_matrix          = ctx->qmatrix_l;
+
     return 0;
 fail:
     return AVERROR(ENOMEM);
@@ -260,11 +270,10 @@ fail:
 
 static av_cold int dnxhd_init_rc(DNXHDEncContext *ctx)
 {
-    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_rc,
-                      8160 * ctx->m.avctx->qmax * sizeof(RCEntry), fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->mb_rc, (ctx->m.avctx->qmax + 1), 8160 * sizeof(RCEntry), fail);
     if (ctx->m.avctx->mb_decision != FF_MB_DECISION_RD)
-        FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_cmp,
-                          ctx->m.mb_num * sizeof(RCCMPEntry), fail);
+        FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->mb_cmp,
+                          ctx->m.mb_num, sizeof(RCCMPEntry), fail);
 
     ctx->frame_bits = (ctx->cid_table->coding_unit_size -
                        640 - 4 - ctx->min_padding) * 8;
@@ -289,22 +298,22 @@ static av_cold int dnxhd_encode_init(AVCodecContext *avctx)
         break;
     default:
         av_log(avctx, AV_LOG_ERROR,
-               "Pixel format is incompatible with DNxHD, use yuv422p or yuv422p10.\n");
+               "pixel format is incompatible with DNxHD\n");
         return AVERROR(EINVAL);
     }
 
     ctx->cid = ff_dnxhd_find_cid(avctx, bit_depth);
     if (!ctx->cid) {
         av_log(avctx, AV_LOG_ERROR,
-               "Video parameters incompatible with DNxHD, available CIDs:\n");
-        ff_dnxhd_list_cid(avctx);
+               "video parameters incompatible with DNxHD. Valid DNxHD profiles:\n");
+        ff_dnxhd_print_profiles(avctx, AV_LOG_ERROR);
         return AVERROR(EINVAL);
     }
     av_log(avctx, AV_LOG_DEBUG, "cid %d\n", ctx->cid);
 
     index = ff_dnxhd_get_cid_table(ctx->cid);
-    if (index < 0)
-        return index;
+    av_assert0(index >= 0);
+
     ctx->cid_table = &ff_dnxhd_cid_table[index];
 
     ctx->m.avctx    = avctx;
@@ -318,6 +327,8 @@ static av_cold int dnxhd_encode_init(AVCodecContext *avctx)
     ff_mpv_idct_init(&ctx->m);
     ff_mpegvideoencdsp_init(&ctx->m.mpvencdsp, avctx);
     ff_pixblockdsp_init(&ctx->m.pdsp, avctx);
+    ff_dct_encode_init(&ctx->m);
+
     if (!ctx->m.dct_quantize)
         ctx->m.dct_quantize = ff_dct_quantize_c;
 
@@ -384,6 +395,11 @@ FF_ENABLE_DEPRECATION_WARNINGS
         return AVERROR(EINVAL);
     }
 
+    if (avctx->qmax <= 1) {
+        av_log(avctx, AV_LOG_ERROR, "qmax must be at least 2\n");
+        return AVERROR(EINVAL);
+    }
+
     ctx->thread[0] = ctx;
     for (i = 1; i < avctx->thread_count; i++) {
         ctx->thread[i] = av_malloc(sizeof(DNXHDEncContext));
@@ -398,7 +414,7 @@ fail:  // for FF_ALLOCZ_OR_GOTO
 static int dnxhd_write_header(AVCodecContext *avctx, uint8_t *buf)
 {
     DNXHDEncContext *ctx = avctx->priv_data;
-    const uint8_t header_prefix[5] = { 0x00, 0x00, 0x02, 0x80, 0x01 };
+    static const uint8_t header_prefix[5] = { 0x00, 0x00, 0x02, 0x80, 0x01 };
 
     memset(buf, 0, 640);
 
@@ -437,7 +453,7 @@ static av_always_inline void dnxhd_encode_dc(DNXHDEncContext *ctx, int diff)
     }
     put_bits(&ctx->m.pb, ctx->cid_table->dc_bits[nbits] + nbits,
              (ctx->cid_table->dc_codes[nbits] << nbits) +
-             (diff & ((1 << nbits) - 1)));
+             av_mod_uintp2(diff, nbits));
 }
 
 static av_always_inline
@@ -591,15 +607,8 @@ void dnxhd_get_blocks(DNXHDEncContext *ctx, int mb_x, int mb_y)
 static av_always_inline
 int dnxhd_switch_matrix(DNXHDEncContext *ctx, int i)
 {
-    if (i & 2) {
-        ctx->m.q_intra_matrix16 = ctx->qmatrix_c16;
-        ctx->m.q_intra_matrix   = ctx->qmatrix_c;
-        return 1 + (i & 1);
-    } else {
-        ctx->m.q_intra_matrix16 = ctx->qmatrix_l16;
-        ctx->m.q_intra_matrix   = ctx->qmatrix_l;
-        return 0;
-    }
+    const static uint8_t component[8]={0,0,1,2,0,0,1,2};
+    return component[i];
 }
 
 static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg,
@@ -630,7 +639,7 @@ static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg,
             int n = dnxhd_switch_matrix(ctx, i);
 
             memcpy(block, src_block, 64 * sizeof(*block));
-            last_index = ctx->m.dct_quantize(&ctx->m, block, i,
+            last_index = ctx->m.dct_quantize(&ctx->m, block, 4 & (2*i),
                                              qscale, &overflow);
             ac_bits   += dnxhd_calc_ac_bits(ctx, block, last_index);
 
@@ -640,7 +649,7 @@ static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg,
             else
                 nbits = av_log2_16bit(2 * diff);
 
-            assert(nbits < ctx->cid_table->bit_depth + 4);
+            av_assert1(nbits < ctx->cid_table->bit_depth + 4);
             dc_bits += ctx->cid_table->dc_bits[nbits] + nbits;
 
             ctx->m.last_dc[n] = block[0];
@@ -682,7 +691,7 @@ static int dnxhd_encode_thread(AVCodecContext *avctx, void *arg,
         for (i = 0; i < 8; i++) {
             int16_t *block = ctx->blocks[i];
             int overflow, n = dnxhd_switch_matrix(ctx, i);
-            int last_index = ctx->m.dct_quantize(&ctx->m, block, i,
+            int last_index = ctx->m.dct_quantize(&ctx->m, block, 4 & (2*i),
                                                  qscale, &overflow);
             // START_TIMER;
             dnxhd_encode_block(ctx, block, last_index, n);
@@ -758,11 +767,13 @@ static int dnxhd_mb_var_thread(AVCodecContext *avctx, void *arg,
             unsigned mb  = mb_y * ctx->m.mb_width + mb_x;
             int sum = 0;
             int sqsum = 0;
+            int bw = FFMIN(avctx->width - 16 * mb_x, 16);
+            int bh = FFMIN((avctx->height >> ctx->interlaced) - 16 * mb_y, 16);
             int mean, sqmean;
             int i, j;
             // Macroblocks are 16x16 pixels, unlike DCT blocks which are 8x8.
-            for (i = 0; i < 16; ++i) {
-                for (j = 0; j < 16; ++j) {
+            for (i = 0; i < bh; ++i) {
+                for (j = 0; j < bw; ++j) {
                     // Turn 16-bit pixels into 10-bit ones.
                     const int sample = (unsigned) pix[j] >> 6;
                     sum   += sample;
@@ -941,13 +952,13 @@ static void radix_count(const RCCMPEntry *data, int size,
             buckets[j][get_bucket(v, 0)]++;
             v >>= BUCKET_BITS;
         }
-        assert(!v);
+        av_assert1(!v);
     }
     for (j = 0; j < RADIX_PASSES; j++) {
         int offset = size;
         for (i = NBUCKETS - 1; i >= 0; i--)
             buckets[j][i] = offset -= buckets[j][i];
-        assert(!buckets[j][0]);
+        av_assert1(!buckets[j][0]);
     }
 }
 
@@ -966,7 +977,7 @@ static void radix_sort_pass(RCCMPEntry *dst, const RCCMPEntry *data,
 static void radix_sort(RCCMPEntry *data, int size)
 {
     int buckets[RADIX_PASSES][NBUCKETS];
-    RCCMPEntry *tmp = av_malloc(sizeof(*tmp) * size);
+    RCCMPEntry *tmp = av_malloc_array(size, sizeof(*tmp));
     radix_count(data, size, buckets);
     radix_sort_pass(tmp, data, size, buckets[0], 0);
     radix_sort_pass(data, tmp, size, buckets[1], 1);
@@ -1044,13 +1055,10 @@ static int dnxhd_encode_picture(AVCodecContext *avctx, AVPacket *pkt,
     DNXHDEncContext *ctx = avctx->priv_data;
     int first_field = 1;
     int offset, i, ret;
-    uint8_t *buf, *sd;
+    uint8_t *buf;
 
-    if ((ret = ff_alloc_packet(pkt, ctx->cid_table->frame_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR,
-               "output buffer is too small to compress picture\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, ctx->cid_table->frame_size, 0)) < 0)
         return ret;
-    }
     buf = pkt->data;
 
     dnxhd_load_picture(ctx, frame);
@@ -1080,12 +1088,12 @@ encode_coding_unit:
     for (i = 0; i < ctx->m.mb_height; i++) {
         AV_WB32(ctx->msip + i * 4, offset);
         offset += ctx->slice_size[i];
-        assert(!(ctx->slice_size[i] & 3));
+        av_assert1(!(ctx->slice_size[i] & 3));
     }
 
     avctx->execute2(avctx, dnxhd_encode_thread, buf, NULL, ctx->m.mb_height);
 
-    assert(640 + offset + 4 <= ctx->cid_table->coding_unit_size);
+    av_assert1(640 + offset + 4 <= ctx->cid_table->coding_unit_size);
     memset(buf + 640 + offset, 0,
            ctx->cid_table->coding_unit_size - 4 - offset - 640);
 
@@ -1104,10 +1112,7 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
-    sd = av_packet_new_side_data(pkt, AV_PKT_DATA_QUALITY_FACTOR, sizeof(int));
-    if (!sd)
-        return AVERROR(ENOMEM);
-    *(int *)sd = ctx->qscale * FF_QP2LAMBDA;
+    ff_side_data_set_encoder_stats(pkt, ctx->qscale * FF_QP2LAMBDA, NULL, 0, AV_PICTURE_TYPE_I);
 
     pkt->flags |= AV_PKT_FLAG_KEY;
     *got_packet = 1;
@@ -1143,6 +1148,11 @@ static av_cold int dnxhd_encode_end(AVCodecContext *avctx)
     return 0;
 }
 
+static const AVCodecDefault dnxhd_defaults[] = {
+    { "qmax", "1024" }, /* Maximum quantization scale factor allowed for VC-3 */
+    { NULL },
+};
+
 AVCodec ff_dnxhd_encoder = {
     .name           = "dnxhd",
     .long_name      = NULL_IF_CONFIG_SMALL("VC3/DNxHD"),
@@ -1158,5 +1168,6 @@ AVCodec ff_dnxhd_encoder = {
         AV_PIX_FMT_YUV422P10,
         AV_PIX_FMT_NONE
     },
-    .priv_class     = &class,
+    .priv_class     = &dnxhd_class,
+    .defaults       = dnxhd_defaults,
 };
diff --git a/libavcodec/dnxhdenc.h b/libavcodec/dnxhdenc.h
index d3df0e0..3f531ef 100644
--- a/libavcodec/dnxhdenc.h
+++ b/libavcodec/dnxhdenc.h
@@ -4,20 +4,20 @@
  *
  * VC-3 encoder funded by the British Broadcasting Corporation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -85,8 +85,6 @@ typedef struct DNXHDEncContext {
     unsigned qscale;
     unsigned lambda;
 
-    unsigned thread_size;
-
     uint16_t *mb_bits;
     uint8_t  *mb_qscale;
 
diff --git a/libavcodec/dpcm.c b/libavcodec/dpcm.c
index 7567643..2edd4d5 100644
--- a/libavcodec/dpcm.c
+++ b/libavcodec/dpcm.c
@@ -2,20 +2,20 @@
  * Assorted DPCM codecs
  * Copyright (c) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -44,7 +44,7 @@
 #include "mathops.h"
 
 typedef struct DPCMContext {
-    int16_t roq_square_array[256];
+    int16_t square_array[256];
     int sample[2];                  ///< previous sample (for SOL_DPCM)
     const int8_t *sol_table;        ///< delta table for SOL_DPCM
 } DPCMContext;
@@ -118,7 +118,7 @@ static av_cold int dpcm_decode_init(AVCodecContext *avctx)
     int i;
 
     if (avctx->channels < 1 || avctx->channels > 2) {
-        av_log(avctx, AV_LOG_INFO, "invalid number of channels\n");
+        av_log(avctx, AV_LOG_ERROR, "invalid number of channels\n");
         return AVERROR(EINVAL);
     }
 
@@ -130,8 +130,8 @@ static av_cold int dpcm_decode_init(AVCodecContext *avctx)
         /* initialize square table */
         for (i = 0; i < 128; i++) {
             int16_t square = i * i;
-            s->roq_square_array[i      ] =  square;
-            s->roq_square_array[i + 128] = -square;
+            s->square_array[i      ] =  square;
+            s->square_array[i + 128] = -square;
         }
         break;
 
@@ -153,6 +153,13 @@ static av_cold int dpcm_decode_init(AVCodecContext *avctx)
         }
         break;
 
+    case AV_CODEC_ID_SDX2_DPCM:
+        for (i = -128; i < 128; i++) {
+            int16_t square = i * i * 2;
+            s->square_array[i+128] = i < 0 ? -square: square;
+        }
+        break;
+
     default:
         break;
     }
@@ -200,18 +207,22 @@ static int dpcm_decode_frame(AVCodecContext *avctx, void *data,
         else
             out = buf_size;
         break;
+    case AV_CODEC_ID_SDX2_DPCM:
+        out = buf_size;
+        break;
     }
     if (out <= 0) {
         av_log(avctx, AV_LOG_ERROR, "packet is too small\n");
         return AVERROR(EINVAL);
     }
+    if (out % avctx->channels) {
+        av_log(avctx, AV_LOG_WARNING, "channels have differing number of samples\n");
+    }
 
     /* get output buffer */
-    frame->nb_samples = out / avctx->channels;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    frame->nb_samples = (out + avctx->channels - 1) / avctx->channels;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     output_samples = (int16_t *)frame->data[0];
     samples_end = output_samples + out;
 
@@ -229,7 +240,7 @@ static int dpcm_decode_frame(AVCodecContext *avctx, void *data,
 
         /* decode the samples */
         while (output_samples < samples_end) {
-            predictor[ch] += s->roq_square_array[bytestream2_get_byteu(&gb)];
+            predictor[ch] += s->square_array[bytestream2_get_byteu(&gb)];
             predictor[ch]  = av_clip_int16(predictor[ch]);
             *output_samples++ = predictor[ch];
 
@@ -317,6 +328,19 @@ static int dpcm_decode_frame(AVCodecContext *avctx, void *data,
             }
         }
         break;
+
+    case AV_CODEC_ID_SDX2_DPCM:
+        while (output_samples < samples_end) {
+            int8_t n = bytestream2_get_byteu(&gb);
+
+            if (!(n & 1))
+                s->sample[ch] = 0;
+            s->sample[ch] += s->square_array[n + 128];
+            s->sample[ch]  = av_clip_int16(s->sample[ch]);
+            *output_samples++ = s->sample[ch];
+            ch ^= stereo;
+        }
+        break;
     }
 
     *got_frame_ptr = 1;
@@ -338,5 +362,6 @@ AVCodec ff_ ## name_ ## _decoder = {                        \
 
 DPCM_DECODER(AV_CODEC_ID_INTERPLAY_DPCM, interplay_dpcm, "DPCM Interplay");
 DPCM_DECODER(AV_CODEC_ID_ROQ_DPCM,       roq_dpcm,       "DPCM id RoQ");
+DPCM_DECODER(AV_CODEC_ID_SDX2_DPCM,      sdx2_dpcm,      "DPCM Squareroot-Delta-Exact");
 DPCM_DECODER(AV_CODEC_ID_SOL_DPCM,       sol_dpcm,       "DPCM Sol");
 DPCM_DECODER(AV_CODEC_ID_XAN_DPCM,       xan_dpcm,       "DPCM Xan");
diff --git a/libavcodec/dpx.c b/libavcodec/dpx.c
index d4effa4..e4dd1b0 100644
--- a/libavcodec/dpx.c
+++ b/libavcodec/dpx.c
@@ -2,29 +2,42 @@
  * DPX (.dpx) image decoder
  * Copyright (c) 2009 Jimmy Christensen
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/intreadwrite.h"
+#include "libavutil/intfloat.h"
 #include "libavutil/imgutils.h"
 #include "bytestream.h"
 #include "avcodec.h"
 #include "internal.h"
 
+static unsigned int read16(const uint8_t **ptr, int is_big)
+{
+    unsigned int temp;
+    if (is_big) {
+        temp = AV_RB16(*ptr);
+    } else {
+        temp = AV_RL16(*ptr);
+    }
+    *ptr += 2;
+    return temp;
+}
+
 static unsigned int read32(const uint8_t **ptr, int is_big)
 {
     unsigned int temp;
@@ -37,12 +50,19 @@ static unsigned int read32(const uint8_t **ptr, int is_big)
     return temp;
 }
 
-static inline unsigned make_16bit(unsigned value)
+static uint16_t read10in32(const uint8_t **ptr, uint32_t * lbuf,
+                                  int * n_datum, int is_big)
 {
-    // mask away invalid bits
-    value &= 0xFFC0;
-    // correctly expand to 16 bits
-    return value + (value >> 10);
+    if (*n_datum)
+        (*n_datum)--;
+    else {
+        *lbuf = read32(ptr, is_big);
+        *n_datum = 2;
+    }
+
+    *lbuf = (*lbuf << 10) | (*lbuf >> 22);
+
+    return *lbuf & 0x3FF;
 }
 
 static int decode_frame(AVCodecContext *avctx,
@@ -51,17 +71,18 @@ static int decode_frame(AVCodecContext *avctx,
                         AVPacket *avpkt)
 {
     const uint8_t *buf = avpkt->data;
-    const uint8_t *buf_end = avpkt->data + avpkt->size;
     int buf_size       = avpkt->size;
     AVFrame *const p = data;
-    uint8_t *ptr;
+    uint8_t *ptr[AV_NUM_DATA_POINTERS];
 
     unsigned int offset;
     int magic_num, endian;
-    int x, y, ret;
-    int w, h, stride, bits_per_color, descriptor, elements, target_packet_size, source_packet_size;
+    int x, y, stride, i, ret;
+    int w, h, bits_per_color, descriptor, elements, packing;
+    int encoding, need_align = 0;
 
-    unsigned int rgbBuffer;
+    unsigned int rgbBuffer = 0;
+    int n_datum = 0;
 
     if (avpkt->size <= 1634) {
         av_log(avctx, AV_LOG_ERROR, "Packet too small for DPX header\n");
@@ -87,11 +108,24 @@ static int decode_frame(AVCodecContext *avctx,
         av_log(avctx, AV_LOG_ERROR, "Invalid data start offset\n");
         return AVERROR_INVALIDDATA;
     }
+
+    // Check encryption
+    buf = avpkt->data + 660;
+    ret = read32(&buf, endian);
+    if (ret != 0xFFFFFFFF) {
+        avpriv_report_missing_feature(avctx, "Encryption");
+        av_log(avctx, AV_LOG_WARNING, "The image is encrypted and may "
+               "not properly decode.\n");
+    }
+
     // Need to end in 0x304 offset from start of file
     buf = avpkt->data + 0x304;
     w = read32(&buf, endian);
     h = read32(&buf, endian);
 
+    if ((ret = ff_set_dimensions(avctx, w, h)) < 0)
+        return ret;
+
     // Need to end in 0x320 to read the descriptor
     buf += 20;
     descriptor = buf[0];
@@ -100,108 +134,256 @@ static int decode_frame(AVCodecContext *avctx,
     buf += 3;
     avctx->bits_per_raw_sample =
     bits_per_color = buf[0];
+    buf++;
+    packing = read16(&buf, endian);
+    encoding = read16(&buf, endian);
+
+    if (packing > 1) {
+        avpriv_report_missing_feature(avctx, "Packing %d", packing);
+        return AVERROR_PATCHWELCOME;
+    }
+    if (encoding) {
+        avpriv_report_missing_feature(avctx, "Encoding %d", encoding);
+        return AVERROR_PATCHWELCOME;
+    }
 
-    buf += 825;
+    buf += 820;
     avctx->sample_aspect_ratio.num = read32(&buf, endian);
     avctx->sample_aspect_ratio.den = read32(&buf, endian);
+    if (avctx->sample_aspect_ratio.num > 0 && avctx->sample_aspect_ratio.den > 0)
+        av_reduce(&avctx->sample_aspect_ratio.num, &avctx->sample_aspect_ratio.den,
+                   avctx->sample_aspect_ratio.num,  avctx->sample_aspect_ratio.den,
+                  0x10000);
+    else
+        avctx->sample_aspect_ratio = (AVRational){ 0, 1 };
+
+    if (offset >= 1724 + 4) {
+        buf = avpkt->data + 1724;
+        i = read32(&buf, endian);
+        if(i) {
+            AVRational q = av_d2q(av_int2float(i), 4096);
+            if (q.num > 0 && q.den > 0)
+                avctx->framerate = q;
+        }
+    }
 
     switch (descriptor) {
-        case 51: // RGBA
-            elements = 4;
-            break;
-        case 50: // RGB
-            elements = 3;
-            break;
-        default:
-            av_log(avctx, AV_LOG_ERROR, "Unsupported descriptor %d\n", descriptor);
-            return AVERROR_INVALIDDATA;
+    case 6:  // Y
+        elements = 1;
+        break;
+    case 52: // ABGR
+    case 51: // RGBA
+    case 103: // UYVA4444
+        elements = 4;
+        break;
+    case 50: // RGB
+    case 102: // UYV444
+        elements = 3;
+        break;
+    case 100: // UYVY422
+        elements = 2;
+        break;
+    default:
+        avpriv_report_missing_feature(avctx, "Descriptor %d", descriptor);
+        return AVERROR_PATCHWELCOME;
     }
 
     switch (bits_per_color) {
-        case 8:
-            if (elements == 4) {
-                avctx->pix_fmt = AV_PIX_FMT_RGBA;
-            } else {
-                avctx->pix_fmt = AV_PIX_FMT_RGB24;
-            }
-            source_packet_size = elements;
-            target_packet_size = elements;
-            break;
-        case 10:
-            avctx->pix_fmt = AV_PIX_FMT_RGB48;
-            target_packet_size = 6;
-            source_packet_size = 4;
-            break;
-        case 12:
-        case 16:
-            if (endian) {
-                avctx->pix_fmt = AV_PIX_FMT_RGB48BE;
-            } else {
-                avctx->pix_fmt = AV_PIX_FMT_RGB48LE;
-            }
-            target_packet_size = 6;
-            source_packet_size = elements * 2;
-            break;
-        default:
-            av_log(avctx, AV_LOG_ERROR, "Unsupported color depth : %d\n", bits_per_color);
+    case 8:
+        stride = avctx->width * elements;
+        break;
+    case 10:
+        if (!packing) {
+            av_log(avctx, AV_LOG_ERROR, "Packing to 32bit required\n");
+            return -1;
+        }
+        stride = (avctx->width * elements + 2) / 3 * 4;
+        break;
+    case 12:
+        if (!packing) {
+            av_log(avctx, AV_LOG_ERROR, "Packing to 16bit required\n");
+            return -1;
+        }
+        stride = 2 * avctx->width * elements;
+        break;
+    case 16:
+        stride = 2 * avctx->width * elements;
+        break;
+    case 1:
+    case 32:
+    case 64:
+        avpriv_report_missing_feature(avctx, "Depth %d", bits_per_color);
+        return AVERROR_PATCHWELCOME;
+    default:
+        return AVERROR_INVALIDDATA;
+    }
+
+    // Table 3c: Runs will always break at scan line boundaries. Packing
+    // will always break to the next 32-bit word at scan-line boundaries.
+    // Unfortunately, the encoder produced invalid files, so attempt
+    // to detect it
+    need_align = FFALIGN(stride, 4);
+    if (need_align*avctx->height + (int64_t)offset > avpkt->size) {
+        // Alignment seems inapplicable, try without
+        if (stride*avctx->height + (int64_t)offset > avpkt->size) {
+            av_log(avctx, AV_LOG_ERROR, "Overread buffer. Invalid header?\n");
             return AVERROR_INVALIDDATA;
+        } else {
+            av_log(avctx, AV_LOG_INFO, "Decoding DPX without scanline "
+                   "alignment.\n");
+            need_align = 0;
+        }
+    } else {
+        need_align -= stride;
+        stride = FFALIGN(stride, 4);
     }
 
-    if ((ret = ff_set_dimensions(avctx, w, h)) < 0)
-        return ret;
+    switch (1000 * descriptor + 10 * bits_per_color + endian) {
+    case 6081:
+    case 6080:
+        avctx->pix_fmt = AV_PIX_FMT_GRAY8;
+        break;
+    case 50081:
+    case 50080:
+        avctx->pix_fmt = AV_PIX_FMT_RGB24;
+        break;
+    case 52081:
+    case 52080:
+        avctx->pix_fmt = AV_PIX_FMT_ABGR;
+        break;
+    case 51081:
+    case 51080:
+        avctx->pix_fmt = AV_PIX_FMT_RGBA;
+        break;
+    case 50100:
+    case 50101:
+        avctx->pix_fmt = AV_PIX_FMT_GBRP10;
+        break;
+    case 51100:
+    case 51101:
+        avctx->pix_fmt = AV_PIX_FMT_GBRAP10;
+        break;
+    case 50120:
+    case 50121:
+        avctx->pix_fmt = AV_PIX_FMT_GBRP12;
+        break;
+    case 51120:
+    case 51121:
+        avctx->pix_fmt = AV_PIX_FMT_GBRAP12;
+        break;
+    case 6161:
+        avctx->pix_fmt = AV_PIX_FMT_GRAY16BE;
+        break;
+    case 6160:
+        avctx->pix_fmt = AV_PIX_FMT_GRAY16LE;
+        break;
+    case 50161:
+        avctx->pix_fmt = AV_PIX_FMT_RGB48BE;
+        break;
+    case 50160:
+        avctx->pix_fmt = AV_PIX_FMT_RGB48LE;
+        break;
+    case 51161:
+        avctx->pix_fmt = AV_PIX_FMT_RGBA64BE;
+        break;
+    case 51160:
+        avctx->pix_fmt = AV_PIX_FMT_RGBA64LE;
+        break;
+    case 100081:
+        avctx->pix_fmt = AV_PIX_FMT_UYVY422;
+        break;
+    case 102081:
+        avctx->pix_fmt = AV_PIX_FMT_YUV444P;
+        break;
+    case 103081:
+        avctx->pix_fmt = AV_PIX_FMT_YUVA444P;
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unsupported format\n");
+        return AVERROR_PATCHWELCOME;
+    }
 
     ff_set_sar(avctx, avctx->sample_aspect_ratio);
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
 
     // Move pointer to offset from start of file
     buf =  avpkt->data + offset;
 
-    ptr    = p->data[0];
-    stride = p->linesize[0];
+    for (i=0; i<AV_NUM_DATA_POINTERS; i++)
+        ptr[i] = p->data[i];
 
-    if (source_packet_size*avctx->width*avctx->height > buf_end - buf) {
-        av_log(avctx, AV_LOG_ERROR, "Overread buffer. Invalid header?\n");
-        return AVERROR_INVALIDDATA;
-    }
     switch (bits_per_color) {
-        case 10:
-            for (x = 0; x < avctx->height; x++) {
-               uint16_t *dst = (uint16_t*)ptr;
-               for (y = 0; y < avctx->width; y++) {
-                   rgbBuffer = read32(&buf, endian);
-                   // Read out the 10-bit colors and convert to 16-bit
-                   *dst++ = make_16bit(rgbBuffer >> 16);
-                   *dst++ = make_16bit(rgbBuffer >>  6);
-                   *dst++ = make_16bit(rgbBuffer <<  4);
-               }
-               ptr += stride;
+    case 10:
+        for (x = 0; x < avctx->height; x++) {
+            uint16_t *dst[4] = {(uint16_t*)ptr[0],
+                                (uint16_t*)ptr[1],
+                                (uint16_t*)ptr[2],
+                                (uint16_t*)ptr[3]};
+            for (y = 0; y < avctx->width; y++) {
+                *dst[2]++ = read10in32(&buf, &rgbBuffer,
+                                       &n_datum, endian);
+                *dst[0]++ = read10in32(&buf, &rgbBuffer,
+                                       &n_datum, endian);
+                *dst[1]++ = read10in32(&buf, &rgbBuffer,
+                                       &n_datum, endian);
+                if (elements == 4)
+                    *dst[3]++ =
+                    read10in32(&buf, &rgbBuffer,
+                               &n_datum, endian);
             }
-            break;
-        case 8:
-        case 12: // Treat 12-bit as 16-bit
-        case 16:
-            if (source_packet_size == target_packet_size) {
-                for (x = 0; x < avctx->height; x++) {
-                    memcpy(ptr, buf, target_packet_size*avctx->width);
-                    ptr += stride;
-                    buf += source_packet_size*avctx->width;
-                }
-            } else {
-                for (x = 0; x < avctx->height; x++) {
-                    uint8_t *dst = ptr;
-                    for (y = 0; y < avctx->width; y++) {
-                        memcpy(dst, buf, target_packet_size);
-                        dst += target_packet_size;
-                        buf += source_packet_size;
-                    }
-                    ptr += stride;
+            n_datum = 0;
+            for (i = 0; i < elements; i++)
+                ptr[i] += p->linesize[i];
+        }
+        break;
+    case 12:
+        for (x = 0; x < avctx->height; x++) {
+            uint16_t *dst[4] = {(uint16_t*)ptr[0],
+                                (uint16_t*)ptr[1],
+                                (uint16_t*)ptr[2],
+                                (uint16_t*)ptr[3]};
+            for (y = 0; y < avctx->width; y++) {
+                *dst[2] = read16(&buf, endian) >> 4;
+                dst[2]++;
+                *dst[0] = read16(&buf, endian) >> 4;
+                dst[0]++;
+                *dst[1] = read16(&buf, endian) >> 4;
+                dst[1]++;
+                if (elements == 4)
+                    *dst[3]++ = read16(&buf, endian) >> 4;
+            }
+            for (i = 0; i < elements; i++)
+                ptr[i] += p->linesize[i];
+            // Jump to next aligned position
+            buf += need_align;
+        }
+        break;
+    case 16:
+        elements *= 2;
+    case 8:
+        if (   avctx->pix_fmt == AV_PIX_FMT_YUVA444P
+            || avctx->pix_fmt == AV_PIX_FMT_YUV444P) {
+            for (x = 0; x < avctx->height; x++) {
+                ptr[0] = p->data[0] + x * p->linesize[0];
+                ptr[1] = p->data[1] + x * p->linesize[1];
+                ptr[2] = p->data[2] + x * p->linesize[2];
+                ptr[3] = p->data[3] + x * p->linesize[3];
+                for (y = 0; y < avctx->width; y++) {
+                    *ptr[1]++ = *buf++;
+                    *ptr[0]++ = *buf++;
+                    *ptr[2]++ = *buf++;
+                    if (avctx->pix_fmt == AV_PIX_FMT_YUVA444P)
+                        *ptr[3]++ = *buf++;
                 }
             }
-            break;
+        } else {
+        av_image_copy_plane(ptr[0], p->linesize[0],
+                            buf, stride,
+                            elements * avctx->width, avctx->height);
+        }
+        break;
     }
 
     *got_frame = 1;
@@ -211,7 +393,7 @@ static int decode_frame(AVCodecContext *avctx,
 
 AVCodec ff_dpx_decoder = {
     .name           = "dpx",
-    .long_name      = NULL_IF_CONFIG_SMALL("DPX image"),
+    .long_name      = NULL_IF_CONFIG_SMALL("DPX (Digital Picture Exchange) image"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_DPX,
     .decode         = decode_frame,
diff --git a/libavcodec/dpx_parser.c b/libavcodec/dpx_parser.c
index e3a7ac5..8e4a01e 100644
--- a/libavcodec/dpx_parser.c
+++ b/libavcodec/dpx_parser.c
@@ -2,20 +2,20 @@
  * DPX parser
  * Copyright (c) 2013 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/dpxenc.c b/libavcodec/dpxenc.c
index adcb529..a596033 100644
--- a/libavcodec/dpxenc.c
+++ b/libavcodec/dpxenc.c
@@ -2,20 +2,20 @@
  * DPX (.dpx) image encoder
  * Copyright (c) 2011 Peter Ross <pross@xvid.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,35 +28,44 @@
 typedef struct DPXContext {
     int big_endian;
     int bits_per_component;
+    int num_components;
     int descriptor;
+    int planar;
 } DPXContext;
 
 static av_cold int encode_init(AVCodecContext *avctx)
 {
     DPXContext *s = avctx->priv_data;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt);
 
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-    avctx->coded_frame->key_frame = 1;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
-
-    s->big_endian         = 1;
-    s->bits_per_component = 8;
-    s->descriptor         = 50; /* RGB */
+    s->big_endian         = !!(desc->flags & AV_PIX_FMT_FLAG_BE);
+    s->bits_per_component = desc->comp[0].depth;
+    s->num_components     = desc->nb_components;
+    s->descriptor         = (desc->flags & AV_PIX_FMT_FLAG_ALPHA) ? 51 : 50;
+    s->planar             = !!(desc->flags & AV_PIX_FMT_FLAG_PLANAR);
 
     switch (avctx->pix_fmt) {
-    case AV_PIX_FMT_RGB24:
+    case AV_PIX_FMT_ABGR:
+        s->descriptor = 52;
+        break;
+    case AV_PIX_FMT_GRAY16BE:
+    case AV_PIX_FMT_GRAY16LE:
+    case AV_PIX_FMT_GRAY8:
+        s->descriptor = 6;
         break;
+    case AV_PIX_FMT_GBRP10BE:
+    case AV_PIX_FMT_GBRP10LE:
+    case AV_PIX_FMT_GBRP12BE:
+    case AV_PIX_FMT_GBRP12LE:
+    case AV_PIX_FMT_RGB24:
+    case AV_PIX_FMT_RGBA64BE:
+    case AV_PIX_FMT_RGBA64LE:
     case AV_PIX_FMT_RGBA:
-        s->descriptor = 51; /* RGBA */
         break;
     case AV_PIX_FMT_RGB48LE:
-        s->big_endian = 0;
-        /* fall-through */
     case AV_PIX_FMT_RGB48BE:
-        s->bits_per_component = avctx->bits_per_raw_sample ? avctx->bits_per_raw_sample : 16;
+        if (avctx->bits_per_raw_sample)
+            s->bits_per_component = avctx->bits_per_raw_sample;
         break;
     default:
         av_log(avctx, AV_LOG_INFO, "unsupported pixel format\n");
@@ -66,17 +75,20 @@ FF_ENABLE_DEPRECATION_WARNINGS
     return 0;
 }
 
-#define write16(p, value) \
-do { \
-    if (s->big_endian) AV_WB16(p, value); \
-    else               AV_WL16(p, value); \
-} while(0)
+static av_always_inline void write16_internal(int big_endian, void *p, int value)
+{   /* 16-bit store of value at p: AV_WB16 when big_endian is nonzero, AV_WL16 otherwise */
+    if (!big_endian) AV_WL16(p, value);
+    else             AV_WB16(p, value);
+}
+
+static av_always_inline void write32_internal(int big_endian, void *p, int value)
+{   /* 32-bit store of value at p: AV_WB32 when big_endian is nonzero, AV_WL32 otherwise */
+    if (!big_endian) AV_WL32(p, value);
+    else             AV_WB32(p, value);
+}
 
-#define write32(p, value) \
-do { \
-    if (s->big_endian) AV_WB32(p, value); \
-    else               AV_WL32(p, value); \
-} while(0)
+#define write16(p, value) write16_internal(s->big_endian, p, value)
+#define write32(p, value) write32_internal(s->big_endian, p, value)
 
 static void encode_rgb48_10bit(AVCodecContext *avctx, const AVFrame *pic,
                                uint8_t *dst)
@@ -88,14 +100,14 @@ static void encode_rgb48_10bit(AVCodecContext *avctx, const AVFrame *pic,
     for (y = 0; y < avctx->height; y++) {
         for (x = 0; x < avctx->width; x++) {
             int value;
-            if ((avctx->pix_fmt & 1)) {
-                value = ((AV_RB16(src + 6*x + 4) & 0xFFC0) >> 4)
-                      | ((AV_RB16(src + 6*x + 2) & 0xFFC0) << 6)
-                      | ((AV_RB16(src + 6*x + 0) & 0xFFC0) << 16);
+            if (s->big_endian) {
+                value = ((AV_RB16(src + 6*x + 4) & 0xFFC0U) >> 4)
+                      | ((AV_RB16(src + 6*x + 2) & 0xFFC0U) << 6)
+                      | ((AV_RB16(src + 6*x + 0) & 0xFFC0U) << 16);
             } else {
-                value = ((AV_RL16(src + 6*x + 4) & 0xFFC0) >> 4)
-                      | ((AV_RL16(src + 6*x + 2) & 0xFFC0) << 6)
-                      | ((AV_RL16(src + 6*x + 0) & 0xFFC0) << 16);
+                value = ((AV_RL16(src + 6*x + 4) & 0xFFC0U) >> 4)
+                      | ((AV_RL16(src + 6*x + 2) & 0xFFC0U) << 6)
+                      | ((AV_RL16(src + 6*x + 0) & 0xFFC0U) << 16);
             }
             write32(dst, value);
             dst += 4;
@@ -104,23 +116,88 @@ static void encode_rgb48_10bit(AVCodecContext *avctx, const AVFrame *pic,
     }
 }
 
+static void encode_gbrp10(AVCodecContext *avctx, const AVFrame *pic, uint8_t *dst)
+{
+    DPXContext *s = avctx->priv_data;
+    const uint8_t *src[3] = {pic->data[0], pic->data[1], pic->data[2]};
+    int x, y, i;
+    /* Pack one 16-bit sample from each of the three planes into a 32-bit word per pixel. */
+    for (y = 0; y < avctx->height; y++) {
+        for (x = 0; x < avctx->width; x++) {
+            int value; /* NOTE(review): fields only stay separate if samples fit in 10 bits - confirm */
+            if (s->big_endian) {
+                value = (AV_RB16(src[0] + 2*x) << 12) /* plane 0 starts at bit 12 */
+                      | (AV_RB16(src[1] + 2*x) << 2) /* plane 1 starts at bit 2 */
+                      | ((unsigned)AV_RB16(src[2] + 2*x) << 22); /* plane 2 starts at bit 22 */
+            } else {
+                value = (AV_RL16(src[0] + 2*x) << 12) /* same layout, little-endian reads */
+                      | (AV_RL16(src[1] + 2*x) << 2)
+                      | ((unsigned)AV_RL16(src[2] + 2*x) << 22);
+            }
+            write32(dst, value); /* output byte order follows s->big_endian */
+            dst += 4;
+        }
+        for (i = 0; i < 3; i++)
+            src[i] += pic->linesize[i]; /* advance every plane to its next row */
+    }
+}
+
+static void encode_gbrp12(AVCodecContext *avctx, const AVFrame *pic, uint16_t *dst)
+{   /* Write three 16-bit words per pixel, each a plane sample shifted left by 4. */
+    DPXContext *s = avctx->priv_data;
+    const uint16_t *src[3] = {(uint16_t*)pic->data[0],
+                              (uint16_t*)pic->data[1],
+                              (uint16_t*)pic->data[2]};
+    int x, y, i, pad;
+    pad = avctx->width*6; /* bytes per output row: 3 words of 2 bytes per pixel */
+    pad = (FFALIGN(pad, 4) - pad) >> 1; /* 16-bit zero words appended to reach the FFALIGN(.., 4) row size */
+    for (y = 0; y < avctx->height; y++) {
+        for (x = 0; x < avctx->width; x++) {
+            uint16_t value[3]; /* emitted in index order: plane 2, plane 0, plane 1 */
+            if (s->big_endian) {
+                value[1] = AV_RB16(src[0] + x) << 4;
+                value[2] = AV_RB16(src[1] + x) << 4;
+                value[0] = AV_RB16(src[2] + x) << 4;
+            } else {
+                value[1] = AV_RL16(src[0] + x) << 4;
+                value[2] = AV_RL16(src[1] + x) << 4;
+                value[0] = AV_RL16(src[2] + x) << 4;
+            }
+            for (i = 0; i < 3; i++)
+                write16(dst++, value[i]); /* output byte order follows s->big_endian */
+        }
+        for (i = 0; i < pad; i++)
+            *dst++ = 0; /* row padding */
+        for (i = 0; i < 3; i++)
+            src[i] += pic->linesize[i]/2; /* linesize is in bytes, src[] counts 16-bit elements */
+    }
+}
+
 static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                         const AVFrame *frame, int *got_packet)
 {
     DPXContext *s = avctx->priv_data;
-    int size, ret;
+    int size, ret, need_align, len;
     uint8_t *buf;
 
 #define HEADER_SIZE 1664  /* DPX Generic header */
     if (s->bits_per_component == 10)
         size = avctx->height * avctx->width * 4;
-    else
-        size = av_image_get_buffer_size(avctx->pix_fmt,
-                                        avctx->width, avctx->height, 1);
-    if ((ret = ff_alloc_packet(pkt, size + HEADER_SIZE)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
-        return ret;
+    else if (s->bits_per_component == 12) {
+        // 3 components, 12 bits put on 16 bits
+        len  = avctx->width*6;
+        size = FFALIGN(len, 4);
+        need_align = size - len;
+        size *= avctx->height;
+    } else {
+        // N components, M bits
+        len = avctx->width * s->num_components * s->bits_per_component >> 3;
+        size = FFALIGN(len, 4);
+        need_align = size - len;
+        size *= avctx->height;
     }
+    if ((ret = ff_alloc_packet2(avctx, pkt, size + HEADER_SIZE, 0)) < 0)
+        return ret;
     buf = pkt->data;
 
     memset(buf, 0, HEADER_SIZE);
@@ -144,26 +221,45 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     buf[801] = 2; /* linear transfer */
     buf[802] = 2; /* linear colorimetric */
     buf[803] = s->bits_per_component;
-    write16(buf + 804, s->bits_per_component == 10 ? 1 : 0); /* packing method */
+    write16(buf + 804, (s->bits_per_component == 10 || s->bits_per_component == 12) ?
+                       1 : 0); /* packing method */
     write32(buf + 808, HEADER_SIZE); /* data offset */
 
     /* Image source information header */
     write32(buf + 1628, avctx->sample_aspect_ratio.num);
     write32(buf + 1632, avctx->sample_aspect_ratio.den);
 
-    switch (s->bits_per_component) {
+    switch(s->bits_per_component) {
     case 8:
     case 16:
-        size = av_image_copy_to_buffer(buf + HEADER_SIZE,
-                                       pkt->size - HEADER_SIZE,
-                                       frame->data, frame->linesize,
-                                       avctx->pix_fmt,
-                                       avctx->width, avctx->height, 1);
+        if (need_align) {
+            int j;
+            const uint8_t *src = frame->data[0];
+            uint8_t *dst = pkt->data + HEADER_SIZE;
+            size = (len + need_align) * avctx->height;
+            for (j=0; j<avctx->height; j++) {
+                memcpy(dst, src, len);
+                memset(dst + len, 0, need_align);
+                dst += len + need_align;
+                src += frame->linesize[0];
+            }
+        } else {
+            size = av_image_copy_to_buffer(buf + HEADER_SIZE, pkt->size - HEADER_SIZE,
+                                           (const uint8_t**)frame->data, frame->linesize,
+                                           avctx->pix_fmt,
+                                           avctx->width, avctx->height, 1);
+        }
         if (size < 0)
             return size;
         break;
     case 10:
-        encode_rgb48_10bit(avctx, frame, buf + HEADER_SIZE);
+        if (s->planar)
+            encode_gbrp10(avctx, frame, buf + HEADER_SIZE);
+        else
+            encode_rgb48_10bit(avctx, frame, buf + HEADER_SIZE);
+        break;
+    case 12:
+        encode_gbrp12(avctx, frame, (uint16_t*)(buf + HEADER_SIZE));
         break;
     default:
         av_log(avctx, AV_LOG_ERROR, "Unsupported bit depth: %d\n", s->bits_per_component);
@@ -181,17 +277,20 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 }
 
 AVCodec ff_dpx_encoder = {
-    .name = "dpx",
-    .long_name = NULL_IF_CONFIG_SMALL("DPX image"),
-    .type = AVMEDIA_TYPE_VIDEO,
-    .id   = AV_CODEC_ID_DPX,
+    .name           = "dpx",
+    .long_name      = NULL_IF_CONFIG_SMALL("DPX (Digital Picture Exchange) image"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_DPX,
     .priv_data_size = sizeof(DPXContext),
-    .init   = encode_init,
-    .encode2 = encode_frame,
-    .pix_fmts = (const enum AVPixelFormat[]){
-        AV_PIX_FMT_RGB24,
-        AV_PIX_FMT_RGBA,
-        AV_PIX_FMT_RGB48LE,
-        AV_PIX_FMT_RGB48BE,
+    .init           = encode_init,
+    .encode2        = encode_frame,
+    .pix_fmts       = (const enum AVPixelFormat[]){
+        AV_PIX_FMT_GRAY8,
+        AV_PIX_FMT_RGB24,    AV_PIX_FMT_RGBA, AV_PIX_FMT_ABGR,
+        AV_PIX_FMT_GRAY16LE, AV_PIX_FMT_GRAY16BE,
+        AV_PIX_FMT_RGB48LE,  AV_PIX_FMT_RGB48BE,
+        AV_PIX_FMT_RGBA64LE, AV_PIX_FMT_RGBA64BE,
+        AV_PIX_FMT_GBRP10LE, AV_PIX_FMT_GBRP10BE,
+        AV_PIX_FMT_GBRP12LE, AV_PIX_FMT_GBRP12BE,
         AV_PIX_FMT_NONE},
 };
diff --git a/libavcodec/dsd.c b/libavcodec/dsd.c
new file mode 100644
index 0000000..9104f38
--- /dev/null
+++ b/libavcodec/dsd.c
@@ -0,0 +1,86 @@
+/*
+ * Direct Stream Digital (DSD) decoder
+ * based on BSD licensed dsd2pcm by Sebastian Gesemann
+ * Copyright (c) 2009, 2011 Sebastian Gesemann. All rights reserved.
+ * Copyright (c) 2014 Peter Ross
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/internal.h"
+#include "libavcodec/mathops.h"
+#include "avcodec.h"
+#include "dsd_tablegen.h"
+#include "dsd.h"
+
+static av_cold void dsd_ctables_tableinit(void)
+{   /* Fill ctables[][] with signed sums of htaps[] for every possible byte value. */
+    int t, e, m, sign;
+    double acc[CTABLES]; /* one accumulator per lookup table */
+    for (e = 0; e < 256; ++e) { /* e: byte value used as table index */
+        memset(acc, 0, sizeof(acc));
+        for (m = 0; m < 8; ++m) {
+            sign = (((e >> (7 - m)) & 1) * 2 - 1); /* bit (7 - m) of e: 0 -> -1, 1 -> +1 */
+            for (t = 0; t < CTABLES; ++t)
+                acc[t] += sign * htaps[t * 8 + m]; /* table t covers coefficients t*8 .. t*8+7 */
+        }
+        for (t = 0; t < CTABLES; ++t)
+            ctables[CTABLES - 1 - t][e] = acc[t]; /* stored in reverse table order */
+    }
+}
+
+av_cold void ff_init_dsd_data(void)
+{   /* Build the DSD lookup tables on the first call; later calls do nothing. */
+    static int tables_ready = 0; /* plain flag, not synchronized */
+    if (!tables_ready) {
+        dsd_ctables_tableinit();
+        tables_ready = 1;
+    }
+}
+
+void ff_dsd2pcm_translate(DSDContext* s, size_t samples, int lsbf,
+                          const unsigned char *src, ptrdiff_t src_stride,
+                          float *dst, ptrdiff_t dst_stride)
+{
+    unsigned pos, i;
+    unsigned char* p;
+    double sum;
+    /* Consume one input byte and write one float per iteration; s keeps the FIFO between calls. */
+    pos = s->pos;
+
+    while (samples-- > 0) {
+        s->buf[pos] = lsbf ? ff_reverse[*src] : *src; /* lsbf input goes through ff_reverse[] first */
+        src += src_stride;
+        /* the slot CTABLES positions behind pos is replaced by its ff_reverse[] lookup */
+        p = s->buf + ((pos - CTABLES) & FIFOMASK);
+        *p = ff_reverse[*p];
+        /* a walks slots pos .. pos-(CTABLES-1); b walks the CTABLES slots before those, oldest first */
+        sum = 0.0;
+        for (i = 0; i < CTABLES; i++) {
+            unsigned char a = s->buf[(pos                   - i) & FIFOMASK];
+            unsigned char b = s->buf[(pos - (CTABLES*2 - 1) + i) & FIFOMASK];
+            sum += ctables[i][a] + ctables[i][b];
+        }
+
+        *dst = (float)sum;
+        dst += dst_stride;
+
+        pos = (pos + 1) & FIFOMASK; /* wrap the write index inside the FIFO */
+    }
+
+    s->pos = pos; /* remember the write index for the next call */
+}
diff --git a/libavcodec/dsd.h b/libavcodec/dsd.h
new file mode 100644
index 0000000..5ca4574
--- /dev/null
+++ b/libavcodec/dsd.h
@@ -0,0 +1,52 @@
+/*
+ * Direct Stream Digital (DSD) decoder
+ * based on BSD licensed dsd2pcm by Sebastian Gesemann
+ * Copyright (c) 2009, 2011 Sebastian Gesemann. All rights reserved.
+ * Copyright (c) 2014 Peter Ross
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DSD_H
+#define AVCODEC_DSD_H
+
+#include "libavcodec/internal.h"
+#include "libavcodec/mathops.h"
+#include "avcodec.h"
+
+#define HTAPS   48               /**< number of FIR constants */
+#define FIFOSIZE 16              /**< must be a power of two */
+#define FIFOMASK (FIFOSIZE - 1)  /**< bit mask for FIFO offsets */
+
+#if FIFOSIZE * 8 < HTAPS * 2
+#error "FIFOSIZE too small"
+#endif
+
+/**
+ * Per-channel buffer
+ */
+typedef struct DSDContext {
+    unsigned char buf[FIFOSIZE]; /**< FIFO of input bytes, indexed modulo FIFOSIZE */
+    unsigned pos;                /**< slot in buf that receives the next input byte */
+} DSDContext;
+
+void ff_init_dsd_data(void);
+
+void ff_dsd2pcm_translate(DSDContext* s, size_t samples, int lsbf,
+                          const unsigned char *src, ptrdiff_t src_stride,
+                          float *dst, ptrdiff_t dst_stride);
+#endif /* AVCODEC_DSD_H */
diff --git a/libavcodec/dsd_tablegen.h b/libavcodec/dsd_tablegen.h
new file mode 100644
index 0000000..e5da86a
--- /dev/null
+++ b/libavcodec/dsd_tablegen.h
@@ -0,0 +1,75 @@
+/*
+ * Header file for hardcoded DSD tables
+ * based on BSD licensed dsd2pcm by Sebastian Gesemann
+ * Copyright (c) 2009, 2011 Sebastian Gesemann. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DSD_TABLEGEN_H
+#define AVCODEC_DSD_TABLEGEN_H
+
+#include <stdint.h>
+#include "libavutil/attributes.h"
+#include "dsd.h"
+
+#define HTAPS   48                /**< number of FIR constants */
+#define CTABLES ((HTAPS + 7) / 8) /**< number of "8 MACs" lookup tables */
+
+#include "libavutil/common.h"
+
+/*
+ * Properties of this 96-tap lowpass filter when applied on a signal
+ * with sampling rate of 44100*64 Hz:
+ *
+ * () has a delay of 17 microseconds.
+ *
+ * () flat response up to 48 kHz
+ *
+ * () if you downsample afterwards by a factor of 8, the
+ *    spectrum below 70 kHz is practically alias-free.
+ *
+ * () stopband rejection is about 160 dB
+ *
+ * The coefficient tables ("ctables") take only 6 Kibi Bytes and
+ * should fit into a modern processor's fast cache.
+ */
+
+/**
+ * The 2nd half (48 coeffs) of a 96-tap symmetric lowpass filter
+ */
+static const double htaps[HTAPS] = {
+     0.09950731974056658,    0.09562845727714668,    0.08819647126516944,
+     0.07782552527068175,    0.06534876523171299,    0.05172629311427257,
+     0.0379429484910187,     0.02490921351762261,    0.0133774746265897,
+     0.003883043418804416,  -0.003284703416210726,  -0.008080250212687497,
+    -0.01067241812471033,   -0.01139427235000863,   -0.0106813877974587,
+    -0.009007905078766049,  -0.006828859761015335,  -0.004535184322001496,
+    -0.002425035959059578,  -0.0006922187080790708,  0.0005700762133516592,
+     0.001353838005269448,   0.001713709169690937,   0.001742046839472948,
+     0.001545601648013235,   0.001226696225277855,   0.0008704322683580222,
+     0.0005381636200535649,  0.000266446345425276,   7.002968738383528e-05,
+    -5.279407053811266e-05, -0.0001140625650874684, -0.0001304796361231895,
+    -0.0001189970287491285, -9.396247155265073e-05, -6.577634378272832e-05,
+    -4.07492895872535e-05,  -2.17407957554587e-05,  -9.163058931391722e-06,
+    -2.017460145032201e-06,  1.249721855219005e-06,  2.166655190537392e-06,
+     1.930520892991082e-06,  1.319400334374195e-06,  7.410039764949091e-07,
+     3.423230509967409e-07,  1.244182214744588e-07,  3.130441005359396e-08
+};
+
+static float ctables[CTABLES][256];
+#endif /* AVCODEC_DSD_TABLEGEN_H */
diff --git a/libavcodec/dsddec.c b/libavcodec/dsddec.c
new file mode 100644
index 0000000..880d691
--- /dev/null
+++ b/libavcodec/dsddec.c
@@ -0,0 +1,110 @@
+/*
+ * Direct Stream Digital (DSD) decoder
+ * based on BSD licensed dsd2pcm by Sebastian Gesemann
+ * Copyright (c) 2009, 2011 Sebastian Gesemann. All rights reserved.
+ * Copyright (c) 2014 Peter Ross
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Direct Stream Digital (DSD) decoder
+ */
+
+#include "libavcodec/internal.h"
+#include "libavcodec/mathops.h"
+#include "avcodec.h"
+#include "dsd.h"
+
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    DSDContext * s;
+    int i;
+
+    if (avctx->channels <= 0) /* decode_frame() divides by the channel count */
+        return AVERROR_INVALIDDATA;
+    ff_init_dsd_data();
+    s = av_malloc_array(sizeof(DSDContext), avctx->channels);
+    if (!s)
+        return AVERROR(ENOMEM);
+
+    for (i = 0; i < avctx->channels; i++) {
+        s[i].pos = 0;
+        memset(s[i].buf, 0x69, sizeof(s[i].buf));
+        /* 0x69 = 01101001
+         * This pattern "on repeat" makes a low energy 352.8 kHz tone
+         * and a high energy 1.0584 MHz tone which should be filtered
+         * out completely by any playback system --> silence
+         */
+    }
+
+    avctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
+    avctx->priv_data  = s; /* array with one DSDContext per channel */
+    return 0;
+}
+
+static int decode_frame(AVCodecContext *avctx, void *data,
+                        int *got_frame_ptr, AVPacket *avpkt)
+{
+    DSDContext * s = avctx->priv_data;
+    AVFrame *frame = data;
+    int ret, i;
+    int lsbf = avctx->codec_id == AV_CODEC_ID_DSD_LSBF || avctx->codec_id == AV_CODEC_ID_DSD_LSBF_PLANAR;
+    int src_next;   /* byte offset from one channel's first input byte to the next channel's */
+    int src_stride; /* byte step between successive input bytes of the same channel */
+    /* Convert one packet of DSD bytes into one float plane per channel. */
+    frame->nb_samples = avpkt->size / avctx->channels; /* packet bytes split evenly across channels */
+
+    if (avctx->codec_id == AV_CODEC_ID_DSD_LSBF_PLANAR || avctx->codec_id == AV_CODEC_ID_DSD_MSBF_PLANAR) {
+        src_next   = frame->nb_samples; /* planar: each channel's bytes are contiguous */
+        src_stride = 1;
+    } else {
+        src_next   = 1; /* otherwise the channels are interleaved byte by byte */
+        src_stride = avctx->channels;
+    }
+
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
+    for (i = 0; i < avctx->channels; i++) {
+        float * dst = ((float **)frame->extended_data)[i]; /* output plane of channel i */
+        ff_dsd2pcm_translate(&s[i], frame->nb_samples, lsbf,
+            avpkt->data + i * src_next, src_stride,
+            dst, 1);
+    }
+
+    *got_frame_ptr = 1;
+    return frame->nb_samples * avctx->channels; /* number of input bytes handed to the translator */
+}
+
+#define DSD_DECODER(id_, name_, long_name_) \
+AVCodec ff_##name_##_decoder = { \
+    .name         = #name_, \
+    .long_name    = NULL_IF_CONFIG_SMALL(long_name_), \
+    .type         = AVMEDIA_TYPE_AUDIO, \
+    .id           = AV_CODEC_ID_##id_, \
+    .init         = decode_init, \
+    .decode       = decode_frame, \
+    .sample_fmts  = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_FLTP, \
+                                                   AV_SAMPLE_FMT_NONE }, \
+};
+
+DSD_DECODER(DSD_LSBF, dsd_lsbf, "DSD (Direct Stream Digital), least significant bit first")
+DSD_DECODER(DSD_MSBF, dsd_msbf, "DSD (Direct Stream Digital), most significant bit first")
+DSD_DECODER(DSD_MSBF_PLANAR, dsd_msbf_planar, "DSD (Direct Stream Digital), most significant bit first, planar")
+DSD_DECODER(DSD_LSBF_PLANAR, dsd_lsbf_planar, "DSD (Direct Stream Digital), least significant bit first, planar")
diff --git a/libavcodec/dsicinaudio.c b/libavcodec/dsicinaudio.c
index e0fecbe..290dab4 100644
--- a/libavcodec/dsicinaudio.c
+++ b/libavcodec/dsicinaudio.c
@@ -2,20 +2,20 @@
  * Delphine Software International CIN audio decoder
  * Copyright (c) 2006 Gregory Montoir (cyx@users.sourceforge.net)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -98,10 +98,8 @@ static int cinaudio_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = avpkt->size - cin->initial_decode_frame;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples = (int16_t *)frame->data[0];
 
     delta = cin->delta;
diff --git a/libavcodec/dsicinvideo.c b/libavcodec/dsicinvideo.c
index 7c62dcf..f95cbc7 100644
--- a/libavcodec/dsicinvideo.c
+++ b/libavcodec/dsicinvideo.c
@@ -2,20 +2,20 @@
  * Delphine Software International CIN video decoder
  * Copyright (c) 2006 Gregory Montoir (cyx@users.sourceforge.net)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -42,10 +42,33 @@ typedef struct CinVideoContext {
     uint8_t *bitmap_table[3];
 } CinVideoContext;
 
+static av_cold void destroy_buffers(CinVideoContext *cin)
+{
+    int idx = 0;
+    /* hand each of the three bitmap table pointers to av_freep() */
+    while (idx < 3)
+        av_freep(&cin->bitmap_table[idx++]);
+}
+
+static av_cold int allocate_buffers(CinVideoContext *cin)
+{
+    int idx = 0;
+    /* three av_mallocz() tables of bitmap_size bytes; on failure all are released again */
+    while (idx < 3) {
+        uint8_t *table = av_mallocz(cin->bitmap_size);
+        cin->bitmap_table[idx++] = table;
+        if (table)
+            continue;
+        av_log(cin->avctx, AV_LOG_ERROR, "Can't allocate bitmap buffers.\n");
+        destroy_buffers(cin);
+        return AVERROR(ENOMEM);
+    }
+    return 0;
+}
+
 static av_cold int cinvideo_decode_init(AVCodecContext *avctx)
 {
     CinVideoContext *cin = avctx->priv_data;
-    unsigned int i;
 
     cin->avctx = avctx;
     avctx->pix_fmt = AV_PIX_FMT_PAL8;
@@ -55,11 +78,8 @@ static av_cold int cinvideo_decode_init(AVCodecContext *avctx)
         return AVERROR(ENOMEM);
 
     cin->bitmap_size = avctx->width * avctx->height;
-    for (i = 0; i < 3; ++i) {
-        cin->bitmap_table[i] = av_mallocz(cin->bitmap_size);
-        if (!cin->bitmap_table[i])
-            av_log(avctx, AV_LOG_ERROR, "Can't allocate bitmap buffers.\n");
-    }
+    if (allocate_buffers(cin))
+        return AVERROR(ENOMEM);
 
     return 0;
 }
@@ -141,27 +161,30 @@ static int cin_decode_lzss(const unsigned char *src, int src_size,
     return 0;
 }
 
-static void cin_decode_rle(const unsigned char *src, int src_size,
+static int cin_decode_rle(const unsigned char *src, int src_size,
                            unsigned char *dst, int dst_size)
 {
     int len, code;
     unsigned char *dst_end       = dst + dst_size;
     const unsigned char *src_end = src + src_size;
 
-    while (src < src_end && dst < dst_end) {
+    while (src + 1 < src_end && dst < dst_end) {
         code = *src++;
         if (code & 0x80) {
-            if (src >= src_end)
-                break;
             len = code - 0x7F;
             memset(dst, *src++, FFMIN(len, dst_end - dst));
         } else {
             len = code + 1;
+            if (len > src_end-src) {
+                av_log(NULL, AV_LOG_ERROR, "RLE overread\n");
+                return AVERROR_INVALIDDATA;
+            }
             memcpy(dst, src, FFMIN3(len, dst_end - dst, src_end - src));
             src += len;
         }
         dst += len;
     }
+    return 0;
 }
 
 static int cinvideo_decode_frame(AVCodecContext *avctx,
@@ -188,19 +211,17 @@ static int cinvideo_decode_frame(AVCodecContext *avctx,
         if (palette_colors_count > 256)
             return AVERROR_INVALIDDATA;
         for (i = 0; i < palette_colors_count; ++i) {
-            cin->palette[i]    = bytestream_get_le24(&buf);
+            cin->palette[i]    = 0xFFU << 24 | bytestream_get_le24(&buf);
             bitmap_frame_size -= 3;
         }
     } else {
         for (i = 0; i < palette_colors_count; ++i) {
-            cin->palette[buf[0]] = AV_RL24(buf + 1);
+            cin->palette[buf[0]] = 0xFFU << 24 | AV_RL24(buf + 1);
             buf                 += 4;
             bitmap_frame_size   -= 4;
         }
     }
 
-    bitmap_frame_size = FFMIN(cin->bitmap_size, bitmap_frame_size);
-
     /* note: the decoding routines below assumes that
      * surface.width = surface.pitch */
     switch (bitmap_frame_type) {
@@ -215,7 +236,7 @@ static int cinvideo_decode_frame(AVCodecContext *avctx,
                              cin->bitmap_table[CIN_CUR_BMP], cin->bitmap_size);
         break;
     case 35:
-        cin_decode_huffman(buf, bitmap_frame_size,
+        bitmap_frame_size = cin_decode_huffman(buf, bitmap_frame_size,
                            cin->bitmap_table[CIN_INT_BMP], cin->bitmap_size);
         cin_decode_rle(cin->bitmap_table[CIN_INT_BMP], bitmap_frame_size,
                        cin->bitmap_table[CIN_CUR_BMP], cin->bitmap_size);
@@ -251,11 +272,8 @@ static int cinvideo_decode_frame(AVCodecContext *avctx,
         break;
     }
 
-    if ((res = ff_reget_buffer(avctx, cin->frame)) < 0) {
-        av_log(cin->avctx, AV_LOG_ERROR,
-               "delphinecinvideo: reget_buffer() failed to allocate a frame\n");
+    if ((res = ff_reget_buffer(avctx, cin->frame)) < 0)
         return res;
-    }
 
     memcpy(cin->frame->data[1], cin->palette, sizeof(cin->palette));
     cin->frame->palette_has_changed = 1;
@@ -278,12 +296,10 @@ static int cinvideo_decode_frame(AVCodecContext *avctx,
 static av_cold int cinvideo_decode_end(AVCodecContext *avctx)
 {
     CinVideoContext *cin = avctx->priv_data;
-    int i;
 
     av_frame_free(&cin->frame);
 
-    for (i = 0; i < 3; ++i)
-        av_free(cin->bitmap_table[i]);
+    destroy_buffers(cin);
 
     return 0;
 }
diff --git a/libavcodec/dss_sp.c b/libavcodec/dss_sp.c
index 20b0528..ddea483 100644
--- a/libavcodec/dss_sp.c
+++ b/libavcodec/dss_sp.c
@@ -2,20 +2,20 @@
  * Digital Speech Standard - Standard Play mode (DSS SP) audio decoder.
  * Copyright (C) 2014 Oleksij Rempel <linux@rempel-privat.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,7 +33,7 @@
 
 #define DSS_SP_FRAME_SIZE        42
 #define DSS_SP_SAMPLE_COUNT     (66 * SUBFRAMES)
-#define DSS_SP_FORMULA(a, b, c) ((((a) << 15) + (b) * (c)) + 0x4000) >> 15
+#define DSS_SP_FORMULA(a, b, c) (((((a) << 15) + (b) * (c)) + 0x4000) >> 15)
 
 typedef struct DssSpSubframe {
     int16_t gain;
@@ -50,6 +50,7 @@ typedef struct DssSpFrame {
 } DssSpFrame;
 
 typedef struct DssSpContext {
+    AVCodecContext *avctx;
     int32_t excitation[288 + 6];
     int32_t history[187];
     DssSpFrame fparam;
@@ -296,6 +297,7 @@ static av_cold int dss_sp_decode_init(AVCodecContext *avctx)
 
     memset(p->history, 0, sizeof(p->history));
     p->pulse_dec_mode = 1;
+    p->avctx          = avctx;
 
     return 0;
 }
@@ -378,7 +380,7 @@ static void dss_sp_unpack_coeffs(DssSpContext *p, const uint8_t *src)
                 if (C72_binomials[index] <= combined_pulse_pos) {
                     combined_pulse_pos -= C72_binomials[index];
 
-                    fparam->sf[subframe_idx].pulse_pos[(index ^ 7) - 1] = i;
+                    fparam->sf[subframe_idx].pulse_pos[6 - index] = i;
 
                     if (!index)
                         break;
@@ -400,10 +402,15 @@ static void dss_sp_unpack_coeffs(DssSpContext *p, const uint8_t *src)
 
     combined_pitch /= 151;
 
-    for (i = 1; i < SUBFRAMES; i++) {
+    for (i = 1; i < SUBFRAMES - 1; i++) {
         fparam->pitch_lag[i] = combined_pitch % 48;
         combined_pitch      /= 48;
     }
+    if (combined_pitch > 47) {
+        av_log (p->avctx, AV_LOG_WARNING, "combined_pitch was too large\n");
+        combined_pitch = 0;
+    }
+    fparam->pitch_lag[i] = combined_pitch;
 
     pitch_lag = fparam->pitch_lag[0];
     for (i = 1; i < SUBFRAMES; i++) {
@@ -754,10 +761,8 @@ static int dss_sp_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     frame->nb_samples = DSS_SP_SAMPLE_COUNT;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed.\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     out = (int16_t *)frame->data[0];
 
diff --git a/libavcodec/dstdec.c b/libavcodec/dstdec.c
new file mode 100644
index 0000000..368cb64
--- /dev/null
+++ b/libavcodec/dstdec.c
@@ -0,0 +1,374 @@
+/*
+ * Direct Stream Transfer (DST) decoder
+ * Copyright (c) 2014 Peter Ross <pross@xvid.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Direct Stream Transfer (DST) decoder
+ * ISO/IEC 14496-3 Part 3 Subpart 10: Technical description of lossless coding of oversampled audio
+ */
+
+#include "libavutil/avassert.h"
+#include "libavutil/intreadwrite.h"
+#include "internal.h"
+#include "get_bits.h"
+#include "avcodec.h"
+#include "golomb.h"
+#include "mathops.h"
+#include "dsd.h"
+
+#define DST_MAX_CHANNELS 6
+#define DST_MAX_ELEMENTS (2 * DST_MAX_CHANNELS)
+/* sample_rate * 8 / 44100; the argument is parenthesized so compound expressions expand safely */
+#define DSD_FS44(sample_rate) ((sample_rate) * 8 / 44100)
+/* number of 1-bit samples decoded per frame (the outer loop bound in decode_frame()) */
+#define DST_SAMPLES_PER_FRAME(sample_rate) (588 * DSD_FS44(sample_rate))
+
+static const int8_t fsets_code_pred_coeff[3][3] = { /* read_table() predictor weights for the filter coefficient sets */
+    {  -8 },         /* method 0: weight of the previous coefficient */
+    { -16,  8 },     /* method 1: weights of the previous two coefficients, nearest first */
+    {  -9, -5, 6 },  /* method 2: weights of the previous three coefficients, nearest first */
+};
+
+static const int8_t probs_code_pred_coeff[3][3] = { /* read_table() predictor weights for the probability tables */
+    {  -8 },          /* method 0: weight of the previous coefficient */
+    { -16,  8 },      /* method 1: weights of the previous two coefficients, nearest first */
+    { -24, 24, -8 },  /* method 2: weights of the previous three coefficients, nearest first */
+};
+
+typedef struct ArithCoder {
+    unsigned int a; /* range register: 4095 after ac_init(), rescaled by ac_get() whenever it drops below 2048 */
+    unsigned int c; /* code register: 12 bits read by ac_init(), refilled from the bitstream by ac_get() */
+} ArithCoder;
+
+typedef struct Table {
+    unsigned int elements;                 /* number of sets in use; set by read_map() or copied in decode_frame() */
+    unsigned int length[DST_MAX_ELEMENTS]; /* coefficient count of each set; written by read_table() */
+    int coeff[DST_MAX_ELEMENTS][128];      /* coefficients of each set; written by read_table() */
+} Table;
+
+typedef struct DSTContext { /* private decoder state, reached through avctx->priv_data */
+    AVClass *class;
+
+    GetBitContext gb;   /* bit reader, re-initialised on every packet in decode_frame() */
+    ArithCoder ac;      /* arithmetic decoder registers */
+    Table fsets, probs; /* filter coefficient sets and probability tables parsed from the frame */
+    DECLARE_ALIGNED(64, uint8_t, status)[DST_MAX_CHANNELS][16]; /* per-channel 16-byte history of decoded bits */
+    DECLARE_ALIGNED(16, int16_t, filter)[DST_MAX_ELEMENTS][16][256]; /* lookup tables filled by build_filter() */
+    DSDContext dsdctx[DST_MAX_CHANNELS]; /* per-channel state handed to ff_dsd2pcm_translate() */
+} DSTContext;
+
+/* Decoder init: reject unsupported channel counts, select float output and prime per-channel DSD state. */
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    DSTContext *ctx       = avctx->priv_data;
+    const int nb_channels = avctx->channels;
+    int ch;
+
+    if (nb_channels > DST_MAX_CHANNELS) {
+        avpriv_request_sample(avctx, "Channel count %d", nb_channels);
+        return AVERROR_PATCHWELCOME;
+    }
+    avctx->sample_fmt = AV_SAMPLE_FMT_FLT;
+    for (ch = 0; ch < nb_channels; ch++) {
+        memset(ctx->dsdctx[ch].buf, 0x69, sizeof(ctx->dsdctx[ch].buf));
+    }
+    ff_init_dsd_data();
+
+    return 0;
+}
+
+/* Read the channel -> element map; a set leading flag maps every channel to element 0. */
+static int read_map(GetBitContext *gb, Table *t, unsigned int map[DST_MAX_CHANNELS], int channels)
+{
+    int ch;
+
+    t->elements = 1;
+    map[0]      = 0;
+    if (get_bits1(gb)) {
+        memset(map, 0, sizeof(*map) * DST_MAX_CHANNELS);
+        return 0;
+    }
+    for (ch = 1; ch < channels; ch++) {
+        /* an index either names an element already seen or introduces the next new one */
+        const unsigned int idx = get_bits(gb, av_log2(t->elements) + 1);
+        map[ch] = idx;
+        if (idx > t->elements)
+            return AVERROR_INVALIDDATA;
+        if (idx == t->elements && ++t->elements >= DST_MAX_ELEMENTS)
+            return AVERROR_INVALIDDATA;
+    }
+    return 0;
+}
+
+/* Golomb-coded magnitude followed by a sign bit that is only read for non-zero magnitudes. */
+static av_always_inline int get_sr_golomb_dst(GetBitContext *gb, unsigned int k)
+{
+    const int magnitude = get_ur_golomb(gb, k, get_bits_left(gb), 0);
+    const int negative  = magnitude && get_bits1(gb);
+    return negative ? -magnitude : magnitude;
+}
+
+/* Read `elements` raw coefficients of coeff_bits each (sign-extended when is_signed) and add offset. */
+static void read_uncoded_coeff(GetBitContext *gb, int *dst, unsigned int elements,
+                               int coeff_bits, int is_signed, int offset)
+{
+    int *const end = dst + elements;
+
+    while (dst < end)
+        *dst++ = (is_signed ? get_sbits(gb, coeff_bits) : get_bits(gb, coeff_bits)) + offset;
+}
+
+/* Parse t->elements coefficient sets: a length, then either raw coefficients or method + 1 raw seeds
+ * followed by Golomb-coded values corrected by a weighted sum of the preceding coefficients. */
+static int read_table(GetBitContext *gb, Table *t, const int8_t code_pred_coeff[3][3],
+                      int length_bits, int coeff_bits, int is_signed, int offset)
+{
+    unsigned int i, j, k;
+    for (i = 0; i < t->elements; i++) {
+        t->length[i] = get_bits(gb, length_bits) + 1;
+        if (!get_bits1(gb)) { /* flag clear: all coefficients of this set are stored raw */
+            read_uncoded_coeff(gb, t->coeff[i], t->length[i], coeff_bits, is_signed, offset);
+        } else {
+            int method = get_bits(gb, 2), lsb_size;
+            if (method == 3) /* code_pred_coeff only has rows for methods 0..2 */
+                return AVERROR_INVALIDDATA;
+            read_uncoded_coeff(gb, t->coeff[i], method + 1, coeff_bits, is_signed, offset);
+            lsb_size  = get_bits(gb, 3); /* Golomb parameter used for the rest of this set */
+            for (j = method + 1; j < t->length[i]; j++) {
+                int c, x = 0;
+                for (k = 0; k < method + 1; k++) /* weighted sum over the method + 1 previous coefficients */
+                    x += code_pred_coeff[method][k] * t->coeff[i][j - k - 1];
+                c = get_sr_golomb_dst(gb, lsb_size);
+                if (x >= 0) /* subtract x / 8, rounded, with separate rounding terms per sign */
+                    c -= (x + 4) / 8;
+                else
+                    c += (-x + 3) / 8;
+                t->coeff[i][j] = c;
+            }
+        }
+    }
+    return 0; /* the only failure path above is method == 3 */
+}
+
+static void ac_init(ArithCoder *ac, GetBitContext *gb)
+{
+    ac->c = get_bits(gb, 12); /* seed the code register with the next 12 bits */
+    ac->a = 4095;             /* start from the full 12-bit range */
+}
+
+/* Decode one binary decision into *e using weight p, then rescale the coder registers from gb. */
+static av_always_inline void ac_get(ArithCoder *ac, GetBitContext *gb, int p, int *e)
+{
+    unsigned int k = (ac->a >> 8) | ((ac->a >> 7) & 1); /* a >> 8 with bit 7 of a OR-ed into the LSB */
+    unsigned int q = k * p;                             /* share of the range scaled by p */
+    unsigned int a_q = ac->a - q;                       /* remaining share of the range */
+    *e = ac->c < a_q;                                   /* 1 when the code register lies below a - q */
+    if (*e) {
+        ac->a  = a_q;
+    } else {
+        ac->a  = q;
+        ac->c -= a_q;
+    }
+
+    if (ac->a < 2048) { /* shift a left until its top set bit reaches bit 11, feeding as many new bits into c */
+        int n = 11 - av_log2(ac->a);
+        ac->a <<= n;
+        ac->c = (ac->c << n) | get_bits(gb, n);
+    }
+}
+
+static uint8_t prob_dst_x_bit(int c)
+{
+    return 1 + (ff_reverse[c & 127] >> 1); /* table lookup on the low 7 bits of c (presumably bit reversal), halved, plus 1 */
+}
+
+/* Precompute, per element and per group of 8 taps, the signed tap sum for each of the 256 byte patterns. */
+static void build_filter(int16_t table[DST_MAX_ELEMENTS][16][256], const Table *fsets)
+{
+    int elem, group, pattern, bit;
+
+    for (elem = 0; elem < fsets->elements; elem++) {
+        const int  length = fsets->length[elem];
+        const int *coeffs = fsets->coeff[elem];
+        for (group = 0; group < 16; group++, coeffs += 8) {
+            /* taps of this group that exist: 0 once past the filter length, at most 8 */
+            const int taps = av_clip(length - group * 8, 0, 8);
+            for (pattern = 0; pattern < 256; pattern++) {
+                int sum = 0;
+                for (bit = 0; bit < taps; bit++) /* a set bit adds the tap, a clear bit subtracts it */
+                    sum += (((pattern >> bit) & 1) ? 1 : -1) * coeffs[bit];
+                table[elem][group][pattern] = sum;
+            }
+        }
+    }
+}
+
+/**
+ * Decode one DST packet: copy a stored DSD payload or decode the arithmetic-coded frame,
+ * then convert the DSD bits to float PCM. Returns avpkt->size, or a negative AVERROR code.
+ */
+static int decode_frame(AVCodecContext *avctx, void *data,
+                        int *got_frame_ptr, AVPacket *avpkt)
+{
+    unsigned samples_per_frame = DST_SAMPLES_PER_FRAME(avctx->sample_rate);
+    unsigned map_ch_to_felem[DST_MAX_CHANNELS];
+    unsigned map_ch_to_pelem[DST_MAX_CHANNELS];
+    unsigned i, ch, same_map;
+    unsigned half_prob[DST_MAX_CHANNELS];
+    const int channels = avctx->channels;
+    DSTContext *s = avctx->priv_data;
+    GetBitContext *gb = &s->gb;
+    ArithCoder *ac = &s->ac;
+    AVFrame *frame = data;
+    uint8_t *dsd;
+    float *pcm;
+    int ret, dst_x_bit; /* dst_x_bit is int because ac_get() writes through an int pointer */
+
+    if (avpkt->size <= 1)
+        return AVERROR_INVALIDDATA;
+
+    frame->nb_samples = samples_per_frame / 8;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+    dsd = frame->data[0];
+    pcm = (float *)frame->data[0];
+
+    if ((ret = init_get_bits8(gb, avpkt->data, avpkt->size)) < 0)
+        return ret;
+
+    if (!get_bits1(gb)) { /* first bit clear: payload is stored, skip the DST decoding stage */
+        skip_bits1(gb);
+        if (get_bits(gb, 6))
+            return AVERROR_INVALIDDATA;
+        memcpy(frame->data[0], avpkt->data + 1, FFMIN(avpkt->size - 1, frame->nb_samples * avctx->channels));
+        goto dsd;
+    }
+
+    /* Segmentation (10.4, 10.5, 10.6) */
+    if (!get_bits1(gb)) {
+        avpriv_request_sample(avctx, "Not Same Segmentation");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    if (!get_bits1(gb)) {
+        avpriv_request_sample(avctx, "Not Same Segmentation For All Channels");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    if (!get_bits1(gb)) {
+        avpriv_request_sample(avctx, "Not End Of Channel Segmentation");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    /* Mapping (10.7, 10.8, 10.9) */
+    same_map = get_bits1(gb);
+
+    if ((ret = read_map(gb, &s->fsets, map_ch_to_felem, avctx->channels)) < 0)
+        return ret;
+
+    if (same_map) {
+        s->probs.elements = s->fsets.elements;
+        memcpy(map_ch_to_pelem, map_ch_to_felem, sizeof(map_ch_to_felem));
+    } else {
+        avpriv_request_sample(avctx, "Not Same Mapping");
+        if ((ret = read_map(gb, &s->probs, map_ch_to_pelem, avctx->channels)) < 0)
+            return ret;
+    }
+
+    /* Half Probability (10.10) */
+    for (ch = 0; ch < avctx->channels; ch++)
+        half_prob[ch] = get_bits1(gb);
+
+    /* Filter Coef Sets (10.12); a rejected table must not reach the decoding loop */
+    if ((ret = read_table(gb, &s->fsets, fsets_code_pred_coeff, 7, 9, 1, 0)) < 0)
+        return ret;
+
+    /* Probability Tables (10.13) */
+    if ((ret = read_table(gb, &s->probs, probs_code_pred_coeff, 6, 7, 0, 1)) < 0)
+        return ret;
+
+    /* Arithmetic Coded Data (10.11) */
+    if (get_bits1(gb))
+        return AVERROR_INVALIDDATA;
+    ac_init(ac, gb);
+
+    build_filter(s->filter, &s->fsets);
+
+    memset(s->status, 0xAA, sizeof(s->status));
+    memset(dsd, 0, frame->nb_samples * 4 * avctx->channels);
+
+    ac_get(ac, gb, prob_dst_x_bit(s->fsets.coeff[0][0]), &dst_x_bit);
+
+    for (i = 0; i < samples_per_frame; i++) {
+        for (ch = 0; ch < channels; ch++) {
+            const unsigned felem = map_ch_to_felem[ch];
+            int16_t (*filter)[256] = s->filter[felem];
+            uint8_t *status = s->status[ch];
+            int prob, residual, v;
+
+#define F(x) filter[(x)][status[(x)]]
+            const int16_t predict = F( 0) + F( 1) + F( 2) + F( 3) +
+                                    F( 4) + F( 5) + F( 6) + F( 7) +
+                                    F( 8) + F( 9) + F(10) + F(11) +
+                                    F(12) + F(13) + F(14) + F(15);
+#undef F
+
+            if (!half_prob[ch] || i >= s->fsets.length[felem]) {
+                unsigned pelem = map_ch_to_pelem[ch];
+                unsigned index = FFABS(predict) >> 3;
+                prob = s->probs.coeff[pelem][FFMIN(index, s->probs.length[pelem] - 1)];
+            } else {
+                prob = 128;
+            }
+
+            ac_get(ac, gb, prob, &residual);
+            v = ((predict >> 15) ^ residual) & 1; /* sign bit of the prediction, flipped by the decoded residual */
+            dsd[((i >> 3) * channels + ch) << 2] |= v << (7 - (i & 0x7 ));
+
+            AV_WN64A(status + 8, (AV_RN64A(status + 8) << 1) | ((AV_RN64A(status) >> 63) & 1)); /* carry into the upper word */
+            AV_WN64A(status, (AV_RN64A(status) << 1) | v); /* shift the new bit into the 128-bit channel history */
+        }
+    }
+
+dsd:
+    for (i = 0; i < avctx->channels; i++) {
+        ff_dsd2pcm_translate(&s->dsdctx[i], frame->nb_samples, 0,
+                             frame->data[0] + i * 4,
+                             avctx->channels * 4, pcm + i, avctx->channels);
+    }
+
+    *got_frame_ptr = 1;
+
+    return avpkt->size;
+}
+
+AVCodec ff_dst_decoder = { /* decoder registration for AV_CODEC_ID_DST */
+    .name           = "dst",
+    .long_name      = NULL_IF_CONFIG_SMALL("DST (Direct Stream Transfer)"), /* same name as in the file header */
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_DST,
+    .priv_data_size = sizeof(DSTContext),
+    .init           = decode_init,
+    .decode         = decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLT, /* the format decode_init() selects */
+                                                      AV_SAMPLE_FMT_NONE },
+};
diff --git a/libavcodec/dump_extradata_bsf.c b/libavcodec/dump_extradata_bsf.c
index c960d6a..fa7bc86 100644
--- a/libavcodec/dump_extradata_bsf.c
+++ b/libavcodec/dump_extradata_bsf.c
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -81,7 +81,9 @@ fail:
 static const AVOption options[] = {
     { "freq", "When do dump extradata", OFFSET(freq), AV_OPT_TYPE_INT,
         { .i64 = DUMP_FREQ_KEYFRAME }, DUMP_FREQ_KEYFRAME, DUMP_FREQ_ALL, 0, "freq" },
+        { "k",        NULL, 0, AV_OPT_TYPE_CONST, { .i64 = DUMP_FREQ_KEYFRAME }, .unit = "freq" },
         { "keyframe", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = DUMP_FREQ_KEYFRAME }, .unit = "freq" },
+        { "e",        NULL, 0, AV_OPT_TYPE_CONST, { .i64 = DUMP_FREQ_ALL      }, .unit = "freq" },
         { "all",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = DUMP_FREQ_ALL      }, .unit = "freq" },
     { NULL },
 };
diff --git a/libavcodec/dv.c b/libavcodec/dv.c
index 81d28d1..31d1315 100644
--- a/libavcodec/dv.c
+++ b/libavcodec/dv.c
@@ -16,20 +16,20 @@
  * Many thanks to Dan Dennedy <dan@dennedy.org> for providing wealth
  * of DV technical info.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -50,7 +50,7 @@
 #include "simple_idct.h"
 
 /* XXX: also include quantization */
-RL_VLC_ELEM ff_dv_rl_vlc[1184];
+RL_VLC_ELEM ff_dv_rl_vlc[1664];
 
 static inline void dv_calc_mb_coordinates(const AVDVProfile *d, int chan,
                                           int seq, int slot, uint16_t *tbl)
@@ -173,20 +173,9 @@ static inline void dv_calc_mb_coordinates(const AVDVProfile *d, int chan,
     }
 }
 
-/* quantization quanta by QNO for DV100 */
-static const uint8_t dv100_qstep[16] = {
-    1, /* QNO = 0 and 1 both have no quantization */
-    1,
-    2, 3, 4, 5, 6, 7, 8, 16, 18, 20, 22, 24, 28, 52
-};
-
-static const uint8_t dv_quant_areas[4] = { 6, 21, 43, 64 };
-
 int ff_dv_init_dynamic_tables(DVVideoContext *ctx, const AVDVProfile *d)
 {
     int j, i, c, s, p;
-    uint32_t *factor1, *factor2;
-    const int *iweight1, *iweight2;
 
     p = i = 0;
     for (c = 0; c < d->n_difchan; c++) {
@@ -204,38 +193,6 @@ int ff_dv_init_dynamic_tables(DVVideoContext *ctx, const AVDVProfile *d)
         }
     }
 
-    factor1 = &ctx->idct_factor[0];
-    factor2 = &ctx->idct_factor[DV_PROFILE_IS_HD(d) ? 4096 : 2816];
-    if (d->height == 720) {
-        iweight1 = &ff_dv_iweight_720_y[0];
-        iweight2 = &ff_dv_iweight_720_c[0];
-    } else {
-        iweight1 = &ff_dv_iweight_1080_y[0];
-        iweight2 = &ff_dv_iweight_1080_c[0];
-    }
-    if (DV_PROFILE_IS_HD(d)) {
-        for (c = 0; c < 4; c++) {
-            for (s = 0; s < 16; s++) {
-                for (i = 0; i < 64; i++) {
-                    *factor1++ = (dv100_qstep[s] << (c + 9)) * iweight1[i];
-                    *factor2++ = (dv100_qstep[s] << (c + 9)) * iweight2[i];
-                }
-            }
-        }
-    } else {
-        iweight1 = &ff_dv_iweight_88[0];
-        for (j = 0; j < 2; j++, iweight1 = &ff_dv_iweight_248[0]) {
-            for (s = 0; s < 22; s++) {
-                for (i = c = 0; c < 4; c++) {
-                    for (; i < dv_quant_areas[c]; i++) {
-                        *factor1   = iweight1[i] << (ff_dv_quant_shifts[s][c] + 1);
-                        *factor2++ = (*factor1++) << 1;
-                    }
-                }
-            }
-        }
-    }
-
     return 0;
 }
 
@@ -277,7 +234,7 @@ av_cold int ff_dvvideo_init(AVCodecContext *avctx)
          * to accelerate the parsing of partial codes */
         init_vlc(&dv_vlc, TEX_VLC_BITS, j, new_dv_vlc_len,
                  1, 1, new_dv_vlc_bits, 2, 2, 0);
-        assert(dv_vlc.table_size == 1184);
+        av_assert1(dv_vlc.table_size == 1664);
 
         for (i = 0; i < dv_vlc.table_size; i++) {
             int code = dv_vlc.table[i][0];
@@ -303,3 +260,4 @@ av_cold int ff_dvvideo_init(AVCodecContext *avctx)
 
     return 0;
 }
+
diff --git a/libavcodec/dv.h b/libavcodec/dv.h
index d032405..af506eb 100644
--- a/libavcodec/dv.h
+++ b/libavcodec/dv.h
@@ -2,20 +2,20 @@
  * Constants for DV codec
  * Copyright (c) 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -45,7 +45,7 @@ typedef struct DVVideoContext {
 
     uint8_t dv_zigzag[2][64];
 
-    void (*get_pixels)(int16_t *block, const uint8_t *pixels, int line_size);
+    void (*get_pixels)(int16_t *block, const uint8_t *pixels, ptrdiff_t line_size);
     void (*fdct[2])(int16_t *block);
     void (*idct_put[2])(uint8_t *dest, int line_size, int16_t *block);
     me_cmp_func ildct_cmp;
@@ -80,10 +80,6 @@ enum dv_pack_type {
 #define DV_PROFILE_IS_1080i50(p) (((p)->video_stype == 0x14) && ((p)->dsf == 1))
 #define DV_PROFILE_IS_720p50(p)  (((p)->video_stype == 0x18) && ((p)->dsf == 1))
 
-/* minimum number of bytes to read from a DV stream in order to
- * determine the profile */
-#define DV_PROFILE_BYTES (6 * 80) /* 6 DIF blocks */
-
 /**
  * largest possible DV frame, in bytes (1080i50)
  */
@@ -94,11 +90,12 @@ enum dv_pack_type {
  */
 #define DV_MAX_BPM 8
 
-#define TEX_VLC_BITS 9
+#define TEX_VLC_BITS 10
 
-extern RL_VLC_ELEM ff_dv_rl_vlc[1184];
+extern RL_VLC_ELEM ff_dv_rl_vlc[1664];
 
 int ff_dv_init_dynamic_tables(DVVideoContext *s, const AVDVProfile *d);
+
 int ff_dvvideo_init(AVCodecContext *avctx);
 
 static inline int dv_work_pool_size(const AVDVProfile *d)
diff --git a/libavcodec/dv_profile.c b/libavcodec/dv_profile.c
index 74c529d..66505c8 100644
--- a/libavcodec/dv_profile.c
+++ b/libavcodec/dv_profile.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -21,6 +21,7 @@
 #include <stdint.h>
 
 #include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
 #include "libavutil/log.h"
 #include "libavutil/pixdesc.h"
 
@@ -256,22 +257,30 @@ void ff_dv_print_profiles(void *logctx, int loglevel)
 
 #endif /* CONFIG_DVPROFILE */
 
-const AVDVProfile *av_dv_frame_profile(const AVDVProfile *sys,
+const AVDVProfile* ff_dv_frame_profile(AVCodecContext* codec, const AVDVProfile *sys,
                                        const uint8_t *frame, unsigned buf_size)
 {
 #if CONFIG_DVPROFILE
     int i, dsf, stype;
 
-    if (buf_size < 80 * 5 + 48 + 4)
+    if(buf_size < DV_PROFILE_BYTES)
         return NULL;
 
     dsf   = (frame[3] & 0x80) >> 7;
     stype = frame[80 * 5 + 48 + 3] & 0x1f;
 
     /* 576i50 25Mbps 4:1:1 is a special case */
-    if (dsf == 1 && stype == 0 && frame[4] & 0x07 /* the APT field */)
+    if ((dsf == 1 && stype == 0 && frame[4] & 0x07 /* the APT field */) ||
+        (stype == 31 && codec && codec->codec_tag==AV_RL32("SL25") && codec->coded_width==720 && codec->coded_height==576))
         return &dv_profiles[2];
 
+    if(   stype == 0
+       && codec
+       && (codec->codec_tag==AV_RL32("dvsd") || codec->codec_tag==AV_RL32("CDVC"))
+       && codec->coded_width ==720
+       && codec->coded_height==576)
+        return &dv_profiles[1];
+
     for (i = 0; i < FF_ARRAY_ELEMS(dv_profiles); i++)
         if (dsf == dv_profiles[i].dsf && stype == dv_profiles[i].video_stype)
             return &dv_profiles[i];
@@ -279,23 +288,54 @@ const AVDVProfile *av_dv_frame_profile(const AVDVProfile *sys,
     /* check if old sys matches and assumes corrupted input */
     if (sys && buf_size == sys->frame_size)
         return sys;
+
+    /* hack for trac issue #217, dv files created with QuickTime 3 */
+    if ((frame[3] & 0x7f) == 0x3f && frame[80 * 5 + 48 + 3] == 0xff)
+        return &dv_profiles[dsf];
 #endif
 
     return NULL;
 }
 
+const AVDVProfile *av_dv_frame_profile(const AVDVProfile *sys,
+                                       const uint8_t *frame, unsigned buf_size)
+{
+    return ff_dv_frame_profile(NULL, sys, frame, buf_size); /* NULL codec: the checks guarded by `codec &&` never match */
+}
+
 const AVDVProfile *av_dv_codec_profile(int width, int height,
                                        enum AVPixelFormat pix_fmt)
 {
 #if CONFIG_DVPROFILE
+    return av_dv_codec_profile2(width, height, pix_fmt, (AVRational){0, 0});
+#endif
+
+    return NULL;
+}
+
+const AVDVProfile *av_dv_codec_profile2(int width, int height,
+                                       enum AVPixelFormat pix_fmt,
+                                       AVRational frame_rate)
+{
+    const AVDVProfile *p = NULL;
+#if CONFIG_DVPROFILE
     int i;
+    /* frame rate is necessary to select between 720p50 and 720p60 profiles */
+    int invalid_framerate = frame_rate.num == 0 || frame_rate.den == 0;
 
     for (i = 0; i < FF_ARRAY_ELEMS(dv_profiles); i++)
         if (height  == dv_profiles[i].height  &&
             pix_fmt == dv_profiles[i].pix_fmt &&
             width   == dv_profiles[i].width)
-            return &dv_profiles[i];
+        {
+            if( invalid_framerate || av_div_q(dv_profiles[i].time_base, frame_rate).num == 1 )
+                return &dv_profiles[i];
+
+            if(!p)
+                p = &dv_profiles[i];
+        }
 #endif
 
-    return NULL;
+    return p;
 }
+
diff --git a/libavcodec/dv_profile.h b/libavcodec/dv_profile.h
index 5ad7b4f..9380a66 100644
--- a/libavcodec/dv_profile.h
+++ b/libavcodec/dv_profile.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,11 @@
 #include "libavutil/rational.h"
 #include "avcodec.h"
 
+/* minimum number of bytes to read from a DV stream in order to
+ * determine the profile */
+#define DV_PROFILE_BYTES (6 * 80) /* 6 DIF blocks */
+
+
 /*
  * AVDVProfile is used to express the differences between various
  * DV flavors. For now it's primarily used for differentiating
@@ -69,4 +74,10 @@ const AVDVProfile *av_dv_frame_profile(const AVDVProfile *sys,
  */
 const AVDVProfile *av_dv_codec_profile(int width, int height, enum AVPixelFormat pix_fmt);
 
+/**
+ * Get a DV profile for the provided stream parameters.
+ * The frame rate is used as a best-effort parameter.
+ */
+const AVDVProfile *av_dv_codec_profile2(int width, int height, enum AVPixelFormat pix_fmt, AVRational frame_rate);
+
 #endif /* AVCODEC_DV_PROFILE_H */
diff --git a/libavcodec/dv_profile_internal.h b/libavcodec/dv_profile_internal.h
index f93e7ca..67d3a2b 100644
--- a/libavcodec/dv_profile_internal.h
+++ b/libavcodec/dv_profile_internal.h
@@ -1,27 +1,35 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_DV_PROFILE_INTERNAL_H
 #define AVCODEC_DV_PROFILE_INTERNAL_H
 
+#include "dv_profile.h"
+
 /**
  *  Print all allowed DV profiles into logctx at specified logging level.
  */
 void ff_dv_print_profiles(void *logctx, int loglevel);
 
+/**
+ * Get a DV profile for the provided compressed frame.
+ */
+const AVDVProfile* ff_dv_frame_profile(AVCodecContext* codec, const AVDVProfile *sys,
+                                       const uint8_t *frame, unsigned buf_size);
+
 #endif /* AVCODEC_DV_PROFILE_INTERNAL_H */
diff --git a/libavcodec/dv_tablegen.c b/libavcodec/dv_tablegen.c
index 9b2b954..d032101 100644
--- a/libavcodec/dv_tablegen.c
+++ b/libavcodec/dv_tablegen.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 #include <inttypes.h>
 
 WRITE_1D_FUNC_ARGV(dv_vlc_pair, 7,
-                   "{0x%"PRIx32", %"PRId8"}", data[i].vlc, data[i].size)
+                   "{0x%"PRIx32", %"PRIu32"}", data[i].vlc, data[i].size)
 WRITE_2D_FUNC(dv_vlc_pair)
 
 int main(void)
diff --git a/libavcodec/dv_tablegen.h b/libavcodec/dv_tablegen.h
index b69721b..941b557 100644
--- a/libavcodec/dv_tablegen.h
+++ b/libavcodec/dv_tablegen.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@
 #define AVCODEC_DV_TABLEGEN_H
 
 #include <stdint.h>
+#include "libavutil/attributes.h"
 
 #include "dvdata.h"
 
@@ -47,7 +48,7 @@ typedef struct dv_vlc_pair {
 #else
 static struct dv_vlc_pair dv_vlc_map[DV_VLC_MAP_RUN_SIZE][DV_VLC_MAP_LEV_SIZE];
 
-static void dv_vlc_map_tableinit(void)
+static av_cold void dv_vlc_map_tableinit(void)
 {
     int i, j;
     for (i = 0; i < NB_DV_VLC - 1; i++) {
diff --git a/libavcodec/dvaudio.h b/libavcodec/dvaudio.h
new file mode 100644
index 0000000..e7f70c5
--- /dev/null
+++ b/libavcodec/dvaudio.h
@@ -0,0 +1,39 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DVAUDIO_H
+#define AVCODEC_DVAUDIO_H
+
+#include <stdint.h>
+/* Audio sample count of one frame: a per-mode minimum plus the 6-bit excess stored in buffer[0]. */
+static inline int dv_get_audio_sample_count(const uint8_t *buffer, int dsf)
+{
+    int samples = buffer[0] & 0x3f; /* samples in this frame - min samples */
+
+    switch ((buffer[3] >> 3) & 0x07) { /* 3-bit mode code; presumably the sampling frequency -- TODO confirm */
+    case 0:
+        return samples + (dsf ? 1896 : 1580); /* nonzero dsf selects the larger minimum of each pair */
+    case 1:
+        return samples + (dsf ? 1742 : 1452);
+    case 2:
+    default: /* codes 2..7 all use the smallest minimum */
+        return samples + (dsf ? 1264 : 1053);
+    }
+}
+
+#endif /* AVCODEC_DVAUDIO_H */
diff --git a/libavcodec/dvaudio_parser.c b/libavcodec/dvaudio_parser.c
new file mode 100644
index 0000000..160faaf
--- /dev/null
+++ b/libavcodec/dvaudio_parser.c
@@ -0,0 +1,46 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Ulead DV audio parser
+ *
+ * Determines the duration for each packet.
+ */
+
+#include "parser.h"
+#include "dvaudio.h"
+/* Parser callback: derive the packet duration from its audio sample count; the packet is passed through as is. */
+static int dvaudio_parse(AVCodecParserContext *s1, AVCodecContext *avctx,
+                        const uint8_t **poutbuf, int *poutbuf_size,
+                        const uint8_t *buf, int buf_size)
+{
+    if (buf_size >= 248) /* the helper reads buf[244] and buf[247]; shorter packets leave s1->duration untouched */
+        s1->duration = dv_get_audio_sample_count(buf + 244, avctx->block_align == 8640);
+
+    /* always return the full packet. this parser isn't doing any splitting or
+       combining, only packet analysis */
+    *poutbuf      = buf;
+    *poutbuf_size = buf_size;
+    return buf_size;
+}
+/* Parser descriptor for AV_CODEC_ID_DVAUDIO; only the parse callback is provided. */
+AVCodecParser ff_dvaudio_parser = {
+    .codec_ids      = { AV_CODEC_ID_DVAUDIO },
+    .parser_parse   = dvaudio_parse,
+};
diff --git a/libavcodec/dvaudiodec.c b/libavcodec/dvaudiodec.c
new file mode 100644
index 0000000..5aa2a95
--- /dev/null
+++ b/libavcodec/dvaudiodec.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2012 Laurent Aimar
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intreadwrite.h"
+#include "avcodec.h"
+#include "internal.h"
+#include "dvaudio.h"
+/* Private state of the DV audio decoder, filled in by decode_init(). */
+typedef struct DVAudioContext {
+    int block_size;        /* packet size in bytes: 7200 or 8640 */
+    int is_12bit;          /* nonzero when bits_per_coded_sample is 12 */
+    int is_pal;            /* nonzero when block_size is 8640 */
+    int16_t shuffle[2000]; /* shuffle[i]: byte offset of sample i within a packet */
+} DVAudioContext;
+/* Validate the stream parameters, derive the block size and precompute the shuffle table. */
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    DVAudioContext *s = avctx->priv_data;
+    int i;
+
+    if (avctx->channels != 2) {
+        av_log(avctx, AV_LOG_ERROR, "invalid number of channels\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (avctx->codec_tag == 0x0215) { /* the codec tag takes precedence over block_align */
+        s->block_size = 7200;
+    } else if (avctx->codec_tag == 0x0216) {
+        s->block_size = 8640;
+    } else if (avctx->block_align == 7200 ||
+               avctx->block_align == 8640) {
+        s->block_size = avctx->block_align;
+    } else {
+        return AVERROR(EINVAL); /* block size unknown; note that nothing is logged on this path */
+    }
+
+    s->is_pal = s->block_size == 8640;
+    s->is_12bit = avctx->bits_per_coded_sample == 12;
+    avctx->sample_fmt = AV_SAMPLE_FMT_S16;
+    avctx->channel_layout = AV_CH_LAYOUT_STEREO;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(s->shuffle); i++) { /* byte offset for every possible sample index */
+        const unsigned a = s->is_pal ? 18 : 15;
+        const unsigned b = 3 * a;
+
+        s->shuffle[i] = 80 * ((21 * (i % 3) + 9 * (i / 3) + ((i / a) % 3)) % b) +
+                         (2 + s->is_12bit) * (i / b) + 8; /* each group of b samples adds 2 bytes, or 3 when 12-bit */
+    }
+
+    return 0;
+}
+/* Expand a 12-bit sample code to 16 bits; bits 8..11 of the code select how it is scaled. */
+static inline uint16_t dv_audio_12to16(uint16_t sample)
+{
+    uint16_t shift, result;
+
+    sample = (sample < 0x800) ? sample : sample | 0xf000; /* sign-extend from 12 to 16 bits */
+    shift  = (sample & 0xf00) >> 8;                       /* bits 8..11 of the code */
+
+    if (shift < 0x2 || shift > 0xd) {
+        result = sample; /* smallest magnitudes pass through unscaled */
+    } else if (shift < 0x8) { /* non-negative codes */
+        shift--;
+        result = (sample - (256 * shift)) << shift;
+    } else { /* negative codes */
+        shift  = 0xe - shift;
+        result = ((sample + ((256 * shift) + 1)) << shift) - 1;
+    }
+
+    return result;
+}
+/* Decode one block_size-byte packet, writing two 16-bit values per sample to frame->data[0]. */
+static int decode_frame(AVCodecContext *avctx, void *data,
+                        int *got_frame_ptr, AVPacket *pkt)
+{
+    DVAudioContext *s = avctx->priv_data;
+    AVFrame *frame = data;
+    const uint8_t *src = pkt->data;
+    int16_t *dst;
+    int ret, i;
+
+    if (pkt->size < s->block_size) /* a full block is required */
+        return AVERROR_INVALIDDATA;
+
+    frame->nb_samples = dv_get_audio_sample_count(pkt->data + 244, s->is_pal); /* count is stored in the packet itself */
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+    dst = (int16_t *)frame->data[0];
+
+    for (i = 0; i < frame->nb_samples; i++) {
+       const uint8_t *v = &src[s->shuffle[i]]; /* precomputed position of sample i */
+
+       if (s->is_12bit) { /* v[0] and v[1] carry the high 8 bits, v[2] the two low nibbles */
+           *dst++ = dv_audio_12to16((v[0] << 4) | ((v[2] >> 4) & 0x0f));
+           *dst++ = dv_audio_12to16((v[1] << 4) | ((v[2] >> 0) & 0x0f));
+       } else { /* 16-bit values; the second one lies half a block (8640/2 or 7200/2 bytes) further on */
+           *dst++ = AV_RB16(&v[0]);
+           *dst++ = AV_RB16(&v[s->is_pal ? 4320 : 3600]);
+       }
+    }
+
+    *got_frame_ptr = 1;
+
+    return s->block_size; /* exactly one block is reported as consumed */
+}
+/* Decoder descriptor for AV_CODEC_ID_DVAUDIO. */
+AVCodec ff_dvaudio_decoder = {
+    .name           = "dvaudio",
+    .long_name      = NULL_IF_CONFIG_SMALL("Ulead DV Audio"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_DVAUDIO,
+    .init           = decode_init,
+    .decode         = decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .priv_data_size = sizeof(DVAudioContext),
+};
diff --git a/libavcodec/dvbsub.c b/libavcodec/dvbsub.c
index 548bae1..3cdbade 100644
--- a/libavcodec/dvbsub.c
+++ b/libavcodec/dvbsub.c
@@ -2,20 +2,20 @@
  * DVB subtitle encoding
  * Copyright (c) 2005 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include "avcodec.h"
@@ -23,7 +23,6 @@
 #include "libavutil/colorspace.h"
 
 typedef struct DVBSubtitleContext {
-    int hide_state;
     int object_version;
 } DVBSubtitleContext;
 
@@ -194,6 +193,60 @@ static void dvb_encode_rle4(uint8_t **pq,
     *pq = q;
 }
 
+static void dvb_encode_rle8(uint8_t **pq,
+                            const uint8_t *bitmap, int linesize,
+                            int w, int h)
+{ /* Run-length encode h rows of w 8-bit pixels to *pq and advance *pq past the output. */
+    uint8_t *q;
+    int x, y, len, x1, color;
+
+    q = *pq; /* output cursor; no size limit is checked in this function */
+
+    for (y = 0; y < h; y++) {
+        *q++ = 0x12; /* row prefix byte; presumably the 8-bit/pixel code string data type -- TODO confirm */
+
+        x = 0;
+        while (x < w) {
+            x1 = x;
+            color = bitmap[x1++];
+            while (x1 < w && bitmap[x1] == color) /* extend the run while the colour repeats */
+                x1++;
+            len = x1 - x;
+            if (len == 1 && color) {
+                // 00000001 to 11111111           1 pixel in colour x
+                *q++ = color;
+            } else {
+                if (color == 0x00) {
+                    // 00000000 0LLLLLLL          L pixels (1-127) in colour 0 (L > 0)
+                    len = FFMIN(len, 127); /* longer runs are split: x advances by the clamped len */
+                    *q++ = 0x00;
+                    *q++ = len;
+                } else if (len > 2) {
+                    // 00000000 1LLLLLLL CCCCCCCC L pixels (3-127) in colour C (L > 2)
+                    len = FFMIN(len, 127);
+                    *q++ = 0x00;
+                    *q++ = 0x80+len;
+                    *q++ = color;
+                }
+                else if (len == 2) { /* too short for the 3-byte run form: emit two single pixels */
+                    *q++ = color;
+                    *q++ = color;
+                } else { /* NOTE(review): len == 1 with a non-zero colour is handled above, so this looks unreachable */
+                    *q++ = color;
+                    len = 1;
+                }
+            }
+            x += len;
+        }
+        /* end of line */
+        // 00000000 00000000 end of 8-bit/pixel_code_string
+        *q++ = 0x00;
+        *q++ = 0x00;
+        bitmap += linesize; /* step to the next row */
+    }
+    *pq = q; /* hand the advanced write position back to the caller */
+}
+
 static int encode_dvb_subtitles(DVBSubtitleContext *s,
                                 uint8_t *outbuf, const AVSubtitle *h)
 {
@@ -205,11 +258,9 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
 
     page_id = 1;
 
-    if (h->num_rects == 0 || !h->rects)
+    if (h->num_rects && !h->rects)
         return -1;
 
-    *q++ = 0x00; /* subtitle_stream_id */
-
     /* page composition segment */
 
     *q++ = 0x0f; /* sync_byte */
@@ -218,10 +269,7 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
     pseg_len = q;
     q += 2; /* segment length */
     *q++ = 30; /* page_timeout (seconds) */
-    if (s->hide_state)
-        page_state = 0; /* normal case */
-    else
-        page_state = 2; /* mode change */
+    page_state = 2; /* mode change */
     /* page_version = 0 + page_state */
     *q++ = (s->object_version << 4) | (page_state << 2) | 3;
 
@@ -234,7 +282,7 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
 
     bytestream_put_be16(&pseg_len, q - pseg_len - 2);
 
-    if (!s->hide_state) {
+    if (h->num_rects) {
         for (clut_id = 0; clut_id < h->num_rects; clut_id++) {
 
             /* CLUT segment */
@@ -245,10 +293,15 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
             } else if (h->rects[clut_id]->nb_colors <= 16) {
                 /* 4 bpp, standard encoding */
                 bpp_index = 1;
+            } else if (h->rects[clut_id]->nb_colors <= 256) {
+                /* 8 bpp, standard encoding */
+                bpp_index = 2;
             } else {
                 return -1;
             }
 
+
+            /* CLUT segment */
             *q++ = 0x0f; /* sync byte */
             *q++ = 0x12; /* CLUT definition segment */
             bytestream_put_be16(&q, page_id);
@@ -307,32 +360,37 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
         *q++ = 0; /* 8 bit fill colors */
         *q++ = 0x03; /* 4 bit and 2 bit fill colors */
 
-        if (!s->hide_state) {
-            bytestream_put_be16(&q, region_id); /* object_id == region_id */
-            *q++ = (0 << 6) | (0 << 4);
-            *q++ = 0;
-            *q++ = 0xf0;
-            *q++ = 0;
-        }
+        bytestream_put_be16(&q, region_id); /* object_id == region_id */
+        *q++ = (0 << 6) | (0 << 4);
+        *q++ = 0;
+        *q++ = 0xf0;
+        *q++ = 0;
 
         bytestream_put_be16(&pseg_len, q - pseg_len - 2);
     }
 
-    if (!s->hide_state) {
+    if (h->num_rects) {
 
         for (object_id = 0; object_id < h->num_rects; object_id++) {
-            /* Object Data segment */
+            void (*dvb_encode_rle)(uint8_t **pq,
+                                    const uint8_t *bitmap, int linesize,
+                                    int w, int h);
 
+            /* bpp_index maths */
             if (h->rects[object_id]->nb_colors <= 4) {
                 /* 2 bpp, some decoders do not support it correctly */
-                bpp_index = 0;
+                dvb_encode_rle = dvb_encode_rle2;
             } else if (h->rects[object_id]->nb_colors <= 16) {
                 /* 4 bpp, standard encoding */
-                bpp_index = 1;
+                dvb_encode_rle = dvb_encode_rle4;
+            } else if (h->rects[object_id]->nb_colors <= 256) {
+                /* 8 bpp, standard encoding */
+                dvb_encode_rle = dvb_encode_rle8;
             } else {
                 return -1;
             }
 
+            /* Object Data segment */
             *q++ = 0x0f; /* sync byte */
             *q++ = 0x13;
             bytestream_put_be16(&q, page_id);
@@ -345,19 +403,12 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
                                                                        non_modifying_color_flag */
             {
                 uint8_t *ptop_field_len, *pbottom_field_len, *top_ptr, *bottom_ptr;
-                void (*dvb_encode_rle)(uint8_t **pq,
-                                        const uint8_t *bitmap, int linesize,
-                                        int w, int h);
+
                 ptop_field_len = q;
                 q += 2;
                 pbottom_field_len = q;
                 q += 2;
 
-                if (bpp_index == 0)
-                    dvb_encode_rle = dvb_encode_rle2;
-                else
-                    dvb_encode_rle = dvb_encode_rle4;
-
                 top_ptr = q;
                 dvb_encode_rle(&q, h->rects[object_id]->data[0], h->rects[object_id]->w * 2,
                                     h->rects[object_id]->w, h->rects[object_id]->h >> 1);
@@ -384,10 +435,7 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
 
     bytestream_put_be16(&pseg_len, q - pseg_len - 2);
 
-    *q++ = 0xff; /* end of PES data */
-
     s->object_version = (s->object_version + 1) & 0xf;
-    s->hide_state = !s->hide_state;
     return q - outbuf;
 }
 
diff --git a/libavcodec/dvbsub_parser.c b/libavcodec/dvbsub_parser.c
index 2e7d8c2..af467f7 100644
--- a/libavcodec/dvbsub_parser.c
+++ b/libavcodec/dvbsub_parser.c
@@ -1,21 +1,21 @@
 /*
- * DVB subtitle parser for Libav
+ * DVB subtitle parser for FFmpeg
  * Copyright (c) 2005 Ian Caulfield
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include "avcodec.h"
@@ -123,11 +123,11 @@ static int dvbsub_parse(AVCodecParserContext *s,
     {
         if (*p == 0x0f)
         {
-            if (p + 6 <= p_end)
+            if (6 <= p_end - p)
             {
                 len = AV_RB16(p + 4);
 
-                if (p + len + 6 <= p_end)
+                if (len + 6 <= p_end - p)
                 {
                     *poutbuf_size += len + 6;
 
@@ -137,7 +137,7 @@ static int dvbsub_parse(AVCodecParserContext *s,
             } else
                 break;
         } else if (*p == 0xff) {
-            if (p + 1 < p_end)
+            if (1 < p_end - p)
             {
                 ff_dlog(avctx, "Junk at end of packet\n");
             }
diff --git a/libavcodec/dvbsubdec.c b/libavcodec/dvbsubdec.c
index be68c58..a4663d9 100644
--- a/libavcodec/dvbsubdec.c
+++ b/libavcodec/dvbsubdec.c
@@ -2,20 +2,20 @@
  * DVB subtitle decoding
  * Copyright (c) 2005 Ian Caulfield
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@
 #include "bytestream.h"
 #include "internal.h"
 #include "libavutil/colorspace.h"
+#include "libavutil/opt.h"
 
 #define DVBSUB_PAGE_SEGMENT     0x10
 #define DVBSUB_REGION_SEGMENT   0x11
@@ -153,6 +154,7 @@ static void png_save2(const char *filename, uint32_t *bitmap, int w, int h)
 
 typedef struct DVBSubCLUT {
     int id;
+    int version;
 
     uint32_t clut4[4];
     uint32_t clut16[16];
@@ -179,6 +181,7 @@ typedef struct DVBSubObjectDisplay {
 
 typedef struct DVBSubObject {
     int id;
+    int version;
 
     int type;
 
@@ -198,6 +201,7 @@ typedef struct DVBSubRegionDisplay {
 
 typedef struct DVBSubRegion {
     int id;
+    int version;
 
     int width;
     int height;
@@ -208,6 +212,7 @@ typedef struct DVBSubRegion {
 
     uint8_t *pbuf;
     int buf_size;
+    int dirty;
 
     DVBSubObjectDisplay *display_list;
 
@@ -224,15 +229,21 @@ typedef struct DVBSubDisplayDefinition {
 } DVBSubDisplayDefinition;
 
 typedef struct DVBSubContext {
+    AVClass *class;
     int composition_id;
     int ancillary_id;
 
+    int version; /* set to -1 by dvbsub_init_decoder() */
     int time_out;
+    int compute_edt; /**< if 1 end display time calculated using pts
+                          if 0 (Default) calculated using time out */
+    int compute_clut;
+    int substream; /* index of the 5-byte extradata entry supplying the ids; negative: both ids are set to -1 */
+    int64_t prev_start; /* set to AV_NOPTS_VALUE by dvbsub_init_decoder() */
     DVBSubRegion *region_list;
     DVBSubCLUT   *clut_list;
     DVBSubObject *object_list;
 
-    int display_list_size;
     DVBSubRegionDisplay *display_list;
     DVBSubDisplayDefinition *display_definition;
 } DVBSubContext;
@@ -298,53 +309,59 @@ static void delete_region_display_list(DVBSubContext *ctx, DVBSubRegion *region)
                     obj2 = *obj2_ptr;
 
                     while (obj2 != object) {
-                        assert(obj2);
+                        av_assert0(obj2);
                         obj2_ptr = &obj2->next;
                         obj2 = *obj2_ptr;
                     }
 
                     *obj2_ptr = obj2->next;
 
-                    av_free(obj2);
+                    av_freep(&obj2);
                 }
             }
         }
 
         region->display_list = display->region_list_next;
 
-        av_free(display);
+        av_freep(&display);
     }
 
 }
 
-static void delete_state(DVBSubContext *ctx)
+static void delete_cluts(DVBSubContext *ctx)
 {
-    DVBSubRegion *region;
-    DVBSubCLUT *clut;
-
-    while (ctx->region_list) {
-        region = ctx->region_list;
+    while (ctx->clut_list) { /* pop and free nodes until the CLUT list is empty */
+        DVBSubCLUT *clut = ctx->clut_list;
 
-        ctx->region_list = region->next;
+        ctx->clut_list = clut->next; /* unlink the head before releasing it */
 
-        delete_region_display_list(ctx, region);
-        av_free(region->pbuf);
-        av_free(region);
+        av_freep(&clut);
     }
+}
 
-    while (ctx->clut_list) {
-        clut = ctx->clut_list;
+static void delete_objects(DVBSubContext *ctx)
+{
+    while (ctx->object_list) { /* pop and free nodes until the object list is empty */
+        DVBSubObject *object = ctx->object_list;
 
-        ctx->clut_list = clut->next;
+        ctx->object_list = object->next; /* unlink the head before releasing it */
 
-        av_free(clut);
+        av_freep(&object);
     }
+}
 
-    av_freep(&ctx->display_definition);
+static void delete_regions(DVBSubContext *ctx)
+{
+    while (ctx->region_list) { /* pop and free nodes until the region list is empty */
+        DVBSubRegion *region = ctx->region_list;
+
+        ctx->region_list = region->next; /* unlink the head before releasing it */
+
+        delete_region_display_list(ctx, region); /* drop the region's display list before the region itself */
 
-    /* Should already be null */
-    if (ctx->object_list)
-        av_log(0, AV_LOG_ERROR, "Memory deallocation error!\n");
+        av_freep(&region->pbuf);
+        av_freep(&region);
+    }
 }
 
 static av_cold int dvbsub_init_decoder(AVCodecContext *avctx)
@@ -352,15 +369,27 @@ static av_cold int dvbsub_init_decoder(AVCodecContext *avctx)
     int i, r, g, b, a = 0;
     DVBSubContext *ctx = avctx->priv_data;
 
-    if (!avctx->extradata || avctx->extradata_size != 4) {
-        av_log(avctx, AV_LOG_WARNING, "Invalid extradata, subtitle streams may be combined!\n");
+    if (ctx->substream < 0) {
+        ctx->composition_id = -1;
+        ctx->ancillary_id   = -1;
+    } else if (!avctx->extradata || (avctx->extradata_size < 4) || ((avctx->extradata_size % 5 != 0) && (avctx->extradata_size != 4))) {
+        av_log(avctx, AV_LOG_WARNING, "Invalid DVB subtitles stream extradata!\n");
         ctx->composition_id = -1;
         ctx->ancillary_id   = -1;
     } else {
-        ctx->composition_id = AV_RB16(avctx->extradata);
-        ctx->ancillary_id   = AV_RB16(avctx->extradata + 2);
+        if (avctx->extradata_size > 5*ctx->substream + 2) {
+            ctx->composition_id = AV_RB16(avctx->extradata + 5*ctx->substream);
+            ctx->ancillary_id   = AV_RB16(avctx->extradata + 5*ctx->substream + 2);
+        } else {
+            av_log(avctx, AV_LOG_WARNING, "Selected DVB subtitles sub-stream %d is not available\n", ctx->substream);
+            ctx->composition_id = AV_RB16(avctx->extradata);
+            ctx->ancillary_id   = AV_RB16(avctx->extradata + 2);
+        }
     }
 
+    ctx->version = -1;
+    ctx->prev_start = AV_NOPTS_VALUE;
+
     default_clut.id = -1;
     default_clut.next = NULL;
 
@@ -429,30 +458,39 @@ static av_cold int dvbsub_close_decoder(AVCodecContext *avctx)
     DVBSubContext *ctx = avctx->priv_data;
     DVBSubRegionDisplay *display;
 
-    delete_state(ctx);
+    delete_regions(ctx);
+
+    delete_objects(ctx);
+
+    delete_cluts(ctx);
+
+    av_freep(&ctx->display_definition);
 
     while (ctx->display_list) {
         display = ctx->display_list;
         ctx->display_list = display->next;
 
-        av_free(display);
+        av_freep(&display);
     }
 
     return 0;
 }
 
-static int dvbsub_read_2bit_string(uint8_t *destbuf, int dbuf_len,
+static int dvbsub_read_2bit_string(AVCodecContext *avctx,
+                                   uint8_t *destbuf, int dbuf_len,
                                    const uint8_t **srcbuf, int buf_size,
-                                   int non_mod, uint8_t *map_table)
+                                   int non_mod, uint8_t *map_table, int x_pos)
 {
     GetBitContext gb;
 
     int bits;
     int run_length;
-    int pixels_read = 0;
+    int pixels_read = x_pos;
 
     init_get_bits(&gb, *srcbuf, buf_size << 3);
 
+    destbuf += x_pos;
+
     while (get_bits_count(&gb) < buf_size << 3 && pixels_read < dbuf_len) {
         bits = get_bits(&gb, 2);
 
@@ -513,14 +551,14 @@ static int dvbsub_read_2bit_string(uint8_t *destbuf, int dbuf_len,
                             }
                         }
                     } else if (bits == 1) {
-                        pixels_read += 2;
                         if (map_table)
                             bits = map_table[0];
                         else
                             bits = 0;
-                        if (pixels_read <= dbuf_len) {
-                            *destbuf++ = bits;
+                        run_length = 2;
+                        while (run_length-- > 0 && pixels_read < dbuf_len) {
                             *destbuf++ = bits;
+                            pixels_read++;
                         }
                     } else {
                         (*srcbuf) += (get_bits_count(&gb) + 7) >> 3;
@@ -539,25 +577,27 @@ static int dvbsub_read_2bit_string(uint8_t *destbuf, int dbuf_len,
     }
 
     if (get_bits(&gb, 6))
-        av_log(0, AV_LOG_ERROR, "DVBSub error: line overflow\n");
+        av_log(avctx, AV_LOG_ERROR, "line overflow\n");
 
     (*srcbuf) += (get_bits_count(&gb) + 7) >> 3;
 
     return pixels_read;
 }
 
-static int dvbsub_read_4bit_string(uint8_t *destbuf, int dbuf_len,
+static int dvbsub_read_4bit_string(AVCodecContext *avctx, uint8_t *destbuf, int dbuf_len,
                                    const uint8_t **srcbuf, int buf_size,
-                                   int non_mod, uint8_t *map_table)
+                                   int non_mod, uint8_t *map_table, int x_pos)
 {
     GetBitContext gb;
 
     int bits;
     int run_length;
-    int pixels_read = 0;
+    int pixels_read = x_pos;
 
     init_get_bits(&gb, *srcbuf, buf_size << 3);
 
+    destbuf += x_pos;
+
     while (get_bits_count(&gb) < buf_size << 3 && pixels_read < dbuf_len) {
         bits = get_bits(&gb, 4);
 
@@ -637,14 +677,14 @@ static int dvbsub_read_4bit_string(uint8_t *destbuf, int dbuf_len,
                             }
                         }
                     } else if (bits == 1) {
-                        pixels_read += 2;
                         if (map_table)
                             bits = map_table[0];
                         else
                             bits = 0;
-                        if (pixels_read <= dbuf_len) {
-                            *destbuf++ = bits;
+                        run_length = 2;
+                        while (run_length-- > 0 && pixels_read < dbuf_len) {
                             *destbuf++ = bits;
+                            pixels_read++;
                         }
                     } else {
                         if (map_table)
@@ -660,21 +700,24 @@ static int dvbsub_read_4bit_string(uint8_t *destbuf, int dbuf_len,
     }
 
     if (get_bits(&gb, 8))
-        av_log(0, AV_LOG_ERROR, "DVBSub error: line overflow\n");
+        av_log(avctx, AV_LOG_ERROR, "line overflow\n");
 
     (*srcbuf) += (get_bits_count(&gb) + 7) >> 3;
 
     return pixels_read;
 }
 
-static int dvbsub_read_8bit_string(uint8_t *destbuf, int dbuf_len,
+static int dvbsub_read_8bit_string(AVCodecContext *avctx,
+                                   uint8_t *destbuf, int dbuf_len,
                                     const uint8_t **srcbuf, int buf_size,
-                                    int non_mod, uint8_t *map_table)
+                                    int non_mod, uint8_t *map_table, int x_pos)
 {
     const uint8_t *sbuf_end = (*srcbuf) + buf_size;
     int bits;
     int run_length;
-    int pixels_read = 0;
+    int pixels_read = x_pos;
+
+    destbuf += x_pos;
 
     while (*srcbuf < sbuf_end && pixels_read < dbuf_len) {
         bits = *(*srcbuf)++;
@@ -694,30 +737,220 @@ static int dvbsub_read_8bit_string(uint8_t *destbuf, int dbuf_len,
                 if (run_length == 0) {
                     return pixels_read;
                 }
+
+                bits = 0;
             } else {
                 bits = *(*srcbuf)++;
-
-                if (non_mod == 1 && bits == 1)
-                    pixels_read += run_length;
             }
-            if (map_table)
-                bits = map_table[0];
-            else
-                bits = 0;
-            while (run_length-- > 0 && pixels_read < dbuf_len) {
-                *destbuf++ = bits;
-                pixels_read++;
+            if (non_mod == 1 && bits == 1)
+                pixels_read += run_length;
+            else {
+                if (map_table)
+                    bits = map_table[bits];
+                while (run_length-- > 0 && pixels_read < dbuf_len) {
+                    *destbuf++ = bits;
+                    pixels_read++;
+                }
             }
         }
     }
 
     if (*(*srcbuf)++)
-        av_log(0, AV_LOG_ERROR, "DVBSub error: line overflow\n");
+        av_log(avctx, AV_LOG_ERROR, "line overflow\n");
 
     return pixels_read;
 }
 
+/* Build a palette for rect from its own bitmap: rank the colour indices used in
+ * the w x h image in data[0], then store a green-tinted ramp in data[1] that
+ * runs from transparent black (first rank) to opaque bright (last rank). */
+static void compute_default_clut(AVSubtitleRect *rect, int w, int h)
+{
+    uint8_t list[256] = {0}, list_inv[256]; /* index already ranked? / rank -> index */
+    int counttab[256] = {0};                /* per index: number of edge pixels */
+    int count, i, x, y;
+    /* An edge pixel has a differing neighbour or lies on the image border. */
+#define V(x,y) rect->data[0][(x) + (y)*rect->linesize[0]]
+    for (y = 0; y<h; y++) {
+        for (x = 0; x<w; x++) {
+            int v = V(x,y) + 1;
+            int vl = x     ? V(x-1,y) + 1 : 0;
+            int vr = x+1<w ? V(x+1,y) + 1 : 0;
+            int vt = y     ? V(x,y-1) + 1 : 0;
+            int vb = y+1<h ? V(x,y+1) + 1 : 0;
+            counttab[v-1] += !!((v!=vl) + (v!=vr) + (v!=vt) + (v!=vb));
+        }
+    }
+#define L(x,y) list[ rect->data[0][(x) + (y)*rect->linesize[0]] ]
+    /* Per round, rank the index touching ranked pixels/border most per edge pixel. */
+    for (i = 0; i<256; i++) {
+        int scoretab[256] = {0};
+        int bestscore = 0, bestv = 0;
+        for (y = 0; y<h; y++) {
+            for (x = 0; x<w; x++) {
+                int v = rect->data[0][x + y*rect->linesize[0]];
+                int l_m = list[v];
+                int l_l = x     ? L(x-1, y) : 1;
+                int l_r = x+1<w ? L(x+1, y) : 1;
+                int l_t = y     ? L(x, y-1) : 1;
+                int l_b = y+1<h ? L(x, y+1) : 1;
+                int score;
+                if (l_m)
+                    continue;
+                scoretab[v] += l_l + l_r + l_t + l_b;
+                score = 1024LL*scoretab[v] / counttab[v];
+                if (score > bestscore) {
+                    bestscore = score;
+                    bestv = v;
+                }
+            }
+        }
+        if (!bestscore)
+            break;
+        list    [ bestv ] = 1;
+        list_inv[     i ] = bestv;
+    }
 
+    count = i > 1 ? i - 1 : 1; /* a single ranked index must not divide by zero */
+    for (i--; i>=0; i--) {
+        int v = i*255/count;
+        AV_WN32(rect->data[1] + 4*list_inv[i], RGBA(v/2,v,v/2,v));
+    }
+}
+
+
+/* Export every dirty region on the display list as a bitmap rect of sub and set
+ * sub->end_display_time; *got_output becomes 1 once an end time is known.
+ * sub must not hold rects yet (AVERROR_PATCHWELCOME otherwise). Returns 0 on
+ * success or AVERROR(ENOMEM), in which case every rect is freed again. */
+static int save_subtitle_set(AVCodecContext *avctx, AVSubtitle *sub, int *got_output)
+{
+    DVBSubContext *ctx = avctx->priv_data;
+    DVBSubRegionDisplay *display;
+    DVBSubDisplayDefinition *display_def = ctx->display_definition;
+    DVBSubRegion *region;
+    AVSubtitleRect *rect;
+    DVBSubCLUT *clut;
+    uint32_t *clut_table;
+    int i,j;
+    int offset_x=0, offset_y=0;
+    int ret = 0;
+
+    if (display_def) {
+        offset_x = display_def->x;
+        offset_y = display_def->y;
+    }
+
+    /* Not touching AVSubtitles again*/
+    if(sub->num_rects) {
+        avpriv_request_sample(ctx, "Different Version of Segment asked Twice");
+        return AVERROR_PATCHWELCOME;
+    }
+    for (display = ctx->display_list; display; display = display->next) {
+        region = get_region(ctx, display->region_id);
+        if (region && region->dirty)
+            sub->num_rects++;
+    }
+
+    if(ctx->compute_edt == 0) {
+        sub->end_display_time = ctx->time_out * 1000;
+        *got_output = 1;
+    } else if (ctx->prev_start != AV_NOPTS_VALUE) {
+        sub->end_display_time = av_rescale_q((sub->pts - ctx->prev_start ), AV_TIME_BASE_Q, (AVRational){ 1, 1000 }) - 1;
+        *got_output = 1;
+    }
+    if (sub->num_rects > 0) {
+        sub->rects = av_mallocz_array(sizeof(*sub->rects), sub->num_rects);
+        if (!sub->rects) {
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+        for (i = 0; i < sub->num_rects; i++) {
+            sub->rects[i] = av_mallocz(sizeof(*sub->rects[i]));
+            if (!sub->rects[i]) { /* never hand a NULL rect to the loop below */
+                ret = AVERROR(ENOMEM);
+                goto fail;
+            }
+        }
+        i = 0;
+        for (display = ctx->display_list; display; display = display->next) {
+            region = get_region(ctx, display->region_id);
+            if (!region || !region->dirty) /* same filter as the counting loop above */
+                continue;
+
+            rect = sub->rects[i];
+            rect->x = display->x_pos + offset_x;
+            rect->y = display->y_pos + offset_y;
+            rect->w = region->width;
+            rect->h = region->height;
+            rect->nb_colors = (1 << region->depth);
+            rect->type      = SUBTITLE_BITMAP;
+            rect->linesize[0] = region->width;
+
+            clut = get_clut(ctx, region->clut);
+
+            if (!clut)
+                clut = &default_clut;
+
+            switch (region->depth) {
+            case 2:
+                clut_table = clut->clut4;
+                break;
+            case 8:
+                clut_table = clut->clut256;
+                break;
+            case 4:
+            default:
+                clut_table = clut->clut16;
+                break;
+            }
+
+            rect->data[1] = av_mallocz(AVPALETTE_SIZE);
+            if (!rect->data[1]) {
+                ret = AVERROR(ENOMEM);
+                goto fail;
+            }
+            memcpy(rect->data[1], clut_table, (1 << region->depth) * sizeof(uint32_t));
+
+            rect->data[0] = av_malloc(region->buf_size);
+            if (!rect->data[0]) {
+                ret = AVERROR(ENOMEM);
+                goto fail;
+            }
+
+            memcpy(rect->data[0], region->pbuf, region->buf_size);
+
+            if ((clut == &default_clut && ctx->compute_clut == -1) || ctx->compute_clut == 1)
+                compute_default_clut(rect, rect->w, rect->h); /* -1: only without a stream CLUT, 1: always */
+
+#if FF_API_AVPICTURE
+FF_DISABLE_DEPRECATION_WARNINGS
+            for (j = 0; j < 4; j++) {
+                rect->pict.data[j] = rect->data[j];
+                rect->pict.linesize[j] = rect->linesize[j];
+            }
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+
+            i++;
+        }
+    }
+
+    return 0;
+fail: /* rects that were never allocated are still NULL and are skipped */
+    if (sub->rects) {
+        for(i=0; i<sub->num_rects; i++) {
+            rect = sub->rects[i];
+            if (rect) {
+                av_freep(&rect->data[0]);
+                av_freep(&rect->data[1]);
+            }
+            av_freep(&sub->rects[i]);
+        }
+        av_freep(&sub->rects);
+    }
+    sub->num_rects = 0;
+    return ret;
+}
 
 static void dvbsub_parse_pixel_data_block(AVCodecContext *avctx, DVBSubObjectDisplay *display,
                                           const uint8_t *buf, int buf_size, int top_bottom, int non_mod)
@@ -736,6 +969,7 @@ static void dvbsub_parse_pixel_data_block(AVCodecContext *avctx, DVBSubObjectDis
                          0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff};
     uint8_t *map_table;
 
+#if 0
     ff_dlog(avctx, "DVB pixel block size %d, %s field:\n", buf_size,
             top_bottom ? "bottom" : "top");
 
@@ -750,21 +984,22 @@ static void dvbsub_parse_pixel_data_block(AVCodecContext *avctx, DVBSubObjectDis
 
     if (i % 16)
         ff_dlog(avctx, "\n");
+#endif
 
-    if (region == 0)
+    if (!region)
         return;
 
     pbuf = region->pbuf;
+    region->dirty = 1;
 
     x_pos = display->x_pos;
     y_pos = display->y_pos;
 
-    if ((y_pos & 1) != top_bottom)
-        y_pos++;
+    y_pos += top_bottom;
 
     while (buf < buf_end) {
-        if (x_pos > region->width || y_pos > region->height) {
-            av_log(avctx, AV_LOG_ERROR, "Invalid object location!\n");
+        if ((*buf!=0xf0 && x_pos >= region->width) || y_pos >= region->height) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid object location! %d-%d %d-%d %02x\n", x_pos, region->width, y_pos, region->height, *buf);
             return;
         }
 
@@ -777,9 +1012,9 @@ static void dvbsub_parse_pixel_data_block(AVCodecContext *avctx, DVBSubObjectDis
             else
                 map_table = NULL;
 
-            x_pos += dvbsub_read_2bit_string(pbuf + (y_pos * region->width) + x_pos,
-                                                region->width - x_pos, &buf, buf_end - buf,
-                                                non_mod, map_table);
+            x_pos = dvbsub_read_2bit_string(avctx, pbuf + (y_pos * region->width),
+                                            region->width, &buf, buf_end - buf,
+                                            non_mod, map_table, x_pos);
             break;
         case 0x11:
             if (region->depth < 4) {
@@ -792,9 +1027,9 @@ static void dvbsub_parse_pixel_data_block(AVCodecContext *avctx, DVBSubObjectDis
             else
                 map_table = NULL;
 
-            x_pos += dvbsub_read_4bit_string(pbuf + (y_pos * region->width) + x_pos,
-                                                region->width - x_pos, &buf, buf_end - buf,
-                                                non_mod, map_table);
+            x_pos = dvbsub_read_4bit_string(avctx, pbuf + (y_pos * region->width),
+                                            region->width, &buf, buf_end - buf,
+                                            non_mod, map_table, x_pos);
             break;
         case 0x12:
             if (region->depth < 8) {
@@ -802,9 +1037,9 @@ static void dvbsub_parse_pixel_data_block(AVCodecContext *avctx, DVBSubObjectDis
                 return;
             }
 
-            x_pos += dvbsub_read_8bit_string(pbuf + (y_pos * region->width) + x_pos,
-                                                region->width - x_pos, &buf, buf_end - buf,
-                                                non_mod, NULL);
+            x_pos = dvbsub_read_8bit_string(avctx, pbuf + (y_pos * region->width),
+                                            region->width, &buf, buf_end - buf,
+                                            non_mod, NULL, x_pos);
             break;
 
         case 0x20:
@@ -839,7 +1074,6 @@ static int dvbsub_parse_object_segment(AVCodecContext *avctx,
     DVBSubContext *ctx = avctx->priv_data;
 
     const uint8_t *buf_end = buf + buf_size;
-    const uint8_t *block;
     int object_id;
     DVBSubObject *object;
     DVBSubObjectDisplay *display;
@@ -865,12 +1099,13 @@ static int dvbsub_parse_object_segment(AVCodecContext *avctx,
         buf += 2;
 
         if (buf + top_field_len + bottom_field_len > buf_end) {
-            av_log(avctx, AV_LOG_ERROR, "Field data size too large\n");
+            av_log(avctx, AV_LOG_ERROR, "Field data size %d+%d too large\n", top_field_len, bottom_field_len);
             return AVERROR_INVALIDDATA;
         }
 
         for (display = object->display_list; display; display = display->object_list_next) {
-            block = buf;
+            const uint8_t *block = buf;
+            int bfl = bottom_field_len;
 
             dvbsub_parse_pixel_data_block(avctx, display, block, top_field_len, 0,
                                             non_modifying_color);
@@ -878,9 +1113,9 @@ static int dvbsub_parse_object_segment(AVCodecContext *avctx,
             if (bottom_field_len > 0)
                 block = buf + top_field_len;
             else
-                bottom_field_len = top_field_len;
+                bfl = top_field_len;
 
-            dvbsub_parse_pixel_data_block(avctx, display, block, bottom_field_len, 1,
+            dvbsub_parse_pixel_data_block(avctx, display, block, bfl, 1,
                                             non_modifying_color);
         }
 
@@ -900,6 +1135,7 @@ static int dvbsub_parse_clut_segment(AVCodecContext *avctx,
 
     const uint8_t *buf_end = buf + buf_size;
     int i, clut_id;
+    int version;
     DVBSubCLUT *clut;
     int entry_id, depth , full_range;
     int y, cr, cb, alpha;
@@ -917,6 +1153,7 @@ static int dvbsub_parse_clut_segment(AVCodecContext *avctx,
         ff_dlog(avctx, "\n");
 
     clut_id = *buf++;
+    version = ((*buf)>>4)&15;
     buf += 1;
 
     clut = get_clut(ctx, clut_id);
@@ -929,11 +1166,16 @@ static int dvbsub_parse_clut_segment(AVCodecContext *avctx,
         memcpy(clut, &default_clut, sizeof(DVBSubCLUT));
 
         clut->id = clut_id;
+        clut->version = -1;
 
         clut->next = ctx->clut_list;
         ctx->clut_list = clut;
     }
 
+    if (clut->version != version) {
+
+    clut->version = version;
+
     while (buf + 4 < buf_end) {
         entry_id = *buf++;
 
@@ -941,7 +1183,6 @@ static int dvbsub_parse_clut_segment(AVCodecContext *avctx,
 
         if (depth == 0) {
             av_log(avctx, AV_LOG_ERROR, "Invalid clut depth 0x%x!\n", *buf);
-            return AVERROR_INVALIDDATA;
         }
 
         full_range = (*buf++) & 1;
@@ -967,14 +1208,20 @@ static int dvbsub_parse_clut_segment(AVCodecContext *avctx,
         YUV_TO_RGB2_CCIR(r, g, b, y);
 
         ff_dlog(avctx, "clut %d := (%d,%d,%d,%d)\n", entry_id, r, g, b, alpha);
+        if (!!(depth & 0x80) + !!(depth & 0x40) + !!(depth & 0x20) > 1) {
+            ff_dlog(avctx, "More than one bit level marked: %x\n", depth);
+            if (avctx->strict_std_compliance > FF_COMPLIANCE_NORMAL)
+                return AVERROR_INVALIDDATA;
+        }
 
         if (depth & 0x80)
             clut->clut4[entry_id] = RGBA(r,g,b,255 - alpha);
-        if (depth & 0x40)
+        else if (depth & 0x40)
             clut->clut16[entry_id] = RGBA(r,g,b,255 - alpha);
-        if (depth & 0x20)
+        else if (depth & 0x20)
             clut->clut256[entry_id] = RGBA(r,g,b,255 - alpha);
     }
+    }
 
     return 0;
 }
@@ -987,6 +1234,7 @@ static int dvbsub_parse_region_segment(AVCodecContext *avctx,
 
     const uint8_t *buf_end = buf + buf_size;
     int region_id, object_id;
+    int av_unused version;
     DVBSubRegion *region;
     DVBSubObject *object;
     DVBSubObjectDisplay *display;
@@ -1005,11 +1253,13 @@ static int dvbsub_parse_region_segment(AVCodecContext *avctx,
             return AVERROR(ENOMEM);
 
         region->id = region_id;
+        region->version = -1;
 
         region->next = ctx->region_list;
         ctx->region_list = region;
     }
 
+    version = ((*buf)>>4) & 15;
     fill = ((*buf++) >> 3) & 1;
 
     region->width = AV_RB16(buf);
@@ -1023,10 +1273,15 @@ static int dvbsub_parse_region_segment(AVCodecContext *avctx,
         region->buf_size = region->width * region->height;
 
         region->pbuf = av_malloc(region->buf_size);
-        if (!region->pbuf)
+        if (!region->pbuf) {
+            region->buf_size =
+            region->width =
+            region->height = 0;
             return AVERROR(ENOMEM);
+        }
 
         fill = 1;
+        region->dirty = 0;
     }
 
     region->depth = 1 << (((*buf++) >> 2) & 7);
@@ -1036,9 +1291,10 @@ static int dvbsub_parse_region_segment(AVCodecContext *avctx,
     }
     region->clut = *buf++;
 
-    if (region->depth == 8)
+    if (region->depth == 8) {
         region->bgcolor = *buf++;
-    else {
+        buf += 1;
+    } else {
         buf += 1;
 
         if (region->depth == 4)
@@ -1102,7 +1358,7 @@ static int dvbsub_parse_region_segment(AVCodecContext *avctx,
 }
 
 static int dvbsub_parse_page_segment(AVCodecContext *avctx,
-                                     const uint8_t *buf, int buf_size)
+                                     const uint8_t *buf, int buf_size, AVSubtitle *sub, int *got_output)
 {
     DVBSubContext *ctx = avctx->priv_data;
     DVBSubRegionDisplay *display;
@@ -1111,22 +1367,36 @@ static int dvbsub_parse_page_segment(AVCodecContext *avctx,
     const uint8_t *buf_end = buf + buf_size;
     int region_id;
     int page_state;
+    int timeout;
+    int version;
 
     if (buf_size < 1)
         return AVERROR_INVALIDDATA;
 
-    ctx->time_out = *buf++;
+    timeout = *buf++;
+    version = ((*buf)>>4) & 15;
     page_state = ((*buf++) >> 2) & 3;
 
+    if (ctx->version == version) {
+        return 0;
+    }
+
+    ctx->time_out = timeout;
+    ctx->version = version;
+
     ff_dlog(avctx, "Page time out %ds, state %d\n", ctx->time_out, page_state);
 
-    if (page_state == 2) {
-        delete_state(ctx);
+    if(ctx->compute_edt == 1)
+        save_subtitle_set(avctx, sub, got_output);
+
+    if (page_state == 1 || page_state == 2) {
+        delete_regions(ctx);
+        delete_objects(ctx);
+        delete_cluts(ctx);
     }
 
     tmp_display_list = ctx->display_list;
     ctx->display_list = NULL;
-    ctx->display_list_size = 0;
 
     while (buf + 5 < buf_end) {
         region_id = *buf++;
@@ -1157,7 +1427,6 @@ static int dvbsub_parse_page_segment(AVCodecContext *avctx,
 
         display->next = ctx->display_list;
         ctx->display_list = display;
-        ctx->display_list_size++;
 
         ff_dlog(avctx, "Region %d, (%d,%d)\n", region_id, display->x_pos, display->y_pos);
     }
@@ -1167,7 +1436,7 @@ static int dvbsub_parse_page_segment(AVCodecContext *avctx,
 
         tmp_display_list = display->next;
 
-        av_free(display);
+        av_freep(&display);
     }
 
     return 0;
@@ -1195,6 +1464,9 @@ static int save_display_set(DVBSubContext *ctx)
     for (display = ctx->display_list; display; display = display->next) {
         region = get_region(ctx, display->region_id);
 
+        if (!region)
+            return -1;
+
         if (x_pos == -1) {
             x_pos = display->x_pos;
             y_pos = display->y_pos;
@@ -1225,17 +1497,20 @@ static int save_display_set(DVBSubContext *ctx)
 
         pbuf = av_malloc(width * height * 4);
         if (!pbuf)
-            return AVERROR(ENOMEM);
+            return -1;
 
         for (display = ctx->display_list; display; display = display->next) {
             region = get_region(ctx, display->region_id);
 
+            if (!region)
+                return -1;
+
             x_off = display->x_pos - x_pos;
             y_off = display->y_pos - y_pos;
 
             clut = get_clut(ctx, region->clut);
 
-            if (clut == 0)
+            if (!clut)
                 clut = &default_clut;
 
             switch (region->depth) {
@@ -1264,7 +1539,7 @@ static int save_display_set(DVBSubContext *ctx)
 
         png_save2(filename, pbuf, width, height);
 
-        av_free(pbuf);
+        av_freep(&pbuf);
     }
 
     fileno_index++;
@@ -1300,14 +1575,18 @@ static int dvbsub_parse_display_definition_segment(AVCodecContext *avctx,
     display_def->y       = 0;
     display_def->width   = bytestream_get_be16(&buf) + 1;
     display_def->height  = bytestream_get_be16(&buf) + 1;
-
-    if (buf_size < 13)
-        return AVERROR_INVALIDDATA;
+    if (!avctx->width || !avctx->height) {
+        avctx->width  = display_def->width;
+        avctx->height = display_def->height;
+    }
 
     if (info_byte & 1<<3) { // display_window_flag
+        if (buf_size < 13)
+            return AVERROR_INVALIDDATA;
+
         display_def->x = bytestream_get_be16(&buf);
-        display_def->y = bytestream_get_be16(&buf);
         display_def->width  = bytestream_get_be16(&buf) - display_def->x + 1;
+        display_def->y = bytestream_get_be16(&buf);
         display_def->height = bytestream_get_be16(&buf) - display_def->y + 1;
     }
 
@@ -1315,108 +1594,16 @@ static int dvbsub_parse_display_definition_segment(AVCodecContext *avctx,
 }
 
 static int dvbsub_display_end_segment(AVCodecContext *avctx, const uint8_t *buf,
-                                      int buf_size, AVSubtitle *sub)
+                                      int buf_size, AVSubtitle *sub,int *got_output)
 {
     DVBSubContext *ctx = avctx->priv_data;
-    DVBSubDisplayDefinition *display_def = ctx->display_definition;
-
-    DVBSubRegion *region;
-    DVBSubRegionDisplay *display;
-    AVSubtitleRect *rect;
-    DVBSubCLUT *clut;
-    uint32_t *clut_table;
-    int i;
-    int offset_x=0, offset_y=0;
-
-    sub->rects = NULL;
-    sub->start_display_time = 0;
-    sub->end_display_time = ctx->time_out * 1000;
-    sub->format = 0;
-
-    if (display_def) {
-        offset_x = display_def->x;
-        offset_y = display_def->y;
-    }
-
-    sub->num_rects = ctx->display_list_size;
-    if (sub->num_rects <= 0)
-        return AVERROR_INVALIDDATA;
-
-    sub->rects = av_mallocz_array(sub->num_rects * sub->num_rects,
-                                  sizeof(*sub->rects));
-    if (!sub->rects)
-        return AVERROR(ENOMEM);
-
-    i = 0;
-
-    for (display = ctx->display_list; display; display = display->next) {
-        int j;
-        region = get_region(ctx, display->region_id);
-        rect = sub->rects[i];
-
-        if (!region)
-            continue;
-
-        rect->x = display->x_pos + offset_x;
-        rect->y = display->y_pos + offset_y;
-        rect->w = region->width;
-        rect->h = region->height;
-        rect->nb_colors = 16;
-        rect->type      = SUBTITLE_BITMAP;
-        rect->linesize[0] = region->width;
-
-        clut = get_clut(ctx, region->clut);
-
-        if (!clut)
-            clut = &default_clut;
-
-        switch (region->depth) {
-        case 2:
-            clut_table = clut->clut4;
-            break;
-        case 8:
-            clut_table = clut->clut256;
-            break;
-        case 4:
-        default:
-            clut_table = clut->clut16;
-            break;
-        }
-
-        rect->data[1] = av_mallocz(AVPALETTE_SIZE);
-        if (!rect->data[1]) {
-            av_free(sub->rects);
-            return AVERROR(ENOMEM);
-        }
-        memcpy(rect->data[1], clut_table, (1 << region->depth) * sizeof(uint32_t));
-
-        rect->data[0] = av_malloc(region->buf_size);
-        if (!rect->data[0]) {
-            av_free(rect->data[1]);
-            av_free(sub->rects);
-            return AVERROR(ENOMEM);
-        }
-        memcpy(rect->data[0], region->pbuf, region->buf_size);
-
-#if FF_API_AVPICTURE
-FF_DISABLE_DEPRECATION_WARNINGS
-        for (j = 0; j < 4; j++) {
-            rect->pict.data[j] = rect->data[j];
-            rect->pict.linesize[j] = rect->linesize[j];
-        }
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
-
-        i++;
-    }
-
-    sub->num_rects = i;
 
+    if(ctx->compute_edt == 0)
+        save_subtitle_set(avctx, sub, got_output);
 #ifdef DEBUG
     save_display_set(ctx);
 #endif
-
-    return 1;
+    return 0;
 }
 
 static int dvbsub_decode(AVCodecContext *avctx,
@@ -1432,6 +1619,9 @@ static int dvbsub_decode(AVCodecContext *avctx,
     int page_id;
     int segment_length;
     int i;
+    int ret = 0;
+    int got_segment = 0;
+    int got_dds = 0;
 
     ff_dlog(avctx, "DVB sub packet:\n");
 
@@ -1460,9 +1650,14 @@ static int dvbsub_decode(AVCodecContext *avctx,
         segment_length = AV_RB16(p);
         p += 2;
 
+        if (avctx->debug & FF_DEBUG_STARTCODE) {
+            av_log(avctx, AV_LOG_DEBUG, "segment_type:%d page_id:%d segment_length:%d\n", segment_type, page_id, segment_length);
+        }
+
         if (p_end - p < segment_length) {
             ff_dlog(avctx, "incomplete or broken packet");
-            return -1;
+            ret = -1;
+            goto end;
         }
 
         if (page_id == ctx->composition_id || page_id == ctx->ancillary_id ||
@@ -1470,24 +1665,35 @@ static int dvbsub_decode(AVCodecContext *avctx,
             int ret = 0;
             switch (segment_type) {
             case DVBSUB_PAGE_SEGMENT:
-                ret = dvbsub_parse_page_segment(avctx, p, segment_length);
+                ret = dvbsub_parse_page_segment(avctx, p, segment_length, sub, data_size);
+                got_segment |= 1;
                 break;
             case DVBSUB_REGION_SEGMENT:
                 ret = dvbsub_parse_region_segment(avctx, p, segment_length);
+                got_segment |= 2;
                 break;
             case DVBSUB_CLUT_SEGMENT:
                 ret = dvbsub_parse_clut_segment(avctx, p, segment_length);
+                if (ret < 0) goto end;
+                got_segment |= 4;
                 break;
             case DVBSUB_OBJECT_SEGMENT:
                 ret = dvbsub_parse_object_segment(avctx, p, segment_length);
+                got_segment |= 8;
                 break;
             case DVBSUB_DISPLAYDEFINITION_SEGMENT:
                 ret = dvbsub_parse_display_definition_segment(avctx, p,
                                                               segment_length);
+                got_dds = 1;
                 break;
             case DVBSUB_DISPLAY_SEGMENT:
-                ret = dvbsub_display_end_segment(avctx, p, segment_length, sub);
-                *data_size = ret;
+                ret = dvbsub_display_end_segment(avctx, p, segment_length, sub, data_size);
+                if (got_segment == 15 && !got_dds && !avctx->width && !avctx->height) {
+                    // Default from ETSI EN 300 743 V1.3.1 (7.2.1)
+                    avctx->width  = 720;
+                    avctx->height = 576;
+                }
+                got_segment |= 16;
                 break;
             default:
                 ff_dlog(avctx, "Subtitling segment type 0x%x, page id %d, length %d\n",
@@ -1495,15 +1701,44 @@ static int dvbsub_decode(AVCodecContext *avctx,
                 break;
             }
             if (ret < 0)
-                return ret;
+                goto end;
         }
 
         p += segment_length;
     }
+    // Some streams do not send a display segment but if we have all the other
+    // segments then we need no further data.
+    if (got_segment == 15) {
+        av_log(avctx, AV_LOG_DEBUG, "Missing display_end_segment, emulating\n");
+        dvbsub_display_end_segment(avctx, p, 0, sub, data_size);
+    }
+
+end:
+    if(ret < 0) {
+        *data_size = 0;
+        avsubtitle_free(sub);
+        return ret;
+    } else {
+        if(ctx->compute_edt == 1 )
+            FFSWAP(int64_t, ctx->prev_start, sub->pts);
+    }
 
     return p - buf;
 }
 
+#define DS AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_SUBTITLE_PARAM
+static const AVOption options[] = { /* private options, each stored in the DVBSubContext field named by offsetof() */
+    {"compute_edt", "compute end of time using pts or timeout", offsetof(DVBSubContext, compute_edt), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, DS},
+    {"compute_clut", "compute clut when not available(-1) or always(1) or never(0)", offsetof(DVBSubContext, compute_clut), AV_OPT_TYPE_BOOL, {.i64 = -1}, -1, 1, DS},
+    {"dvb_substream", "", offsetof(DVBSubContext, substream), AV_OPT_TYPE_INT, {.i64 = -1}, -1, 63, DS},
+    {NULL} /* end-of-list entry */
+};
+static const AVClass dvbsubdec_class = { /* referenced by ff_dvbsub_decoder.priv_class */
+    .class_name = "DVB Sub Decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
 
 AVCodec ff_dvbsub_decoder = {
     .name           = "dvbsub",
@@ -1514,4 +1749,5 @@ AVCodec ff_dvbsub_decoder = {
     .init           = dvbsub_init_decoder,
     .close          = dvbsub_close_decoder,
     .decode         = dvbsub_decode,
+    .priv_class     = &dvbsubdec_class,
 };
diff --git a/libavcodec/dvbtxt.h b/libavcodec/dvbtxt.h
new file mode 100644
index 0000000..ff88fcf
--- /dev/null
+++ b/libavcodec/dvbtxt.h
@@ -0,0 +1,41 @@
+/*
+ * DVB teletext common functions.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DVBTXT_H
+#define AVCODEC_DVBTXT_H
+
+#include "libavutil/attributes.h"
+
+/* Tell whether data_identifier lies in one of the two ranges (0x10-0x1F or
+ * 0x99-0x9B) that denote a teletext stream according to EN 301 775 section 4.4.2 */
+static av_always_inline int ff_data_identifier_is_teletext(int data_identifier)
+{
+    int in_low_range = data_identifier >= 0x10 && data_identifier <= 0x1F;
+    return in_low_range || (data_identifier >= 0x99 && data_identifier <= 0x9B);
+}
+
+/* Tell whether data_unit_id is one of the two values (0x02, 0x03) that mark
+ * EBU teletext data according to EN 301 775 section 4.4.2 */
+static av_always_inline int ff_data_unit_id_is_teletext(int data_unit_id)
+{
+    return data_unit_id == 0x02 ? 1 : data_unit_id == 0x03;
+}
+
+#endif /* AVCODEC_DVBTXT_H */
diff --git a/libavcodec/dvd_nav_parser.c b/libavcodec/dvd_nav_parser.c
new file mode 100644
index 0000000..6e2352d
--- /dev/null
+++ b/libavcodec/dvd_nav_parser.c
@@ -0,0 +1,115 @@
+/*
+ * DVD navigation block parser for FFmpeg
+ * Copyright (c) 2013 The FFmpeg Project
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "avcodec.h"
+#include "get_bits.h"
+#include "parser.h"
+
+#define PCI_SIZE  980 /* byte size a PCI packet must have to be accepted */
+#define DSI_SIZE 1018 /* byte size a DSI packet must have to be accepted */
+
+/* parser definition */
+typedef struct DVDNavParseContext {
+    uint32_t     lba;      /* lba of the stashed PCI packet, 0xFFFFFFFF if none */
+    uint8_t      buffer[PCI_SIZE+DSI_SIZE]; /* PCI packet followed by its DSI packet */
+    int          copied;   /* bytes of buffer in use: 0 or PCI_SIZE */
+} DVDNavParseContext;
+
+/* Start with no PCI packet pending: nothing copied and lba set to all ones. */
+static av_cold int dvd_nav_parse_init(AVCodecParserContext *s)
+{
+    DVDNavParseContext *state = s->priv_data;
+    state->copied = 0;
+    state->lba    = 0xFFFFFFFF;
+    return 0;
+}
+
+/* Pair each PCI packet with the DSI packet that follows it: a PCI packet whose
+ * end pts exceeds its start pts is stashed, and when a DSI packet carrying the
+ * same lba arrives both are returned as one PCI_SIZE+DSI_SIZE buffer. Any other
+ * input clears the stash and yields no output. Always returns buf_size. */
+static int dvd_nav_parse(AVCodecParserContext *s,
+                         AVCodecContext *avctx,
+                         const uint8_t **poutbuf, int *poutbuf_size,
+                         const uint8_t *buf, int buf_size)
+{
+    DVDNavParseContext *nav = s->priv_data;
+    int have_data = buf && buf_size;
+    int accepted  = 0; /* packet was a usable PCI or the matching DSI */
+    int complete  = 0; /* a PCI+DSI pair is ready to be output */
+
+    s->pict_type = AV_PICTURE_TYPE_NONE;
+
+    /* Time base of 1/90000 for the pts and duration set below. */
+    avctx->time_base.num = 1;
+    avctx->time_base.den = 90000;
+
+    if (have_data && buf[0] == 0x00 && buf_size == PCI_SIZE) {
+        /* PCI */
+        uint32_t pci_lba   = AV_RB32(buf + 0x01);
+        uint32_t pts_start = AV_RB32(buf + 0x0D);
+        uint32_t pts_end   = AV_RB32(buf + 0x11);
+
+        /* Only keep packets whose end time lies after their start time. */
+        if (pts_end > pts_start) {
+            memcpy(nav->buffer, buf, PCI_SIZE);
+            nav->copied = PCI_SIZE;
+            nav->lba    = pci_lba;
+
+            s->pts      = (int64_t)pts_start;
+            s->duration = pts_end - pts_start;
+            accepted    = 1;
+        }
+    } else if (have_data && buf[0] == 0x01 &&
+               buf_size == DSI_SIZE && nav->copied == PCI_SIZE) {
+        /* DSI */
+        uint32_t dsi_lba = AV_RB32(buf + 0x05);
+
+        /* It must reference the same lba as the stashed PCI packet. */
+        if (dsi_lba == nav->lba) {
+            memcpy(nav->buffer + PCI_SIZE, buf, DSI_SIZE);
+            accepted = 1;
+            complete = 1;
+        }
+    }
+
+    /* Drop the pending PCI after emitting a pair or seeing anything unusable. */
+    if (complete || !accepted) {
+        nav->lba    = 0xFFFFFFFF;
+        nav->copied = 0;
+    }
+
+    if (complete) {
+        *poutbuf      = nav->buffer;
+        *poutbuf_size = sizeof(nav->buffer);
+    } else {
+        *poutbuf      = NULL;
+        *poutbuf_size = 0;
+    }
+
+    return buf_size;
+}
+
+AVCodecParser ff_dvd_nav_parser = { /* parser registration for AV_CODEC_ID_DVD_NAV */
+    .codec_ids      = { AV_CODEC_ID_DVD_NAV },
+    .priv_data_size = sizeof(DVDNavParseContext), /* per-parser state used by the callbacks below */
+    .parser_init    = dvd_nav_parse_init,
+    .parser_parse   = dvd_nav_parse,
+};
diff --git a/libavcodec/dvdata.c b/libavcodec/dvdata.c
index 985cda7..231569a 100644
--- a/libavcodec/dvdata.c
+++ b/libavcodec/dvdata.c
@@ -2,20 +2,20 @@
  * Constants for DV codec
  * Copyright (c) 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -69,71 +69,6 @@ const uint8_t ff_dv_quant_shifts[22][4] = {
 
 const uint8_t ff_dv_quant_offset[4] = { 6, 3, 0, 1 };
 
-const int ff_dv_iweight_88[64] = {
-    32768, 16710, 16710, 17735, 17015, 17735, 18197, 18079,
-    18079, 18197, 18725, 18559, 19196, 18559, 18725, 19284,
-    19108, 19692, 19692, 19108, 19284, 21400, 19645, 20262,
-    20214, 20262, 19645, 21400, 22733, 21845, 20867, 20815,
-    20815, 20867, 21845, 22733, 23173, 23173, 21400, 21400,
-    21400, 23173, 23173, 24600, 23764, 22017, 22017, 23764,
-    24600, 25267, 24457, 22672, 24457, 25267, 25971, 25191,
-    25191, 25971, 26715, 27962, 26715, 29642, 29642, 31536,
-};
-const int ff_dv_iweight_248[64] = {
-    32768, 17735, 16710, 18079, 18725, 21400, 17735, 19196,
-    19108, 21845, 16384, 17735, 18725, 21400, 16710, 18079,
-    20262, 23173, 18197, 19692, 18725, 20262, 20815, 23764,
-    17735, 19196, 19108, 21845, 20262, 23173, 18197, 19692,
-    21400, 24457, 19284, 20867, 21400, 23173, 22017, 25191,
-    18725, 20262, 20815, 23764, 21400, 24457, 19284, 20867,
-    24457, 27962, 22733, 24600, 25971, 29642, 21400, 23173,
-    22017, 25191, 24457, 27962, 22733, 24600, 25971, 29642,
-};
-
-/**
- * The "inverse" DV100 weights are actually just the spec weights (zig-zagged).
- */
-const int ff_dv_iweight_1080_y[64] = {
-    128,  16,  16,  17,  17,  17,  18,  18,
-     18,  18,  18,  18,  19,  18,  18,  19,
-     19,  19,  19,  19,  19,  42,  38,  40,
-     40,  40,  38,  42,  44,  43,  41,  41,
-     41,  41,  43,  44,  45,  45,  42,  42,
-     42,  45,  45,  48,  46,  43,  43,  46,
-     48,  49,  48,  44,  48,  49, 101,  98,
-     98, 101, 104, 109, 104, 116, 116, 123,
-};
-const int ff_dv_iweight_1080_c[64] = {
-    128,  16,  16,  17,  17,  17,  25,  25,
-     25,  25,  26,  25,  26,  25,  26,  26,
-     26,  27,  27,  26,  26,  42,  38,  40,
-     40,  40,  38,  42,  44,  43,  41,  41,
-     41,  41,  43,  44,  91,  91,  84,  84,
-     84,  91,  91,  96,  93,  86,  86,  93,
-     96, 197, 191, 177, 191, 197, 203, 197,
-    197, 203, 209, 219, 209, 232, 232, 246,
-};
-const int ff_dv_iweight_720_y[64] = {
-    128,  16,  16,  17,  17,  17,  18,  18,
-     18,  18,  18,  18,  19,  18,  18,  19,
-     19,  19,  19,  19,  19,  42,  38,  40,
-     40,  40,  38,  42,  44,  43,  41,  41,
-     41,  41,  43,  44,  68,  68,  63,  63,
-     63,  68,  68,  96,  92,  86,  86,  92,
-     96,  98,  96,  88,  96,  98, 202, 196,
-    196, 202, 208, 218, 208, 232, 232, 246,
-};
-const int ff_dv_iweight_720_c[64] = {
-    128,  24,  24,  26,  26,  26,  36,  36,
-     36,  36,  36,  36,  38,  36,  36,  38,
-     38,  38,  38,  38,  38,  84,  76,  80,
-     80,  80,  76,  84,  88,  86,  82,  82,
-     82,  82,  86,  88, 182, 182, 168, 168,
-    168, 182, 182, 192, 186, 192, 172, 186,
-    192, 394, 382, 354, 382, 394, 406, 394,
-    394, 406, 418, 438, 418, 464, 464, 492,
-};
-
 /*
  * There's a catch about the following three tables: the mapping they establish
  * between (run, level) and vlc is not 1-1. So you have to watch out for that
diff --git a/libavcodec/dvdata.h b/libavcodec/dvdata.h
index 8e7c0fb..e0ed043 100644
--- a/libavcodec/dvdata.h
+++ b/libavcodec/dvdata.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,13 +26,6 @@ extern const uint8_t ff_dv_zigzag248_direct[64];
 extern const uint8_t ff_dv_quant_shifts[22][4];
 extern const uint8_t ff_dv_quant_offset[4];
 
-extern const int ff_dv_iweight_88[64];
-extern const int ff_dv_iweight_248[64];
-extern const int ff_dv_iweight_1080_y[64];
-extern const int ff_dv_iweight_1080_c[64];
-extern const int ff_dv_iweight_720_y[64];
-extern const int ff_dv_iweight_720_c[64];
-
 #define NB_DV_VLC 409
 
 extern const uint16_t ff_dv_vlc_bits[NB_DV_VLC];
diff --git a/libavcodec/dvdec.c b/libavcodec/dvdec.c
index 463d108..0b4c1bc 100644
--- a/libavcodec/dvdec.c
+++ b/libavcodec/dvdec.c
@@ -13,20 +13,20 @@
  * Many thanks to Dan Dennedy <dan@dennedy.org> for providing wealth
  * of DV technical info.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,12 +35,14 @@
  * DV decoder
  */
 
+#include "libavutil/avassert.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/internal.h"
 #include "libavutil/pixdesc.h"
 
 #include "avcodec.h"
 #include "dv.h"
+#include "dv_profile_internal.h"
 #include "dvdata.h"
 #include "get_bits.h"
 #include "idctdsp.h"
@@ -60,18 +62,136 @@ typedef struct BlockInfo {
 
 static const int dv_iweight_bits = 14;
 
+static const uint16_t dv_iweight_88[64] = { /* weights for pass j == 0 of the non-HD branch of dv_init_weight_tables() */
+    32768, 16705, 16705, 17734, 17032, 17734, 18205, 18081,
+    18081, 18205, 18725, 18562, 19195, 18562, 18725, 19266,
+    19091, 19705, 19705, 19091, 19266, 21407, 19643, 20267,
+    20228, 20267, 19643, 21407, 22725, 21826, 20853, 20806,
+    20806, 20853, 21826, 22725, 23170, 23170, 21407, 21400,
+    21407, 23170, 23170, 24598, 23786, 22018, 22018, 23786,
+    24598, 25251, 24465, 22654, 24465, 25251, 25972, 25172,
+    25172, 25972, 26722, 27969, 26722, 29692, 29692, 31521,
+};
+static const uint16_t dv_iweight_248[64] = { /* weights for pass j == 1 of that same non-HD branch */
+    32768, 16384, 16705, 16705, 17734, 17734, 17734, 17734,
+    18081, 18081, 18725, 18725, 21407, 21407, 19091, 19091,
+    19195, 19195, 18205, 18205, 18725, 18725, 19705, 19705,
+    20267, 20267, 21826, 21826, 23170, 23170, 20806, 20806,
+    20267, 20267, 19266, 19266, 21407, 21407, 20853, 20853,
+    21400, 21400, 23786, 23786, 24465, 24465, 22018, 22018,
+    23170, 23170, 22725, 22725, 24598, 24598, 24465, 24465,
+    25172, 25172, 27969, 27969, 25972, 25972, 29692, 29692
+};
+
+/**
+ * The "inverse" DV100 weights are actually just the spec weights (zig-zagged).
+ */
+static const uint16_t dv_iweight_1080_y[64] = { /* iweight1 in dv_init_weight_tables() for HD profiles with height != 720 */
+    128,  16,  16,  17,  17,  17,  18,  18,
+     18,  18,  18,  18,  19,  18,  18,  19,
+     19,  19,  19,  19,  19,  42,  38,  40,
+     40,  40,  38,  42,  44,  43,  41,  41,
+     41,  41,  43,  44,  45,  45,  42,  42,
+     42,  45,  45,  48,  46,  43,  43,  46,
+     48,  49,  48,  44,  48,  49, 101,  98,
+     98, 101, 104, 109, 104, 116, 116, 123,
+};
+static const uint16_t dv_iweight_1080_c[64] = { /* iweight2 for HD profiles with height != 720 */
+    128,  16,  16,  17,  17,  17,  25,  25,
+     25,  25,  26,  25,  26,  25,  26,  26,
+     26,  27,  27,  26,  26,  42,  38,  40,
+     40,  40,  38,  42,  44,  43,  41,  41,
+     41,  41,  43,  44,  91,  91,  84,  84,
+     84,  91,  91,  96,  93,  86,  86,  93,
+     96, 197, 191, 177, 191, 197, 203, 197,
+    197, 203, 209, 219, 209, 232, 232, 246,
+};
+static const uint16_t dv_iweight_720_y[64] = { /* iweight1 for HD profiles with height == 720 */
+    128,  16,  16,  17,  17,  17,  18,  18,
+     18,  18,  18,  18,  19,  18,  18,  19,
+     19,  19,  19,  19,  19,  42,  38,  40,
+     40,  40,  38,  42,  44,  43,  41,  41,
+     41,  41,  43,  44,  68,  68,  63,  63,
+     63,  68,  68,  96,  92,  86,  86,  92,
+     96,  98,  96,  88,  96,  98, 202, 196,
+    196, 202, 208, 218, 208, 232, 232, 246,
+};
+static const uint16_t dv_iweight_720_c[64] = { /* iweight2 for HD profiles with height == 720 */
+    128,  24,  24,  26,  26,  26,  36,  36,
+     36,  36,  36,  36,  38,  36,  36,  38,
+     38,  38,  38,  38,  38,  84,  76,  80,
+     80,  80,  76,  84,  88,  86,  82,  82,
+     82,  82,  86,  88, 182, 182, 168, 168,
+    168, 182, 182, 192, 186, 192, 172, 186,
+    192, 394, 382, 354, 382, 394, 406, 394,
+    394, 406, 418, 438, 418, 464, 464, 492,
+};
+/* Fill ctx->idct_factor from the inverse-weight tables and the quantizer steps/shifts for profile d. */
+static void dv_init_weight_tables(DVVideoContext *ctx, const AVDVProfile *d)
+{
+    int j, i, c, s;
+    uint32_t *factor1 = &ctx->idct_factor[0],
+             *factor2 = &ctx->idct_factor[DV_PROFILE_IS_HD(d) ? 4096 : 2816];
+    /* factor2 starts where factor1's region ends: each pointer gets 4096 (4*16*64) entries for HD, 2816 (2*22*64) otherwise */
+    if (DV_PROFILE_IS_HD(d)) {
+        /* quantization quanta by QNO for DV100 */
+        static const uint8_t dv100_qstep[16] = {
+            1, /* QNO = 0 and 1 both have no quantization */
+            1,
+            2, 3, 4, 5, 6, 7, 8, 16, 18, 20, 22, 24, 28, 52
+        };
+        const uint16_t *iweight1, *iweight2;
+        /* 720-line profiles use the 720 tables; every other HD profile uses the 1080 ones */
+        if (d->height == 720) {
+            iweight1 = &dv_iweight_720_y[0];
+            iweight2 = &dv_iweight_720_c[0];
+        } else {
+            iweight1 = &dv_iweight_1080_y[0];
+            iweight2 = &dv_iweight_1080_c[0];
+        }
+        for (c = 0; c < 4; c++) { /* c only changes the scale: the step is shifted left by c + 9 */
+            for (s = 0; s < 16; s++) {
+                for (i = 0; i < 64; i++) {
+                    *factor1++ = (dv100_qstep[s] << (c + 9)) * iweight1[i];
+                    *factor2++ = (dv100_qstep[s] << (c + 9)) * iweight2[i];
+                }
+            }
+        }
+    } else {
+        static const uint8_t dv_quant_areas[4] = { 6, 21, 43, 64 }; /* one-past-the-end coefficient index of each of the 4 areas */
+        const uint16_t *iweight1 = &dv_iweight_88[0];
+        for (j = 0; j < 2; j++, iweight1 = &dv_iweight_248[0]) { /* pass 0: 88 weights, pass 1: 248 weights */
+            for (s = 0; s < 22; s++) {
+                for (i = c = 0; c < 4; c++) {
+                    for (; i < dv_quant_areas[c]; i++) {
+                        *factor1   = iweight1[i] << (ff_dv_quant_shifts[s][c] + 1);
+                        *factor2++ = (*factor1++) << 1; /* second region holds the same factor doubled */
+                    }
+                }
+            }
+        }
+    }
+}
+
 static av_cold int dvvideo_decode_init(AVCodecContext *avctx)
 {
     DVVideoContext *s = avctx->priv_data;
     IDCTDSPContext idsp;
     int i;
 
+    memset(&idsp,0, sizeof(idsp));
     ff_idctdsp_init(&idsp, avctx);
 
     for (i = 0; i < 64; i++)
         s->dv_zigzag[0][i] = idsp.idct_permutation[ff_zigzag_direct[i]];
 
-    memcpy(s->dv_zigzag[1], ff_dv_zigzag248_direct, sizeof(s->dv_zigzag[1]));
+    if (avctx->lowres){
+        for (i = 0; i < 64; i++){
+            int j = ff_dv_zigzag248_direct[i];
+            s->dv_zigzag[1][i] = idsp.idct_permutation[(j & 7) + (j & 8) * 4 + (j & 48) / 2];
+        }
+    }else
+        memcpy(s->dv_zigzag[1], ff_dv_zigzag248_direct, sizeof(s->dv_zigzag[1]));
 
     s->idct_put[0] = idsp.idct_put;
     s->idct_put[1] = ff_simple_idct248_put;
@@ -169,11 +289,17 @@ static int dv_decode_video_segment(AVCodecContext *avctx, void *arg)
     LOCAL_ALIGNED_16(int16_t, sblock, [5 * DV_MAX_BPM], [64]);
     LOCAL_ALIGNED_16(uint8_t, mb_bit_buffer, [80     + AV_INPUT_BUFFER_PADDING_SIZE]); /* allow some slack */
     LOCAL_ALIGNED_16(uint8_t, vs_bit_buffer, [80 * 5 + AV_INPUT_BUFFER_PADDING_SIZE]); /* allow some slack */
-    const int log2_blocksize = 3;
+    const int log2_blocksize = 3-s->avctx->lowres;
     int is_field_mode[5];
+    int vs_bit_buffer_damaged = 0;
+    int mb_bit_buffer_damaged[5] = {0};
+    int retried = 0;
+    int sta;
+
+    av_assert1((((int) mb_bit_buffer) & 7) == 0);
+    av_assert1((((int) vs_bit_buffer) & 7) == 0);
 
-    assert((((int) mb_bit_buffer) & 7) == 0);
-    assert((((int) vs_bit_buffer) & 7) == 0);
+retry:
 
     memset(sblock, 0, 5 * DV_MAX_BPM * sizeof(*sblock));
 
@@ -185,6 +311,14 @@ static int dv_decode_video_segment(AVCodecContext *avctx, void *arg)
     for (mb_index = 0; mb_index < 5; mb_index++, mb1 += s->sys->bpm, block1 += s->sys->bpm * 64) {
         /* skip header */
         quant    = buf_ptr[3] & 0x0f;
+        if (avctx->error_concealment) {
+            if ((buf_ptr[3] >> 4) == 0x0E)
+                vs_bit_buffer_damaged = 1;
+            if (!mb_index) {
+                sta = buf_ptr[3] >> 4;
+            } else if (sta != (buf_ptr[3] >> 4))
+                vs_bit_buffer_damaged = 1;
+        }
         buf_ptr += 4;
         init_put_bits(&pb, mb_bit_buffer, 80);
         mb    = mb1;
@@ -213,7 +347,7 @@ static int dv_decode_video_segment(AVCodecContext *avctx, void *arg)
                                     dct_mode                        * 22 * 64 +
                                     (quant + ff_dv_quant_offset[class1]) * 64];
             }
-            dc = dc << 2;
+            dc = dc * 4;
             /* convert to unsigned because 128 is not added in the
              * standard IDCT */
             dc                   += 1024;
@@ -229,11 +363,16 @@ static int dv_decode_video_segment(AVCodecContext *avctx, void *arg)
              * block is finished */
             if (mb->pos >= 64)
                 bit_copy(&pb, &gb);
+            if (mb->pos >= 64 && mb->pos < 127)
+                vs_bit_buffer_damaged = mb_bit_buffer_damaged[mb_index] = 1;
 
             block += 64;
             mb++;
         }
 
+        if (mb_bit_buffer_damaged[mb_index] > 0)
+            continue;
+
         /* pass 2: we can do it just after */
         ff_dlog(avctx, "***pass 2 size=%d MB#=%d\n", put_bits_count(&pb), mb_index);
         block = block1;
@@ -247,6 +386,8 @@ static int dv_decode_video_segment(AVCodecContext *avctx, void *arg)
                 /* if still not finished, no need to parse other blocks */
                 if (mb->pos < 64)
                     break;
+                if (mb->pos < 127)
+                    vs_bit_buffer_damaged = mb_bit_buffer_damaged[mb_index] = 1;
             }
         }
         /* all blocks are finished, so the extra bytes can be used at
@@ -264,17 +405,25 @@ static int dv_decode_video_segment(AVCodecContext *avctx, void *arg)
     flush_put_bits(&vs_pb);
     for (mb_index = 0; mb_index < 5; mb_index++) {
         for (j = 0; j < s->sys->bpm; j++) {
-            if (mb->pos < 64) {
+            if (mb->pos < 64 && get_bits_left(&gb) > 0 && !vs_bit_buffer_damaged) {
                 ff_dlog(avctx, "start %d:%d\n", mb_index, j);
                 dv_decode_ac(&gb, mb, block);
             }
-            if (mb->pos >= 64 && mb->pos < 127)
+
+            if (mb->pos >= 64 && mb->pos < 127) {
                 av_log(avctx, AV_LOG_ERROR,
                        "AC EOB marker is absent pos=%d\n", mb->pos);
+                vs_bit_buffer_damaged = 1;
+            }
             block += 64;
             mb++;
         }
     }
+    if (vs_bit_buffer_damaged && !retried) {
+        av_log(avctx, AV_LOG_ERROR, "Concealing bitstream errors\n");
+        retried = 1;
+        goto retry;
+    }
 
     /* compute idct and place blocks */
     block = &sblock[0][0];
@@ -317,9 +466,9 @@ static int dv_decode_video_segment(AVCodecContext *avctx, void *arg)
                 int x, y;
                 mb->idct_put(pixels, 8, block);
                 for (y = 0; y < (1 << log2_blocksize); y++, c_ptr += s->frame->linesize[j], pixels += 8) {
-                    ptr1   = pixels + (1 << (log2_blocksize - 1));
+                    ptr1   = pixels + ((1 << (log2_blocksize))>>1);
                     c_ptr1 = c_ptr + (s->frame->linesize[j] << log2_blocksize);
-                    for (x = 0; x < (1 << (log2_blocksize - 1)); x++) {
+                    for (x = 0; x < (1 << FFMAX(log2_blocksize - 1, 0)); x++) {
                         c_ptr[x]  = pixels[x];
                         c_ptr1[x] = ptr1[x];
                     }
@@ -355,7 +504,7 @@ static int dvvideo_decode_frame(AVCodecContext *avctx, void *data,
     int apt, is16_9, ret;
     const AVDVProfile *sys;
 
-    sys = av_dv_frame_profile(s->sys, buf, buf_size);
+    sys = ff_dv_frame_profile(avctx, s->sys, buf, buf_size);
     if (!sys || buf_size < sys->frame_size) {
         av_log(avctx, AV_LOG_ERROR, "could not find dv frame profile\n");
         return -1; /* NOTE: we only accept several full frames */
@@ -367,6 +516,7 @@ static int dvvideo_decode_frame(AVCodecContext *avctx, void *data,
             av_log(avctx, AV_LOG_ERROR, "Error initializing the work tables.\n");
             return ret;
         }
+        dv_init_weight_tables(s, sys);
         s->sys = sys;
     }
 
@@ -389,13 +539,16 @@ static int dvvideo_decode_frame(AVCodecContext *avctx, void *data,
         ff_set_sar(avctx, s->sys->sar[is16_9]);
     }
 
-    if (ff_get_buffer(avctx, frame, 0) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return -1;
-    }
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
     frame->interlaced_frame = 1;
     frame->top_field_first  = 0;
 
+    /* Determine the codec's field order from the packet */
+    if ( *vsc_pack == dv_video_control ) {
+        frame->top_field_first = !(vsc_pack[3] & 0x40);
+    }
+
     s->buf = buf;
     avctx->execute(avctx, dv_decode_video_segment, s->work_chunks, NULL,
                    dv_work_pool_size(s->sys), sizeof(DVwork_chunk));
@@ -417,4 +570,5 @@ AVCodec ff_dvvideo_decoder = {
     .init           = dvvideo_decode_init,
     .decode         = dvvideo_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS,
+    .max_lowres     = 3,
 };
diff --git a/libavcodec/dvdsub_parser.c b/libavcodec/dvdsub_parser.c
index 2ad3b33..32a945e 100644
--- a/libavcodec/dvdsub_parser.c
+++ b/libavcodec/dvdsub_parser.c
@@ -2,20 +2,20 @@
  * DVD subtitle decoding
  * Copyright (c) 2005 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -45,8 +45,11 @@ static int dvdsub_parse(AVCodecParserContext *s,
     DVDSubParseContext *pc = s->priv_data;
 
     if (pc->packet_index == 0) {
-        if (buf_size < 2)
-            return 0;
+        if (buf_size < 2 || AV_RB16(buf) && buf_size < 6) {
+            if (buf_size)
+                av_log(avctx, AV_LOG_DEBUG, "Parser input %d too small\n", buf_size);
+            return buf_size;
+        }
         pc->packet_len = AV_RB16(buf);
         if (pc->packet_len == 0) /* HD-DVD subpicture packet */
             pc->packet_len = AV_RB32(buf+2);
diff --git a/libavcodec/dvdsubdec.c b/libavcodec/dvdsubdec.c
index da1a83f..19f25f0 100644
--- a/libavcodec/dvdsubdec.c
+++ b/libavcodec/dvdsubdec.c
@@ -2,20 +2,20 @@
  * DVD subtitle decoding
  * Copyright (c) 2005 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,12 +25,26 @@
 
 #include "libavutil/attributes.h"
 #include "libavutil/colorspace.h"
+#include "libavutil/opt.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/avstring.h"
+#include "libavutil/bswap.h"
 
-typedef struct DVDSubContext {
-    uint32_t palette[16];
-    int      has_palette;
+typedef struct DVDSubContext
+{
+  AVClass *class;           /* presumably for AVOptions (libavutil/opt.h is newly included) -- confirm */
+  uint32_t palette[16];     /* used by guess_palette() only when has_palette is set */
+  char    *palette_str;     /* NOTE(review): usage not visible in this hunk -- confirm */
+  char    *ifo_str;         /* NOTE(review): usage not visible in this hunk -- confirm */
+  int      has_palette;
+  uint8_t  colormap[4];     /* indexes into palette[]; formerly a local of decode_dvd_subtitles() */
+  uint8_t  alpha[256];      /* guess_palette() scales entries 0..3 by 17 into the top byte; formerly a local too */
+  uint8_t  buf[0x10000];    /* NOTE(review): usage not visible in this hunk -- confirm */
+  int      buf_size;        /* presumably the number of valid bytes in buf -- confirm */
+  int      forced_subs_only;
+#ifdef DEBUG
+  int sub_id;
+#endif
 } DVDSubContext;
 
 static void yuv_a_to_rgba(const uint8_t *ycbcr, const uint8_t *alpha, uint32_t *rgba, int num_values)
@@ -94,6 +108,12 @@ static int decode_rle(uint8_t *bitmap, int linesize, int w, int h,
     int x, y, len, color;
     uint8_t *d;
 
+    if (start >= buf_size)
+        return -1;
+
+    if (w <= 0 || h <= 0)
+        return -1;
+
     bit_len = (buf_size - start) * 8;
     init_get_bits(&gb, buf + start, bit_len);
 
@@ -125,17 +145,24 @@ static int decode_rle(uint8_t *bitmap, int linesize, int w, int h,
 
 static void guess_palette(DVDSubContext* ctx,
                           uint32_t *rgba_palette,
-                          uint8_t *colormap,
-                          uint8_t *alpha,
                           uint32_t subtitle_color)
 {
+    static const uint8_t level_map[4][4] = {
+        // this configuration (full range, lowest to highest) in tests
+        // seemed most common, so assume this
+        {0xff},
+        {0x00, 0xff},
+        {0x00, 0x80, 0xff},
+        {0x00, 0x55, 0xaa, 0xff},
+    };
     uint8_t color_used[16] = { 0 };
     int nb_opaque_colors, i, level, j, r, g, b;
+    uint8_t *colormap = ctx->colormap, *alpha = ctx->alpha;
 
-    if (ctx->has_palette) {
-        for (i = 0; i < 4; i++)
+    if(ctx->has_palette) {
+        for(i = 0; i < 4; i++)
             rgba_palette[i] = (ctx->palette[colormap[i]] & 0x00ffffff)
-                              | ((alpha[i] * 17) << 24);
+                              | ((alpha[i] * 17U) << 24);
         return;
     }
 
@@ -153,18 +180,18 @@ static void guess_palette(DVDSubContext* ctx,
     if (nb_opaque_colors == 0)
         return;
 
-    j = nb_opaque_colors;
+    j = 0;
     memset(color_used, 0, 16);
     for(i = 0; i < 4; i++) {
         if (alpha[i] != 0) {
             if (!color_used[colormap[i]])  {
-                level = (0xff * j) / nb_opaque_colors;
+                level = level_map[nb_opaque_colors - 1][j]; /* row n-1 is the ramp for n opaque colors; row n would be the wrong ramp or out of bounds */
                 r = (((subtitle_color >> 16) & 0xff) * level) >> 8;
                 g = (((subtitle_color >> 8) & 0xff) * level) >> 8;
                 b = (((subtitle_color >> 0) & 0xff) * level) >> 8;
                 rgba_palette[i] = b | (g << 8) | (r << 16) | ((alpha[i] * 17) << 24);
                 color_used[colormap[i]] = (i + 1);
-                j--;
+                j++;
             } else {
                 rgba_palette[i] = (rgba_palette[color_used[colormap[i]] - 1] & 0x00ffffff) |
                                     ((alpha[i] * 17) << 24);
@@ -173,6 +200,21 @@ static void guess_palette(DVDSubContext* ctx,
     }
 }
 
+/* Free each rect's data[0], data[1] and the rect itself, then the rects array; zero num_rects. */
+static void reset_rects(AVSubtitle *sub_header)
+{
+    int n;
+    if (!sub_header->rects)
+        return;
+    for (n = 0; n < sub_header->num_rects; n++) {
+        av_freep(&sub_header->rects[n]->data[0]);
+        av_freep(&sub_header->rects[n]->data[1]);
+        av_freep(&sub_header->rects[n]);
+    }
+    av_freep(&sub_header->rects);
+    sub_header->num_rects = 0;
+}
+
 #define READ_OFFSET(a) (big_offsets ? AV_RB32(a) : AV_RB16(a))
 
 static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
@@ -180,16 +222,16 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
 {
     int cmd_pos, pos, cmd, x1, y1, x2, y2, next_cmd_pos;
     int big_offsets, offset_size, is_8bit = 0;
-    const uint8_t *yuv_palette = 0;
-    uint8_t colormap[4] = { 0 }, alpha[256] = { 0 };
+    const uint8_t *yuv_palette = NULL;
+    uint8_t *colormap = ctx->colormap, *alpha = ctx->alpha;
     int date;
     int i;
     int is_menu = 0;
+    uint32_t size;
     int64_t offset1, offset2;
 
     if (buf_size < 10)
         return -1;
-    memset(sub_header, 0, sizeof(*sub_header));
 
     if (AV_RB16(buf) == 0) {   /* HD subpicture with 4-byte offsets */
         big_offsets = 1;
@@ -201,8 +243,17 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
         cmd_pos = 2;
     }
 
+    size = READ_OFFSET(buf + (big_offsets ? 2 : 0));
     cmd_pos = READ_OFFSET(buf + cmd_pos);
 
+    if (cmd_pos < 0 || cmd_pos > buf_size - 2 - offset_size) {
+        if (cmd_pos > size) {
+            av_log(ctx, AV_LOG_ERROR, "Discarding invalid packet\n");
+            return 0;
+        }
+        return AVERROR(EAGAIN);
+    }
+
     while (cmd_pos > 0 && cmd_pos < buf_size - 2 - offset_size) {
         date = AV_RB16(buf + cmd_pos);
         next_cmd_pos = READ_OFFSET(buf + cmd_pos + 2);
@@ -247,7 +298,7 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
                 alpha[1] = buf[pos + 1] >> 4;
                 alpha[0] = buf[pos + 1] & 0x0f;
                 pos += 2;
-            ff_dlog(NULL, "alpha=%x%x%x%x\n", alpha[0],alpha[1],alpha[2],alpha[3]);
+                ff_dlog(NULL, "alpha=%x%x%x%x\n", alpha[0],alpha[1],alpha[2],alpha[3]);
                 break;
             case 0x05:
             case 0x85:
@@ -267,7 +318,7 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
                     goto fail;
                 offset1 = AV_RB16(buf + pos);
                 offset2 = AV_RB16(buf + pos + 2);
-                ff_dlog(NULL, "offset1=0x%04x offset2=0x%04x\n", offset1, offset2);
+                ff_dlog(NULL, "offset1=0x%04"PRIx64" offset2=0x%04"PRIx64"\n", offset1, offset2);
                 pos += 4;
                 break;
             case 0x86:
@@ -275,7 +326,7 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
                     goto fail;
                 offset1 = AV_RB32(buf + pos);
                 offset2 = AV_RB32(buf + pos + 4);
-                ff_dlog(NULL, "offset1=0x%04x offset2=0x%04x\n", offset1, offset2);
+                ff_dlog(NULL, "offset1=0x%04"PRIx64" offset2=0x%04"PRIx64"\n", offset1, offset2);
                 pos += 8;
                 break;
 
@@ -306,7 +357,7 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
         if (offset1 >= buf_size || offset2 >= buf_size)
             goto fail;
 
-        if (offset1 >= 0) {
+        if (offset1 >= 0 && offset2 >= 0) {
             int w, h;
             uint8_t *bitmap;
 
@@ -314,21 +365,11 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
             w = x2 - x1 + 1;
             if (w < 0)
                 w = 0;
-            h = y2 - y1;
+            h = y2 - y1 + 1;
             if (h < 0)
                 h = 0;
             if (w > 0 && h > 0) {
-                int j;
-                AVSubtitleRect *rect;
-                if (sub_header->rects) {
-                    for (i = 0; i < sub_header->num_rects; i++) {
-                        av_freep(&sub_header->rects[i]->data[0]);
-                        av_freep(&sub_header->rects[i]->data[1]);
-                        av_freep(&sub_header->rects[i]);
-                    }
-                    av_freep(&sub_header->rects);
-                    sub_header->num_rects = 0;
-                }
+                reset_rects(sub_header);
 
                 sub_header->rects = av_mallocz(sizeof(*sub_header->rects));
                 if (!sub_header->rects)
@@ -340,15 +381,17 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
                 bitmap = sub_header->rects[0]->data[0] = av_malloc(w * h);
                 if (!bitmap)
                     goto fail;
-                decode_rle(bitmap, w * 2, w, (h + 1) / 2,
-                           buf, offset1, buf_size, is_8bit);
-                decode_rle(bitmap + w, w * 2, w, h / 2,
-                           buf, offset2, buf_size, is_8bit);
+                if (decode_rle(bitmap, w * 2, w, (h + 1) / 2,
+                               buf, offset1, buf_size, is_8bit) < 0)
+                    goto fail;
+                if (decode_rle(bitmap + w, w * 2, w, h / 2,
+                               buf, offset2, buf_size, is_8bit) < 0)
+                    goto fail;
                 sub_header->rects[0]->data[1] = av_mallocz(AVPALETTE_SIZE);
                 if (!sub_header->rects[0]->data[1])
                     goto fail;
                 if (is_8bit) {
-                    if (yuv_palette == 0)
+                    if (!yuv_palette)
                         goto fail;
                     sub_header->rects[0]->nb_colors = 256;
                     yuv_a_to_rgba(yuv_palette, alpha,
@@ -356,9 +399,8 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
                                   256);
                 } else {
                     sub_header->rects[0]->nb_colors = 4;
-                    guess_palette(ctx,
-                                  (uint32_t *)sub_header->rects[0]->data[1],
-                                  colormap, alpha, 0xffff00);
+                    guess_palette(ctx, (uint32_t*)sub_header->rects[0]->data[1],
+                                  0xffff00);
                 }
                 sub_header->rects[0]->x = x1;
                 sub_header->rects[0]->y = y1;
@@ -366,18 +408,22 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
                 sub_header->rects[0]->h = h;
                 sub_header->rects[0]->type = SUBTITLE_BITMAP;
                 sub_header->rects[0]->linesize[0] = w;
+                sub_header->rects[0]->flags = is_menu ? AV_SUBTITLE_FLAG_FORCED : 0;
 
 #if FF_API_AVPICTURE
 FF_DISABLE_DEPRECATION_WARNINGS
-                rect = sub_header->rects[0];
-                for (j = 0; j < 4; j++) {
-                    rect->pict.data[j] = rect->data[j];
-                    rect->pict.linesize[j] = rect->linesize[j];
+                for (i = 0; i < 4; i++) {
+                    sub_header->rects[0]->pict.data[i] = sub_header->rects[0]->data[i];
+                    sub_header->rects[0]->pict.linesize[i] = sub_header->rects[0]->linesize[i];
                 }
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
             }
         }
+        if (next_cmd_pos < cmd_pos) {
+            av_log(ctx, AV_LOG_ERROR, "Invalid command offset\n");
+            break;
+        }
         if (next_cmd_pos == cmd_pos)
             break;
         cmd_pos = next_cmd_pos;
@@ -385,15 +431,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
     if (sub_header->num_rects > 0)
         return is_menu;
  fail:
-    if (!sub_header->rects) {
-        for (i = 0; i < sub_header->num_rects; i++) {
-            av_freep(&sub_header->rects[i]->data[0]);
-            av_freep(&sub_header->rects[i]->data[1]);
-            av_freep(&sub_header->rects[i]);
-        }
-        av_freep(&sub_header->rects);
-        sub_header->num_rects = 0;
-    }
+    reset_rects(sub_header);
     return -1;
 }
 
@@ -460,20 +498,33 @@ static int find_smallest_bounding_rectangle(AVSubtitle *s)
     s->rects[0]->h = h;
     s->rects[0]->x += x1;
     s->rects[0]->y += y1;
+
+#if FF_API_AVPICTURE
+FF_DISABLE_DEPRECATION_WARNINGS
+    for (i = 0; i < 4; i++) {
+        s->rects[0]->pict.data[i] = s->rects[0]->data[i];
+        s->rects[0]->pict.linesize[i] = s->rects[0]->linesize[i];
+    }
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+
     return 1;
 }
 
 #ifdef DEBUG
+#define ALPHA_MIX(A,BACK,FORE) (((255-(A)) * (BACK) + (A) * (FORE)) / 255)
 static void ppm_save(const char *filename, uint8_t *bitmap, int w, int h,
                      uint32_t *rgba_palette)
 {
-    int x, y, v;
+    int x, y, alpha;
+    uint32_t v;
+    int back[3] = {0, 255, 0};  /* green background */
     FILE *f;
 
     f = fopen(filename, "w");
     if (!f) {
         perror(filename);
-        exit(1);
+        return;
     }
     fprintf(f, "P6\n"
             "%d %d\n"
@@ -482,15 +533,32 @@ static void ppm_save(const char *filename, uint8_t *bitmap, int w, int h,
     for(y = 0; y < h; y++) {
         for(x = 0; x < w; x++) {
             v = rgba_palette[bitmap[y * w + x]];
-            putc((v >> 16) & 0xff, f);
-            putc((v >> 8) & 0xff, f);
-            putc((v >> 0) & 0xff, f);
+            alpha = v >> 24;
+            putc(ALPHA_MIX(alpha, back[0], (v >> 16) & 0xff), f);
+            putc(ALPHA_MIX(alpha, back[1], (v >> 8) & 0xff), f);
+            putc(ALPHA_MIX(alpha, back[2], (v >> 0) & 0xff), f);
         }
     }
     fclose(f);
 }
 #endif
 
+/* Append buf_size bytes to ctx->buf; if they do not fit, drop the cache and return an error. */
+static int append_to_cached_buf(AVCodecContext *avctx,
+                                const uint8_t *buf, int buf_size)
+{
+    DVDSubContext *ctx = avctx->priv_data;
+    /* Test against the remaining space: sizeof(ctx->buf) - buf_size wraps for an oversized buf_size. */
+    if (buf_size < 0 || buf_size >= sizeof(ctx->buf) - ctx->buf_size) {
+        av_log(avctx, AV_LOG_WARNING, "Attempt to reconstruct too large SPU packets aborted.\n");
+        ctx->buf_size = 0;
+        return AVERROR_INVALIDDATA;
+    }
+    memcpy(ctx->buf + ctx->buf_size, buf, buf_size);
+    ctx->buf_size += buf_size;
+    return 0;
+}
+
 static int dvdsub_decode(AVCodecContext *avctx,
                          void *data, int *data_size,
                          AVPacket *avpkt)
@@ -499,12 +567,29 @@ static int dvdsub_decode(AVCodecContext *avctx,
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     AVSubtitle *sub = data;
+    int appended = 0;
     int is_menu;
 
+    if (ctx->buf_size) {
+        int ret = append_to_cached_buf(avctx, buf, buf_size);
+        if (ret < 0) {
+            *data_size = 0;
+            return ret;
+        }
+        buf = ctx->buf;
+        buf_size = ctx->buf_size;
+        appended = 1;
+    }
+
     is_menu = decode_dvd_subtitles(ctx, sub, buf, buf_size);
+    if (is_menu == AVERROR(EAGAIN)) {
+        *data_size = 0;
+        return appended ? 0 : append_to_cached_buf(avctx, buf, buf_size);
+    }
 
     if (is_menu < 0) {
     no_subtitle:
+        reset_rects(sub);
         *data_size = 0;
 
         return buf_size;
@@ -512,61 +597,189 @@ static int dvdsub_decode(AVCodecContext *avctx,
     if (!is_menu && find_smallest_bounding_rectangle(sub) == 0)
         goto no_subtitle;
 
+    if (ctx->forced_subs_only && !(sub->rects[0]->flags & AV_SUBTITLE_FLAG_FORCED))
+        goto no_subtitle;
+
 #if defined(DEBUG)
+    {
+    char ppm_name[32];
+
+    snprintf(ppm_name, sizeof(ppm_name), "/tmp/%05d.ppm", ctx->sub_id++);
     ff_dlog(NULL, "start=%d ms end =%d ms\n",
             sub->start_display_time,
             sub->end_display_time);
-    ppm_save("/tmp/a.ppm", sub->rects[0]->data[0],
-             sub->rects[0]->w, sub->rects[0]->h, sub->rects[0]->data[1]);
+    ppm_save(ppm_name, sub->rects[0]->data[0],
+             sub->rects[0]->w, sub->rects[0]->h, (uint32_t*) sub->rects[0]->data[1]);
+    }
 #endif
 
+    ctx->buf_size = 0;
     *data_size = 1;
     return buf_size;
 }
 
-static av_cold int dvdsub_init(AVCodecContext *avctx)
+static void parse_palette(DVDSubContext *ctx, char *p)
 {
-    DVDSubContext *ctx = avctx->priv_data;
-    char *data, *cur;
+    int i;
+
+    ctx->has_palette = 1;
+    for(i=0;i<16;i++) {
+        ctx->palette[i] = strtoul(p, &p, 16);
+        while(*p == ',' || av_isspace(*p))
+            p++;
+    }
+}
+
+static int parse_ifo_palette(DVDSubContext *ctx, char *p)
+{
+    FILE *ifo;
+    char ifostr[12];
+    uint32_t sp_pgci, pgci, off_pgc, pgc;
+    uint8_t r, g, b, yuv[65], *buf;
+    int i, y, cb, cr, r_add, g_add, b_add;
     int ret = 0;
+    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
+
+    ctx->has_palette = 0;
+    if ((ifo = fopen(p, "r")) == NULL) {
+        av_log(ctx, AV_LOG_WARNING, "Unable to open IFO file \"%s\": %s\n", p, av_err2str(AVERROR(errno)));
+        return AVERROR_EOF;
+    }
+    if (fread(ifostr, 12, 1, ifo) != 1 || memcmp(ifostr, "DVDVIDEO-VTS", 12)) {
+        av_log(ctx, AV_LOG_WARNING, "\"%s\" is not a proper IFO file\n", p);
+        ret = AVERROR_INVALIDDATA;
+        goto end;
+    }
+    if (fseek(ifo, 0xCC, SEEK_SET) == -1) {
+        ret = AVERROR(errno);
+        goto end;
+    }
+    if (fread(&sp_pgci, 4, 1, ifo) == 1) {
+        pgci = av_be2ne32(sp_pgci) * 2048;
+        if (fseek(ifo, pgci + 0x0C, SEEK_SET) == -1) {
+            ret = AVERROR(errno);
+            goto end;
+        }
+        if (fread(&off_pgc, 4, 1, ifo) == 1) {
+            pgc = pgci + av_be2ne32(off_pgc);
+            if (fseek(ifo, pgc + 0xA4, SEEK_SET) == -1) {
+                ret = AVERROR(errno);
+                goto end;
+            }
+            if (fread(yuv, 64, 1, ifo) == 1) {
+                buf = yuv;
+                for(i=0; i<16; i++) {
+                    y  = *++buf;
+                    cr = *++buf;
+                    cb = *++buf;
+                    YUV_TO_RGB1_CCIR(cb, cr);
+                    YUV_TO_RGB2_CCIR(r, g, b, y);
+                    ctx->palette[i] = (r << 16) + (g << 8) + b;
+                    buf++;
+                }
+                ctx->has_palette = 1;
+            }
+        }
+    }
+    if (ctx->has_palette == 0) {
+        av_log(ctx, AV_LOG_WARNING, "Failed to read palette from IFO file \"%s\"\n", p);
+        ret = AVERROR_INVALIDDATA;
+    }
+end:
+    fclose(ifo);
+    return ret;
+}
+
+static int dvdsub_parse_extradata(AVCodecContext *avctx)
+{
+    DVDSubContext *ctx = (DVDSubContext*) avctx->priv_data;
+    char *dataorig, *data;
+    int ret = 1;
 
     if (!avctx->extradata || !avctx->extradata_size)
-        return 0;
+        return 1;
 
-    data = av_malloc(avctx->extradata_size + 1);
+    dataorig = data = av_malloc(avctx->extradata_size+1);
     if (!data)
         return AVERROR(ENOMEM);
     memcpy(data, avctx->extradata, avctx->extradata_size);
     data[avctx->extradata_size] = '\0';
-    cur = data;
-
-    while (*cur) {
-        if (strncmp("palette:", cur, 8) == 0) {
-            int i;
-            char *p = cur + 8;
-            ctx->has_palette = 1;
-            for (i = 0; i < 16; i++) {
-                ctx->palette[i] = strtoul(p, &p, 16);
-                while (*p == ',' || av_isspace(*p))
-                    p++;
-            }
-        } else if (!strncmp("size:", cur, 5)) {
+
+    for(;;) {
+        int pos = strcspn(data, "\n\r");
+        if (pos==0 && *data==0)
+            break;
+
+        if (strncmp("palette:", data, 8) == 0) {
+            parse_palette(ctx, data + 8);
+        } else if (strncmp("size:", data, 5) == 0) {
             int w, h;
-            if (sscanf(cur + 5, "%dx%d", &w, &h) == 2) {
+            if (sscanf(data + 5, "%dx%d", &w, &h) == 2) {
                ret = ff_set_dimensions(avctx, w, h);
                if (ret < 0)
                    goto fail;
             }
         }
-        cur += strcspn(cur, "\n\r");
-        cur += strspn(cur, "\n\r");
+
+        data += pos;
+        data += strspn(data, "\n\r");
     }
 
 fail:
-    av_free(data);
+    av_free(dataorig);
     return ret;
 }
 
+static av_cold int dvdsub_init(AVCodecContext *avctx)
+{
+    DVDSubContext *ctx = avctx->priv_data;
+    int ret;
+    /* Palette sources are applied in this order: extradata, ifo_str, palette_str. */
+    if ((ret = dvdsub_parse_extradata(avctx)) < 0)
+        return ret;
+    /* NOTE(review): parse_ifo_palette() clears has_palette first and its result is ignored here. */
+    if (ctx->ifo_str)
+        parse_ifo_palette(ctx, ctx->ifo_str);
+    if (ctx->palette_str)
+        parse_palette(ctx, ctx->palette_str);
+    if (ctx->has_palette) { /* dump the 16 palette entries at debug level */
+        int i;
+        av_log(avctx, AV_LOG_DEBUG, "palette:");
+        for(i=0;i<16;i++)
+            av_log(avctx, AV_LOG_DEBUG, " 0x%06x", ctx->palette[i]);
+        av_log(avctx, AV_LOG_DEBUG, "\n");
+    }
+
+    return 1;
+}
+
+static void dvdsub_flush(AVCodecContext *avctx)
+{
+    DVDSubContext *ctx = avctx->priv_data;
+    ctx->buf_size = 0; /* forget any data accumulated by append_to_cached_buf() */
+}
+
+static av_cold int dvdsub_close(AVCodecContext *avctx)
+{
+    dvdsub_flush(avctx); /* only resets the cached-buffer fill level */
+    return 0;
+}
+
+#define OFFSET(field) offsetof(DVDSubContext, field)
+#define SD AV_OPT_FLAG_SUBTITLE_PARAM | AV_OPT_FLAG_DECODING_PARAM
+static const AVOption options[] = {
+    { "palette", "set the global palette", OFFSET(palette_str), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, SD },
+    { "ifo_palette", "obtain the global palette from .IFO file", OFFSET(ifo_str), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, SD },
+    { "forced_subs_only", "Only show forced subtitles", OFFSET(forced_subs_only), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, SD},
+    { NULL }
+};
+static const AVClass dvdsub_class = {
+    .class_name = "dvdsubdec",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_dvdsub_decoder = {
     .name           = "dvdsub",
     .long_name      = NULL_IF_CONFIG_SMALL("DVD subtitles"),
@@ -575,4 +788,7 @@ AVCodec ff_dvdsub_decoder = {
     .priv_data_size = sizeof(DVDSubContext),
     .init           = dvdsub_init,
     .decode         = dvdsub_decode,
+    .flush          = dvdsub_flush,
+    .close          = dvdsub_close,
+    .priv_class     = &dvdsub_class,
 };
diff --git a/libavcodec/dvdsubenc.c b/libavcodec/dvdsubenc.c
index b0c2b63..29e0322 100644
--- a/libavcodec/dvdsubenc.c
+++ b/libavcodec/dvdsubenc.c
@@ -2,27 +2,35 @@
  * DVD subtitle encoding
  * Copyright (c) 2005 Wolfram Gloger
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include "avcodec.h"
 #include "bytestream.h"
+#include "internal.h"
+#include "libavutil/avassert.h"
+#include "libavutil/bprint.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/opt.h"
 
-#undef NDEBUG
-#include <assert.h>
+typedef struct { /* private data of the DVD subtitle encoder */
+    AVClass *class;
+    uint32_t global_palette[16]; /* 16 colors; dvdsub_init() copies default_palette here */
+    int even_rows_fix;           /* "even_rows_fix" option: pad odd heights with one empty row */
+} DVDSubtitleContext;
 
 // ncnt is the nibble counter
 #define PUTNIBBLE(val)\
@@ -53,7 +61,7 @@ static void dvd_encode_rle(uint8_t **pq,
                 if (bitmap[x+len] != color)
                     break;
             color = cmap[color];
-            assert(color < 4);
+            av_assert0(color < 4);
             if (len < 0x04) {
                 PUTNIBBLE((len << 2)|color);
             } else if (len < 0x10) {
@@ -86,32 +94,195 @@ static void dvd_encode_rle(uint8_t **pq,
     *pq = q;
 }
 
-static int encode_dvd_subtitles(uint8_t *outbuf, int outbuf_size,
+static int color_distance(uint32_t a, uint32_t b)
+{
+    int r = 0, d, i;
+    int alpha_a = 8, alpha_b = 8; /* fixed weight used for the top byte (i == 24) */
+    /* Sum of squared differences of the four weighted bytes, top byte first. */
+    for (i = 24; i >= 0; i -= 8) {
+        d = alpha_a * (int)((a >> i) & 0xFF) -
+            alpha_b * (int)((b >> i) & 0xFF);
+        r += d * d;
+        alpha_a = a >> 28; /* the three lower bytes are weighted by the */
+        alpha_b = b >> 28; /* top nibble of their own color value       */
+    }
+    return r;
+}
+
+/**
+ * Add the pixels of rectangle r to hits[]: index 0 for transparent colors,
+ * 1 + j (semi-transparent) or 17 + j (opaque) for nearest global entry j.
+ */
+static void count_colors(AVCodecContext *avctx, unsigned hits[33],
+                         const AVSubtitleRect *r)
+{
+    DVDSubtitleContext *dvdc = avctx->priv_data;
+    unsigned count[256] = { 0 }; /* number of pixels per palette index */
+    uint32_t *palette = (uint32_t *)r->data[1];
+    uint32_t color;
+    int x, y, i, j, match, d, best_d, av_uninit(best_j);
+    uint8_t *p = r->data[0];
+
+    for (y = 0; y < r->h; y++) {
+        for (x = 0; x < r->w; x++)
+            count[*(p++)]++;
+        p += r->linesize[0] - r->w; /* skip the rest of the row */
+    }
+    for (i = 0; i < 256; i++) {
+        if (!count[i]) /* avoid useless search */
+            continue;
+        color = palette[i];
+        /* 0: transparent, 1-16: semi-transparent, 17-32: opaque */
+        match = color < 0x33000000 ? 0 : color < 0xCC000000 ? 1 : 17;
+        if (match) {
+            best_d = INT_MAX;
+            for (j = 0; j < 16; j++) { /* top byte forced to 0xFF on both sides */
+                d = color_distance(0xFF000000 | color,
+                                   0xFF000000 | dvdc->global_palette[j]);
+                if (d < best_d) {
+                    best_d = d;
+                    best_j = j;
+                }
+            }
+            match += best_j;
+        }
+        hits[match] += count[i];
+    }
+}
+
+static void select_palette(AVCodecContext *avctx, int out_palette[4],
+                           int out_alpha[4], unsigned hits[33])
+{
+    DVDSubtitleContext *dvdc = avctx->priv_data;
+    int i, j, bright, mult;
+    uint32_t color;
+    int selected[4] = { 0 };
+    uint32_t pseudopal[33] = { 0 }; /* indexed like hits[]; entry 0 stays 0 */
+    uint32_t refcolor[3] = { 0x00000000, 0xFFFFFFFF, 0xFF000000 }; /* targets for slots 0, 1, 2 */
+    /* NOTE: hits[] is modified in place (weighted, then zeroed as entries are picked). */
+    /* Bonus for transparent: if the rectangle fits tightly the text, the
+       background color can be quite rare, but it would be ugly without it */
+    hits[0] *= 16;
+    /* Bonus for bright colors */
+    for (i = 0; i < 16; i++) {
+        if (!(hits[1 + i] + hits[17 + i]))
+            continue; /* skip unused colors to gain time */
+        color = dvdc->global_palette[i];
+        bright = 0;
+        for (j = 0; j < 3; j++, color >>= 8) /* count low bytes that are < 0x40 or >= 0xC0 */
+            bright += (color & 0xFF) < 0x40 || (color & 0xFF) >= 0xC0;
+        mult = 2 + FFMIN(bright, 2); /* 2, 3 or 4 */
+        hits[ 1 + i] *= mult;
+        hits[17 + i] *= mult;
+    }
+
+    /* Select four most frequent colors */
+    for (i = 0; i < 4; i++) {
+        for (j = 0; j < 33; j++)
+            if (hits[j] > hits[selected[i]])
+                selected[i] = j;
+        hits[selected[i]] = 0; /* so that the next pass picks another entry */
+    }
+
+    /* Order the colors like in most DVDs:
+       0: background, 1: foreground, 2: outline */
+    for (i = 0; i < 16; i++) {
+        pseudopal[ 1 + i] = 0x80000000 | dvdc->global_palette[i];
+        pseudopal[17 + i] = 0xFF000000 | dvdc->global_palette[i];
+    }
+    for (i = 0; i < 3; i++) { /* move the entry closest to refcolor[i] into slot i */
+        int best_d = color_distance(refcolor[i], pseudopal[selected[i]]);
+        for (j = i + 1; j < 4; j++) {
+            int d = color_distance(refcolor[i], pseudopal[selected[j]]);
+            if (d < best_d) {
+                FFSWAP(int, selected[i], selected[j]);
+                best_d = d;
+            }
+        }
+    }
+
+    /* Output: global palette index and alpha (0, 0x80 or 0xFF) per slot */
+    for (i = 0; i < 4; i++) {
+        out_palette[i] = selected[i] ? (selected[i] - 1) & 0xF : 0;
+        out_alpha  [i] = !selected[i] ? 0 : selected[i] < 17 ? 0x80 : 0xFF;
+    }
+}
+
+static void build_color_map(AVCodecContext *avctx, int cmap[],
+                            const uint32_t palette[],
+                            const int out_palette[], unsigned int const out_alpha[])
+{
+    DVDSubtitleContext *dvdc = avctx->priv_data;
+    int i, j, d, best_d;
+    uint32_t pseudopal[4]; /* the four output colors: out_alpha << 24 | global palette entry */
+    /* NOTE(review): encode_dvd_subtitles() passes an int[4] as out_alpha (signedness differs). */
+    for (i = 0; i < 4; i++)
+        pseudopal[i] = (out_alpha[i] << 24) |
+                       dvdc->global_palette[out_palette[i]];
+    for (i = 0; i < 256; i++) { /* map every input palette index to the closest output color */
+        best_d = INT_MAX;
+        for (j = 0; j < 4; j++) {
+            d = color_distance(pseudopal[j], palette[i]);
+            if (d < best_d) {
+                cmap[i] = j;
+                best_d = d;
+            }
+        }
+    }
+}
+
+static void copy_rectangle(AVSubtitleRect *dst, AVSubtitleRect *src, int cmap[])
+{
+    int x, y;
+    uint8_t *p, *q;
+    /* Write src's pixels, remapped through cmap[], into dst at src's offset; no bounds check here. */
+    p = src->data[0];
+    q = dst->data[0] + (src->x - dst->x) +
+                            (src->y - dst->y) * dst->linesize[0];
+    for (y = 0; y < src->h; y++) {
+        for (x = 0; x < src->w; x++)
+            *(q++) = cmap[*(p++)];
+        p += src->linesize[0] - src->w; /* start of the next source row */
+        q += dst->linesize[0] - src->w; /* same column, next destination row */
+    }
+}
+
+static int encode_dvd_subtitles(AVCodecContext *avctx,
+                                uint8_t *outbuf, int outbuf_size,
                                 const AVSubtitle *h)
 {
+    DVDSubtitleContext *dvdc = avctx->priv_data;
     uint8_t *q, *qq;
-    int object_id;
-    int offset1[20], offset2[20];
-    int i, imax, color, alpha, rects = h->num_rects;
-    unsigned long hmax;
-    unsigned long hist[256];
-    int           cmap[256];
+    int offset1, offset2;
+    int i, rects = h->num_rects, ret;
+    unsigned global_palette_hits[33] = { 0 };
+    int cmap[256];
+    int out_palette[4];
+    int out_alpha[4];
+    AVSubtitleRect vrect;
+    uint8_t *vrect_data = NULL;
+    int x2, y2;
+    int forced = 0;
 
     if (rects == 0 || !h->rects)
-        return -1;
-    if (rects > 20)
-        rects = 20;
-
-    // analyze bitmaps, compress to 4 colors
-    for (i=0; i<256; ++i) {
-        hist[i] = 0;
-        cmap[i] = 0;
-    }
-    for (object_id = 0; object_id < rects; object_id++) {
+        return AVERROR(EINVAL);
+    for (i = 0; i < rects; i++)
+        if (h->rects[i]->type != SUBTITLE_BITMAP) {
+            av_log(avctx, AV_LOG_ERROR, "Bitmap subtitle required\n");
+            return AVERROR(EINVAL);
+        }
+    /* Mark this subtitle forced if any of the rectangles is forced. */
+    for (i = 0; i < rects; i++)
+        if ((h->rects[i]->flags & AV_SUBTITLE_FLAG_FORCED) != 0) {
+            forced = 1;
+            break;
+        }
+
 #if FF_API_AVPICTURE
 FF_DISABLE_DEPRECATION_WARNINGS
-        if (!h->rects[object_id]->data[0]) {
-            AVSubtitleRect *rect = h->rects[object_id];
+    for (i = 0; i < rects; i++)
+        if (!h->rects[i]->data[0]) {
+            AVSubtitleRect *rect = h->rects[i];
             int j;
             for (j = 0; j < 4; j++) {
                 rect->data[j] = rect->pict.data[j];
@@ -121,51 +292,82 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
-        for (i=0; i<h->rects[object_id]->w*h->rects[object_id]->h; ++i) {
-            color = h->rects[object_id]->data[0][i];
-            // only count non-transparent pixels
-            alpha = ((uint32_t *)h->rects[object_id]->data[1])[color] >> 24;
-            hist[color] += alpha;
+    vrect = *h->rects[0];
+
+    if (rects > 1) {
+        /* DVD subtitles can have only one rectangle: build a virtual
+           rectangle containing all actual rectangles.
+           The data of the rectangles will be copied later, when the palette
+           is decided, because the rectangles may have different palettes. */
+        int xmin = h->rects[0]->x, xmax = xmin + h->rects[0]->w;
+        int ymin = h->rects[0]->y, ymax = ymin + h->rects[0]->h;
+        for (i = 1; i < rects; i++) {
+            xmin = FFMIN(xmin, h->rects[i]->x);
+            ymin = FFMIN(ymin, h->rects[i]->y);
+            xmax = FFMAX(xmax, h->rects[i]->x + h->rects[i]->w);
+            ymax = FFMAX(ymax, h->rects[i]->y + h->rects[i]->h);
         }
+        vrect.x = xmin;
+        vrect.y = ymin;
+        vrect.w = xmax - xmin;
+        vrect.h = ymax - ymin;
+        if ((ret = av_image_check_size(vrect.w, vrect.h, 0, avctx)) < 0)
+            return ret;
+
+        /* Count pixels outside the virtual rectangle as transparent */
+        global_palette_hits[0] = vrect.w * vrect.h;
+        for (i = 0; i < rects; i++)
+            global_palette_hits[0] -= h->rects[i]->w * h->rects[i]->h;
     }
-    for (color=3;; --color) {
-        hmax = 0;
-        imax = 0;
-        for (i=0; i<256; ++i)
-            if (hist[i] > hmax) {
-                imax = i;
-                hmax = hist[i];
-            }
-        if (hmax == 0)
-            break;
-        if (color == 0)
-            color = 3;
-        av_log(NULL, AV_LOG_DEBUG, "dvd_subtitle hist[%d]=%ld -> col %d\n",
-               imax, hist[imax], color);
-        cmap[imax] = color;
-        hist[imax] = 0;
+
+    for (i = 0; i < rects; i++)
+        count_colors(avctx, global_palette_hits, h->rects[i]);
+    select_palette(avctx, out_palette, out_alpha, global_palette_hits);
+
+    if (rects > 1) {
+        if (!(vrect_data = av_calloc(vrect.w, vrect.h)))
+            return AVERROR(ENOMEM);
+        vrect.data    [0] = vrect_data;
+        vrect.linesize[0] = vrect.w;
+        for (i = 0; i < rects; i++) {
+            build_color_map(avctx, cmap, (uint32_t *)h->rects[i]->data[1],
+                            out_palette, out_alpha);
+            copy_rectangle(&vrect, h->rects[i], cmap);
+        }
+        for (i = 0; i < 4; i++)
+            cmap[i] = i;
+    } else {
+        build_color_map(avctx, cmap, (uint32_t *)h->rects[0]->data[1],
+                        out_palette, out_alpha);
     }
 
+    av_log(avctx, AV_LOG_DEBUG, "Selected palette:");
+    for (i = 0; i < 4; i++)
+        av_log(avctx, AV_LOG_DEBUG, " 0x%06x@@%02x (0x%x,0x%x)",
+               dvdc->global_palette[out_palette[i]], out_alpha[i],
+               out_palette[i], out_alpha[i] >> 4);
+    av_log(avctx, AV_LOG_DEBUG, "\n");
 
     // encode data block
     q = outbuf + 4;
-    for (object_id = 0; object_id < rects; object_id++) {
-        offset1[object_id] = q - outbuf;
-        // worst case memory requirement: 1 nibble per pixel..
-        if ((q - outbuf) + h->rects[object_id]->w*h->rects[object_id]->h/2
-            + 17*rects + 21 > outbuf_size) {
-            av_log(NULL, AV_LOG_ERROR, "dvd_subtitle too big\n");
-            return -1;
-        }
-        dvd_encode_rle(&q, h->rects[object_id]->data[0],
-                       h->rects[object_id]->w*2,
-                       h->rects[object_id]->w, h->rects[object_id]->h >> 1,
-                       cmap);
-        offset2[object_id] = q - outbuf;
-        dvd_encode_rle(&q, h->rects[object_id]->data[0] + h->rects[object_id]->w,
-                       h->rects[object_id]->w*2,
-                       h->rects[object_id]->w, h->rects[object_id]->h >> 1,
-                       cmap);
+    offset1 = q - outbuf;
+    // worst case memory requirement: 1 nibble per pixel..
+    if ((q - outbuf) + vrect.w * vrect.h / 2 + 17 + 21 > outbuf_size) {
+        av_log(NULL, AV_LOG_ERROR, "dvd_subtitle too big\n");
+        ret = AVERROR_BUFFER_TOO_SMALL;
+        goto fail;
+    }
+    dvd_encode_rle(&q, vrect.data[0], vrect.w * 2,
+                   vrect.w, (vrect.h + 1) >> 1, cmap);
+    offset2 = q - outbuf;
+    dvd_encode_rle(&q, vrect.data[0] + vrect.w, vrect.w * 2,
+                   vrect.w, vrect.h >> 1, cmap);
+
+    if (dvdc->even_rows_fix && (vrect.h & 1)) {
+        // Work-around for some players that want the height to be even.
+        vrect.h++;
+        *q++ = 0x00; // 0x00 0x00 == empty row, i.e. fully transparent
+        *q++ = 0x00;
     }
 
     // set data packet size
@@ -174,35 +376,34 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
     // send start display command
     bytestream_put_be16(&q, (h->start_display_time*90) >> 10);
-    bytestream_put_be16(&q, (q - outbuf) /*- 2 */ + 8 + 12*rects + 2);
+    bytestream_put_be16(&q, (q - outbuf) /*- 2 */ + 8 + 12 + 2);
     *q++ = 0x03; // palette - 4 nibbles
-    *q++ = 0x03; *q++ = 0x7f;
+    *q++ = (out_palette[3] << 4) | out_palette[2];
+    *q++ = (out_palette[1] << 4) | out_palette[0];
     *q++ = 0x04; // alpha - 4 nibbles
-    *q++ = 0xf0; *q++ = 0x00;
-    //*q++ = 0x0f; *q++ = 0xff;
+    *q++ = (out_alpha[3] & 0xF0) | (out_alpha[2] >> 4);
+    *q++ = (out_alpha[1] & 0xF0) | (out_alpha[0] >> 4);
 
-    // XXX not sure if more than one rect can really be encoded..
     // 12 bytes per rect
-    for (object_id = 0; object_id < rects; object_id++) {
-        int x2 = h->rects[object_id]->x + h->rects[object_id]->w - 1;
-        int y2 = h->rects[object_id]->y + h->rects[object_id]->h - 1;
-
-        *q++ = 0x05;
-        // x1 x2 -> 6 nibbles
-        *q++ = h->rects[object_id]->x >> 4;
-        *q++ = (h->rects[object_id]->x << 4) | ((x2 >> 8) & 0xf);
-        *q++ = x2;
-        // y1 y2 -> 6 nibbles
-        *q++ = h->rects[object_id]->y >> 4;
-        *q++ = (h->rects[object_id]->y << 4) | ((y2 >> 8) & 0xf);
-        *q++ = y2;
-
-        *q++ = 0x06;
-        // offset1, offset2
-        bytestream_put_be16(&q, offset1[object_id]);
-        bytestream_put_be16(&q, offset2[object_id]);
-    }
-    *q++ = 0x01; // start command
+    x2 = vrect.x + vrect.w - 1;
+    y2 = vrect.y + vrect.h - 1;
+
+    *q++ = 0x05;
+    // x1 x2 -> 6 nibbles
+    *q++ = vrect.x >> 4;
+    *q++ = (vrect.x << 4) | ((x2 >> 8) & 0xf);
+    *q++ = x2;
+    // y1 y2 -> 6 nibbles
+    *q++ = vrect.y >> 4;
+    *q++ = (vrect.y << 4) | ((y2 >> 8) & 0xf);
+    *q++ = y2;
+
+    *q++ = 0x06;
+    // offset1, offset2
+    bytestream_put_be16(&q, offset1);
+    bytestream_put_be16(&q, offset2);
+
+    *q++ = forced ? 0x00 : 0x01; // start command
     *q++ = 0xff; // terminating command
 
     // send stop display command last
@@ -214,8 +415,42 @@ FF_ENABLE_DEPRECATION_WARNINGS
     qq = outbuf;
     bytestream_put_be16(&qq, q - outbuf);
 
-    av_log(NULL, AV_LOG_DEBUG, "subtitle_packet size=%td\n", q - outbuf);
-    return q - outbuf;
+    av_log(NULL, AV_LOG_DEBUG, "subtitle_packet size=%"PTRDIFF_SPECIFIER"\n", q - outbuf);
+    ret = q - outbuf;
+
+fail:
+    av_free(vrect_data);
+    return ret;
+}
+
+static int dvdsub_init(AVCodecContext *avctx)
+{
+    DVDSubtitleContext *dvdc = avctx->priv_data;
+    static const uint32_t default_palette[16] = {
+        0x000000, 0x0000FF, 0x00FF00, 0xFF0000,
+        0xFFFF00, 0xFF00FF, 0x00FFFF, 0xFFFFFF,
+        0x808000, 0x8080FF, 0x800080, 0x80FF80,
+        0x008080, 0xFF8080, 0x555555, 0xAAAAAA,
+    };
+    AVBPrint extradata;
+    int i, ret;
+
+    av_assert0(sizeof(dvdc->global_palette) == sizeof(default_palette));
+    memcpy(dvdc->global_palette, default_palette, sizeof(dvdc->global_palette));
+
+    av_bprint_init(&extradata, 0, 1);
+    if (avctx->width && avctx->height)
+        av_bprintf(&extradata, "size: %dx%d\n", avctx->width, avctx->height);
+    av_bprintf(&extradata, "palette:");
+    for (i = 0; i < 16; i++)
+        av_bprintf(&extradata, " %06"PRIx32"%c",
+                   dvdc->global_palette[i] & 0xFFFFFF, i < 15 ? ',' : '\n');
+
+    ret = avpriv_bprint_to_extradata(avctx, &extradata);
+    if (ret < 0)
+        return ret;
+
+    return 0;
 }
 
 static int dvdsub_encode(AVCodecContext *avctx,
@@ -225,14 +460,31 @@ static int dvdsub_encode(AVCodecContext *avctx,
     //DVDSubtitleContext *s = avctx->priv_data;
     int ret;
 
-    ret = encode_dvd_subtitles(buf, buf_size, sub);
+    ret = encode_dvd_subtitles(avctx, buf, buf_size, sub);
     return ret;
 }
 
+#define OFFSET(x) offsetof(DVDSubtitleContext, x)
+#define SE AV_OPT_FLAG_SUBTITLE_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption options[] = {
+    {"even_rows_fix", "Make number of rows even (workaround for some players)", OFFSET(even_rows_fix), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, SE},
+    { NULL },
+};
+
+static const AVClass dvdsubenc_class = {
+    .class_name = "VOBSUB subtitle encoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_dvdsub_encoder = {
     .name           = "dvdsub",
     .long_name      = NULL_IF_CONFIG_SMALL("DVD subtitles"),
     .type           = AVMEDIA_TYPE_SUBTITLE,
     .id             = AV_CODEC_ID_DVD_SUBTITLE,
+    .init           = dvdsub_init,
     .encode_sub     = dvdsub_encode,
+    .priv_class     = &dvdsubenc_class,
+    .priv_data_size = sizeof(DVDSubtitleContext),
 };
diff --git a/libavcodec/dvenc.c b/libavcodec/dvenc.c
index 85e27410..5de12cc 100644
--- a/libavcodec/dvenc.c
+++ b/libavcodec/dvenc.c
@@ -2,20 +2,20 @@
  * DV encoder
  * Copyright (c) 2003 Roman Shaposhnik
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,7 +47,7 @@ static av_cold int dvvideo_encode_init(AVCodecContext *avctx)
     PixblockDSPContext pdsp;
     int ret;
 
-    s->sys = av_dv_codec_profile(avctx->width, avctx->height, avctx->pix_fmt);
+    s->sys = av_dv_codec_profile2(avctx->width, avctx->height, avctx->pix_fmt, avctx->time_base);
     if (!s->sys) {
         av_log(avctx, AV_LOG_ERROR, "Found no DV profile for %ix%i %s video. "
                                     "Valid DV profiles are:\n",
@@ -55,6 +55,10 @@ static av_cold int dvvideo_encode_init(AVCodecContext *avctx)
         ff_dv_print_profiles(avctx, AV_LOG_ERROR);
         return AVERROR(EINVAL);
     }
+    if (avctx->height > 576) {
+        av_log(avctx, AV_LOG_ERROR, "DVCPRO HD encoding is not supported.\n");
+        return AVERROR_PATCHWELCOME;
+    }
     ret = ff_dv_init_dynamic_tables(s, s->sys);
     if (ret < 0) {
         av_log(avctx, AV_LOG_ERROR, "Error initializing work tables.\n");
@@ -63,6 +67,9 @@ static av_cold int dvvideo_encode_init(AVCodecContext *avctx)
 
     dv_vlc_map_tableinit();
 
+    memset(&fdsp,0, sizeof(fdsp));
+    memset(&mecc,0, sizeof(mecc));
+    memset(&pdsp,0, sizeof(pdsp));
     ff_fdctdsp_init(&fdsp, avctx);
     ff_me_cmp_init(&mecc, avctx);
     ff_pixblockdsp_init(&pdsp, avctx);
@@ -165,7 +172,7 @@ static av_always_inline PutBitContext *dv_encode_ac(EncBlockInfo *bi,
             if (bits_left) {
                 size -= bits_left;
                 put_bits(pb, bits_left, vlc >> size);
-                vlc = vlc & ((1 << size) - 1);
+                vlc = av_mod_uintp2(vlc, size);
             }
             if (pb + 1 >= pb_end) {
                 bi->partial_bit_count  = size;
@@ -221,14 +228,14 @@ static const int dv_weight_88[64] = {
     170627, 165371, 160727, 153560, 160727, 144651, 144651, 136258,
 };
 static const int dv_weight_248[64] = {
-    131072, 242189, 257107, 237536, 229376, 200636, 242189, 223754,
-    224969, 196781, 262144, 242189, 229376, 200636, 257107, 237536,
-    211916, 185364, 235923, 217965, 229376, 211916, 206433, 180568,
-    242189, 223754, 224969, 196781, 211916, 185364, 235923, 217965,
-    200704, 175557, 222935, 205965, 200636, 185364, 195068, 170627,
-    229376, 211916, 206433, 180568, 200704, 175557, 222935, 205965,
-    175557, 153560, 188995, 174609, 165371, 144651, 200636, 185364,
-    195068, 170627, 175557, 153560, 188995, 174609, 165371, 144651,
+    131072, 262144, 257107, 257107, 242189, 242189, 242189, 242189,
+    237536, 237536, 229376, 229376, 200636, 200636, 224973, 224973,
+    223754, 223754, 235923, 235923, 229376, 229376, 217965, 217965,
+    211916, 211916, 196781, 196781, 185364, 185364, 206433, 206433,
+    211916, 211916, 222935, 222935, 200636, 200636, 205964, 205964,
+    200704, 200704, 180568, 180568, 175557, 175557, 195068, 195068,
+    185364, 185364, 188995, 188995, 174606, 174606, 175557, 175557,
+    170627, 170627, 153560, 153560, 165371, 165371, 144651, 144651,
 };
 
 static av_always_inline int dv_init_enc_block(EncBlockInfo *bi, uint8_t *data,
@@ -243,7 +250,7 @@ static av_always_inline int dv_init_enc_block(EncBlockInfo *bi, uint8_t *data,
      * method suggested in SMPTE 314M Table 22, and an improved
      * method. The SMPTE method is very conservative; it assigns class
      * 3 (i.e. severe quantization) to any block where the largest AC
-     * component is greater than 36. Libav's DV encoder tracks AC bit
+     * component is greater than 36. FFmpeg's DV encoder tracks AC bit
      * consumption precisely, so there is no need to bias most blocks
      * towards strongly lossy compression. Instead, we assign class 2
      * to most blocks, and use class 3 only when strictly necessary
@@ -251,13 +258,13 @@ static av_always_inline int dv_init_enc_block(EncBlockInfo *bi, uint8_t *data,
 
 #if 0 /* SMPTE spec method */
     static const int classes[] = { 12, 24, 36, 0xffff };
-#else /* improved Libav method */
+#else /* improved FFmpeg method */
     static const int classes[] = { -1, -1, 255, 0xffff };
 #endif
     int max  = classes[0];
     int prev = 0;
 
-    assert((((int) blk) & 15) == 0);
+    av_assert2((((int) blk) & 15) == 0);
 
     bi->area_q[0]          =
     bi->area_q[1]          =
@@ -290,7 +297,7 @@ static av_always_inline int dv_init_enc_block(EncBlockInfo *bi, uint8_t *data,
 
             if (level + 15 > 30U) {
                 bi->sign[i] = (level >> 31) & 1;
-                /* Weight it and and shift down into range, adding for rounding.
+                /* Weight it and shift down into range, adding for rounding.
                  * The extra division by a factor of 2^4 reverses the 8x
                  * expansion of the DCT AND the 2x doubling of the weights. */
                 level     = (FFABS(level) * weight[i] + (1 << (dv_weight_bits + 3))) >>
@@ -359,7 +366,7 @@ static inline void dv_guess_qnos(EncBlockInfo *blks, int *qnos)
                         b->bit_size[a] = 1; // 4 areas 4 bits for EOB :)
                         b->area_q[a]++;
                         prev = b->prev[a];
-                        assert(b->next[prev] >= mb_area_start[a + 1] || b->mb[prev]);
+                        av_assert2(b->next[prev] >= mb_area_start[a + 1] || b->mb[prev]);
                         for (k = b->next[prev]; k < mb_area_start[a + 1]; k = b->next[k]) {
                             b->mb[k] >>= 1;
                             if (b->mb[k]) {
@@ -369,11 +376,11 @@ static inline void dv_guess_qnos(EncBlockInfo *blks, int *qnos)
                                 if (b->next[k] >= mb_area_start[a + 1] && b->next[k] < 64) {
                                     for (a2 = a + 1; b->next[k] >= mb_area_start[a2 + 1]; a2++)
                                         b->prev[a2] = prev;
-                                    assert(a2 < 4);
-                                    assert(b->mb[b->next[k]]);
+                                    av_assert2(a2 < 4);
+                                    av_assert2(b->mb[b->next[k]]);
                                     b->bit_size[a2] += dv_rl2vlc_size(b->next[k] - prev - 1, b->mb[b->next[k]]) -
                                                        dv_rl2vlc_size(b->next[k] - k    - 1, b->mb[b->next[k]]);
-                                    assert(b->prev[a2] == k && (a2 + 1 >= 4 || b->prev[a2 + 1] != k));
+                                    av_assert2(b->prev[a2] == k && (a2 + 1 >= 4 || b->prev[a2 + 1] != k));
                                     b->prev[a2] = prev;
                                 }
                                 b->next[prev] = b->next[k];
@@ -567,6 +574,7 @@ static inline int dv_write_pack(enum dv_pack_type pack_id, DVVideoContext *c,
      *      compression scheme (if any).
      */
     int apt = (c->sys->pix_fmt == AV_PIX_FMT_YUV420P ? 0 : 1);
+    int fs  = c->frame->top_field_first ? 0x00 : 0x40;
 
     uint8_t aspect = 0;
     if ((int) (av_q2d(c->avctx->sample_aspect_ratio) *
@@ -606,7 +614,7 @@ static inline int dv_write_pack(enum dv_pack_type pack_id, DVVideoContext *c,
         buf[2] = 0xc8 |        /* reserved -- always b11001xxx */
                  aspect;
         buf[3] = (1 << 7) |    /* frame/field flag 1 -- frame, 0 -- field */
-                 (1 << 6) |    /* first/second field flag 0 -- field 2, 1 -- field 1 */
+                 fs       |    /* first/second field flag 0 -- field 2, 1 -- field 1 */
                  (1 << 5) |    /* frame change flag 0 -- same picture as before, 1 -- different */
                  (1 << 4) |    /* 1 - interlaced, 0 - noninterlaced */
                  0xc;          /* reserved -- always b1100 */
@@ -709,10 +717,8 @@ static int dvvideo_encode_frame(AVCodecContext *c, AVPacket *pkt,
     DVVideoContext *s = c->priv_data;
     int ret;
 
-    if ((ret = ff_alloc_packet(pkt, s->sys->frame_size)) < 0) {
-        av_log(c, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(c, pkt, s->sys->frame_size, 0)) < 0)
         return ret;
-    }
 
     c->pix_fmt                = s->sys->pix_fmt;
     s->frame                  = frame;
@@ -745,7 +751,7 @@ AVCodec ff_dvvideo_encoder = {
     .priv_data_size = sizeof(DVVideoContext),
     .init           = dvvideo_encode_init,
     .encode2        = dvvideo_encode_frame,
-    .capabilities   = AV_CODEC_CAP_SLICE_THREADS,
+    .capabilities   = AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_YUV411P, AV_PIX_FMT_YUV422P,
         AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE
diff --git a/libavcodec/dxa.c b/libavcodec/dxa.c
index b804935..f6edc03 100644
--- a/libavcodec/dxa.c
+++ b/libavcodec/dxa.c
@@ -2,20 +2,20 @@
  * Feeble Files/ScummVM DXA decoder
  * Copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,6 +29,7 @@
 
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
+#include "bytestream.h"
 #include "avcodec.h"
 #include "internal.h"
 
@@ -41,6 +42,7 @@ typedef struct DxaDecContext {
     AVFrame *prev;
 
     int dsize;
+#define DECOMP_BUF_PADDING 16
     uint8_t *decomp_buf;
     uint32_t pal[256];
 } DxaDecContext;
@@ -49,13 +51,17 @@ static const int shift1[6] = { 0, 8, 8, 8, 4, 4 };
 static const int shift2[6] = { 0, 0, 8, 4, 0, 4 };
 
 static int decode_13(AVCodecContext *avctx, DxaDecContext *c, uint8_t* dst,
-                     int stride, uint8_t *src, uint8_t *ref)
+                     int stride, uint8_t *src, int srcsize, uint8_t *ref)
 {
     uint8_t *code, *data, *mv, *msk, *tmp, *tmp2;
+    uint8_t *src_end = src + srcsize;
     int i, j, k;
     int type, x, y, d, d2;
     uint32_t mask;
 
+    if (12ULL  + ((avctx->width * avctx->height) >> 4) + AV_RB32(src + 0) + AV_RB32(src + 4) > srcsize)
+        return AVERROR_INVALIDDATA;
+
     code = src  + 12;
     data = code + ((avctx->width * avctx->height) >> 4);
     mv   = data + AV_RB32(src + 0);
@@ -63,6 +69,8 @@ static int decode_13(AVCodecContext *avctx, DxaDecContext *c, uint8_t* dst,
 
     for(j = 0; j < avctx->height; j += 4){
         for(i = 0; i < avctx->width; i += 4){
+            if (data > src_end || mv > src_end || msk > src_end)
+                return AVERROR_INVALIDDATA;
             tmp  = dst + i;
             tmp2 = ref + i;
             type = *code++;
@@ -70,6 +78,11 @@ static int decode_13(AVCodecContext *avctx, DxaDecContext *c, uint8_t* dst,
             case 4: // motion compensation
                 x = (*mv) >> 4;    if(x & 8) x = 8 - x;
                 y = (*mv++) & 0xF; if(y & 8) y = 8 - y;
+                if (i < -x || avctx->width  - i - 4 < x ||
+                    j < -y || avctx->height - j - 4 < y) {
+                    av_log(avctx, AV_LOG_ERROR, "MV %d %d out of bounds\n", x,y);
+                    return AVERROR_INVALIDDATA;
+                }
                 tmp2 += x + y*stride;
             case 0: // skip
             case 5: // skip in method 12
@@ -127,6 +140,11 @@ static int decode_13(AVCodecContext *avctx, DxaDecContext *c, uint8_t* dst,
                     case 0x80: // motion compensation
                         x = (*mv) >> 4;    if(x & 8) x = 8 - x;
                         y = (*mv++) & 0xF; if(y & 8) y = 8 - y;
+                        if (i + 2*(k & 1) < -x || avctx->width  - i - 2*(k & 1) - 2 < x ||
+                            j +   (k & 2) < -y || avctx->height - j -   (k & 2) - 2 < y) {
+                            av_log(avctx, AV_LOG_ERROR, "MV %d %d out of bounds\n", x,y);
+                            return AVERROR_INVALIDDATA;
+                        }
                         tmp2 += x + y*stride;
                     case 0x00: // skip
                         tmp[d + 0         ] = tmp2[0];
@@ -192,35 +210,27 @@ static int decode_13(AVCodecContext *avctx, DxaDecContext *c, uint8_t* dst,
 static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPacket *avpkt)
 {
     AVFrame *frame = data;
-    const uint8_t *buf = avpkt->data;
-    int buf_size = avpkt->size;
     DxaDecContext * const c = avctx->priv_data;
     uint8_t *outptr, *srcptr, *tmpptr;
     unsigned long dsize;
     int i, j, compr, ret;
     int stride;
-    int orig_buf_size = buf_size;
     int pc = 0;
+    GetByteContext gb;
 
-    /* make the palette available on the way out */
-    if(buf[0]=='C' && buf[1]=='M' && buf[2]=='A' && buf[3]=='P'){
-        int r, g, b;
+    bytestream2_init(&gb, avpkt->data, avpkt->size);
 
-        buf += 4;
+    /* make the palette available on the way out */
+    if (bytestream2_peek_le32(&gb) == MKTAG('C','M','A','P')) {
+        bytestream2_skip(&gb, 4);
         for(i = 0; i < 256; i++){
-            r = *buf++;
-            g = *buf++;
-            b = *buf++;
-            c->pal[i] = (r << 16) | (g << 8) | b;
+            c->pal[i] = 0xFFU << 24 | bytestream2_get_be24(&gb);
         }
         pc = 1;
-        buf_size -= 768+4;
     }
 
-    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
         return ret;
-    }
     memcpy(frame->data[1], c->pal, AVPALETTE_SIZE);
     frame->palette_has_changed = pc;
 
@@ -229,16 +239,25 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     tmpptr = c->prev->data[0];
     stride = frame->linesize[0];
 
-    if(buf[0]=='N' && buf[1]=='U' && buf[2]=='L' && buf[3]=='L')
+    if (bytestream2_get_le32(&gb) == MKTAG('N','U','L','L'))
         compr = -1;
     else
-        compr = buf[4];
+        compr = bytestream2_get_byte(&gb);
 
     dsize = c->dsize;
-    if((compr != 4 && compr != -1) && uncompress(c->decomp_buf, &dsize, buf + 9, buf_size - 9) != Z_OK){
-        av_log(avctx, AV_LOG_ERROR, "Uncompress failed!\n");
-        return AVERROR_UNKNOWN;
+    if (compr != 4 && compr != -1) {
+        bytestream2_skip(&gb, 4);
+        if (uncompress(c->decomp_buf, &dsize, avpkt->data + bytestream2_tell(&gb),
+                       bytestream2_get_bytes_left(&gb)) != Z_OK) {
+            av_log(avctx, AV_LOG_ERROR, "Uncompress failed!\n");
+            return AVERROR_UNKNOWN;
+        }
+        memset(c->decomp_buf + dsize, 0, DECOMP_BUF_PADDING);
     }
+
+    if (avctx->debug & FF_DEBUG_PICT_INFO)
+        av_log(avctx, AV_LOG_DEBUG, "compr:%2d, dsize:%d\n", compr, (int)dsize);
+
     switch(compr){
     case -1:
         frame->key_frame = 0;
@@ -265,14 +284,18 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     case 5:
         if (!tmpptr) {
             av_log(avctx, AV_LOG_ERROR, "Missing reference frame.\n");
-            return AVERROR_INVALIDDATA;
+            if (!(avctx->flags2 & AV_CODEC_FLAG2_SHOW_ALL))
+                return AVERROR_INVALIDDATA;
         }
         frame->key_frame = 0;
         frame->pict_type = AV_PICTURE_TYPE_P;
         for (j = 0; j < avctx->height; j++) {
-            for (i = 0; i < avctx->width; i++)
-                outptr[i] = srcptr[i] ^ tmpptr[i];
-            tmpptr += stride;
+            if(tmpptr){
+                for(i = 0; i < avctx->width; i++)
+                    outptr[i] = srcptr[i] ^ tmpptr[i];
+                tmpptr += stride;
+            }else
+                memcpy(outptr, srcptr, avctx->width);
             outptr += stride;
             srcptr += avctx->width;
         }
@@ -281,10 +304,14 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     case 13:
         frame->key_frame = 0;
         frame->pict_type = AV_PICTURE_TYPE_P;
-        decode_13(avctx, c, frame->data[0], frame->linesize[0], srcptr, c->prev->data[0]);
+        if (!c->prev->data[0]) {
+            av_log(avctx, AV_LOG_ERROR, "Missing reference frame\n");
+            return AVERROR_INVALIDDATA;
+        }
+        decode_13(avctx, c, frame->data[0], frame->linesize[0], srcptr, dsize, c->prev->data[0]);
         break;
     default:
-        av_log(avctx, AV_LOG_ERROR, "Unknown/unsupported compression type %d\n", buf[4]);
+        av_log(avctx, AV_LOG_ERROR, "Unknown/unsupported compression type %d\n", compr);
         return AVERROR_INVALIDDATA;
     }
 
@@ -295,13 +322,18 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     *got_frame = 1;
 
     /* always report that the buffer was completely consumed */
-    return orig_buf_size;
+    return avpkt->size;
 }
 
 static av_cold int decode_init(AVCodecContext *avctx)
 {
     DxaDecContext * const c = avctx->priv_data;
 
+    if (avctx->width%4 || avctx->height%4) {
+        avpriv_request_sample(avctx, "dimensions are not a multiple of 4");
+        return AVERROR_INVALIDDATA;
+    }
+
     c->prev = av_frame_alloc();
     if (!c->prev)
         return AVERROR(ENOMEM);
@@ -309,7 +341,9 @@ static av_cold int decode_init(AVCodecContext *avctx)
     avctx->pix_fmt = AV_PIX_FMT_PAL8;
 
     c->dsize = avctx->width * avctx->height * 2;
-    if (!(c->decomp_buf = av_malloc(c->dsize))) {
+    c->decomp_buf = av_malloc(c->dsize + DECOMP_BUF_PADDING);
+    if (!c->decomp_buf) {
+        av_frame_free(&c->prev);
         av_log(avctx, AV_LOG_ERROR, "Can't allocate decompression buffer.\n");
         return AVERROR(ENOMEM);
     }
diff --git a/libavcodec/dxtory.c b/libavcodec/dxtory.c
index 01726b9..fc19369 100644
--- a/libavcodec/dxtory.c
+++ b/libavcodec/dxtory.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2011 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,7 +39,7 @@ static int dxtory_decode_v1_rgb(AVCodecContext *avctx, AVFrame *pic,
     uint8_t *dst;
     int ret;
 
-    if (src_size < avctx->width * avctx->height * bpp) {
+    if (src_size < avctx->width * avctx->height * (int64_t)bpp) {
         av_log(avctx, AV_LOG_ERROR, "packet too small\n");
         return AVERROR_INVALIDDATA;
     }
@@ -65,7 +65,7 @@ static int dxtory_decode_v1_410(AVCodecContext *avctx, AVFrame *pic,
     uint8_t *Y1, *Y2, *Y3, *Y4, *U, *V;
     int ret;
 
-    if (src_size < avctx->width * avctx->height * 18 / 16) {
+    if (src_size < FFALIGN(avctx->width, 4) * FFALIGN(avctx->height, 4) * 9LL / 8) {
         av_log(avctx, AV_LOG_ERROR, "packet too small\n");
         return AVERROR_INVALIDDATA;
     }
@@ -82,10 +82,10 @@ static int dxtory_decode_v1_410(AVCodecContext *avctx, AVFrame *pic,
     V  = pic->data[2];
     for (h = 0; h < avctx->height; h += 4) {
         for (w = 0; w < avctx->width; w += 4) {
-            AV_COPY32(Y1 + w, src);
-            AV_COPY32(Y2 + w, src + 4);
-            AV_COPY32(Y3 + w, src + 8);
-            AV_COPY32(Y4 + w, src + 12);
+            AV_COPY32U(Y1 + w, src);
+            AV_COPY32U(Y2 + w, src + 4);
+            AV_COPY32U(Y3 + w, src + 8);
+            AV_COPY32U(Y4 + w, src + 12);
             U[w >> 2] = src[16] + 0x80;
             V[w >> 2] = src[17] + 0x80;
             src += 18;
@@ -108,7 +108,7 @@ static int dxtory_decode_v1_420(AVCodecContext *avctx, AVFrame *pic,
     uint8_t *Y1, *Y2, *U, *V;
     int ret;
 
-    if (src_size < avctx->width * avctx->height * 3 / 2) {
+    if (src_size < FFALIGN(avctx->width, 2) * FFALIGN(avctx->height, 2) * 3LL / 2) {
         av_log(avctx, AV_LOG_ERROR, "packet too small\n");
         return AVERROR_INVALIDDATA;
     }
@@ -145,7 +145,7 @@ static int dxtory_decode_v1_444(AVCodecContext *avctx, AVFrame *pic,
     uint8_t *Y, *U, *V;
     int ret;
 
-    if (src_size < avctx->width * avctx->height * 3) {
+    if (src_size < avctx->width * avctx->height * 3LL) {
         av_log(avctx, AV_LOG_ERROR, "packet too small\n");
         return AVERROR_INVALIDDATA;
     }
@@ -295,7 +295,8 @@ static int dxtory_decode_v2(AVCodecContext *avctx, AVFrame *pic,
         if (ret < 0)
             return ret;
 
-        init_get_bits(&gb2, src + off + 16, (slice_size - 16) * 8);
+        if ((ret = init_get_bits8(&gb2, src + off + 16, slice_size - 16)) < 0)
+            return ret;
 
         line += decode_slice(&gb2, pic, line, avctx->height - line, lru);
 
diff --git a/libavcodec/dxv.c b/libavcodec/dxv.c
index 32137f5..05a9aad 100644
--- a/libavcodec/dxv.c
+++ b/libavcodec/dxv.c
@@ -2,20 +2,20 @@
  * Resolume DXV decoder
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -105,9 +105,17 @@ static int decompress_texture_thread(AVCodecContext *avctx, void *arg,
             break;                                                            \
         case 2:                                                               \
             idx = (bytestream2_get_byte(gbc) + 2) * x;                        \
+            if (idx > pos) {                                                  \
+                av_log(avctx, AV_LOG_ERROR, "idx %d > %d\n", idx, pos);       \
+                return AVERROR_INVALIDDATA;                                   \
+            }                                                                 \
             break;                                                            \
         case 3:                                                               \
             idx = (bytestream2_get_le16(gbc) + 0x102) * x;                    \
+            if (idx > pos) {                                                  \
+                av_log(avctx, AV_LOG_ERROR, "idx %d > %d\n", idx, pos);       \
+                return AVERROR_INVALIDDATA;                                   \
+            }                                                                 \
             break;                                                            \
         }                                                                     \
     } while(0)
@@ -252,6 +260,10 @@ static int dxv_decompress_dxt5(AVCodecContext *avctx)
             case 2:
                 /* Copy two dwords from a previous index */
                 idx = 8 + bytestream2_get_le16(gbc);
+                if (idx > pos) {
+                    av_log(avctx, AV_LOG_ERROR, "idx %d > %d\n", idx, pos);
+                    return AVERROR_INVALIDDATA;
+                }
                 prev = AV_RL32(ctx->tex_data + 4 * (pos - idx));
                 AV_WL32(ctx->tex_data + 4 * pos, prev);
                 pos++;
@@ -432,7 +444,6 @@ static int dxv_decode(AVCodecContext *avctx, void *data,
     ret = ff_thread_get_buffer(avctx, &tframe, 0);
     if (ret < 0)
         return ret;
-    ff_thread_finish_setup(avctx);
 
     /* Now decompress the texture with the standard functions. */
     avctx->execute2(avctx, decompress_texture_thread,
diff --git a/libavcodec/dxva2.c b/libavcodec/dxva2.c
index 8b0e686..f68df86 100644
--- a/libavcodec/dxva2.c
+++ b/libavcodec/dxva2.c
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2010 Laurent Aimar
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -58,7 +58,7 @@ int ff_dxva2_commit_buffer(AVCodecContext *avctx,
     void     *dxva_data;
     unsigned dxva_size;
     int      result;
-    HRESULT hr;
+    HRESULT hr = 0;
 
 #if CONFIG_D3D11VA
     if (avctx->pix_fmt == AV_PIX_FMT_D3D11VA_VLD)
@@ -137,7 +137,7 @@ int ff_dxva2_common_end_frame(AVCodecContext *avctx, AVFrame *frame,
 #if CONFIG_DXVA2
     DXVA2_DecodeBufferDesc          buffer2[4];
 #endif
-    DECODER_BUFFER_DESC             *buffer,*buffer_slice;
+    DECODER_BUFFER_DESC             *buffer = NULL, *buffer_slice = NULL;
     int result, runs = 0;
     HRESULT hr;
     unsigned type;
@@ -158,9 +158,15 @@ int ff_dxva2_common_end_frame(AVCodecContext *avctx, AVFrame *frame,
                                                  ff_dxva2_get_surface(frame),
                                                  NULL);
 #endif
-        if (hr == E_PENDING)
-            av_usleep(2000);
-    } while (hr == E_PENDING && ++runs < 50);
+        if (hr != E_PENDING || ++runs > 50)
+            break;
+#if CONFIG_D3D11VA
+        if (avctx->pix_fmt == AV_PIX_FMT_D3D11VA_VLD)
+            if (D3D11VA_CONTEXT(ctx)->context_mutex != INVALID_HANDLE_VALUE)
+                ReleaseMutex(D3D11VA_CONTEXT(ctx)->context_mutex);
+#endif
+        av_usleep(2000);
+    } while(1);
 
     if (FAILED(hr)) {
         av_log(avctx, AV_LOG_ERROR, "Failed to begin frame: 0x%lx\n", hr);
diff --git a/libavcodec/dxva2.h b/libavcodec/dxva2.h
index ec448a4..22c9399 100644
--- a/libavcodec/dxva2.h
+++ b/libavcodec/dxva2.h
@@ -3,25 +3,25 @@
  *
  * copyright (c) 2009 Laurent Aimar
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVCODEC_DXVA_H
-#define AVCODEC_DXVA_H
+#ifndef AVCODEC_DXVA2_H
+#define AVCODEC_DXVA2_H
 
 /**
  * @file
@@ -50,7 +50,7 @@
 
 /**
  * This structure is used to provides the necessary configurations and data
- * to the DXVA2 Libav HWAccel implementation.
+ * to the DXVA2 FFmpeg HWAccel implementation.
  *
  * The application must make it available as AVCodecContext.hwaccel_context.
  */
@@ -81,7 +81,7 @@ struct dxva_context {
     uint64_t workaround;
 
     /**
-     * Private to the Libav AVHWAccel implementation
+     * Private to the FFmpeg AVHWAccel implementation
      */
     unsigned report_id;
 };
@@ -90,4 +90,4 @@ struct dxva_context {
  * @}
  */
 
-#endif /* AVCODEC_DXVA_H */
+#endif /* AVCODEC_DXVA2_H */
diff --git a/libavcodec/dxva2_h264.c b/libavcodec/dxva2_h264.c
index e2e987d..84a8e6c 100644
--- a/libavcodec/dxva2_h264.c
+++ b/libavcodec/dxva2_h264.c
@@ -3,23 +3,25 @@
  *
  * copyright (c) 2009 Laurent Aimar
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "libavutil/avassert.h"
+
 #include "h264.h"
 #include "h264data.h"
 #include "mpegutils.h"
@@ -102,7 +104,7 @@ static void fill_picture_parameters(const AVCodecContext *avctx, AVDXVAContext *
                                         ((sps->mb_aff &&
                                         (h->picture_structure == PICT_FRAME)) <<  1) |
                                         (sps->residual_color_transform_flag   <<  2) |
-                                        /* sp_for_switch_flag (not implemented by Libav) */
+                                        /* sp_for_switch_flag (not implemented by FFmpeg) */
                                         (0                                    <<  3) |
                                         (sps->chroma_format_idc               <<  4) |
                                         ((h->nal_ref_idc != 0)                <<  6) |
@@ -158,15 +160,14 @@ static void fill_picture_parameters(const AVCodecContext *avctx, AVDXVAContext *
     pp->deblocking_filter_control_present_flag = pps->deblocking_filter_parameters_present;
     pp->redundant_pic_cnt_present_flag= pps->redundant_pic_cnt_present;
     pp->Reserved8BitsB                = 0;
-    pp->slice_group_change_rate_minus1= 0;  /* XXX not implemented by Libav */
-    //pp->SliceGroupMap[810];               /* XXX not implemented by Libav */
+    pp->slice_group_change_rate_minus1= 0;  /* XXX not implemented by FFmpeg */
+    //pp->SliceGroupMap[810];               /* XXX not implemented by FFmpeg */
 }
 
 static void fill_scaling_lists(const AVCodecContext *avctx, AVDXVAContext *ctx, const H264Context *h, DXVA_Qmatrix_H264 *qm)
 {
-    unsigned i, j;
-    const SPS *sps = h->ps.sps;
     const PPS *pps = h->ps.pps;
+    unsigned i, j;
     memset(qm, 0, sizeof(*qm));
     if (DXVA_CONTEXT_WORKAROUND(avctx, ctx) & FF_DXVA2_WORKAROUND_SCALING_LIST_ZIGZAG) {
         for (i = 0; i < 6; i++)
@@ -230,7 +231,7 @@ static void fill_slice_long(AVCodecContext *avctx, DXVA_Slice_H264_Long *slice,
 
     slice->first_mb_in_slice     = (sl->mb_y >> FIELD_OR_MBAFF_PICTURE(h)) * h->mb_width + sl->mb_x;
     slice->NumMbsForSlice        = 0; /* XXX it is set once we have all slices */
-    slice->BitOffsetToSliceData  = get_bits_count(&sl->gb);
+    slice->BitOffsetToSliceData  = get_bits_count(&sl->gb) - 8;
     slice->slice_type            = ff_h264_get_slice_type(sl);
     if (sl->slice_type_fixed)
         slice->slice_type += 5;
@@ -256,7 +257,7 @@ static void fill_slice_long(AVCodecContext *avctx, DXVA_Slice_H264_Long *slice,
                 else
                     index = get_refpic_index(pp, ff_dxva2_get_surface_index(avctx, ctx, r->f));
                 fill_picture_entry(&slice->RefPicList[list][i], index,
-                                   r->reference == PICT_BOTTOM_FIELD);
+                                   sl->ref_list[list][i].reference == PICT_BOTTOM_FIELD);
                 for (plane = 0; plane < 3; plane++) {
                     int w, o;
                     if (plane == 0 && sl->pwt.luma_weight_flag[list]) {
@@ -283,7 +284,7 @@ static void fill_slice_long(AVCodecContext *avctx, DXVA_Slice_H264_Long *slice,
             }
         }
     }
-    slice->slice_qs_delta    = 0; /* XXX not implemented by Libav */
+    slice->slice_qs_delta    = 0; /* XXX not implemented by FFmpeg */
     slice->slice_qp_delta    = sl->qscale - h->ps.pps->init_qp;
     slice->redundant_pic_cnt = sl->redundant_pic_count;
     if (sl->slice_type == AV_PICTURE_TYPE_B)
@@ -306,9 +307,9 @@ static int commit_bitstream_and_slice_buffer(AVCodecContext *avctx,
     const H264Picture *current_picture = h->cur_pic_ptr;
     struct dxva2_picture_context *ctx_pic = current_picture->hwaccel_picture_private;
     DXVA_Slice_H264_Short *slice = NULL;
-    void     *dxva_data_ptr;
+    void     *dxva_data_ptr = NULL;
     uint8_t  *dxva_data, *current, *end;
-    unsigned dxva_size;
+    unsigned dxva_size = 0;
     void     *slice_data;
     unsigned slice_size;
     unsigned padding;
@@ -409,6 +410,8 @@ static int commit_bitstream_and_slice_buffer(AVCodecContext *avctx,
         dsc11->NumMBsInBuffer       = mb_count;
 
         type = D3D11_VIDEO_DECODER_BUFFER_SLICE_CONTROL;
+
+        av_assert0((dsc11->DataSize & 127) == 0);
     }
 #endif
 #if CONFIG_DXVA2
@@ -420,6 +423,8 @@ static int commit_bitstream_and_slice_buffer(AVCodecContext *avctx,
         dsc2->NumMBsInBuffer       = mb_count;
 
         type = DXVA2_SliceControlBufferType;
+
+        av_assert0((dsc2->DataSize & 127) == 0);
     }
 #endif
 
@@ -430,7 +435,6 @@ static int commit_bitstream_and_slice_buffer(AVCodecContext *avctx,
         slice_data = ctx_pic->slice_long;
         slice_size = ctx_pic->slice_count * sizeof(*ctx_pic->slice_long);
     }
-    assert((bs->DataSize & 127) == 0);
     return ff_dxva2_commit_buffer(avctx, ctx, sc,
                                   type,
                                   slice_data, slice_size, mb_count);
diff --git a/libavcodec/dxva2_hevc.c b/libavcodec/dxva2_hevc.c
index 5bb10d6..5a312ea 100644
--- a/libavcodec/dxva2_hevc.c
+++ b/libavcodec/dxva2_hevc.c
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2014 - 2015 Hendrik Leppkes
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/dxva2_internal.h b/libavcodec/dxva2_internal.h
index 30aec8b..ad89f82 100644
--- a/libavcodec/dxva2_internal.h
+++ b/libavcodec/dxva2_internal.h
@@ -3,25 +3,25 @@
  *
  * copyright (c) 2010 Laurent Aimar
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVCODEC_DXVA_INTERNAL_H
-#define AVCODEC_DXVA_INTERNAL_H
+#ifndef AVCODEC_DXVA2_INTERNAL_H
+#define AVCODEC_DXVA2_INTERNAL_H
 
 #define COBJMACROS
 
@@ -36,6 +36,7 @@
 #if CONFIG_D3D11VA
 #include "d3d11va.h"
 #endif
+
 #if HAVE_DXVA_H
 /* When targeting WINAPI_FAMILY_PHONE_APP or WINAPI_FAMILY_APP, dxva.h
  * defines nothing. Force the struct definitions to be visible. */
@@ -117,4 +118,4 @@ int ff_dxva2_common_end_frame(AVCodecContext *, AVFrame *,
                                                   DECODER_BUFFER_DESC *bs,
                                                   DECODER_BUFFER_DESC *slice));
 
-#endif /* AVCODEC_DXVA_INTERNAL_H */
+#endif /* AVCODEC_DXVA2_INTERNAL_H */
diff --git a/libavcodec/dxva2_mpeg2.c b/libavcodec/dxva2_mpeg2.c
index 2d88f9b..c2f0b58 100644
--- a/libavcodec/dxva2_mpeg2.c
+++ b/libavcodec/dxva2_mpeg2.c
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2010 Laurent Aimar
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -116,10 +116,10 @@ static void fill_quantization_matrices(AVCodecContext *avctx,
         qm->bNewQmatrix[i] = 1;
     for (i = 0; i < 64; i++) {
         int n = s->idsp.idct_permutation[ff_zigzag_direct[i]];
-        qm->Qmatrix[0][i] = s->intra_matrix[n];;
-        qm->Qmatrix[1][i] = s->inter_matrix[n];;
-        qm->Qmatrix[2][i] = s->chroma_intra_matrix[n];;
-        qm->Qmatrix[3][i] = s->chroma_inter_matrix[n];;
+        qm->Qmatrix[0][i] = s->intra_matrix[n];
+        qm->Qmatrix[1][i] = s->inter_matrix[n];
+        qm->Qmatrix[2][i] = s->chroma_intra_matrix[n];
+        qm->Qmatrix[3][i] = s->chroma_inter_matrix[n];
     }
 }
 
@@ -146,8 +146,7 @@ static void fill_slice(AVCodecContext *avctx,
     init_get_bits(&gb, &buffer[4], 8 * (size - 4));
 
     slice->wQuantizerScaleCode = get_bits(&gb, 5);
-    while (get_bits1(&gb))
-        skip_bits(&gb, 8);
+    skip_1stop_8data_bits(&gb);
 
     slice->wMBbitOffset        = 4 * 8 + get_bits_count(&gb);
 }
diff --git a/libavcodec/dxva2_vc1.c b/libavcodec/dxva2_vc1.c
index d170e18..7cbbc7e 100644
--- a/libavcodec/dxva2_vc1.c
+++ b/libavcodec/dxva2_vc1.c
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2010 Laurent Aimar
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,6 +43,15 @@ static void fill_picture_parameters(AVCodecContext *avctx,
 {
     const MpegEncContext *s = &v->s;
     const Picture *current_picture = s->current_picture_ptr;
+    int intcomp = 0;
+
+    // determine if intensity compensation is needed
+    if (s->pict_type == AV_PICTURE_TYPE_P) {
+      if ((v->fcm == ILACE_FRAME && v->intcomp) || (v->fcm != ILACE_FRAME && v->mv_mode == MV_PMODE_INTENSITY_COMP)) {
+        if (v->lumscale != 32 || v->lumshift != 0 || (s->picture_structure != PICT_FRAME && (v->lumscale2 != 32 || v->lumshift2 != 0)))
+          intcomp = 1;
+      }
+    }
 
     memset(pp, 0, sizeof(*pp));
     pp->wDecodedPictureIndex    =
@@ -73,13 +82,13 @@ static void fill_picture_parameters(AVCodecContext *avctx,
         pp->bPicStructure      |= 0x01;
     if (s->picture_structure & PICT_BOTTOM_FIELD)
         pp->bPicStructure      |= 0x02;
-    pp->bSecondField            = v->interlace && v->fcm != ILACE_FIELD && !s->first_field;
+    pp->bSecondField            = v->interlace && v->fcm == ILACE_FIELD && v->second_field;
     pp->bPicIntra               = s->pict_type == AV_PICTURE_TYPE_I || v->bi_type;
     pp->bPicBackwardPrediction  = s->pict_type == AV_PICTURE_TYPE_B && !v->bi_type;
     pp->bBidirectionalAveragingMode = (1                                           << 7) |
                                       ((DXVA_CONTEXT_CFG_INTRARESID(avctx, ctx) != 0) << 6) |
                                       ((DXVA_CONTEXT_CFG_RESIDACCEL(avctx, ctx) != 0) << 5) |
-                                      ((v->lumscale != 32 || v->lumshift != 0)     << 4) |
+                                      (intcomp                                     << 4) |
                                       ((v->profile == PROFILE_ADVANCED)            << 3);
     pp->bMVprecisionAndChromaRelation = ((v->mv_mode == MV_PMODE_1MV_HPEL_BILIN) << 3) |
                                         (1                                       << 2) |
@@ -127,15 +136,25 @@ static void fill_picture_parameters(AVCodecContext *avctx,
                                   (v->range_mapuv_flag << 3) |
                                   (v->range_mapuv          );
     pp->bPicBinPB               = 0;
-    pp->bMV_RPS                 = 0;
-    pp->bReservedBits           = 0;
+    pp->bMV_RPS                 = (v->fcm == ILACE_FIELD && pp->bPicBackwardPrediction) ? v->refdist + 9 : 0;
+    pp->bReservedBits           = v->pq;
     if (s->picture_structure == PICT_FRAME) {
-        pp->wBitstreamFcodes        = v->lumscale;
-        pp->wBitstreamPCEelements   = v->lumshift;
+        if (intcomp) {
+            pp->wBitstreamFcodes      = v->lumscale;
+            pp->wBitstreamPCEelements = v->lumshift;
+        } else {
+            pp->wBitstreamFcodes      = 32;
+            pp->wBitstreamPCEelements = 0;
+        }
     } else {
         /* Syntax: (top_field_param << 8) | bottom_field_param */
-        pp->wBitstreamFcodes        = (v->lumscale << 8) | v->lumscale;
-        pp->wBitstreamPCEelements   = (v->lumshift << 8) | v->lumshift;
+        if (intcomp) {
+            pp->wBitstreamFcodes      = (v->lumscale << 8) | v->lumscale2;
+            pp->wBitstreamPCEelements = (v->lumshift << 8) | v->lumshift2;
+        } else {
+            pp->wBitstreamFcodes      = (32 << 8) | 32;
+            pp->wBitstreamPCEelements = 0;
+        }
     }
     pp->bBitstreamConcealmentNeed   = 0;
     pp->bBitstreamConcealmentMethod = 0;
@@ -153,8 +172,8 @@ static void fill_slice(AVCodecContext *avctx, DXVA_SliceInfo *slice,
     slice->dwSliceBitsInBuffer = 8 * size;
     slice->dwSliceDataLocation = position;
     slice->bStartCodeBitOffset = 0;
-    slice->bReservedBits       = 0;
-    slice->wMBbitOffset        = get_bits_count(&s->gb);
+    slice->bReservedBits       = (s->pict_type == AV_PICTURE_TYPE_B && !v->bi_type) ? v->bfraction_lut_index + 9 : 0;
+    slice->wMBbitOffset        = v->p_frame_skipped ? 0xffff : get_bits_count(&s->gb) + (avctx->codec_id == AV_CODEC_ID_VC1 ? 32 : 0);
     slice->wNumberMBsInSlice   = s->mb_width * s->mb_height; /* XXX We assume 1 slice */
     slice->wQuantizerScaleCode = v->pq;
     slice->wBadSliceChopping   = 0;
@@ -206,8 +225,11 @@ static int commit_bitstream_and_slice_buffer(AVCodecContext *avctx,
     dxva_data = dxva_data_ptr;
     result = data_size <= dxva_size ? 0 : -1;
     if (!result) {
-        if (start_code_size > 0)
+        if (start_code_size > 0) {
             memcpy(dxva_data, start_code, start_code_size);
+            if (v->second_field)
+                dxva_data[3] = 0x0c;
+        }
         memcpy(dxva_data + start_code_size,
                ctx_pic->bitstream + slice->dwSliceDataLocation, slice_size);
         if (padding > 0)
diff --git a/libavcodec/dxva2_vp9.c b/libavcodec/dxva2_vp9.c
new file mode 100644
index 0000000..0c4996c
--- /dev/null
+++ b/libavcodec/dxva2_vp9.c
@@ -0,0 +1,337 @@
+/*
+ * DXVA2 VP9 HW acceleration.
+ *
+ * copyright (c) 2015 Hendrik Leppkes
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+#include "libavutil/pixdesc.h"
+
+#include "vp9.h"
+
+// The headers above may include w32threads.h, which uses the original
+// _WIN32_WINNT define, while dxva2_internal.h redefines it to target a
+// potentially newer version.
+#include "dxva2_internal.h"
+
+struct vp9_dxva2_picture_context {
+    DXVA_PicParams_VP9    pp;             /* picture parameters, filled in dxva2_vp9_start_frame() */
+    DXVA_Slice_VPx_Short  slice;          /* slice control entry, rewritten by each dxva2_vp9_decode_slice() call */
+    const uint8_t         *bitstream;     /* first buffer passed to decode_slice for the frame; NULL until then */
+    unsigned              bitstream_size; /* sum of the sizes passed to decode_slice for the frame */
+};
+
+/* Pack a 7-bit surface index and a 1-bit flag (stored in bit 7) into pic->bPicEntry. */
+static void fill_picture_entry(DXVA_PicEntry_VPx *pic, unsigned index, unsigned flag)
+{
+    av_assert0((index & 0x7f) == index && (flag & 0x01) == flag);
+    pic->bPicEntry = (flag << 7) | index;
+}
+
+static int fill_picture_parameters(const AVCodecContext *avctx, AVDXVAContext *ctx, const VP9SharedContext *h,
+                                    DXVA_PicParams_VP9 *pp)
+{ /* Zeroes *pp, then fills it from h->h, h->refs and the sw_pix_fmt descriptor; returns 0, or -1 without a descriptor. */
+    int i;
+    const AVPixFmtDescriptor * pixdesc = av_pix_fmt_desc_get(avctx->sw_pix_fmt);
+
+    if (!pixdesc) /* chroma subsampling and bit depth below are read from the descriptor */
+        return -1;
+
+    memset(pp, 0, sizeof(*pp)); /* every field not assigned below stays 0 */
+
+    fill_picture_entry(&pp->CurrPic, ff_dxva2_get_surface_index(avctx, ctx, h->frames[CUR_FRAME].tf.f), 0);
+
+    pp->profile = h->h.profile;
+    pp->wFormatAndPictureInfoFlags = ((h->h.keyframe == 0)   <<  0) |
+                                     ((h->h.invisible == 0)  <<  1) |
+                                     (h->h.errorres          <<  2) |
+                                     (pixdesc->log2_chroma_w <<  3) | /* subsampling_x */
+                                     (pixdesc->log2_chroma_h <<  4) | /* subsampling_y */
+                                     (0                      <<  5) | /* extra_plane */
+                                     (h->h.refreshctx        <<  6) |
+                                     (h->h.parallelmode      <<  7) |
+                                     (h->h.intraonly         <<  8) |
+                                     (h->h.framectxid        <<  9) |
+                                     (h->h.resetctx          << 11) |
+                                     ((h->h.keyframe ? 0 : h->h.highprecisionmvs) << 13) | /* forced to 0 on keyframes */
+                                     (0                      << 14);  /* ReservedFormatInfo2Bits */
+
+    pp->width  = avctx->width;
+    pp->height = avctx->height;
+    pp->BitDepthMinus8Luma   = pixdesc->comp[0].depth - 8;
+    pp->BitDepthMinus8Chroma = pixdesc->comp[1].depth - 8;
+    /* swap 0/1 to match the reference */
+    pp->interp_filter = h->h.filtermode ^ (h->h.filtermode <= 1);
+    pp->Reserved8Bits = 0;
+
+    for (i = 0; i < 8; i++) { /* one entry per h->refs[] slot; slots without a buffer get 0xFF */
+        if (h->refs[i].f->buf[0]) {
+            fill_picture_entry(&pp->ref_frame_map[i], ff_dxva2_get_surface_index(avctx, ctx, h->refs[i].f), 0);
+            pp->ref_frame_coded_width[i]  = h->refs[i].f->width;
+            pp->ref_frame_coded_height[i] = h->refs[i].f->height;
+        } else
+            pp->ref_frame_map[i].bPicEntry = 0xFF;
+    }
+
+    for (i = 0; i < 3; i++) { /* the three references selected by h->h.refidx[] */
+        uint8_t refidx = h->h.refidx[i];
+        if (h->refs[refidx].f->buf[0])
+            fill_picture_entry(&pp->frame_refs[i], ff_dxva2_get_surface_index(avctx, ctx, h->refs[refidx].f), 0);
+        else
+            pp->frame_refs[i].bPicEntry = 0xFF;
+
+        pp->ref_frame_sign_bias[i + 1] = h->h.signbias[i]; /* entry 0 keeps the 0 from the memset */
+    }
+
+    pp->filter_level    = h->h.filter.level;
+    pp->sharpness_level = h->h.filter.sharpness;
+
+    pp->wControlInfoFlags = (h->h.lf_delta.enabled   << 0) |
+                            (h->h.lf_delta.updated   << 1) |
+                            (h->h.use_last_frame_mvs << 2) |
+                            (0                       << 3);  /* ReservedControlInfo5Bits */
+
+    for (i = 0; i < 4; i++)
+        pp->ref_deltas[i]  = h->h.lf_delta.ref[i];
+
+    for (i = 0; i < 2; i++)
+        pp->mode_deltas[i]  = h->h.lf_delta.mode[i];
+
+    pp->base_qindex   = h->h.yac_qi;
+    pp->y_dc_delta_q  = h->h.ydc_qdelta;
+    pp->uv_dc_delta_q = h->h.uvdc_qdelta;
+    pp->uv_ac_delta_q = h->h.uvac_qdelta;
+
+    /* segmentation data */
+    pp->stVP9Segments.wSegmentInfoFlags = (h->h.segmentation.enabled       << 0) |
+                                          (h->h.segmentation.update_map    << 1) |
+                                          (h->h.segmentation.temporal      << 2) |
+                                          (h->h.segmentation.absolute_vals << 3) |
+                                          (0                               << 4);  /* ReservedSegmentFlags4Bits */
+
+    for (i = 0; i < 7; i++)
+        pp->stVP9Segments.tree_probs[i] = h->h.segmentation.prob[i];
+
+    if (h->h.segmentation.temporal) /* pred_probs are copied only with temporal segmentation */
+        for (i = 0; i < 3; i++)
+            pp->stVP9Segments.pred_probs[i] = h->h.segmentation.pred_prob[i];
+    else /* otherwise every pred_probs byte is set to 255 */
+        memset(pp->stVP9Segments.pred_probs, 255, sizeof(pp->stVP9Segments.pred_probs));
+
+    for (i = 0; i < 8; i++) { /* per-segment feature enable bits and their values */
+        pp->stVP9Segments.feature_mask[i] = (h->h.segmentation.feat[i].q_enabled    << 0) |
+                                            (h->h.segmentation.feat[i].lf_enabled   << 1) |
+                                            (h->h.segmentation.feat[i].ref_enabled  << 2) |
+                                            (h->h.segmentation.feat[i].skip_enabled << 3);
+
+        pp->stVP9Segments.feature_data[i][0] = h->h.segmentation.feat[i].q_val;
+        pp->stVP9Segments.feature_data[i][1] = h->h.segmentation.feat[i].lf_val;
+        pp->stVP9Segments.feature_data[i][2] = h->h.segmentation.feat[i].ref_val;
+        pp->stVP9Segments.feature_data[i][3] = 0; /* no data for skip */
+    }
+
+    pp->log2_tile_cols = h->h.tiling.log2_tile_cols;
+    pp->log2_tile_rows = h->h.tiling.log2_tile_rows;
+
+    pp->uncompressed_header_size_byte_aligned = h->h.uncompressed_header_size;
+    pp->first_partition_size = h->h.compressed_header_size;
+
+    pp->StatusReportFeedbackNumber = 1 + DXVA_CONTEXT_REPORT_ID(avctx, ctx)++; /* post-increments the context's report id */
+    return 0;
+}
+
+/* Reset *slice, then record a chunk of `size` bytes located at byte offset `position`. */
+static void fill_slice_short(DXVA_Slice_VPx_Short *slice, unsigned position, unsigned size)
+{
+    memset(slice, 0, sizeof *slice);
+    slice->wBadSliceChopping     = 0; /* kept explicit although the memset already cleared it */
+    slice->SliceBytesInBuffer    = size;
+    slice->BSNALunitDataLocation = position;
+}
+
+/* Copy the frame bitstream into the decoder's bitstream buffer, zero-pad it towards a 128-byte
+ * boundary, describe it in *bs and commit the slice control data through *sc. The decoder buffer
+ * is released before an oversized frame is reported. Returns -1 on failure, else ff_dxva2_commit_buffer(). */
+static int commit_bitstream_and_slice_buffer(AVCodecContext *avctx,
+                                             DECODER_BUFFER_DESC *bs,
+                                             DECODER_BUFFER_DESC *sc)
+{
+    const VP9SharedContext *h = avctx->priv_data;
+    AVDXVAContext *ctx = avctx->hwaccel_context;
+    struct vp9_dxva2_picture_context *ctx_pic = h->frames[CUR_FRAME].hwaccel_picture_private;
+    void     *dxva_data_ptr = NULL;
+    uint8_t  *dxva_data;
+    unsigned dxva_size = 0; /* remains 0 unless one of the branches below acquires a buffer */
+    unsigned padding, type;
+    int      fits;
+
+#if CONFIG_D3D11VA
+    if (avctx->pix_fmt == AV_PIX_FMT_D3D11VA_VLD) {
+        type = D3D11_VIDEO_DECODER_BUFFER_BITSTREAM;
+        if (FAILED(ID3D11VideoContext_GetDecoderBuffer(D3D11VA_CONTEXT(ctx)->video_context,
+                                                       D3D11VA_CONTEXT(ctx)->decoder,
+                                                       type,
+                                                       &dxva_size, &dxva_data_ptr)))
+            return -1;
+    }
+#endif
+#if CONFIG_DXVA2
+    if (avctx->pix_fmt == AV_PIX_FMT_DXVA2_VLD) {
+        type = DXVA2_BitStreamDateBufferType;
+        if (FAILED(IDirectXVideoDecoder_GetBuffer(DXVA2_CONTEXT(ctx)->decoder,
+                                                  type,
+                                                  &dxva_data_ptr, &dxva_size)))
+            return -1;
+    }
+#endif
+
+    dxva_data = dxva_data_ptr;
+    fits      = ctx_pic->slice.SliceBytesInBuffer <= dxva_size;
+    if (fits) {
+        memcpy(dxva_data, ctx_pic->bitstream, ctx_pic->slice.SliceBytesInBuffer);
+        padding = FFMIN(128 - ((ctx_pic->slice.SliceBytesInBuffer) & 127), dxva_size - ctx_pic->slice.SliceBytesInBuffer);
+        if (padding > 0) {
+            memset(dxva_data + ctx_pic->slice.SliceBytesInBuffer, 0, padding);
+            ctx_pic->slice.SliceBytesInBuffer += padding;
+        }
+    }
+
+#if CONFIG_D3D11VA
+    if (avctx->pix_fmt == AV_PIX_FMT_D3D11VA_VLD)
+        if (FAILED(ID3D11VideoContext_ReleaseDecoderBuffer(D3D11VA_CONTEXT(ctx)->video_context, D3D11VA_CONTEXT(ctx)->decoder, type)))
+            return -1;
+#endif
+#if CONFIG_DXVA2
+    if (avctx->pix_fmt == AV_PIX_FMT_DXVA2_VLD)
+        if (FAILED(IDirectXVideoDecoder_ReleaseBuffer(DXVA2_CONTEXT(ctx)->decoder, type)))
+            return -1;
+#endif
+    if (!fits) { /* reported only after the buffer acquired above has been released */
+        av_log(avctx, AV_LOG_ERROR, "Failed to build bitstream\n");
+        return -1;
+    }
+
+#if CONFIG_D3D11VA
+    if (avctx->pix_fmt == AV_PIX_FMT_D3D11VA_VLD) {
+        D3D11_VIDEO_DECODER_BUFFER_DESC *dsc11 = bs;
+        memset(dsc11, 0, sizeof(*dsc11));
+        dsc11->BufferType           = type;
+        dsc11->DataSize             = ctx_pic->slice.SliceBytesInBuffer;
+        dsc11->NumMBsInBuffer       = 0;
+        type = D3D11_VIDEO_DECODER_BUFFER_SLICE_CONTROL; /* reused for the slice control commit below */
+    }
+#endif
+#if CONFIG_DXVA2
+    if (avctx->pix_fmt == AV_PIX_FMT_DXVA2_VLD) {
+        DXVA2_DecodeBufferDesc *dsc2 = bs;
+        memset(dsc2, 0, sizeof(*dsc2));
+        dsc2->CompressedBufferType = type;
+        dsc2->DataSize             = ctx_pic->slice.SliceBytesInBuffer;
+        dsc2->NumMBsInBuffer       = 0;
+        type = DXVA2_SliceControlBufferType; /* reused for the slice control commit below */
+    }
+#endif
+
+    return ff_dxva2_commit_buffer(avctx, ctx, sc, type,
+                                  &ctx_pic->slice, sizeof(ctx_pic->slice), 0);
+}
+
+
+/* hwaccel start_frame: check the DXVA context, build the picture parameters for the current
+ * frame and reset the per-frame bitstream tracking. Returns 0 on success, -1 on error. */
+static int dxva2_vp9_start_frame(AVCodecContext *avctx,
+                                 av_unused const uint8_t *buffer, av_unused uint32_t size)
+{
+    const VP9SharedContext *shared = avctx->priv_data;
+    AVDXVAContext *ctx = avctx->hwaccel_context;
+    struct vp9_dxva2_picture_context *ctx_pic = shared->frames[CUR_FRAME].hwaccel_picture_private;
+
+    if (!DXVA_CONTEXT_DECODER(avctx, ctx) || !DXVA_CONTEXT_CFG(avctx, ctx) ||
+        DXVA_CONTEXT_COUNT(avctx, ctx) <= 0)
+        return -1;
+    av_assert0(ctx_pic);
+
+    /* Fill up DXVA_PicParams_VP9; this fails when sw_pix_fmt has no descriptor */
+    if (fill_picture_parameters(avctx, ctx, shared, &ctx_pic->pp) < 0)
+        return -1;
+
+    ctx_pic->bitstream      = NULL; /* no slice data has been submitted for this frame yet */
+    ctx_pic->bitstream_size = 0;
+    return 0;
+}
+
+/* hwaccel decode_slice: keep the first buffer as the bitstream base, add `size` to the running
+ * total and record this buffer's offset and length in the slice control entry. Returns 0. */
+static int dxva2_vp9_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
+{
+    const VP9SharedContext *shared = avctx->priv_data;
+    struct vp9_dxva2_picture_context *ctx_pic = shared->frames[CUR_FRAME].hwaccel_picture_private;
+    unsigned offset;
+
+    if (ctx_pic->bitstream == NULL)
+        ctx_pic->bitstream = buffer;
+    ctx_pic->bitstream_size += size;
+
+    /* offset of this buffer relative to the first one seen for the frame */
+    offset = buffer - ctx_pic->bitstream;
+    fill_slice_short(&ctx_pic->slice, offset, size);
+    return 0;
+}
+
+/* hwaccel end_frame: hand the picture parameters and the bitstream/slice commit callback to
+ * ff_dxva2_common_end_frame(); returns -1 when no slice data was received for the frame. */
+static int dxva2_vp9_end_frame(AVCodecContext *avctx)
+{
+    VP9SharedContext *shared = avctx->priv_data;
+    struct vp9_dxva2_picture_context *ctx_pic = shared->frames[CUR_FRAME].hwaccel_picture_private;
+
+    /* bitstream_size is unsigned, so this only triggers when nothing was submitted */
+    if (!ctx_pic->bitstream_size)
+        return -1;
+
+    return ff_dxva2_common_end_frame(avctx, shared->frames[CUR_FRAME].tf.f,
+                                     &ctx_pic->pp, sizeof(ctx_pic->pp), NULL, 0,
+                                     commit_bitstream_and_slice_buffer);
+}
+
+#if CONFIG_VP9_DXVA2_HWACCEL
+AVHWAccel ff_vp9_dxva2_hwaccel = { /* VP9 hwaccel entry for AV_PIX_FMT_DXVA2_VLD */
+    .name                 = "vp9_dxva2",
+    .id                   = AV_CODEC_ID_VP9,
+    .type                 = AVMEDIA_TYPE_VIDEO,
+    .pix_fmt              = AV_PIX_FMT_DXVA2_VLD,
+    .frame_priv_data_size = sizeof(struct vp9_dxva2_picture_context),
+    .start_frame          = dxva2_vp9_start_frame,
+    .decode_slice         = dxva2_vp9_decode_slice,
+    .end_frame            = dxva2_vp9_end_frame,
+};
+#endif
+
+#if CONFIG_VP9_D3D11VA_HWACCEL
+AVHWAccel ff_vp9_d3d11va_hwaccel = { /* VP9 hwaccel entry for AV_PIX_FMT_D3D11VA_VLD */
+    .name                 = "vp9_d3d11va",
+    .id                   = AV_CODEC_ID_VP9,
+    .type                 = AVMEDIA_TYPE_VIDEO,
+    .pix_fmt              = AV_PIX_FMT_D3D11VA_VLD,
+    .frame_priv_data_size = sizeof(struct vp9_dxva2_picture_context),
+    .start_frame          = dxva2_vp9_start_frame,
+    .decode_slice         = dxva2_vp9_decode_slice,
+    .end_frame            = dxva2_vp9_end_frame,
+};
+#endif
diff --git a/libavcodec/eac3_data.c b/libavcodec/eac3_data.c
index b0416f3..b159e16 100644
--- a/libavcodec/eac3_data.c
+++ b/libavcodec/eac3_data.c
@@ -2,20 +2,20 @@
  * E-AC-3 tables
  * Copyright (c) 2007 Bartlomiej Wolowiec <bartek.wolowiec@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/eac3_data.h b/libavcodec/eac3_data.h
index 4d88ce0..10a67f1 100644
--- a/libavcodec/eac3_data.h
+++ b/libavcodec/eac3_data.h
@@ -2,20 +2,20 @@
  * E-AC-3 tables
  * Copyright (c) 2007 Bartlomiej Wolowiec <bartek.wolowiec@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/eac3dec.c b/libavcodec/eac3dec.c
index fe52d27..47e5aa6 100644
--- a/libavcodec/eac3dec.c
+++ b/libavcodec/eac3dec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Bartlomiej Wolowiec <bartek.wolowiec@gmail.com>
  * Copyright (c) 2008 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -63,7 +63,7 @@ typedef enum {
 
 #define EAC3_SR_CODE_REDUCED  3
 
-void ff_eac3_apply_spectral_extension(AC3DecodeContext *s)
+static void ff_eac3_apply_spectral_extension(AC3DecodeContext *s)
 {
     int bin, bnd, ch, i;
     uint8_t wrapflag[SPX_MAX_BANDS]={1,0,}, num_copy_sections, copy_sizes[SPX_MAX_BANDS];
@@ -101,7 +101,7 @@ void ff_eac3_apply_spectral_extension(AC3DecodeContext *s)
         for (i = 0; i < num_copy_sections; i++) {
             memcpy(&s->transform_coeffs[ch][bin],
                    &s->transform_coeffs[ch][s->spx_dst_start_freq],
-                   copy_sizes[i]*sizeof(float));
+                   copy_sizes[i]*sizeof(INTFLOAT));
             bin += copy_sizes[i];
         }
 
@@ -124,7 +124,7 @@ void ff_eac3_apply_spectral_extension(AC3DecodeContext *s)
             bin = s->spx_src_start_freq - 2;
             for (bnd = 0; bnd < s->num_spx_bands; bnd++) {
                 if (wrapflag[bnd]) {
-                    float *coeffs = &s->transform_coeffs[ch][bin];
+                    INTFLOAT *coeffs = &s->transform_coeffs[ch][bin];
                     coeffs[0] *= atten_tab[0];
                     coeffs[1] *= atten_tab[1];
                     coeffs[2] *= atten_tab[2];
@@ -142,6 +142,11 @@ void ff_eac3_apply_spectral_extension(AC3DecodeContext *s)
         for (bnd = 0; bnd < s->num_spx_bands; bnd++) {
             float nscale = s->spx_noise_blend[ch][bnd] * rms_energy[bnd] * (1.0f / INT32_MIN);
             float sscale = s->spx_signal_blend[ch][bnd];
+#if USE_FIXED
+            // spx_noise_blend and spx_signal_blend are fixed point with 23 fractional bits (FP.23)
+            nscale *= 1.0 / (1<<23);
+            sscale *= 1.0 / (1<<23);
+#endif
             for (i = 0; i < s->spx_band_sizes[bnd]; i++) {
                 float noise  = nscale * (int32_t)av_lfg_get(&s->dith_state);
                 s->transform_coeffs[ch][bin]   *= sscale;
@@ -195,7 +200,7 @@ static void idct6(int pre_mant[6])
     pre_mant[5] = even0 - odd0;
 }
 
-void ff_eac3_decode_transform_coeffs_aht_ch(AC3DecodeContext *s, int ch)
+static void ff_eac3_decode_transform_coeffs_aht_ch(AC3DecodeContext *s, int ch)
 {
     int bin, blk, gs;
     int end_bap, gaq_mode;
@@ -288,7 +293,7 @@ void ff_eac3_decode_transform_coeffs_aht_ch(AC3DecodeContext *s, int ch)
     }
 }
 
-int ff_eac3_parse_header(AC3DecodeContext *s)
+static int ff_eac3_parse_header(AC3DecodeContext *s)
 {
     int i, blk, ch;
     int ac3_exponent_strategy, parse_aht_info, parse_spx_atten_data;
diff --git a/libavcodec/eac3enc.c b/libavcodec/eac3enc.c
index 3aa2d54..e1d61f6 100644
--- a/libavcodec/eac3enc.c
+++ b/libavcodec/eac3enc.c
@@ -2,20 +2,20 @@
  * E-AC-3 encoder
  * Copyright (c) 2011 Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,9 +34,13 @@
 
 #define AC3ENC_TYPE AC3ENC_TYPE_EAC3
 #include "ac3enc_opts_template.c"
-static const AVClass eac3enc_class = { "E-AC-3 Encoder", av_default_item_name,
-                                       ac3_options, LIBAVUTIL_VERSION_INT };
 
+static const AVClass eac3enc_class = {
+    .class_name = "E-AC-3 Encoder",
+    .item_name  = av_default_item_name,
+    .option     = ac3_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
 
 /**
  * LUT for finding a matching frame exponent strategy index from a set of
diff --git a/libavcodec/eac3enc.h b/libavcodec/eac3enc.h
index a92a24c..7d61559 100644
--- a/libavcodec/eac3enc.h
+++ b/libavcodec/eac3enc.h
@@ -2,20 +2,20 @@
  * E-AC-3 encoder
  * Copyright (c) 2011 Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/eacmv.c b/libavcodec/eacmv.c
index 9668f64..047be81 100644
--- a/libavcodec/eacmv.c
+++ b/libavcodec/eacmv.c
@@ -2,20 +2,20 @@
  * Electronic Arts CMV Video Decoder
  * Copyright (c) 2007-2008 Peter Ross
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
@@ -44,6 +44,7 @@ typedef struct CmvContext {
 
 static av_cold int cmv_decode_init(AVCodecContext *avctx){
     CmvContext *s = avctx->priv_data;
+
     s->avctx = avctx;
     avctx->pix_fmt = AV_PIX_FMT_PAL8;
 
@@ -160,7 +161,7 @@ static int cmv_process_header(CmvContext *s, const uint8_t *buf, const uint8_t *
 
     buf += 16;
     for (i=pal_start; i<pal_start+pal_count && i<AVPALETTE_COUNT && buf_end - buf >= 3; i++) {
-        s->palette[i] = AV_RB24(buf);
+        s->palette[i] = 0xFFU << 24 | AV_RB24(buf);
         buf += 3;
     }
 
@@ -185,19 +186,20 @@ static int cmv_decode_frame(AVCodecContext *avctx,
         return AVERROR_INVALIDDATA;
 
     if (AV_RL32(buf)==MVIh_TAG||AV_RB32(buf)==MVIh_TAG) {
+        unsigned size = AV_RL32(buf + 4);
         ret = cmv_process_header(s, buf+EA_PREAMBLE_SIZE, buf_end);
         if (ret < 0)
             return ret;
-        return buf_size;
+        if (size > buf_end - buf - EA_PREAMBLE_SIZE)
+            return -1;
+        buf += size;
     }
 
     if (av_image_check_size(s->width, s->height, 0, s->avctx))
         return -1;
 
-    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
         return ret;
-    }
 
     memcpy(frame->data[1], s->palette, AVPALETTE_SIZE);
 
diff --git a/libavcodec/eaidct.c b/libavcodec/eaidct.c
index 5b2db44..e4840f2 100644
--- a/libavcodec/eaidct.c
+++ b/libavcodec/eaidct.c
@@ -2,20 +2,20 @@
  * Electronic Arts TGQ/TQI/MAD IDCT algorithm
  * Copyright (c) 2007-2008 Peter Ross <pross@xvid.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/eaidct.h b/libavcodec/eaidct.h
index e78de04..6b9ec1c 100644
--- a/libavcodec/eaidct.h
+++ b/libavcodec/eaidct.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/eamad.c b/libavcodec/eamad.c
index 3e8d4fd..bb0f005 100644
--- a/libavcodec/eamad.c
+++ b/libavcodec/eamad.c
@@ -2,20 +2,20 @@
  * Electronic Arts Madcow Video Decoder
  * Copyright (c) 2007-2009 Peter Ross
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
@@ -94,15 +94,21 @@ static inline void comp_block(MadContext *t, AVFrame *frame,
                               int j, int mv_x, int mv_y, int add)
 {
     if (j < 4) {
+        unsigned offset = (mb_y*16 + ((j&2)<<2) + mv_y)*t->last_frame->linesize[0] + mb_x*16 + ((j&1)<<3) + mv_x;
+        if (offset >= (t->avctx->height - 7) * t->last_frame->linesize[0] - 7)
+            return;
         comp(frame->data[0] + (mb_y*16 + ((j&2)<<2))*frame->linesize[0] + mb_x*16 + ((j&1)<<3),
              frame->linesize[0],
-             t->last_frame->data[0] + (mb_y*16 + ((j&2)<<2) + mv_y)*t->last_frame->linesize[0] + mb_x*16 + ((j&1)<<3) + mv_x,
+             t->last_frame->data[0] + offset,
              t->last_frame->linesize[0], add);
     } else if (!(t->avctx->flags & AV_CODEC_FLAG_GRAY)) {
         int index = j - 3;
+        unsigned offset = (mb_y * 8 + (mv_y/2))*t->last_frame->linesize[index] + mb_x * 8 + (mv_x/2);
+        if (offset >= (t->avctx->height/2 - 7) * t->last_frame->linesize[index] - 7)
+            return;
         comp(frame->data[index] + (mb_y*8)*frame->linesize[index] + mb_x * 8,
              frame->linesize[index],
-             t->last_frame->data[index] + (mb_y * 8 + (mv_y/2))*t->last_frame->linesize[index] + mb_x * 8 + (mv_x/2),
+             t->last_frame->data[index] + offset,
              t->last_frame->linesize[index], add);
     }
 }
@@ -122,7 +128,7 @@ static inline void idct_put(MadContext *t, AVFrame *frame, int16_t *block,
     }
 }
 
-static inline void decode_block_intra(MadContext *s, int16_t * block)
+static inline int decode_block_intra(MadContext *s, int16_t * block)
 {
     int level, i, j, run;
     RLTable *rl = &ff_rl_mpeg1;
@@ -148,7 +154,7 @@ static inline void decode_block_intra(MadContext *s, int16_t * block)
                 if (i > 63) {
                     av_log(s->avctx, AV_LOG_ERROR,
                            "ac-tex damaged at %d %d\n", s->mb_x, s->mb_y);
-                    return;
+                    return -1;
                 }
                 j = scantable[i];
                 level = (level*quant_matrix[j]) >> 4;
@@ -167,7 +173,7 @@ static inline void decode_block_intra(MadContext *s, int16_t * block)
                 if (i > 63) {
                     av_log(s->avctx, AV_LOG_ERROR,
                            "ac-tex damaged at %d %d\n", s->mb_x, s->mb_y);
-                    return;
+                    return -1;
                 }
                 j = scantable[i];
                 if (level < 0) {
@@ -185,6 +191,7 @@ static inline void decode_block_intra(MadContext *s, int16_t * block)
         }
         CLOSE_READER(re, &s->gb);
     }
+    return 0;
 }
 
 static int decode_motion(GetBitContext *gb)
@@ -198,10 +205,10 @@ static int decode_motion(GetBitContext *gb)
     return value;
 }
 
-static void decode_mb(MadContext *s, AVFrame *frame, int inter)
+static int decode_mb(MadContext *s, AVFrame *frame, int inter)
 {
     int mv_map = 0;
-    int mv_x, mv_y;
+    int av_uninit(mv_x), av_uninit(mv_y);
     int j;
 
     if (inter) {
@@ -210,21 +217,22 @@ static void decode_mb(MadContext *s, AVFrame *frame, int inter)
             mv_map = v ? get_bits(&s->gb, 6) : 63;
             mv_x = decode_motion(&s->gb);
             mv_y = decode_motion(&s->gb);
-        } else {
-            mv_map = 0;
         }
     }
 
     for (j=0; j<6; j++) {
         if (mv_map & (1<<j)) {  // mv_x and mv_y are guarded by mv_map
             int add = 2*decode_motion(&s->gb);
-            comp_block(s, frame, s->mb_x, s->mb_y, j, mv_x, mv_y, add);
+            if (s->last_frame->data[0])
+                comp_block(s, frame, s->mb_x, s->mb_y, j, mv_x, mv_y, add);
         } else {
             s->bdsp.clear_block(s->block);
-            decode_block_intra(s, s->block);
+            if(decode_block_intra(s, s->block) < 0)
+                return -1;
             idct_put(s, frame, s->block, s->mb_x, s->mb_y, j);
         }
     }
+    return 0;
 }
 
 static void calc_quant_matrix(MadContext *s, int qscale)
@@ -269,16 +277,21 @@ static int decode_frame(AVCodecContext *avctx,
         return AVERROR_INVALIDDATA;
     }
 
+    if (width < 16 || height < 16) {
+        av_log(avctx, AV_LOG_ERROR, "Dimensions too small\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     if (avctx->width != width || avctx->height != height) {
         av_frame_unref(s->last_frame);
+        if((width * height)/2048*7 > bytestream2_get_bytes_left(&gb))
+            return AVERROR_INVALIDDATA;
         if ((ret = ff_set_dimensions(avctx, width, height)) < 0)
             return ret;
     }
 
-    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
         return ret;
-    }
 
     if (inter && !s->last_frame->data[0]) {
         av_log(avctx, AV_LOG_WARNING, "Missing reference frame.\n");
@@ -299,11 +312,13 @@ static int decode_frame(AVCodecContext *avctx,
         return AVERROR(ENOMEM);
     s->bbdsp.bswap16_buf(s->bitstream_buf, (const uint16_t *)(buf + bytestream2_tell(&gb)),
                          bytestream2_get_bytes_left(&gb) / 2);
+    memset((uint8_t*)s->bitstream_buf + bytestream2_get_bytes_left(&gb), 0, AV_INPUT_BUFFER_PADDING_SIZE);
     init_get_bits(&s->gb, s->bitstream_buf, 8*(bytestream2_get_bytes_left(&gb)));
 
     for (s->mb_y=0; s->mb_y < (avctx->height+15)/16; s->mb_y++)
         for (s->mb_x=0; s->mb_x < (avctx->width +15)/16; s->mb_x++)
-            decode_mb(s, frame, inter);
+            if(decode_mb(s, frame, inter) < 0)
+                return AVERROR_INVALIDDATA;
 
     *got_frame = 1;
 
@@ -320,7 +335,7 @@ static av_cold int decode_end(AVCodecContext *avctx)
 {
     MadContext *t = avctx->priv_data;
     av_frame_free(&t->last_frame);
-    av_free(t->bitstream_buf);
+    av_freep(&t->bitstream_buf);
     return 0;
 }
 
diff --git a/libavcodec/eatgq.c b/libavcodec/eatgq.c
index a0496a0..f8a47cb 100644
--- a/libavcodec/eatgq.c
+++ b/libavcodec/eatgq.c
@@ -2,20 +2,20 @@
  * Electronic Arts TGQ Video Decoder
  * Copyright (c) 2007-2008 Peter Ross <pross@xvid.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
@@ -148,7 +148,7 @@ static void tgq_idct_put_mb_dconly(TgqContext *s, AVFrame *frame,
     }
 }
 
-static void tgq_decode_mb(TgqContext *s, AVFrame *frame, int mb_y, int mb_x)
+static int tgq_decode_mb(TgqContext *s, AVFrame *frame, int mb_y, int mb_x)
 {
     int mode;
     int i;
@@ -157,7 +157,10 @@ static void tgq_decode_mb(TgqContext *s, AVFrame *frame, int mb_y, int mb_x)
     mode = bytestream2_get_byte(&s->gb);
     if (mode > 12) {
         GetBitContext gb;
-        init_get_bits(&gb, s->gb.buffer, FFMIN(s->gb.buffer_end - s->gb.buffer, mode) * 8);
+        int ret = init_get_bits8(&gb, s->gb.buffer, FFMIN(bytestream2_get_bytes_left(&s->gb), mode));
+        if (ret < 0)
+            return ret;
+
         for (i = 0; i < 6; i++)
             tgq_decode_block(s, s->block[i], &gb);
         tgq_idct_put_mb(s, s->block, frame, mb_x, mb_y);
@@ -176,9 +179,11 @@ static void tgq_decode_mb(TgqContext *s, AVFrame *frame, int mb_y, int mb_x)
             }
         } else {
             av_log(s->avctx, AV_LOG_ERROR, "unsupported mb mode %i\n", mode);
+            return -1;
         }
         tgq_idct_put_mb_dconly(s, frame, mb_x, mb_y, dc);
     }
+    return 0;
 }
 
 static void tgq_calculate_qtable(TgqContext *s, int quant)
@@ -201,12 +206,13 @@ static int tgq_decode_frame(AVCodecContext *avctx,
     TgqContext *s      = avctx->priv_data;
     AVFrame *frame     = data;
     int x, y, ret;
-    int big_endian = AV_RL32(&buf[4]) > 0x000FFFFF;
+    int big_endian;
 
     if (buf_size < 16) {
         av_log(avctx, AV_LOG_WARNING, "truncated header\n");
         return AVERROR_INVALIDDATA;
     }
+    big_endian = AV_RL32(&buf[4]) > 0x000FFFFF;
     bytestream2_init(&s->gb, buf + 8, buf_size - 8);
     if (big_endian) {
         s->width  = bytestream2_get_be16u(&s->gb);
@@ -223,16 +229,15 @@ static int tgq_decode_frame(AVCodecContext *avctx,
     tgq_calculate_qtable(s, bytestream2_get_byteu(&s->gb));
     bytestream2_skip(&s->gb, 3);
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     frame->key_frame = 1;
     frame->pict_type = AV_PICTURE_TYPE_I;
 
     for (y = 0; y < FFALIGN(avctx->height, 16) >> 4; y++)
         for (x = 0; x < FFALIGN(avctx->width, 16) >> 4; x++)
-            tgq_decode_mb(s, frame, y, x);
+            if (tgq_decode_mb(s, frame, y, x) < 0)
+                return AVERROR_INVALIDDATA;
 
     *got_frame = 1;
 
diff --git a/libavcodec/eatgv.c b/libavcodec/eatgv.c
index 4faae50..882bf07 100644
--- a/libavcodec/eatgv.c
+++ b/libavcodec/eatgv.c
@@ -2,20 +2,20 @@
  * Electronic Arts TGV Video Decoder
  * Copyright (c) 2007-2008 Peter Ross
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
@@ -81,7 +81,7 @@ static int unpack(const uint8_t *src, const uint8_t *src_end,
     else
         src += 2;
 
-    if (src + 3 > src_end)
+    if (src_end - src < 3)
         return AVERROR_INVALIDDATA;
     size = AV_RB24(src);
     src += 3;
@@ -156,7 +156,7 @@ static int tgv_decode_inter(TgvContext *s, AVFrame *frame,
     int mvbits;
     const uint8_t *blocks_raw;
 
-    if (buf + 12 > buf_end)
+    if(buf_end - buf < 12)
         return AVERROR_INVALIDDATA;
 
     num_mvs           = AV_RL16(&buf[0]);
@@ -173,9 +173,11 @@ static int tgv_decode_inter(TgvContext *s, AVFrame *frame,
 
     /* allocate codebook buffers as necessary */
     if (num_mvs > s->num_mvs) {
-        int err = av_reallocp(&s->mv_codebook, num_mvs * 2 * sizeof(int));
-        if (err < 0)
+        int err = av_reallocp_array(&s->mv_codebook, num_mvs, sizeof(*s->mv_codebook));
+        if (err < 0) {
+            s->num_mvs = 0;
             return err;
+        }
         s->num_mvs = num_mvs;
     }
 
@@ -191,7 +193,7 @@ static int tgv_decode_inter(TgvContext *s, AVFrame *frame,
     /* read motion vectors */
     mvbits = (num_mvs * 2 * 10 + 31) & ~31;
 
-    if (buf + (mvbits >> 3) + 16 * num_blocks_raw + 8 * num_blocks_packed > buf_end)
+    if (buf_end - buf < (mvbits>>3) + 16*num_blocks_raw + 8*num_blocks_packed)
         return AVERROR_INVALIDDATA;
 
     init_get_bits(&gb, buf, mvbits);
@@ -231,8 +233,10 @@ static int tgv_decode_inter(TgvContext *s, AVFrame *frame,
                 int my = y * 4 + s->mv_codebook[vector][1];
 
                 if (mx < 0 || mx + 4 > s->avctx->width ||
-                    my < 0 || my + 4 > s->avctx->height)
+                    my < 0 || my + 4 > s->avctx->height) {
+                    av_log(s->avctx, AV_LOG_ERROR, "MV %d %d out of picture\n", mx, my);
                     continue;
+                }
 
                 src = s->last_frame->data[0] + mx + my * s->last_frame->linesize[0];
                 src_stride = s->last_frame->linesize[0];
@@ -267,12 +271,15 @@ static int tgv_decode_frame(AVCodecContext *avctx,
     AVFrame *frame         = data;
     int chunk_type, ret;
 
+    if (buf_end - buf < EA_PREAMBLE_SIZE)
+        return AVERROR_INVALIDDATA;
+
     chunk_type = AV_RL32(&buf[0]);
     buf       += EA_PREAMBLE_SIZE;
 
     if (chunk_type == kVGT_TAG) {
         int pal_count, i;
-        if (buf + 12 > buf_end) {
+        if(buf_end - buf < 12) {
             av_log(avctx, AV_LOG_WARNING, "truncated header\n");
             return AVERROR_INVALIDDATA;
         }
@@ -288,8 +295,8 @@ static int tgv_decode_frame(AVCodecContext *avctx,
 
         pal_count = AV_RL16(&buf[6]);
         buf += 12;
-        for (i = 0; i < pal_count && i < AVPALETTE_COUNT && buf + 2 < buf_end; i++) {
-            s->palette[i] = AV_RB24(buf);
+        for(i = 0; i < pal_count && i < AVPALETTE_COUNT && buf_end - buf >= 3; i++) {
+            s->palette[i] = 0xFFU << 24 | AV_RB24(buf);
             buf += 3;
         }
     }
@@ -305,7 +312,7 @@ static int tgv_decode_frame(AVCodecContext *avctx,
         frame->pict_type = AV_PICTURE_TYPE_I;
 
         if (!s->frame_buffer &&
-            !(s->frame_buffer = av_malloc(s->width * s->height)))
+            !(s->frame_buffer = av_mallocz(s->width * s->height)))
             return AVERROR(ENOMEM);
 
         if (unpack(buf, buf_end, s->frame_buffer, s->avctx->width, s->avctx->height) < 0) {
@@ -343,8 +350,8 @@ static av_cold int tgv_decode_end(AVCodecContext *avctx)
     TgvContext *s = avctx->priv_data;
     av_frame_free(&s->last_frame);
     av_freep(&s->frame_buffer);
-    av_free(s->mv_codebook);
-    av_free(s->block_codebook);
+    av_freep(&s->mv_codebook);
+    av_freep(&s->block_codebook);
     return 0;
 }
 
diff --git a/libavcodec/eatqi.c b/libavcodec/eatqi.c
index f4cad9c..8fd5cdb 100644
--- a/libavcodec/eatqi.c
+++ b/libavcodec/eatqi.c
@@ -2,20 +2,20 @@
  * Electronic Arts TQI Video Decoder
  * Copyright (c) 2007-2009 Peter Ross <pross@xvid.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
@@ -37,6 +37,7 @@
 #include "mpeg12.h"
 
 typedef struct TqiContext {
+    AVCodecContext *avctx;
     GetBitContext gb;
     BlockDSPContext bdsp;
     BswapDSPContext bsdsp;
@@ -79,8 +80,11 @@ static int tqi_decode_mb(TqiContext *t, int16_t (*block)[64])
                                               t->intra_matrix,
                                               t->intra_scantable.permutated,
                                               t->last_dc, block[n], n, 1);
-        if (ret < 0)
+        if (ret < 0) {
+            av_log(t->avctx, AV_LOG_ERROR, "ac-tex damaged at %d %d\n",
+                   t->mb_x, t->mb_y);
             return -1;
+        }
     }
 
     return 0;
@@ -127,6 +131,8 @@ static int tqi_decode_frame(AVCodecContext *avctx,
     AVFrame *frame = data;
     int ret, w, h;
 
+    t->avctx = avctx;
+
     w = AV_RL16(&buf[0]);
     h = AV_RL16(&buf[2]);
     tqi_calculate_qtable(t, buf[4]);
@@ -136,10 +142,8 @@ static int tqi_decode_frame(AVCodecContext *avctx,
     if (ret < 0)
         return ret;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     av_fast_padded_malloc(&t->bitstream_buf, &t->bitstream_buf_size,
                           buf_end - buf);
@@ -155,10 +159,11 @@ static int tqi_decode_frame(AVCodecContext *avctx,
     for (t->mb_y = 0; t->mb_y < (h + 15) / 16; t->mb_y++) {
         for (t->mb_x = 0; t->mb_x < (w + 15) / 16; t->mb_x++) {
             if (tqi_decode_mb(t, t->block) < 0)
-                break;
+                goto end;
             tqi_idct_put(avctx, frame, t->block);
         }
     }
+    end:
 
     *got_frame = 1;
     return buf_size;
@@ -167,7 +172,7 @@ static int tqi_decode_frame(AVCodecContext *avctx,
 static av_cold int tqi_decode_end(AVCodecContext *avctx)
 {
     TqiContext *t = avctx->priv_data;
-    av_free(t->bitstream_buf);
+    av_freep(&t->bitstream_buf);
     return 0;
 }
 
diff --git a/libavcodec/elbg.c b/libavcodec/elbg.c
index 07bb2e3..b6049c9 100644
--- a/libavcodec/elbg.c
+++ b/libavcodec/elbg.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2007 Vitor Sessak <vitor1001@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,7 @@
 
 #include <string.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/lfg.h"
 #include "elbg.h"
@@ -50,7 +51,7 @@ typedef struct elbg_data {
     int *codebook;
     cell **cells;
     int *utility;
-    int *utility_inc;
+    int64_t *utility_inc;
     int *nearest_cb;
     int *points;
     AVLFG *rand_state;
@@ -107,11 +108,20 @@ static int get_high_utility_cell(elbg_data *elbg)
 {
     int i=0;
     /* Using linear search, do binary if it ever turns to be speed critical */
-    int r = av_lfg_get(elbg->rand_state)%elbg->utility_inc[elbg->numCB-1] + 1;
-    while (elbg->utility_inc[i] < r)
+    uint64_t r;
+    /* Draw r in [1, utility_inc[numCB-1]]; a single draw is used when that bound is below INT_MAX. */
+    if (elbg->utility_inc[elbg->numCB-1] < INT_MAX) {
+        r = av_lfg_get(elbg->rand_state) % (unsigned int)elbg->utility_inc[elbg->numCB-1] + 1;
+    } else {
+        r = av_lfg_get(elbg->rand_state);
+        r = (av_lfg_get(elbg->rand_state) + (r<<32)) % elbg->utility_inc[elbg->numCB-1] + 1; /* first draw becomes the high 32 bits */
+    }
+    /* Linear scan for the first index whose utility_inc entry is >= r. */
+    while (elbg->utility_inc[i] < r) {
+        i++;
+    }
 
-    assert(elbg->cells[i]);
+    av_assert2(elbg->cells[i]);
 
     return i;
 }
@@ -226,7 +236,8 @@ static void shift_codebook(elbg_data *elbg, int *indexes,
 
 static void evaluate_utility_inc(elbg_data *elbg)
 {
-    int i, inc=0;
+    int i;
+    int64_t inc=0;
 
     for (i=0; i < elbg->numCB; i++) {
         if (elbg->numCB*elbg->utility[i] > elbg->error)
@@ -323,7 +334,7 @@ static void do_shiftings(elbg_data *elbg)
 
 #define BIG_PRIME 433494437LL
 
-int ff_init_elbg(int *points, int dim, int numpoints, int *codebook,
+int avpriv_init_elbg(int *points, int dim, int numpoints, int *codebook,
                  int numCB, int max_steps, int *closest_cb,
                  AVLFG *rand_state)
 {
@@ -332,7 +343,7 @@ int ff_init_elbg(int *points, int dim, int numpoints, int *codebook,
     if (numpoints > 24*numCB) {
         /* ELBG is very costly for a big number of points. So if we have a lot
            of them, get a good initial codebook to save on iterations       */
-        int *temp_points = av_malloc(dim*(numpoints/8)*sizeof(int));
+        int *temp_points = av_malloc_array(dim, (numpoints/8)*sizeof(int));
         if (!temp_points)
             return AVERROR(ENOMEM);
         for (i=0; i<numpoints/8; i++) {
@@ -340,14 +351,14 @@ int ff_init_elbg(int *points, int dim, int numpoints, int *codebook,
             memcpy(temp_points + i*dim, points + k*dim, dim*sizeof(int));
         }
 
-        ret = ff_init_elbg(temp_points, dim, numpoints / 8, codebook,
-                           numCB, 2 * max_steps, closest_cb, rand_state);
+        ret = avpriv_init_elbg(temp_points, dim, numpoints / 8, codebook,
+                               numCB, 2 * max_steps, closest_cb, rand_state);
         if (ret < 0) {
             av_freep(&temp_points);
             return ret;
         }
-        ret = ff_do_elbg(temp_points, dim, numpoints / 8, codebook,
-                         numCB, 2 * max_steps, closest_cb, rand_state);
+        ret = avpriv_do_elbg(temp_points, dim, numpoints / 8, codebook,
+                             numCB, 2 * max_steps, closest_cb, rand_state);
         av_free(temp_points);
 
     } else  // If not, initialize the codebook with random positions
@@ -357,7 +368,7 @@ int ff_init_elbg(int *points, int dim, int numpoints, int *codebook,
     return ret;
 }
 
-int ff_do_elbg(int *points, int dim, int numpoints, int *codebook,
+int avpriv_do_elbg(int *points, int dim, int numpoints, int *codebook,
                 int numCB, int max_steps, int *closest_cb,
                 AVLFG *rand_state)
 {
@@ -365,9 +376,9 @@ int ff_do_elbg(int *points, int dim, int numpoints, int *codebook,
     elbg_data elbg_d;
     elbg_data *elbg = &elbg_d;
     int i, j, k, last_error, steps = 0, ret = 0;
-    int *dist_cb = av_malloc(numpoints*sizeof(int));
-    int *size_part = av_malloc(numCB*sizeof(int));
-    cell *list_buffer = av_malloc(numpoints*sizeof(cell));
+    int *dist_cb = av_malloc_array(numpoints, sizeof(int));
+    int *size_part = av_malloc_array(numCB, sizeof(int));
+    cell *list_buffer = av_malloc_array(numpoints, sizeof(cell));
     cell *free_cells;
     int best_dist, best_idx = 0;
 
@@ -375,12 +386,12 @@ int ff_do_elbg(int *points, int dim, int numpoints, int *codebook,
     elbg->dim = dim;
     elbg->numCB = numCB;
     elbg->codebook = codebook;
-    elbg->cells = av_malloc(numCB*sizeof(cell *));
-    elbg->utility = av_malloc(numCB*sizeof(int));
+    elbg->cells = av_malloc_array(numCB, sizeof(cell *));
+    elbg->utility = av_malloc_array(numCB, sizeof(int));
     elbg->nearest_cb = closest_cb;
     elbg->points = points;
-    elbg->utility_inc = av_malloc(numCB*sizeof(int));
-    elbg->scratchbuf = av_malloc(5*dim*sizeof(int));
+    elbg->utility_inc = av_malloc_array(numCB, sizeof(*elbg->utility_inc));
+    elbg->scratchbuf = av_malloc_array(5*dim, sizeof(int));
 
     if (!dist_cb || !size_part || !list_buffer || !elbg->cells ||
         !elbg->utility || !elbg->utility_inc || !elbg->scratchbuf) {
diff --git a/libavcodec/elbg.h b/libavcodec/elbg.h
index 3b1587a..f48aa3b 100644
--- a/libavcodec/elbg.h
+++ b/libavcodec/elbg.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2007 Vitor Sessak <vitor1001@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,7 +38,7 @@
  * @param rand_state A random number generator state. Should be already initialized by av_lfg_init().
  * @return < 0 in case of error, 0 otherwise
  */
-int ff_do_elbg(int *points, int dim, int numpoints, int *codebook,
+int avpriv_do_elbg(int *points, int dim, int numpoints, int *codebook,
                int numCB, int num_steps, int *closest_cb,
                AVLFG *rand_state);
 
@@ -46,11 +46,11 @@ int ff_do_elbg(int *points, int dim, int numpoints, int *codebook,
  * Initialize the **codebook vector for the elbg algorithm. If you have already
  * a codebook and you want to refine it, you shouldn't call this function.
  * If numpoints < 8*numCB this function fills **codebook with random numbers.
- * If not, it calls ff_do_elbg for a (smaller) random sample of the points in
- * **points. Get the same parameters as ff_do_elbg.
+ * If not, it calls avpriv_do_elbg for a (smaller) random sample of the points in
+ * **points. Get the same parameters as avpriv_do_elbg.
  * @return < 0 in case of error, 0 otherwise
  */
-int ff_init_elbg(int *points, int dim, int numpoints, int *codebook,
+int avpriv_init_elbg(int *points, int dim, int numpoints, int *codebook,
                  int numCB, int num_steps, int *closest_cb,
                  AVLFG *rand_state);
 
diff --git a/libavcodec/elsdec.c b/libavcodec/elsdec.c
index 10a1a9d..4797965 100644
--- a/libavcodec/elsdec.c
+++ b/libavcodec/elsdec.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2013 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/elsdec.h b/libavcodec/elsdec.h
index 515b49a..139a24a 100644
--- a/libavcodec/elsdec.h
+++ b/libavcodec/elsdec.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2013 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/error_resilience.c b/libavcodec/error_resilience.c
index eb6fd14..23d43e2 100644
--- a/libavcodec/error_resilience.c
+++ b/libavcodec/error_resilience.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
 
 #include <limits.h>
 
+#include "libavutil/atomic.h"
 #include "libavutil/internal.h"
 #include "avcodec.h"
 #include "error_resilience.h"
@@ -44,7 +45,7 @@
 static void set_mv_strides(ERContext *s, int *mv_step, int *stride)
 {
     if (s->avctx->codec_id == AV_CODEC_ID_H264) {
-        assert(s->quarter_sample);
+        av_assert0(s->quarter_sample);
         *mv_step = 4;
         *stride  = s->mb_width * 4;
     } else {
@@ -83,6 +84,8 @@ static void put_dc(ERContext *s, uint8_t *dest_y, uint8_t *dest_cb,
         dcv = 0;
     else if (dcv > 2040)
         dcv = 2040;
+
+    if (dest_cr)
     for (y = 0; y < 8; y++) {
         int x;
         for (x = 0; x < 8; x++) {
@@ -137,11 +140,73 @@ static void guess_dc(ERContext *s, int16_t *dc, int w,
                      int h, int stride, int is_luma)
 {
     int b_x, b_y;
+    int16_t  (*col )[4] = av_malloc_array(stride, h*sizeof( int16_t)*4);
+    uint32_t (*dist)[4] = av_malloc_array(stride, h*sizeof(uint32_t)*4);
+    /* col[p][k] / dist[p][k]: DC value and distance recorded for position p by scan k (k = 0..3, filled below). */
+    if(!col || !dist) {
+        av_log(s->avctx, AV_LOG_ERROR, "guess_dc() is out of memory\n");
+        goto fail;
+    }
+    /* Row scans: a block counts as a source when it is not intra or has no ER_DC_ERROR flag. */
+    for(b_y=0; b_y<h; b_y++){
+        int color= 1024;    /* value stored while no source block has been seen yet */
+        int distance= -1;   /* -1: no source block seen yet in this scan */
+        for(b_x=0; b_x<w; b_x++){ /* ascending x: slot 1 = last source at or before b_x in this row */
+            int mb_index_j= (b_x>>is_luma) + (b_y>>is_luma)*s->mb_stride;
+            int error_j= s->error_status_table[mb_index_j];
+            int intra_j = IS_INTRA(s->cur_pic.mb_type[mb_index_j]);
+            if(intra_j==0 || !(error_j&ER_DC_ERROR)){
+                color= dc[b_x + b_y*stride];
+                distance= b_x;
+            }
+            col [b_x + b_y*stride][1]= color;
+            dist[b_x + b_y*stride][1]= distance >= 0 ? b_x-distance : 9999;
+        }
+        color= 1024;
+        distance= -1;
+        for(b_x=w-1; b_x>=0; b_x--){ /* descending x: slot 0 = last source at or after b_x in this row */
+            int mb_index_j= (b_x>>is_luma) + (b_y>>is_luma)*s->mb_stride;
+            int error_j= s->error_status_table[mb_index_j];
+            int intra_j = IS_INTRA(s->cur_pic.mb_type[mb_index_j]);
+            if(intra_j==0 || !(error_j&ER_DC_ERROR)){
+                color= dc[b_x + b_y*stride];
+                distance= b_x;
+            }
+            col [b_x + b_y*stride][0]= color;
+            dist[b_x + b_y*stride][0]= distance >= 0 ? distance-b_x : 9999;
+        }
+    }
+    for(b_x=0; b_x<w; b_x++){ /* column scans, same source test as the row scans */
+        int color= 1024;
+        int distance= -1;
+        for(b_y=0; b_y<h; b_y++){ /* ascending y: slot 3 = last source at or before b_y in this column */
+            int mb_index_j= (b_x>>is_luma) + (b_y>>is_luma)*s->mb_stride;
+            int error_j= s->error_status_table[mb_index_j];
+            int intra_j = IS_INTRA(s->cur_pic.mb_type[mb_index_j]);
+            if(intra_j==0 || !(error_j&ER_DC_ERROR)){
+                color= dc[b_x + b_y*stride];
+                distance= b_y;
+            }
+            col [b_x + b_y*stride][3]= color;
+            dist[b_x + b_y*stride][3]= distance >= 0 ? b_y-distance : 9999;
+        }
+        color= 1024;
+        distance= -1;
+        for(b_y=h-1; b_y>=0; b_y--){ /* descending y: slot 2 = last source at or after b_y in this column */
+            int mb_index_j= (b_x>>is_luma) + (b_y>>is_luma)*s->mb_stride;
+            int error_j= s->error_status_table[mb_index_j];
+            int intra_j = IS_INTRA(s->cur_pic.mb_type[mb_index_j]);
+            if(intra_j==0 || !(error_j&ER_DC_ERROR)){
+                color= dc[b_x + b_y*stride];
+                distance= b_y;
+            }
+            col [b_x + b_y*stride][2]= color;
+            dist[b_x + b_y*stride][2]= distance >= 0 ? distance-b_y : 9999;
+        }
+    }
 
     for (b_y = 0; b_y < h; b_y++) {
         for (b_x = 0; b_x < w; b_x++) {
-            int color[4]    = { 1024, 1024, 1024, 1024 };
-            int distance[4] = { 9999, 9999, 9999, 9999 };
             int mb_index, error, j;
             int64_t guess, weight_sum;
             mb_index = (b_x >> is_luma) + (b_y >> is_luma) * s->mb_stride;
@@ -152,66 +217,21 @@ static void guess_dc(ERContext *s, int16_t *dc, int w,
             if (!(error & ER_DC_ERROR))
                 continue; // dc-ok
 
-            /* right block */
-            for (j = b_x + 1; j < w; j++) {
-                int mb_index_j = (j >> is_luma) + (b_y >> is_luma) * s->mb_stride;
-                int error_j    = s->error_status_table[mb_index_j];
-                int intra_j    = IS_INTRA(s->cur_pic.mb_type[mb_index_j]);
-                if (intra_j == 0 || !(error_j & ER_DC_ERROR)) {
-                    color[0]    = dc[j + b_y * stride];
-                    distance[0] = j - b_x;
-                    break;
-                }
-            }
-
-            /* left block */
-            for (j = b_x - 1; j >= 0; j--) {
-                int mb_index_j = (j >> is_luma) + (b_y >> is_luma) * s->mb_stride;
-                int error_j    = s->error_status_table[mb_index_j];
-                int intra_j    = IS_INTRA(s->cur_pic.mb_type[mb_index_j]);
-                if (intra_j == 0 || !(error_j & ER_DC_ERROR)) {
-                    color[1]    = dc[j + b_y * stride];
-                    distance[1] = b_x - j;
-                    break;
-                }
-            }
-
-            /* bottom block */
-            for (j = b_y + 1; j < h; j++) {
-                int mb_index_j = (b_x >> is_luma) + (j >> is_luma) * s->mb_stride;
-                int error_j    = s->error_status_table[mb_index_j];
-                int intra_j    = IS_INTRA(s->cur_pic.mb_type[mb_index_j]);
-
-                if (intra_j == 0 || !(error_j & ER_DC_ERROR)) {
-                    color[2]    = dc[b_x + j * stride];
-                    distance[2] = j - b_y;
-                    break;
-                }
-            }
-
-            /* top block */
-            for (j = b_y - 1; j >= 0; j--) {
-                int mb_index_j = (b_x >> is_luma) + (j >> is_luma) * s->mb_stride;
-                int error_j    = s->error_status_table[mb_index_j];
-                int intra_j    = IS_INTRA(s->cur_pic.mb_type[mb_index_j]);
-                if (intra_j == 0 || !(error_j & ER_DC_ERROR)) {
-                    color[3]    = dc[b_x + j * stride];
-                    distance[3] = b_y - j;
-                    break;
-                }
-            }
-
             weight_sum = 0;
             guess      = 0;
             for (j = 0; j < 4; j++) {
-                int64_t weight  = 256 * 256 * 256 * 16 / distance[j];
-                guess          += weight * (int64_t) color[j];
+                int64_t weight  = 256 * 256 * 256 * 16 / FFMAX(dist[b_x + b_y*stride][j], 1);
+                guess          += weight*(int64_t)col[b_x + b_y*stride][j];
                 weight_sum     += weight;
             }
             guess = (guess + weight_sum / 2) / weight_sum;
             dc[b_x + b_y * stride] = guess;
         }
     }
+
+fail:
+    av_freep(&col);
+    av_freep(&dist);
 }
 
 /**
@@ -361,14 +381,21 @@ static void guess_mv(ERContext *s)
 #define MV_UNCHANGED 1
     const int mb_stride = s->mb_stride;
     const int mb_width  = s->mb_width;
-    const int mb_height = s->mb_height;
+    int mb_height = s->mb_height;
     int i, depth, num_avail;
     int mb_x, mb_y, mot_step, mot_stride;
 
+    if (s->last_pic.f && s->last_pic.f->data[0])
+        mb_height = FFMIN(mb_height, (s->last_pic.f->height+15)>>4);
+    if (s->next_pic.f && s->next_pic.f->data[0])
+        mb_height = FFMIN(mb_height, (s->next_pic.f->height+15)>>4);
+
     set_mv_strides(s, &mot_step, &mot_stride);
 
     num_avail = 0;
-    for (i = 0; i < s->mb_num; i++) {
+    if (s->last_pic.motion_val[0])
+        ff_thread_await_progress(s->last_pic.tf, mb_height-1, 0);
+    for (i = 0; i < mb_width * mb_height; i++) {
         const int mb_xy = s->mb_index2xy[i];
         int f = 0;
         int error = s->error_status_table[mb_xy];
@@ -381,11 +408,19 @@ static void guess_mv(ERContext *s)
         fixed[mb_xy] = f;
         if (f == MV_FROZEN)
             num_avail++;
+        else if(s->last_pic.f && s->last_pic.f->data[0] && s->last_pic.motion_val[0]){ /* f may be NULL: check it before reading data[0] */
+            const int mb_y= mb_xy / s->mb_stride;
+            const int mb_x= mb_xy % s->mb_stride;
+            const int mot_index= (mb_x + mb_y*mot_stride) * mot_step;
+            s->cur_pic.motion_val[0][mot_index][0]= s->last_pic.motion_val[0][mot_index][0]; /* seed this MB with last_pic's MV at the same index */
+            s->cur_pic.motion_val[0][mot_index][1]= s->last_pic.motion_val[0][mot_index][1];
+            s->cur_pic.ref_index[0][4*mb_xy]      = s->last_pic.ref_index[0][4*mb_xy];
+        }
     }
 
     if ((!(s->avctx->error_concealment&FF_EC_GUESS_MVS)) ||
         num_avail <= mb_width / 2) {
-        for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
+        for (mb_y = 0; mb_y < mb_height; mb_y++) {
             for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
                 const int mb_xy = mb_x + mb_y * s->mb_stride;
                 int mv_dir = (s->last_pic.f && s->last_pic.f->data[0]) ? MV_DIR_FORWARD : MV_DIR_BACKWARD;
@@ -414,7 +449,7 @@ static void guess_mv(ERContext *s)
             int score_sum = 0;
 
             changed = 0;
-            for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
+            for (mb_y = 0; mb_y < mb_height; mb_y++) {
                 for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
                     const int mb_xy        = mb_x + mb_y * s->mb_stride;
                     int mv_predictor[8][2] = { { 0 } };
@@ -431,6 +466,8 @@ static void guess_mv(ERContext *s)
 
                     if (fixed[mb_xy] == MV_FROZEN)
                         continue;
+                    av_assert1(!IS_INTRA(s->cur_pic.mb_type[mb_xy]));
+                    av_assert1(s->last_pic.f && s->last_pic.f->data[0]);
 
                     j = 0;
                     if (mb_x > 0             && fixed[mb_xy - 1]         == MV_FROZEN)
@@ -545,24 +582,9 @@ skip_mean_and_median:
                     /* zero MV */
                     pred_count++;
 
-                    if (!fixed[mb_xy]) {
-                        if (s->avctx->codec_id == AV_CODEC_ID_H264) {
-                            // FIXME
-                        } else {
-                            ff_thread_await_progress(s->last_pic.tf,
-                                                     mb_y, 0);
-                        }
-                        if (!s->last_pic.motion_val[0] ||
-                            !s->last_pic.ref_index[0])
-                            goto skip_last_mv;
-                        prev_x   = s->last_pic.motion_val[0][mot_index][0];
-                        prev_y   = s->last_pic.motion_val[0][mot_index][1];
-                        prev_ref = s->last_pic.ref_index[0][4 * mb_xy];
-                    } else {
-                        prev_x   = s->cur_pic.motion_val[0][mot_index][0];
-                        prev_y   = s->cur_pic.motion_val[0][mot_index][1];
-                        prev_ref = s->cur_pic.ref_index[0][4 * mb_xy];
-                    }
+                    prev_x   = s->cur_pic.motion_val[0][mot_index][0];
+                    prev_y   = s->cur_pic.motion_val[0][mot_index][1];
+                    prev_ref = s->cur_pic.ref_index[0][4 * mb_xy];
 
                     /* last MV */
                     mv_predictor[pred_count][0] = prev_x;
@@ -645,7 +667,7 @@ skip_last_mv:
         if (none_left)
             return;
 
-        for (i = 0; i < s->mb_num; i++) {
+        for (i = 0; i < mb_width * mb_height; i++) {
             int mb_xy = s->mb_index2xy[i];
             if (fixed[mb_xy])
                 fixed[mb_xy] = MV_FROZEN;
@@ -660,6 +682,9 @@ static int is_intra_more_likely(ERContext *s)
     if (!s->last_pic.f || !s->last_pic.f->data[0])
         return 1; // no previous frame available -> use spatial prediction
 
+    if (s->avctx->error_concealment & FF_EC_FAVOR_INTER)
+        return 0;
+
     undamaged_count = 0;
     for (i = 0; i < s->mb_num; i++) {
         const int mb_xy = s->mb_index2xy[i];
@@ -668,21 +693,14 @@ static int is_intra_more_likely(ERContext *s)
             undamaged_count++;
     }
 
-    if (s->avctx->codec_id == AV_CODEC_ID_H264 && s->ref_count <= 0)
-        return 1;
-
     if (undamaged_count < 5)
         return 0; // almost all MBs damaged -> use temporal prediction
 
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
     // prevent dsp.sad() check, that requires access to the image
-    if (CONFIG_MPEG_XVMC_DECODER    &&
-        s->avctx->xvmc_acceleration &&
+    if (CONFIG_XVMC    &&
+        s->avctx->hwaccel && s->avctx->hwaccel->decode_mb &&
         s->cur_pic.f->pict_type == AV_PICTURE_TYPE_I)
         return 1;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
 
     skip_amount     = FFMAX(undamaged_count / 50, 1); // check only up to 50 MBs
     is_intra_likely = 0;
@@ -716,6 +734,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
                 }
                 is_intra_likely += s->mecc.sad[0](NULL, last_mb_ptr, mb_ptr,
                                                   linesize[0], 16);
+                // FIXME need await_progress() here
                 is_intra_likely -= s->mecc.sad[0](NULL, last_mb_ptr,
                                                   last_mb_ptr + linesize[0] * 16,
                                                   linesize[0], 16);
@@ -727,6 +746,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
             }
         }
     }
+//      av_log(NULL, AV_LOG_ERROR, "is_intra_likely: %d type:%d\n", is_intra_likely, s->pict_type);
     return is_intra_likely > 0;
 }
 
@@ -746,6 +766,19 @@ void ff_er_frame_start(ERContext *s)
     s->error_occurred = 0;
 }
 
+/* Return 0 for slice-level hwaccel, VDPAU-capable codecs, a missing current frame or field pictures; 1 otherwise. */
+static int er_supported(ERContext *s)
+{
+    if (s->avctx->hwaccel && s->avctx->hwaccel->decode_slice)
+        return 0;
+#if FF_API_CAP_VDPAU
+    if (s->avctx->codec->capabilities & AV_CODEC_CAP_HWACCEL_VDPAU)
+        return 0;
+#endif
+    /* same left-to-right evaluation order as a single ||-chain */
+    return s->cur_pic.f && !s->cur_pic.field_picture;
+}
+
 /**
  * Add a slice.
  * @param endx   x component of the last macroblock, can be -1
@@ -762,7 +795,7 @@ void ff_er_add_slice(ERContext *s, int startx, int starty,
     const int end_xy   = s->mb_index2xy[end_i];
     int mask           = -1;
 
-    if (s->avctx->hwaccel)
+    if (s->avctx->hwaccel && s->avctx->hwaccel->decode_slice)
         return;
 
     if (start_i > end_i || start_xy > end_xy) {
@@ -777,20 +810,20 @@ void ff_er_add_slice(ERContext *s, int startx, int starty,
     mask &= ~VP_START;
     if (status & (ER_AC_ERROR | ER_AC_END)) {
         mask           &= ~(ER_AC_ERROR | ER_AC_END);
-        s->error_count -= end_i - start_i + 1;
+        avpriv_atomic_int_add_and_fetch(&s->error_count, start_i - end_i - 1);
     }
     if (status & (ER_DC_ERROR | ER_DC_END)) {
         mask           &= ~(ER_DC_ERROR | ER_DC_END);
-        s->error_count -= end_i - start_i + 1;
+        avpriv_atomic_int_add_and_fetch(&s->error_count, start_i - end_i - 1);
     }
     if (status & (ER_MV_ERROR | ER_MV_END)) {
         mask           &= ~(ER_MV_ERROR | ER_MV_END);
-        s->error_count -= end_i - start_i + 1;
+        avpriv_atomic_int_add_and_fetch(&s->error_count, start_i - end_i - 1);
     }
 
     if (status & ER_MB_ERROR) {
         s->error_occurred = 1;
-        s->error_count    = INT_MAX;
+        avpriv_atomic_int_set(&s->error_count, INT_MAX);
     }
 
     if (mask == ~0x7F) {
@@ -803,7 +836,7 @@ void ff_er_add_slice(ERContext *s, int startx, int starty,
     }
 
     if (end_i == s->mb_num)
-        s->error_count = INT_MAX;
+        avpriv_atomic_int_set(&s->error_count, INT_MAX);
     else {
         s->error_status_table[end_xy] &= mask;
         s->error_status_table[end_xy] |= status;
@@ -811,41 +844,92 @@ void ff_er_add_slice(ERContext *s, int startx, int starty,
 
     s->error_status_table[start_xy] |= VP_START;
 
-    if (start_xy > 0 && s->avctx->thread_count <= 1 &&
-        s->avctx->skip_top * s->mb_width < start_i) {
+    if (start_xy > 0 && !(s->avctx->active_thread_type & FF_THREAD_SLICE) &&
+        er_supported(s) && s->avctx->skip_top * s->mb_width < start_i) {
         int prev_status = s->error_status_table[s->mb_index2xy[start_i - 1]];
 
         prev_status &= ~ VP_START;
-        if (prev_status != (ER_MV_END | ER_DC_END | ER_AC_END))
-            s->error_count = INT_MAX;
+        if (prev_status != (ER_MV_END | ER_DC_END | ER_AC_END)) {
+            s->error_occurred = 1;
+            avpriv_atomic_int_set(&s->error_count, INT_MAX);
+        }
     }
 }
 
 void ff_er_frame_end(ERContext *s)
 {
-    int *linesize = s->cur_pic.f->linesize;
+    int *linesize = NULL;
     int i, mb_x, mb_y, error, error_type, dc_error, mv_error, ac_error;
     int distance;
     int threshold_part[4] = { 100, 100, 100 };
     int threshold = 50;
     int is_intra_likely;
+    int size = s->b8_stride * 2 * s->mb_height;
 
     /* We do not support ER of field pictures yet,
      * though it should not crash if enabled. */
     if (!s->avctx->error_concealment || s->error_count == 0            ||
-        s->avctx->hwaccel                                              ||
-        !s->cur_pic.f                                                  ||
-        s->cur_pic.field_picture                                       ||
+        s->avctx->lowres                                               ||
+        !er_supported(s)                                               ||
         s->error_count == 3 * s->mb_width *
                           (s->avctx->skip_top + s->avctx->skip_bottom)) {
         return;
-    };
+    }
+    linesize = s->cur_pic.f->linesize;
+    for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
+        int status = s->error_status_table[mb_x + (s->mb_height - 1) * s->mb_stride];
+        if (status != 0x7F)
+            break;
+    }
 
-    if (!s->cur_pic.motion_val[0] || !s->cur_pic.ref_index[0]) {
-        av_log(s->avctx, AV_LOG_ERROR, "MVs not available, ER not possible.\n");
+    if (   mb_x == s->mb_width
+        && s->avctx->codec_id == AV_CODEC_ID_MPEG2VIDEO
+        && (FFALIGN(s->avctx->height, 16)&16)
+        && s->error_count == 3 * s->mb_width * (s->avctx->skip_top + s->avctx->skip_bottom + 1)
+    ) {
+        av_log(s->avctx, AV_LOG_DEBUG, "ignoring last missing slice\n");
         return;
     }
 
+    if (s->last_pic.f) {
+        if (s->last_pic.f->width  != s->cur_pic.f->width  ||
+            s->last_pic.f->height != s->cur_pic.f->height ||
+            s->last_pic.f->format != s->cur_pic.f->format) {
+            av_log(s->avctx, AV_LOG_WARNING, "Cannot use previous picture in error concealment\n");
+            memset(&s->last_pic, 0, sizeof(s->last_pic));
+        }
+    }
+    if (s->next_pic.f) {
+        if (s->next_pic.f->width  != s->cur_pic.f->width  ||
+            s->next_pic.f->height != s->cur_pic.f->height ||
+            s->next_pic.f->format != s->cur_pic.f->format) {
+            av_log(s->avctx, AV_LOG_WARNING, "Cannot use next picture in error concealment\n");
+            memset(&s->next_pic, 0, sizeof(s->next_pic));
+        }
+    }
+
+    if (!s->cur_pic.motion_val[0] || !s->cur_pic.ref_index[0]) {
+        av_log(s->avctx, AV_LOG_ERROR, "Warning MVs not available\n");
+        /* Substitute zero-filled ref_index/motion_val tables held in the ER context's own buffers. */
+        for (i = 0; i < 2; i++) {
+            s->ref_index_buf[i]  = av_buffer_allocz(s->mb_stride * s->mb_height * 4 * sizeof(uint8_t));
+            s->motion_val_buf[i] = av_buffer_allocz((size + 4) * 2 * sizeof(uint16_t));
+            if (!s->ref_index_buf[i] || !s->motion_val_buf[i])
+                break; /* leaves i < 2 so the cleanup below runs */
+            s->cur_pic.ref_index[i]  = s->ref_index_buf[i]->data;
+            s->cur_pic.motion_val[i] = (int16_t (*)[2])s->motion_val_buf[i]->data + 4; /* exposed table starts 4 entries into the buffer */
+        }
+        if (i < 2) { /* an allocation failed: release both lists and give up on concealment */
+            for (i = 0; i < 2; i++) {
+                av_buffer_unref(&s->ref_index_buf[i]);
+                av_buffer_unref(&s->motion_val_buf[i]);
+                s->cur_pic.ref_index[i]  = NULL;
+                s->cur_pic.motion_val[i] = NULL;
+            }
+            return;
+        }
+    }
+
     if (s->avctx->debug & FF_DEBUG_ER) {
         for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
             for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
@@ -857,6 +941,7 @@ void ff_er_frame_end(ERContext *s)
         }
     }
 
+#if 1
     /* handle overlapping slices */
     for (error_type = 1; error_type <= 3; error_type++) {
         int end_ok = 0;
@@ -877,7 +962,8 @@ void ff_er_frame_end(ERContext *s)
                 end_ok = 0;
         }
     }
-
+#endif
+#if 1
     /* handle slices with partitions of different length */
     if (s->partitioned_frame) {
         int end_ok = 0;
@@ -900,7 +986,7 @@ void ff_er_frame_end(ERContext *s)
                 end_ok = 0;
         }
     }
-
+#endif
     /* handle missing slices */
     if (s->avctx->err_recognition & AV_EF_EXPLODE) {
         int end_ok = 1;
@@ -927,6 +1013,7 @@ void ff_er_frame_end(ERContext *s)
         }
     }
 
+#if 1
     /* backward mark errors */
     distance = 9999999;
     for (error_type = 1; error_type <= 3; error_type++) {
@@ -934,7 +1021,7 @@ void ff_er_frame_end(ERContext *s)
             const int mb_xy = s->mb_index2xy[i];
             int       error = s->error_status_table[mb_xy];
 
-            if (s->mbskip_table && !s->mbskip_table[mb_xy]) // FIXME partition specific
+            if (!s->mbskip_table || !s->mbskip_table[mb_xy]) // FIXME partition specific
                 distance++;
             if (error & (1 << error_type))
                 distance = 0;
@@ -951,6 +1038,7 @@ void ff_er_frame_end(ERContext *s)
                 distance = 9999999;
         }
     }
+#endif
 
     /* forward mark errors */
     error = 0;
@@ -965,22 +1053,23 @@ void ff_er_frame_end(ERContext *s)
             s->error_status_table[mb_xy] |= error;
         }
     }
-
+#if 1
     /* handle not partitioned case */
     if (!s->partitioned_frame) {
         for (i = 0; i < s->mb_num; i++) {
             const int mb_xy = s->mb_index2xy[i];
-            error = s->error_status_table[mb_xy];
+            int error = s->error_status_table[mb_xy];
             if (error & ER_MB_ERROR)
                 error |= ER_MB_ERROR;
             s->error_status_table[mb_xy] = error;
         }
     }
+#endif
 
     dc_error = ac_error = mv_error = 0;
     for (i = 0; i < s->mb_num; i++) {
         const int mb_xy = s->mb_index2xy[i];
-        error = s->error_status_table[mb_xy];
+        int error = s->error_status_table[mb_xy];
         if (error & ER_DC_ERROR)
             dc_error++;
         if (error & ER_AC_ERROR)
@@ -988,15 +1077,15 @@ void ff_er_frame_end(ERContext *s)
         if (error & ER_MV_ERROR)
             mv_error++;
     }
-    av_log(s->avctx, AV_LOG_INFO, "concealing %d DC, %d AC, %d MV errors\n",
-           dc_error, ac_error, mv_error);
+    av_log(s->avctx, AV_LOG_INFO, "concealing %d DC, %d AC, %d MV errors in %c frame\n",
+           dc_error, ac_error, mv_error, av_get_picture_type_char(s->cur_pic.f->pict_type));
 
     is_intra_likely = is_intra_more_likely(s);
 
     /* set unknown mb-type to most likely */
     for (i = 0; i < s->mb_num; i++) {
         const int mb_xy = s->mb_index2xy[i];
-        error = s->error_status_table[mb_xy];
+        int error = s->error_status_table[mb_xy];
         if (!((error & ER_DC_ERROR) && (error & ER_MV_ERROR)))
             continue;
 
@@ -1024,7 +1113,7 @@ void ff_er_frame_end(ERContext *s)
             const int mv_dir  = dir ? MV_DIR_BACKWARD : MV_DIR_FORWARD;
             int mv_type;
 
-            error = s->error_status_table[mb_xy];
+            int error = s->error_status_table[mb_xy];
 
             if (IS_INTRA(mb_type))
                 continue; // intra
@@ -1061,7 +1150,7 @@ void ff_er_frame_end(ERContext *s)
                 const int mb_type = s->cur_pic.mb_type[mb_xy];
                 int mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD;
 
-                error = s->error_status_table[mb_xy];
+                int error = s->error_status_table[mb_xy];
 
                 if (IS_INTRA(mb_type))
                     continue;
@@ -1079,6 +1168,7 @@ void ff_er_frame_end(ERContext *s)
                     int time_pp = s->pp_time;
                     int time_pb = s->pb_time;
 
+                    av_assert0(s->avctx->codec_id != AV_CODEC_ID_H264);
                     ff_thread_await_progress(s->next_pic.tf, mb_y, 0);
 
                     s->mv[0][0][0] = s->next_pic.motion_val[0][xy][0] *  time_pb            / time_pp;
@@ -1099,13 +1189,9 @@ void ff_er_frame_end(ERContext *s)
     } else
         guess_mv(s);
 
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
-    /* the filters below are not XvMC compatible, skip them */
-    if (CONFIG_MPEG_XVMC_DECODER && s->avctx->xvmc_acceleration)
+    /* the filters below manipulate the raw image; skip them when the hwaccel supplies decode_mb */
+    if (CONFIG_XVMC && s->avctx->hwaccel && s->avctx->hwaccel->decode_mb)
         goto ec_clean;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
     /* fill DC for inter blocks */
     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
         for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
@@ -1115,7 +1201,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
             const int mb_xy   = mb_x + mb_y * s->mb_stride;
             const int mb_type = s->cur_pic.mb_type[mb_xy];
 
-            error = s->error_status_table[mb_xy];
+            // error = s->error_status_table[mb_xy];
 
             if (IS_INTRA(mb_type) && s->partitioned_frame)
                 continue;
@@ -1138,6 +1224,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
                 dc_ptr[(n & 1) + (n >> 1) * s->b8_stride] = (dc + 4) >> 3;
             }
 
+            if (!s->cur_pic.f->data[2])
+                continue;
+
             dcu = dcv = 0;
             for (y = 0; y < 8; y++) {
                 int x;
@@ -1150,15 +1239,17 @@ FF_ENABLE_DEPRECATION_WARNINGS
             s->dc_val[2][mb_x + mb_y * s->mb_stride] = (dcv + 4) >> 3;
         }
     }
-
+#if 1
     /* guess DC for damaged blocks */
-    guess_dc(s, s->dc_val[0], s->mb_width * 2, s->mb_height * 2, s->b8_stride, 1);
-    guess_dc(s, s->dc_val[1], s->mb_width, s->mb_height, s->mb_stride, 0);
-    guess_dc(s, s->dc_val[2], s->mb_width, s->mb_height, s->mb_stride, 0);
+    guess_dc(s, s->dc_val[0], s->mb_width*2, s->mb_height*2, s->b8_stride, 1);
+    guess_dc(s, s->dc_val[1], s->mb_width  , s->mb_height  , s->mb_stride, 0);
+    guess_dc(s, s->dc_val[2], s->mb_width  , s->mb_height  , s->mb_stride, 0);
+#endif
 
     /* filter luma DC */
     filter181(s->dc_val[0], s->mb_width * 2, s->mb_height * 2, s->b8_stride);
 
+#if 1
     /* render DC only intra */
     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
         for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
@@ -1166,7 +1257,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
             const int mb_xy   = mb_x + mb_y * s->mb_stride;
             const int mb_type = s->cur_pic.mb_type[mb_xy];
 
-            error = s->error_status_table[mb_xy];
+            int error = s->error_status_table[mb_xy];
 
             if (IS_INTER(mb_type))
                 continue;
@@ -1176,27 +1267,33 @@ FF_ENABLE_DEPRECATION_WARNINGS
             dest_y  = s->cur_pic.f->data[0] + mb_x * 16 + mb_y * 16 * linesize[0];
             dest_cb = s->cur_pic.f->data[1] + mb_x *  8 + mb_y *  8 * linesize[1];
             dest_cr = s->cur_pic.f->data[2] + mb_x *  8 + mb_y *  8 * linesize[2];
+            if (!s->cur_pic.f->data[2])
+                dest_cb = dest_cr = NULL;
 
             put_dc(s, dest_y, dest_cb, dest_cr, mb_x, mb_y);
         }
     }
+#endif
 
     if (s->avctx->error_concealment & FF_EC_DEBLOCK) {
         /* filter horizontal block boundaries */
         h_block_filter(s, s->cur_pic.f->data[0], s->mb_width * 2,
                        s->mb_height * 2, linesize[0], 1);
-        h_block_filter(s, s->cur_pic.f->data[1], s->mb_width,
-                       s->mb_height, linesize[1], 0);
-        h_block_filter(s, s->cur_pic.f->data[2], s->mb_width,
-                       s->mb_height, linesize[2], 0);
 
         /* filter vertical block boundaries */
         v_block_filter(s, s->cur_pic.f->data[0], s->mb_width * 2,
                        s->mb_height * 2, linesize[0], 1);
-        v_block_filter(s, s->cur_pic.f->data[1], s->mb_width,
-                       s->mb_height, linesize[1], 0);
-        v_block_filter(s, s->cur_pic.f->data[2], s->mb_width,
-                       s->mb_height, linesize[2], 0);
+
+        if (s->cur_pic.f->data[2]) {
+            h_block_filter(s, s->cur_pic.f->data[1], s->mb_width,
+                        s->mb_height, linesize[1], 0);
+            h_block_filter(s, s->cur_pic.f->data[2], s->mb_width,
+                        s->mb_height, linesize[2], 0);
+            v_block_filter(s, s->cur_pic.f->data[1], s->mb_width,
+                        s->mb_height, linesize[1], 0);
+            v_block_filter(s, s->cur_pic.f->data[2], s->mb_width,
+                        s->mb_height, linesize[2], 0);
+        }
     }
 
 ec_clean:
@@ -1213,6 +1310,13 @@ ec_clean:
             s->mbintra_table[mb_xy] = 1;
     }
 
+    for (i = 0; i < 2; i++) {
+        av_buffer_unref(&s->ref_index_buf[i]);
+        av_buffer_unref(&s->motion_val_buf[i]);
+        s->cur_pic.ref_index[i]  = NULL;
+        s->cur_pic.motion_val[i] = NULL;
+    }
+
     memset(&s->cur_pic, 0, sizeof(ERPicture));
     memset(&s->last_pic, 0, sizeof(ERPicture));
     memset(&s->next_pic, 0, sizeof(ERPicture));
diff --git a/libavcodec/error_resilience.h b/libavcodec/error_resilience.h
index 3139880..d444ec3 100644
--- a/libavcodec/error_resilience.h
+++ b/libavcodec/error_resilience.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -60,7 +60,8 @@ typedef struct ERContext {
     int mb_stride;
     int b8_stride;
 
-    int error_count, error_occurred;
+    volatile int error_count;
+    int error_occurred;
     uint8_t *error_status_table;
     uint8_t *er_temp_buffer;
     int16_t *dc_val[3];
@@ -72,6 +73,9 @@ typedef struct ERContext {
     ERPicture last_pic;
     ERPicture next_pic;
 
+    AVBufferRef *ref_index_buf[2];
+    AVBufferRef *motion_val_buf[2];
+
     uint16_t pp_time;
     uint16_t pb_time;
     int quarter_sample;
diff --git a/libavcodec/escape124.c b/libavcodec/escape124.c
index 629ba5e..9a51bda 100644
--- a/libavcodec/escape124.c
+++ b/libavcodec/escape124.c
@@ -2,20 +2,20 @@
  * Escape 124 Video Decoder
  * Copyright (C) 2008 Eli Friedman (eli.friedman@gmail.com)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,10 +49,6 @@ typedef struct Escape124Context {
     CodeBook codebooks[3];
 } Escape124Context;
 
-static int can_safely_read(GetBitContext* gb, int bits) {
-    return get_bits_left(gb) >= bits;
-}
-
 /**
  * Initialize the decoder
  * @param avctx decoder context
@@ -80,7 +76,7 @@ static av_cold int escape124_decode_close(AVCodecContext *avctx)
     Escape124Context *s = avctx->priv_data;
 
     for (i = 0; i < 3; i++)
-        av_free(s->codebooks[i].blocks);
+        av_freep(&s->codebooks[i].blocks);
 
     av_frame_free(&s->frame);
 
@@ -93,7 +89,7 @@ static CodeBook unpack_codebook(GetBitContext* gb, unsigned depth,
     unsigned i, j;
     CodeBook cb = { 0 };
 
-    if (!can_safely_read(gb, size * 34))
+    if (size >= INT_MAX / 34 || get_bits_left(gb) < size * 34)
         return cb;
 
     if (size >= INT_MAX / sizeof(MacroBlock))
@@ -124,7 +120,7 @@ static unsigned decode_skip_count(GetBitContext* gb)
     unsigned value;
     // This function reads a maximum of 23 bits,
     // which is within the padding space
-    if (!can_safely_read(gb, 1))
+    if (get_bits_left(gb) < 1)
         return -1;
     value = get_bits1(gb);
     if (!value)
@@ -149,7 +145,7 @@ static MacroBlock decode_macroblock(Escape124Context* s, GetBitContext* gb,
     unsigned block_index, depth;
     int value = get_bits1(gb);
     if (value) {
-        static const char transitions[3][2] = { {2, 1}, {0, 2}, {1, 0} };
+        static const int8_t transitions[3][2] = { {2, 1}, {0, 2}, {1, 0} };
         value = get_bits1(gb);
         *codebook_index = transitions[*codebook_index][value];
     }
@@ -204,7 +200,6 @@ static int escape124_decode_frame(AVCodecContext *avctx,
                                   void *data, int *got_frame,
                                   AVPacket *avpkt)
 {
-    const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     Escape124Context *s = avctx->priv_data;
     AVFrame *frame = data;
@@ -219,13 +214,15 @@ static int escape124_decode_frame(AVCodecContext *avctx,
 
     uint16_t* old_frame_data, *new_frame_data;
     unsigned old_stride, new_stride;
+
     int ret;
 
-    init_get_bits(&gb, buf, buf_size * 8);
+    if ((ret = init_get_bits8(&gb, avpkt->data, avpkt->size)) < 0)
+        return ret;
 
     // This call also guards the potential depth reads for the
     // codebook unpacking.
-    if (!can_safely_read(&gb, 64))
+    if (get_bits_left(&gb) < 64)
         return -1;
 
     frame_flags = get_bits_long(&gb, 32);
@@ -237,7 +234,7 @@ static int escape124_decode_frame(AVCodecContext *avctx,
         if (!s->frame->data[0])
             return AVERROR_INVALIDDATA;
 
-        av_log(NULL, AV_LOG_DEBUG, "Skipping frame\n");
+        av_log(avctx, AV_LOG_DEBUG, "Skipping frame\n");
 
         *got_frame = 1;
         if ((ret = av_frame_ref(frame, s->frame)) < 0)
@@ -267,17 +264,15 @@ static int escape124_decode_frame(AVCodecContext *avctx,
                     cb_size = s->num_superblocks << cb_depth;
                 }
             }
-            av_free(s->codebooks[i].blocks);
+            av_freep(&s->codebooks[i].blocks);
             s->codebooks[i] = unpack_codebook(&gb, cb_depth, cb_size);
             if (!s->codebooks[i].blocks)
                 return -1;
         }
     }
 
-    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
         return ret;
-    }
 
     new_frame_data = (uint16_t*)frame->data[0];
     new_stride = frame->linesize[0] / 2;
@@ -303,7 +298,7 @@ static int escape124_decode_frame(AVCodecContext *avctx,
             copy_superblock(sb.pixels, 8,
                             old_frame_data, old_stride);
 
-            while (can_safely_read(&gb, 1) && !get_bits1(&gb)) {
+            while (get_bits_left(&gb) >= 1 && !get_bits1(&gb)) {
                 unsigned mask;
                 mb = decode_macroblock(s, &gb, &cb_index, superblock_index);
                 mask = get_bits(&gb, 16);
@@ -315,7 +310,7 @@ static int escape124_decode_frame(AVCodecContext *avctx,
                 }
             }
 
-            if (can_safely_read(&gb, 1) && !get_bits1(&gb)) {
+            if (!get_bits1(&gb)) {
                 unsigned inv_mask = get_bits(&gb, 4);
                 for (i = 0; i < 4; i++) {
                     if (inv_mask & (1 << i)) {
@@ -327,15 +322,13 @@ static int escape124_decode_frame(AVCodecContext *avctx,
 
                 for (i = 0; i < 16; i++) {
                     if (multi_mask & mask_matrix[i]) {
-                        if (!can_safely_read(&gb, 1))
-                            break;
                         mb = decode_macroblock(s, &gb, &cb_index,
                                                superblock_index);
                         insert_mb_into_sb(&sb, mb, i);
                     }
                 }
             } else if (frame_flags & (1 << 16)) {
-                while (can_safely_read(&gb, 1) && !get_bits1(&gb)) {
+                while (get_bits_left(&gb) >= 1 && !get_bits1(&gb)) {
                     mb = decode_macroblock(s, &gb, &cb_index, superblock_index);
                     insert_mb_into_sb(&sb, mb, get_bits(&gb, 4));
                 }
@@ -357,7 +350,7 @@ static int escape124_decode_frame(AVCodecContext *avctx,
         skip--;
     }
 
-    av_log(NULL, AV_LOG_DEBUG,
+    av_log(avctx, AV_LOG_DEBUG,
            "Escape sizes: %i, %i, %i\n",
            frame_size, buf_size, get_bits_count(&gb) / 8);
 
diff --git a/libavcodec/escape130.c b/libavcodec/escape130.c
index e69e42e..f4f64d8 100644
--- a/libavcodec/escape130.c
+++ b/libavcodec/escape130.c
@@ -2,20 +2,20 @@
  * Escape 130 video decoder
  * Copyright (C) 2008 Eli Friedman (eli.friedman <at> gmail.com)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -166,6 +166,9 @@ static int decode_skip_count(GetBitContext* gb)
 {
     int value;
 
+    if (get_bits_left(gb) < 1+3)
+        return -1;
+
     value = get_bits1(gb);
     if (value)
         return 0;
@@ -188,7 +191,6 @@ static int decode_skip_count(GetBitContext* gb)
 static int escape130_decode_frame(AVCodecContext *avctx, void *data,
                                   int *got_frame, AVPacket *avpkt)
 {
-    const uint8_t *buf  = avpkt->data;
     int buf_size        = avpkt->size;
     Escape130Context *s = avctx->priv_data;
     AVFrame *pic        = data;
@@ -215,7 +217,9 @@ static int escape130_decode_frame(AVCodecContext *avctx, void *data,
     if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
         return ret;
 
-    init_get_bits(&gb, buf + 16, (buf_size - 16) * 8);
+    if ((ret = init_get_bits8(&gb, avpkt->data, avpkt->size)) < 0)
+        return ret;
+    skip_bits_long(&gb, 16 * 8);
 
     new_y  = s->new_y;
     new_cb = s->new_u;
diff --git a/libavcodec/evrcdata.h b/libavcodec/evrcdata.h
new file mode 100644
index 0000000..8cfc202
--- /dev/null
+++ b/libavcodec/evrcdata.h
@@ -0,0 +1,1499 @@
+/*
+ * Enhanced Variable Rate Codec, Service Option 3 decoder
+ * Copyright (c) 2013 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_EVRCDATA_H
+#define AVCODEC_EVRCDATA_H
+
+/**
+ * @file
+ * Data tables for the EVRC decoder
+ * @author Paul B Mahol
+ */
+
+#include "libavutil/common.h"
+
+/**
+ * Rate 1/8 frame energy quantization codebook:
+ * 256 entries, each holding three float values.
+ * Source: TIA/IS-127 table 8-18
+ */
+static const float evrc_energy_quant[][3] = {
+{-0.2464E-01,-0.4005E-02,-0.1107E+00 }, { 0.8734E+00, 0.1004E+01, 0.9930E+00 },
+{ 0.4222E+00, 0.3894E+00, 0.5020E+00 }, { 0.1450E+01, 0.1328E+01, 0.1278E+01 },
+{ 0.1957E+00, 0.2169E+00, 0.2735E+00 }, { 0.1142E+01, 0.1240E+01, 0.1157E+01 },
+{ 0.7881E+00, 0.6778E+00, 0.4185E+00 }, { 0.1504E+01, 0.1468E+01, 0.1534E+01 },
+{ 0.3173E+00, 0.2693E+00,-0.9526E-01 }, { 0.1141E+01, 0.1154E+01, 0.1044E+01 },
+{ 0.5147E+00, 0.5784E+00, 0.8802E+00 }, { 0.1502E+01, 0.1407E+01, 0.1409E+01 },
+{ 0.3163E+00, 0.3592E+00, 0.2830E+00 }, { 0.1217E+01, 0.1213E+01, 0.1216E+01 },
+{ 0.1023E+01, 0.1139E+01,-0.9526E-01 }, { 0.1619E+01, 0.1655E+01, 0.1642E+01 },
+{ 0.1437E+00, 0.1505E+00, 0.6838E-01 }, { 0.9794E+00, 0.1021E+01, 0.1117E+01 },
+{ 0.4701E+00, 0.6426E+00, 0.5519E+00 }, { 0.1366E+01, 0.1397E+01, 0.1406E+01 },
+{ 0.2918E+00, 0.3022E+00, 0.2420E+00 }, { 0.1309E+01, 0.1241E+01, 0.1220E+01 },
+{ 0.7989E+00, 0.7654E+00, 0.7391E+00 }, { 0.1612E+01, 0.1502E+01, 0.1447E+01 },
+{ 0.2594E+00, 0.1948E+00, 0.2555E+00 }, { 0.1091E+01, 0.1150E+01, 0.1272E+01 },
+{ 0.3423E+00, 0.4150E+00, 0.1294E+01 }, { 0.1729E+01, 0.1377E+01, 0.1065E+01 },
+{ 0.4103E+00, 0.3287E+00, 0.3228E+00 }, { 0.1144E+01, 0.1281E+01, 0.1416E+01 },
+{ 0.1047E+01, 0.1117E+01, 0.6188E+00 }, { 0.1914E+01, 0.1777E+01, 0.1516E+01 },
+{-0.2117E-01, 0.2159E+00, 0.2351E+00 }, { 0.1093E+01, 0.1088E+01, 0.1026E+01 },
+{ 0.5567E+00, 0.5092E+00, 0.4654E+00 }, { 0.1510E+01, 0.1449E+01, 0.1201E+01 },
+{ 0.2362E+00, 0.3426E+00, 0.2549E+00 }, { 0.1340E+01, 0.1225E+01, 0.1117E+01 },
+{ 0.1203E+01, 0.3819E+00, 0.2269E+00 }, { 0.1373E+01, 0.1404E+01, 0.1830E+01 },
+{ 0.2570E+00, 0.2668E+00, 0.1636E+00 }, { 0.1219E+01, 0.1098E+01, 0.1122E+01 },
+{ 0.6985E+00, 0.8456E+00, 0.1069E+01 }, { 0.1550E+01, 0.1501E+01, 0.1388E+01 },
+{ 0.2870E+00, 0.3060E+00, 0.3599E+00 }, { 0.1178E+01, 0.1345E+01, 0.1302E+01 },
+{ 0.1270E+01, 0.1215E+01, 0.1812E+00 }, { 0.1725E+01, 0.1777E+01, 0.1693E+01 },
+{ 0.2074E+00, 0.2104E+00, 0.1539E+00 }, { 0.1105E+01, 0.1034E+01, 0.1104E+01 },
+{ 0.6683E+00, 0.6646E+00, 0.6639E+00 }, { 0.1403E+01, 0.1462E+01, 0.1435E+01 },
+{ 0.3389E+00, 0.3754E+00, 0.2150E+00 }, { 0.1288E+01, 0.1325E+01, 0.1257E+01 },
+{ 0.8933E+00, 0.8253E+00, 0.8133E+00 }, { 0.1555E+01, 0.1579E+01, 0.1565E+01 },
+{ 0.3264E+00, 0.2434E+00, 0.2852E+00 }, { 0.1242E+01, 0.1180E+01, 0.1202E+01 },
+{ 0.1314E+00, 0.1698E+00, 0.1646E+01 }, { 0.1797E+01, 0.1597E+01, 0.1241E+01 },
+{ 0.4721E+00, 0.5346E+00, 0.3066E+00 }, { 0.1274E+01, 0.1401E+01, 0.1351E+01 },
+{ 0.1455E+01, 0.1386E+01, 0.6430E+00 }, { 0.1828E+01, 0.1867E+01, 0.1825E+01 },
+{-0.3265E+00,-0.2956E+00,-0.2462E+00 }, { 0.1035E+01, 0.1020E+01, 0.1003E+01 },
+{ 0.3702E+00, 0.4307E+00, 0.7072E+00 }, { 0.1424E+01, 0.1345E+01, 0.1352E+01 },
+{ 0.2267E+00, 0.2680E+00, 0.3037E+00 }, { 0.1235E+01, 0.1249E+01, 0.1146E+01 },
+{ 0.9944E+00, 0.6485E+00, 0.5248E+00 }, { 0.1539E+01, 0.1492E+01, 0.1612E+01 },
+{ 0.3815E+00, 0.3360E+00,-0.9526E-01 }, { 0.1163E+01, 0.1144E+01, 0.1117E+01 },
+{ 0.6734E+00, 0.7656E+00, 0.1014E+01 }, { 0.1568E+01, 0.1438E+01, 0.1455E+01 },
+{ 0.3409E+00, 0.3317E+00, 0.3856E+00 }, { 0.1180E+01, 0.1284E+01, 0.1284E+01 },
+{ 0.1244E+01, 0.1214E+01,-0.9526E-01 }, { 0.1753E+01, 0.1598E+01, 0.1744E+01 },
+{ 0.1548E+00, 0.1388E+00, 0.2020E+00 }, { 0.1027E+01, 0.1133E+01, 0.1093E+01 },
+{ 0.3906E+00, 0.7505E+00, 0.5705E+00 }, { 0.1420E+01, 0.1357E+01, 0.1543E+01 },
+{ 0.3252E+00, 0.3136E+00, 0.2804E+00 }, { 0.1351E+01, 0.1309E+01, 0.1224E+01 },
+{ 0.8781E+00, 0.8095E+00, 0.7109E+00 }, { 0.1614E+01, 0.1580E+01, 0.1433E+01 },
+{ 0.3222E+00, 0.2298E+00, 0.2157E+00 }, { 0.1216E+01, 0.1077E+01, 0.1247E+01 },
+{ 0.1363E+01, 0.1280E+01, 0.1317E+01 }, { 0.1751E+01, 0.1457E+01, 0.1182E+01 },
+{ 0.4428E+00, 0.4082E+00, 0.3181E+00 }, { 0.1157E+01, 0.1227E+01, 0.1604E+01 },
+{ 0.1286E+01, 0.1268E+01, 0.8167E+00 }, { 0.1994E+01, 0.2018E+01, 0.1307E+01 },
+{ 0.2671E-01, 0.2594E+00, 0.3397E+00 }, { 0.1164E+01, 0.1080E+01, 0.9321E+00 },
+{ 0.5998E+00, 0.6076E+00, 0.5081E+00 }, { 0.1442E+01, 0.1442E+01, 0.1375E+01 },
+{ 0.2390E+00, 0.3554E+00, 0.3426E+00 }, { 0.1287E+01, 0.1307E+01, 0.1144E+01 },
+{ 0.1200E+01, 0.7495E+00, 0.3967E+00 }, { 0.1561E+01, 0.1517E+01, 0.1898E+01 },
+{ 0.3598E+00, 0.3463E+00, 0.1200E+00 }, { 0.1298E+01, 0.1125E+01, 0.1062E+01 },
+{ 0.7577E+00, 0.1013E+01, 0.1194E+01 }, { 0.1537E+01, 0.1513E+01, 0.1464E+01 },
+{ 0.4041E+00, 0.4038E+00, 0.3897E+00 }, { 0.1293E+01, 0.1219E+01, 0.1378E+01 },
+{ 0.1250E+01, 0.1391E+01, 0.2451E+00 }, { 0.1558E+01, 0.1764E+01, 0.1728E+01 },
+{ 0.2700E+00, 0.1894E+00, 0.1924E+00 }, { 0.1111E+01, 0.1112E+01, 0.1173E+01 },
+{ 0.7579E+00, 0.8342E+00, 0.4781E+00 }, { 0.1464E+01, 0.1477E+01, 0.1469E+01 },
+{ 0.4001E+00, 0.3104E+00, 0.2217E+00 }, { 0.1346E+01, 0.1421E+01, 0.1312E+01 },
+{ 0.1071E+01, 0.8967E+00, 0.7511E+00 }, { 0.1616E+01, 0.1551E+01, 0.1574E+01 },
+{ 0.3329E+00, 0.2785E+00, 0.3140E+00 }, { 0.1281E+01, 0.1209E+01, 0.1239E+01 },
+{ 0.2805E+00, 0.2687E+00, 0.1646E+01 }, { 0.1814E+01, 0.1514E+01, 0.1510E+01 },
+{ 0.6231E+00, 0.4200E+00, 0.3701E+00 }, { 0.1255E+01, 0.1429E+01, 0.1454E+01 },
+{ 0.1642E+01, 0.1581E+01, 0.7112E+00 }, { 0.1844E+01, 0.1963E+01, 0.1895E+01 },
+{-0.4208E-01,-0.1491E+00,-0.7639E-01 }, { 0.1046E+01, 0.9598E+00, 0.9176E+00 },
+{ 0.4478E+00, 0.4605E+00, 0.5111E+00 }, { 0.1521E+01, 0.1292E+01, 0.1342E+01 },
+{ 0.2220E+00, 0.2549E+00, 0.2510E+00 }, { 0.1186E+01, 0.1254E+01, 0.1171E+01 },
+{ 0.8999E+00, 0.4960E+00, 0.4943E+00 }, { 0.1423E+01, 0.1484E+01, 0.1620E+01 },
+{ 0.2796E+00, 0.2778E+00,-0.2820E+00 }, { 0.1170E+01, 0.1181E+01, 0.1076E+01 },
+{ 0.4068E+00, 0.8541E+00, 0.9352E+00 }, { 0.1584E+01, 0.1416E+01, 0.1387E+01 },
+{ 0.3325E+00, 0.3655E+00, 0.3340E+00 }, { 0.1224E+01, 0.1257E+01, 0.1245E+01 },
+{ 0.1061E+01, 0.1138E+01,-0.9526E-01 }, { 0.1681E+01, 0.1704E+01, 0.1673E+01 },
+{ 0.1932E+00, 0.1489E+00, 0.1258E+00 }, { 0.1023E+01, 0.1088E+01, 0.1145E+01 },
+{ 0.5190E+00, 0.6873E+00, 0.5172E+00 }, { 0.1380E+01, 0.1405E+01, 0.1474E+01 },
+{ 0.3393E+00, 0.3100E+00, 0.2231E+00 }, { 0.1354E+01, 0.1249E+01, 0.1270E+01 },
+{ 0.7363E+00, 0.8508E+00, 0.8247E+00 }, { 0.1612E+01, 0.1537E+01, 0.1509E+01 },
+{ 0.2952E+00, 0.2053E+00, 0.2590E+00 }, { 0.1138E+01, 0.1219E+01, 0.1262E+01 },
+{ 0.1345E+01, 0.1289E+01, 0.1338E+01 }, { 0.1437E+01, 0.1360E+01, 0.1442E+01 },
+{ 0.4826E+00, 0.3298E+00, 0.3842E+00 }, { 0.1219E+01, 0.1311E+01, 0.1413E+01 },
+{ 0.1212E+01, 0.1186E+01, 0.6357E+00 }, { 0.1873E+01, 0.1939E+01, 0.1674E+01 },
+{ 0.1260E+01, 0.1306E+01, 0.1368E+01 }, { 0.1146E+01, 0.1077E+01, 0.1025E+01 },
+{ 0.6029E+00, 0.5039E+00, 0.5781E+00 }, { 0.1514E+01, 0.1420E+01, 0.1324E+01 },
+{ 0.2652E+00, 0.3192E+00, 0.3042E+00 }, { 0.1368E+01, 0.1198E+01, 0.1200E+01 },
+{ 0.1234E+01, 0.4910E+00, 0.3464E-01 }, { 0.1347E+01, 0.1560E+01, 0.1861E+01 },
+{ 0.2766E+00, 0.2887E+00, 0.2029E+00 }, { 0.1257E+01, 0.1105E+01, 0.1145E+01 },
+{ 0.1351E+01, 0.1353E+01, 0.1406E+01 }, { 0.1506E+01, 0.1580E+01, 0.1362E+01 },
+{ 0.2794E+00, 0.3868E+00, 0.4277E+00 }, { 0.1234E+01, 0.1334E+01, 0.1336E+01 },
+{ 0.1280E+01, 0.1252E+01, 0.1805E+00 }, { 0.1387E+01, 0.1396E+01, 0.1434E+01 },
+{ 0.2902E+00, 0.1170E+00, 0.1698E+00 }, { 0.1134E+01, 0.1077E+01, 0.1117E+01 },
+{ 0.6986E+00, 0.7177E+00, 0.7366E+00 }, { 0.1370E+01, 0.1491E+01, 0.1495E+01 },
+{ 0.4031E+00, 0.5144E+00, 0.1751E+00 }, { 0.1333E+01, 0.1377E+01, 0.1257E+01 },
+{ 0.9212E+00, 0.8934E+00, 0.8897E+00 }, { 0.1589E+01, 0.1614E+01, 0.1523E+01 },
+{ 0.3152E+00, 0.2164E+00, 0.3230E+00 }, { 0.1300E+01, 0.1145E+01, 0.1212E+01 },
+{ 0.1269E+01, 0.1245E+01, 0.1497E+01 }, { 0.1763E+01, 0.1716E+01, 0.1311E+01 },
+{ 0.4702E+00, 0.5422E+00, 0.4306E+00 }, { 0.1342E+01, 0.1433E+01, 0.1423E+01 },
+{ 0.1472E+01, 0.1404E+01, 0.8371E+00 }, { 0.1936E+01, 0.1883E+01, 0.1838E+01 },
+{ 0.1266E+01, 0.1295E+01, 0.1302E+01 }, { 0.1074E+01, 0.1002E+01, 0.1023E+01 },
+{ 0.5206E+00, 0.4045E+00, 0.6549E+00 }, { 0.1457E+01, 0.1378E+01, 0.1363E+01 },
+{ 0.2715E+00, 0.2629E+00, 0.2841E+00 }, { 0.1264E+01, 0.1271E+01, 0.1175E+01 },
+{ 0.1337E+01, 0.1305E+01, 0.1306E+01 }, { 0.1555E+01, 0.1571E+01, 0.1657E+01 },
+{ 0.3341E+00, 0.4147E+00,-0.3648E+00 }, { 0.1188E+01, 0.1185E+01, 0.1161E+01 },
+{ 0.6198E+00, 0.7208E+00, 0.1157E+01 }, { 0.1582E+01, 0.1465E+01, 0.1513E+01 },
+{ 0.3839E+00, 0.3651E+00, 0.3814E+00 }, { 0.1214E+01, 0.1256E+01, 0.1292E+01 },
+{ 0.1361E+01, 0.1363E+01, 0.1312E+01 }, { 0.1793E+01, 0.1693E+01, 0.1669E+01 },
+{ 0.1889E+00, 0.1275E+00, 0.2534E+00 }, { 0.1066E+01, 0.1174E+01, 0.1133E+01 },
+{ 0.4999E+00, 0.8207E+00, 0.5813E+00 }, { 0.1478E+01, 0.1416E+01, 0.1497E+01 },
+{ 0.3814E+00, 0.3138E+00, 0.2889E+00 }, { 0.1396E+01, 0.1265E+01, 0.1233E+01 },
+{ 0.9458E+00, 0.9161E+00, 0.5875E+00 }, { 0.1672E+01, 0.1632E+01, 0.1553E+01 },
+{ 0.3505E+00, 0.2525E+00, 0.2364E+00 }, { 0.1211E+01, 0.1138E+01, 0.1235E+01 },
+{ 0.1391E+01, 0.1231E+01, 0.1355E+01 }, { 0.1783E+01, 0.1510E+01, 0.1199E+01 },
+{ 0.4227E+00, 0.4548E+00, 0.3671E+00 }, { 0.1281E+01, 0.1254E+01, 0.1661E+01 },
+{ 0.1338E+01, 0.1379E+01, 0.9531E+00 }, { 0.2148E+01, 0.1965E+01, 0.1584E+01 },
+{ 0.9324E-01, 0.3575E+00, 0.3522E+00 }, { 0.1212E+01, 0.1086E+01, 0.1044E+01 },
+{ 0.6128E+00, 0.6136E+00, 0.6060E+00 }, { 0.1484E+01, 0.1507E+01, 0.1396E+01 },
+{ 0.2820E+00, 0.3848E+00, 0.3156E+00 }, { 0.1368E+01, 0.1287E+01, 0.1128E+01 },
+{ 0.1369E+01, 0.1352E+01, 0.1358E+01 }, { 0.1381E+01, 0.1765E+01, 0.2113E+01 },
+{ 0.1314E+01, 0.1345E+01, 0.1334E+01 }, { 0.1290E+01, 0.1172E+01, 0.1119E+01 },
+{ 0.1304E+01, 0.1377E+01, 0.1427E+01 }, { 0.1490E+01, 0.1540E+01, 0.1536E+01 },
+{ 0.3994E+00, 0.4402E+00, 0.4173E+00 }, { 0.1323E+01, 0.1307E+01, 0.1392E+01 },
+{ 0.1400E+01, 0.1388E+01, 0.1369E+01 }, { 0.1669E+01, 0.1818E+01, 0.1834E+01 },
+{ 0.2742E+00, 0.2235E+00, 0.1986E+00 }, { 0.1137E+01, 0.1139E+01, 0.1201E+01 },
+{ 0.1324E+01, 0.1385E+01, 0.1349E+01 }, { 0.1455E+01, 0.1574E+01, 0.1454E+01 },
+{ 0.5019E+00, 0.3255E+00, 0.2555E+00 }, { 0.1388E+01, 0.1438E+01, 0.1300E+01 },
+{ 0.1394E+01, 0.1349E+01, 0.1411E+01 }, { 0.1639E+01, 0.1580E+01, 0.1681E+01 },
+{ 0.3920E+00, 0.2498E+00, 0.3523E+00 }, { 0.1301E+01, 0.1221E+01, 0.1285E+01 },
+{ 0.1318E+01, 0.1342E+01, 0.1494E+01 }, { 0.1910E+01, 0.1680E+01, 0.1470E+01 },
+{ 0.6082E+00, 0.5270E+00, 0.4173E+00 }, { 0.1255E+01, 0.1477E+01, 0.1503E+01 },
+{ 0.1807E+01, 0.1742E+01, 0.6553E+00 }, { 0.2000E+01, 0.2072E+01, 0.2051E+01 }};
+
+/**
+ * LSP vector quantization tables
+ *
+ * TIA/IS-127 tables 8-1 through 8-9
+ */
+
+static const float evrc_lspq_full_codebook1[64][2] = {
+{1.42016308E-2, 1.93881616E-2}, {2.91667543E-2, 6.51749149E-2},
+{2.06693150E-2, 4.97564934E-2}, {3.94719802E-2, 9.55850929E-2},
+{2.27012448E-2, 3.96625809E-2}, {5.38789518E-2, 6.28347769E-2},
+{2.90525518E-2, 5.73435798E-2}, {4.48280610E-2, 1.15364626E-1},
+{1.94110647E-2, 3.46889682E-2}, {4.37502973E-2, 6.75228462E-2},
+{3.55497338E-2, 4.94086780E-2}, {6.99219853E-2, 8.67279768E-2},
+{2.77880151E-2, 4.65748496E-2}, {5.79111017E-2, 6.74542487E-2},
+{4.74664383E-2, 5.50271496E-2}, {7.88898915E-2, 1.22443043E-1},
+{2.21715886E-2, 3.02628800E-2}, {3.39134485E-2, 7.17703998E-2},
+{3.17989141E-2, 4.98996116E-2}, {6.11555986E-2, 8.73361230E-2},
+{2.67506503E-2, 3.96735854E-2}, {4.44100983E-2, 8.26731324E-2},
+{3.89172547E-2, 5.65788932E-2}, {6.04800619E-2, 1.04536951E-1},
+{2.69156620E-2, 3.57168876E-2}, {4.11117189E-2, 7.33322948E-2},
+{4.12660725E-2, 4.85165231E-2}, {7.18049556E-2, 1.06202349E-1},
+{3.38037871E-2, 4.24300395E-2}, {5.91818243E-2, 7.97467977E-2},
+{4.70107906E-2, 6.28563762E-2}, {9.42011923E-2, 1.30053163E-1},
+{1.94244273E-2, 2.72732340E-2}, {3.70831676E-2, 6.64898157E-2},
+{2.80136354E-2, 5.15984930E-2}, {5.34461029E-2, 9.25904214E-2},
+{2.54959203E-2, 4.32844795E-2}, {5.51860742E-2, 7.36182332E-2},
+{3.39851119E-2, 6.05329126E-2}, {6.18182123E-2, 1.34581268E-1},
+{2.35669166E-2, 3.55242006E-2}, {5.10804243E-2, 6.79562539E-2},
+{3.83464955E-2, 5.23469411E-2}, {7.44275749E-2, 9.66108292E-2},
+{3.18591148E-2, 4.62123118E-2}, {6.18909821E-2, 7.33231753E-2},
+{4.41718437E-2, 5.79240918E-2}, {7.93596208E-2, 1.41177371E-1},
+{2.47412287E-2, 3.23629379E-2}, {3.36563922E-2, 8.04650635E-2},
+{3.37943695E-2, 5.44977151E-2}, {6.53648973E-2, 9.52775925E-2},
+{2.93364152E-2, 4.28411029E-2}, {5.27870469E-2, 8.16159397E-2},
+{4.00724895E-2, 6.18144684E-2}, {6.75848573E-2, 1.17196076E-1},
+{3.03064957E-2, 3.86914052E-2}, {4.83106263E-2, 7.42383003E-2},
+{4.37548272E-2, 5.22842295E-2}, {8.32310021E-2, 1.09881967E-1},
+{3.75600643E-2, 4.53217216E-2}, {6.60113171E-2, 7.97580183E-2},
+{5.03225066E-2, 5.90176322E-2}, {8.77133310E-2, 1.63187444E-1}};
+
+static const float evrc_lspq_full_codebook2[64][2] = {
+{5.21959551E-2, 8.38445649E-2}, {1.05874076E-1, 1.28694162E-1},
+{5.48323877E-2, 1.33842856E-1}, {1.17768474E-1, 1.94037274E-1},
+{5.36086522E-2, 1.11398734E-1}, {1.19989693E-1, 1.47474691E-1},
+{8.00373554E-2, 1.42999724E-1}, {1.64086595E-1, 2.09821835E-1},
+{5.21059223E-2, 9.95229408E-2}, {8.67567956E-2, 1.85966507E-1},
+{7.77341127E-2, 1.31506845E-1}, {1.60545513E-1, 1.81930289E-1},
+{7.42243677E-2, 1.10437103E-1}, {1.18635088E-1, 1.75306752E-1},
+{6.61557764E-2, 1.64441928E-1}, {1.96810856E-1, 2.16682002E-1},
+{6.05317838E-2, 9.45408568E-2}, {1.06271386E-1, 1.48013934E-1},
+{5.87486550E-2, 1.47724584E-1}, {1.34816468E-1, 2.01517954E-1},
+{6.59698322E-2, 1.16447397E-1}, {1.32297173E-1, 1.53267249E-1},
+{9.26660746E-2, 1.46725491E-1}, {1.79285541E-1, 2.19705954E-1},
+{7.06458464E-2, 9.99924466E-2}, {1.06500491E-1, 1.79443434E-1},
+{8.79249722E-2, 1.25287697E-1}, {1.53640196E-1, 1.97852716E-1},
+{8.88430104E-2, 1.12465657E-1}, {1.48286715E-1, 1.67517021E-1},
+{8.16568136E-2, 1.69274017E-1}, {2.07810536E-1, 2.31033549E-1},
+{6.14927970E-2, 8.36263224E-2}, {1.14473253E-1, 1.36779979E-1},
+{6.87129870E-2, 1.38099059E-1}, {1.10511415E-1, 2.15352878E-1},
+{5.55652268E-2, 1.22242786E-1}, {1.20557591E-1, 1.61072448E-1},
+{8.32249671E-2, 1.55475482E-1}, {1.61638483E-1, 2.28268847E-1},
+{6.29152283E-2, 1.06229566E-1}, {8.29186887E-2, 2.06774518E-1},
+{8.84756893E-2, 1.35799959E-1}, {1.69772223E-1, 1.93773940E-1},
+{7.77297840E-2, 1.20287232E-1}, {1.30648017E-1, 1.84331819E-1},
+{6.91939592E-2, 1.84218004E-1}, {2.03904077E-1, 2.49715164E-1},
+{7.07671717E-2, 9.03186128E-2}, {1.08471557E-1, 1.61966518E-1},
+{7.16886371E-2, 1.51093170E-1}, {1.38779536E-1, 2.18801782E-1},
+{6.75907061E-2, 1.26740307E-1}, {1.33412346E-1, 1.68838874E-1},
+{9.61822569E-2, 1.58728704E-1}, {1.86485633E-1, 2.36560926E-1},
+{8.23447108E-2, 1.02126025E-1}, {1.00336641E-1, 1.94918498E-1},
+{9.95981991E-2, 1.36425093E-1}, {1.82448462E-1, 2.03655198E-1},
+{9.78890732E-2, 1.21145472E-1}, {1.45453140E-1, 1.83604524E-1},
+{9.58395451E-2, 1.72194853E-1}, {2.23295853E-1, 2.46418610E-1}};
+
+static const float evrc_lspq_full_codebook3[512][3] = {
+{1.36425778E-1, 1.68651849E-1, 2.04688221E-1},
+{1.85717627E-1, 2.28756160E-1, 2.51958042E-1},
+{1.22760192E-1, 1.85950696E-1, 2.79446691E-1},
+{1.96468458E-1, 2.64484435E-1, 2.89318889E-1},
+{1.25653744E-1, 1.50529265E-1, 2.76144296E-1},
+{1.96301565E-1, 2.41699994E-1, 2.88230687E-1},
+{1.40099391E-1, 2.22365588E-1, 2.74666578E-1},
+{2.59952307E-1, 2.75394946E-1, 3.10975939E-1},
+{1.58452198E-1, 1.88591003E-1, 2.07339197E-1},
+{1.95616230E-1, 2.21379519E-1, 2.87022918E-1},
+{1.69424579E-1, 2.01614648E-1, 2.75669187E-1},
+{2.12393746E-1, 2.64250666E-1, 3.17967504E-1},
+{1.82965085E-1, 1.99547559E-1, 2.29538843E-1},
+{2.15200707E-1, 2.62409419E-1, 2.82432705E-1},
+{1.46404549E-1, 2.36966729E-1, 2.90067106E-1},
+{2.45338634E-1, 3.03358108E-1, 3.42260152E-1},
+{1.37478963E-1, 1.58276558E-1, 2.39217222E-1},
+{2.01999024E-1, 2.20102608E-1, 2.69546896E-1},
+{1.18350029E-1, 2.30206400E-1, 2.83554822E-1},
+{2.25519255E-1, 2.72272140E-1, 3.06072980E-1},
+{1.35661438E-1, 1.91633970E-1, 2.65912026E-1},
+{1.95733085E-1, 2.31926173E-1, 3.14376086E-1},
+{1.67998984E-1, 2.27706313E-1, 2.76947826E-1},
+{2.50170559E-1, 3.01627070E-1, 3.21084231E-1},
+{1.33492306E-1, 2.01223105E-1, 2.33893991E-1},
+{2.06442133E-1, 2.38704175E-1, 2.77560145E-1},
+{1.79048792E-1, 1.95776582E-1, 2.80656606E-1},
+{2.06193641E-1, 2.64055401E-1, 3.33098441E-1},
+{1.75185278E-1, 1.91166341E-1, 2.57540315E-1},
+{2.28398636E-1, 2.45296657E-1, 3.08980793E-1},
+{1.80859819E-1, 2.43579060E-1, 2.96631068E-1},
+{2.76152968E-1, 3.08256060E-1, 3.46822590E-1},
+{1.37115732E-1, 1.80057764E-1, 2.20953465E-1},
+{1.81370094E-1, 2.26770103E-1, 2.70392686E-1},
+{1.25246510E-1, 1.79606944E-1, 3.10376436E-1},
+{1.90708354E-1, 2.87734240E-1, 3.13476235E-1},
+{1.30486086E-1, 1.60435289E-1, 3.00243706E-1},
+{1.97318628E-1, 2.56378502E-1, 2.78474301E-1},
+{1.58597067E-1, 2.37381399E-1, 2.62910336E-1},
+{2.61825919E-1, 2.77717203E-1, 3.31382245E-1},
+{1.64160743E-1, 1.85841531E-1, 2.35615849E-1},
+{2.09486142E-1, 2.21452802E-1, 2.92153865E-1},
+{1.66807845E-1, 2.13641763E-1, 2.70675927E-1},
+{2.29834273E-1, 2.88374633E-1, 3.06238323E-1},
+{1.82154253E-1, 2.00822473E-1, 2.40169376E-1},
+{2.24944726E-1, 2.69813925E-1, 2.91401237E-1},
+{1.63940564E-1, 2.50341147E-1, 2.78307766E-1},
+{2.56727993E-1, 2.95103759E-1, 3.53297085E-1},
+{1.40218839E-1, 1.76687688E-1, 2.46773273E-1},
+{2.15291306E-1, 2.29216009E-1, 2.64283627E-1},
+{1.21002659E-1, 2.18333840E-1, 3.22341293E-1},
+{2.54243195E-1, 2.73986191E-1, 2.96262473E-1},
+{1.60385415E-1, 1.83762908E-1, 2.81598717E-1},
+{1.87832162E-1, 2.37420350E-1, 3.29777509E-1},
+{1.77788362E-1, 2.26703495E-1, 3.02322537E-1},
+{2.75108218E-1, 2.93730587E-1, 3.12373787E-1},
+{1.70116410E-1, 1.85232103E-1, 2.46125028E-1},
+{2.21754774E-1, 2.39912242E-1, 2.86891907E-1},
+{1.95083722E-1, 2.08337873E-1, 2.88349718E-1},
+{2.37536535E-1, 2.75004476E-1, 3.39786023E-1},
+{1.88369319E-1, 2.04371840E-1, 2.57375032E-1},
+{2.47250155E-1, 2.60551840E-1, 3.02137524E-1},
+{1.66944191E-1, 2.46912360E-1, 3.18894416E-1},
+{2.78118610E-1, 3.13011140E-1, 3.65329295E-1},
+{1.45213529E-1, 1.63051456E-1, 2.24912614E-1},
+{2.05692515E-1, 2.20831484E-1, 2.52817810E-1},
+{1.21125661E-1, 1.96374118E-1, 3.00122708E-1},
+{2.15566799E-1, 2.65657336E-1, 2.99202889E-1},
+{1.09134212E-1, 1.78472102E-1, 2.88323194E-1},
+{2.03508541E-1, 2.40347922E-1, 2.96309739E-1},
+{1.53101787E-1, 2.25415319E-1, 2.84843713E-1},
+{2.50233442E-1, 2.77736932E-1, 3.24840695E-1},
+{1.66308925E-1, 1.94173396E-1, 2.11635381E-1},
+{2.01289460E-1, 2.26062179E-1, 2.93246478E-1},
+{1.49518773E-1, 2.14201719E-1, 2.83894747E-1},
+{2.21836135E-1, 2.85231501E-1, 3.20082635E-1},
+{1.89573213E-1, 2.06577629E-1, 2.30332345E-1},
+{2.31247649E-1, 2.46864259E-1, 2.89846569E-1},
+{1.39116928E-1, 2.59189934E-1, 2.98019558E-1},
+{2.44512573E-1, 2.82671362E-1, 3.61258298E-1},
+{1.22530967E-1, 1.68514788E-1, 2.70879298E-1},
+{2.04372838E-1, 2.30398357E-1, 2.71792918E-1},
+{1.42643943E-1, 2.22405583E-1, 2.92057186E-1},
+{2.42643669E-1, 2.77429372E-1, 2.97135502E-1},
+{1.52048603E-1, 1.96921080E-1, 2.61013240E-1},
+{2.17875019E-1, 2.45840371E-1, 3.08138579E-1},
+{1.90109268E-1, 2.31099129E-1, 2.80178159E-1},
+{2.54314184E-1, 2.94079810E-1, 3.39649171E-1},
+{1.56698599E-1, 2.08597451E-1, 2.28010774E-1},
+{2.25088730E-1, 2.50014484E-1, 2.76250154E-1},
+{1.78219035E-1, 1.98228240E-1, 3.04198891E-1},
+{2.08567217E-1, 2.92395383E-1, 3.46786886E-1},
+{1.71052113E-1, 2.03438759E-1, 2.62644321E-1},
+{2.30275467E-1, 2.58817524E-1, 3.11986536E-1},
+{1.85333565E-1, 2.45760202E-1, 3.10553998E-1},
+{2.89413869E-1, 3.11095625E-1, 3.46476167E-1},
+{1.50332406E-1, 1.67538226E-1, 2.40182847E-1},
+{1.79971650E-1, 2.37168610E-1, 2.60899693E-1},
+{1.49866179E-1, 1.97890073E-1, 3.07916552E-1},
+{2.10799649E-1, 2.88180083E-1, 3.29747230E-1},
+{1.31711140E-1, 1.65906459E-1, 3.22898000E-1},
+{2.14832023E-1, 2.52822131E-1, 2.97547072E-1},
+{1.83760419E-1, 2.37523615E-1, 2.74610013E-1},
+{2.55575180E-1, 2.75439233E-1, 3.46021861E-1},
+{1.82662204E-1, 1.99470907E-1, 2.16051653E-1},
+{2.09240332E-1, 2.22406715E-1, 3.02382857E-1},
+{1.84088245E-1, 2.11327791E-1, 2.82538086E-1},
+{2.41171077E-1, 2.97036022E-1, 3.15979272E-1},
+{1.96804658E-1, 2.11815894E-1, 2.41647676E-1},
+{2.42761984E-1, 2.58586556E-1, 2.93204397E-1},
+{1.58905461E-1, 2.65077025E-1, 2.89881319E-1},
+{2.58060575E-1, 3.18903178E-1, 3.47846836E-1},
+{1.48766384E-1, 1.66853935E-1, 2.66827434E-1},
+{2.15942249E-1, 2.29938298E-1, 2.76041597E-1},
+{1.38410494E-1, 2.39283442E-1, 3.27972382E-1},
+{2.43765280E-1, 2.88408488E-1, 3.06048721E-1},
+{1.70157120E-1, 1.89986289E-1, 2.81219155E-1},
+{2.19117031E-1, 2.58005291E-1, 3.26571971E-1},
+{1.92163572E-1, 2.23614186E-1, 2.98683077E-1},
+{2.73545444E-1, 3.12078089E-1, 3.30766588E-1},
+{1.62452087E-1, 2.04930902E-1, 2.53337711E-1},
+{2.23855302E-1, 2.37671077E-1, 3.03202003E-1},
+{1.93955287E-1, 2.12335557E-1, 3.07566851E-1},
+{2.29912683E-1, 2.97581047E-1, 3.37499231E-1},
+{1.89335391E-1, 2.04148144E-1, 2.78609782E-1},
+{2.42303565E-1, 2.73163110E-1, 3.15361649E-1},
+{1.55009672E-1, 2.88095146E-1, 3.35996419E-1},
+{2.73716152E-1, 3.31215471E-1, 3.62539083E-1},
+{1.52389362E-1, 1.72619134E-1, 1.90585673E-1},
+{1.96988270E-1, 2.26309747E-1, 2.46197492E-1},
+{1.20555148E-1, 2.06369758E-1, 2.81199783E-1},
+{1.93709418E-1, 2.71900505E-1, 3.01332921E-1},
+{1.36701152E-1, 1.54093146E-1, 2.82258362E-1},
+{1.97299168E-1, 2.53656298E-1, 2.90315062E-1},
+{1.43463776E-1, 2.43872911E-1, 2.75533706E-1},
+{2.58477271E-1, 2.73279876E-1, 3.21119100E-1},
+{1.54406175E-1, 1.93793535E-1, 2.15884149E-1},
+{2.05979452E-1, 2.24277020E-1, 2.85732359E-1},
+{1.74535319E-1, 2.08482355E-1, 2.79668540E-1},
+{2.18844578E-1, 2.72486299E-1, 3.27095598E-1},
+{1.77609727E-1, 2.12990195E-1, 2.39119649E-1},
+{2.29163751E-1, 2.59165913E-1, 2.83514649E-1},
+{1.57353148E-1, 2.39961296E-1, 3.04263145E-1},
+{2.45613828E-1, 3.16824526E-1, 3.42909366E-1},
+{1.42953232E-1, 1.61905348E-1, 2.53710240E-1},
+{2.10192814E-1, 2.22847700E-1, 2.71103770E-1},
+{1.26843944E-1, 2.16709048E-1, 2.97734648E-1},
+{2.31000140E-1, 2.80109137E-1, 2.99707443E-1},
+{1.52980462E-1, 1.93996876E-1, 2.72895664E-1},
+{2.12860718E-1, 2.41545349E-1, 3.16518754E-1},
+{1.71154693E-1, 2.22469687E-1, 2.93786496E-1},
+{2.51988232E-1, 3.04254979E-1, 3.31269950E-1},
+{1.33188918E-1, 2.07924992E-1, 2.55362093E-1},
+{2.12044910E-1, 2.42189646E-1, 2.88903743E-1},
+{1.84612468E-1, 2.01143622E-1, 2.86360770E-1},
+{2.18286708E-1, 2.76752442E-1, 3.44581515E-1},
+{1.83562174E-1, 1.99478507E-1, 2.62156576E-1},
+{2.33130530E-1, 2.49596909E-1, 3.15842837E-1},
+{1.89898983E-1, 2.46874869E-1, 2.97132462E-1},
+{2.75022447E-1, 3.22490305E-1, 3.46977681E-1},
+{1.42305329E-1, 1.92689180E-1, 2.16155857E-1},
+{1.95676163E-1, 2.22268641E-1, 2.76587397E-1},
+{1.33241490E-1, 1.97791785E-1, 3.22897941E-1},
+{1.84865132E-1, 2.97106177E-1, 3.26105148E-1},
+{1.50203660E-1, 1.76781267E-1, 2.91536182E-1},
+{2.03144446E-1, 2.59616166E-1, 2.99156040E-1},
+{1.65488973E-1, 2.38342047E-1, 2.87493914E-1},
+{2.71071255E-1, 2.89544493E-1, 3.19521040E-1},
+{1.68598369E-1, 1.98825568E-1, 2.30347604E-1},
+{2.13811651E-1, 2.34471768E-1, 2.90959626E-1},
+{1.74605444E-1, 2.17256010E-1, 2.85688072E-1},
+{2.28503481E-1, 2.96190292E-1, 3.16534668E-1},
+{1.87172607E-1, 2.20547438E-1, 2.39688724E-1},
+{2.28884771E-1, 2.63583153E-1, 3.01329464E-1},
+{1.77897051E-1, 2.58131474E-1, 2.81487674E-1},
+{2.59513617E-1, 3.07204396E-1, 3.48793596E-1},
+{1.45224437E-1, 1.78715974E-1, 2.59186983E-1},
+{2.19062313E-1, 2.38223523E-1, 2.60461539E-1},
+{1.43650874E-1, 2.09760785E-1, 3.15830201E-1},
+{2.50127465E-1, 2.79182345E-1, 3.05153579E-1},
+{1.48986444E-1, 2.01226771E-1, 2.82543689E-1},
+{2.08387777E-1, 2.35603899E-1, 3.45363885E-1},
+{1.85830340E-1, 2.21607298E-1, 3.10773641E-1},
+{2.80904710E-1, 2.95469791E-1, 3.25499445E-1},
+{1.72967300E-1, 1.97078109E-1, 2.45801106E-1},
+{2.19495699E-1, 2.44767100E-1, 2.93587774E-1},
+{1.83909580E-1, 2.15004295E-1, 3.00334543E-1},
+{2.45338634E-1, 2.68595248E-1, 3.48330349E-1},
+{1.92957386E-1, 2.06625074E-1, 2.67336398E-1},
+{2.54845560E-1, 2.68642277E-1, 3.03547889E-1},
+{1.76853105E-1, 2.59330958E-1, 3.16200763E-1},
+{2.90929139E-1, 3.15634757E-1, 3.68723541E-1},
+{1.57116994E-1, 1.73552901E-1, 2.28736520E-1},
+{2.12509260E-1, 2.30501205E-1, 2.52217978E-1},
+{1.42521843E-1, 2.01979935E-1, 2.93012232E-1},
+{2.14919671E-1, 2.78065056E-1, 3.14176053E-1},
+{1.35947272E-1, 1.81055903E-1, 2.75475413E-1},
+{1.98416695E-1, 2.41673797E-1, 3.05173427E-1},
+{1.59517333E-1, 2.31580108E-1, 2.95412451E-1},
+{2.58203626E-1, 2.87348121E-1, 3.20351988E-1},
+{1.74840674E-1, 1.92883253E-1, 2.11250007E-1},
+{2.02168509E-1, 2.27025688E-1, 3.04884046E-1},
+{1.69532105E-1, 2.11826235E-1, 2.97355384E-1},
+{2.30033740E-1, 2.91504353E-1, 3.26589435E-1},
+{1.95046112E-1, 2.11709172E-1, 2.27705747E-1},
+{2.37926885E-1, 2.52411634E-1, 2.97752172E-1},
+{1.53762922E-1, 2.46541560E-1, 3.14768940E-1},
+{2.36075714E-1, 3.03568929E-1, 3.70624453E-1},
+{1.38660327E-1, 1.67949975E-1, 2.73515254E-1},
+{2.13806167E-1, 2.27267206E-1, 2.86276251E-1},
+{1.25080630E-1, 2.44098395E-1, 3.02548796E-1},
+{2.35714868E-1, 2.81208843E-1, 3.08903724E-1},
+{1.51691392E-1, 2.10877746E-1, 2.63812989E-1},
+{2.20730439E-1, 2.52777904E-1, 3.16413730E-1},
+{1.84924737E-1, 2.39424765E-1, 2.85120815E-1},
+{2.59548545E-1, 3.09809893E-1, 3.26423734E-1},
+{1.62930742E-1, 2.19900876E-1, 2.36148626E-1},
+{2.34194234E-1, 2.49944329E-1, 2.77549058E-1},
+{1.70870200E-1, 1.98291600E-1, 3.21412593E-1},
+{2.31566861E-1, 2.75015086E-1, 3.69710356E-1},
+{1.80002406E-1, 2.06701040E-1, 2.71204919E-1},
+{2.38075271E-1, 2.54006237E-1, 3.23827595E-1},
+{1.99148253E-1, 2.54273921E-1, 3.07479709E-1},
+{2.87428617E-1, 3.25045079E-1, 3.48634571E-1},
+{1.45285025E-1, 1.91359162E-1, 2.49691397E-1},
+{1.94659308E-1, 2.40821242E-1, 2.77302653E-1},
+{1.53150991E-1, 1.94375664E-1, 3.27550441E-1},
+{2.04085842E-1, 2.98595697E-1, 3.21480066E-1},
+{1.56009689E-1, 1.81012720E-1, 3.00931662E-1},
+{2.10962430E-1, 2.55770296E-1, 3.08086127E-1},
+{1.85444072E-1, 2.49021322E-1, 2.74029821E-1},
+{2.74493456E-1, 2.89441973E-1, 3.38794917E-1},
+{1.76941887E-1, 1.94476932E-1, 2.22077265E-1},
+{2.16377512E-1, 2.30735779E-1, 3.03689271E-1},
+{1.89683452E-1, 2.14660764E-1, 2.88445383E-1},
+{2.40827337E-1, 2.98141748E-1, 3.27378422E-1},
+{2.01787844E-1, 2.19441772E-1, 2.39327446E-1},
+{2.48812512E-1, 2.65865892E-1, 2.93382376E-1},
+{1.82027832E-1, 2.68279046E-1, 2.93991417E-1},
+{2.56498635E-1, 3.19984466E-1, 3.62663239E-1},
+{1.58799276E-1, 1.75433666E-1, 2.67389864E-1},
+{2.24259302E-1, 2.36668259E-1, 2.77639121E-1},
+{1.49203405E-1, 2.26585329E-1, 3.45255584E-1},
+{2.50655770E-1, 2.92264849E-1, 3.13574284E-1},
+{1.58096299E-1, 2.02193201E-1, 2.98711687E-1},
+{2.28820905E-1, 2.48557344E-1, 3.44726473E-1},
+{1.87972054E-1, 2.34109432E-1, 3.04235607E-1},
+{2.85657108E-1, 3.14878136E-1, 3.36931497E-1},
+{1.62680015E-1, 2.17820048E-1, 2.57436782E-1},
+{2.24049792E-1, 2.46739820E-1, 3.00795883E-1},
+{2.01354548E-1, 2.18286663E-1, 3.13036293E-1},
+{2.38028511E-1, 2.98103482E-1, 3.53503793E-1},
+{1.98829994E-1, 2.12877125E-1, 2.72980839E-1},
+{2.50616491E-1, 2.67659992E-1, 3.20611864E-1},
+{1.70901820E-1, 2.69330353E-1, 3.34428221E-1},
+{3.04988861E-1, 3.36196691E-1, 3.65235358E-1},
+{1.47624031E-1, 1.81272805E-1, 2.04707921E-1},
+{1.93751350E-1, 2.20973969E-1, 2.61775166E-1},
+{1.32089809E-1, 1.94851607E-1, 2.83547610E-1},
+{2.07739428E-1, 2.70596832E-1, 2.92264789E-1},
+{1.27733424E-1, 1.66896015E-1, 2.83891350E-1},
+{2.05309406E-1, 2.47807533E-1, 2.83632785E-1},
+{1.54211894E-1, 2.25014091E-1, 2.70082027E-1},
+{2.67574131E-1, 2.84426898E-1, 3.09334785E-1},
+{1.68846920E-1, 1.87004536E-1, 2.02433169E-1},
+{2.02441111E-1, 2.16733068E-1, 2.93079227E-1},
+{1.63621262E-1, 2.15616465E-1, 2.82792896E-1},
+{2.25509301E-1, 2.66283005E-1, 3.17886561E-1},
+{1.89110294E-1, 2.05609441E-1, 2.22113580E-1},
+{2.21240178E-1, 2.60288864E-1, 2.92541057E-1},
+{1.55563369E-1, 2.46850818E-1, 2.89648801E-1},
+{2.48406157E-1, 3.05291861E-1, 3.55316669E-1},
+{1.27122149E-1, 1.58053726E-1, 2.54164368E-1},
+{2.04998836E-1, 2.19476849E-1, 2.78342038E-1},
+{1.33302316E-1, 2.29614019E-1, 2.86947161E-1},
+{2.36777052E-1, 2.67918199E-1, 3.08230907E-1},
+{1.40853569E-1, 2.03414679E-1, 2.73257107E-1},
+{2.07684264E-1, 2.34520018E-1, 3.24583262E-1},
+{1.77181646E-1, 2.29595393E-1, 2.83539146E-1},
+{2.61378348E-1, 3.01160187E-1, 3.21707100E-1},
+{1.48595735E-1, 2.07772017E-1, 2.46946126E-1},
+{2.14334831E-1, 2.48061299E-1, 2.72259146E-1},
+{1.76380262E-1, 1.96897894E-1, 2.92286903E-1},
+{1.98193476E-1, 2.75483340E-1, 3.49037558E-1},
+{1.76153168E-1, 1.93248957E-1, 2.69548506E-1},
+{2.36968622E-1, 2.50065804E-1, 3.06820840E-1},
+{1.76060721E-1, 2.54037619E-1, 3.03566784E-1},
+{2.82952905E-1, 3.01765054E-1, 3.53956312E-1},
+{1.45353720E-1, 1.83678836E-1, 2.34750062E-1},
+{1.93842635E-1, 2.30635554E-1, 2.67817765E-1},
+{1.38958976E-1, 1.86760783E-1, 3.13113242E-1},
+{1.99944481E-1, 2.77624756E-1, 3.25046331E-1},
+{1.42966077E-1, 1.71310842E-1, 3.03013414E-1},
+{2.07741663E-1, 2.58691758E-1, 2.88766950E-1},
+{1.71776935E-1, 2.40246087E-1, 2.73284525E-1},
+{2.71046638E-1, 2.85170943E-1, 3.27401131E-1},
+{1.69854626E-1, 1.87545776E-1, 2.24484712E-1},
+{2.15221986E-1, 2.27339745E-1, 2.95008808E-1},
+{1.75596640E-1, 2.17936546E-1, 2.74879605E-1},
+{2.34665439E-1, 2.89530903E-1, 3.16494375E-1},
+{1.89946994E-1, 2.04953820E-1, 2.46955171E-1},
+{2.37297818E-1, 2.68316716E-1, 2.90684313E-1},
+{1.69963166E-1, 2.53367484E-1, 2.92533010E-1},
+{2.70659864E-1, 2.97146112E-1, 3.56183976E-1},
+{1.52539685E-1, 1.70138955E-1, 2.52703935E-1},
+{2.19119206E-1, 2.35900700E-1, 2.69739121E-1},
+{1.42245665E-1, 2.18184620E-1, 3.28218073E-1},
+{2.61472821E-1, 2.78025657E-1, 3.02375883E-1},
+{1.53526023E-1, 1.90727741E-1, 2.92820841E-1},
+{2.09240988E-1, 2.49808684E-1, 3.24709088E-1},
+{1.75176397E-1, 2.38646746E-1, 3.06392699E-1},
+{2.73218870E-1, 3.03954989E-1, 3.20513874E-1},
+{1.63911596E-1, 1.89611584E-1, 2.56272525E-1},
+{2.26953760E-1, 2.40120232E-1, 2.92728513E-1},
+{1.95565715E-1, 2.11956203E-1, 2.97374696E-1},
+{2.41045550E-1, 2.88497001E-1, 3.36352319E-1},
+{1.94948331E-1, 2.09475279E-1, 2.56309658E-1},
+{2.47884631E-1, 2.63356417E-1, 3.11270863E-1},
+{1.69189706E-1, 2.35864580E-1, 3.36249381E-1},
+{2.86001563E-1, 3.25423747E-1, 3.59607369E-1},
+{1.56258598E-1, 1.76704943E-1, 2.14393437E-1},
+{2.08996847E-1, 2.23968685E-1, 2.60886759E-1},
+{1.35765389E-1, 2.03580052E-1, 3.05503219E-1},
+{2.18961373E-1, 2.79463500E-1, 2.99450845E-1},
+{1.34064749E-1, 1.78332120E-1, 2.90169626E-1},
+{2.13298395E-1, 2.40031511E-1, 3.00345927E-1},
+{1.64373413E-1, 2.26438701E-1, 2.87171155E-1},
+{2.50739604E-1, 2.80812472E-1, 3.35349351E-1},
+{1.63649514E-1, 1.97108001E-1, 2.21165180E-1},
+{2.08139613E-1, 2.30869800E-1, 2.96137065E-1},
+{1.59113124E-1, 2.18189180E-1, 2.95531958E-1},
+{2.39883497E-1, 2.81831235E-1, 3.26045603E-1},
+{1.89394727E-1, 2.08127141E-1, 2.38446414E-1},
+{2.32995704E-1, 2.59603471E-1, 2.93427974E-1},
+{1.60558835E-1, 2.55164832E-1, 3.02872926E-1},
+{2.53509283E-1, 2.96028465E-1, 3.67721587E-1},
+{1.30124375E-1, 1.74838990E-1, 2.60486037E-1},
+{2.10203990E-1, 2.33570784E-1, 2.83061892E-1},
+{1.52365491E-1, 2.25338757E-1, 3.03720981E-1},
+{2.40558609E-1, 2.77192205E-1, 3.05891901E-1},
+{1.63728818E-1, 1.94779396E-1, 2.69253582E-1},
+{2.25709423E-1, 2.40902692E-1, 3.18060607E-1},
+{1.92055091E-1, 2.29857832E-1, 2.89826721E-1},
+{2.62759686E-1, 3.04292172E-1, 3.35680574E-1},
+{1.66071162E-1, 2.06819177E-1, 2.39712462E-1},
+{2.23915562E-1, 2.50106871E-1, 2.85296232E-1},
+{1.88402340E-1, 2.03793734E-1, 3.03041130E-1},
+{2.30698988E-1, 2.87044138E-1, 3.49802762E-1},
+{1.82025358E-1, 2.14073509E-1, 2.63470024E-1},
+{2.37297758E-1, 2.65025407E-1, 3.17815512E-1},
+{1.89278707E-1, 2.58802205E-1, 3.04866165E-1},
+{2.97243059E-1, 3.17153066E-1, 3.56583923E-1},
+{1.58607468E-1, 1.78659767E-1, 2.41919369E-1},
+{1.94887385E-1, 2.41695851E-1, 2.62176663E-1},
+{1.58124432E-1, 2.11753070E-1, 3.11352164E-1},
+{2.16902718E-1, 2.98796803E-1, 3.20994049E-1},
+{1.49272785E-1, 1.74964130E-1, 3.15334409E-1},
+{2.21622273E-1, 2.56179065E-1, 3.03902954E-1},
+{1.75979599E-1, 2.43505448E-1, 2.85801739E-1},
+{2.64590383E-1, 2.85541564E-1, 3.45107764E-1},
+{1.80137083E-1, 2.05279350E-1, 2.22255990E-1},
+{2.10796222E-1, 2.26315439E-1, 3.14426929E-1},
+{1.79151163E-1, 2.09439725E-1, 2.93280870E-1},
+{2.49719024E-1, 2.91257650E-1, 3.27162296E-1},
+{1.98700234E-1, 2.15896755E-1, 2.49960214E-1},
+{2.40726396E-1, 2.64857739E-1, 2.99639553E-1},
+{1.71249732E-1, 2.68166155E-1, 3.03572744E-1},
+{2.69555569E-1, 3.16100627E-1, 3.56570691E-1},
+{1.50564745E-1, 1.84190869E-1, 2.68674821E-1},
+{2.16941193E-1, 2.40813971E-1, 2.78942198E-1},
+{1.35399476E-1, 2.60586530E-1, 3.32604855E-1},
+{2.56150961E-1, 2.87822872E-1, 3.06156367E-1},
+{1.66398838E-1, 1.88721806E-1, 2.93023735E-1},
+{2.29214087E-1, 2.61565417E-1, 3.27494055E-1},
+{1.98266640E-1, 2.32970506E-1, 2.99134284E-1},
+{2.87046254E-1, 3.07103783E-1, 3.27298075E-1},
+{1.75898686E-1, 2.11898595E-1, 2.51332909E-1},
+{2.32067421E-1, 2.44622201E-1, 2.99443692E-1},
+{1.90780059E-1, 2.12090015E-1, 3.25059265E-1},
+{2.31531218E-1, 3.14166099E-1, 3.42735857E-1},
+{1.95099846E-1, 2.09554315E-1, 2.79483467E-1},
+{2.40416065E-1, 2.69604772E-1, 3.28015476E-1},
+{1.71800867E-1, 2.82233089E-1, 3.14749271E-1},
+{2.69243777E-1, 3.38462502E-1, 3.79935652E-1},
+{1.59934625E-1, 1.77966774E-1, 2.00818628E-1},
+{2.01979712E-1, 2.30668545E-1, 2.56773323E-1},
+{1.34024277E-1, 2.10961610E-1, 2.84687728E-1},
+{2.03712896E-1, 2.83053070E-1, 3.03309411E-1},
+{1.44528881E-1, 1.64728075E-1, 2.85079390E-1},
+{2.06285611E-1, 2.48649031E-1, 2.96383053E-1},
+{1.58138171E-1, 2.34317720E-1, 2.79650003E-1},
+{2.64995635E-1, 2.79900700E-1, 3.18619400E-1},
+{1.66537479E-1, 1.84279412E-1, 2.14547485E-1},
+{2.03051880E-1, 2.35110492E-1, 2.88755983E-1},
+{1.68422714E-1, 2.03946173E-1, 2.87478894E-1},
+{2.31727019E-1, 2.74086386E-1, 3.24755162E-1},
+{1.85356215E-1, 2.14113116E-1, 2.29030401E-1},
+{2.42482558E-1, 2.60655493E-1, 2.83030301E-1},
+{1.67562261E-1, 2.42027491E-1, 2.99461991E-1},
+{2.38809898E-1, 3.19003850E-1, 3.58415872E-1},
+{1.37908265E-1, 1.54787809E-1, 2.65611202E-1},
+{2.11019263E-1, 2.24607319E-1, 2.79954702E-1},
+{1.37569889E-1, 2.25128531E-1, 3.09312850E-1},
+{2.29239866E-1, 2.76150972E-1, 3.15241843E-1},
+{1.60487458E-1, 1.95461214E-1, 2.83169478E-1},
+{2.18505666E-1, 2.38197207E-1, 3.30340117E-1},
+{1.81991324E-1, 2.33026952E-1, 2.93276042E-1},
+{2.54552305E-1, 3.14394146E-1, 3.36392254E-1},
+{1.44095764E-1, 2.26640165E-1, 2.50595063E-1},
+{2.15188012E-1, 2.51417249E-1, 2.85043985E-1},
+{1.87674388E-1, 2.04458863E-1, 2.94168979E-1},
+{2.30494842E-1, 2.68452436E-1, 3.52370054E-1},
+{1.85022101E-1, 1.99075252E-1, 2.71930546E-1},
+{2.42569372E-1, 2.55389154E-1, 3.11399311E-1},
+{1.95166096E-1, 2.49102056E-1, 2.98998445E-1},
+{2.83654153E-1, 3.14600259E-1, 3.55619401E-1},
+{1.51490018E-1, 1.97729796E-1, 2.32467473E-1},
+{2.00029895E-1, 2.30101258E-1, 2.81933933E-1},
+{1.38711318E-1, 1.91816628E-1, 3.45780402E-1},
+{1.96580395E-1, 3.04714769E-1, 3.40553433E-1},
+{1.38154253E-1, 1.88543141E-1, 2.99461216E-1},
+{2.05666468E-1, 2.68904895E-1, 3.05537194E-1},
+{1.72447845E-1, 2.33558387E-1, 2.93625206E-1},
+{2.70145416E-1, 2.98654765E-1, 3.28556389E-1},
+{1.75489411E-1, 1.91361547E-1, 2.35585332E-1},
+{2.20548794E-1, 2.34773993E-1, 2.95397669E-1},
+{1.85652360E-1, 2.22349137E-1, 2.79883891E-1},
+{2.29456946E-1, 3.04546326E-1, 3.24684292E-1},
+{1.86900780E-1, 2.15469390E-1, 2.51856804E-1},
+{2.34910533E-1, 2.71217376E-1, 2.99894661E-1},
+{1.85142443E-1, 2.56071001E-1, 2.93291301E-1},
+{2.63883710E-1, 3.07127446E-1, 3.62546653E-1},
+{1.60997644E-1, 1.78937852E-1, 2.55808324E-1},
+{2.25671068E-1, 2.43735075E-1, 2.68624991E-1},
+{1.55076161E-1, 2.30396181E-1, 3.21005553E-1},
+{2.51760483E-1, 2.79653400E-1, 3.14202160E-1},
+{1.56988814E-1, 2.07466930E-1, 2.89933950E-1},
+{2.17479482E-1, 2.59626418E-1, 3.40659052E-1},
+{1.76811531E-1, 2.31087089E-1, 3.17562491E-1},
+{2.82952607E-1, 2.99844354E-1, 3.36822897E-1},
+{1.82060316E-1, 1.98734730E-1, 2.51980305E-1},
+{2.25874200E-1, 2.52469152E-1, 2.93356389E-1},
+{2.00799957E-1, 2.17786849E-1, 3.02210063E-1},
+{2.47423753E-1, 2.86882848E-1, 3.47820610E-1},
+{2.01128140E-1, 2.14746892E-1, 2.62269646E-1},
+{2.53963351E-1, 2.69477993E-1, 3.12133819E-1},
+{1.91034868E-1, 2.55738169E-1, 3.32559615E-1},
+{2.91053712E-1, 3.31458420E-1, 3.68588477E-1},
+{1.57229915E-1, 1.85374141E-1, 2.25361317E-1},
+{2.08051339E-1, 2.38350868E-1, 2.64212936E-1},
+{1.46848336E-1, 2.13000089E-1, 3.00192565E-1},
+{2.18630567E-1, 2.90263802E-1, 3.09045762E-1},
+{1.43699184E-1, 1.87815160E-1, 2.83769876E-1},
+{2.07328036E-1, 2.45088696E-1, 3.08956414E-1},
+{1.64228097E-1, 2.27826655E-1, 3.08907896E-1},
+{2.61919737E-1, 2.91333705E-1, 3.31527978E-1},
+{1.70648888E-1, 2.02157527E-1, 2.17827827E-1},
+{2.07796112E-1, 2.34704822E-1, 3.06783766E-1},
+{1.72118798E-1, 2.14057386E-1, 3.10151786E-1},
+{2.29116157E-1, 2.80949861E-1, 3.33774298E-1},
+{1.96622208E-1, 2.16653049E-1, 2.33279720E-1},
+{2.37789229E-1, 2.58971304E-1, 3.04609209E-1},
+{1.55182019E-1, 2.63032585E-1, 3.18943053E-1},
+{2.49388829E-1, 3.16970855E-1, 3.77762467E-1},
+{1.51363596E-1, 1.75010651E-1, 2.78245836E-1},
+{2.19810233E-1, 2.32360214E-1, 2.85034925E-1},
+{1.42630622E-1, 2.40602851E-1, 3.04125100E-1},
+{2.42764875E-1, 2.83762127E-1, 3.15481216E-1},
+{1.57467470E-1, 2.07524061E-1, 2.75674909E-1},
+{2.28758618E-1, 2.49092206E-1, 3.28139395E-1},
+{1.90872714E-1, 2.38125205E-1, 2.94894546E-1},
+{2.66389251E-1, 3.14321429E-1, 3.38669509E-1},
+{1.70644209E-1, 2.25980043E-1, 2.47372389E-1},
+{2.36442789E-1, 2.53003448E-1, 2.88220435E-1},
+{1.85423777E-1, 2.04888850E-1, 3.14608842E-1},
+{2.17379019E-1, 2.94553548E-1, 3.67831022E-1},
+{1.88563988E-1, 2.15174288E-1, 2.72999734E-1},
+{2.45102122E-1, 2.59770364E-1, 3.21885556E-1},
+{1.98444173E-1, 2.61160702E-1, 3.17097872E-1},
+{2.99013853E-1, 3.28965336E-1, 3.56681198E-1},
+{1.58248767E-1, 1.92205697E-1, 2.46059090E-1},
+{2.02385351E-1, 2.47965842E-1, 2.71749645E-1},
+{1.61710784E-1, 2.13708103E-1, 3.27384740E-1},
+{2.14419708E-1, 3.05552453E-1, 3.33721548E-1},
+{1.61819980E-1, 1.89897299E-1, 3.10501546E-1},
+{2.19436333E-1, 2.65029579E-1, 3.09288830E-1},
+{1.88303933E-1, 2.49633163E-1, 2.85499543E-1},
+{2.69325376E-1, 2.99807042E-1, 3.41722459E-1},
+{1.72406003E-1, 2.10977256E-1, 2.27773219E-1},
+{2.20281526E-1, 2.34015763E-1, 3.12846094E-1},
+{1.83257267E-1, 2.22061962E-1, 2.91052371E-1},
+{2.42531225E-1, 3.09527606E-1, 3.30389649E-1},
+{2.07546696E-1, 2.24662632E-1, 2.44420141E-1},
+{2.45858207E-1, 2.70285994E-1, 3.05132121E-1},
+{1.84840545E-1, 2.72096783E-1, 3.12531084E-1},
+{2.74252594E-1, 3.21252435E-1, 3.74658197E-1},
+{1.66425839E-1, 1.84491634E-1, 2.68278092E-1},
+{2.28423670E-1, 2.43025422E-1, 2.81184882E-1},
+{1.60091296E-1, 2.52953321E-1, 3.35822314E-1},
+{2.62109995E-1, 2.95581907E-1, 3.13354105E-1},
+{1.67702749E-1, 2.01536924E-1, 3.01801592E-1},
+{2.37822965E-1, 2.59894758E-1, 3.38231117E-1},
+{1.97206214E-1, 2.45490909E-1, 3.17895442E-1},
+{2.98455298E-1, 3.19209784E-1, 3.40971738E-1},
+{1.71195343E-1, 2.24327832E-1, 2.62736112E-1},
+{2.30626896E-1, 2.53310233E-1, 3.01206797E-1},
+{2.04814211E-1, 2.21881568E-1, 3.25966567E-1},
+{2.22987518E-1, 3.06339115E-1, 3.50717157E-1},
+{2.00855389E-1, 2.15359926E-1, 2.84143478E-1},
+{2.50951648E-1, 2.66189247E-1, 3.33360583E-1},
+{1.75610259E-1, 2.93791324E-1, 3.40326935E-1},
+{2.91745067E-1, 3.40602487E-1, 3.81397158E-1}};
+
+static const float evrc_lspq_full_codebook4[128][3] = {
+{2.77461529E-1, 3.16972077E-1, 3.95498335E-1},
+{3.36560428E-1, 3.60156953E-1, 3.81473005E-1},
+{3.10509324E-1, 3.31732392E-1, 3.66864383E-1},
+{3.37470949E-1, 3.96795273E-1, 4.12356317E-1},
+{2.79660404E-1, 3.66520107E-1, 3.85313451E-1},
+{3.16038966E-1, 3.85609329E-1, 4.01304781E-1},
+{3.09960425E-1, 3.43410730E-1, 4.24745500E-1},
+{3.54243636E-1, 4.08699274E-1, 4.22167957E-1},
+{2.95587242E-1, 3.33741128E-1, 3.87421668E-1},
+{3.33446383E-1, 3.86974752E-1, 4.01353061E-1},
+{3.23412836E-1, 3.65269661E-1, 3.85193288E-1},
+{3.42731953E-1, 4.03192520E-1, 4.19920385E-1},
+{2.77681828E-1, 3.82494986E-1, 4.04274166E-1},
+{3.18247974E-1, 3.95985305E-1, 4.31353152E-1},
+{3.03711414E-1, 3.80319715E-1, 4.37173545E-1},
+{3.78288805E-1, 4.07077312E-1, 4.22679126E-1},
+{2.38116503E-1, 3.42454314E-1, 4.24624741E-1},
+{3.45615685E-1, 3.68681073E-1, 4.00817335E-1},
+{3.17688107E-1, 3.41902673E-1, 4.05601799E-1},
+{3.66368949E-1, 3.89039934E-1, 4.06154454E-1},
+{2.99398005E-1, 3.52021694E-1, 3.99955690E-1},
+{3.24991941E-1, 3.90028834E-1, 4.19478714E-1},
+{3.23025763E-1, 3.68114293E-1, 4.02087748E-1},
+{3.62326264E-1, 4.16927993E-1, 4.32773650E-1},
+{2.72696435E-1, 3.59205008E-1, 4.26880658E-1},
+{3.46539855E-1, 3.69616628E-1, 4.15621221E-1},
+{3.34109128E-1, 3.55736315E-1, 3.96749556E-1},
+{3.37468982E-1, 4.10392702E-1, 4.25986826E-1},
+{2.99468994E-1, 3.80648255E-1, 4.18284118E-1},
+{3.21378171E-1, 4.11198020E-1, 4.28792536E-1},
+{3.27841163E-1, 3.69345129E-1, 4.34395611E-1},
+{3.80669057E-1, 4.26086366E-1, 4.42754567E-1},
+{2.68943667E-1, 3.42942953E-1, 3.98681462E-1},
+{3.38102877E-1, 3.76338840E-1, 3.92043173E-1},
+{3.23593497E-1, 3.48742068E-1, 3.72551978E-1},
+{3.47550809E-1, 3.92885387E-1, 4.21169937E-1},
+{3.04182827E-1, 3.59816670E-1, 3.81633341E-1},
+{3.14221382E-1, 4.02108550E-1, 4.20085251E-1},
+{3.01306546E-1, 3.62662733E-1, 4.29262817E-1},
+{3.71770263E-1, 3.98696363E-1, 4.31438982E-1},
+{2.74591267E-1, 3.35595489E-1, 4.20079648E-1},
+{3.44540834E-1, 3.90451789E-1, 4.06412065E-1},
+{3.25239837E-1, 3.78344476E-1, 3.94673288E-1},
+{3.56683493E-1, 3.90574157E-1, 4.33851063E-1},
+{2.63501287E-1, 3.95260096E-1, 4.23116386E-1},
+{3.37520659E-1, 3.92563462E-1, 4.43415821E-1},
+{3.14522266E-1, 3.80968630E-1, 4.22676384E-1},
+{3.76235068E-1, 4.17298734E-1, 4.31451261E-1},
+{2.61855006E-1, 3.68646085E-1, 4.04260576E-1},
+{3.55580151E-1, 3.77994478E-1, 3.95868242E-1},
+{3.27742815E-1, 3.53872776E-1, 4.11040604E-1},
+{3.62960637E-1, 3.99466991E-1, 4.14690197E-1},
+{3.09410870E-1, 3.73796046E-1, 3.92672479E-1},
+{3.31016302E-1, 4.00801599E-1, 4.31759298E-1},
+{3.23573053E-1, 3.68619561E-1, 4.17455137E-1},
+{3.49115849E-1, 4.26840067E-1, 4.43913996E-1},
+{2.89738595E-1, 3.63759339E-1, 4.10511792E-1},
+{3.55286479E-1, 3.89331281E-1, 4.13432419E-1},
+{3.36565912E-1, 3.60222459E-1, 4.24179018E-1},
+{3.39932680E-1, 4.09228802E-1, 4.40184891E-1},
+{3.00889730E-1, 4.00081098E-1, 4.17955697E-1},
+{3.17052066E-1, 4.22288120E-1, 4.42229569E-1},
+{3.27336788E-1, 3.84311676E-1, 4.30288613E-1},
+{3.98990929E-1, 4.29498434E-1, 4.43475187E-1},
+{2.49110118E-1, 3.25696886E-1, 4.11728263E-1},
+{3.45929205E-1, 3.68577540E-1, 3.88473272E-1},
+{3.13219666E-1, 3.39229465E-1, 3.87597919E-1},
+{3.51453960E-1, 3.98730278E-1, 4.12656188E-1},
+{2.93487132E-1, 3.75763118E-1, 3.94488096E-1},
+{3.24470758E-1, 3.94202888E-1, 4.08882737E-1},
+{3.12710822E-1, 3.57720256E-1, 4.14061189E-1},
+{3.66507173E-1, 4.08171296E-1, 4.23891425E-1},
+{2.99965680E-1, 3.31993401E-1, 4.07860160E-1},
+{3.34925175E-1, 3.86143029E-1, 4.11538124E-1},
+{3.34788024E-1, 3.66196156E-1, 3.93347144E-1},
+{3.47847939E-1, 4.05926466E-1, 4.30507302E-1},
+{2.85952926E-1, 3.95283282E-1, 4.16119337E-1},
+{3.23867381E-1, 4.06476676E-1, 4.42482829E-1},
+{3.16716671E-1, 3.84451628E-1, 4.39411044E-1},
+{3.86772931E-1, 4.11824584E-1, 4.27831531E-1},
+{2.38072395E-1, 3.62342358E-1, 4.30931687E-1},
+{3.46450031E-1, 3.79082918E-1, 4.06567812E-1},
+{3.16576600E-1, 3.56468618E-1, 3.96218300E-1},
+{3.66539180E-1, 3.89590919E-1, 4.21055555E-1},
+{3.08291376E-1, 3.71324301E-1, 4.07867432E-1},
+{3.36435199E-1, 3.91514421E-1, 4.22977090E-1},
+{3.23035538E-1, 3.80447328E-1, 4.09550190E-1},
+{3.65228057E-1, 4.27910388E-1, 4.43691254E-1},
+{2.72038043E-1, 3.76596808E-1, 4.33685899E-1},
+{3.57665777E-1, 3.77761602E-1, 4.09178972E-1},
+{3.36498559E-1, 3.64215910E-1, 4.09255505E-1},
+{3.48082423E-1, 4.17631805E-1, 4.33284521E-1},
+{3.02754521E-1, 3.95974755E-1, 4.33717251E-1},
+{3.31676304E-1, 4.17587161E-1, 4.36239839E-1},
+{3.33287597E-1, 3.80799115E-1, 4.39620733E-1},
+{3.88112009E-1, 4.36933577E-1, 4.50829268E-1},
+{2.56026626E-1, 3.48015189E-1, 4.22922611E-1},
+{3.45773995E-1, 3.81725788E-1, 3.96794081E-1},
+{3.25623751E-1, 3.50391924E-1, 3.87330651E-1},
+{3.56868088E-1, 3.98574769E-1, 4.23177242E-1},
+{3.01226199E-1, 3.86906981E-1, 4.03335571E-1},
+{3.28178406E-1, 4.02090192E-1, 4.19389248E-1},
+{3.14385355E-1, 3.69043887E-1, 4.34375286E-1},
+{3.72321129E-1, 4.11672413E-1, 4.40518737E-1},
+{2.90479720E-1, 3.48121881E-1, 4.26216483E-1},
+{3.44438791E-1, 3.82666349E-1, 4.17321086E-1},
+{3.34866822E-1, 3.76235664E-1, 4.04475212E-1},
+{3.59025359E-1, 4.04721916E-1, 4.34838414E-1},
+{2.79127955E-1, 4.11106586E-1, 4.35360551E-1},
+{3.48125517E-1, 3.98732066E-1, 4.46927428E-1},
+{3.27018857E-1, 3.90107334E-1, 4.41707492E-1},
+{3.90858352E-1, 4.19813931E-1, 4.35153484E-1},
+{2.55319297E-1, 3.70405972E-1, 4.32188630E-1},
+{3.54651988E-1, 3.88332665E-1, 4.02956128E-1},
+{3.21608186E-1, 3.54489803E-1, 4.28299785E-1},
+{3.75163496E-1, 3.98833990E-1, 4.14177418E-1},
+{3.11953604E-1, 3.91430676E-1, 4.12552476E-1},
+{3.42528820E-1, 3.96365345E-1, 4.32497382E-1},
+{3.33744347E-1, 3.76422405E-1, 4.20536995E-1},
+{3.53529096E-1, 4.29231048E-1, 4.59699273E-1},
+{2.88017929E-1, 3.77999961E-1, 4.34011698E-1},
+{3.55683446E-1, 3.80780041E-1, 4.23145533E-1},
+{3.44358265E-1, 3.72184873E-1, 4.31265354E-1},
+{3.53966117E-1, 4.14166689E-1, 4.42941308E-1},
+{3.04770231E-1, 4.12517488E-1, 4.34183121E-1},
+{3.35913360E-1, 4.24590766E-1, 4.46378469E-1},
+{3.43738198E-1, 3.84766221E-1, 4.35271382E-1},
+{4.10941303E-1, 4.40662980E-1, 4.52113390E-1}};
+
+static const float evrc_lspq_half_codebook1[128][3] = {
+{1.35226343E-2, 1.82081293E-2, 3.93940695E-2},
+{2.29392890E-2, 3.57831158E-2, 1.05352886E-1},
+{2.09106486E-2, 3.04159056E-2, 8.93941075E-2},
+{1.88909005E-2, 3.82722206E-2, 1.37820408E-1},
+{2.05143820E-2, 2.85481159E-2, 7.39762187E-2},
+{4.69510332E-2, 6.84031919E-2, 1.09123811E-1},
+{3.15557197E-2, 5.69139980E-2, 8.57057571E-2},
+{3.81181911E-2, 7.77784660E-2, 1.92532852E-1},
+{2.16297153E-2, 2.92908940E-2, 6.25042021E-2},
+{3.11414022E-2, 5.99079318E-2, 1.02860682E-1},
+{3.02799307E-2, 5.35012372E-2, 7.80925751E-2},
+{6.50846213E-2, 9.06624720E-2, 1.42850950E-1},
+{3.27340364E-2, 5.04027791E-2, 6.26492277E-2},
+{5.27439862E-2, 6.22574277E-2, 1.22198336E-1},
+{3.48840356E-2, 6.42222390E-2, 9.16024595E-2},
+{4.88984436E-2, 1.05058022E-1, 1.68813452E-1},
+{2.35791076E-2, 3.21034677E-2, 5.60899563E-2},
+{2.77252812E-2, 4.87281792E-2, 1.01224191E-1},
+{2.74348017E-2, 4.04965915E-2, 9.34926122E-2},
+{4.38360050E-2, 6.03261292E-2, 1.52400866E-1},
+{2.68994924E-2, 4.52906378E-2, 6.49800375E-2},
+{5.16058952E-2, 6.08312152E-2, 1.08799636E-1},
+{4.20064926E-2, 6.11845106E-2, 8.54474008E-2},
+{7.13502690E-2, 1.01972111E-1, 1.74640998E-1},
+{2.88906675E-2, 4.13964354E-2, 5.25928028E-2},
+{3.16364467E-2, 6.63532093E-2, 1.24950245E-1},
+{4.30289507E-2, 5.14023267E-2, 7.96877742E-2},
+{5.70970774E-2, 1.08444504E-1, 1.44075617E-1},
+{3.38840261E-2, 5.04746847E-2, 7.29765445E-2},
+{6.54265657E-2, 7.90987685E-2, 1.15570590E-1},
+{3.85423526E-2, 7.33125433E-2, 1.02307513E-1},
+{6.57824501E-2, 1.02909811E-1, 2.11874440E-1},
+{1.54727865E-2, 2.04559695E-2, 5.46121262E-2},
+{2.27950197E-2, 3.90954204E-2, 1.19443826E-1},
+{3.06889173E-2, 4.54540215E-2, 8.20418894E-2},
+{2.25957241E-2, 4.79101725E-2, 1.71844408E-1},
+{2.71088015E-2, 4.01739590E-2, 7.01922849E-2},
+{4.95789349E-2, 7.92963281E-2, 1.04862511E-1},
+{3.06095853E-2, 5.64059429E-2, 9.49584097E-2},
+{6.34224564E-2, 9.11655501E-2, 1.84724405E-1},
+{2.43342388E-2, 3.91998328E-2, 6.31406233E-2},
+{3.38011980E-2, 6.60846457E-2, 1.11031540E-1},
+{3.51784080E-2, 5.79397269E-2, 7.20702857E-2},
+{6.49054050E-2, 8.65831897E-2, 1.54648736E-1},
+{2.91934665E-2, 5.16204573E-2, 6.94437325E-2},
+{5.94522804E-2, 7.19829276E-2, 1.27434507E-1},
+{5.31888530E-2, 6.38182089E-2, 9.88218486E-2},
+{8.68290961E-2, 1.41135350E-1, 1.91728458E-1},
+{2.49991138E-2, 3.62556018E-2, 5.03724031E-2},
+{2.82246377E-2, 5.44572286E-2, 1.12663500E-1},
+{3.62618119E-2, 4.59073223E-2, 9.43343639E-2},
+{5.70455343E-2, 7.46300444E-2, 1.59157172E-1},
+{2.72987466E-2, 4.56625856E-2, 7.52529651E-2},
+{5.12860194E-2, 8.51126984E-2, 1.23587973E-1},
+{4.91451994E-2, 5.93483113E-2, 9.22686011E-2},
+{7.06961900E-2, 1.05451979E-1, 1.92602143E-1},
+{2.80733760E-2, 4.18509208E-2, 5.87159805E-2},
+{4.64449003E-2, 7.06698820E-2, 1.26038432E-1},
+{4.18453738E-2, 6.30445331E-2, 7.66169876E-2},
+{8.42416435E-2, 1.13282882E-1, 1.43687114E-1},
+{4.17615622E-2, 5.59472926E-2, 7.09872842E-2},
+{5.55161387E-2, 9.50126722E-2, 1.27727196E-1},
+{5.90935498E-2, 7.36730024E-2, 9.65935886E-2},
+{7.84136653E-2, 1.41432360E-1, 2.17428640E-1},
+{2.10490543E-2, 2.91891042E-2, 4.60035764E-2},
+{3.64863276E-2, 4.62387018E-2, 1.07044168E-1},
+{2.68652122E-2, 3.92937548E-2, 8.41179937E-2},
+{2.72903945E-2, 5.53805046E-2, 1.41586170E-1},
+{2.48476695E-2, 3.63277681E-2, 7.62430876E-2},
+{5.25430813E-2, 7.75778666E-2, 1.14567965E-1},
+{4.07741442E-2, 5.39923795E-2, 9.07640457E-2},
+{5.73043302E-2, 7.65803084E-2, 1.79578975E-1},
+{2.46032421E-2, 3.41408364E-2, 6.78990781E-2},
+{4.08220068E-2, 6.29783794E-2, 9.95191261E-2},
+{3.83025035E-2, 5.52857481E-2, 7.90019333E-2},
+{7.24111274E-2, 1.01903863E-1, 1.46979645E-1},
+{3.73902172E-2, 4.70463894E-2, 6.54684529E-2},
+{5.27397543E-2, 6.72770366E-2, 1.39680430E-1},
+{4.05365378E-2, 7.05081299E-2, 9.25668627E-2},
+{4.43425253E-2, 1.10367171E-1, 1.99636266E-1},
+{2.54920740E-2, 3.47603969E-2, 6.05902039E-2},
+{4.35465500E-2, 5.32369502E-2, 1.08325966E-1},
+{2.79599819E-2, 4.91324775E-2, 8.84284526E-2},
+{4.98051867E-2, 8.81728902E-2, 1.52597323E-1},
+{3.19346264E-2, 4.62169312E-2, 6.85206428E-2},
+{5.80246300E-2, 6.84268698E-2, 1.15085281E-1},
+{4.33904678E-2, 6.90575615E-2, 8.44984353E-2},
+{7.39691556E-2, 1.19240515E-1, 1.77340195E-1},
+{3.18767503E-2, 4.59697433E-2, 5.72372638E-2},
+{4.50873822E-2, 5.66509366E-2, 1.32005826E-1},
+{4.59097028E-2, 5.45580424E-2, 8.61423314E-2},
+{7.44685754E-2, 1.13815404E-1, 1.61570594E-1},
+{3.97509560E-2, 4.95359488E-2, 7.22542256E-2},
+{6.76257759E-2, 8.31029043E-2, 1.27990112E-1},
+{5.76258078E-2, 6.95326403E-2, 1.05012968E-1},
+{6.85313493E-2, 1.21758826E-1, 2.20626548E-1},
+{2.18480472E-2, 2.99130920E-2, 5.16208000E-2},
+{3.64343151E-2, 4.91795056E-2, 1.23277210E-1},
+{3.89611274E-2, 4.76634987E-2, 8.61716568E-2},
+{4.14635167E-2, 6.88006952E-2, 1.69356152E-1},
+{3.35514620E-2, 4.17815186E-2, 7.37159401E-2},
+{5.80224693E-2, 8.70314166E-2, 1.12917498E-1},
+{4.80243117E-2, 5.69486506E-2, 1.00755706E-1},
+{5.98873124E-2, 8.57942328E-2, 2.01388851E-1},
+{2.99309995E-2, 3.94828431E-2, 6.46376088E-2},
+{3.88626605E-2, 8.07443634E-2, 1.15519784E-1},
+{3.49444002E-2, 6.28911033E-2, 8.04982036E-2},
+{6.88817874E-2, 9.92431119E-2, 1.60393253E-1},
+{3.64237651E-2, 5.34016453E-2, 6.70152009E-2},
+{5.83492741E-2, 7.85285756E-2, 1.41746715E-1},
+{4.86469641E-2, 7.26736858E-2, 9.48315859E-2},
+{5.85533604E-2, 1.36289746E-1, 1.98639736E-1},
+{2.60888506E-2, 3.73406820E-2, 5.57853170E-2},
+{4.58504409E-2, 5.60512505E-2, 1.17927872E-1},
+{4.28801328E-2, 5.14739119E-2, 9.75309014E-2},
+{6.37611598E-2, 8.73552933E-2, 1.68334916E-1},
+{3.76709923E-2, 4.58216034E-2, 7.86528140E-2},
+{6.75194561E-2, 8.98697898E-2, 1.19418114E-1},
+{5.46374246E-2, 6.66805878E-2, 8.93813819E-2},
+{7.73086548E-2, 1.21754415E-1, 1.99579224E-1},
+{3.15621309E-2, 4.51702215E-2, 6.25768527E-2},
+{3.78782675E-2, 8.03486481E-2, 1.38961688E-1},
+{5.08303270E-2, 6.18740581E-2, 8.31153840E-2},
+{8.96311402E-2, 1.28753766E-1, 1.64891586E-1},
+{4.73503470E-2, 5.75724356E-2, 7.65264630E-2},
+{7.16898590E-2, 9.89895687E-2, 1.30078360E-1},
+{6.29082546E-2, 7.90778771E-2, 1.05111063E-1},
+{8.80649835E-2, 1.65206164E-1, 2.13214174E-1}};
+
+static const float evrc_lspq_half_codebook2[128][3] = {
+{9.75915268E-2, 1.23701490E-1, 1.69437975E-1},
+{9.49536338E-2, 2.01081768E-1, 2.26855248E-1},
+{9.00496617E-2, 1.49164870E-1, 2.26532787E-1},
+{1.70302704E-1, 1.97222874E-1, 2.49974832E-1},
+{1.08773641E-1, 1.51972428E-1, 1.75123364E-1},
+{1.30278930E-1, 2.13229164E-1, 2.29646355E-1},
+{1.24917991E-1, 1.87347755E-1, 2.04712003E-1},
+{2.00670198E-1, 2.28963569E-1, 2.69420803E-1},
+{8.98375586E-2, 1.25332758E-1, 2.10539430E-1},
+{9.62376669E-2, 2.07185850E-1, 2.54174471E-1},
+{1.05694629E-1, 1.78856418E-1, 2.00121015E-1},
+{1.56048968E-1, 2.19573721E-1, 2.91079402E-1},
+{1.37392268E-1, 1.59993336E-1, 1.94698542E-1},
+{1.07262500E-1, 2.37790957E-1, 2.70740807E-1},
+{1.42976448E-1, 2.01550499E-1, 2.18468934E-1},
+{2.14270487E-1, 2.71881402E-1, 3.01200211E-1},
+{1.10729210E-1, 1.33688226E-1, 1.54877156E-1},
+{1.06667660E-1, 1.76678821E-1, 2.62798905E-1},
+{9.16352943E-2, 1.74592838E-1, 2.19329327E-1},
+{1.84038624E-1, 2.27964059E-1, 2.47762203E-1},
+{1.10572360E-1, 1.58207163E-1, 1.96013063E-1},
+{1.33543387E-1, 2.32269660E-1, 2.51828164E-1},
+{1.55922309E-1, 1.77941337E-1, 2.18096644E-1},
+{1.92260072E-1, 2.49512479E-1, 2.89911509E-1},
+{1.13708906E-1, 1.37872443E-1, 2.02929884E-1},
+{1.02557532E-1, 1.84820071E-1, 2.92164624E-1},
+{1.36595622E-1, 1.58687428E-1, 2.41399556E-1},
+{1.72813818E-1, 2.49303415E-1, 3.00458610E-1},
+{1.36871174E-1, 1.57249823E-1, 2.10913152E-1},
+{1.28974810E-1, 2.45167866E-1, 2.67653584E-1},
+{1.66812256E-1, 1.88998029E-1, 2.31345922E-1},
+{2.32248470E-1, 2.63196051E-1, 3.16754937E-1},
+{9.24560949E-2, 1.19977452E-1, 1.91262275E-1},
+{1.13085262E-1, 2.08461538E-1, 2.29368120E-1},
+{1.00716405E-1, 1.40670076E-1, 2.58062959E-1},
+{1.67010382E-1, 2.18105540E-1, 2.62592494E-1},
+{1.25487238E-1, 1.62686959E-1, 1.84409231E-1},
+{1.52406558E-1, 2.07131729E-1, 2.47582436E-1},
+{1.37441203E-1, 1.80262372E-1, 2.17698842E-1},
+{2.07853511E-1, 2.49209508E-1, 2.69830108E-1},
+{9.35257301E-2, 1.49197355E-1, 2.04652041E-1},
+{1.11997180E-1, 2.25233063E-1, 2.47003049E-1},
+{1.09315015E-1, 1.93811879E-1, 2.13802189E-1},
+{1.75118580E-1, 2.52520263E-1, 2.75082767E-1},
+{1.36918738E-1, 1.77440569E-1, 1.97931141E-1},
+{1.36811242E-1, 2.37426177E-1, 2.84737825E-1},
+{1.60759792E-1, 2.00833157E-1, 2.18084484E-1},
+{2.33710244E-1, 2.66372561E-1, 2.91802049E-1},
+{1.19171090E-1, 1.39703169E-1, 1.87723249E-1},
+{1.31049946E-1, 1.93696663E-1, 2.60426998E-1},
+{1.08267047E-1, 1.65194795E-1, 2.39523023E-1},
+{2.03195021E-1, 2.25942209E-1, 2.49403238E-1},
+{1.23842932E-1, 1.45794615E-1, 2.15635628E-1},
+{1.71226338E-1, 2.38054529E-1, 2.57975638E-1},
+{1.66923836E-1, 1.88604668E-1, 2.11124212E-1},
+{2.10620746E-1, 2.62442708E-1, 2.83127964E-1},
+{1.05748810E-1, 1.36286482E-1, 2.20050186E-1},
+{9.72945765E-2, 2.33471528E-1, 2.96113968E-1},
+{1.34298369E-1, 1.93955436E-1, 2.39148825E-1},
+{1.64229318E-1, 2.70067751E-1, 2.94142485E-1},
+{1.42760262E-1, 1.65033355E-1, 2.24100381E-1},
+{1.46414533E-1, 2.47942328E-1, 3.00708115E-1},
+{1.74778774E-1, 2.19349250E-1, 2.38162965E-1},
+{2.36311123E-1, 2.90669680E-1, 3.28010976E-1},
+{1.14076428E-1, 1.33071408E-1, 1.73181504E-1},
+{1.13575839E-1, 1.90307274E-1, 2.41681188E-1},
+{8.59165266E-2, 1.63920239E-1, 2.37934500E-1},
+{1.92916945E-1, 2.15082392E-1, 2.39128128E-1},
+{1.37291834E-1, 1.59423307E-1, 1.79722220E-1},
+{1.40435383E-1, 2.22092256E-1, 2.40960747E-1},
+{1.40387163E-1, 1.89601168E-1, 2.05635697E-1},
+{2.11695507E-1, 2.36578360E-1, 2.81248927E-1},
+{9.03010592E-2, 1.27157405E-1, 2.33567923E-1},
+{1.10118054E-1, 2.09328398E-1, 2.72836268E-1},
+{1.16710417E-1, 1.77853987E-1, 2.22808748E-1},
+{1.81691542E-1, 2.32265159E-1, 2.74991214E-1},
+{1.46553472E-1, 1.69474706E-1, 1.90245956E-1},
+{1.09213792E-1, 2.63291955E-1, 2.88490772E-1},
+{1.49815127E-1, 2.11342707E-1, 2.28899449E-1},
+{1.97645500E-1, 2.83229947E-1, 3.14882278E-1},
+{1.24495603E-1, 1.46097973E-1, 1.66125208E-1},
+{1.34878591E-1, 1.83030054E-1, 2.89288282E-1},
+{9.33032110E-2, 1.83962211E-1, 2.38543004E-1},
+{1.92844257E-1, 2.39588335E-1, 2.58421540E-1},
+{1.23796798E-1, 1.65556595E-1, 2.08408386E-1},
+{1.51144341E-1, 2.35801116E-1, 2.59280622E-1},
+{1.50657728E-1, 1.90052524E-1, 2.28362590E-1},
+{1.98180959E-1, 2.56794214E-1, 3.08975637E-1},
+{1.28490031E-1, 1.49084017E-1, 1.98376507E-1},
+{9.20595750E-2, 2.12231293E-1, 2.92948842E-1},
+{1.41698137E-1, 1.72356680E-1, 2.58454144E-1},
+{1.96733460E-1, 2.29709730E-1, 2.95780182E-1},
+{1.47062227E-1, 1.68918088E-1, 2.07363635E-1},
+{1.36309877E-1, 2.60373056E-1, 2.82607377E-1},
+{1.81041077E-1, 2.01826140E-1, 2.38867551E-1},
+{2.45326266E-1, 2.80183077E-1, 3.11954319E-1},
+{1.04131766E-1, 1.33040652E-1, 1.89834684E-1},
+{1.23298146E-1, 2.09621087E-1, 2.47813210E-1},
+{1.24040775E-1, 1.59827366E-1, 2.58856058E-1},
+{1.87048867E-1, 2.12488100E-1, 2.59629130E-1},
+{1.24255307E-1, 1.73768952E-1, 1.92850024E-1},
+{1.58917829E-1, 2.25389823E-1, 2.43284762E-1},
+{1.53421149E-1, 1.91807315E-1, 2.09249526E-1},
+{2.27154449E-1, 2.51181155E-1, 2.72600353E-1},
+{1.09922059E-1, 1.57100275E-1, 2.20024973E-1},
+{1.32782355E-1, 2.19485506E-1, 2.67028928E-1},
+{1.26857504E-1, 1.98836312E-1, 2.17928499E-1},
+{1.91415027E-1, 2.52424240E-1, 2.72652745E-1},
+{1.55277625E-1, 1.79573521E-1, 2.00773627E-1},
+{1.17547743E-1, 2.47869864E-1, 3.08279335E-1},
+{1.65706977E-1, 2.10339502E-1, 2.29199320E-1},
+{2.25694910E-1, 2.84438193E-1, 3.12106073E-1},
+{1.29503176E-1, 1.48420051E-1, 1.80180401E-1},
+{1.54752508E-1, 1.97748467E-1, 2.67275035E-1},
+{1.28590241E-1, 1.76178381E-1, 2.39905864E-1},
+{2.14926764E-1, 2.37634435E-1, 2.58794010E-1},
+{1.28322318E-1, 1.59338519E-1, 2.26626605E-1},
+{1.55747548E-1, 2.47740522E-1, 2.73726821E-1},
+{1.75741687E-1, 1.97952345E-1, 2.19115943E-1},
+{2.18626365E-1, 2.45809183E-1, 3.00479650E-1},
+{1.17709018E-1, 1.45512864E-1, 2.38044471E-1},
+{1.18006893E-1, 2.23775521E-1, 2.94175088E-1},
+{1.51349202E-1, 1.88157812E-1, 2.48743281E-1},
+{1.89312205E-1, 2.69580543E-1, 2.93785989E-1},
+{1.49895594E-1, 1.74537256E-1, 2.37430006E-1},
+{1.39775530E-1, 2.71709383E-1, 3.07839513E-1},
+{1.83945730E-1, 2.07717165E-1, 2.26722151E-1},
+{2.54552156E-1, 2.96640933E-1, 3.24801445E-1}};
+
+static const float evrc_lspq_half_codebook3[256][4] = {
+{2.36904725E-1, 2.56104350E-1, 3.16955745E-1, 4.07520533E-1},
+{2.97596931E-1, 3.23482454E-1, 3.47667515E-1, 3.74551237E-1},
+{2.73721159E-1, 2.98297524E-1, 3.29923928E-1, 3.83599102E-1},
+{3.07849586E-1, 3.32836270E-1, 3.89340341E-1, 4.05575991E-1},
+{2.33803615E-1, 2.60296524E-1, 3.67351949E-1, 4.04388249E-1},
+{2.97513664E-1, 3.15356553E-1, 3.85135233E-1, 4.02197123E-1},
+{2.85618782E-1, 3.10872793E-1, 3.65022361E-1, 3.84816766E-1},
+{3.35271597E-1, 3.55222225E-1, 3.81921113E-1, 3.98685753E-1},
+{2.00265601E-1, 2.50502288E-1, 3.70398223E-1, 4.32012677E-1},
+{3.07982087E-1, 3.33767712E-1, 3.58199060E-1, 3.78386796E-1},
+{2.60086119E-1, 3.25520277E-1, 3.56873333E-1, 3.84737790E-1},
+{3.01356375E-1, 3.41369390E-1, 4.00296748E-1, 4.17337179E-1},
+{2.67080963E-1, 2.97674358E-1, 3.69702041E-1, 3.89139235E-1},
+{2.72669852E-1, 3.49704087E-1, 3.91925275E-1, 4.06383276E-1},
+{2.52825916E-1, 3.49636555E-1, 3.84550989E-1, 4.05930996E-1},
+{3.42927098E-1, 3.74274015E-1, 4.05468166E-1, 4.20351923E-1},
+{2.52408743E-1, 2.80375838E-1, 3.21436584E-1, 3.88436913E-1},
+{2.96970189E-1, 3.17173600E-1, 3.65342557E-1, 4.02736843E-1},
+{2.81905174E-1, 3.01479161E-1, 3.34335625E-1, 4.07633483E-1},
+{3.26872945E-1, 3.47177684E-1, 3.75017703E-1, 4.05372381E-1},
+{2.36371145E-1, 3.16441059E-1, 3.48707020E-1, 3.82030427E-1},
+{2.87817597E-1, 3.13627005E-1, 4.05129731E-1, 4.23379660E-1},
+{2.77502477E-1, 3.01843822E-1, 3.72250855E-1, 4.19212818E-1},
+{3.28988850E-1, 3.61901104E-1, 4.02015507E-1, 4.19229805E-1},
+{2.24960461E-1, 2.74636388E-1, 3.77016127E-1, 3.94726515E-1},
+{3.01045477E-1, 3.40486169E-1, 3.74888122E-1, 4.02532160E-1},
+{2.59897947E-1, 3.30334961E-1, 3.57493818E-1, 4.08657968E-1},
+{3.00961852E-1, 3.56449068E-1, 4.04779494E-1, 4.22508955E-1},
+{2.20979586E-1, 3.16477656E-1, 4.01744068E-1, 4.20735776E-1},
+{2.79754996E-1, 3.30776095E-1, 4.11152899E-1, 4.32687044E-1},
+{2.64246881E-1, 3.16610634E-1, 3.83876741E-1, 4.36683774E-1},
+{3.44381154E-1, 3.85365665E-1, 4.24949467E-1, 4.41560209E-1},
+{2.19488308E-1, 2.36459881E-1, 3.42465997E-1, 4.24989998E-1},
+{2.91465104E-1, 3.22282016E-1, 3.72852802E-1, 3.91635895E-1},
+{2.74792433E-1, 3.16536307E-1, 3.45392585E-1, 3.74555230E-1},
+{3.10583472E-1, 3.35264921E-1, 3.87527227E-1, 4.23076212E-1},
+{2.23211512E-1, 2.98497617E-1, 3.68426204E-1, 3.90213728E-1},
+{2.89078832E-1, 3.26512754E-1, 3.76308680E-1, 4.09553707E-1},
+{2.63830125E-1, 3.08977246E-1, 3.81453037E-1, 4.04660761E-1},
+{3.47073615E-1, 3.64797831E-1, 3.86763453E-1, 4.04511690E-1},
+{2.18452707E-1, 2.75614083E-1, 3.62711072E-1, 4.18278992E-1},
+{3.15042794E-1, 3.40813220E-1, 3.78627181E-1, 3.96316767E-1},
+{2.79727697E-1, 3.31259727E-1, 3.60061288E-1, 3.81175518E-1},
+{3.18602443E-1, 3.38044286E-1, 4.09010768E-1, 4.30300415E-1},
+{2.64196932E-1, 2.90672481E-1, 3.68595004E-1, 4.31856751E-1},
+{2.72645593E-1, 3.63514841E-1, 3.96518826E-1, 4.20091212E-1},
+{2.26540968E-1, 3.50055099E-1, 3.93851519E-1, 4.12597001E-1},
+{3.53053868E-1, 3.69929552E-1, 4.09656048E-1, 4.26387310E-1},
+{2.60788381E-1, 2.85172462E-1, 3.45943332E-1, 3.97500694E-1},
+{3.01113129E-1, 3.28201890E-1, 3.56068015E-1, 4.10803795E-1},
+{2.88101614E-1, 3.09559643E-1, 3.43756795E-1, 4.24872875E-1},
+{3.10489357E-1, 3.51421893E-1, 3.93717408E-1, 4.15550530E-1},
+{2.22308263E-1, 3.26798201E-1, 3.77981663E-1, 3.98635030E-1},
+{3.02915514E-1, 3.22781920E-1, 3.98558855E-1, 4.25489604E-1},
+{2.77136803E-1, 3.19992602E-1, 3.77490878E-1, 4.29177463E-1},
+{3.38731766E-1, 3.58164370E-1, 4.08386350E-1, 4.25495386E-1},
+{2.18726233E-1, 2.84384966E-1, 3.94053698E-1, 4.16346967E-1},
+{3.01005960E-1, 3.44093680E-1, 3.69013667E-1, 4.15091276E-1},
+{2.80783713E-1, 3.33053648E-1, 3.76726151E-1, 3.97526860E-1},
+{3.14394057E-1, 3.62678826E-1, 4.23668981E-1, 4.41899240E-1},
+{2.66453624E-1, 3.08513761E-1, 3.97407174E-1, 4.17450190E-1},
+{2.94222653E-1, 3.41904402E-1, 4.12726879E-1, 4.34888899E-1},
+{2.87300706E-1, 3.32434595E-1, 3.78856659E-1, 4.38234031E-1},
+{3.57146621E-1, 3.98147047E-1, 4.29875731E-1, 4.44243908E-1},
+{2.29671344E-1, 2.51018614E-1, 3.41046572E-1, 4.04376328E-1},
+{2.94472575E-1, 3.34944606E-1, 3.60409737E-1, 3.83682847E-1},
+{2.88250983E-1, 3.11722696E-1, 3.31680059E-1, 3.65104675E-1},
+{3.24881613E-1, 3.45656693E-1, 3.88306379E-1, 4.05954897E-1},
+{2.50829220E-1, 2.77623534E-1, 3.70799541E-1, 3.90479207E-1},
+{2.93523371E-1, 3.28319192E-1, 3.92112255E-1, 4.09464061E-1},
+{2.83608794E-1, 3.03885639E-1, 3.78504395E-1, 3.97310555E-1},
+{3.34039807E-1, 3.52837384E-1, 3.97272944E-1, 4.14322019E-1},
+{2.21891895E-1, 2.51877457E-1, 3.71723533E-1, 4.31791008E-1},
+{3.13201427E-1, 3.41175437E-1, 3.65503550E-1, 3.88567209E-1},
+{2.71330535E-1, 3.39163721E-1, 3.62616420E-1, 3.95735979E-1},
+{3.07550132E-1, 3.47777665E-1, 4.01049614E-1, 4.32767451E-1},
+{2.59387434E-1, 2.87243843E-1, 3.86817336E-1, 4.06042695E-1},
+{2.85485208E-1, 3.44094992E-1, 4.02050495E-1, 4.19413745E-1},
+{2.65781403E-1, 3.40084374E-1, 3.69407654E-1, 4.27031696E-1},
+{3.53740931E-1, 3.84463251E-1, 4.11747813E-1, 4.26181793E-1},
+{2.43866488E-1, 2.68350184E-1, 3.42201948E-1, 3.98457229E-1},
+{2.93145239E-1, 3.34754169E-1, 3.61702800E-1, 3.98416638E-1},
+{2.91342974E-1, 3.13155174E-1, 3.36525917E-1, 3.87748599E-1},
+{3.05656791E-1, 3.62904549E-1, 3.88153434E-1, 4.05543149E-1},
+{2.17492327E-1, 3.11723530E-1, 3.75984788E-1, 4.28997755E-1},
+{2.91149259E-1, 3.29380929E-1, 4.03900385E-1, 4.22333181E-1},
+{2.90362060E-1, 3.09530973E-1, 3.78994226E-1, 4.13688362E-1},
+{3.29564869E-1, 3.77404690E-1, 4.06584859E-1, 4.24739718E-1},
+{2.46461585E-1, 2.71593273E-1, 3.66338253E-1, 4.30753767E-1},
+{3.14107716E-1, 3.37011874E-1, 3.80409718E-1, 4.11099434E-1},
+{2.76568413E-1, 3.27320695E-1, 3.58844280E-1, 4.28949475E-1},
+{3.17179084E-1, 3.58972430E-1, 4.04765844E-1, 4.40376341E-1},
+{2.42777750E-1, 3.34954798E-1, 3.96943450E-1, 4.13318396E-1},
+{2.88895488E-1, 3.25691164E-1, 4.22859550E-1, 4.43758667E-1},
+{2.77583301E-1, 3.25479031E-1, 3.89144659E-1, 4.41075861E-1},
+{3.59125674E-1, 3.90694141E-1, 4.21009541E-1, 4.35708523E-1},
+{2.20172390E-1, 2.47719273E-1, 3.54381859E-1, 4.25398111E-1},
+{3.06046784E-1, 3.27924728E-1, 3.66992772E-1, 3.93192589E-1},
+{2.70805597E-1, 3.16826642E-1, 3.45648706E-1, 4.11717594E-1},
+{3.23188901E-1, 3.45463097E-1, 3.89778793E-1, 4.21570778E-1},
+{2.46136114E-1, 3.12391996E-1, 3.72188628E-1, 3.95842731E-1},
+{3.03856730E-1, 3.24354768E-1, 3.85747254E-1, 4.14155006E-1},
+{2.81075418E-1, 3.18608463E-1, 3.85646880E-1, 4.02703643E-1},
+{3.53517115E-1, 3.72702539E-1, 3.96264613E-1, 4.13074911E-1},
+{2.09221140E-1, 2.95262218E-1, 3.80314291E-1, 4.31278229E-1},
+{3.25313628E-1, 3.46735477E-1, 3.70724022E-1, 3.91045630E-1},
+{2.86396503E-1, 3.43560040E-1, 3.69713604E-1, 3.89867842E-1},
+{3.27794671E-1, 3.47367823E-1, 4.05465066E-1, 4.24566150E-1},
+{2.53054976E-1, 3.02656293E-1, 3.82165134E-1, 4.29898322E-1},
+{2.94418454E-1, 3.70745420E-1, 3.95443261E-1, 4.19514775E-1},
+{2.62873113E-1, 3.45069230E-1, 4.04140890E-1, 4.21902061E-1},
+{3.65063488E-1, 3.82435143E-1, 4.13424790E-1, 4.31241691E-1},
+{2.48788506E-1, 2.82372773E-1, 3.65772307E-1, 4.10981059E-1},
+{3.07288766E-1, 3.27828944E-1, 3.77664983E-1, 4.36220944E-1},
+{2.98542321E-1, 3.20627332E-1, 3.50569665E-1, 4.27620232E-1},
+{3.16258013E-1, 3.62903833E-1, 3.88225138E-1, 4.25608873E-1},
+{2.39077866E-1, 3.31310451E-1, 3.70317876E-1, 4.15995896E-1},
+{3.03735793E-1, 3.32806051E-1, 4.10232842E-1, 4.27751064E-1},
+{2.96002507E-1, 3.19014788E-1, 3.81062448E-1, 4.26954985E-1},
+{3.32508922E-1, 3.62516999E-1, 4.23315108E-1, 4.40995157E-1},
+{2.35128701E-1, 2.74731100E-1, 4.12070572E-1, 4.35478806E-1},
+{2.98073769E-1, 3.55338752E-1, 3.79087746E-1, 4.15318787E-1},
+{2.83429801E-1, 3.45264912E-1, 3.70376289E-1, 4.09900844E-1},
+{3.23593080E-1, 3.65412831E-1, 4.12813127E-1, 4.31023479E-1},
+{2.76626348E-1, 3.00508440E-1, 4.02236879E-1, 4.26638782E-1},
+{2.94512928E-1, 3.61443222E-1, 4.19635236E-1, 4.36999202E-1},
+{2.90807247E-1, 3.41689348E-1, 3.92779291E-1, 4.43490267E-1},
+{3.59391451E-1, 4.03985143E-1, 4.40843761E-1, 4.53028619E-1},
+{2.23295465E-1, 2.39192486E-1, 3.23768020E-1, 4.21689451E-1},
+{2.94778049E-1, 3.18798721E-1, 3.53217840E-1, 3.91906381E-1},
+{2.59032130E-1, 3.10240507E-1, 3.43569040E-1, 3.95064235E-1},
+{3.16474676E-1, 3.38544369E-1, 3.93329024E-1, 4.12235558E-1},
+{2.40108207E-1, 2.84631193E-1, 3.60280991E-1, 3.79973769E-1},
+{2.96909094E-1, 3.15798342E-1, 3.94964337E-1, 4.15127575E-1},
+{2.85434067E-1, 3.04921508E-1, 3.61974716E-1, 4.05767262E-1},
+{3.37407053E-1, 3.56672168E-1, 3.85155082E-1, 4.11186695E-1},
+{2.24014923E-1, 2.60116160E-1, 3.94772530E-1, 4.19585884E-1},
+{3.00647914E-1, 3.41640651E-1, 3.70223522E-1, 3.89520049E-1},
+{2.65946031E-1, 3.25039148E-1, 3.74339938E-1, 3.92346144E-1},
+{3.16029310E-1, 3.40491295E-1, 4.02355313E-1, 4.20484245E-1},
+{2.69841492E-1, 2.94562399E-1, 3.62341762E-1, 4.06415462E-1},
+{2.78897285E-1, 3.59831035E-1, 3.82025838E-1, 4.10577476E-1},
+{2.60760844E-1, 3.31088543E-1, 3.88826251E-1, 4.05486643E-1},
+{3.43372285E-1, 3.82647038E-1, 4.14716601E-1, 4.31592941E-1},
+{2.47998103E-1, 2.73393154E-1, 3.31160426E-1, 4.18943226E-1},
+{3.03579569E-1, 3.25202465E-1, 3.70984435E-1, 4.14420485E-1},
+{2.76896894E-1, 3.00499499E-1, 3.54178190E-1, 4.28807020E-1},
+{3.23655546E-1, 3.59816968E-1, 3.89525414E-1, 4.09288704E-1},
+{2.38927796E-1, 3.09919238E-1, 3.53915572E-1, 4.16634321E-1},
+{2.81171739E-1, 3.07520270E-1, 4.16264892E-1, 4.38523829E-1},
+{2.88858652E-1, 3.09810817E-1, 3.67845178E-1, 4.36035573E-1},
+{3.38423491E-1, 3.70634377E-1, 4.15449977E-1, 4.31534529E-1},
+{2.41260394E-1, 2.73617864E-1, 3.89554620E-1, 4.12539542E-1},
+{2.98046708E-1, 3.40122104E-1, 3.86183739E-1, 4.13826346E-1},
+{2.82436430E-1, 3.31597507E-1, 3.57941389E-1, 4.12115216E-1},
+{3.03820193E-1, 3.70588601E-1, 4.05774951E-1, 4.31517065E-1},
+{2.39077732E-1, 3.11638474E-1, 4.13935781E-1, 4.35304046E-1},
+{2.67116845E-1, 3.41937900E-1, 4.17409420E-1, 4.39184844E-1},
+{2.67946839E-1, 3.33343923E-1, 3.86481404E-1, 4.37462509E-1},
+{3.40510964E-1, 3.90878022E-1, 4.35485125E-1, 4.49101925E-1},
+{2.10069850E-1, 2.32524484E-1, 3.61781418E-1, 4.31357861E-1},
+{2.94509888E-1, 3.33709776E-1, 3.82278621E-1, 3.98638904E-1},
+{2.80525148E-1, 3.25905204E-1, 3.50647032E-1, 3.92873943E-1},
+{3.19999635E-1, 3.43674660E-1, 3.91070545E-1, 4.37501073E-1},
+{2.20581010E-1, 3.03151906E-1, 3.81765544E-1, 4.04488146E-1},
+{2.86122739E-1, 3.29746544E-1, 3.88102829E-1, 4.24247742E-1},
+{2.69807100E-1, 3.25332284E-1, 3.79154503E-1, 4.15138245E-1},
+{3.34858894E-1, 3.69258404E-1, 3.94743145E-1, 4.11922157E-1},
+{2.07109794E-1, 2.72779524E-1, 3.78566444E-1, 4.34579968E-1},
+{3.06466222E-1, 3.46695721E-1, 3.87138307E-1, 4.03558314E-1},
+{2.70148575E-1, 3.46654534E-1, 3.77696693E-1, 3.96434486E-1},
+{3.18745911E-1, 3.40225697E-1, 4.14991558E-1, 4.41578746E-1},
+{2.58592844E-1, 3.14370096E-1, 3.65083754E-1, 4.21615183E-1},
+{2.82712996E-1, 3.54137123E-1, 4.06745970E-1, 4.29267883E-1},
+{2.52021760E-1, 3.59105110E-1, 3.95102918E-1, 4.18148398E-1},
+{3.54906201E-1, 3.74952912E-1, 4.18965995E-1, 4.36144412E-1},
+{2.64841139E-1, 2.92941809E-1, 3.27751458E-1, 4.08790469E-1},
+{3.07774246E-1, 3.35586190E-1, 3.62209618E-1, 4.25394237E-1},
+{2.88466334E-1, 3.16075742E-1, 3.60989630E-1, 4.19551432E-1},
+{3.17128420E-1, 3.55772197E-1, 4.05808747E-1, 4.23972964E-1},
+{2.47089684E-1, 3.38184595E-1, 3.71859610E-1, 3.95971477E-1},
+{3.07981730E-1, 3.32691789E-1, 4.00534213E-1, 4.38273668E-1},
+{2.79484808E-1, 3.16183507E-1, 3.97237718E-1, 4.34746623E-1},
+{3.44490469E-1, 3.66153181E-1, 4.10959423E-1, 4.41727102E-1},
+{2.35741779E-1, 2.94587255E-1, 3.98072541E-1, 4.16833401E-1},
+{3.14038455E-1, 3.52272034E-1, 3.79138887E-1, 4.10969079E-1},
+{2.83002496E-1, 3.38136256E-1, 3.88641894E-1, 4.06193316E-1},
+{3.23625326E-1, 3.50243390E-1, 4.28089559E-1, 4.46630359E-1},
+{2.61252105E-1, 3.24970961E-1, 4.00214493E-1, 4.25321758E-1},
+{3.05284500E-1, 3.42164159E-1, 4.24475133E-1, 4.43830967E-1},
+{2.87374794E-1, 3.32500637E-1, 3.94308269E-1, 4.42538500E-1},
+{3.74075353E-1, 4.02026355E-1, 4.30933535E-1, 4.44160044E-1},
+{2.34503999E-1, 2.56218612E-1, 3.41238797E-1, 4.23045278E-1},
+{3.05492580E-1, 3.29156995E-1, 3.52709830E-1, 3.92439067E-1},
+{2.81323552E-1, 3.03292334E-1, 3.48925412E-1, 3.93163860E-1},
+{3.21893454E-1, 3.50419939E-1, 3.97317469E-1, 4.14560318E-1},
+{2.39684582E-1, 2.92451501E-1, 3.78937423E-1, 3.96535456E-1},
+{3.07307243E-1, 3.29127908E-1, 3.98455560E-1, 4.16143298E-1},
+{2.85274565E-1, 3.08774531E-1, 3.92916501E-1, 4.14437652E-1},
+{3.44446361E-1, 3.62201869E-1, 3.97619784E-1, 4.17743623E-1},
+{2.32083067E-1, 2.67807961E-1, 3.78075659E-1, 4.34560895E-1},
+{3.04738700E-1, 3.51865292E-1, 3.75973165E-1, 3.95293653E-1},
+{2.61990905E-1, 3.46207321E-1, 3.71296942E-1, 4.12438929E-1},
+{3.11080933E-1, 3.51040900E-1, 4.16082799E-1, 4.34340119E-1},
+{2.74980426E-1, 2.96631455E-1, 3.87520492E-1, 4.09243762E-1},
+{2.90939093E-1, 3.54455590E-1, 3.93426955E-1, 4.08220291E-1},
+{2.71871865E-1, 3.45510781E-1, 3.87125313E-1, 4.22590613E-1},
+{3.63245904E-1, 3.81932199E-1, 4.04114902E-1, 4.18370664E-1},
+{2.45770738E-1, 2.72909343E-1, 3.48317921E-1, 4.25161839E-1},
+{3.14139009E-1, 3.37872326E-1, 3.65195215E-1, 4.04423416E-1},
+{2.94075787E-1, 3.16935539E-1, 3.43047202E-1, 4.06130373E-1},
+{3.14627469E-1, 3.72413397E-1, 4.00660694E-1, 4.17930841E-1},
+{2.34014243E-1, 3.14007223E-1, 3.83003533E-1, 4.34829175E-1},
+{2.93635666E-1, 3.20529997E-1, 4.10837352E-1, 4.36393142E-1},
+{2.89505839E-1, 3.11828852E-1, 3.86311471E-1, 4.38771248E-1},
+{3.26317430E-1, 3.80858183E-1, 4.19721425E-1, 4.38795507E-1},
+{2.50809520E-1, 2.83018053E-1, 3.82247388E-1, 4.34244394E-1},
+{3.18994045E-1, 3.44855130E-1, 3.72690141E-1, 4.23067033E-1},
+{2.88380086E-1, 3.36622238E-1, 3.69742334E-1, 4.25057590E-1},
+{3.06107700E-1, 3.81856918E-1, 4.18206155E-1, 4.32868361E-1},
+{2.33898312E-1, 3.44861805E-1, 4.12176549E-1, 4.29216206E-1},
+{2.85980880E-1, 3.42903793E-1, 4.25112903E-1, 4.44299698E-1},
+{2.79858828E-1, 3.38789344E-1, 3.92085373E-1, 4.40541029E-1},
+{3.64509344E-1, 3.82202744E-1, 4.29830611E-1, 4.45818365E-1},
+{2.34392300E-1, 2.57377386E-1, 3.59567046E-1, 4.30088580E-1},
+{3.05031896E-1, 3.27589393E-1, 3.78305554E-1, 4.01026130E-1},
+{2.77522624E-1, 3.18130314E-1, 3.67794275E-1, 4.01543021E-1},
+{3.33035767E-1, 3.55820954E-1, 3.87548923E-1, 4.24628675E-1},
+{2.45021001E-1, 3.12560678E-1, 3.91147614E-1, 4.08762813E-1},
+{2.97059119E-1, 3.40246916E-1, 3.92919302E-1, 4.28899705E-1},
+{2.77839303E-1, 3.25019777E-1, 3.97436380E-1, 4.15920913E-1},
+{3.49465251E-1, 3.70362461E-1, 3.95482540E-1, 4.31923389E-1},
+{2.31485590E-1, 2.91023374E-1, 3.77909541E-1, 4.32259738E-1},
+{3.19283485E-1, 3.53671074E-1, 3.80982876E-1, 3.97843361E-1},
+{2.89689243E-1, 3.50265682E-1, 3.80729675E-1, 3.97969365E-1},
+{3.28987300E-1, 3.52005422E-1, 4.12557244E-1, 4.37597930E-1},
+{2.76273251E-1, 3.02267194E-1, 3.81723404E-1, 4.34989095E-1},
+{2.79627264E-1, 3.73727322E-1, 4.12374616E-1, 4.30626333E-1},
+{2.53442764E-1, 3.65940034E-1, 4.14937019E-1, 4.32743609E-1},
+{3.76107216E-1, 3.95142019E-1, 4.16787744E-1, 4.33023572E-1},
+{2.62815833E-1, 2.88270533E-1, 3.47397208E-1, 4.24182594E-1},
+{3.01931322E-1, 3.43652546E-1, 3.77031326E-1, 4.34204459E-1},
+{2.97834277E-1, 3.23495388E-1, 3.64492416E-1, 4.33550835E-1},
+{3.31774473E-1, 3.64324927E-1, 3.98243546E-1, 4.35078323E-1},
+{2.49049723E-1, 3.27870786E-1, 3.83587003E-1, 4.35558081E-1},
+{3.04653406E-1, 3.27671230E-1, 4.18484688E-1, 4.41378772E-1},
+{2.96960890E-1, 3.23898911E-1, 3.90463710E-1, 4.39915955E-1},
+{3.43923748E-1, 3.67100477E-1, 4.29523230E-1, 4.45214987E-1},
+{2.59399652E-1, 2.91602671E-1, 4.04372454E-1, 4.31413233E-1},
+{2.97537506E-1, 3.57573807E-1, 3.88991833E-1, 4.30006981E-1},
+{2.84068942E-1, 3.49574566E-1, 3.81042838E-1, 4.29712772E-1},
+{3.25716257E-1, 3.74875903E-1, 4.31959271E-1, 4.47290838E-1},
+{2.65302956E-1, 3.14745963E-1, 4.16703463E-1, 4.37294722E-1},
+{3.00398588E-1, 3.54147255E-1, 4.28538084E-1, 4.60336387E-1},
+{2.98077166E-1, 3.49304914E-1, 4.00429249E-1, 4.48213518E-1},
+{3.75576198E-1, 4.16657329E-1, 4.42136765E-1, 4.52728629E-1}};
+
+static const float evrc_lspq_quant_codebook1[16][5] = { // 16 vectors x 5 ascending values; evrc_lspq_quant_codebooks[0]
+{0.42091064E-1, 0.69474973E-1, 0.11168948E+0, 0.14571965E+0, 0.20893581E+0},
+{0.54944664E-1, 0.98242261E-1, 0.11007882E+0, 0.15890779E+0, 0.20548241E+0},
+{0.45188572E-1, 0.75199433E-1, 0.11423391E+0, 0.15469728E+0, 0.19746706E+0},
+{0.49474996E-1, 0.79667501E-1, 0.12571351E+0, 0.16944779E+0, 0.20775315E+0},
+{0.41789379E-1, 0.63459560E-1, 0.12068028E+0, 0.15850765E+0, 0.20406815E+0},
+{0.47159236E-1, 0.79129547E-1, 0.12183110E+0, 0.15650047E+0, 0.22309226E+0},
+{0.54539919E-1, 0.80343045E-1, 0.12947764E+0, 0.15186153E+0, 0.20171718E+0},
+{0.55852082E-1, 0.94114847E-1, 0.14016025E+0, 0.17807084E+0, 0.22955489E+0},
+{0.45443531E-1, 0.73541410E-1, 0.11937657E+0, 0.15442030E+0, 0.21010752E+0},
+{0.63178010E-1, 0.95231488E-1, 0.12364983E+0, 0.17672543E+0, 0.21743731E+0},
+{0.52765369E-1, 0.84351443E-1, 0.11589085E+0, 0.15790924E+0, 0.20732352E+0},
+{0.51865745E-1, 0.81328541E-1, 0.13756232E+0, 0.18322878E+0, 0.21640070E+0},
+{0.44419531E-1, 0.68874463E-1, 0.13115251E+0, 0.16263582E+0, 0.21659100E+0},
+{0.49378436E-1, 0.81882551E-1, 0.13067168E+0, 0.16821896E+0, 0.23136081E+0},
+{0.55909779E-1, 0.90783298E-1, 0.13348848E+0, 0.16298474E+0, 0.20961523E+0},
+{0.61378211E-1, 0.98602772E-1, 0.14793332E+0, 0.19283190E+0, 0.23156509E+0}};
+
+static const float evrc_lspq_quant_codebook2[16][5] = { // 16 vectors x 5 ascending values; evrc_lspq_quant_codebooks[1]
+{0.26822963, 0.30585295, 0.31110349, 0.36823335, 0.40774474},
+{0.24418014, 0.28970167, 0.32573757, 0.39021483, 0.41345838},
+{0.23341830, 0.30078292, 0.32893899, 0.38557330, 0.41068462},
+{0.25905868, 0.29756859, 0.34196618, 0.38531172, 0.41295227},
+{0.24290450, 0.29223618, 0.32718554, 0.37788135, 0.40332928},
+{0.24674191, 0.29749370, 0.33631226, 0.39426059, 0.42258954},
+{0.21377595, 0.33140418, 0.34067687, 0.38222077, 0.40939021},
+{0.26673481, 0.30791649, 0.34419721, 0.39611506, 0.42387524},
+{0.26121426, 0.30492544, 0.32997236, 0.38486803, 0.42023736},
+{0.24954870, 0.29372856, 0.33382735, 0.37850669, 0.41714057},
+{0.24158891, 0.30173415, 0.34128246, 0.38428575, 0.41619650},
+{0.25818908, 0.31736413, 0.34904337, 0.38769925, 0.41551358},
+{0.24450587, 0.30673453, 0.33579323, 0.37844428, 0.40557048},
+{0.25164026, 0.31225079, 0.33847794, 0.39554194, 0.42396802},
+{0.22787990, 0.31779197, 0.33831909, 0.40044111, 0.41185561},
+{0.27896860, 0.32261974, 0.35658112, 0.40206763, 0.42370448}};
+
+static const float * const evrc_lspq_full_codebooks[] = { // flat (row-major) pointers to the four "full" LSP tables
+    &evrc_lspq_full_codebook1[0][0], &evrc_lspq_full_codebook2[0][0],
+    &evrc_lspq_full_codebook3[0][0], &evrc_lspq_full_codebook4[0][0],
+};
+
+static const float * const evrc_lspq_half_codebooks[] = { // flat (row-major) pointers to the three "half" LSP tables
+    &evrc_lspq_half_codebook1[0][0], &evrc_lspq_half_codebook2[0][0],
+    &evrc_lspq_half_codebook3[0][0],
+};
+
+static const float * const evrc_lspq_quant_codebooks[] = { // flat (row-major) pointers to the two "quant" LSP tables
+    &evrc_lspq_quant_codebook1[0][0], &evrc_lspq_quant_codebook2[0][0],
+};
+
+static const float * const * const evrc_lspq_codebooks[] = { // codebook set per slot; slots 0..4 appear to mirror evrc_packet_rate in evrcdec.c - confirm at use site
+    NULL,                      // slot 0: no LSP codebooks
+    evrc_lspq_quant_codebooks, // slot 1: two tables
+    NULL,                      // slot 2: no LSP codebooks
+    evrc_lspq_half_codebooks,  // slot 3: three tables
+    evrc_lspq_full_codebooks,  // slot 4: four tables
+};
+
+static const uint8_t evrc_lspq_nb_codebooks[] = { // number of tables in each slot of evrc_lspq_codebooks[]
+    0, // slot 0 holds no codebooks
+    FF_ARRAY_ELEMS(evrc_lspq_quant_codebooks), // 2
+    0, // slot 2 holds no codebooks
+    FF_ARRAY_ELEMS(evrc_lspq_half_codebooks), // 3
+    FF_ARRAY_ELEMS(evrc_lspq_full_codebooks), // 4
+};
+
+static const uint8_t evrc_lspq_full_codebooks_row_sizes[] = { // floats per row of each table, same order as evrc_lspq_full_codebooks[]
+    FF_ARRAY_ELEMS(evrc_lspq_full_codebook1[0]),
+    FF_ARRAY_ELEMS(evrc_lspq_full_codebook2[0]),
+    FF_ARRAY_ELEMS(evrc_lspq_full_codebook3[0]),
+    FF_ARRAY_ELEMS(evrc_lspq_full_codebook4[0]),
+};
+
+static const uint8_t evrc_lspq_half_codebooks_row_sizes[] = { // floats per row, same order as evrc_lspq_half_codebooks[]; 3 + 3 + 4 = 10
+    FF_ARRAY_ELEMS(evrc_lspq_half_codebook1[0]), // 3
+    FF_ARRAY_ELEMS(evrc_lspq_half_codebook2[0]), // 3
+    FF_ARRAY_ELEMS(evrc_lspq_half_codebook3[0]), // 4
+};
+
+static const uint8_t evrc_lspq_quant_codebooks_row_sizes[] = { // floats per row, same order as evrc_lspq_quant_codebooks[]; 5 + 5 = 10
+    FF_ARRAY_ELEMS(evrc_lspq_quant_codebook1[0]), // 5
+    FF_ARRAY_ELEMS(evrc_lspq_quant_codebook2[0]), // 5
+};
+
+static const uint8_t * const evrc_lspq_codebooks_row_sizes[] = { // row-width list per slot, parallel to evrc_lspq_codebooks[]
+    NULL, // slot 0: no codebooks, so no widths
+    evrc_lspq_quant_codebooks_row_sizes,
+    NULL, // slot 2: no codebooks, so no widths
+    evrc_lspq_half_codebooks_row_sizes,
+    evrc_lspq_full_codebooks_row_sizes,
+};
+
+static const float pitch_gain_vq[] = { 0, 0.3, 0.55, 0.7, 0.8, 0.9, 1, 1.2 }; // 8 ascending gain levels; presumably looked up by a decoded gain index - confirm at use site
+static const float estimation_delay[] = { 55.0, 80.0, 39.0, 71.0, 33.0 }; // 5.2.3.4
+static const uint8_t subframe_sizes[] = { 53, 53, 54 }; // three lengths summing to 160; the largest equals SUBFRAME_SIZE in evrcdec.c
+#endif /* AVCODEC_EVRCDATA_H */
diff --git a/libavcodec/evrcdec.c b/libavcodec/evrcdec.c
new file mode 100644
index 0000000..8728c02
--- /dev/null
+++ b/libavcodec/evrcdec.c
@@ -0,0 +1,941 @@
+/*
+ * Enhanced Variable Rate Codec, Service Option 3 decoder
+ * Copyright (c) 2013 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Enhanced Variable Rate Codec, Service Option 3 decoder
+ * @author Paul B Mahol
+ */
+
+#include "libavutil/mathematics.h"
+#include "libavutil/opt.h"
+#include "avcodec.h"
+#include "internal.h"
+#include "get_bits.h"
+#include "evrcdata.h"
+#include "acelp_vectors.h"
+#include "lsp.h"
+
+#define MIN_LSP_SEP (0.05 / (2.0 * M_PI)) // minimum gap required between the LSPs on either side of a codebook split
+#define MIN_DELAY      20 // smallest pitch delay; added to the transmitted 7-bit delay code
+#define MAX_DELAY     120 // largest accepted pitch delay
+#define NB_SUBFRAMES    3 // subframes per frame
+#define SUBFRAME_SIZE  54 // length of the longest subframe (see subframe_sizes)
+#define FILTER_ORDER   10 // number of LSP / LPC coefficients
+#define ACB_SIZE      128 // samples of excitation history kept in front of the current subframe
+
+typedef enum { // values double as indices into the per-rate tables
+    RATE_ERRS = -1, // rate could not be determined
+    SILENCE,        // 0: 1-byte packet (rate byte included)
+    RATE_QUANT,     // 1: 3-byte packet
+    RATE_QUARTER,   // 2: 6-byte packet; never decoded, always handled as an erasure
+    RATE_HALF,      // 3: 11-byte packet
+    RATE_FULL,      // 4: 23-byte packet
+} evrc_packet_rate;
+
+/**
+ * EVRC-A unpacked data frame
+ */
+typedef struct EVRCAFrame { // unpack_frame() fills only the fields present at the current rate
+    uint8_t  lpc_flag;        ///< spectral change indicator (full rate only)
+    uint16_t lsp[4];          ///< row indices into the LSP codebooks, one per codebook
+    uint8_t  pitch_delay;     ///< pitch delay for entire frame (offset by MIN_DELAY when decoded)
+    uint8_t  delay_diff;      ///< delay difference for entire frame (full rate only)
+    uint8_t  acb_gain[3];     ///< adaptive codebook gain index, one per subframe
+    uint16_t fcb_shape[3][4]; ///< fixed codebook shape, per subframe (half rate uses [i][0] only)
+    uint8_t  fcb_gain[3];     ///< fixed codebook gain index
+    uint8_t  energy_gain;     ///< frame energy gain index (RATE_QUANT only)
+    uint8_t  tty;             ///< tty baud rate bit
+} EVRCAFrame;
+
+typedef struct EVRCContext { // decoder state carried from frame to frame
+    AVClass *class;
+
+    int              postfilter;              ///< "postfilter" option: non-zero enables the adaptive postfilter
+
+    GetBitContext    gb;                      ///< bit reader over the current packet payload
+    evrc_packet_rate bitrate;                 ///< rate of the current frame (replaced during frame erasure)
+    evrc_packet_rate last_valid_bitrate;      ///< value of bitrate at the end of the previous frame
+    EVRCAFrame       frame;                   ///< unpacked parameters of the current frame
+
+    float            lspf[FILTER_ORDER];      ///< LSP frequencies of the current frame
+    float            prev_lspf[FILTER_ORDER]; ///< LSP frequencies of the previous frame
+    float            synthesis[FILTER_ORDER]; ///< synthesis filter memory
+    float            postfilter_fir[FILTER_ORDER]; ///< memory of the postfilter residual (FIR) stage
+    float            postfilter_iir[FILTER_ORDER]; ///< memory of the postfilter synthesis (IIR) stage
+    float            postfilter_residual[ACB_SIZE + SUBFRAME_SIZE]; ///< residual history followed by the current subframe
+    float            pitch_delay;             ///< pitch delay of the current frame
+    float            prev_pitch_delay;        ///< pitch delay of the previous frame
+    float            avg_acb_gain;  ///< average adaptive codebook gain
+    float            avg_fcb_gain;  ///< average fixed codebook gain
+    float            pitch[ACB_SIZE + FILTER_ORDER + SUBFRAME_SIZE]; ///< excitation: ACB_SIZE history, then the subframe being built
+    float            pitch_back[ACB_SIZE];    ///< backup copy of the excitation history
+    float            interpolation_coeffs[136]; ///< 8 fractional phases x 17 taps, built in evrc_decode_init()
+    float            energy_vector[NB_SUBFRAMES]; ///< per-subframe excitation level for RATE_QUANT
+    float            fade_scale;              ///< excitation scale: lowered per erased subframe, raised per good one
+    float            last;                    ///< last input sample seen by the postfilter tilt stage
+
+    uint8_t          prev_energy_gain;        ///< energy_gain index of the last decoded RATE_QUANT frame
+    uint8_t          prev_error_flag;         ///< non-zero when the previous frame was an erasure
+    uint8_t          warned_buf_mismatch_bitrate; ///< set once the size/rate mismatch warning has been printed
+} EVRCContext;
+
+/**
+ * Frame unpacking for RATE_FULL, RATE_HALF and RATE_QUANT
+ *
+ * @param e the context; bits are read from e->gb and stored in e->frame, any other rate reads nothing
+ *
+ * TIA/IS-127 Table 4.21-1
+ */
+static void unpack_frame(EVRCContext *e)
+{
+    EVRCAFrame *frame = &e->frame;
+    GetBitContext *gb = &e->gb;
+
+    switch (e->bitrate) {
+    case RATE_FULL: // 171 bits in total
+        frame->lpc_flag        = get_bits1(gb);
+        frame->lsp[0]          = get_bits(gb,  6);
+        frame->lsp[1]          = get_bits(gb,  6);
+        frame->lsp[2]          = get_bits(gb,  9);
+        frame->lsp[3]          = get_bits(gb,  7);
+        frame->pitch_delay     = get_bits(gb,  7);
+        frame->delay_diff      = get_bits(gb,  5);
+        frame->acb_gain[0]     = get_bits(gb,  3); // subframe 0: 43 bits
+        frame->fcb_shape[0][0] = get_bits(gb,  8);
+        frame->fcb_shape[0][1] = get_bits(gb,  8);
+        frame->fcb_shape[0][2] = get_bits(gb,  8);
+        frame->fcb_shape[0][3] = get_bits(gb, 11);
+        frame->fcb_gain[0]     = get_bits(gb,  5);
+        frame->acb_gain[1]     = get_bits(gb,  3); // subframe 1
+        frame->fcb_shape[1][0] = get_bits(gb,  8);
+        frame->fcb_shape[1][1] = get_bits(gb,  8);
+        frame->fcb_shape[1][2] = get_bits(gb,  8);
+        frame->fcb_shape[1][3] = get_bits(gb, 11);
+        frame->fcb_gain    [1] = get_bits(gb,  5);
+        frame->acb_gain    [2] = get_bits(gb,  3); // subframe 2
+        frame->fcb_shape[2][0] = get_bits(gb,  8);
+        frame->fcb_shape[2][1] = get_bits(gb,  8);
+        frame->fcb_shape[2][2] = get_bits(gb,  8);
+        frame->fcb_shape[2][3] = get_bits(gb, 11);
+        frame->fcb_gain    [2] = get_bits(gb,  5);
+        frame->tty             = get_bits1(gb);
+        break;
+    case RATE_HALF: // 80 bits in total
+        frame->lsp         [0] = get_bits(gb,  7);
+        frame->lsp         [1] = get_bits(gb,  7);
+        frame->lsp         [2] = get_bits(gb,  8);
+        frame->pitch_delay     = get_bits(gb,  7);
+        frame->acb_gain    [0] = get_bits(gb,  3); // subframe 0: 17 bits
+        frame->fcb_shape[0][0] = get_bits(gb, 10);
+        frame->fcb_gain    [0] = get_bits(gb,  4);
+        frame->acb_gain    [1] = get_bits(gb,  3); // subframe 1
+        frame->fcb_shape[1][0] = get_bits(gb, 10);
+        frame->fcb_gain    [1] = get_bits(gb,  4);
+        frame->acb_gain    [2] = get_bits(gb,  3); // subframe 2
+        frame->fcb_shape[2][0] = get_bits(gb, 10);
+        frame->fcb_gain    [2] = get_bits(gb,  4);
+        break;
+    case RATE_QUANT: // 16 bits in total
+        frame->lsp         [0] = get_bits(gb, 4);
+        frame->lsp         [1] = get_bits(gb, 4);
+        frame->energy_gain     = get_bits(gb, 8);
+        break;
+    }
+}
+
+static evrc_packet_rate buf_size2bitrate(const int buf_size)
+{   /* packet length in bytes, rate byte included -> packet rate */
+    if (buf_size == 23)
+        return RATE_FULL;
+    if (buf_size == 11)
+        return RATE_HALF;
+    if (buf_size ==  6)
+        return RATE_QUARTER;
+    if (buf_size ==  3)
+        return RATE_QUANT;
+    return buf_size == 1 ? SILENCE : RATE_ERRS;
+}
+
+/**
+ * Determine the bitrate from the frame size and/or the first byte of the frame.
+ *
+ * @param avctx the AV codec context
+ * @param buf_size length of the buffer; decremented when a leading rate byte is consumed
+ * @param buf the buffer; advanced past the leading rate byte when one is consumed
+ *
+ * @return the bitrate on success,
+ *         RATE_ERRS  if the bitrate cannot be satisfactorily determined
+ */
+static evrc_packet_rate determine_bitrate(AVCodecContext *avctx,
+                                          int *buf_size,
+                                          const uint8_t **buf)
+{
+    evrc_packet_rate bitrate;
+
+    if ((bitrate = buf_size2bitrate(*buf_size)) >= 0) { // size matches a packet that carries a rate byte
+        if (bitrate > **buf) { // first byte claims a lower rate than the size implies
+            EVRCContext *e = avctx->priv_data;
+            if (!e->warned_buf_mismatch_bitrate) { // warn only once per decoder instance
+                av_log(avctx, AV_LOG_WARNING,
+                       "Claimed bitrate and buffer size mismatch.\n");
+                e->warned_buf_mismatch_bitrate = 1;
+            }
+            bitrate = **buf; // trust the claimed (lower) rate
+        } else if (bitrate < **buf) { // claimed rate needs more data than is present
+            av_log(avctx, AV_LOG_ERROR,
+                   "Buffer is too small for the claimed bitrate.\n");
+            return RATE_ERRS;
+        }
+        (*buf)++; // skip the rate byte
+        *buf_size -= 1;
+    } else if ((bitrate = buf_size2bitrate(*buf_size + 1)) >= 0) { // size matches a packet without the rate byte
+        av_log(avctx, AV_LOG_DEBUG,
+               "Bitrate byte is missing, guessing the bitrate from packet size.\n");
+    } else
+        return RATE_ERRS;
+
+    return bitrate;
+}
+
+static void warn_insufficient_frame_quality(AVCodecContext *avctx,
+                                            const char *message)
+{   /* Log a warning prefixed with the current frame number. */
+    const int frame_no = avctx->frame_number;
+    av_log(avctx, AV_LOG_WARNING, "Frame #%d, %s\n", frame_no, message);
+}
+
+/**
+ * Initialize the speech codec according to the specification.
+ * Sets mono float output, resets the filter state and builds the interpolation table.
+ * TIA/IS-127 5.2
+ */
+static av_cold int evrc_decode_init(AVCodecContext *avctx)
+{
+    EVRCContext *e = avctx->priv_data;
+    int i, n, idx = 0;
+    float denom = 2.0 / (2.0 * 8.0 + 1.0); // 2 / 17, with 17 being the number of taps per phase
+
+    avctx->channels       = 1;
+    avctx->channel_layout = AV_CH_LAYOUT_MONO;
+    avctx->sample_fmt     = AV_SAMPLE_FMT_FLT;
+
+    for (i = 0; i < FILTER_ORDER; i++) {
+        e->prev_lspf[i] = (i + 1) * 0.048; // evenly spaced starting LSPs
+        e->synthesis[i] = 0.0;
+    }
+
+    for (i = 0; i < ACB_SIZE; i++)
+        e->pitch[i] = e->pitch_back[i] = 0.0; // empty excitation history
+
+    e->last_valid_bitrate = RATE_QUANT;
+    e->prev_pitch_delay   = 40.0;
+    e->fade_scale         = 1.0;
+    e->prev_error_flag    = 0;
+    e->avg_acb_gain = e->avg_fcb_gain = 0.0;
+
+    for (i = 0; i < 8; i++) { // one row of 17 coefficients per fractional phase
+        float tt = ((float)i - 8.0 / 2.0) / 8.0; // phase offset, from -0.5 to 0.375 in steps of 1/8
+
+        for (n = -8; n <= 8; n++, idx++) {
+            float arg1 = M_PI * 0.9 * (tt - n);
+            float arg2 = M_PI * (tt - n);
+
+            e->interpolation_coeffs[idx] = 0.9; // value kept as is when arg1 is exactly 0
+            if (arg1)
+                e->interpolation_coeffs[idx] *= (0.54 + 0.46 * cos(arg2 * denom)) * // raised-cosine window times sin(x)/x
+                                                 sin(arg1) / arg1;
+        }
+    }
+
+    return 0;
+}
+
+/**
+ * Decode the 10 vector quantized line spectral pair frequencies from the LSP
+ * transmission codes of any bitrate and check for badly received packets.
+ *
+ * @param e the context; e->frame.lsp selects the codebook rows, the result goes to e->lspf
+ *
+ * @return 0 on success, -1 if the packet is badly received
+ *
+ * TIA/IS-127 5.2.1, 5.7.1
+ */
+static int decode_lspf(EVRCContext *e)
+{
+    const float * const *codebooks = evrc_lspq_codebooks[e->bitrate];
+    int i, j, k = 0;
+
+    for (i = 0; i < evrc_lspq_nb_codebooks[e->bitrate]; i++) { // concatenate one row from every codebook
+        int row_size = evrc_lspq_codebooks_row_sizes[e->bitrate][i];
+        const float *codebook = codebooks[i];
+
+        for (j = 0; j < row_size; j++)
+            e->lspf[k++] = codebook[e->frame.lsp[i] * row_size + j]; // codebook is addressed as a flat array of rows
+    }
+
+    // check for monotonic LSPs
+    for (i = 1; i < FILTER_ORDER; i++)
+        if (e->lspf[i] <= e->lspf[i - 1]) // values must be strictly increasing
+            return -1;
+
+    // check for minimum separation of LSPs at the splits
+    for (i = 0, k = 0; i < evrc_lspq_nb_codebooks[e->bitrate] - 1; i++) {
+        k += evrc_lspq_codebooks_row_sizes[e->bitrate][i]; // k = index of the first LSP taken from the next codebook
+        if (e->lspf[k] - e->lspf[k - 1] <= MIN_LSP_SEP)
+            return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Blend the previous and current frame LSPs for one subframe.
+ *
+ * TIA/IS-127 5.2.3.1, 5.7.3.2
+ */
+static void interpolate_lsp(float *ilsp, const float *lsp,
+                            const float *prev, int index)
+{
+    static const float cur_weight[] = { 0.1667, 0.5, 0.8333 };
+    const float w = cur_weight[index]; /* weight passed alongside lsp */
+
+    ff_weighted_vector_sumf(ilsp, prev, lsp, 1.0 - w, w, FILTER_ORDER);
+}
+
+/*
+ * Reconstruction of the delay contour: writes three delays for subframe 'index' (0..2) to dst.
+ *
+ * TIA/IS-127 5.2.2.3.2
+ */
+static void interpolate_delay(float *dst, float current, float prev, int index)
+{
+    static const float d_interpolation_factors[] = { 0, 0.3313, 0.6625, 1, 1 }; // weight of 'current' at each contour point
+    dst[0] = (1.0 - d_interpolation_factors[index    ]) * prev
+                  + d_interpolation_factors[index    ]  * current;
+    dst[1] = (1.0 - d_interpolation_factors[index + 1]) * prev
+                  + d_interpolation_factors[index + 1]  * current;
+    dst[2] = (1.0 - d_interpolation_factors[index + 2]) * prev
+                  + d_interpolation_factors[index + 2]  * current;
+}
+
+/*
+ * Convert the quantized, interpolated line spectral frequencies,
+ * to prediction coefficients (FILTER_ORDER values written to ilpc).
+ *
+ * TIA/IS-127 5.2.3.2, 4.7.2.2
+ */
+static void decode_predictor_coeffs(const float *ilspf, float *ilpc)
+{
+    double lsp[FILTER_ORDER];
+    float a[FILTER_ORDER / 2 + 1], b[FILTER_ORDER / 2 + 1];
+    float a1[FILTER_ORDER / 2] = { 0 }; // a1/a2 and b1/b2 hold the two previous inputs of each stage
+    float a2[FILTER_ORDER / 2] = { 0 };
+    float b1[FILTER_ORDER / 2] = { 0 };
+    float b2[FILTER_ORDER / 2] = { 0 };
+    int i, k;
+
+    ff_acelp_lsf2lspd(lsp, ilspf, FILTER_ORDER);
+
+    for (k = 0; k <= FILTER_ORDER; k++) { // one time step per iteration
+        a[0] = k < 2 ? 0.25 : 0;                 // input sequence 0.25, 0.25, 0, 0, ...
+        b[0] = k < 2 ? k < 1 ? 0.25 : -0.25 : 0; // input sequence 0.25, -0.25, 0, 0, ...
+
+        for (i = 0; i < FILTER_ORDER / 2; i++) { // chain of five stages; a uses the even LSPs, b the odd ones
+            a[i + 1] = a[i] - 2 * lsp[i * 2    ] * a1[i] + a2[i];
+            b[i + 1] = b[i] - 2 * lsp[i * 2 + 1] * b1[i] + b2[i];
+            a2[i] = a1[i];
+            a1[i] = a[i];
+            b2[i] = b1[i];
+            b1[i] = b[i];
+        }
+
+        if (k) // the output of step 0 is not stored
+            ilpc[k - 1] = 2.0 * (a[FILTER_ORDER / 2] + b[FILTER_ORDER / 2]);
+    }
+}
+
+static void bl_intrp(EVRCContext *e, float *ex, float delay) /* compute ex[0] from the samples about 'delay' positions before it */
+{
+    float *f;
+    int offset, i, coef_idx;
+    int16_t t;
+
+    offset = lrintf(delay); // integer part of the delay (rounded)
+
+    t = (offset - delay + 0.5) * 8.0 + 0.5; // fractional part as a phase index in 1/8 steps
+    if (t == 8) { // phase wrapped: use phase 0 of the neighbouring integer delay
+        t = 0;
+        offset--;
+    }
+
+    f = ex - offset - 8; // first of the 17 samples that feed the filter
+
+    coef_idx = t * (2 * 8 + 1); // start of the coefficient row for this phase
+
+    ex[0] = 0.0;
+    for (i = 0; i < 2 * 8 + 1; i++)
+        ex[0] += e->interpolation_coeffs[coef_idx + i] * f[i];
+}
+
+/*
+ * Adaptive codebook excitation: fills excitation[0 .. length + 9], scales the first 'length' samples by gain.
+ *
+ * TIA/IS-127 5.2.2.3.3, 4.12.5.2
+ */
+static void acb_excitation(EVRCContext *e, float *excitation, float gain,
+                           const float delay[3], int length)
+{
+    float denom, locdelay, dpr, invl;
+    int i;
+
+    invl = 1.0 / ((float) length);
+    dpr = length;
+
+    /* the subframe itself: delay moves linearly from delay[0] towards delay[1] */
+    denom = (delay[1] - delay[0]) * invl; // delay increment per sample
+    for (i = 0; i < dpr; i++) {
+        locdelay = delay[0] + i * denom;
+        bl_intrp(e, excitation + i, locdelay);
+    }
+
+    denom = (delay[2] - delay[1]) * invl;
+    /* 10 extra samples past the subframe, continuing from delay[1] towards delay[2] */
+    for (i = dpr; i < dpr + 10; i++) {
+        locdelay = delay[1] + (i - dpr) * denom;
+        bl_intrp(e, excitation + i, locdelay);
+    }
+
+    for (i = 0; i < length; i++) // the 10 extra samples are left unscaled
+        excitation[i] *= gain;
+}
+
+static void decode_8_pulses_35bits(const uint16_t *fixed_index, float *cod) /* full rate: place signed unit pulses in cod[] from four codewords */
+{
+    int i, pos1, pos2, offset;
+
+    offset = (fixed_index[3] >> 9) & 3; // bits 9-10 of the last codeword rotate the pulse positions
+
+    for (i = 0; i < 3; i++) { // codewords 0..2 carry two pulses each: 7 position bits plus one sign bit
+        pos1 = ((fixed_index[i] & 0x7f) / 11) * 5 + ((i + offset) % 5);
+        pos2 = ((fixed_index[i] & 0x7f) % 11) * 5 + ((i + offset) % 5);
+
+        cod[pos1] = (fixed_index[i] & 0x80) ? -1.0 : 1.0;
+
+        if (pos2 < pos1) // the order of the two positions selects the second pulse's sign
+            cod[pos2]  = -cod[pos1];
+        else
+            cod[pos2] +=  cod[pos1]; // equal positions double the pulse
+    }
+
+    pos1 = ((fixed_index[3] & 0x7f) / 11) * 5 + ((3 + offset) % 5);
+    pos2 = ((fixed_index[3] & 0x7f) % 11) * 5 + ((4 + offset) % 5);
+
+    cod[pos1] = (fixed_index[3] & 0x100) ? -1.0 : 1.0; // the last two pulses have their own sign bits
+    cod[pos2] = (fixed_index[3] & 0x80 ) ? -1.0 : 1.0;
+}
+
+static void decode_3_pulses_10bits(uint16_t fixed_index, float *cod) /* half rate: add three unit pulses to cod[] */
+{
+    float sign;
+    int pos;
+
+    sign = (fixed_index & 0x200) ? -1.0 : 1.0; // bit 9: common sign
+
+    pos = ((fixed_index        & 0x7) * 7) + 4; // bits 0-2
+    cod[pos] += sign;
+    pos = (((fixed_index >> 3) & 0x7) * 7) + 2; // bits 3-5; this pulse gets the opposite sign
+    cod[pos] -= sign;
+    pos = (((fixed_index >> 6) & 0x7) * 7);     // bits 6-8
+    cod[pos] += sign;
+}
+
+/*
+ * Build the ACELP fixed codebook excitation of one subframe for full and
+ * half rate. The pulses are added into 'excitation' by the pulse decoders.
+ *
+ * TIA/IS-127 5.2.3.7
+ */
+static void fcb_excitation(EVRCContext *e, const uint16_t *codebook,
+                           float *excitation, float pitch_gain,
+                           int pitch_lag, int subframe_size)
+{
+    int n = pitch_lag;
+
+    switch (e->bitrate) {
+    case RATE_FULL: decode_8_pulses_35bits(codebook, excitation);  break;
+    default:        decode_3_pulses_10bits(*codebook, excitation); break;
+    }
+
+    /* add a copy delayed by pitch_lag, scaled by the gain clipped to [0.2, 0.9] */
+    for (pitch_gain = av_clipf(pitch_gain, 0.2, 0.9); n < subframe_size; n++)
+        excitation[n] += pitch_gain * excitation[n - pitch_lag];
+}
+
+/**
+ * Synthesis of the decoder output signal.
+ *
+ * @param[in]     in              input signal
+ * @param[in]     filter_coeffs   LPC coefficients
+ * @param[in,out] memory          synthesis filter memory
+ * @param         buffer_length   amount of data to process
+ * @param[out]    samples         output samples
+ *
+ * TIA/IS-127 5.2.3.15, 5.7.3.4
+ */
+static void synthesis_filter(const float *in, const float *filter_coeffs,
+                             float *memory, int buffer_length, float *samples)
+{
+    int i, j;
+
+    for (i = 0; i < buffer_length; i++) {
+        samples[i] = in[i]; // in[i] is read before samples[i] is written
+        for (j = FILTER_ORDER - 1; j > 0; j--) {
+            samples[i] -= filter_coeffs[j] * memory[j];
+            memory[j]   = memory[j - 1]; // shift the output history by one
+        }
+        samples[i] -= filter_coeffs[0] * memory[0];
+        memory[0]   = samples[i]; // newest output goes to the front
+    }
+}
+
+static void bandwidth_expansion(float *coeff, const float *inbuf, float gamma) /* coeff[i] = inbuf[i] * gamma^(i+1) */
+{
+    double fac = gamma;
+    int i;
+
+    for (i = 0; i < FILTER_ORDER; i++) {
+        coeff[i] = inbuf[i] * fac; // inbuf[i] is read before coeff[i] is written (one caller passes the same array)
+        fac *= gamma;
+    }
+}
+
+static void residual_filter(float *output, const float *input, /* output[i] = input[i] + sum of coef[j] * previous inputs */
+                            const float *coef, float *memory, int length)
+{
+    float sum;
+    int i, j;
+
+    for (i = 0; i < length; i++) {
+        sum = input[i];
+
+        for (j = FILTER_ORDER - 1; j > 0; j--) {
+            sum      += coef[j] * memory[j];
+            memory[j] = memory[j - 1]; // shift the input history by one
+        }
+        sum += coef[0] * memory[0];
+        memory[0] = input[i]; // newest input goes to the front
+        output[i] = sum;
+    }
+}
+
+/*
+ * TIA/IS-127 Table 5.9.1-1. Postfilter parameters, one row per packet rate.
+ */
+static const struct PfCoeff {
+    float tilt;   // weight of the previous sample in the tilt compensation stage
+    float ltgain; // gain factor of the long term postfilter
+    float p1;     // bandwidth expansion factor for the residual filter coefficients
+    float p2;     // bandwidth expansion factor for the synthesis filter coefficients
+} postfilter_coeffs[5] = {
+    { 0.0 , 0.0 , 0.0 , 0.0  }, // SILENCE
+    { 0.0 , 0.0 , 0.57, 0.57 }, // RATE_QUANT
+    { 0.0 , 0.0 , 0.0 , 0.0  }, // RATE_QUARTER
+    { 0.35, 0.50, 0.50, 0.75 }, // RATE_HALF
+    { 0.20, 0.50, 0.57, 0.75 }, // RATE_FULL
+};
+
+/*
+ * Adaptive postfilter: filters 'length' samples from in to out; idx is the pitch lag of the subframe.
+ *
+ * TIA/IS-127 5.9
+ */
+static void postfilter(EVRCContext *e, float *in, const float *coeff,
+                       float *out, int idx, const struct PfCoeff *pfc,
+                       int length)
+{
+    float wcoef1[FILTER_ORDER], wcoef2[FILTER_ORDER],
+          scratch[SUBFRAME_SIZE], temp[SUBFRAME_SIZE],
+          mem[SUBFRAME_SIZE];
+    float sum1 = 0.0, sum2 = 0.0, gamma, gain;
+    float tilt = pfc->tilt;
+    int i, n, best;
+
+    bandwidth_expansion(wcoef1, coeff, pfc->p1);
+    bandwidth_expansion(wcoef2, coeff, pfc->p2);
+
+    /* Tilt compensation filter, TIA/IS-127 5.9.1 */
+    for (i = 0; i < length - 1; i++)
+        sum2 += in[i] * in[i + 1]; // correlation between neighbouring samples
+    if (sum2 < 0.0) // no tilt compensation when that correlation is negative
+        tilt = 0.0;
+
+    for (i = 0; i < length; i++) {
+        scratch[i] = in[i] - tilt * e->last;
+        e->last = in[i]; // carried over to the next call
+    }
+
+    /* Short term residual filter, TIA/IS-127 5.9.2 */
+    residual_filter(&e->postfilter_residual[ACB_SIZE], scratch, wcoef1, e->postfilter_fir, length);
+
+    /* Long term postfilter */
+    best = idx;
+    for (i = FFMIN(MIN_DELAY, idx - 3); i <= FFMAX(MAX_DELAY, idx + 3); i++) { // pick the lag with the largest correlation
+        for (n = ACB_SIZE, sum2 = 0; n < ACB_SIZE + length; n++)
+            sum2 += e->postfilter_residual[n] * e->postfilter_residual[n - i];
+        if (sum2 > sum1) {
+            sum1 = sum2;
+            best = i;
+        }
+    }
+
+    for (i = ACB_SIZE, sum1 = 0; i < ACB_SIZE + length; i++) // energy of the residual delayed by 'best'
+        sum1 += e->postfilter_residual[i - best] * e->postfilter_residual[i - best];
+    for (i = ACB_SIZE, sum2 = 0; i < ACB_SIZE + length; i++) // correlation of the residual with its delayed copy
+        sum2 += e->postfilter_residual[i] * e->postfilter_residual[i - best];
+
+    if (sum2 * sum1 == 0 || e->bitrate == RATE_QUANT) { // long term stage bypassed
+        memcpy(temp, e->postfilter_residual + ACB_SIZE, length * sizeof(float));
+    } else {
+        gamma = sum2 / sum1;
+        if (gamma < 0.5) // correlation too weak: bypass as well
+            memcpy(temp, e->postfilter_residual + ACB_SIZE, length * sizeof(float));
+        else {
+            gamma = FFMIN(gamma, 1.0);
+
+            for (i = 0; i < length; i++) {
+                temp[i] = e->postfilter_residual[ACB_SIZE + i] + gamma *
+                    pfc->ltgain * e->postfilter_residual[ACB_SIZE + i - best];
+            }
+        }
+    }
+
+    memcpy(scratch, temp, length * sizeof(float));
+    memcpy(mem, e->postfilter_iir, FILTER_ORDER * sizeof(float)); // trial run on a copy so the real filter memory stays untouched
+    synthesis_filter(scratch, wcoef2, mem, length, scratch);
+
+    /* Gain computation, TIA/IS-127 5.9.4-2 */
+    for (i = 0, sum1 = 0, sum2 = 0; i < length; i++) {
+        sum1 += in[i] * in[i];           // energy of the input
+        sum2 += scratch[i] * scratch[i]; // energy of the trial output
+    }
+    gain = sum2 ? sqrt(sum1 / sum2) : 1.0;
+
+    for (i = 0; i < length; i++)
+        temp[i] *= gain;
+
+    /* Short term postfilter */
+    synthesis_filter(temp, wcoef2, e->postfilter_iir, length, out); // this run updates the real filter memory
+
+    memmove(e->postfilter_residual, // slide the residual buffer down by 'length' samples
+           e->postfilter_residual + length, ACB_SIZE * sizeof(float));
+}
+
+static void frame_erasure(EVRCContext *e, float *samples)
+{   /* Synthesize a replacement frame from the decoder state; output is written to samples. */
+    float ilspf[FILTER_ORDER], ilpc[FILTER_ORDER], idelay[NB_SUBFRAMES],
+          tmp[SUBFRAME_SIZE + 6], f;
+    int i, j;
+
+    for (i = 0; i < FILTER_ORDER; i++) {
+        if (e->bitrate != RATE_QUANT) // pull the previous LSPs towards the evenly spaced initial values
+            e->lspf[i] = e->prev_lspf[i] * 0.875 + 0.125 * (i + 1) * 0.048;
+        else
+            e->lspf[i] = e->prev_lspf[i];
+    }
+
+    if (e->prev_error_flag) // consecutive erasures: decay the average ACB gain
+        e->avg_acb_gain *= 0.75;
+    if (e->bitrate == RATE_FULL)
+        memcpy(e->pitch_back, e->pitch, ACB_SIZE * sizeof(float)); // saved so a later good frame can restore it
+    if (e->last_valid_bitrate == RATE_QUANT) // from here on the frame is treated as RATE_QUANT or RATE_FULL
+        e->bitrate = RATE_QUANT;
+    else
+        e->bitrate = RATE_FULL;
+
+    if (e->bitrate == RATE_FULL || e->bitrate == RATE_HALF) {
+        e->pitch_delay = e->prev_pitch_delay;
+    } else {
+        float sum = 0;
+
+        idelay[0] = idelay[1] = idelay[2] = MIN_DELAY;
+
+        for (i = 0; i < NB_SUBFRAMES; i++) // average the previous energy entry over the subframes
+            sum += evrc_energy_quant[e->prev_energy_gain][i];
+        sum /= (float) NB_SUBFRAMES;
+        sum  = pow(10, sum);
+        for (i = 0; i < NB_SUBFRAMES; i++)
+            e->energy_vector[i] = sum;
+    }
+
+    if (fabs(e->pitch_delay - e->prev_pitch_delay) > 15)
+        e->prev_pitch_delay = e->pitch_delay;
+
+    for (i = 0; i < NB_SUBFRAMES; i++) {
+        int subframe_size = subframe_sizes[i];
+        int pitch_lag;
+
+        interpolate_lsp(ilspf, e->lspf, e->prev_lspf, i);
+
+        if (e->bitrate != RATE_QUANT) {
+            if (e->avg_acb_gain < 0.3) { // low gain: use the fixed substitute delays
+                idelay[0] = estimation_delay[i];
+                idelay[1] = estimation_delay[i + 1];
+                idelay[2] = estimation_delay[i + 2];
+            } else {
+                interpolate_delay(idelay, e->pitch_delay, e->prev_pitch_delay, i);
+            }
+        }
+
+        pitch_lag = lrintf((idelay[1] + idelay[0]) / 2.0);
+        decode_predictor_coeffs(ilspf, ilpc);
+
+        if (e->bitrate != RATE_QUANT) {
+            acb_excitation(e, e->pitch + ACB_SIZE,
+                           e->avg_acb_gain, idelay, subframe_size);
+            for (j = 0; j < subframe_size; j++)
+                e->pitch[ACB_SIZE + j] *= e->fade_scale;
+            e->fade_scale = FFMAX(e->fade_scale - 0.05, 0.0); // fade a little more with each erased subframe
+        } else {
+            for (j = 0; j < subframe_size; j++)
+                e->pitch[ACB_SIZE + j] = e->energy_vector[i];
+        }
+
+        memmove(e->pitch, e->pitch + subframe_size, ACB_SIZE * sizeof(float)); // update the history; samples at ACB_SIZE.. stay in place
+
+        if (e->bitrate != RATE_QUANT && e->avg_acb_gain < 0.4) {
+            f = 0.1 * e->avg_fcb_gain;
+            for (j = 0; j < subframe_size; j++)
+                e->pitch[ACB_SIZE + j] += f; // added after the history update, so the history does not include it
+        } else if (e->bitrate == RATE_QUANT) {
+            for (j = 0; j < subframe_size; j++)
+                e->pitch[ACB_SIZE + j] = e->energy_vector[i];
+        }
+
+        synthesis_filter(e->pitch + ACB_SIZE, ilpc,
+                         e->synthesis, subframe_size, tmp);
+        postfilter(e, tmp, ilpc, samples, pitch_lag, // always applied here, independent of e->postfilter
+                   &postfilter_coeffs[e->bitrate], subframe_size);
+
+        samples += subframe_size;
+    }
+}
+
+static int evrc_decode_frame(AVCodecContext *avctx, void *data, /* decode one packet into 160 float samples */
+                             int *got_frame_ptr, AVPacket *avpkt)
+{
+    const uint8_t *buf = avpkt->data;
+    AVFrame *frame     = data;
+    EVRCContext *e     = avctx->priv_data;
+    int buf_size       = avpkt->size;
+    float ilspf[FILTER_ORDER], ilpc[FILTER_ORDER], idelay[NB_SUBFRAMES];
+    float *samples;
+    int   i, j, ret, error_flag = 0;
+
+    frame->nb_samples = 160;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+    samples = (float *)frame->data[0];
+
+    if ((e->bitrate = determine_bitrate(avctx, &buf_size, &buf)) == RATE_ERRS) {
+        warn_insufficient_frame_quality(avctx, "bitrate cannot be determined.");
+        goto erasure;
+    }
+    if (e->bitrate <= SILENCE || e->bitrate == RATE_QUARTER) // rates without a decoder path
+        goto erasure;
+    if (e->bitrate == RATE_QUANT && e->last_valid_bitrate == RATE_FULL
+                                 && !e->prev_error_flag)
+        goto erasure;
+
+    if ((ret = init_get_bits8(&e->gb, buf, buf_size)) < 0)
+        return ret;
+    memset(&e->frame, 0, sizeof(EVRCAFrame)); // fields not transmitted at this rate read as 0
+
+    unpack_frame(e);
+
+    if (e->bitrate != RATE_QUANT) {
+        uint8_t *p = (uint8_t *) &e->frame;
+        for (i = 0; i < sizeof(EVRCAFrame); i++) { // look for any non-zero byte
+            if (p[i])
+                break;
+        }
+        if (i == sizeof(EVRCAFrame)) // an all-zero frame is treated as an erasure
+            goto erasure;
+    } else if (e->frame.lsp[0] == 0xf && // RATE_QUANT: an all-ones frame is treated as an erasure
+               e->frame.lsp[1] == 0xf &&
+               e->frame.energy_gain == 0xff) {
+        goto erasure;
+    }
+
+    if (decode_lspf(e) < 0)
+        goto erasure;
+
+    if (e->bitrate == RATE_FULL || e->bitrate == RATE_HALF) {
+        /* Pitch delay parameter checking as per TIA/IS-127 5.1.5.1 */
+        if (e->frame.pitch_delay > MAX_DELAY - MIN_DELAY)
+            goto erasure;
+
+        e->pitch_delay = e->frame.pitch_delay + MIN_DELAY;
+
+        /* Delay diff parameter checking as per TIA/IS-127 5.1.5.2 */
+        if (e->frame.delay_diff) {
+            int p = e->pitch_delay - e->frame.delay_diff + 16;
+            if (p < MIN_DELAY || p > MAX_DELAY)
+                goto erasure;
+        }
+
+        /* Delay contour reconstruction as per TIA/IS-127 5.2.2.2 */
+        if (e->frame.delay_diff &&
+            e->bitrate == RATE_FULL && e->prev_error_flag) { // first good full-rate frame after an erasure
+            float delay;
+
+            memcpy(e->pitch, e->pitch_back, ACB_SIZE * sizeof(float)); // restore the history saved by frame_erasure()
+
+            delay = e->prev_pitch_delay;
+            e->prev_pitch_delay = delay - e->frame.delay_diff + 16.0;
+
+            if (fabs(e->pitch_delay - delay) > 15)
+                delay = e->pitch_delay;
+
+            for (i = 0; i < NB_SUBFRAMES; i++) { // rebuild the excitation history; no output is produced here
+                int subframe_size = subframe_sizes[i];
+
+                interpolate_delay(idelay, delay, e->prev_pitch_delay, i);
+                acb_excitation(e, e->pitch + ACB_SIZE, e->avg_acb_gain, idelay, subframe_size);
+                memmove(e->pitch, e->pitch + subframe_size, ACB_SIZE * sizeof(float));
+            }
+        }
+
+        /* Smoothing of the decoded delay as per TIA/IS-127 5.2.2.5 */
+        if (fabs(e->pitch_delay - e->prev_pitch_delay) > 15)
+            e->prev_pitch_delay = e->pitch_delay;
+
+        e->avg_acb_gain = e->avg_fcb_gain = 0.0; // re-accumulated over the subframes below
+    } else {
+        idelay[0] = idelay[1] = idelay[2] = MIN_DELAY;
+
+        /* Decode frame energy vectors as per TIA/IS-127 5.7.2 */
+        for (i = 0; i < NB_SUBFRAMES; i++)
+            e->energy_vector[i] = pow(10, evrc_energy_quant[e->frame.energy_gain][i]);
+        e->prev_energy_gain = e->frame.energy_gain;
+    }
+
+    for (i = 0; i < NB_SUBFRAMES; i++) {
+        float tmp[SUBFRAME_SIZE + 6] = { 0 }; // starts at zero: the pulse decoders add into it
+        int subframe_size = subframe_sizes[i];
+        int pitch_lag;
+
+        interpolate_lsp(ilspf, e->lspf, e->prev_lspf, i);
+
+        if (e->bitrate != RATE_QUANT)
+            interpolate_delay(idelay, e->pitch_delay, e->prev_pitch_delay, i);
+
+        pitch_lag = lrintf((idelay[1] + idelay[0]) / 2.0);
+        decode_predictor_coeffs(ilspf, ilpc);
+
+        /* Bandwidth expansion as per TIA/IS-127 5.2.3.3 */
+        if (e->frame.lpc_flag && e->prev_error_flag)
+            bandwidth_expansion(ilpc, ilpc, 0.75);
+
+        if (e->bitrate != RATE_QUANT) {
+            float acb_sum, f;
+
+            f = exp((e->bitrate == RATE_HALF ? 0.5 : 0.25) // fixed codebook gain from its index
+                         * (e->frame.fcb_gain[i] + 1));
+            acb_sum = pitch_gain_vq[e->frame.acb_gain[i]]; // adaptive codebook gain from its index
+            e->avg_acb_gain += acb_sum / NB_SUBFRAMES;
+            e->avg_fcb_gain += f / NB_SUBFRAMES;
+
+            acb_excitation(e, e->pitch + ACB_SIZE,
+                           acb_sum, idelay, subframe_size);
+            fcb_excitation(e, e->frame.fcb_shape[i], tmp,
+                           acb_sum, pitch_lag, subframe_size);
+
+            /* Total excitation generation as per TIA/IS-127 5.2.3.9 */
+            for (j = 0; j < subframe_size; j++)
+                e->pitch[ACB_SIZE + j] += f * tmp[j];
+            e->fade_scale = FFMIN(e->fade_scale + 0.2, 1.0); // recover from the fade applied during erasures
+        } else {
+            for (j = 0; j < subframe_size; j++)
+                e->pitch[ACB_SIZE + j] = e->energy_vector[i];
+        }
+
+        memmove(e->pitch, e->pitch + subframe_size, ACB_SIZE * sizeof(float)); // update the history; samples at ACB_SIZE.. stay in place
+
+        synthesis_filter(e->pitch + ACB_SIZE, ilpc,
+                         e->synthesis, subframe_size,
+                         e->postfilter ? tmp : samples); // tmp is reused as the postfilter input
+        if (e->postfilter)
+            postfilter(e, tmp, ilpc, samples, pitch_lag,
+                       &postfilter_coeffs[e->bitrate], subframe_size);
+
+        samples += subframe_size;
+    }
+
+    if (error_flag) { // always false on normal fall-through; the block is entered only via the label
+erasure:
+        error_flag = 1;
+        av_log(avctx, AV_LOG_WARNING, "frame erasure\n");
+        frame_erasure(e, samples); // every goto happens before samples is advanced
+    }
+
+    memcpy(e->prev_lspf, e->lspf, sizeof(e->prev_lspf));
+    e->prev_error_flag    = error_flag;
+    e->last_valid_bitrate = e->bitrate; // after an erasure this is the rate chosen by frame_erasure()
+
+    if (e->bitrate != RATE_QUANT)
+        e->prev_pitch_delay = e->pitch_delay;
+
+    samples = (float *)frame->data[0];
+    for (i = 0; i < 160; i++)
+        samples[i] /= 32768; // scale down the synthesized samples
+
+    *got_frame_ptr   = 1;
+
+    return avpkt->size;
+}
+
+#define OFFSET(x) offsetof(EVRCContext, x)
+#define AD (AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_DECODING_PARAM) /* parenthesized so AD is safe inside any expression */
+
+static const AVOption options[] = { /* private decoder options */
+    { "postfilter", "enable postfilter", OFFSET(postfilter), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, AD },
+    { NULL }
+};
+
+static const AVClass evrcdec_class = { /* AVClass that attaches the private options to the decoder */
+    .class_name = "evrc",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_evrc_decoder = { /* codec registration entry */
+    .name           = "evrc",
+    .long_name      = NULL_IF_CONFIG_SMALL("EVRC (Enhanced Variable Rate Codec)"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_EVRC,
+    .init           = evrc_decode_init,
+    .decode         = evrc_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .priv_data_size = sizeof(EVRCContext),
+    .priv_class     = &evrcdec_class, /* exposes the "postfilter" option */
+};
diff --git a/libavcodec/exif.c b/libavcodec/exif.c
new file mode 100644
index 0000000..07ce174
--- /dev/null
+++ b/libavcodec/exif.c
@@ -0,0 +1,142 @@
+/*
+ * EXIF metadata parser
+ * Copyright (c) 2013 Thilo Borgmann <thilo.borgmann _at_ mail.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * EXIF metadata parser
+ * @author Thilo Borgmann <thilo.borgmann _at_ mail.de>
+ */
+
+#include "exif.h"
+
+
+static const char *exif_get_tag_name(uint16_t id)
+{
+    const struct exif_tag *entry     = tag_list;
+    const struct exif_tag *const end = tag_list + FF_ARRAY_ELEMS(tag_list);
+
+    for (; entry < end; entry++) /* linear scan over the whole tag table */
+        if (entry->id == id)
+            return entry->name;  /* first entry with a matching id wins */
+
+    return NULL;                 /* id is not present in tag_list */
+}
+
+
+static int exif_add_metadata(void *logctx, int count, int type,
+                             const char *name, const char *sep,
+                             GetByteContext *gb, int le,
+                             AVDictionary **metadata)
+{
+    switch(type) { /* pick the ff_tadd_*_metadata helper matching the TIFF field type */
+    case 0:        /* type 0 is reported and the entry skipped; not treated as an error */
+        av_log(logctx, AV_LOG_WARNING,
+               "Invalid TIFF tag type 0 found for %s with size %d\n",
+               name, count);
+        return 0;
+    case TIFF_DOUBLE   : return ff_tadd_doubles_metadata(count, name, sep, gb, le, metadata);
+    case TIFF_SSHORT   : return ff_tadd_shorts_metadata(count, name, sep, gb, le, 1, metadata);
+    case TIFF_SHORT    : return ff_tadd_shorts_metadata(count, name, sep, gb, le, 0, metadata);
+    case TIFF_SBYTE    : return ff_tadd_bytes_metadata(count, name, sep, gb, le, 1, metadata);
+    case TIFF_BYTE     :
+    case TIFF_UNDEFINED: return ff_tadd_bytes_metadata(count, name, sep, gb, le, 0, metadata);
+    case TIFF_STRING   : return ff_tadd_string_metadata(count, name, gb, le, metadata);
+    case TIFF_SRATIONAL:
+    case TIFF_RATIONAL : return ff_tadd_rational_metadata(count, name, sep, gb, le, metadata);
+    case TIFF_SLONG    :
+    case TIFF_LONG     : return ff_tadd_long_metadata(count, name, sep, gb, le, metadata);
+    default:       /* any other type: request a sample and skip the entry */
+        avpriv_request_sample(logctx, "TIFF tag type (%u)", (unsigned)type); /* cast matches %u */
+        return 0;
+    }
+}
+
+
+/**
+ * Decode one IFD entry: descend into a sub-IFD, or store the tag's values in
+ * metadata. Returns that step's result, or 0 when the entry is skipped.
+ */
+static int exif_decode_tag(void *logctx, GetByteContext *gbytes, int le,
+                           int depth, AVDictionary **metadata)
+{
+    int ret, cur_pos;
+    unsigned id, count;
+    enum TiffTypes type;
+
+    /* Refuse to nest deeper than the limit declared in exif.h. */
+    if (depth > EXIF_MAX_IFD_RECURSION)
+        return 0;
+
+    /* Parse the entry header; cur_pos is the position reading resumes at. */
+    ff_tread_tag(gbytes, le, &id, &type, &count, &cur_pos);
+
+    /* Reader sits at position 0 after the header: restore and skip the entry. */
+    if (!bytestream2_tell(gbytes)) {
+        bytestream2_seek(gbytes, cur_pos, SEEK_SET);
+        return 0;
+    }
+
+    /* A tag that ff_tis_ifd() identifies as an IFD is descended into; for any
+     * other tag the count values are read and added to the metadata. */
+    if (ff_tis_ifd(id)) {
+        ret = avpriv_exif_decode_ifd(logctx, gbytes, le, depth + 1, metadata);
+    } else {
+        const char *name = exif_get_tag_name(id);
+        char hex_name[7]; /* room for "0xXXXX" plus NUL; snprintf truncates beyond that */
+
+        /* Ids missing from tag_list are stored under their hexadecimal value. */
+        if (!name) {
+            snprintf(hex_name, sizeof(hex_name), "0x%04X", id);
+            name = hex_name;
+        }
+
+        ret = exif_add_metadata(logctx, count, type, name, NULL,
+                                gbytes, le, metadata);
+    }
+
+    /* Whatever was consumed above, continue at the position saved for this entry. */
+    bytestream2_seek(gbytes, cur_pos, SEEK_SET);
+
+    return ret;
+}
+
+
+int avpriv_exif_decode_ifd(void *logctx, GetByteContext *gbytes, int le,
+                           int depth, AVDictionary **metadata)
+{
+    const int nb_entries = ff_tget_short(gbytes, le); /* entry count read from the IFD start */
+    int remaining;
+
+    /* every entry is budgeted at 12 bytes of the remaining input */
+    if (nb_entries * 12 > bytestream2_get_bytes_left(gbytes))
+        return AVERROR_INVALIDDATA;
+
+    for (remaining = nb_entries; remaining > 0; remaining--) {
+        int err = exif_decode_tag(logctx, gbytes, le, depth, metadata);
+
+        if (err < 0)
+            return err;
+    }
+
+    /* the value after the entries is returned unchanged: the next IFD offset,
+     * 0x000000000, or a value < 0 which callers treat as failure */
+    return ff_tget_long(gbytes, le);
+}
diff --git a/libavcodec/exif.h b/libavcodec/exif.h
new file mode 100644
index 0000000..5f09208
--- /dev/null
+++ b/libavcodec/exif.h
@@ -0,0 +1,170 @@
+/*
+ * EXIF metadata parser
+ * Copyright (c) 2013 Thilo Borgmann <thilo.borgmann _at_ mail.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * EXIF metadata parser
+ * @author Thilo Borgmann <thilo.borgmann _at_ mail.de>
+ */
+
+#ifndef AVCODEC_EXIF_H
+#define AVCODEC_EXIF_H
+
+#include "avcodec.h"
+#include "bytestream.h"
+#include "tiff.h"
+
+#define EXIF_MAX_IFD_RECURSION 2
+#define EXIF_TAG_NAME_LENGTH   32
+
+struct exif_tag { /* one id -> name mapping of the tag table below */
+    char      name[EXIF_TAG_NAME_LENGTH]; /* tag name, NUL-terminated within the fixed-size array */
+    uint16_t  id;                         /* numeric tag id compared by exif_get_tag_name() */
+};
+
+static const struct exif_tag tag_list[] = { // tag id -> name, grouped by table of the JEITA CP-3451 EXIF specification (not sorted by id):
+    {"GPSVersionID",               0x00}, // <- Table 12 GPS Attribute Information
+    {"GPSLatitudeRef",             0x01},
+    {"GPSLatitude",                0x02},
+    {"GPSLongitudeRef",            0x03},
+    {"GPSLongitude",               0x04},
+    {"GPSAltitudeRef",             0x05},
+    {"GPSAltitude",                0x06},
+    {"GPSTimeStamp",               0x07},
+    {"GPSSatellites",              0x08},
+    {"GPSStatus",                  0x09},
+    {"GPSMeasureMode",             0x0A},
+    {"GPSDOP",                     0x0B},
+    {"GPSSpeedRef",                0x0C},
+    {"GPSSpeed",                   0x0D},
+    {"GPSTrackRef",                0x0E},
+    {"GPSTrack",                   0x0F},
+    {"GPSImgDirectionRef",         0x10},
+    {"GPSImgDirection",            0x11},
+    {"GPSMapDatum",                0x12},
+    {"GPSDestLatitudeRef",         0x13},
+    {"GPSDestLatitude",            0x14},
+    {"GPSDestLongitudeRef",        0x15},
+    {"GPSDestLongitude",           0x16},
+    {"GPSDestBearingRef",          0x17},
+    {"GPSDestBearing",             0x18},
+    {"GPSDestDistanceRef",         0x19},
+    {"GPSDestDistance",            0x1A},
+    {"GPSProcessingMethod",        0x1B},
+    {"GPSAreaInformation",         0x1C},
+    {"GPSDateStamp",               0x1D},
+    {"GPSDifferential",            0x1E},
+    {"ImageWidth",                 0x100}, // <- Table 3 TIFF Rev. 6.0 Attribute Information Used in Exif
+    {"ImageLength",                0x101},
+    {"BitsPerSample",              0x102},
+    {"Compression",                0x103},
+    {"PhotometricInterpretation",  0x106},
+    {"Orientation",                0x112},
+    {"SamplesPerPixel",            0x115},
+    {"PlanarConfiguration",        0x11C},
+    {"YCbCrSubSampling",           0x212},
+    {"YCbCrPositioning",           0x213},
+    {"XResolution",                0x11A},
+    {"YResolution",                0x11B},
+    {"ResolutionUnit",             0x128},
+    {"StripOffsets",               0x111},
+    {"RowsPerStrip",               0x116},
+    {"StripByteCounts",            0x117},
+    {"JPEGInterchangeFormat",      0x201},
+    {"JPEGInterchangeFormatLength",0x202},
+    {"TransferFunction",           0x12D},
+    {"WhitePoint",                 0x13E},
+    {"PrimaryChromaticities",      0x13F},
+    {"YCbCrCoefficients",          0x211},
+    {"ReferenceBlackWhite",        0x214},
+    {"DateTime",                   0x132},
+    {"ImageDescription",           0x10E},
+    {"Make",                       0x10F},
+    {"Model",                      0x110},
+    {"Software",                   0x131},
+    {"Artist",                     0x13B},
+    {"Copyright",                  0x8298},
+    {"ExifVersion",                0x9000}, // <- Table 4 Exif IFD Attribute Information (1)
+    {"FlashpixVersion",            0xA000},
+    {"ColorSpace",                 0xA001},
+    {"ComponentsConfiguration",    0x9101},
+    {"CompressedBitsPerPixel",     0x9102},
+    {"PixelXDimension",            0xA002},
+    {"PixelYDimension",            0xA003},
+    {"MakerNote",                  0x927C},
+    {"UserComment",                0x9286},
+    {"RelatedSoundFile",           0xA004},
+    {"DateTimeOriginal",           0x9003},
+    {"DateTimeDigitized",          0x9004},
+    {"SubSecTime",                 0x9290},
+    {"SubSecTimeOriginal",         0x9291},
+    {"SubSecTimeDigitized",        0x9292},
+    {"ImageUniqueID",              0xA420},
+    {"ExposureTime",               0x829A}, // <- Table 5 Exif IFD Attribute Information (2)
+    {"FNumber",                    0x829D},
+    {"ExposureProgram",            0x8822},
+    {"SpectralSensitivity",        0x8824},
+    {"ISOSpeedRatings",            0x8827},
+    {"OECF",                       0x8828},
+    {"ShutterSpeedValue",          0x9201},
+    {"ApertureValue",              0x9202},
+    {"BrightnessValue",            0x9203},
+    {"ExposureBiasValue",          0x9204},
+    {"MaxApertureValue",           0x9205},
+    {"SubjectDistance",            0x9206},
+    {"MeteringMode",               0x9207},
+    {"LightSource",                0x9208},
+    {"Flash",                      0x9209},
+    {"FocalLength",                0x920A},
+    {"SubjectArea",                0x9214},
+    {"FlashEnergy",                0xA20B},
+    {"SpatialFrequencyResponse",   0xA20C},
+    {"FocalPlaneXResolution",      0xA20E},
+    {"FocalPlaneYResolution",      0xA20F},
+    {"FocalPlaneResolutionUnit",   0xA210},
+    {"SubjectLocation",            0xA214},
+    {"ExposureIndex",              0xA215},
+    {"SensingMethod",              0xA217},
+    {"FileSource",                 0xA300},
+    {"SceneType",                  0xA301},
+    {"CFAPattern",                 0xA302},
+    {"CustomRendered",             0xA401},
+    {"ExposureMode",               0xA402},
+    {"WhiteBalance",               0xA403},
+    {"DigitalZoomRatio",           0xA404},
+    {"FocalLengthIn35mmFilm",      0xA405},
+    {"SceneCaptureType",           0xA406},
+    {"GainControl",                0xA407},
+    {"Contrast",                   0xA408},
+    {"Saturation",                 0xA409},
+    {"Sharpness",                  0xA40A},
+    {"DeviceSettingDescription",   0xA40B},
+    {"SubjectDistanceRange",       0xA40C}
+//    {"InteroperabilityIndex",      0x1}, // <- Table 13 Interoperability IFD Attribute Information (disabled entry; id 0x1 is also GPSLatitudeRef above)
+//    {"",                           0x0}  // disabled entry; id 0x0 is also GPSVersionID above
+}; // NOTE(review): static definition in a header, so each file including exif.h gets its own copy
+
+/** Recursively decodes all IFDs and adds the tags they contain to the
+ *  metadata dictionary. Returns the next IFD offset, or a value < 0 on failure. */
+int avpriv_exif_decode_ifd(void *logctx, GetByteContext *gbytes, int le,
+                           int depth, AVDictionary **metadata);
+
+#endif /* AVCODEC_EXIF_H */
diff --git a/libavcodec/exr.c b/libavcodec/exr.c
index d10841d..75ada24 100644
--- a/libavcodec/exr.c
+++ b/libavcodec/exr.c
@@ -1,21 +1,24 @@
 /*
  * OpenEXR (.exr) image decoder
+ * Copyright (c) 2006 Industrial Light & Magic, a division of Lucas Digital Ltd. LLC
  * Copyright (c) 2009 Jimmy Christensen
  *
- * This file is part of Libav
+ * B44/B44A, Tile added by Jokyo Images support by CNC - French National Center for Cinema
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,9 +37,11 @@
 #include <float.h>
 #include <zlib.h>
 
+#include "libavutil/common.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/intfloat.h"
 #include "libavutil/opt.h"
+#include "libavutil/color_utils.h"
 
 #include "avcodec.h"
 #include "bytestream.h"
@@ -64,11 +69,31 @@ enum ExrPixelType {
     EXR_UNKNOWN,
 };
 
+enum ExrTileLevelMode { /* value type of EXRTileAttribute.level_mode */
+    EXR_TILE_LEVEL_ONE,
+    EXR_TILE_LEVEL_MIPMAP,
+    EXR_TILE_LEVEL_RIPMAP,
+    EXR_TILE_LEVEL_UNKNOWN, /* presumably the fallback for an unrecognized mode; confirm in the header parser */
+};
+
+enum ExrTileLevelRound { /* value type of EXRTileAttribute.level_round */
+    EXR_TILE_ROUND_UP,
+    EXR_TILE_ROUND_DOWN,
+    EXR_TILE_ROUND_UNKNOWN, /* presumably the fallback for an unrecognized rounding mode; confirm in the header parser */
+};
+
 typedef struct EXRChannel {
     int xsub, ysub;
     enum ExrPixelType pixel_type;
 } EXRChannel;
 
+typedef struct EXRTileAttribute { /* stored as EXRContext.tile_attr (header data attribute of tile) */
+    int32_t xSize; /* decode_block() multiplies this by the tile X index to get the start column */
+    int32_t ySize; /* decode_block() multiplies this by the tile Y index to get the start line */
+    enum ExrTileLevelMode level_mode;
+    enum ExrTileLevelRound level_round;
+} EXRTileAttribute;
+
 typedef struct EXRThreadData {
     uint8_t *uncompressed_data;
     int uncompressed_size;
@@ -78,6 +103,10 @@ typedef struct EXRThreadData {
 
     uint8_t *bitmap;
     uint16_t *lut;
+
+    int ysize, xsize;
+
+    int channel_line_size;
 } EXRThreadData;
 
 typedef struct EXRContext {
@@ -94,22 +123,25 @@ typedef struct EXRContext {
     uint32_t xmax, xmin;
     uint32_t ymax, ymin;
     uint32_t xdelta, ydelta;
-    int ysize;
 
-    uint64_t scan_line_size;
     int scan_lines_per_block;
 
+    EXRTileAttribute tile_attr; /* header data attribute of tile */
+    int is_tile; /* 0 if scanline, 1 if tile */
+
     GetByteContext gb;
     const uint8_t *buf;
     int buf_size;
 
     EXRChannel *channels;
     int nb_channels;
+    int current_channel_offset;
 
     EXRThreadData *thread_data;
 
     const char *layer;
 
+    enum AVColorTransferCharacteristic apply_trc_type;
     float gamma;
     uint16_t gamma_table[65536];
 } EXRContext;
@@ -490,7 +522,8 @@ static int huf_decode(const uint64_t *hcode, const HufDec *hdecod,
     uint16_t *outb    = out;
     uint16_t *oe      = out + no;
     const uint8_t *ie = gb->buffer + (nbits + 7) / 8; // input byte size
-    uint8_t cs, s;
+    uint8_t cs;
+    uint16_t s;
     int i, lc = 0;
 
     while (gb->buffer < ie) {
@@ -722,8 +755,8 @@ static int piz_uncompress(EXRContext *s, const uint8_t *src, int ssize,
     if (!td->lut)
         td->lut = av_malloc(1 << 17);
     if (!td->bitmap || !td->lut) {
-        av_free(td->bitmap);
-        av_free(td->lut);
+        av_freep(&td->bitmap);
+        av_freep(&td->lut);
         return AVERROR(ENOMEM);
     }
 
@@ -738,7 +771,7 @@ static int piz_uncompress(EXRContext *s, const uint8_t *src, int ssize,
     if (min_non_zero <= max_non_zero)
         bytestream2_get_buffer(&gb, td->bitmap + min_non_zero,
                                max_non_zero - min_non_zero + 1);
-    memset(td->bitmap + max_non_zero, 0, BITMAP_SIZE - max_non_zero);
+    memset(td->bitmap + max_non_zero + 1, 0, BITMAP_SIZE - max_non_zero - 1);
 
     maxval = reverse_lut(td->bitmap, td->lut);
 
@@ -752,19 +785,19 @@ static int piz_uncompress(EXRContext *s, const uint8_t *src, int ssize,
         int size = channel->pixel_type;
 
         for (j = 0; j < size; j++)
-            wav_decode(ptr + j, s->xdelta, size, s->ysize,
-                       s->xdelta * size, maxval);
-        ptr += s->xdelta * s->ysize * size;
+            wav_decode(ptr + j, td->xsize, size, td->ysize,
+                       td->xsize * size, maxval);
+        ptr += td->xsize * td->ysize * size;
     }
 
     apply_lut(td->lut, tmp, dsize / sizeof(uint16_t));
 
     out = td->uncompressed_data;
-    for (i = 0; i < s->ysize; i++)
+    for (i = 0; i < td->ysize; i++)
         for (j = 0; j < s->nb_channels; j++) {
-            uint16_t *in = tmp + j * s->xdelta * s->ysize + i * s->xdelta;
-            memcpy(out, in, s->xdelta * 2);
-            out += s->xdelta * 2;
+            uint16_t *in = tmp + j * td->xsize * td->ysize + i * td->xsize;
+            memcpy(out, in, td->xsize * 2);
+            out += td->xsize * 2;
         }
 
     return 0;
@@ -774,17 +807,31 @@ static int pxr24_uncompress(EXRContext *s, const uint8_t *src,
                             int compressed_size, int uncompressed_size,
                             EXRThreadData *td)
 {
-    unsigned long dest_len = uncompressed_size;
+    unsigned long dest_len, expected_len = 0;
     const uint8_t *in = td->tmp;
     uint8_t *out;
     int c, i, j;
 
-    if (uncompress(td->tmp, &dest_len, src, compressed_size) != Z_OK ||
-        dest_len != uncompressed_size)
+    for (i = 0; i < s->nb_channels; i++) {
+        if (s->channels[i].pixel_type == EXR_FLOAT) {
+            expected_len += (td->xsize * td->ysize * 3);/* PRX 24 store float in 24 bit instead of 32 */
+        } else if (s->channels[i].pixel_type == EXR_HALF) {
+            expected_len += (td->xsize * td->ysize * 2);
+        } else {//UINT 32
+            expected_len += (td->xsize * td->ysize * 4);
+        }
+    }
+
+    dest_len = expected_len;
+
+    if (uncompress(td->tmp, &dest_len, src, compressed_size) != Z_OK) {
+        return AVERROR_INVALIDDATA;
+    } else if (dest_len != expected_len) {
         return AVERROR_INVALIDDATA;
+    }
 
     out = td->uncompressed_data;
-    for (i = 0; i < s->ysize; i++)
+    for (i = 0; i < td->ysize; i++)
         for (c = 0; c < s->nb_channels; c++) {
             EXRChannel *channel = &s->channels[c];
             const uint8_t *ptr[4];
@@ -793,11 +840,11 @@ static int pxr24_uncompress(EXRContext *s, const uint8_t *src,
             switch (channel->pixel_type) {
             case EXR_FLOAT:
                 ptr[0] = in;
-                ptr[1] = ptr[0] + s->xdelta;
-                ptr[2] = ptr[1] + s->xdelta;
-                in     = ptr[2] + s->xdelta;
+                ptr[1] = ptr[0] + td->xsize;
+                ptr[2] = ptr[1] + td->xsize;
+                in     = ptr[2] + td->xsize;
 
-                for (j = 0; j < s->xdelta; ++j) {
+                for (j = 0; j < td->xsize; ++j) {
                     uint32_t diff = (*(ptr[0]++) << 24) |
                                     (*(ptr[1]++) << 16) |
                                     (*(ptr[2]++) << 8);
@@ -807,9 +854,9 @@ static int pxr24_uncompress(EXRContext *s, const uint8_t *src,
                 break;
             case EXR_HALF:
                 ptr[0] = in;
-                ptr[1] = ptr[0] + s->xdelta;
-                in     = ptr[1] + s->xdelta;
-                for (j = 0; j < s->xdelta; j++) {
+                ptr[1] = ptr[0] + td->xsize;
+                in     = ptr[1] + td->xsize;
+                for (j = 0; j < td->xsize; j++) {
                     uint32_t diff = (*(ptr[0]++) << 8) | *(ptr[1]++);
 
                     pixel += diff;
@@ -824,6 +871,132 @@ static int pxr24_uncompress(EXRContext *s, const uint8_t *src,
     return 0;
 }
 
+static void unpack_14(const uint8_t b[14], uint16_t s[16])
+{
+    unsigned short shift = (b[ 2] >> 2) & 15; /* 6-bit field, masked so the shifts below stay defined for any input */
+    unsigned short bias = (0x20 << shift);
+    int i;
+    /* s[0] is stored as is; each other sample is a biased 6-bit delta, scaled by shift, added to an earlier sample */
+    s[ 0] = (b[0] << 8) | b[1];
+
+    s[ 4] = s[ 0] + ((((b[ 2] << 4) | (b[ 3] >> 4)) & 0x3f) << shift) - bias;
+    s[ 8] = s[ 4] + ((((b[ 3] << 2) | (b[ 4] >> 6)) & 0x3f) << shift) - bias;
+    s[12] = s[ 8] +   ((b[ 4]                       & 0x3f) << shift) - bias;
+
+    s[ 1] = s[ 0] +   ((b[ 5] >> 2)                         << shift) - bias;
+    s[ 5] = s[ 4] + ((((b[ 5] << 4) | (b[ 6] >> 4)) & 0x3f) << shift) - bias;
+    s[ 9] = s[ 8] + ((((b[ 6] << 2) | (b[ 7] >> 6)) & 0x3f) << shift) - bias;
+    s[13] = s[12] +   ((b[ 7]                       & 0x3f) << shift) - bias;
+
+    s[ 2] = s[ 1] +   ((b[ 8] >> 2)                         << shift) - bias;
+    s[ 6] = s[ 5] + ((((b[ 8] << 4) | (b[ 9] >> 4)) & 0x3f) << shift) - bias;
+    s[10] = s[ 9] + ((((b[ 9] << 2) | (b[10] >> 6)) & 0x3f) << shift) - bias;
+    s[14] = s[13] +   ((b[10]                       & 0x3f) << shift) - bias;
+
+    s[ 3] = s[ 2] +   ((b[11] >> 2)                         << shift) - bias;
+    s[ 7] = s[ 6] + ((((b[11] << 4) | (b[12] >> 4)) & 0x3f) << shift) - bias;
+    s[11] = s[10] + ((((b[12] << 2) | (b[13] >> 6)) & 0x3f) << shift) - bias;
+    s[15] = s[14] +   ((b[13]                       & 0x3f) << shift) - bias;
+
+    for (i = 0; i < 16; ++i) { /* samples with the top bit set drop it, all others are bitwise inverted */
+        if (s[i] & 0x8000)
+            s[i] &= 0x7fff;
+        else
+            s[i] = ~s[i];
+    }
+}
+
+static void unpack_3(const uint8_t b[3], uint16_t s[16])
+{
+    /* A 3-byte block carries a single big-endian 16-bit value in b[0..1];
+     * b[2] is not read here. */
+    uint16_t value = (b[0] << 8) | b[1];
+    int n = 16;
+
+    /* Top bit set: clear it. Otherwise: invert all bits. */
+    value = (value & 0x8000) ? (value & 0x7fff) : (uint16_t)~value;
+
+    /* Replicate the one value into all 16 output samples. */
+    while (n-- > 0)
+        s[n] = value;
+}
+
+
+/* Decode B44/B44A data into td->uncompressed_data: HALF channels as 4x4 blocks, other channel types copied raw. */
+static int b44_uncompress(EXRContext *s, const uint8_t *src, int compressed_size,
+                          int uncompressed_size, EXRThreadData *td) {
+    const uint8_t *sr = src; /* read cursor; unsigned to match unpack_3()/unpack_14() */
+    int stayToUncompress = compressed_size; /* input bytes not consumed yet */
+    int nbB44BlockW, nbB44BlockH;
+    int indexHgX, indexHgY, indexOut, indexTmp;
+    uint16_t tmpBuffer[16]; /* B44 use 4x4 half float pixel */
+    int c, iY, iX, y, x;
+    int target_channel_offset = 0; /* bytes per pixel summed over the channels already decoded */
+
+    /* calc B44 block count */
+    nbB44BlockW = td->xsize / 4;
+    if ((td->xsize % 4) != 0)
+        nbB44BlockW++;
+
+    nbB44BlockH = td->ysize / 4;
+    if ((td->ysize % 4) != 0)
+        nbB44BlockH++;
+
+    for (c = 0; c < s->nb_channels; c++) {
+        if (s->channels[c].pixel_type != EXR_HALF) {/* Float or UINT 32 channel */
+            /* not B44 coded: check the input holds the whole channel, then copy it row by row */
+            int64_t raw_size = (int64_t)td->xsize * td->ysize * 4;
+            if (stayToUncompress < raw_size) {
+                av_log(s, AV_LOG_ERROR, "Not enough data for uncompress channel: %d\n", stayToUncompress);
+                return AVERROR_INVALIDDATA;
+            }
+            for (y = 0; y < td->ysize; y++) {
+                indexOut = target_channel_offset * td->xsize + y * td->channel_line_size;
+                memcpy(&td->uncompressed_data[indexOut], sr, td->xsize * 4);
+                sr += td->xsize * 4;
+            }
+            stayToUncompress -= raw_size;
+            target_channel_offset += 4;
+            continue;
+        }
+        for (iY = 0; iY < nbB44BlockH; iY++) {
+            for (iX = 0; iX < nbB44BlockW; iX++) {/* For each B44 block */
+                if (stayToUncompress < 3) {
+                    av_log(s, AV_LOG_ERROR, "Not enough data for B44A block: %d\n", stayToUncompress);
+                    return AVERROR_INVALIDDATA;
+                }
+                if (sr[2] == 0xfc) { /* B44A block */
+                    unpack_3(sr, tmpBuffer);
+                    sr += 3;
+                    stayToUncompress -= 3;
+                } else {/* B44 Block */
+                    if (stayToUncompress < 14) {
+                        av_log(s, AV_LOG_ERROR, "Not enough data for B44 block: %d\n", stayToUncompress);
+                        return AVERROR_INVALIDDATA;
+                    }
+                    unpack_14(sr, tmpBuffer);
+                    sr += 14;
+                    stayToUncompress -= 14;
+                }
+                /* copy data to uncompress buffer (B44 block can exceed target resolution)*/
+                indexHgX = iX * 4;
+                indexHgY = iY * 4;
+                for (y = indexHgY; y < FFMIN(indexHgY + 4, td->ysize); y++) {
+                    for (x = indexHgX; x < FFMIN(indexHgX + 4, td->xsize); x++) {
+                        indexOut = target_channel_offset * td->xsize + y * td->channel_line_size + 2 * x;
+                        indexTmp = (y-indexHgY) * 4 + (x-indexHgX);
+                        td->uncompressed_data[indexOut] = tmpBuffer[indexTmp] & 0xff;
+                        td->uncompressed_data[indexOut + 1] = tmpBuffer[indexTmp] >> 8;
+                    }
+                }
+            }
+        }
+        target_channel_offset += 2;
+    }
+
+    return 0;
+}
+
 static int decode_block(AVCodecContext *avctx, void *tdata,
                         int jobnr, int threadnr)
 {
@@ -833,45 +1006,94 @@ static int decode_block(AVCodecContext *avctx, void *tdata,
     const uint8_t *channel_buffer[4] = { 0 };
     const uint8_t *buf = s->buf;
     uint64_t line_offset, uncompressed_size;
-    uint32_t xdelta = s->xdelta;
     uint16_t *ptr_x;
     uint8_t *ptr;
-    uint32_t data_size, line;
+    uint32_t data_size, line, col = 0;
+    uint32_t tileX, tileY, tileLevelX, tileLevelY;
     const uint8_t *src;
-    int axmax = (avctx->width - (s->xmax + 1)) * 2 * s->desc->nb_components;
-    int bxmin = s->xmin * 2 * s->desc->nb_components;
+    int axmax = (avctx->width - (s->xmax + 1)) * 2 * s->desc->nb_components; /* nb pixel to add at the right of the datawindow */
+    int bxmin = s->xmin * 2 * s->desc->nb_components; /* nb pixel to add at the left of the datawindow */
     int i, x, buf_size = s->buf_size;
     float one_gamma = 1.0f / s->gamma;
+    avpriv_trc_function trc_func = avpriv_get_trc_function_from_trc(s->apply_trc_type);
     int ret;
 
     line_offset = AV_RL64(s->gb.buffer + jobnr * 8);
-    // Check if the buffer has the required bytes needed from the offset
-    if (line_offset > buf_size - 8)
-        return AVERROR_INVALIDDATA;
 
-    src  = buf + line_offset + 8;
-    line = AV_RL32(src - 8);
-    if (line < s->ymin || line > s->ymax)
-        return AVERROR_INVALIDDATA;
+    if (s->is_tile) {
+        if (line_offset > buf_size - 20)
+            return AVERROR_INVALIDDATA;
 
-    data_size = AV_RL32(src - 4);
-    if (data_size <= 0 || data_size > buf_size)
-        return AVERROR_INVALIDDATA;
+        src  = buf + line_offset + 20;
 
-    s->ysize          = FFMIN(s->scan_lines_per_block, s->ymax - line + 1);
-    uncompressed_size = s->scan_line_size * s->ysize;
-    if ((s->compression == EXR_RAW && (data_size != uncompressed_size ||
-                                 line_offset > buf_size - uncompressed_size)) ||
-        (s->compression != EXR_RAW && (data_size > uncompressed_size ||
-                                 line_offset > buf_size - data_size))) {
-        return AVERROR_INVALIDDATA;
+        tileX = AV_RL32(src - 20);
+        tileY = AV_RL32(src - 16);
+        tileLevelX = AV_RL32(src - 12);
+        tileLevelY = AV_RL32(src - 8);
+
+        data_size = AV_RL32(src - 4);
+        if (data_size <= 0 || data_size > buf_size)
+            return AVERROR_INVALIDDATA;
+
+        if (tileLevelX || tileLevelY) { /* tile level, is not the full res level */
+            avpriv_report_missing_feature(s->avctx, "Subres tile before full res tile");
+            return AVERROR_PATCHWELCOME;
+        }
+
+        line = s->tile_attr.ySize * tileY;
+        col = s->tile_attr.xSize * tileX;
+
+        td->ysize = FFMIN(s->tile_attr.ySize, s->ydelta - tileY * s->tile_attr.ySize);
+        td->xsize = FFMIN(s->tile_attr.xSize, s->xdelta - tileX * s->tile_attr.xSize);
+
+        if (col) { /* not the first tile of the line */
+            bxmin = 0; /* doesn't add pixel at the left of the datawindow */
+        }
+
+        if ((col + td->xsize) != s->xdelta)/* not the last tile of the line */
+            axmax = 0; /* doesn't add pixel at the right of the datawindow */
+
+        td->channel_line_size = td->xsize * s->current_channel_offset;/* uncompress size of one line */
+        uncompressed_size = td->channel_line_size * (uint64_t)td->ysize;/* uncompress size of the block */
+    } else {
+        if (line_offset > buf_size - 8)
+            return AVERROR_INVALIDDATA;
+
+        src  = buf + line_offset + 8;
+        line = AV_RL32(src - 8);
+
+        if (line < s->ymin || line > s->ymax)
+            return AVERROR_INVALIDDATA;
+
+        data_size = AV_RL32(src - 4);
+        if (data_size <= 0 || data_size > buf_size)
+            return AVERROR_INVALIDDATA;
+
+        td->ysize          = FFMIN(s->scan_lines_per_block, s->ymax - line + 1); /* s->ydelta - line ?? */
+        td->xsize          = s->xdelta;
+
+        td->channel_line_size = td->xsize * s->current_channel_offset;/* uncompress size of one line */
+        uncompressed_size = td->channel_line_size * (uint64_t)td->ysize;/* uncompress size of the block */
+
+        if ((s->compression == EXR_RAW && (data_size != uncompressed_size ||
+                                           line_offset > buf_size - uncompressed_size)) ||
+            (s->compression != EXR_RAW && (data_size > uncompressed_size ||
+                                           line_offset > buf_size - data_size))) {
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    if (data_size < uncompressed_size || s->is_tile) { /* td->tmp is use for tile reorganization */
+        av_fast_padded_malloc(&td->tmp, &td->tmp_size, uncompressed_size);
+        if (!td->tmp)
+            return AVERROR(ENOMEM);
     }
 
     if (data_size < uncompressed_size) {
         av_fast_padded_malloc(&td->uncompressed_data,
                               &td->uncompressed_size, uncompressed_size);
-        av_fast_padded_malloc(&td->tmp, &td->tmp_size, uncompressed_size);
-        if (!td->uncompressed_data || !td->tmp)
+
+        if (!td->uncompressed_data)
             return AVERROR(ENOMEM);
 
         ret = AVERROR_INVALIDDATA;
@@ -888,6 +1110,11 @@ static int decode_block(AVCodecContext *avctx, void *tdata,
             break;
         case EXR_RLE:
             ret = rle_uncompress(src, data_size, uncompressed_size, td);
+            break;
+        case EXR_B44:
+        case EXR_B44A:
+            ret = b44_uncompress(s, src, data_size, uncompressed_size, td);
+            break;
         }
         if (ret < 0) {
             av_log(avctx, AV_LOG_ERROR, "decode_block() failed.\n");
@@ -896,16 +1123,17 @@ static int decode_block(AVCodecContext *avctx, void *tdata,
         src = td->uncompressed_data;
     }
 
-    channel_buffer[0] = src + xdelta * s->channel_offsets[0];
-    channel_buffer[1] = src + xdelta * s->channel_offsets[1];
-    channel_buffer[2] = src + xdelta * s->channel_offsets[2];
+    channel_buffer[0] = src + td->xsize * s->channel_offsets[0];
+    channel_buffer[1] = src + td->xsize * s->channel_offsets[1];
+    channel_buffer[2] = src + td->xsize * s->channel_offsets[2];
     if (s->channel_offsets[3] >= 0)
-        channel_buffer[3] = src + xdelta * s->channel_offsets[3];
+        channel_buffer[3] = src + td->xsize * s->channel_offsets[3];
+
+    ptr = p->data[0] + line * p->linesize[0] + (col * s->desc->nb_components * 2);
 
-    ptr = p->data[0] + line * p->linesize[0];
     for (i = 0;
-         i < s->scan_lines_per_block && line + i <= s->ymax;
-         i++, ptr += p->linesize[0]) {
+         i < td->ysize; i++, ptr += p->linesize[0]) {
+
         const uint8_t *r, *g, *b, *a;
 
         r = channel_buffer[0];
@@ -919,30 +1147,50 @@ static int decode_block(AVCodecContext *avctx, void *tdata,
         // Zero out the start if xmin is not 0
         memset(ptr_x, 0, bxmin);
         ptr_x += s->xmin * s->desc->nb_components;
+
         if (s->pixel_type == EXR_FLOAT) {
             // 32-bit
-            for (x = 0; x < xdelta; x++) {
-                union av_intfloat32 t;
-                t.i = bytestream_get_le32(&r);
-                if (t.f > 0.0f)  /* avoid negative values */
-                    t.f = powf(t.f, one_gamma);
-                *ptr_x++ = exr_flt2uint(t.i);
-
-                t.i = bytestream_get_le32(&g);
-                if (t.f > 0.0f)
-                    t.f = powf(t.f, one_gamma);
-                *ptr_x++ = exr_flt2uint(t.i);
-
-                t.i = bytestream_get_le32(&b);
-                if (t.f > 0.0f)
-                    t.f = powf(t.f, one_gamma);
-                *ptr_x++ = exr_flt2uint(t.i);
-                if (channel_buffer[3])
-                    *ptr_x++ = exr_flt2uint(bytestream_get_le32(&a));
+            if (trc_func) {
+                for (x = 0; x < td->xsize; x++) {
+                    union av_intfloat32 t;
+                    t.i = bytestream_get_le32(&r);
+                    t.f = trc_func(t.f);
+                    *ptr_x++ = exr_flt2uint(t.i);
+
+                    t.i = bytestream_get_le32(&g);
+                    t.f = trc_func(t.f);
+                    *ptr_x++ = exr_flt2uint(t.i);
+
+                    t.i = bytestream_get_le32(&b);
+                    t.f = trc_func(t.f);
+                    *ptr_x++ = exr_flt2uint(t.i);
+                    if (channel_buffer[3])
+                        *ptr_x++ = exr_flt2uint(bytestream_get_le32(&a));
+                }
+            } else {
+                for (x = 0; x < td->xsize; x++) {
+                    union av_intfloat32 t;
+                    t.i = bytestream_get_le32(&r);
+                    if (t.f > 0.0f)  /* avoid negative values */
+                        t.f = powf(t.f, one_gamma);
+                    *ptr_x++ = exr_flt2uint(t.i);
+
+                    t.i = bytestream_get_le32(&g);
+                    if (t.f > 0.0f)
+                        t.f = powf(t.f, one_gamma);
+                    *ptr_x++ = exr_flt2uint(t.i);
+
+                    t.i = bytestream_get_le32(&b);
+                    if (t.f > 0.0f)
+                        t.f = powf(t.f, one_gamma);
+                    *ptr_x++ = exr_flt2uint(t.i);
+                    if (channel_buffer[3])
+                        *ptr_x++ = exr_flt2uint(bytestream_get_le32(&a));
+                }
             }
         } else {
             // 16-bit
-            for (x = 0; x < xdelta; x++) {
+            for (x = 0; x < td->xsize; x++) {
                 *ptr_x++ = s->gamma_table[bytestream_get_le16(&r)];
                 *ptr_x++ = s->gamma_table[bytestream_get_le16(&g)];
                 *ptr_x++ = s->gamma_table[bytestream_get_le16(&b)];
@@ -954,11 +1202,11 @@ static int decode_block(AVCodecContext *avctx, void *tdata,
         // Zero out the end if xmax+1 is not w
         memset(ptr_x, 0, axmax);
 
-        channel_buffer[0] += s->scan_line_size;
-        channel_buffer[1] += s->scan_line_size;
-        channel_buffer[2] += s->scan_line_size;
+        channel_buffer[0] += td->channel_line_size;
+        channel_buffer[1] += td->channel_line_size;
+        channel_buffer[2] += td->channel_line_size;
         if (channel_buffer[3])
-            channel_buffer[3] += s->scan_line_size;
+            channel_buffer[3] += td->channel_line_size;
     }
 
     return 0;
@@ -1007,8 +1255,28 @@ static int check_header_variable(EXRContext *s,
 
 static int decode_header(EXRContext *s)
 {
-    int current_channel_offset = 0;
-    int magic_number, version, flags, i;
+    int magic_number, version, i, flags, sar = 0;
+    int layer_match = 0;
+
+    s->current_channel_offset = 0;
+    s->xmin               = ~0;
+    s->xmax               = ~0;
+    s->ymin               = ~0;
+    s->ymax               = ~0;
+    s->xdelta             = ~0;
+    s->ydelta             = ~0;
+    s->channel_offsets[0] = -1;
+    s->channel_offsets[1] = -1;
+    s->channel_offsets[2] = -1;
+    s->channel_offsets[3] = -1;
+    s->pixel_type         = EXR_UNKNOWN;
+    s->compression        = EXR_UNKN;
+    s->nb_channels        = 0;
+    s->w                  = 0;
+    s->h                  = 0;
+    s->tile_attr.xSize    = -1;
+    s->tile_attr.ySize    = -1;
+    s->is_tile            = 0;
 
     if (bytestream2_get_bytes_left(&s->gb) < 10) {
         av_log(s->avctx, AV_LOG_ERROR, "Header too short to parse.\n");
@@ -1030,8 +1298,13 @@ static int decode_header(EXRContext *s)
     }
 
     flags = bytestream2_get_le24(&s->gb);
-    if (flags & 0x02) {
-        avpriv_report_missing_feature(s->avctx, "Tile support");
+
+    if (flags == 0x00)
+        s->is_tile = 0;
+    else if (flags & 0x02)
+        s->is_tile = 1;
+    else{
+        avpriv_report_missing_feature(s->avctx, "flags %d", flags);
         return AVERROR_PATCHWELCOME;
     }
 
@@ -1054,31 +1327,39 @@ static int decode_header(EXRContext *s)
 
                 if (strcmp(s->layer, "") != 0) {
                     if (strncmp(ch_gb.buffer, s->layer, strlen(s->layer)) == 0) {
+                        layer_match = 1;
+                        av_log(s->avctx, AV_LOG_INFO,
+                               "Channel match layer : %s.\n", ch_gb.buffer);
                         ch_gb.buffer += strlen(s->layer);
                         if (*ch_gb.buffer == '.')
                             ch_gb.buffer++;         /* skip dot if not given */
+                    } else {
                         av_log(s->avctx, AV_LOG_INFO,
-                               "Layer %s.%s matched.\n", s->layer, ch_gb.buffer);
+                               "Channel doesn't match layer : %s.\n", ch_gb.buffer);
                     }
+                } else {
+                    layer_match = 1;
                 }
 
-                if (!strcmp(ch_gb.buffer, "R") ||
-                    !strcmp(ch_gb.buffer, "X") ||
-                    !strcmp(ch_gb.buffer, "U"))
-                    channel_index = 0;
-                else if (!strcmp(ch_gb.buffer, "G") ||
-                         !strcmp(ch_gb.buffer, "Y") ||
-                         !strcmp(ch_gb.buffer, "V"))
-                    channel_index = 1;
-                else if (!strcmp(ch_gb.buffer, "B") ||
-                         !strcmp(ch_gb.buffer, "Z") ||
-                         !strcmp(ch_gb.buffer, "W"))
-                    channel_index = 2;
-                else if (!strcmp(ch_gb.buffer, "A"))
-                    channel_index = 3;
-                else
-                    av_log(s->avctx, AV_LOG_WARNING,
-                           "Unsupported channel %.256s.\n", ch_gb.buffer);
+                if (layer_match) { /* only search channel if the layer match is valid */
+                    if (!strcmp(ch_gb.buffer, "R") ||
+                        !strcmp(ch_gb.buffer, "X") ||
+                        !strcmp(ch_gb.buffer, "U"))
+                        channel_index = 0;
+                    else if (!strcmp(ch_gb.buffer, "G") ||
+                             !strcmp(ch_gb.buffer, "Y") ||
+                             !strcmp(ch_gb.buffer, "V"))
+                        channel_index = 1;
+                    else if (!strcmp(ch_gb.buffer, "B") ||
+                             !strcmp(ch_gb.buffer, "Z") ||
+                             !strcmp(ch_gb.buffer, "W"))
+                        channel_index = 2;
+                    else if (!strcmp(ch_gb.buffer, "A"))
+                        channel_index = 3;
+                    else
+                        av_log(s->avctx, AV_LOG_WARNING,
+                               "Unsupported channel %.256s.\n", ch_gb.buffer);
+                }
 
                 /* skip until you get a 0 */
                 while (bytestream2_get_bytes_left(&ch_gb) > 0 &&
@@ -1107,15 +1388,17 @@ static int decode_header(EXRContext *s)
                     return AVERROR_PATCHWELCOME;
                 }
 
-                if (channel_index >= 0) {
-                    if (s->pixel_type != EXR_UNKNOWN &&
-                        s->pixel_type != current_pixel_type) {
-                        av_log(s->avctx, AV_LOG_ERROR,
-                               "RGB channels not of the same depth.\n");
-                        return AVERROR_INVALIDDATA;
+                if (channel_index >= 0) { /* channel_index can be negative here: test it before it indexes channel_offsets[] */
+                    if (s->channel_offsets[channel_index] == -1) { /* channel has not been previously assigned */
+                        if (s->pixel_type != EXR_UNKNOWN &&
+                            s->pixel_type != current_pixel_type) {
+                            av_log(s->avctx, AV_LOG_ERROR,
+                                   "RGB channels not of the same depth.\n");
+                            return AVERROR_INVALIDDATA;
+                        }
+                        s->pixel_type                     = current_pixel_type;
+                        s->channel_offsets[channel_index] = s->current_channel_offset;
+                    }
-                    s->pixel_type                     = current_pixel_type;
-                    s->channel_offsets[channel_index] = current_channel_offset;
                 }
 
                 s->channels = av_realloc(s->channels,
@@ -1127,7 +1410,7 @@ static int decode_header(EXRContext *s)
                 channel->xsub       = xsub;
                 channel->ysub       = ysub;
 
-                current_channel_offset += 1 << current_pixel_type;
+                s->current_channel_offset += 1 << current_pixel_type;
             }
 
             /* Check if all channels are set with an offset or if the channels
@@ -1189,8 +1472,7 @@ static int decode_header(EXRContext *s)
             if (!var_size)
                 return AVERROR_INVALIDDATA;
 
-            ff_set_sar(s->avctx,
-                       av_d2q(av_int2float(bytestream2_get_le32(&s->gb)), 255));
+            sar = bytestream2_get_le32(&s->gb);
 
             continue;
         } else if ((var_size = check_header_variable(s, "compression",
@@ -1205,6 +1487,34 @@ static int decode_header(EXRContext *s)
                        "Found more than one compression attribute.\n");
 
             continue;
+        } else if ((var_size = check_header_variable(s, "tiles",
+                                                     "tiledesc", 22)) >= 0) {
+            char tileLevel;
+
+            if (!s->is_tile)
+                av_log(s->avctx, AV_LOG_WARNING,
+                       "Found tile attribute and scanline flags. Exr will be interpreted as scanline.\n");
+
+            s->tile_attr.xSize = bytestream2_get_le32(&s->gb);
+            s->tile_attr.ySize = bytestream2_get_le32(&s->gb);
+
+            tileLevel = bytestream2_get_byte(&s->gb);
+            s->tile_attr.level_mode = tileLevel & 0x0f;
+            s->tile_attr.level_round = (tileLevel >> 4) & 0x0f;
+
+            if (s->tile_attr.level_mode >= EXR_TILE_LEVEL_UNKNOWN){
+                avpriv_report_missing_feature(s->avctx, "Tile level mode %d",
+                                              s->tile_attr.level_mode);
+                return AVERROR_PATCHWELCOME;
+            }
+
+            if (s->tile_attr.level_round >= EXR_TILE_ROUND_UNKNOWN) {
+                avpriv_report_missing_feature(s->avctx, "Tile level round %d",
+                                              s->tile_attr.level_round);
+                return AVERROR_PATCHWELCOME;
+            }
+
+            continue;
         }
 
         // Check if there are enough bytes for a header
@@ -1221,11 +1531,19 @@ static int decode_header(EXRContext *s)
         bytestream2_skip(&s->gb, bytestream2_get_le32(&s->gb));
     }
 
+    ff_set_sar(s->avctx, av_d2q(av_int2float(sar), 255));
+
     if (s->compression == EXR_UNKN) {
         av_log(s->avctx, AV_LOG_ERROR, "Missing compression attribute.\n");
         return AVERROR_INVALIDDATA;
     }
-    s->scan_line_size = s->xdelta * current_channel_offset;
+
+    if (s->is_tile) {
+        if (s->tile_attr.xSize < 1 || s->tile_attr.ySize < 1) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid tile attribute.\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
 
     if (bytestream2_get_bytes_left(&s->gb) <= 0) {
         av_log(s->avctx, AV_LOG_ERROR, "Incomplete frame.\n");
@@ -1247,7 +1565,7 @@ static int decode_frame(AVCodecContext *avctx, void *data,
 
     int y, ret;
     int out_line_size;
-    int scan_line_blocks;
+    int nb_blocks; /* number of scanline blocks or number of tiles */
 
     bytestream2_init(&s->gb, avpkt->data, avpkt->size);
 
@@ -1270,6 +1588,9 @@ static int decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR_INVALIDDATA;
     }
 
+    if (s->apply_trc_type != AVCOL_TRC_UNSPECIFIED)
+        avctx->color_trc = s->apply_trc_type;
+
     switch (s->compression) {
     case EXR_RAW:
     case EXR_RLE:
@@ -1281,6 +1602,8 @@ static int decode_frame(AVCodecContext *avctx, void *data,
         s->scan_lines_per_block = 16;
         break;
     case EXR_PIZ:
+    case EXR_B44:
+    case EXR_B44A:
         s->scan_lines_per_block = 32;
         break;
     default:
@@ -1306,13 +1629,19 @@ static int decode_frame(AVCodecContext *avctx, void *data,
     if (!s->desc)
         return AVERROR_INVALIDDATA;
     out_line_size    = avctx->width * 2 * s->desc->nb_components;
-    scan_line_blocks = (s->ydelta + s->scan_lines_per_block - 1) /
-                       s->scan_lines_per_block;
+
+    if (s->is_tile) {
+        nb_blocks = ((s->xdelta + s->tile_attr.xSize - 1) / s->tile_attr.xSize) *
+        ((s->ydelta + s->tile_attr.ySize - 1) / s->tile_attr.ySize);
+    } else { /* scanline */
+        nb_blocks = (s->ydelta + s->scan_lines_per_block - 1) /
+        s->scan_lines_per_block;
+    }
 
     if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
         return ret;
 
-    if (bytestream2_get_bytes_left(&s->gb) < scan_line_blocks * 8)
+    if (bytestream2_get_bytes_left(&s->gb) < nb_blocks * 8)
         return AVERROR_INVALIDDATA;
 
     // save pointer we are going to use in decode_block
@@ -1327,7 +1656,8 @@ static int decode_frame(AVCodecContext *avctx, void *data,
     }
 
     s->picture = picture;
-    avctx->execute2(avctx, decode_block, s->thread_data, NULL, scan_line_blocks);
+
+    avctx->execute2(avctx, decode_block, s->thread_data, NULL, nb_blocks);
 
     // Zero out the end if ymax+1 is not h
     for (y = s->ymax + 1; y < avctx->height; y++) {
@@ -1347,36 +1677,31 @@ static av_cold int decode_init(AVCodecContext *avctx)
     uint32_t i;
     union av_intfloat32 t;
     float one_gamma = 1.0f / s->gamma;
+    avpriv_trc_function trc_func = NULL;
 
     s->avctx              = avctx;
-    s->xmin               = ~0;
-    s->xmax               = ~0;
-    s->ymin               = ~0;
-    s->ymax               = ~0;
-    s->xdelta             = ~0;
-    s->ydelta             = ~0;
-    s->channel_offsets[0] = -1;
-    s->channel_offsets[1] = -1;
-    s->channel_offsets[2] = -1;
-    s->channel_offsets[3] = -1;
-    s->pixel_type         = EXR_UNKNOWN;
-    s->compression        = EXR_UNKN;
-    s->nb_channels        = 0;
-    s->w                  = 0;
-    s->h                  = 0;
 
-    if (one_gamma > 0.9999f && one_gamma < 1.0001f) {
-        for (i = 0; i < 65536; ++i)
-            s->gamma_table[i] = exr_halflt2uint(i);
-    } else {
+    trc_func = avpriv_get_trc_function_from_trc(s->apply_trc_type);
+    if (trc_func) {
         for (i = 0; i < 65536; ++i) {
             t = exr_half2float(i);
-            /* If negative value we reuse half value */
-            if (t.f <= 0.0f) {
+            t.f = trc_func(t.f);
+            s->gamma_table[i] = exr_flt2uint(t.i);
+        }
+    } else {
+        if (one_gamma > 0.9999f && one_gamma < 1.0001f) {
+            for (i = 0; i < 65536; ++i)
                 s->gamma_table[i] = exr_halflt2uint(i);
-            } else {
-                t.f = powf(t.f, one_gamma);
-                s->gamma_table[i] = exr_flt2uint(t.i);
+        } else {
+            for (i = 0; i < 65536; ++i) {
+                t = exr_half2float(i);
+                /* If negative value we reuse half value */
+                if (t.f <= 0.0f) {
+                    s->gamma_table[i] = exr_halflt2uint(i);
+                } else {
+                    t.f = powf(t.f, one_gamma);
+                    s->gamma_table[i] = exr_flt2uint(t.i);
+                }
             }
         }
     }
@@ -1389,6 +1714,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
     return 0;
 }
 
+#if HAVE_THREADS
 static int decode_init_thread_copy(AVCodecContext *avctx)
 {    EXRContext *s = avctx->priv_data;
 
@@ -1399,6 +1725,7 @@ static int decode_init_thread_copy(AVCodecContext *avctx)
 
     return 0;
 }
+#endif
 
 static av_cold int decode_end(AVCodecContext *avctx)
 {
@@ -1425,6 +1752,43 @@ static const AVOption options[] = {
         AV_OPT_TYPE_STRING, { .str = "" }, 0, 0, VD },
     { "gamma", "Set the float gamma value when decoding", OFFSET(gamma),
         AV_OPT_TYPE_FLOAT, { .dbl = 1.0f }, 0.001, FLT_MAX, VD },
+
+    // XXX: Note the abuse of the enum using AVCOL_TRC_UNSPECIFIED to subsume the existing gamma option
+    { "apply_trc", "color transfer characteristics to apply to EXR linear input", OFFSET(apply_trc_type),
+        AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_UNSPECIFIED }, 1, AVCOL_TRC_NB-1, VD, "apply_trc_type"},
+    { "bt709",        "BT.709",           0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT709 },        INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "gamma",        "gamma",            0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_UNSPECIFIED },  INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "gamma22",      "BT.470 M",         0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_GAMMA22 },      INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "gamma28",      "BT.470 BG",        0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_GAMMA28 },      INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "smpte170m",    "SMPTE 170 M",      0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTE170M },    INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "smpte240m",    "SMPTE 240 M",      0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTE240M },    INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "linear",       "Linear",           0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_LINEAR },       INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "log",          "Log",              0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_LOG },          INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "log_sqrt",     "Log square root",  0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_LOG_SQRT },     INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "iec61966_2_4", "IEC 61966-2-4",    0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_IEC61966_2_4 }, INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "bt1361",       "BT.1361",          0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT1361_ECG },   INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "iec61966_2_1", "IEC 61966-2-1",    0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_IEC61966_2_1 }, INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "bt2020_10bit", "BT.2020 - 10 bit", 0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_10 },    INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "bt2020_12bit", "BT.2020 - 12 bit", 0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_12 },    INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "smpte2084",    "SMPTE ST 2084",    0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTEST2084 },  INT_MIN, INT_MAX, VD, "apply_trc_type"},
+    { "smpte428_1",   "SMPTE ST 428-1",   0,
+        AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTEST428_1 }, INT_MIN, INT_MAX, VD, "apply_trc_type"},
+
     { NULL },
 };
 
diff --git a/libavcodec/faandct.c b/libavcodec/faandct.c
index 4053d69..b683072 100644
--- a/libavcodec/faandct.c
+++ b/libavcodec/faandct.c
@@ -29,25 +29,24 @@
 #include "libavutil/internal.h"
 #include "libavutil/libm.h"
 
-#define FLOAT float
+typedef float FLOAT;
 
-//numbers generated by simple c code (not as accurate as they could be)
-/*
-for(i=0; i<8; i++){
-    printf("#define B%d %1.20llf\n", i, (long double)1.0/(cosl(i*acosl(-1.0)/(long double)16.0)*sqrtl(2)));
-}
+/* numbers generated by arbitrary precision arithmetic followed by truncation
+to 36 fractional digits (enough for a 128-bit IEEE quad, see /usr/include/math.h
+for this approach). Unfortunately, long double is not reliably supported everywhere,
+e.g. ppc has issues.
+TODO: add L suffixes when ppc and toolchains sort out their stuff.
 */
-#define B0 1.00000000000000000000
-#define B1 0.72095982200694791383 // (cos(pi*1/16)sqrt(2))^-1
-#define B2 0.76536686473017954350 // (cos(pi*2/16)sqrt(2))^-1
-#define B3 0.85043009476725644878 // (cos(pi*3/16)sqrt(2))^-1
-#define B4 1.00000000000000000000 // (cos(pi*4/16)sqrt(2))^-1
-#define B5 1.27275858057283393842 // (cos(pi*5/16)sqrt(2))^-1
-#define B6 1.84775906502257351242 // (cos(pi*6/16)sqrt(2))^-1
-#define B7 3.62450978541155137218 // (cos(pi*7/16)sqrt(2))^-1
-
-
-#define A1 0.70710678118654752438 // cos(pi*4/16)
+#define B0 1.000000000000000000000000000000000000
+#define B1 0.720959822006947913789091890943021267 // (cos(pi*1/16)sqrt(2))^-1
+#define B2 0.765366864730179543456919968060797734 // (cos(pi*2/16)sqrt(2))^-1
+#define B3 0.850430094767256448766702844371412325 // (cos(pi*3/16)sqrt(2))^-1
+#define B4 1.000000000000000000000000000000000000 // (cos(pi*4/16)sqrt(2))^-1
+#define B5 1.272758580572833938461007018281767032 // (cos(pi*5/16)sqrt(2))^-1
+#define B6 1.847759065022573512256366378793576574 // (cos(pi*6/16)sqrt(2))^-1
+#define B7 3.624509785411551372409941227504289587 // (cos(pi*7/16)sqrt(2))^-1
+
+#define A1 M_SQRT1_2              // cos(pi*4/16)
 #define A2 0.54119610014619698435 // cos(pi*6/16)sqrt(2)
 #define A5 0.38268343236508977170 // cos(pi*6/16)
 #define A4 1.30656296487637652774 // cos(pi*2/16)sqrt(2)
diff --git a/libavcodec/faandct.h b/libavcodec/faandct.h
index 59d5ff3..c5ef96d 100644
--- a/libavcodec/faandct.h
+++ b/libavcodec/faandct.h
@@ -2,20 +2,20 @@
  * Floating point AAN DCT
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/faanidct.c b/libavcodec/faanidct.c
index 2e9ce9c..bbaaa3f 100644
--- a/libavcodec/faanidct.c
+++ b/libavcodec/faanidct.c
@@ -2,27 +2,27 @@
  * Floating point AAN IDCT
  * Copyright (c) 2008 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include "faanidct.h"
 #include "libavutil/common.h"
 
 /* To allow switching to double. */
-#define FLOAT float
+typedef float FLOAT;
 
 #define B0 1.0000000000000000000000
 #define B1 1.3870398453221474618216 // cos(pi*1/16)sqrt(2)
diff --git a/libavcodec/faanidct.h b/libavcodec/faanidct.h
index 0c01520..4cd2c78 100644
--- a/libavcodec/faanidct.h
+++ b/libavcodec/faanidct.h
@@ -2,20 +2,20 @@
  * Floating point AAN IDCT
  * Copyright (c) 2008 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/faxcompr.c b/libavcodec/faxcompr.c
index 4cbda3f..2a1d2bc 100644
--- a/libavcodec/faxcompr.c
+++ b/libavcodec/faxcompr.c
@@ -2,20 +2,20 @@
  * CCITT Fax Group 3 and 4 decompression
  * Copyright (c) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -122,6 +122,81 @@ av_cold void ff_ccitt_unpack_init(void)
     initialized = 1;
 }
 
+static int decode_uncompressed(AVCodecContext *avctx, GetBitContext *gb,
+                               unsigned int *pix_left, int **runs,
+                               const int *runend, int *mode)
+{   /* Parse uncompressed-mode codewords from gb and append the resulting run lengths to *runs. */
+    int eob = 0;       /* set once a codeword with more than 5 leading zero bits has been read */
+    int newmode;       /* bit read right after that codeword; compared against *mode at the end */
+    int saved_run = 0; /* length of the run currently being accumulated */
+    /* Returns 0 on success, 1 when the final run exactly consumes *pix_left, a negative AVERROR on error. */
+    do {
+        int cwi, k;
+        int cw = 0;   /* run length accumulated over consecutive codewords */
+        int codes[2]; /* codes[k] is the length to add to a run in mode k */
+        do {
+            cwi = show_bits(gb, 11);
+            if (!cwi) { /* 11 zero bits in a row: no codeword terminator in the window */
+                av_log(avctx, AV_LOG_ERROR, "Invalid uncompressed codeword\n");
+                return AVERROR_INVALIDDATA;
+            }
+            cwi = 10 - av_log2(cwi); /* number of leading zero bits in the 11-bit window */
+            skip_bits(gb, cwi + 1);  /* consume the zero bits and the 1 bit that ends them */
+            if (cwi > 5) {           /* more than 5 zeros: last codeword of this segment */
+                newmode = get_bits1(gb);
+                eob = 1;
+                cwi -= 6;            /* the zeros beyond the first 6 still count towards the run */
+            }
+            cw += cwi;
+        } while(cwi == 5); /* exactly 5 zeros: keep extending cw with the next codeword */
+
+        codes[0] = cw;
+        codes[1] = !eob; /* one extra unit in mode 1, except after the final codeword */
+
+        for (k = 0; k < 2; k++) {
+            if (codes[k]) {
+                if (*mode == !k) { /* current mode differs from k: flush the pending run and switch */
+                    *(*runs)++ = saved_run; /* store first, then verify the advanced pointer */
+                    if (*runs >= runend) {
+                        av_log(avctx, AV_LOG_ERROR, "uncompressed run overrun\n");
+                        return AVERROR_INVALIDDATA;
+                    }
+                    if (*pix_left <= saved_run) { /* a run inside the segment must leave pixels over */
+                        av_log(avctx, AV_LOG_ERROR, "uncompressed run went out of bounds\n");
+                        return AVERROR_INVALIDDATA;
+                    }
+                    *pix_left -= saved_run;
+                    saved_run = 0;
+                    *mode = !*mode;
+                }
+                saved_run += codes[k];
+            }
+        }
+    } while (!eob);
+    *(*runs)++ = saved_run; /* flush the last run of the segment */
+    if (*runs >= runend) {
+        av_log(avctx, AV_LOG_ERROR, "uncompressed run overrun\n");
+        return AVERROR_INVALIDDATA;
+    }
+    if (*pix_left <= saved_run) {
+        if (*pix_left == saved_run) /* the run consumes exactly the remaining pixels */
+            return 1;
+        av_log(avctx, AV_LOG_ERROR, "uncompressed run went out of boundsE\n");
+        return AVERROR_INVALIDDATA;
+    }
+    *pix_left -= saved_run;
+    saved_run = 0;
+    *mode = !*mode;
+    if (newmode != *mode) { //FIXME CHECK
+        *(*runs)++ = 0; /* zero-length run so the following run starts in newmode */
+        if (*runs >= runend) {
+            av_log(avctx, AV_LOG_ERROR, "uncompressed run overrun\n");
+            return AVERROR_INVALIDDATA;
+        }
+        *mode = newmode;
+    }
+    return 0;
+}
 
 static int decode_group3_1d_line(AVCodecContext *avctx, GetBitContext *gb,
                                  unsigned int pix_left, int *runs,
@@ -149,8 +224,18 @@ static int decode_group3_1d_line(AVCodecContext *avctx, GetBitContext *gb,
             run       = 0;
             mode      = !mode;
         } else if ((int)t == -1) {
-            av_log(avctx, AV_LOG_ERROR, "Incorrect code\n");
-            return AVERROR_INVALIDDATA;
+            if (show_bits(gb, 12) == 15) {
+                int ret;
+                skip_bits(gb, 12);
+                ret = decode_uncompressed(avctx, gb, &pix_left, &runs, runend, &mode);
+                if (ret < 0) {
+                    return ret;
+                } else if (ret)
+                    break;
+            } else {
+                av_log(avctx, AV_LOG_ERROR, "Incorrect code\n");
+                return AVERROR_INVALIDDATA;
+            }
         }
     }
     *runs++ = 0;
@@ -165,8 +250,6 @@ static int decode_group3_2d_line(AVCodecContext *avctx, GetBitContext *gb,
     int run_off       = *ref++;
     unsigned int offs = 0, run = 0;
 
-    runend--; // for the last written 0
-
     while (offs < width) {
         int cmode = get_vlc2(gb, ccitt_group3_2d_vlc.table, 9, 1);
         if (cmode == -1) {
@@ -174,10 +257,12 @@ static int decode_group3_2d_line(AVCodecContext *avctx, GetBitContext *gb,
             return AVERROR_INVALIDDATA;
         }
         if (!cmode) { //pass mode
-            run_off += *ref++;
+            if (run_off < width)
+                run_off += *ref++;
             run      = run_off - offs;
             offs     = run_off;
-            run_off += *ref++;
+            if (run_off < width)
+                run_off += *ref++;
             if (offs > width) {
                 av_log(avctx, AV_LOG_ERROR, "Run went out of bounds\n");
                 return AVERROR_INVALIDDATA;
@@ -211,8 +296,25 @@ static int decode_group3_2d_line(AVCodecContext *avctx, GetBitContext *gb,
                 mode = !mode;
             }
         } else if (cmode == 9 || cmode == 10) {
-            avpriv_report_missing_feature(avctx, "Special modes support");
-            return AVERROR_PATCHWELCOME;
+            int xxx = get_bits(gb, 3);
+            if (cmode == 9 && xxx == 7) {
+                int ret;
+                int pix_left = width - offs;
+
+                if (saved_run) {
+                    av_log(avctx, AV_LOG_ERROR, "saved run %d on entering uncompressed mode\n", saved_run);
+                    return AVERROR_INVALIDDATA;
+                }
+                ret = decode_uncompressed(avctx, gb, &pix_left, &runs, runend, &mode);
+                offs = width - pix_left;
+                if (ret < 0) {
+                    return ret;
+                } else if (ret)
+                    break;
+            } else {
+                avpriv_report_missing_feature(avctx, "Special mode %d xxx=%d support", cmode, xxx);
+                return AVERROR_PATCHWELCOME;
+            }
         } else { //vertical mode
             run      = run_off - offs + (cmode - 5);
             run_off -= *--ref;
@@ -230,13 +332,19 @@ static int decode_group3_2d_line(AVCodecContext *avctx, GetBitContext *gb,
             mode      = !mode;
         }
         //sync line pointers
-        while (run_off <= offs) {
+        while (offs < width && run_off <= offs) {
             run_off += *ref++;
             run_off += *ref++;
         }
     }
     *runs++ = saved_run;
-    *runs++ = 0;
+    if (saved_run) {
+        if (runs >= runend) {
+            av_log(avctx, AV_LOG_ERROR, "Run overrun\n");
+            return -1;
+        }
+        *runs++ = 0;
+    }
     return 0;
 }
 
@@ -245,7 +353,7 @@ static void put_line(uint8_t *dst, int size, int width, const int *runs)
     PutBitContext pb;
     int run, mode = ~0, pix_left = width, run_idx = 0;
 
-    init_put_bits(&pb, dst, size * 8);
+    init_put_bits(&pb, dst, size);
     while (pix_left > 0) {
         run       = runs[run_idx++];
         mode      = ~mode;
@@ -279,9 +387,10 @@ int ff_ccitt_unpack(AVCodecContext *avctx, const uint8_t *src, int srcsize,
     int *runs, *ref = NULL, *runend;
     int ret;
     int runsize = avctx->width + 2;
+    int has_eol;
 
-    runs = av_malloc(runsize * sizeof(runs[0]));
-    ref  = av_malloc(runsize * sizeof(ref[0]));
+    runs = av_malloc_array(runsize, sizeof(runs[0]));
+    ref  = av_malloc_array(runsize, sizeof(ref[0]));
     if (!runs || !ref) {
         ret = AVERROR(ENOMEM);
         goto fail;
@@ -289,7 +398,10 @@ int ff_ccitt_unpack(AVCodecContext *avctx, const uint8_t *src, int srcsize,
     ref[0] = avctx->width;
     ref[1] = 0;
     ref[2] = 0;
-    init_get_bits(&gb, src, srcsize * 8);
+    if ((ret = init_get_bits8(&gb, src, srcsize)) < 0)
+        goto fail;
+    has_eol = show_bits(&gb, 12) == 1 || show_bits(&gb, 16) == 1;
+
     for (j = 0; j < height; j++) {
         runend = runs + runsize;
         if (compr == TIFF_G4) {
@@ -300,6 +412,7 @@ int ff_ccitt_unpack(AVCodecContext *avctx, const uint8_t *src, int srcsize,
         } else {
             int g3d1 = (compr == TIFF_G3) && !(opts & 1);
             if (compr != TIFF_CCITT_RLE &&
+                has_eol &&
                 find_group3_syncmarker(&gb, srcsize * 8) < 0)
                 break;
             if (compr == TIFF_CCITT_RLE || g3d1 || get_bits1(&gb))
diff --git a/libavcodec/faxcompr.h b/libavcodec/faxcompr.h
index 0a8b64d..aa29a7b 100644
--- a/libavcodec/faxcompr.h
+++ b/libavcodec/faxcompr.h
@@ -2,20 +2,20 @@
  * CCITT Fax Group 3 and 4 decompression
  * Copyright (c) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/fdctdsp.c b/libavcodec/fdctdsp.c
index f299eae..b9c2c86 100644
--- a/libavcodec/fdctdsp.c
+++ b/libavcodec/fdctdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@ av_cold void ff_fdctdsp_init(FDCTDSPContext *c, AVCodecContext *avctx)
 {
     const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
 
-    if (avctx->bits_per_raw_sample == 10) {
+    if (avctx->bits_per_raw_sample == 10 || avctx->bits_per_raw_sample == 9) {
         c->fdct    = ff_jpeg_fdct_islow_10;
         c->fdct248 = ff_fdct248_islow_10;
     } else if (avctx->dct_algo == FF_DCT_FASTINT) {
diff --git a/libavcodec/fdctdsp.h b/libavcodec/fdctdsp.h
index 944dc6d..3e1f683 100644
--- a/libavcodec/fdctdsp.h
+++ b/libavcodec/fdctdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ffjni.c b/libavcodec/ffjni.c
new file mode 100644
index 0000000..82ee5d3
--- /dev/null
+++ b/libavcodec/ffjni.c
@@ -0,0 +1,402 @@
+/*
+ * JNI utility functions
+ *
+ * Copyright (c) 2015-2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <jni.h>
+#include <pthread.h>
+#include <stdlib.h>
+
+#include "libavutil/bprint.h"
+#include "libavutil/log.h"
+
+#include "config.h"
+#include "jni.h"
+#include "ffjni.h"
+
+static JavaVM *java_vm = NULL;
+static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+
+JNIEnv *ff_jni_attach_env(int *attached, void *log_ctx)
+{
+    int ret = 0;
+    JNIEnv *env = NULL;
+
+    *attached = 0; /* only set to 1 below when this call attaches the thread */
+    /* Fetch the VM through av_jni_get_java_vm() once and cache it, under the lock. */
+    pthread_mutex_lock(&lock);
+    if (java_vm == NULL) {
+        java_vm = av_jni_get_java_vm(log_ctx);
+    }
+    pthread_mutex_unlock(&lock);
+
+    if (!java_vm) {
+        av_log(log_ctx, AV_LOG_ERROR, "No Java virtual machine has been registered\n");
+        return NULL;
+    }
+    /* Ask the VM for the environment of the calling thread. */
+    ret = (*java_vm)->GetEnv(java_vm, (void **)&env, JNI_VERSION_1_6);
+    switch(ret) {
+    case JNI_EDETACHED: /* thread is not attached yet: attach it now */
+        if ((*java_vm)->AttachCurrentThread(java_vm, &env, NULL) != 0) {
+            av_log(log_ctx, AV_LOG_ERROR, "Failed to attach the JNI environment to the current thread\n");
+            env = NULL;
+        } else {
+            *attached = 1; /* tells the caller that this call did the attach */
+        }
+        break;
+    case JNI_OK: /* env already valid for this thread, nothing to do */
+        break;
+    case JNI_EVERSION:
+        av_log(log_ctx, AV_LOG_ERROR, "The specified JNI version is not supported\n");
+        break;
+    default: /* any other GetEnv() status is only reported */
+        av_log(log_ctx, AV_LOG_ERROR, "Failed to get the JNI environment attached to this thread\n");
+        break;
+    }
+
+    return env;
+}
+
+int ff_jni_detach_env(void *log_ctx)
+{
+    if (java_vm == NULL) { /* no VM has been cached by ff_jni_attach_env() */
+        av_log(log_ctx, AV_LOG_ERROR, "No Java virtual machine has been registered\n");
+        return AVERROR(EINVAL);
+    }
+    /* NOTE(review): java_vm is read here without taking 'lock'. The JNI status is returned as is. */
+    return (*java_vm)->DetachCurrentThread(java_vm);
+}
+
+char *ff_jni_jstring_to_utf_chars(JNIEnv *env, jstring string, void *log_ctx)
+{
+    char *ret = NULL; /* av_strdup() copy handed to the caller, NULL on any failure */
+    const char *utf_chars = NULL;
+
+    jboolean copy = 0; /* filled in by GetStringUTFChars(), not used afterwards */
+
+    if (!string) {
+        return NULL;
+    }
+    /* Borrow the characters from the VM; they are released again below. */
+    utf_chars = (*env)->GetStringUTFChars(env, string, &copy);
+    if ((*env)->ExceptionCheck(env)) {
+        (*env)->ExceptionClear(env);
+        av_log(log_ctx, AV_LOG_ERROR, "String.getStringUTFChars() threw an exception\n");
+        return NULL;
+    }
+    /* Duplicate so that the result stays valid after the VM buffer is released. */
+    ret = av_strdup(utf_chars);
+
+    (*env)->ReleaseStringUTFChars(env, string, utf_chars);
+    if ((*env)->ExceptionCheck(env)) {
+        (*env)->ExceptionClear(env);
+        av_log(log_ctx, AV_LOG_ERROR, "String.releaseStringUTFChars() threw an exception\n");
+        av_freep(&ret); /* do not leak the copy; ret is NULL again, so NULL is returned */
+    }
+
+    return ret;
+}
+
+jstring ff_jni_utf_chars_to_jstring(JNIEnv *env, const char *utf_chars, void *log_ctx)
+{
+    jstring ret;
+    /* Build the Java string; a pending exception afterwards is cleared and reported as NULL. */
+    ret = (*env)->NewStringUTF(env, utf_chars);
+    if ((*env)->ExceptionCheck(env)) {
+        (*env)->ExceptionClear(env);
+        av_log(log_ctx, AV_LOG_ERROR, "NewStringUTF() threw an exception\n");
+        return NULL;
+    }
+
+    return ret;
+}
+
+int ff_jni_exception_get_summary(JNIEnv *env, jthrowable exception, char **error, void *log_ctx)
+{
+    int ret = 0;
+
+    AVBPrint bp; /* collects the summary text */
+
+    char *name = NULL;    /* copy of the exception class name, freed at done: */
+    char *message = NULL; /* copy of the getMessage() result, freed at done: */
+
+    jclass class_class = NULL; /* class of exception_class, used to look up getName() */
+    jmethodID get_name_id = NULL;
+
+    jclass exception_class = NULL;
+    jmethodID get_message_id = NULL;
+
+    jstring string = NULL; /* local reference reused for both Java strings */
+
+    av_bprint_init(&bp, 0, AV_BPRINT_SIZE_AUTOMATIC);
+    /* Every JNI step below clears a pending exception, logs, and bails out through done:. */
+    exception_class = (*env)->GetObjectClass(env, exception);
+    if ((*env)->ExceptionCheck(env)) {
+        (*env)->ExceptionClear(env);
+        av_log(log_ctx, AV_LOG_ERROR, "Could not find Throwable class\n");
+        ret = AVERROR_EXTERNAL;
+        goto done;
+    }
+
+    class_class = (*env)->GetObjectClass(env, exception_class);
+    if ((*env)->ExceptionCheck(env)) {
+        (*env)->ExceptionClear(env);
+        av_log(log_ctx, AV_LOG_ERROR, "Could not find Throwable class's class\n");
+        ret = AVERROR_EXTERNAL;
+        goto done;
+    }
+
+    get_name_id = (*env)->GetMethodID(env, class_class, "getName", "()Ljava/lang/String;");
+    if ((*env)->ExceptionCheck(env)) {
+        (*env)->ExceptionClear(env);
+        av_log(log_ctx, AV_LOG_ERROR, "Could not find method Class.getName()\n");
+        ret = AVERROR_EXTERNAL;
+        goto done;
+    }
+    /* getName() is invoked on the exception's class object to obtain the class name. */
+    string = (*env)->CallObjectMethod(env, exception_class, get_name_id);
+    if ((*env)->ExceptionCheck(env)) {
+        (*env)->ExceptionClear(env);
+        av_log(log_ctx, AV_LOG_ERROR, "Class.getName() threw an exception\n");
+        ret = AVERROR_EXTERNAL;
+        goto done;
+    }
+
+    if (string) {
+        name = ff_jni_jstring_to_utf_chars(env, string, log_ctx);
+        (*env)->DeleteLocalRef(env, string);
+        string = NULL; /* so that done: does not delete the reference a second time */
+    }
+
+    get_message_id = (*env)->GetMethodID(env, exception_class, "getMessage", "()Ljava/lang/String;");
+    if ((*env)->ExceptionCheck(env)) {
+        (*env)->ExceptionClear(env);
+        av_log(log_ctx, AV_LOG_ERROR, "Could not find method java/lang/Throwable.getMessage()\n");
+        ret = AVERROR_EXTERNAL;
+        goto done;
+    }
+    /* getMessage() is invoked on the exception object itself. */
+    string = (*env)->CallObjectMethod(env, exception, get_message_id);
+    if ((*env)->ExceptionCheck(env)) {
+        (*env)->ExceptionClear(env);
+        av_log(log_ctx, AV_LOG_ERROR, "Throwable.getMessage() threw an exception\n");
+        ret = AVERROR_EXTERNAL;
+        goto done;
+    }
+
+    if (string) {
+        message = ff_jni_jstring_to_utf_chars(env, string, log_ctx);
+        (*env)->DeleteLocalRef(env, string);
+        string = NULL; /* so that done: does not delete the reference a second time */
+    }
+    /* Pick the wording according to which of name and message could be obtained. */
+    if (name && message) {
+        av_bprintf(&bp, "%s: %s", name, message);
+    } else if (name && !message) {
+        av_bprintf(&bp, "%s occurred", name);
+    } else if (!name && message) {
+        av_bprintf(&bp, "Exception: %s", message);
+    } else {
+        av_log(log_ctx, AV_LOG_WARNING, "Could not retreive exception name and message\n");
+        av_bprintf(&bp, "Exception occurred");
+    }
+    /* Only this success path writes *error; the goto done paths leave it untouched. */
+    ret = av_bprint_finalize(&bp, error);
+done:
+
+    av_free(name);
+    av_free(message);
+    /* Drop whatever local references are still held. */
+    if (class_class) {
+        (*env)->DeleteLocalRef(env, class_class);
+    }
+
+    if (exception_class) {
+        (*env)->DeleteLocalRef(env, exception_class);
+    }
+
+    if (string) {
+        (*env)->DeleteLocalRef(env, string);
+    }
+
+    return ret;
+}
+
+int ff_jni_exception_check(JNIEnv *env, int log, void *log_ctx)
+{
+    int ret;
+
+    jthrowable exception;
+
+    char *message = NULL; /* summary string filled in by ff_jni_exception_get_summary() */
+
+    if (!(*(env))->ExceptionCheck((env))) {
+        return 0; /* nothing pending */
+    }
+
+    if (!log) { /* silent mode: clear the exception and report the failure */
+        (*(env))->ExceptionClear((env));
+        return -1;
+    }
+    /* Keep a reference to the exception, then clear it before making the JNI calls below. */
+    exception = (*env)->ExceptionOccurred(env);
+    (*(env))->ExceptionClear((env));
+
+    if ((ret = ff_jni_exception_get_summary(env, exception, &message, log_ctx)) < 0) {
+        (*env)->DeleteLocalRef(env, exception);
+        return ret;
+    }
+
+    (*env)->DeleteLocalRef(env, exception);
+
+    av_log(log_ctx, AV_LOG_ERROR, "%s\n", message);
+    av_free(message);
+    /* An exception was pending, so the result is negative even though logging succeeded. */
+    return -1;
+}
+
+int ff_jni_init_jfields(JNIEnv *env, void *jfields, const struct FFJniField *jfields_mapping, int global, void *log_ctx)
+{
+    int i, ret = 0;
+    jclass last_clazz = NULL; /* class that the following field/method entries belong to */
+    /* Walk the table until the entry whose name is NULL; each result is stored at jfields + offset. */
+    for (i = 0; jfields_mapping[i].name; i++) {
+        int mandatory = jfields_mapping[i].mandatory;
+        enum FFJniFieldType type = jfields_mapping[i].type;
+
+        if (type == FF_JNI_CLASS) {
+            jclass clazz;
+            last_clazz = NULL; /* forget the previous class before looking up the new one */
+            clazz = (*env)->FindClass(env, jfields_mapping[i].name);
+            if ((ret = ff_jni_exception_check(env, mandatory, log_ctx)) < 0 && mandatory) {
+                goto done;
+            }
+            ret = 0; /* a missing optional class must not turn into a failure of the whole call */
+            last_clazz = *(jclass*)((uint8_t*)jfields + jfields_mapping[i].offset) =
+                    global ? (*env)->NewGlobalRef(env, clazz) : clazz;
+            if (global && clazz) {
+                (*env)->DeleteLocalRef(env, clazz); /* only the global reference is kept */
+            }
+        } else {
+            if (!last_clazz) {
+                ret = AVERROR_EXTERNAL;
+                break;
+            }
+
+            switch(type) {
+            case FF_JNI_FIELD: {
+                jfieldID field_id = (*env)->GetFieldID(env, last_clazz, jfields_mapping[i].method, jfields_mapping[i].signature);
+                if ((ret = ff_jni_exception_check(env, mandatory, log_ctx)) < 0 && mandatory) {
+                    goto done;
+                }
+                ret = 0; /* optional entry: a failed lookup is tolerated */
+                *(jfieldID*)((uint8_t*)jfields + jfields_mapping[i].offset) = field_id;
+                break;
+            }
+            case FF_JNI_STATIC_FIELD: {
+                jfieldID field_id = (*env)->GetStaticFieldID(env, last_clazz, jfields_mapping[i].method, jfields_mapping[i].signature);
+                if ((ret = ff_jni_exception_check(env, mandatory, log_ctx)) < 0 && mandatory) {
+                    goto done;
+                }
+                ret = 0; /* optional entry: a failed lookup is tolerated */
+                *(jfieldID*)((uint8_t*)jfields + jfields_mapping[i].offset) = field_id;
+                break;
+            }
+            case FF_JNI_METHOD: {
+                jmethodID method_id = (*env)->GetMethodID(env, last_clazz, jfields_mapping[i].method, jfields_mapping[i].signature);
+                if ((ret = ff_jni_exception_check(env, mandatory, log_ctx)) < 0 && mandatory) {
+                    goto done;
+                }
+                ret = 0; /* optional entry: a failed lookup is tolerated */
+                *(jmethodID*)((uint8_t*)jfields + jfields_mapping[i].offset) = method_id;
+                break;
+            }
+            case FF_JNI_STATIC_METHOD: {
+                jmethodID method_id = (*env)->GetStaticMethodID(env, last_clazz, jfields_mapping[i].method, jfields_mapping[i].signature);
+                if ((ret = ff_jni_exception_check(env, mandatory, log_ctx)) < 0 && mandatory) {
+                    goto done;
+                }
+                ret = 0; /* optional entry: a failed lookup is tolerated */
+                *(jmethodID*)((uint8_t*)jfields + jfields_mapping[i].offset) = method_id;
+                break;
+            }
+            default:
+                av_log(log_ctx, AV_LOG_ERROR, "Unknown JNI field type\n");
+                ret = AVERROR(EINVAL);
+                goto done;
+            }
+        }
+    }
+
+done:
+    if (ret < 0) {
+        /* reset jfields in case of failure so it does not leak references */
+        ff_jni_reset_jfields(env, jfields, jfields_mapping, global, log_ctx);
+    }
+
+    return ret;
+}
+
+int ff_jni_reset_jfields(JNIEnv *env, void *jfields, const struct FFJniField *jfields_mapping, int global, void *log_ctx)
+{
+    int i;
+    /* Visit every table entry (up to the one whose name is NULL) and clear its slot in jfields. */
+    for (i = 0; jfields_mapping[i].name; i++) {
+        enum FFJniFieldType type = jfields_mapping[i].type;
+
+        switch(type) {
+        case FF_JNI_CLASS: {
+            jclass clazz = *(jclass*)((uint8_t*)jfields + jfields_mapping[i].offset);
+            if (!clazz)
+                continue; /* nothing stored for this class, nothing to release */
+            /* Release with the call that matches how the reference was stored. */
+            if (global) {
+                (*env)->DeleteGlobalRef(env, clazz);
+            } else {
+                (*env)->DeleteLocalRef(env, clazz);
+            }
+
+            *(jclass*)((uint8_t*)jfields + jfields_mapping[i].offset) = NULL;
+            break;
+        }
+        case FF_JNI_FIELD: { /* IDs are only set back to NULL, no JNI call is made for them */
+            *(jfieldID*)((uint8_t*)jfields + jfields_mapping[i].offset) = NULL;
+            break;
+        }
+        case FF_JNI_STATIC_FIELD: {
+            *(jfieldID*)((uint8_t*)jfields + jfields_mapping[i].offset) = NULL;
+            break;
+        }
+        case FF_JNI_METHOD: {
+            *(jmethodID*)((uint8_t*)jfields + jfields_mapping[i].offset) = NULL;
+            break;
+        }
+        case FF_JNI_STATIC_METHOD: {
+            *(jmethodID*)((uint8_t*)jfields + jfields_mapping[i].offset) = NULL;
+            break;
+        }
+        default: /* unknown types are only logged; the loop carries on */
+            av_log(log_ctx, AV_LOG_ERROR, "Unknown JNI field type\n");
+        }
+    }
+    /* No failure is reported: the function returns 0 on every path. */
+    return 0;
+}
diff --git a/libavcodec/ffjni.h b/libavcodec/ffjni.h
new file mode 100644
index 0000000..990c7b9
--- /dev/null
+++ b/libavcodec/ffjni.h
@@ -0,0 +1,150 @@
+/*
+ * JNI utility functions
+ *
+ * Copyright (c) 2015-2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_FFJNI_H
+#define AVCODEC_FFJNI_H
+
+#include <jni.h>
+
+/*
+ * Attach a JNI environment to the current thread.
+ *
+ * @param attached pointer to an integer that will be set to 1 if the
+ * environment has been attached to the current thread or 0 if it is
+ * already attached.
+ * @param log_ctx context used for logging, can be NULL
+ * @return the JNI environment on success, NULL otherwise
+ */
+JNIEnv *ff_jni_attach_env(int *attached, void *log_ctx);
+
+/*
+ * Detach the JNI environment from the current thread.
+ *
+ * @param log_ctx context used for logging, can be NULL
+ * @return 0 on success, < 0 otherwise
+ */
+int ff_jni_detach_env(void *log_ctx);
+
+/*
+ * Convert a jstring to its utf characters equivalent.
+ *
+ * @param env JNI environment
+ * @param string Java string to convert
+ * @param log_ctx context used for logging, can be NULL
+ * @return a newly allocated copy of the string's UTF characters on success
+ * (release it with av_free()), NULL otherwise
+ */
+char *ff_jni_jstring_to_utf_chars(JNIEnv *env, jstring string, void *log_ctx);
+
+/*
+ * Convert utf chars to its jstring equivalent.
+ *
+ * @param env JNI environment
+ * @param utf_chars a pointer to an array of unicode characters
+ * @param log_ctx context used for logging, can be NULL
+ * @return a Java string object on success, NULL otherwise
+ */
+jstring ff_jni_utf_chars_to_jstring(JNIEnv *env, const char *utf_chars, void *log_ctx);
+
+/*
+ * Extract the error summary from a jthrowable in the form of "className: errorMessage"
+ *
+ * @param env JNI environment
+ * @param exception exception to get the summary from
+ * @param error address pointing to the error, the value is updated if a
+ * summary can be extracted
+ * @param log_ctx context used for logging, can be NULL
+ * @return 0 on success, < 0 otherwise
+ */
+int ff_jni_exception_get_summary(JNIEnv *env, jthrowable exception, char **error, void *log_ctx);
+
+/*
+ * Check if an exception has occurred, log it using av_log and clear it.
+ *
+ * @param env JNI environment
+ * @param log 0 disables logging of the exception, != 0 enables it
+ * @param log_ctx context used for logging, can be NULL
+ * @return 0 if no exception was pending, < 0 otherwise
+ */
+int ff_jni_exception_check(JNIEnv *env, int log, void *log_ctx);
+
+/*
+ * Jni field type.
+ */
+enum FFJniFieldType {
+
+    FF_JNI_CLASS,         /* class reference, looked up with FindClass() */
+    FF_JNI_FIELD,         /* instance field ID, looked up with GetFieldID() */
+    FF_JNI_STATIC_FIELD,  /* static field ID, looked up with GetStaticFieldID() */
+    FF_JNI_METHOD,        /* instance method ID, looked up with GetMethodID() */
+    FF_JNI_STATIC_METHOD  /* static method ID, looked up with GetStaticMethodID() */
+
+};
+
+/*
+ * Jni field describing a class, a field or a method to be retrieved using
+ * the ff_jni_init_jfields method.
+ */
+struct FFJniField {
+
+    const char *name;         /* class name given to FindClass() for FF_JNI_CLASS entries; NULL ends the table */
+    const char *method;       /* field or method name given to the Get*FieldID()/Get*MethodID() lookups */
+    const char *signature;    /* JNI signature given to the same lookups */
+    enum FFJniFieldType type; /* selects which kind of lookup is performed */
+    int offset;               /* byte offset inside the jfields structure where the result is stored */
+    int mandatory;            /* non-zero: a failed lookup makes ff_jni_init_jfields() fail */
+
+};
+
+/*
+ * Retrieve class references, field ids and method ids to an arbitrary structure.
+ *
+ * @param env JNI environment
+ * @param jfields a pointer to an arbitrary structure where the different
+ * fields are declared and where the FFJniField mapping table offsets are
+ * pointing to
+ * @param jfields_mapping null terminated array of FFJniField describing
+ * the class/field/method to be retrieved
+ * @param global make the classes references global. It is the caller
+ * responsibility to properly release global references.
+ * @param log_ctx context used for logging, can be NULL
+ * @return 0 on success, < 0 otherwise
+ */
+int ff_jni_init_jfields(JNIEnv *env, void *jfields, const struct FFJniField *jfields_mapping, int global, void *log_ctx);
+
+/*
+ * Delete class references, field ids and method ids of an arbitrary structure.
+ *
+ * @param env JNI environment
+ * @param jfields a pointer to an arbitrary structure where the different
+ * fields are declared and where the FFJniField mapping table offsets are
+ * pointing to
+ * @param jfields_mapping null terminated array of FFJniField describing
+ * the class/field/method to be deleted
+ * @param global treat the class references as global and delete them
+ * accordingly
+ * @param log_ctx context used for logging, can be NULL
+ * @return 0 on success, < 0 otherwise
+ */
+int ff_jni_reset_jfields(JNIEnv *env, void *jfields, const struct FFJniField *jfields_mapping, int global, void *log_ctx);
+
+#endif /* AVCODEC_FFJNI_H */
diff --git a/libavcodec/fft-fixed-test.c b/libavcodec/fft-fixed-test.c
index d6ea987..330211e 100644
--- a/libavcodec/fft-fixed-test.c
+++ b/libavcodec/fft-fixed-test.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/fft-fixed32-test.c b/libavcodec/fft-fixed32-test.c
new file mode 100644
index 0000000..4bd11ce
--- /dev/null
+++ b/libavcodec/fft-fixed32-test.c
@@ -0,0 +1,21 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define FFT_FLOAT 0
+#define FFT_FIXED_32 1
+#include "fft-test.c"
diff --git a/libavcodec/fft-internal.h b/libavcodec/fft-internal.h
index a449ec0..0a8f7d0 100644
--- a/libavcodec/fft-internal.h
+++ b/libavcodec/fft-internal.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,12 +36,29 @@
 
 #else
 
+#define SCALE_FLOAT(a, bits) lrint((a) * (double)(1 << (bits)))
+
+#if FFT_FIXED_32
+
+#define CMUL(dre, dim, are, aim, bre, bim) do { /* (dre, dim) = (are, aim) * (bre, bim) as complex numbers */ \
+        int64_t accu; /* 64-bit accumulator for the products */ \
+        (accu)  = (int64_t)(bre) * (are);                 \
+        (accu) -= (int64_t)(bim) * (aim); /* real part: bre*are - bim*aim */ \
+        (dre)   = (int)(((accu) + 0x40000000) >> 31); /* add 2^30, then drop 31 bits */ \
+        (accu)  = (int64_t)(bre) * (aim);                 \
+        (accu) += (int64_t)(bim) * (are); /* imaginary part: bre*aim + bim*are */ \
+        (dim)   = (int)(((accu) + 0x40000000) >> 31); /* same rounding shift as for dre */ \
+    } while (0)
+
+#define FIX15(a) av_clip(SCALE_FLOAT(a, 31), -2147483647, 2147483647)
+
+#else /* FFT_FIXED_32 */
+
 #include "fft.h"
 #include "mathops.h"
 
 void ff_mdct_calcw_c(FFTContext *s, FFTDouble *output, const FFTSample *input);
 
-#define SCALE_FLOAT(a, bits) lrint((a) * (double)(1 << (bits)))
 #define FIX15(a) av_clip(SCALE_FLOAT(a, 15), -32767, 32767)
 
 #define sqrthalf ((int16_t)((1<<15)*M_SQRT1_2))
@@ -62,6 +79,8 @@ void ff_mdct_calcw_c(FFTContext *s, FFTDouble *output, const FFTSample *input);
 #define CMULL(dre, dim, are, aim, bre, bim)     \
     CMULS(dre, dim, are, aim, bre, bim, 0)
 
+#endif /* FFT_FIXED_32 */
+
 #endif /* FFT_FLOAT */
 
 #define ff_imdct_calc_c FFT_NAME(ff_imdct_calc_c)
diff --git a/libavcodec/fft-test.c b/libavcodec/fft-test.c
index 83b5546..d647fde 100644
--- a/libavcodec/fft-test.c
+++ b/libavcodec/fft-test.c
@@ -1,20 +1,20 @@
 /*
  * (c) 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -59,6 +59,10 @@
 #define RANGE 1.0
 #define REF_SCALE(x, bits)  (x)
 #define FMT "%10.6f"
+#elif FFT_FIXED_32
+#define RANGE 8388608
+#define REF_SCALE(x, bits) (x)
+#define FMT "%6d"
 #else
 #define RANGE 16384
 #define REF_SCALE(x, bits) ((x) / (1 << (bits)))
@@ -73,7 +77,7 @@ static int fft_ref_init(int nbits, int inverse)
 {
     int i, n = 1 << nbits;
 
-    exptab = av_malloc((n / 2) * sizeof(*exptab));
+    exptab = av_malloc_array((n / 2), sizeof(*exptab));
     if (!exptab)
         return AVERROR(ENOMEM);
 
@@ -150,7 +154,7 @@ static void mdct_ref(FFTSample *output, FFTSample *input, int nbits)
 
 #if FFT_FLOAT
 #if CONFIG_DCT
-static void idct_ref(float *output, float *input, int nbits)
+static void idct_ref(FFTSample *output, FFTSample *input, int nbits)
 {
     int i, k, n = 1 << nbits;
 
@@ -165,7 +169,7 @@ static void idct_ref(float *output, float *input, int nbits)
     }
 }
 
-static void dct_ref(float *output, float *input, int nbits)
+static void dct_ref(FFTSample *output, FFTSample *input, int nbits)
 {
     int i, k, n = 1 << nbits;
 
@@ -203,7 +207,7 @@ static int check_diff(FFTSample *tab1, FFTSample *tab2, int n, double scale)
         if (e > max)
             max = e;
     }
-    av_log(NULL, AV_LOG_INFO, "max:%f e:%g\n", max, sqrt(error) / n);
+    av_log(NULL, AV_LOG_INFO, "max:%f e:%g\n", max, sqrt(error / n));
     return err;
 }
 
@@ -281,20 +285,22 @@ int main(int argc, char **argv)
             break;
         case 'c':
         {
-            int cpuflags = av_parse_cpu_flags(optarg);
-            if (cpuflags < 0)
+            unsigned cpuflags = av_get_cpu_flags();
+
+            if (av_parse_cpu_caps(&cpuflags, optarg) < 0)
                 return 1;
-            av_set_cpu_flags_mask(cpuflags);
+
+            av_force_cpu_flags(cpuflags);
             break;
         }
         }
     }
 
     fft_size = 1 << fft_nbits;
-    tab      = av_malloc(fft_size * sizeof(FFTComplex));
-    tab1     = av_malloc(fft_size * sizeof(FFTComplex));
-    tab_ref  = av_malloc(fft_size * sizeof(FFTComplex));
-    tab2     = av_malloc(fft_size * sizeof(FFTSample));
+    tab      = av_malloc_array(fft_size, sizeof(FFTComplex));
+    tab1     = av_malloc_array(fft_size, sizeof(FFTComplex));
+    tab_ref  = av_malloc_array(fft_size, sizeof(FFTComplex));
+    tab2     = av_malloc_array(fft_size, sizeof(FFTSample));
 
     if (!(tab && tab1 && tab_ref && tab2))
         goto cleanup;
@@ -316,22 +322,22 @@ int main(int argc, char **argv)
         else
             av_log(NULL, AV_LOG_INFO, "FFT");
         ff_fft_init(&s, fft_nbits, do_inverse);
-        if (err = fft_ref_init(fft_nbits, do_inverse) < 0)
+        if ((err = fft_ref_init(fft_nbits, do_inverse)) < 0)
             goto cleanup;
         break;
 #if FFT_FLOAT
-#if CONFIG_RDFT
+#    if CONFIG_RDFT
     case TRANSFORM_RDFT:
         if (do_inverse)
             av_log(NULL, AV_LOG_INFO, "IDFT_C2R");
         else
             av_log(NULL, AV_LOG_INFO, "DFT_R2C");
         ff_rdft_init(&r, fft_nbits, do_inverse ? IDFT_C2R : DFT_R2C);
-        if (err = fft_ref_init(fft_nbits, do_inverse) < 0)
+        if ((err = fft_ref_init(fft_nbits, do_inverse)) < 0)
             goto cleanup;
         break;
-#endif /* CONFIG_RDFT */
-#if CONFIG_DCT
+#    endif /* CONFIG_RDFT */
+#    if CONFIG_DCT
     case TRANSFORM_DCT:
         if (do_inverse)
             av_log(NULL, AV_LOG_INFO, "DCT_III");
@@ -339,7 +345,7 @@ int main(int argc, char **argv)
             av_log(NULL, AV_LOG_INFO, "DCT_II");
         ff_dct_init(&d, fft_nbits, do_inverse ? DCT_III : DCT_II);
         break;
-#endif /* CONFIG_DCT */
+#    endif /* CONFIG_DCT */
 #endif /* FFT_FLOAT */
     default:
         av_log(NULL, AV_LOG_ERROR, "Requested transform not supported\n");
@@ -486,16 +492,16 @@ int main(int argc, char **argv)
         ff_fft_end(&s);
         break;
 #if FFT_FLOAT
-#if CONFIG_RDFT
+#    if CONFIG_RDFT
     case TRANSFORM_RDFT:
         ff_rdft_end(&r);
         break;
-#endif /* CONFIG_RDFT */
-#if CONFIG_DCT
+#    endif /* CONFIG_RDFT */
+#    if CONFIG_DCT
     case TRANSFORM_DCT:
         ff_dct_end(&d);
         break;
-#endif /* CONFIG_DCT */
+#    endif /* CONFIG_DCT */
 #endif /* FFT_FLOAT */
     }
 
diff --git a/libavcodec/fft.h b/libavcodec/fft.h
index 57dc17f..c858570 100644
--- a/libavcodec/fft.h
+++ b/libavcodec/fft.h
@@ -2,20 +2,20 @@
  * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,10 @@
 #define FFT_FLOAT 1
 #endif
 
+#ifndef FFT_FIXED_32 /* the including file did not define FFT_FIXED_32 */
+#define FFT_FIXED_32 0 /* default to 0 so the "#if FFT_FIXED_32" test below is false */
+#endif
+
 #include <stdint.h>
 #include "config.h"
 #include "libavutil/mem.h"
@@ -40,15 +44,26 @@ typedef float FFTDouble;
 
 #else
 
+#if FFT_FIXED_32
+/* Q31(x): x * 2^31 plus 0.5, truncated by the int cast. NOTE(review): x >= 1.0 would exceed a 32-bit int -- confirm callers. */
+#define Q31(x) (int)((x)*2147483648.0 + 0.5)
+#define FFT_NAME(x) x ## _fixed_32 /* token-pastes the _fixed_32 suffix onto x */
+
+typedef int32_t FFTSample; /* 32-bit samples here; the #else branch uses int16_t */
+
+#else /* FFT_FIXED_32 */
+
 #define FFT_NAME(x) x ## _fixed
 
 typedef int16_t FFTSample;
-typedef int     FFTDouble;
+
+#endif /* FFT_FIXED_32 */
 
 typedef struct FFTComplex {
-    int16_t re, im;
+    FFTSample re, im;
 } FFTComplex;
 
+typedef int    FFTDouble;
 typedef struct FFTContext FFTContext;
 
 #endif /* FFT_FLOAT */
@@ -95,6 +110,7 @@ struct FFTContext {
     void (*mdct_calcw)(struct FFTContext *s, FFTDouble *output, const FFTSample *input);
     enum fft_permutation_type fft_permutation;
     enum mdct_permutation_type mdct_permutation;
+    uint32_t *revtab32;
 };
 
 #if CONFIG_HARDCODED_TABLES
@@ -119,7 +135,8 @@ extern COSTABLE(8192);
 extern COSTABLE(16384);
 extern COSTABLE(32768);
 extern COSTABLE(65536);
-extern COSTABLE_CONST FFTSample* const FFT_NAME(ff_cos_tabs)[17];
+extern COSTABLE(131072);
+extern COSTABLE_CONST FFTSample* const FFT_NAME(ff_cos_tabs)[18];
 
 #define ff_init_ff_cos_tabs FFT_NAME(ff_init_ff_cos_tabs)
 
@@ -142,6 +159,7 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse);
 void ff_fft_init_aarch64(FFTContext *s);
 void ff_fft_init_x86(FFTContext *s);
 void ff_fft_init_arm(FFTContext *s);
+void ff_fft_init_mips(FFTContext *s);
 void ff_fft_init_ppc(FFTContext *s);
 
 void ff_fft_fixed_init_arm(FFTContext *s);
@@ -154,11 +172,4 @@ void ff_fft_end(FFTContext *s);
 int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale);
 void ff_mdct_end(FFTContext *s);
 
-void ff_mdct_init_aarch64(FFTContext *s);
-void ff_mdct_init_arm(FFTContext *s);
-void ff_mdct_init_ppc(FFTContext *s);
-void ff_mdct_init_x86(FFTContext *s);
-
-void ff_mdct_fixed_init_arm(FFTContext *s);
-
 #endif /* AVCODEC_FFT_H */
diff --git a/libavcodec/fft_fixed.c b/libavcodec/fft_fixed.c
index bad4821..3d3bd2f 100644
--- a/libavcodec/fft_fixed.c
+++ b/libavcodec/fft_fixed.c
@@ -1,20 +1,21 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #define FFT_FLOAT 0
+#define FFT_FIXED_32 0 /* set to 0 before fft_template.c is included */
 #include "fft_template.c"
diff --git a/libavcodec/fft_fixed_32.c b/libavcodec/fft_fixed_32.c
new file mode 100644
index 0000000..fbdbf84
--- /dev/null
+++ b/libavcodec/fft_fixed_32.c
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Stanislav Ocovaj (socovaj@mips.com)
+ *           Goran Cordasic   (goran@mips.com)
+ *           Djordje Pesut    (djordje@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define FFT_FLOAT 0 /* not the floating-point build */
+#define FFT_FIXED_32 1 /* enable the FFT_FIXED_32 variant of the template */
+#include "fft_template.c" /* the implementation comes from the template, configured by the two macros above */
diff --git a/libavcodec/fft_float.c b/libavcodec/fft_float.c
index ed4cffa..73cc98d 100644
--- a/libavcodec/fft_float.c
+++ b/libavcodec/fft_float.c
@@ -1,20 +1,21 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #define FFT_FLOAT 1
+#define FFT_FIXED_32 0 /* set to 0 before fft_template.c is included */
 #include "fft_template.c"
diff --git a/libavcodec/fft_init_table.c b/libavcodec/fft_init_table.c
new file mode 100644
index 0000000..c488018
--- /dev/null
+++ b/libavcodec/fft_init_table.c
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Stanislav Ocovaj (socovaj@mips.com)
+ *           Goran Cordasic   (goran@mips.com)
+ *           Djordje Pesut    (djordje@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * definitions and initialization of the lookup table (LUT) for FFT
+ */
+#include "libavcodec/fft_table.h"
+
+const int32_t ff_w_tab_sr[MAX_FFT_SIZE/(4*16)] = {
+2147483647, 2147483016, 2147481121, 2147477963, 2147473542, 2147467857, 2147460908, 2147452697,
+2147443222, 2147432484, 2147420483, 2147407218, 2147392690, 2147376899, 2147359845, 2147341527,
+2147321946, 2147301102, 2147278995, 2147255625, 2147230991, 2147205094, 2147177934, 2147149511,
+2147119825, 2147088876, 2147056664, 2147023188, 2146988450, 2146952448, 2146915184, 2146876656,
+2146836866, 2146795813, 2146753497, 2146709917, 2146665076, 2146618971, 2146571603, 2146522973,
+2146473080, 2146421924, 2146369505, 2146315824, 2146260881, 2146204674, 2146147205, 2146088474,
+2146028480, 2145967224, 2145904705, 2145840924, 2145775880, 2145709574, 2145642006, 2145573176,
+2145503083, 2145431729, 2145359112, 2145285233, 2145210092, 2145133690, 2145056025, 2144977098,
+2144896910, 2144815460, 2144732748, 2144648774, 2144563539, 2144477042, 2144389283, 2144300264,
+2144209982, 2144118439, 2144025635, 2143931570, 2143836244, 2143739656, 2143641807, 2143542697,
+2143442326, 2143340694, 2143237802, 2143133648, 2143028234, 2142921559, 2142813624, 2142704427,
+2142593971, 2142482254, 2142369276, 2142255039, 2142139541, 2142022783, 2141904764, 2141785486,
+2141664948, 2141543150, 2141420092, 2141295774, 2141170197, 2141043360, 2140915264, 2140785908,
+2140655293, 2140523418, 2140390284, 2140255892, 2140120240, 2139983329, 2139845159, 2139705730,
+2139565043, 2139423097, 2139279892, 2139135429, 2138989708, 2138842728, 2138694490, 2138544994,
+2138394240, 2138242228, 2138088958, 2137934430, 2137778644, 2137621601, 2137463301, 2137303743,
+2137142927, 2136980855, 2136817525, 2136652938, 2136487095, 2136319994, 2136151637, 2135982023,
+2135811153, 2135639026, 2135465642, 2135291003, 2135115107, 2134937956, 2134759548, 2134579885,
+2134398966, 2134216791, 2134033361, 2133848675, 2133662734, 2133475538, 2133287087, 2133097381,
+2132906420, 2132714204, 2132520734, 2132326009, 2132130030, 2131932796, 2131734309, 2131534567,
+2131333572, 2131131322, 2130927819, 2130723062, 2130517052, 2130309789, 2130101272, 2129891502,
+2129680480, 2129468204, 2129254676, 2129039895, 2128823862, 2128606576, 2128388038, 2128168248,
+2127947206, 2127724913, 2127501367, 2127276570, 2127050522, 2126823222, 2126594672, 2126364870,
+2126133817, 2125901514, 2125667960, 2125433155, 2125197100, 2124959795, 2124721240, 2124481435,
+2124240380, 2123998076, 2123754522, 2123509718, 2123263666, 2123016364, 2122767814, 2122518015,
+2122266967, 2122014670, 2121761126, 2121506333, 2121250292, 2120993003, 2120734467, 2120474683,
+2120213651, 2119951372, 2119687847, 2119423074, 2119157054, 2118889788, 2118621275, 2118351516,
+2118080511, 2117808259, 2117534762, 2117260020, 2116984031, 2116706797, 2116428319, 2116148595,
+2115867626, 2115585412, 2115301954, 2115017252, 2114731305, 2114444114, 2114155680, 2113866001,
+2113575080, 2113282914, 2112989506, 2112694855, 2112398960, 2112101824, 2111803444, 2111503822,
+2111202959, 2110900853, 2110597505, 2110292916, 2109987085, 2109680013, 2109371700, 2109062146,
+2108751352, 2108439317, 2108126041, 2107811526, 2107495770, 2107178775, 2106860540, 2106541065,
+2106220352, 2105898399, 2105575208, 2105250778, 2104925109, 2104598202, 2104270057, 2103940674,
+2103610054, 2103278196, 2102945101, 2102610768, 2102275199, 2101938393, 2101600350, 2101261071,
+2100920556, 2100578805, 2100235819, 2099891596, 2099546139, 2099199446, 2098851519, 2098502357,
+2098151960, 2097800329, 2097447464, 2097093365, 2096738032, 2096381466, 2096023667, 2095664635,
+2095304370, 2094942872, 2094580142, 2094216179, 2093850985, 2093484559, 2093116901, 2092748012,
+2092377892, 2092006541, 2091633960, 2091260147, 2090885105, 2090508833, 2090131331, 2089752599,
+2089372638, 2088991448, 2088609029, 2088225381, 2087840505, 2087454400, 2087067068, 2086678508,
+2086288720, 2085897705, 2085505463, 2085111994, 2084717298, 2084321376, 2083924228, 2083525854,
+2083126254, 2082725429, 2082323379, 2081920103, 2081515603, 2081109879, 2080702930, 2080294757,
+2079885360, 2079474740, 2079062896, 2078649830, 2078235540, 2077820028, 2077403294, 2076985338,
+2076566160, 2076145760, 2075724139, 2075301296, 2074877233, 2074451950, 2074025446, 2073597721,
+2073168777, 2072738614, 2072307231, 2071874629, 2071440808, 2071005769, 2070569511, 2070132035,
+2069693342, 2069253430, 2068812302, 2068369957, 2067926394, 2067481616, 2067035621, 2066588410,
+2066139983, 2065690341, 2065239484, 2064787411, 2064334124, 2063879623, 2063423908, 2062966978,
+2062508835, 2062049479, 2061588910, 2061127128, 2060664133, 2060199927, 2059734508, 2059267877,
+2058800036, 2058330983, 2057860719, 2057389244, 2056916560, 2056442665, 2055967560, 2055491246,
+2055013723, 2054534991, 2054055050, 2053573901, 2053091544, 2052607979, 2052123207, 2051637227,
+2051150040, 2050661647, 2050172048, 2049681242, 2049189231, 2048696014, 2048201592, 2047705965,
+2047209133, 2046711097, 2046211857, 2045711414, 2045209767, 2044706916, 2044202863, 2043697608,
+2043191150, 2042683490, 2042174628, 2041664565, 2041153301, 2040640837, 2040127172, 2039612306,
+2039096241, 2038578976, 2038060512, 2037540850, 2037019988, 2036497928, 2035974670, 2035450215,
+2034924562, 2034397712, 2033869665, 2033340422, 2032809982, 2032278347, 2031745516, 2031211490,
+2030676269, 2030139853, 2029602243, 2029063439, 2028523442, 2027982251, 2027439867, 2026896291,
+2026351522, 2025805561, 2025258408, 2024710064, 2024160529, 2023609803, 2023057887, 2022504780,
+2021950484, 2021394998, 2020838323, 2020280460, 2019721407, 2019161167, 2018599739, 2018037123,
+2017473321, 2016908331, 2016342155, 2015774793, 2015206245, 2014636511, 2014065592, 2013493489,
+2012920201, 2012345729, 2011770073, 2011193233, 2010615210, 2010036005, 2009455617, 2008874047,
+2008291295, 2007707362, 2007122248, 2006535953, 2005948478, 2005359822, 2004769987, 2004178973,
+2003586779, 2002993407, 2002398857, 2001803128, 2001206222, 2000608139, 2000008879, 1999408442,
+1998806829, 1998204040, 1997600076, 1996994937, 1996388622, 1995781134, 1995172471, 1994562635,
+1993951625, 1993339442, 1992726087, 1992111559, 1991495860, 1990878989, 1990260946, 1989641733,
+1989021350, 1988399796, 1987777073, 1987153180, 1986528118, 1985901888, 1985274489, 1984645923,
+1984016189, 1983385288, 1982753220, 1982119985, 1981485585, 1980850019, 1980213288, 1979575392,
+1978936331, 1978296106, 1977654717, 1977012165, 1976368450, 1975723572, 1975077532, 1974430331,
+1973781967, 1973132443, 1972481757, 1971829912, 1971176906, 1970522741, 1969867417, 1969210933,
+1968553292, 1967894492, 1967234535, 1966573420, 1965911148, 1965247720, 1964583136, 1963917396,
+1963250501, 1962582451, 1961913246, 1961242888, 1960571375, 1959898709, 1959224890, 1958549919,
+1957873796, 1957196520, 1956518093, 1955838516, 1955157788, 1954475909, 1953792881, 1953108703,
+1952423377, 1951736902, 1951049279, 1950360508, 1949670589, 1948979524, 1948287312, 1947593954,
+1946899451, 1946203802, 1945507008, 1944809070, 1944109987, 1943409761, 1942708392, 1942005880,
+1941302225, 1940597428, 1939891490, 1939184411, 1938476190, 1937766830, 1937056329, 1936344689,
+1935631910, 1934917992, 1934202936, 1933486742, 1932769411, 1932050943, 1931331338, 1930610597,
+1929888720, 1929165708, 1928441561, 1927716279, 1926989864, 1926262315, 1925533633, 1924803818,
+1924072871, 1923340791, 1922607581, 1921873239, 1921137767, 1920401165, 1919663432, 1918924571,
+1918184581, 1917443462, 1916701216, 1915957841, 1915213340, 1914467712, 1913720958, 1912973078,
+1912224073, 1911473942, 1910722688, 1909970309, 1909216806, 1908462181, 1907706433, 1906949562,
+1906191570, 1905432457, 1904672222, 1903910867, 1903148392, 1902384797, 1901620084, 1900854251,
+1900087301, 1899319232, 1898550047, 1897779744, 1897008325, 1896235790, 1895462140, 1894687374,
+1893911494, 1893134500, 1892356392, 1891577171, 1890796837, 1890015391, 1889232832, 1888449163,
+1887664383, 1886878492, 1886091491, 1885303381, 1884514161, 1883723833, 1882932397, 1882139853,
+1881346202, 1880551444, 1879755580, 1878958610, 1878160535, 1877361354, 1876561070, 1875759681,
+1874957189, 1874153594, 1873348897, 1872543097, 1871736196, 1870928194, 1870119091, 1869308888,
+1868497586, 1867685184, 1866871683, 1866057085, 1865241388, 1864424594, 1863606704, 1862787717,
+1861967634, 1861146456, 1860324183, 1859500816, 1858676355, 1857850800, 1857024153, 1856196413,
+1855367581, 1854537657, 1853706643, 1852874538, 1852041343, 1851207059, 1850371686, 1849535224,
+1848697674, 1847859036, 1847019312, 1846178501, 1845336604, 1844493621, 1843649553, 1842804401,
+1841958164, 1841110844, 1840262441, 1839412956, 1838562388, 1837710739, 1836858008, 1836004197,
+1835149306, 1834293336, 1833436286, 1832578158, 1831718951, 1830858668, 1829997307, 1829134869,
+1828271356, 1827406767, 1826541103, 1825674364, 1824806552, 1823937666, 1823067707, 1822196675,
+1821324572, 1820451397, 1819577151, 1818701835, 1817825449, 1816947994, 1816069469, 1815189877,
+1814309216, 1813427489, 1812544694, 1811660833, 1810775906, 1809889915, 1809002858, 1808114737,
+1807225553, 1806335305, 1805443995, 1804551623, 1803658189, 1802763694, 1801868139, 1800971523,
+1800073849, 1799175115, 1798275323, 1797374472, 1796472565, 1795569601, 1794665580, 1793760504,
+1792854372, 1791947186, 1791038946, 1790129652, 1789219305, 1788307905, 1787395453, 1786481950,
+1785567396, 1784651792, 1783735137, 1782817434, 1781898681, 1780978881, 1780058032, 1779136137,
+1778213194, 1777289206, 1776364172, 1775438094, 1774510970, 1773582803, 1772653593, 1771723340,
+1770792044, 1769859707, 1768926328, 1767991909, 1767056450, 1766119952, 1765182414, 1764243838,
+1763304224, 1762363573, 1761421885, 1760479161, 1759535401, 1758590607, 1757644777, 1756697914,
+1755750017, 1754801087, 1753851126, 1752900132, 1751948107, 1750995052, 1750040966, 1749085851,
+1748129707, 1747172535, 1746214334, 1745255107, 1744294853, 1743333573, 1742371267, 1741407936,
+1740443581, 1739478202, 1738511799, 1737544374, 1736575927, 1735606458, 1734635968, 1733664458,
+1732691928, 1731718378, 1730743810, 1729768224, 1728791620, 1727813999, 1726835361, 1725855708,
+1724875040, 1723893357, 1722910659, 1721926948, 1720942225, 1719956488, 1718969740, 1717981981,
+1716993211, 1716003431, 1715012642, 1714020844, 1713028037, 1712034223, 1711039401, 1710043573,
+1709046739, 1708048900, 1707050055, 1706050207, 1705049355, 1704047500, 1703044642, 1702040783,
+1701035922, 1700030061, 1699023199, 1698015339, 1697006479, 1695996621, 1694985765, 1693973912,
+1692961062, 1691947217, 1690932376, 1689916541, 1688899711, 1687881888, 1686863072, 1685843263,
+1684822463, 1683800672, 1682777890, 1681754118, 1680729357, 1679703608, 1678676870, 1677649144,
+1676620432, 1675590733, 1674560049, 1673528379, 1672495725, 1671462087, 1670427466, 1669391862,
+1668355276, 1667317709, 1666279161, 1665239632, 1664199124, 1663157637, 1662115172, 1661071729,
+1660027308, 1658981911, 1657935539, 1656888190, 1655839867, 1654790570, 1653740300, 1652689057,
+1651636841, 1650583654, 1649529496, 1648474367, 1647418269, 1646361202, 1645303166, 1644244162,
+1643184191, 1642123253, 1641061349, 1639998480, 1638934646, 1637869848, 1636804087, 1635737362,
+1634669676, 1633601027, 1632531418, 1631460848, 1630389319, 1629316830, 1628243383, 1627168978,
+1626093616, 1625017297, 1623940023, 1622861793, 1621782608, 1620702469, 1619621377, 1618539332,
+1617456335, 1616372386, 1615287487, 1614201637, 1613114838, 1612027089, 1610938393, 1609848749,
+1608758157, 1607666620, 1606574136, 1605480708, 1604386335, 1603291018, 1602194758, 1601097555,
+1599999411, 1598900325, 1597800299, 1596699333, 1595597428, 1594494583, 1593390801, 1592286082,
+1591180426, 1590073833, 1588966306, 1587857843, 1586748447, 1585638117, 1584526854, 1583414660,
+1582301533, 1581187476, 1580072489, 1578956572, 1577839726, 1576721952, 1575603251, 1574483623,
+1573363068, 1572241588, 1571119183, 1569995854, 1568871601, 1567746425, 1566620327, 1565493307,
+1564365367, 1563236506, 1562106725, 1560976026, 1559844408, 1558711873, 1557578421, 1556444052,
+1555308768, 1554172569, 1553035455, 1551897428, 1550758488, 1549618636, 1548477872, 1547336197,
+1546193612, 1545050118, 1543905714, 1542760402, 1541614183, 1540467057, 1539319024, 1538170087,
+1537020244, 1535869497, 1534717846, 1533565293, 1532411837, 1531257480, 1530102222, 1528946064,
+1527789007, 1526631051, 1525472197, 1524312445, 1523151797, 1521990252, 1520827813, 1519664478,
+1518500250, 1517335128, 1516169114, 1515002208, 1513834411, 1512665723, 1511496145, 1510325678,
+1509154322, 1507982079, 1506808949, 1505634932, 1504460029, 1503284242, 1502107570, 1500930014,
+1499751576, 1498572255, 1497392053, 1496210969, 1495029006, 1493846163, 1492662441, 1491477842,
+1490292364, 1489106011, 1487918781, 1486730675, 1485541696, 1484351842, 1483161115, 1481969516,
+1480777044, 1479583702, 1478389489, 1477194407, 1475998456, 1474801636, 1473603949, 1472405394,
+1471205974, 1470005688, 1468804538, 1467602523, 1466399645, 1465195904, 1463991302, 1462785838,
+1461579514, 1460372329, 1459164286, 1457955385, 1456745625, 1455535009, 1454323536, 1453111208,
+1451898025, 1450683988, 1449469098, 1448253355, 1447036760, 1445819314, 1444601017, 1443381870,
+1442161874, 1440941030, 1439719338, 1438496799, 1437273414, 1436049184, 1434824109, 1433598189,
+1432371426, 1431143821, 1429915374, 1428686085, 1427455956, 1426224988, 1424993180, 1423760534,
+1422527051, 1421292730, 1420057574, 1418821582, 1417584755, 1416347095, 1415108601, 1413869275,
+1412629117, 1411388129, 1410146309, 1408903661, 1407660183, 1406415878, 1405170745, 1403924785,
+1402678000, 1401430389, 1400181954, 1398932695, 1397682613, 1396431709, 1395179984, 1393927438,
+1392674072, 1391419886, 1390164882, 1388909060, 1387652422, 1386394966, 1385136696, 1383877610,
+1382617710, 1381356997, 1380095472, 1378833134, 1377569986, 1376306026, 1375041258, 1373775680,
+1372509294, 1371242101, 1369974101, 1368705296, 1367435685, 1366165269, 1364894050, 1363622028,
+1362349204, 1361075579, 1359801152, 1358525926, 1357249901, 1355973077, 1354695455, 1353417037,
+1352137822, 1350857812, 1349577007, 1348295409, 1347013017, 1345729833, 1344445857, 1343161090,
+1341875533, 1340589187, 1339302052, 1338014129, 1336725419, 1335435923, 1334145641, 1332854574,
+1331562723, 1330270089, 1328976672, 1327682474, 1326387494, 1325091734, 1323795195, 1322497877,
+1321199781, 1319900907, 1318601257, 1317300832, 1315999631, 1314697657, 1313394909, 1312091388,
+1310787095, 1309482032, 1308176198, 1306869594, 1305562222, 1304254082, 1302945174, 1301635500,
+1300325060, 1299013855, 1297701886, 1296389154, 1295075659, 1293761402, 1292446384, 1291130606,
+1289814068, 1288496772, 1287178717, 1285859905, 1284540337, 1283220013, 1281898935, 1280577102,
+1279254516, 1277931177, 1276607086, 1275282245, 1273956653, 1272630312, 1271303222, 1269975384,
+1268646800, 1267317469, 1265987392, 1264656571, 1263325005, 1261992697, 1260659646, 1259325853,
+1257991320, 1256656047, 1255320034, 1253983283, 1252645794, 1251307568, 1249968606, 1248628909,
+1247288478, 1245947312, 1244605414, 1243262783, 1241919421, 1240575329, 1239230506, 1237884955,
+1236538675, 1235191668, 1233843935, 1232495475, 1231146291, 1229796382, 1228445750, 1227094395,
+1225742318, 1224389521, 1223036002, 1221681765, 1220326809, 1218971135, 1217614743, 1216257636,
+1214899813, 1213541275, 1212182024, 1210822059, 1209461382, 1208099993, 1206737894, 1205375085,
+1204011567, 1202647340, 1201282407, 1199916766, 1198550419, 1197183368, 1195815612, 1194447153,
+1193077991, 1191708127, 1190337562, 1188966297, 1187594332, 1186221669, 1184848308, 1183474250,
+1182099496, 1180724046, 1179347902, 1177971064, 1176593533, 1175215310, 1173836395, 1172456790,
+1171076495, 1169695512, 1168313840, 1166931481, 1165548435, 1164164704, 1162780288, 1161395188,
+1160009405, 1158622939, 1157235792, 1155847964, 1154459456, 1153070269, 1151680403, 1150289860,
+1148898640, 1147506745, 1146114174, 1144720929, 1143327011, 1141932420, 1140537158, 1139141224,
+1137744621, 1136347348, 1134949406, 1133550797, 1132151521, 1130751579, 1129350972, 1127949701,
+1126547765, 1125145168, 1123741908, 1122337987, 1120933406, 1119528166, 1118122267, 1116715710,
+1115308496, 1113900627, 1112492101, 1111082922, 1109673089, 1108262603, 1106851465, 1105439676,
+1104027237, 1102614148, 1101200410, 1099786025, 1098370993, 1096955314, 1095538991, 1094122023,
+1092704411, 1091286156, 1089867259, 1088447722, 1087027544, 1085606726, 1084185270, 1082763176,
+1081340445, 1079917078, 1078493076, 1077068439, 1075643169, 1074217266, 1072790730, 1071363564,
+1069935768, 1068507342, 1067078288, 1065648605, 1064218296, 1062787361, 1061355801, 1059923616,
+1058490808, 1057057377, 1055623324, 1054188651, 1052753357, 1051317443, 1049880912, 1048443763,
+1047005996, 1045567615, 1044128617, 1042689006, 1041248781, 1039807944, 1038366495, 1036924436,
+1035481766, 1034038487, 1032594600, 1031150105, 1029705004, 1028259297, 1026812985, 1025366069,
+1023918550, 1022470428, 1021021705, 1019572382, 1018122458, 1016671936, 1015220816, 1013769098,
+1012316784, 1010863875, 1009410370, 1007956272, 1006501581, 1005046298, 1003590424, 1002133959,
+1000676905, 999219262, 997761031, 996302214, 994842810, 993382821, 991922248, 990461091,
+988999351, 987537030, 986074127, 984610645, 983146583, 981681943, 980216726, 978750932,
+977284562, 975817617, 974350098, 972882006, 971413342, 969944106, 968474300, 967003923,
+965532978, 964061465, 962589385, 961116739, 959643527, 958169751, 956695411, 955220508,
+953745043, 952269017, 950792431, 949315286, 947837582, 946359321, 944880503, 943401129,
+941921200, 940440717, 938959681, 937478092, 935995952, 934513261, 933030021, 931546231,
+930061894, 928577010, 927091579, 925605603, 924119082, 922632018, 921144411, 919656262,
+918167572, 916678342, 915188572, 913698265, 912207419, 910716038, 909224120, 907731667,
+906238681, 904745161, 903251110, 901756526, 900261413, 898765769, 897269597, 895772898,
+894275671, 892777918, 891279640, 889780838, 888281512, 886781663, 885281293, 883780402,
+882278992, 880777062, 879274614, 877771649, 876268167, 874764170, 873259659, 871754633,
+870249095, 868743045, 867236484, 865729413, 864221832, 862713743, 861205147, 859696043,
+858186435, 856676321, 855165703, 853654582, 852142959, 850630835, 849118210, 847605086,
+846091463, 844577343, 843062726, 841547612, 840032004, 838515901, 836999305, 835482217,
+833964638, 832446567, 830928007, 829408958, 827889422, 826369398, 824848888, 823327893,
+821806413, 820284450, 818762005, 817239078, 815715670, 814191782, 812667415, 811142571,
+809617249, 808091450, 806565177, 805038429, 803511207, 801983513, 800455346, 798926709,
+797397602, 795868026, 794337982, 792807470, 791276492, 789745049, 788213141, 786680769,
+785147934, 783614638, 782080880, 780546663, 779011986, 777476851, 775941259, 774405210,
+772868706, 771331747, 769794334, 768256469, 766718151, 765179382, 763640164, 762100496,
+760560380, 759019816, 757478806, 755937350, 754395449, 752853105, 751310318, 749767089,
+748223418, 746679308, 745134758, 743589770, 742044345, 740498483, 738952186, 737405453,
+735858287, 734310688, 732762657, 731214195, 729665303, 728115982, 726566232, 725016055,
+723465451, 721914422, 720362968, 718811090, 717258790, 715706067, 714152924, 712599360,
+711045377, 709490976, 707936158, 706380923, 704825272, 703269207, 701712728, 700155836,
+698598533, 697040818, 695482694, 693924160, 692365218, 690805869, 689246113, 687685952,
+686125387, 684564417, 683003045, 681441272, 679879097, 678316522, 676753549, 675190177,
+673626408, 672062243, 670497682, 668932727, 667367379, 665801638, 664235505, 662668981,
+661102068, 659534766, 657967075, 656398998, 654830535, 653261686, 651692453, 650122837,
+648552838, 646982457, 645411696, 643840556, 642269036, 640697139, 639124865, 637552215,
+635979190, 634405791, 632832018, 631257873, 629683357, 628108471, 626533215, 624957590,
+623381598, 621805239, 620228514, 618651424, 617073971, 615496154, 613917975, 612339436,
+610760536, 609181276, 607601658, 606021683, 604441352, 602860664, 601279623, 599698227,
+598116479, 596534378, 594951927, 593369126, 591785976, 590202477, 588618632, 587034440,
+585449903, 583865021, 582279796, 580694229, 579108320, 577522070, 575935480, 574348552,
+572761285, 571173682, 569585743, 567997469, 566408860, 564819919, 563230645, 561641039,
+560051104, 558460839, 556870245, 555279324, 553688076, 552096502, 550504604, 548912382,
+547319836, 545726969, 544133781, 542540273, 540946445, 539352300, 537757837, 536163058,
+534567963, 532972554, 531376831, 529780796, 528184449, 526587791, 524990824, 523393547,
+521795963, 520198072, 518599875, 517001373, 515402566, 513803457, 512204045, 510604332,
+509004318, 507404005, 505803394, 504202485, 502601279, 500999778, 499397982, 497795892,
+496193509, 494590835, 492987869, 491384614, 489781069, 488177236, 486573117, 484968710,
+483364019, 481759043, 480153784, 478548243, 476942419, 475336316, 473729932, 472123270,
+470516330, 468909114, 467301622, 465693854, 464085813, 462477499, 460868912, 459260055,
+457650927, 456041530, 454431865, 452821933, 451211734, 449601270, 447990541, 446379549,
+444768294, 443156777, 441545000, 439932963, 438320667, 436708113, 435095303, 433482236,
+431868915, 430255339, 428641511, 427027430, 425413098, 423798515, 422183684, 420568604,
+418953276, 417337703, 415721883, 414105819, 412489512, 410872962, 409256170, 407639137,
+406021865, 404404353, 402786604, 401168618, 399550396, 397931939, 396313247, 394694323,
+393075166, 391455778, 389836160, 388216313, 386596237, 384975934, 383355404, 381734649,
+380113669, 378492466, 376871039, 375249392, 373627523, 372005435, 370383128, 368760603,
+367137861, 365514903, 363891730, 362268343, 360644742, 359020930, 357396906, 355772673,
+354148230, 352523578, 350898719, 349273654, 347648383, 346022908, 344397230, 342771348,
+341145265, 339518981, 337892498, 336265816, 334638936, 333011859, 331384586, 329757119,
+328129457, 326501602, 324873555, 323245317, 321616889, 319988272, 318359466, 316730474,
+315101295, 313471930, 311842381, 310212649, 308582734, 306952638, 305322361, 303691904,
+302061269, 300430456, 298799466, 297168301, 295536961, 293905447, 292273760, 290641901,
+289009871, 287377671, 285745302, 284112765, 282480061, 280847190, 279214155, 277580955,
+275947592, 274314066, 272680379, 271046532, 269412525, 267778360, 266144038, 264509558,
+262874923, 261240134, 259605191, 257970095, 256334847, 254699448, 253063900, 251428203,
+249792358, 248156366, 246520228, 244883945, 243247518, 241610947, 239974235, 238337382,
+236700388, 235063255, 233425984, 231788575, 230151030, 228513350, 226875535, 225237587,
+223599506, 221961294, 220322951, 218684479, 217045878, 215407149, 213768293, 212129312,
+210490206, 208850976, 207211624, 205572149, 203932553, 202292838, 200653003, 199013051,
+197372981, 195732795, 194092495, 192452080, 190811551, 189170911, 187530159, 185889297,
+184248325, 182607245, 180966058, 179324764, 177683365, 176041861, 174400254, 172758544,
+171116733, 169474820, 167832808, 166190698, 164548489, 162906184, 161263783, 159621287,
+157978697, 156336015, 154693240, 153050374, 151407418, 149764374, 148121241, 146478021,
+144834714, 143191323, 141547847, 139904288, 138260647, 136616925, 134973122, 133329239,
+131685278, 130041240, 128397125, 126752935, 125108670, 123464332, 121819921, 120175438,
+118530885, 116886262, 115241570, 113596810, 111951983, 110307091, 108662134, 107017112,
+105372028, 103726882, 102081675, 100436408,  98791081,  97145697,  95500255,  93854758,
+ 92209205,  90563597,  88917937,  87272224,  85626460,  83980645,  82334782,  80688869,
+ 79042909,  77396903,  75750851,  74104755,  72458615,  70812432,  69166208,  67519943,
+ 65873638,  64227295,  62580914,  60934496,  59288042,  57641553,  55995030,  54348475,
+ 52701887,  51055268,  49408620,  47761942,  46115236,  44468503,  42821744,  41174960,
+ 39528151,  37881320,  36234466,  34587590,  32940695,  31293780,  29646846,  27999895,
+ 26352928,  24705945,  23058947,  21411936,  19764913,  18117878,  16470832,  14823776,
+ 13176712,  11529640,   9882561,   8235476,   6588387,   4941294,   3294197,   1647099
+};
+
+uint16_t ff_fft_offsets_lut[21845]; /* 21845 == number of entries ff_fft_lut_init() writes for size 1 << 17 */
+
+void ff_fft_lut_init(uint16_t *table, int off, int size, int *index)
+{
+    const int half = size >> 1, quarter = size >> 2;
+    if (size < 16) { /* leaf: append off >> 2 at *index and stop recursing */
+        table[(*index)++] = off >> 2;
+        return;
+    }
+    /* Recurse over the first half, then the third and the fourth quarter. */
+    ff_fft_lut_init(table, off,               half,    index);
+    ff_fft_lut_init(table, off + half,        quarter, index);
+    ff_fft_lut_init(table, off + 3 * quarter, quarter, index);
+}
diff --git a/libavcodec/fft_table.h b/libavcodec/fft_table.h
new file mode 100644
index 0000000..ed0a658
--- /dev/null
+++ b/libavcodec/fft_table.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Stanislav Ocovaj (socovaj@mips.com)
+ *           Goran Cordasic   (goran@mips.com)
+ *           Djordje Pesut    (djordje@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * definitions and tables for FFT
+ */
+#ifndef AVCODEC_FFT_TABLE_H
+#define AVCODEC_FFT_TABLE_H
+
+#include "libavcodec/fft.h"
+
+#define MAX_LOG2_NFFT 17 //!< log2 of the maximum allowed FFT size
+#define MAX_FFT_SIZE (1 << MAX_LOG2_NFFT) //!< maximum allowed FFT size
+
+extern const int32_t ff_w_tab_sr[];   //!< w_re/w_im factor table read by fft_calc_c() when FFT_FIXED_32 is set
+extern uint16_t ff_fft_offsets_lut[]; //!< offsets (stored as off >> 2) filled in by ff_fft_lut_init()
+void ff_fft_lut_init(uint16_t *table, int off, int size, int *index); //!< recursively appends offsets to table starting at *index; *index is advanced
+
+#endif /* AVCODEC_FFT_TABLE_H */
diff --git a/libavcodec/fft_template.c b/libavcodec/fft_template.c
index 3642b43..480557f 100644
--- a/libavcodec/fft_template.c
+++ b/libavcodec/fft_template.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2002 Fabrice Bellard
  * Partly based on libdjbfft by D. J. Bernstein
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,10 @@
 #include "fft.h"
 #include "fft-internal.h"
 
+#if FFT_FIXED_32
+#include "fft_table.h"
+#else /* FFT_FIXED_32 */
+
 /* cos(2*pi*x/n) for 0<=x<=n/4, followed by its reverse */
 #if !CONFIG_HARDCODED_TABLES
 COSTABLE(16);
@@ -47,6 +51,7 @@ COSTABLE(8192);
 COSTABLE(16384);
 COSTABLE(32768);
 COSTABLE(65536);
+COSTABLE(131072);
 #endif
 COSTABLE_CONST FFTSample * const FFT_NAME(ff_cos_tabs)[] = {
     NULL, NULL, NULL, NULL,
@@ -63,8 +68,11 @@ COSTABLE_CONST FFTSample * const FFT_NAME(ff_cos_tabs)[] = {
     FFT_NAME(ff_cos_16384),
     FFT_NAME(ff_cos_32768),
     FFT_NAME(ff_cos_65536),
+    FFT_NAME(ff_cos_131072),
 };
 
+#endif /* FFT_FIXED_32 */
+
 static void fft_permute_c(FFTContext *s, FFTComplex *z);
 static void fft_calc_c(FFTContext *s, FFTComplex *z);
 
@@ -81,7 +89,7 @@ static int split_radix_permutation(int i, int n, int inverse)
 
 av_cold void ff_init_ff_cos_tabs(int index)
 {
-#if !CONFIG_HARDCODED_TABLES
+#if (!CONFIG_HARDCODED_TABLES) && (!FFT_FIXED_32)
     int i;
     int m = 1<<index;
     double freq = 2*M_PI/m;
@@ -135,14 +143,23 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
 {
     int i, j, n;
 
-    if (nbits < 2 || nbits > 16)
+    s->revtab = NULL;
+    s->revtab32 = NULL;
+
+    if (nbits < 2 || nbits > 17)
         goto fail;
     s->nbits = nbits;
     n = 1 << nbits;
 
-    s->revtab = av_malloc(n * sizeof(uint16_t));
-    if (!s->revtab)
-        goto fail;
+    if (nbits <= 16) {
+        s->revtab = av_malloc(n * sizeof(uint16_t));
+        if (!s->revtab)
+            goto fail;
+    } else {
+        s->revtab32 = av_malloc(n * sizeof(uint32_t));
+        if (!s->revtab32)
+            goto fail;
+    }
     s->tmp_buf = av_malloc(n * sizeof(FFTComplex));
     if (!s->tmp_buf)
         goto fail;
@@ -151,34 +168,55 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
 
     s->fft_permute = fft_permute_c;
     s->fft_calc    = fft_calc_c;
+#if CONFIG_MDCT
+    s->imdct_calc  = ff_imdct_calc_c;
+    s->imdct_half  = ff_imdct_half_c;
+    s->mdct_calc   = ff_mdct_calc_c;
+#endif
 
+#if FFT_FIXED_32
+    {
+        int n=0;
+        ff_fft_lut_init(ff_fft_offsets_lut, 0, 1 << 17, &n);
+    }
+#else /* FFT_FIXED_32 */
 #if FFT_FLOAT
     if (ARCH_AARCH64) ff_fft_init_aarch64(s);
     if (ARCH_ARM)     ff_fft_init_arm(s);
     if (ARCH_PPC)     ff_fft_init_ppc(s);
     if (ARCH_X86)     ff_fft_init_x86(s);
+    if (CONFIG_MDCT)  s->mdct_calcw = s->mdct_calc;
+    if (HAVE_MIPSFPU) ff_fft_init_mips(s);
 #else
+    if (CONFIG_MDCT)  s->mdct_calcw = ff_mdct_calcw_c;
     if (ARCH_ARM)     ff_fft_fixed_init_arm(s);
 #endif
-
     for(j=4; j<=nbits; j++) {
         ff_init_ff_cos_tabs(j);
     }
+#endif /* FFT_FIXED_32 */
+
 
     if (s->fft_permutation == FF_FFT_PERM_AVX) {
         fft_perm_avx(s);
     } else {
         for(i=0; i<n; i++) {
-            int j = i;
+            int k;
+            j = i;
             if (s->fft_permutation == FF_FFT_PERM_SWAP_LSBS)
                 j = (j&~3) | ((j>>1)&1) | ((j<<1)&2);
-            s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = j;
+            k = -split_radix_permutation(i, n, s->inverse) & (n-1);
+            if (s->revtab)
+                s->revtab[k] = j;
+            if (s->revtab32)
+                s->revtab32[k] = j;
         }
     }
 
     return 0;
  fail:
     av_freep(&s->revtab);
+    av_freep(&s->revtab32);
     av_freep(&s->tmp_buf);
     return -1;
 }
@@ -187,18 +225,184 @@ static void fft_permute_c(FFTContext *s, FFTComplex *z)
 {
     int j, np;
     const uint16_t *revtab = s->revtab;
+    const uint32_t *revtab32 = s->revtab32;
     np = 1 << s->nbits;
     /* TODO: handle split-radix permute in a more optimal way, probably in-place */
-    for(j=0;j<np;j++) s->tmp_buf[revtab[j]] = z[j];
+    if (revtab) {
+        for(j=0;j<np;j++) s->tmp_buf[revtab[j]] = z[j];
+    } else
+        for(j=0;j<np;j++) s->tmp_buf[revtab32[j]] = z[j];
+
     memcpy(z, s->tmp_buf, np * sizeof(FFTComplex));
 }
 
 av_cold void ff_fft_end(FFTContext *s)
 {
     av_freep(&s->revtab);
+    av_freep(&s->revtab32);
     av_freep(&s->tmp_buf);
 }
 
+#if FFT_FIXED_32
+
+static void fft_calc_c(FFTContext *s, FFTComplex *z) {
+    /* In-place 32-bit fixed-point transform of the 1 << s->nbits entries of z, done as successive passes over sub-blocks located via ff_fft_offsets_lut. */
+    int nbits, i, n, num_transforms, offset, step;
+    int n4, n2, n34;
+    FFTSample tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+    FFTComplex *tmpz;
+    const int fft_size = (1 << s->nbits);
+    int64_t accu;
+    /* 0x5555 (21845) >> (17 - nbits) equals the former 0x2aab >> (16 - nbits) for nbits <= 16, without a negative (undefined) shift at nbits == 17. */
+    num_transforms = (0x5555 >> (17 - s->nbits)) | 1;
+    /* First pass: butterflies on groups of four entries. */
+    for (n=0; n<num_transforms; n++){
+        offset = ff_fft_offsets_lut[n] << 2;
+        tmpz = z + offset;
+
+        tmp1 = tmpz[0].re + tmpz[1].re;
+        tmp5 = tmpz[2].re + tmpz[3].re;
+        tmp2 = tmpz[0].im + tmpz[1].im;
+        tmp6 = tmpz[2].im + tmpz[3].im;
+        tmp3 = tmpz[0].re - tmpz[1].re;
+        tmp8 = tmpz[2].im - tmpz[3].im;
+        tmp4 = tmpz[0].im - tmpz[1].im;
+        tmp7 = tmpz[2].re - tmpz[3].re;
+
+        tmpz[0].re = tmp1 + tmp5;
+        tmpz[2].re = tmp1 - tmp5;
+        tmpz[0].im = tmp2 + tmp6;
+        tmpz[2].im = tmp2 - tmp6;
+        tmpz[1].re = tmp3 + tmp8;
+        tmpz[3].re = tmp3 - tmp8;
+        tmpz[1].im = tmp4 - tmp7;
+        tmpz[3].im = tmp4 + tmp7;
+    }
+
+    if (fft_size < 8)
+        return;
+    /* Second pass: groups of eight entries; the same offset table is reused with a larger shift. */
+    num_transforms = (num_transforms >> 1) | 1;
+
+    for (n=0; n<num_transforms; n++){
+        offset = ff_fft_offsets_lut[n] << 3;
+        tmpz = z + offset;
+
+        tmp1 = tmpz[4].re + tmpz[5].re;
+        tmp3 = tmpz[6].re + tmpz[7].re;
+        tmp2 = tmpz[4].im + tmpz[5].im;
+        tmp4 = tmpz[6].im + tmpz[7].im;
+        tmp5 = tmp1 + tmp3;
+        tmp7 = tmp1 - tmp3;
+        tmp6 = tmp2 + tmp4;
+        tmp8 = tmp2 - tmp4;
+
+        tmp1 = tmpz[4].re - tmpz[5].re;
+        tmp2 = tmpz[4].im - tmpz[5].im;
+        tmp3 = tmpz[6].re - tmpz[7].re;
+        tmp4 = tmpz[6].im - tmpz[7].im;
+
+        tmpz[4].re = tmpz[0].re - tmp5;
+        tmpz[0].re = tmpz[0].re + tmp5;
+        tmpz[4].im = tmpz[0].im - tmp6;
+        tmpz[0].im = tmpz[0].im + tmp6;
+        tmpz[6].re = tmpz[2].re - tmp8;
+        tmpz[2].re = tmpz[2].re + tmp8;
+        tmpz[6].im = tmpz[2].im + tmp7;
+        tmpz[2].im = tmpz[2].im - tmp7;
+        /* Multiply by Q31(M_SQRT1_2) in 64 bits, then round: add 0x40000000 and shift right by 31. */
+        accu = (int64_t)Q31(M_SQRT1_2)*(tmp1 + tmp2);
+        tmp5 = (int32_t)((accu + 0x40000000) >> 31);
+        accu = (int64_t)Q31(M_SQRT1_2)*(tmp3 - tmp4);
+        tmp7 = (int32_t)((accu + 0x40000000) >> 31);
+        accu = (int64_t)Q31(M_SQRT1_2)*(tmp2 - tmp1);
+        tmp6 = (int32_t)((accu + 0x40000000) >> 31);
+        accu = (int64_t)Q31(M_SQRT1_2)*(tmp3 + tmp4);
+        tmp8 = (int32_t)((accu + 0x40000000) >> 31);
+        tmp1 = tmp5 + tmp7;
+        tmp3 = tmp5 - tmp7;
+        tmp2 = tmp6 + tmp8;
+        tmp4 = tmp6 - tmp8;
+
+        tmpz[5].re = tmpz[1].re - tmp1;
+        tmpz[1].re = tmpz[1].re + tmp1;
+        tmpz[5].im = tmpz[1].im - tmp2;
+        tmpz[1].im = tmpz[1].im + tmp2;
+        tmpz[7].re = tmpz[3].re - tmp4;
+        tmpz[3].re = tmpz[3].re + tmp4;
+        tmpz[7].im = tmpz[3].im + tmp3;
+        tmpz[3].im = tmpz[3].im - tmp3;
+    }
+    /* Remaining passes: one per nbits value from 4 up to s->nbits; step is the stride into ff_w_tab_sr and halves after each pass. */
+    step = 1 << ((MAX_LOG2_NFFT-4) - 4);
+    n4 = 4;
+    /* NOTE(review): step starts at 512, so it becomes 0 from nbits == 14 onwards -- confirm that sizes above 1 << 13 are really meant to be supported here. */
+    for (nbits=4; nbits<=s->nbits; nbits++){
+        n2  = 2*n4;
+        n34 = 3*n4;
+        num_transforms = (num_transforms >> 1) | 1;
+
+        for (n=0; n<num_transforms; n++){
+            const FFTSample *w_re_ptr = ff_w_tab_sr + step;
+            const FFTSample *w_im_ptr = ff_w_tab_sr + MAX_FFT_SIZE/(4*16) - step;
+            offset = ff_fft_offsets_lut[n] << nbits;
+            tmpz = z + offset;
+            /* Index 0 of each quarter is combined without any table multiply. */
+            tmp5 = tmpz[ n2].re + tmpz[n34].re;
+            tmp1 = tmpz[ n2].re - tmpz[n34].re;
+            tmp6 = tmpz[ n2].im + tmpz[n34].im;
+            tmp2 = tmpz[ n2].im - tmpz[n34].im;
+
+            tmpz[ n2].re = tmpz[ 0].re - tmp5;
+            tmpz[  0].re = tmpz[ 0].re + tmp5;
+            tmpz[ n2].im = tmpz[ 0].im - tmp6;
+            tmpz[  0].im = tmpz[ 0].im + tmp6;
+            tmpz[n34].re = tmpz[n4].re - tmp2;
+            tmpz[ n4].re = tmpz[n4].re + tmp2;
+            tmpz[n34].im = tmpz[n4].im + tmp1;
+            tmpz[ n4].im = tmpz[n4].im - tmp1;
+
+            for (i=1; i<n4; i++){
+                FFTSample w_re = w_re_ptr[0];
+                FFTSample w_im = w_im_ptr[0];
+                accu  = (int64_t)w_re*tmpz[ n2+i].re;
+                accu += (int64_t)w_im*tmpz[ n2+i].im;
+                tmp1 = (int32_t)((accu + 0x40000000) >> 31);
+                accu  = (int64_t)w_re*tmpz[ n2+i].im;
+                accu -= (int64_t)w_im*tmpz[ n2+i].re;
+                tmp2 = (int32_t)((accu + 0x40000000) >> 31);
+                accu  = (int64_t)w_re*tmpz[n34+i].re;
+                accu -= (int64_t)w_im*tmpz[n34+i].im;
+                tmp3 = (int32_t)((accu + 0x40000000) >> 31);
+                accu  = (int64_t)w_re*tmpz[n34+i].im;
+                accu += (int64_t)w_im*tmpz[n34+i].re;
+                tmp4 = (int32_t)((accu + 0x40000000) >> 31);
+
+                tmp5 = tmp1 + tmp3;
+                tmp1 = tmp1 - tmp3;
+                tmp6 = tmp2 + tmp4;
+                tmp2 = tmp2 - tmp4;
+
+                tmpz[ n2+i].re = tmpz[   i].re - tmp5;
+                tmpz[    i].re = tmpz[   i].re + tmp5;
+                tmpz[ n2+i].im = tmpz[   i].im - tmp6;
+                tmpz[    i].im = tmpz[   i].im + tmp6;
+                tmpz[n34+i].re = tmpz[n4+i].re - tmp2;
+                tmpz[ n4+i].re = tmpz[n4+i].re + tmp2;
+                tmpz[n34+i].im = tmpz[n4+i].im + tmp1;
+                tmpz[ n4+i].im = tmpz[n4+i].im - tmp1;
+
+                w_re_ptr += step;
+                w_im_ptr -= step;
+            }
+        }
+        step >>= 1;
+        n4   <<= 1;
+    }
+}
+
+#else /* FFT_FIXED_32 */
+
 #define BUTTERFLIES(a0,a1,a2,a3) {\
     BF(t3, t5, t5, t1);\
     BF(a2.re, a0.re, a0.re, t5);\
@@ -334,13 +538,15 @@ DECL_FFT(8192,4096,2048)
 DECL_FFT(16384,8192,4096)
 DECL_FFT(32768,16384,8192)
 DECL_FFT(65536,32768,16384)
+DECL_FFT(131072,65536,32768)
 
 static void (* const fft_dispatch[])(FFTComplex*) = {
     fft4, fft8, fft16, fft32, fft64, fft128, fft256, fft512, fft1024,
-    fft2048, fft4096, fft8192, fft16384, fft32768, fft65536,
+    fft2048, fft4096, fft8192, fft16384, fft32768, fft65536, fft131072
 };
 
 static void fft_calc_c(FFTContext *s, FFTComplex *z)
 {
     fft_dispatch[s->nbits-2](z);
 }
+#endif /* FFT_FIXED_32 */
diff --git a/libavcodec/ffv1.c b/libavcodec/ffv1.c
index 21d3583..60eb523 100644
--- a/libavcodec/ffv1.c
+++ b/libavcodec/ffv1.c
@@ -1,22 +1,22 @@
 /*
  * FFV1 codec for libavcodec
  *
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2003-2013 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,120 +26,34 @@
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
+#include "libavutil/crc.h"
+#include "libavutil/opt.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/timer.h"
 
 #include "avcodec.h"
-#include "get_bits.h"
-#include "put_bits.h"
+#include "internal.h"
 #include "rangecoder.h"
 #include "golomb.h"
 #include "mathops.h"
 #include "ffv1.h"
 
-const int8_t ffv1_quant5_10bit[256] = {
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,
-     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
-     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
-     1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0,
-};
-
-const int8_t ffv1_quant5[256] = {
-     0,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1,
-};
-
-const int8_t ffv1_quant9_10bit[256] = {
-     0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  2,  2,  2,
-     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,
-     3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
-     3,  3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,
-     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
-     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
-     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
-     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
-    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
-    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
-    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
-    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
-    -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3,
-    -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
-    -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
-    -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -0, -0, -0, -0,
-};
-
-const int8_t ffv1_quant11[256] = {
-     0,  1,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,
-     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
-     4,  4,  4,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
-     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
-     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
-     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
-     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
-     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
-    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
-    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
-    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
-    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
-    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
-    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -4, -4,
-    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
-    -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -1,
-};
-
-const uint8_t ffv1_ver2_state[256] = {
-      0,  10,  10,  10,  10,  16,  16,  16,  28,  16,  16,  29,  42,  49,  20,  49,
-     59,  25,  26,  26,  27,  31,  33,  33,  33,  34,  34,  37,  67,  38,  39,  39,
-     40,  40,  41,  79,  43,  44,  45,  45,  48,  48,  64,  50,  51,  52,  88,  52,
-     53,  74,  55,  57,  58,  58,  74,  60,  101, 61,  62,  84,  66,  66,  68,  69,
-     87,  82,  71,  97,  73,  73,  82,  75,  111, 77,  94,  78,  87,  81,  83,  97,
-     85,  83,  94,  86,  99,  89,  90,  99,  111, 92,  93,  134, 95,  98,  105, 98,
-    105, 110, 102, 108, 102, 118, 103, 106, 106, 113, 109, 112, 114, 112, 116, 125,
-    115, 116, 117, 117, 126, 119, 125, 121, 121, 123, 145, 124, 126, 131, 127, 129,
-    165, 130, 132, 138, 133, 135, 145, 136, 137, 139, 146, 141, 143, 142, 144, 148,
-    147, 155, 151, 149, 151, 150, 152, 157, 153, 154, 156, 168, 158, 162, 161, 160,
-    172, 163, 169, 164, 166, 184, 167, 170, 177, 174, 171, 173, 182, 176, 180, 178,
-    175, 189, 179, 181, 186, 183, 192, 185, 200, 187, 191, 188, 190, 197, 193, 196,
-    197, 194, 195, 196, 198, 202, 199, 201, 210, 203, 207, 204, 205, 206, 208, 214,
-    209, 211, 221, 212, 213, 215, 224, 216, 217, 218, 219, 220, 222, 228, 223, 225,
-    226, 224, 227, 229, 240, 230, 231, 232, 233, 234, 235, 236, 238, 239, 237, 242,
-    241, 243, 242, 244, 245, 246, 247, 248, 249, 250, 251, 252, 252, 253, 254, 255,
-};
-
-
-av_cold int ffv1_common_init(AVCodecContext *avctx)
+av_cold int ff_ffv1_common_init(AVCodecContext *avctx)
 {
     FFV1Context *s = avctx->priv_data;
 
+    if (!avctx->width || !avctx->height)
+        return AVERROR_INVALIDDATA;
+
     s->avctx = avctx;
     s->flags = avctx->flags;
 
-    if (!avctx->width || !avctx->height)
-        return AVERROR_INVALIDDATA;
+    s->picture.f = av_frame_alloc();
+    s->last_picture.f = av_frame_alloc();
+    if (!s->picture.f || !s->last_picture.f)
+        return AVERROR(ENOMEM);
 
     s->width  = avctx->width;
     s->height = avctx->height;
@@ -151,9 +65,9 @@ av_cold int ffv1_common_init(AVCodecContext *avctx)
     return 0;
 }
 
-int ffv1_init_slice_state(FFV1Context *f, FFV1Context *fs)
+av_cold int ff_ffv1_init_slice_state(FFV1Context *f, FFV1Context *fs)
 {
-    int j;
+    int j, i;
 
     fs->plane_count  = f->plane_count;
     fs->transparency = f->transparency;
@@ -162,22 +76,27 @@ int ffv1_init_slice_state(FFV1Context *f, FFV1Context *fs)
 
         if (fs->ac != AC_GOLOMB_RICE) {
             if (!p->state)
-                p->state = av_malloc(CONTEXT_SIZE * p->context_count *
+                p->state = av_malloc_array(p->context_count, CONTEXT_SIZE *
                                      sizeof(uint8_t));
             if (!p->state)
                 return AVERROR(ENOMEM);
         } else {
-            if (!p->vlc_state)
-                p->vlc_state = av_malloc(p->context_count * sizeof(VlcState));
-            if (!p->vlc_state)
-                return AVERROR(ENOMEM);
+            if (!p->vlc_state) {
+                p->vlc_state = av_mallocz_array(p->context_count, sizeof(VlcState));
+                if (!p->vlc_state)
+                    return AVERROR(ENOMEM);
+                for (i = 0; i < p->context_count; i++) {
+                    p->vlc_state[i].error_sum = 4;
+                    p->vlc_state[i].count     = 1;
+                }
+            }
         }
     }
 
     if (fs->ac == AC_RANGE_CUSTOM_TAB) {
         //FIXME only redo if state_transition changed
         for (j = 1; j < 256; j++) {
-            fs->c.one_state[j]        = f->state_transition[j];
+            fs->c. one_state[      j] = f->state_transition[j];
             fs->c.zero_state[256 - j] = 256 - fs->c.one_state[j];
         }
     }
@@ -185,17 +104,25 @@ int ffv1_init_slice_state(FFV1Context *f, FFV1Context *fs)
     return 0;
 }
 
-av_cold int ffv1_init_slice_contexts(FFV1Context *f)
+av_cold int ff_ffv1_init_slices_state(FFV1Context *f)
 {
-    int i, j;
-
-    f->slice_count = f->num_h_slices * f->num_v_slices;
-    if (f->slice_count <= 0) {
-        av_log(f->avctx, AV_LOG_ERROR, "Invalid number of slices\n");
-        return AVERROR(EINVAL);
+    int i, ret;
+    for (i = 0; i < f->max_slice_count; i++) {
+        FFV1Context *fs = f->slice_context[i];
+        if ((ret = ff_ffv1_init_slice_state(f, fs)) < 0)
+            return AVERROR(ENOMEM);
     }
+    return 0;
+}
+
+av_cold int ff_ffv1_init_slice_contexts(FFV1Context *f)
+{
+    int i;
 
-    for (i = 0; i < f->slice_count; i++) {
+    f->max_slice_count = f->num_h_slices * f->num_v_slices;
+    av_assert0(f->max_slice_count > 0);
+
+    for (i = 0; i < f->max_slice_count; i++) {
         int sx          = i % f->num_h_slices;
         int sy          = i / f->num_h_slices;
         int sxs         = f->avctx->width  *  sx      / f->num_h_slices;
@@ -203,6 +130,7 @@ av_cold int ffv1_init_slice_contexts(FFV1Context *f)
         int sys         = f->avctx->height *  sy      / f->num_v_slices;
         int sye         = f->avctx->height * (sy + 1) / f->num_v_slices;
         FFV1Context *fs = av_mallocz(sizeof(*fs));
+
         if (!fs)
             goto memfail;
 
@@ -215,29 +143,29 @@ av_cold int ffv1_init_slice_contexts(FFV1Context *f)
         fs->slice_x      = sxs;
         fs->slice_y      = sys;
 
-        fs->sample_buffer = av_malloc(3 * MAX_PLANES * (fs->width + 6) *
+        fs->sample_buffer = av_malloc_array((fs->width + 6), 3 * MAX_PLANES *
                                       sizeof(*fs->sample_buffer));
         if (!fs->sample_buffer) {
-            av_free(fs);
+            av_freep(&f->slice_context[i]);
             goto memfail;
         }
     }
     return 0;
 
 memfail:
-    for (j = 0; j < i; j++) {
-        av_free(f->slice_context[j]->sample_buffer);
-        av_free(f->slice_context[j]);
+    while(--i >= 0) {
+        av_freep(&f->slice_context[i]->sample_buffer);
+        av_freep(&f->slice_context[i]);
     }
     return AVERROR(ENOMEM);
 }
 
-int ffv1_allocate_initial_states(FFV1Context *f)
+int ff_ffv1_allocate_initial_states(FFV1Context *f)
 {
     int i;
 
     for (i = 0; i < f->quant_table_count; i++) {
-        f->initial_states[i] = av_malloc(f->context_count[i] *
+        f->initial_states[i] = av_malloc_array(f->context_count[i],
                                          sizeof(*f->initial_states[i]));
         if (!f->initial_states[i])
             return AVERROR(ENOMEM);
@@ -247,7 +175,7 @@ int ffv1_allocate_initial_states(FFV1Context *f)
     return 0;
 }
 
-void ffv1_clear_slice_state(FFV1Context *f, FFV1Context *fs)
+void ff_ffv1_clear_slice_state(FFV1Context *f, FFV1Context *fs)
 {
     int i, j;
 
@@ -274,12 +202,21 @@ void ffv1_clear_slice_state(FFV1Context *f, FFV1Context *fs)
     }
 }
 
-av_cold int ffv1_close(AVCodecContext *avctx)
+
+av_cold int ff_ffv1_close(AVCodecContext *avctx)
 {
     FFV1Context *s = avctx->priv_data;
     int i, j;
 
-    for (j = 0; j < s->slice_count; j++) {
+    if (s->picture.f)
+        ff_thread_release_buffer(avctx, &s->picture);
+    av_frame_free(&s->picture.f);
+
+    if (s->last_picture.f)
+        ff_thread_release_buffer(avctx, &s->last_picture);
+    av_frame_free(&s->last_picture.f);
+
+    for (j = 0; j < s->max_slice_count; j++) {
         FFV1Context *fs = s->slice_context[j];
         for (i = 0; i < s->plane_count; i++) {
             PlaneContext *p = &fs->plane[i];
@@ -293,14 +230,14 @@ av_cold int ffv1_close(AVCodecContext *avctx)
     av_freep(&avctx->stats_out);
     for (j = 0; j < s->quant_table_count; j++) {
         av_freep(&s->initial_states[j]);
-        for (i = 0; i < s->slice_count; i++) {
+        for (i = 0; i < s->max_slice_count; i++) {
             FFV1Context *sf = s->slice_context[i];
             av_freep(&sf->rc_stat2[j]);
         }
         av_freep(&s->rc_stat2[j]);
     }
 
-    for (i = 0; i < s->slice_count; i++)
+    for (i = 0; i < s->max_slice_count; i++)
         av_freep(&s->slice_context[i]);
 
     return 0;
diff --git a/libavcodec/ffv1.h b/libavcodec/ffv1.h
index 34370fa..d9398e5 100644
--- a/libavcodec/ffv1.h
+++ b/libavcodec/ffv1.h
@@ -3,32 +3,49 @@
  *
  * Copyright (c) 2003-2012 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_FFV1_H
 #define AVCODEC_FFV1_H
 
-#include <stdint.h>
+/**
+ * @file
+ * FF Video Codec 1 (a lossless codec)
+ */
 
+#include "libavutil/avassert.h"
+#include "libavutil/crc.h"
+#include "libavutil/opt.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/timer.h"
 #include "avcodec.h"
 #include "get_bits.h"
+#include "internal.h"
+#include "mathops.h"
 #include "put_bits.h"
 #include "rangecoder.h"
+#include "thread.h"
+
+#ifdef __INTEL_COMPILER
+#undef av_flatten
+#define av_flatten
+#endif
 
 #define MAX_PLANES 4
 #define CONTEXT_SIZE 32
@@ -39,14 +56,7 @@
 #define AC_GOLOMB_RICE          0
 #define AC_RANGE_DEFAULT_TAB    1
 #define AC_RANGE_CUSTOM_TAB     2
-
-extern const uint8_t ff_log2_run[41];
-
-extern const int8_t ffv1_quant5_10bit[256];
-extern const int8_t ffv1_quant5[256];
-extern const int8_t ffv1_quant9_10bit[256];
-extern const int8_t ffv1_quant11[256];
-extern const uint8_t ffv1_ver2_state[256];
+#define AC_RANGE_DEFAULT_TAB_FORCE -2
 
 typedef struct VlcState {
     int16_t drift;
@@ -75,7 +85,7 @@ typedef struct FFV1Context {
     uint64_t rc_stat[256][2];
     uint64_t (*rc_stat2[MAX_QUANT_TABLES])[32][2];
     int version;
-    int minor_version;
+    int micro_version;
     int width, height;
     int chroma_planes;
     int chroma_h_shift, chroma_v_shift;
@@ -83,13 +93,13 @@ typedef struct FFV1Context {
     int flags;
     int picture_number;
     int key_frame;
-    const AVFrame *frame;
-    AVFrame *last_picture;
+    ThreadFrame picture, last_picture;
+    struct FFV1Context *fsrc;
 
     AVFrame *cur;
     int plane_count;
-    int ac;     // 1 = range coder <-> 0 = golomb rice
-    int ac_byte_count;      // number of bytes used for AC coding
+    int ac;                              ///< 1=range coder <-> 0=golomb rice
+    int ac_byte_count;                   ///< number of bytes used for AC coding
     PlaneContext plane[MAX_PLANES];
     int16_t quant_table[MAX_CONTEXT_INPUTS][256];
     int16_t quant_tables[MAX_QUANT_TABLES][MAX_CONTEXT_INPUTS][256];
@@ -101,6 +111,7 @@ typedef struct FFV1Context {
     int16_t *sample_buffer;
 
     int ec;
+    int intra;
     int slice_damaged;
     int key_frame_ok;
     int context_model;
@@ -113,21 +124,34 @@ typedef struct FFV1Context {
 
     struct FFV1Context *slice_context[MAX_SLICES];
     int slice_count;
+    int max_slice_count;
     int num_v_slices;
     int num_h_slices;
     int slice_width;
     int slice_height;
     int slice_x;
     int slice_y;
+    int slice_reset_contexts;
+    int slice_coding_mode;
+    int slice_rct_by_coef;
+    int slice_rct_ry_coef;
 } FFV1Context;
 
+int ff_ffv1_common_init(AVCodecContext *avctx);
+int ff_ffv1_init_slice_state(FFV1Context *f, FFV1Context *fs);
+int ff_ffv1_init_slices_state(FFV1Context *f);
+int ff_ffv1_init_slice_contexts(FFV1Context *f);
+int ff_ffv1_allocate_initial_states(FFV1Context *f);
+void ff_ffv1_clear_slice_state(FFV1Context *f, FFV1Context *fs);
+int ff_ffv1_close(AVCodecContext *avctx);
+
 static av_always_inline int fold(int diff, int bits)
 {
     if (bits == 8)
         diff = (int8_t)diff;
     else {
         diff +=  1 << (bits  - 1);
-        diff &= (1 <<  bits) - 1;
+        diff  = av_mod_uintp2(diff, bits);
         diff -=  1 << (bits  - 1);
     }
 
@@ -199,11 +223,4 @@ static inline void update_vlc_state(VlcState *const state, const int v)
     state->count = count;
 }
 
-int ffv1_common_init(AVCodecContext *avctx);
-int ffv1_init_slice_state(FFV1Context *f, FFV1Context *fs);
-int ffv1_init_slice_contexts(FFV1Context *f);
-int ffv1_allocate_initial_states(FFV1Context *f);
-void ffv1_clear_slice_state(FFV1Context *f, FFV1Context *fs);
-int ffv1_close(AVCodecContext *avctx);
-
 #endif /* AVCODEC_FFV1_H */
diff --git a/libavcodec/ffv1dec.c b/libavcodec/ffv1dec.c
index 467fd0d..6a932b2 100644
--- a/libavcodec/ffv1dec.c
+++ b/libavcodec/ffv1dec.c
@@ -1,22 +1,22 @@
 /*
  * FFV1 decoder
  *
- * Copyright (c) 2003-2012 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2003-2013 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,15 +26,14 @@
  */
 
 #include "libavutil/avassert.h"
-#include "libavutil/pixdesc.h"
 #include "libavutil/crc.h"
 #include "libavutil/opt.h"
 #include "libavutil/imgutils.h"
+#include "libavutil/pixdesc.h"
 #include "libavutil/timer.h"
 #include "avcodec.h"
 #include "internal.h"
 #include "get_bits.h"
-#include "put_bits.h"
 #include "rangecoder.h"
 #include "golomb.h"
 #include "mathops.h"
@@ -48,8 +47,11 @@ static inline av_flatten int get_symbol_inline(RangeCoder *c, uint8_t *state,
     else {
         int i, e, a;
         e = 0;
-        while (get_rac(c, state + 1 + FFMIN(e, 9))) // 1..10
+        while (get_rac(c, state + 1 + FFMIN(e, 9))) { // 1..10
             e++;
+            if (e > 31)
+                return AVERROR_INVALIDDATA;
+        }
 
         a = 1;
         for (i = e - 1; i >= 0; i--)
@@ -77,8 +79,6 @@ static inline int get_vlc_symbol(GetBitContext *gb, VlcState *const state,
         i += i;
     }
 
-    assert(k <= 8);
-
     v = get_sr_golomb(gb, k, 12, bits);
     ff_dlog(NULL, "v:%d bias:%d error:%d drift:%d count:%d k:%d",
             v, state->bias, state->error_sum, state->drift, state->count, k);
@@ -108,6 +108,19 @@ static av_always_inline void decode_line(FFV1Context *s, int w,
     int run_mode  = 0;
     int run_index = s->run_index;
 
+    if (s->slice_coding_mode == 1) {
+        int i;
+        for (x = 0; x < w; x++) {
+            int v = 0;
+            for (i=0; i<bits; i++) {
+                uint8_t state = 128;
+                v += v + get_rac(c, &state);
+            }
+            sample[1][x] = v;
+        }
+        return;
+    }
+
     for (x = 0; x < w; x++) {
         int diff, context, sign;
 
@@ -162,14 +175,14 @@ static av_always_inline void decode_line(FFV1Context *s, int w,
         if (sign)
             diff = -diff;
 
-        sample[1][x] = (predict(sample[1] + x, sample[0] + x) + diff) &
-                       ((1 << bits) - 1);
+        sample[1][x] = av_mod_uintp2(predict(sample[1] + x, sample[0] + x) + diff, bits);
     }
     s->run_index = run_index;
 }
 
 static void decode_plane(FFV1Context *s, uint8_t *src,
-                         int w, int h, int stride, int plane_index)
+                         int w, int h, int stride, int plane_index,
+                         int pixel_stride)
 {
     int x, y;
     int16_t *sample[2];
@@ -193,31 +206,29 @@ static void decode_plane(FFV1Context *s, uint8_t *src,
         if (s->avctx->bits_per_raw_sample <= 8) {
             decode_line(s, w, sample, plane_index, 8);
             for (x = 0; x < w; x++)
-                src[x + stride * y] = sample[1][x];
+                src[x*pixel_stride + stride * y] = sample[1][x];
         } else {
-            decode_line(s, w, sample, plane_index,
-                        s->avctx->bits_per_raw_sample);
+            decode_line(s, w, sample, plane_index, s->avctx->bits_per_raw_sample);
             if (s->packed_at_lsb) {
-                for (x = 0; x < w; x++)
-                    ((uint16_t *)(src + stride * y))[x] = sample[1][x];
+                for (x = 0; x < w; x++) {
+                    ((uint16_t*)(src + stride*y))[x*pixel_stride] = sample[1][x];
+                }
             } else {
-                for (x = 0; x < w; x++)
-                    ((uint16_t *)(src + stride * y))[x] = sample[1][x] << (16 - s->avctx->bits_per_raw_sample);
+                for (x = 0; x < w; x++) {
+                    ((uint16_t*)(src + stride*y))[x*pixel_stride] = sample[1][x] << (16 - s->avctx->bits_per_raw_sample);
+                }
             }
         }
 // STOP_TIMER("decode-line") }
     }
 }
 
-static void decode_rgb_frame(FFV1Context *s, uint8_t *src[3], int w, int h,
-                             int stride[3])
+static void decode_rgb_frame(FFV1Context *s, uint8_t *src[3], int w, int h, int stride[3])
 {
     int x, y, p;
     int16_t *sample[4][2];
-    int lbd  = s->avctx->bits_per_raw_sample <= 8;
-    int bits = s->avctx->bits_per_raw_sample > 0
-               ? s->avctx->bits_per_raw_sample
-               : 8;
+    int lbd    = s->avctx->bits_per_raw_sample <= 8;
+    int bits   = s->avctx->bits_per_raw_sample > 0 ? s->avctx->bits_per_raw_sample : 8;
     int offset = 1 << bits;
 
     for (x = 0; x < 4; x++) {
@@ -231,17 +242,17 @@ static void decode_rgb_frame(FFV1Context *s, uint8_t *src[3], int w, int h,
 
     for (y = 0; y < h; y++) {
         for (p = 0; p < 3 + s->transparency; p++) {
-            int16_t *temp = sample[p][0]; //FIXME try a normal buffer
+            int16_t *temp = sample[p][0]; // FIXME: try a normal buffer
 
             sample[p][0] = sample[p][1];
             sample[p][1] = temp;
 
-            sample[p][1][-1] = sample[p][0][0];
-            sample[p][0][w]  = sample[p][0][w - 1];
-            if (lbd)
-                decode_line(s, w, sample[p], (p + 1) / 2, 9);
+            sample[p][1][-1]= sample[p][0][0  ];
+            sample[p][0][ w]= sample[p][0][w-1];
+            if (lbd && s->slice_coding_mode == 0)
+                decode_line(s, w, sample[p], (p + 1)/2, 9);
             else
-                decode_line(s, w, sample[p], (p + 1) / 2, bits + 1);
+                decode_line(s, w, sample[p], (p + 1)/2, bits + (s->slice_coding_mode != 1));
         }
         for (x = 0; x < w; x++) {
             int g = sample[0][1][x];
@@ -249,19 +260,20 @@ static void decode_rgb_frame(FFV1Context *s, uint8_t *src[3], int w, int h,
             int r = sample[2][1][x];
             int a = sample[3][1][x];
 
-            b -= offset;
-            r -= offset;
-            g -= (b + r) >> 2;
-            b += g;
-            r += g;
+            if (s->slice_coding_mode != 1) {
+                b -= offset;
+                r -= offset;
+                g -= (b * s->slice_rct_by_coef + r * s->slice_rct_ry_coef) >> 2;
+                b += g;
+                r += g;
+            }
 
             if (lbd)
-                *((uint32_t *)(src[0] + x * 4 + stride[0] * y)) = b +
-                    (g << 8) + (r << 16) + (a << 24);
+                *((uint32_t*)(src[0] + x*4 + stride[0]*y)) = b + (g<<8) + (r<<16) + (a<<24);
             else {
-                *((uint16_t *)(src[0] + x * 2 + stride[0] * y)) = b;
-                *((uint16_t *)(src[1] + x * 2 + stride[1] * y)) = g;
-                *((uint16_t *)(src[2] + x * 2 + stride[2] * y)) = r;
+                *((uint16_t*)(src[0] + x*2 + stride[0]*y)) = b;
+                *((uint16_t*)(src[1] + x*2 + stride[1]*y)) = g;
+                *((uint16_t*)(src[2] + x*2 + stride[2]*y)) = r;
             }
         }
     }
@@ -274,35 +286,29 @@ static int decode_slice_header(FFV1Context *f, FFV1Context *fs)
     unsigned ps, i, context_count;
     memset(state, 128, sizeof(state));
 
-    if (fs->ac == AC_RANGE_CUSTOM_TAB) {
-        for (i = 1; i < 256; i++) {
-            fs->c.one_state[i]        = f->state_transition[i];
-            fs->c.zero_state[256 - i] = 256 - fs->c.one_state[i];
-        }
-    }
+    av_assert0(f->version > 2);
 
-    fs->slice_x      = get_symbol(c, state, 0) * f->width;
-    fs->slice_y      = get_symbol(c, state, 0) * f->height;
-    fs->slice_width  = (get_symbol(c, state, 0) + 1) * f->width + fs->slice_x;
+    fs->slice_x      =  get_symbol(c, state, 0)      * f->width ;
+    fs->slice_y      =  get_symbol(c, state, 0)      * f->height;
+    fs->slice_width  = (get_symbol(c, state, 0) + 1) * f->width  + fs->slice_x;
     fs->slice_height = (get_symbol(c, state, 0) + 1) * f->height + fs->slice_y;
 
-    fs->slice_x     /= f->num_h_slices;
-    fs->slice_y     /= f->num_v_slices;
-    fs->slice_width  = fs->slice_width / f->num_h_slices - fs->slice_x;
-    fs->slice_height = fs->slice_height / f->num_v_slices - fs->slice_y;
-    if ((unsigned)fs->slice_width  > f->width ||
-        (unsigned)fs->slice_height > f->height)
-        return AVERROR_INVALIDDATA;
-    if ((unsigned)fs->slice_x + (uint64_t)fs->slice_width  > f->width ||
-        (unsigned)fs->slice_y + (uint64_t)fs->slice_height > f->height)
-        return AVERROR_INVALIDDATA;
+    fs->slice_x /= f->num_h_slices;
+    fs->slice_y /= f->num_v_slices;
+    fs->slice_width  = fs->slice_width /f->num_h_slices - fs->slice_x;
+    fs->slice_height = fs->slice_height/f->num_v_slices - fs->slice_y;
+    if ((unsigned)fs->slice_width > f->width || (unsigned)fs->slice_height > f->height)
+        return -1;
+    if (    (unsigned)fs->slice_x + (uint64_t)fs->slice_width  > f->width
+         || (unsigned)fs->slice_y + (uint64_t)fs->slice_height > f->height)
+        return -1;
 
     for (i = 0; i < f->plane_count; i++) {
-        PlaneContext *const p = &fs->plane[i];
-        int idx               = get_symbol(c, state, 0);
-        if (idx > (unsigned)f->quant_table_count) {
+        PlaneContext * const p = &fs->plane[i];
+        int idx = get_symbol(c, state, 0);
+        if (idx >= (unsigned)f->quant_table_count) {
             av_log(f->avctx, AV_LOG_ERROR, "quant_table_index out of range\n");
-            return AVERROR_INVALIDDATA;
+            return -1;
         }
         p->quant_table_index = idx;
         memcpy(p->quant_table, f->quant_tables[idx], sizeof(p->quant_table));
@@ -336,65 +342,114 @@ static int decode_slice_header(FFV1Context *f, FFV1Context *fs)
         f->cur->sample_aspect_ratio = (AVRational){ 0, 1 };
     }
 
+    if (fs->version > 3) {
+        fs->slice_reset_contexts = get_rac(c, state);
+        fs->slice_coding_mode = get_symbol(c, state, 0);
+        if (fs->slice_coding_mode != 1) {
+            fs->slice_rct_by_coef = get_symbol(c, state, 0);
+            fs->slice_rct_ry_coef = get_symbol(c, state, 0);
+            if ((uint64_t)fs->slice_rct_by_coef + (uint64_t)fs->slice_rct_ry_coef > 4) {
+                av_log(f->avctx, AV_LOG_ERROR, "slice_rct_y_coef out of range\n");
+                return AVERROR_INVALIDDATA;
+            }
+        }
+    }
+
     return 0;
 }
 
 static int decode_slice(AVCodecContext *c, void *arg)
 {
-    FFV1Context *fs = *(void **)arg;
-    FFV1Context *f  = fs->avctx->priv_data;
+    FFV1Context *fs   = *(void **)arg;
+    FFV1Context *f    = fs->avctx->priv_data;
     int width, height, x, y, ret;
-    const int ps = (av_pix_fmt_desc_get(c->pix_fmt)->flags & AV_PIX_FMT_FLAG_PLANAR)
-                   ? (c->bits_per_raw_sample > 8) + 1
-                   : 4;
-    AVFrame *const p = f->cur;
+    const int ps      = av_pix_fmt_desc_get(c->pix_fmt)->comp[0].step;
+    AVFrame * const p = f->cur;
+    int i, si;
+
+    for( si=0; fs != f->slice_context[si]; si ++)
+        ;
+
+    if(f->fsrc && !p->key_frame)
+        ff_thread_await_progress(&f->last_picture, si, 0);
+
+    if(f->fsrc && !p->key_frame) {
+        FFV1Context *fssrc = f->fsrc->slice_context[si];
+        FFV1Context *fsdst = f->slice_context[si];
+        av_assert1(fsdst->plane_count == fssrc->plane_count);
+        av_assert1(fsdst == fs);
+
+        if (!p->key_frame)
+            fsdst->slice_damaged |= fssrc->slice_damaged;
+
+        for (i = 0; i < f->plane_count; i++) {
+            PlaneContext *psrc = &fssrc->plane[i];
+            PlaneContext *pdst = &fsdst->plane[i];
+
+            av_free(pdst->state);
+            av_free(pdst->vlc_state);
+            memcpy(pdst, psrc, sizeof(*pdst));
+            pdst->state = NULL;
+            pdst->vlc_state = NULL;
+
+            if (fssrc->ac) {
+                pdst->state = av_malloc_array(CONTEXT_SIZE,  psrc->context_count);
+                memcpy(pdst->state, psrc->state, CONTEXT_SIZE * psrc->context_count);
+            } else {
+                pdst->vlc_state = av_malloc_array(sizeof(*pdst->vlc_state), psrc->context_count);
+                memcpy(pdst->vlc_state, psrc->vlc_state, sizeof(*pdst->vlc_state) * psrc->context_count);
+            }
+        }
+    }
+
+    fs->slice_rct_by_coef = 1;
+    fs->slice_rct_ry_coef = 1;
 
     if (f->version > 2) {
+        if (ff_ffv1_init_slice_state(f, fs) < 0)
+            return AVERROR(ENOMEM);
         if (decode_slice_header(f, fs) < 0) {
+            fs->slice_x = fs->slice_y = fs->slice_height = fs->slice_width = 0;
             fs->slice_damaged = 1;
             return AVERROR_INVALIDDATA;
         }
     }
-    if ((ret = ffv1_init_slice_state(f, fs)) < 0)
+    if ((ret = ff_ffv1_init_slice_state(f, fs)) < 0)
         return ret;
-    if (f->cur->key_frame)
-        ffv1_clear_slice_state(f, fs);
+    if (f->cur->key_frame || fs->slice_reset_contexts)
+        ff_ffv1_clear_slice_state(f, fs);
+
     width  = fs->slice_width;
     height = fs->slice_height;
     x      = fs->slice_x;
     y      = fs->slice_y;
 
     if (fs->ac == AC_GOLOMB_RICE) {
-        if (f->version == 3 && f->minor_version > 1 || f->version > 3)
+        if (f->version == 3 && f->micro_version > 1 || f->version > 3)
             get_rac(&fs->c, (uint8_t[]) { 129 });
         fs->ac_byte_count = f->version > 2 || (!x && !y) ? fs->c.bytestream - fs->c.bytestream_start - 1 : 0;
-        init_get_bits(&fs->gb, fs->c.bytestream_start + fs->ac_byte_count,
-                      (fs->c.bytestream_end - fs->c.bytestream_start -
-                       fs->ac_byte_count) * 8);
+        init_get_bits(&fs->gb,
+                      fs->c.bytestream_start + fs->ac_byte_count,
+                      (fs->c.bytestream_end - fs->c.bytestream_start - fs->ac_byte_count) * 8);
     }
 
     av_assert1(width && height);
-    if (f->colorspace == 0) {
+    if (f->colorspace == 0 && (f->chroma_planes || !fs->transparency)) {
         const int chroma_width  = AV_CEIL_RSHIFT(width,  f->chroma_h_shift);
         const int chroma_height = AV_CEIL_RSHIFT(height, f->chroma_v_shift);
         const int cx            = x >> f->chroma_h_shift;
         const int cy            = y >> f->chroma_v_shift;
-        decode_plane(fs, p->data[0] + ps * x + y * p->linesize[0], width,
-                     height, p->linesize[0],
-                     0);
+        decode_plane(fs, p->data[0] + ps*x + y*p->linesize[0], width, height, p->linesize[0], 0, 1);
 
         if (f->chroma_planes) {
-            decode_plane(fs, p->data[1] + ps * cx + cy * p->linesize[1],
-                         chroma_width, chroma_height, p->linesize[1],
-                         1);
-            decode_plane(fs, p->data[2] + ps * cx + cy * p->linesize[2],
-                         chroma_width, chroma_height, p->linesize[2],
-                         1);
+            decode_plane(fs, p->data[1] + ps*cx+cy*p->linesize[1], chroma_width, chroma_height, p->linesize[1], 1, 1);
+            decode_plane(fs, p->data[2] + ps*cx+cy*p->linesize[2], chroma_width, chroma_height, p->linesize[2], 1, 1);
         }
         if (fs->transparency)
-            decode_plane(fs, p->data[3] + ps * x + y * p->linesize[3], width,
-                         height, p->linesize[3],
-                         2);
+            decode_plane(fs, p->data[3] + ps*x + y*p->linesize[3], width, height, p->linesize[3], (f->version >= 4 && !f->chroma_planes) ? 1 : 2, 1);
+    } else if (f->colorspace == 0) {
+         decode_plane(fs, p->data[0] + ps*x + y*p->linesize[0]    , width, height, p->linesize[0], 0, 2);
+         decode_plane(fs, p->data[0] + ps*x + y*p->linesize[0] + 1, width, height, p->linesize[0], 1, 2);
     } else {
         uint8_t *planes[3] = { p->data[0] + ps * x + y * p->linesize[0],
                                p->data[1] + ps * x + y * p->linesize[1],
@@ -404,16 +459,17 @@ static int decode_slice(AVCodecContext *c, void *arg)
     if (fs->ac != AC_GOLOMB_RICE && f->version > 2) {
         int v;
         get_rac(&fs->c, (uint8_t[]) { 129 });
-        v = fs->c.bytestream_end - fs->c.bytestream - 2 - 5 * f->ec;
+        v = fs->c.bytestream_end - fs->c.bytestream - 2 - 5*f->ec;
         if (v) {
-            av_log(f->avctx, AV_LOG_ERROR, "bytestream end mismatching by %d\n",
-                   v);
+            av_log(f->avctx, AV_LOG_ERROR, "bytestream end mismatching by %d\n", v);
             fs->slice_damaged = 1;
         }
     }
 
     emms_c();
 
+    ff_thread_report_progress(&f->picture, si, 0);
+
     return 0;
 }
 
@@ -428,8 +484,8 @@ static int read_quant_table(RangeCoder *c, int16_t *quant_table, int scale)
     for (v = 0; i < 128; v++) {
         unsigned len = get_symbol(c, state, 0) + 1;
 
-        if (len > 128 - i)
-            return -1;
+        if (len > 128 - i || !len)
+            return AVERROR_INVALIDDATA;
 
         while (len--) {
             quant_table[i] = scale * v;
@@ -451,9 +507,12 @@ static int read_quant_tables(RangeCoder *c,
     int context_count = 1;
 
     for (i = 0; i < 5; i++) {
-        context_count *= read_quant_table(c, quant_table[i], context_count);
+        int ret = read_quant_table(c, quant_table[i], context_count);
+        if (ret < 0)
+            return ret;
+        context_count *= ret;
         if (context_count > 32768U) {
-            return -1;
+            return AVERROR_INVALIDDATA;
         }
     }
     return (context_count + 1) / 2;
@@ -465,6 +524,7 @@ static int read_extra_header(FFV1Context *f)
     uint8_t state[CONTEXT_SIZE];
     int i, j, k, ret;
     uint8_t state2[32][CONTEXT_SIZE];
+    unsigned crc = 0;
 
     memset(state2, 128, sizeof(state2));
     memset(state, 128, sizeof(state));
@@ -473,9 +533,15 @@ static int read_extra_header(FFV1Context *f)
     ff_build_rac_states(c, 0.05 * (1LL << 32), 256 - 8);
 
     f->version = get_symbol(c, state, 0);
+    if (f->version < 2) {
+        av_log(f->avctx, AV_LOG_ERROR, "Invalid version in global header\n");
+        return AVERROR_INVALIDDATA;
+    }
     if (f->version > 2) {
         c->bytestream_end -= 4;
-        f->minor_version   = get_symbol(c, state, 0);
+        f->micro_version = get_symbol(c, state, 0);
+        if (f->micro_version < 0)
+            return AVERROR_INVALIDDATA;
     }
     f->ac = get_symbol(c, state, 0);
 
@@ -490,19 +556,30 @@ static int read_extra_header(FFV1Context *f)
     f->chroma_h_shift             = get_symbol(c, state, 0);
     f->chroma_v_shift             = get_symbol(c, state, 0);
     f->transparency               = get_rac(c, state);
-    f->plane_count                = 2 + f->transparency;
+    f->plane_count                = 1 + (f->chroma_planes || f->version<4) + f->transparency;
     f->num_h_slices               = 1 + get_symbol(c, state, 0);
     f->num_v_slices               = 1 + get_symbol(c, state, 0);
 
-    if (f->num_h_slices > (unsigned)f->width ||
-        f->num_v_slices > (unsigned)f->height) {
-        av_log(f->avctx, AV_LOG_ERROR, "too many slices\n");
+    if (f->chroma_h_shift > 4U || f->chroma_v_shift > 4U) {
+        av_log(f->avctx, AV_LOG_ERROR, "chroma shift parameters %d %d are invalid\n",
+               f->chroma_h_shift, f->chroma_v_shift);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (f->num_h_slices > (unsigned)f->width  || !f->num_h_slices ||
+        f->num_v_slices > (unsigned)f->height || !f->num_v_slices
+       ) {
+        av_log(f->avctx, AV_LOG_ERROR, "slice count invalid\n");
         return AVERROR_INVALIDDATA;
     }
 
     f->quant_table_count = get_symbol(c, state, 0);
-    if (f->quant_table_count > (unsigned)MAX_QUANT_TABLES)
+    if (f->quant_table_count > (unsigned)MAX_QUANT_TABLES || !f->quant_table_count) {
+        av_log(f->avctx, AV_LOG_ERROR, "quant table count %d is invalid\n", f->quant_table_count);
+        f->quant_table_count = 0;
         return AVERROR_INVALIDDATA;
+    }
+
     for (i = 0; i < f->quant_table_count; i++) {
         f->context_count[i] = read_quant_tables(c, f->quant_tables[i]);
         if (f->context_count[i] < 0) {
@@ -510,7 +587,7 @@ static int read_extra_header(FFV1Context *f)
             return AVERROR_INVALIDDATA;
         }
     }
-    if ((ret = ffv1_allocate_initial_states(f)) < 0)
+    if ((ret = ff_ffv1_allocate_initial_states(f)) < 0)
         return ret;
 
     for (i = 0; i < f->quant_table_count; i++)
@@ -525,46 +602,60 @@ static int read_extra_header(FFV1Context *f)
 
     if (f->version > 2) {
         f->ec = get_symbol(c, state, 0);
+        if (f->micro_version > 2)
+            f->intra = get_symbol(c, state, 0);
     }
 
     if (f->version > 2) {
         unsigned v;
         v = av_crc(av_crc_get_table(AV_CRC_32_IEEE), 0,
                    f->avctx->extradata, f->avctx->extradata_size);
-        if (v) {
+        if (v || f->avctx->extradata_size < 4) {
             av_log(f->avctx, AV_LOG_ERROR, "CRC mismatch %X!\n", v);
             return AVERROR_INVALIDDATA;
         }
+        crc = AV_RB32(f->avctx->extradata + f->avctx->extradata_size - 4);
     }
 
+    if (f->avctx->debug & FF_DEBUG_PICT_INFO)
+        av_log(f->avctx, AV_LOG_DEBUG,
+               "global: ver:%d.%d, coder:%d, colorspace: %d bpr:%d chroma:%d(%d:%d), alpha:%d slices:%dx%d qtabs:%d ec:%d intra:%d CRC:0x%08X\n",
+               f->version, f->micro_version,
+               f->ac,
+               f->colorspace,
+               f->avctx->bits_per_raw_sample,
+               f->chroma_planes, f->chroma_h_shift, f->chroma_v_shift,
+               f->transparency,
+               f->num_h_slices, f->num_v_slices,
+               f->quant_table_count,
+               f->ec,
+               f->intra,
+               crc
+              );
     return 0;
 }
 
-
 static int read_header(FFV1Context *f)
 {
     uint8_t state[CONTEXT_SIZE];
-    int i, j, context_count = -1;
+    int i, j, context_count = -1; //-1 to avoid warning
     RangeCoder *const c = &f->slice_context[0]->c;
 
     memset(state, 128, sizeof(state));
 
     if (f->version < 2) {
         int chroma_planes, chroma_h_shift, chroma_v_shift, transparency, colorspace, bits_per_raw_sample;
-        unsigned v = get_symbol(c, state, 0);
-        if (v > 1) {
-            av_log(f->avctx, AV_LOG_ERROR,
-                   "invalid version %d in version 1 header\n", v);
+        unsigned v= get_symbol(c, state, 0);
+        if (v >= 2) {
+            av_log(f->avctx, AV_LOG_ERROR, "invalid version %d in ver01 header\n", v);
             return AVERROR_INVALIDDATA;
         }
         f->version = v;
-
         f->ac = get_symbol(c, state, 0);
 
         if (f->ac == AC_RANGE_CUSTOM_TAB) {
             for (i = 1; i < 256; i++)
-                f->state_transition[i] =
-                    get_symbol(c, state, 1) + c->one_state[i];
+                f->state_transition[i] = get_symbol(c, state, 1) + c->one_state[i];
         }
 
         colorspace          = get_symbol(c, state, 0); //YUV cs type
@@ -573,6 +664,8 @@ static int read_header(FFV1Context *f)
         chroma_h_shift      = get_symbol(c, state, 0);
         chroma_v_shift      = get_symbol(c, state, 0);
         transparency        = get_rac(c, state);
+        if (colorspace == 0 && f->avctx->skip_alpha)
+            transparency = 0;
 
         if (f->plane_count) {
             if (colorspace          != f->colorspace                 ||
@@ -586,6 +679,12 @@ static int read_header(FFV1Context *f)
             }
         }
 
+        if (chroma_h_shift > 4U || chroma_v_shift > 4U) {
+            av_log(f->avctx, AV_LOG_ERROR, "chroma shift parameters %d %d are invalid\n",
+                   chroma_h_shift, chroma_v_shift);
+            return AVERROR_INVALIDDATA;
+        }
+
         f->colorspace                 = colorspace;
         f->avctx->bits_per_raw_sample = bits_per_raw_sample;
         f->chroma_planes              = chroma_planes;
@@ -602,91 +701,65 @@ static int read_header(FFV1Context *f)
                 f->avctx->pix_fmt = AV_PIX_FMT_GRAY8;
             else
                 f->avctx->pix_fmt = AV_PIX_FMT_GRAY16;
-        } else if (f->avctx->bits_per_raw_sample <= 8 && !f->transparency) {
-            switch (16 * f->chroma_h_shift + f->chroma_v_shift) {
-            case 0x00:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV444P;
-                break;
-            case 0x01:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV440P;
-                break;
-            case 0x10:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV422P;
-                break;
-            case 0x11:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV420P;
-                break;
-            case 0x20:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV411P;
-                break;
-            case 0x22:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV410P;
-                break;
-            default:
-                av_log(f->avctx, AV_LOG_ERROR, "format not supported\n");
+        } else if (f->transparency && !f->chroma_planes) {
+            if (f->avctx->bits_per_raw_sample <= 8)
+                f->avctx->pix_fmt = AV_PIX_FMT_YA8;
+            else
                 return AVERROR(ENOSYS);
+        } else if (f->avctx->bits_per_raw_sample<=8 && !f->transparency) {
+            switch(16 * f->chroma_h_shift + f->chroma_v_shift) {
+            case 0x00: f->avctx->pix_fmt = AV_PIX_FMT_YUV444P; break;
+            case 0x01: f->avctx->pix_fmt = AV_PIX_FMT_YUV440P; break;
+            case 0x10: f->avctx->pix_fmt = AV_PIX_FMT_YUV422P; break;
+            case 0x11: f->avctx->pix_fmt = AV_PIX_FMT_YUV420P; break;
+            case 0x20: f->avctx->pix_fmt = AV_PIX_FMT_YUV411P; break;
+            case 0x22: f->avctx->pix_fmt = AV_PIX_FMT_YUV410P; break;
             }
         } else if (f->avctx->bits_per_raw_sample <= 8 && f->transparency) {
-            switch (16 * f->chroma_h_shift + f->chroma_v_shift) {
-            case 0x00:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUVA444P;
-                break;
-            case 0x10:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUVA422P;
-                break;
-            case 0x11:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUVA420P;
-                break;
-            default:
-                av_log(f->avctx, AV_LOG_ERROR, "format not supported\n");
-                return AVERROR(ENOSYS);
+            switch(16*f->chroma_h_shift + f->chroma_v_shift) {
+            case 0x00: f->avctx->pix_fmt = AV_PIX_FMT_YUVA444P; break;
+            case 0x10: f->avctx->pix_fmt = AV_PIX_FMT_YUVA422P; break;
+            case 0x11: f->avctx->pix_fmt = AV_PIX_FMT_YUVA420P; break;
             }
-        } else if (f->avctx->bits_per_raw_sample == 9) {
+        } else if (f->avctx->bits_per_raw_sample == 9 && !f->transparency) {
             f->packed_at_lsb = 1;
-            switch (16 * f->chroma_h_shift + f->chroma_v_shift) {
-            case 0x00:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV444P9;
-                break;
-            case 0x10:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV422P9;
-                break;
-            case 0x11:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV420P9;
-                break;
-            default:
-                av_log(f->avctx, AV_LOG_ERROR, "format not supported\n");
-                return AVERROR(ENOSYS);
+            switch(16 * f->chroma_h_shift + f->chroma_v_shift) {
+            case 0x00: f->avctx->pix_fmt = AV_PIX_FMT_YUV444P9; break;
+            case 0x10: f->avctx->pix_fmt = AV_PIX_FMT_YUV422P9; break;
+            case 0x11: f->avctx->pix_fmt = AV_PIX_FMT_YUV420P9; break;
             }
-        } else if (f->avctx->bits_per_raw_sample == 10) {
+        } else if (f->avctx->bits_per_raw_sample == 9 && f->transparency) {
             f->packed_at_lsb = 1;
-            switch (16 * f->chroma_h_shift + f->chroma_v_shift) {
-            case 0x00:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV444P10;
-                break;
-            case 0x10:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
-                break;
-            case 0x11:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV420P10;
-                break;
-            default:
-                av_log(f->avctx, AV_LOG_ERROR, "format not supported\n");
-                return AVERROR(ENOSYS);
+            switch(16 * f->chroma_h_shift + f->chroma_v_shift) {
+            case 0x00: f->avctx->pix_fmt = AV_PIX_FMT_YUVA444P9; break;
+            case 0x10: f->avctx->pix_fmt = AV_PIX_FMT_YUVA422P9; break;
+            case 0x11: f->avctx->pix_fmt = AV_PIX_FMT_YUVA420P9; break;
             }
-        } else {
-            switch (16 * f->chroma_h_shift + f->chroma_v_shift) {
-            case 0x00:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV444P16;
-                break;
-            case 0x10:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV422P16;
-                break;
-            case 0x11:
-                f->avctx->pix_fmt = AV_PIX_FMT_YUV420P16;
-                break;
-            default:
-                av_log(f->avctx, AV_LOG_ERROR, "format not supported\n");
-                return AVERROR(ENOSYS);
+        } else if (f->avctx->bits_per_raw_sample == 10 && !f->transparency) {
+            f->packed_at_lsb = 1;
+            switch(16 * f->chroma_h_shift + f->chroma_v_shift) {
+            case 0x00: f->avctx->pix_fmt = AV_PIX_FMT_YUV444P10; break;
+            case 0x10: f->avctx->pix_fmt = AV_PIX_FMT_YUV422P10; break;
+            case 0x11: f->avctx->pix_fmt = AV_PIX_FMT_YUV420P10; break;
+            }
+        } else if (f->avctx->bits_per_raw_sample == 10 && f->transparency) {
+            f->packed_at_lsb = 1;
+            switch(16 * f->chroma_h_shift + f->chroma_v_shift) {
+            case 0x00: f->avctx->pix_fmt = AV_PIX_FMT_YUVA444P10; break;
+            case 0x10: f->avctx->pix_fmt = AV_PIX_FMT_YUVA422P10; break;
+            case 0x11: f->avctx->pix_fmt = AV_PIX_FMT_YUVA420P10; break;
+            }
+        } else if (f->avctx->bits_per_raw_sample == 16 && !f->transparency){
+            switch(16 * f->chroma_h_shift + f->chroma_v_shift) {
+            case 0x00: f->avctx->pix_fmt = AV_PIX_FMT_YUV444P16; break;
+            case 0x10: f->avctx->pix_fmt = AV_PIX_FMT_YUV422P16; break;
+            case 0x11: f->avctx->pix_fmt = AV_PIX_FMT_YUV420P16; break;
+            }
+        } else if (f->avctx->bits_per_raw_sample == 16 && f->transparency){
+            switch(16 * f->chroma_h_shift + f->chroma_v_shift) {
+            case 0x00: f->avctx->pix_fmt = AV_PIX_FMT_YUVA444P16; break;
+            case 0x10: f->avctx->pix_fmt = AV_PIX_FMT_YUVA422P16; break;
+            case 0x11: f->avctx->pix_fmt = AV_PIX_FMT_YUVA420P16; break;
             }
         }
     } else if (f->colorspace == 1) {
@@ -695,27 +768,26 @@ static int read_header(FFV1Context *f)
                    "chroma subsampling not supported in this colorspace\n");
             return AVERROR(ENOSYS);
         }
-        switch (f->avctx->bits_per_raw_sample) {
-        case 0:
-        case 8:
+        if (     f->avctx->bits_per_raw_sample <=  8 && !f->transparency)
+            f->avctx->pix_fmt = AV_PIX_FMT_0RGB32;
+        else if (f->avctx->bits_per_raw_sample <=  8 && f->transparency)
             f->avctx->pix_fmt = AV_PIX_FMT_RGB32;
-            break;
-        case 9:
+        else if (f->avctx->bits_per_raw_sample ==  9 && !f->transparency)
             f->avctx->pix_fmt = AV_PIX_FMT_GBRP9;
-            break;
-        case 10:
+        else if (f->avctx->bits_per_raw_sample == 10 && !f->transparency)
             f->avctx->pix_fmt = AV_PIX_FMT_GBRP10;
-            break;
-        default:
-            av_log(f->avctx, AV_LOG_ERROR,
-                   "bit depth %d not supported\n",
-                   f->avctx->bits_per_raw_sample);
-            return AVERROR(ENOSYS);
-        }
+        else if (f->avctx->bits_per_raw_sample == 12 && !f->transparency)
+            f->avctx->pix_fmt = AV_PIX_FMT_GBRP12;
+        else if (f->avctx->bits_per_raw_sample == 14 && !f->transparency)
+            f->avctx->pix_fmt = AV_PIX_FMT_GBRP14;
     } else {
         av_log(f->avctx, AV_LOG_ERROR, "colorspace not supported\n");
         return AVERROR(ENOSYS);
     }
+    if (f->avctx->pix_fmt == AV_PIX_FMT_NONE) {
+        av_log(f->avctx, AV_LOG_ERROR, "format not supported\n");
+        return AVERROR(ENOSYS);
+    }
 
     ff_dlog(f->avctx, "%d %d %d\n",
             f->chroma_h_shift, f->chroma_v_shift, f->avctx->pix_fmt);
@@ -725,6 +797,7 @@ static int read_header(FFV1Context *f)
             av_log(f->avctx, AV_LOG_ERROR, "read_quant_table error\n");
             return AVERROR_INVALIDDATA;
         }
+        f->slice_count = f->max_slice_count;
     } else if (f->version < 3) {
         f->slice_count = get_symbol(c, state, 0);
     } else {
@@ -732,16 +805,15 @@ static int read_header(FFV1Context *f)
         for (f->slice_count = 0;
              f->slice_count < MAX_SLICES && 3 < p - c->bytestream_start;
              f->slice_count++) {
-            int trailer = 3 + 5 * !!f->ec;
-            int size    = AV_RB24(p - trailer);
+            int trailer = 3 + 5*!!f->ec;
+            int size = AV_RB24(p-trailer);
             if (size + trailer > p - c->bytestream_start)
                 break;
             p -= size + trailer;
         }
     }
-    if (f->slice_count > (unsigned)MAX_SLICES || f->slice_count <= 0) {
-        av_log(f->avctx, AV_LOG_ERROR, "slice count %d is invalid\n",
-               f->slice_count);
+    if (f->slice_count > (unsigned)MAX_SLICES || f->slice_count <= 0 || f->slice_count > f->max_slice_count) {
+        av_log(f->avctx, AV_LOG_ERROR, "slice count %d is invalid (max=%d)\n", f->slice_count, f->max_slice_count);
         return AVERROR_INVALIDDATA;
     }
 
@@ -753,23 +825,20 @@ static int read_header(FFV1Context *f)
         fs->slice_damaged = 0;
 
         if (f->version == 2) {
-            fs->slice_x     = get_symbol(c, state, 0) * f->width;
-            fs->slice_y     = get_symbol(c, state, 0) * f->height;
-            fs->slice_width =
-                (get_symbol(c, state, 0) + 1) * f->width + fs->slice_x;
-            fs->slice_height =
-                (get_symbol(c, state, 0) + 1) * f->height + fs->slice_y;
-
-            fs->slice_x      /= f->num_h_slices;
-            fs->slice_y      /= f->num_v_slices;
+            fs->slice_x      =  get_symbol(c, state, 0)      * f->width ;
+            fs->slice_y      =  get_symbol(c, state, 0)      * f->height;
+            fs->slice_width  = (get_symbol(c, state, 0) + 1) * f->width  + fs->slice_x;
+            fs->slice_height = (get_symbol(c, state, 0) + 1) * f->height + fs->slice_y;
+
+            fs->slice_x     /= f->num_h_slices;
+            fs->slice_y     /= f->num_v_slices;
             fs->slice_width  = fs->slice_width  / f->num_h_slices - fs->slice_x;
             fs->slice_height = fs->slice_height / f->num_v_slices - fs->slice_y;
-            if ((unsigned)fs->slice_width > f->width ||
+            if ((unsigned)fs->slice_width  > f->width ||
                 (unsigned)fs->slice_height > f->height)
                 return AVERROR_INVALIDDATA;
-            if ((unsigned)fs->slice_x + (uint64_t)fs->slice_width > f->width
-                || (unsigned)fs->slice_y + (uint64_t)fs->slice_height >
-                f->height)
+            if (   (unsigned)fs->slice_x + (uint64_t)fs->slice_width  > f->width
+                || (unsigned)fs->slice_y + (uint64_t)fs->slice_height > f->height)
                 return AVERROR_INVALIDDATA;
         }
 
@@ -804,28 +873,26 @@ static int read_header(FFV1Context *f)
     return 0;
 }
 
-static av_cold int ffv1_decode_init(AVCodecContext *avctx)
+static av_cold int decode_init(AVCodecContext *avctx)
 {
     FFV1Context *f = avctx->priv_data;
     int ret;
 
-    ffv1_common_init(avctx);
-
-    f->last_picture = av_frame_alloc();
-    if (!f->last_picture)
-        return AVERROR(ENOMEM);
+    if ((ret = ff_ffv1_common_init(avctx)) < 0)
+        return ret;
 
     if (avctx->extradata && (ret = read_extra_header(f)) < 0)
         return ret;
 
-    if ((ret = ffv1_init_slice_contexts(f)) < 0)
+    if ((ret = ff_ffv1_init_slice_contexts(f)) < 0)
         return ret;
 
+    avctx->internal->allocate_progress = 1;
+
     return 0;
 }
 
-static int ffv1_decode_frame(AVCodecContext *avctx, void *data,
-                             int *got_frame, AVPacket *avpkt)
+static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPacket *avpkt)
 {
     uint8_t *buf        = avpkt->data;
     int buf_size        = avpkt->size;
@@ -834,10 +901,22 @@ static int ffv1_decode_frame(AVCodecContext *avctx, void *data,
     int i, ret;
     uint8_t keystate = 128;
     uint8_t *buf_p;
-    AVFrame *const p    = data;
+    AVFrame *p;
+
+    if (f->last_picture.f)
+        ff_thread_release_buffer(avctx, &f->last_picture);
+    FFSWAP(ThreadFrame, f->picture, f->last_picture);
 
-    f->cur = p;
+    f->cur = p = f->picture.f;
 
+    if (f->version < 3 && avctx->field_order > AV_FIELD_PROGRESSIVE) {
+        /* we have interlaced material flagged in container */
+        p->interlaced_frame = 1;
+        if (avctx->field_order == AV_FIELD_TT || avctx->field_order == AV_FIELD_TB)
+            p->top_field_first = 1;
+    }
+
+    f->avctx = avctx;
     ff_init_range_decoder(c, buf, buf_size);
     ff_build_rac_states(c, 0.05 * (1LL << 32), 256 - 8);
 
@@ -857,29 +936,26 @@ static int ffv1_decode_frame(AVCodecContext *avctx, void *data,
         p->key_frame = 0;
     }
 
-    if ((ret = ff_get_buffer(avctx, p, AV_GET_BUFFER_FLAG_REF)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_thread_get_buffer(avctx, &f->picture, AV_GET_BUFFER_FLAG_REF)) < 0)
         return ret;
-    }
 
     if (avctx->debug & FF_DEBUG_PICT_INFO)
-        av_log(avctx, AV_LOG_DEBUG,
-               "ver:%d keyframe:%d coder:%d ec:%d slices:%d bps:%d\n",
-               f->version, p->key_frame, f->ac, f->ec, f->slice_count,
-               f->avctx->bits_per_raw_sample);
+        av_log(avctx, AV_LOG_DEBUG, "ver:%d keyframe:%d coder:%d ec:%d slices:%d bps:%d\n",
+               f->version, p->key_frame, f->ac, f->ec, f->slice_count, f->avctx->bits_per_raw_sample);
+
+    ff_thread_finish_setup(avctx);
 
     buf_p = buf + buf_size;
     for (i = f->slice_count - 1; i >= 0; i--) {
         FFV1Context *fs = f->slice_context[i];
-        int trailer     = 3 + 5 * !!f->ec;
+        int trailer = 3 + 5*!!f->ec;
         int v;
 
-        if (i || f->version > 2)
-            v = AV_RB24(buf_p - trailer) + trailer;
-        else
-            v = buf_p - c->bytestream_start;
+        if (i || f->version > 2) v = AV_RB24(buf_p-trailer) + trailer;
+        else                     v = buf_p - c->bytestream_start;
         if (buf_p - c->bytestream_start < v) {
             av_log(avctx, AV_LOG_ERROR, "Slice pointer chain broken\n");
+            ff_thread_report_progress(&f->picture, INT_MAX, 0);
             return AVERROR_INVALIDDATA;
         }
         buf_p -= v;
@@ -887,9 +963,20 @@ static int ffv1_decode_frame(AVCodecContext *avctx, void *data,
         if (f->ec) {
             unsigned crc = av_crc(av_crc_get_table(AV_CRC_32_IEEE), 0, buf_p, v);
             if (crc) {
-                av_log(f->avctx, AV_LOG_ERROR, "CRC mismatch %X!\n", crc);
+                int64_t ts = avpkt->pts != AV_NOPTS_VALUE ? avpkt->pts : avpkt->dts;
+                av_log(f->avctx, AV_LOG_ERROR, "CRC mismatch %X!", crc);
+                if (ts != AV_NOPTS_VALUE && avctx->pkt_timebase.num) {
+                    av_log(f->avctx, AV_LOG_ERROR, "at %f seconds\n", ts*av_q2d(avctx->pkt_timebase));
+                } else if (ts != AV_NOPTS_VALUE) {
+                    av_log(f->avctx, AV_LOG_ERROR, "at %"PRId64"\n", ts);
+                } else {
+                    av_log(f->avctx, AV_LOG_ERROR, "\n");
+                }
                 fs->slice_damaged = 1;
             }
+            if (avctx->debug & FF_DEBUG_PICT_INFO) {
+                av_log(avctx, AV_LOG_DEBUG, "slice %d, CRC: 0x%08X\n", i, AV_RB32(buf_p + v - 4));
+            }
         }
 
         if (i) {
@@ -897,57 +984,157 @@ static int ffv1_decode_frame(AVCodecContext *avctx, void *data,
         } else
             fs->c.bytestream_end = buf_p + v;
 
+        fs->avctx = avctx;
         fs->cur = p;
     }
 
-    avctx->execute(avctx, decode_slice, &f->slice_context[0], NULL,
+    avctx->execute(avctx,
+                   decode_slice,
+                   &f->slice_context[0],
+                   NULL,
                    f->slice_count,
-                   sizeof(void *));
+                   sizeof(void*));
 
     for (i = f->slice_count - 1; i >= 0; i--) {
         FFV1Context *fs = f->slice_context[i];
         int j;
-        if (fs->slice_damaged && f->last_picture->data[0]) {
+        if (fs->slice_damaged && f->last_picture.f->data[0]) {
+            const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt);
             const uint8_t *src[4];
             uint8_t *dst[4];
+            ff_thread_await_progress(&f->last_picture, INT_MAX, 0);
             for (j = 0; j < 4; j++) {
+                int pixshift = desc->comp[j].depth > 8;
                 int sh = (j == 1 || j == 2) ? f->chroma_h_shift : 0;
                 int sv = (j == 1 || j == 2) ? f->chroma_v_shift : 0;
                 dst[j] = p->data[j] + p->linesize[j] *
-                         (fs->slice_y >> sv) + (fs->slice_x >> sh);
-                src[j] = f->last_picture->data[j] +
-                         f->last_picture->linesize[j] *
-                         (fs->slice_y >> sv) + (fs->slice_x >> sh);
+                         (fs->slice_y >> sv) + ((fs->slice_x >> sh) << pixshift);
+                src[j] = f->last_picture.f->data[j] + f->last_picture.f->linesize[j] *
+                         (fs->slice_y >> sv) + ((fs->slice_x >> sh) << pixshift);
             }
             av_image_copy(dst, p->linesize, src,
-                          f->last_picture->linesize,
-                          avctx->pix_fmt, fs->slice_width,
+                          f->last_picture.f->linesize,
+                          avctx->pix_fmt,
+                          fs->slice_width,
                           fs->slice_height);
         }
     }
+    ff_thread_report_progress(&f->picture, INT_MAX, 0);
 
     f->picture_number++;
 
-    av_frame_unref(f->last_picture);
-    if ((ret = av_frame_ref(f->last_picture, p)) < 0)
-        return ret;
+    if (f->last_picture.f)
+        ff_thread_release_buffer(avctx, &f->last_picture);
     f->cur = NULL;
+    if ((ret = av_frame_ref(data, f->picture.f)) < 0)
+        return ret;
 
     *got_frame = 1;
 
     return buf_size;
 }
 
-static av_cold int ffv1_decode_close(AVCodecContext *avctx)
+#if HAVE_THREADS
+static int init_thread_copy(AVCodecContext *avctx)
+{
+    FFV1Context *f = avctx->priv_data;
+    int i, ret;
+
+    f->picture.f      = NULL;
+    f->last_picture.f = NULL;
+    f->sample_buffer  = NULL;
+    f->max_slice_count = 0;
+    f->slice_count = 0;
+
+    for (i = 0; i < f->quant_table_count; i++) {
+        av_assert0(f->version > 1);
+        f->initial_states[i] = av_memdup(f->initial_states[i],
+                                         f->context_count[i] * sizeof(*f->initial_states[i]));
+    }
+
+    f->picture.f      = av_frame_alloc();
+    f->last_picture.f = av_frame_alloc();
+
+    if ((ret = ff_ffv1_init_slice_contexts(f)) < 0)
+        return ret;
+
+    return 0;
+}
+#endif
+
+static void copy_fields(FFV1Context *fsdst, FFV1Context *fssrc, FFV1Context *fsrc)
 {
-    FFV1Context *s = avctx->priv_data;;
+    fsdst->version             = fsrc->version;
+    fsdst->micro_version       = fsrc->micro_version;
+    fsdst->chroma_planes       = fsrc->chroma_planes;
+    fsdst->chroma_h_shift      = fsrc->chroma_h_shift;
+    fsdst->chroma_v_shift      = fsrc->chroma_v_shift;
+    fsdst->transparency        = fsrc->transparency;
+    fsdst->plane_count         = fsrc->plane_count;
+    fsdst->ac                  = fsrc->ac;
+    fsdst->colorspace          = fsrc->colorspace;
+
+    fsdst->ec                  = fsrc->ec;
+    fsdst->intra               = fsrc->intra;
+    fsdst->slice_damaged       = fssrc->slice_damaged;
+    fsdst->key_frame_ok        = fsrc->key_frame_ok;
+
+    fsdst->bits_per_raw_sample = fsrc->bits_per_raw_sample;
+    fsdst->packed_at_lsb       = fsrc->packed_at_lsb;
+    fsdst->slice_count         = fsrc->slice_count;
+    if (fsrc->version<3){
+        fsdst->slice_x             = fssrc->slice_x;
+        fsdst->slice_y             = fssrc->slice_y;
+        fsdst->slice_width         = fssrc->slice_width;
+        fsdst->slice_height        = fssrc->slice_height;
+    }
+}
 
-    av_frame_free(&s->last_picture);
+#if HAVE_THREADS
+static int update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
+{
+    FFV1Context *fsrc = src->priv_data;
+    FFV1Context *fdst = dst->priv_data;
+    int i, ret;
 
-    ffv1_close(avctx);
+    if (dst == src)
+        return 0;
+
+    {
+        ThreadFrame picture = fdst->picture, last_picture = fdst->last_picture;
+        uint8_t (*initial_states[MAX_QUANT_TABLES])[32];
+        struct FFV1Context *slice_context[MAX_SLICES];
+        memcpy(initial_states, fdst->initial_states, sizeof(fdst->initial_states));
+        memcpy(slice_context,  fdst->slice_context , sizeof(fdst->slice_context));
+
+        memcpy(fdst, fsrc, sizeof(*fdst));
+        memcpy(fdst->initial_states, initial_states, sizeof(fdst->initial_states));
+        memcpy(fdst->slice_context,  slice_context , sizeof(fdst->slice_context));
+        fdst->picture      = picture;
+        fdst->last_picture = last_picture;
+        for (i = 0; i<fdst->num_h_slices * fdst->num_v_slices; i++) {
+            FFV1Context *fssrc = fsrc->slice_context[i];
+            FFV1Context *fsdst = fdst->slice_context[i];
+            copy_fields(fsdst, fssrc, fsrc);
+        }
+        av_assert0(!fdst->plane[0].state);
+        av_assert0(!fdst->sample_buffer);
+    }
+
+    av_assert1(fdst->max_slice_count == fsrc->max_slice_count);
+
+
+    ff_thread_release_buffer(dst, &fdst->picture);
+    if (fsrc->picture.f->data[0]) {
+        if ((ret = ff_thread_ref_frame(&fdst->picture, &fsrc->picture)) < 0)
+            return ret;
+    }
+
+    fdst->fsrc = fsrc;
 
     return 0;
 }
+#endif
 
 AVCodec ff_ffv1_decoder = {
     .name           = "ffv1",
@@ -955,9 +1142,12 @@ AVCodec ff_ffv1_decoder = {
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_FFV1,
     .priv_data_size = sizeof(FFV1Context),
-    .init           = ffv1_decode_init,
-    .close          = ffv1_decode_close,
-    .decode         = ffv1_decode_frame,
+    .init           = decode_init,
+    .close          = ff_ffv1_close,
+    .decode         = decode_frame,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(init_thread_copy),
+    .update_thread_context = ONLY_IF_THREADS_ENABLED(update_thread_context),
     .capabilities   = AV_CODEC_CAP_DR1 /*| AV_CODEC_CAP_DRAW_HORIZ_BAND*/ |
-                      AV_CODEC_CAP_SLICE_THREADS,
+                      AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_SLICE_THREADS,
+    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP
 };
diff --git a/libavcodec/ffv1enc.c b/libavcodec/ffv1enc.c
index 7995376..948a230 100644
--- a/libavcodec/ffv1enc.c
+++ b/libavcodec/ffv1enc.c
@@ -1,22 +1,22 @@
 /*
- * FFV1 encoder for libavcodec
+ * FFV1 encoder
  *
- * Copyright (c) 2003-2012 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2003-2013 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,10 +27,11 @@
 
 #include "libavutil/attributes.h"
 #include "libavutil/avassert.h"
-#include "libavutil/pixdesc.h"
 #include "libavutil/crc.h"
 #include "libavutil/opt.h"
 #include "libavutil/imgutils.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/timer.h"
 
 #include "avcodec.h"
 #include "internal.h"
@@ -40,6 +41,101 @@
 #include "mathops.h"
 #include "ffv1.h"
 
+static const int8_t quant5_10bit[256] = {
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,
+     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+     1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0,
+};
+
+static const int8_t quant5[256] = {
+     0,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1,
+};
+
+static const int8_t quant9_10bit[256] = {
+     0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  2,  2,  2,
+     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,
+     3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
+     3,  3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,
+     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
+     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
+     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
+     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
+    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+    -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3,
+    -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
+    -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
+    -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -0, -0, -0, -0,
+};
+
+static const int8_t quant11[256] = {
+     0,  1,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,
+     4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
+     4,  4,  4,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
+    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
+    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
+    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
+    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5,
+    -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -4, -4,
+    -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+    -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -1,
+};
+
+static const uint8_t ver2_state[256] = {
+      0,  10,  10,  10,  10,  16,  16,  16, 28,   16,  16,  29,  42,  49,  20,  49,
+     59,  25,  26,  26,  27,  31,  33,  33, 33,   34,  34,  37,  67,  38,  39,  39,
+     40,  40,  41,  79,  43,  44,  45,  45, 48,   48,  64,  50,  51,  52,  88,  52,
+     53,  74,  55,  57,  58,  58,  74,  60, 101,  61,  62,  84,  66,  66,  68,  69,
+     87,  82,  71,  97,  73,  73,  82,  75, 111,  77,  94,  78,  87,  81,  83,  97,
+     85,  83,  94,  86,  99,  89,  90,  99, 111,  92,  93,  134, 95,  98,  105, 98,
+    105, 110, 102, 108, 102, 118, 103, 106, 106, 113, 109, 112, 114, 112, 116, 125,
+    115, 116, 117, 117, 126, 119, 125, 121, 121, 123, 145, 124, 126, 131, 127, 129,
+    165, 130, 132, 138, 133, 135, 145, 136, 137, 139, 146, 141, 143, 142, 144, 148,
+    147, 155, 151, 149, 151, 150, 152, 157, 153, 154, 156, 168, 158, 162, 161, 160,
+    172, 163, 169, 164, 166, 184, 167, 170, 177, 174, 171, 173, 182, 176, 180, 178,
+    175, 189, 179, 181, 186, 183, 192, 185, 200, 187, 191, 188, 190, 197, 193, 196,
+    197, 194, 195, 196, 198, 202, 199, 201, 210, 203, 207, 204, 205, 206, 208, 214,
+    209, 211, 221, 212, 213, 215, 224, 216, 217, 218, 219, 220, 222, 228, 223, 225,
+    226, 224, 227, 229, 240, 230, 231, 232, 233, 234, 235, 236, 238, 239, 237, 242,
+    241, 243, 242, 244, 245, 246, 247, 248, 249, 250, 251, 252, 252, 253, 254, 255,
+};
+
 static void find_best_state(uint8_t best_state[256][256],
                             const uint8_t one_state[256])
 {
@@ -60,12 +156,16 @@ static void find_best_state(uint8_t best_state[256][256],
             double occ[256] = { 0 };
             double len      = 0;
             occ[j] = 1.0;
+
+            if (!one_state[j])
+                continue;
+
             for (k = 0; k < 256; k++) {
                 double newocc[256] = { 0 };
                 for (m = 1; m < 256; m++)
                     if (occ[m]) {
-                        len -= occ[m] *     (p  * l2tab[m] +
-                                        (1 - p) * l2tab[256 - m]);
+                        len -=occ[m]*(     p *l2tab[    m]
+                                      + (1-p)*l2tab[256-m]);
                     }
                 if (len < best_len[k]) {
                     best_len[k]      = len;
@@ -73,7 +173,7 @@ static void find_best_state(uint8_t best_state[256][256],
                 }
                 for (m = 1; m < 256; m++)
                     if (occ[m]) {
-                        newocc[one_state[m]]             += occ[m] * p;
+                        newocc[      one_state[      m]] += occ[m] * p;
                         newocc[256 - one_state[256 - m]] += occ[m] * (1 - p);
                     }
                 memcpy(occ, newocc, sizeof(occ));
@@ -136,6 +236,7 @@ static av_noinline void put_symbol(RangeCoder *c, uint8_t *state,
     put_symbol_inline(c, state, v, is_signed, NULL, NULL);
 }
 
+
 static inline void put_vlc_symbol(PutBitContext *pb, VlcState *const state,
                                   int v, int bits)
 {
@@ -149,7 +250,7 @@ static inline void put_vlc_symbol(PutBitContext *pb, VlcState *const state,
         i += i;
     }
 
-    assert(k <= 13);
+    av_assert2(k <= 13);
 
 #if 0 // JPEG LS
     if (k == 0 && 2 * state->drift <= -state->count)
@@ -179,7 +280,7 @@ static av_always_inline int encode_line(FFV1Context *s, int w,
     int run_mode  = 0;
 
     if (s->ac != AC_GOLOMB_RICE) {
-        if (c->bytestream_end - c->bytestream < w * 20) {
+        if (c->bytestream_end - c->bytestream < w * 35) {
             av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
             return AVERROR_INVALIDDATA;
         }
@@ -190,6 +291,18 @@ static av_always_inline int encode_line(FFV1Context *s, int w,
         }
     }
 
+    if (s->slice_coding_mode == 1) {
+        for (x = 0; x < w; x++) {
+            int i;
+            int v = sample[0][x];
+            for (i = bits-1; i>=0; i--) {
+                uint8_t state = 128;
+                put_rac(c, &state, (v>>i) & 1);
+            }
+        }
+        return 0;
+    }
+
     for (x = 0; x < w; x++) {
         int diff, context;
 
@@ -257,10 +370,10 @@ static av_always_inline int encode_line(FFV1Context *s, int w,
     return 0;
 }
 
-static void encode_plane(FFV1Context *s, uint8_t *src, int w, int h,
-                         int stride, int plane_index)
+static int encode_plane(FFV1Context *s, uint8_t *src, int w, int h,
+                         int stride, int plane_index, int pixel_stride)
 {
-    int x, y, i;
+    int x, y, i, ret;
     const int ring_size = s->context_model ? 3 : 2;
     int16_t *sample[3];
     s->run_index = 0;
@@ -271,38 +384,40 @@ static void encode_plane(FFV1Context *s, uint8_t *src, int w, int h,
         for (i = 0; i < ring_size; i++)
             sample[i] = s->sample_buffer + (w + 6) * ((h + i - y) % ring_size) + 3;
 
-        sample[0][-1] = sample[1][0];
-        sample[1][w]  = sample[1][w - 1];
+        sample[0][-1]= sample[1][0  ];
+        sample[1][ w]= sample[1][w-1];
 // { START_TIMER
         if (s->bits_per_raw_sample <= 8) {
             for (x = 0; x < w; x++)
-                sample[0][x] = src[x + stride * y];
-            encode_line(s, w, sample, plane_index, 8);
+                sample[0][x] = src[x * pixel_stride + stride * y];
+            if((ret = encode_line(s, w, sample, plane_index, 8)) < 0)
+                return ret;
         } else {
             if (s->packed_at_lsb) {
-                for (x = 0; x < w; x++)
-                    sample[0][x] = ((uint16_t *)(src + stride * y))[x];
+                for (x = 0; x < w; x++) {
+                    sample[0][x] = ((uint16_t*)(src + stride*y))[x];
+                }
             } else {
-                for (x = 0; x < w; x++)
-                    sample[0][x] =
-                        ((uint16_t *)(src + stride * y))[x] >> (16 - s->bits_per_raw_sample);
+                for (x = 0; x < w; x++) {
+                    sample[0][x] = ((uint16_t*)(src + stride*y))[x] >> (16 - s->bits_per_raw_sample);
+                }
             }
-            encode_line(s, w, sample, plane_index, s->bits_per_raw_sample);
+            if((ret = encode_line(s, w, sample, plane_index, s->bits_per_raw_sample)) < 0)
+                return ret;
         }
 // STOP_TIMER("encode line") }
     }
+    return 0;
 }
 
-static void encode_rgb_frame(FFV1Context *s, const uint8_t *src[3],
+static int encode_rgb_frame(FFV1Context *s, const uint8_t *src[3],
                              int w, int h, const int stride[3])
 {
     int x, y, p, i;
     const int ring_size = s->context_model ? 3 : 2;
-    int16_t *sample[MAX_PLANES][3];
-    int lbd  = s->avctx->bits_per_raw_sample <= 8;
-    int bits = s->avctx->bits_per_raw_sample > 0
-               ? s->avctx->bits_per_raw_sample
-               : 8;
+    int16_t *sample[4][3];
+    int lbd    = s->bits_per_raw_sample <= 8;
+    int bits   = s->bits_per_raw_sample > 0 ? s->bits_per_raw_sample : 8;
     int offset = 1 << bits;
 
     s->run_index = 0;
@@ -313,29 +428,29 @@ static void encode_rgb_frame(FFV1Context *s, const uint8_t *src[3],
     for (y = 0; y < h; y++) {
         for (i = 0; i < ring_size; i++)
             for (p = 0; p < MAX_PLANES; p++)
-                sample[p][i] = s->sample_buffer + p * ring_size *
-                               (w + 6) +
-                               ((h + i - y) % ring_size) * (w + 6) + 3;
+                sample[p][i]= s->sample_buffer + p*ring_size*(w+6) + ((h+i-y)%ring_size)*(w+6) + 3;
 
         for (x = 0; x < w; x++) {
             int b, g, r, av_uninit(a);
             if (lbd) {
-                unsigned v = *((const uint32_t *)(src[0] + x * 4 + stride[0] * y));
-                b = v & 0xFF;
-                g = (v >> 8) & 0xFF;
+                unsigned v = *((const uint32_t*)(src[0] + x*4 + stride[0]*y));
+                b =  v        & 0xFF;
+                g = (v >>  8) & 0xFF;
                 r = (v >> 16) & 0xFF;
-                a = v >> 24;
+                a =  v >> 24;
             } else {
-                b = *((const uint16_t *)(src[0] + x * 2 + stride[0] * y));
-                g = *((const uint16_t *)(src[1] + x * 2 + stride[1] * y));
-                r = *((const uint16_t *)(src[2] + x * 2 + stride[2] * y));
+                b = *((const uint16_t *)(src[0] + x*2 + stride[0]*y));
+                g = *((const uint16_t *)(src[1] + x*2 + stride[1]*y));
+                r = *((const uint16_t *)(src[2] + x*2 + stride[2]*y));
             }
 
-            b -= g;
-            r -= g;
-            g += (b + r) >> 2;
-            b += offset;
-            r += offset;
+            if (s->slice_coding_mode != 1) {
+                b -= g;
+                r -= g;
+                g += (b * s->slice_rct_by_coef + r * s->slice_rct_ry_coef) >> 2;
+                b += offset;
+                r += offset;
+            }
 
             sample[0][0][x] = g;
             sample[1][0][x] = b;
@@ -343,17 +458,20 @@ static void encode_rgb_frame(FFV1Context *s, const uint8_t *src[3],
             sample[3][0][x] = a;
         }
         for (p = 0; p < 3 + s->transparency; p++) {
-            sample[p][0][-1] = sample[p][1][0];
-            sample[p][1][w]  = sample[p][1][w - 1];
-            if (lbd)
-                encode_line(s, w, sample[p], (p + 1) / 2, 9);
+            int ret;
+            sample[p][0][-1] = sample[p][1][0  ];
+            sample[p][1][ w] = sample[p][1][w-1];
+            if (lbd && s->slice_coding_mode == 0)
+                ret = encode_line(s, w, sample[p], (p + 1) / 2, 9);
             else
-                encode_line(s, w, sample[p], (p + 1) / 2, bits + 1);
+                ret = encode_line(s, w, sample[p], (p + 1) / 2, bits + (s->slice_coding_mode != 1));
+            if (ret < 0)
+                return ret;
         }
     }
+    return 0;
 }
 
-
 static void write_quant_table(RangeCoder *c, int16_t *quant_table)
 {
     int last = 0;
@@ -393,7 +511,7 @@ static void write_header(FFV1Context *f)
                 put_symbol(c, state,
                            f->state_transition[i] - c->one_state[i], 1);
         }
-        put_symbol(c, state, f->colorspace, 0); // YUV cs type
+        put_symbol(c, state, f->colorspace, 0); //YUV cs type
         if (f->version > 0)
             put_symbol(c, state, f->bits_per_raw_sample, 0);
         put_rac(c, state, f->chroma_planes);
@@ -437,15 +555,19 @@ static int write_extradata(FFV1Context *f)
 
     f->avctx->extradata_size = 10000 + 4 +
                                     (11 * 11 * 5 * 5 * 5 + 11 * 11 * 11) * 32;
-    f->avctx->extradata = av_malloc(f->avctx->extradata_size);
+    f->avctx->extradata = av_malloc(f->avctx->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE);
+    if (!f->avctx->extradata)
+        return AVERROR(ENOMEM);
     ff_init_range_encoder(c, f->avctx->extradata, f->avctx->extradata_size);
     ff_build_rac_states(c, 0.05 * (1LL << 32), 256 - 8);
 
     put_symbol(c, state, f->version, 0);
     if (f->version > 2) {
-        if (f->version == 3)
-            f->minor_version = 2;
-        put_symbol(c, state, f->minor_version, 0);
+        if (f->version == 3) {
+            f->micro_version = 4;
+        } else if (f->version == 4)
+            f->micro_version = 2;
+        put_symbol(c, state, f->micro_version, 0);
     }
 
     put_symbol(c, state, f->ac, 0);
@@ -485,12 +607,11 @@ static int write_extradata(FFV1Context *f)
 
     if (f->version > 2) {
         put_symbol(c, state, f->ec, 0);
+        put_symbol(c, state, f->intra = (f->avctx->gop_size < 2), 0);
     }
 
     f->avctx->extradata_size = ff_rac_terminate(c);
-
-    v = av_crc(av_crc_get_table(AV_CRC_32_IEEE), 0,
-               f->avctx->extradata, f->avctx->extradata_size);
+    v = av_crc(av_crc_get_table(AV_CRC_32_IEEE), 0, f->avctx->extradata, f->avctx->extradata_size);
     AV_WL32(f->avctx->extradata + f->avctx->extradata_size, v);
     f->avctx->extradata_size += 4;
 
@@ -515,7 +636,7 @@ static int sort_stt(FFV1Context *s, uint8_t stt[256])
 
                 double size0 = COST2(i,  i) + COST2(i2, i2);
                 double sizeX = COST2(i, i2) + COST2(i2, i);
-                if (sizeX < size0 && i != 128 && i2 != 128) {
+                if (size0 - sizeX > size0*(1e-14) && i != 128 && i2 != 128) {
                     int j;
                     FFSWAP(int, stt[i], stt[i2]);
                     FFSWAP(int, s->rc_stat[i][0], s->rc_stat[i2][0]);
@@ -545,24 +666,14 @@ static int sort_stt(FFV1Context *s, uint8_t stt[256])
     return print;
 }
 
-static av_cold int init_slices_state(FFV1Context *f)
-{
-    int i, ret;
-    for (i = 0; i < f->slice_count; i++) {
-        FFV1Context *fs = f->slice_context[i];
-        if ((ret = ffv1_init_slice_state(f, fs)) < 0)
-            return AVERROR(ENOMEM);
-    }
-    return 0;
-}
-
-static av_cold int ffv1_encode_init(AVCodecContext *avctx)
+static av_cold int encode_init(AVCodecContext *avctx)
 {
     FFV1Context *s = avctx->priv_data;
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt);
     int i, j, k, m, ret;
 
-    ffv1_common_init(avctx);
+    if ((ret = ff_ffv1_common_init(avctx)) < 0)
+        return ret;
 
     s->version = 0;
 
@@ -570,40 +681,58 @@ static av_cold int ffv1_encode_init(AVCodecContext *avctx)
         avctx->slices > 1)
         s->version = FFMAX(s->version, 2);
 
-    if (avctx->level == 3) {
+    // Unspecified level & slices, we choose version 1.2+ to ensure multithreaded decodability
+    if (avctx->slices == 0 && avctx->level < 0 && avctx->width * avctx->height > 720*576)
+        s->version = FFMAX(s->version, 2);
+
+    if (avctx->level <= 0 && s->version == 2) {
         s->version = 3;
     }
+    if (avctx->level >= 0 && avctx->level <= 4) {
+        if (avctx->level < s->version) {
+            av_log(avctx, AV_LOG_ERROR, "Version %d needed for requested features but %d requested\n", s->version, avctx->level);
+            return AVERROR(EINVAL);
+        }
+        s->version = avctx->level;
+    }
 
     if (s->ec < 0) {
         s->ec = (s->version >= 3);
     }
 
-    if (s->version >= 2 &&
-        avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Version %d requested, please set -strict experimental in "
-               "order to enable it\n",
-               s->version);
-        return AVERROR(ENOSYS);
+    if ((s->version == 2 || s->version>3) && avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL) {
+        av_log(avctx, AV_LOG_ERROR, "Version 2 needed for requested features but version 2 is experimental and not enabled\n");
+        return AVERROR_INVALIDDATA;
     }
 
 #if FF_API_CODER_TYPE
 FF_DISABLE_DEPRECATION_WARNINGS
     if (avctx->coder_type != -1)
         s->ac = avctx->coder_type > 0 ? AC_RANGE_CUSTOM_TAB : AC_GOLOMB_RICE;
+    else
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
+    if (s->ac == 1) // Compatibility with common command line usage
+        s->ac = AC_RANGE_CUSTOM_TAB;
+    else if (s->ac == AC_RANGE_DEFAULT_TAB_FORCE)
+        s->ac = AC_RANGE_DEFAULT_TAB;
 
     s->plane_count = 3;
-    switch (avctx->pix_fmt) {
+    switch(avctx->pix_fmt) {
     case AV_PIX_FMT_YUV444P9:
     case AV_PIX_FMT_YUV422P9:
     case AV_PIX_FMT_YUV420P9:
+    case AV_PIX_FMT_YUVA444P9:
+    case AV_PIX_FMT_YUVA422P9:
+    case AV_PIX_FMT_YUVA420P9:
         if (!avctx->bits_per_raw_sample)
             s->bits_per_raw_sample = 9;
     case AV_PIX_FMT_YUV444P10:
     case AV_PIX_FMT_YUV420P10:
     case AV_PIX_FMT_YUV422P10:
+    case AV_PIX_FMT_YUVA444P10:
+    case AV_PIX_FMT_YUVA422P10:
+    case AV_PIX_FMT_YUVA420P10:
         s->packed_at_lsb = 1;
         if (!avctx->bits_per_raw_sample && !s->bits_per_raw_sample)
             s->bits_per_raw_sample = 10;
@@ -611,6 +740,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
     case AV_PIX_FMT_YUV444P16:
     case AV_PIX_FMT_YUV422P16:
     case AV_PIX_FMT_YUV420P16:
+    case AV_PIX_FMT_YUVA444P16:
+    case AV_PIX_FMT_YUVA422P16:
+    case AV_PIX_FMT_YUVA420P16:
         if (!avctx->bits_per_raw_sample && !s->bits_per_raw_sample) {
             s->bits_per_raw_sample = 16;
         } else if (!s->bits_per_raw_sample) {
@@ -627,25 +759,36 @@ FF_ENABLE_DEPRECATION_WARNINGS
         }
         s->version = FFMAX(s->version, 1);
     case AV_PIX_FMT_GRAY8:
+    case AV_PIX_FMT_YA8:
     case AV_PIX_FMT_YUV444P:
     case AV_PIX_FMT_YUV440P:
     case AV_PIX_FMT_YUV422P:
     case AV_PIX_FMT_YUV420P:
     case AV_PIX_FMT_YUV411P:
     case AV_PIX_FMT_YUV410P:
-        s->chroma_planes = desc->nb_components < 3 ? 0 : 1;
-        s->colorspace    = 0;
-        break;
     case AV_PIX_FMT_YUVA444P:
     case AV_PIX_FMT_YUVA422P:
     case AV_PIX_FMT_YUVA420P:
-        s->chroma_planes = 1;
-        s->colorspace    = 0;
-        s->transparency  = 1;
+        s->chroma_planes = desc->nb_components < 3 ? 0 : 1;
+        s->colorspace = 0;
+        s->transparency = desc->nb_components == 4 || desc->nb_components == 2;
+        if (!avctx->bits_per_raw_sample && !s->bits_per_raw_sample)
+            s->bits_per_raw_sample = 8;
+        else if (!s->bits_per_raw_sample)
+            s->bits_per_raw_sample = 8;
         break;
     case AV_PIX_FMT_RGB32:
-        s->colorspace   = 1;
+        s->colorspace = 1;
         s->transparency = 1;
+        s->chroma_planes = 1;
+        if (!avctx->bits_per_raw_sample)
+            s->bits_per_raw_sample = 8;
+        break;
+    case AV_PIX_FMT_0RGB32:
+        s->colorspace = 1;
+        s->chroma_planes = 1;
+        if (!avctx->bits_per_raw_sample)
+            s->bits_per_raw_sample = 8;
         break;
     case AV_PIX_FMT_GBRP9:
         if (!avctx->bits_per_raw_sample)
@@ -653,61 +796,73 @@ FF_ENABLE_DEPRECATION_WARNINGS
     case AV_PIX_FMT_GBRP10:
         if (!avctx->bits_per_raw_sample && !s->bits_per_raw_sample)
             s->bits_per_raw_sample = 10;
-    case AV_PIX_FMT_GBRP16:
+    case AV_PIX_FMT_GBRP12:
         if (!avctx->bits_per_raw_sample && !s->bits_per_raw_sample)
-            s->bits_per_raw_sample = 16;
+            s->bits_per_raw_sample = 12;
+    case AV_PIX_FMT_GBRP14:
+        if (!avctx->bits_per_raw_sample && !s->bits_per_raw_sample)
+            s->bits_per_raw_sample = 14;
         else if (!s->bits_per_raw_sample)
             s->bits_per_raw_sample = avctx->bits_per_raw_sample;
-        s->colorspace    = 1;
+        s->colorspace = 1;
         s->chroma_planes = 1;
-        s->version       = FFMAX(s->version, 1);
+        s->version = FFMAX(s->version, 1);
+        if (s->ac == AC_GOLOMB_RICE) {
+            av_log(avctx, AV_LOG_INFO,
+                   "bits_per_raw_sample > 8, forcing coder 1\n");
+            s->ac = AC_RANGE_CUSTOM_TAB;
+        }
         break;
     default:
         av_log(avctx, AV_LOG_ERROR, "format not supported\n");
-        return AVERROR_INVALIDDATA;
+        return AVERROR(ENOSYS);
     }
+    av_assert0(s->bits_per_raw_sample >= 8);
+
     if (s->transparency) {
-        av_log(
-            avctx, AV_LOG_WARNING,
-            "Storing alpha plane, this will require a recent FFV1 decoder to playback!\n");
+        av_log(avctx, AV_LOG_WARNING, "Storing alpha plane, this will require a recent FFV1 decoder to playback!\n");
     }
 #if FF_API_PRIVATE_OPT
 FF_DISABLE_DEPRECATION_WARNINGS
     if (avctx->context_model)
         s->context_model = avctx->context_model;
     if (avctx->context_model > 1U) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Invalid context model %d, valid values are 0 and 1\n",
-               avctx->context_model);
+        av_log(avctx, AV_LOG_ERROR, "Invalid context model %d, valid values are 0 and 1\n", avctx->context_model);
         return AVERROR(EINVAL);
     }
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
-    if (s->ac == AC_RANGE_CUSTOM_TAB)
+    if (s->ac == AC_RANGE_CUSTOM_TAB) {
         for (i = 1; i < 256; i++)
-            s->state_transition[i] = ffv1_ver2_state[i];
+            s->state_transition[i] = ver2_state[i];
+    } else {
+        RangeCoder c;
+        ff_build_rac_states(&c, 0.05 * (1LL << 32), 256 - 8);
+        for (i = 1; i < 256; i++)
+            s->state_transition[i] = c.one_state[i];
+    }
 
     for (i = 0; i < 256; i++) {
         s->quant_table_count = 2;
         if (s->bits_per_raw_sample <= 8) {
-            s->quant_tables[0][0][i] = ffv1_quant11[i];
-            s->quant_tables[0][1][i] = ffv1_quant11[i] * 11;
-            s->quant_tables[0][2][i] = ffv1_quant11[i] * 11 * 11;
-            s->quant_tables[1][0][i] = ffv1_quant11[i];
-            s->quant_tables[1][1][i] = ffv1_quant11[i] * 11;
-            s->quant_tables[1][2][i] = ffv1_quant5[i]  * 11 * 11;
-            s->quant_tables[1][3][i] = ffv1_quant5[i]  *  5 * 11 * 11;
-            s->quant_tables[1][4][i] = ffv1_quant5[i]  *  5 *  5 * 11 * 11;
+            s->quant_tables[0][0][i]=           quant11[i];
+            s->quant_tables[0][1][i]=        11*quant11[i];
+            s->quant_tables[0][2][i]=     11*11*quant11[i];
+            s->quant_tables[1][0][i]=           quant11[i];
+            s->quant_tables[1][1][i]=        11*quant11[i];
+            s->quant_tables[1][2][i]=     11*11*quant5 [i];
+            s->quant_tables[1][3][i]=   5*11*11*quant5 [i];
+            s->quant_tables[1][4][i]= 5*5*11*11*quant5 [i];
         } else {
-            s->quant_tables[0][0][i] = ffv1_quant9_10bit[i];
-            s->quant_tables[0][1][i] = ffv1_quant9_10bit[i] * 11;
-            s->quant_tables[0][2][i] = ffv1_quant9_10bit[i] * 11 * 11;
-            s->quant_tables[1][0][i] = ffv1_quant9_10bit[i];
-            s->quant_tables[1][1][i] = ffv1_quant9_10bit[i] * 11;
-            s->quant_tables[1][2][i] = ffv1_quant5_10bit[i] * 11 * 11;
-            s->quant_tables[1][3][i] = ffv1_quant5_10bit[i] *  5 * 11 * 11;
-            s->quant_tables[1][4][i] = ffv1_quant5_10bit[i] *  5 *  5 * 11 * 11;
+            s->quant_tables[0][0][i]=           quant9_10bit[i];
+            s->quant_tables[0][1][i]=        11*quant9_10bit[i];
+            s->quant_tables[0][2][i]=     11*11*quant9_10bit[i];
+            s->quant_tables[1][0][i]=           quant9_10bit[i];
+            s->quant_tables[1][1][i]=        11*quant9_10bit[i];
+            s->quant_tables[1][2][i]=     11*11*quant5_10bit[i];
+            s->quant_tables[1][3][i]=   5*11*11*quant5_10bit[i];
+            s->quant_tables[1][4][i]= 5*5*11*11*quant5_10bit[i];
         }
     }
     s->context_count[0] = (11 * 11 * 11        + 1) / 2;
@@ -723,7 +878,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
         p->context_count     = s->context_count[p->quant_table_index];
     }
 
-    if ((ret = ffv1_allocate_initial_states(s)) < 0)
+    if ((ret = ff_ffv1_allocate_initial_states(s)) < 0)
         return ret;
 
 #if FF_API_CODED_FRAME
@@ -734,10 +889,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
     if (!s->transparency)
         s->plane_count = 2;
+    if (!s->chroma_planes && s->version > 3)
+        s->plane_count--;
 
-    av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_h_shift,
-                                     &s->chroma_v_shift);
-
+    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_h_shift, &s->chroma_v_shift);
     s->picture_number = 0;
 
     if (avctx->flags & (AV_CODEC_FLAG_PASS1 | AV_CODEC_FLAG_PASS2)) {
@@ -750,19 +905,22 @@ FF_ENABLE_DEPRECATION_WARNINGS
     }
     if (avctx->stats_in) {
         char *p = avctx->stats_in;
-        uint8_t best_state[256][256];
+        uint8_t (*best_state)[256] = av_malloc_array(256, 256);
         int gob_count = 0;
         char *next;
+        if (!best_state)
+            return AVERROR(ENOMEM);
 
         av_assert0(s->version >= 2);
 
-        for (;; ) {
+        for (;;) {
             for (j = 0; j < 256; j++)
                 for (i = 0; i < 2; i++) {
                     s->rc_stat[j][i] = strtol(p, &next, 0);
                     if (next == p) {
                         av_log(avctx, AV_LOG_ERROR,
                                "2Pass file invalid at %d %d [%s]\n", j, i, p);
+                        av_freep(&best_state);
                         return AVERROR_INVALIDDATA;
                     }
                     p = next;
@@ -776,6 +934,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
                                 av_log(avctx, AV_LOG_ERROR,
                                        "2Pass file invalid at %d %d %d %d [%s]\n",
                                        i, j, k, m, p);
+                                av_freep(&best_state);
                                 return AVERROR_INVALIDDATA;
                             }
                             p = next;
@@ -784,6 +943,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
             gob_count = strtol(p, &next, 0);
             if (next == p || gob_count <= 0) {
                 av_log(avctx, AV_LOG_ERROR, "2Pass file invalid\n");
+                av_freep(&best_state);
                 return AVERROR_INVALIDDATA;
             }
             p = next;
@@ -792,52 +952,70 @@ FF_ENABLE_DEPRECATION_WARNINGS
             if (p[0] == 0)
                 break;
         }
-        sort_stt(s, s->state_transition);
+        if (s->ac == AC_RANGE_CUSTOM_TAB)
+            sort_stt(s, s->state_transition);
 
         find_best_state(best_state, s->state_transition);
 
         for (i = 0; i < s->quant_table_count; i++) {
-            for (j = 0; j < s->context_count[i]; j++)
-                for (k = 0; k < 32; k++) {
+            for (k = 0; k < 32; k++) {
+                double a=0, b=0;
+                int jp = 0;
+                for (j = 0; j < s->context_count[i]; j++) {
                     double p = 128;
-                    if (s->rc_stat2[i][j][k][0] + s->rc_stat2[i][j][k][1]) {
-                        p = 256.0 * s->rc_stat2[i][j][k][1] /
-                            (s->rc_stat2[i][j][k][0] + s->rc_stat2[i][j][k][1]);
+                    if (s->rc_stat2[i][j][k][0] + s->rc_stat2[i][j][k][1] > 200 && j || a+b > 200) {
+                        if (a+b)
+                            p = 256.0 * b / (a + b);
+                        s->initial_states[i][jp][k] =
+                            best_state[av_clip(round(p), 1, 255)][av_clip_uint8((a + b) / gob_count)];
+                        for(jp++; jp<j; jp++)
+                            s->initial_states[i][jp][k] = s->initial_states[i][jp-1][k];
+                        a=b=0;
+                    }
+                    a += s->rc_stat2[i][j][k][0];
+                    b += s->rc_stat2[i][j][k][1];
+                    if (a+b) {
+                        p = 256.0 * b / (a + b);
                     }
                     s->initial_states[i][j][k] =
-                        best_state[av_clip(round(p), 1, 255)][av_clip((s->rc_stat2[i][j][k][0] +
-                                                                       s->rc_stat2[i][j][k][1]) /
-                                                                      gob_count, 0, 255)];
+                        best_state[av_clip(round(p), 1, 255)][av_clip_uint8((a + b) / gob_count)];
                 }
+            }
         }
+        av_freep(&best_state);
     }
 
     if (s->version > 1) {
-        for (s->num_v_slices = 2; s->num_v_slices < 9; s->num_v_slices++)
-            for (s->num_h_slices = s->num_v_slices;
-                 s->num_h_slices < 2 * s->num_v_slices; s->num_h_slices++)
-                if (avctx->slices == s->num_h_slices * s->num_v_slices &&
-                    avctx->slices <= 64 || !avctx->slices)
+        s->num_v_slices = (avctx->width > 352 || avctx->height > 288 || !avctx->slices) ? 2 : 1;
+        for (; s->num_v_slices < 9; s->num_v_slices++) {
+            for (s->num_h_slices = s->num_v_slices; s->num_h_slices < 2*s->num_v_slices; s->num_h_slices++) {
+                if (avctx->slices == s->num_h_slices * s->num_v_slices && avctx->slices <= 64 || !avctx->slices)
                     goto slices_ok;
+            }
+        }
         av_log(avctx, AV_LOG_ERROR,
                "Unsupported number %d of slices requested, please specify a "
                "supported number with -slices (ex:4,6,9,12,16, ...)\n",
                avctx->slices);
         return AVERROR(ENOSYS);
 slices_ok:
-        write_extradata(s);
+        if ((ret = write_extradata(s)) < 0)
+            return ret;
     }
 
-    if ((ret = ffv1_init_slice_contexts(s)) < 0)
+    if ((ret = ff_ffv1_init_slice_contexts(s)) < 0)
         return ret;
-    if ((ret = init_slices_state(s)) < 0)
+    s->slice_count = s->max_slice_count;
+    if ((ret = ff_ffv1_init_slices_state(s)) < 0)
         return ret;
 
 #define STATS_OUT_SIZE 1024 * 1024 * 6
     if (avctx->flags & AV_CODEC_FLAG_PASS1) {
         avctx->stats_out = av_mallocz(STATS_OUT_SIZE);
+        if (!avctx->stats_out)
+            return AVERROR(ENOMEM);
         for (i = 0; i < s->quant_table_count; i++)
-            for (j = 0; j < s->slice_count; j++) {
+            for (j = 0; j < s->max_slice_count; j++) {
                 FFV1Context *sf = s->slice_context[j];
                 av_assert0(!sf->rc_stat2[i]);
                 sf->rc_stat2[i] = av_mallocz(s->context_count[i] *
@@ -857,23 +1035,112 @@ static void encode_slice_header(FFV1Context *f, FFV1Context *fs)
     int j;
     memset(state, 128, sizeof(state));
 
-    put_symbol(c, state, (fs->slice_x + 1) * f->num_h_slices / f->width, 0);
-    put_symbol(c, state, (fs->slice_y + 1) * f->num_v_slices / f->height, 0);
-    put_symbol(c, state, (fs->slice_width + 1) * f->num_h_slices / f->width - 1,
-               0);
-    put_symbol(c, state,
-               (fs->slice_height + 1) * f->num_v_slices / f->height - 1,
-               0);
-    for (j = 0; j < f->plane_count; j++) {
+    put_symbol(c, state, (fs->slice_x     +1)*f->num_h_slices / f->width   , 0);
+    put_symbol(c, state, (fs->slice_y     +1)*f->num_v_slices / f->height  , 0);
+    put_symbol(c, state, (fs->slice_width +1)*f->num_h_slices / f->width -1, 0);
+    put_symbol(c, state, (fs->slice_height+1)*f->num_v_slices / f->height-1, 0);
+    for (j=0; j<f->plane_count; j++) {
         put_symbol(c, state, f->plane[j].quant_table_index, 0);
         av_assert0(f->plane[j].quant_table_index == f->context_model);
     }
-    if (!f->frame->interlaced_frame)
+    if (!f->picture.f->interlaced_frame)
         put_symbol(c, state, 3, 0);
     else
-        put_symbol(c, state, 1 + !f->frame->top_field_first, 0);
-    put_symbol(c, state, f->frame->sample_aspect_ratio.num, 0);
-    put_symbol(c, state, f->frame->sample_aspect_ratio.den, 0);
+        put_symbol(c, state, 1 + !f->picture.f->top_field_first, 0);
+    put_symbol(c, state, f->picture.f->sample_aspect_ratio.num, 0);
+    put_symbol(c, state, f->picture.f->sample_aspect_ratio.den, 0);
+    if (f->version > 3) {
+        put_rac(c, state, fs->slice_coding_mode == 1);
+        if (fs->slice_coding_mode == 1)
+            ff_ffv1_clear_slice_state(f, fs);
+        put_symbol(c, state, fs->slice_coding_mode, 0);
+        if (fs->slice_coding_mode != 1) {
+            put_symbol(c, state, fs->slice_rct_by_coef, 0);
+            put_symbol(c, state, fs->slice_rct_ry_coef, 0);
+        }
+    }
+}
+
+static void choose_rct_params(FFV1Context *fs, const uint8_t *src[3], const int stride[3], int w, int h)
+{
+#define NB_Y_COEFF 15 // number of candidate coefficient pairs scored below
+    static const int rct_y_coeff[15][2] = { // {ry_coef, by_coef}; row notes give the resulting R/G/B weights out of 4
+        {0, 0}, //      4G
+        {1, 1}, //  R + 2G + B
+        {2, 2}, // 2R      + 2B
+        {0, 2}, //      2G + 2B
+        {2, 0}, // 2R + 2G
+        {4, 0}, // 4R
+        {0, 4}, //           4B
+
+        {0, 3}, //      1G + 3B
+        {3, 0}, // 3R + 1G
+        {3, 1}, // 3R      +  B
+        {1, 3}, //  R      + 3B
+        {1, 2}, //  R +  G + 2B
+        {2, 1}, // 2R +  G +  B
+        {0, 1}, //      3G +  B
+        {1, 0}, //  R + 3G
+    };
+
+    int stat[NB_Y_COEFF] = {0}; // accumulated absolute residual per candidate pair
+    int x, y, i, p, best;
+    int16_t *sample[3];
+    int lbd = fs->bits_per_raw_sample <= 8; // <= 8 bits: pixels are read as packed 32-bit words from src[0]
+
+    for (y = 0; y < h; y++) {
+        int lastr=0, lastg=0, lastb=0; // left-neighbour values, reset at the start of every row
+        for (p = 0; p < 3; p++)
+            sample[p] = fs->sample_buffer + p*w; // three w-entry rows inside sample_buffer holding the previous row's differences
+
+        for (x = 0; x < w; x++) {
+            int b, g, r;
+            int ab, ag, ar;
+            if (lbd) {
+                unsigned v = *((const uint32_t*)(src[0] + x*4 + stride[0]*y));
+                b =  v        & 0xFF;
+                g = (v >>  8) & 0xFF;
+                r = (v >> 16) & 0xFF;
+            } else {
+                b = *((const uint16_t*)(src[0] + x*2 + stride[0]*y));
+                g = *((const uint16_t*)(src[1] + x*2 + stride[1]*y));
+                r = *((const uint16_t*)(src[2] + x*2 + stride[2]*y));
+            }
+
+            ar = r - lastr; // horizontal difference against the left neighbour
+            ag = g - lastg;
+            ab = b - lastb;
+            if (x && y) { // first row and first column are not scored; they only seed the buffers below
+                int bg = ag - sample[0][x]; // vertical difference of the horizontal differences
+                int bb = ab - sample[1][x];
+                int br = ar - sample[2][x];
+
+                br -= bg; // make red and blue relative to green
+                bb -= bg;
+
+                for (i = 0; i<NB_Y_COEFF; i++) { // score every pair: |bg + ((ry*br + by*bb) >> 2)|
+                    stat[i] += FFABS(bg + ((br*rct_y_coeff[i][0] + bb*rct_y_coeff[i][1])>>2));
+                }
+
+            }
+            sample[0][x] = ag; // remember this row's differences for the next row
+            sample[1][x] = ab;
+            sample[2][x] = ar;
+
+            lastr = r;
+            lastg = g;
+            lastb = b;
+        }
+    }
+
+    best = 0; // pick the pair with the smallest accumulated cost; the earliest entry wins ties
+    for (i=1; i<NB_Y_COEFF; i++) {
+        if (stat[i] < stat[best])
+            best = i;
+    }
+
+    fs->slice_rct_by_coef = rct_y_coeff[best][1]; // column 1 is the blue coefficient, column 0 the red one
+    fs->slice_rct_ry_coef = rct_y_coeff[best][0];
+}
 
 static int encode_slice(AVCodecContext *c, void *arg)
@@ -884,75 +1151,151 @@ static int encode_slice(AVCodecContext *c, void *arg)
     int height       = fs->slice_height;
     int x            = fs->slice_x;
     int y            = fs->slice_y;
-    const AVFrame *const p = f->frame;
-    const int ps     = (av_pix_fmt_desc_get(c->pix_fmt)->flags & AV_PIX_FMT_FLAG_PLANAR)
-                       ? (f->bits_per_raw_sample > 8) + 1
-                       : 4;
+    const AVFrame *const p = f->picture.f;
+    const int ps     = av_pix_fmt_desc_get(c->pix_fmt)->comp[0].step;
+    int ret;
+    RangeCoder c_bak = fs->c;
+    const uint8_t *planes[3] = {p->data[0] + ps*x + y*p->linesize[0],
+                                p->data[1] + ps*x + y*p->linesize[1],
+                                p->data[2] + ps*x + y*p->linesize[2]};
+
+    fs->slice_coding_mode = 0;
+    if (f->version > 3) {
+        choose_rct_params(fs, planes, p->linesize, width, height);
+    } else {
+        fs->slice_rct_by_coef = 1;
+        fs->slice_rct_ry_coef = 1;
+    }
 
+retry:
     if (f->key_frame)
-        ffv1_clear_slice_state(f, fs);
+        ff_ffv1_clear_slice_state(f, fs);
     if (f->version > 2) {
         encode_slice_header(f, fs);
     }
     if (fs->ac == AC_GOLOMB_RICE) {
         if (f->version > 2)
             put_rac(&fs->c, (uint8_t[]) { 129 }, 0);
-        fs->ac_byte_count = f->version > 2 || (!x && !y) ? ff_rac_terminate( &fs->c) : 0;
-        init_put_bits(&fs->pb, fs->c.bytestream_start + fs->ac_byte_count,
+        fs->ac_byte_count = f->version > 2 || (!x && !y) ? ff_rac_terminate(&fs->c) : 0;
+        init_put_bits(&fs->pb,
+                      fs->c.bytestream_start + fs->ac_byte_count,
                       fs->c.bytestream_end - fs->c.bytestream_start - fs->ac_byte_count);
     }
 
-    if (f->colorspace == 0) {
+    if (f->colorspace == 0 && c->pix_fmt != AV_PIX_FMT_YA8) {
         const int chroma_width  = AV_CEIL_RSHIFT(width,  f->chroma_h_shift);
         const int chroma_height = AV_CEIL_RSHIFT(height, f->chroma_v_shift);
         const int cx            = x >> f->chroma_h_shift;
         const int cy            = y >> f->chroma_v_shift;
 
-        encode_plane(fs, p->data[0] + ps * x + y * p->linesize[0],
-                     width, height, p->linesize[0], 0);
+        ret = encode_plane(fs, p->data[0] + ps*x + y*p->linesize[0], width, height, p->linesize[0], 0, 1);
 
         if (f->chroma_planes) {
-            encode_plane(fs, p->data[1] + ps * cx + cy * p->linesize[1],
-                         chroma_width, chroma_height, p->linesize[1], 1);
-            encode_plane(fs, p->data[2] + ps * cx + cy * p->linesize[2],
-                         chroma_width, chroma_height, p->linesize[2], 1);
+            ret |= encode_plane(fs, p->data[1] + ps*cx+cy*p->linesize[1], chroma_width, chroma_height, p->linesize[1], 1, 1);
+            ret |= encode_plane(fs, p->data[2] + ps*cx+cy*p->linesize[2], chroma_width, chroma_height, p->linesize[2], 1, 1);
         }
         if (fs->transparency)
-            encode_plane(fs, p->data[3] + ps * x + y * p->linesize[3], width,
-                         height, p->linesize[3], 2);
+            ret |= encode_plane(fs, p->data[3] + ps*x + y*p->linesize[3], width, height, p->linesize[3], 2, 1);
+    } else if (c->pix_fmt == AV_PIX_FMT_YA8) {
+        ret  = encode_plane(fs, p->data[0] +     ps*x + y*p->linesize[0], width, height, p->linesize[0], 0, 2);
+        ret |= encode_plane(fs, p->data[0] + 1 + ps*x + y*p->linesize[0], width, height, p->linesize[0], 1, 2);
     } else {
-        const uint8_t *planes[3] = { p->data[0] + ps * x + y * p->linesize[0],
-                                     p->data[1] + ps * x + y * p->linesize[1],
-                                     p->data[2] + ps * x + y * p->linesize[2] };
-        encode_rgb_frame(fs, planes, width, height, p->linesize);
+        ret = encode_rgb_frame(fs, planes, width, height, p->linesize);
     }
     emms_c();
 
+    if (ret < 0) {
+        av_assert0(fs->slice_coding_mode == 0);
+        if (fs->version < 4 || !fs->ac) {
+            av_log(c, AV_LOG_ERROR, "Buffer too small\n");
+            return ret;
+        }
+        av_log(c, AV_LOG_DEBUG, "Coding slice as PCM\n");
+        fs->slice_coding_mode = 1;
+        fs->c = c_bak;
+        goto retry;
+    }
+
     return 0;
 }
 
-static int ffv1_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                         const AVFrame *pict, int *got_packet)
 {
     FFV1Context *f      = avctx->priv_data;
     RangeCoder *const c = &f->slice_context[0]->c;
+    AVFrame *const p    = f->picture.f;
     int used_count      = 0;
     uint8_t keystate    = 128;
     uint8_t *buf_p;
     int i, ret;
+    int64_t maxsize =   AV_INPUT_BUFFER_MIN_SIZE
+                      + avctx->width*avctx->height*35LL*4;
+
+    if(!pict) {
+        if (avctx->flags & AV_CODEC_FLAG_PASS1) {
+            int j, k, m;
+            char *p   = avctx->stats_out;
+            char *end = p + STATS_OUT_SIZE;
+
+            memset(f->rc_stat, 0, sizeof(f->rc_stat));
+            for (i = 0; i < f->quant_table_count; i++)
+                memset(f->rc_stat2[i], 0, f->context_count[i] * sizeof(*f->rc_stat2[i]));
+
+            av_assert0(f->slice_count == f->max_slice_count);
+            for (j = 0; j < f->slice_count; j++) {
+                FFV1Context *fs = f->slice_context[j];
+                for (i = 0; i < 256; i++) {
+                    f->rc_stat[i][0] += fs->rc_stat[i][0];
+                    f->rc_stat[i][1] += fs->rc_stat[i][1];
+                }
+                for (i = 0; i < f->quant_table_count; i++) {
+                    for (k = 0; k < f->context_count[i]; k++)
+                        for (m = 0; m < 32; m++) {
+                            f->rc_stat2[i][k][m][0] += fs->rc_stat2[i][k][m][0];
+                            f->rc_stat2[i][k][m][1] += fs->rc_stat2[i][k][m][1];
+                        }
+                }
+            }
 
-    f->frame = pict;
+            for (j = 0; j < 256; j++) {
+                snprintf(p, end - p, "%" PRIu64 " %" PRIu64 " ",
+                        f->rc_stat[j][0], f->rc_stat[j][1]);
+                p += strlen(p);
+            }
+            snprintf(p, end - p, "\n");
 
-    if ((ret = ff_alloc_packet(pkt, avctx->width * avctx->height *
-                             ((8 * 2 + 1 + 1) * 4) / 8 +
-                             AV_INPUT_BUFFER_MIN_SIZE)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
-        return ret;
+            for (i = 0; i < f->quant_table_count; i++) {
+                for (j = 0; j < f->context_count[i]; j++)
+                    for (m = 0; m < 32; m++) {
+                        snprintf(p, end - p, "%" PRIu64 " %" PRIu64 " ",
+                                f->rc_stat2[i][j][m][0], f->rc_stat2[i][j][m][1]);
+                        p += strlen(p);
+                    }
+            }
+            snprintf(p, end - p, "%d\n", f->gob_count);
+        }
+        return 0;
     }
 
+    if (f->version > 3)
+        maxsize = AV_INPUT_BUFFER_MIN_SIZE + avctx->width*avctx->height*3LL*4;
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, maxsize, 0)) < 0)
+        return ret;
+
     ff_init_range_encoder(c, pkt->data, pkt->size);
     ff_build_rac_states(c, 0.05 * (1LL << 32), 256 - 8);
 
+    av_frame_unref(p);
+    if ((ret = av_frame_ref(p, pict)) < 0)
+        return ret;
+#if FF_API_CODED_FRAME
+FF_DISABLE_DEPRECATION_WARNINGS
+    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+
     if (avctx->gop_size == 0 || f->picture_number % avctx->gop_size == 0) {
         put_rac(c, &keystate, 1);
         f->key_frame = 1;
@@ -973,9 +1316,8 @@ static int ffv1_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 
     for (i = 1; i < f->slice_count; i++) {
         FFV1Context *fs = f->slice_context[i];
-        uint8_t *start  = pkt->data +
-                          (pkt->size - used_count) * (int64_t)i / f->slice_count;
-        int len = pkt->size / f->slice_count;
+        uint8_t *start  = pkt->data + (pkt->size - used_count) * (int64_t)i / f->slice_count;
+        int len         = pkt->size / f->slice_count;
         ff_init_range_encoder(&fs->c, start, len);
     }
     avctx->execute(avctx, encode_slice, &f->slice_context[0], NULL,
@@ -1011,47 +1353,7 @@ static int ffv1_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         buf_p += bytes;
     }
 
-    if ((avctx->flags & AV_CODEC_FLAG_PASS1) && (f->picture_number & 31) == 0) {
-        int j, k, m;
-        char *p   = avctx->stats_out;
-        char *end = p + STATS_OUT_SIZE;
-
-        memset(f->rc_stat, 0, sizeof(f->rc_stat));
-        for (i = 0; i < f->quant_table_count; i++)
-            memset(f->rc_stat2[i], 0, f->context_count[i] * sizeof(*f->rc_stat2[i]));
-
-        for (j = 0; j < f->slice_count; j++) {
-            FFV1Context *fs = f->slice_context[j];
-            for (i = 0; i < 256; i++) {
-                f->rc_stat[i][0] += fs->rc_stat[i][0];
-                f->rc_stat[i][1] += fs->rc_stat[i][1];
-            }
-            for (i = 0; i < f->quant_table_count; i++) {
-                for (k = 0; k < f->context_count[i]; k++)
-                    for (m = 0; m < 32; m++) {
-                        f->rc_stat2[i][k][m][0] += fs->rc_stat2[i][k][m][0];
-                        f->rc_stat2[i][k][m][1] += fs->rc_stat2[i][k][m][1];
-                    }
-            }
-        }
-
-        for (j = 0; j < 256; j++) {
-            snprintf(p, end - p, "%" PRIu64 " %" PRIu64 " ",
-                     f->rc_stat[j][0], f->rc_stat[j][1]);
-            p += strlen(p);
-        }
-        snprintf(p, end - p, "\n");
-
-        for (i = 0; i < f->quant_table_count; i++) {
-            for (j = 0; j < f->context_count[i]; j++)
-                for (m = 0; m < 32; m++) {
-                    snprintf(p, end - p, "%" PRIu64 " %" PRIu64 " ",
-                             f->rc_stat2[i][j][m][0], f->rc_stat2[i][j][m][1]);
-                    p += strlen(p);
-                }
-        }
-        snprintf(p, end - p, "%d\n", f->gob_count);
-    } else if (avctx->flags & AV_CODEC_FLAG_PASS1)
+    if (avctx->flags & AV_CODEC_FLAG_PASS1)
         avctx->stats_out[0] = '\0';
 
 #if FF_API_CODED_FRAME
@@ -1062,38 +1364,41 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
     f->picture_number++;
     pkt->size   = buf_p - pkt->data;
+    pkt->pts    =
+    pkt->dts    = pict->pts;
     pkt->flags |= AV_PKT_FLAG_KEY * f->key_frame;
     *got_packet = 1;
 
     return 0;
 }
 
-static av_cold int ffv1_encode_close(AVCodecContext *avctx)
+static av_cold int encode_close(AVCodecContext *avctx)
 {
-    ffv1_close(avctx);
+    ff_ffv1_close(avctx);
     return 0;
 }
 
 #define OFFSET(x) offsetof(FFV1Context, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
-    { "slicecrc", "Protect slices with CRCs", OFFSET(ec), AV_OPT_TYPE_INT,
-             { .i64 = -1 }, -1, 1, VE },
+    { "slicecrc", "Protect slices with CRCs", OFFSET(ec), AV_OPT_TYPE_BOOL, { .i64 = -1 }, -1, 1, VE },
     { "coder", "Coder type", OFFSET(ac), AV_OPT_TYPE_INT,
-            { .i64 = AC_GOLOMB_RICE }, 0, 2, VE, "coder" },
+            { .i64 = 0 }, -2, 2, VE, "coder" },
         { "rice", "Golomb rice", 0, AV_OPT_TYPE_CONST,
             { .i64 = AC_GOLOMB_RICE }, INT_MIN, INT_MAX, VE, "coder" },
         { "range_def", "Range with default table", 0, AV_OPT_TYPE_CONST,
-            { .i64 = AC_RANGE_DEFAULT_TAB }, INT_MIN, INT_MAX, VE, "coder" },
+            { .i64 = AC_RANGE_DEFAULT_TAB_FORCE }, INT_MIN, INT_MAX, VE, "coder" },
         { "range_tab", "Range with custom table", 0, AV_OPT_TYPE_CONST,
             { .i64 = AC_RANGE_CUSTOM_TAB }, INT_MIN, INT_MAX, VE, "coder" },
+        { "ac", "Range with custom table (the ac option exists for compatibility and is deprecated)", 0, AV_OPT_TYPE_CONST,
+            { .i64 = 1 }, INT_MIN, INT_MAX, VE, "coder" },
     { "context", "Context model", OFFSET(context_model), AV_OPT_TYPE_INT,
             { .i64 = 0 }, 0, 1, VE },
 
     { NULL }
 };
 
-static const AVClass class = {
+static const AVClass ffv1_class = {
     .class_name = "ffv1 encoder",
     .item_name  = av_default_item_name,
     .option     = options,
@@ -1113,25 +1418,27 @@ AVCodec ff_ffv1_encoder = {
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_FFV1,
     .priv_data_size = sizeof(FFV1Context),
-    .init           = ffv1_encode_init,
-    .encode2        = ffv1_encode_frame,
-    .close          = ffv1_encode_close,
-    .capabilities   = AV_CODEC_CAP_SLICE_THREADS,
+    .init           = encode_init,
+    .encode2        = encode_frame,
+    .close          = encode_close,
+    .capabilities   = AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_DELAY,
     .pix_fmts       = (const enum AVPixelFormat[]) {
-        AV_PIX_FMT_YUV420P,   AV_PIX_FMT_YUV422P,   AV_PIX_FMT_YUV444P,
-        AV_PIX_FMT_YUV411P,   AV_PIX_FMT_YUV410P,
-        AV_PIX_FMT_YUV444P9,  AV_PIX_FMT_YUV422P9,  AV_PIX_FMT_YUV420P9,
-        AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV444P10,
-        AV_PIX_FMT_YUV420P16, AV_PIX_FMT_YUV422P16, AV_PIX_FMT_YUV444P16,
-        AV_PIX_FMT_RGB32,
-        AV_PIX_FMT_GBRP9,     AV_PIX_FMT_GBRP10,
-        AV_PIX_FMT_YUVA420P,  AV_PIX_FMT_YUVA422P,  AV_PIX_FMT_YUVA444P,
-        AV_PIX_FMT_GRAY16,    AV_PIX_FMT_GRAY8,
+        AV_PIX_FMT_YUV420P,   AV_PIX_FMT_YUVA420P,  AV_PIX_FMT_YUVA422P,  AV_PIX_FMT_YUV444P,
+        AV_PIX_FMT_YUVA444P,  AV_PIX_FMT_YUV440P,   AV_PIX_FMT_YUV422P,   AV_PIX_FMT_YUV411P,
+        AV_PIX_FMT_YUV410P,   AV_PIX_FMT_0RGB32,    AV_PIX_FMT_RGB32,     AV_PIX_FMT_YUV420P16,
+        AV_PIX_FMT_YUV422P16, AV_PIX_FMT_YUV444P16, AV_PIX_FMT_YUV444P9,  AV_PIX_FMT_YUV422P9,
+        AV_PIX_FMT_YUV420P9,  AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV444P10,
+        AV_PIX_FMT_YUVA444P16, AV_PIX_FMT_YUVA422P16, AV_PIX_FMT_YUVA420P16,
+        AV_PIX_FMT_YUVA444P10, AV_PIX_FMT_YUVA422P10, AV_PIX_FMT_YUVA420P10,
+        AV_PIX_FMT_YUVA444P9, AV_PIX_FMT_YUVA422P9, AV_PIX_FMT_YUVA420P9,
+        AV_PIX_FMT_GRAY16,    AV_PIX_FMT_GRAY8,     AV_PIX_FMT_GBRP9,     AV_PIX_FMT_GBRP10,
+        AV_PIX_FMT_GBRP12,    AV_PIX_FMT_GBRP14,
+        AV_PIX_FMT_YA8,
         AV_PIX_FMT_NONE
 
     },
 #if FF_API_CODER_TYPE
     .defaults       = ffv1_defaults,
 #endif
-    .priv_class     = &class,
+    .priv_class     = &ffv1_class,
 };
diff --git a/libavcodec/ffwavesynth.c b/libavcodec/ffwavesynth.c
new file mode 100644
index 0000000..9d055e4
--- /dev/null
+++ b/libavcodec/ffwavesynth.c
@@ -0,0 +1,481 @@
+/*
+ * Wavesynth pseudo-codec
+ * Copyright (c) 2011 Nicolas George
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intreadwrite.h"
+#include "libavutil/log.h"
+#include "avcodec.h"
+#include "internal.h"
+
+
+#define SIN_BITS 14 /* log2 of the number of entries in the sine lookup table */
+#define WS_MAX_CHANNELS 32 /* largest accepted avc->channels; also the size of the per-sample mix buffer */
+#define INF_TS 0x7FFFFFFFFFFFFFFF /* stored in next_ts when no further interval is pending */
+
+#define PINK_UNIT 128 /* number of samples in the precomputed pink noise pool */
+
+/*
+   Format of the extradata and packets
+
+   THIS INFORMATION IS NOT PART OF THE PUBLIC API OR ABI.
+   IT CAN CHANGE WITHOUT NOTIFICATION.
+
+   All numbers are in little endian.
+
+   The codec extradata define a set of intervals with uniform content.
+   Overlapping intervals are added together.
+
+   extradata:
+       uint32      number of intervals
+       ...         intervals
+
+   interval:
+       int64       start timestamp; time_base must be 1/sample_rate;
+                   start timestamps must be in ascending order
+       int64       end timestamp
+       uint32      type
+       uint32      channels mask
+       ...         additional information, depends on type
+
+   sine interval (type fourcc "SINE"):
+       int32       start frequency, in 1/(1<<16) Hz
+       int32       end frequency
+       int32       start amplitude, 1<<16 is the full amplitude
+       int32       end amplitude
+       uint32      start phase, 0 is sin(0), 0x20000000 is sin(pi/2), etc.;
+                   n | (1<<31) means to match the phase of previous channel #n
+
+   pink noise interval (type fourcc "NOIS"):
+       int32       start amplitude
+       int32       end amplitude
+
+   The input packets encode the time and duration of the requested segment.
+
+   packet:
+       int64       start timestamp
+       int32       duration
+
+*/
+
+enum ws_interval_type { /* values are the fourcc read from the "type" field of an interval */
+    WS_SINE  = MKTAG('S','I','N','E'), /* sine wave with linearly varying frequency and amplitude */
+    WS_NOISE = MKTAG('N','O','I','S'), /* pink noise with linearly varying amplitude */
+};
+
+struct ws_interval { /* one interval parsed from the extradata, plus its running synthesis state */
+    int64_t ts_start, ts_end; /* the interval is active for ts_start <= ts < ts_end */
+    uint64_t phi0, dphi0, ddphi; /* sine only: phase at ts_start, initial per-sample phase step, per-sample change of that step */
+    uint64_t amp0, damp; /* amplitude at ts_start (a1 << 32) and its per-sample increment */
+    uint64_t phi, dphi, amp; /* running phase, phase step and amplitude, advanced while the interval is being synthesized */
+    uint32_t channels; /* channel mask: bit n set means the interval is mixed into channel n */
+    enum ws_interval_type type;
+    int next; /* index of the next active interval in the list headed by cur_inter, -1 at the end */
+};
+
+struct wavesynth_context { /* private decoder state */
+    int64_t cur_ts; /* timestamp of the next sample to synthesize */
+    int64_t next_ts; /* ts_start of inter[next_inter], or INF_TS when there is none left */
+    int32_t *sin; /* table of 1 << SIN_BITS sine values scaled by 32767 */
+    struct ws_interval *inter; /* nb_inter intervals, in extradata order */
+    uint32_t dither_state; /* LCG state of the dither generator */
+    uint32_t pink_state; /* LCG state of the pink noise generator */
+    int32_t pink_pool[PINK_UNIT]; /* pink noise samples produced by pink_fill() */
+    unsigned pink_need, pink_pos; /* number of WS_NOISE intervals; read position in pink_pool */
+    int nb_inter; /* number of entries in inter */
+    int cur_inter; /* index of the first active interval, -1 if none */
+    int next_inter; /* index of the first interval whose start has not been reached yet */
+};
+
+#define LCG_A 1284865837 /* multiplier of the 32-bit LCG used by lcg_next() */
+#define LCG_C 4150755663 /* additive constant of the same LCG */
+#define LCG_AI 849225893 /* A*AI = 1 [mod 1<<32] */
+
+static uint32_t lcg_next(uint32_t *s) /* advance the generator state *s by one step and return the new state */
+{
+    *s = *s * LCG_A + LCG_C; /* the result is reduced modulo 1<<32 when stored back into the uint32_t */
+    return *s;
+}
+
+static void lcg_seek(uint32_t *s, int64_t dt) /* move the LCG state *s by dt steps of lcg_next(); dt < 0 steps backward */
+{
+    uint32_t a, c, t = *s;
+    uint64_t n = dt; /* step count; negated in unsigned arithmetic below so that INT64_MIN is handled */
+    if (dt >= 0) {
+        a = LCG_A;
+        c = LCG_C;
+    } else { /* coefficients for a step backward: y = A*x + C  <=>  x = AI*y - AI*C */
+        a = LCG_AI;
+        c = -(uint32_t)(LCG_AI * LCG_C); /* the constant term of the inverse map is negative */
+        n = -n;
+    }
+    while (n) { /* square-and-multiply over the affine map t -> a*t + c */
+        if (n & 1)
+            t = a * t + c;
+        c *= a + 1; /* coefficients for a double step */
+        a *= a;
+        n >>= 1;
+    }
+    *s = t;
+}
+
+/* Emulate pink noise by summing white noise at the sampling frequency,
+ * white noise at half the sampling frequency (each value taken twice),
+ * etc., with a total of 8 octaves.
+ * This is known as the Voss-McCartney algorithm. */
+
+static void pink_fill(struct wavesynth_context *ws) /* refill ws->pink_pool with PINK_UNIT samples and rewind pink_pos to 0 */
+{
+    int32_t vt[7] = { 0 }, v = 0; /* vt[j]: current value of slow octave j; v: running sum of all vt[] */
+    int i, j;
+
+    ws->pink_pos = 0;
+    if (!ws->pink_need) /* no WS_NOISE interval: leave the pool contents and the LCG state untouched */
+        return;
+    for (i = 0; i < PINK_UNIT; i++) {
+        for (j = 0; j < 7; j++) {
+            if ((i >> j) & 1) /* octave j is redrawn only while bits 0..j of i are all zero */
+                break;
+            v -= vt[j];
+            vt[j] = (int32_t)lcg_next(&ws->pink_state) >> 3; /* >> 3 keeps the sum of the 8 terms within int32 */
+            v += vt[j];
+        }
+        ws->pink_pool[i] = v + ((int32_t)lcg_next(&ws->pink_state) >> 3); /* add the full-rate white noise term */
+    }
+    lcg_next(&ws->pink_state); /* so we use exactly 256 steps */
+}
+
+/**
+ * @return  (1<<64) * a / b, without overflow, if a < b
+ */
+static uint64_t frac64(uint64_t a, uint64_t b) /* long division in 64-bit arithmetic; b == 0 is not handled and divides by zero */
+{
+    uint64_t r = 0;
+    int i;
+
+    if (b < (uint64_t)1 << 32) { /* b small, use two 32-bits steps */
+        a <<= 32;
+        return ((a / b) << 32) | ((a % b) << 32) / b;
+    }
+    if (b < (uint64_t)1 << 48) { /* b medium, use four 16-bits steps */
+        for (i = 0; i < 4; i++) {
+            a <<= 16;
+            r = (r << 16) | (a / b);
+            a %= b;
+        }
+        return r;
+    }
+    for (i = 63; i >= 0; i--) { /* b large: produce one quotient bit per iteration */
+        if (a >= (uint64_t)1 << 63 || a << 1 >= b) { /* true when 2*a >= b, including when 2*a does not fit in 64 bits */
+            r |= (uint64_t)1 << i;
+            a = (a << 1) - b; /* computed modulo 1<<64, which still yields the remainder 2*a - b */
+        } else {
+            a <<= 1;
+        }
+    }
+    return r;
+}
+
+static uint64_t phi_at(struct ws_interval *in, int64_t ts) /* phase of interval in at timestamp ts, derived from phi0, dphi0 and ddphi */
+{
+    uint64_t dt = ts - in->ts_start; /* number of samples elapsed since the interval started */
+    uint64_t dt2 = dt & 1 ? /* dt * (dt - 1) / 2 without overflow */
+                   dt * ((dt - 1) >> 1) : (dt >> 1) * (dt - 1);
+    return in->phi0 + dt * in->dphi0 + dt2 * in->ddphi; /* phi0 plus the dt steps dphi0 + k*ddphi for k = 0..dt-1, modulo 1<<64 */
+}
+
+static void wavesynth_seek(struct wavesynth_context *ws, int64_t ts) /* reposition all synthesis state so that the next sample produced is ts */
+{
+    int *last, i;
+    struct ws_interval *in;
+
+    last = &ws->cur_inter; /* link that will receive the index of the next active interval found */
+    for (i = 0; i < ws->nb_inter; i++) {
+        in = &ws->inter[i];
+        if (ts < in->ts_start) /* start times are ascending, so no later interval has started either */
+            break;
+        if (ts >= in->ts_end) /* already finished at ts */
+            continue;
+        *last = i;
+        last = &in->next;
+        in->phi  = phi_at(in, ts);
+        in->dphi = in->dphi0 + (ts - in->ts_start) * in->ddphi;
+        in->amp  = in->amp0  + (ts - in->ts_start) * in->damp;
+    }
+    ws->next_inter = i;
+    ws->next_ts = i < ws->nb_inter ? ws->inter[i].ts_start : INF_TS;
+    *last = -1; /* terminate the active list */
+    lcg_seek(&ws->dither_state, ts - ws->cur_ts); /* the dither generator advances one step per sample */
+    if (ws->pink_need) {
+        int64_t pink_ts_cur  = (ws->cur_ts + PINK_UNIT - 1) & ~(PINK_UNIT - 1); /* cur_ts rounded up to a multiple of PINK_UNIT */
+        int64_t pink_ts_next = ts & ~(PINK_UNIT - 1); /* ts rounded down to a multiple of PINK_UNIT */
+        int pos = ts & (PINK_UNIT - 1); /* offset of ts inside its pool */
+        lcg_seek(&ws->pink_state, (pink_ts_next - pink_ts_cur) << 1); /* pink_fill() uses 2 LCG steps per sample (256 per pool); NOTE(review): left shift of a negative difference when seeking backward */
+        if (pos) { /* ts falls inside a pool: regenerate that pool and skip to the offset */
+            pink_fill(ws);
+            ws->pink_pos = pos;
+        } else { /* ts is on a pool boundary: wavesynth_synth_sample() refills on the next sample */
+            ws->pink_pos = PINK_UNIT;
+        }
+    }
+    ws->cur_ts = ts;
+}
+
+static int wavesynth_parse_extradata(AVCodecContext *avc) /* fill ws->inter from avc->extradata; returns 0 or a negative AVERROR code */
+{
+    struct wavesynth_context *ws = avc->priv_data;
+    struct ws_interval *in;
+    uint8_t *edata, *edata_end;
+    int32_t f1, f2, a1, a2;
+    uint32_t phi;
+    int64_t dphi1, dphi2, dt, cur_ts = -0x8000000000000000; /* cur_ts: start of the previous interval, used for the ordering check */
+    int i;
+
+    if (avc->extradata_size < 4) /* need at least the uint32 interval count */
+        return AVERROR(EINVAL);
+    edata = avc->extradata;
+    edata_end = edata + avc->extradata_size;
+    ws->nb_inter = AV_RL32(edata);
+    edata += 4;
+    if (ws->nb_inter < 0) /* the count is read as uint32 but stored in an int */
+        return AVERROR(EINVAL);
+    ws->inter = av_calloc(ws->nb_inter, sizeof(*ws->inter)); /* NOTE(review): size comes from the input before the extradata length is checked against it */
+    if (!ws->inter)
+        return AVERROR(ENOMEM);
+    for (i = 0; i < ws->nb_inter; i++) { /* error returns below leave ws->inter allocated; wavesynth_init() frees it on failure */
+        in = &ws->inter[i];
+        if (edata_end - edata < 24) /* common header: two int64 timestamps, uint32 type, uint32 channel mask */
+            return AVERROR(EINVAL);
+        in->ts_start = AV_RL64(edata +  0);
+        in->ts_end   = AV_RL64(edata +  8);
+        in->type     = AV_RL32(edata + 16);
+        in->channels = AV_RL32(edata + 20);
+        edata += 24;
+        if (in->ts_start < cur_ts || in->ts_end <= in->ts_start) /* starts must be ascending and the interval non-empty */
+            return AVERROR(EINVAL);
+        cur_ts = in->ts_start;
+        dt = in->ts_end - in->ts_start; /* > 0 after the check above; NOTE(review): the subtraction can overflow int64 for extreme timestamps */
+        switch (in->type) {
+            case WS_SINE:
+                if (edata_end - edata < 20) /* five 32-bit fields */
+                    return AVERROR(EINVAL);
+                f1  = AV_RL32(edata +  0);
+                f2  = AV_RL32(edata +  4);
+                a1  = AV_RL32(edata +  8);
+                a2  = AV_RL32(edata + 12);
+                phi = AV_RL32(edata + 16);
+                edata += 20;
+                dphi1 = frac64(f1, (int64_t)avc->sample_rate << 16); /* NOTE(review): frac64() is documented for a < b only; a negative f1/f2 converts to a huge uint64_t */
+                dphi2 = frac64(f2, (int64_t)avc->sample_rate << 16);
+                in->dphi0 = dphi1;
+                in->ddphi = (dphi2 - dphi1) / dt; /* phase step changes linearly from dphi1 to dphi2 over the interval */
+                if (phi & 0x80000000) { /* top bit set: take the phase of an earlier interval at this start time */
+                    phi &= ~0x80000000;
+                    if (phi >= i) /* only intervals parsed before this one may be referenced */
+                        return AVERROR(EINVAL);
+                    in->phi0 = phi_at(&ws->inter[phi], in->ts_start);
+                } else {
+                    in->phi0 = (uint64_t)phi << 33; /* scale the 31-bit phase to the 64-bit phase range */
+                }
+                break;
+            case WS_NOISE:
+                if (edata_end - edata < 8) /* two 32-bit amplitudes */
+                    return AVERROR(EINVAL);
+                a1  = AV_RL32(edata +  0);
+                a2  = AV_RL32(edata +  4);
+                edata += 8;
+                break;
+            default:
+                return AVERROR(EINVAL);
+        }
+        in->amp0 = (int64_t)a1 << 32;
+        in->damp = (((int64_t)a2 << 32) - ((int64_t)a1 << 32)) / dt; /* amplitude changes linearly from a1 to a2 over the interval */
+    }
+    if (edata != edata_end) /* trailing bytes after the last interval are rejected */
+        return AVERROR(EINVAL);
+    return 0;
+}
+
+static av_cold int wavesynth_init(AVCodecContext *avc) /* parse the extradata, build the sine table and seed the generators */
+{
+    struct wavesynth_context *ws = avc->priv_data;
+    int i, r;
+
+    if (avc->channels > WS_MAX_CHANNELS) {
+        av_log(avc, AV_LOG_ERROR,
+               "This implementation is limited to %d channels.\n",
+               WS_MAX_CHANNELS);
+        return AVERROR(EINVAL);
+    }
+    r = wavesynth_parse_extradata(avc);
+    if (r < 0) { /* NOTE(review): this message is also logged when the parser failed with ENOMEM */
+        av_log(avc, AV_LOG_ERROR, "Invalid intervals definitions.\n");
+        goto fail;
+    }
+    ws->sin = av_malloc(sizeof(*ws->sin) << SIN_BITS); /* room for 1 << SIN_BITS entries */
+    if (!ws->sin) {
+        r = AVERROR(ENOMEM);
+        goto fail;
+    }
+    for (i = 0; i < 1 << SIN_BITS; i++) /* one full sine period, scaled to +-32767 */
+        ws->sin[i] = floor(32767 * sin(2 * M_PI * i / (1 << SIN_BITS)));
+    ws->dither_state = MKTAG('D','I','T','H'); /* fixed seed */
+    for (i = 0; i < ws->nb_inter; i++) /* count the noise intervals */
+        ws->pink_need += ws->inter[i].type == WS_NOISE;
+    ws->pink_state = MKTAG('P','I','N','K'); /* fixed seed */
+    ws->pink_pos = PINK_UNIT; /* pool marked as exhausted */
+    wavesynth_seek(ws, 0); /* presumably cur_ts is still 0 here, making the LCG seeks no-ops — TODO confirm priv_data is zero-initialized */
+    avc->sample_fmt = AV_SAMPLE_FMT_S16;
+    return 0;
+
+fail: /* release whatever was allocated, including ws->inter left behind by the parser */
+    av_freep(&ws->inter);
+    av_freep(&ws->sin);
+    return r;
+}
+
+static void wavesynth_synth_sample(struct wavesynth_context *ws, int64_t ts,
+                                   int32_t *channels) /* add the sample at ts of every active interval, plus dither, into channels[] */
+{
+    int32_t amp, val, *cv;
+    struct ws_interval *in;
+    int i, *last, pink;
+    uint32_t c, all_ch = 0; /* all_ch: union of the channel masks of the intervals mixed for this sample */
+
+    i = ws->cur_inter;
+    last = &ws->cur_inter; /* link pointing at the interval being visited, used to unlink it */
+    if (ws->pink_pos == PINK_UNIT) /* pool exhausted */
+        pink_fill(ws);
+    pink = ws->pink_pool[ws->pink_pos++] >> 16; /* one pool entry is consumed per sample, whether or not a noise interval is active */
+    while (i >= 0) {
+        in = &ws->inter[i];
+        i = in->next;
+        if (ts >= in->ts_end) { /* interval has ended: drop it from the active list */
+            *last = i;
+            continue;
+        }
+        last = &in->next;
+        amp = in->amp >> 32; /* amplitude as stored by the parser before its << 32 */
+        in->amp  += in->damp;
+        switch (in->type) {
+            case WS_SINE:
+                val = amp * ws->sin[in->phi >> (64 - SIN_BITS)]; /* top SIN_BITS bits of the phase index the table */
+                in->phi  += in->dphi;
+                in->dphi += in->ddphi;
+                break;
+            case WS_NOISE:
+                val = amp * pink;
+                break;
+            default:
+                val = 0;
+        }
+        all_ch |= in->channels;
+        for (c = in->channels, cv = channels; c; c >>= 1, cv++) /* bit n of the mask selects channels[n] */
+            if (c & 1)
+                *cv += val;
+    }
+    val = (int32_t)lcg_next(&ws->dither_state) >> 16; /* one dither value per sample, shared by all channels */
+    for (c = all_ch, cv = channels; c; c >>= 1, cv++) /* dither goes only to channels that received a signal */
+        if (c & 1)
+            *cv += val;
+}
+
+static void wavesynth_enter_intervals(struct wavesynth_context *ws, int64_t ts) /* append the intervals that have started by ts to the active list */
+{
+    int *last, i;
+    struct ws_interval *in;
+
+    last = &ws->cur_inter;
+    for (i = ws->cur_inter; i >= 0; i = ws->inter[i].next) /* walk to the terminating link of the active list */
+        last = &ws->inter[i].next;
+    for (i = ws->next_inter; i < ws->nb_inter; i++) {
+        in = &ws->inter[i];
+        if (ts < in->ts_start) /* not started yet; later intervals start no earlier */
+            break;
+        if (ts >= in->ts_end) /* already over: skip without linking */
+            continue;
+        *last = i;
+        last = &in->next;
+        in->phi = in->phi0; /* running state restarts from the values at ts_start */
+        in->dphi = in->dphi0;
+        in->amp = in->amp0;
+    }
+    ws->next_inter = i;
+    ws->next_ts = i < ws->nb_inter ? ws->inter[i].ts_start : INF_TS;
+    *last = -1; /* terminate the active list */
+}
+
+static int wavesynth_decode(AVCodecContext *avc, void *rframe, int *rgot_frame,
+                            AVPacket *packet) /* synthesize the segment requested by packet into rframe as interleaved int16 samples */
+{
+    struct wavesynth_context *ws = avc->priv_data;
+    AVFrame *frame = rframe;
+    int64_t ts;
+    int duration;
+    int s, c, r;
+    int16_t *pcm;
+    int32_t channels[WS_MAX_CHANNELS]; /* per-sample mix buffer, one entry per channel */
+
+    *rgot_frame = 0;
+    if (packet->size != 12) /* int64 start timestamp followed by int32 duration */
+        return AVERROR_INVALIDDATA;
+    ts = AV_RL64(packet->data);
+    if (ts != ws->cur_ts) /* non-contiguous request: reposition the synthesis state first */
+        wavesynth_seek(ws, ts);
+    duration = AV_RL32(packet->data + 8);
+    if (duration <= 0) /* checked after the seek above has already been performed */
+        return AVERROR(EINVAL);
+    frame->nb_samples = duration;
+    r = ff_get_buffer(avc, frame, 0);
+    if (r < 0)
+        return r;
+    pcm = (int16_t *)frame->data[0];
+    for (s = 0; s < duration; s++, ts++) {
+        memset(channels, 0, avc->channels * sizeof(*channels)); /* NOTE(review): only the first avc->channels entries are cleared; mask bits at or above that index accumulate into entries that are never cleared or output */
+        if (ts >= ws->next_ts) /* one or more pending intervals start now */
+            wavesynth_enter_intervals(ws, ts);
+        wavesynth_synth_sample(ws, ts, channels);
+        for (c = 0; c < avc->channels; c++)
+            *(pcm++) = channels[c] >> 16; /* keep the high 16 bits of the 32-bit mix */
+    }
+    ws->cur_ts += duration; /* NOTE(review): signed addition, can overflow for timestamps near INT64_MAX */
+    *rgot_frame = 1;
+    return packet->size;
+}
+
+static av_cold int wavesynth_close(AVCodecContext *avc) /* free the sine table and the interval array; always returns 0 */
+{
+    struct wavesynth_context *ws = avc->priv_data;
+
+    av_freep(&ws->sin);
+    av_freep(&ws->inter);
+    return 0;
+}
+
+AVCodec ff_ffwavesynth_decoder = { /* codec descriptor wiring the wavesynth callbacks defined in this file */
+    .name           = "wavesynth",
+    .long_name      = NULL_IF_CONFIG_SMALL("Wave synthesis pseudo-codec"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_FFWAVESYNTH,
+    .priv_data_size = sizeof(struct wavesynth_context),
+    .init           = wavesynth_init,
+    .close          = wavesynth_close,
+    .decode         = wavesynth_decode,
+    .capabilities   = AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/fic.c b/libavcodec/fic.c
index b1286eb..d3952a4 100644
--- a/libavcodec/fic.c
+++ b/libavcodec/fic.c
@@ -4,24 +4,25 @@
  * Copyright (c) 2014 Konstantin Shishkov
  * Copyright (c) 2014 Derek Buitenhuis
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/common.h"
+#include "libavutil/opt.h"
 #include "avcodec.h"
 #include "internal.h"
 #include "get_bits.h"
@@ -36,6 +37,7 @@ typedef struct FICThreadContext {
 } FICThreadContext;
 
 typedef struct FICContext {
+    AVClass *class;
     AVCodecContext *avctx;
     AVFrame *frame;
     AVFrame *final_frame;
@@ -51,6 +53,7 @@ typedef struct FICContext {
     int num_slices, slice_h;
 
     uint8_t cursor_buf[4096];
+    int skip_cursor;
 } FICContext;
 
 static const uint8_t fic_qmat_hq[64] = {
@@ -79,7 +82,7 @@ static const uint8_t fic_header[7] = { 0, 0, 1, 'F', 'I', 'C', 'V' };
 
 #define FIC_HEADER_SIZE 27
 
-static av_always_inline void fic_idct(int16_t *blk, int step, int shift)
+static av_always_inline void fic_idct(int16_t *blk, int step, int shift, int rnd)
 {
     const int t0 =  27246 * blk[3 * step] + 18405 * blk[5 * step];
     const int t1 =  27246 * blk[5 * step] - 18405 * blk[3 * step];
@@ -91,8 +94,8 @@ static av_always_inline void fic_idct(int16_t *blk, int step, int shift)
     const int t7 = t3 - t1;
     const int t8 =  17734 * blk[2 * step] - 42813 * blk[6 * step];
     const int t9 =  17734 * blk[6 * step] + 42814 * blk[2 * step];
-    const int tA = (blk[0 * step] - blk[4 * step] << 15) + (1 << shift - 1);
-    const int tB = (blk[0 * step] + blk[4 * step] << 15) + (1 << shift - 1);
+    const int tA = (blk[0 * step] - blk[4 * step] << 15) + rnd;
+    const int tB = (blk[0 * step] + blk[4 * step] << 15) + rnd;
     blk[0 * step] = (  t4       + t9 + tB) >> shift;
     blk[1 * step] = (  t6 + t7  + t8 + tA) >> shift;
     blk[2 * step] = (  t6 - t7  - t8 + tA) >> shift;
@@ -109,14 +112,15 @@ static void fic_idct_put(uint8_t *dst, int stride, int16_t *block)
     int16_t *ptr;
 
     ptr = block;
-    for (i = 0; i < 8; i++) {
-        fic_idct(ptr, 8, 13);
+    fic_idct(ptr++, 8, 13, (1 << 12) + (1 << 17));
+    for (i = 1; i < 8; i++) {
+        fic_idct(ptr, 8, 13, 1 << 12);
         ptr++;
     }
 
     ptr = block;
     for (i = 0; i < 8; i++) {
-        fic_idct(ptr, 1, 20);
+        fic_idct(ptr, 1, 20, 0);
         ptr += 8;
     }
 
@@ -262,13 +266,11 @@ static int fic_decode_frame(AVCodecContext *avctx, void *data,
     int msize;
     int tsize;
     int cur_x, cur_y;
-    int skip_cursor = 0;
+    int skip_cursor = ctx->skip_cursor;
     uint8_t *sdata;
 
-    if ((ret = ff_reget_buffer(avctx, ctx->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, ctx->frame)) < 0)
         return ret;
-    }
 
     /* Header + at least one slice (4) */
     if (avpkt->size < FIC_HEADER_SIZE + 4) {
@@ -281,8 +283,13 @@ static int fic_decode_frame(AVCodecContext *avctx, void *data,
         av_log(avctx, AV_LOG_WARNING, "Invalid FIC Header.\n");
 
     /* Is it a skip frame? */
-    if (src[17])
+    if (src[17]) {
+        if (!ctx->final_frame) {
+            av_log(avctx, AV_LOG_WARNING, "Initial frame is skipped\n");
+            return AVERROR_INVALIDDATA;
+        }
         goto skip;
+    }
 
     nslices = src[13];
     if (!nslices) {
@@ -302,7 +309,10 @@ static int fic_decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR_INVALIDDATA;
     }
 
-    if (tsize < 32) {
+    if (!tsize || !AV_RL16(src + 37) || !AV_RL16(src + 39))
+        skip_cursor = 1;
+
+    if (!skip_cursor && tsize < 32) {
         av_log(avctx, AV_LOG_WARNING,
                "Cursor data too small. Skipping cursor.\n");
         skip_cursor = 1;
@@ -311,14 +321,14 @@ static int fic_decode_frame(AVCodecContext *avctx, void *data,
     /* Cursor position. */
     cur_x = AV_RL16(src + 33);
     cur_y = AV_RL16(src + 35);
-    if (cur_x > avctx->width || cur_y > avctx->height) {
-        av_log(avctx, AV_LOG_WARNING,
-               "Invalid cursor position: (%d,%d). Skipping cusor.\n",
+    if (!skip_cursor && (cur_x > avctx->width || cur_y > avctx->height)) {
+        av_log(avctx, AV_LOG_DEBUG,
+               "Invalid cursor position: (%d,%d). Skipping cursor.\n",
                cur_x, cur_y);
         skip_cursor = 1;
     }
 
-    if (AV_RL16(src + 37) != 32 || AV_RL16(src + 39) != 32) {
+    if (!skip_cursor && (AV_RL16(src + 37) != 32 || AV_RL16(src + 39) != 32)) {
         av_log(avctx, AV_LOG_WARNING,
                "Invalid cursor size. Skipping cursor.\n");
         skip_cursor = 1;
@@ -445,6 +455,18 @@ static av_cold int fic_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
+static const AVOption options[] = { /* private decoder options; skip_cursor is stored in FICContext.skip_cursor */
+{ "skip_cursor", "skip the cursor", offsetof(FICContext, skip_cursor), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM },
+{ NULL },
+};
+
+static const AVClass fic_decoder_class = { /* AVClass for the FIC decoder; exposes the private option table */
+    .class_name = "FIC decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_fic_decoder = {
     .name           = "fic",
     .long_name      = NULL_IF_CONFIG_SMALL("Mirillis FIC"),
@@ -455,4 +477,5 @@ AVCodec ff_fic_decoder = {
     .decode         = fic_decode_frame,
     .close          = fic_decode_close,
     .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS,
+    .priv_class     = &fic_decoder_class,
 };
diff --git a/libavcodec/flac.c b/libavcodec/flac.c
index 3e51fde..f5154b9 100644
--- a/libavcodec/flac.c
+++ b/libavcodec/flac.c
@@ -2,20 +2,20 @@
  * FLAC common code
  * Copyright (c) 2009 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -230,24 +230,8 @@ void ff_flac_parse_streaminfo(AVCodecContext *avctx, struct FLACStreaminfo *s,
         av_get_channel_layout_nb_channels(avctx->channel_layout) != avctx->channels)
         ff_flac_set_channel_layout(avctx);
 
-    s->samples  = get_bits_long(&gb, 32) << 4;
-    s->samples |= get_bits(&gb, 4);
+    s->samples = get_bits64(&gb, 36);
 
     skip_bits_long(&gb, 64); /* md5 sum */
     skip_bits_long(&gb, 64); /* md5 sum */
 }
-
-#if LIBAVCODEC_VERSION_MAJOR < 57
-void avpriv_flac_parse_streaminfo(AVCodecContext *avctx, struct FLACStreaminfo *s,
-                              const uint8_t *buffer)
-{
-    ff_flac_parse_streaminfo(avctx, s, buffer);
-}
-
-int avpriv_flac_is_extradata_valid(AVCodecContext *avctx,
-                               enum FLACExtradataFormat *format,
-                               uint8_t **streaminfo_start)
-{
-    return ff_flac_is_extradata_valid(avctx, format, streaminfo_start);
-}
-#endif
diff --git a/libavcodec/flac.h b/libavcodec/flac.h
index 3229682..96d971c 100644
--- a/libavcodec/flac.h
+++ b/libavcodec/flac.h
@@ -2,20 +2,20 @@
  * FLAC (Free Lossless Audio Codec) decoder/demuxer common functions
  * Copyright (c) 2008 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -99,14 +99,6 @@ typedef struct FLACFrameInfo {
 void ff_flac_parse_streaminfo(AVCodecContext *avctx, struct FLACStreaminfo *s,
                               const uint8_t *buffer);
 
-#if LIBAVCODEC_VERSION_MAJOR < 57
-void avpriv_flac_parse_streaminfo(AVCodecContext *avctx, struct FLACStreaminfo *s,
-                                  const uint8_t *buffer);
-int avpriv_flac_is_extradata_valid(AVCodecContext *avctx,
-                                   enum FLACExtradataFormat *format,
-                                   uint8_t **streaminfo_start);
-#endif
-
 /**
  * Validate the FLAC extradata.
  * @param[in]  avctx codec context containing the extradata.
diff --git a/libavcodec/flac_parser.c b/libavcodec/flac_parser.c
index 70b9a65..3723716 100644
--- a/libavcodec/flac_parser.c
+++ b/libavcodec/flac_parser.c
@@ -2,20 +2,20 @@
  * FLAC parser
  * Copyright (c) 2010 Michael Chinen
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -87,6 +87,8 @@ typedef struct FLACParseContext {
     int end_padded;                /**< specifies if fifo_buf's end is padded */
     uint8_t *wrap_buf;             /**< general fifo read buffer when wrapped */
     int wrap_buf_allocated_size;   /**< actual allocated size of the buffer   */
+    FLACFrameInfo last_fi;         /**< last decoded frame header info        */
+    int last_fi_valid;             /**< set if last_fi is valid               */
 } FLACParseContext;
 
 static int frame_header_is_valid(AVCodecContext *avctx, const uint8_t *buf,
@@ -180,7 +182,7 @@ static int find_headers_search_validate(FLACParseContext *fpc, int offset)
             size++;
         }
 
-        *end_handle = av_mallocz(sizeof(FLACHeaderMarker));
+        *end_handle = av_mallocz(sizeof(**end_handle));
         if (!*end_handle) {
             av_log(fpc->avctx, AV_LOG_ERROR,
                    "couldn't allocate FLACHeaderMarker\n");
@@ -190,6 +192,13 @@ static int find_headers_search_validate(FLACParseContext *fpc, int offset)
         (*end_handle)->offset       = offset;
         (*end_handle)->link_penalty = av_malloc(sizeof(int) *
                                             FLAC_MAX_SEQUENTIAL_HEADERS);
+        if (!(*end_handle)->link_penalty) {
+            av_freep(end_handle);
+            av_log(fpc->avctx, AV_LOG_ERROR,
+                   "couldn't allocate link_penalty\n");
+            return AVERROR(ENOMEM);
+        }
+
         for (i = 0; i < FLAC_MAX_SEQUENTIAL_HEADERS; i++)
             (*end_handle)->link_penalty[i] = FLAC_HEADER_NOT_PENALIZED_YET;
 
@@ -267,13 +276,12 @@ static int find_new_headers(FLACParseContext *fpc, int search_start)
     return size;
 }
 
-static int check_header_mismatch(FLACParseContext  *fpc,
-                                 FLACHeaderMarker  *header,
-                                 FLACHeaderMarker  *child,
-                                 int                log_level_offset)
+static int check_header_fi_mismatch(FLACParseContext  *fpc,
+                                    FLACFrameInfo     *header_fi,
+                                    FLACFrameInfo     *child_fi,
+                                    int                log_level_offset)
 {
-    FLACFrameInfo  *header_fi = &header->fi, *child_fi = &child->fi;
-    int deduction = 0, deduction_expected = 0, i;
+    int deduction = 0;
     if (child_fi->samplerate != header_fi->samplerate) {
         deduction += FLAC_HEADER_CHANGED_PENALTY;
         av_log(fpc->avctx, AV_LOG_WARNING + log_level_offset,
@@ -288,13 +296,25 @@ static int check_header_mismatch(FLACParseContext  *fpc,
         /* Changing blocking strategy not allowed per the spec */
         deduction += FLAC_HEADER_BASE_SCORE;
         av_log(fpc->avctx, AV_LOG_WARNING + log_level_offset,
-                   "blocking strategy change detected in adjacent frames\n");
+               "blocking strategy change detected in adjacent frames\n");
     }
     if (child_fi->channels != header_fi->channels) {
         deduction += FLAC_HEADER_CHANGED_PENALTY;
         av_log(fpc->avctx, AV_LOG_WARNING + log_level_offset,
-                   "number of channels change detected in adjacent frames\n");
+               "number of channels change detected in adjacent frames\n");
     }
+    return deduction;
+}
+
+static int check_header_mismatch(FLACParseContext  *fpc,
+                                 FLACHeaderMarker  *header,
+                                 FLACHeaderMarker  *child,
+                                 int                log_level_offset)
+{
+    FLACFrameInfo  *header_fi = &header->fi, *child_fi = &child->fi;
+    int deduction, deduction_expected = 0, i;
+    deduction = check_header_fi_mismatch(fpc, header_fi, child_fi,
+                                         log_level_offset);
     /* Check sample and frame numbers. */
     if ((child_fi->frame_or_sample_num - header_fi->frame_or_sample_num
          != header_fi->blocksize) &&
@@ -399,11 +419,18 @@ static int score_header(FLACParseContext *fpc, FLACHeaderMarker *header)
     FLACHeaderMarker *child;
     int dist = 0;
     int child_score;
-
+    int base_score = FLAC_HEADER_BASE_SCORE;
     if (header->max_score != FLAC_HEADER_NOT_SCORED_YET)
         return header->max_score;
 
-    header->max_score = FLAC_HEADER_BASE_SCORE;
+    /* Modify the base score with changes from the last output header */
+    if (fpc->last_fi_valid) {
+        /* Silence the log since this will be repeated if selected */
+        base_score -= check_header_fi_mismatch(fpc, &fpc->last_fi, &header->fi,
+                                               AV_LOG_DEBUG);
+    }
+
+    header->max_score = base_score;
 
     /* Check and compute the children's scores. */
     child = header->next;
@@ -419,7 +446,7 @@ static int score_header(FLACParseContext *fpc, FLACHeaderMarker *header)
         if (FLAC_HEADER_BASE_SCORE + child_score > header->max_score) {
             /* Keep the child because the frame scoring is dynamic. */
             header->best_child = child;
-            header->max_score  = FLAC_HEADER_BASE_SCORE + child_score;
+            header->max_score  = base_score + child_score;
         }
         child = child->next;
     }
@@ -430,7 +457,7 @@ static int score_header(FLACParseContext *fpc, FLACHeaderMarker *header)
 static void score_sequences(FLACParseContext *fpc)
 {
     FLACHeaderMarker *curr;
-    int best_score = FLAC_HEADER_NOT_SCORED_YET;
+    int best_score = 0;//FLAC_HEADER_NOT_SCORED_YET;
     /* First pass to clear all old scores. */
     for (curr = fpc->headers; curr; curr = curr->next)
         curr->max_score = FLAC_HEADER_NOT_SCORED_YET;
@@ -469,7 +496,18 @@ static int get_best_header(FLACParseContext* fpc, const uint8_t **poutbuf,
                                         &fpc->wrap_buf,
                                         &fpc->wrap_buf_allocated_size);
 
+
+    if (fpc->pc->flags & PARSER_FLAG_USE_CODEC_TS){
+        if (header->fi.is_var_size)
+          fpc->pc->pts = header->fi.frame_or_sample_num;
+        else if (header->best_child)
+          fpc->pc->pts = header->fi.frame_or_sample_num * header->fi.blocksize;
+    }
+
     fpc->best_header_valid = 0;
+    fpc->last_fi_valid = 1;
+    fpc->last_fi = header->fi;
+
     /* Return the negative overread index so the client can compute pos.
        This should be the amount overread to the beginning of the child */
     if (child)
@@ -489,8 +527,16 @@ static int flac_parse(AVCodecParserContext *s, AVCodecContext *avctx,
 
     if (s->flags & PARSER_FLAG_COMPLETE_FRAMES) {
         FLACFrameInfo fi;
-        if (frame_header_is_valid(avctx, buf, &fi))
+        if (frame_header_is_valid(avctx, buf, &fi)) {
             s->duration = fi.blocksize;
+            if (!avctx->sample_rate)
+                avctx->sample_rate = fi.samplerate;
+            if (fpc->pc->flags & PARSER_FLAG_USE_CODEC_TS){
+                fpc->pc->pts = fi.frame_or_sample_num;
+                if (!fi.is_var_size)
+                  fpc->pc->pts *= fi.blocksize;
+            }
+        }
         *poutbuf      = buf;
         *poutbuf_size = buf_size;
         return buf_size;
@@ -546,14 +592,18 @@ static int flac_parse(AVCodecParserContext *s, AVCodecContext *avctx,
         av_freep(&fpc->best_header);
     }
 
-    /* Find and score new headers. */
-    while ((buf && read_end < buf + buf_size &&
+    /* Find and score new headers.                                     */
+    /* buf_size is zero when padding, so check for this since we do    */
+    /* not want to try to read more input once we have found the end.  */
+    /* Note that as (non-modified) parameters, buf can be non-NULL,    */
+    /* while buf_size is 0.                                            */
+    while ((buf && buf_size && read_end < buf + buf_size &&
             fpc->nb_headers_buffered < FLAC_MIN_HEADERS)
-           || (!buf && !fpc->end_padded)) {
+           || ((!buf || !buf_size) && !fpc->end_padded)) {
         int start_offset;
 
         /* Pad the end once if EOF, to check the final region for headers. */
-        if (!buf) {
+        if (!buf || !buf_size) {
             fpc->end_padded      = 1;
             buf_size = MAX_FRAME_HEADER_SIZE;
             read_end = read_start + MAX_FRAME_HEADER_SIZE;
@@ -575,15 +625,15 @@ static int flac_parse(AVCodecParserContext *s, AVCodecContext *avctx,
         }
 
         /* Fill the buffer. */
-        if (av_fifo_realloc2(fpc->fifo_buf,
-                             (read_end - read_start) + av_fifo_size(fpc->fifo_buf)) < 0) {
+        if (   av_fifo_space(fpc->fifo_buf) < read_end - read_start
+            && av_fifo_realloc2(fpc->fifo_buf, (read_end - read_start) + 2*av_fifo_size(fpc->fifo_buf)) < 0) {
             av_log(avctx, AV_LOG_ERROR,
-                   "couldn't reallocate buffer of size %td\n",
+                   "couldn't reallocate buffer of size %"PTRDIFF_SPECIFIER"\n",
                    (read_end - read_start) + av_fifo_size(fpc->fifo_buf));
             goto handle_error;
         }
 
-        if (buf) {
+        if (buf && buf_size) {
             av_fifo_generic_write(fpc->fifo_buf, (void*) read_start,
                                   read_end - read_start, NULL);
         } else {
@@ -620,10 +670,11 @@ static int flac_parse(AVCodecParserContext *s, AVCodecContext *avctx,
 
         /* restore the state pre-padding */
         if (fpc->end_padded) {
+            int warp = fpc->fifo_buf->wptr - fpc->fifo_buf->buffer < MAX_FRAME_HEADER_SIZE;
             /* HACK: drain the tail of the fifo */
             fpc->fifo_buf->wptr -= MAX_FRAME_HEADER_SIZE;
             fpc->fifo_buf->wndx -= MAX_FRAME_HEADER_SIZE;
-            if (fpc->fifo_buf->wptr < 0) {
+            if (warp) {
                 fpc->fifo_buf->wptr += fpc->fifo_buf->end -
                     fpc->fifo_buf->buffer;
             }
@@ -632,10 +683,12 @@ static int flac_parse(AVCodecParserContext *s, AVCodecContext *avctx,
         }
     }
 
-    curr = fpc->headers;
-    for (curr = fpc->headers; curr; curr = curr->next)
-        if (!fpc->best_header || curr->max_score > fpc->best_header->max_score)
+    for (curr = fpc->headers; curr; curr = curr->next) {
+        if (curr->max_score > 0 &&
+            (!fpc->best_header || curr->max_score > fpc->best_header->max_score)) {
             fpc->best_header = curr;
+        }
+    }
 
     if (fpc->best_header) {
         fpc->best_header_valid = 1;
@@ -660,7 +713,7 @@ static int flac_parse(AVCodecParserContext *s, AVCodecContext *avctx,
 handle_error:
     *poutbuf      = NULL;
     *poutbuf_size = 0;
-    return read_end - buf;
+    return buf_size ? read_end - buf : 0;
 }
 
 static av_cold int flac_parse_init(AVCodecParserContext *c)
@@ -669,7 +722,12 @@ static av_cold int flac_parse_init(AVCodecParserContext *c)
     fpc->pc = c;
     /* There will generally be FLAC_MIN_HEADERS buffered in the fifo before
        it drains.  This is allocated early to avoid slow reallocation. */
-    fpc->fifo_buf = av_fifo_alloc(FLAC_AVG_FRAME_SIZE * (FLAC_MIN_HEADERS + 3));
+    fpc->fifo_buf = av_fifo_alloc_array(FLAC_MIN_HEADERS + 3, FLAC_AVG_FRAME_SIZE);
+    if (!fpc->fifo_buf) {
+        av_log(fpc->avctx, AV_LOG_ERROR,
+                "couldn't allocate fifo_buf\n");
+        return AVERROR(ENOMEM);
+    }
     return 0;
 }
 
@@ -684,8 +742,8 @@ static void flac_parse_close(AVCodecParserContext *c)
         av_free(curr);
         curr = temp;
     }
-    av_fifo_free(fpc->fifo_buf);
-    av_free(fpc->wrap_buf);
+    av_fifo_freep(&fpc->fifo_buf);
+    av_freep(&fpc->wrap_buf);
 }
 
 AVCodecParser ff_flac_parser = {
diff --git a/libavcodec/flacdata.c b/libavcodec/flacdata.c
index 820c3aa..1954f32 100644
--- a/libavcodec/flacdata.c
+++ b/libavcodec/flacdata.c
@@ -2,20 +2,20 @@
  * FLAC data
  * Copyright (c) 2003 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@ const int ff_flac_sample_rate_table[16] =
   8000, 16000, 22050, 24000, 32000, 44100, 48000, 96000,
   0, 0, 0, 0 };
 
-const int16_t ff_flac_blocksize_table[16] = {
+const int32_t ff_flac_blocksize_table[16] = {
      0,    192, 576<<0, 576<<1, 576<<2, 576<<3,      0,      0,
 256<<0, 256<<1, 256<<2, 256<<3, 256<<4, 256<<5, 256<<6, 256<<7
 };
diff --git a/libavcodec/flacdata.h b/libavcodec/flacdata.h
index f566377..e2c1e5d 100644
--- a/libavcodec/flacdata.h
+++ b/libavcodec/flacdata.h
@@ -2,20 +2,20 @@
  * FLAC data header
  * Copyright (c) 2003 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,6 @@
 
 extern const int ff_flac_sample_rate_table[16];
 
-extern const int16_t ff_flac_blocksize_table[16];
+extern const int32_t ff_flac_blocksize_table[16];
 
 #endif /* AVCODEC_FLACDATA_H */
diff --git a/libavcodec/flacdec.c b/libavcodec/flacdec.c
index 7af71f3..b7237e1 100644
--- a/libavcodec/flacdec.c
+++ b/libavcodec/flacdec.c
@@ -2,20 +2,20 @@
  * FLAC (Free Lossless Audio Codec) decoder
  * Copyright (c) 2003 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,6 +33,9 @@
 
 #include <limits.h>
 
+#include "libavutil/avassert.h"
+#include "libavutil/crc.h"
+#include "libavutil/opt.h"
 #include "avcodec.h"
 #include "internal.h"
 #include "get_bits.h"
@@ -41,9 +44,13 @@
 #include "flac.h"
 #include "flacdata.h"
 #include "flacdsp.h"
+#include "thread.h"
+#include "unary.h"
+
 
 typedef struct FLACContext {
-    FLACSTREAMINFO
+    AVClass *class;
+    struct FLACStreaminfo flac_stream_info;
 
     AVCodecContext *avctx;                  ///< parent AVCodecContext
     GetBitContext gb;                       ///< GetBitContext initialized to start at the current frame
@@ -56,6 +63,7 @@ typedef struct FLACContext {
     int32_t *decoded[FLAC_MAX_CHANNELS];    ///< decoded samples
     uint8_t *decoded_buffer;
     unsigned int decoded_buffer_size;
+    int buggy_lpc;                          ///< use workaround for old lavc encoded files
 
     FLACDSPContext dsp;
 } FLACContext;
@@ -65,7 +73,7 @@ static int allocate_buffers(FLACContext *s);
 static void flac_set_bps(FLACContext *s)
 {
     enum AVSampleFormat req = s->avctx->request_sample_fmt;
-    int need32 = s->bps > 16;
+    int need32 = s->flac_stream_info.bps > 16;
     int want32 = av_get_bytes_per_sample(req) > 2;
     int planar = av_sample_fmt_is_planar(req);
 
@@ -74,13 +82,13 @@ static void flac_set_bps(FLACContext *s)
             s->avctx->sample_fmt = AV_SAMPLE_FMT_S32P;
         else
             s->avctx->sample_fmt = AV_SAMPLE_FMT_S32;
-        s->sample_shift = 32 - s->bps;
+        s->sample_shift = 32 - s->flac_stream_info.bps;
     } else {
         if (planar)
             s->avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
         else
             s->avctx->sample_fmt = AV_SAMPLE_FMT_S16;
-        s->sample_shift = 16 - s->bps;
+        s->sample_shift = 16 - s->flac_stream_info.bps;
     }
 }
 
@@ -101,12 +109,13 @@ static av_cold int flac_decode_init(AVCodecContext *avctx)
         return AVERROR_INVALIDDATA;
 
     /* initialize based on the demuxer-supplied streamdata header */
-    ff_flac_parse_streaminfo(avctx, (FLACStreaminfo *)s, streaminfo);
+    ff_flac_parse_streaminfo(avctx, &s->flac_stream_info, streaminfo);
     ret = allocate_buffers(s);
     if (ret < 0)
         return ret;
     flac_set_bps(s);
-    ff_flacdsp_init(&s->dsp, avctx->sample_fmt, s->bps);
+    ff_flacdsp_init(&s->dsp, avctx->sample_fmt,
+                    s->flac_stream_info.channels, s->flac_stream_info.bps);
     s->got_streaminfo = 1;
 
     return 0;
@@ -124,8 +133,12 @@ static void dump_headers(AVCodecContext *avctx, FLACStreaminfo *s)
 static int allocate_buffers(FLACContext *s)
 {
     int buf_size;
+    int ret;
+
+    av_assert0(s->flac_stream_info.max_blocksize);
 
-    buf_size = av_samples_get_buffer_size(NULL, s->channels, s->max_blocksize,
+    buf_size = av_samples_get_buffer_size(NULL, s->flac_stream_info.channels,
+                                          s->flac_stream_info.max_blocksize,
                                           AV_SAMPLE_FMT_S32P, 0);
     if (buf_size < 0)
         return buf_size;
@@ -134,9 +147,12 @@ static int allocate_buffers(FLACContext *s)
     if (!s->decoded_buffer)
         return AVERROR(ENOMEM);
 
-    return av_samples_fill_arrays((uint8_t **)s->decoded, NULL,
-                                  s->decoded_buffer, s->channels,
-                                  s->max_blocksize, AV_SAMPLE_FMT_S32P, 0);
+    ret = av_samples_fill_arrays((uint8_t **)s->decoded, NULL,
+                                 s->decoded_buffer,
+                                 s->flac_stream_info.channels,
+                                 s->flac_stream_info.max_blocksize,
+                                 AV_SAMPLE_FMT_S32P, 0);
+    return ret < 0 ? ret : 0;
 }
 
 /**
@@ -159,12 +175,13 @@ static int parse_streaminfo(FLACContext *s, const uint8_t *buf, int buf_size)
         metadata_size != FLAC_STREAMINFO_SIZE) {
         return AVERROR_INVALIDDATA;
     }
-    ff_flac_parse_streaminfo(s->avctx, (FLACStreaminfo *)s, &buf[8]);
+    ff_flac_parse_streaminfo(s->avctx, &s->flac_stream_info, &buf[8]);
     ret = allocate_buffers(s);
     if (ret < 0)
         return ret;
     flac_set_bps(s);
-    ff_flacdsp_init(&s->dsp, s->avctx->sample_fmt, s->bps);
+    ff_flacdsp_init(&s->dsp, s->avctx->sample_fmt,
+                    s->flac_stream_info.channels, s->flac_stream_info.bps);
     s->got_streaminfo = 1;
 
     return 0;
@@ -213,6 +230,12 @@ static int decode_residuals(FLACContext *s, int32_t *decoded, int pred_order)
     rice_order = get_bits(&s->gb, 4);
 
     samples= s->blocksize >> rice_order;
+    if (samples << rice_order != s->blocksize) {
+        av_log(s->avctx, AV_LOG_ERROR, "invalid rice order: %i blocksize %i\n",
+               rice_order, s->blocksize);
+        return AVERROR_INVALIDDATA;
+    }
+
     if (pred_order > samples) {
         av_log(s->avctx, AV_LOG_ERROR, "invalid predictor order: %i > %i\n",
                pred_order, samples);
@@ -245,7 +268,8 @@ static int decode_subframe_fixed(FLACContext *s, int32_t *decoded,
                                  int pred_order, int bps)
 {
     const int blocksize = s->blocksize;
-    int a, b, c, d, i, ret;
+    int av_uninit(a), av_uninit(b), av_uninit(c), av_uninit(d), i;
+    int ret;
 
     /* warm up samples */
     for (i = 0; i < pred_order; i++) {
@@ -291,6 +315,33 @@ static int decode_subframe_fixed(FLACContext *s, int32_t *decoded,
     return 0;
 }
 
+static void lpc_analyze_remodulate(int32_t *decoded, const int coeffs[32],
+                                   int order, int qlevel, int len, int bps)
+{ /* In-place fix-up of decoded[order..len-1]; called after lpc32 when stream bps <= 16. */
+    int i, j;
+    int ebps = 1 << (bps-1); /* 2^(bps-1): bias that maps [-ebps, ebps) onto [0, 2*ebps) */
+    unsigned sigma = 0;
+    /* OR all biased samples together so one compare detects any out-of-range value. */
+    for (i = order; i < len; i++)
+        sigma |= decoded[i] + ebps;
+    /* No bit at or above 2*ebps is set: every sample already fits in bps bits. */
+    if (sigma < 2*ebps)
+        return;
+    /* Walk backwards, subtracting the 64-bit prediction from each sample. */
+    for (i = len - 1; i >= order; i--) {
+        int64_t p = 0;
+        for (j = 0; j < order; j++)
+            p += coeffs[j] * (int64_t)decoded[i-order+j];
+        decoded[i] -= p >> qlevel;
+    }
+    for (i = order; i < len; i++, decoded++) { /* forward pass: add the prediction back using 32-bit sums */
+        int32_t p = 0;
+        for (j = 0; j < order; j++)
+            p += coeffs[j] * (uint32_t)decoded[j];
+        decoded[j] += p >> qlevel; /* j == order here: the sample right after the current window */
+    }
+}
+
 static int decode_subframe_lpc(FLACContext *s, int32_t *decoded, int pred_order,
                                int bps)
 {
@@ -322,7 +373,15 @@ static int decode_subframe_lpc(FLACContext *s, int32_t *decoded, int pred_order,
     if ((ret = decode_residuals(s, decoded, pred_order)) < 0)
         return ret;
 
-    s->dsp.lpc(decoded, coeffs, pred_order, qlevel, s->blocksize);
+    if (   (    s->buggy_lpc && s->flac_stream_info.bps <= 16)
+        || (   !s->buggy_lpc && bps <= 16
+            && bps + coeff_prec + av_log2(pred_order) <= 32)) {
+        s->dsp.lpc16(decoded, coeffs, pred_order, qlevel, s->blocksize);
+    } else {
+        s->dsp.lpc32(decoded, coeffs, pred_order, qlevel, s->blocksize);
+        if (s->flac_stream_info.bps <= 16)
+            lpc_analyze_remodulate(decoded, coeffs, pred_order, qlevel, s->blocksize, bps);
+    }
 
     return 0;
 }
@@ -331,7 +390,7 @@ static inline int decode_subframe(FLACContext *s, int channel)
 {
     int32_t *decoded = s->decoded[channel];
     int type, wasted = 0;
-    int bps = s->bps;
+    int bps = s->flac_stream_info.bps;
     int i, tmp, ret;
 
     if (channel == 0) {
@@ -350,8 +409,7 @@ static inline int decode_subframe(FLACContext *s, int channel)
 
     if (get_bits1(&s->gb)) {
         int left = get_bits_left(&s->gb);
-        wasted = 1;
-        if ( left < 0 ||
+        if ( left <= 0 ||
             (left < bps && !show_bits_long(&s->gb, left)) ||
                            !show_bits_long(&s->gb, bps)) {
             av_log(s->avctx, AV_LOG_ERROR,
@@ -359,8 +417,7 @@ static inline int decode_subframe(FLACContext *s, int channel)
                    bps, left);
             return AVERROR_INVALIDDATA;
         }
-        while (!get_bits1(&s->gb))
-            wasted++;
+        wasted = 1 + get_unary(&s->gb, 1, get_bits_left(&s->gb));
         bps -= wasted;
     }
     if (bps > 32) {
@@ -407,66 +464,69 @@ static int decode_frame(FLACContext *s)
         return ret;
     }
 
-    if (s->channels && fi.channels != s->channels && s->got_streaminfo) {
-        s->channels = s->avctx->channels = fi.channels;
+    if (   s->flac_stream_info.channels
+        && fi.channels != s->flac_stream_info.channels
+        && s->got_streaminfo) {
+        s->flac_stream_info.channels = s->avctx->channels = fi.channels;
         ff_flac_set_channel_layout(s->avctx);
         ret = allocate_buffers(s);
         if (ret < 0)
             return ret;
     }
-    s->channels = s->avctx->channels = fi.channels;
+    s->flac_stream_info.channels = s->avctx->channels = fi.channels;
     if (!s->avctx->channel_layout)
         ff_flac_set_channel_layout(s->avctx);
     s->ch_mode = fi.ch_mode;
 
-    if (!s->bps && !fi.bps) {
+    if (!s->flac_stream_info.bps && !fi.bps) {
         av_log(s->avctx, AV_LOG_ERROR, "bps not found in STREAMINFO or frame header\n");
         return AVERROR_INVALIDDATA;
     }
     if (!fi.bps) {
-        fi.bps = s->bps;
-    } else if (s->bps && fi.bps != s->bps) {
+        fi.bps = s->flac_stream_info.bps;
+    } else if (s->flac_stream_info.bps && fi.bps != s->flac_stream_info.bps) {
         av_log(s->avctx, AV_LOG_ERROR, "switching bps mid-stream is not "
                                        "supported\n");
         return AVERROR_INVALIDDATA;
     }
 
-    if (!s->bps) {
-        s->bps = s->avctx->bits_per_raw_sample = fi.bps;
+    if (!s->flac_stream_info.bps) {
+        s->flac_stream_info.bps = s->avctx->bits_per_raw_sample = fi.bps;
         flac_set_bps(s);
     }
 
-    if (!s->max_blocksize)
-        s->max_blocksize = FLAC_MAX_BLOCKSIZE;
-    if (fi.blocksize > s->max_blocksize) {
+    if (!s->flac_stream_info.max_blocksize)
+        s->flac_stream_info.max_blocksize = FLAC_MAX_BLOCKSIZE;
+    if (fi.blocksize > s->flac_stream_info.max_blocksize) {
         av_log(s->avctx, AV_LOG_ERROR, "blocksize %d > %d\n", fi.blocksize,
-               s->max_blocksize);
+               s->flac_stream_info.max_blocksize);
         return AVERROR_INVALIDDATA;
     }
     s->blocksize = fi.blocksize;
 
-    if (!s->samplerate && !fi.samplerate) {
+    if (!s->flac_stream_info.samplerate && !fi.samplerate) {
         av_log(s->avctx, AV_LOG_ERROR, "sample rate not found in STREAMINFO"
                                         " or frame header\n");
         return AVERROR_INVALIDDATA;
     }
     if (fi.samplerate == 0)
-        fi.samplerate = s->samplerate;
-    s->samplerate = s->avctx->sample_rate = fi.samplerate;
+        fi.samplerate = s->flac_stream_info.samplerate;
+    s->flac_stream_info.samplerate = s->avctx->sample_rate = fi.samplerate;
 
     if (!s->got_streaminfo) {
         ret = allocate_buffers(s);
         if (ret < 0)
             return ret;
-        ff_flacdsp_init(&s->dsp, s->avctx->sample_fmt, s->bps);
         s->got_streaminfo = 1;
-        dump_headers(s->avctx, (FLACStreaminfo *)s);
+        dump_headers(s->avctx, &s->flac_stream_info);
     }
+    ff_flacdsp_init(&s->dsp, s->avctx->sample_fmt,
+                    s->flac_stream_info.channels, s->flac_stream_info.bps);
 
-//    dump_headers(s->avctx, (FLACStreaminfo *)s);
+//    dump_headers(s->avctx, &s->flac_stream_info);
 
     /* subframes */
-    for (i = 0; i < s->channels; i++) {
+    for (i = 0; i < s->flac_stream_info.channels; i++) {
         if ((ret = decode_subframe(s, i)) < 0)
             return ret;
     }
@@ -483,6 +543,7 @@ static int flac_decode_frame(AVCodecContext *avctx, void *data,
                              int *got_frame_ptr, AVPacket *avpkt)
 {
     AVFrame *frame     = data;
+    ThreadFrame tframe = { .f = data };
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     FLACContext *s = avctx->priv_data;
@@ -491,12 +552,22 @@ static int flac_decode_frame(AVCodecContext *avctx, void *data,
 
     *got_frame_ptr = 0;
 
-    if (s->max_framesize == 0) {
-        s->max_framesize =
-            ff_flac_get_max_frame_size(s->max_blocksize ? s->max_blocksize : FLAC_MAX_BLOCKSIZE,
+    if (s->flac_stream_info.max_framesize == 0) {
+        s->flac_stream_info.max_framesize =
+            ff_flac_get_max_frame_size(s->flac_stream_info.max_blocksize ? s->flac_stream_info.max_blocksize : FLAC_MAX_BLOCKSIZE,
                                        FLAC_MAX_CHANNELS, 32);
     }
 
+    if (buf_size > 5 && !memcmp(buf, "\177FLAC", 5)) {
+        av_log(s->avctx, AV_LOG_DEBUG, "skipping flac header packet 1\n");
+        return buf_size;
+    }
+
+    if (buf_size > 0 && (*buf & 0x7F) == FLAC_METADATA_TYPE_VORBIS_COMMENT) {
+        av_log(s->avctx, AV_LOG_DEBUG, "skipping vorbis comment\n");
+        return buf_size;
+    }
+
     /* check that there is at least the smallest decodable amount of data.
        this amount corresponds to the smallest valid FLAC frame possible.
        FF F8 69 02 00 00 9A 00 00 34 46 */
@@ -513,21 +584,29 @@ static int flac_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     /* decode frame */
-    init_get_bits(&s->gb, buf, buf_size*8);
+    if ((ret = init_get_bits8(&s->gb, buf, buf_size)) < 0)
+        return ret;
     if ((ret = decode_frame(s)) < 0) {
         av_log(s->avctx, AV_LOG_ERROR, "decode_frame() failed\n");
         return ret;
     }
-    bytes_read = (get_bits_count(&s->gb)+7)/8;
+    bytes_read = get_bits_count(&s->gb)/8;
+
+    if ((s->avctx->err_recognition & (AV_EF_CRCCHECK|AV_EF_COMPLIANT)) &&
+        av_crc(av_crc_get_table(AV_CRC_16_ANSI),
+               0, buf, bytes_read)) {
+        av_log(s->avctx, AV_LOG_ERROR, "CRC error at PTS %"PRId64"\n", avpkt->pts);
+        if (s->avctx->err_recognition & AV_EF_EXPLODE)
+            return AVERROR_INVALIDDATA;
+    }
 
     /* get output buffer */
     frame->nb_samples = s->blocksize;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_thread_get_buffer(avctx, &tframe, 0)) < 0)
         return ret;
-    }
 
-    s->dsp.decorrelate[s->ch_mode](frame->data, s->decoded, s->channels,
+    s->dsp.decorrelate[s->ch_mode](frame->data, s->decoded,
+                                   s->flac_stream_info.channels,
                                    s->blocksize, s->sample_shift);
 
     if (bytes_read > buf_size) {
@@ -544,6 +623,19 @@ static int flac_decode_frame(AVCodecContext *avctx, void *data,
     return bytes_read;
 }
 
+#if HAVE_THREADS
+static int init_thread_copy(AVCodecContext *avctx)
+{
+    FLACContext *s = avctx->priv_data;
+    s->decoded_buffer = NULL; /* clear buffer pointer and size before any allocation below */
+    s->decoded_buffer_size = 0;
+    s->avctx = avctx; /* point the private context at the avctx passed in */
+    if (s->flac_stream_info.max_blocksize) /* allocate only when a max block size is already set */
+        return allocate_buffers(s);
+    return 0;
+}
+#endif
+
 static av_cold int flac_decode_close(AVCodecContext *avctx)
 {
     FLACContext *s = avctx->priv_data;
@@ -553,6 +645,18 @@ static av_cold int flac_decode_close(AVCodecContext *avctx)
     return 0;
 }
 
+static const AVOption options[] = { /* "use_buggy_lpc": AV_OPT_TYPE_BOOL stored in FLACContext.buggy_lpc */
+{ "use_buggy_lpc", "emulate old buggy lavc behavior", offsetof(FLACContext, buggy_lpc), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM },
+{ NULL }, /* end-of-table sentinel */
+};
+
+static const AVClass flac_decoder_class = { /* referenced by ff_flac_decoder.priv_class */
+    "FLAC decoder",
+    av_default_item_name,
+    options, /* the private option table defined just before this class */
+    LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_flac_decoder = {
     .name           = "flac",
     .long_name      = NULL_IF_CONFIG_SMALL("FLAC (Free Lossless Audio Codec)"),
@@ -562,10 +666,12 @@ AVCodec ff_flac_decoder = {
     .init           = flac_decode_init,
     .close          = flac_decode_close,
     .decode         = flac_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(init_thread_copy),
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S16,
                                                       AV_SAMPLE_FMT_S16P,
                                                       AV_SAMPLE_FMT_S32,
                                                       AV_SAMPLE_FMT_S32P,
-                                                      -1 },
+                                                      AV_SAMPLE_FMT_NONE },
+    .priv_class     = &flac_decoder_class,
 };
diff --git a/libavcodec/flacdsp.c b/libavcodec/flacdsp.c
index b916869..30b6648 100644
--- a/libavcodec/flacdsp.c
+++ b/libavcodec/flacdsp.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -85,16 +85,13 @@ static void flac_lpc_32_c(int32_t *decoded, const int coeffs[32],
 
 }
 
-av_cold void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt,
+av_cold void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt, int channels,
                              int bps)
 {
-    if (bps > 16) {
-        c->lpc            = flac_lpc_32_c;
-        c->lpc_encode     = flac_lpc_encode_c_32;
-    } else {
-        c->lpc            = flac_lpc_16_c;
-        c->lpc_encode     = flac_lpc_encode_c_16;
-    }
+    c->lpc16        = flac_lpc_16_c;
+    c->lpc32        = flac_lpc_32_c;
+    c->lpc16_encode = flac_lpc_encode_c_16;
+    c->lpc32_encode = flac_lpc_encode_c_32;
 
     switch (fmt) {
     case AV_SAMPLE_FMT_S32:
@@ -127,5 +124,7 @@ av_cold void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt,
     }
 
     if (ARCH_ARM)
-        ff_flacdsp_init_arm(c, fmt, bps);
+        ff_flacdsp_init_arm(c, fmt, channels, bps);
+    if (ARCH_X86)
+        ff_flacdsp_init_x86(c, fmt, channels, bps);
 }
diff --git a/libavcodec/flacdsp.h b/libavcodec/flacdsp.h
index 33184b5..f5cbd94 100644
--- a/libavcodec/flacdsp.h
+++ b/libavcodec/flacdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,13 +25,18 @@
 typedef struct FLACDSPContext {
     void (*decorrelate[4])(uint8_t **out, int32_t **in, int channels,
                            int len, int shift);
-    void (*lpc)(int32_t *samples, const int coeffs[32], int order,
-                int qlevel, int len);
-    void (*lpc_encode)(int32_t *res, const int32_t *smp, int len, int order,
-                       const int32_t *coefs, int shift);
+    void (*lpc16)(int32_t *samples, const int coeffs[32], int order,
+                  int qlevel, int len);
+    void (*lpc32)(int32_t *samples, const int coeffs[32], int order,
+                  int qlevel, int len);
+    void (*lpc16_encode)(int32_t *res, const int32_t *smp, int len, int order,
+                         const int32_t coefs[32], int shift);
+    void (*lpc32_encode)(int32_t *res, const int32_t *smp, int len, int order,
+                         const int32_t coefs[32], int shift);
 } FLACDSPContext;
 
-void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt, int bps);
-void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt, int bps);
+void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt, int channels, int bps);
+void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt, int channels, int bps);
+void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int channels, int bps);
 
 #endif /* AVCODEC_FLACDSP_H */
diff --git a/libavcodec/flacdsp_lpc_template.c b/libavcodec/flacdsp_lpc_template.c
index 269e64b..5d532e0 100644
--- a/libavcodec/flacdsp_lpc_template.c
+++ b/libavcodec/flacdsp_lpc_template.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -139,3 +139,21 @@ static void FUNC(flac_lpc_encode_c)(int32_t *res, const int32_t *smp, int len,
     }
 #endif
 }
+
+/* Comment for clarity/de-obfuscation.
+ *
+ * for (int i = order; i < len; i++) {
+ *     int32_t p = 0;
+ *     for (int j = 0; j < order; j++) {
+ *         int c = coefs[j];
+ *         int s = smp[(i-1)-j];
+ *         p    += c*s;
+ *     }
+ *     res[i] = smp[i] - (p >> shift);
+ * }
+ *
+ * The CONFIG_SMALL code above simplifies to this, in the case of SAMPLE_SIZE
+ * not being equal to 32 (at the present time that means for 16-bit audio). The
+ * code above does 2 samples per iteration.  Commit bfdd5bc (made all the way
+ * back in 2007) says that way is faster.
+ */
diff --git a/libavcodec/flacdsp_template.c b/libavcodec/flacdsp_template.c
index 0affe22..62c0a15 100644
--- a/libavcodec/flacdsp_template.c
+++ b/libavcodec/flacdsp_template.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/flacenc.c b/libavcodec/flacenc.c
index d05a0c6..a91ed19 100644
--- a/libavcodec/flacenc.c
+++ b/libavcodec/flacenc.c
@@ -2,30 +2,31 @@
  * FLAC audio encoder
  * Copyright (c) 2006  Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "libavutil/avassert.h"
 #include "libavutil/crc.h"
 #include "libavutil/intmath.h"
 #include "libavutil/md5.h"
 #include "libavutil/opt.h"
 #include "avcodec.h"
 #include "bswapdsp.h"
-#include "get_bits.h"
+#include "put_bits.h"
 #include "golomb.h"
 #include "internal.h"
 #include "lpc.h"
@@ -61,13 +62,14 @@ typedef struct CompressionOptions {
     int min_partition_order;
     int max_partition_order;
     int ch_mode;
+    int exact_rice_parameters;
+    int multi_dim_quant;
 } CompressionOptions;
 
 typedef struct RiceContext {
     enum CodingMode coding_mode;
     int porder;
     int params[MAX_PARTITIONS];
-    uint32_t udata[FLAC_MAX_BLOCKSIZE];
 } RiceContext;
 
 typedef struct FlacSubframe {
@@ -78,9 +80,13 @@ typedef struct FlacSubframe {
     int order;
     int32_t coefs[MAX_LPC_ORDER];
     int shift;
+
     RiceContext rc;
+    uint32_t rc_udata[FLAC_MAX_BLOCKSIZE];
+    uint64_t rc_sums[32][MAX_PARTITIONS];
+
     int32_t samples[FLAC_MAX_BLOCKSIZE];
-    int32_t residual[FLAC_MAX_BLOCKSIZE+1];
+    int32_t residual[FLAC_MAX_BLOCKSIZE+11];
 } FlacSubframe;
 
 typedef struct FlacFrame {
@@ -157,7 +163,7 @@ static int select_blocksize(int samplerate, int block_time_ms)
     int target;
     int blocksize;
 
-    assert(samplerate > 0);
+    av_assert0(samplerate > 0);
     blocksize = ff_flac_blocksize_table[1];
     target    = (samplerate * block_time_ms) / 1000;
     for (i = 0; i < 16; i++) {
@@ -251,8 +257,11 @@ static av_cold int flac_encode_init(AVCodecContext *avctx)
         break;
     }
 
-    if (channels < 1 || channels > FLAC_MAX_CHANNELS)
-        return -1;
+    if (channels < 1 || channels > FLAC_MAX_CHANNELS) {
+        av_log(avctx, AV_LOG_ERROR, "%d channels not supported (max %d)\n",
+               channels, FLAC_MAX_CHANNELS);
+        return AVERROR(EINVAL);
+    }
     s->channels = channels;
 
     /* find samplerate in table */
@@ -278,7 +287,8 @@ static av_cold int flac_encode_init(AVCodecContext *avctx)
             s->sr_code[0] = 13;
             s->sr_code[1] = freq;
         } else {
-            return -1;
+            av_log(avctx, AV_LOG_ERROR, "%d Hz not supported\n", freq);
+            return AVERROR(EINVAL);
         }
         s->samplerate = freq;
     }
@@ -293,7 +303,7 @@ static av_cold int flac_encode_init(AVCodecContext *avctx)
     if (level > 12) {
         av_log(avctx, AV_LOG_ERROR, "invalid compression level: %d\n",
                s->options.compression_level);
-        return -1;
+        return AVERROR(EINVAL);
     }
 
     s->options.block_time_ms = ((int[]){ 27, 27, 27,105,105,105,105,105,105,105,105,105,105})[level];
@@ -341,7 +351,7 @@ FF_DISABLE_DEPRECATION_WARNINGS
                    avctx->min_prediction_order > MAX_LPC_ORDER) {
             av_log(avctx, AV_LOG_ERROR, "invalid min prediction order: %d\n",
                    avctx->min_prediction_order);
-            return -1;
+            return AVERROR(EINVAL);
         }
         s->options.min_prediction_order = avctx->min_prediction_order;
     }
@@ -357,7 +367,7 @@ FF_DISABLE_DEPRECATION_WARNINGS
                    avctx->max_prediction_order > MAX_LPC_ORDER) {
             av_log(avctx, AV_LOG_ERROR, "invalid max prediction order: %d\n",
                    avctx->max_prediction_order);
-            return -1;
+            return AVERROR(EINVAL);
         }
         s->options.max_prediction_order = avctx->max_prediction_order;
     }
@@ -384,7 +394,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
     if (s->options.max_prediction_order < s->options.min_prediction_order) {
         av_log(avctx, AV_LOG_ERROR, "invalid prediction orders: min=%d max=%d\n",
                s->options.min_prediction_order, s->options.max_prediction_order);
-        return -1;
+        return AVERROR(EINVAL);
     }
 
     if (avctx->frame_size > 0) {
@@ -392,7 +402,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
                 avctx->frame_size > FLAC_MAX_BLOCKSIZE) {
             av_log(avctx, AV_LOG_ERROR, "invalid block size: %d\n",
                    avctx->frame_size);
-            return -1;
+            return AVERROR(EINVAL);
         }
     } else {
         s->avctx->frame_size = select_blocksize(s->samplerate, s->options.block_time_ms);
@@ -420,11 +430,33 @@ FF_ENABLE_DEPRECATION_WARNINGS
     s->frame_count   = 0;
     s->min_framesize = s->max_framesize;
 
+    if (channels == 3 &&
+            avctx->channel_layout != (AV_CH_LAYOUT_STEREO|AV_CH_FRONT_CENTER) ||
+        channels == 4 &&
+            avctx->channel_layout != AV_CH_LAYOUT_2_2 &&
+            avctx->channel_layout != AV_CH_LAYOUT_QUAD ||
+        channels == 5 &&
+            avctx->channel_layout != AV_CH_LAYOUT_5POINT0 &&
+            avctx->channel_layout != AV_CH_LAYOUT_5POINT0_BACK ||
+        channels == 6 &&
+            avctx->channel_layout != AV_CH_LAYOUT_5POINT1 &&
+            avctx->channel_layout != AV_CH_LAYOUT_5POINT1_BACK) {
+        if (avctx->channel_layout) {
+            av_log(avctx, AV_LOG_ERROR, "Channel layout not supported by Flac, "
+                                             "output stream will have incorrect "
+                                             "channel layout.\n");
+        } else {
+            av_log(avctx, AV_LOG_WARNING, "No channel layout specified. The encoder "
+                                               "will use Flac channel layout for "
+                                               "%d channels.\n", channels);
+        }
+    }
+
     ret = ff_lpc_init(&s->lpc_ctx, avctx->frame_size,
                       s->options.max_prediction_order, FF_LPC_TYPE_LEVINSON);
 
     ff_bswapdsp_init(&s->bdsp);
-    ff_flacdsp_init(&s->flac_dsp, avctx->sample_fmt,
+    ff_flacdsp_init(&s->flac_dsp, avctx->sample_fmt, channels,
                     avctx->bits_per_raw_sample);
 
     dprint_compression_options(s);
@@ -500,7 +532,7 @@ static void copy_samples(FlacEncodeContext *s, const void *samples)
 }
 
 
-static uint64_t rice_count_exact(int32_t *res, int n, int k)
+static uint64_t rice_count_exact(const int32_t *res, int n, int k)
 {
     int i;
     uint64_t count = 0;
@@ -524,6 +556,9 @@ static uint64_t subframe_count_exact(FlacEncodeContext *s, FlacSubframe *sub,
     /* subframe header */
     count += 8;
 
+    if (sub->wasted)
+        count += sub->wasted;
+
     /* subframe */
     if (sub->type == FLAC_SUBFRAME_CONSTANT) {
         count += sub->obits;
@@ -578,24 +613,44 @@ static int find_optimal_param(uint64_t sum, int n, int max_param)
     return FFMIN(k, max_param);
 }
 
+static int find_optimal_param_exact(uint64_t sums[32][MAX_PARTITIONS], int i, int max_param)
+{
+    int bestk = 0;
+    int64_t bestbits = INT64_MAX;
+    int k;
+    /* Return the k in 0..max_param whose sums[k][i] is smallest (column i of the table). */
+    for (k = 0; k <= max_param; k++) {
+        int64_t bits = sums[k][i];
+        if (bits < bestbits) { /* strict '<': the lowest k wins ties */
+            bestbits = bits;
+            bestk = k;
+        }
+    }
+
+    return bestk;
+}
 
 static uint64_t calc_optimal_rice_params(RiceContext *rc, int porder,
-                                         uint64_t *sums, int n, int pred_order)
+                                         uint64_t sums[32][MAX_PARTITIONS],
+                                         int n, int pred_order, int max_param, int exact)
 {
     int i;
-    int k, cnt, part, max_param;
+    int k, cnt, part;
     uint64_t all_bits;
 
-    max_param = (1 << rc->coding_mode) - 2;
-
     part     = (1 << porder);
     all_bits = 4 * part;
 
     cnt = (n >> porder) - pred_order;
     for (i = 0; i < part; i++) {
-        k = find_optimal_param(sums[i], cnt, max_param);
+        if (exact) {
+            k = find_optimal_param_exact(sums, i, max_param);
+            all_bits += sums[k][i];
+        } else {
+            k = find_optimal_param(sums[0][i], cnt, max_param);
+            all_bits += rice_encode_count(sums[0][i], cnt, k);
+        }
         rc->params[i] = k;
-        all_bits += rice_encode_count(sums[i], cnt, k);
         cnt = n >> porder;
     }
 
@@ -605,61 +660,80 @@ static uint64_t calc_optimal_rice_params(RiceContext *rc, int porder,
 }
 
 
-static void calc_sums(int pmin, int pmax, uint32_t *data, int n, int pred_order,
-                      uint64_t sums[][MAX_PARTITIONS])
+static void calc_sum_top(int pmax, int kmax, const uint32_t *data, int n, int pred_order,
+                         uint64_t sums[32][MAX_PARTITIONS])
 {
-    int i, j;
+    int i, k;
     int parts;
-    uint32_t *res, *res_end;
+    const uint32_t *res, *res_end;
 
     /* sums for highest level */
     parts   = (1 << pmax);
-    res     = &data[pred_order];
-    res_end = &data[n >> pmax];
-    for (i = 0; i < parts; i++) {
-        uint64_t sum = 0;
-        while (res < res_end)
-            sum += *(res++);
-        sums[pmax][i] = sum;
-        res_end += n >> pmax;
-    }
-    /* sums for lower levels */
-    for (i = pmax - 1; i >= pmin; i--) {
-        parts = (1 << i);
-        for (j = 0; j < parts; j++)
-            sums[i][j] = sums[i+1][2*j] + sums[i+1][2*j+1];
+
+    for (k = 0; k <= kmax; k++) {
+        res     = &data[pred_order];
+        res_end = &data[n >> pmax];
+        for (i = 0; i < parts; i++) {
+            if (kmax) {
+                uint64_t sum = (1LL + k) * (res_end - res);
+                while (res < res_end)
+                    sum += *(res++) >> k;
+                sums[k][i] = sum;
+            } else {
+                uint64_t sum = 0;
+                while (res < res_end)
+                    sum += *(res++);
+                sums[k][i] = sum;
+            }
+            res_end += n >> pmax;
+        }
     }
 }
 
+static void calc_sum_next(int level, uint64_t sums[32][MAX_PARTITIONS], int kmax)
+{
+    int i, k;
+    int parts = (1 << level); /* entries written per row k */
+    for (i = 0; i < parts; i++) {
+        for (k=0; k<=kmax; k++) /* rows 0..kmax inclusive */
+            sums[k][i] = sums[k][2*i] + sums[k][2*i+1]; /* in place: entry i = sum of old entries 2i and 2i+1 */
+    }
+}
 
-static uint64_t calc_rice_params(RiceContext *rc, int pmin, int pmax,
-                                 int32_t *data, int n, int pred_order)
+static uint64_t calc_rice_params(RiceContext *rc,
+                                 uint32_t udata[FLAC_MAX_BLOCKSIZE],
+                                 uint64_t sums[32][MAX_PARTITIONS],
+                                 int pmin, int pmax,
+                                 const int32_t *data, int n, int pred_order, int exact)
 {
     int i;
     uint64_t bits[MAX_PARTITION_ORDER+1];
     int opt_porder;
     RiceContext tmp_rc;
-    uint64_t sums[MAX_PARTITION_ORDER + 1][MAX_PARTITIONS] = { { 0 } };
+    int kmax = (1 << rc->coding_mode) - 2;
 
-    assert(pmin >= 0 && pmin <= MAX_PARTITION_ORDER);
-    assert(pmax >= 0 && pmax <= MAX_PARTITION_ORDER);
-    assert(pmin <= pmax);
+    av_assert1(pmin >= 0 && pmin <= MAX_PARTITION_ORDER);
+    av_assert1(pmax >= 0 && pmax <= MAX_PARTITION_ORDER);
+    av_assert1(pmin <= pmax);
 
     tmp_rc.coding_mode = rc->coding_mode;
 
     for (i = 0; i < n; i++)
-        rc->udata[i] = (2 * data[i]) ^ (data[i] >> 31);
+        udata[i] = (2 * data[i]) ^ (data[i] >> 31);
 
-    calc_sums(pmin, pmax, rc->udata, n, pred_order, sums);
+    calc_sum_top(pmax, exact ? kmax : 0, udata, n, pred_order, sums);
 
     opt_porder = pmin;
     bits[pmin] = UINT32_MAX;
-    for (i = pmin; i <= pmax; i++) {
-        bits[i] = calc_optimal_rice_params(&tmp_rc, i, sums[i], n, pred_order);
-        if (bits[i] <= bits[opt_porder]) {
+    for (i = pmax; ; ) {
+        bits[i] = calc_optimal_rice_params(&tmp_rc, i, sums, n, pred_order, kmax, exact);
+        if (bits[i] < bits[opt_porder] || pmax == pmin) {
             opt_porder = i;
             *rc = tmp_rc;
         }
+        if (i == pmin)
+            break;
+        calc_sum_next(--i, sums, exact ? kmax : 0);
     }
 
     return bits[opt_porder];
@@ -686,8 +760,8 @@ static uint64_t find_subframe_rice_params(FlacEncodeContext *s,
     uint64_t bits = 8 + pred_order * sub->obits + 2 + sub->rc.coding_mode;
     if (sub->type == FLAC_SUBFRAME_LPC)
         bits += 4 + 5 + pred_order * s->options.lpc_coeff_precision;
-    bits += calc_rice_params(&sub->rc, pmin, pmax, sub->residual,
-                             s->frame.blocksize, pred_order);
+    bits += calc_rice_params(&sub->rc, sub->rc_udata, sub->rc_sums, pmin, pmax, sub->residual,
+                             s->frame.blocksize, pred_order, s->options.exact_rice_parameters);
     return bits;
 }
 
@@ -826,8 +900,13 @@ static int encode_residual_ch(FlacEncodeContext *s, int ch)
             order = av_clip(order, min_order - 1, max_order - 1);
             if (order == last_order)
                 continue;
-            s->flac_dsp.lpc_encode(res, smp, n, order+1, coefs[order],
-                                   shift[order]);
+            if (s->bps_code * 4 + s->options.lpc_coeff_precision + av_log2(order) <= 32) {
+                s->flac_dsp.lpc16_encode(res, smp, n, order+1, coefs[order],
+                                         shift[order]);
+            } else {
+                s->flac_dsp.lpc32_encode(res, smp, n, order+1, coefs[order],
+                                         shift[order]);
+            }
             bits[i] = find_subframe_rice_params(s, sub, order+1);
             if (bits[i] < bits[opt_index]) {
                 opt_index = i;
@@ -841,7 +920,11 @@ static int encode_residual_ch(FlacEncodeContext *s, int ch)
         opt_order = 0;
         bits[0]   = UINT32_MAX;
         for (i = min_order-1; i < max_order; i++) {
-            s->flac_dsp.lpc_encode(res, smp, n, i+1, coefs[i], shift[i]);
+            if (s->bps_code * 4 + s->options.lpc_coeff_precision + av_log2(i) <= 32) {
+                s->flac_dsp.lpc16_encode(res, smp, n, i+1, coefs[i], shift[i]);
+            } else {
+                s->flac_dsp.lpc32_encode(res, smp, n, i+1, coefs[i], shift[i]);
+            }
             bits[i] = find_subframe_rice_params(s, sub, i+1);
             if (bits[i] < bits[opt_order])
                 opt_order = i;
@@ -859,7 +942,11 @@ static int encode_residual_ch(FlacEncodeContext *s, int ch)
             for (i = last-step; i <= last+step; i += step) {
                 if (i < min_order-1 || i >= max_order || bits[i] < UINT32_MAX)
                     continue;
-                s->flac_dsp.lpc_encode(res, smp, n, i+1, coefs[i], shift[i]);
+                if (s->bps_code * 4 + s->options.lpc_coeff_precision + av_log2(i) <= 32) {
+                    s->flac_dsp.lpc16_encode(res, smp, n, i+1, coefs[i], shift[i]);
+                } else { /* estimate exceeds 32: use lpc32, matching the other call sites */
+                    s->flac_dsp.lpc32_encode(res, smp, n, i+1, coefs[i], shift[i]);
+                }
                 bits[i] = find_subframe_rice_params(s, sub, i+1);
                 if (bits[i] < bits[opt_order])
                     opt_order = i;
@@ -868,13 +955,60 @@ static int encode_residual_ch(FlacEncodeContext *s, int ch)
         opt_order++;
     }
 
+    if (s->options.multi_dim_quant) {
+        int allsteps = 1;
+        int i, step, improved;
+        int64_t best_score = INT64_MAX;
+        int32_t qmax;
+
+        qmax = (1 << (s->options.lpc_coeff_precision - 1)) - 1;
+
+        for (i=0; i<opt_order; i++)
+            allsteps *= 3;
+
+        do {
+            improved = 0;
+            for (step = 0; step < allsteps; step++) {
+                int tmp = step;
+                int32_t lpc_try[MAX_LPC_ORDER];
+                int64_t score = 0;
+                int diffsum = 0;
+
+                for (i=0; i<opt_order; i++) {
+                    int diff = ((tmp + 1) % 3) - 1;
+                    lpc_try[i] = av_clip(coefs[opt_order - 1][i] + diff, -qmax, qmax);
+                    tmp /= 3;
+                    diffsum += !!diff;
+                }
+                if (diffsum >8)
+                    continue;
+
+                if (s->bps_code * 4 + s->options.lpc_coeff_precision + av_log2(opt_order - 1) <= 32) {
+                    s->flac_dsp.lpc16_encode(res, smp, n, opt_order, lpc_try, shift[opt_order-1]);
+                } else {
+                    s->flac_dsp.lpc32_encode(res, smp, n, opt_order, lpc_try, shift[opt_order-1]);
+                }
+                score = find_subframe_rice_params(s, sub, opt_order);
+                if (score < best_score) {
+                    best_score = score;
+                    memcpy(coefs[opt_order-1], lpc_try, sizeof(*coefs));
+                    improved=1;
+                }
+            }
+        } while(improved);
+    }
+
     sub->order     = opt_order;
     sub->type_code = sub->type | (sub->order-1);
     sub->shift     = shift[sub->order-1];
     for (i = 0; i < sub->order; i++)
         sub->coefs[i] = coefs[sub->order-1][i];
 
-    s->flac_dsp.lpc_encode(res, smp, n, sub->order, sub->coefs, sub->shift);
+    if (s->bps_code * 4 + s->options.lpc_coeff_precision + av_log2(opt_order) <= 32) {
+        s->flac_dsp.lpc16_encode(res, smp, n, sub->order, sub->coefs, sub->shift);
+    } else {
+        s->flac_dsp.lpc32_encode(res, smp, n, sub->order, sub->coefs, sub->shift);
+    }
 
     find_subframe_rice_params(s, sub, sub->order);
 
@@ -909,7 +1043,7 @@ static int count_frame_header(FlacEncodeContext *s)
         count += 16;
 
     /* explicit sample rate */
-    count += ((s->sr_code[0] == 12) + (s->sr_code[0] > 12)) * 8;
+    count += ((s->sr_code[0] == 12) + (s->sr_code[0] > 12) * 2) * 8;
 
     /* frame header CRC-8 */
     count += 8;
@@ -953,7 +1087,7 @@ static void remove_wasted_bits(FlacEncodeContext *s)
         }
 
         if (v && !(v & 1)) {
-            v = av_ctz(v);
+            v = ff_ctz(v);
 
             for (i = 0; i < s->frame.blocksize; i++)
                 sub->samples[i] >>= v;
@@ -970,7 +1104,7 @@ static void remove_wasted_bits(FlacEncodeContext *s)
 }
 
 
-static int estimate_stereo_mode(int32_t *left_ch, int32_t *right_ch, int n,
+static int estimate_stereo_mode(const int32_t *left_ch, const int32_t *right_ch, int n,
                                 int max_rice_param)
 {
     int i, best;
@@ -1210,9 +1344,7 @@ static int update_md5_sum(FlacEncodeContext *s, const void *samples)
 
         for (i = 0; i < s->frame.blocksize * s->channels; i++) {
             int32_t v = samples0[i] >> 8;
-            *tmp++    = (v      ) & 0xFF;
-            *tmp++    = (v >>  8) & 0xFF;
-            *tmp++    = (v >> 16) & 0xFF;
+            AV_WL24(tmp + 3*i, v);
         }
         buf = s->md5_buffer;
     }
@@ -1286,10 +1418,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
         }
     }
 
-    if ((ret = ff_alloc_packet(avpkt, frame_bytes))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, frame_bytes, 0)) < 0)
         return ret;
-    }
 
     out_bytes = write_frame(s, avpkt);
 
@@ -1336,7 +1466,7 @@ static const AVOption options[] = {
 { "fixed",    NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_LPC_TYPE_FIXED },    INT_MIN, INT_MAX, FLAGS, "lpc_type" },
 { "levinson", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_LPC_TYPE_LEVINSON }, INT_MIN, INT_MAX, FLAGS, "lpc_type" },
 { "cholesky", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_LPC_TYPE_CHOLESKY }, INT_MIN, INT_MAX, FLAGS, "lpc_type" },
-{ "lpc_passes", "Number of passes to use for Cholesky factorization during LPC analysis", offsetof(FlacEncodeContext, options.lpc_passes),  AV_OPT_TYPE_INT, {.i64 = 1 }, 1, INT_MAX, FLAGS },
+{ "lpc_passes", "Number of passes to use for Cholesky factorization during LPC analysis", offsetof(FlacEncodeContext, options.lpc_passes),  AV_OPT_TYPE_INT, {.i64 = 2 }, 1, INT_MAX, FLAGS },
 { "min_partition_order",  NULL, offsetof(FlacEncodeContext, options.min_partition_order),  AV_OPT_TYPE_INT, {.i64 = -1 },      -1, MAX_PARTITION_ORDER, FLAGS },
 { "max_partition_order",  NULL, offsetof(FlacEncodeContext, options.max_partition_order),  AV_OPT_TYPE_INT, {.i64 = -1 },      -1, MAX_PARTITION_ORDER, FLAGS },
 { "prediction_order_method", "Search method for selecting prediction order", offsetof(FlacEncodeContext, options.prediction_order_method), AV_OPT_TYPE_INT, {.i64 = -1 }, -1, ORDER_METHOD_LOG, FLAGS, "predm" },
@@ -1352,6 +1482,8 @@ static const AVOption options[] = {
 { "left_side",  NULL, 0, AV_OPT_TYPE_CONST, { .i64 = FLAC_CHMODE_LEFT_SIDE   }, INT_MIN, INT_MAX, FLAGS, "ch_mode" },
 { "right_side", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = FLAC_CHMODE_RIGHT_SIDE  }, INT_MIN, INT_MAX, FLAGS, "ch_mode" },
 { "mid_side",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = FLAC_CHMODE_MID_SIDE    }, INT_MIN, INT_MAX, FLAGS, "ch_mode" },
+{ "exact_rice_parameters", "Calculate rice parameters exactly", offsetof(FlacEncodeContext, options.exact_rice_parameters), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, FLAGS },
+{ "multi_dim_quant",       "Multi-dimensional quantization",    offsetof(FlacEncodeContext, options.multi_dim_quant),       AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, FLAGS },
 { "min_prediction_order", NULL, offsetof(FlacEncodeContext, options.min_prediction_order), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, MAX_LPC_ORDER, FLAGS },
 { "max_prediction_order", NULL, offsetof(FlacEncodeContext, options.max_prediction_order), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, MAX_LPC_ORDER, FLAGS },
 
@@ -1359,10 +1491,10 @@ static const AVOption options[] = {
 };
 
 static const AVClass flac_encoder_class = {
-    "FLAC encoder",
-    av_default_item_name,
-    options,
-    LIBAVUTIL_VERSION_INT,
+    .class_name = "FLAC encoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
 };
 
 AVCodec ff_flac_encoder = {
@@ -1374,7 +1506,7 @@ AVCodec ff_flac_encoder = {
     .init           = flac_encode_init,
     .encode2        = flac_encode_frame,
     .close          = flac_encode_close,
-    .capabilities   = AV_CODEC_CAP_SMALL_LAST_FRAME | AV_CODEC_CAP_DELAY,
+    .capabilities   = AV_CODEC_CAP_SMALL_LAST_FRAME | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_LOSSLESS,
     .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
                                                      AV_SAMPLE_FMT_S32,
                                                      AV_SAMPLE_FMT_NONE },
diff --git a/libavcodec/flashsv.c b/libavcodec/flashsv.c
index 2cf8f3f..90e1d43 100644
--- a/libavcodec/flashsv.c
+++ b/libavcodec/flashsv.c
@@ -3,20 +3,20 @@
  * Copyright (C) 2004 Alex Beregszaszi
  * Copyright (C) 2006 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -69,7 +69,7 @@ typedef struct FlashSVContext {
     int             diff_start, diff_height;
 } FlashSVContext;
 
-static int decode_hybrid(const uint8_t *sptr, uint8_t *dptr, int dx, int dy,
+static int decode_hybrid(const uint8_t *sptr, const uint8_t *sptr_end, uint8_t *dptr, int dx, int dy,
                          int h, int w, int stride, const uint32_t *pal)
 {
     int x, y;
@@ -78,6 +78,8 @@ static int decode_hybrid(const uint8_t *sptr, uint8_t *dptr, int dx, int dy,
     for (y = dx + h; y > dx; y--) {
         uint8_t *dst = dptr + (y * stride) + dy * 3;
         for (x = 0; x < w; x++) {
+            if (sptr >= sptr_end)
+                return AVERROR_INVALIDDATA;
             if (*sptr & 0x80) {
                 /* 15-bit color */
                 unsigned c = AV_RB16(sptr) & ~0x8000;
@@ -107,7 +109,7 @@ static av_cold int flashsv_decode_end(AVCodecContext *avctx)
     av_frame_free(&s->frame);
 
     /* free the tmpblock */
-    av_free(s->tmpblock);
+    av_freep(&s->tmpblock);
 
     return 0;
 }
@@ -142,6 +144,9 @@ static int flashsv2_prime(FlashSVContext *s, uint8_t *src, int size)
     z_stream zs;
     int zret; // Zlib return code
 
+    if (!src)
+        return AVERROR_INVALIDDATA;
+
     zs.zalloc = NULL;
     zs.zfree  = NULL;
     zs.opaque = NULL;
@@ -152,7 +157,8 @@ static int flashsv2_prime(FlashSVContext *s, uint8_t *src, int size)
     s->zstream.avail_out = s->block_size * 3;
     inflate(&s->zstream, Z_SYNC_FLUSH);
 
-    deflateInit(&zs, 0);
+    if (deflateInit(&zs, 0) != Z_OK)
+        return -1;
     zs.next_in   = s->tmpblock;
     zs.avail_in  = s->block_size * 3 - s->zstream.avail_out;
     zs.next_out  = s->deflate_block;
@@ -228,10 +234,15 @@ static int flashsv_decode_block(AVCodecContext *avctx, AVPacket *avpkt,
         }
     } else {
         /* hybrid 15-bit/palette mode */
-        decode_hybrid(s->tmpblock, s->frame->data[0],
+        ret = decode_hybrid(s->tmpblock, s->zstream.next_out,
+                      s->frame->data[0],
                       s->image_height - (y_pos + 1 + s->diff_height),
                       x_pos, s->diff_height, width,
                       s->frame->linesize[0], s->pal);
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_ERROR, "decode_hybrid failed\n");
+            return ret;
+        }
     }
     skip_bits_long(gb, 8 * block_size); /* skip the consumed bits */
     return 0;
@@ -260,6 +271,8 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
     FlashSVContext *s = avctx->priv_data;
     int h_blocks, v_blocks, h_part, v_part, i, j, ret;
     GetBitContext gb;
+    int last_blockwidth = s->block_width;
+    int last_blockheight= s->block_height;
 
     /* no supplementary picture */
     if (buf_size == 0)
@@ -267,7 +280,8 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
     if (buf_size < 4)
         return -1;
 
-    init_get_bits(&gb, avpkt->data, buf_size * 8);
+    if ((ret = init_get_bits8(&gb, avpkt->data, buf_size)) < 0)
+        return ret;
 
     /* start to parse the bitstream */
     s->block_width  = 16 * (get_bits(&gb, 4) + 1);
@@ -275,6 +289,10 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
     s->block_height = 16 * (get_bits(&gb, 4) + 1);
     s->image_height = get_bits(&gb, 12);
 
+    if (   last_blockwidth != s->block_width
+        || last_blockheight!= s->block_height)
+        av_freep(&s->blocks);
+
     if (s->ver == 2) {
         skip_bits(&gb, 6);
         if (get_bits1(&gb)) {
@@ -322,8 +340,8 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
 
     /* initialize the image size once */
     if (avctx->width == 0 && avctx->height == 0) {
-        avctx->width  = s->image_width;
-        avctx->height = s->image_height;
+        if ((ret = ff_set_dimensions(avctx, s->image_width, s->image_height)) < 0)
+            return ret;
     }
 
     /* check for changes of image width and image height */
@@ -339,24 +357,20 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
     s->is_keyframe = (avpkt->flags & AV_PKT_FLAG_KEY) && (s->ver == 2);
     if (s->is_keyframe) {
         int err;
-        int nb_blocks = (v_blocks + !!v_part) *
-                        (h_blocks + !!h_part) * sizeof(s->blocks[0]);
         if ((err = av_reallocp(&s->keyframedata, avpkt->size)) < 0)
             return err;
         memcpy(s->keyframedata, avpkt->data, avpkt->size);
-        if ((err = av_reallocp(&s->blocks, nb_blocks)) < 0)
-            return err;
-        memset(s->blocks, 0, nb_blocks);
     }
+    if(s->ver == 2 && !s->blocks)
+        s->blocks = av_mallocz((v_blocks + !!v_part) * (h_blocks + !!h_part) *
+                               sizeof(s->blocks[0]));
 
     ff_dlog(avctx, "image: %dx%d block: %dx%d num: %dx%d part: %dx%d\n",
             s->image_width, s->image_height, s->block_width, s->block_height,
             h_blocks, v_blocks, h_part, v_part);
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
     /* loop over all block columns */
     for (j = 0; j < v_blocks + (v_part ? 1 : 0); j++) {
@@ -399,6 +413,10 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
                 }
 
                 if (has_diff) {
+                    if (size < 3) {
+                        av_log(avctx, AV_LOG_ERROR, "size too small for diff\n");
+                        return AVERROR_INVALIDDATA;
+                    }
                     if (!s->keyframe) {
                         av_log(avctx, AV_LOG_ERROR,
                                "Inter frame without keyframe\n");
@@ -426,6 +444,10 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
                     int row = get_bits(&gb, 8);
                     av_log(avctx, AV_LOG_DEBUG, "%dx%d zlibprime_curr %dx%d\n",
                            i, j, col, row);
+                    if (size < 3) {
+                        av_log(avctx, AV_LOG_ERROR, "size too small for zlibprime_curr\n");
+                        return AVERROR_INVALIDDATA;
+                    }
                     size -= 2;
                     avpriv_request_sample(avctx, "zlibprime_curr");
                     return AVERROR_PATCHWELCOME;
diff --git a/libavcodec/flashsv2enc.c b/libavcodec/flashsv2enc.c
new file mode 100644
index 0000000..65db112
--- /dev/null
+++ b/libavcodec/flashsv2enc.c
@@ -0,0 +1,922 @@
+/*
+ * Flash Screen Video Version 2 encoder
+ * Copyright (C) 2009 Joshua Warner
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Flash Screen Video Version 2 encoder
+ * @author Joshua Warner
+ */
+
+/* Differences from version 1 stream:
+ * NOTE: Currently, the only player that supports version 2 streams is Adobe Flash Player itself.
+ * * Supports sending only a range of scanlines in a block,
+ *   indicating a difference from the corresponding block in the last keyframe.
+ * * Supports initializing the zlib dictionary with data from the corresponding
+ *   block in the last keyframe, to improve compression.
+ * * Supports a hybrid 15-bit rgb / 7-bit palette color space.
+ */
+
+/* TODO:
+ * Don't keep Block structures for both current frame and keyframe.
+ * Make better heuristics for deciding stream parameters (optimum_* functions).  Currently these return constants.
+ * Figure out how to encode palette information in the stream, choose an optimum palette at each keyframe.
+ * Figure out how the zlibPrimeCompressCurrent flag works, implement support.
+ * Find other sample files (that weren't generated here), develop a decoder.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <zlib.h>
+
+#include "libavutil/imgutils.h"
+#include "avcodec.h"
+#include "internal.h"
+#include "put_bits.h"
+#include "bytestream.h"
+
+#define HAS_IFRAME_IMAGE 0x02
+#define HAS_PALLET_INFO 0x01
+
+#define COLORSPACE_BGR 0x00
+#define COLORSPACE_15_7 0x10
+#define HAS_DIFF_BLOCKS 0x04
+#define ZLIB_PRIME_COMPRESS_CURRENT 0x02
+#define ZLIB_PRIME_COMPRESS_PREVIOUS 0x01
+
+// Disables experimental "smart" parameter-choosing code, as well as the statistics that it depends on.
+// At the moment, the "smart" code is a great example of how the parameters *shouldn't* be chosen.
+#define FLASHSV2_DUMB
+
+typedef struct Block {
+    uint8_t *enc;
+    uint8_t *sl_begin, *sl_end;
+    int enc_size;
+    uint8_t *data;
+    unsigned long data_size;
+
+    uint8_t start, len;
+    uint8_t dirty;
+    uint8_t col, row, width, height;
+    uint8_t flags;
+} Block;
+
+typedef struct Palette {
+    unsigned colors[128];
+    uint8_t index[1 << 15];
+} Palette;
+
+typedef struct FlashSV2Context {
+    AVCodecContext *avctx;
+    uint8_t *current_frame;
+    uint8_t *key_frame;
+    uint8_t *encbuffer;
+    uint8_t *keybuffer;
+    uint8_t *databuffer;
+
+    uint8_t *blockbuffer;
+    int blockbuffer_size;
+
+    Block *frame_blocks;
+    Block *key_blocks;
+    int frame_size;
+    int blocks_size;
+
+    int use15_7, dist, comp;
+
+    int rows, cols;
+
+    int last_key_frame;
+
+    int image_width, image_height;
+    int block_width, block_height;
+    uint8_t flags;
+    uint8_t use_custom_palette;
+    uint8_t palette_type;       ///< 0=>default, 1=>custom - changed when palette regenerated.
+    Palette palette;
+#ifndef FLASHSV2_DUMB
+    double tot_blocks;          ///< blocks encoded since last keyframe
+    double diff_blocks;         ///< blocks that were different since last keyframe
+    double tot_lines;           ///< total scanlines in image since last keyframe
+    double diff_lines;          ///< scanlines that were different since last keyframe
+    double raw_size;            ///< size of raw frames since last keyframe
+    double comp_size;           ///< size of compressed data since last keyframe
+    double uncomp_size;         ///< size of uncompressed data since last keyframe
+
+    double total_bits;          ///< total bits written to stream so far
+#endif
+} FlashSV2Context;
+
+static av_cold void cleanup(FlashSV2Context * s)
+{
+    av_freep(&s->encbuffer);
+    av_freep(&s->keybuffer);
+    av_freep(&s->databuffer);
+    av_freep(&s->blockbuffer);
+    av_freep(&s->current_frame);
+    av_freep(&s->key_frame);
+
+    av_freep(&s->frame_blocks);
+    av_freep(&s->key_blocks);
+}
+
+static void init_blocks(FlashSV2Context * s, Block * blocks,
+                        uint8_t * encbuf, uint8_t * databuf)
+{
+    int row, col;
+    Block *b;
+    for (col = 0; col < s->cols; col++) {
+        for (row = 0; row < s->rows; row++) {
+            b = blocks + (col + row * s->cols);
+            b->width = (col < s->cols - 1) ?
+                s->block_width :
+                s->image_width - col * s->block_width;
+
+            b->height = (row < s->rows - 1) ?
+                s->block_height :
+                s->image_height - row * s->block_height;
+
+            b->row   = row;
+            b->col   = col;
+            b->enc   = encbuf;
+            b->data  = databuf;
+            encbuf  += b->width * b->height * 3;
+            databuf += !databuf ? 0 : b->width * b->height * 6;
+        }
+    }
+}
+
+static void reset_stats(FlashSV2Context * s)
+{
+#ifndef FLASHSV2_DUMB
+    s->diff_blocks = 0.1;
+    s->tot_blocks = 1;
+    s->diff_lines = 0.1;
+    s->tot_lines = 1;
+    s->raw_size = s->comp_size = s->uncomp_size = 10;
+#endif
+}
+
+/* Encoder init: validates compression level and dimensions, derives the block grid, allocates buffers; returns 0 or -1. */
+static av_cold int flashsv2_encode_init(AVCodecContext * avctx)
+{
+    FlashSV2Context *s = avctx->priv_data;
+
+    s->avctx = avctx;
+    s->comp = avctx->compression_level;
+    if (s->comp == -1)
+        s->comp = 9;
+    if (s->comp < 0 || s->comp > 9) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Compression level should be 0-9, not %d\n", s->comp);
+        return -1;
+    }
+
+    /* write_header() stores width and height in 12-bit fields, hence the 4095 cap. */
+    if ((avctx->width > 4095) || (avctx->height > 4095)) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Input dimensions too large, input must be max 4095x4095 !\n");
+        return -1;
+    }
+    if ((avctx->width < 16) || (avctx->height < 16)) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Input dimensions too small, input must be at least 16x16 !\n");
+        return -1;
+    }
+
+    if (av_image_check_size(avctx->width, avctx->height, 0, avctx) < 0)
+        return -1;
+
+    s->last_key_frame = 0;
+
+    s->image_width  = avctx->width;
+    s->image_height = avctx->height;
+
+    /* Block size: 1/12 of the image rounded down to a multiple of 16, but at least 1. */
+    s->block_width  = (s->image_width /  12) & ~15;
+    s->block_height = (s->image_height / 12) & ~15;
+
+    if(!s->block_width)
+        s->block_width = 1;
+    if(!s->block_height)
+        s->block_height = 1;
+
+    s->rows = (s->image_height + s->block_height - 1) / s->block_height;
+    s->cols = (s->image_width +  s->block_width -  1) / s->block_width;
+
+    s->frame_size  = s->image_width * s->image_height * 3;
+    s->blocks_size = s->rows * s->cols * sizeof(Block);
+
+    s->encbuffer     = av_mallocz(s->frame_size);
+    s->keybuffer     = av_mallocz(s->frame_size);
+    s->databuffer    = av_mallocz(s->frame_size * 6);
+    s->current_frame = av_mallocz(s->frame_size);
+    s->key_frame     = av_mallocz(s->frame_size);
+    s->frame_blocks  = av_mallocz(s->blocks_size);
+    s->key_blocks    = av_mallocz(s->blocks_size);
+
+    s->blockbuffer      = NULL;
+    s->blockbuffer_size = 0;
+    /* Bail out before init_blocks(), which writes into the block arrays. */
+    if (!s->encbuffer || !s->keybuffer || !s->databuffer
+        || !s->current_frame || !s->key_frame || !s->key_blocks
+        || !s->frame_blocks) {
+        av_log(avctx, AV_LOG_ERROR, "Memory allocation failed.\n");
+        cleanup(s);
+        return -1;
+    }
+
+    init_blocks(s, s->frame_blocks, s->encbuffer, s->databuffer);
+    init_blocks(s, s->key_blocks,   s->keybuffer, 0);
+    reset_stats(s);
+#ifndef FLASHSV2_DUMB
+    s->total_bits = 1;
+#endif
+
+    s->use_custom_palette =  0;
+    s->palette_type       = -1;        // so that the palette will be generated in reconfigure_at_keyframe
+
+    return 0;
+}
+
+static int new_key_frame(FlashSV2Context * s)
+{
+    int i;
+    memcpy(s->key_blocks, s->frame_blocks, s->blocks_size);
+    memcpy(s->key_frame, s->current_frame, s->frame_size);
+
+    for (i = 0; i < s->rows * s->cols; i++) {
+        s->key_blocks[i].enc += (s->keybuffer - s->encbuffer);
+        s->key_blocks[i].sl_begin = 0;
+        s->key_blocks[i].sl_end   = 0;
+        s->key_blocks[i].data     = 0;
+    }
+    memcpy(s->keybuffer, s->encbuffer, s->frame_size);
+
+    return 0;
+}
+
+static int write_palette(FlashSV2Context * s, uint8_t * buf, int buf_size)
+{
+    //this isn't implemented yet!  Default palette only!
+    return -1;
+}
+
+static int write_header(FlashSV2Context * s, uint8_t * buf, int buf_size)
+{
+    PutBitContext pb;
+    int buf_pos, len;
+
+    if (buf_size < 5)
+        return -1;
+
+    init_put_bits(&pb, buf, buf_size);
+
+    put_bits(&pb, 4, (s->block_width  >> 4) - 1);
+    put_bits(&pb, 12, s->image_width);
+    put_bits(&pb, 4, (s->block_height >> 4) - 1);
+    put_bits(&pb, 12, s->image_height);
+
+    flush_put_bits(&pb);
+    buf_pos = 4;
+
+    buf[buf_pos++] = s->flags;
+
+    if (s->flags & HAS_PALLET_INFO) {
+        len = write_palette(s, buf + buf_pos, buf_size - buf_pos);
+        if (len < 0)
+            return -1;
+        buf_pos += len;
+    }
+
+    return buf_pos;
+}
+
+/* Serialize one block: 16-bit big-endian size, then (if non-empty) flags, optional diff range / prime position, and b->data. Returns bytes written or -1. */
+static int write_block(Block * b, uint8_t * buf, int buf_size)
+{
+    int buf_pos = 0;
+    unsigned block_size = b->data_size;
+
+    if (b->flags & HAS_DIFF_BLOCKS)
+        block_size += 2;
+    if (b->flags & ZLIB_PRIME_COMPRESS_CURRENT)
+        block_size += 2;
+    if (block_size > 0)
+        block_size += 1;
+    /* The size field is only 16 bits wide; refuse blocks that would be truncated. */
+    if (b->data_size > 0xFFFF || block_size > 0xFFFF || buf_size < block_size + 2)
+        return -1;
+
+    buf[buf_pos++] = block_size >> 8;
+    buf[buf_pos++] = block_size;
+
+    if (block_size == 0)
+        return buf_pos;
+
+    buf[buf_pos++] = b->flags;
+
+    if (b->flags & HAS_DIFF_BLOCKS) {
+        buf[buf_pos++] = (b->start);
+        buf[buf_pos++] = (b->len);
+    }
+
+    if (b->flags & ZLIB_PRIME_COMPRESS_CURRENT) {
+        //This feature of the format is poorly understood, and as of now, unused.
+        buf[buf_pos++] = (b->col);
+        buf[buf_pos++] = (b->row);
+    }
+
+    memcpy(buf + buf_pos, b->data, b->data_size);
+    buf_pos += b->data_size;
+    return buf_pos;
+}
+
+static int encode_zlib(Block * b, uint8_t * buf, unsigned long *buf_size, int comp)
+{
+    int res = compress2(buf, buf_size, b->sl_begin, b->sl_end - b->sl_begin, comp);
+    return res == Z_OK ? 0 : -1;
+}
+
+/* Deflate b's [sl_begin, sl_end) into buf after first running prime->enc through the same stream; the final pass turns *buf_size from capacity into bytes used. Returns 0 or -1. */
+static int encode_zlibprime(Block * b, Block * prime, uint8_t * buf,
+                            int *buf_size, int comp)
+{
+    z_stream s;
+    int res;
+    s.zalloc = NULL;
+    s.zfree  = NULL;
+    s.opaque = NULL;
+    if (deflateInit(&s, comp) < 0)
+        return -1;
+
+    /* Priming pass: each round writes to the start of buf, and the final pass below overwrites it. */
+    s.next_in  = prime->enc;
+    s.avail_in = prime->enc_size;
+    while (s.avail_in > 0) {
+        s.next_out  = buf;
+        s.avail_out = *buf_size;
+        if (deflate(&s, Z_SYNC_FLUSH) < 0) {
+            deflateEnd(&s);    /* do not leak the zlib state on failure */
+            return -1;
+        }
+    }
+
+    s.next_in   = b->sl_begin;
+    s.avail_in  = b->sl_end - b->sl_begin;
+    s.next_out  = buf;
+    s.avail_out = *buf_size;
+    res = deflate(&s, Z_FINISH);
+    deflateEnd(&s);
+    *buf_size -= s.avail_out;
+    return res == Z_STREAM_END ? 0 : -1;
+}
+
+static int encode_bgr(Block * b, const uint8_t * src, int stride)
+{
+    int i;
+    uint8_t *ptr = b->enc;
+    for (i = 0; i < b->start; i++)
+        memcpy(ptr + i * b->width * 3, src + i * stride, b->width * 3);
+    b->sl_begin = ptr + i * b->width * 3;
+    for (; i < b->start + b->len; i++)
+        memcpy(ptr + i * b->width * 3, src + i * stride, b->width * 3);
+    b->sl_end = ptr + i * b->width * 3;
+    for (; i < b->height; i++)
+        memcpy(ptr + i * b->width * 3, src + i * stride, b->width * 3);
+    b->enc_size = ptr + i * b->width * 3 - b->enc;
+    return b->enc_size;
+}
+
+static inline unsigned pixel_color15(const uint8_t * src)
+{
+    return (src[0] >> 3) | ((src[1] & 0xf8) << 2) | ((src[2] & 0xf8) << 7);
+}
+
+static inline unsigned int chroma_diff(unsigned int c1, unsigned int c2)
+{
+#define ABSDIFF(a,b) (abs((int)(a)-(int)(b)))
+
+    unsigned int t1 = (c1 & 0x000000ff) + ((c1 & 0x0000ff00) >> 8) + ((c1 & 0x00ff0000) >> 16);
+    unsigned int t2 = (c2 & 0x000000ff) + ((c2 & 0x0000ff00) >> 8) + ((c2 & 0x00ff0000) >> 16);
+
+    return ABSDIFF(t1, t2) + ABSDIFF(c1 & 0x000000ff, c2 & 0x000000ff) +
+        ABSDIFF((c1 & 0x0000ff00) >> 8 , (c2 & 0x0000ff00) >> 8) +
+        ABSDIFF((c1 & 0x00ff0000) >> 16, (c2 & 0x00ff0000) >> 16);
+}
+
+static inline int pixel_color7_fast(Palette * palette, unsigned c15)
+{
+    return palette->index[c15];
+}
+
+/* Exhaustively search the 128-entry palette for the entry with the smallest chroma_diff() to color; returns its index. */
+static int pixel_color7_slow(Palette * palette, unsigned color)
+{
+    int best_idx = -1, best_diff = 0x7fffffff, idx;
+    for (idx = 0; idx < 128; idx++) {
+        int cand = palette->colors[idx];
+        int diff = chroma_diff(cand, color);
+        if (diff >= best_diff)
+            continue;           /* only a strictly smaller difference replaces the best */
+        best_diff = diff;
+        best_idx  = idx;
+    }
+    return best_idx;
+}
+
+static inline unsigned pixel_bgr(const uint8_t * src)
+{
+    return (src[0]) | (src[1] << 8) | (src[2] << 16);
+}
+
+/* Emit the 3-byte pixel at src as one palette byte when dist + 15-bit error >= palette error, else as a 2-byte 15-bit color; returns bytes written. */
+static int write_pixel_15_7(Palette * palette, uint8_t * dest, const uint8_t * src,
+                            int dist)
+{
+    unsigned rgb15 = pixel_color15(src);
+    unsigned bgr24 = pixel_bgr(src);
+    int pal_idx    = pixel_color7_fast(palette, rgb15);
+    int err15      = chroma_diff(bgr24, bgr24 & 0x00f8f8f8);
+    int err7       = chroma_diff(bgr24, palette->colors[pal_idx]);
+    if (dist + err15 < err7) {
+        dest[0] = 0x80 | (rgb15 >> 8);   /* high bit set: first byte of a 15-bit color */
+        dest[1] = rgb15 & 0xff;
+        return 2;
+    }
+    dest[0] = pal_idx;
+    return 1;
+}
+
+static int update_palette_index(Palette * palette)
+{
+    int r, g, b;
+    unsigned int bgr, c15, index;
+    for (r = 4; r < 256; r += 8) {
+        for (g = 4; g < 256; g += 8) {
+            for (b = 4; b < 256; b += 8) {
+                bgr = b | (g << 8) | (r << 16);
+                c15 = (b >> 3) | ((g & 0xf8) << 2) | ((r & 0xf8) << 7);
+                index = pixel_color7_slow(palette, bgr);
+
+                palette->index[c15] = index;
+            }
+        }
+    }
+    return 0;
+}
+/* Default 128-entry palette copied into Palette.colors by generate_default_palette(); NOTE(review): channel packing presumably matches pixel_bgr() - confirm. */
+static const unsigned int default_screen_video_v2_palette[128] = {
+    0x00000000, 0x00333333, 0x00666666, 0x00999999, 0x00CCCCCC, 0x00FFFFFF,
+    0x00330000, 0x00660000, 0x00990000, 0x00CC0000, 0x00FF0000, 0x00003300,
+    0x00006600, 0x00009900, 0x0000CC00, 0x0000FF00, 0x00000033, 0x00000066,
+    0x00000099, 0x000000CC, 0x000000FF, 0x00333300, 0x00666600, 0x00999900,
+    0x00CCCC00, 0x00FFFF00, 0x00003333, 0x00006666, 0x00009999, 0x0000CCCC,
+    0x0000FFFF, 0x00330033, 0x00660066, 0x00990099, 0x00CC00CC, 0x00FF00FF,
+    0x00FFFF33, 0x00FFFF66, 0x00FFFF99, 0x00FFFFCC, 0x00FF33FF, 0x00FF66FF,
+    0x00FF99FF, 0x00FFCCFF, 0x0033FFFF, 0x0066FFFF, 0x0099FFFF, 0x00CCFFFF,
+    0x00CCCC33, 0x00CCCC66, 0x00CCCC99, 0x00CCCCFF, 0x00CC33CC, 0x00CC66CC,
+    0x00CC99CC, 0x00CCFFCC, 0x0033CCCC, 0x0066CCCC, 0x0099CCCC, 0x00FFCCCC,
+    0x00999933, 0x00999966, 0x009999CC, 0x009999FF, 0x00993399, 0x00996699,
+    0x0099CC99, 0x0099FF99, 0x00339999, 0x00669999, 0x00CC9999, 0x00FF9999,
+    0x00666633, 0x00666699, 0x006666CC, 0x006666FF, 0x00663366, 0x00669966,
+    0x0066CC66, 0x0066FF66, 0x00336666, 0x00996666, 0x00CC6666, 0x00FF6666,
+    0x00333366, 0x00333399, 0x003333CC, 0x003333FF, 0x00336633, 0x00339933,
+    0x0033CC33, 0x0033FF33, 0x00663333, 0x00993333, 0x00CC3333, 0x00FF3333,
+    0x00003366, 0x00336600, 0x00660033, 0x00006633, 0x00330066, 0x00663300,
+    0x00336699, 0x00669933, 0x00993366, 0x00339966, 0x00663399, 0x00996633,
+    0x006699CC, 0x0099CC66, 0x00CC6699, 0x0066CC99, 0x009966CC, 0x00CC9966,
+    0x0099CCFF, 0x00CCFF99, 0x00FF99CC, 0x0099FFCC, 0x00CC99FF, 0x00FFCC99,
+    0x00111111, 0x00222222, 0x00444444, 0x00555555, 0x00AAAAAA, 0x00BBBBBB,
+    0x00DDDDDD, 0x00EEEEEE
+};
+/* Load the built-in colour table into palette->colors and rebuild the lookup index; returns update_palette_index()'s result. */
+static int generate_default_palette(Palette * palette)
+{
+    const size_t table_bytes = sizeof(default_screen_video_v2_palette);
+
+    memcpy(palette->colors, default_screen_video_v2_palette, table_bytes);
+    return update_palette_index(palette);
+}
+/* Placeholder for deriving a palette from the image; not implemented, so it always fails with a non-zero code. */
+static int generate_optimum_palette(Palette * palette, const uint8_t * image,
+                                   int width, int height, int stride)
+{
+    /* Only the default palette exists so far; report the gap with a proper AVERROR code instead of a bare -1. */
+    return AVERROR_PATCHWELCOME;
+}
+/* Encode one scanline of width 3-byte pixels into dest with write_pixel_15_7(); returns the bytes written. */
+static inline int encode_15_7_sl(Palette * palette, uint8_t * dest,
+                                 const uint8_t * src, int width, int dist)
+{
+    uint8_t *out = dest;
+    int left;
+    for (left = width; left > 0; left--, src += 3)
+        out += write_pixel_15_7(palette, out, src, dist);
+    return out - dest;
+}
+/* Encode all rows of block b into b->enc and record where rows [start, start + len) begin and end; returns enc_size. */
+static int encode_15_7(Palette * palette, Block * b, const uint8_t * src,
+                       int stride, int dist)
+{
+    uint8_t *out = b->enc;
+    int row = 0;
+    for (; row < b->start; row++)
+        out += encode_15_7_sl(palette, out, src + row * stride, b->width, dist);
+    b->sl_begin = out; /* output position where row b->start begins */
+    for (; row < b->start + b->len; row++)
+        out += encode_15_7_sl(palette, out, src + row * stride, b->width, dist);
+    b->sl_end = out;   /* output position just past row b->start + b->len - 1 */
+    for (; row < b->height; row++)
+        out += encode_15_7_sl(palette, out, src + row * stride, b->width, dist);
+    b->enc_size = out - b->enc;
+    return b->enc_size;
+}
+/* Encode block b from src, then compress it; on non-key frames the encode_zlibprime() output replaces it when smaller. Returns 0 or a helper's error. */
+static int encode_block(FlashSV2Context *s, Palette * palette, Block * b,
+                        Block * prev, const uint8_t * src, int stride, int comp,
+                        int dist, int keyframe)
+{
+    unsigned buf_size = b->width * b->height * 6;
+    uint8_t *buf = s->blockbuffer; /* scratch buffer for the encode_zlibprime() attempt */
+    int res;
+
+    if (b->flags & COLORSPACE_15_7) { /* colour format is chosen by the block flags */
+        encode_15_7(palette, b, src, stride, dist);
+    } else {
+        encode_bgr(b, src, stride);
+    }
+
+    if (b->len > 0) {
+        b->data_size = buf_size; /* passed by pointer below; presumably capacity in, compressed size out - confirm */
+        res = encode_zlib(b, b->data, &b->data_size, comp);
+        if (res)
+            return res;
+
+        if (!keyframe) {
+            res = encode_zlibprime(b, prev, buf, &buf_size, comp);
+            if (res)
+                return res;
+
+            if (buf_size < b->data_size) { /* keep whichever result is smaller */
+                b->data_size = buf_size;
+                memcpy(b->data, buf, buf_size);
+                b->flags |= ZLIB_PRIME_COMPRESS_PREVIOUS;
+            }
+        }
+    } else {
+        b->data_size = 0; /* no selected rows: the block carries no data */
+    }
+    return 0;
+}
+/* Compare one scanline of block b with the stored frame copy and the key frame, updating dirty/start/len; always returns 0. */
+static int compare_sl(FlashSV2Context * s, Block * b, const uint8_t * src,
+                      uint8_t * frame, uint8_t * key, int y, int keyframe)
+{
+    if (memcmp(src, frame, b->width * 3) != 0) { /* line differs from the stored frame copy */
+        b->dirty = 1;
+        memcpy(frame, src, b->width * 3); /* refresh the stored copy with the new line */
+#ifndef FLASHSV2_DUMB
+        s->diff_lines++;
+#endif
+    }
+    if (memcmp(src, key, b->width * 3) != 0) { /* line differs from the key frame */
+        if (b->len == 0)
+            b->start = y; /* first differing line recorded for this block */
+        b->len = y + 1 - b->start; /* extend the range through line y */
+    }
+    return 0;
+}
+/* Run compare_sl() on each scanline of each block column to refresh the blocks' change marks; always returns 0. */
+static int mark_all_blocks(FlashSV2Context * s, const uint8_t * src, int stride,
+                           int keyframe)
+{
+    int sl, rsl, col, pos, possl;
+    Block *b;
+    for (sl = s->image_height - 1; sl >= 0; sl--) { /* source rows, last to first */
+        for (col = 0; col < s->cols; col++) {
+            rsl = s->image_height - sl - 1; /* the same row counted from the bottom of the image */
+            b = s->frame_blocks + col + rsl / s->block_height * s->cols;
+            possl = stride * sl + col * s->block_width * 3; /* byte offset into src */
+            pos = s->image_width * rsl * 3 + col * s->block_width * 3; /* byte offset into the stored copies, which use row rsl */
+            compare_sl(s, b, src + possl, s->current_frame + pos,
+                       s->key_frame + pos, rsl % s->block_height, keyframe);
+        }
+    }
+#ifndef FLASHSV2_DUMB
+    s->tot_lines += s->image_height * s->cols;
+#endif
+    return 0;
+}
+/* Encode every block of the frame; clean blocks of non-key frames get data_size 0. Returns 0 or encode_block()'s non-zero result. */
+static int encode_all_blocks(FlashSV2Context * s, int keyframe)
+{
+    int row, col, res;
+    uint8_t *data;
+    Block *b, *prev;
+    for (row = 0; row < s->rows; row++) {
+        for (col = 0; col < s->cols; col++) {
+            b = s->frame_blocks + (row * s->cols + col);
+            prev = s->key_blocks + (row * s->cols + col); /* block at the same position in the key-frame array */
+            b->flags = s->use15_7 ? COLORSPACE_15_7 : 0;
+            if (keyframe) { /* key frames always send the whole block */
+                b->start = 0;
+                b->len = b->height;
+            } else if (!b->dirty) { /* unchanged block: nothing to encode */
+                b->start = 0;
+                b->len = 0;
+                b->data_size = 0;
+                continue;
+            } else if (b->start != 0 || b->len != b->height) { /* only part of the rows is selected */
+                b->flags |= HAS_DIFF_BLOCKS;
+            }
+            data = s->current_frame + s->image_width * 3 * s->block_height * row + s->block_width * col * 3;
+            res = encode_block(s, &s->palette, b, prev, data, s->image_width * 3, s->comp, s->dist, keyframe);
+#ifndef FLASHSV2_DUMB
+            if (b->dirty)
+                s->diff_blocks++;
+            s->comp_size += b->data_size;
+            s->uncomp_size += b->enc_size;
+#endif
+            if (res) /* checked only after the statistics above were updated */
+                return res;
+        }
+    }
+#ifndef FLASHSV2_DUMB
+    s->raw_size += s->image_width * s->image_height * 3;
+    s->tot_blocks += s->rows * s->cols;
+#endif
+    return 0;
+}
+/* Write every block into buf in row-major order; returns the bytes written or the first negative write_block() result. */
+static int write_all_blocks(FlashSV2Context * s, uint8_t * buf,
+                            int buf_size)
+{
+    int r, c, used = 0;
+    for (r = 0; r < s->rows; r++) {
+        Block *blk = s->frame_blocks + r * s->cols;
+        for (c = 0; c < s->cols; c++, blk++) {
+            const int n = write_block(blk, buf + used, buf_size - used);
+            /* The change marks are cleared before the result is checked, so also on failure. */
+            blk->start = blk->len = blk->dirty = 0;
+            if (n < 0)
+                return n;
+            used += n;
+        }
+    }
+    return used;
+}
+/* Build one frame: mark changes, encode blocks, then write header and blocks into buf; returns the size or the failing step's result. */
+static int write_bitstream(FlashSV2Context * s, const uint8_t * src, int stride,
+                           uint8_t * buf, int buf_size, int keyframe)
+{
+    int header_len, blocks_len, total;
+    int err = mark_all_blocks(s, src, stride, keyframe);
+
+    /* Any non-zero result from the two preparation passes is handed straight back. */
+    if (!err)
+        err = encode_all_blocks(s, keyframe);
+    if (err)
+        return err;
+
+    header_len = write_header(s, buf, buf_size);
+    if (header_len < 0)
+        return header_len;
+
+    /* The blocks follow the header directly, in whatever space is left. */
+    blocks_len = write_all_blocks(s, buf + header_len, buf_size - header_len);
+    if (blocks_len < 0)
+        return blocks_len;
+
+    total = header_len + blocks_len;
+#ifndef FLASHSV2_DUMB
+    s->total_bits += ((double) total) * 8.0;
+#endif
+
+    return total;
+}
+/* Set *keyframe to 1 when the change statistics cross the thresholds below; does nothing when FLASHSV2_DUMB is defined. */
+static void recommend_keyframe(FlashSV2Context * s, int *keyframe)
+{
+#ifndef FLASHSV2_DUMB
+    double block_ratio, line_ratio, enc_ratio, comp_ratio, data_ratio;
+    if (s->avctx->gop_size > 0) {
+        block_ratio = s->diff_blocks / s->tot_blocks; /* NOTE(review): field types are declared elsewhere; confirm these are not integer divisions */
+        line_ratio = s->diff_lines / s->tot_lines;
+        enc_ratio = s->uncomp_size / s->raw_size; /* assigned but not read below */
+        comp_ratio = s->comp_size / s->uncomp_size; /* assigned but not read below */
+        data_ratio = s->comp_size / s->raw_size; /* assigned but not read below */
+
+        if ((block_ratio >= 0.5 && line_ratio / block_ratio <= 0.5) || line_ratio >= 0.95) {
+            *keyframe = 1;
+            return;
+        }
+    }
+#else
+    return;
+#endif
+}
+/* Tuning constants read by the optimum_*() heuristics below; compiled only when FLASHSV2_DUMB is not defined. */
+#ifndef FLASHSV2_DUMB
+static const double block_size_fraction = 1.0 / 300;
+static const double use15_7_threshold = 8192;
+static const double color15_7_factor = 100;
+#endif
+static int optimum_block_width(FlashSV2Context * s)
+{ /* Block width for the next key frame: derived from the statistics, or a fixed 64 when FLASHSV2_DUMB is defined. */
+#ifndef FLASHSV2_DUMB
+    double save = (1-pow(s->diff_lines/s->diff_blocks/s->block_height, 0.5)) * s->comp_size/s->tot_blocks;
+    double width = block_size_fraction * sqrt(0.5 * save * s->rows * s->cols) * s->image_width;
+    int pwidth = ((int) width);
+    return FFCLIP(pwidth & ~15, 256, 16); /* & ~15 rounds down to a multiple of 16; NOTE(review): FFCLIP is not defined in view and the bounds read (256, 16) - confirm */
+#else
+    return 64;
+#endif
+}
+/* Block height for the next key frame: derived from the statistics, or a fixed 64 when FLASHSV2_DUMB is defined. */
+static int optimum_block_height(FlashSV2Context * s)
+{
+#ifndef FLASHSV2_DUMB
+    double save = (1-pow(s->diff_lines/s->diff_blocks/s->block_height, 0.5)) * s->comp_size/s->tot_blocks;
+    double height = block_size_fraction * sqrt(0.5 * save * s->rows * s->cols) * s->image_height;
+    int pheight = ((int) height);
+    return FFCLIP(pheight & ~15, 256, 16); /* & ~15 rounds down to a multiple of 16; NOTE(review): FFCLIP is not defined in view and the bounds read (256, 16) - confirm */
+#else
+    return 64;
+#endif
+}
+/* Returns 1 to select the 15/7 colour mode: when total_bits exceeds the bit budget, or (FLASHSV2_DUMB) when global_quality is 0. */
+static int optimum_use15_7(FlashSV2Context * s)
+{
+#ifndef FLASHSV2_DUMB
+    double ideal = ((double)(s->avctx->bit_rate * s->avctx->time_base.den * s->avctx->ticks_per_frame)) /
+        ((double) s->avctx->time_base.num) * s->avctx->frame_number; /* NOTE(review): .den multiplies and .num divides - confirm that direction is intended */
+    if (ideal + use15_7_threshold < s->total_bits) { /* bits written so far are over budget by more than the threshold */
+        return 1;
+    } else {
+        return 0;
+    }
+#else
+    return s->avctx->global_quality == 0;
+#endif
+}
+/* Value stored in s->dist by the caller: cubed, scaled ratio of total_bits to a bit budget, or a fixed 15 when FLASHSV2_DUMB is defined. */
+static int optimum_dist(FlashSV2Context * s)
+{
+#ifndef FLASHSV2_DUMB
+    double ideal =
+        s->avctx->bit_rate * s->avctx->time_base.den *
+        s->avctx->ticks_per_frame;
+    int dist = pow((s->total_bits / ideal) * color15_7_factor, 3); /* the double result is converted to int here */
+    av_log(s->avctx, AV_LOG_DEBUG, "dist: %d\n", dist);
+    return dist;
+#else
+    return 15;
+#endif
+}
+/* Key-frame setup: re-pick the block size, grow the block arrays and scratch buffer when the size changed, choose the
+ * colour mode and palette, then reset the statistics.  Returns 0, or a non-zero error (AVERROR(ENOMEM) on allocation failure). */
+static int reconfigure_at_keyframe(FlashSV2Context * s, const uint8_t * image,
+                                   int stride)
+{
+    int update_palette = 0; /* never changed in this function, so it cannot force a palette rebuild */
+    int res;
+    int block_width  = optimum_block_width (s);
+    int block_height = optimum_block_height(s);
+
+    s->rows = (s->image_height + block_height - 1) / block_height; /* rounded up */
+    s->cols = (s->image_width  + block_width  - 1) / block_width;  /* rounded up */
+
+    if (block_width != s->block_width || block_height != s->block_height) {
+        s->block_width  = block_width;
+        s->block_height = block_height;
+        if (s->rows * s->cols > s->blocks_size / sizeof(Block)) {
+            res = av_reallocp_array(&s->frame_blocks, s->rows, s->cols * sizeof(Block)); /* frees and NULLs on failure: no leak */
+            if (res >= 0)
+                res = av_reallocp_array(&s->key_blocks, s->cols, s->rows * sizeof(Block));
+            if (res < 0) {
+                av_log(s->avctx, AV_LOG_ERROR, "Memory allocation failed.\n");
+                return res;
+            }
+            s->blocks_size = s->rows * s->cols * sizeof(Block);
+        }
+        init_blocks(s, s->frame_blocks, s->encbuffer, s->databuffer);
+        init_blocks(s, s->key_blocks, s->keybuffer, 0);
+
+        av_fast_malloc(&s->blockbuffer, &s->blockbuffer_size, block_width * block_height * 6);
+        if (!s->blockbuffer) {
+            av_log(s->avctx, AV_LOG_ERROR, "Could not allocate block buffer.\n");
+            return AVERROR(ENOMEM);
+        }
+    }
+
+    s->use15_7 = optimum_use15_7(s);
+    if (s->use15_7) {
+        if ((s->use_custom_palette && s->palette_type != 1) || update_palette) {
+            res = generate_optimum_palette(&s->palette, image, s->image_width, s->image_height, stride);
+            if (res)
+                return res;
+            s->palette_type = 1; /* 1 marks the generated palette as active */
+            av_log(s->avctx, AV_LOG_DEBUG, "Generated optimum palette\n");
+        } else if (!s->use_custom_palette && s->palette_type != 0) {
+            res = generate_default_palette(&s->palette);
+            if (res)
+                return res;
+            s->palette_type = 0; /* 0 marks the default palette as active */
+            av_log(s->avctx, AV_LOG_DEBUG, "Generated default palette\n");
+        }
+    }
+
+    reset_stats(s);
+
+    return 0;
+}
+/* encode2 callback: decide key/inter frame, reconfigure on key frames, write the frame into pkt. Returns 0, else the failing step's error. */
+static int flashsv2_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                                 const AVFrame *p, int *got_packet)
+{
+    FlashSV2Context *const s = avctx->priv_data;
+    int res;
+    int keyframe = 0;
+
+    if ((res = ff_alloc_packet2(avctx, pkt, s->frame_size + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
+        return res;
+
+    /* First frame needs to be a keyframe */
+    if (avctx->frame_number == 0)
+        keyframe = 1;
+
+    /* Check the placement of keyframes */
+    if (avctx->gop_size > 0) {
+        if (avctx->frame_number >= s->last_key_frame + avctx->gop_size)
+            keyframe = 1;
+    }
+
+    if (!keyframe
+        && avctx->frame_number > s->last_key_frame + avctx->keyint_min) {
+        recommend_keyframe(s, &keyframe);
+        if (keyframe)
+            av_log(avctx, AV_LOG_DEBUG, "Recommending key frame at frame %d\n", avctx->frame_number);
+    }
+
+    if (keyframe) {
+        res = reconfigure_at_keyframe(s, p->data[0], p->linesize[0]);
+        if (res)
+            return res;
+    }
+
+    if (s->use15_7)
+        s->dist = optimum_dist(s);
+
+    res = write_bitstream(s, p->data[0], p->linesize[0], pkt->data, pkt->size, keyframe);
+    if (res < 0) /* report the failure instead of storing a negative size in pkt->size */
+        return res;
+    if (keyframe) {
+        new_key_frame(s);
+        s->last_key_frame = avctx->frame_number;
+        pkt->flags |= AV_PKT_FLAG_KEY;
+        av_log(avctx, AV_LOG_DEBUG, "Inserting key frame at frame %d\n", avctx->frame_number);
+    }
+    pkt->size = res;
+    *got_packet = 1;
+
+    return 0;
+}
+/* Close callback: passes the private context to cleanup() and returns 0. */
+static av_cold int flashsv2_encode_end(AVCodecContext * avctx)
+{
+    /* priv_data is this encoder's FlashSV2Context. */
+    FlashSV2Context *const ctx = avctx->priv_data;
+
+    cleanup(ctx);
+    return 0;
+}
+/* Codec descriptor: FlashSV2Context as private data, the init/encode2/close callbacks, and BGR24 as the only pixel format. */
+AVCodec ff_flashsv2_encoder = {
+    .name           = "flashsv2",
+    .long_name      = NULL_IF_CONFIG_SMALL("Flash Screen Video Version 2"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_FLASHSV2,
+    .priv_data_size = sizeof(FlashSV2Context),
+    .init           = flashsv2_encode_init,
+    .encode2        = flashsv2_encode_frame,
+    .close          = flashsv2_encode_end,
+    .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_BGR24, AV_PIX_FMT_NONE },
+};
diff --git a/libavcodec/flashsvenc.c b/libavcodec/flashsvenc.c
index 7e14e47..f7f98ef 100644
--- a/libavcodec/flashsvenc.c
+++ b/libavcodec/flashsvenc.c
@@ -3,20 +3,20 @@
  * Copyright (C) 2004 Alex Beregszaszi
  * Copyright (C) 2006 Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -94,9 +94,9 @@ static av_cold int flashsv_encode_end(AVCodecContext *avctx)
 
     deflateEnd(&s->zstream);
 
-    av_free(s->encbuffer);
-    av_free(s->previous_frame);
-    av_free(s->tmpblock);
+    av_freep(&s->encbuffer);
+    av_freep(&s->previous_frame);
+    av_freep(&s->tmpblock);
 
     return 0;
 }
@@ -109,7 +109,7 @@ static av_cold int flashsv_encode_init(AVCodecContext *avctx)
 
     if (avctx->width > 4095 || avctx->height > 4095) {
         av_log(avctx, AV_LOG_ERROR,
-               "Input dimensions too large, input must be max 4096x4096 !\n");
+               "Input dimensions too large, input must be max 4095x4095 !\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -143,7 +143,7 @@ static int encode_bitstream(FlashSVContext *s, const AVFrame *p, uint8_t *buf,
     int buf_pos, res;
     int pred_blocks = 0;
 
-    init_put_bits(&pb, buf, buf_size * 8);
+    init_put_bits(&pb, buf, buf_size);
 
     put_bits(&pb,  4, block_width / 16 - 1);
     put_bits(&pb, 12, s->image_width);
@@ -238,12 +238,8 @@ static int flashsv_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         I_frame = 1;
     }
 
-    if ((res = ff_alloc_packet(pkt, s->image_width * s->image_height * 3)) < 0) {
-        //Conservative upper bound check for compressed data
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n",
-               s->image_width * s->image_height * 3);
+    if ((res = ff_alloc_packet2(avctx, pkt, s->image_width * s->image_height * 3, 0)) < 0)
         return res;
-    }
 
     pkt->size = encode_bitstream(s, p, pkt->data, pkt->size, opt_w * 16, opt_h * 16,
                                  pfptr, &I_frame);
diff --git a/libavcodec/flicvideo.c b/libavcodec/flicvideo.c
index 13e6ae4..c9c6c24 100644
--- a/libavcodec/flicvideo.c
+++ b/libavcodec/flicvideo.c
@@ -2,20 +2,20 @@
  * FLI/FLC Animation Video Decoder
  * Copyright (C) 2003, 2004 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -64,7 +64,7 @@
 
 #define CHECK_PIXEL_PTR(n) \
     if (pixel_ptr + n > pixel_limit) { \
-        av_log (s->avctx, AV_LOG_INFO, "Problem: pixel_ptr >= pixel_limit (%d >= %d)\n", \
+        av_log (s->avctx, AV_LOG_ERROR, "Invalid pixel_ptr = %d > pixel_limit = %d\n", \
         pixel_ptr + n, pixel_limit); \
         return AVERROR_INVALIDDATA; \
     } \
@@ -84,22 +84,40 @@ static av_cold int flic_decode_init(AVCodecContext *avctx)
     unsigned char *fli_header = (unsigned char *)avctx->extradata;
     int depth;
 
-    if (avctx->extradata_size != 12 &&
-        avctx->extradata_size != 128) {
-        av_log(avctx, AV_LOG_ERROR, "Expected extradata of 12 or 128 bytes\n");
+    if (avctx->extradata_size != 0 &&
+        avctx->extradata_size != 12 &&
+        avctx->extradata_size != 128 &&
+        avctx->extradata_size != 256 &&
+        avctx->extradata_size != 904 &&
+        avctx->extradata_size != 1024) {
+        av_log(avctx, AV_LOG_ERROR, "Unexpected extradata size %d\n", avctx->extradata_size);
         return AVERROR_INVALIDDATA;
     }
 
     s->avctx = avctx;
 
-    s->fli_type = AV_RL16(&fli_header[4]); /* Might be overridden if a Magic Carpet FLC */
-
-    depth = 0;
     if (s->avctx->extradata_size == 12) {
         /* special case for magic carpet FLIs */
         s->fli_type = FLC_MAGIC_CARPET_SYNTHETIC_TYPE_CODE;
         depth = 8;
+    } else if (avctx->extradata_size == 1024) {
+        uint8_t *ptr = avctx->extradata;
+        int i;
+
+        for (i = 0; i < 256; i++) {
+            s->palette[i] = AV_RL32(ptr);
+            ptr += 4;
+        }
+        depth = 8;
+        /* FLI in MOV, see e.g. FFmpeg trac issue #626 */
+    } else if (avctx->extradata_size == 0 ||
+               avctx->extradata_size == 256 ||
+        /* see FFmpeg ticket #1234 */
+               avctx->extradata_size == 904) {
+        s->fli_type = FLI_TYPE_CODE;
+        depth = 8;
     } else {
+        s->fli_type = AV_RL16(&fli_header[4]);
         depth = AV_RL16(&fli_header[12]);
     }
 
@@ -116,7 +134,7 @@ static av_cold int flic_decode_init(AVCodecContext *avctx)
         case 15 : avctx->pix_fmt = AV_PIX_FMT_RGB555; break;
         case 16 : avctx->pix_fmt = AV_PIX_FMT_RGB565; break;
         case 24 : avctx->pix_fmt = AV_PIX_FMT_BGR24; /* Supposedly BGR, but no files to test with */
-                  av_log(avctx, AV_LOG_ERROR, "24Bpp FLC/FLX is unsupported due to no test files.\n");
+                  avpriv_request_sample(avctx, "24Bpp FLC/FLX");
                   return AVERROR_PATCHWELCOME;
         default :
                   av_log(avctx, AV_LOG_ERROR, "Unknown FLC/FLX depth of %d Bpp is unsupported.\n",depth);
@@ -139,7 +157,6 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
     FlicDecodeContext *s = avctx->priv_data;
 
     GetByteContext g2;
-    int stream_ptr_after_color_chunk;
     int pixel_ptr;
     int palette_ptr;
     unsigned char palette_idx1;
@@ -171,14 +188,16 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
 
     bytestream2_init(&g2, buf, buf_size);
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
     pixels = s->frame->data[0];
     pixel_limit = s->avctx->height * s->frame->linesize[0];
+    if (buf_size < 16 || buf_size > INT_MAX - (3 * 256 + AV_INPUT_BUFFER_PADDING_SIZE))
+        return AVERROR_INVALIDDATA;
     frame_size = bytestream2_get_le32(&g2);
+    if (frame_size > buf_size)
+        frame_size = buf_size;
     bytestream2_skip(&g2, 2); /* skip the magic number */
     num_chunks = bytestream2_get_le16(&g2);
     bytestream2_skip(&g2, 8);  /* skip padding */
@@ -186,15 +205,22 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
     frame_size -= 16;
 
     /* iterate through the chunks */
-    while ((frame_size > 0) && (num_chunks > 0)) {
+    while ((frame_size >= 6) && (num_chunks > 0) &&
+            bytestream2_get_bytes_left(&g2) >= 4) {
+        int stream_ptr_after_chunk;
         chunk_size = bytestream2_get_le32(&g2);
+        if (chunk_size > frame_size) {
+            av_log(avctx, AV_LOG_WARNING,
+                   "Invalid chunk_size = %u > frame_size = %u\n", chunk_size, frame_size);
+            chunk_size = frame_size;
+        }
+        stream_ptr_after_chunk = bytestream2_tell(&g2) - 4 + chunk_size;
+
         chunk_type = bytestream2_get_le16(&g2);
 
         switch (chunk_type) {
         case FLI_256_COLOR:
         case FLI_COLOR:
-            stream_ptr_after_color_chunk = bytestream2_tell(&g2) + chunk_size - 6;
-
             /* check special case: If this file is from the Magic Carpet
              * game and uses 6-bit colors even though it reports 256-color
              * chunks in a 0xAF12-type file (fli_type is set to 0xAF13 during
@@ -217,6 +243,9 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
                 if (color_changes == 0)
                     color_changes = 256;
 
+                if (bytestream2_tell(&g2) + color_changes * 3 > stream_ptr_after_chunk)
+                    break;
+
                 for (j = 0; j < color_changes; j++) {
                     unsigned int entry;
 
@@ -227,26 +256,22 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
                     r = bytestream2_get_byte(&g2) << color_shift;
                     g = bytestream2_get_byte(&g2) << color_shift;
                     b = bytestream2_get_byte(&g2) << color_shift;
-                    entry = (r << 16) | (g << 8) | b;
+                    entry = 0xFFU << 24 | r << 16 | g << 8 | b;
+                    if (color_shift == 2)
+                        entry |= entry >> 6 & 0x30303;
                     if (s->palette[palette_ptr] != entry)
                         s->new_palette = 1;
                     s->palette[palette_ptr++] = entry;
                 }
             }
-
-            /* color chunks sometimes have weird 16-bit alignment issues;
-             * therefore, take the hardline approach and skip
-             * to the value calculated w.r.t. the size specified by the color
-             * chunk header */
-            if (stream_ptr_after_color_chunk - bytestream2_tell(&g2) > 0)
-                bytestream2_skip(&g2, stream_ptr_after_color_chunk - bytestream2_tell(&g2));
-
             break;
 
         case FLI_DELTA:
             y_ptr = 0;
             compressed_lines = bytestream2_get_le16(&g2);
             while (compressed_lines > 0) {
+                if (bytestream2_tell(&g2) + 2 > stream_ptr_after_chunk)
+                    break;
                 line_packets = bytestream2_get_le16(&g2);
                 if ((line_packets & 0xC000) == 0xC000) {
                     // line skip opcode
@@ -265,6 +290,8 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
                     CHECK_PIXEL_PTR(0);
                     pixel_countdown = s->avctx->width;
                     for (i = 0; i < line_packets; i++) {
+                        if (bytestream2_tell(&g2) + 2 > stream_ptr_after_chunk)
+                            break;
                         /* account for the skip bytes */
                         pixel_skip = bytestream2_get_byte(&g2);
                         pixel_ptr += pixel_skip;
@@ -281,6 +308,8 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
                             }
                         } else {
                             CHECK_PIXEL_PTR(byte_run * 2);
+                            if (bytestream2_tell(&g2) + byte_run * 2 > stream_ptr_after_chunk)
+                                break;
                             for (j = 0; j < byte_run * 2; j++, pixel_countdown--) {
                                 pixels[pixel_ptr++] = bytestream2_get_byte(&g2);
                             }
@@ -303,16 +332,22 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
                 pixel_ptr = y_ptr;
                 CHECK_PIXEL_PTR(0);
                 pixel_countdown = s->avctx->width;
+                if (bytestream2_tell(&g2) + 1 > stream_ptr_after_chunk)
+                    break;
                 line_packets = bytestream2_get_byte(&g2);
                 if (line_packets > 0) {
                     for (i = 0; i < line_packets; i++) {
                         /* account for the skip bytes */
+                        if (bytestream2_tell(&g2) + 1 > stream_ptr_after_chunk)
+                            break;
                         pixel_skip = bytestream2_get_byte(&g2);
                         pixel_ptr += pixel_skip;
                         pixel_countdown -= pixel_skip;
                         byte_run = sign_extend(bytestream2_get_byte(&g2),8);
                         if (byte_run > 0) {
                             CHECK_PIXEL_PTR(byte_run);
+                            if (bytestream2_tell(&g2) + byte_run > stream_ptr_after_chunk)
+                                break;
                             for (j = 0; j < byte_run; j++, pixel_countdown--) {
                                 pixels[pixel_ptr++] = bytestream2_get_byte(&g2);
                             }
@@ -349,6 +384,8 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
                  bytestream2_skip(&g2, 1);
                 pixel_countdown = s->avctx->width;
                 while (pixel_countdown > 0) {
+                    if (bytestream2_tell(&g2) + 1 > stream_ptr_after_chunk)
+                        break;
                     byte_run = sign_extend(bytestream2_get_byte(&g2), 8);
                     if (!byte_run) {
                         av_log(avctx, AV_LOG_ERROR, "Invalid byte run value.\n");
@@ -368,6 +405,8 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
                     } else {  /* copy bytes if byte_run < 0 */
                         byte_run = -byte_run;
                         CHECK_PIXEL_PTR(byte_run);
+                        if (bytestream2_tell(&g2) + byte_run > stream_ptr_after_chunk)
+                            break;
                         for (j = 0; j < byte_run; j++) {
                             pixels[pixel_ptr++] = bytestream2_get_byte(&g2);
                             pixel_countdown--;
@@ -384,22 +423,23 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
 
         case FLI_COPY:
             /* copy the chunk (uncompressed frame) */
-            if (chunk_size - 6 > s->avctx->width * s->avctx->height) {
+            if (chunk_size - 6 != FFALIGN(s->avctx->width, 4) * s->avctx->height) {
                 av_log(avctx, AV_LOG_ERROR, "In chunk FLI_COPY : source data (%d bytes) " \
-                       "bigger than image, skipping chunk\n", chunk_size - 6);
+                       "has incorrect size, skipping chunk\n", chunk_size - 6);
                 bytestream2_skip(&g2, chunk_size - 6);
             } else {
                 for (y_ptr = 0; y_ptr < s->frame->linesize[0] * s->avctx->height;
                      y_ptr += s->frame->linesize[0]) {
                     bytestream2_get_buffer(&g2, &pixels[y_ptr],
                                            s->avctx->width);
+                    if (s->avctx->width & 3)
+                        bytestream2_skip(&g2, 4 - (s->avctx->width & 3));
                 }
             }
             break;
 
         case FLI_MINI:
             /* some sort of a thumbnail? disregard this chunk... */
-            bytestream2_skip(&g2, chunk_size - 6);
             break;
 
         default:
@@ -407,14 +447,16 @@ static int flic_decode_frame_8BPP(AVCodecContext *avctx,
             break;
         }
 
+        if (stream_ptr_after_chunk - bytestream2_tell(&g2) > 0)
+            bytestream2_skip(&g2, stream_ptr_after_chunk - bytestream2_tell(&g2));
+
         frame_size -= chunk_size;
         num_chunks--;
     }
 
     /* by the end of the chunk, the stream ptr should equal the frame
-     * size (minus 1, possibly); if it doesn't, issue a warning */
-    if ((bytestream2_get_bytes_left(&g2) != 0) &&
-        (bytestream2_get_bytes_left(&g2) != 1))
+     * size (minus 1 or 2, possibly); if it doesn't, issue a warning */
+    if (bytestream2_get_bytes_left(&g2) > 2)
         av_log(avctx, AV_LOG_ERROR, "Processed FLI chunk where chunk size = %d " \
                "and final chunk ptr = %d\n", buf_size,
                buf_size - bytestream2_get_bytes_left(&g2));
@@ -467,10 +509,8 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
 
     bytestream2_init(&g2, buf, buf_size);
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
     pixels = s->frame->data[0];
     pixel_limit = s->avctx->height * s->frame->linesize[0];
@@ -479,14 +519,26 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
     bytestream2_skip(&g2, 2);  /* skip the magic number */
     num_chunks = bytestream2_get_le16(&g2);
     bytestream2_skip(&g2, 8);  /* skip padding */
+    if (frame_size > buf_size)
+        frame_size = buf_size;
 
     frame_size -= 16;
 
     /* iterate through the chunks */
-    while ((frame_size > 0) && (num_chunks > 0)) {
+    while ((frame_size > 0) && (num_chunks > 0) &&
+            bytestream2_get_bytes_left(&g2) >= 4) {
+        int stream_ptr_after_chunk;
         chunk_size = bytestream2_get_le32(&g2);
+        if (chunk_size > frame_size) {
+            av_log(avctx, AV_LOG_WARNING,
+                   "Invalid chunk_size = %u > frame_size = %u\n", chunk_size, frame_size);
+            chunk_size = frame_size;
+        }
+        stream_ptr_after_chunk = bytestream2_tell(&g2) - 4 + chunk_size;
+
         chunk_type = bytestream2_get_le16(&g2);
 
+
         switch (chunk_type) {
         case FLI_256_COLOR:
         case FLI_COLOR:
@@ -504,6 +556,8 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
             y_ptr = 0;
             compressed_lines = bytestream2_get_le16(&g2);
             while (compressed_lines > 0) {
+                if (bytestream2_tell(&g2) + 2 > stream_ptr_after_chunk)
+                    break;
                 line_packets = bytestream2_get_le16(&g2);
                 if (line_packets < 0) {
                     line_packets = -line_packets;
@@ -515,6 +569,8 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
                     pixel_countdown = s->avctx->width;
                     for (i = 0; i < line_packets; i++) {
                         /* account for the skip bytes */
+                        if (bytestream2_tell(&g2) + 2 > stream_ptr_after_chunk)
+                            break;
                         pixel_skip = bytestream2_get_byte(&g2);
                         pixel_ptr += (pixel_skip*2); /* Pixel is 2 bytes wide */
                         pixel_countdown -= pixel_skip;
@@ -528,6 +584,8 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
                                 pixel_ptr += 2;
                             }
                         } else {
+                            if (bytestream2_tell(&g2) + 2*byte_run > stream_ptr_after_chunk)
+                                break;
                             CHECK_PIXEL_PTR(2 * byte_run);
                             for (j = 0; j < byte_run; j++, pixel_countdown--) {
                                 *((signed short*)(&pixels[pixel_ptr])) = bytestream2_get_le16(&g2);
@@ -562,6 +620,8 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
                 pixel_countdown = (s->avctx->width * 2);
 
                 while (pixel_countdown > 0) {
+                    if (bytestream2_tell(&g2) + 1 > stream_ptr_after_chunk)
+                        break;
                     byte_run = sign_extend(bytestream2_get_byte(&g2), 8);
                     if (byte_run > 0) {
                         palette_idx1 = bytestream2_get_byte(&g2);
@@ -575,6 +635,8 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
                         }
                     } else {  /* copy bytes if byte_run < 0 */
                         byte_run = -byte_run;
+                        if (bytestream2_tell(&g2) + byte_run > stream_ptr_after_chunk)
+                            break;
                         CHECK_PIXEL_PTR(byte_run);
                         for (j = 0; j < byte_run; j++) {
                             palette_idx1 = bytestream2_get_byte(&g2);
@@ -614,6 +676,8 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
                 pixel_countdown = s->avctx->width; /* Width is in pixels, not bytes */
 
                 while (pixel_countdown > 0) {
+                    if (bytestream2_tell(&g2) + 1 > stream_ptr_after_chunk)
+                        break;
                     byte_run = sign_extend(bytestream2_get_byte(&g2), 8);
                     if (byte_run > 0) {
                         pixel    = bytestream2_get_le16(&g2);
@@ -628,6 +692,8 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
                         }
                     } else {  /* copy pixels if byte_run < 0 */
                         byte_run = -byte_run;
+                        if (bytestream2_tell(&g2) + 2 * byte_run > stream_ptr_after_chunk)
+                            break;
                         CHECK_PIXEL_PTR(2 * byte_run);
                         for (j = 0; j < byte_run; j++) {
                             *((signed short*)(&pixels[pixel_ptr])) = bytestream2_get_le16(&g2);
@@ -647,7 +713,7 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
         case FLI_COPY:
         case FLI_DTA_COPY:
             /* copy the chunk (uncompressed frame) */
-            if (chunk_size - 6 > (unsigned int)(s->avctx->width * s->avctx->height)*2) {
+            if (chunk_size - 6 > (unsigned int)(FFALIGN(s->avctx->width, 2) * s->avctx->height)*2) {
                 av_log(avctx, AV_LOG_ERROR, "In chunk FLI_COPY : source data (%d bytes) " \
                        "bigger than image, skipping chunk\n", chunk_size - 6);
                 bytestream2_skip(&g2, chunk_size - 6);
@@ -663,6 +729,8 @@ static int flic_decode_frame_15_16BPP(AVCodecContext *avctx,
                       pixel_ptr += 2;
                       pixel_countdown--;
                     }
+                    if (s->avctx->width & 1)
+                        bytestream2_skip(&g2, 2);
                 }
             }
             break;
diff --git a/libavcodec/flv.h b/libavcodec/flv.h
index 801e357..561cfe0 100644
--- a/libavcodec/flv.h
+++ b/libavcodec/flv.h
@@ -1,20 +1,20 @@
 /*
  * FLV specific private header.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,6 +30,5 @@ void ff_flv2_encode_ac_esc(PutBitContext *pb, int slevel, int level, int run,
                            int last);
 
 int ff_flv_decode_picture_header(MpegEncContext *s);
-void ff_flv2_decode_ac_esc(GetBitContext *gb, int *level, int *run, int *last);
 
 #endif /* AVCODEC_FLV_H */
diff --git a/libavcodec/flvdec.c b/libavcodec/flvdec.c
index f2d4929..f74ba3f 100644
--- a/libavcodec/flvdec.c
+++ b/libavcodec/flvdec.c
@@ -1,20 +1,20 @@
 /*
  * FLV decoding.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,17 +25,6 @@
 #include "mpegvideo.h"
 #include "mpegvideodata.h"
 
-void ff_flv2_decode_ac_esc(GetBitContext *gb, int *level, int *run, int *last)
-{
-    int is11 = get_bits1(gb);
-    *last = get_bits1(gb);
-    *run  = get_bits(gb, 6);
-    if (is11)
-        *level = get_sbits(gb, 11);
-    else
-        *level = get_sbits(gb, 7);
-}
-
 int ff_flv_decode_picture_header(MpegEncContext *s)
 {
     int format, width, height;
@@ -43,12 +32,12 @@ int ff_flv_decode_picture_header(MpegEncContext *s)
     /* picture header */
     if (get_bits_long(&s->gb, 17) != 1) {
         av_log(s->avctx, AV_LOG_ERROR, "Bad picture start code\n");
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
     format = get_bits(&s->gb, 5);
     if (format != 0 && format != 1) {
         av_log(s->avctx, AV_LOG_ERROR, "Bad picture format\n");
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
     s->h263_flv       = format + 1;
     s->picture_number = get_bits(&s->gb, 8); /* picture timestamp */
@@ -87,7 +76,7 @@ int ff_flv_decode_picture_header(MpegEncContext *s)
         break;
     }
     if (av_image_check_size(width, height, 0, s->avctx))
-        return -1;
+        return AVERROR(EINVAL);
     s->width  = width;
     s->height = height;
 
@@ -105,10 +94,14 @@ int ff_flv_decode_picture_header(MpegEncContext *s)
     s->h263_long_vectors = 0;
 
     /* PEI */
-    while (get_bits1(&s->gb) != 0)
-        skip_bits(&s->gb, 8);
+    if (skip_1stop_8data_bits(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
+
     s->f_code = 1;
 
+    if (s->ehc_mode)
+        s->avctx->sample_aspect_ratio= (AVRational){1,2};
+
     if (s->avctx->debug & FF_DEBUG_PICT_INFO) {
         av_log(s->avctx, AV_LOG_DEBUG, "%c esc_type:%d, qp:%d num:%d\n",
                s->droppable ? 'D' : av_get_picture_type_char(s->pict_type),
@@ -130,6 +123,7 @@ AVCodec ff_flv_decoder = {
     .close          = ff_h263_decode_end,
     .decode         = ff_h263_decode_frame,
     .capabilities   = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
     .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_YUV420P,
                                                      AV_PIX_FMT_NONE },
 };
diff --git a/libavcodec/flvenc.c b/libavcodec/flvenc.c
index f7c72c5..15f794e 100644
--- a/libavcodec/flvenc.c
+++ b/libavcodec/flvenc.c
@@ -1,20 +1,20 @@
 /*
  * FLV Encoding specific code.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/fmtconvert.c b/libavcodec/fmtconvert.c
index f94d438..3b33af6 100644
--- a/libavcodec/fmtconvert.c
+++ b/libavcodec/fmtconvert.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000, 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -63,4 +63,6 @@ av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
         ff_fmt_convert_init_ppc(c, avctx);
     if (ARCH_X86)
         ff_fmt_convert_init_x86(c, avctx);
+    if (HAVE_MIPSFPU)
+        ff_fmt_convert_init_mips(c);
 }
diff --git a/libavcodec/fmtconvert.h b/libavcodec/fmtconvert.h
index b2c2356..a1b17e4 100644
--- a/libavcodec/fmtconvert.h
+++ b/libavcodec/fmtconvert.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2000, 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -72,5 +72,6 @@ void ff_fmt_convert_init_aarch64(FmtConvertContext *c, AVCodecContext *avctx);
 void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx);
 void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx);
 void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx);
+void ff_fmt_convert_init_mips(FmtConvertContext *c);
 
 #endif /* AVCODEC_FMTCONVERT_H */
diff --git a/libavcodec/frame_thread_encoder.c b/libavcodec/frame_thread_encoder.c
new file mode 100644
index 0000000..27ae356
--- /dev/null
+++ b/libavcodec/frame_thread_encoder.c
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2012 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "frame_thread_encoder.h"
+
+#include "libavutil/fifo.h"
+#include "libavutil/avassert.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/thread.h"
+#include "avcodec.h"
+#include "internal.h"
+#include "thread.h"
+
+#define MAX_THREADS 64
+#define BUFFER_SIZE (2*MAX_THREADS)
+
+typedef struct{ /* one unit of work exchanged between the submitting thread and a worker */
+    void *indata;        /* frame to encode; worker() reads it back as an AVFrame * */
+    void *outdata;       /* AVPacket * stored by worker(); NULL while the task is unfinished */
+    int64_t return_code; /* result code recorded by worker() for this task */
+    unsigned index;      /* slot in ThreadContext.finished_tasks[] that receives the result */
+} Task;
+
+typedef struct{ /* state shared by the submitting thread and all worker threads */
+    AVCodecContext *parent_avctx; /* the context passed to ff_frame_thread_encoder_init() */
+    pthread_mutex_t buffer_mutex; /* held by workers around av_frame_unref() and avcodec_close() */
+
+    AVFifoBuffer *task_fifo; /* pending Tasks; allocated for BUFFER_SIZE entries */
+    pthread_mutex_t task_fifo_mutex; /* held while task_fifo is read/written and while waiting on task_fifo_cond */
+    pthread_cond_t task_fifo_cond; /* signalled on submit, broadcast on shutdown */
+
+    Task finished_tasks[BUFFER_SIZE]; /* results, indexed by Task.index */
+    pthread_mutex_t finished_task_mutex; /* held while a result is stored or collected */
+    pthread_cond_t finished_task_cond; /* signalled by a worker after it stores a result */
+
+    unsigned task_index; /* next slot to hand out; wraps modulo BUFFER_SIZE */
+    unsigned finished_task_index; /* oldest slot not yet returned to the caller */
+
+    pthread_t worker[MAX_THREADS]; /* thread handles, joined in ff_frame_thread_encoder_free() */
+    int exit; /* set to 1 to ask the workers to terminate */
+} ThreadContext;
+
+static void * attribute_align_arg worker(void *v){ /* thread entry point: v is this worker's own AVCodecContext; always returns NULL */
+    AVCodecContext *avctx = v;
+    ThreadContext *c = avctx->internal->frame_thread_encoder; /* shared state, attached by ff_frame_thread_encoder_init() */
+    AVPacket *pkt = NULL;
+
+    while(!c->exit){
+        int got_packet, ret;
+        AVFrame *frame;
+        Task task;
+
+        if(!pkt) pkt= av_mallocz(sizeof(*pkt)); /* ownership moves to finished_tasks[] below, so each task needs a new one */
+        if(!pkt) continue; /* NOTE(review): on allocation failure this retries immediately, with no wait */
+        av_init_packet(pkt);
+
+        pthread_mutex_lock(&c->task_fifo_mutex);
+        while (av_fifo_size(c->task_fifo) <= 0 || c->exit) { /* wait for a queued task; exit takes priority over pending tasks */
+            if(c->exit){
+                pthread_mutex_unlock(&c->task_fifo_mutex);
+                goto end;
+            }
+            pthread_cond_wait(&c->task_fifo_cond, &c->task_fifo_mutex);
+        }
+        av_fifo_generic_read(c->task_fifo, &task, sizeof(task), NULL); /* pop one Task by value */
+        pthread_mutex_unlock(&c->task_fifo_mutex);
+        frame = task.indata;
+
+        ret = avcodec_encode_video2(avctx, pkt, frame, &got_packet); /* the encode itself runs with no mutex held */
+        pthread_mutex_lock(&c->buffer_mutex);
+        av_frame_unref(frame);
+        pthread_mutex_unlock(&c->buffer_mutex);
+        av_frame_free(&frame);
+        if(got_packet) {
+            int ret2 = av_dup_packet(pkt); /* presumably makes pkt own its data -- confirm against the av_dup_packet() docs */
+            if (ret >= 0 && ret2 < 0) /* an encoder error wins; otherwise report the dup failure */
+                ret = ret2;
+        } else {
+            pkt->data = NULL; /* data == NULL is how ff_thread_video_encode_frame() recognises "no packet" */
+            pkt->size = 0;
+        }
+        pthread_mutex_lock(&c->finished_task_mutex);
+        c->finished_tasks[task.index].outdata = pkt; pkt = NULL; /* publish; a non-NULL outdata marks the task finished */
+        c->finished_tasks[task.index].return_code = ret;
+        pthread_cond_signal(&c->finished_task_cond);
+        pthread_mutex_unlock(&c->finished_task_mutex);
+    }
+end: /* reached via the goto above or by falling out of the while loop */
+    av_free(pkt); /* pkt is non-NULL only when leaving via the goto */
+    pthread_mutex_lock(&c->buffer_mutex);
+    avcodec_close(avctx);
+    pthread_mutex_unlock(&c->buffer_mutex);
+    av_freep(&avctx); /* the worker releases the per-thread context it was handed */
+    return NULL;
+}
+
+int ff_frame_thread_encoder_init(AVCodecContext *avctx, AVDictionary *options){ /* start frame-threaded encoding workers; 0 on success or when threading is not used, <0 on error */
+    int i=0;
+    ThreadContext *c;
+
+
+    if(   !(avctx->thread_type & FF_THREAD_FRAME)
+       || !(avctx->codec->capabilities & AV_CODEC_CAP_INTRA_ONLY))
+        return 0; /* nothing is set up unless frame threading was requested and the codec is intra-only */
+
+    if(   !avctx->thread_count
+       && avctx->codec_id == AV_CODEC_ID_MJPEG
+       && !(avctx->flags & AV_CODEC_FLAG_QSCALE)) { /* thread_count 0 (auto) for MJPEG without QSCALE */
+        av_log(avctx, AV_LOG_DEBUG,
+               "Forcing thread count to 1 for MJPEG encoding, use -thread_type slice "
+               "or a constant quantizer if you want to use multiple cpu cores\n");
+        avctx->thread_count = 1;
+    }
+    if(   avctx->thread_count > 1
+       && avctx->codec_id == AV_CODEC_ID_MJPEG
+       && !(avctx->flags & AV_CODEC_FLAG_QSCALE)) /* explicit multi-threading is only warned about, not overridden */
+        av_log(avctx, AV_LOG_WARNING,
+               "MJPEG CBR encoding works badly with frame multi-threading, consider "
+               "using -threads 1, -thread_type slice or a constant quantizer.\n");
+
+    if (avctx->codec_id == AV_CODEC_ID_HUFFYUV ||
+        avctx->codec_id == AV_CODEC_ID_FFVHUFF) {
+        int warn = 0;
+        int context_model = 0;
+        AVDictionaryEntry *con = av_dict_get(options, "context", NULL, AV_DICT_MATCH_CASE);
+
+        if (con && con->value)
+            context_model = atoi(con->value);
+
+        if (avctx->flags & AV_CODEC_FLAG_PASS1)
+            warn = 1;
+        else if(context_model > 0) {
+            AVDictionaryEntry *t = av_dict_get(options, "non_deterministic",
+                                               NULL, AV_DICT_MATCH_CASE);
+            warn = !t || !t->value || !atoi(t->value) ? 1 : 0; /* warn unless "non_deterministic" is present and non-zero */
+        }
+        // huffyuv does not support these with multiple frame threads currently
+        if (warn) {
+            av_log(avctx, AV_LOG_WARNING,
+               "Forcing thread count to 1 for huffyuv encoding with first pass or context 1\n");
+            avctx->thread_count = 1;
+        }
+    }
+
+    if(!avctx->thread_count) { /* still 0 (auto): use the CPU count, capped at MAX_THREADS */
+        avctx->thread_count = av_cpu_count();
+        avctx->thread_count = FFMIN(avctx->thread_count, MAX_THREADS);
+    }
+
+    if(avctx->thread_count <= 1)
+        return 0; /* single-threaded: no workers, no ThreadContext */
+
+    if(avctx->thread_count > MAX_THREADS)
+        return AVERROR(EINVAL); /* an explicit request above the worker[] capacity is rejected */
+
+    av_assert0(!avctx->internal->frame_thread_encoder);
+    c = avctx->internal->frame_thread_encoder = av_mallocz(sizeof(ThreadContext)); /* presumably zero-filled, so exit/indices/outdata start at 0 -- confirm av_mallocz() */
+    if(!c)
+        return AVERROR(ENOMEM);
+
+    c->parent_avctx = avctx;
+
+    c->task_fifo = av_fifo_alloc_array(BUFFER_SIZE, sizeof(Task));
+    if(!c->task_fifo)
+        goto fail; /* NOTE(review): the mutexes/conds below are not initialised yet, but the fail path calls ff_frame_thread_encoder_free(), which locks task_fifo_mutex -- confirm this is safe */
+
+    pthread_mutex_init(&c->task_fifo_mutex, NULL);
+    pthread_mutex_init(&c->finished_task_mutex, NULL);
+    pthread_mutex_init(&c->buffer_mutex, NULL);
+    pthread_cond_init(&c->task_fifo_cond, NULL);
+    pthread_cond_init(&c->finished_task_cond, NULL);
+
+    for(i=0; i<avctx->thread_count ; i++){ /* one private codec context and one thread per worker */
+        AVDictionary *tmp = NULL;
+        void *tmpv;
+        AVCodecContext *thread_avctx = avcodec_alloc_context3(avctx->codec);
+        if(!thread_avctx)
+            goto fail;
+        tmpv = thread_avctx->priv_data; /* saved because the struct copy below overwrites it */
+        *thread_avctx = *avctx; /* struct copy of the parent context */
+        thread_avctx->priv_data = tmpv;
+        thread_avctx->internal = NULL; /* do not share the parent's internal state; it is non-NULL again after avcodec_open2() (see the assert below) */
+        memcpy(thread_avctx->priv_data, avctx->priv_data, avctx->codec->priv_data_size);
+        thread_avctx->thread_count = 1;
+        thread_avctx->active_thread_type &= ~FF_THREAD_FRAME;
+
+        av_dict_copy(&tmp, options, 0);
+        av_dict_set(&tmp, "threads", "1", 0); /* the per-worker option set also forces a single thread */
+        if(avcodec_open2(thread_avctx, avctx->codec, &tmp) < 0) {
+            av_dict_free(&tmp);
+            goto fail; /* NOTE(review): thread_avctx itself is not released on this path -- confirm whether it leaks */
+        }
+        av_dict_free(&tmp);
+        av_assert0(!thread_avctx->internal->frame_thread_encoder);
+        thread_avctx->internal->frame_thread_encoder = c; /* this is how worker() finds the shared state */
+        if(pthread_create(&c->worker[i], NULL, worker, thread_avctx)) {
+            goto fail; /* NOTE(review): the opened thread_avctx is not closed/freed on this path either -- confirm */
+        }
+    }
+
+    avctx->active_thread_type = FF_THREAD_FRAME; /* only reached once every worker has been started */
+
+    return 0;
+fail:
+    avctx->thread_count = i; /* i workers were started; the free routine joins exactly thread_count threads */
+    av_log(avctx, AV_LOG_ERROR, "ff_frame_thread_encoder_init failed\n");
+    ff_frame_thread_encoder_free(avctx);
+    return -1; /* generic failure code, not an AVERROR() value */
+}
+
+void ff_frame_thread_encoder_free(AVCodecContext *avctx){ /* stop and join the workers, then tear down the shared ThreadContext */
+    int i;
+    ThreadContext *c= avctx->internal->frame_thread_encoder;
+
+    pthread_mutex_lock(&c->task_fifo_mutex);
+    c->exit = 1; /* set under the mutex the workers hold while checking it before they wait */
+    pthread_cond_broadcast(&c->task_fifo_cond); /* wake every worker blocked in pthread_cond_wait() */
+    pthread_mutex_unlock(&c->task_fifo_mutex);
+
+    for (i=0; i<avctx->thread_count; i++) { /* thread_count must equal the number of threads actually created */
+         pthread_join(c->worker[i], NULL);
+    }
+
+    pthread_mutex_destroy(&c->task_fifo_mutex);
+    pthread_mutex_destroy(&c->finished_task_mutex);
+    pthread_mutex_destroy(&c->buffer_mutex);
+    pthread_cond_destroy(&c->task_fifo_cond);
+    pthread_cond_destroy(&c->finished_task_cond);
+    av_fifo_freep(&c->task_fifo); /* NOTE(review): Tasks still queued here or sitting in finished_tasks[] are not released by this function -- confirm callers drain first */
+    av_freep(&avctx->internal->frame_thread_encoder); /* releases c itself */
+}
+
+int ff_thread_video_encode_frame(AVCodecContext *avctx, AVPacket *pkt, const AVFrame *frame, int *got_packet_ptr){ /* queue frame (if non-NULL) and/or hand back the oldest finished packet */
+    ThreadContext *c = avctx->internal->frame_thread_encoder;
+    Task task;
+    int ret;
+
+    av_assert1(!*got_packet_ptr); /* the caller is expected to pass the flag in already cleared */
+
+    if(frame){ /* with frame == NULL nothing is queued; only the collection code below runs */
+        AVFrame *new = av_frame_alloc();
+        if(!new)
+            return AVERROR(ENOMEM);
+        ret = av_frame_ref(new, frame); /* worker() unrefs and frees this new AVFrame after encoding */
+        if(ret < 0) {
+            av_frame_free(&new);
+            return ret;
+        }
+
+        task.index = c->task_index; /* result slot for this frame */
+        task.indata = (void*)new;
+        pthread_mutex_lock(&c->task_fifo_mutex);
+        av_fifo_generic_write(c->task_fifo, &task, sizeof(task), NULL);
+        pthread_cond_signal(&c->task_fifo_cond); /* wake one waiting worker */
+        pthread_mutex_unlock(&c->task_fifo_mutex);
+
+        c->task_index = (c->task_index+1) % BUFFER_SIZE;
+
+        if(!c->finished_tasks[c->finished_task_index].outdata && (c->task_index - c->finished_task_index) % BUFFER_SIZE <= avctx->thread_count) /* NOTE(review): outdata is read here without finished_task_mutex */
+            return 0; /* oldest task unfinished and at most thread_count tasks outstanding: no output yet */
+    }
+
+    if(c->task_index == c->finished_task_index)
+        return 0; /* nothing outstanding */
+
+    pthread_mutex_lock(&c->finished_task_mutex);
+    while (!c->finished_tasks[c->finished_task_index].outdata) { /* block until the oldest outstanding task is finished */
+        pthread_cond_wait(&c->finished_task_cond, &c->finished_task_mutex);
+    }
+    task = c->finished_tasks[c->finished_task_index];
+    *pkt = *(AVPacket*)(task.outdata); /* struct copy into the caller's packet */
+    if(pkt->data)
+        *got_packet_ptr = 1; /* worker() leaves data NULL when the encoder produced nothing */
+    av_freep(&c->finished_tasks[c->finished_task_index].outdata); /* frees the AVPacket struct from worker(); presumably also NULLs the slot for reuse -- confirm av_freep() */
+    c->finished_task_index = (c->finished_task_index+1) % BUFFER_SIZE;
+    pthread_mutex_unlock(&c->finished_task_mutex);
+
+    return task.return_code; /* code recorded by the worker; int64_t narrowed to int */
+}
diff --git a/libavcodec/frame_thread_encoder.h b/libavcodec/frame_thread_encoder.h
new file mode 100644
index 0000000..1f79553
--- /dev/null
+++ b/libavcodec/frame_thread_encoder.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2012 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_FRAME_THREAD_ENCODER_H
+#define AVCODEC_FRAME_THREAD_ENCODER_H
+
+#include "avcodec.h"
+
+int ff_frame_thread_encoder_init(AVCodecContext *avctx, AVDictionary *options);
+void ff_frame_thread_encoder_free(AVCodecContext *avctx);
+int ff_thread_video_encode_frame(AVCodecContext *avctx, AVPacket *pkt, const AVFrame *frame, int *got_packet_ptr);
+
+#endif /* AVCODEC_FRAME_THREAD_ENCODER_H */
diff --git a/libavcodec/fraps.c b/libavcodec/fraps.c
index eb61c70..57e13f2 100644
--- a/libavcodec/fraps.c
+++ b/libavcodec/fraps.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2005 Roine Gustafsson
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,6 +37,7 @@
 #include "bytestream.h"
 #include "bswapdsp.h"
 #include "internal.h"
+#include "thread.h"
 
 #define FPS_TAG MKTAG('F', 'P', 'S', 'x')
 #define VLC_BITS 11
@@ -47,7 +48,6 @@
 typedef struct FrapsContext {
     AVCodecContext *avctx;
     BswapDSPContext bdsp;
-    AVFrame *frame;
     uint8_t *tmpbuf;
     int tmpbuf_size;
 } FrapsContext;
@@ -62,15 +62,9 @@ static av_cold int decode_init(AVCodecContext *avctx)
 {
     FrapsContext * const s = avctx->priv_data;
 
-    avctx->pix_fmt     = AV_PIX_FMT_NONE; /* set in decode_frame */
-
     s->avctx  = avctx;
     s->tmpbuf = NULL;
 
-    s->frame = av_frame_alloc();
-    if (!s->frame)
-        return AVERROR(ENOMEM);
-
     ff_bswapdsp_init(&s->bdsp);
 
     return 0;
@@ -111,7 +105,9 @@ static int fraps2_decode_plane(FrapsContext *s, uint8_t *dst, int stride, int w,
     s->bdsp.bswap_buf((uint32_t *) s->tmpbuf,
                       (const uint32_t *) src, size >> 2);
 
-    init_get_bits(&gb, s->tmpbuf, size * 8);
+    if ((ret = init_get_bits8(&gb, s->tmpbuf, size)) < 0)
+        return ret;
+
     for (j = 0; j < h; j++) {
         for (i = 0; i < w*step; i += step) {
             dst[i] = get_vlc2(&gb, vlc.table, VLC_BITS, 3);
@@ -140,17 +136,17 @@ static int decode_frame(AVCodecContext *avctx,
     FrapsContext * const s = avctx->priv_data;
     const uint8_t *buf     = avpkt->data;
     int buf_size           = avpkt->size;
-    AVFrame *frame         = data;
-    AVFrame * const f      = s->frame;
+    ThreadFrame frame = { .f = data };
+    AVFrame * const f = data;
     uint32_t header;
     unsigned int version,header_size;
     unsigned int x, y;
     const uint32_t *buf32;
     uint32_t *luma1,*luma2,*cb,*cr;
     uint32_t offs[4];
-    int i, j, ret, is_chroma, planes;
-    enum AVPixelFormat pix_fmt;
-    int prev_pic_bit, expected_size;
+    int i, j, ret, is_chroma;
+    const int planes = 3;
+    uint8_t *out;
 
     if (buf_size < 4) {
         av_log(avctx, AV_LOG_ERROR, "Packet is too short\n");
@@ -160,7 +156,6 @@ static int decode_frame(AVCodecContext *avctx,
     header      = AV_RL32(buf);
     version     = header & 0xff;
     header_size = (header & (1<<30))? 8 : 4; /* bit 30 means pad to 8 bytes */
-    prev_pic_bit = header & (1U << 31); /* bit 31 means same as previous pic */
 
     if (version > 5) {
         av_log(avctx, AV_LOG_ERROR,
@@ -169,89 +164,92 @@ static int decode_frame(AVCodecContext *avctx,
         return AVERROR_PATCHWELCOME;
     }
 
-    buf += 4;
-    if (header_size == 8)
-        buf += 4;
+    buf += header_size;
 
-    pix_fmt = version & 1 ? AV_PIX_FMT_BGR24 : AV_PIX_FMT_YUVJ420P;
-    if (avctx->pix_fmt != pix_fmt && f->data[0]) {
-        av_frame_unref(f);
+    if (version < 2) {
+        unsigned needed_size = avctx->width * avctx->height * 3;
+        if (version == 0) needed_size /= 2;
+        needed_size += header_size;
+        /* bit 31 means same as previous pic */
+        if (header & (1U<<31)) {
+            *got_frame = 0;
+            return buf_size;
+        }
+        if (buf_size != needed_size) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Invalid frame length %d (should be %d)\n",
+                   buf_size, needed_size);
+            return AVERROR_INVALIDDATA;
+        }
+    } else {
+        /* skip frame */
+        if (buf_size == 8) {
+            *got_frame = 0;
+            return buf_size;
+        }
+        if (AV_RL32(buf) != FPS_TAG || buf_size < planes*1024 + 24) {
+            av_log(avctx, AV_LOG_ERROR, "error in data stream\n");
+            return AVERROR_INVALIDDATA;
+        }
+        for (i = 0; i < planes; i++) {
+            offs[i] = AV_RL32(buf + 4 + i * 4);
+            if (offs[i] >= buf_size - header_size || (i && offs[i] <= offs[i - 1] + 1024)) {
+                av_log(avctx, AV_LOG_ERROR, "plane %i offset is out of bounds\n", i);
+                return AVERROR_INVALIDDATA;
+            }
+        }
+        offs[planes] = buf_size - header_size;
+        for (i = 0; i < planes; i++) {
+            av_fast_padded_malloc(&s->tmpbuf, &s->tmpbuf_size, offs[i + 1] - offs[i] - 1024);
+            if (!s->tmpbuf)
+                return AVERROR(ENOMEM);
+        }
     }
-    avctx->pix_fmt = pix_fmt;
+
+    f->pict_type = AV_PICTURE_TYPE_I;
+    f->key_frame = 1;
+
+    avctx->pix_fmt = version & 1 ? AV_PIX_FMT_BGR24 : AV_PIX_FMT_YUVJ420P;
     avctx->color_range = version & 1 ? AVCOL_RANGE_UNSPECIFIED
                                      : AVCOL_RANGE_JPEG;
+    avctx->colorspace = version & 1 ? AVCOL_SPC_UNSPECIFIED : AVCOL_SPC_BT709;
 
-    expected_size = header_size;
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
+        return ret;
 
     switch (version) {
     case 0:
     default:
         /* Fraps v0 is a reordered YUV420 */
-        if (!prev_pic_bit)
-            expected_size += avctx->width * avctx->height * 3 / 2;
-        if (buf_size != expected_size) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Invalid frame length %d (should be %d)\n",
-                   buf_size, expected_size);
-            return AVERROR_INVALIDDATA;
-        }
-
         if (((avctx->width % 8) != 0) || ((avctx->height % 2) != 0)) {
             av_log(avctx, AV_LOG_ERROR, "Invalid frame size %dx%d\n",
                    avctx->width, avctx->height);
             return AVERROR_INVALIDDATA;
         }
 
-        if ((ret = ff_reget_buffer(avctx, f)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
-            return ret;
-        }
-        f->pict_type = prev_pic_bit ? AV_PICTURE_TYPE_P : AV_PICTURE_TYPE_I;
-        f->key_frame = f->pict_type == AV_PICTURE_TYPE_I;
-
-        if (f->pict_type == AV_PICTURE_TYPE_I) {
-            buf32 = (const uint32_t*)buf;
-            for (y = 0; y < avctx->height / 2; y++) {
-                luma1 = (uint32_t*)&f->data[0][ y * 2      * f->linesize[0]];
-                luma2 = (uint32_t*)&f->data[0][(y * 2 + 1) * f->linesize[0]];
-                cr    = (uint32_t*)&f->data[1][ y          * f->linesize[1]];
-                cb    = (uint32_t*)&f->data[2][ y          * f->linesize[2]];
-                for (x = 0; x < avctx->width; x += 8) {
-                    *(luma1++) = *(buf32++);
-                    *(luma1++) = *(buf32++);
-                    *(luma2++) = *(buf32++);
-                    *(luma2++) = *(buf32++);
-                    *(cr++) = *(buf32++);
-                    *(cb++) = *(buf32++);
-                }
+        buf32 = (const uint32_t*)buf;
+        for (y = 0; y < avctx->height / 2; y++) {
+            luma1 = (uint32_t*)&f->data[0][  y * 2      * f->linesize[0] ];
+            luma2 = (uint32_t*)&f->data[0][ (y * 2 + 1) * f->linesize[0] ];
+            cr    = (uint32_t*)&f->data[1][  y          * f->linesize[1] ];
+            cb    = (uint32_t*)&f->data[2][  y          * f->linesize[2] ];
+            for (x = 0; x < avctx->width; x += 8) {
+                *luma1++ = *buf32++;
+                *luma1++ = *buf32++;
+                *luma2++ = *buf32++;
+                *luma2++ = *buf32++;
+                *cr++    = *buf32++;
+                *cb++    = *buf32++;
             }
         }
         break;
 
     case 1:
         /* Fraps v1 is an upside-down BGR24 */
-        if (!prev_pic_bit)
-            expected_size += avctx->width * avctx->height * 3;
-        if (buf_size != expected_size) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Invalid frame length %d (should be %d)\n",
-                   buf_size, expected_size);
-            return AVERROR_INVALIDDATA;
-        }
-
-        if ((ret = ff_reget_buffer(avctx, f)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
-            return ret;
-        }
-        f->pict_type = prev_pic_bit ? AV_PICTURE_TYPE_P : AV_PICTURE_TYPE_I;
-        f->key_frame = f->pict_type == AV_PICTURE_TYPE_I;
-
-        if (f->pict_type == AV_PICTURE_TYPE_I) {
             for (y = 0; y<avctx->height; y++)
                 memcpy(&f->data[0][(avctx->height - y - 1) * f->linesize[0]],
                        &buf[y * avctx->width * 3],
                        3 * avctx->width);
-        }
         break;
 
     case 2:
@@ -260,37 +258,8 @@ static int decode_frame(AVCodecContext *avctx,
          * Fraps v2 is Huffman-coded YUV420 planes
          * Fraps v4 is virtually the same
          */
-        planes = 3;
-        if ((ret = ff_reget_buffer(avctx, f)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
-            return ret;
-        }
-        /* skip frame */
-        if (buf_size == 8) {
-            f->pict_type = AV_PICTURE_TYPE_P;
-            f->key_frame = 0;
-            break;
-        }
-        f->pict_type = AV_PICTURE_TYPE_I;
-        f->key_frame = 1;
-        if ((AV_RL32(buf) != FPS_TAG) || (buf_size < (planes * 1024 + 24))) {
-            av_log(avctx, AV_LOG_ERROR, "Fraps: error in data stream\n");
-            return AVERROR_INVALIDDATA;
-        }
-        for (i = 0; i < planes; i++) {
-            offs[i] = AV_RL32(buf + 4 + i * 4);
-            if (offs[i] >= buf_size || (i && offs[i] <= offs[i - 1] + 1024)) {
-                av_log(avctx, AV_LOG_ERROR, "Fraps: plane %i offset is out of bounds\n", i);
-                return AVERROR_INVALIDDATA;
-            }
-        }
-        offs[planes] = buf_size;
         for (i = 0; i < planes; i++) {
             is_chroma = !!i;
-            av_fast_padded_malloc(&s->tmpbuf, &s->tmpbuf_size,
-                                  offs[i + 1] - offs[i] - 1024);
-            if (!s->tmpbuf)
-                return AVERROR(ENOMEM);
             if ((ret = fraps2_decode_plane(s, f->data[i], f->linesize[i],
                                            avctx->width  >> is_chroma,
                                            avctx->height >> is_chroma,
@@ -304,36 +273,7 @@ static int decode_frame(AVCodecContext *avctx,
     case 3:
     case 5:
         /* Virtually the same as version 4, but is for RGB24 */
-        planes = 3;
-        if ((ret = ff_reget_buffer(avctx, f)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
-            return ret;
-        }
-        /* skip frame */
-        if (buf_size == 8) {
-            f->pict_type = AV_PICTURE_TYPE_P;
-            f->key_frame = 0;
-            break;
-        }
-        f->pict_type = AV_PICTURE_TYPE_I;
-        f->key_frame = 1;
-        if ((AV_RL32(buf) != FPS_TAG)||(buf_size < (planes*1024 + 24))) {
-            av_log(avctx, AV_LOG_ERROR, "Fraps: error in data stream\n");
-            return AVERROR_INVALIDDATA;
-        }
         for (i = 0; i < planes; i++) {
-            offs[i] = AV_RL32(buf + 4 + i * 4);
-            if (offs[i] >= buf_size || (i && offs[i] <= offs[i - 1] + 1024)) {
-                av_log(avctx, AV_LOG_ERROR, "Fraps: plane %i offset is out of bounds\n", i);
-                return AVERROR_INVALIDDATA;
-            }
-        }
-        offs[planes] = buf_size;
-        for (i = 0; i < planes; i++) {
-            av_fast_padded_malloc(&s->tmpbuf, &s->tmpbuf_size,
-                                  offs[i + 1] - offs[i] - 1024);
-            if (!s->tmpbuf)
-                return AVERROR(ENOMEM);
             if ((ret = fraps2_decode_plane(s, f->data[0] + i + (f->linesize[0] * (avctx->height - 1)),
                                            -f->linesize[0], avctx->width, avctx->height,
                                            buf + offs[i], offs[i + 1] - offs[i], 0, 3)) < 0) {
@@ -341,18 +281,20 @@ static int decode_frame(AVCodecContext *avctx,
                 return ret;
             }
         }
+        out = f->data[0];
         // convert pseudo-YUV into real RGB
         for (j = 0; j < avctx->height; j++) {
-            for (i = 0; i < avctx->width; i++) {
-                f->data[0][0 + i*3 + j*f->linesize[0]] += f->data[0][1 + i*3 + j*f->linesize[0]];
-                f->data[0][2 + i*3 + j*f->linesize[0]] += f->data[0][1 + i*3 + j*f->linesize[0]];
+            uint8_t *line_end = out + 3*avctx->width;
+            while (out < line_end) {
+                out[0]  += out[1];
+                out[2]  += out[1];
+                out += 3;
             }
+            out += f->linesize[0] - 3*avctx->width;
         }
         break;
     }
 
-    if ((ret = av_frame_ref(frame, f)) < 0)
-        return ret;
     *got_frame = 1;
 
     return buf_size;
@@ -368,8 +310,6 @@ static av_cold int decode_end(AVCodecContext *avctx)
 {
     FrapsContext *s = (FrapsContext*)avctx->priv_data;
 
-    av_frame_free(&s->frame);
-
     av_freep(&s->tmpbuf);
     return 0;
 }
@@ -384,5 +324,5 @@ AVCodec ff_fraps_decoder = {
     .init           = decode_init,
     .close          = decode_end,
     .decode         = decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
 };
diff --git a/libavcodec/frwu.c b/libavcodec/frwu.c
index 61cd315..e68fda9 100644
--- a/libavcodec/frwu.c
+++ b/libavcodec/frwu.c
@@ -3,26 +3,32 @@
  *
  * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "avcodec.h"
 #include "bytestream.h"
 #include "internal.h"
+#include "libavutil/opt.h"
+
+typedef struct {
+    AVClass *av_class;          /* class pointer, first member of the private context */
+    int change_field_order;     /* storage for the "change_field_order" option; read in decode_frame */
+} FRWUContext;
 
 static av_cold int decode_init(AVCodecContext *avctx)
 {
@@ -38,6 +44,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
 static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                         AVPacket *avpkt)
 {
+    FRWUContext *s = avctx->priv_data;
     int field, ret;
     AVFrame *pic = data;
     const uint8_t *buf = avpkt->data;
@@ -52,15 +59,11 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return AVERROR_INVALIDDATA;
     }
 
-    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
         return ret;
-    }
 
     pic->pict_type = AV_PICTURE_TYPE_I;
     pic->key_frame = 1;
-    pic->interlaced_frame = 1;
-    pic->top_field_first = 1;
 
     for (field = 0; field < 2; field++) {
         int i;
@@ -79,9 +82,14 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             av_log(avctx, AV_LOG_ERROR, "Packet is too small, need %i, have %i\n", field_size, (int)(buf_end - buf));
             return AVERROR_INVALIDDATA;
         }
-        if (field)
+        if (field ^ s->change_field_order) {
             dst += pic->linesize[0];
+        } else if (s->change_field_order) {
+            dst += 2 * pic->linesize[0];
+        }
         for (i = 0; i < field_h; i++) {
+            if (s->change_field_order && field && i == field_h - 1)
+                dst = pic->data[0];
             memcpy(dst, buf, avctx->width * 2);
             buf += avctx->width * 2;
             dst += pic->linesize[0] << 1;
@@ -94,12 +102,27 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     return avpkt->size;
 }
 
+static const AVOption frwu_options[] = { /* private options of the frwu decoder */
+    {"change_field_order", "Change field order", offsetof(FRWUContext, change_field_order), AV_OPT_TYPE_BOOL,
+     {.i64 = 0}, 0, 1, AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM}, /* default 0, allowed range 0..1 */
+    {NULL} /* terminating entry */
+};
+
+static const AVClass frwu_class = { /* class descriptor referenced by ff_frwu_decoder.priv_class */
+    .class_name = "frwu Decoder",
+    .item_name  = av_default_item_name,
+    .option     = frwu_options, /* option table holding change_field_order */
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_frwu_decoder = {
     .name           = "frwu",
     .long_name      = NULL_IF_CONFIG_SMALL("Forward Uncompressed"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_FRWU,
+    .priv_data_size = sizeof(FRWUContext), /* size of the private context (class pointer + option value) */
     .init           = decode_init,
     .decode         = decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .priv_class     = &frwu_class, /* private class that carries frwu_options */
 };
diff --git a/libavcodec/g2meet.c b/libavcodec/g2meet.c
index a89610d..b0af3ec 100644
--- a/libavcodec/g2meet.c
+++ b/libavcodec/g2meet.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2012 Konstantin Shishkov
  * Copyright (c) 2013 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -298,7 +298,8 @@ static int jpg_decode_data(JPGContext *c, int width, int height,
         return ret;
     jpg_unescape(src, src_size, c->buf, &unesc_size);
     memset(c->buf + unesc_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
-    init_get_bits(&gb, c->buf, unesc_size * 8);
+    if((ret = init_get_bits8(&gb, c->buf, unesc_size)) < 0)
+        return ret;
 
     width = FFALIGN(width, 16);
     mb_w  =  width        >> 4;
@@ -554,6 +555,11 @@ static uint32_t epic_decode_pixel_pred(ePICContext *dc, int x, int y,
         B     = ((pred >> B_shift) & 0xFF) - TOSIGNED(delta);
     }
 
+    if (R<0 || G<0 || B<0) {
+        av_log(NULL, AV_LOG_ERROR, "RGB %d %d %d is out of range\n", R, G, B);
+        return 0;
+    }
+
     return (R << R_shift) | (G << G_shift) | (B << B_shift);
 }
 
@@ -625,6 +631,8 @@ static int epic_decode_run_length(ePICContext *dc, int x, int y, int tile_width,
               (NN  != N)  << 1 |
               (NNW != NW);
         WWneW = ff_els_decode_bit(&dc->els_ctx, &dc->W_ctx_rung[idx]);
+        if (WWneW < 0)
+            return WWneW;
     }
 
     if (WWneW)
@@ -831,10 +839,13 @@ static int epic_decode_tile(ePICContext *dc, uint8_t *out, int tile_height,
                 if (y < 2 || x < 2 || x == tile_width - 1) {
                     run       = 1;
                     got_pixel = epic_handle_edges(dc, x, y, curr_row, above_row, &pix);
-                } else
+                } else {
                     got_pixel = epic_decode_run_length(dc, x, y, tile_width,
                                                        curr_row, above_row,
                                                        above2_row, &pix, &run);
+                    if (got_pixel < 0)
+                        return got_pixel;
+                }
 
                 if (!got_pixel && !epic_predict_from_NW_NE(dc, x, y, run,
                                                            tile_width, curr_row,
@@ -889,7 +900,7 @@ static int epic_jb_decode_tile(G2MContext *c, int tile_x, int tile_y,
     }
 
     if (src_size < els_dsize) {
-        av_log(avctx, AV_LOG_ERROR, "ePIC: data too short, needed %zu, got %zu\n",
+        av_log(avctx, AV_LOG_ERROR, "ePIC: data too short, needed %"SIZE_SPECIFIER", got %"SIZE_SPECIFIER"\n",
                els_dsize, src_size);
         return AVERROR_INVALIDDATA;
     }
@@ -1005,7 +1016,7 @@ static int epic_jb_decode_tile(G2MContext *c, int tile_x, int tile_y,
     return 0;
 }
 
-static void kempf_restore_buf(const uint8_t *src, int len,
+static int kempf_restore_buf(const uint8_t *src, int len,
                               uint8_t *dst, int stride,
                               const uint8_t *jpeg_tile, int tile_stride,
                               int width, int height,
@@ -1013,9 +1024,11 @@ static void kempf_restore_buf(const uint8_t *src, int len,
 {
     GetBitContext gb;
     int i, j, nb, col;
+    int ret;
     int align_width = FFALIGN(width, 16);
 
-    init_get_bits(&gb, src, len * 8);
+    if ((ret = init_get_bits8(&gb, src, len)) < 0)
+        return ret;
 
     if (npal <= 2)       nb = 1;
     else if (npal <= 4)  nb = 2;
@@ -1034,6 +1047,8 @@ static void kempf_restore_buf(const uint8_t *src, int len,
         }
         skip_bits_long(&gb, nb * (align_width - width));
     }
+
+    return 0;
 }
 
 static int kempf_decode_tile(G2MContext *c, int tile_x, int tile_y,
@@ -1077,6 +1092,8 @@ static int kempf_decode_tile(G2MContext *c, int tile_x, int tile_y,
         src += 3;
     }
     npal = *src++ + 1;
+    if (src_end - src < npal * 3)
+        return AVERROR_INVALIDDATA;
     memcpy(pal, src, npal * 3);
     src += npal * 3;
     if (sub_type != 2) {
@@ -1093,7 +1110,7 @@ static int kempf_decode_tile(G2MContext *c, int tile_x, int tile_y,
     zsize = (src[0] << 8) | src[1];
     src  += 2;
 
-    if (src_end - src < zsize)
+    if (src_end - src < zsize + (sub_type != 2))
         return AVERROR_INVALIDDATA;
 
     ret = uncompress(c->kempf_buf, &dlen, src, zsize);
@@ -1115,6 +1132,8 @@ static int kempf_decode_tile(G2MContext *c, int tile_x, int tile_y,
     for (i = 0; i < (FFALIGN(height, 16) >> 4); i++) {
         for (j = 0; j < (FFALIGN(width, 16) >> 4); j++) {
             if (!bits) {
+                if (src >= src_end)
+                    return AVERROR_INVALIDDATA;
                 bitbuf = *src++;
                 bits   = 8;
             }
@@ -1148,10 +1167,10 @@ static int g2m_init_buffers(G2MContext *c)
     int aligned_height;
 
     if (!c->framebuf || c->old_width < c->width || c->old_height < c->height) {
-        c->framebuf_stride = FFALIGN(c->width * 3, 16);
-        aligned_height     = FFALIGN(c->height,    16);
+        c->framebuf_stride = FFALIGN(c->width + 15, 16) * 3;
+        aligned_height     = c->height + 15;
         av_free(c->framebuf);
-        c->framebuf = av_mallocz(c->framebuf_stride * aligned_height);
+        c->framebuf = av_mallocz_array(c->framebuf_stride, aligned_height);
         if (!c->framebuf)
             return AVERROR(ENOMEM);
     }
@@ -1159,14 +1178,15 @@ static int g2m_init_buffers(G2MContext *c)
         (c->compression == 2 && !c->epic_buf_base) ||
         c->old_tile_w < c->tile_width ||
         c->old_tile_h < c->tile_height) {
-        c->tile_stride     = FFALIGN(c->tile_width * 3, 16);
+        c->tile_stride     = FFALIGN(c->tile_width, 16) * 3;
         c->epic_buf_stride = FFALIGN(c->tile_width * 4, 16);
         aligned_height     = FFALIGN(c->tile_height,    16);
-        av_free(c->synth_tile);
-        av_free(c->jpeg_tile);
-        av_free(c->kempf_buf);
-        av_free(c->kempf_flags);
-        av_free(c->epic_buf_base);
+        av_freep(&c->synth_tile);
+        av_freep(&c->jpeg_tile);
+        av_freep(&c->kempf_buf);
+        av_freep(&c->kempf_flags);
+        av_freep(&c->epic_buf_base);
+        c->epic_buf    = NULL;
         c->synth_tile  = av_mallocz(c->tile_stride      * aligned_height);
         c->jpeg_tile   = av_mallocz(c->tile_stride      * aligned_height);
         c->kempf_buf   = av_mallocz((c->tile_width + 1) * aligned_height +
@@ -1203,7 +1223,7 @@ static int g2m_load_cursor(AVCodecContext *avctx, G2MContext *c,
     cursor_hot_y = bytestream2_get_byte(gb);
     cursor_fmt   = bytestream2_get_byte(gb);
 
-    cursor_stride = FFALIGN(cursor_w, 32) * 4;
+    cursor_stride = FFALIGN(cursor_w, cursor_fmt==1 ? 32 : 1) * 4;
 
     if (cursor_w < 1 || cursor_w > 256 ||
         cursor_h < 1 || cursor_h > 256) {
@@ -1253,7 +1273,6 @@ static int g2m_load_cursor(AVCodecContext *avctx, G2MContext *c,
                     bits <<= 1;
                 }
             }
-            dst += c->cursor_stride - c->cursor_w * 4;
         }
 
         dst = c->cursor;
@@ -1285,7 +1304,6 @@ static int g2m_load_cursor(AVCodecContext *avctx, G2MContext *c,
                     bits <<= 1;
                 }
             }
-            dst += c->cursor_stride - c->cursor_w * 4;
         }
         break;
     case 32: // full colour
@@ -1299,7 +1317,6 @@ static int g2m_load_cursor(AVCodecContext *avctx, G2MContext *c,
                 *dst++ = val >> 16;
                 *dst++ = val >> 24;
             }
-            dst += c->cursor_stride - c->cursor_w * 4;
         }
         break;
     default:
@@ -1402,6 +1419,7 @@ static int g2m_decode_frame(AVCodecContext *avctx, void *data,
         }
         switch (chunk_type) {
         case DISPLAY_INFO:
+            got_header =
             c->got_header = 0;
             if (chunk_size < 21) {
                 av_log(avctx, AV_LOG_ERROR, "Invalid display info size %"PRIu32"\n",
@@ -1420,19 +1438,22 @@ static int g2m_decode_frame(AVCodecContext *avctx, void *data,
             if (c->width != avctx->width || c->height != avctx->height) {
                 ret = ff_set_dimensions(avctx, c->width, c->height);
                 if (ret < 0)
-                    return ret;
+                    goto header_fail;
             }
             c->compression = bytestream2_get_be32(&bc);
             if (c->compression != 2 && c->compression != 3) {
                 av_log(avctx, AV_LOG_ERROR,
                        "Unknown compression method %d\n",
                        c->compression);
-                return AVERROR_PATCHWELCOME;
+                ret = AVERROR_PATCHWELCOME;
+                goto header_fail;
             }
             c->tile_width  = bytestream2_get_be32(&bc);
             c->tile_height = bytestream2_get_be32(&bc);
-            if (!c->tile_width || !c->tile_height ||
-                ((c->tile_width | c->tile_height) & 0xF)) {
+            if (c->tile_width <= 0 || c->tile_height <= 0 ||
+                ((c->tile_width | c->tile_height) & 0xF) ||
+                c->tile_width * (uint64_t)c->tile_height >= INT_MAX / 4
+            ) {
                 av_log(avctx, AV_LOG_ERROR,
                        "Invalid tile dimensions %dx%d\n",
                        c->tile_width, c->tile_height);
@@ -1447,7 +1468,8 @@ static int g2m_decode_frame(AVCodecContext *avctx, void *data,
                     (chunk_size - 21) < 16) {
                     av_log(avctx, AV_LOG_ERROR,
                            "Display info: missing bitmasks!\n");
-                    return AVERROR_INVALIDDATA;
+                    ret = AVERROR_INVALIDDATA;
+                    goto header_fail;
                 }
                 r_mask = bytestream2_get_be32(&bc);
                 g_mask = bytestream2_get_be32(&bc);
@@ -1456,11 +1478,13 @@ static int g2m_decode_frame(AVCodecContext *avctx, void *data,
                     av_log(avctx, AV_LOG_ERROR,
                            "Invalid or unsupported bitmasks: R=%"PRIX32", G=%"PRIX32", B=%"PRIX32"\n",
                            r_mask, g_mask, b_mask);
-                    return AVERROR_PATCHWELCOME;
+                    ret = AVERROR_PATCHWELCOME;
+                    goto header_fail;
                 }
             } else {
                 avpriv_request_sample(avctx, "bpp=%d", c->bpp);
-                return AVERROR_PATCHWELCOME;
+                ret = AVERROR_PATCHWELCOME;
+                goto header_fail;
             }
             if (g2m_init_buffers(c)) {
                 ret = AVERROR(ENOMEM);
@@ -1537,11 +1561,9 @@ static int g2m_decode_frame(AVCodecContext *avctx, void *data,
     if (got_header)
         c->got_header = 1;
 
-    if (c->width && c->height) {
-        if ((ret = ff_get_buffer(avctx, pic, 0)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if (c->width && c->height && c->framebuf) {
+        if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
             return ret;
-        }
 
         pic->key_frame = got_header;
         pic->pict_type = got_header ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
@@ -1562,6 +1584,8 @@ header_fail:
     c->height  = 0;
     c->tiles_x =
     c->tiles_y = 0;
+    c->tile_width =
+    c->tile_height = 0;
     return ret;
 }
 
@@ -1592,6 +1616,7 @@ static av_cold int g2m_decode_end(AVCodecContext *avctx)
     jpg_free_context(&c->jc);
 
     av_freep(&c->epic_buf_base);
+    c->epic_buf = NULL;
     av_freep(&c->kempf_buf);
     av_freep(&c->kempf_flags);
     av_freep(&c->synth_tile);
diff --git a/libavcodec/g722.c b/libavcodec/g722.c
index 830877e..ee3b85f 100644
--- a/libavcodec/g722.c
+++ b/libavcodec/g722.c
@@ -7,20 +7,20 @@
  * Copyright (c) 2009 Kenan Gillet
  * Copyright (c) 2010 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/g722.h b/libavcodec/g722.h
index 4830170..25676a3 100644
--- a/libavcodec/g722.h
+++ b/libavcodec/g722.h
@@ -5,20 +5,20 @@
  * Copyright (c) 2009 Kenan Gillet
  * Copyright (c) 2010 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/g722dec.c b/libavcodec/g722dec.c
index c4c0ec8..0bfa82a 100644
--- a/libavcodec/g722dec.c
+++ b/libavcodec/g722dec.c
@@ -5,20 +5,20 @@
  * Copyright (c) 2009 Kenan Gillet
  * Copyright (c) 2010 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -79,7 +79,7 @@ static const int16_t low_inv_quant5[32] = {
      587,   473,   370,   276,   190,   110,    35,   -35
 };
 
-static const int16_t *low_inv_quants[3] = { ff_g722_low_inv_quant6,
+static const int16_t * const low_inv_quants[3] = { ff_g722_low_inv_quant6,
                                                     low_inv_quant5,
                                             ff_g722_low_inv_quant4 };
 
@@ -96,10 +96,8 @@ static int g722_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = avpkt->size * 2;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     out_buf = (int16_t *)frame->data[0];
 
     init_get_bits(&gb, avpkt->data, avpkt->size * 8);
diff --git a/libavcodec/g722dsp.c b/libavcodec/g722dsp.c
index c7e41ff..f148053 100644
--- a/libavcodec/g722dsp.c
+++ b/libavcodec/g722dsp.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2015 Peter Meerwald <pmeerw@pmeerw.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -71,4 +71,6 @@ av_cold void ff_g722dsp_init(G722DSPContext *c)
 
     if (ARCH_ARM)
         ff_g722dsp_init_arm(c);
+    if (ARCH_X86)
+        ff_g722dsp_init_x86(c);
 }
diff --git a/libavcodec/g722dsp.h b/libavcodec/g722dsp.h
index ecd6a47..c956a1e 100644
--- a/libavcodec/g722dsp.h
+++ b/libavcodec/g722dsp.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2015 Peter Meerwald <pmeerw@pmeerw.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,5 +29,6 @@ typedef struct G722DSPContext {
 
 void ff_g722dsp_init(G722DSPContext *c);
 void ff_g722dsp_init_arm(G722DSPContext *c);
+void ff_g722dsp_init_x86(G722DSPContext *c);
 
 #endif /* AVCODEC_G722DSP_H */
diff --git a/libavcodec/g722enc.c b/libavcodec/g722enc.c
index 545825b..01a3db2 100644
--- a/libavcodec/g722enc.c
+++ b/libavcodec/g722enc.c
@@ -5,20 +5,20 @@
  * Copyright (c) 2009 Kenan Gillet
  * Copyright (c) 2010 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
  * G.722 ADPCM audio encoder
  */
 
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "internal.h"
 #include "g722.h"
@@ -74,9 +75,9 @@ static av_cold int g722_encode_init(AVCodecContext * avctx)
         int max_paths = frontier * FREEZE_INTERVAL;
         int i;
         for (i = 0; i < 2; i++) {
-            c->paths[i] = av_mallocz(max_paths * sizeof(**c->paths));
-            c->node_buf[i] = av_mallocz(2 * frontier * sizeof(**c->node_buf));
-            c->nodep_buf[i] = av_mallocz(2 * frontier * sizeof(**c->nodep_buf));
+            c->paths[i] = av_mallocz_array(max_paths, sizeof(**c->paths));
+            c->node_buf[i] = av_mallocz_array(frontier, 2 * sizeof(**c->node_buf));
+            c->nodep_buf[i] = av_mallocz_array(frontier, 2 * sizeof(**c->nodep_buf));
             if (!c->paths[i] || !c->node_buf[i] || !c->nodep_buf[i]) {
                 ret = AVERROR(ENOMEM);
                 goto error;
@@ -238,7 +239,7 @@ static void g722_encode_trellis(G722Context *c, int trellis,
                     continue;\
                 if (heap_pos[index] < frontier) {\
                     pos = heap_pos[index]++;\
-                    assert(pathn[index] < FREEZE_INTERVAL * frontier);\
+                    av_assert2(pathn[index] < FREEZE_INTERVAL * frontier);\
                     node = nodes_next[index][pos] = next[index]++;\
                     node->path = pathn[index]++;\
                 } else {\
@@ -357,10 +358,8 @@ static int g722_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     int nb_samples, out_size, ret;
 
     out_size = (frame->nb_samples + 1) / 2;
-    if ((ret = ff_alloc_packet(avpkt, out_size))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, out_size, 0)) < 0)
         return ret;
-    }
 
     nb_samples = frame->nb_samples - (frame->nb_samples & 1);
 
diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c
index 3d45f9d..a11fec8 100644
--- a/libavcodec/g723_1.c
+++ b/libavcodec/g723_1.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2006 Benjamin Larsson
  * Copyright (c) 2010 Mohamed Naufal Basheer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,8 +37,8 @@ int ff_g723_1_scale_vector(int16_t *dst, const int16_t *vector, int length)
     for (i = 0; i < length; i++)
         max |= FFABS(vector[i]);
 
-    max  = FFMIN(max, 0x7FFF);
-    bits = ff_g723_1_normalize_bits(max, 15);
+    bits= 14 - av_log2_16bit(max);
+    bits= FFMAX(bits, 0);
 
     for (i = 0; i < length; i++)
         dst[i] = vector[i] << bits >> 3;
@@ -97,16 +97,16 @@ void ff_g723_1_gen_acb_excitation(int16_t *vector, int16_t *prev_excitation,
     ff_g723_1_get_residual(residual, prev_excitation, lag);
 
     /* Select quantization table */
-    if (cur_rate == RATE_6300 && pitch_lag < SUBFRAME_LEN - 2)
+    if (cur_rate == RATE_6300 && pitch_lag < SUBFRAME_LEN - 2) {
         cb_ptr = adaptive_cb_gain85;
-    else
+    } else
         cb_ptr = adaptive_cb_gain170;
 
     /* Calculate adaptive vector */
     cb_ptr += subfrm->ad_cb_gain * 20;
     for (i = 0; i < SUBFRAME_LEN; i++) {
-        sum       = ff_g723_1_dot_product(residual + i, cb_ptr, PITCH_ORDER);
-        vector[i] = av_sat_dadd32(1 << 15, sum) >> 16;
+        sum = ff_dot_product(residual + i, cb_ptr, PITCH_ORDER);
+        vector[i] = av_sat_dadd32(1 << 15, av_sat_add32(sum, sum)) >> 16;
     }
 }
 
@@ -123,11 +123,11 @@ static void lsp2lpc(int16_t *lpc)
 
     /* Calculate negative cosine */
     for (j = 0; j < LPC_ORDER; j++) {
-        int index  = (lpc[j] >> 7) & 0x1FF;
-        int offset = lpc[j] & 0x7f;
-        int temp1  = cos_tab[index] << 16;
-        int temp2  = (cos_tab[index + 1] - cos_tab[index]) *
-                     ((offset << 8) + 0x80) << 1;
+        int index     = (lpc[j] >> 7) & 0x1FF;
+        int offset    = lpc[j] & 0x7f;
+        int temp1     = cos_tab[index] << 16;
+        int temp2     = (cos_tab[index + 1] - cos_tab[index]) *
+                          ((offset << 8) + 0x80) << 1;
 
         lpc[j] = -(av_sat_dadd32(1 << 15, temp1 + temp2) >> 16);
     }
@@ -162,8 +162,8 @@ static void lsp2lpc(int16_t *lpc)
 
         f1[0] >>= 1;
         f2[0] >>= 1;
-        f1[1]   = ((lpc[2 * i]     << 16 >> i) + f1[1]) >> 1;
-        f2[1]   = ((lpc[2 * i + 1] << 16 >> i) + f2[1]) >> 1;
+        f1[1] = ((lpc[2 * i]     << 16 >> i) + f1[1]) >> 1;
+        f2[1] = ((lpc[2 * i + 1] << 16 >> i) + f2[1]) >> 1;
     }
 
     /* Convert polynomial coefficients to LPC coefficients */
@@ -171,8 +171,7 @@ static void lsp2lpc(int16_t *lpc)
         int64_t ff1 = f1[i + 1] + f1[i];
         int64_t ff2 = f2[i + 1] - f2[i];
 
-        lpc[i]                 = av_clipl_int32(((ff1 + ff2) << 3) +
-                                                (1 << 15)) >> 16;
+        lpc[i] = av_clipl_int32(((ff1 + ff2) << 3) + (1 << 15)) >> 16;
         lpc[LPC_ORDER - i - 1] = av_clipl_int32(((ff1 - ff2) << 3) +
                                                 (1 << 15)) >> 16;
     }
@@ -234,7 +233,7 @@ void ff_g723_1_inverse_quant(int16_t *cur_lsp, int16_t *prev_lsp,
     }
 
     for (i = 0; i < LPC_ORDER; i++) {
-        cur_lsp[0]             = FFMAX(cur_lsp[0], 0x180);
+        cur_lsp[0]             = FFMAX(cur_lsp[0],  0x180);
         cur_lsp[LPC_ORDER - 1] = FFMIN(cur_lsp[LPC_ORDER - 1], 0x7e00);
 
         /* Stability check */
diff --git a/libavcodec/g723_1.h b/libavcodec/g723_1.h
index 166d897..40d6e70 100644
--- a/libavcodec/g723_1.h
+++ b/libavcodec/g723_1.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2006 Benjamin Larsson
  * Copyright (c) 2010 Mohamed Naufal Basheer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -142,7 +142,8 @@ typedef struct g723_1_context {
     int sid_gain;
     int cur_gain;
     int reflection_coef;
-    int pf_gain;
+    int pf_gain;                 ///< formant postfilter
+                                 ///< gain scaling unit memory
     int postfilter;
 
     int16_t audio[FRAME_LEN + LPC_ORDER + PITCH_MAX + 4];
@@ -215,16 +216,27 @@ void ff_g723_1_lsp_interpolate(int16_t *lpc, int16_t *cur_lsp,
 void ff_g723_1_inverse_quant(int16_t *cur_lsp, int16_t *prev_lsp,
                              uint8_t *lsp_index, int bad_frame);
 
-
 static const uint8_t frame_size[4] = { 24, 20, 4, 1 };
 
-/* Postfilter gain weighting factors scaled by 2^15 */
-static const int16_t ppf_gain_weight[2] = { 0x1800, 0x2000 };
+/**
+ * Postfilter gain weighting factors scaled by 2^15
+ */
+static const int16_t ppf_gain_weight[2] = {0x1800, 0x2000};
 
-/* LSP DC component */
+/**
+ * LSP DC component
+ */
 static const int16_t dc_lsp[LPC_ORDER] = {
-    0x0c3b, 0x1271, 0x1e0a, 0x2a36, 0x3630,
-    0x406f, 0x4d28, 0x56f4, 0x638c, 0x6c46
+    0x0c3b,
+    0x1271,
+    0x1e0a,
+    0x2a36,
+    0x3630,
+    0x406f,
+    0x4d28,
+    0x56f4,
+    0x638c,
+    0x6c46
 };
 
 /* Cosine table scaled by 2^14 */
@@ -296,7 +308,9 @@ static const int16_t cos_tab[COS_TBL_SIZE + 1] = {
     16384
 };
 
-/* LSP VQ tables */
+/**
+ *  LSP VQ tables
+ */
 static const int16_t lsp_band0[LSP_CB_SIZE][3] = {
     {    0,      0,      0}, { -270,  -1372,  -1032}, { -541,  -1650,  -1382},
     { -723,  -2011,  -2213}, { -941,  -1122,  -1942}, { -780,  -1145,  -2454},
@@ -606,12 +620,12 @@ static const int16_t lsp_band2[LSP_CB_SIZE][4] = {
     { 3633,   2336,   2408,   1453}, { 2923,   3517,   2567,   1318},
 };
 
-/*
+/**
  * Used for the coding/decoding of the pulses positions
  * for the MP-MLQ codebook
  */
 static const int32_t combinatorial_table[PULSE_MAX][SUBFRAME_LEN/GRID_SIZE] = {
-    {118755, 98280, 80730, 65780L, 53130,
+    {118755, 98280, 80730,  65780, 53130,
       42504, 33649, 26334,  20349, 15504,
       11628,  8568,  6188,   4368,  3003,
        2002,  1287,   792,    462,   252,
@@ -700,10 +714,14 @@ static const int16_t pitch_contrib[340] = {
     -2, 25144,  0, 17998
 };
 
-/* Number of non-zero pulses in the MP-MLQ excitation */
+/**
+ * Number of non-zero pulses in the MP-MLQ excitation
+ */
 static const int8_t pulses[4] = {6, 5, 6, 5};
 
-/* Size of the MP-MLQ fixed excitation codebooks */
+/**
+ * Size of the MP-MLQ fixed excitation codebooks
+ */
 static const int32_t max_pos[4] = {593775, 142506, 593775, 142506};
 
 static const int16_t fixed_cb_gain[GAIN_LEVELS] = {
@@ -1356,15 +1374,16 @@ static const int16_t adaptive_cb_gain170[170 * 20] = {
     -4534,  -2487,  -3932,  -4166,  -2113,  -3341,  -3540,  -3070
 };
 
-/* 0.65^i (Zero part) and 0.75^i (Pole part) scaled by 2^15 */
+/**
+ * 0.65^i (Zero part) and 0.75^i (Pole part) scaled by 2^15
+ */
 static const int16_t postfilter_tbl[2][LPC_ORDER] = {
     /* Zero */
-    { 21299, 13844,  8999,  5849, 3802, 2471, 1606, 1044,  679,  441 },
+    {21299, 13844,  8999,  5849, 3802, 2471, 1606, 1044,  679,  441},
     /* Pole */
-    { 24576, 18432, 13824, 10368, 7776, 5832, 4374, 3281, 2460, 1845 }
+    {24576, 18432, 13824, 10368, 7776, 5832, 4374, 3281, 2460, 1845}
 };
 
-
 /**
  * Hamming window coefficients scaled by 2^15
  */
diff --git a/libavcodec/g723_1dec.c b/libavcodec/g723_1dec.c
index 701e034..6f283b4 100644
--- a/libavcodec/g723_1dec.c
+++ b/libavcodec/g723_1dec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2006 Benjamin Larsson
  * Copyright (c) 2010 Mohamed Naufal Basheer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,6 +33,7 @@
 #include "get_bits.h"
 #include "acelp_vectors.h"
 #include "celp_filters.h"
+#include "celp_math.h"
 #include "g723_1.h"
 #include "internal.h"
 
@@ -45,7 +46,6 @@ static av_cold int g723_1_decode_init(AVCodecContext *avctx)
     avctx->channel_layout = AV_CH_LAYOUT_MONO;
     avctx->sample_fmt     = AV_SAMPLE_FMT_S16;
     avctx->channels       = 1;
-    avctx->sample_rate    = 8000;
     p->pf_gain            = 1 << 12;
 
     memcpy(p->prev_lsp, dc_lsp, LPC_ORDER * sizeof(*p->prev_lsp));
@@ -129,13 +129,13 @@ static int unpack_bitstream(G723_1_Context *p, const uint8_t *buf,
         }
     }
 
-    p->subframe[0].grid_index = get_bits(&gb, 1);
-    p->subframe[1].grid_index = get_bits(&gb, 1);
-    p->subframe[2].grid_index = get_bits(&gb, 1);
-    p->subframe[3].grid_index = get_bits(&gb, 1);
+    p->subframe[0].grid_index = get_bits1(&gb);
+    p->subframe[1].grid_index = get_bits1(&gb);
+    p->subframe[2].grid_index = get_bits1(&gb);
+    p->subframe[3].grid_index = get_bits1(&gb);
 
     if (p->cur_rate == RATE_6300) {
-        skip_bits(&gb, 1);  /* skip reserved bit */
+        skip_bits1(&gb);  /* skip reserved bit */
 
         /* Compute pulse_pos index using the 13-bit combined position index */
         temp = get_bits(&gb, 13);
@@ -179,31 +179,14 @@ static int unpack_bitstream(G723_1_Context *p, const uint8_t *buf,
 /**
  * Bitexact implementation of sqrt(val/2).
  */
-static int16_t square_root(int val)
+static int16_t square_root(unsigned val)
 {
-    int16_t res = 0;
-    int16_t exp = 0x4000;
-    int i;
+    av_assert2(!(val & 0x80000000));
 
-    for (i = 0; i < 14; i ++) {
-        int res_exp = res + exp;
-        if (val >= res_exp * res_exp << 1)
-            res += exp;
-        exp >>= 1;
-    }
-    return res;
+    return (ff_sqrt(val << 1) >> 1) & (~1);
 }
 
 /**
- * Bitexact implementation of 2ab scaled by 1/2^16.
- *
- * @param a 32 bit multiplicand
- * @param b 16 bit multiplier
- */
-#define MULL2(a, b) \
-        ((((a) >> 16) * (b) << 1) + (((a) & 0xffff) * (b) >> 15))
-
-/**
  * Generate fixed codebook excitation vector.
  *
  * @param vector    decoded excitation vector
@@ -476,9 +459,9 @@ static int comp_interp_index(G723_1_Context *p, int pitch_lag,
 
     temp = best_eng * *exc_eng >> 3;
 
-    if (temp < ccr * ccr)
+    if (temp < ccr * ccr) {
         return index;
-    else
+    } else
         return 0;
 }
 
@@ -518,21 +501,24 @@ static void residual_interp(int16_t *buf, int16_t *out, int lag,
  * @param iir_coef IIR coefficients
  * @param src      source vector
  * @param dest     destination vector
+ * @param width    width of the output, 16 bits(0) / 32 bits(1)
  */
-static void iir_filter(int16_t *fir_coef, int16_t *iir_coef,
-                       int16_t *src, int *dest)
-{
-    int m, n;
-
-    for (m = 0; m < SUBFRAME_LEN; m++) {
-        int64_t filter = 0;
-        for (n = 1; n <= LPC_ORDER; n++) {
-            filter -= fir_coef[n - 1] * src[m - n] -
-                      iir_coef[n - 1] * (dest[m - n] >> 16);
-        }
-
-        dest[m] = av_clipl_int32((src[m] << 16) + (filter << 3) + (1 << 15));
-    }
+#define iir_filter(fir_coef, iir_coef, src, dest, width)\
+{\
+    int m, n; /* macro-local names: arguments must not mention m, n, filter or the shifts */\
+    int res_shift = 16 & ~-(width); /* 16 when width == 0, 0 when width == 1 */\
+    int in_shift  = 16 - res_shift; /*  0 when width == 0, 16 when width == 1 */\
+    /* (src)[m - n] and (dest)[m - n] reach LPC_ORDER elements before index 0 */\
+    for (m = 0; m < SUBFRAME_LEN; m++) {\
+        int64_t filter = 0;\
+        for (n = 1; n <= LPC_ORDER; n++) {\
+            filter -= (fir_coef)[n - 1] * (src)[m - n] -\
+                      (iir_coef)[n - 1] * ((dest)[m - n] >> in_shift);\
+        }\
+        /* bias by 1 << 15, pass through av_clipl_int32(), then shift right by res_shift */\
+        (dest)[m] = av_clipl_int32(((src)[m] << 16) + (filter << 3) +\
+                                   (1 << 15)) >> res_shift;\
+    }\
 }
 
 /**
@@ -602,13 +588,12 @@ static void formant_postfilter(G723_1_Context *p, int16_t *lpc,
             filter_coef[1][k] = (-lpc[k] * postfilter_tbl[1][k] +
                                  (1 << 14)) >> 15;
         }
-        iir_filter(filter_coef[0], filter_coef[1], buf + i, filter_signal + i);
+        iir_filter(filter_coef[0], filter_coef[1], buf + i, filter_signal + i, 1);
         lpc += LPC_ORDER;
     }
 
-    memcpy(p->fir_mem, buf + FRAME_LEN, LPC_ORDER * sizeof(*p->fir_mem));
-    memcpy(p->iir_mem, filter_signal + FRAME_LEN,
-           LPC_ORDER * sizeof(*p->iir_mem));
+    memcpy(p->fir_mem, buf + FRAME_LEN, LPC_ORDER * sizeof(int16_t));
+    memcpy(p->iir_mem, filter_signal + FRAME_LEN, LPC_ORDER * sizeof(int));
 
     buf += LPC_ORDER;
     signal_ptr = filter_signal + LPC_ORDER;
@@ -883,10 +868,8 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     frame->nb_samples = FRAME_LEN;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-         av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-         return ret;
-    }
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
 
     out = (int16_t *)frame->data[0];
 
@@ -1018,7 +1001,7 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data,
 #define AD     AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_DECODING_PARAM
 
 static const AVOption options[] = {
-    { "postfilter", "postfilter on/off", OFFSET(postfilter), AV_OPT_TYPE_INT,
+    { "postfilter", "enable postfilter", OFFSET(postfilter), AV_OPT_TYPE_BOOL,
       { .i64 = 1 }, 0, 1, AD },
     { NULL }
 };
diff --git a/libavcodec/g723_1enc.c b/libavcodec/g723_1enc.c
index 1ebd465..e7afa4d 100644
--- a/libavcodec/g723_1enc.c
+++ b/libavcodec/g723_1enc.c
@@ -2,20 +2,20 @@
  * G.723.1 compatible encoder
  * Copyright (c) Mohamed Naufal <naufal22@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -82,7 +82,7 @@ static void highpass_filter(int16_t *buf, int16_t *fir, int *iir)
     for (i = 0; i < FRAME_LEN; i++) {
         *iir   = (buf[i] << 15) + ((-*fir) << 15) + MULL2(*iir, 0x7f00);
         *fir   = buf[i];
-        buf[i] = av_clipl_int32((int64_t) *iir + (1 << 15)) >> 16;
+        buf[i] = av_clipl_int32((int64_t)*iir + (1 << 15)) >> 16;
     }
 }
 
@@ -1148,7 +1148,7 @@ static int g723_1_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         acb_search(p, residual, impulse_resp, in, i);
         ff_g723_1_gen_acb_excitation(residual, p->prev_excitation,
                                      p->pitch_lag[i >> 1], &p->subframe[i],
-                                     RATE_6300);
+                                     p->cur_rate);
         sub_acb_contrib(residual, impulse_resp, in);
 
         fcb_search(p, impulse_resp, in, i);
@@ -1180,12 +1180,12 @@ static int g723_1_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
 
     av_free(start);
 
-    ret = ff_alloc_packet(avpkt, 24);
-    if (ret < 0)
+    if ((ret = ff_alloc_packet2(avctx, avpkt, 24, 0)) < 0)
         return ret;
 
     *got_packet_ptr = 1;
-    return pack_bitstream(p, avpkt);
+    avpkt->size = pack_bitstream(p, avpkt);
+    return 0;
 }
 
 AVCodec ff_g723_1_encoder = {
diff --git a/libavcodec/g726.c b/libavcodec/g726.c
index e783e74..c7d138e 100644
--- a/libavcodec/g726.c
+++ b/libavcodec/g726.c
@@ -5,20 +5,20 @@
  * This is a very straightforward rendition of the G.726
  * Section 4 "Computational Details".
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include <limits.h>
@@ -95,6 +95,7 @@ typedef struct G726Context {
     int sez;            /**< estimated second order prediction */
     int y;              /**< quantizer scaling factor for the next iteration */
     int code_size;
+    int little_endian;  /**< little-endian bitstream as used in aiff and Sun AU */
 } G726Context;
 
 static const int quant_tbl16[] =                  /**< 16kbit/s 2 bits per sample */
@@ -296,7 +297,7 @@ static int16_t g726_encode(G726Context* c, int16_t sig)
 {
     uint8_t i;
 
-    i = quant(c, sig/4 - c->se) & ((1<<c->code_size) - 1);
+    i = av_mod_uintp2(quant(c, sig/4 - c->se), c->code_size);
     g726_decode(c, i);
     return i;
 }
@@ -350,10 +351,8 @@ static int g726_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     int i, ret, out_size;
 
     out_size = (frame->nb_samples * c->code_size + 7) / 8;
-    if ((ret = ff_alloc_packet(avpkt, out_size))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, out_size, 0)) < 0)
         return ret;
-    }
     init_put_bits(&pb, avpkt->data, avpkt->size);
 
     for (i = 0; i < frame->nb_samples; i++)
@@ -373,7 +372,7 @@ static const AVOption options[] = {
     { NULL },
 };
 
-static const AVClass class = {
+static const AVClass g726_class = {
     .class_name = "g726",
     .item_name  = av_default_item_name,
     .option     = options,
@@ -396,19 +395,25 @@ AVCodec ff_adpcm_g726_encoder = {
     .capabilities   = AV_CODEC_CAP_SMALL_LAST_FRAME,
     .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
                                                      AV_SAMPLE_FMT_NONE },
-    .priv_class     = &class,
+    .priv_class     = &g726_class,
     .defaults       = defaults,
 };
 #endif
 
-#if CONFIG_ADPCM_G726_DECODER
+#if CONFIG_ADPCM_G726_DECODER || CONFIG_ADPCM_G726LE_DECODER
 static av_cold int g726_decode_init(AVCodecContext *avctx)
 {
     G726Context* c = avctx->priv_data;
 
+    if(avctx->channels > 1){
+        avpriv_request_sample(avctx, "Decoding more than one channel");
+        return AVERROR_PATCHWELCOME;
+    }
     avctx->channels       = 1;
     avctx->channel_layout = AV_CH_LAYOUT_MONO;
 
+    c->little_endian = !strcmp(avctx->codec->name, "g726le");
+
     c->code_size = avctx->bits_per_coded_sample;
     if (c->code_size < 2 || c->code_size > 5) {
         av_log(avctx, AV_LOG_ERROR, "Invalid number of bits %d\n", c->code_size);
@@ -436,16 +441,16 @@ static int g726_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = out_samples;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples = (int16_t *)frame->data[0];
 
     init_get_bits(&gb, buf, buf_size * 8);
 
     while (out_samples--)
-        *samples++ = g726_decode(c, get_bits(&gb, c->code_size));
+        *samples++ = g726_decode(c, c->little_endian ?
+                                    get_bits_le(&gb, c->code_size) :
+                                    get_bits(&gb, c->code_size));
 
     if (get_bits_left(&gb) > 0)
         av_log(avctx, AV_LOG_ERROR, "Frame invalidly split, missing parser?\n");
@@ -460,7 +465,9 @@ static void g726_decode_flush(AVCodecContext *avctx)
     G726Context *c = avctx->priv_data;
     g726_reset(c);
 }
+#endif
 
+#if CONFIG_ADPCM_G726_DECODER
 AVCodec ff_adpcm_g726_decoder = {
     .name           = "g726",
     .long_name      = NULL_IF_CONFIG_SMALL("G.726 ADPCM"),
@@ -473,3 +480,17 @@ AVCodec ff_adpcm_g726_decoder = {
     .capabilities   = AV_CODEC_CAP_DR1,
 };
 #endif
+
+#if CONFIG_ADPCM_G726LE_DECODER
+AVCodec ff_adpcm_g726le_decoder = {
+    .name           = "g726le", /* g726_decode_init() compares against this string to set little_endian */
+    .long_name      = NULL_IF_CONFIG_SMALL("G.726 ADPCM little-endian"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_ADPCM_G726LE,
+    .priv_data_size = sizeof(G726Context),
+    .capabilities   = AV_CODEC_CAP_DR1,
+    .init           = g726_decode_init,
+    .decode         = g726_decode_frame,
+    .flush          = g726_decode_flush,
+};
+#endif /* CONFIG_ADPCM_G726LE_DECODER */
diff --git a/libavcodec/g729.h b/libavcodec/g729.h
new file mode 100644
index 0000000..7c5f693
--- /dev/null
+++ b/libavcodec/g729.h
@@ -0,0 +1,33 @@
+/*
+ * G.729, G.729 Annex D decoders
+ * Copyright (c) 2008 Vladimir Voroshilov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef AVCODEC_G729_H
+#define AVCODEC_G729_H
+
+/**
+ * Subframe size. NOTE(review): unit is presumably samples -- confirm against the decoder.
+ */
+#define SUBFRAME_SIZE 40
+
+/* Coded block sizes in bytes; the G.729 parser splits packets into blocks of one of these sizes. */
+#define G729_8K_BLOCK_SIZE     10  /* chosen by the parser when bit_rate >= 8000 */
+#define G729D_6K4_BLOCK_SIZE   8   /* chosen by the parser when bit_rate <  8000 */
+
+#endif // AVCODEC_G729_H
diff --git a/libavcodec/g729_parser.c b/libavcodec/g729_parser.c
new file mode 100644
index 0000000..d13c990
--- /dev/null
+++ b/libavcodec/g729_parser.c
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2015  Ganesh Ajjanagadde
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * G.729 audio parser
+ *
+ * Splits packets into individual blocks.
+ */
+
+#include "libavutil/avassert.h"
+#include "parser.h"
+#include "g729.h"
+
+typedef struct G729ParseContext {
+    ParseContext pc;    ///< generic parser state handed to ff_combine_frame()
+    int block_size;     ///< bytes per block; chosen from the bit rate while still 0
+    int duration;       ///< copied from avctx->frame_size, reported as s1->duration
+    int remaining;      ///< bytes still missing from the block being assembled
+} G729ParseContext;
+
+static int g729_parse(AVCodecParserContext *s1, AVCodecContext *avctx,
+                      const uint8_t **poutbuf, int *poutbuf_size,
+                      const uint8_t *buf, int buf_size)
+{
+    G729ParseContext *ctx = s1->priv_data;
+    int next = END_NOT_FOUND;
+
+    /* block size not chosen yet: derive it from the bit rate, cache the duration */
+    if (ctx->block_size == 0) {
+        av_assert1(avctx->codec_id == AV_CODEC_ID_G729);
+        /* FIXME: replace this heuristic block_size with more precise estimate */
+        ctx->block_size = (avctx->bit_rate < 8000) ? G729D_6K4_BLOCK_SIZE : G729_8K_BLOCK_SIZE;
+        ctx->duration   = avctx->frame_size;
+    }
+
+    /* a new block starts whenever the previous one has been fully consumed */
+    if (ctx->remaining == 0)
+        ctx->remaining = ctx->block_size;
+    if (buf_size < ctx->remaining) {
+        ctx->remaining -= buf_size;     /* block continues in the next input */
+    } else {
+        next           = ctx->remaining;
+        ctx->remaining = 0;
+    }
+
+    if (ff_combine_frame(&ctx->pc, next, &buf, &buf_size) < 0 || buf_size == 0) {
+        *poutbuf      = NULL;
+        *poutbuf_size = 0;
+        return buf_size;
+    }
+
+    s1->duration = ctx->duration;
+
+    *poutbuf      = buf;
+    *poutbuf_size = buf_size;
+    return next;
+}
+
+AVCodecParser ff_g729_parser = {
+    .codec_ids      = { AV_CODEC_ID_G729 },
+    .priv_data_size = sizeof(G729ParseContext),
+    .parser_parse   = g729_parse,   /* splits the input into block_size-byte blocks */
+    .parser_close   = ff_parse_close,
+};
diff --git a/libavcodec/g729data.h b/libavcodec/g729data.h
new file mode 100644
index 0000000..365ca47
--- /dev/null
+++ b/libavcodec/g729data.h
@@ -0,0 +1,382 @@
+/*
+ * data for G.729, G729 Annex D decoders
+ * Copyright (c) 2007 Vladimir Voroshilov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_G729DATA_H
+#define AVCODEC_G729DATA_H
+
+#include <stdint.h>
+
+#define MA_NP                4  ///< Moving Average (MA) prediction order
+
+#define VQ_1ST_BITS          7  ///< first stage vector of quantizer (size in bits)
+#define VQ_2ND_BITS          5  ///< second stage vector of quantizer (size in bits)
+
+#define GC_1ST_IDX_BITS_8K   3  ///< gain codebook (first stage) index, 8k mode (size in bits)
+#define GC_2ND_IDX_BITS_8K   4  ///< gain codebook (second stage) index, 8k mode (size in bits)
+
+#define GC_1ST_IDX_BITS_6K4  3  ///< gain codebook (first stage) index, 6.4k mode (size in bits)
+#define GC_2ND_IDX_BITS_6K4  3  ///< gain codebook (second stage) index, 6.4k mode (size in bits)
+
+/**
+ * first stage LSP codebook
+ * (10-dimensional, with 128 entries) (3.2.4 of G.729)
+ */
+static const int16_t cb_lsp_1st[1<<VQ_1ST_BITS][10] = { /* (2.13) */
+  { 1486,  2168,  3751,  9074, 12134, 13944, 17983, 19173, 21190, 21820},
+  { 1730,  2640,  3450,  4870,  6126,  7876, 15644, 17817, 20294, 21902},
+  { 1568,  2256,  3088,  4874, 11063, 13393, 18307, 19293, 21109, 21741},
+  { 1733,  2512,  3357,  4708,  6977, 10296, 17024, 17956, 19145, 20350},
+  { 1744,  2436,  3308,  8731, 10432, 12007, 15614, 16639, 21359, 21913},
+  { 1786,  2369,  3372,  4521,  6795, 12963, 17674, 18988, 20855, 21640},
+  { 1631,  2433,  3361,  6328, 10709, 12013, 13277, 13904, 19441, 21088},
+  { 1489,  2364,  3291,  6250,  9227, 10403, 13843, 15278, 17721, 21451},
+  { 1869,  2533,  3475,  4365,  9152, 14513, 15908, 17022, 20611, 21411},
+  { 2070,  3025,  4333,  5854,  7805,  9231, 10597, 16047, 20109, 21834},
+  { 1910,  2673,  3419,  4261, 11168, 15111, 16577, 17591, 19310, 20265},
+  { 1141,  1815,  2624,  4623,  6495,  9588, 13968, 16428, 19351, 21286},
+  { 2192,  3171,  4707,  5808, 10904, 12500, 14162, 15664, 21124, 21789},
+  { 1286,  1907,  2548,  3453,  9574, 11964, 15978, 17344, 19691, 22495},
+  { 1921,  2720,  4604,  6684, 11503, 12992, 14350, 15262, 16997, 20791},
+  { 2052,  2759,  3897,  5246,  6638, 10267, 15834, 16814, 18149, 21675},
+  { 1798,  2497,  5617, 11449, 13189, 14711, 17050, 18195, 20307, 21182},
+  { 1009,  1647,  2889,  5709,  9541, 12354, 15231, 18494, 20966, 22033},
+  { 3016,  3794,  5406,  7469, 12488, 13984, 15328, 16334, 19952, 20791},
+  { 2203,  3040,  3796,  5442, 11987, 13512, 14931, 16370, 17856, 18803},
+  { 2912,  4292,  7988,  9572, 11562, 13244, 14556, 16529, 20004, 21073},
+  { 2861,  3607,  5923,  7034,  9234, 12054, 13729, 18056, 20262, 20974},
+  { 3069,  4311,  5967,  7367, 11482, 12699, 14309, 16233, 18333, 19172},
+  { 2434,  3661,  4866,  5798, 10383, 11722, 13049, 15668, 18862, 19831},
+  { 2020,  2605,  3860,  9241, 13275, 14644, 16010, 17099, 19268, 20251},
+  { 1877,  2809,  3590,  4707, 11056, 12441, 15622, 17168, 18761, 19907},
+  { 2107,  2873,  3673,  5799, 13579, 14687, 15938, 17077, 18890, 19831},
+  { 1612,  2284,  2944,  3572,  8219, 13959, 15924, 17239, 18592, 20117},
+  { 2420,  3156,  6542, 10215, 12061, 13534, 15305, 16452, 18717, 19880},
+  { 1667,  2612,  3534,  5237, 10513, 11696, 12940, 16798, 18058, 19378},
+  { 2388,  3017,  4839,  9333, 11413, 12730, 15024, 16248, 17449, 18677},
+  { 1875,  2786,  4231,  6320,  8694, 10149, 11785, 17013, 18608, 19960},
+  {  679,  1411,  4654,  8006, 11446, 13249, 15763, 18127, 20361, 21567},
+  { 1838,  2596,  3578,  4608,  5650, 11274, 14355, 15886, 20579, 21754},
+  { 1303,  1955,  2395,  3322, 12023, 13764, 15883, 18077, 20180, 21232},
+  { 1438,  2102,  2663,  3462,  8328, 10362, 13763, 17248, 19732, 22344},
+  {  860,  1904,  6098,  7775,  9815, 12007, 14821, 16709, 19787, 21132},
+  { 1673,  2723,  3704,  6125,  7668,  9447, 13683, 14443, 20538, 21731},
+  { 1246,  1849,  2902,  4508,  7221, 12710, 14835, 16314, 19335, 22720},
+  { 1525,  2260,  3862,  5659,  7342, 11748, 13370, 14442, 18044, 21334},
+  { 1196,  1846,  3104,  7063, 10972, 12905, 14814, 17037, 19922, 22636},
+  { 2147,  3106,  4475,  6511,  8227,  9765, 10984, 12161, 18971, 21300},
+  { 1585,  2405,  2994,  4036, 11481, 13177, 14519, 15431, 19967, 21275},
+  { 1778,  2688,  3614,  4680,  9465, 11064, 12473, 16320, 19742, 20800},
+  { 1862,  2586,  3492,  6719, 11708, 13012, 14364, 16128, 19610, 20425},
+  { 1395,  2156,  2669,  3386, 10607, 12125, 13614, 16705, 18976, 21367},
+  { 1444,  2117,  3286,  6233,  9423, 12981, 14998, 15853, 17188, 21857},
+  { 2004,  2895,  3783,  4897,  6168,  7297, 12609, 16445, 19297, 21465},
+  { 1495,  2863,  6360,  8100, 11399, 14271, 15902, 17711, 20479, 22061},
+  { 2484,  3114,  5718,  7097,  8400, 12616, 14073, 14847, 20535, 21396},
+  { 2424,  3277,  5296,  6284, 11290, 12903, 16022, 17508, 19333, 20283},
+  { 2565,  3778,  5360,  6989,  8782, 10428, 14390, 15742, 17770, 21734},
+  { 2727,  3384,  6613,  9254, 10542, 12236, 14651, 15687, 20074, 21102},
+  { 1916,  2953,  6274,  8088,  9710, 10925, 12392, 16434, 20010, 21183},
+  { 3384,  4366,  5349,  7667, 11180, 12605, 13921, 15324, 19901, 20754},
+  { 3075,  4283,  5951,  7619,  9604, 11010, 12384, 14006, 20658, 21497},
+  { 1751,  2455,  5147,  9966, 11621, 13176, 14739, 16470, 20788, 21756},
+  { 1442,  2188,  3330,  6813,  8929, 12135, 14476, 15306, 19635, 20544},
+  { 2294,  2895,  4070,  8035, 12233, 13416, 14762, 17367, 18952, 19688},
+  { 1937,  2659,  4602,  6697,  9071, 12863, 14197, 15230, 16047, 18877},
+  { 2071,  2663,  4216,  9445, 10887, 12292, 13949, 14909, 19236, 20341},
+  { 1740,  2491,  3488,  8138,  9656, 11153, 13206, 14688, 20896, 21907},
+  { 2199,  2881,  4675,  8527, 10051, 11408, 14435, 15463, 17190, 20597},
+  { 1943,  2988,  4177,  6039,  7478,  8536, 14181, 15551, 17622, 21579},
+  { 1825,  3175,  7062,  9818, 12824, 15450, 18330, 19856, 21830, 22412},
+  { 2464,  3046,  4822,  5977,  7696, 15398, 16730, 17646, 20588, 21320},
+  { 2550,  3393,  5305,  6920, 10235, 14083, 18143, 19195, 20681, 21336},
+  { 3003,  3799,  5321,  6437,  7919, 11643, 15810, 16846, 18119, 18980},
+  { 3455,  4157,  6838,  8199,  9877, 12314, 15905, 16826, 19949, 20892},
+  { 3052,  3769,  4891,  5810,  6977, 10126, 14788, 15990, 19773, 20904},
+  { 3671,  4356,  5827,  6997,  8460, 12084, 14154, 14939, 19247, 20423},
+  { 2716,  3684,  5246,  6686,  8463, 10001, 12394, 14131, 16150, 19776},
+  { 1945,  2638,  4130,  7995, 14338, 15576, 17057, 18206, 20225, 20997},
+  { 2304,  2928,  4122,  4824,  5640, 13139, 15825, 16938, 20108, 21054},
+  { 1800,  2516,  3350,  5219, 13406, 15948, 17618, 18540, 20531, 21252},
+  { 1436,  2224,  2753,  4546,  9657, 11245, 15177, 16317, 17489, 19135},
+  { 2319,  2899,  4980,  6936,  8404, 13489, 15554, 16281, 20270, 20911},
+  { 2187,  2919,  4610,  5875,  7390, 12556, 14033, 16794, 20998, 21769},
+  { 2235,  2923,  5121,  6259,  8099, 13589, 15340, 16340, 17927, 20159},
+  { 1765,  2638,  3751,  5730,  7883, 10108, 13633, 15419, 16808, 18574},
+  { 3460,  5741,  9596, 11742, 14413, 16080, 18173, 19090, 20845, 21601},
+  { 3735,  4426,  6199,  7363,  9250, 14489, 16035, 17026, 19873, 20876},
+  { 3521,  4778,  6887,  8680, 12717, 14322, 15950, 18050, 20166, 21145},
+  { 2141,  2968,  6865,  8051, 10010, 13159, 14813, 15861, 17528, 18655},
+  { 4148,  6128,  9028, 10871, 12686, 14005, 15976, 17208, 19587, 20595},
+  { 4403,  5367,  6634,  8371, 10163, 11599, 14963, 16331, 17982, 18768},
+  { 4091,  5386,  6852,  8770, 11563, 13290, 15728, 16930, 19056, 20102},
+  { 2746,  3625,  5299,  7504, 10262, 11432, 13172, 15490, 16875, 17514},
+  { 2248,  3556,  8539, 10590, 12665, 14696, 16515, 17824, 20268, 21247},
+  { 1279,  1960,  3920,  7793, 10153, 14753, 16646, 18139, 20679, 21466},
+  { 2440,  3475,  6737,  8654, 12190, 14588, 17119, 17925, 19110, 19979},
+  { 1879,  2514,  4497,  7572, 10017, 14948, 16141, 16897, 18397, 19376},
+  { 2804,  3688,  7490, 10086, 11218, 12711, 16307, 17470, 20077, 21126},
+  { 2023,  2682,  3873,  8268, 10255, 11645, 15187, 17102, 18965, 19788},
+  { 2823,  3605,  5815,  8595, 10085, 11469, 16568, 17462, 18754, 19876},
+  { 2851,  3681,  5280,  7648,  9173, 10338, 14961, 16148, 17559, 18474},
+  { 1348,  2645,  5826,  8785, 10620, 12831, 16255, 18319, 21133, 22586},
+  { 2141,  3036,  4293,  6082,  7593, 10629, 17158, 18033, 21466, 22084},
+  { 1608,  2375,  3384,  6878,  9970, 11227, 16928, 17650, 20185, 21120},
+  { 2774,  3616,  5014,  6557,  7788,  8959, 17068, 18302, 19537, 20542},
+  { 1934,  4813,  6204,  7212,  8979, 11665, 15989, 17811, 20426, 21703},
+  { 2288,  3507,  5037,  6841,  8278,  9638, 15066, 16481, 21653, 22214},
+  { 2951,  3771,  4878,  7578,  9016, 10298, 14490, 15242, 20223, 20990},
+  { 3256,  4791,  6601,  7521,  8644,  9707, 13398, 16078, 19102, 20249},
+  { 1827,  2614,  3486,  6039, 12149, 13823, 16191, 17282, 21423, 22041},
+  { 1000,  1704,  3002,  6335,  8471, 10500, 14878, 16979, 20026, 22427},
+  { 1646,  2286,  3109,  7245, 11493, 12791, 16824, 17667, 18981, 20222},
+  { 1708,  2501,  3315,  6737,  8729,  9924, 16089, 17097, 18374, 19917},
+  { 2623,  3510,  4478,  5645,  9862, 11115, 15219, 18067, 19583, 20382},
+  { 2518,  3434,  4728,  6388,  8082,  9285, 13162, 18383, 19819, 20552},
+  { 1726,  2383,  4090,  6303,  7805, 12845, 14612, 17608, 19269, 20181},
+  { 2860,  3735,  4838,  6044,  7254,  8402, 14031, 16381, 18037, 19410},
+  { 4247,  5993,  7952,  9792, 12342, 14653, 17527, 18774, 20831, 21699},
+  { 3502,  4051,  5680,  6805,  8146, 11945, 16649, 17444, 20390, 21564},
+  { 3151,  4893,  5899,  7198, 11418, 13073, 15124, 17673, 20520, 21861},
+  { 3960,  4848,  5926,  7259,  8811, 10529, 15661, 16560, 18196, 20183},
+  { 4499,  6604,  8036,  9251, 10804, 12627, 15880, 17512, 20020, 21046},
+  { 4251,  5541,  6654,  8318,  9900, 11686, 15100, 17093, 20572, 21687},
+  { 3769,  5327,  7865,  9360, 10684, 11818, 13660, 15366, 18733, 19882},
+  { 3083,  3969,  6248,  8121,  9798, 10994, 12393, 13686, 17888, 19105},
+  { 2731,  4670,  7063,  9201, 11346, 13735, 16875, 18797, 20787, 22360},
+  { 1187,  2227,  4737,  7214,  9622, 12633, 15404, 17968, 20262, 23533},
+  { 1911,  2477,  3915, 10098, 11616, 12955, 16223, 17138, 19270, 20729},
+  { 1764,  2519,  3887,  6944,  9150, 12590, 16258, 16984, 17924, 18435},
+  { 1400,  3674,  7131,  8718, 10688, 12508, 15708, 17711, 19720, 21068},
+  { 2322,  3073,  4287,  8108,  9407, 10628, 15862, 16693, 19714, 21474},
+  { 2630,  3339,  4758,  8360, 10274, 11333, 12880, 17374, 19221, 19936},
+  { 1721,  2577,  5553,  7195,  8651, 10686, 15069, 16953, 18703, 19929}
+};
+
+/**
+ * second stage LSP codebook, high and low parts
+ * (both 5-dimensional, with 32 entries) (3.2.4 of G.729)
+ */
+static const int16_t cb_lsp_2nd[1<<VQ_2ND_BITS][10] = { /* (2.13) */
+  { -435,  -815,  -742,  1033,  -518,   582, -1201,   829,    86,   385},
+  { -833,  -891,   463,    -8, -1251,  1450,    72,  -231,   864,   661},
+  {-1021,   231,  -306,   321,  -220,  -163,  -526,  -754, -1633,   267},
+  {   57,  -198,  -339,   -33, -1468,   573,   796,  -169,  -631,   816},
+  {  171,  -350,   294,  1660,   453,   519,   291,   159,  -640, -1296},
+  { -701,  -842,   -58,   950,   892,  1549,   715,   527,  -714,  -193},
+  {  584,    31,  -289,   356,  -333,  -457,   612,  -283, -1381,  -741},
+  { -109,  -808,   231,    77,   -87,  -344,  1341,  1087,  -654,  -569},
+  { -859,  1236,   550,   854,   714,  -543, -1752,  -195,   -98,  -276},
+  { -877,  -954, -1248,  -299,   212,  -235,  -728,   949,  1517,   895},
+  {  -77,   344,  -620,   763,   413,   502,  -362,  -960,  -483,  1386},
+  { -314,  -307,  -256, -1260,  -429,   450,  -466,  -108,  1010,  2223},
+  {  711,   693,   521,   650,  1305,   -28,  -378,   744, -1005,   240},
+  { -112,  -271,  -500,   946,  1733,   271,   -15,   909,  -259,  1688},
+  {  575,   -10,  -468,  -199,  1101, -1011,   581,   -53,  -747,   878},
+  {  145,  -285, -1280,  -398,    36,  -498, -1377,    18,  -444,  1483},
+  {-1133,  -835,  1350,  1284,   -95,  1015,  -222,   443,   372,  -354},
+  {-1459, -1237,   416,  -213,   466,   669,   659,  1640,   932,   534},
+  {  -15,    66,   468,  1019,  -748,  1385,  -182,  -907,  -721,  -262},
+  { -338,   148,  1445,    75,  -760,   569,  1247,   337,   416,  -121},
+  {  389,   239,  1568,   981,   113,   369, -1003,  -507,  -587,  -904},
+  { -312,   -98,   949,    31,  1104,    72,  -141,  1465,    63,  -785},
+  { 1127,   584,   835,   277, -1159,   208,   301,  -882,   117,  -404},
+  {  539,  -114,   856,  -493,   223,  -912,   623,   -76,   276,  -440},
+  { 2197,  2337,  1268,   670,   304,  -267,  -525,   140,   882,  -139},
+  {-1596,   550,   801,  -456,   -56,  -697,   865,  1060,   413,   446},
+  { 1154,   593,   -77,  1237,   -31,   581, -1037,  -895,   669,   297},
+  {  397,   558,   203,  -797,  -919,     3,   692,  -292,  1050,   782},
+  {  334,  1475,   632,   -80,    48, -1061,  -484,   362,  -597,  -852},
+  { -545,  -330,  -429,  -680,  1133, -1182,  -744,  1340,   262,    63},
+  { 1320,   827,  -398,  -576,   341,  -774,  -483, -1247,   -70,    98},
+  { -163,   674,   -11,  -886,   531, -1125,  -265,  -242,   724,   934}
+};
+
+/**
+ * gain codebook (first stage), 8k mode (3.9.2 of G.729)
+ */
+static const int16_t cb_gain_1st_8k[1<<GC_1ST_IDX_BITS_8K][2] = { /*(0.14) (2.13) */
+  { 3242 ,  9949 },
+  { 1551 ,  2425 },
+  { 2678 , 27162 },
+  { 1921 ,  9291 },
+  { 1831 ,  5022 },
+  {    1 ,  1516 },
+  {  356 , 14756 },
+  {   57 ,  5404 },
+};
+
+/**
+ * gain codebook (second stage), 8k mode (3.9.2 of G.729)
+ */
+static const int16_t cb_gain_2nd_8k[1<<GC_2ND_IDX_BITS_8K][2] = { /*(1.14) (1.13) */
+  {  5142 ,   592 },
+  { 17299 ,  1861 },
+  {  6160 ,  2395 },
+  { 16112 ,  3392 },
+  {   826 ,  2005 },
+  { 18973 ,  5935 },
+  {  1994 ,     0 },
+  { 15434 ,   237 },
+  { 10573 ,  2966 },
+  { 15132 ,  4914 },
+  { 11569 ,  1196 },
+  { 14194 ,  1630 },
+  {  8091 ,  4861 },
+  { 15161 , 14276 },
+  {  9120 ,   525 },
+  { 13260 ,  3256 },
+};
+
+/**
+ * gain codebook (first stage), 6.4k mode (D.3.9.2 of G.729)
+ */
+static const int16_t cb_gain_1st_6k4[1<<GC_1ST_IDX_BITS_6K4][2] =
+{ /*(0.14) (1.14)*/
+ { 5849,     0 },
+ { 3171,  9280 },
+ { 3617,  6747 },
+ { 4987, 22294 },
+ { 2929,  1078 },
+ { 6068,  6093 },
+ { 9425,  2731 },
+ { 3915, 12872 },
+};
+
+/**
+ * gain codebook (second stage), 6.4k mode (D.3.9.2 of G.729)
+ */
+static const int16_t cb_gain_2nd_6k4[1<<GC_2ND_IDX_BITS_6K4][2] =
+{ /*(1.14) (1.14)*/
+ {    0,  4175 },
+ {10828, 27602 },
+ {16423, 15724 },
+ { 4478,  7324 },
+ { 3988,     0 },
+ {10291, 11385 },
+ {11956, 10735 },
+ { 7876,  7821 },
+};
+
+/**
+ * 4th order Moving Average (MA) Predictor codebook (3.2.4 of G.729)
+ *
+ * float cb_ma_predictor_float[2][MA_NP][10] = {
+ *   {
+ *     {0.2570, 0.2780, 0.2800, 0.2736, 0.2757, 0.2764, 0.2675, 0.2678, 0.2779, 0.2647},
+ *     {0.2142, 0.2194, 0.2331, 0.2230, 0.2272, 0.2252, 0.2148, 0.2123, 0.2115, 0.2096},
+ *     {0.1670, 0.1523, 0.1567, 0.1580, 0.1601, 0.1569, 0.1589, 0.1555, 0.1474, 0.1571},
+ *     {0.1238, 0.0925, 0.0798, 0.0923, 0.0890, 0.0828, 0.1010, 0.0988, 0.0872, 0.1060},
+ *   },
+ *   {
+ *     {0.2360, 0.2405, 0.2499, 0.2495, 0.2517, 0.2591, 0.2636, 0.2625, 0.2551, 0.2310},
+ *     {0.1285, 0.0925, 0.0779, 0.1060, 0.1183, 0.1176, 0.1277, 0.1268, 0.1193, 0.1211},
+ *     {0.0981, 0.0589, 0.0401, 0.0654, 0.0761, 0.0728, 0.0841, 0.0826, 0.0776, 0.0891},
+ *     {0.0923, 0.0486, 0.0287, 0.0498, 0.0526, 0.0482, 0.0621, 0.0636, 0.0584, 0.0794},
+ *   },
+ * };
+ *                                    15
+ * cb_ma_predictor[j][k][i] = floor( 2 * cb_ma_predictor_float[j][k][i] )
+ *
+ * j=0..1, i=0..9, k=0..MA_NP-1
+ */
+static const int16_t cb_ma_predictor[2][MA_NP][10] = { /* (0.15) */
+  {
+    { 8421,  9109,  9175,  8965,  9034,  9057,  8765,  8775,  9106,  8673},
+    { 7018,  7189,  7638,  7307,  7444,  7379,  7038,  6956,  6930,  6868},
+    { 5472,  4990,  5134,  5177,  5246,  5141,  5206,  5095,  4830,  5147},
+    { 4056,  3031,  2614,  3024,  2916,  2713,  3309,  3237,  2857,  3473}
+  },
+  {
+    { 7733,  7880,  8188,  8175,  8247,  8490,  8637,  8601,  8359,  7569},
+    { 4210,  3031,  2552,  3473,  3876,  3853,  4184,  4154,  3909,  3968},
+    { 3214,  1930,  1313,  2143,  2493,  2385,  2755,  2706,  2542,  2919},
+    { 3024,  1592,   940,  1631,  1723,  1579,  2034,  2084,  1913,  2601}
+  }
+};
+
+/**
+ *                                     15         3
+ * cb_ma_predictor_sum[j][i] = floor( 2 * (1.0 - sum ( cb_ma_predictor_float[j][k][i] ) ) )
+ *                                               k=0
+ * j=0..1, i=0..9
+ */
+static const int16_t cb_ma_predictor_sum[2][10] = { /* (0.15) */
+  { 7798,  8447,  8205,  8293,  8126,  8477,  8447,  8703,  9043,  8604},
+  {14585, 18333, 19772, 17344, 16426, 16459, 15155, 15220, 16043, 15708}
+};
+
+/**
+ *                                                           12
+ *                                                          2
+ * cb_ma_predictor_sum_inv[j][i] = floor(---------------------------------------------)
+ *                                               3
+ *                                        1.0 - sum ( cb_ma_predictor_float[j][k][i] )
+ *                                              k=0
+ * j=0..1, i=0..9
+ */
+static const int16_t cb_ma_predictor_sum_inv[2][10] = { /* (3.12) */
+  {17210, 15888, 16357, 16183, 16516, 15833, 15888, 15421, 14840, 15597},
+  { 9202,  7320,  6788,  7738,  8170,  8154,  8856,  8818,  8366,  8544}
+};
+
+/**
+ * MA prediction coefficients (3.9.1 of G.729, near Equation 69)
+ */
+static const uint16_t ma_prediction_coeff[4] = { /* (0.13) */
+  5571, 4751, 2785, 1556
+};
+
+/**
+ * initial LSP coefficients, belonging to the virtual frame preceding the
+ * first frame of the stream
+ */
+static const int16_t lsp_init[10]= { /* (0.15) */
+   30000, 26000, 21000, 15000, 8000, 0, -8000,-15000,-21000,-26000
+};
+
+/**
+ * additional "phase" post-processing filter impulse response (D.6.2 of G.729)
+ *
+ * Table contains three impulse responses, corresponding to
+ * different amounts of spreading.
+ */
+static const int16_t phase_filter[3][40] =
+{
+  { // maximum spreading (for noise-like segments)
+    14690, 11518,  1268, -2762, -5672,  7514,  -36, -2808, -3041,  4823,
+     2952, -8425,  3785,  1455,  2179, -8638, 8051, -2104, -1455,   777,
+     1108, -2386,  2254,  -364,  -675, -2104, 6046, -5682,  1072,  3123,
+    -5059,  5312, -2330, -3729,  6924, -3890,  675, -1776,    29, 10145,
+  },
+  { // medium spreading
+    30274,  3831, -4037,  2972, -1049, -1003,  2477, -3044,  2815, -2232,
+     1753, -1612,  1714, -1776,  1543, -1009,   429,  -170,   472, -1265,
+     2176, -2707,  2523, -1622,   344,   826, -1530,  1724, -1658,  1701,
+    -2064,  2644, -3061,  2897, -1979,   557,   780, -1370,   842,   655,
+  },
+  { // no spreading (for voiced speech)
+    32767, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  }
+};
+#endif /* AVCODEC_G729DATA_H */
diff --git a/libavcodec/g729dec.c b/libavcodec/g729dec.c
new file mode 100644
index 0000000..2e1bf18
--- /dev/null
+++ b/libavcodec/g729dec.c
@@ -0,0 +1,718 @@
+/*
+ * G.729, G729 Annex D decoders
+ * Copyright (c) 2008 Vladimir Voroshilov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <inttypes.h>
+#include <string.h>
+
+#include "avcodec.h"
+#include "libavutil/avutil.h"
+#include "get_bits.h"
+#include "audiodsp.h"
+#include "internal.h"
+
+
+#include "g729.h"
+#include "lsp.h"
+#include "celp_math.h"
+#include "celp_filters.h"
+#include "acelp_filters.h"
+#include "acelp_pitch_delay.h"
+#include "acelp_vectors.h"
+#include "g729data.h"
+#include "g729postfilter.h"
+
+/**
+ * minimum quantized LSF value (3.2.4)
+ * 0.005 in Q13
+ */
+#define LSFQ_MIN                   40
+
+/**
+ * maximum quantized LSF value (3.2.4)
+ * 3.135 in Q13
+ */
+#define LSFQ_MAX                   25681
+
+/**
+ * minimum LSF distance (3.2.4)
+ * 0.0391 in Q13
+ */
+#define LSFQ_DIFF_MIN              321
+
+/// interpolation filter length
+#define INTERPOL_LEN              11
+
+/**
+ * minimum gain pitch value (3.8, Equation 47)
+ * 0.2 in (1.14)
+ */
+#define SHARP_MIN                  3277
+
+/**
+ * maximum gain pitch value (3.8, Equation 47)
+ * (EE) This does not comply with the specification.
+ * Specification says about 0.8, which should be
+ * 13107 in (1.14), but reference C code uses
+ * 13017 (equals to 0.7945) instead of it.
+ */
+#define SHARP_MAX                  13017
+
+/**
+ * MR_ENERGY (mean removed energy) = mean_energy + 10 * log10(2^26  * subframe_size) in (7.13)
+ */
+#define MR_ENERGY 1018156
+
+#define DECISION_NOISE        0
+#define DECISION_INTERMEDIATE 1
+#define DECISION_VOICE        2
+
+typedef enum {
+    FORMAT_G729_8K = 0,     ///< G.729 8k format
+    FORMAT_G729D_6K4,       ///< G.729 Annex D 6.4k format
+    FORMAT_COUNT,           ///< number of formats listed above
+} G729Formats;
+
+typedef struct {
+    uint8_t ac_index_bits[2];   ///< adaptive codebook index sizes in bits (two entries, presumably first and second subframe)
+    uint8_t parity_bit;         ///< parity bit for pitch delay
+    uint8_t gc_1st_index_bits;  ///< gain codebook (first stage) index (size in bits)
+    uint8_t gc_2nd_index_bits;  ///< gain codebook (second stage) index (size in bits)
+    uint8_t fc_signs_bits;      ///< number of pulses in fixed-codebook vector
+    uint8_t fc_indexes_bits;    ///< size (in bits) of fixed-codebook index entry
+} G729FormatDescription;
+
+typedef struct { /* decoder state, kept in AVCodecContext.priv_data */
+    AudioDSPContext adsp;
+
+    /// past excitation signal buffer
+    int16_t exc_base[2*SUBFRAME_SIZE+PITCH_DELAY_MAX+INTERPOL_LEN];
+
+    int16_t* exc;               ///< start of past excitation data in buffer
+    int pitch_delay_int_prev;   ///< integer part of previous subframe's pitch delay (4.1.3)
+
+    /// (2.13) LSP quantizer outputs
+    int16_t  past_quantizer_output_buf[MA_NP + 1][10];
+    int16_t* past_quantizer_outputs[MA_NP + 1]; ///< row pointers (presumably into the buffer above); entry MA_NP is written by lsf_decode()
+
+    int16_t lsfq[10];           ///< (2.13) quantized LSF coefficients from previous frame
+    int16_t lsp_buf[2][10];     ///< (0.15) LSP coefficients (previous and current frames) (3.2.5)
+    int16_t *lsp[2];            ///< pointers to lsp_buf
+
+    int16_t quant_energy[4];    ///< (5.10) past quantized energy
+
+    /// previous speech data for LP synthesis filter
+    int16_t syn_filter_data[10];
+
+
+    /// residual signal buffer (used in long-term postfilter)
+    int16_t residual[SUBFRAME_SIZE + RES_PREV_DATA_SIZE];
+
+    /// previous speech data for residual calculation filter
+    int16_t res_filter_data[SUBFRAME_SIZE+10];
+
+    /// previous speech data for short-term postfilter
+    int16_t pos_filter_data[SUBFRAME_SIZE+10];
+
+    /// (1.14) pitch gain of current and five previous subframes
+    int16_t past_gain_pitch[6];
+
+    /// (14.1) gain code from current and previous subframe
+    int16_t past_gain_code[2];
+
+    /// voice decision on previous subframe (0-noise, 1-intermediate, 2-voice), G.729D
+    int16_t voice_decision;
+
+    int16_t onset;              ///< detected onset level (0-2)
+    int16_t was_periodic;       ///< whether previous frame was declared as periodic or not (4.4)
+    int16_t ht_prev_data;       ///< previous data for 4.2.3, equation 86
+    int gain_coeff;             ///< (1.14) gain coefficient (4.2.4)
+    uint16_t rand_value;        ///< random number generator value (4.4.4)
+    int ma_predictor_prev;      ///< switched MA predictor of LSP quantizer from last good frame
+
+    /// (14.14) high-pass filter data (past input)
+    int hpf_f[2];
+
+    /// high-pass filter data (past output)
+    int16_t hpf_z[2];
+}  G729Context;
+
+static const G729FormatDescription format_g729_8k = { /* field sizes for the G.729 8k format */
+    .ac_index_bits     = {8,5},
+    .parity_bit        = 1,
+    .gc_1st_index_bits = GC_1ST_IDX_BITS_8K,
+    .gc_2nd_index_bits = GC_2ND_IDX_BITS_8K,
+    .fc_signs_bits     = 4,
+    .fc_indexes_bits   = 13,
+};
+
+static const G729FormatDescription format_g729d_6k4 = { /* field sizes for the G.729 Annex D 6.4k format */
+    .ac_index_bits     = {8,4},
+    .parity_bit        = 0,
+    .gc_1st_index_bits = GC_1ST_IDX_BITS_6K4,
+    .gc_2nd_index_bits = GC_2ND_IDX_BITS_6K4,
+    .fc_signs_bits     = 2,
+    .fc_indexes_bits   = 9,
+};
+
+/** @brief pseudo random number generator (16-bit linear congruential step) */
+static inline uint16_t g729_prng(uint16_t value)
+{
+    const unsigned next = 31821u * value + 13849u;
+
+    return next & 0xFFFF;
+}
+
+/**
+ * Decodes LSF (Line Spectral Frequencies) from L0-L3 (3.2.4).
+ * @param[out] lsfq (2.13) quantized LSF coefficients
+ * @param[in,out] past_quantizer_outputs (2.13) previous quantizer outputs; entry MA_NP is overwritten
+ * @param ma_predictor switched MA predictor of LSP quantizer (selects the predictor tables)
+ * @param vq_1st first stage vector of quantizer (row of cb_lsp_1st)
+ * @param vq_2nd_low second stage lower vector of LSP quantizer (row of cb_lsp_2nd, elements 0..4)
+ * @param vq_2nd_high second stage higher vector of LSP quantizer (row of cb_lsp_2nd, elements 5..9)
+ */
+static void lsf_decode(int16_t* lsfq, int16_t* past_quantizer_outputs[MA_NP + 1],
+                       int16_t ma_predictor,
+                       int16_t vq_1st, int16_t vq_2nd_low, int16_t vq_2nd_high)
+{
+    static const uint8_t min_distance[2]={10, 5}; //(2.13)
+    int16_t* cur = past_quantizer_outputs[MA_NP];
+    int pass, n, k;
+
+    for (n = 0; n < 10; n++) { /* first stage plus the matching second-stage half */
+        const int16_t* stage2 = cb_lsp_2nd[n < 5 ? vq_2nd_low : vq_2nd_high];
+        cur[n] = cb_lsp_1st[vq_1st][n] + stage2[n];
+    }
+
+    /* two passes pushing neighbouring entries apart when they are too close */
+    for (pass = 0; pass < 2; pass++) {
+        for (n = 1; n < 10; n++) {
+            int diff = (cur[n - 1] - cur[n] + min_distance[pass]) >> 1;
+            if (diff <= 0)
+                continue;
+            cur[n - 1] -= diff;
+            cur[n    ] += diff;
+        }
+    }
+
+    for (n = 0; n < 10; n++) { /* weighted sum of current and MA_NP past outputs */
+        int sum = cur[n] * cb_ma_predictor_sum[ma_predictor][n];
+        for (k = 0; k < MA_NP; k++)
+            sum += past_quantizer_outputs[k][n] * cb_ma_predictor[ma_predictor][k][n];
+        lsfq[n] = sum >> 15;
+    }
+
+    ff_acelp_reorder_lsf(lsfq, LSFQ_DIFF_MIN, LSFQ_MIN, LSFQ_MAX, 10);
+}
+
+/**
+ * Restores past LSP quantizer output using LSF from previous frame
+ * @param[in] lsfq (2.13) quantized LSF coefficients the output is rebuilt from
+ * @param[in,out] past_quantizer_outputs (2.13) quantizer outputs from previous frames;
+ *                entry MA_NP receives the restored output
+ * @param ma_predictor_prev MA predictor from previous frame
+ */
+static void lsf_restore_from_previous(int16_t* lsfq,
+                                      int16_t* past_quantizer_outputs[MA_NP + 1],
+                                      int ma_predictor_prev)
+{
+    int16_t* restored = past_quantizer_outputs[MA_NP];
+    int n, k;
+
+    for (n = 0; n < 10; n++) {
+        /* remove the weighted contribution of the MA_NP past outputs ... */
+        int acc = lsfq[n] << 15;
+        for (k = 0; k < MA_NP; k++)
+            acc -= past_quantizer_outputs[k][n] * cb_ma_predictor[ma_predictor_prev][k][n];
+        /* ... then scale by the (3.12) inverse of the remaining weight */
+        restored[n] = ((acc >> 15) * cb_ma_predictor_sum_inv[ma_predictor_prev][n]) >> 12;
+    }
+}
+
+/**
+ * Constructs new excitation signal and applies phase filter to it
+ * @param[out] out constructed speech signal
+ * @param in original excitation signal
+ * @param fc_cur (2.13) original fixed-codebook vector
+ * @param dstate row of phase_filter to apply (0..2)
+ * @param gain_code (14.1) gain code
+ * @param subframe_size length of the subframe (at most SUBFRAME_SIZE)
+ */
+static void g729d_get_new_exc(
+        int16_t* out,
+        const int16_t* in,
+        const int16_t* fc_cur,
+        int dstate,
+        int gain_code,
+        int subframe_size)
+{
+    int16_t filtered[SUBFRAME_SIZE];
+    int n;
+
+    ff_celp_convolve_circ(filtered, fc_cur, phase_filter[dstate], subframe_size);
+    /* swap the original codebook contribution for the phase-filtered one */
+    for (n = 0; n < subframe_size; n++) {
+        out[n]  = in[n];
+        out[n] -= (gain_code * fc_cur[n]   + 0x2000) >> 14;
+        out[n] += (gain_code * filtered[n] + 0x2000) >> 14;
+    }
+}
+
+/**
+ * Decides whether the current subframe is an onset
+ * @param past_onset decision result of previous subframe
+ * @param past_gain_code gain code of current ([0]) and previous ([1]) subframe
+ * @return 2 when half of the current gain code exceeds the previous one,
+ *         otherwise past_onset decreased by one but never below 0
+ */
+static int g729d_onset_decision(int past_onset, const int16_t* past_gain_code)
+{
+    const int half_cur_gain = past_gain_code[0] >> 1;
+
+    if (half_cur_gain <= past_gain_code[1])
+        return FFMAX(past_onset - 1, 0);
+    return 2;
+}
+
+/**
+ * Classifies the current subframe as voice, intermediate or noise
+ * @param onset onset level; any non-zero value counts as an onset
+ * @param prev_voice_decision voice decision result from previous subframe
+ * @param past_gain_pitch pitch gain of current ([0]) and five previous subframes
+ *
+ * @return voice decision result for current subframe
+ */
+static int16_t g729d_voice_decision(int onset, int prev_voice_decision, const int16_t* past_gain_pitch)
+{
+    const int cur_gain = past_gain_pitch[0];
+    int decision = DECISION_INTERMEDIATE;
+    int weak_cnt = 0, n;
+
+    if (cur_gain >= 14745)               // 0.9
+        decision = DECISION_VOICE;
+    else if (cur_gain <= 9830)           // 0.6
+        decision = DECISION_NOISE;
+
+    for (n = 0; n < 6; n++)              // count subframes with a pitch gain below 0.6
+        weak_cnt += past_gain_pitch[n] < 9830;
+
+    if (onset) {
+        if (decision < DECISION_VOICE)
+            decision++;
+    } else {
+        if (weak_cnt > 2)
+            decision = DECISION_NOISE;
+        if (decision > prev_voice_decision + 1)
+            decision--;
+    }
+
+    return decision;
+}
+
+static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
+{
+    /* Plain C dot product over the first order elements of v1 and v2. */
+    int acc = 0;
+
+    for (; order; order--, v1++, v2++)
+        acc += *v1 * *v2;
+    return acc;
+}
+
+/** Rejects non-mono streams and sets up the initial decoder state in avctx->priv_data. */
+static av_cold int decoder_init(AVCodecContext * avctx)
+{
+    G729Context* ctx = avctx->priv_data;
+    int row, n;
+
+    if (avctx->channels != 1) {
+        av_log(avctx, AV_LOG_ERROR, "Only mono sound is supported (requested channels: %d).\n", avctx->channels);
+        return AVERROR(EINVAL);
+    }
+    avctx->sample_fmt = AV_SAMPLE_FMT_S16;
+
+    /* A frame holds two subframes in the 8kbit/s as well as the 6.4kbit/s mode. */
+    avctx->frame_size = SUBFRAME_SIZE << 1;
+
+    ctx->gain_coeff = 16384; // 1.0 in (1.14)
+
+    for (row = 0; row <= MA_NP; row++) {
+        ctx->past_quantizer_outputs[row] = ctx->past_quantizer_output_buf[row];
+        for (n = 0; n < 10; n++)
+            ctx->past_quantizer_outputs[row][n] = (18717 * (n + 1)) >> 3;
+    }
+
+    ctx->lsp[0] = ctx->lsp_buf[0];
+    ctx->lsp[1] = ctx->lsp_buf[1];
+    memcpy(ctx->lsp[0], lsp_init, 10 * sizeof(int16_t));
+
+    ctx->exc = &ctx->exc_base[PITCH_DELAY_MAX+INTERPOL_LEN];
+    ctx->pitch_delay_int_prev = PITCH_DELAY_MIN;
+
+    /* start value of the pseudo-random sequence */
+    ctx->rand_value = 21845;
+
+    /* quantized prediction error */
+    for (n = 0; n < 4; n++)
+        ctx->quant_energy[n] = -14336; // -14 in (5.10)
+
+    ff_audiodsp_init(&ctx->adsp);
+    ctx->adsp.scalarproduct_int16 = scalarproduct_int16_c;
+
+    return 0;
+}
+
+/**
+ * Decodes the first G.729 (10 bytes) or G.729D (8 bytes) frame of the packet into
+ * 2 * SUBFRAME_SIZE samples; returns the bytes consumed or a negative error code.
+ */
+static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame_ptr,
+                        AVPacket *avpkt)
+{
+    const uint8_t *buf = avpkt->data;
+    int buf_size       = avpkt->size;
+    int16_t *out_frame, *tmp;
+    GetBitContext gb;
+    const G729FormatDescription *format;
+    int frame_erasure = 0;    ///< frame erasure detected during decoding
+    int bad_pitch = 0;        ///< parity check failed
+    int i, j, ret;
+    G729Formats packet_type;
+    G729Context *ctx = avctx->priv_data;
+    int16_t lp[2][11];           // (3.12)
+    uint8_t ma_predictor;     ///< switched MA predictor of LSP quantizer
+    uint8_t quantizer_1st;    ///< first stage vector of quantizer
+    uint8_t quantizer_2nd_lo; ///< second stage lower vector of quantizer (size in bits)
+    uint8_t quantizer_2nd_hi; ///< second stage higher vector of quantizer (size in bits)
+    int pitch_delay_int[2];      // pitch delay, integer part
+    int pitch_delay_3x;          // pitch delay, multiplied by 3
+    int16_t fc[SUBFRAME_SIZE];   // fixed-codebook vector
+    int16_t synth[SUBFRAME_SIZE+10]; // 10 samples of filter memory followed by the synthesized subframe
+    int gain_before, gain_after;
+    int is_periodic = 0;         // whether one of the subframes is declared as periodic or not
+    AVFrame *frame = data;
+
+    frame->nb_samples = SUBFRAME_SIZE<<1;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+    out_frame = (int16_t*) frame->data[0];
+
+    if (buf_size && buf_size % 10 == 0) { // an empty packet must not be parsed as an 8kbit/s frame
+        packet_type = FORMAT_G729_8K;
+        format = &format_g729_8k;
+        //Reset voice decision
+        ctx->onset = 0;
+        ctx->voice_decision = DECISION_VOICE;
+        av_log(avctx, AV_LOG_DEBUG, "Packet type: %s\n", "G.729 @ 8kbit/s");
+    } else if (buf_size == 8) {
+        packet_type = FORMAT_G729D_6K4;
+        format = &format_g729d_6k4;
+        av_log(avctx, AV_LOG_DEBUG, "Packet type: %s\n", "G.729D @ 6.4kbit/s");
+    } else {
+        av_log(avctx, AV_LOG_ERROR, "Packet size %d is unknown.\n", buf_size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    for (i = 0; i < (packet_type == FORMAT_G729_8K ? 10 : 8); i++) // bytes of this frame only
+        frame_erasure |= buf[i];
+    frame_erasure = !frame_erasure; // a frame made of zero bytes only is treated as erased
+
+    init_get_bits(&gb, buf, 8*buf_size);
+
+    ma_predictor     = get_bits(&gb, 1);
+    quantizer_1st    = get_bits(&gb, VQ_1ST_BITS);
+    quantizer_2nd_lo = get_bits(&gb, VQ_2ND_BITS);
+    quantizer_2nd_hi = get_bits(&gb, VQ_2ND_BITS);
+
+    if(frame_erasure)
+        lsf_restore_from_previous(ctx->lsfq, ctx->past_quantizer_outputs,
+                                  ctx->ma_predictor_prev);
+    else {
+        lsf_decode(ctx->lsfq, ctx->past_quantizer_outputs,
+                   ma_predictor,
+                   quantizer_1st, quantizer_2nd_lo, quantizer_2nd_hi);
+        ctx->ma_predictor_prev = ma_predictor;
+    }
+
+    /* Rotate the row pointers: the row at index MA_NP becomes the new row 0. */
+    tmp = ctx->past_quantizer_outputs[MA_NP];
+    memmove(ctx->past_quantizer_outputs + 1, ctx->past_quantizer_outputs,
+            MA_NP * sizeof(int16_t*));
+    ctx->past_quantizer_outputs[0] = tmp;
+
+    ff_acelp_lsf2lsp(ctx->lsp[1], ctx->lsfq, 10);
+
+    ff_acelp_lp_decode(&lp[0][0], &lp[1][0], ctx->lsp[1], ctx->lsp[0], 10);
+
+    FFSWAP(int16_t*, ctx->lsp[1], ctx->lsp[0]);
+
+    for (i = 0; i < 2; i++) {
+        int gain_corr_factor;
+
+        uint8_t ac_index;      ///< adaptive codebook index
+        uint8_t pulses_signs;  ///< fixed-codebook vector pulse signs
+        int fc_indexes;        ///< fixed-codebook indexes
+        uint8_t gc_1st_index;  ///< gain codebook (first stage) index
+        uint8_t gc_2nd_index;  ///< gain codebook (second stage) index
+
+        ac_index      = get_bits(&gb, format->ac_index_bits[i]);
+        if(!i && format->parity_bit)
+            bad_pitch = av_parity(ac_index >> 2) == get_bits1(&gb);
+        fc_indexes    = get_bits(&gb, format->fc_indexes_bits);
+        pulses_signs  = get_bits(&gb, format->fc_signs_bits);
+        gc_1st_index  = get_bits(&gb, format->gc_1st_index_bits);
+        gc_2nd_index  = get_bits(&gb, format->gc_2nd_index_bits);
+
+        if (frame_erasure)
+            pitch_delay_3x   = 3 * ctx->pitch_delay_int_prev;
+        else if(!i) {
+            if (bad_pitch)
+                pitch_delay_3x   = 3 * ctx->pitch_delay_int_prev;
+            else
+                pitch_delay_3x = ff_acelp_decode_8bit_to_1st_delay3(ac_index);
+        } else {
+            int pitch_delay_min = av_clip(ctx->pitch_delay_int_prev - 5,
+                                          PITCH_DELAY_MIN, PITCH_DELAY_MAX - 9);
+
+            if(packet_type == FORMAT_G729D_6K4)
+                pitch_delay_3x = ff_acelp_decode_4bit_to_2nd_delay3(ac_index, pitch_delay_min);
+            else
+                pitch_delay_3x = ff_acelp_decode_5_6_bit_to_2nd_delay3(ac_index, pitch_delay_min);
+        }
+
+        /* Round pitch delay to nearest (used everywhere except ff_acelp_interpolate). */
+        pitch_delay_int[i]  = (pitch_delay_3x + 1) / 3;
+        if (pitch_delay_int[i] > PITCH_DELAY_MAX) {
+            av_log(avctx, AV_LOG_WARNING, "pitch_delay_int %d is too large\n", pitch_delay_int[i]);
+            pitch_delay_int[i] = PITCH_DELAY_MAX;
+        }
+
+        if (frame_erasure) {
+            /* The transmitted codebook parameters are replaced by pseudo-random ones. */
+            ctx->rand_value = g729_prng(ctx->rand_value);
+            fc_indexes   = av_mod_uintp2(ctx->rand_value, format->fc_indexes_bits);
+
+            ctx->rand_value = g729_prng(ctx->rand_value);
+            pulses_signs = ctx->rand_value;
+        }
+
+        memset(fc, 0, sizeof(int16_t) * SUBFRAME_SIZE);
+        switch (packet_type) {
+            case FORMAT_G729_8K:
+                ff_acelp_fc_pulse_per_track(fc, ff_fc_4pulses_8bits_tracks_13,
+                                            ff_fc_4pulses_8bits_track_4,
+                                            fc_indexes, pulses_signs, 3, 3);
+                break;
+            case FORMAT_G729D_6K4:
+                ff_acelp_fc_pulse_per_track(fc, ff_fc_2pulses_9bits_track1_gray,
+                                            ff_fc_2pulses_9bits_track2_gray,
+                                            fc_indexes, pulses_signs, 1, 4);
+                break;
+        }
+
+        /*
+          This filter enhances harmonic components of the fixed-codebook vector to
+          improve the quality of the reconstructed speech.
+
+                     / fc_v[i],                                    i < pitch_delay
+          fc_v[i] = <
+                     \ fc_v[i] + gain_pitch * fc_v[i-pitch_delay], i >= pitch_delay
+        */
+        if (pitch_delay_int[i] < SUBFRAME_SIZE) // otherwise nothing to enhance and fc + delay would leave fc[]
+            ff_acelp_weighted_vector_sum(fc + pitch_delay_int[i],
+                                         fc + pitch_delay_int[i], fc, 1 << 14,
+                                         av_clip(ctx->past_gain_pitch[0], SHARP_MIN, SHARP_MAX),
+                                         0, 14,
+                                         SUBFRAME_SIZE - pitch_delay_int[i]);
+
+        memmove(ctx->past_gain_pitch+1, ctx->past_gain_pitch, 5 * sizeof(int16_t));
+        ctx->past_gain_code[1] = ctx->past_gain_code[0];
+
+        if (frame_erasure) {
+            ctx->past_gain_pitch[0] = (29491 * ctx->past_gain_pitch[0]) >> 15; // 0.90 (0.15)
+            ctx->past_gain_code[0]  = ( 2007 * ctx->past_gain_code[0] ) >> 11; // 0.98 (0.11)
+
+            gain_corr_factor = 0;
+        } else {
+            if (packet_type == FORMAT_G729D_6K4) {
+                ctx->past_gain_pitch[0]  = cb_gain_1st_6k4[gc_1st_index][0] +
+                                           cb_gain_2nd_6k4[gc_2nd_index][0];
+                gain_corr_factor = cb_gain_1st_6k4[gc_1st_index][1] +
+                                   cb_gain_2nd_6k4[gc_2nd_index][1];
+
+                /* Without the check below an overflow can occur in ff_acelp_update_past_gain.
+                   It is not an issue for G.729, because gain_corr_factor in its case is always
+                   greater than 1024, while in G.729D it can even be zero. */
+                gain_corr_factor = FFMAX(gain_corr_factor, 1024);
+#ifndef G729_BITEXACT
+                gain_corr_factor >>= 1;
+#endif
+            } else {
+                ctx->past_gain_pitch[0]  = cb_gain_1st_8k[gc_1st_index][0] +
+                                           cb_gain_2nd_8k[gc_2nd_index][0];
+                gain_corr_factor = cb_gain_1st_8k[gc_1st_index][1] +
+                                   cb_gain_2nd_8k[gc_2nd_index][1];
+            }
+
+            /* Decode the fixed-codebook gain. */
+            ctx->past_gain_code[0] = ff_acelp_decode_gain_code(&ctx->adsp, gain_corr_factor,
+                                                               fc, MR_ENERGY,
+                                                               ctx->quant_energy,
+                                                               ma_prediction_coeff,
+                                                               SUBFRAME_SIZE, 4);
+#ifdef G729_BITEXACT
+            /*
+              This correction is required to get a bit-exact result with the
+              reference code, because gain_corr_factor in G.729D is
+              two times larger than in original G.729.
+
+              If a bit-exact result is not an issue then gain_corr_factor
+              can simply be divided by 2 before the call to g729_get_gain_code
+              instead of using the correction below.
+            */
+            if (packet_type == FORMAT_G729D_6K4) {
+                gain_corr_factor >>= 1;
+                ctx->past_gain_code[0] >>= 1;
+            }
+#endif
+        }
+        ff_acelp_update_past_gain(ctx->quant_energy, gain_corr_factor, 2, frame_erasure);
+
+        /* Routine requires rounding to lowest. */
+        ff_acelp_interpolate(ctx->exc + i * SUBFRAME_SIZE,
+                             ctx->exc + i * SUBFRAME_SIZE - pitch_delay_3x / 3,
+                             ff_acelp_interp_filter, 6,
+                             (pitch_delay_3x % 3) << 1,
+                             10, SUBFRAME_SIZE);
+
+        ff_acelp_weighted_vector_sum(ctx->exc + i * SUBFRAME_SIZE,
+                                     ctx->exc + i * SUBFRAME_SIZE, fc,
+                                     (!ctx->was_periodic && frame_erasure) ? 0 : ctx->past_gain_pitch[0],
+                                     ( ctx->was_periodic && frame_erasure) ? 0 : ctx->past_gain_code[0],
+                                     1 << 13, 14, SUBFRAME_SIZE);
+
+        memcpy(synth, ctx->syn_filter_data, 10 * sizeof(int16_t));
+
+        if (ff_celp_lp_synthesis_filter(
+            synth+10,
+            &lp[i][1],
+            ctx->exc  + i * SUBFRAME_SIZE,
+            SUBFRAME_SIZE,
+            10,
+            1,
+            0,
+            0x800))
+            /* Overflow occurred, downscale excitation signal... */
+            for (j = 0; j < 2 * SUBFRAME_SIZE + PITCH_DELAY_MAX + INTERPOL_LEN; j++)
+                ctx->exc_base[j] >>= 2;
+
+        /* ... and make synthesis again. */
+        if (packet_type == FORMAT_G729D_6K4) {
+            int16_t exc_new[SUBFRAME_SIZE];
+
+            ctx->onset = g729d_onset_decision(ctx->onset, ctx->past_gain_code);
+            ctx->voice_decision = g729d_voice_decision(ctx->onset, ctx->voice_decision, ctx->past_gain_pitch);
+
+            g729d_get_new_exc(exc_new, ctx->exc  + i * SUBFRAME_SIZE, fc, ctx->voice_decision, ctx->past_gain_code[0], SUBFRAME_SIZE);
+
+            ff_celp_lp_synthesis_filter(
+                    synth+10,
+                    &lp[i][1],
+                    exc_new,
+                    SUBFRAME_SIZE,
+                    10,
+                    0,
+                    0,
+                    0x800);
+        } else {
+            ff_celp_lp_synthesis_filter(
+                    synth+10,
+                    &lp[i][1],
+                    ctx->exc  + i * SUBFRAME_SIZE,
+                    SUBFRAME_SIZE,
+                    10,
+                    0,
+                    0,
+                    0x800);
+        }
+        /* Save data (without postfilter) for use in next subframe. */
+        memcpy(ctx->syn_filter_data, synth+SUBFRAME_SIZE, 10 * sizeof(int16_t));
+
+        /* Calculate gain of unfiltered signal for use in AGC. */
+        gain_before = 0;
+        for (j = 0; j < SUBFRAME_SIZE; j++)
+            gain_before += FFABS(synth[j+10]);
+
+        /* Call postfilter and also update voicing decision for use in next frame. */
+        ff_g729_postfilter(
+                &ctx->adsp,
+                &ctx->ht_prev_data,
+                &is_periodic,
+                &lp[i][0],
+                pitch_delay_int[0],
+                ctx->residual,
+                ctx->res_filter_data,
+                ctx->pos_filter_data,
+                synth+10,
+                SUBFRAME_SIZE);
+
+        /* Calculate gain of filtered signal for use in AGC. */
+        gain_after = 0;
+        for(j=0; j<SUBFRAME_SIZE; j++)
+            gain_after += FFABS(synth[j+10]);
+
+        ctx->gain_coeff = ff_g729_adaptive_gain_control(
+                gain_before,
+                gain_after,
+                synth+10,
+                SUBFRAME_SIZE,
+                ctx->gain_coeff);
+
+        if (frame_erasure)
+            ctx->pitch_delay_int_prev = FFMIN(ctx->pitch_delay_int_prev + 1, PITCH_DELAY_MAX);
+        else
+            ctx->pitch_delay_int_prev = pitch_delay_int[i];
+
+        memcpy(synth+8, ctx->hpf_z, 2*sizeof(int16_t));
+        ff_acelp_high_pass_filter(
+                out_frame + i*SUBFRAME_SIZE,
+                ctx->hpf_f,
+                synth+10,
+                SUBFRAME_SIZE);
+        memcpy(ctx->hpf_z, synth+8+SUBFRAME_SIZE, 2*sizeof(int16_t));
+    }
+
+    ctx->was_periodic = is_periodic;
+
+    /* Save signal for use in next frame. */
+    memmove(ctx->exc_base, ctx->exc_base + 2 * SUBFRAME_SIZE, (PITCH_DELAY_MAX+INTERPOL_LEN)*sizeof(int16_t));
+
+    *got_frame_ptr = 1;
+    return packet_type == FORMAT_G729_8K ? 10 : 8;
+}
+
+AVCodec ff_g729_decoder = { // decoder descriptor tying the callbacks above to AV_CODEC_ID_G729
+    .name           = "g729",
+    .long_name      = NULL_IF_CONFIG_SMALL("G.729"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_G729,
+    .priv_data_size = sizeof(G729Context), // state that decoder_init/decode_frame reach via avctx->priv_data
+    .init           = decoder_init,
+    .decode         = decode_frame, // consumes one 10 or 8 byte frame per call
+    .capabilities   = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/g729postfilter.c b/libavcodec/g729postfilter.c
new file mode 100644
index 0000000..d9076ec
--- /dev/null
+++ b/libavcodec/g729postfilter.c
@@ -0,0 +1,614 @@
+/*
+ * G.729, G729 Annex D postfilter
+ * Copyright (c) 2008 Vladimir Voroshilov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include <inttypes.h>
+#include <limits.h>
+
+#include "avcodec.h"
+#include "g729.h"
+#include "acelp_pitch_delay.h"
+#include "g729postfilter.h"
+#include "celp_math.h"
+#include "acelp_filters.h"
+#include "acelp_vectors.h"
+#include "celp_filters.h"
+
+#define FRAC_BITS 15
+#include "mathops.h"
+
+/**
+ * Short interpolation filter (of length 33, according to spec) for computing
+ * a signal with non-integer delay; used for the fractional delay search.
+ */
+static const int16_t ff_g729_interp_filt_short[(ANALYZED_FRAC_DELAYS+1)*SHORT_INT_FILT_LEN] = {
+      0, 31650, 28469, 23705, 18050, 12266,  7041,  2873,
+      0, -1597, -2147, -1992, -1492,  -933,  -484,  -188,
+};
+
+/**
+ * Long interpolation filter (of length 129, according to spec) for computing
+ * a signal with non-integer delay; used to recompute the best delayed signal.
+ */
+static const int16_t ff_g729_interp_filt_long[(ANALYZED_FRAC_DELAYS+1)*LONG_INT_FILT_LEN] = {
+   0, 31915, 29436, 25569, 20676, 15206,  9639,  4439,
+   0, -3390, -5579, -6549, -6414, -5392, -3773, -1874,
+   0,  1595,  2727,  3303,  3319,  2850,  2030,  1023,
+   0,  -887, -1527, -1860, -1876, -1614, -1150,  -579,
+   0,   501,   859,  1041,  1044,   892,   631,   315,
+   0,  -266,  -453,  -543,  -538,  -455,  -317,  -156,
+   0,   130,   218,   258,   253,   212,   147,    72,
+   0,   -59,  -101,  -122,  -123,  -106,   -77,   -40,
+};
+
+/**
+ * formant_pp_factor_num_pow[i] = FORMANT_PP_FACTOR_NUM^(i+1)
+ */
+static const int16_t formant_pp_factor_num_pow[10]= {
+  /* (0.15), the entries are successive powers of about 0.55 */
+  18022, 9912, 5451, 2998, 1649, 907, 499, 274, 151, 83
+};
+
+/**
+ * formant_pp_factor_den_pow[i] = FORMANT_PP_FACTOR_DEN^(i+1)
+ */
+static const int16_t formant_pp_factor_den_pow[10] = {
+  /* (0.15), the entries are successive powers of about 0.70 */
+  22938, 16057, 11240, 7868, 5508, 3856, 2699, 1889, 1322, 925
+};
+
+/**
+ * \brief Residual signal calculation (4.2.1 of G.729)
+ * \param out [out] output data filtered through A(z/FORMANT_PP_FACTOR_NUM)
+ * \param filter_coeffs (3.12) A(z/FORMANT_PP_FACTOR_NUM) filter coefficients
+ * \param in input speech data to process
+ * \param subframe_size size of one subframe
+ *
+ * \note The 10 samples preceding in[0] are read as filter history.
+ * \remark out may alias in: samples are produced from the last one backwards.
+ */
+static void residual_filter(int16_t* out, const int16_t* filter_coeffs, const int16_t* in,
+                            int subframe_size)
+{
+    int tap, pos;
+
+    for (pos = subframe_size; pos-- > 0; ) {
+        int acc = 0x800; /* half of 1 << 12, rounds the shift below */
+        for (tap = 1; tap <= 10; tap++)
+            acc += filter_coeffs[tap - 1] * in[pos - tap];
+
+        out[pos] = in[pos] + (acc >> 12);
+    }
+}
+
+/**
+ * \brief long-term postfilter (4.2.1)
+ * \param adsp initialized DSP context
+ * \param pitch_delay_int integer part of the pitch delay in the first subframe
+ * \param residual filtering input data
+ * \param residual_filt [out] speech signal with applied A(z/FORMANT_PP_FACTOR_NUM) filter
+ * \param subframe_size size of subframe
+ *
+ * \return 0 if long-term prediction gain is less than 3dB, 1 -  otherwise
+ */
+static int16_t long_term_filter(AudioDSPContext *adsp, int pitch_delay_int,
+                                const int16_t* residual, int16_t *residual_filt,
+                                int subframe_size)
+{
+    int i, k, tmp, tmp2;
+    int sum;
+    int L_temp0;
+    int L_temp1;
+    int64_t L64_temp0;
+    int64_t L64_temp1;
+    int16_t shift; // scaling applied to residual: right shift if positive, left shift if not
+    int corr_int_num, corr_int_den;
+
+    int ener;
+    int16_t sh_ener;
+
+    int16_t gain_num,gain_den; //selected signal's gain numerator and denominator
+    int16_t sh_gain_num, sh_gain_den;
+    int gain_num_square;
+
+    int16_t gain_long_num,gain_long_den; //filtered through long interpolation filter signal's gain numerator and denominator
+    int16_t sh_gain_long_num, sh_gain_long_den;
+
+    int16_t best_delay_int, best_delay_frac;
+
+    int16_t delayed_signal_offset;
+    int lt_filt_factor_a, lt_filt_factor_b;
+
+    int16_t * selected_signal;
+    const int16_t * selected_signal_const; //Necessary to avoid compiler warning
+
+    int16_t sig_scaled[SUBFRAME_SIZE + RES_PREV_DATA_SIZE];
+    int16_t delayed_signal[ANALYZED_FRAC_DELAYS][SUBFRAME_SIZE+1];
+    int corr_den[ANALYZED_FRAC_DELAYS][2];
+
+    tmp = 0;
+    for(i=0; i<subframe_size + RES_PREV_DATA_SIZE; i++)
+        tmp |= FFABS(residual[i]);
+
+    if(!tmp)
+        shift = 3;
+    else
+        shift = av_log2(tmp) - 11;
+
+    if (shift > 0)
+        for (i = 0; i < subframe_size + RES_PREV_DATA_SIZE; i++)
+            sig_scaled[i] = residual[i] >> shift;
+    else
+        for (i = 0; i < subframe_size + RES_PREV_DATA_SIZE; i++)
+            sig_scaled[i] = residual[i] * (1 << -shift); // multiply: left-shifting a negative sample is undefined
+
+    /* Start of best delay searching code */
+    gain_num = 0;
+
+    ener = adsp->scalarproduct_int16(sig_scaled + RES_PREV_DATA_SIZE,
+                                    sig_scaled + RES_PREV_DATA_SIZE,
+                                    subframe_size);
+    if (ener) {
+        sh_ener = av_log2(ener) - 14;
+        sh_ener = FFMAX(sh_ener, 0);
+        ener >>= sh_ener;
+        /* Search for best pitch delay.
+
+                       sum{ r(n) * r(k,n) }^2
+           R'(k)^2 := -------------------------
+                       sum{ r(k,n) * r(k,n) }
+
+
+           R(T)    :=  sum{ r(n) * r(n-T) }
+
+
+           where
+           r(n-T) is integer delayed signal with delay T
+           r(k,n) is non-integer delayed signal with integer delay best_delay
+           and fractional delay k */
+
+        /* Find integer delay best_delay which maximizes correlation R(T).
+
+           This also equals the numerator of R'(0),
+           since the fine search (second step) is done with 1/8
+           precision around best_delay. */
+        corr_int_num = 0;
+        best_delay_int = pitch_delay_int - 1;
+        for (i = pitch_delay_int - 1; i <= pitch_delay_int + 1; i++) {
+            sum = adsp->scalarproduct_int16(sig_scaled + RES_PREV_DATA_SIZE,
+                                           sig_scaled + RES_PREV_DATA_SIZE - i,
+                                           subframe_size);
+            if (sum > corr_int_num) {
+                corr_int_num = sum;
+                best_delay_int = i;
+            }
+        }
+        if (corr_int_num) {
+            /* Compute denominator of pseudo-normalized correlation R'(0). */
+            corr_int_den = adsp->scalarproduct_int16(sig_scaled - best_delay_int + RES_PREV_DATA_SIZE,
+                                                    sig_scaled - best_delay_int + RES_PREV_DATA_SIZE,
+                                                    subframe_size);
+
+            /* Compute signals with non-integer delay k (with 1/8 precision),
+               where k is in [0;6] range.
+               Entire delay is equal to best_delay+(k+1)/8
+               This is achieved by applying an interpolation filter of
+               length 33 to source signal. */
+            for (k = 0; k < ANALYZED_FRAC_DELAYS; k++) {
+                ff_acelp_interpolate(&delayed_signal[k][0],
+                                     &sig_scaled[RES_PREV_DATA_SIZE - best_delay_int],
+                                     ff_g729_interp_filt_short,
+                                     ANALYZED_FRAC_DELAYS+1,
+                                     8 - k - 1,
+                                     SHORT_INT_FILT_LEN,
+                                     subframe_size + 1);
+            }
+
+            /* Compute denominator of pseudo-normalized correlation R'(k).
+
+                 corr_den[k][0] is square root of R'(k) denominator, for int(T) == int(T0)
+                 corr_den[k][1] is square root of R'(k) denominator, for int(T) == int(T0)+1
+
+              Also compute maximum value of above denominators over all k. */
+            tmp = corr_int_den;
+            for (k = 0; k < ANALYZED_FRAC_DELAYS; k++) {
+                sum = adsp->scalarproduct_int16(&delayed_signal[k][1],
+                                               &delayed_signal[k][1],
+                                               subframe_size - 1);
+                corr_den[k][0] = sum + delayed_signal[k][0            ] * delayed_signal[k][0            ];
+                corr_den[k][1] = sum + delayed_signal[k][subframe_size] * delayed_signal[k][subframe_size];
+
+                tmp = FFMAX3(tmp, corr_den[k][0], corr_den[k][1]);
+            }
+
+            sh_gain_den = av_log2(tmp) - 14;
+            if (sh_gain_den >= 0) {
+
+                sh_gain_num =  FFMAX(sh_gain_den, sh_ener);
+                /* Loop through all k and find delay that maximizes
+                   R'(k) correlation.
+                   Search is done in [int(T0)-1; int(T0)+1] range
+                   with 1/8 precision. */
+                delayed_signal_offset = 1;
+                best_delay_frac = 0;
+                gain_den = corr_int_den >> sh_gain_den;
+                gain_num = corr_int_num >> sh_gain_num;
+                gain_num_square = gain_num * gain_num;
+                for (k = 0; k < ANALYZED_FRAC_DELAYS; k++) {
+                    for (i = 0; i < 2; i++) {
+                        int16_t gain_num_short, gain_den_short;
+                        int gain_num_short_square;
+                        /* Compute numerator of pseudo-normalized
+                           correlation R'(k). */
+                        sum = adsp->scalarproduct_int16(&delayed_signal[k][i],
+                                                       sig_scaled + RES_PREV_DATA_SIZE,
+                                                       subframe_size);
+                        gain_num_short = FFMAX(sum >> sh_gain_num, 0);
+
+                        /*
+                                      gain_num_short_square                gain_num_square
+                           R'(T)^2 = -----------------------, max R'(T)^2= --------------
+                                           den                                 gain_den
+                        */
+                        gain_num_short_square = gain_num_short * gain_num_short;
+                        gain_den_short = corr_den[k][i] >> sh_gain_den;
+
+                        tmp = MULL(gain_num_short_square, gain_den, FRAC_BITS);
+                        tmp2 = MULL(gain_num_square, gain_den_short, FRAC_BITS);
+
+                        // R'(T)^2 > max R'(T)^2
+                        if (tmp > tmp2) {
+                            gain_num = gain_num_short;
+                            gain_den = gain_den_short;
+                            gain_num_square = gain_num_short_square;
+                            delayed_signal_offset = i;
+                            best_delay_frac = k + 1;
+                        }
+                    }
+                }
+
+                /*
+                       R'(T)^2
+                  2 * --------- < 1
+                        R(0)
+                */
+                L64_temp0 =  (int64_t)gain_num_square  << ((sh_gain_num << 1) + 1);
+                L64_temp1 = ((int64_t)gain_den * ener) << (sh_gain_den + sh_ener);
+                if (L64_temp0 < L64_temp1)
+                    gain_num = 0;
+            } // if(sh_gain_den >= 0)
+        } // if(corr_int_num)
+    } // if(ener)
+    /* End of best delay searching code  */
+
+    if (!gain_num) {
+        memcpy(residual_filt, residual + RES_PREV_DATA_SIZE, subframe_size * sizeof(int16_t));
+
+        /* Long-term prediction gain is less than 3dB. Long-term postfilter is disabled. */
+        return 0;
+    }
+    if (best_delay_frac) {
+        /* Recompute delayed signal with an interpolation filter of length 129. */
+        ff_acelp_interpolate(residual_filt,
+                             &sig_scaled[RES_PREV_DATA_SIZE - best_delay_int + delayed_signal_offset],
+                             ff_g729_interp_filt_long,
+                             ANALYZED_FRAC_DELAYS + 1,
+                             8 - best_delay_frac,
+                             LONG_INT_FILT_LEN,
+                             subframe_size + 1);
+        /* Compute R'(k) correlation's numerator. */
+        sum = adsp->scalarproduct_int16(residual_filt,
+                                       sig_scaled + RES_PREV_DATA_SIZE,
+                                       subframe_size);
+
+        if (sum < 0) {
+            gain_long_num = 0;
+            sh_gain_long_num = 0;
+        } else {
+            tmp = av_log2(sum) - 14;
+            tmp = FFMAX(tmp, 0);
+            sum >>= tmp;
+            gain_long_num = sum;
+            sh_gain_long_num = tmp;
+        }
+
+        /* Compute R'(k) correlation's denominator. */
+        sum = adsp->scalarproduct_int16(residual_filt, residual_filt, subframe_size);
+
+        tmp = av_log2(sum) - 14;
+        tmp = FFMAX(tmp, 0);
+        sum >>= tmp;
+        gain_long_den = sum;
+        sh_gain_long_den = tmp;
+
+        /* Select between original and delayed signal.
+           Delayed signal will be selected if it increases R'(k)
+           correlation. */
+        L_temp0 = gain_num * gain_num;
+        L_temp0 = MULL(L_temp0, gain_long_den, FRAC_BITS);
+
+        L_temp1 = gain_long_num * gain_long_num;
+        L_temp1 = MULL(L_temp1, gain_den, FRAC_BITS);
+
+        tmp = ((sh_gain_long_num - sh_gain_num) << 1) - (sh_gain_long_den - sh_gain_den);
+        if (tmp > 0)
+            L_temp0 >>= tmp;
+        else
+            L_temp1 >>= -tmp;
+
+        /* Check if longer filter increases the values of R'(k). */
+        if (L_temp1 > L_temp0) {
+            /* Select long filter. */
+            selected_signal = residual_filt;
+            gain_num = gain_long_num;
+            gain_den = gain_long_den;
+            sh_gain_num = sh_gain_long_num;
+            sh_gain_den = sh_gain_long_den;
+        } else
+            /* Select short filter. */
+            selected_signal = &delayed_signal[best_delay_frac-1][delayed_signal_offset];
+
+        /* Rescale selected signal to original value. */
+        if (shift > 0)
+            for (i = 0; i < subframe_size; i++)
+                selected_signal[i] *= 1 << shift; // multiply: left-shifting a negative sample is undefined
+        else
+            for (i = 0; i < subframe_size; i++)
+                selected_signal[i] >>= -shift;
+
+        /* necessary to avoid compiler warning */
+        selected_signal_const = selected_signal;
+    } // if(best_delay_frac)
+    else
+        selected_signal_const = residual + RES_PREV_DATA_SIZE - (best_delay_int + 1 - delayed_signal_offset);
+#ifdef G729_BITEXACT
+    tmp = sh_gain_num - sh_gain_den;
+    if (tmp > 0)
+        gain_den >>= tmp;
+    else
+        gain_num >>= -tmp;
+
+    if (gain_num > gain_den)
+        lt_filt_factor_a = MIN_LT_FILT_FACTOR_A;
+    else {
+        gain_num >>= 2;
+        gain_den >>= 1;
+        lt_filt_factor_a = (gain_den << 15) / (gain_den + gain_num);
+    }
+#else
+    L64_temp0 = (((int64_t)gain_num) << sh_gain_num) >> 1;
+    L64_temp1 = ((int64_t)gain_den) << sh_gain_den;
+    lt_filt_factor_a = FFMAX((L64_temp1 << 15) / (L64_temp1 + L64_temp0), MIN_LT_FILT_FACTOR_A);
+#endif
+
+    /* Filter through selected filter. */
+    lt_filt_factor_b = 32767 - lt_filt_factor_a + 1; // the two weights add up to 1 << 15
+
+    ff_acelp_weighted_vector_sum(residual_filt, residual + RES_PREV_DATA_SIZE,
+                                 selected_signal_const,
+                                 lt_filt_factor_a, lt_filt_factor_b,
+                                 1<<14, 15, subframe_size);
+
+    // Long-term prediction gain is larger than 3dB.
+    return 1;
+}
+
+/**
+ * \brief Calculate reflection coefficient for tilt compensation filter (4.2.3).
+ * \param adsp initialized DSP context
+ * \param lp_gn (3.12) coefficients of A(z/FORMANT_PP_FACTOR_NUM) filter
+ * \param lp_gd (3.12) coefficients of A(z/FORMANT_PP_FACTOR_DEN) filter
+ * \param speech speech to update
+ * \param subframe_size size of subframe
+ *
+ * \return (3.12) reflection coefficient, or 0 when no usable estimate exists
+ *
+ * \remark The routine also calculates the gain term for the short-term
+ *         filter (gf) and multiplies the speech data by 1/gf.
+ *
+ * \note All members of lp_gn, except 10-19 must be equal to zero.
+ */
+static int16_t get_tilt_comp(AudioDSPContext *adsp, int16_t *lp_gn,
+                             const int16_t *lp_gd, int16_t* speech,
+                             int subframe_size)
+{
+    int rh1,rh0; // (3.12)
+    int temp;
+    int i;
+    int gain_term;
+
+    lp_gn[10] = 4096; //1.0 in (3.12)
+
+    /* Apply 1/A(z/FORMANT_PP_FACTOR_DEN) filter to hf. */
+    ff_celp_lp_synthesis_filter(lp_gn + 11, lp_gd + 1, lp_gn + 11, 22, 10, 0, 0, 0x800);
+    /* Now lp_gn (starting with 10) contains impulse response
+       of A(z/FORMANT_PP_FACTOR_NUM)/A(z/FORMANT_PP_FACTOR_DEN) filter. */
+
+    rh0 = adsp->scalarproduct_int16(lp_gn + 10, lp_gn + 10, 20);
+    rh1 = adsp->scalarproduct_int16(lp_gn + 10, lp_gn + 11, 20);
+
+    /* downscale to avoid overflow */
+    temp = av_log2(rh0) - 14;
+    if (temp > 0) {
+        rh0 >>= temp;
+        rh1 >>= temp;
+    }
+
+    if (FFABS(rh1) > rh0 || !rh0)
+        return 0; // speech is left unscaled here; also keeps the division below away from rh0 == 0
+
+    gain_term = 0;
+    for (i = 0; i < 20; i++)
+        gain_term += FFABS(lp_gn[i + 10]);
+    gain_term >>= 2; // (3.12) -> (5.10)
+
+    if (gain_term > 0x400) { // 1.0 in (5.10)
+        temp = 0x2000000 / gain_term; // 1.0/gain_term in (0.15)
+        for (i = 0; i < subframe_size; i++)
+            speech[i] = (speech[i] * temp + 0x4000) >> 15;
+    }
+    /* rh1 may be negative: scale by multiplying, because left-shifting a negative int is undefined. */
+    return -(rh1 * (1 << 15)) / rh0;
+}
+
+/**
+ * \brief Apply tilt compensation filter (4.2.3).
+ * \param out [out] filtered signal (subframe_size samples are written)
+ * \param res_pst [in] residual signal (partially filtered)
+ * \param refl_coeff (3.12) reflection coefficient
+ * \param subframe_size size of subframe
+ * \param ht_prev_data previous data for 4.2.3, equation 86
+ * \return last sample of res_pst, i.e. the new value for ht_prev_data
+*/
+static int16_t apply_tilt_comp(int16_t* out, int16_t* res_pst, int refl_coeff,
+                               int subframe_size, int16_t ht_prev_data)
+{
+    int tmp, tmp2;
+    int i;
+    int gt, ga;
+    int fact, sh_fact;
+
+    if (refl_coeff > 0) {
+        gt = (refl_coeff * G729_TILT_FACTOR_PLUS + 0x4000) >> 15;
+        fact = 0x4000; // 0.5 in (0.15)
+        sh_fact = 15;
+    } else {
+        gt = (refl_coeff * G729_TILT_FACTOR_MINUS + 0x4000) >> 15;
+        fact = 0x800; // 0.5 in (3.12)
+        sh_fact = 12;
+    }
+    ga = (fact << 15) / av_clip_int16(32768 - FFABS(gt));
+    gt >>= 1;
+
+    /* Keep the last input sample: it is the return value (and would be lost if out aliases res_pst). */
+    tmp = res_pst[subframe_size - 1];
+    /* Apply the filter; scale by multiplication because << on a negative sample is undefined. */
+    for (i = subframe_size - 1; i >= 1; i--) {
+        tmp2 = res_pst[i] * (1 << 15) + (gt * res_pst[i-1]) * 2;
+        tmp2 = (tmp2 + 0x4000) >> 15;
+
+        tmp2 = (tmp2 * ga * 2 + fact) >> sh_fact;
+        out[i] = tmp2;
+    }
+    tmp2 = res_pst[0] * (1 << 15) + (gt * ht_prev_data) * 2; // sample 0 uses the previous subframe's last sample
+    tmp2 = (tmp2 + 0x4000) >> 15;
+    tmp2 = (tmp2 * ga * 2 + fact) >> sh_fact;
+    out[0] = tmp2;
+
+    return tmp;
+}
+/* Signal postfiltering (4.2) for one subframe; the parameters are documented in g729postfilter.h. */
+void ff_g729_postfilter(AudioDSPContext *adsp, int16_t* ht_prev_data, int* voicing,
+                     const int16_t *lp_filter_coeffs, int pitch_delay_int,
+                     int16_t* residual, int16_t* res_filter_data,
+                     int16_t* pos_filter_data, int16_t *speech, int subframe_size)
+{
+    int16_t residual_filt_buf[SUBFRAME_SIZE+11]; // only ever accessed at offset +10 in this function
+    int16_t lp_gn[33]; // (3.12)
+    int16_t lp_gd[11]; // (3.12); lp_gd[0] is never written, only lp_gd + 1 is passed on
+    int tilt_comp_coeff;
+    int i;
+
+    /* Zero-filling is necessary for tilt-compensation filter. */
+    memset(lp_gn, 0, 33 * sizeof(int16_t));
+
+    /* Calculate A(z/FORMANT_PP_FACTOR_NUM) filter coefficients. */
+    for (i = 0; i < 10; i++)
+        lp_gn[i + 11] = (lp_filter_coeffs[i + 1] * formant_pp_factor_num_pow[i] + 0x4000) >> 15;
+
+    /* Calculate A(z/FORMANT_PP_FACTOR_DEN) filter coefficients. */
+    for (i = 0; i < 10; i++)
+        lp_gd[i + 1] = (lp_filter_coeffs[i + 1] * formant_pp_factor_den_pow[i] + 0x4000) >> 15;
+
+    /* residual signal calculation (one-half of short-term postfilter);
+       note that this writes the 10 samples in front of speech[0] */
+    memcpy(speech - 10, res_filter_data, 10 * sizeof(int16_t));
+    residual_filter(residual + RES_PREV_DATA_SIZE, lp_gn + 11, speech, subframe_size);
+    /* Save data to use it in the next subframe. */
+    memcpy(res_filter_data, speech + subframe_size - 10, 10 * sizeof(int16_t));
+
+    /* long-term filter. If long-term prediction gain is larger than 3dB (returned value is
+       nonzero) then declare current subframe as periodic. */
+    i = long_term_filter(adsp, pitch_delay_int,
+                                                residual, residual_filt_buf + 10,
+                                                subframe_size);
+    *voicing = FFMAX(*voicing, i);
+
+    /* shift residual for using in next subframe */
+    memmove(residual, residual + subframe_size, RES_PREV_DATA_SIZE * sizeof(int16_t));
+
+    /* short-term filter tilt compensation; this call also sets lp_gn[10] and may rescale residual_filt_buf + 10 */
+    tilt_comp_coeff = get_tilt_comp(adsp, lp_gn, lp_gd, residual_filt_buf + 10, subframe_size);
+
+    /* Apply second half of short-term postfilter: 1/A(z/FORMANT_PP_FACTOR_DEN) */
+    ff_celp_lp_synthesis_filter(pos_filter_data + 10, lp_gd + 1,
+                                residual_filt_buf + 10,
+                                subframe_size, 10, 0, 0, 0x800);
+    memcpy(pos_filter_data, pos_filter_data + subframe_size, 10 * sizeof(int16_t));
+
+    *ht_prev_data = apply_tilt_comp(speech, pos_filter_data + 10, tilt_comp_coeff,
+                                    subframe_size, *ht_prev_data);
+}
+
+/**
+ * \brief Adaptive gain control (4.2.4)
+ * \param gain_before gain of speech before applying postfilters
+ * \param gain_after  gain of speech after applying postfilters
+ * \param speech [in/out] signal buffer
+ * \param subframe_size length of subframe
+ * \param gain_prev (3.12) previous value of gain coefficient
+ *
+ * \return (3.12) last value of gain coefficient; 0, with speech untouched, if only gain_after is zero
+ */
+int16_t ff_g729_adaptive_gain_control(int gain_before, int gain_after, int16_t *speech,
+                                   int subframe_size, int16_t gain_prev)
+{
+    int gain; // (3.12)
+    int n;
+    int exp_before, exp_after;
+
+    if(!gain_after && gain_before)
+        return 0; // early exit: the loop below is skipped, so speech is not modified
+
+    if (gain_before) {
+        /* gain_after is non-zero here. Presumably bidir_sal() shifts left for positive and right for negative counts - TODO confirm. */
+        exp_before  = 14 - av_log2(gain_before);
+        gain_before = bidir_sal(gain_before, exp_before);
+
+        exp_after  = 14 - av_log2(gain_after);
+        gain_after = bidir_sal(gain_after, exp_after);
+
+        if (gain_before < gain_after) {
+            gain = (gain_before << 15) / gain_after;
+            gain = bidir_sal(gain, exp_after - exp_before - 1);
+        } else {
+            gain = ((gain_before - gain_after) << 14) / gain_after + 0x4000;
+            gain = bidir_sal(gain, exp_after - exp_before);
+        }
+        gain = (gain * G729_AGC_FAC1 + 0x4000) >> 15; // gain * (1-0.9875)
+    } else
+        gain = 0; // gain_before == 0: only the decaying previous gain is applied below
+
+    for (n = 0; n < subframe_size; n++) {
+        // gain_prev = gain + 0.9875 * gain_prev
+        gain_prev = (G729_AGC_FACTOR * gain_prev + 0x4000) >> 15;
+        gain_prev = av_clip_int16(gain + gain_prev);
+        speech[n] = av_clip_int16((speech[n] * gain_prev + 0x2000) >> 14); // NOTE(review): scaled by >> 14 although gain_prev is documented as (3.12) - confirm
+    }
+    return gain_prev;
+}
diff --git a/libavcodec/g729postfilter.h b/libavcodec/g729postfilter.h
new file mode 100644
index 0000000..5c2aaf2
--- /dev/null
+++ b/libavcodec/g729postfilter.h
@@ -0,0 +1,116 @@
+/*
+ * G.729, G729 Annex D postfilter
+ * Copyright (c) 2008 Vladimir Voroshilov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef AVCODEC_G729POSTFILTER_H
+#define AVCODEC_G729POSTFILTER_H
+
+#include <stdint.h>
+#include "audiodsp.h"
+
+/**
+ * tilt compensation factor (G.729, k1>0)
+ * 0.2 in Q15
+ */
+#define G729_TILT_FACTOR_PLUS       6554
+
+/**
+ * tilt compensation factor (G.729, k1<0)
+ * 0.9 in Q15
+ */
+#define G729_TILT_FACTOR_MINUS     29491
+
+/* short-term postfilter weighting factors (G.729, 4.2.2) */
+#define FORMANT_PP_FACTOR_NUM  18022             //0.55 in Q15
+#define FORMANT_PP_FACTOR_DEN  22938             //0.70 in Q15
+
+/**
+ * gain adjustment factor (G.729, 4.2.4)
+ * 0.9875 in Q15
+ */
+#define G729_AGC_FACTOR            32358
+#define G729_AGC_FAC1 (32768-G729_AGC_FACTOR) // 1 - 0.9875 in Q15
+
+/**
+ * 1.0 / (1.0 + 0.5) in Q15
+ * where 0.5 is the minimum value of
+ * weight factor, controlling amount of long-term postfiltering
+ */
+#define MIN_LT_FILT_FACTOR_A       21845
+
+/**
+ * Short interpolation filter length
+ */
+#define SHORT_INT_FILT_LEN         2
+
+/**
+ * Long interpolation filter length
+ */
+#define LONG_INT_FILT_LEN          8
+
+/**
+ * Number of analyzed fractional pitch delays in second stage of long-term
+ * postfilter
+ */
+#define ANALYZED_FRAC_DELAYS       7
+
+/**
+ * Amount of past residual signal data stored in buffer (PITCH_DELAY_MAX is not defined in this header - presumably from g729.h, TODO confirm)
+ */
+#define RES_PREV_DATA_SIZE (PITCH_DELAY_MAX + LONG_INT_FILT_LEN + 1)
+
+/**
+ * \brief Signal postfiltering (4.2)
+ * \param adsp initialized DSP context
+ * \param ht_prev_data [in/out] (Q12) pointer to variable receiving tilt
+ *                     compensation filter data from previous subframe
+ * \param voicing [in/out] (Q0) pointer to variable receiving voicing decision
+ * \param lp_filter_coeffs (Q12) LP filter coefficients
+ * \param pitch_delay_int integer part of the pitch delay
+ * \param residual [in/out] (Q0) residual signal buffer (used in long-term postfilter)
+ * \param res_filter_data [in/out] (Q0) speech data of previous subframe
+ * \param pos_filter_data [in/out] (Q0) previous speech data for short-term postfilter
+ * \param speech [in/out] (Q0) signal buffer
+ * \param subframe_size size of subframe
+ *
+ * Filtering has the following stages:
+ *   Long-term postfilter (4.2.1)
+ *   Short-term postfilter (4.2.2)
+ *   Tilt-compensation (4.2.3)
+ */
+void ff_g729_postfilter(AudioDSPContext *adsp, int16_t* ht_prev_data, int* voicing,
+                     const int16_t *lp_filter_coeffs, int pitch_delay_int,
+                     int16_t* residual, int16_t* res_filter_data,
+                     int16_t* pos_filter_data, int16_t *speech,
+                     int subframe_size);
+
+/**
+ * \brief Adaptive gain control (4.2.4)
+ * \param gain_before (Q0) gain of speech before applying postfilters
+ * \param gain_after  (Q0) gain of speech after applying postfilters
+ * \param speech [in/out] (Q0) signal buffer
+ * \param subframe_size length of subframe
+ * \param gain_prev (Q12) previous value of gain coefficient
+ *
+ * \return (Q12) last value of gain coefficient
+ */
+int16_t ff_g729_adaptive_gain_control(int gain_before, int gain_after, int16_t *speech,
+                                   int subframe_size, int16_t gain_prev);
+
+#endif // AVCODEC_G729POSTFILTER_H
diff --git a/libavcodec/get_bits.h b/libavcodec/get_bits.h
index b8debb9..48c3c44 100644
--- a/libavcodec/get_bits.h
+++ b/libavcodec/get_bits.h
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,6 +31,7 @@
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/log.h"
+#include "libavutil/avassert.h"
 #include "mathops.h"
 
 /*
@@ -54,9 +55,7 @@ typedef struct GetBitContext {
     const uint8_t *buffer, *buffer_end;
     int index;
     int size_in_bits;
-#if !UNCHECKED_BITSTREAM_READER
     int size_in_bits_plus8;
-#endif
 } GetBitContext;
 
 #define VLC_TYPE int16_t
@@ -114,6 +113,9 @@ typedef struct RL_VLC_ELEM {
  * LAST_SKIP_BITS(name, gb, num)
  *   Like SKIP_BITS, to be used if next call is UPDATE_CACHE or CLOSE_READER.
  *
+ * BITS_LEFT(name, gb)
+ *   Return the number of bits left
+ *
  * For examples see get_bits, show_bits, skip_bits, get_vlc.
  */
 
@@ -125,7 +127,7 @@ typedef struct RL_VLC_ELEM {
 
 #define OPEN_READER_NOSIZE(name, gb)            \
     unsigned int name ## _index = (gb)->index;  \
-    unsigned int av_unused name ## _cache = 0
+    unsigned int av_unused name ## _cache
 
 #if UNCHECKED_BITSTREAM_READER
 #define OPEN_READER(name, gb) OPEN_READER_NOSIZE(name, gb)
@@ -141,27 +143,34 @@ typedef struct RL_VLC_ELEM {
 
 #define CLOSE_READER(name, gb) (gb)->index = name ## _index
 
+# ifdef LONG_BITSTREAM_READER
+
+# define UPDATE_CACHE_LE(name, gb) name ## _cache = \
+      AV_RL64((gb)->buffer + (name ## _index >> 3)) >> (name ## _index & 7)
+
+# define UPDATE_CACHE_BE(name, gb) name ## _cache = \
+      AV_RB64((gb)->buffer + (name ## _index >> 3)) >> (32 - (name ## _index & 7))
+
+#else
+
+# define UPDATE_CACHE_LE(name, gb) name ## _cache = \
+      AV_RL32((gb)->buffer + (name ## _index >> 3)) >> (name ## _index & 7)
+
+# define UPDATE_CACHE_BE(name, gb) name ## _cache = \
+      AV_RB32((gb)->buffer + (name ## _index >> 3)) << (name ## _index & 7)
+
+#endif
+
+
 #ifdef BITSTREAM_READER_LE
 
-# ifdef LONG_BITSTREAM_READER
-#   define UPDATE_CACHE(name, gb) name ## _cache = \
-        AV_RL64((gb)->buffer + (name ## _index >> 3)) >> (name ## _index & 7)
-# else
-#   define UPDATE_CACHE(name, gb) name ## _cache = \
-        AV_RL32((gb)->buffer + (name ## _index >> 3)) >> (name ## _index & 7)
-# endif
+# define UPDATE_CACHE(name, gb) UPDATE_CACHE_LE(name, gb)
 
 # define SKIP_CACHE(name, gb, num) name ## _cache >>= (num)
 
 #else
 
-# ifdef LONG_BITSTREAM_READER
-#   define UPDATE_CACHE(name, gb) name ## _cache = \
-        AV_RB64((gb)->buffer + (name ## _index >> 3)) >> (32 - (name ## _index & 7))
-# else
-#   define UPDATE_CACHE(name, gb) name ## _cache = \
-        AV_RB32((gb)->buffer + (name ## _index >> 3)) << (name ## _index & 7)
-# endif
+# define UPDATE_CACHE(name, gb) UPDATE_CACHE_BE(name, gb)
 
 # define SKIP_CACHE(name, gb, num) name ## _cache <<= (num)
 
@@ -174,6 +183,8 @@ typedef struct RL_VLC_ELEM {
     name ## _index = FFMIN(name ## _size_plus8, name ## _index + (num))
 #endif
 
+#define BITS_LEFT(name, gb) ((int)((gb)->size_in_bits - name ## _index))
+
 #define SKIP_BITS(name, gb, num)                \
     do {                                        \
         SKIP_CACHE(name, gb, num);              \
@@ -182,12 +193,18 @@ typedef struct RL_VLC_ELEM {
 
 #define LAST_SKIP_BITS(name, gb, num) SKIP_COUNTER(name, gb, num)
 
+#define SHOW_UBITS_LE(name, gb, num) zero_extend(name ## _cache, num)
+#define SHOW_SBITS_LE(name, gb, num) sign_extend(name ## _cache, num)
+
+#define SHOW_UBITS_BE(name, gb, num) NEG_USR32(name ## _cache, num)
+#define SHOW_SBITS_BE(name, gb, num) NEG_SSR32(name ## _cache, num)
+
 #ifdef BITSTREAM_READER_LE
-#   define SHOW_UBITS(name, gb, num) zero_extend(name ## _cache, num)
-#   define SHOW_SBITS(name, gb, num) sign_extend(name ## _cache, num)
+#   define SHOW_UBITS(name, gb, num) SHOW_UBITS_LE(name, gb, num)
+#   define SHOW_SBITS(name, gb, num) SHOW_SBITS_LE(name, gb, num)
 #else
-#   define SHOW_UBITS(name, gb, num) NEG_USR32(name ## _cache, num)
-#   define SHOW_SBITS(name, gb, num) NEG_SSR32(name ## _cache, num)
+#   define SHOW_UBITS(name, gb, num) SHOW_UBITS_BE(name, gb, num)
+#   define SHOW_SBITS(name, gb, num) SHOW_SBITS_BE(name, gb, num)
 #endif
 
 #define GET_CACHE(name, gb) ((uint32_t) name ## _cache)
@@ -207,7 +224,7 @@ static inline void skip_bits_long(GetBitContext *s, int n)
 }
 
 /**
- * Read MPEG-1 dc-style VLC (sign bit + mantisse with no MSB).
+ * Read MPEG-1 dc-style VLC (sign bit + mantissa with no MSB).
  * if MSB not set it is negative
  * @param n length in bits
  */
@@ -216,6 +233,7 @@ static inline int get_xbits(GetBitContext *s, int n)
     register int sign;
     register int32_t cache;
     OPEN_READER(re, s);
+    av_assert2(n>0 && n<=25);
     UPDATE_CACHE(re, s);
     cache = GET_CACHE(re, s);
     sign  = ~cache >> 31;
@@ -228,6 +246,7 @@ static inline int get_sbits(GetBitContext *s, int n)
 {
     register int tmp;
     OPEN_READER(re, s);
+    av_assert2(n>0 && n<=25);
     UPDATE_CACHE(re, s);
     tmp = SHOW_SBITS(re, s, n);
     LAST_SKIP_BITS(re, s, n);
@@ -242,6 +261,7 @@ static inline unsigned int get_bits(GetBitContext *s, int n)
 {
     register int tmp;
     OPEN_READER(re, s);
+    av_assert2(n>0 && n<=25);
     UPDATE_CACHE(re, s);
     tmp = SHOW_UBITS(re, s, n);
     LAST_SKIP_BITS(re, s, n);
@@ -257,6 +277,18 @@ static av_always_inline int get_bitsz(GetBitContext *s, int n)
     return n ? get_bits(s, n) : 0;
 }
 
+static inline unsigned int get_bits_le(GetBitContext *s, int n)
+{   /* Read n bits with the little-endian cache/extract macros, whatever BITSTREAM_READER_LE is set to. */
+    register int tmp;
+    OPEN_READER(re, s);
+    av_assert2(n>0 && n<=25); /* same 1-25 bit range that get_bits() asserts */
+    UPDATE_CACHE_LE(re, s);
+    tmp = SHOW_UBITS_LE(re, s, n);
+    LAST_SKIP_BITS(re, s, n);
+    CLOSE_READER(re, s);
+    return tmp;
+}
+
 /**
  * Show 1-25 bits.
  */
@@ -264,6 +296,7 @@ static inline unsigned int show_bits(GetBitContext *s, int n)
 {
     register int tmp;
     OPEN_READER_NOSIZE(re, s);
+    av_assert2(n>0 && n<=25);
     UPDATE_CACHE(re, s);
     tmp = SHOW_UBITS(re, s, n);
     return tmp;
@@ -272,7 +305,6 @@ static inline unsigned int show_bits(GetBitContext *s, int n)
 static inline void skip_bits(GetBitContext *s, int n)
 {
     OPEN_READER(re, s);
-    UPDATE_CACHE(re, s);
     LAST_SKIP_BITS(re, s, n);
     CLOSE_READER(re, s);
 }
@@ -312,20 +344,22 @@ static inline void skip_bits1(GetBitContext *s)
  */
 static inline unsigned int get_bits_long(GetBitContext *s, int n)
 {
-    if (n <= MIN_CACHE_BITS) {
+    if (!n) {
+        return 0;
+    } else if (n <= MIN_CACHE_BITS) {
         return get_bits(s, n);
     } else {
 #ifdef BITSTREAM_READER_LE
-        int ret = get_bits(s, 16);
+        unsigned ret = get_bits(s, 16);
         return ret | (get_bits(s, n - 16) << 16);
 #else
-        int ret = get_bits(s, 16) << (n - 16);
+        unsigned ret = get_bits(s, 16) << (n - 16);
         return ret | get_bits(s, n - 16);
 #endif
     }
 }
 
-/*
+/**
  * Read 0-64 bits.
  */
 static inline uint64_t get_bits64(GetBitContext *s, int n)
@@ -368,7 +402,7 @@ static inline int check_marker(GetBitContext *s, const char *msg)
 {
     int bit = get_bits1(s);
     if (!bit)
-        av_log(NULL, AV_LOG_INFO, "Marker bit missing %s\n", msg);
+        av_log(NULL, AV_LOG_INFO, "Marker bit missing at %d of %d %s\n", get_bits_count(s) - 1, s->size_in_bits, msg);
 
     return bit;
 }
@@ -387,7 +421,7 @@ static inline int init_get_bits(GetBitContext *s, const uint8_t *buffer,
     int buffer_size;
     int ret = 0;
 
-    if (bit_size > INT_MAX - 7 || bit_size < 0 || !buffer) {
+    if (bit_size >= INT_MAX - 7 || bit_size < 0 || !buffer) {
         bit_size    = 0;
         buffer      = NULL;
         ret         = AVERROR_INVALIDDATA;
@@ -397,9 +431,7 @@ static inline int init_get_bits(GetBitContext *s, const uint8_t *buffer,
 
     s->buffer             = buffer;
     s->size_in_bits       = bit_size;
-#if !UNCHECKED_BITSTREAM_READER
     s->size_in_bits_plus8 = bit_size + 8;
-#endif
     s->buffer_end         = buffer + buffer_size;
     s->index              = 0;
 
@@ -417,8 +449,8 @@ static inline int init_get_bits(GetBitContext *s, const uint8_t *buffer,
 static inline int init_get_bits8(GetBitContext *s, const uint8_t *buffer,
                                  int byte_size)
 {
-    if (byte_size > INT_MAX / 8)
-        return AVERROR_INVALIDDATA;
+    if (byte_size > INT_MAX / 8 || byte_size < 0)
+        byte_size = -1;
     return init_get_bits(s, buffer, byte_size * 8);
 }
 
@@ -494,7 +526,7 @@ void ff_free_vlc(VLC *vlc);
         SKIP_BITS(name, gb, n);                                 \
     } while (0)
 
-#define GET_RL_VLC(level, run, name, gb, table, bits,           \
+#define GET_RL_VLC_INTERNAL(level, run, name, gb, table, bits,  \
                    max_depth, need_update)                      \
     do {                                                        \
         int n, nb_bits;                                         \
@@ -577,6 +609,20 @@ static inline int get_bits_left(GetBitContext *gb)
     return gb->size_in_bits - get_bits_count(gb);
 }
 
+static inline int skip_1stop_8data_bits(GetBitContext *gb)
+{   /* Skip groups of one flag bit plus 8 data bits until a flag bit of 0 is read. */
+    if (get_bits_left(gb) <= 0)
+        return AVERROR_INVALIDDATA; /* nothing left to read */
+
+    while (get_bits1(gb)) { /* flag bit set: skip the 8 bits that follow it */
+        skip_bits(gb, 8);
+        if (get_bits_left(gb) <= 0)
+            return AVERROR_INVALIDDATA; /* input ended before a 0 flag bit was seen */
+    }
+
+    return 0; /* stopped on a 0 flag bit */
+}
+
 //#define TRACE
 
 #ifdef TRACE
@@ -620,6 +666,25 @@ static inline int get_vlc_trace(GetBitContext *s, VLC_TYPE (*table)[2],
     return r;
 }
 
+#define GET_RL_VLC(level, run, name, gb, table, bits,           \
+                   max_depth, need_update)                      \
+    do {                                                        \
+        int show  = SHOW_UBITS(name, gb, 24);                   \
+        int len;                                                \
+        int pos = name ## _index;                               \
+        /* TRACE build: decode with GET_RL_VLC_INTERNAL ...  */ \
+        GET_RL_VLC_INTERNAL(level, run, name, gb, table, bits,max_depth, need_update); \
+        /* ... then log the first len bits peeked beforehand */ \
+        len = name ## _index - pos + 1;                         \
+        show = show >> (24 - len);                              \
+                                                                \
+        print_bin(show, len);                                   \
+                                                                \
+        av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d/%-3d rlv @%5d in %s %s:%d\n",\
+               show, len, run-1, level, pos, __FILE__, __PRETTY_FUNCTION__, __LINE__);\
+    } while (0)
+
+
 static inline int get_xbits_trace(GetBitContext *s, int n, const char *file,
                                   const char *func, int line)
 {
@@ -639,6 +704,8 @@ static inline int get_xbits_trace(GetBitContext *s, int n, const char *file,
 
 #define get_vlc(s, vlc)             get_vlc_trace(s, (vlc)->table, (vlc)->bits,   3, __FILE__, __PRETTY_FUNCTION__, __LINE__)
 #define get_vlc2(s, tab, bits, max) get_vlc_trace(s,          tab,        bits, max, __FILE__, __PRETTY_FUNCTION__, __LINE__)
+#else //TRACE
+#define GET_RL_VLC GET_RL_VLC_INTERNAL
 #endif
 
 #endif /* AVCODEC_GET_BITS_H */
diff --git a/libavcodec/gif.c b/libavcodec/gif.c
index 451e335..6af1f4a 100644
--- a/libavcodec/gif.c
+++ b/libavcodec/gif.c
@@ -1,113 +1,202 @@
 /*
- * GIF encoder.
  * Copyright (c) 2000 Fabrice Bellard
  * Copyright (c) 2002 Francois Revol
  * Copyright (c) 2006 Baptiste Coudurier
  *
  * first version by Francois Revol <revol@free.fr>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-/*
- * Features and limitations:
- * - currently no compression is performed,
- *   in fact the size of the data is 9/8 the size of the image in 8bpp
- * - uses only a global standard palette
- * - tested with IE 5.0, Opera for BeOS, NetPositive (BeOS), and Mozilla (BeOS).
- *
- * Reference documents:
- * http://www.goice.co.jp/member/mo/formats/gif.html
- * http://astronomy.swin.edu.au/pbourke/dataformats/gif/
- * http://www.dcs.ed.ac.uk/home/mxr/gfx/2d/GIF89a.txt
- *
- * this url claims to have an LZW algorithm not covered by Unisys patent:
- * http://www.msg.net/utility/whirlgif/gifencod.html
- * could help reduce the size of the files _a lot_...
- * some sites mentions an RLE type compression also.
+/**
+ * @file
+ * GIF encoder
+ * @see http://www.w3.org/Graphics/GIF/spec-gif89a.txt
  */
 
+#define BITSTREAM_WRITER_LE
+#include "libavutil/opt.h"
+#include "libavutil/imgutils.h"
 #include "avcodec.h"
 #include "bytestream.h"
 #include "internal.h"
 #include "lzw.h"
-
-/* The GIF format uses reversed order for bitstreams... */
-/* at least they don't use PDP_ENDIAN :) */
-#define BITSTREAM_WRITER_LE
+#include "gif.h"
 
 #include "put_bits.h"
 
 typedef struct GIFContext {
+    const AVClass *class;
     LZWState *lzw;
     uint8_t *buf;
+    int buf_size;
+    AVFrame *last_frame;
+    int flags;
+    uint32_t palette[AVPALETTE_COUNT];  ///< local reference palette for !pal8
+    int palette_loaded;
+    int transparent_index;
+    uint8_t *pal_exdata;
+    uint8_t *tmpl;                      ///< temporary line buffer
 } GIFContext;
 
-/* GIF header */
-static int gif_image_write_header(AVCodecContext *avctx,
-                                  uint8_t **bytestream, uint32_t *palette)
+enum {
+    GF_OFFSETTING = 1<<0,
+    GF_TRANSDIFF  = 1<<1,
+};
+/* Return the lowest palette index that no pixel of the w x h area at buf uses, or -1 if all are used. */
+static int pick_palette_entry(const uint8_t *buf, int linesize, int w, int h)
 {
-    int i;
-    unsigned int v;
-
-    bytestream_put_buffer(bytestream, "GIF", 3);
-    bytestream_put_buffer(bytestream, "89a", 3);
-    bytestream_put_le16(bytestream, avctx->width);
-    bytestream_put_le16(bytestream, avctx->height);
-
-    bytestream_put_byte(bytestream, 0xf7); /* flags: global clut, 256 entries */
-    bytestream_put_byte(bytestream, 0x1f); /* background color index */
-    bytestream_put_byte(bytestream, 0); /* aspect ratio */
-
-    /* the global palette */
-    for(i=0;i<256;i++) {
-        v = palette[i];
-        bytestream_put_be24(bytestream, v);
-    }
+    int histogram[AVPALETTE_COUNT] = {0}; /* number of pixels seen per palette index */
+    int x, y, i;
 
-    return 0;
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < w; x++)
+            histogram[buf[x]]++;
+        buf += linesize; /* advance to the next row */
+    }
+    for (i = 0; i < FF_ARRAY_ELEMS(histogram); i++)
+        if (!histogram[i])
+            return i; /* first index with a zero count */
+    return -1; /* every index is in use */
 }
 
 static int gif_image_write_image(AVCodecContext *avctx,
                                  uint8_t **bytestream, uint8_t *end,
-                                 const uint8_t *buf, int linesize)
+                                 const uint32_t *palette,
+                                 const uint8_t *buf, const int linesize,
+                                 AVPacket *pkt)
 {
     GIFContext *s = avctx->priv_data;
-    int len = 0, height;
+    int len = 0, height = avctx->height, width = avctx->width, x, y;
+    int x_start = 0, y_start = 0, trans = s->transparent_index;
+    int honor_transparency = (s->flags & GF_TRANSDIFF) && s->last_frame;
     const uint8_t *ptr;
+
+    /* Crop image */
+    if ((s->flags & GF_OFFSETTING) && s->last_frame && !palette) {
+        const uint8_t *ref = s->last_frame->data[0];
+        const int ref_linesize = s->last_frame->linesize[0];
+        int x_end = avctx->width  - 1,
+            y_end = avctx->height - 1;
+
+        /* skip common lines */
+        while (y_start < y_end) {
+            if (memcmp(ref + y_start*ref_linesize, buf + y_start*linesize, width))
+                break;
+            y_start++;
+        }
+        while (y_end > y_start) {
+            if (memcmp(ref + y_end*ref_linesize, buf + y_end*linesize, width))
+                break;
+            y_end--;
+        }
+        height = y_end + 1 - y_start;
+
+        /* skip common columns */
+        while (x_start < x_end) {
+            int same_column = 1;
+            for (y = y_start; y <= y_end; y++) {
+                if (ref[y*ref_linesize + x_start] != buf[y*linesize + x_start]) {
+                    same_column = 0;
+                    break;
+                }
+            }
+            if (!same_column)
+                break;
+            x_start++;
+        }
+        while (x_end > x_start) {
+            int same_column = 1;
+            for (y = y_start; y <= y_end; y++) {
+                if (ref[y*ref_linesize + x_end] != buf[y*linesize + x_end]) {
+                    same_column = 0;
+                    break;
+                }
+            }
+            if (!same_column)
+                break;
+            x_end--;
+        }
+        width = x_end + 1 - x_start;
+
+        av_log(avctx, AV_LOG_DEBUG,"%dx%d image at pos (%d;%d) [area:%dx%d]\n",
+               width, height, x_start, y_start, avctx->width, avctx->height);
+    }
+
     /* image block */
+    bytestream_put_byte(bytestream, GIF_IMAGE_SEPARATOR);
+    bytestream_put_le16(bytestream, x_start);
+    bytestream_put_le16(bytestream, y_start);
+    bytestream_put_le16(bytestream, width);
+    bytestream_put_le16(bytestream, height);
 
-    bytestream_put_byte(bytestream, 0x2c);
-    bytestream_put_le16(bytestream, 0);
-    bytestream_put_le16(bytestream, 0);
-    bytestream_put_le16(bytestream, avctx->width);
-    bytestream_put_le16(bytestream, avctx->height);
-    bytestream_put_byte(bytestream, 0x00); /* flags */
-    /* no local clut */
+    if (!palette) {
+        bytestream_put_byte(bytestream, 0x00); /* flags */
+    } else {
+        unsigned i;
+        bytestream_put_byte(bytestream, 1<<7 | 0x7); /* flags */
+        for (i = 0; i < AVPALETTE_COUNT; i++) {
+            const uint32_t v = palette[i];
+            bytestream_put_be24(bytestream, v);
+        }
+    }
+
+    if (honor_transparency && trans < 0) {
+        trans = pick_palette_entry(buf + y_start*linesize + x_start,
+                                   linesize, width, height);
+        if (trans < 0) { // TODO, patch welcome
+            av_log(avctx, AV_LOG_DEBUG, "No available color, can not use transparency\n");
+        } else {
+            uint8_t *pal_exdata = s->pal_exdata;
+            if (!pal_exdata)
+                pal_exdata = av_packet_new_side_data(pkt, AV_PKT_DATA_PALETTE, AVPALETTE_SIZE);
+            if (!pal_exdata)
+                return AVERROR(ENOMEM);
+            memcpy(pal_exdata, s->palette, AVPALETTE_SIZE);
+            pal_exdata[trans*4 + 3*!HAVE_BIGENDIAN] = 0x00;
+        }
+    }
+    if (trans < 0)
+        honor_transparency = 0;
 
     bytestream_put_byte(bytestream, 0x08);
 
-    ff_lzw_encode_init(s->lzw, s->buf, avctx->width*avctx->height,
+    ff_lzw_encode_init(s->lzw, s->buf, s->buf_size,
                        12, FF_LZW_GIF, put_bits);
 
-    ptr = buf;
-    for (height = avctx->height; height--;) {
-        len += ff_lzw_encode(s->lzw, ptr, avctx->width);
-        ptr += linesize;
+    ptr = buf + y_start*linesize + x_start;
+    if (honor_transparency) {
+        const int ref_linesize = s->last_frame->linesize[0];
+        const uint8_t *ref = s->last_frame->data[0] + y_start*ref_linesize + x_start;
+
+        for (y = 0; y < height; y++) {
+            memcpy(s->tmpl, ptr, width);
+            for (x = 0; x < width; x++)
+                if (ref[x] == ptr[x])
+                    s->tmpl[x] = trans;
+            len += ff_lzw_encode(s->lzw, s->tmpl, width);
+            ptr += linesize;
+            ref += ref_linesize;
+        }
+    } else {
+        for (y = 0; y < height; y++) {
+            len += ff_lzw_encode(s->lzw, ptr, width);
+            ptr += linesize;
+        }
     }
     len += ff_lzw_encode_flush(s->lzw, flush_put_bits);
 
@@ -122,7 +211,6 @@ static int gif_image_write_image(AVCodecContext *avctx,
         len -= size;
     }
     bytestream_put_byte(bytestream, 0x00); /* end of image block */
-    bytestream_put_byte(bytestream, 0x3b);
     return 0;
 }
 
@@ -130,6 +218,10 @@ static av_cold int gif_encode_init(AVCodecContext *avctx)
 {
     GIFContext *s = avctx->priv_data;
 
+    if (avctx->width > 65535 || avctx->height > 65535) {
+        av_log(avctx, AV_LOG_ERROR, "GIF does not support resolutions above 65535x65535\n");
+        return AVERROR(EINVAL);
+    }
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
     avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
@@ -137,31 +229,86 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
+    s->transparent_index = -1;
+
     s->lzw = av_mallocz(ff_lzw_encode_state_size);
-    if (!s->lzw)
+    s->buf_size = avctx->width*avctx->height*2 + 1000;
+    s->buf = av_malloc(s->buf_size);
+    s->tmpl = av_malloc(avctx->width);
+    if (!s->tmpl || !s->buf || !s->lzw)
         return AVERROR(ENOMEM);
-    s->buf = av_malloc(avctx->width*avctx->height*2);
-    if (!s->buf)
-         return AVERROR(ENOMEM);
+
+    if (avpriv_set_systematic_pal2(s->palette, avctx->pix_fmt) < 0)
+        av_assert0(avctx->pix_fmt == AV_PIX_FMT_PAL8);
+
     return 0;
 }
 
-/* better than nothing gif encoder */
+/* FIXME: duplicated with lavc */
+static int get_palette_transparency_index(const uint32_t *palette)
+{
+    int transparent_color_index = -1;
+    unsigned i, smallest_alpha = 0xff;
+
+    if (!palette)
+        return -1;
+
+    for (i = 0; i < AVPALETTE_COUNT; i++) {
+        const uint32_t v = palette[i];
+        if (v >> 24 < smallest_alpha) {
+            smallest_alpha = v >> 24;
+            transparent_color_index = i;
+        }
+    }
+    return smallest_alpha < 128 ? transparent_color_index : -1;
+}
+
 static int gif_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                             const AVFrame *pict, int *got_packet)
 {
+    GIFContext *s = avctx->priv_data;
     uint8_t *outbuf_ptr, *end;
+    const uint32_t *palette = NULL;
     int ret;
 
-    if ((ret = ff_alloc_packet(pkt, avctx->width*avctx->height*7/5 + AV_INPUT_BUFFER_MIN_SIZE)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, avctx->width*avctx->height*7/5 + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
         return ret;
-    }
     outbuf_ptr = pkt->data;
     end        = pkt->data + pkt->size;
 
-    gif_image_write_header(avctx, &outbuf_ptr, (uint32_t *)pict->data[1]);
-    gif_image_write_image(avctx, &outbuf_ptr, end, pict->data[0], pict->linesize[0]);
+    if (avctx->pix_fmt == AV_PIX_FMT_PAL8) {
+        uint8_t *pal_exdata = av_packet_new_side_data(pkt, AV_PKT_DATA_PALETTE, AVPALETTE_SIZE);
+        if (!pal_exdata)
+            return AVERROR(ENOMEM);
+        memcpy(pal_exdata, pict->data[1], AVPALETTE_SIZE);
+        palette = (uint32_t*)pict->data[1];
+
+        s->pal_exdata = pal_exdata;
+
+        /* The first palette with PAL8 will be used as generic palette by the
+         * muxer so we don't need to write it locally in the packet. We store
+         * it as a reference here in case it changes later. */
+        if (!s->palette_loaded) {
+            memcpy(s->palette, palette, AVPALETTE_SIZE);
+            s->transparent_index = get_palette_transparency_index(palette);
+            s->palette_loaded = 1;
+            palette = NULL;
+        } else if (!memcmp(s->palette, palette, AVPALETTE_SIZE)) {
+            palette = NULL;
+        }
+    }
+
+    ret = gif_image_write_image(avctx, &outbuf_ptr, end, palette,
+                                pict->data[0], pict->linesize[0], pkt);
+    if (ret < 0) /* do not emit a packet whose image block could not be written */
+        return ret;
+    /* keep a reference to this picture in s->last_frame */
+    if (!s->last_frame && !(s->last_frame = av_frame_alloc()))
+        return AVERROR(ENOMEM);
+    av_frame_unref(s->last_frame);
+    ret = av_frame_ref(s->last_frame, (AVFrame*)pict);
+    if (ret < 0)
+        return ret;
 
     pkt->size   = outbuf_ptr - pkt->data;
     pkt->flags |= AV_PKT_FLAG_KEY;
@@ -176,9 +323,28 @@ static int gif_encode_close(AVCodecContext *avctx)
 
     av_freep(&s->lzw);
     av_freep(&s->buf);
+    s->buf_size = 0;
+    av_frame_free(&s->last_frame);
+    av_freep(&s->tmpl);
     return 0;
 }
 
+#define OFFSET(x) offsetof(GIFContext, x)
+#define FLAGS AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption gif_options[] = {
+    { "gifflags", "set GIF flags", OFFSET(flags), AV_OPT_TYPE_FLAGS, {.i64 = GF_OFFSETTING|GF_TRANSDIFF}, 0, INT_MAX, FLAGS, "flags" },
+        { "offsetting", "enable picture offsetting", 0, AV_OPT_TYPE_CONST, {.i64=GF_OFFSETTING}, INT_MIN, INT_MAX, FLAGS, "flags" },
+        { "transdiff", "enable transparency detection between frames", 0, AV_OPT_TYPE_CONST, {.i64=GF_TRANSDIFF}, INT_MIN, INT_MAX, FLAGS, "flags" },
+    { NULL }
+};
+
+static const AVClass gif_class = {
+    .class_name = "GIF encoder",
+    .item_name  = av_default_item_name,
+    .option     = gif_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_gif_encoder = {
     .name           = "gif",
     .long_name      = NULL_IF_CONFIG_SMALL("GIF (Graphics Interchange Format)"),
@@ -192,4 +358,5 @@ AVCodec ff_gif_encoder = {
         AV_PIX_FMT_RGB8, AV_PIX_FMT_BGR8, AV_PIX_FMT_RGB4_BYTE, AV_PIX_FMT_BGR4_BYTE,
         AV_PIX_FMT_GRAY8, AV_PIX_FMT_PAL8, AV_PIX_FMT_NONE
     },
+    .priv_class     = &gif_class,
 };
diff --git a/libavcodec/gif.h b/libavcodec/gif.h
new file mode 100644
index 0000000..9f35778
--- /dev/null
+++ b/libavcodec/gif.h
@@ -0,0 +1,49 @@
+/*
+ * GIF format definitions
+ * Copyright (c) 2003 Fabrice Bellard
+ * Copyright (c) 2006 Baptiste Coudurier
+ * Copyright (c) 2012 Vitaliy E Sugrobov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * GIF format definitions.
+ */
+
+#ifndef AVCODEC_GIF_H
+#define AVCODEC_GIF_H
+
+#include <stdint.h>
+
+static const uint8_t gif87a_sig[6] = "GIF87a";
+static const uint8_t gif89a_sig[6] = "GIF89a";
+
+#define GCE_DISPOSAL_NONE       0
+#define GCE_DISPOSAL_INPLACE    1
+#define GCE_DISPOSAL_BACKGROUND 2
+#define GCE_DISPOSAL_RESTORE    3
+
+#define GIF_TRAILER                 0x3b
+#define GIF_EXTENSION_INTRODUCER    0x21
+#define GIF_IMAGE_SEPARATOR         0x2c
+#define GIF_GCE_EXT_LABEL           0xf9
+#define GIF_APP_EXT_LABEL           0xff
+#define NETSCAPE_EXT_STR            "NETSCAPE2.0"
+
+#endif /* AVCODEC_GIF_H */
diff --git a/libavcodec/gifdec.c b/libavcodec/gifdec.c
index f08d501..20ae903 100644
--- a/libavcodec/gifdec.c
+++ b/libavcodec/gifdec.c
@@ -2,122 +2,268 @@
  * GIF decoder
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2006 Baptiste Coudurier
+ * Copyright (c) 2012 Vitaliy E Sugrobov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/imgutils.h"
+#include "libavutil/opt.h"
 #include "avcodec.h"
 #include "bytestream.h"
 #include "internal.h"
 #include "lzw.h"
+#include "gif.h"
 
-#define GCE_DISPOSAL_NONE       0
-#define GCE_DISPOSAL_INPLACE    1
-#define GCE_DISPOSAL_BACKGROUND 2
-#define GCE_DISPOSAL_RESTORE    3
+/* This value is intentionally set to the "transparent white" color.
+ * It is much better to have a white background instead of a black one
+ * when a GIF image is converted to a format that does not support transparency.
+ */
+#define GIF_TRANSPARENT_COLOR    0x00ffffff
 
 typedef struct GifState {
+    const AVClass *class;
+    AVFrame *frame;
     int screen_width;
     int screen_height;
+    int has_global_palette;
     int bits_per_pixel;
+    uint32_t bg_color;
     int background_color_index;
     int transparent_color_index;
     int color_resolution;
-    uint32_t *image_palette;
+    /* intermediate buffer for storing color indices
+     * obtained from lzw-encoded data stream */
+    uint8_t *idx_line;
+    int idx_line_size;
 
     /* after the frame is displayed, the disposal method is used */
+    int gce_prev_disposal;
     int gce_disposal;
-    /* delay during which the frame is shown */
-    int gce_delay;
+    /* rectangle describing area that must be disposed */
+    int gce_l, gce_t, gce_w, gce_h;
+    /* depending on disposal method we store either part of the image
+     * drawn on the canvas or background color that
+     * should be used upon disposal */
+    uint32_t * stored_img;
+    int stored_img_size;
+    int stored_bg_color;
 
-    /* LZW compatible decoder */
     GetByteContext gb;
     LZWState *lzw;
 
     /* aux buffers */
-    uint8_t global_palette[256 * 3];
-    uint8_t local_palette[256 * 3];
+    uint32_t global_palette[256];
+    uint32_t local_palette[256];
 
-  AVCodecContext* avctx;
+    AVCodecContext *avctx;
+    int keyframe;
+    int keyframe_ok;
+    int trans_color;    /**< color value that is used instead of transparent color */
 } GifState;
 
-static const uint8_t gif87a_sig[6] = "GIF87a";
-static const uint8_t gif89a_sig[6] = "GIF89a";
+static void gif_read_palette(GifState *s, uint32_t *pal, int nb)
+{
+    int i;
+
+    for (i = 0; i < nb; i++, pal++)
+        *pal = (0xffu << 24) | bytestream2_get_be24u(&s->gb);
+}
+
+static void gif_fill(AVFrame *picture, uint32_t color)
+{
+    uint32_t *p = (uint32_t *)picture->data[0];
+    uint32_t *p_end = p + (picture->linesize[0] / sizeof(uint32_t)) * picture->height;
+
+    for (; p < p_end; p++)
+        *p = color;
+}
+
+static void gif_fill_rect(AVFrame *picture, uint32_t color, int l, int t, int w, int h)
+{
+    const int linesize = picture->linesize[0] / sizeof(uint32_t);
+    const uint32_t *py = (uint32_t *)picture->data[0] + t * linesize;
+    const uint32_t *pr, *pb = py + h * linesize;
+    uint32_t *px;
+
+    for (; py < pb; py += linesize) {
+        px = (uint32_t *)py + l;
+        pr = px + w;
+
+        for (; px < pr; px++)
+            *px = color;
+    }
+}
+
+static void gif_copy_img_rect(const uint32_t *src, uint32_t *dst,
+                              int linesize, int l, int t, int w, int h)
+{
+    const int y_start = t * linesize;
+    const uint32_t *src_px,
+                   *src_py = src + y_start,
+                   *dst_py = dst + y_start;
+    const uint32_t *src_pb = src_py + h * linesize;
+    uint32_t *dst_px;
+
+    for (; src_py < src_pb; src_py += linesize, dst_py += linesize) {
+        src_px = src_py + l;
+        dst_px = (uint32_t *)dst_py + l;
+
+        memcpy(dst_px, src_px, w * sizeof(uint32_t));
+    }
+}
 
 static int gif_read_image(GifState *s, AVFrame *frame)
 {
-    int left, top, width, height, bits_per_pixel, code_size, flags;
-    int is_interleaved, has_local_palette, y, pass, y1, linesize, n, i;
-    uint8_t *ptr, *spal, *palette, *ptr1;
-
-    left   = bytestream2_get_le16(&s->gb);
-    top    = bytestream2_get_le16(&s->gb);
-    width  = bytestream2_get_le16(&s->gb);
-    height = bytestream2_get_le16(&s->gb);
-    flags  = bytestream2_get_byte(&s->gb);
+    int left, top, width, height, bits_per_pixel, code_size, flags, pw;
+    int is_interleaved, has_local_palette, y, pass, y1, linesize, pal_size, lzwed_len;
+    uint32_t *ptr, *pal, *px, *pr, *ptr1;
+    int ret;
+    uint8_t *idx;
+
+    /* At least 9 bytes of Image Descriptor. */
+    if (bytestream2_get_bytes_left(&s->gb) < 9)
+        return AVERROR_INVALIDDATA;
+
+    left   = bytestream2_get_le16u(&s->gb);
+    top    = bytestream2_get_le16u(&s->gb);
+    width  = bytestream2_get_le16u(&s->gb);
+    height = bytestream2_get_le16u(&s->gb);
+    flags  = bytestream2_get_byteu(&s->gb);
     is_interleaved = flags & 0x40;
     has_local_palette = flags & 0x80;
     bits_per_pixel = (flags & 0x07) + 1;
 
-    ff_dlog(s->avctx, "gif: image x=%d y=%d w=%d h=%d\n", left, top, width, height);
+    ff_dlog(s->avctx, "image x=%d y=%d w=%d h=%d\n", left, top, width, height);
 
     if (has_local_palette) {
-        bytestream2_get_buffer(&s->gb, s->local_palette, 3 * (1 << bits_per_pixel));
-        palette = s->local_palette;
+        pal_size = 1 << bits_per_pixel;
+
+        if (bytestream2_get_bytes_left(&s->gb) < pal_size * 3)
+            return AVERROR_INVALIDDATA;
+
+        gif_read_palette(s, s->local_palette, pal_size);
+        pal = s->local_palette;
     } else {
-        palette = s->global_palette;
-        bits_per_pixel = s->bits_per_pixel;
+        if (!s->has_global_palette) {
+            av_log(s->avctx, AV_LOG_ERROR, "picture doesn't have either global or local palette.\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        pal = s->global_palette;
+    }
+
+    if (s->keyframe) {
+        if (s->transparent_color_index == -1 && s->has_global_palette) {
+            /* transparency wasn't set before the first frame, fill with background color */
+            gif_fill(frame, s->bg_color);
+        } else {
+            /* Otherwise fill with the transparent color.
+             * This is necessary since by default the picture is filled with 0x80808080. */
+            gif_fill(frame, s->trans_color);
+        }
     }
 
     /* verify that all the image is inside the screen dimensions */
-    if (left + width > s->screen_width ||
-        top + height > s->screen_height ||
-        !width || !height) {
-        av_log(s->avctx, AV_LOG_ERROR, "Invalid image dimensions.\n");
+    if (!width || width > s->screen_width || left >= s->screen_width) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid image width.\n");
         return AVERROR_INVALIDDATA;
     }
+    if (!height || height > s->screen_height || top >= s->screen_height) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid image height.\n");
+        return AVERROR_INVALIDDATA;
+    }
+    if (left + width > s->screen_width) {
+        /* width must be kept around to avoid lzw vs line desync */
+        pw = s->screen_width - left;
+        av_log(s->avctx, AV_LOG_WARNING, "Image too wide by %d, truncating.\n",
+               left + width - s->screen_width);
+    } else {
+        pw = width;
+    }
+    if (top + height > s->screen_height) {
+        /* we don't care about the extra invisible lines */
+        av_log(s->avctx, AV_LOG_WARNING, "Image too high by %d, truncating.\n",
+               top + height - s->screen_height);
+        height = s->screen_height - top;
+    }
+
+    /* process disposal method */
+    if (s->gce_prev_disposal == GCE_DISPOSAL_BACKGROUND) {
+        gif_fill_rect(frame, s->stored_bg_color, s->gce_l, s->gce_t, s->gce_w, s->gce_h);
+    } else if (s->gce_prev_disposal == GCE_DISPOSAL_RESTORE) {
+        gif_copy_img_rect(s->stored_img, (uint32_t *)frame->data[0],
+            frame->linesize[0] / sizeof(uint32_t), s->gce_l, s->gce_t, s->gce_w, s->gce_h);
+    }
+
+    s->gce_prev_disposal = s->gce_disposal;
+
+    if (s->gce_disposal != GCE_DISPOSAL_NONE) {
+        s->gce_l = left;  s->gce_t = top;
+        s->gce_w = pw;    s->gce_h = height;
 
-    /* build the palette */
-    n = (1 << bits_per_pixel);
-    spal = palette;
-    for(i = 0; i < n; i++) {
-        s->image_palette[i] = (0xffu << 24) | AV_RB24(spal);
-        spal += 3;
+        if (s->gce_disposal == GCE_DISPOSAL_BACKGROUND) {
+            if (s->transparent_color_index >= 0)
+                s->stored_bg_color = s->trans_color;
+            else
+                s->stored_bg_color = s->bg_color;
+        } else if (s->gce_disposal == GCE_DISPOSAL_RESTORE) {
+            av_fast_malloc(&s->stored_img, &s->stored_img_size, frame->linesize[0] * frame->height);
+            if (!s->stored_img)
+                return AVERROR(ENOMEM);
+
+            gif_copy_img_rect((uint32_t *)frame->data[0], s->stored_img,
+                frame->linesize[0] / sizeof(uint32_t), left, top, pw, height);
+        }
     }
-    for(; i < 256; i++)
-        s->image_palette[i] = (0xffu << 24);
-    /* handle transparency */
-    if (s->transparent_color_index >= 0)
-        s->image_palette[s->transparent_color_index] = 0;
+
+    /* Expect at least 2 bytes: 1 for lzw code size and 1 for block size. */
+    if (bytestream2_get_bytes_left(&s->gb) < 2)
+        return AVERROR_INVALIDDATA;
 
     /* now get the image data */
-    code_size = bytestream2_get_byte(&s->gb);
-    ff_lzw_decode_init(s->lzw, code_size, s->gb.buffer,
-                       bytestream2_get_bytes_left(&s->gb), FF_LZW_GIF);
+    code_size = bytestream2_get_byteu(&s->gb);
+    if ((ret = ff_lzw_decode_init(s->lzw, code_size, s->gb.buffer,
+                                  bytestream2_get_bytes_left(&s->gb), FF_LZW_GIF)) < 0) {
+        av_log(s->avctx, AV_LOG_ERROR, "LZW init failed\n");
+        return ret;
+    }
 
     /* read all the image */
-    linesize = frame->linesize[0];
-    ptr1 = frame->data[0] + top * linesize + left;
+    linesize = frame->linesize[0] / sizeof(uint32_t);
+    ptr1 = (uint32_t *)frame->data[0] + top * linesize + left;
     ptr = ptr1;
     pass = 0;
     y1 = 0;
     for (y = 0; y < height; y++) {
-        ff_lzw_decode(s->lzw, ptr, width);
+        int count = ff_lzw_decode(s->lzw, s->idx_line, width);
+        if (count != width) {
+            if (count)
+                av_log(s->avctx, AV_LOG_ERROR, "LZW decode failed\n");
+            goto decode_tail;
+        }
+
+        pr = ptr + pw;
+
+        for (px = ptr, idx = s->idx_line; px < pr; px++, idx++) {
+            if (*idx != s->transparent_color_index)
+                *px = pal[*idx];
+        }
+
         if (is_interleaved) {
             switch(pass) {
             default:
@@ -144,53 +290,77 @@ static int gif_read_image(GifState *s, AVFrame *frame)
             ptr += linesize;
         }
     }
+
+ decode_tail:
     /* read the garbage data until end marker is found */
-    ff_lzw_decode_tail(s->lzw);
+    lzwed_len = ff_lzw_decode_tail(s->lzw);
+    bytestream2_skipu(&s->gb, lzwed_len);
+
+    /* Graphic Control Extension's scope is single frame.
+     * Remove its influence. */
+    s->transparent_color_index = -1;
+    s->gce_disposal = GCE_DISPOSAL_NONE;
 
-    bytestream2_skip(&s->gb, ff_lzw_size_read(s->lzw));
     return 0;
 }
 
 static int gif_read_extension(GifState *s)
 {
-    int ext_code, ext_len, i, gce_flags, gce_transparent_index;
+    int ext_code, ext_len, gce_flags, gce_transparent_index;
+
+    /* There must be at least 2 bytes:
+     * 1 for extension label and 1 for extension length. */
+    if (bytestream2_get_bytes_left(&s->gb) < 2)
+        return AVERROR_INVALIDDATA;
 
-    /* extension */
-    ext_code = bytestream2_get_byte(&s->gb);
-    ext_len  = bytestream2_get_byte(&s->gb);
+    ext_code = bytestream2_get_byteu(&s->gb);
+    ext_len  = bytestream2_get_byteu(&s->gb);
 
-    ff_dlog(s->avctx, "gif: ext_code=0x%x len=%d\n", ext_code, ext_len);
+    ff_dlog(s->avctx, "ext_code=0x%x len=%d\n", ext_code, ext_len);
 
     switch(ext_code) {
-    case 0xf9:
+    case GIF_GCE_EXT_LABEL:
         if (ext_len != 4)
             goto discard_ext;
-        s->transparent_color_index = -1;
-        gce_flags    = bytestream2_get_byte(&s->gb);
-        s->gce_delay = bytestream2_get_le16(&s->gb);
-        gce_transparent_index = bytestream2_get_byte(&s->gb);
+
+        /* We need at least 5 bytes more: 4 is for extension body
+         * and 1 for next block size. */
+        if (bytestream2_get_bytes_left(&s->gb) < 5)
+            return AVERROR_INVALIDDATA;
+
+        gce_flags    = bytestream2_get_byteu(&s->gb);
+        bytestream2_skipu(&s->gb, 2);    // delay during which the frame is shown
+        gce_transparent_index = bytestream2_get_byteu(&s->gb);
         if (gce_flags & 0x01)
             s->transparent_color_index = gce_transparent_index;
         else
             s->transparent_color_index = -1;
         s->gce_disposal = (gce_flags >> 2) & 0x7;
 
-        ff_dlog(s->avctx, "gif: gce_flags=%x delay=%d tcolor=%d disposal=%d\n",
-               gce_flags, s->gce_delay,
+        ff_dlog(s->avctx, "gce_flags=%x tcolor=%d disposal=%d\n",
+               gce_flags,
                s->transparent_color_index, s->gce_disposal);
 
-        ext_len = bytestream2_get_byte(&s->gb);
+        if (s->gce_disposal > 3) { /* values 4-7 are not handled: log the offending value, then fall back */
+            ff_dlog(s->avctx, "invalid value in gce_disposal (%d). Using default value of 0.\n", s->gce_disposal);
+            s->gce_disposal = GCE_DISPOSAL_NONE;
+        }
+
+        ext_len = bytestream2_get_byteu(&s->gb);
         break;
     }
 
     /* NOTE: many extension blocks can come after */
  discard_ext:
-    while (ext_len != 0) {
-        for (i = 0; i < ext_len; i++)
-            bytestream2_get_byte(&s->gb);
-        ext_len = bytestream2_get_byte(&s->gb);
+    while (ext_len) {
+        /* There must be at least ext_len bytes and 1 for next block size byte. */
+        if (bytestream2_get_bytes_left(&s->gb) < ext_len + 1)
+            return AVERROR_INVALIDDATA;
+
+        bytestream2_skipu(&s->gb, ext_len);
+        ext_len = bytestream2_get_byteu(&s->gb);
 
-        ff_dlog(s->avctx, "gif: ext_len1=%d\n", ext_len);
+        ff_dlog(s->avctx, "ext_len1=%d\n", ext_len);
     }
     return 0;
 }
@@ -199,44 +369,48 @@ static int gif_read_header1(GifState *s)
 {
     uint8_t sig[6];
     int v, n;
-    int has_global_palette;
+    int background_color_index;
 
     if (bytestream2_get_bytes_left(&s->gb) < 13)
         return AVERROR_INVALIDDATA;
 
     /* read gif signature */
-    bytestream2_get_buffer(&s->gb, sig, 6);
-    if (memcmp(sig, gif87a_sig, 6) != 0 &&
-        memcmp(sig, gif89a_sig, 6) != 0)
+    bytestream2_get_bufferu(&s->gb, sig, 6);
+    if (memcmp(sig, gif87a_sig, 6) &&
+        memcmp(sig, gif89a_sig, 6))
         return AVERROR_INVALIDDATA;
 
     /* read screen header */
     s->transparent_color_index = -1;
-    s->screen_width  = bytestream2_get_le16(&s->gb);
-    s->screen_height = bytestream2_get_le16(&s->gb);
-    if(   (unsigned)s->screen_width  > 32767
-       || (unsigned)s->screen_height > 32767){
-        av_log(NULL, AV_LOG_ERROR, "picture size too large\n");
-        return AVERROR_INVALIDDATA;
-    }
+    s->screen_width  = bytestream2_get_le16u(&s->gb);
+    s->screen_height = bytestream2_get_le16u(&s->gb);
 
-    v = bytestream2_get_byte(&s->gb);
+    v = bytestream2_get_byteu(&s->gb);
     s->color_resolution = ((v & 0x70) >> 4) + 1;
-    has_global_palette = (v & 0x80);
+    s->has_global_palette = (v & 0x80);
     s->bits_per_pixel = (v & 0x07) + 1;
-    s->background_color_index = bytestream2_get_byte(&s->gb);
-    bytestream2_get_byte(&s->gb);                /* ignored */
+    background_color_index = bytestream2_get_byteu(&s->gb);
+    n = bytestream2_get_byteu(&s->gb);
+    if (n) {
+        s->avctx->sample_aspect_ratio.num = n + 15;
+        s->avctx->sample_aspect_ratio.den = 64;
+    }
 
-    ff_dlog(s->avctx, "gif: screen_w=%d screen_h=%d bpp=%d global_palette=%d\n",
+    ff_dlog(s->avctx, "screen_w=%d screen_h=%d bpp=%d global_palette=%d\n",
            s->screen_width, s->screen_height, s->bits_per_pixel,
-           has_global_palette);
+           s->has_global_palette);
 
-    if (has_global_palette) {
+    if (s->has_global_palette) {
+        s->background_color_index = background_color_index;
         n = 1 << s->bits_per_pixel;
         if (bytestream2_get_bytes_left(&s->gb) < n * 3)
             return AVERROR_INVALIDDATA;
-        bytestream2_get_buffer(&s->gb, s->global_palette, n * 3);
-    }
+
+        gif_read_palette(s, s->global_palette, n);
+        s->bg_color = s->global_palette[s->background_color_index];
+    } else
+        s->background_color_index = -1;
+
     return 0;
 }
 
@@ -246,23 +420,24 @@ static int gif_parse_next_image(GifState *s, AVFrame *frame)
         int code = bytestream2_get_byte(&s->gb);
         int ret;
 
-        ff_dlog(s->avctx, "gif: code=%02x '%c'\n", code, code);
+        av_log(s->avctx, AV_LOG_DEBUG, "code=%02x '%c'\n", code, code);
 
         switch (code) {
-        case ',':
+        case GIF_IMAGE_SEPARATOR:
             return gif_read_image(s, frame);
-        case '!':
+        case GIF_EXTENSION_INTRODUCER:
             if ((ret = gif_read_extension(s)) < 0)
                 return ret;
             break;
-        case ';':
+        case GIF_TRAILER:
             /* end of image */
+            return AVERROR_EOF;
         default:
-            /* error or erroneous EOF */
+            /* erroneous block label */
             return AVERROR_INVALIDDATA;
         }
     }
-    return AVERROR_INVALIDDATA;
+    return AVERROR_EOF;
 }
 
 static av_cold int gif_decode_init(AVCodecContext *avctx)
@@ -271,38 +446,74 @@ static av_cold int gif_decode_init(AVCodecContext *avctx)
 
     s->avctx = avctx;
 
+    avctx->pix_fmt = AV_PIX_FMT_RGB32;
+    s->frame = av_frame_alloc();
+    if (!s->frame)
+        return AVERROR(ENOMEM);
     ff_lzw_decode_open(&s->lzw);
     return 0;
 }
 
-static int gif_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
-                            AVPacket *avpkt)
+static int gif_decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPacket *avpkt)
 {
-    const uint8_t *buf = avpkt->data;
-    int buf_size = avpkt->size;
     GifState *s = avctx->priv_data;
-    AVFrame *picture = data;
     int ret;
 
-    bytestream2_init(&s->gb, buf, buf_size);
-    if ((ret = gif_read_header1(s)) < 0)
-        return ret;
+    bytestream2_init(&s->gb, avpkt->data, avpkt->size);
 
-    avctx->pix_fmt = AV_PIX_FMT_PAL8;
+    s->frame->pts     = avpkt->pts;
+    s->frame->pkt_pts = avpkt->pts;
+    s->frame->pkt_dts = avpkt->dts;
+    av_frame_set_pkt_duration(s->frame, avpkt->duration);
 
-    if ((ret = ff_set_dimensions(avctx, s->screen_width, s->screen_height)) < 0)
-        return ret;
+    if (avpkt->size >= 6) {
+        s->keyframe = memcmp(avpkt->data, gif87a_sig, 6) == 0 ||
+                      memcmp(avpkt->data, gif89a_sig, 6) == 0;
+    } else {
+        s->keyframe = 0;
+    }
 
-    if ((ret = ff_get_buffer(avctx, picture, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return ret;
+    if (s->keyframe) {
+        s->keyframe_ok = 0;
+        s->gce_prev_disposal = GCE_DISPOSAL_NONE;
+        if ((ret = gif_read_header1(s)) < 0)
+            return ret;
+
+        if ((ret = ff_set_dimensions(avctx, s->screen_width, s->screen_height)) < 0)
+            return ret;
+
+        av_frame_unref(s->frame);
+        if ((ret = ff_get_buffer(avctx, s->frame, 0)) < 0)
+            return ret;
+
+        av_fast_malloc(&s->idx_line, &s->idx_line_size, s->screen_width);
+        if (!s->idx_line)
+            return AVERROR(ENOMEM);
+
+        s->frame->pict_type = AV_PICTURE_TYPE_I;
+        s->frame->key_frame = 1;
+        s->keyframe_ok = 1;
+    } else {
+        if (!s->keyframe_ok) {
+            av_log(avctx, AV_LOG_ERROR, "cannot decode frame without keyframe\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
+            return ret;
+
+        s->frame->pict_type = AV_PICTURE_TYPE_P;
+        s->frame->key_frame = 0;
     }
-    s->image_palette = (uint32_t *)picture->data[1];
-    ret = gif_parse_next_image(s, picture);
+
+    ret = gif_parse_next_image(s, s->frame);
     if (ret < 0)
         return ret;
 
+    if ((ret = av_frame_ref(data, s->frame)) < 0)
+        return ret;
     *got_frame = 1;
+
     return bytestream2_tell(&s->gb);
 }
 
@@ -311,9 +522,29 @@ static av_cold int gif_decode_close(AVCodecContext *avctx)
     GifState *s = avctx->priv_data;
 
     ff_lzw_decode_close(&s->lzw);
+    av_frame_free(&s->frame);
+    av_freep(&s->idx_line);
+    av_freep(&s->stored_img);
+
     return 0;
 }
 
+static const AVOption options[] = {
+    { "trans_color", "color value (ARGB) that is used instead of transparent color",
+      offsetof(GifState, trans_color), AV_OPT_TYPE_INT,
+      {.i64 = GIF_TRANSPARENT_COLOR}, 0, 0xffffffff,
+      AV_OPT_FLAG_DECODING_PARAM|AV_OPT_FLAG_VIDEO_PARAM },
+    { NULL },
+};
+
+static const AVClass decoder_class = {
+    .class_name = "gif decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+    .category   = AV_CLASS_CATEGORY_DECODER,
+};
+
 AVCodec ff_gif_decoder = {
     .name           = "gif",
     .long_name      = NULL_IF_CONFIG_SMALL("GIF (Graphics Interchange Format)"),
@@ -324,4 +555,5 @@ AVCodec ff_gif_decoder = {
     .close          = gif_decode_close,
     .decode         = gif_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .priv_class     = &decoder_class,
 };
diff --git a/libavcodec/golomb-test.c b/libavcodec/golomb-test.c
index e740a20..2dfe917 100644
--- a/libavcodec/golomb-test.c
+++ b/libavcodec/golomb-test.c
@@ -1,18 +1,20 @@
 /*
- * This file is part of Libav.
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -56,7 +58,7 @@ int main(void)
         }
     }
 
-#define EXTEND(i) (i << 3 | i & 7)
+#define EXTEND(i) ((i) << 3 | (i) & 7)
     init_put_bits(&pb, temp, SIZE);
     for (i = 0; i < COUNT; i++)
         set_ue_golomb(&pb, EXTEND(i));
diff --git a/libavcodec/golomb.c b/libavcodec/golomb.c
index 550c41e..937ac22 100644
--- a/libavcodec/golomb.c
+++ b/libavcodec/golomb.c
@@ -2,20 +2,20 @@
  * exp golomb vlc stuff
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/golomb.h b/libavcodec/golomb.h
index 22a87c6..d4df0b3 100644
--- a/libavcodec/golomb.h
+++ b/libavcodec/golomb.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2004 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -48,7 +48,7 @@ extern const  int8_t ff_interleaved_se_golomb_vlc_code[256];
 extern const uint8_t ff_interleaved_dirac_golomb_vlc_code[256];
 
 /**
- * read unsigned exp golomb code.
+ * Read an unsigned Exp-Golomb code in the range 0 to 8190.
  */
 static inline int get_ue_golomb(GetBitContext *gb)
 {
@@ -66,10 +66,14 @@ static inline int get_ue_golomb(GetBitContext *gb)
         return ff_ue_golomb_vlc_code[buf];
     } else {
         int log = 2 * av_log2(buf) - 31;
-        buf >>= log;
-        buf--;
         LAST_SKIP_BITS(re, gb, 32 - log);
         CLOSE_READER(re, gb);
+        if (log < 7) {
+            av_log(NULL, AV_LOG_ERROR, "Invalid UE golomb code\n");
+            return AVERROR_INVALIDDATA;
+        }
+        buf >>= log;
+        buf--;
 
         return buf;
     }
@@ -138,7 +142,7 @@ static inline unsigned svq3_get_ue_golomb(GetBitContext *gb)
             ret = (ret << 4) | ff_interleaved_dirac_golomb_vlc_code[buf];
             UPDATE_CACHE(re, gb);
             buf = GET_CACHE(re, gb);
-        } while (BITS_AVAILABLE(re, gb));
+        } while (ret<0x8000000U && BITS_AVAILABLE(re, gb));
 
         CLOSE_READER(re, gb);
         return ret - 1;
@@ -150,7 +154,7 @@ static inline unsigned svq3_get_ue_golomb(GetBitContext *gb)
  */
 static inline int get_te0_golomb(GetBitContext *gb, int range)
 {
-    assert(range >= 1);
+    av_assert2(range >= 1);
 
     if (range == 1)
         return 0;
@@ -165,7 +169,7 @@ static inline int get_te0_golomb(GetBitContext *gb, int range)
  */
 static inline int get_te_golomb(GetBitContext *gb, int range)
 {
-    assert(range >= 1);
+    av_assert2(range >= 1);
 
     if (range == 2)
         return get_bits1(gb) ^ 1;
@@ -191,16 +195,18 @@ static inline int get_se_golomb(GetBitContext *gb)
 
         return ff_se_golomb_vlc_code[buf];
     } else {
-        int log = 2 * av_log2(buf) - 31;
+        int log = av_log2(buf), sign;
+        LAST_SKIP_BITS(re, gb, 31 - log);
+        UPDATE_CACHE(re, gb);
+        buf = GET_CACHE(re, gb);
+
         buf >>= log;
 
         LAST_SKIP_BITS(re, gb, 32 - log);
         CLOSE_READER(re, gb);
 
-        if (buf & 1)
-            buf = -(buf >> 1);
-        else
-            buf = (buf >> 1);
+        sign = -(buf & 1);
+        buf  = ((buf >> 1) ^ sign) - sign;
 
         return buf;
     }
@@ -209,13 +215,8 @@ static inline int get_se_golomb(GetBitContext *gb)
 static inline int get_se_golomb_long(GetBitContext *gb)
 {
     unsigned int buf = get_ue_golomb_long(gb);
-
-    if (buf & 1)
-        buf = (buf + 1) >> 1;
-    else
-        buf = -(buf >> 1);
-
-    return buf;
+    int sign = (buf & 1) - 1; /* 0 when buf is odd, -1 (all bits set) when buf is even */
+    return ((buf >> 1) ^ sign) + 1; /* odd buf: (buf >> 1) + 1; even buf: ~(buf >> 1) + 1, i.e. -(buf >> 1) */
 }
 
 static inline int svq3_get_se_golomb(GetBitContext *gb)
@@ -256,13 +257,8 @@ static inline int dirac_get_se_golomb(GetBitContext *gb)
     uint32_t ret = svq3_get_ue_golomb(gb);
 
     if (ret) {
-        uint32_t buf;
-        OPEN_READER(re, gb);
-        UPDATE_CACHE(re, gb);
-        buf = SHOW_SBITS(re, gb, 1);
-        LAST_SKIP_BITS(re, gb, 1);
-        ret = (ret ^ buf) - buf;
-        CLOSE_READER(re, gb);
+        int sign = -get_bits1(gb);
+        ret = (ret ^ sign) - sign;
     }
 
     return ret;
@@ -285,7 +281,7 @@ static inline int get_ur_golomb(GetBitContext *gb, int k, int limit,
 
     if (log > 31 - limit) {
         buf >>= log - k;
-        buf  += (30 - log) << k;
+        buf  += (30U - log) << k;
         LAST_SKIP_BITS(re, gb, 32 + k - log);
         CLOSE_READER(re, gb);
 
@@ -321,14 +317,16 @@ static inline int get_ur_golomb_jpegls(GetBitContext *gb, int k, int limit,
     if (log - k >= 32 - MIN_CACHE_BITS + (MIN_CACHE_BITS == 32) &&
         32 - log < limit) {
         buf >>= log - k;
-        buf  += (30 - log) << k;
+        buf  += (30U - log) << k;
         LAST_SKIP_BITS(re, gb, 32 + k - log);
         CLOSE_READER(re, gb);
 
         return buf;
     } else {
         int i;
-        for (i = 0; i < limit && SHOW_UBITS(re, gb, 1) == 0 && BITS_AVAILABLE(re, gb); i++) {
+        for (i = 0; i < limit && SHOW_UBITS(re, gb, 1) == 0; i++) {
+            if (gb->size_in_bits <= re_index)
+                return -1;
             LAST_SKIP_BITS(re, gb, 1);
             UPDATE_CACHE(re, gb);
         }
@@ -336,8 +334,16 @@ static inline int get_ur_golomb_jpegls(GetBitContext *gb, int k, int limit,
 
         if (i < limit - 1) {
             if (k) {
-                buf = SHOW_UBITS(re, gb, k);
-                LAST_SKIP_BITS(re, gb, k);
+                if (k > MIN_CACHE_BITS - 1) {
+                    buf = SHOW_UBITS(re, gb, 16) << (k-16);
+                    LAST_SKIP_BITS(re, gb, 16);
+                    UPDATE_CACHE(re, gb);
+                    buf |= SHOW_UBITS(re, gb, k-16);
+                    LAST_SKIP_BITS(re, gb, k-16);
+                } else {
+                    buf = SHOW_UBITS(re, gb, k);
+                    LAST_SKIP_BITS(re, gb, k);
+                }
             } else {
                 buf = 0;
             }
@@ -361,15 +367,8 @@ static inline int get_ur_golomb_jpegls(GetBitContext *gb, int k, int limit,
 static inline int get_sr_golomb(GetBitContext *gb, int k, int limit,
                                 int esc_len)
 {
-    int v = get_ur_golomb(gb, k, limit, esc_len);
-
-    v++;
-    if (v & 1)
-        return v >> 1;
-    else
-        return -(v >> 1);
-
-//    return (v>>1) ^ -(v&1);
+    unsigned v = get_ur_golomb(gb, k, limit, esc_len); /* kept unsigned so the shift and negation below use unsigned arithmetic */
+    return (v >> 1) ^ -(v & 1); /* even v: v >> 1; odd v: ~(v >> 1), since -(v & 1) is then all ones */
 }
 
 /**
@@ -378,7 +377,7 @@ static inline int get_sr_golomb(GetBitContext *gb, int k, int limit,
 static inline int get_sr_golomb_flac(GetBitContext *gb, int k, int limit,
                                      int esc_len)
 {
-    int v = get_ur_golomb_jpegls(gb, k, limit, esc_len);
+    unsigned v = get_ur_golomb_jpegls(gb, k, limit, esc_len); /* unsigned: the mapping below (even -> v >> 1, odd -> ~(v >> 1)) is done in unsigned arithmetic */
     return (v >> 1) ^ -(v & 1);
 }
 
@@ -396,10 +395,7 @@ static inline unsigned int get_ur_golomb_shorten(GetBitContext *gb, int k)
 static inline int get_sr_golomb_shorten(GetBitContext *gb, int k)
 {
     int uvar = get_ur_golomb_jpegls(gb, k + 1, INT_MAX, 0);
-    if (uvar & 1)
-        return ~(uvar >> 1);
-    else
-        return uvar >> 1;
+    return (uvar >> 1) ^ -(uvar & 1); /* branch-free form: odd uvar gives ~(uvar >> 1), even uvar gives uvar >> 1 */
 }
 
 #ifdef TRACE
@@ -467,14 +463,8 @@ static inline int get_te(GetBitContext *s, int r, char *file, const char *func,
  */
 static inline void set_ue_golomb(PutBitContext *pb, int i)
 {
-    assert(i >= 0);
+    av_assert2(i >= 0);
 
-#if 0
-    if (i = 0) {
-        put_bits(pb, 1, 1);
-        return;
-    }
-#endif
     if (i < 256)
         put_bits(pb, ff_ue_golomb_len[i], i + 1);
     else {
@@ -488,8 +478,8 @@ static inline void set_ue_golomb(PutBitContext *pb, int i)
  */
 static inline void set_te_golomb(PutBitContext *pb, int i, int range)
 {
-    assert(range >= 1);
-    assert(i <= range);
+    av_assert2(range >= 1);
+    av_assert2(i <= range);
 
     if (range == 2)
         put_bits(pb, 1, i ^ 1);
@@ -526,11 +516,11 @@ static inline void set_ur_golomb(PutBitContext *pb, int i, int k, int limit,
 {
     int e;
 
-    assert(i >= 0);
+    av_assert2(i >= 0);
 
     e = i >> k;
     if (e < limit)
-        put_bits(pb, e + k + 1, (1 << k) + (i & ((1 << k) - 1)));
+        put_bits(pb, e + k + 1, (1 << k) + av_mod_uintp2(i, k));
     else
         put_bits(pb, limit + esc_len, i - limit + 1);
 }
@@ -543,7 +533,7 @@ static inline void set_ur_golomb_jpegls(PutBitContext *pb, int i, int k,
 {
     int e;
 
-    assert(i >= 0);
+    av_assert2(i >= 0);
 
     e = (i >> k) + 1;
     if (e < limit) {
diff --git a/libavcodec/gsm.h b/libavcodec/gsm.h
index 238cb73..53d65c4 100644
--- a/libavcodec/gsm.h
+++ b/libavcodec/gsm.h
@@ -1,20 +1,20 @@
 /*
  * GSM common header
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/gsm_parser.c b/libavcodec/gsm_parser.c
index c0befc7..1054a30 100644
--- a/libavcodec/gsm_parser.c
+++ b/libavcodec/gsm_parser.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012  Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,7 @@
  * Splits packets into individual blocks.
  */
 
+#include "libavutil/avassert.h"
 #include "parser.h"
 #include "gsm.h"
 
@@ -55,7 +56,7 @@ static int gsm_parse(AVCodecParserContext *s1, AVCodecContext *avctx,
             s->duration   = GSM_FRAME_SIZE * 2;
             break;
         default:
-            return AVERROR(EINVAL);
+            av_assert0(0);
         }
     }
 
diff --git a/libavcodec/gsmdec.c b/libavcodec/gsmdec.c
index a333e58..cd56995 100644
--- a/libavcodec/gsmdec.c
+++ b/libavcodec/gsmdec.c
@@ -2,20 +2,20 @@
  * gsm 06.10 decoder
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -79,10 +79,8 @@ static int gsm_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = avctx->frame_size;
-    if ((res = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((res = ff_get_buffer(avctx, frame, 0)) < 0)
         return res;
-    }
     samples = (int16_t *)frame->data[0];
 
     switch (avctx->codec_id) {
@@ -112,6 +110,7 @@ static void gsm_flush(AVCodecContext *avctx)
     memset(s, 0, sizeof(*s));
 }
 
+#if CONFIG_GSM_DECODER
 AVCodec ff_gsm_decoder = {
     .name           = "gsm",
     .long_name      = NULL_IF_CONFIG_SMALL("GSM"),
@@ -123,7 +122,8 @@ AVCodec ff_gsm_decoder = {
     .flush          = gsm_flush,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
-
+#endif
+#if CONFIG_GSM_MS_DECODER
 AVCodec ff_gsm_ms_decoder = {
     .name           = "gsm_ms",
     .long_name      = NULL_IF_CONFIG_SMALL("GSM Microsoft variant"),
@@ -135,3 +135,4 @@ AVCodec ff_gsm_ms_decoder = {
     .flush          = gsm_flush,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
+#endif
diff --git a/libavcodec/gsmdec_data.c b/libavcodec/gsmdec_data.c
index c9b3183..d90c69b 100644
--- a/libavcodec/gsmdec_data.c
+++ b/libavcodec/gsmdec_data.c
@@ -2,20 +2,20 @@
  * gsm 06.10 decoder data
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/gsmdec_data.h b/libavcodec/gsmdec_data.h
index f5581d5..b57194b 100644
--- a/libavcodec/gsmdec_data.h
+++ b/libavcodec/gsmdec_data.h
@@ -2,20 +2,20 @@
  * gsm 06.10 decoder data
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/gsmdec_template.c b/libavcodec/gsmdec_template.c
index 0b54dc5..4cb777c 100644
--- a/libavcodec/gsmdec_template.c
+++ b/libavcodec/gsmdec_template.c
@@ -2,20 +2,20 @@
  * gsm 06.10 decoder
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -64,7 +64,7 @@ static inline int decode_log_area(int coded, int factor, int offset)
 {
     coded <<= 10;
     coded -= offset;
-    return gsm_mult(coded, factor) << 1;
+    return gsm_mult(coded, factor) * 2;
 }
 
 static av_noinline int get_rrp(int filtered)
@@ -121,7 +121,7 @@ static int postprocess(int16_t *data, int msr)
     int i;
     for (i = 0; i < 160; i++) {
         msr = av_clip_int16(data[i] + gsm_mult(msr, 28180));
-        data[i] = av_clip_int16(msr << 1) & ~7;
+        data[i] = av_clip_int16(msr * 2) & ~7;
     }
     return msr;
 }
diff --git a/libavcodec/h261.c b/libavcodec/h261.c
index 320d621..47bad4e 100644
--- a/libavcodec/h261.c
+++ b/libavcodec/h261.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2004 Maarten Daniels
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/h261.h b/libavcodec/h261.h
index fdfe560..399a404 100644
--- a/libavcodec/h261.h
+++ b/libavcodec/h261.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2004 Maarten Daniels
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,7 +38,6 @@ typedef struct H261Context {
     MpegEncContext s;
 
     int current_mba;
-    int previous_mba;
     int mba_diff;
     int mtype;
     int current_mv_x;
diff --git a/libavcodec/h261_parser.c b/libavcodec/h261_parser.c
index 59eed02..2299c1c 100644
--- a/libavcodec/h261_parser.c
+++ b/libavcodec/h261_parser.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2004 Maarten Daniels
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -71,11 +71,15 @@ static int h261_parse(AVCodecParserContext *s,
     ParseContext *pc = s->priv_data;
     int next;
 
-    next = h261_find_frame_end(pc, avctx, buf, buf_size);
-    if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
-        *poutbuf      = NULL;
-        *poutbuf_size = 0;
-        return buf_size;
+    if (s->flags & PARSER_FLAG_COMPLETE_FRAMES) {
+        next = buf_size;
+    } else {
+        next = h261_find_frame_end(pc, avctx, buf, buf_size);
+        if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
+            *poutbuf      = NULL;
+            *poutbuf_size = 0;
+            return buf_size;
+        }
     }
     *poutbuf      = buf;
     *poutbuf_size = buf_size;
diff --git a/libavcodec/h261data.c b/libavcodec/h261data.c
index a81ccdf..a9891ed 100644
--- a/libavcodec/h261data.c
+++ b/libavcodec/h261data.c
@@ -2,20 +2,20 @@
  * copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  * copyright (c) 2004 Maarten Daniels
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/h261dec.c b/libavcodec/h261dec.c
index 9a323ec..7f2fff8 100644
--- a/libavcodec/h261dec.c
+++ b/libavcodec/h261dec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2004 Maarten Daniels
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,7 @@
  * H.261 decoder.
  */
 
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "mpeg_er.h"
 #include "mpegutils.h"
@@ -75,14 +76,11 @@ static av_cold int h261_decode_init(AVCodecContext *avctx)
 
     // set defaults
     ff_mpv_decode_defaults(s);
-    s->avctx       = avctx;
-    s->width       = s->avctx->coded_width;
-    s->height      = s->avctx->coded_height;
-    s->codec_id    = s->avctx->codec->id;
+    ff_mpv_decode_init(s, avctx);
+
     s->out_format  = FMT_H261;
     s->low_delay   = 1;
     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
-    s->codec_id    = avctx->codec->id;
 
     ff_h261_common_init();
     h261_decode_init_vlc(h);
@@ -127,12 +125,12 @@ static int h261_decode_gob_header(H261Context *h)
     }
 
     /* GEI */
-    while (get_bits1(&s->gb) != 0)
-        skip_bits(&s->gb, 8);
+    if (skip_1stop_8data_bits(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
 
     if (s->qscale == 0) {
         av_log(s->avctx, AV_LOG_ERROR, "qscale has forbidden 0 value\n");
-        if (s->avctx->err_recognition & AV_EF_BITSTREAM)
+        if (s->avctx->err_recognition & (AV_EF_BITSTREAM | AV_EF_COMPLIANT))
             return -1;
     }
 
@@ -218,6 +216,13 @@ static int h261_decode_mb_skipped(H261Context *h, int mba1, int mba2)
         s->mb_skipped                  = 1;
         h->mtype                      &= ~MB_TYPE_H261_FIL;
 
+        if (s->current_picture.motion_val[0]) {
+            int b_stride = 2*s->mb_width + 1;
+            int b_xy     = 2 * s->mb_x + (2 * s->mb_y) * b_stride;
+            s->current_picture.motion_val[0][b_xy][0] = s->mv[0][0][0];
+            s->current_picture.motion_val[0][b_xy][1] = s->mv[0][0][1];
+        }
+
         ff_mpv_decode_mb(s, s->block);
     }
 
@@ -257,7 +262,7 @@ static int decode_mv_component(GetBitContext *gb, int v)
 static int h261_decode_block(H261Context *h, int16_t *block, int n, int coded)
 {
     MpegEncContext *const s = &h->s;
-    int code, level, i, j, run;
+    int level, i, j, run;
     RLTable *rl = &ff_h261_rl_tcoeff;
     const uint8_t *scan_table;
 
@@ -302,39 +307,47 @@ static int h261_decode_block(H261Context *h, int16_t *block, int n, int coded)
         s->block_last_index[n] = i - 1;
         return 0;
     }
+    {
+    OPEN_READER(re, &s->gb);
+    i--; // offset by -1 to allow direct indexing of scan_table
     for (;;) {
-        code = get_vlc2(&s->gb, rl->vlc.table, TCOEFF_VLC_BITS, 2);
-        if (code < 0) {
-            av_log(s->avctx, AV_LOG_ERROR, "illegal ac vlc code at %dx%d\n",
-                   s->mb_x, s->mb_y);
-            return -1;
-        }
-        if (code == rl->n) {
+        UPDATE_CACHE(re, &s->gb);
+        GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0], TCOEFF_VLC_BITS, 2, 0);
+        if (run == 66) {
+            if (level) {
+                CLOSE_READER(re, &s->gb);
+                av_log(s->avctx, AV_LOG_ERROR, "illegal ac vlc code at %dx%d\n",
+                       s->mb_x, s->mb_y);
+                return -1;
+            }
             /* escape */
             /* The remaining combinations of (run, level) are encoded with a
              * 20-bit word consisting of 6 bits escape, 6 bits run and 8 bits
              * level. */
-            run   = get_bits(&s->gb, 6);
-            level = get_sbits(&s->gb, 8);
-        } else if (code == 0) {
+            run   = SHOW_UBITS(re, &s->gb, 6) + 1;
+            SKIP_CACHE(re, &s->gb, 6);
+            level = SHOW_SBITS(re, &s->gb, 8);
+            SKIP_COUNTER(re, &s->gb, 6 + 8);
+        } else if (level == 0) {
             break;
         } else {
-            run   = rl->table_run[code];
-            level = rl->table_level[code];
-            if (get_bits1(&s->gb))
+            if (SHOW_UBITS(re, &s->gb, 1))
                 level = -level;
+            SKIP_COUNTER(re, &s->gb, 1);
         }
         i += run;
         if (i >= 64) {
+            CLOSE_READER(re, &s->gb);
             av_log(s->avctx, AV_LOG_ERROR, "run overflow at %dx%d\n",
                    s->mb_x, s->mb_y);
             return -1;
         }
         j        = scan_table[i];
         block[j] = level;
-        i++;
     }
-    s->block_last_index[n] = i - 1;
+    CLOSE_READER(re, &s->gb);
+    }
+    s->block_last_index[n] = i;
     return 0;
 }
 
@@ -379,11 +392,12 @@ static int h261_decode_mb(H261Context *h)
 
     // Read mtype
     h->mtype = get_vlc2(&s->gb, h261_mtype_vlc.table, H261_MTYPE_VLC_BITS, 2);
-    if (h->mtype < 0 || h->mtype >= FF_ARRAY_ELEMS(ff_h261_mtype_map)) {
+    if (h->mtype < 0) {
         av_log(s->avctx, AV_LOG_ERROR, "Invalid mtype index %d\n",
                h->mtype);
         return SLICE_ERROR;
     }
+    av_assert0(h->mtype < FF_ARRAY_ELEMS(ff_h261_mtype_map));
     h->mtype = ff_h261_mtype_map[h->mtype];
 
     // Read mquant
@@ -431,6 +445,13 @@ static int h261_decode_mb(H261Context *h)
     s->mv[0][0][0]                 = h->current_mv_x * 2; // gets divided by 2 in motion compensation
     s->mv[0][0][1]                 = h->current_mv_y * 2;
 
+    if (s->current_picture.motion_val[0]) {
+        int b_stride = 2*s->mb_width + 1;
+        int b_xy     = 2 * s->mb_x + (2 * s->mb_y) * b_stride;
+        s->current_picture.motion_val[0][b_xy][0] = s->mv[0][0][0];
+        s->current_picture.motion_val[0][b_xy][1] = s->mv[0][0][1];
+    }
+
 intra:
     /* decode each block */
     if (s->mb_intra || HAS_CBP(h->mtype)) {
@@ -506,8 +527,8 @@ static int h261_decode_picture_header(H261Context *h)
     skip_bits1(&s->gb); /* Reserved */
 
     /* PEI */
-    while (get_bits1(&s->gb) != 0)
-        skip_bits(&s->gb, 8);
+    if (skip_1stop_8data_bits(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
 
     /* H.261 has no I-frames, but if we pass AV_PICTURE_TYPE_I for the first
      * frame, the codec crashes if it does not contain all I-blocks
@@ -634,12 +655,12 @@ retry:
     }
     ff_mpv_frame_end(s);
 
-    assert(s->current_picture.f->pict_type == s->current_picture_ptr->f->pict_type);
-    assert(s->current_picture.f->pict_type == s->pict_type);
+    av_assert0(s->current_picture.f->pict_type == s->current_picture_ptr->f->pict_type);
+    av_assert0(s->current_picture.f->pict_type == s->pict_type);
 
     if ((ret = av_frame_ref(pict, s->current_picture_ptr->f)) < 0)
         return ret;
-    ff_print_debug_info(s, s->current_picture_ptr);
+    ff_print_debug_info(s, s->current_picture_ptr, pict);
 
     *got_frame = 1;
 
@@ -665,4 +686,5 @@ AVCodec ff_h261_decoder = {
     .close          = h261_decode_end,
     .decode         = h261_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
 };
diff --git a/libavcodec/h261enc.c b/libavcodec/h261enc.c
index 3cac882..315762c 100644
--- a/libavcodec/h261enc.c
+++ b/libavcodec/h261enc.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2004 Maarten Daniels
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "mpegutils.h"
 #include "mpegvideo.h"
@@ -33,6 +34,9 @@
 #include "h261.h"
 #include "mpegvideodata.h"
 
+static uint8_t uni_h261_rl_len [64*64*2*2]; /* 16384 entries: exactly the range of UNI_ENC_INDEX for last 0..1, run 0..63, level 0..127 */
+#define UNI_ENC_INDEX(last,run,level) ((last)*128*64 + (run)*128 + (level)) /* flat index: last selects an 8192-entry half, then 128 levels per run */
+
 int ff_h261_get_picture_format(int width, int height)
 {
     // QCIF
@@ -43,7 +47,7 @@ int ff_h261_get_picture_format(int width, int height)
         return 1;
     // ERROR
     else
-        return -1;
+        return AVERROR(EINVAL);
 }
 
 void ff_h261_encode_picture_header(MpegEncContext *s, int picture_number)
@@ -58,8 +62,8 @@ void ff_h261_encode_picture_header(MpegEncContext *s, int picture_number)
 
     put_bits(&s->pb, 20, 0x10); /* PSC */
 
-    temp_ref = s->picture_number * (int64_t)30000 * s->avctx->time_base.num /
-               (1001 * (int64_t)s->avctx->time_base.den);   // FIXME maybe this should use a timestamp
+    temp_ref = s->picture_number * 30000LL * s->avctx->time_base.num /
+               (1001LL * s->avctx->time_base.den);   // FIXME maybe this should use a timestamp
     put_sbits(&s->pb, 5, temp_ref); /* TemporalReference */
 
     put_bits(&s->pb, 1, 0); /* split screen off */
@@ -78,7 +82,7 @@ void ff_h261_encode_picture_header(MpegEncContext *s, int picture_number)
         h->gob_number = -1;
     else
         h->gob_number = 0;
-    h->current_mba = 0;
+    s->mb_skip_run = 0;
 }
 
 /**
@@ -96,18 +100,21 @@ static void h261_encode_gob_header(MpegEncContext *s, int mb_line)
     put_bits(&s->pb, 4, h->gob_number); /* GN */
     put_bits(&s->pb, 5, s->qscale);     /* GQUANT */
     put_bits(&s->pb, 1, 0);             /* no GEI */
-    h->current_mba  = 0;
-    h->previous_mba = 0;
-    h->current_mv_x = 0;
-    h->current_mv_y = 0;
+    s->mb_skip_run = 0;
+    s->last_mv[0][0][0] = 0;
+    s->last_mv[0][0][1] = 0;
 }
 
 void ff_h261_reorder_mb_index(MpegEncContext *s)
 {
     int index = s->mb_x + s->mb_y * s->mb_width;
 
-    if (index % 33 == 0)
-        h261_encode_gob_header(s, 0);
+    if (index % 11 == 0) {
+        if (index % 33 == 0)
+            h261_encode_gob_header(s, 0);
+        s->last_mv[0][0][0] = 0;
+        s->last_mv[0][0][1] = 0;
+    }
 
     /* for CIF the GOB's are fragmented in the middle of a scanline
      * that's why we need to adjust the x and y index of the macroblocks */
@@ -214,8 +221,8 @@ static void h261_encode_block(H261Context *h, int16_t *block, int n)
             put_bits(&s->pb, rl->table_vlc[code][1], rl->table_vlc[code][0]);
             if (code == rl->n) {
                 put_bits(&s->pb, 6, run);
-                assert(slevel != 0);
-                assert(level <= 127);
+                av_assert1(slevel != 0);
+                av_assert1(level <= 127);
                 put_sbits(&s->pb, 8, slevel);
             } else {
                 put_bits(&s->pb, 1, sign);
@@ -235,7 +242,6 @@ void ff_h261_encode_mb(MpegEncContext *s, int16_t block[6][64],
     cbp = 63; // avoid warning
     mvd = 0;
 
-    h->current_mba++;
     h->mtype = 0;
 
     if (!s->mb_intra) {
@@ -245,19 +251,22 @@ void ff_h261_encode_mb(MpegEncContext *s, int16_t block[6][64],
         /* mvd indicates if this block is motion compensated */
         mvd = motion_x | motion_y;
 
-        if ((cbp | mvd | s->dquant) == 0) {
+        if ((cbp | mvd) == 0) {
             /* skip macroblock */
             s->skip_count++;
-            h->current_mv_x = 0;
-            h->current_mv_y = 0;
+            s->mb_skip_run++;
+            s->last_mv[0][0][0] = 0;
+            s->last_mv[0][0][1] = 0;
+            s->qscale -= s->dquant;
             return;
         }
     }
 
     /* MB is not skipped, encode MBA */
     put_bits(&s->pb,
-             ff_h261_mba_bits[(h->current_mba - h->previous_mba) - 1],
-             ff_h261_mba_code[(h->current_mba - h->previous_mba) - 1]);
+             ff_h261_mba_bits[s->mb_skip_run],
+             ff_h261_mba_code[s->mb_skip_run]);
+    s->mb_skip_run = 0;
 
     /* calculate MTYPE */
     if (!s->mb_intra) {
@@ -267,13 +276,15 @@ void ff_h261_encode_mb(MpegEncContext *s, int16_t block[6][64],
             h->mtype += 3;
         if (s->loop_filter)
             h->mtype += 3;
-        if (cbp || s->dquant)
+        if (cbp)
             h->mtype++;
-        assert(h->mtype > 1);
+        av_assert1(h->mtype > 1);
     }
 
-    if (s->dquant)
+    if (s->dquant && cbp) {
         h->mtype++;
+    } else
+        s->qscale -= s->dquant;
 
     put_bits(&s->pb,
              ff_h261_mtype_bits[h->mtype],
@@ -287,18 +298,16 @@ void ff_h261_encode_mb(MpegEncContext *s, int16_t block[6][64],
     }
 
     if (IS_16X16(h->mtype)) {
-        mv_diff_x       = (motion_x >> 1) - h->current_mv_x;
-        mv_diff_y       = (motion_y >> 1) - h->current_mv_y;
-        h->current_mv_x = (motion_x >> 1);
-        h->current_mv_y = (motion_y >> 1);
+        mv_diff_x       = (motion_x >> 1) - s->last_mv[0][0][0];
+        mv_diff_y       = (motion_y >> 1) - s->last_mv[0][0][1];
+        s->last_mv[0][0][0] = (motion_x >> 1);
+        s->last_mv[0][0][1] = (motion_y >> 1);
         h261_encode_motion(h, mv_diff_x);
         h261_encode_motion(h, mv_diff_y);
     }
 
-    h->previous_mba = h->current_mba;
-
     if (HAS_CBP(h->mtype)) {
-        assert(cbp > 0);
+        av_assert1(cbp > 0);
         put_bits(&s->pb,
                  ff_h261_cbp_tab[cbp - 1][1],
                  ff_h261_cbp_tab[cbp - 1][0]);
@@ -307,10 +316,49 @@ void ff_h261_encode_mb(MpegEncContext *s, int16_t block[6][64],
         /* encode each block */
         h261_encode_block(h, block[i], i);
 
-    if ((h->current_mba == 11) || (h->current_mba == 22) ||
-        (h->current_mba == 33) || (!IS_16X16(h->mtype))) {
-        h->current_mv_x = 0;
-        h->current_mv_y = 0;
+    if (!IS_16X16(h->mtype)) {
+        s->last_mv[0][0][0] = 0;
+        s->last_mv[0][0][1] = 0;
+    }
+}
+
+static av_cold void init_uni_h261_rl_tab(RLTable *rl, uint32_t *bits_tab,
+                                         uint8_t *len_tab)
+{ /* Fill len_tab with a bit length per (last, run, level); bits_tab is never read. */
+    int slevel, run, last;
+
+    av_assert0(MAX_LEVEL >= 64); /* the loops below look up levels up to 64 ... */
+    av_assert0(MAX_RUN   >= 63); /* ... and runs up to 63 */
+
+    for(slevel=-64; slevel<64; slevel++){ /* signed level, stored at slevel+64 */
+        if(slevel==0) continue; /* entries for level 0 are left unwritten */
+        for(run=0; run<64; run++){
+            for(last=0; last<=1; last++){
+                const int index= UNI_ENC_INDEX(last, run, slevel+64);
+                int level= slevel < 0 ? -slevel : slevel;
+                int len, code;
+
+                len_tab[index]= 100; /* sentinel, lowered below when a candidate is shorter */
+
+                /* ESC0: table code plus 1 sign bit, as h261_encode_block() writes it */
+                code= get_rl_index(rl, 0, run, level);
+                len=  rl->table_vlc[code][1] + 1;
+                if(last)
+                    len += 2; /* NOTE(review): presumably the 2-bit EOB after a last coefficient - confirm */
+
+                if(code!=rl->n && len < len_tab[index]){ /* rl->n is the escape entry, not a direct code */
+                    len_tab [index]= len;
+                }
+                /* ESC: escape code plus the 6-bit run and 8-bit level h261_encode_block() appends */
+                len = rl->table_vlc[rl->n][1] + 6 + 8;
+                if(last)
+                    len += 2;
+
+                if(len < len_tab[index]){
+                    len_tab [index]= len;
+                }
+            }
+        }
     }
 }
 
@@ -322,6 +370,12 @@ av_cold void ff_h261_encode_init(MpegEncContext *s)
     s->max_qcoeff       = 127;
     s->y_dc_scale_table =
     s->c_dc_scale_table = ff_mpeg1_dc_scale_table;
+    s->ac_esc_length    = 6+6+8;
+
+    init_uni_h261_rl_tab(&ff_h261_rl_tcoeff, NULL, uni_h261_rl_len);
+
+    s->intra_ac_vlc_length      = s->inter_ac_vlc_length      = uni_h261_rl_len;
+    s->intra_ac_vlc_last_length = s->inter_ac_vlc_last_length = uni_h261_rl_len + 128*64;
 }
 
 static const AVClass h261_class = {
diff --git a/libavcodec/h263.c b/libavcodec/h263.c
index 85d58cc..4720c56 100644
--- a/libavcodec/h263.c
+++ b/libavcodec/h263.c
@@ -5,20 +5,20 @@
  * Copyright (c) 2001 Juan J. Sierralta P
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/h263.h b/libavcodec/h263.h
index 42c78f4..d154d36 100644
--- a/libavcodec/h263.h
+++ b/libavcodec/h263.h
@@ -1,20 +1,20 @@
 /*
  * H.263 internal header
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #ifndef AVCODEC_H263_H
@@ -95,11 +95,10 @@ int av_const h263_get_picture_format(int width, int height);
 
 void ff_clean_h263_qscales(MpegEncContext *s);
 int ff_h263_resync(MpegEncContext *s);
-const uint8_t *ff_h263_find_resync_marker(const uint8_t *p, const uint8_t *end);
-void ff_h263_encode_motion(MpegEncContext * s, int val, int f_code);
+void ff_h263_encode_motion(PutBitContext *pb, int val, int f_code);
 
 
-static inline int h263_get_motion_length(MpegEncContext * s, int val, int f_code){
+static inline int h263_get_motion_length(int val, int f_code){
     int l, bit_size, code;
 
     if (val == 0) {
@@ -119,11 +118,11 @@ static inline int h263_get_motion_length(MpegEncContext * s, int val, int f_code
 static inline void ff_h263_encode_motion_vector(MpegEncContext * s, int x, int y, int f_code){
     if (s->avctx->flags2 & AV_CODEC_FLAG2_NO_OUTPUT) {
         skip_put_bits(&s->pb,
-            h263_get_motion_length(s, x, f_code)
-           +h263_get_motion_length(s, y, f_code));
+            h263_get_motion_length(x, f_code)
+           +h263_get_motion_length(y, f_code));
     }else{
-        ff_h263_encode_motion(s, x, f_code);
-        ff_h263_encode_motion(s, y, f_code);
+        ff_h263_encode_motion(&s->pb, x, f_code);
+        ff_h263_encode_motion(&s->pb, y, f_code);
     }
 }
 
diff --git a/libavcodec/h263_parser.c b/libavcodec/h263_parser.c
index 71e047a..2e7d493 100644
--- a/libavcodec/h263_parser.c
+++ b/libavcodec/h263_parser.c
@@ -2,20 +2,20 @@
  * H.263 parser
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -70,12 +70,16 @@ static int h263_parse(AVCodecParserContext *s,
     ParseContext *pc = s->priv_data;
     int next;
 
-    next= ff_h263_find_frame_end(pc, buf, buf_size);
+    if (s->flags & PARSER_FLAG_COMPLETE_FRAMES) {
+        next = buf_size;
+    } else {
+        next= ff_h263_find_frame_end(pc, buf, buf_size);
 
-    if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
-        *poutbuf = NULL;
-        *poutbuf_size = 0;
-        return buf_size;
+        if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
+            *poutbuf = NULL;
+            *poutbuf_size = 0;
+            return buf_size;
+        }
     }
 
     *poutbuf = buf;
diff --git a/libavcodec/h263_parser.h b/libavcodec/h263_parser.h
index 5bd715f..565a222 100644
--- a/libavcodec/h263_parser.h
+++ b/libavcodec/h263_parser.h
@@ -2,20 +2,20 @@
  * H.263 parser
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/h263data.c b/libavcodec/h263data.c
index b7f4f20..f649d58 100644
--- a/libavcodec/h263data.c
+++ b/libavcodec/h263data.c
@@ -1,20 +1,20 @@
 /*
  * H.263+ tables
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -264,11 +264,11 @@ const uint8_t ff_h263_chroma_qscale_table[32] = {
     0, 1, 2, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15
 };
 
-uint16_t ff_mba_max[6] = {
+const uint16_t ff_mba_max[6] = {
     47, 98, 395, 1583, 6335, 9215
 };
 
-uint8_t ff_mba_length[7] = {
+const uint8_t ff_mba_length[7] = {
     6, 7, 9, 11, 13, 14, 14
 };
 
diff --git a/libavcodec/h263data.h b/libavcodec/h263data.h
index a431d58..3da0e37 100644
--- a/libavcodec/h263data.h
+++ b/libavcodec/h263data.h
@@ -4,20 +4,20 @@
  * copyright (c) 2001 Juan J. Sierralta P
  * copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -71,7 +71,7 @@ extern const uint8_t ff_modified_quant_tab[2][32];
 
 extern const uint8_t ff_h263_chroma_qscale_table[32];
 
-extern uint16_t ff_mba_max[6];
-extern uint8_t ff_mba_length[7];
+extern const uint16_t ff_mba_max[6];
+extern const uint8_t ff_mba_length[7];
 
 #endif /* AVCODEC_H263DATA_H */
diff --git a/libavcodec/h263dec.c b/libavcodec/h263dec.c
index e4a7227..d0da1d3 100644
--- a/libavcodec/h263dec.c
+++ b/libavcodec/h263dec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,8 @@
  * H.263 decoder.
  */
 
+#define UNCHECKED_BITSTREAM_READER 1
+
 #include "libavutil/cpu.h"
 #include "avcodec.h"
 #include "error_resilience.h"
@@ -39,6 +41,7 @@
 #include "mpegvideo.h"
 #include "msmpeg4.h"
 #include "qpeldsp.h"
+#include "vdpau_compat.h"
 #include "thread.h"
 #include "wmv2.h"
 
@@ -47,6 +50,12 @@ static enum AVPixelFormat h263_get_format(AVCodecContext *avctx)
     if (avctx->codec->id == AV_CODEC_ID_MSS2)
         return AV_PIX_FMT_YUV420P;
 
+    if (CONFIG_GRAY && (avctx->flags & AV_CODEC_FLAG_GRAY)) {
+        if (avctx->color_range == AVCOL_RANGE_UNSPECIFIED)
+            avctx->color_range = AVCOL_RANGE_MPEG;
+        return AV_PIX_FMT_GRAY8;
+    }
+
     return avctx->pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts);
 }
 
@@ -55,14 +64,12 @@ av_cold int ff_h263_decode_init(AVCodecContext *avctx)
     MpegEncContext *s = avctx->priv_data;
     int ret;
 
-    s->avctx           = avctx;
     s->out_format      = FMT_H263;
-    s->width           = avctx->coded_width;
-    s->height          = avctx->coded_height;
-    s->workaround_bugs = avctx->workaround_bugs;
 
     // set defaults
     ff_mpv_decode_defaults(s);
+    ff_mpv_decode_init(s, avctx);
+
     s->quant_precision = 5;
     s->decode_mb       = ff_h263_decode_mb;
     s->low_delay       = 1;
@@ -71,6 +78,7 @@ av_cold int ff_h263_decode_init(AVCodecContext *avctx)
     /* select sub codec */
     switch (avctx->codec->id) {
     case AV_CODEC_ID_H263:
+    case AV_CODEC_ID_H263P:
         s->unrestricted_mv = 0;
         avctx->chroma_sample_location = AVCHROMA_LOC_CENTER;
         break;
@@ -117,8 +125,13 @@ av_cold int ff_h263_decode_init(AVCodecContext *avctx)
     }
     s->codec_id    = avctx->codec->id;
 
+    if (avctx->codec_tag == AV_RL32("L263") || avctx->codec_tag == AV_RL32("S263"))
+        if (avctx->extradata_size == 56 && avctx->extradata[0] == 1)
+            s->ehc_mode = 1;
+
     /* for H.263, we allocate the images after having read the header */
     if (avctx->codec->id != AV_CODEC_ID_H263 &&
+        avctx->codec->id != AV_CODEC_ID_H263P &&
         avctx->codec->id != AV_CODEC_ID_MPEG4) {
         avctx->pix_fmt = h263_get_format(avctx);
         ff_mpv_idct_init(s);
@@ -174,7 +187,7 @@ static int decode_slice(MpegEncContext *s)
 {
     const int part_mask = s->partitioned_frame
                           ? (ER_AC_END | ER_AC_ERROR) : 0x7F;
-    const int mb_size = 16;
+    const int mb_size   = 16 >> s->avctx->lowres;
     int ret;
 
     s->last_resync_gb   = s->gb;
@@ -186,10 +199,10 @@ static int decode_slice(MpegEncContext *s)
 
     if (s->avctx->hwaccel) {
         const uint8_t *start = s->gb.buffer + get_bits_count(&s->gb) / 8;
-        const uint8_t *end   = ff_h263_find_resync_marker(start + 1,
-                                                          s->gb.buffer_end);
-        skip_bits_long(&s->gb, 8 * (end - start));
-        return s->avctx->hwaccel->decode_slice(s->avctx, start, end - start);
+        ret = s->avctx->hwaccel->decode_slice(s->avctx, start, s->gb.buffer_end - start);
+        // ensure we exit decode loop
+        s->mb_y = s->mb_height;
+        return ret;
     }
 
     if (s->partitioned_frame) {
@@ -238,6 +251,8 @@ static int decode_slice(MpegEncContext *s)
             s->mv_type = MV_TYPE_16X16;
             ff_dlog(s, "%d %06X\n",
                     get_bits_count(&s->gb), show_bits(&s->gb, 24));
+
+            ff_tlog(NULL, "Decoding MB at %dx%d\n", s->mb_x, s->mb_y);
             ret = s->decode_mb(s, s->block);
 
             if (s->pict_type != AV_PICTURE_TYPE_B)
@@ -274,6 +289,8 @@ static int decode_slice(MpegEncContext *s)
                 ff_er_add_slice(&s->er, s->resync_mb_x, s->resync_mb_y,
                                 s->mb_x, s->mb_y, ER_MB_ERROR & part_mask);
 
+                if (s->avctx->err_recognition & AV_EF_IGNORE_ERR)
+                    continue;
                 return AVERROR_INVALIDDATA;
             }
 
@@ -288,7 +305,7 @@ static int decode_slice(MpegEncContext *s)
         s->mb_x = 0;
     }
 
-    assert(s->mb_x == 0 && s->mb_y == s->mb_height);
+    av_assert1(s->mb_x == 0 && s->mb_y == s->mb_height);
 
     if (s->codec_id == AV_CODEC_ID_MPEG4         &&
         (s->workaround_bugs & FF_BUG_AUTODETECT) &&
@@ -301,7 +318,7 @@ static int decode_slice(MpegEncContext *s)
     if (s->codec_id == AV_CODEC_ID_MPEG4         &&
         (s->workaround_bugs & FF_BUG_AUTODETECT) &&
         get_bits_left(&s->gb) >= 0               &&
-        get_bits_left(&s->gb) < 48               &&
+        get_bits_left(&s->gb) < 137              &&
         !s->data_partitioning) {
         const int bits_count = get_bits_count(&s->gb);
         const int bits_left  = s->gb.size_in_bits - bits_count;
@@ -322,8 +339,27 @@ static int decode_slice(MpegEncContext *s)
         }
     }
 
+    if (s->codec_id == AV_CODEC_ID_H263          &&
+        (s->workaround_bugs & FF_BUG_AUTODETECT) &&
+        get_bits_left(&s->gb) >= 8               &&
+        get_bits_left(&s->gb) < 300              &&
+        s->pict_type == AV_PICTURE_TYPE_I        &&
+        show_bits(&s->gb, 8) == 0                &&
+        !s->data_partitioning) {
+
+        s->padding_bug_score += 32;
+    }
+
+    if (s->codec_id == AV_CODEC_ID_H263          &&
+        (s->workaround_bugs & FF_BUG_AUTODETECT) &&
+        get_bits_left(&s->gb) >= 64              &&
+        AV_RB64(s->gb.buffer_end - 8) == 0xCDCDCDCDFC7F0000) {
+
+        s->padding_bug_score += 32;
+    }
+
     if (s->workaround_bugs & FF_BUG_AUTODETECT) {
-        if (s->codec_id == AV_CODEC_ID_H263 ||
+        if (
             (s->padding_bug_score > -2 && !s->data_partitioning))
             s->workaround_bugs |= FF_BUG_NO_PADDING;
         else
@@ -342,7 +378,7 @@ static int decode_slice(MpegEncContext *s)
         /* buggy padding but the frame should still end approximately at
          * the bitstream end */
         if ((s->workaround_bugs & FF_BUG_NO_PADDING) &&
-            (s->avctx->err_recognition & AV_EF_BUFFER))
+            (s->avctx->err_recognition & (AV_EF_BUFFER|AV_EF_AGGRESSIVE)))
             max_extra += 48;
         else if ((s->workaround_bugs & FF_BUG_NO_PADDING))
             max_extra += 256 * 256 * 256 * 64;
@@ -377,6 +413,7 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     int buf_size       = avpkt->size;
     MpegEncContext *s  = avctx->priv_data;
     int ret;
+    int slice_ret = 0;
     AVFrame *pict = data;
 
     /* no supplementary picture */
@@ -400,6 +437,8 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             next = ff_mpeg4_find_frame_end(&s->parse_context, buf, buf_size);
         } else if (CONFIG_H263_DECODER && s->codec_id == AV_CODEC_ID_H263) {
             next = ff_h263_find_frame_end(&s->parse_context, buf, buf_size);
+        } else if (CONFIG_H263P_DECODER && s->codec_id == AV_CODEC_ID_H263P) {
+            next = ff_h263_find_frame_end(&s->parse_context, buf, buf_size);
         } else {
             av_log(s->avctx, AV_LOG_ERROR,
                    "this codec does not support truncated bitstreams\n");
@@ -411,13 +450,27 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             return buf_size;
     }
 
-    if (s->bitstream_buffer_size && (s->divx_packed || buf_size < 20)) // divx 5.01+/xvid frame reorder
+retry:
+    if (s->divx_packed && s->bitstream_buffer_size) {
+        int i;
+        for(i=0; i < buf_size-3; i++) {
+            if (buf[i]==0 && buf[i+1]==0 && buf[i+2]==1) {
+                if (buf[i+3]==0xB0) {
+                    av_log(s->avctx, AV_LOG_WARNING, "Discarding excessive bitstream in packed xvid\n");
+                    s->bitstream_buffer_size = 0;
+                }
+                break;
+            }
+        }
+    }
+
+    if (s->bitstream_buffer_size && (s->divx_packed || buf_size <= MAX_NVOP_SIZE)) // divx 5.01+/xvid frame reorder
         ret = init_get_bits8(&s->gb, s->bitstream_buffer,
                              s->bitstream_buffer_size);
     else
         ret = init_get_bits8(&s->gb, buf, buf_size);
-    s->bitstream_buffer_size = 0;
 
+    s->bitstream_buffer_size = 0;
     if (ret < 0)
         return ret;
 
@@ -434,11 +487,8 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         if (s->avctx->extradata_size && s->picture_number == 0) {
             GetBitContext gb;
 
-            ret = init_get_bits8(&gb, s->avctx->extradata,
-                                 s->avctx->extradata_size);
-            if (ret < 0)
-                return ret;
-            ff_mpeg4_decode_picture_header(avctx->priv_data, &gb);
+            if (init_get_bits8(&gb, s->avctx->extradata, s->avctx->extradata_size) >= 0 )
+                ff_mpeg4_decode_picture_header(avctx->priv_data, &gb);
         }
         ret = ff_mpeg4_decode_picture_header(avctx->priv_data, &s->gb);
     } else if (CONFIG_H263I_DECODER && s->codec_id == AV_CODEC_ID_H263I) {
@@ -449,6 +499,14 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         ret = ff_h263_decode_picture_header(s);
     }
 
+    if (ret < 0 || ret == FRAME_SKIPPED) {
+        if (   s->width  != avctx->coded_width
+            || s->height != avctx->coded_height) {
+                av_log(s->avctx, AV_LOG_WARNING, "Reverting picture dimensions change due to header decoding failure\n");
+                s->width = avctx->coded_width;
+                s->height= avctx->coded_height;
+        }
+    }
     if (ret == FRAME_SKIPPED)
         return get_consumed_bytes(s, buf_size);
 
@@ -473,25 +531,9 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     avctx->has_b_frames = !s->low_delay;
 
-#define SET_QPEL_FUNC(postfix1, postfix2)                           \
-    s->qdsp.put_        ## postfix1 = ff_put_        ## postfix2;   \
-    s->qdsp.put_no_rnd_ ## postfix1 = ff_put_no_rnd_ ## postfix2;   \
-    s->qdsp.avg_        ## postfix1 = ff_avg_        ## postfix2;
-
-    if (s->workaround_bugs & FF_BUG_STD_QPEL) {
-        SET_QPEL_FUNC(qpel_pixels_tab[0][5], qpel16_mc11_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[0][7], qpel16_mc31_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[0][9], qpel16_mc12_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_old_c)
-
-        SET_QPEL_FUNC(qpel_pixels_tab[1][5], qpel8_mc11_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[1][7], qpel8_mc31_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[1][9], qpel8_mc12_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_old_c)
-        SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_old_c)
+    if (CONFIG_MPEG4_DECODER && avctx->codec_id == AV_CODEC_ID_MPEG4) {
+        if (ff_mpeg4_workaround_bugs(avctx) == 1)
+            goto retry;
     }
 
     /* After H.263 & MPEG-4 header decode we have the height, width,
@@ -561,6 +603,13 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     if (!s->divx_packed && !avctx->hwaccel)
         ff_thread_finish_setup(avctx);
 
+#if FF_API_CAP_VDPAU
+    if (CONFIG_MPEG4_VDPAU_DECODER && (s->avctx->codec->capabilities & AV_CODEC_CAP_HWACCEL_VDPAU)) {
+        ff_vdpau_mpeg4_decode_picture(avctx->priv_data, s->gb.buffer, s->gb.buffer_end - s->gb.buffer);
+        goto frame_end;
+    }
+#endif
+
     if (avctx->hwaccel) {
         ret = avctx->hwaccel->start_frame(avctx, s->gb.buffer,
                                           s->gb.buffer_end - s->gb.buffer);
@@ -578,14 +627,14 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         if (ret < 0)
             return ret;
         if (ret == 1)
-            goto intrax8_decoded;
+            goto frame_end;
     }
 
     /* decode each macroblock */
     s->mb_x = 0;
     s->mb_y = 0;
 
-    ret = decode_slice(s);
+    slice_ret = decode_slice(s);
     while (s->mb_y < s->mb_height) {
         if (s->msmpeg4_version) {
             if (s->slice_height == 0 || s->mb_x != 0 ||
@@ -603,7 +652,7 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             ff_mpeg4_clean_buffers(s);
 
         if (decode_slice(s) < 0)
-            ret = AVERROR_INVALIDDATA;
+            slice_ret = AVERROR_INVALIDDATA;
     }
 
     if (s->msmpeg4_version && s->msmpeg4_version < 4 &&
@@ -612,12 +661,8 @@ int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             ff_msmpeg4_decode_ext_header(s, buf_size) < 0)
             s->er.error_status_table[s->mb_num - 1] = ER_MB_ERROR;
 
-    assert(s->bitstream_buffer_size == 0);
-
-    if (CONFIG_MPEG4_DECODER && avctx->codec_id == AV_CODEC_ID_MPEG4)
-        ff_mpeg4_frame_end(avctx, buf, buf_size);
-
-intrax8_decoded:
+    av_assert1(s->bitstream_buffer_size == 0);
+frame_end:
     ff_er_frame_end(&s->er);
 
     if (avctx->hwaccel) {
@@ -628,26 +673,46 @@ intrax8_decoded:
 
     ff_mpv_frame_end(s);
 
+    if (CONFIG_MPEG4_DECODER && avctx->codec_id == AV_CODEC_ID_MPEG4)
+        ff_mpeg4_frame_end(avctx, buf, buf_size);
+
     if (!s->divx_packed && avctx->hwaccel)
         ff_thread_finish_setup(avctx);
 
-    assert(s->current_picture.f->pict_type ==
-           s->current_picture_ptr->f->pict_type);
-    assert(s->current_picture.f->pict_type == s->pict_type);
+    av_assert1(s->current_picture.f->pict_type == s->current_picture_ptr->f->pict_type);
+    av_assert1(s->current_picture.f->pict_type == s->pict_type);
     if (s->pict_type == AV_PICTURE_TYPE_B || s->low_delay) {
         if ((ret = av_frame_ref(pict, s->current_picture_ptr->f)) < 0)
             return ret;
-        ff_print_debug_info(s, s->current_picture_ptr);
+        ff_print_debug_info(s, s->current_picture_ptr, pict);
+        ff_mpv_export_qp_table(s, pict, s->current_picture_ptr, FF_QSCALE_TYPE_MPEG1);
     } else if (s->last_picture_ptr) {
         if ((ret = av_frame_ref(pict, s->last_picture_ptr->f)) < 0)
             return ret;
-        ff_print_debug_info(s, s->last_picture_ptr);
+        ff_print_debug_info(s, s->last_picture_ptr, pict);
+        ff_mpv_export_qp_table(s, pict, s->last_picture_ptr, FF_QSCALE_TYPE_MPEG1);
     }
 
-    if (s->last_picture_ptr || s->low_delay)
+    if (s->last_picture_ptr || s->low_delay) {
+        if (   pict->format == AV_PIX_FMT_YUV420P
+            && (s->codec_tag == AV_RL32("GEOV") || s->codec_tag == AV_RL32("GEOX"))) {
+            int x, y, p;
+            av_frame_make_writable(pict);
+            for (p=0; p<3; p++) {
+                int w = AV_CEIL_RSHIFT(pict-> width, !!p);
+                int h = AV_CEIL_RSHIFT(pict->height, !!p);
+                int linesize = pict->linesize[p];
+                for (y=0; y<(h>>1); y++)
+                    for (x=0; x<w; x++)
+                        FFSWAP(int,
+                               pict->data[p][x + y*linesize],
+                               pict->data[p][x + (h-1-y)*linesize]);
+            }
+        }
         *got_frame = 1;
+    }
 
-    if (ret && (avctx->err_recognition & AV_EF_EXPLODE))
-        return ret;
+    if (slice_ret < 0 && (avctx->err_recognition & AV_EF_EXPLODE))
+        return slice_ret; /* the slice decoding error is kept in slice_ret, not in ret */
     else
         return get_consumed_bytes(s, buf_size);
@@ -660,6 +725,9 @@ const enum AVPixelFormat ff_h263_hwaccel_pixfmt_list_420[] = {
 #if CONFIG_MPEG4_VDPAU_HWACCEL
     AV_PIX_FMT_VDPAU,
 #endif
+#if CONFIG_H263_VIDEOTOOLBOX_HWACCEL || CONFIG_MPEG4_VIDEOTOOLBOX_HWACCEL
+    AV_PIX_FMT_VIDEOTOOLBOX,
+#endif
     AV_PIX_FMT_YUV420P,
     AV_PIX_FMT_NONE
 };
@@ -676,5 +744,22 @@ AVCodec ff_h263_decoder = {
     .capabilities   = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1 |
                       AV_CODEC_CAP_TRUNCATED | AV_CODEC_CAP_DELAY,
     .flush          = ff_mpeg_flush,
+    .max_lowres     = 3,
+    .pix_fmts       = ff_h263_hwaccel_pixfmt_list_420,
+};
+
+AVCodec ff_h263p_decoder = { /* entry for AV_CODEC_ID_H263P, wired to the ff_h263_* init/close/decode callbacks */
+    .name           = "h263p",
+    .long_name      = NULL_IF_CONFIG_SMALL("H.263 / H.263-1996, H.263+ / H.263-1998 / H.263 version 2"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_H263P,
+    .priv_data_size = sizeof(MpegEncContext),
+    .init           = ff_h263_decode_init,
+    .close          = ff_h263_decode_end,
+    .decode         = ff_h263_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1 |
+                      AV_CODEC_CAP_TRUNCATED | AV_CODEC_CAP_DELAY,
+    .flush          = ff_mpeg_flush,
+    .max_lowres     = 3,
     .pix_fmts       = ff_h263_hwaccel_pixfmt_list_420,
 };
diff --git a/libavcodec/h263dsp.c b/libavcodec/h263dsp.c
index 70ecdb9..b3c0bcd 100644
--- a/libavcodec/h263dsp.c
+++ b/libavcodec/h263dsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -121,4 +121,6 @@ av_cold void ff_h263dsp_init(H263DSPContext *ctx)
 
     if (ARCH_X86)
         ff_h263dsp_init_x86(ctx);
+    if (ARCH_MIPS)
+        ff_h263dsp_init_mips(ctx);
 }
diff --git a/libavcodec/h263dsp.h b/libavcodec/h263dsp.h
index 40f041c..1abea3c 100644
--- a/libavcodec/h263dsp.h
+++ b/libavcodec/h263dsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,5 +30,6 @@ typedef struct H263DSPContext {
 
 void ff_h263dsp_init(H263DSPContext *ctx);
 void ff_h263dsp_init_x86(H263DSPContext *ctx);
+void ff_h263dsp_init_mips(H263DSPContext *ctx);
 
 #endif /* AVCODEC_H263DSP_H */
diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index 224ba2f..a61379c 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,9 @@
  * @author Michael Niedermayer <michaelni@gmx.at>
  */
 
+#define UNCHECKED_BITSTREAM_READER 1
+
+#include "libavutil/avassert.h"
 #include "libavutil/display.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/opt.h"
@@ -48,11 +51,18 @@
 #include "profiles.h"
 #include "rectangle.h"
 #include "thread.h"
+#include "vdpau_compat.h"
 
-#include <assert.h>
+static int h264_decode_end(AVCodecContext *avctx);
 
 const uint16_t ff_h264_mb_sizes[4] = { 256, 384, 512, 768 };
 
+int avpriv_h264_has_num_reorder_frames(AVCodecContext *avctx)
+{
+    const H264Context *ctx = avctx->priv_data; /* NULL-checked before any dereference */
+    return (!ctx || !ctx->ps.sps) ? 0 : ctx->ps.sps->num_reorder_frames; /* 0 when there is no context or no ps.sps */
+}
+
 static void h264_er_decode_mb(void *opaque, int ref, int mv_dir, int mv_type,
                               int (*mv)[2][4][2],
                               int mb_x, int mb_y, int mb_intra, int mb_skipped)
@@ -64,19 +74,28 @@ static void h264_er_decode_mb(void *opaque, int ref, int mv_dir, int mv_type,
     sl->mb_y = mb_y;
     sl->mb_xy = mb_x + mb_y * h->mb_stride;
     memset(sl->non_zero_count_cache, 0, sizeof(sl->non_zero_count_cache));
-    assert(ref >= 0);
+    av_assert1(ref >= 0);
     /* FIXME: It is possible albeit uncommon that slice references
      * differ between slices. We take the easy approach and ignore
      * it for now. If this turns out to have any relevance in
      * practice then correct remapping should be added. */
     if (ref >= sl->ref_count[0])
         ref = 0;
+    if (!sl->ref_list[0][ref].data[0]) {
+        av_log(h->avctx, AV_LOG_DEBUG, "Reference not available for error concealing\n");
+        ref = 0;
+    }
+    if ((sl->ref_list[0][ref].reference&3) != 3) {
+        av_log(h->avctx, AV_LOG_DEBUG, "Reference invalid\n");
+        return;
+    }
     fill_rectangle(&h->cur_pic.ref_index[0][4 * sl->mb_xy],
                    2, 2, 2, ref, 1);
     fill_rectangle(&sl->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
     fill_rectangle(sl->mv_cache[0][scan8[0]], 4, 4, 8,
                    pack16to32((*mv)[0][0][0], (*mv)[0][0][1]), 4);
-    assert(!FRAME_MBAFF(h));
+    sl->mb_mbaff =
+    sl->mb_field_decoding_flag = 0;
     ff_h264_hl_decode_mb(h, &h->slice_ctx[0]);
 }
 
@@ -161,11 +180,11 @@ void ff_h264_free_tables(H264Context *h)
 int ff_h264_alloc_tables(H264Context *h)
 {
     const int big_mb_num = h->mb_stride * (h->mb_height + 1);
-    const int row_mb_num = h->mb_stride * 2 * h->nb_slice_ctx;
+    const int row_mb_num = 2*h->mb_stride*FFMAX(h->nb_slice_ctx, 1);
     int x, y;
 
-    FF_ALLOCZ_OR_GOTO(h->avctx, h->intra4x4_pred_mode,
-                      row_mb_num * 8 * sizeof(uint8_t), fail)
+    FF_ALLOCZ_ARRAY_OR_GOTO(h->avctx, h->intra4x4_pred_mode,
+                      row_mb_num, 8 * sizeof(uint8_t), fail)
     h->slice_ctx[0].intra4x4_pred_mode = h->intra4x4_pred_mode;
 
     FF_ALLOCZ_OR_GOTO(h->avctx, h->non_zero_count,
@@ -176,10 +195,10 @@ int ff_h264_alloc_tables(H264Context *h)
                       big_mb_num * sizeof(uint16_t), fail)
     FF_ALLOCZ_OR_GOTO(h->avctx, h->chroma_pred_mode_table,
                       big_mb_num * sizeof(uint8_t), fail)
-    FF_ALLOCZ_OR_GOTO(h->avctx, h->mvd_table[0],
-                      16 * row_mb_num * sizeof(uint8_t), fail);
-    FF_ALLOCZ_OR_GOTO(h->avctx, h->mvd_table[1],
-                      16 * row_mb_num * sizeof(uint8_t), fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(h->avctx, h->mvd_table[0],
+                      row_mb_num, 16 * sizeof(uint8_t), fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(h->avctx, h->mvd_table[1],
+                      row_mb_num, 16 * sizeof(uint8_t), fail);
     h->slice_ctx[0].mvd_table[0] = h->mvd_table[0];
     h->slice_ctx[0].mvd_table[1] = h->mvd_table[1];
 
@@ -232,7 +251,11 @@ int ff_h264_slice_context_init(H264Context *h, H264SliceContext *sl)
     sl->ref_cache[1][scan8[7]  + 1] =
     sl->ref_cache[1][scan8[13] + 1] = PART_NOT_AVAILABLE;
 
+    if (sl != h->slice_ctx) {
+        memset(er, 0, sizeof(*er));
+    } else
     if (CONFIG_ERROR_RESILIENCE) {
+
         /* init ER */
         er->avctx          = h->avctx;
         er->decode_mb      = h264_er_decode_mb;
@@ -282,6 +305,11 @@ static int h264_init_context(AVCodecContext *avctx, H264Context *h)
     int i;
 
     h->avctx                 = avctx;
+    h->backup_width          = -1;
+    h->backup_height         = -1;
+    h->backup_pix_fmt        = AV_PIX_FMT_NONE;
+    h->current_sps_id        = -1;
+    h->cur_chroma_format_idc = -1;
 
     h->picture_structure     = PICT_FRAME;
     h->workaround_bugs       = avctx->workaround_bugs;
@@ -289,6 +317,9 @@ static int h264_init_context(AVCodecContext *avctx, H264Context *h)
     h->poc.prev_poc_msb      = 1 << 16;
     h->recovery_frame        = -1;
     h->frame_recovered       = 0;
+    h->poc.prev_frame_num    = -1;
+    h->sei.frame_packing.frame_packing_arrangement_cancel_flag = -1;
+    h->sei.unregistered.x264_build = -1;
 
     h->next_outputed_poc = INT_MIN;
     for (i = 0; i < MAX_DELAYED_PIC_COUNT; i++)
@@ -315,6 +346,10 @@ static int h264_init_context(AVCodecContext *avctx, H264Context *h)
     if (!h->cur_pic.f)
         return AVERROR(ENOMEM);
 
+    h->last_pic_for_ec.f = av_frame_alloc();
+    if (!h->last_pic_for_ec.f)
+        return AVERROR(ENOMEM);
+
     for (i = 0; i < h->nb_slice_ctx; i++)
         h->slice_ctx[i].h264 = h;
 
@@ -326,28 +361,29 @@ static av_cold int h264_decode_end(AVCodecContext *avctx)
     H264Context *h = avctx->priv_data;
     int i;
 
+    ff_h264_remove_all_refs(h);
     ff_h264_free_tables(h);
 
     for (i = 0; i < H264_MAX_PICTURE_COUNT; i++) {
         ff_h264_unref_picture(h, &h->DPB[i]);
         av_frame_free(&h->DPB[i].f);
     }
+    memset(h->delayed_pic, 0, sizeof(h->delayed_pic));
 
     h->cur_pic_ptr = NULL;
 
     av_freep(&h->slice_ctx);
     h->nb_slice_ctx = 0;
 
-    for (i = 0; i < MAX_SPS_COUNT; i++)
-        av_buffer_unref(&h->ps.sps_list[i]);
-
-    for (i = 0; i < MAX_PPS_COUNT; i++)
-        av_buffer_unref(&h->ps.pps_list[i]);
+    ff_h264_sei_uninit(&h->sei);
+    ff_h264_ps_uninit(&h->ps);
 
     ff_h2645_packet_uninit(&h->pkt);
 
     ff_h264_unref_picture(h, &h->cur_pic);
     av_frame_free(&h->cur_pic.f);
+    ff_h264_unref_picture(h, &h->last_pic_for_ec);
+    av_frame_free(&h->last_pic_for_ec.f);
 
     return 0;
 }
@@ -370,19 +406,23 @@ av_cold int ff_h264_decode_init(AVCodecContext *avctx)
     }
 
     if (avctx->codec_id == AV_CODEC_ID_H264) {
-        if (avctx->ticks_per_frame == 1)
-            h->avctx->framerate.num *= 2;
+        if (avctx->ticks_per_frame == 1) {
+            if(h->avctx->time_base.den < INT_MAX/2) {
+                h->avctx->time_base.den *= 2;
+            } else
+                h->avctx->time_base.num /= 2;
+        }
         avctx->ticks_per_frame = 2;
     }
 
     if (avctx->extradata_size > 0 && avctx->extradata) {
-       ret = ff_h264_decode_extradata(avctx->extradata, avctx->extradata_size,
-                                      &h->ps, &h->is_avc, &h->nal_length_size,
-                                      avctx->err_recognition, avctx);
-       if (ret < 0) {
-           h264_decode_end(avctx);
-           return ret;
-       }
+        ret = ff_h264_decode_extradata(avctx->extradata, avctx->extradata_size,
+                                       &h->ps, &h->is_avc, &h->nal_length_size,
+                                       avctx->err_recognition, avctx);
+        if (ret < 0) {
+            h264_decode_end(avctx);
+            return ret;
+        }
     }
 
     if (h->ps.sps && h->ps.sps->bitstream_restriction_flag &&
@@ -392,15 +432,21 @@ av_cold int ff_h264_decode_init(AVCodecContext *avctx)
 
     avctx->internal->allocate_progress = 1;
 
-    if (h->enable_er) {
+    ff_h264_flush_change(h);
+
+    if (h->enable_er < 0 && (avctx->active_thread_type & FF_THREAD_SLICE))
+        h->enable_er = 0;
+
+    if (h->enable_er && (avctx->active_thread_type & FF_THREAD_SLICE)) {
         av_log(avctx, AV_LOG_WARNING,
-               "Error resilience is enabled. It is unsafe and unsupported and may crash. "
+               "Error resilience with slice threads is enabled. It is unsafe and unsupported and may crash. "
                "Use it at your own risk\n");
     }
 
     return 0;
 }
 
+#if HAVE_THREADS
 static int decode_init_thread_copy(AVCodecContext *avctx)
 {
     H264Context *h = avctx->priv_data;
@@ -419,6 +465,7 @@ static int decode_init_thread_copy(AVCodecContext *avctx)
 
     return 0;
 }
+#endif
 
 /**
  * Run setup operations that must be run after slice header decoding.
@@ -434,7 +481,6 @@ static void decode_postinit(H264Context *h, int setup_finished)
     H264Picture *out = h->cur_pic_ptr;
     H264Picture *cur = h->cur_pic_ptr;
     int i, pics, out_of_order, out_idx;
-    int invalid = 0, cnt = 0;
 
     if (h->next_output_pic)
         return;
@@ -446,7 +492,10 @@ static void decode_postinit(H264Context *h, int setup_finished)
          * yet, so we assume the worst for now. */
         // if (setup_finished)
         //    ff_thread_finish_setup(h->avctx);
-        return;
+        if (cur->field_poc[0] == INT_MAX && cur->field_poc[1] == INT_MAX)
+            return;
+        if (h->avctx->hwaccel || h->missing_fields <=1)
+            return;
     }
 
     cur->f->interlaced_frame = 0;
@@ -501,7 +550,7 @@ static void decode_postinit(H264Context *h, int setup_finished)
         /* Derive top_field_first from field pocs. */
         cur->f->top_field_first = cur->field_poc[0] < cur->field_poc[1];
     } else {
-        if (cur->f->interlaced_frame || sps->pic_struct_present_flag) {
+        if (sps->pic_struct_present_flag) {
             /* Use picture timing SEI information. Even if it is a
              * information of a past frame, better than nothing. */
             if (h->sei.picture_timing.pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM ||
@@ -509,6 +558,10 @@ static void decode_postinit(H264Context *h, int setup_finished)
                 cur->f->top_field_first = 1;
             else
                 cur->f->top_field_first = 0;
+        } else if (cur->f->interlaced_frame) {
+            /* Default to top field first when pic_struct_present_flag
+             * is not set but interlaced frame detected */
+            cur->f->top_field_first = 1;
         } else {
             /* Most likely progressive */
             cur->f->top_field_first = 0;
@@ -516,16 +569,13 @@ static void decode_postinit(H264Context *h, int setup_finished)
     }
 
     if (h->sei.frame_packing.present &&
-        h->sei.frame_packing.arrangement_type >= 0 &&
-        h->sei.frame_packing.arrangement_type <= 6 &&
+        h->sei.frame_packing.frame_packing_arrangement_type <= 6 &&
         h->sei.frame_packing.content_interpretation_type > 0 &&
         h->sei.frame_packing.content_interpretation_type < 3) {
         H264SEIFramePacking *fp = &h->sei.frame_packing;
         AVStereo3D *stereo = av_stereo3d_create_side_data(cur->f);
-        if (!stereo)
-            return;
-
-        switch (fp->arrangement_type) {
+        if (stereo) {
+        switch (fp->frame_packing_arrangement_type) {
         case 0:
             stereo->type = AV_STEREO3D_CHECKERBOARD;
             break;
@@ -536,7 +586,7 @@ static void decode_postinit(H264Context *h, int setup_finished)
             stereo->type = AV_STEREO3D_LINES;
             break;
         case 3:
-            if (fp->quincunx_subsampling)
+            if (fp->quincunx_sampling_flag)
                 stereo->type = AV_STEREO3D_SIDEBYSIDE_QUINCUNX;
             else
                 stereo->type = AV_STEREO3D_SIDEBYSIDE;
@@ -554,6 +604,7 @@ static void decode_postinit(H264Context *h, int setup_finished)
 
         if (fp->content_interpretation_type == 2)
             stereo->flags = AV_STEREO3D_FLAG_INVERT;
+        }
     }
 
     if (h->sei.display_orientation.present &&
@@ -565,22 +616,21 @@ static void decode_postinit(H264Context *h, int setup_finished)
         AVFrameSideData *rotation = av_frame_new_side_data(cur->f,
                                                            AV_FRAME_DATA_DISPLAYMATRIX,
                                                            sizeof(int32_t) * 9);
-        if (!rotation)
-            return;
-
-        av_display_rotation_set((int32_t *)rotation->data, angle);
-        av_display_matrix_flip((int32_t *)rotation->data,
-                               o->hflip, o->vflip);
+        if (rotation) {
+            av_display_rotation_set((int32_t *)rotation->data, angle);
+            av_display_matrix_flip((int32_t *)rotation->data,
+                                   o->hflip, o->vflip);
+        }
     }
 
     if (h->sei.afd.present) {
         AVFrameSideData *sd = av_frame_new_side_data(cur->f, AV_FRAME_DATA_AFD,
                                                      sizeof(uint8_t));
-        if (!sd)
-            return;
 
-        *sd->data = h->sei.afd.active_format_description;
-        h->sei.afd.present = 0;
+        if (sd) {
+            *sd->data = h->sei.afd.active_format_description;
+            h->sei.afd.present = 0;
+        }
     }
 
     if (h->sei.a53_caption.a53_caption) {
@@ -588,119 +638,86 @@ static void decode_postinit(H264Context *h, int setup_finished)
         AVFrameSideData *sd = av_frame_new_side_data(cur->f,
                                                      AV_FRAME_DATA_A53_CC,
                                                      a53->a53_caption_size);
-        if (!sd)
-            return;
-
-        memcpy(sd->data, a53->a53_caption, a53->a53_caption_size);
+        if (sd)
+            memcpy(sd->data, a53->a53_caption, a53->a53_caption_size);
         av_freep(&a53->a53_caption);
         a53->a53_caption_size = 0;
+        h->avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS;
     }
 
+    cur->mmco_reset = h->mmco_reset;
+    h->mmco_reset = 0;
+
     // FIXME do something with unavailable reference frames
 
     /* Sort B-frames into display order */
     if (sps->bitstream_restriction_flag ||
-        h->avctx->strict_std_compliance >= FF_COMPLIANCE_NORMAL) {
+        h->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT) {
         h->avctx->has_b_frames = FFMAX(h->avctx->has_b_frames, sps->num_reorder_frames);
     }
 
+    for (i = 0; 1; i++) {
+        if(i == MAX_DELAYED_PIC_COUNT || cur->poc < h->last_pocs[i]){
+            if(i)
+                h->last_pocs[i-1] = cur->poc;
+            break;
+        } else if(i) {
+            h->last_pocs[i-1]= h->last_pocs[i];
+        }
+    }
+    out_of_order = MAX_DELAYED_PIC_COUNT - i;
+    if(   cur->f->pict_type == AV_PICTURE_TYPE_B
+       || (h->last_pocs[MAX_DELAYED_PIC_COUNT-2] > INT_MIN && h->last_pocs[MAX_DELAYED_PIC_COUNT-1] - h->last_pocs[MAX_DELAYED_PIC_COUNT-2] > 2))
+        out_of_order = FFMAX(out_of_order, 1);
+    if (out_of_order == MAX_DELAYED_PIC_COUNT) {
+        av_log(h->avctx, AV_LOG_VERBOSE, "Invalid POC %d<%d\n", cur->poc, h->last_pocs[0]);
+        for (i = 1; i < MAX_DELAYED_PIC_COUNT; i++)
+            h->last_pocs[i] = INT_MIN;
+        h->last_pocs[0] = cur->poc;
+        cur->mmco_reset = 1;
+    } else if(h->avctx->has_b_frames < out_of_order && !sps->bitstream_restriction_flag){
+        av_log(h->avctx, AV_LOG_INFO, "Increasing reorder buffer to %d\n", out_of_order);
+        h->avctx->has_b_frames = out_of_order;
+    }
+
     pics = 0;
     while (h->delayed_pic[pics])
         pics++;
 
-    assert(pics <= MAX_DELAYED_PIC_COUNT);
+    av_assert0(pics <= MAX_DELAYED_PIC_COUNT);
 
     h->delayed_pic[pics++] = cur;
     if (cur->reference == 0)
         cur->reference = DELAYED_PIC_REF;
 
-    /* Frame reordering. This code takes pictures from coding order and sorts
-     * them by their incremental POC value into display order. It supports POC
-     * gaps, MMCO reset codes and random resets.
-     * A "display group" can start either with a IDR frame (f.key_frame = 1),
-     * and/or can be closed down with a MMCO reset code. In sequences where
-     * there is no delay, we can't detect that (since the frame was already
-     * output to the user), so we also set h->mmco_reset to detect the MMCO
-     * reset code.
-     * FIXME: if we detect insufficient delays (as per h->avctx->has_b_frames),
-     * we increase the delay between input and output. All frames affected by
-     * the lag (e.g. those that should have been output before another frame
-     * that we already returned to the user) will be dropped. This is a bug
-     * that we will fix later. */
-    for (i = 0; i < MAX_DELAYED_PIC_COUNT; i++) {
-        cnt     += out->poc < h->last_pocs[i];
-        invalid += out->poc == INT_MIN;
-    }
-    if (!h->mmco_reset && !cur->f->key_frame &&
-        cnt + invalid == MAX_DELAYED_PIC_COUNT && cnt > 0) {
-        h->mmco_reset = 2;
-        if (pics > 1)
-            h->delayed_pic[pics - 2]->mmco_reset = 2;
-    }
-    if (h->mmco_reset || cur->f->key_frame) {
-        for (i = 0; i < MAX_DELAYED_PIC_COUNT; i++)
-            h->last_pocs[i] = INT_MIN;
-        cnt     = 0;
-        invalid = MAX_DELAYED_PIC_COUNT;
-    }
     out     = h->delayed_pic[0];
     out_idx = 0;
-    for (i = 1; i < MAX_DELAYED_PIC_COUNT &&
-                h->delayed_pic[i] &&
-                !h->delayed_pic[i - 1]->mmco_reset &&
-                !h->delayed_pic[i]->f->key_frame;
+    for (i = 1; h->delayed_pic[i] &&
+                !h->delayed_pic[i]->f->key_frame &&
+                !h->delayed_pic[i]->mmco_reset;
          i++)
         if (h->delayed_pic[i]->poc < out->poc) {
             out     = h->delayed_pic[i];
             out_idx = i;
         }
     if (h->avctx->has_b_frames == 0 &&
-        (h->delayed_pic[0]->f->key_frame || h->mmco_reset))
+        (h->delayed_pic[0]->f->key_frame || h->delayed_pic[0]->mmco_reset))
         h->next_outputed_poc = INT_MIN;
-    out_of_order = !out->f->key_frame && !h->mmco_reset &&
-                   (out->poc < h->next_outputed_poc);
-
-    if (sps->bitstream_restriction_flag &&
-        h->avctx->has_b_frames >= sps->num_reorder_frames) {
-    } else if (out_of_order && pics - 1 == h->avctx->has_b_frames &&
-               h->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT) {
-        if (invalid + cnt < MAX_DELAYED_PIC_COUNT) {
-            h->avctx->has_b_frames = FFMAX(h->avctx->has_b_frames, cnt);
-        }
-    } else if (!h->avctx->has_b_frames &&
-               ((h->next_outputed_poc != INT_MIN &&
-                 out->poc > h->next_outputed_poc + 2) ||
-                cur->f->pict_type == AV_PICTURE_TYPE_B)) {
-        h->avctx->has_b_frames++;
-    }
+    out_of_order = out->poc < h->next_outputed_poc;
 
-    if (pics > h->avctx->has_b_frames) {
+    if (out_of_order || pics > h->avctx->has_b_frames) {
         out->reference &= ~DELAYED_PIC_REF;
         for (i = out_idx; h->delayed_pic[i]; i++)
             h->delayed_pic[i] = h->delayed_pic[i + 1];
     }
-    memmove(h->last_pocs, &h->last_pocs[1],
-            sizeof(*h->last_pocs) * (MAX_DELAYED_PIC_COUNT - 1));
-    h->last_pocs[MAX_DELAYED_PIC_COUNT - 1] = cur->poc;
     if (!out_of_order && pics > h->avctx->has_b_frames) {
         h->next_output_pic = out;
-        if (out->mmco_reset) {
-            if (out_idx > 0) {
-                h->next_outputed_poc                    = out->poc;
-                h->delayed_pic[out_idx - 1]->mmco_reset = out->mmco_reset;
-            } else {
-                h->next_outputed_poc = INT_MIN;
-            }
-        } else {
-            if (out_idx == 0 && pics > 1 && h->delayed_pic[0]->f->key_frame) {
-                h->next_outputed_poc = INT_MIN;
-            } else {
-                h->next_outputed_poc = out->poc;
-            }
-        }
-        h->mmco_reset = 0;
+        if (out_idx == 0 && h->delayed_pic[0] && (h->delayed_pic[0]->f->key_frame || h->delayed_pic[0]->mmco_reset)) {
+            h->next_outputed_poc = INT_MIN;
+        } else
+            h->next_outputed_poc = out->poc;
     } else {
-        av_log(h->avctx, AV_LOG_DEBUG, "no picture\n");
+        av_log(h->avctx, AV_LOG_DEBUG, "no picture %s\n", out_of_order ? "ooo" : "");
     }
 
     if (h->next_output_pic) {
@@ -725,28 +742,43 @@ static void decode_postinit(H264Context *h, int setup_finished)
  */
 static void idr(H264Context *h)
 {
+    int i;
     ff_h264_remove_all_refs(h);
     h->poc.prev_frame_num        =
-    h->poc.prev_frame_num_offset =
-    h->poc.prev_poc_msb          =
+    h->poc.prev_frame_num_offset = 0; /* ends the chained assignment begun on the line above: both fields become 0 */
+    h->poc.prev_poc_msb          = 1<<16; /* assigned separately now instead of being zeroed by the chain */
     h->poc.prev_poc_lsb          = 0;
+    for (i = 0; i < MAX_DELAYED_PIC_COUNT; i++) /* set last_pocs[0..MAX_DELAYED_PIC_COUNT-1] to INT_MIN */
+        h->last_pocs[i] = INT_MIN;
 }
 
 /* forget old pics after a seek */
 void ff_h264_flush_change(H264Context *h)
 {
-    int i;
-    for (i = 0; i < MAX_DELAYED_PIC_COUNT; i++)
-        h->last_pocs[i] = INT_MIN;
+    int i, j;
+
     h->next_outputed_poc = INT_MIN;
     h->prev_interlaced_frame = 1;
     idr(h);
-    if (h->cur_pic_ptr)
+
+    h->poc.prev_frame_num = -1; /* replaces the 0 that idr() stored just above */
+    if (h->cur_pic_ptr) {
         h->cur_pic_ptr->reference = 0;
+        for (j=i=0; h->delayed_pic[i]; i++) /* compact the NULL-terminated list in place, skipping cur_pic_ptr */
+            if (h->delayed_pic[i] != h->cur_pic_ptr)
+                h->delayed_pic[j++] = h->delayed_pic[i];
+        h->delayed_pic[j] = NULL; /* re-terminate the possibly shortened list */
+    }
+    ff_h264_unref_picture(h, &h->last_pic_for_ec);
+
     h->first_field = 0;
     ff_h264_sei_uninit(&h->sei);
     h->recovery_frame = -1;
     h->frame_recovered = 0;
+    h->current_slice = 0;
+    h->mmco_reset = 1;
+    for (i = 0; i < h->nb_slice_ctx; i++) /* zero list_count in every slice context */
+        h->slice_ctx[i].list_count = 0;
 }
 
 /* forget old pics after a seek */
@@ -770,10 +802,16 @@ static void flush_dpb(AVCodecContext *avctx)
     h->context_initialized = 0;
 }
 
+#if FF_API_CAP_VDPAU
+static const uint8_t start_code[] = { 0x00, 0x00, 0x01 };
+#endif
+
 static int get_last_needed_nal(H264Context *h)
 {
     int nals_needed = 0;
+    int first_slice = 0;
     int i;
+    int ret;
 
     for (i = 0; i < h->pkt.nb_nals; i++) {
         H2645NAL *nal = &h->pkt.nals[i];
@@ -791,22 +829,60 @@ static int get_last_needed_nal(H264Context *h)
         case NAL_DPA:
         case NAL_IDR_SLICE:
         case NAL_SLICE:
-            init_get_bits(&gb, nal->data + 1, (nal->size - 1) * 8);
-            if (!get_ue_golomb(&gb))
+            ret = init_get_bits8(&gb, nal->data + 1, (nal->size - 1));
+            if (ret < 0)
+                return ret;
+            if (!get_ue_golomb_long(&gb) ||  // first_mb_in_slice
+                !first_slice ||
+                first_slice != nal->type)
                 nals_needed = i;
+            if (!first_slice)
+                first_slice = nal->type;
         }
     }
 
     return nals_needed;
 }
 
+static void debug_green_metadata(const H264SEIGreenMetaData *gm, void *logctx)
+{
+    av_log(logctx, AV_LOG_DEBUG, "Green Metadata Info SEI message\n");
+    av_log(logctx, AV_LOG_DEBUG, "  green_metadata_type: %d\n", gm->green_metadata_type);
+
+    if (gm->green_metadata_type == 0) {
+        av_log(logctx, AV_LOG_DEBUG, "  green_metadata_period_type: %d\n", gm->period_type);
+
+        if (gm->period_type == 2)
+            av_log(logctx, AV_LOG_DEBUG, "  green_metadata_num_seconds: %d\n", gm->num_seconds);
+        else if (gm->period_type == 3)
+            av_log(logctx, AV_LOG_DEBUG, "  green_metadata_num_pictures: %d\n", gm->num_pictures);
+
+        av_log(logctx, AV_LOG_DEBUG, "  SEI GREEN Complexity Metrics: %f %f %f %f\n",
+               (float)gm->percent_non_zero_macroblocks/255,
+               (float)gm->percent_intra_coded_macroblocks/255,
+               (float)gm->percent_six_tap_filtering/255,
+               (float)gm->percent_alpha_point_deblocking_instance/255);
+
+    } else if (gm->green_metadata_type == 1) {
+        av_log(logctx, AV_LOG_DEBUG, "  xsd_metric_type: %d\n", gm->xsd_metric_type);
+
+        if (gm->xsd_metric_type == 0)
+            av_log(logctx, AV_LOG_DEBUG, "  xsd_metric_value: %f\n",
+                   (float)gm->xsd_metric_value/100);
+    }
+}
+
 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size)
 {
     AVCodecContext *const avctx = h->avctx;
     unsigned context_count = 0;
     int nals_needed = 0; ///< number of NALs that need decoding before the next frame thread starts
+    int idr_cleared=0;
     int i, ret = 0;
 
+    h->nal_unit_type= 0;
+
+    h->max_contexts = h->nb_slice_ctx;
     if (!(avctx->flags2 & AV_CODEC_FLAG2_CHUNKS)) {
         h->current_slice = 0;
         if (!h->first_field)
@@ -814,6 +890,13 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size)
         ff_h264_sei_uninit(&h->sei);
     }
 
+    if (h->nal_length_size == 4) {
+        if (buf_size > 8 && AV_RB32(buf) == 1 && AV_RB32(buf+5) > (unsigned)buf_size) {
+            h->is_avc = 0;
+        }else if(buf_size > 3 && AV_RB32(buf) > 1 && AV_RB32(buf) <= (unsigned)buf_size)
+            h->is_avc = 1;
+    }
+
     ret = ff_h2645_packet_split(&h->pkt, buf, buf_size, avctx, h->is_avc,
                                 h->nal_length_size, avctx->codec_id);
     if (ret < 0) {
@@ -824,6 +907,8 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size)
 
     if (avctx->active_thread_type & FF_THREAD_FRAME)
         nals_needed = get_last_needed_nal(h);
+    if (nals_needed < 0)
+        return nals_needed;
 
     for (i = 0; i < h->pkt.nb_nals; i++) {
         H2645NAL *nal = &h->pkt.nals[i];
@@ -834,6 +919,7 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size)
             nal->ref_idc == 0 && nal->type != NAL_SEI)
             continue;
 
+again:
         // FIXME these should stop being context-global variables
         h->nal_ref_idc   = nal->ref_idc;
         h->nal_unit_type = nal->type;
@@ -841,28 +927,56 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size)
         err = 0;
         switch (nal->type) {
         case NAL_IDR_SLICE:
+            if ((nal->data[1] & 0xFC) == 0x98) {
+                av_log(h->avctx, AV_LOG_ERROR, "Invalid inter IDR frame\n");
+                h->next_outputed_poc = INT_MIN;
+                ret = -1;
+                goto end;
+            }
             if (nal->type != NAL_IDR_SLICE) {
                 av_log(h->avctx, AV_LOG_ERROR,
                        "Invalid mix of idr and non-idr slices\n");
                 ret = -1;
                 goto end;
             }
-            idr(h); // FIXME ensure we don't lose some frames if there is reordering
+            if(!idr_cleared) {
+                if (h->current_slice && (avctx->active_thread_type & FF_THREAD_SLICE)) {
+                    av_log(h, AV_LOG_ERROR, "invalid mixed IDR / non IDR frames cannot be decoded in slice multithreading mode\n");
+                    ret = AVERROR_INVALIDDATA;
+                    goto end;
+                }
+                idr(h); // FIXME ensure we don't lose some frames if there is reordering
+            }
+            idr_cleared = 1;
+            h->has_recovery_point = 1;
         case NAL_SLICE:
             sl->gb = nal->gb;
+            if (   nals_needed >= i
+                || (!(avctx->active_thread_type & FF_THREAD_FRAME) && !context_count))
+                h->au_pps_id = -1;
 
             if ((err = ff_h264_decode_slice_header(h, sl)))
                 break;
 
-            if (h->sei.recovery_point.recovery_frame_cnt >= 0 && h->recovery_frame < 0) {
-                h->recovery_frame = (h->poc.frame_num + h->sei.recovery_point.recovery_frame_cnt) &
-                                    ((1 << h->ps.sps->log2_max_frame_num) - 1);
+            if (h->sei.recovery_point.recovery_frame_cnt >= 0) {
+                const int sei_recovery_frame_cnt = h->sei.recovery_point.recovery_frame_cnt;
+
+                if (h->poc.frame_num != sei_recovery_frame_cnt || sl->slice_type_nos != AV_PICTURE_TYPE_I)
+                    h->valid_recovery_point = 1;
+
+                if (   h->recovery_frame < 0
+                    || av_mod_uintp2(h->recovery_frame - h->poc.frame_num, h->ps.sps->log2_max_frame_num) > sei_recovery_frame_cnt) {
+                    h->recovery_frame = av_mod_uintp2(h->poc.frame_num + sei_recovery_frame_cnt, h->ps.sps->log2_max_frame_num);
+
+                    if (!h->valid_recovery_point)
+                        h->recovery_frame = h->poc.frame_num;
+                }
             }
 
-            h->cur_pic_ptr->f->key_frame |=
-                (nal->type == NAL_IDR_SLICE) || (h->sei.recovery_point.recovery_frame_cnt >= 0);
+            h->cur_pic_ptr->f->key_frame |= (nal->type == NAL_IDR_SLICE);
 
-            if (nal->type == NAL_IDR_SLICE || h->recovery_frame == h->poc.frame_num) {
+            if (nal->type == NAL_IDR_SLICE ||
+                (h->recovery_frame == h->poc.frame_num && nal->ref_idc)) {
                 h->recovery_frame         = -1;
                 h->cur_pic_ptr->recovered = 1;
             }
@@ -870,28 +984,43 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size)
             // "recovered".
             if (nal->type == NAL_IDR_SLICE)
                 h->frame_recovered |= FRAME_RECOVERED_IDR;
+#if 1
+            h->cur_pic_ptr->recovered |= h->frame_recovered;
+#else
             h->cur_pic_ptr->recovered |= !!(h->frame_recovered & FRAME_RECOVERED_IDR);
+#endif
 
             if (h->current_slice == 1) {
                 if (!(avctx->flags2 & AV_CODEC_FLAG2_CHUNKS))
                     decode_postinit(h, i >= nals_needed);
 
                 if (h->avctx->hwaccel &&
-                    (ret = h->avctx->hwaccel->start_frame(h->avctx, NULL, 0)) < 0)
-                    return ret;
+                    (ret = h->avctx->hwaccel->start_frame(h->avctx, buf, buf_size)) < 0)
+                    goto end;
+#if FF_API_CAP_VDPAU
+                if (CONFIG_H264_VDPAU_DECODER &&
+                    h->avctx->codec->capabilities & AV_CODEC_CAP_HWACCEL_VDPAU)
+                    ff_vdpau_h264_picture_start(h);
+#endif
             }
 
-            if (sl->redundant_pic_count == 0 &&
-                (avctx->skip_frame < AVDISCARD_NONREF || nal->ref_idc) &&
-                (avctx->skip_frame < AVDISCARD_BIDIR  ||
-                 sl->slice_type_nos != AV_PICTURE_TYPE_B) &&
-                (avctx->skip_frame < AVDISCARD_NONKEY ||
-                 h->cur_pic_ptr->f->key_frame) &&
-                avctx->skip_frame < AVDISCARD_ALL) {
+            if (sl->redundant_pic_count == 0) {
                 if (avctx->hwaccel) {
-                    ret = avctx->hwaccel->decode_slice(avctx, nal->raw_data, nal->raw_size);
+                    ret = avctx->hwaccel->decode_slice(avctx,
+                                                       nal->raw_data,
+                                                       nal->raw_size);
                     if (ret < 0)
-                        return ret;
+                        goto end;
+#if FF_API_CAP_VDPAU
+                } else if (CONFIG_H264_VDPAU_DECODER &&
+                           h->avctx->codec->capabilities & AV_CODEC_CAP_HWACCEL_VDPAU) {
+                    ff_vdpau_add_data_chunk(h->cur_pic_ptr->f->data[0],
+                                            start_code,
+                                            sizeof(start_code));
+                    ff_vdpau_add_data_chunk(h->cur_pic_ptr->f->data[0],
+                                            nal->raw_data,
+                                            nal->raw_size);
+#endif
                 } else
                     context_count++;
             }
@@ -900,19 +1029,32 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size)
         case NAL_DPB:
         case NAL_DPC:
             avpriv_request_sample(avctx, "data partitioning");
-            ret = AVERROR(ENOSYS);
-            goto end;
             break;
         case NAL_SEI:
             ret = ff_h264_sei_decode(&h->sei, &nal->gb, &h->ps, avctx);
+            h->has_recovery_point = h->has_recovery_point || h->sei.recovery_point.recovery_frame_cnt != -1;
+            if (avctx->debug & FF_DEBUG_GREEN_MD)
+                debug_green_metadata(&h->sei.green_metadata, h->avctx);
+#if FF_API_AFD
+FF_DISABLE_DEPRECATION_WARNINGS
+            h->avctx->dtg_active_format = h->sei.afd.active_format_description;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif /* FF_API_AFD */
             if (ret < 0 && (h->avctx->err_recognition & AV_EF_EXPLODE))
                 goto end;
             break;
-        case NAL_SPS:
-            ret = ff_h264_decode_seq_parameter_set(&nal->gb, avctx, &h->ps);
-            if (ret < 0 && (h->avctx->err_recognition & AV_EF_EXPLODE))
-                goto end;
+        case NAL_SPS: {
+            GetBitContext tmp_gb = nal->gb;
+            if (ff_h264_decode_seq_parameter_set(&tmp_gb, avctx, &h->ps, 0) >= 0)
+                break;
+            av_log(h->avctx, AV_LOG_DEBUG,
+                   "SPS decoding failure, trying again with the complete NAL\n");
+            init_get_bits8(&tmp_gb, nal->raw_data + 1, nal->raw_size - 1);
+            if (ff_h264_decode_seq_parameter_set(&tmp_gb, avctx, &h->ps, 0) >= 0)
+                break;
+            ff_h264_decode_seq_parameter_set(&nal->gb, avctx, &h->ps, 1);
             break;
+        }
         case NAL_PPS:
             ret = ff_h264_decode_picture_parameter_set(&nal->gb, avctx, &h->ps,
                                                        nal->size_bits);
@@ -933,16 +1075,27 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size)
                    nal->type, nal->size_bits);
         }
 
-        if (context_count == h->nb_slice_ctx) {
+        if (context_count == h->max_contexts) {
             ret = ff_h264_execute_decode_slices(h, context_count);
             if (ret < 0 && (h->avctx->err_recognition & AV_EF_EXPLODE))
                 goto end;
             context_count = 0;
         }
 
-        if (err < 0) {
-            av_log(h->avctx, AV_LOG_ERROR, "decode_slice_header error\n");
+        if (err < 0 || err == SLICE_SKIPED) {
+            if (err < 0)
+                av_log(h->avctx, AV_LOG_ERROR, "decode_slice_header error\n");
             sl->ref_count[0] = sl->ref_count[1] = sl->list_count = 0;
+        } else if (err == SLICE_SINGLETHREAD) {
+            if (context_count > 0) {
+                ret = ff_h264_execute_decode_slices(h, context_count);
+                if (ret < 0 && (h->avctx->err_recognition & AV_EF_EXPLODE))
+                    goto end;
+                context_count = 0;
+            }
+            /* Slice could not be decoded in parallel mode, restart. */
+            sl               = &h->slice_ctx[0];
+            goto again;
         }
     }
     if (context_count) {
@@ -953,6 +1106,50 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size)
 
     ret = 0;
 end:
+
+#if CONFIG_ERROR_RESILIENCE
+    /*
+     * FIXME: Error handling code does not seem to support interlaced
+     * when slices span multiple rows
+     * The ff_er_add_slice calls don't work right for bottom
+     * fields; they cause massive erroneous error concealing
+     * Error marking covers both fields (top and bottom).
+     * This causes a mismatched s->error_count
+     * and a bad error table. Further, the error count goes to
+     * INT_MAX when called for bottom field, because mb_y is
+     * past end by one (callers fault) and resync_mb_y != 0
+     * causes problems for the first MB line, too.
+     */
+    if (!FIELD_PICTURE(h) && h->current_slice &&
+        h->ps.sps == (const SPS*)h->ps.sps_list[h->ps.pps->sps_id]->data &&
+        h->enable_er) {
+
+        H264SliceContext *sl = h->slice_ctx;
+        int use_last_pic = h->last_pic_for_ec.f->buf[0] && !sl->ref_count[0];
+
+        ff_h264_set_erpic(&sl->er.cur_pic, h->cur_pic_ptr);
+
+        if (use_last_pic) {
+            ff_h264_set_erpic(&sl->er.last_pic, &h->last_pic_for_ec);
+            sl->ref_list[0][0].parent = &h->last_pic_for_ec;
+            memcpy(sl->ref_list[0][0].data, h->last_pic_for_ec.f->data, sizeof(sl->ref_list[0][0].data));
+            memcpy(sl->ref_list[0][0].linesize, h->last_pic_for_ec.f->linesize, sizeof(sl->ref_list[0][0].linesize));
+            sl->ref_list[0][0].reference = h->last_pic_for_ec.reference;
+        } else if (sl->ref_count[0]) {
+            ff_h264_set_erpic(&sl->er.last_pic, sl->ref_list[0][0].parent);
+        } else
+            ff_h264_set_erpic(&sl->er.last_pic, NULL);
+
+        if (sl->ref_count[1])
+            ff_h264_set_erpic(&sl->er.next_pic, sl->ref_list[1][0].parent);
+
+        sl->er.ref_count = sl->ref_count[0];
+
+        ff_er_frame_end(&sl->er);
+        if (use_last_pic)
+            memset(&sl->ref_list[0][0], 0, sizeof(sl->ref_list[0][0]));
+    }
+#endif /* CONFIG_ERROR_RESILIENCE */
     /* clean up */
     if (h->cur_pic_ptr && !h->droppable) {
         ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX,
@@ -975,26 +1172,62 @@ static int get_consumed_bytes(int pos, int buf_size)
     return pos;
 }
 
-static int output_frame(H264Context *h, AVFrame *dst, AVFrame *src)
+static int output_frame(H264Context *h, AVFrame *dst, H264Picture *srcp)
 {
+    AVFrame *src = srcp->f;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(src->format);
     int i;
     int ret = av_frame_ref(dst, src);
     if (ret < 0)
         return ret;
 
-    if (!h->ps.sps || !h->ps.sps->crop)
+    av_dict_set(&dst->metadata, "stereo_mode", ff_h264_sei_stereo_mode(&h->sei.frame_packing), 0);
+
+    h->backup_width   = h->avctx->width;
+    h->backup_height  = h->avctx->height;
+    h->backup_pix_fmt = h->avctx->pix_fmt;
+
+    h->avctx->width   = dst->width;
+    h->avctx->height  = dst->height;
+    h->avctx->pix_fmt = dst->format;
+
+    if (srcp->sei_recovery_frame_cnt == 0)
+        dst->key_frame = 1;
+    if (!srcp->crop)
         return 0;
 
-    for (i = 0; i < 3; i++) {
-        int hshift = (i > 0) ? h->chroma_x_shift : 0;
-        int vshift = (i > 0) ? h->chroma_y_shift : 0;
-        int off    = ((h->ps.sps->crop_left >> hshift) << h->pixel_shift) +
-                     (h->ps.sps->crop_top >> vshift) * dst->linesize[i];
+    for (i = 0; i < desc->nb_components; i++) {
+        int hshift = (i > 0) ? desc->log2_chroma_w : 0;
+        int vshift = (i > 0) ? desc->log2_chroma_h : 0;
+        int off    = ((srcp->crop_left >> hshift) << h->pixel_shift) +
+                      (srcp->crop_top  >> vshift) * dst->linesize[i];
         dst->data[i] += off;
     }
     return 0;
 }
 
+static int is_extra(const uint8_t *buf, int buf_size)
+{
+    int cnt= buf[5]&0x1f;
+    const uint8_t *p= buf+6;
+    while(cnt--){
+        int nalsize= AV_RB16(p) + 2;
+        if(nalsize > buf_size - (p-buf) || (p[2] & 0x9F) != 7)
+            return 0;
+        p += nalsize;
+    }
+    cnt = *(p++);
+    if(!cnt)
+        return 0;
+    while(cnt--){
+        int nalsize= AV_RB16(p) + 2;
+        if(nalsize > buf_size - (p-buf) || (p[2] & 0x9F) != 8)
+            return 0;
+        p += nalsize;
+    }
+    return 1;
+}
+
 static int h264_decode_frame(AVCodecContext *avctx, void *data,
                              int *got_frame, AVPacket *avpkt)
 {
@@ -1003,18 +1236,34 @@ static int h264_decode_frame(AVCodecContext *avctx, void *data,
     H264Context *h     = avctx->priv_data;
     AVFrame *pict      = data;
     int buf_index      = 0;
+    H264Picture *out;
+    int i, out_idx;
     int ret;
 
     h->flags = avctx->flags;
     h->setup_finished = 0;
 
+    if (h->backup_width != -1) {
+        avctx->width    = h->backup_width;
+        h->backup_width = -1;
+    }
+    if (h->backup_height != -1) {
+        avctx->height    = h->backup_height;
+        h->backup_height = -1;
+    }
+    if (h->backup_pix_fmt != AV_PIX_FMT_NONE) {
+        avctx->pix_fmt    = h->backup_pix_fmt;
+        h->backup_pix_fmt = AV_PIX_FMT_NONE;
+    }
+
+    ff_h264_unref_picture(h, &h->last_pic_for_ec);
+
     /* end of stream, output what is still in the buffers */
-out:
     if (buf_size == 0) {
-        H264Picture *out;
-        int i, out_idx;
+ out:
 
         h->cur_pic_ptr = NULL;
+        h->first_field = 0;
 
         // FIXME factorize this with the output code below
         out     = h->delayed_pic[0];
@@ -1033,7 +1282,8 @@ out:
             h->delayed_pic[i] = h->delayed_pic[i + 1];
 
         if (out) {
-            ret = output_frame(h, pict, out->f);
+            out->reference &= ~DELAYED_PIC_REF;
+            ret = output_frame(h, pict, out);
             if (ret < 0)
                 return ret;
             *got_frame = 1;
@@ -1041,19 +1291,34 @@ out:
 
         return buf_index;
     }
+    if (h->is_avc && av_packet_get_side_data(avpkt, AV_PKT_DATA_NEW_EXTRADATA, NULL)) {
+        int side_size;
+        uint8_t *side = av_packet_get_side_data(avpkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size);
+        if (is_extra(side, side_size))
+            ff_h264_decode_extradata(side, side_size,
+                                     &h->ps, &h->is_avc, &h->nal_length_size,
+                                     avctx->err_recognition, avctx);
+    }
+    if(h->is_avc && buf_size >= 9 && buf[0]==1 && buf[2]==0 && (buf[4]&0xFC)==0xFC && (buf[5]&0x1F) && buf[8]==0x67){
+        if (is_extra(buf, buf_size))
+            return ff_h264_decode_extradata(buf, buf_size,
+                                            &h->ps, &h->is_avc, &h->nal_length_size,
+                                            avctx->err_recognition, avctx);
+    }
 
     buf_index = decode_nal_units(h, buf, buf_size);
     if (buf_index < 0)
         return AVERROR_INVALIDDATA;
 
     if (!h->cur_pic_ptr && h->nal_unit_type == NAL_END_SEQUENCE) {
-        buf_size = 0;
+        av_assert0(buf_index <= buf_size);
         goto out;
     }
 
     if (!(avctx->flags2 & AV_CODEC_FLAG2_CHUNKS) && !h->cur_pic_ptr) {
-        if (avctx->skip_frame >= AVDISCARD_NONREF)
-            return 0;
+        if (avctx->skip_frame >= AVDISCARD_NONREF ||
+            buf_size >= 4 && !memcmp("Q264", buf, 4))
+            return buf_size;
         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
         return AVERROR_INVALIDDATA;
     }
@@ -1063,22 +1328,58 @@ out:
         if (avctx->flags2 & AV_CODEC_FLAG2_CHUNKS)
             decode_postinit(h, 1);
 
-        ff_h264_field_end(h, &h->slice_ctx[0], 0);
+        if ((ret = ff_h264_field_end(h, &h->slice_ctx[0], 0)) < 0)
+            return ret;
 
+        /* Wait for second field. */
         *got_frame = 0;
         if (h->next_output_pic && ((avctx->flags & AV_CODEC_FLAG_OUTPUT_CORRUPT) ||
+                                   (avctx->flags2 & AV_CODEC_FLAG2_SHOW_ALL) ||
                                    h->next_output_pic->recovered)) {
             if (!h->next_output_pic->recovered)
                 h->next_output_pic->f->flags |= AV_FRAME_FLAG_CORRUPT;
 
-            ret = output_frame(h, pict, h->next_output_pic->f);
+            if (!h->avctx->hwaccel &&
+                 (h->next_output_pic->field_poc[0] == INT_MAX ||
+                  h->next_output_pic->field_poc[1] == INT_MAX)
+            ) {
+                int p;
+                AVFrame *f = h->next_output_pic->f;
+                int field = h->next_output_pic->field_poc[0] == INT_MAX;
+                uint8_t *dst_data[4];
+                int linesizes[4];
+                const uint8_t *src_data[4];
+
+                av_log(h->avctx, AV_LOG_DEBUG, "Duplicating field %d to fill missing\n", field);
+
+                for (p = 0; p<4; p++) {
+                    dst_data[p] = f->data[p] + (field^1)*f->linesize[p];
+                    src_data[p] = f->data[p] +  field   *f->linesize[p];
+                    linesizes[p] = 2*f->linesize[p];
+                }
+
+                av_image_copy(dst_data, linesizes, src_data, linesizes,
+                              f->format, f->width, f->height>>1);
+            }
+
+            ret = output_frame(h, pict, h->next_output_pic);
             if (ret < 0)
                 return ret;
             *got_frame = 1;
+            if (CONFIG_MPEGVIDEO) {
+                ff_print_debug_info2(h->avctx, pict, NULL,
+                                    h->next_output_pic->mb_type,
+                                    h->next_output_pic->qscale_table,
+                                    h->next_output_pic->motion_val,
+                                    NULL,
+                                    h->mb_width, h->mb_height, h->mb_stride, 1);
+            }
         }
     }
 
-    assert(pict->buf[0] || !*got_frame);
+    av_assert0(pict->buf[0] || !*got_frame);
+
+    ff_h264_unref_picture(h, &h->last_pic_for_ec);
 
     return get_consumed_bytes(buf_index, buf_size);
 }
@@ -1086,12 +1387,14 @@ out:
 #define OFFSET(x) offsetof(H264Context, x)
 #define VD AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM
 static const AVOption h264_options[] = {
-    { "enable_er", "Enable error resilience on damaged frames (unsafe)", OFFSET(enable_er), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VD },
+    {"is_avc", "is avc", offsetof(H264Context, is_avc), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, 0},
+    {"nal_length_size", "nal_length_size", offsetof(H264Context, nal_length_size), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 4, 0},
+    { "enable_er", "Enable error resilience on damaged frames (unsafe)", OFFSET(enable_er), AV_OPT_TYPE_BOOL, { .i64 = -1 }, -1, 1, VD },
     { NULL },
 };
 
 static const AVClass h264_class = {
-    .class_name = "h264",
+    .class_name = "H264 Decoder",
     .item_name  = av_default_item_name,
     .option     = h264_options,
     .version    = LIBAVUTIL_VERSION_INT,
@@ -1116,3 +1419,29 @@ AVCodec ff_h264_decoder = {
     .profiles              = NULL_IF_CONFIG_SMALL(ff_h264_profiles),
     .priv_class            = &h264_class,
 };
+
+#if CONFIG_H264_VDPAU_DECODER && FF_API_VDPAU
+static const AVClass h264_vdpau_class = {
+    .class_name = "H264 VDPAU Decoder",
+    .item_name  = av_default_item_name,
+    .option     = h264_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_h264_vdpau_decoder = {
+    .name           = "h264_vdpau",
+    .long_name      = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10 (VDPAU acceleration)"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_H264,
+    .priv_data_size = sizeof(H264Context),
+    .init           = ff_h264_decode_init,
+    .close          = h264_decode_end,
+    .decode         = h264_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_HWACCEL_VDPAU,
+    .flush          = flush_dpb,
+    .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_VDPAU_H264,
+                                                     AV_PIX_FMT_NONE},
+    .profiles       = NULL_IF_CONFIG_SMALL(ff_h264_profiles),
+    .priv_class     = &h264_vdpau_class,
+};
+#endif
diff --git a/libavcodec/h264.h b/libavcodec/h264.h
index bfbcc81..efe3555 100644
--- a/libavcodec/h264.h
+++ b/libavcodec/h264.h
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -48,7 +48,7 @@
 #include "rectangle.h"
 #include "videodsp.h"
 
-#define H264_MAX_PICTURE_COUNT 32
+#define H264_MAX_PICTURE_COUNT 36
 
 #define MAX_SPS_COUNT          32
 #define MAX_PPS_COUNT         256
@@ -57,6 +57,8 @@
 
 #define MAX_DELAYED_PIC_COUNT  16
 
+#define MAX_MBPAIR_SIZE (256*1024) // a tighter bound could be calculated if someone cares about a few bytes
+
 /* Compiling in interlaced support reduces the speed
  * of progressive decoding by about 2%. */
 #define ALLOW_INTERLACE
@@ -70,17 +72,17 @@
 #define MAX_SLICES 32
 
 #ifdef ALLOW_INTERLACE
-#define MB_MBAFF(h)    h->mb_mbaff
-#define MB_FIELD(h)    h->mb_field_decoding_flag
-#define FRAME_MBAFF(h) h->mb_aff_frame
-#define FIELD_PICTURE(h) (h->picture_structure != PICT_FRAME)
+#define MB_MBAFF(h)    (h)->mb_mbaff
+#define MB_FIELD(sl)  (sl)->mb_field_decoding_flag
+#define FRAME_MBAFF(h) (h)->mb_aff_frame
+#define FIELD_PICTURE(h) ((h)->picture_structure != PICT_FRAME)
 #define LEFT_MBS 2
 #define LTOP     0
 #define LBOT     1
 #define LEFT(i)  (i)
 #else
 #define MB_MBAFF(h)      0
-#define MB_FIELD(h)      0
+#define MB_FIELD(sl)     0
 #define FRAME_MBAFF(h)   0
 #define FIELD_PICTURE(h) 0
 #undef  IS_INTERLACED
@@ -93,11 +95,12 @@
 #define FIELD_OR_MBAFF_PICTURE(h) (FRAME_MBAFF(h) || FIELD_PICTURE(h))
 
 #ifndef CABAC
-#define CABAC(h) h->ps.pps->cabac
+#define CABAC(h) (h)->ps.pps->cabac
 #endif
 
-#define CHROMA422(h) (h->ps.sps->chroma_format_idc == 2)
-#define CHROMA444(h) (h->ps.sps->chroma_format_idc == 3)
+#define CHROMA(h)    ((h)->ps.sps->chroma_format_idc)
+#define CHROMA422(h) ((h)->ps.sps->chroma_format_idc == 2)
+#define CHROMA444(h) ((h)->ps.sps->chroma_format_idc == 3)
 
 #define EXTENDED_SAR       255
 
@@ -106,7 +109,7 @@
 #define IS_REF0(a)         ((a) & MB_TYPE_REF0)
 #define IS_8x8DCT(a)       ((a) & MB_TYPE_8x8DCT)
 
-#define QP_MAX_NUM (51 + 2 * 6)           // The maximum supported qp
+#define QP_MAX_NUM (51 + 6*6)           // The maximum supported qp
 
 /* NAL unit types */
 enum {
@@ -187,6 +190,8 @@ typedef struct SPS {
     int bit_depth_chroma;                 ///< bit_depth_chroma_minus8 + 8
     int residual_color_transform_flag;    ///< residual_colour_transform_flag
     int constraint_set_flags;             ///< constraint_set[0-3]_flag
+    uint8_t data[4096];
+    size_t data_size;
 } SPS;
 
 /**
@@ -210,8 +215,10 @@ typedef struct PPS {
     int transform_8x8_mode;         ///< transform_8x8_mode_flag
     uint8_t scaling_matrix4[6][16];
     uint8_t scaling_matrix8[6][64];
-    uint8_t chroma_qp_table[2][64]; ///< pre-scaled (with chroma_qp_index_offset) version of qp_table
+    uint8_t chroma_qp_table[2][QP_MAX_NUM+1];  ///< pre-scaled (with chroma_qp_index_offset) version of qp_table
     int chroma_qp_diff;
+    uint8_t data[4096];
+    size_t data_size;
 
     uint32_t dequant4_buffer[6][QP_MAX_NUM + 1][16];
     uint32_t dequant8_buffer[6][QP_MAX_NUM + 1][64];
@@ -223,6 +230,8 @@ typedef struct H264ParamSets {
     AVBufferRef *sps_list[MAX_SPS_COUNT];
     AVBufferRef *pps_list[MAX_PPS_COUNT];
 
+    AVBufferRef *pps_ref;
+    AVBufferRef *sps_ref;
     /* currently active parameters sets */
     const PPS *pps;
     // FIXME this should properly be const
@@ -278,13 +287,19 @@ typedef struct H264Picture {
     int pic_id;             /**< pic_num (short -> no wrap version of pic_num,
                                  pic_num & max_pic_num; long -> long_pic_num) */
     int long_ref;           ///< 1->long term reference 0->short term reference
-    int ref_poc[2][2][32];  ///< POCs of the frames used as reference (FIXME need per slice)
+    int ref_poc[2][2][32];  ///< POCs of the frames/fields used as reference (FIXME need per slice)
     int ref_count[2][2];    ///< number of entries in ref_poc         (FIXME need per slice)
     int mbaff;              ///< 1 -> MBAFF frame 0-> not MBAFF
     int field_picture;      ///< whether or not picture was encoded in separate fields
 
     int reference;
     int recovered;          ///< picture at IDR or recovery point + recovery count
+    int invalid_gap;
+    int sei_recovery_frame_cnt;
+
+    int crop;
+    int crop_left;
+    int crop_top;
 } H264Picture;
 
 typedef struct H264Ref {
@@ -450,6 +465,7 @@ typedef struct H264Context {
     H264Picture DPB[H264_MAX_PICTURE_COUNT];
     H264Picture *cur_pic_ptr;
     H264Picture cur_pic;
+    H264Picture last_pic_for_ec;
 
     H264SliceContext *slice_ctx;
     int            nb_slice_ctx;
@@ -462,6 +478,14 @@ typedef struct H264Context {
     int width, height;
     int chroma_x_shift, chroma_y_shift;
 
+    /**
+     * Backup frame properties: needed, because they can be different
+     * between returned frame and last decoded frame.
+     **/
+    int backup_width;
+    int backup_height;
+    enum AVPixelFormat backup_pix_fmt;
+
     int droppable;
     int coded_picture_number;
 
@@ -492,6 +516,11 @@ typedef struct H264Context {
     uint32_t *mb2br_xy;
     int b_stride;       // FIXME use s->b4_stride
 
+
+    unsigned current_sps_id; ///< id of the current SPS
+
+    int au_pps_id; ///< pps_id of current access unit
+
     uint16_t *slice_table;      ///< slice_table_base + 2*mb_stride + 1
 
     // interlacing specific flags
@@ -515,12 +544,12 @@ typedef struct H264Context {
     uint8_t field_scan[16];
     uint8_t field_scan8x8[64];
     uint8_t field_scan8x8_cavlc[64];
-    const uint8_t *zigzag_scan_q0;
-    const uint8_t *zigzag_scan8x8_q0;
-    const uint8_t *zigzag_scan8x8_cavlc_q0;
-    const uint8_t *field_scan_q0;
-    const uint8_t *field_scan8x8_q0;
-    const uint8_t *field_scan8x8_cavlc_q0;
+    uint8_t zigzag_scan_q0[16];
+    uint8_t zigzag_scan8x8_q0[64];
+    uint8_t zigzag_scan8x8_cavlc_q0[64];
+    uint8_t field_scan_q0[16];
+    uint8_t field_scan8x8_q0[64];
+    uint8_t field_scan8x8_cavlc_q0[64];
 
     int mb_y;
     int mb_height, mb_width;
@@ -558,6 +587,7 @@ typedef struct H264Context {
      */
     int max_pic_num;
 
+    H264Ref default_ref[2];
     H264Picture *short_ref[32];
     H264Picture *long_ref[32];
     H264Picture *delayed_pic[MAX_DELAYED_PIC_COUNT + 2]; // FIXME size?
@@ -584,6 +614,20 @@ typedef struct H264Context {
      */
     int current_slice;
 
+    /**
+     * Max number of threads / contexts.
+     * This is equal to AVCodecContext.thread_count unless
+     * multithreaded decoding is impossible, in which case it is
+     * reduced to 1.
+     */
+    int max_contexts;
+
+    /**
+     *  1 if the single thread fallback warning has already been
+     *  displayed, 0 otherwise.
+     */
+    int single_decode_warning;
+
     /** @} */
 
     /**
@@ -595,6 +639,11 @@ typedef struct H264Context {
     int prev_interlaced_frame;
 
     /**
+     * Whether the SEI recovery points look valid.
+     */
+    int valid_recovery_point;
+
+    /**
      * recovery_frame is the frame_num at which the next frame should
      * be fully constructed.
      *
@@ -615,12 +664,20 @@ typedef struct H264Context {
 
     int frame_recovered;    ///< Initial frame has been completely recovered
 
-    /* for frame threading, this is set to 1
+    int has_recovery_point;
+
+    int missing_fields;
+
+    /* for frame threading, this is set to 1
      * after finish_setup() has been called, so we cannot modify
      * some context properties (which are supposed to stay constant between
      * slices) anymore */
     int setup_finished;
 
+    int cur_chroma_format_idc;
+    int cur_bit_depth_luma;
+    int16_t slice_row[MAX_SLICES]; ///< to detect when MAX_SLICES is too low
+
     int enable_er;
 
     H264SEIContext sei;
@@ -635,10 +692,16 @@ typedef struct H264Context {
 extern const uint16_t ff_h264_mb_sizes[4];
 
 /**
+ * Uninit H264 param sets structure.
+ */
+
+void ff_h264_ps_uninit(H264ParamSets *ps);
+
+/**
  * Decode SPS
  */
 int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
-                                     H264ParamSets *ps);
+                                     H264ParamSets *ps, int ignore_truncation);
 
 /**
  * Decode PPS
@@ -657,7 +720,7 @@ int ff_h264_get_slice_type(const H264SliceContext *sl);
  */
 int ff_h264_alloc_tables(H264Context *h);
 
-int ff_h264_decode_ref_pic_list_reordering(const H264Context *h, H264SliceContext *sl);
+int ff_h264_decode_ref_pic_list_reordering(H264Context *h, H264SliceContext *sl);
 void ff_h264_fill_mbaff_ref_list(H264SliceContext *sl);
 void ff_h264_remove_all_refs(H264Context *h);
 
@@ -689,8 +752,6 @@ int ff_h264_decode_mb_cabac(const H264Context *h, H264SliceContext *sl);
 
 void ff_h264_init_cabac_states(const H264Context *h, H264SliceContext *sl);
 
-void ff_h264_init_dequant_tables(H264Context *h);
-
 void ff_h264_direct_dist_scale_factor(const H264Context *const h, H264SliceContext *sl);
 void ff_h264_direct_ref_list_init(const H264Context *const h, H264SliceContext *sl);
 void ff_h264_pred_direct_motion(const H264Context *const h, H264SliceContext *sl,
@@ -753,7 +814,7 @@ static const uint8_t scan8[16 * 3 + 3] = {
     0 +  0 * 8, 0 +  5 * 8, 0 + 10 * 8
 };
 
-static av_always_inline uint32_t pack16to32(int a, int b)
+static av_always_inline uint32_t pack16to32(unsigned a, unsigned b)
 {
 #if HAVE_BIGENDIAN
     return (b & 0xFFFF) + (a << 16);
@@ -762,7 +823,7 @@ static av_always_inline uint32_t pack16to32(int a, int b)
 #endif
 }
 
-static av_always_inline uint16_t pack8to16(int a, int b)
+static av_always_inline uint16_t pack8to16(unsigned a, unsigned b)
 {
 #if HAVE_BIGENDIAN
     return (b & 0xFF) + (a << 8);
@@ -909,6 +970,16 @@ static av_always_inline int get_dct8x8_allowed(const H264Context *h, H264SliceCo
                   0x0001000100010001ULL));
 }
 
+static inline int find_start_code(const uint8_t *buf, int buf_size,
+                           int buf_index, int next_avc)
+{
+    uint32_t state = -1;
+    /* Search [buf + buf_index, buf + next_avc + 1) and convert the returned pointer to an index into buf, minus one (NOTE(review): presumably the helper returns a position just past the match -- confirm against avpriv_find_start_code()). */
+    buf_index = avpriv_find_start_code(buf + buf_index, buf + next_avc + 1, &state) - buf - 1;
+    /* The result is capped at buf_size. */
+    return FFMIN(buf_index, buf_size);
+}
+
 int ff_h264_field_end(H264Context *h, H264SliceContext *sl, int in_setup);
 
 int ff_h264_ref_picture(H264Context *h, H264Picture *dst, H264Picture *src);
@@ -919,6 +990,9 @@ int ff_h264_slice_context_init(H264Context *h, H264SliceContext *sl);
 void ff_h264_draw_horiz_band(const H264Context *h, H264SliceContext *sl, int y, int height);
 
 int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl);
+#define SLICE_SINGLETHREAD 1
+#define SLICE_SKIPED 2
+
 int ff_h264_execute_decode_slices(H264Context *h, unsigned context_count);
 int ff_h264_update_thread_context(AVCodecContext *dst,
                                   const AVCodecContext *src);
@@ -927,4 +1001,6 @@ void ff_h264_flush_change(H264Context *h);
 
 void ff_h264_free_tables(H264Context *h);
 
+void ff_h264_set_erpic(ERPicture *dst, H264Picture *src);
+
 #endif /* AVCODEC_H264_H */
diff --git a/libavcodec/h2645_parse.c b/libavcodec/h2645_parse.c
index defe001..9979b63 100644
--- a/libavcodec/h2645_parse.c
+++ b/libavcodec/h2645_parse.c
@@ -1,20 +1,20 @@
 /*
  * H.264/HEVC common parsing code
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
 #include "libavutil/intreadwrite.h"
 #include "libavutil/mem.h"
 
+#include "hevc.h"
 #include "h2645_parse.h"
 
 int ff_h2645_extract_rbsp(const uint8_t *src, int length,
@@ -34,9 +35,10 @@ int ff_h2645_extract_rbsp(const uint8_t *src, int length,
     int i, si, di;
     uint8_t *dst;
 
+    nal->skipped_bytes = 0;
 #define STARTCODE_TEST                                                  \
         if (i + 2 < length && src[i + 1] == 0 && src[i + 2] <= 3) {     \
-            if (src[i + 2] != 3) {                                      \
+            if (src[i + 2] != 3 && src[i + 2] != 0) {                   \
                 /* startcode, so we must be past the end */             \
                 length = i;                                             \
             }                                                           \
@@ -101,12 +103,28 @@ int ff_h2645_extract_rbsp(const uint8_t *src, int length,
         if (src[si + 2] > 3) {
             dst[di++] = src[si++];
             dst[di++] = src[si++];
-        } else if (src[si] == 0 && src[si + 1] == 0) {
+        } else if (src[si] == 0 && src[si + 1] == 0 && src[si + 2] != 0) {
             if (src[si + 2] == 3) { // escape
                 dst[di++] = 0;
                 dst[di++] = 0;
                 si       += 3;
 
+                if (nal->skipped_bytes_pos) {
+                    nal->skipped_bytes++;
+                    if (nal->skipped_bytes_pos_size < nal->skipped_bytes) {
+                        nal->skipped_bytes_pos_size *= 2;
+                        av_assert0(nal->skipped_bytes_pos_size >= nal->skipped_bytes);
+                        av_reallocp_array(&nal->skipped_bytes_pos,
+                                nal->skipped_bytes_pos_size,
+                                sizeof(*nal->skipped_bytes_pos));
+                        if (!nal->skipped_bytes_pos) {
+                            nal->skipped_bytes_pos_size = 0;
+                            return AVERROR(ENOMEM);
+                        }
+                    }
+                    if (nal->skipped_bytes_pos)
+                        nal->skipped_bytes_pos[nal->skipped_bytes-1] = di - 1;
+                }
                 continue;
             } else // next start code
                 goto nsc;
@@ -127,6 +145,38 @@ nsc:
     return si;
 }
 
+static const char *nal_unit_name(int nal_type)
+{
+    switch(nal_type) { /* translate a NAL_* unit type constant into a static string used in the debug log */
+    case NAL_TRAIL_N    : return "TRAIL_N";
+    case NAL_TRAIL_R    : return "TRAIL_R";
+    case NAL_TSA_N      : return "TSA_N";
+    case NAL_TSA_R      : return "TSA_R";
+    case NAL_STSA_N     : return "STSA_N";
+    case NAL_STSA_R     : return "STSA_R";
+    case NAL_RADL_N     : return "RADL_N";
+    case NAL_RADL_R     : return "RADL_R";
+    case NAL_RASL_N     : return "RASL_N";
+    case NAL_RASL_R     : return "RASL_R";
+    case NAL_BLA_W_LP   : return "BLA_W_LP";
+    case NAL_BLA_W_RADL : return "BLA_W_RADL";
+    case NAL_BLA_N_LP   : return "BLA_N_LP";
+    case NAL_IDR_W_RADL : return "IDR_W_RADL";
+    case NAL_IDR_N_LP   : return "IDR_N_LP";
+    case NAL_CRA_NUT    : return "CRA_NUT";
+    case NAL_VPS        : return "VPS";
+    case NAL_SPS        : return "SPS";
+    case NAL_PPS        : return "PPS";
+    case NAL_AUD        : return "AUD";
+    case NAL_EOS_NUT    : return "EOS_NUT";
+    case NAL_EOB_NUT    : return "EOB_NUT";
+    case NAL_FD_NUT     : return "FD_NUT";
+    case NAL_SEI_PREFIX : return "SEI_PREFIX";
+    case NAL_SEI_SUFFIX : return "SEI_SUFFIX";
+    default : return "?"; /* any value not listed above */
+    }
+}
+
 static int get_bit_length(H2645NAL *nal, int skip_trailing_zeros)
 {
     int size = nal->size;
@@ -147,7 +197,7 @@ static int get_bit_length(H2645NAL *nal, int skip_trailing_zeros)
     /* remove the stop bit and following trailing zeros,
      * or nothing for damaged bitstreams */
     if (v)
-        size -= av_ctz(v) + 1;
+        size -= ff_ctz(v) + 1;
 
     return size;
 }
@@ -172,8 +222,8 @@ static int hevc_parse_nal_header(H2645NAL *nal, void *logctx)
         return AVERROR_INVALIDDATA;
 
     av_log(logctx, AV_LOG_DEBUG,
-           "nal_unit_type: %d, nuh_layer_id: %dtemporal_id: %d\n",
-           nal->type, nuh_layer_id, nal->temporal_id);
+           "nal_unit_type: %d(%s), nuh_layer_id: %d, temporal_id: %d\n",
+           nal->type, nal_unit_name(nal->type), nuh_layer_id, nal->temporal_id);
 
     return nuh_layer_id == 0;
 }
@@ -200,6 +250,7 @@ int ff_h2645_packet_split(H2645Packet *pkt, const uint8_t *buf, int length,
                           enum AVCodecID codec_id)
 {
     int consumed, ret = 0;
+    const uint8_t *next_avc = is_nalff ? buf : buf + length;
 
     pkt->nb_nals = 0;
     while (length >= 4) {
@@ -207,7 +258,7 @@ int ff_h2645_packet_split(H2645Packet *pkt, const uint8_t *buf, int length,
         int extract_length = 0;
         int skip_trailing_zeros = 1;
 
-        if (is_nalff) {
+        if (buf >= next_avc) {
             int i;
             for (i = 0; i < nal_length_size; i++)
                 extract_length = (extract_length << 8) | buf[i];
@@ -218,37 +269,70 @@ int ff_h2645_packet_split(H2645Packet *pkt, const uint8_t *buf, int length,
                 av_log(logctx, AV_LOG_ERROR, "Invalid NAL unit size.\n");
                 return AVERROR_INVALIDDATA;
             }
+            next_avc = buf + extract_length;
         } else {
-            if (buf[2] == 0) {
-                length--;
-                buf++;
-                continue;
+            /* search start code */
+            while (buf[0] != 0 || buf[1] != 0 || buf[2] != 1) {
+                ++buf;
+                --length;
+                if (length < 4) {
+                    if (pkt->nb_nals > 0) {
+                        // No more start codes: we discarded some irrelevant
+                        // bytes at the end of the packet.
+                        return 0;
+                    } else {
+                        av_log(logctx, AV_LOG_ERROR, "No start code is found.\n");
+                        return AVERROR_INVALIDDATA;
+                    }
+                } else if (buf >= (next_avc - 3))
+                    break;
             }
-            if (buf[0] != 0 || buf[1] != 0 || buf[2] != 1)
-                return AVERROR_INVALIDDATA;
 
             buf           += 3;
             length        -= 3;
             extract_length = length;
+
+            if (buf >= next_avc) {
+                /* skip to the start of the next NAL */
+                int offset = next_avc - buf;
+                buf    += offset;
+                length -= offset;
+                continue;
+            }
         }
 
         if (pkt->nals_allocated < pkt->nb_nals + 1) {
             int new_size = pkt->nals_allocated + 1;
-            H2645NAL *tmp = av_realloc_array(pkt->nals, new_size, sizeof(*tmp));
+            void *tmp = av_realloc_array(pkt->nals, new_size, sizeof(*pkt->nals));
+
             if (!tmp)
                 return AVERROR(ENOMEM);
 
             pkt->nals = tmp;
             memset(pkt->nals + pkt->nals_allocated, 0,
-                   (new_size - pkt->nals_allocated) * sizeof(*tmp));
+                   (new_size - pkt->nals_allocated) * sizeof(*pkt->nals));
+
+            nal = &pkt->nals[pkt->nb_nals];
+            nal->skipped_bytes_pos_size = 1024; // initial buffer size
+            nal->skipped_bytes_pos = av_malloc_array(nal->skipped_bytes_pos_size, sizeof(*nal->skipped_bytes_pos));
+            if (!nal->skipped_bytes_pos)
+                return AVERROR(ENOMEM);
+
             pkt->nals_allocated = new_size;
         }
-        nal = &pkt->nals[pkt->nb_nals++];
+        nal = &pkt->nals[pkt->nb_nals];
 
         consumed = ff_h2645_extract_rbsp(buf, extract_length, nal);
         if (consumed < 0)
             return consumed;
 
+        if (is_nalff && (extract_length != consumed) && extract_length)
+            av_log(logctx, AV_LOG_DEBUG,
+                   "NALFF: Consumed only %d bytes instead of %d\n",
+                   consumed, extract_length);
+
+        pkt->nb_nals++;
+
         /* see commit 3566042a0 */
         if (consumed < length - 3 &&
             buf[consumed]     == 0x00 && buf[consumed + 1] == 0x00 &&
@@ -265,7 +349,7 @@ int ff_h2645_packet_split(H2645Packet *pkt, const uint8_t *buf, int length,
             ret = hevc_parse_nal_header(nal, logctx);
         else
             ret = h264_parse_nal_header(nal, logctx);
-        if (ret <= 0) {
+        if (ret <= 0 || nal->size <= 0) {
             if (ret < 0) {
                 av_log(logctx, AV_LOG_ERROR, "Invalid NAL unit %d, skipping.\n",
                        nal->type);
@@ -283,8 +367,10 @@ int ff_h2645_packet_split(H2645Packet *pkt, const uint8_t *buf, int length,
 void ff_h2645_packet_uninit(H2645Packet *pkt)
 {
     int i;
-    for (i = 0; i < pkt->nals_allocated; i++)
+    for (i = 0; i < pkt->nals_allocated; i++) {
         av_freep(&pkt->nals[i].rbsp_buffer);
+        av_freep(&pkt->nals[i].skipped_bytes_pos); /* per-NAL position array allocated in ff_h2645_packet_split() */
+    }
     av_freep(&pkt->nals);
     pkt->nals_allocated = 0;
 }
diff --git a/libavcodec/h2645_parse.h b/libavcodec/h2645_parse.h
index 9cc4441..a3c7e1f 100644
--- a/libavcodec/h2645_parse.h
+++ b/libavcodec/h2645_parse.h
@@ -1,20 +1,20 @@
 /*
  * H.264/HEVC common parsing code
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -54,6 +54,9 @@ typedef struct H2645NAL {
      */
     int temporal_id;
 
+    int skipped_bytes;
+    int skipped_bytes_pos_size;
+    int *skipped_bytes_pos;
     /**
      * H.264 only, nal_ref_idc
      */
diff --git a/libavcodec/h264_cabac.c b/libavcodec/h264_cabac.c
index 5e7f3e7..68d7282 100644
--- a/libavcodec/h264_cabac.c
+++ b/libavcodec/h264_cabac.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... cabac decoding
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,9 +26,11 @@
  */
 
 #define CABAC(h) 1
+#define UNCHECKED_BITSTREAM_READER 1
 #define INT_BIT (CHAR_BIT * sizeof(int))
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/timer.h"
 #include "config.h"
 #include "cabac.h"
@@ -45,8 +47,6 @@
 #include "x86/h264_i386.h"
 #endif
 
-#include <assert.h>
-
 /* Cabac pre state table */
 
 static const int8_t cabac_context_init_I[1024][2] =
@@ -1284,7 +1284,7 @@ void ff_h264_init_cabac_states(const H264Context *h, H264SliceContext *sl)
 
 static int decode_cabac_field_decoding_flag(const H264Context *h, H264SliceContext *sl)
 {
-    const long mbb_xy = sl->mb_xy - 2L*h->mb_stride;
+    const int mbb_xy = sl->mb_xy - 2*h->mb_stride;
 
     unsigned long ctx = 0;
 
@@ -1501,7 +1501,7 @@ static int decode_cabac_mb_mvd(H264SliceContext *sl, int ctxbase, int amvd, int
     int mvd;
 
     if(!get_cabac(&sl->cabac, &sl->cabac_state[ctxbase+((amvd-3)>>(INT_BIT-1))+((amvd-33)>>(INT_BIT-1))+2])){
-//    if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+(amvd>2)+(amvd>32)])){
+//    if(!get_cabac(&sl->cabac, &sl->cabac_state[ctxbase+(amvd>2)+(amvd>32)])){
         *mvda= 0;
         return 0;
     }
@@ -1540,8 +1540,12 @@ static int decode_cabac_mb_mvd(H264SliceContext *sl, int ctxbase, int amvd, int
     int amvd1 = sl->mvd_cache[list][scan8[n] - 1][1] +\
                 sl->mvd_cache[list][scan8[n] - 8][1];\
 \
-    mx += decode_cabac_mb_mvd(sl, 40, amvd0, &mpx);\
-    my += decode_cabac_mb_mvd(sl, 47, amvd1, &mpy);\
+    int mxd = decode_cabac_mb_mvd(sl, 40, amvd0, &mpx);\
+    int myd = decode_cabac_mb_mvd(sl, 47, amvd1, &mpy);\
+    if (mxd == INT_MIN || myd == INT_MIN) \
+        return AVERROR_INVALIDDATA; \
+    mx += mxd;\
+    my += myd;\
 }
 
 static av_always_inline int get_cabac_cbf_ctx(H264SliceContext *sl,
@@ -1640,7 +1644,9 @@ decode_cabac_residual_internal(const H264Context *h, H264SliceContext *sl,
     cc.range     = sl->cabac.range;
     cc.low       = sl->cabac.low;
     cc.bytestream= sl->cabac.bytestream;
+#if !UNCHECKED_BITSTREAM_READER || ARCH_AARCH64
     cc.bytestream_end = sl->cabac.bytestream_end;
+#endif
 #else
 #define CC &sl->cabac
 #endif
@@ -1689,7 +1695,7 @@ decode_cabac_residual_internal(const H264Context *h, H264SliceContext *sl,
         }
 #endif
     }
-    assert(coeff_count > 0);
+    av_assert2(coeff_count > 0);
 
     if( is_dc ) {
         if( cat == 3 )
@@ -1701,7 +1707,7 @@ decode_cabac_residual_internal(const H264Context *h, H264SliceContext *sl,
         if( max_coeff == 64 )
             fill_rectangle(&sl->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
         else {
-            assert( cat == 1 || cat ==  2 || cat ==  4 || cat == 7 || cat == 8 || cat == 11 || cat == 12 );
+            av_assert2( cat == 1 || cat ==  2 || cat ==  4 || cat == 7 || cat == 8 || cat == 11 || cat == 12 );
             sl->non_zero_count_cache[scan8[n]] = coeff_count;
         }
     }
@@ -1917,6 +1923,7 @@ int ff_h264_decode_mb_cabac(const H264Context *h, H264SliceContext *sl)
 
     mb_xy = sl->mb_xy = sl->mb_x + sl->mb_y*h->mb_stride;
 
+    ff_tlog(h->avctx, "pic:%d mb:%d/%d\n", h->poc.frame_num, sl->mb_x, sl->mb_y);
     if (sl->slice_type_nos != AV_PICTURE_TYPE_I) {
         int skip;
         /* a skipped mb needs the aff flag from the following mb */
@@ -1955,7 +1962,7 @@ int ff_h264_decode_mb_cabac(const H264Context *h, H264SliceContext *sl)
 
     if (sl->slice_type_nos == AV_PICTURE_TYPE_B) {
         int ctx = 0;
-        assert(sl->slice_type_nos == AV_PICTURE_TYPE_B);
+        av_assert2(sl->slice_type_nos == AV_PICTURE_TYPE_B);
 
         if (!IS_DIRECT(sl->left_type[LTOP] - 1))
             ctx++;
@@ -2008,7 +2015,7 @@ int ff_h264_decode_mb_cabac(const H264Context *h, H264SliceContext *sl)
         mb_type = decode_cabac_intra_mb_type(sl, 3, 1);
         if (sl->slice_type == AV_PICTURE_TYPE_SI && mb_type)
             mb_type--;
-        assert(sl->slice_type_nos == AV_PICTURE_TYPE_I);
+        av_assert2(sl->slice_type_nos == AV_PICTURE_TYPE_I);
 decode_intra_mb:
         partition_count = 0;
         cbp                      = ff_h264_i_mb_type_info[mb_type].cbp;
@@ -2024,6 +2031,7 @@ decode_intra_mb:
         const int mb_size = ff_h264_mb_sizes[sps->chroma_format_idc] *
                             sps->bit_depth_luma >> 3;
         const uint8_t *ptr;
+        int ret;
 
         // We assume these blocks are very rare so we do not optimize it.
         // FIXME The two following lines get the bitstream position in the cabac
@@ -2040,7 +2048,9 @@ decode_intra_mb:
         sl->intra_pcm_ptr = ptr;
         ptr += mb_size;
 
-        ff_init_cabac_decoder(&sl->cabac, ptr, sl->cabac.bytestream_end - ptr);
+        ret = ff_init_cabac_decoder(&sl->cabac, ptr, sl->cabac.bytestream_end - ptr);
+        if (ret < 0)
+            return ret;
 
         // All blocks are present
         h->cbp_table[mb_xy] = 0xf7ef;
@@ -2071,7 +2081,7 @@ decode_intra_mb:
                     int pred = pred_intra_mode(h, sl, i);
                     sl->intra4x4_pred_mode_cache[scan8[i]] = decode_cabac_mb_intra4x4_pred_mode(sl, pred);
 
-                    ff_dlog(h->avctx, "i4x4 pred=%d mode=%d\n", pred,
+                    ff_tlog(h->avctx, "i4x4 pred=%d mode=%d\n", pred,
                             sl->intra4x4_pred_mode_cache[scan8[i]]);
                 }
             }
@@ -2126,10 +2136,10 @@ decode_intra_mb:
                 for( i = 0; i < 4; i++ ) {
                     if(IS_DIRECT(sl->sub_mb_type[i])) continue;
                     if(IS_DIR(sl->sub_mb_type[i], 0, list)){
-                        int rc = sl->ref_count[list] << MB_MBAFF(sl);
+                        unsigned rc = sl->ref_count[list] << MB_MBAFF(sl);
                         if (rc > 1) {
                             ref[list][i] = decode_cabac_mb_ref(sl, list, 4 * i);
-                            if (ref[list][i] >= (unsigned) rc) {
+                            if (ref[list][i] >= rc) {
                                 av_log(h->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], rc);
                                 return -1;
                             }
@@ -2212,10 +2222,11 @@ decode_intra_mb:
         if(IS_16X16(mb_type)){
             for (list = 0; list < sl->list_count; list++) {
                 if(IS_DIR(mb_type, 0, list)){
-                    int ref, rc = sl->ref_count[list] << MB_MBAFF(sl);
+                    int ref;
+                    unsigned rc = sl->ref_count[list] << MB_MBAFF(sl);
                     if (rc > 1) {
                         ref= decode_cabac_mb_ref(sl, list, 0);
-                        if (ref >= (unsigned) rc) {
+                        if (ref >= rc) {
                             av_log(h->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, rc);
                             return -1;
                         }
@@ -2240,10 +2251,11 @@ decode_intra_mb:
             for (list = 0; list < sl->list_count; list++) {
                     for(i=0; i<2; i++){
                         if(IS_DIR(mb_type, i, list)){
-                            int ref, rc = sl->ref_count[list] << MB_MBAFF(sl);
+                            int ref;
+                            unsigned rc = sl->ref_count[list] << MB_MBAFF(sl);
                             if (rc > 1) {
                                 ref= decode_cabac_mb_ref(sl, list, 8 * i);
-                                if (ref >= (unsigned) rc) {
+                                if (ref >= rc) {
                                     av_log(h->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, rc);
                                     return -1;
                                 }
@@ -2271,14 +2283,15 @@ decode_intra_mb:
                 }
             }
         }else{
-            assert(IS_8X16(mb_type));
+            av_assert2(IS_8X16(mb_type));
             for (list = 0; list < sl->list_count; list++) {
                     for(i=0; i<2; i++){
                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
-                            int ref, rc = sl->ref_count[list] << MB_MBAFF(sl);
+                            int ref;
+                            unsigned rc = sl->ref_count[list] << MB_MBAFF(sl);
                             if (rc > 1) {
                                 ref = decode_cabac_mb_ref(sl, list, 4 * i);
-                                if (ref >= (unsigned) rc) {
+                                if (ref >= rc) {
                                     av_log(h->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, rc);
                                     return -1;
                                 }
@@ -2317,6 +2330,11 @@ decode_intra_mb:
         cbp  = decode_cabac_mb_cbp_luma(sl);
         if(decode_chroma)
             cbp |= decode_cabac_mb_cbp_chroma(sl) << 4;
+    } else {
+        if (!decode_chroma && cbp>15) {
+            av_log(h->avctx, AV_LOG_ERROR, "gray chroma\n");
+            return AVERROR_INVALIDDATA;
+        }
     }
 
     h->cbp_table[mb_xy] = sl->cbp = cbp;
diff --git a/libavcodec/h264_cavlc.c b/libavcodec/h264_cavlc.c
index 1a80f76..d01586d 100644
--- a/libavcodec/h264_cavlc.c
+++ b/libavcodec/h264_cavlc.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... cavlc bitstream decoding
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
  */
 
 #define CABAC(h) 0
+#define UNCHECKED_BITSTREAM_READER 1
 
 #include "internal.h"
 #include "avcodec.h"
@@ -34,8 +35,8 @@
 #include "h264data.h"
 #include "golomb.h"
 #include "mpegutils.h"
+#include "libavutil/avassert.h"
 
-#include <assert.h>
 
 static const uint8_t golomb_to_inter_cbp_gray[16]={
  0, 1, 2, 4, 8, 3, 5,10,12,15, 7,11,13,14, 6, 9,
@@ -360,7 +361,7 @@ av_cold void ff_h264_decode_init_vlc(void){
          * the packed static coeff_token_vlc table sizes
          * were initialized correctly.
          */
-        assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
+        av_assert0(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
 
         for(i=0; i<3; i++){
             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
@@ -480,7 +481,7 @@ static int decode_residual(const H264Context *h, H264SliceContext *sl,
 
     trailing_ones= coeff_token&3;
     ff_tlog(h->avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
-    assert(total_coeff<=16);
+    av_assert2(total_coeff<=16);
 
     i = show_bits(gb, 3);
     skip_bits(gb, trailing_ones);
@@ -512,7 +513,7 @@ static int decode_residual(const H264Context *h, H264SliceContext *sl,
                 else
                     level_code= prefix + get_bits(gb, 4); //part
             }else{
-                level_code= 30 + get_bits(gb, prefix-3); //part
+                level_code= 30;
                 if(prefix>=16){
                     if(prefix > 25+3){
                         av_log(h->avctx, AV_LOG_ERROR, "Invalid level prefix\n");
@@ -520,6 +521,7 @@ static int decode_residual(const H264Context *h, H264SliceContext *sl,
                     }
                     level_code += (1<<(prefix-3))-4096;
                 }
+                level_code += get_bits(gb, prefix-3); //part
             }
 
             if(trailing_ones < 3) level_code += 2;
@@ -549,9 +551,15 @@ static int decode_residual(const H264Context *h, H264SliceContext *sl,
                 if(prefix<15){
                     level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
                 }else{
-                    level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
-                    if(prefix>=16)
+                    level_code = 15<<suffix_length;
+                    if (prefix>=16) {
+                        if(prefix > 25+3){
+                            av_log(h->avctx, AV_LOG_ERROR, "Invalid level prefix\n");
+                            return AVERROR_INVALIDDATA;
+                        }
                         level_code += (1<<(prefix-3))-4096;
+                    }
+                    level_code += get_bits(gb, prefix-3);
                 }
                 mask= -(level_code&1);
                 level_code= (((2+level_code)>>1) ^ mask) - mask;
@@ -566,13 +574,13 @@ static int decode_residual(const H264Context *h, H264SliceContext *sl,
     else{
         if (max_coeff <= 8) {
             if (max_coeff == 4)
-                zeros_left = get_vlc2(gb, chroma_dc_total_zeros_vlc[total_coeff - 1].table,
+                zeros_left = get_vlc2(gb, (chroma_dc_total_zeros_vlc-1)[total_coeff].table,
                                       CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
             else
-                zeros_left = get_vlc2(gb, chroma422_dc_total_zeros_vlc[total_coeff - 1].table,
+                zeros_left = get_vlc2(gb, (chroma422_dc_total_zeros_vlc-1)[total_coeff].table,
                                       CHROMA422_DC_TOTAL_ZEROS_VLC_BITS, 1);
         } else {
-            zeros_left= get_vlc2(gb, total_zeros_vlc[total_coeff - 1].table, TOTAL_ZEROS_VLC_BITS, 1);
+            zeros_left= get_vlc2(gb, (total_zeros_vlc-1)[ total_coeff ].table, TOTAL_ZEROS_VLC_BITS, 1);
         }
     }
 
@@ -582,7 +590,7 @@ static int decode_residual(const H264Context *h, H264SliceContext *sl,
         ((type*)block)[*scantable] = level[0]; \
         for(i=1;i<total_coeff && zeros_left > 0;i++) { \
             if(zeros_left < 7) \
-                run_before= get_vlc2(gb, run_vlc[zeros_left - 1].table, RUN_VLC_BITS, 1); \
+                run_before= get_vlc2(gb, (run_vlc-1)[zeros_left].table, RUN_VLC_BITS, 1); \
             else \
                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2); \
             zeros_left -= run_before; \
@@ -597,7 +605,7 @@ static int decode_residual(const H264Context *h, H264SliceContext *sl,
         ((type*)block)[*scantable] = ((int)(level[0] * qmul[*scantable] + 32))>>6; \
         for(i=1;i<total_coeff && zeros_left > 0;i++) { \
             if(zeros_left < 7) \
-                run_before= get_vlc2(gb, run_vlc[zeros_left - 1].table, RUN_VLC_BITS, 1); \
+                run_before= get_vlc2(gb, (run_vlc-1)[zeros_left].table, RUN_VLC_BITS, 1); \
             else \
                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2); \
             zeros_left -= run_before; \
@@ -610,18 +618,17 @@ static int decode_residual(const H264Context *h, H264SliceContext *sl,
         } \
     }
 
-    if (zeros_left < 0) {
-        av_log(h->avctx, AV_LOG_ERROR,
-               "negative number of zero coeffs at %d %d\n", sl->mb_x, sl->mb_y);
-        return AVERROR_INVALIDDATA;
-    }
-
     if (h->pixel_shift) {
         STORE_BLOCK(int32_t)
     } else {
         STORE_BLOCK(int16_t)
     }
 
+    if(zeros_left<0){
+        av_log(h->avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", sl->mb_x, sl->mb_y);
+        return -1;
+    }
+
     return 0;
 }
 
@@ -642,7 +649,7 @@ int decode_luma_residual(const H264Context *h, H264SliceContext *sl,
             return -1; //FIXME continue if partitioned and other return -1 too
         }
 
-        assert((cbp&15) == 0 || (cbp&15) == 15);
+        av_assert2((cbp&15) == 0 || (cbp&15) == 15);
 
         if(cbp&15){
             for(i8x8=0; i8x8<4; i8x8++){
@@ -707,11 +714,12 @@ int ff_h264_decode_mb_cavlc(const H264Context *h, H264SliceContext *sl)
 
     mb_xy = sl->mb_xy = sl->mb_x + sl->mb_y*h->mb_stride;
 
+    ff_tlog(h->avctx, "pic:%d mb:%d/%d\n", h->poc.frame_num, sl->mb_x, sl->mb_y);
     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
                 down the code */
     if (sl->slice_type_nos != AV_PICTURE_TYPE_I) {
         if (sl->mb_skip_run == -1)
-            sl->mb_skip_run = get_ue_golomb(&sl->gb);
+            sl->mb_skip_run = get_ue_golomb_long(&sl->gb);
 
         if (sl->mb_skip_run--) {
             if (FRAME_MBAFF(h) && (sl->mb_y & 1) == 0) {
@@ -747,7 +755,7 @@ int ff_h264_decode_mb_cavlc(const H264Context *h, H264SliceContext *sl)
             goto decode_intra_mb;
         }
     }else{
-       assert(sl->slice_type_nos == AV_PICTURE_TYPE_I);
+       av_assert2(sl->slice_type_nos == AV_PICTURE_TYPE_I);
         if (sl->slice_type == AV_PICTURE_TYPE_SI && mb_type)
             mb_type--;
 decode_intra_mb:
@@ -856,7 +864,7 @@ decode_intra_mb:
                 sl->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
             }
         }else{
-            assert(sl->slice_type_nos == AV_PICTURE_TYPE_P); //FIXME SP correct ?
+            av_assert2(sl->slice_type_nos == AV_PICTURE_TYPE_P); //FIXME SP correct ?
             for(i=0; i<4; i++){
                 sl->sub_mb_type[i]= get_ue_golomb_31(&sl->gb);
                 if(sl->sub_mb_type[i] >=4){
@@ -949,7 +957,7 @@ decode_intra_mb:
             for (list = 0; list < sl->list_count; list++) {
                     unsigned int val;
                     if(IS_DIR(mb_type, 0, list)){
-                        int rc = sl->ref_count[list] << MB_MBAFF(sl);
+                        unsigned rc = sl->ref_count[list] << MB_MBAFF(sl);
                         if (rc == 1) {
                             val= 0;
                         } else if (rc == 2) {
@@ -980,7 +988,7 @@ decode_intra_mb:
                     for(i=0; i<2; i++){
                         unsigned int val;
                         if(IS_DIR(mb_type, i, list)){
-                            int rc = sl->ref_count[list] << MB_MBAFF(sl);
+                            unsigned rc = sl->ref_count[list] << MB_MBAFF(sl);
                             if (rc == 1) {
                                 val= 0;
                             } else if (rc == 2) {
@@ -1013,12 +1021,12 @@ decode_intra_mb:
                 }
             }
         }else{
-            assert(IS_8X16(mb_type));
+            av_assert2(IS_8X16(mb_type));
             for (list = 0; list < sl->list_count; list++) {
                     for(i=0; i<2; i++){
                         unsigned int val;
                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
-                            int rc = sl->ref_count[list] << MB_MBAFF(sl);
+                            unsigned rc = sl->ref_count[list] << MB_MBAFF(sl);
                             if (rc == 1) {
                                 val= 0;
                             } else if (rc == 2) {
@@ -1076,6 +1084,11 @@ decode_intra_mb:
             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
             else                     cbp= golomb_to_inter_cbp_gray[cbp];
         }
+    } else {
+        if (!decode_chroma && cbp>15) {
+            av_log(h->avctx, AV_LOG_ERROR, "gray chroma\n");
+            return AVERROR_INVALIDDATA;
+        }
     }
 
     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
@@ -1128,12 +1141,15 @@ decode_intra_mb:
             if (decode_luma_residual(h, sl, gb, scan, scan8x8, pixel_shift, mb_type, cbp, 2) < 0 ) {
                 return -1;
             }
-        } else if (CHROMA422(h)) {
+        } else {
+            const int num_c8x8 = h->ps.sps->chroma_format_idc;
+
             if(cbp&0x30){
                 for(chroma_idx=0; chroma_idx<2; chroma_idx++)
                     if (decode_residual(h, sl, gb, sl->mb + ((256 + 16*16*chroma_idx) << pixel_shift),
-                                        CHROMA_DC_BLOCK_INDEX + chroma_idx, ff_h264_chroma422_dc_scan,
-                                        NULL, 8) < 0) {
+                                        CHROMA_DC_BLOCK_INDEX + chroma_idx,
+                                        CHROMA422(h) ? ff_h264_chroma422_dc_scan : ff_h264_chroma_dc_scan,
+                                        NULL, 4 * num_c8x8) < 0) {
                         return -1;
                     }
             }
@@ -1142,7 +1158,7 @@ decode_intra_mb:
                 for(chroma_idx=0; chroma_idx<2; chroma_idx++){
                     const uint32_t *qmul = h->ps.pps->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][sl->chroma_qp[chroma_idx]];
                     int16_t *mb = sl->mb + (16*(16 + 16*chroma_idx) << pixel_shift);
-                    for (i8x8 = 0; i8x8 < 2; i8x8++) {
+                    for (i8x8 = 0; i8x8<num_c8x8; i8x8++) {
                         for (i4x4 = 0; i4x4 < 4; i4x4++) {
                             const int index = 16 + 16*chroma_idx + 8*i8x8 + i4x4;
                             if (decode_residual(h, sl, gb, mb, index, scan + 1, qmul, 15) < 0)
@@ -1155,29 +1171,6 @@ decode_intra_mb:
                 fill_rectangle(&sl->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1);
                 fill_rectangle(&sl->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1);
             }
-        } else /* yuv420 */ {
-            if(cbp&0x30){
-                for(chroma_idx=0; chroma_idx<2; chroma_idx++)
-                    if (decode_residual(h, sl, gb, sl->mb + ((256 + 16 * 16 * chroma_idx) << pixel_shift),
-                                        CHROMA_DC_BLOCK_INDEX + chroma_idx, ff_h264_chroma_dc_scan, NULL, 4) < 0) {
-                        return -1;
-                    }
-            }
-
-            if(cbp&0x20){
-                for(chroma_idx=0; chroma_idx<2; chroma_idx++){
-                    const uint32_t *qmul = h->ps.pps->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][sl->chroma_qp[chroma_idx]];
-                    for(i4x4=0; i4x4<4; i4x4++){
-                        const int index= 16 + 16*chroma_idx + i4x4;
-                        if( decode_residual(h, sl, gb, sl->mb + (16*index << pixel_shift), index, scan + 1, qmul, 15) < 0){
-                            return -1;
-                        }
-                    }
-                }
-            }else{
-                fill_rectangle(&sl->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1);
-                fill_rectangle(&sl->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1);
-            }
         }
     }else{
         fill_rectangle(&sl->non_zero_count_cache[scan8[ 0]], 4, 4, 8, 0, 1);
diff --git a/libavcodec/h264_direct.c b/libavcodec/h264_direct.c
index 177ec10..e137ff9 100644
--- a/libavcodec/h264_direct.c
+++ b/libavcodec/h264_direct.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... direct mb/block decoding
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -137,6 +137,10 @@ void ff_h264_direct_ref_list_init(const H264Context *const h, H264SliceContext *
     if (h->picture_structure == PICT_FRAME) {
         int cur_poc  = h->cur_pic_ptr->poc;
         int *col_poc = sl->ref_list[1][0].parent->field_poc;
+        if (col_poc[0] == INT_MAX && col_poc[1] == INT_MAX) {
+            av_log(h->avctx, AV_LOG_ERROR, "co located POCs unavailable\n");
+            sl->col_parity = 1;
+        } else
         sl->col_parity = (FFABS(col_poc[0] - cur_poc) >=
                           FFABS(col_poc[1] - cur_poc));
         ref1sidx =
@@ -159,11 +163,11 @@ void ff_h264_direct_ref_list_init(const H264Context *const h, H264SliceContext *
     }
 }
 
-static void await_reference_mb_row(const H264Context *const h, H264Picture *ref,
+static void await_reference_mb_row(const H264Context *const h, H264Ref *ref,
                                    int mb_y)
 {
     int ref_field         = ref->reference - 1;
-    int ref_field_picture = ref->field_picture;
+    int ref_field_picture = ref->parent->field_picture;
     int ref_height        = 16 * h->mb_height >> ref_field_picture;
 
     if (!HAVE_THREADS || !(h->avctx->active_thread_type & FF_THREAD_FRAME))
@@ -172,7 +176,7 @@ static void await_reference_mb_row(const H264Context *const h, H264Picture *ref,
     /* FIXME: It can be safe to access mb stuff
      * even if pixels aren't deblocked yet. */
 
-    ff_thread_await_progress(&ref->tf,
+    ff_thread_await_progress(&ref->parent->tf,
                              FFMIN(16 * mb_y >> ref_field_picture,
                                    ref_height - 1),
                              ref_field_picture && ref_field);
@@ -196,7 +200,7 @@ static void pred_spatial_direct_motion(const H264Context *const h, H264SliceCont
 
     assert(sl->ref_list[1][0].reference & 3);
 
-    await_reference_mb_row(h, sl->ref_list[1][0].parent,
+    await_reference_mb_row(h, &sl->ref_list[1][0],
                            sl->mb_y + !!IS_INTERLACED(*mb_type));
 
 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16 | MB_TYPE_INTRA4x4 | \
@@ -237,6 +241,7 @@ static void pred_spatial_direct_motion(const H264Context *const h, H264SliceCont
                 else
                     mv[list] = AV_RN32A(C);
             }
+            av_assert2(ref[list] < (sl->ref_count[list] << !!FRAME_MBAFF(h)));
         } else {
             int mask = ~(MB_TYPE_L0 << (2 * list));
             mv[list]  = 0;
@@ -320,10 +325,10 @@ single_col:
         }
     }
 
-    await_reference_mb_row(h, sl->ref_list[1][0].parent, mb_y);
+    await_reference_mb_row(h, &sl->ref_list[1][0], mb_y);
 
-    l1mv0  = &sl->ref_list[1][0].parent->motion_val[0][h->mb2b_xy[mb_xy]];
-    l1mv1  = &sl->ref_list[1][0].parent->motion_val[1][h->mb2b_xy[mb_xy]];
+    l1mv0  = (void*)&sl->ref_list[1][0].parent->motion_val[0][h->mb2b_xy[mb_xy]];
+    l1mv1  = (void*)&sl->ref_list[1][0].parent->motion_val[1][h->mb2b_xy[mb_xy]];
     l1ref0 = &sl->ref_list[1][0].parent->ref_index[0][4 * mb_xy];
     l1ref1 = &sl->ref_list[1][0].parent->ref_index[1][4 * mb_xy];
     if (!b8_stride) {
@@ -479,7 +484,7 @@ static void pred_temp_direct_motion(const H264Context *const h, H264SliceContext
 
     assert(sl->ref_list[1][0].reference & 3);
 
-    await_reference_mb_row(h, sl->ref_list[1][0].parent,
+    await_reference_mb_row(h, &sl->ref_list[1][0],
                            sl->mb_y + !!IS_INTERLACED(*mb_type));
 
     if (IS_INTERLACED(sl->ref_list[1][0].parent->mb_type[mb_xy])) { // AFL/AFR/FR/FL -> AFL/FL
@@ -544,10 +549,10 @@ single_col:
         }
     }
 
-    await_reference_mb_row(h, sl->ref_list[1][0].parent, mb_y);
+    await_reference_mb_row(h, &sl->ref_list[1][0], mb_y);
 
-    l1mv0  = &sl->ref_list[1][0].parent->motion_val[0][h->mb2b_xy[mb_xy]];
-    l1mv1  = &sl->ref_list[1][0].parent->motion_val[1][h->mb2b_xy[mb_xy]];
+    l1mv0  = (void*)&sl->ref_list[1][0].parent->motion_val[0][h->mb2b_xy[mb_xy]];
+    l1mv1  = (void*)&sl->ref_list[1][0].parent->motion_val[1][h->mb2b_xy[mb_xy]];
     l1ref0 = &sl->ref_list[1][0].parent->ref_index[0][4 * mb_xy];
     l1ref1 = &sl->ref_list[1][0].parent->ref_index[1][4 * mb_xy];
     if (!b8_stride) {
diff --git a/libavcodec/h264_loopfilter.c b/libavcodec/h264_loopfilter.c
index 36bdd71..0ef0c12 100644
--- a/libavcodec/h264_loopfilter.c
+++ b/libavcodec/h264_loopfilter.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... loop filter
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,8 +34,6 @@
 #include "mpegutils.h"
 #include "rectangle.h"
 
-#include <assert.h>
-
 /* Deblocking filter (p153) */
 static const uint8_t alpha_table[52*3] = {
      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
@@ -244,7 +242,7 @@ static av_always_inline void h264_filter_mb_fast_internal(const H264Context *h,
                                                           unsigned int uvlinesize,
                                                           int pixel_shift)
 {
-    int chroma = !(CONFIG_GRAY && (h->flags & AV_CODEC_FLAG_GRAY));
+    int chroma = CHROMA(h) && !(CONFIG_GRAY && (h->flags & AV_CODEC_FLAG_GRAY));
     int chroma444 = CHROMA444(h);
     int chroma422 = CHROMA422(h);
 
@@ -358,7 +356,7 @@ static av_always_inline void h264_filter_mb_fast_internal(const H264Context *h,
         }
         return;
     } else {
-        LOCAL_ALIGNED_8(int16_t, bS, [2], [4][4]);
+        LOCAL_ALIGNED(8, int16_t, bS, [2], [4][4]);
         int edges;
         if( IS_8x8DCT(mb_type) && (sl->cbp&7) == 7 && !chroma444 ) {
             edges = 4;
@@ -421,7 +419,7 @@ void ff_h264_filter_mb_fast(const H264Context *h, H264SliceContext *sl,
                             uint8_t *img_cb, uint8_t *img_cr,
                             unsigned int linesize, unsigned int uvlinesize)
 {
-    assert(!FRAME_MBAFF(h));
+    av_assert2(!FRAME_MBAFF(h));
     if(!h->h264dsp.h264_loop_filter_strength || h->ps.pps->chroma_qp_diff) {
         ff_h264_filter_mb(h, sl, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
         return;
@@ -507,7 +505,7 @@ static av_always_inline void filter_mb_dir(const H264Context *h, H264SliceContex
             int j;
 
             for(j=0; j<2; j++, mbn_xy += h->mb_stride){
-                DECLARE_ALIGNED(8, int16_t, bS)[4];
+                LOCAL_ALIGNED(8, int16_t, bS, [4]);
                 int qp;
                 if (IS_INTRA(mb_type | h->cur_pic.mb_type[mbn_xy])) {
                     AV_WN64A(bS, 0x0003000300030003ULL);
@@ -544,7 +542,7 @@ static av_always_inline void filter_mb_dir(const H264Context *h, H264SliceContex
                 }
             }
         }else{
-            DECLARE_ALIGNED(8, int16_t, bS)[4];
+            LOCAL_ALIGNED(8, int16_t, bS, [4]);
             int qp;
 
             if( IS_INTRA(mb_type|mbm_type)) {
@@ -593,7 +591,9 @@ static av_always_inline void filter_mb_dir(const H264Context *h, H264SliceContex
             // value in IPCM macroblocks.
             if(bS[0]+bS[1]+bS[2]+bS[3]){
                 qp = (h->cur_pic.qscale_table[mb_xy] + h->cur_pic.qscale_table[mbm_xy] + 1) >> 1;
+                //ff_tlog(h->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp[0], h->cur_pic.qscale_table[mbn_xy]);
                 ff_tlog(h->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
+                //{ int i; for (i = 0; i < 4; i++) ff_tlog(h->avctx, " bS[%d]:%d", i, bS[i]); ff_tlog(h->avctx, "\n"); }
                 chroma_qp_avg[0] = (sl->chroma_qp[0] + get_chroma_qp(h, 0, h->cur_pic.qscale_table[mbm_xy]) + 1) >> 1;
                 chroma_qp_avg[1] = (sl->chroma_qp[1] + get_chroma_qp(h, 1, h->cur_pic.qscale_table[mbm_xy]) + 1) >> 1;
                 if( dir == 0 ) {
@@ -625,7 +625,7 @@ static av_always_inline void filter_mb_dir(const H264Context *h, H264SliceContex
 
     /* Calculate bS */
     for( edge = 1; edge < edges; edge++ ) {
-        DECLARE_ALIGNED(8, int16_t, bS)[4];
+        LOCAL_ALIGNED(8, int16_t, bS, [4]);
         int qp;
         const int deblock_edge = !IS_8x8DCT(mb_type & (edge<<24)); // (edge&1) && IS_8x8DCT(mb_type)
 
@@ -676,7 +676,9 @@ static av_always_inline void filter_mb_dir(const H264Context *h, H264SliceContex
         // Do not use s->qscale as luma quantizer because it has not the same
         // value in IPCM macroblocks.
         qp = h->cur_pic.qscale_table[mb_xy];
+        //ff_tlog(h->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp[0], h->cur_pic.qscale_table[mbn_xy]);
         ff_tlog(h->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
+        //{ int i; for (i = 0; i < 4; i++) ff_tlog(h->avctx, " bS[%d]:%d", i, bS[i]); ff_tlog(h->avctx, "\n"); }
         if( dir == 0 ) {
             filter_mb_edgev( &img_y[4*edge << h->pixel_shift], linesize, bS, qp, a, b, h, 0 );
             if (chroma) {
@@ -721,7 +723,7 @@ void ff_h264_filter_mb(const H264Context *h, H264SliceContext *sl,
     const int mb_type = h->cur_pic.mb_type[mb_xy];
     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
     int first_vertical_edge_done = 0;
-    int chroma = !(CONFIG_GRAY && (h->flags & AV_CODEC_FLAG_GRAY));
+    int chroma = CHROMA(h) && !(CONFIG_GRAY && (h->flags & AV_CODEC_FLAG_GRAY));
     int qp_bd_offset = 6 * (h->ps.sps->bit_depth_luma - 8);
     int a = 52 + sl->slice_alpha_c0_offset - qp_bd_offset;
     int b = 52 + sl->slice_beta_offset - qp_bd_offset;
@@ -734,7 +736,7 @@ void ff_h264_filter_mb(const H264Context *h, H264SliceContext *sl,
         /* First vertical edge is different in MBAFF frames
          * There are 8 different bS to compute and 2 different Qp
          */
-        DECLARE_ALIGNED(8, int16_t, bS)[8];
+        LOCAL_ALIGNED(8, int16_t, bS, [8]);
         int qp[2];
         int bqp[2];
         int rqp[2];
diff --git a/libavcodec/h264_mb.c b/libavcodec/h264_mb.c
index 6f3c719..e33a59e 100644
--- a/libavcodec/h264_mb.c
+++ b/libavcodec/h264_mb.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,17 +39,17 @@ static inline int get_lowest_part_list_y(H264SliceContext *sl,
                                          int n, int height, int y_offset, int list)
 {
     int raw_my             = sl->mv_cache[list][scan8[n]][1];
-    int filter_height_up   = (raw_my & 3) ? 2 : 0;
     int filter_height_down = (raw_my & 3) ? 3 : 0;
     int full_my            = (raw_my >> 2) + y_offset;
-    int top                = full_my - filter_height_up;
     int bottom             = full_my + filter_height_down + height;
 
-    return FFMAX(abs(top), bottom);
+    av_assert2(height >= 0);
+
+    return FFMAX(0, bottom);
 }
 
 static inline void get_lowest_part_y(const H264Context *h, H264SliceContext *sl,
-                                     int refs[2][48], int n,
+                                     int16_t refs[2][48], int n,
                                      int height, int y_offset, int list0,
                                      int list1, int *nrefs)
 {
@@ -96,7 +96,7 @@ static void await_references(const H264Context *h, H264SliceContext *sl)
 {
     const int mb_xy   = sl->mb_xy;
     const int mb_type = h->cur_pic.mb_type[mb_xy];
-    int refs[2][48];
+    int16_t refs[2][48];
     int nrefs[2] = { 0 };
     int ref, list;
 
@@ -118,7 +118,7 @@ static void await_references(const H264Context *h, H264SliceContext *sl)
     } else {
         int i;
 
-        assert(IS_8X8(mb_type));
+        av_assert2(IS_8X8(mb_type));
 
         for (i = 0; i < 4; i++) {
             const int sub_mb_type = sl->sub_mb_type[i];
@@ -150,7 +150,7 @@ static void await_references(const H264Context *h, H264SliceContext *sl)
                                   nrefs);
             } else {
                 int j;
-                assert(IS_SUB_4X4(sub_mb_type));
+                av_assert2(IS_SUB_4X4(sub_mb_type));
                 for (j = 0; j < 4; j++) {
                     int sub_y_offset = y_offset + 2 * (j & 2);
                     get_lowest_part_y(h, sl, refs, n + j, 4, sub_y_offset,
@@ -175,6 +175,7 @@ static void await_references(const H264Context *h, H264SliceContext *sl)
                 nrefs[list]--;
 
                 if (!FIELD_PICTURE(h) && ref_field_picture) { // frame referencing two fields
+                    av_assert2((ref_pic->parent->reference & 3) == 3);
                     ff_thread_await_progress(&ref_pic->parent->tf,
                                              FFMIN((row >> 1) - !(row & 1),
                                                    pic_height - 1),
@@ -214,7 +215,7 @@ static av_always_inline void mc_dir_part(const H264Context *h, H264SliceContext
     const int mx      = sl->mv_cache[list][scan8[n]][0] + src_x_offset * 8;
     int my            = sl->mv_cache[list][scan8[n]][1] + src_y_offset * 8;
     const int luma_xy = (mx & 3) + ((my & 3) << 2);
-    ptrdiff_t offset  = ((mx >> 2) << pixel_shift) + (my >> 2) * sl->mb_linesize;
+    ptrdiff_t offset  = (mx >> 2) * (1 << pixel_shift) + (my >> 2) * sl->mb_linesize;
     uint8_t *src_y    = pic->data[0] + offset;
     uint8_t *src_cb, *src_cr;
     int extra_width  = 0;
@@ -289,9 +290,9 @@ static av_always_inline void mc_dir_part(const H264Context *h, H264SliceContext
         emu |= (my >> 3) < 0 || (my >> 3) + 8 >= (pic_height >> 1);
     }
 
-    src_cb = pic->data[1] + ((mx >> 3) << pixel_shift) +
+    src_cb = pic->data[1] + ((mx >> 3) * (1 << pixel_shift)) +
              (my >> ysh) * sl->mb_uvlinesize;
-    src_cr = pic->data[2] + ((mx >> 3) << pixel_shift) +
+    src_cr = pic->data[2] + ((mx >> 3) * (1 << pixel_shift)) +
              (my >> ysh) * sl->mb_uvlinesize;
 
     if (emu) {
@@ -303,7 +304,7 @@ static av_always_inline void mc_dir_part(const H264Context *h, H264SliceContext
     }
     chroma_op(dest_cb, src_cb, sl->mb_uvlinesize,
               height >> (chroma_idc == 1 /* yuv420 */),
-              mx & 7, (my << (chroma_idc == 2 /* yuv422 */)) & 7);
+              mx & 7, ((unsigned)my << (chroma_idc == 2 /* yuv422 */)) & 7);
 
     if (emu) {
         h->vdsp.emulated_edge_mc(sl->edge_emu_buffer, src_cr,
@@ -313,7 +314,7 @@ static av_always_inline void mc_dir_part(const H264Context *h, H264SliceContext
         src_cr = sl->edge_emu_buffer;
     }
     chroma_op(dest_cr, src_cr, sl->mb_uvlinesize, height >> (chroma_idc == 1 /* yuv420 */),
-              mx & 7, (my << (chroma_idc == 2 /* yuv422 */)) & 7);
+              mx & 7, ((unsigned)my << (chroma_idc == 2 /* yuv422 */)) & 7);
 }
 
 static av_always_inline void mc_part_std(const H264Context *h, H264SliceContext *sl,
@@ -423,10 +424,12 @@ static av_always_inline void mc_part_weighted(const H264Context *h, H264SliceCon
             int weight1 = 64 - weight0;
             luma_weight_avg(dest_y, tmp_y, sl->mb_linesize,
                             height, 5, weight0, weight1, 0);
-            chroma_weight_avg(dest_cb, tmp_cb, sl->mb_uvlinesize,
-                              chroma_height, 5, weight0, weight1, 0);
-            chroma_weight_avg(dest_cr, tmp_cr, sl->mb_uvlinesize,
-                              chroma_height, 5, weight0, weight1, 0);
+            if (!CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
+                chroma_weight_avg(dest_cb, tmp_cb, sl->mb_uvlinesize,
+                                  chroma_height, 5, weight0, weight1, 0);
+                chroma_weight_avg(dest_cr, tmp_cr, sl->mb_uvlinesize,
+                                  chroma_height, 5, weight0, weight1, 0);
+            }
         } else {
             luma_weight_avg(dest_y, tmp_y, sl->mb_linesize, height,
                             sl->pwt.luma_log2_weight_denom,
@@ -434,18 +437,20 @@ static av_always_inline void mc_part_weighted(const H264Context *h, H264SliceCon
                             sl->pwt.luma_weight[refn1][1][0],
                             sl->pwt.luma_weight[refn0][0][1] +
                             sl->pwt.luma_weight[refn1][1][1]);
-            chroma_weight_avg(dest_cb, tmp_cb, sl->mb_uvlinesize, chroma_height,
-                              sl->pwt.chroma_log2_weight_denom,
-                              sl->pwt.chroma_weight[refn0][0][0][0],
-                              sl->pwt.chroma_weight[refn1][1][0][0],
-                              sl->pwt.chroma_weight[refn0][0][0][1] +
-                              sl->pwt.chroma_weight[refn1][1][0][1]);
-            chroma_weight_avg(dest_cr, tmp_cr, sl->mb_uvlinesize, chroma_height,
-                              sl->pwt.chroma_log2_weight_denom,
-                              sl->pwt.chroma_weight[refn0][0][1][0],
-                              sl->pwt.chroma_weight[refn1][1][1][0],
-                              sl->pwt.chroma_weight[refn0][0][1][1] +
-                              sl->pwt.chroma_weight[refn1][1][1][1]);
+            if (!CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
+                chroma_weight_avg(dest_cb, tmp_cb, sl->mb_uvlinesize, chroma_height,
+                                  sl->pwt.chroma_log2_weight_denom,
+                                  sl->pwt.chroma_weight[refn0][0][0][0],
+                                  sl->pwt.chroma_weight[refn1][1][0][0],
+                                  sl->pwt.chroma_weight[refn0][0][0][1] +
+                                  sl->pwt.chroma_weight[refn1][1][0][1]);
+                chroma_weight_avg(dest_cr, tmp_cr, sl->mb_uvlinesize, chroma_height,
+                                  sl->pwt.chroma_log2_weight_denom,
+                                  sl->pwt.chroma_weight[refn0][0][1][0],
+                                  sl->pwt.chroma_weight[refn1][1][1][0],
+                                  sl->pwt.chroma_weight[refn0][0][1][1] +
+                                  sl->pwt.chroma_weight[refn1][1][1][1]);
+            }
         }
     } else {
         int list     = list1 ? 1 : 0;
@@ -459,15 +464,17 @@ static av_always_inline void mc_part_weighted(const H264Context *h, H264SliceCon
                        sl->pwt.luma_log2_weight_denom,
                        sl->pwt.luma_weight[refn][list][0],
                        sl->pwt.luma_weight[refn][list][1]);
-        if (sl->pwt.use_weight_chroma) {
-            chroma_weight_op(dest_cb, sl->mb_uvlinesize, chroma_height,
-                             sl->pwt.chroma_log2_weight_denom,
-                             sl->pwt.chroma_weight[refn][list][0][0],
-                             sl->pwt.chroma_weight[refn][list][0][1]);
-            chroma_weight_op(dest_cr, sl->mb_uvlinesize, chroma_height,
-                             sl->pwt.chroma_log2_weight_denom,
-                             sl->pwt.chroma_weight[refn][list][1][0],
-                             sl->pwt.chroma_weight[refn][list][1][1]);
+        if (!CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
+            if (sl->pwt.use_weight_chroma) {
+                chroma_weight_op(dest_cb, sl->mb_uvlinesize, chroma_height,
+                                 sl->pwt.chroma_log2_weight_denom,
+                                 sl->pwt.chroma_weight[refn][list][0][0],
+                                 sl->pwt.chroma_weight[refn][list][0][1]);
+                chroma_weight_op(dest_cr, sl->mb_uvlinesize, chroma_height,
+                                 sl->pwt.chroma_log2_weight_denom,
+                                 sl->pwt.chroma_weight[refn][list][1][0],
+                                 sl->pwt.chroma_weight[refn][list][1][1]);
+            }
         }
     }
 }
@@ -483,7 +490,7 @@ static av_always_inline void prefetch_motion(const H264Context *h, H264SliceCont
         const int mx  = (sl->mv_cache[list][scan8[0]][0] >> 2) + 16 * sl->mb_x + 8;
         const int my  = (sl->mv_cache[list][scan8[0]][1] >> 2) + 16 * sl->mb_y;
         uint8_t **src = sl->ref_list[list][refn].data;
-        int off       = (mx << pixel_shift) +
+        int off       =  mx * (1<< pixel_shift) +
                         (my + (sl->mb_x & 3) * 4) * sl->mb_linesize +
                         (64 << pixel_shift);
         h->vdsp.prefetch(src[0] + off, sl->linesize, 4);
@@ -491,9 +498,7 @@ static av_always_inline void prefetch_motion(const H264Context *h, H264SliceCont
             h->vdsp.prefetch(src[1] + off, sl->linesize, 4);
             h->vdsp.prefetch(src[2] + off, sl->linesize, 4);
         } else {
-            off = ((mx >> 1) << pixel_shift) +
-                  ((my >> 1) + (sl->mb_x & 7)) * sl->uvlinesize +
-                  (64 << pixel_shift);
+            off= ((mx>>1)+64) * (1<<pixel_shift) + ((my>>1) + (sl->mb_x&7))*sl->uvlinesize;
             h->vdsp.prefetch(src[1] + off, src[2] - src[1], 2);
         }
     }
@@ -560,10 +565,8 @@ static av_always_inline void xchg_mb_border(const H264Context *h, H264SliceConte
             XCHG(sl->top_borders[top_idx][sl->mb_x + 1],
                  src_y + (17 << pixel_shift), 1);
         }
-    }
-    if (simple || !CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
-        if (chroma444) {
-            if (deblock_top) {
+        if (simple || !CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
+            if (chroma444) {
                 if (deblock_topleft) {
                     XCHG(top_border_m1 + (24 << pixel_shift), src_cb - (7 << pixel_shift), 1);
                     XCHG(top_border_m1 + (40 << pixel_shift), src_cr - (7 << pixel_shift), 1);
@@ -576,9 +579,7 @@ static av_always_inline void xchg_mb_border(const H264Context *h, H264SliceConte
                     XCHG(sl->top_borders[top_idx][sl->mb_x + 1] + (16 << pixel_shift), src_cb + (17 << pixel_shift), 1);
                     XCHG(sl->top_borders[top_idx][sl->mb_x + 1] + (32 << pixel_shift), src_cr + (17 << pixel_shift), 1);
                 }
-            }
-        } else {
-            if (deblock_top) {
+            } else {
                 if (deblock_topleft) {
                     XCHG(top_border_m1 + (16 << pixel_shift), src_cb - (7 << pixel_shift), 1);
                     XCHG(top_border_m1 + (24 << pixel_shift), src_cr - (7 << pixel_shift), 1);
@@ -635,7 +636,12 @@ static av_always_inline void hl_decode_mb_predict_luma(const H264Context *h,
                 uint8_t *const ptr = dest_y + block_offset[i];
                 const int dir      = sl->intra4x4_pred_mode_cache[scan8[i]];
                 if (transform_bypass && h->ps.sps->profile_idc == 244 && dir <= 1) {
-                    h->hpc.pred8x8l_add[dir](ptr, sl->mb + (i * 16 + p * 256 << pixel_shift), linesize);
+                    if (h->sei.unregistered.x264_build != -1) {
+                        h->hpc.pred8x8l_add[dir](ptr, sl->mb + (i * 16 + p * 256 << pixel_shift), linesize);
+                    } else
+                        h->hpc.pred8x8l_filter_add[dir](ptr, sl->mb + (i * 16 + p * 256 << pixel_shift),
+                                                        (sl-> topleft_samples_available << i) & 0x8000,
+                                                        (sl->topright_samples_available << i) & 0x4000, linesize);
                 } else {
                     const int nnz = sl->non_zero_count_cache[scan8[i + p * 16]];
                     h->hpc.pred8x8l[dir](ptr, (sl->topleft_samples_available << i) & 0x8000,
@@ -668,7 +674,7 @@ static av_always_inline void hl_decode_mb_predict_luma(const H264Context *h,
                     uint64_t tr_high;
                     if (dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED) {
                         const int topright_avail = (sl->topright_samples_available << i) & 0x8000;
-                        assert(sl->mb_y || linesize <= block_offset[i]);
+                        av_assert2(sl->mb_y || linesize <= block_offset[i]);
                         if (!topright_avail) {
                             if (pixel_shift) {
                                 tr_high  = ((uint16_t *)ptr)[3 - linesize / 2] * 0x0001000100010001ULL;
diff --git a/libavcodec/h264_mb_template.c b/libavcodec/h264_mb_template.c
index 1f583df..d5ea26a 100644
--- a/libavcodec/h264_mb_template.c
+++ b/libavcodec/h264_mb_template.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -96,8 +96,8 @@ static av_noinline void FUNC(hl_decode_mb)(const H264Context *h, H264SliceContex
     }
 
     if (!SIMPLE && IS_INTRA_PCM(mb_type)) {
+        const int bit_depth = h->ps.sps->bit_depth_luma;
         if (PIXEL_SHIFT) {
-            const int bit_depth = h->ps.sps->bit_depth_luma;
             int j;
             GetBitContext gb;
             init_get_bits(&gb, sl->intra_pcm_ptr,
@@ -112,13 +112,10 @@ static av_noinline void FUNC(hl_decode_mb)(const H264Context *h, H264SliceContex
                 if (!h->ps.sps->chroma_format_idc) {
                     for (i = 0; i < block_h; i++) {
                         uint16_t *tmp_cb = (uint16_t *)(dest_cb + i * uvlinesize);
-                        for (j = 0; j < 8; j++)
-                            tmp_cb[j] = 1 << (bit_depth - 1);
-                    }
-                    for (i = 0; i < block_h; i++) {
                         uint16_t *tmp_cr = (uint16_t *)(dest_cr + i * uvlinesize);
-                        for (j = 0; j < 8; j++)
-                            tmp_cr[j] = 1 << (bit_depth - 1);
+                        for (j = 0; j < 8; j++) {
+                            tmp_cb[j] = tmp_cr[j] = 1 << (bit_depth - 1);
+                        }
                     }
                 } else {
                     for (i = 0; i < block_h; i++) {
@@ -138,9 +135,9 @@ static av_noinline void FUNC(hl_decode_mb)(const H264Context *h, H264SliceContex
                 memcpy(dest_y + i * linesize, sl->intra_pcm_ptr + i * 16, 16);
             if (SIMPLE || !CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
                 if (!h->ps.sps->chroma_format_idc) {
-                    for (i = 0; i < block_h; i++) {
-                        memset(dest_cb + i * uvlinesize, 128, 8);
-                        memset(dest_cr + i * uvlinesize, 128, 8);
+                    for (i = 0; i < 8; i++) {
+                        memset(dest_cb + i * uvlinesize, 1 << (bit_depth - 1), 8);
+                        memset(dest_cr + i * uvlinesize, 1 << (bit_depth - 1), 8);
                     }
                 } else {
                     const uint8_t *src_cb = sl->intra_pcm_ptr + 256;
diff --git a/libavcodec/h264_mc_template.c b/libavcodec/h264_mc_template.c
index 0c8a925..cd4a04e 100644
--- a/libavcodec/h264_mc_template.c
+++ b/libavcodec/h264_mc_template.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -74,7 +74,7 @@ static void MCFUNC(hl_motion)(const H264Context *h, H264SliceContext *sl,
     const int mb_xy   = sl->mb_xy;
     const int mb_type = h->cur_pic.mb_type[mb_xy];
 
-    assert(IS_INTER(mb_type));
+    av_assert2(IS_INTER(mb_type));
 
     if (HAVE_THREADS && (h->avctx->active_thread_type & FF_THREAD_FRAME))
         await_references(h, sl);
@@ -106,7 +106,7 @@ static void MCFUNC(hl_motion)(const H264Context *h, H264SliceContext *sl,
     } else {
         int i;
 
-        assert(IS_8X8(mb_type));
+        av_assert2(IS_8X8(mb_type));
 
         for (i = 0; i < 4; i++) {
             const int sub_mb_type = sl->sub_mb_type[i];
@@ -144,7 +144,7 @@ static void MCFUNC(hl_motion)(const H264Context *h, H264SliceContext *sl,
                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
             } else {
                 int j;
-                assert(IS_SUB_4X4(sub_mb_type));
+                av_assert2(IS_SUB_4X4(sub_mb_type));
                 for (j = 0; j < 4; j++) {
                     int sub_x_offset = x_offset + 2 * (j & 1);
                     int sub_y_offset = y_offset + (j & 2);
@@ -158,6 +158,7 @@ static void MCFUNC(hl_motion)(const H264Context *h, H264SliceContext *sl,
         }
     }
 
-    prefetch_motion(h, sl, 1, PIXEL_SHIFT, CHROMA_IDC);
+    if (USES_LIST(mb_type, 1))
+        prefetch_motion(h, sl, 1, PIXEL_SHIFT, CHROMA_IDC);
 }
 
diff --git a/libavcodec/h264_mp4toannexb_bsf.c b/libavcodec/h264_mp4toannexb_bsf.c
index c65aaeb..163d0f5 100644
--- a/libavcodec/h264_mp4toannexb_bsf.c
+++ b/libavcodec/h264_mp4toannexb_bsf.c
@@ -2,20 +2,20 @@
  * H.264 MP4 to Annex B byte stream format filter
  * Copyright (c) 2007 Benoit Fouet <benoit.fouet@free.fr>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,8 +28,12 @@
 #include "bsf.h"
 
 typedef struct H264BSFContext {
+    int32_t  sps_offset;
+    int32_t  pps_offset;
     uint8_t  length_size;
-    uint8_t  first_idr;
+    uint8_t  new_idr;
+    uint8_t  idr_sps_seen;
+    uint8_t  idr_pps_seen;
     int      extradata_parsed;
 } H264BSFContext;
 
@@ -61,6 +65,7 @@ static int alloc_and_copy(AVPacket *out,
 
 static int h264_extradata_to_annexb(AVBSFContext *ctx, const int padding)
 {
+    H264BSFContext *s = ctx->priv_data;
     uint16_t unit_size;
     uint64_t total_size                 = 0;
     uint8_t *out                        = NULL, unit_nb, sps_done = 0,
@@ -69,18 +74,14 @@ static int h264_extradata_to_annexb(AVBSFContext *ctx, const int padding)
     static const uint8_t nalu_header[4] = { 0, 0, 0, 1 };
     int length_size = (*extradata++ & 0x3) + 1; // retrieve length coded size
 
-    if (length_size == 3)
-        return AVERROR(EINVAL);
+    s->sps_offset = s->pps_offset = -1;
 
     /* retrieve sps and pps unit(s) */
     unit_nb = *extradata++ & 0x1f; /* number of sps unit(s) */
     if (!unit_nb) {
-        unit_nb = *extradata++; /* number of pps unit(s) */
-        sps_done++;
-
-        if (unit_nb)
-            pps_seen = 1;
+        goto pps;
     } else {
+        s->sps_offset = 0;
         sps_seen = 1;
     }
 
@@ -89,9 +90,15 @@ static int h264_extradata_to_annexb(AVBSFContext *ctx, const int padding)
 
         unit_size   = AV_RB16(extradata);
         total_size += unit_size + 4;
-        if (total_size > INT_MAX - padding ||
-            extradata + 2 + unit_size > ctx->par_in->extradata +
-            ctx->par_in->extradata_size) {
+        if (total_size > INT_MAX - padding) {
+            av_log(ctx, AV_LOG_ERROR,
+                   "Too big extradata size, corrupted stream or invalid MP4/AVCC bitstream\n");
+            av_free(out);
+            return AVERROR(EINVAL);
+        }
+        if (extradata + 2 + unit_size > ctx->par_in->extradata + ctx->par_in->extradata_size) {
+            av_log(ctx, AV_LOG_ERROR, "Packet header is not contained in global extradata, "
+                   "corrupted stream or invalid MP4/AVCC bitstream\n");
             av_free(out);
             return AVERROR(EINVAL);
         }
@@ -100,16 +107,18 @@ static int h264_extradata_to_annexb(AVBSFContext *ctx, const int padding)
         memcpy(out + total_size - unit_size - 4, nalu_header, 4);
         memcpy(out + total_size - unit_size, extradata + 2, unit_size);
         extradata += 2 + unit_size;
-
+pps:
         if (!unit_nb && !sps_done++) {
             unit_nb = *extradata++; /* number of pps unit(s) */
-            if (unit_nb)
+            if (unit_nb) {
+                s->pps_offset = total_size;
                 pps_seen = 1;
+            }
         }
     }
 
     if (out)
-        memset(out + total_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
+        memset(out + total_size, 0, padding);
 
     if (!sps_seen)
         av_log(ctx, AV_LOG_WARNING,
@@ -146,7 +155,9 @@ static int h264_mp4toannexb_init(AVBSFContext *ctx)
             return ret;
 
         s->length_size      = ret;
-        s->first_idr        = 1;
+        s->new_idr          = 1;
+        s->idr_sps_seen     = 0;
+        s->idr_pps_seen     = 0;
         s->extradata_parsed = 1;
     } else {
         av_log(ctx, AV_LOG_ERROR, "Invalid extradata size: %d\n", extra_size);
@@ -167,7 +178,7 @@ static int h264_mp4toannexb_filter(AVBSFContext *ctx, AVPacket *out)
     const uint8_t *buf;
     const uint8_t *buf_end;
     int            buf_size;
-    int ret = 0;
+    int ret = 0, i;
 
     ret = ff_bsf_get_packet(ctx, &in);
     if (ret < 0)
@@ -185,37 +196,73 @@ static int h264_mp4toannexb_filter(AVBSFContext *ctx, AVPacket *out)
     buf_end  = in->data + in->size;
 
     do {
+        ret= AVERROR(EINVAL);
         if (buf + s->length_size > buf_end)
             goto fail;
 
-        if (s->length_size == 1) {
-            nal_size = buf[0];
-        } else if (s->length_size == 2) {
-            nal_size = AV_RB16(buf);
-        } else
-            nal_size = AV_RB32(buf);
+        for (nal_size = 0, i = 0; i<s->length_size; i++)
+            nal_size = (nal_size << 8) | buf[i];
 
         buf += s->length_size;
         unit_type = *buf & 0x1f;
 
-        if (buf + nal_size > buf_end || nal_size < 0)
+        if (nal_size > buf_end - buf || nal_size < 0)
             goto fail;
 
-        /* prepend only to the first type 5 NAL unit of an IDR picture */
-        if (s->first_idr && unit_type == 5) {
-            if (alloc_and_copy(out,
+        if (unit_type == 7)
+            s->idr_sps_seen = s->new_idr = 1;
+        else if (unit_type == 8) {
+            s->idr_pps_seen = s->new_idr = 1;
+            /* if SPS has not been seen yet, prepend the AVCC one to PPS */
+            if (!s->idr_sps_seen) {
+                if (s->sps_offset == -1)
+                    av_log(ctx, AV_LOG_WARNING, "SPS not present in the stream, nor in AVCC, stream may be unreadable\n");
+                else {
+                    if ((ret = alloc_and_copy(out,
+                                         ctx->par_out->extradata + s->sps_offset,
+                                         s->pps_offset != -1 ? s->pps_offset : ctx->par_out->extradata_size - s->sps_offset,
+                                         buf, nal_size)) < 0)
+                        goto fail;
+                    s->idr_sps_seen = 1;
+                    goto next_nal;
+                }
+            }
+        }
+
+        /* If this is a new IDR picture following an IDR picture, set new_idr again.
+         * Only check that first_mb_in_slice is 0, as this is the simplest solution.
+         * Checking idr_pic_id instead would be possible, but would complicate the parsing. */
+        if (!s->new_idr && unit_type == 5 && (buf[1] & 0x80))
+            s->new_idr = 1;
+
+        /* prepend the extradata only to the first type 5 NAL unit of an IDR picture, if no SPS/PPS are already present */
+        if (s->new_idr && unit_type == 5 && !s->idr_sps_seen && !s->idr_pps_seen) {
+            if ((ret=alloc_and_copy(out,
                                ctx->par_out->extradata, ctx->par_out->extradata_size,
-                               buf, nal_size) < 0)
+                               buf, nal_size)) < 0)
+                goto fail;
+            s->new_idr = 0;
+        /* if only SPS has been seen, also insert PPS */
+        } else if (s->new_idr && unit_type == 5 && s->idr_sps_seen && !s->idr_pps_seen) {
+            if (s->pps_offset == -1) {
+                av_log(ctx, AV_LOG_WARNING, "PPS not present in the stream, nor in AVCC, stream may be unreadable\n");
+                if ((ret = alloc_and_copy(out, NULL, 0, buf, nal_size)) < 0)
+                    goto fail;
+            } else if ((ret = alloc_and_copy(out,
+                                        ctx->par_out->extradata + s->pps_offset, ctx->par_out->extradata_size - s->pps_offset,
+                                        buf, nal_size)) < 0)
                 goto fail;
-            s->first_idr = 0;
         } else {
-            if (alloc_and_copy(out,
-                               NULL, 0, buf, nal_size) < 0)
+            if ((ret=alloc_and_copy(out, NULL, 0, buf, nal_size)) < 0)
                 goto fail;
-            if (!s->first_idr && unit_type == 1)
-                s->first_idr = 1;
+            if (!s->new_idr && unit_type == 1) {
+                s->new_idr = 1;
+                s->idr_sps_seen = 0;
+                s->idr_pps_seen = 0;
+            }
         }
 
+next_nal:
         buf        += nal_size;
         cumul_size += nal_size + s->length_size;
     } while (cumul_size < buf_size);
diff --git a/libavcodec/h264_mvpred.h b/libavcodec/h264_mvpred.h
index e9d2b62..be02e5a 100644
--- a/libavcodec/h264_mvpred.h
+++ b/libavcodec/h264_mvpred.h
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... motion vector prediction
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,8 +32,8 @@
 #include "avcodec.h"
 #include "h264.h"
 #include "mpegutils.h"
+#include "libavutil/avassert.h"
 
-#include <assert.h>
 
 static av_always_inline int fetch_diagonal_mv(const H264Context *h, H264SliceContext *sl,
                                               const int16_t **C,
@@ -106,7 +106,7 @@ static av_always_inline void pred_motion(const H264Context *const h,
     const int16_t *C;
     int diagonal_ref, match_count;
 
-    assert(part_width == 1 || part_width == 2 || part_width == 4);
+    av_assert2(part_width == 1 || part_width == 2 || part_width == 4);
 
 /* mv_cache
  * B . . A T T T T
@@ -488,7 +488,7 @@ static void fill_decode_caches(const H264Context *h, H264SliceContext *sl, int m
                 } else {
                     int left_typei = h->cur_pic.mb_type[left_xy[LTOP] + h->mb_stride];
 
-                    assert(left_xy[LTOP] == left_xy[LBOT]);
+                    av_assert2(left_xy[LTOP] == left_xy[LBOT]);
                     if (!((left_typei & type_mask) && (left_type[LTOP] & type_mask))) {
                         sl->topleft_samples_available &= 0xDF5F;
                         sl->left_samples_available    &= 0x5F5F;
@@ -613,7 +613,7 @@ static void fill_decode_caches(const H264Context *h, H264SliceContext *sl, int m
             int16_t(*mv)[2]       = h->cur_pic.motion_val[list];
             if (!USES_LIST(mb_type, list))
                 continue;
-            assert(!(IS_DIRECT(mb_type) && !sl->direct_spatial_mv_pred));
+            av_assert2(!(IS_DIRECT(mb_type) && !sl->direct_spatial_mv_pred));
 
             if (USES_LIST(top_type, list)) {
                 const int b_xy = h->mb2b_xy[top_xy] + 3 * b_stride;
@@ -670,7 +670,7 @@ static void fill_decode_caches(const H264Context *h, H264SliceContext *sl, int m
                 ref_cache[4 - 1 * 8] = topright_type ? LIST_NOT_USED
                                                      : PART_NOT_AVAILABLE;
             }
-            if (ref_cache[4 - 1 * 8] < 0) {
+            if(ref_cache[2 - 1*8] < 0 || ref_cache[4 - 1 * 8] < 0) {
                 if (USES_LIST(topleft_type, list)) {
                     const int b_xy  = h->mb2b_xy[topleft_xy] + 3 + b_stride +
                                       (sl->topleft_partition & 2 * b_stride);
@@ -771,7 +771,7 @@ static void fill_decode_caches(const H264Context *h, H264SliceContext *sl, int m
 
 #define MAP_F2F(idx, mb_type)                                           \
     if (!IS_INTERLACED(mb_type) && sl->ref_cache[list][idx] >= 0) {     \
-        sl->ref_cache[list][idx]    <<= 1;                              \
+        sl->ref_cache[list][idx]     *= 2;                              \
         sl->mv_cache[list][idx][1]   /= 2;                              \
         sl->mvd_cache[list][idx][1] >>= 1;                              \
     }
@@ -783,7 +783,7 @@ static void fill_decode_caches(const H264Context *h, H264SliceContext *sl, int m
 #define MAP_F2F(idx, mb_type)                                           \
     if (IS_INTERLACED(mb_type) && sl->ref_cache[list][idx] >= 0) {      \
         sl->ref_cache[list][idx]    >>= 1;                              \
-        sl->mv_cache[list][idx][1]  <<= 1;                              \
+        sl->mv_cache[list][idx][1]   *= 2;                              \
         sl->mvd_cache[list][idx][1] <<= 1;                              \
     }
 
diff --git a/libavcodec/h264_parse.c b/libavcodec/h264_parse.c
index 7211c9d..a63530d 100644
--- a/libavcodec/h264_parse.c
+++ b/libavcodec/h264_parse.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,7 +24,7 @@
 
 int ff_h264_pred_weight_table(GetBitContext *gb, const SPS *sps,
                               const int *ref_count, int slice_type_nos,
-                              H264PredWeightTable *pwt)
+                              H264PredWeightTable *pwt, void *logctx)
 {
     int list, i;
     int luma_def, chroma_def;
@@ -34,6 +34,16 @@ int ff_h264_pred_weight_table(GetBitContext *gb, const SPS *sps,
     pwt->luma_log2_weight_denom = get_ue_golomb(gb);
     if (sps->chroma_format_idc)
         pwt->chroma_log2_weight_denom = get_ue_golomb(gb);
+
+    if (pwt->luma_log2_weight_denom > 7U) {
+        av_log(logctx, AV_LOG_ERROR, "luma_log2_weight_denom %d is out of range\n", pwt->luma_log2_weight_denom);
+        pwt->luma_log2_weight_denom = 0;
+    }
+    if (pwt->chroma_log2_weight_denom > 7U) {
+        av_log(logctx, AV_LOG_ERROR, "chroma_log2_weight_denom %d is out of range\n", pwt->chroma_log2_weight_denom);
+        pwt->chroma_log2_weight_denom = 0;
+    }
+
     luma_def   = 1 << pwt->luma_log2_weight_denom;
     chroma_def = 1 << pwt->chroma_log2_weight_denom;
 
@@ -106,7 +116,7 @@ int ff_h264_check_intra4x4_pred_mode(int8_t *pred_mode_cache, void *logctx,
             int status = top[pred_mode_cache[scan8[0] + i]];
             if (status < 0) {
                 av_log(logctx, AV_LOG_ERROR,
-                       "top block unavailable for requested intra4x4 mode %d\n",
+                       "top block unavailable for requested intra mode %d\n",
                        status);
                 return AVERROR_INVALIDDATA;
             } else if (status) {
@@ -162,17 +172,17 @@ int ff_h264_check_intra_pred_mode(void *logctx, int top_samples_available,
 
     if ((left_samples_available & 0x8080) != 0x8080) {
         mode = left[mode];
+        if (mode < 0) {
+            av_log(logctx, AV_LOG_ERROR,
+                   "left block unavailable for requested intra mode\n");
+            return AVERROR_INVALIDDATA;
+        }
         if (is_chroma && (left_samples_available & 0x8080)) {
             // mad cow disease mode, aka MBAFF + constrained_intra_pred
             mode = ALZHEIMER_DC_L0T_PRED8x8 +
                    (!(left_samples_available & 0x8000)) +
                    2 * (mode == DC_128_PRED8x8);
         }
-        if (mode < 0) {
-            av_log(logctx, AV_LOG_ERROR,
-                   "left block unavailable for requested intra mode\n");
-            return AVERROR_INVALIDDATA;
-        }
     }
 
     return mode;
@@ -180,27 +190,36 @@ int ff_h264_check_intra_pred_mode(void *logctx, int top_samples_available,
 
 int ff_h264_parse_ref_count(int *plist_count, int ref_count[2],
                             GetBitContext *gb, const PPS *pps,
-                            int slice_type_nos, int picture_structure)
+                            int slice_type_nos, int picture_structure, void *logctx)
 {
     int list_count;
-    int num_ref_idx_active_override_flag, max_refs;
+    int num_ref_idx_active_override_flag;
 
     // set defaults, might be overridden a few lines later
     ref_count[0] = pps->ref_count[0];
     ref_count[1] = pps->ref_count[1];
 
     if (slice_type_nos != AV_PICTURE_TYPE_I) {
+        unsigned max[2];
+        max[0] = max[1] = picture_structure == PICT_FRAME ? 15 : 31;
+
         num_ref_idx_active_override_flag = get_bits1(gb);
 
         if (num_ref_idx_active_override_flag) {
             ref_count[0] = get_ue_golomb(gb) + 1;
-            if (ref_count[0] < 1)
-                goto fail;
             if (slice_type_nos == AV_PICTURE_TYPE_B) {
                 ref_count[1] = get_ue_golomb(gb) + 1;
-                if (ref_count[1] < 1)
-                    goto fail;
-            }
+            } else
+                // full range is spec-ok in this case, even for frames
+                ref_count[1] = 1;
+        }
+
+        if (ref_count[0] - 1 > max[0] || ref_count[1] - 1 > max[1]) {
+            av_log(logctx, AV_LOG_ERROR, "reference overflow %u > %u or %u > %u\n",
+                   ref_count[0] - 1, max[0], ref_count[1] - 1, max[1]);
+            ref_count[0] = ref_count[1] = 0;
+            *plist_count = 0;
+            goto fail;
         }
 
         if (slice_type_nos == AV_PICTURE_TYPE_B)
@@ -212,11 +231,6 @@ int ff_h264_parse_ref_count(int *plist_count, int ref_count[2],
         ref_count[0] = ref_count[1] = 0;
     }
 
-    max_refs = picture_structure == PICT_FRAME ? 16 : 32;
-
-    if (ref_count[0] > max_refs || ref_count[1] > max_refs)
-        goto fail;
-
     *plist_count = list_count;
 
     return 0;
@@ -314,14 +328,16 @@ static int decode_extradata_ps(const uint8_t *data, int size, H264ParamSets *ps,
     int i, ret = 0;
 
     ret = ff_h2645_packet_split(&pkt, data, size, logctx, is_avc, 2, AV_CODEC_ID_H264);
-    if (ret < 0)
+    if (ret < 0) {
+        ret = 0;
         goto fail;
+    }
 
     for (i = 0; i < pkt.nb_nals; i++) {
         H2645NAL *nal = &pkt.nals[i];
         switch (nal->type) {
         case NAL_SPS:
-            ret = ff_h264_decode_seq_parameter_set(&nal->gb, logctx, ps);
+            ret = ff_h264_decode_seq_parameter_set(&nal->gb, logctx, ps, 0);
             if (ret < 0)
                 goto fail;
             break;
@@ -399,6 +415,9 @@ int ff_h264_decode_extradata(const uint8_t *data, int size, H264ParamSets *ps,
 {
     int ret;
 
+    if (!data || size <= 0)
+        return -1;
+
     if (data[0] == 1) {
         int i, cnt, nalsize;
         const uint8_t *p = data;
@@ -415,7 +434,7 @@ int ff_h264_decode_extradata(const uint8_t *data, int size, H264ParamSets *ps,
         p  += 6;
         for (i = 0; i < cnt; i++) {
             nalsize = AV_RB16(p) + 2;
-            if (p - data + nalsize > size)
+            if (nalsize > size - (p - data))
                 return AVERROR_INVALIDDATA;
             ret = decode_extradata_ps_mp4(p, nalsize, ps, err_recognition, logctx);
             if (ret < 0) {
@@ -429,7 +448,7 @@ int ff_h264_decode_extradata(const uint8_t *data, int size, H264ParamSets *ps,
         cnt = *(p++); // Number of pps
         for (i = 0; i < cnt; i++) {
             nalsize = AV_RB16(p) + 2;
-            if (p - data + nalsize > size)
+            if (nalsize > size - (p - data))
                 return AVERROR_INVALIDDATA;
             ret = decode_extradata_ps_mp4(p, nalsize, ps, err_recognition, logctx);
             if (ret < 0) {
@@ -447,7 +466,7 @@ int ff_h264_decode_extradata(const uint8_t *data, int size, H264ParamSets *ps,
         if (ret < 0)
             return ret;
     }
-    return 0;
+    return size;
 }
 
 /**
diff --git a/libavcodec/h264_parse.h b/libavcodec/h264_parse.h
index fde1a45..40d88ed 100644
--- a/libavcodec/h264_parse.h
+++ b/libavcodec/h264_parse.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -58,7 +58,7 @@ struct H264ParamSets;
 
 int ff_h264_pred_weight_table(GetBitContext *gb, const struct SPS *sps,
                               const int *ref_count, int slice_type_nos,
-                              H264PredWeightTable *pwt);
+                              H264PredWeightTable *pwt, void *logctx);
 
 /**
  * Check if the top & left blocks are available if needed & change the
@@ -77,7 +77,7 @@ int ff_h264_check_intra_pred_mode(void *logctx, int top_samples_available,
 
 int ff_h264_parse_ref_count(int *plist_count, int ref_count[2],
                             GetBitContext *gb, const struct PPS *pps,
-                            int slice_type_nos, int picture_structure);
+                            int slice_type_nos, int picture_structure, void *logctx);
 
 int ff_h264_init_poc(int pic_field_poc[2], int *pic_poc,
                      const struct SPS *sps, H264POCContext *poc,
diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
index b86bf2b..ce4bab2 100644
--- a/libavcodec/h264_parser.c
+++ b/libavcodec/h264_parser.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... parser
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,8 @@
  * @author Michael Niedermayer <michaelni@gmx.at>
  */
 
+#define UNCHECKED_BITSTREAM_READER 1
+
 #include <assert.h>
 #include <stdint.h>
 
@@ -53,24 +55,45 @@ typedef struct H264ParseContext {
     int nal_length_size;
     int got_first;
     int picture_structure;
+    uint8_t parse_history[6];
+    int parse_history_count;
+    int parse_last_mb;
 } H264ParseContext;
 
 
 static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf,
-                               int buf_size)
+                               int buf_size, void *logctx)
 {
-    int i;
+    int i, j;
     uint32_t state;
     ParseContext *pc = &p->pc;
+
+    int next_avc = p->is_avc ? 0 : buf_size;
 //    mb_addr= pc->mb_addr - 1;
     state = pc->state;
     if (state > 13)
         state = 7;
 
+    if (p->is_avc && !p->nal_length_size)
+        av_log(logctx, AV_LOG_ERROR, "AVC-parser: nal length size invalid\n");
+
     for (i = 0; i < buf_size; i++) {
+        if (i >= next_avc) {
+            int nalsize = 0; // NAL length prefix, assembled most significant byte first
+            i = next_avc;
+            for (j = 0; j < p->nal_length_size; j++)
+                nalsize = ((unsigned)nalsize << 8) | buf[i++]; // unsigned shift: a signed << 8 overflows (UB) for 4-byte sizes
+            if (nalsize <= 0 || nalsize > buf_size - i) {
+                av_log(logctx, AV_LOG_ERROR, "AVC-parser: nal size %d remaining %d\n", nalsize, buf_size - i);
+                return buf_size;
+            }
+            next_avc = i + nalsize; // index of the next length prefix
+            state    = 5;
+        }
+
         if (state == 7) {
-            i += p->h264dsp.startcode_find_candidate(buf + i, buf_size - i);
-            if (i < buf_size)
+            i += p->h264dsp.startcode_find_candidate(buf + i, next_avc - i);
+            if (i < next_avc)
                 state = 2;
         } else if (state <= 2) {
             if (buf[i] == 1)
@@ -89,31 +112,44 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf,
                 }
             } else if (nalu_type == NAL_SLICE || nalu_type == NAL_DPA ||
                        nalu_type == NAL_IDR_SLICE) {
+                state += 8;
+                continue;
+            }
+            state = 7;
+        } else {
+            p->parse_history[p->parse_history_count++] = buf[i];
+            if (p->parse_history_count > 5) {
+                unsigned int mb, last_mb = p->parse_last_mb;
+                GetBitContext gb;
+
+                init_get_bits(&gb, p->parse_history, 8*p->parse_history_count);
+                p->parse_history_count = 0;
+                mb= get_ue_golomb_long(&gb);
+                p->parse_last_mb = mb;
                 if (pc->frame_start_found) {
-                    state += 8;
-                    continue;
+                    if (mb <= last_mb)
+                        goto found;
                 } else
                     pc->frame_start_found = 1;
+                state = 7;
             }
-            state = 7;
-        } else {
-            // first_mb_in_slice is 0, probably the first nal of a new slice
-            if (buf[i] & 0x80)
-                goto found;
-            state = 7;
         }
     }
     pc->state = state;
+    if (p->is_avc)
+        return next_avc;
     return END_NOT_FOUND;
 
 found:
     pc->state             = 7;
     pc->frame_start_found = 0;
-    return i - (state & 5);
+    if (p->is_avc)
+        return next_avc;
+    return i - (state & 5) - 5 * (state > 7);
 }
 
 static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb,
-                           AVCodecContext *avctx)
+                           void *logctx)
 {
     H264PredWeightTable pwt;
     int slice_type_nos = s->pict_type & 3;
@@ -128,7 +164,7 @@ static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb,
         get_bits1(gb); // direct_spatial_mv_pred
 
     if (ff_h264_parse_ref_count(&list_count, ref_count, gb, p->ps.pps,
-                                slice_type_nos, p->picture_structure) < 0)
+                                slice_type_nos, p->picture_structure, logctx) < 0)
         return AVERROR_INVALIDDATA;
 
     if (slice_type_nos != AV_PICTURE_TYPE_I) {
@@ -140,9 +176,9 @@ static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb,
                     unsigned int reordering_of_pic_nums_idc = get_ue_golomb_31(gb);
 
                     if (reordering_of_pic_nums_idc < 3)
-                        get_ue_golomb(gb);
+                        get_ue_golomb_long(gb);
                     else if (reordering_of_pic_nums_idc > 3) {
-                        av_log(avctx, AV_LOG_ERROR,
+                        av_log(logctx, AV_LOG_ERROR,
                                "illegal reordering_of_pic_nums_idc %d\n",
                                reordering_of_pic_nums_idc);
                         return AVERROR_INVALIDDATA;
@@ -150,7 +186,7 @@ static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb,
                         break;
 
                     if (index >= ref_count[list]) {
-                        av_log(avctx, AV_LOG_ERROR,
+                        av_log(logctx, AV_LOG_ERROR,
                                "reference count %d overflow\n", index);
                         return AVERROR_INVALIDDATA;
                     }
@@ -162,14 +198,14 @@ static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb,
     if ((p->ps.pps->weighted_pred && slice_type_nos == AV_PICTURE_TYPE_P) ||
         (p->ps.pps->weighted_bipred_idc == 1 && slice_type_nos == AV_PICTURE_TYPE_B))
         ff_h264_pred_weight_table(gb, p->ps.sps, ref_count, slice_type_nos,
-                                  &pwt);
+                                  &pwt, logctx);
 
     if (get_bits1(gb)) { // adaptive_ref_pic_marking_mode_flag
         int i;
         for (i = 0; i < MAX_MMCO_COUNT; i++) {
             MMCOOpcode opcode = get_ue_golomb_31(gb);
             if (opcode > (unsigned) MMCO_LONG) {
-                av_log(avctx, AV_LOG_ERROR,
+                av_log(logctx, AV_LOG_ERROR,
                        "illegal memory management control operation %d\n",
                        opcode);
                 return AVERROR_INVALIDDATA;
@@ -180,7 +216,7 @@ static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb,
                 return 1;
 
             if (opcode == MMCO_SHORT2UNUSED || opcode == MMCO_SHORT2LONG)
-                get_ue_golomb(gb);
+                get_ue_golomb_long(gb); // difference_of_pic_nums_minus1
             if (opcode == MMCO_SHORT2LONG || opcode == MMCO_LONG2UNUSED ||
                 opcode == MMCO_LONG || opcode == MMCO_SET_MAX_LONG)
                 get_ue_golomb_31(gb);
@@ -190,6 +226,26 @@ static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb,
     return 0;
 }
 
+static inline int get_avc_nalsize(H264ParseContext *p, const uint8_t *buf,
+                                  int buf_size, int *buf_index, void *logctx)
+{
+    int i, nalsize = 0;
+    // Read the p->nal_length_size-byte big-endian NAL size stored at buf[*buf_index].
+    if (*buf_index >= buf_size - p->nal_length_size) {
+        // at most a length prefix is left (no payload): the end of the buffer is reached, refill it
+        return AVERROR(EAGAIN);
+    }
+    // Most significant byte first; *buf_index is advanced past the prefix as it is consumed.
+    for (i = 0; i < p->nal_length_size; i++)
+        nalsize = ((unsigned)nalsize << 8) | buf[(*buf_index)++];
+    if (nalsize <= 0 || nalsize > buf_size - *buf_index) { // non-positive, or larger than what is left in buf
+        av_log(logctx, AV_LOG_ERROR,
+               "AVC: nal size %d\n", nalsize);
+        return AVERROR_INVALIDDATA;
+    }
+    return nalsize; // positive payload size; *buf_index now sits just after the prefix
+}
+
 /**
  * Parse NAL units of found picture and decode some basic information.
  *
@@ -200,16 +256,15 @@ static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb,
  */
 static inline int parse_nal_units(AVCodecParserContext *s,
                                   AVCodecContext *avctx,
-                                  const uint8_t *buf, int buf_size)
+                                  const uint8_t * const buf, int buf_size)
 {
     H264ParseContext *p = s->priv_data;
-    const uint8_t *buf_end = buf + buf_size;
-
     H2645NAL nal = { NULL };
-
+    int buf_index, next_avc;
     unsigned int pps_id;
     unsigned int slice_type;
     int state = -1, got_reset = 0;
+    int q264 = buf_size >=4 && !memcmp("Q264", buf, 4);
     int field_poc[2];
     int ret;
 
@@ -219,18 +274,32 @@ static inline int parse_nal_units(AVCodecParserContext *s,
     s->picture_structure = AV_PICTURE_STRUCTURE_UNKNOWN;
 
     ff_h264_sei_uninit(&p->sei);
+    p->sei.frame_packing.frame_packing_arrangement_cancel_flag = -1;
 
     if (!buf_size)
         return 0;
 
+    buf_index     = 0;
+    next_avc      = p->is_avc ? 0 : buf_size;
     for (;;) {
         const SPS *sps;
-        int src_length, consumed;
-        buf = avpriv_find_start_code(buf, buf_end, &state);
-        if (buf >= buf_end)
-            break;
-        --buf;
-        src_length = buf_end - buf;
+        int src_length, consumed, nalsize = 0;
+
+        if (buf_index >= next_avc) {
+            nalsize = get_avc_nalsize(p, buf, buf_size, &buf_index, avctx);
+            if (nalsize < 0)
+                break;
+            next_avc = buf_index + nalsize;
+        } else {
+            buf_index = find_start_code(buf, buf_size, buf_index, next_avc);
+            if (buf_index >= buf_size)
+                break;
+            if (buf_index >= next_avc)
+                continue;
+        }
+        src_length = next_avc - buf_index;
+
+        state = buf[buf_index];
         switch (state & 0x1f) {
         case NAL_SLICE:
         case NAL_IDR_SLICE:
@@ -247,12 +316,13 @@ static inline int parse_nal_units(AVCodecParserContext *s,
             }
             break;
         }
-
-        consumed = ff_h2645_extract_rbsp(buf, src_length, &nal);
+        consumed = ff_h2645_extract_rbsp(buf + buf_index, src_length, &nal);
         if (consumed < 0)
             break;
 
-        ret = init_get_bits(&nal.gb, nal.data, nal.size * 8);
+        buf_index += consumed;
+
+        ret = init_get_bits8(&nal.gb, nal.data, nal.size);
         if (ret < 0)
             goto fail;
         get_bits1(&nal.gb);
@@ -261,7 +331,7 @@ static inline int parse_nal_units(AVCodecParserContext *s,
 
         switch (nal.type) {
         case NAL_SPS:
-            ff_h264_decode_seq_parameter_set(&nal.gb, avctx, &p->ps);
+            ff_h264_decode_seq_parameter_set(&nal.gb, avctx, &p->ps, 0);
             break;
         case NAL_PPS:
             ff_h264_decode_picture_parameter_set(&nal.gb, avctx, &p->ps,
@@ -279,7 +349,7 @@ static inline int parse_nal_units(AVCodecParserContext *s,
             p->poc.prev_poc_lsb          = 0;
         /* fall through */
         case NAL_SLICE:
-            get_ue_golomb(&nal.gb);  // skip first_mb_in_slice
+            get_ue_golomb_long(&nal.gb);  // skip first_mb_in_slice
             slice_type   = get_ue_golomb_31(&nal.gb);
             s->pict_type = ff_h264_golomb_to_pict_type[slice_type % 5];
             if (p->sei.recovery_point.recovery_frame_cnt >= 0) {
@@ -307,6 +377,10 @@ static inline int parse_nal_units(AVCodecParserContext *s,
 
             sps = p->ps.sps;
 
+            // heuristic to detect non marked keyframes
+            if (p->ps.sps->ref_frame_count <= 1 && p->ps.pps->ref_count[0] <= 1 && s->pict_type == AV_PICTURE_TYPE_I)
+                s->key_frame = 1;
+
             p->poc.frame_num = get_bits(&nal.gb, sps->log2_max_frame_num);
 
             s->coded_width  = 16 * sps->mb_width;
@@ -352,7 +426,7 @@ static inline int parse_nal_units(AVCodecParserContext *s,
             }
 
             if (nal.type == NAL_IDR_SLICE)
-                get_ue_golomb(&nal.gb); /* idr_pic_id */
+                get_ue_golomb_long(&nal.gb); /* idr_pic_id */
             if (sps->poc_type == 0) {
                 p->poc.poc_lsb = get_bits(&nal.gb, sps->log2_max_poc_lsb);
 
@@ -464,10 +538,13 @@ static inline int parse_nal_units(AVCodecParserContext *s,
             av_freep(&nal.rbsp_buffer);
             return 0; /* no need to evaluate the rest */
         }
-        buf += consumed;
+    }
+    if (q264) {
+        av_freep(&nal.rbsp_buffer);
+        return 0;
     }
     /* didn't find a picture! */
-    av_log(avctx, AV_LOG_ERROR, "missing picture in access unit\n");
+    av_log(avctx, AV_LOG_ERROR, "missing picture in access unit with size %d\n", buf_size);
 fail:
     av_freep(&nal.rbsp_buffer);
     return -1;
@@ -494,7 +571,7 @@ static int h264_parse(AVCodecParserContext *s,
     if (s->flags & PARSER_FLAG_COMPLETE_FRAMES) {
         next = buf_size;
     } else {
-        next = h264_find_frame_end(p, buf, buf_size);
+        next = h264_find_frame_end(p, buf, buf_size, avctx);
 
         if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
             *poutbuf      = NULL;
@@ -503,13 +580,15 @@ static int h264_parse(AVCodecParserContext *s,
         }
 
         if (next < 0 && next != END_NOT_FOUND) {
-            assert(pc->last_index + next >= 0);
-            h264_find_frame_end(p, &pc->buffer[pc->last_index + next], -next); // update state
+            av_assert1(pc->last_index + next >= 0);
+            h264_find_frame_end(p, &pc->buffer[pc->last_index + next], -next, avctx); // update state
         }
     }
 
     parse_nal_units(s, avctx, buf, buf_size);
 
+    if (avctx->framerate.num)
+        avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
     if (p->sei.picture_timing.cpb_removal_delay >= 0) {
         s->dts_sync_point    = p->sei.buffering_period.present;
         s->dts_ref_dts_delta = p->sei.picture_timing.cpb_removal_delay;
@@ -532,31 +611,37 @@ static int h264_parse(AVCodecParserContext *s,
 static int h264_split(AVCodecContext *avctx,
                       const uint8_t *buf, int buf_size)
 {
-    int i;
     uint32_t state = -1;
     int has_sps    = 0;
+    int has_pps    = 0;
+    const uint8_t *ptr = buf, *end = buf + buf_size;
+    int nalu_type;
 
-    for (i = 0; i <= buf_size; i++) {
-        if ((state & 0xFFFFFF1F) == 0x107)
+    while (ptr < end) {
+        ptr = avpriv_find_start_code(ptr, end, &state);
+        if ((state & 0xFFFFFF00) != 0x100)
+            break;
+        nalu_type = state & 0x1F;
+        if (nalu_type == NAL_SPS) {
             has_sps = 1;
-        /*  if((state&0xFFFFFF1F) == 0x101 ||
-         *     (state&0xFFFFFF1F) == 0x102 ||
-         *     (state&0xFFFFFF1F) == 0x105) {
+        } else if (nalu_type == NAL_PPS)
+            has_pps = 1;
+        /* else if (nalu_type == 0x01 ||
+         *     nalu_type == 0x02 ||
+         *     nalu_type == 0x05) {
          *  }
          */
-        if ((state & 0xFFFFFF00) == 0x100 && (state & 0xFFFFFF1F) != 0x106 &&
-            (state & 0xFFFFFF1F) != 0x107 && (state & 0xFFFFFF1F) != 0x108 &&
-            (state & 0xFFFFFF1F) != 0x109 && (state & 0xFFFFFF1F) != 0x10d &&
-            (state & 0xFFFFFF1F) != 0x10f) {
+        else if ((nalu_type != NAL_SEI || has_pps) &&
+                  nalu_type != NAL_AUD && nalu_type != NAL_SPS_EXT &&
+                  nalu_type != 0x0f) {
             if (has_sps) {
-                while (i > 4 && buf[i - 5] == 0)
-                    i--;
-                return i - 4;
+                while (ptr - 4 > buf && ptr[-5] == 0)
+                    ptr--;
+                return ptr - 4 - buf;
             }
         }
-        if (i < buf_size)
-            state = (state << 8) | buf[i];
     }
+
     return 0;
 }
 
@@ -564,17 +649,11 @@ static void h264_close(AVCodecParserContext *s)
 {
     H264ParseContext *p = s->priv_data;
     ParseContext *pc = &p->pc;
-    int i;
 
-    av_free(pc->buffer);
+    av_freep(&pc->buffer);
 
     ff_h264_sei_uninit(&p->sei);
-
-    for (i = 0; i < FF_ARRAY_ELEMS(p->ps.sps_list); i++)
-        av_buffer_unref(&p->ps.sps_list[i]);
-
-    for (i = 0; i < FF_ARRAY_ELEMS(p->ps.pps_list); i++)
-        av_buffer_unref(&p->ps.pps_list[i]);
+    ff_h264_ps_uninit(&p->ps);
 }
 
 static av_cold int init(AVCodecParserContext *s)
diff --git a/libavcodec/h264_picture.c b/libavcodec/h264_picture.c
index fb71fae..3e9b79a 100644
--- a/libavcodec/h264_picture.c
+++ b/libavcodec/h264_picture.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -42,6 +42,7 @@
 #include "mpegutils.h"
 #include "rectangle.h"
 #include "thread.h"
+#include "vdpau_compat.h"
 
 void ff_h264_unref_picture(H264Context *h, H264Picture *pic)
 {
@@ -114,7 +115,12 @@ int ff_h264_ref_picture(H264Context *h, H264Picture *dst, H264Picture *src)
     dst->mbaff         = src->mbaff;
     dst->field_picture = src->field_picture;
     dst->reference     = src->reference;
+    dst->crop          = src->crop;
+    dst->crop_left     = src->crop_left;
+    dst->crop_top      = src->crop_top;
     dst->recovered     = src->recovered;
+    dst->invalid_gap   = src->invalid_gap;
+    dst->sei_recovery_frame_cnt = src->sei_recovery_frame_cnt;
 
     return 0;
 fail:
@@ -122,11 +128,13 @@ fail:
     return ret;
 }
 
-#if CONFIG_ERROR_RESILIENCE
-static void h264_set_erpic(ERPicture *dst, H264Picture *src)
+void ff_h264_set_erpic(ERPicture *dst, H264Picture *src)
 {
+#if CONFIG_ERROR_RESILIENCE
     int i;
 
+    memset(dst, 0, sizeof(*dst));
+
     if (!src)
         return;
 
@@ -140,8 +148,8 @@ static void h264_set_erpic(ERPicture *dst, H264Picture *src)
 
     dst->mb_type = src->mb_type;
     dst->field_picture = src->field_picture;
-}
 #endif /* CONFIG_ERROR_RESILIENCE */
+}
 
 int ff_h264_field_end(H264Context *h, H264SliceContext *sl, int in_setup)
 {
@@ -149,9 +157,11 @@ int ff_h264_field_end(H264Context *h, H264SliceContext *sl, int in_setup)
     int err = 0;
     h->mb_y = 0;
 
-    if (!in_setup && !h->droppable)
-        ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX,
-                                  h->picture_structure == PICT_BOTTOM_FIELD);
+#if FF_API_CAP_VDPAU
+    if (CONFIG_H264_VDPAU_DECODER &&
+        h->avctx->codec->capabilities & AV_CODEC_CAP_HWACCEL_VDPAU)
+        ff_vdpau_h264_set_reference_frames(h);
+#endif
 
     if (in_setup || !(avctx->active_thread_type & FF_THREAD_FRAME)) {
         if (!h->droppable) {
@@ -164,34 +174,21 @@ int ff_h264_field_end(H264Context *h, H264SliceContext *sl, int in_setup)
     }
 
     if (avctx->hwaccel) {
-        if (avctx->hwaccel->end_frame(avctx) < 0)
+        err = avctx->hwaccel->end_frame(avctx);
+        if (err < 0)
             av_log(avctx, AV_LOG_ERROR,
                    "hardware accelerator failed to decode picture\n");
     }
 
-#if CONFIG_ERROR_RESILIENCE
-    /*
-     * FIXME: Error handling code does not seem to support interlaced
-     * when slices span multiple rows
-     * The ff_er_add_slice calls don't work right for bottom
-     * fields; they cause massive erroneous error concealing
-     * Error marking covers both fields (top and bottom).
-     * This causes a mismatched s->error_count
-     * and a bad error table. Further, the error count goes to
-     * INT_MAX when called for bottom field, because mb_y is
-     * past end by one (callers fault) and resync_mb_y != 0
-     * causes problems for the first MB line, too.
-     */
-    if (!FIELD_PICTURE(h) && h->enable_er) {
-        h264_set_erpic(&sl->er.cur_pic, h->cur_pic_ptr);
-        h264_set_erpic(&sl->er.last_pic,
-                       sl->ref_count[0] ? sl->ref_list[0][0].parent : NULL);
-        h264_set_erpic(&sl->er.next_pic,
-                       sl->ref_count[1] ? sl->ref_list[1][0].parent : NULL);
-        ff_er_frame_end(&sl->er);
-    }
-#endif /* CONFIG_ERROR_RESILIENCE */
+#if FF_API_CAP_VDPAU
+    if (CONFIG_H264_VDPAU_DECODER &&
+        h->avctx->codec->capabilities & AV_CODEC_CAP_HWACCEL_VDPAU)
+        ff_vdpau_h264_picture_complete(h);
+#endif
 
+    if (!in_setup && !h->droppable)
+        ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX,
+                                  h->picture_structure == PICT_BOTTOM_FIELD);
     emms_c();
 
     h->current_slice = 0;
diff --git a/libavcodec/h264_ps.c b/libavcodec/h264_ps.c
index f6cd1ca..943d953 100644
--- a/libavcodec/h264_ps.c
+++ b/libavcodec/h264_ps.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... parameter set decoding
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,26 +38,6 @@
 #define MAX_LOG2_MAX_FRAME_NUM    (12 + 4)
 #define MIN_LOG2_MAX_FRAME_NUM    4
 
-static const AVRational pixel_aspect[17] = {
-    {   0,  1 },
-    {   1,  1 },
-    {  12, 11 },
-    {  10, 11 },
-    {  16, 11 },
-    {  40, 33 },
-    {  24, 11 },
-    {  20, 11 },
-    {  32, 11 },
-    {  80, 33 },
-    {  18, 11 },
-    {  15, 11 },
-    {  64, 33 },
-    { 160, 99 },
-    {   4,  3 },
-    {   3,  2 },
-    {   2,  1 },
-};
-
 static const uint8_t default_scaling4[2][16] = {
     {  6, 13, 20, 28, 13, 20, 28, 32,
       20, 28, 32, 37, 28, 32, 37, 42 },
@@ -106,23 +86,20 @@ static const int level_max_dpb_mbs[][2] = {
 
 static void remove_pps(H264ParamSets *s, int id)
 {
-    if (s->pps_list[id] && s->pps == (const PPS*)s->pps_list[id]->data)
-        s->pps = NULL;
     av_buffer_unref(&s->pps_list[id]);
 }
 
 static void remove_sps(H264ParamSets *s, int id)
 {
+#if 0
     int i;
     if (s->sps_list[id]) {
-        if (s->sps == (SPS*)s->sps_list[id]->data)
-            s->sps = NULL;
-
         /* drop all PPS that depend on this SPS */
         for (i = 0; i < FF_ARRAY_ELEMS(s->pps_list); i++)
             if (s->pps_list[i] && ((PPS*)s->pps_list[i]->data)->sps_id == id)
                 remove_pps(s, i);
     }
+#endif
     av_buffer_unref(&s->sps_list[id]);
 }
 
@@ -165,8 +142,8 @@ static inline int decode_vui_parameters(GetBitContext *gb, AVCodecContext *avctx
         if (aspect_ratio_idc == EXTENDED_SAR) {
             sps->sar.num = get_bits(gb, 16);
             sps->sar.den = get_bits(gb, 16);
-        } else if (aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)) {
-            sps->sar = pixel_aspect[aspect_ratio_idc];
+        } else if (aspect_ratio_idc < FF_ARRAY_ELEMS(ff_h264_pixel_aspect)) {
+            sps->sar = ff_h264_pixel_aspect[aspect_ratio_idc];
         } else {
             av_log(avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
             return AVERROR_INVALIDDATA;
@@ -205,15 +182,23 @@ static inline int decode_vui_parameters(GetBitContext *gb, AVCodecContext *avctx
         get_ue_golomb(gb);  /* chroma_sample_location_type_bottom_field */
     }
 
+    if (show_bits1(gb) && get_bits_left(gb) < 10) {
+        av_log(avctx, AV_LOG_WARNING, "Truncated VUI\n");
+        return 0;
+    }
+
     sps->timing_info_present_flag = get_bits1(gb);
     if (sps->timing_info_present_flag) {
-        sps->num_units_in_tick = get_bits_long(gb, 32);
-        sps->time_scale        = get_bits_long(gb, 32);
-        if (!sps->num_units_in_tick || !sps->time_scale) {
+        unsigned num_units_in_tick = get_bits_long(gb, 32);
+        unsigned time_scale        = get_bits_long(gb, 32);
+        if (!num_units_in_tick || !time_scale) {
             av_log(avctx, AV_LOG_ERROR,
-                   "time_scale/num_units_in_tick invalid or unsupported (%"PRIu32"/%"PRIu32")\n",
-                   sps->time_scale, sps->num_units_in_tick);
-            return AVERROR_INVALIDDATA;
+                   "time_scale/num_units_in_tick invalid or unsupported (%u/%u)\n",
+                   time_scale, num_units_in_tick);
+            sps->timing_info_present_flag = 0;
+        } else {
+            sps->num_units_in_tick = num_units_in_tick;
+            sps->time_scale = time_scale;
         }
         sps->fixed_frame_rate_flag = get_bits1(gb);
     }
@@ -230,7 +215,8 @@ static inline int decode_vui_parameters(GetBitContext *gb, AVCodecContext *avctx
         sps->vcl_hrd_parameters_present_flag)
         get_bits1(gb);     /* low_delay_hrd_flag */
     sps->pic_struct_present_flag = get_bits1(gb);
-
+    if (!get_bits_left(gb))
+        return 0;
     sps->bitstream_restriction_flag = get_bits1(gb);
     if (sps->bitstream_restriction_flag) {
         get_bits1(gb);     /* motion_vectors_over_pic_boundaries_flag */
@@ -255,11 +241,6 @@ static inline int decode_vui_parameters(GetBitContext *gb, AVCodecContext *avctx
             return AVERROR_INVALIDDATA;
         }
     }
-    if (get_bits_left(gb) < 0) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Overread VUI by %d bits\n", -get_bits_left(gb));
-        return AVERROR_INVALIDDATA;
-    }
 
     return 0;
 }
@@ -306,21 +287,36 @@ static void decode_scaling_matrices(GetBitContext *gb, SPS *sps,
         decode_scaling_list(gb, scaling_matrix4[5], 16, default_scaling4[1], scaling_matrix4[4]); // Inter, Cb
         if (is_sps || pps->transform_8x8_mode) {
             decode_scaling_list(gb, scaling_matrix8[0], 64, default_scaling8[0], fallback[2]); // Intra, Y
-            if (sps->chroma_format_idc == 3) {
-                decode_scaling_list(gb, scaling_matrix8[1], 64, default_scaling8[0], scaling_matrix8[0]); // Intra, Cr
-                decode_scaling_list(gb, scaling_matrix8[2], 64, default_scaling8[0], scaling_matrix8[1]); // Intra, Cb
-            }
             decode_scaling_list(gb, scaling_matrix8[3], 64, default_scaling8[1], fallback[3]); // Inter, Y
             if (sps->chroma_format_idc == 3) {
+                decode_scaling_list(gb, scaling_matrix8[1], 64, default_scaling8[0], scaling_matrix8[0]); // Intra, Cr
                 decode_scaling_list(gb, scaling_matrix8[4], 64, default_scaling8[1], scaling_matrix8[3]); // Inter, Cr
+                decode_scaling_list(gb, scaling_matrix8[2], 64, default_scaling8[0], scaling_matrix8[1]); // Intra, Cb
                 decode_scaling_list(gb, scaling_matrix8[5], 64, default_scaling8[1], scaling_matrix8[4]); // Inter, Cb
             }
         }
     }
 }
 
+void ff_h264_ps_uninit(H264ParamSets *ps)
+{
+    int i;
+    // Unreference every buffer held in ps, then clear its sps/pps pointers.
+    for (i = 0; i < MAX_SPS_COUNT; i++)
+        av_buffer_unref(&ps->sps_list[i]);
+
+    for (i = 0; i < MAX_PPS_COUNT; i++)
+        av_buffer_unref(&ps->pps_list[i]);
+
+    av_buffer_unref(&ps->sps_ref);
+    av_buffer_unref(&ps->pps_ref);
+
+    ps->pps = NULL; // NOTE(review): presumably pointed into pps_ref's data; confirm
+    ps->sps = NULL; // NOTE(review): presumably pointed into sps_ref's data; confirm
+}
+
 int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
-                                     H264ParamSets *ps)
+                                     H264ParamSets *ps, int ignore_truncation)
 {
     AVBufferRef *sps_buf;
     int profile_idc, level_idc, constraint_set_flags = 0;
@@ -328,6 +324,18 @@ int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
     int i, log2_max_frame_num_minus4;
     SPS *sps;
 
+    sps_buf = av_buffer_allocz(sizeof(*sps));
+    if (!sps_buf)
+        return AVERROR(ENOMEM);
+    sps = (SPS*)sps_buf->data;
+
+    sps->data_size = gb->buffer_end - gb->buffer;
+    if (sps->data_size > sizeof(sps->data)) {
+        av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized SPS\n");
+        sps->data_size = sizeof(sps->data);
+    }
+    memcpy(sps->data, gb->buffer, sps->data_size);
+
     profile_idc           = get_bits(gb, 8);
     constraint_set_flags |= get_bits1(gb) << 0;   // constraint_set0_flag
     constraint_set_flags |= get_bits1(gb) << 1;   // constraint_set1_flag
@@ -341,23 +349,20 @@ int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
 
     if (sps_id >= MAX_SPS_COUNT) {
         av_log(avctx, AV_LOG_ERROR, "sps_id %u out of range\n", sps_id);
-        return AVERROR_INVALIDDATA;
+        goto fail;
     }
 
-    sps_buf = av_buffer_allocz(sizeof(*sps));
-    if (!sps_buf)
-        return AVERROR(ENOMEM);
-    sps = (SPS*)sps_buf->data;
-
     sps->sps_id               = sps_id;
     sps->time_offset_length   = 24;
     sps->profile_idc          = profile_idc;
     sps->constraint_set_flags = constraint_set_flags;
     sps->level_idc            = level_idc;
+    sps->full_range           = -1;
 
     memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
     memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
     sps->scaling_matrix_present = 0;
+    sps->colorspace = 2; //AVCOL_SPC_UNSPECIFIED
 
     if (sps->profile_idc == 100 ||  // High profile
         sps->profile_idc == 110 ||  // High10 profile
@@ -371,12 +376,16 @@ int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
         sps->profile_idc == 138 ||  // Multiview Depth High profile (MVCD)
         sps->profile_idc == 144) {  // old High444 profile
         sps->chroma_format_idc = get_ue_golomb_31(gb);
-        if (sps->chroma_format_idc > 3) {
+        if (sps->chroma_format_idc > 3U) {
             avpriv_request_sample(avctx, "chroma_format_idc %u",
                                   sps->chroma_format_idc);
             goto fail;
         } else if (sps->chroma_format_idc == 3) {
             sps->residual_color_transform_flag = get_bits1(gb);
+            if (sps->residual_color_transform_flag) {
+                av_log(avctx, AV_LOG_ERROR, "separate color planes are not supported\n");
+                goto fail;
+            }
         }
         sps->bit_depth_luma   = get_ue_golomb(gb) + 8;
         sps->bit_depth_chroma = get_ue_golomb(gb) + 8;
@@ -385,6 +394,12 @@ int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
                                   "Different chroma and luma bit depth");
             goto fail;
         }
+        if (sps->bit_depth_luma   < 8 || sps->bit_depth_luma   > 14 ||
+            sps->bit_depth_chroma < 8 || sps->bit_depth_chroma > 14) {
+            av_log(avctx, AV_LOG_ERROR, "illegal bit depth value (%d, %d)\n",
+                   sps->bit_depth_luma, sps->bit_depth_chroma);
+            goto fail;
+        }
         sps->transform_bypass = get_bits1(gb);
         decode_scaling_matrices(gb, sps, NULL, 1,
                                 sps->scaling_matrix4, sps->scaling_matrix8);
@@ -407,7 +422,12 @@ int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
     sps->poc_type = get_ue_golomb_31(gb);
 
     if (sps->poc_type == 0) { // FIXME #define
-        sps->log2_max_poc_lsb = get_ue_golomb(gb) + 4;
+        unsigned t = get_ue_golomb(gb);
+        if (t>12) {
+            av_log(avctx, AV_LOG_ERROR, "log2_max_poc_lsb (%d) is out of range\n", t);
+            goto fail;
+        }
+        sps->log2_max_poc_lsb = t + 4;
     } else if (sps->poc_type == 1) { // FIXME #define
         sps->delta_pic_order_always_zero_flag = get_bits1(gb);
         sps->offset_for_non_ref_pic           = get_se_golomb(gb);
@@ -429,8 +449,10 @@ int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
     }
 
     sps->ref_frame_count = get_ue_golomb_31(gb);
+    if (avctx->codec_tag == MKTAG('S', 'M', 'V', '2'))
+        sps->ref_frame_count = FFMAX(2, sps->ref_frame_count);
     if (sps->ref_frame_count > H264_MAX_PICTURE_COUNT - 2 ||
-        sps->ref_frame_count >= 32U) {
+        sps->ref_frame_count > 16U) {
         av_log(avctx, AV_LOG_ERROR,
                "too many reference frames %d\n", sps->ref_frame_count);
         goto fail;
@@ -453,11 +475,6 @@ int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
         sps->mb_aff = 0;
 
     sps->direct_8x8_inference_flag = get_bits1(gb);
-    if (!sps->frame_mbs_only_flag && !sps->direct_8x8_inference_flag) {
-        av_log(avctx, AV_LOG_ERROR,
-               "This stream was generated by a broken encoder, invalid 8x8 inference\n");
-        goto fail;
-    }
 
 #ifndef ALLOW_INTERLACE
     if (sps->mb_aff)
@@ -470,6 +487,8 @@ int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
         unsigned int crop_right  = get_ue_golomb(gb);
         unsigned int crop_top    = get_ue_golomb(gb);
         unsigned int crop_bottom = get_ue_golomb(gb);
+        int width  = 16 * sps->mb_width;
+        int height = 16 * sps->mb_height * (2 - sps->frame_mbs_only_flag);
 
         if (avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) {
             av_log(avctx, AV_LOG_DEBUG, "discarding sps cropping, original "
@@ -496,16 +515,15 @@ int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
                        crop_left);
             }
 
-            if (INT_MAX / step_x             <= crop_left               ||
-                INT_MAX / step_x - crop_left <= crop_right              ||
-                16 * sps->mb_width <= step_x * (crop_left + crop_right) ||
-                INT_MAX / step_y             <= crop_top                ||
-                INT_MAX / step_y - crop_top  <= crop_bottom             ||
-                16 * sps->mb_height <= step_y * (crop_top + crop_bottom)) {
-                av_log(avctx, AV_LOG_WARNING, "Invalid crop parameters\n");
-                if (avctx->err_recognition & AV_EF_EXPLODE)
-                    goto fail;
-                crop_left = crop_right = crop_top = crop_bottom = 0;
+            if (crop_left  > (unsigned)INT_MAX / 4 / step_x ||
+                crop_right > (unsigned)INT_MAX / 4 / step_x ||
+                crop_top   > (unsigned)INT_MAX / 4 / step_y ||
+                crop_bottom> (unsigned)INT_MAX / 4 / step_y ||
+                (crop_left + crop_right ) * step_x >= width ||
+                (crop_top  + crop_bottom) * step_y >= height
+            ) {
+                av_log(avctx, AV_LOG_ERROR, "crop values invalid %d %d %d %d / %d %d\n", crop_left, crop_right, crop_top, crop_bottom, width, height);
+                goto fail;
             }
 
             sps->crop_left   = crop_left   * step_x;
@@ -524,7 +542,14 @@ int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
     sps->vui_parameters_present_flag = get_bits1(gb);
     if (sps->vui_parameters_present_flag) {
         int ret = decode_vui_parameters(gb, avctx, sps);
-        if (ret < 0 && avctx->err_recognition & AV_EF_EXPLODE)
+        if (ret < 0)
+            goto fail;
+    }
+
+    if (get_bits_left(gb) < 0) {
+        av_log(avctx, ignore_truncation ? AV_LOG_WARNING : AV_LOG_ERROR,
+               "Overread %s by %d bits\n", sps->vui_parameters_present_flag ? "VUI" : "SPS", -get_bits_left(gb));
+        if (!ignore_truncation)
             goto fail;
     }
 
@@ -547,7 +572,7 @@ int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
     if (avctx->debug & FF_DEBUG_PICT_INFO) {
         static const char csp[4][5] = { "Gray", "420", "422", "444" };
         av_log(avctx, AV_LOG_DEBUG,
-               "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%u/%u/%u/%u %s %s %"PRId32"/%"PRId32"\n",
+               "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%u/%u/%u/%u %s %s %"PRId32"/%"PRId32" b%d reo:%d\n",
                sps_id, sps->profile_idc, sps->level_idc,
                sps->poc_type,
                sps->ref_frame_count,
@@ -559,7 +584,10 @@ int ff_h264_decode_seq_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
                sps->vui_parameters_present_flag ? "VUI" : "",
                csp[sps->chroma_format_idc],
                sps->timing_info_present_flag ? sps->num_units_in_tick : 0,
-               sps->timing_info_present_flag ? sps->time_scale : 0);
+               sps->timing_info_present_flag ? sps->time_scale : 0,
+               sps->bit_depth_luma,
+               sps->bitstream_restriction_flag ? sps->num_reorder_frames : -1
+               );
     }
 
     /* check if this is a repeat of an already parsed SPS, then keep the
@@ -637,6 +665,8 @@ static void init_dequant_tables(PPS *pps, const SPS *sps)
 {
     int i, x;
     init_dequant4_coeff_table(pps, sps);
+    memset(pps->dequant8_coeff, 0, sizeof(pps->dequant8_coeff));
+
     if (pps->transform_8x8_mode)
         init_dequant8_coeff_table(pps, sps);
     if (sps->transform_bypass) {
@@ -659,6 +689,20 @@ static void build_qp_table(PPS *pps, int t, int index, const int depth)
             ff_h264_chroma_qp[depth - 8][av_clip(i + index, 0, max_qp)];
 }
 
+/* 0 when profile_idc is 66/77/88 and constraint_set_flags & 7 is set, else 1. */
+static int more_rbsp_data_in_pps(const SPS *sps, void *logctx)
+{
+    const int idc = sps->profile_idc;
+
+    if (idc != 66 && idc != 77 && idc != 88)
+        return 1;
+    if (!(sps->constraint_set_flags & 7))
+        return 1;
+    av_log(logctx, AV_LOG_VERBOSE,
+           "Current profile doesn't provide more RBSP data in PPS, skipping\n");
+    return 0;
+}
+
 int ff_h264_decode_picture_parameter_set(GetBitContext *gb, AVCodecContext *avctx,
                                          H264ParamSets *ps, int bit_length)
 {
@@ -680,6 +724,13 @@ int ff_h264_decode_picture_parameter_set(GetBitContext *gb, AVCodecContext *avct
         return AVERROR(ENOMEM);
     pps = (PPS*)pps_buf->data;
 
+    pps->data_size = gb->buffer_end - gb->buffer;
+    if (pps->data_size > sizeof(pps->data)) {
+        av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized PPS\n");
+        pps->data_size = sizeof(pps->data);
+    }
+    memcpy(pps->data, gb->buffer, pps->data_size);
+
     pps->sps_id = get_ue_golomb_31(gb);
     if ((unsigned)pps->sps_id >= MAX_SPS_COUNT ||
         !ps->sps_list[pps->sps_id]) {
@@ -688,10 +739,15 @@ int ff_h264_decode_picture_parameter_set(GetBitContext *gb, AVCodecContext *avct
         goto fail;
     }
     sps = (SPS*)ps->sps_list[pps->sps_id]->data;
-
-    if (sps->bit_depth_luma > 10) {
+    if (sps->bit_depth_luma > 14) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Invalid luma bit depth=%d\n",
+               sps->bit_depth_luma);
+        ret = AVERROR_INVALIDDATA;
+        goto fail;
+    } else if (sps->bit_depth_luma == 11 || sps->bit_depth_luma == 13) {
         av_log(avctx, AV_LOG_ERROR,
-               "Unimplemented luma bit depth=%d (max=10)\n",
+               "Unimplemented luma bit depth=%d\n",
                sps->bit_depth_luma);
         ret = AVERROR_PATCHWELCOME;
         goto fail;
@@ -761,8 +817,7 @@ int ff_h264_decode_picture_parameter_set(GetBitContext *gb, AVCodecContext *avct
            sizeof(pps->scaling_matrix8));
 
     bits_left = bit_length - get_bits_count(gb);
-    if (bits_left && (bits_left > 8 ||
-                      show_bits(gb, bits_left) != 1 << (bits_left - 1))) {
+    if (bits_left > 0 && more_rbsp_data_in_pps(sps, avctx)) {
         pps->transform_8x8_mode = get_bits1(gb);
         decode_scaling_matrices(gb, sps, pps, 0,
                                 pps->scaling_matrix4, pps->scaling_matrix8);
diff --git a/libavcodec/h264_refs.c b/libavcodec/h264_refs.c
index 427930c..2f80de1 100644
--- a/libavcodec/h264_refs.c
+++ b/libavcodec/h264_refs.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... reference picture handling
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
 
 #include <inttypes.h>
 
+#include "libavutil/avassert.h"
 #include "internal.h"
 #include "avcodec.h"
 #include "h264.h"
@@ -79,16 +80,18 @@ static int build_def_list(H264Ref *def, int def_len,
     int  i[2] = { 0 };
     int index = 0;
 
-    while ((i[0] < len || i[1] < len) && index < def_len) {
+    while (i[0] < len || i[1] < len) {
         while (i[0] < len && !(in[i[0]] && (in[i[0]]->reference & sel)))
             i[0]++;
         while (i[1] < len && !(in[i[1]] && (in[i[1]]->reference & (sel ^ 3))))
             i[1]++;
-        if (i[0] < len && index < def_len) {
+        if (i[0] < len) {
+            av_assert0(index < def_len);
             in[i[0]]->pic_id = is_long ? i[0] : in[i[0]]->frame_num;
             split_field_copy(&def[index++], in[i[0]++], sel, 1);
         }
-        if (i[1] < len && index < def_len) {
+        if (i[1] < len) {
+            av_assert0(index < def_len);
             in[i[1]]->pic_id = is_long ? i[1] : in[i[1]]->frame_num;
             split_field_copy(&def[index++], in[i[1]++], sel ^ 3, 0);
         }
@@ -120,9 +123,18 @@ static int add_sorted(H264Picture **sorted, H264Picture * const *src,
     return out_i;
 }
 
-static void h264_initialise_ref_list(const H264Context *h, H264SliceContext *sl)
+/* Return 1 if pic's width, height or format differ from the current picture. */
+static int mismatches_ref(const H264Context *h, const H264Picture *pic)
+{
+    const AVFrame *cur = h->cur_pic_ptr->f, *ref = pic->f;
+    return cur->width != ref->width || cur->height != ref->height ||
+           cur->format != ref->format;
+}
+
+static void h264_initialise_ref_list(H264Context *h, H264SliceContext *sl)
 {
     int i, len;
+    int j;
 
     if (sl->slice_type_nos == AV_PICTURE_TYPE_B) {
         H264Picture *sorted[32];
@@ -137,13 +149,14 @@ static void h264_initialise_ref_list(const H264Context *h, H264SliceContext *sl)
         for (list = 0; list < 2; list++) {
             len  = add_sorted(sorted,       h->short_ref, h->short_ref_count, cur_poc, 1 ^ list);
             len += add_sorted(sorted + len, h->short_ref, h->short_ref_count, cur_poc, 0 ^ list);
-            assert(len <= 32);
+            av_assert0(len <= 32);
 
             len  = build_def_list(sl->ref_list[list], FF_ARRAY_ELEMS(sl->ref_list[0]),
                                   sorted, len, 0, h->picture_structure);
             len += build_def_list(sl->ref_list[list] + len,
                                   FF_ARRAY_ELEMS(sl->ref_list[0]) - len,
                                   h->long_ref, 16, 1, h->picture_structure);
+            av_assert0(len <= 32);
 
             if (len < sl->ref_count[list])
                 memset(&sl->ref_list[list][len], 0, sizeof(H264Ref) * (sl->ref_count[list] - len));
@@ -164,10 +177,40 @@ static void h264_initialise_ref_list(const H264Context *h, H264SliceContext *sl)
         len += build_def_list(sl->ref_list[0] + len,
                               FF_ARRAY_ELEMS(sl->ref_list[0]) - len,
                               h-> long_ref, 16, 1, h->picture_structure);
+        av_assert0(len <= 32);
 
         if (len < sl->ref_count[0])
             memset(&sl->ref_list[0][len], 0, sizeof(H264Ref) * (sl->ref_count[0] - len));
     }
+#ifdef TRACE
+    for (i = 0; i < sl->ref_count[0]; i++) {
+        ff_tlog(h->avctx, "List0: %s fn:%d 0x%p\n",
+                (sl->ref_list[0][i].parent ? (sl->ref_list[0][i].parent->long_ref ? "LT" : "ST") : "??"),
+                sl->ref_list[0][i].pic_id,
+                sl->ref_list[0][i].data[0]);
+    }
+    if (sl->slice_type_nos == AV_PICTURE_TYPE_B) {
+        for (i = 0; i < sl->ref_count[1]; i++) {
+            ff_tlog(h->avctx, "List1: %s fn:%d 0x%p\n",
+                    (sl->ref_list[1][i].parent ? (sl->ref_list[1][i].parent->long_ref ? "LT" : "ST") : "??"),
+                    sl->ref_list[1][i].pic_id,
+                    sl->ref_list[1][i].data[0]);
+        }
+    }
+#endif
+
+    for (j = 0; j<1+(sl->slice_type_nos == AV_PICTURE_TYPE_B); j++) {
+        for (i = 0; i < sl->ref_count[j]; i++) {
+            if (sl->ref_list[j][i].parent) {
+                if (mismatches_ref(h, sl->ref_list[j][i].parent)) {
+                    av_log(h->avctx, AV_LOG_ERROR, "Discarding mismatching reference\n");
+                    memset(&sl->ref_list[j][i], 0, sizeof(sl->ref_list[j][i]));
+                }
+            }
+        }
+    }
+    for (i = 0; i < sl->list_count; i++)
+        h->default_ref[i] = sl->ref_list[i][0];
 }
 
 /**
@@ -227,7 +270,7 @@ static int pic_num_extract(const H264Context *h, int pic_num, int *structure)
     return pic_num;
 }
 
-int ff_h264_decode_ref_pic_list_reordering(const H264Context *h, H264SliceContext *sl)
+int ff_h264_decode_ref_pic_list_reordering(H264Context *h, H264SliceContext *sl)
 {
     int list, index, pic_structure;
 
@@ -257,7 +300,7 @@ int ff_h264_decode_ref_pic_list_reordering(const H264Context *h, H264SliceContex
                 switch (modification_of_pic_nums_idc) {
                 case 0:
                 case 1: {
-                    const unsigned int abs_diff_pic_num = get_ue_golomb(&sl->gb) + 1;
+                    const unsigned int abs_diff_pic_num = get_ue_golomb_long(&sl->gb) + 1;
                     int frame_num;
 
                     if (abs_diff_pic_num > h->max_pic_num) {
@@ -292,14 +335,14 @@ int ff_h264_decode_ref_pic_list_reordering(const H264Context *h, H264SliceContex
 
                     long_idx = pic_num_extract(h, pic_id, &pic_structure);
 
-                    if (long_idx > 31) {
+                    if (long_idx > 31U) {
                         av_log(h->avctx, AV_LOG_ERROR,
                                "long_term_pic_idx overflow\n");
                         return AVERROR_INVALIDDATA;
                     }
                     ref = h->long_ref[long_idx];
                     assert(!(ref && !ref->reference));
-                    if (ref && (ref->reference & pic_structure)) {
+                    if (ref && (ref->reference & pic_structure) && !mismatches_ref(h, ref)) {
                         ref->pic_id = pic_id;
                         assert(ref->long_ref);
                         i = 0;
@@ -339,13 +382,19 @@ int ff_h264_decode_ref_pic_list_reordering(const H264Context *h, H264SliceContex
     }
     for (list = 0; list < sl->list_count; list++) {
         for (index = 0; index < sl->ref_count[list]; index++) {
-            if (!sl->ref_list[list][index].parent) {
-                av_log(h->avctx, AV_LOG_ERROR, "Missing reference picture\n");
-                if (index == 0 || h->avctx->err_recognition & AV_EF_EXPLODE)
-                    return AVERROR_INVALIDDATA;
+            if (   !sl->ref_list[list][index].parent
+                || (!FIELD_PICTURE(h) && (sl->ref_list[list][index].reference&3) != 3)) {
+                int i;
+                av_log(h->avctx, AV_LOG_ERROR, "Missing reference picture, default is %d\n", h->default_ref[list].poc);
+                for (i = 0; i < FF_ARRAY_ELEMS(h->last_pocs); i++)
+                    h->last_pocs[i] = INT_MIN;
+                if (h->default_ref[list].parent
+                    && !(!FIELD_PICTURE(h) && (h->default_ref[list].reference&3) != 3))
+                    sl->ref_list[list][index] = h->default_ref[list];
                 else
-                    sl->ref_list[list][index] = sl->ref_list[list][index - 1];
+                    return -1;
             }
+            av_assert0(av_buffer_get_ref_count(sl->ref_list[list][index].parent->f->buf[0]) > 0);
         }
     }
 
@@ -355,7 +404,7 @@ int ff_h264_decode_ref_pic_list_reordering(const H264Context *h, H264SliceContex
 void ff_h264_fill_mbaff_ref_list(H264SliceContext *sl)
 {
     int list, i, j;
-    for (list = 0; list < sl->list_count; list++) { //FIXME try list_count
+    for (list = 0; list < sl->list_count; list++) {
         for (i = 0; i < sl->ref_count[list]; i++) {
             H264Ref *frame = &sl->ref_list[list][i];
             H264Ref *field = &sl->ref_list[list][16 + 2 * i];
@@ -500,11 +549,24 @@ void ff_h264_remove_all_refs(H264Context *h)
     }
     assert(h->long_ref_count == 0);
 
+    if (h->short_ref_count && !h->last_pic_for_ec.f->data[0]) {
+        ff_h264_unref_picture(h, &h->last_pic_for_ec);
+        if (h->short_ref[0]->f->buf[0])
+            ff_h264_ref_picture(h, &h->last_pic_for_ec, h->short_ref[0]);
+    }
+
     for (i = 0; i < h->short_ref_count; i++) {
         unreference_pic(h, h->short_ref[i], 0);
         h->short_ref[i] = NULL;
     }
     h->short_ref_count = 0;
+
+    memset(h->default_ref, 0, sizeof(h->default_ref));
+    for (i = 0; i < h->nb_slice_ctx; i++) {
+        H264SliceContext *sl = &h->slice_ctx[i];
+        sl->list_count = sl->ref_count[0] = sl->ref_count[1] = 0;
+        memset(sl->ref_list, 0, sizeof(sl->ref_list));
+    }
 }
 
 static int check_opcodes(MMCO *mmco1, MMCO *mmco2, int n_mmcos)
@@ -512,8 +574,11 @@ static int check_opcodes(MMCO *mmco1, MMCO *mmco2, int n_mmcos)
     int i;
 
     for (i = 0; i < n_mmcos; i++) {
-        if (mmco1[i].opcode != mmco2[i].opcode)
+        if (mmco1[i].opcode != mmco2[i].opcode) {
+            av_log(NULL, AV_LOG_ERROR, "MMCO opcode [%d, %d] at %d mismatches between slices\n",
+                   mmco1[i].opcode, mmco2[i].opcode, i);
             return -1;
+        }
     }
 
     return 0;
@@ -524,10 +589,8 @@ int ff_generate_sliding_window_mmcos(H264Context *h, int first_slice)
     MMCO mmco_temp[MAX_MMCO_COUNT], *mmco = first_slice ? h->mmco : mmco_temp;
     int mmco_index = 0, i = 0;
 
-    assert(h->long_ref_count + h->short_ref_count <= h->ps.sps->ref_frame_count);
-
     if (h->short_ref_count &&
-        h->long_ref_count + h->short_ref_count == h->ps.sps->ref_frame_count &&
+        h->long_ref_count + h->short_ref_count >= h->ps.sps->ref_frame_count &&
         !(FIELD_PICTURE(h) && !h->first_field && h->cur_pic_ptr->reference)) {
         mmco[0].opcode        = MMCO_SHORT2UNUSED;
         mmco[0].short_pic_num = h->short_ref[h->short_ref_count - 1]->frame_num;
@@ -546,8 +609,8 @@ int ff_generate_sliding_window_mmcos(H264Context *h, int first_slice)
                (mmco_index != h->mmco_index ||
                 (i = check_opcodes(h->mmco, mmco_temp, mmco_index)))) {
         av_log(h->avctx, AV_LOG_ERROR,
-               "Inconsistent MMCO state between slices [%d, %d, %d]\n",
-               mmco_index, h->mmco_index, i);
+               "Inconsistent MMCO state between slices [%d, %d]\n",
+               mmco_index, h->mmco_index);
         return AVERROR_INVALIDDATA;
     }
     return 0;
@@ -556,6 +619,7 @@ int ff_generate_sliding_window_mmcos(H264Context *h, int first_slice)
 int ff_h264_execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count)
 {
     int i, av_uninit(j);
+    int pps_ref_count[2] = {0};
     int current_ref_assigned = 0, err = 0;
     H264Picture *av_uninit(pic);
 
@@ -576,7 +640,7 @@ int ff_h264_execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count)
                 if (mmco[i].opcode != MMCO_SHORT2LONG ||
                     !h->long_ref[mmco[i].long_arg]    ||
                     h->long_ref[mmco[i].long_arg]->frame_num != frame_num) {
-                    av_log(h->avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
+                    av_log(h->avctx, h->short_ref_count ? AV_LOG_ERROR : AV_LOG_DEBUG, "mmco: unref short failure\n");
                     err = AVERROR_INVALIDDATA;
                 }
                 continue;
@@ -617,19 +681,24 @@ int ff_h264_execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count)
                      * Report the problem and keep the pair where it is,
                      * and mark this field valid.
                      */
-            if (h->short_ref[0] == h->cur_pic_ptr)
+            if (h->short_ref[0] == h->cur_pic_ptr) {
+                av_log(h->avctx, AV_LOG_ERROR, "mmco: cannot assign current picture to short and long at the same time\n");
                 remove_short_at_index(h, 0);
+            }
 
             /* make sure the current picture is not already assigned as a long ref */
             if (h->cur_pic_ptr->long_ref) {
                 for (j = 0; j < FF_ARRAY_ELEMS(h->long_ref); j++) {
-                    if (h->long_ref[j] == h->cur_pic_ptr)
+                    if (h->long_ref[j] == h->cur_pic_ptr) {
+                        if (j != mmco[i].long_arg)
+                            av_log(h->avctx, AV_LOG_ERROR, "mmco: cannot assign current picture to 2 long term references\n");
                         remove_long(h, j, 0);
+                    }
                 }
             }
 
-
             if (h->long_ref[mmco[i].long_arg] != h->cur_pic_ptr) {
+                av_assert0(!h->cur_pic_ptr->long_ref);
                 remove_long(h, mmco[i].long_arg, 0);
 
                 h->long_ref[mmco[i].long_arg]           = h->cur_pic_ptr;
@@ -657,6 +726,8 @@ int ff_h264_execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count)
             h->poc.frame_num = h->cur_pic_ptr->frame_num = 0;
             h->mmco_reset = 1;
             h->cur_pic_ptr->mmco_reset = 1;
+            for (j = 0; j < MAX_DELAYED_PIC_COUNT; j++)
+                h->last_pocs[j] = INT_MIN;
             break;
         default: assert(0);
         }
@@ -671,7 +742,7 @@ int ff_h264_execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count)
          */
         if (h->short_ref_count && h->short_ref[0] == h->cur_pic_ptr) {
             /* Just mark the second field valid */
-            h->cur_pic_ptr->reference = PICT_FRAME;
+            h->cur_pic_ptr->reference |= h->picture_structure;
         } else if (h->cur_pic_ptr->long_ref) {
             av_log(h->avctx, AV_LOG_ERROR, "illegal short term reference "
                                            "assignment for second field "
@@ -695,8 +766,7 @@ int ff_h264_execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count)
         }
     }
 
-    if (h->long_ref_count + h->short_ref_count -
-        (h->short_ref[0] == h->cur_pic_ptr) > h->ps.sps->ref_frame_count) {
+    if (h->long_ref_count + h->short_ref_count > FFMAX(h->ps.sps->ref_frame_count, 1)) {
 
         /* We have too many reference frames, probably due to corrupted
          * stream. Need to discard one frame. Prevents overrun of the
@@ -721,8 +791,37 @@ int ff_h264_execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count)
         }
     }
 
+    for (i = 0; i<h->short_ref_count; i++) {
+        pic = h->short_ref[i];
+        if (pic->invalid_gap) {
+            int d = av_mod_uintp2(h->cur_pic_ptr->frame_num - pic->frame_num, h->ps.sps->log2_max_frame_num);
+            if (d > h->ps.sps->ref_frame_count)
+                remove_short(h, pic->frame_num, 0);
+        }
+    }
+
     print_short_term(h);
     print_long_term(h);
+
+    for (i = 0; i < FF_ARRAY_ELEMS(h->ps.pps_list); i++) {
+        if (h->ps.pps_list[i]) {
+            const PPS *pps = (const PPS *)h->ps.pps_list[i]->data;
+            pps_ref_count[0] = FFMAX(pps_ref_count[0], pps->ref_count[0]);
+            pps_ref_count[1] = FFMAX(pps_ref_count[1], pps->ref_count[1]);
+        }
+    }
+
+    if (   err >= 0
+        && h->long_ref_count==0
+        && (   h->short_ref_count<=2
+            || pps_ref_count[0] <= 1 + (h->picture_structure != PICT_FRAME) && pps_ref_count[1] <= 1)
+        && pps_ref_count[0]<=2 + (h->picture_structure != PICT_FRAME) + (2*!h->has_recovery_point)
+        && h->cur_pic_ptr->f->pict_type == AV_PICTURE_TYPE_I){
+        h->cur_pic_ptr->recovered |= 1;
+        if(!h->avctx->has_b_frames)
+            h->frame_recovered |= FRAME_RECOVERED_SEI;
+    }
+
     return (h->avctx->err_recognition & AV_EF_EXPLODE) ? err : 0;
 }
 
@@ -730,7 +829,7 @@ int ff_h264_decode_ref_pic_marking(H264Context *h, GetBitContext *gb,
                                    int first_slice)
 {
     int i, ret;
-    MMCO mmco_temp[MAX_MMCO_COUNT], *mmco = first_slice ? h->mmco : mmco_temp;
+    MMCO mmco_temp[MAX_MMCO_COUNT], *mmco = mmco_temp;
     int mmco_index = 0;
 
     if (h->nal_unit_type == NAL_IDR_SLICE) { // FIXME fields
@@ -748,7 +847,7 @@ int ff_h264_decode_ref_pic_marking(H264Context *h, GetBitContext *gb,
                 mmco[i].opcode = opcode;
                 if (opcode == MMCO_SHORT2UNUSED || opcode == MMCO_SHORT2LONG) {
                     mmco[i].short_pic_num =
-                        (h->curr_pic_num - get_ue_golomb(gb) - 1) &
+                        (h->curr_pic_num - get_ue_golomb_long(gb) - 1) &
                             (h->max_pic_num - 1);
 #if 0
                     if (mmco[i].short_pic_num >= h->short_ref_count ||
@@ -796,6 +895,7 @@ int ff_h264_decode_ref_pic_marking(H264Context *h, GetBitContext *gb,
     }
 
     if (first_slice && mmco_index != -1) {
+        memcpy(h->mmco, mmco_temp, sizeof(h->mmco));
         h->mmco_index = mmco_index;
     } else if (!first_slice && mmco_index >= 0 &&
                (mmco_index != h->mmco_index ||
diff --git a/libavcodec/h264_sei.c b/libavcodec/h264_sei.c
index 0e3952d..62561fb 100644
--- a/libavcodec/h264_sei.c
+++ b/libavcodec/h264_sei.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... SEI decoding
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,7 +38,6 @@ static const uint8_t sei_num_clock_ts_table[9] = {
 
 void ff_h264_sei_uninit(H264SEIContext *h)
 {
-    h->unregistered.x264_build           = -1;
     h->recovery_point.recovery_frame_cnt = -1;
 
     h->picture_timing.dpb_output_delay  = 0;
@@ -54,15 +53,24 @@ void ff_h264_sei_uninit(H264SEIContext *h)
 }
 
 static int decode_picture_timing(H264SEIPictureTiming *h, GetBitContext *gb,
-                                 const SPS *sps, void *logctx)
+                                 const H264ParamSets *ps, void *logctx)
 {
-    if (!sps)
-        return AVERROR_INVALIDDATA;
+    int i;
+    const SPS *sps = ps->sps;
+
+    for (i = 0; i<MAX_SPS_COUNT; i++)
+        if ((!sps || !sps->log2_max_frame_num) && ps->sps_list[i])
+            sps = (const SPS *)ps->sps_list[i]->data;
+
+    if (!sps) {
+        av_log(logctx, AV_LOG_ERROR, "SPS unavailable in decode_picture_timing\n");
+        return 0;
+    }
 
     if (sps->nal_hrd_parameters_present_flag ||
         sps->vcl_hrd_parameters_present_flag) {
-        h->cpb_removal_delay = get_bits(gb, sps->cpb_removal_delay_length);
-        h->dpb_output_delay  = get_bits(gb, sps->dpb_output_delay_length);
+        h->cpb_removal_delay = get_bits_long(gb, sps->cpb_removal_delay_length);
+        h->dpb_output_delay  = get_bits_long(gb, sps->dpb_output_delay_length);
     }
     if (sps->pic_struct_present_flag) {
         unsigned int i, num_clock_ts;
@@ -179,8 +187,6 @@ static int decode_registered_user_data_closed_caption(H264SEIA53Caption *h,
         }
     } else {
         int i;
-        avpriv_request_sample(logctx, "Subtitles with data type 0x%02x",
-                              user_data_type_code);
         for (i = 0; i < size - 1; i++)
             skip_bits(gb, 8);
     }
@@ -243,6 +249,8 @@ static int decode_unregistered_user_data(H264SEIUnregistered *h, GetBitContext *
     e = sscanf(user_data + 16, "x264 - core %d", &build);
     if (e == 1 && build > 0)
         h->x264_build = build;
+    if (e == 1 && build == 1 && !strncmp(user_data+16, "x264 - core 0000", 16))
+        h->x264_build = 67;
 
     if (strlen(user_data + 16) > 0)
         av_log(logctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data + 16);
@@ -253,7 +261,7 @@ static int decode_unregistered_user_data(H264SEIUnregistered *h, GetBitContext *
 
 static int decode_recovery_point(H264SEIRecoveryPoint *h, GetBitContext *gb)
 {
-    h->recovery_frame_cnt = get_ue_golomb(gb);
+    h->recovery_frame_cnt = get_ue_golomb_long(gb);
 
     /* 1b exact_match_flag,
      * 1b broken_link_flag,
@@ -282,7 +290,7 @@ static int decode_buffering_period(H264SEIBufferingPeriod *h, GetBitContext *gb,
     if (sps->nal_hrd_parameters_present_flag) {
         for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
             h->initial_cpb_removal_delay[sched_sel_idx] =
-                get_bits(gb, sps->initial_cpb_removal_delay_length);
+                get_bits_long(gb, sps->initial_cpb_removal_delay_length);
             // initial_cpb_removal_delay_offset
             skip_bits(gb, sps->initial_cpb_removal_delay_length);
         }
@@ -290,7 +298,7 @@ static int decode_buffering_period(H264SEIBufferingPeriod *h, GetBitContext *gb,
     if (sps->vcl_hrd_parameters_present_flag) {
         for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
             h->initial_cpb_removal_delay[sched_sel_idx] =
-                get_bits(gb, sps->initial_cpb_removal_delay_length);
+                get_bits_long(gb, sps->initial_cpb_removal_delay_length);
             // initial_cpb_removal_delay_offset
             skip_bits(gb, sps->initial_cpb_removal_delay_length);
         }
@@ -303,12 +311,13 @@ static int decode_buffering_period(H264SEIBufferingPeriod *h, GetBitContext *gb,
 static int decode_frame_packing_arrangement(H264SEIFramePacking *h,
                                             GetBitContext *gb)
 {
-    get_ue_golomb(gb);              // frame_packing_arrangement_id
-    h->present = !get_bits1(gb);
+    h->frame_packing_arrangement_id          = get_ue_golomb_long(gb);
+    h->frame_packing_arrangement_cancel_flag = get_bits1(gb);
+    h->present = !h->frame_packing_arrangement_cancel_flag;
 
     if (h->present) {
-        h->arrangement_type = get_bits(gb, 7);
-        h->quincunx_subsampling           = get_bits1(gb);
+        h->frame_packing_arrangement_type = get_bits(gb, 7);
+        h->quincunx_sampling_flag         = get_bits1(gb);
         h->content_interpretation_type    = get_bits(gb, 6);
 
         // the following skips: spatial_flipping_flag, frame0_flipped_flag,
@@ -316,10 +325,10 @@ static int decode_frame_packing_arrangement(H264SEIFramePacking *h,
         // frame0_self_contained_flag, frame1_self_contained_flag
         skip_bits(gb, 6);
 
-        if (!h->quincunx_subsampling && h->arrangement_type != 5)
+        if (!h->quincunx_sampling_flag && h->frame_packing_arrangement_type != 5)
             skip_bits(gb, 16);      // frame[01]_grid_position_[xy]
         skip_bits(gb, 8);           // frame_packing_arrangement_reserved_byte
-        get_ue_golomb(gb);          // frame_packing_arrangement_repetition_period
+        h->frame_packing_arrangement_repetition_period = get_ue_golomb_long(gb);
     }
     skip_bits1(gb);                 // frame_packing_arrangement_extension_flag
 
@@ -336,8 +345,33 @@ static int decode_display_orientation(H264SEIDisplayOrientation *h,
         h->vflip = get_bits1(gb);     // ver_flip
 
         h->anticlockwise_rotation = get_bits(gb, 16);
-        get_ue_golomb(gb);  // display_orientation_repetition_period
-        skip_bits1(gb);     // display_orientation_extension_flag
+        get_ue_golomb_long(gb);       // display_orientation_repetition_period
+        skip_bits1(gb);               // display_orientation_extension_flag
+    }
+
+    return 0;
+}
+
+static int decode_green_metadata(H264SEIGreenMetaData *h, GetBitContext *gb)
+{
+    h->green_metadata_type = get_bits(gb, 8);
+
+    if (h->green_metadata_type == 0) {
+        h->period_type = get_bits(gb, 8);
+
+        if (h->period_type == 2)
+            h->num_seconds = get_bits(gb, 16);
+        else if (h->period_type == 3)
+            h->num_pictures = get_bits(gb, 16);
+
+        h->percent_non_zero_macroblocks            = get_bits(gb, 8);
+        h->percent_intra_coded_macroblocks         = get_bits(gb, 8);
+        h->percent_six_tap_filtering               = get_bits(gb, 8);
+        h->percent_alpha_point_deblocking_instance = get_bits(gb, 8);
+
+    } else if (h->green_metadata_type == 1) {
+        h->xsd_metric_type  = get_bits(gb, 8);
+        h->xsd_metric_value = get_bits(gb, 16);
     }
 
     return 0;
@@ -346,34 +380,34 @@ static int decode_display_orientation(H264SEIDisplayOrientation *h,
 int ff_h264_sei_decode(H264SEIContext *h, GetBitContext *gb,
                        const H264ParamSets *ps, void *logctx)
 {
-    while (get_bits_left(gb) > 16) {
-        int size = 0;
+    while (get_bits_left(gb) > 16 && show_bits(gb, 16)) {
         int type = 0;
+        unsigned size = 0;
+        unsigned next;
         int ret  = 0;
-        int last = 0;
 
-        while (get_bits_left(gb) >= 8 &&
-               (last = get_bits(gb, 8)) == 255) {
-            type += 255;
-        }
-        type += last;
+        do {
+            if (get_bits_left(gb) < 8)
+                return AVERROR_INVALIDDATA;
+            type += show_bits(gb, 8);
+        } while (get_bits(gb, 8) == 255);
 
-        last = 0;
-        while (get_bits_left(gb) >= 8 &&
-               (last = get_bits(gb, 8)) == 255) {
-            size += 255;
-        }
-        size += last;
+        do {
+            if (get_bits_left(gb) < 8)
+                return AVERROR_INVALIDDATA;
+            size += show_bits(gb, 8);
+        } while (get_bits(gb, 8) == 255);
 
         if (size > get_bits_left(gb) / 8) {
-            av_log(logctx, AV_LOG_ERROR, "SEI type %d truncated at %d\n",
-                   type, get_bits_left(gb));
+            av_log(logctx, AV_LOG_ERROR, "SEI type %d size %d truncated at %d\n",
+                   type, 8*size, get_bits_left(gb));
             return AVERROR_INVALIDDATA;
         }
+        next = get_bits_count(gb) + 8 * size;
 
         switch (type) {
         case SEI_TYPE_PIC_TIMING: // Picture timing SEI
-            ret = decode_picture_timing(&h->picture_timing, gb, ps->sps, logctx);
+            ret = decode_picture_timing(&h->picture_timing, gb, ps, logctx);
             break;
         case SEI_TYPE_USER_DATA_REGISTERED:
             ret = decode_registered_user_data(h, gb, logctx, size);
@@ -393,16 +427,65 @@ int ff_h264_sei_decode(H264SEIContext *h, GetBitContext *gb,
         case SEI_TYPE_DISPLAY_ORIENTATION:
             ret = decode_display_orientation(&h->display_orientation, gb);
             break;
+        case SEI_TYPE_GREEN_METADATA:
+            ret = decode_green_metadata(&h->green_metadata, gb);
+            break;
         default:
             av_log(logctx, AV_LOG_DEBUG, "unknown SEI type %d\n", type);
-            skip_bits(gb, 8 * size);
         }
         if (ret < 0)
             return ret;
 
+        skip_bits_long(gb, next - get_bits_count(gb));
+
         // FIXME check bits here
         align_get_bits(gb);
     }
 
     return 0;
 }
+
+const char *ff_h264_sei_stereo_mode(const H264SEIFramePacking *h)
+{
+    if (h->frame_packing_arrangement_cancel_flag == 0) {
+        switch (h->frame_packing_arrangement_type) {
+            case SEI_FPA_TYPE_CHECKERBOARD:
+                if (h->content_interpretation_type == 2)
+                    return "checkerboard_rl";
+                else
+                    return "checkerboard_lr";
+            case SEI_FPA_TYPE_INTERLEAVE_COLUMN:
+                if (h->content_interpretation_type == 2)
+                    return "col_interleaved_rl";
+                else
+                    return "col_interleaved_lr";
+            case SEI_FPA_TYPE_INTERLEAVE_ROW:
+                if (h->content_interpretation_type == 2)
+                    return "row_interleaved_rl";
+                else
+                    return "row_interleaved_lr";
+            case SEI_FPA_TYPE_SIDE_BY_SIDE:
+                if (h->content_interpretation_type == 2)
+                    return "right_left";
+                else
+                    return "left_right";
+            case SEI_FPA_TYPE_TOP_BOTTOM:
+                if (h->content_interpretation_type == 2)
+                    return "bottom_top";
+                else
+                    return "top_bottom";
+            case SEI_FPA_TYPE_INTERLEAVE_TEMPORAL:
+                if (h->content_interpretation_type == 2)
+                    return "block_rl";
+                else
+                    return "block_lr";
+            case SEI_FPA_TYPE_2D:
+            default:
+                return "mono";
+        }
+    } else if (h->frame_packing_arrangement_cancel_flag == 1) {
+        return "mono";
+    } else {
+        return NULL;
+    }
+}
diff --git a/libavcodec/h264_sei.h b/libavcodec/h264_sei.h
index 58f5ecc..9197795 100644
--- a/libavcodec/h264_sei.h
+++ b/libavcodec/h264_sei.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,7 @@ typedef enum {
     SEI_TYPE_RECOVERY_POINT         = 6,   ///< recovery point (frame # to decoder sync)
     SEI_TYPE_FRAME_PACKING          = 45,  ///< frame packing arrangement
     SEI_TYPE_DISPLAY_ORIENTATION    = 47,  ///< display orientation
+    SEI_TYPE_GREEN_METADATA         = 56   ///< GreenMPEG information
 } SEI_Type;
 
 /**
@@ -49,6 +50,19 @@ typedef enum {
     SEI_PIC_STRUCT_FRAME_TRIPLING    = 8  ///<  8: %frame tripling
 } SEI_PicStructType;
 
+/**
+ * frame_packing_arrangement types
+ */
+typedef enum {
+    SEI_FPA_TYPE_CHECKERBOARD        = 0,
+    SEI_FPA_TYPE_INTERLEAVE_COLUMN   = 1,
+    SEI_FPA_TYPE_INTERLEAVE_ROW      = 2,
+    SEI_FPA_TYPE_SIDE_BY_SIDE        = 3,
+    SEI_FPA_TYPE_TOP_BOTTOM          = 4,
+    SEI_FPA_TYPE_INTERLEAVE_TEMPORAL = 5,
+    SEI_FPA_TYPE_2D                  = 6,
+} SEI_FpaType;
+
 typedef struct H264SEIPictureTiming {
     SEI_PicStructType pic_struct;
 
@@ -102,9 +116,12 @@ typedef struct H264SEIBufferingPeriod {
 
 typedef struct H264SEIFramePacking {
     int present;
-    int arrangement_type;
+    int frame_packing_arrangement_id;
+    int frame_packing_arrangement_cancel_flag;  ///< whether the previous arrangement is canceled, -1 if never received
+    SEI_FpaType frame_packing_arrangement_type;
+    int frame_packing_arrangement_repetition_period;
     int content_interpretation_type;
-    int quincunx_subsampling;
+    int quincunx_sampling_flag;
 } H264SEIFramePacking;
 
 typedef struct H264SEIDisplayOrientation {
@@ -113,6 +130,19 @@ typedef struct H264SEIDisplayOrientation {
     int hflip, vflip;
 } H264SEIDisplayOrientation;
 
+typedef struct H264SEIGreenMetaData {
+    uint8_t green_metadata_type;
+    uint8_t period_type;
+    uint16_t num_seconds;
+    uint16_t num_pictures;
+    uint8_t percent_non_zero_macroblocks;
+    uint8_t percent_intra_coded_macroblocks;
+    uint8_t percent_six_tap_filtering;
+    uint8_t percent_alpha_point_deblocking_instance;
+    uint8_t xsd_metric_type;
+    uint16_t xsd_metric_value;
+} H264SEIGreenMetaData;
+
 typedef struct H264SEIContext {
     H264SEIPictureTiming picture_timing;
     H264SEIAFD afd;
@@ -122,6 +152,7 @@ typedef struct H264SEIContext {
     H264SEIBufferingPeriod buffering_period;
     H264SEIFramePacking frame_packing;
     H264SEIDisplayOrientation display_orientation;
+    H264SEIGreenMetaData green_metadata;
 } H264SEIContext;
 
 struct H264ParamSets;
@@ -134,4 +165,9 @@ int ff_h264_sei_decode(H264SEIContext *h, GetBitContext *gb,
  */
 void ff_h264_sei_uninit(H264SEIContext *h);
 
+/**
+ * Get stereo_mode string from the h264 frame_packing_arrangement
+ */
+const char *ff_h264_sei_stereo_mode(const H264SEIFramePacking *h);
+
 #endif /* AVCODEC_H264_SEI_H */
diff --git a/libavcodec/h264_slice.c b/libavcodec/h264_slice.c
index 7031ee2..474400b 100644
--- a/libavcodec/h264_slice.c
+++ b/libavcodec/h264_slice.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,14 +43,14 @@
 #include "rectangle.h"
 #include "thread.h"
 
-static const uint8_t field_scan[16] = {
+static const uint8_t field_scan[16+1] = {
     0 + 0 * 4, 0 + 1 * 4, 1 + 0 * 4, 0 + 2 * 4,
     0 + 3 * 4, 1 + 1 * 4, 1 + 2 * 4, 1 + 3 * 4,
     2 + 0 * 4, 2 + 1 * 4, 2 + 2 * 4, 2 + 3 * 4,
     3 + 0 * 4, 3 + 1 * 4, 3 + 2 * 4, 3 + 3 * 4,
 };
 
-static const uint8_t field_scan8x8[64] = {
+static const uint8_t field_scan8x8[64+1] = {
     0 + 0 * 8, 0 + 1 * 8, 0 + 2 * 8, 1 + 0 * 8,
     1 + 1 * 8, 0 + 3 * 8, 0 + 4 * 8, 1 + 2 * 8,
     2 + 0 * 8, 1 + 3 * 8, 0 + 5 * 8, 0 + 6 * 8,
@@ -69,7 +69,7 @@ static const uint8_t field_scan8x8[64] = {
     7 + 4 * 8, 7 + 5 * 8, 7 + 6 * 8, 7 + 7 * 8,
 };
 
-static const uint8_t field_scan8x8_cavlc[64] = {
+static const uint8_t field_scan8x8_cavlc[64+1] = {
     0 + 0 * 8, 1 + 1 * 8, 2 + 0 * 8, 0 + 7 * 8,
     2 + 2 * 8, 2 + 3 * 8, 2 + 4 * 8, 3 + 3 * 8,
     3 + 4 * 8, 4 + 3 * 8, 4 + 4 * 8, 5 + 3 * 8,
@@ -89,7 +89,7 @@ static const uint8_t field_scan8x8_cavlc[64] = {
 };
 
 // zigzag_scan8x8_cavlc[i] = zigzag_scan8x8[(i/4) + 16*(i%4)]
-static const uint8_t zigzag_scan8x8_cavlc[64] = {
+static const uint8_t zigzag_scan8x8_cavlc[64+1] = {
     0 + 0 * 8, 1 + 1 * 8, 1 + 2 * 8, 2 + 2 * 8,
     4 + 1 * 8, 0 + 5 * 8, 3 + 3 * 8, 7 + 0 * 8,
     3 + 4 * 8, 1 + 7 * 8, 5 + 3 * 8, 6 + 3 * 8,
@@ -131,9 +131,9 @@ static int alloc_scratch_buffers(H264SliceContext *sl, int linesize)
     // (= 21x21 for  H.264)
     av_fast_malloc(&sl->edge_emu_buffer, &sl->edge_emu_buffer_allocated, alloc_size * 2 * 21);
 
-    av_fast_malloc(&sl->top_borders[0], &sl->top_borders_allocated[0],
+    av_fast_mallocz(&sl->top_borders[0], &sl->top_borders_allocated[0],
                    h->mb_width * 16 * 3 * sizeof(uint8_t) * 2);
-    av_fast_malloc(&sl->top_borders[1], &sl->top_borders_allocated[1],
+    av_fast_mallocz(&sl->top_borders[1], &sl->top_borders_allocated[1],
                    h->mb_width * 16 * 3 * sizeof(uint8_t) * 2);
 
     if (!sl->bipred_scratchpad || !sl->edge_emu_buffer ||
@@ -192,6 +192,10 @@ static int alloc_picture(H264Context *h, H264Picture *pic)
     if (ret < 0)
         goto fail;
 
+    pic->crop     = h->ps.sps->crop;
+    pic->crop_top = h->ps.sps->crop_top;
+    pic->crop_left= h->ps.sps->crop_left;
+
     if (h->avctx->hwaccel) {
         const AVHWAccel *hwaccel = h->avctx->hwaccel;
         av_assert0(!pic->hwaccel_picture_private);
@@ -202,6 +206,18 @@ static int alloc_picture(H264Context *h, H264Picture *pic)
             pic->hwaccel_picture_private = pic->hwaccel_priv_buf->data;
         }
     }
+    if (CONFIG_GRAY && !h->avctx->hwaccel && h->flags & AV_CODEC_FLAG_GRAY && pic->f->data[2]) {
+        int h_chroma_shift, v_chroma_shift;
+        av_pix_fmt_get_chroma_sub_sample(pic->f->format,
+                                         &h_chroma_shift, &v_chroma_shift);
+
+        for(i=0; i<AV_CEIL_RSHIFT(pic->f->height, v_chroma_shift); i++) {
+            memset(pic->f->data[1] + pic->f->linesize[1]*i,
+                   0x80, AV_CEIL_RSHIFT(pic->f->width, h_chroma_shift));
+            memset(pic->f->data[2] + pic->f->linesize[2]*i,
+                   0x80, AV_CEIL_RSHIFT(pic->f->width, h_chroma_shift));
+        }
+    }
 
     if (!h->qscale_table_pool) {
         ret = init_table_pools(h);
@@ -254,40 +270,13 @@ static int find_unused_picture(H264Context *h)
     return i;
 }
 
-static int initialize_cur_frame(H264Context *h)
-{
-    H264Picture *cur;
-    int ret;
-
-    release_unused_pictures(h, 1);
-    ff_h264_unref_picture(h, &h->cur_pic);
-    h->cur_pic_ptr = NULL;
-
-    ret = find_unused_picture(h);
-    if (ret < 0) {
-        av_log(h->avctx, AV_LOG_ERROR, "no frame buffer available\n");
-        return ret;
-    }
-    cur = &h->DPB[ret];
 
-    ret = alloc_picture(h, cur);
-    if (ret < 0)
-        return ret;
-
-    ret = ff_h264_ref_picture(h, &h->cur_pic, cur);
-    if (ret < 0)
-        return ret;
-    h->cur_pic_ptr = cur;
-
-    return 0;
-}
-
-#define IN_RANGE(a, b, size) (((a) >= (b)) && ((a) < ((b) + (size))))
+#define IN_RANGE(a, b, size) (((void*)(a) >= (void*)(b)) && ((void*)(a) < (void*)((b) + (size))))
 
 #define REBASE_PICTURE(pic, new_ctx, old_ctx)             \
-    ((pic && pic >= old_ctx->DPB &&                       \
-      pic < old_ctx->DPB + H264_MAX_PICTURE_COUNT) ?          \
-     &new_ctx->DPB[pic - old_ctx->DPB] : NULL)
+    (((pic) && (pic) >= (old_ctx)->DPB &&                       \
+      (pic) < (old_ctx)->DPB + H264_MAX_PICTURE_COUNT) ?          \
+     &(new_ctx)->DPB[(pic) - (old_ctx)->DPB] : NULL)
 
 static void copy_picture_range(H264Picture **to, H264Picture **from, int count,
                                H264Context *new_base,
@@ -296,10 +285,9 @@ static void copy_picture_range(H264Picture **to, H264Picture **from, int count,
     int i;
 
     for (i = 0; i < count; i++) {
-        assert((IN_RANGE(from[i], old_base, sizeof(*old_base)) ||
-                IN_RANGE(from[i], old_base->DPB,
-                         sizeof(H264Picture) * H264_MAX_PICTURE_COUNT) ||
-                !from[i]));
+        av_assert1(!from[i] ||
+                   IN_RANGE(from[i], old_base, 1) ||
+                   IN_RANGE(from[i], old_base->DPB, H264_MAX_PICTURE_COUNT));
         to[i] = REBASE_PICTURE(from[i], new_base, old_base);
     }
 }
@@ -314,11 +302,12 @@ int ff_h264_update_thread_context(AVCodecContext *dst,
     int need_reinit = 0;
     int i, ret;
 
-    if (dst == src || !h1->context_initialized)
+    if (dst == src)
         return 0;
 
-    if (!h1->ps.sps)
-        return AVERROR_INVALIDDATA;
+    // We can't fail if SPS isn't set as it breaks current skip_frame code
+    //if (!h1->ps.sps)
+    //    return AVERROR_INVALIDDATA;
 
     if (inited &&
         (h->width                 != h1->width                 ||
@@ -332,6 +321,9 @@ int ff_h264_update_thread_context(AVCodecContext *dst,
         need_reinit = 1;
     }
 
+    /* copy block_offset since frame_start may not be called */
+    memcpy(h->block_offset, h1->block_offset, sizeof(h->block_offset));
+
     // SPS/PPS
     for (i = 0; i < FF_ARRAY_ELEMS(h->ps.sps_list); i++) {
         av_buffer_unref(&h->ps.sps_list[i]);
@@ -350,7 +342,22 @@ int ff_h264_update_thread_context(AVCodecContext *dst,
         }
     }
 
-    h->ps.sps = h1->ps.sps;
+    av_buffer_unref(&h->ps.pps_ref);
+    av_buffer_unref(&h->ps.sps_ref);
+    h->ps.pps = NULL;
+    h->ps.sps = NULL;
+    if (h1->ps.pps_ref) {
+        h->ps.pps_ref = av_buffer_ref(h1->ps.pps_ref);
+        if (!h->ps.pps_ref)
+            return AVERROR(ENOMEM);
+        h->ps.pps = (const PPS*)h->ps.pps_ref->data;
+    }
+    if (h1->ps.sps_ref) {
+        h->ps.sps_ref = av_buffer_ref(h1->ps.sps_ref);
+        if (!h->ps.sps_ref)
+            return AVERROR(ENOMEM);
+        h->ps.sps = (SPS*)h->ps.sps_ref->data;
+    }
 
     if (need_reinit || !inited) {
         h->width     = h1->width;
@@ -361,11 +368,12 @@ int ff_h264_update_thread_context(AVCodecContext *dst,
         h->mb_stride = h1->mb_stride;
         h->b_stride  = h1->b_stride;
 
-        if ((err = h264_slice_header_init(h)) < 0) {
-            av_log(h->avctx, AV_LOG_ERROR, "h264_slice_header_init() failed");
-            return err;
+        if (h->context_initialized || h1->context_initialized) {
+            if ((err = h264_slice_header_init(h)) < 0) {
+                av_log(h->avctx, AV_LOG_ERROR, "h264_slice_header_init() failed");
+                return err;
+            }
         }
-
         /* copy block_offset since frame_start may not be called */
         memcpy(h->block_offset, h1->block_offset, sizeof(h->block_offset));
     }
@@ -378,6 +386,9 @@ int ff_h264_update_thread_context(AVCodecContext *dst,
     h->first_field          = h1->first_field;
     h->picture_structure    = h1->picture_structure;
     h->droppable            = h1->droppable;
+    h->backup_width         = h1->backup_width;
+    h->backup_height        = h1->backup_height;
+    h->backup_pix_fmt       = h1->backup_pix_fmt;
 
     for (i = 0; i < H264_MAX_PICTURE_COUNT; i++) {
         ff_h264_unref_picture(h, &h->DPB[i]);
@@ -401,12 +412,14 @@ int ff_h264_update_thread_context(AVCodecContext *dst,
     // extradata/NAL handling
     h->is_avc = h1->is_avc;
     h->nal_length_size = h1->nal_length_size;
+    h->sei.unregistered.x264_build = h1->sei.unregistered.x264_build;
 
     memcpy(&h->poc,        &h1->poc,        sizeof(h->poc));
 
     h->curr_pic_num      = h1->curr_pic_num;
     h->max_pic_num       = h1->max_pic_num;
 
+    memcpy(h->default_ref, h1->default_ref, sizeof(h->default_ref));
     memcpy(h->short_ref,   h1->short_ref,   sizeof(h->short_ref));
     memcpy(h->long_ref,    h1->long_ref,    sizeof(h->long_ref));
     memcpy(h->delayed_pic, h1->delayed_pic, sizeof(h->delayed_pic));
@@ -426,6 +439,8 @@ int ff_h264_update_thread_context(AVCodecContext *dst,
     copy_picture_range(h->delayed_pic, h1->delayed_pic,
                        MAX_DELAYED_PIC_COUNT + 2, h, h1);
 
+    h->frame_recovered       = h1->frame_recovered;
+
     if (!h->cur_pic_ptr)
         return 0;
 
@@ -438,7 +453,6 @@ int ff_h264_update_thread_context(AVCodecContext *dst,
     h->poc.prev_frame_num        = h->poc.frame_num;
 
     h->recovery_frame        = h1->recovery_frame;
-    h->frame_recovered       = h1->frame_recovered;
 
     return err;
 }
@@ -448,12 +462,28 @@ static int h264_frame_start(H264Context *h)
     H264Picture *pic;
     int i, ret;
     const int pixel_shift = h->pixel_shift;
+    int c[4] = {
+        1<<(h->ps.sps->bit_depth_luma-1),
+        1<<(h->ps.sps->bit_depth_chroma-1),
+        1<<(h->ps.sps->bit_depth_chroma-1),
+        -1
+    };
 
-    ret = initialize_cur_frame(h);
-    if (ret < 0)
-        return ret;
+    if (!ff_thread_can_start_frame(h->avctx)) {
+        av_log(h->avctx, AV_LOG_ERROR, "Attempt to start a frame outside SETUP state\n");
+        return -1;
+    }
+
+    release_unused_pictures(h, 1);
+    h->cur_pic_ptr = NULL;
+
+    i = find_unused_picture(h);
+    if (i < 0) {
+        av_log(h->avctx, AV_LOG_ERROR, "no frame buffer available\n");
+        return i;
+    }
+    pic = &h->DPB[i];
 
-    pic = h->cur_pic_ptr;
     pic->reference              = h->droppable ? 0 : h->picture_structure;
     pic->f->coded_picture_number = h->coded_picture_number++;
     pic->field_picture          = h->picture_structure != PICT_FRAME;
@@ -466,11 +496,39 @@ static int h264_frame_start(H264Context *h)
     pic->f->key_frame = 0;
     pic->mmco_reset  = 0;
     pic->recovered   = 0;
+    pic->invalid_gap = 0;
+    pic->sei_recovery_frame_cnt = h->sei.recovery_point.recovery_frame_cnt;
 
     pic->f->pict_type = h->slice_ctx[0].slice_type;
 
-    if (CONFIG_ERROR_RESILIENCE && h->enable_er)
+    if ((ret = alloc_picture(h, pic)) < 0)
+        return ret;
+    if(!h->frame_recovered && !h->avctx->hwaccel
+#if FF_API_CAP_VDPAU
+       && !(h->avctx->codec->capabilities & AV_CODEC_CAP_HWACCEL_VDPAU)
+#endif
+       )
+        ff_color_frame(pic->f, c);
+
+    h->cur_pic_ptr = pic;
+    ff_h264_unref_picture(h, &h->cur_pic);
+    if (CONFIG_ERROR_RESILIENCE) {
+        ff_h264_set_erpic(&h->slice_ctx[0].er.cur_pic, NULL);
+    }
+
+    if ((ret = ff_h264_ref_picture(h, &h->cur_pic, h->cur_pic_ptr)) < 0)
+        return ret;
+
+    for (i = 0; i < h->nb_slice_ctx; i++) {
+        h->slice_ctx[i].linesize   = h->cur_pic_ptr->f->linesize[0];
+        h->slice_ctx[i].uvlinesize = h->cur_pic_ptr->f->linesize[1];
+    }
+
+    if (CONFIG_ERROR_RESILIENCE && h->enable_er) {
         ff_er_frame_start(&h->slice_ctx[0].er);
+        ff_h264_set_erpic(&h->slice_ctx[0].er.last_pic, NULL);
+        ff_h264_set_erpic(&h->slice_ctx[0].er.next_pic, NULL);
+    }
 
     for (i = 0; i < 16; i++) {
         h->block_offset[i]           = (4 * ((scan8[i] - scan8[0]) & 7) << pixel_shift) + 4 * pic->f->linesize[0] * ((scan8[i] - scan8[0]) >> 3);
@@ -483,11 +541,6 @@ static int h264_frame_start(H264Context *h)
         h->block_offset[48 + 32 + i] = (4 * ((scan8[i] - scan8[0]) & 7) << pixel_shift) + 8 * pic->f->linesize[1] * ((scan8[i] - scan8[0]) >> 3);
     }
 
-    /* Some macroblocks can be accessed before they're available in case
-     * of lost slices, MBAFF or threading. */
-    memset(h->slice_table, -1,
-           (h->mb_height * h->mb_stride - 1) * sizeof(*h->slice_table));
-
     /* We mark the current picture as non-reference after allocating it, so
      * that if we break out due to an error it can be released automatically
      * in the next ff_mpv_frame_start().
@@ -623,7 +676,7 @@ static void implicit_weight_table(const H264Context *h, H264SliceContext *sl, in
             cur_poc = h->cur_pic_ptr->field_poc[h->picture_structure - 1];
         }
         if (sl->ref_count[0] == 1 && sl->ref_count[1] == 1 && !FRAME_MBAFF(h) &&
-            sl->ref_list[0][0].poc + sl->ref_list[1][0].poc == 2 * cur_poc) {
+            sl->ref_list[0][0].poc + (int64_t)sl->ref_list[1][0].poc == 2 * cur_poc) {
             sl->pwt.use_weight        = 0;
             sl->pwt.use_weight_chroma = 0;
             return;
@@ -644,7 +697,7 @@ static void implicit_weight_table(const H264Context *h, H264SliceContext *sl, in
     sl->pwt.chroma_log2_weight_denom = 5;
 
     for (ref0 = ref_start; ref0 < ref_count0; ref0++) {
-        int poc0 = sl->ref_list[0][ref0].poc;
+        int64_t poc0 = sl->ref_list[0][ref0].poc;
         for (ref1 = ref_start; ref1 < ref_count1; ref1++) {
             int w = 32;
             if (!sl->ref_list[0][ref0].parent->long_ref && !sl->ref_list[1][ref1].parent->long_ref) {
@@ -675,13 +728,13 @@ static void init_scan_tables(H264Context *h)
 {
     int i;
     for (i = 0; i < 16; i++) {
-#define TRANSPOSE(x) (x >> 2) | ((x << 2) & 0xF)
+#define TRANSPOSE(x) ((x) >> 2) | (((x) << 2) & 0xF)
         h->zigzag_scan[i] = TRANSPOSE(ff_zigzag_scan[i]);
         h->field_scan[i]  = TRANSPOSE(field_scan[i]);
 #undef TRANSPOSE
     }
     for (i = 0; i < 64; i++) {
-#define TRANSPOSE(x) (x >> 3) | ((x & 7) << 3)
+#define TRANSPOSE(x) ((x) >> 3) | (((x) & 7) << 3)
         h->zigzag_scan8x8[i]       = TRANSPOSE(ff_zigzag_direct[i]);
         h->zigzag_scan8x8_cavlc[i] = TRANSPOSE(zigzag_scan8x8_cavlc[i]);
         h->field_scan8x8[i]        = TRANSPOSE(field_scan8x8[i]);
@@ -689,31 +742,33 @@ static void init_scan_tables(H264Context *h)
 #undef TRANSPOSE
     }
     if (h->ps.sps->transform_bypass) { // FIXME same ugly
-        h->zigzag_scan_q0          = ff_zigzag_scan;
-        h->zigzag_scan8x8_q0       = ff_zigzag_direct;
-        h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
-        h->field_scan_q0           = field_scan;
-        h->field_scan8x8_q0        = field_scan8x8;
-        h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
+        memcpy(h->zigzag_scan_q0          , ff_zigzag_scan          , sizeof(h->zigzag_scan_q0         ));
+        memcpy(h->zigzag_scan8x8_q0       , ff_zigzag_direct        , sizeof(h->zigzag_scan8x8_q0      ));
+        memcpy(h->zigzag_scan8x8_cavlc_q0 , zigzag_scan8x8_cavlc    , sizeof(h->zigzag_scan8x8_cavlc_q0));
+        memcpy(h->field_scan_q0           , field_scan              , sizeof(h->field_scan_q0          ));
+        memcpy(h->field_scan8x8_q0        , field_scan8x8           , sizeof(h->field_scan8x8_q0       ));
+        memcpy(h->field_scan8x8_cavlc_q0  , field_scan8x8_cavlc     , sizeof(h->field_scan8x8_cavlc_q0 ));
     } else {
-        h->zigzag_scan_q0          = h->zigzag_scan;
-        h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
-        h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
-        h->field_scan_q0           = h->field_scan;
-        h->field_scan8x8_q0        = h->field_scan8x8;
-        h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
+        memcpy(h->zigzag_scan_q0          , h->zigzag_scan          , sizeof(h->zigzag_scan_q0         ));
+        memcpy(h->zigzag_scan8x8_q0       , h->zigzag_scan8x8       , sizeof(h->zigzag_scan8x8_q0      ));
+        memcpy(h->zigzag_scan8x8_cavlc_q0 , h->zigzag_scan8x8_cavlc , sizeof(h->zigzag_scan8x8_cavlc_q0));
+        memcpy(h->field_scan_q0           , h->field_scan           , sizeof(h->field_scan_q0          ));
+        memcpy(h->field_scan8x8_q0        , h->field_scan8x8        , sizeof(h->field_scan8x8_q0       ));
+        memcpy(h->field_scan8x8_cavlc_q0  , h->field_scan8x8_cavlc  , sizeof(h->field_scan8x8_cavlc_q0 ));
     }
 }
 
-static enum AVPixelFormat get_pixel_format(H264Context *h)
+static enum AVPixelFormat get_pixel_format(H264Context *h, int force_callback)
 {
 #define HWACCEL_MAX (CONFIG_H264_DXVA2_HWACCEL + \
                      CONFIG_H264_D3D11VA_HWACCEL + \
                      CONFIG_H264_VAAPI_HWACCEL + \
                      (CONFIG_H264_VDA_HWACCEL * 2) + \
+                     CONFIG_H264_VIDEOTOOLBOX_HWACCEL + \
                      CONFIG_H264_VDPAU_HWACCEL)
     enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts;
     const enum AVPixelFormat *choices = pix_fmts;
+    int i;
 
     switch (h->ps.sps->bit_depth_luma) {
     case 9:
@@ -738,6 +793,28 @@ static enum AVPixelFormat get_pixel_format(H264Context *h)
         else
             *fmt++ = AV_PIX_FMT_YUV420P10;
         break;
+    case 12:
+        if (CHROMA444(h)) {
+            if (h->avctx->colorspace == AVCOL_SPC_RGB) {
+                *fmt++ = AV_PIX_FMT_GBRP12;
+            } else
+                *fmt++ = AV_PIX_FMT_YUV444P12;
+        } else if (CHROMA422(h))
+            *fmt++ = AV_PIX_FMT_YUV422P12;
+        else
+            *fmt++ = AV_PIX_FMT_YUV420P12;
+        break;
+    case 14:
+        if (CHROMA444(h)) {
+            if (h->avctx->colorspace == AVCOL_SPC_RGB) {
+                *fmt++ = AV_PIX_FMT_GBRP14;
+            } else
+                *fmt++ = AV_PIX_FMT_YUV444P14;
+        } else if (CHROMA422(h))
+            *fmt++ = AV_PIX_FMT_YUV422P14;
+        else
+            *fmt++ = AV_PIX_FMT_YUV420P14;
+        break;
     case 8:
 #if CONFIG_H264_VDPAU_HWACCEL
         *fmt++ = AV_PIX_FMT_VDPAU;
@@ -768,6 +845,9 @@ static enum AVPixelFormat get_pixel_format(H264Context *h)
             *fmt++ = AV_PIX_FMT_VDA_VLD;
             *fmt++ = AV_PIX_FMT_VDA;
 #endif
+#if CONFIG_H264_VIDEOTOOLBOX_HWACCEL
+            *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX;
+#endif
             if (h->avctx->codec->pix_fmts)
                 choices = h->avctx->codec->pix_fmts;
             else if (h->avctx->color_range == AVCOL_RANGE_JPEG)
@@ -784,7 +864,10 @@ static enum AVPixelFormat get_pixel_format(H264Context *h)
 
     *fmt = AV_PIX_FMT_NONE;
 
-    return ff_get_format(h->avctx, choices);
+    for (i=0; choices[i] != AV_PIX_FMT_NONE; i++)
+        if (choices[i] == h->avctx->pix_fmt && !force_callback)
+            return choices[i];
+    return ff_thread_get_format(h->avctx, choices);
 }
 
 /* export coded and cropped frame dimensions to AVCodecContext */
@@ -793,10 +876,15 @@ static int init_dimensions(H264Context *h)
     SPS *sps = h->ps.sps;
     int width  = h->width  - (sps->crop_right + sps->crop_left);
     int height = h->height - (sps->crop_top   + sps->crop_bottom);
+    av_assert0(sps->crop_right + sps->crop_left < (unsigned)h->width);
+    av_assert0(sps->crop_top + sps->crop_bottom < (unsigned)h->height);
 
     /* handle container cropping */
     if (FFALIGN(h->avctx->width,  16) == FFALIGN(width,  16) &&
-        FFALIGN(h->avctx->height, 16) == FFALIGN(height, 16)) {
+        FFALIGN(h->avctx->height, 16) == FFALIGN(height, 16) &&
+        h->avctx->width  <= width &&
+        h->avctx->height <= height
+    ) {
         width  = h->avctx->width;
         height = h->avctx->height;
     }
@@ -840,7 +928,7 @@ static int h264_slice_header_init(H264Context *h)
         if (h->sei.unregistered.x264_build < 44U)
             den *= 2;
         av_reduce(&h->avctx->framerate.den, &h->avctx->framerate.num,
-                  sps->num_units_in_tick, den, 1 << 30);
+                  sps->num_units_in_tick * h->avctx->ticks_per_frame, den, 1 << 30);
     }
 
     ff_h264_free_tables(h);
@@ -852,16 +940,32 @@ static int h264_slice_header_init(H264Context *h)
     ret = ff_h264_alloc_tables(h);
     if (ret < 0) {
         av_log(h->avctx, AV_LOG_ERROR, "Could not allocate memory\n");
-        return ret;
+        goto fail;
     }
 
-    if (sps->bit_depth_luma < 8 || sps->bit_depth_luma > 10) {
+#if FF_API_CAP_VDPAU
+    if (h->avctx->codec &&
+        h->avctx->codec->capabilities & AV_CODEC_CAP_HWACCEL_VDPAU &&
+        (sps->bit_depth_luma != 8 || sps->chroma_format_idc > 1)) {
+        av_log(h->avctx, AV_LOG_ERROR,
+                "VDPAU decoding does not support video colorspace.\n");
+        ret = AVERROR_INVALIDDATA;
+        goto fail;
+    }
+#endif
+
+    if (sps->bit_depth_luma < 8 || sps->bit_depth_luma > 14 ||
+        sps->bit_depth_luma == 11 || sps->bit_depth_luma == 13
+    ) {
         av_log(h->avctx, AV_LOG_ERROR, "Unsupported bit depth %d\n",
                sps->bit_depth_luma);
-        return AVERROR_INVALIDDATA;
+        ret = AVERROR_INVALIDDATA;
+        goto fail;
     }
 
+    h->cur_bit_depth_luma         =
     h->avctx->bits_per_raw_sample = sps->bit_depth_luma;
+    h->cur_chroma_format_idc      = sps->chroma_format_idc;
     h->pixel_shift                = sps->bit_depth_luma > 8;
     h->chroma_format_idc          = sps->chroma_format_idc;
     h->bit_depth_luma             = sps->bit_depth_luma;
@@ -878,7 +982,7 @@ static int h264_slice_header_init(H264Context *h)
         ret = ff_h264_slice_context_init(h, &h->slice_ctx[0]);
         if (ret < 0) {
             av_log(h->avctx, AV_LOG_ERROR, "context_init() failed.\n");
-            return ret;
+            goto fail;
         }
     } else {
         for (i = 0; i < h->nb_slice_ctx; i++) {
@@ -891,7 +995,7 @@ static int h264_slice_header_init(H264Context *h)
 
             if ((ret = ff_h264_slice_context_init(h, sl)) < 0) {
                 av_log(h->avctx, AV_LOG_ERROR, "context_init() failed.\n");
-                return ret;
+                goto fail;
             }
         }
     }
@@ -899,6 +1003,21 @@ static int h264_slice_header_init(H264Context *h)
     h->context_initialized = 1;
 
     return 0;
+fail:
+    ff_h264_free_tables(h);
+    h->context_initialized = 0;
+    return ret;
+}
+
+/* Map the YUVJ420P/422P/444P formats to YUV420P/422P/444P; return any
+ * other format unchanged. */
+static enum AVPixelFormat non_j_pixfmt(enum AVPixelFormat a)
+{
+    if (a == AV_PIX_FMT_YUVJ420P) return AV_PIX_FMT_YUV420P;
+    if (a == AV_PIX_FMT_YUVJ422P) return AV_PIX_FMT_YUV422P;
+    if (a == AV_PIX_FMT_YUVJ444P) return AV_PIX_FMT_YUV444P;
+
+    return a;
+}
 
 /**
@@ -918,19 +1037,51 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
     int ret;
     unsigned int slice_type, tmp, i, j;
     int last_pic_structure, last_pic_droppable;
+    int must_reinit;
     int needs_reinit = 0;
     int field_pic_flag, bottom_field_flag;
+    int first_slice = sl == h->slice_ctx && !h->current_slice;
     int frame_num, droppable, picture_structure;
-    int mb_aff_frame = 0;
+    int mb_aff_frame, last_mb_aff_frame;
+
+    if (first_slice)
+        av_assert0(!h->setup_finished);
 
-    first_mb_in_slice = get_ue_golomb(&sl->gb);
+    first_mb_in_slice = get_ue_golomb_long(&sl->gb);
 
     if (first_mb_in_slice == 0) { // FIXME better field boundary detection
-        if (h->current_slice && h->cur_pic_ptr && FIELD_PICTURE(h)) {
-            ff_h264_field_end(h, sl, 1);
+        if (h->current_slice) {
+            if (h->setup_finished) {
+                av_log(h->avctx, AV_LOG_ERROR, "Too many fields\n");
+                return AVERROR_INVALIDDATA;
+            }
+            if (h->max_contexts > 1) {
+                if (!h->single_decode_warning) {
+                    av_log(h->avctx, AV_LOG_WARNING, "Cannot decode multiple access units as slice threads\n");
+                    h->single_decode_warning = 1;
+                }
+                h->max_contexts = 1;
+                return SLICE_SINGLETHREAD;
+            }
+
+            if (h->cur_pic_ptr && FIELD_PICTURE(h) && h->first_field) {
+                ret = ff_h264_field_end(h, h->slice_ctx, 1);
+                h->current_slice = 0;
+                if (ret < 0)
+                    return ret;
+            } else if (h->cur_pic_ptr && !FIELD_PICTURE(h) && !h->first_field && h->nal_unit_type  == NAL_IDR_SLICE) {
+                av_log(h, AV_LOG_WARNING, "Broken frame packetizing\n");
+                ret = ff_h264_field_end(h, h->slice_ctx, 1);
+                h->current_slice = 0;
+                ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX, 0);
+                ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX, 1);
+                h->cur_pic_ptr = NULL;
+                if (ret < 0)
+                    return ret;
+            } else
+                return AVERROR_INVALIDDATA;
         }
 
-        h->current_slice = 0;
         if (!h->first_field) {
             if (h->cur_pic_ptr && !h->droppable) {
                 ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX,
@@ -940,6 +1091,9 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
         }
     }
 
+    if (!h->current_slice)
+        av_assert0(sl == h->slice_ctx);
+
     slice_type = get_ue_golomb_31(&sl->gb);
     if (slice_type > 9) {
         av_log(h->avctx, AV_LOG_ERROR,
@@ -963,6 +1117,17 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
         return AVERROR_INVALIDDATA;
     }
 
+    if (h->current_slice == 0 && !h->first_field) {
+        if (
+            (h->avctx->skip_frame >= AVDISCARD_NONREF && !h->nal_ref_idc) ||
+            (h->avctx->skip_frame >= AVDISCARD_BIDIR  && sl->slice_type_nos == AV_PICTURE_TYPE_B) ||
+            (h->avctx->skip_frame >= AVDISCARD_NONINTRA && sl->slice_type_nos != AV_PICTURE_TYPE_I) ||
+            (h->avctx->skip_frame >= AVDISCARD_NONKEY && h->nal_unit_type != NAL_IDR_SLICE && h->sei.recovery_point.recovery_frame_cnt < 0) ||
+            h->avctx->skip_frame >= AVDISCARD_ALL) {
+            return SLICE_SKIPED;
+        }
+    }
+
     pps_id = get_ue_golomb(&sl->gb);
     if (pps_id >= MAX_PPS_COUNT) {
         av_log(h->avctx, AV_LOG_ERROR, "pps_id %u out of range\n", pps_id);
@@ -974,22 +1139,60 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
                pps_id);
         return AVERROR_INVALIDDATA;
     }
-    if (!h->setup_finished) {
-        h->ps.pps = (const PPS*)h->ps.pps_list[pps_id]->data;
-    } else if (h->ps.pps != (const PPS*)h->ps.pps_list[pps_id]->data) {
-        av_log(h->avctx, AV_LOG_ERROR, "PPS changed between slices\n");
+    if (h->au_pps_id >= 0 && pps_id != h->au_pps_id) {
+        av_log(h->avctx, AV_LOG_ERROR,
+               "PPS change from %d to %d forbidden\n",
+               h->au_pps_id, pps_id);
         return AVERROR_INVALIDDATA;
     }
 
-    if (!h->ps.sps_list[h->ps.pps->sps_id]) {
+    pps = (const PPS*)h->ps.pps_list[pps_id]->data;
+
+    if (!h->ps.sps_list[pps->sps_id]) {
         av_log(h->avctx, AV_LOG_ERROR,
                "non-existing SPS %u referenced\n",
-               h->ps.pps->sps_id);
+               pps->sps_id);
         return AVERROR_INVALIDDATA;
     }
 
-    if (h->ps.sps != (const SPS*)h->ps.sps_list[h->ps.pps->sps_id]->data) {
-        h->ps.sps = (SPS*)h->ps.sps_list[h->ps.pps->sps_id]->data;
+    if (first_slice) {
+        av_buffer_unref(&h->ps.pps_ref);
+        h->ps.pps = NULL;
+        h->ps.pps_ref = av_buffer_ref(h->ps.pps_list[pps_id]);
+        if (!h->ps.pps_ref)
+            return AVERROR(ENOMEM);
+        h->ps.pps = (const PPS*)h->ps.pps_ref->data;
+    } else {
+        if (h->ps.pps->sps_id != pps->sps_id ||
+            h->ps.pps->transform_8x8_mode != pps->transform_8x8_mode /*||
+            (h->setup_finished && h->ps.pps != pps)*/) {
+            av_log(h->avctx, AV_LOG_ERROR, "PPS changed between slices\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    if (h->ps.sps != (const SPS*)h->ps.sps_list[h->ps.pps->sps_id]->data ||
+        pps->sps_id != h->current_sps_id) {
+
+        if (!first_slice) {
+            av_log(h->avctx, AV_LOG_ERROR,
+               "SPS changed in the middle of the frame\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        av_buffer_unref(&h->ps.sps_ref);
+        h->ps.sps = NULL;
+        h->ps.sps_ref = av_buffer_ref(h->ps.sps_list[h->ps.pps->sps_id]);
+        if (!h->ps.sps_ref)
+            return AVERROR(ENOMEM);
+        h->ps.sps = (const SPS*)h->ps.sps_ref->data;
+
+        if (h->mb_width  != h->ps.sps->mb_width ||
+            h->mb_height != h->ps.sps->mb_height * (2 - h->ps.sps->frame_mbs_only_flag) ||
+            h->cur_bit_depth_luma    != h->ps.sps->bit_depth_luma ||
+            h->cur_chroma_format_idc != h->ps.sps->chroma_format_idc
+        )
+            needs_reinit = 1;
 
         if (h->bit_depth_luma    != h->ps.sps->bit_depth_luma ||
             h->chroma_format_idc != h->ps.sps->chroma_format_idc)
@@ -999,15 +1202,26 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
     pps = h->ps.pps;
     sps = h->ps.sps;
 
+    must_reinit = (h->context_initialized &&
+                    (   16*sps->mb_width != h->avctx->coded_width
+                     || 16*sps->mb_height * (2 - sps->frame_mbs_only_flag) != h->avctx->coded_height
+                     || h->cur_bit_depth_luma    != sps->bit_depth_luma
+                     || h->cur_chroma_format_idc != sps->chroma_format_idc
+                     || h->mb_width  != sps->mb_width
+                     || h->mb_height != sps->mb_height * (2 - sps->frame_mbs_only_flag)
+                    ));
+    if (h->avctx->pix_fmt == AV_PIX_FMT_NONE
+        || (non_j_pixfmt(h->avctx->pix_fmt) != non_j_pixfmt(get_pixel_format(h, 0))))
+        must_reinit = 1;
+
+    if (first_slice && av_cmp_q(sps->sar, h->avctx->sample_aspect_ratio))
+        must_reinit = 1;
+
     if (!h->setup_finished) {
         h->avctx->profile = ff_h264_get_profile(sps);
         h->avctx->level   = sps->level_idc;
         h->avctx->refs    = sps->ref_frame_count;
 
-        if (h->mb_width  != sps->mb_width ||
-            h->mb_height != sps->mb_height * (2 - sps->frame_mbs_only_flag))
-            needs_reinit = 1;
-
         h->mb_width  = sps->mb_width;
         h->mb_height = sps->mb_height * (2 - sps->frame_mbs_only_flag);
         h->mb_num    = h->mb_width * h->mb_height;
@@ -1025,8 +1239,8 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
             return ret;
 
         if (sps->video_signal_type_present_flag) {
-            h->avctx->color_range = sps->full_range ? AVCOL_RANGE_JPEG
-                : AVCOL_RANGE_MPEG;
+            h->avctx->color_range = sps->full_range > 0 ? AVCOL_RANGE_JPEG
+                                                        : AVCOL_RANGE_MPEG;
             if (sps->colour_description_present_flag) {
                 if (h->avctx->colorspace != sps->colorspace)
                     needs_reinit = 1;
@@ -1037,7 +1251,8 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
         }
     }
 
-    if (h->context_initialized && needs_reinit) {
+    if (h->context_initialized &&
+        (must_reinit || needs_reinit)) {
         h->context_initialized = 0;
         if (sl != h->slice_ctx) {
             av_log(h->avctx, AV_LOG_ERROR,
@@ -1049,14 +1264,16 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
             return AVERROR_INVALIDDATA;
         }
 
+        av_assert1(first_slice);
+
         ff_h264_flush_change(h);
 
-        if ((ret = get_pixel_format(h)) < 0)
+        if ((ret = get_pixel_format(h, 1)) < 0)
             return ret;
         h->avctx->pix_fmt = ret;
 
         av_log(h->avctx, AV_LOG_INFO, "Reinit context to %dx%d, "
-               "pix_fmt: %d\n", h->width, h->height, h->avctx->pix_fmt);
+               "pix_fmt: %s\n", h->width, h->height, av_get_pix_fmt_name(h->avctx->pix_fmt));
 
         if ((ret = h264_slice_header_init(h)) < 0) {
             av_log(h->avctx, AV_LOG_ERROR,
@@ -1071,7 +1288,7 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
             return AVERROR_PATCHWELCOME;
         }
 
-        if ((ret = get_pixel_format(h)) < 0)
+        if ((ret = get_pixel_format(h, 1)) < 0)
             return ret;
         h->avctx->pix_fmt = ret;
 
@@ -1083,11 +1300,20 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
     }
 
     frame_num = get_bits(&sl->gb, sps->log2_max_frame_num);
+    if (!first_slice) {
+        if (h->poc.frame_num != frame_num) {
+            av_log(h->avctx, AV_LOG_ERROR, "Frame num change from %d to %d\n",
+                   h->poc.frame_num, frame_num);
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
     if (!h->setup_finished)
         h->poc.frame_num = frame_num;
 
     sl->mb_mbaff       = 0;
-
+    mb_aff_frame       = 0;
+    last_mb_aff_frame  = h->mb_aff_frame;
     last_pic_structure = h->picture_structure;
     last_pic_droppable = h->droppable;
 
@@ -1095,7 +1321,12 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
     if (sps->frame_mbs_only_flag) {
         picture_structure = PICT_FRAME;
     } else {
+        if (!h->ps.sps->direct_8x8_inference_flag && slice_type == AV_PICTURE_TYPE_B) {
+            av_log(h->avctx, AV_LOG_ERROR, "This stream was generated by a broken encoder, invalid 8x8 inference\n");
+            return -1;
+        }
         field_pic_flag = get_bits1(&sl->gb);
+
         if (field_pic_flag) {
             bottom_field_flag = get_bits1(&sl->gb);
             picture_structure = PICT_TOP_FIELD + bottom_field_flag;
@@ -1104,16 +1335,11 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
             mb_aff_frame      = sps->mb_aff;
         }
     }
-    if (!h->setup_finished) {
-        h->droppable         = droppable;
-        h->picture_structure = picture_structure;
-        h->mb_aff_frame      = mb_aff_frame;
-    }
-    sl->mb_field_decoding_flag = h->picture_structure != PICT_FRAME;
 
-    if (h->current_slice != 0) {
+    if (h->current_slice) {
         if (last_pic_structure != picture_structure ||
-            last_pic_droppable != droppable) {
+            last_pic_droppable != droppable ||
+            last_mb_aff_frame  != mb_aff_frame) {
             av_log(h->avctx, AV_LOG_ERROR,
                    "Changing field mode (%d -> %d) between slices is not allowed\n",
                    last_pic_structure, h->picture_structure);
@@ -1124,7 +1350,16 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
                    h->current_slice + 1);
             return AVERROR_INVALIDDATA;
         }
-    } else {
+    }
+
+    if (!h->setup_finished) {
+        h->droppable         = droppable;
+        h->picture_structure = picture_structure;
+        h->mb_aff_frame      = mb_aff_frame;
+    }
+    sl->mb_field_decoding_flag = picture_structure != PICT_FRAME;
+
+    if (h->current_slice == 0) {
         /* Shorten frame num gaps so we don't have to allocate reference
          * frames just to throw them away */
         if (h->poc.frame_num != h->poc.prev_frame_num) {
@@ -1147,17 +1382,23 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
          * Here, we're using that to see if we should mark previously
          * decode frames as "finished".
          * We have to do that before the "dummy" in-between frame allocation,
-         * since that can modify s->current_picture_ptr. */
+         * since that can modify h->cur_pic_ptr. */
         if (h->first_field) {
-            assert(h->cur_pic_ptr);
-            assert(h->cur_pic_ptr->f->buf[0]);
+            av_assert0(h->cur_pic_ptr);
+            av_assert0(h->cur_pic_ptr->f->buf[0]);
             assert(h->cur_pic_ptr->reference != DELAYED_PIC_REF);
 
+            /* Mark old field/frame as completed */
+            if (h->cur_pic_ptr->tf.owner == h->avctx) {
+                ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX,
+                                          last_pic_structure == PICT_BOTTOM_FIELD);
+            }
+
             /* figure out if we have a complementary field pair */
             if (!FIELD_PICTURE(h) || h->picture_structure == last_pic_structure) {
                 /* Previous field is unmatched. Don't display it, but let it
                  * remain for reference if marked as such. */
-                if (!last_pic_droppable && last_pic_structure != PICT_FRAME) {
+                if (last_pic_structure != PICT_FRAME) {
                     ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX,
                                               last_pic_structure == PICT_TOP_FIELD);
                 }
@@ -1167,7 +1408,7 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
                      * different frame_nums. Consider this field first in
                      * pair. Throw away previous field except for reference
                      * purposes. */
-                    if (!last_pic_droppable && last_pic_structure != PICT_FRAME) {
+                    if (last_pic_structure != PICT_FRAME) {
                         ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX,
                                                   last_pic_structure == PICT_TOP_FIELD);
                     }
@@ -1194,12 +1435,15 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
             }
         }
 
-        while (h->poc.frame_num != h->poc.prev_frame_num &&
+        while (h->poc.frame_num != h->poc.prev_frame_num && !h->first_field &&
                h->poc.frame_num != (h->poc.prev_frame_num + 1) % (1 << sps->log2_max_frame_num)) {
             H264Picture *prev = h->short_ref_count ? h->short_ref[0] : NULL;
             av_log(h->avctx, AV_LOG_DEBUG, "Frame num gap %d %d\n",
                    h->poc.frame_num, h->poc.prev_frame_num);
-            ret = initialize_cur_frame(h);
+            if (!sps->gaps_in_frame_num_allowed_flag)
+                for(i=0; i<FF_ARRAY_ELEMS(h->last_pocs); i++)
+                    h->last_pocs[i] = INT_MIN;
+            ret = h264_frame_start(h);
             if (ret < 0) {
                 h->first_field = 0;
                 return ret;
@@ -1208,6 +1452,7 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
             h->poc.prev_frame_num++;
             h->poc.prev_frame_num        %= 1 << sps->log2_max_frame_num;
             h->cur_pic_ptr->frame_num = h->poc.prev_frame_num;
+            h->cur_pic_ptr->invalid_gap = !sps->gaps_in_frame_num_allowed_flag;
             ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX, 0);
             ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX, 1);
             ret = ff_generate_sliding_window_mmcos(h, 1);
@@ -1233,8 +1478,8 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
                                   (const uint8_t **)prev->f->data,
                                   prev->f->linesize,
                                   prev->f->format,
-                                  h->mb_width  * 16,
-                                  h->mb_height * 16);
+                                  prev->f->width,
+                                  prev->f->height);
                     h->short_ref[0]->poc = prev->poc + 2;
                 }
                 h->short_ref[0]->frame_num = h->poc.prev_frame_num;
@@ -1245,18 +1490,22 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
          * We're using that to see whether to continue decoding in that
          * frame, or to allocate a new one. */
         if (h->first_field) {
-            assert(h->cur_pic_ptr);
-            assert(h->cur_pic_ptr->f->buf[0]);
+            av_assert0(h->cur_pic_ptr);
+            av_assert0(h->cur_pic_ptr->f->buf[0]);
             assert(h->cur_pic_ptr->reference != DELAYED_PIC_REF);
 
             /* figure out if we have a complementary field pair */
             if (!FIELD_PICTURE(h) || h->picture_structure == last_pic_structure) {
                 /* Previous field is unmatched. Don't display it, but let it
                  * remain for reference if marked as such. */
+                h->missing_fields ++;
                 h->cur_pic_ptr = NULL;
                 h->first_field = FIELD_PICTURE(h);
             } else {
+                h->missing_fields = 0;
                 if (h->cur_pic_ptr->frame_num != h->poc.frame_num) {
+                    ff_thread_report_progress(&h->cur_pic_ptr->tf, INT_MAX,
+                                              h->picture_structure==PICT_BOTTOM_FIELD);
                     /* This and the previous field had different frame_nums.
                      * Consider this field first in pair. Throw away previous
                      * one except for reference purposes. */
@@ -1280,9 +1529,18 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
         } else {
             release_unused_pictures(h, 0);
         }
+        /* Some macroblocks can be accessed before they're available in case
+        * of lost slices, MBAFF or threading. */
+        if (FIELD_PICTURE(h)) {
+            for(i = (h->picture_structure == PICT_BOTTOM_FIELD); i<h->mb_height; i++)
+                memset(h->slice_table + i*h->mb_stride, -1, (h->mb_stride - (i+1==h->mb_height)) * sizeof(*h->slice_table));
+        } else {
+            memset(h->slice_table, -1,
+                (h->mb_height * h->mb_stride - 1) * sizeof(*h->slice_table));
+        }
     }
 
-    assert(h->mb_num == h->mb_width * h->mb_height);
+    av_assert1(h->mb_num == h->mb_width * h->mb_height);
     if (first_mb_in_slice << FIELD_OR_MBAFF_PICTURE(h) >= h->mb_num ||
         first_mb_in_slice >= h->mb_num) {
         av_log(h->avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
@@ -1293,7 +1551,7 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
                                  FIELD_OR_MBAFF_PICTURE(h);
     if (h->picture_structure == PICT_BOTTOM_FIELD)
         sl->resync_mb_y = sl->mb_y = sl->mb_y + 1;
-    assert(sl->mb_y < h->mb_height);
+    av_assert1(sl->mb_y < h->mb_height);
 
     if (h->picture_structure == PICT_FRAME) {
         h->curr_pic_num = h->poc.frame_num;
@@ -1304,7 +1562,7 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
     }
 
     if (h->nal_unit_type == NAL_IDR_SLICE)
-        get_ue_golomb(&sl->gb); /* idr_pic_id */
+        get_ue_golomb_long(&sl->gb); /* idr_pic_id */
 
     if (sps->poc_type == 0) {
         int poc_lsb = get_bits(&sl->gb, sps->log2_max_poc_lsb);
@@ -1345,7 +1603,7 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
 
     ret = ff_h264_parse_ref_count(&sl->list_count, sl->ref_count,
                                   &sl->gb, pps, sl->slice_type_nos,
-                                  h->picture_structure);
+                                  h->picture_structure, h->avctx);
     if (ret < 0)
         return ret;
 
@@ -1361,7 +1619,7 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
         (pps->weighted_bipred_idc == 1 &&
          sl->slice_type_nos == AV_PICTURE_TYPE_B))
         ff_h264_pred_weight_table(&sl->gb, sps, sl->ref_count,
-                                  sl->slice_type_nos, &sl->pwt);
+                                  sl->slice_type_nos, &sl->pwt, h->avctx);
     else if (pps->weighted_bipred_idc == 2 &&
              sl->slice_type_nos == AV_PICTURE_TYPE_B) {
         implicit_weight_table(h, sl, -1);
@@ -1455,6 +1713,8 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
 
     if (h->avctx->skip_loop_filter >= AVDISCARD_ALL ||
         (h->avctx->skip_loop_filter >= AVDISCARD_NONKEY &&
+         h->nal_unit_type != NAL_IDR_SLICE) ||
+        (h->avctx->skip_loop_filter >= AVDISCARD_NONINTRA &&
          sl->slice_type_nos != AV_PICTURE_TYPE_I) ||
         (h->avctx->skip_loop_filter >= AVDISCARD_BIDIR  &&
          sl->slice_type_nos == AV_PICTURE_TYPE_B) ||
@@ -1462,7 +1722,7 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
          h->nal_ref_idc == 0))
         sl->deblocking_filter = 0;
 
-    if (sl->deblocking_filter == 1 && h->nb_slice_ctx > 1) {
+    if (sl->deblocking_filter == 1 && h->max_contexts > 1) {
         if (h->avctx->flags2 & AV_CODEC_FLAG2_FAST) {
             /* Cheat slightly for speed:
              * Do not bother to deblock across slices. */
@@ -1479,9 +1739,14 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
                    6 * (sps->bit_depth_luma - 8);
 
     sl->slice_num       = ++h->current_slice;
-    if (sl->slice_num >= MAX_SLICES) {
-        av_log(h->avctx, AV_LOG_ERROR,
-               "Too many slices, increase MAX_SLICES and recompile\n");
+
+    if (sl->slice_num)
+        h->slice_row[(sl->slice_num-1)&(MAX_SLICES-1)]= sl->resync_mb_y;
+    if (   h->slice_row[sl->slice_num&(MAX_SLICES-1)] + 3 >= sl->resync_mb_y
+        && h->slice_row[sl->slice_num&(MAX_SLICES-1)] <= sl->resync_mb_y
+        && sl->slice_num >= MAX_SLICES) {
+        //in case of ASO this check needs to be updated depending on how we decide to assign slice numbers in this case
+        av_log(h->avctx, AV_LOG_WARNING, "Possibly too many slices (%d >= %d), increase MAX_SLICES and recompile if there are artifacts\n", sl->slice_num, MAX_SLICES);
     }
 
     for (j = 0; j < 2; j++) {
@@ -1517,6 +1782,9 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
                              (sl->ref_list[j][i].reference & 3);
     }
 
+    h->au_pps_id = pps_id;
+    h->current_sps_id = h->ps.pps->sps_id;
+
     if (h->avctx->debug & FF_DEBUG_PICT_INFO) {
         av_log(h->avctx, AV_LOG_DEBUG,
                "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
@@ -1574,12 +1842,12 @@ static av_always_inline void fill_filter_caches_inter(const H264Context *h,
         if (USES_LIST(top_type, list)) {
             const int b_xy  = h->mb2b_xy[top_xy] + 3 * b_stride;
             const int b8_xy = 4 * top_xy + 2;
-            int (*ref2frm)[64] = h->ref2frm[h->slice_table[top_xy] & (MAX_SLICES - 1)][0] + (MB_MBAFF(sl) ? 20 : 2);
+            const int *ref2frm = h->ref2frm[h->slice_table[top_xy] & (MAX_SLICES - 1)][list] + (MB_MBAFF(sl) ? 20 : 2);
             AV_COPY128(mv_dst - 1 * 8, h->cur_pic.motion_val[list][b_xy + 0]);
             ref_cache[0 - 1 * 8] =
-            ref_cache[1 - 1 * 8] = ref2frm[list][h->cur_pic.ref_index[list][b8_xy + 0]];
+            ref_cache[1 - 1 * 8] = ref2frm[h->cur_pic.ref_index[list][b8_xy + 0]];
             ref_cache[2 - 1 * 8] =
-            ref_cache[3 - 1 * 8] = ref2frm[list][h->cur_pic.ref_index[list][b8_xy + 1]];
+            ref_cache[3 - 1 * 8] = ref2frm[h->cur_pic.ref_index[list][b8_xy + 1]];
         } else {
             AV_ZERO128(mv_dst - 1 * 8);
             AV_WN32A(&ref_cache[0 - 1 * 8], ((LIST_NOT_USED) & 0xFF) * 0x01010101u);
@@ -1589,15 +1857,15 @@ static av_always_inline void fill_filter_caches_inter(const H264Context *h,
             if (USES_LIST(left_type[LTOP], list)) {
                 const int b_xy  = h->mb2b_xy[left_xy[LTOP]] + 3;
                 const int b8_xy = 4 * left_xy[LTOP] + 1;
-                int (*ref2frm)[64] = h->ref2frm[h->slice_table[left_xy[LTOP]] & (MAX_SLICES - 1)][0] + (MB_MBAFF(sl) ? 20 : 2);
+                const int *ref2frm = h->ref2frm[h->slice_table[left_xy[LTOP]] & (MAX_SLICES - 1)][list] + (MB_MBAFF(sl) ? 20 : 2);
                 AV_COPY32(mv_dst - 1 +  0, h->cur_pic.motion_val[list][b_xy + b_stride * 0]);
                 AV_COPY32(mv_dst - 1 +  8, h->cur_pic.motion_val[list][b_xy + b_stride * 1]);
                 AV_COPY32(mv_dst - 1 + 16, h->cur_pic.motion_val[list][b_xy + b_stride * 2]);
                 AV_COPY32(mv_dst - 1 + 24, h->cur_pic.motion_val[list][b_xy + b_stride * 3]);
                 ref_cache[-1 +  0] =
-                ref_cache[-1 +  8] = ref2frm[list][h->cur_pic.ref_index[list][b8_xy + 2 * 0]];
+                ref_cache[-1 +  8] = ref2frm[h->cur_pic.ref_index[list][b8_xy + 2 * 0]];
                 ref_cache[-1 + 16] =
-                ref_cache[-1 + 24] = ref2frm[list][h->cur_pic.ref_index[list][b8_xy + 2 * 1]];
+                ref_cache[-1 + 24] = ref2frm[h->cur_pic.ref_index[list][b8_xy + 2 * 1]];
             } else {
                 AV_ZERO32(mv_dst - 1 +  0);
                 AV_ZERO32(mv_dst - 1 +  8);
@@ -1622,9 +1890,9 @@ static av_always_inline void fill_filter_caches_inter(const H264Context *h,
 
     {
         int8_t *ref = &h->cur_pic.ref_index[list][4 * mb_xy];
-        int (*ref2frm)[64] = h->ref2frm[sl->slice_num & (MAX_SLICES - 1)][0] + (MB_MBAFF(sl) ? 20 : 2);
-        uint32_t ref01 = (pack16to32(ref2frm[list][ref[0]], ref2frm[list][ref[1]]) & 0x00FF00FF) * 0x0101;
-        uint32_t ref23 = (pack16to32(ref2frm[list][ref[2]], ref2frm[list][ref[3]]) & 0x00FF00FF) * 0x0101;
+        const int *ref2frm = h->ref2frm[sl->slice_num & (MAX_SLICES - 1)][list] + (MB_MBAFF(sl) ? 20 : 2);
+        uint32_t ref01 = (pack16to32(ref2frm[ref[0]], ref2frm[ref[1]]) & 0x00FF00FF) * 0x0101;
+        uint32_t ref23 = (pack16to32(ref2frm[ref[2]], ref2frm[ref[3]]) & 0x00FF00FF) * 0x0101;
         AV_WN32A(&ref_cache[0 * 8], ref01);
         AV_WN32A(&ref_cache[1 * 8], ref01);
         AV_WN32A(&ref_cache[2 * 8], ref23);
@@ -1892,7 +2160,7 @@ static void decode_finish_row(const H264Context *h, H264SliceContext *sl)
 
     ff_h264_draw_horiz_band(h, sl, top, height);
 
-    if (h->droppable)
+    if (h->droppable || sl->h264->slice_ctx[0].er.error_occurred)
         return;
 
     ff_thread_report_progress(&h->cur_pic_ptr->tf, top + height - 1,
@@ -1903,15 +2171,14 @@ static void er_add_slice(H264SliceContext *sl,
                          int startx, int starty,
                          int endx, int endy, int status)
 {
-#if CONFIG_ERROR_RESILIENCE
-    ERContext *er = &sl->er;
-
     if (!sl->h264->enable_er)
         return;
 
-    er->ref_count = sl->ref_count[0];
-    ff_er_add_slice(er, startx, starty, endx, endy, status);
-#endif
+    if (CONFIG_ERROR_RESILIENCE) {
+        ERContext *er = &sl->h264->slice_ctx[0].er;
+
+        ff_er_add_slice(er, startx, starty, endx, endy, status);
+    }
 }
 
 static int decode_slice(struct AVCodecContext *avctx, void *arg)
@@ -1931,6 +2198,8 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg)
 
     sl->mb_skip_run = -1;
 
+    av_assert0(h->block_offset[15] == (4 * ((scan8[15] - scan8[0]) & 7) << h->pixel_shift) + 4 * sl->linesize * ((scan8[15] - scan8[0]) >> 3));
+
     if (h->postpone_filter)
         sl->deblocking_filter = 0;
 
@@ -1938,24 +2207,37 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg)
                      avctx->codec_id != AV_CODEC_ID_H264 ||
                      (CONFIG_GRAY && (h->flags & AV_CODEC_FLAG_GRAY));
 
+    if (!(h->avctx->active_thread_type & FF_THREAD_SLICE) && h->picture_structure == PICT_FRAME && h->slice_ctx[0].er.error_status_table) {
+        const int start_i  = av_clip(sl->resync_mb_x + sl->resync_mb_y * h->mb_width, 0, h->mb_num - 1);
+        if (start_i) {
+            int prev_status = h->slice_ctx[0].er.error_status_table[h->slice_ctx[0].er.mb_index2xy[start_i - 1]];
+            prev_status &= ~ VP_START;
+            if (prev_status != (ER_MV_END | ER_DC_END | ER_AC_END))
+                h->slice_ctx[0].er.error_occurred = 1;
+        }
+    }
+
     if (h->ps.pps->cabac) {
         /* realign */
         align_get_bits(&sl->gb);
 
         /* init cabac */
-        ff_init_cabac_decoder(&sl->cabac,
+        ret = ff_init_cabac_decoder(&sl->cabac,
                               sl->gb.buffer + get_bits_count(&sl->gb) / 8,
                               (get_bits_left(&sl->gb) + 7) / 8);
+        if (ret < 0)
+            return ret;
 
         ff_h264_init_cabac_states(h, sl);
 
         for (;;) {
             // START_TIMER
             int ret, eos;
-
             if (sl->mb_x + sl->mb_y * h->mb_width >= sl->next_slice_idx) {
                 av_log(h->avctx, AV_LOG_ERROR, "Slice overlaps with next at %d\n",
                        sl->next_slice_idx);
+                er_add_slice(sl, sl->resync_mb_x, sl->resync_mb_y, sl->mb_x,
+                             sl->mb_y, ER_MB_ERROR);
                 return AVERROR_INVALIDDATA;
             }
 
@@ -1985,9 +2267,11 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg)
                     loop_filter(h, sl, lf_x_start, sl->mb_x + 1);
                 goto finish;
             }
-            if (ret < 0 || sl->cabac.bytestream > sl->cabac.bytestream_end + 2) {
+            if (sl->cabac.bytestream > sl->cabac.bytestream_end + 2 )
+                av_log(h->avctx, AV_LOG_DEBUG, "bytestream overread %"PTRDIFF_SPECIFIER"\n", sl->cabac.bytestream_end - sl->cabac.bytestream);
+            if (ret < 0 || sl->cabac.bytestream > sl->cabac.bytestream_end + 4) {
                 av_log(h->avctx, AV_LOG_ERROR,
-                       "error while decoding MB %d %d, bytestream %td\n",
+                       "error while decoding MB %d %d, bytestream %"PTRDIFF_SPECIFIER"\n",
                        sl->mb_x, sl->mb_y,
                        sl->cabac.bytestream_end - sl->cabac.bytestream);
                 er_add_slice(sl, sl->resync_mb_x, sl->resync_mb_y, sl->mb_x,
@@ -2024,6 +2308,8 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg)
             if (sl->mb_x + sl->mb_y * h->mb_width >= sl->next_slice_idx) {
                 av_log(h->avctx, AV_LOG_ERROR, "Slice overlaps with next at %d\n",
                        sl->next_slice_idx);
+                er_add_slice(sl, sl->resync_mb_x, sl->resync_mb_y, sl->mb_x,
+                             sl->mb_y, ER_MB_ERROR);
                 return AVERROR_INVALIDDATA;
             }
 
@@ -2064,14 +2350,15 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg)
                     ff_tlog(h->avctx, "slice end %d %d\n",
                             get_bits_count(&sl->gb), sl->gb.size_in_bits);
 
-                    if (get_bits_left(&sl->gb) == 0) {
+                    if (   get_bits_left(&sl->gb) == 0
+                        || get_bits_left(&sl->gb) > 0 && !(h->avctx->err_recognition & AV_EF_AGGRESSIVE)) {
                         er_add_slice(sl, sl->resync_mb_x, sl->resync_mb_y,
                                      sl->mb_x - 1, sl->mb_y, ER_MB_END);
 
                         goto finish;
                     } else {
                         er_add_slice(sl, sl->resync_mb_x, sl->resync_mb_y,
-                                     sl->mb_x - 1, sl->mb_y, ER_MB_END);
+                                     sl->mb_x, sl->mb_y, ER_MB_END);
 
                         return AVERROR_INVALIDDATA;
                     }
@@ -2116,7 +2403,15 @@ int ff_h264_execute_decode_slices(H264Context *h, unsigned context_count)
     H264SliceContext *sl;
     int i, j;
 
-    if (h->avctx->hwaccel)
+    av_assert0(context_count && h->slice_ctx[context_count - 1].mb_y < h->mb_height);
+
+    h->slice_ctx[0].next_slice_idx = INT_MAX;
+
+    if (h->avctx->hwaccel
+#if FF_API_CAP_VDPAU
+        || h->avctx->codec->capabilities & AV_CODEC_CAP_HWACCEL_VDPAU
+#endif
+        )
         return 0;
     if (context_count == 1) {
         int ret;
@@ -2128,12 +2423,15 @@ int ff_h264_execute_decode_slices(H264Context *h, unsigned context_count)
         h->mb_y = h->slice_ctx[0].mb_y;
         return ret;
     } else {
+        av_assert0(context_count > 0);
         for (i = 0; i < context_count; i++) {
             int next_slice_idx = h->mb_width * h->mb_height;
             int slice_idx;
 
             sl                 = &h->slice_ctx[i];
-            sl->er.error_count = 0;
+            if (CONFIG_ERROR_RESILIENCE) {
+                sl->er.error_count = 0;
+            }
 
             /* make sure none of those slices overlap */
             slice_idx = sl->mb_y * h->mb_width + sl->mb_x;
@@ -2154,8 +2452,10 @@ int ff_h264_execute_decode_slices(H264Context *h, unsigned context_count)
         /* pull back stuff from slices to master context */
         sl                   = &h->slice_ctx[context_count - 1];
         h->mb_y              = sl->mb_y;
-        for (i = 1; i < context_count; i++)
-            h->slice_ctx[0].er.error_count += h->slice_ctx[i].er.error_count;
+        if (CONFIG_ERROR_RESILIENCE) {
+            for (i = 1; i < context_count; i++)
+                h->slice_ctx[0].er.error_count += h->slice_ctx[i].er.error_count;
+        }
 
         if (h->postpone_filter) {
             h->postpone_filter = 0;
diff --git a/libavcodec/h264addpx_template.c b/libavcodec/h264addpx_template.c
index e3adfe2..b71aaea 100644
--- a/libavcodec/h264addpx_template.c
+++ b/libavcodec/h264addpx_template.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/h264chroma.c b/libavcodec/h264chroma.c
index d5146de..c2f1f30 100644
--- a/libavcodec/h264chroma.c
+++ b/libavcodec/h264chroma.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,9 +32,11 @@
     c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_ ## depth ## _c; \
     c->put_h264_chroma_pixels_tab[1] = put_h264_chroma_mc4_ ## depth ## _c; \
     c->put_h264_chroma_pixels_tab[2] = put_h264_chroma_mc2_ ## depth ## _c; \
+    c->put_h264_chroma_pixels_tab[3] = put_h264_chroma_mc1_ ## depth ## _c; \
     c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_ ## depth ## _c; \
     c->avg_h264_chroma_pixels_tab[1] = avg_h264_chroma_mc4_ ## depth ## _c; \
     c->avg_h264_chroma_pixels_tab[2] = avg_h264_chroma_mc2_ ## depth ## _c; \
+    c->avg_h264_chroma_pixels_tab[3] = avg_h264_chroma_mc1_ ## depth ## _c; \
 
 av_cold void ff_h264chroma_init(H264ChromaContext *c, int bit_depth)
 {
@@ -52,4 +54,6 @@ av_cold void ff_h264chroma_init(H264ChromaContext *c, int bit_depth)
         ff_h264chroma_init_ppc(c, bit_depth);
     if (ARCH_X86)
         ff_h264chroma_init_x86(c, bit_depth);
+    if (ARCH_MIPS)
+        ff_h264chroma_init_mips(c, bit_depth);
 }
diff --git a/libavcodec/h264chroma.h b/libavcodec/h264chroma.h
index 93064fe..e0f45ad 100644
--- a/libavcodec/h264chroma.h
+++ b/libavcodec/h264chroma.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,8 +24,8 @@
 typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
 
 typedef struct H264ChromaContext {
-    h264_chroma_mc_func put_h264_chroma_pixels_tab[3];
-    h264_chroma_mc_func avg_h264_chroma_pixels_tab[3];
+    h264_chroma_mc_func put_h264_chroma_pixels_tab[4]; /* slots 0..3 receive the put_ mc8, mc4, mc2 and mc1 functions from the C setup macro in h264chroma.c */
+    h264_chroma_mc_func avg_h264_chroma_pixels_tab[4]; /* same slot order, filled with the avg_ variants */
 } H264ChromaContext;
 
 void ff_h264chroma_init(H264ChromaContext *c, int bit_depth);
@@ -34,5 +34,6 @@ void ff_h264chroma_init_aarch64(H264ChromaContext *c, int bit_depth);
 void ff_h264chroma_init_arm(H264ChromaContext *c, int bit_depth);
 void ff_h264chroma_init_ppc(H264ChromaContext *c, int bit_depth);
 void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth);
+void ff_h264chroma_init_mips(H264ChromaContext *c, int bit_depth);
 
 #endif /* AVCODEC_H264CHROMA_H */
diff --git a/libavcodec/h264chroma_template.c b/libavcodec/h264chroma_template.c
index 028ed13..072b5e0 100644
--- a/libavcodec/h264chroma_template.c
+++ b/libavcodec/h264chroma_template.c
@@ -2,28 +2,62 @@
  * Copyright (c) 2000, 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <assert.h>
+#include "libavutil/avassert.h"
 
 #include "bit_depth_template.c"
 
 #define H264_CHROMA_MC(OPNAME, OP)\
+static void FUNCC(OPNAME ## h264_chroma_mc1)(uint8_t *_dst/*align 8*/, uint8_t *_src/*align 1*/, int stride, int h, int x, int y){ /* 1-pixel-wide variant: only dst[0] is written on each of the h rows; OP (macro argument) receives the destination and the weighted sum */\
+    pixel *dst = (pixel*)_dst;\
+    pixel *src = (pixel*)_src;\
+    const int A=(8-x)*(8-y); /* weight of src[0]; A+B+C+D is always 64 */\
+    const int B=(  x)*(8-y); /* weight of the right neighbour, src[1] */\
+    const int C=(8-x)*(  y); /* weight of the sample one row down, src[stride] */\
+    const int D=(  x)*(  y); /* weight of the diagonal neighbour, src[stride+1] */\
+    int i;\
+    stride >>= sizeof(pixel)-1; /* byte stride -> pixel units; the shift is a divide by sizeof(pixel) when that size is 1 or 2 */\
+    \
+    av_assert2(x<8 && y<8 && x>=0 && y>=0);\
+\
+    if(D){ /* x and y both non-zero: all four taps contribute */\
+        for(i=0; i<h; i++){\
+            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
+            dst+= stride;\
+            src+= stride;\
+        }\
+    } else if (B + C) { /* D == 0 here, so with x, y in 0..7 exactly one of them is non-zero: two taps suffice */\
+        const int E= B+C; /* whichever of B and C is non-zero */\
+        const int step= C ? stride : 1; /* second tap: row below when C != 0 (y != 0), otherwise the right neighbour */\
+        for(i=0; i<h; i++){\
+            OP(dst[0], (A*src[0] + E*src[step+0]));\
+            dst+= stride;\
+            src+= stride;\
+        }\
+    } else { /* x == 0 and y == 0: A is 64 and only src[0] is read */\
+        for(i=0; i<h; i++){\
+            OP(dst[0], (A*src[0]));\
+            dst+= stride;\
+            src+= stride;\
+        }\
+    }\
+}\
 static void FUNCC(OPNAME ## h264_chroma_mc2)(uint8_t *_dst/*align 8*/, uint8_t *_src/*align 1*/, int stride, int h, int x, int y){\
     pixel *dst = (pixel*)_dst;\
     pixel *src = (pixel*)_src;\
@@ -32,9 +66,9 @@ static void FUNCC(OPNAME ## h264_chroma_mc2)(uint8_t *_dst/*align 8*/, uint8_t *
     const int C=(8-x)*(  y);\
     const int D=(  x)*(  y);\
     int i;\
-    stride /= sizeof(pixel);\
+    stride >>= sizeof(pixel)-1;\
     \
-    assert(x<8 && y<8 && x>=0 && y>=0);\
+    av_assert2(x<8 && y<8 && x>=0 && y>=0);\
 \
     if(D){\
         for(i=0; i<h; i++){\
@@ -70,9 +104,9 @@ static void FUNCC(OPNAME ## h264_chroma_mc4)(uint8_t *_dst/*align 8*/, uint8_t *
     const int C=(8-x)*(  y);\
     const int D=(  x)*(  y);\
     int i;\
-    stride /= sizeof(pixel);\
+    stride >>= sizeof(pixel)-1;\
     \
-    assert(x<8 && y<8 && x>=0 && y>=0);\
+    av_assert2(x<8 && y<8 && x>=0 && y>=0);\
 \
     if(D){\
         for(i=0; i<h; i++){\
@@ -114,9 +148,9 @@ static void FUNCC(OPNAME ## h264_chroma_mc8)(uint8_t *_dst/*align 8*/, uint8_t *
     const int C=(8-x)*(  y);\
     const int D=(  x)*(  y);\
     int i;\
-    stride /= sizeof(pixel);\
+    stride >>= sizeof(pixel)-1;\
     \
-    assert(x<8 && y<8 && x>=0 && y>=0);\
+    av_assert2(x<8 && y<8 && x>=0 && y>=0);\
 \
     if(D){\
         for(i=0; i<h; i++){\
diff --git a/libavcodec/h264data.c b/libavcodec/h264data.c
index 79c5b57..da203a9 100644
--- a/libavcodec/h264data.c
+++ b/libavcodec/h264data.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -174,13 +174,17 @@ const uint8_t ff_h264_dequant8_coeff_init[6][6] = {
 const uint8_t ff_h264_quant_rem6[QP_MAX_NUM + 1] = {
     0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
     3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
-    0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+    0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, /* indices 42..62 */
+    3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, /* indices 63..83 */
+    0, 1, 2, 3, /* indices 84..87: the table now lists 88 values, each equal to index % 6 */
 };
 
 const uint8_t ff_h264_quant_div6[QP_MAX_NUM + 1] = {
     0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3,  3,  3,
     3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6,  6,  6,
-    7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10,
+    7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, /* indices 42..62 */
+   10,10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13,13, 13, 13, 13, /* indices 63..83 */
+   14,14,14,14, /* indices 84..87: the table now lists 88 values, each equal to index / 6 */
 };
 
 #define QP(qP, depth) ((qP) + 6 * ((depth) - 8))
@@ -196,11 +200,33 @@ const uint8_t ff_h264_quant_div6[QP_MAX_NUM + 1] = {
     QP(37, d), QP(37, d), QP(37, d), QP(38, d), QP(38, d), QP(38, d),   \
     QP(39, d), QP(39, d), QP(39, d), QP(39, d)
 
-const uint8_t ff_h264_chroma_qp[3][QP_MAX_NUM + 1] = {
+const uint8_t ff_h264_chroma_qp[7][QP_MAX_NUM + 1] = { /* one row per CHROMA_QP_TABLE_END argument 8..14; the row for argument d starts with the 6*(d-8) identity values 0..6*(d-8)-1 */
     { CHROMA_QP_TABLE_END(8) },
     { 0, 1, 2, 3, 4, 5,
       CHROMA_QP_TABLE_END(9) },
-    { 0, 1, 2, 3, 4, 5,
+    { 0, 1, 2, 3,  4,  5,
       6, 7, 8, 9, 10, 11,
       CHROMA_QP_TABLE_END(10) },
+    { 0,  1, 2, 3,  4,  5, /* row 3: 18 identity values, then the d = 11 tail */
+      6,  7, 8, 9, 10, 11,
+      12,13,14,15, 16, 17,
+      CHROMA_QP_TABLE_END(11) },
+    { 0,  1, 2, 3,  4,  5, /* row 4: 24 identity values, then the d = 12 tail */
+      6,  7, 8, 9, 10, 11,
+      12,13,14,15, 16, 17,
+      18,19,20,21, 22, 23,
+      CHROMA_QP_TABLE_END(12) },
+    { 0,  1, 2, 3,  4,  5, /* row 5: 30 identity values, then the d = 13 tail */
+      6,  7, 8, 9, 10, 11,
+      12,13,14,15, 16, 17,
+      18,19,20,21, 22, 23,
+      24,25,26,27, 28, 29,
+      CHROMA_QP_TABLE_END(13) },
+    { 0,  1, 2, 3,  4,  5, /* row 6: 36 identity values, then the d = 14 tail */
+      6,  7, 8, 9, 10, 11,
+      12,13,14,15, 16, 17,
+      18,19,20,21, 22, 23,
+      24,25,26,27, 28, 29,
+      30,31,32,33, 34, 35,
+      CHROMA_QP_TABLE_END(14) },
 };
diff --git a/libavcodec/h264data.h b/libavcodec/h264data.h
index ab96f08..988352a 100644
--- a/libavcodec/h264data.h
+++ b/libavcodec/h264data.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -48,12 +48,32 @@ extern const PMbInfo ff_h264_p_sub_mb_type_info[4];
 extern const PMbInfo ff_h264_b_mb_type_info[23];
 extern const PMbInfo ff_h264_b_sub_mb_type_info[13];
 
+static const AVRational ff_h264_pixel_aspect[17] = { /* fixed table of 17 ratios; presumably looked up by the VUI aspect_ratio_idc -- confirm at the use site */
+    {   0,  1 }, /* index 0: the only entry with a zero first member */
+    {   1,  1 },
+    {  12, 11 },
+    {  10, 11 },
+    {  16, 11 },
+    {  40, 33 },
+    {  24, 11 },
+    {  20, 11 },
+    {  32, 11 },
+    {  80, 33 },
+    {  18, 11 },
+    {  15, 11 },
+    {  64, 33 },
+    { 160, 99 },
+    {   4,  3 },
+    {   3,  2 },
+    {   2,  1 },
+}; /* NOTE(review): 'static const' in a header, so each translation unit that includes h264data.h gets its own private definition */
+
+
 extern const uint8_t ff_h264_dequant4_coeff_init[6][3];
 extern const uint8_t ff_h264_dequant8_coeff_init_scan[16];
 extern const uint8_t ff_h264_dequant8_coeff_init[6][6];
 extern const uint8_t ff_h264_quant_rem6[QP_MAX_NUM + 1];
 extern const uint8_t ff_h264_quant_div6[QP_MAX_NUM + 1];
 
-extern const uint8_t ff_h264_chroma_qp[3][QP_MAX_NUM + 1];
+extern const uint8_t ff_h264_chroma_qp[7][QP_MAX_NUM + 1];
 
 #endif /* AVCODEC_H264DATA_H */
diff --git a/libavcodec/h264dsp.c b/libavcodec/h264dsp.c
index 684566b..d26f552 100644
--- a/libavcodec/h264dsp.c
+++ b/libavcodec/h264dsp.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,8 @@
 #include <stdint.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
+
 #include "avcodec.h"
 #include "h264dsp.h"
 #include "h264idct.h"
@@ -46,6 +48,14 @@
 #include "h264dsp_template.c"
 #undef BIT_DEPTH
 
+#define BIT_DEPTH 12
+#include "h264dsp_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 14
+#include "h264dsp_template.c"
+#undef BIT_DEPTH
+
 #define BIT_DEPTH 8
 #include "h264addpx_template.c"
 #undef BIT_DEPTH
@@ -130,7 +140,14 @@ av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth,
     case 10:
         H264_DSP(10);
         break;
+    case 12:
+        H264_DSP(12);
+        break;
+    case 14:
+        H264_DSP(14);
+        break;
     default:
+        av_assert0(bit_depth<=8);
         H264_DSP(8);
         break;
     }
@@ -140,4 +157,5 @@ av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth,
     if (ARCH_ARM) ff_h264dsp_init_arm(c, bit_depth, chroma_format_idc);
     if (ARCH_PPC) ff_h264dsp_init_ppc(c, bit_depth, chroma_format_idc);
     if (ARCH_X86) ff_h264dsp_init_x86(c, bit_depth, chroma_format_idc);
+    if (ARCH_MIPS) ff_h264dsp_init_mips(c, bit_depth, chroma_format_idc);
 }
diff --git a/libavcodec/h264dsp.h b/libavcodec/h264dsp.h
index 3a5b25b..7f24376 100644
--- a/libavcodec/h264dsp.h
+++ b/libavcodec/h264dsp.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -126,5 +126,7 @@ void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth,
                          const int chroma_format_idc);
 void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
                          const int chroma_format_idc);
+void ff_h264dsp_init_mips(H264DSPContext *c, const int bit_depth,
+                          const int chroma_format_idc);
 
 #endif /* AVCODEC_H264DSP_H */
diff --git a/libavcodec/h264dsp_template.c b/libavcodec/h264dsp_template.c
index c2d1394..d9dcf6b 100644
--- a/libavcodec/h264dsp_template.c
+++ b/libavcodec/h264dsp_template.c
@@ -1,21 +1,21 @@
 /*
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
- * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,8 +35,8 @@ static void FUNCC(weight_h264_pixels ## W)(uint8_t *_block, int stride, int heig
 { \
     int y; \
     pixel *block = (pixel*)_block; \
-    stride /= sizeof(pixel); \
-    offset <<= (log2_denom + (BIT_DEPTH-8)); \
+    stride >>= sizeof(pixel)-1; \
+    offset = (unsigned)offset << (log2_denom + (BIT_DEPTH-8)); \
     if(log2_denom) offset += 1<<(log2_denom-1); \
     for (y = 0; y < height; y++, block += stride) { \
         op_scale1(0); \
@@ -66,9 +66,9 @@ static void FUNCC(biweight_h264_pixels ## W)(uint8_t *_dst, uint8_t *_src, int s
     int y; \
     pixel *dst = (pixel*)_dst; \
     pixel *src = (pixel*)_src; \
-    stride /= sizeof(pixel); \
-    offset <<= (BIT_DEPTH-8); \
-    offset = ((offset + 1) | 1) << log2_denom; \
+    stride >>= sizeof(pixel)-1; \
+    offset = (unsigned)offset << (BIT_DEPTH-8); \
+    offset = (unsigned)((offset + 1) | 1) << log2_denom; \
     for (y = 0; y < height; y++, dst += stride, src += stride) { \
         op_scale2(0); \
         op_scale2(1); \
@@ -101,16 +101,16 @@ H264_WEIGHT(2)
 #undef op_scale2
 #undef H264_WEIGHT
 
-static av_always_inline av_flatten void FUNCC(h264_loop_filter_luma)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta, int8_t *tc0)
+static av_always_inline av_flatten void FUNCC(h264_loop_filter_luma)(uint8_t *p_pix, int xstride, int ystride, int inner_iters, int alpha, int beta, int8_t *tc0)
 {
-    pixel *pix = (pixel*)_pix;
+    pixel *pix = (pixel*)p_pix;
     int i, d;
-    xstride /= sizeof(pixel);
-    ystride /= sizeof(pixel);
+    xstride >>= sizeof(pixel)-1;
+    ystride >>= sizeof(pixel)-1;
     alpha <<= BIT_DEPTH - 8;
     beta  <<= BIT_DEPTH - 8;
     for( i = 0; i < 4; i++ ) {
-        const int tc_orig = tc0[i] << (BIT_DEPTH - 8);
+        const int tc_orig = tc0[i] * (1 << (BIT_DEPTH - 8));
         if( tc_orig < 0 ) {
             pix += inner_iters*ystride;
             continue;
@@ -141,7 +141,7 @@ static av_always_inline av_flatten void FUNCC(h264_loop_filter_luma)(uint8_t *_p
                     tc++;
                 }
 
-                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
+                i_delta = av_clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );
                 pix[-xstride] = av_clip_pixel( p0 + i_delta );    /* p0' */
                 pix[0]        = av_clip_pixel( q0 - i_delta );    /* q0' */
             }
@@ -162,12 +162,12 @@ static void FUNCC(h264_h_loop_filter_luma_mbaff)(uint8_t *pix, int stride, int a
     FUNCC(h264_loop_filter_luma)(pix, sizeof(pixel), stride, 2, alpha, beta, tc0);
 }
 
-static av_always_inline av_flatten void FUNCC(h264_loop_filter_luma_intra)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta)
+static av_always_inline av_flatten void FUNCC(h264_loop_filter_luma_intra)(uint8_t *p_pix, int xstride, int ystride, int inner_iters, int alpha, int beta)
 {
-    pixel *pix = (pixel*)_pix;
+    pixel *pix = (pixel*)p_pix;
     int d;
-    xstride /= sizeof(pixel);
-    ystride /= sizeof(pixel);
+    xstride >>= sizeof(pixel)-1;
+    ystride >>= sizeof(pixel)-1;
     alpha <<= BIT_DEPTH - 8;
     beta  <<= BIT_DEPTH - 8;
     for( d = 0; d < 4 * inner_iters; d++ ) {
@@ -228,16 +228,16 @@ static void FUNCC(h264_h_loop_filter_luma_mbaff_intra)(uint8_t *pix, int stride,
     FUNCC(h264_loop_filter_luma_intra)(pix, sizeof(pixel), stride, 2, alpha, beta);
 }
 
-static av_always_inline av_flatten void FUNCC(h264_loop_filter_chroma)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta, int8_t *tc0)
+static av_always_inline av_flatten void FUNCC(h264_loop_filter_chroma)(uint8_t *p_pix, int xstride, int ystride, int inner_iters, int alpha, int beta, int8_t *tc0)
 {
-    pixel *pix = (pixel*)_pix;
+    pixel *pix = (pixel*)p_pix;
     int i, d;
-    xstride /= sizeof(pixel);
-    ystride /= sizeof(pixel);
     alpha <<= BIT_DEPTH - 8;
     beta  <<= BIT_DEPTH - 8;
+    xstride >>= sizeof(pixel)-1;
+    ystride >>= sizeof(pixel)-1;
     for( i = 0; i < 4; i++ ) {
-        const int tc = ((tc0[i] - 1) << (BIT_DEPTH - 8)) + 1;
+        const int tc = ((tc0[i] - 1U) << (BIT_DEPTH - 8)) + 1;
         if( tc <= 0 ) {
             pix += inner_iters*ystride;
             continue;
@@ -252,7 +252,7 @@ static av_always_inline av_flatten void FUNCC(h264_loop_filter_chroma)(uint8_t *
                 FFABS( p1 - p0 ) < beta &&
                 FFABS( q1 - q0 ) < beta ) {
 
-                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
+                int delta = av_clip( ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc );
 
                 pix[-xstride] = av_clip_pixel( p0 + delta );    /* p0' */
                 pix[0]        = av_clip_pixel( q0 - delta );    /* q0' */
@@ -282,12 +282,12 @@ static void FUNCC(h264_h_loop_filter_chroma422_mbaff)(uint8_t *pix, int stride,
     FUNCC(h264_loop_filter_chroma)(pix, sizeof(pixel), stride, 2, alpha, beta, tc0);
 }
 
-static av_always_inline av_flatten void FUNCC(h264_loop_filter_chroma_intra)(uint8_t *_pix, int xstride, int ystride, int inner_iters, int alpha, int beta)
+static av_always_inline av_flatten void FUNCC(h264_loop_filter_chroma_intra)(uint8_t *p_pix, int xstride, int ystride, int inner_iters, int alpha, int beta)
 {
-    pixel *pix = (pixel*)_pix;
+    pixel *pix = (pixel*)p_pix;
     int d;
-    xstride /= sizeof(pixel);
-    ystride /= sizeof(pixel);
+    xstride >>= sizeof(pixel)-1;
+    ystride >>= sizeof(pixel)-1;
     alpha <<= BIT_DEPTH - 8;
     beta  <<= BIT_DEPTH - 8;
     for( d = 0; d < 4 * inner_iters; d++ ) {
diff --git a/libavcodec/h264idct.c b/libavcodec/h264idct.c
index ea08d03..6a771af 100644
--- a/libavcodec/h264idct.c
+++ b/libavcodec/h264idct.c
@@ -2,20 +2,20 @@
  * H.264 IDCT
  * Copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,3 +38,11 @@
 #define BIT_DEPTH 10
 #include "h264idct_template.c"
 #undef BIT_DEPTH
+
+#define BIT_DEPTH 12
+#include "h264idct_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 14
+#include "h264idct_template.c"
+#undef BIT_DEPTH
diff --git a/libavcodec/h264idct.h b/libavcodec/h264idct.h
index 816a825..17e0051 100644
--- a/libavcodec/h264idct.h
+++ b/libavcodec/h264idct.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,5 +38,7 @@ void ff_h264_chroma_dc_dequant_idct_ ## depth ## _c(int16_t *block, int qmul);
 H264_IDCT( 8)
 H264_IDCT( 9)
 H264_IDCT(10)
+H264_IDCT(12)
+H264_IDCT(14)
 
 #endif /* AVCODEC_H264IDCT_H */
diff --git a/libavcodec/h264idct_template.c b/libavcodec/h264idct_template.c
index 83c2a95..abf888e 100644
--- a/libavcodec/h264idct_template.c
+++ b/libavcodec/h264idct_template.c
@@ -2,20 +2,20 @@
  * H.264 IDCT
  * Copyright (c) 2004-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,7 +35,7 @@ void FUNCC(ff_h264_idct_add)(uint8_t *_dst, int16_t *_block, int stride)
     int i;
     pixel *dst = (pixel*)_dst;
     dctcoef *block = (dctcoef*)_block;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     block[0] += 1 << 5;
 
@@ -70,7 +70,7 @@ void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride){
     int i;
     pixel *dst = (pixel*)_dst;
     dctcoef *block = (dctcoef*)_block;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     block[0] += 32;
 
diff --git a/libavcodec/h264pred.c b/libavcodec/h264pred.c
index 7627eb0..5632a58 100644
--- a/libavcodec/h264pred.c
+++ b/libavcodec/h264pred.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/intreadwrite.h"
 #include "avcodec.h"
 #include "h264pred.h"
@@ -42,6 +43,14 @@
 #include "h264pred_template.c"
 #undef BIT_DEPTH
 
+#define BIT_DEPTH 12
+#include "h264pred_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 14
+#include "h264pred_template.c"
+#undef BIT_DEPTH
+
 static void pred4x4_vertical_vp8_c(uint8_t *src, const uint8_t *topright,
                                    ptrdiff_t stride)
 {
@@ -401,7 +410,7 @@ static void pred8x8_tm_vp8_c(uint8_t *src, ptrdiff_t stride)
  */
 av_cold void ff_h264_pred_init(H264PredContext *h, int codec_id,
                                const int bit_depth,
-                               const int chroma_format_idc)
+                               int chroma_format_idc)
 {
 #undef FUNC
 #undef FUNCC
@@ -552,6 +561,8 @@ av_cold void ff_h264_pred_init(H264PredContext *h, int codec_id,
     h->pred4x4_add  [ HOR_PRED   ]= FUNCC(pred4x4_horizontal_add          , depth);\
     h->pred8x8l_add [VERT_PRED   ]= FUNCC(pred8x8l_vertical_add           , depth);\
     h->pred8x8l_add [ HOR_PRED   ]= FUNCC(pred8x8l_horizontal_add         , depth);\
+    h->pred8x8l_filter_add [VERT_PRED   ]= FUNCC(pred8x8l_vertical_filter_add           , depth);\
+    h->pred8x8l_filter_add [ HOR_PRED   ]= FUNCC(pred8x8l_horizontal_filter_add         , depth);\
     if (chroma_format_idc <= 1) {\
     h->pred8x8_add  [VERT_PRED8x8]= FUNCC(pred8x8_vertical_add            , depth);\
     h->pred8x8_add  [ HOR_PRED8x8]= FUNCC(pred8x8_horizontal_add          , depth);\
@@ -569,7 +580,14 @@ av_cold void ff_h264_pred_init(H264PredContext *h, int codec_id,
         case 10:
             H264_PRED(10)
             break;
+        case 12:
+            H264_PRED(12)
+            break;
+        case 14:
+            H264_PRED(14)
+            break;
         default:
+            av_assert0(bit_depth<=8);
             H264_PRED(8)
             break;
     }
@@ -580,4 +598,6 @@ av_cold void ff_h264_pred_init(H264PredContext *h, int codec_id,
         ff_h264_pred_init_arm(h, codec_id, bit_depth, chroma_format_idc);
     if (ARCH_X86)
         ff_h264_pred_init_x86(h, codec_id, bit_depth, chroma_format_idc);
+    if (ARCH_MIPS)
+        ff_h264_pred_init_mips(h, codec_id, bit_depth, chroma_format_idc);
 }
diff --git a/libavcodec/h264pred.h b/libavcodec/h264pred.h
index 60e7434..2863dc9 100644
--- a/libavcodec/h264pred.h
+++ b/libavcodec/h264pred.h
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -101,6 +101,8 @@ typedef struct H264PredContext {
                           int16_t *block /*align 16*/, ptrdiff_t stride);
     void(*pred8x8l_add[2])(uint8_t *pix /*align  8*/,
                            int16_t *block /*align 16*/, ptrdiff_t stride);
+    void(*pred8x8l_filter_add[2])(uint8_t *pix /*align  8*/,
+                           int16_t *block /*align 16*/, int topleft, int topright, ptrdiff_t stride);
     void(*pred8x8_add[3])(uint8_t *pix /*align  8*/,
                           const int *block_offset,
                           int16_t *block /*align 16*/, ptrdiff_t stride);
@@ -118,5 +120,7 @@ void ff_h264_pred_init_arm(H264PredContext *h, int codec_id,
                            const int bit_depth, const int chroma_format_idc);
 void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
                            const int bit_depth, const int chroma_format_idc);
+void ff_h264_pred_init_mips(H264PredContext *h, int codec_id,
+                            const int bit_depth, const int chroma_format_idc);
 
 #endif /* AVCODEC_H264PRED_H */
diff --git a/libavcodec/h264pred_template.c b/libavcodec/h264pred_template.c
index 8492b2b..2b30fff 100644
--- a/libavcodec/h264pred_template.c
+++ b/libavcodec/h264pred_template.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,7 +35,7 @@ static void FUNCC(pred4x4_vertical)(uint8_t *_src, const uint8_t *topright,
                                     ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const pixel4 a= AV_RN4PA(src-stride);
 
     AV_WN4PA(src+0*stride, a);
@@ -48,7 +48,7 @@ static void FUNCC(pred4x4_horizontal)(uint8_t *_src, const uint8_t *topright,
                                       ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     AV_WN4PA(src+0*stride, PIXEL_SPLAT_X4(src[-1+0*stride]));
     AV_WN4PA(src+1*stride, PIXEL_SPLAT_X4(src[-1+1*stride]));
     AV_WN4PA(src+2*stride, PIXEL_SPLAT_X4(src[-1+2*stride]));
@@ -59,7 +59,7 @@ static void FUNCC(pred4x4_dc)(uint8_t *_src, const uint8_t *topright,
                               ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
                    + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
     const pixel4 a = PIXEL_SPLAT_X4(dc);
@@ -74,7 +74,7 @@ static void FUNCC(pred4x4_left_dc)(uint8_t *_src, const uint8_t *topright,
                                    ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
     const pixel4 a = PIXEL_SPLAT_X4(dc);
 
@@ -88,7 +88,7 @@ static void FUNCC(pred4x4_top_dc)(uint8_t *_src, const uint8_t *topright,
                                   ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
     const pixel4 a = PIXEL_SPLAT_X4(dc);
 
@@ -102,7 +102,7 @@ static void FUNCC(pred4x4_128_dc)(uint8_t *_src, const uint8_t *topright,
                                   ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const pixel4 a = PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1));
 
     AV_WN4PA(src+0*stride, a);
@@ -115,7 +115,7 @@ static void FUNCC(pred4x4_127_dc)(uint8_t *_src, const uint8_t *topright,
                                   ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const pixel4 a = PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))-1);
 
     AV_WN4PA(src+0*stride, a);
@@ -128,7 +128,7 @@ static void FUNCC(pred4x4_129_dc)(uint8_t *_src, const uint8_t *topright,
                                   ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const pixel4 a = PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))+1);
 
     AV_WN4PA(src+0*stride, a);
@@ -166,7 +166,7 @@ static void FUNCC(pred4x4_down_right)(uint8_t *_src, const uint8_t *topright,
                                       ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const int lt= src[-1-1*stride];
     LOAD_TOP_EDGE
     LOAD_LEFT_EDGE
@@ -194,7 +194,7 @@ static void FUNCC(pred4x4_down_left)(uint8_t *_src, const uint8_t *_topright,
 {
     pixel *src = (pixel*)_src;
     const pixel *topright = (const pixel*)_topright;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     LOAD_TOP_EDGE
     LOAD_TOP_RIGHT_EDGE
 //    LOAD_LEFT_EDGE
@@ -222,7 +222,7 @@ static void FUNCC(pred4x4_vertical_right)(uint8_t *_src,
                                           ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const int lt= src[-1-1*stride];
     LOAD_TOP_EDGE
     LOAD_LEFT_EDGE
@@ -251,7 +251,7 @@ static void FUNCC(pred4x4_vertical_left)(uint8_t *_src,
 {
     pixel *src = (pixel*)_src;
     const pixel *topright = (const pixel*)_topright;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     LOAD_TOP_EDGE
     LOAD_TOP_RIGHT_EDGE
 
@@ -277,7 +277,7 @@ static void FUNCC(pred4x4_horizontal_up)(uint8_t *_src, const uint8_t *topright,
                                          ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     LOAD_LEFT_EDGE
 
     src[0+0*stride]=(l0 + l1 + 1)>>1;
@@ -303,7 +303,7 @@ static void FUNCC(pred4x4_horizontal_down)(uint8_t *_src,
                                            ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const int lt= src[-1-1*stride];
     LOAD_TOP_EDGE
     LOAD_LEFT_EDGE
@@ -330,7 +330,7 @@ static void FUNCC(pred16x16_vertical)(uint8_t *_src, ptrdiff_t _stride)
 {
     int i;
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const pixel4 a = AV_RN4PA(((pixel4*)(src-stride))+0);
     const pixel4 b = AV_RN4PA(((pixel4*)(src-stride))+1);
     const pixel4 c = AV_RN4PA(((pixel4*)(src-stride))+2);
@@ -348,7 +348,7 @@ static void FUNCC(pred16x16_horizontal)(uint8_t *_src, ptrdiff_t stride)
 {
     int i;
     pixel *src = (pixel*)_src;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     for(i=0; i<16; i++){
         const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
@@ -374,7 +374,7 @@ static void FUNCC(pred16x16_dc)(uint8_t *_src, ptrdiff_t stride)
     int i, dc=0;
     pixel *src = (pixel*)_src;
     pixel4 dcsplat;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     for(i=0;i<16; i++){
         dc+= src[-1+i*stride];
@@ -393,7 +393,7 @@ static void FUNCC(pred16x16_left_dc)(uint8_t *_src, ptrdiff_t stride)
     int i, dc=0;
     pixel *src = (pixel*)_src;
     pixel4 dcsplat;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     for(i=0;i<16; i++){
         dc+= src[-1+i*stride];
@@ -408,7 +408,7 @@ static void FUNCC(pred16x16_top_dc)(uint8_t *_src, ptrdiff_t stride)
     int i, dc=0;
     pixel *src = (pixel*)_src;
     pixel4 dcsplat;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     for(i=0;i<16; i++){
         dc+= src[i-stride];
@@ -423,7 +423,7 @@ static void FUNCC(pred16x16_##n##_dc)(uint8_t *_src, ptrdiff_t stride)\
 {\
     int i;\
     pixel *src = (pixel*)_src;\
-    stride /= sizeof(pixel);\
+    stride >>= sizeof(pixel)-1;\
     PREDICT_16x16_DC(PIXEL_SPLAT_X4(v));\
 }
 
@@ -440,7 +440,7 @@ static inline void FUNCC(pred16x16_plane_compat)(uint8_t *_src,
   int a;
   INIT_CLIP
   pixel *src = (pixel*)_src;
-  int stride = _stride/sizeof(pixel);
+  int stride = _stride>>(sizeof(pixel)-1);
   const pixel * const src0 = src +7-stride;
   const pixel *       src1 = src +8*stride-1;
   const pixel *       src2 = src1-2*stride;    // == src+6*stride-1;
@@ -489,7 +489,7 @@ static void FUNCC(pred8x8_vertical)(uint8_t *_src, ptrdiff_t _stride)
 {
     int i;
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     const pixel4 a= AV_RN4PA(((pixel4*)(src-stride))+0);
     const pixel4 b= AV_RN4PA(((pixel4*)(src-stride))+1);
 
@@ -517,7 +517,7 @@ static void FUNCC(pred8x8_horizontal)(uint8_t *_src, ptrdiff_t stride)
 {
     int i;
     pixel *src = (pixel*)_src;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     for(i=0; i<8; i++){
         const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
@@ -544,7 +544,7 @@ static void FUNCC(pred8x8_##n##_dc)(uint8_t *_src, ptrdiff_t stride)\
     int i;\
     const pixel4 a = PIXEL_SPLAT_X4(v);\
     pixel *src = (pixel*)_src;\
-    stride /= sizeof(pixel);\
+    stride >>= sizeof(pixel)-1;\
     for(i=0; i<8; i++){\
         AV_WN4PA(((pixel4*)(src+i*stride))+0, a);\
         AV_WN4PA(((pixel4*)(src+i*stride))+1, a);\
@@ -567,7 +567,7 @@ static void FUNCC(pred8x8_left_dc)(uint8_t *_src, ptrdiff_t stride)
     int dc0, dc2;
     pixel4 dc0splat, dc2splat;
     pixel *src = (pixel*)_src;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     dc0=dc2=0;
     for(i=0;i<4; i++){
@@ -599,7 +599,7 @@ static void FUNCC(pred8x8_top_dc)(uint8_t *_src, ptrdiff_t stride)
     int dc0, dc1;
     pixel4 dc0splat, dc1splat;
     pixel *src = (pixel*)_src;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     dc0=dc1=0;
     for(i=0;i<4; i++){
@@ -647,7 +647,7 @@ static void FUNCC(pred8x8_dc)(uint8_t *_src, ptrdiff_t stride)
     int dc0, dc1, dc2;
     pixel4 dc0splat, dc1splat, dc2splat, dc3splat;
     pixel *src = (pixel*)_src;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
 
     dc0=dc1=dc2=0;
     for(i=0;i<4; i++){
@@ -713,6 +713,7 @@ static void FUNCC(pred8x16_dc)(uint8_t *_src, ptrdiff_t stride)
     }
 }
 
+//the following 4 functions should not be optimized!
 static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, ptrdiff_t stride)
 {
     FUNCC(pred8x8_top_dc)(src, stride);
@@ -771,7 +772,7 @@ static void FUNCC(pred8x8_plane)(uint8_t *_src, ptrdiff_t _stride)
   int a;
   INIT_CLIP
   pixel *src = (pixel*)_src;
-  int stride = _stride/sizeof(pixel);
+  int stride = _stride>>(sizeof(pixel)-1);
   const pixel * const src0 = src +3-stride;
   const pixel *       src1 = src +4*stride-1;
   const pixel *       src2 = src1-2*stride;    // == src+2*stride-1;
@@ -885,7 +886,7 @@ static void FUNCC(pred8x8l_128_dc)(uint8_t *_src, int has_topleft,
                                    int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
 
     PREDICT_8x8_DC(PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1)));
 }
@@ -893,7 +894,7 @@ static void FUNCC(pred8x8l_left_dc)(uint8_t *_src, int has_topleft,
                                     int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
 
     PREDICT_8x8_LOAD_LEFT;
     const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3);
@@ -903,7 +904,7 @@ static void FUNCC(pred8x8l_top_dc)(uint8_t *_src, int has_topleft,
                                    int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
 
     PREDICT_8x8_LOAD_TOP;
     const pixel4 dc = PIXEL_SPLAT_X4((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3);
@@ -913,7 +914,7 @@ static void FUNCC(pred8x8l_dc)(uint8_t *_src, int has_topleft,
                                int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
 
     PREDICT_8x8_LOAD_LEFT;
     PREDICT_8x8_LOAD_TOP;
@@ -925,7 +926,7 @@ static void FUNCC(pred8x8l_horizontal)(uint8_t *_src, int has_topleft,
                                        int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     pixel4 a;
 
     PREDICT_8x8_LOAD_LEFT;
@@ -940,7 +941,7 @@ static void FUNCC(pred8x8l_vertical)(uint8_t *_src, int has_topleft,
 {
     int y;
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     pixel4 a, b;
 
     PREDICT_8x8_LOAD_TOP;
@@ -963,7 +964,7 @@ static void FUNCC(pred8x8l_down_left)(uint8_t *_src, int has_topleft,
                                       int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     PREDICT_8x8_LOAD_TOP;
     PREDICT_8x8_LOAD_TOPRIGHT;
     SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
@@ -986,7 +987,7 @@ static void FUNCC(pred8x8l_down_right)(uint8_t *_src, int has_topleft,
                                        int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     PREDICT_8x8_LOAD_TOP;
     PREDICT_8x8_LOAD_LEFT;
     PREDICT_8x8_LOAD_TOPLEFT;
@@ -1010,7 +1011,7 @@ static void FUNCC(pred8x8l_vertical_right)(uint8_t *_src, int has_topleft,
                                            int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     PREDICT_8x8_LOAD_TOP;
     PREDICT_8x8_LOAD_LEFT;
     PREDICT_8x8_LOAD_TOPLEFT;
@@ -1041,7 +1042,7 @@ static void FUNCC(pred8x8l_horizontal_down)(uint8_t *_src, int has_topleft,
                                             int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     PREDICT_8x8_LOAD_TOP;
     PREDICT_8x8_LOAD_LEFT;
     PREDICT_8x8_LOAD_TOPLEFT;
@@ -1072,7 +1073,7 @@ static void FUNCC(pred8x8l_vertical_left)(uint8_t *_src, int has_topleft,
                                           int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     PREDICT_8x8_LOAD_TOP;
     PREDICT_8x8_LOAD_TOPRIGHT;
     SRC(0,0)= (t0 + t1 + 1) >> 1;
@@ -1102,7 +1103,7 @@ static void FUNCC(pred8x8l_horizontal_up)(uint8_t *_src, int has_topleft,
                                           int has_topright, ptrdiff_t _stride)
 {
     pixel *src = (pixel*)_src;
-    int stride = _stride/sizeof(pixel);
+    int stride = _stride>>(sizeof(pixel)-1);
     PREDICT_8x8_LOAD_LEFT;
     SRC(0,0)= (l0 + l1 + 1) >> 1;
     SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
@@ -1123,6 +1124,79 @@ static void FUNCC(pred8x8l_horizontal_up)(uint8_t *_src, int has_topleft,
     SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
     SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
 }
+
+static void FUNCC(pred8x8l_vertical_filter_add)(uint8_t *_src, int16_t *_block, int has_topleft,
+                                     int has_topright, ptrdiff_t _stride) /* has_* are not referenced directly below; presumably read by PREDICT_8x8_LOAD_TOP - TODO confirm */
+{   /* Seeds each of 8 columns with t0..t7, accumulates _block down the column into src, then zeroes _block. */
+    int i;
+    pixel *src = (pixel*)_src;
+    const dctcoef *block = (const dctcoef*)_block;
+    pixel pix[8]; /* per-column starting values */
+    int stride = _stride>>(sizeof(pixel)-1); /* shift by 0 or 1: halves _stride for 2-byte pixels (assumes sizeof(pixel) is 1 or 2) */
+    PREDICT_8x8_LOAD_TOP; /* macro defined outside this hunk; supplies t0..t7 - presumably the filtered top-edge samples, TODO confirm */
+
+    pix[0] = t0;
+    pix[1] = t1;
+    pix[2] = t2;
+    pix[3] = t3;
+    pix[4] = t4;
+    pix[5] = t5;
+    pix[6] = t6;
+    pix[7] = t7;
+
+    for(i=0; i<8; i++){ /* one iteration per column */
+        pixel v = pix[i]; /* running sum; being of type pixel, it is converted back to pixel width after every += */
+        src[0*stride]= v += block[0];
+        src[1*stride]= v += block[8]; /* block is indexed in steps of 8: one step per output row */
+        src[2*stride]= v += block[16];
+        src[3*stride]= v += block[24];
+        src[4*stride]= v += block[32];
+        src[5*stride]= v += block[40];
+        src[6*stride]= v += block[48];
+        src[7*stride]= v +  block[56]; /* last row: v is not read again, so no store back into v */
+        src++; /* next column of the destination */
+        block++; /* next column of the coefficients */
+    }
+
+    memset(_block, 0, sizeof(dctcoef) * 64); /* clear all 64 coefficients in the caller's buffer */
+}
+
+static void FUNCC(pred8x8l_horizontal_filter_add)(uint8_t *_src, int16_t *_block, int has_topleft,
+                               int has_topright, ptrdiff_t _stride) /* has_* are not referenced directly below; presumably read by PREDICT_8x8_LOAD_LEFT - TODO confirm */
+{   /* Seeds each of 8 rows with l0..l7, accumulates _block along the row into src, then zeroes _block. */
+    int i;
+    pixel *src = (pixel*)_src;
+    const dctcoef *block = (const dctcoef*)_block;
+    pixel pix[8]; /* per-row starting values */
+    int stride = _stride>>(sizeof(pixel)-1); /* shift by 0 or 1: halves _stride for 2-byte pixels (assumes sizeof(pixel) is 1 or 2) */
+    PREDICT_8x8_LOAD_LEFT; /* macro defined outside this hunk; supplies l0..l7 - presumably the filtered left-edge samples, TODO confirm */
+
+    pix[0] = l0;
+    pix[1] = l1;
+    pix[2] = l2;
+    pix[3] = l3;
+    pix[4] = l4;
+    pix[5] = l5;
+    pix[6] = l6;
+    pix[7] = l7;
+
+    for(i=0; i<8; i++){ /* one iteration per row */
+        pixel v = pix[i]; /* running sum; being of type pixel, it is converted back to pixel width after every += */
+        src[0]= v += block[0];
+        src[1]= v += block[1]; /* consecutive coefficients map to consecutive pixels of the same row */
+        src[2]= v += block[2];
+        src[3]= v += block[3];
+        src[4]= v += block[4];
+        src[5]= v += block[5];
+        src[6]= v += block[6];
+        src[7]= v +  block[7]; /* last pixel: v is not read again, so no store back into v */
+        src+= stride; /* next destination row */
+        block+= 8; /* next row of 8 coefficients */
+    }
+
+    memset(_block, 0, sizeof(dctcoef) * 64); /* clear all 64 coefficients in the caller's buffer */
+}
+
 #undef PREDICT_8x8_LOAD_LEFT
 #undef PREDICT_8x8_LOAD_TOP
 #undef PREDICT_8x8_LOAD_TOPLEFT
@@ -1139,7 +1213,7 @@ static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, int16_t *_block,
     int i;
     pixel *pix = (pixel*)_pix;
     const dctcoef *block = (const dctcoef*)_block;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
     pix -= stride;
     for(i=0; i<4; i++){
         pixel v = pix[0];
@@ -1160,7 +1234,7 @@ static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, int16_t *_block,
     int i;
     pixel *pix = (pixel*)_pix;
     const dctcoef *block = (const dctcoef*)_block;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
     for(i=0; i<4; i++){
         pixel v = pix[-1];
         pix[0]= v += block[0];
@@ -1180,7 +1254,7 @@ static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, int16_t *_block,
     int i;
     pixel *pix = (pixel*)_pix;
     const dctcoef *block = (const dctcoef*)_block;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
     pix -= stride;
     for(i=0; i<8; i++){
         pixel v = pix[0];
@@ -1205,7 +1279,7 @@ static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, int16_t *_block,
     int i;
     pixel *pix = (pixel*)_pix;
     const dctcoef *block = (const dctcoef*)_block;
-    stride /= sizeof(pixel);
+    stride >>= sizeof(pixel)-1;
     for(i=0; i<8; i++){
         pixel v = pix[-1];
         pix[0]= v += block[0];
diff --git a/libavcodec/h264qpel.c b/libavcodec/h264qpel.c
index ec46da2..50e82e2 100644
--- a/libavcodec/h264qpel.c
+++ b/libavcodec/h264qpel.c
@@ -2,26 +2,27 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
 #include "h264qpel.h"
 
+#define pixeltmp int16_t
 #define BIT_DEPTH 8
 #include "h264qpel_template.c"
 #undef BIT_DEPTH
@@ -33,6 +34,17 @@
 #define BIT_DEPTH 10
 #include "h264qpel_template.c"
 #undef BIT_DEPTH
+#undef pixeltmp
+
+#define pixeltmp int32_t
+#define BIT_DEPTH 12
+#include "h264qpel_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 14
+#include "h264qpel_template.c"
+#undef BIT_DEPTH
+
 
 av_cold void ff_h264qpel_init(H264QpelContext *c, int bit_depth)
 {
@@ -76,6 +88,12 @@ av_cold void ff_h264qpel_init(H264QpelContext *c, int bit_depth)
     case 10:
         SET_QPEL(10);
         break;
+    case 12:
+        SET_QPEL(12);
+        break;
+    case 14:
+        SET_QPEL(14);
+        break;
     }
 
     if (ARCH_AARCH64)
@@ -86,4 +104,6 @@ av_cold void ff_h264qpel_init(H264QpelContext *c, int bit_depth)
         ff_h264qpel_init_ppc(c, bit_depth);
     if (ARCH_X86)
         ff_h264qpel_init_x86(c, bit_depth);
+    if (ARCH_MIPS)
+        ff_h264qpel_init_mips(c, bit_depth);
 }
diff --git a/libavcodec/h264qpel.h b/libavcodec/h264qpel.h
index 97ce195..7c57ad0 100644
--- a/libavcodec/h264qpel.h
+++ b/libavcodec/h264qpel.h
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,5 +35,6 @@ void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth);
 void ff_h264qpel_init_arm(H264QpelContext *c, int bit_depth);
 void ff_h264qpel_init_ppc(H264QpelContext *c, int bit_depth);
 void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth);
+void ff_h264qpel_init_mips(H264QpelContext *c, int bit_depth);
 
 #endif /* AVCODEC_H264QPEL_H */
diff --git a/libavcodec/h264qpel_template.c b/libavcodec/h264qpel_template.c
index e846ac9..27c5b8f 100644
--- a/libavcodec/h264qpel_template.c
+++ b/libavcodec/h264qpel_template.c
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -75,14 +75,14 @@ static inline void FUNC(copy_block16)(uint8_t *dst, const uint8_t *src, int dstS
 }
 
 #define H264_LOWPASS(OPNAME, OP, OP2) \
-static av_unused void FUNC(OPNAME ## h264_qpel2_h_lowpass)(uint8_t *_dst, const uint8_t *_src, int dstStride, int srcStride){\
+static av_unused void FUNC(OPNAME ## h264_qpel2_h_lowpass)(uint8_t *p_dst, const uint8_t *p_src, int dstStride, int srcStride){\
     const int h=2;\
     INIT_CLIP\
     int i;\
-    pixel *dst = (pixel*)_dst;\
-    const pixel *src = (const pixel*)_src;\
-    dstStride /= sizeof(pixel);\
-    srcStride /= sizeof(pixel);\
+    pixel *dst = (pixel*)p_dst;\
+    const pixel *src = (const pixel*)p_src;\
+    dstStride >>= sizeof(pixel)-1;\
+    srcStride >>= sizeof(pixel)-1;\
     for(i=0; i<h; i++)\
     {\
         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
@@ -98,8 +98,8 @@ static av_unused void FUNC(OPNAME ## h264_qpel2_v_lowpass)(uint8_t *_dst, const
     int i;\
     pixel *dst = (pixel*)_dst;\
     const pixel *src = (const pixel*)_src;\
-    dstStride /= sizeof(pixel);\
-    srcStride /= sizeof(pixel);\
+    dstStride >>= sizeof(pixel)-1;\
+    srcStride >>= sizeof(pixel)-1;\
     for(i=0; i<w; i++)\
     {\
         const int srcB= src[-2*srcStride];\
@@ -116,16 +116,16 @@ static av_unused void FUNC(OPNAME ## h264_qpel2_v_lowpass)(uint8_t *_dst, const
     }\
 }\
 \
-static av_unused void FUNC(OPNAME ## h264_qpel2_hv_lowpass)(uint8_t *_dst, int16_t *tmp, const uint8_t *_src, int dstStride, int tmpStride, int srcStride){\
+static av_unused void FUNC(OPNAME ## h264_qpel2_hv_lowpass)(uint8_t *_dst, pixeltmp *tmp, const uint8_t *_src, int dstStride, int tmpStride, int srcStride){\
     const int h=2;\
     const int w=2;\
-    const int pad = (BIT_DEPTH > 9) ? (-10 * ((1<<BIT_DEPTH)-1)) : 0;\
+    const int pad = (BIT_DEPTH == 10) ? (-10 * ((1<<BIT_DEPTH)-1)) : 0;\
     INIT_CLIP\
     int i;\
     pixel *dst = (pixel*)_dst;\
     const pixel *src = (const pixel*)_src;\
-    dstStride /= sizeof(pixel);\
-    srcStride /= sizeof(pixel);\
+    dstStride >>= sizeof(pixel)-1;\
+    srcStride >>= sizeof(pixel)-1;\
     src -= 2*srcStride;\
     for(i=0; i<h+5; i++)\
     {\
@@ -156,8 +156,8 @@ static void FUNC(OPNAME ## h264_qpel4_h_lowpass)(uint8_t *_dst, const uint8_t *_
     int i;\
     pixel *dst = (pixel*)_dst;\
     const pixel *src = (const pixel*)_src;\
-    dstStride /= sizeof(pixel);\
-    srcStride /= sizeof(pixel);\
+    dstStride >>= sizeof(pixel)-1;\
+    srcStride >>= sizeof(pixel)-1;\
     for(i=0; i<h; i++)\
     {\
         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
@@ -175,8 +175,8 @@ static void FUNC(OPNAME ## h264_qpel4_v_lowpass)(uint8_t *_dst, const uint8_t *_
     int i;\
     pixel *dst = (pixel*)_dst;\
     const pixel *src = (const pixel*)_src;\
-    dstStride /= sizeof(pixel);\
-    srcStride /= sizeof(pixel);\
+    dstStride >>= sizeof(pixel)-1;\
+    srcStride >>= sizeof(pixel)-1;\
     for(i=0; i<w; i++)\
     {\
         const int srcB= src[-2*srcStride];\
@@ -197,16 +197,16 @@ static void FUNC(OPNAME ## h264_qpel4_v_lowpass)(uint8_t *_dst, const uint8_t *_
     }\
 }\
 \
-static void FUNC(OPNAME ## h264_qpel4_hv_lowpass)(uint8_t *_dst, int16_t *tmp, const uint8_t *_src, int dstStride, int tmpStride, int srcStride){\
+static void FUNC(OPNAME ## h264_qpel4_hv_lowpass)(uint8_t *_dst, pixeltmp *tmp, const uint8_t *_src, int dstStride, int tmpStride, int srcStride){\
     const int h=4;\
     const int w=4;\
-    const int pad = (BIT_DEPTH > 9) ? (-10 * ((1<<BIT_DEPTH)-1)) : 0;\
+    const int pad = (BIT_DEPTH == 10) ? (-10 * ((1<<BIT_DEPTH)-1)) : 0;\
     INIT_CLIP\
     int i;\
     pixel *dst = (pixel*)_dst;\
     const pixel *src = (const pixel*)_src;\
-    dstStride /= sizeof(pixel);\
-    srcStride /= sizeof(pixel);\
+    dstStride >>= sizeof(pixel)-1;\
+    srcStride >>= sizeof(pixel)-1;\
     src -= 2*srcStride;\
     for(i=0; i<h+5; i++)\
     {\
@@ -244,8 +244,8 @@ static void FUNC(OPNAME ## h264_qpel8_h_lowpass)(uint8_t *_dst, const uint8_t *_
     int i;\
     pixel *dst = (pixel*)_dst;\
     const pixel *src = (const pixel*)_src;\
-    dstStride /= sizeof(pixel);\
-    srcStride /= sizeof(pixel);\
+    dstStride >>= sizeof(pixel)-1;\
+    srcStride >>= sizeof(pixel)-1;\
     for(i=0; i<h; i++)\
     {\
         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
@@ -267,8 +267,8 @@ static void FUNC(OPNAME ## h264_qpel8_v_lowpass)(uint8_t *_dst, const uint8_t *_
     int i;\
     pixel *dst = (pixel*)_dst;\
     const pixel *src = (const pixel*)_src;\
-    dstStride /= sizeof(pixel);\
-    srcStride /= sizeof(pixel);\
+    dstStride >>= sizeof(pixel)-1;\
+    srcStride >>= sizeof(pixel)-1;\
     for(i=0; i<w; i++)\
     {\
         const int srcB= src[-2*srcStride];\
@@ -297,16 +297,16 @@ static void FUNC(OPNAME ## h264_qpel8_v_lowpass)(uint8_t *_dst, const uint8_t *_
     }\
 }\
 \
-static void FUNC(OPNAME ## h264_qpel8_hv_lowpass)(uint8_t *_dst, int16_t *tmp, const uint8_t *_src, int dstStride, int tmpStride, int srcStride){\
+static void FUNC(OPNAME ## h264_qpel8_hv_lowpass)(uint8_t *_dst, pixeltmp *tmp, const uint8_t *_src, int dstStride, int tmpStride, int srcStride){\
     const int h=8;\
     const int w=8;\
-    const int pad = (BIT_DEPTH > 9) ? (-10 * ((1<<BIT_DEPTH)-1)) : 0;\
+    const int pad = (BIT_DEPTH == 10) ? (-10 * ((1<<BIT_DEPTH)-1)) : 0;\
     INIT_CLIP\
     int i;\
     pixel *dst = (pixel*)_dst;\
     const pixel *src = (const pixel*)_src;\
-    dstStride /= sizeof(pixel);\
-    srcStride /= sizeof(pixel);\
+    dstStride >>= sizeof(pixel)-1;\
+    srcStride >>= sizeof(pixel)-1;\
     src -= 2*srcStride;\
     for(i=0; i<h+5; i++)\
     {\
@@ -368,7 +368,7 @@ static void FUNC(OPNAME ## h264_qpel16_h_lowpass)(uint8_t *dst, const uint8_t *s
     FUNC(OPNAME ## h264_qpel8_h_lowpass)(dst+8*sizeof(pixel), src+8*sizeof(pixel), dstStride, srcStride);\
 }\
 \
-static void FUNC(OPNAME ## h264_qpel16_hv_lowpass)(uint8_t *dst, int16_t *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+static void FUNC(OPNAME ## h264_qpel16_hv_lowpass)(uint8_t *dst, pixeltmp *tmp, const uint8_t *src, int dstStride, int tmpStride, int srcStride){\
     FUNC(OPNAME ## h264_qpel8_hv_lowpass)(dst                , tmp  , src                , dstStride, tmpStride, srcStride);\
     FUNC(OPNAME ## h264_qpel8_hv_lowpass)(dst+8*sizeof(pixel), tmp+8, src+8*sizeof(pixel), dstStride, tmpStride, srcStride);\
     src += 8*srcStride;\
@@ -480,13 +480,13 @@ static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc33)(uint8_t *dst, const uint
 \
 static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc22)(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    int16_t tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
+    pixeltmp tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
     FUNC(OPNAME ## h264_qpel ## SIZE ## _hv_lowpass)(dst, tmp, src, stride, SIZE*sizeof(pixel), stride);\
 }\
 \
 static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc21)(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    int16_t tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
+    pixeltmp tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
     uint8_t halfH[SIZE*SIZE*sizeof(pixel)];\
     uint8_t halfHV[SIZE*SIZE*sizeof(pixel)];\
     FUNC(put_h264_qpel ## SIZE ## _h_lowpass)(halfH, src, SIZE*sizeof(pixel), stride);\
@@ -496,7 +496,7 @@ static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc21)(uint8_t *dst, const uint
 \
 static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc23)(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    int16_t tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
+    pixeltmp tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
     uint8_t halfH[SIZE*SIZE*sizeof(pixel)];\
     uint8_t halfHV[SIZE*SIZE*sizeof(pixel)];\
     FUNC(put_h264_qpel ## SIZE ## _h_lowpass)(halfH, src + stride, SIZE*sizeof(pixel), stride);\
@@ -508,7 +508,7 @@ static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc12)(uint8_t *dst, const uint
 {\
     uint8_t full[SIZE*(SIZE+5)*sizeof(pixel)];\
     uint8_t * const full_mid= full + SIZE*2*sizeof(pixel);\
-    int16_t tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
+    pixeltmp tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
     uint8_t halfV[SIZE*SIZE*sizeof(pixel)];\
     uint8_t halfHV[SIZE*SIZE*sizeof(pixel)];\
     FUNC(copy_block ## SIZE )(full, src - stride*2, SIZE*sizeof(pixel),  stride, SIZE + 5);\
@@ -521,7 +521,7 @@ static void FUNCC(OPNAME ## h264_qpel ## SIZE ## _mc32)(uint8_t *dst, const uint
 {\
     uint8_t full[SIZE*(SIZE+5)*sizeof(pixel)];\
     uint8_t * const full_mid= full + SIZE*2*sizeof(pixel);\
-    int16_t tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
+    pixeltmp tmp[SIZE*(SIZE+5)*sizeof(pixel)];\
     uint8_t halfV[SIZE*SIZE*sizeof(pixel)];\
     uint8_t halfHV[SIZE*SIZE*sizeof(pixel)];\
     FUNC(copy_block ## SIZE )(full, src - stride*2 + sizeof(pixel), SIZE*sizeof(pixel),  stride, SIZE + 5);\
diff --git a/libavcodec/hap.c b/libavcodec/hap.c
index 770142c..5b3af5e 100644
--- a/libavcodec/hap.c
+++ b/libavcodec/hap.c
@@ -2,20 +2,20 @@
  * Vidvox Hap utility functions
  * Copyright (C) 2015 Tom Butterworth <bangnoise@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hap.h b/libavcodec/hap.h
index 9d847f7..e4762ee 100644
--- a/libavcodec/hap.h
+++ b/libavcodec/hap.h
@@ -3,20 +3,20 @@
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  * Copyright (C) 2015 Tom Butterworth <bangnoise@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hapdec.c b/libavcodec/hapdec.c
index 8f5365b..5a399dc 100644
--- a/libavcodec/hapdec.c
+++ b/libavcodec/hapdec.c
@@ -3,20 +3,20 @@
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  * Copyright (C) 2015 Tom Butterworth <bangnoise@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -61,7 +61,7 @@ static int parse_section_header(GetByteContext *gbc, int *section_size,
         *section_size = bytestream2_get_le32(gbc);
     }
 
-    if (*section_size > bytestream2_get_bytes_left(gbc))
+    if (*section_size > bytestream2_get_bytes_left(gbc) || *section_size < 0)
         return AVERROR_INVALIDDATA;
     else
         return 0;
@@ -309,6 +309,7 @@ static int hap_decode(AVCodecContext *avctx, void *data,
     HapContext *ctx = avctx->priv_data;
     ThreadFrame tframe;
     int ret, i;
+    int tex_size;
 
     bytestream2_init(&ctx->gbc, avpkt->data, avpkt->size);
 
@@ -322,12 +323,14 @@ static int hap_decode(AVCodecContext *avctx, void *data,
     ret = ff_thread_get_buffer(avctx, &tframe, 0);
     if (ret < 0)
         return ret;
-    ff_thread_finish_setup(avctx);
+    if (avctx->codec->update_thread_context)
+        ff_thread_finish_setup(avctx);
 
     /* Unpack the DXT texture */
     if (hap_can_use_tex_in_place(ctx)) {
         /* Only DXTC texture compression in a contiguous block */
         ctx->tex_data = ctx->gbc.buffer;
+        tex_size = bytestream2_get_bytes_left(&ctx->gbc);
     } else {
         /* Perform the second-stage decompression */
         ret = av_reallocp(&ctx->tex_buf, ctx->tex_size);
@@ -343,6 +346,14 @@ static int hap_decode(AVCodecContext *avctx, void *data,
         }
 
         ctx->tex_data = ctx->tex_buf;
+        tex_size = ctx->tex_size;
+    }
+
+    if (tex_size < (avctx->coded_width  / TEXTURE_BLOCK_W)
+                  *(avctx->coded_height / TEXTURE_BLOCK_H)
+                  *ctx->tex_rat) {
+        av_log(avctx, AV_LOG_ERROR, "Insufficient data\n");
+        return AVERROR_INVALIDDATA;
     }
 
     /* Use the decompress function on the texture, one block per thread */
@@ -372,9 +383,6 @@ static av_cold int hap_init(AVCodecContext *avctx)
     avctx->coded_width  = FFALIGN(avctx->width,  TEXTURE_BLOCK_W);
     avctx->coded_height = FFALIGN(avctx->height, TEXTURE_BLOCK_H);
 
-    /* Technically only one mode has alpha, but 32 bits are easier to handle */
-    avctx->pix_fmt = AV_PIX_FMT_RGBA;
-
     ff_texturedsp_init(&ctx->dxtc);
 
     switch (avctx->codec_tag) {
@@ -382,16 +390,19 @@ static av_cold int hap_init(AVCodecContext *avctx)
         texture_name = "DXT1";
         ctx->tex_rat = 8;
         ctx->tex_fun = ctx->dxtc.dxt1_block;
+        avctx->pix_fmt = AV_PIX_FMT_RGB0;
         break;
     case MKTAG('H','a','p','5'):
         texture_name = "DXT5";
         ctx->tex_rat = 16;
         ctx->tex_fun = ctx->dxtc.dxt5_block;
+        avctx->pix_fmt = AV_PIX_FMT_RGBA;
         break;
     case MKTAG('H','a','p','Y'):
         texture_name = "DXT5-YCoCg-scaled";
         ctx->tex_rat = 16;
         ctx->tex_fun = ctx->dxtc.dxt5ys_block;
+        avctx->pix_fmt = AV_PIX_FMT_RGB0;
         break;
     default:
         return AVERROR_DECODER_NOT_FOUND;
diff --git a/libavcodec/hapenc.c b/libavcodec/hapenc.c
index 9ebad4a..cb5dcfa 100644
--- a/libavcodec/hapenc.c
+++ b/libavcodec/hapenc.c
@@ -3,20 +3,20 @@
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  * Copyright (C) 2015 Tom Butterworth <bangnoise@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -196,7 +196,7 @@ static int hap_encode(AVCodecContext *avctx, AVPacket *pkt,
     int pktsize = FFMAX(ctx->tex_size, ctx->max_snappy * ctx->chunk_count) + header_length;
 
     /* Allocate maximum size packet, shrink later. */
-    ret = ff_alloc_packet(pkt, pktsize);
+    ret = ff_alloc_packet2(avctx, pkt, pktsize, header_length);
     if (ret < 0)
         return ret;
 
diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
index 3bc730a..b478065 100644
--- a/libavcodec/hevc.c
+++ b/libavcodec/hevc.c
@@ -1,32 +1,34 @@
 /*
- * HEVC video decoder
+ * HEVC video Decoder
  *
  * Copyright (C) 2012 - 2013 Guillaume Martres
  * Copyright (C) 2012 - 2013 Mickael Raulet
  * Copyright (C) 2012 - 2013 Gildas Cocherel
  * Copyright (C) 2012 - 2013 Wassim Hamidouche
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "libavutil/atomic.h"
 #include "libavutil/attributes.h"
 #include "libavutil/common.h"
 #include "libavutil/display.h"
 #include "libavutil/internal.h"
+#include "libavutil/mastering_display_metadata.h"
 #include "libavutil/md5.h"
 #include "libavutil/opt.h"
 #include "libavutil/pixdesc.h"
@@ -39,67 +41,7 @@
 #include "hevc.h"
 #include "profiles.h"
 
-const uint8_t ff_hevc_qpel_extra_before[4] = { 0, 3, 3, 3 };
-const uint8_t ff_hevc_qpel_extra_after[4]  = { 0, 4, 4, 4 };
-const uint8_t ff_hevc_qpel_extra[4]        = { 0, 7, 7, 7 };
-
-static const uint8_t scan_1x1[1] = { 0 };
-
-static const uint8_t horiz_scan2x2_x[4] = { 0, 1, 0, 1 };
-
-static const uint8_t horiz_scan2x2_y[4] = { 0, 0, 1, 1 };
-
-static const uint8_t horiz_scan4x4_x[16] = {
-    0, 1, 2, 3,
-    0, 1, 2, 3,
-    0, 1, 2, 3,
-    0, 1, 2, 3,
-};
-
-static const uint8_t horiz_scan4x4_y[16] = {
-    0, 0, 0, 0,
-    1, 1, 1, 1,
-    2, 2, 2, 2,
-    3, 3, 3, 3,
-};
-
-static const uint8_t horiz_scan8x8_inv[8][8] = {
-    {  0,  1,  2,  3, 16, 17, 18, 19, },
-    {  4,  5,  6,  7, 20, 21, 22, 23, },
-    {  8,  9, 10, 11, 24, 25, 26, 27, },
-    { 12, 13, 14, 15, 28, 29, 30, 31, },
-    { 32, 33, 34, 35, 48, 49, 50, 51, },
-    { 36, 37, 38, 39, 52, 53, 54, 55, },
-    { 40, 41, 42, 43, 56, 57, 58, 59, },
-    { 44, 45, 46, 47, 60, 61, 62, 63, },
-};
-
-static const uint8_t diag_scan2x2_x[4] = { 0, 0, 1, 1 };
-
-static const uint8_t diag_scan2x2_y[4] = { 0, 1, 0, 1 };
-
-static const uint8_t diag_scan2x2_inv[2][2] = {
-    { 0, 2, },
-    { 1, 3, },
-};
-
-static const uint8_t diag_scan4x4_inv[4][4] = {
-    { 0,  2,  5,  9, },
-    { 1,  4,  8, 12, },
-    { 3,  7, 11, 14, },
-    { 6, 10, 13, 15, },
-};
-
-static const uint8_t diag_scan8x8_inv[8][8] = {
-    {  0,  2,  5,  9, 14, 20, 27, 35, },
-    {  1,  4,  8, 13, 19, 26, 34, 42, },
-    {  3,  7, 12, 18, 25, 33, 41, 48, },
-    {  6, 11, 17, 24, 32, 40, 47, 53, },
-    { 10, 16, 23, 31, 39, 46, 52, 57, },
-    { 15, 22, 30, 38, 45, 51, 56, 60, },
-    { 21, 29, 37, 44, 50, 55, 59, 62, },
-    { 28, 36, 43, 49, 54, 58, 61, 63, },
-};
+const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
 
 /**
  * NOTE: Each function hls_foo correspond to the function foo in the
@@ -130,6 +72,10 @@ static void pic_arrays_free(HEVCContext *s)
     av_freep(&s->horizontal_bs);
     av_freep(&s->vertical_bs);
 
+    av_freep(&s->sh.entry_point_offset);
+    av_freep(&s->sh.size);
+    av_freep(&s->sh.offset);
+
     av_buffer_pool_uninit(&s->tab_mvf_pool);
     av_buffer_pool_uninit(&s->rpl_tab_pool);
 }
@@ -145,40 +91,40 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
     int ctb_count        = sps->ctb_width * sps->ctb_height;
     int min_pu_size      = sps->min_pu_width * sps->min_pu_height;
 
-    s->bs_width  = width  >> 3;
-    s->bs_height = height >> 3;
+    s->bs_width  = (width  >> 2) + 1;
+    s->bs_height = (height >> 2) + 1;
 
     s->sao           = av_mallocz_array(ctb_count, sizeof(*s->sao));
     s->deblock       = av_mallocz_array(ctb_count, sizeof(*s->deblock));
     if (!s->sao || !s->deblock)
         goto fail;
 
-    s->skip_flag    = av_malloc(pic_size_in_ctb);
-    s->tab_ct_depth = av_malloc(sps->min_cb_height * sps->min_cb_width);
+    s->skip_flag    = av_malloc_array(sps->min_cb_height, sps->min_cb_width);
+    s->tab_ct_depth = av_malloc_array(sps->min_cb_height, sps->min_cb_width);
     if (!s->skip_flag || !s->tab_ct_depth)
         goto fail;
 
-    s->cbf_luma = av_malloc(sps->min_tb_width * sps->min_tb_height);
+    s->cbf_luma = av_malloc_array(sps->min_tb_width, sps->min_tb_height);
     s->tab_ipm  = av_mallocz(min_pu_size);
-    s->is_pcm   = av_malloc(min_pu_size);
+    s->is_pcm   = av_malloc_array(sps->min_pu_width + 1, sps->min_pu_height + 1);
     if (!s->tab_ipm || !s->cbf_luma || !s->is_pcm)
         goto fail;
 
-    s->filter_slice_edges = av_malloc(ctb_count);
-    s->tab_slice_address  = av_malloc(pic_size_in_ctb *
+    s->filter_slice_edges = av_mallocz(ctb_count);
+    s->tab_slice_address  = av_malloc_array(pic_size_in_ctb,
                                       sizeof(*s->tab_slice_address));
-    s->qp_y_tab           = av_malloc(pic_size_in_ctb *
+    s->qp_y_tab           = av_malloc_array(pic_size_in_ctb,
                                       sizeof(*s->qp_y_tab));
     if (!s->qp_y_tab || !s->filter_slice_edges || !s->tab_slice_address)
         goto fail;
 
-    s->horizontal_bs = av_mallocz(2 * s->bs_width * (s->bs_height + 1));
-    s->vertical_bs   = av_mallocz(2 * s->bs_width * (s->bs_height + 1));
+    s->horizontal_bs = av_mallocz_array(s->bs_width, s->bs_height);
+    s->vertical_bs   = av_mallocz_array(s->bs_width, s->bs_height);
     if (!s->horizontal_bs || !s->vertical_bs)
         goto fail;
 
     s->tab_mvf_pool = av_buffer_pool_init(min_pu_size * sizeof(MvField),
-                                          av_buffer_alloc);
+                                          av_buffer_allocz);
     s->rpl_tab_pool = av_buffer_pool_init(ctb_count * sizeof(RefPicListTab),
                                           av_buffer_allocz);
     if (!s->tab_mvf_pool || !s->rpl_tab_pool)
@@ -199,11 +145,15 @@ static void pred_weight_table(HEVCContext *s, GetBitContext *gb)
     uint8_t chroma_weight_l0_flag[16];
     uint8_t luma_weight_l1_flag[16];
     uint8_t chroma_weight_l1_flag[16];
+    int luma_log2_weight_denom;
 
-    s->sh.luma_log2_weight_denom = av_clip(get_ue_golomb_long(gb), 0, 7);
+    luma_log2_weight_denom = get_ue_golomb_long(gb);
+    if (luma_log2_weight_denom < 0 || luma_log2_weight_denom > 7)
+        av_log(s->avctx, AV_LOG_ERROR, "luma_log2_weight_denom %d is invalid\n", luma_log2_weight_denom);
+    s->sh.luma_log2_weight_denom = av_clip_uintp2(luma_log2_weight_denom, 3);
     if (s->ps.sps->chroma_format_idc != 0) {
         int delta = get_se_golomb(gb);
-        s->sh.chroma_log2_weight_denom = av_clip(s->sh.luma_log2_weight_denom + delta, 0, 7);
+        s->sh.chroma_log2_weight_denom = av_clip_uintp2(s->sh.luma_log2_weight_denom + delta, 3);
     }
 
     for (i = 0; i < s->sh.nb_refs[L0]; i++) {
@@ -213,7 +163,7 @@ static void pred_weight_table(HEVCContext *s, GetBitContext *gb)
             s->sh.luma_offset_l0[i] = 0;
         }
     }
-    if (s->ps.sps->chroma_format_idc != 0) { // FIXME: invert "if" and "for"
+    if (s->ps.sps->chroma_format_idc != 0) {
         for (i = 0; i < s->sh.nb_refs[L0]; i++)
             chroma_weight_l0_flag[i] = get_bits1(gb);
     } else {
@@ -296,7 +246,7 @@ static int decode_lt_rps(HEVCContext *s, LongTermRPS *rps, GetBitContext *gb)
         nb_sps = get_ue_golomb_long(gb);
     nb_sh = get_ue_golomb_long(gb);
 
-    if (nb_sh + nb_sps > FF_ARRAY_ELEMS(rps->poc))
+    if (nb_sh + (uint64_t)nb_sps > FF_ARRAY_ELEMS(rps->poc))
         return AVERROR_INVALIDDATA;
 
     rps->nb_refs = nb_sh + nb_sps;
@@ -378,11 +328,11 @@ static void export_stream_params(AVCodecContext *avctx, const HEVCParamSets *ps,
                   num, den, 1 << 30);
 }
 
-static int set_sps(HEVCContext *s, const HEVCSPS *sps)
+static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fmt)
 {
-    #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + CONFIG_HEVC_D3D11VA_HWACCEL + CONFIG_HEVC_VDPAU_HWACCEL)
+    #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + CONFIG_HEVC_D3D11VA_HWACCEL + CONFIG_HEVC_VAAPI_HWACCEL + CONFIG_HEVC_VDPAU_HWACCEL)
     enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts;
-    int ret;
+    int ret, i;
 
     pic_arrays_free(s);
     s->ps.sps = NULL;
@@ -397,36 +347,68 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps)
 
     export_stream_params(s->avctx, &s->ps, sps);
 
-    if (sps->pix_fmt == AV_PIX_FMT_YUV420P || sps->pix_fmt == AV_PIX_FMT_YUVJ420P) {
+    switch (sps->pix_fmt) {
+    case AV_PIX_FMT_YUV420P:
+    case AV_PIX_FMT_YUVJ420P:
 #if CONFIG_HEVC_DXVA2_HWACCEL
         *fmt++ = AV_PIX_FMT_DXVA2_VLD;
 #endif
 #if CONFIG_HEVC_D3D11VA_HWACCEL
         *fmt++ = AV_PIX_FMT_D3D11VA_VLD;
 #endif
+#if CONFIG_HEVC_VAAPI_HWACCEL
+        *fmt++ = AV_PIX_FMT_VAAPI;
+#endif
 #if CONFIG_HEVC_VDPAU_HWACCEL
         *fmt++ = AV_PIX_FMT_VDPAU;
 #endif
+        break;
+    case AV_PIX_FMT_YUV420P10:
+#if CONFIG_HEVC_DXVA2_HWACCEL
+        *fmt++ = AV_PIX_FMT_DXVA2_VLD;
+#endif
+#if CONFIG_HEVC_D3D11VA_HWACCEL
+        *fmt++ = AV_PIX_FMT_D3D11VA_VLD;
+#endif
+        break;
     }
 
-    *fmt++ = sps->pix_fmt;
-    *fmt = AV_PIX_FMT_NONE;
+    if (pix_fmt == AV_PIX_FMT_NONE) {
+        *fmt++ = sps->pix_fmt;
+        *fmt = AV_PIX_FMT_NONE;
 
-    ret = ff_get_format(s->avctx, pix_fmts);
-    if (ret < 0)
-        goto fail;
-    s->avctx->pix_fmt = ret;
+        ret = ff_thread_get_format(s->avctx, pix_fmts);
+        if (ret < 0)
+            goto fail;
+        s->avctx->pix_fmt = ret;
+    }
+    else {
+        s->avctx->pix_fmt = pix_fmt;
+    }
 
     ff_hevc_pred_init(&s->hpc,     sps->bit_depth);
     ff_hevc_dsp_init (&s->hevcdsp, sps->bit_depth);
     ff_videodsp_init (&s->vdsp,    sps->bit_depth);
 
+    for (i = 0; i < 3; i++) {
+        av_freep(&s->sao_pixel_buffer_h[i]);
+        av_freep(&s->sao_pixel_buffer_v[i]);
+    }
+
     if (sps->sao_enabled && !s->avctx->hwaccel) {
-        av_frame_unref(s->tmp_frame);
-        ret = ff_get_buffer(s->avctx, s->tmp_frame, AV_GET_BUFFER_FLAG_REF);
-        if (ret < 0)
-            goto fail;
-        s->frame = s->tmp_frame;
+        int c_count = (sps->chroma_format_idc != 0) ? 3 : 1;
+        int c_idx;
+
+        for(c_idx = 0; c_idx < c_count; c_idx++) {
+            int w = sps->width >> sps->hshift[c_idx];
+            int h = sps->height >> sps->vshift[c_idx];
+            s->sao_pixel_buffer_h[c_idx] =
+                av_malloc((w * 2 * sps->ctb_height) <<
+                          sps->pixel_shift);
+            s->sao_pixel_buffer_v[c_idx] =
+                av_malloc((h * 2 * sps->ctb_width) <<
+                          sps->pixel_shift);
+        }
     }
 
     s->ps.sps = sps;
@@ -442,7 +424,7 @@ fail:
 
 static int hls_slice_header(HEVCContext *s)
 {
-    GetBitContext *gb = &s->HEVClc.gb;
+    GetBitContext *gb = &s->HEVClc->gb;
     SliceHeader *sh   = &s->sh;
     int i, ret;
 
@@ -454,6 +436,7 @@ static int hls_slice_header(HEVCContext *s)
         if (IS_IDR(s))
             ff_hevc_clear_refs(s);
     }
+    sh->no_output_of_prior_pics_flag = 0;
     if (IS_IRAP(s))
         sh->no_output_of_prior_pics_flag = get_bits1(gb);
 
@@ -468,12 +451,20 @@ static int hls_slice_header(HEVCContext *s)
         return AVERROR_INVALIDDATA;
     }
     s->ps.pps = (HEVCPPS*)s->ps.pps_list[sh->pps_id]->data;
+    if (s->nal_unit_type == NAL_CRA_NUT && s->last_eos == 1)
+        sh->no_output_of_prior_pics_flag = 1;
 
     if (s->ps.sps != (HEVCSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data) {
+        const HEVCSPS* last_sps = s->ps.sps;
         s->ps.sps = (HEVCSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data;
-
+        if (last_sps && IS_IRAP(s) && s->nal_unit_type != NAL_CRA_NUT) {
+            if (s->ps.sps->width !=  last_sps->width || s->ps.sps->height != last_sps->height ||
+                s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering !=
+                last_sps->temporal_layer[last_sps->max_sub_layers - 1].max_dec_pic_buffering)
+                sh->no_output_of_prior_pics_flag = 0;
+        }
         ff_hevc_clear_refs(s);
-        ret = set_sps(s, s->ps.sps);
+        ret = set_sps(s, s->ps.sps, AV_PIX_FMT_NONE);
         if (ret < 0)
             return ret;
 
@@ -490,7 +481,7 @@ static int hls_slice_header(HEVCContext *s)
 
         slice_address_length = av_ceil_log2(s->ps.sps->ctb_width *
                                             s->ps.sps->ctb_height);
-        sh->slice_segment_addr = slice_address_length ? get_bits(gb, slice_address_length) : 0;
+        sh->slice_segment_addr = get_bitsz(gb, slice_address_length);
         if (sh->slice_segment_addr >= s->ps.sps->ctb_width * s->ps.sps->ctb_height) {
             av_log(s->avctx, AV_LOG_ERROR,
                    "Invalid slice segment address: %u.\n",
@@ -602,8 +593,10 @@ static int hls_slice_header(HEVCContext *s)
 
         if (s->ps.sps->sao_enabled) {
             sh->slice_sample_adaptive_offset_flag[0] = get_bits1(gb);
-            sh->slice_sample_adaptive_offset_flag[1] =
-            sh->slice_sample_adaptive_offset_flag[2] = get_bits1(gb);
+            if (s->ps.sps->chroma_format_idc) {
+                sh->slice_sample_adaptive_offset_flag[1] =
+                sh->slice_sample_adaptive_offset_flag[2] = get_bits1(gb);
+            }
         } else {
             sh->slice_sample_adaptive_offset_flag[0] = 0;
             sh->slice_sample_adaptive_offset_flag[1] = 0;
@@ -701,6 +694,11 @@ static int hls_slice_header(HEVCContext *s)
             sh->slice_cr_qp_offset = 0;
         }
 
+        if (s->ps.pps->chroma_qp_offset_list_enabled_flag)
+            sh->cu_chroma_qp_offset_enabled_flag = get_bits1(gb);
+        else
+            sh->cu_chroma_qp_offset_enabled_flag = 0;
+
         if (s->ps.pps->deblocking_filter_control_present_flag) {
             int deblocking_filter_override_flag = 0;
 
@@ -739,23 +737,59 @@ static int hls_slice_header(HEVCContext *s)
 
     sh->num_entry_point_offsets = 0;
     if (s->ps.pps->tiles_enabled_flag || s->ps.pps->entropy_coding_sync_enabled_flag) {
-        sh->num_entry_point_offsets = get_ue_golomb_long(gb);
+        unsigned num_entry_point_offsets = get_ue_golomb_long(gb);
+        // A tighter bound would be possible, but this check is simpler
+        if (num_entry_point_offsets > get_bits_left(gb)) {
+            av_log(s->avctx, AV_LOG_ERROR, "num_entry_point_offsets %d is invalid\n", num_entry_point_offsets);
+            return AVERROR_INVALIDDATA;
+        }
+
+        sh->num_entry_point_offsets = num_entry_point_offsets;
         if (sh->num_entry_point_offsets > 0) {
             int offset_len = get_ue_golomb_long(gb) + 1;
 
-            for (i = 0; i < sh->num_entry_point_offsets; i++)
-                skip_bits(gb, offset_len);
-        }
+            if (offset_len < 1 || offset_len > 32) {
+                sh->num_entry_point_offsets = 0;
+                av_log(s->avctx, AV_LOG_ERROR, "offset_len %d is invalid\n", offset_len);
+                return AVERROR_INVALIDDATA;
+            }
+
+            av_freep(&sh->entry_point_offset);
+            av_freep(&sh->offset);
+            av_freep(&sh->size);
+            sh->entry_point_offset = av_malloc_array(sh->num_entry_point_offsets, sizeof(unsigned));
+            sh->offset = av_malloc_array(sh->num_entry_point_offsets, sizeof(int));
+            sh->size = av_malloc_array(sh->num_entry_point_offsets, sizeof(int));
+            if (!sh->entry_point_offset || !sh->offset || !sh->size) {
+                sh->num_entry_point_offsets = 0;
+                av_log(s->avctx, AV_LOG_ERROR, "Failed to allocate memory\n");
+                return AVERROR(ENOMEM);
+            }
+            for (i = 0; i < sh->num_entry_point_offsets; i++) {
+                unsigned val = get_bits_long(gb, offset_len);
+                sh->entry_point_offset[i] = val + 1; // +1 to get the size
+            }
+            if (s->threads_number > 1 && (s->ps.pps->num_tile_rows > 1 || s->ps.pps->num_tile_columns > 1)) {
+                s->enable_parallel_tiles = 0; // TODO: you can enable tiles in parallel here
+                s->threads_number = 1;
+            } else
+                s->enable_parallel_tiles = 0;
+        } else
+            s->enable_parallel_tiles = 0;
     }
 
     if (s->ps.pps->slice_header_extension_present_flag) {
         unsigned int length = get_ue_golomb_long(gb);
+        if (length*8LL > get_bits_left(gb)) {
+            av_log(s->avctx, AV_LOG_ERROR, "too many slice_header_extension_data_bytes\n");
+            return AVERROR_INVALIDDATA;
+        }
         for (i = 0; i < length; i++)
             skip_bits(gb, 8);  // slice_header_extension_data_byte
     }
 
     // Inferred parameters
-    sh->slice_qp = 26 + s->ps.pps->pic_init_qp_minus26 + sh->slice_qp_delta;
+    sh->slice_qp = 26U + s->ps.pps->pic_init_qp_minus26 + sh->slice_qp_delta;
     if (sh->slice_qp > 51 ||
         sh->slice_qp < -s->ps.sps->qp_bd_offset) {
         av_log(s->avctx, AV_LOG_ERROR,
@@ -773,13 +807,22 @@ static int hls_slice_header(HEVCContext *s)
         return AVERROR_INVALIDDATA;
     }
 
-    s->HEVClc.first_qp_group = !s->sh.dependent_slice_segment_flag;
+    if (get_bits_left(gb) < 0) {
+        av_log(s->avctx, AV_LOG_ERROR,
+               "Overread slice header by %d bits\n", -get_bits_left(gb));
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->HEVClc->first_qp_group = !s->sh.dependent_slice_segment_flag;
 
     if (!s->ps.pps->cu_qp_delta_enabled_flag)
-        s->HEVClc.qp_y = FFUMOD(s->sh.slice_qp + 52 + 2 * s->ps.sps->qp_bd_offset,
-                                52 + s->ps.sps->qp_bd_offset) - s->ps.sps->qp_bd_offset;
+        s->HEVClc->qp_y = s->sh.slice_qp;
 
     s->slice_initialized = 1;
+    s->HEVClc->tu.cu_qp_offset_cb = 0;
+    s->HEVClc->tu.cu_qp_offset_cr = 0;
+
+    s->no_rasl_output_flag = IS_IDR(s) || IS_BLA(s) || (s->nal_unit_type == NAL_CRA_NUT && s->last_eos);
 
     return 0;
 }
@@ -800,10 +843,9 @@ do {                                                    \
 
 static void hls_sao_param(HEVCContext *s, int rx, int ry)
 {
-    HEVCLocalContext *lc    = &s->HEVClc;
+    HEVCLocalContext *lc    = s->HEVClc;
     int sao_merge_left_flag = 0;
     int sao_merge_up_flag   = 0;
-    int shift               = s->ps.sps->bit_depth - FFMIN(s->ps.sps->bit_depth, 10);
     SAOParams *sao          = &CTB(s->sao, rx, ry);
     int c_idx, i;
 
@@ -819,7 +861,10 @@ static void hls_sao_param(HEVCContext *s, int rx, int ry)
         }
     }
 
-    for (c_idx = 0; c_idx < 3; c_idx++) {
+    for (c_idx = 0; c_idx < (s->ps.sps->chroma_format_idc ? 3 : 1); c_idx++) {
+        int log2_sao_offset_scale = c_idx == 0 ? s->ps.pps->log2_sao_offset_scale_luma :
+                                                 s->ps.pps->log2_sao_offset_scale_chroma;
+
         if (!s->sh.slice_sample_adaptive_offset_flag[c_idx]) {
             sao->type_idx[c_idx] = SAO_NOT_APPLIED;
             continue;
@@ -855,13 +900,14 @@ static void hls_sao_param(HEVCContext *s, int rx, int ry)
         // Inferred parameters
         sao->offset_val[c_idx][0] = 0;
         for (i = 0; i < 4; i++) {
-            sao->offset_val[c_idx][i + 1] = sao->offset_abs[c_idx][i] << shift;
+            sao->offset_val[c_idx][i + 1] = sao->offset_abs[c_idx][i];
             if (sao->type_idx[c_idx] == SAO_EDGE) {
                 if (i > 1)
                     sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1];
             } else if (sao->offset_sign[c_idx][i]) {
                 sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1];
             }
+            sao->offset_val[c_idx][i + 1] *= 1 << log2_sao_offset_scale;
         }
     }
 }
@@ -869,384 +915,45 @@ static void hls_sao_param(HEVCContext *s, int rx, int ry)
 #undef SET_SAO
 #undef CTB
 
-static void hls_residual_coding(HEVCContext *s, int x0, int y0,
-                                int log2_trafo_size, enum ScanType scan_idx,
-                                int c_idx)
-{
-#define GET_COORD(offset, n)                                    \
-    do {                                                        \
-        x_c = (scan_x_cg[offset >> 4] << 2) + scan_x_off[n];    \
-        y_c = (scan_y_cg[offset >> 4] << 2) + scan_y_off[n];    \
-    } while (0)
-    HEVCLocalContext *lc    = &s->HEVClc;
-    int transform_skip_flag = 0;
-
-    int last_significant_coeff_x, last_significant_coeff_y;
-    int last_scan_pos;
-    int n_end;
-    int num_coeff    = 0;
-    int greater1_ctx = 1;
-
-    int num_last_subset;
-    int x_cg_last_sig, y_cg_last_sig;
-
-    const uint8_t *scan_x_cg, *scan_y_cg, *scan_x_off, *scan_y_off;
-
-    ptrdiff_t stride = s->frame->linesize[c_idx];
-    int hshift       = s->ps.sps->hshift[c_idx];
-    int vshift       = s->ps.sps->vshift[c_idx];
-    uint8_t *dst     = &s->frame->data[c_idx][(y0 >> vshift) * stride +
-                                              ((x0 >> hshift) << s->ps.sps->pixel_shift)];
-    DECLARE_ALIGNED(16, int16_t, coeffs[MAX_TB_SIZE * MAX_TB_SIZE]) = { 0 };
-    DECLARE_ALIGNED(8, uint8_t, significant_coeff_group_flag[8][8]) = { { 0 } };
-
-    int trafo_size = 1 << log2_trafo_size;
-    int i, qp, shift, add, scale, scale_m;
-    const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 };
-    const uint8_t *scale_matrix;
-    uint8_t dc_scale;
-
-    // Derive QP for dequant
-    if (!lc->cu.cu_transquant_bypass_flag) {
-        static const int qp_c[] = {
-            29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37
-        };
-
-        static const uint8_t rem6[51 + 2 * 6 + 1] = {
-            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
-            3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
-            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
-        };
-
-        static const uint8_t div6[51 + 2 * 6 + 1] = {
-            0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,  3,  3,  3,
-            3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6,  6,  6,  6,
-            7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10,
-        };
-        int qp_y = lc->qp_y;
-
-        if (c_idx == 0) {
-            qp = qp_y + s->ps.sps->qp_bd_offset;
-        } else {
-            int qp_i, offset;
-
-            if (c_idx == 1)
-                offset = s->ps.pps->cb_qp_offset + s->sh.slice_cb_qp_offset;
-            else
-                offset = s->ps.pps->cr_qp_offset + s->sh.slice_cr_qp_offset;
-
-            qp_i = av_clip(qp_y + offset, -s->ps.sps->qp_bd_offset, 57);
-            if (qp_i < 30)
-                qp = qp_i;
-            else if (qp_i > 43)
-                qp = qp_i - 6;
-            else
-                qp = qp_c[qp_i - 30];
-
-            qp += s->ps.sps->qp_bd_offset;
-        }
-
-        shift    = s->ps.sps->bit_depth + log2_trafo_size - 5;
-        add      = 1 << (shift - 1);
-        scale    = level_scale[rem6[qp]] << (div6[qp]);
-        scale_m  = 16; // default when no custom scaling lists.
-        dc_scale = 16;
-
-        if (s->ps.sps->scaling_list_enable_flag) {
-            const ScalingList *sl = s->ps.pps->scaling_list_data_present_flag ?
-                                    &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
-            int matrix_id = lc->cu.pred_mode != MODE_INTRA;
-
-            if (log2_trafo_size != 5)
-                matrix_id = 3 * matrix_id + c_idx;
-
-            scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id];
-            if (log2_trafo_size >= 4)
-                dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
-        }
-    }
-
-    if (s->ps.pps->transform_skip_enabled_flag &&
-        !lc->cu.cu_transquant_bypass_flag   &&
-        log2_trafo_size == 2) {
-        transform_skip_flag = ff_hevc_transform_skip_flag_decode(s, c_idx);
-    }
-
-    last_significant_coeff_x =
-        ff_hevc_last_significant_coeff_x_prefix_decode(s, c_idx, log2_trafo_size);
-    last_significant_coeff_y =
-        ff_hevc_last_significant_coeff_y_prefix_decode(s, c_idx, log2_trafo_size);
-
-    if (last_significant_coeff_x > 3) {
-        int suffix = ff_hevc_last_significant_coeff_suffix_decode(s, last_significant_coeff_x);
-        last_significant_coeff_x = (1 << ((last_significant_coeff_x >> 1) - 1)) *
-                                   (2 + (last_significant_coeff_x & 1)) +
-                                   suffix;
-    }
+static int hls_cross_component_pred(HEVCContext *s, int idx) {
+    HEVCLocalContext *lc    = s->HEVClc;
+    int log2_res_scale_abs_plus1 = ff_hevc_log2_res_scale_abs(s, idx);
 
-    if (last_significant_coeff_y > 3) {
-        int suffix = ff_hevc_last_significant_coeff_suffix_decode(s, last_significant_coeff_y);
-        last_significant_coeff_y = (1 << ((last_significant_coeff_y >> 1) - 1)) *
-                                   (2 + (last_significant_coeff_y & 1)) +
-                                   suffix;
-    }
-
-    if (scan_idx == SCAN_VERT)
-        FFSWAP(int, last_significant_coeff_x, last_significant_coeff_y);
-
-    x_cg_last_sig = last_significant_coeff_x >> 2;
-    y_cg_last_sig = last_significant_coeff_y >> 2;
-
-    switch (scan_idx) {
-    case SCAN_DIAG: {
-        int last_x_c = last_significant_coeff_x & 3;
-        int last_y_c = last_significant_coeff_y & 3;
-
-        scan_x_off = ff_hevc_diag_scan4x4_x;
-        scan_y_off = ff_hevc_diag_scan4x4_y;
-        num_coeff  = diag_scan4x4_inv[last_y_c][last_x_c];
-        if (trafo_size == 4) {
-            scan_x_cg = scan_1x1;
-            scan_y_cg = scan_1x1;
-        } else if (trafo_size == 8) {
-            num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4;
-            scan_x_cg  = diag_scan2x2_x;
-            scan_y_cg  = diag_scan2x2_y;
-        } else if (trafo_size == 16) {
-            num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4;
-            scan_x_cg  = ff_hevc_diag_scan4x4_x;
-            scan_y_cg  = ff_hevc_diag_scan4x4_y;
-        } else { // trafo_size == 32
-            num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4;
-            scan_x_cg  = ff_hevc_diag_scan8x8_x;
-            scan_y_cg  = ff_hevc_diag_scan8x8_y;
-        }
-        break;
-    }
-    case SCAN_HORIZ:
-        scan_x_cg  = horiz_scan2x2_x;
-        scan_y_cg  = horiz_scan2x2_y;
-        scan_x_off = horiz_scan4x4_x;
-        scan_y_off = horiz_scan4x4_y;
-        num_coeff  = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x];
-        break;
-    default: //SCAN_VERT
-        scan_x_cg  = horiz_scan2x2_y;
-        scan_y_cg  = horiz_scan2x2_x;
-        scan_x_off = horiz_scan4x4_y;
-        scan_y_off = horiz_scan4x4_x;
-        num_coeff  = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y];
-        break;
+    if (log2_res_scale_abs_plus1 !=  0) {
+        int res_scale_sign_flag = ff_hevc_res_scale_sign_flag(s, idx);
+        lc->tu.res_scale_val = (1 << (log2_res_scale_abs_plus1 - 1)) *
+                               (1 - 2 * res_scale_sign_flag);
+    } else {
+        lc->tu.res_scale_val = 0;
     }
-    num_coeff++;
-    num_last_subset = (num_coeff - 1) >> 4;
-
-    for (i = num_last_subset; i >= 0; i--) {
-        int n, m;
-        int x_cg, y_cg, x_c, y_c;
-        int implicit_non_zero_coeff = 0;
-        int64_t trans_coeff_level;
-        int prev_sig = 0;
-        int offset   = i << 4;
-
-        uint8_t significant_coeff_flag_idx[16];
-        uint8_t nb_significant_coeff_flag = 0;
-
-        x_cg = scan_x_cg[i];
-        y_cg = scan_y_cg[i];
-
-        if (i < num_last_subset && i > 0) {
-            int ctx_cg = 0;
-            if (x_cg < (1 << (log2_trafo_size - 2)) - 1)
-                ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg];
-            if (y_cg < (1 << (log2_trafo_size - 2)) - 1)
-                ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1];
-
-            significant_coeff_group_flag[x_cg][y_cg] =
-                ff_hevc_significant_coeff_group_flag_decode(s, c_idx, ctx_cg);
-            implicit_non_zero_coeff = 1;
-        } else {
-            significant_coeff_group_flag[x_cg][y_cg] =
-                ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) ||
-                 (x_cg == 0 && y_cg == 0));
-        }
-
-        last_scan_pos = num_coeff - offset - 1;
-
-        if (i == num_last_subset) {
-            n_end                         = last_scan_pos - 1;
-            significant_coeff_flag_idx[0] = last_scan_pos;
-            nb_significant_coeff_flag     = 1;
-        } else {
-            n_end = 15;
-        }
-
-        if (x_cg < ((1 << log2_trafo_size) - 1) >> 2)
-            prev_sig = significant_coeff_group_flag[x_cg + 1][y_cg];
-        if (y_cg < ((1 << log2_trafo_size) - 1) >> 2)
-            prev_sig += significant_coeff_group_flag[x_cg][y_cg + 1] << 1;
-
-        for (n = n_end; n >= 0; n--) {
-            GET_COORD(offset, n);
-
-            if (significant_coeff_group_flag[x_cg][y_cg] &&
-                (n > 0 || implicit_non_zero_coeff == 0)) {
-                if (ff_hevc_significant_coeff_flag_decode(s, c_idx, x_c, y_c,
-                                                          log2_trafo_size,
-                                                          scan_idx,
-                                                          prev_sig) == 1) {
-                    significant_coeff_flag_idx[nb_significant_coeff_flag] = n;
-                    nb_significant_coeff_flag++;
-                    implicit_non_zero_coeff = 0;
-                }
-            } else {
-                int last_cg = (x_c == (x_cg << 2) && y_c == (y_cg << 2));
-                if (last_cg && implicit_non_zero_coeff && significant_coeff_group_flag[x_cg][y_cg]) {
-                    significant_coeff_flag_idx[nb_significant_coeff_flag] = n;
-                    nb_significant_coeff_flag++;
-                }
-            }
-        }
-
-        n_end = nb_significant_coeff_flag;
-
-        if (n_end) {
-            int first_nz_pos_in_cg = 16;
-            int last_nz_pos_in_cg = -1;
-            int c_rice_param = 0;
-            int first_greater1_coeff_idx = -1;
-            uint8_t coeff_abs_level_greater1_flag[16] = { 0 };
-            uint16_t coeff_sign_flag;
-            int sum_abs = 0;
-            int sign_hidden = 0;
-
-            // initialize first elem of coeff_bas_level_greater1_flag
-            int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0;
-
-            if (!(i == num_last_subset) && greater1_ctx == 0)
-                ctx_set++;
-            greater1_ctx      = 1;
-            last_nz_pos_in_cg = significant_coeff_flag_idx[0];
-
-            for (m = 0; m < (n_end > 8 ? 8 : n_end); m++) {
-                int n_idx = significant_coeff_flag_idx[m];
-                int inc   = (ctx_set << 2) + greater1_ctx;
-                coeff_abs_level_greater1_flag[n_idx] =
-                    ff_hevc_coeff_abs_level_greater1_flag_decode(s, c_idx, inc);
-                if (coeff_abs_level_greater1_flag[n_idx]) {
-                    greater1_ctx = 0;
-                } else if (greater1_ctx > 0 && greater1_ctx < 3) {
-                    greater1_ctx++;
-                }
 
-                if (coeff_abs_level_greater1_flag[n_idx] &&
-                    first_greater1_coeff_idx == -1)
-                    first_greater1_coeff_idx = n_idx;
-            }
-            first_nz_pos_in_cg = significant_coeff_flag_idx[n_end - 1];
-            sign_hidden        = last_nz_pos_in_cg - first_nz_pos_in_cg >= 4 &&
-                                 !lc->cu.cu_transquant_bypass_flag;
-
-            if (first_greater1_coeff_idx != -1) {
-                coeff_abs_level_greater1_flag[first_greater1_coeff_idx] += ff_hevc_coeff_abs_level_greater2_flag_decode(s, c_idx, ctx_set);
-            }
-            if (!s->ps.pps->sign_data_hiding_flag || !sign_hidden) {
-                coeff_sign_flag = ff_hevc_coeff_sign_flag(s, nb_significant_coeff_flag) << (16 - nb_significant_coeff_flag);
-            } else {
-                coeff_sign_flag = ff_hevc_coeff_sign_flag(s, nb_significant_coeff_flag - 1) << (16 - (nb_significant_coeff_flag - 1));
-            }
-
-            for (m = 0; m < n_end; m++) {
-                n = significant_coeff_flag_idx[m];
-                GET_COORD(offset, n);
-                trans_coeff_level = 1 + coeff_abs_level_greater1_flag[n];
-                if (trans_coeff_level == ((m < 8) ?
-                                          ((n == first_greater1_coeff_idx) ? 3 : 2) : 1)) {
-                    int last_coeff_abs_level_remaining = ff_hevc_coeff_abs_level_remaining(s, trans_coeff_level, c_rice_param);
-
-                    trans_coeff_level += last_coeff_abs_level_remaining;
-                    if ((trans_coeff_level) > (3 * (1 << c_rice_param)))
-                        c_rice_param = FFMIN(c_rice_param + 1, 4);
-                }
-                if (s->ps.pps->sign_data_hiding_flag && sign_hidden) {
-                    sum_abs += trans_coeff_level;
-                    if (n == first_nz_pos_in_cg && ((sum_abs & 1) == 1))
-                        trans_coeff_level = -trans_coeff_level;
-                }
-                if (coeff_sign_flag >> 15)
-                    trans_coeff_level = -trans_coeff_level;
-                coeff_sign_flag <<= 1;
-                if (!lc->cu.cu_transquant_bypass_flag) {
-                    if (s->ps.sps->scaling_list_enable_flag) {
-                        if (y_c || x_c || log2_trafo_size < 4) {
-                            int pos;
-                            switch (log2_trafo_size) {
-                            case 3:  pos = (y_c        << 3) +  x_c;       break;
-                            case 4:  pos = ((y_c >> 1) << 3) + (x_c >> 1); break;
-                            case 5:  pos = ((y_c >> 2) << 3) + (x_c >> 2); break;
-                            default: pos = (y_c        << 2) +  x_c;
-                            }
-                            scale_m = scale_matrix[pos];
-                        } else {
-                            scale_m = dc_scale;
-                        }
-                    }
-                    trans_coeff_level = (trans_coeff_level * (int64_t)scale * (int64_t)scale_m + add) >> shift;
-                    if(trans_coeff_level < 0) {
-                        if((~trans_coeff_level) & 0xFffffffffff8000)
-                            trans_coeff_level = -32768;
-                    } else {
-                        if (trans_coeff_level & 0xffffffffffff8000)
-                            trans_coeff_level = 32767;
-                    }
-                }
-                coeffs[y_c * trafo_size + x_c] = trans_coeff_level;
-            }
-        }
-    }
 
-    if (lc->cu.cu_transquant_bypass_flag) {
-        s->hevcdsp.transquant_bypass[log2_trafo_size - 2](dst, coeffs, stride);
-    } else {
-        if (transform_skip_flag)
-            s->hevcdsp.transform_skip(dst, coeffs, stride);
-        else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 &&
-                 log2_trafo_size == 2)
-            s->hevcdsp.transform_4x4_luma_add(dst, coeffs, stride);
-        else
-            s->hevcdsp.transform_add[log2_trafo_size - 2](dst, coeffs, stride);
-    }
+    return 0;
 }
 
 static int hls_transform_unit(HEVCContext *s, int x0, int y0,
                               int xBase, int yBase, int cb_xBase, int cb_yBase,
                               int log2_cb_size, int log2_trafo_size,
-                              int blk_idx, int cbf_luma, int cbf_cb, int cbf_cr)
+                              int blk_idx, int cbf_luma, int *cbf_cb, int *cbf_cr)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
+    const int log2_trafo_size_c = log2_trafo_size - s->ps.sps->hshift[1];
+    int i;
 
     if (lc->cu.pred_mode == MODE_INTRA) {
         int trafo_size = 1 << log2_trafo_size;
         ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size);
 
         s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, 0);
-        if (log2_trafo_size > 2) {
-            trafo_size = trafo_size << (s->ps.sps->hshift[1] - 1);
-            ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size);
-            s->hpc.intra_pred[log2_trafo_size - 3](s, x0, y0, 1);
-            s->hpc.intra_pred[log2_trafo_size - 3](s, x0, y0, 2);
-        } else if (blk_idx == 3) {
-            trafo_size = trafo_size << s->ps.sps->hshift[1];
-            ff_hevc_set_neighbour_available(s, xBase, yBase,
-                                            trafo_size, trafo_size);
-            s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 1);
-            s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 2);
-        }
     }
 
-    if (cbf_luma || cbf_cb || cbf_cr) {
+    if (cbf_luma || cbf_cb[0] || cbf_cr[0] ||
+        (s->ps.sps->chroma_format_idc == 2 && (cbf_cb[1] || cbf_cr[1]))) {
         int scan_idx   = SCAN_DIAG;
         int scan_idx_c = SCAN_DIAG;
+        int cbf_chroma = cbf_cb[0] || cbf_cr[0] ||
+                         (s->ps.sps->chroma_format_idc == 2 &&
+                         (cbf_cb[1] || cbf_cr[1]));
 
         if (s->ps.pps->cu_qp_delta_enabled_flag && !lc->tu.is_cu_qp_delta_coded) {
             lc->tu.cu_qp_delta = ff_hevc_cu_qp_delta_abs(s);
@@ -1266,41 +973,167 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
                 return AVERROR_INVALIDDATA;
             }
 
-            ff_hevc_set_qPy(s, x0, y0, cb_xBase, cb_yBase, log2_cb_size);
+            ff_hevc_set_qPy(s, cb_xBase, cb_yBase, log2_cb_size);
+        }
+
+        if (s->sh.cu_chroma_qp_offset_enabled_flag && cbf_chroma &&
+            !lc->cu.cu_transquant_bypass_flag  &&  !lc->tu.is_cu_chroma_qp_offset_coded) {
+            int cu_chroma_qp_offset_flag = ff_hevc_cu_chroma_qp_offset_flag(s);
+            if (cu_chroma_qp_offset_flag) {
+                int cu_chroma_qp_offset_idx  = 0;
+                if (s->ps.pps->chroma_qp_offset_list_len_minus1 > 0) {
+                    cu_chroma_qp_offset_idx = ff_hevc_cu_chroma_qp_offset_idx(s);
+                    av_log(s->avctx, AV_LOG_ERROR,
+                        "cu_chroma_qp_offset_idx not yet tested.\n");
+                }
+                lc->tu.cu_qp_offset_cb = s->ps.pps->cb_qp_offset_list[cu_chroma_qp_offset_idx];
+                lc->tu.cu_qp_offset_cr = s->ps.pps->cr_qp_offset_list[cu_chroma_qp_offset_idx];
+            } else {
+                lc->tu.cu_qp_offset_cb = 0;
+                lc->tu.cu_qp_offset_cr = 0;
+            }
+            lc->tu.is_cu_chroma_qp_offset_coded = 1;
         }
 
         if (lc->cu.pred_mode == MODE_INTRA && log2_trafo_size < 4) {
-            if (lc->tu.cur_intra_pred_mode >= 6 &&
-                lc->tu.cur_intra_pred_mode <= 14) {
+            if (lc->tu.intra_pred_mode >= 6 &&
+                lc->tu.intra_pred_mode <= 14) {
                 scan_idx = SCAN_VERT;
-            } else if (lc->tu.cur_intra_pred_mode >= 22 &&
-                       lc->tu.cur_intra_pred_mode <= 30) {
+            } else if (lc->tu.intra_pred_mode >= 22 &&
+                       lc->tu.intra_pred_mode <= 30) {
                 scan_idx = SCAN_HORIZ;
             }
 
-            if (lc->pu.intra_pred_mode_c >=  6 &&
-                lc->pu.intra_pred_mode_c <= 14) {
+            if (lc->tu.intra_pred_mode_c >=  6 &&
+                lc->tu.intra_pred_mode_c <= 14) {
                 scan_idx_c = SCAN_VERT;
-            } else if (lc->pu.intra_pred_mode_c >= 22 &&
-                       lc->pu.intra_pred_mode_c <= 30) {
+            } else if (lc->tu.intra_pred_mode_c >= 22 &&
+                       lc->tu.intra_pred_mode_c <= 30) {
                 scan_idx_c = SCAN_HORIZ;
             }
         }
 
+        lc->tu.cross_pf = 0;
+
         if (cbf_luma)
-            hls_residual_coding(s, x0, y0, log2_trafo_size, scan_idx, 0);
-        if (log2_trafo_size > 2) {
-            if (cbf_cb)
-                hls_residual_coding(s, x0, y0, log2_trafo_size - 1, scan_idx_c, 1);
-            if (cbf_cr)
-                hls_residual_coding(s, x0, y0, log2_trafo_size - 1, scan_idx_c, 2);
+            ff_hevc_hls_residual_coding(s, x0, y0, log2_trafo_size, scan_idx, 0);
+        if (s->ps.sps->chroma_format_idc && (log2_trafo_size > 2 || s->ps.sps->chroma_format_idc == 3)) {
+            int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]);
+            int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]);
+            lc->tu.cross_pf  = (s->ps.pps->cross_component_prediction_enabled_flag && cbf_luma &&
+                                (lc->cu.pred_mode == MODE_INTER ||
+                                 (lc->tu.chroma_mode_c ==  4)));
+
+            if (lc->tu.cross_pf) {
+                hls_cross_component_pred(s, 0);
+            }
+            for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+                if (lc->cu.pred_mode == MODE_INTRA) {
+                    ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+                    s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 1);
+                }
+                if (cbf_cb[i])
+                    ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
+                                                log2_trafo_size_c, scan_idx_c, 1);
+                else
+                    if (lc->tu.cross_pf) {
+                        ptrdiff_t stride = s->frame->linesize[1];
+                        int hshift = s->ps.sps->hshift[1];
+                        int vshift = s->ps.sps->vshift[1];
+                        int16_t *coeffs_y = (int16_t*)lc->edge_emu_buffer;
+                        int16_t *coeffs   = (int16_t*)lc->edge_emu_buffer2;
+                        int size = 1 << log2_trafo_size_c;
+
+                        uint8_t *dst = &s->frame->data[1][(y0 >> vshift) * stride +
+                                                              ((x0 >> hshift) << s->ps.sps->pixel_shift)];
+                        for (i = 0; i < (size * size); i++) {
+                            coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
+                        }
+                        s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
+                    }
+            }
+
+            if (lc->tu.cross_pf) {
+                hls_cross_component_pred(s, 1);
+            }
+            for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+                if (lc->cu.pred_mode == MODE_INTRA) {
+                    ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+                    s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 2);
+                }
+                if (cbf_cr[i])
+                    ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
+                                                log2_trafo_size_c, scan_idx_c, 2);
+                else
+                    if (lc->tu.cross_pf) {
+                        ptrdiff_t stride = s->frame->linesize[2];
+                        int hshift = s->ps.sps->hshift[2];
+                        int vshift = s->ps.sps->vshift[2];
+                        int16_t *coeffs_y = (int16_t*)lc->edge_emu_buffer;
+                        int16_t *coeffs   = (int16_t*)lc->edge_emu_buffer2;
+                        int size = 1 << log2_trafo_size_c;
+
+                        uint8_t *dst = &s->frame->data[2][(y0 >> vshift) * stride +
+                                                          ((x0 >> hshift) << s->ps.sps->pixel_shift)];
+                        for (i = 0; i < (size * size); i++) {
+                            coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
+                        }
+                        s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
+                    }
+            }
+        } else if (s->ps.sps->chroma_format_idc && blk_idx == 3) {
+            int trafo_size_h = 1 << (log2_trafo_size + 1);
+            int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]);
+            for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+                if (lc->cu.pred_mode == MODE_INTRA) {
+                    ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
+                                                    trafo_size_h, trafo_size_v);
+                    s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 1);
+                }
+                if (cbf_cb[i])
+                    ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
+                                                log2_trafo_size, scan_idx_c, 1);
+            }
+            for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+                if (lc->cu.pred_mode == MODE_INTRA) {
+                    ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
+                                                trafo_size_h, trafo_size_v);
+                    s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 2);
+                }
+                if (cbf_cr[i])
+                    ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
+                                                log2_trafo_size, scan_idx_c, 2);
+            }
+        }
+    } else if (s->ps.sps->chroma_format_idc && lc->cu.pred_mode == MODE_INTRA) {
+        if (log2_trafo_size > 2 || s->ps.sps->chroma_format_idc == 3) {
+            int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]);
+            int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]);
+            ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v);
+            s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 1);
+            s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 2);
+            if (s->ps.sps->chroma_format_idc == 2) {
+                ff_hevc_set_neighbour_available(s, x0, y0 + (1 << log2_trafo_size_c),
+                                                trafo_size_h, trafo_size_v);
+                s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 1);
+                s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 2);
+            }
         } else if (blk_idx == 3) {
-            if (cbf_cb)
-                hls_residual_coding(s, xBase, yBase, log2_trafo_size, scan_idx_c, 1);
-            if (cbf_cr)
-                hls_residual_coding(s, xBase, yBase, log2_trafo_size, scan_idx_c, 2);
+            int trafo_size_h = 1 << (log2_trafo_size + 1);
+            int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]);
+            ff_hevc_set_neighbour_available(s, xBase, yBase,
+                                            trafo_size_h, trafo_size_v);
+            s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 1);
+            s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 2);
+            if (s->ps.sps->chroma_format_idc == 2) {
+                ff_hevc_set_neighbour_available(s, xBase, yBase + (1 << (log2_trafo_size)),
+                                                trafo_size_h, trafo_size_v);
+                s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 1);
+                s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 2);
+            }
         }
     }
+
     return 0;
 }
 
@@ -1323,17 +1156,34 @@ static int hls_transform_tree(HEVCContext *s, int x0, int y0,
                               int xBase, int yBase, int cb_xBase, int cb_yBase,
                               int log2_cb_size, int log2_trafo_size,
                               int trafo_depth, int blk_idx,
-                              int cbf_cb, int cbf_cr)
+                              const int *base_cbf_cb, const int *base_cbf_cr)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     uint8_t split_transform_flag;
+    int cbf_cb[2];
+    int cbf_cr[2];
     int ret;
 
+    cbf_cb[0] = base_cbf_cb[0];
+    cbf_cb[1] = base_cbf_cb[1];
+    cbf_cr[0] = base_cbf_cr[0];
+    cbf_cr[1] = base_cbf_cr[1];
+
     if (lc->cu.intra_split_flag) {
-        if (trafo_depth == 1)
-            lc->tu.cur_intra_pred_mode = lc->pu.intra_pred_mode[blk_idx];
+        if (trafo_depth == 1) {
+            lc->tu.intra_pred_mode   = lc->pu.intra_pred_mode[blk_idx];
+            if (s->ps.sps->chroma_format_idc == 3) {
+                lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[blk_idx];
+                lc->tu.chroma_mode_c     = lc->pu.chroma_mode_c[blk_idx];
+            } else {
+                lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0];
+                lc->tu.chroma_mode_c     = lc->pu.chroma_mode_c[0];
+            }
+        }
     } else {
-        lc->tu.cur_intra_pred_mode = lc->pu.intra_pred_mode[0];
+        lc->tu.intra_pred_mode   = lc->pu.intra_pred_mode[0];
+        lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0];
+        lc->tu.chroma_mode_c     = lc->pu.chroma_mode_c[0];
     }
 
     if (log2_trafo_size <= s->ps.sps->log2_max_trafo_size &&
@@ -1352,14 +1202,21 @@ static int hls_transform_tree(HEVCContext *s, int x0, int y0,
                                inter_split;
     }
 
-    if (log2_trafo_size > 2 && (trafo_depth == 0 || cbf_cb))
-        cbf_cb = ff_hevc_cbf_cb_cr_decode(s, trafo_depth);
-    else if (log2_trafo_size > 2 || trafo_depth == 0)
-        cbf_cb = 0;
-    if (log2_trafo_size > 2 && (trafo_depth == 0 || cbf_cr))
-        cbf_cr = ff_hevc_cbf_cb_cr_decode(s, trafo_depth);
-    else if (log2_trafo_size > 2 || trafo_depth == 0)
-        cbf_cr = 0;
+    if (s->ps.sps->chroma_format_idc && (log2_trafo_size > 2 || s->ps.sps->chroma_format_idc == 3)) {
+        if (trafo_depth == 0 || cbf_cb[0]) {
+            cbf_cb[0] = ff_hevc_cbf_cb_cr_decode(s, trafo_depth);
+            if (s->ps.sps->chroma_format_idc == 2 && (!split_transform_flag || log2_trafo_size == 3)) {
+                cbf_cb[1] = ff_hevc_cbf_cb_cr_decode(s, trafo_depth);
+            }
+        }
+
+        if (trafo_depth == 0 || cbf_cr[0]) {
+            cbf_cr[0] = ff_hevc_cbf_cb_cr_decode(s, trafo_depth);
+            if (s->ps.sps->chroma_format_idc == 2 && (!split_transform_flag || log2_trafo_size == 3)) {
+                cbf_cr[1] = ff_hevc_cbf_cb_cr_decode(s, trafo_depth);
+            }
+        }
+    }
 
     if (split_transform_flag) {
         const int trafo_size_split = 1 << (log2_trafo_size - 1);
@@ -1388,8 +1245,10 @@ do {
         int cbf_luma         = 1;
 
         if (lc->cu.pred_mode == MODE_INTRA || trafo_depth != 0 ||
-            cbf_cb || cbf_cr)
+            cbf_cb[0] || cbf_cr[0] ||
+            (s->ps.sps->chroma_format_idc == 2 && (cbf_cb[1] || cbf_cr[1]))) {
             cbf_luma = ff_hevc_cbf_luma_decode(s, trafo_depth);
+        }
 
         ret = hls_transform_unit(s, x0, y0, xBase, yBase, cb_xBase, cb_yBase,
                                  log2_cb_size, log2_trafo_size,
@@ -1418,8 +1277,7 @@ do {
 
 static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
 {
-    //TODO: non-4:2:0 support
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     GetBitContext gb;
     int cb_size   = 1 << log2_cb_size;
     int stride0   = s->frame->linesize[0];
@@ -1429,7 +1287,10 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
     int   stride2 = s->frame->linesize[2];
     uint8_t *dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)];
 
-    int length         = cb_size * cb_size * s->ps.sps->pcm.bit_depth + ((cb_size * cb_size) >> 1) * s->ps.sps->pcm.bit_depth_chroma;
+    int length         = cb_size * cb_size * s->ps.sps->pcm.bit_depth +
+                         (((cb_size >> s->ps.sps->hshift[1]) * (cb_size >> s->ps.sps->vshift[1])) +
+                          ((cb_size >> s->ps.sps->hshift[2]) * (cb_size >> s->ps.sps->vshift[2]))) *
+                          s->ps.sps->pcm.bit_depth_chroma;
     const uint8_t *pcm = skip_bytes(&lc->cc, (length + 7) >> 3);
     int ret;
 
@@ -1440,38 +1301,23 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
     if (ret < 0)
         return ret;
 
-    s->hevcdsp.put_pcm(dst0, stride0, cb_size,     &gb, s->ps.sps->pcm.bit_depth);
-    s->hevcdsp.put_pcm(dst1, stride1, cb_size / 2, &gb, s->ps.sps->pcm.bit_depth_chroma);
-    s->hevcdsp.put_pcm(dst2, stride2, cb_size / 2, &gb, s->ps.sps->pcm.bit_depth_chroma);
-    return 0;
-}
-
-static void hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size)
-{
-    HEVCLocalContext *lc = &s->HEVClc;
-    int x = ff_hevc_abs_mvd_greater0_flag_decode(s);
-    int y = ff_hevc_abs_mvd_greater0_flag_decode(s);
-
-    if (x)
-        x += ff_hevc_abs_mvd_greater1_flag_decode(s);
-    if (y)
-        y += ff_hevc_abs_mvd_greater1_flag_decode(s);
-
-    switch (x) {
-    case 2: lc->pu.mvd.x = ff_hevc_mvd_decode(s);           break;
-    case 1: lc->pu.mvd.x = ff_hevc_mvd_sign_flag_decode(s); break;
-    case 0: lc->pu.mvd.x = 0;                               break;
+    s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size,     &gb, s->ps.sps->pcm.bit_depth);
+    if (s->ps.sps->chroma_format_idc) {
+        s->hevcdsp.put_pcm(dst1, stride1,
+                           cb_size >> s->ps.sps->hshift[1],
+                           cb_size >> s->ps.sps->vshift[1],
+                           &gb, s->ps.sps->pcm.bit_depth_chroma);
+        s->hevcdsp.put_pcm(dst2, stride2,
+                           cb_size >> s->ps.sps->hshift[2],
+                           cb_size >> s->ps.sps->vshift[2],
+                           &gb, s->ps.sps->pcm.bit_depth_chroma);
     }
 
-    switch (y) {
-    case 2: lc->pu.mvd.y = ff_hevc_mvd_decode(s);           break;
-    case 1: lc->pu.mvd.y = ff_hevc_mvd_sign_flag_decode(s); break;
-    case 0: lc->pu.mvd.y = 0;                               break;
-    }
+    return 0;
 }
 
 /**
- * 8.5.3.2.2.1 Luma sample interpolation process
+ * 8.5.3.2.2.1 Luma sample unidirectional interpolation process
  *
  * @param s HEVC decoding context
  * @param dst target buffer for block data at block position
@@ -1482,49 +1328,147 @@ static void hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size)
  * @param y_off vertical position of block from origin (0, 0)
  * @param block_w width of block
  * @param block_h height of block
+ * @param luma_weight weighting factor applied to the luma prediction
+ * @param luma_offset additive offset applied to the luma prediction value
  */
-static void luma_mc(HEVCContext *s, int16_t *dst, ptrdiff_t dststride,
-                    AVFrame *ref, const Mv *mv, int x_off, int y_off,
-                    int block_w, int block_h, int pred_idx)
+
+static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+                        AVFrame *ref, const Mv *mv, int x_off, int y_off,
+                        int block_w, int block_h, int luma_weight, int luma_offset)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     uint8_t *src         = ref->data[0];
     ptrdiff_t srcstride  = ref->linesize[0];
     int pic_width        = s->ps.sps->width;
     int pic_height       = s->ps.sps->height;
-
-    int mx         = mv->x & 3;
-    int my         = mv->y & 3;
-    int extra_left = ff_hevc_qpel_extra_before[mx];
-    int extra_top  = ff_hevc_qpel_extra_before[my];
+    int mx               = mv->x & 3; /* low two bits of the MV component; the remaining bits are added to x_off below */
+    int my               = mv->y & 3; /* same split for the vertical component */
+    int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) || /* PPS weighting flag matching the slice type */
+                           (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag); /* selects the _uni_w call at the end */
+    int idx              = ff_hevc_pel_weight[block_w]; /* looked up from block_w; first index into the put_hevc_qpel_uni* tables */
 
     x_off += mv->x >> 2;
     y_off += mv->y >> 2;
     src   += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
 
-    if (x_off < extra_left || y_off < extra_top ||
-        x_off >= pic_width - block_w - ff_hevc_qpel_extra_after[mx] ||
-        y_off >= pic_height - block_h - ff_hevc_qpel_extra_after[my]) {
+    if (x_off < QPEL_EXTRA_BEFORE || y_off < QPEL_EXTRA_AFTER || /* NOTE(review): top bound uses _AFTER while the fetch below backs up by _BEFORE - confirm intentional */
+        x_off >= pic_width - block_w - QPEL_EXTRA_AFTER ||
+        y_off >= pic_height - block_h - QPEL_EXTRA_AFTER) {
         const int edge_emu_stride = EDGE_EMU_BUFFER_STRIDE << s->ps.sps->pixel_shift;
-        int offset = extra_top * srcstride + (extra_left << s->ps.sps->pixel_shift);
-        int buf_offset = extra_top *
-                         edge_emu_stride + (extra_left << s->ps.sps->pixel_shift);
+        int offset     = QPEL_EXTRA_BEFORE * srcstride       + (QPEL_EXTRA_BEFORE << s->ps.sps->pixel_shift); /* back up QPEL_EXTRA_BEFORE rows and columns in the reference plane */
+        int buf_offset = QPEL_EXTRA_BEFORE * edge_emu_stride + (QPEL_EXTRA_BEFORE << s->ps.sps->pixel_shift); /* same displacement inside the scratch buffer, so src still addresses the block origin */
 
         s->vdsp.emulated_edge_mc(lc->edge_emu_buffer, src - offset,
                                  edge_emu_stride, srcstride,
-                                 block_w + ff_hevc_qpel_extra[mx],
-                                 block_h + ff_hevc_qpel_extra[my],
-                                 x_off - extra_left, y_off - extra_top,
+                                 block_w + QPEL_EXTRA, /* fixed margin, independent of mx */
+                                 block_h + QPEL_EXTRA, /* fixed margin, independent of my */
+                                 x_off - QPEL_EXTRA_BEFORE, y_off - QPEL_EXTRA_BEFORE, /* top-left corner of the fetched region */
                                  pic_width, pic_height);
         src = lc->edge_emu_buffer + buf_offset;
         srcstride = edge_emu_stride;
     }
-    s->hevcdsp.put_hevc_qpel[!!my][!!mx][pred_idx](dst, dststride, src, srcstride,
-                                                   block_h, mx, my, lc->mc_buffer);
+
+    if (!weight_flag) /* unweighted path: luma_weight and luma_offset are not used */
+        s->hevcdsp.put_hevc_qpel_uni[idx][!!my][!!mx](dst, dststride, src, srcstride,
+                                                      block_h, mx, my, block_w);
+    else /* weighted path: forwards the slice-header denominator plus the caller's weight and offset */
+        s->hevcdsp.put_hevc_qpel_uni_w[idx][!!my][!!mx](dst, dststride, src, srcstride,
+                                                        block_h, s->sh.luma_log2_weight_denom,
+                                                        luma_weight, luma_offset, mx, my, block_w);
+}
+
+/**
+ * 8.5.3.2.2.1 Luma sample bidirectional interpolation process
+ *
+ * @param s HEVC decoding context
+ * @param dst target buffer for block data at block position
+ * @param dststride stride of the dst buffer
+ * @param ref0 reference picture0 buffer at origin (0, 0)
+ * @param mv0 motion vector0 (relative to block position) to get pixel data from
+ * @param x_off horizontal position of block from origin (0, 0)
+ * @param y_off vertical position of block from origin (0, 0)
+ * @param block_w width of block
+ * @param block_h height of block
+ * @param ref1 reference picture1 buffer at origin (0, 0)
+ * @param mv1 motion vector1 (relative to block position) to get pixel data from
+ * @param current_mv current motion vector structure
+ */
+ static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+                       AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
+                       int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
+{
+    HEVCLocalContext *lc = s->HEVClc;
+    ptrdiff_t src0stride  = ref0->linesize[0];
+    ptrdiff_t src1stride  = ref1->linesize[0];
+    int pic_width        = s->ps.sps->width;
+    int pic_height       = s->ps.sps->height;
+    int mx0              = mv0->x & 3; /* low two bits of each MV component; the remaining bits feed x_off0/y_off0/x_off1/y_off1 */
+    int my0              = mv0->y & 3;
+    int mx1              = mv1->x & 3;
+    int my1              = mv1->y & 3;
+    int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) || /* PPS weighting flag matching the slice type */
+                           (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag); /* selects the _bi_w call at the end */
+    int x_off0           = x_off + (mv0->x >> 2); /* block position displaced by the upper bits of mv0 */
+    int y_off0           = y_off + (mv0->y >> 2);
+    int x_off1           = x_off + (mv1->x >> 2); /* block position displaced by the upper bits of mv1 */
+    int y_off1           = y_off + (mv1->y >> 2);
+    int idx              = ff_hevc_pel_weight[block_w]; /* looked up from block_w; first index into the put_hevc_qpel* tables */
+
+    uint8_t *src0  = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift); /* shift is done on unsigned, then cast back: x_off0 may be negative */
+    uint8_t *src1  = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift); /* same for x_off1 */
+
+    if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER || /* NOTE(review): top bound uses _AFTER while the fetch below backs up by _BEFORE - confirm intentional */
+        x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER ||
+        y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) {
+        const int edge_emu_stride = EDGE_EMU_BUFFER_STRIDE << s->ps.sps->pixel_shift;
+        int offset     = QPEL_EXTRA_BEFORE * src0stride       + (QPEL_EXTRA_BEFORE << s->ps.sps->pixel_shift); /* back up QPEL_EXTRA_BEFORE rows and columns in ref0 */
+        int buf_offset = QPEL_EXTRA_BEFORE * edge_emu_stride + (QPEL_EXTRA_BEFORE << s->ps.sps->pixel_shift); /* same displacement inside the scratch buffer */
+
+        s->vdsp.emulated_edge_mc(lc->edge_emu_buffer, src0 - offset, /* ref0 region goes to the first scratch buffer */
+                                 edge_emu_stride, src0stride,
+                                 block_w + QPEL_EXTRA,
+                                 block_h + QPEL_EXTRA,
+                                 x_off0 - QPEL_EXTRA_BEFORE, y_off0 - QPEL_EXTRA_BEFORE,
+                                 pic_width, pic_height);
+        src0 = lc->edge_emu_buffer + buf_offset;
+        src0stride = edge_emu_stride;
+    }
+
+    if (x_off1 < QPEL_EXTRA_BEFORE || y_off1 < QPEL_EXTRA_AFTER || /* NOTE(review): same _AFTER/_BEFORE asymmetry as the ref0 check above */
+        x_off1 >= pic_width - block_w - QPEL_EXTRA_AFTER ||
+        y_off1 >= pic_height - block_h - QPEL_EXTRA_AFTER) {
+        const int edge_emu_stride = EDGE_EMU_BUFFER_STRIDE << s->ps.sps->pixel_shift;
+        int offset     = QPEL_EXTRA_BEFORE * src1stride       + (QPEL_EXTRA_BEFORE << s->ps.sps->pixel_shift); /* back up QPEL_EXTRA_BEFORE rows and columns in ref1 */
+        int buf_offset = QPEL_EXTRA_BEFORE * edge_emu_stride + (QPEL_EXTRA_BEFORE << s->ps.sps->pixel_shift); /* same displacement inside the scratch buffer */
+
+        s->vdsp.emulated_edge_mc(lc->edge_emu_buffer2, src1 - offset, /* second scratch buffer, so the ref0 copy in edge_emu_buffer is not overwritten */
+                                 edge_emu_stride, src1stride,
+                                 block_w + QPEL_EXTRA,
+                                 block_h + QPEL_EXTRA,
+                                 x_off1 - QPEL_EXTRA_BEFORE, y_off1 - QPEL_EXTRA_BEFORE,
+                                 pic_width, pic_height);
+        src1 = lc->edge_emu_buffer2 + buf_offset;
+        src1stride = edge_emu_stride;
+    }
+
+    s->hevcdsp.put_hevc_qpel[idx][!!my0][!!mx0](lc->tmp, src0, src0stride, /* ref0 result goes to lc->tmp, not to dst */
+                                                block_h, mx0, my0, block_w);
+    if (!weight_flag) /* unweighted path: the _bi call receives src1 together with lc->tmp and writes dst */
+        s->hevcdsp.put_hevc_qpel_bi[idx][!!my1][!!mx1](dst, dststride, src1, src1stride, lc->tmp,
+                                                       block_h, mx1, my1, block_w);
+    else /* weighted path: per-list weights and offsets are taken from the slice header by ref_idx */
+        s->hevcdsp.put_hevc_qpel_bi_w[idx][!!my1][!!mx1](dst, dststride, src1, src1stride, lc->tmp,
+                                                         block_h, s->sh.luma_log2_weight_denom,
+                                                         s->sh.luma_weight_l0[current_mv->ref_idx[0]],
+                                                         s->sh.luma_weight_l1[current_mv->ref_idx[1]],
+                                                         s->sh.luma_offset_l0[current_mv->ref_idx[0]],
+                                                         s->sh.luma_offset_l1[current_mv->ref_idx[1]],
+                                                         mx1, my1, block_w);
+
+}
 
 /**
- * 8.5.3.2.2.2 Chroma sample interpolation process
+ * 8.5.3.2.2.2 Chroma sample uniprediction interpolation process
  *
  * @param s HEVC decoding context
  * @param dst1 target buffer for block data at block position (U plane)
@@ -1536,85 +1480,184 @@ static void luma_mc(HEVCContext *s, int16_t *dst, ptrdiff_t dststride,
  * @param y_off vertical position of block from origin (0, 0)
  * @param block_w width of block
  * @param block_h height of block
+ * @param chroma_weight weighting factor applied to the chroma prediction
+ * @param chroma_offset additive offset applied to the chroma prediction value
  */
-static void chroma_mc(HEVCContext *s, int16_t *dst1, int16_t *dst2,
-                      ptrdiff_t dststride, AVFrame *ref, const Mv *mv,
-                      int x_off, int y_off, int block_w, int block_h, int pred_idx)
+
+static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+                          ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist,
+                          int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
-    uint8_t *src1        = ref->data[1];
-    uint8_t *src2        = ref->data[2];
-    ptrdiff_t src1stride = ref->linesize[1];
-    ptrdiff_t src2stride = ref->linesize[2];
-    int pic_width        = s->ps.sps->width >> 1;
-    int pic_height       = s->ps.sps->height >> 1;
-
-    int mx = mv->x & 7;
-    int my = mv->y & 7;
-
-    x_off += mv->x >> 3;
-    y_off += mv->y >> 3;
-    src1  += y_off * src1stride + (x_off * (1 << s->ps.sps->pixel_shift));
-    src2  += y_off * src2stride + (x_off * (1 << s->ps.sps->pixel_shift));
+    HEVCLocalContext *lc = s->HEVClc;
+    int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
+    int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
+    const Mv *mv         = &current_mv->mv[reflist];
+    int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+                           (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+    int idx              = ff_hevc_pel_weight[block_w];
+    int hshift           = s->ps.sps->hshift[1];
+    int vshift           = s->ps.sps->vshift[1];
+    intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
+    intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
+    intptr_t _mx         = mx << (1 - hshift);
+    intptr_t _my         = my << (1 - vshift);
+
+    x_off += mv->x >> (2 + hshift);
+    y_off += mv->y >> (2 + vshift);
+    src0  += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
 
     if (x_off < EPEL_EXTRA_BEFORE || y_off < EPEL_EXTRA_AFTER ||
         x_off >= pic_width - block_w - EPEL_EXTRA_AFTER ||
         y_off >= pic_height - block_h - EPEL_EXTRA_AFTER) {
         const int edge_emu_stride = EDGE_EMU_BUFFER_STRIDE << s->ps.sps->pixel_shift;
+        int offset0 = EPEL_EXTRA_BEFORE * (srcstride + (1 << s->ps.sps->pixel_shift));
+        int buf_offset0 = EPEL_EXTRA_BEFORE *
+                          (edge_emu_stride + (1 << s->ps.sps->pixel_shift));
+        s->vdsp.emulated_edge_mc(lc->edge_emu_buffer, src0 - offset0,
+                                 edge_emu_stride, srcstride,
+                                 block_w + EPEL_EXTRA, block_h + EPEL_EXTRA,
+                                 x_off - EPEL_EXTRA_BEFORE,
+                                 y_off - EPEL_EXTRA_BEFORE,
+                                 pic_width, pic_height);
+
+        src0 = lc->edge_emu_buffer + buf_offset0;
+        srcstride = edge_emu_stride;
+    }
+    if (!weight_flag)
+        s->hevcdsp.put_hevc_epel_uni[idx][!!my][!!mx](dst0, dststride, src0, srcstride,
+                                                  block_h, _mx, _my, block_w);
+    else
+        s->hevcdsp.put_hevc_epel_uni_w[idx][!!my][!!mx](dst0, dststride, src0, srcstride,
+                                                        block_h, s->sh.chroma_log2_weight_denom,
+                                                        chroma_weight, chroma_offset, _mx, _my, block_w);
+}
+
+/**
+ * 8.5.3.2.2.2 Chroma sample bidirectional interpolation process
+ *
+ * @param s HEVC decoding context
+ * @param dst0 target buffer for block data at block position
+ * @param dststride stride of the dst buffer
+ * @param ref0 reference picture0 buffer at origin (0, 0)
+ *             (pixel data is fetched using current_mv->mv[0], relative to block position)
+ * @param x_off horizontal position of block from origin (0, 0)
+ * @param y_off vertical position of block from origin (0, 0)
+ * @param block_w width of block
+ * @param block_h height of block
+ * @param ref1 reference picture1 buffer at origin (0, 0)
+ *             (pixel data is fetched using current_mv->mv[1], relative to block position)
+ * @param current_mv current motion vector structure
+ * @param cidx chroma component index (0 = Cb, 1 = Cr); selects plane cidx + 1
+ */
+static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
+                         int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx)
+{
+    HEVCLocalContext *lc = s->HEVClc;
+    uint8_t *src1        = ref0->data[cidx+1];
+    uint8_t *src2        = ref1->data[cidx+1];
+    ptrdiff_t src1stride = ref0->linesize[cidx+1];
+    ptrdiff_t src2stride = ref1->linesize[cidx+1];
+    int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+                           (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+    int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
+    int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
+    Mv *mv0              = &current_mv->mv[0];
+    Mv *mv1              = &current_mv->mv[1];
+    int hshift = s->ps.sps->hshift[1];
+    int vshift = s->ps.sps->vshift[1];
+
+    intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift);
+    intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift);
+    intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift);
+    intptr_t my1 = av_mod_uintp2(mv1->y, 2 + vshift);
+    intptr_t _mx0 = mx0 << (1 - hshift);
+    intptr_t _my0 = my0 << (1 - vshift);
+    intptr_t _mx1 = mx1 << (1 - hshift);
+    intptr_t _my1 = my1 << (1 - vshift);
+
+    int x_off0 = x_off + (mv0->x >> (2 + hshift));
+    int y_off0 = y_off + (mv0->y >> (2 + vshift));
+    int x_off1 = x_off + (mv1->x >> (2 + hshift));
+    int y_off1 = y_off + (mv1->y >> (2 + vshift));
+    int idx = ff_hevc_pel_weight[block_w];
+    src1  += y_off0 * src1stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift);
+    src2  += y_off1 * src2stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift);
+
+    if (x_off0 < EPEL_EXTRA_BEFORE || y_off0 < EPEL_EXTRA_AFTER ||
+        x_off0 >= pic_width - block_w - EPEL_EXTRA_AFTER ||
+        y_off0 >= pic_height - block_h - EPEL_EXTRA_AFTER) {
+        const int edge_emu_stride = EDGE_EMU_BUFFER_STRIDE << s->ps.sps->pixel_shift;
         int offset1 = EPEL_EXTRA_BEFORE * (src1stride + (1 << s->ps.sps->pixel_shift));
         int buf_offset1 = EPEL_EXTRA_BEFORE *
                           (edge_emu_stride + (1 << s->ps.sps->pixel_shift));
-        int offset2 = EPEL_EXTRA_BEFORE * (src2stride + (1 << s->ps.sps->pixel_shift));
-        int buf_offset2 = EPEL_EXTRA_BEFORE *
-                          (edge_emu_stride + (1 << s->ps.sps->pixel_shift));
 
         s->vdsp.emulated_edge_mc(lc->edge_emu_buffer, src1 - offset1,
                                  edge_emu_stride, src1stride,
                                  block_w + EPEL_EXTRA, block_h + EPEL_EXTRA,
-                                 x_off - EPEL_EXTRA_BEFORE,
-                                 y_off - EPEL_EXTRA_BEFORE,
+                                 x_off0 - EPEL_EXTRA_BEFORE,
+                                 y_off0 - EPEL_EXTRA_BEFORE,
                                  pic_width, pic_height);
 
         src1 = lc->edge_emu_buffer + buf_offset1;
         src1stride = edge_emu_stride;
-        s->hevcdsp.put_hevc_epel[!!my][!!mx][pred_idx](dst1, dststride, src1, src1stride,
-                                                       block_h, mx, my, lc->mc_buffer);
+    }
+
+    if (x_off1 < EPEL_EXTRA_BEFORE || y_off1 < EPEL_EXTRA_AFTER ||
+        x_off1 >= pic_width - block_w - EPEL_EXTRA_AFTER ||
+        y_off1 >= pic_height - block_h - EPEL_EXTRA_AFTER) {
+        const int edge_emu_stride = EDGE_EMU_BUFFER_STRIDE << s->ps.sps->pixel_shift;
+        int offset1 = EPEL_EXTRA_BEFORE * (src2stride + (1 << s->ps.sps->pixel_shift));
+        int buf_offset1 = EPEL_EXTRA_BEFORE *
+                          (edge_emu_stride + (1 << s->ps.sps->pixel_shift));
 
-        s->vdsp.emulated_edge_mc(lc->edge_emu_buffer, src2 - offset2,
+        s->vdsp.emulated_edge_mc(lc->edge_emu_buffer2, src2 - offset1,
                                  edge_emu_stride, src2stride,
                                  block_w + EPEL_EXTRA, block_h + EPEL_EXTRA,
-                                 x_off - EPEL_EXTRA_BEFORE,
-                                 y_off - EPEL_EXTRA_BEFORE,
+                                 x_off1 - EPEL_EXTRA_BEFORE,
+                                 y_off1 - EPEL_EXTRA_BEFORE,
                                  pic_width, pic_height);
-        src2 = lc->edge_emu_buffer + buf_offset2;
-        src2stride = edge_emu_stride;
 
-        s->hevcdsp.put_hevc_epel[!!my][!!mx][pred_idx](dst2, dststride, src2, src2stride,
-                                                       block_h, mx, my, lc->mc_buffer);
-    } else {
-        s->hevcdsp.put_hevc_epel[!!my][!!mx][pred_idx](dst1, dststride, src1, src1stride,
-                                                       block_h, mx, my, lc->mc_buffer);
-        s->hevcdsp.put_hevc_epel[!!my][!!mx][pred_idx](dst2, dststride, src2, src2stride,
-                                                       block_h, mx, my, lc->mc_buffer);
+        src2 = lc->edge_emu_buffer2 + buf_offset1;
+        src2stride = edge_emu_stride;
     }
+
+    s->hevcdsp.put_hevc_epel[idx][!!my0][!!mx0](lc->tmp, src1, src1stride,
+                                                block_h, _mx0, _my0, block_w);
+    if (!weight_flag)
+        s->hevcdsp.put_hevc_epel_bi[idx][!!my1][!!mx1](dst0, s->frame->linesize[cidx+1],
+                                                       src2, src2stride, lc->tmp,
+                                                       block_h, _mx1, _my1, block_w);
+    else
+        s->hevcdsp.put_hevc_epel_bi_w[idx][!!my1][!!mx1](dst0, s->frame->linesize[cidx+1],
+                                                         src2, src2stride, lc->tmp,
+                                                         block_h,
+                                                         s->sh.chroma_log2_weight_denom,
+                                                         s->sh.chroma_weight_l0[current_mv->ref_idx[0]][cidx],
+                                                         s->sh.chroma_weight_l1[current_mv->ref_idx[1]][cidx],
+                                                         s->sh.chroma_offset_l0[current_mv->ref_idx[0]][cidx],
+                                                         s->sh.chroma_offset_l1[current_mv->ref_idx[1]][cidx],
+                                                         _mx1, _my1, block_w);
 }
 
 static void hevc_await_progress(HEVCContext *s, HEVCFrame *ref,
                                 const Mv *mv, int y0, int height)
 {
-    int y = (mv->y >> 2) + y0 + height + 9;
-    ff_thread_await_progress(&ref->tf, y, 0);
+    int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9);
+
+    if (s->threads_type == FF_THREAD_FRAME )
+        ff_thread_await_progress(&ref->tf, y, 0);
 }
 
-static void hevc_luma_mv_mpv_mode(HEVCContext *s, int x0, int y0, int nPbW,
+static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
                                   int nPbH, int log2_cb_size, int part_idx,
                                   int merge_idx, MvField *mv)
 {
-    HEVCLocalContext *lc             = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     enum InterPredIdc inter_pred_idc = PRED_L0;
     int mvp_flag;
 
     ff_hevc_set_neighbour_available(s, x0, y0, nPbW, nPbH);
+    mv->pred_flag = 0;
     if (s->sh.slice_type == B_SLICE)
         inter_pred_idc = ff_hevc_inter_pred_idc_decode(s, nPbW, nPbH);
 
@@ -1622,8 +1665,8 @@ static void hevc_luma_mv_mpv_mode(HEVCContext *s, int x0, int y0, int nPbW,
         if (s->sh.nb_refs[L0])
             mv->ref_idx[0]= ff_hevc_ref_idx_lx_decode(s, s->sh.nb_refs[L0]);
 
-        mv->pred_flag[0] = 1;
-        hls_mvd_coding(s, x0, y0, 0);
+        mv->pred_flag = PF_L0;
+        ff_hevc_hls_mvd_coding(s, x0, y0, 0);
         mvp_flag = ff_hevc_mvp_lx_flag_decode(s);
         ff_hevc_luma_mv_mvp_mode(s, x0, y0, nPbW, nPbH, log2_cb_size,
                                  part_idx, merge_idx, mv, mvp_flag, 0);
@@ -1638,10 +1681,10 @@ static void hevc_luma_mv_mpv_mode(HEVCContext *s, int x0, int y0, int nPbW,
         if (s->sh.mvd_l1_zero_flag == 1 && inter_pred_idc == PRED_BI) {
             AV_ZERO32(&lc->pu.mvd);
         } else {
-            hls_mvd_coding(s, x0, y0, 1);
+            ff_hevc_hls_mvd_coding(s, x0, y0, 1);
         }
 
-        mv->pred_flag[1] = 1;
+        mv->pred_flag += PF_L1;
         mvp_flag = ff_hevc_mvp_lx_flag_decode(s);
         ff_hevc_luma_mv_mvp_mode(s, x0, y0, nPbW, nPbH, log2_cb_size,
                                  part_idx, merge_idx, mv, mvp_flag, 1);
@@ -1652,17 +1695,12 @@ static void hevc_luma_mv_mpv_mode(HEVCContext *s, int x0, int y0, int nPbW,
 
 static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
                                 int nPbW, int nPbH,
-                                int log2_cb_size, int partIdx)
+                                int log2_cb_size, int partIdx, int idx)
 {
-    static const int pred_indices[] = {
-        [4] = 0, [8] = 1, [12] = 2, [16] = 3, [24] = 4, [32] = 5, [48] = 6, [64] = 7,
-    };
-    const int pred_idx = pred_indices[nPbW];
-
 #define POS(c_idx, x, y)                                                              \
     &s->frame->data[c_idx][((y) >> s->ps.sps->vshift[c_idx]) * s->frame->linesize[c_idx] + \
                            (((x) >> s->ps.sps->hshift[c_idx]) << s->ps.sps->pixel_shift)]
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     int merge_idx = 0;
     struct MvField current_mv = {{{ 0 }}};
 
@@ -1670,10 +1708,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
 
     MvField *tab_mvf = s->ref->tab_mvf;
     RefPicList  *refPicList = s->ref->refPicList;
-    HEVCFrame *ref0, *ref1;
-
-    int tmpstride = MAX_PB_SIZE * sizeof(int16_t);
-
+    HEVCFrame *ref0 = NULL, *ref1 = NULL;
     uint8_t *dst0 = POS(0, x0, y0);
     uint8_t *dst1 = POS(1, x0, y0);
     uint8_t *dst2 = POS(2, x0, y0);
@@ -1698,7 +1733,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
         ff_hevc_luma_mv_merge_mode(s, x0, y0, nPbW, nPbH, log2_cb_size,
                                    partIdx, merge_idx, &current_mv);
     } else {
-        hevc_luma_mv_mpv_mode(s, x0, y0, nPbW, nPbH, log2_cb_size,
+        hevc_luma_mv_mvp_mode(s, x0, y0, nPbW, nPbH, log2_cb_size,
                               partIdx, merge_idx, &current_mv);
     }
 
@@ -1709,139 +1744,74 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
         for (i = 0; i < nPbW >> s->ps.sps->log2_min_pu_size; i++)
             tab_mvf[(y_pu + j) * min_pu_width + x_pu + i] = current_mv;
 
-    if (current_mv.pred_flag[0]) {
+    if (current_mv.pred_flag & PF_L0) {
         ref0 = refPicList[0].ref[current_mv.ref_idx[0]];
         if (!ref0)
             return;
         hevc_await_progress(s, ref0, &current_mv.mv[0], y0, nPbH);
     }
-    if (current_mv.pred_flag[1]) {
+    if (current_mv.pred_flag & PF_L1) {
         ref1 = refPicList[1].ref[current_mv.ref_idx[1]];
         if (!ref1)
             return;
         hevc_await_progress(s, ref1, &current_mv.mv[1], y0, nPbH);
     }
 
-    if (current_mv.pred_flag[0] && !current_mv.pred_flag[1]) {
-        DECLARE_ALIGNED(16, int16_t,  tmp[MAX_PB_SIZE * MAX_PB_SIZE]);
-        DECLARE_ALIGNED(16, int16_t, tmp2[MAX_PB_SIZE * MAX_PB_SIZE]);
+    if (current_mv.pred_flag == PF_L0) {
+        int x0_c = x0 >> s->ps.sps->hshift[1];
+        int y0_c = y0 >> s->ps.sps->vshift[1];
+        int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+        int nPbH_c = nPbH >> s->ps.sps->vshift[1];
 
-        luma_mc(s, tmp, tmpstride, ref0->frame,
-                &current_mv.mv[0], x0, y0, nPbW, nPbH, pred_idx);
+        luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame,
+                    &current_mv.mv[0], x0, y0, nPbW, nPbH,
+                    s->sh.luma_weight_l0[current_mv.ref_idx[0]],
+                    s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
 
-        if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-            (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) {
-            s->hevcdsp.weighted_pred[pred_idx](s->sh.luma_log2_weight_denom,
-                                               s->sh.luma_weight_l0[current_mv.ref_idx[0]],
-                                               s->sh.luma_offset_l0[current_mv.ref_idx[0]],
-                                               dst0, s->frame->linesize[0], tmp,
-                                               tmpstride, nPbH);
-        } else {
-            s->hevcdsp.put_unweighted_pred[pred_idx](dst0, s->frame->linesize[0], tmp, tmpstride, nPbH);
-        }
-        chroma_mc(s, tmp, tmp2, tmpstride, ref0->frame,
-                  &current_mv.mv[0], x0 / 2, y0 / 2, nPbW / 2, nPbH / 2, pred_idx);
-
-        if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-            (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) {
-            s->hevcdsp.weighted_pred_chroma[pred_idx](s->sh.chroma_log2_weight_denom,
-                                                      s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0],
-                                                      s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0],
-                                                      dst1, s->frame->linesize[1], tmp, tmpstride,
-                                                      nPbH / 2);
-            s->hevcdsp.weighted_pred_chroma[pred_idx](s->sh.chroma_log2_weight_denom,
-                                                      s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1],
-                                                      s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1],
-                                                      dst2, s->frame->linesize[2], tmp2, tmpstride,
-                                                      nPbH / 2);
-        } else {
-            s->hevcdsp.put_unweighted_pred_chroma[pred_idx](dst1, s->frame->linesize[1], tmp,  tmpstride, nPbH / 2);
-            s->hevcdsp.put_unweighted_pred_chroma[pred_idx](dst2, s->frame->linesize[2], tmp2, tmpstride, nPbH / 2);
-        }
-    } else if (!current_mv.pred_flag[0] && current_mv.pred_flag[1]) {
-        DECLARE_ALIGNED(16, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]);
-        DECLARE_ALIGNED(16, int16_t, tmp2[MAX_PB_SIZE * MAX_PB_SIZE]);
-
-        luma_mc(s, tmp, tmpstride, ref1->frame,
-                &current_mv.mv[1], x0, y0, nPbW, nPbH, pred_idx);
-
-        if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-            (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) {
-            s->hevcdsp.weighted_pred[pred_idx](s->sh.luma_log2_weight_denom,
-                                               s->sh.luma_weight_l1[current_mv.ref_idx[1]],
-                                               s->sh.luma_offset_l1[current_mv.ref_idx[1]],
-                                               dst0, s->frame->linesize[0], tmp, tmpstride,
-                                               nPbH);
-        } else {
-            s->hevcdsp.put_unweighted_pred[pred_idx](dst0, s->frame->linesize[0], tmp, tmpstride, nPbH);
-        }
-
-        chroma_mc(s, tmp, tmp2, tmpstride, ref1->frame,
-                  &current_mv.mv[1], x0 / 2, y0 / 2, nPbW / 2, nPbH / 2, pred_idx);
-
-        if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-            (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) {
-            s->hevcdsp.weighted_pred_chroma[pred_idx](s->sh.chroma_log2_weight_denom,
-                                                      s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0],
-                                                      s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0],
-                                                      dst1, s->frame->linesize[1], tmp, tmpstride, nPbH/2);
-            s->hevcdsp.weighted_pred_chroma[pred_idx](s->sh.chroma_log2_weight_denom,
-                                                      s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1],
-                                                      s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1],
-                                                      dst2, s->frame->linesize[2], tmp2, tmpstride, nPbH/2);
-        } else {
-            s->hevcdsp.put_unweighted_pred_chroma[pred_idx](dst1, s->frame->linesize[1], tmp,  tmpstride, nPbH / 2);
-            s->hevcdsp.put_unweighted_pred_chroma[pred_idx](dst2, s->frame->linesize[2], tmp2, tmpstride, nPbH / 2);
-        }
-    } else if (current_mv.pred_flag[0] && current_mv.pred_flag[1]) {
-        DECLARE_ALIGNED(16, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]);
-        DECLARE_ALIGNED(16, int16_t, tmp2[MAX_PB_SIZE * MAX_PB_SIZE]);
-        DECLARE_ALIGNED(16, int16_t, tmp3[MAX_PB_SIZE * MAX_PB_SIZE]);
-        DECLARE_ALIGNED(16, int16_t, tmp4[MAX_PB_SIZE * MAX_PB_SIZE]);
-
-        luma_mc(s, tmp, tmpstride, ref0->frame,
-                &current_mv.mv[0], x0, y0, nPbW, nPbH, pred_idx);
-        luma_mc(s, tmp2, tmpstride, ref1->frame,
-                &current_mv.mv[1], x0, y0, nPbW, nPbH, pred_idx);
-
-        if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-            (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) {
-            s->hevcdsp.weighted_pred_avg[pred_idx](s->sh.luma_log2_weight_denom,
-                                                   s->sh.luma_weight_l0[current_mv.ref_idx[0]],
-                                                   s->sh.luma_weight_l1[current_mv.ref_idx[1]],
-                                                   s->sh.luma_offset_l0[current_mv.ref_idx[0]],
-                                                   s->sh.luma_offset_l1[current_mv.ref_idx[1]],
-                                                   dst0, s->frame->linesize[0],
-                                                   tmp, tmp2, tmpstride, nPbH);
-        } else {
-            s->hevcdsp.put_unweighted_pred_avg[pred_idx](dst0, s->frame->linesize[0],
-                                                         tmp, tmp2, tmpstride, nPbH);
-        }
-
-        chroma_mc(s, tmp, tmp2, tmpstride, ref0->frame,
-                  &current_mv.mv[0], x0 / 2, y0 / 2, nPbW / 2, nPbH / 2, pred_idx);
-        chroma_mc(s, tmp3, tmp4, tmpstride, ref1->frame,
-                  &current_mv.mv[1], x0 / 2, y0 / 2, nPbW / 2, nPbH / 2, pred_idx);
-
-        if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-            (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) {
-            s->hevcdsp.weighted_pred_avg_chroma[pred_idx](s->sh.chroma_log2_weight_denom,
-                                                          s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0],
-                                                          s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0],
-                                                          s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0],
-                                                          s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0],
-                                                          dst1, s->frame->linesize[1], tmp, tmp3,
-                                                          tmpstride, nPbH / 2);
-            s->hevcdsp.weighted_pred_avg_chroma[pred_idx](s->sh.chroma_log2_weight_denom,
-                                                          s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1],
-                                                          s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1],
-                                                          s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1],
-                                                          s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1],
-                                                          dst2, s->frame->linesize[2], tmp2, tmp4,
-                                                          tmpstride, nPbH / 2);
-        } else {
-            s->hevcdsp.put_unweighted_pred_avg_chroma[pred_idx](dst1, s->frame->linesize[1], tmp, tmp3,  tmpstride, nPbH/2);
-            s->hevcdsp.put_unweighted_pred_avg_chroma[pred_idx](dst2, s->frame->linesize[2], tmp2, tmp4, tmpstride, nPbH/2);
+        if (s->ps.sps->chroma_format_idc) {
+            chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
+                          0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+                          s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]);
+            chroma_mc_uni(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
+                          0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+                          s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1]);
+        }
+    } else if (current_mv.pred_flag == PF_L1) {
+        int x0_c = x0 >> s->ps.sps->hshift[1];
+        int y0_c = y0 >> s->ps.sps->vshift[1];
+        int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+        int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+
+        luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame,
+                    &current_mv.mv[1], x0, y0, nPbW, nPbH,
+                    s->sh.luma_weight_l1[current_mv.ref_idx[1]],
+                    s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
+
+        if (s->ps.sps->chroma_format_idc) {
+            chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
+                          1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+                          s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]);
+
+            chroma_mc_uni(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
+                          1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+                          s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1]);
+        }
+    } else if (current_mv.pred_flag == PF_BI) {
+        int x0_c = x0 >> s->ps.sps->hshift[1];
+        int y0_c = y0 >> s->ps.sps->vshift[1];
+        int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+        int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+
+        luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame,
+                   &current_mv.mv[0], x0, y0, nPbW, nPbH,
+                   ref1->frame, &current_mv.mv[1], &current_mv);
+
+        if (s->ps.sps->chroma_format_idc) {
+            chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
+                         x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 0);
+
+            chroma_mc_bi(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
+                         x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 1);
         }
     }
 }
@@ -1852,13 +1822,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
 static int luma_intra_pred_mode(HEVCContext *s, int x0, int y0, int pu_size,
                                 int prev_intra_luma_pred_flag)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     int x_pu             = x0 >> s->ps.sps->log2_min_pu_size;
     int y_pu             = y0 >> s->ps.sps->log2_min_pu_size;
     int min_pu_width     = s->ps.sps->min_pu_width;
     int size_in_pus      = pu_size >> s->ps.sps->log2_min_pu_size;
-    int x0b              = x0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
-    int y0b              = y0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
+    int x0b              = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size);
+    int y0b              = av_mod_uintp2(y0, s->ps.sps->log2_ctb_size);
 
     int cand_up   = (lc->ctb_up_flag || y0b) ?
                     s->tab_ipm[(y_pu - 1) * min_pu_width + x_pu] : INTRA_DC;
@@ -1922,15 +1892,7 @@ static int luma_intra_pred_mode(HEVCContext *s, int x0, int y0, int pu_size,
                intra_pred_mode, size_in_pus);
 
         for (j = 0; j < size_in_pus; j++) {
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].is_intra     = 1;
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].pred_flag[0] = 0;
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].pred_flag[1] = 0;
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].ref_idx[0]   = 0;
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].ref_idx[1]   = 0;
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].mv[0].x      = 0;
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].mv[0].y      = 0;
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].mv[1].x      = 0;
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].mv[1].y      = 0;
+            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].pred_flag = PF_INTRA;
         }
     }
 
@@ -1950,10 +1912,14 @@ static av_always_inline void set_ct_depth(HEVCContext *s, int x0, int y0,
                ct_depth, length);
 }
 
+static const uint8_t tab_mode_idx[] = {
+     0,  1,  2,  2,  2,  2,  3,  5,  7,  8, 10, 12, 13, 15, 17, 18, 19, 20,
+    21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 31};
+
 static void intra_prediction_unit(HEVCContext *s, int x0, int y0,
                                   int log2_cb_size)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     static const uint8_t intra_chroma_table[4] = { 0, 26, 10, 1 };
     uint8_t prev_intra_luma_pred_flag[4];
     int split   = lc->cu.part_mode == PART_NxN;
@@ -1979,14 +1945,42 @@ static void intra_prediction_unit(HEVCContext *s, int x0, int y0,
         }
     }
 
-    chroma_mode = ff_hevc_intra_chroma_pred_mode_decode(s);
-    if (chroma_mode != 4) {
-        if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode])
-            lc->pu.intra_pred_mode_c = 34;
-        else
-            lc->pu.intra_pred_mode_c = intra_chroma_table[chroma_mode];
-    } else {
-        lc->pu.intra_pred_mode_c = lc->pu.intra_pred_mode[0];
+    if (s->ps.sps->chroma_format_idc == 3) {
+        for (i = 0; i < side; i++) {
+            for (j = 0; j < side; j++) {
+                lc->pu.chroma_mode_c[2 * i + j] = chroma_mode = ff_hevc_intra_chroma_pred_mode_decode(s);
+                if (chroma_mode != 4) {
+                    if (lc->pu.intra_pred_mode[2 * i + j] == intra_chroma_table[chroma_mode])
+                        lc->pu.intra_pred_mode_c[2 * i + j] = 34;
+                    else
+                        lc->pu.intra_pred_mode_c[2 * i + j] = intra_chroma_table[chroma_mode];
+                } else {
+                    lc->pu.intra_pred_mode_c[2 * i + j] = lc->pu.intra_pred_mode[2 * i + j];
+                }
+            }
+        }
+    } else if (s->ps.sps->chroma_format_idc == 2) {
+        int mode_idx;
+        lc->pu.chroma_mode_c[0] = chroma_mode = ff_hevc_intra_chroma_pred_mode_decode(s);
+        if (chroma_mode != 4) {
+            if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode])
+                mode_idx = 34;
+            else
+                mode_idx = intra_chroma_table[chroma_mode];
+        } else {
+            mode_idx = lc->pu.intra_pred_mode[0];
+        }
+        lc->pu.intra_pred_mode_c[0] = tab_mode_idx[mode_idx];
+    } else if (s->ps.sps->chroma_format_idc != 0) {
+        chroma_mode = ff_hevc_intra_chroma_pred_mode_decode(s);
+        if (chroma_mode != 4) {
+            if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode])
+                lc->pu.intra_pred_mode_c[0] = 34;
+            else
+                lc->pu.intra_pred_mode_c[0] = intra_chroma_table[chroma_mode];
+        } else {
+            lc->pu.intra_pred_mode_c[0] = lc->pu.intra_pred_mode[0];
+        }
     }
 }
 
@@ -1994,7 +1988,7 @@ static void intra_prediction_unit_default_value(HEVCContext *s,
                                                 int x0, int y0,
                                                 int log2_cb_size)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     int pb_size          = 1 << log2_cb_size;
     int size_in_pus      = pb_size >> s->ps.sps->log2_min_pu_size;
     int min_pu_width     = s->ps.sps->min_pu_width;
@@ -2005,22 +1999,25 @@ static void intra_prediction_unit_default_value(HEVCContext *s,
 
     if (size_in_pus == 0)
         size_in_pus = 1;
-    for (j = 0; j < size_in_pus; j++) {
+    for (j = 0; j < size_in_pus; j++)
         memset(&s->tab_ipm[(y_pu + j) * min_pu_width + x_pu], INTRA_DC, size_in_pus);
-        for (k = 0; k < size_in_pus; k++)
-            tab_mvf[(y_pu + j) * min_pu_width + x_pu + k].is_intra = lc->cu.pred_mode == MODE_INTRA;
-    }
+    if (lc->cu.pred_mode == MODE_INTRA)
+        for (j = 0; j < size_in_pus; j++)
+            for (k = 0; k < size_in_pus; k++)
+                tab_mvf[(y_pu + j) * min_pu_width + x_pu + k].pred_flag = PF_INTRA;
 }
 
 static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
 {
     int cb_size          = 1 << log2_cb_size;
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     int log2_min_cb_size = s->ps.sps->log2_min_cb_size;
     int length           = cb_size >> log2_min_cb_size;
     int min_cb_width     = s->ps.sps->min_cb_width;
     int x_cb             = x0 >> log2_min_cb_size;
     int y_cb             = y0 >> log2_min_cb_size;
+    int idx              = log2_cb_size - 2;
+    int qp_block_mask    = (1<<(s->ps.sps->log2_ctb_size - s->ps.pps->diff_cu_qp_delta_depth)) - 1;
     int x, y, ret;
 
     lc->cu.x                = x0;
@@ -2048,10 +2045,16 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
             x += min_cb_width;
         }
         lc->cu.pred_mode = skip_flag ? MODE_SKIP : MODE_INTER;
+    } else {
+        x = y_cb * min_cb_width + x_cb;
+        for (y = 0; y < length; y++) {
+            memset(&s->skip_flag[x], 0, length);
+            x += min_cb_width;
+        }
     }
 
     if (SAMPLE_CTB(s->skip_flag, x_cb, y_cb)) {
-        hls_prediction_unit(s, x0, y0, cb_size, cb_size, log2_cb_size, 0);
+        hls_prediction_unit(s, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx);
         intra_prediction_unit_default_value(s, x0, y0, log2_cb_size);
 
         if (!s->sh.disable_deblocking_filter_flag)
@@ -2089,37 +2092,37 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
             intra_prediction_unit_default_value(s, x0, y0, log2_cb_size);
             switch (lc->cu.part_mode) {
             case PART_2Nx2N:
-                hls_prediction_unit(s, x0, y0, cb_size, cb_size, log2_cb_size, 0);
+                hls_prediction_unit(s, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx);
                 break;
             case PART_2NxN:
-                hls_prediction_unit(s, x0, y0,               cb_size, cb_size / 2, log2_cb_size, 0);
-                hls_prediction_unit(s, x0, y0 + cb_size / 2, cb_size, cb_size / 2, log2_cb_size, 1);
+                hls_prediction_unit(s, x0, y0,               cb_size, cb_size / 2, log2_cb_size, 0, idx);
+                hls_prediction_unit(s, x0, y0 + cb_size / 2, cb_size, cb_size / 2, log2_cb_size, 1, idx);
                 break;
             case PART_Nx2N:
-                hls_prediction_unit(s, x0,               y0, cb_size / 2, cb_size, log2_cb_size, 0);
-                hls_prediction_unit(s, x0 + cb_size / 2, y0, cb_size / 2, cb_size, log2_cb_size, 1);
+                hls_prediction_unit(s, x0,               y0, cb_size / 2, cb_size, log2_cb_size, 0, idx - 1);
+                hls_prediction_unit(s, x0 + cb_size / 2, y0, cb_size / 2, cb_size, log2_cb_size, 1, idx - 1);
                 break;
             case PART_2NxnU:
-                hls_prediction_unit(s, x0, y0,               cb_size, cb_size     / 4, log2_cb_size, 0);
-                hls_prediction_unit(s, x0, y0 + cb_size / 4, cb_size, cb_size * 3 / 4, log2_cb_size, 1);
+                hls_prediction_unit(s, x0, y0,               cb_size, cb_size     / 4, log2_cb_size, 0, idx);
+                hls_prediction_unit(s, x0, y0 + cb_size / 4, cb_size, cb_size * 3 / 4, log2_cb_size, 1, idx);
                 break;
             case PART_2NxnD:
-                hls_prediction_unit(s, x0, y0,                   cb_size, cb_size * 3 / 4, log2_cb_size, 0);
-                hls_prediction_unit(s, x0, y0 + cb_size * 3 / 4, cb_size, cb_size     / 4, log2_cb_size, 1);
+                hls_prediction_unit(s, x0, y0,                   cb_size, cb_size * 3 / 4, log2_cb_size, 0, idx);
+                hls_prediction_unit(s, x0, y0 + cb_size * 3 / 4, cb_size, cb_size     / 4, log2_cb_size, 1, idx);
                 break;
             case PART_nLx2N:
-                hls_prediction_unit(s, x0,               y0, cb_size     / 4, cb_size, log2_cb_size, 0);
-                hls_prediction_unit(s, x0 + cb_size / 4, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 1);
+                hls_prediction_unit(s, x0,               y0, cb_size     / 4, cb_size, log2_cb_size, 0, idx - 2);
+                hls_prediction_unit(s, x0 + cb_size / 4, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 1, idx - 2);
                 break;
             case PART_nRx2N:
-                hls_prediction_unit(s, x0,                   y0, cb_size * 3 / 4, cb_size, log2_cb_size, 0);
-                hls_prediction_unit(s, x0 + cb_size * 3 / 4, y0, cb_size     / 4, cb_size, log2_cb_size, 1);
+                hls_prediction_unit(s, x0,                   y0, cb_size * 3 / 4, cb_size, log2_cb_size, 0, idx - 2);
+                hls_prediction_unit(s, x0 + cb_size * 3 / 4, y0, cb_size     / 4, cb_size, log2_cb_size, 1, idx - 2);
                 break;
             case PART_NxN:
-                hls_prediction_unit(s, x0,               y0,               cb_size / 2, cb_size / 2, log2_cb_size, 0);
-                hls_prediction_unit(s, x0 + cb_size / 2, y0,               cb_size / 2, cb_size / 2, log2_cb_size, 1);
-                hls_prediction_unit(s, x0,               y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 2);
-                hls_prediction_unit(s, x0 + cb_size / 2, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 3);
+                hls_prediction_unit(s, x0,               y0,               cb_size / 2, cb_size / 2, log2_cb_size, 0, idx - 1);
+                hls_prediction_unit(s, x0 + cb_size / 2, y0,               cb_size / 2, cb_size / 2, log2_cb_size, 1, idx - 1);
+                hls_prediction_unit(s, x0,               y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 2, idx - 1);
+                hls_prediction_unit(s, x0 + cb_size / 2, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 3, idx - 1);
                 break;
             }
         }
@@ -2132,12 +2135,13 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
                 rqt_root_cbf = ff_hevc_no_residual_syntax_flag_decode(s);
             }
             if (rqt_root_cbf) {
+                const static int cbf[2] = { 0 };
                 lc->cu.max_trafo_depth = lc->cu.pred_mode == MODE_INTRA ?
                                          s->ps.sps->max_transform_hierarchy_depth_intra + lc->cu.intra_split_flag :
                                          s->ps.sps->max_transform_hierarchy_depth_inter;
                 ret = hls_transform_tree(s, x0, y0, x0, y0, x0, y0,
                                          log2_cb_size,
-                                         log2_cb_size, 0, 0, 0, 0);
+                                         log2_cb_size, 0, 0, cbf, cbf);
                 if (ret < 0)
                     return ret;
             } else {
@@ -2148,7 +2152,7 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
     }
 
     if (s->ps.pps->cu_qp_delta_enabled_flag && lc->tu.is_cu_qp_delta_coded == 0)
-        ff_hevc_set_qPy(s, x0, y0, x0, y0, log2_cb_size);
+        ff_hevc_set_qPy(s, x0, y0, log2_cb_size);
 
     x = y_cb * min_cb_width + x_cb;
     for (y = 0; y < length; y++) {
@@ -2156,7 +2160,12 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
         x += min_cb_width;
     }
 
-    set_ct_depth(s, x0, y0, log2_cb_size, lc->ct.depth);
+    if(((x0 + (1<<log2_cb_size)) & qp_block_mask) == 0 &&
+       ((y0 + (1<<log2_cb_size)) & qp_block_mask) == 0) {
+        lc->qPy_pred = lc->qp_y;
+    }
+
+    set_ct_depth(s, x0, y0, log2_cb_size, lc->ct_depth);
 
     return 0;
 }
@@ -2164,11 +2173,12 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
 static int hls_coding_quadtree(HEVCContext *s, int x0, int y0,
                                int log2_cb_size, int cb_depth)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     const int cb_size    = 1 << log2_cb_size;
+    int ret;
     int split_cu;
 
-    lc->ct.depth = cb_depth;
+    lc->ct_depth = cb_depth;
     if (x0 + cb_size <= s->ps.sps->width  &&
         y0 + cb_size <= s->ps.sps->height &&
         log2_cb_size > s->ps.sps->log2_min_cb_size) {
@@ -2182,31 +2192,64 @@ static int hls_coding_quadtree(HEVCContext *s, int x0, int y0,
         lc->tu.cu_qp_delta          = 0;
     }
 
+    if (s->sh.cu_chroma_qp_offset_enabled_flag &&
+        log2_cb_size >= s->ps.sps->log2_ctb_size - s->ps.pps->diff_cu_chroma_qp_offset_depth) {
+        lc->tu.is_cu_chroma_qp_offset_coded = 0;
+    }
+
     if (split_cu) {
+        int qp_block_mask = (1<<(s->ps.sps->log2_ctb_size - s->ps.pps->diff_cu_qp_delta_depth)) - 1;
         const int cb_size_split = cb_size >> 1;
         const int x1 = x0 + cb_size_split;
         const int y1 = y0 + cb_size_split;
 
-        log2_cb_size--;
-        cb_depth++;
+        int more_data = 0;
 
-#define SUBDIVIDE(x, y)                                                \
-do {                                                                   \
-    if (x < s->ps.sps->width && y < s->ps.sps->height) {                     \
-        int ret = hls_coding_quadtree(s, x, y, log2_cb_size, cb_depth);\
-        if (ret < 0)                                                   \
-            return ret;                                                \
-    }                                                                  \
-} while (0)
+        more_data = hls_coding_quadtree(s, x0, y0, log2_cb_size - 1, cb_depth + 1);
+        if (more_data < 0)
+            return more_data;
+
+        if (more_data && x1 < s->ps.sps->width) {
+            more_data = hls_coding_quadtree(s, x1, y0, log2_cb_size - 1, cb_depth + 1);
+            if (more_data < 0)
+                return more_data;
+        }
+        if (more_data && y1 < s->ps.sps->height) {
+            more_data = hls_coding_quadtree(s, x0, y1, log2_cb_size - 1, cb_depth + 1);
+            if (more_data < 0)
+                return more_data;
+        }
+        if (more_data && x1 < s->ps.sps->width &&
+            y1 < s->ps.sps->height) {
+            more_data = hls_coding_quadtree(s, x1, y1, log2_cb_size - 1, cb_depth + 1);
+            if (more_data < 0)
+                return more_data;
+        }
 
-        SUBDIVIDE(x0, y0);
-        SUBDIVIDE(x1, y0);
-        SUBDIVIDE(x0, y1);
-        SUBDIVIDE(x1, y1);
+        if(((x0 + (1<<log2_cb_size)) & qp_block_mask) == 0 &&
+            ((y0 + (1<<log2_cb_size)) & qp_block_mask) == 0)
+            lc->qPy_pred = lc->qp_y;
+
+        if (more_data)
+            return ((x1 + cb_size_split) < s->ps.sps->width ||
+                    (y1 + cb_size_split) < s->ps.sps->height);
+        else
+            return 0;
     } else {
-        int ret = hls_coding_unit(s, x0, y0, log2_cb_size);
+        ret = hls_coding_unit(s, x0, y0, log2_cb_size);
         if (ret < 0)
             return ret;
+        if ((!((x0 + cb_size) %
+               (1 << (s->ps.sps->log2_ctb_size))) ||
+             (x0 + cb_size >= s->ps.sps->width)) &&
+            (!((y0 + cb_size) %
+               (1 << (s->ps.sps->log2_ctb_size))) ||
+             (y0 + cb_size >= s->ps.sps->height))) {
+            int end_of_slice_flag = ff_hevc_end_of_slice_flag_decode(s);
+            return !end_of_slice_flag;
+        } else {
+            return 1;
+        }
     }
 
     return 0;
@@ -2215,7 +2258,7 @@ do {                                                                   \
 static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
                                  int ctb_addr_ts)
 {
-    HEVCLocalContext *lc  = &s->HEVClc;
+    HEVCLocalContext *lc  = s->HEVClc;
     int ctb_size          = 1 << s->ps.sps->log2_ctb_size;
     int ctb_addr_rs       = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
     int ctb_addr_in_slice = ctb_addr_rs - s->sh.slice_addr;
@@ -2229,7 +2272,6 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
     } else if (s->ps.pps->tiles_enabled_flag) {
         if (ctb_addr_ts && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[ctb_addr_ts - 1]) {
             int idxX = s->ps.pps->col_idxX[x_ctb >> s->ps.sps->log2_ctb_size];
-            lc->start_of_tiles_x = x_ctb;
             lc->end_of_tiles_x   = x_ctb + (s->ps.pps->column_width[idxX] << s->ps.sps->log2_ctb_size);
             lc->first_qp_group   = 1;
         }
@@ -2250,7 +2292,7 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
         if (y_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - s->ps.sps->ctb_width])
             lc->boundary_flags |= BOUNDARY_UPPER_SLICE;
     } else {
-        if (!ctb_addr_in_slice)
+        if (ctb_addr_in_slice <= 0)
             lc->boundary_flags |= BOUNDARY_LEFT_SLICE;
         if (ctb_addr_in_slice < s->ps.sps->ctb_width)
             lc->boundary_flags |= BOUNDARY_UPPER_SLICE;
@@ -2262,14 +2304,27 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
     lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0)  && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]]));
 }
 
-static int hls_slice_data(HEVCContext *s)
+static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
 {
+    HEVCContext *s  = avctxt->priv_data;
     int ctb_size    = 1 << s->ps.sps->log2_ctb_size;
     int more_data   = 1;
     int x_ctb       = 0;
     int y_ctb       = 0;
     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
-    int ret;
+
+    if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
+        av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (s->sh.dependent_slice_segment_flag) {
+        int prev_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts - 1];
+        if (s->tab_slice_address[prev_rs] != s->sh.slice_addr) {
+            av_log(s->avctx, AV_LOG_ERROR, "Previous slice segment missing\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
 
     while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
         int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
@@ -2286,10 +2341,12 @@ static int hls_slice_data(HEVCContext *s)
         s->deblock[ctb_addr_rs].tc_offset   = s->sh.tc_offset;
         s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
 
-        ret = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
-        if (ret < 0)
-            return ret;
-        more_data = !ff_hevc_end_of_slice_flag_decode(s);
+        more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+        if (more_data < 0) {
+            s->tab_slice_address[ctb_addr_rs] = -1;
+            return more_data;
+        }
+
 
         ctb_addr_ts++;
         ff_hevc_save_states(s, ctb_addr_ts);
@@ -2298,36 +2355,231 @@ static int hls_slice_data(HEVCContext *s)
 
     if (x_ctb + ctb_size >= s->ps.sps->width &&
         y_ctb + ctb_size >= s->ps.sps->height)
-        ff_hevc_hls_filter(s, x_ctb, y_ctb);
+        ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size);
 
     return ctb_addr_ts;
 }
 
-static void restore_tqb_pixels(HEVCContext *s)
+/**
+ * Decode the slice data as a single job: hls_decode_entry() is run once
+ * through avctx->execute() and ret[0] is returned.
+ */
+static int hls_slice_data(HEVCContext *s)
 {
-    int min_pu_size = 1 << s->ps.sps->log2_min_pu_size;
-    int x, y, c_idx;
-
-    for (c_idx = 0; c_idx < 3; c_idx++) {
-        ptrdiff_t stride = s->frame->linesize[c_idx];
-        int hshift       = s->ps.sps->hshift[c_idx];
-        int vshift       = s->ps.sps->vshift[c_idx];
-        for (y = 0; y < s->ps.sps->min_pu_height; y++) {
-            for (x = 0; x < s->ps.sps->min_pu_width; x++) {
-                if (s->is_pcm[y * s->ps.sps->min_pu_width + x]) {
-                    int n;
-                    int len      = min_pu_size >> hshift;
-                    uint8_t *src = &s->frame->data[c_idx][((y << s->ps.sps->log2_min_pu_size) >> vshift) * stride + (((x << s->ps.sps->log2_min_pu_size) >> hshift) << s->ps.sps->pixel_shift)];
-                    uint8_t *dst = &s->sao_frame->data[c_idx][((y << s->ps.sps->log2_min_pu_size) >> vshift) * stride + (((x << s->ps.sps->log2_min_pu_size) >> hshift) << s->ps.sps->pixel_shift)];
-                    for (n = 0; n < (min_pu_size >> vshift); n++) {
-                        memcpy(dst, src, len);
-                        src += stride;
-                        dst += stride;
-                    }
-                }
+    int arg[2];
+    int ret[2];
+
+    arg[0] = 0;
+    arg[1] = 1;
+
+    s->avctx->execute(s->avctx, hls_decode_entry, arg, ret , 1, sizeof(int));
+    return ret[0];
+}
+
+/**
+ * execute2() job that decodes the CTB row input_ctb_row[job] using the worker
+ * context sList[self_id]. For every row but the first, the bit reader and the
+ * CABAC decoder are re-initialised from s->sh.offset[]/s->sh.size[].
+ *
+ * @return ctb_addr_ts after the last CTB of the picture, a negative error
+ *         code on failure, 0 otherwise
+ */
+static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int job, int self_id)
+{
+    HEVCContext *s1  = avctxt->priv_data, *s;
+    HEVCLocalContext *lc;
+    int ctb_size    = 1<< s1->ps.sps->log2_ctb_size;
+    int more_data   = 1;
+    int *ctb_row_p    = input_ctb_row;
+    int ctb_row = ctb_row_p[job];
+    int ctb_addr_rs = s1->sh.slice_ctb_addr_rs + ctb_row * ((s1->ps.sps->width + ctb_size - 1) >> s1->ps.sps->log2_ctb_size);
+    int ctb_addr_ts = s1->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs];
+    int thread = ctb_row % s1->threads_number;
+    int ret;
+
+    s = s1->sList[self_id];
+    lc = s->HEVClc;
+
+    if(ctb_row) {
+        ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]);
+
+        if (ret < 0)
+            return ret;
+        ff_init_cabac_decoder(&lc->cc, s->data + s->sh.offset[(ctb_row)-1], s->sh.size[ctb_row - 1]);
+    }
+
+    while(more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
+        int x_ctb = (ctb_addr_rs % s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size;
+        int y_ctb = (ctb_addr_rs / s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size;
+
+        hls_decode_neighbour(s, x_ctb, y_ctb, ctb_addr_ts);
+
+        ff_thread_await_progress2(s->avctx, ctb_row, thread, SHIFT_CTB_WPP);
+
+        if (avpriv_atomic_int_get(&s1->wpp_err)){
+            ff_thread_report_progress2(s->avctx, ctb_row , thread, SHIFT_CTB_WPP);
+            return 0;
+        }
+
+        ff_hevc_cabac_init(s, ctb_addr_ts);
+        hls_sao_param(s, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size);
+        more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+
+        if (more_data < 0) {
+            s->tab_slice_address[ctb_addr_rs] = -1;
+            avpriv_atomic_int_set(&s1->wpp_err,  1);
+            ff_thread_report_progress2(s->avctx, ctb_row ,thread, SHIFT_CTB_WPP);
+            return more_data;
+        }
+
+        ctb_addr_ts++;
+
+        ff_hevc_save_states(s, ctb_addr_ts);
+        ff_thread_report_progress2(s->avctx, ctb_row, thread, 1);
+        ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size);
+
+        if (!more_data && (x_ctb+ctb_size) < s->ps.sps->width && ctb_row != s->sh.num_entry_point_offsets) {
+            avpriv_atomic_int_set(&s1->wpp_err,  1);
+            ff_thread_report_progress2(s->avctx, ctb_row ,thread, SHIFT_CTB_WPP);
+            return 0;
+        }
+
+        if ((x_ctb+ctb_size) >= s->ps.sps->width && (y_ctb+ctb_size) >= s->ps.sps->height ) {
+            ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size);
+            ff_thread_report_progress2(s->avctx, ctb_row , thread, SHIFT_CTB_WPP);
+            return ctb_addr_ts;
+        }
+        ctb_addr_rs       = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
+        x_ctb+=ctb_size;
+
+        if(x_ctb >= s->ps.sps->width) {
+            break;
+        }
+    }
+    ff_thread_report_progress2(s->avctx, ctb_row ,thread, SHIFT_CTB_WPP);
+
+    return 0;
+}
+
+/**
+ * Decode a slice using num_entry_point_offsets + 1 hls_decode_entry_wpp() jobs.
+ *
+ * Derives the offset and size of each substream from entry_point_offset[],
+ * discounting the bytes listed in nal->skipped_bytes_pos[], refreshes the
+ * per-worker context copies and, when entropy_coding_sync_enabled_flag is
+ * set, runs the jobs through avctx->execute2().
+ *
+ * @return the sum of the per-job results, or a negative AVERROR code when an
+ *         allocation fails or the entry points are inconsistent
+ */
+static int hls_slice_data_wpp(HEVCContext *s, const H2645NAL *nal)
+{
+    const uint8_t *data = nal->data;
+    int length          = nal->size;
+    HEVCLocalContext *lc = s->HEVClc;
+    int *ret = av_malloc_array(s->sh.num_entry_point_offsets + 1, sizeof(int));
+    int *arg = av_malloc_array(s->sh.num_entry_point_offsets + 1, sizeof(int));
+    int64_t offset;
+    int64_t startheader, cmpt = 0;
+    int i, j, res = 0;
+
+    if (!ret || !arg) {
+        av_free(ret);
+        av_free(arg);
+        return AVERROR(ENOMEM);
+    }
+
+    if (s->sh.slice_ctb_addr_rs + s->sh.num_entry_point_offsets * s->ps.sps->ctb_width >= s->ps.sps->ctb_width * s->ps.sps->ctb_height) {
+        av_log(s->avctx, AV_LOG_ERROR, "WPP ctb addresses are wrong (%d %d %d %d)\n",
+            s->sh.slice_ctb_addr_rs, s->sh.num_entry_point_offsets,
+            s->ps.sps->ctb_width, s->ps.sps->ctb_height
+        );
+        res = AVERROR_INVALIDDATA;
+        goto error;
+    }
+
+    ff_alloc_entries(s->avctx, s->sh.num_entry_point_offsets + 1);
+
+    /* Lazily create one HEVCContext copy plus a zeroed local context per
+     * extra worker; s->sList[1] doubles as the "already allocated" marker. */
+    if (!s->sList[1]) {
+        for (i = 1; i < s->threads_number; i++) {
+            s->sList[i]      = av_malloc(sizeof(HEVCContext));
+            s->HEVClcList[i] = av_mallocz(sizeof(HEVCLocalContext));
+            if (!s->sList[i] || !s->HEVClcList[i]) {
+                /* Undo the partial allocation so that s->sList[1] is NULL
+                 * again and a later slice retries from scratch. */
+                for (j = 1; j <= i; j++) {
+                    av_freep(&s->sList[j]);
+                    av_freep(&s->HEVClcList[j]);
+                }
+                res = AVERROR(ENOMEM);
+                goto error;
+            }
+            memcpy(s->sList[i], s, sizeof(HEVCContext));
+            s->sList[i]->HEVClc = s->HEVClcList[i];
+        }
+    }
+
+    offset = (lc->gb.index >> 3);
+
+    for (j = 0, cmpt = 0, startheader = offset + s->sh.entry_point_offset[0]; j < nal->skipped_bytes; j++) {
+        if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) {
+            startheader--;
+            cmpt++;
+        }
+    }
+
+    for (i = 1; i < s->sh.num_entry_point_offsets; i++) {
+        offset += (s->sh.entry_point_offset[i - 1] - cmpt);
+        for (j = 0, cmpt = 0, startheader = offset
+             + s->sh.entry_point_offset[i]; j < nal->skipped_bytes; j++) {
+            if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) {
+                startheader--;
+                cmpt++;
             }
         }
+        s->sh.size[i - 1] = s->sh.entry_point_offset[i] - cmpt;
+        s->sh.offset[i - 1] = offset;
+
+    }
+    if (s->sh.num_entry_point_offsets != 0) {
+        offset += s->sh.entry_point_offset[s->sh.num_entry_point_offsets - 1] - cmpt;
+        if (length < offset) {
+            av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset table is corrupted\n");
+            res = AVERROR_INVALIDDATA;
+            goto error;
+        }
+        s->sh.size[s->sh.num_entry_point_offsets - 1] = length - offset;
+        s->sh.offset[s->sh.num_entry_point_offsets - 1] = offset;
+
     }
+    s->data = data;
+
+    for (i = 1; i < s->threads_number; i++) {
+        s->sList[i]->HEVClc->first_qp_group = 1;
+        s->sList[i]->HEVClc->qp_y = s->sList[0]->HEVClc->qp_y;
+        memcpy(s->sList[i], s, sizeof(HEVCContext));
+        s->sList[i]->HEVClc = s->HEVClcList[i];
+    }
+
+    avpriv_atomic_int_set(&s->wpp_err, 0);
+    ff_reset_entries(s->avctx);
+
+    for (i = 0; i <= s->sh.num_entry_point_offsets; i++) {
+        arg[i] = i;
+        ret[i] = 0;
+    }
+
+    if (s->ps.pps->entropy_coding_sync_enabled_flag)
+        s->avctx->execute2(s->avctx, hls_decode_entry_wpp, arg, ret, s->sh.num_entry_point_offsets + 1);
+
+    for (i = 0; i <= s->sh.num_entry_point_offsets; i++)
+        res += ret[i];
+error:
+    av_free(ret);
+    av_free(arg);
+    return res;
 }
 
 static int set_side_data(HEVCContext *s)
@@ -2376,28 +2592,91 @@ static int set_side_data(HEVCContext *s)
                                s->sei_hflip, s->sei_vflip);
     }
 
+    // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1
+    // so the side data persists for the entire coded video sequence.
+    if (s->sei_mastering_display_info_present > 0 &&
+        IS_IRAP(s) && s->no_rasl_output_flag) {
+        s->sei_mastering_display_info_present--;
+    }
+    if (s->sei_mastering_display_info_present) {
+        // HEVC uses a g,b,r ordering, which we convert to a more natural r,g,b
+        const int mapping[3] = {2, 0, 1};
+        const int chroma_den = 50000;
+        const int luma_den = 10000;
+        int i;
+        AVMasteringDisplayMetadata *metadata =
+            av_mastering_display_metadata_create_side_data(out);
+        if (!metadata)
+            return AVERROR(ENOMEM);
+
+        for (i = 0; i < 3; i++) {
+            const int j = mapping[i];
+            metadata->display_primaries[i][0].num = s->display_primaries[j][0];
+            metadata->display_primaries[i][0].den = chroma_den;
+            metadata->display_primaries[i][1].num = s->display_primaries[j][1];
+            metadata->display_primaries[i][1].den = chroma_den;
+        }
+        metadata->white_point[0].num = s->white_point[0];
+        metadata->white_point[0].den = chroma_den;
+        metadata->white_point[1].num = s->white_point[1];
+        metadata->white_point[1].den = chroma_den;
+
+        metadata->max_luminance.num = s->max_mastering_luminance;
+        metadata->max_luminance.den = luma_den;
+        metadata->min_luminance.num = s->min_mastering_luminance;
+        metadata->min_luminance.den = luma_den;
+        metadata->has_luminance = 1;
+        metadata->has_primaries = 1;
+
+        av_log(s->avctx, AV_LOG_DEBUG, "Mastering Display Metadata:\n");
+        av_log(s->avctx, AV_LOG_DEBUG,
+               "r(%5.4f,%5.4f) g(%5.4f,%5.4f) b(%5.4f %5.4f) wp(%5.4f, %5.4f)\n",
+               av_q2d(metadata->display_primaries[0][0]),
+               av_q2d(metadata->display_primaries[0][1]),
+               av_q2d(metadata->display_primaries[1][0]),
+               av_q2d(metadata->display_primaries[1][1]),
+               av_q2d(metadata->display_primaries[2][0]),
+               av_q2d(metadata->display_primaries[2][1]),
+               av_q2d(metadata->white_point[0]), av_q2d(metadata->white_point[1]));
+        av_log(s->avctx, AV_LOG_DEBUG,
+               "min_luminance=%f, max_luminance=%f\n",
+               av_q2d(metadata->min_luminance), av_q2d(metadata->max_luminance));
+    }
+
+    if (s->a53_caption) {
+        AVFrameSideData* sd = av_frame_new_side_data(out,
+                                                     AV_FRAME_DATA_A53_CC,
+                                                     s->a53_caption_size);
+        if (sd)
+            memcpy(sd->data, s->a53_caption, s->a53_caption_size);
+        av_freep(&s->a53_caption);
+        s->a53_caption_size = 0;
+        s->avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS;
+    }
+
     return 0;
 }
 
 static int hevc_frame_start(HEVCContext *s)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
+    int pic_size_in_ctb  = ((s->ps.sps->width  >> s->ps.sps->log2_min_cb_size) + 1) *
+                           ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1);
     int ret;
 
-    memset(s->horizontal_bs, 0, 2 * s->bs_width * (s->bs_height + 1));
-    memset(s->vertical_bs,   0, 2 * s->bs_width * (s->bs_height + 1));
+    memset(s->horizontal_bs, 0, s->bs_width * s->bs_height);
+    memset(s->vertical_bs,   0, s->bs_width * s->bs_height);
     memset(s->cbf_luma,      0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height);
-    memset(s->is_pcm,        0, s->ps.sps->min_pu_width * s->ps.sps->min_pu_height);
+    memset(s->is_pcm,        0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1));
+    memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address));
 
-    lc->start_of_tiles_x = 0;
     s->is_decoded        = 0;
     s->first_nal_type    = s->nal_unit_type;
 
     if (s->ps.pps->tiles_enabled_flag)
         lc->end_of_tiles_x = s->ps.pps->column_width[0] << s->ps.sps->log2_ctb_size;
 
-    ret = ff_hevc_set_new_ref(s, s->ps.sps->sao_enabled ? &s->sao_frame : &s->frame,
-                              s->poc);
+    ret = ff_hevc_set_new_ref(s, &s->frame, s->poc);
     if (ret < 0)
         goto fail;
 
@@ -2413,12 +2692,18 @@ static int hevc_frame_start(HEVCContext *s)
     if (ret < 0)
         goto fail;
 
+    s->frame->pict_type = 3 - s->sh.slice_type;
+
+    if (!IS_IRAP(s))
+        ff_hevc_bump_frame(s);
+
     av_frame_unref(s->output_frame);
     ret = ff_hevc_output_frame(s, s->output_frame, 0);
     if (ret < 0)
         goto fail;
 
-    ff_thread_finish_setup(s->avctx);
+    if (!s->avctx->hwaccel)
+        ff_thread_finish_setup(s->avctx);
 
     return 0;
 
@@ -2431,7 +2716,7 @@ fail:
 
 static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     GetBitContext *gb    = &lc->gb;
     int ctb_addr_ts, ret;
 
@@ -2537,13 +2822,12 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal)
             if (ret < 0)
                 goto fail;
         } else {
-            ctb_addr_ts = hls_slice_data(s);
+            if (s->threads_number > 1 && s->sh.num_entry_point_offsets > 0)
+                ctb_addr_ts = hls_slice_data_wpp(s, nal);
+            else
+                ctb_addr_ts = hls_slice_data(s);
             if (ctb_addr_ts >= (s->ps.sps->ctb_width * s->ps.sps->ctb_height)) {
                 s->is_decoded = 1;
-                if ((s->ps.pps->transquant_bypass_enable_flag ||
-                     (s->ps.sps->pcm.loop_filter_disable_flag && s->ps.sps->pcm_enabled_flag)) &&
-                    s->ps.sps->sao_enabled)
-                    restore_tqb_pixels(s);
             }
 
             if (ctb_addr_ts < 0) {
@@ -2577,6 +2861,7 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
     int i, ret = 0;
 
     s->ref = NULL;
+    s->last_eos = s->eos;
     s->eos = 0;
 
     /* split the input packet into NAL units, so we know the upper bound on the
@@ -2606,7 +2891,7 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
     }
 
 fail:
-    if (s->ref)
+    if (s->ref && s->threads_type == FF_THREAD_FRAME)
         ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
 
     return ret;
@@ -2706,9 +2991,12 @@ static int hevc_decode_frame(AVCodecContext *avctx, void *data, int *got_output,
         return ret;
 
     if (avctx->hwaccel) {
-        if (s->ref && avctx->hwaccel->end_frame(avctx) < 0)
+        if (s->ref && (ret = avctx->hwaccel->end_frame(avctx)) < 0) {
             av_log(avctx, AV_LOG_ERROR,
                    "hardware accelerator failed to decode picture\n");
+            ff_hevc_unref_frame(s, s->ref, ~0);
+            return ret;
+        }
     } else {
         /* verify the SEI checksum */
         if (avctx->err_recognition & AV_EF_CRCCHECK && s->is_decoded &&
@@ -2737,7 +3025,9 @@ static int hevc_decode_frame(AVCodecContext *avctx, void *data, int *got_output,
 
 static int hevc_ref_frame(HEVCContext *s, HEVCFrame *dst, HEVCFrame *src)
 {
-    int ret = ff_thread_ref_frame(&dst->tf, &src->tf);
+    int ret;
+
+    ret = ff_thread_ref_frame(&dst->tf, &src->tf);
     if (ret < 0)
         return ret;
 
@@ -2783,7 +3073,12 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
 
     av_freep(&s->md5_ctx);
 
-    av_frame_free(&s->tmp_frame);
+    av_freep(&s->cabac_state);
+
+    for (i = 0; i < 3; i++) {
+        av_freep(&s->sao_pixel_buffer_h[i]);
+        av_freep(&s->sao_pixel_buffer_v[i]);
+    }
     av_frame_free(&s->output_frame);
 
     for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
@@ -2797,6 +3092,24 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
         av_buffer_unref(&s->ps.sps_list[i]);
     for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++)
         av_buffer_unref(&s->ps.pps_list[i]);
+    s->ps.sps = NULL;
+    s->ps.pps = NULL;
+    s->ps.vps = NULL;
+
+    av_freep(&s->sh.entry_point_offset);
+    av_freep(&s->sh.offset);
+    av_freep(&s->sh.size);
+
+    for (i = 1; i < s->threads_number; i++) {
+        HEVCLocalContext *lc = s->HEVClcList[i];
+        if (lc) {
+            av_freep(&s->HEVClcList[i]);
+            av_freep(&s->sList[i]);
+        }
+    }
+    if (s->HEVClc == s->HEVClcList[0])
+        s->HEVClc = NULL;
+    av_freep(&s->HEVClcList[0]);
 
     ff_h2645_packet_uninit(&s->pkt);
 
@@ -2810,8 +3123,14 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
 
     s->avctx = avctx;
 
-    s->tmp_frame = av_frame_alloc();
-    if (!s->tmp_frame)
+    s->HEVClc = av_mallocz(sizeof(HEVCLocalContext));
+    if (!s->HEVClc)
+        goto fail;
+    s->HEVClcList[0] = s->HEVClc;
+    s->sList[0] = s;
+
+    s->cabac_state = av_malloc(HEVC_CONTEXTS);
+    if (!s->cabac_state)
         goto fail;
 
     s->output_frame = av_frame_alloc();
@@ -2834,6 +3153,9 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
     ff_bswapdsp_init(&s->bdsp);
 
     s->context_initialized = 1;
+    s->eos = 0;
+
+    ff_hevc_reset_sei(s);
 
     return 0;
 
@@ -2864,6 +3186,8 @@ static int hevc_update_thread_context(AVCodecContext *dst,
         }
     }
 
+    if (s->ps.sps != s0->ps.sps)
+        s->ps.sps = NULL;
     for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++) {
         av_buffer_unref(&s->ps.vps_list[i]);
         if (s0->ps.vps_list[i]) {
@@ -2892,16 +3216,22 @@ static int hevc_update_thread_context(AVCodecContext *dst,
     }
 
     if (s->ps.sps != s0->ps.sps)
-        ret = set_sps(s, s0->ps.sps);
+        if ((ret = set_sps(s, s0->ps.sps, src->pix_fmt)) < 0)
+            return ret;
 
     s->seq_decode = s0->seq_decode;
     s->seq_output = s0->seq_output;
     s->pocTid0    = s0->pocTid0;
     s->max_ra     = s0->max_ra;
+    s->eos        = s0->eos;
+    s->no_rasl_output_flag = s0->no_rasl_output_flag;
 
     s->is_nalff        = s0->is_nalff;
     s->nal_length_size = s0->nal_length_size;
 
+    s->threads_number      = s0->threads_number;
+    s->threads_type        = s0->threads_type;
+
     if (s0->eos) {
         s->seq_decode = (s->seq_decode + 1) & 0xff;
         s->max_ra = INT_MAX;
@@ -2995,6 +3325,15 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx)
     if (ret < 0)
         return ret;
 
+    s->enable_parallel_tiles = 0;
+    s->picture_struct = 0;
+    s->eos = 1;
+
+    if(avctx->active_thread_type & FF_THREAD_SLICE)
+        s->threads_number = avctx->thread_count;
+    else
+        s->threads_number = 1;
+
     if (avctx->extradata_size > 0 && avctx->extradata) {
         ret = hevc_decode_extradata(s);
         if (ret < 0) {
@@ -3003,6 +3342,11 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx)
         }
     }
 
+    if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1)
+            s->threads_type = FF_THREAD_FRAME;
+        else
+            s->threads_type = FF_THREAD_SLICE;
+
     return 0;
 }
 
@@ -3025,6 +3369,7 @@ static void hevc_decode_flush(AVCodecContext *avctx)
     HEVCContext *s = avctx->priv_data;
     ff_hevc_flush_dpb(s);
     s->max_ra = INT_MAX;
+    s->eos = 1;
 }
 
 #define OFFSET(x) offsetof(HEVCContext, x)
@@ -3032,7 +3377,9 @@ static void hevc_decode_flush(AVCodecContext *avctx)
 
 static const AVOption options[] = {
     { "apply_defdispwin", "Apply default display window from VUI", OFFSET(apply_defdispwin),
-        AV_OPT_TYPE_INT, {.i64 = 0}, 0, 1, PAR },
+        AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR },
+    { "strict-displaywin", "strictly apply default display window size", OFFSET(apply_defdispwin),
+        AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR },
     { NULL },
 };
 
@@ -3057,6 +3404,6 @@ AVCodec ff_hevc_decoder = {
     .update_thread_context = hevc_update_thread_context,
     .init_thread_copy      = hevc_init_thread_copy,
     .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY |
-                             AV_CODEC_CAP_FRAME_THREADS,
+                             AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS,
     .profiles              = NULL_IF_CONFIG_SMALL(ff_hevc_profiles),
 };
diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
index d15af71..be91010 100644
--- a/libavcodec/hevc.h
+++ b/libavcodec/hevc.h
@@ -3,29 +3,26 @@
  *
  * Copyright (C) 2012 - 2013 Guillaume Martres
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_HEVC_H
 #define AVCODEC_HEVC_H
 
-#include <stddef.h>
-#include <stdint.h>
-
 #include "libavutil/buffer.h"
 #include "libavutil/md5.h"
 
@@ -33,6 +30,7 @@
 #include "bswapdsp.h"
 #include "cabac.h"
 #include "get_bits.h"
+#include "hevcpred.h"
 #include "h2645_parse.h"
 #include "hevcdsp.h"
 #include "internal.h"
@@ -42,6 +40,9 @@
 #define MAX_DPB_SIZE 16 // A.4.1
 #define MAX_REFS 16
 
+#define MAX_NB_THREADS 16
+#define SHIFT_CTB_WPP 2
+
 /**
  * 7.4.2.1
  */
@@ -56,12 +57,11 @@
 #define MAX_TRANSFORM_DEPTH 5
 
 #define MAX_TB_SIZE 32
-#define MAX_PB_SIZE 64
 #define MAX_LOG2_CTB_SIZE 6
 #define MAX_QP 51
 #define DEFAULT_INTRA_TC_OFFSET 2
 
-#define HEVC_CONTEXTS 183
+#define HEVC_CONTEXTS 199
 
 #define MRG_MAX_NUM_CANDS     5
 
@@ -71,6 +71,9 @@
 #define EPEL_EXTRA_BEFORE 1
 #define EPEL_EXTRA_AFTER  2
 #define EPEL_EXTRA        3
+#define QPEL_EXTRA_BEFORE 3
+#define QPEL_EXTRA_AFTER  4
+#define QPEL_EXTRA        7
 
 #define EDGE_EMU_BUFFER_STRIDE 80
 
@@ -80,13 +83,10 @@
 #define SAMPLE(tab, x, y) ((tab)[(y) * s->sps->width + (x)])
 #define SAMPLE_CTB(tab, x, y) ((tab)[(y) * min_cb_width + (x)])
 
-#define IS_IDR(s) (s->nal_unit_type == NAL_IDR_W_RADL || s->nal_unit_type == NAL_IDR_N_LP)
-#define IS_BLA(s) (s->nal_unit_type == NAL_BLA_W_RADL || s->nal_unit_type == NAL_BLA_W_LP || \
-                   s->nal_unit_type == NAL_BLA_N_LP)
-#define IS_IRAP(s) (s->nal_unit_type >= 16 && s->nal_unit_type <= 23)
-
-#define FFUDIV(a,b) (((a) > 0 ? (a) : (a) - (b) + 1) / (b))
-#define FFUMOD(a,b) ((a) - (b) * FFUDIV(a,b))
+#define IS_IDR(s) ((s)->nal_unit_type == NAL_IDR_W_RADL || (s)->nal_unit_type == NAL_IDR_N_LP)
+#define IS_BLA(s) ((s)->nal_unit_type == NAL_BLA_W_RADL || (s)->nal_unit_type == NAL_BLA_W_LP || \
+                   (s)->nal_unit_type == NAL_BLA_N_LP)
+#define IS_IRAP(s) ((s)->nal_unit_type >= 16 && (s)->nal_unit_type <= 23)
 
 /**
  * Table 7-3: NAL unit type codes
@@ -168,6 +168,8 @@ enum SyntaxElement {
     CBF_LUMA,
     CBF_CB_CR,
     TRANSFORM_SKIP_FLAG,
+    EXPLICIT_RDPCM_FLAG,
+    EXPLICIT_RDPCM_DIR_FLAG,
     LAST_SIGNIFICANT_COEFF_X_PREFIX,
     LAST_SIGNIFICANT_COEFF_Y_PREFIX,
     LAST_SIGNIFICANT_COEFF_X_SUFFIX,
@@ -178,6 +180,10 @@ enum SyntaxElement {
     COEFF_ABS_LEVEL_GREATER2_FLAG,
     COEFF_ABS_LEVEL_REMAINING,
     COEFF_SIGN_FLAG,
+    LOG2_RES_SCALE_ABS,
+    RES_SCALE_SIGN_FLAG,
+    CU_CHROMA_QP_OFFSET_FLAG,
+    CU_CHROMA_QP_OFFSET_IDX,
 };
 
 enum PartMode {
@@ -203,6 +209,13 @@ enum InterPredIdc {
     PRED_BI,
 };
 
+enum PredFlag {
+    PF_INTRA = 0,
+    PF_L0,
+    PF_L1,
+    PF_BI,
+};
+
 enum IntraPredMode {
     INTRA_PLANAR = 0,
     INTRA_DC,
@@ -245,6 +258,7 @@ enum SAOType {
     SAO_NOT_APPLIED = 0,
     SAO_BAND,
     SAO_EDGE,
+    SAO_APPLIED
 };
 
 enum SAOEOClass {
@@ -383,7 +397,7 @@ typedef struct ScalingList {
 } ScalingList;
 
 typedef struct HEVCSPS {
-    int vps_id;
+    unsigned vps_id;
     int chroma_format_idc;
     uint8_t separate_colour_plane_flag;
 
@@ -444,6 +458,13 @@ typedef struct HEVCSPS {
     int max_transform_hierarchy_depth_inter;
     int max_transform_hierarchy_depth_intra;
 
+    int transform_skip_rotation_enabled_flag;
+    int transform_skip_context_enabled_flag;
+    int implicit_rdpcm_enabled_flag;
+    int explicit_rdpcm_enabled_flag;
+    int intra_smoothing_disabled_flag;
+    int persistent_rice_adaptation_enabled_flag;
+
     ///< coded frame dimension in various units
     int width;
     int height;
@@ -456,6 +477,7 @@ typedef struct HEVCSPS {
     int min_tb_height;
     int min_pu_width;
     int min_pu_height;
+    int tb_mask;
 
     int hshift[3];
     int vshift[3];
@@ -512,6 +534,15 @@ typedef struct HEVCPPS {
     int log2_parallel_merge_level; ///< log2_parallel_merge_level_minus2 + 2
     int num_extra_slice_header_bits;
     uint8_t slice_header_extension_present_flag;
+    uint8_t log2_max_transform_skip_block_size;
+    uint8_t cross_component_prediction_enabled_flag;
+    uint8_t chroma_qp_offset_list_enabled_flag;
+    uint8_t diff_cu_chroma_qp_offset_depth;
+    uint8_t chroma_qp_offset_list_len_minus1;
+    int8_t  cb_qp_offset_list[5];
+    int8_t  cr_qp_offset_list[5];
+    uint8_t log2_sao_offset_scale_luma;
+    uint8_t log2_sao_offset_scale_chroma;
 
     // Inferred parameters
     unsigned int *column_width;  ///< ColumnWidth
@@ -525,6 +556,7 @@ typedef struct HEVCPPS {
     int *tile_id;           ///< TileId
     int *tile_pos_rs;       ///< TilePosRS
     int *min_tb_addr_zs;    ///< MinTbAddrZS
+    int *min_tb_addr_zs_tab;///< MinTbAddrZS
 } HEVCPPS;
 
 typedef struct HEVCParamSets {
@@ -584,11 +616,16 @@ typedef struct SliceHeader {
     int slice_cb_qp_offset;
     int slice_cr_qp_offset;
 
+    uint8_t cu_chroma_qp_offset_enabled_flag;
+
     int beta_offset;    ///< beta_offset_div2 * 2
     int tc_offset;      ///< tc_offset_div2 * 2
 
     unsigned int max_num_merge_cand; ///< 5 - 5_minus_max_num_merge_cand
 
+    unsigned *entry_point_offset;
+    int * offset;
+    int * size;
     int num_entry_point_offsets;
 
     int8_t slice_qp;
@@ -610,10 +647,6 @@ typedef struct SliceHeader {
     int slice_ctb_addr_rs;
 } SliceHeader;
 
-typedef struct CodingTree {
-    int depth; ///< ctDepth
-} CodingTree;
-
 typedef struct CodingUnit {
     int x;
     int y;
@@ -635,8 +668,7 @@ typedef struct Mv {
 typedef struct MvField {
     DECLARE_ALIGNED(4, Mv, mv)[2];
     int8_t ref_idx[2];
-    int8_t pred_flag[2];
-    uint8_t is_intra;
+    int8_t pred_flag;
 } MvField;
 
 typedef struct NeighbourAvailable {
@@ -654,15 +686,24 @@ typedef struct PredictionUnit {
     uint8_t intra_pred_mode[4];
     Mv mvd;
     uint8_t merge_flag;
-    uint8_t intra_pred_mode_c;
+    uint8_t intra_pred_mode_c[4];
+    uint8_t chroma_mode_c[4];
 } PredictionUnit;
 
 typedef struct TransformUnit {
     int cu_qp_delta;
 
+    int res_scale_val;
+
     // Inferred parameters;
-    int cur_intra_pred_mode;
+    int intra_pred_mode;
+    int intra_pred_mode_c;
+    int chroma_mode_c;
     uint8_t is_cu_qp_delta_coded;
+    uint8_t is_cu_chroma_qp_offset_coded;
+    int8_t  cu_qp_offset_cb;
+    int8_t  cu_qp_offset_cr;
+    uint8_t cross_pf;
 } TransformUnit;
 
 typedef struct DBParams {
@@ -673,6 +714,7 @@ typedef struct DBParams {
 #define HEVC_FRAME_FLAG_OUTPUT    (1 << 0)
 #define HEVC_FRAME_FLAG_SHORT_REF (1 << 1)
 #define HEVC_FRAME_FLAG_LONG_REF  (1 << 2)
+#define HEVC_FRAME_FLAG_BUMPING   (1 << 3)
 
 typedef struct HEVCFrame {
     AVFrame *frame;
@@ -705,24 +747,11 @@ typedef struct HEVCFrame {
     uint8_t flags;
 } HEVCFrame;
 
-struct HEVCContext;
-
-typedef struct HEVCPredContext {
-    void (*intra_pred[4])(struct HEVCContext *s, int x0, int y0, int c_idx);
-
-    void (*pred_planar[4])(uint8_t *src, const uint8_t *top,
-                           const uint8_t *left, ptrdiff_t stride);
-    void (*pred_dc)(uint8_t *src, const uint8_t *top, const uint8_t *left,
-                    ptrdiff_t stride, int log2_size, int c_idx);
-    void (*pred_angular[4])(uint8_t *src, const uint8_t *top,
-                            const uint8_t *left, ptrdiff_t stride,
-                            int c_idx, int mode);
-} HEVCPredContext;
-
 typedef struct HEVCLocalContext {
-    DECLARE_ALIGNED(16, int16_t, mc_buffer[(MAX_PB_SIZE + 24) * MAX_PB_SIZE]);
     uint8_t cabac_state[HEVC_CONTEXTS];
 
+    uint8_t stat_coeff[4];
+
     uint8_t first_qp_group;
 
     GetBitContext gb;
@@ -731,18 +760,23 @@ typedef struct HEVCLocalContext {
     int8_t qp_y;
     int8_t curr_qp_y;
 
+    int qPy_pred;
+
     TransformUnit tu;
 
     uint8_t ctb_left_flag;
     uint8_t ctb_up_flag;
     uint8_t ctb_up_right_flag;
     uint8_t ctb_up_left_flag;
-    int     start_of_tiles_x;
     int     end_of_tiles_x;
     int     end_of_tiles_y;
     /* +7 is for subpixel interpolation, *2 for high bit depths */
     DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
-    CodingTree ct;
+    /* The extended size between the new edge emu buffer is abused by SAO */
+    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer2)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
+    DECLARE_ALIGNED(32, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]);
+
+    int ct_depth;
     CodingUnit cu;
     PredictionUnit pu;
     NeighbourAvailable na;
@@ -760,17 +794,26 @@ typedef struct HEVCContext {
     const AVClass *c;  // needed by private avoptions
     AVCodecContext *avctx;
 
-    HEVCLocalContext HEVClc;
+    struct HEVCContext  *sList[MAX_NB_THREADS];
 
-    uint8_t cabac_state[HEVC_CONTEXTS];
+    HEVCLocalContext    *HEVClcList[MAX_NB_THREADS];
+    HEVCLocalContext    *HEVClc;
+
+    uint8_t             threads_type;
+    uint8_t             threads_number;
+
+    int                 width;
+    int                 height;
+
+    uint8_t *cabac_state;
 
     /** 1 if the independent slice segment header was successfully parsed */
     uint8_t slice_initialized;
 
     AVFrame *frame;
-    AVFrame *sao_frame;
-    AVFrame *tmp_frame;
     AVFrame *output_frame;
+    uint8_t *sao_pixel_buffer_h[3];
+    uint8_t *sao_pixel_buffer_v[3];
 
     HEVCParamSets ps;
 
@@ -791,11 +834,13 @@ typedef struct HEVCContext {
     int pocTid0;
     int slice_idx; ///< number of the slice being currently decoded
     int eos;       ///< current packet contains an EOS/EOB NAL
+    int last_eos;  ///< last packet contains an EOS/EOB NAL
     int max_ra;
     int bs_width;
     int bs_height;
 
     int is_decoded;
+    int no_rasl_output_flag;
 
     HEVCPredContext hpc;
     HEVCDSPContext hevcdsp;
@@ -830,6 +875,11 @@ typedef struct HEVCContext {
     uint16_t seq_decode;
     uint16_t seq_output;
 
+    int enable_parallel_tiles;
+    int wpp_err;
+
+    const uint8_t *data;
+
     H2645Packet pkt;
     // type of the first VCL NAL of the current frame
     enum NALUnitType first_nal_type;
@@ -844,6 +894,8 @@ typedef struct HEVCContext {
                             ///< as a format defined in 14496-15
     int apply_defdispwin;
 
+    int active_seq_parameter_set_id;
+
     int nal_length_size;    ///< Number of bytes used for nal length (1, 2 or 4)
     int nuh_layer_id;
 
@@ -857,6 +909,19 @@ typedef struct HEVCContext {
     int sei_display_orientation_present;
     int sei_anticlockwise_rotation;
     int sei_hflip, sei_vflip;
+
+    int picture_struct;
+
+    uint8_t* a53_caption;
+    int a53_caption_size;
+
+    /** mastering display */
+    int sei_mastering_display_info_present;
+    uint16_t display_primaries[3][2];
+    uint16_t white_point[2];
+    uint32_t max_mastering_luminance;
+    uint32_t min_mastering_luminance;
+
 } HEVCContext;
 
 int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
@@ -937,32 +1002,11 @@ int ff_hevc_inter_pred_idc_decode(HEVCContext *s, int nPbW, int nPbH);
 int ff_hevc_ref_idx_lx_decode(HEVCContext *s, int num_ref_idx_lx);
 int ff_hevc_mvp_lx_flag_decode(HEVCContext *s);
 int ff_hevc_no_residual_syntax_flag_decode(HEVCContext *s);
-int ff_hevc_abs_mvd_greater0_flag_decode(HEVCContext *s);
-int ff_hevc_abs_mvd_greater1_flag_decode(HEVCContext *s);
-int ff_hevc_mvd_decode(HEVCContext *s);
-int ff_hevc_mvd_sign_flag_decode(HEVCContext *s);
 int ff_hevc_split_transform_flag_decode(HEVCContext *s, int log2_trafo_size);
 int ff_hevc_cbf_cb_cr_decode(HEVCContext *s, int trafo_depth);
 int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth);
-int ff_hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx);
-int ff_hevc_last_significant_coeff_x_prefix_decode(HEVCContext *s, int c_idx,
-                                                   int log2_size);
-int ff_hevc_last_significant_coeff_y_prefix_decode(HEVCContext *s, int c_idx,
-                                                   int log2_size);
-int ff_hevc_last_significant_coeff_suffix_decode(HEVCContext *s,
-                                                 int last_significant_coeff_prefix);
-int ff_hevc_significant_coeff_group_flag_decode(HEVCContext *s, int c_idx,
-                                                int ctx_cg);
-int ff_hevc_significant_coeff_flag_decode(HEVCContext *s, int c_idx, int x_c,
-                                          int y_c, int log2_trafo_size,
-                                          int scan_idx, int prev_sig);
-int ff_hevc_coeff_abs_level_greater1_flag_decode(HEVCContext *s, int c_idx,
-                                                 int ctx_set);
-int ff_hevc_coeff_abs_level_greater2_flag_decode(HEVCContext *s, int c_idx,
-                                                 int inc);
-int ff_hevc_coeff_abs_level_remaining(HEVCContext *s, int base_level,
-                                      int rc_rice_param);
-int ff_hevc_coeff_sign_flag(HEVCContext *s, uint8_t nb);
+int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx);
+int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx);
 
 /**
  * Get the number of candidate references for the current frame.
@@ -977,6 +1021,8 @@ int ff_hevc_set_new_ref(HEVCContext *s, AVFrame **frame, int poc);
  */
 int ff_hevc_output_frame(HEVCContext *s, AVFrame *frame, int flush);
 
+void ff_hevc_bump_frame(HEVCContext *s);
+
 void ff_hevc_unref_frame(HEVCContext *s, HEVCFrame *frame, int flags);
 
 void ff_hevc_set_neighbour_available(HEVCContext *s, int x0, int y0,
@@ -988,22 +1034,35 @@ void ff_hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0,
                               int nPbW, int nPbH, int log2_cb_size,
                               int part_idx, int merge_idx,
                               MvField *mv, int mvp_lx_flag, int LX);
-void ff_hevc_set_qPy(HEVCContext *s, int xC, int yC, int xBase, int yBase,
+void ff_hevc_set_qPy(HEVCContext *s, int xBase, int yBase,
                      int log2_cb_size);
 void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
                                            int log2_trafo_size);
 int ff_hevc_cu_qp_delta_sign_flag(HEVCContext *s);
 int ff_hevc_cu_qp_delta_abs(HEVCContext *s);
-void ff_hevc_hls_filter(HEVCContext *s, int x, int y);
+int ff_hevc_cu_chroma_qp_offset_flag(HEVCContext *s);
+int ff_hevc_cu_chroma_qp_offset_idx(HEVCContext *s);
+void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size);
 void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size);
+void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+                                 int log2_trafo_size, enum ScanType scan_idx,
+                                 int c_idx);
 
-void ff_hevc_pps_free(HEVCPPS **ppps);
+void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size);
 
-void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth);
 
 int ff_hevc_encode_nal_vps(HEVCVPS *vps, unsigned int id,
                            uint8_t *buf, int buf_size);
 
+/**
+ * Reset SEI values that are stored in the context,
+ * e.g. caption data that was extracted during NAL
+ * parsing.
+ *
+ * @param s HEVCContext.
+ */
+void ff_hevc_reset_sei(HEVCContext *s);
+
 extern const uint8_t ff_hevc_qpel_extra_before[4];
 extern const uint8_t ff_hevc_qpel_extra_after[4];
 extern const uint8_t ff_hevc_qpel_extra[4];
diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
index b01808f..d1bef83 100644
--- a/libavcodec/hevc_cabac.c
+++ b/libavcodec/hevc_cabac.c
@@ -4,20 +4,20 @@
  * Copyright (C) 2012 - 2013 Guillaume Martres
  * Copyright (C) 2012 - 2013 Gildas Cocherel
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -66,65 +66,77 @@ av_unused static const int8_t num_bins_in_se[] = {
      2, // cbf_luma
      4, // cbf_cb, cbf_cr
      2, // transform_skip_flag[][]
+     2, // explicit_rdpcm_flag[][]
+     2, // explicit_rdpcm_dir_flag[][]
     18, // last_significant_coeff_x_prefix
     18, // last_significant_coeff_y_prefix
      0, // last_significant_coeff_x_suffix
      0, // last_significant_coeff_y_suffix
      4, // significant_coeff_group_flag
-    42, // significant_coeff_flag
+    44, // significant_coeff_flag
     24, // coeff_abs_level_greater1_flag
      6, // coeff_abs_level_greater2_flag
      0, // coeff_abs_level_remaining
      0, // coeff_sign_flag
+     8, // log2_res_scale_abs
+     2, // res_scale_sign_flag
+     1, // cu_chroma_qp_offset_flag
+     1, // cu_chroma_qp_offset_idx
 };
 
 /**
  * Offset to ctxIdx 0 in init_values and states, indexed by SyntaxElement.
  */
 static const int elem_offset[sizeof(num_bins_in_se)] = {
-      0,
-      1,
-      2,
-      2,
-      2,
-      2,
-      2,
-      2,
-      5,
-      6,
-      9,
-     12,
-     13,
-     17,
-     17,
-     18,
-     18,
-     18,
-     20,
-     21,
-     22,
-     27,
-     29,
-     31,
-     33,
-     35,
-     35,
-     35,
-     36,
-     37,
-     40,
-     42,
-     46,
-     48,
-     66,
-     84,
-     84,
-     84,
-     88,
-    130,
-    154,
-    160,
-    160,
+    0, // sao_merge_flag
+    1, // sao_type_idx
+    2, // sao_eo_class
+    2, // sao_band_position
+    2, // sao_offset_abs
+    2, // sao_offset_sign
+    2, // end_of_slice_flag
+    2, // split_coding_unit_flag
+    5, // cu_transquant_bypass_flag
+    6, // skip_flag
+    9, // cu_qp_delta
+    12, // pred_mode
+    13, // part_mode
+    17, // pcm_flag
+    17, // prev_intra_luma_pred_mode
+    18, // mpm_idx
+    18, // rem_intra_luma_pred_mode
+    18, // intra_chroma_pred_mode
+    20, // merge_flag
+    21, // merge_idx
+    22, // inter_pred_idc
+    27, // ref_idx_l0
+    29, // ref_idx_l1
+    31, // abs_mvd_greater0_flag
+    33, // abs_mvd_greater1_flag
+    35, // abs_mvd_minus2
+    35, // mvd_sign_flag
+    35, // mvp_lx_flag
+    36, // no_residual_data_flag
+    37, // split_transform_flag
+    40, // cbf_luma
+    42, // cbf_cb, cbf_cr
+    46, // transform_skip_flag[][]
+    48, // explicit_rdpcm_flag[][]
+    50, // explicit_rdpcm_dir_flag[][]
+    52, // last_significant_coeff_x_prefix
+    70, // last_significant_coeff_y_prefix
+    88, // last_significant_coeff_x_suffix
+    88, // last_significant_coeff_y_suffix
+    88, // significant_coeff_group_flag
+    92, // significant_coeff_flag
+    136, // coeff_abs_level_greater1_flag
+    160, // coeff_abs_level_greater2_flag
+    166, // coeff_abs_level_remaining
+    166, // coeff_sign_flag
+    166, // log2_res_scale_abs
+    174, // res_scale_sign_flag
+    176, // cu_chroma_qp_offset_flag
+    177, // cu_chroma_qp_offset_idx
 };
 
 #define CNU 154
@@ -178,6 +190,10 @@ static const uint8_t init_values[3][HEVC_CONTEXTS] = {
       94, 138, 182, 154,
       // transform_skip_flag
       139, 139,
+      // explicit_rdpcm_flag
+      139, 139,
+      // explicit_rdpcm_dir_flag
+      139, 139,
       // last_significant_coeff_x_prefix
       110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111,
        79, 108, 123,  63,
@@ -190,11 +206,21 @@ static const uint8_t init_values[3][HEVC_CONTEXTS] = {
       111, 111, 125, 110, 110,  94, 124, 108, 124, 107, 125, 141, 179, 153,
       125, 107, 125, 141, 179, 153, 125, 107, 125, 141, 179, 153, 125, 140,
       139, 182, 182, 152, 136, 152, 136, 153, 136, 139, 111, 136, 139, 111,
+      141, 111,
       // coeff_abs_level_greater1_flag
       140,  92, 137, 138, 140, 152, 138, 139, 153,  74, 149,  92, 139, 107,
       122, 152, 140, 179, 166, 182, 140, 227, 122, 197,
       // coeff_abs_level_greater2_flag
-      138, 153, 136, 167, 152, 152, },
+      138, 153, 136, 167, 152, 152,
+      // log2_res_scale_abs
+      154, 154, 154, 154, 154, 154, 154, 154,
+      // res_scale_sign_flag
+      154, 154,
+      // cu_chroma_qp_offset_flag
+      154,
+      // cu_chroma_qp_offset_idx
+      154,
+    },
     { // sao_merge_flag
       153,
       // sao_type_idx
@@ -241,6 +267,10 @@ static const uint8_t init_values[3][HEVC_CONTEXTS] = {
       149, 107, 167, 154,
       // transform_skip_flag
       139, 139,
+      // explicit_rdpcm_flag
+      139, 139,
+      // explicit_rdpcm_dir_flag
+      139, 139,
       // last_significant_coeff_x_prefix
       125, 110,  94, 110,  95,  79, 125, 111, 110,  78, 110, 111, 111,  95,
        94, 108, 123, 108,
@@ -253,11 +283,21 @@ static const uint8_t init_values[3][HEVC_CONTEXTS] = {
       155, 154, 139, 153, 139, 123, 123,  63, 153, 166, 183, 140, 136, 153,
       154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170,
       153, 123, 123, 107, 121, 107, 121, 167, 151, 183, 140, 151, 183, 140,
+      140, 140,
       // coeff_abs_level_greater1_flag
       154, 196, 196, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121,
       136, 137, 169, 194, 166, 167, 154, 167, 137, 182,
       // coeff_abs_level_greater2_flag
-      107, 167, 91, 122, 107, 167, },
+      107, 167, 91, 122, 107, 167,
+      // log2_res_scale_abs
+      154, 154, 154, 154, 154, 154, 154, 154,
+      // res_scale_sign_flag
+      154, 154,
+      // cu_chroma_qp_offset_flag
+      154,
+      // cu_chroma_qp_offset_idx
+      154,
+    },
     { // sao_merge_flag
       153,
       // sao_type_idx
@@ -304,6 +344,10 @@ static const uint8_t init_values[3][HEVC_CONTEXTS] = {
       149, 92, 167, 154,
       // transform_skip_flag
       139, 139,
+      // explicit_rdpcm_flag
+      139, 139,
+      // explicit_rdpcm_dir_flag
+      139, 139,
       // last_significant_coeff_x_prefix
       125, 110, 124, 110,  95,  94, 125, 111, 111,  79, 125, 126, 111, 111,
        79, 108, 123,  93,
@@ -316,11 +360,89 @@ static const uint8_t init_values[3][HEVC_CONTEXTS] = {
       170, 154, 139, 153, 139, 123, 123,  63, 124, 166, 183, 140, 136, 153,
       154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170,
       153, 138, 138, 122, 121, 122, 121, 167, 151, 183, 140, 151, 183, 140,
+      140, 140,
       // coeff_abs_level_greater1_flag
       154, 196, 167, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121,
       136, 122, 169, 208, 166, 167, 154, 152, 167, 182,
       // coeff_abs_level_greater2_flag
-      107, 167, 91, 107, 107, 167, },
+      107, 167, 91, 107, 107, 167,
+      // log2_res_scale_abs
+      154, 154, 154, 154, 154, 154, 154, 154,
+      // res_scale_sign_flag
+      154, 154,
+      // cu_chroma_qp_offset_flag
+      154,
+      // cu_chroma_qp_offset_idx
+      154,
+    },
+};
+
+static const uint8_t scan_1x1[1] = {
+    0,
+};
+
+static const uint8_t horiz_scan2x2_x[4] = {
+    0, 1, 0, 1,
+};
+
+static const uint8_t horiz_scan2x2_y[4] = {
+    0, 0, 1, 1
+};
+
+static const uint8_t horiz_scan4x4_x[16] = {
+    0, 1, 2, 3,
+    0, 1, 2, 3,
+    0, 1, 2, 3,
+    0, 1, 2, 3,
+};
+
+static const uint8_t horiz_scan4x4_y[16] = {
+    0, 0, 0, 0,
+    1, 1, 1, 1,
+    2, 2, 2, 2,
+    3, 3, 3, 3,
+};
+
+static const uint8_t horiz_scan8x8_inv[8][8] = {
+    {  0,  1,  2,  3, 16, 17, 18, 19, },
+    {  4,  5,  6,  7, 20, 21, 22, 23, },
+    {  8,  9, 10, 11, 24, 25, 26, 27, },
+    { 12, 13, 14, 15, 28, 29, 30, 31, },
+    { 32, 33, 34, 35, 48, 49, 50, 51, },
+    { 36, 37, 38, 39, 52, 53, 54, 55, },
+    { 40, 41, 42, 43, 56, 57, 58, 59, },
+    { 44, 45, 46, 47, 60, 61, 62, 63, },
+};
+
+static const uint8_t diag_scan2x2_x[4] = {
+    0, 0, 1, 1,
+};
+
+static const uint8_t diag_scan2x2_y[4] = {
+    0, 1, 0, 1,
+};
+
+static const uint8_t diag_scan2x2_inv[2][2] = {
+    { 0, 2, },
+    { 1, 3, },
+};
+
+static const uint8_t diag_scan4x4_inv[4][4] = {
+    { 0,  2,  5,  9, },
+    { 1,  4,  8, 12, },
+    { 3,  7, 11, 14, },
+    { 6, 10, 13, 15, },
+};
+
+static const uint8_t diag_scan8x8_inv[8][8] = {
+    {  0,  2,  5,  9, 14, 20, 27, 35, },
+    {  1,  4,  8, 13, 19, 26, 34, 42, },
+    {  3,  7, 12, 18, 25, 33, 41, 48, },
+    {  6, 11, 17, 24, 32, 40, 47, 53, },
+    { 10, 16, 23, 31, 39, 46, 52, 57, },
+    { 15, 22, 30, 38, 45, 51, 56, 60, },
+    { 21, 29, 37, 44, 50, 55, 59, 62, },
+    { 28, 36, 43, 49, 54, 58, 61, 63, },
 };
 
 void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts)
@@ -329,13 +451,13 @@ void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts)
         (ctb_addr_ts % s->ps.sps->ctb_width == 2 ||
          (s->ps.sps->ctb_width == 2 &&
           ctb_addr_ts % s->ps.sps->ctb_width == 0))) {
-        memcpy(s->cabac_state, s->HEVClc.cabac_state, HEVC_CONTEXTS);
+        memcpy(s->cabac_state, s->HEVClc->cabac_state, HEVC_CONTEXTS);
     }
 }
 
 static void load_states(HEVCContext *s)
 {
-    memcpy(s->HEVClc.cabac_state, s->cabac_state, HEVC_CONTEXTS);
+    memcpy(s->HEVClc->cabac_state, s->cabac_state, HEVC_CONTEXTS);
 }
 
 static void cabac_reinit(HEVCLocalContext *lc)
@@ -345,10 +467,10 @@ static void cabac_reinit(HEVCLocalContext *lc)
 
 static void cabac_init_decoder(HEVCContext *s)
 {
-    GetBitContext *gb = &s->HEVClc.gb;
+    GetBitContext *gb = &s->HEVClc->gb;
     skip_bits(gb, 1);
     align_get_bits(gb);
-    ff_init_cabac_decoder(&s->HEVClc.cc,
+    ff_init_cabac_decoder(&s->HEVClc->cc,
                           gb->buffer + get_bits_count(gb) / 8,
                           (get_bits_left(gb) + 7) / 8);
 }
@@ -370,8 +492,11 @@ static void cabac_init_state(HEVCContext *s)
         pre ^= pre >> 31;
         if (pre > 124)
             pre = 124 + (pre & 1);
-        s->HEVClc.cabac_state[i] = pre;
+        s->HEVClc->cabac_state[i] = pre;
     }
+
+    for (i = 0; i < 4; i++)
+        s->HEVClc->stat_coeff[i] = 0;
 }
 
 void ff_hevc_cabac_init(HEVCContext *s, int ctb_addr_ts)
@@ -395,13 +520,19 @@ void ff_hevc_cabac_init(HEVCContext *s, int ctb_addr_ts)
     } else {
         if (s->ps.pps->tiles_enabled_flag &&
             s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[ctb_addr_ts - 1]) {
-            cabac_reinit(&s->HEVClc);
+            if (s->threads_number == 1)
+                cabac_reinit(s->HEVClc);
+            else
+                cabac_init_decoder(s);
             cabac_init_state(s);
         }
         if (s->ps.pps->entropy_coding_sync_enabled_flag) {
             if (ctb_addr_ts % s->ps.sps->ctb_width == 0) {
-                get_cabac_terminate(&s->HEVClc.cc);
-                cabac_reinit(&s->HEVClc);
+                get_cabac_terminate(&s->HEVClc->cc);
+                if (s->threads_number == 1)
+                    cabac_reinit(s->HEVClc);
+                else
+                    cabac_init_decoder(s);
 
                 if (s->ps.sps->ctb_width == 1)
                     cabac_init_state(s);
@@ -412,7 +543,7 @@ void ff_hevc_cabac_init(HEVCContext *s, int ctb_addr_ts)
     }
 }
 
-#define GET_CABAC(ctx) get_cabac(&s->HEVClc.cc, &s->HEVClc.cabac_state[ctx])
+#define GET_CABAC(ctx) get_cabac(&s->HEVClc->cc, &s->HEVClc->cabac_state[ctx]) /* requires `s` in scope; ctx indexes the local context's cabac_state */
 
 int ff_hevc_sao_merge_flag_decode(HEVCContext *s)
 {
@@ -424,7 +555,7 @@ int ff_hevc_sao_type_idx_decode(HEVCContext *s)
     if (!GET_CABAC(elem_offset[SAO_TYPE_IDX]))
         return 0;
 
-    if (!get_cabac_bypass(&s->HEVClc.cc))
+    if (!get_cabac_bypass(&s->HEVClc->cc))
         return SAO_BAND;
     return SAO_EDGE;
 }
@@ -432,10 +563,10 @@ int ff_hevc_sao_type_idx_decode(HEVCContext *s)
 int ff_hevc_sao_band_position_decode(HEVCContext *s)
 {
     int i;
-    int value = get_cabac_bypass(&s->HEVClc.cc);
+    int value = get_cabac_bypass(&s->HEVClc->cc); /* first of five get_cabac_bypass() results; ends up as the top bit */
 
     for (i = 0; i < 4; i++)
-        value = (value << 1) | get_cabac_bypass(&s->HEVClc.cc);
+        value = (value << 1) | get_cabac_bypass(&s->HEVClc->cc); /* shift in the next result as the new low bit */
     return value;
 }
 
@@ -444,26 +575,26 @@ int ff_hevc_sao_offset_abs_decode(HEVCContext *s)
     int i = 0;
     int length = (1 << (FFMIN(s->ps.sps->bit_depth, 10) - 5)) - 1;
 
-    while (i < length && get_cabac_bypass(&s->HEVClc.cc))
+    while (i < length && get_cabac_bypass(&s->HEVClc->cc))
         i++;
     return i;
 }
 
 int ff_hevc_sao_offset_sign_decode(HEVCContext *s)
 {
-    return get_cabac_bypass(&s->HEVClc.cc);
+    return get_cabac_bypass(&s->HEVClc->cc); /* result of a single get_cabac_bypass() call on the local context */
 }
 
 int ff_hevc_sao_eo_class_decode(HEVCContext *s)
 {
-    int ret = get_cabac_bypass(&s->HEVClc.cc) << 1;
-    ret    |= get_cabac_bypass(&s->HEVClc.cc);
+    int ret = get_cabac_bypass(&s->HEVClc->cc) << 1; /* first get_cabac_bypass() result becomes bit 1 */
+    ret    |= get_cabac_bypass(&s->HEVClc->cc); /* second result becomes bit 0 */
     return ret;
 }
 
 int ff_hevc_end_of_slice_flag_decode(HEVCContext *s)
 {
-    return get_cabac_terminate(&s->HEVClc.cc);
+    return get_cabac_terminate(&s->HEVClc->cc); /* forwards get_cabac_terminate()'s result for the local context */
 }
 
 int ff_hevc_cu_transquant_bypass_flag_decode(HEVCContext *s)
@@ -475,12 +606,12 @@ int ff_hevc_skip_flag_decode(HEVCContext *s, int x0, int y0, int x_cb, int y_cb)
 {
     int min_cb_width = s->ps.sps->min_cb_width;
     int inc = 0;
-    int x0b = x0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
-    int y0b = y0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
+    int x0b = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size);
+    int y0b = av_mod_uintp2(y0, s->ps.sps->log2_ctb_size);
 
-    if (s->HEVClc.ctb_left_flag || x0b)
+    if (s->HEVClc->ctb_left_flag || x0b)
         inc = !!SAMPLE_CTB(s->skip_flag, x_cb - 1, y_cb);
-    if (s->HEVClc.ctb_up_flag || y0b)
+    if (s->HEVClc->ctb_up_flag || y0b)
         inc += !!SAMPLE_CTB(s->skip_flag, x_cb, y_cb - 1);
 
     return GET_CABAC(elem_offset[SKIP_FLAG] + inc);
@@ -498,7 +629,7 @@ int ff_hevc_cu_qp_delta_abs(HEVCContext *s)
     }
     if (prefix_val >= 5) {
         int k = 0;
-        while (k < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc.cc)) {
+        while (k < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc)) {
             suffix_val += 1 << k;
             k++;
         }
@@ -506,14 +637,30 @@ int ff_hevc_cu_qp_delta_abs(HEVCContext *s)
             av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k);
 
         while (k--)
-            suffix_val += get_cabac_bypass(&s->HEVClc.cc) << k;
+            suffix_val += get_cabac_bypass(&s->HEVClc->cc) << k;
     }
     return prefix_val + suffix_val;
 }
 
 int ff_hevc_cu_qp_delta_sign_flag(HEVCContext *s)
 {
-    return get_cabac_bypass(&s->HEVClc.cc);
+    return get_cabac_bypass(&s->HEVClc->cc); /* result of a single get_cabac_bypass() call on the local context */
+}
+
+int ff_hevc_cu_chroma_qp_offset_flag(HEVCContext *s)
+{
+    return GET_CABAC(elem_offset[CU_CHROMA_QP_OFFSET_FLAG]); /* one GET_CABAC read with the CU_CHROMA_QP_OFFSET_FLAG context */
+}
+
+int ff_hevc_cu_chroma_qp_offset_idx(HEVCContext *s)
+{
+    int c_max= FFMIN(5, s->ps.pps->chroma_qp_offset_list_len_minus1); /* bin cap: the signalled list length minus one, never above 5 */
+    int i = 0;
+    /* truncated unary: count 1-bins until a 0-bin or until i == c_max, so no bin beyond the cap is consumed */
+    while (i < c_max && GET_CABAC(elem_offset[CU_CHROMA_QP_OFFSET_IDX]))
+        i++;
+
+    return i; /* decoded index, in [0, c_max] */
 }
 
 int ff_hevc_pred_mode_decode(HEVCContext *s)
@@ -524,14 +671,14 @@ int ff_hevc_pred_mode_decode(HEVCContext *s)
 int ff_hevc_split_coding_unit_flag_decode(HEVCContext *s, int ct_depth, int x0, int y0)
 {
     int inc = 0, depth_left = 0, depth_top = 0;
-    int x0b  = x0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
-    int y0b  = y0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
+    int x0b  = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size);
+    int y0b  = av_mod_uintp2(y0, s->ps.sps->log2_ctb_size);
     int x_cb = x0 >> s->ps.sps->log2_min_cb_size;
     int y_cb = y0 >> s->ps.sps->log2_min_cb_size;
 
-    if (s->HEVClc.ctb_left_flag || x0b)
+    if (s->HEVClc->ctb_left_flag || x0b)
         depth_left = s->tab_ct_depth[(y_cb) * s->ps.sps->min_cb_width + x_cb - 1];
-    if (s->HEVClc.ctb_up_flag || y0b)
+    if (s->HEVClc->ctb_up_flag || y0b)
         depth_top = s->tab_ct_depth[(y_cb - 1) * s->ps.sps->min_cb_width + x_cb];
 
     inc += (depth_left > ct_depth);
@@ -545,7 +692,7 @@ int ff_hevc_part_mode_decode(HEVCContext *s, int log2_cb_size)
     if (GET_CABAC(elem_offset[PART_MODE])) // 1
         return PART_2Nx2N;
     if (log2_cb_size == s->ps.sps->log2_min_cb_size) {
-        if (s->HEVClc.cu.pred_mode == MODE_INTRA) // 0
+        if (s->HEVClc->cu.pred_mode == MODE_INTRA) // 0
             return PART_NxN;
         if (GET_CABAC(elem_offset[PART_MODE] + 1)) // 01
             return PART_2NxN;
@@ -565,21 +712,21 @@ int ff_hevc_part_mode_decode(HEVCContext *s, int log2_cb_size)
     if (GET_CABAC(elem_offset[PART_MODE] + 1)) { // 01X, 01XX
         if (GET_CABAC(elem_offset[PART_MODE] + 3)) // 011
             return PART_2NxN;
-        if (get_cabac_bypass(&s->HEVClc.cc)) // 0101
+        if (get_cabac_bypass(&s->HEVClc->cc)) // 0101
             return PART_2NxnD;
         return PART_2NxnU; // 0100
     }
 
     if (GET_CABAC(elem_offset[PART_MODE] + 3)) // 001
         return PART_Nx2N;
-    if (get_cabac_bypass(&s->HEVClc.cc)) // 0001
+    if (get_cabac_bypass(&s->HEVClc->cc)) // 0001
         return PART_nRx2N;
     return PART_nLx2N;  // 0000
 }
 
 int ff_hevc_pcm_flag_decode(HEVCContext *s)
 {
-    return get_cabac_terminate(&s->HEVClc.cc);
+    return get_cabac_terminate(&s->HEVClc->cc); /* forwards get_cabac_terminate()'s result for the local context */
 }
 
 int ff_hevc_prev_intra_luma_pred_flag_decode(HEVCContext *s)
@@ -590,7 +737,7 @@ int ff_hevc_prev_intra_luma_pred_flag_decode(HEVCContext *s)
 int ff_hevc_mpm_idx_decode(HEVCContext *s)
 {
     int i = 0;
-    while (i < 2 && get_cabac_bypass(&s->HEVClc.cc))
+    while (i < 2 && get_cabac_bypass(&s->HEVClc->cc)) /* count non-zero get_cabac_bypass() results, at most two; returns 0..2 */
         i++;
     return i;
 }
@@ -598,10 +745,10 @@ int ff_hevc_mpm_idx_decode(HEVCContext *s)
 int ff_hevc_rem_intra_luma_pred_mode_decode(HEVCContext *s)
 {
     int i;
-    int value = get_cabac_bypass(&s->HEVClc.cc);
+    int value = get_cabac_bypass(&s->HEVClc->cc); /* first of five get_cabac_bypass() results; ends up as the top bit */
 
     for (i = 0; i < 4; i++)
-        value = (value << 1) | get_cabac_bypass(&s->HEVClc.cc);
+        value = (value << 1) | get_cabac_bypass(&s->HEVClc->cc); /* shift in the next result as the new low bit */
     return value;
 }
 
@@ -611,8 +758,8 @@ int ff_hevc_intra_chroma_pred_mode_decode(HEVCContext *s)
     if (!GET_CABAC(elem_offset[INTRA_CHROMA_PRED_MODE]))
         return 4;
 
-    ret  = get_cabac_bypass(&s->HEVClc.cc) << 1;
-    ret |= get_cabac_bypass(&s->HEVClc.cc);
+    ret  = get_cabac_bypass(&s->HEVClc->cc) << 1;
+    ret |= get_cabac_bypass(&s->HEVClc->cc);
     return ret;
 }
 
@@ -621,7 +768,7 @@ int ff_hevc_merge_idx_decode(HEVCContext *s)
     int i = GET_CABAC(elem_offset[MERGE_IDX]);
 
     if (i != 0) {
-        while (i < s->sh.max_num_merge_cand-1 && get_cabac_bypass(&s->HEVClc.cc))
+        while (i < s->sh.max_num_merge_cand-1 && get_cabac_bypass(&s->HEVClc->cc))
             i++;
     }
     return i;
@@ -636,7 +783,7 @@ int ff_hevc_inter_pred_idc_decode(HEVCContext *s, int nPbW, int nPbH)
 {
     if (nPbW + nPbH == 12)
         return GET_CABAC(elem_offset[INTER_PRED_IDC] + 4);
-    if (GET_CABAC(elem_offset[INTER_PRED_IDC] + s->HEVClc.ct.depth))
+    if (GET_CABAC(elem_offset[INTER_PRED_IDC] + s->HEVClc->ct_depth))
         return PRED_BI;
 
     return GET_CABAC(elem_offset[INTER_PRED_IDC] + 4);
@@ -651,7 +798,7 @@ int ff_hevc_ref_idx_lx_decode(HEVCContext *s, int num_ref_idx_lx)
     while (i < max_ctx && GET_CABAC(elem_offset[REF_IDX_L0] + i))
         i++;
     if (i == 2) {
-        while (i < max && get_cabac_bypass(&s->HEVClc.cc))
+        while (i < max && get_cabac_bypass(&s->HEVClc->cc))
             i++;
     }
 
@@ -668,35 +815,37 @@ int ff_hevc_no_residual_syntax_flag_decode(HEVCContext *s)
     return GET_CABAC(elem_offset[NO_RESIDUAL_DATA_FLAG]);
 }
 
-int ff_hevc_abs_mvd_greater0_flag_decode(HEVCContext *s)
+static av_always_inline int abs_mvd_greater0_flag_decode(HEVCContext *s) /* one GET_CABAC read with the ABS_MVD_GREATER0_FLAG context */
 {
     return GET_CABAC(elem_offset[ABS_MVD_GREATER0_FLAG]);
 }
 
-int ff_hevc_abs_mvd_greater1_flag_decode(HEVCContext *s)
+static av_always_inline int abs_mvd_greater1_flag_decode(HEVCContext *s) /* one GET_CABAC read with context elem_offset[ABS_MVD_GREATER1_FLAG] + 1 */
 {
     return GET_CABAC(elem_offset[ABS_MVD_GREATER1_FLAG] + 1);
 }
 
-int ff_hevc_mvd_decode(HEVCContext *s)
+static av_always_inline int mvd_decode(HEVCContext *s) /* returns 0 when the prefix reaches CABAC_MAX_BIN, otherwise get_cabac_bypass_sign()'s result for -ret */
 {
     int ret = 2;
     int k = 1;
 
-    while (k < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc.cc)) {
-        ret += 1 << k;
+    while (k < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc)) { /* prefix: each non-zero result raises k */
+        ret += 1U << k; /* unsigned constant, so the shift itself is done in unsigned arithmetic */
         k++;
     }
-    if (k == CABAC_MAX_BIN)
+    if (k == CABAC_MAX_BIN) { /* prefix never terminated within the limit */
         av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k);
+        return 0; /* stop here instead of reading a suffix with k at the limit */
+    }
     while (k--)
-        ret += get_cabac_bypass(&s->HEVClc.cc) << k;
-    return get_cabac_bypass_sign(&s->HEVClc.cc, -ret);
+        ret += get_cabac_bypass(&s->HEVClc->cc) << k; /* suffix: k further results, highest bit first */
+    return get_cabac_bypass_sign(&s->HEVClc->cc, -ret); /* presumably picks the sign of the magnitude -- confirm in get_cabac_bypass_sign() */
 }
 
-int ff_hevc_mvd_sign_flag_decode(HEVCContext *s)
+static av_always_inline int mvd_sign_flag_decode(HEVCContext *s) /* thin wrapper around get_cabac_bypass_sign() */
 {
-    return get_cabac_bypass_sign(&s->HEVClc.cc, -1);
+    return get_cabac_bypass_sign(&s->HEVClc->cc, -1); /* passes -1 as the value argument */
 }
 
 int ff_hevc_split_transform_flag_decode(HEVCContext *s, int log2_trafo_size)
@@ -714,53 +863,73 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth)
     return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth);
 }
 
-int ff_hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx)
+static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx) /* context is TRANSFORM_SKIP_FLAG + 0 for c_idx == 0, + 1 otherwise */
 {
     return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + !!c_idx);
 }
 
-#define LAST_SIG_COEFF(elem)                                                    \
-    int i = 0;                                                                  \
-    int max = (log2_size << 1) - 1;                                             \
-    int ctx_offset, ctx_shift;                                                  \
-                                                                                \
-    if (c_idx == 0) {                                                           \
-        ctx_offset = 3 * (log2_size - 2)  + ((log2_size - 1) >> 2);             \
-        ctx_shift = (log2_size + 1) >> 2;                                       \
-    } else {                                                                    \
-        ctx_offset = 15;                                                        \
-        ctx_shift = log2_size - 2;                                              \
-    }                                                                           \
-    while (i < max &&                                                           \
-           GET_CABAC(elem_offset[elem] + (i >> ctx_shift) + ctx_offset))        \
-        i++;                                                                    \
-    return i;
+static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx)
+{
+    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + !!c_idx); /* context + 0 for c_idx == 0, + 1 otherwise */
+}
 
-int ff_hevc_last_significant_coeff_x_prefix_decode(HEVCContext *s, int c_idx,
-                                                   int log2_size)
+static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx)
 {
-    LAST_SIG_COEFF(LAST_SIGNIFICANT_COEFF_X_PREFIX)
+    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + !!c_idx); /* context + 0 for c_idx == 0, + 1 otherwise */
 }
 
-int ff_hevc_last_significant_coeff_y_prefix_decode(HEVCContext *s, int c_idx,
-                                                   int log2_size)
+int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) {
+    const int first_ctx = elem_offset[LOG2_RES_SCALE_ABS] + 4 * idx;
+    int bins;
+    /* count 1-bins, at most four; bin n is read with context first_ctx + n */
+    for (bins = 0; bins < 4 && GET_CABAC(first_ctx + bins); bins++)
+        ;
+    return bins;
+}
+
+int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) {
+    return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx); /* one GET_CABAC read; idx selects the context */
+}
+
+static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx,
+                                                   int log2_size, int *last_scx_prefix, int *last_scy_prefix) /* both prefixes are written through the two pointers */
 {
-    LAST_SIG_COEFF(LAST_SIGNIFICANT_COEFF_Y_PREFIX)
+    int i = 0;
+    int max = (log2_size << 1) - 1; /* cap on the number of 1-bins counted for each prefix */
+    int ctx_offset, ctx_shift;
+
+    if (!c_idx) { /* c_idx == 0: offset and shift both depend on log2_size */
+        ctx_offset = 3 * (log2_size - 2)  + ((log2_size - 1) >> 2);
+        ctx_shift = (log2_size + 1) >> 2;
+    } else { /* c_idx != 0: fixed offset, shift from log2_size */
+        ctx_offset = 15;
+        ctx_shift = log2_size - 2;
+    }
+    while (i < max && /* X prefix: count bins until a 0-bin or until max */
+           GET_CABAC(elem_offset[LAST_SIGNIFICANT_COEFF_X_PREFIX] + (i >> ctx_shift) + ctx_offset))
+        i++;
+    *last_scx_prefix = i;
+
+    i = 0; /* restart the count for the Y prefix; same ctx_offset / ctx_shift */
+    while (i < max &&
+           GET_CABAC(elem_offset[LAST_SIGNIFICANT_COEFF_Y_PREFIX] + (i >> ctx_shift) + ctx_offset))
+        i++;
+    *last_scy_prefix = i;
 }
 
-int ff_hevc_last_significant_coeff_suffix_decode(HEVCContext *s,
+static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s, /* builds the suffix from get_cabac_bypass() results, highest bit first */
                                                  int last_significant_coeff_prefix)
 {
     int i;
     int length = (last_significant_coeff_prefix >> 1) - 1;
-    int value = get_cabac_bypass(&s->HEVClc.cc);
+    int value = get_cabac_bypass(&s->HEVClc->cc); /* taken unconditionally, so one call happens even if length < 1 */
 
     for (i = 1; i < length; i++)
-        value = (value << 1) | get_cabac_bypass(&s->HEVClc.cc);
+        value = (value << 1) | get_cabac_bypass(&s->HEVClc->cc); /* shift in the remaining length - 1 results */
     return value;
 }
 
-int ff_hevc_significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int ctx_cg)
+static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int ctx_cg)
 {
     int inc;
 
@@ -768,58 +937,19 @@ int ff_hevc_significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int c
 
     return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc);
 }
-
-int ff_hevc_significant_coeff_flag_decode(HEVCContext *s, int c_idx, int x_c, int y_c,
-                                          int log2_trafo_size, int scan_idx, int prev_sig)
+static av_always_inline int significant_coeff_flag_decode(HEVCContext *s, int x_c, int y_c, /* x_c / y_c index the 16-entry map as (y_c << 2) + x_c */
+                                           int offset, const uint8_t *ctx_idx_map) /* map and offset are prepared by the caller */
 {
-    static const uint8_t ctx_idx_map[] = {
-        0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8
-    };
-    int x_cg = x_c >> 2;
-    int y_cg = y_c >> 2;
-    int sig_ctx, inc;
-
-    if (x_c + y_c == 0) {
-        sig_ctx = 0;
-    } else if (log2_trafo_size == 2) {
-        sig_ctx = ctx_idx_map[(y_c << 2) + x_c];
-    } else {
-        switch (prev_sig) {
-        case 0: {
-                int x_off = x_c & 3;
-                int y_off = y_c & 3;
-                sig_ctx   = ((x_off + y_off) == 0) ? 2 : ((x_off + y_off) <= 2) ? 1 : 0;
-            }
-            break;
-        case 1:
-            sig_ctx = 2 - FFMIN(y_c & 3, 2);
-            break;
-        case 2:
-            sig_ctx = 2 - FFMIN(x_c & 3, 2);
-            break;
-        default:
-            sig_ctx = 2;
-        }
-
-        if (c_idx == 0 && (x_cg > 0 || y_cg > 0))
-            sig_ctx += 3;
-
-        if (log2_trafo_size == 3) {
-            sig_ctx += (scan_idx == SCAN_DIAG) ? 9 : 15;
-        } else {
-            sig_ctx += c_idx ? 12 : 21;
-        }
-    }
-
-    if (c_idx == 0)
-        inc = sig_ctx;
-    else
-        inc = sig_ctx + 27;
-
+    int inc = ctx_idx_map[(y_c << 2) + x_c] + offset; /* table lookup plus caller-supplied offset gives the context increment */
     return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + inc);
 }
 
-int ff_hevc_coeff_abs_level_greater1_flag_decode(HEVCContext *s, int c_idx, int inc)
+static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int c_idx, int offset) /* c_idx is not read in the body */
+{
+    return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset); /* the context increment is exactly the caller's offset */
+}
+
+static av_always_inline int coeff_abs_level_greater1_flag_decode(HEVCContext *s, int c_idx, int inc)
 {
 
     if (c_idx > 0)
@@ -828,7 +958,7 @@ int ff_hevc_coeff_abs_level_greater1_flag_decode(HEVCContext *s, int c_idx, int
     return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] + inc);
 }
 
-int ff_hevc_coeff_abs_level_greater2_flag_decode(HEVCContext *s, int c_idx, int inc)
+static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s, int c_idx, int inc)
 {
     if (c_idx > 0)
         inc += 4;
@@ -836,37 +966,574 @@ int ff_hevc_coeff_abs_level_greater2_flag_decode(HEVCContext *s, int c_idx, int
     return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc);
 }
 
-int ff_hevc_coeff_abs_level_remaining(HEVCContext *s, int base_level, int rc_rice_param)
+static av_always_inline int coeff_abs_level_remaining_decode(HEVCContext *s, int rc_rice_param) /* returns 0 when the prefix reaches CABAC_MAX_BIN */
 {
     int prefix = 0;
     int suffix = 0;
     int last_coeff_abs_level_remaining;
     int i;
 
-    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc.cc))
+    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc)) /* prefix = number of leading non-zero results */
         prefix++;
-    if (prefix == CABAC_MAX_BIN)
+    if (prefix == CABAC_MAX_BIN) { /* prefix never terminated within the limit */
         av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix);
+        return 0; /* stop here rather than shifting by a prefix at the limit below */
+    }
     if (prefix < 3) {
         for (i = 0; i < rc_rice_param; i++)
-            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc.cc);
+            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc); /* rc_rice_param suffix bits, highest first */
         last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix;
     } else {
         int prefix_minus3 = prefix - 3;
         for (i = 0; i < prefix_minus3 + rc_rice_param; i++)
-            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc.cc);
+            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc); /* prefix_minus3 + rc_rice_param suffix bits, highest first */
         last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1)
                                               << rc_rice_param) + suffix;
     }
     return last_coeff_abs_level_remaining;
 }
 
-int ff_hevc_coeff_sign_flag(HEVCContext *s, uint8_t nb)
+static av_always_inline int coeff_sign_flag_decode(HEVCContext *s, uint8_t nb) /* packs nb get_cabac_bypass() results into one int */
 {
     int i;
     int ret = 0;
 
     for (i = 0; i < nb; i++)
-        ret = (ret << 1) | get_cabac_bypass(&s->HEVClc.cc);
+        ret = (ret << 1) | get_cabac_bypass(&s->HEVClc->cc); /* the earliest result ends up in the highest used bit */
     return ret;
 }
+
+void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+                                int log2_trafo_size, enum ScanType scan_idx,
+                                int c_idx)
+{
+#define GET_COORD(offset, n)                                    \
+    do {                                                        \
+        x_c = (x_cg << 2) + scan_x_off[n];                      \
+        y_c = (y_cg << 2) + scan_y_off[n];                      \
+    } while (0)
+    HEVCLocalContext *lc = s->HEVClc;
+    int transform_skip_flag = 0;
+
+    int last_significant_coeff_x, last_significant_coeff_y;
+    int last_scan_pos;
+    int n_end;
+    int num_coeff = 0;
+    int greater1_ctx = 1;
+
+    int num_last_subset;
+    int x_cg_last_sig, y_cg_last_sig;
+
+    const uint8_t *scan_x_cg, *scan_y_cg, *scan_x_off, *scan_y_off;
+
+    ptrdiff_t stride = s->frame->linesize[c_idx];
+    int hshift = s->ps.sps->hshift[c_idx];
+    int vshift = s->ps.sps->vshift[c_idx];
+    uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
+                                          ((x0 >> hshift) << s->ps.sps->pixel_shift)];
+    int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
+    uint8_t significant_coeff_group_flag[8][8] = {{0}};
+    int explicit_rdpcm_flag = 0;
+    int explicit_rdpcm_dir_flag;
+
+    int trafo_size = 1 << log2_trafo_size;
+    int i;
+    int qp,shift,add,scale,scale_m;
+    const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 };
+    const uint8_t *scale_matrix = NULL;
+    uint8_t dc_scale;
+    int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode :
+                                         lc->tu.intra_pred_mode_c;
+
+    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+
+    // Derive QP for dequant
+    if (!lc->cu.cu_transquant_bypass_flag) {
+        static const int qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
+        static const uint8_t rem6[51 + 4 * 6 + 1] = {
+            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
+            3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+            4, 5, 0, 1, 2, 3, 4, 5, 0, 1
+        };
+
+        static const uint8_t div6[51 + 4 * 6 + 1] = {
+            0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3,  3,  3,
+            3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6,  6,  6,
+            7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10,
+            10, 10, 11, 11, 11, 11, 11, 11, 12, 12
+        };
+        int qp_y = lc->qp_y;
+
+        if (s->ps.pps->transform_skip_enabled_flag &&
+            log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) {
+            transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx);
+        }
+
+        if (c_idx == 0) {
+            qp = qp_y + s->ps.sps->qp_bd_offset;
+        } else {
+            int qp_i, offset;
+
+            if (c_idx == 1)
+                offset = s->ps.pps->cb_qp_offset + s->sh.slice_cb_qp_offset +
+                         lc->tu.cu_qp_offset_cb;
+            else
+                offset = s->ps.pps->cr_qp_offset + s->sh.slice_cr_qp_offset +
+                         lc->tu.cu_qp_offset_cr;
+
+            qp_i = av_clip(qp_y + offset, - s->ps.sps->qp_bd_offset, 57);
+            if (s->ps.sps->chroma_format_idc == 1) {
+                if (qp_i < 30)
+                    qp = qp_i;
+                else if (qp_i > 43)
+                    qp = qp_i - 6;
+                else
+                    qp = qp_c[qp_i - 30];
+            } else {
+                if (qp_i > 51)
+                    qp = 51;
+                else
+                    qp = qp_i;
+            }
+
+            qp += s->ps.sps->qp_bd_offset;
+        }
+
+        shift    = s->ps.sps->bit_depth + log2_trafo_size - 5;
+        add      = 1 << (shift-1);
+        scale    = level_scale[rem6[qp]] << (div6[qp]);
+        scale_m  = 16; // default when no custom scaling lists.
+        dc_scale = 16;
+
+        if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
+            const ScalingList *sl = s->ps.pps->scaling_list_data_present_flag ?
+            &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
+            int matrix_id = lc->cu.pred_mode != MODE_INTRA;
+
+            matrix_id = 3 * matrix_id + c_idx;
+
+            scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id];
+            if (log2_trafo_size >= 4)
+                dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
+        }
+    } else {
+        shift        = 0;
+        add          = 0;
+        scale        = 0;
+        dc_scale     = 0;
+    }
+
+    if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag &&
+        (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
+        explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx);
+        if (explicit_rdpcm_flag) {
+            explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx);
+        }
+    }
+
+    last_significant_coeff_xy_prefix_decode(s, c_idx, log2_trafo_size,
+                                           &last_significant_coeff_x, &last_significant_coeff_y);
+
+    if (last_significant_coeff_x > 3) {
+        int suffix = last_significant_coeff_suffix_decode(s, last_significant_coeff_x);
+        last_significant_coeff_x = (1 << ((last_significant_coeff_x >> 1) - 1)) *
+        (2 + (last_significant_coeff_x & 1)) +
+        suffix;
+    }
+
+    if (last_significant_coeff_y > 3) {
+        int suffix = last_significant_coeff_suffix_decode(s, last_significant_coeff_y);
+        last_significant_coeff_y = (1 << ((last_significant_coeff_y >> 1) - 1)) *
+        (2 + (last_significant_coeff_y & 1)) +
+        suffix;
+    }
+
+    if (scan_idx == SCAN_VERT)
+        FFSWAP(int, last_significant_coeff_x, last_significant_coeff_y);
+
+    x_cg_last_sig = last_significant_coeff_x >> 2;
+    y_cg_last_sig = last_significant_coeff_y >> 2;
+
+    switch (scan_idx) {
+    case SCAN_DIAG: {
+        int last_x_c = last_significant_coeff_x & 3;
+        int last_y_c = last_significant_coeff_y & 3;
+
+        scan_x_off = ff_hevc_diag_scan4x4_x;
+        scan_y_off = ff_hevc_diag_scan4x4_y;
+        num_coeff = diag_scan4x4_inv[last_y_c][last_x_c];
+        if (trafo_size == 4) {
+            scan_x_cg = scan_1x1;
+            scan_y_cg = scan_1x1;
+        } else if (trafo_size == 8) {
+            num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4;
+            scan_x_cg = diag_scan2x2_x;
+            scan_y_cg = diag_scan2x2_y;
+        } else if (trafo_size == 16) {
+            num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4;
+            scan_x_cg = ff_hevc_diag_scan4x4_x;
+            scan_y_cg = ff_hevc_diag_scan4x4_y;
+        } else { // trafo_size == 32
+            num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4;
+            scan_x_cg = ff_hevc_diag_scan8x8_x;
+            scan_y_cg = ff_hevc_diag_scan8x8_y;
+        }
+        break;
+    }
+    case SCAN_HORIZ:
+        scan_x_cg = horiz_scan2x2_x;
+        scan_y_cg = horiz_scan2x2_y;
+        scan_x_off = horiz_scan4x4_x;
+        scan_y_off = horiz_scan4x4_y;
+        num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x];
+        break;
+    default: //SCAN_VERT
+        scan_x_cg = horiz_scan2x2_y;
+        scan_y_cg = horiz_scan2x2_x;
+        scan_x_off = horiz_scan4x4_y;
+        scan_y_off = horiz_scan4x4_x;
+        num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y];
+        break;
+    }
+    num_coeff++;
+    num_last_subset = (num_coeff - 1) >> 4;
+
+    for (i = num_last_subset; i >= 0; i--) {
+        int n, m;
+        int x_cg, y_cg, x_c, y_c, pos;
+        int implicit_non_zero_coeff = 0;
+        int64_t trans_coeff_level;
+        int prev_sig = 0;
+        int offset = i << 4;
+        int rice_init = 0;
+
+        uint8_t significant_coeff_flag_idx[16];
+        uint8_t nb_significant_coeff_flag = 0;
+
+        x_cg = scan_x_cg[i];
+        y_cg = scan_y_cg[i];
+
+        if ((i < num_last_subset) && (i > 0)) {
+            int ctx_cg = 0;
+            if (x_cg < (1 << (log2_trafo_size - 2)) - 1)
+                ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg];
+            if (y_cg < (1 << (log2_trafo_size - 2)) - 1)
+                ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1];
+
+            significant_coeff_group_flag[x_cg][y_cg] =
+                significant_coeff_group_flag_decode(s, c_idx, ctx_cg);
+            implicit_non_zero_coeff = 1;
+        } else {
+            significant_coeff_group_flag[x_cg][y_cg] =
+            ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) ||
+             (x_cg == 0 && y_cg == 0));
+        }
+
+        last_scan_pos = num_coeff - offset - 1;
+
+        if (i == num_last_subset) {
+            n_end = last_scan_pos - 1;
+            significant_coeff_flag_idx[0] = last_scan_pos;
+            nb_significant_coeff_flag = 1;
+        } else {
+            n_end = 15;
+        }
+
+        if (x_cg < ((1 << log2_trafo_size) - 1) >> 2)
+            prev_sig = !!significant_coeff_group_flag[x_cg + 1][y_cg];
+        if (y_cg < ((1 << log2_trafo_size) - 1) >> 2)
+            prev_sig += (!!significant_coeff_group_flag[x_cg][y_cg + 1] << 1);
+
+        if (significant_coeff_group_flag[x_cg][y_cg] && n_end >= 0) {
+            static const uint8_t ctx_idx_map[] = {
+                0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8, // log2_trafo_size == 2
+                1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 0
+                2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 1
+                2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, // prev_sig == 2
+                2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2  // default
+            };
+            const uint8_t *ctx_idx_map_p;
+            int scf_offset = 0;
+            if (s->ps.sps->transform_skip_context_enabled_flag &&
+                (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
+                ctx_idx_map_p = (uint8_t*) &ctx_idx_map[4 * 16];
+                if (c_idx == 0) {
+                    scf_offset = 40;
+                } else {
+                    scf_offset = 14 + 27;
+                }
+            } else {
+                if (c_idx != 0)
+                    scf_offset = 27;
+                if (log2_trafo_size == 2) {
+                    ctx_idx_map_p = (uint8_t*) &ctx_idx_map[0];
+                } else {
+                    ctx_idx_map_p = (uint8_t*) &ctx_idx_map[(prev_sig + 1) << 4];
+                    if (c_idx == 0) {
+                        if ((x_cg > 0 || y_cg > 0))
+                            scf_offset += 3;
+                        if (log2_trafo_size == 3) {
+                            scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15;
+                        } else {
+                            scf_offset += 21;
+                        }
+                    } else {
+                        if (log2_trafo_size == 3)
+                            scf_offset += 9;
+                        else
+                            scf_offset += 12;
+                    }
+                }
+            }
+            for (n = n_end; n > 0; n--) {
+                x_c = scan_x_off[n];
+                y_c = scan_y_off[n];
+                if (significant_coeff_flag_decode(s, x_c, y_c, scf_offset, ctx_idx_map_p)) {
+                    significant_coeff_flag_idx[nb_significant_coeff_flag] = n;
+                    nb_significant_coeff_flag++;
+                    implicit_non_zero_coeff = 0;
+                }
+            }
+            if (implicit_non_zero_coeff == 0) {
+                if (s->ps.sps->transform_skip_context_enabled_flag &&
+                    (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
+                    if (c_idx == 0) {
+                        scf_offset = 42;
+                    } else {
+                        scf_offset = 16 + 27;
+                    }
+                } else {
+                    if (i == 0) {
+                        if (c_idx == 0)
+                            scf_offset = 0;
+                        else
+                            scf_offset = 27;
+                    } else {
+                        scf_offset = 2 + scf_offset;
+                    }
+                }
+                if (significant_coeff_flag_decode_0(s, c_idx, scf_offset) == 1) {
+                    significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
+                    nb_significant_coeff_flag++;
+                }
+            } else {
+                significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
+                nb_significant_coeff_flag++;
+            }
+        }
+
+        n_end = nb_significant_coeff_flag;
+
+
+        if (n_end) {
+            int first_nz_pos_in_cg;
+            int last_nz_pos_in_cg;
+            int c_rice_param = 0;
+            int first_greater1_coeff_idx = -1;
+            uint8_t coeff_abs_level_greater1_flag[8];
+            uint16_t coeff_sign_flag;
+            int sum_abs = 0;
+            int sign_hidden;
+            int sb_type;
+
+
+            // initialize first elem of coeff_abs_level_greater1_flag
+            int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0;
+
+            if (s->ps.sps->persistent_rice_adaptation_enabled_flag) {
+                if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag)
+                    sb_type = 2 * (c_idx == 0 ? 1 : 0);
+                else
+                    sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1;
+                c_rice_param = lc->stat_coeff[sb_type] / 4;
+            }
+
+            if (!(i == num_last_subset) && greater1_ctx == 0)
+                ctx_set++;
+            greater1_ctx = 1;
+            last_nz_pos_in_cg = significant_coeff_flag_idx[0];
+
+            for (m = 0; m < (n_end > 8 ? 8 : n_end); m++) {
+                int inc = (ctx_set << 2) + greater1_ctx;
+                coeff_abs_level_greater1_flag[m] =
+                    coeff_abs_level_greater1_flag_decode(s, c_idx, inc);
+                if (coeff_abs_level_greater1_flag[m]) {
+                    greater1_ctx = 0;
+                    if (first_greater1_coeff_idx == -1)
+                        first_greater1_coeff_idx = m;
+                } else if (greater1_ctx > 0 && greater1_ctx < 3) {
+                    greater1_ctx++;
+                }
+            }
+            first_nz_pos_in_cg = significant_coeff_flag_idx[n_end - 1];
+
+            if (lc->cu.cu_transquant_bypass_flag ||
+                (lc->cu.pred_mode ==  MODE_INTRA  &&
+                 s->ps.sps->implicit_rdpcm_enabled_flag  &&  transform_skip_flag  &&
+                 (pred_mode_intra == 10 || pred_mode_intra  ==  26 )) ||
+                 explicit_rdpcm_flag)
+                sign_hidden = 0;
+            else
+                sign_hidden = (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4);
+
+            if (first_greater1_coeff_idx != -1) {
+                coeff_abs_level_greater1_flag[first_greater1_coeff_idx] += coeff_abs_level_greater2_flag_decode(s, c_idx, ctx_set);
+            }
+            if (!s->ps.pps->sign_data_hiding_flag || !sign_hidden ) {
+                coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag) << (16 - nb_significant_coeff_flag);
+            } else {
+                coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag - 1) << (16 - (nb_significant_coeff_flag - 1));
+            }
+
+            for (m = 0; m < n_end; m++) {
+                n = significant_coeff_flag_idx[m];
+                GET_COORD(offset, n);
+                if (m < 8) {
+                    trans_coeff_level = 1 + coeff_abs_level_greater1_flag[m];
+                    if (trans_coeff_level == ((m == first_greater1_coeff_idx) ? 3 : 2)) {
+                        int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
+
+                        trans_coeff_level += last_coeff_abs_level_remaining;
+                        if (trans_coeff_level > (3 << c_rice_param))
+                            c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
+                        if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
+                            int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
+                            if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
+                                lc->stat_coeff[sb_type]++;
+                            else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
+                                if (lc->stat_coeff[sb_type] > 0)
+                                    lc->stat_coeff[sb_type]--;
+                            rice_init = 1;
+                        }
+                    }
+                } else {
+                    int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
+
+                    trans_coeff_level = 1 + last_coeff_abs_level_remaining;
+                    if (trans_coeff_level > (3 << c_rice_param))
+                        c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
+                    if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
+                        int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
+                        if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
+                            lc->stat_coeff[sb_type]++;
+                        else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
+                            if (lc->stat_coeff[sb_type] > 0)
+                                lc->stat_coeff[sb_type]--;
+                        rice_init = 1;
+                    }
+                }
+                if (s->ps.pps->sign_data_hiding_flag && sign_hidden) {
+                    sum_abs += trans_coeff_level;
+                    if (n == first_nz_pos_in_cg && (sum_abs&1))
+                        trans_coeff_level = -trans_coeff_level;
+                }
+                if (coeff_sign_flag >> 15)
+                    trans_coeff_level = -trans_coeff_level;
+                coeff_sign_flag <<= 1;
+                if(!lc->cu.cu_transquant_bypass_flag) {
+                    if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
+                        if(y_c || x_c || log2_trafo_size < 4) {
+                            switch(log2_trafo_size) {
+                                case 3: pos = (y_c << 3) + x_c; break;
+                                case 4: pos = ((y_c >> 1) << 3) + (x_c >> 1); break;
+                                case 5: pos = ((y_c >> 2) << 3) + (x_c >> 2); break;
+                                default: pos = (y_c << 2) + x_c; break;
+                            }
+                            scale_m = scale_matrix[pos];
+                        } else {
+                            scale_m = dc_scale;
+                        }
+                    }
+                    trans_coeff_level = (trans_coeff_level * (int64_t)scale * (int64_t)scale_m + add) >> shift;
+                    if(trans_coeff_level < 0) {
+                        if((~trans_coeff_level) & 0xFffffffffff8000)
+                            trans_coeff_level = -32768;
+                    } else {
+                        if(trans_coeff_level & 0xffffffffffff8000)
+                            trans_coeff_level = 32767;
+                    }
+                }
+                coeffs[y_c * trafo_size + x_c] = trans_coeff_level;
+            }
+        }
+    }
+
+    if (lc->cu.cu_transquant_bypass_flag) {
+        if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
+                                    (pred_mode_intra == 10 || pred_mode_intra == 26))) {
+            int mode = s->ps.sps->implicit_rdpcm_enabled_flag ? (pred_mode_intra == 26) : explicit_rdpcm_dir_flag;
+
+            s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
+        }
+    } else {
+        if (transform_skip_flag) {
+            int rot = s->ps.sps->transform_skip_rotation_enabled_flag &&
+                      log2_trafo_size == 2 &&
+                      lc->cu.pred_mode == MODE_INTRA;
+            if (rot) {
+                for (i = 0; i < 8; i++)
+                    FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
+            }
+
+            s->hevcdsp.transform_skip(coeffs, log2_trafo_size);
+
+            if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
+                                        lc->cu.pred_mode == MODE_INTRA &&
+                                        (pred_mode_intra == 10 || pred_mode_intra == 26))) {
+                int mode = explicit_rdpcm_flag ? explicit_rdpcm_dir_flag : (pred_mode_intra == 26);
+
+                s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
+            }
+        } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
+            s->hevcdsp.idct_4x4_luma(coeffs);
+        } else {
+            int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
+            if (max_xy == 0)
+                s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
+            else {
+                int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
+                if (max_xy < 4)
+                    col_limit = FFMIN(4, col_limit);
+                else if (max_xy < 8)
+                    col_limit = FFMIN(8, col_limit);
+                else if (max_xy < 12)
+                    col_limit = FFMIN(24, col_limit);
+                s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
+            }
+        }
+    }
+    if (lc->tu.cross_pf) {
+        int16_t *coeffs_y = (int16_t*)lc->edge_emu_buffer;
+
+        for (i = 0; i < (trafo_size * trafo_size); i++) {
+            coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
+        }
+    }
+    s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
+}
+
+void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size)
+{ /* Decodes one mvd and stores it in lc->pu.mvd; x0, y0 and log2_cb_size are not read here. */
+    HEVCLocalContext *lc = s->HEVClc;
+    int x = abs_mvd_greater0_flag_decode(s); /* greater0 flags are read for x first, then y */
+    int y = abs_mvd_greater0_flag_decode(s);
+    /* a greater1 flag is read only for a component whose greater0 flag was non-zero */
+    if (x)
+        x += abs_mvd_greater1_flag_decode(s);
+    if (y)
+        y += abs_mvd_greater1_flag_decode(s);
+    /* x is now 0, 1 or 2 (assuming the flag decoders return 0/1) and selects how mvd.x is obtained */
+    switch (x) {
+    case 2: lc->pu.mvd.x = mvd_decode(s);           break;
+    case 1: lc->pu.mvd.x = mvd_sign_flag_decode(s); break;
+    case 0: lc->pu.mvd.x = 0;                       break;
+    }
+    /* same dispatch for y; there is no default case, so any other value leaves mvd.y untouched */
+    switch (y) {
+    case 2: lc->pu.mvd.y = mvd_decode(s);           break;
+    case 1: lc->pu.mvd.y = mvd_sign_flag_decode(s); break;
+    case 0: lc->pu.mvd.y = 0;                       break;
+    }
+}
+
diff --git a/libavcodec/hevc_data.c b/libavcodec/hevc_data.c
index f4b6096..f74f272 100644
--- a/libavcodec/hevc_data.c
+++ b/libavcodec/hevc_data.c
@@ -1,20 +1,20 @@
 /*
  * HEVC shared tables
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
index 769977b..1f33b0c 100644
--- a/libavcodec/hevc_filter.c
+++ b/libavcodec/hevc_filter.c
@@ -5,20 +5,20 @@
  * Copyright (C) 2013 Seppo Tomperi
  * Copyright (C) 2013 Wassim Hamidouche
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,6 +29,8 @@
 #include "golomb.h"
 #include "hevc.h"
 
+#include "bit_depth_template.c"
+
 #define LUMA 0
 #define CB 1
 #define CR 2
@@ -59,28 +61,30 @@ static int chroma_tc(HEVCContext *s, int qp_y, int c_idx, int tc_offset)
         offset = s->ps.pps->cr_qp_offset;
 
     qp_i = av_clip(qp_y + offset, 0, 57);
-    if (qp_i < 30)
-        qp = qp_i;
-    else if (qp_i > 43)
-        qp = qp_i - 6;
-    else
-        qp = qp_c[qp_i - 30];
+    if (s->ps.sps->chroma_format_idc == 1) {
+        if (qp_i < 30)
+            qp = qp_i;
+        else if (qp_i > 43)
+            qp = qp_i - 6;
+        else
+            qp = qp_c[qp_i - 30];
+    } else {
+        qp = av_clip(qp_i, 0, 51);
+    }
 
     idxt = av_clip(qp + DEFAULT_INTRA_TC_OFFSET + tc_offset, 0, 53);
     return tctable[idxt];
 }
 
-static int get_qPy_pred(HEVCContext *s, int xC, int yC,
-                        int xBase, int yBase, int log2_cb_size)
+static int get_qPy_pred(HEVCContext *s, int xBase, int yBase, int log2_cb_size)
 {
-    HEVCLocalContext *lc     = &s->HEVClc;
+    HEVCLocalContext *lc     = s->HEVClc;
     int ctb_size_mask        = (1 << s->ps.sps->log2_ctb_size) - 1;
     int MinCuQpDeltaSizeMask = (1 << (s->ps.sps->log2_ctb_size -
                                       s->ps.pps->diff_cu_qp_delta_depth)) - 1;
     int xQgBase              = xBase - (xBase & MinCuQpDeltaSizeMask);
     int yQgBase              = yBase - (yBase & MinCuQpDeltaSizeMask);
     int min_cb_width         = s->ps.sps->min_cb_width;
-    int min_cb_height        = s->ps.sps->min_cb_height;
     int x_cb                 = xQgBase >> s->ps.sps->log2_min_cb_size;
     int y_cb                 = yQgBase >> s->ps.sps->log2_min_cb_size;
     int availableA           = (xBase   & ctb_size_mask) &&
@@ -94,46 +98,7 @@ static int get_qPy_pred(HEVCContext *s, int xC, int yC,
         lc->first_qp_group = !lc->tu.is_cu_qp_delta_coded;
         qPy_pred = s->sh.slice_qp;
     } else {
-        qPy_pred = lc->qp_y;
-        if (log2_cb_size < s->ps.sps->log2_ctb_size -
-                           s->ps.pps->diff_cu_qp_delta_depth) {
-            static const int offsetX[8][8] = {
-                { -1, 1, 3, 1, 7, 1, 3, 1 },
-                {  0, 0, 0, 0, 0, 0, 0, 0 },
-                {  1, 3, 1, 3, 1, 3, 1, 3 },
-                {  2, 2, 2, 2, 2, 2, 2, 2 },
-                {  3, 5, 7, 5, 3, 5, 7, 5 },
-                {  4, 4, 4, 4, 4, 4, 4, 4 },
-                {  5, 7, 5, 7, 5, 7, 5, 7 },
-                {  6, 6, 6, 6, 6, 6, 6, 6 }
-            };
-            static const int offsetY[8][8] = {
-                { 7, 0, 1, 2, 3, 4, 5, 6 },
-                { 0, 1, 2, 3, 4, 5, 6, 7 },
-                { 1, 0, 3, 2, 5, 4, 7, 6 },
-                { 0, 1, 2, 3, 4, 5, 6, 7 },
-                { 3, 0, 1, 2, 7, 4, 5, 6 },
-                { 0, 1, 2, 3, 4, 5, 6, 7 },
-                { 1, 0, 3, 2, 5, 4, 7, 6 },
-                { 0, 1, 2, 3, 4, 5, 6, 7 }
-            };
-            int xC0b = (xC - (xC & ctb_size_mask)) >> s->ps.sps->log2_min_cb_size;
-            int yC0b = (yC - (yC & ctb_size_mask)) >> s->ps.sps->log2_min_cb_size;
-            int idxX = (xQgBase  & ctb_size_mask)  >> s->ps.sps->log2_min_cb_size;
-            int idxY = (yQgBase  & ctb_size_mask)  >> s->ps.sps->log2_min_cb_size;
-            int idx_mask = ctb_size_mask >> s->ps.sps->log2_min_cb_size;
-            int x, y;
-
-            x = FFMIN(xC0b +  offsetX[idxX][idxY],             min_cb_width  - 1);
-            y = FFMIN(yC0b + (offsetY[idxX][idxY] & idx_mask), min_cb_height - 1);
-
-            if (xC0b == (lc->start_of_tiles_x >> s->ps.sps->log2_min_cb_size) &&
-                offsetX[idxX][idxY] == -1) {
-                x = (lc->end_of_tiles_x >> s->ps.sps->log2_min_cb_size) - 1;
-                y = yC0b - 1;
-            }
-            qPy_pred = s->qp_y_tab[y * min_cb_width + x];
-        }
+        qPy_pred = lc->qPy_pred;
     }
 
     // qPy_a
@@ -148,20 +113,22 @@ static int get_qPy_pred(HEVCContext *s, int xC, int yC,
     else
         qPy_b = s->qp_y_tab[x_cb + (y_cb - 1) * min_cb_width];
 
+    av_assert2(qPy_a >= -s->ps.sps->qp_bd_offset && qPy_a < 52);
+    av_assert2(qPy_b >= -s->ps.sps->qp_bd_offset && qPy_b < 52);
+
     return (qPy_a + qPy_b + 1) >> 1;
 }
 
-void ff_hevc_set_qPy(HEVCContext *s, int xC, int yC,
-                     int xBase, int yBase, int log2_cb_size)
+void ff_hevc_set_qPy(HEVCContext *s, int xBase, int yBase, int log2_cb_size)
 {
-    int qp_y = get_qPy_pred(s, xC, yC, xBase, yBase, log2_cb_size);
+    int qp_y = get_qPy_pred(s, xBase, yBase, log2_cb_size); /* predicted QP, before any delta is applied */
 
-    if (s->HEVClc.tu.cu_qp_delta != 0) {
+    if (s->HEVClc->tu.cu_qp_delta != 0) { /* non-zero delta: add it to the prediction and wrap */
         int off = s->ps.sps->qp_bd_offset;
-        s->HEVClc.qp_y = FFUMOD(qp_y + s->HEVClc.tu.cu_qp_delta + 52 + 2 * off,
-                                52 + off) - off;
+        s->HEVClc->qp_y = FFUMOD(qp_y + s->HEVClc->tu.cu_qp_delta + 52 + 2 * off,
+                                 52 + off) - off; /* presumably lands in [-off, 51] - confirm FFUMOD semantics */
     } else
-        s->HEVClc.qp_y = qp_y;
+        s->HEVClc->qp_y = qp_y; /* zero delta: the prediction is stored unchanged */
 }
 
 static int get_qPy(HEVCContext *s, int xC, int yC)
@@ -172,15 +139,106 @@ static int get_qPy(HEVCContext *s, int xC, int yC)
     return s->qp_y_tab[x + y * s->ps.sps->min_cb_width];
 }
 
-static void copy_CTB(uint8_t *dst, uint8_t *src,
-                     int width, int height, int stride)
+static void copy_CTB(uint8_t *dst, const uint8_t *src, int width, int height,
+                     intptr_t stride_dst, intptr_t stride_src)
+{ /* Copies `height` rows of `width` bytes from src to dst; each buffer advances by its own byte stride per row. */
+int i, j;
+    /* "& 15" is non-zero when any of the two pointers or two strides is not a multiple of 16 */
+    if (((intptr_t)dst | (intptr_t)src | stride_dst | stride_src) & 15) {
+        for (i = 0; i < height; i++) {
+            for (j = 0; j < width; j+=8) /* presumably 8 bytes per AV_COPY64U, so a row is rounded up to a multiple of 8 */
+                AV_COPY64U(dst+j, src+j);
+            dst += stride_dst;
+            src += stride_src;
+        }
+    } else { /* everything is 16-aligned: rows are copied with AV_COPY128 in steps of 16 bytes */
+        for (i = 0; i < height; i++) {
+            for (j = 0; j < width; j+=16) /* presumably 16 bytes per copy, so a row is rounded up to a multiple of 16 */
+                AV_COPY128(dst+j, src+j);
+            dst += stride_dst;
+            src += stride_src;
+        }
+    }
+}
+
+static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift)
+{ /* Copies a single sample from src to dst. */
+    if (pixel_shift) /* non-zero: the sample is read and written as one uint16_t */
+        *(uint16_t *)dst = *(uint16_t *)src;
+    else /* zero: the sample is a single byte */
+        *dst = *src;
+}
+
+static void copy_vert(uint8_t *dst, const uint8_t *src,
+                      int pixel_shift, int height,
+                      int stride_dst, int stride_src) /* strides are byte offsets applied after each sample */
 {
     int i;
+    if (pixel_shift == 0) { /* one-byte samples: copy `height` of them, stepping each pointer by its own stride */
+        for (i = 0; i < height; i++) {
+            *dst = *src;
+            dst += stride_dst;
+            src += stride_src;
+        }
+    } else { /* otherwise the same walk, but each sample is moved as one uint16_t */
+        for (i = 0; i < height; i++) {
+            *(uint16_t *)dst = *(uint16_t *)src;
+            dst += stride_dst;
+            src += stride_src;
+        }
+    }
+}
+
+static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src,
+                           int stride_src, int x, int y, int width, int height,
+                           int c_idx, int x_ctb, int y_ctb)
+{ /* Saves the first/last row and first/last column of the width x height block at src into s->sao_pixel_buffer_h / _v. */
+    int sh = s->ps.sps->pixel_shift; /* used below as a shift that turns sample counts into byte counts */
+    int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; /* SPS width scaled down by this plane's hshift */
+    int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; /* SPS height scaled down by this plane's vshift */
 
-    for (i = 0; i < height; i++) {
-        memcpy(dst, src, width);
-        dst += stride;
-        src += stride;
+    /* copy horizontal edges: first row goes to line 2 * y_ctb of sao_pixel_buffer_h, last row to line 2 * y_ctb + 1 (lines are w samples long) */
+    memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb) * w + x) << sh),
+        src, width << sh);
+    memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 1) * w + x) << sh),
+        src + stride_src * (height - 1), width << sh);
+    /* the destination step of 1 << sh stores each saved column as consecutive samples */
+    /* copy vertical edges: first column goes to line 2 * x_ctb of sao_pixel_buffer_v, last column to line 2 * x_ctb + 1 (lines are h samples long) */
+    copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb) * h + y) << sh), src, sh, height, 1 << sh, stride_src);
+    /* last column: source starts (width - 1) samples into the first row */
+    copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 1) * h + y) << sh), src + ((width - 1) << sh), sh, height, 1 << sh, stride_src);
+}
+
+static void restore_tqb_pixels(HEVCContext *s,
+                               uint8_t *src1, const uint8_t *dst1,
+                               ptrdiff_t stride_src, ptrdiff_t stride_dst,
+                               int x0, int y0, int width, int height, int c_idx)
+{ /* Copies, from dst1 into src1, every min-PU-sized cell of the given area whose s->is_pcm entry is set. */
+    if ( s->ps.pps->transquant_bypass_enable_flag ||
+            (s->ps.sps->pcm.loop_filter_disable_flag && s->ps.sps->pcm_enabled_flag)) { /* when neither holds, nothing is copied */
+        int x, y;
+        int min_pu_size  = 1 << s->ps.sps->log2_min_pu_size;
+        int hshift       = s->ps.sps->hshift[c_idx];
+        int vshift       = s->ps.sps->vshift[c_idx];
+        int x_min        = ((x0         ) >> s->ps.sps->log2_min_pu_size); /* area bounds in min-PU units; the *_max values are exclusive */
+        int y_min        = ((y0         ) >> s->ps.sps->log2_min_pu_size);
+        int x_max        = ((x0 + width ) >> s->ps.sps->log2_min_pu_size);
+        int y_max        = ((y0 + height) >> s->ps.sps->log2_min_pu_size);
+        int len          = (min_pu_size >> hshift) << s->ps.sps->pixel_shift; /* bytes per row of one cell in this plane */
+        for (y = y_min; y < y_max; y++) {
+            for (x = x_min; x < x_max; x++) {
+                if (s->is_pcm[y * s->ps.sps->min_pu_width + x]) { /* only flagged cells are copied */
+                    int n;
+                    uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift);
+                    const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift);
+                    for (n = 0; n < (min_pu_size >> vshift); n++) { /* one iteration per row of the cell in this plane */
+                        memcpy(src, dst, len); /* despite the names, `src` is the destination of this copy */
+                        src += stride_src;
+                        dst += stride_dst;
+                    }
+                }
+            }
+        }
     }
 }
 
@@ -188,128 +246,209 @@ static void copy_CTB(uint8_t *dst, uint8_t *src,
 
 static void sao_filter_CTB(HEVCContext *s, int x, int y)
 {
-    //  TODO: This should be easily parallelizable
-    //  TODO: skip CBs when (cu_transquant_bypass_flag || (pcm_loop_filter_disable_flag && pcm_flag))
-    int c_idx = 0;
-    int class = 1, class_index;
+    static const uint8_t sao_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 }; // indexed by (FFALIGN(width, 8) >> 3) - 1; the value picks the sao_band_filter[]/sao_edge_filter[] entry
+    HEVCLocalContext *lc = s->HEVClc; // lc->edge_emu_buffer serves as scratch space for the unfiltered copy
+    int c_idx; // plane index; each plane is filtered in place in s->frame
     int edges[4];  // 0 left 1 top 2 right 3 bottom
-    SAOParams *sao[4];
-    int classes[4];
-    int x_shift = 0, y_shift = 0;
-    int x_ctb = x >> s->ps.sps->log2_ctb_size;
-    int y_ctb = y >> s->ps.sps->log2_ctb_size;
-    int ctb_addr_rs = y_ctb * s->ps.sps->ctb_width + x_ctb;
-    int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs];
-
+    int x_ctb                = x >> s->ps.sps->log2_ctb_size;
+    int y_ctb                = y >> s->ps.sps->log2_ctb_size;
+    int ctb_addr_rs          = y_ctb * s->ps.sps->ctb_width + x_ctb;
+    int ctb_addr_ts          = s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs];
+    SAOParams *sao           = &CTB(s->sao, x_ctb, y_ctb); // parameters of this CTB only; neighbours are consulted just for their type_idx
     // flags indicating unfilterable edges
-    uint8_t vert_edge[]  = { 0, 0, 0, 0 };
-    uint8_t horiz_edge[] = { 0, 0, 0, 0 };
-    uint8_t diag_edge[]  = { 0, 0, 0, 0 };
-    uint8_t lfase[3]; // current, above, left
-    uint8_t no_tile_filter = s->ps.pps->tiles_enabled_flag &&
-                             !s->ps.pps->loop_filter_across_tiles_enabled_flag;
-    uint8_t left_tile_edge = 0, up_tile_edge = 0;
-
-    sao[0]     = &CTB(s->sao, x_ctb, y_ctb);
+    uint8_t vert_edge[]      = { 0, 0 }; // [0] left, [1] right
+    uint8_t horiz_edge[]     = { 0, 0 }; // [0] top, [1] bottom
+    uint8_t diag_edge[]      = { 0, 0, 0, 0 }; // [0] top-left, [1] top-right, [2] bottom-right, [3] bottom-left
+    uint8_t lfase            = CTB(s->filter_slice_edges, x_ctb, y_ctb);
+    uint8_t no_tile_filter   = s->ps.pps->tiles_enabled_flag &&
+                               !s->ps.pps->loop_filter_across_tiles_enabled_flag;
+    uint8_t restore          = no_tile_filter || !lfase; // gates the boundary checks below and indexes sao_edge_restore[]
+    uint8_t left_tile_edge   = 0;
+    uint8_t right_tile_edge  = 0;
+    uint8_t up_tile_edge     = 0;
+    uint8_t bottom_tile_edge = 0;
+
     edges[0]   = x_ctb == 0;
     edges[1]   = y_ctb == 0;
     edges[2]   = x_ctb == s->ps.sps->ctb_width  - 1;
     edges[3]   = y_ctb == s->ps.sps->ctb_height - 1;
-    lfase[0]   = CTB(s->filter_slice_edges, x_ctb, y_ctb);
-    classes[0] = 0;
-
-    if (!edges[0]) {
-        left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
-        sao[class] = &CTB(s->sao, x_ctb - 1, y_ctb);
-        vert_edge[0] = (!lfase[0] && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb)) || left_tile_edge;
-        vert_edge[2] = vert_edge[0];
-        lfase[2]     = CTB(s->filter_slice_edges, x_ctb - 1, y_ctb);
-        classes[class] = 2;
-        class++;
-        x_shift = 8;
-    }
-
-    if (!edges[1]) {
-        up_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->ps.sps->ctb_width]];
-        sao[class] = &CTB(s->sao, x_ctb, y_ctb - 1);
-        horiz_edge[0] = (!lfase[0] && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) || up_tile_edge;
-        horiz_edge[1] = horiz_edge[0];
-        lfase[1] = CTB(s->filter_slice_edges, x_ctb, y_ctb - 1);
-        classes[class] = 1;
-        class++;
-        y_shift = 4;
 
+    if (restore) { // for each existing neighbour, flag a differing tile id (when no_tile_filter) or a differing slice address (when !lfase)
         if (!edges[0]) {
-            classes[class] = 3;
-            sao[class] = &CTB(s->sao, x_ctb - 1, y_ctb - 1);
-            class++;
-
-            // Tile check here is done current CTB row/col, not above/left like you'd expect,
-            //but that is because the tile boundary always extends through the whole pic
-            vert_edge[1] = (!lfase[1] && CTB(s->tab_slice_address, x_ctb, y_ctb - 1) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || left_tile_edge;
-            vert_edge[3] = vert_edge[1];
-            horiz_edge[2] = (!lfase[2] && CTB(s->tab_slice_address, x_ctb - 1, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || up_tile_edge;
-            horiz_edge[3] = horiz_edge[2];
-            diag_edge[0] = (!lfase[0] && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || left_tile_edge || up_tile_edge;
-            diag_edge[3] = diag_edge[0];
-
-            // Does left CTB comes after above CTB?
-            if (CTB(s->tab_slice_address, x_ctb - 1, y_ctb) >
-                CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) {
-                diag_edge[2] = !lfase[2] || left_tile_edge || up_tile_edge;
-                diag_edge[1] = diag_edge[2];
-            } else if (CTB(s->tab_slice_address, x_ctb - 1, y_ctb) <
-                       CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) {
-                diag_edge[1] = !lfase[1] || left_tile_edge || up_tile_edge;
-                diag_edge[2] = diag_edge[1];
-            } else {
-                // Same slice, only consider tiles
-                diag_edge[2] = left_tile_edge || up_tile_edge;
-                diag_edge[1] = diag_edge[2];
-            }
+            left_tile_edge  = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
+            vert_edge[0]    = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb)) || left_tile_edge;
+        }
+        if (!edges[2]) {
+            right_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs+1]];
+            vert_edge[1]    = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb)) || right_tile_edge;
+        }
+        if (!edges[1]) {
+            up_tile_edge     = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->ps.sps->ctb_width]];
+            horiz_edge[0]    = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) || up_tile_edge;
+        }
+        if (!edges[3]) {
+            bottom_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs + s->ps.sps->ctb_width]];
+            horiz_edge[1]    = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb + 1)) || bottom_tile_edge;
+        }
+        if (!edges[0] && !edges[1]) { // diagonals combine the slice check with the two adjacent tile-edge flags
+            diag_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || left_tile_edge || up_tile_edge;
+        }
+        if (!edges[1] && !edges[2]) {
+            diag_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb - 1)) || right_tile_edge || up_tile_edge;
+        }
+        if (!edges[2] && !edges[3]) {
+            diag_edge[2] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb + 1)) || right_tile_edge || bottom_tile_edge;
+        }
+        if (!edges[0] && !edges[3]) {
+            diag_edge[3] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb + 1)) || left_tile_edge || bottom_tile_edge;
         }
     }
 
-    for (c_idx = 0; c_idx < 3; c_idx++) {
-        int chroma = c_idx ? 1 : 0;
-        int x0 = x >> chroma;
-        int y0 = y >> chroma;
-        int stride = s->frame->linesize[c_idx];
-        int ctb_size = (1 << (s->ps.sps->log2_ctb_size)) >> s->ps.sps->hshift[c_idx];
-        int width = FFMIN(ctb_size,
-                          (s->ps.sps->width >> s->ps.sps->hshift[c_idx]) - x0);
-        int height = FFMIN(ctb_size,
-                           (s->ps.sps->height >> s->ps.sps->vshift[c_idx]) - y0);
-
-        uint8_t *src = &s->frame->data[c_idx][y0 * stride + (x0 << s->ps.sps->pixel_shift)];
-        uint8_t *dst = &s->sao_frame->data[c_idx][y0 * stride + (x0 << s->ps.sps->pixel_shift)];
-        int offset = (y_shift >> chroma) * stride + ((x_shift >> chroma) << s->ps.sps->pixel_shift);
-
-        copy_CTB(dst - offset, src - offset,
-                 (edges[2] ? width  + (x_shift >> chroma) : width)  << s->ps.sps->pixel_shift,
-                 (edges[3] ? height + (y_shift >> chroma) : height), stride);
-
-        for (class_index = 0; class_index < class; class_index++) {
-
-            switch (sao[class_index]->type_idx[c_idx]) {
-            case SAO_BAND:
-                s->hevcdsp.sao_band_filter[classes[class_index]](dst, src,
-                                                                 stride,
-                                                                 sao[class_index],
-                                                                 edges, width,
-                                                                 height, c_idx);
-                break;
-            case SAO_EDGE:
-                s->hevcdsp.sao_edge_filter[classes[class_index]](dst, src,
-                                                                 stride,
-                                                                 sao[class_index],
-                                                                 edges, width,
-                                                                 height, c_idx,
-                                                                 vert_edge[classes[class_index]],
-                                                                 horiz_edge[classes[class_index]],
-                                                                 diag_edge[classes[class_index]]);
-                break;
+    for (c_idx = 0; c_idx < (s->ps.sps->chroma_format_idc ? 3 : 1); c_idx++) { // only plane 0 when chroma_format_idc is 0
+        int x0       = x >> s->ps.sps->hshift[c_idx];
+        int y0       = y >> s->ps.sps->vshift[c_idx];
+        int stride_src = s->frame->linesize[c_idx];
+        int ctb_size_h = (1 << (s->ps.sps->log2_ctb_size)) >> s->ps.sps->hshift[c_idx];
+        int ctb_size_v = (1 << (s->ps.sps->log2_ctb_size)) >> s->ps.sps->vshift[c_idx];
+        int width    = FFMIN(ctb_size_h, (s->ps.sps->width  >> s->ps.sps->hshift[c_idx]) - x0); // clipped to the plane's right border
+        int height   = FFMIN(ctb_size_v, (s->ps.sps->height >> s->ps.sps->vshift[c_idx]) - y0); // clipped to the plane's bottom border
+        int tab      = sao_tab[(FFALIGN(width, 8) >> 3) - 1];
+        uint8_t *src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)]; // filter output is written here, i.e. into the frame itself
+        int stride_dst;
+        uint8_t *dst;
+
+        switch (sao->type_idx[c_idx]) { // any other type leaves the plane untouched
+        case SAO_BAND:
+            copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
+                           x_ctb, y_ctb);
+            if (s->ps.pps->transquant_bypass_enable_flag ||
+                (s->ps.sps->pcm.loop_filter_disable_flag && s->ps.sps->pcm_enabled_flag)) { // same condition restore_tqb_pixels tests before copying anything
+            dst = lc->edge_emu_buffer;
+            stride_dst = 2*MAX_PB_SIZE;
+            copy_CTB(dst, src, width << s->ps.sps->pixel_shift, height, stride_dst, stride_src); // keep an unfiltered copy so flagged PUs can be put back
+            s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst,
+                                            sao->offset_val[c_idx], sao->band_position[c_idx],
+                                            width, height);
+            restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
+                               x, y, width, height, c_idx);
+            } else {
+            s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src,
+                                            sao->offset_val[c_idx], sao->band_position[c_idx],
+                                            width, height); // nothing to restore: filter directly in place
+            }
+            sao->type_idx[c_idx] = SAO_APPLIED; // the SAO_EDGE path of neighbouring CTBs tests for this value
+            break;
+        case SAO_EDGE:
+        {
+            int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx];
+            int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx];
+            int left_edge = edges[0];
+            int top_edge = edges[1];
+            int right_edge = edges[2];
+            int bottom_edge = edges[3];
+            int sh = s->ps.sps->pixel_shift;
+            int left_pixels, right_pixels;
+
+            stride_dst = 2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE;
+            dst = lc->edge_emu_buffer + stride_dst + AV_INPUT_BUFFER_PADDING_SIZE; // offset leaves room for one row above and pixels to the left of the CTB copy
+
+            if (!top_edge) { // fill the row above the CTB copy: optional left pixel, width pixels, optional right pixel
+                int left = 1 - left_edge;
+                int right = 1 - right_edge;
+                const uint8_t *src1[2];
+                uint8_t *dst1;
+                int src_idx, pos;
+
+                dst1 = dst - stride_dst - (left << sh);
+                src1[0] = src - stride_src - (left << sh); // [0]: row taken from the frame
+                src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0 - left) << sh); // [1]: row taken from sao_pixel_buffer_h
+                pos = 0;
+                if (left) {
+                    src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] ==
+                               SAO_APPLIED); // 1 when that neighbour is marked SAO_APPLIED, selecting the sao_pixel_buffer_h source
+                    copy_pixel(dst1, src1[src_idx], sh);
+                    pos += (1 << sh);
+                }
+                src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] ==
+                           SAO_APPLIED);
+                memcpy(dst1 + pos, src1[src_idx] + pos, width << sh);
+                if (right) {
+                    pos += width << sh;
+                    src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] ==
+                               SAO_APPLIED);
+                    copy_pixel(dst1 + pos, src1[src_idx] + pos, sh);
+                }
             }
+            if (!bottom_edge) { // same construction for the row below the CTB copy
+                int left = 1 - left_edge;
+                int right = 1 - right_edge;
+                const uint8_t *src1[2];
+                uint8_t *dst1;
+                int src_idx, pos;
+
+                dst1 = dst + height * stride_dst - (left << sh);
+                src1[0] = src + height * stride_src - (left << sh);
+                src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0 - left) << sh);
+                pos = 0;
+                if (left) {
+                    src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] ==
+                               SAO_APPLIED);
+                    copy_pixel(dst1, src1[src_idx], sh);
+                    pos += (1 << sh);
+                }
+                src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] ==
+                           SAO_APPLIED);
+                memcpy(dst1 + pos, src1[src_idx] + pos, width << sh);
+                if (right) {
+                    pos += width << sh;
+                    src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] ==
+                               SAO_APPLIED);
+                    copy_pixel(dst1 + pos, src1[src_idx] + pos, sh);
+                }
+            }
+            left_pixels = 0;
+            if (!left_edge) {
+                if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
+                    copy_vert(dst - (1 << sh),
+                              s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh),
+                              sh, height, stride_dst, 1 << sh); // left column comes from sao_pixel_buffer_v
+                } else {
+                    left_pixels = 1; // left column is picked up from the frame by the widened copy_CTB below
+                }
+            }
+            right_pixels = 0;
+            if (!right_edge) {
+                if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
+                    copy_vert(dst + (width << sh),
+                              s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh),
+                              sh, height, stride_dst, 1 << sh); // right column comes from sao_pixel_buffer_v
+                } else {
+                    right_pixels = 1; // right column is picked up from the frame by the widened copy_CTB below
+                }
+            }
+
+            copy_CTB(dst - (left_pixels << sh),
+                     src - (left_pixels << sh),
+                     (width + left_pixels + right_pixels) << sh,
+                     height, stride_dst, stride_src); // copy the CTB body, widened by any side columns still read from the frame
+
+            copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
+                           x_ctb, y_ctb); // called while src is still unfiltered
+            s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx],
+                                            sao->eo_class[c_idx], width, height);
+            s->hevcdsp.sao_edge_restore[restore](src, dst,
+                                                stride_src, stride_dst,
+                                                sao,
+                                                edges, width,
+                                                height, c_idx,
+                                                vert_edge,
+                                                horiz_edge,
+                                                diag_edge);
+            restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
+                               x, y, width, height, c_idx); // x/y are passed unshifted while width/height are plane-scaled
+            sao->type_idx[c_idx] = SAO_APPLIED; // the SAO_EDGE path of neighbouring CTBs tests for this value
+            break;
+        }
         }
     }
 }
@@ -338,18 +477,21 @@ static int get_pcm(HEVCContext *s, int x, int y)
 static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
 {
     uint8_t *src;
-    int x, y, x_end, y_end, chroma;
-    int c_tc[2], tc[2], beta;
+    int x, y;
+    int chroma, beta;
+    int32_t c_tc[2], tc[2];
     uint8_t no_p[2] = { 0 };
     uint8_t no_q[2] = { 0 };
 
     int log2_ctb_size = s->ps.sps->log2_ctb_size;
+    int x_end, x_end2, y_end;
     int ctb_size        = 1 << log2_ctb_size;
     int ctb             = (x0 >> log2_ctb_size) +
                           (y0 >> log2_ctb_size) * s->ps.sps->ctb_width;
     int cur_tc_offset   = s->deblock[ctb].tc_offset;
     int cur_beta_offset = s->deblock[ctb].beta_offset;
-    int tc_offset, left_tc_offset, beta_offset, left_beta_offset;
+    int left_tc_offset, left_beta_offset;
+    int tc_offset, beta_offset;
     int pcmf = (s->ps.sps->pcm_enabled_flag &&
                 s->ps.sps->pcm.loop_filter_disable_flag) ||
                s->ps.pps->transquant_bypass_enable_flag;
@@ -357,6 +499,9 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
     if (x0) {
         left_tc_offset   = s->deblock[ctb - 1].tc_offset;
         left_beta_offset = s->deblock[ctb - 1].beta_offset;
+    } else {
+        left_tc_offset   = 0;
+        left_beta_offset = 0;
     }
 
     x_end = x0 + ctb_size;
@@ -369,11 +514,14 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
     tc_offset   = cur_tc_offset;
     beta_offset = cur_beta_offset;
 
-    // vertical filtering luma
+    x_end2 = x_end;
+    if (x_end2 != s->ps.sps->width)
+        x_end2 -= 8;
     for (y = y0; y < y_end; y += 8) {
+        // vertical filtering luma
         for (x = x0 ? x0 : 8; x < x_end; x += 8) {
-            const int bs0 = s->vertical_bs[(x >> 3) + (y       >> 2) * s->bs_width];
-            const int bs1 = s->vertical_bs[(x >> 3) + ((y + 4) >> 2) * s->bs_width];
+            const int bs0 = s->vertical_bs[(x +  y      * s->bs_width) >> 2];
+            const int bs1 = s->vertical_bs[(x + (y + 4) * s->bs_width) >> 2];
             if (bs0 || bs1) {
                 const int qp = (get_qPy(s, x - 1, y)     + get_qPy(s, x, y)     + 1) >> 1;
 
@@ -396,45 +544,14 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
                                                        beta, tc, no_p, no_q);
             }
         }
-    }
 
-    // vertical filtering chroma
-    for (chroma = 1; chroma <= 2; chroma++) {
-        for (y = y0; y < y_end; y += 16) {
-            for (x = x0 ? x0 : 16; x < x_end; x += 16) {
-                const int bs0 = s->vertical_bs[(x >> 3) + (y       >> 2) * s->bs_width];
-                const int bs1 = s->vertical_bs[(x >> 3) + ((y + 8) >> 2) * s->bs_width];
-                if ((bs0 == 2) || (bs1 == 2)) {
-                    const int qp0 = (get_qPy(s, x - 1, y)     + get_qPy(s, x, y)     + 1) >> 1;
-                    const int qp1 = (get_qPy(s, x - 1, y + 8) + get_qPy(s, x, y + 8) + 1) >> 1;
-
-                    c_tc[0] = (bs0 == 2) ? chroma_tc(s, qp0, chroma, tc_offset) : 0;
-                    c_tc[1] = (bs1 == 2) ? chroma_tc(s, qp1, chroma, tc_offset) : 0;
-                    src     = &s->frame->data[chroma][y / 2 * s->frame->linesize[chroma] + ((x / 2) << s->ps.sps->pixel_shift)];
-                    if (pcmf) {
-                        no_p[0] = get_pcm(s, x - 1, y);
-                        no_p[1] = get_pcm(s, x - 1, y + 8);
-                        no_q[0] = get_pcm(s, x, y);
-                        no_q[1] = get_pcm(s, x, y + 8);
-                        s->hevcdsp.hevc_v_loop_filter_chroma_c(src,
-                                                               s->frame->linesize[chroma],
-                                                               c_tc, no_p, no_q);
-                    } else
-                        s->hevcdsp.hevc_v_loop_filter_chroma(src,
-                                                             s->frame->linesize[chroma],
-                                                             c_tc, no_p, no_q);
-                }
-            }
-        }
-    }
+        if(!y)
+             continue;
 
-    // horizontal filtering luma
-    if (x_end != s->ps.sps->width)
-        x_end -= 8;
-    for (y = y0 ? y0 : 8; y < y_end; y += 8) {
-        for (x = x0 ? x0 - 8 : 0; x < x_end; x += 8) {
-            const int bs0 = s->horizontal_bs[(x +     y * s->bs_width) >> 2];
-            const int bs1 = s->horizontal_bs[(x + 4 + y * s->bs_width) >> 2];
+        // horizontal filtering luma
+        for (x = x0 ? x0 - 8 : 0; x < x_end2; x += 8) {
+            const int bs0 = s->horizontal_bs[( x      + y * s->bs_width) >> 2];
+            const int bs1 = s->horizontal_bs[((x + 4) + y * s->bs_width) >> 2];
             if (bs0 || bs1) {
                 const int qp = (get_qPy(s, x, y - 1)     + get_qPy(s, x, y)     + 1) >> 1;
 
@@ -461,123 +578,135 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
         }
     }
 
-    // horizontal filtering chroma
-    for (chroma = 1; chroma <= 2; chroma++) {
-        for (y = y0 ? y0 : 16; y < y_end; y += 16) {
-            for (x = x0 - 8; x < x_end; x += 16) {
-                int bs0, bs1;
-                // to make sure no memory access over boundary when x = -8
-                // TODO: simplify with row based deblocking
-                if (x < 0) {
-                    bs0 = 0;
-                    bs1 = s->horizontal_bs[(x + 8 + y * s->bs_width) >> 2];
-                } else if (x >= x_end - 8) {
-                    bs0 = s->horizontal_bs[(x +     y * s->bs_width) >> 2];
-                    bs1 = 0;
-                } else {
-                    bs0 = s->horizontal_bs[(x + y     * s->bs_width) >> 2];
-                    bs1 = s->horizontal_bs[(x + 8 + y * s->bs_width) >> 2];
+    if (s->ps.sps->chroma_format_idc) {
+        for (chroma = 1; chroma <= 2; chroma++) {
+            int h = 1 << s->ps.sps->hshift[chroma];
+            int v = 1 << s->ps.sps->vshift[chroma];
+
+            // vertical filtering chroma
+            for (y = y0; y < y_end; y += (8 * v)) {
+                for (x = x0 ? x0 : 8 * h; x < x_end; x += (8 * h)) {
+                    const int bs0 = s->vertical_bs[(x +  y            * s->bs_width) >> 2];
+                    const int bs1 = s->vertical_bs[(x + (y + (4 * v)) * s->bs_width) >> 2];
+
+                    if ((bs0 == 2) || (bs1 == 2)) {
+                        const int qp0 = (get_qPy(s, x - 1, y)           + get_qPy(s, x, y)           + 1) >> 1;
+                        const int qp1 = (get_qPy(s, x - 1, y + (4 * v)) + get_qPy(s, x, y + (4 * v)) + 1) >> 1;
+
+                        c_tc[0] = (bs0 == 2) ? chroma_tc(s, qp0, chroma, tc_offset) : 0;
+                        c_tc[1] = (bs1 == 2) ? chroma_tc(s, qp1, chroma, tc_offset) : 0;
+                        src       = &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)];
+                        if (pcmf) {
+                            no_p[0] = get_pcm(s, x - 1, y);
+                            no_p[1] = get_pcm(s, x - 1, y + (4 * v));
+                            no_q[0] = get_pcm(s, x, y);
+                            no_q[1] = get_pcm(s, x, y + (4 * v));
+                            s->hevcdsp.hevc_v_loop_filter_chroma_c(src,
+                                                                   s->frame->linesize[chroma],
+                                                                   c_tc, no_p, no_q);
+                        } else
+                            s->hevcdsp.hevc_v_loop_filter_chroma(src,
+                                                                 s->frame->linesize[chroma],
+                                                                 c_tc, no_p, no_q);
+                    }
                 }
 
-                if ((bs0 == 2) || (bs1 == 2)) {
-                    const int qp0 = bs0 == 2 ? (get_qPy(s, x,     y - 1) + get_qPy(s, x,     y) + 1) >> 1 : 0;
-                    const int qp1 = bs1 == 2 ? (get_qPy(s, x + 8, y - 1) + get_qPy(s, x + 8, y) + 1) >> 1 : 0;
-
-                    tc_offset = x >= x0 ? cur_tc_offset : left_tc_offset;
-                    c_tc[0]   = bs0 == 2 ? chroma_tc(s, qp0, chroma, tc_offset)     : 0;
-                    c_tc[1]   = bs1 == 2 ? chroma_tc(s, qp1, chroma, cur_tc_offset) : 0;
-                    src       = &s->frame->data[chroma][y / 2 * s->frame->linesize[chroma] + ((x / 2) << s->ps.sps->pixel_shift)];
-                    if (pcmf) {
-                        no_p[0] = get_pcm(s, x, y - 1);
-                        no_p[1] = get_pcm(s, x + 8, y - 1);
-                        no_q[0] = get_pcm(s, x, y);
-                        no_q[1] = get_pcm(s, x + 8, y);
-                        s->hevcdsp.hevc_h_loop_filter_chroma_c(src,
-                                                               s->frame->linesize[chroma],
-                                                               c_tc, no_p, no_q);
-                    } else
-                        s->hevcdsp.hevc_h_loop_filter_chroma(src,
-                                                             s->frame->linesize[chroma],
-                                                             c_tc, no_p, no_q);
+                if(!y)
+                    continue;
+
+                // horizontal filtering chroma
+                tc_offset = x0 ? left_tc_offset : cur_tc_offset;
+                x_end2 = x_end;
+                if (x_end != s->ps.sps->width)
+                    x_end2 = x_end - 8 * h;
+                for (x = x0 ? x0 - 8 * h : 0; x < x_end2; x += (8 * h)) {
+                    const int bs0 = s->horizontal_bs[( x          + y * s->bs_width) >> 2];
+                    const int bs1 = s->horizontal_bs[((x + 4 * h) + y * s->bs_width) >> 2];
+                    if ((bs0 == 2) || (bs1 == 2)) {
+                        const int qp0 = bs0 == 2 ? (get_qPy(s, x,           y - 1) + get_qPy(s, x,           y) + 1) >> 1 : 0;
+                        const int qp1 = bs1 == 2 ? (get_qPy(s, x + (4 * h), y - 1) + get_qPy(s, x + (4 * h), y) + 1) >> 1 : 0;
+
+                        c_tc[0]   = bs0 == 2 ? chroma_tc(s, qp0, chroma, tc_offset)     : 0;
+                        c_tc[1]   = bs1 == 2 ? chroma_tc(s, qp1, chroma, cur_tc_offset) : 0;
+                        src       = &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)];
+                        if (pcmf) {
+                            no_p[0] = get_pcm(s, x,           y - 1);
+                            no_p[1] = get_pcm(s, x + (4 * h), y - 1);
+                            no_q[0] = get_pcm(s, x,           y);
+                            no_q[1] = get_pcm(s, x + (4 * h), y);
+                            s->hevcdsp.hevc_h_loop_filter_chroma_c(src,
+                                                                   s->frame->linesize[chroma],
+                                                                   c_tc, no_p, no_q);
+                        } else
+                            s->hevcdsp.hevc_h_loop_filter_chroma(src,
+                                                                 s->frame->linesize[chroma],
+                                                                 c_tc, no_p, no_q);
+                    }
                 }
             }
         }
     }
 }
 
-static int boundary_strength(HEVCContext *s, MvField *curr,
-                             uint8_t curr_cbf_luma, MvField *neigh,
-                             uint8_t neigh_cbf_luma,
-                             RefPicList *neigh_refPicList,
-                             int tu_border)
+static int boundary_strength(HEVCContext *s, MvField *curr, MvField *neigh,
+                             RefPicList *neigh_refPicList)
 {
-    int mvs = curr->pred_flag[0] + curr->pred_flag[1];
-
-    if (tu_border) {
-        if (curr->is_intra || neigh->is_intra)
-            return 2;
-        if (curr_cbf_luma || neigh_cbf_luma)
-            return 1;
-    }
-
-    if (mvs == neigh->pred_flag[0] + neigh->pred_flag[1]) {
-        if (mvs == 2) {
-            // same L0 and L1
-            if (s->ref->refPicList[0].list[curr->ref_idx[0]] == neigh_refPicList[0].list[neigh->ref_idx[0]]  &&
-                s->ref->refPicList[0].list[curr->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]] &&
-                neigh_refPicList[0].list[neigh->ref_idx[0]] == neigh_refPicList[1].list[neigh->ref_idx[1]]) {
-                if ((abs(neigh->mv[0].x - curr->mv[0].x) >= 4 || abs(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
-                     abs(neigh->mv[1].x - curr->mv[1].x) >= 4 || abs(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
-                    (abs(neigh->mv[1].x - curr->mv[0].x) >= 4 || abs(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
-                     abs(neigh->mv[0].x - curr->mv[1].x) >= 4 || abs(neigh->mv[0].y - curr->mv[1].y) >= 4))
-                    return 1;
-                else
-                    return 0;
-            } else if (neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
-                       neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
-                if (abs(neigh->mv[0].x - curr->mv[0].x) >= 4 || abs(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
-                    abs(neigh->mv[1].x - curr->mv[1].x) >= 4 || abs(neigh->mv[1].y - curr->mv[1].y) >= 4)
-                    return 1;
-                else
-                    return 0;
-            } else if (neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
-                       neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
-                if (abs(neigh->mv[1].x - curr->mv[0].x) >= 4 || abs(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
-                    abs(neigh->mv[0].x - curr->mv[1].x) >= 4 || abs(neigh->mv[0].y - curr->mv[1].y) >= 4)
-                    return 1;
-                else
-                    return 0;
-            } else {
+    if (curr->pred_flag == PF_BI &&  neigh->pred_flag == PF_BI) {
+        // same L0 and L1
+        if (s->ref->refPicList[0].list[curr->ref_idx[0]] == neigh_refPicList[0].list[neigh->ref_idx[0]]  &&
+            s->ref->refPicList[0].list[curr->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]] &&
+            neigh_refPicList[0].list[neigh->ref_idx[0]] == neigh_refPicList[1].list[neigh->ref_idx[1]]) {
+            if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+                 FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
+                (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+                 FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
                 return 1;
-            }
-        } else { // 1 MV
-            Mv A, B;
-            int ref_A, ref_B;
-
-            if (curr->pred_flag[0]) {
-                A     = curr->mv[0];
-                ref_A = s->ref->refPicList[0].list[curr->ref_idx[0]];
-            } else {
-                A     = curr->mv[1];
-                ref_A = s->ref->refPicList[1].list[curr->ref_idx[1]];
-            }
+            else
+                return 0;
+        } else if (neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
+                   neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
+            if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+                FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
+                return 1;
+            else
+                return 0;
+        } else if (neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
+                   neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
+            if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+                FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
+                return 1;
+            else
+                return 0;
+        } else {
+            return 1;
+        }
+    } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
+        Mv A, B;
+        int ref_A, ref_B;
+
+        if (curr->pred_flag & 1) {
+            A     = curr->mv[0];
+            ref_A = s->ref->refPicList[0].list[curr->ref_idx[0]];
+        } else {
+            A     = curr->mv[1];
+            ref_A = s->ref->refPicList[1].list[curr->ref_idx[1]];
+        }
 
-            if (neigh->pred_flag[0]) {
-                B     = neigh->mv[0];
-                ref_B = neigh_refPicList[0].list[neigh->ref_idx[0]];
-            } else {
-                B     = neigh->mv[1];
-                ref_B = neigh_refPicList[1].list[neigh->ref_idx[1]];
-            }
+        if (neigh->pred_flag & 1) {
+            B     = neigh->mv[0];
+            ref_B = neigh_refPicList[0].list[neigh->ref_idx[0]];
+        } else {
+            B     = neigh->mv[1];
+            ref_B = neigh_refPicList[1].list[neigh->ref_idx[1]];
+        }
 
-            if (ref_A == ref_B) {
-                if (abs(A.x - B.x) >= 4 || abs(A.y - B.y) >= 4)
-                    return 1;
-                else
-                    return 0;
-            } else
+        if (ref_A == ref_B) {
+            if (FFABS(A.x - B.x) >= 4 || FFABS(A.y - B.y) >= 4)
                 return 1;
-        }
+            else
+                return 0;
+        } else
+            return 1;
     }
 
     return 1;
@@ -586,14 +715,14 @@ static int boundary_strength(HEVCContext *s, MvField *curr,
 void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
                                            int log2_trafo_size)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     MvField *tab_mvf     = s->ref->tab_mvf;
     int log2_min_pu_size = s->ps.sps->log2_min_pu_size;
     int log2_min_tu_size = s->ps.sps->log2_min_tb_size;
     int min_pu_width     = s->ps.sps->min_pu_width;
     int min_tu_width     = s->ps.sps->min_tb_width;
     int is_intra = tab_mvf[(y0 >> log2_min_pu_size) * min_pu_width +
-                           (x0 >> log2_min_pu_size)].is_intra;
+                           (x0 >> log2_min_pu_size)].pred_flag == PF_INTRA;
     int boundary_upper, boundary_left;
     int i, j, bs;
 
@@ -611,37 +740,11 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
         RefPicList *rpl_top = (lc->boundary_flags & BOUNDARY_UPPER_SLICE) ?
                               ff_hevc_get_ref_list(s, s->ref, x0, y0 - 1) :
                               s->ref->refPicList;
-
         int yp_pu = (y0 - 1) >> log2_min_pu_size;
         int yq_pu =  y0      >> log2_min_pu_size;
         int yp_tu = (y0 - 1) >> log2_min_tu_size;
         int yq_tu =  y0      >> log2_min_tu_size;
 
-        for (i = 0; i < (1 << log2_trafo_size); i += 4) {
-            int x_pu = (x0 + i) >> log2_min_pu_size;
-            int x_tu = (x0 + i) >> log2_min_tu_size;
-            MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
-            MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
-            uint8_t top_cbf_luma  = s->cbf_luma[yp_tu * min_tu_width + x_tu];
-            uint8_t curr_cbf_luma = s->cbf_luma[yq_tu * min_tu_width + x_tu];
-
-            bs = boundary_strength(s, curr, curr_cbf_luma,
-                                   top, top_cbf_luma, rpl_top, 1);
-            if (bs)
-                s->horizontal_bs[((x0 + i) + y0 * s->bs_width) >> 2] = bs;
-        }
-    }
-
-    // bs for TU internal horizontal PU boundaries
-    if (log2_trafo_size > s->ps.sps->log2_min_pu_size && !is_intra) {
-        RefPicList *rpl = s->ref->refPicList;
-
-        for (j = 8; j < (1 << log2_trafo_size); j += 8) {
-            int yp_pu = (y0 + j - 1) >> log2_min_pu_size;
-            int yq_pu = (y0 + j)     >> log2_min_pu_size;
-            int yp_tu = (y0 + j - 1) >> log2_min_tu_size;
-            int yq_tu = (y0 + j)     >> log2_min_tu_size;
-
             for (i = 0; i < (1 << log2_trafo_size); i += 4) {
                 int x_pu = (x0 + i) >> log2_min_pu_size;
                 int x_tu = (x0 + i) >> log2_min_tu_size;
@@ -650,12 +753,14 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
                 uint8_t top_cbf_luma  = s->cbf_luma[yp_tu * min_tu_width + x_tu];
                 uint8_t curr_cbf_luma = s->cbf_luma[yq_tu * min_tu_width + x_tu];
 
-                bs = boundary_strength(s, curr, curr_cbf_luma,
-                                       top, top_cbf_luma, rpl, 0);
-                if (bs)
-                    s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
+                if (curr->pred_flag == PF_INTRA || top->pred_flag == PF_INTRA)
+                    bs = 2;
+                else if (curr_cbf_luma || top_cbf_luma)
+                    bs = 1;
+                else
+                    bs = boundary_strength(s, curr, top, rpl_top);
+                s->horizontal_bs[((x0 + i) + y0 * s->bs_width) >> 2] = bs;
             }
-        }
     }
 
     // bs for vertical TU boundaries
@@ -673,50 +778,59 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
         RefPicList *rpl_left = (lc->boundary_flags & BOUNDARY_LEFT_SLICE) ?
                                ff_hevc_get_ref_list(s, s->ref, x0 - 1, y0) :
                                s->ref->refPicList;
-
         int xp_pu = (x0 - 1) >> log2_min_pu_size;
         int xq_pu =  x0      >> log2_min_pu_size;
         int xp_tu = (x0 - 1) >> log2_min_tu_size;
         int xq_tu =  x0      >> log2_min_tu_size;
 
-        for (i = 0; i < (1 << log2_trafo_size); i += 4) {
-            int y_pu      = (y0 + i) >> log2_min_pu_size;
-            int y_tu      = (y0 + i) >> log2_min_tu_size;
-            MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
-            MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
-
-            uint8_t left_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xp_tu];
-            uint8_t curr_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xq_tu];
+            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
+                int y_pu      = (y0 + i) >> log2_min_pu_size;
+                int y_tu      = (y0 + i) >> log2_min_tu_size;
+                MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
+                MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
+                uint8_t left_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xp_tu];
+                uint8_t curr_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xq_tu];
 
-            bs = boundary_strength(s, curr, curr_cbf_luma,
-                                   left, left_cbf_luma, rpl_left, 1);
-            if (bs)
-                s->vertical_bs[(x0 >> 3) + ((y0 + i) >> 2) * s->bs_width] = bs;
-        }
+                if (curr->pred_flag == PF_INTRA || left->pred_flag == PF_INTRA)
+                    bs = 2;
+                else if (curr_cbf_luma || left_cbf_luma)
+                    bs = 1;
+                else
+                    bs = boundary_strength(s, curr, left, rpl_left);
+                s->vertical_bs[(x0 + (y0 + i) * s->bs_width) >> 2] = bs;
+            }
     }
 
-    // bs for TU internal vertical PU boundaries
     if (log2_trafo_size > log2_min_pu_size && !is_intra) {
         RefPicList *rpl = s->ref->refPicList;
 
+        // bs for TU internal horizontal PU boundaries
+        for (j = 8; j < (1 << log2_trafo_size); j += 8) {
+            int yp_pu = (y0 + j - 1) >> log2_min_pu_size;
+            int yq_pu = (y0 + j)     >> log2_min_pu_size;
+
+            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
+                int x_pu = (x0 + i) >> log2_min_pu_size;
+                MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
+                MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
+
+                bs = boundary_strength(s, curr, top, rpl);
+                s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
+            }
+        }
+
+        // bs for TU internal vertical PU boundaries
         for (j = 0; j < (1 << log2_trafo_size); j += 4) {
             int y_pu = (y0 + j) >> log2_min_pu_size;
-            int y_tu = (y0 + j) >> log2_min_tu_size;
 
             for (i = 8; i < (1 << log2_trafo_size); i += 8) {
                 int xp_pu = (x0 + i - 1) >> log2_min_pu_size;
                 int xq_pu = (x0 + i)     >> log2_min_pu_size;
-                int xp_tu = (x0 + i - 1) >> log2_min_tu_size;
-                int xq_tu = (x0 + i)     >> log2_min_tu_size;
                 MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
                 MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
-                uint8_t left_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xp_tu];
-                uint8_t curr_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xq_tu];
 
-                bs = boundary_strength(s, curr, curr_cbf_luma,
-                                       left, left_cbf_luma, rpl, 0);
-                if (bs)
-                    s->vertical_bs[((x0 + i) >> 3) + ((y0 + j) >> 2) * s->bs_width] = bs;
+                bs = boundary_strength(s, curr, left, rpl);
+                s->vertical_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
             }
         }
     }
@@ -726,21 +840,39 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
 #undef CB
 #undef CR
 
-void ff_hevc_hls_filter(HEVCContext *s, int x, int y)
+void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size) /* deblocks the CTB at (x, y); SAO is applied to the CTBs up/left of it, and to (x, y) itself only at the bottom-right corner */
 {
-    deblocking_filter_CTB(s, x, y);
-    if (s->ps.sps->sao_enabled)
-        sao_filter_CTB(s, x, y);
+    int x_end = x >= s->ps.sps->width  - ctb_size; // nonzero when x is within ctb_size of the right picture edge (presumably the last CTB column)
+    if (s->avctx->skip_loop_filter < AVDISCARD_ALL) // deblock unless skip_loop_filter is AVDISCARD_ALL or above
+        deblocking_filter_CTB(s, x, y);
+    if (s->ps.sps->sao_enabled) {
+        int y_end = y >= s->ps.sps->height - ctb_size; // nonzero when y is within ctb_size of the bottom picture edge
+        if (y && x) // up-left neighbour exists; NOTE(review): SAO presumably lags one CTB so it only reads deblocked samples -- confirm
+            sao_filter_CTB(s, x - ctb_size, y - ctb_size);
+        if (x && y_end) // bottom row: also SAO the left neighbour
+            sao_filter_CTB(s, x - ctb_size, y);
+        if (y && x_end) { // rightmost column: also SAO the CTB directly above
+            sao_filter_CTB(s, x, y - ctb_size);
+            if (s->threads_type & FF_THREAD_FRAME )
+                ff_thread_report_progress(&s->ref->tf, y, 0); // frame threading: signal progress value y on the current frame
+        }
+        if (x_end && y_end) { // bottom-right CTB: SAO the current CTB too
+            sao_filter_CTB(s, x , y);
+            if (s->threads_type & FF_THREAD_FRAME )
+                ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
+        }
+    } else if (s->threads_type & FF_THREAD_FRAME && x_end) // '&' binds tighter than '&&': (threads_type & FF_THREAD_FRAME) && x_end
+        ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); // NOTE(review): the 4-row holdback presumably covers rows that deblocking of the next CTB row may still modify -- confirm
 }
 
 void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size)
 {
+    int x_end = x_ctb >= s->ps.sps->width  - ctb_size; // nonzero when x_ctb is within ctb_size of the right picture edge
+    int y_end = y_ctb >= s->ps.sps->height - ctb_size; // nonzero when y_ctb is within ctb_size of the bottom picture edge
     if (y_ctb && x_ctb)
-        ff_hevc_hls_filter(s, x_ctb - ctb_size, y_ctb - ctb_size);
-    if (y_ctb && x_ctb >= s->ps.sps->width - ctb_size) {
-        ff_hevc_hls_filter(s, x_ctb, y_ctb - ctb_size);
-        ff_thread_report_progress(&s->ref->tf, y_ctb - ctb_size, 0);
-    }
-    if (x_ctb && y_ctb >= s->ps.sps->height - ctb_size)
-        ff_hevc_hls_filter(s, x_ctb - ctb_size, y_ctb);
+        ff_hevc_hls_filter(s, x_ctb - ctb_size, y_ctb - ctb_size, ctb_size); // filter the up-left neighbour of the current CTB
+    if (y_ctb && x_end) // rightmost column: the CTB directly above is filtered as well
+        ff_hevc_hls_filter(s, x_ctb, y_ctb - ctb_size, ctb_size);
+    if (x_ctb && y_end) // bottom row: the left neighbour is filtered as well
+        ff_hevc_hls_filter(s, x_ctb - ctb_size, y_ctb, ctb_size); // NOTE(review): (x_ctb, y_ctb) itself is never passed from here; presumably the caller filters the final CTB -- confirm
 }
diff --git a/libavcodec/hevc_mp4toannexb_bsf.c b/libavcodec/hevc_mp4toannexb_bsf.c
index 8d7ac58..5cc642b 100644
--- a/libavcodec/hevc_mp4toannexb_bsf.c
+++ b/libavcodec/hevc_mp4toannexb_bsf.c
@@ -2,20 +2,20 @@
  * HEVC MP4 to Annex B byte stream format filter
  * copyright (c) 2015 Anton Khirnov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,7 +43,7 @@ static int hevc_extradata_to_annexb(AVBSFContext *ctx)
     int ret = 0;
 
     uint8_t *new_extradata = NULL;
-    size_t   new_extradata_size = 0;;
+    size_t   new_extradata_size = 0;
 
     bytestream2_init(&gb, ctx->par_in->extradata, ctx->par_in->extradata_size);
 
diff --git a/libavcodec/hevc_mvs.c b/libavcodec/hevc_mvs.c
index 446b83a..4a6dde0 100644
--- a/libavcodec/hevc_mvs.c
+++ b/libavcodec/hevc_mvs.c
@@ -4,20 +4,20 @@
  * Copyright (C) 2012 - 2013 Guillaume Martres
  * Copyright (C) 2013 Anand Meher Kotra
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -41,9 +41,9 @@ static const uint8_t l0_l1_cand_idx[12][2] = {
 void ff_hevc_set_neighbour_available(HEVCContext *s, int x0, int y0,
                                      int nPbW, int nPbH)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
-    int x0b = x0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
-    int y0b = y0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
+    HEVCLocalContext *lc = s->HEVClc;
+    int x0b = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size);
+    int y0b = av_mod_uintp2(y0, s->ps.sps->log2_ctb_size);
 
     lc->na.cand_up       = (lc->ctb_up_flag   || y0b);
     lc->na.cand_left     = (lc->ctb_left_flag || x0b);
@@ -52,8 +52,7 @@ void ff_hevc_set_neighbour_available(HEVCContext *s, int x0, int y0,
             ((x0b + nPbW) == (1 << s->ps.sps->log2_ctb_size)) ?
                     lc->ctb_up_right_flag && !y0b : lc->na.cand_up;
     lc->na.cand_up_right =
-            ((x0b + nPbW) == (1 << s->ps.sps->log2_ctb_size) ?
-                    lc->ctb_up_right_flag && !y0b : lc->na.cand_up )
+            lc->na.cand_up_right_sap
                      && (x0 + nPbW) < lc->end_of_tiles_x;
     lc->na.cand_bottom_left = ((y0 + nPbH) >= lc->end_of_tiles_y) ? 0 : lc->na.cand_left;
 }
@@ -61,56 +60,29 @@ void ff_hevc_set_neighbour_available(HEVCContext *s, int x0, int y0,
 /*
  * 6.4.1 Derivation process for z-scan order block availability
  */
-static int z_scan_block_avail(HEVCContext *s, int xCurr, int yCurr,
+static av_always_inline int z_scan_block_avail(HEVCContext *s, int xCurr, int yCurr, /* returns nonzero when the block at (xN, yN) counts as available to the block at (xCurr, yCurr) */
                               int xN, int yN)
 {
 #define MIN_TB_ADDR_ZS(x, y)                                            \
-    s->ps.pps->min_tb_addr_zs[(y) * s->ps.sps->min_tb_width + (x)]
-    int Curr = MIN_TB_ADDR_ZS(xCurr >> s->ps.sps->log2_min_tb_size,
-                              yCurr >> s->ps.sps->log2_min_tb_size);
-    int N;
-
-    if (xN < 0 || yN < 0 ||
-        xN >= s->ps.sps->width ||
-        yN >= s->ps.sps->height)
-        return 0;
-
-    N = MIN_TB_ADDR_ZS(xN >> s->ps.sps->log2_min_tb_size,
-                       yN >> s->ps.sps->log2_min_tb_size);
-
-    return N <= Curr;
-}
-
-static int same_prediction_block(HEVCLocalContext *lc, int log2_cb_size,
-                                 int x0, int y0, int nPbW, int nPbH,
-                                 int xA1, int yA1, int partIdx)
-{
-    return !(nPbW << 1 == 1 << log2_cb_size &&
-             nPbH << 1 == 1 << log2_cb_size && partIdx == 1 &&
-             lc->cu.x + nPbW > xA1 &&
-             lc->cu.y + nPbH <= yA1);
-}
+    s->ps.pps->min_tb_addr_zs[(y) * (s->ps.sps->tb_mask+2) + (x)] /* table row stride is tb_mask + 2 entries */
 
-/*
- * 6.4.2 Derivation process for prediction block availability
- */
-static int check_prediction_block_available(HEVCContext *s, int log2_cb_size,
-                                            int x0, int y0, int nPbW, int nPbH,
-                                            int xA1, int yA1, int partIdx)
-{
-    HEVCLocalContext *lc = &s->HEVClc;
-
-    if (lc->cu.x < xA1 && lc->cu.y < yA1 &&
-        (lc->cu.x + (1 << log2_cb_size)) > xA1 &&
-        (lc->cu.y + (1 << log2_cb_size)) > yA1)
-        return same_prediction_block(lc, log2_cb_size, x0, y0,
-                                     nPbW, nPbH, xA1, yA1, partIdx);
-    else
-        return z_scan_block_avail(s, x0, y0, xA1, yA1);
+    int xCurr_ctb = xCurr >> s->ps.sps->log2_ctb_size; // CTB column of the current block
+    int yCurr_ctb = yCurr >> s->ps.sps->log2_ctb_size; // CTB row of the current block
+    int xN_ctb    = xN    >> s->ps.sps->log2_ctb_size; // CTB column of the neighbour; NOTE(review): no picture-bounds check on (xN, yN) is done in this function
+    int yN_ctb    = yN    >> s->ps.sps->log2_ctb_size; // CTB row of the neighbour
+    if( yN_ctb < yCurr_ctb || xN_ctb < xCurr_ctb ) // neighbour sits in a CTB row above or a CTB column to the left
+        return 1; // reported available without consulting the address table
+    else {
+        int Curr = MIN_TB_ADDR_ZS((xCurr >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask, // table entry for the current block; coordinates in min-TB units, masked with tb_mask
+                (yCurr >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask);
+        int N    = MIN_TB_ADDR_ZS((xN >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask, // same lookup for the neighbour
+                (yN >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask);
+        return N <= Curr; // available when the neighbour's table value does not exceed the current block's (presumably z-scan order)
+    }
 }
 
 //check if the two luma locations belong to the same motion estimation region
-static int isDiffMER(HEVCContext *s, int xN, int yN, int xP, int yP)
+static av_always_inline int is_diff_mer(HEVCContext *s, int xN, int yN, int xP, int yP)
 {
     uint8_t plevel = s->ps.pps->log2_parallel_merge_level;
 
@@ -122,18 +94,20 @@ static int isDiffMER(HEVCContext *s, int xN, int yN, int xP, int yP)
 #define MATCH(x) (A.x == B.x)
 
 // check if the mv's and refidx are the same between A and B
-static int compareMVrefidx(struct MvField A, struct MvField B)
+static av_always_inline int compare_mv_ref_idx(struct MvField A, struct MvField B) /* nonzero when A and B share pred_flag and agree on ref_idx and MV for every list that flag uses */
 {
-    if (A.pred_flag[0] && A.pred_flag[1] && B.pred_flag[0] && B.pred_flag[1])
-        return MATCH(ref_idx[0]) && MATCH_MV(mv[0]) &&
-               MATCH(ref_idx[1]) && MATCH_MV(mv[1]);
-
-    if (A.pred_flag[0] && !A.pred_flag[1] && B.pred_flag[0] && !B.pred_flag[1])
-        return MATCH(ref_idx[0]) && MATCH_MV(mv[0]);
-
-    if (!A.pred_flag[0] && A.pred_flag[1] && !B.pred_flag[0] && B.pred_flag[1])
-        return MATCH(ref_idx[1]) && MATCH_MV(mv[1]);
-
+    int a_pf = A.pred_flag;
+    int b_pf = B.pred_flag;
+    if (a_pf == b_pf) { // candidates with different pred_flag values never match
+        if (a_pf == PF_BI) { // both lists in use: list 0 and list 1 must both agree
+            return MATCH(ref_idx[0]) && MATCH_MV(mv[0]) && // MATCH_MV is defined outside this hunk; presumably compares both MV components
+                   MATCH(ref_idx[1]) && MATCH_MV(mv[1]);
+        } else if (a_pf == PF_L0) { // list 0 only
+            return MATCH(ref_idx[0]) && MATCH_MV(mv[0]);
+        } else if (a_pf == PF_L1) { // list 1 only
+            return MATCH(ref_idx[1]) && MATCH_MV(mv[1]);
+        } // any other shared pred_flag value falls through to return 0
+    }
     return 0;
 }
 
@@ -144,11 +118,11 @@ static av_always_inline void mv_scale(Mv *dst, Mv *src, int td, int tb)
     td = av_clip_int8(td);
     tb = av_clip_int8(tb);
     tx = (0x4000 + abs(td / 2)) / td;
-    scale_factor = av_clip((tb * tx + 32) >> 6, -4096, 4095);
+    scale_factor = av_clip_intp2((tb * tx + 32) >> 6, 12);
     dst->x = av_clip_int16((scale_factor * src->x + 127 +
-                             (scale_factor * src->x < 0)) >> 8);
+                           (scale_factor * src->x < 0)) >> 8);
     dst->y = av_clip_int16((scale_factor * src->y + 127 +
-                             (scale_factor * src->y < 0)) >> 8);
+                           (scale_factor * src->y < 0)) >> 8);
 }
 
 static int check_mvset(Mv *mvLXCol, Mv *mvCol,
@@ -169,10 +143,7 @@ static int check_mvset(Mv *mvLXCol, Mv *mvCol,
     col_poc_diff = colPic - refPicList_col[listCol].list[refidxCol];
     cur_poc_diff = poc    - refPicList[X].list[refIdxLx];
 
-    if (!col_poc_diff)
-        col_poc_diff = 1;  // error resilience
-
-    if (cur_lt || col_poc_diff == cur_poc_diff) {
+    if (cur_lt || col_poc_diff == cur_poc_diff || !col_poc_diff) {
         mvLXCol->x = mvCol->x;
         mvLXCol->y = mvCol->y;
     } else {
@@ -194,32 +165,30 @@ static int derive_temporal_colocated_mvs(HEVCContext *s, MvField temp_col,
 {
     RefPicList *refPicList = s->ref->refPicList;
 
-    if (temp_col.is_intra) {
-        mvLXCol->x = 0;
-        mvLXCol->y = 0;
+    if (temp_col.pred_flag == PF_INTRA)
         return 0;
-    }
 
-    if (temp_col.pred_flag[0] == 0)
+    if (!(temp_col.pred_flag & PF_L0))
         return CHECK_MVSET(1);
-    else if (temp_col.pred_flag[0] == 1 && temp_col.pred_flag[1] == 0)
+    else if (temp_col.pred_flag == PF_L0)
         return CHECK_MVSET(0);
-    else if (temp_col.pred_flag[0] == 1 && temp_col.pred_flag[1] == 1) {
+    else if (temp_col.pred_flag == PF_BI) {
         int check_diffpicount = 0;
-        int i = 0;
-        for (i = 0; i < refPicList[0].nb_refs; i++) {
-            if (refPicList[0].list[i] > s->poc)
-                check_diffpicount++;
-        }
-        for (i = 0; i < refPicList[1].nb_refs; i++) {
-            if (refPicList[1].list[i] > s->poc)
-                check_diffpicount++;
+        int i, j;
+        for (j = 0; j < 2; j++) {
+            for (i = 0; i < refPicList[j].nb_refs; i++) {
+                if (refPicList[j].list[i] > s->poc) {
+                    check_diffpicount++;
+                    break;
+                }
+            }
         }
-        if (check_diffpicount == 0 && X == 0)
-            return CHECK_MVSET(0);
-        else if (check_diffpicount == 0 && X == 1)
-            return CHECK_MVSET(1);
-        else {
+        if (!check_diffpicount) {
+            if (X==0)
+                return CHECK_MVSET(0);
+            else
+                return CHECK_MVSET(1);
+        } else {
             if (s->sh.collocated_list == L1)
                 return CHECK_MVSET(0);
             else
@@ -234,7 +203,8 @@ static int derive_temporal_colocated_mvs(HEVCContext *s, MvField temp_col,
     tab_mvf[(y) * min_pu_width + x]
 
 #define TAB_MVF_PU(v)                                                   \
-    TAB_MVF(x ## v ## _pu, y ## v ## _pu)
+    TAB_MVF(((x ## v) >> s->ps.sps->log2_min_pu_size),                     \
+            ((y ## v) >> s->ps.sps->log2_min_pu_size))
 
 #define DERIVE_TEMPORAL_COLOCATED_MVS                                   \
     derive_temporal_colocated_mvs(s, temp_col,                          \
@@ -275,7 +245,8 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0,
         x < s->ps.sps->width) {
         x                 &= ~15;
         y                 &= ~15;
-        ff_thread_await_progress(&ref->tf, y, 0);
+        if (s->threads_type == FF_THREAD_FRAME)
+            ff_thread_await_progress(&ref->tf, y, 0);
         x_pu               = x >> s->ps.sps->log2_min_pu_size;
         y_pu               = y >> s->ps.sps->log2_min_pu_size;
         temp_col           = TAB_MVF(x_pu, y_pu);
@@ -288,7 +259,8 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0,
         y                  = y0 + (nPbH >> 1);
         x                 &= ~15;
         y                 &= ~15;
-        ff_thread_await_progress(&ref->tf, y, 0);
+        if (s->threads_type == FF_THREAD_FRAME)
+            ff_thread_await_progress(&ref->tf, y, 0);
         x_pu               = x >> s->ps.sps->log2_min_pu_size;
         y_pu               = y >> s->ps.sps->log2_min_pu_size;
         temp_col           = TAB_MVF(x_pu, y_pu);
@@ -298,15 +270,13 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0,
 }
 
 #define AVAILABLE(cand, v)                                      \
-    (cand && !TAB_MVF_PU(v).is_intra)
+    (cand && !(TAB_MVF_PU(v).pred_flag == PF_INTRA))
 
 #define PRED_BLOCK_AVAILABLE(v)                                 \
-    check_prediction_block_available(s, log2_cb_size,           \
-                                     x0, y0, nPbW, nPbH,        \
-                                     x ## v, y ## v, part_idx)
+    z_scan_block_avail(s, x0, y0, x ## v, y ## v)
 
 #define COMPARE_MV_REFIDX(a, b)                                 \
-    compareMVrefidx(TAB_MVF_PU(a), TAB_MVF_PU(b))
+    compare_mv_ref_idx(TAB_MVF_PU(a), TAB_MVF_PU(b))
 
 /*
  * 8.5.3.1.2  Derivation process for spatial merging candidates
@@ -318,7 +288,7 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
                                             int merge_idx,
                                             struct MvField mergecandlist[])
 {
-    HEVCLocalContext *lc   = &s->HEVClc;
+    HEVCLocalContext *lc   = s->HEVClc;
     RefPicList *refPicList = s->ref->refPicList;
     MvField *tab_mvf       = s->ref->tab_mvf;
 
@@ -332,33 +302,21 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
 
     const int xA1    = x0 - 1;
     const int yA1    = y0 + nPbH - 1;
-    const int xA1_pu = xA1 >> s->ps.sps->log2_min_pu_size;
-    const int yA1_pu = yA1 >> s->ps.sps->log2_min_pu_size;
 
     const int xB1    = x0 + nPbW - 1;
     const int yB1    = y0 - 1;
-    const int xB1_pu = xB1 >> s->ps.sps->log2_min_pu_size;
-    const int yB1_pu = yB1 >> s->ps.sps->log2_min_pu_size;
 
     const int xB0    = x0 + nPbW;
     const int yB0    = y0 - 1;
-    const int xB0_pu = xB0 >> s->ps.sps->log2_min_pu_size;
-    const int yB0_pu = yB0 >> s->ps.sps->log2_min_pu_size;
 
     const int xA0    = x0 - 1;
     const int yA0    = y0 + nPbH;
-    const int xA0_pu = xA0 >> s->ps.sps->log2_min_pu_size;
-    const int yA0_pu = yA0 >> s->ps.sps->log2_min_pu_size;
 
     const int xB2    = x0 - 1;
     const int yB2    = y0 - 1;
-    const int xB2_pu = xB2 >> s->ps.sps->log2_min_pu_size;
-    const int yB2_pu = yB2 >> s->ps.sps->log2_min_pu_size;
 
     const int nb_refs = (s->sh.slice_type == P_SLICE) ?
                         s->sh.nb_refs[0] : FFMIN(s->sh.nb_refs[0], s->sh.nb_refs[1]);
-    int check_MER   = 1;
-    int check_MER_1 = 1;
 
     int zero_idx = 0;
 
@@ -370,57 +328,49 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
     int is_available_b0;
     int is_available_b1;
     int is_available_b2;
-    int check_B0;
-    int check_A0;
 
-    //first left spatial merge candidate
-    is_available_a1 = AVAILABLE(cand_left, A1);
 
     if (!singleMCLFlag && part_idx == 1 &&
         (lc->cu.part_mode == PART_Nx2N ||
          lc->cu.part_mode == PART_nLx2N ||
          lc->cu.part_mode == PART_nRx2N) ||
-        isDiffMER(s, xA1, yA1, x0, y0)) {
+        is_diff_mer(s, xA1, yA1, x0, y0)) {
         is_available_a1 = 0;
+    } else {
+        is_available_a1 = AVAILABLE(cand_left, A1);
+        if (is_available_a1) {
+            mergecandlist[nb_merge_cand] = TAB_MVF_PU(A1);
+            if (merge_idx == 0)
+                return;
+            nb_merge_cand++;
+        }
     }
 
-    if (is_available_a1) {
-        mergecandlist[0] = TAB_MVF_PU(A1);
-        if (merge_idx == 0)
-            return;
-        nb_merge_cand++;
-    }
-
-    // above spatial merge candidate
-    is_available_b1 = AVAILABLE(cand_up, B1);
-
     if (!singleMCLFlag && part_idx == 1 &&
         (lc->cu.part_mode == PART_2NxN ||
          lc->cu.part_mode == PART_2NxnU ||
          lc->cu.part_mode == PART_2NxnD) ||
-        isDiffMER(s, xB1, yB1, x0, y0)) {
+        is_diff_mer(s, xB1, yB1, x0, y0)) {
         is_available_b1 = 0;
+    } else {
+        is_available_b1 = AVAILABLE(cand_up, B1);
+        if (is_available_b1 &&
+            !(is_available_a1 && COMPARE_MV_REFIDX(B1, A1))) {
+            mergecandlist[nb_merge_cand] = TAB_MVF_PU(B1);
+            if (merge_idx == nb_merge_cand)
+                return;
+            nb_merge_cand++;
+        }
     }
 
-    if (is_available_a1 && is_available_b1)
-        check_MER = !COMPARE_MV_REFIDX(B1, A1);
-
-    if (is_available_b1 && check_MER)
-        mergecandlist[nb_merge_cand++] = TAB_MVF_PU(B1);
-
     // above right spatial merge candidate
-    check_MER = 1;
-    check_B0  = PRED_BLOCK_AVAILABLE(B0);
-
-    is_available_b0 = check_B0 && AVAILABLE(cand_up_right, B0);
+    is_available_b0 = AVAILABLE(cand_up_right, B0) &&
+                      xB0 < s->ps.sps->width &&
+                      PRED_BLOCK_AVAILABLE(B0) &&
+                      !is_diff_mer(s, xB0, yB0, x0, y0);
 
-    if (isDiffMER(s, xB0, yB0, x0, y0))
-        is_available_b0 = 0;
-
-    if (is_available_b1 && is_available_b0)
-        check_MER = !COMPARE_MV_REFIDX(B0, B1);
-
-    if (is_available_b0 && check_MER) {
+    if (is_available_b0 &&
+        !(is_available_b1 && COMPARE_MV_REFIDX(B0, B1))) {
         mergecandlist[nb_merge_cand] = TAB_MVF_PU(B0);
         if (merge_idx == nb_merge_cand)
             return;
@@ -428,18 +378,13 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
     }
 
     // left bottom spatial merge candidate
-    check_MER = 1;
-    check_A0  = PRED_BLOCK_AVAILABLE(A0);
-
-    is_available_a0 = check_A0 && AVAILABLE(cand_bottom_left, A0);
+    is_available_a0 = AVAILABLE(cand_bottom_left, A0) &&
+                      yA0 < s->ps.sps->height &&
+                      PRED_BLOCK_AVAILABLE(A0) &&
+                      !is_diff_mer(s, xA0, yA0, x0, y0);
 
-    if (isDiffMER(s, xA0, yA0, x0, y0))
-        is_available_a0 = 0;
-
-    if (is_available_a1 && is_available_a0)
-        check_MER = !COMPARE_MV_REFIDX(A0, A1);
-
-    if (is_available_a0 && check_MER) {
+    if (is_available_a0 &&
+        !(is_available_a1 && COMPARE_MV_REFIDX(A0, A1))) {
         mergecandlist[nb_merge_cand] = TAB_MVF_PU(A0);
         if (merge_idx == nb_merge_cand)
             return;
@@ -447,20 +392,13 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
     }
 
     // above left spatial merge candidate
-    check_MER = 1;
+    is_available_b2 = AVAILABLE(cand_up_left, B2) &&
+                      !is_diff_mer(s, xB2, yB2, x0, y0);
 
-    is_available_b2 = AVAILABLE(cand_up_left, B2);
-
-    if (isDiffMER(s, xB2, yB2, x0, y0))
-        is_available_b2 = 0;
-
-    if (is_available_a1 && is_available_b2)
-        check_MER = !COMPARE_MV_REFIDX(B2, A1);
-
-    if (is_available_b1 && is_available_b2)
-        check_MER_1 = !COMPARE_MV_REFIDX(B2, B1);
-
-    if (is_available_b2 && check_MER && check_MER_1 && nb_merge_cand != 4) {
+    if (is_available_b2 &&
+        !(is_available_a1 && COMPARE_MV_REFIDX(B2, A1)) &&
+        !(is_available_b1 && COMPARE_MV_REFIDX(B2, B1)) &&
+        nb_merge_cand != 4) {
         mergecandlist[nb_merge_cand] = TAB_MVF_PU(B2);
         if (merge_idx == nb_merge_cand)
             return;
@@ -478,9 +416,7 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
                                                        0, &mv_l1_col, 1) : 0;
 
         if (available_l0 || available_l1) {
-            mergecandlist[nb_merge_cand].is_intra     = 0;
-            mergecandlist[nb_merge_cand].pred_flag[0] = available_l0;
-            mergecandlist[nb_merge_cand].pred_flag[1] = available_l1;
+            mergecandlist[nb_merge_cand].pred_flag = available_l0 + (available_l1 << 1);
             AV_ZERO16(mergecandlist[nb_merge_cand].ref_idx);
             mergecandlist[nb_merge_cand].mv[0]      = mv_l0_col;
             mergecandlist[nb_merge_cand].mv[1]      = mv_l1_col;
@@ -496,7 +432,7 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
     // combined bi-predictive merge candidates  (applies for B slices)
     if (s->sh.slice_type == B_SLICE && nb_orig_merge_cand > 1 &&
         nb_orig_merge_cand < s->sh.max_num_merge_cand) {
-        int comb_idx;
+        int comb_idx = 0;
 
         for (comb_idx = 0; nb_merge_cand < s->sh.max_num_merge_cand &&
                            comb_idx < nb_orig_merge_cand * (nb_orig_merge_cand - 1); comb_idx++) {
@@ -505,17 +441,15 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
             MvField l0_cand = mergecandlist[l0_cand_idx];
             MvField l1_cand = mergecandlist[l1_cand_idx];
 
-            if (l0_cand.pred_flag[0] && l1_cand.pred_flag[1] &&
+            if ((l0_cand.pred_flag & PF_L0) && (l1_cand.pred_flag & PF_L1) &&
                 (refPicList[0].list[l0_cand.ref_idx[0]] !=
                  refPicList[1].list[l1_cand.ref_idx[1]] ||
                  AV_RN32A(&l0_cand.mv[0]) != AV_RN32A(&l1_cand.mv[1]))) {
                 mergecandlist[nb_merge_cand].ref_idx[0]   = l0_cand.ref_idx[0];
                 mergecandlist[nb_merge_cand].ref_idx[1]   = l1_cand.ref_idx[1];
-                mergecandlist[nb_merge_cand].pred_flag[0] = 1;
-                mergecandlist[nb_merge_cand].pred_flag[1] = 1;
+                mergecandlist[nb_merge_cand].pred_flag    = PF_BI;
                 AV_COPY32(&mergecandlist[nb_merge_cand].mv[0], &l0_cand.mv[0]);
                 AV_COPY32(&mergecandlist[nb_merge_cand].mv[1], &l1_cand.mv[1]);
-                mergecandlist[nb_merge_cand].is_intra     = 0;
                 if (merge_idx == nb_merge_cand)
                     return;
                 nb_merge_cand++;
@@ -525,11 +459,9 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
 
     // append Zero motion vector candidates
     while (nb_merge_cand < s->sh.max_num_merge_cand) {
-        mergecandlist[nb_merge_cand].pred_flag[0] = 1;
-        mergecandlist[nb_merge_cand].pred_flag[1] = s->sh.slice_type == B_SLICE;
+        mergecandlist[nb_merge_cand].pred_flag    = PF_L0 + ((s->sh.slice_type == B_SLICE) << 1);
         AV_ZERO32(mergecandlist[nb_merge_cand].mv + 0);
         AV_ZERO32(mergecandlist[nb_merge_cand].mv + 1);
-        mergecandlist[nb_merge_cand].is_intra     = 0;
         mergecandlist[nb_merge_cand].ref_idx[0]   = zero_idx < nb_refs ? zero_idx : 0;
         mergecandlist[nb_merge_cand].ref_idx[1]   = zero_idx < nb_refs ? zero_idx : 0;
 
@@ -552,7 +484,7 @@ void ff_hevc_luma_mv_merge_mode(HEVCContext *s, int x0, int y0, int nPbW,
     LOCAL_ALIGNED(4, MvField, mergecand_list, [MRG_MAX_NUM_CANDS]);
     int nPbW2 = nPbW;
     int nPbH2 = nPbH;
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
 
     if (s->ps.pps->log2_parallel_merge_level > 2 && nCS == 8) {
         singleMCLFlag = 1;
@@ -568,11 +500,9 @@ void ff_hevc_luma_mv_merge_mode(HEVCContext *s, int x0, int y0, int nPbW,
                                     singleMCLFlag, part_idx,
                                     merge_idx, mergecand_list);
 
-    if (mergecand_list[merge_idx].pred_flag[0] == 1 &&
-        mergecand_list[merge_idx].pred_flag[1] == 1 &&
+    if (mergecand_list[merge_idx].pred_flag == PF_BI &&
         (nPbW2 + nPbH2) == 12) {
-        mergecand_list[merge_idx].ref_idx[1]   = -1;
-        mergecand_list[merge_idx].pred_flag[1] = 0;
+        mergecand_list[merge_idx].pred_flag = PF_L0;
     }
 
     *mv = mergecand_list[merge_idx];
@@ -603,7 +533,7 @@ static int mv_mp_mode_mx(HEVCContext *s, int x, int y, int pred_flag_index,
 
     RefPicList *refPicList = s->ref->refPicList;
 
-    if (TAB_MVF(x, y).pred_flag[pred_flag_index] == 1 &&
+    if (((TAB_MVF(x, y).pred_flag) & (1 << pred_flag_index)) &&
         refPicList[pred_flag_index].list[TAB_MVF(x, y).ref_idx[pred_flag_index]] == refPicList[ref_idx_curr].list[ref_idx]) {
         *mv = TAB_MVF(x, y).mv[pred_flag_index];
         return 1;
@@ -618,82 +548,73 @@ static int mv_mp_mode_mx_lt(HEVCContext *s, int x, int y, int pred_flag_index,
     int min_pu_width = s->ps.sps->min_pu_width;
 
     RefPicList *refPicList = s->ref->refPicList;
-    int currIsLongTerm     = refPicList[ref_idx_curr].isLongTerm[ref_idx];
 
-    int colIsLongTerm =
-        refPicList[pred_flag_index].isLongTerm[(TAB_MVF(x, y).ref_idx[pred_flag_index])];
+    if ((TAB_MVF(x, y).pred_flag) & (1 << pred_flag_index)) {
+        int currIsLongTerm     = refPicList[ref_idx_curr].isLongTerm[ref_idx];
 
-    if (TAB_MVF(x, y).pred_flag[pred_flag_index] &&
-        colIsLongTerm == currIsLongTerm) {
-        *mv = TAB_MVF(x, y).mv[pred_flag_index];
-        if (!currIsLongTerm)
-            dist_scale(s, mv, min_pu_width, x, y,
-                       pred_flag_index, ref_idx_curr, ref_idx);
-        return 1;
+        int colIsLongTerm =
+            refPicList[pred_flag_index].isLongTerm[(TAB_MVF(x, y).ref_idx[pred_flag_index])];
+
+        if (colIsLongTerm == currIsLongTerm) {
+            *mv = TAB_MVF(x, y).mv[pred_flag_index];
+            if (!currIsLongTerm)
+                dist_scale(s, mv, min_pu_width, x, y,
+                           pred_flag_index, ref_idx_curr, ref_idx);
+            return 1;
+        }
     }
     return 0;
 }
 
 #define MP_MX(v, pred, mx)                                      \
-    mv_mp_mode_mx(s, x ## v ## _pu, y ## v ## _pu, pred,        \
-                  &mx, ref_idx_curr, ref_idx)
+    mv_mp_mode_mx(s,                                            \
+                  (x ## v) >> s->ps.sps->log2_min_pu_size,         \
+                  (y ## v) >> s->ps.sps->log2_min_pu_size,         \
+                  pred, &mx, ref_idx_curr, ref_idx)
 
 #define MP_MX_LT(v, pred, mx)                                   \
-    mv_mp_mode_mx_lt(s, x ## v ## _pu, y ## v ## _pu, pred,     \
-                     &mx, ref_idx_curr, ref_idx)
+    mv_mp_mode_mx_lt(s,                                         \
+                     (x ## v) >> s->ps.sps->log2_min_pu_size,      \
+                     (y ## v) >> s->ps.sps->log2_min_pu_size,      \
+                     pred, &mx, ref_idx_curr, ref_idx)
 
 void ff_hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
                               int nPbH, int log2_cb_size, int part_idx,
                               int merge_idx, MvField *mv,
                               int mvp_lx_flag, int LX)
 {
-    HEVCLocalContext *lc = &s->HEVClc;
+    HEVCLocalContext *lc = s->HEVClc;
     MvField *tab_mvf = s->ref->tab_mvf;
     int isScaledFlag_L0 = 0;
-    int availableFlagLXA0 = 0;
-    int availableFlagLXB0 = 0;
+    int availableFlagLXA0 = 1;
+    int availableFlagLXB0 = 1;
     int numMVPCandLX = 0;
     int min_pu_width = s->ps.sps->min_pu_width;
 
     int xA0, yA0;
-    int xA0_pu, yA0_pu;
     int is_available_a0;
-
     int xA1, yA1;
-    int xA1_pu, yA1_pu;
     int is_available_a1;
-
     int xB0, yB0;
-    int xB0_pu, yB0_pu;
     int is_available_b0;
-
     int xB1, yB1;
-    int xB1_pu = 0, yB1_pu = 0;
-    int is_available_b1 = 0;
-
+    int is_available_b1;
     int xB2, yB2;
-    int xB2_pu = 0, yB2_pu = 0;
-    int is_available_b2 = 0;
+    int is_available_b2;
+
     Mv mvpcand_list[2] = { { 0 } };
-    Mv mxA = { 0 };
-    Mv mxB = { 0 };
-    int ref_idx_curr = 0;
+    Mv mxA;
+    Mv mxB;
+    int ref_idx_curr;
     int ref_idx = 0;
     int pred_flag_index_l0;
     int pred_flag_index_l1;
-    int x0b = x0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
-    int y0b = y0 & ((1 << s->ps.sps->log2_ctb_size) - 1);
-
-    int cand_up = (lc->ctb_up_flag || y0b);
-    int cand_left = (lc->ctb_left_flag || x0b);
-    int cand_up_left =
-            (!x0b && !y0b) ? lc->ctb_up_left_flag : cand_left && cand_up;
-    int cand_up_right =
-            (x0b + nPbW == (1 << s->ps.sps->log2_ctb_size) ||
-             x0  + nPbW >= lc->end_of_tiles_x) ? lc->ctb_up_right_flag && !y0b
-                                               : cand_up;
-    int cand_bottom_left = (y0 + nPbH >= lc->end_of_tiles_y) ? 0 : cand_left;
 
+    const int cand_bottom_left = lc->na.cand_bottom_left;
+    const int cand_left        = lc->na.cand_left;
+    const int cand_up_left     = lc->na.cand_up_left;
+    const int cand_up          = lc->na.cand_up;
+    const int cand_up_right    = lc->na.cand_up_right_sap;
     ref_idx_curr       = LX;
     ref_idx            = mv->ref_idx[LX];
     pred_flag_index_l0 = LX;
@@ -702,97 +623,109 @@ void ff_hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
     // left bottom spatial candidate
     xA0 = x0 - 1;
     yA0 = y0 + nPbH;
-    xA0_pu = xA0 >> s->ps.sps->log2_min_pu_size;
-    yA0_pu = yA0 >> s->ps.sps->log2_min_pu_size;
 
-    is_available_a0 = PRED_BLOCK_AVAILABLE(A0) && AVAILABLE(cand_bottom_left, A0);
+    is_available_a0 = AVAILABLE(cand_bottom_left, A0) &&
+                      yA0 < s->ps.sps->height &&
+                      PRED_BLOCK_AVAILABLE(A0);
 
     //left spatial merge candidate
     xA1    = x0 - 1;
     yA1    = y0 + nPbH - 1;
-    xA1_pu = xA1 >> s->ps.sps->log2_min_pu_size;
-    yA1_pu = yA1 >> s->ps.sps->log2_min_pu_size;
 
     is_available_a1 = AVAILABLE(cand_left, A1);
     if (is_available_a0 || is_available_a1)
         isScaledFlag_L0 = 1;
 
     if (is_available_a0) {
-        availableFlagLXA0 = MP_MX(A0, pred_flag_index_l0, mxA);
-        if (!availableFlagLXA0)
-            availableFlagLXA0 = MP_MX(A0, pred_flag_index_l1, mxA);
-    }
-
-    if (is_available_a1 && !availableFlagLXA0) {
-        availableFlagLXA0 = MP_MX(A1, pred_flag_index_l0, mxA);
-        if (!availableFlagLXA0)
-            availableFlagLXA0 = MP_MX(A1, pred_flag_index_l1, mxA);
+        if (MP_MX(A0, pred_flag_index_l0, mxA)) {
+            goto b_candidates;
+        }
+        if (MP_MX(A0, pred_flag_index_l1, mxA)) {
+            goto b_candidates;
+        }
     }
 
-    if (is_available_a0 && !availableFlagLXA0) {
-        availableFlagLXA0 = MP_MX_LT(A0, pred_flag_index_l0, mxA);
-        if (!availableFlagLXA0)
-            availableFlagLXA0 = MP_MX_LT(A0, pred_flag_index_l1, mxA);
+    if (is_available_a1) {
+        if (MP_MX(A1, pred_flag_index_l0, mxA)) {
+            goto b_candidates;
+        }
+        if (MP_MX(A1, pred_flag_index_l1, mxA)) {
+            goto b_candidates;
+        }
     }
 
-    if (is_available_a1 && !availableFlagLXA0) {
-        availableFlagLXA0 = MP_MX_LT(A1, pred_flag_index_l0, mxA);
-        if (!availableFlagLXA0)
-            availableFlagLXA0 = MP_MX_LT(A1, pred_flag_index_l1, mxA);
+    if (is_available_a0) {
+        if (MP_MX_LT(A0, pred_flag_index_l0, mxA)) {
+            goto b_candidates;
+        }
+        if (MP_MX_LT(A0, pred_flag_index_l1, mxA)) {
+            goto b_candidates;
+        }
     }
 
-    if (availableFlagLXA0 && !mvp_lx_flag) {
-        mv->mv[LX] = mxA;
-        return;
+    if (is_available_a1) {
+        if (MP_MX_LT(A1, pred_flag_index_l0, mxA)) {
+            goto b_candidates;
+        }
+        if (MP_MX_LT(A1, pred_flag_index_l1, mxA)) {
+            goto b_candidates;
+        }
     }
+    availableFlagLXA0 = 0;
 
+b_candidates:
     // B candidates
     // above right spatial merge candidate
     xB0    = x0 + nPbW;
     yB0    = y0 - 1;
-    xB0_pu = xB0 >> s->ps.sps->log2_min_pu_size;
-    yB0_pu = yB0 >> s->ps.sps->log2_min_pu_size;
 
-    is_available_b0 = PRED_BLOCK_AVAILABLE(B0) && AVAILABLE(cand_up_right, B0);
+    is_available_b0 =  AVAILABLE(cand_up_right, B0) &&
+                       xB0 < s->ps.sps->width &&
+                       PRED_BLOCK_AVAILABLE(B0);
 
-    if (is_available_b0) {
-        availableFlagLXB0 = MP_MX(B0, pred_flag_index_l0, mxB);
-        if (!availableFlagLXB0)
-            availableFlagLXB0 = MP_MX(B0, pred_flag_index_l1, mxB);
-    }
-
-    if (!availableFlagLXB0) {
-        // above spatial merge candidate
-        xB1    = x0 + nPbW - 1;
-        yB1    = y0 - 1;
-        xB1_pu = xB1 >> s->ps.sps->log2_min_pu_size;
-        yB1_pu = yB1 >> s->ps.sps->log2_min_pu_size;
+    // above spatial merge candidate
+    xB1    = x0 + nPbW - 1;
+    yB1    = y0 - 1;
+    is_available_b1 = AVAILABLE(cand_up, B1);
 
-        is_available_b1 = AVAILABLE(cand_up, B1);
+    // above left spatial merge candidate
+    xB2 = x0 - 1;
+    yB2 = y0 - 1;
+    is_available_b2 = AVAILABLE(cand_up_left, B2);
 
-        if (is_available_b1) {
-            availableFlagLXB0 = MP_MX(B1, pred_flag_index_l0, mxB);
-            if (!availableFlagLXB0)
-                availableFlagLXB0 = MP_MX(B1, pred_flag_index_l1, mxB);
+    // above right spatial merge candidate
+    if (is_available_b0) {
+        if (MP_MX(B0, pred_flag_index_l0, mxB)) {
+            goto scalef;
+        }
+        if (MP_MX(B0, pred_flag_index_l1, mxB)) {
+            goto scalef;
         }
     }
 
-    if (!availableFlagLXB0) {
-        // above left spatial merge candidate
-        xB2 = x0 - 1;
-        yB2 = y0 - 1;
-        xB2_pu = xB2 >> s->ps.sps->log2_min_pu_size;
-        yB2_pu = yB2 >> s->ps.sps->log2_min_pu_size;
-        is_available_b2 = AVAILABLE(cand_up_left, B2);
+    // above spatial merge candidate
+    if (is_available_b1) {
+        if (MP_MX(B1, pred_flag_index_l0, mxB)) {
+            goto scalef;
+        }
+        if (MP_MX(B1, pred_flag_index_l1, mxB)) {
+            goto scalef;
+        }
+    }
 
-        if (is_available_b2) {
-            availableFlagLXB0 = MP_MX(B2, pred_flag_index_l0, mxB);
-            if (!availableFlagLXB0)
-                availableFlagLXB0 = MP_MX(B2, pred_flag_index_l1, mxB);
+    // above left spatial merge candidate
+    if (is_available_b2) {
+        if (MP_MX(B2, pred_flag_index_l0, mxB)) {
+            goto scalef;
+        }
+        if (MP_MX(B2, pred_flag_index_l1, mxB)) {
+            goto scalef;
         }
     }
+    availableFlagLXB0 = 0;
 
-    if (isScaledFlag_L0 == 0) {
+scalef:
+    if (!isScaledFlag_L0) {
         if (availableFlagLXB0) {
             availableFlagLXA0 = 1;
             mxA = mxB;
@@ -836,10 +769,5 @@ void ff_hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
             mvpcand_list[numMVPCandLX++] = mv_col;
     }
 
-    // insert zero motion vectors when the number of available candidates are less than 2
-    while (numMVPCandLX < 2)
-        mvpcand_list[numMVPCandLX++] = (Mv){ 0, 0 };
-
-    mv->mv[LX].x = mvpcand_list[mvp_lx_flag].x;
-    mv->mv[LX].y = mvpcand_list[mvp_lx_flag].y;
+    mv->mv[LX] = mvpcand_list[mvp_lx_flag];
 }
diff --git a/libavcodec/hevc_parser.c b/libavcodec/hevc_parser.c
index dc5fffc..b5633f1 100644
--- a/libavcodec/hevc_parser.c
+++ b/libavcodec/hevc_parser.c
@@ -3,20 +3,20 @@
  *
  * Copyright (C) 2012 - 2013 Guillaume Martres
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,6 +31,8 @@
 
 #define IS_IRAP_NAL(nal) (nal->type >= 16 && nal->type <= 23)
 
+#define ADVANCED_PARSER CONFIG_HEVC_DECODER
+
 typedef struct HEVCParserContext {
     ParseContext pc;
 
@@ -38,8 +40,13 @@ typedef struct HEVCParserContext {
     HEVCParamSets ps;
 
     int parsed_extradata;
+
+#if ADVANCED_PARSER
+    HEVCContext h;
+#endif
 } HEVCParserContext;
 
+#if !ADVANCED_PARSER
 static int hevc_parse_slice_header(AVCodecParserContext *s, H2645NAL *nal,
                                    AVCodecContext *avctx)
 {
@@ -110,12 +117,19 @@ static int parse_nal_units(AVCodecParserContext *s, const uint8_t *buf,
         case NAL_RADL_N:
         case NAL_RADL_R:
         case NAL_RASL_N:
-        case NAL_RASL_R: hevc_parse_slice_header(s, nal, avctx); break;
+        case NAL_RASL_R:
+            if (buf == avctx->extradata) {
+                av_log(avctx, AV_LOG_ERROR, "Invalid NAL unit: %d\n", nal->type);
+                return AVERROR_INVALIDDATA;
+            }
+            hevc_parse_slice_header(s, nal, avctx);
+            break;
         }
     }
 
     return 0;
 }
+#endif
 
 /**
  * Find the end of the current frame in the bitstream.
@@ -124,9 +138,8 @@ static int parse_nal_units(AVCodecParserContext *s, const uint8_t *buf,
 static int hevc_find_frame_end(AVCodecParserContext *s, const uint8_t *buf,
                                int buf_size)
 {
-    HEVCParserContext *ctx = s->priv_data;
-    ParseContext       *pc = &ctx->pc;
     int i;
+    ParseContext *pc = s->priv_data;
 
     for (i = 0; i < buf_size; i++) {
         int nut;
@@ -150,7 +163,6 @@ static int hevc_find_frame_end(AVCodecParserContext *s, const uint8_t *buf,
             if (first_slice_segment_in_pic_flag) {
                 if (!pc->frame_start_found) {
                     pc->frame_start_found = 1;
-                    s->key_frame = nut >= NAL_BLA_W_LP && nut <= NAL_CRA_NUT;
                 } else { // First slice of next frame found
                     pc->frame_start_found = 0;
                     return i - 5;
@@ -162,12 +174,219 @@ static int hevc_find_frame_end(AVCodecParserContext *s, const uint8_t *buf,
     return END_NOT_FOUND;
 }
 
-static int hevc_parse(AVCodecParserContext *s, AVCodecContext *avctx,
+#if ADVANCED_PARSER
+/**
+ * Parse NAL units of found picture and decode some basic information.
+ * @param s parser context.
+ * @param avctx codec context.
+ * @param buf buffer with field/frame data.
+ * @param buf_size size of the buffer.
+ * @return 0 if buf_size is 0 or a slice header was parsed, negative otherwise.
+ */
+static inline int parse_nal_units(AVCodecParserContext *s, const uint8_t *buf,
+                           int buf_size, AVCodecContext *avctx)
+{
+    HEVCParserContext *ctx = s->priv_data;
+    HEVCContext       *h   = &ctx->h;
+    GetBitContext      *gb;
+    SliceHeader        *sh = &h->sh;
+    HEVCParamSets *ps = &h->ps;
+    H2645Packet   *pkt = &ctx->pkt;
+    const uint8_t *buf_end = buf + buf_size;
+    int state = -1, i;
+    H2645NAL *nal;
+    int is_global = buf == avctx->extradata; // input is the codec extradata, not packet data
+
+    if (!h->HEVClc) // allocated on first use; released in hevc_parser_close()
+        h->HEVClc = av_mallocz(sizeof(HEVCLocalContext));
+    if (!h->HEVClc)
+        return AVERROR(ENOMEM);
+
+    gb = &h->HEVClc->gb;
+
+    /* set some sane default values */
+    s->pict_type         = AV_PICTURE_TYPE_I;
+    s->key_frame         = 0;
+    s->picture_structure = AV_PICTURE_STRUCTURE_UNKNOWN;
+
+    h->avctx = avctx;
+
+    ff_hevc_reset_sei(h);
+
+    if (!buf_size)
+        return 0;
+
+    if (pkt->nals_allocated < 1) { // only one scratch NAL entry (nals[0]) is used below
+        H2645NAL *tmp = av_realloc_array(pkt->nals, 1, sizeof(*tmp));
+        if (!tmp)
+            return AVERROR(ENOMEM);
+        pkt->nals = tmp;
+        memset(pkt->nals, 0, sizeof(*tmp));
+        pkt->nals_allocated = 1;
+    }
+
+    nal = &pkt->nals[0];
+
+    for (;;) {
+        int src_length, consumed;
+        int ret;
+        buf = avpriv_find_start_code(buf, buf_end, &state);
+        if (--buf + 2 >= buf_end) // stop when fewer than 3 bytes remain from the header byte
+            break;
+        src_length = buf_end - buf;
+
+        h->nal_unit_type = (*buf >> 1) & 0x3f; // bits 1..6 of the first header byte
+        h->temporal_id   = (*(buf + 1) & 0x07) - 1; // low 3 bits of the second header byte, minus 1
+        if (h->nal_unit_type <= NAL_CRA_NUT) {
+            // Do not walk the whole buffer just to decode slice segment header
+            if (src_length > 20)
+                src_length = 20;
+        }
+
+        consumed = ff_h2645_extract_rbsp(buf, src_length, nal);
+        if (consumed < 0)
+            return consumed;
+
+        ret = init_get_bits8(gb, nal->data + 2, nal->size); // starts 2 bytes in; NOTE(review): length is nal->size, not nal->size - 2 -- confirm
+        if (ret < 0)
+            return ret;
+
+        switch (h->nal_unit_type) {
+        case NAL_VPS:
+            ff_hevc_decode_nal_vps(gb, avctx, ps); // return values of the VPS/SPS/PPS/SEI decoders are not checked
+            break;
+        case NAL_SPS:
+            ff_hevc_decode_nal_sps(gb, avctx, ps, 1);
+            break;
+        case NAL_PPS:
+            ff_hevc_decode_nal_pps(gb, avctx, ps);
+            break;
+        case NAL_SEI_PREFIX:
+        case NAL_SEI_SUFFIX:
+            ff_hevc_decode_nal_sei(h);
+            break;
+        case NAL_TRAIL_N:
+        case NAL_TRAIL_R:
+        case NAL_TSA_N:
+        case NAL_TSA_R:
+        case NAL_STSA_N:
+        case NAL_STSA_R:
+        case NAL_RADL_N:
+        case NAL_RADL_R:
+        case NAL_RASL_N:
+        case NAL_RASL_R:
+        case NAL_BLA_W_LP:
+        case NAL_BLA_W_RADL:
+        case NAL_BLA_N_LP:
+        case NAL_IDR_W_RADL:
+        case NAL_IDR_N_LP:
+        case NAL_CRA_NUT:
+
+            if (is_global) { // slice NAL units are rejected when parsing extradata
+                av_log(avctx, AV_LOG_ERROR, "Invalid NAL unit: %d\n", h->nal_unit_type);
+                return AVERROR_INVALIDDATA;
+            }
+
+            sh->first_slice_in_pic_flag = get_bits1(gb);
+            s->picture_structure = h->picture_struct;
+            s->field_order = h->picture_struct;
+
+            if (IS_IRAP(h)) {
+                s->key_frame = 1;
+                sh->no_output_of_prior_pics_flag = get_bits1(gb);
+            }
+
+            sh->pps_id = get_ue_golomb(gb);
+            if (sh->pps_id >= MAX_PPS_COUNT || !ps->pps_list[sh->pps_id]) {
+                av_log(avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", sh->pps_id);
+                return AVERROR_INVALIDDATA;
+            }
+            ps->pps = (HEVCPPS*)ps->pps_list[sh->pps_id]->data;
+
+            if (ps->pps->sps_id >= MAX_SPS_COUNT || !ps->sps_list[ps->pps->sps_id]) {
+                av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", ps->pps->sps_id);
+                return AVERROR_INVALIDDATA;
+            }
+            if (ps->sps != (HEVCSPS*)ps->sps_list[ps->pps->sps_id]->data) { // update sps/vps only when the PPS refers to a different SPS
+                ps->sps = (HEVCSPS*)ps->sps_list[ps->pps->sps_id]->data;
+                ps->vps = (HEVCVPS*)ps->vps_list[ps->sps->vps_id]->data;
+            }
+
+            if (!sh->first_slice_in_pic_flag) {
+                int slice_address_length;
+
+                if (ps->pps->dependent_slice_segments_enabled_flag)
+                    sh->dependent_slice_segment_flag = get_bits1(gb);
+                else
+                    sh->dependent_slice_segment_flag = 0;
+
+                slice_address_length = av_ceil_log2_c(ps->sps->ctb_width *
+                                                      ps->sps->ctb_height);
+                sh->slice_segment_addr = get_bitsz(gb, slice_address_length);
+                if (sh->slice_segment_addr >= ps->sps->ctb_width * ps->sps->ctb_height) {
+                    av_log(avctx, AV_LOG_ERROR, "Invalid slice segment address: %u.\n",
+                           sh->slice_segment_addr);
+                    return AVERROR_INVALIDDATA;
+                }
+            } else
+                sh->dependent_slice_segment_flag = 0;
+
+            if (sh->dependent_slice_segment_flag)
+                break; // leaves the switch only; the loop goes on to the next NAL unit
+
+            for (i = 0; i < ps->pps->num_extra_slice_header_bits; i++)
+                skip_bits(gb, 1); // slice_reserved_undetermined_flag[]
+
+            sh->slice_type = get_ue_golomb(gb);
+            if (!(sh->slice_type == I_SLICE || sh->slice_type == P_SLICE ||
+                  sh->slice_type == B_SLICE)) {
+                av_log(avctx, AV_LOG_ERROR, "Unknown slice type: %d.\n",
+                       sh->slice_type);
+                return AVERROR_INVALIDDATA;
+            }
+            s->pict_type = sh->slice_type == B_SLICE ? AV_PICTURE_TYPE_B :
+                           sh->slice_type == P_SLICE ? AV_PICTURE_TYPE_P :
+                                                       AV_PICTURE_TYPE_I;
+
+            if (ps->pps->output_flag_present_flag)
+                sh->pic_output_flag = get_bits1(gb);
+
+            if (ps->sps->separate_colour_plane_flag)
+                sh->colour_plane_id = get_bits(gb, 2);
+
+            if (!IS_IDR(h)) {
+                sh->pic_order_cnt_lsb = get_bits(gb, ps->sps->log2_max_poc_lsb);
+                s->output_picture_number = h->poc = ff_hevc_compute_poc(h, sh->pic_order_cnt_lsb);
+            } else
+                s->output_picture_number = h->poc = 0;
+
+            if (h->temporal_id == 0 &&
+                h->nal_unit_type != NAL_TRAIL_N &&
+                h->nal_unit_type != NAL_TSA_N &&
+                h->nal_unit_type != NAL_STSA_N &&
+                h->nal_unit_type != NAL_RADL_N &&
+                h->nal_unit_type != NAL_RASL_N &&
+                h->nal_unit_type != NAL_RADL_R &&
+                h->nal_unit_type != NAL_RASL_R)
+                h->pocTid0 = h->poc;
+
+            return 0; /* no need to evaluate the rest */
+        }
+        buf += consumed;
+    }
+    /* didn't find a picture! */
+    if (!is_global) // a buffer without slices is only reported for non-extradata input
+        av_log(h->avctx, AV_LOG_ERROR, "missing picture in access unit\n");
+    return -1; // NOTE(review): plain -1 rather than an AVERROR code -- confirm callers only test for < 0
+}
+#endif
+
+static int hevc_parse(AVCodecParserContext *s,
+                      AVCodecContext *avctx,
                       const uint8_t **poutbuf, int *poutbuf_size,
                       const uint8_t *buf, int buf_size)
 {
     int next;
-
     HEVCParserContext *ctx = s->priv_data;
     ParseContext *pc = &ctx->pc;
 
@@ -197,20 +416,31 @@ static int hevc_parse(AVCodecParserContext *s, AVCodecContext *avctx,
 // Split after the parameter sets at the beginning of the stream if they exist.
 static int hevc_split(AVCodecContext *avctx, const uint8_t *buf, int buf_size)
 {
-    int i;
+    const uint8_t *ptr = buf, *end = buf + buf_size;
     uint32_t state = -1;
-    int has_ps = 0;
-
-    for (i = 0; i < buf_size; i++) {
-        state = (state << 8) | buf[i];
-        if (((state >> 8) & 0xFFFFFF) == START_CODE) {
-            int nut = (state >> 1) & 0x3F;
-            if (nut >= NAL_VPS && nut <= NAL_PPS)
-                has_ps = 1;
-            else if (has_ps)
-                return i - 3;
-            else // no parameter set at the beginning of the stream
-                return 0;
+    int has_vps = 0;
+    int has_sps = 0;
+    int has_pps = 0;
+    int nut;
+
+    while (ptr < end) {
+        ptr = avpriv_find_start_code(ptr, end, &state);
+        if ((state >> 8) != START_CODE)
+            break;
+        nut = (state >> 1) & 0x3F;
+        if (nut == NAL_VPS)
+            has_vps = 1;
+        else if (nut == NAL_SPS)
+            has_sps = 1;
+        else if (nut == NAL_PPS)
+            has_pps = 1;
+        else if ((nut != NAL_SEI_PREFIX || has_pps) &&
+                  nut != NAL_AUD) {
+            if (has_vps && has_sps) {
+                while (ptr - 4 > buf && ptr[-5] == 0)
+                    ptr--;
+                return ptr - 4 - buf;
+            }
         }
     }
     return 0;
@@ -221,6 +451,21 @@ static void hevc_parser_close(AVCodecParserContext *s)
     HEVCParserContext *ctx = s->priv_data;
     int i;
 
+#if ADVANCED_PARSER
+    HEVCContext  *h  = &ctx->h;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(h->ps.vps_list); i++)
+        av_buffer_unref(&h->ps.vps_list[i]);
+    for (i = 0; i < FF_ARRAY_ELEMS(h->ps.sps_list); i++)
+        av_buffer_unref(&h->ps.sps_list[i]);
+    for (i = 0; i < FF_ARRAY_ELEMS(h->ps.pps_list); i++)
+        av_buffer_unref(&h->ps.pps_list[i]);
+
+    h->ps.sps = NULL;
+
+    av_freep(&h->HEVClc);
+#endif
+
     for (i = 0; i < FF_ARRAY_ELEMS(ctx->ps.vps_list); i++)
         av_buffer_unref(&ctx->ps.vps_list[i]);
     for (i = 0; i < FF_ARRAY_ELEMS(ctx->ps.sps_list); i++)
@@ -228,6 +473,8 @@ static void hevc_parser_close(AVCodecParserContext *s)
     for (i = 0; i < FF_ARRAY_ELEMS(ctx->ps.pps_list); i++)
         av_buffer_unref(&ctx->ps.pps_list[i]);
 
+    ctx->ps.sps = NULL;
+
     ff_h2645_packet_uninit(&ctx->pkt);
 
     av_freep(&ctx->pc.buffer);
diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c
index 583fa06..83f2ec2 100644
--- a/libavcodec/hevc_ps.c
+++ b/libavcodec/hevc_ps.c
@@ -6,25 +6,24 @@
  * Copyright (C) 2012 - 2013 Gildas Cocherel
  * Copyright (C) 2013 Vittorio Giovara
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/imgutils.h"
-
 #include "golomb.h"
 #include "hevc.h"
 
@@ -88,6 +87,8 @@ static void remove_sps(HEVCParamSets *s, int id)
         for (i = 0; i < FF_ARRAY_ELEMS(s->pps_list); i++)
             if (s->pps_list[i] && ((HEVCPPS*)s->pps_list[i]->data)->sps_id == id)
                 remove_pps(s, i);
+
+        av_assert0(!(s->sps_list[id] && s->sps == (HEVCSPS*)s->sps_list[id]->data));
     }
     av_buffer_unref(&s->sps_list[id]);
 }
@@ -121,7 +122,8 @@ int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
 
     if (rps_predict) {
         const ShortTermRPS *rps_ridx;
-        int delta_rps, abs_delta_rps;
+        int delta_rps;
+        unsigned abs_delta_rps;
         uint8_t use_delta_flag = 0;
         uint8_t delta_rps_sign;
 
@@ -140,6 +142,12 @@ int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
 
         delta_rps_sign = get_bits1(gb);
         abs_delta_rps  = get_ue_golomb_long(gb) + 1;
+        if (abs_delta_rps < 1 || abs_delta_rps > 32768) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Invalid value of abs_delta_rps: %d\n",
+                   abs_delta_rps);
+            return AVERROR_INVALIDDATA;
+        }
         delta_rps      = (1 - (delta_rps_sign << 1)) * abs_delta_rps;
         for (i = 0; i <= rps_ridx->num_delta_pocs; i++) {
             int used = rps->used[k] = get_bits1(gb);
@@ -227,11 +235,14 @@ int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
 }
 
 
-static void decode_profile_tier_level(GetBitContext *gb, AVCodecContext *avctx,
+static int decode_profile_tier_level(GetBitContext *gb, AVCodecContext *avctx,
                                       PTLCommon *ptl)
 {
     int i;
 
+    if (get_bits_left(gb) < 2+1+5 + 32 + 4 + 16 + 16 + 12)
+        return -1;
+
     ptl->profile_space = get_bits(gb, 2);
     ptl->tier_flag     = get_bits1(gb);
     ptl->profile_idc   = get_bits(gb, 5);
@@ -241,11 +252,17 @@ static void decode_profile_tier_level(GetBitContext *gb, AVCodecContext *avctx,
         av_log(avctx, AV_LOG_DEBUG, "Main 10 profile bitstream\n");
     else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_STILL_PICTURE)
         av_log(avctx, AV_LOG_DEBUG, "Main Still Picture profile bitstream\n");
+    else if (ptl->profile_idc == FF_PROFILE_HEVC_REXT)
+        av_log(avctx, AV_LOG_DEBUG, "Range Extension profile bitstream\n");
     else
         av_log(avctx, AV_LOG_WARNING, "Unknown HEVC profile: %d\n", ptl->profile_idc);
 
-    for (i = 0; i < 32; i++)
+    for (i = 0; i < 32; i++) {
         ptl->profile_compatibility_flag[i] = get_bits1(gb);
+
+        if (ptl->profile_idc == 0 && i > 0 && ptl->profile_compatibility_flag[i])
+            ptl->profile_idc = i;
+    }
     ptl->progressive_source_flag    = get_bits1(gb);
     ptl->interlaced_source_flag     = get_bits1(gb);
     ptl->non_packed_constraint_flag = get_bits1(gb);
@@ -254,28 +271,48 @@ static void decode_profile_tier_level(GetBitContext *gb, AVCodecContext *avctx,
     skip_bits(gb, 16); // XXX_reserved_zero_44bits[0..15]
     skip_bits(gb, 16); // XXX_reserved_zero_44bits[16..31]
     skip_bits(gb, 12); // XXX_reserved_zero_44bits[32..43]
+
+    return 0;
 }
 
+/* Parse the general and sub-layer PTL fields; 0 on success, AVERROR_INVALIDDATA if gb is too short. */
-static void parse_ptl(GetBitContext *gb, AVCodecContext *avctx,
+static int parse_ptl(GetBitContext *gb, AVCodecContext *avctx,
                       PTL *ptl, int max_num_sub_layers)
 {
     int i;
-    decode_profile_tier_level(gb, avctx, &ptl->general_ptl);
+    /* level_idc needs 8 bits; with sub-layers, 8 x 2 flag/reserved bits follow */
+    if (decode_profile_tier_level(gb, avctx, &ptl->general_ptl) < 0 ||
+        get_bits_left(gb) < 8 + (8*2 * (max_num_sub_layers - 1 > 0))) {
+        av_log(avctx, AV_LOG_ERROR, "PTL information too short\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     ptl->general_ptl.level_idc = get_bits(gb, 8);
 
     for (i = 0; i < max_num_sub_layers - 1; i++) {
         ptl->sub_layer_profile_present_flag[i] = get_bits1(gb);
         ptl->sub_layer_level_present_flag[i]   = get_bits1(gb);
     }
-    if (max_num_sub_layers - 1 > 0)
+
+    if (max_num_sub_layers - 1 > 0)
         for (i = max_num_sub_layers - 1; i < 8; i++)
             skip_bits(gb, 2); // reserved_zero_2bits[i]
     for (i = 0; i < max_num_sub_layers - 1; i++) {
-        if (ptl->sub_layer_profile_present_flag[i])
-            decode_profile_tier_level(gb, avctx, &ptl->sub_layer_ptl[i]);
-        if (ptl->sub_layer_level_present_flag[i])
-            ptl->sub_layer_ptl[i].level_idc = get_bits(gb, 8);
+        if (ptl->sub_layer_profile_present_flag[i] &&
+            decode_profile_tier_level(gb, avctx, &ptl->sub_layer_ptl[i]) < 0) {
+            av_log(avctx, AV_LOG_ERROR, "PTL information for sublayer %i too short\n", i);
+            return AVERROR_INVALIDDATA;
+        }
+        if (ptl->sub_layer_level_present_flag[i]) {
+            if (get_bits_left(gb) < 8) {
+                av_log(avctx, AV_LOG_ERROR, "Not enough data for sublayer %i level_idc\n", i);
+                return AVERROR_INVALIDDATA;
+            } else
+                ptl->sub_layer_ptl[i].level_idc = get_bits(gb, 8);
+        }
     }
+
+    return 0;
 }
 
 static void decode_sublayer_hrd(GetBitContext *gb, unsigned int nb_cpb,
@@ -295,7 +332,7 @@ static void decode_sublayer_hrd(GetBitContext *gb, unsigned int nb_cpb,
     }
 }
 
-static void decode_hrd(GetBitContext *gb, int common_inf_present,
+static int decode_hrd(GetBitContext *gb, int common_inf_present,
                        int max_sublayers)
 {
     int nal_params_present = 0, vcl_params_present = 0;
@@ -341,14 +378,20 @@ static void decode_hrd(GetBitContext *gb, int common_inf_present,
         else
             low_delay = get_bits1(gb);
 
-        if (!low_delay)
+        if (!low_delay) {
             nb_cpb = get_ue_golomb_long(gb) + 1;
+            if (nb_cpb < 1 || nb_cpb > 32) {
+                av_log(NULL, AV_LOG_ERROR, "nb_cpb %d invalid\n", nb_cpb);
+                return AVERROR_INVALIDDATA;
+            }
+        }
 
         if (nal_params_present)
             decode_sublayer_hrd(gb, nb_cpb, subpic_params_present);
         if (vcl_params_present)
             decode_sublayer_hrd(gb, nb_cpb, subpic_params_present);
     }
+    return 0;
 }
 
 int ff_hevc_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
@@ -391,7 +434,8 @@ int ff_hevc_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
         goto err;
     }
 
-    parse_ptl(gb, avctx, &vps->ptl, vps->vps_max_sub_layers);
+    if (parse_ptl(gb, avctx, &vps->ptl, vps->vps_max_sub_layers) < 0)
+        goto err;
 
     vps->vps_sub_layer_ordering_info_present_flag = get_bits1(gb);
 
@@ -401,7 +445,7 @@ int ff_hevc_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
         vps->vps_num_reorder_pics[i]      = get_ue_golomb_long(gb);
         vps->vps_max_latency_increase[i]  = get_ue_golomb_long(gb) - 1;
 
-        if (vps->vps_max_dec_pic_buffering[i] > MAX_DPB_SIZE) {
+        if (vps->vps_max_dec_pic_buffering[i] > MAX_DPB_SIZE || !vps->vps_max_dec_pic_buffering[i]) {
             av_log(avctx, AV_LOG_ERROR, "vps_max_dec_pic_buffering_minus1 out of range: %d\n",
                    vps->vps_max_dec_pic_buffering[i] - 1);
             goto err;
@@ -416,6 +460,12 @@ int ff_hevc_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
 
     vps->vps_max_layer_id   = get_bits(gb, 6);
     vps->vps_num_layer_sets = get_ue_golomb_long(gb) + 1;
+    if (vps->vps_num_layer_sets < 1 || vps->vps_num_layer_sets > 1024 ||
+        (vps->vps_num_layer_sets - 1LL) * (vps->vps_max_layer_id + 1LL) > get_bits_left(gb)) {
+        av_log(avctx, AV_LOG_ERROR, "too many layer_id_included_flags\n");
+        goto err;
+    }
+
     for (i = 1; i < vps->vps_num_layer_sets; i++)
         for (j = 0; j <= vps->vps_max_layer_id; j++)
             skip_bits(gb, 1);  // layer_id_included_flag[i][j]
@@ -428,6 +478,11 @@ int ff_hevc_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
         if (vps->vps_poc_proportional_to_timing_flag)
             vps->vps_num_ticks_poc_diff_one = get_ue_golomb_long(gb) + 1;
         vps->vps_num_hrd_parameters = get_ue_golomb_long(gb);
+        if (vps->vps_num_hrd_parameters > (unsigned)vps->vps_num_layer_sets) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "vps_num_hrd_parameters %d is invalid\n", vps->vps_num_hrd_parameters);
+            goto err;
+        }
         for (i = 0; i < vps->vps_num_hrd_parameters; i++) {
             int common_inf_present = 1;
 
@@ -439,6 +494,13 @@ int ff_hevc_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
     }
     get_bits1(gb); /* vps_extension_flag */
 
+    if (get_bits_left(gb) < 0) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Overread VPS by %d bits\n", -get_bits_left(gb));
+        if (ps->vps_list[vps_id])
+            goto err;
+    }
+
     if (ps->vps_list[vps_id] &&
         !memcmp(ps->vps_list[vps_id]->data, vps_buf->data, vps_buf->size)) {
         av_buffer_unref(&vps_buf);
@@ -458,7 +520,8 @@ static void decode_vui(GetBitContext *gb, AVCodecContext *avctx,
                        int apply_defdispwin, HEVCSPS *sps)
 {
     VUI *vui          = &sps->vui;
-    int sar_present;
+    GetBitContext backup;
+    int sar_present, alt = 0;
 
     av_log(avctx, AV_LOG_DEBUG, "Decoding VUI\n");
 
@@ -498,6 +561,19 @@ static void decode_vui(GetBitContext *gb, AVCodecContext *avctx,
                 vui->transfer_characteristic = AVCOL_TRC_UNSPECIFIED;
             if (vui->matrix_coeffs >= AVCOL_SPC_NB)
                 vui->matrix_coeffs = AVCOL_SPC_UNSPECIFIED;
+            if (vui->matrix_coeffs == AVCOL_SPC_RGB) {
+                switch (sps->pix_fmt) {
+                case AV_PIX_FMT_YUV444P:
+                    sps->pix_fmt = AV_PIX_FMT_GBRP;
+                    break;
+                case AV_PIX_FMT_YUV444P10:
+                    sps->pix_fmt = AV_PIX_FMT_GBRP10;
+                    break;
+                case AV_PIX_FMT_YUV444P12:
+                    sps->pix_fmt = AV_PIX_FMT_GBRP12;
+                    break;
+                }
+            }
         }
     }
 
@@ -511,13 +587,21 @@ static void decode_vui(GetBitContext *gb, AVCodecContext *avctx,
     vui->field_seq_flag                = get_bits1(gb);
     vui->frame_field_info_present_flag = get_bits1(gb);
 
-    vui->default_display_window_flag = get_bits1(gb);
+    if (get_bits_left(gb) >= 68 && show_bits_long(gb, 21) == 0x100000) {
+        vui->default_display_window_flag = 0;
+        av_log(avctx, AV_LOG_WARNING, "Invalid default display window\n");
+    } else
+        vui->default_display_window_flag = get_bits1(gb);
+    // Backup context in case an alternate header is detected
+    memcpy(&backup, gb, sizeof(backup));
+
     if (vui->default_display_window_flag) {
-        //TODO: * 2 is only valid for 420
-        vui->def_disp_win.left_offset   = get_ue_golomb_long(gb) * 2;
-        vui->def_disp_win.right_offset  = get_ue_golomb_long(gb) * 2;
-        vui->def_disp_win.top_offset    = get_ue_golomb_long(gb) * 2;
-        vui->def_disp_win.bottom_offset = get_ue_golomb_long(gb) * 2;
+        int vert_mult  = 1 + (sps->chroma_format_idc < 2);
+        int horiz_mult = 1 + (sps->chroma_format_idc < 3);
+        vui->def_disp_win.left_offset   = get_ue_golomb_long(gb) * horiz_mult;
+        vui->def_disp_win.right_offset  = get_ue_golomb_long(gb) * horiz_mult;
+        vui->def_disp_win.top_offset    = get_ue_golomb_long(gb) *  vert_mult;
+        vui->def_disp_win.bottom_offset = get_ue_golomb_long(gb) *  vert_mult;
 
         if (apply_defdispwin &&
             avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) {
@@ -537,9 +621,24 @@ static void decode_vui(GetBitContext *gb, AVCodecContext *avctx,
     }
 
     vui->vui_timing_info_present_flag = get_bits1(gb);
+
     if (vui->vui_timing_info_present_flag) {
+        if( get_bits_left(gb) < 66) {
+            // The alternate syntax seem to have timing info located
+            // at where def_disp_win is normally located
+            av_log(avctx, AV_LOG_WARNING,
+                   "Strange VUI timing information, retrying...\n");
+            vui->default_display_window_flag = 0;
+            memset(&vui->def_disp_win, 0, sizeof(vui->def_disp_win));
+            memcpy(gb, &backup, sizeof(backup));
+            alt = 1;
+        }
         vui->vui_num_units_in_tick               = get_bits_long(gb, 32);
         vui->vui_time_scale                      = get_bits_long(gb, 32);
+        if (alt) {
+            av_log(avctx, AV_LOG_INFO, "Retry got %i/%i fps\n",
+                   vui->vui_time_scale, vui->vui_num_units_in_tick);
+        }
         vui->vui_poc_proportional_to_timing_flag = get_bits1(gb);
         if (vui->vui_poc_proportional_to_timing_flag)
             vui->vui_num_ticks_poc_diff_one_minus1 = get_ue_golomb_long(gb);
@@ -584,19 +683,24 @@ static void set_default_scaling_list_data(ScalingList *sl)
     memcpy(sl->sl[2][4], default_scaling_list_inter, 64);
     memcpy(sl->sl[2][5], default_scaling_list_inter, 64);
     memcpy(sl->sl[3][0], default_scaling_list_intra, 64);
-    memcpy(sl->sl[3][1], default_scaling_list_inter, 64);
+    memcpy(sl->sl[3][1], default_scaling_list_intra, 64);
+    memcpy(sl->sl[3][2], default_scaling_list_intra, 64);
+    memcpy(sl->sl[3][3], default_scaling_list_inter, 64);
+    memcpy(sl->sl[3][4], default_scaling_list_inter, 64);
+    memcpy(sl->sl[3][5], default_scaling_list_inter, 64);
 }
 
-static int scaling_list_data(GetBitContext *gb, AVCodecContext *avctx, ScalingList *sl)
+static int scaling_list_data(GetBitContext *gb, AVCodecContext *avctx, ScalingList *sl, HEVCSPS *sps)
 {
-    uint8_t scaling_list_pred_mode_flag[4][6];
+    uint8_t scaling_list_pred_mode_flag;
     int32_t scaling_list_dc_coef[2][6];
-    int size_id, matrix_id, i, pos;
+    int size_id, matrix_id, pos;
+    int i;
 
     for (size_id = 0; size_id < 4; size_id++)
-        for (matrix_id = 0; matrix_id < (size_id == 3 ? 2 : 6); matrix_id++) {
-            scaling_list_pred_mode_flag[size_id][matrix_id] = get_bits1(gb);
-            if (!scaling_list_pred_mode_flag[size_id][matrix_id]) {
+        for (matrix_id = 0; matrix_id < 6; matrix_id += ((size_id == 3) ? 3 : 1)) {
+            scaling_list_pred_mode_flag = get_bits1(gb);
+            if (!scaling_list_pred_mode_flag) {
                 unsigned int delta = get_ue_golomb_long(gb);
                 /* Only need to handle non-zero delta. Zero means default,
                  * which should already be in the arrays. */
@@ -640,26 +744,58 @@ static int scaling_list_data(GetBitContext *gb, AVCodecContext *avctx, ScalingLi
             }
         }
 
+    if (sps->chroma_format_idc == 3) {
+        for (i = 0; i < 64; i++) {
+            sl->sl[3][1][i] = sl->sl[2][1][i];
+            sl->sl[3][2][i] = sl->sl[2][2][i];
+            sl->sl[3][4][i] = sl->sl[2][4][i];
+            sl->sl[3][5][i] = sl->sl[2][5][i];
+        }
+        sl->sl_dc[1][1] = sl->sl_dc[0][1];
+        sl->sl_dc[1][2] = sl->sl_dc[0][2];
+        sl->sl_dc[1][4] = sl->sl_dc[0][4];
+        sl->sl_dc[1][5] = sl->sl_dc[0][5];
+    }
+
+
     return 0;
 }
 
 static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps)
 {
     const AVPixFmtDescriptor *desc;
-    if (sps->chroma_format_idc == 1) {
-        switch (sps->bit_depth) {
-        case 8:  sps->pix_fmt = AV_PIX_FMT_YUV420P;   break;
-        case 9:  sps->pix_fmt = AV_PIX_FMT_YUV420P9;  break;
-        case 10: sps->pix_fmt = AV_PIX_FMT_YUV420P10; break;
-        default:
-            av_log(avctx, AV_LOG_ERROR, "Unsupported bit depth: %d\n",
-                   sps->bit_depth);
-            return AVERROR_PATCHWELCOME;
-        }
-    } else {
+    switch (sps->bit_depth) {
+    case 8:
+        if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY8;
+        if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P;
+        if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P;
+        if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P;
+       break;
+    case 9:
+        if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY16;
+        if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P9;
+        if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P9;
+        if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P9;
+        break;
+    case 10:
+        if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY16;
+        if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P10;
+        if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P10;
+        if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P10;
+        break;
+    case 12:
+        if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY16;
+        if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P12;
+        if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P12;
+        if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P12;
+        break;
+    default:
         av_log(avctx, AV_LOG_ERROR,
-               "non-4:2:0 support is currently unspecified.\n");
-        return AVERROR_PATCHWELCOME;
+               "4:2:0, 4:2:2, 4:4:4 supports are currently specified for 8, 10 and 12 bits.\n");
+        av_log(avctx, AV_LOG_ERROR,
+               "chroma_format_idc is %d, depth is %d\n",
+               sps->chroma_format_idc, sps->bit_depth);
+        return AVERROR_INVALIDDATA;
     }
 
     desc = av_pix_fmt_desc_get(sps->pix_fmt);
@@ -688,59 +824,58 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
     sps->vps_id = get_bits(gb, 4);
     if (sps->vps_id >= MAX_VPS_COUNT) {
         av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", sps->vps_id);
-        ret = AVERROR_INVALIDDATA;
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
 
     if (vps_list && !vps_list[sps->vps_id]) {
         av_log(avctx, AV_LOG_ERROR, "VPS %d does not exist\n",
                sps->vps_id);
-        ret = AVERROR_INVALIDDATA;
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
 
     sps->max_sub_layers = get_bits(gb, 3) + 1;
     if (sps->max_sub_layers > MAX_SUB_LAYERS) {
         av_log(avctx, AV_LOG_ERROR, "sps_max_sub_layers out of range: %d\n",
                sps->max_sub_layers);
-        ret = AVERROR_INVALIDDATA;
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
 
     skip_bits1(gb); // temporal_id_nesting_flag
 
-    parse_ptl(gb, avctx, &sps->ptl, sps->max_sub_layers);
+    if ((ret = parse_ptl(gb, avctx, &sps->ptl, sps->max_sub_layers)) < 0)
+        return ret;
 
     *sps_id = get_ue_golomb_long(gb);
     if (*sps_id >= MAX_SPS_COUNT) {
         av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", *sps_id);
-        ret = AVERROR_INVALIDDATA;
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
 
     sps->chroma_format_idc = get_ue_golomb_long(gb);
-    if (sps->chroma_format_idc != 1) {
-        avpriv_report_missing_feature(avctx, "chroma_format_idc %d",
-                                      sps->chroma_format_idc);
-        ret = AVERROR_PATCHWELCOME;
-        goto err;
+    if (sps->chroma_format_idc > 3U) {
+        av_log(avctx, AV_LOG_ERROR, "chroma_format_idc %d is invalid\n", sps->chroma_format_idc);
+        return AVERROR_INVALIDDATA;
     }
 
     if (sps->chroma_format_idc == 3)
         sps->separate_colour_plane_flag = get_bits1(gb);
 
+    if (sps->separate_colour_plane_flag)
+        sps->chroma_format_idc = 0;
+
     sps->width  = get_ue_golomb_long(gb);
     sps->height = get_ue_golomb_long(gb);
     if ((ret = av_image_check_size(sps->width,
                                    sps->height, 0, avctx)) < 0)
-        goto err;
+        return ret;
 
     if (get_bits1(gb)) { // pic_conformance_flag
-        //TODO: * 2 is only valid for 420
-        sps->pic_conf_win.left_offset   = get_ue_golomb_long(gb) * 2;
-        sps->pic_conf_win.right_offset  = get_ue_golomb_long(gb) * 2;
-        sps->pic_conf_win.top_offset    = get_ue_golomb_long(gb) * 2;
-        sps->pic_conf_win.bottom_offset = get_ue_golomb_long(gb) * 2;
+        int vert_mult  = 1 + (sps->chroma_format_idc < 2);
+        int horiz_mult = 1 + (sps->chroma_format_idc < 3);
+        sps->pic_conf_win.left_offset   = get_ue_golomb_long(gb) * horiz_mult;
+        sps->pic_conf_win.right_offset  = get_ue_golomb_long(gb) * horiz_mult;
+        sps->pic_conf_win.top_offset    = get_ue_golomb_long(gb) *  vert_mult;
+        sps->pic_conf_win.bottom_offset = get_ue_golomb_long(gb) *  vert_mult;
 
         if (avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) {
             av_log(avctx, AV_LOG_DEBUG,
@@ -761,26 +896,23 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
 
     sps->bit_depth   = get_ue_golomb_long(gb) + 8;
     bit_depth_chroma = get_ue_golomb_long(gb) + 8;
-    if (bit_depth_chroma != sps->bit_depth) {
+    if (sps->chroma_format_idc && bit_depth_chroma != sps->bit_depth) {
         av_log(avctx, AV_LOG_ERROR,
                "Luma bit depth (%d) is different from chroma bit depth (%d), "
                "this is unsupported.\n",
                sps->bit_depth, bit_depth_chroma);
-        ret = AVERROR_INVALIDDATA;
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
 
-
     ret = map_pixel_format(avctx, sps);
     if (ret < 0)
-        goto err;
+        return ret;
 
     sps->log2_max_poc_lsb = get_ue_golomb_long(gb) + 4;
     if (sps->log2_max_poc_lsb > 16) {
         av_log(avctx, AV_LOG_ERROR, "log2_max_pic_order_cnt_lsb_minus4 out range: %d\n",
                sps->log2_max_poc_lsb - 4);
-        ret = AVERROR_INVALIDDATA;
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
 
     sublayer_ordering_info = get_bits1(gb);
@@ -792,16 +924,14 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
         if (sps->temporal_layer[i].max_dec_pic_buffering > MAX_DPB_SIZE) {
             av_log(avctx, AV_LOG_ERROR, "sps_max_dec_pic_buffering_minus1 out of range: %d\n",
                    sps->temporal_layer[i].max_dec_pic_buffering - 1);
-            ret = AVERROR_INVALIDDATA;
-            goto err;
+            return AVERROR_INVALIDDATA;
         }
         if (sps->temporal_layer[i].num_reorder_pics > sps->temporal_layer[i].max_dec_pic_buffering - 1) {
             av_log(avctx, AV_LOG_WARNING, "sps_max_num_reorder_pics out of range: %d\n",
                    sps->temporal_layer[i].num_reorder_pics);
             if (avctx->err_recognition & AV_EF_EXPLODE ||
                 sps->temporal_layer[i].num_reorder_pics > MAX_DPB_SIZE - 1) {
-                ret = AVERROR_INVALIDDATA;
-                goto err;
+                return AVERROR_INVALIDDATA;
             }
             sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[i].num_reorder_pics + 1;
         }
@@ -822,11 +952,26 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
     sps->log2_max_trafo_size                 = log2_diff_max_min_transform_block_size +
                                                sps->log2_min_tb_size;
 
-    if (sps->log2_min_tb_size >= sps->log2_min_cb_size) {
+    if (sps->log2_min_cb_size < 3 || sps->log2_min_cb_size > 30) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_min_cb_size\n", sps->log2_min_cb_size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (sps->log2_diff_max_min_coding_block_size > 30) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_coding_block_size\n", sps->log2_diff_max_min_coding_block_size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (sps->log2_min_tb_size >= sps->log2_min_cb_size || sps->log2_min_tb_size < 2) {
-        av_log(avctx, AV_LOG_ERROR, "Invalid value for log2_min_tb_size");
-        ret = AVERROR_INVALIDDATA;
-        goto err;
+        av_log(avctx, AV_LOG_ERROR, "Invalid value for log2_min_tb_size\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (log2_diff_max_min_transform_block_size < 0 || log2_diff_max_min_transform_block_size > 30) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_transform_block_size\n", log2_diff_max_min_transform_block_size);
+        return AVERROR_INVALIDDATA;
     }
+
     sps->max_transform_hierarchy_depth_inter = get_ue_golomb_long(gb);
     sps->max_transform_hierarchy_depth_intra = get_ue_golomb_long(gb);
 
@@ -835,9 +980,9 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
         set_default_scaling_list_data(&sps->scaling_list);
 
         if (get_bits1(gb)) {
-            ret = scaling_list_data(gb, avctx, &sps->scaling_list);
+            ret = scaling_list_data(gb, avctx, &sps->scaling_list, sps);
             if (ret < 0)
-                goto err;
+                return ret;
         }
     }
 
@@ -855,8 +1000,7 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
             av_log(avctx, AV_LOG_ERROR,
                    "PCM bit depth (%d) is greater than normal bit depth (%d)\n",
                    sps->pcm.bit_depth, sps->bit_depth);
-            ret = AVERROR_INVALIDDATA;
-            goto err;
+            return AVERROR_INVALIDDATA;
         }
 
         sps->pcm.loop_filter_disable_flag = get_bits1(gb);
@@ -866,18 +1010,22 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
     if (sps->nb_st_rps > MAX_SHORT_TERM_RPS_COUNT) {
         av_log(avctx, AV_LOG_ERROR, "Too many short term RPS: %d.\n",
                sps->nb_st_rps);
-        ret = AVERROR_INVALIDDATA;
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
     for (i = 0; i < sps->nb_st_rps; i++) {
         if ((ret = ff_hevc_decode_short_term_rps(gb, avctx, &sps->st_rps[i],
                                                  sps, 0)) < 0)
-            goto err;
+            return ret;
     }
 
     sps->long_term_ref_pics_present_flag = get_bits1(gb);
     if (sps->long_term_ref_pics_present_flag) {
         sps->num_long_term_ref_pics_sps = get_ue_golomb_long(gb);
+        if (sps->num_long_term_ref_pics_sps > 31U) {
+            av_log(avctx, AV_LOG_ERROR, "num_long_term_ref_pics_sps %d is out of range.\n",
+                   sps->num_long_term_ref_pics_sps);
+            return AVERROR_INVALIDDATA;
+        }
         for (i = 0; i < sps->num_long_term_ref_pics_sps; i++) {
             sps->lt_ref_pic_poc_lsb_sps[i]       = get_bits(gb, sps->log2_max_poc_lsb);
             sps->used_by_curr_pic_lt_sps_flag[i] = get_bits1(gb);
@@ -890,8 +1038,42 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
     vui_present = get_bits1(gb);
     if (vui_present)
         decode_vui(gb, avctx, apply_defdispwin, sps);
-    skip_bits1(gb); // sps_extension_flag
 
+    if (get_bits1(gb)) { // sps_extension_flag
+        int sps_extension_flag[1];
+        for (i = 0; i < 1; i++)
+            sps_extension_flag[i] = get_bits1(gb);
+        skip_bits(gb, 7); //sps_extension_7bits = get_bits(gb, 7);
+        if (sps_extension_flag[0]) {
+            int extended_precision_processing_flag;
+            int high_precision_offsets_enabled_flag;
+            int cabac_bypass_alignment_enabled_flag;
+
+            sps->transform_skip_rotation_enabled_flag = get_bits1(gb);
+            sps->transform_skip_context_enabled_flag  = get_bits1(gb);
+            sps->implicit_rdpcm_enabled_flag = get_bits1(gb);
+
+            sps->explicit_rdpcm_enabled_flag = get_bits1(gb);
+
+            extended_precision_processing_flag = get_bits1(gb);
+            if (extended_precision_processing_flag)
+                av_log(avctx, AV_LOG_WARNING,
+                   "extended_precision_processing_flag not yet implemented\n");
+
+            sps->intra_smoothing_disabled_flag       = get_bits1(gb);
+            high_precision_offsets_enabled_flag  = get_bits1(gb);
+            if (high_precision_offsets_enabled_flag)
+                av_log(avctx, AV_LOG_WARNING,
+                   "high_precision_offsets_enabled_flag not yet implemented\n");
+
+            sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb);
+
+            cabac_bypass_alignment_enabled_flag  = get_bits1(gb);
+            if (cabac_bypass_alignment_enabled_flag)
+                av_log(avctx, AV_LOG_WARNING,
+                   "cabac_bypass_alignment_enabled_flag not yet implemented\n");
+        }
+    }
     if (apply_defdispwin) {
         sps->output_window.left_offset   += sps->vui.def_disp_win.left_offset;
         sps->output_window.right_offset  += sps->vui.def_disp_win.right_offset;
@@ -909,19 +1091,17 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
                          (sps->output_window.left_offset + sps->output_window.right_offset);
     sps->output_height = sps->height -
                          (sps->output_window.top_offset + sps->output_window.bottom_offset);
-    if (sps->output_width <= 0 || sps->output_height <= 0) {
+    if (sps->width  <= sps->output_window.left_offset + (int64_t)sps->output_window.right_offset  ||
+        sps->height <= sps->output_window.top_offset  + (int64_t)sps->output_window.bottom_offset) {
         av_log(avctx, AV_LOG_WARNING, "Invalid visible frame dimensions: %dx%d.\n",
                sps->output_width, sps->output_height);
         if (avctx->err_recognition & AV_EF_EXPLODE) {
-            ret = AVERROR_INVALIDDATA;
-            goto err;
+            return AVERROR_INVALIDDATA;
         }
         av_log(avctx, AV_LOG_WARNING,
                "Displaying the whole video surface.\n");
-        sps->output_window.left_offset   =
-        sps->output_window.right_offset  =
-        sps->output_window.top_offset    =
-        sps->output_window.bottom_offset = 0;
+        memset(&sps->pic_conf_win, 0, sizeof(sps->pic_conf_win));
+        memset(&sps->output_window, 0, sizeof(sps->output_window));
         sps->output_width               = sps->width;
         sps->output_height              = sps->height;
     }
@@ -931,6 +1111,19 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
                          sps->log2_diff_max_min_coding_block_size;
     sps->log2_min_pu_size = sps->log2_min_cb_size - 1;
 
+    if (sps->log2_ctb_size > MAX_LOG2_CTB_SIZE) {
+        av_log(avctx, AV_LOG_ERROR, "CTB size out of range: 2^%d\n", sps->log2_ctb_size);
+        return AVERROR_INVALIDDATA;
+    }
+    if (sps->log2_ctb_size < 4) {
+        av_log(avctx,
+               AV_LOG_ERROR,
+               "log2_ctb_size %d differs from the bounds of any known profile\n",
+               sps->log2_ctb_size);
+        avpriv_request_sample(avctx, "log2_ctb_size %d", sps->log2_ctb_size);
+        return AVERROR_INVALIDDATA;
+    }
+
     sps->ctb_width  = (sps->width  + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size;
     sps->ctb_height = (sps->height + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size;
     sps->ctb_size   = sps->ctb_width * sps->ctb_height;
@@ -941,40 +1134,40 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
     sps->min_tb_height = sps->height >> sps->log2_min_tb_size;
     sps->min_pu_width  = sps->width  >> sps->log2_min_pu_size;
     sps->min_pu_height = sps->height >> sps->log2_min_pu_size;
+    sps->tb_mask       = (1 << (sps->log2_ctb_size - sps->log2_min_tb_size)) - 1;
 
     sps->qp_bd_offset = 6 * (sps->bit_depth - 8);
 
-    if (sps->width  & ((1 << sps->log2_min_cb_size) - 1) ||
-        sps->height & ((1 << sps->log2_min_cb_size) - 1)) {
+    if (av_mod_uintp2(sps->width, sps->log2_min_cb_size) ||
+        av_mod_uintp2(sps->height, sps->log2_min_cb_size)) {
         av_log(avctx, AV_LOG_ERROR, "Invalid coded frame dimensions.\n");
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
 
-    if (sps->log2_ctb_size > MAX_LOG2_CTB_SIZE) {
-        av_log(avctx, AV_LOG_ERROR, "CTB size out of range: 2^%d\n", sps->log2_ctb_size);
-        goto err;
-    }
     if (sps->max_transform_hierarchy_depth_inter > sps->log2_ctb_size - sps->log2_min_tb_size) {
         av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_inter out of range: %d\n",
                sps->max_transform_hierarchy_depth_inter);
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
     if (sps->max_transform_hierarchy_depth_intra > sps->log2_ctb_size - sps->log2_min_tb_size) {
         av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_intra out of range: %d\n",
                sps->max_transform_hierarchy_depth_intra);
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
     if (sps->log2_max_trafo_size > FFMIN(sps->log2_ctb_size, 5)) {
         av_log(avctx, AV_LOG_ERROR,
                "max transform block size out of range: %d\n",
                sps->log2_max_trafo_size);
-        goto err;
+        return AVERROR_INVALIDDATA;
     }
 
-    return 0;
+    if (get_bits_left(gb) < 0) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Overread SPS by %d bits\n", -get_bits_left(gb));
+        return AVERROR_INVALIDDATA;
+    }
 
-err:
-    return ret < 0 ? ret : AVERROR_INVALIDDATA;
+    return 0;
 }
 
 int ff_hevc_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx,
@@ -1035,16 +1228,52 @@ static void hevc_pps_free(void *opaque, uint8_t *data)
     av_freep(&pps->ctb_addr_ts_to_rs);
     av_freep(&pps->tile_pos_rs);
     av_freep(&pps->tile_id);
-    av_freep(&pps->min_tb_addr_zs);
+    av_freep(&pps->min_tb_addr_zs_tab);
 
     av_freep(&pps);
 }
 
+/* Parse the PPS range extension syntax elements from gb into pps.
+ * Returns 0 on success, AVERROR_INVALIDDATA if the chroma QP offset list is too long. */
+static int pps_range_extensions(GetBitContext *gb, AVCodecContext *avctx,
+                                HEVCPPS *pps, HEVCSPS *sps) {
+    int i;
+
+    if (pps->transform_skip_enabled_flag)
+        pps->log2_max_transform_skip_block_size = get_ue_golomb_long(gb) + 2;
+    pps->cross_component_prediction_enabled_flag = get_bits1(gb);
+    pps->chroma_qp_offset_list_enabled_flag = get_bits1(gb);
+    if (pps->chroma_qp_offset_list_enabled_flag) {
+        pps->diff_cu_chroma_qp_offset_depth = get_ue_golomb_long(gb);
+        pps->chroma_qp_offset_list_len_minus1 = get_ue_golomb_long(gb);
+        /* NOTE(review): 5 is rejected although the message allows it; confirm against the cb/cr_qp_offset_list array sizes */
+        if (pps->chroma_qp_offset_list_len_minus1 >= 5) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "chroma_qp_offset_list_len_minus1 shall be in the range [0, 5].\n");
+            return AVERROR_INVALIDDATA;
+        }
+        for (i = 0; i <= pps->chroma_qp_offset_list_len_minus1; i++) {
+            pps->cb_qp_offset_list[i] = get_se_golomb_long(gb);
+            if (pps->cb_qp_offset_list[i])
+                av_log(avctx, AV_LOG_WARNING,
+                       "cb_qp_offset_list not tested yet.\n");
+            pps->cr_qp_offset_list[i] = get_se_golomb_long(gb);
+            if (pps->cr_qp_offset_list[i])
+                av_log(avctx, AV_LOG_WARNING,
+                       "cr_qp_offset_list not tested yet.\n");
+        }
+    }
+    pps->log2_sao_offset_scale_luma = get_ue_golomb_long(gb);
+    pps->log2_sao_offset_scale_chroma = get_ue_golomb_long(gb);
+
+    return 0;
+}
+
 static inline int setup_pps(AVCodecContext *avctx, GetBitContext *gb,
                             HEVCPPS *pps, HEVCSPS *sps)
 {
     int log2_diff;
-    int pic_area_in_ctbs, pic_area_in_min_tbs;
+    int pic_area_in_ctbs;
     int i, j, x, y, ctb_addr_rs, tile_id;
 
     // Inferred parameters
@@ -1091,14 +1320,13 @@ static inline int setup_pps(AVCodecContext *avctx, GetBitContext *gb,
      * 6.5
      */
     pic_area_in_ctbs     = sps->ctb_width    * sps->ctb_height;
-    pic_area_in_min_tbs  = sps->min_tb_width * sps->min_tb_height;
 
     pps->ctb_addr_rs_to_ts = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->ctb_addr_rs_to_ts));
     pps->ctb_addr_ts_to_rs = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->ctb_addr_ts_to_rs));
     pps->tile_id           = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->tile_id));
-    pps->min_tb_addr_zs    = av_malloc_array(pic_area_in_min_tbs, sizeof(*pps->min_tb_addr_zs));
+    pps->min_tb_addr_zs_tab = av_malloc_array((sps->tb_mask+2) * (sps->tb_mask+2), sizeof(*pps->min_tb_addr_zs_tab));
     if (!pps->ctb_addr_rs_to_ts || !pps->ctb_addr_ts_to_rs ||
-        !pps->tile_id || !pps->min_tb_addr_zs) {
+        !pps->tile_id || !pps->min_tb_addr_zs_tab) {
         return AVERROR(ENOMEM);
     }
 
@@ -1151,8 +1379,13 @@ static inline int setup_pps(AVCodecContext *avctx, GetBitContext *gb,
                 pps->row_bd[j] * sps->ctb_width + pps->col_bd[i];
 
     log2_diff = sps->log2_ctb_size - sps->log2_min_tb_size;
-    for (y = 0; y < sps->min_tb_height; y++) {
-        for (x = 0; x < sps->min_tb_width; x++) {
+    pps->min_tb_addr_zs = &pps->min_tb_addr_zs_tab[1*(sps->tb_mask+2)+1];
+    for (y = 0; y < sps->tb_mask+2; y++) {
+        pps->min_tb_addr_zs_tab[y*(sps->tb_mask+2)] = -1;
+        pps->min_tb_addr_zs_tab[y]    = -1;
+    }
+    for (y = 0; y < sps->tb_mask+1; y++) {
+        for (x = 0; x < sps->tb_mask+1; x++) {
             int tb_x = x >> log2_diff;
             int tb_y = y >> log2_diff;
             int rs   = sps->ctb_width * tb_y + tb_x;
@@ -1161,7 +1394,7 @@ static inline int setup_pps(AVCodecContext *avctx, GetBitContext *gb,
                 int m = 1 << i;
                 val += (m & x ? m * m : 0) + (m & y ? 2 * m * m : 0);
             }
-            pps->min_tb_addr_zs[y * sps->min_tb_width + x] = val;
+            pps->min_tb_addr_zs[y * (sps->tb_mask+2) + x] = val;
         }
     }
 
@@ -1198,6 +1431,7 @@ int ff_hevc_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx,
     pps->disable_dbf                           = 0;
     pps->beta_offset                           = 0;
     pps->tc_offset                             = 0;
+    pps->log2_max_transform_skip_block_size    = 2;
 
     // Coded parameters
     pps_id = get_ue_golomb_long(gb);
@@ -1240,6 +1474,14 @@ int ff_hevc_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx,
     if (pps->cu_qp_delta_enabled_flag)
         pps->diff_cu_qp_delta_depth = get_ue_golomb_long(gb);
 
+    if (pps->diff_cu_qp_delta_depth < 0 ||
+        pps->diff_cu_qp_delta_depth > sps->log2_diff_max_min_coding_block_size) {
+        av_log(avctx, AV_LOG_ERROR, "diff_cu_qp_delta_depth %d is invalid\n",
+               pps->diff_cu_qp_delta_depth);
+        ret = AVERROR_INVALIDDATA;
+        goto err;
+    }
+
     pps->cb_qp_offset = get_se_golomb(gb);
     if (pps->cb_qp_offset < -12 || pps->cb_qp_offset > 12) {
         av_log(avctx, AV_LOG_ERROR, "pps_cb_qp_offset out of range: %d\n",
@@ -1266,14 +1508,14 @@ int ff_hevc_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx,
     if (pps->tiles_enabled_flag) {
         pps->num_tile_columns = get_ue_golomb_long(gb) + 1;
         pps->num_tile_rows    = get_ue_golomb_long(gb) + 1;
-        if (pps->num_tile_columns == 0 ||
+        if (pps->num_tile_columns <= 0 ||
             pps->num_tile_columns >= sps->width) {
             av_log(avctx, AV_LOG_ERROR, "num_tile_columns_minus1 out of range: %d\n",
                    pps->num_tile_columns - 1);
             ret = AVERROR_INVALIDDATA;
             goto err;
         }
-        if (pps->num_tile_rows == 0 ||
+        if (pps->num_tile_rows <= 0 ||
             pps->num_tile_rows >= sps->height) {
             av_log(avctx, AV_LOG_ERROR, "num_tile_rows_minus1 out of range: %d\n",
                    pps->num_tile_rows - 1);
@@ -1344,7 +1586,7 @@ int ff_hevc_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx,
     pps->scaling_list_data_present_flag = get_bits1(gb);
     if (pps->scaling_list_data_present_flag) {
         set_default_scaling_list_data(&pps->scaling_list);
-        ret = scaling_list_data(gb, avctx, &pps->scaling_list);
+        ret = scaling_list_data(gb, avctx, &pps->scaling_list, sps);
         if (ret < 0)
             goto err;
     }
@@ -1358,12 +1600,26 @@ int ff_hevc_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx,
     }
 
     pps->slice_header_extension_present_flag = get_bits1(gb);
-    skip_bits1(gb);     // pps_extension_flag
+
+    if (get_bits1(gb)) { // pps_extension_present_flag
+        int pps_range_extensions_flag = get_bits1(gb);
+        /* int pps_extension_7bits = */ get_bits(gb, 7);
+        if (sps->ptl.general_ptl.profile_idc == FF_PROFILE_HEVC_REXT && pps_range_extensions_flag) {
+            if ((ret = pps_range_extensions(gb, avctx, pps, sps)) < 0)
+                goto err;
+        }
+    }
 
     ret = setup_pps(avctx, gb, pps, sps);
     if (ret < 0)
         goto err;
 
+    if (get_bits_left(gb) < 0) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Overread PPS by %d bits\n", -get_bits_left(gb));
+        goto err;
+    }
+
     remove_pps(ps, pps_id);
     ps->pps_list[pps_id] = pps_buf;
 
diff --git a/libavcodec/hevc_ps_enc.c b/libavcodec/hevc_ps_enc.c
index 007a132..c05bf63 100644
--- a/libavcodec/hevc_ps_enc.c
+++ b/libavcodec/hevc_ps_enc.c
@@ -1,20 +1,20 @@
 /*
  * HEVC Parameter Set encoding
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hevc_refs.c b/libavcodec/hevc_refs.c
index b51f259..611ad45 100644
--- a/libavcodec/hevc_refs.c
+++ b/libavcodec/hevc_refs.c
@@ -4,20 +4,20 @@
  * Copyright (C) 2012 - 2013 Guillaume Martres
  * Copyright (C) 2012 - 2013 Gildas Cocherel
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -57,8 +57,7 @@ RefPicList *ff_hevc_get_ref_list(HEVCContext *s, HEVCFrame *ref, int x0, int y0)
 {
     int x_cb         = x0 >> s->ps.sps->log2_ctb_size;
     int y_cb         = y0 >> s->ps.sps->log2_ctb_size;
-    int pic_width_cb = (s->ps.sps->width + (1 << s->ps.sps->log2_ctb_size) - 1) >>
-        s->ps.sps->log2_ctb_size;
+    int pic_width_cb = s->ps.sps->ctb_width;
     int ctb_addr_ts  = s->ps.pps->ctb_addr_rs_to_ts[y_cb * pic_width_cb + x_cb];
     return (RefPicList *)ref->rpl_tab[ctb_addr_ts];
 }
@@ -109,6 +108,9 @@ static HEVCFrame *alloc_frame(HEVCContext *s)
         for (j = 0; j < frame->ctb_count; j++)
             frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data;
 
+        frame->frame->top_field_first  = s->picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD;
+        frame->frame->interlaced_frame = (s->picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD);
+
         if (s->avctx->hwaccel) {
             const AVHWAccel *hwaccel = s->avctx->hwaccel;
             av_assert0(!frame->hwaccel_picture_private);
@@ -121,7 +123,6 @@ static HEVCFrame *alloc_frame(HEVCContext *s)
         }
 
         return frame;
-
 fail:
         ff_hevc_unref_frame(s, frame, ~0);
         return NULL;
@@ -173,12 +174,22 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush)
         int min_poc   = INT_MAX;
         int i, min_idx, ret;
 
+        if (s->sh.no_output_of_prior_pics_flag == 1 && s->no_rasl_output_flag == 1) {
+            for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
+                HEVCFrame *frame = &s->DPB[i];
+                if (!(frame->flags & HEVC_FRAME_FLAG_BUMPING) && frame->poc != s->poc &&
+                        frame->sequence == s->seq_output) {
+                    ff_hevc_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
+                }
+            }
+        }
+
         for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
             HEVCFrame *frame = &s->DPB[i];
             if ((frame->flags & HEVC_FRAME_FLAG_OUTPUT) &&
                 frame->sequence == s->seq_output) {
                 nb_output++;
-                if (frame->poc < min_poc) {
+                if (frame->poc < min_poc || nb_output == 1) {
                     min_poc = frame->poc;
                     min_idx = i;
                 }
@@ -192,16 +203,16 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush)
 
         if (nb_output) {
             HEVCFrame *frame = &s->DPB[min_idx];
-            const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->frame->format);
-            int pixel_shift;
-
-            if (!desc)
-                return AVERROR_BUG;
-
-            pixel_shift = desc->comp[0].depth > 8;
-
-            ret = av_frame_ref(out, frame->frame);
-            ff_hevc_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
+            AVFrame *dst = out;
+            AVFrame *src = frame->frame;
+            const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(src->format);
+            int pixel_shift = !!(desc->comp[0].depth > 8);
+
+            ret = av_frame_ref(out, src);
+            if (frame->flags & HEVC_FRAME_FLAG_BUMPING)
+                ff_hevc_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_BUMPING);
+            else
+                ff_hevc_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
             if (ret < 0)
                 return ret;
 
@@ -209,8 +220,8 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush)
                 int hshift = (i > 0) ? desc->log2_chroma_w : 0;
                 int vshift = (i > 0) ? desc->log2_chroma_h : 0;
                 int off = ((frame->window.left_offset >> hshift) << pixel_shift) +
-                          (frame->window.top_offset   >> vshift) * out->linesize[i];
-                out->data[i] += off;
+                          (frame->window.top_offset   >> vshift) * dst->linesize[i];
+                dst->data[i] += off;
             }
             av_log(s->avctx, AV_LOG_DEBUG,
                    "Output frame with POC %d.\n", frame->poc);
@@ -226,6 +237,46 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush)
     return 0;
 }
 
+void ff_hevc_bump_frame(HEVCContext *s)
+{   /* Flags the lowest-POC output-pending frame(s) as BUMPING once enough frames are held in s->DPB. */
+    int dpb = 0;           /* number of DPB entries counted in pass 1 */
+    int min_poc = INT_MAX; /* smallest POC among frames whose flags equal HEVC_FRAME_FLAG_OUTPUT */
+    int i;
+    /* Pass 1: count entries with any flag set, in the current output sequence, whose POC differs from s->poc. */
+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
+        HEVCFrame *frame = &s->DPB[i];
+        if ((frame->flags) &&
+            frame->sequence == s->seq_output &&
+            frame->poc != s->poc) {
+            dpb++;
+        }
+    }
+    /* Only act when an SPS is set and the count reaches max_dec_pic_buffering of the last sub-layer (index max_sub_layers - 1). */
+    if (s->ps.sps && dpb >= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering) {
+        for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { /* Pass 2: find min_poc */
+            HEVCFrame *frame = &s->DPB[i];
+            if ((frame->flags) &&
+                frame->sequence == s->seq_output &&
+                frame->poc != s->poc) {
+                if (frame->flags == HEVC_FRAME_FLAG_OUTPUT && frame->poc < min_poc) { /* exact match: OUTPUT and no other flag */
+                    min_poc = frame->poc;
+                }
+            }
+        }
+        /* Pass 3: add BUMPING to every frame of this sequence that has OUTPUT set and POC <= min_poc. */
+        for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
+            HEVCFrame *frame = &s->DPB[i];
+            if (frame->flags & HEVC_FRAME_FLAG_OUTPUT &&
+                frame->sequence == s->seq_output &&
+                frame->poc <= min_poc) { /* NOTE(review): if pass 2 matched nothing, min_poc is still INT_MAX and every such frame is flagged */
+                frame->flags |= HEVC_FRAME_FLAG_BUMPING;
+            }
+        }
+        /* dpb is a local and is not read after this decrement. */
+        dpb--;
+    }
+}
+
 static int init_slice_rpl(HEVCContext *s)
 {
     HEVCFrame *frame = s->ref;
@@ -335,8 +386,9 @@ static HEVCFrame *find_ref_idx(HEVCContext *s, int poc)
         }
     }
 
-    av_log(s->avctx, AV_LOG_ERROR,
-           "Could not find ref with POC %d\n", poc);
+    if (s->nal_unit_type != NAL_CRA_NUT && !IS_BLA(s))
+        av_log(s->avctx, AV_LOG_ERROR,
+               "Could not find ref with POC %d\n", poc);
     return NULL;
 }
 
@@ -374,7 +426,8 @@ static HEVCFrame *generate_missing_ref(HEVCContext *s, int poc)
     frame->sequence = s->seq_decode;
     frame->flags    = 0;
 
-    ff_thread_report_progress(&frame->tf, INT_MAX, 0);
+    if (s->threads_type == FF_THREAD_FRAME)
+        ff_thread_report_progress(&frame->tf, INT_MAX, 0);
 
     return frame;
 }
diff --git a/libavcodec/hevc_sei.c b/libavcodec/hevc_sei.c
index 17cef67..148f246 100644
--- a/libavcodec/hevc_sei.c
+++ b/libavcodec/hevc_sei.c
@@ -5,20 +5,20 @@
  * Copyright (C) 2012 - 2013 Gildas Cocherel
  * Copyright (C) 2013 Vittorio Giovara
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -56,10 +56,13 @@ enum HEVC_SEI_TYPE {
 static int decode_nal_sei_decoded_picture_hash(HEVCContext *s)
 {
     int cIdx, i;
-    GetBitContext *gb = &s->HEVClc.gb;
-    uint8_t hash_type = get_bits(gb, 8);
+    uint8_t hash_type;
+    //uint16_t picture_crc;
+    //uint32_t picture_checksum;
+    GetBitContext *gb = &s->HEVClc->gb;
+    hash_type = get_bits(gb, 8);
 
-    for (cIdx = 0; cIdx < 3; cIdx++) {
+    for (cIdx = 0; cIdx < 3/*((s->sps->chroma_format_idc == 0) ? 1 : 3)*/; cIdx++) {
         if (hash_type == 0) {
             s->is_md5 = 1;
             for (i = 0; i < 16; i++)
@@ -68,18 +71,42 @@ static int decode_nal_sei_decoded_picture_hash(HEVCContext *s)
             // picture_crc = get_bits(gb, 16);
             skip_bits(gb, 16);
         } else if (hash_type == 2) {
-            // picture_checksum = get_bits(gb, 32);
+            // picture_checksum = get_bits_long(gb, 32);
             skip_bits(gb, 32);
         }
     }
     return 0;
 }
 
+static int decode_nal_sei_mastering_display_info(HEVCContext *s)
+{   // Copies the mastering display SEI fields from the bit reader into s; always returns 0.
+    GetBitContext *gb = &s->HEVClc->gb;
+    int i;
+    // Mastering primaries
+    for (i = 0; i < 3; i++) { // three primaries, two 16-bit values each
+        s->display_primaries[i][0] = get_bits(gb, 16);
+        s->display_primaries[i][1] = get_bits(gb, 16);
+    }
+    // White point (x, y)
+    s->white_point[0] = get_bits(gb, 16);
+    s->white_point[1] = get_bits(gb, 16);
+
+    // Max and min luminance of mastering display
+    s->max_mastering_luminance = get_bits_long(gb, 32); // both luminance values are read as 32-bit fields
+    s->min_mastering_luminance = get_bits_long(gb, 32);
+
+    // As this SEI message comes before the first frame that references it,
+    // initialize the flag to 2 and decrement on IRAP access unit so it
+    // persists for the coded video sequence (e.g., between two IRAPs)
+    s->sei_mastering_display_info_present = 2;
+    return 0;
+}
+
 static int decode_nal_sei_frame_packing_arrangement(HEVCContext *s)
 {
-    GetBitContext *gb = &s->HEVClc.gb;
+    GetBitContext *gb = &s->HEVClc->gb;
 
-    get_ue_golomb(gb);                  // frame_packing_arrangement_id
+    get_ue_golomb_long(gb);             // frame_packing_arrangement_id
     s->sei_frame_packing_present = !get_bits1(gb);
 
     if (s->sei_frame_packing_present) {
@@ -103,7 +130,7 @@ static int decode_nal_sei_frame_packing_arrangement(HEVCContext *s)
 
 static int decode_nal_sei_display_orientation(HEVCContext *s)
 {
-    GetBitContext *gb = &s->HEVClc.gb;
+    GetBitContext *gb = &s->HEVClc->gb;
 
     s->sei_display_orientation_present = !get_bits1(gb);
 
@@ -118,9 +145,148 @@ static int decode_nal_sei_display_orientation(HEVCContext *s)
     return 0;
 }
 
+static int decode_pic_timing(HEVCContext *s)
+{   // Reads the field indication of a picture timing SEI into s->picture_struct; returns 1, or a negative error.
+    GetBitContext *gb = &s->HEVClc->gb;
+    HEVCSPS *sps;
+    // The SPS stored under the active id decides whether the fields below are read at all.
+    if (!s->ps.sps_list[s->active_seq_parameter_set_id])
+        return(AVERROR(ENOMEM)); // no SPS is stored under that id
+    sps = (HEVCSPS*)s->ps.sps_list[s->active_seq_parameter_set_id]->data;
+    // Nothing is read unless the VUI flag is set; when it is, 4 + 2 + 1 = 7 bits are consumed.
+    if (sps->vui.frame_field_info_present_flag) {
+        int pic_struct = get_bits(gb, 4);
+        s->picture_struct = AV_PICTURE_STRUCTURE_UNKNOWN; // kept for any pic_struct other than 1 or 2
+        if (pic_struct == 2) {
+            av_log(s->avctx, AV_LOG_DEBUG, "BOTTOM Field\n");
+            s->picture_struct = AV_PICTURE_STRUCTURE_BOTTOM_FIELD;
+        } else if (pic_struct == 1) {
+            av_log(s->avctx, AV_LOG_DEBUG, "TOP Field\n");
+            s->picture_struct = AV_PICTURE_STRUCTURE_TOP_FIELD;
+        }
+        get_bits(gb, 2);                   // source_scan_type
+        get_bits(gb, 1);                   // duplicate_flag
+    }
+    return 1; // success is reported as 1 here, not 0
+}
+
+static int decode_registered_user_data_closed_caption(HEVCContext *s, int size)
+{   // Appends the cc_data triplets of a type-0x3 user data block to s->a53_caption; returns 0 or a negative error.
+    int flag;
+    int user_data_type_code;
+    int cc_count;
+
+    GetBitContext *gb = &s->HEVClc->gb;
+    // Three bytes (type code, flags/count, reserved) are read before any caption data.
+    if (size < 3)
+       return AVERROR(EINVAL);
+
+    user_data_type_code = get_bits(gb, 8);
+    if (user_data_type_code == 0x3) { // only type code 0x3 is parsed; any other payload is skipped in the else branch
+        skip_bits(gb, 1); // reserved
+
+        flag = get_bits(gb, 1); // process_cc_data_flag
+        if (flag) { // NOTE(review): when the flag is 0, nothing more is read or skipped in this function
+            skip_bits(gb, 1);
+            cc_count = get_bits(gb, 5);
+            skip_bits(gb, 8); // reserved
+            size -= 2; // NOTE(review): three bytes were consumed above but only two are subtracted; confirm the intended accounting
+
+            if (cc_count && size >= cc_count * 3) { // each cc entry is three bytes
+                const uint64_t new_size = (s->a53_caption_size + cc_count
+                                           * UINT64_C(3)); // computed in 64-bit arithmetic before the INT_MAX check
+                int i, ret;
+
+                if (new_size > INT_MAX)
+                    return AVERROR(EINVAL);
+
+                /* Allow merging of the cc data from two fields. */
+                ret = av_reallocp(&s->a53_caption, new_size);
+                if (ret < 0)
+                    return ret;
+                // Append three bytes per entry, advancing a53_caption_size as each byte is stored.
+                for (i = 0; i < cc_count; i++) {
+                    s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
+                    s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
+                    s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
+                }
+                skip_bits(gb, 8); // marker_bits
+            }
+        }
+    } else {
+        int i;
+        for (i = 0; i < size - 1; i++) // skip the bytes that follow the type code already read
+            skip_bits(gb, 8);
+    }
+
+    return 0;
+}
+
+static int decode_nal_sei_user_data_registered_itu_t_t35(HEVCContext *s, int size)
+{   // Reads the header of a registered user data SEI and hands 'GA94' payloads to the closed caption parser.
+    uint32_t country_code;
+    uint32_t user_identifier;
+
+    GetBitContext *gb = &s->HEVClc->gb;
+    // The fixed header read below is 1 + 2 + 4 = 7 bytes; size then counts what follows it.
+    if (size < 7)
+        return AVERROR(EINVAL);
+    size -= 7;
+
+    country_code = get_bits(gb, 8);
+    if (country_code == 0xFF) { // one extra byte follows a country code of 0xFF
+        skip_bits(gb, 8);
+        size--; // NOTE(review): size becomes -1 here when it was exactly 7 on entry
+    }
+    // Two bytes are skipped unread; presumably the provider code (confirm against the spec).
+    skip_bits(gb, 8);
+    skip_bits(gb, 8);
+
+    user_identifier = get_bits_long(gb, 32);
+
+    switch (user_identifier) {
+        case MKBETAG('G', 'A', '9', '4'):
+            return decode_registered_user_data_closed_caption(s, size);
+        default:
+            skip_bits_long(gb, size * 8); // unknown identifier: skip the rest of the payload
+            break;
+    }
+    return 0;
+}
+
+static int active_parameter_sets(HEVCContext *s)
+{   // Parses an active parameter sets SEI and stores the first SPS id in s; returns 0 or AVERROR_INVALIDDATA.
+    GetBitContext *gb = &s->HEVClc->gb;
+    int num_sps_ids_minus1;
+    int i;
+    unsigned active_seq_parameter_set_id;
+
+    get_bits(gb, 4); // active_video_parameter_set_id
+    get_bits(gb, 1); // self_contained_cvs_flag
+    get_bits(gb, 1); // no_parameter_set_update_flag
+    num_sps_ids_minus1 = get_ue_golomb_long(gb); // num_sps_ids_minus1
+
+    if (num_sps_ids_minus1 < 0 || num_sps_ids_minus1 > 15) {
+        av_log(s->avctx, AV_LOG_ERROR, "num_sps_ids_minus1 %d invalid\n", num_sps_ids_minus1);
+        return AVERROR_INVALIDDATA;
+    }
+
+    active_seq_parameter_set_id = get_ue_golomb_long(gb); // first id in the list; the only one that is kept
+    if (active_seq_parameter_set_id >= MAX_SPS_COUNT) {
+        av_log(s->avctx, AV_LOG_ERROR, "active_parameter_set_id %d invalid\n", active_seq_parameter_set_id);
+        return AVERROR_INVALIDDATA;
+    }
+    s->active_seq_parameter_set_id = active_seq_parameter_set_id;
+
+    for (i = 1; i <= num_sps_ids_minus1; i++) // remaining ids are read and discarded
+        get_ue_golomb_long(gb); // active_seq_parameter_set_id[i]
+
+    return 0;
+}
+
 static int decode_nal_sei_prefix(HEVCContext *s, int type, int size)
 {
-    GetBitContext *gb = &s->HEVClc.gb;
+    GetBitContext *gb = &s->HEVClc->gb;
 
     switch (type) {
     case 256:  // Mismatched value from HM 8.1
@@ -129,6 +295,21 @@ static int decode_nal_sei_prefix(HEVCContext *s, int type, int size)
         return decode_nal_sei_frame_packing_arrangement(s);
     case SEI_TYPE_DISPLAY_ORIENTATION:
         return decode_nal_sei_display_orientation(s);
+    case SEI_TYPE_PICTURE_TIMING:
+        {
+            int ret = decode_pic_timing(s);
+            av_log(s->avctx, AV_LOG_DEBUG, "Skipped PREFIX SEI %d\n", type);
+            skip_bits(gb, 8 * size);
+            return ret;
+        }
+    case SEI_TYPE_MASTERING_DISPLAY_INFO:
+        return decode_nal_sei_mastering_display_info(s);
+    case SEI_TYPE_ACTIVE_PARAMETER_SETS:
+        active_parameter_sets(s);
+        av_log(s->avctx, AV_LOG_DEBUG, "Skipped PREFIX SEI %d\n", type);
+        return 0;
+    case SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35:
+        return decode_nal_sei_user_data_registered_itu_t_t35(s, size);
     default:
         av_log(s->avctx, AV_LOG_DEBUG, "Skipped PREFIX SEI %d\n", type);
         skip_bits_long(gb, 8 * size);
@@ -138,7 +319,7 @@ static int decode_nal_sei_prefix(HEVCContext *s, int type, int size)
 
 static int decode_nal_sei_suffix(HEVCContext *s, int type, int size)
 {
-    GetBitContext *gb = &s->HEVClc.gb;
+    GetBitContext *gb = &s->HEVClc->gb;
 
     switch (type) {
     case SEI_TYPE_DECODED_PICTURE_HASH:
@@ -152,7 +333,7 @@ static int decode_nal_sei_suffix(HEVCContext *s, int type, int size)
 
 static int decode_nal_sei_message(HEVCContext *s)
 {
-    GetBitContext *gb = &s->HEVClc.gb;
+    GetBitContext *gb = &s->HEVClc->gb;
 
     int payload_type = 0;
     int payload_size = 0;
@@ -173,7 +354,7 @@ static int decode_nal_sei_message(HEVCContext *s)
     } else { /* nal_unit_type == NAL_SEI_SUFFIX */
         return decode_nal_sei_suffix(s, payload_type, payload_size);
     }
-    return 0;
+    return 1;
 }
 
 static int more_rbsp_data(GetBitContext *gb)
@@ -183,8 +364,18 @@ static int more_rbsp_data(GetBitContext *gb)
 
 int ff_hevc_decode_nal_sei(HEVCContext *s)
 {
+    int ret; /* status of the most recent SEI message */
+    /* Decode SEI messages until the RBSP has no more data; returns 1, or the first negative error code. */
     do {
-        decode_nal_sei_message(s);
-    } while (more_rbsp_data(&s->HEVClc.gb));
-    return 0;
+        ret = decode_nal_sei_message(s);
+        if (ret < 0)
+            return ret; /* propagate the real error instead of reporting ENOMEM for every failure */
+    } while (more_rbsp_data(&s->HEVClc->gb));
+    return 1;
+}
+
+void ff_hevc_reset_sei(HEVCContext *s)
+{   /* Discards any closed caption bytes buffered in s. */
+    s->a53_caption_size = 0;
+    av_freep(&s->a53_caption); /* releases the caption buffer */
 }
diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
index 15a712d..9d773d9 100644
--- a/libavcodec/hevcdsp.c
+++ b/libavcodec/hevcdsp.c
@@ -2,21 +2,23 @@
  * HEVC video decoder
  *
  * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
  *
- * This file is part of Libav.
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -89,36 +91,20 @@ static const int8_t transform[32][32] = {
       90, -90,  88, -85,  82, -78,  73, -67,  61, -54,  46, -38,  31, -22,  13,  -4 },
 };
 
-DECLARE_ALIGNED(16, const int16_t, ff_hevc_epel_coeffs[7][16]) = {
-    { -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2 },
-    { -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2 },
-    { -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4 },
-    { -4, 36, 36, -4, -4, 36, 36, -4, -4, 36, 36, -4, -4, 36, 36, -4 },
-    { -4, 28, 46, -6, -4, 28, 46, -6, -4, 28, 46, -6, -4, 28, 46, -6 },
-    { -2, 16, 54, -4, -2, 16, 54, -4, -2, 16, 54, -4, -2, 16, 54, -4 },
-    { -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2 },
-};
-
-DECLARE_ALIGNED(16, const int8_t, ff_hevc_epel_coeffs8[7][16]) = {
-    { -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2 },
-    { -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2 },
-    { -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4 },
-    { -4, 36, 36, -4, -4, 36, 36, -4, -4, 36, 36, -4, -4, 36, 36, -4 },
-    { -4, 28, 46, -6, -4, 28, 46, -6, -4, 28, 46, -6, -4, 28, 46, -6 },
-    { -2, 16, 54, -4, -2, 16, 54, -4, -2, 16, 54, -4, -2, 16, 54, -4 },
-    { -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2 },
-};
-
-DECLARE_ALIGNED(16, const int16_t, ff_hevc_qpel_coeffs[3][8]) = {
-    { -1, 4, -10, 58, 17, -5,  1,  0 },
-    { -1, 4, -11, 40, 40, -11, 4, -1 },
-    {  0, 1,  -5, 17, 58, -10, 4, -1 },
+DECLARE_ALIGNED(16, const int8_t, ff_hevc_epel_filters[7][4]) = { /* 7 four-tap filters; every row sums to 64 */
+    { -2, 58, 10, -2},
+    { -4, 54, 16, -2},
+    { -6, 46, 28, -4},
+    { -4, 36, 36, -4},
+    { -4, 28, 46, -6},
+    { -2, 16, 54, -4},
+    { -2, 10, 58, -2},
 };
 
-DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_coeffs8[3][16]) = {
-    { -1, 4, -10, 58, 17, -5,  1,  0, -1, 4, -10, 58, 17, -5,  1,  0 },
-    { -1, 4, -11, 40, 40, -11, 4, -1, -1, 4, -11, 40, 40, -11, 4, -1 },
-    {  0, 1,  -5, 17, 58, -10, 4, -1,  0, 1,  -5, 17, 58, -10, 4, -1 },
+DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = { /* 3 eight-tap filters (each sums to 64), stored twice per row */
+    { -1,  4,-10, 58, 17, -5,  1,  0, -1,  4,-10, 58, 17, -5,  1,  0},
+    { -1,  4,-11, 40, 40,-11,  4, -1, -1,  4,-11, 40, 40,-11,  4, -1},
+    {  0,  1, -5, 17, 58,-10,  4, -1,  0,  1, -5, 17, 58,-10,  4, -1}
 };
 
 #define BIT_DEPTH 8
@@ -133,93 +119,119 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_coeffs8[3][16]) = {
 #include "hevcdsp_template.c"
 #undef BIT_DEPTH
 
+#define BIT_DEPTH 12
+#include "hevcdsp_template.c"
+#undef BIT_DEPTH
+
 void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
 {
 #undef FUNC
 #define FUNC(a, depth) a ## _ ## depth
 
-#define QPEL_FUNC(i, width, depth)                                                  \
-    hevcdsp->put_hevc_qpel[0][0][i] = FUNC(put_hevc_qpel_pixels_ ## width, depth);  \
-    hevcdsp->put_hevc_qpel[0][1][i] = FUNC(put_hevc_qpel_h_      ## width, depth);  \
-    hevcdsp->put_hevc_qpel[1][0][i] = FUNC(put_hevc_qpel_v_      ## width, depth);  \
-    hevcdsp->put_hevc_qpel[1][1][i] = FUNC(put_hevc_qpel_hv_     ## width, depth);  \
-
-#define EPEL_FUNC(i, width, depth)                                                  \
-    hevcdsp->put_hevc_epel[0][0][i] = FUNC(put_hevc_epel_pixels_ ## width, depth);  \
-    hevcdsp->put_hevc_epel[0][1][i] = FUNC(put_hevc_epel_h_      ## width, depth);  \
-    hevcdsp->put_hevc_epel[1][0][i] = FUNC(put_hevc_epel_v_      ## width, depth);  \
-    hevcdsp->put_hevc_epel[1][1][i] = FUNC(put_hevc_epel_hv_     ## width, depth);  \
-
-#define PRED_FUNC(i, width, depth)                                                        \
-    hevcdsp->put_unweighted_pred[i]     = FUNC(put_unweighted_pred_ ## width, depth);     \
-    hevcdsp->put_unweighted_pred_avg[i] = FUNC(put_unweighted_pred_avg_ ## width, depth); \
-    hevcdsp->weighted_pred[i]           = FUNC(put_weighted_pred_ ## width, depth);       \
-    hevcdsp->weighted_pred_avg[i]       = FUNC(put_weighted_pred_avg_ ## width, depth);   \
-
-#define PRED_FUNC_CHROMA(i, width, depth)                                                        \
-    hevcdsp->put_unweighted_pred_chroma[i]     = FUNC(put_unweighted_pred_ ## width, depth);     \
-    hevcdsp->put_unweighted_pred_avg_chroma[i] = FUNC(put_unweighted_pred_avg_ ## width, depth); \
-    hevcdsp->weighted_pred_chroma[i]           = FUNC(put_weighted_pred_ ## width, depth);       \
-    hevcdsp->weighted_pred_avg_chroma[i]       = FUNC(put_weighted_pred_avg_ ## width, depth);   \
+#undef PEL_FUNC /* PEL_FUNC(table, idx1, idx2, fn, depth): store fn_<depth> in hevcdsp->table[i][idx1][idx2] for every first-index slot i */
+#define PEL_FUNC(dst1, idx1, idx2, a, depth) /* needs hevcdsp and an int i in the expanding scope */ \
+    do { /* one statement, so a trailing semicolon is safe even under if/else */ \
+        for (i = 0; i < (int)(sizeof(hevcdsp->dst1) / sizeof(hevcdsp->dst1[0])); i++) /* bound taken from the table itself */ \
+            hevcdsp->dst1[i][idx1][idx2] = a ## _ ## depth;                    \
+    } while (0)
+
+#undef EPEL_FUNCS /* EPEL_FUNCS(depth): fill put_hevc_epel[*][a][b] -- [0][0] put_hevc_pel_pixels, [0][1] _h, [1][0] _v, [1][1] _hv */
+#define EPEL_FUNCS(depth)                                                     \
+    PEL_FUNC(put_hevc_epel, 0, 0, put_hevc_pel_pixels, depth);                \
+    PEL_FUNC(put_hevc_epel, 0, 1, put_hevc_epel_h, depth);                    \
+    PEL_FUNC(put_hevc_epel, 1, 0, put_hevc_epel_v, depth);                    \
+    PEL_FUNC(put_hevc_epel, 1, 1, put_hevc_epel_hv, depth)
+
+#undef EPEL_UNI_FUNCS /* EPEL_UNI_FUNCS(depth): fill put_hevc_epel_uni and put_hevc_epel_uni_w; [a][b] layout as above: [0][0] pixels, [0][1] _h, [1][0] _v, [1][1] _hv */
+#define EPEL_UNI_FUNCS(depth)                                                 \
+    PEL_FUNC(put_hevc_epel_uni, 0, 0, put_hevc_pel_uni_pixels, depth);        \
+    PEL_FUNC(put_hevc_epel_uni, 0, 1, put_hevc_epel_uni_h, depth);            \
+    PEL_FUNC(put_hevc_epel_uni, 1, 0, put_hevc_epel_uni_v, depth);            \
+    PEL_FUNC(put_hevc_epel_uni, 1, 1, put_hevc_epel_uni_hv, depth);           \
+    PEL_FUNC(put_hevc_epel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth);    \
+    PEL_FUNC(put_hevc_epel_uni_w, 0, 1, put_hevc_epel_uni_w_h, depth);        \
+    PEL_FUNC(put_hevc_epel_uni_w, 1, 0, put_hevc_epel_uni_w_v, depth);        \
+    PEL_FUNC(put_hevc_epel_uni_w, 1, 1, put_hevc_epel_uni_w_hv, depth)
+
+#undef EPEL_BI_FUNCS /* EPEL_BI_FUNCS(depth): fill put_hevc_epel_bi and put_hevc_epel_bi_w; the [0][0] entries are the pel_bi(_w)_pixels routines also used by the qpel tables */
+#define EPEL_BI_FUNCS(depth)                                                \
+    PEL_FUNC(put_hevc_epel_bi, 0, 0, put_hevc_pel_bi_pixels, depth);        \
+    PEL_FUNC(put_hevc_epel_bi, 0, 1, put_hevc_epel_bi_h, depth);            \
+    PEL_FUNC(put_hevc_epel_bi, 1, 0, put_hevc_epel_bi_v, depth);            \
+    PEL_FUNC(put_hevc_epel_bi, 1, 1, put_hevc_epel_bi_hv, depth);           \
+    PEL_FUNC(put_hevc_epel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth);    \
+    PEL_FUNC(put_hevc_epel_bi_w, 0, 1, put_hevc_epel_bi_w_h, depth);        \
+    PEL_FUNC(put_hevc_epel_bi_w, 1, 0, put_hevc_epel_bi_w_v, depth);        \
+    PEL_FUNC(put_hevc_epel_bi_w, 1, 1, put_hevc_epel_bi_w_hv, depth)
+
+#undef QPEL_FUNCS /* QPEL_FUNCS(depth): fill put_hevc_qpel[*][a][b] -- [0][0] put_hevc_pel_pixels (same routine as the epel table), [0][1] _h, [1][0] _v, [1][1] _hv */
+#define QPEL_FUNCS(depth)                                                     \
+    PEL_FUNC(put_hevc_qpel, 0, 0, put_hevc_pel_pixels, depth);                \
+    PEL_FUNC(put_hevc_qpel, 0, 1, put_hevc_qpel_h, depth);                    \
+    PEL_FUNC(put_hevc_qpel, 1, 0, put_hevc_qpel_v, depth);                    \
+    PEL_FUNC(put_hevc_qpel, 1, 1, put_hevc_qpel_hv, depth)
+
+#undef QPEL_UNI_FUNCS /* QPEL_UNI_FUNCS(depth): fill put_hevc_qpel_uni and put_hevc_qpel_uni_w; [0][0] pixels, [0][1] _h, [1][0] _v, [1][1] _hv */
+#define QPEL_UNI_FUNCS(depth)                                                 \
+    PEL_FUNC(put_hevc_qpel_uni, 0, 0, put_hevc_pel_uni_pixels, depth);        \
+    PEL_FUNC(put_hevc_qpel_uni, 0, 1, put_hevc_qpel_uni_h, depth);            \
+    PEL_FUNC(put_hevc_qpel_uni, 1, 0, put_hevc_qpel_uni_v, depth);            \
+    PEL_FUNC(put_hevc_qpel_uni, 1, 1, put_hevc_qpel_uni_hv, depth);           \
+    PEL_FUNC(put_hevc_qpel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth);    \
+    PEL_FUNC(put_hevc_qpel_uni_w, 0, 1, put_hevc_qpel_uni_w_h, depth);        \
+    PEL_FUNC(put_hevc_qpel_uni_w, 1, 0, put_hevc_qpel_uni_w_v, depth);        \
+    PEL_FUNC(put_hevc_qpel_uni_w, 1, 1, put_hevc_qpel_uni_w_hv, depth)
+
+#undef QPEL_BI_FUNCS /* QPEL_BI_FUNCS(depth): fill put_hevc_qpel_bi and put_hevc_qpel_bi_w; the [0][0] entries are the pel_bi(_w)_pixels routines also used by the epel tables */
+#define QPEL_BI_FUNCS(depth)                                                  \
+    PEL_FUNC(put_hevc_qpel_bi, 0, 0, put_hevc_pel_bi_pixels, depth);          \
+    PEL_FUNC(put_hevc_qpel_bi, 0, 1, put_hevc_qpel_bi_h, depth);              \
+    PEL_FUNC(put_hevc_qpel_bi, 1, 0, put_hevc_qpel_bi_v, depth);              \
+    PEL_FUNC(put_hevc_qpel_bi, 1, 1, put_hevc_qpel_bi_hv, depth);             \
+    PEL_FUNC(put_hevc_qpel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth);      \
+    PEL_FUNC(put_hevc_qpel_bi_w, 0, 1, put_hevc_qpel_bi_w_h, depth);          \
+    PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth);          \
+    PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth)
 
 #define HEVC_DSP(depth)                                                     \
     hevcdsp->put_pcm                = FUNC(put_pcm, depth);                 \
-    hevcdsp->transquant_bypass[0]   = FUNC(transquant_bypass4x4, depth);    \
-    hevcdsp->transquant_bypass[1]   = FUNC(transquant_bypass8x8, depth);    \
-    hevcdsp->transquant_bypass[2]   = FUNC(transquant_bypass16x16, depth);  \
-    hevcdsp->transquant_bypass[3]   = FUNC(transquant_bypass32x32, depth);  \
+    hevcdsp->transform_add[0]       = FUNC(transform_add4x4, depth);        \
+    hevcdsp->transform_add[1]       = FUNC(transform_add8x8, depth);        \
+    hevcdsp->transform_add[2]       = FUNC(transform_add16x16, depth);      \
+    hevcdsp->transform_add[3]       = FUNC(transform_add32x32, depth);      \
     hevcdsp->transform_skip         = FUNC(transform_skip, depth);          \
-    hevcdsp->transform_4x4_luma_add = FUNC(transform_4x4_luma_add, depth);  \
-    hevcdsp->transform_add[0]       = FUNC(transform_4x4_add, depth);       \
-    hevcdsp->transform_add[1]       = FUNC(transform_8x8_add, depth);       \
-    hevcdsp->transform_add[2]       = FUNC(transform_16x16_add, depth);     \
-    hevcdsp->transform_add[3]       = FUNC(transform_32x32_add, depth);     \
+    hevcdsp->transform_rdpcm        = FUNC(transform_rdpcm, depth);         \
+    hevcdsp->idct_4x4_luma          = FUNC(transform_4x4_luma, depth);      \
+    hevcdsp->idct[0]                = FUNC(idct_4x4, depth);                \
+    hevcdsp->idct[1]                = FUNC(idct_8x8, depth);                \
+    hevcdsp->idct[2]                = FUNC(idct_16x16, depth);              \
+    hevcdsp->idct[3]                = FUNC(idct_32x32, depth);              \
                                                                             \
-    hevcdsp->sao_band_filter[0] = FUNC(sao_band_filter_0, depth);           \
-    hevcdsp->sao_band_filter[1] = FUNC(sao_band_filter_1, depth);           \
-    hevcdsp->sao_band_filter[2] = FUNC(sao_band_filter_2, depth);           \
-    hevcdsp->sao_band_filter[3] = FUNC(sao_band_filter_3, depth);           \
-                                                                            \
-    hevcdsp->sao_edge_filter[0] = FUNC(sao_edge_filter_0, depth);           \
-    hevcdsp->sao_edge_filter[1] = FUNC(sao_edge_filter_1, depth);           \
-    hevcdsp->sao_edge_filter[2] = FUNC(sao_edge_filter_2, depth);           \
-    hevcdsp->sao_edge_filter[3] = FUNC(sao_edge_filter_3, depth);           \
-                                                                            \
-    QPEL_FUNC(0, 4,  depth);                                                \
-    QPEL_FUNC(1, 8,  depth);                                                \
-    QPEL_FUNC(2, 12, depth);                                                \
-    QPEL_FUNC(3, 16, depth);                                                \
-    QPEL_FUNC(4, 24, depth);                                                \
-    QPEL_FUNC(5, 32, depth);                                                \
-    QPEL_FUNC(6, 48, depth);                                                \
-    QPEL_FUNC(7, 64, depth);                                                \
-                                                                            \
-    EPEL_FUNC(0, 2,  depth);                                                \
-    EPEL_FUNC(1, 4,  depth);                                                \
-    EPEL_FUNC(2, 6, depth);                                                 \
-    EPEL_FUNC(3, 8, depth);                                                 \
-    EPEL_FUNC(4, 12, depth);                                                \
-    EPEL_FUNC(5, 16, depth);                                                \
-    EPEL_FUNC(6, 24, depth);                                                \
-    EPEL_FUNC(7, 32, depth);                                                \
-                                                                            \
-    PRED_FUNC(0, 4,  depth);                                                \
-    PRED_FUNC(1, 8,  depth);                                                \
-    PRED_FUNC(2, 12, depth);                                                \
-    PRED_FUNC(3, 16, depth);                                                \
-    PRED_FUNC(4, 24, depth);                                                \
-    PRED_FUNC(5, 32, depth);                                                \
-    PRED_FUNC(6, 48, depth);                                                \
-    PRED_FUNC(7, 64, depth);                                                \
-    PRED_FUNC_CHROMA(0, 2,  depth);                                         \
-    PRED_FUNC_CHROMA(1, 4,  depth);                                         \
-    PRED_FUNC_CHROMA(2, 6, depth);                                          \
-    PRED_FUNC_CHROMA(3, 8, depth);                                          \
-    PRED_FUNC_CHROMA(4, 12, depth);                                         \
-    PRED_FUNC_CHROMA(5, 16, depth);                                         \
-    PRED_FUNC_CHROMA(6, 24, depth);                                         \
-    PRED_FUNC_CHROMA(7, 32, depth);                                         \
+    hevcdsp->idct_dc[0]             = FUNC(idct_4x4_dc, depth);             \
+    hevcdsp->idct_dc[1]             = FUNC(idct_8x8_dc, depth);             \
+    hevcdsp->idct_dc[2]             = FUNC(idct_16x16_dc, depth);           \
+    hevcdsp->idct_dc[3]             = FUNC(idct_32x32_dc, depth);           \
                                                                             \
+    hevcdsp->sao_band_filter[0] =                                              \
+    hevcdsp->sao_band_filter[1] =                                              \
+    hevcdsp->sao_band_filter[2] =                                              \
+    hevcdsp->sao_band_filter[3] =                                              \
+    hevcdsp->sao_band_filter[4] = FUNC(sao_band_filter, depth);                \
+    hevcdsp->sao_edge_filter[0] =                                              \
+    hevcdsp->sao_edge_filter[1] =                                              \
+    hevcdsp->sao_edge_filter[2] =                                              \
+    hevcdsp->sao_edge_filter[3] =                                              \
+    hevcdsp->sao_edge_filter[4] = FUNC(sao_edge_filter, depth);                \
+    hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth);            \
+    hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth);            \
+                                                                               \
+    QPEL_FUNCS(depth);                                                         \
+    QPEL_UNI_FUNCS(depth);                                                     \
+    QPEL_BI_FUNCS(depth);                                                      \
+    EPEL_FUNCS(depth);                                                         \
+    EPEL_UNI_FUNCS(depth);                                                     \
+    EPEL_BI_FUNCS(depth);                                                      \
+                                                                               \
     hevcdsp->hevc_h_loop_filter_luma     = FUNC(hevc_h_loop_filter_luma, depth);   \
     hevcdsp->hevc_v_loop_filter_luma     = FUNC(hevc_v_loop_filter_luma, depth);   \
     hevcdsp->hevc_h_loop_filter_chroma   = FUNC(hevc_h_loop_filter_chroma, depth); \
@@ -227,7 +239,8 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
     hevcdsp->hevc_h_loop_filter_luma_c   = FUNC(hevc_h_loop_filter_luma, depth);   \
     hevcdsp->hevc_v_loop_filter_luma_c   = FUNC(hevc_v_loop_filter_luma, depth);   \
     hevcdsp->hevc_h_loop_filter_chroma_c = FUNC(hevc_h_loop_filter_chroma, depth); \
-    hevcdsp->hevc_v_loop_filter_chroma_c = FUNC(hevc_v_loop_filter_chroma, depth);
+    hevcdsp->hevc_v_loop_filter_chroma_c = FUNC(hevc_v_loop_filter_chroma, depth)
+int i = 0;
 
     switch (bit_depth) {
     case 9:
@@ -236,6 +249,9 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
     case 10:
         HEVC_DSP(10);
         break;
+    case 12:
+        HEVC_DSP(12);
+        break;
     default:
         HEVC_DSP(8);
         break;
@@ -243,4 +259,8 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
 
     if (ARCH_X86)
         ff_hevc_dsp_init_x86(hevcdsp, bit_depth);
+    if (ARCH_ARM)
+        ff_hevcdsp_init_arm(hevcdsp, bit_depth);
+    if (ARCH_MIPS)
+        ff_hevc_dsp_init_mips(hevcdsp, bit_depth);
 }
diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
index 4097233..9f1f6dd 100644
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -2,21 +2,23 @@
  * HEVC video decoder
  *
  * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
  *
- * This file is part of Libav.
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,102 +27,107 @@
 
 #include "get_bits.h"
 
+#define MAX_PB_SIZE 64
+
 typedef struct SAOParams {
     int offset_abs[3][4];   ///< sao_offset_abs
     int offset_sign[3][4];  ///< sao_offset_sign
 
-    int band_position[3];   ///< sao_band_position
+    uint8_t band_position[3];   ///< sao_band_position
 
     int eo_class[3];        ///< sao_eo_class
 
-    int offset_val[3][5];   ///<SaoOffsetVal
+    int16_t offset_val[3][5];   ///< SaoOffsetVal; int16_t, the element type of the int16_t *sao_offset_val filter arguments
 
     uint8_t type_idx[3];    ///< sao_type_idx
 } SAOParams;
 
 typedef struct HEVCDSPContext {
-    void (*put_pcm)(uint8_t *dst, ptrdiff_t stride, int size,
-                    GetBitContext *gb, int pcm_bit_depth);
-
-    void (*transquant_bypass[4])(uint8_t *dst, int16_t *coeffs,
-                                 ptrdiff_t stride);
-
-    void (*transform_skip)(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-    void (*transform_4x4_luma_add)(uint8_t *dst, int16_t *coeffs,
-                                   ptrdiff_t stride);
-    void (*transform_add[4])(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-
-    void (*sao_band_filter[4])(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
-                               struct SAOParams *sao, int *borders,
-                               int width, int height, int c_idx);
-    void (*sao_edge_filter[4])(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
-                               struct SAOParams *sao, int *borders, int width,
-                               int height, int c_idx, uint8_t vert_edge,
-                               uint8_t horiz_edge, uint8_t diag_edge);
-
-    void (*put_hevc_qpel[2][2][8])(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
-                                   ptrdiff_t srcstride, int height,
-                                   int mx, int my, int16_t *mcbuffer);
-    void (*put_hevc_epel[2][2][8])(int16_t *dst, ptrdiff_t dststride, uint8_t *src,
-                                   ptrdiff_t srcstride, int height,
-                                   int mx, int my, int16_t *mcbuffer);
-
-    void (*put_unweighted_pred[8])(uint8_t *dst, ptrdiff_t dststride, int16_t *src,
-                                   ptrdiff_t srcstride, int height);
-    void (*put_unweighted_pred_chroma[8])(uint8_t *dst, ptrdiff_t dststride, int16_t *src,
-                                          ptrdiff_t srcstride, int height);
-    void (*put_unweighted_pred_avg[8])(uint8_t *dst, ptrdiff_t dststride,
-                                       int16_t *src1, int16_t *src2,
-                                       ptrdiff_t srcstride, int height);
-    void (*put_unweighted_pred_avg_chroma[8])(uint8_t *dst, ptrdiff_t dststride,
-                                              int16_t *src1, int16_t *src2,
-                                              ptrdiff_t srcstride, int height);
-    void (*weighted_pred[8])(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
-                             uint8_t *dst, ptrdiff_t dststride, int16_t *src,
-                             ptrdiff_t srcstride, int height);
-    void (*weighted_pred_chroma[8])(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
-                                    uint8_t *dst, ptrdiff_t dststride, int16_t *src,
-                                    ptrdiff_t srcstride, int height);
-    void (*weighted_pred_avg[8])(uint8_t denom, int16_t wl0Flag, int16_t wl1Flag,
-                                 int16_t ol0Flag, int16_t ol1Flag, uint8_t *dst,
-                                 ptrdiff_t dststride, int16_t *src1, int16_t *src2,
-                                 ptrdiff_t srcstride, int height);
-    void (*weighted_pred_avg_chroma[8])(uint8_t denom, int16_t wl0Flag, int16_t wl1Flag,
-                                        int16_t ol0Flag, int16_t ol1Flag, uint8_t *dst,
-                                        ptrdiff_t dststride, int16_t *src1, int16_t *src2,
-                                        ptrdiff_t srcstride, int height);
+    void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
+                    struct GetBitContext *gb, int pcm_bit_depth); /* read width x height samples of pcm_bit_depth bits each from gb into _dst */
+    /* Slots [0]..[3] hold the 4x4, 8x8, 16x16 and 32x32 routines; idct[] and idct_dc[] use the same layout. */
+    void (*transform_add[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride);
+    /* Rescales a (1 << log2_size)-square coefficient block in place. */
+    void (*transform_skip)(int16_t *coeffs, int16_t log2_size);
+    /* In-place running sum over the block: down the columns when mode != 0, along the rows when mode == 0. */
+    void (*transform_rdpcm)(int16_t *coeffs, int16_t log2_size, int mode);
+    /* Separate 4x4 luma transform, in place on coeffs (C version: transform_4x4_luma). */
+    void (*idct_4x4_luma)(int16_t *coeffs);
+    /* NOTE(review): col_limit presumably bounds the columns that carry nonzero input -- confirm in the template. */
+    void (*idct[4])(int16_t *coeffs, int col_limit);
+    /* The idct_NxN_dc routines; presumably shortcuts for DC-only blocks -- confirm at the caller. */
+    void (*idct_dc[4])(int16_t *coeffs);
+    /* The C init stores one routine in all five slots; the index presumably selects a width class for SIMD versions -- confirm. */
+    void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+                               int16_t *sao_offset_val, int sao_left_class, int width, int height);
+    /* Five slots, all set to the same C routine by the C init, as for sao_band_filter[]. */
+    /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */
+    void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
+                               int16_t *sao_offset_val, int sao_eo_class, int width, int height);
+    /* [0] and [1] are the sao_edge_restore_0 and sao_edge_restore_1 variants. */
+    void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+                                struct SAOParams *sao, int *borders, int _width, int _height, int c_idx,
+                                uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
+    /* qpel/epel tables [i][a][b]: the C init fills all 10 slots i alike; [0][0] pel_pixels, [0][1] _h, [1][0] _v, [1][1] _hv. */
+    void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+                                    int height, intptr_t mx, intptr_t my, int width);
+    void (*put_hevc_qpel_uni[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                        int height, intptr_t mx, intptr_t my, int width);
+    void (*put_hevc_qpel_uni_w[10][2][2])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                          int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width);
+    /* The _bi tables take a second int16_t input (src2); the _w tables add denom plus weight (wx*) and offset (ox*) arguments. */
+    void (*put_hevc_qpel_bi[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                       int16_t *src2,
+                                       int height, intptr_t mx, intptr_t my, int width);
+    void (*put_hevc_qpel_bi_w[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                         int16_t *src2,
+                                         int height, int denom, int wx0, int wx1,
+                                         int ox0, int ox1, intptr_t mx, intptr_t my, int width);
+    void (*put_hevc_epel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+                                    int height, intptr_t mx, intptr_t my, int width);
+
+    void (*put_hevc_epel_uni[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                        int height, intptr_t mx, intptr_t my, int width);
+    void (*put_hevc_epel_uni_w[10][2][2])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                          int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width);
+    void (*put_hevc_epel_bi[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                       int16_t *src2,
+                                       int height, intptr_t mx, intptr_t my, int width);
+    void (*put_hevc_epel_bi_w[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                         int16_t *src2,
+                                         int height, int denom, int wx0, int wx1,
+                                         int ox0, int ox1, intptr_t mx, intptr_t my, int width); /* named in the same order as put_hevc_qpel_bi_w: both tables receive put_hevc_pel_bi_w_pixels */
 
     void (*hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
-                                    int beta, int *tc,
+                                    int beta, int32_t *tc,
                                     uint8_t *no_p, uint8_t *no_q);
     void (*hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
-                                    int beta, int *tc,
+                                    int beta, int32_t *tc,
                                     uint8_t *no_p, uint8_t *no_q);
     void (*hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
-                                      int *tc, uint8_t *no_p, uint8_t *no_q);
+                                      int32_t *tc, uint8_t *no_p, uint8_t *no_q);
     void (*hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
-                                      int *tc, uint8_t *no_p, uint8_t *no_q);
+                                      int32_t *tc, uint8_t *no_p, uint8_t *no_q);
     void (*hevc_h_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
-                                      int beta, int *tc,
+                                      int beta, int32_t *tc,
                                       uint8_t *no_p, uint8_t *no_q);
     void (*hevc_v_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
-                                      int beta, int *tc,
+                                      int beta, int32_t *tc,
                                       uint8_t *no_p, uint8_t *no_q);
     void (*hevc_h_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
-                                        int *tc, uint8_t *no_p,
+                                        int32_t *tc, uint8_t *no_p,
                                         uint8_t *no_q);
     void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
-                                        int *tc, uint8_t *no_p,
+                                        int32_t *tc, uint8_t *no_p,
                                         uint8_t *no_q);
 } HEVCDSPContext;
 
 void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
 
-void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth);
-
-extern const int16_t ff_hevc_epel_coeffs[7][16];
-extern const int8_t ff_hevc_epel_coeffs8[7][16];
-extern const int16_t ff_hevc_qpel_coeffs[3][8];
-extern const int8_t ff_hevc_qpel_coeffs8[3][16];
+extern const int8_t ff_hevc_epel_filters[7][4];
+extern const int8_t ff_hevc_qpel_filters[3][16];
 
+void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth);
+void ff_hevcdsp_init_arm(HEVCDSPContext *c, const int bit_depth);
+void ff_hevc_dsp_init_mips(HEVCDSPContext *c, const int bit_depth);
 #endif /* AVCODEC_HEVCDSP_H */
diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c
index 31a2e7a..b840d17 100644
--- a/libavcodec/hevcdsp_template.c
+++ b/libavcodec/hevcdsp_template.c
@@ -3,20 +3,20 @@
  *
  * Copyright (C) 2012 - 2013 Guillaume Martres
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,8 +24,10 @@
 #include "hevc.h"
 
 #include "bit_depth_template.c"
+#include "hevcdsp.h"
 
-static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int size,
+
+static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
                           GetBitContext *gb, int pcm_bit_depth)
 {
     int x, y;
@@ -33,8 +35,8 @@ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int size,
 
     stride /= sizeof(pixel);
 
-    for (y = 0; y < size; y++) {
-        for (x = 0; x < size; x++)
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
             dst[x] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
         dst += stride;
     }
@@ -57,48 +59,76 @@ static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coe
     }
 }
 
-static void FUNC(transquant_bypass4x4)(uint8_t *_dst, int16_t *coeffs,
+static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs, /* forwards to transquant_bypass with size 4 */
                                        ptrdiff_t stride)
 {
     FUNC(transquant_bypass)(_dst, coeffs, stride, 4);
 }
 
-static void FUNC(transquant_bypass8x8)(uint8_t *_dst, int16_t *coeffs,
+static void FUNC(transform_add8x8)(uint8_t *_dst, int16_t *coeffs, /* forwards to transquant_bypass with size 8 */
                                        ptrdiff_t stride)
 {
     FUNC(transquant_bypass)(_dst, coeffs, stride, 8);
 }
 
-static void FUNC(transquant_bypass16x16)(uint8_t *_dst, int16_t *coeffs,
+static void FUNC(transform_add16x16)(uint8_t *_dst, int16_t *coeffs, /* forwards to transquant_bypass with size 16 */
                                          ptrdiff_t stride)
 {
     FUNC(transquant_bypass)(_dst, coeffs, stride, 16);
 }
 
-static void FUNC(transquant_bypass32x32)(uint8_t *_dst, int16_t *coeffs,
+static void FUNC(transform_add32x32)(uint8_t *_dst, int16_t *coeffs, /* forwards to transquant_bypass with size 32 */
                                          ptrdiff_t stride)
 {
     FUNC(transquant_bypass)(_dst, coeffs, stride, 32);
 }
 
-static void FUNC(transform_skip)(uint8_t *_dst, int16_t *coeffs,
-                                 ptrdiff_t stride)
+/* In-place running sum over a square block of (1 << log2_size) x (1 << log2_size) coefficients; mode picks the direction. */
+static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
 {
-    pixel *dst = (pixel *)_dst;
-    int shift  = 13 - BIT_DEPTH;
-#if BIT_DEPTH <= 13
-    int offset = 1 << (shift - 1);
-#else
-    int offset = 0;
-#endif
+    int16_t *coeffs = (int16_t *) _coeffs;
+    int x, y;
+    int size = 1 << log2_size; /* side length, also used as the row pitch of the block */
+
+    if (mode) { /* nonzero mode: accumulate down the columns */
+        coeffs += size; /* the first row has nothing above it, so start at row 1 */
+        for (y = 0; y < size - 1; y++) {
+            for (x = 0; x < size; x++)
+                coeffs[x] += coeffs[x - size]; /* add the already-summed row above */
+            coeffs += size;
+        }
+    } else { /* mode == 0: accumulate along each row */
+        for (y = 0; y < size; y++) {
+            for (x = 1; x < size; x++)
+                coeffs[x] += coeffs[x - 1]; /* add the already-summed left neighbour */
+            coeffs += size;
+        }
+    }
+}
+/* Rescale a (1 << log2_size)-square coefficient block in place: rounded right shift if shift > 0, else multiply by 2^-shift. */
+static void FUNC(transform_skip)(int16_t *_coeffs, int16_t log2_size)
+{
+    int shift  = 15 - BIT_DEPTH - log2_size; /* can be <= 0 for large BIT_DEPTH / log2_size combinations */
     int x, y;
+    int size = 1 << log2_size;
+    int16_t *coeffs = _coeffs;
 
-    stride /= sizeof(pixel);
 
-    for (y = 0; y < 4 * 4; y += 4) {
-        for (x = 0; x < 4; x++)
-            dst[x] = av_clip_pixel(dst[x] + ((coeffs[y + x] + offset) >> shift));
-        dst += stride;
+    if (shift > 0) {
+        int offset = 1 << (shift - 1); /* rounding term: half of the divisor 2^shift */
+        for (y = 0; y < size; y++) {
+            for (x = 0; x < size; x++) {
+                *coeffs = (*coeffs + offset) >> shift;
+                coeffs++;
+            }
+        }
+    } else {
+        for (y = 0; y < size; y++) {
+            for (x = 0; x < size; x++) {
+                *coeffs = *coeffs * (1 << -shift); /* multiply instead of <<: left-shifting a negative coefficient is undefined */
+                coeffs++;
+            }
+        }
     }
 }
 
@@ -122,17 +152,13 @@ static void FUNC(transform_skip)(uint8_t *_dst, int16_t *coeffs,
         assign(dst[3 * step], 55 * c0 + 29 * c2 - c3);                  \
     } while (0)
 
-static void FUNC(transform_4x4_luma_add)(uint8_t *_dst, int16_t *coeffs,
-                                         ptrdiff_t stride)
+static void FUNC(transform_4x4_luma)(int16_t *coeffs)
 {
     int i;
-    pixel *dst   = (pixel *)_dst;
     int shift    = 7;
     int add      = 1 << (shift - 1);
     int16_t *src = coeffs;
 
-    stride /= sizeof(pixel);
-
     for (i = 0; i < 4; i++) {
         TR_4x4_LUMA(src, src, 4, SCALE);
         src++;
@@ -141,323 +167,226 @@ static void FUNC(transform_4x4_luma_add)(uint8_t *_dst, int16_t *coeffs,
     shift = 20 - BIT_DEPTH;
     add   = 1 << (shift - 1);
     for (i = 0; i < 4; i++) {
-        TR_4x4_LUMA(dst, coeffs, 1, ADD_AND_SCALE);
+        TR_4x4_LUMA(coeffs, coeffs, 1, SCALE);
         coeffs += 4;
-        dst    += stride;
     }
 }
 
 #undef TR_4x4_LUMA
 
-#define TR_4(dst, src, dstep, sstep, assign)                            \
-    do {                                                                \
-        const int e0 = transform[8 * 0][0] * src[0 * sstep] +           \
-                       transform[8 * 2][0] * src[2 * sstep];            \
-        const int e1 = transform[8 * 0][1] * src[0 * sstep] +           \
-                       transform[8 * 2][1] * src[2 * sstep];            \
-        const int o0 = transform[8 * 1][0] * src[1 * sstep] +           \
-                       transform[8 * 3][0] * src[3 * sstep];            \
-        const int o1 = transform[8 * 1][1] * src[1 * sstep] +           \
-                       transform[8 * 3][1] * src[3 * sstep];            \
-                                                                        \
-        assign(dst[0 * dstep], e0 + o0);                                \
-        assign(dst[1 * dstep], e1 + o1);                                \
-        assign(dst[2 * dstep], e1 - o1);                                \
-        assign(dst[3 * dstep], e0 - o0);                                \
+#define TR_4(dst, src, dstep, sstep, assign, end)                              \
+    do {                                                                       \
+        const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep];              \
+        const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep];              \
+        const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep];              \
+        const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep];              \
+                                                                               \
+        assign(dst[0 * dstep], e0 + o0);                                       \
+        assign(dst[1 * dstep], e1 + o1);                                       \
+        assign(dst[2 * dstep], e1 - o1);                                       \
+        assign(dst[3 * dstep], e0 - o0);                                       \
     } while (0)
 
-static void FUNC(transform_4x4_add)(uint8_t *_dst, int16_t *coeffs,
-                                    ptrdiff_t stride)
-{
-    int i;
-    pixel *dst   = (pixel *)_dst;
-    int shift    = 7;
-    int add      = 1 << (shift - 1);
-    int16_t *src = coeffs;
-
-    stride /= sizeof(pixel);
-
-    for (i = 0; i < 4; i++) {
-        TR_4(src, src, 4, 4, SCALE);
-        src++;
-    }
-
-    shift = 20 - BIT_DEPTH;
-    add   = 1 << (shift - 1);
-    for (i = 0; i < 4; i++) {
-        TR_4(dst, coeffs, 1, 1, ADD_AND_SCALE);
-        coeffs += 4;
-        dst    += stride;
-    }
-}
-
-#define TR_8(dst, src, dstep, sstep, assign)                      \
-    do {                                                          \
-        int i, j;                                                 \
-        int e_8[4];                                               \
-        int o_8[4] = { 0 };                                       \
-        for (i = 0; i < 4; i++)                                   \
-            for (j = 1; j < 8; j += 2)                            \
-                o_8[i] += transform[4 * j][i] * src[j * sstep];   \
-        TR_4(e_8, src, 1, 2 * sstep, SET);                        \
-                                                                  \
-        for (i = 0; i < 4; i++) {                                 \
-            assign(dst[i * dstep], e_8[i] + o_8[i]);              \
-            assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]);        \
-        }                                                         \
+#define TR_8(dst, src, dstep, sstep, assign, end)                              \
+    do {                                                                       \
+        int i, j;                                                              \
+        int e_8[4];                                                            \
+        int o_8[4] = { 0 };                                                    \
+        for (i = 0; i < 4; i++)                                                \
+            for (j = 1; j < end; j += 2)                                       \
+                o_8[i] += transform[4 * j][i] * src[j * sstep];                \
+        TR_4(e_8, src, 1, 2 * sstep, SET, 4);                                  \
+                                                                               \
+        for (i = 0; i < 4; i++) {                                              \
+            assign(dst[i * dstep], e_8[i] + o_8[i]);                           \
+            assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]);                     \
+        }                                                                      \
     } while (0)
 
-#define TR_16(dst, src, dstep, sstep, assign)                     \
-    do {                                                          \
-        int i, j;                                                 \
-        int e_16[8];                                              \
-        int o_16[8] = { 0 };                                      \
-        for (i = 0; i < 8; i++)                                   \
-            for (j = 1; j < 16; j += 2)                           \
-                o_16[i] += transform[2 * j][i] * src[j * sstep];  \
-        TR_8(e_16, src, 1, 2 * sstep, SET);                       \
-                                                                  \
-        for (i = 0; i < 8; i++) {                                 \
-            assign(dst[i * dstep], e_16[i] + o_16[i]);            \
-            assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]);     \
-        }                                                         \
+#define TR_16(dst, src, dstep, sstep, assign, end)                             \
+    do {                                                                       \
+        int i, j;                                                              \
+        int e_16[8];                                                           \
+        int o_16[8] = { 0 };                                                   \
+        for (i = 0; i < 8; i++)                                                \
+            for (j = 1; j < end; j += 2)                                       \
+                o_16[i] += transform[2 * j][i] * src[j * sstep];               \
+        TR_8(e_16, src, 1, 2 * sstep, SET, 8);                                 \
+                                                                               \
+        for (i = 0; i < 8; i++) {                                              \
+            assign(dst[i * dstep], e_16[i] + o_16[i]);                         \
+            assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]);                  \
+        }                                                                      \
     } while (0)
 
-#define TR_32(dst, src, dstep, sstep, assign)                     \
-    do {                                                          \
-        int i, j;                                                 \
-        int e_32[16];                                             \
-        int o_32[16] = { 0 };                                     \
-        for (i = 0; i < 16; i++)                                  \
-            for (j = 1; j < 32; j += 2)                           \
-                o_32[i] += transform[j][i] * src[j * sstep];      \
-        TR_16(e_32, src, 1, 2 * sstep, SET);                      \
-                                                                  \
-        for (i = 0; i < 16; i++) {                                \
-            assign(dst[i * dstep], e_32[i] + o_32[i]);            \
-            assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]);     \
-        }                                                         \
+#define TR_32(dst, src, dstep, sstep, assign, end)                             \
+    do {                                                                       \
+        int i, j;                                                              \
+        int e_32[16];                                                          \
+        int o_32[16] = { 0 };                                                  \
+        for (i = 0; i < 16; i++)                                               \
+            for (j = 1; j < end; j += 2)                                       \
+                o_32[i] += transform[j][i] * src[j * sstep];                   \
+        TR_16(e_32, src, 1, 2 * sstep, SET, end/2);                            \
+                                                                               \
+        for (i = 0; i < 16; i++) {                                             \
+            assign(dst[i * dstep], e_32[i] + o_32[i]);                         \
+            assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]);                  \
+        }                                                                      \
     } while (0)
 
-
-
-static void FUNC(transform_8x8_add)(uint8_t *_dst, int16_t *coeffs,
-                                    ptrdiff_t stride)
-{
-    int i;
-    pixel *dst   = (pixel *)_dst;
-    int shift    = 7;
-    int add      = 1 << (shift - 1);
-    int16_t *src = coeffs;
-
-    stride /= sizeof(pixel);
-
-    for (i = 0; i < 8; i++) {
-        TR_8(src, src, 8, 8, SCALE);
-        src++;
-    }
-
-    shift = 20 - BIT_DEPTH;
-    add   = 1 << (shift - 1);
-    for (i = 0; i < 8; i++) {
-        TR_8(dst, coeffs, 1, 1, ADD_AND_SCALE);
-        coeffs += 8;
-        dst    += stride;
-    }
+#define IDCT_VAR4(H)                                                          \
+    int      limit2   = FFMIN(col_limit + 4, H)
+#define IDCT_VAR8(H)                                                          \
+        int      limit   = FFMIN(col_limit, H);                               \
+        int      limit2   = FFMIN(col_limit + 4, H)
+#define IDCT_VAR16(H)   IDCT_VAR8(H)
+#define IDCT_VAR32(H)   IDCT_VAR8(H)
+
+#define IDCT(H)                                                              \
+static void FUNC(idct_##H ##x ##H )(                                         \
+                   int16_t *coeffs, int col_limit) {                         \
+    int i;                                                                   \
+    int      shift   = 7;                                                    \
+    int      add     = 1 << (shift - 1);                                     \
+    int16_t *src     = coeffs;                                               \
+    IDCT_VAR ##H(H);                                                         \
+                                                                             \
+    for (i = 0; i < H; i++) {                                                \
+        TR_ ## H(src, src, H, H, SCALE, limit2);                             \
+        if (limit2 < H && i%4 == 0 && !!i)                                   \
+            limit2 -= 4;                                                     \
+        src++;                                                               \
+    }                                                                        \
+                                                                             \
+    shift   = 20 - BIT_DEPTH;                                                \
+    add     = 1 << (shift - 1);                                              \
+    for (i = 0; i < H; i++) {                                                \
+        TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit);                        \
+        coeffs += H;                                                         \
+    }                                                                        \
 }
 
-static void FUNC(transform_16x16_add)(uint8_t *_dst, int16_t *coeffs,
-                                      ptrdiff_t stride)
-{
-    int i;
-    pixel *dst   = (pixel *)_dst;
-    int shift    = 7;
-    int add      = 1 << (shift - 1);
-    int16_t *src = coeffs;
-
-    stride /= sizeof(pixel);
-
-    for (i = 0; i < 16; i++) {
-        TR_16(src, src, 16, 16, SCALE);
-        src++;
-    }
-
-    shift = 20 - BIT_DEPTH;
-    add   = 1 << (shift - 1);
-    for (i = 0; i < 16; i++) {
-        TR_16(dst, coeffs, 1, 1, ADD_AND_SCALE);
-        coeffs += 16;
-        dst    += stride;
-    }
+#define IDCT_DC(H)                                                           \
+static void FUNC(idct_##H ##x ##H ##_dc)(                                    \
+                   int16_t *coeffs) {                                        \
+    int i, j;                                                                \
+    int      shift   = 14 - BIT_DEPTH;                                       \
+    int      add     = 1 << (shift - 1);                                     \
+    int      coeff   = (((coeffs[0] + 1) >> 1) + add) >> shift;              \
+                                                                             \
+    for (j = 0; j < H; j++) {                                                \
+        for (i = 0; i < H; i++) {                                            \
+            coeffs[i+j*H] = coeff;                                           \
+        }                                                                    \
+    }                                                                        \
 }
 
-static void FUNC(transform_32x32_add)(uint8_t *_dst, int16_t *coeffs,
-                                      ptrdiff_t stride)
-{
-    int i;
-    pixel *dst   = (pixel *)_dst;
-    int shift    = 7;
-    int add      = 1 << (shift - 1);
-    int16_t *src = coeffs;
+IDCT( 4)
+IDCT( 8)
+IDCT(16)
+IDCT(32)
 
-    stride /= sizeof(pixel);
+IDCT_DC( 4)
+IDCT_DC( 8)
+IDCT_DC(16)
+IDCT_DC(32)
 
-    for (i = 0; i < 32; i++) {
-        TR_32(src, src, 32, 32, SCALE);
-        src++;
-    }
-    src   = coeffs;
-    shift = 20 - BIT_DEPTH;
-    add   = 1 << (shift - 1);
-    for (i = 0; i < 32; i++) {
-        TR_32(dst, coeffs, 1, 1, ADD_AND_SCALE);
-        coeffs += 32;
-        dst    += stride;
-    }
-}
+#undef TR_4
+#undef TR_8
+#undef TR_16
+#undef TR_32
+
+#undef SET
+#undef SCALE
+#undef ADD_AND_SCALE
 
 static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src,
-                                  ptrdiff_t stride, SAOParams *sao,
-                                  int *borders, int width, int height,
-                                  int c_idx, int class)
+                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
+                                  int16_t *sao_offset_val, int sao_left_class,
+                                  int width, int height)
 {
     pixel *dst = (pixel *)_dst;
     pixel *src = (pixel *)_src;
     int offset_table[32] = { 0 };
     int k, y, x;
-    int chroma = !!c_idx;
     int shift  = BIT_DEPTH - 5;
-    int *sao_offset_val = sao->offset_val[c_idx];
-    int sao_left_class  = sao->band_position[c_idx];
-    int init_y = 0, init_x = 0;
-
-    stride /= sizeof(pixel);
 
-    switch (class) {
-    case 0:
-        if (!borders[2])
-            width -= (8 >> chroma) + 2;
-        if (!borders[3])
-            height -= (4 >> chroma) + 2;
-        break;
-    case 1:
-        init_y = -(4 >> chroma) - 2;
-        if (!borders[2])
-            width -= (8 >> chroma) + 2;
-        height = (4 >> chroma) + 2;
-        break;
-    case 2:
-        init_x = -(8 >> chroma) - 2;
-        width  =  (8 >> chroma) + 2;
-        if (!borders[3])
-            height -= (4 >> chroma) + 2;
-        break;
-    case 3:
-        init_y = -(4 >> chroma) - 2;
-        init_x = -(8 >> chroma) - 2;
-        width  =  (8 >> chroma) + 2;
-        height =  (4 >> chroma) + 2;
-        break;
-    }
+    stride_dst /= sizeof(pixel);
+    stride_src /= sizeof(pixel);
 
-    dst = dst + (init_y * stride + init_x);
-    src = src + (init_y * stride + init_x);
     for (k = 0; k < 4; k++)
         offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
             dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
-        dst += stride;
-        src += stride;
+        dst += stride_dst;
+        src += stride_src;
     }
 }
 
-static void FUNC(sao_band_filter_0)(uint8_t *dst, uint8_t *src,
-                                    ptrdiff_t stride, SAOParams *sao,
-                                    int *borders, int width, int height,
-                                    int c_idx)
-{
-    FUNC(sao_band_filter)(dst, src, stride, sao, borders,
-                          width, height, c_idx, 0);
-}
+#define CMP(a, b) (((a) > (b)) - ((a) < (b)))
 
-static void FUNC(sao_band_filter_1)(uint8_t *dst, uint8_t *src,
-                                    ptrdiff_t stride, SAOParams *sao,
-                                    int *borders, int width, int height,
-                                    int c_idx)
-{
-    FUNC(sao_band_filter)(dst, src, stride, sao, borders,
-                          width, height, c_idx, 1);
-}
+static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
+                                  int eo, int width, int height) {
 
-static void FUNC(sao_band_filter_2)(uint8_t *dst, uint8_t *src,
-                                    ptrdiff_t stride, SAOParams *sao,
-                                    int *borders, int width, int height,
-                                    int c_idx)
-{
-    FUNC(sao_band_filter)(dst, src, stride, sao, borders,
-                          width, height, c_idx, 2);
-}
+    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
+    static const int8_t pos[4][2][2] = {
+        { { -1,  0 }, {  1, 0 } }, // horizontal
+        { {  0, -1 }, {  0, 1 } }, // vertical
+        { { -1, -1 }, {  1, 1 } }, // 45 degree
+        { {  1, -1 }, { -1, 1 } }, // 135 degree
+    };
+    pixel *dst = (pixel *)_dst;
+    pixel *src = (pixel *)_src;
+    int a_stride, b_stride;
+    int x, y;
+    ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel);
+    stride_dst /= sizeof(pixel);
 
-static void FUNC(sao_band_filter_3)(uint8_t *_dst, uint8_t *_src,
-                                    ptrdiff_t stride, SAOParams *sao,
-                                    int *borders, int width, int height,
-                                    int c_idx)
-{
-    FUNC(sao_band_filter)(_dst, _src, stride, sao, borders,
-                          width, height, c_idx, 3);
+    a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
+    b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            int diff0 = CMP(src[x], src[x + a_stride]);
+            int diff1 = CMP(src[x], src[x + b_stride]);
+            int offset_val        = edge_idx[2 + diff0 + diff1];
+            dst[x] = av_clip_pixel(src[x] + sao_offset_val[offset_val]);
+        }
+        src += stride_src;
+        dst += stride_dst;
+    }
 }
 
-static void FUNC(sao_edge_filter_0)(uint8_t *_dst, uint8_t *_src,
-                                    ptrdiff_t stride, SAOParams *sao,
+static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src,
+                                    ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
                                     int *borders, int _width, int _height,
-                                    int c_idx, uint8_t vert_edge,
-                                    uint8_t horiz_edge, uint8_t diag_edge)
+                                    int c_idx, uint8_t *vert_edge,
+                                    uint8_t *horiz_edge, uint8_t *diag_edge)
 {
     int x, y;
     pixel *dst = (pixel *)_dst;
     pixel *src = (pixel *)_src;
-    int chroma = !!c_idx;
-    int *sao_offset_val = sao->offset_val[c_idx];
+    int16_t *sao_offset_val = sao->offset_val[c_idx];
     int sao_eo_class    = sao->eo_class[c_idx];
-    int init_x = 0, init_y = 0, width = _width, height = _height;
+    int init_x = 0, width = _width, height = _height;
 
-    static const int8_t pos[4][2][2] = {
-        { { -1,  0 }, {  1, 0 } }, // horizontal
-        { {  0, -1 }, {  0, 1 } }, // vertical
-        { { -1, -1 }, {  1, 1 } }, // 45 degree
-        { {  1, -1 }, { -1, 1 } }, // 135 degree
-    };
-    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
-
-#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
+    stride_dst /= sizeof(pixel);
+    stride_src /= sizeof(pixel);
 
-    stride /= sizeof(pixel);
-
-    if (!borders[2])
-        width -= (8 >> chroma) + 2;
-    if (!borders[3])
-        height -= (4 >> chroma) + 2;
-
-    dst = dst + (init_y * stride + init_x);
-    src = src + (init_y * stride + init_x);
-    init_y = init_x = 0;
     if (sao_eo_class != SAO_EO_VERT) {
         if (borders[0]) {
             int offset_val = sao_offset_val[0];
-            int y_stride   = 0;
             for (y = 0; y < height; y++) {
-                dst[y_stride] = av_clip_pixel(src[y_stride] + offset_val);
-                y_stride     += stride;
+                dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val);
             }
             init_x = 1;
         }
         if (borders[2]) {
             int offset_val = sao_offset_val[0];
-            int x_stride   = width - 1;
+            int offset     = width - 1;
             for (x = 0; x < height; x++) {
-                dst[x_stride] = av_clip_pixel(src[x_stride] + offset_val);
-                x_stride     += stride;
+                dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val);
             }
             width--;
         }
@@ -467,180 +396,51 @@ static void FUNC(sao_edge_filter_0)(uint8_t *_dst, uint8_t *_src,
             int offset_val = sao_offset_val[0];
             for (x = init_x; x < width; x++)
                 dst[x] = av_clip_pixel(src[x] + offset_val);
-            init_y = 1;
         }
         if (borders[3]) {
-            int offset_val = sao_offset_val[0];
-            int y_stride   = stride * (height - 1);
+            int offset_val   = sao_offset_val[0];
+            int y_stride_dst = stride_dst * (height - 1);
+            int y_stride_src = stride_src * (height - 1);
             for (x = init_x; x < width; x++)
-                dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + offset_val);
+                dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val);
             height--;
         }
     }
-    {
-        int y_stride = init_y * stride;
-        int pos_0_0  = pos[sao_eo_class][0][0];
-        int pos_0_1  = pos[sao_eo_class][0][1];
-        int pos_1_0  = pos[sao_eo_class][1][0];
-        int pos_1_1  = pos[sao_eo_class][1][1];
-
-        int y_stride_0_1 = (init_y + pos_0_1) * stride;
-        int y_stride_1_1 = (init_y + pos_1_1) * stride;
-        for (y = init_y; y < height; y++) {
-            for (x = init_x; x < width; x++) {
-                int diff0         = CMP(src[x + y_stride], src[x + pos_0_0 + y_stride_0_1]);
-                int diff1         = CMP(src[x + y_stride], src[x + pos_1_0 + y_stride_1_1]);
-                int offset_val    = edge_idx[2 + diff0 + diff1];
-                dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + sao_offset_val[offset_val]);
-            }
-            y_stride     += stride;
-            y_stride_0_1 += stride;
-            y_stride_1_1 += stride;
-        }
-    }
-
-    {
-        // Restore pixels that can't be modified
-        int save_upper_left = !diag_edge && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1];
-        if (vert_edge && sao_eo_class != SAO_EO_VERT)
-            for (y = init_y+save_upper_left; y< height; y++)
-                dst[y*stride] = src[y*stride];
-        if(horiz_edge && sao_eo_class != SAO_EO_HORIZ)
-            for(x = init_x+save_upper_left; x<width; x++)
-                dst[x] = src[x];
-        if(diag_edge && sao_eo_class == SAO_EO_135D)
-            dst[0] = src[0];
-    }
-
-#undef CMP
 }
 
-static void FUNC(sao_edge_filter_1)(uint8_t *_dst, uint8_t *_src,
-                                    ptrdiff_t stride, SAOParams *sao,
+static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src,
+                                    ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
                                     int *borders, int _width, int _height,
-                                    int c_idx, uint8_t vert_edge,
-                                    uint8_t horiz_edge, uint8_t diag_edge)
+                                    int c_idx, uint8_t *vert_edge,
+                                    uint8_t *horiz_edge, uint8_t *diag_edge)
 {
     int x, y;
     pixel *dst = (pixel *)_dst;
     pixel *src = (pixel *)_src;
-    int chroma = !!c_idx;
-    int *sao_offset_val = sao->offset_val[c_idx];
+    int16_t *sao_offset_val = sao->offset_val[c_idx];
     int sao_eo_class    = sao->eo_class[c_idx];
     int init_x = 0, init_y = 0, width = _width, height = _height;
 
-    static const int8_t pos[4][2][2] = {
-        { { -1, 0  }, { 1,  0 } }, // horizontal
-        { { 0,  -1 }, { 0,  1 } }, // vertical
-        { { -1, -1 }, { 1,  1 } }, // 45 degree
-        { { 1,  -1 }, { -1, 1 } }, // 135 degree
-    };
-    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
-
-#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
-
-    stride /= sizeof(pixel);
-
-    init_y = -(4 >> chroma) - 2;
-    if (!borders[2])
-        width -= (8 >> chroma) + 2;
-    height = (4 >> chroma) + 2;
+    stride_dst /= sizeof(pixel);
+    stride_src /= sizeof(pixel);
 
-    dst = dst + (init_y * stride + init_x);
-    src = src + (init_y * stride + init_x);
-    init_y = init_x = 0;
     if (sao_eo_class != SAO_EO_VERT) {
         if (borders[0]) {
             int offset_val = sao_offset_val[0];
-            int y_stride   = 0;
             for (y = 0; y < height; y++) {
-                dst[y_stride] = av_clip_pixel(src[y_stride] + offset_val);
-                y_stride     += stride;
+                dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val);
             }
             init_x = 1;
         }
         if (borders[2]) {
             int offset_val = sao_offset_val[0];
-            int x_stride   = width - 1;
+            int offset     = width - 1;
             for (x = 0; x < height; x++) {
-                dst[x_stride] = av_clip_pixel(src[x_stride] + offset_val);
-                x_stride     += stride;
+                dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val);
             }
             width--;
         }
     }
-    {
-        int y_stride = init_y * stride;
-        int pos_0_0  = pos[sao_eo_class][0][0];
-        int pos_0_1  = pos[sao_eo_class][0][1];
-        int pos_1_0  = pos[sao_eo_class][1][0];
-        int pos_1_1  = pos[sao_eo_class][1][1];
-
-        int y_stride_0_1 = (init_y + pos_0_1) * stride;
-        int y_stride_1_1 = (init_y + pos_1_1) * stride;
-        for (y = init_y; y < height; y++) {
-            for (x = init_x; x < width; x++) {
-                int diff0         = CMP(src[x + y_stride], src[x + pos_0_0 + y_stride_0_1]);
-                int diff1         = CMP(src[x + y_stride], src[x + pos_1_0 + y_stride_1_1]);
-                int offset_val    = edge_idx[2 + diff0 + diff1];
-                dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + sao_offset_val[offset_val]);
-            }
-            y_stride     += stride;
-            y_stride_0_1 += stride;
-            y_stride_1_1 += stride;
-        }
-    }
-
-    {
-        // Restore pixels that can't be modified
-        int save_lower_left = !diag_edge && sao_eo_class == SAO_EO_45D && !borders[0];
-        if(vert_edge && sao_eo_class != SAO_EO_VERT)
-            for(y = init_y; y< height-save_lower_left; y++)
-                dst[y*stride] = src[y*stride];
-        if(horiz_edge && sao_eo_class != SAO_EO_HORIZ)
-            for(x = init_x+save_lower_left; x<width; x++)
-                dst[(height-1)*stride+x] = src[(height-1)*stride+x];
-        if(diag_edge && sao_eo_class == SAO_EO_45D)
-            dst[stride*(height-1)] = src[stride*(height-1)];
-    }
-
-#undef CMP
-}
-
-static void FUNC(sao_edge_filter_2)(uint8_t *_dst, uint8_t *_src,
-                                    ptrdiff_t stride, SAOParams *sao,
-                                    int *borders, int _width, int _height,
-                                    int c_idx, uint8_t vert_edge,
-                                    uint8_t horiz_edge, uint8_t diag_edge)
-{
-    int x, y;
-    pixel *dst = (pixel *)_dst;
-    pixel *src = (pixel *)_src;
-    int chroma = !!c_idx;
-    int *sao_offset_val = sao->offset_val[c_idx];
-    int sao_eo_class    = sao->eo_class[c_idx];
-    int init_x = 0, init_y = 0, width = _width, height = _height;
-
-    static const int8_t pos[4][2][2] = {
-        { { -1,  0 }, {  1, 0 } }, // horizontal
-        { {  0, -1 }, {  0, 1 } }, // vertical
-        { { -1, -1 }, {  1, 1 } }, // 45 degree
-        { {  1, -1 }, { -1, 1 } }, // 135 degree
-    };
-    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
-
-#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
-
-    stride /= sizeof(pixel);
-
-    init_x = -(8 >> chroma) - 2;
-    width  =  (8 >> chroma) + 2;
-    if (!borders[3])
-        height -= (4 >> chroma) + 2;
-
-    dst = dst + (init_y * stride + init_x);
-    src = src + (init_y * stride + init_x);
-    init_y = init_x = 0;
     if (sao_eo_class != SAO_EO_HORIZ) {
         if (borders[1]) {
             int offset_val = sao_offset_val[0];
@@ -649,430 +449,674 @@ static void FUNC(sao_edge_filter_2)(uint8_t *_dst, uint8_t *_src,
             init_y = 1;
         }
         if (borders[3]) {
-            int offset_val = sao_offset_val[0];
-            int y_stride   = stride * (height - 1);
+            int offset_val   = sao_offset_val[0];
+            int y_stride_dst = stride_dst * (height - 1);
+            int y_stride_src = stride_src * (height - 1);
             for (x = init_x; x < width; x++)
-                dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + offset_val);
+                dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val);
             height--;
         }
     }
-    {
-        int y_stride = init_y * stride;
-        int pos_0_0  = pos[sao_eo_class][0][0];
-        int pos_0_1  = pos[sao_eo_class][0][1];
-        int pos_1_0  = pos[sao_eo_class][1][0];
-        int pos_1_1  = pos[sao_eo_class][1][1];
-
-        int y_stride_0_1 = (init_y + pos_0_1) * stride;
-        int y_stride_1_1 = (init_y + pos_1_1) * stride;
-        for (y = init_y; y < height; y++) {
-            for (x = init_x; x < width; x++) {
-                int diff0         = CMP(src[x + y_stride], src[x + pos_0_0 + y_stride_0_1]);
-                int diff1         = CMP(src[x + y_stride], src[x + pos_1_0 + y_stride_1_1]);
-                int offset_val    = edge_idx[2 + diff0 + diff1];
-                dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + sao_offset_val[offset_val]);
-            }
-            y_stride     += stride;
-            y_stride_0_1 += stride;
-            y_stride_1_1 += stride;
-        }
-    }
 
     {
+        int save_upper_left  = !diag_edge[0] && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1];
+        int save_upper_right = !diag_edge[1] && sao_eo_class == SAO_EO_45D  && !borders[1] && !borders[2];
+        int save_lower_right = !diag_edge[2] && sao_eo_class == SAO_EO_135D && !borders[2] && !borders[3];
+        int save_lower_left  = !diag_edge[3] && sao_eo_class == SAO_EO_45D  && !borders[0] && !borders[3];
+
         // Restore pixels that can't be modified
-        int save_upper_right = !diag_edge && sao_eo_class == SAO_EO_45D && !borders[1];
-        if(vert_edge && sao_eo_class != SAO_EO_VERT)
-            for(y = init_y+save_upper_right; y< height; y++)
-                dst[y*stride+width-1] = src[y*stride+width-1];
-        if(horiz_edge && sao_eo_class != SAO_EO_HORIZ)
-            for(x = init_x; x<width-save_upper_right; x++)
+        if(vert_edge[0] && sao_eo_class != SAO_EO_VERT) {
+            for(y = init_y+save_upper_left; y< height-save_lower_left; y++)
+                dst[y*stride_dst] = src[y*stride_src];
+        }
+        if(vert_edge[1] && sao_eo_class != SAO_EO_VERT) {
+            for(y = init_y+save_upper_right; y< height-save_lower_right; y++)
+                dst[y*stride_dst+width-1] = src[y*stride_src+width-1];
+        }
+
+        if(horiz_edge[0] && sao_eo_class != SAO_EO_HORIZ) {
+            for(x = init_x+save_upper_left; x < width-save_upper_right; x++)
                 dst[x] = src[x];
-        if(diag_edge && sao_eo_class == SAO_EO_45D)
+        }
+        if(horiz_edge[1] && sao_eo_class != SAO_EO_HORIZ) {
+            for(x = init_x+save_lower_left; x < width-save_lower_right; x++)
+                dst[(height-1)*stride_dst+x] = src[(height-1)*stride_src+x];
+        }
+        if(diag_edge[0] && sao_eo_class == SAO_EO_135D)
+            dst[0] = src[0];
+        if(diag_edge[1] && sao_eo_class == SAO_EO_45D)
             dst[width-1] = src[width-1];
+        if(diag_edge[2] && sao_eo_class == SAO_EO_135D)
+            dst[stride_dst*(height-1)+width-1] = src[stride_src*(height-1)+width-1];
+        if(diag_edge[3] && sao_eo_class == SAO_EO_45D)
+            dst[stride_dst*(height-1)] = src[stride_src*(height-1)];
+
     }
+}
+
 #undef CMP
+
+////////////////////////////////////////////////////////////////////////////////
+//
+////////////////////////////////////////////////////////////////////////////////
+static void FUNC(put_hevc_pel_pixels)(int16_t *dst,
+                                      uint8_t *_src, ptrdiff_t _srcstride,
+                                      int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src          = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = src[x] << (14 - BIT_DEPTH);
+        src += srcstride;
+        dst += MAX_PB_SIZE;
+    }
 }
 
-static void FUNC(sao_edge_filter_3)(uint8_t *_dst, uint8_t *_src,
-                                    ptrdiff_t stride, SAOParams *sao,
-                                    int *borders, int _width, int _height,
-                                    int c_idx, uint8_t vert_edge,
-                                    uint8_t horiz_edge, uint8_t diag_edge)
+static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                          int height, intptr_t mx, intptr_t my, int width)
+{
+    int y;
+    pixel *src          = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+
+    for (y = 0; y < height; y++) {
+        memcpy(dst, src, width * sizeof(pixel));
+        src += srcstride;
+        dst += dststride;
+    }
+}
+
+static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                         int16_t *src2,
+                                         int height, intptr_t mx, intptr_t my, int width)
 {
     int x, y;
-    pixel *dst = (pixel *)_dst;
-    pixel *src = (pixel *)_src;
-    int chroma = !!c_idx;
-    int *sao_offset_val = sao->offset_val[c_idx];
-    int sao_eo_class    = sao->eo_class[c_idx];
-    int init_x = 0, init_y = 0, width = _width, height = _height;
+    pixel *src          = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
 
-    static const int8_t pos[4][2][2] = {
-        { { -1,  0 }, {  1, 0 } }, // horizontal
-        { {  0, -1 }, {  0, 1 } }, // vertical
-        { { -1, -1 }, {  1, 1 } }, // 45 degree
-        { {  1, -1 }, { -1, 1 } }, // 135 degree
-    };
-    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
+    int shift = 14  + 1 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
 
-#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((src[x] << (14 - BIT_DEPTH)) + src2[x] + offset) >> shift);
+        src  += srcstride;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
+    }
+}
 
-    stride /= sizeof(pixel);
+static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                            int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src          = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
 
-    init_y = -(4 >> chroma) - 2;
-    init_x = -(8 >> chroma) - 2;
-    width  =  (8 >> chroma) + 2;
-    height =  (4 >> chroma) + 2;
+    ox     = ox * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel((((src[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox);
+        src += srcstride;
+        dst += dststride;
+    }
+}
 
+static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                           int16_t *src2,
+                                           int height, int denom, int wx0, int wx1,
+                                           int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src          = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
 
-    dst    = dst + (init_y * stride + init_x);
-    src    = src + (init_y * stride + init_x);
-    init_y = init_x = 0;
+    int shift = 14  + 1 - BIT_DEPTH;
+    int log2Wd = denom + shift - 1;
 
-    {
-        int y_stride = init_y * stride;
-        int pos_0_0  = pos[sao_eo_class][0][0];
-        int pos_0_1  = pos[sao_eo_class][0][1];
-        int pos_1_0  = pos[sao_eo_class][1][0];
-        int pos_1_1  = pos[sao_eo_class][1][1];
-
-        int y_stride_0_1 = (init_y + pos_0_1) * stride;
-        int y_stride_1_1 = (init_y + pos_1_1) * stride;
-
-        for (y = init_y; y < height; y++) {
-            for (x = init_x; x < width; x++) {
-                int diff0         = CMP(src[x + y_stride], src[x + pos_0_0 + y_stride_0_1]);
-                int diff1         = CMP(src[x + y_stride], src[x + pos_1_0 + y_stride_1_1]);
-                int offset_val    = edge_idx[2 + diff0 + diff1];
-                dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + sao_offset_val[offset_val]);
-            }
-            y_stride     += stride;
-            y_stride_0_1 += stride;
-            y_stride_1_1 += stride;
+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src2[x] * wx0 + ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
         }
+        src  += srcstride;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
     }
+}
 
-    {
-        // Restore pixels that can't be modified
-        int save_lower_right = !diag_edge && sao_eo_class == SAO_EO_135D;
-        if(vert_edge && sao_eo_class != SAO_EO_VERT)
-            for(y = init_y; y< height-save_lower_right; y++)
-                dst[y*stride+width-1] = src[y*stride+width-1];
-        if(horiz_edge && sao_eo_class != SAO_EO_HORIZ)
-            for(x = init_x; x<width-save_lower_right; x++)
-                dst[(height-1)*stride+x] = src[(height-1)*stride+x];
-        if(diag_edge && sao_eo_class == SAO_EO_135D)
-            dst[stride*(height-1)+width-1] = src[stride*(height-1)+width-1];
+////////////////////////////////////////////////////////////////////////////////
+//
+////////////////////////////////////////////////////////////////////////////////
+#define QPEL_FILTER(src, stride)                                               \
+    (filter[0] * src[x - 3 * stride] +                                         \
+     filter[1] * src[x - 2 * stride] +                                         \
+     filter[2] * src[x -     stride] +                                         \
+     filter[3] * src[x             ] +                                         \
+     filter[4] * src[x +     stride] +                                         \
+     filter[5] * src[x + 2 * stride] +                                         \
+     filter[6] * src[x + 3 * stride] +                                         \
+     filter[7] * src[x + 4 * stride])
+
+static void FUNC(put_hevc_qpel_h)(int16_t *dst,
+                                  uint8_t *_src, ptrdiff_t _srcstride,
+                                  int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
+    const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        dst += MAX_PB_SIZE;
     }
-#undef CMP
 }
 
-#undef SET
-#undef SCALE
-#undef ADD_AND_SCALE
-#undef TR_4
-#undef TR_8
-#undef TR_16
-#undef TR_32
+static void FUNC(put_hevc_qpel_v)(int16_t *dst,
+                                  uint8_t *_src, ptrdiff_t _srcstride,
+                                  int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
+    const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
+    for (y = 0; y < height; y++)  {
+        for (x = 0; x < width; x++)
+            dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        dst += MAX_PB_SIZE;
+    }
+}
 
-static av_always_inline void
-FUNC(put_hevc_qpel_pixels)(int16_t *dst, ptrdiff_t dststride,
-                           uint8_t *_src, ptrdiff_t _srcstride,
-                           int width, int height, int mx, int my,
-                           int16_t* mcbuffer)
+static void FUNC(put_hevc_qpel_hv)(int16_t *dst,
+                                   uint8_t *_src,
+                                   ptrdiff_t _srcstride,
+                                   int height, intptr_t mx,
+                                   intptr_t my, int width)
 {
     int x, y;
-    pixel *src          = (pixel *)_src;
+    const int8_t *filter;
+    pixel *src = (pixel*)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
+    int16_t *tmp = tmp_array;
+
+    src   -= QPEL_EXTRA_BEFORE * srcstride;
+    filter = ff_hevc_qpel_filters[mx - 1];
+    for (y = 0; y < height + QPEL_EXTRA; y++) {
+        for (x = 0; x < width; x++)
+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
 
-    dststride /= sizeof(*dst);
+    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    filter = ff_hevc_qpel_filters[my - 1];
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
-            dst[x] = src[x] << (14 - BIT_DEPTH);
+            dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
+        tmp += MAX_PB_SIZE;
+        dst += MAX_PB_SIZE;
+    }
+}
+
+static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst,  ptrdiff_t _dststride,
+                                      uint8_t *_src, ptrdiff_t _srcstride,
+                                      int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
+    int shift = 14 - BIT_DEPTH;
+
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
         src += srcstride;
         dst += dststride;
     }
 }
 
-#define QPEL_FILTER_1(src, stride)      \
-    (1 * -src[x - 3 * stride] +         \
-     4 *  src[x - 2 * stride] -         \
-    10 *  src[x -     stride] +         \
-    58 *  src[x]              +         \
-    17 *  src[x +     stride] -         \
-     5 *  src[x + 2 * stride] +         \
-     1 *  src[x + 3 * stride])
-
-#define QPEL_FILTER_2(src, stride)      \
-    (1  * -src[x - 3 * stride] +        \
-     4  *  src[x - 2 * stride] -        \
-    11  *  src[x -     stride] +        \
-    40  *  src[x]              +        \
-    40  *  src[x +     stride] -        \
-    11  *  src[x + 2 * stride] +        \
-     4  *  src[x + 3 * stride] -        \
-     1  *  src[x + 4 * stride])
-
-#define QPEL_FILTER_3(src, stride)      \
-    (1  * src[x - 2 * stride] -         \
-     5  * src[x -     stride] +         \
-    17  * src[x]              +         \
-    58  * src[x + stride]     -         \
-    10  * src[x + 2 * stride] +         \
-     4  * src[x + 3 * stride] -         \
-     1  * src[x + 4 * stride])
-
-
-#define PUT_HEVC_QPEL_H(H)                                                     \
-static void FUNC(put_hevc_qpel_h ## H)(int16_t *dst,  ptrdiff_t dststride,     \
-                                       uint8_t *_src, ptrdiff_t _srcstride,    \
-                                       int width, int height,                  \
-                                       int16_t* mcbuffer)                      \
-{                                                                              \
-    int x, y;                                                                  \
-    pixel *src = (pixel*)_src;                                                 \
-    ptrdiff_t srcstride = _srcstride / sizeof(pixel);                          \
-                                                                               \
-    dststride /= sizeof(*dst);                                                 \
-    for (y = 0; y < height; y++) {                                             \
-        for (x = 0; x < width; x++)                                            \
-            dst[x] = QPEL_FILTER_ ## H(src, 1) >> (BIT_DEPTH - 8);             \
-        src += srcstride;                                                      \
-        dst += dststride;                                                      \
-    }                                                                          \
+static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                     int16_t *src2,
+                                     int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+
+    const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
+
+    int shift = 14  + 1 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
+        src  += srcstride;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
+    }
 }
 
-#define PUT_HEVC_QPEL_V(V)                                                     \
-static void FUNC(put_hevc_qpel_v ## V)(int16_t *dst,  ptrdiff_t dststride,     \
-                                       uint8_t *_src, ptrdiff_t _srcstride,    \
-                                       int width, int height,                  \
-                                       int16_t* mcbuffer)                      \
-{                                                                              \
-    int x, y;                                                                  \
-    pixel *src = (pixel*)_src;                                                 \
-    ptrdiff_t srcstride = _srcstride / sizeof(pixel);                          \
-                                                                               \
-    dststride /= sizeof(*dst);                                                 \
-    for (y = 0; y < height; y++)  {                                            \
-        for (x = 0; x < width; x++)                                            \
-            dst[x] = QPEL_FILTER_ ## V(src, srcstride) >> (BIT_DEPTH - 8);     \
-        src += srcstride;                                                      \
-        dst += dststride;                                                      \
-    }                                                                          \
+static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst,  ptrdiff_t _dststride,
+                                     uint8_t *_src, ptrdiff_t _srcstride,
+                                     int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
+    int shift = 14 - BIT_DEPTH;
+
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
+        src += srcstride;
+        dst += dststride;
+    }
 }
 
-#define PUT_HEVC_QPEL_HV(H, V)                                                 \
-static void FUNC(put_hevc_qpel_h ## H ## v ## V)(int16_t *dst,                 \
-                                                 ptrdiff_t dststride,          \
-                                                 uint8_t *_src,                \
-                                                 ptrdiff_t _srcstride,         \
-                                                 int width, int height,        \
-                                                 int16_t* mcbuffer)            \
-{                                                                              \
-    int x, y;                                                                  \
-    pixel *src = (pixel*)_src;                                                 \
-    ptrdiff_t srcstride = _srcstride / sizeof(pixel);                          \
-                                                                               \
-    int16_t tmp_array[(MAX_PB_SIZE + 7) * MAX_PB_SIZE];                        \
-    int16_t *tmp = tmp_array;                                                  \
-                                                                               \
-    dststride /= sizeof(*dst);                                                 \
-    src -= ff_hevc_qpel_extra_before[V] * srcstride;                           \
-                                                                               \
-    for (y = 0; y < height + ff_hevc_qpel_extra[V]; y++) {                     \
-        for (x = 0; x < width; x++)                                            \
-            tmp[x] = QPEL_FILTER_ ## H(src, 1) >> (BIT_DEPTH - 8);             \
-        src += srcstride;                                                      \
-        tmp += MAX_PB_SIZE;                                                    \
-    }                                                                          \
-                                                                               \
-    tmp = tmp_array + ff_hevc_qpel_extra_before[V] * MAX_PB_SIZE;              \
-                                                                               \
-    for (y = 0; y < height; y++) {                                             \
-        for (x = 0; x < width; x++)                                            \
-            dst[x] = QPEL_FILTER_ ## V(tmp, MAX_PB_SIZE) >> 6;                 \
-        tmp += MAX_PB_SIZE;                                                    \
-        dst += dststride;                                                      \
-    }                                                                          \
+
+static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                     int16_t *src2,
+                                     int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+
+    const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
+
+    int shift = 14 + 1 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
+        src  += srcstride;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
+    }
 }
 
-PUT_HEVC_QPEL_H(1)
-PUT_HEVC_QPEL_H(2)
-PUT_HEVC_QPEL_H(3)
-PUT_HEVC_QPEL_V(1)
-PUT_HEVC_QPEL_V(2)
-PUT_HEVC_QPEL_V(3)
-PUT_HEVC_QPEL_HV(1, 1)
-PUT_HEVC_QPEL_HV(1, 2)
-PUT_HEVC_QPEL_HV(1, 3)
-PUT_HEVC_QPEL_HV(2, 1)
-PUT_HEVC_QPEL_HV(2, 2)
-PUT_HEVC_QPEL_HV(2, 3)
-PUT_HEVC_QPEL_HV(3, 1)
-PUT_HEVC_QPEL_HV(3, 2)
-PUT_HEVC_QPEL_HV(3, 3)
-
-#define QPEL(W)                                                                             \
-static void FUNC(put_hevc_qpel_pixels_ ## W)(int16_t *dst, ptrdiff_t dststride,             \
-                                             uint8_t *src, ptrdiff_t srcstride,             \
-                                             int height, int mx, int my,                    \
-                                             int16_t *mcbuffer)                             \
-{                                                                                           \
-    FUNC(put_hevc_qpel_pixels)(dst, dststride, src, srcstride, W, height,                   \
-                               mx, my, mcbuffer);                                           \
-}                                                                                           \
-                                                                                            \
-static void FUNC(put_hevc_qpel_h_ ## W)(int16_t *dst, ptrdiff_t dststride,                  \
-                                        uint8_t *src, ptrdiff_t srcstride,                  \
-                                        int height, int mx, int my,                         \
-                                        int16_t *mcbuffer)                                  \
-{                                                                                           \
-    if (mx == 1)                                                                            \
-        FUNC(put_hevc_qpel_h1)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
-    else if (mx == 2)                                                                       \
-        FUNC(put_hevc_qpel_h2)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
-    else                                                                                    \
-        FUNC(put_hevc_qpel_h3)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
-}                                                                                           \
-                                                                                            \
-static void FUNC(put_hevc_qpel_v_ ## W)(int16_t *dst, ptrdiff_t dststride,                  \
-                                             uint8_t *src, ptrdiff_t srcstride,             \
-                                             int height, int mx, int my,                    \
-                                             int16_t *mcbuffer)                             \
-{                                                                                           \
-    if (my == 1)                                                                            \
-        FUNC(put_hevc_qpel_v1)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
-    else if (my == 2)                                                                       \
-        FUNC(put_hevc_qpel_v2)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
-    else                                                                                    \
-        FUNC(put_hevc_qpel_v3)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
-}                                                                                           \
-                                                                                            \
-static void FUNC(put_hevc_qpel_hv_ ## W)(int16_t *dst, ptrdiff_t dststride,                 \
-                                             uint8_t *src, ptrdiff_t srcstride,             \
-                                             int height, int mx, int my,                    \
-                                             int16_t *mcbuffer)                             \
-{                                                                                           \
-    if (my == 1) {                                                                          \
-        if (mx == 1)                                                                        \
-            FUNC(put_hevc_qpel_h1v1)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
-        else if (mx == 2)                                                                   \
-            FUNC(put_hevc_qpel_h2v1)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
-        else                                                                                \
-            FUNC(put_hevc_qpel_h3v1)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
-    } else if (my == 2) {                                                                   \
-        if (mx == 1)                                                                        \
-            FUNC(put_hevc_qpel_h1v2)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
-        else if (mx == 2)                                                                   \
-            FUNC(put_hevc_qpel_h2v2)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
-        else                                                                                \
-            FUNC(put_hevc_qpel_h3v2)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
-    } else {                                                                                \
-        if (mx == 1)                                                                        \
-            FUNC(put_hevc_qpel_h1v3)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
-        else if (mx == 2)                                                                   \
-            FUNC(put_hevc_qpel_h2v3)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
-        else                                                                                \
-            FUNC(put_hevc_qpel_h3v3)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
-    }                                                                                       \
+static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
+                                       uint8_t *_src, ptrdiff_t _srcstride,
+                                       int height, intptr_t mx, intptr_t my, int width)
+{ /* Two passes: stride-1 QPEL_FILTER (mx) of _src into tmp, then stride-MAX_PB_SIZE QPEL_FILTER (my) of tmp, clipped into _dst. */
+    int x, y;
+    const int8_t *filter;
+    pixel *src = (pixel*)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel); /* stride in units of pixel */
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel); /* stride in units of pixel */
+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; /* first-pass rows, MAX_PB_SIZE apart */
+    int16_t *tmp = tmp_array;
+    int shift =  14 - BIT_DEPTH;
+
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1); /* rounding term: half of 1 << shift */
+#else
+    int offset = 0;
+#endif
+
+    src   -= QPEL_EXTRA_BEFORE * srcstride; /* back up QPEL_EXTRA_BEFORE rows */
+    filter = ff_hevc_qpel_filters[mx - 1];
+    for (y = 0; y < height + QPEL_EXTRA; y++) { /* first pass covers height + QPEL_EXTRA rows */
+        for (x = 0; x < width; x++)
+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; /* skip the QPEL_EXTRA_BEFORE leading rows */
+    filter = ff_hevc_qpel_filters[my - 1];
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
+        tmp += MAX_PB_SIZE;
+        dst += dststride;
+    }
 }
 
-QPEL(64)
-QPEL(48)
-QPEL(32)
-QPEL(24)
-QPEL(16)
-QPEL(12)
-QPEL(8)
-QPEL(4)
-
-static inline void FUNC(put_hevc_epel_pixels)(int16_t *dst, ptrdiff_t dststride,
-                                              uint8_t *_src, ptrdiff_t _srcstride,
-                                              int width, int height, int mx, int my,
-                                              int16_t* mcbuffer)
+static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                      int16_t *src2,
+                                      int height, intptr_t mx, intptr_t my, int width) /* two-pass QPEL filter of _src, summed with src2 and clipped into _dst */
 {
     int x, y;
-    pixel *src          = (pixel *)_src;
+    const int8_t *filter;
+    pixel *src = (pixel*)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel); /* stride in units of pixel */
+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; /* first-pass rows, MAX_PB_SIZE apart */
+    int16_t *tmp = tmp_array;
+    int shift = 14 + 1 - BIT_DEPTH; /* one bit more than the uni variant: two values are summed */
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1); /* rounding term: half of 1 << shift */
+#else
+    int offset = 0;
+#endif
+
+    src   -= QPEL_EXTRA_BEFORE * srcstride; /* back up QPEL_EXTRA_BEFORE rows */
+    filter = ff_hevc_qpel_filters[mx - 1];
+    for (y = 0; y < height + QPEL_EXTRA; y++) { /* first pass covers height + QPEL_EXTRA rows */
+        for (x = 0; x < width; x++)
+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; /* skip the QPEL_EXTRA_BEFORE leading rows */
+    filter = ff_hevc_qpel_filters[my - 1];
 
-    dststride /= sizeof(*dst);
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
-            dst[x] = src[x] << (14 - BIT_DEPTH);
+            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
+        tmp  += MAX_PB_SIZE;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE; /* src2 rows are MAX_PB_SIZE elements apart */
+    }
+}
+
+static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst,  ptrdiff_t _dststride,
+                                        uint8_t *_src, ptrdiff_t _srcstride,
+                                        int height, int denom, int wx, int ox,
+                                        intptr_t mx, intptr_t my, int width)
+{ /* Stride-1 QPEL_FILTER (mx) of _src, multiplied by wx, rounded and shifted, plus ox, clipped into _dst; my is not used. */
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel); /* stride in units of pixel */
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel); /* stride in units of pixel */
+    const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
+    int shift = denom + 14 - BIT_DEPTH; /* denom adds to the base shift */
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1); /* rounding term: half of 1 << shift */
+#else
+    int offset = 0;
+#endif
+
+    ox = ox * (1 << (BIT_DEPTH - 8)); /* scale ox by 2^(BIT_DEPTH - 8) */
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel((((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
         src += srcstride;
         dst += dststride;
     }
 }
 
-#define EPEL_FILTER(src, stride)                \
-    (filter_0 * src[x - stride] +               \
-     filter_1 * src[x]          +               \
-     filter_2 * src[x + stride] +               \
-     filter_3 * src[x + 2 * stride])
+static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                       int16_t *src2,
+                                       int height, int denom, int wx0, int wx1,
+                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{ /* Weighted sum of src2 (weight wx0) and the stride-1 QPEL_FILTER (mx) of _src (weight wx1), clipped into _dst; my is not used. */
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel); /* stride in units of pixel */
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel); /* stride in units of pixel */
+
+    const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
+
+    int shift = 14  + 1 - BIT_DEPTH;
+    int log2Wd = denom + shift - 1;
 
-static inline void FUNC(put_hevc_epel_h)(int16_t *dst, ptrdiff_t dststride,
+    ox0     = ox0 * (1 << (BIT_DEPTH - 8)); /* scale offsets by 2^(BIT_DEPTH - 8) */
+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) /* multiply, not <<: ox0 + ox1 + 1 may be negative and shifting it left is undefined */
+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
+                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
+        src  += srcstride;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE; /* src2 rows are MAX_PB_SIZE elements apart */
+    }
+}
+
+static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst,  ptrdiff_t _dststride,
+                                        uint8_t *_src, ptrdiff_t _srcstride,
+                                        int height, int denom, int wx, int ox,
+                                        intptr_t mx, intptr_t my, int width)
+{ /* QPEL_FILTER (my) of _src with stride srcstride, multiplied by wx, rounded and shifted, plus ox, clipped into _dst; mx is not used. */
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel); /* stride in units of pixel */
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel); /* stride in units of pixel */
+    const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
+    int shift = denom + 14 - BIT_DEPTH; /* denom adds to the base shift */
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1); /* rounding term: half of 1 << shift */
+#else
+    int offset = 0;
+#endif
+
+    ox = ox * (1 << (BIT_DEPTH - 8)); /* scale ox by 2^(BIT_DEPTH - 8) */
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel((((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
+        src += srcstride;
+        dst += dststride;
+    }
+}
+
+static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                       int16_t *src2,
+                                       int height, int denom, int wx0, int wx1,
+                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{ /* Weighted sum of src2 (weight wx0) and the QPEL_FILTER (my, stride srcstride) of _src (weight wx1), clipped into _dst; mx is not used. */
+    int x, y;
+    pixel        *src       = (pixel*)_src;
+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel); /* stride in units of pixel */
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel); /* stride in units of pixel */
+
+    const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
+
+    int shift = 14 + 1 - BIT_DEPTH;
+    int log2Wd = denom + shift - 1;
+
+    ox0     = ox0 * (1 << (BIT_DEPTH - 8)); /* scale offsets by 2^(BIT_DEPTH - 8) */
+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) /* multiply, not <<: ox0 + ox1 + 1 may be negative and shifting it left is undefined */
+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
+                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
+        src  += srcstride;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE; /* src2 rows are MAX_PB_SIZE elements apart */
+    }
+}
+
+static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
                                          uint8_t *_src, ptrdiff_t _srcstride,
-                                         int width, int height, int mx, int my,
-                                         int16_t* mcbuffer)
+                                         int height, int denom, int wx, int ox,
+                                         intptr_t mx, intptr_t my, int width)
+{ /* Two-pass QPEL filter (mx into tmp, then my over tmp), multiplied by wx, rounded and shifted, plus ox, clipped into _dst. */
+    int x, y;
+    const int8_t *filter;
+    pixel *src = (pixel*)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel); /* stride in units of pixel */
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel); /* stride in units of pixel */
+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; /* first-pass rows, MAX_PB_SIZE apart */
+    int16_t *tmp = tmp_array;
+    int shift = denom + 14 - BIT_DEPTH; /* denom adds to the base shift */
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1); /* rounding term: half of 1 << shift */
+#else
+    int offset = 0;
+#endif
+
+    src   -= QPEL_EXTRA_BEFORE * srcstride; /* back up QPEL_EXTRA_BEFORE rows */
+    filter = ff_hevc_qpel_filters[mx - 1];
+    for (y = 0; y < height + QPEL_EXTRA; y++) { /* first pass covers height + QPEL_EXTRA rows */
+        for (x = 0; x < width; x++)
+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; /* skip the QPEL_EXTRA_BEFORE leading rows */
+    filter = ff_hevc_qpel_filters[my - 1];
+
+    ox = ox * (1 << (BIT_DEPTH - 8)); /* scale ox by 2^(BIT_DEPTH - 8) */
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
+        tmp += MAX_PB_SIZE;
+        dst += dststride;
+    }
+}
+
+static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                        int16_t *src2,
+                                        int height, int denom, int wx0, int wx1,
+                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{ /* Weighted sum of src2 (weight wx0) and the two-pass QPEL filter of _src (weight wx1), clipped into _dst. */
+    int x, y;
+    const int8_t *filter;
+    pixel *src = (pixel*)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel); /* stride in units of pixel */
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel); /* stride in units of pixel */
+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; /* first-pass rows, MAX_PB_SIZE apart */
+    int16_t *tmp = tmp_array;
+    int shift = 14 + 1 - BIT_DEPTH;
+    int log2Wd = denom + shift - 1;
+
+    src   -= QPEL_EXTRA_BEFORE * srcstride; /* back up QPEL_EXTRA_BEFORE rows */
+    filter = ff_hevc_qpel_filters[mx - 1];
+    for (y = 0; y < height + QPEL_EXTRA; y++) { /* first pass covers height + QPEL_EXTRA rows */
+        for (x = 0; x < width; x++)
+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; /* skip the QPEL_EXTRA_BEFORE leading rows */
+    filter = ff_hevc_qpel_filters[my - 1];
+
+    ox0     = ox0 * (1 << (BIT_DEPTH - 8)); /* scale offsets by 2^(BIT_DEPTH - 8) */
+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) /* multiply, not <<: ox0 + ox1 + 1 may be negative and shifting it left is undefined */
+            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
+                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
+        tmp  += MAX_PB_SIZE;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE; /* src2 rows are MAX_PB_SIZE elements apart */
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// EPEL: 4-tap filter over src[x - stride] .. src[x + 2 * stride]; uses x and filter from the expansion site
+////////////////////////////////////////////////////////////////////////////////
+#define EPEL_FILTER(src, stride)                                               \
+    (filter[0] * (src)[x - (stride)] +                                         \
+     filter[1] * (src)[x]            +                                         \
+     filter[2] * (src)[x + (stride)] +                                         \
+     filter[3] * (src)[x + 2 * (stride)])
+
+static void FUNC(put_hevc_epel_h)(int16_t *dst,
+                                  uint8_t *_src, ptrdiff_t _srcstride,
+                                  int height, intptr_t mx, intptr_t my, int width) /* stride-1 EPEL_FILTER (mx) of _src into int16_t dst; my is not used */
 {
     int x, y;
     pixel *src = (pixel *)_src;
     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
-    const int16_t *filter = ff_hevc_epel_coeffs[mx - 1];
-    int8_t filter_0 = filter[0];
-    int8_t filter_1 = filter[1];
-    int8_t filter_2 = filter[2];
-    int8_t filter_3 = filter[3];
-    dststride /= sizeof(*dst);
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1]; /* 4 coefficients read by EPEL_FILTER */
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
             dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
         src += srcstride;
-        dst += dststride;
+        dst += MAX_PB_SIZE; /* dst rows are MAX_PB_SIZE elements apart */
     }
 }
 
-static inline void FUNC(put_hevc_epel_v)(int16_t *dst, ptrdiff_t dststride,
-                                         uint8_t *_src, ptrdiff_t _srcstride,
-                                         int width, int height, int mx, int my,
-                                         int16_t* mcbuffer)
+static void FUNC(put_hevc_epel_v)(int16_t *dst,
+                                  uint8_t *_src, ptrdiff_t _srcstride,
+                                  int height, intptr_t mx, intptr_t my, int width) /* EPEL_FILTER (my) across rows of _src into int16_t dst; mx is not used */
 {
     int x, y;
     pixel *src = (pixel *)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-    const int16_t *filter = ff_hevc_epel_coeffs[my - 1];
-    int8_t filter_0 = filter[0];
-    int8_t filter_1 = filter[1];
-    int8_t filter_2 = filter[2];
-    int8_t filter_3 = filter[3];
+    const int8_t *filter = ff_hevc_epel_filters[my - 1]; /* 4 coefficients read by EPEL_FILTER */
 
-    dststride /= sizeof(*dst);
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
             dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
         src += srcstride;
-        dst += dststride;
+        dst += MAX_PB_SIZE; /* dst rows are MAX_PB_SIZE elements apart */
     }
 }
 
-static inline void FUNC(put_hevc_epel_hv)(int16_t *dst, ptrdiff_t dststride,
-                                          uint8_t *_src, ptrdiff_t _srcstride,
-                                          int width, int height, int mx, int my,
-                                          int16_t* mcbuffer)
+static void FUNC(put_hevc_epel_hv)(int16_t *dst,
+                                   uint8_t *_src, ptrdiff_t _srcstride,
+                                   int height, intptr_t mx, intptr_t my, int width)
 {
     int x, y;
     pixel *src = (pixel *)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-    const int16_t *filter_h = ff_hevc_epel_coeffs[mx - 1];
-    const int16_t *filter_v = ff_hevc_epel_coeffs[my - 1];
-    int8_t filter_0 = filter_h[0];
-    int8_t filter_1 = filter_h[1];
-    int8_t filter_2 = filter_h[2];
-    int8_t filter_3 = filter_h[3];
-    int16_t tmp_array[(MAX_PB_SIZE + 3) * MAX_PB_SIZE];
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
+    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
     int16_t *tmp = tmp_array;
 
-    dststride /= sizeof(*dst);
     src -= EPEL_EXTRA_BEFORE * srcstride;
 
     for (y = 0; y < height + EPEL_EXTRA; y++) {
@@ -1083,95 +1127,101 @@ static inline void FUNC(put_hevc_epel_hv)(int16_t *dst, ptrdiff_t dststride,
     }
 
     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
-    filter_0 = filter_v[0];
-    filter_1 = filter_v[1];
-    filter_2 = filter_v[2];
-    filter_3 = filter_v[3];
+    filter = ff_hevc_epel_filters[my - 1];
+
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
             dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
         tmp += MAX_PB_SIZE;
-        dst += dststride;
+        dst += MAX_PB_SIZE;
     }
 }
 
-#define EPEL(W)                                                                 \
-static void FUNC(put_hevc_epel_pixels_ ## W)(int16_t *dst, ptrdiff_t dststride, \
-                                             uint8_t *src, ptrdiff_t srcstride, \
-                                             int height, int mx, int my,        \
-                                             int16_t *mcbuffer)                 \
-{                                                                               \
-    FUNC(put_hevc_epel_pixels)(dst, dststride, src, srcstride,                  \
-                               W, height, mx, my, mcbuffer);                    \
-}                                                                               \
-static void FUNC(put_hevc_epel_h_ ## W)(int16_t *dst, ptrdiff_t dststride,      \
-                                        uint8_t *src, ptrdiff_t srcstride,      \
-                                        int height, int mx, int my,             \
-                                        int16_t *mcbuffer)                      \
-{                                                                               \
-    FUNC(put_hevc_epel_h)(dst, dststride, src, srcstride,                       \
-                          W, height, mx, my, mcbuffer);                         \
-}                                                                               \
-static void FUNC(put_hevc_epel_v_ ## W)(int16_t *dst, ptrdiff_t dststride,      \
-                                        uint8_t *src, ptrdiff_t srcstride,      \
-                                        int height, int mx, int my,             \
-                                        int16_t *mcbuffer)                      \
-{                                                                               \
-    FUNC(put_hevc_epel_v)(dst, dststride, src, srcstride,                       \
-                          W, height, mx, my, mcbuffer);                         \
-}                                                                               \
-static void FUNC(put_hevc_epel_hv_ ## W)(int16_t *dst, ptrdiff_t dststride,     \
-                                         uint8_t *src, ptrdiff_t srcstride,     \
-                                         int height, int mx, int my,            \
-                                         int16_t *mcbuffer)                     \
-{                                                                               \
-    FUNC(put_hevc_epel_hv)(dst, dststride, src, srcstride,                      \
-                           W, height, mx, my, mcbuffer);                        \
+static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                      int height, intptr_t mx, intptr_t my, int width)
+{ /* Stride-1 EPEL_FILTER (mx) of _src, rounded, shifted and clipped into _dst; my is not used. */
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel); /* stride in units of pixel */
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel); /* stride in units of pixel */
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1]; /* 4 coefficients read by EPEL_FILTER */
+    int shift = 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1); /* rounding term: half of 1 << shift */
+#else
+    int offset = 0;
+#endif
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
+        src += srcstride;
+        dst += dststride;
+    }
 }
 
-EPEL(32)
-EPEL(24)
-EPEL(16)
-EPEL(12)
-EPEL(8)
-EPEL(6)
-EPEL(4)
-EPEL(2)
-
-static av_always_inline void
-FUNC(put_unweighted_pred)(uint8_t *_dst, ptrdiff_t _dststride,
-                          int16_t *src, ptrdiff_t srcstride,
-                          int width, int height)
+static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                     int16_t *src2,
+                                     int height, intptr_t mx, intptr_t my, int width) /* stride-1 EPEL_FILTER (mx) of _src summed with src2, clipped into _dst; my is not used */
 {
     int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel); /* stride in units of pixel */
     pixel *dst          = (pixel *)_dst;
     ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1]; /* 4 coefficients read by EPEL_FILTER */
+    int shift = 14 + 1 - BIT_DEPTH; /* one bit more than the uni variant: two values are summed */
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1); /* rounding term: half of 1 << shift */
+#else
+    int offset = 0;
+#endif
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
+        }
+        dst  += dststride;
+        src  += srcstride;
+        src2 += MAX_PB_SIZE; /* src2 rows are MAX_PB_SIZE elements apart */
+    }
+}
 
+static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                      int height, intptr_t mx, intptr_t my, int width)
+{ /* EPEL_FILTER (my) across rows of _src, rounded, shifted and clipped into _dst; mx is not used. */
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel); /* stride in units of pixel */
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel); /* stride in units of pixel */
+    const int8_t *filter = ff_hevc_epel_filters[my - 1]; /* 4 coefficients read by EPEL_FILTER */
     int shift = 14 - BIT_DEPTH;
 #if BIT_DEPTH < 14
     int offset = 1 << (shift - 1);
 #else
     int offset = 0;
 #endif
-    srcstride /= sizeof(*src);
+
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
-            dst[x] = av_clip_pixel((src[x] + offset) >> shift);
-        dst += dststride;
+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
         src += srcstride;
+        dst += dststride;
     }
 }
 
-static av_always_inline void
-FUNC(put_unweighted_pred_avg)(uint8_t *_dst, ptrdiff_t _dststride,
-                              int16_t *src1, int16_t *src2,
-                              ptrdiff_t srcstride,
-                              int width, int height)
+static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                     int16_t *src2,
+                                     int height, intptr_t mx, intptr_t my, int width)
 {
     int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[my - 1];
     pixel *dst          = (pixel *)_dst;
     ptrdiff_t dststride = _dststride / sizeof(pixel);
-
     int shift = 14 + 1 - BIT_DEPTH;
 #if BIT_DEPTH < 14
     int offset = 1 << (shift - 1);
@@ -1179,119 +1229,275 @@ FUNC(put_unweighted_pred_avg)(uint8_t *_dst, ptrdiff_t _dststride,
     int offset = 0;
 #endif
 
-    srcstride /= sizeof(*src1);
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
-            dst[x] = av_clip_pixel((src1[x] + src2[x] + offset) >> shift);
+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
         dst  += dststride;
-        src1 += srcstride;
-        src2 += srcstride;
+        src  += srcstride;
+        src2 += MAX_PB_SIZE;
+    }
+}
+
+static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                       int height, intptr_t mx, intptr_t my, int width)
+{ /* Two passes: stride-1 EPEL_FILTER (mx) of _src into tmp, then EPEL_FILTER (my) across tmp rows, clipped into _dst. */
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel); /* stride in units of pixel */
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel); /* stride in units of pixel */
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1]; /* first-pass coefficients */
+    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; /* first-pass rows, MAX_PB_SIZE apart */
+    int16_t *tmp = tmp_array;
+    int shift = 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1); /* rounding term: half of 1 << shift */
+#else
+    int offset = 0;
+#endif
+
+    src -= EPEL_EXTRA_BEFORE * srcstride; /* back up EPEL_EXTRA_BEFORE rows */
+
+    for (y = 0; y < height + EPEL_EXTRA; y++) { /* first pass covers height + EPEL_EXTRA rows */
+        for (x = 0; x < width; x++)
+            tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; /* skip the EPEL_EXTRA_BEFORE leading rows */
+    filter = ff_hevc_epel_filters[my - 1]; /* second-pass coefficients */
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
+        tmp += MAX_PB_SIZE;
+        dst += dststride;
     }
 }
 
-static av_always_inline void
-FUNC(weighted_pred)(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
-                    uint8_t *_dst, ptrdiff_t _dststride,
-                    int16_t *src, ptrdiff_t srcstride,
-                    int width, int height)
+static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                      int16_t *src2,
+                                      int height, intptr_t mx, intptr_t my, int width) /* two-pass EPEL filter of _src, summed with src2 and clipped into _dst */
 {
-    int shift, log2Wd, wx, ox, x, y, offset;
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel); /* stride in units of pixel */
     pixel *dst          = (pixel *)_dst;
     ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1]; /* first-pass coefficients */
+    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; /* first-pass rows, MAX_PB_SIZE apart */
+    int16_t *tmp = tmp_array;
+    int shift = 14 + 1 - BIT_DEPTH; /* one bit more than the uni variant: two values are summed */
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1); /* rounding term: half of 1 << shift */
+#else
+    int offset = 0;
+#endif
+
+    src -= EPEL_EXTRA_BEFORE * srcstride; /* back up EPEL_EXTRA_BEFORE rows */
 
-    shift  = 14 - BIT_DEPTH;
-    log2Wd = denom + shift;
-    offset = 1 << (log2Wd - 1);
-    wx     = wlxFlag;
-    ox     = olxFlag * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height + EPEL_EXTRA; y++) { /* first pass covers height + EPEL_EXTRA rows */
+        for (x = 0; x < width; x++)
+            tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; /* skip the EPEL_EXTRA_BEFORE leading rows */
+    filter = ff_hevc_epel_filters[my - 1]; /* second-pass coefficients */
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
+        tmp  += MAX_PB_SIZE;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE; /* src2 rows are MAX_PB_SIZE elements apart */
+    }
+}
+
+static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                        int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
+{ /* Stride-1 EPEL_FILTER (mx) of _src, multiplied by wx, rounded and shifted, plus ox, clipped into _dst; my is not used. */
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel); /* stride in units of pixel */
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel); /* stride in units of pixel */
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1]; /* 4 coefficients read by EPEL_FILTER */
+    int shift = denom + 14 - BIT_DEPTH; /* denom adds to the base shift */
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1); /* rounding term: half of 1 << shift */
+#else
+    int offset = 0;
+#endif
 
-    srcstride /= sizeof(*src);
+    ox     = ox * (1 << (BIT_DEPTH - 8)); /* scale ox by 2^(BIT_DEPTH - 8) */
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++) {
-            if (log2Wd >= 1) {
-                dst[x] = av_clip_pixel(((src[x] * wx + offset) >> log2Wd) + ox);
-            } else {
-                dst[x] = av_clip_pixel(src[x] * wx + ox);
-            }
+            dst[x] = av_clip_pixel((((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
         }
         dst += dststride;
         src += srcstride;
     }
 }
 
-static av_always_inline void
-FUNC(weighted_pred_avg)(uint8_t denom,
-                        int16_t wl0Flag, int16_t wl1Flag,
-                        int16_t ol0Flag, int16_t ol1Flag,
-                        uint8_t *_dst, ptrdiff_t _dststride,
-                        int16_t *src1, int16_t *src2,
-                        ptrdiff_t srcstride,
-                        int width, int height)
+static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                       int16_t *src2,
+                                       int height, int denom, int wx0, int wx1,
+                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width) /* weighted sum of src2 (wx0) and stride-1 EPEL_FILTER (mx) of _src (wx1), clipped into _dst; my is not used */
 {
-    int shift, log2Wd, w0, w1, o0, o1, x, y;
-    pixel *dst = (pixel *)_dst;
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel); /* stride in units of pixel */
+    pixel *dst          = (pixel *)_dst;
     ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1]; /* 4 coefficients read by EPEL_FILTER */
+    int shift = 14 + 1 - BIT_DEPTH;
+    int log2Wd = denom + shift - 1;
+
+    ox0     = ox0 * (1 << (BIT_DEPTH - 8)); /* scale offsets by 2^(BIT_DEPTH - 8) */
+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) /* multiply, not <<: ox0 + ox1 + 1 may be negative and shifting it left is undefined */
+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
+                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
+        src  += srcstride;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE; /* src2 rows are MAX_PB_SIZE elements apart */
+    }
+}
 
-    shift  = 14 - BIT_DEPTH;
-    log2Wd = denom + shift;
-    w0     = wl0Flag;
-    w1     = wl1Flag;
-    o0     = ol0Flag * (1 << (BIT_DEPTH - 8));
-    o1     = ol1Flag * (1 << (BIT_DEPTH - 8));
+static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                        int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
+{ /* EPEL_FILTER (my) across rows of _src, multiplied by wx, rounded and shifted, plus ox, clipped into _dst; mx is not used. */
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel); /* stride in units of pixel */
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel); /* stride in units of pixel */
+    const int8_t *filter = ff_hevc_epel_filters[my - 1]; /* 4 coefficients read by EPEL_FILTER */
+    int shift = denom + 14 - BIT_DEPTH; /* denom adds to the base shift */
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1); /* rounding term: half of 1 << shift */
+#else
+    int offset = 0;
+#endif
 
-    srcstride /= sizeof(*src1);
+    ox     = ox * (1 << (BIT_DEPTH - 8)); /* scale ox by 2^(BIT_DEPTH - 8) */
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
+        }
+        dst += dststride;
+        src += srcstride;
+    }
+}
+
+static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                       int16_t *src2,
+                                       int height, int denom, int wx0, int wx1,
+                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{ /* Weighted sum of src2 (weight wx0) and the EPEL_FILTER (my) across rows of _src (weight wx1), clipped into _dst; mx is not used. */
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel); /* stride in units of pixel */
+    const int8_t *filter = ff_hevc_epel_filters[my - 1]; /* 4 coefficients read by EPEL_FILTER */
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel); /* stride in units of pixel */
+    int shift = 14 + 1 - BIT_DEPTH;
+    int log2Wd = denom + shift - 1;
+
+    ox0     = ox0 * (1 << (BIT_DEPTH - 8)); /* scale offsets by 2^(BIT_DEPTH - 8) */
+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
-            dst[x] = av_clip_pixel((src1[x] * w0 + src2[x] * w1 +
-                                    ((o0 + o1 + 1) << log2Wd)) >> (log2Wd + 1));
+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
+                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1)); /* multiply, not <<: the sum may be negative */
+        src  += srcstride;
         dst  += dststride;
-        src1 += srcstride;
-        src2 += srcstride;
+        src2 += MAX_PB_SIZE; /* src2 rows are MAX_PB_SIZE elements apart */
     }
 }
 
-#define PUT_PRED(w)                                                                            \
-static void FUNC(put_unweighted_pred_ ## w)(uint8_t *dst, ptrdiff_t dststride,                 \
-                                            int16_t *src, ptrdiff_t srcstride,                 \
-                                            int height)                                        \
-{                                                                                              \
-    FUNC(put_unweighted_pred)(dst, dststride, src, srcstride, w, height);                      \
-}                                                                                              \
-static void FUNC(put_unweighted_pred_avg_ ## w)(uint8_t *dst, ptrdiff_t dststride,             \
-                                                int16_t *src1, int16_t *src2,                  \
-                                                ptrdiff_t srcstride, int height)               \
-{                                                                                              \
-    FUNC(put_unweighted_pred_avg)(dst, dststride, src1, src2, srcstride, w, height);           \
-}                                                                                              \
-static void FUNC(put_weighted_pred_ ## w)(uint8_t denom, int16_t weight, int16_t offset,       \
-                                          uint8_t *dst, ptrdiff_t dststride,                   \
-                                          int16_t *src, ptrdiff_t srcstride, int height)       \
-{                                                                                              \
-    FUNC(weighted_pred)(denom, weight, offset,                                                 \
-                        dst, dststride, src, srcstride, w, height);                            \
-}                                                                                              \
-static void FUNC(put_weighted_pred_avg_ ## w)(uint8_t denom, int16_t weight0, int16_t weight1, \
-                                              int16_t offset0, int16_t offset1,                \
-                                              uint8_t *dst, ptrdiff_t dststride,               \
-                                              int16_t *src1, int16_t *src2,                    \
-                                              ptrdiff_t srcstride, int height)                 \
-{                                                                                              \
-    FUNC(weighted_pred_avg)(denom, weight0, weight1, offset0, offset1,                         \
-                            dst, dststride, src1, src2, srcstride, w, height);                 \
+static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                         int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
+    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
+    int16_t *tmp = tmp_array;
+    int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    src -= EPEL_EXTRA_BEFORE * srcstride;
+
+    for (y = 0; y < height + EPEL_EXTRA; y++) {
+        for (x = 0; x < width; x++)
+            tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    filter = ff_hevc_epel_filters[my - 1];
+
+    ox     = ox * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
+        tmp += MAX_PB_SIZE;
+        dst += dststride;
+    }
 }
 
-PUT_PRED(64)
-PUT_PRED(48)
-PUT_PRED(32)
-PUT_PRED(24)
-PUT_PRED(16)
-PUT_PRED(12)
-PUT_PRED(8)
-PUT_PRED(6)
-PUT_PRED(4)
-PUT_PRED(2)
-
-// line zero
+static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                        int16_t *src2,
+                                        int height, int denom, int wx0, int wx1,
+                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
+    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
+    int16_t *tmp = tmp_array;
+    int shift = 14 + 1 - BIT_DEPTH;
+    int log2Wd = denom + shift - 1;
+
+    src -= EPEL_EXTRA_BEFORE * srcstride;
+
+    for (y = 0; y < height + EPEL_EXTRA; y++) {
+        for (x = 0; x < width; x++)
+            tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    filter = ff_hevc_epel_filters[my - 1];
+
+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
+                                    ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
+        tmp  += MAX_PB_SIZE;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
+    }
+}// line zero
 #define P3 pix[-4 * xstride]
 #define P2 pix[-3 * xstride]
 #define P1 pix[-2 * xstride]
@@ -1442,21 +1648,21 @@ static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride,
 }
 
 static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
-                                            int *tc, uint8_t *no_p,
+                                            int32_t *tc, uint8_t *no_p,
                                             uint8_t *no_q)
 {
     FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q);
 }
 
 static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
-                                            int *tc, uint8_t *no_p,
+                                            int32_t *tc, uint8_t *no_p,
                                             uint8_t *no_q)
 {
     FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q);
 }
 
 static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
-                                          int beta, int *tc, uint8_t *no_p,
+                                          int beta, int32_t *tc, uint8_t *no_p,
                                           uint8_t *no_q)
 {
     FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel),
@@ -1464,7 +1670,7 @@ static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
 }
 
 static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
-                                          int beta, int *tc, uint8_t *no_p,
+                                          int beta, int32_t *tc, uint8_t *no_p,
                                           uint8_t *no_q)
 {
     FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride,
diff --git a/libavcodec/hevcpred.c b/libavcodec/hevcpred.c
index 1ba2487..02c1766 100644
--- a/libavcodec/hevcpred.c
+++ b/libavcodec/hevcpred.c
@@ -1,27 +1,29 @@
 /*
- * HEVC video decoder
+ * HEVC video Decoder
  *
  * Copyright (C) 2012 - 2013 Guillaume Martres
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "hevc.h"
 
+#include "hevcpred.h"
+
 #define BIT_DEPTH 8
 #include "hevcpred_template.c"
 #undef BIT_DEPTH
@@ -34,6 +36,10 @@
 #include "hevcpred_template.c"
 #undef BIT_DEPTH
 
+#define BIT_DEPTH 12
+#include "hevcpred_template.c"
+#undef BIT_DEPTH
+
 void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth)
 {
 #undef FUNC
@@ -61,8 +67,14 @@ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth)
     case 10:
         HEVC_PRED(10);
         break;
+    case 12:
+        HEVC_PRED(12);
+        break;
     default:
         HEVC_PRED(8);
         break;
     }
+
+    if (ARCH_MIPS)
+        ff_hevc_pred_init_mips(hpc, bit_depth);
 }
diff --git a/libavcodec/hevcpred.h b/libavcodec/hevcpred.h
new file mode 100644
index 0000000..eb17663
--- /dev/null
+++ b/libavcodec/hevcpred.h
@@ -0,0 +1,46 @@
+/*
+ * HEVC video Decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_HEVCPRED_H
+#define AVCODEC_HEVCPRED_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+struct HEVCContext;
+
+typedef struct HEVCPredContext {
+    void (*intra_pred[4])(struct HEVCContext *s, int x0, int y0, int c_idx);
+
+    void (*pred_planar[4])(uint8_t *src, const uint8_t *top,
+                           const uint8_t *left, ptrdiff_t stride);
+    void (*pred_dc)(uint8_t *src, const uint8_t *top, const uint8_t *left,
+                    ptrdiff_t stride, int log2_size, int c_idx);
+    void (*pred_angular[4])(uint8_t *src, const uint8_t *top,
+                            const uint8_t *left, ptrdiff_t stride,
+                            int c_idx, int mode);
+} HEVCPredContext;
+
+void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth);
+void ff_hevc_pred_init_mips(HEVCPredContext *hpc, int bit_depth);
+
+#endif /* AVCODEC_HEVCPRED_H */
diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
index 039882b..6ae87cc 100644
--- a/libavcodec/hevcpred_template.c
+++ b/libavcodec/hevcpred_template.c
@@ -3,28 +3,27 @@
  *
  * Copyright (C) 2012 - 2013 Guillaume Martres
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/pixdesc.h"
 
-#include "hevc.h"
-
 #include "bit_depth_template.c"
+#include "hevcpred.h"
 
 #define POS(x, y) src[(x) + stride * (y)]
 
@@ -38,10 +37,9 @@ static av_always_inline void FUNC(intra_pred)(HEVCContext *s, int x0, int y0,
 #define MVF_PU(x, y) \
     MVF(PU(x0 + ((x) << hshift)), PU(y0 + ((y) << vshift)))
 #define IS_INTRA(x, y) \
-    MVF_PU(x, y).is_intra
+    (MVF_PU(x, y).pred_flag == PF_INTRA)
 #define MIN_TB_ADDR_ZS(x, y) \
-    s->ps.pps->min_tb_addr_zs[(y) * s->ps.sps->min_tb_width + (x)]
-
+    s->ps.pps->min_tb_addr_zs[(y) * (s->ps.sps->tb_mask+2) + (x)]
 #define EXTEND(ptr, val, len)         \
 do {                                  \
     pixel4 pix = PIXEL_SPLAT_X4(val); \
@@ -49,36 +47,43 @@ do {                                  \
         AV_WN4P(ptr + i, pix);        \
 } while (0)
 
+#define EXTEND_RIGHT_CIP(ptr, start, length)                                   \
+        for (i = start; i < (start) + (length); i += 4)                        \
+            if (!IS_INTRA(i, -1))                                              \
+                AV_WN4P(&ptr[i], a);                                           \
+            else                                                               \
+                a = PIXEL_SPLAT_X4(ptr[i+3])
 #define EXTEND_LEFT_CIP(ptr, start, length) \
-        for (i = (start); i > (start) - (length); i--) \
+        for (i = start; i > (start) - (length); i--) \
             if (!IS_INTRA(i - 1, -1)) \
                 ptr[i - 1] = ptr[i]
-#define EXTEND_RIGHT_CIP(ptr, start, length) \
-        for (i = (start); i < (start) + (length); i++) \
-            if (!IS_INTRA(i, -1)) \
-                ptr[i] = ptr[i - 1]
-#define EXTEND_UP_CIP(ptr, start, length) \
-        for (i = (start); i > (start) - (length); i--) \
-            if (!IS_INTRA(-1, i - 1)) \
-                ptr[i - 1] = ptr[i]
-#define EXTEND_UP_CIP_0(ptr, start, length) \
-        for (i = (start); i > (start) - (length); i--) \
-            ptr[i - 1] = ptr[i]
-#define EXTEND_DOWN_CIP(ptr, start, length) \
-        for (i = (start); i < (start) + (length); i++) \
-            if (!IS_INTRA(-1, i)) \
-                ptr[i] = ptr[i - 1]
-    HEVCLocalContext *lc = &s->HEVClc;
+#define EXTEND_UP_CIP(ptr, start, length)                                      \
+        for (i = (start); i > (start) - (length); i -= 4)                      \
+            if (!IS_INTRA(-1, i - 3))                                          \
+                AV_WN4P(&ptr[i - 3], a);                                       \
+            else                                                               \
+                a = PIXEL_SPLAT_X4(ptr[i - 3])
+#define EXTEND_DOWN_CIP(ptr, start, length)                                    \
+        for (i = start; i < (start) + (length); i += 4)                        \
+            if (!IS_INTRA(-1, i))                                              \
+                AV_WN4P(&ptr[i], a);                                           \
+            else                                                               \
+                a = PIXEL_SPLAT_X4(ptr[i + 3])
+
+    HEVCLocalContext *lc = s->HEVClc;
     int i;
     int hshift = s->ps.sps->hshift[c_idx];
     int vshift = s->ps.sps->vshift[c_idx];
     int size = (1 << log2_size);
-    int size_in_luma = size << hshift;
-    int size_in_tbs = size_in_luma >> s->ps.sps->log2_min_tb_size;
+    int size_in_luma_h = size << hshift;
+    int size_in_tbs_h  = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
+    int size_in_luma_v = size << vshift;
+    int size_in_tbs_v  = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
     int x = x0 >> hshift;
     int y = y0 >> vshift;
-    int x_tb = x0 >> s->ps.sps->log2_min_tb_size;
-    int y_tb = y0 >> s->ps.sps->log2_min_tb_size;
+    int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
+    int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
+
     int cur_tb_addr = MIN_TB_ADDR_ZS(x_tb, y_tb);
 
     ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel);
@@ -86,87 +91,77 @@ do {                                  \
 
     int min_pu_width = s->ps.sps->min_pu_width;
 
-    enum IntraPredMode mode = c_idx ? lc->pu.intra_pred_mode_c :
-                              lc->tu.cur_intra_pred_mode;
-
-    pixel left_array[2 * MAX_TB_SIZE + 1];
-    pixel filtered_left_array[2 * MAX_TB_SIZE + 1];
-    pixel top_array[2 * MAX_TB_SIZE + 1];
-    pixel filtered_top_array[2 * MAX_TB_SIZE + 1];
-
-    pixel *left          = left_array + 1;
-    pixel *top           = top_array  + 1;
-    pixel *filtered_left = filtered_left_array + 1;
-    pixel *filtered_top  = filtered_top_array  + 1;
-
-    int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS(x_tb - 1, y_tb + size_in_tbs);
+    enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
+                              lc->tu.intra_pred_mode;
+    pixel4 a;
+    pixel  left_array[2 * MAX_TB_SIZE + 1];
+    pixel  filtered_left_array[2 * MAX_TB_SIZE + 1];
+    pixel  top_array[2 * MAX_TB_SIZE + 1];
+    pixel  filtered_top_array[2 * MAX_TB_SIZE + 1];
+
+    pixel  *left          = left_array + 1;
+    pixel  *top           = top_array  + 1;
+    pixel  *filtered_left = filtered_left_array + 1;
+    pixel  *filtered_top  = filtered_top_array  + 1;
+    int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask);
     int cand_left        = lc->na.cand_left;
     int cand_up_left     = lc->na.cand_up_left;
     int cand_up          = lc->na.cand_up;
-    int cand_up_right    = lc->na.cand_up_right && cur_tb_addr > MIN_TB_ADDR_ZS(x_tb + size_in_tbs, y_tb - 1);
+    int cand_up_right    = lc->na.cand_up_right    && cur_tb_addr > MIN_TB_ADDR_ZS((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask, y_tb - 1);
 
-    int bottom_left_size = (FFMIN(y0 + 2 * size_in_luma, s->ps.sps->height) -
-                            (y0 + size_in_luma)) >> vshift;
-    int top_right_size   = (FFMIN(x0 + 2 * size_in_luma, s->ps.sps->width) -
-                            (x0 + size_in_luma)) >> hshift;
+    int bottom_left_size = (FFMIN(y0 + 2 * size_in_luma_v, s->ps.sps->height) -
+                           (y0 + size_in_luma_v)) >> vshift;
+    int top_right_size   = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) -
+                           (x0 + size_in_luma_h)) >> hshift;
 
     if (s->ps.pps->constrained_intra_pred_flag == 1) {
-        int size_in_luma_pu = PU(size_in_luma);
-        int on_pu_edge_x    = !(x0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
-        int on_pu_edge_y    = !(y0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
-        if (!size_in_luma_pu)
-            size_in_luma_pu++;
+        int size_in_luma_pu_v = PU(size_in_luma_v);
+        int size_in_luma_pu_h = PU(size_in_luma_h);
+        int on_pu_edge_x    = !av_mod_uintp2(x0, s->ps.sps->log2_min_pu_size);
+        int on_pu_edge_y    = !av_mod_uintp2(y0, s->ps.sps->log2_min_pu_size);
+        if (!size_in_luma_pu_h)
+            size_in_luma_pu_h++;
         if (cand_bottom_left == 1 && on_pu_edge_x) {
             int x_left_pu   = PU(x0 - 1);
-            int y_bottom_pu = PU(y0 + size_in_luma);
-            int max = FFMIN(size_in_luma_pu, s->ps.sps->min_pu_height - y_bottom_pu);
+            int y_bottom_pu = PU(y0 + size_in_luma_v);
+            int max = FFMIN(size_in_luma_pu_v, s->ps.sps->min_pu_height - y_bottom_pu);
             cand_bottom_left = 0;
-            for (i = 0; i < max; i++)
-                cand_bottom_left |= MVF(x_left_pu, y_bottom_pu + i).is_intra;
+            for (i = 0; i < max; i += 2)
+                cand_bottom_left |= (MVF(x_left_pu, y_bottom_pu + i).pred_flag == PF_INTRA);
         }
         if (cand_left == 1 && on_pu_edge_x) {
             int x_left_pu   = PU(x0 - 1);
             int y_left_pu   = PU(y0);
-            int max = FFMIN(size_in_luma_pu, s->ps.sps->min_pu_height - y_left_pu);
+            int max = FFMIN(size_in_luma_pu_v, s->ps.sps->min_pu_height - y_left_pu);
             cand_left = 0;
-            for (i = 0; i < max; i++)
-                cand_left |= MVF(x_left_pu, y_left_pu + i).is_intra;
+            for (i = 0; i < max; i += 2)
+                cand_left |= (MVF(x_left_pu, y_left_pu + i).pred_flag == PF_INTRA);
         }
         if (cand_up_left == 1) {
             int x_left_pu   = PU(x0 - 1);
             int y_top_pu    = PU(y0 - 1);
-            cand_up_left = MVF(x_left_pu, y_top_pu).is_intra;
+            cand_up_left = MVF(x_left_pu, y_top_pu).pred_flag == PF_INTRA;
         }
         if (cand_up == 1 && on_pu_edge_y) {
             int x_top_pu    = PU(x0);
             int y_top_pu    = PU(y0 - 1);
-            int max = FFMIN(size_in_luma_pu, s->ps.sps->min_pu_width - x_top_pu);
+            int max = FFMIN(size_in_luma_pu_h, s->ps.sps->min_pu_width - x_top_pu);
             cand_up = 0;
-            for (i = 0; i < max; i++)
-                cand_up |= MVF(x_top_pu + i, y_top_pu).is_intra;
+            for (i = 0; i < max; i += 2)
+                cand_up |= (MVF(x_top_pu + i, y_top_pu).pred_flag == PF_INTRA);
         }
         if (cand_up_right == 1 && on_pu_edge_y) {
             int y_top_pu    = PU(y0 - 1);
-            int x_right_pu  = PU(x0 + size_in_luma);
-            int max = FFMIN(size_in_luma_pu, s->ps.sps->min_pu_width - x_right_pu);
+            int x_right_pu  = PU(x0 + size_in_luma_h);
+            int max = FFMIN(size_in_luma_pu_h, s->ps.sps->min_pu_width - x_right_pu);
             cand_up_right = 0;
-            for (i = 0; i < max; i++)
-                cand_up_right |= MVF(x_right_pu + i, y_top_pu).is_intra;
+            for (i = 0; i < max; i += 2)
+                cand_up_right |= (MVF(x_right_pu + i, y_top_pu).pred_flag == PF_INTRA);
         }
-        for (i = 0; i < 2 * MAX_TB_SIZE; i++) {
-            left[i] = 128;
-            top[i]  = 128;
-        }
-    }
-    if (cand_bottom_left) {
-        for (i = size; i < size + bottom_left_size; i++)
-            left[i] = POS(-1, i);
-        EXTEND(left + size + bottom_left_size, POS(-1, size + bottom_left_size - 1),
-               size - bottom_left_size);
+        memset(left, 128, 2 * MAX_TB_SIZE*sizeof(pixel));
+        memset(top , 128, 2 * MAX_TB_SIZE*sizeof(pixel));
+        top[-1] = 128;
     }
-    if (cand_left)
-        for (i = size - 1; i >= 0; i--)
-            left[i] = POS(-1, i);
     if (cand_up_left) {
         left[-1] = POS(-1, -1);
         top[-1]  = left[-1];
@@ -178,6 +173,15 @@ do {                                  \
         EXTEND(top + size + top_right_size, POS(size + top_right_size - 1, -1),
                size - top_right_size);
     }
+    if (cand_left)
+        for (i = 0; i < size; i++)
+            left[i] = POS(-1, i);
+    if (cand_bottom_left) {
+        for (i = size; i < size + bottom_left_size; i++)
+            left[i] = POS(-1, i);
+        EXTEND(left + size + bottom_left_size, POS(-1, size + bottom_left_size - 1),
+               size - bottom_left_size);
+    }
 
     if (s->ps.pps->constrained_intra_pred_flag == 1) {
         if (cand_bottom_left || cand_left || cand_up_left || cand_up || cand_up_right) {
@@ -203,7 +207,6 @@ do {                                  \
                         j++;
                     EXTEND_LEFT_CIP(top, j, j + 1);
                     left[-1] = top[-1];
-                    j        = 0;
                 }
             } else {
                 j = 0;
@@ -217,24 +220,30 @@ do {                                  \
                         top[-1] = top[0];
                     }
                 left[-1] = top[-1];
-                j        = 0;
             }
+            left[-1] = top[-1];
             if (cand_bottom_left || cand_left) {
-                EXTEND_DOWN_CIP(left, j, size_max_y - j);
+                a = PIXEL_SPLAT_X4(left[-1]);
+                EXTEND_DOWN_CIP(left, 0, size_max_y);
             }
             if (!cand_left)
                 EXTEND(left, left[-1], size);
             if (!cand_bottom_left)
                 EXTEND(left + size, left[size - 1], size);
             if (x0 != 0 && y0 != 0) {
+                a = PIXEL_SPLAT_X4(left[size_max_y - 1]);
                 EXTEND_UP_CIP(left, size_max_y - 1, size_max_y);
+                if (!IS_INTRA(-1, - 1))
+                    left[-1] = left[0];
             } else if (x0 == 0) {
-                EXTEND_UP_CIP_0(left, size_max_y - 1, size_max_y);
+                EXTEND(left, 0, size_max_y);
             } else {
-                EXTEND_UP_CIP(left, size_max_y - 1, size_max_y - 1);
+                a = PIXEL_SPLAT_X4(left[size_max_y - 1]);
+                EXTEND_UP_CIP(left, size_max_y - 1, size_max_y);
             }
             top[-1] = left[-1];
             if (y0 != 0) {
+                a = PIXEL_SPLAT_X4(left[-1]);
                 EXTEND_RIGHT_CIP(top, 0, size_max_x);
             }
         }
@@ -278,40 +287,42 @@ do {                                  \
     top[-1] = left[-1];
 
     // Filtering process
-    if (c_idx == 0 && mode != INTRA_DC && size != 4) {
-        int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
-        int min_dist_vert_hor = FFMIN(FFABS((int)mode - 26),
-                                      FFABS((int)mode - 10));
-        if (min_dist_vert_hor > intra_hor_ver_dist_thresh[log2_size - 3]) {
-            int threshold = 1 << (BIT_DEPTH - 5);
-            if (s->ps.sps->sps_strong_intra_smoothing_enable_flag &&
-                log2_size == 5 &&
-                FFABS(top[-1]  + top[63]  - 2 * top[31])  < threshold &&
-                FFABS(left[-1] + left[63] - 2 * left[31]) < threshold) {
-                // We can't just overwrite values in top because it could be
-                // a pointer into src
-                filtered_top[-1] = top[-1];
-                filtered_top[63] = top[63];
-                for (i = 0; i < 63; i++)
-                    filtered_top[i] = ((64 - (i + 1)) * top[-1] +
-                                             (i + 1)  * top[63] + 32) >> 6;
-                for (i = 0; i < 63; i++)
-                    left[i] = ((64 - (i + 1)) * left[-1] +
-                                     (i + 1)  * left[63] + 32) >> 6;
-                top = filtered_top;
-            } else {
-                filtered_left[2 * size - 1] = left[2 * size - 1];
-                filtered_top[2 * size - 1]  = top[2 * size - 1];
-                for (i = 2 * size - 2; i >= 0; i--)
-                    filtered_left[i] = (left[i + 1] + 2 * left[i] +
-                                        left[i - 1] + 2) >> 2;
-                filtered_top[-1]  =
-                filtered_left[-1] = (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
-                for (i = 2 * size - 2; i >= 0; i--)
-                    filtered_top[i] = (top[i + 1] + 2 * top[i] +
-                                       top[i - 1] + 2) >> 2;
-                left = filtered_left;
-                top  = filtered_top;
+    if (!s->ps.sps->intra_smoothing_disabled_flag && (c_idx == 0  || s->ps.sps->chroma_format_idc == 3)) {
+        if (mode != INTRA_DC && size != 4){
+            int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
+            int min_dist_vert_hor = FFMIN(FFABS((int)(mode - 26U)),
+                                          FFABS((int)(mode - 10U)));
+            if (min_dist_vert_hor > intra_hor_ver_dist_thresh[log2_size - 3]) {
+                int threshold = 1 << (BIT_DEPTH - 5);
+                if (s->ps.sps->sps_strong_intra_smoothing_enable_flag && c_idx == 0 &&
+                    log2_size == 5 &&
+                    FFABS(top[-1]  + top[63]  - 2 * top[31])  < threshold &&
+                    FFABS(left[-1] + left[63] - 2 * left[31]) < threshold) {
+                    // We can't just overwrite values in top because it could be
+                    // a pointer into src
+                    filtered_top[-1] = top[-1];
+                    filtered_top[63] = top[63];
+                    for (i = 0; i < 63; i++)
+                        filtered_top[i] = ((64 - (i + 1)) * top[-1] +
+                                           (i + 1)  * top[63] + 32) >> 6;
+                    for (i = 0; i < 63; i++)
+                        left[i] = ((64 - (i + 1)) * left[-1] +
+                                   (i + 1)  * left[63] + 32) >> 6;
+                    top = filtered_top;
+                } else {
+                    filtered_left[2 * size - 1] = left[2 * size - 1];
+                    filtered_top[2 * size - 1]  = top[2 * size - 1];
+                    for (i = 2 * size - 2; i >= 0; i--)
+                        filtered_left[i] = (left[i + 1] + 2 * left[i] +
+                                            left[i - 1] + 2) >> 2;
+                    filtered_top[-1]  =
+                    filtered_left[-1] = (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
+                    for (i = 2 * size - 2; i >= 0; i--)
+                        filtered_top[i] = (top[i + 1] + 2 * top[i] +
+                                           top[i - 1] + 2) >> 2;
+                    left = filtered_left;
+                    top  = filtered_top;
+                }
             }
         }
     }
@@ -394,8 +405,8 @@ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
     a = PIXEL_SPLAT_X4(dc);
 
     for (i = 0; i < size; i++)
-        for (j = 0; j < size / 4; j++)
-            AV_WN4PA(&POS(j * 4, i), a);
+        for (j = 0; j < size; j+=4)
+            AV_WN4P(&POS(j, i), a);
 
     if (c_idx == 0 && size < 32) {
         POS(0, 0) = (left[0] + 2 * dc + top[0] + 2) >> 2;
@@ -427,7 +438,7 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
     };
 
     int angle = intra_pred_angle[mode - 2];
-    pixel ref_array[3 * MAX_TB_SIZE + 1];
+    pixel ref_array[3 * MAX_TB_SIZE + 4];
     pixel *ref_tmp = ref_array + size;
     const pixel *ref;
     int last = (size * angle) >> 5;
@@ -435,8 +446,8 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
     if (mode >= 18) {
         ref = top - 1;
         if (angle < 0 && last < -1) {
-            for (x = 0; x <= size; x++)
-                ref_tmp[x] = top[x - 1];
+            for (x = 0; x <= size; x += 4)
+                AV_WN4P(&ref_tmp[x], AV_RN4P(&top[x - 1]));
             for (x = last; x <= -1; x++)
                 ref_tmp[x] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)];
             ref = ref_tmp;
@@ -446,13 +457,19 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
             int idx  = ((y + 1) * angle) >> 5;
             int fact = ((y + 1) * angle) & 31;
             if (fact) {
-                for (x = 0; x < size; x++) {
-                    POS(x, y) = ((32 - fact) * ref[x + idx + 1] +
-                                       fact  * ref[x + idx + 2] + 16) >> 5;
+                for (x = 0; x < size; x += 4) {
+                    POS(x    , y) = ((32 - fact) * ref[x + idx + 1] +
+                                           fact  * ref[x + idx + 2] + 16) >> 5;
+                    POS(x + 1, y) = ((32 - fact) * ref[x + 1 + idx + 1] +
+                                           fact  * ref[x + 1 + idx + 2] + 16) >> 5;
+                    POS(x + 2, y) = ((32 - fact) * ref[x + 2 + idx + 1] +
+                                           fact  * ref[x + 2 + idx + 2] + 16) >> 5;
+                    POS(x + 3, y) = ((32 - fact) * ref[x + 3 + idx + 1] +
+                                           fact  * ref[x + 3 + idx + 2] + 16) >> 5;
                 }
             } else {
-                for (x = 0; x < size; x++)
-                    POS(x, y) = ref[x + idx + 1];
+                for (x = 0; x < size; x += 4)
+                    AV_WN4P(&POS(x, y), AV_RN4P(&ref[x + idx + 1]));
             }
         }
         if (mode == 26 && c_idx == 0 && size < 32) {
@@ -462,8 +479,8 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
     } else {
         ref = left - 1;
         if (angle < 0 && last < -1) {
-            for (x = 0; x <= size; x++)
-                ref_tmp[x] = left[x - 1];
+            for (x = 0; x <= size; x += 4)
+                AV_WN4P(&ref_tmp[x], AV_RN4P(&left[x - 1]));
             for (x = last; x <= -1; x++)
                 ref_tmp[x] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)];
             ref = ref_tmp;
@@ -483,8 +500,12 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
             }
         }
         if (mode == 10 && c_idx == 0 && size < 32) {
-            for (x = 0; x < size; x++)
-                POS(x, 0) = av_clip_pixel(left[0] + ((top[x] - top[-1]) >> 1));
+            for (x = 0; x < size; x += 4) {
+                POS(x,     0) = av_clip_pixel(left[0] + ((top[x    ] - top[-1]) >> 1));
+                POS(x + 1, 0) = av_clip_pixel(left[0] + ((top[x + 1] - top[-1]) >> 1));
+                POS(x + 2, 0) = av_clip_pixel(left[0] + ((top[x + 2] - top[-1]) >> 1));
+                POS(x + 3, 0) = av_clip_pixel(left[0] + ((top[x + 3] - top[-1]) >> 1));
+            }
         }
     }
 }
diff --git a/libavcodec/hnm4video.c b/libavcodec/hnm4video.c
index 1dc6ed3..a64dbb1 100644
--- a/libavcodec/hnm4video.c
+++ b/libavcodec/hnm4video.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2012 David Kment
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -79,7 +79,7 @@ static void unpack_intraframe(AVCodecContext *avctx, uint8_t *src,
         if (getbit(&gb, &bitbuf, &bits)) {
             if (writeoffset >= hnm->width * hnm->height) {
                 av_log(avctx, AV_LOG_ERROR,
-                       "Attempting to write out of bounds");
+                       "Attempting to write out of bounds\n");
                 break;
             }
             hnm->current[writeoffset++] = bytestream2_get_byte(&gb);
@@ -100,11 +100,11 @@ static void unpack_intraframe(AVCodecContext *avctx, uint8_t *src,
             count  += 2;
             offset += writeoffset;
             if (offset < 0 || offset + count >= hnm->width * hnm->height) {
-                av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds");
+                av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds\n");
                 break;
             } else if (writeoffset + count >= hnm->width * hnm->height) {
                 av_log(avctx, AV_LOG_ERROR,
-                       "Attempting to write out of bounds");
+                       "Attempting to write out of bounds\n");
                 break;
             }
             while (count--) {
@@ -147,7 +147,8 @@ static void decode_interframe_v4(AVCodecContext *avctx, uint8_t *src, uint32_t s
 {
     Hnm4VideoContext *hnm = avctx->priv_data;
     GetByteContext gb;
-    uint32_t writeoffset = 0, count, left, offset;
+    uint32_t writeoffset = 0;
+    int count, left, offset;
     uint8_t tag, previous, backline, backward, swap;
 
     bytestream2_init(&gb, src, size);
@@ -157,7 +158,12 @@ static void decode_interframe_v4(AVCodecContext *avctx, uint8_t *src, uint32_t s
         if (count == 0) {
             tag = bytestream2_get_byte(&gb) & 0xE0;
             tag = tag >> 5;
+
             if (tag == 0) {
+                if (writeoffset + 2 > hnm->width * hnm->height) {
+                    av_log(avctx, AV_LOG_ERROR, "writeoffset out of bounds\n");
+                    break;
+                }
                 hnm->current[writeoffset++] = bytestream2_get_byte(&gb);
                 hnm->current[writeoffset++] = bytestream2_get_byte(&gb);
             } else if (tag == 1) {
@@ -168,6 +174,10 @@ static void decode_interframe_v4(AVCodecContext *avctx, uint8_t *src, uint32_t s
                 writeoffset += count;
             } else if (tag == 3) {
                 count = bytestream2_get_byte(&gb) * 2;
+                if (writeoffset + count > hnm->width * hnm->height) {
+                    av_log(avctx, AV_LOG_ERROR, "writeoffset out of bounds\n");
+                    break;
+                }
                 while (count > 0) {
                     hnm->current[writeoffset++] = bytestream2_peek_byte(&gb);
                     count--;
@@ -176,6 +186,10 @@ static void decode_interframe_v4(AVCodecContext *avctx, uint8_t *src, uint32_t s
             } else {
                 break;
             }
+            if (writeoffset > hnm->width * hnm->height) {
+                av_log(avctx, AV_LOG_ERROR, "writeoffset out of bounds\n");
+                break;
+            }
         } else {
             previous = bytestream2_peek_byte(&gb) & 0x20;
             backline = bytestream2_peek_byte(&gb) & 0x40;
@@ -188,17 +202,28 @@ static void decode_interframe_v4(AVCodecContext *avctx, uint8_t *src, uint32_t s
 
             left = count;
 
-            if (!backward && offset + count >= hnm->width * hnm->height) {
-                av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds");
+            if (!backward && offset + 2*count > hnm->width * hnm->height) {
+                av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds\n");
                 break;
-            } else if (backward && offset >= hnm->width * hnm->height) {
-                av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds");
+            } else if (backward && offset + 1 >= hnm->width * hnm->height) {
+                av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds\n");
                 break;
-            } else if (writeoffset + count >= hnm->width * hnm->height) {
+            } else if (writeoffset + 2*count > hnm->width * hnm->height) {
                 av_log(avctx, AV_LOG_ERROR,
-                       "Attempting to write out of bounds");
+                       "Attempting to write out of bounds\n");
                 break;
             }
+            if(backward) {
+                if (offset < (!!backline)*(2 * hnm->width - 1) + 2*(left-1)) {
+                    av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds\n");
+                    break;
+                }
+            } else {
+                if (offset < (!!backline)*(2 * hnm->width - 1)) {
+                    av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds\n");
+                    break;
+                }
+            }
 
             if (previous) {
                 while (left > 0) {
@@ -263,6 +288,10 @@ static void decode_interframe_v4a(AVCodecContext *avctx, uint8_t *src,
             if (tag == 0) {
                 writeoffset += bytestream2_get_byte(&gb);
             } else if (tag == 1) {
+                if (writeoffset + hnm->width >= hnm->width * hnm->height) {
+                    av_log(avctx, AV_LOG_ERROR, "writeoffset out of bounds\n");
+                    break;
+                }
                 hnm->current[writeoffset]              = bytestream2_get_byte(&gb);
                 hnm->current[writeoffset + hnm->width] = bytestream2_get_byte(&gb);
                 writeoffset++;
@@ -271,6 +300,10 @@ static void decode_interframe_v4a(AVCodecContext *avctx, uint8_t *src,
             } else if (tag == 3) {
                 break;
             }
+            if (writeoffset > hnm->width * hnm->height) {
+                av_log(avctx, AV_LOG_ERROR, "writeoffset out of bounds\n");
+                break;
+            }
         } else {
             delta    = bytestream2_peek_byte(&gb) & 0x80;
             previous = bytestream2_peek_byte(&gb) & 0x40;
@@ -279,14 +312,19 @@ static void decode_interframe_v4a(AVCodecContext *avctx, uint8_t *src,
             offset  = writeoffset;
             offset += bytestream2_get_le16(&gb);
 
-            if (delta)
+            if (delta) {
+                if (offset < 0x10000) {
+                    av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds\n");
+                    break;
+                }
                 offset -= 0x10000;
+            }
 
             if (offset + hnm->width + count >= hnm->width * hnm->height) {
-                av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds");
+                av_log(avctx, AV_LOG_ERROR, "Attempting to read out of bounds\n");
                 break;
             } else if (writeoffset + hnm->width + count >= hnm->width * hnm->height) {
-                av_log(avctx, AV_LOG_ERROR, "Attempting to write out of bounds");
+                av_log(avctx, AV_LOG_ERROR, "Attempting to write out of bounds\n");
                 break;
             }
 
@@ -360,17 +398,23 @@ static int hnm_decode_frame(AVCodecContext *avctx, void *data,
     int ret;
     uint16_t chunk_id;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return ret;
+    if (avpkt->size < 8) {
+        av_log(avctx, AV_LOG_ERROR, "packet too small\n");
+        return AVERROR_INVALIDDATA;
     }
 
     chunk_id = AV_RL16(avpkt->data + 4);
 
     if (chunk_id == HNM4_CHUNK_ID_PL) {
         hnm_update_palette(avctx, avpkt->data, avpkt->size);
-        frame->palette_has_changed = 1;
     } else if (chunk_id == HNM4_CHUNK_ID_IZ) {
+        if (avpkt->size < 12) {
+            av_log(avctx, AV_LOG_ERROR, "packet too small\n");
+            return AVERROR_INVALIDDATA;
+        }
+        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+            return ret;
+
         unpack_intraframe(avctx, avpkt->data + 12, avpkt->size - 12);
         memcpy(hnm->previous, hnm->current, hnm->width * hnm->height);
         if (hnm->version == 0x4a)
@@ -383,6 +427,9 @@ static int hnm_decode_frame(AVCodecContext *avctx, void *data,
         memcpy(frame->data[1], hnm->palette, 256 * 4);
         *got_frame = 1;
     } else if (chunk_id == HNM4_CHUNK_ID_IU) {
+        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+            return ret;
+
         if (hnm->version == 0x4a) {
             decode_interframe_v4a(avctx, avpkt->data + 8, avpkt->size - 8);
             memcpy(hnm->processed, hnm->current, hnm->width * hnm->height);
@@ -427,7 +474,9 @@ static av_cold int hnm_decode_init(AVCodecContext *avctx)
     hnm->buffer2   = av_mallocz(avctx->width * avctx->height);
     hnm->processed = av_mallocz(avctx->width * avctx->height);
 
-    if (!hnm->buffer1 || !hnm->buffer2 || !hnm->processed) {
+    if (   !hnm->buffer1 || !hnm->buffer2 || !hnm->processed
+        || avctx->width * avctx->height == 0
+        || avctx->height % 2) {
         av_log(avctx, AV_LOG_ERROR, "av_mallocz() failed\n");
         av_freep(&hnm->buffer1);
         av_freep(&hnm->buffer2);
diff --git a/libavcodec/hpel_template.c b/libavcodec/hpel_template.c
index 81d3892..fccfe76 100644
--- a/libavcodec/hpel_template.c
+++ b/libavcodec/hpel_template.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2000, 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hpeldsp.c b/libavcodec/hpeldsp.c
index 25694c5..8e2fd8f 100644
--- a/libavcodec/hpeldsp.c
+++ b/libavcodec/hpeldsp.c
@@ -5,20 +5,20 @@
  *
  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -357,10 +357,14 @@ av_cold void ff_hpeldsp_init(HpelDSPContext *c, int flags)
 
     if (ARCH_AARCH64)
         ff_hpeldsp_init_aarch64(c, flags);
+    if (ARCH_ALPHA)
+        ff_hpeldsp_init_alpha(c, flags);
     if (ARCH_ARM)
         ff_hpeldsp_init_arm(c, flags);
     if (ARCH_PPC)
         ff_hpeldsp_init_ppc(c, flags);
     if (ARCH_X86)
         ff_hpeldsp_init_x86(c, flags);
+    if (ARCH_MIPS)
+        ff_hpeldsp_init_mips(c, flags);
 }
diff --git a/libavcodec/hpeldsp.h b/libavcodec/hpeldsp.h
index d037cba..1a3cea5 100644
--- a/libavcodec/hpeldsp.h
+++ b/libavcodec/hpeldsp.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -95,8 +95,10 @@ typedef struct HpelDSPContext {
 void ff_hpeldsp_init(HpelDSPContext *c, int flags);
 
 void ff_hpeldsp_init_aarch64(HpelDSPContext *c, int flags);
+void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags);
+void ff_hpeldsp_init_mips(HpelDSPContext *c, int flags);
 
 #endif /* AVCODEC_HPELDSP_H */
diff --git a/libavcodec/hq_hqa.c b/libavcodec/hq_hqa.c
index c63e5a8..8825f3d 100644
--- a/libavcodec/hq_hqa.c
+++ b/libavcodec/hq_hqa.c
@@ -1,20 +1,20 @@
 /*
  * Canopus HQ/HQA decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -121,7 +121,7 @@ static int hq_decode_frame(HQContext *ctx, AVFrame *pic,
     uint32_t slice_off[21];
     int slice, start_off, next_off, i, ret;
 
-    if (prof_num >= NUM_HQ_PROFILES) {
+    if ((unsigned)prof_num >= NUM_HQ_PROFILES) {
         profile = &ff_hq_profile[0];
         avpriv_request_sample(ctx->avctx, "HQ Profile %d", prof_num);
     } else {
@@ -137,10 +137,8 @@ static int hq_decode_frame(HQContext *ctx, AVFrame *pic,
     ctx->avctx->pix_fmt             = AV_PIX_FMT_YUV422P;
 
     ret = ff_get_buffer(ctx->avctx, pic, 0);
-    if (ret < 0) {
-        av_log(ctx->avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
+    if (ret < 0)
         return ret;
-    }
 
     /* Offsets are stored from CUV position, so adjust them accordingly. */
     for (i = 0; i < profile->num_slices + 1; i++)
@@ -156,7 +154,7 @@ static int hq_decode_frame(HQContext *ctx, AVFrame *pic,
             slice_off[slice] >= slice_off[slice + 1] ||
             slice_off[slice + 1] > data_size) {
             av_log(ctx->avctx, AV_LOG_ERROR,
-                   "Invalid slice size %zu.\n", data_size);
+                   "Invalid slice size %"SIZE_SPECIFIER".\n", data_size);
             break;
         }
         init_get_bits(&gb, src + slice_off[slice],
@@ -267,10 +265,8 @@ static int hqa_decode_frame(HQContext *ctx, AVFrame *pic, size_t data_size)
     }
 
     ret = ff_get_buffer(ctx->avctx, pic, 0);
-    if (ret < 0) {
-        av_log(ctx->avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
+    if (ret < 0)
         return ret;
-    }
 
     /* Offsets are stored from HQA1 position, so adjust them accordingly. */
     for (i = 0; i < num_slices + 1; i++)
@@ -281,7 +277,7 @@ static int hqa_decode_frame(HQContext *ctx, AVFrame *pic, size_t data_size)
             slice_off[slice] >= slice_off[slice + 1] ||
             slice_off[slice + 1] > data_size) {
             av_log(ctx->avctx, AV_LOG_ERROR,
-                   "Invalid slice size %zu.\n", data_size);
+                   "Invalid slice size %"SIZE_SPECIFIER".\n", data_size);
             break;
         }
         init_get_bits(&gb, src + slice_off[slice],
@@ -302,7 +298,8 @@ static int hq_hqa_decode_frame(AVCodecContext *avctx, void *data,
     AVFrame *pic = data;
     uint32_t info_tag;
     unsigned int data_size;
-    int tag, ret;
+    int ret;
+    unsigned tag;
 
     bytestream2_init(&ctx->gbc, avpkt->data, avpkt->size);
     if (bytestream2_get_bytes_left(&ctx->gbc) < 4 + 4) {
diff --git a/libavcodec/hq_hqa.h b/libavcodec/hq_hqa.h
index 6bd858d..4286dd0 100644
--- a/libavcodec/hq_hqa.h
+++ b/libavcodec/hq_hqa.h
@@ -1,20 +1,20 @@
 /*
  * Canopus HQ/HQA decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hq_hqadata.c b/libavcodec/hq_hqadata.c
index 23fefc1..ae9231a 100644
--- a/libavcodec/hq_hqadata.c
+++ b/libavcodec/hq_hqadata.c
@@ -1,20 +1,20 @@
 /*
  * Canopus HQ/HQA decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hq_hqadsp.c b/libavcodec/hq_hqadsp.c
index 93fc067..db1ea2e 100644
--- a/libavcodec/hq_hqadsp.c
+++ b/libavcodec/hq_hqadsp.c
@@ -1,20 +1,20 @@
 /*
  * Canopus HQ/HQA decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hq_hqadsp.h b/libavcodec/hq_hqadsp.h
index 22b1e61..420ed92 100644
--- a/libavcodec/hq_hqadsp.h
+++ b/libavcodec/hq_hqadsp.h
@@ -1,20 +1,20 @@
 /*
  * Canopus HQ/HQA decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hqx.c b/libavcodec/hqx.c
index 7411d3f..138d960 100644
--- a/libavcodec/hqx.c
+++ b/libavcodec/hqx.c
@@ -1,20 +1,20 @@
 /*
  * Canopus HQX decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -458,7 +458,7 @@ static int hqx_decode_frame(AVCodecContext *avctx, void *data,
     }
     ret = av_image_check_size(ctx->width, ctx->height, 0, avctx);
     if (ret < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Invalid stored dimenstions %dx%d.\n",
+        av_log(avctx, AV_LOG_ERROR, "Invalid stored dimensions %dx%d.\n",
                ctx->width, ctx->height);
         return AVERROR_INVALIDDATA;
     }
@@ -492,10 +492,8 @@ static int hqx_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     ret = ff_get_buffer(avctx, ctx->pic, 0);
-    if (ret < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
+    if (ret < 0)
         return ret;
-    }
 
     avctx->execute2(avctx, decode_slice_thread, NULL, NULL, 16);
 
diff --git a/libavcodec/hqx.h b/libavcodec/hqx.h
index 7f32971..42d382d 100644
--- a/libavcodec/hqx.h
+++ b/libavcodec/hqx.h
@@ -1,20 +1,20 @@
 /*
  * Canopus HQX decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hqxdsp.c b/libavcodec/hqxdsp.c
index 2a02299..feff9c0 100644
--- a/libavcodec/hqxdsp.c
+++ b/libavcodec/hqxdsp.c
@@ -1,20 +1,20 @@
 /*
  * HQX DSP routines
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -118,7 +118,7 @@ static void hqx_idct_put(uint16_t *dst, ptrdiff_t stride,
 
     for (i = 0; i < 8; i++) {
         for (j = 0; j < 8; j++) {
-            int v = av_clip(block[j + i * 8] + 0x800, 0, 0xFFF);
+            int v = av_clip_uintp2(block[j + i * 8] + 0x800, 12);
             dst[j] = (v << 4) | (v >> 8);
         }
         dst += stride >> 1;
diff --git a/libavcodec/hqxdsp.h b/libavcodec/hqxdsp.h
index 2cd2a8e..39ab3e2 100644
--- a/libavcodec/hqxdsp.h
+++ b/libavcodec/hqxdsp.h
@@ -1,20 +1,20 @@
 /*
  * HQX DSP routines
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/hqxvlc.c b/libavcodec/hqxvlc.c
index d185e86..06a8073 100644
--- a/libavcodec/hqxvlc.c
+++ b/libavcodec/hqxvlc.c
@@ -1,20 +1,20 @@
 /*
  * Canopus HQX decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/htmlsubtitles.c b/libavcodec/htmlsubtitles.c
new file mode 100644
index 0000000..a2cd40f
--- /dev/null
+++ b/libavcodec/htmlsubtitles.c
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avstring.h"
+#include "libavutil/common.h"
+#include "libavutil/parseutils.h"
+#include "htmlsubtitles.h"
+
+static int html_color_parse(void *log_ctx, const char *str)
+{ /* Parse the color value at the start of str; returns the packed color, or -1 if av_parse_color() fails. */
+    uint8_t rgba[4];
+    if (av_parse_color(rgba, str, strcspn(str, "\" >"), log_ctx) < 0) /* value ends at the first double quote, space or > */
+        return -1;
+    return rgba[0] | rgba[1] << 8 | rgba[2] << 16; /* rgba[0] in bits 0-7, rgba[1] in 8-15, rgba[2] in 16-23; rgba[3] is dropped */
+}
+
+enum { /* indices into SrtStack.param[] */
+    PARAM_UNKNOWN = -1, /* not referenced anywhere else in this file */
+    PARAM_SIZE,         /* slot filled from a size= attribute */
+    PARAM_COLOR,        /* slot filled from a color= attribute */
+    PARAM_FACE,         /* slot filled from a face= attribute */
+    PARAM_NUMBER        /* number of slots; used as the array dimension */
+};
+
+typedef struct SrtStack { /* one entry of the open-tag stack kept by ff_htmlmarkup_to_ass() */
+    char tag[128];                 /* tag name, compared against closing tags */
+    char param[PARAM_NUMBER][128]; /* ASS override string per PARAM_* slot; empty string when unset */
+} SrtStack;
+
+static void rstrip_spaces_buf(AVBPrint *buf)
+{ /* Remove trailing spaces from buf. A truncated buffer (len >= size) is left untouched: str[len - 1] would be out of bounds. */
+    while (av_bprint_is_complete(buf) && buf->len > 0 && buf->str[buf->len - 1] == ' ')
+        buf->str[--buf->len] = 0; /* shrink by one character and re-terminate */
+}
+
+void ff_htmlmarkup_to_ass(void *log_ctx, AVBPrint *dst, const char *in)
+{ /* Append to dst the ASS form of the SRT/HTML-like markup in in; parsing stops at the NUL or at the first blank line. */
+    char *param, buffer[128], tmp[128];
+    int len, tag_close, sptr = 1, line_start = 1, an = 0, end = 0; /* sptr: next free stack slot */
+    SrtStack stack[16];
+
+    stack[0].tag[0] = 0; /* slot 0 has no tag name and holds the reset codes emitted when a font tag closes */
+    strcpy(stack[0].param[PARAM_SIZE],  "{\\fs}");
+    strcpy(stack[0].param[PARAM_COLOR], "{\\c}");
+    strcpy(stack[0].param[PARAM_FACE],  "{\\fn}");
+
+    for (; !end && *in; in++) {
+        switch (*in) {
+        case '\r': /* carriage returns are dropped */
+            break;
+        case '\n':
+            if (line_start) { /* nothing but spaces/CRs on this line: stop converting */
+                end = 1;
+                break;
+            }
+            rstrip_spaces_buf(dst);
+            av_bprintf(dst, "\\N"); /* ASS line break */
+            line_start = 1;
+            break;
+        case ' ':
+            if (!line_start) /* spaces at the start of a line are dropped */
+                av_bprint_chars(dst, *in, 1);
+            break;
+        case '{':    /* skip all {\xxx} substrings except for {\an%d}
+                        and all microdvd like styles such as {Y:xxx} */
+            len = 0;
+            an += sscanf(in, "{\\an%*1u}%n", &len) >= 0 && len > 0; /* an counts the {\anN} tags seen so far */
+            if ((an != 1 && (len = 0, sscanf(in, "{\\%*[^}]}%n", &len) >= 0 && len > 0)) ||
+                (len = 0, sscanf(in, "{%*1[CcFfoPSsYy]:%*[^}]}%n", &len) >= 0 && len > 0)) {
+                in += len - 1; /* the loop increment steps over the closing brace */
+            } else
+                av_bprint_chars(dst, *in, 1);
+            break;
+        case '<':
+            tag_close = in[1] == '/';
+            len = 0;
+            if (sscanf(in+tag_close+1, "%127[^>]>%n", buffer, &len) >= 1 && len > 0) {
+                const char *tagname = buffer;
+                while (*tagname == ' ')
+                    tagname++;
+                if ((param = strchr(tagname, ' '))) /* split the tag name from its attributes */
+                    *param++ = 0;
+                if ((!tag_close && sptr < FF_ARRAY_ELEMS(stack)) ||
+                    ( tag_close && sptr > 0 && !strcmp(stack[sptr-1].tag, tagname))) {
+                    int i, j, unknown = 0;
+                    in += len + tag_close; /* in now points at the closing > of the tag */
+                    if (!tag_close)
+                        memset(stack+sptr, 0, sizeof(*stack));
+                    if (!strcmp(tagname, "font")) {
+                        if (tag_close) { /* restore each attribute this tag set from the nearest enclosing entry */
+                            for (i=PARAM_NUMBER-1; i>=0; i--)
+                                if (stack[sptr-1].param[i][0])
+                                    for (j=sptr-2; j>=0; j--)
+                                        if (stack[j].param[i][0]) {
+                                            av_bprintf(dst, "%s", stack[j].param[i]);
+                                            break;
+                                        }
+                        } else {
+                            while (param) {
+                                if (!strncmp(param, "size=", 5)) {
+                                    unsigned font_size;
+                                    param += 5 + (param[5] == '"');
+                                    if (sscanf(param, "%u", &font_size) == 1) {
+                                        snprintf(stack[sptr].param[PARAM_SIZE],
+                                             sizeof(stack[0].param[PARAM_SIZE]),
+                                             "{\\fs%u}", font_size);
+                                    }
+                                } else if (!strncmp(param, "color=", 6)) {
+                                    param += 6 + (param[6] == '"');
+                                    len = html_color_parse(log_ctx, param); /* len is free scratch here: known tags never rewind in */
+                                    if (len >= 0) /* an unparsable color (-1) emits no override instead of FFFFFFFF */
+                                        snprintf(stack[sptr].param[PARAM_COLOR],
+                                                 sizeof(stack[0].param[PARAM_COLOR]), "{\\c&H%X&}", (unsigned)len);
+                                } else if (!strncmp(param, "face=", 5)) {
+                                    param += 5 + (param[5] == '"');
+                                    len = strcspn(param,
+                                                  param[-1] == '"' ? "\"" :" ");
+                                    av_strlcpy(tmp, param,
+                                               FFMIN(sizeof(tmp), len+1));
+                                    param += len;
+                                    snprintf(stack[sptr].param[PARAM_FACE],
+                                             sizeof(stack[0].param[PARAM_FACE]),
+                                             "{\\fn%s}", tmp);
+                                }
+                                if ((param = strchr(param, ' '))) /* advance to the next attribute, if any */
+                                    param++;
+                            }
+                            for (i=0; i<PARAM_NUMBER; i++)
+                                if (stack[sptr].param[i][0])
+                                    av_bprintf(dst, "%s", stack[sptr].param[i]);
+                        }
+                    } else if (strspn(tagname, "bisu") == 1 && !tagname[1]) { /* strspn first, so an empty name never reads tagname[1] */
+                        av_bprintf(dst, "{\\%c%d}", tagname[0], !tag_close);
+                    } else {
+                        unknown = 1;
+                        snprintf(tmp, sizeof(tmp), "</%s>", tagname);
+                    }
+                    if (tag_close) {
+                        sptr--;
+                    } else if (unknown && !strstr(in, tmp)) { /* unknown tag that is never closed: output it literally */
+                        in -= len + tag_close;
+                        av_bprint_chars(dst, *in, 1);
+                    } else
+                        av_strlcpy(stack[sptr++].tag, tagname,
+                                   sizeof(stack[0].tag));
+                    break;
+                }
+            } /* fall through: not handled as a tag, so the < is copied like any other character */
+        default:
+            av_bprint_chars(dst, *in, 1);
+            break;
+        }
+        if (*in != ' ' && *in != '\r' && *in != '\n')
+            line_start = 0;
+    }
+
+    if (!av_bprint_is_complete(dst)) return; /* truncated: len may exceed size, so str[len] must not be touched */
+    while (dst->len >= 2 && !strncmp(&dst->str[dst->len - 2], "\\N", 2))
+        dst->str[dst->len -= 2] = 0; /* drop a trailing line break and re-terminate */
+    rstrip_spaces_buf(dst);
+}
diff --git a/libavcodec/htmlsubtitles.h b/libavcodec/htmlsubtitles.h
new file mode 100644
index 0000000..e10cdda
--- /dev/null
+++ b/libavcodec/htmlsubtitles.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_HTMLSUBTITLES_H
+#define AVCODEC_HTMLSUBTITLES_H
+
+#include "libavutil/bprint.h"
+
+void ff_htmlmarkup_to_ass(void *log_ctx, AVBPrint *dst, const char *in);
+
+#endif /* AVCODEC_HTMLSUBTITLES_H */
diff --git a/libavcodec/huffman.c b/libavcodec/huffman.c
index 2a3db87..1f5d8b9 100644
--- a/libavcodec/huffman.c
+++ b/libavcodec/huffman.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2006 Konstantin Shishkov
  * Copyright (c) 2007 Loren Merritt
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
 
 #include <stdint.h>
 
+#include "libavutil/qsort.h"
 #include "avcodec.h"
 #include "huffman.h"
 
@@ -51,18 +52,31 @@ static void heap_sift(HeapElem *h, int root, int size)
     }
 }
 
-void ff_huff_gen_len_table(uint8_t *dst, const uint64_t *stats)
+int ff_huff_gen_len_table(uint8_t *dst, const uint64_t *stats, int stats_size, int skip0)
 {
-    HeapElem h[256];
-    int up[2*256];
-    int len[2*256];
+    HeapElem *h  = av_malloc_array(sizeof(*h), stats_size);
+    int *up      = av_malloc_array(sizeof(*up) * 2, stats_size);
+    uint8_t *len = av_malloc_array(sizeof(*len) * 2, stats_size);
+    uint16_t *map= av_malloc_array(sizeof(*map), stats_size);
     int offset, i, next;
-    int size = 256;
+    int size = 0;
+    int ret = 0;
+
+    if (!h || !up || !len || !map) {
+        ret = AVERROR(ENOMEM);
+        goto end;
+    }
+
+    for (i = 0; i<stats_size; i++) {
+        dst[i] = 255;
+        if (stats[i] || !skip0)
+            map[size++] = i;
+    }
 
     for (offset = 1; ; offset <<= 1) {
         for (i=0; i < size; i++) {
             h[i].name = i;
-            h[i].val = (stats[i] << 8) + offset;
+            h[i].val = (stats[map[i]] << 14) + offset;
         }
         for (i = size / 2 - 1; i >= 0; i--)
             heap_sift(h, i, size);
@@ -83,11 +97,17 @@ void ff_huff_gen_len_table(uint8_t *dst, const uint64_t *stats)
         for (i = 2 * size - 3; i >= size; i--)
             len[i] = len[up[i]] + 1;
         for (i = 0; i < size; i++) {
-            dst[i] = len[up[i]] + 1;
-            if (dst[i] >= 32) break;
+            dst[map[i]] = len[up[i]] + 1;
+            if (dst[map[i]] >= 32) break;
         }
         if (i==size) break;
     }
+end:
+    av_free(h);
+    av_free(up);
+    av_free(len);
+    av_free(map);
+    return ret;
 }
 
 static void get_tree_codes(uint32_t *bits, int16_t *lens, uint8_t *xlat,
@@ -150,22 +170,23 @@ int ff_huff_build_tree(AVCodecContext *avctx, VLC *vlc, int nb_codes, int nb_bit
                "Tree construction is not possible\n");
         return -1;
     }
-    qsort(nodes, nb_codes, sizeof(Node), cmp);
+    AV_QSORT(nodes, nb_codes, Node, cmp);
     cur_node = nb_codes;
     nodes[nb_codes*2-1].count = 0;
     for (i = 0; i < nb_codes * 2 - 1; i += 2) {
-        nodes[cur_node].sym = HNODE;
-        nodes[cur_node].count = nodes[i].count + nodes[i + 1].count;
-        nodes[cur_node].n0 = i;
-        for (j = cur_node; j > 0; j--) {
-            if (nodes[j].count > nodes[j - 1].count ||
-                (nodes[j].count == nodes[j - 1].count &&
-                 (!(flags & FF_HUFFMAN_FLAG_HNODE_FIRST) ||
-                  nodes[j].n0 == j - 1 || nodes[j].n0 == j - 2 ||
-                  (nodes[j].sym!=HNODE && nodes[j-1].sym!=HNODE))))
+        uint32_t cur_count = nodes[i].count + nodes[i+1].count;
+        // find correct place to insert new node, and
+        // make space for the new node while at it
+        for(j = cur_node; j > i + 2; j--){
+            if(cur_count > nodes[j-1].count ||
+               (cur_count == nodes[j-1].count &&
+                !(flags & FF_HUFFMAN_FLAG_HNODE_FIRST)))
                 break;
-            FFSWAP(Node, nodes[j], nodes[j - 1]);
+            nodes[j] = nodes[j - 1];
         }
+        nodes[j].sym = HNODE;
+        nodes[j].count = cur_count;
+        nodes[j].n0 = i;
         cur_node++;
     }
     if (build_huff_tree(vlc, nodes, nb_codes * 2 - 2, flags, nb_bits) < 0) {
diff --git a/libavcodec/huffman.h b/libavcodec/huffman.h
index c9eeb37..6ab23ae 100644
--- a/libavcodec/huffman.h
+++ b/libavcodec/huffman.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2007  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,6 +43,6 @@ typedef int (*HuffCmp)(const void *va, const void *vb);
 int ff_huff_build_tree(AVCodecContext *avctx, VLC *vlc, int nb_codes, int nb_bits,
                        Node *nodes, HuffCmp cmp, int flags);
 
-void ff_huff_gen_len_table(uint8_t *dst, const uint64_t *stats);
+int ff_huff_gen_len_table(uint8_t *dst, const uint64_t *stats, int n, int skip0);
 
 #endif /* AVCODEC_HUFFMAN_H */
diff --git a/libavcodec/huffyuv.c b/libavcodec/huffyuv.c
index da5c52f..4921555 100644
--- a/libavcodec/huffyuv.c
+++ b/libavcodec/huffyuv.c
@@ -1,25 +1,25 @@
 /*
  * huffyuv codec for libavcodec
  *
- * Copyright (c) 2002-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2002-2014 Michael Niedermayer <michaelni@gmx.at>
  *
  * see http://www.pcisys.net/~melanson/codecs/huffyuv.txt for a description of
  * the algorithm used
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,13 +36,13 @@
 #include "bswapdsp.h"
 #include "huffyuv.h"
 
-int ff_huffyuv_generate_bits_table(uint32_t *dst, const uint8_t *len_table)
+int ff_huffyuv_generate_bits_table(uint32_t *dst, const uint8_t *len_table, int n)
 {
     int len, index;
     uint32_t bits = 0;
 
     for (len = 32; len > 0; len--) {
-        for (index = 0; index < 256; index++) {
+        for (index = 0; index < n; index++) {
             if (len_table[index] == len)
                 dst[index] = bits++;
         }
@@ -59,16 +59,11 @@ av_cold int ff_huffyuv_alloc_temp(HYuvContext *s)
 {
     int i;
 
-    if (s->bitstream_bpp<24) {
-        for (i=0; i<3; i++) {
-            s->temp[i]= av_malloc(s->width + 16);
-            if (!s->temp[i])
-                return AVERROR(ENOMEM);
-        }
-    } else {
-        s->temp[0]= av_mallocz(4*s->width + 16);
-        if (!s->temp[0])
+    for (i=0; i<3; i++) {
+        s->temp[i]= av_malloc(4*s->width + 16);
+        if (!s->temp[i])
             return AVERROR(ENOMEM);
+        s->temp16[i] = (uint16_t*)s->temp[i];
     }
     return 0;
 }
@@ -81,17 +76,20 @@ av_cold void ff_huffyuv_common_init(AVCodecContext *avctx)
     s->flags = avctx->flags;
 
     ff_bswapdsp_init(&s->bdsp);
+    ff_llviddsp_init(&s->llviddsp, avctx);
 
     s->width = avctx->width;
     s->height = avctx->height;
-    assert(s->width>0 && s->height>0);
+
+    av_assert1(s->width > 0 && s->height > 0);
 }
 
-void ff_huffyuv_common_end(HYuvContext *s)
+av_cold void ff_huffyuv_common_end(HYuvContext *s)
 {
     int i;
 
     for(i = 0; i < 3; i++) {
         av_freep(&s->temp[i]);
+        s->temp16[i] = NULL;
     }
 }
diff --git a/libavcodec/huffyuv.h b/libavcodec/huffyuv.h
index a4a83b9..c18247e 100644
--- a/libavcodec/huffyuv.h
+++ b/libavcodec/huffyuv.h
@@ -1,23 +1,23 @@
 /*
- * Copyright (c) 2002-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2002-2014 Michael Niedermayer <michaelni@gmx.at>
  *
  * see http://www.pcisys.net/~melanson/codecs/huffyuv.txt for a description of
  * the algorithm used
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,20 +37,13 @@
 #include "huffyuvdsp.h"
 #include "huffyuvencdsp.h"
 #include "put_bits.h"
+#include "lossless_videodsp.h"
 
-#define VLC_BITS 11
+#define VLC_BITS 12
 
-#if HAVE_BIGENDIAN
-#define B 3
-#define G 2
-#define R 1
-#define A 0
-#else
-#define B 0
-#define G 1
-#define R 2
-#define A 3
-#endif
+#define MAX_BITS 16
+#define MAX_N (1<<MAX_BITS)
+#define MAX_VLC_N 16384
 
 typedef enum Predictor {
     LEFT = 0,
@@ -59,7 +52,7 @@ typedef enum Predictor {
 } Predictor;
 
 typedef struct HYuvContext {
-    const AVClass *class;
+    AVClass *class;
     AVCodecContext *avctx;
     Predictor predictor;
     GetBitContext gb;
@@ -70,27 +63,38 @@ typedef struct HYuvContext {
     int version;
     int yuy2;                               //use yuy2 instead of 422P
     int bgr32;                              //use bgr32 instead of bgr24
+    int bps;
+    int n;                                  // 1<<bps
+    int vlc_n;                              // number of vlc codes (FFMIN(1<<bps, MAX_VLC_N))
+    int alpha;
+    int chroma;
+    int yuv;
+    int chroma_h_shift;
+    int chroma_v_shift;
     int width, height;
     int flags;
     int context;
     int picture_number;
     int last_slice_end;
     uint8_t *temp[3];
-    uint64_t stats[3][256];
-    uint8_t len[3][256];
-    uint32_t bits[3][256];
+    uint16_t *temp16[3];                    ///< identical to temp but 16bit type
+    uint64_t stats[4][MAX_VLC_N];
+    uint8_t len[4][MAX_VLC_N];
+    uint32_t bits[4][MAX_VLC_N];
     uint32_t pix_bgr_map[1<<VLC_BITS];
-    VLC vlc[6];                             //Y,U,V,YY,YU,YV
+    VLC vlc[8];                             //Y,U,V,A,YY,YU,YV,AA
     uint8_t *bitstream_buffer;
     unsigned int bitstream_buffer_size;
     BswapDSPContext bdsp;
     HuffYUVDSPContext hdsp;
     HuffYUVEncDSPContext hencdsp;
+    LLVidDSPContext llviddsp;
+    int non_determ; // non-deterministic, multi-threaded encoder allowed
 } HYuvContext;
 
 void ff_huffyuv_common_init(AVCodecContext *s);
 void ff_huffyuv_common_end(HYuvContext *s);
 int  ff_huffyuv_alloc_temp(HYuvContext *s);
-int ff_huffyuv_generate_bits_table(uint32_t *dst, const uint8_t *len_table);
+int ff_huffyuv_generate_bits_table(uint32_t *dst, const uint8_t *len_table, int n);
 
 #endif /* AVCODEC_HUFFYUV_H */
diff --git a/libavcodec/huffyuvdec.c b/libavcodec/huffyuvdec.c
index 12eca26..7314519 100644
--- a/libavcodec/huffyuvdec.c
+++ b/libavcodec/huffyuvdec.c
@@ -1,26 +1,28 @@
 /*
  * huffyuv decoder
  *
- * Copyright (c) 2002-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2002-2014 Michael Niedermayer <michaelni@gmx.at>
  *
  * see http://www.pcisys.net/~melanson/codecs/huffyuv.txt for a description of
  * the algorithm used
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * yuva, gray, 4:4:4, 4:1:1, 4:1:0 and >8 bit per sample support sponsored by NOA
  */
 
 /**
@@ -28,17 +30,22 @@
  * huffyuv decoder
  */
 
+#define UNCHECKED_BITSTREAM_READER 1
+
 #include "avcodec.h"
 #include "get_bits.h"
 #include "huffyuv.h"
 #include "huffyuvdsp.h"
 #include "thread.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/pixdesc.h"
 
 #define classic_shift_luma_table_size 42
 static const unsigned char classic_shift_luma[classic_shift_luma_table_size + AV_INPUT_BUFFER_PADDING_SIZE] = {
     34, 36, 35, 69, 135, 232,   9, 16, 10, 24,  11,  23,  12,  16, 13, 10,
     14,  8, 15,  8,  16,   8,  17, 20, 16, 10, 207, 206, 205, 236, 11,  8,
-    10, 21,  9, 23,   8,   8, 199, 70, 69, 68,   0
+    10, 21,  9, 23,   8,   8, 199, 70, 69, 68,   0,
+  0,0,0,0,0,0,0,0,
 };
 
 #define classic_shift_chroma_table_size 59
@@ -46,7 +53,8 @@ static const unsigned char classic_shift_chroma[classic_shift_chroma_table_size
     66, 36,  37,  38, 39, 40,  41,  75,  76,  77, 110, 239, 144, 81, 82,  83,
     84, 85, 118, 183, 56, 57,  88,  89,  56,  89, 154,  57,  58, 57, 26, 141,
     57, 56,  58,  57, 58, 57, 184, 119, 214, 245, 116,  83,  82, 49, 80,  79,
-    78, 77,  44,  75, 41, 40,  39,  38,  37,  36,  34,  0
+    78, 77,  44,  75, 41, 40,  39,  38,  37,  36,  34,  0,
+  0,0,0,0,0,0,0,0,
 };
 
 static const unsigned char classic_add_luma[256] = {
@@ -87,16 +95,16 @@ static const unsigned char classic_add_chroma[256] = {
       6,  12,   8,  10,   7,   9,   6,   4,   6,   2,   2,   3,   3,   3,   3,   2,
 };
 
-static int read_len_table(uint8_t *dst, GetBitContext *gb)
+static int read_len_table(uint8_t *dst, GetBitContext *gb, int n)
 {
     int i, val, repeat;
 
-    for (i = 0; i < 256;) {
+    for (i = 0; i < n;) {
         repeat = get_bits(gb, 3);
         val    = get_bits(gb, 5);
         if (repeat == 0)
             repeat = get_bits(gb, 8);
-        if (i + repeat > 256 || get_bits_left(gb) < 0) {
+        if (i + repeat > n || get_bits_left(gb) < 0) {
             av_log(NULL, AV_LOG_ERROR, "Error reading huffman table\n");
             return AVERROR_INVALIDDATA;
         }
@@ -108,34 +116,43 @@ static int read_len_table(uint8_t *dst, GetBitContext *gb)
 
 static int generate_joint_tables(HYuvContext *s)
 {
-    uint16_t symbols[1 << VLC_BITS];
-    uint16_t bits[1 << VLC_BITS];
-    uint8_t len[1 << VLC_BITS];
     int ret;
+    uint16_t *symbols = av_mallocz(5 << VLC_BITS);
+    uint16_t *bits;
+    uint8_t *len;
+    if (!symbols)
+        return AVERROR(ENOMEM);
+    bits = symbols + (1 << VLC_BITS);
+    len = (uint8_t *)(bits + (1 << VLC_BITS));
 
-    if (s->bitstream_bpp < 24) {
+    if (s->bitstream_bpp < 24 || s->version > 2) {
         int p, i, y, u;
-        for (p = 0; p < 3; p++) {
-            for (i = y = 0; y < 256; y++) {
-                int len0  = s->len[0][y];
+        for (p = 0; p < 4; p++) {
+            int p0 = s->version > 2 ? p : 0;
+            for (i = y = 0; y < s->vlc_n; y++) {
+                int len0  = s->len[p0][y];
                 int limit = VLC_BITS - len0;
-                if (limit <= 0)
+                if (limit <= 0 || !len0)
+                    continue;
+                if ((sign_extend(y, 8) & (s->vlc_n-1)) != y)
                     continue;
-                for (u = 0; u < 256; u++) {
+                for (u = 0; u < s->vlc_n; u++) {
                     int len1 = s->len[p][u];
-                    if (len1 > limit)
+                    if (len1 > limit || !len1)
                         continue;
+                    if ((sign_extend(u, 8) & (s->vlc_n-1)) != u)
+                        continue;
+                    av_assert0(i < (1 << VLC_BITS));
                     len[i]     = len0 + len1;
-                    bits[i]    = (s->bits[0][y] << len1) + s->bits[p][u];
-                    symbols[i] = (y << 8) + u;
-                    if (symbols[i] != 0xffff) // reserved to mean "invalid"
+                    bits[i]    = (s->bits[p0][y] << len1) + s->bits[p][u];
+                    symbols[i] = (y << 8) + (u & 0xFF);
                         i++;
                 }
             }
-            ff_free_vlc(&s->vlc[3 + p]);
-            if ((ret = ff_init_vlc_sparse(&s->vlc[3 + p], VLC_BITS, i, len, 1, 1,
+            ff_free_vlc(&s->vlc[4 + p]);
+            if ((ret = ff_init_vlc_sparse(&s->vlc[4 + p], VLC_BITS, i, len, 1, 1,
                                           bits, 2, 2, symbols, 2, 2, 0)) < 0)
-                return ret;
+                goto out;
         }
     } else {
         uint8_t (*map)[4] = (uint8_t(*)[4]) s->pix_bgr_map;
@@ -148,18 +165,19 @@ static int generate_joint_tables(HYuvContext *s)
         for (i = 0, g = -16; g < 16; g++) {
             int len0   = s->len[p0][g & 255];
             int limit0 = VLC_BITS - len0;
-            if (limit0 < 2)
+            if (limit0 < 2 || !len0)
                 continue;
             for (b = -16; b < 16; b++) {
                 int len1   = s->len[p1][b & 255];
                 int limit1 = limit0 - len1;
-                if (limit1 < 1)
+                if (limit1 < 1 || !len1)
                     continue;
                 code = (s->bits[p0][g & 255] << len1) + s->bits[p1][b & 255];
                 for (r = -16; r < 16; r++) {
                     int len2 = s->len[2][r & 255];
-                    if (len2 > limit1)
+                    if (len2 > limit1 || !len2)
                         continue;
+                    av_assert0(i < (1 << VLC_BITS));
                     len[i]  = len0 + len1 + len2;
                     bits[i] = (code << len2) + s->bits[2][r & 255];
                     if (s->decorrelate) {
@@ -175,30 +193,37 @@ static int generate_joint_tables(HYuvContext *s)
                 }
             }
         }
-        ff_free_vlc(&s->vlc[3]);
-        if ((ret = init_vlc(&s->vlc[3], VLC_BITS, i, len, 1, 1,
+        ff_free_vlc(&s->vlc[4]);
+        if ((ret = init_vlc(&s->vlc[4], VLC_BITS, i, len, 1, 1,
                             bits, 2, 2, 0)) < 0)
-            return ret;
+            goto out;
     }
-    return 0;
+    ret = 0;
+out:
+    av_freep(&symbols);
+    return ret;
 }
 
 static int read_huffman_tables(HYuvContext *s, const uint8_t *src, int length)
 {
     GetBitContext gb;
     int i, ret;
+    int count = 3;
 
     if ((ret = init_get_bits(&gb, src, length * 8)) < 0)
         return ret;
 
-    for (i = 0; i < 3; i++) {
-        if ((ret = read_len_table(s->len[i], &gb)) < 0)
+    if (s->version > 2)
+        count = 1 + s->alpha + 2*s->chroma;
+
+    for (i = 0; i < count; i++) {
+        if ((ret = read_len_table(s->len[i], &gb, s->vlc_n)) < 0)
             return ret;
-        if ((ret = ff_huffyuv_generate_bits_table(s->bits[i], s->len[i])) < 0)
+        if ((ret = ff_huffyuv_generate_bits_table(s->bits[i], s->len[i], s->vlc_n)) < 0)
             return ret;
         ff_free_vlc(&s->vlc[i]);
-        if ((ret = init_vlc(&s->vlc[i], VLC_BITS, 256, s->len[i], 1, 1,
-                            s->bits[i], 4, 4, 0)) < 0)
+        if ((ret = init_vlc(&s->vlc[i], VLC_BITS, s->vlc_n, s->len[i], 1, 1,
+                           s->bits[i], 4, 4, 0)) < 0)
             return ret;
     }
 
@@ -213,16 +238,14 @@ static int read_old_huffman_tables(HYuvContext *s)
     GetBitContext gb;
     int i, ret;
 
-    if ((ret = init_get_bits(&gb, classic_shift_luma,
-                             classic_shift_luma_table_size * 8)) < 0)
-        return ret;
-    if ((ret = read_len_table(s->len[0], &gb)) < 0)
+    init_get_bits(&gb, classic_shift_luma,
+                  classic_shift_luma_table_size * 8);
+    if ((ret = read_len_table(s->len[0], &gb, 256)) < 0)
         return ret;
 
-    if ((ret = init_get_bits(&gb, classic_shift_chroma,
-                             classic_shift_chroma_table_size * 8)) < 0)
-        return ret;
-    if ((ret = read_len_table(s->len[1], &gb)) < 0)
+    init_get_bits(&gb, classic_shift_chroma,
+                  classic_shift_chroma_table_size * 8);
+    if ((ret = read_len_table(s->len[1], &gb, 256)) < 0)
         return ret;
 
     for (i = 0; i < 256; i++)
@@ -237,7 +260,7 @@ static int read_old_huffman_tables(HYuvContext *s)
     memcpy(s->bits[2], s->bits[1], 256 * sizeof(uint32_t));
     memcpy(s->len[2], s->len[1], 256 * sizeof(uint8_t));
 
-    for (i = 0; i < 3; i++) {
+    for (i = 0; i < 4; i++) {
         ff_free_vlc(&s->vlc[i]);
         if ((ret = init_vlc(&s->vlc[i], VLC_BITS, 256, s->len[i], 1, 1,
                             s->bits[i], 4, 4, 0)) < 0)
@@ -250,28 +273,51 @@ static int read_old_huffman_tables(HYuvContext *s)
     return 0;
 }
 
+static av_cold int decode_end(AVCodecContext *avctx)
+{
+    HYuvContext *s = avctx->priv_data;
+    int i;
+    /* Teardown; decode_init() also jumps here on failure, so this may run on a partly set up context. */
+    ff_huffyuv_common_end(s);
+    av_freep(&s->bitstream_buffer);
+    /* Every one of the 8 VLC slots is released unconditionally. */
+    for (i = 0; i < 8; i++)
+        ff_free_vlc(&s->vlc[i]);
+    /* Always returns 0. */
+    return 0;
+}
+
 static av_cold int decode_init(AVCodecContext *avctx)
 {
     HYuvContext *s = avctx->priv_data;
     int ret;
 
-    ff_huffyuv_common_init(avctx);
+    ret = av_image_check_size(avctx->width, avctx->height, 0, avctx);
+    if (ret < 0)
+        return ret;
+
     ff_huffyuvdsp_init(&s->hdsp);
-    memset(s->vlc, 0, 3 * sizeof(VLC));
+    memset(s->vlc, 0, 4 * sizeof(VLC));
 
-    s->interlaced = s->height > 288;
+    s->interlaced = avctx->height > 288;
     s->bgr32      = 1;
 
     if (avctx->extradata_size) {
         if ((avctx->bits_per_coded_sample & 7) &&
             avctx->bits_per_coded_sample != 12)
             s->version = 1; // do such files exist at all?
-        else
+        else if (avctx->extradata_size > 3 && avctx->extradata[3] == 0)
             s->version = 2;
+        else
+            s->version = 3;
     } else
         s->version = 0;
 
-    if (s->version == 2) {
+    s->bps = 8;
+    s->n = 1<<s->bps;
+    s->vlc_n = FFMIN(s->n, MAX_VLC_N);
+    s->chroma = 1;
+    if (s->version >= 2) {
         int method, interlace;
 
         if (avctx->extradata_size < 4)
@@ -280,16 +326,27 @@ static av_cold int decode_init(AVCodecContext *avctx)
         method           = avctx->extradata[0];
         s->decorrelate   = method & 64 ? 1 : 0;
         s->predictor     = method & 63;
-        s->bitstream_bpp = avctx->extradata[1];
-        if (s->bitstream_bpp == 0)
-            s->bitstream_bpp = avctx->bits_per_coded_sample & ~7;
+        if (s->version == 2) {
+            s->bitstream_bpp = avctx->extradata[1];
+            if (s->bitstream_bpp == 0)
+                s->bitstream_bpp = avctx->bits_per_coded_sample & ~7;
+        } else {
+            s->bps = (avctx->extradata[1] >> 4) + 1;
+            s->n = 1<<s->bps;
+            s->vlc_n = FFMIN(s->n, MAX_VLC_N);
+            s->chroma_h_shift = avctx->extradata[1] & 3;
+            s->chroma_v_shift = (avctx->extradata[1] >> 2) & 3;
+            s->yuv   = !!(avctx->extradata[2] & 1);
+            s->chroma= !!(avctx->extradata[2] & 3);
+            s->alpha = !!(avctx->extradata[2] & 4);
+        }
         interlace     = (avctx->extradata[2] & 0x30) >> 4;
         s->interlaced = (interlace == 1) ? 1 : (interlace == 2) ? 0 : s->interlaced;
         s->context    = avctx->extradata[2] & 0x40 ? 1 : 0;
 
         if ((ret = read_huffman_tables(s, avctx->extradata + 4,
                                        avctx->extradata_size - 4)) < 0)
-            return ret;
+            goto error;
     } else {
         switch (avctx->bits_per_coded_sample & 7) {
         case 1:
@@ -317,55 +374,218 @@ static av_cold int decode_init(AVCodecContext *avctx)
         s->context       = 0;
 
         if ((ret = read_old_huffman_tables(s)) < 0)
-            return ret;
+            goto error;
     }
 
-    switch (s->bitstream_bpp) {
-    case 12:
-        avctx->pix_fmt = AV_PIX_FMT_YUV420P;
-        break;
-    case 16:
-        if (s->yuy2)
-            avctx->pix_fmt = AV_PIX_FMT_YUYV422;
-        else
-            avctx->pix_fmt = AV_PIX_FMT_YUV422P;
-        break;
-    case 24:
-    case 32:
-        if (s->bgr32)
+    if (s->version <= 2) {
+        switch (s->bitstream_bpp) {
+        case 12:
+            avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+            s->yuv = 1;
+            break;
+        case 16:
+            if (s->yuy2)
+                avctx->pix_fmt = AV_PIX_FMT_YUYV422;
+            else
+                avctx->pix_fmt = AV_PIX_FMT_YUV422P;
+            s->yuv = 1;
+            break;
+        case 24:
+            if (s->bgr32)
+                avctx->pix_fmt = AV_PIX_FMT_0RGB32;
+            else
+                avctx->pix_fmt = AV_PIX_FMT_BGR24;
+            break;
+        case 32:
+            av_assert0(s->bgr32);
             avctx->pix_fmt = AV_PIX_FMT_RGB32;
-        else
-            avctx->pix_fmt = AV_PIX_FMT_BGR24;
-        break;
-    default:
-        return AVERROR_INVALIDDATA;
+            s->alpha = 1;
+            break;
+        default:
+            ret = AVERROR_INVALIDDATA;
+            goto error;
+        }
+        av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt,
+                                         &s->chroma_h_shift,
+                                         &s->chroma_v_shift);
+    } else {
+        switch ( (s->chroma<<10) | (s->yuv<<9) | (s->alpha<<8) | ((s->bps-1)<<4) | s->chroma_h_shift | (s->chroma_v_shift<<2)) {
+        case 0x070:
+            avctx->pix_fmt = AV_PIX_FMT_GRAY8;
+            break;
+        case 0x0F0:
+            avctx->pix_fmt = AV_PIX_FMT_GRAY16;
+            break;
+        case 0x170:
+            avctx->pix_fmt = AV_PIX_FMT_GRAY8A;
+            break;
+        case 0x470:
+            avctx->pix_fmt = AV_PIX_FMT_GBRP;
+            break;
+        case 0x480:
+            avctx->pix_fmt = AV_PIX_FMT_GBRP9;
+            break;
+        case 0x490:
+            avctx->pix_fmt = AV_PIX_FMT_GBRP10;
+            break;
+        case 0x4B0:
+            avctx->pix_fmt = AV_PIX_FMT_GBRP12;
+            break;
+        case 0x4D0:
+            avctx->pix_fmt = AV_PIX_FMT_GBRP14;
+            break;
+        case 0x4F0:
+            avctx->pix_fmt = AV_PIX_FMT_GBRP16;
+            break;
+        case 0x570:
+            avctx->pix_fmt = AV_PIX_FMT_GBRAP;
+            break;
+        case 0x670:
+            avctx->pix_fmt = AV_PIX_FMT_YUV444P;
+            break;
+        case 0x680:
+            avctx->pix_fmt = AV_PIX_FMT_YUV444P9;
+            break;
+        case 0x690:
+            avctx->pix_fmt = AV_PIX_FMT_YUV444P10;
+            break;
+        case 0x6B0:
+            avctx->pix_fmt = AV_PIX_FMT_YUV444P12;
+            break;
+        case 0x6D0:
+            avctx->pix_fmt = AV_PIX_FMT_YUV444P14;
+            break;
+        case 0x6F0:
+            avctx->pix_fmt = AV_PIX_FMT_YUV444P16;
+            break;
+        case 0x671:
+            avctx->pix_fmt = AV_PIX_FMT_YUV422P;
+            break;
+        case 0x681:
+            avctx->pix_fmt = AV_PIX_FMT_YUV422P9;
+            break;
+        case 0x691:
+            avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
+            break;
+        case 0x6B1:
+            avctx->pix_fmt = AV_PIX_FMT_YUV422P12;
+            break;
+        case 0x6D1:
+            avctx->pix_fmt = AV_PIX_FMT_YUV422P14;
+            break;
+        case 0x6F1:
+            avctx->pix_fmt = AV_PIX_FMT_YUV422P16;
+            break;
+        case 0x672:
+            avctx->pix_fmt = AV_PIX_FMT_YUV411P;
+            break;
+        case 0x674:
+            avctx->pix_fmt = AV_PIX_FMT_YUV440P;
+            break;
+        case 0x675:
+            avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+            break;
+        case 0x685:
+            avctx->pix_fmt = AV_PIX_FMT_YUV420P9;
+            break;
+        case 0x695:
+            avctx->pix_fmt = AV_PIX_FMT_YUV420P10;
+            break;
+        case 0x6B5:
+            avctx->pix_fmt = AV_PIX_FMT_YUV420P12;
+            break;
+        case 0x6D5:
+            avctx->pix_fmt = AV_PIX_FMT_YUV420P14;
+            break;
+        case 0x6F5:
+            avctx->pix_fmt = AV_PIX_FMT_YUV420P16;
+            break;
+        case 0x67A:
+            avctx->pix_fmt = AV_PIX_FMT_YUV410P;
+            break;
+        case 0x770:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA444P;
+            break;
+        case 0x780:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA444P9;
+            break;
+        case 0x790:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA444P10;
+            break;
+        case 0x7F0:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA444P16;
+            break;
+        case 0x771:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA422P;
+            break;
+        case 0x781:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA422P9;
+            break;
+        case 0x791:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA422P10;
+            break;
+        case 0x7F1:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA422P16;
+            break;
+        case 0x775:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA420P;
+            break;
+        case 0x785:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA420P9;
+            break;
+        case 0x795:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA420P10;
+            break;
+        case 0x7F5:
+            avctx->pix_fmt = AV_PIX_FMT_YUVA420P16;
+            break;
+        default:
+            ret = AVERROR_INVALIDDATA;
+            goto error;
+        }
     }
 
+    ff_huffyuv_common_init(avctx);
+
+    if ((avctx->pix_fmt == AV_PIX_FMT_YUV422P || avctx->pix_fmt == AV_PIX_FMT_YUV420P) && avctx->width & 1) {
+        av_log(avctx, AV_LOG_ERROR, "width must be even for this colorspace\n");
+        ret = AVERROR_INVALIDDATA;
+        goto error;
+    }
     if (s->predictor == MEDIAN && avctx->pix_fmt == AV_PIX_FMT_YUV422P &&
         avctx->width % 4) {
-        av_log(avctx, AV_LOG_ERROR, "width must be multiple of 4 "
+        av_log(avctx, AV_LOG_ERROR, "width must be a multiple of 4 "
                "for this combination of colorspace and predictor type.\n");
-        return AVERROR_INVALIDDATA;
+        ret = AVERROR_INVALIDDATA;
+        goto error;
     }
 
-    if ((ret = ff_huffyuv_alloc_temp(s)) < 0)
-        return ret;
+    if ((ret = ff_huffyuv_alloc_temp(s)) < 0) {
+        ff_huffyuv_common_end(s);
+        goto error;
+    }
 
     return 0;
+  error:
+    decode_end(avctx);
+    return ret;
 }
 
+#if HAVE_THREADS
 static av_cold int decode_init_thread_copy(AVCodecContext *avctx)
 {
     HYuvContext *s = avctx->priv_data;
     int i, ret;
 
-    if ((ret = ff_huffyuv_alloc_temp(s)) < 0)
+    if ((ret = ff_huffyuv_alloc_temp(s)) < 0) {
+        ff_huffyuv_common_end(s);
         return ret;
+    }
 
-    for (i = 0; i < 6; i++)
+    for (i = 0; i < 8; i++)
         s->vlc[i].table = NULL;
 
-    if (s->version == 2) {
+    if (s->version >= 2) {
         if ((ret = read_huffman_tables(s, avctx->extradata + 4,
                                        avctx->extradata_size)) < 0)
             return ret;
@@ -376,49 +596,174 @@ static av_cold int decode_init_thread_copy(AVCodecContext *avctx)
 
     return 0;
 }
+#endif
+
+/** Subset of GET_VLC for use in hand-rolled VLC code; expects index (preloaded), code, n, nb_bits in scope */
+#define VLC_INTERN(dst, table, gb, name, bits, max_depth)   \
+    code = table[index][0];                                 \
+    n    = table[index][1];                                 \
+    if (max_depth > 1 && n < 0) {                           \
+        LAST_SKIP_BITS(name, gb, bits);                     \
+        UPDATE_CACHE(name, gb);                             \
+        /* n < 0: descend into a subtable, -n index bits */ \
+        nb_bits = -n;                                       \
+        index   = SHOW_UBITS(name, gb, nb_bits) + code;     \
+        code    = table[index][0];                          \
+        n       = table[index][1];                          \
+        if (max_depth > 2 && n < 0) {                       \
+            LAST_SKIP_BITS(name, gb, nb_bits);              \
+            UPDATE_CACHE(name, gb);                         \
+            /* same step again for the third level */       \
+            nb_bits = -n;                                   \
+            index   = SHOW_UBITS(name, gb, nb_bits) + code; \
+            code    = table[index][0];                      \
+            n       = table[index][1];                      \
+        }                                                   \
+    }                                                       \
+    dst = code;                                             \
+    LAST_SKIP_BITS(name, gb, n)
+
+/** Read two symbols: a hit in joint table dtable yields both via OP, else table1 then table2 are read. */
+#define GET_VLC_DUAL(dst0, dst1, name, gb, dtable, table1, table2,  \
+                     bits, max_depth, OP)                           \
+    do {                                                            \
+        unsigned int index = SHOW_UBITS(name, gb, bits);            \
+        int          code, n = dtable[index][1];                    \
+        /* n <= 0: no joint entry, decode the symbols separately */ \
+        if (n<=0) {                                                 \
+            int nb_bits;                                            \
+            VLC_INTERN(dst0, table1, gb, name, bits, max_depth);    \
+            /* refill through the reader given by name */           \
+            UPDATE_CACHE(name, gb);                                 \
+            index = SHOW_UBITS(name, gb, bits);                     \
+            VLC_INTERN(dst1, table2, gb, name, bits, max_depth);    \
+        } else {                                                    \
+            code = dtable[index][0];                                \
+            OP(dst0, dst1, code);                                   \
+            LAST_SKIP_BITS(name, gb, n);                            \
+        }                                                           \
+    } while (0)
+
+#define OP8bits(dst0, dst1, code) dst0 = code>>8; dst1 = code /* joint code: dst0 takes code>>8, dst1 is assigned code as is */
 
-/* TODO instead of restarting the read when the code isn't in the first level
- * of the joint table, jump into the 2nd level of the individual table. */
 #define READ_2PIX(dst0, dst1, plane1)                                   \
-    {                                                                   \
-        uint16_t code = get_vlc2(&s->gb, s->vlc[3 + plane1].table,      \
-                                 VLC_BITS, 1);                          \
-        if (code != 0xffff) {                                           \
-            dst0 = code >> 8;                                           \
-            dst1 = code;                                                \
-        } else {                                                        \
-            dst0 = get_vlc2(&s->gb, s->vlc[0].table, VLC_BITS, 3);      \
-            dst1 = get_vlc2(&s->gb, s->vlc[plane1].table, VLC_BITS, 3); \
-        }                                                               \
-    }
+    UPDATE_CACHE(re, &s->gb); /* expects an open reader named re */     \
+    GET_VLC_DUAL(dst0, dst1, re, &s->gb, s->vlc[4+plane1].table,        \
+                 s->vlc[0].table, s->vlc[plane1].table, VLC_BITS, 3, OP8bits)
 
 static void decode_422_bitstream(HYuvContext *s, int count)
 {
-    int i;
-
+    int i, icount;
+    OPEN_READER(re, &s->gb);
     count /= 2;
 
-    if (count >= (get_bits_left(&s->gb)) / (31 * 4)) {
-        for (i = 0; i < count && get_bits_left(&s->gb) > 0; i++) {
+    icount = get_bits_left(&s->gb) / (32 * 4); /* iterations that run without a per-read bit check */
+    if (count >= icount) {
+        for (i = 0; i < icount; i++) {
             READ_2PIX(s->temp[0][2 * i],     s->temp[1][i], 1);
             READ_2PIX(s->temp[0][2 * i + 1], s->temp[2][i], 2);
         }
+        for (; i < count && BITS_LEFT(re, &s->gb) > 0; i++) { /* tail: remaining bits are checked before each read */
+            READ_2PIX(s->temp[0][2 * i    ], s->temp[1][i], 1);
+            if (BITS_LEFT(re, &s->gb) <= 0) break;
+            READ_2PIX(s->temp[0][2 * i + 1], s->temp[2][i], 2);
+        }
+        for (; i < count; i++) /* whatever was not decoded is set to 0 */
+            s->temp[0][2 * i    ] = s->temp[1][i] =
+            s->temp[0][2 * i + 1] = s->temp[2][i] = 0;
     } else {
         for (i = 0; i < count; i++) {
             READ_2PIX(s->temp[0][2 * i],     s->temp[1][i], 1);
             READ_2PIX(s->temp[0][2 * i + 1], s->temp[2][i], 2);
         }
     }
+    CLOSE_READER(re, &s->gb);
+}
+
+#define READ_2PIX_PLANE(dst0, dst1, plane, OP) \
+    UPDATE_CACHE(re, &s->gb); /* expects an open reader named re */ \
+    GET_VLC_DUAL(dst0, dst1, re, &s->gb, s->vlc[4+plane].table, \
+                 s->vlc[plane].table, s->vlc[plane].table, VLC_BITS, 3, OP)
+
+#define OP14bits(dst0, dst1, code) dst0 = code>>8; dst1 = sign_extend(code, 8) /* dst0 takes code>>8, dst1 takes sign_extend(code, 8) */
+
+/* Two samples read without the joint table: each is a VLC symbol shifted left
+ * by two plus two raw bits. Reads through s->gb directly, not a reader re. */
+#define READ_2PIX_PLANE16(dst0, dst1, plane){\
+    dst0 = get_vlc2(&s->gb, s->vlc[plane].table, VLC_BITS, 3)<<2;\
+    dst0 += get_bits(&s->gb, 2);\
+    dst1 = get_vlc2(&s->gb, s->vlc[plane].table, VLC_BITS, 3)<<2;\
+    dst1 += get_bits(&s->gb, 2);\
+}
+static void decode_plane_bitstream(HYuvContext *s, int width, int plane)
+{
+    int i, count = width/2;
+    /* Decodes up to `width` symbols with the tables of `plane` into s->temp[0] (bps <= 8) or s->temp16[0]. */
+    if (s->bps <= 8) {
+        OPEN_READER(re, &s->gb);
+        if (count >= (get_bits_left(&s->gb)) / (32 * 2)) { /* input may run short: check bits before every pair */
+            for (i = 0; i < count && BITS_LEFT(re, &s->gb) > 0; i++) {
+                READ_2PIX_PLANE(s->temp[0][2 * i], s->temp[0][2 * i + 1], plane, OP8bits);
+            }
+        } else {
+            for(i=0; i<count; i++){
+                READ_2PIX_PLANE(s->temp[0][2 * i], s->temp[0][2 * i + 1], plane, OP8bits);
+            }
+        }
+        if( width&1 && BITS_LEFT(re, &s->gb)>0 ) { /* odd width: one more symbol from the single-symbol table */
+            unsigned int index;
+            int nb_bits, code, n;
+            UPDATE_CACHE(re, &s->gb);
+            index = SHOW_UBITS(re, &s->gb, VLC_BITS);
+            VLC_INTERN(s->temp[0][width-1], s->vlc[plane].table,
+                       &s->gb, re, VLC_BITS, 3);
+        }
+        CLOSE_READER(re, &s->gb);
+    } else if (s->bps <= 14) { /* 9..14 bits: same scheme, written to s->temp16[0] with OP14bits */
+        OPEN_READER(re, &s->gb);
+        if (count >= (get_bits_left(&s->gb)) / (32 * 2)) {
+            for (i = 0; i < count && BITS_LEFT(re, &s->gb) > 0; i++) {
+                READ_2PIX_PLANE(s->temp16[0][2 * i], s->temp16[0][2 * i + 1], plane, OP14bits);
+            }
+        } else {
+            for(i=0; i<count; i++){
+                READ_2PIX_PLANE(s->temp16[0][2 * i], s->temp16[0][2 * i + 1], plane, OP14bits);
+            }
+        }
+        if( width&1 && BITS_LEFT(re, &s->gb)>0 ) {
+            unsigned int index;
+            int nb_bits, code, n;
+            UPDATE_CACHE(re, &s->gb);
+            index = SHOW_UBITS(re, &s->gb, VLC_BITS);
+            VLC_INTERN(s->temp16[0][width-1], s->vlc[plane].table,
+                       &s->gb, re, VLC_BITS, 3);
+        }
+        CLOSE_READER(re, &s->gb);
+    } else { /* more than 14 bits: each sample is a VLC symbol << 2 plus two raw bits, read via s->gb */
+        if (count >= (get_bits_left(&s->gb)) / (32 * 2)) {
+            for (i = 0; i < count && get_bits_left(&s->gb) > 0; i++) {
+                READ_2PIX_PLANE16(s->temp16[0][2 * i], s->temp16[0][2 * i + 1], plane);
+            }
+        } else {
+            for(i=0; i<count; i++){
+                READ_2PIX_PLANE16(s->temp16[0][2 * i], s->temp16[0][2 * i + 1], plane);
+            }
+        }
+        if( width&1 && get_bits_left(&s->gb)>0 ) {
+            int dst = get_vlc2(&s->gb, s->vlc[plane].table, VLC_BITS, 3)<<2;
+            s->temp16[0][width-1] = dst + get_bits(&s->gb, 2);
+        }
+    }
 }
 
 static void decode_gray_bitstream(HYuvContext *s, int count)
 {
     int i;
-
+    OPEN_READER(re, &s->gb);
     count /= 2;
 
-    if (count >= (get_bits_left(&s->gb)) / (31 * 2)) {
-        for (i = 0; i < count && get_bits_left(&s->gb) > 0; i++) {
+    if (count >= (get_bits_left(&s->gb)) / (32 * 2)) {
+        for (i = 0; i < count && BITS_LEFT(re, &s->gb) > 0; i++) {
             READ_2PIX(s->temp[0][2 * i], s->temp[0][2 * i + 1], 0);
         }
     } else {
@@ -426,30 +771,65 @@ static void decode_gray_bitstream(HYuvContext *s, int count)
             READ_2PIX(s->temp[0][2 * i], s->temp[0][2 * i + 1], 0);
         }
     }
+    CLOSE_READER(re, &s->gb);
 }
 
 static av_always_inline void decode_bgr_1(HYuvContext *s, int count,
                                           int decorrelate, int alpha)
 {
     int i;
-    for (i = 0; i < count; i++) {
-        int code = get_vlc2(&s->gb, s->vlc[3].table, VLC_BITS, 1);
-        if (code != -1) {
+    OPEN_READER(re, &s->gb);
+
+    for (i = 0; i < count && BITS_LEFT(re, &s->gb) > 0; i++) { /* stops early once no bits are left */
+        unsigned int index;
+        int code, n, nb_bits;
+
+        UPDATE_CACHE(re, &s->gb);
+        index = SHOW_UBITS(re, &s->gb, VLC_BITS);
+        n     = s->vlc[4].table[index][1]; /* probe the joint table (vlc[4]) first */
+
+        if (n>0) { /* joint hit: code selects a whole pixel from pix_bgr_map */
+            code  = s->vlc[4].table[index][0];
             *(uint32_t *) &s->temp[0][4 * i] = s->pix_bgr_map[code];
-        } else if (decorrelate) {
-            s->temp[0][4 * i + G] = get_vlc2(&s->gb, s->vlc[1].table, VLC_BITS, 3);
-            s->temp[0][4 * i + B] = get_vlc2(&s->gb, s->vlc[0].table, VLC_BITS, 3) +
-                                    s->temp[0][4 * i + G];
-            s->temp[0][4 * i + R] = get_vlc2(&s->gb, s->vlc[2].table, VLC_BITS, 3) +
-                                    s->temp[0][4 * i + G];
+            LAST_SKIP_BITS(re, &s->gb, n);
         } else {
-            s->temp[0][4 * i + B] = get_vlc2(&s->gb, s->vlc[0].table, VLC_BITS, 3);
-            s->temp[0][4 * i + G] = get_vlc2(&s->gb, s->vlc[1].table, VLC_BITS, 3);
-            s->temp[0][4 * i + R] = get_vlc2(&s->gb, s->vlc[2].table, VLC_BITS, 3);
+            if (decorrelate) { /* G is read first; the B and R symbols are then added to G */
+                VLC_INTERN(s->temp[0][4 * i + G], s->vlc[1].table,
+                           &s->gb, re, VLC_BITS, 3);
+
+                UPDATE_CACHE(re, &s->gb);
+                index = SHOW_UBITS(re, &s->gb, VLC_BITS);
+                VLC_INTERN(code, s->vlc[0].table, &s->gb, re, VLC_BITS, 3);
+                s->temp[0][4 * i + B] = code + s->temp[0][4 * i + G];
+
+                UPDATE_CACHE(re, &s->gb);
+                index = SHOW_UBITS(re, &s->gb, VLC_BITS);
+                VLC_INTERN(code, s->vlc[2].table, &s->gb, re, VLC_BITS, 3);
+                s->temp[0][4 * i + R] = code + s->temp[0][4 * i + G];
+            } else { /* plain B, G, R symbols from vlc[0], vlc[1], vlc[2] */
+                VLC_INTERN(s->temp[0][4 * i + B], s->vlc[0].table,
+                           &s->gb, re, VLC_BITS, 3);
+
+                UPDATE_CACHE(re, &s->gb);
+                index = SHOW_UBITS(re, &s->gb, VLC_BITS);
+                VLC_INTERN(s->temp[0][4 * i + G], s->vlc[1].table,
+                           &s->gb, re, VLC_BITS, 3);
+
+                UPDATE_CACHE(re, &s->gb);
+                index = SHOW_UBITS(re, &s->gb, VLC_BITS);
+                VLC_INTERN(s->temp[0][4 * i + R], s->vlc[2].table,
+                           &s->gb, re, VLC_BITS, 3);
+            }
         }
-        if (alpha)
-            s->temp[0][4 * i + A] = get_vlc2(&s->gb, s->vlc[2].table, VLC_BITS, 3);
+        if (alpha) { /* alpha symbol is read with the vlc[2] table */
+            UPDATE_CACHE(re, &s->gb);
+            index = SHOW_UBITS(re, &s->gb, VLC_BITS);
+            VLC_INTERN(s->temp[0][4 * i + A], s->vlc[2].table,
+                       &s->gb, re, VLC_BITS, 3);
+        } else /* no alpha requested: the A byte is set to 0 */
+            s->temp[0][4 * i + A] = 0;
     }
+    CLOSE_READER(re, &s->gb);
 }
 
 static void decode_bgr_bitstream(HYuvContext *s, int count)
@@ -495,6 +875,32 @@ static void draw_slice(HYuvContext *s, AVFrame *frame, int y)
     s->last_slice_end = y + h;
 }
 
+static int left_prediction(HYuvContext *s, uint8_t *dst, const uint8_t *src, int w, int acc)
+{ /* Left prediction over w samples, dispatched on s->bps; returns whatever the chosen DSP routine returns. */
+    if (s->bps <= 8) {
+        return s->hdsp.add_hfyu_left_pred(dst, src, w, acc);
+    } else { /* deeper samples: buffers are passed as uint16_t, with s->n-1 as extra argument */
+        return s->llviddsp.add_hfyu_left_pred_int16((      uint16_t *)dst, (const uint16_t *)src, s->n-1, w, acc);
+    }
+}
+
+static void add_bytes(HYuvContext *s, uint8_t *dst, uint8_t *src, int w)
+{ /* Calls the byte add routine for s->bps <= 8, otherwise the int16 variant; w is forwarded unchanged. */
+    if (s->bps <= 8) {
+        s->hdsp.add_bytes(dst, src, w);
+    } else { /* buffers are passed as uint16_t, with s->n - 1 as extra argument */
+        s->llviddsp.add_int16((uint16_t*)dst, (const uint16_t*)src, s->n - 1, w);
+    }
+}
+
+static void add_median_prediction(HYuvContext *s, uint8_t *dst, const uint8_t *src, const uint8_t *diff, int w, int *left, int *left_top)
+{ /* Median prediction dispatched on s->bps; left and left_top are handed to the DSP routine by pointer. */
+    if (s->bps <= 8) {
+        s->hdsp.add_hfyu_median_pred(dst, src, diff, w, left, left_top);
+    } else { /* buffers are passed as uint16_t, with s->n-1 as extra argument */
+        s->llviddsp.add_hfyu_median_pred_int16((uint16_t *)dst, (const uint16_t *)src, (const uint16_t *)diff, s->n-1, w, left, left_top);
+    }
+}
 static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                         AVPacket *avpkt)
 {
@@ -509,20 +915,17 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     AVFrame *const p = data;
     int table_size = 0, ret;
 
-    av_fast_malloc(&s->bitstream_buffer,
+    av_fast_padded_malloc(&s->bitstream_buffer,
                    &s->bitstream_buffer_size,
-                   buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
+                   buf_size);
     if (!s->bitstream_buffer)
         return AVERROR(ENOMEM);
 
-    memset(s->bitstream_buffer + buf_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
     s->bdsp.bswap_buf((uint32_t *) s->bitstream_buffer,
                       (const uint32_t *) buf, buf_size / 4);
 
-    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
         return ret;
-    }
 
     if (s->context) {
         table_size = read_huffman_tables(s, s->bitstream_buffer, buf_size);
@@ -543,7 +946,72 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     s->last_slice_end = 0;
 
-    if (s->bitstream_bpp < 24) {
+    if (s->version > 2) {
+        int plane;
+        for(plane = 0; plane < 1 + 2*s->chroma + s->alpha; plane++) {
+            int left, lefttop, y;
+            int w = width;
+            int h = height;
+            int fake_stride = fake_ystride;
+
+            if (s->chroma && (plane == 1 || plane == 2)) {
+                w >>= s->chroma_h_shift;
+                h >>= s->chroma_v_shift;
+                fake_stride = plane == 1 ? fake_ustride : fake_vstride;
+            }
+
+            switch (s->predictor) {
+            case LEFT:
+            case PLANE:
+                decode_plane_bitstream(s, w, plane);
+                left = left_prediction(s, p->data[plane], s->temp[0], w, 0);
+
+                for (y = 1; y < h; y++) {
+                    uint8_t *dst = p->data[plane] + p->linesize[plane]*y;
+
+                    decode_plane_bitstream(s, w, plane);
+                    left = left_prediction(s, dst, s->temp[0], w, left);
+                    if (s->predictor == PLANE) {
+                        if (y > s->interlaced) {
+                            add_bytes(s, dst, dst - fake_stride, w);
+                        }
+                    }
+                }
+
+                break;
+            case MEDIAN:
+                decode_plane_bitstream(s, w, plane);
+                left= left_prediction(s, p->data[plane], s->temp[0], w, 0);
+
+                y = 1;
+
+                /* second line is left predicted for interlaced case */
+                if (s->interlaced) {
+                    decode_plane_bitstream(s, w, plane);
+                    left = left_prediction(s, p->data[plane] + p->linesize[plane], s->temp[0], w, left);
+                    y++;
+                }
+
+                lefttop = p->data[plane][0];
+                decode_plane_bitstream(s, w, plane);
+                add_median_prediction(s, p->data[plane] + fake_stride, p->data[plane], s->temp[0], w, &left, &lefttop);
+                y++;
+
+                for (; y<h; y++) {
+                    uint8_t *dst;
+
+                    decode_plane_bitstream(s, w, plane);
+
+                    dst = p->data[plane] + p->linesize[plane] * y;
+
+                    add_median_prediction(s, dst, dst - fake_stride, s->temp[0], w, &left, &lefttop);
+                }
+
+                break;
+            }
+        }
+        draw_slice(s, p, height);
+    } else if (s->bitstream_bpp < 24) {
         int y, cy;
         int lefty, leftu, leftv;
         int lefttopy, lefttopu, lefttopv;
@@ -554,7 +1022,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             p->data[0][1] = get_bits(&s->gb, 8);
             p->data[0][0] = get_bits(&s->gb, 8);
 
-            avpriv_report_missing_feature(avctx, "YUY2 output");
+            av_log(avctx, AV_LOG_ERROR,
+                   "YUY2 output is not implemented yet\n");
             return AVERROR_PATCHWELCOME;
         } else {
             leftv         =
@@ -708,19 +1177,19 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         }
     } else {
         int y;
-        int leftr, leftg, leftb, lefta;
+        uint8_t left[4];
         const int last_line = (height - 1) * p->linesize[0];
 
         if (s->bitstream_bpp == 32) {
-            lefta = p->data[0][last_line + A] = get_bits(&s->gb, 8);
-            leftr = p->data[0][last_line + R] = get_bits(&s->gb, 8);
-            leftg = p->data[0][last_line + G] = get_bits(&s->gb, 8);
-            leftb = p->data[0][last_line + B] = get_bits(&s->gb, 8);
+            left[A] = p->data[0][last_line + A] = get_bits(&s->gb, 8);
+            left[R] = p->data[0][last_line + R] = get_bits(&s->gb, 8);
+            left[G] = p->data[0][last_line + G] = get_bits(&s->gb, 8);
+            left[B] = p->data[0][last_line + B] = get_bits(&s->gb, 8);
         } else {
-            leftr = p->data[0][last_line + R] = get_bits(&s->gb, 8);
-            leftg = p->data[0][last_line + G] = get_bits(&s->gb, 8);
-            leftb = p->data[0][last_line + B] = get_bits(&s->gb, 8);
-            lefta = p->data[0][last_line + A] = 255;
+            left[R] = p->data[0][last_line + R] = get_bits(&s->gb, 8);
+            left[G] = p->data[0][last_line + G] = get_bits(&s->gb, 8);
+            left[B] = p->data[0][last_line + B] = get_bits(&s->gb, 8);
+            left[A] = p->data[0][last_line + A] = 255;
             skip_bits(&s->gb, 8);
         }
 
@@ -730,23 +1199,20 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             case PLANE:
                 decode_bgr_bitstream(s, width - 1);
                 s->hdsp.add_hfyu_left_pred_bgr32(p->data[0] + last_line + 4,
-                                                 s->temp[0], width - 1, &leftr,
-                                                 &leftg, &leftb, &lefta);
+                                                 s->temp[0], width - 1, left);
 
                 for (y = s->height - 2; y >= 0; y--) { // Yes it is stored upside down.
                     decode_bgr_bitstream(s, width);
 
                     s->hdsp.add_hfyu_left_pred_bgr32(p->data[0] + p->linesize[0] * y,
-                                                     s->temp[0], width, &leftr,
-                                                     &leftg, &leftb, &lefta);
+                                                     s->temp[0], width, left);
                     if (s->predictor == PLANE) {
                         if (s->bitstream_bpp != 32)
-                            lefta = 0;
-                        if ((y & s->interlaced) == 0 &&
-                            y < s->height - 1 - s->interlaced) {
+                            left[A] = 0;
+                        if (y < s->height - 1 - s->interlaced) {
                             s->hdsp.add_bytes(p->data[0] + p->linesize[0] * y,
                                               p->data[0] + p->linesize[0] * y +
-                                              fake_ystride, fake_ystride);
+                                              fake_ystride, 4 * width);
                         }
                     }
                 }
@@ -758,7 +1224,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                        "prediction type not supported!\n");
             }
         } else {
-            avpriv_report_missing_feature(avctx, "BGR24 output");
+            av_log(avctx, AV_LOG_ERROR,
+                   "BGR24 output is not implemented yet\n");
             return AVERROR_PATCHWELCOME;
         }
     }
@@ -769,20 +1236,6 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     return (get_bits_count(&s->gb) + 31) / 32 * 4 + table_size;
 }
 
-static av_cold int decode_end(AVCodecContext *avctx)
-{
-    HYuvContext *s = avctx->priv_data;
-    int i;
-
-    ff_huffyuv_common_end(s);
-    av_freep(&s->bitstream_buffer);
-
-    for (i = 0; i < 6; i++)
-        ff_free_vlc(&s->vlc[i]);
-
-    return 0;
-}
-
 AVCodec ff_huffyuv_decoder = {
     .name             = "huffyuv",
     .long_name        = NULL_IF_CONFIG_SMALL("Huffyuv / HuffYUV"),
diff --git a/libavcodec/huffyuvdsp.c b/libavcodec/huffyuvdsp.c
index b5a714d..e8a05f6 100644
--- a/libavcodec/huffyuvdsp.c
+++ b/libavcodec/huffyuvdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 #define pb_7f (~0UL / 255 * 0x7f)
 #define pb_80 (~0UL / 255 * 0x80)
 
-static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
+static void add_bytes_c(uint8_t *dst, uint8_t *src, intptr_t w)
 {
     long i;
 
@@ -41,7 +41,7 @@ static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
 }
 
 static void add_hfyu_median_pred_c(uint8_t *dst, const uint8_t *src1,
-                                   const uint8_t *diff, int w,
+                                   const uint8_t *diff, intptr_t w,
                                    int *left, int *left_top)
 {
     int i;
@@ -60,7 +60,7 @@ static void add_hfyu_median_pred_c(uint8_t *dst, const uint8_t *src1,
     *left_top = lt;
 }
 
-static int add_hfyu_left_pred_c(uint8_t *dst, const uint8_t *src, int w,
+static int add_hfyu_left_pred_c(uint8_t *dst, const uint8_t *src, intptr_t w,
                                 int acc)
 {
     int i;
@@ -81,22 +81,11 @@ static int add_hfyu_left_pred_c(uint8_t *dst, const uint8_t *src, int w,
     return acc;
 }
 
-#if HAVE_BIGENDIAN
-#define B 3
-#define G 2
-#define R 1
-#define A 0
-#else
-#define B 0
-#define G 1
-#define R 2
-#define A 3
-#endif
 static void add_hfyu_left_pred_bgr32_c(uint8_t *dst, const uint8_t *src,
-                                       int w, int *red, int *green,
-                                       int *blue, int *alpha)
+                                       intptr_t w, uint8_t *left)
 {
-    int i, r = *red, g = *green, b = *blue, a = *alpha;
+    int i;
+    uint8_t r = left[R], g = left[G], b = left[B], a = left[A];
 
     for (i = 0; i < w; i++) {
         b += src[4 * i + B];
@@ -110,15 +99,11 @@ static void add_hfyu_left_pred_bgr32_c(uint8_t *dst, const uint8_t *src,
         dst[4 * i + A] = a;
     }
 
-    *red   = r;
-    *green = g;
-    *blue  = b;
-    *alpha = a;
+    left[B] = b;
+    left[G] = g;
+    left[R] = r;
+    left[A] = a;
 }
-#undef B
-#undef G
-#undef R
-#undef A
 
 av_cold void ff_huffyuvdsp_init(HuffYUVDSPContext *c)
 {
diff --git a/libavcodec/huffyuvdsp.h b/libavcodec/huffyuvdsp.h
index 5e84e3a..db37728 100644
--- a/libavcodec/huffyuvdsp.h
+++ b/libavcodec/huffyuvdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -20,18 +20,30 @@
 #define AVCODEC_HUFFYUVDSP_H
 
 #include <stdint.h>
+#include "config.h"
+
+#if HAVE_BIGENDIAN
+#define B 3
+#define G 2
+#define R 1
+#define A 0
+#else
+#define B 0
+#define G 1
+#define R 2
+#define A 3
+#endif
 
 typedef struct HuffYUVDSPContext {
     void (*add_bytes)(uint8_t *dst /* align 16 */, uint8_t *src /* align 16 */,
-                      int w);
+                      intptr_t w);
     void (*add_hfyu_median_pred)(uint8_t *dst, const uint8_t *top,
-                                 const uint8_t *diff, int w,
+                                 const uint8_t *diff, intptr_t w,
                                  int *left, int *left_top);
     int (*add_hfyu_left_pred)(uint8_t *dst, const uint8_t *src,
-                              int w, int left);
+                              intptr_t w, int left);
     void (*add_hfyu_left_pred_bgr32)(uint8_t *dst, const uint8_t *src,
-                                     int w, int *red, int *green,
-                                     int *blue, int *alpha);
+                                     intptr_t w, uint8_t *left);
 } HuffYUVDSPContext;
 
 void ff_huffyuvdsp_init(HuffYUVDSPContext *c);
diff --git a/libavcodec/huffyuvenc.c b/libavcodec/huffyuvenc.c
index a6ffd24..572de16 100644
--- a/libavcodec/huffyuvenc.c
+++ b/libavcodec/huffyuvenc.c
@@ -1,24 +1,26 @@
 /*
- * Copyright (c) 2002-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2002-2014 Michael Niedermayer <michaelni@gmx.at>
  *
  * see http://www.pcisys.net/~melanson/codecs/huffyuv.txt for a description of
  * the algorithm used
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * yuva, gray, 4:4:4, 4:1:1, 4:1:0 and >8 bit per sample support sponsored by NOA
  */
 
 /**
@@ -26,39 +28,70 @@
  * huffyuv encoder
  */
 
-#include "libavutil/opt.h"
-
 #include "avcodec.h"
 #include "huffyuv.h"
 #include "huffman.h"
 #include "huffyuvencdsp.h"
 #include "internal.h"
 #include "put_bits.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+
+/* Route the src0/src1 difference to the byte or int16 DSP routine by s->bps. */
+static inline void diff_bytes(HYuvContext *s, uint8_t *dst,
+                              const uint8_t *src0, const uint8_t *src1, int w)
+{
+    if (s->bps > 8)
+        s->llviddsp.diff_int16((uint16_t *)dst, (const uint16_t *)src0, (const uint16_t *)src1, s->n - 1, w);
+    else
+        s->hencdsp.diff_bytes(dst, src0, src1, w);
+}
 
 static inline int sub_left_prediction(HYuvContext *s, uint8_t *dst,
-                                      uint8_t *src, int w, int left)
+                                      const uint8_t *src, int w, int left) /* writes src[i] - previous sample into dst; returns the last source sample */
 {
     int i;
-    if (w < 32) {
-        for (i = 0; i < w; i++) {
-            const int temp = src[i];
-            dst[i] = temp - left;
-            left   = temp;
+    if (s->bps <= 8) { /* byte samples; deeper formats take the uint16_t path below */
+        if (w < 32) { /* short row: scalar loop only */
+            for (i = 0; i < w; i++) {
+                const int temp = src[i];
+                dst[i] = temp - left;
+                left   = temp;
+            }
+            return left;
+        } else {
+            for (i = 0; i < 32; i++) { /* first 32 samples scalar, remainder via hencdsp.diff_bytes */
+                const int temp = src[i];
+                dst[i] = temp - left;
+                left   = temp;
+            }
+            s->hencdsp.diff_bytes(dst + 32, src + 32, src + 31, w - 32);
+            return src[w-1]; /* last sample of the row becomes the next "left" */
         }
-        return left;
     } else {
-        for (i = 0; i < 16; i++) {
-            const int temp = src[i];
-            dst[i] = temp - left;
-            left   = temp;
+        const uint16_t *src16 = (const uint16_t *)src; /* same buffers viewed as 16-bit samples */
+        uint16_t       *dst16 = (      uint16_t *)dst;
+        if (w < 32) {
+            for (i = 0; i < w; i++) {
+                const int temp = src16[i];
+                dst16[i] = temp - left; /* not masked here, unlike diff_int16 which is given s->n - 1 */
+                left   = temp;
+            }
+            return left;
+        } else {
+            for (i = 0; i < 16; i++) { /* only 16 samples are peeled here (32 in the byte path) */
+                const int temp = src16[i];
+                dst16[i] = temp - left;
+                left   = temp;
+            }
+            s->llviddsp.diff_int16(dst16 + 16, src16 + 16, src16 + 15, s->n - 1, w - 16);
+            return src16[w-1];
         }
-        s->hencdsp.diff_bytes(dst + 16, src + 16, src + 15, w - 16);
-        return src[w-1];
     }
 }
 
 static inline void sub_left_prediction_bgr32(HYuvContext *s, uint8_t *dst,
-                                             uint8_t *src, int w,
+                                             const uint8_t *src, int w,
                                              int *red, int *green, int *blue,
                                              int *alpha)
 {
@@ -120,20 +153,30 @@ static inline void sub_left_prediction_rgb24(HYuvContext *s, uint8_t *dst,
     *blue  = src[(w - 1) * 3 + 2];
 }
 
+/* Median-prediction wrapper: selects the byte or int16 DSP routine from s->bps. */
+static void sub_median_prediction(HYuvContext *s, uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top)
+{
+    if (s->bps > 8)
+        s->llviddsp.sub_hfyu_median_pred_int16((uint16_t *)dst, (const uint16_t *)src1, (const uint16_t *)src2, s->n - 1, w , left, left_top);
+    else
+        s->hencdsp.sub_hfyu_median_pred(dst, src1, src2, w , left, left_top);
+}
+
 static int store_table(HYuvContext *s, const uint8_t *len, uint8_t *buf)
 {
     int i;
     int index = 0;
+    int n = s->vlc_n;
 
-    for (i = 0; i < 256;) {
+    for (i = 0; i < n;) {
         int val = len[i];
         int repeat = 0;
 
-        for (; i < 256 && len[i] == val && repeat < 255; i++)
+        for (; i < n && len[i] == val && repeat < 255; i++)
             repeat++;
 
-        assert(val < 32 && val >0 && repeat<256 && repeat>0);
-        if ( repeat > 7) {
+        av_assert0(val < 32 && val >0 && repeat < 256 && repeat>0);
+        if (repeat > 7) {
             buf[index++] = val;
             buf[index++] = repeat;
         } else {
@@ -144,19 +187,48 @@ static int store_table(HYuvContext *s, const uint8_t *len, uint8_t *buf)
     return index;
 }
 
+/* Build length/bit tables for each coded plane from s->stats and serialize the
+ * lengths into buf. Returns the number of bytes written, or a negative error. */
+static int store_huffman_tables(HYuvContext *s, uint8_t *buf)
+{
+    int i, ret;
+    int size = 0;
+    /* versions up to 2 always store three tables; later ones one per plane */
+    int count = s->version > 2 ? 1 + s->alpha + 2*s->chroma : 3;
+
+    for (i = 0; i < count; i++) {
+        if ((ret = ff_huff_gen_len_table(s->len[i], s->stats[i], s->vlc_n, 0)) < 0)
+            return ret;
+
+        /* hand the callee's error code to the caller instead of a bare -1 */
+        if ((ret = ff_huffyuv_generate_bits_table(s->bits[i], s->len[i], s->vlc_n)) < 0)
+            return ret;
+
+        size += store_table(s, s->len[i], buf + size);
+    }
+    return size;
+}
+
 static av_cold int encode_init(AVCodecContext *avctx)
 {
     HYuvContext *s = avctx->priv_data;
     int i, j;
+    int ret;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt);
 
     ff_huffyuv_common_init(avctx);
     ff_huffyuvencdsp_init(&s->hencdsp);
 
-    avctx->extradata = av_mallocz(1024*30); // 256*3+4 == 772
-    avctx->stats_out = av_mallocz(1024*30); // 21*256*3(%llu ) + 3(\n) + 1(0) = 16132
+    avctx->extradata = av_mallocz(3*MAX_N + 4);
+    if (s->flags&AV_CODEC_FLAG_PASS1) {
+#define STATS_OUT_SIZE (21*MAX_N*3 + 4) /* parenthesized so the sum stays intact inside larger expressions */
+        avctx->stats_out = av_mallocz(STATS_OUT_SIZE); // 21*256*3(%llu ) + 3(\n) + 1(0) = 16132
+        if (!avctx->stats_out)
+            return AVERROR(ENOMEM);
+    }
     s->version = 2;
 
-    if (!avctx->extradata || !avctx->stats_out)
+    if (!avctx->extradata)
         return AVERROR(ENOMEM);
 
 #if FF_API_CODED_FRAME
@@ -172,15 +244,66 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
+    s->bps = desc->comp[0].depth;
+    s->yuv = !(desc->flags & AV_PIX_FMT_FLAG_RGB) && desc->nb_components >= 2;
+    s->chroma = desc->nb_components > 2;
+    s->alpha = !!(desc->flags & AV_PIX_FMT_FLAG_ALPHA);
+    av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt,
+                                     &s->chroma_h_shift,
+                                     &s->chroma_v_shift);
+
     switch (avctx->pix_fmt) {
     case AV_PIX_FMT_YUV420P:
     case AV_PIX_FMT_YUV422P:
         if (s->width & 1) {
             av_log(avctx, AV_LOG_ERROR, "Width must be even for this colorspace.\n");
-            return -1;
+            return AVERROR(EINVAL);
         }
         s->bitstream_bpp = avctx->pix_fmt == AV_PIX_FMT_YUV420P ? 12 : 16;
         break;
+    case AV_PIX_FMT_YUV444P:
+    case AV_PIX_FMT_YUV410P:
+    case AV_PIX_FMT_YUV411P:
+    case AV_PIX_FMT_YUV440P:
+    case AV_PIX_FMT_GBRP:
+    case AV_PIX_FMT_GBRP9:
+    case AV_PIX_FMT_GBRP10:
+    case AV_PIX_FMT_GBRP12:
+    case AV_PIX_FMT_GBRP14:
+    case AV_PIX_FMT_GBRP16:
+    case AV_PIX_FMT_GRAY8:
+    case AV_PIX_FMT_GRAY16:
+    case AV_PIX_FMT_YUVA444P:
+    case AV_PIX_FMT_YUVA420P:
+    case AV_PIX_FMT_YUVA422P:
+    case AV_PIX_FMT_GBRAP:
+    case AV_PIX_FMT_GRAY8A:
+    case AV_PIX_FMT_YUV420P9:
+    case AV_PIX_FMT_YUV420P10:
+    case AV_PIX_FMT_YUV420P12:
+    case AV_PIX_FMT_YUV420P14:
+    case AV_PIX_FMT_YUV420P16:
+    case AV_PIX_FMT_YUV422P9:
+    case AV_PIX_FMT_YUV422P10:
+    case AV_PIX_FMT_YUV422P12:
+    case AV_PIX_FMT_YUV422P14:
+    case AV_PIX_FMT_YUV422P16:
+    case AV_PIX_FMT_YUV444P9:
+    case AV_PIX_FMT_YUV444P10:
+    case AV_PIX_FMT_YUV444P12:
+    case AV_PIX_FMT_YUV444P14:
+    case AV_PIX_FMT_YUV444P16:
+    case AV_PIX_FMT_YUVA420P9:
+    case AV_PIX_FMT_YUVA420P10:
+    case AV_PIX_FMT_YUVA420P16:
+    case AV_PIX_FMT_YUVA422P9:
+    case AV_PIX_FMT_YUVA422P10:
+    case AV_PIX_FMT_YUVA422P16:
+    case AV_PIX_FMT_YUVA444P9:
+    case AV_PIX_FMT_YUVA444P10:
+    case AV_PIX_FMT_YUVA444P16:
+        s->version = 3;
+        break;
     case AV_PIX_FMT_RGB32:
         s->bitstream_bpp = 32;
         break;
@@ -189,10 +312,13 @@ FF_ENABLE_DEPRECATION_WARNINGS
         break;
     default:
         av_log(avctx, AV_LOG_ERROR, "format not supported\n");
-        return -1;
+        return AVERROR(EINVAL);
     }
+    s->n = 1<<s->bps;
+    s->vlc_n = FFMIN(s->n, MAX_VLC_N);
+
     avctx->bits_per_coded_sample = s->bitstream_bpp;
-    s->decorrelate = s->bitstream_bpp >= 24;
+    s->decorrelate = s->bitstream_bpp >= 24 && !s->yuv && !(desc->flags & AV_PIX_FMT_FLAG_PLANAR);
 #if FF_API_PRIVATE_OPT
 FF_DISABLE_DEPRECATION_WARNINGS
     if (avctx->prediction_method)
@@ -205,7 +331,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
             av_log(avctx, AV_LOG_ERROR,
                    "context=1 is not compatible with "
                    "2 pass huffyuv encoding\n");
-            return -1;
+            return AVERROR(EINVAL);
         }
     }
 
@@ -214,14 +340,20 @@ FF_ENABLE_DEPRECATION_WARNINGS
             av_log(avctx, AV_LOG_ERROR,
                    "Error: YV12 is not supported by huffyuv; use "
                    "vcodec=ffvhuff or format=422p\n");
-            return -1;
+            return AVERROR(EINVAL);
         }
 #if FF_API_PRIVATE_OPT
         if (s->context) {
             av_log(avctx, AV_LOG_ERROR,
                    "Error: per-frame huffman tables are not supported "
                    "by huffyuv; use vcodec=ffvhuff\n");
-            return -1;
+            return AVERROR(EINVAL);
+        }
+        if (s->version > 2) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Error: ver>2 is not supported "
+                   "by huffyuv; use vcodec=ffvhuff\n");
+            return AVERROR(EINVAL);
         }
 #endif
         if (s->interlaced != ( s->height > 288 ))
@@ -229,32 +361,47 @@ FF_ENABLE_DEPRECATION_WARNINGS
                    "using huffyuv 2.2.0 or newer interlacing flag\n");
     }
 
-    if (s->bitstream_bpp >= 24 && s->predictor == MEDIAN) {
+    if (s->version > 3 && avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL) {
+        av_log(avctx, AV_LOG_ERROR, "Ver > 3 is under development, files encoded with it may not be decodable with future versions!!!\n"
+               "Use vstrict=-2 / -strict -2 to use it anyway.\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (s->bitstream_bpp >= 24 && s->predictor == MEDIAN && s->version <= 2) {
         av_log(avctx, AV_LOG_ERROR,
                "Error: RGB is incompatible with median predictor\n");
-        return -1;
+        return AVERROR(EINVAL);
     }
 
     ((uint8_t*)avctx->extradata)[0] = s->predictor | (s->decorrelate << 6);
-    ((uint8_t*)avctx->extradata)[1] = s->bitstream_bpp;
     ((uint8_t*)avctx->extradata)[2] = s->interlaced ? 0x10 : 0x20;
     if (s->context)
         ((uint8_t*)avctx->extradata)[2] |= 0x40;
-    ((uint8_t*)avctx->extradata)[3] = 0;
+    if (s->version < 3) {
+        ((uint8_t*)avctx->extradata)[1] = s->bitstream_bpp;
+        ((uint8_t*)avctx->extradata)[3] = 0;
+    } else {
+        ((uint8_t*)avctx->extradata)[1] = ((s->bps-1)<<4) | s->chroma_h_shift | (s->chroma_v_shift<<2);
+        if (s->chroma)
+            ((uint8_t*)avctx->extradata)[2] |= s->yuv ? 1 : 2;
+        if (s->alpha)
+            ((uint8_t*)avctx->extradata)[2] |= 4;
+        ((uint8_t*)avctx->extradata)[3] = 1;
+    }
     s->avctx->extradata_size = 4;
 
     if (avctx->stats_in) {
         char *p = avctx->stats_in;
 
-        for (i = 0; i < 3; i++)
-            for (j = 0; j < 256; j++)
+        for (i = 0; i < 4; i++)
+            for (j = 0; j < s->vlc_n; j++)
                 s->stats[i][j] = 1;
 
         for (;;) {
-            for (i = 0; i < 3; i++) {
+            for (i = 0; i < 4; i++) {
                 char *next;
 
-                for (j = 0; j < 256; j++) {
+                for (j = 0; j < s->vlc_n; j++) {
                     s->stats[i][j] += strtol(p, &next, 0);
                     if (next == p) return -1;
                     p = next;
@@ -263,40 +410,37 @@ FF_ENABLE_DEPRECATION_WARNINGS
             if (p[0] == 0 || p[1] == 0 || p[2] == 0) break;
         }
     } else {
-        for (i = 0; i < 3; i++)
-            for (j = 0; j < 256; j++) {
-                int d = FFMIN(j, 256 - j);
+        for (i = 0; i < 4; i++)
+            for (j = 0; j < s->vlc_n; j++) {
+                int d = FFMIN(j, s->vlc_n - j);
 
-                s->stats[i][j] = 100000000 / (d + 1);
+                s->stats[i][j] = 100000000 / (d*d + 1);
             }
     }
 
-    for (i = 0; i < 3; i++) {
-        ff_huff_gen_len_table(s->len[i], s->stats[i]);
-
-        if (ff_huffyuv_generate_bits_table(s->bits[i], s->len[i]) < 0) {
-            return -1;
-        }
-
-        s->avctx->extradata_size +=
-            store_table(s, s->len[i], &((uint8_t*)s->avctx->extradata)[s->avctx->extradata_size]);
-    }
+    ret = store_huffman_tables(s, s->avctx->extradata + s->avctx->extradata_size);
+    if (ret < 0)
+        return ret;
+    s->avctx->extradata_size += ret;
 
     if (s->context) {
-        for (i = 0; i < 3; i++) {
+        for (i = 0; i < 4; i++) {
             int pels = s->width * s->height / (i ? 40 : 10);
-            for (j = 0; j < 256; j++) {
-                int d = FFMIN(j, 256 - j);
-                s->stats[i][j] = pels/(d + 1);
+            for (j = 0; j < s->vlc_n; j++) {
+                int d = FFMIN(j, s->vlc_n - j);
+                s->stats[i][j] = pels/(d*d + 1);
             }
         }
     } else {
-        for (i = 0; i < 3; i++)
-            for (j = 0; j < 256; j++)
+        for (i = 0; i < 4; i++)
+            for (j = 0; j < s->vlc_n; j++)
                 s->stats[i][j]= 0;
     }
 
-    ff_huffyuv_alloc_temp(s);
+    if (ff_huffyuv_alloc_temp(s)) {
+        ff_huffyuv_common_end(s);
+        return AVERROR(ENOMEM);
+    }
 
     s->picture_number=0;
 
@@ -357,6 +501,168 @@ static int encode_422_bitstream(HYuvContext *s, int offset, int count)
     return 0;
 }
 
+static int encode_plane_bitstream(HYuvContext *s, int width, int plane)
+{
+    int i, count = width/2; /* number of full sample pairs; an odd last sample is handled under width&1 */
+    /* Code one row of `width` residuals from s->temp[0] (s->temp16[0] above 8 bits) for `plane`; first check that fewer than count * bps / 2 bytes are not all that is left in s->pb. */
+    if (s->pb.buf_end - s->pb.buf - (put_bits_count(&s->pb) >> 3) < count * s->bps / 2) {
+        av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
+        return -1;
+    }
+    /* Helper macros: LOAD* fetch residuals, STAT* increment s->stats[plane], WRITE* emit codes from s->len/s->bits. */
+#define LOADEND\
+            int y0 = s->temp[0][width-1];
+#define LOADEND_14\
+            int y0 = s->temp16[0][width-1] & mask;
+#define LOADEND_16\
+            int y0 = s->temp16[0][width-1];
+#define STATEND\
+            s->stats[plane][y0]++;
+#define STATEND_16\
+            s->stats[plane][y0>>2]++;
+#define WRITEEND\
+            put_bits(&s->pb, s->len[plane][y0], s->bits[plane][y0]);
+#define WRITEEND_16\
+            put_bits(&s->pb, s->len[plane][y0>>2], s->bits[plane][y0>>2]);\
+            put_bits(&s->pb, 2, y0&3);
+    /* The *END macros above act on the sample at width-1; the *2 macros below act on the pair at 2*i and 2*i+1. */
+#define LOAD2\
+            int y0 = s->temp[0][2 * i];\
+            int y1 = s->temp[0][2 * i + 1];
+#define LOAD2_14\
+            int y0 = s->temp16[0][2 * i] & mask;\
+            int y1 = s->temp16[0][2 * i + 1] & mask;
+#define LOAD2_16\
+            int y0 = s->temp16[0][2 * i];\
+            int y1 = s->temp16[0][2 * i + 1];
+#define STAT2\
+            s->stats[plane][y0]++;\
+            s->stats[plane][y1]++;
+#define STAT2_16\
+            s->stats[plane][y0>>2]++;\
+            s->stats[plane][y1>>2]++;
+#define WRITE2\
+            put_bits(&s->pb, s->len[plane][y0], s->bits[plane][y0]);\
+            put_bits(&s->pb, s->len[plane][y1], s->bits[plane][y1]);
+#define WRITE2_16\
+            put_bits(&s->pb, s->len[plane][y0>>2], s->bits[plane][y0>>2]);\
+            put_bits(&s->pb, 2, y0&3);\
+            put_bits(&s->pb, s->len[plane][y1>>2], s->bits[plane][y1>>2]);\
+            put_bits(&s->pb, 2, y1&3);
+
+    if (s->bps <= 8) { /* byte samples: the residual indexes the tables directly */
+    if (s->flags & AV_CODEC_FLAG_PASS1) { /* first pass: gather statistics */
+        for (i = 0; i < count; i++) {
+            LOAD2;
+            STAT2;
+        }
+        if (width&1) {
+            LOADEND;
+            STATEND;
+        }
+    }
+    if (s->avctx->flags2 & AV_CODEC_FLAG2_NO_OUTPUT)
+        return 0; /* output suppressed: nothing is written */
+
+    if (s->context) { /* s->context set: stats are updated alongside the written codes */
+        for (i = 0; i < count; i++) {
+            LOAD2;
+            STAT2;
+            WRITE2;
+        }
+        if (width&1) {
+            LOADEND;
+            STATEND;
+            WRITEEND;
+        }
+    } else { /* write only */
+        for (i = 0; i < count; i++) {
+            LOAD2;
+            WRITE2;
+        }
+        if (width&1) {
+            LOADEND;
+            WRITEEND;
+        }
+    }
+    } else if (s->bps <= 14) { /* 9..14 bits: residuals are masked with s->n - 1 before use */
+        int mask = s->n - 1;
+        if (s->flags & AV_CODEC_FLAG_PASS1) {
+            for (i = 0; i < count; i++) {
+                LOAD2_14;
+                STAT2;
+            }
+            if (width&1) {
+                LOADEND_14;
+                STATEND;
+            }
+        }
+        if (s->avctx->flags2 & AV_CODEC_FLAG2_NO_OUTPUT)
+            return 0;
+
+        if (s->context) {
+            for (i = 0; i < count; i++) {
+                LOAD2_14;
+                STAT2;
+                WRITE2;
+            }
+            if (width&1) {
+                LOADEND_14;
+                STATEND;
+                WRITEEND;
+            }
+        } else {
+            for (i = 0; i < count; i++) {
+                LOAD2_14;
+                WRITE2;
+            }
+            if (width&1) {
+                LOADEND_14;
+                WRITEEND;
+            }
+        }
+    } else { /* more than 14 bits: y>>2 selects the code, the low 2 bits are written raw */
+        if (s->flags & AV_CODEC_FLAG_PASS1) {
+            for (i = 0; i < count; i++) {
+                LOAD2_16;
+                STAT2_16;
+            }
+            if (width&1) {
+                LOADEND_16;
+                STATEND_16;
+            }
+        }
+        if (s->avctx->flags2 & AV_CODEC_FLAG2_NO_OUTPUT)
+            return 0;
+
+        if (s->context) {
+            for (i = 0; i < count; i++) {
+                LOAD2_16;
+                STAT2_16;
+                WRITE2_16;
+            }
+            if (width&1) {
+                LOADEND_16;
+                STATEND_16;
+                WRITEEND_16;
+            }
+        } else {
+            for (i = 0; i < count; i++) {
+                LOAD2_16;
+                WRITE2_16;
+            }
+            if (width&1) {
+                LOADEND_16;
+                WRITEEND_16;
+            }
+        }
+    }
+#undef LOAD2
+#undef STAT2
+#undef WRITE2 /* NOTE(review): the *END, *_14 and *_16 helper macros remain defined past this point */
+    return 0;
+}
+
 static int encode_gray_bitstream(HYuvContext *s, int count)
 {
     int i;
@@ -414,8 +720,8 @@ static inline int encode_bgra_bitstream(HYuvContext *s, int count, int planes)
 
 #define LOAD_GBRA                                                       \
     int g = s->temp[0][planes == 3 ? 3 * i + 1 : 4 * i + G];            \
-    int b = s->temp[0][planes == 3 ? 3 * i + 2 : 4 * i + B] - g & 0xFF; \
-    int r = s->temp[0][planes == 3 ? 3 * i + 0 : 4 * i + R] - g & 0xFF; \
+    int b =(s->temp[0][planes == 3 ? 3 * i + 2 : 4 * i + B] - g) & 0xFF;\
+    int r =(s->temp[0][planes == 3 ? 3 * i + 0 : 4 * i + R] - g) & 0xFF;\
     int a = s->temp[0][planes * i + A];
 
 #define STAT_BGRA                                                       \
@@ -466,22 +772,16 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     const AVFrame * const p = pict;
     int i, j, size = 0, ret;
 
-    if (!pkt->data &&
-        (ret = av_new_packet(pkt, width * height * 3 * 4 + AV_INPUT_BUFFER_MIN_SIZE)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error allocating output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, width * height * 3 * 4 + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
         return ret;
-    }
 
     if (s->context) {
-        for (i = 0; i < 3; i++) {
-            ff_huff_gen_len_table(s->len[i], s->stats[i]);
-            if (ff_huffyuv_generate_bits_table(s->bits[i], s->len[i]) < 0)
-                return -1;
-            size += store_table(s, s->len[i], &pkt->data[size]);
-        }
+        size = store_huffman_tables(s, pkt->data);
+        if (size < 0)
+            return size;
 
-        for (i = 0; i < 3; i++)
-            for (j = 0; j < 256; j++)
+        for (i = 0; i < 4; i++)
+            for (j = 0; j < s->vlc_n; j++)
                 s->stats[i][j] >>= 1;
     }
 
@@ -649,6 +949,59 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
             }
             encode_bgra_bitstream(s, width, 3);
         }
+    } else if (s->version > 2) {
+        int plane;
+        for (plane = 0; plane < 1 + 2*s->chroma + s->alpha; plane++) {
+            int left, y;
+            int w = width;
+            int h = height;
+            int fake_stride = fake_ystride;
+
+            if (s->chroma && (plane == 1 || plane == 2)) {
+                w >>= s->chroma_h_shift;
+                h >>= s->chroma_v_shift;
+                fake_stride = plane == 1 ? fake_ustride : fake_vstride;
+            }
+
+            left = sub_left_prediction(s, s->temp[0], p->data[plane], w , 0);
+
+            encode_plane_bitstream(s, w, plane);
+
+            if (s->predictor==MEDIAN) {
+                int lefttop;
+                y = 1;
+                if (s->interlaced) {
+                    left = sub_left_prediction(s, s->temp[0], p->data[plane] + p->linesize[plane], w , left);
+
+                    encode_plane_bitstream(s, w, plane);
+                    y++;
+                }
+
+                lefttop = p->data[plane][0];
+
+                for (; y < h; y++) {
+                    uint8_t *dst = p->data[plane] + p->linesize[plane] * y;
+
+                    sub_median_prediction(s, s->temp[0], dst - fake_stride, dst, w , &left, &lefttop);
+
+                    encode_plane_bitstream(s, w, plane);
+                }
+            } else {
+                for (y = 1; y < h; y++) {
+                    uint8_t *dst = p->data[plane] + p->linesize[plane] * y;
+
+                    if (s->predictor == PLANE && s->interlaced < y) {
+                        diff_bytes(s, s->temp[1], dst, dst - fake_stride, w);
+
+                        left = sub_left_prediction(s, s->temp[0], s->temp[1], w , left);
+                    } else {
+                        left = sub_left_prediction(s, s->temp[0], dst, w , left);
+                    }
+
+                    encode_plane_bitstream(s, w, plane);
+                }
+            }
+        }
     } else {
         av_log(avctx, AV_LOG_ERROR, "Format not supported!\n");
     }
@@ -662,17 +1015,19 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     if ((s->flags & AV_CODEC_FLAG_PASS1) && (s->picture_number & 31) == 0) {
         int j;
         char *p = avctx->stats_out;
-        char *end = p + 1024*30;
-        for (i = 0; i < 3; i++) {
-            for (j = 0; j < 256; j++) {
+        char *end = p + STATS_OUT_SIZE;
+        for (i = 0; i < 4; i++) {
+            for (j = 0; j < s->vlc_n; j++) {
                 snprintf(p, end-p, "%"PRIu64" ", s->stats[i][j]);
                 p += strlen(p);
                 s->stats[i][j]= 0;
             }
             snprintf(p, end-p, "\n");
             p++;
+            if (end <= p)
+                return AVERROR(ENOMEM);
         }
-    } else
+    } else if (avctx->stats_out)
         avctx->stats_out[0] = '\0';
     if (!(s->avctx->flags2 & AV_CODEC_FLAG2_NO_OUTPUT)) {
         flush_put_bits(&s->pb);
@@ -703,26 +1058,39 @@ static av_cold int encode_end(AVCodecContext *avctx)
 #define OFFSET(x) offsetof(HYuvContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 
-#define HUFF_CLASS(variant)                  \
-static const AVClass variant ## _class = {   \
-    .class_name = # variant,                 \
-    .item_name  = av_default_item_name,      \
-    .option     = variant ## _options,       \
-    .version    = LIBAVUTIL_VERSION_INT,     \
-}
+#define COMMON_OPTIONS \
+    { "non_deterministic", "Allow multithreading for e.g. context=1 at the expense of determinism", \
+      OFFSET(non_determ), AV_OPT_TYPE_BOOL, { .i64 = 1 }, \
+      0, 1, VE }, \
+    { "pred", "Prediction method", OFFSET(predictor), AV_OPT_TYPE_INT, { .i64 = LEFT }, LEFT, MEDIAN, VE, "pred" }, \
+        { "left",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = LEFT },   INT_MIN, INT_MAX, VE, "pred" }, \
+        { "plane",  NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PLANE },  INT_MIN, INT_MAX, VE, "pred" }, \
+        { "median", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MEDIAN }, INT_MIN, INT_MAX, VE, "pred" }, \
+
+static const AVOption normal_options[] = {
+    COMMON_OPTIONS
+    { NULL },
+};
 
-#define FF_HUFFYUV_COMMON_OPTS \
-{ "pred", "Prediction method", OFFSET(predictor), AV_OPT_TYPE_INT, { .i64 = LEFT }, LEFT, MEDIAN, VE, "pred" }, \
-    { "left",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = LEFT },   INT_MIN, INT_MAX, VE, "pred" }, \
-    { "plane",  NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PLANE },  INT_MIN, INT_MAX, VE, "pred" }, \
-    { "median", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MEDIAN }, INT_MIN, INT_MAX, VE, "pred" }
+static const AVOption ff_options[] = {
+    COMMON_OPTIONS
+    { "context", "Set per-frame huffman tables", OFFSET(context), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
+    { NULL },
+};
 
-static const AVOption huffyuv_options[] = {
-    FF_HUFFYUV_COMMON_OPTS,
-    { NULL},
+static const AVClass normal_class = {
+    .class_name = "huffyuv",
+    .item_name  = av_default_item_name,
+    .option     = normal_options,
+    .version    = LIBAVUTIL_VERSION_INT,
 };
 
-HUFF_CLASS(huffyuv);
+static const AVClass ff_class = {
+    .class_name = "ffvhuff",
+    .item_name  = av_default_item_name,
+    .option     = ff_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
 
 AVCodec ff_huffyuv_encoder = {
     .name           = "huffyuv",
@@ -730,10 +1098,11 @@ AVCodec ff_huffyuv_encoder = {
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_HUFFYUV,
     .priv_data_size = sizeof(HYuvContext),
-    .priv_class     = &huffyuv_class,
     .init           = encode_init,
     .encode2        = encode_frame,
     .close          = encode_end,
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
+    .priv_class     = &normal_class,
     .pix_fmts       = (const enum AVPixelFormat[]){
         AV_PIX_FMT_YUV422P, AV_PIX_FMT_RGB24,
         AV_PIX_FMT_RGB32, AV_PIX_FMT_NONE
@@ -743,26 +1112,33 @@ AVCodec ff_huffyuv_encoder = {
 };
 
 #if CONFIG_FFVHUFF_ENCODER
-static const AVOption ffhuffyuv_options[] = {
-    FF_HUFFYUV_COMMON_OPTS,
-    { "context", "Set per-frame huffman tables", OFFSET(context), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
-    { NULL }
-};
-
-HUFF_CLASS(ffhuffyuv);
-
 AVCodec ff_ffvhuff_encoder = {
     .name           = "ffvhuff",
     .long_name      = NULL_IF_CONFIG_SMALL("Huffyuv FFmpeg variant"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_FFVHUFF,
     .priv_data_size = sizeof(HYuvContext),
-    .priv_class     = &ffhuffyuv_class,
     .init           = encode_init,
     .encode2        = encode_frame,
     .close          = encode_end,
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
+    .priv_class     = &ff_class,
     .pix_fmts       = (const enum AVPixelFormat[]){
-        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_RGB24,
+        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV411P,
+        AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV440P,
+        AV_PIX_FMT_GBRP,
+        AV_PIX_FMT_GBRP9, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12, AV_PIX_FMT_GBRP14,
+        AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAY16,
+        AV_PIX_FMT_YUVA420P, AV_PIX_FMT_YUVA422P, AV_PIX_FMT_YUVA444P,
+        AV_PIX_FMT_GBRAP,
+        AV_PIX_FMT_GRAY8A,
+        AV_PIX_FMT_YUV420P9, AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P12, AV_PIX_FMT_YUV420P14, AV_PIX_FMT_YUV420P16,
+        AV_PIX_FMT_YUV422P9, AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV422P12, AV_PIX_FMT_YUV422P14, AV_PIX_FMT_YUV422P16,
+        AV_PIX_FMT_YUV444P9, AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUV444P12, AV_PIX_FMT_YUV444P14, AV_PIX_FMT_YUV444P16,
+        AV_PIX_FMT_YUVA420P9, AV_PIX_FMT_YUVA420P10, AV_PIX_FMT_YUVA420P16,
+        AV_PIX_FMT_YUVA422P9, AV_PIX_FMT_YUVA422P10, AV_PIX_FMT_YUVA422P16,
+        AV_PIX_FMT_YUVA444P9, AV_PIX_FMT_YUVA444P10, AV_PIX_FMT_YUVA444P16,
+        AV_PIX_FMT_RGB24,
         AV_PIX_FMT_RGB32, AV_PIX_FMT_NONE
     },
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE |
diff --git a/libavcodec/huffyuvencdsp.c b/libavcodec/huffyuvencdsp.c
index 6c30877..fdcd0b0 100644
--- a/libavcodec/huffyuvencdsp.c
+++ b/libavcodec/huffyuvencdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,12 +25,12 @@
 #define pb_7f (~0UL / 255 * 0x7f)
 #define pb_80 (~0UL / 255 * 0x80)
 
-static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
+static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, intptr_t w)
 {
     long i;
 
 #if !HAVE_FAST_UNALIGNED
-    if ((long) src2 & (sizeof(long) - 1)) {
+    if (((long)src1 | (long)src2) & (sizeof(long) - 1)) {
         for (i = 0; i + 7 < w; i += 8) {
             dst[i + 0] = src1[i + 0] - src2[i + 0];
             dst[i + 1] = src1[i + 1] - src2[i + 1];
@@ -54,7 +54,7 @@ static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
 }
 
 static void sub_hfyu_median_pred_c(uint8_t *dst, const uint8_t *src1,
-                                   const uint8_t *src2, int w,
+                                   const uint8_t *src2, intptr_t w,
                                    int *left, int *left_top)
 {
     int i;
diff --git a/libavcodec/huffyuvencdsp.h b/libavcodec/huffyuvencdsp.h
index 603c36f..9d09095 100644
--- a/libavcodec/huffyuvencdsp.h
+++ b/libavcodec/huffyuvencdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,15 +23,15 @@
 
 typedef struct HuffYUVEncDSPContext {
     void (*diff_bytes)(uint8_t *dst /* align 16 */,
-                       uint8_t *src1 /* align 16 */,
-                       uint8_t *src2 /* align 1 */,
-                       int w);
+                       const uint8_t *src1 /* align 16 */,
+                       const uint8_t *src2 /* align 1 */,
+                       intptr_t w);
     /**
      * Subtract HuffYUV's variant of median prediction.
      * Note, this might read from src1[-1], src2[-1].
      */
     void (*sub_hfyu_median_pred)(uint8_t *dst, const uint8_t *src1,
-                                 const uint8_t *src2, int w,
+                                 const uint8_t *src2, intptr_t w,
                                  int *left, int *left_top);
 } HuffYUVEncDSPContext;
 
diff --git a/libavcodec/idcinvideo.c b/libavcodec/idcinvideo.c
index 67dcf1c..0870172 100644
--- a/libavcodec/idcinvideo.c
+++ b/libavcodec/idcinvideo.c
@@ -2,20 +2,20 @@
  * id Quake II CIN Video Decoder
  * Copyright (C) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -75,7 +75,7 @@ typedef struct IdcinContext {
     uint32_t pal[256];
 } IdcinContext;
 
-/*
+/**
  * Find the lowest probability node in a Huffman table, and mark it as
  * being assigned to a higher probability.
  * @return the node index of the lowest unused node, or -1 if all nodes
@@ -169,7 +169,7 @@ static av_cold int idcin_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
-static void idcin_decode_vlcs(IdcinContext *s, AVFrame *frame)
+static int idcin_decode_vlcs(IdcinContext *s, AVFrame *frame)
 {
     hnode *hnodes;
     long x, y;
@@ -188,7 +188,7 @@ static void idcin_decode_vlcs(IdcinContext *s, AVFrame *frame)
                 if(!bit_pos) {
                     if(dat_pos >= s->size) {
                         av_log(s->avctx, AV_LOG_ERROR, "Huffman decode error.\n");
-                        return;
+                        return -1;
                     }
                     bit_pos = 8;
                     v = s->buf[dat_pos++];
@@ -203,6 +203,8 @@ static void idcin_decode_vlcs(IdcinContext *s, AVFrame *frame)
             prev = node_num;
         }
     }
+
+    return 0;
 }
 
 static int idcin_decode_frame(AVCodecContext *avctx,
@@ -219,12 +221,11 @@ static int idcin_decode_frame(AVCodecContext *avctx,
     s->buf = buf;
     s->size = buf_size;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "  id CIN Video: get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
-    idcin_decode_vlcs(s, frame);
+    if (idcin_decode_vlcs(s, frame))
+        return AVERROR_INVALIDDATA;
 
     if (pal) {
         frame->palette_has_changed = 1;
diff --git a/libavcodec/idctdsp.c b/libavcodec/idctdsp.c
index a9b8727..63e9b52 100644
--- a/libavcodec/idctdsp.c
+++ b/libavcodec/idctdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@
 #include "faanidct.h"
 #include "idctdsp.h"
 #include "simple_idct.h"
+#include "xvididct.h"
 
 av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
                                const uint8_t *src_scantable)
@@ -79,11 +80,11 @@ av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
     }
 }
 
-void (*ff_put_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size);
-void (*ff_add_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size);
+void (*ff_put_pixels_clamped)(const int16_t *block, uint8_t *pixels, ptrdiff_t line_size);
+void (*ff_add_pixels_clamped)(const int16_t *block, uint8_t *pixels, ptrdiff_t line_size);
 
-static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
-                                 int line_size)
+static void put_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
+                                 ptrdiff_t line_size)
 {
     int i;
 
@@ -103,9 +104,41 @@ static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
     }
 }
 
+static void put_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
+                                 int line_size)
+{
+    int i;
+
+    /* store a 4x4 area: each coefficient is clipped with av_clip_uint8() */
+    for(i=0;i<4;i++) {
+        pixels[0] = av_clip_uint8(block[0]);
+        pixels[1] = av_clip_uint8(block[1]);
+        pixels[2] = av_clip_uint8(block[2]);
+        pixels[3] = av_clip_uint8(block[3]);
+
+        pixels += line_size;
+        block += 8; /* source rows are 8 coefficients apart */
+    }
+}
+
+static void put_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
+                                 int line_size)
+{
+    int i;
+
+    /* store a 2x2 area: each coefficient is clipped with av_clip_uint8() */
+    for(i=0;i<2;i++) {
+        pixels[0] = av_clip_uint8(block[0]);
+        pixels[1] = av_clip_uint8(block[1]);
+
+        pixels += line_size;
+        block += 8; /* source rows are 8 coefficients apart */
+    }
+}
+
 static void put_signed_pixels_clamped_c(const int16_t *block,
-                                        uint8_t *restrict pixels,
-                                        int line_size)
+                                        uint8_t *av_restrict pixels,
+                                        ptrdiff_t line_size)
 {
     int i, j;
 
@@ -124,8 +157,8 @@ static void put_signed_pixels_clamped_c(const int16_t *block,
     }
 }
 
-static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
-                                 int line_size)
+static void add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
+                                 ptrdiff_t line_size)
 {
     int i;
 
@@ -144,47 +177,139 @@ static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
     }
 }
 
+static void add_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
+                          int line_size)
+{
+    int i;
+
+    /* 4x4 area: add each coefficient to the existing pixel, clip, store back */
+    for(i=0;i<4;i++) {
+        pixels[0] = av_clip_uint8(pixels[0] + block[0]);
+        pixels[1] = av_clip_uint8(pixels[1] + block[1]);
+        pixels[2] = av_clip_uint8(pixels[2] + block[2]);
+        pixels[3] = av_clip_uint8(pixels[3] + block[3]);
+        pixels += line_size;
+        block += 8; /* source rows are 8 coefficients apart */
+    }
+}
+
+static void add_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
+                          int line_size)
+{
+    int i;
+
+    /* 2x2 area: add each coefficient to the existing pixel, clip, store back */
+    for(i=0;i<2;i++) {
+        pixels[0] = av_clip_uint8(pixels[0] + block[0]);
+        pixels[1] = av_clip_uint8(pixels[1] + block[1]);
+        pixels += line_size;
+        block += 8; /* source rows are 8 coefficients apart */
+    }
+}
+
+static void ff_jref_idct4_put(uint8_t *dest, int line_size, int16_t *block)
+{
+    ff_j_rev_dct4 (block);
+    put_pixels_clamped4_c(block, dest, line_size);
+}
+static void ff_jref_idct4_add(uint8_t *dest, int line_size, int16_t *block)
+{
+    ff_j_rev_dct4 (block);
+    add_pixels_clamped4_c(block, dest, line_size);
+}
+
+static void ff_jref_idct2_put(uint8_t *dest, int line_size, int16_t *block)
+{
+    ff_j_rev_dct2 (block);
+    put_pixels_clamped2_c(block, dest, line_size);
+}
+static void ff_jref_idct2_add(uint8_t *dest, int line_size, int16_t *block)
+{
+    ff_j_rev_dct2 (block);
+    add_pixels_clamped2_c(block, dest, line_size);
+}
+
+static void ff_jref_idct1_put(uint8_t *dest, int line_size, int16_t *block)
+{
+    dest[0] = av_clip_uint8((block[0] + 4)>>3); /* only block[0] is used: +4 then >>3 rounds the /8; line_size is unused */
+}
+static void ff_jref_idct1_add(uint8_t *dest, int line_size, int16_t *block)
+{
+    dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4)>>3)); /* rounded block[0]/8 added to the existing pixel; line_size is unused */
+}
+
 av_cold void ff_idctdsp_init(IDCTDSPContext *c, AVCodecContext *avctx)
 {
     const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
 
-    if (avctx->bits_per_raw_sample == 10) {
-        c->idct_put  = ff_simple_idct_put_10;
-        c->idct_add  = ff_simple_idct_add_10;
-        c->idct      = ff_simple_idct_10;
+    if (avctx->lowres==1) {
+        c->idct_put  = ff_jref_idct4_put;
+        c->idct_add  = ff_jref_idct4_add;
+        c->idct      = ff_j_rev_dct4;
         c->perm_type = FF_IDCT_PERM_NONE;
-    } else if (avctx->idct_algo == FF_IDCT_INT) {
-        c->idct_put  = ff_jref_idct_put;
-        c->idct_add  = ff_jref_idct_add;
-        c->idct      = ff_j_rev_dct;
-        c->perm_type = FF_IDCT_PERM_LIBMPEG2;
-#if CONFIG_FAANIDCT
-    } else if (avctx->idct_algo == FF_IDCT_FAAN) {
-        c->idct_put  = ff_faanidct_put;
-        c->idct_add  = ff_faanidct_add;
-        c->idct      = ff_faanidct;
+    } else if (avctx->lowres==2) {
+        c->idct_put  = ff_jref_idct2_put;
+        c->idct_add  = ff_jref_idct2_add;
+        c->idct      = ff_j_rev_dct2;
         c->perm_type = FF_IDCT_PERM_NONE;
-#endif /* CONFIG_FAANIDCT */
-    } else { // accurate/default
-        c->idct_put  = ff_simple_idct_put_8;
-        c->idct_add  = ff_simple_idct_add_8;
-        c->idct      = ff_simple_idct_8;
+    } else if (avctx->lowres==3) {
+        c->idct_put  = ff_jref_idct1_put;
+        c->idct_add  = ff_jref_idct1_add;
+        c->idct      = ff_j_rev_dct1;
         c->perm_type = FF_IDCT_PERM_NONE;
+    } else {
+        if (avctx->bits_per_raw_sample == 10 || avctx->bits_per_raw_sample == 9) {
+            c->idct_put              = ff_simple_idct_put_10;
+            c->idct_add              = ff_simple_idct_add_10;
+            c->idct                  = ff_simple_idct_10;
+            c->perm_type             = FF_IDCT_PERM_NONE;
+        } else if (avctx->bits_per_raw_sample == 12) {
+            c->idct_put              = ff_simple_idct_put_12;
+            c->idct_add              = ff_simple_idct_add_12;
+            c->idct                  = ff_simple_idct_12;
+            c->perm_type             = FF_IDCT_PERM_NONE;
+        } else {
+            if (avctx->idct_algo == FF_IDCT_INT) {
+                c->idct_put  = ff_jref_idct_put;
+                c->idct_add  = ff_jref_idct_add;
+                c->idct      = ff_j_rev_dct;
+                c->perm_type = FF_IDCT_PERM_LIBMPEG2;
+#if CONFIG_FAANIDCT
+            } else if (avctx->idct_algo == FF_IDCT_FAAN) {
+                c->idct_put  = ff_faanidct_put;
+                c->idct_add  = ff_faanidct_add;
+                c->idct      = ff_faanidct;
+                c->perm_type = FF_IDCT_PERM_NONE;
+#endif /* CONFIG_FAANIDCT */
+            } else { // accurate/default
+                c->idct_put  = ff_simple_idct_put_8;
+                c->idct_add  = ff_simple_idct_add_8;
+                c->idct      = ff_simple_idct_8;
+                c->perm_type = FF_IDCT_PERM_NONE;
+            }
+        }
     }
 
     c->put_pixels_clamped        = put_pixels_clamped_c;
     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
     c->add_pixels_clamped        = add_pixels_clamped_c;
 
-    ff_put_pixels_clamped = c->put_pixels_clamped;
-    ff_add_pixels_clamped = c->add_pixels_clamped;
+    if (CONFIG_MPEG4_DECODER && avctx->idct_algo == FF_IDCT_XVID)
+        ff_xvid_idct_init(c, avctx);
 
+    if (ARCH_ALPHA)
+        ff_idctdsp_init_alpha(c, avctx, high_bit_depth);
     if (ARCH_ARM)
         ff_idctdsp_init_arm(c, avctx, high_bit_depth);
     if (ARCH_PPC)
         ff_idctdsp_init_ppc(c, avctx, high_bit_depth);
     if (ARCH_X86)
         ff_idctdsp_init_x86(c, avctx, high_bit_depth);
+    if (ARCH_MIPS)
+        ff_idctdsp_init_mips(c, avctx, high_bit_depth);
+
+    ff_put_pixels_clamped = c->put_pixels_clamped;
+    ff_add_pixels_clamped = c->add_pixels_clamped;
 
     ff_init_scantable_permutation(c->idct_permutation,
                                   c->perm_type);
diff --git a/libavcodec/idctdsp.h b/libavcodec/idctdsp.h
index c49a4ca..b180a67 100644
--- a/libavcodec/idctdsp.h
+++ b/libavcodec/idctdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -52,13 +52,13 @@ typedef struct IDCTDSPContext {
     /* pixel ops : interface with DCT */
     void (*put_pixels_clamped)(const int16_t *block /* align 16 */,
                                uint8_t *pixels /* align 8 */,
-                               int line_size);
+                               ptrdiff_t line_size);
     void (*put_signed_pixels_clamped)(const int16_t *block /* align 16 */,
                                       uint8_t *pixels /* align 8 */,
-                                      int line_size);
+                                      ptrdiff_t line_size);
     void (*add_pixels_clamped)(const int16_t *block /* align 16 */,
                                uint8_t *pixels /* align 8 */,
-                               int line_size);
+                               ptrdiff_t line_size);
 
     void (*idct)(int16_t *block /* align 16 */);
 
@@ -95,16 +95,20 @@ typedef struct IDCTDSPContext {
     enum idct_permutation_type perm_type;
 } IDCTDSPContext;
 
-extern void (*ff_put_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size);
-extern void (*ff_add_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size);
+extern void (*ff_put_pixels_clamped)(const int16_t *block, uint8_t *pixels, ptrdiff_t line_size);
+extern void (*ff_add_pixels_clamped)(const int16_t *block, uint8_t *pixels, ptrdiff_t line_size);
 
 void ff_idctdsp_init(IDCTDSPContext *c, AVCodecContext *avctx);
 
+void ff_idctdsp_init_alpha(IDCTDSPContext *c, AVCodecContext *avctx,
+                           unsigned high_bit_depth);
 void ff_idctdsp_init_arm(IDCTDSPContext *c, AVCodecContext *avctx,
                          unsigned high_bit_depth);
 void ff_idctdsp_init_ppc(IDCTDSPContext *c, AVCodecContext *avctx,
                          unsigned high_bit_depth);
 void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
                          unsigned high_bit_depth);
+void ff_idctdsp_init_mips(IDCTDSPContext *c, AVCodecContext *avctx,
+                          unsigned high_bit_depth);
 
 #endif /* AVCODEC_IDCTDSP_H */
diff --git a/libavcodec/iff.c b/libavcodec/iff.c
index a186e31..ef9ce91 100644
--- a/libavcodec/iff.c
+++ b/libavcodec/iff.c
@@ -1,28 +1,29 @@
 /*
- * IFF PBM/ILBM bitmap decoder
+ * IFF ACBM/ANIM/DEEP/ILBM/PBM/RGB8/RGBN bitmap decoder
  * Copyright (c) 2010 Peter Ross <pross@xvid.org>
  * Copyright (c) 2010 Sebastian Vater <cdgs.basty@googlemail.com>
+ * Copyright (c) 2016 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
- * IFF PBM/ILBM bitmap decoder
+ * IFF ACBM/ANIM/DEEP/ILBM/PBM/RGB8/RGBN bitmap decoder
  */
 
 #include <stdint.h>
@@ -32,12 +33,39 @@
 #include "bytestream.h"
 #include "avcodec.h"
 #include "internal.h"
+#include "mathops.h"
+
+// TODO: masking bits
+typedef enum {
+    MASK_NONE,
+    MASK_HAS_MASK,
+    MASK_HAS_TRANSPARENT_COLOR,
+    MASK_LASSO
+} mask_type;
 
 typedef struct IffContext {
     AVFrame *frame;
     int planesize;
     uint8_t * planebuf;
+    uint8_t * ham_buf;      ///< temporary buffer for planar to chunky conversion
+    uint32_t *ham_palbuf;   ///< HAM decode table
+    uint32_t *mask_buf;     ///< temporary buffer for palette indices
+    uint32_t *mask_palbuf;  ///< masking palette table
+    unsigned  compression;  ///< delta compression method used
+    unsigned  is_short;     ///< short compression method used
+    unsigned  is_interlaced;///< video is interlaced
+    unsigned  is_brush;     ///< video is in ANBR format
+    unsigned  bpp;          ///< bits per plane to decode (differs from bits_per_coded_sample if HAM)
+    unsigned  ham;          ///< 0 if non-HAM or number of hold bits (6 for bpp > 6, 4 otherwise)
+    unsigned  flags;        ///< 1 for EHB, 0 is no extra half darkening
+    unsigned  transparency; ///< TODO: transparency color index in palette
+    unsigned  masking;      ///< TODO: masking method used
     int init; // 1 if buffer and palette data already initialized, 0 otherwise
+    int16_t   tvdc[16];     ///< TVDC lookup table
+    GetByteContext gb;
+    uint8_t *video[2];
+    unsigned video_size;
+    uint32_t *pal[2];
 } IffContext;
 
 #define LUT8_PART(plane, v)                             \
@@ -124,33 +152,229 @@ static av_always_inline uint32_t gray2rgb(const uint32_t x) {
  */
 static int cmap_read_palette(AVCodecContext *avctx, uint32_t *pal)
 {
+    IffContext *s = avctx->priv_data;
     int count, i;
+    const uint8_t *const palette = avctx->extradata + AV_RB16(avctx->extradata); // first 16 bits (big-endian) give the palette offset
+    int palette_size = avctx->extradata_size - AV_RB16(avctx->extradata); // extradata bytes available from that offset on
 
     if (avctx->bits_per_coded_sample > 8) {
-        av_log(avctx, AV_LOG_ERROR, "bit_per_coded_sample > 8 not supported\n");
+        av_log(avctx, AV_LOG_ERROR, "bits_per_coded_sample > 8 not supported\n");
         return AVERROR_INVALIDDATA;
     }
 
     count = 1 << avctx->bits_per_coded_sample;
     // If extradata is smaller than actually needed, fill the remaining with black.
-    count = FFMIN(avctx->extradata_size / 3, count);
+    count = FFMIN(palette_size / 3, count); // three bytes per stored palette entry
     if (count) {
         for (i = 0; i < count; i++)
-            pal[i] = 0xFF000000 | AV_RB24(avctx->extradata + i * 3);
+            pal[i] = 0xFF000000 | AV_RB24(palette + i*3); // top byte forced to 0xFF
+        if (s->flags && count >= 32) { // EHB: entries 32..63 are derived from entries 0..31
+            for (i = 0; i < 32; i++)
+                pal[i + 32] = 0xFF000000 | (AV_RB24(palette + i*3) & 0xFEFEFE) >> 1; // each 8-bit component halved
+            count = FFMAX(count, 64);
+        }
     } else { // Create gray-scale color palette for bps < 8
         count = 1 << avctx->bits_per_coded_sample;
 
         for (i = 0; i < count; i++)
             pal[i] = 0xFF000000 | gray2rgb((i * 255) >> avctx->bits_per_coded_sample);
     }
+    if (s->masking == MASK_HAS_MASK) {
+        memcpy(pal + (1 << avctx->bits_per_coded_sample), pal, count * 4); // keep an unmodified copy above the original entries
+        for (i = 0; i < count; i++)
+            pal[i] &= 0xFFFFFF; // lower copy: top byte cleared
+    } else if (s->masking == MASK_HAS_TRANSPARENT_COLOR &&
+        s->transparency < 1 << avctx->bits_per_coded_sample)
+        pal[s->transparency] &= 0xFFFFFF; // only the designated entry loses its top byte
+    return 0;
+}
+
+/**
+ * Extracts the IFF extra context and updates internal decoder structures.
+ * With a packet, only ANIM streams are parsed, chunk by chunk, from s->gb.
+ *
+ * @param avctx the AVCodecContext where to extract extra context to
+ * @param avpkt NULL to parse the header in avctx->extradata; non-NULL selects the ANIM chunk path
+ * @return >= 0 in case of success, a negative error code otherwise
+ */
+static int extract_header(AVCodecContext *const avctx,
+                          const AVPacket *const avpkt)
+{
+    IffContext *s = avctx->priv_data;
+    const uint8_t *buf;
+    unsigned buf_size = 0;
+    int i, palette_size;
+
+    if (avctx->extradata_size < 2) {
+        av_log(avctx, AV_LOG_ERROR, "not enough extradata\n");
+        return AVERROR_INVALIDDATA;
+    }
+    palette_size = avctx->extradata_size - AV_RB16(avctx->extradata); // extradata bytes past the offset stored in its first 16 bits
+
+    if (avpkt && avctx->codec_tag == MKTAG('A', 'N', 'I', 'M')) {
+        uint32_t chunk_id;
+        uint64_t data_size;
+        GetByteContext *gb = &s->gb;
+
+        bytestream2_skip(gb, 4); // NOTE(review): presumably a 4-byte tag precedes the first chunk - confirm where s->gb is set up
+        while (bytestream2_get_bytes_left(gb) >= 1) {
+            chunk_id  = bytestream2_get_le32(gb);
+            data_size = bytestream2_get_be32(gb);
+
+            if (chunk_id == MKTAG('B', 'M', 'H', 'D')) {
+                bytestream2_skip(gb, data_size + (data_size & 1)); // odd-sized chunks carry one pad byte
+            } else if (chunk_id == MKTAG('A', 'N', 'H', 'D')) {
+                unsigned extra;
+                if (data_size < 40)
+                    return AVERROR_INVALIDDATA;
+
+                s->compression = (bytestream2_get_byte(gb) << 8) | (s->compression & 0xFF); // first ANHD byte goes to bits 8..15, low byte kept
+                bytestream2_skip(gb, 19);
+                extra = bytestream2_get_be32(gb);
+                s->is_short = !(extra & 1);
+                s->is_brush = extra == 2;
+                s->is_interlaced = !!(extra & 0x40);
+                data_size -= 24; // 1 + 19 + 4 bytes of the chunk were consumed above
+                bytestream2_skip(gb, data_size + (data_size & 1));
+            } else if (chunk_id == MKTAG('D', 'L', 'T', 'A') ||
+                       chunk_id == MKTAG('B', 'O', 'D', 'Y')) {
+                if (chunk_id == MKTAG('B','O','D','Y'))
+                    s->compression &= 0xFF; // BODY: drop the high byte set from ANHD
+                break; // stop with gb positioned at the start of this chunk's payload
+            } else if (chunk_id == MKTAG('C', 'M', 'A', 'P')) {
+                int count = data_size / 3;
+                uint32_t *pal = s->pal[0];
+
+                if (count > 256)
+                    return AVERROR_INVALIDDATA;
+                if (s->ham) {
+                    for (i = 0; i < count; i++)
+                        pal[i] = 0xFF000000 | bytestream2_get_le24(gb);
+                } else {
+                    for (i = 0; i < count; i++)
+                        pal[i] = 0xFF000000 | bytestream2_get_be24(gb);
+                }
+                bytestream2_skip(gb, data_size & 1);
+            } else {
+                bytestream2_skip(gb, data_size + (data_size&1)); // unknown chunk: skip payload and pad byte
+            }
+        }
+    } else if (!avpkt) {
+        buf = avctx->extradata;
+        buf_size = bytestream_get_be16(&buf);
+        if (buf_size <= 1 || palette_size < 0) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Invalid palette size received: %u -> palette data offset: %d\n",
+                   buf_size, palette_size);
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    if (buf_size >= 41) { // 2-byte size field + 39 bytes read below; buf_size stays 0 whenever avpkt is set
+        s->compression  = bytestream_get_byte(&buf);
+        s->bpp          = bytestream_get_byte(&buf);
+        s->ham          = bytestream_get_byte(&buf);
+        s->flags        = bytestream_get_byte(&buf);
+        s->transparency = bytestream_get_be16(&buf);
+        s->masking      = bytestream_get_byte(&buf);
+        for (i = 0; i < 16; i++)
+            s->tvdc[i] = bytestream_get_be16(&buf);
+
+        if (s->masking == MASK_HAS_MASK) {
+            if (s->bpp >= 8 && !s->ham) {
+                avctx->pix_fmt = AV_PIX_FMT_RGB32;
+                av_freep(&s->mask_buf);
+                av_freep(&s->mask_palbuf);
+                s->mask_buf = av_malloc((s->planesize * 32) + AV_INPUT_BUFFER_PADDING_SIZE);
+                if (!s->mask_buf)
+                    return AVERROR(ENOMEM);
+                if (s->bpp > 16) {
+                    av_log(avctx, AV_LOG_ERROR, "bpp %d too large for palette\n", s->bpp);
+                    av_freep(&s->mask_buf);
+                    return AVERROR(ENOMEM);
+                }
+                s->mask_palbuf = av_malloc((2 << s->bpp) * sizeof(uint32_t) + AV_INPUT_BUFFER_PADDING_SIZE);
+                if (!s->mask_palbuf) {
+                    av_freep(&s->mask_buf);
+                    return AVERROR(ENOMEM);
+                }
+            }
+            s->bpp++; // one additional plane is decoded when a mask is present
+        } else if (s->masking != MASK_NONE && s->masking != MASK_HAS_TRANSPARENT_COLOR) {
+            av_log(avctx, AV_LOG_ERROR, "Masking not supported\n");
+            return AVERROR_PATCHWELCOME;
+        }
+        if (!s->bpp || s->bpp > 32) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid number of bitplanes: %u\n", s->bpp);
+            return AVERROR_INVALIDDATA;
+        } else if (s->ham >= 8) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid number of hold bits for HAM: %u\n", s->ham);
+            return AVERROR_INVALIDDATA;
+        }
+
+        av_freep(&s->ham_buf);
+        av_freep(&s->ham_palbuf);
+
+        if (s->ham) {
+            int i, count = FFMIN(palette_size / 3, 1 << s->ham);
+            int ham_count;
+            const uint8_t *const palette = avctx->extradata + AV_RB16(avctx->extradata);
+
+            s->ham_buf = av_malloc((s->planesize * 8) + AV_INPUT_BUFFER_PADDING_SIZE);
+            if (!s->ham_buf)
+                return AVERROR(ENOMEM);
+
+            ham_count = 8 * (1 << s->ham); // four groups of (1 << ham) entries, two words (mask, value) each
+            s->ham_palbuf = av_malloc((ham_count << !!(s->masking == MASK_HAS_MASK)) * sizeof (uint32_t) + AV_INPUT_BUFFER_PADDING_SIZE);
+            if (!s->ham_palbuf) {
+                av_freep(&s->ham_buf);
+                return AVERROR(ENOMEM);
+            }
+
+            if (count) { // HAM with color palette attached
+                // prefill with black and palette and set HAM take direct value mask to zero
+                memset(s->ham_palbuf, 0, (1 << s->ham) * 2 * sizeof (uint32_t));
+                for (i=0; i < count; i++) {
+                    s->ham_palbuf[i*2+1] = 0xFF000000 | AV_RL24(palette + i*3);
+                }
+                count = 1 << s->ham;
+            } else { // HAM with grayscale color palette
+                count = 1 << s->ham;
+                for (i=0; i < count; i++) {
+                    s->ham_palbuf[i*2]   = 0xFF000000; // take direct color value from palette
+                    s->ham_palbuf[i*2+1] = 0xFF000000 | av_le2ne32(gray2rgb((i * 255) >> s->ham));
+                }
+            }
+            for (i=0; i < count; i++) {
+                uint32_t tmp = i << (8 - s->ham);
+                tmp |= tmp >> s->ham; // replicate the high bits into the low bits to span 0..255
+                s->ham_palbuf[(i+count)*2]     = 0xFF00FFFF; // just modify blue color component
+                s->ham_palbuf[(i+count*2)*2]   = 0xFFFFFF00; // just modify red color component
+                s->ham_palbuf[(i+count*3)*2]   = 0xFFFF00FF; // just modify green color component
+                s->ham_palbuf[(i+count)*2+1]   = 0xFF000000 | tmp << 16;
+                s->ham_palbuf[(i+count*2)*2+1] = 0xFF000000 | tmp;
+                s->ham_palbuf[(i+count*3)*2+1] = 0xFF000000 | tmp << 8;
+            }
+            if (s->masking == MASK_HAS_MASK) {
+                for (i = 0; i < ham_count; i++) // NOTE(review): table holds 2 * ham_count words but the index starts at 1 << bpp (bpp only bounded by 32) - confirm no overrun
+                    s->ham_palbuf[(1 << s->bpp) + i] = s->ham_palbuf[i] | 0xFF000000;
+            }
+        }
+    }
+
     return 0;
 }
 
 static av_cold int decode_end(AVCodecContext *avctx)
 {
     IffContext *s = avctx->priv_data;
-    av_frame_free(&s->frame);
     av_freep(&s->planebuf);
+    av_freep(&s->ham_buf);      // HAM buffers allocated by extract_header()
+    av_freep(&s->ham_palbuf);
+    av_freep(&s->mask_buf);     // mask buffers are also allocated by extract_header();
+    av_freep(&s->mask_palbuf);  // they were never released at close before (leak)
+    av_freep(&s->video[0]); av_freep(&s->video[1]); // ANIM buffers allocated in decode_init()
+    av_freep(&s->pal[0]);   av_freep(&s->pal[1]);   // ANIM palettes allocated in decode_init()
     return 0;
 }
 
@@ -160,11 +384,29 @@ static av_cold int decode_init(AVCodecContext *avctx)
     int err;
 
     if (avctx->bits_per_coded_sample <= 8) {
-        avctx->pix_fmt = (avctx->bits_per_coded_sample < 8 ||
-                          avctx->extradata_size) ? AV_PIX_FMT_PAL8
-                                                 : AV_PIX_FMT_GRAY8;
+        int palette_size;
+
+        if (avctx->extradata_size >= 2)
+            palette_size = avctx->extradata_size - AV_RB16(avctx->extradata);
+        else
+            palette_size = 0;
+        avctx->pix_fmt = (avctx->bits_per_coded_sample < 8) ||
+                         (avctx->extradata_size >= 2 && palette_size) ? AV_PIX_FMT_PAL8 : AV_PIX_FMT_GRAY8;
     } else if (avctx->bits_per_coded_sample <= 32) {
-        avctx->pix_fmt = AV_PIX_FMT_BGR32;
+        if (avctx->codec_tag == MKTAG('R', 'G', 'B', '8')) {
+            avctx->pix_fmt = AV_PIX_FMT_RGB32;
+        } else if (avctx->codec_tag == MKTAG('R', 'G', 'B', 'N')) {
+            avctx->pix_fmt = AV_PIX_FMT_RGB444;
+        } else if (avctx->codec_tag != MKTAG('D', 'E', 'E', 'P')) {
+            if (avctx->bits_per_coded_sample == 24) {
+                avctx->pix_fmt = AV_PIX_FMT_0BGR32;
+            } else if (avctx->bits_per_coded_sample == 32) {
+                avctx->pix_fmt = AV_PIX_FMT_BGR32;
+            } else {
+                avpriv_request_sample(avctx, "unknown bits_per_coded_sample");
+                return AVERROR_PATCHWELCOME;
+            }
+        }
     } else {
         return AVERROR_INVALIDDATA;
     }
@@ -176,12 +418,21 @@ static av_cold int decode_init(AVCodecContext *avctx)
     if (!s->planebuf)
         return AVERROR(ENOMEM);
 
-    s->frame = av_frame_alloc();
-    if (!s->frame) {
-        decode_end(avctx);
-        return AVERROR(ENOMEM);
+    s->bpp = avctx->bits_per_coded_sample;
+
+    if (avctx->codec_tag == MKTAG('A', 'N', 'I', 'M')) {
+        s->video_size = FFALIGN(avctx->width, 2) * avctx->height * s->bpp;
+        s->video[0] = av_calloc(FFALIGN(avctx->width, 2) * avctx->height, s->bpp);
+        s->video[1] = av_calloc(FFALIGN(avctx->width, 2) * avctx->height, s->bpp);
+        s->pal[0] = av_calloc(256, sizeof(*s->pal[0]));
+        s->pal[1] = av_calloc(256, sizeof(*s->pal[1]));
+        if (!s->video[0] || !s->video[1] || !s->pal[0] || !s->pal[1])
+            return AVERROR(ENOMEM);
     }
 
+    if ((err = extract_header(avctx, NULL)) < 0)
+        return err;
+
     return 0;
 }
 
@@ -195,6 +446,10 @@ static av_cold int decode_init(AVCodecContext *avctx)
 static void decodeplane8(uint8_t *dst, const uint8_t *buf, int buf_size, int plane)
 {
     const uint64_t *lut = plane8_lut[plane];
+    if (plane >= 8) {
+        av_log(NULL, AV_LOG_WARNING, "Ignoring extra planes beyond 8\n");
+        return;
+    }
     do {
         uint64_t v = AV_RN64A(dst) | lut[*buf++];
         AV_WN64A(dst, v);
@@ -227,6 +482,47 @@ static void decodeplane32(uint32_t *dst, const uint8_t *buf, int buf_size, int p
     } while (--buf_size);
 }
 
+#define DECODE_HAM_PLANE32(x)       \
+    first       = buf[x] << 1;      \
+    second      = buf[(x)+1] << 1;  \
+    delta      &= pal[first++];     \
+    delta      |= pal[first];       \
+    dst[x]      = delta;            \
+    delta      &= pal[second++];    \
+    delta      |= pal[second];      \
+    dst[(x)+1]  = delta
+
+/**
+ * Converts one line of HAM6/8-encoded chunky buffer to packed 32-bit pixels.
+ *
+ * @param dst the destination buffer, one uint32_t per pixel
+ * @param buf the source 8bpp chunky buffer (one decode-table index per pixel)
+ * @param pal the HAM decode table: an (AND mask, OR value) pair for every index
+ * @param buf_size the plane size in bytes; 8 pixels are produced per unit
+ */
+static void decode_ham_plane32(uint32_t *dst, const uint8_t  *buf,
+                               const uint32_t *const pal, unsigned buf_size)
+{
+    uint32_t delta = pal[1]; /* first palette entry */
+    do {
+        uint32_t first, second;
+        DECODE_HAM_PLANE32(0); // each expansion converts two pixels, carrying delta forward
+        DECODE_HAM_PLANE32(2);
+        DECODE_HAM_PLANE32(4);
+        DECODE_HAM_PLANE32(6);
+        buf += 8;
+        dst += 8;
+    } while (--buf_size); // do/while: buf_size must be non-zero on entry
+}
+
+static void lookup_pal_indicies(uint32_t *dst, const uint32_t *buf,
+                         const uint32_t *const pal, unsigned width)
+{
+    do { // translate `width` indices from buf[] through pal[] into dst[]
+        *dst++ = pal[*buf++]; // the index is used unchecked; pal must cover every value in buf
+    } while (--width); // width must be >= 1: a zero count would wrap around
+}
+
 /**
  * Decode one complete byterun1 encoded line.
  *
@@ -237,162 +533,1234 @@ static void decodeplane32(uint32_t *dst, const uint8_t *buf, int buf_size, int p
  * @return number of consumed bytes in byterun1 compressed bitstream
  */
 static int decode_byterun(uint8_t *dst, int dst_size,
-                          const uint8_t *buf, const uint8_t *const buf_end)
+                          GetByteContext *gb)
 {
-    const uint8_t *const buf_start = buf;
     unsigned x;
-    for (x = 0; x < dst_size && buf < buf_end;) {
+    for (x = 0; x < dst_size && bytestream2_get_bytes_left(gb) > 0;) {
         unsigned length;
-        const int8_t value = *buf++;
+        const int8_t value = bytestream2_get_byte(gb);
         if (value >= 0) {
-            length = value + 1;
-            memcpy(dst + x, buf, FFMIN3(length, dst_size - x, buf_end - buf));
-            buf += length;
+            length = FFMIN3(value + 1, dst_size - x, bytestream2_get_bytes_left(gb)); // literal run, clamped to output room and input left
+            bytestream2_get_buffer(gb, dst + x, length);
+            if (length < value + 1)
+                bytestream2_skip(gb, value + 1 - length); // still consume the part of the run that did not fit
         } else if (value > -128) {
-            length = -value + 1;
-            memset(dst + x, *buf++, FFMIN(length, dst_size - x));
+            length = FFMIN(-value + 1, dst_size - x); // repeat run, clamped to output room
+            memset(dst + x, bytestream2_get_byte(gb), length);
         } else { // noop
             continue;
         }
         x += length;
     }
-    return buf - buf_start;
+    if (x < dst_size) { // input ran out first: warn and zero the rest of the line
+        av_log(NULL, AV_LOG_WARNING, "decode_byterun ended before plane size\n");
+        memset(dst+x, 0, dst_size - x);
+    }
+    return bytestream2_tell(gb); // NOTE(review): reader position in gb, presumably from buffer start rather than per call - confirm callers
 }
 
-static int decode_frame_ilbm(AVCodecContext *avctx,
-                             void *data, int *got_frame,
-                             AVPacket *avpkt)
+#define DECODE_RGBX_COMMON(type) \
+    if (!length) { \
+        length = bytestream2_get_byte(gb); \
+        if (!length) { \
+            length = bytestream2_get_be16(gb); \
+            if (!length) \
+                return; \
+        } \
+    } \
+    for (i = 0; i < length; i++) { \
+        *(type *)(dst + y*linesize + x * sizeof(type)) = pixel; \
+        x += 1; \
+        if (x >= width) { \
+            y += 1; \
+            if (y >= height) \
+                return; \
+            x = 0; \
+        } \
+    }
+
+/**
+ * Decode RGB8 buffer: 4-byte codes read from gb, each a 24-bit value plus a repeat count
+ * @param[out] dst Destination buffer
+ * @param width Width of destination buffer (pixels)
+ * @param height Height of destination buffer (pixels)
+ * @param linesize Line size of destination buffer (bytes)
+ */
+static void decode_rgb8(GetByteContext *gb, uint8_t *dst, int width, int height, int linesize)
 {
-    IffContext *s          = avctx->priv_data;
-    const uint8_t *buf     = avpkt->data;
-    int buf_size           = avpkt->size;
-    const uint8_t *buf_end = buf + buf_size;
-    int y, plane, res;
+    int x = 0, y = 0, i, length;
+    while (bytestream2_get_bytes_left(gb) >= 4) {
+        uint32_t pixel = 0xFF000000 | bytestream2_get_be24(gb); // top byte forced to 0xFF
+        length = bytestream2_get_byte(gb) & 0x7F; // bit 7 of the count byte is ignored; 0 means an extended count follows
+        DECODE_RGBX_COMMON(uint32_t)
+    }
+}
 
-    if ((res = ff_reget_buffer(avctx, s->frame)) < 0)
-        return res;
+/**
+ * Decode RGBN buffer: 16-bit codes read from gb, stored as 16-bit pixels with a repeat count
+ * @param[out] dst Destination buffer
+ * @param width Width of destination buffer (pixels)
+ * @param height Height of destination buffer (pixels)
+ * @param linesize Line size of destination buffer (bytes)
+ */
+static void decode_rgbn(GetByteContext *gb, uint8_t *dst, int width, int height, int linesize)
+{
+    int x = 0, y = 0, i, length;
+    while (bytestream2_get_bytes_left(gb) >= 2) {
+        uint32_t pixel = bytestream2_get_be16u(gb); // unchecked read, guarded by the loop condition
+        length = pixel & 0x7; // low 3 bits: repeat count (0 means an extended count follows)
+        pixel >>= 4; // the upper 12 bits are the stored value; bit 3 is dropped
+        DECODE_RGBX_COMMON(uint16_t)
+    }
+}
 
-    if (!s->init && avctx->bits_per_coded_sample <= 8 &&
-        avctx->pix_fmt != AV_PIX_FMT_GRAY8) {
-        if ((res = cmap_read_palette(avctx, (uint32_t *)s->frame->data[1])) < 0)
-            return res;
+/**
+ * Decode DEEP RLE 32-bit buffer
+ * @param[out] dst Destination buffer
+ * @param[in] src Source buffer
+ * @param src_size Source buffer size (bytes)
+ * @param width Width of destination buffer (pixels)
+ * @param height Height of destination buffer (pixels)
+ * @param linesize Line size of destination buffer (bytes)
+ */
+static void decode_deep_rle32(uint8_t *dst, const uint8_t *src, int src_size, int width, int height, int linesize)
+{
+    const uint8_t *src_end = src + src_size;
+    int x = 0, y = 0, i;
+    while (src + 5 <= src_end) { // need the opcode byte plus at least one 4-byte pixel
+        int opcode;
+        opcode = *(int8_t *)src++;
+        if (opcode >= 0) { // literal run of opcode + 1 pixels
+            int size = opcode + 1;
+            for (i = 0; i < size; i++) {
+                int length = FFMIN3(size - i, width - x, (src_end - src) / 4); // never copy past the row end or past the input
+                memcpy(dst + y*linesize + x * 4, src, length * 4);
+                src += length * 4;
+                x += length;
+                i += length;
+                if (x >= width) {
+                    x = 0;
+                    y += 1;
+                    if (y >= height)
+                        return;
+                }
+            }
+        } else { // repeat run: one pixel written -opcode + 1 times
+            int size = -opcode + 1;
+            uint32_t pixel = AV_RN32(src); // 4 bytes are available thanks to the loop condition
+            for (i = 0; i < size; i++) {
+                *(uint32_t *)(dst + y*linesize + x * 4) = pixel;
+                x += 1;
+                if (x >= width) {
+                    x = 0;
+                    y += 1;
+                    if (y >= height)
+                        return;
+                }
+            }
+            src += 4;
+        }
     }
-    s->init = 1;
+}
 
-    if (avctx->codec_tag == MKTAG('I', 'L', 'B', 'M')) { // interleaved
-        if (avctx->pix_fmt == AV_PIX_FMT_PAL8 || avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
-            for (y = 0; y < avctx->height; y++) {
-                uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
-                memset(row, 0, avctx->width);
-                for (plane = 0; plane < avctx->bits_per_coded_sample && buf < buf_end;
-                     plane++) {
-                    decodeplane8(row, buf, FFMIN(s->planesize, buf_end - buf), plane);
-                    buf += s->planesize;
+/**
+ * Decode DEEP TVDC 32-bit buffer
+ * @param[out] dst Destination buffer
+ * @param[in] src Source buffer
+ * @param src_size Source buffer size (bytes)
+ * @param width Width of destination buffer (pixels)
+ * @param height Height of destination buffer (pixels)
+ * @param linesize Line size of destination buffer (bytes)
+ * @param[in] tvdc TVDC lookup table
+ */
+static void decode_deep_tvdc32(uint8_t *dst, const uint8_t *src, int src_size, int width, int height, int linesize, const int16_t *tvdc)
+{
+    int x = 0, y = 0, plane = 0;
+    int8_t pixel = 0;
+    int i, j;
+
+    for (i = 0; i < src_size * 2;) { // i counts nibbles, two per source byte
+#define GETNIBBLE ((i & 1) ?  (src[i>>1] & 0xF) : (src[i>>1] >> 4))
+        int d = tvdc[GETNIBBLE]; // each nibble indexes the 16-entry table
+        i++;
+        if (d) { // non-zero entry: add it to the running 8-bit value and emit one byte
+            pixel += d;
+            dst[y * linesize + x*4 + plane] = pixel;
+            x++;
+        } else { // zero entry: the next nibble + 1 is a repeat count for the current value
+            if (i >= src_size * 2)
+                return;
+            d = GETNIBBLE + 1;
+            i++;
+            d = FFMIN(d, width - x); // do not run past the end of the row
+            for (j = 0; j < d; j++) {
+                dst[y * linesize + x*4 + plane] = pixel;
+                x++;
+            }
+        }
+        if (x >= width) { // row finished for this byte position; continue with the next of the 4 bytes
+            plane++;
+            if (plane >= 4) {
+                y++;
+                if (y >= height)
+                    return;
+                plane = 0;
+            }
+            x = 0;
+            pixel = 0;
+            i = (i + 1) & ~1; // round up to an even nibble index, i.e. a byte boundary
+        }
+    }
+}
+
+static void decode_short_horizontal_delta(uint8_t *dst,
+                                          const uint8_t *buf, const uint8_t *buf_end,
+                                          int w, int bpp, int dst_size)
+{
+    int planepitch = FFALIGN(w, 16) >> 3; // bytes per plane row, width rounded up to 16 pixels
+    int pitch = planepitch * bpp; // bytes per row across all bpp planes
+    GetByteContext ptrs, gb;
+    PutByteContext pb;
+    unsigned ofssrc, pos;
+    int i, k;
+
+    bytestream2_init(&ptrs, buf, buf_end - buf);
+    bytestream2_init_writer(&pb, dst, dst_size);
+
+    for (k = 0; k < bpp; k++) {
+        ofssrc = bytestream2_get_be32(&ptrs); // per-plane offset into buf; 0 means nothing to do for this plane
+        pos = 0;
+
+        if (!ofssrc)
+            continue;
+
+        if (ofssrc >= buf_end - buf) // offset outside the input: skip this plane
+            continue;
+
+        bytestream2_init(&gb, buf + ofssrc, buf_end - (buf + ofssrc));
+        while (bytestream2_peek_be16(&gb) != 0xFFFF && bytestream2_get_bytes_left(&gb) > 3) { // 0xFFFF ends the op list
+            int16_t offset = bytestream2_get_be16(&gb);
+            unsigned noffset;
+
+            if (offset >= 0) { // advance by `offset` 16-bit words, then write one word
+                unsigned data = bytestream2_get_be16(&gb);
+
+                pos += offset * 2;
+                noffset = (pos / planepitch) * pitch + (pos % planepitch) + k * planepitch; // plane-relative position -> position in dst
+                bytestream2_seek_p(&pb, noffset, SEEK_SET);
+                bytestream2_put_be16(&pb, data);
+            } else { // negative offset: advance, then write `count` consecutive words
+                uint16_t count = bytestream2_get_be16(&gb);
+
+                pos += 2 * -(offset + 2);
+                for (i = 0; i < count; i++) {
+                    uint16_t data = bytestream2_get_be16(&gb);
+
+                    pos += 2;
+                    noffset = (pos / planepitch) * pitch + (pos % planepitch) + k * planepitch;
+                    bytestream2_seek_p(&pb, noffset, SEEK_SET);
+                    bytestream2_put_be16(&pb, data);
                 }
             }
-        } else { // AV_PIX_FMT_BGR32
-            for (y = 0; y < avctx->height; y++) {
-                uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
-                memset(row, 0, avctx->width << 2);
-                for (plane = 0; plane < avctx->bits_per_coded_sample && buf < buf_end;
-                     plane++) {
-                    decodeplane32((uint32_t *)row, buf,
-                                  FFMIN(s->planesize, buf_end - buf), plane);
-                    buf += s->planesize;
+        }
+    }
+}
+
+static void decode_byte_vertical_delta(uint8_t *dst,
+                                       const uint8_t *buf, const uint8_t *buf_end,
+                                       int w, int xor, int bpp, int dst_size)
+{
+    int ncolumns = ((w + 15) / 16) * 2; // byte columns per plane row
+    int dstpitch = ncolumns * bpp; // distance in dst between two rows of the same column
+    unsigned ofsdst, ofssrc, opcode, x;
+    GetByteContext ptrs, gb;
+    PutByteContext pb;
+    int i, j, k;
+
+    bytestream2_init(&ptrs, buf, buf_end - buf);
+    bytestream2_init_writer(&pb, dst, dst_size);
+
+    for (k = 0; k < bpp; k++) {
+        ofssrc = bytestream2_get_be32(&ptrs); // per-plane offset into buf; 0 means nothing to do for this plane
+
+        if (!ofssrc)
+            continue;
+
+        if (ofssrc >= buf_end - buf)
+            continue;
+
+        bytestream2_init(&gb, buf + ofssrc, buf_end - (buf + ofssrc));
+        for (j = 0; j < ncolumns; j++) {
+            ofsdst = j + k * ncolumns;
+
+            i = bytestream2_get_byte(&gb); // number of ops for this column
+            while (i > 0) {
+                opcode = bytestream2_get_byte(&gb);
+
+                if (opcode == 0) { // repeat: a count byte, then one value written down the column
+                    opcode  = bytestream2_get_byte(&gb);
+                    x = bytestream2_get_byte(&gb);
+
+                    while (opcode) {
+                        bytestream2_seek_p(&pb, ofsdst, SEEK_SET);
+                        if (xor && ofsdst < dst_size) { // XOR mode combines with the byte already in dst
+                            bytestream2_put_byte(&pb, dst[ofsdst] ^ x);
+                        } else {
+                            bytestream2_put_byte(&pb, x);
+                        }
+                        ofsdst += dstpitch;
+                        opcode--;
+                    }
+                } else if (opcode < 0x80) { // skip `opcode` rows
+                    ofsdst += opcode * dstpitch;
+                } else { // literal: low 7 bits give the number of bytes written down the column
+                    opcode &= 0x7f;
+
+                    while (opcode) {
+                        bytestream2_seek_p(&pb, ofsdst, SEEK_SET);
+                        if (xor && ofsdst < dst_size) {
+                            bytestream2_put_byte(&pb, dst[ofsdst] ^ bytestream2_get_byte(&gb));
+                        } else {
+                            bytestream2_put_byte(&pb, bytestream2_get_byte(&gb));
+                        }
+                        ofsdst += dstpitch;
+                        opcode--;
+                    }
                 }
+                i--;
             }
         }
-    } else if (avctx->pix_fmt == AV_PIX_FMT_PAL8 || avctx->pix_fmt == AV_PIX_FMT_GRAY8) { // IFF-PBM
-        for (y = 0; y < avctx->height && buf < buf_end; y++) {
-            uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
-            memcpy(row, buf, FFMIN(avctx->width, buf_end - buf));
-            buf += avctx->width + (avctx->width % 2); // padding if odd
+    }
+}
+
+static void decode_delta_j(uint8_t *dst,
+                           const uint8_t *buf, const uint8_t *buf_end,
+                           int w, int h, int bpp, int dst_size) // h is not used in the body
+{
+    int32_t pitch;
+    uint8_t *ptr;
+    uint32_t type, flag, cols, groups, rows, bytes;
+    uint32_t offset;
+    int planepitch_byte = (w + 7) / 8; // plane row size rounded up to whole bytes
+    int planepitch = ((w + 15) / 16) * 2; // plane row size rounded up to 16-pixel words
+    int kludge_j, b, g, r, d;
+    GetByteContext gb;
+
+    pitch = planepitch * bpp; // bytes per row across all bpp planes
+    kludge_j = w < 320 ? (320 - w) / 8 / 2 : 0; // NOTE(review): offsets seem to assume a 320-pixel layout when w < 320 - confirm
+
+    bytestream2_init(&gb, buf, buf_end - buf);
+
+    while (bytestream2_get_bytes_left(&gb) >= 2) {
+        type = bytestream2_get_be16(&gb);
+
+        switch (type) {
+        case 0: // end of the delta list
+            return;
+        case 1: // groups of `cols` bytes, each byte position covering all bpp planes
+            flag   = bytestream2_get_be16(&gb);
+            cols   = bytestream2_get_be16(&gb);
+            groups = bytestream2_get_be16(&gb);
+
+            for (g = 0; g < groups; g++) {
+                offset = bytestream2_get_be16(&gb);
+
+                if (bytestream2_get_bytes_left(&gb) < 1)
+                    return;
+
+                if (kludge_j)
+                    offset = ((offset / (320 / 8)) * pitch) + (offset % (320 / 8)) - kludge_j;
+                else
+                    offset = ((offset / planepitch_byte) * pitch) + (offset % planepitch_byte);
+
+                for (b = 0; b < cols; b++) {
+                    for (d = 0; d < bpp; d++) {
+                        uint8_t value = bytestream2_get_byte(&gb);
+
+                        if (offset >= dst_size) // also catches an unsigned wrap from the kludge_j subtraction
+                            return;
+                        ptr = dst + offset;
+
+                        if (flag) // non-zero flag: XOR into dst instead of overwriting
+                            ptr[0] ^= value;
+                        else
+                            ptr[0]  = value;
+
+                        offset += planepitch;
+                    }
+                }
+                if ((cols * bpp) & 1) // payload is padded to an even number of bytes
+                    bytestream2_skip(&gb, 1);
+            }
+            break;
+        case 2: // groups of rows x bytes rectangles, stored per plane
+            flag   = bytestream2_get_be16(&gb);
+            rows   = bytestream2_get_be16(&gb);
+            bytes  = bytestream2_get_be16(&gb);
+            groups = bytestream2_get_be16(&gb);
+
+            for (g = 0; g < groups; g++) {
+                offset = bytestream2_get_be16(&gb);
+
+                if (kludge_j)
+                    offset = ((offset / (320 / 8)) * pitch) + (offset % (320/ 8)) - kludge_j;
+                else
+                    offset = ((offset / planepitch_byte) * pitch) + (offset % planepitch_byte);
+
+                for (r = 0; r < rows; r++) {
+                    for (d = 0; d < bpp; d++) {
+                        unsigned noffset = offset + (r * pitch) + d * planepitch;
+
+                        if (bytestream2_get_bytes_left(&gb) < 1)
+                            return;
+
+                        for (b = 0; b < bytes; b++) {
+                            uint8_t value = bytestream2_get_byte(&gb);
+
+                            if (noffset >= dst_size)
+                                return;
+                            ptr = dst + noffset;
+
+                            if (flag)
+                                ptr[0] ^= value;
+                            else
+                                ptr[0]  = value;
+
+                            noffset++;
+                        }
+                    }
+                }
+                if ((rows * bytes * bpp) & 1) // payload is padded to an even number of bytes
+                    bytestream2_skip(&gb, 1);
+            }
+            break;
+        default: // unknown record type: stop decoding
+            return;
         }
     }
+}
 
-    if ((res = av_frame_ref(data, s->frame)) < 0)
-        return res;
+static void decode_short_vertical_delta(uint8_t *dst,
+                                        const uint8_t *buf, const uint8_t *buf_end,
+                                        int w, int bpp, int dst_size)
+{
+    int ncolumns = (w + 15) >> 4; // 16-bit word columns per plane row
+    int dstpitch = ncolumns * bpp * 2; // bytes between two rows of the same column
+    unsigned ofsdst, ofssrc, ofsdata, opcode, x;
+    GetByteContext ptrs, gb, dptrs, dgb;
+    PutByteContext pb;
+    int i, j, k;
 
-    *got_frame = 1;
+    if (buf_end - buf <= 64) // too small to hold both pointer tables read below
+        return;
 
-    return buf_size;
+    bytestream2_init(&ptrs, buf, buf_end - buf);
+    bytestream2_init(&dptrs, buf + 32, (buf_end - buf) - 32); // second table (data offsets) starts 32 bytes in
+    bytestream2_init_writer(&pb, dst, dst_size);
+
+    for (k = 0; k < bpp; k++) {
+        ofssrc = bytestream2_get_be32(&ptrs); // offset of this plane's opcode stream
+        ofsdata = bytestream2_get_be32(&dptrs); // offset of this plane's data stream
+
+        if (!ofssrc)
+            continue;
+
+        if (ofssrc >= buf_end - buf)
+            return;
+
+        if (ofsdata >= buf_end - buf)
+            return;
+
+        bytestream2_init(&gb, buf + ofssrc, buf_end - (buf + ofssrc));
+        bytestream2_init(&dgb, buf + ofsdata, buf_end - (buf + ofsdata));
+        for (j = 0; j < ncolumns; j++) {
+            ofsdst = (j + k * ncolumns) * 2;
+
+            i = bytestream2_get_byte(&gb); // number of ops for this column
+            while (i > 0) {
+                opcode = bytestream2_get_byte(&gb);
+
+                if (opcode == 0) { // repeat: count from the opcode stream, one word from the data stream
+                    opcode = bytestream2_get_byte(&gb);
+                    x = bytestream2_get_be16(&dgb);
+
+                    while (opcode) {
+                        bytestream2_seek_p(&pb, ofsdst, SEEK_SET);
+                        bytestream2_put_be16(&pb, x);
+                        ofsdst += dstpitch;
+                        opcode--;
+                    }
+                } else if (opcode < 0x80) { // skip `opcode` rows
+                    ofsdst += opcode * dstpitch;
+                } else { // literal: low 7 bits give the number of words taken from the data stream
+                    opcode &= 0x7f;
+
+                    while (opcode) {
+                        bytestream2_seek_p(&pb, ofsdst, SEEK_SET);
+                        bytestream2_put_be16(&pb, bytestream2_get_be16(&dgb));
+                        ofsdst += dstpitch;
+                        opcode--;
+                    }
+                }
+                i--;
+            }
+        }
+    }
 }
 
-static int decode_frame_byterun1(AVCodecContext *avctx,
-                                 void *data, int *got_frame,
-                                 AVPacket *avpkt)
+static void decode_long_vertical_delta(uint8_t *dst,
+                                       const uint8_t *buf, const uint8_t *buf_end,
+                                       int w, int bpp, int dst_size)
+{
+    int ncolumns = (w + 31) >> 5; /* width rounded up to whole 32-pixel columns (4 bytes each) */
+    int dstpitch = ((w + 15) / 16 * 2) * bpp; /* distance added to ofsdst after each value written */
+    unsigned ofsdst, ofssrc, ofsdata, opcode, x;
+    GetByteContext ptrs, gb, dptrs, dgb;
+    PutByteContext pb;
+    int i, j, k, h;
+    /* Patch dst in place: per plane and per column, run a list of skip/repeat/copy opcodes on 32-bit values. */
+    if (buf_end - buf <= 64) /* offset tables are read from buf and buf + 32; require more than 64 bytes */
+        return;
+
+    h = (((w + 15) / 16 * 2) != ((w + 31) / 32 * 4)) ? 1 : 0; /* 1 when the 16-bit-aligned width is shorter than the 32-bit-aligned one */
+    bytestream2_init(&ptrs, buf, buf_end - buf);
+    bytestream2_init(&dptrs, buf + 32, (buf_end - buf) - 32);
+    bytestream2_init_writer(&pb, dst, dst_size);
+
+    for (k = 0; k < bpp; k++) {
+        ofssrc = bytestream2_get_be32(&ptrs); /* offset of this plane's opcode stream within buf */
+        ofsdata = bytestream2_get_be32(&dptrs); /* offset of this plane's data values within buf */
+
+        if (!ofssrc) /* a zero offset means nothing is applied to this plane */
+            continue;
+
+        if (ofssrc >= buf_end - buf) /* offset outside the packet: give up on the whole delta */
+            return;
+
+        if (ofsdata >= buf_end - buf)
+            return;
+
+        bytestream2_init(&gb, buf + ofssrc, buf_end - (buf + ofssrc));
+        bytestream2_init(&dgb, buf + ofsdata, buf_end - (buf + ofsdata));
+        for (j = 0; j < ncolumns; j++) {
+            ofsdst = (j + k * ncolumns) * 4 - h * (2 * k); /* when h is set, 2 bytes are subtracted per preceding plane */
+
+            i = bytestream2_get_byte(&gb); /* number of opcodes for this column */
+            while (i > 0) {
+                opcode = bytestream2_get_byte(&gb);
+
+                if (opcode == 0) { /* repeat: next byte is the count, one data value is written count times */
+                    opcode = bytestream2_get_byte(&gb);
+                    if (h && (j == (ncolumns - 1))) { /* last column with h set: only the first 16 bits are used */
+                        x = bytestream2_get_be16(&dgb);
+                        bytestream2_skip(&dgb, 2); /* the other 2 bytes of the 4-byte data slot are discarded */
+                    } else {
+                        x = bytestream2_get_be32(&dgb);
+                    }
+
+                    while (opcode) {
+                        bytestream2_seek_p(&pb, ofsdst, SEEK_SET);
+                        if (h && (j == (ncolumns - 1))) {
+                            bytestream2_put_be16(&pb, x);
+                        } else {
+                            bytestream2_put_be32(&pb, x);
+                        }
+                        ofsdst += dstpitch;
+                        opcode--;
+                    }
+                } else if (opcode < 0x80) { /* skip: advance the destination by opcode steps without writing */
+                    ofsdst += opcode * dstpitch;
+                } else { /* copy: low 7 bits give the number of data values to write */
+                    opcode &= 0x7f;
+
+                    while (opcode) {
+                        bytestream2_seek_p(&pb, ofsdst, SEEK_SET);
+                        if (h && (j == (ncolumns - 1))) {
+                            bytestream2_put_be16(&pb, bytestream2_get_be16(&dgb));
+                            bytestream2_skip(&dgb, 2);
+                        } else {
+                            bytestream2_put_be32(&pb, bytestream2_get_be32(&dgb));
+                        }
+                        ofsdst += dstpitch;
+                        opcode--;
+                    }
+                }
+                i--;
+            }
+        }
+    }
+}
+
+static void decode_short_vertical_delta2(uint8_t *dst,
+                                         const uint8_t *buf, const uint8_t *buf_end,
+                                         int w, int bpp, int dst_size)
+{
+    int ncolumns = (w + 15) >> 4; /* width rounded up to whole 16-pixel columns (2 bytes each) */
+    int dstpitch = ncolumns * bpp * 2; /* distance added to ofsdst after each word written */
+    unsigned ofsdst, ofssrc, opcode, x;
+    GetByteContext ptrs, gb;
+    PutByteContext pb;
+    int i, j, k;
+    /* Patch dst in place with 16-bit counts/opcodes; opcodes and data words come from the same stream (gb). */
+    bytestream2_init(&ptrs, buf, buf_end - buf);
+    bytestream2_init_writer(&pb, dst, dst_size);
+
+    for (k = 0; k < bpp; k++) {
+        ofssrc = bytestream2_get_be32(&ptrs); /* offset of this plane's stream within buf */
+
+        if (!ofssrc) /* a zero offset means nothing is applied to this plane */
+            continue;
+
+        if (ofssrc >= buf_end - buf) /* offset outside the packet: skip only this plane */
+            continue;
+
+        bytestream2_init(&gb, buf + ofssrc, buf_end - (buf + ofssrc));
+        for (j = 0; j < ncolumns; j++) {
+            ofsdst = (j + k * ncolumns) * 2; /* start position for column j of plane k */
+
+            i = bytestream2_get_be16(&gb); /* number of opcodes for this column */
+            while (i > 0 && bytestream2_get_bytes_left(&gb) > 4) {
+                opcode = bytestream2_get_be16(&gb);
+
+                if (opcode == 0) { /* repeat: next word is the count, the following word is written count times */
+                    opcode = bytestream2_get_be16(&gb);
+                    x = bytestream2_get_be16(&gb);
+
+                    while (opcode && bytestream2_get_bytes_left_p(&pb) > 1) {
+                        bytestream2_seek_p(&pb, ofsdst, SEEK_SET);
+                        bytestream2_put_be16(&pb, x);
+                        ofsdst += dstpitch;
+                        opcode--;
+                    }
+                } else if (opcode < 0x8000) { /* skip: advance the destination by opcode steps without writing */
+                    ofsdst += opcode * dstpitch;
+                } else { /* copy: low 15 bits give the number of words to read from gb and write */
+                    opcode &= 0x7fff;
+
+                    while (opcode && bytestream2_get_bytes_left(&gb) > 1 &&
+                           bytestream2_get_bytes_left_p(&pb) > 1) {
+                        bytestream2_seek_p(&pb, ofsdst, SEEK_SET);
+                        bytestream2_put_be16(&pb, bytestream2_get_be16(&gb));
+                        ofsdst += dstpitch;
+                        opcode--;
+                    }
+                }
+                i--;
+            }
+        }
+    }
+}
+
+static void decode_long_vertical_delta2(uint8_t *dst,
+                                        const uint8_t *buf, const uint8_t *buf_end,
+                                        int w, int bpp, int dst_size)
+{
+    int ncolumns = (w + 31) >> 5; /* width rounded up to whole 32-pixel columns (4 bytes each) */
+    int dstpitch = ((w + 15) / 16 * 2) * bpp; /* distance added to ofsdst after each value written */
+    unsigned ofsdst, ofssrc, opcode, x;
+    unsigned skip = 0x80000000, mask = skip - 1; /* opcodes below skip are skips; mask extracts the copy count */
+    GetByteContext ptrs, gb;
+    PutByteContext pb;
+    int i, j, k, h;
+    /* Patch dst in place with 32-bit counts/opcodes; opcodes and data values come from the same stream (gb). */
+    h = (((w + 15) / 16 * 2) != ((w + 31) / 32 * 4)) ? 1 : 0; /* 1 when the 16-bit-aligned width is shorter than the 32-bit-aligned one */
+    bytestream2_init(&ptrs, buf, buf_end - buf);
+    bytestream2_init_writer(&pb, dst, dst_size);
+
+    for (k = 0; k < bpp; k++) {
+        ofssrc = bytestream2_get_be32(&ptrs); /* offset of this plane's stream within buf */
+
+        if (!ofssrc) /* a zero offset means nothing is applied to this plane */
+            continue;
+
+        if (ofssrc >= buf_end - buf) /* offset outside the packet: skip only this plane */
+            continue;
+
+        bytestream2_init(&gb, buf + ofssrc, buf_end - (buf + ofssrc));
+        for (j = 0; j < ncolumns; j++) {
+            ofsdst = (j + k * ncolumns) * 4 - h * (2 * k); /* when h is set, 2 bytes are subtracted per preceding plane */
+            /* NOTE(review): skip/mask are narrowed below but never restored for column 0 of the next plane - confirm intended. */
+            if (h && (j == (ncolumns - 1))) {
+                skip = 0x8000;
+                mask = skip - 1;
+            }
+
+            i = bytestream2_get_be32(&gb); /* number of opcodes for this column */
+            while (i > 0 && bytestream2_get_bytes_left(&gb) > 4) {
+                opcode = bytestream2_get_be32(&gb);
+
+                if (opcode == 0) { /* repeat: a count followed by the value to write count times */
+                    if (h && (j == ncolumns - 1)) { /* last column with h set: count and value are 16-bit */
+                        opcode = bytestream2_get_be16(&gb);
+                        x = bytestream2_get_be16(&gb);
+                    } else {
+                        opcode = bytestream2_get_be32(&gb);
+                        x = bytestream2_get_be32(&gb);
+                    }
+
+                    while (opcode && bytestream2_get_bytes_left_p(&pb) > 1) {
+                        bytestream2_seek_p(&pb, ofsdst, SEEK_SET);
+                        if (h && (j == ncolumns - 1))
+                            bytestream2_put_be16(&pb, x);
+                        else
+                            bytestream2_put_be32(&pb, x);
+                        ofsdst += dstpitch;
+                        opcode--;
+                    }
+                } else if (opcode < skip) { /* skip: advance the destination by opcode steps without writing */
+                    ofsdst += opcode * dstpitch;
+                } else { /* copy: the masked opcode is the number of values to read from gb and write */
+                    opcode &= mask;
+
+                    while (opcode && bytestream2_get_bytes_left(&gb) > 1 &&
+                           bytestream2_get_bytes_left_p(&pb) > 1) {
+                        bytestream2_seek_p(&pb, ofsdst, SEEK_SET);
+                        if (h && (j == ncolumns - 1)) {
+                            bytestream2_put_be16(&pb, bytestream2_get_be16(&gb));
+                        } else {
+                            bytestream2_put_be32(&pb, bytestream2_get_be32(&gb));
+                        }
+                        ofsdst += dstpitch;
+                        opcode--;
+                    }
+                }
+                i--;
+            }
+        }
+    }
+}
+
+static void decode_delta_d(uint8_t *dst,
+                           const uint8_t *buf, const uint8_t *buf_end,
+                           int w, int flag, int bpp, int dst_size) /* flag is not referenced in this body */
+{
+    int planepitch = FFALIGN(w, 16) >> 3; /* presumably bytes per plane row padded to 16 pixels; FFALIGN is defined outside this chunk */
+    int pitch = planepitch * bpp; /* planepitch for each of the bpp planes */
+    int planepitch_byte = (w + 7) / 8; /* bytes per plane row rounded up to whole bytes only */
+    unsigned entries, ofssrc;
+    GetByteContext gb, ptrs;
+    PutByteContext pb;
+    int k;
+    /* Patch dst in place: each plane carries a list of (opcode, offset) entries that write 32-bit values. */
+    if (buf_end - buf <= 4 * bpp) /* need more than the bpp 32-bit plane offsets read below */
+        return;
+
+    bytestream2_init_writer(&pb, dst, dst_size);
+    bytestream2_init(&ptrs, buf, bpp * 4);
+
+    for (k = 0; k < bpp; k++) {
+        ofssrc = bytestream2_get_be32(&ptrs); /* offset of this plane's entry list within buf */
+
+        if (!ofssrc) /* a zero offset means nothing is applied to this plane */
+            continue;
+
+        if (ofssrc >= buf_end - buf) /* offset outside the packet: skip only this plane */
+            continue;
+
+        bytestream2_init(&gb, buf + ofssrc, buf_end - (buf + ofssrc));
+
+        entries = bytestream2_get_be32(&gb);
+        while (entries && bytestream2_get_bytes_left(&gb) >= 8) {
+            int32_t opcode  = bytestream2_get_be32(&gb); /* >= 0: repeat count; < 0: negated copy count */
+            unsigned offset = bytestream2_get_be32(&gb);
+            /* Convert offset (row * planepitch_byte + byte in row) into a dst position inside plane k. */
+            bytestream2_seek_p(&pb, (offset / planepitch_byte) * pitch + (offset % planepitch_byte) + k * planepitch, SEEK_SET);
+            if (opcode >= 0) { /* repeat: one value is written opcode times, one pitch apart */
+                uint32_t x = bytestream2_get_be32(&gb);
+                while (opcode && bytestream2_get_bytes_left_p(&pb) > 0) {
+                    bytestream2_put_be32(&pb, x);
+                    bytestream2_skip_p(&pb, pitch - 4); /* move on by the rest of the pitch after the 4 bytes written */
+                    opcode--;
+                }
+            } else { /* copy: -opcode values are read from gb and written one pitch apart */
+                opcode = -opcode;
+                while (opcode && bytestream2_get_bytes_left(&gb) > 0) {
+                    bytestream2_put_be32(&pb, bytestream2_get_be32(&gb));
+                    bytestream2_skip_p(&pb, pitch - 4);
+                    opcode--;
+                }
+            }
+            entries--;
+        }
+    }
+}
+
+static void decode_delta_e(uint8_t *dst,
+                           const uint8_t *buf, const uint8_t *buf_end,
+                           int w, int flag, int bpp, int dst_size) /* flag is not referenced in this body */
+{
+    int planepitch = FFALIGN(w, 16) >> 3; /* presumably bytes per plane row padded to 16 pixels; FFALIGN is defined outside this chunk */
+    int pitch = planepitch * bpp; /* planepitch for each of the bpp planes */
+    int planepitch_byte = (w + 7) / 8; /* bytes per plane row rounded up to whole bytes only */
+    unsigned entries, ofssrc;
+    GetByteContext gb, ptrs;
+    PutByteContext pb;
+    int k;
+    /* Patch dst in place: each plane carries a list of (opcode, offset) entries that write 16-bit values. */
+    if (buf_end - buf <= 4 * bpp) /* need more than the bpp 32-bit plane offsets read below */
+        return;
+
+    bytestream2_init_writer(&pb, dst, dst_size);
+    bytestream2_init(&ptrs, buf, bpp * 4);
+
+    for (k = 0; k < bpp; k++) {
+        ofssrc = bytestream2_get_be32(&ptrs); /* offset of this plane's entry list within buf */
+
+        if (!ofssrc) /* a zero offset means nothing is applied to this plane */
+            continue;
+
+        if (ofssrc >= buf_end - buf) /* offset outside the packet: skip only this plane */
+            continue;
+
+        bytestream2_init(&gb, buf + ofssrc, buf_end - (buf + ofssrc));
+
+        entries = bytestream2_get_be16(&gb);
+        while (entries && bytestream2_get_bytes_left(&gb) >= 6) {
+            int16_t opcode  = bytestream2_get_be16(&gb); /* >= 0: repeat count; < 0: negated copy count */
+            unsigned offset = bytestream2_get_be32(&gb);
+            /* Convert offset (row * planepitch_byte + byte in row) into a dst position inside plane k. */
+            bytestream2_seek_p(&pb, (offset / planepitch_byte) * pitch + (offset % planepitch_byte) + k * planepitch, SEEK_SET);
+            if (opcode >= 0) { /* repeat: one value is written opcode times, one pitch apart */
+                uint16_t x = bytestream2_get_be16(&gb);
+                while (opcode && bytestream2_get_bytes_left_p(&pb) > 0) {
+                    bytestream2_put_be16(&pb, x);
+                    bytestream2_skip_p(&pb, pitch - 2); /* move on by the rest of the pitch after the 2 bytes written */
+                    opcode--;
+                }
+            } else { /* copy: -opcode values are read from gb and written one pitch apart */
+                opcode = -opcode;
+                while (opcode && bytestream2_get_bytes_left(&gb) > 0) {
+                    bytestream2_put_be16(&pb, bytestream2_get_be16(&gb));
+                    bytestream2_skip_p(&pb, pitch - 2);
+                    opcode--;
+                }
+            }
+            entries--;
+        }
+    }
+}
+
+static void decode_delta_l(uint8_t *dst,
+                           const uint8_t *buf, const uint8_t *buf_end,
+                           int w, int flag, int bpp, int dst_size)
+{
+    GetByteContext off0, off1, dgb, ogb;
+    PutByteContext pb;
+    unsigned poff0, poff1;
+    int i, k, dstpitch;
+    int planepitch_byte = (w + 7) / 8; /* bytes per plane row rounded up to whole bytes */
+    int planepitch = ((w + 15) / 16) * 2; /* bytes per plane row rounded up to whole 16-bit words */
+    int pitch = planepitch * bpp; /* planepitch for each of the bpp planes */
+    /* Patch dst in place: per plane, (offset, count) pairs from one stream select 16-bit words from another. */
+    if (buf_end - buf <= 64) /* offset tables are read from buf and buf + 32; require more than 64 bytes */
+        return;
+
+    bytestream2_init(&off0, buf, buf_end - buf);
+    bytestream2_init(&off1, buf + 32, buf_end - (buf + 32));
+    bytestream2_init_writer(&pb, dst, dst_size);
+
+    dstpitch = flag ? (((w + 7) / 8) * bpp): 2; /* flag set: successive words are one byte-aligned row of all planes apart; otherwise adjacent */
+
+    for (k = 0; k < bpp; k++) {
+        poff0 = bytestream2_get_be32(&off0); /* data stream offset, doubled below before use */
+        poff1 = bytestream2_get_be32(&off1); /* (offset, count) stream offset, doubled below before use */
+
+        if (!poff0) /* a zero offset means nothing is applied to this plane */
+            continue;
+
+        if (2LL * poff0 >= buf_end - buf) /* 2LL keeps the doubled offset from wrapping in 32 bits */
+            return;
+
+        if (2LL * poff1 >= buf_end - buf)
+            return;
+
+        bytestream2_init(&dgb, buf + 2 * poff0, buf_end - (buf + 2 * poff0));
+        bytestream2_init(&ogb, buf + 2 * poff1, buf_end - (buf + 2 * poff1));
+        /* The pair list ends at a 0xFFFF offset or when fewer than 4 bytes remain. */
+        while ((bytestream2_peek_be16(&ogb)) != 0xFFFF && bytestream2_get_bytes_left(&ogb) >= 4) {
+            uint32_t offset = bytestream2_get_be16(&ogb);
+            int16_t cnt = bytestream2_get_be16(&ogb); /* < 0: negated repeat count; >= 0: copy count */
+            uint16_t data;
+            /* Convert the doubled offset (row * planepitch_byte + byte in row) into a dst position inside plane k. */
+            offset = ((2 * offset) / planepitch_byte) * pitch + ((2 * offset) % planepitch_byte) + k * planepitch;
+            if (cnt < 0) { /* repeat: one data word is written -cnt times */
+                bytestream2_seek_p(&pb, offset, SEEK_SET);
+                cnt = -cnt;
+                data = bytestream2_get_be16(&dgb);
+                for (i = 0; i < cnt; i++) {
+                    bytestream2_put_be16(&pb, data);
+                    bytestream2_skip_p(&pb, dstpitch - 2); /* move on by the rest of dstpitch after the 2 bytes written */
+                }
+            } else { /* copy: cnt data words are read and written in turn */
+                bytestream2_seek_p(&pb, offset, SEEK_SET);
+                for (i = 0; i < cnt; i++) {
+                    data = bytestream2_get_be16(&dgb);
+                    bytestream2_put_be16(&pb, data);
+                    bytestream2_skip_p(&pb, dstpitch - 2);
+                }
+            }
+        }
+    }
+}
+
+static int unsupported(AVCodecContext *avctx) /* report an unhandled bitmap variant; always returns AVERROR_INVALIDDATA */
+{
+    IffContext *s = avctx->priv_data;
+    avpriv_request_sample(avctx, "bitmap (compression 0x%0x, bpp %i, ham %i, interlaced %i)", s->compression, s->bpp, s->ham, s->is_interlaced); /* message carries the context fields identifying the variant */
+    return AVERROR_INVALIDDATA;
+}
+
+static int decode_frame(AVCodecContext *avctx,
+                        void *data, int *got_frame,
+                        AVPacket *avpkt)
 {
     IffContext *s          = avctx->priv_data;
+    AVFrame *frame         = data; /* the decoded picture is written into this frame */
     const uint8_t *buf     = avpkt->data;
     int buf_size           = avpkt->size;
     const uint8_t *buf_end = buf + buf_size;
     int y, plane, res;
+    GetByteContext *gb = &s->gb;
+    const AVPixFmtDescriptor *desc;
 
-    if ((res = ff_reget_buffer(avctx, s->frame)) < 0)
+    bytestream2_init(gb, avpkt->data, avpkt->size);
+    /* Decode one packet: parse the header, get a frame buffer, then dispatch on s->compression. */
+    if ((res = extract_header(avctx, avpkt)) < 0)
+        return res;
+
+    if ((res = ff_get_buffer(avctx, frame, 0)) < 0)
         return res;
+    s->frame = frame;
+
+    buf      += bytestream2_tell(gb); /* skip what gb has consumed so far (presumably by extract_header - confirm) */
+    buf_size -= bytestream2_tell(gb);
+    desc = av_pix_fmt_desc_get(avctx->pix_fmt);
 
     if (!s->init && avctx->bits_per_coded_sample <= 8 &&
-        avctx->pix_fmt != AV_PIX_FMT_GRAY8) {
-        if ((res = cmap_read_palette(avctx, (uint32_t *)s->frame->data[1])) < 0)
+        avctx->pix_fmt == AV_PIX_FMT_PAL8) { /* first packet, paletted output: palette goes straight into the frame */
+        if ((res = cmap_read_palette(avctx, (uint32_t *)frame->data[1])) < 0)
+            return res;
+    } else if (!s->init && avctx->bits_per_coded_sample <= 8 &&
+               avctx->pix_fmt == AV_PIX_FMT_RGB32) { /* first packet, RGB32 output: palette is kept in s->mask_palbuf */
+        if ((res = cmap_read_palette(avctx, s->mask_palbuf)) < 0)
             return res;
     }
     s->init = 1;
 
-    if (avctx->codec_tag == MKTAG('I', 'L', 'B', 'M')) { // interleaved
+    if (s->compression <= 0xff && (avctx->codec_tag == MKTAG('A', 'N', 'I', 'M'))) {
+        if (avctx->pix_fmt == AV_PIX_FMT_PAL8)
+            memcpy(s->pal[0], s->frame->data[1], 256 * 4); /* keep a copy of the palette in s->pal[0] */
+    }
+
+    switch (s->compression) {
+    case 0x0: /* handled below as uncompressed plane/pixel data, per container tag */
+        if (avctx->codec_tag == MKTAG('A', 'C', 'B', 'M')) {
+            if (avctx->pix_fmt == AV_PIX_FMT_PAL8 || avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
+                memset(frame->data[0], 0, avctx->height * frame->linesize[0]);
+                for (plane = 0; plane < s->bpp; plane++) { /* here each plane's rows for the whole picture are consecutive */
+                    for (y = 0; y < avctx->height && buf < buf_end; y++) {
+                        uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                        decodeplane8(row, buf, FFMIN(s->planesize, buf_end - buf), plane);
+                        buf += s->planesize;
+                    }
+                }
+            } else if (s->ham) { // HAM to AV_PIX_FMT_BGR32
+                memset(frame->data[0], 0, avctx->height * frame->linesize[0]);
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                    memset(s->ham_buf, 0, s->planesize * 8);
+                    for (plane = 0; plane < s->bpp; plane++) {
+                        const uint8_t * start = buf + (plane * avctx->height + y) * s->planesize; /* row y of this plane */
+                        if (start >= buf_end)
+                            break;
+                        decodeplane8(s->ham_buf, start, FFMIN(s->planesize, buf_end - start), plane);
+                    }
+                    decode_ham_plane32((uint32_t *)row, s->ham_buf, s->ham_palbuf, s->planesize);
+                }
+            } else
+                return unsupported(avctx);
+        } else if (avctx->codec_tag == MKTAG('D', 'E', 'E', 'P')) {
+            int raw_width = avctx->width * (av_get_bits_per_pixel(desc) >> 3); /* bytes per input row */
+            int x;
+            for (y = 0; y < avctx->height && buf < buf_end; y++) {
+                uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                memcpy(row, buf, FFMIN(raw_width, buf_end - buf));
+                buf += raw_width;
+                if (avctx->pix_fmt == AV_PIX_FMT_BGR32) {
+                    for (x = 0; x < avctx->width; x++)
+                        row[4 * x + 3] = row[4 * x + 3] & 0xF0 | (row[4 * x + 3] >> 4); /* copy the high nibble of byte 3 into its low nibble */
+                }
+            }
+        } else if (avctx->codec_tag == MKTAG('I', 'L', 'B', 'M') || // interleaved
+                   avctx->codec_tag == MKTAG('A', 'N', 'I', 'M')) {
+            if (avctx->codec_tag == MKTAG('A', 'N', 'I', 'M'))
+                memcpy(s->video[0], buf, FFMIN(buf_end - buf, s->video_size)); /* keep the raw planes in s->video[0] */
+            if (avctx->pix_fmt == AV_PIX_FMT_PAL8 || avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                    memset(row, 0, avctx->width);
+                    for (plane = 0; plane < s->bpp && buf < buf_end; plane++) {
+                        decodeplane8(row, buf, FFMIN(s->planesize, buf_end - buf), plane);
+                        buf += s->planesize;
+                    }
+                }
+            } else if (s->ham) { // HAM to AV_PIX_FMT_BGR32
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                    memset(s->ham_buf, 0, s->planesize * 8);
+                    for (plane = 0; plane < s->bpp && buf < buf_end; plane++) {
+                        decodeplane8(s->ham_buf, buf, FFMIN(s->planesize, buf_end - buf), plane);
+                        buf += s->planesize;
+                    }
+                    decode_ham_plane32((uint32_t *)row, s->ham_buf, s->ham_palbuf, s->planesize);
+                }
+            } else { // AV_PIX_FMT_BGR32
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                    memset(row, 0, avctx->width << 2);
+                    for (plane = 0; plane < s->bpp && buf < buf_end; plane++) {
+                        decodeplane32((uint32_t *)row, buf,
+                                      FFMIN(s->planesize, buf_end - buf), plane);
+                        buf += s->planesize;
+                    }
+                }
+            }
+        } else if (avctx->codec_tag == MKTAG('P', 'B', 'M', ' ')) { // IFF-PBM
+            if (avctx->pix_fmt == AV_PIX_FMT_PAL8 || avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
+                for (y = 0; y < avctx->height && buf_end > buf; y++) {
+                    uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                    memcpy(row, buf, FFMIN(avctx->width, buf_end - buf));
+                    buf += avctx->width + (avctx->width % 2); // padding if odd
+                }
+            } else if (s->ham) { // IFF-PBM: HAM to AV_PIX_FMT_BGR32
+                for (y = 0; y < avctx->height && buf_end > buf; y++) {
+                    uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                    memcpy(s->ham_buf, buf, FFMIN(avctx->width, buf_end - buf));
+                    buf += avctx->width + (avctx->width & 1); // padding if odd
+                    decode_ham_plane32((uint32_t *)row, s->ham_buf, s->ham_palbuf, s->planesize);
+                }
+            } else
+                return unsupported(avctx);
+        }
+        break;
+    case 0x1: /* every path below unpacks rows with decode_byterun() or decode_deep_rle32() */
+        if (avctx->codec_tag == MKTAG('I', 'L', 'B', 'M') || // interleaved
+            avctx->codec_tag == MKTAG('A', 'N', 'I', 'M')) {
+            if (avctx->pix_fmt == AV_PIX_FMT_PAL8 || avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
+                uint8_t *video = s->video[0];
+
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                    memset(row, 0, avctx->width);
+                    for (plane = 0; plane < s->bpp; plane++) {
+                        buf += decode_byterun(s->planebuf, s->planesize, gb);
+                        if (avctx->codec_tag == MKTAG('A', 'N', 'I', 'M')) {
+                            memcpy(video, s->planebuf, s->planesize); /* keep the unpacked plane row in s->video[0] */
+                            video += s->planesize;
+                        }
+                        decodeplane8(row, s->planebuf, s->planesize, plane);
+                    }
+                }
+            } else if (avctx->bits_per_coded_sample <= 8) { //8-bit (+ mask) to AV_PIX_FMT_BGR32
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                    memset(s->mask_buf, 0, avctx->width * sizeof(uint32_t));
+                    for (plane = 0; plane < s->bpp; plane++) {
+                        buf += decode_byterun(s->planebuf, s->planesize, gb);
+                        decodeplane32(s->mask_buf, s->planebuf, s->planesize, plane);
+                    }
+                    lookup_pal_indicies((uint32_t *)row, s->mask_buf, s->mask_palbuf, avctx->width);
+                }
+            } else if (s->ham) { // HAM to AV_PIX_FMT_BGR32
+                uint8_t *video = s->video[0];
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                    memset(s->ham_buf, 0, s->planesize * 8);
+                    for (plane = 0; plane < s->bpp; plane++) {
+                        buf += decode_byterun(s->planebuf, s->planesize, gb);
+                        if (avctx->codec_tag == MKTAG('A', 'N', 'I', 'M')) {
+                            memcpy(video, s->planebuf, s->planesize);
+                            video += s->planesize;
+                        }
+                        decodeplane8(s->ham_buf, s->planebuf, s->planesize, plane);
+                    }
+                    decode_ham_plane32((uint32_t *)row, s->ham_buf, s->ham_palbuf, s->planesize);
+                }
+            } else { // AV_PIX_FMT_BGR32
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                    memset(row, 0, avctx->width << 2);
+                    for (plane = 0; plane < s->bpp; plane++) {
+                        buf += decode_byterun(s->planebuf, s->planesize, gb);
+                        decodeplane32((uint32_t *)row, s->planebuf, s->planesize, plane);
+                    }
+                }
+            }
+        } else if (avctx->codec_tag == MKTAG('P', 'B', 'M', ' ')) { // IFF-PBM
+            if (avctx->pix_fmt == AV_PIX_FMT_PAL8 || avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                    buf += decode_byterun(row, avctx->width, gb);
+                }
+            } else if (s->ham) { // IFF-PBM: HAM to AV_PIX_FMT_BGR32
+                for (y = 0; y < avctx->height; y++) {
+                    uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                    buf += decode_byterun(s->ham_buf, avctx->width, gb);
+                    decode_ham_plane32((uint32_t *)row, s->ham_buf, s->ham_palbuf, s->planesize);
+                }
+            } else
+                return unsupported(avctx);
+        } else if (avctx->codec_tag == MKTAG('D', 'E', 'E', 'P')) { // IFF-DEEP
+            if (av_get_bits_per_pixel(desc) == 32)
+                decode_deep_rle32(frame->data[0], buf, buf_size, avctx->width, avctx->height, frame->linesize[0]);
+            else
+                return unsupported(avctx);
+        }
+        break;
+    case 0x4: /* RGB8 / RGBN tags only */
+        if (avctx->codec_tag == MKTAG('R', 'G', 'B', '8') && avctx->pix_fmt == AV_PIX_FMT_RGB32)
+            decode_rgb8(gb, frame->data[0], avctx->width, avctx->height, frame->linesize[0]);
+        else if (avctx->codec_tag == MKTAG('R', 'G', 'B', 'N') && avctx->pix_fmt == AV_PIX_FMT_RGB444)
+            decode_rgbn(gb, frame->data[0], avctx->width, avctx->height, frame->linesize[0]);
+        else
+            return unsupported(avctx);
+        break;
+    case 0x5: /* DEEP tag with 32 bits per pixel only */
+        if (avctx->codec_tag == MKTAG('D', 'E', 'E', 'P')) {
+            if (av_get_bits_per_pixel(desc) == 32)
+                decode_deep_tvdc32(frame->data[0], buf, buf_size, avctx->width, avctx->height, frame->linesize[0], s->tvdc);
+            else
+                return unsupported(avctx);
+        } else
+            return unsupported(avctx);
+        break;
+    case 0x300: /* all cases from here on patch s->video[0]; the picture is rendered from it after the switch */
+    case 0x301:
+        decode_short_horizontal_delta(s->video[0], buf, buf_end, avctx->width, s->bpp, s->video_size);
+        break;
+    case 0x500:
+    case 0x501:
+        decode_byte_vertical_delta(s->video[0], buf, buf_end, avctx->width, s->is_brush, s->bpp, s->video_size);
+        break;
+    case 0x700:
+    case 0x701:
+        if (s->is_short)
+            decode_short_vertical_delta(s->video[0], buf, buf_end, avctx->width, s->bpp, s->video_size);
+        else
+            decode_long_vertical_delta(s->video[0], buf, buf_end, avctx->width, s->bpp, s->video_size);
+        break;
+    case 0x800:
+    case 0x801:
+        if (s->is_short)
+            decode_short_vertical_delta2(s->video[0], buf, buf_end, avctx->width, s->bpp, s->video_size);
+        else
+            decode_long_vertical_delta2(s->video[0], buf, buf_end, avctx->width, s->bpp, s->video_size);
+        break;
+    case 0x4a00:
+    case 0x4a01:
+        decode_delta_j(s->video[0], buf, buf_end, avctx->width, avctx->height, s->bpp, s->video_size);
+        break;
+    case 0x6400:
+    case 0x6401:
+        if (s->is_interlaced)
+            return unsupported(avctx);
+        decode_delta_d(s->video[0], buf, buf_end, avctx->width, s->is_interlaced, s->bpp, s->video_size);
+        break;
+    case 0x6500:
+    case 0x6501:
+        if (s->is_interlaced)
+            return unsupported(avctx);
+        decode_delta_e(s->video[0], buf, buf_end, avctx->width, s->is_interlaced, s->bpp, s->video_size);
+        break;
+    case 0x6c00:
+    case 0x6c01:
+        decode_delta_l(s->video[0], buf, buf_end, avctx->width, s->is_short, s->bpp, s->video_size);
+        break;
+    default:
+        return unsupported(avctx);
+    }
+
+    if (s->compression <= 0xff && (avctx->codec_tag == MKTAG('A', 'N', 'I', 'M'))) { /* duplicate palette and planes into the second buffer pair */
+        memcpy(s->pal[1], s->pal[0], 256 * 4);
+        memcpy(s->video[1], s->video[0], s->video_size);
+    }
+
+    if (s->compression > 0xff) { /* delta case: convert the patched planes in s->video[0] into the output frame */
         if (avctx->pix_fmt == AV_PIX_FMT_PAL8 || avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
+            buf = s->video[0];
             for (y = 0; y < avctx->height; y++) {
-                uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
+                uint8_t *row = &frame->data[0][y * frame->linesize[0]];
                 memset(row, 0, avctx->width);
-                for (plane = 0; plane < avctx->bits_per_coded_sample; plane++) {
-                    buf += decode_byterun(s->planebuf, s->planesize, buf, buf_end);
-                    decodeplane8(row, s->planebuf, s->planesize, plane);
+                for (plane = 0; plane < s->bpp; plane++) {
+                    decodeplane8(row, buf, s->planesize, plane);
+                    buf += s->planesize;
                 }
             }
-        } else { // AV_PIX_FMT_BGR32
+            memcpy(frame->data[1], s->pal[0], 256 * 4);
+        } else if (s->ham) {
+            int i, count = 1 << s->ham;
+
+            buf = s->video[0];
+            memset(s->ham_palbuf, 0, (1 << s->ham) * 2 * sizeof(uint32_t)); /* rebuild the HAM lookup table from s->pal[0] */
+            for (i = 0; i < count; i++) {
+                s->ham_palbuf[i*2+1] = s->pal[0][i];
+            }
+            for (i = 0; i < count; i++) {
+                uint32_t tmp = i << (8 - s->ham);
+                tmp |= tmp >> s->ham; /* widen the s->ham-bit value i to 8 bits by repeating its bits */
+                s->ham_palbuf[(i+count)*2]     = 0xFF00FFFF;
+                s->ham_palbuf[(i+count*2)*2]   = 0xFFFFFF00;
+                s->ham_palbuf[(i+count*3)*2]   = 0xFFFF00FF;
+                s->ham_palbuf[(i+count)*2+1]   = 0xFF000000 | tmp << 16;
+                s->ham_palbuf[(i+count*2)*2+1] = 0xFF000000 | tmp;
+                s->ham_palbuf[(i+count*3)*2+1] = 0xFF000000 | tmp << 8;
+            }
+            if (s->masking == MASK_HAS_MASK) {
+                for (i = 0; i < 8 * (1 << s->ham); i++)
+                    s->ham_palbuf[(1 << s->bpp) + i] = s->ham_palbuf[i] | 0xFF000000;
+            }
             for (y = 0; y < avctx->height; y++) {
-                uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
-                memset(row, 0, avctx->width << 2);
-                for (plane = 0; plane < avctx->bits_per_coded_sample; plane++) {
-                    buf += decode_byterun(s->planebuf, s->planesize, buf, buf_end);
-                    decodeplane32((uint32_t *)row, s->planebuf, s->planesize, plane);
+                uint8_t *row = &frame->data[0][y * frame->linesize[0]];
+                memset(s->ham_buf, 0, s->planesize * 8);
+                for (plane = 0; plane < s->bpp; plane++) {
+                    decodeplane8(s->ham_buf, buf, s->planesize, plane);
+                    buf += s->planesize;
                 }
+                decode_ham_plane32((uint32_t *)row, s->ham_buf, s->ham_palbuf, s->planesize);
             }
+        } else {
+            return unsupported(avctx);
         }
-    } else {
-        for (y = 0; y < avctx->height; y++) {
-            uint8_t *row = &s->frame->data[0][y * s->frame->linesize[0]];
-            buf += decode_byterun(row, avctx->width, buf, buf_end);
+
+        if (!s->is_brush) { /* swap the two plane/palette buffers so the next delta patches the other one */
+            FFSWAP(uint8_t *, s->video[0], s->video[1]);
+            FFSWAP(uint32_t *, s->pal[0], s->pal[1]);
         }
     }
 
-    if ((res = av_frame_ref(data, s->frame)) < 0)
-        return res;
+    if (avpkt->flags & AV_PKT_FLAG_KEY) { /* frame type follows the packet's key flag */
+        frame->key_frame = 1;
+        frame->pict_type = AV_PICTURE_TYPE_I;
+    } else {
+        frame->key_frame = 0;
+        frame->pict_type = AV_PICTURE_TYPE_P;
+    }
 
     *got_frame = 1;
 
     return buf_size;
 }
 
+#if CONFIG_IFF_ILBM_DECODER
 AVCodec ff_iff_ilbm_decoder = {
-    .name           = "iff_ilbm",
-    .long_name      = NULL_IF_CONFIG_SMALL("IFF ILBM"),
+    .name           = "iff", /* replaces the separate "iff_ilbm" and "iff_byterun1" entries */
+    .long_name      = NULL_IF_CONFIG_SMALL("IFF ACBM/ANIM/DEEP/ILBM/PBM/RGB8/RGBN"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_IFF_ILBM,
     .priv_data_size = sizeof(IffContext),
     .init           = decode_init,
     .close          = decode_end,
-    .decode         = decode_frame_ilbm,
-    .capabilities   = AV_CODEC_CAP_DR1,
-};
-
-AVCodec ff_iff_byterun1_decoder = {
-    .name           = "iff_byterun1",
-    .long_name      = NULL_IF_CONFIG_SMALL("IFF ByteRun1"),
-    .type           = AVMEDIA_TYPE_VIDEO,
-    .id             = AV_CODEC_ID_IFF_BYTERUN1,
-    .priv_data_size = sizeof(IffContext),
-    .init           = decode_init,
-    .close          = decode_end,
-    .decode         = decode_frame_byterun1,
+    .decode         = decode_frame, /* one entry point that dispatches on the compression value */
     .capabilities   = AV_CODEC_CAP_DR1,
 };
+#endif /* CONFIG_IFF_ILBM_DECODER */
diff --git a/libavcodec/iirfilter-test.c b/libavcodec/iirfilter-test.c
index 5aa156c..cd250a3 100644
--- a/libavcodec/iirfilter-test.c
+++ b/libavcodec/iirfilter-test.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -48,7 +48,7 @@ int main(void)
     for (i = 0; i < SIZE; i++)
         printf("%6d %6d\n", x[i], y[i]);
 
-    ff_iir_filter_free_coeffs(fcoeffs);
-    ff_iir_filter_free_state(fstate);
+    ff_iir_filter_free_coeffsp(&fcoeffs);
+    ff_iir_filter_free_statep(&fstate);
     return 0;
 }
diff --git a/libavcodec/iirfilter.c b/libavcodec/iirfilter.c
index 442c837..a8c9b9b 100644
--- a/libavcodec/iirfilter.c
+++ b/libavcodec/iirfilter.c
@@ -2,20 +2,20 @@
  * IIR filter
  * Copyright (c) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -197,7 +197,7 @@ av_cold struct FFIIRFilterCoeffs *ff_iir_filter_init_coeffs(void *avc,
         return c;
 
 init_fail:
-    ff_iir_filter_free_coeffs(c);
+    ff_iir_filter_free_coeffsp(&c);
     return NULL;
 }
 
@@ -302,16 +302,24 @@ void ff_iir_filter_flt(const struct FFIIRFilterCoeffs *c,
     }
 }
 
-av_cold void ff_iir_filter_free_state(struct FFIIRFilterState *state)
+av_cold void ff_iir_filter_free_statep(struct FFIIRFilterState **state)
 {
-    av_free(state);
+    av_freep(state);
 }
 
-av_cold void ff_iir_filter_free_coeffs(struct FFIIRFilterCoeffs *coeffs)
+av_cold void ff_iir_filter_free_coeffsp(struct FFIIRFilterCoeffs **coeffsp)
 {
+    struct FFIIRFilterCoeffs *coeffs = *coeffsp;
     if (coeffs) {
-        av_free(coeffs->cx);
-        av_free(coeffs->cy);
+        av_freep(&coeffs->cx);
+        av_freep(&coeffs->cy);
     }
-    av_free(coeffs);
+    av_freep(coeffsp);
+}
+
+void ff_iir_filter_init(FFIIRFilterContext *f) {
+    f->filter_flt = ff_iir_filter_flt; /* start from ff_iir_filter_flt, defined earlier in this file */
+    /* pass the context to the MIPS init when HAVE_MIPSFPU is nonzero */
+    if (HAVE_MIPSFPU)
+        ff_iir_filter_init_mips(f);
 }
diff --git a/libavcodec/iirfilter.h b/libavcodec/iirfilter.h
index bc65a96..6f7bba6 100644
--- a/libavcodec/iirfilter.h
+++ b/libavcodec/iirfilter.h
@@ -2,20 +2,20 @@
  * IIR filter
  * Copyright (c) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,6 +47,29 @@ enum IIRFilterMode{
     FF_FILTER_MODE_BANDSTOP,
 };
 
+typedef struct FFIIRFilterContext { /* holds the float filter entry point; filled in by ff_iir_filter_init() */
+    /**
+     * Perform IIR filtering on floating-point input samples.
+     *
+     * @param coeffs pointer to filter coefficients
+     * @param state  pointer to filter state
+     * @param size   input length
+     * @param src    source samples
+     * @param sstep  source stride
+     * @param dst    filtered samples (destination may be the same as input)
+     * @param dstep  destination stride
+     */
+    void (*filter_flt)(const struct FFIIRFilterCoeffs *coeffs,
+                        struct FFIIRFilterState *state, int size,
+                        const float *src, int sstep, float *dst, int dstep);
+} FFIIRFilterContext;
+
+/**
+ * Initialize an FFIIRFilterContext: set filter_flt to ff_iir_filter_flt, then call the MIPS init if HAVE_MIPSFPU.
+ */
+void ff_iir_filter_init(FFIIRFilterContext *f);
+void ff_iir_filter_init_mips(FFIIRFilterContext *f);
+
 /**
  * Initialize filter coefficients.
  *
@@ -81,14 +104,14 @@ struct FFIIRFilterState* ff_iir_filter_init_state(int order);
  *
  * @param coeffs pointer allocated with ff_iir_filter_init_coeffs()
  */
-void ff_iir_filter_free_coeffs(struct FFIIRFilterCoeffs *coeffs);
+void ff_iir_filter_free_coeffsp(struct FFIIRFilterCoeffs **coeffs);
 
 /**
- * Free filter state.
+ * Free and zero filter state.
  *
- * @param state pointer allocated with ff_iir_filter_init_state()
+ * @param state pointer to pointer allocated with ff_iir_filter_init_state()
  */
-void ff_iir_filter_free_state(struct FFIIRFilterState *state);
+void ff_iir_filter_free_statep(struct FFIIRFilterState **state);
 
 /**
  * Perform IIR filtering on signed 16-bit input samples.
diff --git a/libavcodec/imc.c b/libavcodec/imc.c
index 9a6912d..ac20920 100644
--- a/libavcodec/imc.c
+++ b/libavcodec/imc.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006 Benjamin Larsson
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,6 +35,7 @@
 #include <stdio.h>
 
 #include "libavutil/channel_layout.h"
+#include "libavutil/ffmath.h"
 #include "libavutil/float_dsp.h"
 #include "libavutil/internal.h"
 #include "avcodec.h"
@@ -94,7 +95,7 @@ typedef struct IMCContext {
     GetBitContext gb;
 
     BswapDSPContext bdsp;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     FFTContext fft;
     DECLARE_ALIGNED(32, FFTComplex, samples)[COEFFS / 2];
     float *out_samples;
@@ -135,8 +136,8 @@ static av_cold void iac_generate_tabs(IMCContext *q, int sampling_rate)
 
         if (i > 0) {
             tb = bark - prev_bark;
-            q->weights1[i - 1] = pow(10.0, -1.0 * tb);
-            q->weights2[i - 1] = pow(10.0, -2.7 * tb);
+            q->weights1[i - 1] = ff_exp10(-1.0 * tb);
+            q->weights2[i - 1] = ff_exp10(-2.7 * tb);
         }
         prev_bark = bark;
 
@@ -178,6 +179,14 @@ static av_cold int imc_decode_init(AVCodecContext *avctx)
     IMCContext *q = avctx->priv_data;
     double r1, r2;
 
+    if (avctx->codec_id == AV_CODEC_ID_IAC && avctx->sample_rate > 96000) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Strange sample rate of %i, file likely corrupt or "
+               "needing a new table derivation method.\n",
+               avctx->sample_rate);
+        return AVERROR_PATCHWELCOME;
+    }
+
     if (avctx->codec_id == AV_CODEC_ID_IMC)
         avctx->channels = 1;
 
@@ -246,7 +255,13 @@ static av_cold int imc_decode_init(AVCodecContext *avctx)
         return ret;
     }
     ff_bswapdsp_init(&q->bdsp);
-    avpriv_float_dsp_init(&q->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    q->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!q->fdsp) {
+        ff_fft_end(&q->fft);
+
+        return AVERROR(ENOMEM);
+    }
+
     avctx->sample_fmt     = AV_SAMPLE_FMT_FLTP;
     avctx->channel_layout = avctx->channels == 1 ? AV_CH_LAYOUT_MONO
                                                  : AV_CH_LAYOUT_STEREO;
@@ -355,7 +370,7 @@ static void imc_decode_level_coefficients(IMCContext *q, int *levlCoeffBuf,
     float tmp, tmp2;
     // maybe some frequency division thingy
 
-    flcoeffs1[0] = 20000.0 / pow (2, levlCoeffBuf[0] * 0.18945); // 0.18945 = log2(10) * 0.05703125
+    flcoeffs1[0] = 20000.0 / exp2 (levlCoeffBuf[0] * 0.18945); // 0.18945 = log2(10) * 0.05703125
     flcoeffs2[0] = log2f(flcoeffs1[0]);
     tmp  = flcoeffs1[0];
     tmp2 = flcoeffs2[0];
@@ -449,8 +464,13 @@ static int bit_allocation(IMCContext *q, IMCChannel *chctx,
     for (i = 0; i < BANDS; i++)
         highest = FFMAX(highest, chctx->flcoeffs1[i]);
 
-    for (i = 0; i < BANDS - 1; i++)
+    for (i = 0; i < BANDS - 1; i++) {
+        if (chctx->flcoeffs5[i] <= 0) {
+            av_log(NULL, AV_LOG_ERROR, "flcoeffs5 %f invalid\n", chctx->flcoeffs5[i]);
+            return AVERROR_INVALIDDATA;
+        }
         chctx->flcoeffs4[i] = chctx->flcoeffs3[i] - log2f(chctx->flcoeffs5[i]);
+    }
     chctx->flcoeffs4[BANDS - 1] = limit;
 
     highest = highest * 0.25;
@@ -769,7 +789,8 @@ static int inverse_quant_coeff(IMCContext *q, IMCChannel *chctx,
 }
 
 
-static int imc_get_coeffs(IMCContext *q, IMCChannel *chctx)
+static void imc_get_coeffs(AVCodecContext *avctx,
+                           IMCContext *q, IMCChannel *chctx)
 {
     int i, j, cw_len, cw;
 
@@ -781,19 +802,19 @@ static int imc_get_coeffs(IMCContext *q, IMCChannel *chctx)
                 cw_len = chctx->CWlengthT[j];
                 cw = 0;
 
-                if (get_bits_count(&q->gb) + cw_len > 512) {
-                    ff_dlog(NULL, "Band %i coeff %i cw_len %i\n", i, j, cw_len);
-                    return AVERROR_INVALIDDATA;
+                if (cw_len && (!chctx->bandFlagsBuf[i] || !chctx->skipFlags[j])) {
+                    if (get_bits_count(&q->gb) + cw_len > 512) {
+                        av_log(avctx, AV_LOG_WARNING,
+                            "Potential problem on band %i, coefficient %i"
+                            ": cw_len=%i\n", i, j, cw_len);
+                    } else
+                        cw = get_bits(&q->gb, cw_len);
                 }
 
-                if (cw_len && (!chctx->bandFlagsBuf[i] || !chctx->skipFlags[j]))
-                    cw = get_bits(&q->gb, cw_len);
-
                 chctx->codewords[j] = cw;
             }
         }
     }
-    return 0;
 }
 
 static void imc_refine_bit_allocation(IMCContext *q, IMCChannel *chctx)
@@ -886,6 +907,13 @@ static int imc_decode_block(AVCodecContext *avctx, IMCContext *q, int ch)
         imc_decode_level_coefficients2(q, chctx->levlCoeffBuf, chctx->old_floor,
                                        chctx->flcoeffs1, chctx->flcoeffs2);
 
+    for(i=0; i<BANDS; i++) {
+        if(chctx->flcoeffs1[i] > INT_MAX) {
+            av_log(avctx, AV_LOG_ERROR, "scalefactor out of range\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
     memcpy(chctx->old_floor, chctx->flcoeffs1, 32 * sizeof(float));
 
     counter = 0;
@@ -967,11 +995,7 @@ static int imc_decode_block(AVCodecContext *avctx, IMCContext *q, int ch)
 
     memset(chctx->codewords, 0, sizeof(chctx->codewords));
 
-    if (imc_get_coeffs(q, chctx) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Read coefficients failed\n");
-        chctx->decoder_reset = 1;
-        return AVERROR_INVALIDDATA;
-    }
+    imc_get_coeffs(avctx, q, chctx);
 
     if (inverse_quant_coeff(q, chctx, stream_format_code) < 0) {
         av_log(avctx, AV_LOG_ERROR, "Inverse quantization of coefficients failed\n");
@@ -1005,10 +1029,8 @@ static int imc_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = COEFFS;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     for (i = 0; i < avctx->channels; i++) {
         q->out_samples = (float *)frame->extended_data[i];
@@ -1024,7 +1046,7 @@ static int imc_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     if (avctx->channels == 2) {
-        q->fdsp.butterflies_float((float *)frame->extended_data[0],
+        q->fdsp->butterflies_float((float *)frame->extended_data[0],
                                   (float *)frame->extended_data[1], COEFFS);
     }
 
@@ -1033,17 +1055,25 @@ static int imc_decode_frame(AVCodecContext *avctx, void *data,
     return IMC_BLOCK_SIZE * avctx->channels;
 }
 
-
 static av_cold int imc_decode_close(AVCodecContext * avctx)
 {
     IMCContext *q = avctx->priv_data;
 
     ff_fft_end(&q->fft);
+    av_freep(&q->fdsp);
 
     return 0;
 }
 
+/* Flush callback: set decoder_reset on chctx[0] and chctx[1]. */
+static av_cold void flush(AVCodecContext *avctx)
+{
+    IMCContext *imc = avctx->priv_data;
+    imc->chctx[0].decoder_reset = 1;
+    imc->chctx[1].decoder_reset = 1;
+}
 
+#if CONFIG_IMC_DECODER
 AVCodec ff_imc_decoder = {
     .name           = "imc",
     .long_name      = NULL_IF_CONFIG_SMALL("IMC (Intel Music Coder)"),
@@ -1053,11 +1083,13 @@ AVCodec ff_imc_decoder = {
     .init           = imc_decode_init,
     .close          = imc_decode_close,
     .decode         = imc_decode_frame,
+    .flush          = flush,
     .capabilities   = AV_CODEC_CAP_DR1,
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
                                                       AV_SAMPLE_FMT_NONE },
 };
-
+#endif
+#if CONFIG_IAC_DECODER
 AVCodec ff_iac_decoder = {
     .name           = "iac",
     .long_name      = NULL_IF_CONFIG_SMALL("IAC (Indeo Audio Coder)"),
@@ -1067,7 +1099,9 @@ AVCodec ff_iac_decoder = {
     .init           = imc_decode_init,
     .close          = imc_decode_close,
     .decode         = imc_decode_frame,
+    .flush          = flush,
     .capabilities   = AV_CODEC_CAP_DR1,
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
                                                       AV_SAMPLE_FMT_NONE },
 };
+#endif
diff --git a/libavcodec/imcdata.h b/libavcodec/imcdata.h
index 8e99391..64e7c71 100644
--- a/libavcodec/imcdata.h
+++ b/libavcodec/imcdata.h
@@ -4,20 +4,20 @@
  * Copyright (c) 2006 Benjamin Larsson
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/imdct15.c b/libavcodec/imdct15.c
index e02e9ce..e91aa11 100644
--- a/libavcodec/imdct15.c
+++ b/libavcodec/imdct15.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2013-2014 Mozilla Corporation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -105,11 +105,11 @@ av_cold int ff_imdct15_init(IMDCT15Context **ps, int N)
     s->len4 = len2 / 2;
     s->len2 = len2;
 
-    s->tmp  = av_malloc(len * 2 * sizeof(*s->tmp));
+    s->tmp  = av_malloc_array(len, 2 * sizeof(*s->tmp));
     if (!s->tmp)
         goto fail;
 
-    s->twiddle_exptab  = av_malloc(s->len4 * sizeof(*s->twiddle_exptab));
+    s->twiddle_exptab  = av_malloc_array(s->len4, sizeof(*s->twiddle_exptab));
     if (!s->twiddle_exptab)
         goto fail;
 
diff --git a/libavcodec/imdct15.h b/libavcodec/imdct15.h
index ed3f003..1979aa7 100644
--- a/libavcodec/imdct15.h
+++ b/libavcodec/imdct15.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/imgconvert-test.c b/libavcodec/imgconvert-test.c
new file mode 100644
index 0000000..96004d7
--- /dev/null
+++ b/libavcodec/imgconvert-test.c
@@ -0,0 +1,50 @@
+/*
+ * Misc image conversion routines
+ * Copyright (c) 2001, 2002, 2003 Fabrice Bellard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "imgconvert.c"
+
+#if FF_API_AVPICTURE
+FF_DISABLE_DEPRECATION_WARNINGS
+int main(void){ /* log every named pixel format descriptor and check its alpha flag */
+    int i;
+    int err=0;     /* becomes 1 on any alpha flag mismatch; returned as the exit status */
+    int skip = 0;  /* ids without a named descriptor seen since the last named one */
+    /* the scan runs to twice AV_PIX_FMT_NB, so ids past the enum end are probed too */
+    for (i=0; i<AV_PIX_FMT_NB*2; i++) {
+        const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(i);
+        if(!desc || !desc->name) {
+            skip ++;
+            continue;
+        }
+        if (skip) { /* report the gap once, just before the next named format */
+            av_log(NULL, AV_LOG_INFO, "%3d unused pixel format values\n", skip);
+            skip = 0;
+        }
+        av_log(NULL, AV_LOG_INFO, "pix fmt %s yuv_plan:%d avg_bpp:%d\n", desc->name, is_yuv_planar(desc), av_get_padded_bits_per_pixel(desc));
+        if ((!(desc->flags & AV_PIX_FMT_FLAG_ALPHA)) != (desc->nb_components != 2 && desc->nb_components != 4)) { /* alpha flag must be set exactly when there are 2 or 4 components */
+            av_log(NULL, AV_LOG_ERROR, "Alpha flag mismatch\n");
+            err = 1; /* keep scanning; the failure is reported through the return value */
+        }
+    }
+    return err;
+}
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif /* FF_API_AVPICTURE */
diff --git a/libavcodec/imgconvert.c b/libavcodec/imgconvert.c
index 4667a4f..46fa780 100644
--- a/libavcodec/imgconvert.c
+++ b/libavcodec/imgconvert.c
@@ -2,20 +2,20 @@
  * Misc image conversion routines
  * Copyright (c) 2001, 2002, 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
 #include "avcodec.h"
 #include "internal.h"
 #include "mathops.h"
+#include "libavutil/avassert.h"
 #include "libavutil/colorspace.h"
 #include "libavutil/common.h"
 #include "libavutil/pixdesc.h"
@@ -37,122 +38,50 @@
 void avcodec_get_chroma_sub_sample(enum AVPixelFormat pix_fmt, int *h_shift, int *v_shift)
 {
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
+    av_assert0(desc);
     *h_shift = desc->log2_chroma_w;
     *v_shift = desc->log2_chroma_h;
 }
 #endif
 
-static int is_gray(const AVPixFmtDescriptor *desc)
-{
-    return desc->nb_components - (desc->flags & AV_PIX_FMT_FLAG_ALPHA) == 1;
-}
-
 int avcodec_get_pix_fmt_loss(enum AVPixelFormat dst_pix_fmt,
                              enum AVPixelFormat src_pix_fmt,
                              int has_alpha)
 {
-    const AVPixFmtDescriptor *src_desc = av_pix_fmt_desc_get(src_pix_fmt);
-    const AVPixFmtDescriptor *dst_desc = av_pix_fmt_desc_get(dst_pix_fmt);
-    int loss, i, nb_components = FFMIN(src_desc->nb_components,
-                                       dst_desc->nb_components);
-
-    /* compute loss */
-    loss = 0;
-
-    if (dst_pix_fmt == src_pix_fmt)
-        return 0;
-
-    for (i = 0; i < nb_components; i++)
-        if (src_desc->comp[i].depth > dst_desc->comp[i].depth)
-            loss |= FF_LOSS_DEPTH;
-
-    if (dst_desc->log2_chroma_w > src_desc->log2_chroma_w ||
-        dst_desc->log2_chroma_h > src_desc->log2_chroma_h)
-        loss |= FF_LOSS_RESOLUTION;
-
-    if ((src_desc->flags & AV_PIX_FMT_FLAG_RGB) != (dst_desc->flags & AV_PIX_FMT_FLAG_RGB))
-        loss |= FF_LOSS_COLORSPACE;
-
-    if (has_alpha && !(dst_desc->flags & AV_PIX_FMT_FLAG_ALPHA) &&
-         (src_desc->flags & AV_PIX_FMT_FLAG_ALPHA))
-        loss |= FF_LOSS_ALPHA;
-
-    if (dst_pix_fmt == AV_PIX_FMT_PAL8 && !is_gray(src_desc))
-        return loss | FF_LOSS_COLORQUANT;
-
-    if (src_desc->nb_components > dst_desc->nb_components)
-        if (is_gray(dst_desc))
-            loss |= FF_LOSS_CHROMA;
-
-    return loss;
+    return av_get_pix_fmt_loss(dst_pix_fmt, src_pix_fmt, has_alpha);
 }
 
-static enum AVPixelFormat avcodec_find_best_pix_fmt1(enum AVPixelFormat *pix_fmt_list,
-                                      enum AVPixelFormat src_pix_fmt,
-                                      int has_alpha,
-                                      int loss_mask)
+enum AVPixelFormat avcodec_find_best_pix_fmt_of_2(enum AVPixelFormat dst_pix_fmt1, enum AVPixelFormat dst_pix_fmt2,
+                                            enum AVPixelFormat src_pix_fmt, int has_alpha, int *loss_ptr)
 {
-    int dist, i, loss, min_dist;
-    enum AVPixelFormat dst_pix_fmt;
-
-    /* find exact color match with smallest size */
-    dst_pix_fmt = AV_PIX_FMT_NONE;
-    min_dist = 0x7fffffff;
-    i = 0;
-    while (pix_fmt_list[i] != AV_PIX_FMT_NONE) {
-        enum AVPixelFormat pix_fmt = pix_fmt_list[i];
-
-        if (i > AV_PIX_FMT_NB) {
-            av_log(NULL, AV_LOG_ERROR, "Pixel format list longer than expected, "
-                   "it is either not properly terminated or contains duplicates\n");
-            return AV_PIX_FMT_NONE;
-        }
-
-        loss = avcodec_get_pix_fmt_loss(pix_fmt, src_pix_fmt, has_alpha) & loss_mask;
-        if (loss == 0) {
-            dist = av_get_bits_per_pixel(av_pix_fmt_desc_get(pix_fmt));
-            if (dist < min_dist) {
-                min_dist = dist;
-                dst_pix_fmt = pix_fmt;
-            }
-        }
-        i++;
-    }
-    return dst_pix_fmt;
+    return av_find_best_pix_fmt_of_2(dst_pix_fmt1, dst_pix_fmt2, src_pix_fmt, has_alpha, loss_ptr);
 }
 
-enum AVPixelFormat avcodec_find_best_pix_fmt2(enum AVPixelFormat *pix_fmt_list,
+#if AV_HAVE_INCOMPATIBLE_LIBAV_ABI
+enum AVPixelFormat avcodec_find_best_pix_fmt2(const enum AVPixelFormat *pix_fmt_list,
                                             enum AVPixelFormat src_pix_fmt,
-                                            int has_alpha, int *loss_ptr)
+                                            int has_alpha, int *loss_ptr){
+    return avcodec_find_best_pix_fmt_of_list(pix_fmt_list, src_pix_fmt, has_alpha, loss_ptr);
+}
+#else
+enum AVPixelFormat avcodec_find_best_pix_fmt2(enum AVPixelFormat dst_pix_fmt1, enum AVPixelFormat dst_pix_fmt2,
+                                            enum AVPixelFormat src_pix_fmt, int has_alpha, int *loss_ptr)
 {
-    enum AVPixelFormat dst_pix_fmt;
-    int loss_mask, i;
-    static const int loss_mask_order[] = {
-        ~0, /* no loss first */
-        ~FF_LOSS_ALPHA,
-        ~FF_LOSS_RESOLUTION,
-        ~(FF_LOSS_COLORSPACE | FF_LOSS_RESOLUTION),
-        ~FF_LOSS_COLORQUANT,
-        ~FF_LOSS_DEPTH,
-        0,
-    };
-
-    /* try with successive loss */
-    i = 0;
-    for(;;) {
-        loss_mask = loss_mask_order[i++];
-        dst_pix_fmt = avcodec_find_best_pix_fmt1(pix_fmt_list, src_pix_fmt,
-                                                 has_alpha, loss_mask);
-        if (dst_pix_fmt >= 0)
-            goto found;
-        if (loss_mask == 0)
-            break;
-    }
-    return AV_PIX_FMT_NONE;
- found:
-    if (loss_ptr)
-        *loss_ptr = avcodec_get_pix_fmt_loss(dst_pix_fmt, src_pix_fmt, has_alpha);
-    return dst_pix_fmt;
+    return avcodec_find_best_pix_fmt_of_2(dst_pix_fmt1, dst_pix_fmt2, src_pix_fmt, has_alpha, loss_ptr);
+}
+#endif
+
+/* Return the best entry of an AV_PIX_FMT_NONE-terminated list; *loss_ptr: loss mask in, chosen format's loss out. */
+enum AVPixelFormat avcodec_find_best_pix_fmt_of_list(const enum AVPixelFormat *pix_fmt_list,
+                        enum AVPixelFormat src_pix_fmt, int has_alpha, int *loss_ptr){
+    int i, mask = loss_ptr ? *loss_ptr : 0, loss = mask; /* snapshot of the caller's mask */
+    enum AVPixelFormat best = AV_PIX_FMT_NONE;
+    for (i = 0; pix_fmt_list[i] != AV_PIX_FMT_NONE; i++) {
+        loss = mask; /* the pairwise call overwrites its loss argument, so re-feed the mask each round */
+        best = avcodec_find_best_pix_fmt_of_2(best, pix_fmt_list[i], src_pix_fmt, has_alpha, &loss);
+    }
+    if (loss_ptr) *loss_ptr = loss; /* an empty list leaves the caller's value unchanged */
+    return best;
 }
 
 #if FF_API_AVPICTURE
@@ -160,8 +89,22 @@ FF_DISABLE_DEPRECATION_WARNINGS
 /* return true if yuv planar */
 static inline int is_yuv_planar(const AVPixFmtDescriptor *desc)
 {
-    return (!(desc->flags & AV_PIX_FMT_FLAG_RGB) &&
-             (desc->flags & AV_PIX_FMT_FLAG_PLANAR));
+    int i;
+    int planes[4] = { 0 };
+
+    if (     desc->flags & AV_PIX_FMT_FLAG_RGB
+        || !(desc->flags & AV_PIX_FMT_FLAG_PLANAR))
+        return 0;
+
+    /* set the used planes */
+    for (i = 0; i < desc->nb_components; i++)
+        planes[desc->comp[i].plane] = 1;
+
+    /* if there is an unused plane, the format is not planar */
+    for (i = 0; i < desc->nb_components; i++)
+        if (!planes[i])
+            return 0;
+    return 1;
 }
 
 int av_picture_crop(AVPicture *dst, const AVPicture *src,
@@ -170,16 +113,24 @@ int av_picture_crop(AVPicture *dst, const AVPicture *src,
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
     int y_shift;
     int x_shift;
+    int max_step[4];
 
-    if (pix_fmt < 0 || pix_fmt >= AV_PIX_FMT_NB || !is_yuv_planar(desc))
+    if (pix_fmt < 0 || pix_fmt >= AV_PIX_FMT_NB)
         return -1;
 
     y_shift = desc->log2_chroma_h;
     x_shift = desc->log2_chroma_w;
+    av_image_fill_max_pixsteps(max_step, NULL, desc);
 
+    if (is_yuv_planar(desc)) {
     dst->data[0] = src->data[0] + (top_band * src->linesize[0]) + left_band;
     dst->data[1] = src->data[1] + ((top_band >> y_shift) * src->linesize[1]) + (left_band >> x_shift);
     dst->data[2] = src->data[2] + ((top_band >> y_shift) * src->linesize[2]) + (left_band >> x_shift);
+    } else{
+        if(top_band % (1<<y_shift) || left_band % (1<<x_shift))
+            return -1;
+        dst->data[0] = src->data[0] + (top_band * src->linesize[0]) + (left_band * max_step[0]);
+    }
 
     dst->linesize[0] = src->linesize[0];
     dst->linesize[1] = src->linesize[1];
@@ -197,9 +148,41 @@ int av_picture_pad(AVPicture *dst, const AVPicture *src, int height, int width,
     int x_shift;
     int yheight;
     int i, y;
+    int max_step[4];
 
-    if (pix_fmt < 0 || pix_fmt >= AV_PIX_FMT_NB ||
-        !is_yuv_planar(desc)) return -1;
+    if (pix_fmt < 0 || pix_fmt >= AV_PIX_FMT_NB)
+        return -1;
+
+    if (!is_yuv_planar(desc)) {
+        if (src)
+            return -1; //TODO: Not yet implemented
+
+        av_image_fill_max_pixsteps(max_step, NULL, desc);
+
+        if (padtop || padleft) {
+            memset(dst->data[0], color[0],
+                    dst->linesize[0] * padtop + (padleft * max_step[0]));
+        }
+
+        if (padleft || padright) {
+            optr = dst->data[0] + dst->linesize[0] * padtop +
+                    (dst->linesize[0] - (padright * max_step[0]));
+            yheight = height - 1 - (padtop + padbottom);
+            for (y = 0; y < yheight; y++) {
+                memset(optr, color[0], (padleft + padright) * max_step[0]);
+                optr += dst->linesize[0];
+            }
+        }
+
+        if (padbottom || padright) {
+            optr = dst->data[0] + dst->linesize[0] * (height - padbottom) -
+                    (padright * max_step[0]);
+            memset(optr, color[0], dst->linesize[0] * padbottom +
+                    (padright * max_step[0]));
+        }
+
+        return 0;
+    }
 
     for (i = 0; i < 3; i++) {
         x_shift = i ? desc->log2_chroma_w : 0;
@@ -245,8 +228,8 @@ int av_picture_pad(AVPicture *dst, const AVPicture *src, int height, int width,
                 (padbottom >> y_shift) + (padright >> x_shift));
         }
     }
+
     return 0;
 }
-
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif /* FF_API_AVPICTURE */
diff --git a/libavcodec/imx_dump_header_bsf.c b/libavcodec/imx_dump_header_bsf.c
index 71bda02..9a9de05 100644
--- a/libavcodec/imx_dump_header_bsf.c
+++ b/libavcodec/imx_dump_header_bsf.c
@@ -2,20 +2,20 @@
  * imx dump header bitstream filter
  * Copyright (c) 2007 Baptiste Coudurier
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/indeo2.c b/libavcodec/indeo2.c
index e667420..17f2367 100644
--- a/libavcodec/indeo2.c
+++ b/libavcodec/indeo2.c
@@ -2,20 +2,20 @@
  * Intel Indeo 2 codec
  * Copyright (c) 2005 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -54,15 +54,13 @@ static int ir2_decode_plane(Ir2Context *ctx, int width, int height, uint8_t *dst
     int i;
     int j;
     int out = 0;
-    int c;
-    int t;
 
     if (width & 1)
         return AVERROR_INVALIDDATA;
 
     /* first line contain absolute values, other lines contain deltas */
     while (out < width) {
-        c = ir2_get_code(&ctx->gb);
+        int c = ir2_get_code(&ctx->gb);
         if (c >= 0x80) { /* we have a run */
             c -= 0x7F;
             if (out + c*2 > width)
@@ -79,7 +77,7 @@ static int ir2_decode_plane(Ir2Context *ctx, int width, int height, uint8_t *dst
     for (j = 1; j < height; j++) {
         out = 0;
         while (out < width) {
-            c = ir2_get_code(&ctx->gb);
+            int c = ir2_get_code(&ctx->gb);
             if (c >= 0x80) { /* we have a skip */
                 c -= 0x7F;
                 if (out + c*2 > width)
@@ -89,7 +87,7 @@ static int ir2_decode_plane(Ir2Context *ctx, int width, int height, uint8_t *dst
                     out++;
                 }
             } else { /* add two deltas from table */
-                t        = dst[out - pitch] + (table[c * 2] - 128);
+                int t    = dst[out - pitch] + (table[c * 2] - 128);
                 t        = av_clip_uint8(t);
                 dst[out] = t;
                 out++;
@@ -150,10 +148,8 @@ static int ir2_decode_frame(AVCodecContext *avctx,
     int start, ret;
     int ltab, ctab;
 
-    if ((ret = ff_reget_buffer(avctx, p)) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, p)) < 0)
         return ret;
-    }
 
     start = 48; /* hardcoded for now */
 
@@ -170,7 +166,8 @@ static int ir2_decode_frame(AVCodecContext *avctx,
         buf[i] = ff_reverse[buf[i]];
 #endif
 
-    init_get_bits(&s->gb, buf + start, (buf_size - start) * 8);
+    if ((ret = init_get_bits8(&s->gb, buf + start, buf_size - start)) < 0)
+        return ret;
 
     ltab = buf[0x22] & 3;
     ctab = buf[0x22] >> 2;
diff --git a/libavcodec/indeo2data.h b/libavcodec/indeo2data.h
index 255f662..e05c91f 100644
--- a/libavcodec/indeo2data.h
+++ b/libavcodec/indeo2data.h
@@ -2,20 +2,20 @@
  * Intel Indeo 2 codec
  * copyright (c) 2005 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/indeo3.c b/libavcodec/indeo3.c
index f222a06..e161f83 100644
--- a/libavcodec/indeo3.c
+++ b/libavcodec/indeo3.c
@@ -2,20 +2,20 @@
  * Indeo Video v3 compatible decoder
  * Copyright (c) 2009 - 2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,7 @@
 #include "libavutil/imgutils.h"
 #include "libavutil/intreadwrite.h"
 #include "avcodec.h"
+#include "copy_block.h"
 #include "bytestream.h"
 #include "get_bits.h"
 #include "hpeldsp.h"
@@ -93,7 +94,7 @@ typedef struct Indeo3DecodeContext {
 
     int16_t         width, height;
     uint32_t        frame_num;      ///< current frame number (zero-based)
-    uint32_t        data_size;      ///< size of the frame data in bytes
+    int             data_size;      ///< size of the frame data in bytes
     uint16_t        frame_flags;    ///< frame properties
     uint8_t         cb_offset;      ///< needed for selecting VQ tables
     uint8_t         buf_sel;        ///< active frame buffer: 0 - primary, 1 -secondary
@@ -117,8 +118,8 @@ static uint8_t requant_tab[8][128];
  */
 static av_cold void build_requant_tab(void)
 {
-    static int8_t offsets[8] = { 1, 1, 2, -3, -3, 3, 4, 4 };
-    static int8_t deltas [8] = { 0, 1, 0,  4,  4, 1, 0, 1 };
+    static const int8_t offsets[8] = { 1, 1, 2, -3, -3, 3, 4, 4 };
+    static const int8_t deltas [8] = { 0, 1, 0,  4,  4, 1, 0, 1 };
 
     int i, j, step;
 
@@ -147,15 +148,26 @@ static av_cold void build_requant_tab(void)
 }
 
 
+static av_cold void free_frame_buffers(Indeo3DecodeContext *ctx)
+{
+    int p;
+    /* Release every plane buffer and zero the stored frame dimensions. */
+    ctx->width = ctx->height = 0;
+    /* for each of the 3 planes: free buffers[0..1], then null pixels[0..1] */
+    for (p = 0; p < 3; p++) {
+        av_freep(&ctx->planes[p].buffers[0]);
+        av_freep(&ctx->planes[p].buffers[1]);
+        ctx->planes[p].pixels[0] = ctx->planes[p].pixels[1] = 0;
+    }
+}
+
+
 static av_cold int allocate_frame_buffers(Indeo3DecodeContext *ctx,
-                                          AVCodecContext *avctx)
+                                          AVCodecContext *avctx, int luma_width, int luma_height)
 {
-    int p, luma_width, luma_height, chroma_width, chroma_height;
+    int p, chroma_width, chroma_height;
     int luma_pitch, chroma_pitch, luma_size, chroma_size;
 
-    luma_width  = ctx->width;
-    luma_height = ctx->height;
-
     if (luma_width  < 16 || luma_width  > 640 ||
         luma_height < 16 || luma_height > 480 ||
         luma_width  &  3 || luma_height &   3) {
@@ -164,6 +176,9 @@ static av_cold int allocate_frame_buffers(Indeo3DecodeContext *ctx,
         return AVERROR_INVALIDDATA;
     }
 
+    ctx->width  = luma_width ;
+    ctx->height = luma_height;
+
     chroma_width  = FFALIGN(luma_width  >> 2, 4);
     chroma_height = FFALIGN(luma_height >> 2, 4);
 
@@ -187,6 +202,11 @@ static av_cold int allocate_frame_buffers(Indeo3DecodeContext *ctx,
         ctx->planes[p].buffers[0] = av_malloc(!p ? luma_size : chroma_size);
         ctx->planes[p].buffers[1] = av_malloc(!p ? luma_size : chroma_size);
 
+        if (!ctx->planes[p].buffers[0] || !ctx->planes[p].buffers[1]) {
+            free_frame_buffers(ctx);
+            return AVERROR(ENOMEM);
+        }
+
         /* fill the INTRA prediction lines with the middle pixel value = 64 */
         memset(ctx->planes[p].buffers[0], 0x40, ctx->planes[p].pitch);
         memset(ctx->planes[p].buffers[1], 0x40, ctx->planes[p].pitch);
@@ -201,19 +221,6 @@ static av_cold int allocate_frame_buffers(Indeo3DecodeContext *ctx,
     return 0;
 }
 
-
-static av_cold void free_frame_buffers(Indeo3DecodeContext *ctx)
-{
-    int p;
-
-    for (p = 0; p < 3; p++) {
-        av_freep(&ctx->planes[p].buffers[0]);
-        av_freep(&ctx->planes[p].buffers[1]);
-        ctx->planes[p].pixels[0] = ctx->planes[p].pixels[1] = 0;
-    }
-}
-
-
 /**
  *  Copy pixels of the cell(x + mv_x, y + mv_y) from the previous frame into
  *  the cell(x, y) in the current frame.
@@ -230,8 +237,11 @@ static int copy_cell(Indeo3DecodeContext *ctx, Plane *plane, Cell *cell)
     /* setup output and reference pointers */
     offset_dst  = (cell->ypos << 2) * plane->pitch + (cell->xpos << 2);
     dst         = plane->pixels[ctx->buf_sel] + offset_dst;
+    if(cell->mv_ptr){
     mv_y        = cell->mv_ptr[0];
     mv_x        = cell->mv_ptr[1];
+    }else
+        mv_x= mv_y= 0;
 
     /* -1 because there is an extra line on top for prediction */
     if ((cell->ypos << 2) + mv_y < -1 || (cell->xpos << 2) + mv_x < 0 ||
@@ -333,7 +343,7 @@ if (*data_ptr >= last_ptr) \
 
 #define RLE_BLOCK_COPY \
     if (cell->mv_ptr || !skip_flag) \
-        ctx->hdsp.put_pixels_tab[2][0](dst, ref, row_offset, 4 << v_zoom)
+        copy_block4(dst, ref, row_offset, row_offset, 4 << v_zoom)
 
 #define RLE_BLOCK_COPY_8 \
     pix64 = AV_RN64(ref);\
@@ -345,7 +355,7 @@ if (*data_ptr >= last_ptr) \
         fill_64(dst, pix64, 8, row_offset)
 
 #define RLE_LINES_COPY \
-    ctx->hdsp.put_pixels_tab[2][0](dst, ref, row_offset, num_lines << v_zoom)
+    copy_block4(dst, ref, row_offset, row_offset, num_lines << v_zoom)
 
 #define RLE_LINES_COPY_M10 \
     pix64 = AV_RN64(ref);\
@@ -589,6 +599,7 @@ static int decode_cell(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
     /* setup output and reference pointers */
     offset = (cell->ypos << 2) * plane->pitch + (cell->xpos << 2);
     block  =  plane->pixels[ctx->buf_sel] + offset;
+
     if (!cell->mv_ptr) {
         /* use previous line as reference for INTRA cells */
         ref_block = block - plane->pitch;
@@ -643,7 +654,7 @@ static int decode_cell(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
     /* of the predicted cell in order to avoid overflows. */
     if (vq_index >= 8 && ref_block) {
         for (x = 0; x < cell->width << 2; x++)
-            ref_block[x] = requant_tab[vq_index & 7][ref_block[x]];
+            ref_block[x] = requant_tab[vq_index & 7][ref_block[x] & 127];
     }
 
     error = IV3_NOERR;
@@ -771,7 +782,7 @@ static int parse_bintree(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
             return AVERROR_INVALIDDATA;
     }
 
-    while (1) { /* loop until return */
+    while (get_bits_left(&ctx->gb) >= 2) { /* loop until return */
         RESYNC_BITSTREAM;
         switch (code = get_bits(&ctx->gb, 2)) {
         case H_SPLIT:
@@ -796,6 +807,7 @@ static int parse_bintree(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
                 CHECK_CELL
                 if (!curr_cell.mv_ptr)
                     return AVERROR_INVALIDDATA;
+
                 ret = copy_cell(ctx, plane, &curr_cell);
                 return ret;
             }
@@ -806,6 +818,10 @@ static int parse_bintree(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
                 /* get motion vector index and setup the pointer to the mv set */
                 if (!ctx->need_resync)
                     ctx->next_cell_data = &ctx->gb.buffer[(get_bits_count(&ctx->gb) + 7) >> 3];
+                if (ctx->next_cell_data >= ctx->last_byte) {
+                    av_log(avctx, AV_LOG_ERROR, "motion vector out of array\n");
+                    return AVERROR_INVALIDDATA;
+                }
                 mv_idx = *(ctx->next_cell_data++);
                 if (mv_idx >= ctx->num_vectors) {
                     av_log(avctx, AV_LOG_ERROR, "motion vector index out of range\n");
@@ -832,7 +848,7 @@ static int parse_bintree(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
         }
     }//while
 
-    return 0;
+    return AVERROR_INVALIDDATA;
 }
 
 
@@ -845,13 +861,13 @@ static int decode_plane(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
 
     /* each plane data starts with mc_vector_count field, */
     /* an optional array of motion vectors followed by the vq data */
-    num_vectors = bytestream_get_le32(&data);
+    num_vectors = bytestream_get_le32(&data); data_size -= 4;
     if (num_vectors > 256) {
         av_log(ctx->avctx, AV_LOG_ERROR,
                "Read invalid number of motion vectors %d\n", num_vectors);
         return AVERROR_INVALIDDATA;
     }
-    if (num_vectors * 2 >= data_size)
+    if (num_vectors * 2 > data_size)
         return AVERROR_INVALIDDATA;
 
     ctx->num_vectors = num_vectors;
@@ -862,7 +878,7 @@ static int decode_plane(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
     ctx->skip_bits   = 0;
     ctx->need_resync = 0;
 
-    ctx->last_byte = data + data_size - 1;
+    ctx->last_byte = data + data_size;
 
     /* initialize the 1st cell and set its dimensions to whole plane */
     curr_cell.xpos   = curr_cell.ypos = 0;
@@ -883,7 +899,8 @@ static int decode_frame_headers(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
     GetByteContext gb;
     const uint8_t   *bs_hdr;
     uint32_t        frame_num, word2, check_sum, data_size;
-    uint32_t        y_offset, u_offset, v_offset, starts[3], ends[3];
+    int             y_offset, u_offset, v_offset;
+    uint32_t        starts[3], ends[3];
     uint16_t        height, width;
     int             i, j;
 
@@ -937,12 +954,8 @@ static int decode_frame_headers(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
                    "Invalid picture dimensions: %d x %d!\n", width, height);
             return AVERROR_INVALIDDATA;
         }
-
-        ctx->width  = width;
-        ctx->height = height;
-
         free_frame_buffers(ctx);
-        if ((res = allocate_frame_buffers(ctx, avctx)) < 0)
+        if ((res = allocate_frame_buffers(ctx, avctx, width, height)) < 0)
              return res;
         if ((res = ff_set_dimensions(avctx, width, height)) < 0)
             return res;
@@ -969,7 +982,8 @@ static int decode_frame_headers(Indeo3DecodeContext *ctx, AVCodecContext *avctx,
     ctx->y_data_size = ends[0] - starts[0];
     ctx->v_data_size = ends[1] - starts[1];
     ctx->u_data_size = ends[2] - starts[2];
-    if (FFMAX3(y_offset, v_offset, u_offset) >= ctx->data_size - 16 ||
+    if (FFMIN3(y_offset, v_offset, u_offset) < 0 ||
+        FFMAX3(y_offset, v_offset, u_offset) >= ctx->data_size - 16 ||
         FFMIN3(y_offset, v_offset, u_offset) < gb.buffer - bs_hdr + 16 ||
         FFMIN3(ctx->y_data_size, ctx->v_data_size, ctx->u_data_size) <= 0) {
         av_log(avctx, AV_LOG_ERROR, "One of the y/u/v offsets is invalid\n");
@@ -1040,17 +1054,13 @@ static av_cold int decode_init(AVCodecContext *avctx)
     Indeo3DecodeContext *ctx = avctx->priv_data;
 
     ctx->avctx     = avctx;
-    ctx->width     = avctx->width;
-    ctx->height    = avctx->height;
     avctx->pix_fmt = AV_PIX_FMT_YUV410P;
 
     build_requant_tab();
 
     ff_hpeldsp_init(&ctx->hdsp, avctx->flags);
 
-    allocate_frame_buffers(ctx, avctx);
-
-    return 0;
+    return allocate_frame_buffers(ctx, avctx, avctx->width, avctx->height);
 }
 
 
@@ -1086,6 +1096,9 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     /* use BS_BUFFER flag for buffer switching */
     ctx->buf_sel = (ctx->frame_flags >> BS_BUFFER) & 1;
 
+    if ((res = ff_get_buffer(avctx, frame, 0)) < 0)
+        return res;
+
     /* decode luma plane */
     if ((res = decode_plane(ctx, avctx, ctx->planes, ctx->y_data_ptr, ctx->y_data_size, 40)))
         return res;
@@ -1097,11 +1110,6 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     if ((res = decode_plane(ctx, avctx, &ctx->planes[2], ctx->v_data_ptr, ctx->v_data_size, 10)))
         return res;
 
-    if ((res = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(ctx->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return res;
-    }
-
     output_plane(&ctx->planes[0], ctx->buf_sel,
                  frame->data[0], frame->linesize[0],
                  avctx->height);
diff --git a/libavcodec/indeo3data.h b/libavcodec/indeo3data.h
index 41a29e5..fbe76af 100644
--- a/libavcodec/indeo3data.h
+++ b/libavcodec/indeo3data.h
@@ -2,20 +2,20 @@
  * Indeo Video v3 compatible decoder
  * Copyright (c) 2009 - 2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/indeo4.c b/libavcodec/indeo4.c
index 217311f..69f78c9 100644
--- a/libavcodec/indeo4.c
+++ b/libavcodec/indeo4.c
@@ -2,20 +2,20 @@
  * Indeo Video Interactive v4 compatible decoder
  * Copyright (c) 2009-2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -175,6 +175,7 @@ static int decode_pic_hdr(IVI45DecContext *ctx, AVCodecContext *avctx)
 
     /* decode subdivision of the planes */
     pic_conf.luma_bands = decode_plane_subdivision(&ctx->gb);
+    pic_conf.chroma_bands = 0;
     if (pic_conf.luma_bands)
         pic_conf.chroma_bands = decode_plane_subdivision(&ctx->gb);
     ctx->is_scalable = pic_conf.luma_bands != 1 || pic_conf.chroma_bands != 1;
@@ -262,6 +263,7 @@ static int decode_band_hdr(IVI45DecContext *ctx, IVIBandDesc *band,
 {
     int plane, band_num, indx, transform_id, scan_indx;
     int i;
+    int quant_mat;
 
     plane    = get_bits(&ctx->gb, 2);
     band_num = get_bits(&ctx->gb, 4);
@@ -317,19 +319,26 @@ static int decode_band_hdr(IVI45DecContext *ctx, IVIBandDesc *band,
                 return AVERROR_PATCHWELCOME;
             }
 
+            if (transform_id < 10 && band->blk_size < 8) {
+                av_log(avctx, AV_LOG_ERROR, "wrong transform size!\n");
+                return AVERROR_INVALIDDATA;
+            }
             if ((transform_id >= 0 && transform_id <= 2) || transform_id == 10)
                 ctx->uses_haar = 1;
 
             band->inv_transform = transforms[transform_id].inv_trans;
             band->dc_transform  = transforms[transform_id].dc_trans;
             band->is_2d_trans   = transforms[transform_id].is_2d_trans;
+
             if (transform_id < 10)
                 band->transform_size = 8;
             else
                 band->transform_size = 4;
 
-            if (band->blk_size != band->transform_size)
+            if (band->blk_size != band->transform_size) {
+                av_log(avctx, AV_LOG_ERROR, "transform and block size mismatch (%d != %d)\n", band->transform_size, band->blk_size);
                 return AVERROR_INVALIDDATA;
+            }
 
             scan_indx = get_bits(&ctx->gb, 4);
             if (scan_indx == 15) {
@@ -337,25 +346,29 @@ static int decode_band_hdr(IVI45DecContext *ctx, IVIBandDesc *band,
                 return AVERROR_INVALIDDATA;
             }
             if (scan_indx > 4 && scan_indx < 10) {
-                if (band->blk_size != 4)
+                if (band->blk_size != 4) {
+                    av_log(avctx, AV_LOG_ERROR, "mismatching scan table!\n");
                     return AVERROR_INVALIDDATA;
-            } else if (band->blk_size != 8)
+                }
+            } else if (band->blk_size != 8) {
+                av_log(avctx, AV_LOG_ERROR, "mismatching scan table!\n");
                 return AVERROR_INVALIDDATA;
+            }
 
             band->scan = scan_index_to_tab[scan_indx];
+            band->scan_size = band->blk_size;
 
-            band->quant_mat = get_bits(&ctx->gb, 5);
-            if (band->quant_mat >= FF_ARRAY_ELEMS(quant_index_to_tab)) {
-
-                if (band->quant_mat == 31)
-                    av_log(avctx, AV_LOG_ERROR,
-                           "Custom quant matrix encountered!\n");
-                else
-                    avpriv_request_sample(avctx, "Quantization matrix %d",
-                                          band->quant_mat);
-                band->quant_mat = -1;
+            quant_mat = get_bits(&ctx->gb, 5);
+            if (quant_mat == 31) {
+                av_log(avctx, AV_LOG_ERROR, "Custom quant matrix encountered!\n");
                 return AVERROR_INVALIDDATA;
             }
+            if (quant_mat >= FF_ARRAY_ELEMS(quant_index_to_tab)) {
+                avpriv_request_sample(avctx, "Quantization matrix %d",
+                                      quant_mat);
+                return AVERROR_INVALIDDATA;
+            }
+            band->quant_mat = quant_mat;
         } else {
             if (old_blk_size != band->blk_size) {
                 av_log(avctx, AV_LOG_ERROR,
@@ -363,10 +376,19 @@ static int decode_band_hdr(IVI45DecContext *ctx, IVIBandDesc *band,
                        "inherited\n");
                 return AVERROR_INVALIDDATA;
             }
-            if (band->quant_mat < 0) {
-                av_log(avctx, AV_LOG_ERROR, "Invalid quant_mat inherited\n");
-                return AVERROR_INVALIDDATA;
-            }
+        }
+        if (quant_index_to_tab[band->quant_mat] > 4 && band->blk_size == 4) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid quant matrix for 4x4 block encountered!\n");
+            band->quant_mat = 0;
+            return AVERROR_INVALIDDATA;
+        }
+        if (band->scan_size != band->blk_size) {
+            av_log(avctx, AV_LOG_ERROR, "mismatching scan table!\n");
+            return AVERROR_INVALIDDATA;
+        }
+        if (band->transform_size == 8 && band->blk_size < 8) {
+            av_log(avctx, AV_LOG_ERROR, "mismatching transform_size!\n");
+            return AVERROR_INVALIDDATA;
         }
 
         /* decode block huffman codebook */
@@ -410,6 +432,11 @@ static int decode_band_hdr(IVI45DecContext *ctx, IVIBandDesc *band,
 
     align_get_bits(&ctx->gb);
 
+    if (!band->scan) {
+        av_log(avctx, AV_LOG_ERROR, "band->scan not set\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     return 0;
 }
 
@@ -428,7 +455,7 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
                           IVITile *tile, AVCodecContext *avctx)
 {
     int         x, y, mv_x, mv_y, mv_delta, offs, mb_offset, blks_per_mb,
-                mv_scale, mb_type_bits;
+                mv_scale, mb_type_bits, s;
     IVIMbInfo   *mb, *ref_mb;
     int         row_offset = band->mb_size * band->pitch;
 
@@ -443,6 +470,11 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
     mv_scale = (ctx->planes[0].bands[0].mb_size >> 3) - (band->mb_size >> 3);
     mv_x = mv_y = 0;
 
+    if (((tile->width + band->mb_size-1)/band->mb_size) * ((tile->height + band->mb_size-1)/band->mb_size) != tile->num_MBs) {
+        av_log(avctx, AV_LOG_ERROR, "num_MBs mismatch %d %d %d %d\n", tile->width, tile->height, band->mb_size, tile->num_MBs);
+        return -1;
+    }
+
     for (y = tile->ypos; y < tile->ypos + tile->height; y += band->mb_size) {
         mb_offset = offs;
 
@@ -482,8 +514,10 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
             } else {
                 if (band->inherit_mv) {
                     /* copy mb_type from corresponding reference mb */
-                    if (!ref_mb)
+                    if (!ref_mb) {
+                        av_log(avctx, AV_LOG_ERROR, "ref_mb unavailable\n");
                         return AVERROR_INVALIDDATA;
+                    }
                     mb->type = ref_mb->type;
                 } else if (ctx->frame_type == IVI4_FRAMETYPE_INTRA ||
                            ctx->frame_type == IVI4_FRAMETYPE_INTRA1) {
@@ -549,6 +583,15 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
                 }
             }
 
+            s= band->is_halfpel;
+            if (mb->type)
+            if ( x +  (mb->mv_x   >>s) +                 (y+               (mb->mv_y   >>s))*band->pitch < 0 ||
+                 x + ((mb->mv_x+s)>>s) + band->mb_size - 1
+                   + (y+band->mb_size - 1 +((mb->mv_y+s)>>s))*band->pitch > band->bufsize -1) {
+                av_log(avctx, AV_LOG_ERROR, "motion vector %d %d outside reference\n", x*s + mb->mv_x, y*s + mb->mv_y);
+                return AVERROR_INVALIDDATA;
+            }
+
             mb++;
             if (ref_mb)
                 ref_mb++;
diff --git a/libavcodec/indeo4data.h b/libavcodec/indeo4data.h
index be7c413..cc497c2 100644
--- a/libavcodec/indeo4data.h
+++ b/libavcodec/indeo4data.h
@@ -2,20 +2,20 @@
  * Indeo Video Interactive 4 compatible decoder
  * Copyright (c) 2009-2010 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -60,7 +60,7 @@ static const uint8_t ivi4_horizontal_scan_4x4[16] = {
     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 };
 
-static const uint8_t *scan_index_to_tab[15] = {
+static const uint8_t * const scan_index_to_tab[15] = {
     // for 8x8 transforms
     ff_zigzag_direct,
     ivi4_alternate_scan_8x8,
diff --git a/libavcodec/indeo5.c b/libavcodec/indeo5.c
index bed9153..5f931c8 100644
--- a/libavcodec/indeo5.c
+++ b/libavcodec/indeo5.c
@@ -2,20 +2,20 @@
  * Indeo Video Interactive v5 compatible decoder
  * Copyright (c) 2009 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -58,7 +58,7 @@ enum {
  */
 static int decode_gop_header(IVI45DecContext *ctx, AVCodecContext *avctx)
 {
-    int             result, i, p, tile_size, pic_size_indx, mb_size, blk_size;
+    int             result, i, p, tile_size, pic_size_indx, mb_size, blk_size, is_scalable;
     int             quant_mat, blk_size_changed = 0;
     IVIBandDesc     *band, *band1, *band2;
     IVIPicConfig    pic_conf;
@@ -80,8 +80,8 @@ static int decode_gop_header(IVI45DecContext *ctx, AVCodecContext *avctx)
     /* num_levels * 3 + 1 */
     pic_conf.luma_bands   = get_bits(&ctx->gb, 2) * 3 + 1;
     pic_conf.chroma_bands = get_bits1(&ctx->gb)   * 3 + 1;
-    ctx->is_scalable = pic_conf.luma_bands != 1 || pic_conf.chroma_bands != 1;
-    if (ctx->is_scalable && (pic_conf.luma_bands != 4 || pic_conf.chroma_bands != 1)) {
+    is_scalable = pic_conf.luma_bands != 1 || pic_conf.chroma_bands != 1;
+    if (is_scalable && (pic_conf.luma_bands != 4 || pic_conf.chroma_bands != 1)) {
         av_log(avctx, AV_LOG_ERROR, "Scalability: unsupported subdivision! Luma bands: %d, chroma bands: %d\n",
                pic_conf.luma_bands, pic_conf.chroma_bands);
         return AVERROR_INVALIDDATA;
@@ -119,6 +119,7 @@ static int decode_gop_header(IVI45DecContext *ctx, AVCodecContext *avctx)
             return result;
         }
         ctx->pic_conf = pic_conf;
+        ctx->is_scalable = is_scalable;
         blk_size_changed = 1; /* force reallocation of the internal structures */
     }
 
@@ -132,6 +133,11 @@ static int decode_gop_header(IVI45DecContext *ctx, AVCodecContext *avctx)
             blk_size = 8 >> get_bits1(&ctx->gb);
             mb_size  = blk_size << !mb_size;
 
+            if (p==0 && blk_size==4) {
+                av_log(avctx, AV_LOG_ERROR, "4x4 luma blocks are unsupported!\n");
+                return AVERROR_PATCHWELCOME;
+            }
+
             blk_size_changed = mb_size != band->mb_size || blk_size != band->blk_size;
             if (blk_size_changed) {
                 band->mb_size  = mb_size;
@@ -184,8 +190,10 @@ static int decode_gop_header(IVI45DecContext *ctx, AVCodecContext *avctx)
             band->is_2d_trans = band->inv_transform == ff_ivi_inverse_slant_8x8 ||
                                 band->inv_transform == ff_ivi_inverse_slant_4x4;
 
-            if (band->transform_size != band->blk_size)
+            if (band->transform_size != band->blk_size) {
+                av_log(avctx, AV_LOG_ERROR, "transform and block size mismatch (%d != %d)\n", band->transform_size, band->blk_size);
                 return AVERROR_INVALIDDATA;
+            }
 
             /* select dequant matrix according to plane and band number */
             if (!p) {
@@ -195,6 +203,10 @@ static int decode_gop_header(IVI45DecContext *ctx, AVCodecContext *avctx)
             }
 
             if (band->blk_size == 8) {
+                if(quant_mat >= 5){
+                    av_log(avctx, AV_LOG_ERROR, "quant_mat %d too large!\n", quant_mat);
+                    return -1;
+                }
                 band->intra_base  = &ivi5_base_quant_8x8_intra[quant_mat][0];
                 band->inter_base  = &ivi5_base_quant_8x8_inter[quant_mat][0];
                 band->intra_scale = &ivi5_scale_quant_8x8_intra[quant_mat][0];
@@ -231,6 +243,7 @@ static int decode_gop_header(IVI45DecContext *ctx, AVCodecContext *avctx)
         band2->inv_transform = band1->inv_transform;
         band2->dc_transform  = band1->dc_transform;
         band2->is_2d_trans   = band1->is_2d_trans;
+        band2->transform_size= band1->transform_size;
     }
 
     /* reallocate internal structures if needed */
@@ -276,14 +289,18 @@ static int decode_gop_header(IVI45DecContext *ctx, AVCodecContext *avctx)
  *
  *  @param[in,out]  gb  the GetBit context
  */
-static inline void skip_hdr_extension(GetBitContext *gb)
+static inline int skip_hdr_extension(GetBitContext *gb)
 {
     int i, len;
 
     do {
         len = get_bits(gb, 8);
+        if (8*len > get_bits_left(gb))
+            return AVERROR_INVALIDDATA;
         for (i = 0; i < len; i++) skip_bits(gb, 8);
     } while(len);
+
+    return 0;
 }
 
 
@@ -321,6 +338,12 @@ static int decode_pic_hdr(IVI45DecContext *ctx, AVCodecContext *avctx)
         ctx->gop_invalid = 0;
     }
 
+    if (ctx->frame_type == FRAMETYPE_INTER_SCAL && !ctx->is_scalable) {
+        av_log(avctx, AV_LOG_ERROR, "Scalable inter frame in non scalable stream\n");
+        ctx->frame_type = FRAMETYPE_INTER;
+        return AVERROR_INVALIDDATA;
+    }
+
     if (ctx->frame_type != FRAMETYPE_NULL) {
         ctx->frame_flags = get_bits(&ctx->gb, 8);
 
@@ -431,7 +454,7 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
                           IVITile *tile, AVCodecContext *avctx)
 {
     int         x, y, mv_x, mv_y, mv_delta, offs, mb_offset,
-                mv_scale, blks_per_mb;
+                mv_scale, blks_per_mb, s;
     IVIMbInfo   *mb, *ref_mb;
     int         row_offset = band->mb_size * band->pitch;
 
@@ -477,7 +500,7 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
                 }
 
                 mb->mv_x = mb->mv_y = 0; /* no motion vector coded */
-                if (band->inherit_mv){
+                if (band->inherit_mv && ref_mb){
                     /* motion vector inheritance */
                     if (mv_scale) {
                         mb->mv_x = ivi_scale_mv(ref_mb->mv_x, mv_scale);
@@ -488,7 +511,7 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
                     }
                 }
             } else {
-                if (band->inherit_mv) {
+                if (band->inherit_mv && ref_mb) {
                     mb->type = ref_mb->type; /* copy mb_type from corresponding reference mb */
                 } else if (ctx->frame_type == FRAMETYPE_INTRA) {
                     mb->type = 0; /* mb_type is always INTRA for intra-frames */
@@ -514,7 +537,7 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
                 if (!mb->type) {
                     mb->mv_x = mb->mv_y = 0; /* there is no motion vector in intra-macroblocks */
                 } else {
-                    if (band->inherit_mv){
+                    if (band->inherit_mv && ref_mb){
                         /* motion vector inheritance */
                         if (mv_scale) {
                             mb->mv_x = ivi_scale_mv(ref_mb->mv_x, mv_scale);
@@ -537,6 +560,15 @@ static int decode_mb_info(IVI45DecContext *ctx, IVIBandDesc *band,
                 }
             }
 
+            s= band->is_halfpel;
+            if (mb->type)
+            if ( x +  (mb->mv_x   >>s) +                 (y+               (mb->mv_y   >>s))*band->pitch < 0 ||
+                 x + ((mb->mv_x+s)>>s) + band->mb_size - 1
+                   + (y+band->mb_size - 1 +((mb->mv_y+s)>>s))*band->pitch > band->bufsize - 1) {
+                av_log(avctx, AV_LOG_ERROR, "motion vector %d %d outside reference\n", x*s + mb->mv_x, y*s + mb->mv_y);
+                return AVERROR_INVALIDDATA;
+            }
+
             mb++;
             if (ref_mb)
                 ref_mb++;
@@ -647,7 +679,6 @@ static av_cold int decode_init(AVCodecContext *avctx)
     return 0;
 }
 
-
 AVCodec ff_indeo5_decoder = {
     .name           = "indeo5",
     .long_name      = NULL_IF_CONFIG_SMALL("Intel Indeo Video Interactive 5"),
diff --git a/libavcodec/indeo5data.h b/libavcodec/indeo5data.h
index f4252b5..a6217d0 100644
--- a/libavcodec/indeo5data.h
+++ b/libavcodec/indeo5data.h
@@ -2,20 +2,20 @@
  * Indeo Video Interactive 5 compatible decoder
  * Copyright (c) 2009 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/intelh263dec.c b/libavcodec/intelh263dec.c
index cd1971f..6734d44 100644
--- a/libavcodec/intelh263dec.c
+++ b/libavcodec/intelh263dec.c
@@ -1,20 +1,20 @@
 /*
  * H.263i decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,8 +39,7 @@ int ff_intel_h263_decode_picture_header(MpegEncContext *s)
     }
     s->picture_number = get_bits(&s->gb, 8); /* picture timestamp */
 
-    if (get_bits1(&s->gb) != 1) {
-        av_log(s->avctx, AV_LOG_ERROR, "Bad marker\n");
+    if (check_marker(&s->gb, "after picture_number") != 1) {
         return -1;      /* marker */
     }
     if (get_bits1(&s->gb) != 0) {
@@ -60,14 +59,14 @@ int ff_intel_h263_decode_picture_header(MpegEncContext *s)
 
     s->pict_type = AV_PICTURE_TYPE_I + get_bits1(&s->gb);
 
-    s->unrestricted_mv = get_bits1(&s->gb);
-    s->h263_long_vectors = s->unrestricted_mv;
+    s->h263_long_vectors = get_bits1(&s->gb);
 
     if (get_bits1(&s->gb) != 0) {
         av_log(s->avctx, AV_LOG_ERROR, "SAC not supported\n");
         return -1;      /* SAC: off */
     }
     s->obmc= get_bits1(&s->gb);
+    s->unrestricted_mv = s->obmc || s->h263_long_vectors;
     s->pb_frame = get_bits1(&s->gb);
 
     if (format < 6) {
@@ -83,7 +82,7 @@ int ff_intel_h263_decode_picture_header(MpegEncContext *s)
         }
         if(get_bits(&s->gb, 2))
             av_log(s->avctx, AV_LOG_ERROR, "Bad value for reserved field\n");
-        s->loop_filter = get_bits1(&s->gb);
+        s->loop_filter = get_bits1(&s->gb) * !s->avctx->lowres;
         if(get_bits1(&s->gb))
             av_log(s->avctx, AV_LOG_ERROR, "Bad value for reserved field\n");
         if(get_bits1(&s->gb))
@@ -96,7 +95,7 @@ int ff_intel_h263_decode_picture_header(MpegEncContext *s)
     if(format == 6){
         int ar = get_bits(&s->gb, 4);
         skip_bits(&s->gb, 9); // display width
-        skip_bits1(&s->gb);
+        check_marker(&s->gb, "in dimensions");
         skip_bits(&s->gb, 9); // display height
         if(ar == 15){
             s->avctx->sample_aspect_ratio.num = get_bits(&s->gb, 8); // aspect ratio - width
@@ -117,9 +116,8 @@ int ff_intel_h263_decode_picture_header(MpegEncContext *s)
     }
 
     /* PEI */
-    while (get_bits1(&s->gb) != 0) {
-        skip_bits(&s->gb, 8);
-    }
+    if (skip_1stop_8data_bits(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
     s->f_code = 1;
 
     s->y_dc_scale_table=
diff --git a/libavcodec/internal.h b/libavcodec/internal.h
index 4bde09a..000fe26 100644
--- a/libavcodec/internal.h
+++ b/libavcodec/internal.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -53,17 +53,16 @@
  * from the input AVPacket.
  */
 #define FF_CODEC_CAP_SETS_PKT_DTS           (1 << 2)
-
-#ifdef DEBUG
-#   define ff_dlog(ctx, ...) av_log(ctx, AV_LOG_DEBUG, __VA_ARGS__)
-#else
-#   define ff_dlog(ctx, ...) do { } while (0)
-#endif
+/**
+ * The decoder extracts and fills its parameters even if the frame is
+ * skipped due to the skip_frame setting.
+ */
+#define FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM  (1 << 3)
 
 #ifdef TRACE
 #   define ff_tlog(ctx, ...) av_log(ctx, AV_LOG_TRACE, __VA_ARGS__)
 #else
-#   define ff_tlog(ctx, ...) do { } while (0)
+#   define ff_tlog(ctx, ...) do { } while(0)
 #endif
 
 
@@ -71,9 +70,17 @@
 #define FF_DEFAULT_QUANT_BIAS 999999
 #endif
 
-#define FF_SANE_NB_CHANNELS 63U
+#define FF_SANE_NB_CHANNELS 64U
 
-#define FF_SIGNBIT(x) (x >> CHAR_BIT * sizeof(x) - 1)
+#define FF_SIGNBIT(x) ((x) >> CHAR_BIT * sizeof(x) - 1)
+
+#if HAVE_AVX
+#   define STRIDE_ALIGN 32
+#elif HAVE_SIMD_ALIGN_16
+#   define STRIDE_ALIGN 16
+#else
+#   define STRIDE_ALIGN 8
+#endif
 
 typedef struct FramePool {
     /**
@@ -137,6 +144,19 @@ typedef struct AVCodecInternal {
     AVPacket *pkt;
 
     /**
+     * temporary buffer used for encoders to store their bitstream
+     */
+    uint8_t *byte_buffer;
+    unsigned int byte_buffer_size;
+
+    void *frame_thread_encoder;
+
+    /**
+     * Number of audio samples to skip at the start of the next decoded frame
+     */
+    int skip_samples;
+
+    /**
      * hwaccel-specific private data
      */
     void *hwaccel_priv_data;
@@ -160,6 +180,8 @@ struct AVCodecDefault {
     const uint8_t *value;
 };
 
+extern const uint8_t ff_log2_run[41];
+
 /**
  * Return the index into tab at which {a,b} match elements {[0],[1]} of tab.
  * If there is no such matching pair then size is returned.
@@ -168,6 +190,18 @@ int ff_match_2uint16(const uint16_t (*tab)[2], int size, int a, int b);
 
 unsigned int avpriv_toupper4(unsigned int x);
 
+/**
+ * does needed setup of pkt_pts/pos and such for (re)get_buffer();
+ */
+int ff_init_buffer_info(AVCodecContext *s, AVFrame *frame);
+
+
+void ff_color_frame(AVFrame *frame, const int color[4]);
+
+extern volatile int ff_avcodec_locked;
+int ff_lock_avcodec(AVCodecContext *log_ctx, const AVCodec *codec);
+int ff_unlock_avcodec(const AVCodec *codec);
+
 int avpriv_lock_avformat(void);
 int avpriv_unlock_avformat(void);
 
@@ -185,6 +219,7 @@ int avpriv_unlock_avformat(void);
  * ensure the output packet data is large enough, whether provided by the user
  * or allocated in this function.
  *
+ * @param avctx   the AVCodecContext of the encoder
  * @param avpkt   the AVPacket
  *                If avpkt->data is already set, avpkt->size is checked
  *                to ensure it is large enough.
@@ -192,9 +227,20 @@ int avpriv_unlock_avformat(void);
  *                avpkt->size is set to the specified size.
  *                All other AVPacket fields will be reset with av_init_packet().
  * @param size    the minimum required packet size
- * @return        0 on success, negative error code on failure
+ * @param min_size This is a hint to the allocation algorithm, which indicates
+ *                to what minimal size the caller might later shrink the packet
+ *                to. Encoders often allocate packets which are larger than the
+ *                amount of data that is written into them as the exact amount is
+ *                not known at the time of allocation. min_size represents the
+ *                size a packet might be shrunk to by the caller. Can be set to
+ *                0. setting this roughly correctly allows the allocation code
+ *                to choose between several allocation strategies to improve
+ *                speed slightly.
+ * @return        non negative on success, negative error code on failure
  */
-int ff_alloc_packet(AVPacket *avpkt, int size);
+int ff_alloc_packet2(AVCodecContext *avctx, AVPacket *avpkt, int64_t size, int64_t min_size);
+
+attribute_deprecated int ff_alloc_packet(AVPacket *avpkt, int size);
 
 /**
  * Rescale from sample rate to AVCodecContext.time_base.
@@ -202,11 +248,32 @@ int ff_alloc_packet(AVPacket *avpkt, int size);
 static av_always_inline int64_t ff_samples_to_time_base(AVCodecContext *avctx,
                                                         int64_t samples)
 {
+    if(samples == AV_NOPTS_VALUE)
+        return AV_NOPTS_VALUE;
     return av_rescale_q(samples, (AVRational){ 1, avctx->sample_rate },
                         avctx->time_base);
 }
 
 /**
+ * 2^(x) for integer x
+ * @return correctly rounded float
+ */
+static av_always_inline float ff_exp2fi(int x) {
+    /* Guard clauses, largest exponents first. */
+    /* Too large */
+    if (x > 128)
+        return INFINITY;
+    /* Normal range */
+    if (x >= -126)
+        return av_int2float((x + 127) << 23);
+    /* Subnormal numbers */
+    if (x > -150)
+        return av_int2float(1 << (x + 149));
+    /* Negligibly small */
+    return 0;
+}
+
+/**
  * Get a buffer for a frame. This is a wrapper around
  * AVCodecContext.get_buffer() and should be used instead calling get_buffer()
  * directly.
@@ -219,9 +286,27 @@ int ff_get_buffer(AVCodecContext *avctx, AVFrame *frame, int flags);
  */
 int ff_reget_buffer(AVCodecContext *avctx, AVFrame *frame);
 
-const uint8_t *avpriv_find_start_code(const uint8_t *restrict p,
+int ff_thread_can_start_frame(AVCodecContext *avctx);
+
+int avpriv_h264_has_num_reorder_frames(AVCodecContext *avctx);
+
+/**
+ * Call avcodec_open2 recursively by decrementing counter, unlocking mutex,
+ * calling the function and then restoring again. Assumes the mutex is
+ * already locked
+ */
+int ff_codec_open2_recursive(AVCodecContext *avctx, const AVCodec *codec, AVDictionary **options);
+
+/**
+ * Finalize buf into extradata and set its size appropriately.
+ */
+int avpriv_bprint_to_extradata(AVCodecContext *avctx, struct AVBPrint *buf);
+
+const uint8_t *avpriv_find_start_code(const uint8_t *p,
                                       const uint8_t *end,
-                                      uint32_t *restrict state);
+                                      uint32_t *state);
+
+int avpriv_codec_get_cap_skip_frame_fill_param(const AVCodec *codec);
 
 /**
  * Check that the provided frame dimensions are valid and set them on the codec
@@ -258,4 +343,21 @@ int ff_decode_frame_props(AVCodecContext *avctx, AVFrame *frame);
  */
 AVCPBProperties *ff_add_cpb_side_data(AVCodecContext *avctx);
 
+int ff_side_data_set_encoder_stats(AVPacket *pkt, int quality, int64_t *error, int error_count, int pict_type);
+
+/**
+ * Check AVFrame for A53 side data and allocate and fill SEI message with A53 info
+ *
+ * @param frame      Raw frame to get A53 side data from
+ * @param prefix_len Number of bytes to allocate before SEI message
+ * @param data       Pointer to a variable to store allocated memory
+ *                   Upon return the variable will hold NULL on error or if frame has no A53 info.
+ *                   Otherwise it will point to prefix_len uninitialized bytes followed by
+ *                   *sei_size SEI message
+ * @param sei_size   Pointer to a variable to store generated SEI message length
+ * @return           Zero on success, negative error code on failure
+ */
+int ff_alloc_a53_sei(const AVFrame *frame, size_t prefix_len,
+                     void **data, size_t *sei_size);
+
 #endif /* AVCODEC_INTERNAL_H */
diff --git a/libavcodec/interplayacm.c b/libavcodec/interplayacm.c
new file mode 100644
index 0000000..a676bcb
--- /dev/null
+++ b/libavcodec/interplayacm.c
@@ -0,0 +1,615 @@
+/*
+ * Interplay ACM decoder
+ *
+ * Copyright (c) 2004-2008 Marko Kreen
+ * Copyright (c) 2008 Adam Gashlin
+ * Copyright (c) 2015 Paul B Mahol
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "libavutil/intreadwrite.h"
+#include "avcodec.h"
+#define BITSTREAM_READER_LE
+#include "get_bits.h"
+#include "internal.h"
+
+static const int8_t map_1bit[]      = { -1, +1 }; /* 1-bit code -> midbuf index; used by k13, k12, k35, k34 */
+static const int8_t map_2bit_near[] = { -2, -1, +1, +2 }; /* 2-bit code -> midbuf index; used by k24, k23 */
+static const int8_t map_2bit_far[]  = { -3, -2, +2, +3 }; /* 2-bit code -> midbuf index; used by k35, k34 */
+static const int8_t map_3bit[]      = { -4, -3, -2, -1, +1, +2, +3, +4 }; /* 3-bit code -> midbuf index; used by k45, k44 */
+
+static int mul_3x3 [3 * 3 * 3]; /* t15 lookup: three base-3 digits, one per nibble; filled in decode_init() */
+static int mul_3x5 [5 * 5 * 5]; /* t27 lookup: three base-5 digits, one per nibble; filled in decode_init() */
+static int mul_2x11[11  *  11]; /* t37 lookup: two base-11 digits, one per nibble; filled in decode_init() */
+
+typedef struct InterplayACMContext {
+    GetBitContext gb;       /* bit reader the fillers and decode_block() pull from */
+    uint8_t *bitstream;     /* max_framesize bytes, allocated in decode_init() */
+    int max_framesize;      /* set equal to block_len in decode_init() */
+    int bitstream_size;     /* NOTE(review): presumably the bytes currently held in bitstream - confirm in decode_frame() */
+    int bitstream_index;
+
+    int level;              /* low 4 bits of the extradata word at offset 12 */
+    int rows;               /* remaining upper bits of that same word (value >> 4) */
+    int cols;               /* 1 << level */
+    int wrapbuf_len;        /* 2 * cols - 2 */
+    int block_len;          /* rows * cols */
+    int skip;
+
+    int *block;             /* block_len entries, addressed as (row << level) + col by set_pos() */
+    int *wrapbuf;           /* wrapbuf_len entries; juggle() stores two carried values per column here */
+    int *ampbuf;            /* 0x10000 entries backing midbuf */
+    int *midbuf;            /* ampbuf + 0x8000, so negative indices are valid */
+} InterplayACMContext;
+
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    InterplayACMContext *s = avctx->priv_data;
+    int x1, x2, x3;
+    /* Read the block geometry from extradata, allocate the work buffers, build the digit tables. */
+    if (avctx->extradata_size < 14)
+        return AVERROR_INVALIDDATA;
+    s->level = AV_RL16(avctx->extradata + 12) & 0xf;
+    s->rows  = AV_RL16(avctx->extradata + 12) >>  4;
+    s->cols  = 1 << s->level;
+    s->wrapbuf_len = 2 * s->cols - 2;
+    s->block_len = s->rows * s->cols;
+    s->max_framesize = s->block_len;
+
+    s->block   = av_calloc(s->block_len, sizeof(int));
+    s->wrapbuf = av_calloc(s->wrapbuf_len, sizeof(int));
+    s->ampbuf  = av_calloc(0x10000, sizeof(int));
+    s->bitstream = av_calloc(s->max_framesize, sizeof(*s->bitstream));
+    if (!s->block || !s->wrapbuf || !s->ampbuf || !s->bitstream) {
+        av_freep(&s->block);  av_freep(&s->wrapbuf);   /* drop the partial allocations so a failed */
+        av_freep(&s->ampbuf); av_freep(&s->bitstream); /* init leaks nothing; pointers end up NULL */
+        return AVERROR(ENOMEM);
+    }
+    s->midbuf  = s->ampbuf + 0x8000; /* centre of ampbuf: indices -0x8000..0x7fff stay inside it */
+    avctx->sample_fmt = AV_SAMPLE_FMT_S16;
+
+    for (x3 = 0; x3 < 3; x3++) /* digit k of the table index is stored in nibble k of the entry */
+        for (x2 = 0; x2 < 3; x2++)
+            for (x1 = 0; x1 < 3; x1++)
+                mul_3x3[x1 + x2 * 3 + x3* 3 * 3] = x1 + (x2 << 4) + (x3 << 8);
+    for (x3 = 0; x3 < 5; x3++)
+        for (x2 = 0; x2 < 5; x2++)
+            for (x1 = 0; x1 < 5; x1++)
+                mul_3x5[x1 + x2 * 5 + x3 * 5 * 5] = x1 + (x2 << 4) + (x3 << 8);
+    for (x2 = 0; x2 < 11; x2++)
+        for (x1 = 0; x1 < 11; x1++)
+            mul_2x11[x1 + x2 * 11] = x1 + (x2 << 4);
+    return 0;
+}
+
+#define set_pos(s, r, c, idx) do { /* block[r][c] = midbuf[idx] */ \
+        unsigned pos = ((r) << s->level) + (c); /* r * cols + c, since cols == 1 << level */ \
+        s->block[pos] = s->midbuf[(idx)];       /* idx may be negative; midbuf sits mid-buffer */ \
+    } while (0)
+
+static int zero(InterplayACMContext *s, unsigned ind, unsigned col)
+{
+    unsigned row = 0;
+    /* Filler 0: every row of this column takes midbuf[0]; no bits are read. */
+    while (row < s->rows)
+        set_pos(s, row++, col, 0);
+    return 0;
+}
+
+static int bad(InterplayACMContext *s, unsigned ind, unsigned col)
+{ /* Fillers 1, 2, 25, 28, 30, 31 in filler_list: always fails, reads nothing. */
+    return AVERROR_INVALIDDATA;
+}
+
+static int linear(InterplayACMContext *s, unsigned ind, unsigned col)
+{
+    GetBitContext *gb = &s->gb;
+    unsigned int i;
+    int b, middle = 1 << (ind - 1);
+    /* Fillers 3..16: each row is a raw ind-bit value, re-centred by subtracting middle. */
+    for (i = 0; i < s->rows; i++) {
+        b = get_bits(gb, ind);
+        set_pos(s, i, col, b - middle);
+    }
+    return 0;
+}
+
+static int k13(InterplayACMContext *s, unsigned ind, unsigned col)
+{
+    GetBitContext *gb = &s->gb;
+    unsigned i, b;
+    /* Filler 17. Bits per step: 0 -> two rows of midbuf[0]; 1,0 -> one such row; 1,1,x -> map_1bit[x]. */
+    for (i = 0; i < s->rows; i++) {
+        b = get_bits1(gb);
+        if (b == 0) {
+            set_pos(s, i++, col, 0);
+            if (i >= s->rows) /* the column can end between the two rows */
+                break;
+            set_pos(s, i, col, 0);
+            continue;
+        }
+        b = get_bits1(gb);
+        if (b == 0) {
+            set_pos(s, i, col, 0);
+            continue;
+        }
+        b = get_bits1(gb);
+        set_pos(s, i, col, map_1bit[b]);
+    }
+    return 0;
+}
+
+static int k12(InterplayACMContext *s, unsigned ind, unsigned col)
+{
+    GetBitContext *gb = &s->gb;
+    unsigned i, b;
+    /* Filler 18. Bits per row: 0 -> midbuf[0]; 1,x -> midbuf[map_1bit[x]]. */
+    for (i = 0; i < s->rows; i++) {
+        b = get_bits1(gb);
+        if (b == 0) {
+            set_pos(s, i, col, 0);
+            continue;
+        }
+
+        b = get_bits1(gb);
+        set_pos(s, i, col, map_1bit[b]);
+    }
+    return 0;
+}
+
+static int k24(InterplayACMContext *s, unsigned ind, unsigned col)
+{
+    GetBitContext *gb = &s->gb;
+    unsigned i, b;
+    /* Filler 20. Bits per step: 0 -> two rows of midbuf[0]; 1,0 -> one such row; 1,1,xx -> map_2bit_near[xx]. */
+    for (i = 0; i < s->rows; i++) {
+        b = get_bits1(gb);
+        if (b == 0) {
+            set_pos(s, i++, col, 0);
+            if (i >= s->rows) break; /* the column can end between the two rows */
+            set_pos(s, i, col, 0);
+            continue;
+        }
+
+        b = get_bits1(gb);
+        if (b == 0) {
+            set_pos(s, i, col, 0);
+            continue;
+        }
+
+        b = get_bits(gb, 2);
+        set_pos(s, i, col, map_2bit_near[b]);
+    }
+    return 0;
+}
+
+static int k23(InterplayACMContext *s, unsigned ind, unsigned col)
+{
+    GetBitContext *gb = &s->gb;
+    unsigned i, b;
+    /* Filler 21. Bits per row: 0 -> midbuf[0]; 1,xx -> midbuf[map_2bit_near[xx]]. */
+    for (i = 0; i < s->rows; i++) {
+        b = get_bits1(gb);
+        if (b == 0) {
+            set_pos(s, i, col, 0);
+            continue;
+        }
+
+        b = get_bits(gb, 2);
+        set_pos(s, i, col, map_2bit_near[b]);
+    }
+    return 0;
+}
+
+static int k35(InterplayACMContext *s, unsigned ind, unsigned col)
+{
+    GetBitContext *gb = &s->gb;
+    unsigned i, b;
+    /* Filler 23. 0 -> two rows of midbuf[0]; 1,0 -> one; 1,1,0,x -> map_1bit[x]; 1,1,1,xx -> map_2bit_far[xx]. */
+    for (i = 0; i < s->rows; i++) {
+        b = get_bits1(gb);
+        if (b == 0) {
+            set_pos(s, i++, col, 0);
+            if (i >= s->rows) /* the column can end between the two rows */
+                break;
+            set_pos(s, i, col, 0);
+            continue;
+        }
+
+        b = get_bits1(gb);
+        if (b == 0) {
+            set_pos(s, i, col, 0);
+            continue;
+        }
+
+        b = get_bits1(gb);
+        if (b == 0) {
+            b = get_bits1(gb);
+            set_pos(s, i, col, map_1bit[b]);
+            continue;
+        }
+
+        b = get_bits(gb, 2);
+        set_pos(s, i, col, map_2bit_far[b]);
+    }
+    return 0;
+}
+
+static int k34(InterplayACMContext *s, unsigned ind, unsigned col)
+{
+    GetBitContext *gb = &s->gb;
+    unsigned i, b;
+    /* Filler 24. Bits per row: 0 -> midbuf[0]; 1,0,x -> map_1bit[x]; 1,1,xx -> map_2bit_far[xx]. */
+    for (i = 0; i < s->rows; i++) {
+        b = get_bits1(gb);
+        if (b == 0) {
+            set_pos(s, i, col, 0);
+            continue;
+        }
+
+        b = get_bits1(gb);
+        if (b == 0) {
+            b = get_bits1(gb);
+            set_pos(s, i, col, map_1bit[b]);
+            continue;
+        }
+
+        b = get_bits(gb, 2);
+        set_pos(s, i, col, map_2bit_far[b]);
+    }
+    return 0;
+}
+
+static int k45(InterplayACMContext *s, unsigned ind, unsigned col)
+{
+    GetBitContext *gb = &s->gb;
+    unsigned i, b;
+    /* Filler 26. Bits per step: 0 -> two rows of midbuf[0]; 1,0 -> one such row; 1,1,xxx -> map_3bit[xxx]. */
+    for (i = 0; i < s->rows; i++) {
+        b = get_bits1(gb);
+        if (b == 0) {
+            set_pos(s, i, col, 0); i++;
+            if (i >= s->rows) /* the column can end between the two rows */
+                break;
+            set_pos(s, i, col, 0);
+            continue;
+        }
+
+        b = get_bits1(gb);
+        if (b == 0) {
+            set_pos(s, i, col, 0);
+            continue;
+        }
+
+        b = get_bits(gb, 3);
+        set_pos(s, i, col, map_3bit[b]);
+    }
+    return 0;
+}
+
+static int k44(InterplayACMContext *s, unsigned ind, unsigned col)
+{
+    GetBitContext *gb = &s->gb;
+    unsigned i, b;
+    /* Filler 27. Bits per row: 0 -> midbuf[0]; 1,xxx -> midbuf[map_3bit[xxx]]. */
+    for (i = 0; i < s->rows; i++) {
+        b = get_bits1(gb);
+        if (b == 0) {
+            set_pos(s, i, col, 0);
+            continue;
+        }
+
+        b = get_bits(gb, 3);
+        set_pos(s, i, col, map_3bit[b]);
+    }
+    return 0;
+}
+
+static int t15(InterplayACMContext *s, unsigned ind, unsigned col)
+{
+    GetBitContext *gb = &s->gb;
+    unsigned i, b;
+    int n1, n2, n3;
+    /* Filler 19: each 5-bit code carries three base-3 digits, one per consecutive row. */
+    for (i = 0; i < s->rows; i++) {
+        /* b = (x1) + (x2 * 3) + (x3 * 9) */
+        b = get_bits(gb, 5);
+        if (b >= sizeof(mul_3x3) / sizeof(mul_3x3[0])) /* 5 bits reach 31, the table ends at 26 */
+            return AVERROR_INVALIDDATA;
+        n1 =  (mul_3x3[b] & 0x0F) - 1;
+        n2 = ((mul_3x3[b] >> 4) & 0x0F) - 1;
+        n3 = ((mul_3x3[b] >> 8) & 0x0F) - 1;
+        set_pos(s, i++, col, n1);
+        if (i >= s->rows)
+            break;
+        set_pos(s, i++, col, n2);
+        if (i >= s->rows)
+            break;
+        set_pos(s, i, col, n3);
+    }
+    return 0;
+}
+
+static int t27(InterplayACMContext *s, unsigned ind, unsigned col)
+{
+    GetBitContext *gb = &s->gb;
+    unsigned i, b;
+    int n1, n2, n3;
+    /* Filler 22: each 7-bit code carries three base-5 digits, one per consecutive row. */
+    for (i = 0; i < s->rows; i++) {
+        /* b = (x1) + (x2 * 5) + (x3 * 25) */
+        b = get_bits(gb, 7);
+        if (b >= sizeof(mul_3x5) / sizeof(mul_3x5[0])) /* 7 bits reach 127, the table ends at 124 */
+            return AVERROR_INVALIDDATA;
+        n1 =  (mul_3x5[b] & 0x0F) - 2;
+        n2 = ((mul_3x5[b] >> 4) & 0x0F) - 2;
+        n3 = ((mul_3x5[b] >> 8) & 0x0F) - 2;
+        set_pos(s, i++, col, n1);
+        if (i >= s->rows)
+            break;
+        set_pos(s, i++, col, n2);
+        if (i >= s->rows)
+            break;
+        set_pos(s, i, col, n3);
+    }
+    return 0;
+}
+
+static int t37(InterplayACMContext *s, unsigned ind, unsigned col)
+{
+    GetBitContext *gb = &s->gb;
+    unsigned i, b;
+    int n1, n2;
+    for (i = 0; i < s->rows; i++) {
+        /* Filler 29: two base-11 digits per 7-bit code, b = (x1) + (x2 * 11) */
+        b = get_bits(gb, 7);
+        if (b >= sizeof(mul_2x11) / sizeof(mul_2x11[0])) /* 7 bits reach 127, the table ends at 120 */
+            return AVERROR_INVALIDDATA;
+        n1 =  (mul_2x11[b] & 0x0F) - 5;
+        n2 = ((mul_2x11[b] >> 4) & 0x0F) - 5;
+        set_pos(s, i++, col, n1);
+        if (i >= s->rows)
+            break;
+        set_pos(s, i, col, n2);
+    }
+    return 0;
+}
+
+typedef int (*filler)(InterplayACMContext *s, unsigned ind, unsigned col); /* fills one column; negative on error */
+
+static const filler filler_list[] = { /* indexed by the 5-bit selector read in fill_block() */
+    zero,   bad,    bad,    linear, /*  0.. 3 */
+    linear, linear, linear, linear, /*  4.. 7 */
+    linear, linear, linear, linear, /*  8..11 */
+    linear, linear, linear, linear, /* 12..15 */
+    linear, k13,    k12,    t15,    /* 16..19 */
+    k24,    k23,    t27,    k35,    /* 20..23 */
+    k34,    bad,    k45,    k44,    /* 24..27 */
+    bad,    t37,    bad,    bad,    /* 28..31 */
+};
+
+static int fill_block(InterplayACMContext *s)
+{
+    unsigned col;
+    /* Each column opens with a 5-bit selector naming the filler that decodes it. */
+    for (col = 0; col < s->cols; col++) {
+        const unsigned sel = get_bits(&s->gb, 5);
+        const int err = filler_list[sel](s, sel, col);
+
+        if (err < 0)
+            return err;
+    }
+
+    return 0;
+}
+
+static void juggle(int *wrap_p, int *block_p, unsigned sub_len, unsigned sub_count)
+{
+    unsigned i, j;
+    int *p, r0, r1, r2, r3;
+    /* Filter sub_len interleaved columns in place; each holds sub_count values at stride sub_len. */
+    for (i = 0; i < sub_len; i++) {
+        p = block_p;
+        r0 = wrap_p[0];                     /* two carried values for this column */
+        r1 = wrap_p[1];
+        for (j = 0; j < sub_count/2; j++) { /* two values rewritten per iteration */
+            r2 = *p;
+            *p = r1 * 2 + (r0 + r2);
+            p += sub_len;
+            r3 = *p;
+            *p = r2 * 2 - (r1 + r3);
+            p += sub_len;
+            r0 = r2;                        /* the values read, not the ones written, carry on */
+            r1 = r3;
+        }
+
+        *wrap_p++ = r0;                     /* stored back for a later call on the same column */
+        *wrap_p++ = r1;
+        block_p++;
+    }
+}
+
+static void juggle_block(InterplayACMContext *s)
+{
+    unsigned sub_count, sub_len, todo_count, step_subcount, i;
+    int *wrap_p, *block_p, *p;
+    /* Run the juggle() cascade over s->block in place, at most step_subcount rows per pass. */
+    /* juggle only if subblock_len > 1 */
+    if (s->level == 0)
+        return;
+
+    /* rows per pass: (2048 / subblock_len) - 2, or 1 once level exceeds 9 */
+    if (s->level > 9)
+        step_subcount = 1;
+    else
+        step_subcount = (2048 >> s->level) - 2;
+
+    /* Apply juggle()  (rows)x(cols)
+     * from (step_subcount * 2)            x (subblock_len/2)
+     * to   (step_subcount * subblock_len) x (1)
+     */
+    todo_count = s->rows;
+    block_p = s->block;
+    while (1) {
+        wrap_p = s->wrapbuf;            /* every pass starts again at the head of wrapbuf */
+        sub_count = step_subcount;
+        if (sub_count > todo_count)
+            sub_count = todo_count;     /* last pass may cover fewer rows */
+
+        sub_len = s->cols / 2;
+        sub_count *= 2;
+
+        juggle(wrap_p, block_p, sub_len, sub_count);
+        wrap_p += sub_len * 2;          /* each stage owns 2 * sub_len wrapbuf slots */
+
+        for (i = 0, p = block_p; i < sub_count; i++) {
+            p[0]++;                     /* add one to the first value of every sub_len-wide run */
+            p += sub_len;
+        }
+
+        while (sub_len > 1) {           /* halve the width, double the count, until width 1 */
+            sub_len /= 2;
+            sub_count *= 2;
+            juggle(wrap_p, block_p, sub_len, sub_count);
+            wrap_p += sub_len * 2;      /* stages total 2 * cols - 2 slots == wrapbuf_len */
+        }
+
+        if (todo_count <= step_subcount)
+            break;
+
+        todo_count -= step_subcount;
+        block_p += step_subcount << s->level; /* skip step_subcount rows of cols values */
+    }
+}
+
+/* Decode one block: read a 4-bit power and 16-bit step, build the value table in s->midbuf, then fill and juggle s->block. Returns 0 or a negative error from fill_block(). */
+static int decode_block(InterplayACMContext *s)
+{
+    GetBitContext *gb = &s->gb;
+    int pwr, count, val, i, ret;
+
+    pwr = get_bits(gb, 4);
+    val = get_bits(gb, 16);
+
+    count = 1 << pwr;
+
+    /* midbuf[k] = k * val for -count <= k < count. Entries are computed
+     * directly rather than with a running sum, because a running sum steps
+     * once past the last stored value and that extra step overflows int
+     * (undefined behaviour) for pwr == 15 with a large val. The largest
+     * magnitude stored is 32768 * 65535, which fits in a 32-bit int. */
+    for (i = 0; i < count; i++)
+        s->midbuf[i] = i * val;
+    for (i = 1; i <= count; i++)
+        s->midbuf[-i] = -i * val;
+
+    ret = fill_block(s);
+    if (ret < 0)
+        return ret;
+
+    juggle_block(s);
+    return 0;
+}
+
+/* Accumulate packet data in s->bitstream; once max_framesize bytes are held (or when draining) decode one block to int16 samples. Returns bytes consumed or a negative error. */
+static int decode_frame(AVCodecContext *avctx, void *data,
+                        int *got_frame_ptr, AVPacket *pkt)
+{
+    InterplayACMContext *s = avctx->priv_data;
+    GetBitContext *gb = &s->gb;
+    AVFrame *frame = data;
+    const uint8_t *buf;
+    int16_t *samples;
+    int ret, n, buf_size, input_buf_size;
+
+    if (!pkt->size && !s->bitstream_size) { /* nothing new and nothing buffered */
+        *got_frame_ptr = 0;
+        return 0;
+    }
+
+    buf_size = FFMIN(pkt->size, s->max_framesize - s->bitstream_size);
+    input_buf_size = buf_size;
+    if (s->bitstream_index + s->bitstream_size + buf_size > s->max_framesize) {
+        memmove(s->bitstream, &s->bitstream[s->bitstream_index], s->bitstream_size);
+        s->bitstream_index = 0;
+    }
+    if (pkt->data)
+        memcpy(&s->bitstream[s->bitstream_index + s->bitstream_size], pkt->data, buf_size);
+    buf                = &s->bitstream[s->bitstream_index];
+    buf_size          += s->bitstream_size;
+    s->bitstream_size  = buf_size;
+    if (buf_size < s->max_framesize && pkt->data) { /* keep buffering until full, unless draining */
+        *got_frame_ptr = 0;
+        return input_buf_size;
+    }
+
+    if ((ret = init_get_bits8(gb, buf, buf_size)) < 0)
+        return ret;
+
+    frame->nb_samples = s->block_len / avctx->channels;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
+    skip_bits(gb, s->skip);
+    ret = decode_block(s);
+    if (ret < 0)
+        return ret;
+
+    samples = (int16_t *)frame->data[0];
+    for (n = 0; n < frame->nb_samples * avctx->channels; n++)
+        *samples++ = s->block[n] >> s->level;
+
+    *got_frame_ptr = 1;
+    s->skip = get_bits_count(gb) - 8 * (get_bits_count(gb) / 8); /* sub-byte bit offset skipped at the start of the next call */
+    n = get_bits_count(gb) / 8;
+
+    if (n > buf_size && pkt->data) {
+        s->bitstream_size = 0;
+        s->bitstream_index = 0;
+        return AVERROR_INVALIDDATA;
+    }
+
+    /* While draining, n may exceed what is buffered; clamp so bitstream_size can never become negative. */
+    if (s->bitstream_size > 0) {
+        s->bitstream_index += n;
+        s->bitstream_size  -= FFMIN(s->bitstream_size, n);
+        return input_buf_size;
+    }
+    return n;
+}
+
+/* Release the block, wrap, amplitude and bitstream buffers of the decoder context and mark the bitstream as empty. */
+static av_cold int decode_close(AVCodecContext *avctx)
+{
+    InterplayACMContext *const ctx = avctx->priv_data;
+
+    ctx->bitstream_size = 0;
+    av_freep(&ctx->bitstream);
+    av_freep(&ctx->ampbuf);
+    av_freep(&ctx->wrapbuf);
+    av_freep(&ctx->block);
+    return 0;
+}
+
+AVCodec ff_interplay_acm_decoder = { /* codec registration entry tying the callbacks in this file to AV_CODEC_ID_INTERPLAY_ACM */
+    .name           = "interplayacm",
+    .long_name      = NULL_IF_CONFIG_SMALL("Interplay ACM"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_INTERPLAY_ACM,
+    .init           = decode_init,
+    .close          = decode_close, /* frees the buffers held in the private context */
+    .decode         = decode_frame,
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1, /* decode_frame() buffers input and still decodes buffered data when handed an empty packet */
+    .priv_data_size = sizeof(InterplayACMContext),
+};
diff --git a/libavcodec/interplayvideo.c b/libavcodec/interplayvideo.c
index e41fc34..88c610d 100644
--- a/libavcodec/interplayvideo.c
+++ b/libavcodec/interplayvideo.c
@@ -2,20 +2,20 @@
  * Interplay MVE Video Decoder
  * Copyright (C) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,6 +38,7 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include "libavutil/intreadwrite.h"
 #include "avcodec.h"
 #include "bytestream.h"
 #include "hpeldsp.h"
@@ -72,10 +73,10 @@ static int copy_from(IpvideoContext *s, AVFrame *src, AVFrame *dst, int delta_x,
     int motion_offset = current_offset + delta_y * dst->linesize[0]
                        + delta_x * (1 + s->is_16bpp);
     if (motion_offset < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, " Interplay video: motion offset < 0 (%d)\n", motion_offset);
+        av_log(s->avctx, AV_LOG_ERROR, "motion offset < 0 (%d)\n", motion_offset);
         return AVERROR_INVALIDDATA;
     } else if (motion_offset > s->upper_motion_limit_offset) {
-        av_log(s->avctx, AV_LOG_ERROR, " Interplay video: motion offset above limit (%d >= %d)\n",
+        av_log(s->avctx, AV_LOG_ERROR, "motion offset above limit (%d >= %d)\n",
             motion_offset, s->upper_motion_limit_offset);
         return AVERROR_INVALIDDATA;
     }
@@ -118,7 +119,7 @@ static int ipvideo_decode_block_opcode_0x2(IpvideoContext *s, AVFrame *frame)
         y =   8 + ((B - 56) / 29);
     }
 
-    ff_dlog(NULL, "    motion byte = %d, (x, y) = (%d, %d)\n", B, x, y);
+    ff_tlog(s->avctx, "motion byte = %d, (x, y) = (%d, %d)\n", B, x, y);
     return copy_from(s, s->second_last_frame, frame, x, y);
 }
 
@@ -144,7 +145,7 @@ static int ipvideo_decode_block_opcode_0x3(IpvideoContext *s, AVFrame *frame)
         y = -(  8 + ((B - 56) / 29));
     }
 
-    ff_dlog(NULL, "    motion byte = %d, (x, y) = (%d, %d)\n", B, x, y);
+    ff_tlog(s->avctx, "motion byte = %d, (x, y) = (%d, %d)\n", B, x, y);
     return copy_from(s, frame, frame, x, y);
 }
 
@@ -165,7 +166,7 @@ static int ipvideo_decode_block_opcode_0x4(IpvideoContext *s, AVFrame *frame)
     x = -8 + BL;
     y = -8 + BH;
 
-    ff_dlog(NULL, "    motion byte = %d, (x, y) = (%d, %d)\n", B, x, y);
+    ff_tlog(s->avctx, "motion byte = %d, (x, y) = (%d, %d)\n", B, x, y);
     return copy_from(s, s->last_frame, frame, x, y);
 }
 
@@ -178,14 +179,14 @@ static int ipvideo_decode_block_opcode_0x5(IpvideoContext *s, AVFrame *frame)
     x = bytestream2_get_byte(&s->stream_ptr);
     y = bytestream2_get_byte(&s->stream_ptr);
 
-    ff_dlog(NULL, "    motion bytes = %d, %d\n", x, y);
+    ff_tlog(s->avctx, "motion bytes = %d, %d\n", x, y);
     return copy_from(s, s->last_frame, frame, x, y);
 }
 
 static int ipvideo_decode_block_opcode_0x6(IpvideoContext *s, AVFrame *frame)
 {
     /* mystery opcode? skip multiple blocks? */
-    av_log(s->avctx, AV_LOG_ERROR, "  Interplay video: Help! Mystery opcode 0x6 seen\n");
+    av_log(s->avctx, AV_LOG_ERROR, "Help! Mystery opcode 0x6 seen\n");
 
     /* report success */
     return 0;
@@ -197,6 +198,11 @@ static int ipvideo_decode_block_opcode_0x7(IpvideoContext *s, AVFrame *frame)
     unsigned char P[2];
     unsigned int flags;
 
+    if (bytestream2_get_bytes_left(&s->stream_ptr) < 4) {
+        av_log(s->avctx, AV_LOG_ERROR, "too little data for opcode 0x7\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     /* 2-color encoding */
     P[0] = bytestream2_get_byte(&s->stream_ptr);
     P[1] = bytestream2_get_byte(&s->stream_ptr);
@@ -236,6 +242,11 @@ static int ipvideo_decode_block_opcode_0x8(IpvideoContext *s, AVFrame *frame)
     unsigned char P[4];
     unsigned int flags = 0;
 
+    if (bytestream2_get_bytes_left(&s->stream_ptr) < 12) {
+        av_log(s->avctx, AV_LOG_ERROR, "too little data for opcode 0x8\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     /* 2-color encoding for each 4x4 quadrant, or 2-color encoding on
      * either top and bottom or left and right halves */
     P[0] = bytestream2_get_byte(&s->stream_ptr);
@@ -308,6 +319,11 @@ static int ipvideo_decode_block_opcode_0x9(IpvideoContext *s, AVFrame *frame)
     int x, y;
     unsigned char P[4];
 
+    if (bytestream2_get_bytes_left(&s->stream_ptr) < 8) {
+        av_log(s->avctx, AV_LOG_ERROR, "too little data for opcode 0x9\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     /* 4-color encoding */
     bytestream2_get_buffer(&s->stream_ptr, P, 4);
 
@@ -374,6 +390,11 @@ static int ipvideo_decode_block_opcode_0xA(IpvideoContext *s, AVFrame *frame)
     unsigned char P[8];
     int flags = 0;
 
+    if (bytestream2_get_bytes_left(&s->stream_ptr) < 16) {
+        av_log(s->avctx, AV_LOG_ERROR, "too little data for opcode 0xA\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     bytestream2_get_buffer(&s->stream_ptr, P, 4);
 
     /* 4-color encoding for each 4x4 quadrant, or 4-color encoding on
@@ -467,6 +488,11 @@ static int ipvideo_decode_block_opcode_0xD(IpvideoContext *s, AVFrame *frame)
     int y;
     unsigned char P[2];
 
+    if (bytestream2_get_bytes_left(&s->stream_ptr) < 4) {
+        av_log(s->avctx, AV_LOG_ERROR, "too little data for opcode 0xD\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     /* 4-color block encoding: each 4x4 block is a different color */
     for (y = 0; y < 8; y++) {
         if (!(y & 3)) {
@@ -528,7 +554,7 @@ static int ipvideo_decode_block_opcode_0x6_16(IpvideoContext *s, AVFrame *frame)
     x = bytestream2_get_byte(&s->stream_ptr);
     y = bytestream2_get_byte(&s->stream_ptr);
 
-    ff_dlog(NULL, "    motion bytes = %d, %d\n", x, y);
+    ff_tlog(s->avctx, "motion bytes = %d, %d\n", x, y);
     return copy_from(s, s->second_last_frame, frame, x, y);
 }
 
@@ -903,7 +929,7 @@ static void ipvideo_decode_opcodes(IpvideoContext *s, AVFrame *frame)
         for (x = 0; x < s->avctx->width; x += 8) {
             opcode = get_bits(&gb, 4);
 
-            ff_dlog(s->avctx,
+            ff_tlog(s->avctx,
                     "  block @ (%3d, %3d): encoding 0x%X, data ptr offset %d\n",
                     x, y, opcode, bytestream2_tell(&s->stream_ptr));
 
@@ -917,15 +943,15 @@ static void ipvideo_decode_opcodes(IpvideoContext *s, AVFrame *frame)
                 ret = ipvideo_decode_block16[opcode](s, frame);
             }
             if (ret != 0) {
-                av_log(s->avctx, AV_LOG_ERROR, " Interplay video: decode problem on frame %d, @ block (%d, %d)\n",
+                av_log(s->avctx, AV_LOG_ERROR, "decode problem on frame %d, @ block (%d, %d)\n",
                        s->avctx->frame_number, x, y);
                 return;
             }
         }
     }
     if (bytestream2_get_bytes_left(&s->stream_ptr) > 1) {
-        av_log(s->avctx, AV_LOG_ERROR,
-               "Interplay video: decode finished with %d bytes left over\n",
+        av_log(s->avctx, AV_LOG_DEBUG,
+               "decode finished with %d bytes left over\n",
                bytestream2_get_bytes_left(&s->stream_ptr));
     }
 }
@@ -962,22 +988,28 @@ static int ipvideo_decode_frame(AVCodecContext *avctx,
     AVFrame *frame = data;
     int ret;
 
+    if (buf_size < 2)
+        return AVERROR_INVALIDDATA;
+
     /* decoding map contains 4 bits of information per 8x8 block */
-    s->decoding_map_size = avctx->width * avctx->height / (8 * 8 * 2);
+    s->decoding_map_size = AV_RL16(avpkt->data);
 
     /* compressed buffer needs to be large enough to at least hold an entire
      * decoding map */
-    if (buf_size < s->decoding_map_size)
+    if (buf_size < s->decoding_map_size + 2)
         return buf_size;
 
-    s->decoding_map = buf;
-    bytestream2_init(&s->stream_ptr, buf + s->decoding_map_size,
+    if (av_packet_get_side_data(avpkt, AV_PKT_DATA_PARAM_CHANGE, NULL)) {
+        av_frame_unref(s->last_frame);
+        av_frame_unref(s->second_last_frame);
+    }
+
+    s->decoding_map = buf + 2;
+    bytestream2_init(&s->stream_ptr, buf + 2 + s->decoding_map_size,
                      buf_size - s->decoding_map_size);
 
-    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "  Interplay Video: get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
         return ret;
-    }
 
     if (!s->is_16bpp) {
         const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, NULL);
diff --git a/libavcodec/intrax8.c b/libavcodec/intrax8.c
index eb4c1ef..1e881fc 100644
--- a/libavcodec/intrax8.c
+++ b/libavcodec/intrax8.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -21,6 +21,7 @@
  * @brief IntraX8 (J-Frame) subdecoder, used by WMV2 and VC-1
  */
 
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "get_bits.h"
 #include "idctdsp.h"
@@ -134,7 +135,7 @@ static inline void x8_select_ac_table(IntraX8Context *const w, int mode)
 {
     int table_index;
 
-    assert(mode < 4);
+    av_assert2(mode < 4);
 
     if (w->j_ac_vlc[mode])
         return;
@@ -142,8 +143,7 @@ static inline void x8_select_ac_table(IntraX8Context *const w, int mode)
     table_index       = get_bits(w->gb, 3);
     // 2 modes use same tables
     w->j_ac_vlc[mode] = &j_ac_vlc[w->quant < 13][mode >> 1][table_index];
-
-    assert(w->j_ac_vlc[mode]);
+    av_assert2(w->j_ac_vlc[mode]);
 }
 
 static inline int x8_get_orient_vlc(IntraX8Context *w)
@@ -152,8 +152,6 @@ static inline int x8_get_orient_vlc(IntraX8Context *w)
         int table_index = get_bits(w->gb, 1 + (w->quant < 13));
         w->j_orient_vlc = &j_orient_vlc[w->quant < 13][table_index];
     }
-    assert(w->j_orient_vlc);
-    assert(w->j_orient_vlc->table);
 
     return get_vlc2(w->gb, w->j_orient_vlc->table, OR_VLC_BITS, OR_VLC_MTD);
 }
@@ -290,14 +288,12 @@ static int x8_get_dc_rlf(IntraX8Context *const w, const int mode,
 {
     int i, e, c;
 
-    assert(mode < 3);
+    av_assert2(mode < 3);
     if (!w->j_dc_vlc[mode]) {
         int table_index = get_bits(w->gb, 3);
         // 4 modes, same table
         w->j_dc_vlc[mode] = &j_dc_vlc[w->quant < 13][table_index];
     }
-    assert(w->j_dc_vlc);
-    assert(w->j_dc_vlc[mode]->table);
 
     i = get_vlc2(w->gb, w->j_dc_vlc[mode]->table, DC_VLC_BITS, DC_VLC_MTD);
 
@@ -354,7 +350,7 @@ static int x8_setup_spatial_predictor(IntraX8Context *const w, const int chroma)
     if (chroma)
         return 0;
 
-    assert(w->orient < 3);
+    av_assert2(w->orient < 3);
     if (range < 2 * w->quant) {
         if ((w->edges & 3) == 0) {
             if (w->orient == 1)
@@ -374,9 +370,9 @@ static int x8_setup_spatial_predictor(IntraX8Context *const w, const int chroma)
         w->raw_orient = x8_get_orient_vlc(w);
         if (w->raw_orient < 0)
             return -1;
-        assert(w->raw_orient < 12);
-        assert(w->orient < 3);
-        w->orient = prediction_table[w->orient][w->raw_orient];
+        av_assert2(w->raw_orient < 12);
+        av_assert2(w->orient < 3);
+        w->orient=prediction_table[w->orient][w->raw_orient];
     }
     return 0;
 }
@@ -480,7 +476,7 @@ static void x8_ac_compensation(IntraX8Context *const w, const int direction,
                                const int dc_level)
 {
     int t;
-#define B(x, y) w->block[0][w->idsp.idct_permutation[(x) + (y) * 8]]
+#define B(x,y)  w->block[0][w->idct_permutation[(x) + (y) * 8]]
 #define T(x)  ((x) * dc_level + 0x8000) >> 16;
     switch (direction) {
     case 0:
@@ -578,7 +574,7 @@ static int x8_decode_intra_mb(IntraX8Context *const w, const int chroma)
     int use_quant_matrix;
     int sign;
 
-    assert(w->orient < 12);
+    av_assert2(w->orient < 12);
     w->bdsp.clear_block(w->block[0]);
 
     if (chroma)
@@ -690,7 +686,7 @@ static int x8_decode_intra_mb(IntraX8Context *const w, const int chroma)
                                                w->frame->linesize[!!chroma]);
     }
     if (!zeros_only)
-        w->idsp.idct_add(w->dest[chroma],
+        w->wdsp.idct_add(w->dest[chroma],
                          w->frame->linesize[!!chroma],
                          w->block[0]);
 
@@ -751,11 +747,16 @@ av_cold int ff_intrax8_common_init(AVCodecContext *avctx,
     if (!w->prediction_table)
         return AVERROR(ENOMEM);
 
-    ff_init_scantable(w->idsp.idct_permutation, &w->scantable[0],
+    ff_wmv2dsp_init(&w->wdsp);
+
+    ff_init_scantable_permutation(w->idct_permutation,
+                                  w->wdsp.idct_perm);
+
+    ff_init_scantable(w->idct_permutation, &w->scantable[0],
                       ff_wmv1_scantable[0]);
-    ff_init_scantable(w->idsp.idct_permutation, &w->scantable[1],
+    ff_init_scantable(w->idct_permutation, &w->scantable[1],
                       ff_wmv1_scantable[2]);
-    ff_init_scantable(w->idsp.idct_permutation, &w->scantable[2],
+    ff_init_scantable(w->idct_permutation, &w->scantable[2],
                       ff_wmv1_scantable[3]);
 
     ff_intrax8dsp_init(&w->dsp);
diff --git a/libavcodec/intrax8.h b/libavcodec/intrax8.h
index ad172b1..5b8946e 100644
--- a/libavcodec/intrax8.h
+++ b/libavcodec/intrax8.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,6 +23,7 @@
 #include "get_bits.h"
 #include "idctdsp.h"
 #include "intrax8dsp.h"
+#include "wmv2dsp.h"
 #include "mpegpicture.h"
 
 typedef struct IntraX8Context {
@@ -35,6 +36,8 @@ typedef struct IntraX8Context {
     // set by ff_intrax8_common_init
     uint8_t *prediction_table; // 2 * (mb_w * 2)
     ScanTable scantable[3];
+    WMV2DSPContext wdsp;
+    uint8_t idct_permutation[64];
     AVCodecContext *avctx;
     int *block_last_index;  ///< last nonzero coefficient in block
     int16_t (*block)[64];
@@ -96,6 +99,7 @@ void ff_intrax8_common_end(IntraX8Context *w);
 
 /**
  * Decode single IntraX8 frame.
+ * lowres decoding is theoretically impossible.
  * @param w pointer to IntraX8Context
  * @param pict the output Picture containing an AVFrame
  * @param gb open bitstream reader
diff --git a/libavcodec/intrax8dsp.c b/libavcodec/intrax8dsp.c
index 108cfe3..5520e3c 100644
--- a/libavcodec/intrax8dsp.c
+++ b/libavcodec/intrax8dsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/intrax8dsp.h b/libavcodec/intrax8dsp.h
index 4ba1a0b..b5d1607 100644
--- a/libavcodec/intrax8dsp.h
+++ b/libavcodec/intrax8dsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/intrax8huf.h b/libavcodec/intrax8huf.h
index 684fdb7..558d0e5 100644
--- a/libavcodec/intrax8huf.h
+++ b/libavcodec/intrax8huf.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ituh263dec.c b/libavcodec/ituh263dec.c
index 9199f5c..e4a7ad4 100644
--- a/libavcodec/ituh263dec.c
+++ b/libavcodec/ituh263dec.c
@@ -5,20 +5,20 @@
  * Copyright (c) 2001 Juan J. Sierralta P
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
  * H.263 decoder.
  */
 
+#define UNCHECKED_BITSTREAM_READER 1
 #include <limits.h>
 
 #include "libavutil/attributes.h"
@@ -106,11 +107,9 @@ static VLC cbpc_b_vlc;
 /* XXX: find a better solution to handle static init */
 av_cold void ff_h263_decode_init_vlc(void)
 {
-    static int done = 0;
+    static volatile int done = 0;
 
     if (!done) {
-        done = 1;
-
         INIT_VLC_STATIC(&ff_h263_intra_MCBPC_vlc, INTRA_MCBPC_VLC_BITS, 9,
                  ff_h263_intra_MCBPC_bits, 1, 1,
                  ff_h263_intra_MCBPC_code, 1, 1, 72);
@@ -133,6 +132,7 @@ av_cold void ff_h263_decode_init_vlc(void)
         INIT_VLC_STATIC(&cbpc_b_vlc, CBPC_B_VLC_BITS, 4,
                  &ff_cbpc_b_tab[0][1], 2, 1,
                  &ff_cbpc_b_tab[0][0], 2, 1, 8);
+        done = 1;
     }
 }
 
@@ -167,7 +167,7 @@ static int h263_decode_gob_header(MpegEncContext *s)
         /* We have a GBSC probably with GSTUFF */
     skip_bits(&s->gb, 16); /* Drop the zeros */
     left= get_bits_left(&s->gb);
-    //MN: we must check the bits left or we might end in a infinite loop (or segfault)
+    //MN: we must check the bits left or we might end in an infinite loop (or segfault)
     for(;left>13; left--){
         if(get_bits1(&s->gb)) break; /* Seek the '1' bit */
     }
@@ -175,17 +175,17 @@ static int h263_decode_gob_header(MpegEncContext *s)
         return -1;
 
     if(s->h263_slice_structured){
-        if(get_bits1(&s->gb)==0)
+        if(check_marker(&s->gb, "before MBA")==0)
             return -1;
 
         ff_h263_decode_mba(s);
 
         if(s->mb_num > 1583)
-            if(get_bits1(&s->gb)==0)
+            if(check_marker(&s->gb, "after MBA")==0)
                 return -1;
 
         s->qscale = get_bits(&s->gb, 5); /* SQUANT */
-        if(get_bits1(&s->gb)==0)
+        if(check_marker(&s->gb, "after SQUANT")==0)
             return -1;
         skip_bits(&s->gb, 2); /* GFID */
     }else{
@@ -206,27 +206,6 @@ static int h263_decode_gob_header(MpegEncContext *s)
 }
 
 /**
- * Find the next resync_marker.
- * @param p pointer to buffer to scan
- * @param end pointer to the end of the buffer
- * @return pointer to the next resync_marker, or end if none was found
- */
-const uint8_t *ff_h263_find_resync_marker(const uint8_t *restrict p, const uint8_t * restrict end)
-{
-    assert(p < end);
-
-    end-=2;
-    p++;
-    for(;p<end; p+=2){
-        if(!*p){
-            if     (!p[-1] && p[1]) return p - 1;
-            else if(!p[ 1] && p[2]) return p;
-        }
-    }
-    return end+2;
-}
-
-/**
  * Decode the group of blocks / video packet header.
  * @return bit position of the resync_marker, or <0 if none was found
  */
@@ -328,13 +307,13 @@ static int h263p_decode_umotion(MpegEncContext * s, int pred)
    code >>= 1;
 
    code = (sign) ? (pred - code) : (pred + code);
-   ff_dlog(s->avctx,"H.263+ UMV Motion = %d\n", code);
+   ff_tlog(s->avctx,"H.263+ UMV Motion = %d\n", code);
    return code;
 
 }
 
 /**
- * read the next MVs for OBMC. yes this is a ugly hack, feel free to send a patch :)
+ * read the next MVs for OBMC. yes this is an ugly hack, feel free to send a patch :)
  */
 static void preview_obmc(MpegEncContext *s){
     GetBitContext gb= s->gb;
@@ -350,7 +329,7 @@ static void preview_obmc(MpegEncContext *s){
         s->block_index[i]+= 1;
     s->mb_x++;
 
-    assert(s->pict_type == AV_PICTURE_TYPE_P);
+    av_assert2(s->pict_type == AV_PICTURE_TYPE_P);
 
     do{
         if (get_bits1(&s->gb)) {
@@ -444,7 +423,7 @@ static void h263_decode_dquant(MpegEncContext *s){
 static int h263_decode_block(MpegEncContext * s, int16_t * block,
                              int n, int coded)
 {
-    int code, level, i, j, last, run;
+    int level, i, j, run;
     RLTable *rl = &ff_h263_rl_inter;
     const uint8_t *scan_table;
     GetBitContext gb= s->gb;
@@ -485,7 +464,7 @@ static int h263_decode_block(MpegEncContext * s, int16_t * block,
             level = get_bits(&s->gb, 8);
             if((level&0x7F) == 0){
                 av_log(s->avctx, AV_LOG_ERROR, "illegal dc %d at %d %d\n", level, s->mb_x, s->mb_y);
-                if (s->avctx->err_recognition & AV_EF_BITSTREAM)
+                if (s->avctx->err_recognition & (AV_EF_BITSTREAM|AV_EF_COMPLIANT))
                     return -1;
             }
             if (level == 255)
@@ -503,39 +482,67 @@ static int h263_decode_block(MpegEncContext * s, int16_t * block,
         return 0;
     }
 retry:
+    {
+    OPEN_READER(re, &s->gb);
+    i--; // offset by -1 to allow direct indexing of scan_table
     for(;;) {
-        code = get_vlc2(&s->gb, rl->vlc.table, TEX_VLC_BITS, 2);
-        if (code < 0){
-            av_log(s->avctx, AV_LOG_ERROR, "illegal ac vlc code at %dx%d\n", s->mb_x, s->mb_y);
-            return -1;
-        }
-        if (code == rl->n) {
+        UPDATE_CACHE(re, &s->gb);
+        GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0], TEX_VLC_BITS, 2, 0);
+        if (run == 66) {
+            if (level){
+                CLOSE_READER(re, &s->gb);
+                av_log(s->avctx, AV_LOG_ERROR, "illegal ac vlc code at %dx%d\n", s->mb_x, s->mb_y);
+                return -1;
+            }
             /* escape */
             if (CONFIG_FLV_DECODER && s->h263_flv > 1) {
-                ff_flv2_decode_ac_esc(&s->gb, &level, &run, &last);
+                int is11 = SHOW_UBITS(re, &s->gb, 1);
+                SKIP_CACHE(re, &s->gb, 1);
+                run = SHOW_UBITS(re, &s->gb, 7) + 1;
+                if (is11) {
+                    SKIP_COUNTER(re, &s->gb, 1 + 7);
+                    UPDATE_CACHE(re, &s->gb);
+                    level = SHOW_SBITS(re, &s->gb, 11);
+                    SKIP_COUNTER(re, &s->gb, 11);
+                } else {
+                    SKIP_CACHE(re, &s->gb, 7);
+                    level = SHOW_SBITS(re, &s->gb, 7);
+                    SKIP_COUNTER(re, &s->gb, 1 + 7 + 7);
+                }
             } else {
-                last = get_bits1(&s->gb);
-                run = get_bits(&s->gb, 6);
-                level = (int8_t)get_bits(&s->gb, 8);
+                run = SHOW_UBITS(re, &s->gb, 7) + 1;
+                SKIP_CACHE(re, &s->gb, 7);
+                level = (int8_t)SHOW_UBITS(re, &s->gb, 8);
+                SKIP_COUNTER(re, &s->gb, 7 + 8);
                 if(level == -128){
+                    UPDATE_CACHE(re, &s->gb);
                     if (s->codec_id == AV_CODEC_ID_RV10) {
                         /* XXX: should patch encoder too */
-                        level = get_sbits(&s->gb, 12);
+                        level = SHOW_SBITS(re, &s->gb, 12);
+                        SKIP_COUNTER(re, &s->gb, 12);
                     }else{
-                        level = get_bits(&s->gb, 5);
-                        level |= get_sbits(&s->gb, 6)<<5;
+                        level = SHOW_UBITS(re, &s->gb, 5);
+                        SKIP_CACHE(re, &s->gb, 5);
+                        level |= SHOW_SBITS(re, &s->gb, 6)<<5;
+                        SKIP_COUNTER(re, &s->gb, 5 + 6);
                     }
                 }
             }
         } else {
-            run = rl->table_run[code];
-            level = rl->table_level[code];
-            last = code >= rl->last;
-            if (get_bits1(&s->gb))
+            if (SHOW_UBITS(re, &s->gb, 1))
                 level = -level;
+            SKIP_COUNTER(re, &s->gb, 1);
         }
         i += run;
         if (i >= 64){
+            CLOSE_READER(re, &s->gb);
+            // redo update without last flag, revert -1 offset
+            i = i - run + ((run-1)&63) + 1;
+            if (i < 64) {
+                // only last marker, no overrun
+                block[scan_table[i]] = level;
+                break;
+            }
             if(s->alt_inter_vlc && rl == &ff_h263_rl_inter && !s->mb_intra){
                 //Looks like a hack but no, it's the way it is supposed to work ...
                 rl = &ff_rl_intra_aic;
@@ -549,9 +556,7 @@ retry:
         }
         j = scan_table[i];
         block[j] = level;
-        if (last)
-            break;
-        i++;
+    }
     }
 not_coded:
     if (s->mb_intra && s->h263_aic) {
@@ -566,11 +571,13 @@ static int h263_skip_b_part(MpegEncContext *s, int cbp)
 {
     LOCAL_ALIGNED_16(int16_t, dblock, [64]);
     int i, mbi;
+    int bli[6];
 
     /* we have to set s->mb_intra to zero to decode B-part of PB-frame correctly
      * but real value should be restored in order to be used later (in OBMC condition)
      */
     mbi = s->mb_intra;
+    memcpy(bli, s->block_last_index, sizeof(bli));
     s->mb_intra = 0;
     for (i = 0; i < 6; i++) {
         if (h263_decode_block(s, dblock, i, cbp&32) < 0)
@@ -578,6 +585,7 @@ static int h263_skip_b_part(MpegEncContext *s, int cbp)
         cbp+=cbp;
     }
     s->mb_intra = mbi;
+    memcpy(s->block_last_index, bli, sizeof(bli));
     return 0;
 }
 
@@ -607,7 +615,7 @@ int ff_h263_decode_mb(MpegEncContext *s,
     const int xy= s->mb_x + s->mb_y * s->mb_stride;
     int cbpb = 0, pb_mv_count = 0;
 
-    assert(!s->h263_pred);
+    av_assert2(!s->h263_pred);
 
     if (s->pict_type == AV_PICTURE_TYPE_P) {
         do{
@@ -747,15 +755,13 @@ int ff_h263_decode_mb(MpegEncContext *s,
         }else
             cbp=0;
 
-        assert(!s->mb_intra);
+        av_assert2(!s->mb_intra);
 
         if(IS_QUANT(mb_type)){
             h263_decode_dquant(s);
         }
 
         if(IS_DIRECT(mb_type)){
-            if (!s->pp_time)
-                return AVERROR_INVALIDDATA;
             s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD | MV_DIRECT;
             mb_type |= ff_mpeg4_set_direct_mv(s, 0, 0);
         }else{
@@ -874,6 +880,10 @@ int ff_h263_decode_picture_header(MpegEncContext *s)
 
     align_get_bits(&s->gb);
 
+    if (show_bits(&s->gb, 2) == 2 && s->avctx->frame_number == 0) {
+         av_log(s->avctx, AV_LOG_WARNING, "Header looks like RTP instead of H.263\n");
+    }
+
     startcode= get_bits(&s->gb, 22-8);
 
     for(i= get_bits_left(&s->gb); i>24; i-=8) {
@@ -894,9 +904,7 @@ int ff_h263_decode_picture_header(MpegEncContext *s)
     s->picture_number= (s->picture_number&~0xFF) + i;
 
     /* PTYPE starts here */
-    if (get_bits1(&s->gb) != 1) {
-        /* marker */
-        av_log(s->avctx, AV_LOG_ERROR, "Bad marker\n");
+    if (check_marker(&s->gb, "in PTYPE") != 1) {
         return -1;
     }
     if (get_bits1(&s->gb) != 0) {
@@ -920,6 +928,8 @@ int ff_h263_decode_picture_header(MpegEncContext *s)
         /* H.263v1 */
         width = ff_h263_format[format][0];
         height = ff_h263_format[format][1];
+        if (!width)
+            return -1;
 
         s->pict_type = AV_PICTURE_TYPE_I + get_bits1(&s->gb);
 
@@ -961,6 +971,8 @@ int ff_h263_decode_picture_header(MpegEncContext *s)
             s->h263_aic = get_bits1(&s->gb); /* Advanced Intra Coding (AIC) */
             s->loop_filter= get_bits1(&s->gb);
             s->unrestricted_mv = s->umvplus || s->obmc || s->loop_filter;
+            if(s->avctx->lowres)
+                s->loop_filter = 0;
 
             s->h263_slice_structured= get_bits1(&s->gb);
             if (get_bits1(&s->gb) != 0) {
@@ -1013,7 +1025,7 @@ int ff_h263_decode_picture_header(MpegEncContext *s)
                 6-14 - reserved
                 */
                 width = (get_bits(&s->gb, 9) + 1) * 4;
-                skip_bits1(&s->gb);
+                check_marker(&s->gb, "in dimensions");
                 height = get_bits(&s->gb, 9) * 4;
                 ff_dlog(s->avctx, "\nH.263+ Custom picture: %dx%d\n",width,height);
                 if (s->aspect_ratio_info == FF_ASPECT_EXTENDED) {
@@ -1028,6 +1040,7 @@ int ff_h263_decode_picture_header(MpegEncContext *s)
                 height = ff_h263_format[format][1];
                 s->avctx->sample_aspect_ratio= (AVRational){12,11};
             }
+            s->avctx->sample_aspect_ratio.den <<= s->ehc_mode;
             if ((width == 0) || (height == 0))
                 return -1;
             s->width = width;
@@ -1103,20 +1116,17 @@ int ff_h263_decode_picture_header(MpegEncContext *s)
     }
 
     /* PEI */
-    while (get_bits1(&s->gb) != 0) {
-        skip_bits(&s->gb, 8);
-    }
+    if (skip_1stop_8data_bits(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
 
     if(s->h263_slice_structured){
-        if (get_bits1(&s->gb) != 1) {
-            av_log(s->avctx, AV_LOG_ERROR, "SEPB1 marker missing\n");
+        if (check_marker(&s->gb, "SEPB1") != 1) {
             return -1;
         }
 
         ff_h263_decode_mba(s);
 
-        if (get_bits1(&s->gb) != 1) {
-            av_log(s->avctx, AV_LOG_ERROR, "SEPB2 marker missing\n");
+        if (check_marker(&s->gb, "SEPB2") != 1) {
             return -1;
         }
     }
@@ -1131,7 +1141,7 @@ int ff_h263_decode_picture_header(MpegEncContext *s)
     }
 
         ff_h263_show_pict_info(s);
-    if (s->pict_type == AV_PICTURE_TYPE_I && s->codec_tag == AV_RL32("ZYGO")){
+    if (s->pict_type == AV_PICTURE_TYPE_I && s->codec_tag == AV_RL32("ZYGO") && get_bits_left(&s->gb) >= 85 + 13*3*16 + 50){
         int i,j;
         for(i=0; i<85; i++) av_log(s->avctx, AV_LOG_DEBUG, "%d", get_bits1(&s->gb));
         av_log(s->avctx, AV_LOG_DEBUG, "\n");
diff --git a/libavcodec/ituh263enc.c b/libavcodec/ituh263enc.c
index a62d202..a8bfe91 100644
--- a/libavcodec/ituh263enc.c
+++ b/libavcodec/ituh263enc.c
@@ -5,20 +5,20 @@
  * Copyright (c) 2001 Juan J. Sierralta P
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -45,7 +45,7 @@
 /**
  * Table of number of bits a motion vector component needs.
  */
-static uint8_t mv_penalty[MAX_FCODE+1][MAX_MV*2+1];
+static uint8_t mv_penalty[MAX_FCODE+1][MAX_DMV*2+1];
 
 /**
  * Minimal fcode that a motion vector component would need.
@@ -90,7 +90,7 @@ static const uint8_t wrong_run[102] = {
 av_const int ff_h263_aspect_to_info(AVRational aspect){
     int i;
 
-    if(aspect.num==0) aspect= (AVRational){1,1};
+    if(aspect.num==0 || aspect.den==0) aspect= (AVRational){1,1};
 
     for(i=1; i<6; i++){
         if(av_cmp_q(ff_h263_pixel_aspect[i], aspect) == 0){
@@ -228,19 +228,11 @@ void ff_h263_encode_picture_header(MpegEncContext * s, int picture_number)
     if(s->h263_slice_structured){
         put_bits(&s->pb, 1, 1);
 
-        assert(s->mb_x == 0 && s->mb_y == 0);
+        av_assert1(s->mb_x == 0 && s->mb_y == 0);
         ff_h263_encode_mba(s);
 
         put_bits(&s->pb, 1, 1);
     }
-
-    if(s->h263_aic){
-         s->y_dc_scale_table=
-         s->c_dc_scale_table= ff_aic_dc_scale_table;
-    }else{
-        s->y_dc_scale_table=
-        s->c_dc_scale_table= ff_mpeg1_dc_scale_table;
-    }
 }
 
 /**
@@ -395,7 +387,7 @@ static void h263_encode_block(MpegEncContext * s, int16_t * block, int n)
                 put_bits(&s->pb, 1, last);
                 put_bits(&s->pb, 6, run);
 
-                assert(slevel != 0);
+                av_assert2(slevel != 0);
 
                 if(level < 128)
                     put_sbits(&s->pb, 8, slevel);
@@ -416,7 +408,7 @@ static void h263_encode_block(MpegEncContext * s, int16_t * block, int n)
 }
 
 /* Encode MV differences on H.263+ with Unrestricted MV mode */
-static void h263p_encode_umotion(MpegEncContext * s, int val)
+static void h263p_encode_umotion(PutBitContext *pb, int val)
 {
     short sval = 0;
     short i = 0;
@@ -426,11 +418,11 @@ static void h263p_encode_umotion(MpegEncContext * s, int val)
     int tcode;
 
     if ( val == 0)
-        put_bits(&s->pb, 1, 1);
+        put_bits(pb, 1, 1);
     else if (val == 1)
-        put_bits(&s->pb, 3, 0);
+        put_bits(pb, 3, 0);
     else if (val == -1)
-        put_bits(&s->pb, 3, 2);
+        put_bits(pb, 3, 2);
     else {
 
         sval = ((val < 0) ? (short)(-val):(short)val);
@@ -449,7 +441,7 @@ static void h263p_encode_umotion(MpegEncContext * s, int val)
             i--;
         }
         code = ((code << 1) | (val < 0)) << 1;
-        put_bits(&s->pb, (2*n_bits)+1, code);
+        put_bits(pb, (2*n_bits)+1, code);
     }
 }
 
@@ -506,8 +498,8 @@ void ff_h263_encode_mb(MpegEncContext * s,
                                                 motion_y - pred_y, 1);
             }
             else {
-                h263p_encode_umotion(s, motion_x - pred_x);
-                h263p_encode_umotion(s, motion_y - pred_y);
+                h263p_encode_umotion(&s->pb, motion_x - pred_x);
+                h263p_encode_umotion(&s->pb, motion_y - pred_y);
                 if (((motion_x - pred_x) == 1) && ((motion_y - pred_y) == 1))
                     /* To prevent Start Code emulation */
                     put_bits(&s->pb,1,1);
@@ -535,8 +527,8 @@ void ff_h263_encode_mb(MpegEncContext * s,
                                                     motion_y - pred_y, 1);
                 }
                 else {
-                    h263p_encode_umotion(s, motion_x - pred_x);
-                    h263p_encode_umotion(s, motion_y - pred_y);
+                    h263p_encode_umotion(&s->pb, motion_x - pred_x);
+                    h263p_encode_umotion(&s->pb, motion_y - pred_y);
                     if (((motion_x - pred_x) == 1) && ((motion_y - pred_y) == 1))
                         /* To prevent Start Code emulation */
                         put_bits(&s->pb,1,1);
@@ -548,7 +540,7 @@ void ff_h263_encode_mb(MpegEncContext * s,
             s->mv_bits+= get_bits_diff(s);
         }
     } else {
-        assert(s->mb_intra);
+        av_assert2(s->mb_intra);
 
         cbp = 0;
         if (s->h263_aic) {
@@ -652,14 +644,14 @@ void ff_h263_encode_mb(MpegEncContext * s,
     }
 }
 
-void ff_h263_encode_motion(MpegEncContext * s, int val, int f_code)
+void ff_h263_encode_motion(PutBitContext *pb, int val, int f_code)
 {
     int range, bit_size, sign, code, bits;
 
     if (val == 0) {
         /* zero vector */
         code = 0;
-        put_bits(&s->pb, ff_mvtab[code][1], ff_mvtab[code][0]);
+        put_bits(pb, ff_mvtab[code][1], ff_mvtab[code][0]);
     } else {
         bit_size = f_code - 1;
         range = 1 << bit_size;
@@ -673,9 +665,9 @@ void ff_h263_encode_motion(MpegEncContext * s, int val, int f_code)
         code = (val >> bit_size) + 1;
         bits = val & (range - 1);
 
-        put_bits(&s->pb, ff_mvtab[code][1] + 1, (ff_mvtab[code][0] << 1) | sign);
+        put_bits(pb, ff_mvtab[code][1] + 1, (ff_mvtab[code][0] << 1) | sign);
         if (bit_size > 0) {
-            put_bits(&s->pb, bit_size, bits);
+            put_bits(pb, bit_size, bits);
         }
     }
 }
@@ -686,7 +678,7 @@ static av_cold void init_mv_penalty_and_fcode(MpegEncContext *s)
     int mv;
 
     for(f_code=1; f_code<=MAX_FCODE; f_code++){
-        for(mv=-MAX_MV; mv<=MAX_MV; mv++){
+        for(mv=-MAX_DMV; mv<=MAX_DMV; mv++){
             int len;
 
             if(mv==0) len= ff_mvtab[0][1];
@@ -707,7 +699,7 @@ static av_cold void init_mv_penalty_and_fcode(MpegEncContext *s)
                 }
             }
 
-            mv_penalty[f_code][mv+MAX_MV]= len;
+            mv_penalty[f_code][mv+MAX_DMV]= len;
         }
     }
 
@@ -727,8 +719,8 @@ static av_cold void init_uni_h263_rl_tab(RLTable *rl, uint32_t *bits_tab,
 {
     int slevel, run, last;
 
-    assert(MAX_LEVEL >= 64);
-    assert(MAX_RUN   >= 63);
+    av_assert0(MAX_LEVEL >= 64);
+    av_assert0(MAX_RUN   >= 63);
 
     for(slevel=-64; slevel<64; slevel++){
         if(slevel==0) continue;
@@ -817,12 +809,15 @@ av_cold void ff_h263_encode_init(MpegEncContext *s)
             s->min_qcoeff= -127;
             s->max_qcoeff=  127;
         }
-        s->y_dc_scale_table=
-        s->c_dc_scale_table= ff_mpeg1_dc_scale_table;
         break;
     default: //nothing needed - default table already set in mpegvideo.c
         s->min_qcoeff= -127;
         s->max_qcoeff=  127;
+    }
+    if(s->h263_aic){
+         s->y_dc_scale_table=
+         s->c_dc_scale_table= ff_aic_dc_scale_table;
+    }else{
         s->y_dc_scale_table=
         s->c_dc_scale_table= ff_mpeg1_dc_scale_table;
     }
diff --git a/libavcodec/ivi.c b/libavcodec/ivi.c
index caa3fe6..a1eab94 100644
--- a/libavcodec/ivi.c
+++ b/libavcodec/ivi.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,6 +30,7 @@
 
 #define BITSTREAM_READER_LE
 #include "libavutil/attributes.h"
+#include "libavutil/imgutils.h"
 #include "libavutil/timer.h"
 #include "avcodec.h"
 #include "get_bits.h"
@@ -87,12 +88,9 @@ static int ivi_mc(IVIBandDesc *band, ivi_mc_func mc, ivi_mc_avg_func mc_avg,
     int ref_size = (mc_type > 1) * band->pitch + (mc_type & 1);
 
     if (mc_type != -1) {
-        if (offs < 0 || ref_offs < 0 || !band->ref_buf)
-            return AVERROR_INVALIDDATA;
-        if (buf_size - min_size < offs)
-            return AVERROR_INVALIDDATA;
-        if (buf_size - min_size - ref_size < ref_offs)
-            return AVERROR_INVALIDDATA;
+        av_assert0(offs >= 0 && ref_offs >= 0 && band->ref_buf);
+        av_assert0(buf_size - min_size >= offs);
+        av_assert0(buf_size - min_size - ref_size >= ref_offs);
     }
 
     if (mc_type2 == -1) {
@@ -136,7 +134,7 @@ static uint16_t inv_bits(uint16_t val, int nbits)
 
 /*
  *  Generate a huffman codebook from the given descriptor
- *  and convert it into the Libav VLC table.
+ *  and convert it into the FFmpeg VLC table.
  *
  *  @param[in]   cb    pointer to codebook descriptor
  *  @param[out]  vlc   where to place the generated VLC table
@@ -250,7 +248,7 @@ int ff_ivi_dec_huff_desc(GetBitContext *gb, int desc_coded, int which_tab,
             new_huff.xbits[i] = get_bits(gb, 4);
 
         /* Have we got the same custom table? Rebuild if not. */
-        if (ivi_huff_desc_cmp(&new_huff, &huff_tab->cust_desc)) {
+        if (ivi_huff_desc_cmp(&new_huff, &huff_tab->cust_desc) || !huff_tab->cust_tab.table) {
             ivi_huff_desc_copy(&huff_tab->cust_desc, &new_huff);
 
             if (huff_tab->cust_tab.table)
@@ -285,6 +283,7 @@ static av_cold void ivi_free_buffers(IVIPlaneDesc *planes)
     int p, b, t;
 
     for (p = 0; p < 3; p++) {
+        if (planes[p].bands)
         for (b = 0; b < planes[p].num_bands; b++) {
             av_freep(&planes[p].bands[b].bufs[0]);
             av_freep(&planes[p].bands[b].bufs[1]);
@@ -312,7 +311,7 @@ av_cold int ff_ivi_init_planes(IVIPlaneDesc *planes, const IVIPicConfig *cfg,
 
     ivi_free_buffers(planes);
 
-    if (cfg->pic_width < 1 || cfg->pic_height < 1 ||
+    if (av_image_check_size(cfg->pic_width, cfg->pic_height, 0, NULL) < 0 ||
         cfg->luma_bands < 1 || cfg->chroma_bands < 1)
         return AVERROR_INVALIDDATA;
 
@@ -327,7 +326,7 @@ av_cold int ff_ivi_init_planes(IVIPlaneDesc *planes, const IVIPicConfig *cfg,
     planes[1].num_bands = planes[2].num_bands = cfg->chroma_bands;
 
     for (p = 0; p < 3; p++) {
-        planes[p].bands = av_mallocz(planes[p].num_bands * sizeof(IVIBandDesc));
+        planes[p].bands = av_mallocz_array(planes[p].num_bands, sizeof(IVIBandDesc));
         if (!planes[p].bands)
             return AVERROR(ENOMEM);
 
@@ -356,6 +355,7 @@ av_cold int ff_ivi_init_planes(IVIPlaneDesc *planes, const IVIPicConfig *cfg,
             band->aheight  = height_aligned;
             band->bufs[0]  = av_mallocz(buf_size);
             band->bufs[1]  = av_mallocz(buf_size);
+            band->bufsize  = buf_size/2;
             if (!band->bufs[0] || !band->bufs[1])
                 return AVERROR(ENOMEM);
 
@@ -397,14 +397,16 @@ static int ivi_init_tiles(IVIBandDesc *band, IVITile *ref_tile,
                                               band->mb_size);
 
             av_freep(&tile->mbs);
-            tile->mbs = av_malloc(tile->num_MBs * sizeof(IVIMbInfo));
+            tile->mbs = av_mallocz_array(tile->num_MBs, sizeof(IVIMbInfo));
             if (!tile->mbs)
                 return AVERROR(ENOMEM);
 
             tile->ref_mbs = 0;
             if (p || b) {
-                if (tile->num_MBs != ref_tile->num_MBs)
+                if (tile->num_MBs != ref_tile->num_MBs) {
+                    av_log(NULL, AV_LOG_DEBUG, "ref_tile mismatch\n");
                     return AVERROR_INVALIDDATA;
+                }
                 tile->ref_mbs = ref_tile->mbs;
                 ref_tile++;
             }
@@ -429,6 +431,8 @@ av_cold int ff_ivi_init_tiles(IVIPlaneDesc *planes,
             t_width  >>= 1;
             t_height >>= 1;
         }
+        if(t_width<=0 || t_height<=0)
+            return AVERROR(EINVAL);
 
         for (b = 0; b < planes[p].num_bands; b++) {
             band = &planes[p].bands[b];
@@ -437,7 +441,7 @@ av_cold int ff_ivi_init_tiles(IVIPlaneDesc *planes,
             band->num_tiles = x_tiles * y_tiles;
 
             av_freep(&band->tiles);
-            band->tiles = av_mallocz(band->num_tiles * sizeof(IVITile));
+            band->tiles = av_mallocz_array(band->num_tiles, sizeof(IVITile));
             if (!band->tiles)
                 return AVERROR(ENOMEM);
 
@@ -486,10 +490,6 @@ static int ivi_dc_transform(IVIBandDesc *band, int *prev_dc, int buf_offs,
     int buf_size = band->pitch * band->aheight - buf_offs;
     int min_size = (blk_size - 1) * band->pitch + blk_size;
 
-    if (!band->dc_transform)
-        return 0;
-
-
     if (min_size > buf_size)
         return AVERROR_INVALIDDATA;
 
@@ -583,6 +583,11 @@ static int ivi_decode_coded_blocks(GetBitContext *gb, IVIBandDesc *band,
         col_flags[0] |= !!*prev_dc;
     }
 
+    if(band->transform_size > band->blk_size){
+        av_log(NULL, AV_LOG_ERROR, "Too large transform\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     /* apply inverse transform */
     band->inv_transform(trvec, band->buf + offs,
                         band->pitch, col_flags);
@@ -642,7 +647,7 @@ static int ivi_decode_blocks(GetBitContext *gb, IVIBandDesc *band,
 
         quant = band->glob_quant + mb->q_delta;
         if (avctx->codec_id == AV_CODEC_ID_INDEO4)
-            quant = av_clip(quant, 0, 31);
+            quant = av_clip_uintp2(quant, 5);
         else
             quant = av_clip(quant, 0, 23);
 
@@ -805,6 +810,22 @@ static int ivi_process_empty_tile(AVCodecContext *avctx, IVIBandDesc *band,
                     mb->mv_y = ref_mb->mv_y;
                 }
                 need_mc |= mb->mv_x || mb->mv_y; /* tracking non-zero motion vectors */
+                {
+                    int dmv_x, dmv_y, cx, cy;
+
+                    dmv_x = mb->mv_x >> band->is_halfpel;
+                    dmv_y = mb->mv_y >> band->is_halfpel;
+                    cx    = mb->mv_x &  band->is_halfpel;
+                    cy    = mb->mv_y &  band->is_halfpel;
+
+                    if (   mb->xpos + dmv_x < 0
+                        || mb->xpos + dmv_x + band->mb_size + cx > band->pitch
+                        || mb->ypos + dmv_y < 0
+                        || mb->ypos + dmv_y + band->mb_size + cy > band->aheight) {
+                        av_log(avctx, AV_LOG_ERROR, "MV out of bounds\n");
+                        return AVERROR_INVALIDDATA;
+                    }
+                }
             }
 
             mb++;
@@ -946,6 +967,10 @@ static int decode_band(IVI45DecContext *ctx,
         idx2 = band->corr[i * 2 + 1];
         FFSWAP(uint8_t, band->rv_map->runtab[idx1], band->rv_map->runtab[idx2]);
         FFSWAP(int16_t, band->rv_map->valtab[idx1], band->rv_map->valtab[idx2]);
+        if (idx1 == band->rv_map->eob_sym || idx2 == band->rv_map->eob_sym)
+            band->rv_map->eob_sym ^= idx1 ^ idx2;
+        if (idx1 == band->rv_map->esc_sym || idx2 == band->rv_map->esc_sym)
+            band->rv_map->esc_sym ^= idx1 ^ idx2;
     }
 
     pos = get_bits_count(&ctx->gb);
@@ -969,7 +994,8 @@ static int decode_band(IVI45DecContext *ctx,
             tile->data_size = ivi_dec_tile_data_size(&ctx->gb);
             if (!tile->data_size) {
                 av_log(avctx, AV_LOG_ERROR, "Tile data size is zero!\n");
-                return AVERROR_INVALIDDATA;
+                result = AVERROR_INVALIDDATA;
+                break;
             }
 
             result = ctx->decode_mb_info(ctx, band, tile, avctx);
@@ -1001,6 +1027,10 @@ static int decode_band(IVI45DecContext *ctx,
         idx2 = band->corr[i*2+1];
         FFSWAP(uint8_t, band->rv_map->runtab[idx1], band->rv_map->runtab[idx2]);
         FFSWAP(int16_t, band->rv_map->valtab[idx1], band->rv_map->valtab[idx2]);
+        if (idx1 == band->rv_map->eob_sym || idx2 == band->rv_map->eob_sym)
+            band->rv_map->eob_sym ^= idx1 ^ idx2;
+        if (idx1 == band->rv_map->esc_sym || idx2 == band->rv_map->esc_sym)
+            band->rv_map->esc_sym ^= idx1 ^ idx2;
     }
 
 #ifdef DEBUG
@@ -1068,6 +1098,7 @@ int ff_ivi_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     //{ START_TIMER;
 
     if (ctx->is_nonnull_frame(ctx)) {
+        ctx->buf_invalid[ctx->dst_buf] = 1;
         for (p = 0; p < 3; p++) {
             for (b = 0; b < ctx->planes[p].num_bands; b++) {
                 result = decode_band(ctx, &ctx->planes[p].bands[b], avctx);
@@ -1078,6 +1109,7 @@ int ff_ivi_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                 }
             }
         }
+        ctx->buf_invalid[ctx->dst_buf] = 0;
     } else {
         if (ctx->is_scalable)
             return AVERROR_INVALIDDATA;
@@ -1087,17 +1119,20 @@ int ff_ivi_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                 return AVERROR_INVALIDDATA;
         }
     }
+    if (ctx->buf_invalid[ctx->dst_buf]) /* destination buffer is flagged invalid */
+        return AVERROR_INVALIDDATA;
 
     //STOP_TIMER("decode_planes"); }
 
+    if (!ctx->is_nonnull_frame(ctx))
+        return buf_size;
+
     result = ff_set_dimensions(avctx, ctx->planes[0].width, ctx->planes[0].height);
     if (result < 0)
         return result;
 
-    if ((result = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((result = ff_get_buffer(avctx, frame, 0)) < 0)
         return result;
-    }
 
     if (ctx->is_scalable) {
         if (ctx->is_indeo4)
@@ -1121,7 +1156,11 @@ int ff_ivi_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     if (ctx->is_indeo4 && ctx->frame_type == IVI4_FRAMETYPE_INTRA) {
         int left;
 
-        while (get_bits(&ctx->gb, 8)); // skip version string
+        // skip version string: consume bytes up to a zero byte, fail if the data runs short
+        while (get_bits(&ctx->gb, 8)) {
+            if (get_bits_left(&ctx->gb) < 8)
+                return AVERROR_INVALIDDATA;
+        }
         left = get_bits_count(&ctx->gb) & 0x18;
         skip_bits_long(&ctx->gb, 64 - left);
         if (get_bits_left(&ctx->gb) > 18 &&
diff --git a/libavcodec/ivi.h b/libavcodec/ivi.h
index 9b4824b..3571808 100644
--- a/libavcodec/ivi.h
+++ b/libavcodec/ivi.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -164,6 +164,7 @@ typedef struct IVIBandDesc {
     int             quant_mat;      ///< dequant matrix index
     int             glob_quant;     ///< quant base for this band
     const uint8_t   *scan;          ///< ptr to the scan pattern
+    int             scan_size;      ///< size of the scantable
 
     IVIHuffTab      blk_vlc;        ///< vlc table for decoding block data
 
@@ -261,6 +262,7 @@ typedef struct IVI45DecContext {
     int             (*is_nonnull_frame)(struct IVI45DecContext *ctx);
 
     int gop_invalid;
+    int buf_invalid[4]; ///< indexed by dst_buf: set to 1 before that buffer's bands are decoded, reset to 0 once all of them were
 
     int is_indeo4;
 
diff --git a/libavcodec/ivi_dsp.c b/libavcodec/ivi_dsp.c
index bf0bec1..bc9de1a 100644
--- a/libavcodec/ivi_dsp.c
+++ b/libavcodec/ivi_dsp.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009-2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,7 +38,7 @@ void ff_ivi_recompose53(const IVIPlaneDesc *plane, uint8_t *dst,
     int32_t         b0_1, b0_2, b1_1, b1_2, b1_3, b2_1, b2_2, b2_3, b2_4, b2_5, b2_6;
     int32_t         b3_1, b3_2, b3_3, b3_4, b3_5, b3_6, b3_7, b3_8, b3_9;
     int32_t         pitch, back_pitch;
-    const short    *b0_ptr, *b1_ptr, *b2_ptr, *b3_ptr;
+    const short     *b0_ptr, *b1_ptr, *b2_ptr, *b3_ptr;
     const int       num_bands = 4;
 
     /* all bands should have the same pitch */
@@ -54,6 +54,9 @@ void ff_ivi_recompose53(const IVIPlaneDesc *plane, uint8_t *dst,
     b3_ptr = plane->bands[3].buf;
 
     for (y = 0; y < plane->height; y += 2) {
+
+        if (y+2 >= plane->height)
+            pitch= 0;
         /* load storage variables with values */
         if (num_bands > 0) {
             b0_1 = b0_ptr[0];
@@ -83,6 +86,13 @@ void ff_ivi_recompose53(const IVIPlaneDesc *plane, uint8_t *dst,
         }
 
         for (x = 0, indx = 0; x < plane->width; x+=2, indx++) {
+            if (x+2 >= plane->width) {
+                b0_ptr --;
+                b1_ptr --;
+                b2_ptr --;
+                b3_ptr --;
+            }
+
             /* some values calculated in the previous iterations can */
             /* be reused in the next ones, so do appropriate copying */
             b2_1 = b2_2; // b2[x-1,y  ] = b2[x,  y  ]
@@ -170,10 +180,10 @@ void ff_ivi_recompose53(const IVIPlaneDesc *plane, uint8_t *dst,
 
         back_pitch = -pitch;
 
-        b0_ptr += pitch;
-        b1_ptr += pitch;
-        b2_ptr += pitch;
-        b3_ptr += pitch;
+        b0_ptr += pitch + 1;
+        b1_ptr += pitch + 1;
+        b2_ptr += pitch + 1;
+        b3_ptr += pitch + 1;
     }
 }
 
@@ -181,7 +191,7 @@ void ff_ivi_recompose_haar(const IVIPlaneDesc *plane, uint8_t *dst,
                            const int dst_pitch)
 {
     int             x, y, indx, b0, b1, b2, b3, p0, p1, p2, p3;
-    const short    *b0_ptr, *b1_ptr, *b2_ptr, *b3_ptr;
+    const short     *b0_ptr, *b1_ptr, *b2_ptr, *b3_ptr;
     int32_t         pitch;
 
     /* all bands should have the same pitch */
@@ -225,15 +235,15 @@ void ff_ivi_recompose_haar(const IVIPlaneDesc *plane, uint8_t *dst,
 
 /** butterfly operation for the inverse Haar transform */
 #define IVI_HAAR_BFLY(s1, s2, o1, o2, t) \
-    t  = (s1 - s2) >> 1;\
-    o1 = (s1 + s2) >> 1;\
-    o2 = t;\
+    t  = ((s1) - (s2)) >> 1;\
+    o1 = ((s1) + (s2)) >> 1;\
+    o2 = (t);\
 
 /** inverse 8-point Haar transform */
 #define INV_HAAR8(s1, s5, s3, s7, s2, s4, s6, s8,\
                   d1, d2, d3, d4, d5, d6, d7, d8,\
                   t0, t1, t2, t3, t4, t5, t6, t7, t8) {\
-    t1 = s1 << 1; t5 = s5 << 1;\
+    t1 = (s1) << 1; t5 = (s5) << 1;\
     IVI_HAAR_BFLY(t1, t5, t1, t5, t0); IVI_HAAR_BFLY(t1, s3, t1, t3, t0);\
     IVI_HAAR_BFLY(t5, s7, t5, t7, t0); IVI_HAAR_BFLY(t1, s2, t1, t2, t0);\
     IVI_HAAR_BFLY(t3, s4, t3, t4, t0); IVI_HAAR_BFLY(t5, s6, t5, t6, t0);\
@@ -475,21 +485,21 @@ void ff_ivi_dc_haar_2d(const int32_t *in, int16_t *out, uint32_t pitch,
 
 /** butterfly operation for the inverse slant transform */
 #define IVI_SLANT_BFLY(s1, s2, o1, o2, t) \
-    t  = s1 - s2;\
-    o1 = s1 + s2;\
-    o2 = t;\
+    t  = (s1) - (s2);\
+    o1 = (s1) + (s2);\
+    o2 = (t);\
 
 /** This is a reflection a,b = 1/2, 5/4 for the inverse slant transform */
 #define IVI_IREFLECT(s1, s2, o1, o2, t) \
-    t  = ((s1 + s2*2 + 2) >> 2) + s1;\
-    o2 = ((s1*2 - s2 + 2) >> 2) - s2;\
-    o1 = t;\
+    t  = (((s1) + (s2)*2 + 2) >> 2) + (s1);\
+    o2 = (((s1)*2 - (s2) + 2) >> 2) - (s2);\
+    o1 = (t);\
 
 /** This is a reflection a,b = 1/2, 7/8 for the inverse slant transform */
 #define IVI_SLANT_PART4(s1, s2, o1, o2, t) \
-    t  = s2 + ((s1*4  - s2 + 4) >> 3);\
-    o2 = s1 + ((-s1 - s2*4 + 4) >> 3);\
-    o1 = t;\
+    t  = (s2) + (((s1)*4  - (s2) + 4) >> 3);\
+    o2 = (s1) + ((-(s1) - (s2)*4 + 4) >> 3);\
+    o1 = (t);\
 
 /** inverse slant8 transform */
 #define IVI_INV_SLANT8(s1, s4, s8, s5, s2, s6, s3, s7,\
@@ -547,7 +557,7 @@ void ff_ivi_inverse_slant_8x8(const int32_t *in, int16_t *out, uint32_t pitch, c
     }
 #undef COMPENSATE
 
-#define COMPENSATE(x) ((x + 1)>>1)
+#define COMPENSATE(x) (((x) + 1)>>1)
     src = tmp;
     for (i = 0; i < 8; i++) {
         if (!src[0] && !src[1] && !src[2] && !src[3] && !src[4] && !src[5] && !src[6] && !src[7]) {
@@ -587,7 +597,7 @@ void ff_ivi_inverse_slant_4x4(const int32_t *in, int16_t *out, uint32_t pitch, c
     }
 #undef COMPENSATE
 
-#define COMPENSATE(x) ((x + 1)>>1)
+#define COMPENSATE(x) (((x) + 1)>>1)
     src = tmp;
     for (i = 0; i < 4; i++) {
         if (!src[0] && !src[1] && !src[2] && !src[3]) {
@@ -621,7 +631,7 @@ void ff_ivi_row_slant8(const int32_t *in, int16_t *out, uint32_t pitch, const ui
     int     i;
     int     t0, t1, t2, t3, t4, t5, t6, t7, t8;
 
-#define COMPENSATE(x) ((x + 1)>>1)
+#define COMPENSATE(x) (((x) + 1)>>1)
     for (i = 0; i < 8; i++) {
         if (!in[0] && !in[1] && !in[2] && !in[3] && !in[4] && !in[5] && !in[6] && !in[7]) {
             memset(out, 0, 8*sizeof(out[0]));
@@ -663,7 +673,7 @@ void ff_ivi_col_slant8(const int32_t *in, int16_t *out, uint32_t pitch, const ui
     row4 = pitch << 2;
     row8 = pitch << 3;
 
-#define COMPENSATE(x) ((x + 1)>>1)
+#define COMPENSATE(x) (((x) + 1)>>1)
     for (i = 0; i < 8; i++) {
         if (flags[i]) {
             IVI_INV_SLANT8(in[0], in[8], in[16], in[24], in[32], in[40], in[48], in[56],
@@ -700,7 +710,7 @@ void ff_ivi_row_slant4(const int32_t *in, int16_t *out, uint32_t pitch, const ui
     int     i;
     int     t0, t1, t2, t3, t4;
 
-#define COMPENSATE(x) ((x + 1)>>1)
+#define COMPENSATE(x) (((x) + 1)>>1)
     for (i = 0; i < 4; i++) {
         if (!in[0] && !in[1] && !in[2] && !in[3]) {
             memset(out, 0, 4*sizeof(out[0]));
@@ -722,7 +732,7 @@ void ff_ivi_col_slant4(const int32_t *in, int16_t *out, uint32_t pitch, const ui
 
     row2 = pitch << 1;
 
-#define COMPENSATE(x) ((x + 1)>>1)
+#define COMPENSATE(x) (((x) + 1)>>1)
     for (i = 0; i < 4; i++) {
         if (flags[i]) {
             IVI_INV_SLANT4(in[0], in[4], in[8], in[12],
diff --git a/libavcodec/ivi_dsp.h b/libavcodec/ivi_dsp.h
index ac9dcbc..c38bb3b 100644
--- a/libavcodec/ivi_dsp.h
+++ b/libavcodec/ivi_dsp.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009-2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -65,6 +65,10 @@ void ff_ivi_recompose_haar(const IVIPlaneDesc *plane, uint8_t *dst,
  */
 void ff_ivi_inverse_haar_8x8(const int32_t *in, int16_t *out, uint32_t pitch,
                              const uint8_t *flags);
+void ff_ivi_inverse_haar_8x1(const int32_t *in, int16_t *out, uint32_t pitch,
+                             const uint8_t *flags);
+void ff_ivi_inverse_haar_1x8(const int32_t *in, int16_t *out, uint32_t pitch,
+                             const uint8_t *flags);
 
 /**
  *  one-dimensional inverse 8-point Haar transform on rows for Indeo 4
diff --git a/libavcodec/j2kenc.c b/libavcodec/j2kenc.c
new file mode 100644
index 0000000..c8d3861
--- /dev/null
+++ b/libavcodec/j2kenc.c
@@ -0,0 +1,1218 @@
+/*
+ * JPEG2000 image encoder
+ * Copyright (c) 2007 Kamil Nowosad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * **********************************************************************************************************************
+ *
+ *
+ *
+ * This source code incorporates work covered by the following copyright and
+ * permission notice:
+ *
+ * Copyright (c) 2002-2007, Communications and Remote Sensing Laboratory, Universite catholique de Louvain (UCL), Belgium
+ * Copyright (c) 2002-2007, Professor Benoit Macq
+ * Copyright (c) 2001-2003, David Janssens
+ * Copyright (c) 2002-2003, Yannick Verschueren
+ * Copyright (c) 2003-2007, Francois-Olivier Devaux and Antonin Descampe
+ * Copyright (c) 2005, Herve Drolon, FreeImage Team
+ * Copyright (c) 2007, Callum Lerwick <seg@haxxed.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+/**
+ * JPEG2000 image encoder
+ * @file
+ * @author Kamil Nowosad
+ */
+
+#include <float.h>
+#include "avcodec.h"
+#include "internal.h"
+#include "bytestream.h"
+#include "jpeg2000.h"
+#include "libavutil/common.h"
+#include "libavutil/opt.h"
+
+#define NMSEDEC_BITS 7 ///< log2 of the number of entries in each lut_nmsedec_* table
+#define NMSEDEC_FRACBITS (NMSEDEC_BITS-1) ///< offset added to the bitplane number when the tier-1 passes build their bit mask
+#define WMSEDEC_SHIFT 13 ///< must be >= 13
+#define LAMBDA_SCALE (100000000LL << (WMSEDEC_SHIFT - 13))
+
+#define CODEC_JP2 1 // presumably a value of Jpeg2000EncoderContext.format -- TODO confirm against its users
+#define CODEC_J2K 0 // presumably a value of Jpeg2000EncoderContext.format -- TODO confirm against its users
+
+static int lut_nmsedec_ref [1<<NMSEDEC_BITS], // filled by init_luts(); read by getnmsedec_ref() when bpno > NMSEDEC_FRACBITS
+           lut_nmsedec_ref0[1<<NMSEDEC_BITS], // filled by init_luts(); read by getnmsedec_ref() otherwise
+           lut_nmsedec_sig [1<<NMSEDEC_BITS], // filled by init_luts(); read by getnmsedec_sig() when bpno > NMSEDEC_FRACBITS
+           lut_nmsedec_sig0[1<<NMSEDEC_BITS]; // filled by init_luts(); read by getnmsedec_sig() otherwise
+
+static const int dwt_norms[2][4][10] = { // [dwt_type][band][rlevel] (multiplied by 10000)
+    {{10000, 19650, 41770,  84030, 169000, 338400,  676900, 1353000, 2706000, 5409000},
+     {20220, 39890, 83550, 170400, 342700, 686300, 1373000, 2746000, 5490000}, // this and the next two rows list 9 values: element [9] is implicitly 0
+     {20220, 39890, 83550, 170400, 342700, 686300, 1373000, 2746000, 5490000},
+     {20800, 38650, 83070, 171800, 347100, 695900, 1393000, 2786000, 5572000}},
+
+    {{10000, 15000, 27500, 53750, 106800, 213400, 426700, 853300, 1707000, 3413000},
+     {10380, 15920, 29190, 57030, 113300, 226400, 452500, 904800, 1809000}, // this and the next two rows list 9 values: element [9] is implicitly 0
+     {10380, 15920, 29190, 57030, 113300, 226400, 452500, 904800, 1809000},
+     { 7186,  9218, 15860, 30430,  60190, 120100, 240000, 479700,  959300}}
+};
+
+typedef struct {
+   Jpeg2000Component *comp; ///< one entry per image component (ncomponents), allocated in init_tiles()
+} Jpeg2000Tile;
+
+typedef struct {
+    AVClass *class;
+    AVCodecContext *avctx;
+    const AVFrame *picture; ///< input frame, read by copy_frame()
+
+    int width, height; ///< image width and height
+    uint8_t cbps[4]; ///< bits per sample in particular components
+    int chroma_shift[2]; ///< log2 subsampling applied to components > 0: [0] horizontal (XRsiz), [1] vertical (YRsiz)
+    uint8_t planar; ///< nonzero: copy_frame() reads component i from picture->data[i]; zero: components are interleaved in data[0]
+    int ncomponents;
+    int tile_width, tile_height; ///< tile size
+    int numXtiles, numYtiles; ///< tile grid dimensions, computed in init_tiles()
+
+    uint8_t *buf_start;
+    uint8_t *buf; ///< current write position in the output buffer
+    uint8_t *buf_end; ///< the marker writers compare buf_end - buf against the bytes they are about to write
+    int bit_index; ///< number of bits already used in *buf (maintained by put_bits() and j2k_flush())
+
+    int64_t lambda;
+
+    Jpeg2000CodingStyle codsty;
+    Jpeg2000QuantStyle  qntsty;
+
+    Jpeg2000Tile *tile; ///< numXtiles * numYtiles tiles in raster order, allocated in init_tiles()
+
+    int format; // presumably CODEC_JP2 or CODEC_J2K -- TODO confirm
+    int pred;
+} Jpeg2000EncoderContext;
+
+
+/* debug */
+#if 0
+#undef ifprintf
+#undef printf
+
+static void nspaces(FILE *fd, int n) // debug helper (inside #if 0): write n space characters to fd
+{
+    while(n--) putc(' ', fd);
+}
+
+static void printcomp(Jpeg2000Component *comp) // debug helper (inside #if 0): print comp->i_data, one ff_jpeg2000_printv() call per row
+{
+    int i;
+    for (i = 0; i < comp->y1 - comp->y0; i++)
+        ff_jpeg2000_printv(comp->i_data + i * (comp->x1 - comp->x0), comp->x1 - comp->x0);
+}
+
+static void dump(Jpeg2000EncoderContext *s, FILE *fd) // debug helper (inside #if 0): print the tile/component/reslevel/band/precinct layout to fd
+{
+    int tileno, compno, reslevelno, bandno, precno;
+    fprintf(fd, "XSiz = %d, YSiz = %d, tile_width = %d, tile_height = %d\n"
+                "numXtiles = %d, numYtiles = %d, ncomponents = %d\n"
+                "tiles:\n",
+            s->width, s->height, s->tile_width, s->tile_height,
+            s->numXtiles, s->numYtiles, s->ncomponents);
+    for (tileno = 0; tileno < s->numXtiles * s->numYtiles; tileno++){
+        Jpeg2000Tile *tile = s->tile + tileno;
+        nspaces(fd, 2);
+        fprintf(fd, "tile %d:\n", tileno);
+        for(compno = 0; compno < s->ncomponents; compno++){
+            Jpeg2000Component *comp = tile->comp + compno;
+            nspaces(fd, 4);
+            fprintf(fd, "component %d:\n", compno);
+            nspaces(fd, 4);
+            fprintf(fd, "x0 = %d, x1 = %d, y0 = %d, y1 = %d\n",
+                        comp->x0, comp->x1, comp->y0, comp->y1);
+            for(reslevelno = 0; reslevelno < s->nreslevels; reslevelno++){ // NOTE(review): Jpeg2000EncoderContext has no nreslevels member (the rest of the file uses s->codsty.nreslevels); this only builds because of the #if 0
+                Jpeg2000ResLevel *reslevel = comp->reslevel + reslevelno;
+                nspaces(fd, 6);
+                fprintf(fd, "reslevel %d:\n", reslevelno);
+                nspaces(fd, 6);
+                fprintf(fd, "x0 = %d, x1 = %d, y0 = %d, y1 = %d, nbands = %d\n",
+                        reslevel->x0, reslevel->x1, reslevel->y0,
+                        reslevel->y1, reslevel->nbands);
+                for(bandno = 0; bandno < reslevel->nbands; bandno++){
+                    Jpeg2000Band *band = reslevel->band + bandno;
+                    nspaces(fd, 8);
+                    fprintf(fd, "band %d:\n", bandno);
+                    nspaces(fd, 8);
+                    fprintf(fd, "x0 = %d, x1 = %d, y0 = %d, y1 = %d,"
+                                "codeblock_width = %d, codeblock_height = %d cblknx = %d cblkny = %d\n",
+                                band->x0, band->x1,
+                                band->y0, band->y1,
+                                band->codeblock_width, band->codeblock_height,
+                                band->cblknx, band->cblkny);
+                    for (precno = 0; precno < reslevel->num_precincts_x * reslevel->num_precincts_y; precno++){
+                        Jpeg2000Prec *prec = band->prec + precno;
+                        nspaces(fd, 10);
+                        fprintf(fd, "prec %d:\n", precno);
+                        nspaces(fd, 10);
+                        fprintf(fd, "xi0 = %d, xi1 = %d, yi0 = %d, yi1 = %d\n",
+                                     prec->xi0, prec->xi1, prec->yi0, prec->yi1);
+                    }
+                }
+            }
+        }
+    }
+}
+#endif
+
+/* bitstream routines */
+
+/** append the bit val to the output n times (nothing is written for n <= 0) */
+static void put_bits(Jpeg2000EncoderContext *s, int val, int n) // TODO: optimize
+{
+    for (; n > 0; n--){
+        if (s->bit_index == 8){ // current byte is full: move on to the next one
+            s->bit_index = (*s->buf == 0xff); // a byte following 0xff starts at bit 1
+            *(++s->buf) = 0;
+        }
+        *s->buf |= val << (7 - s->bit_index);
+        s->bit_index++;
+    }
+}
+
+/** emit the n low-order bits of num, the most significant of them first */
+static void put_num(Jpeg2000EncoderContext *s, int num, int n)
+{
+    for (n--; n >= 0; n--)
+        put_bits(s, (num >> n) & 1, 1);
+}
+
+/** byte-align the output: step past a partially written byte, if there is one */
+static void j2k_flush(Jpeg2000EncoderContext *s)
+{
+    if (!s->bit_index)
+        return; // no bits pending in the current byte
+    s->buf++;
+    s->bit_index = 0;
+}
+
+/* tag tree routines */
+
+/** code the value stored in node against threshold, starting below the nearest visited ancestor */
+static void tag_tree_code(Jpeg2000EncoderContext *s, Jpeg2000TgtNode *node, int threshold)
+{
+    Jpeg2000TgtNode *path[30];
+    int depth = 0, known = 0;
+    // collect the node and each ancestor not visited yet, marking those ancestors visited
+    path[depth++] = node;
+    for (node = node->parent; node; node = node->parent){
+        if (node->vis){
+            known = node->val; // coding resumes from the value of this visited ancestor
+            break;
+        }
+        node->vis++;
+        path[depth++] = node;
+    }
+    // emit from the topmost collected node down to the starting node
+    while (depth-- > 0){
+        Jpeg2000TgtNode *cur = path[depth];
+        if (cur->val >= threshold){
+            put_bits(s, 0, threshold - known);
+            break;
+        }
+        put_bits(s, 0, cur->val - known);
+        put_bits(s, 1, 1);
+        known = cur->val;
+    }
+}
+
+/** update the value in node: copy node->val into each ancestor whose value
+ *  is larger, walking towards the root; stops at the first ancestor whose
+ *  value is already <= the value being propagated */
+static void tag_tree_update(Jpeg2000TgtNode *node)
+{
+    while (node->parent){
+        if (node->parent->val <= node->val)
+            break;
+        node->parent->val = node->val;
+        node = node->parent;
+    }
+}
+
+static int put_siz(Jpeg2000EncoderContext *s) // write the SIZ marker segment; returns -1 if it does not fit, 0 otherwise
+{
+    const int lsiz = 38 + 3 * s->ncomponents; // Lsiz: segment length, the 2-byte marker excluded
+    int compno;
+    if (s->buf_end - s->buf < lsiz + 2)
+        return -1;
+
+    bytestream_put_be16(&s->buf, JPEG2000_SIZ);
+    bytestream_put_be16(&s->buf, lsiz); // Lsiz
+    bytestream_put_be16(&s->buf, 0); // Rsiz
+    bytestream_put_be32(&s->buf, s->width); // width
+    bytestream_put_be32(&s->buf, s->height); // height
+    bytestream_put_be32(&s->buf, 0); // X0Siz
+    bytestream_put_be32(&s->buf, 0); // Y0Siz
+
+    bytestream_put_be32(&s->buf, s->tile_width); // XTSiz
+    bytestream_put_be32(&s->buf, s->tile_height); // YTSiz
+    bytestream_put_be32(&s->buf, 0); // XT0Siz
+    bytestream_put_be32(&s->buf, 0); // YT0Siz
+    bytestream_put_be16(&s->buf, s->ncomponents); // CSiz
+
+    for (compno = 0; compno < s->ncomponents; compno++){ // Ssiz_i XRsiz_i, YRsiz_i
+        bytestream_put_byte(&s->buf, 7);
+        bytestream_put_byte(&s->buf, compno ? 1 << s->chroma_shift[0] : 1);
+        bytestream_put_byte(&s->buf, compno ? 1 << s->chroma_shift[1] : 1);
+    }
+    return 0;
+}
+
+/** write the COD marker segment (14 bytes including the marker)
+ *  @return 0 on success, -1 if fewer than 14 bytes are left in the output buffer */
+static int put_cod(Jpeg2000EncoderContext *s)
+{
+    Jpeg2000CodingStyle *codsty = &s->codsty;
+
+    if (s->buf_end - s->buf < 14)
+        return -1;
+
+    bytestream_put_be16(&s->buf, JPEG2000_COD);
+    bytestream_put_be16(&s->buf, 12); // Lcod
+    bytestream_put_byte(&s->buf, 0);  // Scod
+    // SGcod
+    bytestream_put_byte(&s->buf, 0); // progression level
+    bytestream_put_be16(&s->buf, 1); // num of layers
+    // last SGcod byte (multiple component transformation in the JPEG 2000
+    // codestream syntax): always 0 here, whatever s->avctx->pix_fmt is
+    bytestream_put_byte(&s->buf, 0); // unspecified
+    // SPcod
+    bytestream_put_byte(&s->buf, codsty->nreslevels - 1); // num of decomp. levels
+    bytestream_put_byte(&s->buf, codsty->log2_cblk_width-2); // cblk width
+    bytestream_put_byte(&s->buf, codsty->log2_cblk_height-2); // cblk height
+    bytestream_put_byte(&s->buf, 0); // cblk style
+    bytestream_put_byte(&s->buf, codsty->transform == FF_DWT53); // transformation
+    return 0;
+}
+
+/** write the QCD marker segment from s->qntsty (compno is not used); returns -1 if it does not fit */
+static int put_qcd(Jpeg2000EncoderContext *s, int compno)
+{
+    const Jpeg2000QuantStyle *q = &s->qntsty;
+    const int nreslevels = s->codsty.nreslevels;
+    const int nbands     = nreslevels * 3 - 2; // number of expn[]/mant[] entries written
+    const int noquant    = q->quantsty == JPEG2000_QSTY_NONE;
+    // LQcd: one byte per entry without quantization, two bytes per entry otherwise (QSTY_SE)
+    const int size       = noquant ? 4 + 3 * (nreslevels - 1) : 5 + 6 * (nreslevels - 1);
+    int bandno;
+
+    if (s->buf_end - s->buf < size + 2)
+        return -1;
+
+    bytestream_put_be16(&s->buf, JPEG2000_QCD);
+    bytestream_put_be16(&s->buf, size);  // LQcd
+    bytestream_put_byte(&s->buf, (q->nguardbits << 5) | q->quantsty);  // Sqcd
+    for (bandno = 0; bandno < nbands; bandno++){
+        if (noquant)
+            bytestream_put_byte(&s->buf, q->expn[bandno] << 3);
+        else // QSTY_SE
+            bytestream_put_be16(&s->buf, (q->expn[bandno] << 11) | q->mant[bandno]);
+    }
+    return 0;
+}
+
+/** write a COM marker segment carrying LIBAVCODEC_IDENT (compno is not used) */
+static int put_com(Jpeg2000EncoderContext *s, int compno)
+{
+    const int ident_len = strlen(LIBAVCODEC_IDENT);
+    const int size      = 4 + ident_len; // 2-byte length field + 2-byte registration value + text
+
+    // nothing is written, and 0 is returned, when bitexact output is requested
+    if (s->avctx->flags & AV_CODEC_FLAG_BITEXACT)
+        return 0;
+    if (s->buf_end - s->buf < size + 2)
+        return -1;
+
+    bytestream_put_be16(&s->buf, JPEG2000_COM);
+    bytestream_put_be16(&s->buf, size);
+    bytestream_put_be16(&s->buf, 1); // General use (ISO/IEC 8859-15 (Latin) values)
+    bytestream_put_buffer(&s->buf, LIBAVCODEC_IDENT, ident_len);
+    return 0;
+}
+
+/** write an SOT marker segment; returns the address of its 4-byte Psot field, NULL if it does not fit */
+static uint8_t *put_sot(Jpeg2000EncoderContext *s, int tileno)
+{
+    uint8_t *psot_pos;
+    if (s->buf_end - s->buf < 12)
+        return NULL;
+
+    bytestream_put_be16(&s->buf, JPEG2000_SOT);
+    bytestream_put_be16(&s->buf, 10); // Lsot
+    bytestream_put_be16(&s->buf, tileno); // Isot
+
+    psot_pos = s->buf; // handed back so the caller can overwrite the placeholder
+    bytestream_put_be32(&s->buf, 0); // Psot (filled in later)
+
+    bytestream_put_byte(&s->buf, 0); // TPsot
+    bytestream_put_byte(&s->buf, 1); // TNsot
+    return psot_pos;
+}
+
+/**
+ * Set up the tile grid: allocate s->tile (zero-filled) and one component array
+ * per tile, compute each tile-component's coordinates and initialize it.
+ * @return 0 on success, AVERROR(ENOMEM) or the ff_jpeg2000_init_component() error
+ */
+static int init_tiles(Jpeg2000EncoderContext *s)
+{
+    int tileno, tilex, tiley, compno;
+    Jpeg2000CodingStyle *codsty = &s->codsty;
+    Jpeg2000QuantStyle  *qntsty = &s->qntsty;
+
+    s->numXtiles = ff_jpeg2000_ceildiv(s->width, s->tile_width);
+    s->numYtiles = ff_jpeg2000_ceildiv(s->height, s->tile_height);
+
+    s->tile = av_mallocz_array(s->numXtiles, s->numYtiles * sizeof(Jpeg2000Tile)); // zeroed so tile->comp is NULL for tiles not reached when an error return below is taken
+    if (!s->tile)
+        return AVERROR(ENOMEM);
+    for (tileno = 0, tiley = 0; tiley < s->numYtiles; tiley++)
+        for (tilex = 0; tilex < s->numXtiles; tilex++, tileno++){
+            Jpeg2000Tile *tile = s->tile + tileno;
+
+            tile->comp = av_mallocz_array(s->ncomponents, sizeof(Jpeg2000Component));
+            if (!tile->comp)
+                return AVERROR(ENOMEM);
+            for (compno = 0; compno < s->ncomponents; compno++){
+                Jpeg2000Component *comp = tile->comp + compno;
+                int ret, i, j;
+
+                // tile rectangle in image coordinates, clipped to the image size
+                comp->coord[0][0] = comp->coord_o[0][0] = tilex * s->tile_width;
+                comp->coord[0][1] = comp->coord_o[0][1] = FFMIN((tilex+1)*s->tile_width, s->width);
+                comp->coord[1][0] = comp->coord_o[1][0] = tiley * s->tile_height;
+                comp->coord[1][1] = comp->coord_o[1][1] = FFMIN((tiley+1)*s->tile_height, s->height);
+                if (compno > 0) // components after the first are scaled down by the chroma shifts
+                    for (i = 0; i < 2; i++)
+                        for (j = 0; j < 2; j++)
+                            comp->coord[i][j] = comp->coord_o[i][j] = ff_jpeg2000_ceildivpow2(comp->coord[i][j], s->chroma_shift[i]);
+
+                if ((ret = ff_jpeg2000_init_component(comp,
+                                                codsty,
+                                                qntsty,
+                                                s->cbps[compno],
+                                                compno?1<<s->chroma_shift[0]:1,
+                                                compno?1<<s->chroma_shift[1]:1,
+                                                s->avctx
+                                               )) < 0)
+                    return ret;
+            }
+        }
+    return 0;
+}
+
+/** load s->picture into every tile-component's i_data, subtracting 128 from each 8-bit sample */
+static void copy_frame(Jpeg2000EncoderContext *s)
+{
+    const AVFrame *pic = s->picture;
+    int tileno, compno, x, y;
+    for (tileno = 0; tileno < s->numXtiles * s->numYtiles; tileno++){
+        Jpeg2000Tile *tile = s->tile + tileno;
+        if (!s->planar){
+            // packed input: the components of one pixel are consecutive bytes of plane 0
+            const Jpeg2000Component *ref = tile->comp;
+            const uint8_t *row = pic->data[0] + ref->coord[1][0] * pic->linesize[0]
+                                 + ref->coord[0][0] * s->ncomponents;
+            int pos = 0;
+            for (y = ref->coord[1][0]; y < ref->coord[1][1]; y++){
+                const uint8_t *px = row;
+                for (x = ref->coord[0][0]; x < ref->coord[0][1]; x++, pos++)
+                    for (compno = 0; compno < s->ncomponents; compno++)
+                        tile->comp[compno].i_data[pos] = *px++ - (1 << 7);
+                row += pic->linesize[0];
+            }
+            continue;
+        }
+        // planar input: component compno is read from plane compno
+        for (compno = 0; compno < s->ncomponents; compno++){
+            Jpeg2000Component *comp = tile->comp + compno;
+            const uint8_t *row = pic->data[compno]
+                                 + comp->coord[1][0] * pic->linesize[compno]
+                                 + comp->coord[0][0];
+            int *out = comp->i_data;
+            for (y = comp->coord[1][0]; y < comp->coord[1][1]; y++){
+                for (x = comp->coord[0][0]; x < comp->coord[0][1]; x++)
+                    *out++ = row[x - comp->coord[0][0]] - (1 << 7);
+                row += pic->linesize[compno];
+            }
+        }
+    }
+}
+
+static void init_quantization(Jpeg2000EncoderContext *s) // fill s->qntsty.expn[] / mant[], one entry per subband
+{
+    int compno, reslevelno, bandno;
+    Jpeg2000QuantStyle  *qntsty = &s->qntsty;
+    Jpeg2000CodingStyle *codsty = &s->codsty;
+
+    for (compno = 0; compno < s->ncomponents; compno++){
+        int gbandno = 0; // restarts for every component, so each component overwrites the same entries
+        for (reslevelno = 0; reslevelno < codsty->nreslevels; reslevelno++){
+            int nbands, lev = codsty->nreslevels - reslevelno - 1;
+            nbands = reslevelno ? 3 : 1; // one band at resolution level 0, three at every other level
+            for (bandno = 0; bandno < nbands; bandno++, gbandno++){
+                int expn, mant = 0;
+
+                if (codsty->transform == FF_DWT97_INT){
+                    int bandpos = bandno + (reslevelno>0),
+                        ss = 81920000 / dwt_norms[0][bandpos][lev],
+                        log = av_log2(ss);
+                    mant = (11 - log < 0 ? ss >> log - 11 : ss << 11 - log) & 0x7ff; // shifts ss so its top set bit sits at bit 11, then keeps the 11 bits below it
+                    expn = s->cbps[compno] - log + 13;
+                } else
+                    expn = ((bandno&2)>>1) + (reslevelno>0) + s->cbps[compno]; // mant stays 0 on this path
+
+                qntsty->expn[gbandno] = expn;
+                qntsty->mant[gbandno] = mant;
+            }
+        }
+    }
+}
+
+static void init_luts(void) // fill the four lut_nmsedec_* tables; every entry is clamped to >= 0 by FFMAX
+{
+    int i, a,
+        mask = ~((1<<NMSEDEC_FRACBITS)-1); // clears the low NMSEDEC_FRACBITS bits
+
+    for (i = 0; i < (1 << NMSEDEC_BITS); i++){
+        lut_nmsedec_sig[i]  = FFMAX(6*i - (9<<NMSEDEC_FRACBITS-1) << 12-NMSEDEC_FRACBITS, 0); // note: '-' binds tighter than '<<', so the whole difference is shifted
+        lut_nmsedec_sig0[i] = FFMAX((i*i + (1<<NMSEDEC_FRACBITS-1) & mask) << 1, 0); // '+' binds tighter than '&': the sum is masked
+
+        a = (i >> (NMSEDEC_BITS-2)&2) + 1; // a is 1 or 3, selected by the top bit of i
+        lut_nmsedec_ref[i]  = FFMAX((-2*i + (1<<NMSEDEC_FRACBITS) + a*i - (a*a<<NMSEDEC_FRACBITS-2))
+                                    << 13-NMSEDEC_FRACBITS, 0);
+        lut_nmsedec_ref0[i] = FFMAX(((i*i + (1-4*i << NMSEDEC_FRACBITS-1) + (1<<2*NMSEDEC_FRACBITS)) & mask)
+                                    << 1, 0);
+    }
+}
+
+/* tier-1 routines */
+static int getnmsedec_sig(int x, int bpno) // table lookup: lut_nmsedec_sig for bpno > NMSEDEC_FRACBITS, lut_nmsedec_sig0 otherwise
+{
+    const int idx_mask = (1 << NMSEDEC_BITS) - 1;
+    return bpno > NMSEDEC_FRACBITS ? lut_nmsedec_sig[(x >> (bpno - NMSEDEC_FRACBITS)) & idx_mask]
+                                   : lut_nmsedec_sig0[x & idx_mask];
+}
+
+static int getnmsedec_ref(int x, int bpno) // table lookup: lut_nmsedec_ref for bpno > NMSEDEC_FRACBITS, lut_nmsedec_ref0 otherwise
+{
+    const int idx_mask = (1 << NMSEDEC_BITS) - 1;
+    return bpno > NMSEDEC_FRACBITS ? lut_nmsedec_ref[(x >> (bpno - NMSEDEC_FRACBITS)) & idx_mask]
+                                   : lut_nmsedec_ref0[x & idx_mask];
+}
+
+static void encode_sigpass(Jpeg2000T1Context *t1, int width, int height, int bandno, int *nmsedec, int bpno) // significance pass over bitplane bpno; adds its distortion estimate to *nmsedec
+{
+    int y0, x, y, mask = 1 << (bpno + NMSEDEC_FRACBITS);
+    for (y0 = 0; y0 < height; y0 += 4) // stripes of up to 4 rows, scanned column by column
+        for (x = 0; x < width; x++)
+            for (y = y0; y < height && y < y0+4; y++){
+                if (!(t1->flags[(y+1) * t1->stride + x+1] & JPEG2000_T1_SIG) && (t1->flags[(y+1) * t1->stride + x+1] & JPEG2000_T1_SIG_NB)){ // only samples without the SIG flag that have a SIG_NB flag set
+                    int ctxno = ff_jpeg2000_getsigctxno(t1->flags[(y+1) * t1->stride + x+1], bandno),
+                        bit = t1->data[(y) * t1->stride + x] & mask ? 1 : 0;
+                    ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + ctxno, bit);
+                    if (bit){ // bit set in this plane: code the sign and mark the sample significant
+                        int xorbit;
+                        int ctxno = ff_jpeg2000_getsgnctxno(t1->flags[(y+1) * t1->stride + x+1], &xorbit);
+                        ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + ctxno, (t1->flags[(y+1) * t1->stride + x+1] >> 15) ^ xorbit); // flags >> 15: presumably the JPEG2000_T1_SGN bit -- TODO confirm
+                        *nmsedec += getnmsedec_sig(t1->data[(y) * t1->stride + x], bpno + NMSEDEC_FRACBITS);
+                        ff_jpeg2000_set_significance(t1, x, y, t1->flags[(y+1) * t1->stride + x+1] >> 15);
+                    }
+                    t1->flags[(y+1) * t1->stride + x+1] |= JPEG2000_T1_VIS; // the refinement and cleanup passes skip samples with VIS set
+                }
+            }
+}
+
+static void encode_refpass(Jpeg2000T1Context *t1, int width, int height, int *nmsedec, int bpno) // refinement pass over bitplane bpno; adds its distortion estimate to *nmsedec
+{
+    int y0, x, y, mask = 1 << (bpno + NMSEDEC_FRACBITS);
+    for (y0 = 0; y0 < height; y0 += 4) // stripes of up to 4 rows, scanned column by column
+        for (x = 0; x < width; x++)
+            for (y = y0; y < height && y < y0+4; y++)
+                if ((t1->flags[(y+1) * t1->stride + x+1] & (JPEG2000_T1_SIG | JPEG2000_T1_VIS)) == JPEG2000_T1_SIG){ // SIG set and VIS clear
+                    int ctxno = ff_jpeg2000_getrefctxno(t1->flags[(y+1) * t1->stride + x+1]);
+                    *nmsedec += getnmsedec_ref(t1->data[(y) * t1->stride + x], bpno + NMSEDEC_FRACBITS);
+                    ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + ctxno, t1->data[(y) * t1->stride + x] & mask ? 1:0);
+                    t1->flags[(y+1) * t1->stride + x+1] |= JPEG2000_T1_REF;
+                }
+}
+
+static void encode_clnpass(Jpeg2000T1Context *t1, int width, int height, int bandno, int *nmsedec, int bpno) // cleanup pass over bitplane bpno: codes the samples with neither SIG nor VIS set, then clears VIS
+{
+    int y0, x, y, mask = 1 << (bpno + NMSEDEC_FRACBITS);
+    for (y0 = 0; y0 < height; y0 += 4)
+        for (x = 0; x < width; x++){
+            if (y0 + 3 < height && !( // full 4-row column in which no sample has SIG_NB, VIS or SIG set
+            (t1->flags[(y0+1) * t1->stride + x+1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)) ||
+            (t1->flags[(y0+2) * t1->stride + x+1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)) ||
+            (t1->flags[(y0+3) * t1->stride + x+1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)) ||
+            (t1->flags[(y0+4) * t1->stride + x+1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG))))
+            {
+                // aggregation mode
+                int rlen; // index of the first row of the column with the bit set, 4 if there is none
+                for (rlen = 0; rlen < 4; rlen++)
+                    if (t1->data[(y0+rlen) * t1->stride + x] & mask)
+                        break;
+                ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + MQC_CX_RL, rlen != 4);
+                if (rlen == 4)
+                    continue;
+                ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + MQC_CX_UNI, rlen >> 1); // rlen is sent as two bits, high bit first
+                ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + MQC_CX_UNI, rlen & 1);
+                for (y = y0 + rlen; y < y0 + 4; y++){
+                    if (!(t1->flags[(y+1) * t1->stride + x+1] & (JPEG2000_T1_SIG | JPEG2000_T1_VIS))){
+                        int ctxno = ff_jpeg2000_getsigctxno(t1->flags[(y+1) * t1->stride + x+1], bandno);
+                        if (y > y0 + rlen) // the bit of row y0 + rlen is implied by rlen and not coded again
+                            ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + ctxno, t1->data[(y) * t1->stride + x] & mask ? 1:0);
+                        if (t1->data[(y) * t1->stride + x] & mask){ // newly significant
+                            int xorbit;
+                            int ctxno = ff_jpeg2000_getsgnctxno(t1->flags[(y+1) * t1->stride + x+1], &xorbit);
+                            *nmsedec += getnmsedec_sig(t1->data[(y) * t1->stride + x], bpno + NMSEDEC_FRACBITS);
+                            ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + ctxno, (t1->flags[(y+1) * t1->stride + x+1] >> 15) ^ xorbit);
+                            ff_jpeg2000_set_significance(t1, x, y, t1->flags[(y+1) * t1->stride + x+1] >> 15);
+                        }
+                    }
+                    t1->flags[(y+1) * t1->stride + x+1] &= ~JPEG2000_T1_VIS;
+                }
+            } else{ // sample-by-sample coding of the column
+                for (y = y0; y < y0 + 4 && y < height; y++){
+                    if (!(t1->flags[(y+1) * t1->stride + x+1] & (JPEG2000_T1_SIG | JPEG2000_T1_VIS))){
+                        int ctxno = ff_jpeg2000_getsigctxno(t1->flags[(y+1) * t1->stride + x+1], bandno);
+                        ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + ctxno, t1->data[(y) * t1->stride + x] & mask ? 1:0);
+                        if (t1->data[(y) * t1->stride + x] & mask){ // newly significant
+                            int xorbit;
+                            int ctxno = ff_jpeg2000_getsgnctxno(t1->flags[(y+1) * t1->stride + x+1], &xorbit);
+                            *nmsedec += getnmsedec_sig(t1->data[(y) * t1->stride + x], bpno + NMSEDEC_FRACBITS);
+                            ff_mqc_encode(&t1->mqc, t1->mqc.cx_states + ctxno, (t1->flags[(y+1) * t1->stride + x+1] >> 15) ^ xorbit);
+                            ff_jpeg2000_set_significance(t1, x, y, t1->flags[(y+1) * t1->stride + x+1] >> 15);
+                        }
+                    }
+                    t1->flags[(y+1) * t1->stride + x+1] &= ~JPEG2000_T1_VIS;
+                }
+            }
+        }
+}
+
+static void encode_cblk(Jpeg2000EncoderContext *s, Jpeg2000T1Context *t1, Jpeg2000Cblk *cblk, Jpeg2000Tile *tile, // tier-1 coding of the width x height samples in t1->data;
+                        int width, int height, int bandpos, int lev) // fills cblk->nonzerobits, passes[], npasses and ninclpasses
+{
+    int pass_t = 2, passno, x, y, max=0, nmsedec, bpno; // pass_t = 2: the first pass coded is a cleanup pass
+    int64_t wmsedec = 0;
+
+    memset(t1->flags, 0, t1->stride * (height + 2) * sizeof(*t1->flags));
+
+    // move the signs into the flags, leave magnitudes in t1->data and find the largest one
+    for (y = 0; y < height; y++){
+        for (x = 0; x < width; x++){
+            if (t1->data[(y) * t1->stride + x] < 0){
+                t1->flags[(y+1) * t1->stride + x+1] |= JPEG2000_T1_SGN;
+                t1->data[(y) * t1->stride + x] = -t1->data[(y) * t1->stride + x];
+            }
+            max = FFMAX(max, t1->data[(y) * t1->stride + x]);
+        }
+    }
+
+    if (max == 0){
+        cblk->nonzerobits = 0;
+        bpno = 0;
+    } else{
+        cblk->nonzerobits = av_log2(max) + 1 - NMSEDEC_FRACBITS;
+        bpno = cblk->nonzerobits - 1; // negative when 0 < max < (1 << NMSEDEC_FRACBITS)
+    }
+
+    ff_mqc_initenc(&t1->mqc, cblk->data);
+
+    for (passno = 0; bpno >= 0; passno++){
+        nmsedec=0;
+
+        switch(pass_t){
+            case 0: encode_sigpass(t1, width, height, bandpos, &nmsedec, bpno);
+                    break;
+            case 1: encode_refpass(t1, width, height, &nmsedec, bpno);
+                    break;
+            case 2: encode_clnpass(t1, width, height, bandpos, &nmsedec, bpno);
+                    break;
+        }
+
+        cblk->passes[passno].rate = ff_mqc_flush_to(&t1->mqc, cblk->passes[passno].flushed, &cblk->passes[passno].flushed_len);
+        wmsedec += (int64_t)nmsedec << (2*bpno);
+        cblk->passes[passno].disto = wmsedec; // cumulative over the passes coded so far
+
+        if (++pass_t == 3){ // after the cleanup pass, go down one bitplane
+            pass_t = 0;
+            bpno--;
+        }
+    }
+    cblk->npasses = passno;
+    cblk->ninclpasses = passno;
+    if (passno) // no pass was coded when bpno started negative: passes[-1] must not be written
+        cblk->passes[passno-1].rate = ff_mqc_flush_to(&t1->mqc, cblk->passes[passno-1].flushed, &cblk->passes[passno-1].flushed_len);
+}
+
+/* tier-2 routines: */
+
+/** write n with a variable-length code of 1, 2, 4, 9 or 16 bits, chosen by the range n falls in */
+static void putnumpasses(Jpeg2000EncoderContext *s, int n)
+{
+    int code, nbits;
+
+    if      (n == 1) { code = 0;                 nbits = 1;  }
+    else if (n == 2) { code = 2;                 nbits = 2;  }
+    else if (n <= 5) { code = 0xc    | (n - 3);  nbits = 4;  }
+    else if (n <= 36){ code = 0x1e0  | (n - 6);  nbits = 9;  }
+    else             { code = 0xff80 | (n - 37); nbits = 16; }
+
+    put_num(s, code, nbits);
+}
+
+
+static int encode_packet(Jpeg2000EncoderContext *s, Jpeg2000ResLevel *rlevel, int precno,
+                          uint8_t *expn, int numgbits)
+{ /* Write one packet for precinct precno of rlevel: header bits first, then the code-block bytes. Returns 0, or -1 when the output buffer is too small. */
+    int bandno, empty = 1;
+
+    // init bitstream
+    *s->buf = 0;
+    s->bit_index = 0;
+
+    // header
+
+    // is the packet empty? (it is not as soon as one band has non-zero width and height)
+    for (bandno = 0; bandno < rlevel->nbands; bandno++){
+        if (rlevel->band[bandno].coord[0][0] < rlevel->band[bandno].coord[0][1]
+        &&  rlevel->band[bandno].coord[1][0] < rlevel->band[bandno].coord[1][1]){
+            empty = 0;
+            break;
+        }
+    }
+
+    put_bits(s, !empty, 1); // first header bit: 1 when the packet carries data
+    if (empty){
+        j2k_flush(s);
+        return 0;
+    }
+
+    for (bandno = 0; bandno < rlevel->nbands; bandno++){
+        Jpeg2000Band *band = rlevel->band + bandno;
+        Jpeg2000Prec *prec = band->prec + precno;
+        int yi, xi, pos;
+        int cblknw = prec->nb_codeblocks_width;
+
+        if (band->coord[0][0] == band->coord[0][1]
+        ||  band->coord[1][0] == band->coord[1][1])
+            continue; // zero-area band: nothing is signalled for it
+
+        for (pos=0, yi = 0; yi < prec->nb_codeblocks_height; yi++){ // first sweep: load both tag trees
+            for (xi = 0; xi < cblknw; xi++, pos++){
+                prec->cblkincl[pos].val = prec->cblk[yi * cblknw + xi].ninclpasses == 0; // 1 when the block contributes no passes
+                tag_tree_update(prec->cblkincl + pos);
+                prec->zerobits[pos].val = expn[bandno] + numgbits - 1 - prec->cblk[yi * cblknw + xi].nonzerobits;
+                tag_tree_update(prec->zerobits + pos);
+            }
+        }
+
+        for (pos=0, yi = 0; yi < prec->nb_codeblocks_height; yi++){ // second sweep: emit the per-block header fields
+            for (xi = 0; xi < cblknw; xi++, pos++){
+                int pad = 0, llen, length;
+                Jpeg2000Cblk *cblk = prec->cblk + yi * cblknw + xi;
+
+                if (s->buf_end - s->buf < 20) // approximately
+                    return -1;
+
+                // inclusion information
+                tag_tree_code(s, prec->cblkincl + pos, 1);
+                if (!cblk->ninclpasses)
+                    continue;
+                // zerobits information
+                tag_tree_code(s, prec->zerobits + pos, 100);
+                // number of passes
+                putnumpasses(s, cblk->ninclpasses);
+
+                length = cblk->passes[cblk->ninclpasses-1].rate; // rate recorded for the last included pass
+                llen = av_log2(length) - av_log2(cblk->ninclpasses) - 2;
+                if (llen < 0){ // a negative value becomes extra width (pad) of the length field below
+                    pad = -llen;
+                    llen = 0;
+                }
+                // length of code block
+                put_bits(s, 1, llen);
+                put_bits(s, 0, 1);
+                put_num(s, length, av_log2(length)+1+pad);
+            }
+        }
+    }
+    j2k_flush(s);
+    for (bandno = 0; bandno < rlevel->nbands; bandno++){ // body: append the coded bytes of every included block
+        Jpeg2000Band *band = rlevel->band + bandno;
+        Jpeg2000Prec *prec = band->prec + precno;
+        int yi, cblknw = prec->nb_codeblocks_width;
+        for (yi =0; yi < prec->nb_codeblocks_height; yi++){
+            int xi;
+            for (xi = 0; xi < cblknw; xi++){
+                Jpeg2000Cblk *cblk = prec->cblk + yi * cblknw + xi;
+                if (cblk->ninclpasses){
+                    if (s->buf_end - s->buf < cblk->passes[cblk->ninclpasses-1].rate)
+                        return -1;
+                    bytestream_put_buffer(&s->buf, cblk->data,   cblk->passes[cblk->ninclpasses-1].rate // everything except the flushed tail comes from cblk->data
+                                                               - cblk->passes[cblk->ninclpasses-1].flushed_len);
+                    bytestream_put_buffer(&s->buf, cblk->passes[cblk->ninclpasses-1].flushed, // then the flushed bytes stored with the last included pass
+                                                   cblk->passes[cblk->ninclpasses-1].flushed_len);
+                }
+            }
+        }
+    }
+    return 0;
+}
+
+static int encode_packets(Jpeg2000EncoderContext *s, Jpeg2000Tile *tile, int tileno)
+{   /* Emit every packet of tile number tileno; stops at, and returns, the first negative encode_packet() result, else 0. */
+    const int nguard = s->qntsty.nguardbits;
+    int res, comp, prec, err;
+
+    av_log(s->avctx, AV_LOG_DEBUG, "tier2\n");
+    // lay-rlevel-comp-pos progression
+    for (res = 0; res < s->codsty.nreslevels; res++){
+        /* exponent slice for this resolution: offset 0 for res 0, 3*res-2 afterwards */
+        uint8_t *expn = s->qntsty.expn + (res ? 3*res-2 : 0);
+        for (comp = 0; comp < s->ncomponents; comp++){
+            Jpeg2000ResLevel *rl = s->tile[tileno].comp[comp].reslevel + res;
+            for (prec = 0; prec < rl->num_precincts_x * rl->num_precincts_y; prec++){
+                err = encode_packet(s, rl, prec, expn, nguard);
+                if (err < 0)
+                    return err;
+            }
+        }
+    }
+    av_log(s->avctx, AV_LOG_DEBUG, "after tier2\n");
+    return 0;
+}
+
+static int getcut(Jpeg2000Cblk *cblk, int64_t lambda, int dwt_norm)
+{   /* Return how many leading passes of cblk to keep (0..npasses) for the given lambda and norm weight. */
+    int kept = 0, i;
+
+    for (i = 0; i < cblk->npasses; i++){
+        /* totals at the current cut point; both are zero while nothing is kept yet */
+        const int     base_rate  = kept ? cblk->passes[kept-1].rate  : 0;
+        const int64_t base_disto = kept ? cblk->passes[kept-1].disto : 0;
+        const int     dr = cblk->passes[i].rate  - base_rate;   /* rate delta up to pass i */
+        const int64_t dd = cblk->passes[i].disto - base_disto;  /* disto delta up to pass i */
+
+        /* move the cut behind pass i when the weighted disto delta covers dr * lambda */
+        if (((dd * dwt_norm) >> WMSEDEC_SHIFT) * dwt_norm >= dr * lambda)
+            kept = i+1;
+    }
+    return kept;
+}
+
+static void truncpasses(Jpeg2000EncoderContext *s, Jpeg2000Tile *tile)
+{   /* Rate control: set ninclpasses of every code block in the tile to the cut chosen by getcut(). */
+    const int use53 = s->codsty.transform == FF_DWT53; /* first index into dwt_norms */
+    int c, r, p, b, k;
+
+    for (c = 0; c < s->ncomponents; c++){
+        for (r = 0; r < s->codsty.nreslevels; r++){
+            Jpeg2000ResLevel *rl = tile->comp[c].reslevel + r;
+            const int lev = s->codsty.nreslevels - 1 - r;   /* runs down while r runs up */
+
+            for (p = 0; p < rl->num_precincts_x * rl->num_precincts_y; p++){
+                for (b = 0; b < rl->nbands; b++){
+                    Jpeg2000Band *band = rl->band + b;
+                    Jpeg2000Prec *prec = band->prec + p;
+                    /* band index into dwt_norms: shifted by one for every resolution above the lowest */
+                    const int bandpos = b + (r > 0);
+                    /* weight handed to getcut(): table norm times the band's i_stepsize, scaled down by 2^15 */
+                    const int norm = (int64_t)dwt_norms[use53][bandpos][lev] * (int64_t)band->i_stepsize >> 15;
+
+                    for (k = 0; k < prec->nb_codeblocks_height * prec->nb_codeblocks_width; k++){
+                        Jpeg2000Cblk *cblk = prec->cblk + k;
+                        cblk->ninclpasses = getcut(cblk, s->lambda, norm);
+                    }
+                }
+            }
+        }
+    }
+}
+
+static int encode_tile(Jpeg2000EncoderContext *s, Jpeg2000Tile *tile, int tileno)
+{ /* Encode one tile: DWT and tier-1 coding per component, then pass truncation and packet output. Returns 0 or a negative error. */
+    int compno, reslevelno, bandno, ret;
+    Jpeg2000T1Context t1;
+    Jpeg2000CodingStyle *codsty = &s->codsty;
+    for (compno = 0; compno < s->ncomponents; compno++){
+        Jpeg2000Component *comp = s->tile[tileno].comp + compno;
+
+        t1.stride = (1<<codsty->log2_cblk_width) + 2; // code-block width plus two extra columns
+
+        av_log(s->avctx, AV_LOG_DEBUG,"dwt\n");
+        if ((ret = ff_dwt_encode(&comp->dwt, comp->i_data)) < 0)
+            return ret;
+        av_log(s->avctx, AV_LOG_DEBUG,"after dwt -> tier1\n");
+
+        for (reslevelno = 0; reslevelno < codsty->nreslevels; reslevelno++){
+            Jpeg2000ResLevel *reslevel = comp->reslevel + reslevelno;
+
+            for (bandno = 0; bandno < reslevel->nbands ; bandno++){
+                Jpeg2000Band *band = reslevel->band + bandno;
+                Jpeg2000Prec *prec = band->prec; // we support only 1 precinct per band ATM in the encoder
+                int cblkx, cblky, cblkno=0, xx0, x0, xx1, y0, yy0, yy1, bandpos;
+                yy0 = bandno == 0 ? 0 : comp->reslevel[reslevelno-1].coord[1][1] - comp->reslevel[reslevelno-1].coord[1][0]; // bands other than the first start below the previous resolution's rows
+                y0 = yy0;
+                yy1 = FFMIN(ff_jpeg2000_ceildivpow2(band->coord[1][0] + 1, band->log2_cblk_height) << band->log2_cblk_height, // bottom of the first code-block row, clipped to the band
+                            band->coord[1][1]) - band->coord[1][0] + yy0;
+
+                if (band->coord[0][0] == band->coord[0][1] || band->coord[1][0] == band->coord[1][1])
+                    continue; // zero-area band
+
+                bandpos = bandno + (reslevelno > 0);
+
+                for (cblky = 0; cblky < prec->nb_codeblocks_height; cblky++){
+                    if (reslevelno == 0 || bandno == 1)
+                        xx0 = 0;
+                    else
+                        xx0 = comp->reslevel[reslevelno-1].coord[0][1] - comp->reslevel[reslevelno-1].coord[0][0];
+                    x0 = xx0;
+                    xx1 = FFMIN(ff_jpeg2000_ceildivpow2(band->coord[0][0] + 1, band->log2_cblk_width) << band->log2_cblk_width, // right edge of the first code-block column, clipped to the band
+                                band->coord[0][1]) - band->coord[0][0] + xx0;
+
+                    for (cblkx = 0; cblkx < prec->nb_codeblocks_width; cblkx++, cblkno++){
+                        int y, x;
+                        if (codsty->transform == FF_DWT53){ // 5/3 path: coefficients are only shifted left by NMSEDEC_FRACBITS
+                            for (y = yy0; y < yy1; y++){
+                                int *ptr = t1.data + (y-yy0)*t1.stride;
+                                for (x = xx0; x < xx1; x++){
+                                    *ptr++ = comp->i_data[(comp->coord[0][1] - comp->coord[0][0]) * y + x] << NMSEDEC_FRACBITS;
+                                }
+                            }
+                        } else{ // other transforms: scale by a factor derived from band->i_stepsize
+                            for (y = yy0; y < yy1; y++){
+                                int *ptr = t1.data + (y-yy0)*t1.stride;
+                                for (x = xx0; x < xx1; x++){
+                                    *ptr = (comp->i_data[(comp->coord[0][1] - comp->coord[0][0]) * y + x]);
+                                    *ptr = (int64_t)*ptr * (int64_t)(16384 * 65536 / band->i_stepsize) >> 15 - NMSEDEC_FRACBITS; // shift count is (15 - NMSEDEC_FRACBITS): '-' binds tighter than '>>'
+                                    ptr++;
+                                }
+                            }
+                        }
+                        encode_cblk(s, &t1, prec->cblk + cblkno, tile, xx1 - xx0, yy1 - yy0,
+                                    bandpos, codsty->nreslevels - reslevelno - 1);
+                        xx0 = xx1; // advance to the next code block in this row
+                        xx1 = FFMIN(xx1 + (1 << band->log2_cblk_width), band->coord[0][1] - band->coord[0][0] + x0);
+                    }
+                    yy0 = yy1; // advance to the next row of code blocks
+                    yy1 = FFMIN(yy1 + (1 << band->log2_cblk_height), band->coord[1][1] - band->coord[1][0] + y0);
+                }
+            }
+        }
+        av_log(s->avctx, AV_LOG_DEBUG, "after tier1\n");
+    }
+
+    av_log(s->avctx, AV_LOG_DEBUG, "rate control\n");
+    truncpasses(s, tile); // decide how many passes each code block contributes
+    if ((ret = encode_packets(s, tile, tileno)) < 0)
+        return ret;
+    av_log(s->avctx, AV_LOG_DEBUG, "after rate control\n");
+    return 0;
+}
+
+static void cleanup(Jpeg2000EncoderContext *s)
+{   /* Free every tile's components and then the tile array; a no-op when the tiles are already gone. */
+    int tileno, compno;
+
+    if (!s->tile)   /* never allocated, or released by an earlier call (av_freep() nulls the pointer) */
+        return;
+    for (tileno = 0; tileno < s->numXtiles * s->numYtiles; tileno++){
+        /* a tile without a component array has nothing to hand to ff_jpeg2000_cleanup() */
+        for (compno = 0; s->tile[tileno].comp && compno < s->ncomponents; compno++)
+            ff_jpeg2000_cleanup(s->tile[tileno].comp + compno, &s->codsty);
+        av_freep(&s->tile[tileno].comp);
+    }
+    av_freep(&s->tile);
+}
+
+static void reinit(Jpeg2000EncoderContext *s)
+{   /* Run ff_jpeg2000_reinit() on every component of every tile with the encoder's coding style. */
+    const int ntiles = s->numXtiles * s->numYtiles;
+    int t, c;
+    for (t = 0; t < ntiles; t++){
+        for (c = 0; c < s->ncomponents; c++)
+            ff_jpeg2000_reinit(s->tile[t].comp + c, &s->codsty);
+    }
+}
+
+static void update_size(uint8_t *size, const uint8_t *end)
+{   /* Store the byte distance from size to end as a big-endian 32-bit value at size. */
+    AV_WB32(size, (uint32_t)(end - size));
+}
+
+static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                        const AVFrame *pict, int *got_packet)
+{ /* Encode pict into pkt, optionally wrapped in JP2 boxes. Returns 0 with *got_packet set to 1, or a negative value on failure. */
+    int tileno, ret;
+    Jpeg2000EncoderContext *s = avctx->priv_data;
+    uint8_t *chunkstart, *jp2cstart, *jp2hstart; // start positions of boxes whose length is patched later
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, avctx->width*avctx->height*9 + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
+        return ret;
+
+    // init: the writer works directly inside the packet buffer
+    s->buf = s->buf_start = pkt->data;
+    s->buf_end = pkt->data + pkt->size;
+
+    s->picture = pict;
+
+    s->lambda = s->picture->quality * LAMBDA_SCALE; // rate-control lambda follows the frame quality
+
+    copy_frame(s);
+    reinit(s);
+
+    if (s->format == CODEC_JP2) { // JP2 output: write the wrapper boxes in front of the codestream
+        av_assert0(s->buf == pkt->data);
+
+        bytestream_put_be32(&s->buf, 0x0000000C); // 12 = size of these three words
+        bytestream_put_be32(&s->buf, 0x6A502020); // ASCII "jP  "
+        bytestream_put_be32(&s->buf, 0x0D0A870A);
+
+        chunkstart = s->buf;
+        bytestream_put_be32(&s->buf, 0); // length placeholder, filled in by update_size()
+        bytestream_put_buffer(&s->buf, "ftyp", 4);
+        bytestream_put_buffer(&s->buf, "jp2\040\040", 4);
+        bytestream_put_be32(&s->buf, 0);
+        bytestream_put_buffer(&s->buf, "jp2\040", 4);
+        update_size(chunkstart, s->buf);
+
+        jp2hstart = s->buf; // "jp2h" encloses the "ihdr" and "colr" boxes written next
+        bytestream_put_be32(&s->buf, 0);
+        bytestream_put_buffer(&s->buf, "jp2h", 4);
+
+        chunkstart = s->buf;
+        bytestream_put_be32(&s->buf, 0);
+        bytestream_put_buffer(&s->buf, "ihdr", 4);
+        bytestream_put_be32(&s->buf, avctx->height);
+        bytestream_put_be32(&s->buf, avctx->width);
+        bytestream_put_be16(&s->buf, s->ncomponents);
+        bytestream_put_byte(&s->buf, s->cbps[0]);
+        bytestream_put_byte(&s->buf, 7);
+        bytestream_put_byte(&s->buf, 0);
+        bytestream_put_byte(&s->buf, 0);
+        update_size(chunkstart, s->buf);
+
+        chunkstart = s->buf;
+        bytestream_put_be32(&s->buf, 0);
+        bytestream_put_buffer(&s->buf, "colr", 4);
+        bytestream_put_byte(&s->buf, 1);
+        bytestream_put_byte(&s->buf, 0);
+        bytestream_put_byte(&s->buf, 0);
+        if (s->ncomponents == 1) { // colour code depends on the input: 17 single component, 16 RGB24, 18 otherwise
+            bytestream_put_be32(&s->buf, 17);
+        } else if (avctx->pix_fmt == AV_PIX_FMT_RGB24) {
+            bytestream_put_be32(&s->buf, 16);
+        } else {
+            bytestream_put_be32(&s->buf, 18);
+        }
+        update_size(chunkstart, s->buf);
+        update_size(jp2hstart, s->buf);
+
+        jp2cstart = s->buf; // "jp2c" holds the codestream; its length is patched after EOC below
+        bytestream_put_be32(&s->buf, 0);
+        bytestream_put_buffer(&s->buf, "jp2c", 4);
+    }
+
+    if (s->buf_end - s->buf < 2)
+        return -1;
+    bytestream_put_be16(&s->buf, JPEG2000_SOC); // codestream starts here: SOC, then the main-header segments
+    if ((ret = put_siz(s)) < 0)
+        return ret;
+    if ((ret = put_cod(s)) < 0)
+        return ret;
+    if ((ret = put_qcd(s, 0)) < 0)
+        return ret;
+    if ((ret = put_com(s, 0)) < 0)
+        return ret;
+
+    for (tileno = 0; tileno < s->numXtiles * s->numYtiles; tileno++){
+        uint8_t *psotptr; // position returned by put_sot(), patched once the tile has been written
+        if (!(psotptr = put_sot(s, tileno)))
+            return -1;
+        if (s->buf_end - s->buf < 2)
+            return -1;
+        bytestream_put_be16(&s->buf, JPEG2000_SOD);
+        if ((ret = encode_tile(s, s->tile + tileno, tileno)) < 0)
+            return ret;
+        bytestream_put_be32(&psotptr, s->buf - psotptr + 6); // back-patch a length measured from 6 bytes before psotptr
+    }
+    if (s->buf_end - s->buf < 2)
+        return -1;
+    bytestream_put_be16(&s->buf, JPEG2000_EOC);
+
+    if (s->format == CODEC_JP2)
+        update_size(jp2cstart, s->buf);
+
+    av_log(s->avctx, AV_LOG_DEBUG, "end\n");
+    pkt->size = s->buf - s->buf_start; // shrink the packet to the bytes actually written
+    pkt->flags |= AV_PKT_FLAG_KEY; // every packet is flagged as a key frame
+    *got_packet = 1;
+
+    return 0;
+}
+
+static av_cold int j2kenc_init(AVCodecContext *avctx)
+{ /* Encoder init: set coding and quantisation defaults, derive the component layout from pix_fmt, build tables and tiles. Returns 0 or init_tiles()'s error. */
+    int i, ret;
+    Jpeg2000EncoderContext *s = avctx->priv_data;
+    Jpeg2000CodingStyle *codsty = &s->codsty;
+    Jpeg2000QuantStyle  *qntsty = &s->qntsty;
+
+    s->avctx = avctx;
+    av_log(s->avctx, AV_LOG_DEBUG, "init\n");
+
+#if FF_API_PRIVATE_OPT
+FF_DISABLE_DEPRECATION_WARNINGS
+    if (avctx->prediction_method) // a non-zero deprecated field overrides the private "pred" option
+        s->pred = avctx->prediction_method;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+
+    // defaults:
+    // TODO: implement setting non-standard precinct size
+    memset(codsty->log2_prec_widths , 15, sizeof(codsty->log2_prec_widths ));
+    memset(codsty->log2_prec_heights, 15, sizeof(codsty->log2_prec_heights));
+    codsty->nreslevels2decode=
+    codsty->nreslevels       = 7;
+    codsty->log2_cblk_width  = 4; // 2^4 = 16 samples
+    codsty->log2_cblk_height = 4;
+    codsty->transform        = s->pred ? FF_DWT53 : FF_DWT97_INT; // any non-zero pred selects the 5/3 transform
+
+    qntsty->nguardbits       = 1;
+
+    if ((s->tile_width  & (s->tile_width -1)) ||
+        (s->tile_height & (s->tile_height-1))) {
+        av_log(avctx, AV_LOG_WARNING, "Tile dimension not a power of 2\n"); // warning only, init carries on
+    }
+
+    if (codsty->transform == FF_DWT53)
+        qntsty->quantsty = JPEG2000_QSTY_NONE;
+    else
+        qntsty->quantsty = JPEG2000_QSTY_SE;
+
+    s->width = avctx->width;
+    s->height = avctx->height;
+
+    for (i = 0; i < 3; i++)
+        s->cbps[i] = 8; // all three components are set to 8
+
+    if (avctx->pix_fmt == AV_PIX_FMT_RGB24){
+        s->ncomponents = 3;
+    } else if (avctx->pix_fmt == AV_PIX_FMT_GRAY8){
+        s->ncomponents = 1;
+    } else{ // planar YUV
+        s->planar = 1;
+        s->ncomponents = 3;
+        avcodec_get_chroma_sub_sample(avctx->pix_fmt,
+                s->chroma_shift, s->chroma_shift + 1);
+    }
+
+    ff_jpeg2000_init_tier1_luts();
+    ff_mqc_init_context_tables();
+    init_luts();
+
+    init_quantization(s);
+    if ((ret=init_tiles(s)) < 0)
+        return ret;
+
+    av_log(s->avctx, AV_LOG_DEBUG, "after init\n");
+
+    return 0;
+}
+
+static int j2kenc_destroy(AVCodecContext *avctx)
+{   /* Close callback: hands the private context to cleanup(). */
+    Jpeg2000EncoderContext *enc = avctx->priv_data;
+
+    cleanup(enc);
+    return 0;   /* always reports success */
+}
+
+// taken from the libopenjpeg wrapper so it matches
+
+#define OFFSET(x) offsetof(Jpeg2000EncoderContext, x)
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption options[] = { /* private options; the "pred" constants must agree with j2kenc_init(), where a non-zero pred selects FF_DWT53 */
+    { "format",        "Codec Format",      OFFSET(format),        AV_OPT_TYPE_INT,   { .i64 = CODEC_JP2   }, CODEC_J2K, CODEC_JP2,   VE, "format"      },
+    { "j2k",           NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = CODEC_J2K   }, 0,         0,           VE, "format"      },
+    { "jp2",           NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = CODEC_JP2   }, 0,         0,           VE, "format"      },
+    { "tile_width",    "Tile Width",        OFFSET(tile_width),    AV_OPT_TYPE_INT,   { .i64 = 256         }, 1,     1<<30,           VE, },
+    { "tile_height",   "Tile Height",       OFFSET(tile_height),   AV_OPT_TYPE_INT,   { .i64 = 256         }, 1,     1<<30,           VE, },
+    { "pred",          "DWT Type",          OFFSET(pred),          AV_OPT_TYPE_INT,   { .i64 = 0           }, 0,         1,           VE, "pred"        },
+    { "dwt97int",      NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = 0           }, INT_MIN, INT_MAX,       VE, "pred"        },
+    { "dwt53",         NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = 1           }, INT_MIN, INT_MAX,       VE, "pred"        },
+
+    { NULL }
+};
+
+static const AVClass j2k_class = { /* AVClass that attaches the options table above to the encoder's private context */
+    .class_name = "jpeg 2000 encoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_jpeg2000_encoder = { /* codec descriptor wiring the init / encode / close callbacks defined in this file */
+    .name           = "jpeg2000",
+    .long_name      = NULL_IF_CONFIG_SMALL("JPEG 2000"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_JPEG2000,
+    .priv_data_size = sizeof(Jpeg2000EncoderContext),
+    .init           = j2kenc_init,
+    .encode2        = encode_frame,
+    .close          = j2kenc_destroy,
+    .pix_fmts       = (const enum AVPixelFormat[]) { /* accepted input formats, terminated by AV_PIX_FMT_NONE */
+        AV_PIX_FMT_RGB24, AV_PIX_FMT_YUV444P, AV_PIX_FMT_GRAY8,
+        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P,
+        AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,
+        AV_PIX_FMT_NONE
+    },
+    .priv_class     = &j2k_class,
+};
diff --git a/libavcodec/jacosub.h b/libavcodec/jacosub.h
new file mode 100644
index 0000000..c3665ae
--- /dev/null
+++ b/libavcodec/jacosub.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * JACOsub shared utils
+ */
+
+#ifndef AVCODEC_JACOSUB_H
+#define AVCODEC_JACOSUB_H
+
+#include "libavutil/common.h"
+
+#define JSS_MAX_LINESIZE 512
+
+static av_always_inline int jss_whitespace(char c)
+{   /* 1 for a space or any character in the range '\t'..'\r', otherwise 0. */
+    const int in_ctrl_range = c >= '\t' && c <= '\r';
+    return c == ' ' || in_ctrl_range;
+}
+
+static av_always_inline const char *jss_skip_whitespace(const char *p)
+{   /* Return the first position at or after p that jss_whitespace() rejects. */
+    for (; jss_whitespace(*p); p++)
+        ;   /* a NUL is not whitespace, so the scan stops at the terminator at the latest */
+    return p;
+}
+
+#endif /* AVCODEC_JACOSUB_H */
diff --git a/libavcodec/jacosubdec.c b/libavcodec/jacosubdec.c
new file mode 100644
index 0000000..cdb372a
--- /dev/null
+++ b/libavcodec/jacosubdec.c
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * JACOsub subtitle decoder
+ * @see http://unicorn.us.com/jacosub/jscripts.html
+ */
+
+#include <time.h>
+#include "ass.h"
+#include "jacosub.h"
+#include "libavutil/avstring.h"
+#include "libavutil/bprint.h"
+#include "libavutil/time_internal.h"
+
+#undef time
+
+static int insert_text(AVBPrint *dst, const char *in, const char *arg)
+{ /* Code handler: appends the replacement string arg to dst; in is not used. */
+    av_bprintf(dst, "%s", arg);
+    return 0; /* the caller adds this to its read position: no extra input is consumed */
+}
+
+static int insert_datetime(AVBPrint *dst, const char *in, const char *arg)
+{   /* Code handler: appends the current local time formatted with the strftime() pattern arg; returns 0 (no extra input consumed). */
+    char buf[16] = {0};
+    time_t now = time(0);
+    struct tm ltime;
+    /* localtime_r() returns NULL on failure and then leaves ltime unset; strftime() returns 0 when buf is too small */
+    if (localtime_r(&now, &ltime) &&
+        strftime(buf, sizeof(buf), arg, &ltime))
+        av_bprintf(dst, "%s", buf);
+    return 0;
+}
+
+static int insert_color(AVBPrint *dst, const char *in, const char *arg)
+{   /* Placeholder handler for the colour code: writes nothing and returns how many input bytes the caller must skip. */
+    return *in != '\0'; // skip the one-character id, but never step over the terminating NUL
+}
+
+static int insert_font(AVBPrint *dst, const char *in, const char *arg)
+{   /* Placeholder handler for the font code: writes nothing and returns how many input bytes the caller must skip. */
+    return *in != '\0'; // skip the one-character id, but never step over the terminating NUL
+}
+
+static const struct {
+    const char *from; /* JACOsub sequence to look for at the current input position */
+    const char *arg;  /* argument passed to func */
+    int (*func)(AVBPrint *dst, const char *in, const char *arg); /* returns the number of extra input bytes to skip */
+} ass_codes_map[] = { /* scanned in order by jacosub_to_ass(): the first entry whose "from" prefixes the input wins */
+    {"\\~", "~",        insert_text},       // tilde doesn't need escaping
+    {"~",   "{\\h}",    insert_text},       // hard space
+    {"\\n", "\\N",      insert_text},       // newline
+    {"\\D", "%d %b %Y", insert_datetime},   // current date
+    {"\\T", "%H:%M",    insert_datetime},   // current time
+    {"\\N", "{\\r}",    insert_text},       // reset to default style
+    {"\\I", "{\\i1}",   insert_text},       // italic on
+    {"\\i", "{\\i0}",   insert_text},       // italic off
+    {"\\B", "{\\b1}",   insert_text},       // bold on
+    {"\\b", "{\\b0}",   insert_text},       // bold off
+    {"\\U", "{\\u1}",   insert_text},       // underline on
+    {"\\u", "{\\u0}",   insert_text},       // underline off
+    {"\\C", "",         insert_color},      // TODO: color
+    {"\\F", "",         insert_font},       // TODO: font
+};
+
+enum { /* alignment bit flags: jacosub_to_ass() ORs one vertical with one horizontal value to pick the alignment tag */
+    ALIGN_VB = 1<<0, // vertical bottom, default
+    ALIGN_VM = 1<<1, // vertical middle
+    ALIGN_VT = 1<<2, // vertical top
+    ALIGN_JC = 1<<3, // justify center, default
+    ALIGN_JL = 1<<4, // justify left
+    ALIGN_JR = 1<<5, // justify right
+};
+
+static void jacosub_to_ass(AVCodecContext *avctx, AVBPrint *dst, const char *src)
+{ /* Convert one JACOsub text line (timers already removed) to ASS markup appended to dst. */
+    int i, valign = 0, halign = 0;
+    char c = av_toupper(*src);
+    char directives[128] = {0};
+
+    /* extract the optional directives */
+    if ((c >= 'A' && c <= 'Z') || c == '[') {
+        char *p    = directives;
+        char *pend = directives + sizeof(directives) - 1; /* keeps the last byte free for the terminator */
+
+        do *p++ = av_toupper(*src++);
+        while (*src && !jss_whitespace(*src) && p < pend); /* copy, upper-cased, up to the first whitespace or until the buffer is full */
+        *p = 0;
+        src = jss_skip_whitespace(src);
+    }
+
+    /* handle directives (TODO: handle more of them, and more reliably) */
+    if      (strstr(directives, "VB")) valign = ALIGN_VB;
+    else if (strstr(directives, "VM")) valign = ALIGN_VM;
+    else if (strstr(directives, "VT")) valign = ALIGN_VT;
+    if      (strstr(directives, "JC")) halign = ALIGN_JC;
+    else if (strstr(directives, "JL")) halign = ALIGN_JL;
+    else if (strstr(directives, "JR")) halign = ALIGN_JR;
+    if (valign || halign) { /* a tag is written only when at least one alignment directive was found */
+        if (!valign) valign = ALIGN_VB;
+        if (!halign) halign = ALIGN_JC;
+        switch (valign | halign) {
+        case ALIGN_VB | ALIGN_JL: av_bprintf(dst, "{\\an1}"); break; // bottom left
+        case ALIGN_VB | ALIGN_JC: av_bprintf(dst, "{\\an2}"); break; // bottom center
+        case ALIGN_VB | ALIGN_JR: av_bprintf(dst, "{\\an3}"); break; // bottom right
+        case ALIGN_VM | ALIGN_JL: av_bprintf(dst, "{\\an4}"); break; // middle left
+        case ALIGN_VM | ALIGN_JC: av_bprintf(dst, "{\\an5}"); break; // middle center
+        case ALIGN_VM | ALIGN_JR: av_bprintf(dst, "{\\an6}"); break; // middle right
+        case ALIGN_VT | ALIGN_JL: av_bprintf(dst, "{\\an7}"); break; // top left
+        case ALIGN_VT | ALIGN_JC: av_bprintf(dst, "{\\an8}"); break; // top center
+        case ALIGN_VT | ALIGN_JR: av_bprintf(dst, "{\\an9}"); break; // top right
+        }
+    }
+
+    /* process timed line */
+    while (*src && *src != '\n') {
+
+        /* text continue on the next line */
+        if (src[0] == '\\' && src[1] == '\n') {
+            src += 2;
+            while (jss_whitespace(*src))
+                src++;
+            continue;
+        }
+
+        /* special character codes */
+        for (i = 0; i < FF_ARRAY_ELEMS(ass_codes_map); i++) {
+            const char *from = ass_codes_map[i].from;
+            const char *arg  = ass_codes_map[i].arg;
+            size_t codemap_len = strlen(from);
+
+            if (!strncmp(src, from, codemap_len)) {
+                src += codemap_len;
+                src += ass_codes_map[i].func(dst, src, arg); /* the handler reports how many further bytes to skip */
+                break;
+            }
+        }
+
+        /* simple char copy */
+        if (i == FF_ARRAY_ELEMS(ass_codes_map)) /* loop ran to the end: no code matched */
+            av_bprintf(dst, "%c", *src++);
+    }
+}
+
+static int jacosub_decode_frame(AVCodecContext *avctx,
+                                void *data, int *got_sub_ptr, AVPacket *avpkt)
+{ /* Decode one JACOsub packet into an ASS rectangle of the AVSubtitle in data. Returns avpkt->size, or ff_ass_add_rect()'s negative error. */
+    int ret;
+    AVSubtitle *sub = data;
+    const char *ptr = avpkt->data;
+    FFASSDecoderContext *s = avctx->priv_data;
+
+    if (avpkt->size <= 0)
+        goto end;
+
+    if (*ptr) {
+        AVBPrint buffer;
+
+        // skip timers: two space-terminated fields after any leading whitespace
+        ptr = jss_skip_whitespace(ptr);
+        ptr = strchr(ptr, ' '); if (!ptr) goto end; ptr++;
+        ptr = strchr(ptr, ' '); if (!ptr) goto end; ptr++;
+
+        av_bprint_init(&buffer, JSS_MAX_LINESIZE, JSS_MAX_LINESIZE);
+        jacosub_to_ass(avctx, &buffer, ptr);
+        ret = ff_ass_add_rect(sub, buffer.str, s->readorder++, 0, NULL, NULL);
+        av_bprint_finalize(&buffer, NULL); /* finalized before the error check, so both outcomes go through it */
+        if (ret < 0)
+            return ret;
+    }
+
+end:
+    *got_sub_ptr = sub->num_rects > 0; /* a subtitle is reported only if a rectangle exists */
+    return avpkt->size;
+}
+
+AVCodec ff_jacosub_decoder = { /* codec descriptor: init and flush come from the shared ASS helpers, decode from this file */
+    .name           = "jacosub",
+    .long_name      = NULL_IF_CONFIG_SMALL("JACOsub subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_JACOSUB,
+    .init           = ff_ass_subtitle_header_default,
+    .decode         = jacosub_decode_frame,
+    .flush          = ff_ass_decoder_flush,
+    .priv_data_size = sizeof(FFASSDecoderContext),
+};
diff --git a/libavcodec/jfdctint.c b/libavcodec/jfdctint.c
index ed6b7ff..6a39578 100644
--- a/libavcodec/jfdctint.c
+++ b/libavcodec/jfdctint.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/jni.c b/libavcodec/jni.c
new file mode 100644
index 0000000..32456f5
--- /dev/null
+++ b/libavcodec/jni.c
@@ -0,0 +1,80 @@
+/*
+ * JNI public API functions
+ *
+ * Copyright (c) 2015-2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdlib.h>
+
+#include "config.h"
+#include "jni.h"
+
+#if CONFIG_JNI
+
+#include <errno.h>
+#include <jni.h>
+#include <pthread.h>
+
+#include "libavutil/log.h"
+#include "libavutil/error.h"
+#include "ffjni.h"
+
+static void *java_vm;                                      /* VM registered through av_jni_set_java_vm(); NULL until one is set */
+static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;   /* guards every access to java_vm; file-local so the generic names do not leak */
+
+int av_jni_set_java_vm(void *vm, void *log_ctx)
+{   /* Register vm once: repeating the same pointer succeeds, a different pointer yields AVERROR(EINVAL). */
+    int ret = 0;
+
+    pthread_mutex_lock(&lock);
+    if (java_vm == NULL) {
+        java_vm = vm;
+    } else if (java_vm != vm) {
+        ret = AVERROR(EINVAL);
+        av_log(log_ctx, AV_LOG_ERROR, "A Java virtual machine has already been set\n"); /* newline added: av_log() does not append one */
+    }
+    pthread_mutex_unlock(&lock);
+
+    return ret;
+}
+
+void *av_jni_get_java_vm(void *log_ctx)
+{   /* Return the currently registered VM pointer (NULL if none), read under the lock; log_ctx is unused. */
+    void *current;
+
+    pthread_mutex_lock(&lock);
+    current = java_vm;
+    pthread_mutex_unlock(&lock);
+
+    return current;
+}
+
+#else
+
+int av_jni_set_java_vm(void *vm, void *log_ctx)
+{ /* Stub for builds without CONFIG_JNI: ignores both arguments. */
+    return 0;
+}
+
+void *av_jni_get_java_vm(void *log_ctx)
+{ /* Stub for builds without CONFIG_JNI: there is never a VM to return. */
+    return NULL;
+}
+
+#endif
diff --git a/libavcodec/jni.h b/libavcodec/jni.h
new file mode 100644
index 0000000..dd99e92
--- /dev/null
+++ b/libavcodec/jni.h
@@ -0,0 +1,46 @@
+/*
+ * JNI public API functions
+ *
+ * Copyright (c) 2015-2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_JNI_H
+#define AVCODEC_JNI_H
+
+/**
+ * Manually set a Java virtual machine which will be used to retrieve the JNI
+ * environment. Once a Java VM is set it cannot be changed afterwards: you can
+ * call av_jni_set_java_vm multiple times with the same Java VM pointer, but
+ * it will error out if you try to set a different Java VM.
+ *
+ * @param vm Java virtual machine
+ * @param log_ctx context used for logging, can be NULL
+ * @return 0 on success, < 0 otherwise
+ */
+int av_jni_set_java_vm(void *vm, void *log_ctx);
+
+/**
+ * Get the Java virtual machine which has been set with av_jni_set_java_vm.
+ *
+ * @param log_ctx context used for logging, can be NULL
+ * @return a pointer to the Java virtual machine, or NULL if none has been set
+ */
+void *av_jni_get_java_vm(void *log_ctx);
+
+#endif /* AVCODEC_JNI_H */
diff --git a/libavcodec/jpeg2000.c b/libavcodec/jpeg2000.c
index ef5ffa6..94efc94 100644
--- a/libavcodec/jpeg2000.c
+++ b/libavcodec/jpeg2000.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Kamil Nowosad
  * Copyright (c) 2013 Nicolas Bertrand <nicoinattendu@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,9 +26,12 @@
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
+#include "libavutil/imgutils.h"
 #include "libavutil/mem.h"
 #include "avcodec.h"
+#include "internal.h"
 #include "jpeg2000.h"
 
 #define SHL(a, n) ((n) >= 0 ? (a) << (n) : (a) >> -(n))
@@ -36,13 +39,12 @@
 /* tag tree routines */
 
 /* allocate the memory for tag tree */
-static int32_t tag_tree_size(uint16_t w, uint16_t h)
+static int32_t tag_tree_size(int w, int h)
 {
-    uint32_t res = 0;
+    int64_t res = 0;
     while (w > 1 || h > 1) {
-        res += w * h;
-        if (res + 1 >= INT32_MAX)
-            return -1;
+        res += w * (int64_t)h;
+        av_assert0(res + 1 < INT32_MAX);
         w = (w + 1) >> 1;
         h = (h + 1) >> 1;
     }
@@ -56,8 +58,6 @@ static Jpeg2000TgtNode *ff_jpeg2000_tag_tree_init(int w, int h)
     int32_t tt_size;
 
     tt_size = tag_tree_size(w, h);
-    if (tt_size == -1)
-        return NULL;
 
     t = res = av_mallocz_array(tt_size, sizeof(*t));
     if (!res)
@@ -82,6 +82,16 @@ static Jpeg2000TgtNode *ff_jpeg2000_tag_tree_init(int w, int h)
     return res;
 }
 
+static void tag_tree_zero(Jpeg2000TgtNode *t, int w, int h)
+{
+    int i, siz = tag_tree_size(w, h);
+
+    for (i = 0; i < siz; i++) {
+        t[i].val = 0;
+        t[i].vis = 0;
+    }
+}
+
 uint8_t ff_jpeg2000_sigctxno_lut[256][4];
 
 static int getsigctxno(int flag, int bandno)
@@ -96,45 +106,33 @@ static int getsigctxno(int flag, int bandno)
         ((flag & JPEG2000_T1_SIG_NW) ? 1 : 0) +
         ((flag & JPEG2000_T1_SIG_SE) ? 1 : 0) +
         ((flag & JPEG2000_T1_SIG_SW) ? 1 : 0);
+
     if (bandno < 3) {
         if (bandno == 1)
             FFSWAP(int, h, v);
-        if (h == 2)
-            return 8;
+        if (h == 2) return 8;
         if (h == 1) {
-            if (v >= 1)
-                return 7;
-            if (d >= 1)
-                return 6;
+            if (v >= 1) return 7;
+            if (d >= 1) return 6;
             return 5;
         }
-        if (v == 2)
-            return 4;
-        if (v == 1)
-            return 3;
-        if (d >= 2)
-            return 2;
-        if (d == 1)
-            return 1;
+        if (v == 2) return 4;
+        if (v == 1) return 3;
+        if (d >= 2) return 2;
+        if (d == 1) return 1;
     } else {
-        if (d >= 3)
-            return 8;
+        if (d >= 3) return 8;
         if (d == 2) {
-            if (h + v >= 1)
-                return 7;
+            if (h+v >= 1) return 7;
             return 6;
         }
         if (d == 1) {
-            if (h + v >= 2)
-                return 5;
-            if (h + v == 1)
-                return 4;
+            if (h+v >= 2) return 5;
+            if (h+v == 1) return 4;
             return 3;
         }
-        if (h + v >= 2)
-            return 2;
-        if (h + v == 1)
-            return 1;
+        if (h+v >= 2) return 2;
+        if (h+v == 1) return 1;
     }
     return 0;
 }
@@ -175,25 +173,25 @@ void ff_jpeg2000_set_significance(Jpeg2000T1Context *t1, int x, int y,
 {
     x++;
     y++;
-    t1->flags[y][x] |= JPEG2000_T1_SIG;
+    t1->flags[(y) * t1->stride + x] |= JPEG2000_T1_SIG;
     if (negative) {
-        t1->flags[y][x + 1] |= JPEG2000_T1_SIG_W | JPEG2000_T1_SGN_W;
-        t1->flags[y][x - 1] |= JPEG2000_T1_SIG_E | JPEG2000_T1_SGN_E;
-        t1->flags[y + 1][x] |= JPEG2000_T1_SIG_N | JPEG2000_T1_SGN_N;
-        t1->flags[y - 1][x] |= JPEG2000_T1_SIG_S | JPEG2000_T1_SGN_S;
+        t1->flags[(y) * t1->stride + x + 1] |= JPEG2000_T1_SIG_W | JPEG2000_T1_SGN_W;
+        t1->flags[(y) * t1->stride + x - 1] |= JPEG2000_T1_SIG_E | JPEG2000_T1_SGN_E;
+        t1->flags[(y + 1) * t1->stride + x] |= JPEG2000_T1_SIG_N | JPEG2000_T1_SGN_N;
+        t1->flags[(y - 1) * t1->stride + x] |= JPEG2000_T1_SIG_S | JPEG2000_T1_SGN_S;
     } else {
-        t1->flags[y][x + 1] |= JPEG2000_T1_SIG_W;
-        t1->flags[y][x - 1] |= JPEG2000_T1_SIG_E;
-        t1->flags[y + 1][x] |= JPEG2000_T1_SIG_N;
-        t1->flags[y - 1][x] |= JPEG2000_T1_SIG_S;
+        t1->flags[(y) * t1->stride + x + 1] |= JPEG2000_T1_SIG_W;
+        t1->flags[(y) * t1->stride + x - 1] |= JPEG2000_T1_SIG_E;
+        t1->flags[(y + 1) * t1->stride + x] |= JPEG2000_T1_SIG_N;
+        t1->flags[(y - 1) * t1->stride + x] |= JPEG2000_T1_SIG_S;
     }
-    t1->flags[y + 1][x + 1] |= JPEG2000_T1_SIG_NW;
-    t1->flags[y + 1][x - 1] |= JPEG2000_T1_SIG_NE;
-    t1->flags[y - 1][x + 1] |= JPEG2000_T1_SIG_SW;
-    t1->flags[y - 1][x - 1] |= JPEG2000_T1_SIG_SE;
+    t1->flags[(y + 1) * t1->stride + x + 1] |= JPEG2000_T1_SIG_NW;
+    t1->flags[(y + 1) * t1->stride + x - 1] |= JPEG2000_T1_SIG_NE;
+    t1->flags[(y - 1) * t1->stride + x + 1] |= JPEG2000_T1_SIG_SW;
+    t1->flags[(y - 1) * t1->stride + x - 1] |= JPEG2000_T1_SIG_SE;
 }
 
-static const uint8_t lut_gain[2][4] = { { 0, 0, 0, 0 }, { 0, 1, 1, 2 } };
+// static const uint8_t lut_gain[2][4] = { { 0, 0, 0, 0 }, { 0, 1, 1, 2 } }; (unused)
 
 static void init_band_stepsize(AVCodecContext *avctx,
                                Jpeg2000Band *band,
@@ -206,29 +204,25 @@ static void init_band_stepsize(AVCodecContext *avctx,
      * see ISO/IEC 15444-1:2002 E.1 and A.6.4. */
     switch (qntsty->quantsty) {
         uint8_t gain;
-        int numbps;
     case JPEG2000_QSTY_NONE:
         /* TODO: to verify. No quantization in this case */
         band->f_stepsize = 1;
         break;
     case JPEG2000_QSTY_SI:
         /*TODO: Compute formula to implement. */
-        numbps = cbps +
-                 lut_gain[codsty->transform == FF_DWT53][bandno + (reslevelno > 0)];
-        band->f_stepsize = SHL(2048 + qntsty->mant[gbandno],
-                               2 + numbps - qntsty->expn[gbandno]);
-        break;
+//         numbps = cbps +
+//                  lut_gain[codsty->transform == FF_DWT53][bandno + (reslevelno > 0)];
+//         band->f_stepsize = SHL(2048 + qntsty->mant[gbandno],
+//                                2 + numbps - qntsty->expn[gbandno]);
+//         break;
     case JPEG2000_QSTY_SE:
         /* Exponent quantization step.
          * Formula:
          * delta_b = 2 ^ (R_b - expn_b) * (1 + (mant_b / 2 ^ 11))
          * R_b = R_I + log2 (gain_b )
          * see ISO/IEC 15444-1:2002 E.1.1 eqn. E-3 and E-4 */
-        /* TODO/WARN: value of log2 (gain_b ) not taken into account
-         * but it works (compared to OpenJPEG). Why?
-         * Further investigation needed. */
         gain            = cbps;
-        band->f_stepsize  = pow(2.0, gain - qntsty->expn[gbandno]);
+        band->f_stepsize  = ff_exp2fi(gain - qntsty->expn[gbandno]);
         band->f_stepsize *= qntsty->mant[gbandno] / 2048.0 + 1.0;
         break;
     default:
@@ -236,12 +230,29 @@ static void init_band_stepsize(AVCodecContext *avctx,
         av_log(avctx, AV_LOG_ERROR, "Unknown quantization format\n");
         break;
     }
-    /* FIXME: In OpenJPEG code stespize = stepsize * 0.5. Why?
+    if (codsty->transform != FF_DWT53) {
+        int lband = 0;
+        switch (bandno + (reslevelno > 0)) {
+            case 1:
+            case 2:
+                band->f_stepsize *= F_LFTG_X * 2;
+                lband = 1;
+                break;
+            case 3:
+                band->f_stepsize *= F_LFTG_X * F_LFTG_X * 4;
+                break;
+        }
+        if (codsty->transform == FF_DWT97) {
+            band->f_stepsize *= pow(F_LFTG_K, 2*(codsty->nreslevels2decode - reslevelno) + lband - 2);
+        }
+    }
+
+    band->i_stepsize = band->f_stepsize * (1 << 15);
+
+    /* FIXME: In OpenJPEG code stepsize = stepsize * 0.5. Why?
      * If not set output of entropic decoder is not correct. */
     if (!av_codec_is_encoder(avctx->codec))
         band->f_stepsize *= 0.5;
-
-    band->i_stepsize = band->f_stepsize * (1 << 16);
 }
 
 static int init_prec(Jpeg2000Band *band,
@@ -254,37 +265,40 @@ static int init_prec(Jpeg2000Band *band,
     Jpeg2000Prec *prec = band->prec + precno;
     int nb_codeblocks, cblkno;
 
+    prec->decoded_layers = 0;
+
     /* TODO: Explain formula for JPEG200 DCINEMA. */
     /* TODO: Verify with previous count of codeblocks per band */
 
     /* Compute P_x0 */
-    prec->coord[0][0] = (precno % reslevel->num_precincts_x) *
+    prec->coord[0][0] = ((band->coord[0][0] >> log2_band_prec_width) + precno % reslevel->num_precincts_x) *
                         (1 << log2_band_prec_width);
-    prec->coord[0][0] = FFMAX(prec->coord[0][0], band->coord[0][0]);
 
     /* Compute P_y0 */
-    prec->coord[1][0] = (precno / reslevel->num_precincts_x) *
+    prec->coord[1][0] = ((band->coord[1][0] >> log2_band_prec_height) + precno / reslevel->num_precincts_x) *
                         (1 << log2_band_prec_height);
-    prec->coord[1][0] = FFMAX(prec->coord[1][0], band->coord[1][0]);
 
     /* Compute P_x1 */
     prec->coord[0][1] = prec->coord[0][0] +
                         (1 << log2_band_prec_width);
+    prec->coord[0][0] = FFMAX(prec->coord[0][0], band->coord[0][0]);
     prec->coord[0][1] = FFMIN(prec->coord[0][1], band->coord[0][1]);
 
     /* Compute P_y1 */
     prec->coord[1][1] = prec->coord[1][0] +
                         (1 << log2_band_prec_height);
+    prec->coord[1][0] = FFMAX(prec->coord[1][0], band->coord[1][0]);
     prec->coord[1][1] = FFMIN(prec->coord[1][1], band->coord[1][1]);
 
     prec->nb_codeblocks_width =
-        ff_jpeg2000_ceildivpow2(prec->coord[0][1] -
-                                prec->coord[0][0],
-                                band->log2_cblk_width);
+        ff_jpeg2000_ceildivpow2(prec->coord[0][1],
+                                band->log2_cblk_width)
+        - (prec->coord[0][0] >> band->log2_cblk_width);
     prec->nb_codeblocks_height =
-        ff_jpeg2000_ceildivpow2(prec->coord[1][1] -
-                                prec->coord[1][0],
-                                band->log2_cblk_height);
+        ff_jpeg2000_ceildivpow2(prec->coord[1][1],
+                                band->log2_cblk_height)
+        - (prec->coord[1][0] >> band->log2_cblk_height);
+
 
     /* Tag trees initialization */
     prec->cblkincl =
@@ -299,22 +313,26 @@ static int init_prec(Jpeg2000Band *band,
     if (!prec->zerobits)
         return AVERROR(ENOMEM);
 
+    if (prec->nb_codeblocks_width * (uint64_t)prec->nb_codeblocks_height > INT_MAX) {
+        prec->cblk = NULL;
+        return AVERROR(ENOMEM);
+    }
     nb_codeblocks = prec->nb_codeblocks_width * prec->nb_codeblocks_height;
     prec->cblk = av_mallocz_array(nb_codeblocks, sizeof(*prec->cblk));
     if (!prec->cblk)
         return AVERROR(ENOMEM);
     for (cblkno = 0; cblkno < nb_codeblocks; cblkno++) {
         Jpeg2000Cblk *cblk = prec->cblk + cblkno;
-        uint16_t Cx0, Cy0;
+        int Cx0, Cy0;
 
         /* Compute coordinates of codeblocks */
         /* Compute Cx0*/
-        Cx0 = (prec->coord[0][0] >> band->log2_cblk_width) << band->log2_cblk_width;
+        Cx0 = ((prec->coord[0][0]) >> band->log2_cblk_width) << band->log2_cblk_width;
         Cx0 = Cx0 + ((cblkno % prec->nb_codeblocks_width)  << band->log2_cblk_width);
         cblk->coord[0][0] = FFMAX(Cx0, prec->coord[0][0]);
 
         /* Compute Cy0*/
-        Cy0 = (prec->coord[1][0] >> band->log2_cblk_height) << band->log2_cblk_height;
+        Cy0 = ((prec->coord[1][0]) >> band->log2_cblk_height) << band->log2_cblk_height;
         Cy0 = Cy0 + ((cblkno / prec->nb_codeblocks_width)   << band->log2_cblk_height);
         cblk->coord[1][0] = FFMAX(Cy0, prec->coord[1][0]);
 
@@ -342,7 +360,7 @@ static int init_prec(Jpeg2000Band *band,
         cblk->zero      = 0;
         cblk->lblock    = 3;
         cblk->length    = 0;
-        cblk->lengthinc = 0;
+        memset(cblk->lengthinc, 0, sizeof(cblk->lengthinc));
         cblk->npasses   = 0;
     }
 
@@ -366,7 +384,6 @@ static int init_band(AVCodecContext *avctx,
 
     init_band_stepsize(avctx, band, codsty, qntsty, bandno, gbandno, reslevelno, cbps);
 
-
     /* computation of tbx_0, tbx_1, tby_0, tby_1
      * see ISO/IEC 15444-1:2002 B.5 eq. B-15 and tbl B.1
      * codeblock width and height is computed for
@@ -376,7 +393,7 @@ static int init_band(AVCodecContext *avctx,
         for (i = 0; i < 2; i++)
             for (j = 0; j < 2; j++)
                 band->coord[i][j] =
-                    ff_jpeg2000_ceildivpow2(comp->coord_o[i][j] - comp->coord_o[i][0],
+                    ff_jpeg2000_ceildivpow2(comp->coord_o[i][j],
                                             declvl - 1);
         log2_band_prec_width  = reslevel->log2_prec_width;
         log2_band_prec_height = reslevel->log2_prec_height;
@@ -392,8 +409,8 @@ static int init_band(AVCodecContext *avctx,
             for (j = 0; j < 2; j++)
                 /* Formula example for tbx_0 = ceildiv((tcx_0 - 2 ^ (declvl - 1) * x0_b) / declvl) */
                 band->coord[i][j] =
-                    ff_jpeg2000_ceildivpow2(comp->coord_o[i][j] - comp->coord_o[i][0] -
-                                            (((bandno + 1 >> i) & 1) << declvl - 1),
+                    ff_jpeg2000_ceildivpow2(comp->coord_o[i][j] -
+                                            (((bandno + 1 >> i) & 1LL) << declvl - 1),
                                             declvl);
         /* TODO: Manage case of 3 band offsets here or
          * in coding/decoding function? */
@@ -408,11 +425,10 @@ static int init_band(AVCodecContext *avctx,
         log2_band_prec_height = reslevel->log2_prec_height - 1;
     }
 
-    for (j = 0; j < 2; j++)
-        band->coord[0][j] = ff_jpeg2000_ceildiv(band->coord[0][j], dx);
-    for (j = 0; j < 2; j++)
-        band->coord[1][j] = ff_jpeg2000_ceildiv(band->coord[1][j], dy);
-
+    if (reslevel->num_precincts_x * (uint64_t)reslevel->num_precincts_y > INT_MAX) {
+        band->prec = NULL;
+        return AVERROR(ENOMEM);
+    }
     nb_precincts = reslevel->num_precincts_x * reslevel->num_precincts_y;
     band->prec = av_mallocz_array(nb_precincts, sizeof(*band->prec));
     if (!band->prec)
@@ -438,8 +454,8 @@ int ff_jpeg2000_init_component(Jpeg2000Component *comp,
     int reslevelno, bandno, gbandno = 0, ret, i, j;
     uint32_t csize;
 
-    if (!codsty->nreslevels2decode) {
-        av_log(avctx, AV_LOG_ERROR, "nreslevels2decode uninitialized\n");
+    if (codsty->nreslevels2decode <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "nreslevels2decode %d invalid or uninitialized\n", codsty->nreslevels2decode);
         return AVERROR_INVALIDDATA;
     }
 
@@ -447,18 +463,28 @@ int ff_jpeg2000_init_component(Jpeg2000Component *comp,
                                    codsty->nreslevels2decode - 1,
                                    codsty->transform))
         return ret;
-    // component size comp->coord is uint16_t so ir cannot overflow
+
+    if (av_image_check_size(comp->coord[0][1] - comp->coord[0][0],
+                            comp->coord[1][1] - comp->coord[1][0], 0, avctx))
+        return AVERROR_INVALIDDATA;
     csize = (comp->coord[0][1] - comp->coord[0][0]) *
             (comp->coord[1][1] - comp->coord[1][0]);
+    if (comp->coord[0][1] - comp->coord[0][0] > 32768 ||
+        comp->coord[1][1] - comp->coord[1][0] > 32768) {
+        av_log(avctx, AV_LOG_ERROR, "component size too large\n");
+        return AVERROR_PATCHWELCOME;
+    }
 
     if (codsty->transform == FF_DWT97) {
+        csize += AV_INPUT_BUFFER_PADDING_SIZE / sizeof(*comp->f_data);
         comp->i_data = NULL;
-        comp->f_data = av_malloc_array(csize, sizeof(*comp->f_data));
+        comp->f_data = av_mallocz_array(csize, sizeof(*comp->f_data));
         if (!comp->f_data)
             return AVERROR(ENOMEM);
     } else {
+        csize += AV_INPUT_BUFFER_PADDING_SIZE / sizeof(*comp->i_data);
         comp->f_data = NULL;
-        comp->i_data = av_malloc_array(csize, sizeof(*comp->i_data));
+        comp->i_data = av_mallocz_array(csize, sizeof(*comp->i_data));
         if (!comp->i_data)
             return AVERROR(ENOMEM);
     }
@@ -526,6 +552,27 @@ int ff_jpeg2000_init_component(Jpeg2000Component *comp,
     return 0;
 }
 
+void ff_jpeg2000_reinit(Jpeg2000Component *comp, Jpeg2000CodingStyle *codsty)
+{ /* Reset per-precinct tag trees and code-block counters in place; nothing is allocated or freed. */
+    int reslevelno, bandno, cblkno, precno;
+    for (reslevelno = 0; reslevelno < codsty->nreslevels; reslevelno++) {
+        Jpeg2000ResLevel *rlevel = comp->reslevel + reslevelno;
+        for (bandno = 0; bandno < rlevel->nbands; bandno++) {
+            Jpeg2000Band *band = rlevel->band + bandno;
+            for(precno = 0; precno < rlevel->num_precincts_x * rlevel->num_precincts_y; precno++) {
+                Jpeg2000Prec *prec = band->prec + precno;
+                tag_tree_zero(prec->zerobits, prec->nb_codeblocks_width, prec->nb_codeblocks_height); /* clears val/vis of every node */
+                tag_tree_zero(prec->cblkincl, prec->nb_codeblocks_width, prec->nb_codeblocks_height);
+                for (cblkno = 0; cblkno < prec->nb_codeblocks_width * prec->nb_codeblocks_height; cblkno++) {
+                    Jpeg2000Cblk *cblk = prec->cblk + cblkno;
+                    cblk->length = 0; /* same starting value init_prec() assigns */
+                    cblk->lblock = 3; /* same starting value init_prec() assigns */
+                }
+            }
+        }
+    }
+}
+
 void ff_jpeg2000_cleanup(Jpeg2000Component *comp, Jpeg2000CodingStyle *codsty)
 {
     int reslevelno, bandno, precno;
@@ -546,16 +593,12 @@ void ff_jpeg2000_cleanup(Jpeg2000Component *comp, Jpeg2000CodingStyle *codsty)
 
             band = reslevel->band + bandno;
             for (precno = 0; precno < reslevel->num_precincts_x * reslevel->num_precincts_y; precno++) {
-                Jpeg2000Prec *prec;
-
-                if (!band->prec)
-                    continue;
-
-                prec = band->prec + precno;
-                av_freep(&prec->zerobits);
-                av_freep(&prec->cblkincl);
-                av_freep(&prec->cblk);
-
+                if (band->prec) {
+                    Jpeg2000Prec *prec = band->prec + precno;
+                    av_freep(&prec->zerobits);
+                    av_freep(&prec->cblkincl);
+                    av_freep(&prec->cblk);
+                }
             }
 
             av_freep(&band->prec);
diff --git a/libavcodec/jpeg2000.h b/libavcodec/jpeg2000.h
index b96b7e2..ed3b421 100644
--- a/libavcodec/jpeg2000.h
+++ b/libavcodec/jpeg2000.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Kamil Nowosad
  * Copyright (c) 2013 Nicolas Bertrand <nicoinattendu@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -58,19 +58,20 @@ enum Jpeg2000Markers {
     JPEG2000_EOC = 0xffd9, // end of codestream
 };
 
+#define JPEG2000_SOP_FIXED_BYTES 0xFF910004
+#define JPEG2000_SOP_BYTE_LENGTH 6
+
 enum Jpeg2000Quantsty { // quantization style
     JPEG2000_QSTY_NONE, // no quantization
     JPEG2000_QSTY_SI,   // scalar derived
     JPEG2000_QSTY_SE    // scalar expounded
 };
 
-#define JPEG2000_MAX_CBLKW 64
-#define JPEG2000_MAX_CBLKH 64
-
-
-#define JPEG2000_MAX_DECLEVELS 32
+#define JPEG2000_MAX_DECLEVELS 33
 #define JPEG2000_MAX_RESLEVELS (JPEG2000_MAX_DECLEVELS + 1)
 
+#define JPEG2000_MAX_PASSES 100
+
 // T1 flags
 // flags determining significance of neighbor coefficients
 #define JPEG2000_T1_SIG_N  0x0001
@@ -118,9 +119,10 @@ enum Jpeg2000Quantsty { // quantization style
 #define JPEG2000_PGOD_CPRL      0x04  // Component-position-resolution level-layer progression
 
 typedef struct Jpeg2000T1Context {
-    int data[JPEG2000_MAX_CBLKW][JPEG2000_MAX_CBLKH];
-    int flags[JPEG2000_MAX_CBLKW + 2][JPEG2000_MAX_CBLKH + 2];
+    int data[6144];
+    uint16_t flags[6156];
     MqcState mqc;
+    int stride;
 } Jpeg2000T1Context;
 
 typedef struct Jpeg2000TgtNode {
@@ -130,8 +132,8 @@ typedef struct Jpeg2000TgtNode {
 } Jpeg2000TgtNode;
 
 typedef struct Jpeg2000CodingStyle {
-    uint8_t nreslevels;       // number of resolution levels
-    uint8_t nreslevels2decode; // number of resolution levels to decode
+    int nreslevels;           // number of resolution levels
+    int nreslevels2decode;    // number of resolution levels to decode
     uint8_t log2_cblk_width,
             log2_cblk_height; // exponent of codeblock size
     uint8_t transform;        // DWT type
@@ -146,34 +148,47 @@ typedef struct Jpeg2000CodingStyle {
 
 typedef struct Jpeg2000QuantStyle {
     uint8_t expn[JPEG2000_MAX_DECLEVELS * 3];  // quantization exponent
-    uint32_t mant[JPEG2000_MAX_DECLEVELS * 3]; // quantization mantissa
+    uint16_t mant[JPEG2000_MAX_DECLEVELS * 3]; // quantization mantissa
     uint8_t quantsty;      // quantization style
     uint8_t nguardbits;    // number of guard bits
 } Jpeg2000QuantStyle;
 
+typedef struct Jpeg2000Pass {
+    uint16_t rate;
+    int64_t disto;
+    uint8_t flushed[4];
+    int flushed_len;
+} Jpeg2000Pass;
+
 typedef struct Jpeg2000Cblk {
     uint8_t npasses;
     uint8_t ninclpasses; // number coding of passes included in codestream
     uint8_t nonzerobits;
     uint16_t length;
-    uint16_t lengthinc;
+    uint16_t lengthinc[JPEG2000_MAX_PASSES];
+    uint8_t nb_lengthinc;
     uint8_t lblock;
     uint8_t zero;
     uint8_t data[8192];
-    uint16_t coord[2][2]; // border coordinates {{x0, x1}, {y0, y1}}
+    int nb_terminations;
+    int nb_terminationsinc;
+    int data_start[JPEG2000_MAX_PASSES];
+    Jpeg2000Pass passes[JPEG2000_MAX_PASSES];
+    int coord[2][2]; // border coordinates {{x0, x1}, {y0, y1}}
 } Jpeg2000Cblk; // code block
 
 typedef struct Jpeg2000Prec {
-    uint16_t nb_codeblocks_width;
-    uint16_t nb_codeblocks_height;
+    int nb_codeblocks_width;
+    int nb_codeblocks_height;
     Jpeg2000TgtNode *zerobits;
     Jpeg2000TgtNode *cblkincl;
     Jpeg2000Cblk *cblk;
-    uint16_t coord[2][2]; // border coordinates {{x0, x1}, {y0, y1}}
+    int decoded_layers;
+    int coord[2][2]; // border coordinates {{x0, x1}, {y0, y1}}
 } Jpeg2000Prec; // precinct
 
 typedef struct Jpeg2000Band {
-    uint16_t coord[2][2]; // border coordinates {{x0, x1}, {y0, y1}}
+    int coord[2][2]; // border coordinates {{x0, x1}, {y0, y1}}
     uint16_t log2_cblk_width, log2_cblk_height;
     int i_stepsize; // quantization stepsize
     float f_stepsize; // quantization stepsize
@@ -182,8 +197,8 @@ typedef struct Jpeg2000Band {
 
 typedef struct Jpeg2000ResLevel {
     uint8_t nbands;
-    uint16_t coord[2][2]; // border coordinates {{x0, x1}, {y0, y1}}
-    uint16_t num_precincts_x, num_precincts_y; // number of precincts in x/y direction
+    int coord[2][2]; // border coordinates {{x0, x1}, {y0, y1}}
+    int num_precincts_x, num_precincts_y; // number of precincts in x/y direction
     uint8_t log2_prec_width, log2_prec_height; // exponent of precinct size
     Jpeg2000Band *band;
 } Jpeg2000ResLevel; // resolution level
@@ -193,14 +208,14 @@ typedef struct Jpeg2000Component {
     DWTContext dwt;
     float *f_data;
     int *i_data;
-    uint16_t coord[2][2];   // border coordinates {{x0, x1}, {y0, y1}} -- can be reduced with lowres option
-    uint16_t coord_o[2][2]; // border coordinates {{x0, x1}, {y0, y1}} -- original values from jpeg2000 headers
+    int coord[2][2];   // border coordinates {{x0, x1}, {y0, y1}} -- can be reduced with lowres option
+    int coord_o[2][2]; // border coordinates {{x0, x1}, {y0, y1}} -- original values from jpeg2000 headers
 } Jpeg2000Component;
 
 /* misc tools */
 static inline int ff_jpeg2000_ceildivpow2(int a, int b)
 {
-    return (a + (1 << b) - 1) >> b;
+    return -((-(int64_t)a) >> b); /* widen before negating: -a in int overflows for INT_MIN */
 }
 
 static inline int ff_jpeg2000_ceildiv(int a, int b)
@@ -252,6 +267,25 @@ int ff_jpeg2000_init_component(Jpeg2000Component *comp,
                                int cbps, int dx, int dy,
                                AVCodecContext *ctx);
 
+void ff_jpeg2000_reinit(Jpeg2000Component *comp, Jpeg2000CodingStyle *codsty);
+
 void ff_jpeg2000_cleanup(Jpeg2000Component *comp, Jpeg2000CodingStyle *codsty);
 
+static inline int needs_termination(int style, int passno) {
+    if (style & JPEG2000_CBLK_BYPASS) {
+        int type = passno % 3;
+        passno /= 3;
+        if (type == 0 && passno > 2)
+            return 2;
+        if (type == 2 && passno > 2)
+            return 1;
+        if (style & JPEG2000_CBLK_TERMALL) {
+            return passno > 2 ? 2 : 1;
+        }
+    }
+    if (style & JPEG2000_CBLK_TERMALL)
+        return 1;
+    return 0;
+}
+
 #endif /* AVCODEC_JPEG2000_H */
diff --git a/libavcodec/jpeg2000dec.c b/libavcodec/jpeg2000dec.c
index deab1e8..e9f5f51 100644
--- a/libavcodec/jpeg2000dec.c
+++ b/libavcodec/jpeg2000dec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Kamil Nowosad
  * Copyright (c) 2013 Nicolas Bertrand <nicoinattendu@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,8 +28,11 @@
 #include <inttypes.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
+#include "libavutil/imgutils.h"
 #include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
 #include "avcodec.h"
 #include "bytestream.h"
 #include "internal.h"
@@ -41,10 +44,28 @@
 #define JP2_SIG_TYPE    0x6A502020
 #define JP2_SIG_VALUE   0x0D0A870A
 #define JP2_CODESTREAM  0x6A703263
+#define JP2_HEADER      0x6A703268
 
 #define HAD_COC 0x01
 #define HAD_QCC 0x02
 
+#define MAX_POCS 32
+
+typedef struct Jpeg2000POCEntry {
+    uint16_t LYEpoc;
+    uint16_t CSpoc;
+    uint16_t CEpoc;
+    uint8_t RSpoc;
+    uint8_t REpoc;
+    uint8_t Ppoc;
+} Jpeg2000POCEntry;
+
+typedef struct Jpeg2000POC {
+    Jpeg2000POCEntry poc[MAX_POCS];
+    int nb_poc;
+    int is_default;
+} Jpeg2000POC;
+
 typedef struct Jpeg2000TilePart {
     uint8_t tile_index;                 // Tile index who refers the tile-part
     const uint8_t *tp_end;
@@ -58,14 +79,16 @@ typedef struct Jpeg2000Tile {
     uint8_t             properties[4];
     Jpeg2000CodingStyle codsty[4];
     Jpeg2000QuantStyle  qntsty[4];
-    Jpeg2000TilePart    tile_part[3];
+    Jpeg2000POC         poc;
+    Jpeg2000TilePart    tile_part[256];
     uint16_t tp_idx;                    // Tile-part index
+    int coord[2][2];                    // border coordinates {{x0, x1}, {y0, y1}}
 } Jpeg2000Tile;
 
 typedef struct Jpeg2000DecoderContext {
     AVClass         *class;
     AVCodecContext  *avctx;
-    GetByteContext g;
+    GetByteContext  g;
 
     int             width, height;
     int             image_offset_x, image_offset_y;
@@ -76,16 +99,22 @@ typedef struct Jpeg2000DecoderContext {
     int             cdx[4], cdy[4];
     int             precision;
     int             ncomponents;
+    int             colour_space;
+    uint32_t        palette[256];
+    int8_t          pal8;
+    int             cdef[4];
     int             tile_width, tile_height;
     unsigned        numXtiles, numYtiles;
     int             maxtilelen;
 
     Jpeg2000CodingStyle codsty[4];
     Jpeg2000QuantStyle  qntsty[4];
+    Jpeg2000POC         poc;
 
     int             bit_index;
 
-    int16_t         curtileno;
+    int             curtileno;
+
     Jpeg2000Tile    *tile;
     Jpeg2000DSPContext dsp;
 
@@ -100,6 +129,7 @@ typedef struct Jpeg2000DecoderContext {
 static int get_bits(Jpeg2000DecoderContext *s, int n)
 {
     int res = 0;
+
     while (--n >= 0) {
         res <<= 1;
         if (s->bit_index == 0) {
@@ -125,8 +155,10 @@ static int tag_tree_decode(Jpeg2000DecoderContext *s, Jpeg2000TgtNode *node,
     Jpeg2000TgtNode *stack[30];
     int sp = -1, curval = 0;
 
-    if (!node)
+    if (!node) {
+        av_log(s->avctx, AV_LOG_ERROR, "missing node\n");
         return AVERROR_INVALIDDATA;
+    }
 
     while (node && !node->vis) {
         stack[++sp] = node;
@@ -157,15 +189,82 @@ static int tag_tree_decode(Jpeg2000DecoderContext *s, Jpeg2000TgtNode *node,
     return curval;
 }
 
+static int pix_fmt_match(enum AVPixelFormat pix_fmt, int components,
+                         int bpc, uint32_t log2_chroma_wh, int pal8)
+{   /* Return 1 if pix_fmt fits the given component count, bit depth, subsampling and palette use, else 0. */
+    int match = 1;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
+    /* log2_chroma_wh packs 4 bits per component i in get_siz(): bits 4i..4i+1 = cdy[i] >> 1, bits 4i+2..4i+3 = cdx[i] >> 1. */
+    av_assert2(desc);
+    /* A descriptor with a different component count can never match. */
+    if (desc->nb_components != components) {
+        return 0;
+    }
+    /* Components are checked from the highest index down; every case deliberately falls through to the next. */
+    switch (components) {
+    case 4:
+        match = match && desc->comp[3].depth >= bpc &&
+                         (log2_chroma_wh >> 14 & 3) == 0 &&
+                         (log2_chroma_wh >> 12 & 3) == 0; // component 3 must not be subsampled; fall through
+    case 3:
+        match = match && desc->comp[2].depth >= bpc &&
+                         (log2_chroma_wh >> 10 & 3) == desc->log2_chroma_w &&
+                         (log2_chroma_wh >>  8 & 3) == desc->log2_chroma_h; // fall through
+    case 2:
+        match = match && desc->comp[1].depth >= bpc &&
+                         (log2_chroma_wh >>  6 & 3) == desc->log2_chroma_w &&
+                         (log2_chroma_wh >>  4 & 3) == desc->log2_chroma_h;
+        /* fall through */
+    case 1:
+        match = match && desc->comp[0].depth >= bpc &&
+                         (log2_chroma_wh >>  2 & 3) == 0 &&
+                         (log2_chroma_wh       & 3) == 0 &&
+                         (desc->flags & AV_PIX_FMT_FLAG_PAL) == pal8 * AV_PIX_FMT_FLAG_PAL; // palette flag must agree with pal8
+    }
+    return match;
+}
+
+// get_siz() takes the first entry accepted by pix_fmt_match(), which allows depth >= bpc,
+// so pix_fmts with lower bpp have to be listed before similar pix_fmts with higher bpp.
+#define RGB_PIXEL_FORMATS   AV_PIX_FMT_PAL8,AV_PIX_FMT_RGB24,AV_PIX_FMT_RGBA,AV_PIX_FMT_RGB48,AV_PIX_FMT_RGBA64
+#define GRAY_PIXEL_FORMATS  AV_PIX_FMT_GRAY8,AV_PIX_FMT_GRAY8A,AV_PIX_FMT_GRAY16,AV_PIX_FMT_YA16
+#define YUV_PIXEL_FORMATS   AV_PIX_FMT_YUV410P,AV_PIX_FMT_YUV411P,AV_PIX_FMT_YUVA420P, \
+                            AV_PIX_FMT_YUV420P,AV_PIX_FMT_YUV422P,AV_PIX_FMT_YUVA422P, \
+                            AV_PIX_FMT_YUV440P,AV_PIX_FMT_YUV444P,AV_PIX_FMT_YUVA444P, \
+                            AV_PIX_FMT_YUV420P9,AV_PIX_FMT_YUV422P9,AV_PIX_FMT_YUV444P9, \
+                            AV_PIX_FMT_YUVA420P9,AV_PIX_FMT_YUVA422P9,AV_PIX_FMT_YUVA444P9, \
+                            AV_PIX_FMT_YUV420P10,AV_PIX_FMT_YUV422P10,AV_PIX_FMT_YUV444P10, \
+                            AV_PIX_FMT_YUVA420P10,AV_PIX_FMT_YUVA422P10,AV_PIX_FMT_YUVA444P10, \
+                            AV_PIX_FMT_YUV420P12,AV_PIX_FMT_YUV422P12,AV_PIX_FMT_YUV444P12, \
+                            AV_PIX_FMT_YUV420P14,AV_PIX_FMT_YUV422P14,AV_PIX_FMT_YUV444P14, \
+                            AV_PIX_FMT_YUV420P16,AV_PIX_FMT_YUV422P16,AV_PIX_FMT_YUV444P16, \
+                            AV_PIX_FMT_YUVA420P16,AV_PIX_FMT_YUVA422P16,AV_PIX_FMT_YUVA444P16
+#define XYZ_PIXEL_FORMATS   AV_PIX_FMT_XYZ12
+
+static const enum AVPixelFormat rgb_pix_fmts[]  = {RGB_PIXEL_FORMATS};  // candidates get_siz() tries for colour_space 16
+static const enum AVPixelFormat gray_pix_fmts[] = {GRAY_PIXEL_FORMATS}; // candidates get_siz() tries for colour_space 17
+static const enum AVPixelFormat yuv_pix_fmts[]  = {YUV_PIXEL_FORMATS};  // candidates get_siz() tries for colour_space 18
+static const enum AVPixelFormat xyz_pix_fmts[]  = {XYZ_PIXEL_FORMATS,   // candidates for the DCINEMA 2K/4K profiles
+                                                   YUV_PIXEL_FORMATS};
+static const enum AVPixelFormat all_pix_fmts[]  = {RGB_PIXEL_FORMATS,   // candidates for any other colour_space value
+                                                   GRAY_PIXEL_FORMATS,
+                                                   YUV_PIXEL_FORMATS,
+                                                   XYZ_PIXEL_FORMATS};
+
 /* marker segments */
 /* get sizes and offsets of image, tiles; number of components */
 static int get_siz(Jpeg2000DecoderContext *s)
 {
     int i;
     int ncomponents;
+    uint32_t log2_chroma_wh = 0;
+    const enum AVPixelFormat *possible_fmts = NULL;
+    int possible_fmts_nb = 0;
 
-    if (bytestream2_get_bytes_left(&s->g) < 36)
+    if (bytestream2_get_bytes_left(&s->g) < 36) {
+        av_log(s->avctx, AV_LOG_ERROR, "Insufficient space for SIZ\n");
         return AVERROR_INVALIDDATA;
+    }
 
     s->avctx->profile = bytestream2_get_be16u(&s->g); // Rsiz
     s->width          = bytestream2_get_be32u(&s->g); // Width
@@ -178,6 +277,15 @@ static int get_siz(Jpeg2000DecoderContext *s)
     s->tile_offset_y  = bytestream2_get_be32u(&s->g); // YT0Siz
     ncomponents       = bytestream2_get_be16u(&s->g); // CSiz
 
+    if (s->image_offset_x || s->image_offset_y) {
+        avpriv_request_sample(s->avctx, "Support for image offsets");
+        return AVERROR_PATCHWELCOME;
+    }
+    if (av_image_check_size(s->width, s->height, 0, s->avctx)) {
+        avpriv_request_sample(s->avctx, "Large Dimensions");
+        return AVERROR_PATCHWELCOME;
+    }
+
     if (ncomponents <= 0) {
         av_log(s->avctx, AV_LOG_ERROR, "Invalid number of components: %d\n",
                s->ncomponents);
@@ -186,21 +294,22 @@ static int get_siz(Jpeg2000DecoderContext *s)
 
     if (ncomponents > 4) {
         avpriv_request_sample(s->avctx, "Support for %d components",
-                              s->ncomponents);
+                              ncomponents);
         return AVERROR_PATCHWELCOME;
     }
 
     s->ncomponents = ncomponents;
 
-    if (s->tile_width <= 0 || s->tile_height <= 0 ||
-        s->tile_width > s->width || s->tile_height > s->height) {
+    if (s->tile_width <= 0 || s->tile_height <= 0) {
         av_log(s->avctx, AV_LOG_ERROR, "Invalid tile dimension %dx%d.\n",
                s->tile_width, s->tile_height);
         return AVERROR_INVALIDDATA;
     }
 
-    if (bytestream2_get_bytes_left(&s->g) < 3 * s->ncomponents)
+    if (bytestream2_get_bytes_left(&s->g) < 3 * s->ncomponents) {
+        av_log(s->avctx, AV_LOG_ERROR, "Insufficient space for %d components in SIZ\n", s->ncomponents);
         return AVERROR_INVALIDDATA;
+    }
 
     for (i = 0; i < s->ncomponents; i++) { // Ssiz_i XRsiz_i, YRsiz_i
         uint8_t x    = bytestream2_get_byteu(&s->g);
@@ -209,21 +318,22 @@ static int get_siz(Jpeg2000DecoderContext *s)
         s->sgnd[i]   = !!(x & 0x80);
         s->cdx[i]    = bytestream2_get_byteu(&s->g);
         s->cdy[i]    = bytestream2_get_byteu(&s->g);
-
-        if (s->cdx[i] != 1 || s->cdy[i] != 1) {
-            avpriv_request_sample(s->avctx,
-                                  "CDxy values %d %d for component %d",
-                                  s->cdx[i], s->cdy[i], i);
-            if (!s->cdx[i] || !s->cdy[i])
-                return AVERROR_INVALIDDATA;
-            else
-                return AVERROR_PATCHWELCOME;
+        if (   !s->cdx[i] || s->cdx[i] == 3 || s->cdx[i] > 4
+            || !s->cdy[i] || s->cdy[i] == 3 || s->cdy[i] > 4) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid sample separation %d/%d\n", s->cdx[i], s->cdy[i]);
+            return AVERROR_INVALIDDATA;
         }
+        log2_chroma_wh |= s->cdy[i] >> 1 << i * 4 | s->cdx[i] >> 1 << i * 4 + 2;
     }
 
     s->numXtiles = ff_jpeg2000_ceildiv(s->width  - s->tile_offset_x, s->tile_width);
     s->numYtiles = ff_jpeg2000_ceildiv(s->height - s->tile_offset_y, s->tile_height);
 
+    if (s->numXtiles * (uint64_t)s->numYtiles > INT_MAX/sizeof(*s->tile)) {
+        s->numXtiles = s->numYtiles = 0;
+        return AVERROR(EINVAL);
+    }
+
     s->tile = av_mallocz_array(s->numXtiles * s->numYtiles, sizeof(*s->tile));
     if (!s->tile) {
         s->numXtiles = s->numYtiles = 0;
@@ -244,36 +354,74 @@ static int get_siz(Jpeg2000DecoderContext *s)
     s->avctx->height = ff_jpeg2000_ceildivpow2(s->height - s->image_offset_y,
                                                s->reduction_factor);
 
-    switch (s->ncomponents) {
-    case 1:
-        if (s->precision > 8)
-            s->avctx->pix_fmt = AV_PIX_FMT_GRAY16;
-        else
-            s->avctx->pix_fmt = AV_PIX_FMT_GRAY8;
-        break;
-    case 3:
-        switch (s->avctx->profile) {
-        case FF_PROFILE_JPEG2000_DCINEMA_2K:
-        case FF_PROFILE_JPEG2000_DCINEMA_4K:
-            /* XYZ color-space for digital cinema profiles */
-            s->avctx->pix_fmt = AV_PIX_FMT_XYZ12;
+    if (s->avctx->profile == FF_PROFILE_JPEG2000_DCINEMA_2K ||
+        s->avctx->profile == FF_PROFILE_JPEG2000_DCINEMA_4K) {
+        possible_fmts = xyz_pix_fmts;
+        possible_fmts_nb = FF_ARRAY_ELEMS(xyz_pix_fmts);
+    } else {
+        switch (s->colour_space) {
+        case 16:
+            possible_fmts = rgb_pix_fmts;
+            possible_fmts_nb = FF_ARRAY_ELEMS(rgb_pix_fmts);
+            break;
+        case 17:
+            possible_fmts = gray_pix_fmts;
+            possible_fmts_nb = FF_ARRAY_ELEMS(gray_pix_fmts);
+            break;
+        case 18:
+            possible_fmts = yuv_pix_fmts;
+            possible_fmts_nb = FF_ARRAY_ELEMS(yuv_pix_fmts);
             break;
         default:
-            if (s->precision > 8)
-                s->avctx->pix_fmt = AV_PIX_FMT_RGB48;
-            else
-                s->avctx->pix_fmt = AV_PIX_FMT_RGB24;
+            possible_fmts = all_pix_fmts;
+            possible_fmts_nb = FF_ARRAY_ELEMS(all_pix_fmts);
             break;
         }
-        break;
-    case 4:
-        s->avctx->pix_fmt = AV_PIX_FMT_RGBA;
-        break;
-    default:
-        /* pixel format can not be identified */
-        s->avctx->pix_fmt = AV_PIX_FMT_NONE;
-        break;
     }
+    for (i = 0; i < possible_fmts_nb; ++i) {
+        if (pix_fmt_match(possible_fmts[i], ncomponents, s->precision, log2_chroma_wh, s->pal8)) {
+            s->avctx->pix_fmt = possible_fmts[i];
+            break;
+        }
+    }
+
+    if (i == possible_fmts_nb) {
+        if (ncomponents == 4 &&
+            s->cdy[0] == 1 && s->cdx[0] == 1 &&
+            s->cdy[1] == 1 && s->cdx[1] == 1 &&
+            s->cdy[2] == s->cdy[3] && s->cdx[2] == s->cdx[3]) {
+            if (s->precision == 8 && s->cdy[2] == 2 && s->cdx[2] == 2 && !s->pal8) {
+                s->avctx->pix_fmt = AV_PIX_FMT_YUVA420P;
+                s->cdef[0] = 0;
+                s->cdef[1] = 1;
+                s->cdef[2] = 2;
+                s->cdef[3] = 3;
+                i = 0;
+            }
+        }
+    }
+
+
+    if (i == possible_fmts_nb) {
+        av_log(s->avctx, AV_LOG_ERROR,
+               "Unknown pix_fmt, profile: %d, colour_space: %d, "
+               "components: %d, precision: %d\n"
+               "cdx[0]: %d, cdy[0]: %d\n"
+               "cdx[1]: %d, cdy[1]: %d\n"
+               "cdx[2]: %d, cdy[2]: %d\n"
+               "cdx[3]: %d, cdy[3]: %d\n",
+               s->avctx->profile, s->colour_space, ncomponents, s->precision,
+               s->cdx[0],
+               s->cdy[0],
+               ncomponents > 1 ? s->cdx[1] : 0,
+               ncomponents > 1 ? s->cdy[1] : 0,
+               ncomponents > 2 ? s->cdx[2] : 0,
+               ncomponents > 2 ? s->cdy[2] : 0,
+               ncomponents > 3 ? s->cdx[3] : 0,
+               ncomponents > 3 ? s->cdy[3] : 0);
+        return AVERROR_PATCHWELCOME;
+    }
+    s->avctx->bits_per_raw_sample = s->precision;
     return 0;
 }
 
@@ -282,24 +430,34 @@ static int get_cox(Jpeg2000DecoderContext *s, Jpeg2000CodingStyle *c)
 {
     uint8_t byte;
 
-    if (bytestream2_get_bytes_left(&s->g) < 5)
+    if (bytestream2_get_bytes_left(&s->g) < 5) {
+        av_log(s->avctx, AV_LOG_ERROR, "Insufficient space for COX\n");
         return AVERROR_INVALIDDATA;
+    }
 
     /*  nreslevels = number of resolution levels
                    = number of decomposition level +1 */
     c->nreslevels = bytestream2_get_byteu(&s->g) + 1;
-
-    if (c->nreslevels > JPEG2000_MAX_RESLEVELS)
+    if (c->nreslevels >= JPEG2000_MAX_RESLEVELS) {
+        av_log(s->avctx, AV_LOG_ERROR, "nreslevels %d is invalid\n", c->nreslevels);
         return AVERROR_INVALIDDATA;
+    }
+
+    if (c->nreslevels <= s->reduction_factor) {
+        /* we are forced to update reduction_factor as its requested value is
+           not compatible with this bitstream, and as we might have used it
+           already in setup earlier we have to fail this frame until
+           reinitialization is implemented */
+        av_log(s->avctx, AV_LOG_ERROR, "reduction_factor too large for this bitstream, max is %d\n", c->nreslevels - 1);
+        s->reduction_factor = c->nreslevels - 1;
+        return AVERROR(EINVAL);
+    }
 
     /* compute number of resolution levels to decode */
-    if (c->nreslevels < s->reduction_factor)
-        c->nreslevels2decode = 1;
-    else
-        c->nreslevels2decode = c->nreslevels - s->reduction_factor;
+    c->nreslevels2decode = c->nreslevels - s->reduction_factor;
 
-    c->log2_cblk_width  = bytestream2_get_byteu(&s->g) + 2; // cblk width
-    c->log2_cblk_height = bytestream2_get_byteu(&s->g) + 2; // cblk height
+    c->log2_cblk_width  = (bytestream2_get_byteu(&s->g) & 15) + 2; // cblk width
+    c->log2_cblk_height = (bytestream2_get_byteu(&s->g) & 15) + 2; // cblk height
 
     if (c->log2_cblk_width > 10 || c->log2_cblk_height > 10 ||
         c->log2_cblk_width + c->log2_cblk_height > 12) {
@@ -309,13 +467,17 @@ static int get_cox(Jpeg2000DecoderContext *s, Jpeg2000CodingStyle *c)
 
     c->cblk_style = bytestream2_get_byteu(&s->g);
     if (c->cblk_style != 0) { // cblk style
-        avpriv_request_sample(s->avctx, "Support for extra cblk styles");
-        return AVERROR_PATCHWELCOME;
+        av_log(s->avctx, AV_LOG_WARNING, "extra cblk styles %X\n", c->cblk_style);
+        if (c->cblk_style & JPEG2000_CBLK_BYPASS)
+            av_log(s->avctx, AV_LOG_WARNING, "Selective arithmetic coding bypass\n");
     }
     c->transform = bytestream2_get_byteu(&s->g); // DWT transformation type
     /* set integer 9/7 DWT in case of BITEXACT flag */
     if ((s->avctx->flags & AV_CODEC_FLAG_BITEXACT) && (c->transform == FF_DWT97))
         c->transform = FF_DWT97_INT;
+    else if (c->transform == FF_DWT53) {
+        s->avctx->properties |= FF_CODEC_PROPERTY_LOSSLESS;
+    }
 
     if (c->csty & JPEG2000_CSTY_PREC) {
         int i;
@@ -323,6 +485,13 @@ static int get_cox(Jpeg2000DecoderContext *s, Jpeg2000CodingStyle *c)
             byte = bytestream2_get_byte(&s->g);
             c->log2_prec_widths[i]  =  byte       & 0x0F;    // precinct PPx
             c->log2_prec_heights[i] = (byte >> 4) & 0x0F;    // precinct PPy
+            if (i)
+                if (c->log2_prec_widths[i] == 0 || c->log2_prec_heights[i] == 0) {
+                    av_log(s->avctx, AV_LOG_ERROR, "PPx %d PPy %d invalid\n",
+                           c->log2_prec_widths[i], c->log2_prec_heights[i]);
+                    c->log2_prec_widths[i] = c->log2_prec_heights[i] = 1;
+                    return AVERROR_INVALIDDATA;
+                }
         }
     } else {
         memset(c->log2_prec_widths , 15, sizeof(c->log2_prec_widths ));
@@ -338,8 +507,10 @@ static int get_cod(Jpeg2000DecoderContext *s, Jpeg2000CodingStyle *c,
     Jpeg2000CodingStyle tmp;
     int compno, ret;
 
-    if (bytestream2_get_bytes_left(&s->g) < 5)
+    if (bytestream2_get_bytes_left(&s->g) < 5) {
+        av_log(s->avctx, AV_LOG_ERROR, "Insufficient space for COD\n");
         return AVERROR_INVALIDDATA;
+    }
 
     tmp.csty = bytestream2_get_byteu(&s->g);
 
@@ -372,8 +543,10 @@ static int get_coc(Jpeg2000DecoderContext *s, Jpeg2000CodingStyle *c,
 {
     int compno, ret;
 
-    if (bytestream2_get_bytes_left(&s->g) < 2)
+    if (bytestream2_get_bytes_left(&s->g) < 2) {
+        av_log(s->avctx, AV_LOG_ERROR, "Insufficient space for COC\n");
         return AVERROR_INVALIDDATA;
+    }
 
     compno = bytestream2_get_byteu(&s->g);
 
@@ -410,7 +583,7 @@ static int get_qcx(Jpeg2000DecoderContext *s, int n, Jpeg2000QuantStyle *q)
     if (q->quantsty == JPEG2000_QSTY_NONE) {
         n -= 3;
         if (bytestream2_get_bytes_left(&s->g) < n ||
-            n > JPEG2000_MAX_DECLEVELS)
+            n > JPEG2000_MAX_DECLEVELS*3)
             return AVERROR_INVALIDDATA;
         for (i = 0; i < n; i++)
             q->expn[i] = bytestream2_get_byteu(&s->g) >> 3;
@@ -428,7 +601,7 @@ static int get_qcx(Jpeg2000DecoderContext *s, int n, Jpeg2000QuantStyle *q)
     } else {
         n = (n - 3) >> 1;
         if (bytestream2_get_bytes_left(&s->g) < 2 * n ||
-            n > JPEG2000_MAX_DECLEVELS)
+            n > JPEG2000_MAX_DECLEVELS*3)
             return AVERROR_INVALIDDATA;
         for (i = 0; i < n; i++) {
             x          = bytestream2_get_be16u(&s->g);
@@ -446,6 +619,8 @@ static int get_qcd(Jpeg2000DecoderContext *s, int n, Jpeg2000QuantStyle *q,
     Jpeg2000QuantStyle tmp;
     int compno, ret;
 
+    memset(&tmp, 0, sizeof(tmp));
+
     if ((ret = get_qcx(s, n, &tmp)) < 0)
         return ret;
     for (compno = 0; compno < s->ncomponents; compno++)
@@ -477,40 +652,99 @@ static int get_qcc(Jpeg2000DecoderContext *s, int n, Jpeg2000QuantStyle *q,
     return get_qcx(s, n - 1, q + compno);
 }
 
+static int get_poc(Jpeg2000DecoderContext *s, int size, Jpeg2000POC *p)
+{   /* Parse a POC marker segment into *p; size is the marker length, of which size - 2 bytes hold the entries. */
+    int i;
+    int elem_size = s->ncomponents <= 257 ? 7 : 9;
+    Jpeg2000POC tmp = {{{0}}};
+    /* The entry loop uses unchecked reads, so the whole payload (size - 2 bytes) has to be available here. */
+    if (bytestream2_get_bytes_left(&s->g) < size - 2 || size < 2 + elem_size) {
+        av_log(s->avctx, AV_LOG_ERROR, "Insufficient space for POC\n");
+        return AVERROR_INVALIDDATA;
+    }
+    /* Only 7-byte entries are implemented. */
+    if (elem_size > 7) {
+        avpriv_request_sample(s->avctx, "Fat POC not supported");
+        return AVERROR_PATCHWELCOME;
+    }
+    /* Entries are parsed into tmp first, so that *p is left untouched when the segment is rejected. */
+    tmp.nb_poc = (size - 2) / elem_size;
+    if (tmp.nb_poc > MAX_POCS) {
+        avpriv_request_sample(s->avctx, "Too many POCs (%d)", tmp.nb_poc);
+        return AVERROR_PATCHWELCOME;
+    }
+    /* Entry layout: RSpoc(1) CSpoc(1) LYEpoc(2, big-endian) REpoc(1) CEpoc(1) Ppoc(1). */
+    for (i = 0; i<tmp.nb_poc; i++) {
+        Jpeg2000POCEntry *e = &tmp.poc[i];
+        e->RSpoc  = bytestream2_get_byteu(&s->g);
+        e->CSpoc  = bytestream2_get_byteu(&s->g);
+        e->LYEpoc = bytestream2_get_be16u(&s->g);
+        e->REpoc  = bytestream2_get_byteu(&s->g);
+        e->CEpoc  = bytestream2_get_byteu(&s->g);
+        e->Ppoc   = bytestream2_get_byteu(&s->g);
+        if (!e->CEpoc) // a stored 0 stands for 256
+            e->CEpoc = 256;
+        if (e->CEpoc > s->ncomponents) // cap the end bound to the components that exist
+            e->CEpoc = s->ncomponents;
+        if (   e->RSpoc >= e->REpoc || e->REpoc > 33
+            || e->CSpoc >= e->CEpoc || e->CEpoc > s->ncomponents
+            || !e->LYEpoc) { // empty ranges and a zero LYEpoc are invalid
+            av_log(s->avctx, AV_LOG_ERROR, "POC Entry %d is invalid (%d, %d, %d, %d, %d, %d)\n", i,
+                e->RSpoc, e->CSpoc, e->LYEpoc, e->REpoc, e->CEpoc, e->Ppoc
+            );
+            return AVERROR_INVALIDDATA;
+        }
+    }
+    /* Replace an empty list or one copied from the defaults; otherwise append to the entries already collected. */
+    if (!p->nb_poc || p->is_default) {
+        *p = tmp;
+    } else {
+        if (p->nb_poc + tmp.nb_poc > MAX_POCS) {
+            av_log(s->avctx, AV_LOG_ERROR, "Insufficient space for POC\n");
+            return AVERROR_INVALIDDATA;
+        }
+        memcpy(p->poc + p->nb_poc, tmp.poc, tmp.nb_poc * sizeof(tmp.poc[0]));
+        p->nb_poc += tmp.nb_poc;
+    }
+
+    p->is_default = 0; // *p now holds entries signalled for it rather than copied defaults
+
+    return 0; // failures above: AVERROR_INVALIDDATA (bad/truncated data) or AVERROR_PATCHWELCOME (unsupported)
+}
+
+
 /* Get start of tile segment. */
 static int get_sot(Jpeg2000DecoderContext *s, int n)
 {
     Jpeg2000TilePart *tp;
     uint16_t Isot;
     uint32_t Psot;
-    uint8_t TPsot;
+    unsigned TPsot;
 
     if (bytestream2_get_bytes_left(&s->g) < 8)
         return AVERROR_INVALIDDATA;
 
+    s->curtileno = 0;
     Isot = bytestream2_get_be16u(&s->g);        // Isot
     if (Isot >= s->numXtiles * s->numYtiles)
         return AVERROR_INVALIDDATA;
 
-    if (Isot) {
-        avpriv_request_sample(s->avctx, "Support for more than one tile");
-        return AVERROR_PATCHWELCOME;
-    }
+    s->curtileno = Isot;
     Psot  = bytestream2_get_be32u(&s->g);       // Psot
     TPsot = bytestream2_get_byteu(&s->g);       // TPsot
 
     /* Read TNSot but not used */
     bytestream2_get_byteu(&s->g);               // TNsot
 
-    if (Psot > bytestream2_get_bytes_left(&s->g) + n + 2) {
+    if (!Psot)
+        Psot = bytestream2_get_bytes_left(&s->g) - 2 + n + 2;
+
+    if (Psot > bytestream2_get_bytes_left(&s->g) - 2 + n + 2) {
         av_log(s->avctx, AV_LOG_ERROR, "Psot %"PRIu32" too big\n", Psot);
         return AVERROR_INVALIDDATA;
     }
 
-    if (TPsot >= FF_ARRAY_ELEMS(s->tile[Isot].tile_part)) {
-        avpriv_request_sample(s->avctx, "Support for %"PRIu8" components", TPsot);
-        return AVERROR_PATCHWELCOME;
-    }
+    av_assert0(TPsot < FF_ARRAY_ELEMS(s->tile[Isot].tile_part));
 
     s->tile[Isot].tp_idx = TPsot;
     tp             = s->tile[Isot].tile_part + TPsot;
@@ -523,6 +757,8 @@ static int get_sot(Jpeg2000DecoderContext *s, int n)
         /* copy defaults */
         memcpy(tile->codsty, s->codsty, s->ncomponents * sizeof(Jpeg2000CodingStyle));
         memcpy(tile->qntsty, s->qntsty, s->ncomponents * sizeof(Jpeg2000QuantStyle));
+        memcpy(&tile->poc  , &s->poc  , sizeof(tile->poc));
+        tile->poc.is_default = 1;
     }
 
     return 0;
@@ -570,6 +806,22 @@ static uint8_t get_tlm(Jpeg2000DecoderContext *s, int n)
     return 0;
 }
 
+static uint8_t get_plt(Jpeg2000DecoderContext *s, int n)
+{   /* Log a PLT marker at debug level, then read and discard one byte plus n - 3 further bytes; always returns 0. */
+    int i;
+
+    av_log(s->avctx, AV_LOG_DEBUG,
+            "PLT marker at pos 0x%X\n", bytestream2_tell(&s->g) - 4); // presumably -4 steps back over marker and length
+
+    /*Zplt =*/ bytestream2_get_byte(&s->g);
+    /* The rest of the segment is skipped byte by byte with the non-'u' reader (presumably bounds-checked). */
+    for (i = 0; i < n - 3; i++) {
+        bytestream2_get_byte(&s->g);
+    }
+
+    return 0;
+}
+
 static int init_tile(Jpeg2000DecoderContext *s, int tileno)
 {
     int compno;
@@ -580,16 +832,27 @@ static int init_tile(Jpeg2000DecoderContext *s, int tileno)
     if (!tile->comp)
         return AVERROR(ENOMEM);
 
+    tile->coord[0][0] = av_clip(tilex       * (int64_t)s->tile_width  + s->tile_offset_x, s->image_offset_x, s->width);
+    tile->coord[0][1] = av_clip((tilex + 1) * (int64_t)s->tile_width  + s->tile_offset_x, s->image_offset_x, s->width);
+    tile->coord[1][0] = av_clip(tiley       * (int64_t)s->tile_height + s->tile_offset_y, s->image_offset_y, s->height);
+    tile->coord[1][1] = av_clip((tiley + 1) * (int64_t)s->tile_height + s->tile_offset_y, s->image_offset_y, s->height);
+
     for (compno = 0; compno < s->ncomponents; compno++) {
         Jpeg2000Component *comp = tile->comp + compno;
         Jpeg2000CodingStyle *codsty = tile->codsty + compno;
         Jpeg2000QuantStyle  *qntsty = tile->qntsty + compno;
         int ret; // global bandno
 
-        comp->coord_o[0][0] = FFMAX(tilex       * s->tile_width  + s->tile_offset_x, s->image_offset_x);
-        comp->coord_o[0][1] = FFMIN((tilex + 1) * s->tile_width  + s->tile_offset_x, s->width);
-        comp->coord_o[1][0] = FFMAX(tiley       * s->tile_height + s->tile_offset_y, s->image_offset_y);
-        comp->coord_o[1][1] = FFMIN((tiley + 1) * s->tile_height + s->tile_offset_y, s->height);
+        comp->coord_o[0][0] = tile->coord[0][0];
+        comp->coord_o[0][1] = tile->coord[0][1];
+        comp->coord_o[1][0] = tile->coord[1][0];
+        comp->coord_o[1][1] = tile->coord[1][1];
+        if (compno) {
+            comp->coord_o[0][0] /= s->cdx[compno];
+            comp->coord_o[0][1] /= s->cdx[compno];
+            comp->coord_o[1][0] /= s->cdy[compno];
+            comp->coord_o[1][1] /= s->cdy[compno];
+        }
 
         comp->coord[0][0] = ff_jpeg2000_ceildivpow2(comp->coord_o[0][0], s->reduction_factor);
         comp->coord[0][1] = ff_jpeg2000_ceildivpow2(comp->coord_o[0][1], s->reduction_factor);
@@ -631,12 +894,26 @@ static int getlblockinc(Jpeg2000DecoderContext *s)
     return res;
 }
 
-static int jpeg2000_decode_packet(Jpeg2000DecoderContext *s,
+static int jpeg2000_decode_packet(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile, int *tp_index,
                                   Jpeg2000CodingStyle *codsty,
                                   Jpeg2000ResLevel *rlevel, int precno,
                                   int layno, uint8_t *expn, int numgbits)
 {
     int bandno, cblkno, ret, nb_code_blocks;
+    int cwsno;
+
+    if (layno < rlevel->band[0].prec[precno].decoded_layers)
+        return 0;
+    rlevel->band[0].prec[precno].decoded_layers = layno + 1;
+
+    if (bytestream2_get_bytes_left(&s->g) == 0 && s->bit_index == 8) {
+        if (*tp_index < FF_ARRAY_ELEMS(tile->tile_part) - 1) {
+            s->g = tile->tile_part[++(*tp_index)].tpg;
+        }
+    }
+
+    if (bytestream2_peek_be32(&s->g) == JPEG2000_SOP_FIXED_BYTES)
+        bytestream2_skip(&s->g, JPEG2000_SOP_BYTE_LENGTH);
 
     if (!(ret = get_bits(s, 1))) {
         jpeg2000_flush(s);
@@ -678,19 +955,46 @@ static int jpeg2000_decode_packet(Jpeg2000DecoderContext *s,
             }
             if ((newpasses = getnpasses(s)) < 0)
                 return newpasses;
+            av_assert2(newpasses > 0);
+            if (cblk->npasses + newpasses >= JPEG2000_MAX_PASSES) {
+                avpriv_request_sample(s->avctx, "Too many passes");
+                return AVERROR_PATCHWELCOME;
+            }
             if ((llen = getlblockinc(s)) < 0)
                 return llen;
-            cblk->lblock += llen;
-            if ((ret = get_bits(s, av_log2(newpasses) + cblk->lblock)) < 0)
-                return ret;
-            if (ret > sizeof(cblk->data)) {
+            if (cblk->lblock + llen + av_log2(newpasses) > 16) {
                 avpriv_request_sample(s->avctx,
-                                      "Block with lengthinc greater than %zu",
-                                      sizeof(cblk->data));
+                                      "Block with length beyond 16 bits");
                 return AVERROR_PATCHWELCOME;
             }
-            cblk->lengthinc = ret;
-            cblk->npasses  += newpasses;
+
+            cblk->lblock += llen;
+
+            cblk->nb_lengthinc = 0;
+            cblk->nb_terminationsinc = 0;
+            do {
+                int newpasses1 = 0;
+
+                while (newpasses1 < newpasses) {
+                    newpasses1 ++;
+                    if (needs_termination(codsty->cblk_style, cblk->npasses + newpasses1 - 1)) {
+                        cblk->nb_terminationsinc ++;
+                        break;
+                    }
+                }
+
+                if ((ret = get_bits(s, av_log2(newpasses1) + cblk->lblock)) < 0)
+                    return ret;
+                if (ret > sizeof(cblk->data)) {
+                    avpriv_request_sample(s->avctx,
+                                        "Block with lengthinc greater than %"SIZE_SPECIFIER"",
+                                        sizeof(cblk->data));
+                    return AVERROR_PATCHWELCOME;
+                }
+                cblk->lengthinc[cblk->nb_lengthinc++] = ret;
+                cblk->npasses  += newpasses1;
+                newpasses -= newpasses1;
+            } while(newpasses);
         }
     }
     jpeg2000_flush(s);
@@ -699,7 +1003,7 @@ static int jpeg2000_decode_packet(Jpeg2000DecoderContext *s,
         if (bytestream2_peek_be16(&s->g) == JPEG2000_EPH)
             bytestream2_skip(&s->g, 2);
         else
-            av_log(s->avctx, AV_LOG_ERROR, "EPH marker not found.\n");
+            av_log(s->avctx, AV_LOG_ERROR, "EPH marker not found. instead %X\n", bytestream2_peek_be32(&s->g));
     }
 
     for (bandno = 0; bandno < rlevel->nbands; bandno++) {
@@ -709,148 +1013,334 @@ static int jpeg2000_decode_packet(Jpeg2000DecoderContext *s,
         nb_code_blocks = prec->nb_codeblocks_height * prec->nb_codeblocks_width;
         for (cblkno = 0; cblkno < nb_code_blocks; cblkno++) {
             Jpeg2000Cblk *cblk = prec->cblk + cblkno;
-            if (bytestream2_get_bytes_left(&s->g) < cblk->lengthinc)
-                return AVERROR_INVALIDDATA;
-            /* Code-block data can be empty. In that case initialize data
-             * with 0xFFFF. */
-            if (cblk->lengthinc > 0) {
-                bytestream2_get_bufferu(&s->g, cblk->data, cblk->lengthinc);
-            } else {
-                cblk->data[0] = 0xFF;
-                cblk->data[1] = 0xFF;
-            }
-            cblk->length   += cblk->lengthinc;
-            cblk->lengthinc = 0;
+            for (cwsno = 0; cwsno < cblk->nb_lengthinc; cwsno ++) {
+                if (   bytestream2_get_bytes_left(&s->g) < cblk->lengthinc[cwsno]
+                    || sizeof(cblk->data) < cblk->length + cblk->lengthinc[cwsno] + 4
+                ) {
+                    av_log(s->avctx, AV_LOG_ERROR,
+                        "Block length %"PRIu16" or lengthinc %d is too large, left %d\n",
+                        cblk->length, cblk->lengthinc[cwsno], bytestream2_get_bytes_left(&s->g));
+                    return AVERROR_INVALIDDATA;
+                }
 
-            if (cblk->length > sizeof(cblk->data)) {
-                av_log(s->avctx, AV_LOG_ERROR,
-                       "Block length %"PRIu16" > data size %zd\n",
-                       cblk->length, sizeof(cblk->data));
-                return AVERROR_INVALIDDATA;
+                bytestream2_get_bufferu(&s->g, cblk->data + cblk->length, cblk->lengthinc[cwsno]);
+                cblk->length   += cblk->lengthinc[cwsno];
+                cblk->lengthinc[cwsno] = 0;
+                if (cblk->nb_terminationsinc) {
+                    cblk->nb_terminationsinc--;
+                    cblk->nb_terminations++;
+                    cblk->data[cblk->length++] = 0xFF;
+                    cblk->data[cblk->length++] = 0xFF;
+                    cblk->data_start[cblk->nb_terminations] = cblk->length;
+                }
             }
         }
     }
     return 0;
 }
 
-static int decode_pgod_lrcp(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile)
+static int jpeg2000_decode_packets_po_iteration(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile,
+                                             int RSpoc, int CSpoc,
+                                             int LYEpoc, int REpoc, int CEpoc,
+                                             int Ppoc, int *tp_index)
 {
+    int ret = 0;
     int layno, reslevelno, compno, precno, ok_reslevel;
-    int ret;
+    int x, y;
+    int step_x, step_y;
 
-    for (layno = 0; layno < tile->codsty[0].nlayers; layno++) {
+    switch (Ppoc) {
+    case JPEG2000_PGOD_RLCP:
+        av_log(s->avctx, AV_LOG_DEBUG, "Progression order RLCP\n");
         ok_reslevel = 1;
-        for (reslevelno = 0; ok_reslevel; reslevelno++) {
+        for (reslevelno = RSpoc; ok_reslevel && reslevelno < REpoc; reslevelno++) {
             ok_reslevel = 0;
-            for (compno = 0; compno < s->ncomponents; compno++) {
-                Jpeg2000CodingStyle *codsty = tile->codsty + compno;
-                Jpeg2000QuantStyle *qntsty  = tile->qntsty + compno;
-                if (reslevelno < codsty->nreslevels) {
-                    Jpeg2000ResLevel *rlevel = tile->comp[compno].reslevel +
-                                               reslevelno;
-                    ok_reslevel = 1;
-                    for (precno = 0; precno < rlevel->num_precincts_x * rlevel->num_precincts_y; precno++)
-                        if ((ret = jpeg2000_decode_packet(s,
-                                                          codsty, rlevel,
-                                                          precno, layno,
-                                                          qntsty->expn + (reslevelno ? 3 * (reslevelno - 1) + 1 : 0),
-                                                          qntsty->nguardbits)) < 0)
-                            return ret;
+            for (layno = 0; layno < LYEpoc; layno++) {
+                for (compno = CSpoc; compno < CEpoc; compno++) {
+                    Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+                    Jpeg2000QuantStyle *qntsty  = tile->qntsty + compno;
+                    if (reslevelno < codsty->nreslevels) {
+                        Jpeg2000ResLevel *rlevel = tile->comp[compno].reslevel +
+                                                reslevelno;
+                        ok_reslevel = 1;
+                        for (precno = 0; precno < rlevel->num_precincts_x * rlevel->num_precincts_y; precno++)
+                            if ((ret = jpeg2000_decode_packet(s, tile, tp_index,
+                                                              codsty, rlevel,
+                                                              precno, layno,
+                                                              qntsty->expn + (reslevelno ? 3 * (reslevelno - 1) + 1 : 0),
+                                                              qntsty->nguardbits)) < 0)
+                                return ret;
+                    }
                 }
             }
         }
-    }
-
-    return 0;
-}
-
-static int decode_pgod_cprl(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile)
-{
-    int layno, reslevelno, compno, precno;
-    int ret, x, y;
-
-    for (compno = 0; compno < s->ncomponents; compno++) {
-        Jpeg2000CodingStyle *codsty = tile->codsty + compno;
-        Jpeg2000QuantStyle *qntsty  = tile->qntsty + compno;
-
-        /* Set bit stream buffer address according to tile-part.
-         * For DCinema one tile-part per component, so can be
-         * indexed by component. */
-        s->g = tile->tile_part[compno].tpg;
-
-        /* Position loop (y axis)
-         * TODO: Automate computing of step 256.
-         * Fixed here, but to be computed before entering here. */
-        for (y = 0; y < s->height; y += 256) {
-            /* Position loop (y axis)
-             * TODO: automate computing of step 256.
-             * Fixed here, but to be computed before entering here. */
-            for (x = 0; x < s->width; x += 256) {
-                for (reslevelno = 0; reslevelno < codsty->nreslevels; reslevelno++) {
-                    uint16_t prcx, prcy;
-                    uint8_t reducedresno = codsty->nreslevels - 1 -reslevelno; //  ==> N_L - r
-                    Jpeg2000ResLevel *rlevel = tile->comp[compno].reslevel + reslevelno;
-
-                    if (!((y % (1 << (rlevel->log2_prec_height + reducedresno)) == 0) ||
-                          (y == 0))) // TODO: 2nd condition simplified as try0 always =0 for dcinema
-                        continue;
-
-                    if (!((x % (1 << (rlevel->log2_prec_width + reducedresno)) == 0) ||
-                          (x == 0))) // TODO: 2nd condition simplified as try0 always =0 for dcinema
-                        continue;
+        break;
 
-                    // check if a precinct exists
-                    prcx   = ff_jpeg2000_ceildivpow2(x, reducedresno) >> rlevel->log2_prec_width;
-                    prcy   = ff_jpeg2000_ceildivpow2(y, reducedresno) >> rlevel->log2_prec_height;
-                    precno = prcx + rlevel->num_precincts_x * prcy;
-                    for (layno = 0; layno < tile->codsty[0].nlayers; layno++) {
-                        if ((ret = jpeg2000_decode_packet(s, codsty, rlevel,
-                                                          precno, layno,
-                                                          qntsty->expn + (reslevelno ? 3 * (reslevelno - 1) + 1 : 0),
-                                                          qntsty->nguardbits)) < 0)
-                            return ret;
+    case JPEG2000_PGOD_LRCP:
+        av_log(s->avctx, AV_LOG_DEBUG, "Progression order LRCP\n");
+        for (layno = 0; layno < LYEpoc; layno++) {
+            ok_reslevel = 1;
+            for (reslevelno = RSpoc; ok_reslevel && reslevelno < REpoc; reslevelno++) {
+                ok_reslevel = 0;
+                for (compno = CSpoc; compno < CEpoc; compno++) {
+                    Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+                    Jpeg2000QuantStyle *qntsty  = tile->qntsty + compno;
+                    if (reslevelno < codsty->nreslevels) {
+                        Jpeg2000ResLevel *rlevel = tile->comp[compno].reslevel +
+                                                reslevelno;
+                        ok_reslevel = 1;
+                        for (precno = 0; precno < rlevel->num_precincts_x * rlevel->num_precincts_y; precno++)
+                            if ((ret = jpeg2000_decode_packet(s, tile, tp_index,
+                                                              codsty, rlevel,
+                                                              precno, layno,
+                                                              qntsty->expn + (reslevelno ? 3 * (reslevelno - 1) + 1 : 0),
+                                                              qntsty->nguardbits)) < 0)
+                                return ret;
                     }
                 }
             }
         }
-    }
-
-    return 0;
-}
-
-static int jpeg2000_decode_packets(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile)
-{
-    int ret = 0;
-
-    s->bit_index = 8;
-    switch (tile->codsty[0].prog_order) {
-    case JPEG2000_PGOD_LRCP:
-        ret = decode_pgod_lrcp(s, tile);
         break;
 
     case JPEG2000_PGOD_CPRL:
-        ret = decode_pgod_cprl(s, tile);
-        break;
-
-    case JPEG2000_PGOD_RLCP:
-        avpriv_request_sample(s->avctx, "Progression order RLCP");
-        ret = AVERROR_PATCHWELCOME;
+        av_log(s->avctx, AV_LOG_DEBUG, "Progression order CPRL\n");
+        for (compno = CSpoc; compno < CEpoc; compno++) {
+            Jpeg2000Component *comp     = tile->comp + compno;
+            Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+            Jpeg2000QuantStyle *qntsty  = tile->qntsty + compno;
+            step_x = 32;
+            step_y = 32;
+
+            for (reslevelno = RSpoc; reslevelno < FFMIN(codsty->nreslevels, REpoc); reslevelno++) {
+                uint8_t reducedresno = codsty->nreslevels - 1 -reslevelno; //  ==> N_L - r
+                Jpeg2000ResLevel *rlevel = comp->reslevel + reslevelno;
+                step_x = FFMIN(step_x, rlevel->log2_prec_width  + reducedresno);
+                step_y = FFMIN(step_y, rlevel->log2_prec_height + reducedresno);
+            }
+            av_assert0(step_x < 32 && step_y < 32);
+            step_x = 1<<step_x;
+            step_y = 1<<step_y;
+
+            for (y = tile->coord[1][0]; y < tile->coord[1][1]; y = (y/step_y + 1)*step_y) {
+                for (x = tile->coord[0][0]; x < tile->coord[0][1]; x = (x/step_x + 1)*step_x) {
+                    for (reslevelno = RSpoc; reslevelno < FFMIN(codsty->nreslevels, REpoc); reslevelno++) {
+                        unsigned prcx, prcy;
+                        uint8_t reducedresno = codsty->nreslevels - 1 -reslevelno; //  ==> N_L - r
+                        Jpeg2000ResLevel *rlevel = comp->reslevel + reslevelno;
+                        int xc = x / s->cdx[compno];
+                        int yc = y / s->cdy[compno];
+
+                        if (yc % (1 << (rlevel->log2_prec_height + reducedresno)) && y != tile->coord[1][0]) //FIXME this is a subset of the check
+                            continue;
+
+                        if (xc % (1 << (rlevel->log2_prec_width + reducedresno)) && x != tile->coord[0][0]) //FIXME this is a subset of the check
+                            continue;
+
+                        // check if a precinct exists
+                        prcx   = ff_jpeg2000_ceildivpow2(xc, reducedresno) >> rlevel->log2_prec_width;
+                        prcy   = ff_jpeg2000_ceildivpow2(yc, reducedresno) >> rlevel->log2_prec_height;
+                        prcx  -= ff_jpeg2000_ceildivpow2(comp->coord_o[0][0], reducedresno) >> rlevel->log2_prec_width;
+                        prcy  -= ff_jpeg2000_ceildivpow2(comp->coord_o[1][0], reducedresno) >> rlevel->log2_prec_height;
+
+                        precno = prcx + rlevel->num_precincts_x * prcy;
+
+                        if (prcx >= rlevel->num_precincts_x || prcy >= rlevel->num_precincts_y) {
+                            av_log(s->avctx, AV_LOG_WARNING, "prc %d %d outside limits %d %d\n",
+                                   prcx, prcy, rlevel->num_precincts_x, rlevel->num_precincts_y);
+                            continue;
+                        }
+
+                        for (layno = 0; layno < LYEpoc; layno++) {
+                            if ((ret = jpeg2000_decode_packet(s, tile, tp_index, codsty, rlevel,
+                                                              precno, layno,
+                                                              qntsty->expn + (reslevelno ? 3 * (reslevelno - 1) + 1 : 0),
+                                                              qntsty->nguardbits)) < 0)
+                                return ret;
+                        }
+                    }
+                }
+            }
+        }
         break;
 
     case JPEG2000_PGOD_RPCL:
-        avpriv_request_sample(s->avctx, "Progression order RPCL");
-        ret = AVERROR_PATCHWELCOME;
+        av_log(s->avctx, AV_LOG_WARNING, "Progression order RPCL\n");
+        ok_reslevel = 1;
+        for (reslevelno = RSpoc; ok_reslevel && reslevelno < REpoc; reslevelno++) {
+            ok_reslevel = 0;
+            step_x = 30;
+            step_y = 30;
+            for (compno = CSpoc; compno < CEpoc; compno++) {
+                Jpeg2000Component *comp     = tile->comp + compno;
+                Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+
+                if (reslevelno < codsty->nreslevels) {
+                    uint8_t reducedresno = codsty->nreslevels - 1 -reslevelno; //  ==> N_L - r
+                    Jpeg2000ResLevel *rlevel = comp->reslevel + reslevelno;
+                    step_x = FFMIN(step_x, rlevel->log2_prec_width  + reducedresno);
+                    step_y = FFMIN(step_y, rlevel->log2_prec_height + reducedresno);
+                }
+            }
+            step_x = 1<<step_x;
+            step_y = 1<<step_y;
+
+            for (y = tile->coord[1][0]; y < tile->coord[1][1]; y = (y/step_y + 1)*step_y) {
+                for (x = tile->coord[0][0]; x < tile->coord[0][1]; x = (x/step_x + 1)*step_x) {
+                    for (compno = CSpoc; compno < CEpoc; compno++) {
+                        Jpeg2000Component *comp     = tile->comp + compno;
+                        Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+                        Jpeg2000QuantStyle *qntsty  = tile->qntsty + compno;
+                        uint8_t reducedresno = codsty->nreslevels - 1 -reslevelno; //  ==> N_L - r
+                        Jpeg2000ResLevel *rlevel = comp->reslevel + reslevelno;
+                        unsigned prcx, prcy;
+
+                        int xc = x / s->cdx[compno];
+                        int yc = y / s->cdy[compno];
+
+                        if (reslevelno >= codsty->nreslevels)
+                            continue;
+
+                        if (yc % (1 << (rlevel->log2_prec_height + reducedresno)) && y != tile->coord[1][0]) //FIXME this is a subset of the check
+                            continue;
+
+                        if (xc % (1 << (rlevel->log2_prec_width + reducedresno)) && x != tile->coord[0][0]) //FIXME this is a subset of the check
+                            continue;
+
+                        // check if a precinct exists
+                        prcx   = ff_jpeg2000_ceildivpow2(xc, reducedresno) >> rlevel->log2_prec_width;
+                        prcy   = ff_jpeg2000_ceildivpow2(yc, reducedresno) >> rlevel->log2_prec_height;
+                        prcx  -= ff_jpeg2000_ceildivpow2(comp->coord_o[0][0], reducedresno) >> rlevel->log2_prec_width;
+                        prcy  -= ff_jpeg2000_ceildivpow2(comp->coord_o[1][0], reducedresno) >> rlevel->log2_prec_height;
+
+                        precno = prcx + rlevel->num_precincts_x * prcy;
+
+                        ok_reslevel = 1;
+                        if (prcx >= rlevel->num_precincts_x || prcy >= rlevel->num_precincts_y) {
+                            av_log(s->avctx, AV_LOG_WARNING, "prc %d %d outside limits %d %d\n",
+                                   prcx, prcy, rlevel->num_precincts_x, rlevel->num_precincts_y);
+                            continue;
+                        }
+
+                            for (layno = 0; layno < LYEpoc; layno++) {
+                                if ((ret = jpeg2000_decode_packet(s, tile, tp_index,
+                                                                codsty, rlevel,
+                                                                precno, layno,
+                                                                qntsty->expn + (reslevelno ? 3 * (reslevelno - 1) + 1 : 0),
+                                                                qntsty->nguardbits)) < 0)
+                                    return ret;
+                            }
+                    }
+                }
+            }
+        }
         break;
 
     case JPEG2000_PGOD_PCRL:
-        avpriv_request_sample(s->avctx, "Progression order PCRL");
-        ret = AVERROR_PATCHWELCOME;
+        av_log(s->avctx, AV_LOG_WARNING, "Progression order PCRL\n");
+        step_x = 32;
+        step_y = 32;
+        for (compno = CSpoc; compno < CEpoc; compno++) {
+            Jpeg2000Component *comp     = tile->comp + compno;
+            Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+
+            for (reslevelno = RSpoc; reslevelno < FFMIN(codsty->nreslevels, REpoc); reslevelno++) {
+                uint8_t reducedresno = codsty->nreslevels - 1 -reslevelno; //  ==> N_L - r
+                Jpeg2000ResLevel *rlevel = comp->reslevel + reslevelno;
+                step_x = FFMIN(step_x, rlevel->log2_prec_width  + reducedresno);
+                step_y = FFMIN(step_y, rlevel->log2_prec_height + reducedresno);
+            }
+        }
+        if (step_x >= 31 || step_y >= 31){
+            avpriv_request_sample(s->avctx, "PCRL with large step");
+            return AVERROR_PATCHWELCOME;
+        }
+        step_x = 1<<step_x;
+        step_y = 1<<step_y;
+
+        for (y = tile->coord[1][0]; y < tile->coord[1][1]; y = (y/step_y + 1)*step_y) {
+            for (x = tile->coord[0][0]; x < tile->coord[0][1]; x = (x/step_x + 1)*step_x) {
+                for (compno = CSpoc; compno < CEpoc; compno++) {
+                    Jpeg2000Component *comp     = tile->comp + compno;
+                    Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+                    Jpeg2000QuantStyle *qntsty  = tile->qntsty + compno;
+                    int xc = x / s->cdx[compno];
+                    int yc = y / s->cdy[compno];
+
+                    for (reslevelno = RSpoc; reslevelno < FFMIN(codsty->nreslevels, REpoc); reslevelno++) {
+                        unsigned prcx, prcy;
+                        uint8_t reducedresno = codsty->nreslevels - 1 -reslevelno; //  ==> N_L - r
+                        Jpeg2000ResLevel *rlevel = comp->reslevel + reslevelno;
+
+                        if (yc % (1 << (rlevel->log2_prec_height + reducedresno)) && y != tile->coord[1][0]) //FIXME this is a subset of the check
+                            continue;
+
+                        if (xc % (1 << (rlevel->log2_prec_width + reducedresno)) && x != tile->coord[0][0]) //FIXME this is a subset of the check
+                            continue;
+
+                        // check if a precinct exists
+                        prcx   = ff_jpeg2000_ceildivpow2(xc, reducedresno) >> rlevel->log2_prec_width;
+                        prcy   = ff_jpeg2000_ceildivpow2(yc, reducedresno) >> rlevel->log2_prec_height;
+                        prcx  -= ff_jpeg2000_ceildivpow2(comp->coord_o[0][0], reducedresno) >> rlevel->log2_prec_width;
+                        prcy  -= ff_jpeg2000_ceildivpow2(comp->coord_o[1][0], reducedresno) >> rlevel->log2_prec_height;
+
+                        precno = prcx + rlevel->num_precincts_x * prcy;
+
+                        if (prcx >= rlevel->num_precincts_x || prcy >= rlevel->num_precincts_y) {
+                            av_log(s->avctx, AV_LOG_WARNING, "prc %d %d outside limits %d %d\n",
+                                   prcx, prcy, rlevel->num_precincts_x, rlevel->num_precincts_y);
+                            continue;
+                        }
+
+                        for (layno = 0; layno < LYEpoc; layno++) {
+                            if ((ret = jpeg2000_decode_packet(s, tile, tp_index, codsty, rlevel,
+                                                              precno, layno,
+                                                              qntsty->expn + (reslevelno ? 3 * (reslevelno - 1) + 1 : 0),
+                                                              qntsty->nguardbits)) < 0)
+                                return ret;
+                        }
+                    }
+                }
+            }
+        }
         break;
 
     default:
         break;
     }
 
+    return ret;
+}
+
+static int jpeg2000_decode_packets(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile)
+{
+    int ret = AVERROR_BUG;
+    int i;
+    int tp_index = 0;
+
+    s->bit_index = 8;
+    if (tile->poc.nb_poc) {
+        for (i=0; i<tile->poc.nb_poc; i++) {
+            Jpeg2000POCEntry *e = &tile->poc.poc[i];
+            ret = jpeg2000_decode_packets_po_iteration(s, tile,
+                e->RSpoc, e->CSpoc,
+                FFMIN(e->LYEpoc, tile->codsty[0].nlayers),
+                e->REpoc,
+                FFMIN(e->CEpoc, s->ncomponents),
+                e->Ppoc, &tp_index
+                );
+            if (ret < 0)
+                return ret;
+        }
+    } else {
+        ret = jpeg2000_decode_packets_po_iteration(s, tile,
+            0, 0,
+            tile->codsty[0].nlayers,
+            33,
+            s->ncomponents,
+            tile->codsty[0].prog_order,
+            &tp_index
+        );
+    }
     /* EOC marker reached */
     bytestream2_skip(&s->g, 2);
 
@@ -859,7 +1349,7 @@ static int jpeg2000_decode_packets(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile
 
 /* TIER-1 routines */
 static void decode_sigpass(Jpeg2000T1Context *t1, int width, int height,
-                           int bpno, int bandno, int bpass_csty_symbol,
+                           int bpno, int bandno,
                            int vert_causal_ctx_csty_symbol)
 {
     int mask = 3 << (bpno - 1), y0, x, y;
@@ -867,29 +1357,29 @@ static void decode_sigpass(Jpeg2000T1Context *t1, int width, int height,
     for (y0 = 0; y0 < height; y0 += 4)
         for (x = 0; x < width; x++)
             for (y = y0; y < height && y < y0 + 4; y++) {
-                if ((t1->flags[y+1][x+1] & JPEG2000_T1_SIG_NB)
-                && !(t1->flags[y+1][x+1] & (JPEG2000_T1_SIG | JPEG2000_T1_VIS))) {
-                    int flags_mask = -1;
-                    if (vert_causal_ctx_csty_symbol && y == y0 + 3)
-                        flags_mask &= ~(JPEG2000_T1_SIG_S | JPEG2000_T1_SIG_SW | JPEG2000_T1_SIG_SE);
-                    if (ff_mqc_decode(&t1->mqc, t1->mqc.cx_states + ff_jpeg2000_getsigctxno(t1->flags[y+1][x+1] & flags_mask, bandno))) {
-                        int xorbit, ctxno = ff_jpeg2000_getsgnctxno(t1->flags[y+1][x+1], &xorbit);
-                        if (bpass_csty_symbol)
-                             t1->data[y][x] = ff_mqc_decode(&t1->mqc, t1->mqc.cx_states + ctxno) ? -mask : mask;
+                int flags_mask = -1;
+                if (vert_causal_ctx_csty_symbol && y == y0 + 3)
+                    flags_mask &= ~(JPEG2000_T1_SIG_S | JPEG2000_T1_SIG_SW | JPEG2000_T1_SIG_SE | JPEG2000_T1_SGN_S);
+                if ((t1->flags[(y+1) * t1->stride + x+1] & JPEG2000_T1_SIG_NB & flags_mask)
+                && !(t1->flags[(y+1) * t1->stride + x+1] & (JPEG2000_T1_SIG | JPEG2000_T1_VIS))) {
+                    if (ff_mqc_decode(&t1->mqc, t1->mqc.cx_states + ff_jpeg2000_getsigctxno(t1->flags[(y+1) * t1->stride + x+1] & flags_mask, bandno))) {
+                        int xorbit, ctxno = ff_jpeg2000_getsgnctxno(t1->flags[(y+1) * t1->stride + x+1] & flags_mask, &xorbit);
+                        if (t1->mqc.raw)
+                             t1->data[(y) * t1->stride + x] = ff_mqc_decode(&t1->mqc, t1->mqc.cx_states + ctxno) ? -mask : mask;
                         else
-                             t1->data[y][x] = (ff_mqc_decode(&t1->mqc, t1->mqc.cx_states + ctxno) ^ xorbit) ?
+                             t1->data[(y) * t1->stride + x] = (ff_mqc_decode(&t1->mqc, t1->mqc.cx_states + ctxno) ^ xorbit) ?
                                                -mask : mask;
 
                         ff_jpeg2000_set_significance(t1, x, y,
-                                                     t1->data[y][x] < 0);
+                                                     t1->data[(y) * t1->stride + x] < 0);
                     }
-                    t1->flags[y + 1][x + 1] |= JPEG2000_T1_VIS;
+                    t1->flags[(y + 1) * t1->stride + x + 1] |= JPEG2000_T1_VIS;
                 }
             }
 }
 
 static void decode_refpass(Jpeg2000T1Context *t1, int width, int height,
-                           int bpno)
+                           int bpno, int vert_causal_ctx_csty_symbol)
 {
     int phalf, nhalf;
     int y0, x, y;
@@ -900,13 +1390,15 @@ static void decode_refpass(Jpeg2000T1Context *t1, int width, int height,
     for (y0 = 0; y0 < height; y0 += 4)
         for (x = 0; x < width; x++)
             for (y = y0; y < height && y < y0 + 4; y++)
-                if ((t1->flags[y + 1][x + 1] & (JPEG2000_T1_SIG | JPEG2000_T1_VIS)) == JPEG2000_T1_SIG) {
-                    int ctxno = ff_jpeg2000_getrefctxno(t1->flags[y + 1][x + 1]);
+                if ((t1->flags[(y + 1) * t1->stride + x + 1] & (JPEG2000_T1_SIG | JPEG2000_T1_VIS)) == JPEG2000_T1_SIG) {
+                    int flags_mask = (vert_causal_ctx_csty_symbol && y == y0 + 3) ?
+                        ~(JPEG2000_T1_SIG_S | JPEG2000_T1_SIG_SW | JPEG2000_T1_SIG_SE | JPEG2000_T1_SGN_S) : -1;
+                    int ctxno = ff_jpeg2000_getrefctxno(t1->flags[(y + 1) * t1->stride + x + 1] & flags_mask);
                     int r     = ff_mqc_decode(&t1->mqc,
                                               t1->mqc.cx_states + ctxno)
                                 ? phalf : nhalf;
-                    t1->data[y][x]          += t1->data[y][x] < 0 ? -r : r;
-                    t1->flags[y + 1][x + 1] |= JPEG2000_T1_REF;
+                    t1->data[(y) * t1->stride + x]          += t1->data[(y) * t1->stride + x] < 0 ? -r : r;
+                    t1->flags[(y + 1) * t1->stride + x + 1] |= JPEG2000_T1_REF;
                 }
 }
 
@@ -918,11 +1410,14 @@ static void decode_clnpass(Jpeg2000DecoderContext *s, Jpeg2000T1Context *t1,
 
     for (y0 = 0; y0 < height; y0 += 4) {
         for (x = 0; x < width; x++) {
+            int flags_mask = -1;
+            if (vert_causal_ctx_csty_symbol)
+                flags_mask &= ~(JPEG2000_T1_SIG_S | JPEG2000_T1_SIG_SW | JPEG2000_T1_SIG_SE | JPEG2000_T1_SGN_S);
             if (y0 + 3 < height &&
-                !((t1->flags[y0 + 1][x + 1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)) ||
-                  (t1->flags[y0 + 2][x + 1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)) ||
-                  (t1->flags[y0 + 3][x + 1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)) ||
-                  (t1->flags[y0 + 4][x + 1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)))) {
+                !((t1->flags[(y0 + 1) * t1->stride + x + 1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)) ||
+                  (t1->flags[(y0 + 2) * t1->stride + x + 1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)) ||
+                  (t1->flags[(y0 + 3) * t1->stride + x + 1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG)) ||
+                  (t1->flags[(y0 + 4) * t1->stride + x + 1] & (JPEG2000_T1_SIG_NB | JPEG2000_T1_VIS | JPEG2000_T1_SIG) & flags_mask))) {
                 if (!ff_mqc_decode(&t1->mqc, t1->mqc.cx_states + MQC_CX_RL))
                     continue;
                 runlen = ff_mqc_decode(&t1->mqc,
@@ -937,27 +1432,27 @@ static void decode_clnpass(Jpeg2000DecoderContext *s, Jpeg2000T1Context *t1,
             }
 
             for (y = y0 + runlen; y < y0 + 4 && y < height; y++) {
+                int flags_mask = -1;
+                if (vert_causal_ctx_csty_symbol && y == y0 + 3)
+                    flags_mask &= ~(JPEG2000_T1_SIG_S | JPEG2000_T1_SIG_SW | JPEG2000_T1_SIG_SE | JPEG2000_T1_SGN_S);
                 if (!dec) {
-                    if (!(t1->flags[y+1][x+1] & (JPEG2000_T1_SIG | JPEG2000_T1_VIS))) {
-                        int flags_mask = -1;
-                        if (vert_causal_ctx_csty_symbol && y == y0 + 3)
-                            flags_mask &= ~(JPEG2000_T1_SIG_S | JPEG2000_T1_SIG_SW | JPEG2000_T1_SIG_SE);
-                        dec = ff_mqc_decode(&t1->mqc, t1->mqc.cx_states + ff_jpeg2000_getsigctxno(t1->flags[y+1][x+1] & flags_mask,
+                    if (!(t1->flags[(y+1) * t1->stride + x+1] & (JPEG2000_T1_SIG | JPEG2000_T1_VIS))) {
+                        dec = ff_mqc_decode(&t1->mqc, t1->mqc.cx_states + ff_jpeg2000_getsigctxno(t1->flags[(y+1) * t1->stride + x+1] & flags_mask,
                                                                                              bandno));
                     }
                 }
                 if (dec) {
                     int xorbit;
-                    int ctxno = ff_jpeg2000_getsgnctxno(t1->flags[y + 1][x + 1],
+                    int ctxno = ff_jpeg2000_getsgnctxno(t1->flags[(y + 1) * t1->stride + x + 1] & flags_mask,
                                                         &xorbit);
-                    t1->data[y][x] = (ff_mqc_decode(&t1->mqc,
+                    t1->data[(y) * t1->stride + x] = (ff_mqc_decode(&t1->mqc,
                                                     t1->mqc.cx_states + ctxno) ^
                                       xorbit)
                                      ? -mask : mask;
-                    ff_jpeg2000_set_significance(t1, x, y, t1->data[y][x] < 0);
+                    ff_jpeg2000_set_significance(t1, x, y, t1->data[(y) * t1->stride + x] < 0);
                 }
                 dec = 0;
-                t1->flags[y + 1][x + 1] &= ~JPEG2000_T1_VIS;
+                t1->flags[(y + 1) * t1->stride + x + 1] &= ~JPEG2000_T1_VIS;
             }
         }
     }
@@ -977,52 +1472,77 @@ static int decode_cblk(Jpeg2000DecoderContext *s, Jpeg2000CodingStyle *codsty,
                        Jpeg2000T1Context *t1, Jpeg2000Cblk *cblk,
                        int width, int height, int bandpos)
 {
-    int passno = cblk->npasses, pass_t = 2, bpno = cblk->nonzerobits - 1, y;
-    int clnpass_cnt = 0;
-    int bpass_csty_symbol           = codsty->cblk_style & JPEG2000_CBLK_BYPASS;
+    int passno = cblk->npasses, pass_t = 2, bpno = cblk->nonzerobits - 1; /* runs the block's coding passes into t1->data; pass_t == 2 makes the first pass a cleanup pass */
+    int pass_cnt = 0; /* number of passes completed so far */
     int vert_causal_ctx_csty_symbol = codsty->cblk_style & JPEG2000_CBLK_VSC;
+    int term_cnt = 0; /* index of the current segment in cblk->data_start[] */
+    int coder_type; /* result of needs_termination(); non-zero restarts the decoder, 2 is passed on as the third ff_mqc_initdec() flag */
+
+    av_assert0(width <= 1024U && height <= 1024U); /* hard limits on the code-block dimensions handled here */
+    av_assert0(width*height <= 4096);
 
-    for (y = 0; y < height; y++)
-        memset(t1->data[y], 0, width * sizeof(**t1->data));
+    memset(t1->data, 0, t1->stride * height * sizeof(*t1->data)); /* data is one flat array, rows are t1->stride entries apart */
 
     /* If code-block contains no compressed data: nothing to do. */
     if (!cblk->length)
         return 0;
-    for (y = 0; y < height + 2; y++)
-        memset(t1->flags[y], 0, (width + 2) * sizeof(**t1->flags));
 
-    ff_mqc_initdec(&t1->mqc, cblk->data);
-    cblk->data[cblk->length]     = 0xff;
-    cblk->data[cblk->length + 1] = 0xff;
+    memset(t1->flags, 0, t1->stride * (height + 2) * sizeof(*t1->flags)); /* flags carries two extra rows; it is indexed with (y+1), (x+1) */
+
+    cblk->data[cblk->length] = 0xff; /* two 0xff bytes are written right after the payload before the decoder is set up */
+    cblk->data[cblk->length+1] = 0xff;
+    ff_mqc_initdec(&t1->mqc, cblk->data, 0, 1); /* NOTE(review): last argument is 1 only for this first init -- presumably also resets the contexts; confirm in the MQ decoder */
 
     while (passno--) {
-        switch (pass_t) {
+        if (bpno < 0) { /* more passes signalled than there are bit-planes left */
+            av_log(s->avctx, AV_LOG_ERROR, "bpno became negative\n");
+            return AVERROR_INVALIDDATA;
+        }
+        switch(pass_t) {
         case 0:
             decode_sigpass(t1, width, height, bpno + 1, bandpos,
-                           bpass_csty_symbol && (clnpass_cnt >= 4),
                            vert_causal_ctx_csty_symbol);
             break;
         case 1:
-            decode_refpass(t1, width, height, bpno + 1);
-            if (bpass_csty_symbol && clnpass_cnt >= 4)
-                ff_mqc_initdec(&t1->mqc, cblk->data);
+            decode_refpass(t1, width, height, bpno + 1, vert_causal_ctx_csty_symbol);
             break;
         case 2:
+            av_assert2(!t1->mqc.raw); /* the cleanup pass is not expected to run while the decoder's raw flag is set */
             decode_clnpass(s, t1, width, height, bpno + 1, bandpos,
                            codsty->cblk_style & JPEG2000_CBLK_SEGSYM,
                            vert_causal_ctx_csty_symbol);
-            clnpass_cnt = clnpass_cnt + 1;
-            if (bpass_csty_symbol && clnpass_cnt >= 4)
-                ff_mqc_initdec(&t1->mqc, cblk->data);
             break;
         }
+        if (codsty->cblk_style & JPEG2000_CBLK_RESET) // XXX no testcase for just this
+            ff_mqc_init_contexts(&t1->mqc); /* with the RESET style the contexts are re-initialised after every pass */
+
+        if (passno && (coder_type = needs_termination(codsty->cblk_style, pass_cnt))) { /* "passno &&": no restart after the final pass */
+            if (term_cnt >= cblk->nb_terminations) { /* a restart is needed but no further segment was recorded */
+                av_log(s->avctx, AV_LOG_ERROR, "Missing needed termination \n");
+                return AVERROR_INVALIDDATA;
+            }
+            if (FFABS(cblk->data + cblk->data_start[term_cnt + 1] - 2 - t1->mqc.bp) > 0) { /* warn unless the decoder's byte pointer sits exactly 2 bytes before the next segment */
+                av_log(s->avctx, AV_LOG_WARNING, "Mid mismatch %"PTRDIFF_SPECIFIER" in pass %d of %d\n",
+                    cblk->data + cblk->data_start[term_cnt + 1] - 2 - t1->mqc.bp,
+                    pass_cnt, cblk->npasses);
+            }
+
+            ff_mqc_initdec(&t1->mqc, cblk->data + cblk->data_start[++term_cnt], coder_type == 2, 0); /* restart the decoder at the next segment */
+        }
 
         pass_t++;
         if (pass_t == 3) {
             bpno--;
             pass_t = 0;
         }
+        pass_cnt ++;
     }
+
+    if (cblk->data + cblk->length - 2*(term_cnt < cblk->nb_terminations) != t1->mqc.bp) { /* final position check; a mismatch is only logged, decoding still succeeds */
+        av_log(s->avctx, AV_LOG_WARNING, "End mismatch %"PTRDIFF_SPECIFIER"\n",
+               cblk->data + cblk->length - 2*(term_cnt < cblk->nb_terminations) - t1->mqc.bp);
+    }
+
     return 0;
 }
 
@@ -1041,7 +1561,7 @@ static void dequantization_float(int x, int y, Jpeg2000Cblk *cblk,
     int w = cblk->coord[0][1] - cblk->coord[0][0];
     for (j = 0; j < (cblk->coord[1][1] - cblk->coord[1][0]); ++j) {
         float *datap = &comp->f_data[(comp->coord[0][1] - comp->coord[0][0]) * (y + j) + x];
-        int *src = t1->data[j];
+        int *src = t1->data + j*t1->stride;
         for (i = 0; i < w; ++i)
             datap[i] = src[i] * band->f_stepsize;
     }
@@ -1056,9 +1576,29 @@ static void dequantization_int(int x, int y, Jpeg2000Cblk *cblk,
     int w = cblk->coord[0][1] - cblk->coord[0][0];
     for (j = 0; j < (cblk->coord[1][1] - cblk->coord[1][0]); ++j) {
         int32_t *datap = &comp->i_data[(comp->coord[0][1] - comp->coord[0][0]) * (y + j) + x];
-        int *src = t1->data[j];
+        int *src = t1->data + j*t1->stride;
+        if (band->i_stepsize == 32768) {
+            for (i = 0; i < w; ++i)
+                datap[i] = src[i] / 2;
+        } else {
+            // This should be VERY uncommon
+            for (i = 0; i < w; ++i)
+                datap[i] = (src[i] * (int64_t)band->i_stepsize) / 65536;
+        }
+    }
+}
+
+static void dequantization_int_97(int x, int y, Jpeg2000Cblk *cblk, /* scales one decoded code-block from t1->data into comp->i_data at offset (x, y) */
+                               Jpeg2000Component *comp,
+                               Jpeg2000T1Context *t1, Jpeg2000Band *band)
+{
+    int i, j;
+    int w = cblk->coord[0][1] - cblk->coord[0][0]; /* code-block width */
+    for (j = 0; j < (cblk->coord[1][1] - cblk->coord[1][0]); ++j) { /* one iteration per code-block row */
+        int32_t *datap = &comp->i_data[(comp->coord[0][1] - comp->coord[0][0]) * (y + j) + x]; /* destination row; rows are one component width apart */
+        int *src = t1->data + j*t1->stride; /* source row j of the flat coefficient buffer */
         for (i = 0; i < w; ++i)
-            datap[i] = (src[i] * band->i_stepsize + (1 << 15)) >> 16;
+            datap[i] = (src[i] * (int64_t)band->i_stepsize + (1<<15)) >> 16; /* product is formed in 64 bits, then rounded by adding 1<<15 before the 16-bit shift; presumably i_stepsize has 16 fractional bits -- confirm */
     }
 }
 
@@ -1067,6 +1607,17 @@ static inline void mct_decode(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile)
     int i, csize = 1;
     void *src[3];
 
+    for (i = 1; i < 3; i++) {
+        if (tile->codsty[0].transform != tile->codsty[i].transform) {
+            av_log(s->avctx, AV_LOG_ERROR, "Transforms mismatch, MCT not supported\n");
+            return;
+        }
+        if (memcmp(tile->comp[0].coord, tile->comp[i].coord, sizeof(tile->comp[0].coord))) {
+            av_log(s->avctx, AV_LOG_ERROR, "Coords mismatch, MCT not supported\n");
+            return;
+        }
+    }
+
     for (i = 0; i < 3; i++)
         if (tile->codsty[0].transform == FF_DWT97)
             src[i] = tile->comp[i].f_data;
@@ -1086,18 +1637,21 @@ static inline void tile_codeblocks(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile
     int compno, reslevelno, bandno;
 
     /* Loop on tile components */
-
     for (compno = 0; compno < s->ncomponents; compno++) {
         Jpeg2000Component *comp     = tile->comp + compno;
         Jpeg2000CodingStyle *codsty = tile->codsty + compno;
+
+        t1.stride = (1<<codsty->log2_cblk_width) + 2;
+
         /* Loop on resolution levels */
         for (reslevelno = 0; reslevelno < codsty->nreslevels2decode; reslevelno++) {
             Jpeg2000ResLevel *rlevel = comp->reslevel + reslevelno;
             /* Loop on bands */
             for (bandno = 0; bandno < rlevel->nbands; bandno++) {
-                uint16_t nb_precincts, precno;
+                int nb_precincts, precno;
                 Jpeg2000Band *band = rlevel->band + bandno;
                 int cblkno = 0, bandpos;
+
                 bandpos = bandno + (reslevelno > 0);
 
                 if (band->coord[0][0] == band->coord[0][1] ||
@@ -1120,11 +1674,13 @@ static inline void tile_codeblocks(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile
                                     cblk->coord[1][1] - cblk->coord[1][0],
                                     bandpos);
 
-                        x = cblk->coord[0][0];
-                        y = cblk->coord[1][0];
+                        x = cblk->coord[0][0] - band->coord[0][0];
+                        y = cblk->coord[1][0] - band->coord[1][0];
 
                         if (codsty->transform == FF_DWT97)
                             dequantization_float(x, y, cblk, comp, &t1, band);
+                        else if (codsty->transform == FF_DWT97_INT)
+                            dequantization_int_97(x, y, cblk, comp, &t1, band);
                         else
                             dequantization_int(x, y, cblk, comp, &t1, band);
                    } /* end cblk */
@@ -1139,9 +1695,12 @@ static inline void tile_codeblocks(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile
 
 #define WRITE_FRAME(D, PIXEL)                                                                     \
     static inline void write_frame_ ## D(Jpeg2000DecoderContext * s, Jpeg2000Tile * tile,         \
-                                         AVFrame * picture)                                       \
+                                         AVFrame * picture, int precision)                        \
     {                                                                                             \
-        int linesize = picture->linesize[0] / sizeof(PIXEL);                                      \
+        const AVPixFmtDescriptor *pixdesc = av_pix_fmt_desc_get(s->avctx->pix_fmt);               \
+        int planar    = !!(pixdesc->flags & AV_PIX_FMT_FLAG_PLANAR);                              \
+        int pixelsize = planar ? 1 : pixdesc->nb_components;                                      \
+                                                                                                  \
         int compno;                                                                               \
         int x, y;                                                                                 \
                                                                                                   \
@@ -1153,35 +1712,39 @@ static inline void tile_codeblocks(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile
             int32_t *i_datap = comp->i_data;                                                      \
             int cbps         = s->cbps[compno];                                                   \
             int w            = tile->comp[compno].coord[0][1] - s->image_offset_x;                \
+            int plane        = 0;                                                                 \
                                                                                                   \
-            y    = tile->comp[compno].coord[1][0] - s->image_offset_y;                            \
-            line = (PIXEL *)picture->data[0] + y * linesize;                                      \
-            for (; y < tile->comp[compno].coord[1][1] - s->image_offset_y; y += s->cdy[compno]) { \
+            if (planar)                                                                           \
+                plane = s->cdef[compno] ? s->cdef[compno]-1 : (s->ncomponents-1);                 \
+                                                                                                  \
+            y    = tile->comp[compno].coord[1][0] - s->image_offset_y / s->cdy[compno];           \
+            line = (PIXEL *)picture->data[plane] + y * (picture->linesize[plane] / sizeof(PIXEL));\
+            for (; y < tile->comp[compno].coord[1][1] - s->image_offset_y; y++) {                 \
                 PIXEL *dst;                                                                       \
                                                                                                   \
-                x   = tile->comp[compno].coord[0][0] - s->image_offset_x;                         \
-                dst = line + x * s->ncomponents + compno;                                         \
+                x   = tile->comp[compno].coord[0][0] - s->image_offset_x / s->cdx[compno];        \
+                dst = line + x * pixelsize + compno*!planar;                                      \
                                                                                                   \
                 if (codsty->transform == FF_DWT97) {                                              \
-                    for (; x < w; x += s->cdx[compno]) {                                          \
+                    for (; x < w; x++) {                                                          \
                         int val = lrintf(*datap) + (1 << (cbps - 1));                             \
                         /* DC level shift and clip see ISO 15444-1:2002 G.1.2 */                  \
                         val  = av_clip(val, 0, (1 << cbps) - 1);                                  \
-                        *dst = val << (8 * sizeof(PIXEL) - cbps);                                 \
+                        *dst = val << (precision - cbps);                                         \
                         datap++;                                                                  \
-                        dst += s->ncomponents;                                                    \
+                        dst += pixelsize;                                                         \
                     }                                                                             \
                 } else {                                                                          \
-                    for (; x < w; x += s->cdx[compno]) {                                          \
+                    for (; x < w; x++) {                                                          \
                         int val = *i_datap + (1 << (cbps - 1));                                   \
                         /* DC level shift and clip see ISO 15444-1:2002 G.1.2 */                  \
                         val  = av_clip(val, 0, (1 << cbps) - 1);                                  \
-                        *dst = val << (8 * sizeof(PIXEL) - cbps);                                 \
+                        *dst = val << (precision - cbps);                                         \
                         i_datap++;                                                                \
-                        dst += s->ncomponents;                                                    \
+                        dst += pixelsize;                                                         \
                     }                                                                             \
                 }                                                                                 \
-                line += linesize;                                                                 \
+                line += picture->linesize[plane] / sizeof(PIXEL);                                 \
             }                                                                                     \
         }                                                                                         \
                                                                                                   \
@@ -1192,19 +1755,40 @@ WRITE_FRAME(16, uint16_t)
 
 #undef WRITE_FRAME
 
-static int jpeg2000_decode_tile(Jpeg2000DecoderContext *s, Jpeg2000Tile *tile,
-                                AVFrame *picture)
+static int jpeg2000_decode_tile(AVCodecContext *avctx, void *td, /* per-tile job: decodes tile number jobnr and writes it into the frame passed as td */
+                                int jobnr, int threadnr) /* threadnr is not used in the visible body */
 {
+    Jpeg2000DecoderContext *s = avctx->priv_data;
+    AVFrame *picture = td;
+    Jpeg2000Tile *tile = s->tile + jobnr;
+    int x; /* shared by the outer and the inner loop below */
+
     tile_codeblocks(s, tile);
 
     /* inverse MCT transformation */
     if (tile->codsty[0].mct)
         mct_decode(s, tile);
 
+    for (x = 0; x < s->ncomponents; x++) {
+        if (s->cdef[x] < 0) { /* a component without a channel-definition entry: fall back to a default mapping for all components */
+            for (x = 0; x < s->ncomponents; x++) { /* reuses x; it ends at ncomponents, and the break below leaves the outer loop */
+                s->cdef[x] = x + 1;
+            }
+            if ((s->ncomponents & 1) == 0) /* even component count: the last component is mapped to 0 instead */
+                s->cdef[s->ncomponents-1] = 0;
+            break; /* NOTE(review): this writes shared decoder state from a per-tile job -- confirm jobs cannot race on s->cdef */
+        }
+    }
+
     if (s->precision <= 8) {
-        write_frame_8(s, tile, picture);
+        write_frame_8(s, tile, picture, 8);
     } else {
-        write_frame_16(s, tile, picture);
+        int precision = picture->format == AV_PIX_FMT_XYZ12 || /* these four formats are written with 16 bits, any other format with s->precision */
+                        picture->format == AV_PIX_FMT_RGB48 ||
+                        picture->format == AV_PIX_FMT_RGBA64 ||
+                        picture->format == AV_PIX_FMT_GRAY16 ? 16 : s->precision;
+
+        write_frame_16(s, tile, picture, precision);
     }
 
     return 0;
@@ -1214,22 +1798,30 @@ static void jpeg2000_dec_cleanup(Jpeg2000DecoderContext *s)
 {
     int tileno, compno;
     for (tileno = 0; tileno < s->numXtiles * s->numYtiles; tileno++) {
-        for (compno = 0; compno < s->ncomponents; compno++) {
-            Jpeg2000Component *comp     = s->tile[tileno].comp   + compno;
-            Jpeg2000CodingStyle *codsty = s->tile[tileno].codsty + compno;
+        if (s->tile[tileno].comp) { /* skip tiles whose component array is NULL */
+            for (compno = 0; compno < s->ncomponents; compno++) {
+                Jpeg2000Component *comp     = s->tile[tileno].comp   + compno;
+                Jpeg2000CodingStyle *codsty = s->tile[tileno].codsty + compno;
 
-            ff_jpeg2000_cleanup(comp, codsty);
+                ff_jpeg2000_cleanup(comp, codsty); /* release the per-component data before the array itself is freed */
+            }
+            av_freep(&s->tile[tileno].comp);
         }
-        av_freep(&s->tile[tileno].comp);
     }
     av_freep(&s->tile);
+    memset(s->codsty, 0, sizeof(s->codsty)); /* clear the main-header coding/quantisation/property/POC state */
+    memset(s->qntsty, 0, sizeof(s->qntsty));
+    memset(s->properties, 0, sizeof(s->properties));
+    memset(&s->poc  , 0, sizeof(s->poc));
     s->numXtiles = s->numYtiles = 0;
+    s->ncomponents = 0; /* the SIZ handler rejects a SIZ marker while ncomponents is non-zero ("Duplicate SIZ"), so it is cleared here */
 }
 
 static int jpeg2000_read_main_headers(Jpeg2000DecoderContext *s)
 {
     Jpeg2000CodingStyle *codsty = s->codsty;
     Jpeg2000QuantStyle *qntsty  = s->qntsty;
+    Jpeg2000POC         *poc    = &s->poc;
     uint8_t *properties         = s->properties;
 
     for (;;) {
@@ -1249,17 +1841,21 @@ static int jpeg2000_read_main_headers(Jpeg2000DecoderContext *s)
             Jpeg2000Tile *tile;
             Jpeg2000TilePart *tp;
 
-            if (s->curtileno < 0) {
-                av_log(s->avctx, AV_LOG_ERROR, "Missing SOT\n");
-                return AVERROR_INVALIDDATA;
-            }
             if (!s->tile) {
                 av_log(s->avctx, AV_LOG_ERROR, "Missing SIZ\n");
                 return AVERROR_INVALIDDATA;
             }
+            if (s->curtileno < 0) {
+                av_log(s->avctx, AV_LOG_ERROR, "Missing SOT\n");
+                return AVERROR_INVALIDDATA;
+            }
 
             tile = s->tile + s->curtileno;
             tp = tile->tile_part + tile->tp_idx;
+            if (tp->tp_end < s->g.buffer) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid tpend\n");
+                return AVERROR_INVALIDDATA;
+            }
             bytestream2_init(&tp->tpg, s->g.buffer, tp->tp_end - s->g.buffer);
             bytestream2_skip(&s->g, tp->tp_end - s->g.buffer);
 
@@ -1268,13 +1864,21 @@ static int jpeg2000_read_main_headers(Jpeg2000DecoderContext *s)
         if (marker == JPEG2000_EOC)
             break;
 
-        len = bytestream2_get_be16u(&s->g);
-        if (len < 2 || bytestream2_get_bytes_left(&s->g) < len - 2)
+        len = bytestream2_get_be16(&s->g);
+        if (len < 2 || bytestream2_get_bytes_left(&s->g) < len - 2) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid len %d left=%d\n", len, bytestream2_get_bytes_left(&s->g));
             return AVERROR_INVALIDDATA;
+        }
 
         switch (marker) {
         case JPEG2000_SIZ:
+            if (s->ncomponents) {
+                av_log(s->avctx, AV_LOG_ERROR, "Duplicate SIZ\n");
+                return AVERROR_INVALIDDATA;
+            }
             ret = get_siz(s);
+            if (!s->tile)
+                s->numXtiles = s->numYtiles = 0;
             break;
         case JPEG2000_COC:
             ret = get_coc(s, codsty, properties);
@@ -1288,15 +1892,18 @@ static int jpeg2000_read_main_headers(Jpeg2000DecoderContext *s)
         case JPEG2000_QCD:
             ret = get_qcd(s, len, qntsty, properties);
             break;
+        case JPEG2000_POC:
+            ret = get_poc(s, len, poc);
+            break;
         case JPEG2000_SOT:
             if (!(ret = get_sot(s, len))) {
+                av_assert1(s->curtileno >= 0);
                 codsty = s->tile[s->curtileno].codsty;
                 qntsty = s->tile[s->curtileno].qntsty;
+                poc    = &s->tile[s->curtileno].poc;
                 properties = s->tile[s->curtileno].properties;
             }
             break;
-        case JPEG2000_PLT:
-            // the PLT marker is ignored
         case JPEG2000_PLM:
             // the PLM marker is ignored
         case JPEG2000_COM:
@@ -1307,6 +1914,10 @@ static int jpeg2000_read_main_headers(Jpeg2000DecoderContext *s)
             // Tile-part lengths
             ret = get_tlm(s, len);
             break;
+        case JPEG2000_PLT:
+            // Packet length, tile-part header
+            ret = get_plt(s, len);
+            break;
         default:
             av_log(s->avctx, AV_LOG_ERROR,
                    "unsupported marker 0x%.4"PRIX16" at pos 0x%X\n",
@@ -1333,11 +1944,11 @@ static int jpeg2000_read_bitstream_packets(Jpeg2000DecoderContext *s)
     for (tileno = 0; tileno < s->numXtiles * s->numYtiles; tileno++) {
         Jpeg2000Tile *tile = s->tile + tileno;
 
-        if (ret = init_tile(s, tileno))
+        if ((ret = init_tile(s, tileno)) < 0)
             return ret;
 
         s->g = tile->tile_part[0].tpg;
-        if (ret = jpeg2000_decode_packets(s, tile))
+        if ((ret = jpeg2000_decode_packets(s, tile)) < 0)
             return ret;
     }
 
@@ -1346,26 +1957,101 @@ static int jpeg2000_read_bitstream_packets(Jpeg2000DecoderContext *s)
 
 static int jp2_find_codestream(Jpeg2000DecoderContext *s)
 {
-    uint32_t atom_size, atom;
-    int found_codestream = 0, search_range = 10;
+    uint32_t atom_size, atom, atom_end; /* walks the JP2 boxes in s->g; returns 1 once a codestream box is reached, 0 otherwise */
+    int search_range = 10; /* number of boxes other than JP2_HEADER that may be skipped before giving up */
 
-    while(!found_codestream && search_range
-          &&
-          bytestream2_get_bytes_left(&s->g) >= 8) {
+    while (search_range
+           &&
+           bytestream2_get_bytes_left(&s->g) >= 8) { /* 8 bytes = size + type of the next box */
         atom_size = bytestream2_get_be32u(&s->g);
         atom      = bytestream2_get_be32u(&s->g);
-        if (atom == JP2_CODESTREAM) {
-            found_codestream = 1;
+        atom_end  = bytestream2_tell(&s->g) + atom_size - 8; /* offset just past this box; atom_size includes the 8 bytes already read */
+
+        if (atom == JP2_CODESTREAM)
+            return 1; /* reader is left positioned right after the box header */
+
+        if (bytestream2_get_bytes_left(&s->g) < atom_size || atom_end < atom_size) /* box exceeds the remaining data, or the end offset wrapped around */
+            return 0;
+
+        if (atom == JP2_HEADER &&
+                   atom_size >= 16) { /* room for at least one 8-byte sub-box header */
+            uint32_t atom2_size, atom2, atom2_end;
+            do {
+                atom2_size = bytestream2_get_be32u(&s->g);
+                atom2      = bytestream2_get_be32u(&s->g);
+                atom2_end  = bytestream2_tell(&s->g) + atom2_size - 8;
+                if (atom2_size < 8 || atom2_end > atom_end || atom2_end < atom2_size) /* sub-box must hold its own header and stay inside the parent box */
+                    break;
+                if (atom2 == JP2_CODESTREAM) {
+                    return 1;
+                } else if (atom2 == MKBETAG('c','o','l','r') && atom2_size >= 7) {
+                    int method = bytestream2_get_byteu(&s->g);
+                    bytestream2_skipu(&s->g, 2);
+                    if (method == 1) { /* the colour space value is only read for method 1 */
+                        s->colour_space = bytestream2_get_be32u(&s->g);
+                    }
+                } else if (atom2 == MKBETAG('p','c','l','r') && atom2_size >= 6) {
+                    int i, size, colour_count, colour_channels, colour_depth[3];
+                    uint32_t r, g, b;
+                    colour_count = bytestream2_get_be16u(&s->g);
+                    colour_channels = bytestream2_get_byteu(&s->g);
+                    // FIXME: Do not ignore channel_sign
+                    colour_depth[0] = (bytestream2_get_byteu(&s->g) & 0x7f) + 1;
+                    colour_depth[1] = (bytestream2_get_byteu(&s->g) & 0x7f) + 1;
+                    colour_depth[2] = (bytestream2_get_byteu(&s->g) & 0x7f) + 1;
+                    size = (colour_depth[0] + 7 >> 3) * colour_count + /* bytes taken by all palette entries: 1 or 2 bytes per channel per entry */
+                           (colour_depth[1] + 7 >> 3) * colour_count +
+                           (colour_depth[2] + 7 >> 3) * colour_count;
+                    if (colour_count > 256   ||
+                        colour_channels != 3 ||
+                        colour_depth[0] > 16 ||
+                        colour_depth[1] > 16 ||
+                        colour_depth[2] > 16 ||
+                        atom2_size < size) { /* NOTE(review): compares the whole box size, headers included, against the entry bytes only -- confirm this bound is tight enough */
+                        avpriv_request_sample(s->avctx, "Unknown palette");
+                        bytestream2_seek(&s->g, atom2_end, SEEK_SET);
+                        continue; /* jumps to the do-while condition with the reader at the end of this sub-box */
+                    }
+                    s->pal8 = 1;
+                    for (i = 0; i < colour_count; i++) {
+                        if (colour_depth[0] <= 8) {
+                            r = bytestream2_get_byteu(&s->g) << 8 - colour_depth[0]; /* left-align the sample in 8 bits */
+                            r |= r >> colour_depth[0]; /* fill the low bits by replicating the high bits */
+                        } else {
+                            r = bytestream2_get_be16u(&s->g) >> colour_depth[0] - 8; /* reduce a wider sample to 8 bits */
+                        }
+                        if (colour_depth[1] <= 8) {
+                            g = bytestream2_get_byteu(&s->g) << 8 - colour_depth[1];
+                            g |= g >> colour_depth[1]; /* replicate into green itself; must not touch r */
+                        } else {
+                            g = bytestream2_get_be16u(&s->g) >> colour_depth[1] - 8;
+                        }
+                        if (colour_depth[2] <= 8) {
+                            b = bytestream2_get_byteu(&s->g) << 8 - colour_depth[2];
+                            b |= b >> colour_depth[2]; /* replicate into blue itself; must not touch r */
+                        } else {
+                            b = bytestream2_get_be16u(&s->g) >> colour_depth[2] - 8;
+                        }
+                        s->palette[i] = 0xffu << 24 | r << 16 | g << 8 | b; /* opaque alpha in the top byte, then r, g, b */
+                    }
+                } else if (atom2 == MKBETAG('c','d','e','f') && atom2_size >= 2) {
+                    int n = bytestream2_get_be16u(&s->g); /* number of channel-definition entries that follow */
+                    for (; n>0; n--) {
+                        int cn   = bytestream2_get_be16(&s->g); /* bounds-checked reads: n comes straight from the file */
+                        int av_unused typ  = bytestream2_get_be16(&s->g);
+                        int asoc = bytestream2_get_be16(&s->g);
+                        if (cn < 4 && asoc < 4) /* only entries with both values below 4 are recorded */
+                            s->cdef[cn] = asoc;
+                    }
+                }
+                bytestream2_seek(&s->g, atom2_end, SEEK_SET); /* resynchronise on the sub-box end regardless of how much was parsed */
+            } while (atom_end - atom2_end >= 8);
         } else {
-            if (bytestream2_get_bytes_left(&s->g) < atom_size - 8)
-                return 0;
-            bytestream2_skipu(&s->g, atom_size - 8);
             search_range--;
         }
+        bytestream2_seek(&s->g, atom_end, SEEK_SET); /* continue with the box that follows this one */
     }
 
-    if (found_codestream)
-        return 1;
     return 0;
 }
 
@@ -1384,11 +2070,12 @@ static int jpeg2000_decode_frame(AVCodecContext *avctx, void *data,
     Jpeg2000DecoderContext *s = avctx->priv_data;
     ThreadFrame frame = { .f = data };
     AVFrame *picture = data;
-    int tileno, ret;
+    int ret;
 
     s->avctx     = avctx;
     bytestream2_init(&s->g, avpkt->data, avpkt->size);
-    s->curtileno = 0; // TODO: only one tile in DCI JP2K. to implement for more tiles
+    s->curtileno = -1;
+    memset(s->cdef, -1, sizeof(s->cdef));
 
     if (bytestream2_get_bytes_left(&s->g) < 2) {
         ret = AVERROR_INVALIDDATA;
@@ -1410,6 +2097,9 @@ static int jpeg2000_decode_frame(AVCodecContext *avctx, void *data,
         bytestream2_seek(&s->g, 0, SEEK_SET);
     }
 
+    while (bytestream2_get_bytes_left(&s->g) >= 3 && bytestream2_peek_be16(&s->g) != JPEG2000_SOC)
+        bytestream2_skip(&s->g, 1);
+
     if (bytestream2_get_be16u(&s->g) != JPEG2000_SOC) {
         av_log(avctx, AV_LOG_ERROR, "SOC marker not present\n");
         ret = AVERROR_INVALIDDATA;
@@ -1419,23 +2109,23 @@ static int jpeg2000_decode_frame(AVCodecContext *avctx, void *data,
         goto end;
 
     /* get picture buffer */
-    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "ff_thread_get_buffer() failed.\n");
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
         goto end;
-    }
     picture->pict_type = AV_PICTURE_TYPE_I;
     picture->key_frame = 1;
 
     if (ret = jpeg2000_read_bitstream_packets(s))
         goto end;
-    for (tileno = 0; tileno < s->numXtiles * s->numYtiles; tileno++)
-        if (ret = jpeg2000_decode_tile(s, s->tile + tileno, picture))
-            goto end;
+
+    avctx->execute2(avctx, jpeg2000_decode_tile, picture, NULL, s->numXtiles * s->numYtiles);
 
     jpeg2000_dec_cleanup(s);
 
     *got_frame = 1;
 
+    if (s->avctx->pix_fmt == AV_PIX_FMT_PAL8)
+        memcpy(picture->data[1], s->palette, 256 * sizeof(uint32_t));
+
     return bytestream2_tell(&s->g);
 
 end:
@@ -1458,7 +2148,7 @@ static const AVOption options[] = {
     { NULL },
 };
 
-static const AVClass class = {
+static const AVClass jpeg2000_class = {
     .class_name = "jpeg2000",
     .item_name  = av_default_item_name,
     .option     = options,
@@ -1470,11 +2160,12 @@ AVCodec ff_jpeg2000_decoder = {
     .long_name        = NULL_IF_CONFIG_SMALL("JPEG 2000"),
     .type             = AVMEDIA_TYPE_VIDEO,
     .id               = AV_CODEC_ID_JPEG2000,
-    .capabilities     = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_DR1,
+    .capabilities     = AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_DR1,
     .priv_data_size   = sizeof(Jpeg2000DecoderContext),
     .init_static_data = jpeg2000_init_static_data,
     .init             = jpeg2000_decode_init,
     .decode           = jpeg2000_decode_frame,
-    .priv_class       = &class,
+    .priv_class       = &jpeg2000_class,
+    .max_lowres       = 5,
     .profiles         = NULL_IF_CONFIG_SMALL(ff_jpeg2000_profiles)
 };
diff --git a/libavcodec/jpeg2000dsp.c b/libavcodec/jpeg2000dsp.c
index 6e04c3a..d183cbb 100644
--- a/libavcodec/jpeg2000dsp.c
+++ b/libavcodec/jpeg2000dsp.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Kamil Nowosad
  * Copyright (c) 2013 Nicolas Bertrand <nicoinattendu@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -95,4 +95,7 @@ av_cold void ff_jpeg2000dsp_init(Jpeg2000DSPContext *c)
     c->mct_decode[FF_DWT97]     = ict_float;
     c->mct_decode[FF_DWT53]     = rct_int;
     c->mct_decode[FF_DWT97_INT] = ict_int;
+
+    if (ARCH_X86)
+        ff_jpeg2000dsp_init_x86(c);
 }
diff --git a/libavcodec/jpeg2000dsp.h b/libavcodec/jpeg2000dsp.h
index 45a32c0..1ae5b95 100644
--- a/libavcodec/jpeg2000dsp.h
+++ b/libavcodec/jpeg2000dsp.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Kamil Nowosad
  * Copyright (c) 2013 Nicolas Bertrand <nicoinattendu@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,5 +31,6 @@ typedef struct Jpeg2000DSPContext {
 } Jpeg2000DSPContext;
 
 void ff_jpeg2000dsp_init(Jpeg2000DSPContext *c);
+void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c);
 
 #endif /* AVCODEC_JPEG2000DSP_H */
diff --git a/libavcodec/jpeg2000dwt-test.c b/libavcodec/jpeg2000dwt-test.c
new file mode 100644
index 0000000..30f1ce1
--- /dev/null
+++ b/libavcodec/jpeg2000dwt-test.c
@@ -0,0 +1,141 @@
+/*
+ * Discrete wavelet transform encode/decode round-trip test
+ * Copyright (c) 2007 Kamil Nowosad
+ * Copyright (c) 2013 Nicolas Bertrand <nicoinattendu@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "jpeg2000dwt.c"
+
+#include "libavutil/lfg.h"
+
+#define MAX_W 256
+
+static int test_dwt(int *array, int *ref, int border[2][2], int decomp_levels, int type, int max_diff) { // integer round trip: encode + decode array in place, compare with ref
+    int ret, j;
+    DWTContext s1={{{0}}}, *s= &s1;
+    int64_t err2 = 0; // sum of squared differences between the round-tripped data and ref
+
+    ret = ff_jpeg2000_dwt_init(s,  border, decomp_levels, type);
+    if (ret < 0) {
+        fprintf(stderr, "ff_jpeg2000_dwt_init failed\n");
+        return 1; // 1 = a DWT call failed
+    }
+    ret = ff_dwt_encode(s, array);
+    if (ret < 0) {
+        fprintf(stderr, "ff_dwt_encode failed\n");
+        return 1;
+    }
+    ret = ff_dwt_decode(s, array);
+    if (ret < 0) {
+        fprintf(stderr, "ff_dwt_decode failed\n");
+        return 1;
+    }
+    for (j = 0; j<MAX_W * MAX_W; j++) {
+        if (FFABS(array[j] - ref[j]) > max_diff) {
+            fprintf(stderr, "mismatch at %d (%d != %d) decomp:%d border %d %d %d %d\n",
+                    j, array[j], ref[j],decomp_levels, border[0][0], border[0][1], border[1][0], border[1][1]);
+            return 2; // 2 = a sample differs from ref by more than max_diff
+        }
+        err2 += (array[j] - ref[j]) * (array[j] - ref[j]);
+        array[j] = ref[j]; // restore the input so the caller can reuse array for the next run
+    }
+    ff_dwt_destroy(s);
+
+    printf("%s, decomp:%2d border %3d %3d %3d %3d milli-err2:%9"PRId64"\n",
+           type == FF_DWT53 ? "5/3i" : "9/7i",
+           decomp_levels, border[0][0], border[0][1], border[1][0], border[1][1],
+           1000*err2 / ((border[0][1] - border[0][0])*(border[1][1] - border[1][0]))); // squared error per sample of the border rectangle, times 1000
+
+    return 0;
+}
+
+static int test_dwtf(float *array, float *ref, int border[2][2], int decomp_levels, float max_diff) { // float 9/7 round trip: encode + decode array in place, compare with ref
+    int ret, j;
+    DWTContext s1={{{0}}}, *s= &s1;
+    double err2 = 0; // sum of squared differences between the round-tripped data and ref
+
+    ret = ff_jpeg2000_dwt_init(s,  border, decomp_levels, FF_DWT97);
+    if (ret < 0) {
+        fprintf(stderr, "ff_jpeg2000_dwt_init failed\n");
+        return 1; // 1 = a DWT call failed
+    }
+    ret = ff_dwt_encode(s, array);
+    if (ret < 0) {
+        fprintf(stderr, "ff_dwt_encode failed\n");
+        return 1;
+    }
+    ret = ff_dwt_decode(s, array);
+    if (ret < 0) {
+        fprintf(stderr, "ff_dwt_decode failed\n");
+        return 1;
+    }
+    for (j = 0; j<MAX_W * MAX_W; j++) {
+        if (FFABS(array[j] - ref[j]) > max_diff) {
+            fprintf(stderr, "mismatch at %d (%f != %f) decomp:%d border %d %d %d %d\n",
+                    j, array[j], ref[j],decomp_levels, border[0][0], border[0][1], border[1][0], border[1][1]);
+            return 2; // 2 = a sample differs from ref by more than max_diff
+        }
+        err2 += (array[j] - ref[j]) * (array[j] - ref[j]);
+        array[j] = ref[j]; // restore the input so the caller can reuse array for the next run
+    }
+    ff_dwt_destroy(s);
+
+    printf("9/7f, decomp:%2d border %3d %3d %3d %3d err2:%20.3f\n",
+           decomp_levels, border[0][0], border[0][1], border[1][0], border[1][1],
+           err2 / ((border[0][1] - border[0][0])*(border[1][1] - border[1][0]))); // squared error per sample of the border rectangle
+
+    return 0;
+}
+
+static int array[MAX_W * MAX_W];
+static int ref  [MAX_W * MAX_W];
+static float arrayf[MAX_W * MAX_W];
+static float reff  [MAX_W * MAX_W];
+
+int main(void) { // runs up to 100 random round-trip tests per transform type; returns the first non-zero test result, else 0
+    AVLFG prng;
+    int i,j;
+    int border[2][2];
+    int ret, decomp_levels;
+
+    av_lfg_init(&prng, 1); // constant seed 1
+
+    for (i = 0; i<MAX_W * MAX_W; i++) // same pseudo-random value (0..2047) in all four buffers
+        arrayf[i] = reff[i] = array[i] = ref[i] =  av_lfg_get(&prng) % 2048;
+
+    for (i = 0; i < 100; i++) {
+        for (j=0; j<4; j++) // fills border[0][0], border[0][1], border[1][0], border[1][1] in that order
+            border[j>>1][j&1] = av_lfg_get(&prng) % MAX_W;
+        if (border[0][0] >= border[0][1] || border[1][0] >= border[1][1])
+            continue; // skip empty or inverted ranges (the iteration still counts towards the 100)
+        decomp_levels = av_lfg_get(&prng) % FF_DWT_MAX_DECLVLS;
+
+        ret = test_dwt(array, ref, border, decomp_levels, FF_DWT53, 0); // 5/3 must round-trip exactly (max_diff 0)
+        if (ret)
+            return ret;
+        ret = test_dwt(array, ref, border, decomp_levels, FF_DWT97_INT, FFMIN(7+5*decomp_levels, 15+3*decomp_levels)); // integer 9/7: tolerance grows with the number of levels
+        if (ret)
+            return ret;
+        ret = test_dwtf(arrayf, reff, border, decomp_levels, 0.05); // float 9/7: fixed tolerance
+        if (ret)
+            return ret;
+    }
+
+    return 0;
+}
diff --git a/libavcodec/jpeg2000dwt.c b/libavcodec/jpeg2000dwt.c
index 6642a53..188cc26 100644
--- a/libavcodec/jpeg2000dwt.c
+++ b/libavcodec/jpeg2000dwt.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Kamil Nowosad
  * Copyright (c) 2013 Nicolas Bertrand <nicoinattendu@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,7 @@
  * Discrete wavelet transform
  */
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/mem.h"
 #include "jpeg2000dwt.h"
@@ -36,21 +37,16 @@
 #define F_LFTG_BETA   0.052980118572961f
 #define F_LFTG_GAMMA  0.882911075530934f
 #define F_LFTG_DELTA  0.443506852043971f
-#define F_LFTG_K      1.230174104914001f
-#define F_LFTG_X      1.625732422f
-/* FIXME: Why use 1.625732422 instead of 1/F_LFTG_K?
- * Incorrect value in JPEG2000 norm.
- * see (ISO/IEC 15444:1 (version 2002) F.3.8.2 */
 
 /* Lifting parameters in integer format.
  * Computed as param = (float param) * (1 << 16) */
-#define I_LFTG_ALPHA  103949
-#define I_LFTG_BETA     3472
-#define I_LFTG_GAMMA   57862
-#define I_LFTG_DELTA   29066
-#define I_LFTG_K       80621
-#define I_LFTG_X      106544
-
+#define I_LFTG_ALPHA  103949ll
+#define I_LFTG_BETA     3472ll
+#define I_LFTG_GAMMA   57862ll
+#define I_LFTG_DELTA   29066ll
+#define I_LFTG_K       80621ll
+#define I_LFTG_X       53274ll
+#define I_PRESHIFT 8
 
 static inline void extend53(int *p, int i0, int i1)
 {
@@ -80,18 +76,250 @@ static inline void extend97_int(int32_t *p, int i0, int i1)
     }
 }
 
+static void sd_1d53(int *p, int i0, int i1) // forward 1-D 5/3 lifting of samples p[i0]..p[i1-1], in place
+{
+    int i;
+
+    if (i1 <= i0 + 1) { // at most one sample: there are no neighbours to lift against
+        if (i0 == 1)
+            p[1] *= 2; // lone sample at odd position is doubled; multiply instead of << so negative values are well defined
+        return;
+    }
+
+    extend53(p, i0, i1); // boundary extension; the loops below access positions just outside [i0, i1)
+
+    for (i = ((i0+1)>>1) - 1; i < (i1+1)>>1; i++) // odd positions: subtract the floored mean of the two even neighbours
+        p[2*i+1] -= (p[2*i] + p[2*i+2]) >> 1;
+    for (i = ((i0+1)>>1); i < (i1+1)>>1; i++) // even positions: add a rounded quarter of the sum of the two odd neighbours
+        p[2*i] += (p[2*i-1] + p[2*i+1] + 2) >> 2;
+}
+
+static void dwt_encode53(DWTContext *s, int *t) // forward 2-D 5/3 DWT of t, in place, over all decomposition levels
+{
+    int lev,
+        w = s->linelen[s->ndeclevels-1][0]; // row stride used to index t
+    int *line = s->i_linebuf;
+    line += 3; // keep 3 entries of headroom below index 0 (presumably for extend53 - confirm)
+
+    for (lev = s->ndeclevels-1; lev >= 0; lev--){ // from level ndeclevels-1 down to level 0
+        int lh = s->linelen[lev][0],
+            lv = s->linelen[lev][1],
+            mh = s->mod[lev][0],
+            mv = s->mod[lev][1],
+            lp;
+        int *l;
+
+        // VER_SD: vertical pass, one column of lv samples at a time
+        l = line + mv; // samples are placed starting at line[mv]
+        for (lp = 0; lp < lh; lp++) {
+            int i, j = 0;
+
+            for (i = 0; i < lv; i++) // gather column lp into the line buffer
+                l[i] = t[w*i + lp];
+
+            sd_1d53(line, mv, mv + lv);
+
+            // copy back and deinterleave: even positions of line first, odd positions after them
+            for (i =   mv; i < lv; i+=2, j++)
+                t[w*j + lp] = l[i];
+            for (i = 1-mv; i < lv; i+=2, j++)
+                t[w*j + lp] = l[i];
+        }
+
+        // HOR_SD: horizontal pass, one row of lh samples at a time
+        l = line + mh; // samples are placed starting at line[mh]
+        for (lp = 0; lp < lv; lp++){
+            int i, j = 0;
+
+            for (i = 0; i < lh; i++) // copy row lp into the line buffer
+                l[i] = t[w*lp + i];
+
+            sd_1d53(line, mh, mh + lh);
+
+            // copy back and deinterleave: even positions of line first, odd positions after them
+            for (i =   mh; i < lh; i+=2, j++)
+                t[w*lp + j] = l[i];
+            for (i = 1-mh; i < lh; i+=2, j++)
+                t[w*lp + j] = l[i];
+        }
+    }
+}
+static void sd_1d97_float(float *p, int i0, int i1) // forward 1-D float 9/7 lifting of samples p[i0]..p[i1-1], in place
+{
+    int i;
+
+    if (i1 <= i0 + 1) { // at most one sample: only a scale factor is applied
+        if (i0 == 1)
+            p[1] *= F_LFTG_X * 2;
+        else
+            p[0] *= F_LFTG_K; // this branch addresses p[0], i.e. it assumes i0 == 0
+        return;
+    }
+
+    extend97_float(p, i0, i1); // boundary extension; the loops below access positions outside [i0, i1)
+    i0++; i1++; // so that the >>1 in the loop bounds rounds up
+
+    for (i = (i0>>1) - 2; i < (i1>>1) + 1; i++) // NOTE(review): the four literals look like shortened F_LFTG_ALPHA/BETA/GAMMA/DELTA - confirm
+        p[2*i+1] -= 1.586134 * (p[2*i] + p[2*i+2]);
+    for (i = (i0>>1) - 1; i < (i1>>1) + 1; i++) // each step covers a narrower index range than the one before it
+        p[2*i] -= 0.052980 * (p[2*i-1] + p[2*i+1]);
+    for (i = (i0>>1) - 1; i < (i1>>1); i++)
+        p[2*i+1] += 0.882911 * (p[2*i] + p[2*i+2]);
+    for (i = (i0>>1); i < (i1>>1); i++)
+        p[2*i] += 0.443506 * (p[2*i-1] + p[2*i+1]);
+}
+
+static void dwt_encode97_float(DWTContext *s, float *t) // forward 2-D float 9/7 DWT of t, in place, over all decomposition levels
+{
+    int lev,
+        w = s->linelen[s->ndeclevels-1][0]; // row stride used to index t
+    float *line = s->f_linebuf;
+    line += 5; // keep 5 entries of headroom below index 0 (presumably for extend97_float - confirm)
+
+    for (lev = s->ndeclevels-1; lev >= 0; lev--){ // from level ndeclevels-1 down to level 0
+        int lh = s->linelen[lev][0],
+            lv = s->linelen[lev][1],
+            mh = s->mod[lev][0],
+            mv = s->mod[lev][1],
+            lp;
+        float *l;
+
+        // HOR_SD: horizontal pass, one row of lh samples at a time (done before the vertical pass here)
+        l = line + mh; // samples are placed starting at line[mh]
+        for (lp = 0; lp < lv; lp++){
+            int i, j = 0;
+
+            for (i = 0; i < lh; i++) // copy row lp into the line buffer
+                l[i] = t[w*lp + i];
+
+            sd_1d97_float(line, mh, mh + lh);
+
+            // copy back and deinterleave: even positions of line first, odd positions after them
+            for (i =   mh; i < lh; i+=2, j++)
+                t[w*lp + j] = l[i];
+            for (i = 1-mh; i < lh; i+=2, j++)
+                t[w*lp + j] = l[i];
+        }
+
+        // VER_SD: vertical pass, one column of lv samples at a time
+        l = line + mv; // samples are placed starting at line[mv]
+        for (lp = 0; lp < lh; lp++) {
+            int i, j = 0;
+
+            for (i = 0; i < lv; i++) // gather column lp into the line buffer
+                l[i] = t[w*i + lp];
+
+            sd_1d97_float(line, mv, mv + lv);
+
+            // copy back and deinterleave: even positions of line first, odd positions after them
+            for (i =   mv; i < lv; i+=2, j++)
+                t[w*j + lp] = l[i];
+            for (i = 1-mv; i < lv; i+=2, j++)
+                t[w*j + lp] = l[i];
+        }
+    }
+}
+
+static void sd_1d97_int(int *p, int i0, int i1) // forward 1-D fixed-point 9/7 lifting of samples p[i0]..p[i1-1], in place
+{
+    int i;
+
+    if (i1 <= i0 + 1) { // at most one sample: only a scale factor is applied
+        if (i0 == 1)
+            p[1] = (p[1] * I_LFTG_X + (1<<14)) >> 15; // >> 15 rather than >> 16: scales by 2 * I_LFTG_X / 65536, rounded
+        else
+            p[0] = (p[0] * I_LFTG_K + (1<<15)) >> 16; // this branch addresses p[0], i.e. it assumes i0 == 0
+        return;
+    }
+
+    extend97_int(p, i0, i1); // boundary extension; the loops below access positions outside [i0, i1)
+    i0++; i1++; // so that the >>1 in the loop bounds rounds up
+
+    for (i = (i0>>1) - 2; i < (i1>>1) + 1; i++) // every step multiplies by a 16.16 constant, adds 1 << 15 to round, then shifts right by 16
+        p[2 * i + 1] -= (I_LFTG_ALPHA * (p[2 * i]     + p[2 * i + 2]) + (1 << 15)) >> 16;
+    for (i = (i0>>1) - 1; i < (i1>>1) + 1; i++) // each step covers a narrower index range than the one before it
+        p[2 * i]     -= (I_LFTG_BETA  * (p[2 * i - 1] + p[2 * i + 1]) + (1 << 15)) >> 16;
+    for (i = (i0>>1) - 1; i < (i1>>1); i++)
+        p[2 * i + 1] += (I_LFTG_GAMMA * (p[2 * i]     + p[2 * i + 2]) + (1 << 15)) >> 16;
+    for (i = (i0>>1); i < (i1>>1); i++)
+        p[2 * i]     += (I_LFTG_DELTA * (p[2 * i - 1] + p[2 * i + 1]) + (1 << 15)) >> 16;
+}
+
+static void dwt_encode97_int(DWTContext *s, int *t) // forward 2-D fixed-point 9/7 DWT of t, in place, over all decomposition levels
+{
+    int lev;
+    int w = s->linelen[s->ndeclevels-1][0]; // row stride used to index t
+    int h = s->linelen[s->ndeclevels-1][1];
+    int i;
+    int *line = s->i_linebuf;
+    line += 5; // keep 5 entries of headroom below index 0 (presumably for extend97_int - confirm)
+
+    for (i = 0; i < w * h; i++) // add I_PRESHIFT fractional bits of working precision
+        t[i] *= 1 << I_PRESHIFT; // multiply instead of <<: left-shifting a negative int is undefined behaviour
+
+    for (lev = s->ndeclevels-1; lev >= 0; lev--){ // from level ndeclevels-1 down to level 0
+        int lh = s->linelen[lev][0],
+            lv = s->linelen[lev][1],
+            mh = s->mod[lev][0],
+            mv = s->mod[lev][1],
+            lp;
+        int *l;
+
+        // VER_SD: vertical pass, one column of lv samples at a time
+        l = line + mv; // samples are placed starting at line[mv]
+        for (lp = 0; lp < lh; lp++) {
+            int i, j = 0;
+
+            for (i = 0; i < lv; i++) // gather column lp into the line buffer
+                l[i] = t[w*i + lp];
+
+            sd_1d97_int(line, mv, mv + lv);
+
+            // copy back and deinterleave: even positions first (scaled by I_LFTG_X / 65536), odd positions after them (unscaled)
+            for (i =   mv; i < lv; i+=2, j++)
+                t[w*j + lp] = ((l[i] * I_LFTG_X) + (1 << 15)) >> 16;
+            for (i = 1-mv; i < lv; i+=2, j++)
+                t[w*j + lp] = l[i];
+        }
+
+        // HOR_SD: horizontal pass, one row of lh samples at a time
+        l = line + mh; // samples are placed starting at line[mh]
+        for (lp = 0; lp < lv; lp++){
+            int i, j = 0;
+
+            for (i = 0; i < lh; i++) // copy row lp into the line buffer
+                l[i] = t[w*lp + i];
+
+            sd_1d97_int(line, mh, mh + lh);
+
+            // copy back and deinterleave: even positions first (scaled by I_LFTG_X / 65536), odd positions after them (unscaled)
+            for (i =   mh; i < lh; i+=2, j++)
+                t[w*lp + j] = ((l[i] * I_LFTG_X) + (1 << 15)) >> 16;
+            for (i = 1-mh; i < lh; i+=2, j++)
+                t[w*lp + j] = l[i];
+        }
+
+    }
+
+    for (i = 0; i < w * h; i++) // round to nearest and drop the I_PRESHIFT fractional bits again
+        t[i] = (t[i] + ((1<<I_PRESHIFT)>>1)) >> I_PRESHIFT;
+}
+
 static void sr_1d53(int *p, int i0, int i1)
 {
     int i;
 
-    if (i1 == i0 + 1)
+    if (i1 <= i0 + 1) {
+        if (i0 == 1)
+            p[1] >>= 1;
         return;
+    }
 
     extend53(p, i0, i1);
 
-    for (i = i0 / 2; i < i1 / 2 + 1; i++)
+    for (i = (i0 >> 1); i < (i1 >> 1) + 1; i++)
         p[2 * i] -= (p[2 * i - 1] + p[2 * i + 1] + 2) >> 2;
-    for (i = i0 / 2; i < i1 / 2; i++)
+    for (i = (i0 >> 1); i < (i1 >> 1); i++)
         p[2 * i + 1] += (p[2 * i] + p[2 * i + 2]) >> 1;
 }
 
@@ -148,21 +376,26 @@ static void sr_1d97_float(float *p, int i0, int i1)
 {
     int i;
 
-    if (i1 == i0 + 1)
+    if (i1 <= i0 + 1) {
+        if (i0 == 1)
+            p[1] *= F_LFTG_K/2;
+        else
+            p[0] *= F_LFTG_X;
         return;
+    }
 
     extend97_float(p, i0, i1);
 
-    for (i = i0 / 2 - 1; i < i1 / 2 + 2; i++)
+    for (i = (i0 >> 1) - 1; i < (i1 >> 1) + 2; i++)
         p[2 * i]     -= F_LFTG_DELTA * (p[2 * i - 1] + p[2 * i + 1]);
     /* step 4 */
-    for (i = i0 / 2 - 1; i < i1 / 2 + 1; i++)
+    for (i = (i0 >> 1) - 1; i < (i1 >> 1) + 1; i++)
         p[2 * i + 1] -= F_LFTG_GAMMA * (p[2 * i]     + p[2 * i + 2]);
     /*step 5*/
-    for (i = i0 / 2; i < i1 / 2 + 1; i++)
+    for (i = (i0 >> 1); i < (i1 >> 1) + 1; i++)
         p[2 * i]     += F_LFTG_BETA  * (p[2 * i - 1] + p[2 * i + 1]);
     /* step 6 */
-    for (i = i0 / 2; i < i1 / 2; i++)
+    for (i = (i0 >> 1); i < (i1 >> 1); i++)
         p[2 * i + 1] += F_LFTG_ALPHA * (p[2 * i]     + p[2 * i + 2]);
 }
 
@@ -188,9 +421,9 @@ static void dwt_decode97_float(DWTContext *s, float *t)
             int i, j = 0;
             // copy with interleaving
             for (i = mh; i < lh; i += 2, j++)
-                l[i] = data[w * lp + j] * F_LFTG_K;
+                l[i] = data[w * lp + j];
             for (i = 1 - mh; i < lh; i += 2, j++)
-                l[i] = data[w * lp + j] * F_LFTG_X;
+                l[i] = data[w * lp + j];
 
             sr_1d97_float(line, mh, mh + lh);
 
@@ -204,9 +437,9 @@ static void dwt_decode97_float(DWTContext *s, float *t)
             int i, j = 0;
             // copy with interleaving
             for (i = mv; i < lv; i += 2, j++)
-                l[i] = data[w * j + lp] * F_LFTG_K;
+                l[i] = data[w * j + lp];
             for (i = 1 - mv; i < lv; i += 2, j++)
-                l[i] = data[w * j + lp] * F_LFTG_X;
+                l[i] = data[w * j + lp];
 
             sr_1d97_float(line, mv, mv + lv);
 
@@ -220,21 +453,26 @@ static void sr_1d97_int(int32_t *p, int i0, int i1)
 {
     int i;
 
-    if (i1 == i0 + 1)
+    if (i1 <= i0 + 1) {
+        if (i0 == 1)
+            p[1] = (p[1] * I_LFTG_K + (1<<16)) >> 17;
+        else
+            p[0] = (p[0] * I_LFTG_X + (1<<15)) >> 16;
         return;
+    }
 
     extend97_int(p, i0, i1);
 
-    for (i = i0 / 2 - 1; i < i1 / 2 + 2; i++)
+    for (i = (i0 >> 1) - 1; i < (i1 >> 1) + 2; i++)
         p[2 * i]     -= (I_LFTG_DELTA * (p[2 * i - 1] + p[2 * i + 1]) + (1 << 15)) >> 16;
     /* step 4 */
-    for (i = i0 / 2 - 1; i < i1 / 2 + 1; i++)
+    for (i = (i0 >> 1) - 1; i < (i1 >> 1) + 1; i++)
         p[2 * i + 1] -= (I_LFTG_GAMMA * (p[2 * i]     + p[2 * i + 2]) + (1 << 15)) >> 16;
     /*step 5*/
-    for (i = i0 / 2; i < i1 / 2 + 1; i++)
+    for (i = (i0 >> 1); i < (i1 >> 1) + 1; i++)
         p[2 * i]     += (I_LFTG_BETA  * (p[2 * i - 1] + p[2 * i + 1]) + (1 << 15)) >> 16;
     /* step 6 */
-    for (i = i0 / 2; i < i1 / 2; i++)
+    for (i = (i0 >> 1); i < (i1 >> 1); i++)
         p[2 * i + 1] += (I_LFTG_ALPHA * (p[2 * i]     + p[2 * i + 2]) + (1 << 15)) >> 16;
 }
 
@@ -242,11 +480,16 @@ static void dwt_decode97_int(DWTContext *s, int32_t *t)
 {
     int lev;
     int w       = s->linelen[s->ndeclevels - 1][0];
+    int h       = s->linelen[s->ndeclevels - 1][1];
+    int i;
     int32_t *line = s->i_linebuf;
     int32_t *data = t;
     /* position at index O of line range [0-5,w+5] cf. extend function */
     line += 5;
 
+    for (i = 0; i < w * h; i++)
+        data[i] <<= I_PRESHIFT;
+
     for (lev = 0; lev < s->ndeclevels; lev++) {
         int lh = s->linelen[lev][0],
             lv = s->linelen[lev][1],
@@ -262,7 +505,7 @@ static void dwt_decode97_int(DWTContext *s, int32_t *t)
             for (i = mh; i < lh; i += 2, j++)
                 l[i] = ((data[w * lp + j] * I_LFTG_K) + (1 << 15)) >> 16;
             for (i = 1 - mh; i < lh; i += 2, j++)
-                l[i] = ((data[w * lp + j] * I_LFTG_X) + (1 << 15)) >> 16;
+                l[i] = data[w * lp + j];
 
             sr_1d97_int(line, mh, mh + lh);
 
@@ -278,7 +521,7 @@ static void dwt_decode97_int(DWTContext *s, int32_t *t)
             for (i = mv; i < lv; i += 2, j++)
                 l[i] = ((data[w * j + lp] * I_LFTG_K) + (1 << 15)) >> 16;
             for (i = 1 - mv; i < lv; i += 2, j++)
-                l[i] = ((data[w * j + lp] * I_LFTG_X) + (1 << 15)) >> 16;
+                l[i] = data[w * j + lp];
 
             sr_1d97_int(line, mv, mv + lv);
 
@@ -286,9 +529,12 @@ static void dwt_decode97_int(DWTContext *s, int32_t *t)
                 data[w * i + lp] = l[i];
         }
     }
+
+    for (i = 0; i < w * h; i++)
+        data[i] = (data[i] + ((1<<I_PRESHIFT)>>1)) >> I_PRESHIFT;
 }
 
-int ff_jpeg2000_dwt_init(DWTContext *s, uint16_t border[2][2],
+int ff_jpeg2000_dwt_init(DWTContext *s, int border[2][2],
                          int decomp_levels, int type)
 {
     int i, j, lev = decomp_levels, maxlen,
@@ -312,17 +558,17 @@ int ff_jpeg2000_dwt_init(DWTContext *s, uint16_t border[2][2],
         }
     switch (type) {
     case FF_DWT97:
-        s->f_linebuf = av_malloc((maxlen + 12) * sizeof(*s->f_linebuf));
+        s->f_linebuf = av_malloc_array((maxlen + 12), sizeof(*s->f_linebuf));
         if (!s->f_linebuf)
             return AVERROR(ENOMEM);
         break;
      case FF_DWT97_INT:
-        s->i_linebuf = av_malloc((maxlen + 12) * sizeof(*s->i_linebuf));
+        s->i_linebuf = av_malloc_array((maxlen + 12), sizeof(*s->i_linebuf));
         if (!s->i_linebuf)
             return AVERROR(ENOMEM);
         break;
     case FF_DWT53:
-        s->i_linebuf = av_malloc((maxlen +  6) * sizeof(*s->i_linebuf));
+        s->i_linebuf = av_malloc_array((maxlen +  6), sizeof(*s->i_linebuf));
         if (!s->i_linebuf)
             return AVERROR(ENOMEM);
         break;
@@ -332,8 +578,29 @@ int ff_jpeg2000_dwt_init(DWTContext *s, uint16_t border[2][2],
     return 0;
 }
 
+int ff_dwt_encode(DWTContext *s, void *t) // forward DWT of t in place; returns 0, or -1 when s->type is not a known transform
+{
+    if (s->ndeclevels == 0) // no decomposition levels: t is left untouched
+        return 0;
+
+    switch(s->type){
+        case FF_DWT97:
+            dwt_encode97_float(s, t); break; // t is used as float *
+        case FF_DWT97_INT:
+            dwt_encode97_int(s, t); break; // t is used as int *
+        case FF_DWT53:
+            dwt_encode53(s, t); break; // t is used as int *
+        default:
+            return -1;
+    }
+    return 0;
+}
+
 int ff_dwt_decode(DWTContext *s, void *t)
 {
+    if (s->ndeclevels == 0)
+        return 0;
+
     switch (s->type) {
     case FF_DWT97:
         dwt_decode97_float(s, t);
diff --git a/libavcodec/jpeg2000dwt.h b/libavcodec/jpeg2000dwt.h
index f08340d..718d183 100644
--- a/libavcodec/jpeg2000dwt.h
+++ b/libavcodec/jpeg2000dwt.h
@@ -2,20 +2,20 @@
  * Discrete wavelet transform
  * Copyright (c) 2007 Kamil Nowosad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,6 +30,8 @@
 #include <stdint.h>
 
 #define FF_DWT_MAX_DECLVLS 32 ///< max number of decomposition levels
+#define F_LFTG_K      1.230174104914001f
+#define F_LFTG_X      0.812893066115961f
 
 enum DWTType {
     FF_DWT97,
@@ -40,7 +42,7 @@ enum DWTType {
 
 typedef struct DWTContext {
     /// line lengths { horizontal, vertical } in consecutive decomposition levels
-    uint16_t linelen[FF_DWT_MAX_DECLVLS][2];
+    int linelen[FF_DWT_MAX_DECLVLS][2];
     uint8_t mod[FF_DWT_MAX_DECLVLS][2];  ///< coordinates (x0, y0) of decomp. levels mod 2
     uint8_t ndeclevels;                  ///< number of decomposition levels
     uint8_t type;                        ///< 0 for 9/7; 1 for 5/3
@@ -55,9 +57,10 @@ typedef struct DWTContext {
  * @param decomp_levels     number of decomposition levels
  * @param type              0 for DWT 9/7; 1 for DWT 5/3
  */
-int ff_jpeg2000_dwt_init(DWTContext *s, uint16_t border[2][2],
+int ff_jpeg2000_dwt_init(DWTContext *s, int border[2][2],
                          int decomp_levels, int type);
 
+int ff_dwt_encode(DWTContext *s, void *t);
 int ff_dwt_decode(DWTContext *s, void *t);
 
 void ff_dwt_destroy(DWTContext *s);
diff --git a/libavcodec/jpegls.c b/libavcodec/jpegls.c
index 19d461f..7f9fa8d 100644
--- a/libavcodec/jpegls.c
+++ b/libavcodec/jpegls.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Michael Niedermayer
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,10 +39,8 @@ void ff_jpegls_init_state(JLSState *state)
     for (state->qbpp = 0; (1 << state->qbpp) < state->range; state->qbpp++)
         ;
 
-    if (state->bpp < 8)
-        state->limit = 2 * state->bpp - state->qbpp + 16;
-    else
-        state->limit = 4 * state->bpp - state->qbpp;
+    state->bpp   = FFMAX(av_log2(state->maxval) + 1, 2);
+    state->limit = 2*(state->bpp + FFMAX(state->bpp, 8)) - state->qbpp;
 
     for (i = 0; i < 367; i++) {
         state->A[i] = FFMAX(state->range + 32 >> 6, 2);
diff --git a/libavcodec/jpegls.h b/libavcodec/jpegls.h
index eae3943..c8997c7 100644
--- a/libavcodec/jpegls.h
+++ b/libavcodec/jpegls.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Michael Niedermayer
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,6 +30,7 @@
 
 #include "libavutil/common.h"
 #include "avcodec.h"
+#include "internal.h"
 
 typedef struct JpeglsContext {
     AVCodecContext *avctx;
@@ -40,11 +41,9 @@ typedef struct JLSState {
     int A[367], B[367], C[365], N[367];
     int limit, reset, bpp, qbpp, maxval, range;
     int near, twonear;
-    int run_index[3];
+    int run_index[4];
 } JLSState;
 
-extern const uint8_t ff_log2_run[32];
-
 /**
  * Calculate initial JPEG-LS parameters
  */
@@ -98,6 +97,8 @@ static inline void ff_jpegls_downscale_state(JLSState *state, int Q)
 static inline int ff_jpegls_update_state_regular(JLSState *state,
                                                  int Q, int err)
 {
+    if(FFABS(err) > 0xFFFF)
+        return -0x10000;
     state->A[Q] += FFABS(err);
     err         *= state->twonear;
     state->B[Q] += err;
diff --git a/libavcodec/jpeglsdec.c b/libavcodec/jpeglsdec.c
index 9f8ccec..68151cb 100644
--- a/libavcodec/jpeglsdec.c
+++ b/libavcodec/jpeglsdec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Michael Niedermayer
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,7 +40,7 @@
  * (or test broken JPEG-LS decoder) and slow down ordinary decoding a bit.
  *
  * There is no Golomb code with length >= 32 bits possible, so check and
- * avoid situation of 32 zeros, Libav Golomb decoder is painfully slow
+ * avoid situation of 32 zeros, FFmpeg Golomb decoder is painfully slow
  * on this errors.
  */
 //#define JLS_BROKEN
@@ -51,27 +51,87 @@
 int ff_jpegls_decode_lse(MJpegDecodeContext *s)
 {
     int id;
+    int tid, wt, maxtab, i, j;
 
-    skip_bits(&s->gb, 16);  /* length: FIXME: verify field validity */
+    int len = get_bits(&s->gb, 16);
     id = get_bits(&s->gb, 8);
 
     switch (id) {
     case 1:
+        if (len < 13)
+            return AVERROR_INVALIDDATA;
+
         s->maxval = get_bits(&s->gb, 16);
         s->t1     = get_bits(&s->gb, 16);
         s->t2     = get_bits(&s->gb, 16);
         s->t3     = get_bits(&s->gb, 16);
         s->reset  = get_bits(&s->gb, 16);
 
+        if(s->avctx->debug & FF_DEBUG_PICT_INFO) {
+            av_log(s->avctx, AV_LOG_DEBUG, "Coding parameters maxval:%d T1:%d T2:%d T3:%d reset:%d\n",
+                   s->maxval, s->t1, s->t2, s->t3, s->reset);
+        }
+
 //        ff_jpegls_reset_coding_parameters(s, 0);
         //FIXME quant table?
         break;
     case 2:
+        s->palette_index = 0;
     case 3:
-        av_log(s->avctx, AV_LOG_ERROR, "palette not supported\n");
-        return AVERROR(ENOSYS);
+        tid= get_bits(&s->gb, 8);
+        wt = get_bits(&s->gb, 8);
+
+        if (len < 5)
+            return AVERROR_INVALIDDATA;
+
+        if (wt < 1 || wt > MAX_COMPONENTS) {
+            avpriv_request_sample(s->avctx, "wt %d", wt);
+            return AVERROR_PATCHWELCOME;
+        }
+
+        if (!s->maxval)
+            maxtab = 255;
+        else if ((5 + wt*(s->maxval+1)) < 65535)
+            maxtab = s->maxval;
+        else
+            maxtab = 65530/wt - 1;
+
+        if(s->avctx->debug & FF_DEBUG_PICT_INFO) {
+            av_log(s->avctx, AV_LOG_DEBUG, "LSE palette %d tid:%d wt:%d maxtab:%d\n", id, tid, wt, maxtab);
+        }
+        if (maxtab >= 256) {
+            avpriv_request_sample(s->avctx, ">8bit palette");
+            return AVERROR_PATCHWELCOME;
+        }
+        maxtab = FFMIN(maxtab, (len - 5) / wt + s->palette_index);
+
+        if (s->palette_index > maxtab)
+            return AVERROR_INVALIDDATA;
+
+        if ((s->avctx->pix_fmt == AV_PIX_FMT_GRAY8 || s->avctx->pix_fmt == AV_PIX_FMT_PAL8) &&
+            (s->picture_ptr->format == AV_PIX_FMT_GRAY8 || s->picture_ptr->format == AV_PIX_FMT_PAL8)) {
+            uint32_t *pal = (uint32_t *)s->picture_ptr->data[1];
+            int shift = 0;
+
+            if (s->avctx->bits_per_raw_sample > 0 && s->avctx->bits_per_raw_sample < 8) {
+                maxtab = FFMIN(maxtab, (1<<s->avctx->bits_per_raw_sample)-1);
+                shift = 8 - s->avctx->bits_per_raw_sample;
+            }
+
+            s->picture_ptr->format =
+            s->avctx->pix_fmt = AV_PIX_FMT_PAL8;
+            for (i=s->palette_index; i<=maxtab; i++) {
+                uint8_t k = i << shift;
+                pal[k] = 0;
+                for (j=0; j<wt; j++) {
+                    pal[k] |= get_bits(&s->gb, 8) << (8*(wt-j-1));
+                }
+            }
+            s->palette_index = i;
+        }
+        break;
     case 4:
-        av_log(s->avctx, AV_LOG_ERROR, "oversize image not supported\n");
+        avpriv_request_sample(s->avctx, "oversize image");
         return AVERROR(ENOSYS);
     default:
         av_log(s->avctx, AV_LOG_ERROR, "invalid id %d\n", id);
@@ -149,6 +209,8 @@ static inline int ls_get_code_runterm(GetBitContext *gb, JLSState *state,
         ret = ret >> 1;
     }
 
+    if(FFABS(ret) > 0xFFFF)
+        return -0x10000;
     /* update state */
     state->A[Q] += FFABS(ret) - RItype;
     ret         *= state->twonear;
@@ -208,11 +270,20 @@ static inline void ls_decode_line(JLSState *state, MJpegDecodeContext *s,
             r = ff_log2_run[state->run_index[comp]];
             if (r)
                 r = get_bits_long(&s->gb, r);
+            if (x + r * stride > w) {
+                r = (w - x) / stride;
+            }
             for (i = 0; i < r; i++) {
                 W(dst, x, Ra);
                 x += stride;
             }
 
+            if (x >= w) {
+                av_log(NULL, AV_LOG_ERROR, "run overflow\n");
+                av_assert0(x <= w);
+                return;
+            }
+
             /* decode run termination value */
             Rb     = R(last, x);
             RItype = (FFABS(Ra - Rb) <= state->near) ? 1 : 0;
@@ -304,21 +375,23 @@ int ff_jpegls_decode_picture(MJpegDecodeContext *s, int near,
     else
         shift = point_transform + (16 - s->bits);
 
-    ff_dlog(s->avctx,
-            "JPEG-LS params: %ix%i NEAR=%i MV=%i T(%i,%i,%i) "
-            "RESET=%i, LIMIT=%i, qbpp=%i, RANGE=%i\n",
-            s->width, s->height, state->near, state->maxval,
-            state->T1, state->T2, state->T3,
-            state->reset, state->limit, state->qbpp, state->range);
-    ff_dlog(s->avctx, "JPEG params: ILV=%i Pt=%i BPP=%i, scan = %i\n",
-            ilv, point_transform, s->bits, s->cur_scan);
+    if (s->avctx->debug & FF_DEBUG_PICT_INFO) {
+        av_log(s->avctx, AV_LOG_DEBUG,
+               "JPEG-LS params: %ix%i NEAR=%i MV=%i T(%i,%i,%i) "
+               "RESET=%i, LIMIT=%i, qbpp=%i, RANGE=%i\n",
+                s->width, s->height, state->near, state->maxval,
+                state->T1, state->T2, state->T3,
+                state->reset, state->limit, state->qbpp, state->range);
+        av_log(s->avctx, AV_LOG_DEBUG, "JPEG params: ILV=%i Pt=%i BPP=%i, scan = %i\n",
+                ilv, point_transform, s->bits, s->cur_scan);
+    }
     if (ilv == 0) { /* separate planes */
         if (s->cur_scan > s->nb_components) {
             ret = AVERROR_INVALIDDATA;
             goto end;
         }
-        off    = s->cur_scan - 1;
         stride = (s->nb_components > 1) ? 3 : 1;
+        off    = av_clip(s->cur_scan - 1, 0, stride - 1);
         width  = s->width * stride;
         cur   += off;
         for (i = 0; i < s->height; i++) {
@@ -340,12 +413,13 @@ int ff_jpegls_decode_picture(MJpegDecodeContext *s, int near,
     } else if (ilv == 1) { /* line interleaving */
         int j;
         int Rc[3] = { 0, 0, 0 };
+        stride = (s->nb_components > 1) ? 3 : 1;
         memset(cur, 0, s->picture_ptr->linesize[0]);
-        width = s->width * 3;
+        width = s->width * stride;
         for (i = 0; i < s->height; i++) {
-            for (j = 0; j < 3; j++) {
+            for (j = 0; j < stride; j++) {
                 ls_decode_line(state, s, last + j, cur + j,
-                               Rc[j], width, 3, j, 8);
+                               Rc[j], width, stride, j, 8);
                 Rc[j] = last[j];
 
                 if (s->restart_interval && !--s->restart_count) {
@@ -362,6 +436,53 @@ int ff_jpegls_decode_picture(MJpegDecodeContext *s, int near,
         goto end;
     }
 
+    if (s->xfrm && s->nb_components == 3) {
+        int x, w;
+
+        w = s->width * s->nb_components;
+
+        if (s->bits <= 8) {
+            uint8_t *src = s->picture_ptr->data[0];
+
+            for (i = 0; i < s->height; i++) {
+                switch(s->xfrm) {
+                case 1:
+                    for (x = off; x < w; x += 3) {
+                        src[x  ] += src[x+1] + 128;
+                        src[x+2] += src[x+1] + 128;
+                    }
+                    break;
+                case 2:
+                    for (x = off; x < w; x += 3) {
+                        src[x  ] += src[x+1] + 128;
+                        src[x+2] += ((src[x  ] + src[x+1])>>1) + 128;
+                    }
+                    break;
+                case 3:
+                    for (x = off; x < w; x += 3) {
+                        int g = src[x+0] - ((src[x+2]+src[x+1])>>2) + 64;
+                        src[x+0] = src[x+2] + g + 128;
+                        src[x+2] = src[x+1] + g + 128;
+                        src[x+1] = g;
+                    }
+                    break;
+                case 4:
+                    for (x = off; x < w; x += 3) {
+                        int r    = src[x+0] - ((                       359 * (src[x+2]-128) + 490) >> 8);
+                        int g    = src[x+0] - (( 88 * (src[x+1]-128) - 183 * (src[x+2]-128) +  30) >> 8);
+                        int b    = src[x+0] + ((454 * (src[x+1]-128)                        + 574) >> 8);
+                        src[x+0] = av_clip_uint8(r);
+                        src[x+1] = av_clip_uint8(g);
+                        src[x+2] = av_clip_uint8(b);
+                    }
+                    break;
+                }
+                src += s->picture_ptr->linesize[0];
+            }
+        }else
+            avpriv_report_missing_feature(s->avctx, "16bit xfrm");
+    }
+
     if (shift) { /* we need to do point transform or normalize samples */
         int x, w;
 
diff --git a/libavcodec/jpeglsdec.h b/libavcodec/jpeglsdec.h
index d60a87b..0cafaba 100644
--- a/libavcodec/jpeglsdec.h
+++ b/libavcodec/jpeglsdec.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Michael Niedermayer
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/jpeglsenc.c b/libavcodec/jpeglsenc.c
index cb9b71e..1208cda 100644
--- a/libavcodec/jpeglsenc.c
+++ b/libavcodec/jpeglsenc.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Michael Niedermayer
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
 
 #include "avcodec.h"
 #include "get_bits.h"
+#include "put_bits.h"
 #include "golomb.h"
 #include "internal.h"
 #include "mathops.h"
@@ -263,7 +264,7 @@ static int encode_picture_ls(AVCodecContext *avctx, AVPacket *pkt,
     uint8_t *zero = NULL;
     uint8_t *cur  = NULL;
     uint8_t *last = NULL;
-    JLSState *state;
+    JLSState *state = NULL;
     int i, size, ret;
     int comps;
 
@@ -280,11 +281,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
     else
         comps = 3;
 
-    if ((ret = ff_alloc_packet(pkt, avctx->width * avctx->height * comps * 4 +
-                               AV_INPUT_BUFFER_MIN_SIZE)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, avctx->width  *avctx->height * comps * 4 +
+                                AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
         return ret;
-    }
 
     buf2 = av_malloc(pkt->size);
     if (!buf2)
@@ -330,7 +329,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
     ls_store_lse(state, &pb);
 
-    zero = last = av_mallocz(p->linesize[0]);
+    zero = last = av_mallocz(FFABS(p->linesize[0]));
     if (!zero)
         goto memfail;
 
@@ -472,6 +471,7 @@ AVCodec ff_jpegls_encoder = {
     .priv_data_size = sizeof(JPEGLSContext),
     .priv_class     = &jpegls_class,
     .init           = encode_init_ls,
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
     .encode2        = encode_picture_ls,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_BGR24, AV_PIX_FMT_RGB24,
diff --git a/libavcodec/jpegtables.c b/libavcodec/jpegtables.c
index ce2bae2..cbe5523 100644
--- a/libavcodec/jpegtables.c
+++ b/libavcodec/jpegtables.c
@@ -8,20 +8,20 @@
  * aspecting, new decode_frame mechanism and apple mjpeg-b support
  *                                  by Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,7 +38,7 @@
  * The spec says that the values given produce "good" quality, and
  * when divided by 2, "very good" quality.
  */
-const unsigned char std_luminance_quant_tbl[64] = {
+static const unsigned char std_luminance_quant_tbl[64] = {
     16,  11,  10,  16,  24,  40,  51,  61,
     12,  12,  14,  19,  26,  58,  60,  55,
     14,  13,  16,  24,  40,  57,  69,  56,
@@ -48,7 +48,7 @@ const unsigned char std_luminance_quant_tbl[64] = {
     49,  64,  78,  87, 103, 121, 120, 101,
     72,  92,  95,  98, 112, 100, 103,  99
 };
-const unsigned char std_chrominance_quant_tbl[64] = {
+static const unsigned char std_chrominance_quant_tbl[64] = {
     17,  18,  24,  47,  99,  99,  99,  99,
     18,  21,  26,  66,  99,  99,  99,  99,
     24,  26,  56,  99,  99,  99,  99,  99,
diff --git a/libavcodec/jpegtables.h b/libavcodec/jpegtables.h
index 1a909be..6833b4b 100644
--- a/libavcodec/jpegtables.h
+++ b/libavcodec/jpegtables.h
@@ -1,20 +1,20 @@
 /*
  * JPEG-related tables
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/jrevdct.c b/libavcodec/jrevdct.c
index 96a884a..55a7392 100644
--- a/libavcodec/jrevdct.c
+++ b/libavcodec/jrevdct.c
@@ -943,6 +943,219 @@ void ff_j_rev_dct(DCTBLOCK data)
   }
 }
 
+#undef DCTSIZE
+#define DCTSIZE 4
+#define DCTSTRIDE 8
+
+/**
+ * In-place 4x4 inverse DCT over columns 0..3 of the first four rows of
+ * data; consecutive rows are DCTSTRIDE elements apart.  A bias of 4 is
+ * added to data[0], then rows (pass 1) and columns (pass 2) are processed.
+ * @param data coefficient block, overwritten with the transformed values
+ */
+void ff_j_rev_dct4(DCTBLOCK data)
+{
+  int32_t tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
+  int32_t z1, d0, d2, d4, d6;
+  register int16_t *dataptr;
+  int rowctr;
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+
+  data[0] += 4;
+
+  dataptr = data;
+
+  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
+    /* Due to quantization, we will usually find that many of the input
+     * coefficients are zero, especially the AC terms.  We can exploit this
+     * by short-circuiting the IDCT calculation for any row in which all
+     * the AC terms are zero.  In that case each output is equal to the
+     * DC coefficient (with scale factor as needed).
+     * With typical images and quantization tables, half or more of the
+     * row DCT calculations can be simplified this way.
+     */
+
+    d0 = dataptr[0];
+    d2 = dataptr[1];
+    d4 = dataptr[2];
+    d6 = dataptr[3];
+
+    if ((d2 | d4 | d6) == 0) {
+      /* AC terms all zero */
+      if (d0) {
+          /* Replicate the scaled DC term element by element (no int* aliasing). */
+          int16_t dcval = (int16_t) (d0 * (1 << PASS1_BITS)); /* not <<: d0 may be negative */
+          dataptr[0] = dataptr[1] = dcval;
+          dataptr[2] = dataptr[3] = dcval;
+      }
+
+      dataptr += DCTSTRIDE;     /* advance pointer to next row */
+      continue;
+    }
+
+    /* Even part: reverse the even part of the forward DCT. */
+    /* The rotator is sqrt(2)*c(-6). */
+    if (d6) {
+            if (d2) {
+                    /* d2 != 0, d6 != 0 (d0 and d4 are not tested) */
+                    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
+                    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
+                    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
+
+                    tmp0 = (d0 + d4) * (1 << CONST_BITS); /* multiply: << of a negative is undefined */
+                    tmp1 = (d0 - d4) * (1 << CONST_BITS);
+
+                    tmp10 = tmp0 + tmp3;
+                    tmp13 = tmp0 - tmp3;
+                    tmp11 = tmp1 + tmp2;
+                    tmp12 = tmp1 - tmp2;
+            } else {
+                    /* d2 == 0, d6 != 0 (d0 and d4 are not tested) */
+                    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
+                    tmp3 = MULTIPLY(d6, FIX_0_541196100);
+
+                    tmp0 = (d0 + d4) * (1 << CONST_BITS);
+                    tmp1 = (d0 - d4) * (1 << CONST_BITS);
+
+                    tmp10 = tmp0 + tmp3;
+                    tmp13 = tmp0 - tmp3;
+                    tmp11 = tmp1 + tmp2;
+                    tmp12 = tmp1 - tmp2;
+            }
+    } else {
+            if (d2) {
+                    /* d2 != 0, d6 == 0 (d0 and d4 are not tested) */
+                    tmp2 = MULTIPLY(d2, FIX_0_541196100);
+                    tmp3 = MULTIPLY(d2, FIX_1_306562965);
+
+                    tmp0 = (d0 + d4) * (1 << CONST_BITS);
+                    tmp1 = (d0 - d4) * (1 << CONST_BITS);
+
+                    tmp10 = tmp0 + tmp3;
+                    tmp13 = tmp0 - tmp3;
+                    tmp11 = tmp1 + tmp2;
+                    tmp12 = tmp1 - tmp2;
+            } else {
+                    /* d2 == 0, d6 == 0 (d0 and d4 are not tested) */
+                    tmp10 = tmp13 = (d0 + d4) * (1 << CONST_BITS);
+                    tmp11 = tmp12 = (d0 - d4) * (1 << CONST_BITS);
+            }
+    }
+
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+    dataptr[0] = (int16_t) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
+    dataptr[1] = (int16_t) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
+    dataptr[2] = (int16_t) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
+    dataptr[3] = (int16_t) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
+
+    dataptr += DCTSTRIDE;       /* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns. */
+  /* Note that we must descale the results by a factor of 8 == 2**3, */
+  /* and also undo the PASS1_BITS scaling. */
+
+  dataptr = data;
+  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
+    /* Unlike the row pass, the column pass has no short-circuit for
+     * columns whose AC terms are all zero: every column goes through
+     * the even-part computation below.  Only the d2/d6 rotation is
+     * specialised on which of its inputs are zero.
+     * dataptr walks the four columns; elements of one column are
+     * DCTSTRIDE apart.
+     */
+
+    d0 = dataptr[DCTSTRIDE*0];
+    d2 = dataptr[DCTSTRIDE*1];
+    d4 = dataptr[DCTSTRIDE*2];
+    d6 = dataptr[DCTSTRIDE*3];
+
+    /* Even part: reverse the even part of the forward DCT. */
+    /* The rotator is sqrt(2)*c(-6). */
+    if (d6) {
+            if (d2) {
+                    /* d2 != 0, d6 != 0 (d0 and d4 are not tested) */
+                    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
+                    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
+                    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
+
+                    tmp0 = (d0 + d4) * (1 << CONST_BITS);
+                    tmp1 = (d0 - d4) * (1 << CONST_BITS);
+
+                    tmp10 = tmp0 + tmp3;
+                    tmp13 = tmp0 - tmp3;
+                    tmp11 = tmp1 + tmp2;
+                    tmp12 = tmp1 - tmp2;
+            } else {
+                    /* d2 == 0, d6 != 0 (d0 and d4 are not tested) */
+                    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
+                    tmp3 = MULTIPLY(d6, FIX_0_541196100);
+
+                    tmp0 = (d0 + d4) * (1 << CONST_BITS);
+                    tmp1 = (d0 - d4) * (1 << CONST_BITS);
+
+                    tmp10 = tmp0 + tmp3;
+                    tmp13 = tmp0 - tmp3;
+                    tmp11 = tmp1 + tmp2;
+                    tmp12 = tmp1 - tmp2;
+            }
+    } else {
+            if (d2) {
+                    /* d2 != 0, d6 == 0 (d0 and d4 are not tested) */
+                    tmp2 = MULTIPLY(d2, FIX_0_541196100);
+                    tmp3 = MULTIPLY(d2, FIX_1_306562965);
+
+                    tmp0 = (d0 + d4) * (1 << CONST_BITS);
+                    tmp1 = (d0 - d4) * (1 << CONST_BITS);
+
+                    tmp10 = tmp0 + tmp3;
+                    tmp13 = tmp0 - tmp3;
+                    tmp11 = tmp1 + tmp2;
+                    tmp12 = tmp1 - tmp2;
+            } else {
+                    /* d2 == 0, d6 == 0 (d0 and d4 are not tested) */
+                    tmp10 = tmp13 = (d0 + d4) * (1 << CONST_BITS);
+                    tmp11 = tmp12 = (d0 - d4) * (1 << CONST_BITS);
+            }
+    }
+
+    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+    dataptr[DCTSTRIDE*0] = tmp10 >> (CONST_BITS+PASS1_BITS+3);
+    dataptr[DCTSTRIDE*1] = tmp11 >> (CONST_BITS+PASS1_BITS+3);
+    dataptr[DCTSTRIDE*2] = tmp12 >> (CONST_BITS+PASS1_BITS+3);
+    dataptr[DCTSTRIDE*3] = tmp13 >> (CONST_BITS+PASS1_BITS+3);
+
+    dataptr++;                  /* advance pointer to next column */
+  }
+}
+
+/* In-place 2x2 inverse transform on data[0..1] of the first two rows (rows DCTSTRIDE apart). */
+void ff_j_rev_dct2(DCTBLOCK data){
+  int16_t *const row0 = data, *const row1 = data + DCTSTRIDE;
+  int sum0, diff0, sum1, diff1;
+  row0[0] += 4;                 /* bias of 4 goes in before the final >>3 */
+  sum0  = row0[0] + row0[1];    /* horizontal butterfly, row 0 */
+  diff0 = row0[0] - row0[1];
+  sum1  = row1[0] + row1[1];    /* horizontal butterfly, row 1 */
+  diff1 = row1[0] - row1[1];
+  row0[0] = (sum0  + sum1)  >> 3;   /* vertical butterfly and descale */
+  row0[1] = (diff0 + diff1) >> 3;
+  row1[0] = (sum0  - sum1)  >> 3;
+  row1[1] = (diff0 - diff1) >> 3;
+}
+
+void ff_j_rev_dct1(DCTBLOCK data){
+  data[0] = (4 + data[0]) >> 3; /* only data[0] is touched: add bias 4, then shift right by 3 */
+}
+
+#undef FIX
+#undef CONST_BITS
+
 void ff_jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
 {
     ff_j_rev_dct(block);
diff --git a/libavcodec/jvdec.c b/libavcodec/jvdec.c
index c532b75..cbe83d3 100644
--- a/libavcodec/jvdec.c
+++ b/libavcodec/jvdec.c
@@ -2,20 +2,20 @@
  * Bitmap Brothers JV video decoder
  * Copyright (c) 2011 Peter Ross <pross@xvid.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -147,24 +147,28 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                         AVPacket *avpkt)
 {
     JvContext *s = avctx->priv_data;
-    int buf_size = avpkt->size;
     const uint8_t *buf = avpkt->data;
-    const uint8_t *buf_end = buf + buf_size;
+    const uint8_t *buf_end = buf + avpkt->size;
     int video_size, video_type, i, j, ret;
 
+    if (avpkt->size < 6)
+        return AVERROR_INVALIDDATA;
+
     video_size = AV_RL32(buf);
     video_type = buf[4];
     buf += 5;
 
     if (video_size) {
-        if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-            return ret;
+        if (video_size < 0 || video_size > avpkt->size - 5) {
+            av_log(avctx, AV_LOG_ERROR, "video size %d invalid\n", video_size);
+            return AVERROR_INVALIDDATA;
         }
+        if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
+            return ret;
 
         if (video_type == 0 || video_type == 1) {
             GetBitContext gb;
-            init_get_bits(&gb, buf, 8 * FFMIN(video_size, buf_end - buf));
+            init_get_bits(&gb, buf, 8 * video_size);
 
             for (j = 0; j < avctx->height; j += 8)
                 for (i = 0; i < avctx->width; i += 8)
@@ -174,12 +178,10 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
             buf += video_size;
         } else if (video_type == 2) {
-            if (buf + 1 <= buf_end) {
-                int v = *buf++;
-                for (j = 0; j < avctx->height; j++)
-                    memset(s->frame->data[0] + j * s->frame->linesize[0],
-                           v, avctx->width);
-            }
+            int v = *buf++;
+            for (j = 0; j < avctx->height; j++)
+                memset(s->frame->data[0] + j * s->frame->linesize[0],
+                       v, avctx->width);
         } else {
             av_log(avctx, AV_LOG_WARNING,
                    "unsupported frame type %i\n", video_type);
@@ -187,9 +189,10 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         }
     }
 
-    if (buf < buf_end) {
-        for (i = 0; i < AVPALETTE_COUNT && buf + 3 <= buf_end; i++) {
-            s->palette[i] = AV_RB24(buf) << 2;
+    if (buf_end - buf >= AVPALETTE_COUNT * 3) {
+        for (i = 0; i < AVPALETTE_COUNT; i++) {
+            uint32_t pal = AV_RB24(buf);
+            s->palette[i] = 0xFFU << 24 | pal << 2 | ((pal >> 4) & 0x30303);
             buf += 3;
         }
         s->palette_has_changed = 1;
@@ -207,7 +210,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         *got_frame = 1;
     }
 
-    return buf_size;
+    return avpkt->size;
 }
 
 static av_cold int decode_close(AVCodecContext *avctx)
diff --git a/libavcodec/kbdwin.c b/libavcodec/kbdwin.c
index 1b7313d..bf32aeb 100644
--- a/libavcodec/kbdwin.c
+++ b/libavcodec/kbdwin.c
@@ -1,22 +1,22 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <assert.h>
+#include "libavutil/avassert.h"
 #include "libavutil/mathematics.h"
 #include "libavutil/attributes.h"
 #include "kbdwin.h"
@@ -30,7 +30,7 @@ av_cold void ff_kbd_window_init(float *window, float alpha, int n)
    double local_window[FF_KBD_WINDOW_MAX];
    double alpha2 = (alpha * M_PI / n) * (alpha * M_PI / n);
 
-   assert(n <= FF_KBD_WINDOW_MAX);
+   av_assert0(n <= FF_KBD_WINDOW_MAX);
 
    for (i = 0; i < n; i++) {
        tmp = i * (n - i) * alpha2;
@@ -45,3 +45,13 @@ av_cold void ff_kbd_window_init(float *window, float alpha, int n)
    for (i = 0; i < n; i++)
        window[i] = sqrt(local_window[i] / sum);
 }
+
+/* Fills window[0..n-1] with the float window from ff_kbd_window_init(), scaled by 2147483647 and rounded via floor(x + 0.5). */
+av_cold void ff_kbd_window_init_fixed(int32_t *window, float alpha, int n)
+{
+    float float_window[FF_KBD_WINDOW_MAX];
+    int idx = 0;
+    ff_kbd_window_init(float_window, alpha, n);
+    for (; idx < n; idx++)
+        window[idx] = (int)floor(float_window[idx] * 2147483647.0 + 0.5);
+}
diff --git a/libavcodec/kbdwin.h b/libavcodec/kbdwin.h
index 89b569a..4185c42 100644
--- a/libavcodec/kbdwin.h
+++ b/libavcodec/kbdwin.h
@@ -1,24 +1,26 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_KBDWIN_H
 #define AVCODEC_KBDWIN_H
 
+#include <stdint.h>
+
 /**
  * Maximum window size for ff_kbd_window_init.
  */
@@ -31,5 +33,6 @@
  * @param   n       size of half window, max FF_KBD_WINDOW_MAX
  */
 void ff_kbd_window_init(float *window, float alpha, int n);
+void ff_kbd_window_init_fixed(int32_t *window, float alpha, int n);
 
 #endif /* AVCODEC_KBDWIN_H */
diff --git a/libavcodec/kgv1dec.c b/libavcodec/kgv1dec.c
index 0bf322e..5359411 100644
--- a/libavcodec/kgv1dec.c
+++ b/libavcodec/kgv1dec.c
@@ -2,20 +2,20 @@
  * Kega Game Video (KGV1) decoder
  * Copyright (c) 2010 Daniel Verkamp
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,7 +31,6 @@
 #include "internal.h"
 
 typedef struct KgvContext {
-    AVCodecContext *avctx;
     uint16_t *frame_buffer;
     uint16_t *last_frame_buffer;
 } KgvContext;
@@ -52,7 +51,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     const uint8_t *buf_end = buf + avpkt->size;
     KgvContext * const c = avctx->priv_data;
     int offsets[8];
-    uint16_t *out, *prev;
+    uint8_t *out, *prev;
     int outcnt = 0, maxcnt;
     int w, h, i, res;
 
@@ -83,22 +82,21 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     if ((res = ff_get_buffer(avctx, frame, 0)) < 0)
         return res;
-    out  = c->frame_buffer;
-    prev = c->last_frame_buffer;
+    out  = (uint8_t*)c->frame_buffer;
+    prev = (uint8_t*)c->last_frame_buffer;
 
     for (i = 0; i < 8; i++)
         offsets[i] = -1;
 
-    while (outcnt < maxcnt && buf_end - 2 > buf) {
+    while (outcnt < maxcnt && buf_end - 2 >= buf) {
         int code = AV_RL16(buf);
         buf += 2;
 
         if (!(code & 0x8000)) {
-            out[outcnt++] = code; // rgb555 pixel coded directly
+            AV_WN16A(&out[2 * outcnt], code); // rgb555 pixel coded directly
+            outcnt++;
         } else {
             int count;
-            int inp_off;
-            uint16_t *inp;
 
             if ((code & 0x6000) == 0x6000) {
                 // copy from previous frame
@@ -116,7 +114,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
                 start = (outcnt + offsets[oidx]) % maxcnt;
 
-                if (maxcnt - start < count)
+                if (maxcnt - start < count || maxcnt - outcnt < count)
                     break;
 
                 if (!prev) {
@@ -125,8 +123,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                     break;
                 }
 
-                inp = prev;
-                inp_off = start;
+                memcpy(out + 2 * outcnt, prev + 2 * start, 2 * count);
             } else {
                 // copy from earlier in this frame
                 int offset = (code & 0x1FFF) + 1;
@@ -141,19 +138,12 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                     count = 4 + *buf++;
                 }
 
-                if (outcnt < offset)
+                if (outcnt < offset || maxcnt - outcnt < count)
                     break;
 
-                inp = out;
-                inp_off = outcnt - offset;
-            }
-
-            if (maxcnt - outcnt < count)
-                break;
-
-            for (i = inp_off; i < count + inp_off; i++) {
-                out[outcnt++] = inp[i];
+                av_memcpy_backptr(out + 2 * outcnt, 2 * offset, 2 * count);
             }
+            outcnt += count;
         }
     }
 
@@ -172,9 +162,6 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
 static av_cold int decode_init(AVCodecContext *avctx)
 {
-    KgvContext * const c = avctx->priv_data;
-
-    c->avctx = avctx;
     avctx->pix_fmt = AV_PIX_FMT_RGB555;
 
     return 0;
diff --git a/libavcodec/kmvc.c b/libavcodec/kmvc.c
index ca6b79f..7acaba7 100644
--- a/libavcodec/kmvc.c
+++ b/libavcodec/kmvc.c
@@ -2,20 +2,20 @@
  * KMVC decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -107,6 +107,10 @@ static int kmvc_decode_intra_8x8(KmvcContext * ctx, int w, int h)
                             val = bytestream2_get_byte(&ctx->g);
                             mx = val & 0xF;
                             my = val >> 4;
+                            if ((l0x-mx) + 320*(l0y-my) < 0 || (l0x-mx) + 320*(l0y-my) > 320*197 - 4) {
+                                av_log(ctx->avctx, AV_LOG_ERROR, "Invalid MV\n");
+                                return AVERROR_INVALIDDATA;
+                            }
                             for (j = 0; j < 16; j++)
                                 BLK(ctx->cur, l0x + (j & 3), l0y + (j >> 2)) =
                                     BLK(ctx->cur, l0x + (j & 3) - mx, l0y + (j >> 2) - my);
@@ -128,6 +132,10 @@ static int kmvc_decode_intra_8x8(KmvcContext * ctx, int w, int h)
                                     val = bytestream2_get_byte(&ctx->g);
                                     mx = val & 0xF;
                                     my = val >> 4;
+                                    if ((l1x-mx) + 320*(l1y-my) < 0 || (l1x-mx) + 320*(l1y-my) > 320*199 - 2) {
+                                        av_log(ctx->avctx, AV_LOG_ERROR, "Invalid MV\n");
+                                        return AVERROR_INVALIDDATA;
+                                    }
                                     BLK(ctx->cur, l1x, l1y) = BLK(ctx->cur, l1x - mx, l1y - my);
                                     BLK(ctx->cur, l1x + 1, l1y) =
                                         BLK(ctx->cur, l1x + 1 - mx, l1y - my);
@@ -199,6 +207,10 @@ static int kmvc_decode_inter_8x8(KmvcContext * ctx, int w, int h)
                             val = bytestream2_get_byte(&ctx->g);
                             mx = (val & 0xF) - 8;
                             my = (val >> 4) - 8;
+                            if ((l0x+mx) + 320*(l0y+my) < 0 || (l0x+mx) + 320*(l0y+my) > 320*197 - 4) {
+                                av_log(ctx->avctx, AV_LOG_ERROR, "Invalid MV\n");
+                                return AVERROR_INVALIDDATA;
+                            }
                             for (j = 0; j < 16; j++)
                                 BLK(ctx->cur, l0x + (j & 3), l0y + (j >> 2)) =
                                     BLK(ctx->prev, l0x + (j & 3) + mx, l0y + (j >> 2) + my);
@@ -220,6 +232,10 @@ static int kmvc_decode_inter_8x8(KmvcContext * ctx, int w, int h)
                                     val = bytestream2_get_byte(&ctx->g);
                                     mx = (val & 0xF) - 8;
                                     my = (val >> 4) - 8;
+                                    if ((l1x+mx) + 320*(l1y+my) < 0 || (l1x+mx) + 320*(l1y+my) > 320*199 - 2) {
+                                        av_log(ctx->avctx, AV_LOG_ERROR, "Invalid MV\n");
+                                        return AVERROR_INVALIDDATA;
+                                    }
                                     BLK(ctx->cur, l1x, l1y) = BLK(ctx->prev, l1x + mx, l1y + my);
                                     BLK(ctx->cur, l1x + 1, l1y) =
                                         BLK(ctx->prev, l1x + 1 + mx, l1y + my);
@@ -256,10 +272,8 @@ static int decode_frame(AVCodecContext * avctx, void *data, int *got_frame,
 
     bytestream2_init(&ctx->g, avpkt->data, avpkt->size);
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     header = bytestream2_get_byte(&ctx->g);
 
@@ -267,7 +281,7 @@ static int decode_frame(AVCodecContext * avctx, void *data, int *got_frame,
     if (bytestream2_peek_byte(&ctx->g) == 127) {
         bytestream2_skip(&ctx->g, 3);
         for (i = 0; i < 127; i++) {
-            ctx->pal[i + (header & 0x81)] = bytestream2_get_be24(&ctx->g);
+            ctx->pal[i + (header & 0x81)] = 0xFFU << 24 | bytestream2_get_be24(&ctx->g);
             bytestream2_skip(&ctx->g, 1);
         }
         bytestream2_seek(&ctx->g, -127 * 4 - 3, SEEK_CUR);
@@ -285,7 +299,7 @@ static int decode_frame(AVCodecContext * avctx, void *data, int *got_frame,
         frame->palette_has_changed = 1;
         // palette starts from index 1 and has 127 entries
         for (i = 1; i <= ctx->palsize; i++) {
-            ctx->pal[i] = bytestream2_get_be24(&ctx->g);
+            ctx->pal[i] = 0xFFU << 24 | bytestream2_get_be24(&ctx->g);
         }
     }
 
@@ -369,7 +383,7 @@ static av_cold int decode_init(AVCodecContext * avctx)
     c->prev = c->frm1;
 
     for (i = 0; i < 256; i++) {
-        c->pal[i] = i * 0x10101;
+        c->pal[i] = 0xFFU << 24 | i * 0x10101;
     }
 
     if (avctx->extradata_size < 12) {
@@ -378,7 +392,8 @@ static av_cold int decode_init(AVCodecContext * avctx)
         c->palsize = 127;
     } else {
         c->palsize = AV_RL16(avctx->extradata + 10);
-        if (c->palsize >= MAX_PALSIZE) {
+        if (c->palsize >= (unsigned)MAX_PALSIZE) {
+            c->palsize = 127;
             av_log(avctx, AV_LOG_ERROR, "KMVC palette too large\n");
             return AVERROR_INVALIDDATA;
         }
diff --git a/libavcodec/lagarith.c b/libavcodec/lagarith.c
index 55e10bc..93d1344 100644
--- a/libavcodec/lagarith.c
+++ b/libavcodec/lagarith.c
@@ -2,20 +2,20 @@
  * Lagarith lossless decoder
  * Copyright (c) 2009 Nathan Caldwell <saintdev (at) gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -128,7 +128,7 @@ static int lag_decode_prob(GetBitContext *gb, uint32_t *value)
     }
 
     val  = get_bits_long(gb, bits);
-    val |= 1 << bits;
+    val |= 1U << bits;
 
     *value = val - 1;
 
@@ -160,8 +160,8 @@ static int lag_read_prob_header(lag_rac *rac, GetBitContext *gb)
                 av_log(rac->avctx, AV_LOG_ERROR, "Invalid probability run encountered.\n");
                 return -1;
             }
-            if (prob > 257 - i)
-                prob = 257 - i;
+            if (prob > 256 - i)
+                prob = 256 - i;
             for (j = 0; j < prob; j++)
                 rac->prob[++i] = 0;
         }
@@ -177,7 +177,15 @@ static int lag_read_prob_header(lag_rac *rac, GetBitContext *gb)
 
     if (cumul_prob & (cumul_prob - 1)) {
         uint64_t mul = softfloat_reciprocal(cumul_prob);
-        for (i = 1; i < 257; i++) {
+        for (i = 1; i <= 128; i++) {
+            rac->prob[i] = softfloat_mul(rac->prob[i], mul);
+            scaled_cumul_prob += rac->prob[i];
+        }
+        if (scaled_cumul_prob <= 0) {
+            av_log(rac->avctx, AV_LOG_ERROR, "Scaled probabilities invalid\n");
+            return AVERROR_INVALIDDATA;
+        }
+        for (; i < 257; i++) {
             rac->prob[i] = softfloat_mul(rac->prob[i], mul);
             scaled_cumul_prob += rac->prob[i];
         }
@@ -251,11 +259,8 @@ static void lag_pred_line(LagarithContext *l, uint8_t *buf,
     int L, TL;
 
     if (!line) {
-        int i, align_width = (width - 1) & ~31;
         /* Left prediction only for first line */
-        L = l->hdsp.add_hfyu_left_pred(buf + 1, buf + 1, align_width, buf[0]);
-        for (i = align_width + 1; i < width; i++)
-            buf[i] += buf[i - 1];
+        L = l->hdsp.add_hfyu_left_pred(buf, buf, width, 0);
     } else {
         /* Left pixel is actually prev_row[width] */
         L = buf[width - stride - 1];
@@ -281,18 +286,12 @@ static void lag_pred_line_yuy2(LagarithContext *l, uint8_t *buf,
     int L, TL;
 
     if (!line) {
-        int i, align_width;
-        if (is_luma) {
-            buf++;
-            width--;
-        }
-
-        align_width = (width - 1) & ~31;
-        l->hdsp.add_hfyu_left_pred(buf + 1, buf + 1, align_width, buf[0]);
-
-        for (i = align_width + 1; i < width; i++)
-            buf[i] += buf[i - 1];
-
+        L= buf[0];
+        if (is_luma)
+            buf[0] = 0;
+        l->hdsp.add_hfyu_left_pred(buf, buf, width, 0);
+        if (is_luma)
+            buf[0] = L;
         return;
     }
     if (line == 1) {
@@ -371,6 +370,10 @@ static int lag_decode_zero_run_line(LagarithContext *l, uint8_t *dst,
     uint8_t mask2 = -(esc_count < 3);
     uint8_t *end = dst + (width - 2);
 
+    avpriv_request_sample(l->avctx, "zero_run_line");
+
+    memset(dst, 0, width);
+
 output_zeros:
     if (l->zeros_rem) {
         count = FFMIN(l->zeros_rem, width - i);
@@ -388,7 +391,7 @@ output_zeros:
         i = 0;
         while (!zero_run && dst + i < end) {
             i++;
-            if (src + i >= src_end)
+            if (i+2 >= src_end - src)
                 return AVERROR_INVALIDDATA;
             zero_run =
                 !(src[i] | (src[i + 1] & mask1) | (src[i + 2] & mask2));
@@ -408,7 +411,7 @@ output_zeros:
             dst += i;
         }
     }
-    return src_start - src;
+    return  src - src_start;
 }
 
 
@@ -421,22 +424,30 @@ static int lag_decode_arith_plane(LagarithContext *l, uint8_t *dst,
     int read = 0;
     uint32_t length;
     uint32_t offset = 1;
-    int esc_count = src[0];
+    int esc_count;
     GetBitContext gb;
     lag_rac rac;
     const uint8_t *src_end = src + src_size;
+    int ret;
 
     rac.avctx = l->avctx;
     l->zeros = 0;
 
+    if(src_size < 2)
+        return AVERROR_INVALIDDATA;
+
+    esc_count = src[0];
     if (esc_count < 4) {
         length = width * height;
+        if(src_size < 5)
+            return AVERROR_INVALIDDATA;
         if (esc_count && AV_RL32(src + 1) < length) {
             length = AV_RL32(src + 1);
             offset += 4;
         }
 
-        init_get_bits(&gb, src + offset, src_size * 8);
+        if ((ret = init_get_bits8(&gb, src + offset, src_size - offset)) < 0)
+            return ret;
 
         if (lag_read_prob_header(&rac, &gb) < 0)
             return -1;
@@ -453,6 +464,8 @@ static int lag_decode_arith_plane(LagarithContext *l, uint8_t *dst,
                    length);
     } else if (esc_count < 8) {
         esc_count -= 4;
+        src ++;
+        src_size --;
         if (esc_count > 0) {
             /* Zero run coding only, no range coding. */
             for (i = 0; i < height; i++) {
@@ -513,7 +526,7 @@ static int lag_decode_frame(AVCodecContext *avctx,
                             void *data, int *got_frame, AVPacket *avpkt)
 {
     const uint8_t *buf = avpkt->data;
-    int buf_size = avpkt->size;
+    unsigned int buf_size = avpkt->size;
     LagarithContext *l = avctx->priv_data;
     ThreadFrame frame = { .f = data };
     AVFrame *const p  = data;
@@ -522,6 +535,7 @@ static int lag_decode_frame(AVCodecContext *avctx,
     uint32_t offs[4];
     uint8_t *srcs[4], *dst;
     int i, j, planes = 3;
+    int ret;
 
     p->key_frame = 1;
 
@@ -533,18 +547,53 @@ static int lag_decode_frame(AVCodecContext *avctx,
     switch (frametype) {
     case FRAME_SOLID_RGBA:
         avctx->pix_fmt = AV_PIX_FMT_RGB32;
+    case FRAME_SOLID_GRAY:
+        if (frametype == FRAME_SOLID_GRAY)
+            if (avctx->bits_per_coded_sample == 24) {
+                avctx->pix_fmt = AV_PIX_FMT_RGB24;
+            } else {
+                avctx->pix_fmt = AV_PIX_FMT_0RGB32;
+                planes = 4;
+            }
 
-        if (ff_thread_get_buffer(avctx, &frame, 0) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-            return -1;
-        }
+        if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
+            return ret;
 
         dst = p->data[0];
+        if (frametype == FRAME_SOLID_RGBA) {
         for (j = 0; j < avctx->height; j++) {
             for (i = 0; i < avctx->width; i++)
                 AV_WN32(dst + i * 4, offset_gu);
             dst += p->linesize[0];
         }
+        } else {
+            for (j = 0; j < avctx->height; j++) {
+                memset(dst, buf[1], avctx->width * planes);
+                dst += p->linesize[0];
+            }
+        }
+        break;
+    case FRAME_SOLID_COLOR:
+        if (avctx->bits_per_coded_sample == 24) {
+            avctx->pix_fmt = AV_PIX_FMT_RGB24;
+        } else {
+            avctx->pix_fmt = AV_PIX_FMT_RGB32;
+            offset_gu |= 0xFFU << 24;
+        }
+
+        if ((ret = ff_thread_get_buffer(avctx, &frame,0)) < 0)
+            return ret;
+
+        dst = p->data[0];
+        for (j = 0; j < avctx->height; j++) {
+            for (i = 0; i < avctx->width; i++)
+                if (avctx->bits_per_coded_sample == 24) {
+                    AV_WB24(dst + i * 3, offset_gu);
+                } else {
+                    AV_WN32(dst + i * 4, offset_gu);
+                }
+            dst += p->linesize[0];
+        }
         break;
     case FRAME_ARITH_RGBA:
         avctx->pix_fmt = AV_PIX_FMT_RGB32;
@@ -556,10 +605,8 @@ static int lag_decode_frame(AVCodecContext *avctx,
         if (frametype == FRAME_ARITH_RGB24 || frametype == FRAME_U_RGB24)
             avctx->pix_fmt = AV_PIX_FMT_RGB24;
 
-        if (ff_thread_get_buffer(avctx, &frame, 0) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-            return -1;
-        }
+        if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
+            return ret;
 
         offs[0] = offset_bv;
         offs[1] = offset_gu;
@@ -574,14 +621,13 @@ static int lag_decode_frame(AVCodecContext *avctx,
         }
         for (i = 0; i < planes; i++)
             srcs[i] = l->rgb_planes + (i + 1) * l->rgb_stride * avctx->height - l->rgb_stride;
-        if (offset_ry >= buf_size ||
-            offset_gu >= buf_size ||
-            offset_bv >= buf_size ||
-            (planes == 4 && offs[3] >= buf_size)) {
-            av_log(avctx, AV_LOG_ERROR,
-                    "Invalid frame offsets\n");
-            return AVERROR_INVALIDDATA;
-        }
+        for (i = 0; i < planes; i++)
+            if (buf_size <= offs[i]) {
+                av_log(avctx, AV_LOG_ERROR,
+                        "Invalid frame offsets\n");
+                return AVERROR_INVALIDDATA;
+            }
+
         for (i = 0; i < planes; i++)
             lag_decode_arith_plane(l, srcs[i],
                                    avctx->width, avctx->height,
@@ -615,10 +661,8 @@ static int lag_decode_frame(AVCodecContext *avctx,
     case FRAME_ARITH_YUY2:
         avctx->pix_fmt = AV_PIX_FMT_YUV422P;
 
-        if (ff_thread_get_buffer(avctx, &frame, 0) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-            return -1;
-        }
+        if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
+            return ret;
 
         if (offset_ry >= buf_size ||
             offset_gu >= buf_size ||
@@ -631,19 +675,20 @@ static int lag_decode_frame(AVCodecContext *avctx,
         lag_decode_arith_plane(l, p->data[0], avctx->width, avctx->height,
                                p->linesize[0], buf + offset_ry,
                                buf_size - offset_ry);
-        lag_decode_arith_plane(l, p->data[1], avctx->width / 2,
+        lag_decode_arith_plane(l, p->data[1], (avctx->width + 1) / 2,
                                avctx->height, p->linesize[1],
                                buf + offset_gu, buf_size - offset_gu);
-        lag_decode_arith_plane(l, p->data[2], avctx->width / 2,
+        lag_decode_arith_plane(l, p->data[2], (avctx->width + 1) / 2,
                                avctx->height, p->linesize[2],
                                buf + offset_bv, buf_size - offset_bv);
         break;
     case FRAME_ARITH_YV12:
         avctx->pix_fmt = AV_PIX_FMT_YUV420P;
 
-        if (ff_thread_get_buffer(avctx, &frame, 0) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-            return -1;
+        if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
+            return ret;
+        if (buf_size <= offset_ry || buf_size <= offset_gu || buf_size <= offset_bv) {
+            return AVERROR_INVALIDDATA;
         }
 
         if (offset_ry >= buf_size ||
@@ -657,17 +702,17 @@ static int lag_decode_frame(AVCodecContext *avctx,
         lag_decode_arith_plane(l, p->data[0], avctx->width, avctx->height,
                                p->linesize[0], buf + offset_ry,
                                buf_size - offset_ry);
-        lag_decode_arith_plane(l, p->data[2], avctx->width / 2,
-                               avctx->height / 2, p->linesize[2],
+        lag_decode_arith_plane(l, p->data[2], (avctx->width + 1) / 2,
+                               (avctx->height + 1) / 2, p->linesize[2],
                                buf + offset_gu, buf_size - offset_gu);
-        lag_decode_arith_plane(l, p->data[1], avctx->width / 2,
-                               avctx->height / 2, p->linesize[1],
+        lag_decode_arith_plane(l, p->data[1], (avctx->width + 1) / 2,
+                               (avctx->height + 1) / 2, p->linesize[1],
                                buf + offset_bv, buf_size - offset_bv);
         break;
     default:
         av_log(avctx, AV_LOG_ERROR,
                "Unsupported Lagarith frame type: %#"PRIx8"\n", frametype);
-        return -1;
+        return AVERROR_PATCHWELCOME;
     }
 
     *got_frame = 1;
diff --git a/libavcodec/lagarithrac.c b/libavcodec/lagarithrac.c
index f9e4e5c..3d36d1b 100644
--- a/libavcodec/lagarithrac.c
+++ b/libavcodec/lagarithrac.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2009 Nathan Caldwell <saintdev (at) gmail.com>
  * Copyright (c) 2009 David Conrad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -41,19 +41,16 @@ void ff_lag_rac_init(lag_rac *l, GetBitContext *gb, int length)
     left                = get_bits_left(gb) >> 3;
     l->bytestream_start =
     l->bytestream       = gb->buffer + get_bits_count(gb) / 8;
-    l->bytestream_end   = l->bytestream_start + FFMIN(length, left);
+    l->bytestream_end   = l->bytestream_start + left;
 
     l->range        = 0x80;
     l->low          = *l->bytestream >> 1;
-    l->hash_shift   = FFMAX(l->scale, 8) - 8;
+    l->hash_shift   = FFMAX(l->scale, 10) - 10;
 
-    for (i = j = 0; i < 256; i++) {
+    for (i = j = 0; i < 1024; i++) {
         unsigned r = i << l->hash_shift;
         while (l->prob[j + 1] <= r)
             j++;
         l->range_hash[i] = j;
     }
-
-    /* Add conversion factor to hash_shift so we don't have to in lag_get_rac. */
-    l->hash_shift += 23;
 }
diff --git a/libavcodec/lagarithrac.h b/libavcodec/lagarithrac.h
index e4f066e..dfdfea0 100644
--- a/libavcodec/lagarithrac.h
+++ b/libavcodec/lagarithrac.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2009 Nathan Caldwell <saintdev (at) gmail.com>
  * Copyright (c) 2009 David Conrad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -48,7 +48,7 @@ typedef struct lag_rac {
     const uint8_t *bytestream_end;    /**< End position of input bytestream. */
 
     uint32_t prob[258];         /**< Table of cumulative probability for each symbol. */
-    uint8_t  range_hash[256];   /**< Hash table mapping upper byte to approximate symbol. */
+    uint8_t  range_hash[1024];   /**< Hash table mapping upper 10 bits to approximate symbol. */
 } lag_rac;
 
 void ff_lag_rac_init(lag_rac *l, GetBitContext *gb, int length);
@@ -72,9 +72,8 @@ static inline void lag_rac_refill(lag_rac *l)
  */
 static inline uint8_t lag_get_rac(lag_rac *l)
 {
-    unsigned range_scaled, low_scaled, div;
+    unsigned range_scaled, low_scaled;
     int val;
-    uint8_t shift;
 
     lag_rac_refill(l);
 
@@ -85,18 +84,9 @@ static inline uint8_t lag_get_rac(lag_rac *l)
         if (l->low < range_scaled * l->prob[1]) {
             val = 0;
         } else {
-            /* FIXME __builtin_clz is ~20% faster here, but not allowed in generic code. */
-            shift = 30 - av_log2(range_scaled);
-            div = ((range_scaled << shift) + (1 << 23) - 1) >> 23;
-            /* low>>24 ensures that any cases too big for exact FASTDIV are
-             * under- rather than over-estimated
-             */
-            low_scaled = FASTDIV(l->low - (l->low >> 24), div);
-            shift -= l->hash_shift;
-            shift &= 31;
-            low_scaled = (low_scaled << shift) | (low_scaled >> (32 - shift));
-            /* low_scaled is now a lower bound of low/range_scaled */
-            val = l->range_hash[(uint8_t) low_scaled];
+            low_scaled = l->low / (range_scaled<<(l->hash_shift));
+
+            val = l->range_hash[low_scaled];
             while (l->low >= range_scaled * l->prob[val + 1])
                 val++;
         }
diff --git a/libavcodec/latm_parser.c b/libavcodec/latm_parser.c
index 6fdb897..3820f58 100644
--- a/libavcodec/latm_parser.c
+++ b/libavcodec/latm_parser.c
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2008 Paul Kendall <paul@kcbbs.gen.nz>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -50,7 +50,6 @@ static int latm_find_frame_end(AVCodecParserContext *s1, const uint8_t *buf,
     pic_found = pc->frame_start_found;
     state     = pc->state;
 
-    i = 0;
     if (!pic_found) {
         for (i = 0; i < buf_size; i++) {
             state = (state<<8) | buf[i];
diff --git a/libavcodec/lcl.h b/libavcodec/lcl.h
index 4e7e170..b60c0e9 100644
--- a/libavcodec/lcl.h
+++ b/libavcodec/lcl.h
@@ -2,20 +2,20 @@
  * LCL (LossLess Codec Library) Codec
  * Copyright (c) 2002-2004 Roberto Togni
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/lcldec.c b/libavcodec/lcldec.c
index ba608f9..b1335e1 100644
--- a/libavcodec/lcldec.c
+++ b/libavcodec/lcldec.c
@@ -2,20 +2,20 @@
  * LCL (LossLess Codec Library) Codec
  * Copyright (c) 2002-2004 Roberto Togni
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -41,6 +41,7 @@
 #include <stdlib.h>
 
 #include "libavutil/mem.h"
+#include "libavutil/pixdesc.h"
 #include "avcodec.h"
 #include "bytestream.h"
 #include "internal.h"
@@ -91,7 +92,13 @@ static unsigned int mszh_decomp(const unsigned char * srcptr, int srclen, unsign
             ofs = FFMIN(ofs, destptr - destptr_bak);
             cnt *= 4;
             cnt = FFMIN(cnt, destptr_end - destptr);
-            av_memcpy_backptr(destptr, ofs, cnt);
+            if (ofs) {
+                av_memcpy_backptr(destptr, ofs, cnt);
+            } else {
+                // Not known what the correct behaviour is, but
+                // this at least avoids uninitialized data.
+                memset(destptr, 0, cnt);
+            }
             destptr += cnt;
         }
         maskbit >>= 1;
@@ -128,7 +135,7 @@ static int zlib_decomp(AVCodecContext *avctx, const uint8_t *src, int src_len, i
         av_log(avctx, AV_LOG_ERROR, "Inflate reset error: %d\n", zret);
         return AVERROR_UNKNOWN;
     }
-    c->zstream.next_in = src;
+    c->zstream.next_in = (uint8_t *)src;
     c->zstream.avail_in = src_len;
     c->zstream.next_out = c->decomp_buf + offset;
     c->zstream.avail_out = c->decomp_size - offset;
@@ -155,7 +162,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     LclDecContext * const c = avctx->priv_data;
     unsigned int pixel_ptr;
     int row, col;
-    unsigned char *encoded, *outptr;
+    unsigned char *encoded = avpkt->data, *outptr;
     uint8_t *y_out, *u_out, *v_out;
     unsigned int width = avctx->width; // Real image width
     unsigned int height = avctx->height; // Real image height
@@ -164,11 +171,10 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     int uqvq, ret;
     unsigned int mthread_inlen, mthread_outlen;
     unsigned int len = buf_size;
+    int linesize;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     outptr = frame->data[0]; // Output image pointer
 
@@ -177,8 +183,15 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     case AV_CODEC_ID_MSZH:
         switch (c->compression) {
         case COMP_MSZH:
-            if (c->flags & FLAG_MULTITHREAD) {
+            if (c->imgtype == IMGTYPE_RGB24 && len == FFALIGN(width * 3, 4) * height ||
+                c->imgtype == IMGTYPE_YUV111 && len == width * height * 3) {
+                ;
+            } else if (c->flags & FLAG_MULTITHREAD) {
                 mthread_inlen = AV_RL32(buf);
+                if (len < 8) {
+                    av_log(avctx, AV_LOG_ERROR, "len %d is too small\n", len);
+                    return AVERROR_INVALIDDATA;
+                }
                 mthread_inlen = FFMIN(mthread_inlen, len - 8);
                 mthread_outlen = AV_RL32(buf + 4);
                 mthread_outlen = FFMIN(mthread_outlen, c->decomp_size);
@@ -390,10 +403,11 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
         }
         break;
     case IMGTYPE_RGB24:
+        linesize = len < FFALIGN(3 * width, 4) * height ? 3 * width : FFALIGN(3 * width, 4);
         for (row = height - 1; row >= 0; row--) {
             pixel_ptr = row * frame->linesize[0];
             memcpy(outptr + pixel_ptr, encoded, 3 * width);
-            encoded += 3 * width;
+            encoded += linesize;
         }
         break;
     case IMGTYPE_YUV411:
@@ -457,6 +471,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
     unsigned int max_basesize = FFALIGN(avctx->width,  4) *
                                 FFALIGN(avctx->height, 4);
     unsigned int max_decomp_size;
+    int subsample_h, subsample_v;
 
     if (avctx->extradata_size < 8) {
         av_log(avctx, AV_LOG_ERROR, "Extradata size too small.\n");
@@ -482,6 +497,10 @@ static av_cold int decode_init(AVCodecContext *avctx)
         max_decomp_size = max_basesize * 2;
         avctx->pix_fmt = AV_PIX_FMT_YUV422P;
         av_log(avctx, AV_LOG_DEBUG, "Image type is YUV 4:2:2.\n");
+        if (avctx->width % 4) {
+            avpriv_request_sample(avctx, "Unsupported dimensions");
+            return AVERROR_INVALIDDATA;
+        }
         break;
     case IMGTYPE_RGB24:
         c->decomp_size = basesize * 3;
@@ -512,6 +531,12 @@ static av_cold int decode_init(AVCodecContext *avctx)
         return AVERROR_INVALIDDATA;
     }
 
+    av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &subsample_h, &subsample_v);
+    if (avctx->width % (1<<subsample_h) || avctx->height % (1<<subsample_v)) {
+        avpriv_request_sample(avctx, "Unsupported dimensions");
+        return AVERROR_INVALIDDATA;
+    }
+
     /* Detect compression method */
     c->compression = (int8_t)avctx->extradata[5];
     switch (avctx->codec_id) {
diff --git a/libavcodec/lclenc.c b/libavcodec/lclenc.c
index 2e00807..357313d 100644
--- a/libavcodec/lclenc.c
+++ b/libavcodec/lclenc.c
@@ -2,20 +2,20 @@
  * LCL (LossLess Codec Library) Codec
  * Copyright (c) 2002-2004 Roberto Togni
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,6 +40,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "internal.h"
 #include "lcl.h"
@@ -62,19 +63,15 @@ typedef struct LclEncContext {
 } LclEncContext;
 
 static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
-                        const AVFrame *pict, int *got_packet)
+                        const AVFrame *p, int *got_packet)
 {
     LclEncContext *c = avctx->priv_data;
-    const AVFrame * const p = pict;
     int i, ret;
     int zret; // Zlib return code
     int max_size = deflateBound(&c->zstream, avctx->width * avctx->height * 3);
 
-    if (!pkt->data &&
-        (ret = av_new_packet(pkt, max_size)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "Error allocating packet of size %d.\n", max_size);
-            return ret;
-    }
+    if ((ret = ff_alloc_packet2(avctx, pkt, max_size, 0)) < 0)
+        return ret;
 
     if(avctx->pix_fmt != AV_PIX_FMT_BGR24){
         av_log(avctx, AV_LOG_ERROR, "Format not supported!\n");
@@ -118,9 +115,9 @@ static av_cold int encode_init(AVCodecContext *avctx)
 
     c->avctx= avctx;
 
-    assert(avctx->width && avctx->height);
+    av_assert0(avctx->width && avctx->height);
 
-    avctx->extradata= av_mallocz(8);
+    avctx->extradata = av_mallocz(8 + AV_INPUT_BUFFER_PADDING_SIZE);
     if (!avctx->extradata)
         return AVERROR(ENOMEM);
 
@@ -131,8 +128,9 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
-    // Will be user settable someday
-    c->compression = 6;
+    c->compression = avctx->compression_level == FF_COMPRESSION_DEFAULT ?
+                            COMP_ZLIB_NORMAL :
+                            av_clip(avctx->compression_level, 0, 9);
     c->flags = 0;
     c->imgtype = IMGTYPE_RGB24;
     avctx->bits_per_coded_sample= 24;
@@ -178,6 +176,7 @@ AVCodec ff_zlib_encoder = {
     .init           = encode_init,
     .encode2        = encode_frame,
     .close          = encode_end,
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
     .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_BGR24, AV_PIX_FMT_NONE },
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE |
                       FF_CODEC_CAP_INIT_CLEANUP,
diff --git a/libavcodec/libavcodec.v b/libavcodec/libavcodec.v
index bf14807..c923cd3 100644
--- a/libavcodec/libavcodec.v
+++ b/libavcodec/libavcodec.v
@@ -1,4 +1,7 @@
 LIBAVCODEC_$MAJOR {
         global: av*;
+                #deprecated, remove after next bump
+                audio_resample;
+                audio_resample_close;
         local:  *;
 };
diff --git a/libavcodec/libcelt_dec.c b/libavcodec/libcelt_dec.c
new file mode 100644
index 0000000..878e4cc
--- /dev/null
+++ b/libavcodec/libcelt_dec.c
@@ -0,0 +1,140 @@
+/*
+ * Xiph CELT decoder using libcelt
+ * Copyright (c) 2011 Nicolas George
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <celt/celt.h>
+#include <celt/celt_header.h>
+#include "avcodec.h"
+#include "internal.h"
+#include "libavutil/intreadwrite.h"
+
+struct libcelt_context {
+    CELTMode *mode;     /* created by celt_mode_create() in libcelt_dec_init() */
+    CELTDecoder *dec;   /* created from mode by celt_decoder_create_custom() */
+    int discard;        /* per-channel samples to drop from the next decoded frame; read from extradata, cleared after use */
+};
+
+static int ff_celt_error_to_averror(int err)
+{
+    switch (err) { /* translate a libcelt status code into an AVERROR value */
+        case CELT_BAD_ARG:          return AVERROR(EINVAL);
+#ifdef CELT_BUFFER_TOO_SMALL
+        case CELT_BUFFER_TOO_SMALL: return AVERROR(ENOBUFS); /* compiled only when the macro is defined */
+#endif
+        case CELT_INTERNAL_ERROR:   return AVERROR(EFAULT);
+        case CELT_CORRUPTED_DATA:   return AVERROR_INVALIDDATA;
+        case CELT_UNIMPLEMENTED:    return AVERROR(ENOSYS);
+#ifdef ENOTRECOVERABLE
+        case CELT_INVALID_STATE:    return AVERROR(ENOTRECOVERABLE); /* without ENOTRECOVERABLE this code is handled by default */
+#endif
+        case CELT_ALLOC_FAIL:       return AVERROR(ENOMEM);
+        default:                    return AVERROR(EINVAL); /* every code not listed above */
+    }
+}
+
+static int ff_celt_bitstream_version_hack(CELTMode *mode)
+{
+    CELTHeader header = { .version_id = 0 }; /* 0 is what gets returned if celt_header_init() leaves the field untouched */
+    celt_header_init(&header, mode, 960, 2); /* NOTE(review): 960 and 2 look like placeholder frame size / channel count -- confirm; only version_id is read back */
+    return header.version_id; /* compared against the extradata version in libcelt_dec_init() */
+}
+
+static av_cold int libcelt_dec_init(AVCodecContext *c)
+{
+    struct libcelt_context *celt = c->priv_data;
+    int err;
+
+    if (!c->channels || !c->frame_size || /* both must already be set by the caller; neither is derived here */
+        c->frame_size > INT_MAX / sizeof(int16_t) / c->channels) /* keeps frame_size * channels * sizeof(int16_t) within INT_MAX */
+        return AVERROR(EINVAL);
+    celt->mode = celt_mode_create(c->sample_rate, c->frame_size, &err);
+    if (!celt->mode)
+        return ff_celt_error_to_averror(err);
+    celt->dec = celt_decoder_create_custom(celt->mode, c->channels, &err);
+    if (!celt->dec) {
+        celt_mode_destroy(celt->mode); /* undo the mode creation above before failing */
+        return ff_celt_error_to_averror(err);
+    }
+    if (c->extradata_size >= 4) { /* optional extradata bytes 0-3 (little-endian): samples to discard */
+        celt->discard = AV_RL32(c->extradata);
+        if (celt->discard < 0 || celt->discard >= c->frame_size) { /* outside [0, frame_size): warn and discard nothing */
+            av_log(c, AV_LOG_WARNING,
+                   "Invalid overlap (%d), ignored.\n", celt->discard);
+            celt->discard = 0;
+        }
+    }
+    if (c->extradata_size >= 8) { /* optional extradata bytes 4-7 (little-endian): bitstream version */
+        unsigned version = AV_RL32(c->extradata + 4);
+        unsigned lib_version = ff_celt_bitstream_version_hack(celt->mode);
+        if (version != lib_version) /* a mismatch only warns; init still succeeds */
+            av_log(c, AV_LOG_WARNING,
+                   "CELT bitstream version 0x%x may be "
+                   "improperly decoded by libcelt for version 0x%x.\n",
+                   version, lib_version);
+    }
+    c->sample_fmt = AV_SAMPLE_FMT_S16; /* libcelt_dec_decode() writes int16_t samples into frame->data[0] */
+    return 0;
+}
+
+static av_cold int libcelt_dec_close(AVCodecContext *c)
+{
+    struct libcelt_context *celt = c->priv_data;
+
+    celt_decoder_destroy(celt->dec); /* decoder first: it was created from the mode */
+    celt_mode_destroy(celt->mode);
+    return 0; /* always reports success */
+}
+
+static int libcelt_dec_decode(AVCodecContext *c, void *data,
+                              int *got_frame_ptr, AVPacket *pkt)
+{
+    struct libcelt_context *celt = c->priv_data;
+    AVFrame *frame = data;
+    int err;
+    int16_t *pcm;
+
+    frame->nb_samples = c->frame_size; /* every packet is decoded into frame_size samples */
+    if ((err = ff_get_buffer(c, frame, 0)) < 0)
+        return err;
+    pcm = (int16_t *)frame->data[0]; /* single buffer, indexed below as channel-interleaved 16-bit samples */
+    err = celt_decode(celt->dec, pkt->data, pkt->size, pcm, c->frame_size);
+    if (err < 0)
+        return ff_celt_error_to_averror(err);
+    if (celt->discard) { /* one-shot: drop the leading samples requested via extradata */
+        frame->nb_samples -= celt->discard;
+        memmove(pcm, pcm + celt->discard * c->channels, /* discard counts samples per channel */
+                frame->nb_samples * c->channels * sizeof(int16_t));
+        celt->discard = 0; /* later frames are returned in full */
+    }
+    *got_frame_ptr = 1;
+    return pkt->size; /* the whole packet is reported as consumed */
+}
+
+AVCodec ff_libcelt_decoder = { /* codec table entry wiring up the libcelt callbacks defined above */
+    .name           = "libcelt",
+    .long_name      = NULL_IF_CONFIG_SMALL("Xiph CELT decoder using libcelt"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_CELT,
+    .priv_data_size = sizeof(struct libcelt_context), /* the callbacks read this state through c->priv_data */
+    .init           = libcelt_dec_init,
+    .close          = libcelt_dec_close,
+    .decode         = libcelt_dec_decode,
+    .capabilities   = AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/libdcadec.c b/libavcodec/libdcadec.c
deleted file mode 100644
index b88f807..0000000
--- a/libavcodec/libdcadec.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
- * libdcadec decoder wrapper
- * Copyright (C) 2015 Hendrik Leppkes
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <libdcadec/dca_context.h>
-
-#include "libavutil/channel_layout.h"
-#include "libavutil/common.h"
-#include "libavutil/opt.h"
-
-#include "avcodec.h"
-#include "dca.h"
-#include "dca_syncwords.h"
-#include "internal.h"
-
-typedef struct DCADecContext {
-    struct dcadec_context *ctx;
-    uint8_t *buffer;
-    int buffer_size;
-} DCADecContext;
-
-static int dcadec_decode_frame(AVCodecContext *avctx, void *data,
-                               int *got_frame_ptr, AVPacket *avpkt)
-{
-    DCADecContext *s = avctx->priv_data;
-    AVFrame *frame = data;
-    int ret, i, k;
-    int **samples, nsamples, channel_mask, sample_rate, bits_per_sample, profile;
-    uint32_t mrk;
-    uint8_t *input = avpkt->data;
-    int input_size = avpkt->size;
-
-    /* convert bytestream syntax to RAW BE format if required */
-    if (input_size < 8) {
-        av_log(avctx, AV_LOG_ERROR, "Input size too small\n");
-        return AVERROR_INVALIDDATA;
-    }
-    mrk = AV_RB32(input);
-    if (mrk != DCA_SYNCWORD_CORE_BE && mrk != DCA_SYNCWORD_SUBSTREAM) {
-        s->buffer = av_fast_realloc(s->buffer, &s->buffer_size, avpkt->size + AV_INPUT_BUFFER_PADDING_SIZE);
-        if (!s->buffer)
-            return AVERROR(ENOMEM);
-
-        if ((ret = ff_dca_convert_bitstream(avpkt->data, avpkt->size, s->buffer, s->buffer_size)) < 0)
-            return ret;
-
-        input      = s->buffer;
-        input_size = ret;
-    }
-
-    if ((ret = dcadec_context_parse(s->ctx, input, input_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "dcadec_context_parse() failed: %d (%s)\n", -ret, dcadec_strerror(ret));
-        return AVERROR_UNKNOWN;
-    }
-    if ((ret = dcadec_context_filter(s->ctx, &samples, &nsamples, &channel_mask,
-                                     &sample_rate, &bits_per_sample, &profile)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "dcadec_context_filter() failed: %d (%s)\n", -ret, dcadec_strerror(ret));
-        return AVERROR_UNKNOWN;
-    }
-
-    avctx->channels       = av_get_channel_layout_nb_channels(channel_mask);
-    avctx->channel_layout = channel_mask;
-    avctx->sample_rate    = sample_rate;
-
-    if (bits_per_sample == 16)
-        avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
-    else if (bits_per_sample <= 24)
-        avctx->sample_fmt = AV_SAMPLE_FMT_S32P;
-    else {
-        av_log(avctx, AV_LOG_ERROR, "Unsupported number of bits per sample: %d\n",
-               bits_per_sample);
-        return AVERROR(ENOSYS);
-    }
-
-    avctx->bits_per_raw_sample = bits_per_sample;
-
-    switch (profile) {
-    case DCADEC_PROFILE_DS:
-        avctx->profile = FF_PROFILE_DTS;
-        break;
-    case DCADEC_PROFILE_DS_96_24:
-        avctx->profile = FF_PROFILE_DTS_96_24;
-        break;
-    case DCADEC_PROFILE_DS_ES:
-        avctx->profile = FF_PROFILE_DTS_ES;
-        break;
-    case DCADEC_PROFILE_HD_HRA:
-        avctx->profile = FF_PROFILE_DTS_HD_HRA;
-        break;
-    case DCADEC_PROFILE_HD_MA:
-        avctx->profile = FF_PROFILE_DTS_HD_MA;
-        break;
-    case DCADEC_PROFILE_EXPRESS:
-        avctx->profile = FF_PROFILE_DTS_EXPRESS;
-        break;
-    case DCADEC_PROFILE_UNKNOWN:
-    default:
-        avctx->profile = FF_PROFILE_UNKNOWN;
-        break;
-    }
-
-    /* bitrate is only meaningful if there are no HD extensions, as they distort the bitrate */
-    if (profile == DCADEC_PROFILE_DS || profile == DCADEC_PROFILE_DS_96_24 || profile == DCADEC_PROFILE_DS_ES) {
-        struct dcadec_core_info *info = dcadec_context_get_core_info(s->ctx);
-        avctx->bit_rate = info->bit_rate;
-        dcadec_context_free_core_info(info);
-    } else
-        avctx->bit_rate = 0;
-
-    frame->nb_samples = nsamples;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
-        return ret;
-
-    for (i = 0; i < avctx->channels; i++) {
-        if (frame->format == AV_SAMPLE_FMT_S16P) {
-            int16_t *plane = (int16_t *)frame->extended_data[i];
-            for (k = 0; k < nsamples; k++)
-                plane[k] = samples[i][k];
-        } else {
-            int32_t *plane = (int32_t *)frame->extended_data[i];
-            int shift = 32 - bits_per_sample;
-            for (k = 0; k < nsamples; k++)
-                plane[k] = samples[i][k] << shift;
-        }
-    }
-
-    *got_frame_ptr = 1;
-
-    return avpkt->size;
-}
-
-static av_cold void dcadec_flush(AVCodecContext *avctx)
-{
-    DCADecContext *s = avctx->priv_data;
-    dcadec_context_clear(s->ctx);
-}
-
-static av_cold int dcadec_close(AVCodecContext *avctx)
-{
-    DCADecContext *s = avctx->priv_data;
-
-    dcadec_context_destroy(s->ctx);
-    s->ctx = NULL;
-
-    av_freep(&s->buffer);
-
-    return 0;
-}
-
-static av_cold int dcadec_init(AVCodecContext *avctx)
-{
-    DCADecContext *s = avctx->priv_data;
-
-    s->ctx = dcadec_context_create(0);
-    if (!s->ctx)
-        return AVERROR(ENOMEM);
-
-    avctx->sample_fmt = AV_SAMPLE_FMT_S32P;
-    avctx->bits_per_raw_sample = 24;
-
-    return 0;
-}
-
-static const AVProfile profiles[] = {
-    { FF_PROFILE_DTS,         "DTS"         },
-    { FF_PROFILE_DTS_ES,      "DTS-ES"      },
-    { FF_PROFILE_DTS_96_24,   "DTS 96/24"   },
-    { FF_PROFILE_DTS_HD_HRA,  "DTS-HD HRA"  },
-    { FF_PROFILE_DTS_HD_MA,   "DTS-HD MA"   },
-    { FF_PROFILE_DTS_EXPRESS, "DTS Express" },
-    { FF_PROFILE_UNKNOWN },
-};
-
-AVCodec ff_libdcadec_decoder = {
-    .name           = "libdcadec",
-    .long_name      = NULL_IF_CONFIG_SMALL("dcadec DCA decoder"),
-    .type           = AVMEDIA_TYPE_AUDIO,
-    .id             = AV_CODEC_ID_DTS,
-    .priv_data_size = sizeof(DCADecContext),
-    .init           = dcadec_init,
-    .decode         = dcadec_decode_frame,
-    .close          = dcadec_close,
-    .flush          = dcadec_flush,
-    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_CHANNEL_CONF,
-    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S32P, AV_SAMPLE_FMT_S16P,
-                                                      AV_SAMPLE_FMT_NONE },
-    .profiles       = NULL_IF_CONFIG_SMALL(profiles),
-};
diff --git a/libavcodec/libfaac.c b/libavcodec/libfaac.c
index 5cdbe27..98b3ba8 100644
--- a/libavcodec/libfaac.c
+++ b/libavcodec/libfaac.c
@@ -2,20 +2,20 @@
  * Interface to libfaac for aac encoding
  * Copyright (c) 2002 Gildas Bazin <gbazin@netcourrier.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -41,7 +41,6 @@ typedef struct FaacAudioContext {
     AudioFrameQueue afq;
 } FaacAudioContext;
 
-
 static av_cold int Faac_encode_close(AVCodecContext *avctx)
 {
     FaacAudioContext *s = avctx->priv_data;
@@ -152,9 +151,20 @@ static av_cold int Faac_encode_init(AVCodecContext *avctx)
     }
 
     if (!faacEncSetConfiguration(s->faac_handle, faac_cfg)) {
-        av_log(avctx, AV_LOG_ERROR, "libfaac doesn't support this output format!\n");
-        ret = AVERROR(EINVAL);
-        goto error;
+        int i;
+        for (i = avctx->bit_rate/1000; i ; i--) {
+            faac_cfg->bitRate = 1000*i / avctx->channels;
+            if (faacEncSetConfiguration(s->faac_handle, faac_cfg))
+                break;
+        }
+        if (!i) {
+            av_log(avctx, AV_LOG_ERROR, "libfaac doesn't support this output format!\n");
+            ret = AVERROR(EINVAL);
+            goto error;
+        } else {
+            avctx->bit_rate = 1000*i;
+            av_log(avctx, AV_LOG_WARNING, "libfaac doesn't support the specified bitrate, using %dkbit/s instead\n", i);
+        }
     }
 
     avctx->initial_padding = FAAC_DELAY_SAMPLES;
@@ -174,10 +184,8 @@ static int Faac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     int num_samples  = frame ? frame->nb_samples : 0;
     void *samples    = frame ? frame->data[0]    : NULL;
 
-    if ((ret = ff_alloc_packet(avpkt, (7 + 768) * avctx->channels))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, (7 + 768) * avctx->channels, 0)) < 0)
         return ret;
-    }
 
     bytes_written = faacEncEncode(s->faac_handle, samples,
                                   num_samples * avctx->channels,
diff --git a/libavcodec/libfdk-aacdec.c b/libavcodec/libfdk-aacdec.c
index cdf7a05..e5f7c4e 100644
--- a/libavcodec/libfdk-aacdec.c
+++ b/libavcodec/libfdk-aacdec.c
@@ -2,7 +2,7 @@
  * AAC decoder wrapper
  * Copyright (c) 2012 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -44,7 +44,7 @@ typedef struct FDKAACDecContext {
     uint8_t *decoder_buffer;
     int decoder_buffer_size;
     uint8_t *anc_buffer;
-    enum ConcealMethod conceal_method;
+    int conceal_method;
     int drc_level;
     int drc_boost;
     int drc_heavy;
@@ -199,8 +199,8 @@ static av_cold int fdk_aac_decode_close(AVCodecContext *avctx)
 
     if (s->handle)
         aacDecoder_Close(s->handle);
-    av_free(s->decoder_buffer);
-    av_free(s->anc_buffer);
+    av_freep(&s->decoder_buffer);
+    av_freep(&s->anc_buffer);
 
     return 0;
 }
@@ -341,10 +341,9 @@ static int fdk_aac_decode_frame(AVCodecContext *avctx, void *data,
         goto end;
     frame->nb_samples = avctx->frame_size;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "ff_get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         goto end;
-    }
+
     memcpy(frame->extended_data[0], s->decoder_buffer,
            avctx->channels * avctx->frame_size *
            av_get_bytes_per_sample(avctx->sample_fmt));
diff --git a/libavcodec/libfdk-aacenc.c b/libavcodec/libfdk-aacenc.c
index 2cea58f..98a817b 100644
--- a/libavcodec/libfdk-aacenc.c
+++ b/libavcodec/libfdk-aacenc.c
@@ -2,7 +2,7 @@
  * AAC encoder wrapper
  * Copyright (c) 2012 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -215,8 +215,8 @@ static av_cold int aac_encode_init(AVCodecContext *avctx)
         }
         if ((err = aacEncoder_SetParam(s->handle, AACENC_BITRATE,
                                        avctx->bit_rate)) != AACENC_OK) {
-            av_log(avctx, AV_LOG_ERROR, "Unable to set the bitrate %d: %s\n",
-                   avctx->bit_rate, aac_get_error(err));
+            av_log(avctx, AV_LOG_ERROR, "Unable to set the bitrate %"PRId64": %s\n",
+                   (int64_t)avctx->bit_rate, aac_get_error(err));
             goto error;
         }
     }
@@ -342,10 +342,8 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     }
 
     /* The maximum packet size is 6144 bits aka 768 bytes per channel. */
-    if ((ret = ff_alloc_packet(avpkt, FFMAX(8192, 768 * avctx->channels)))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, FFMAX(8192, 768 * avctx->channels), 0)) < 0)
         return ret;
-    }
 
     out_ptr                   = avpkt->data;
     out_buffer_size           = avpkt->size;
diff --git a/libavcodec/libgsmdec.c b/libavcodec/libgsmdec.c
index 4c21ff6..a503215 100644
--- a/libavcodec/libgsmdec.c
+++ b/libavcodec/libgsmdec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2005 Alban Bedel <albeu@free.fr>
  * Copyright (c) 2006, 2007 Michel Bardiaux <mbardiaux@mediaxim.be>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -50,7 +50,8 @@ static av_cold int libgsm_decode_init(AVCodecContext *avctx) {
 
     avctx->channels       = 1;
     avctx->channel_layout = AV_CH_LAYOUT_MONO;
-    avctx->sample_rate    = 8000;
+    if (!avctx->sample_rate)
+        avctx->sample_rate = 8000;
     avctx->sample_fmt     = AV_SAMPLE_FMT_S16;
 
     s->state = gsm_create();
@@ -96,10 +97,8 @@ static int libgsm_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = avctx->frame_size;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples = (int16_t *)frame->data[0];
 
     for (i = 0; i < avctx->frame_size / GSM_FRAME_SIZE; i++) {
@@ -124,6 +123,7 @@ static void libgsm_flush(AVCodecContext *avctx) {
         gsm_option(s->state, GSM_OPT_WAV49, &one);
 }
 
+#if CONFIG_LIBGSM_DECODER
 AVCodec ff_libgsm_decoder = {
     .name           = "libgsm",
     .long_name      = NULL_IF_CONFIG_SMALL("libgsm GSM"),
@@ -136,7 +136,8 @@ AVCodec ff_libgsm_decoder = {
     .flush          = libgsm_flush,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
-
+#endif
+#if CONFIG_LIBGSM_MS_DECODER
 AVCodec ff_libgsm_ms_decoder = {
     .name           = "libgsm_ms",
     .long_name      = NULL_IF_CONFIG_SMALL("libgsm GSM Microsoft variant"),
@@ -149,3 +150,4 @@ AVCodec ff_libgsm_ms_decoder = {
     .flush          = libgsm_flush,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
+#endif
diff --git a/libavcodec/libgsmenc.c b/libavcodec/libgsmenc.c
index 8f51321..69ce439 100644
--- a/libavcodec/libgsmenc.c
+++ b/libavcodec/libgsmenc.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2005 Alban Bedel <albeu@free.fr>
  * Copyright (c) 2006, 2007 Michel Bardiaux <mbardiaux@mediaxim.be>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,6 +40,12 @@
 #include "internal.h"
 #include "gsm.h"
 
+static av_cold int libgsm_encode_close(AVCodecContext *avctx) { /* defined ahead of libgsm_encode_init() so its error path can call it */
+    gsm_destroy(avctx->priv_data); /* priv_data holds the state returned by gsm_create() */
+    avctx->priv_data = NULL; /* make sure the freed state cannot be destroyed twice */
+    return 0;
+}
+
 static av_cold int libgsm_encode_init(AVCodecContext *avctx) {
     if (avctx->channels > 1) {
         av_log(avctx, AV_LOG_ERROR, "Mono required for GSM, got %d channels\n",
@@ -56,13 +62,15 @@ static av_cold int libgsm_encode_init(AVCodecContext *avctx) {
     if (avctx->bit_rate != 13000 /* Official */ &&
         avctx->bit_rate != 13200 /* Very common */ &&
         avctx->bit_rate != 0 /* Unknown; a.o. mov does not set bitrate when decoding */ ) {
-        av_log(avctx, AV_LOG_ERROR, "Bitrate 13000bps required for GSM, got %dbps\n",
-               avctx->bit_rate);
+        av_log(avctx, AV_LOG_ERROR, "Bitrate 13000bps required for GSM, got %"PRId64"bps\n",
+               (int64_t)avctx->bit_rate);
         if (avctx->strict_std_compliance > FF_COMPLIANCE_UNOFFICIAL)
             return -1;
     }
 
     avctx->priv_data = gsm_create();
+    if (!avctx->priv_data)
+        goto error;
 
     switch(avctx->codec_id) {
     case AV_CODEC_ID_GSM:
@@ -78,12 +86,9 @@ static av_cold int libgsm_encode_init(AVCodecContext *avctx) {
     }
 
     return 0;
-}
-
-static av_cold int libgsm_encode_close(AVCodecContext *avctx) {
-    gsm_destroy(avctx->priv_data);
-    avctx->priv_data = NULL;
-    return 0;
+error:
+    libgsm_encode_close(avctx);
+    return -1;
 }
 
 static int libgsm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
@@ -93,10 +98,8 @@ static int libgsm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     gsm_signal *samples = (gsm_signal *)frame->data[0];
     struct gsm_state *state = avctx->priv_data;
 
-    if ((ret = ff_alloc_packet(avpkt, avctx->block_align))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, avctx->block_align, 0)) < 0)
         return ret;
-    }
 
     switch(avctx->codec_id) {
     case AV_CODEC_ID_GSM:
@@ -112,6 +115,7 @@ static int libgsm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
 }
 
 
+#if CONFIG_LIBGSM_ENCODER
 AVCodec ff_libgsm_encoder = {
     .name           = "libgsm",
     .long_name      = NULL_IF_CONFIG_SMALL("libgsm GSM"),
@@ -123,7 +127,8 @@ AVCodec ff_libgsm_encoder = {
     .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
                                                      AV_SAMPLE_FMT_NONE },
 };
-
+#endif
+#if CONFIG_LIBGSM_MS_ENCODER
 AVCodec ff_libgsm_ms_encoder = {
     .name           = "libgsm_ms",
     .long_name      = NULL_IF_CONFIG_SMALL("libgsm GSM Microsoft variant"),
@@ -135,3 +140,4 @@ AVCodec ff_libgsm_ms_encoder = {
     .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
                                                      AV_SAMPLE_FMT_NONE },
 };
+#endif
diff --git a/libavcodec/libilbc.c b/libavcodec/libilbc.c
index c5053f0..c4c054f 100644
--- a/libavcodec/libilbc.c
+++ b/libavcodec/libilbc.c
@@ -2,20 +2,20 @@
  * iLBC decoder/encoder stub
  * Copyright (c) 2012 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -51,7 +51,10 @@ static const AVOption ilbc_dec_options[] = {
 };
 
 static const AVClass ilbc_dec_class = {
-    "libilbc", av_default_item_name, ilbc_dec_options, LIBAVUTIL_VERSION_INT
+    .class_name = "libilbc",
+    .item_name  = av_default_item_name,
+    .option     = ilbc_dec_options,
+    .version    = LIBAVUTIL_VERSION_INT,
 };
 
 static av_cold int ilbc_decode_init(AVCodecContext *avctx)
@@ -90,13 +93,10 @@ static int ilbc_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     frame->nb_samples = s->decoder.blockl;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
-    WebRtcIlbcfix_DecodeImpl((WebRtc_Word16*) frame->data[0],
-                             (const WebRtc_UWord16*) buf, &s->decoder, 1);
+    WebRtcIlbcfix_DecodeImpl((int16_t *) frame->data[0], (const uint16_t *) buf, &s->decoder, 1);
 
     *got_frame_ptr = 1;
 
@@ -127,7 +127,10 @@ static const AVOption ilbc_enc_options[] = {
 };
 
 static const AVClass ilbc_enc_class = {
-    "libilbc", av_default_item_name, ilbc_enc_options, LIBAVUTIL_VERSION_INT
+    .class_name = "libilbc",
+    .item_name  = av_default_item_name,
+    .option     = ilbc_enc_options,
+    .version    = LIBAVUTIL_VERSION_INT,
 };
 
 static av_cold int ilbc_encode_init(AVCodecContext *avctx)
@@ -163,12 +166,10 @@ static int ilbc_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     ILBCEncContext *s = avctx->priv_data;
     int ret;
 
-    if ((ret = ff_alloc_packet(avpkt, 50))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, 50, 0)) < 0)
         return ret;
-    }
 
-    WebRtcIlbcfix_EncodeImpl((WebRtc_UWord16*) avpkt->data, (const WebRtc_Word16*) frame->data[0], &s->encoder);
+    WebRtcIlbcfix_EncodeImpl((uint16_t *) avpkt->data, (const int16_t *) frame->data[0], &s->encoder);
 
     avpkt->size     = s->encoder.no_of_bytes;
     *got_packet_ptr = 1;
diff --git a/libavcodec/libkvazaar.c b/libavcodec/libkvazaar.c
index 19122e0..79fde41 100644
--- a/libavcodec/libkvazaar.c
+++ b/libavcodec/libkvazaar.c
@@ -3,26 +3,27 @@
  *
  * Copyright (c) 2015 Tampere University of Technology
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <kvazaar.h>
 #include <string.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/dict.h"
 #include "libavutil/error.h"
 #include "libavutil/imgutils.h"
@@ -50,12 +51,6 @@ static av_cold int libkvazaar_init(AVCodecContext *avctx)
     kvz_config *cfg = NULL;
     kvz_encoder *enc = NULL;
 
-    if (avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Set -strict experimental to use this encoder.\n");
-        return AVERROR_EXPERIMENTAL;
-    }
-
     /* Kvazaar requires width and height to be multiples of eight. */
     if (avctx->width % 8 || avctx->height % 8) {
         av_log(avctx, AV_LOG_ERROR,
@@ -80,6 +75,11 @@ static av_cold int libkvazaar_init(AVCodecContext *avctx)
     cfg->width  = avctx->width;
     cfg->height = avctx->height;
 
+    if (avctx->ticks_per_frame > INT_MAX / avctx->time_base.num) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Could not set framerate for kvazaar: integer overflow\n");
+        return AVERROR(EINVAL);
+    }
     cfg->framerate_num   = avctx->time_base.den;
     cfg->framerate_denom = avctx->time_base.num * avctx->ticks_per_frame;
     cfg->target_bitrate = avctx->bit_rate;
@@ -162,6 +162,8 @@ static int libkvazaar_encode(AVCodecContext *avctx,
     uint32_t len_out = 0;
     int retval = 0;
 
+    *got_packet_ptr = 0;
+
     if (frame) {
         if (frame->width != ctx->config->width ||
                 frame->height != ctx->config->height) {
@@ -218,6 +220,8 @@ static int libkvazaar_encode(AVCodecContext *avctx,
         retval = AVERROR_INVALIDDATA;
         goto done;
     }
+    else
+        retval = 0; /* kvazaar returns 1 on success */
 
     if (data_out) {
         kvz_data_chunk *chunk = NULL;
@@ -230,6 +234,7 @@ static int libkvazaar_encode(AVCodecContext *avctx,
         }
 
         for (chunk = data_out; chunk != NULL; chunk = chunk->next) {
+            av_assert0(written + chunk->len <= len_out);
             memcpy(avpkt->data + written, chunk->data, chunk->len);
             written += chunk->len;
         }
@@ -264,7 +269,6 @@ static const enum AVPixelFormat pix_fmts[] = {
 static const AVOption options[] = {
     { "kvazaar-params", "Set kvazaar parameters as a comma-separated list of key=value pairs.",
         OFFSET(kvz_params), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, VE },
-
     { NULL },
 };
 
diff --git a/libavcodec/libmp3lame.c b/libavcodec/libmp3lame.c
index e4d0e00..5642264 100644
--- a/libavcodec/libmp3lame.c
+++ b/libavcodec/libmp3lame.c
@@ -2,20 +2,20 @@
  * Interface to libmp3lame for mp3 encoding
  * Copyright (c) 2002 Lennert Buytenhek <buytenh@gnu.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,7 +38,7 @@
 #include "mpegaudio.h"
 #include "mpegaudiodecheader.h"
 
-#define BUFFER_SIZE (7200 + 2 * MPA_FRAME_SIZE + MPA_FRAME_SIZE / 4)
+#define BUFFER_SIZE (7200 + 2 * MPA_FRAME_SIZE + MPA_FRAME_SIZE / 4+1000) // FIXME: Buffer size too small? Adding 1000 to make up for it.
 
 typedef struct LAMEContext {
     AVClass *class;
@@ -52,7 +52,7 @@ typedef struct LAMEContext {
     int abr;
     float *samples_flt[2];
     AudioFrameQueue afq;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
 } LAMEContext;
 
 
@@ -79,6 +79,7 @@ static av_cold int mp3lame_encode_close(AVCodecContext *avctx)
     av_freep(&s->samples_flt[0]);
     av_freep(&s->samples_flt[1]);
     av_freep(&s->buffer);
+    av_freep(&s->fdsp);
 
     ff_af_queue_close(&s->afq);
 
@@ -97,6 +98,7 @@ static av_cold int mp3lame_encode_init(AVCodecContext *avctx)
     if (!(s->gfp = lame_init()))
         return AVERROR(ENOMEM);
 
+
     lame_set_num_channels(s->gfp, avctx->channels);
     lame_set_mode(s->gfp, avctx->channels > 1 ? s->joint_stereo ? JOINT_STEREO : STEREO : MONO);
 
@@ -105,9 +107,7 @@ static av_cold int mp3lame_encode_init(AVCodecContext *avctx)
     lame_set_out_samplerate(s->gfp, avctx->sample_rate);
 
     /* algorithmic quality */
-    if (avctx->compression_level == FF_COMPRESSION_DEFAULT)
-        lame_set_quality(s->gfp, 5);
-    else
+    if (avctx->compression_level != FF_COMPRESSION_DEFAULT)
         lame_set_quality(s->gfp, avctx->compression_level);
 
     /* rate control */
@@ -146,7 +146,7 @@ static av_cold int mp3lame_encode_init(AVCodecContext *avctx)
     if (avctx->sample_fmt == AV_SAMPLE_FMT_FLTP) {
         int ch;
         for (ch = 0; ch < avctx->channels; ch++) {
-            s->samples_flt[ch] = av_malloc(avctx->frame_size *
+            s->samples_flt[ch] = av_malloc_array(avctx->frame_size,
                                            sizeof(*s->samples_flt[ch]));
             if (!s->samples_flt[ch]) {
                 ret = AVERROR(ENOMEM);
@@ -159,7 +159,12 @@ static av_cold int mp3lame_encode_init(AVCodecContext *avctx)
     if (ret < 0)
         goto error;
 
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->fdsp) {
+        ret = AVERROR(ENOMEM);
+        goto error;
+    }
+
 
     return 0;
 error:
@@ -198,7 +203,7 @@ static int mp3lame_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
                 return AVERROR(EINVAL);
             }
             for (ch = 0; ch < avctx->channels; ch++) {
-                s->fdsp.vector_fmul_scalar(s->samples_flt[ch],
+                s->fdsp->vector_fmul_scalar(s->samples_flt[ch],
                                            (const float *)frame->data[ch],
                                            32768.0f,
                                            FFALIGN(frame->nb_samples, 8));
@@ -208,6 +213,8 @@ static int mp3lame_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         default:
             return AVERROR_BUG;
         }
+    } else if (!s->afq.frame_alloc) {
+        lame_result = 0;
     } else {
         lame_result = lame_encode_flush(s->gfp, s->buffer + s->buffer_index,
                                         s->buffer_size - s->buffer_index);
@@ -252,10 +259,8 @@ static int mp3lame_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     ff_dlog(avctx, "in:%d packet-len:%d index:%d\n", avctx->frame_size, len,
             s->buffer_index);
     if (len <= s->buffer_index) {
-        if ((ret = ff_alloc_packet(avpkt, len))) {
-            av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+        if ((ret = ff_alloc_packet2(avctx, avpkt, len, 0)) < 0)
             return ret;
-        }
         memcpy(avpkt->data, s->buffer, len);
         s->buffer_index -= len;
         memmove(s->buffer, s->buffer + len, s->buffer_index);
@@ -273,9 +278,9 @@ static int mp3lame_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
 #define OFFSET(x) offsetof(LAMEContext, x)
 #define AE AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
-    { "reservoir", "Use bit reservoir.", OFFSET(reservoir), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, AE },
-    { "joint_stereo", "Use joint stereo.", OFFSET(joint_stereo), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, AE },
-    { "abr", "Use ABR", OFFSET(abr), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, AE },
+    { "reservoir",    "use bit reservoir", OFFSET(reservoir),    AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, AE },
+    { "joint_stereo", "use joint stereo",  OFFSET(joint_stereo), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, AE },
+    { "abr",          "use ABR",           OFFSET(abr),          AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AE },
     { NULL },
 };
 
diff --git a/libavcodec/libopencore-amr.c b/libavcodec/libopencore-amr.c
index 27ab7d2..8545ffe 100644
--- a/libavcodec/libopencore-amr.c
+++ b/libavcodec/libopencore-amr.c
@@ -2,20 +2,20 @@
  * AMR Audio decoder stub
  * Copyright (c) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,7 +33,8 @@ static int amr_decode_fix_avctx(AVCodecContext *avctx)
 {
     const int is_amr_wb = 1 + (avctx->codec_id == AV_CODEC_ID_AMR_WB);
 
-    avctx->sample_rate = 8000 * is_amr_wb;
+    if (!avctx->sample_rate)
+        avctx->sample_rate = 8000 * is_amr_wb;
 
     if (avctx->channels > 1) {
         avpriv_report_missing_feature(avctx, "multi-channel AMR");
@@ -105,10 +106,8 @@ static int amr_nb_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = 160;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     dec_mode    = (buf[0] >> 3) & 0x000F;
     packet_size = block_size[dec_mode] + 1;
@@ -183,7 +182,7 @@ static const AVOption options[] = {
     { NULL }
 };
 
-static const AVClass class = {
+static const AVClass amrnb_class = {
     "libopencore_amrnb", av_default_item_name, options, LIBAVUTIL_VERSION_INT
 };
 
@@ -191,7 +190,7 @@ static av_cold int amr_nb_encode_init(AVCodecContext *avctx)
 {
     AMRContext *s = avctx->priv_data;
 
-    if (avctx->sample_rate != 8000) {
+    if (avctx->sample_rate != 8000 && avctx->strict_std_compliance > FF_COMPLIANCE_UNOFFICIAL) {
         av_log(avctx, AV_LOG_ERROR, "Only 8000Hz sample rate supported\n");
         return AVERROR(ENOSYS);
     }
@@ -239,14 +238,12 @@ static int amr_nb_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         s->enc_bitrate = avctx->bit_rate;
     }
 
-    if ((ret = ff_alloc_packet(avpkt, 32))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, 32, 0)) < 0)
         return ret;
-    }
 
     if (frame) {
         if (frame->nb_samples < avctx->frame_size) {
-            flush_buf = av_mallocz(avctx->frame_size * sizeof(*flush_buf));
+            flush_buf = av_mallocz_array(avctx->frame_size, sizeof(*flush_buf));
             if (!flush_buf)
                 return AVERROR(ENOMEM);
             memcpy(flush_buf, samples, frame->nb_samples * sizeof(*flush_buf));
@@ -261,7 +258,7 @@ static int amr_nb_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     } else {
         if (s->enc_last_frame < 0)
             return 0;
-        flush_buf = av_mallocz(avctx->frame_size * sizeof(*flush_buf));
+        flush_buf = av_mallocz_array(avctx->frame_size, sizeof(*flush_buf));
         if (!flush_buf)
             return AVERROR(ENOMEM);
         samples = flush_buf;
@@ -270,8 +267,8 @@ static int amr_nb_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
 
     written = Encoder_Interface_Encode(s->enc_state, s->enc_mode, samples,
                                        avpkt->data, 0);
-    ff_dlog(avctx, "amr_nb_encode_frame encoded %d bytes, bitrate %d, first byte was %#02"PRIx8"\n",
-            written, s->enc_mode, *frame->data[0]);
+    ff_dlog(avctx, "amr_nb_encode_frame encoded %u bytes, bitrate %u, first byte was %#02x\n",
+            written, s->enc_mode, avpkt->data[0]);
 
     /* Get the next frame pts/duration */
     ff_af_queue_remove(&s->afq, avctx->frame_size, &avpkt->pts,
@@ -295,7 +292,7 @@ AVCodec ff_libopencore_amrnb_encoder = {
     .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_SMALL_LAST_FRAME,
     .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
                                                      AV_SAMPLE_FMT_NONE },
-    .priv_class     = &class,
+    .priv_class     = &amrnb_class,
 };
 #endif /* CONFIG_LIBOPENCORE_AMRNB_ENCODER */
 
@@ -337,10 +334,8 @@ static int amr_wb_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = 320;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     mode        = (buf[0] >> 3) & 0x000F;
     packet_size = block_size[mode];
@@ -350,6 +345,10 @@ static int amr_wb_decode_frame(AVCodecContext *avctx, void *data,
                buf_size, packet_size + 1);
         return AVERROR_INVALIDDATA;
     }
+    if (!packet_size) {
+        av_log(avctx, AV_LOG_ERROR, "amr packet_size invalid\n");
+        return AVERROR_INVALIDDATA;
+    }
 
     D_IF_decode(s->state, buf, (short *)frame->data[0], _good_frame);
 
diff --git a/libavcodec/libopenh264enc.c b/libavcodec/libopenh264enc.c
index daab41f..24bc228 100644
--- a/libavcodec/libopenh264enc.c
+++ b/libavcodec/libopenh264enc.c
@@ -2,20 +2,20 @@
  * OpenH264 video encoder
  * Copyright (C) 2014 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -51,15 +51,15 @@ typedef struct SVCContext {
 #define OFFSET(x) offsetof(SVCContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
-    { "slice_mode", "Slice mode", OFFSET(slice_mode), AV_OPT_TYPE_INT, { .i64 = SM_AUTO_SLICE }, SM_SINGLE_SLICE, SM_RESERVED, VE, "slice_mode" },
-    { "fixed", "A fixed number of slices", 0, AV_OPT_TYPE_CONST, { .i64 = SM_FIXEDSLCNUM_SLICE }, 0, 0, VE, "slice_mode" },
-    { "rowmb", "One slice per row of macroblocks", 0, AV_OPT_TYPE_CONST, { .i64 = SM_ROWMB_SLICE }, 0, 0, VE, "slice_mode" },
-    { "auto", "Automatic number of slices according to number of threads", 0, AV_OPT_TYPE_CONST, { .i64 = SM_AUTO_SLICE }, 0, 0, VE, "slice_mode" },
-    { "dyn", "Dynamic slicing", 0, AV_OPT_TYPE_CONST, { .i64 = SM_DYN_SLICE }, 0, 0, VE, "slice_mode" },
-    { "loopfilter", "Enable loop filter", OFFSET(loopfilter), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, VE },
-    { "profile", "Set profile restrictions", OFFSET(profile), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, VE },
-    { "max_nal_size", "Set maximum NAL size in bytes", OFFSET(max_nal_size), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },
-    { "allow_skip_frames", "Allow skipping frames to hit the target bitrate", OFFSET(skip_frames), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
+    { "slice_mode", "set slice mode", OFFSET(slice_mode), AV_OPT_TYPE_INT, { .i64 = SM_AUTO_SLICE }, SM_SINGLE_SLICE, SM_RESERVED, VE, "slice_mode" },
+        { "fixed", "a fixed number of slices", 0, AV_OPT_TYPE_CONST, { .i64 = SM_FIXEDSLCNUM_SLICE }, 0, 0, VE, "slice_mode" },
+        { "rowmb", "one slice per row of macroblocks", 0, AV_OPT_TYPE_CONST, { .i64 = SM_ROWMB_SLICE }, 0, 0, VE, "slice_mode" },
+        { "auto", "automatic number of slices according to number of threads", 0, AV_OPT_TYPE_CONST, { .i64 = SM_AUTO_SLICE }, 0, 0, VE, "slice_mode" },
+        { "dyn", "Dynamic slicing", 0, AV_OPT_TYPE_CONST, { .i64 = SM_DYN_SLICE }, 0, 0, VE, "slice_mode" },
+    { "loopfilter", "enable loop filter", OFFSET(loopfilter), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, VE },
+    { "profile", "set profile restrictions", OFFSET(profile), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, VE },
+    { "max_nal_size", "set maximum NAL size in bytes", OFFSET(max_nal_size), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },
+    { "allow_skip_frames", "allow skipping frames to hit the target bitrate", OFFSET(skip_frames), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
     { "cabac", "Enable cabac", OFFSET(cabac), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
     { NULL }
 };
@@ -68,8 +68,8 @@ static const AVClass class = {
     "libopenh264enc", av_default_item_name, options, LIBAVUTIL_VERSION_INT
 };
 
-// Convert libopenh264 log level to equivalent libav log level.
-static int libopenh264_to_libav_log_level(int libopenh264_log_level)
+// Convert libopenh264 log level to equivalent ffmpeg log level.
+static int libopenh264_to_ffmpeg_log_level(int libopenh264_log_level)
 {
     if      (libopenh264_log_level >= WELS_LOG_DETAIL)  return AV_LOG_TRACE;
     else if (libopenh264_log_level >= WELS_LOG_DEBUG)   return AV_LOG_DEBUG;
@@ -87,10 +87,10 @@ static int libopenh264_to_libav_log_level(int libopenh264_log_level)
 
 static void libopenh264_trace_callback(void *ctx, int level, const char *msg)
 {
-    // The message will be logged only if the requested EQUIVALENT libav log level is
-    // less than or equal to the current libav log level.
-    int equiv_libav_log_level = libopenh264_to_libav_log_level(level);
-    av_log(ctx, equiv_libav_log_level, "%s\n", msg);
+    // The message will be logged only if the requested EQUIVALENT ffmpeg log level is
+    // less than or equal to the current ffmpeg log level.
+    int equiv_ffmpeg_log_level = libopenh264_to_ffmpeg_log_level(level);
+    av_log(ctx, equiv_ffmpeg_log_level, "%s\n", msg);
 }
 
 static av_cold int svc_encode_close(AVCodecContext *avctx)
@@ -149,7 +149,7 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
-    param.fMaxFrameRate              = avctx->time_base.den / avctx->time_base.num;
+    param.fMaxFrameRate              = 1/av_q2d(avctx->time_base);
     param.iPicWidth                  = avctx->width;
     param.iPicHeight                 = avctx->height;
     param.iTargetBitrate             = avctx->bit_rate;
@@ -288,7 +288,7 @@ static int svc_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     }
     av_log(avctx, AV_LOG_DEBUG, "%d slices\n", fbi.sLayerInfo[fbi.iLayerNum - 1].iNalCount);
 
-    if ((ret = ff_alloc_packet(avpkt, size))) {
+    if ((ret = ff_alloc_packet2(avctx, avpkt, size, size))) {
         av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
         return ret;
     }
diff --git a/libavcodec/libopenjpegdec.c b/libavcodec/libopenjpegdec.c
index 401ea9b..65167e6 100644
--- a/libavcodec/libopenjpegdec.c
+++ b/libavcodec/libopenjpegdec.c
@@ -2,20 +2,20 @@
  * JPEG 2000 decoding support via OpenJPEG
  * Copyright (c) 2009 Jaikrishnan Menon <realityman@gmx.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,7 +25,6 @@
  */
 
 #define  OPJ_STATIC
-#include <openjpeg.h>
 
 #include "libavutil/common.h"
 #include "libavutil/imgutils.h"
@@ -37,6 +36,24 @@
 #include "internal.h"
 #include "thread.h"
 
+#if HAVE_OPENJPEG_2_1_OPENJPEG_H
+#  include <openjpeg-2.1/openjpeg.h>
+#elif HAVE_OPENJPEG_2_0_OPENJPEG_H
+#  include <openjpeg-2.0/openjpeg.h>
+#elif HAVE_OPENJPEG_1_5_OPENJPEG_H
+#  include <openjpeg-1.5/openjpeg.h>
+#else
+#  include <openjpeg.h>
+#endif
+
+#if HAVE_OPENJPEG_2_1_OPENJPEG_H || HAVE_OPENJPEG_2_0_OPENJPEG_H
+#  define OPENJPEG_MAJOR_VERSION 2
+#  define OPJ(x) OPJ_##x
+#else
+#  define OPENJPEG_MAJOR_VERSION 1
+#  define OPJ(x) x
+#endif
+
 #define JP2_SIG_TYPE    0x6A502020
 #define JP2_SIG_VALUE   0x0D0A870A
 
@@ -46,72 +63,149 @@
                            AV_PIX_FMT_RGB48, AV_PIX_FMT_RGBA64
 
 #define GRAY_PIXEL_FORMATS AV_PIX_FMT_GRAY8, AV_PIX_FMT_YA8,                  \
-                           AV_PIX_FMT_GRAY16
-
-#define YUV_PIXEL_FORMATS  AV_PIX_FMT_YUV410P,   AV_PIX_FMT_YUV411P,          \
-                           AV_PIX_FMT_YUVA420P,                               \
-                           AV_PIX_FMT_YUV420P,   AV_PIX_FMT_YUV422P,          \
-                           AV_PIX_FMT_YUV440P,   AV_PIX_FMT_YUV444P,          \
-                           AV_PIX_FMT_YUV420P9,  AV_PIX_FMT_YUV422P9,         \
-                           AV_PIX_FMT_YUV444P9,                               \
-                           AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV422P10,        \
-                           AV_PIX_FMT_YUV444P10,                              \
-                           AV_PIX_FMT_YUV420P16, AV_PIX_FMT_YUV422P16,        \
-                           AV_PIX_FMT_YUV444P16
+                           AV_PIX_FMT_GRAY16, AV_PIX_FMT_YA16
+
+#define YUV_PIXEL_FORMATS  AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P, AV_PIX_FMT_YUVA420P, \
+                           AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUVA422P, \
+                           AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUVA444P, \
+                           AV_PIX_FMT_YUV420P9, AV_PIX_FMT_YUV422P9, AV_PIX_FMT_YUV444P9, \
+                           AV_PIX_FMT_YUVA420P9, AV_PIX_FMT_YUVA422P9, AV_PIX_FMT_YUVA444P9, \
+                           AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV444P10, \
+                           AV_PIX_FMT_YUVA420P10, AV_PIX_FMT_YUVA422P10, AV_PIX_FMT_YUVA444P10, \
+                           AV_PIX_FMT_YUV420P12, AV_PIX_FMT_YUV422P12, AV_PIX_FMT_YUV444P12, \
+                           AV_PIX_FMT_YUV420P14, AV_PIX_FMT_YUV422P14, AV_PIX_FMT_YUV444P14, \
+                           AV_PIX_FMT_YUV420P16, AV_PIX_FMT_YUV422P16, AV_PIX_FMT_YUV444P16, \
+                           AV_PIX_FMT_YUVA420P16, AV_PIX_FMT_YUVA422P16, AV_PIX_FMT_YUVA444P16
 
 #define XYZ_PIXEL_FORMATS  AV_PIX_FMT_XYZ12
 
-static const enum AVPixelFormat rgb_pix_fmts[] = {
+static const enum AVPixelFormat libopenjpeg_rgb_pix_fmts[]  = {
     RGB_PIXEL_FORMATS
 };
-static const enum AVPixelFormat gray_pix_fmts[] = {
+static const enum AVPixelFormat libopenjpeg_gray_pix_fmts[] = {
     GRAY_PIXEL_FORMATS
 };
-static const enum AVPixelFormat yuv_pix_fmts[] = {
+static const enum AVPixelFormat libopenjpeg_yuv_pix_fmts[]  = {
     YUV_PIXEL_FORMATS
 };
-static const enum AVPixelFormat any_pix_fmts[] = {
+static const enum AVPixelFormat libopenjpeg_all_pix_fmts[]  = {
     RGB_PIXEL_FORMATS, GRAY_PIXEL_FORMATS, YUV_PIXEL_FORMATS, XYZ_PIXEL_FORMATS
 };
 
 typedef struct LibOpenJPEGContext {
     AVClass *class;
     opj_dparameters_t dec_params;
-    int lowres;
+#if OPENJPEG_MAJOR_VERSION == 1
+    opj_event_mgr_t event_mgr;
+#endif // OPENJPEG_MAJOR_VERSION == 1
     int lowqual;
 } LibOpenJPEGContext;
 
-static int libopenjpeg_matches_pix_fmt(const opj_image_t *img,
-                                       enum AVPixelFormat pix_fmt)
+static void error_callback(const char *msg, void *data)
+{   /* Logs msg at AV_LOG_ERROR; data is handed to av_log() as its first (context) argument. */
+    av_log(data, AV_LOG_ERROR, "%s", msg); /* "%s" keeps msg from being parsed as a format string */
+}
+
+static void warning_callback(const char *msg, void *data)
+{   /* Logs msg at AV_LOG_WARNING; data is handed to av_log() as its first (context) argument. */
+    av_log(data, AV_LOG_WARNING, "%s", msg); /* "%s" keeps msg from being parsed as a format string */
+}
+
+static void info_callback(const char *msg, void *data)
+{   /* Logs msg at AV_LOG_DEBUG (not AV_LOG_INFO); data is handed to av_log() as its first (context) argument. */
+    av_log(data, AV_LOG_DEBUG, "%s", msg); /* "%s" keeps msg from being parsed as a format string */
+}
+
+#if OPENJPEG_MAJOR_VERSION == 2
+typedef struct BufferReader { /* Read cursor over an in-memory byte buffer; passed as user_data to the stream_* callbacks. */
+    int pos;               /* current offset into buffer, in bytes */
+    int size;              /* upper bound for pos; the stream_* callbacks never move pos past it */
+    const uint8_t *buffer; /* bytes that stream_read() copies out */
+} BufferReader;
+
+static OPJ_SIZE_T stream_read(void *out_buffer, OPJ_SIZE_T nb_bytes, void *user_data)
+{   /* Copies up to nb_bytes from the reader's current position into out_buffer, advances the position, returns the count copied. */
+    BufferReader *reader = user_data;
+    int remaining;
+
+    if (reader->pos == reader->size) {
+        return (OPJ_SIZE_T)-1; /* nothing left to read */
+    }
+    remaining = reader->size - reader->pos;
+    if (nb_bytes > remaining) {
+        nb_bytes = remaining; /* short read: clamp the request to what is left */
+    }
+    memcpy(out_buffer, reader->buffer + reader->pos, nb_bytes);
+    reader->pos += (int)nb_bytes;
+    return nb_bytes;
+}
+
+static OPJ_OFF_T stream_skip(OPJ_OFF_T nb_bytes, void *user_data)
+{   /* Moves the read position by nb_bytes (negative = backwards), clamped to [0, size]; returns the distance moved, or -1 if already at the relevant end. */
+    BufferReader *reader = user_data;
+    if (nb_bytes < 0) {
+        if (reader->pos == 0) {
+            return (OPJ_OFF_T)-1; /* cast to the return type: (OPJ_SIZE_T)-1 would not be -1 if OPJ_SIZE_T is narrower than OPJ_OFF_T */
+        }
+        if (nb_bytes + reader->pos < 0) {
+            nb_bytes = -reader->pos; /* cannot rewind past the start of the buffer */
+        }
+    } else {
+        int remaining;
+
+        if (reader->pos == reader->size) {
+            return (OPJ_OFF_T)-1; /* already at the end; same failure value as above */
+        }
+        remaining = reader->size - reader->pos;
+        if (nb_bytes > remaining) {
+            nb_bytes = remaining; /* cannot skip past the end of the buffer */
+        }
+    }
+    reader->pos += (int)nb_bytes;
+    return nb_bytes;
+}
+
+static OPJ_BOOL stream_seek(OPJ_OFF_T nb_bytes, void *user_data)
+{   /* Sets the read position to the absolute offset nb_bytes; returns OPJ_FALSE for offsets outside [0, size]. */
+    BufferReader *reader = user_data;
+    if (nb_bytes < 0 || nb_bytes > reader->size) {
+        return OPJ_FALSE;
+    }
+    reader->pos = (int)nb_bytes; /* safe narrowing: nb_bytes was just checked against the int-typed size */
+    return OPJ_TRUE;
+}
+#endif // OPENJPEG_MAJOR_VERSION == 2
+
+static inline int libopenjpeg_matches_pix_fmt(const opj_image_t *image, enum AVPixelFormat pix_fmt)
 {
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
     int match = 1;
 
-    if (desc->nb_components != img->numcomps) {
+    if (desc->nb_components != image->numcomps) {
         return 0;
     }
 
     switch (desc->nb_components) {
     case 4:
         match = match &&
-                desc->comp[3].depth >= img->comps[3].prec &&
-                1 == img->comps[3].dx &&
-                1 == img->comps[3].dy;
+                desc->comp[3].depth >= image->comps[3].prec &&
+                1 == image->comps[3].dx &&
+                1 == image->comps[3].dy;
     case 3:
         match = match &&
-                desc->comp[2].depth >= img->comps[2].prec &&
-                1 << desc->log2_chroma_w == img->comps[2].dx &&
-                1 << desc->log2_chroma_h == img->comps[2].dy;
+                desc->comp[2].depth >= image->comps[2].prec &&
+                1 << desc->log2_chroma_w == image->comps[2].dx &&
+                1 << desc->log2_chroma_h == image->comps[2].dy;
     case 2:
         match = match &&
-                desc->comp[1].depth >= img->comps[1].prec &&
-                1 << desc->log2_chroma_w == img->comps[1].dx &&
-                1 << desc->log2_chroma_h == img->comps[1].dy;
+                desc->comp[1].depth >= image->comps[1].prec &&
+                1 << desc->log2_chroma_w == image->comps[1].dx &&
+                1 << desc->log2_chroma_h == image->comps[1].dy;
     case 1:
         match = match &&
-                desc->comp[0].depth >= img->comps[0].prec &&
-                1 == img->comps[0].dx &&
-                1 == img->comps[0].dy;
+                desc->comp[0].depth >= image->comps[0].prec &&
+                1 == image->comps[0].dx &&
+                1 == image->comps[0].dy;
     default:
         break;
     }
@@ -119,28 +213,27 @@ static int libopenjpeg_matches_pix_fmt(const opj_image_t *img,
     return match;
 }
 
-static enum AVPixelFormat libopenjpeg_guess_pix_fmt(const opj_image_t *image)
-{
+static inline enum AVPixelFormat libopenjpeg_guess_pix_fmt(const opj_image_t *image) {
     int index;
     const enum AVPixelFormat *possible_fmts = NULL;
     int possible_fmts_nb = 0;
 
     switch (image->color_space) {
-    case CLRSPC_SRGB:
-        possible_fmts    = rgb_pix_fmts;
-        possible_fmts_nb = FF_ARRAY_ELEMS(rgb_pix_fmts);
+    case OPJ(CLRSPC_SRGB):
+        possible_fmts    = libopenjpeg_rgb_pix_fmts;
+        possible_fmts_nb = FF_ARRAY_ELEMS(libopenjpeg_rgb_pix_fmts);
         break;
-    case CLRSPC_GRAY:
-        possible_fmts    = gray_pix_fmts;
-        possible_fmts_nb = FF_ARRAY_ELEMS(gray_pix_fmts);
+    case OPJ(CLRSPC_GRAY):
+        possible_fmts    = libopenjpeg_gray_pix_fmts;
+        possible_fmts_nb = FF_ARRAY_ELEMS(libopenjpeg_gray_pix_fmts);
         break;
-    case CLRSPC_SYCC:
-        possible_fmts    = yuv_pix_fmts;
-        possible_fmts_nb = FF_ARRAY_ELEMS(yuv_pix_fmts);
+    case OPJ(CLRSPC_SYCC):
+        possible_fmts    = libopenjpeg_yuv_pix_fmts;
+        possible_fmts_nb = FF_ARRAY_ELEMS(libopenjpeg_yuv_pix_fmts);
         break;
     default:
-        possible_fmts    = any_pix_fmts;
-        possible_fmts_nb = FF_ARRAY_ELEMS(any_pix_fmts);
+        possible_fmts    = libopenjpeg_all_pix_fmts;
+        possible_fmts_nb = FF_ARRAY_ELEMS(libopenjpeg_all_pix_fmts);
         break;
     }
 
@@ -167,40 +260,37 @@ static inline int libopenjpeg_ispacked(enum AVPixelFormat pix_fmt)
     return 1;
 }
 
-static void libopenjpeg_copy_to_packed8(AVFrame *picture, opj_image_t *image)
-{
+static inline void libopenjpeg_copy_to_packed8(AVFrame *picture, opj_image_t *image) {
     uint8_t *img_ptr;
     int index, x, y, c;
-
     for (y = 0; y < picture->height; y++) {
         index   = y * picture->width;
         img_ptr = picture->data[0] + y * picture->linesize[0];
         for (x = 0; x < picture->width; x++, index++)
             for (c = 0; c < image->numcomps; c++)
-                *img_ptr++ = image->comps[c].data[index];
+                *img_ptr++ = 0x80 * image->comps[c].sgnd + image->comps[c].data[index];
     }
 }
 
-static void libopenjpeg_copy_to_packed16(AVFrame *picture, opj_image_t *image)
-{
+static inline void libopenjpeg_copy_to_packed16(AVFrame *picture, opj_image_t *image) {
     uint16_t *img_ptr;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(picture->format);
     int index, x, y, c;
     int adjust[4];
-
     for (x = 0; x < image->numcomps; x++)
-        adjust[x] = FFMAX(FFMIN(16 - image->comps[x].prec, 8), 0);
+        adjust[x] = FFMAX(FFMIN(desc->comp[x].depth - image->comps[x].prec, 8), 0) + desc->comp[x].shift;
 
     for (y = 0; y < picture->height; y++) {
         index   = y * picture->width;
         img_ptr = (uint16_t *) (picture->data[0] + y * picture->linesize[0]);
         for (x = 0; x < picture->width; x++, index++)
             for (c = 0; c < image->numcomps; c++)
-                *img_ptr++ = image->comps[c].data[index] << adjust[c];
+                *img_ptr++ = (1 << image->comps[c].prec - 1) * image->comps[c].sgnd +
+                             (unsigned)image->comps[c].data[index] << adjust[c];
     }
 }
 
-static void libopenjpeg_copyto8(AVFrame *picture, opj_image_t *image)
-{
+static inline void libopenjpeg_copyto8(AVFrame *picture, opj_image_t *image) {
     int *comp_data;
     uint8_t *img_ptr;
     int index, x, y;
@@ -210,7 +300,7 @@ static void libopenjpeg_copyto8(AVFrame *picture, opj_image_t *image)
         for (y = 0; y < image->comps[index].h; y++) {
             img_ptr = picture->data[index] + y * picture->linesize[index];
             for (x = 0; x < image->comps[index].w; x++) {
-                *img_ptr = (uint8_t) *comp_data;
+                *img_ptr = 0x80 * image->comps[index].sgnd + *comp_data;
                 img_ptr++;
                 comp_data++;
             }
@@ -218,18 +308,22 @@ static void libopenjpeg_copyto8(AVFrame *picture, opj_image_t *image)
     }
 }
 
-static void libopenjpeg_copyto16(AVFrame *p, opj_image_t *image)
-{
+static inline void libopenjpeg_copyto16(AVFrame *picture, opj_image_t *image) {
     int *comp_data;
     uint16_t *img_ptr;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(picture->format);
     int index, x, y;
+    int adjust[4];
+    for (x = 0; x < image->numcomps; x++)
+        adjust[x] = FFMAX(FFMIN(desc->comp[x].depth - image->comps[x].prec, 8), 0) + desc->comp[x].shift;
 
     for (index = 0; index < image->numcomps; index++) {
         comp_data = image->comps[index].data;
         for (y = 0; y < image->comps[index].h; y++) {
-            img_ptr = (uint16_t *)(p->data[index] + y * p->linesize[index]);
+            img_ptr = (uint16_t *)(picture->data[index] + y * picture->linesize[index]);
             for (x = 0; x < image->comps[index].w; x++) {
-                *img_ptr = *comp_data;
+                *img_ptr = (1 << image->comps[index].prec - 1) * image->comps[index].sgnd +
+                           (unsigned)*comp_data << adjust[index];
                 img_ptr++;
                 comp_data++;
             }
@@ -255,13 +349,19 @@ static int libopenjpeg_decode_frame(AVCodecContext *avctx,
     ThreadFrame frame       = { .f = data };
     AVFrame *picture        = data;
     const AVPixFmtDescriptor *desc;
-    opj_dinfo_t *dec;
-    opj_cio_t *stream;
-    opj_image_t *image;
     int width, height, ret;
     int pixel_size = 0;
     int ispacked   = 0;
     int i;
+    opj_image_t *image = NULL;
+#if OPENJPEG_MAJOR_VERSION == 1
+    opj_dinfo_t *dec = NULL;
+    opj_cio_t *stream = NULL;
+#else // OPENJPEG_MAJOR_VERSION == 2
+    BufferReader reader = {0, avpkt->size, avpkt->data};
+    opj_codec_t *dec = NULL;
+    opj_stream_t *stream = NULL;
+#endif // OPENJPEG_MAJOR_VERSION == 1
 
     *got_frame = 0;
 
@@ -269,53 +369,89 @@ static int libopenjpeg_decode_frame(AVCodecContext *avctx,
     if ((AV_RB32(buf) == 12) &&
         (AV_RB32(buf + 4) == JP2_SIG_TYPE) &&
         (AV_RB32(buf + 8) == JP2_SIG_VALUE)) {
-        dec = opj_create_decompress(CODEC_JP2);
+        dec = opj_create_decompress(OPJ(CODEC_JP2));
     } else {
         /* If the AVPacket contains a jp2c box, then skip to
          * the starting byte of the codestream. */
         if (AV_RB32(buf + 4) == AV_RB32("jp2c"))
             buf += 8;
-        dec = opj_create_decompress(CODEC_J2K);
+        dec = opj_create_decompress(OPJ(CODEC_J2K));
     }
 
     if (!dec) {
         av_log(avctx, AV_LOG_ERROR, "Error initializing decoder.\n");
-        return AVERROR_UNKNOWN;
+        ret = AVERROR_EXTERNAL;
+        goto done;
     }
-    opj_set_event_mgr((opj_common_ptr) dec, NULL, NULL);
 
+#if OPENJPEG_MAJOR_VERSION == 1
+    memset(&ctx->event_mgr, 0, sizeof(ctx->event_mgr));
+    ctx->event_mgr.info_handler    = info_callback;
+    ctx->event_mgr.error_handler   = error_callback;
+    ctx->event_mgr.warning_handler = warning_callback;
+    opj_set_event_mgr((opj_common_ptr) dec, &ctx->event_mgr, avctx);
     ctx->dec_params.cp_limit_decoding = LIMIT_TO_MAIN_HEADER;
-    ctx->dec_params.cp_reduce         = ctx->lowres;
     ctx->dec_params.cp_layer          = ctx->lowqual;
+#else // OPENJPEG_MAJOR_VERSION == 2
+    if (!opj_set_error_handler(dec, error_callback, avctx) ||
+        !opj_set_warning_handler(dec, warning_callback, avctx) ||
+        !opj_set_info_handler(dec, info_callback, avctx)) {
+        av_log(avctx, AV_LOG_ERROR, "Error setting decoder handlers.\n");
+        ret = AVERROR_EXTERNAL;
+        goto done;
+    }
+
+    ctx->dec_params.cp_layer = ctx->lowqual;
+    ctx->dec_params.cp_reduce = avctx->lowres;
+#endif // OPENJPEG_MAJOR_VERSION == 1
+
     // Tie decoder with decoding parameters
     opj_setup_decoder(dec, &ctx->dec_params);
+
+#if OPENJPEG_MAJOR_VERSION == 1
     stream = opj_cio_open((opj_common_ptr) dec, buf, buf_size);
+#else // OPENJPEG_MAJOR_VERSION == 2
+    stream = opj_stream_default_create(OPJ_STREAM_READ);
+#endif // OPENJPEG_MAJOR_VERSION == 1
 
     if (!stream) {
         av_log(avctx, AV_LOG_ERROR,
                "Codestream could not be opened for reading.\n");
-        opj_destroy_decompress(dec);
-        return AVERROR_UNKNOWN;
+        ret = AVERROR_EXTERNAL;
+        goto done;
     }
 
+#if OPENJPEG_MAJOR_VERSION == 1
     // Decode the header only.
     image = opj_decode_with_info(dec, stream, NULL);
     opj_cio_close(stream);
+    stream = NULL;
+    ret = !image;
+#else // OPENJPEG_MAJOR_VERSION == 2
+    opj_stream_set_read_function(stream, stream_read);
+    opj_stream_set_skip_function(stream, stream_skip);
+    opj_stream_set_seek_function(stream, stream_seek);
+#if HAVE_OPENJPEG_2_1_OPENJPEG_H
+    opj_stream_set_user_data(stream, &reader, NULL);
+#elif HAVE_OPENJPEG_2_0_OPENJPEG_H
+    opj_stream_set_user_data(stream, &reader);
+#else
+#error Missing call to opj_stream_set_user_data
+#endif
+    opj_stream_set_user_data_length(stream, avpkt->size);
+    // Decode the header only.
+    ret = !opj_read_header(stream, dec, &image);
+#endif // OPENJPEG_MAJOR_VERSION == 1
 
-    if (!image) {
-        av_log(avctx, AV_LOG_ERROR, "Error decoding codestream.\n");
-        opj_destroy_decompress(dec);
-        return AVERROR_UNKNOWN;
+    if (ret) {
+        av_log(avctx, AV_LOG_ERROR, "Error decoding codestream header.\n");
+        ret = AVERROR_EXTERNAL;
+        goto done;
     }
 
     width  = image->x1 - image->x0;
     height = image->y1 - image->y0;
 
-    if (ctx->lowres) {
-        width  = (width + (1 << ctx->lowres) - 1) >> ctx->lowres;
-        height = (height + (1 << ctx->lowres) - 1) >> ctx->lowres;
-    }
-
     ret = ff_set_dimensions(avctx, width, height);
     if (ret < 0)
         goto done;
@@ -328,42 +464,52 @@ static int libopenjpeg_decode_frame(AVCodecContext *avctx,
         avctx->pix_fmt = libopenjpeg_guess_pix_fmt(image);
 
     if (avctx->pix_fmt == AV_PIX_FMT_NONE) {
-        av_log(avctx, AV_LOG_ERROR, "Unable to determine pixel format\n");
-        ret = AVERROR_INVALIDDATA;
+        av_log(avctx, AV_LOG_ERROR, "Unable to determine pixel format.\n");
+        ret = AVERROR_UNKNOWN;
         goto done;
     }
-
     for (i = 0; i < image->numcomps; i++)
         if (image->comps[i].prec > avctx->bits_per_raw_sample)
             avctx->bits_per_raw_sample = image->comps[i].prec;
 
-    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "ff_thread_get_buffer() failed\n");
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
         goto done;
-    }
 
+#if OPENJPEG_MAJOR_VERSION == 1
     ctx->dec_params.cp_limit_decoding = NO_LIMITATION;
+    ctx->dec_params.cp_reduce = avctx->lowres;
     // Tie decoder with decoding parameters.
     opj_setup_decoder(dec, &ctx->dec_params);
     stream = opj_cio_open((opj_common_ptr) dec, buf, buf_size);
     if (!stream) {
         av_log(avctx, AV_LOG_ERROR,
                "Codestream could not be opened for reading.\n");
-        ret = AVERROR_UNKNOWN;
+        ret = AVERROR_EXTERNAL;
         goto done;
     }
-
     opj_image_destroy(image);
     // Decode the codestream
     image = opj_decode_with_info(dec, stream, NULL);
-    opj_cio_close(stream);
+    ret = !image;
+#else // OPENJPEG_MAJOR_VERSION == 2
+    ret = !opj_decode(dec, stream, image);
+#endif // OPENJPEG_MAJOR_VERSION == 1
 
-    if (!image) {
+    if (ret) {
         av_log(avctx, AV_LOG_ERROR, "Error decoding codestream.\n");
-        ret = AVERROR_UNKNOWN;
+        ret = AVERROR_EXTERNAL;
         goto done;
     }
 
+    for (i = 0; i < image->numcomps; i++) {
+        if (!image->comps[i].data) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Image component %d contains no data.\n", i);
+            ret = AVERROR_INVALIDDATA;
+            goto done;
+        }
+    }
+
     desc       = av_pix_fmt_desc_get(avctx->pix_fmt);
     pixel_size = desc->comp[0].step;
     ispacked   = libopenjpeg_ispacked(avctx->pix_fmt);
@@ -406,22 +552,35 @@ static int libopenjpeg_decode_frame(AVCodecContext *avctx,
 
 done:
     opj_image_destroy(image);
+#if OPENJPEG_MAJOR_VERSION == 2
+    opj_stream_destroy(stream);
+    opj_destroy_codec(dec);
+#else
+    opj_cio_close(stream);
     opj_destroy_decompress(dec);
+#endif
     return ret;
 }
 
+static av_cold void libopenjpeg_static_init(AVCodec *codec)
+{
+    const char *version = opj_version();
+    int major, minor;
+
+    if (sscanf(version, "%d.%d", &major, &minor) == 2 && 1000*major + minor <= 1003)
+        codec->capabilities |= AV_CODEC_CAP_EXPERIMENTAL;
+}
+
 #define OFFSET(x) offsetof(LibOpenJPEGContext, x)
 #define VD AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM
 
 static const AVOption options[] = {
     { "lowqual", "Limit the number of layers used for decoding",
         OFFSET(lowqual), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VD },
-    { "lowres",  "Lower the decoding resolution by a power of two",
-        OFFSET(lowres),  AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VD },
     { NULL },
 };
 
-static const AVClass class = {
+static const AVClass openjpeg_class = {
     .class_name = "libopenjpeg",
     .item_name  = av_default_item_name,
     .option     = options,
@@ -437,5 +596,7 @@ AVCodec ff_libopenjpeg_decoder = {
     .init           = libopenjpeg_decode_init,
     .decode         = libopenjpeg_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
-    .priv_class     = &class,
+    .max_lowres     = 31,
+    .priv_class     = &openjpeg_class,
+    .init_static_data = libopenjpeg_static_init,
 };
diff --git a/libavcodec/libopenjpegenc.c b/libavcodec/libopenjpegenc.c
index d1af021..56c8219 100644
--- a/libavcodec/libopenjpegenc.c
+++ b/libavcodec/libopenjpegenc.c
@@ -1,21 +1,21 @@
 /*
  * JPEG 2000 encoding support via OpenJPEG
- * Copyright (c) 2011 Michael Bradshaw <mbradshaw@sorensonmedia.com>
+ * Copyright (c) 2011 Michael Bradshaw <mjbshaw gmail com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,8 +25,8 @@
  */
 
 #define  OPJ_STATIC
-#include <openjpeg.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/intreadwrite.h"
@@ -34,12 +34,31 @@
 #include "avcodec.h"
 #include "internal.h"
 
+#if HAVE_OPENJPEG_2_1_OPENJPEG_H
+#  include <openjpeg-2.1/openjpeg.h>
+#elif HAVE_OPENJPEG_2_0_OPENJPEG_H
+#  include <openjpeg-2.0/openjpeg.h>
+#elif HAVE_OPENJPEG_1_5_OPENJPEG_H
+#  include <openjpeg-1.5/openjpeg.h>
+#else
+#  include <openjpeg.h>
+#endif
+
+#if HAVE_OPENJPEG_2_1_OPENJPEG_H || HAVE_OPENJPEG_2_0_OPENJPEG_H
+#  define OPENJPEG_MAJOR_VERSION 2
+#  define OPJ(x) OPJ_##x
+#else
+#  define OPENJPEG_MAJOR_VERSION 1
+#  define OPJ(x) x
+#endif
+
 typedef struct LibOpenJPEGContext {
     AVClass *avclass;
     opj_image_t *image;
     opj_cparameters_t enc_params;
-    opj_cinfo_t *compress;
+#if OPENJPEG_MAJOR_VERSION == 1
     opj_event_mgr_t event_mgr;
+#endif // OPENJPEG_MAJOR_VERSION == 1
     int format;
     int profile;
     int prog_order;
@@ -66,38 +85,152 @@ static void info_callback(const char *msg, void *data)
     av_log(data, AV_LOG_DEBUG, "%s\n", msg);
 }
 
-static opj_image_t *libopenjpeg_create_image(AVCodecContext *avctx,
-                                             opj_cparameters_t *parameters)
+#if OPENJPEG_MAJOR_VERSION == 2
+typedef struct PacketWriter {
+    int pos;
+    AVPacket *packet;
+} PacketWriter;
+
+static OPJ_SIZE_T stream_write(void *out_buffer, OPJ_SIZE_T nb_bytes, void *user_data)
+{
+    PacketWriter *writer = user_data;
+    AVPacket *packet = writer->packet;
+    int remaining = packet->size - writer->pos;
+    if (nb_bytes > remaining) {
+        OPJ_SIZE_T needed = nb_bytes - remaining;
+        int max_growth = INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE - packet->size;
+        if (needed > max_growth) {
+            return (OPJ_SIZE_T)-1;
+        }
+        if (av_grow_packet(packet, (int)needed)) {
+            return (OPJ_SIZE_T)-1;
+        }
+    }
+    memcpy(packet->data + writer->pos, out_buffer, nb_bytes);
+    writer->pos += (int)nb_bytes;
+    return nb_bytes;
+}
+
+/* Skip handler for the OpenJPEG 2 PacketWriter stream: moves writer->pos by nb_bytes.
+ * Backward skips stop at position 0 (failing if already there); forward skips past the
+ * end grow the packet. Returns the distance moved, or (OPJ_OFF_T)-1 on failure. */
+static OPJ_OFF_T stream_skip(OPJ_OFF_T nb_bytes, void *user_data)
+{
+    PacketWriter *writer = user_data;
+    AVPacket *packet = writer->packet;
+    if (nb_bytes < 0) {
+        if (writer->pos == 0) {
+            return (OPJ_OFF_T)-1;
+        }
+        if (nb_bytes + writer->pos < 0) {
+            nb_bytes = -writer->pos;
+        }
+    } else {
+        int remaining = packet->size - writer->pos;
+        if (nb_bytes > remaining) {
+            OPJ_SIZE_T needed = nb_bytes - remaining;
+            int max_growth = INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE - packet->size;
+            if (needed > max_growth || av_grow_packet(packet, (int)needed)) {
+                return (OPJ_OFF_T)-1;
+            }
+        }
+    }
+    writer->pos += (int)nb_bytes;
+    return nb_bytes;
+}
+
+static OPJ_BOOL stream_seek(OPJ_OFF_T nb_bytes, void *user_data)
+{
+    PacketWriter *writer = user_data;
+    AVPacket *packet = writer->packet;
+    if (nb_bytes < 0) {
+        return OPJ_FALSE;
+    }
+    if (nb_bytes > packet->size) {
+        if (nb_bytes > INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE ||
+            av_grow_packet(packet, (int)nb_bytes - packet->size)) {
+            return OPJ_FALSE;
+        }
+    }
+    writer->pos = (int)nb_bytes;
+    return OPJ_TRUE;
+}
+#endif // OPENJPEG_MAJOR_VERSION == 2
+
+static void cinema_parameters(opj_cparameters_t *p)
+{
+    p->tile_size_on = 0;
+    p->cp_tdx = 1;
+    p->cp_tdy = 1;
+
+    /* Tile part */
+    p->tp_flag = 'C';
+    p->tp_on = 1;
+
+    /* Tile and Image shall be at (0, 0) */
+    p->cp_tx0 = 0;
+    p->cp_ty0 = 0;
+    p->image_offset_x0 = 0;
+    p->image_offset_y0 = 0;
+
+    /* Codeblock size= 32 * 32 */
+    p->cblockw_init = 32;
+    p->cblockh_init = 32;
+    p->csty |= 0x01;
+
+    /* The progression order shall be CPRL */
+    p->prog_order = OPJ(CPRL);
+
+    /* No ROI */
+    p->roi_compno = -1;
+
+    /* No subsampling */
+    p->subsampling_dx = 1;
+    p->subsampling_dy = 1;
+
+    /* 9-7 transform */
+    p->irreversible = 1;
+
+    p->tcp_mct = 1;
+}
+
+static opj_image_t *mj2_create_image(AVCodecContext *avctx, opj_cparameters_t *parameters)
 {
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt);
-    opj_image_cmptparm_t *cmptparm;
-    OPJ_COLOR_SPACE color_space;
+    opj_image_cmptparm_t cmptparm[4] = {{0}};
     opj_image_t *img;
     int i;
     int sub_dx[4];
     int sub_dy[4];
-    int numcomps = desc->nb_components;
+    int numcomps;
+    OPJ_COLOR_SPACE color_space = OPJ(CLRSPC_UNKNOWN);
+
+    sub_dx[0] = sub_dx[3] = 1;
+    sub_dy[0] = sub_dy[3] = 1;
+    sub_dx[1] = sub_dx[2] = 1 << desc->log2_chroma_w;
+    sub_dy[1] = sub_dy[2] = 1 << desc->log2_chroma_h;
 
-    sub_dx[0] =
-    sub_dx[3] = 1;
-    sub_dy[0] =
-    sub_dy[3] = 1;
-    sub_dx[1] =
-    sub_dx[2] = 1 << desc->log2_chroma_w;
-    sub_dy[1] =
-    sub_dy[2] = 1 << desc->log2_chroma_h;
+    numcomps = desc->nb_components;
 
     switch (avctx->pix_fmt) {
     case AV_PIX_FMT_GRAY8:
-    case AV_PIX_FMT_GRAY16:
     case AV_PIX_FMT_YA8:
-        color_space = CLRSPC_GRAY;
+    case AV_PIX_FMT_GRAY16:
+    case AV_PIX_FMT_YA16:
+        color_space = OPJ(CLRSPC_GRAY);
         break;
     case AV_PIX_FMT_RGB24:
     case AV_PIX_FMT_RGBA:
     case AV_PIX_FMT_RGB48:
     case AV_PIX_FMT_RGBA64:
-        color_space = CLRSPC_SRGB;
+    case AV_PIX_FMT_GBR24P:
+    case AV_PIX_FMT_GBRP9:
+    case AV_PIX_FMT_GBRP10:
+    case AV_PIX_FMT_GBRP12:
+    case AV_PIX_FMT_GBRP14:
+    case AV_PIX_FMT_GBRP16:
+    case AV_PIX_FMT_XYZ12:
+        color_space = OPJ(CLRSPC_SRGB);
         break;
     case AV_PIX_FMT_YUV410P:
     case AV_PIX_FMT_YUV411P:
@@ -106,16 +239,33 @@ static opj_image_t *libopenjpeg_create_image(AVCodecContext *avctx,
     case AV_PIX_FMT_YUV440P:
     case AV_PIX_FMT_YUV444P:
     case AV_PIX_FMT_YUVA420P:
+    case AV_PIX_FMT_YUVA422P:
+    case AV_PIX_FMT_YUVA444P:
     case AV_PIX_FMT_YUV420P9:
     case AV_PIX_FMT_YUV422P9:
     case AV_PIX_FMT_YUV444P9:
+    case AV_PIX_FMT_YUVA420P9:
+    case AV_PIX_FMT_YUVA422P9:
+    case AV_PIX_FMT_YUVA444P9:
     case AV_PIX_FMT_YUV420P10:
     case AV_PIX_FMT_YUV422P10:
     case AV_PIX_FMT_YUV444P10:
+    case AV_PIX_FMT_YUVA420P10:
+    case AV_PIX_FMT_YUVA422P10:
+    case AV_PIX_FMT_YUVA444P10:
+    case AV_PIX_FMT_YUV420P12:
+    case AV_PIX_FMT_YUV422P12:
+    case AV_PIX_FMT_YUV444P12:
+    case AV_PIX_FMT_YUV420P14:
+    case AV_PIX_FMT_YUV422P14:
+    case AV_PIX_FMT_YUV444P14:
     case AV_PIX_FMT_YUV420P16:
     case AV_PIX_FMT_YUV422P16:
     case AV_PIX_FMT_YUV444P16:
-        color_space = CLRSPC_SYCC;
+    case AV_PIX_FMT_YUVA420P16:
+    case AV_PIX_FMT_YUVA422P16:
+    case AV_PIX_FMT_YUVA444P16:
+        color_space = OPJ(CLRSPC_SYCC);
         break;
     default:
         av_log(avctx, AV_LOG_ERROR,
@@ -124,183 +274,368 @@ static opj_image_t *libopenjpeg_create_image(AVCodecContext *avctx,
         return NULL;
     }
 
-    cmptparm = av_mallocz(numcomps * sizeof(*cmptparm));
-    if (!cmptparm) {
-        av_log(avctx, AV_LOG_ERROR, "Not enough memory");
-        return NULL;
-    }
-
     for (i = 0; i < numcomps; i++) {
         cmptparm[i].prec = desc->comp[i].depth;
         cmptparm[i].bpp  = desc->comp[i].depth;
         cmptparm[i].sgnd = 0;
-        cmptparm[i].dx   = sub_dx[i];
-        cmptparm[i].dy   = sub_dy[i];
-        cmptparm[i].w    = avctx->width / sub_dx[i];
-        cmptparm[i].h    = avctx->height / sub_dy[i];
+        cmptparm[i].dx = sub_dx[i];
+        cmptparm[i].dy = sub_dy[i];
+        cmptparm[i].w = (avctx->width + sub_dx[i] - 1) / sub_dx[i];
+        cmptparm[i].h = (avctx->height + sub_dy[i] - 1) / sub_dy[i];
     }
 
     img = opj_image_create(numcomps, cmptparm, color_space);
-    av_freep(&cmptparm);
+
+    if (!img)
+        return NULL;
+
+    // x0, y0 is the top left corner of the image
+    // x1, y1 is the width, height of the reference grid
+    img->x0 = 0;
+    img->y0 = 0;
+    img->x1 = (avctx->width  - 1) * parameters->subsampling_dx + 1;
+    img->y1 = (avctx->height - 1) * parameters->subsampling_dy + 1;
+
     return img;
 }
 
 static av_cold int libopenjpeg_encode_init(AVCodecContext *avctx)
 {
     LibOpenJPEGContext *ctx = avctx->priv_data;
-    int err = AVERROR(ENOMEM);
+    int err = 0;
 
     opj_set_default_encoder_parameters(&ctx->enc_params);
 
-    ctx->enc_params.cp_rsiz          = ctx->profile;
-    ctx->enc_params.mode             = !!avctx->global_quality;
-    ctx->enc_params.cp_cinema        = ctx->cinema_mode;
-    ctx->enc_params.prog_order       = ctx->prog_order;
-    ctx->enc_params.numresolution    = ctx->numresolution;
-    ctx->enc_params.cp_disto_alloc   = ctx->disto_alloc;
-    ctx->enc_params.cp_fixed_alloc   = ctx->fixed_alloc;
+#if HAVE_OPENJPEG_2_1_OPENJPEG_H
+    switch (ctx->cinema_mode) {
+    case OPJ_CINEMA2K_24:
+        ctx->enc_params.rsiz = OPJ_PROFILE_CINEMA_2K;
+        ctx->enc_params.max_cs_size = OPJ_CINEMA_24_CS;
+        ctx->enc_params.max_comp_size = OPJ_CINEMA_24_COMP;
+        break;
+    case OPJ_CINEMA2K_48:
+        ctx->enc_params.rsiz = OPJ_PROFILE_CINEMA_2K;
+        ctx->enc_params.max_cs_size = OPJ_CINEMA_48_CS;
+        ctx->enc_params.max_comp_size = OPJ_CINEMA_48_COMP;
+        break;
+    case OPJ_CINEMA4K_24:
+        ctx->enc_params.rsiz = OPJ_PROFILE_CINEMA_4K;
+        ctx->enc_params.max_cs_size = OPJ_CINEMA_24_CS;
+        ctx->enc_params.max_comp_size = OPJ_CINEMA_24_COMP;
+        break;
+    }
+
+    switch (ctx->profile) {
+    case OPJ_CINEMA2K:
+        if (ctx->enc_params.rsiz == OPJ_PROFILE_CINEMA_4K) {
+            err = AVERROR(EINVAL);
+            break;
+        }
+        ctx->enc_params.rsiz = OPJ_PROFILE_CINEMA_2K;
+        break;
+    case OPJ_CINEMA4K:
+        if (ctx->enc_params.rsiz == OPJ_PROFILE_CINEMA_2K) {
+            err = AVERROR(EINVAL);
+            break;
+        }
+        ctx->enc_params.rsiz = OPJ_PROFILE_CINEMA_4K;
+        break;
+    }
+
+    if (err) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Invalid parameter pairing: cinema_mode and profile conflict.\n");
+        goto fail;
+    }
+#else
+    ctx->enc_params.cp_rsiz = ctx->profile;
+    ctx->enc_params.cp_cinema = ctx->cinema_mode;
+#endif
+
+    ctx->enc_params.mode = !!avctx->global_quality;
+    ctx->enc_params.prog_order = ctx->prog_order;
+    ctx->enc_params.numresolution = ctx->numresolution;
+    ctx->enc_params.cp_disto_alloc = ctx->disto_alloc;
+    ctx->enc_params.cp_fixed_alloc = ctx->fixed_alloc;
     ctx->enc_params.cp_fixed_quality = ctx->fixed_quality;
-    ctx->enc_params.tcp_numlayers    = ctx->numlayers;
-    ctx->enc_params.tcp_rates[0]     = FFMAX(avctx->compression_level, 0) * 2;
+    ctx->enc_params.tcp_numlayers = ctx->numlayers;
+    ctx->enc_params.tcp_rates[0] = FFMAX(avctx->compression_level, 0) * 2;
 
-    ctx->compress = opj_create_compress(ctx->format);
-    if (!ctx->compress) {
-        av_log(avctx, AV_LOG_ERROR, "Error creating the compressor\n");
-        return AVERROR(ENOMEM);
+    if (ctx->cinema_mode > 0) {
+        cinema_parameters(&ctx->enc_params);
     }
 
-    ctx->image = libopenjpeg_create_image(avctx, &ctx->enc_params);
+    ctx->image = mj2_create_image(avctx, &ctx->enc_params);
     if (!ctx->image) {
         av_log(avctx, AV_LOG_ERROR, "Error creating the mj2 image\n");
         err = AVERROR(EINVAL);
         goto fail;
     }
 
-    ctx->event_mgr.info_handler    = info_callback;
-    ctx->event_mgr.error_handler   = error_callback;
-    ctx->event_mgr.warning_handler = warning_callback;
-    opj_set_event_mgr((opj_common_ptr) ctx->compress, &ctx->event_mgr, avctx);
-
     return 0;
 
 fail:
-    av_freep(&ctx->compress);
+    opj_image_destroy(ctx->image);
+    ctx->image = NULL;
     return err;
 }
 
-static void libopenjpeg_copy_packed8(AVCodecContext *avctx,
-                                     const AVFrame *frame, opj_image_t *image)
+static int libopenjpeg_copy_packed8(AVCodecContext *avctx, const AVFrame *frame, opj_image_t *image)
 {
     int compno;
-    int x, y;
-    int image_index, frame_index;
+    int x;
+    int y;
+    int *image_line;
+    int frame_index;
     const int numcomps = image->numcomps;
 
-    for (compno = 0; compno < numcomps; ++compno)
+    for (compno = 0; compno < numcomps; ++compno) {
+        if (image->comps[compno].w > frame->linesize[0] / numcomps) {
+            av_log(avctx, AV_LOG_ERROR, "Error: frame's linesize is too small for the image\n");
+            return 0;
+        }
+    }
+
+    for (compno = 0; compno < numcomps; ++compno) {
         for (y = 0; y < avctx->height; ++y) {
-            image_index = y * avctx->width;
+            image_line = image->comps[compno].data + y * image->comps[compno].w;
             frame_index = y * frame->linesize[0] + compno;
             for (x = 0; x < avctx->width; ++x) {
-                image->comps[compno].data[image_index++] =
-                    frame->data[0][frame_index];
+                image_line[x] = frame->data[0][frame_index];
                 frame_index += numcomps;
             }
+            for (; x < image->comps[compno].w; ++x) {
+                image_line[x] = image_line[x - 1];
+            }
+        }
+        for (; y < image->comps[compno].h; ++y) {
+            image_line = image->comps[compno].data + y * image->comps[compno].w;
+            for (x = 0; x < image->comps[compno].w; ++x) {
+                image_line[x] = image_line[x - image->comps[compno].w];
+            }
         }
+    }
+
+    return 1;
 }
 
-static void libopenjpeg_copy_packed16(AVCodecContext *avctx,
-                                      const AVFrame *frame, opj_image_t *image)
+// for XYZ 12 bit
+static int libopenjpeg_copy_packed12(AVCodecContext *avctx, const AVFrame *frame, opj_image_t *image)
 {
     int compno;
     int x, y;
-    int image_index, frame_index;
+    int *image_line;
+    int frame_index;
     const int numcomps  = image->numcomps;
     uint16_t *frame_ptr = (uint16_t *)frame->data[0];
 
-    for (compno = 0; compno < numcomps; ++compno)
+    for (compno = 0; compno < numcomps; ++compno) {
+        if (image->comps[compno].w > frame->linesize[0] / (numcomps * 2)) { /* linesize is in bytes; samples are read as uint16_t */
+            av_log(avctx, AV_LOG_ERROR, "Error: frame's linesize is too small for the image\n");
+            return 0;
+        }
+    }
+
+    for (compno = 0; compno < numcomps; ++compno) {
         for (y = 0; y < avctx->height; ++y) {
-            image_index = y * avctx->width;
+            image_line = image->comps[compno].data + y * image->comps[compno].w;
             frame_index = y * (frame->linesize[0] / 2) + compno;
             for (x = 0; x < avctx->width; ++x) {
-                image->comps[compno].data[image_index++] =
-                    frame_ptr[frame_index];
+                image_line[x] = frame_ptr[frame_index] >> 4;
                 frame_index += numcomps;
             }
+            for (; x < image->comps[compno].w; ++x) {
+                image_line[x] = image_line[x - 1];
+            }
+        }
+        for (; y < image->comps[compno].h; ++y) {
+            image_line = image->comps[compno].data + y * image->comps[compno].w;
+            for (x = 0; x < image->comps[compno].w; ++x) {
+                image_line[x] = image_line[x - image->comps[compno].w];
+            }
         }
+    }
+
+    return 1;
 }
 
-static void libopenjpeg_copy_unpacked8(AVCodecContext *avctx,
-                                       const AVFrame *frame, opj_image_t *image)
+static int libopenjpeg_copy_packed16(AVCodecContext *avctx, const AVFrame *frame, opj_image_t *image)
 {
     int compno;
-    int x, y;
-    int width, height;
-    int image_index, frame_index;
+    int x;
+    int y;
+    int *image_line;
+    int frame_index;
+    const int numcomps = image->numcomps;
+    uint16_t *frame_ptr = (uint16_t*)frame->data[0];
+
+    for (compno = 0; compno < numcomps; ++compno) {
+        if (image->comps[compno].w > frame->linesize[0] / (numcomps * 2)) { /* linesize is in bytes; samples are read as uint16_t */
+            av_log(avctx, AV_LOG_ERROR, "Error: frame's linesize is too small for the image\n");
+            return 0;
+        }
+    }
+
+    for (compno = 0; compno < numcomps; ++compno) {
+        for (y = 0; y < avctx->height; ++y) {
+            image_line = image->comps[compno].data + y * image->comps[compno].w;
+            frame_index = y * (frame->linesize[0] / 2) + compno;
+            for (x = 0; x < avctx->width; ++x) {
+                image_line[x] = frame_ptr[frame_index];
+                frame_index += numcomps;
+            }
+            for (; x < image->comps[compno].w; ++x) {
+                image_line[x] = image_line[x - 1];
+            }
+        }
+        for (; y < image->comps[compno].h; ++y) {
+            image_line = image->comps[compno].data + y * image->comps[compno].w;
+            for (x = 0; x < image->comps[compno].w; ++x) {
+                image_line[x] = image_line[x - image->comps[compno].w];
+            }
+        }
+    }
+
+    return 1;
+}
+
+static int libopenjpeg_copy_unpacked8(AVCodecContext *avctx, const AVFrame *frame, opj_image_t *image) /* Copy each plane frame->data[compno] into image->comps[compno].data, padding every component to its w x h; returns 1 on success, 0 if a linesize check fails. */
+{
+    int compno;
+    int x;
+    int y;
+    int width;
+    int height;
+    int *image_line; /* start of row y inside the current component's data */
+    int frame_index; /* running index into frame->data[compno] */
     const int numcomps = image->numcomps;
 
     for (compno = 0; compno < numcomps; ++compno) {
+        if (image->comps[compno].w > frame->linesize[compno]) { /* refuse to copy when a component row is wider than the plane's linesize */
+            av_log(avctx, AV_LOG_ERROR, "Error: frame's linesize is too small for the image\n");
+            return 0;
+        }
+    }
+
+    for (compno = 0; compno < numcomps; ++compno) {
         width  = avctx->width / image->comps[compno].dx;
         height = avctx->height / image->comps[compno].dy;
         for (y = 0; y < height; ++y) {
-            image_index = y * width;
+            image_line = image->comps[compno].data + y * image->comps[compno].w; /* rows are laid out with the component's own width, not the copied width */
             frame_index = y * frame->linesize[compno];
             for (x = 0; x < width; ++x)
-                image->comps[compno].data[image_index++] =
-                    frame->data[compno][frame_index++];
+                image_line[x] = frame->data[compno][frame_index++];
+            for (; x < image->comps[compno].w; ++x) { /* pad the row out to the component width by repeating the last value */
+                image_line[x] = image_line[x - 1];
+            }
+        }
+        for (; y < image->comps[compno].h; ++y) { /* pad the plane out to the component height by repeating the previous row */
+            image_line = image->comps[compno].data + y * image->comps[compno].w;
+            for (x = 0; x < image->comps[compno].w; ++x) {
+                image_line[x] = image_line[x - image->comps[compno].w]; /* same column, one row up */
+            }
         }
     }
+
+    return 1;
 }
 
-static void libopenjpeg_copy_unpacked16(AVCodecContext *avctx,
-                                        const AVFrame *frame,
-                                        opj_image_t *image)
+static int libopenjpeg_copy_unpacked16(AVCodecContext *avctx, const AVFrame *frame, opj_image_t *image) /* Copy each plane frame->data[compno], read as uint16_t, into image->comps[compno].data, padding every component to its w x h; returns 1 on success, 0 if a linesize check fails. */
 {
     int compno;
-    int x, y;
-    int width, height;
-    int image_index, frame_index;
+    int x;
+    int y;
+    int width;
+    int height;
+    int *image_line; /* start of row y inside the current component's data */
+    int frame_index; /* running index into frame_ptr, counted in uint16_t elements */
     const int numcomps = image->numcomps;
     uint16_t *frame_ptr;
 
     for (compno = 0; compno < numcomps; ++compno) {
+        if (image->comps[compno].w > frame->linesize[compno]) { /* NOTE(review): linesize is used unscaled here but halved for row indexing below -- confirm the intended units */
+            av_log(avctx, AV_LOG_ERROR, "Error: frame's linesize is too small for the image\n");
+            return 0;
+        }
+    }
+
+    for (compno = 0; compno < numcomps; ++compno) {
         width     = avctx->width / image->comps[compno].dx;
         height    = avctx->height / image->comps[compno].dy;
         frame_ptr = (uint16_t *)frame->data[compno];
         for (y = 0; y < height; ++y) {
-            image_index = y * width;
+            image_line = image->comps[compno].data + y * image->comps[compno].w; /* rows are laid out with the component's own width, not the copied width */
             frame_index = y * (frame->linesize[compno] / 2);
             for (x = 0; x < width; ++x)
-                image->comps[compno].data[image_index++] =
-                    frame_ptr[frame_index++];
+                image_line[x] = frame_ptr[frame_index++];
+            for (; x < image->comps[compno].w; ++x) { /* pad the row out to the component width by repeating the last value */
+                image_line[x] = image_line[x - 1];
+            }
+        }
+        for (; y < image->comps[compno].h; ++y) { /* pad the plane out to the component height by repeating the previous row */
+            image_line = image->comps[compno].data + y * image->comps[compno].w;
+            for (x = 0; x < image->comps[compno].w; ++x) {
+                image_line[x] = image_line[x - image->comps[compno].w]; /* same column, one row up */
+            }
         }
     }
+
+    return 1;
 }
 
 static int libopenjpeg_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                                     const AVFrame *frame, int *got_packet)
 {
     LibOpenJPEGContext *ctx = avctx->priv_data;
-    opj_cinfo_t *compress   = ctx->compress;
     opj_image_t *image      = ctx->image;
-    opj_cio_t *stream;
-    int ret, len;
-
-    // x0, y0 is the top left corner of the image
-    // x1, y1 is the width, height of the reference grid
-    image->x0 = 0;
-    image->y0 = 0;
-    image->x1 = (avctx->width - 1) * ctx->enc_params.subsampling_dx + 1;
-    image->y1 = (avctx->height - 1) * ctx->enc_params.subsampling_dy + 1;
+#if OPENJPEG_MAJOR_VERSION == 1
+    opj_cinfo_t *compress   = NULL;
+    opj_cio_t *stream       = NULL;
+    int len;
+#else // OPENJPEG_MAJOR_VERSION == 2
+    opj_codec_t *compress   = NULL;
+    opj_stream_t *stream    = NULL;
+    PacketWriter writer     = { 0 };
+#endif // OPENJPEG_MAJOR_VERSION == 1
+    int cpyresult = 0; /* result of the selected copy helper; checked after the switch */
+    int ret;
+    AVFrame *gbrframe;
 
     switch (avctx->pix_fmt) {
     case AV_PIX_FMT_RGB24:
     case AV_PIX_FMT_RGBA:
     case AV_PIX_FMT_YA8:
-        libopenjpeg_copy_packed8(avctx, frame, image);
+        cpyresult = libopenjpeg_copy_packed8(avctx, frame, image);
+        break;
+    case AV_PIX_FMT_XYZ12:
+        cpyresult = libopenjpeg_copy_packed12(avctx, frame, image);
         break;
     case AV_PIX_FMT_RGB48:
     case AV_PIX_FMT_RGBA64:
-        libopenjpeg_copy_packed16(avctx, frame, image);
+    case AV_PIX_FMT_YA16:
+        cpyresult = libopenjpeg_copy_packed16(avctx, frame, image);
+        break;
+    case AV_PIX_FMT_GBR24P:
+    case AV_PIX_FMT_GBRP9:
+    case AV_PIX_FMT_GBRP10:
+    case AV_PIX_FMT_GBRP12:
+    case AV_PIX_FMT_GBRP14:
+    case AV_PIX_FMT_GBRP16:
+        gbrframe = av_frame_clone(frame); /* scratch frame; its plane pointers and linesizes are reordered below */
+        if (!gbrframe)
+            return AVERROR(ENOMEM);
+        gbrframe->data[0] = frame->data[2]; // swap to be rgb
+        gbrframe->data[1] = frame->data[0];
+        gbrframe->data[2] = frame->data[1];
+        gbrframe->linesize[0] = frame->linesize[2];
+        gbrframe->linesize[1] = frame->linesize[0];
+        gbrframe->linesize[2] = frame->linesize[1];
+        if (avctx->pix_fmt == AV_PIX_FMT_GBR24P) {
+            cpyresult = libopenjpeg_copy_unpacked8(avctx, gbrframe, image);
+        } else {
+            cpyresult = libopenjpeg_copy_unpacked16(avctx, gbrframe, image);
+        }
+        av_frame_free(&gbrframe); /* the scratch frame is not used after the copy */
         break;
     case AV_PIX_FMT_GRAY8:
     case AV_PIX_FMT_YUV410P:
@@ -310,19 +645,36 @@ static int libopenjpeg_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     case AV_PIX_FMT_YUV440P:
     case AV_PIX_FMT_YUV444P:
     case AV_PIX_FMT_YUVA420P:
-        libopenjpeg_copy_unpacked8(avctx, frame, image);
+    case AV_PIX_FMT_YUVA422P:
+    case AV_PIX_FMT_YUVA444P:
+        cpyresult = libopenjpeg_copy_unpacked8(avctx, frame, image);
         break;
     case AV_PIX_FMT_GRAY16:
     case AV_PIX_FMT_YUV420P9:
     case AV_PIX_FMT_YUV422P9:
     case AV_PIX_FMT_YUV444P9:
+    case AV_PIX_FMT_YUVA420P9:
+    case AV_PIX_FMT_YUVA422P9:
+    case AV_PIX_FMT_YUVA444P9:
     case AV_PIX_FMT_YUV444P10:
     case AV_PIX_FMT_YUV422P10:
     case AV_PIX_FMT_YUV420P10:
+    case AV_PIX_FMT_YUVA444P10:
+    case AV_PIX_FMT_YUVA422P10:
+    case AV_PIX_FMT_YUVA420P10:
+    case AV_PIX_FMT_YUV420P12:
+    case AV_PIX_FMT_YUV422P12:
+    case AV_PIX_FMT_YUV444P12:
+    case AV_PIX_FMT_YUV420P14:
+    case AV_PIX_FMT_YUV422P14:
+    case AV_PIX_FMT_YUV444P14:
     case AV_PIX_FMT_YUV444P16:
     case AV_PIX_FMT_YUV422P16:
     case AV_PIX_FMT_YUV420P16:
-        libopenjpeg_copy_unpacked16(avctx, frame, image);
+    case AV_PIX_FMT_YUVA444P16:
+    case AV_PIX_FMT_YUVA422P16:
+    case AV_PIX_FMT_YUVA420P16:
+        cpyresult = libopenjpeg_copy_unpacked16(avctx, frame, image);
         break;
     default:
         av_log(avctx, AV_LOG_ERROR,
@@ -332,71 +684,146 @@ static int libopenjpeg_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         break;
     }
 
+    if (!cpyresult) { /* still 0 when the selected copy helper reported failure */
+        av_log(avctx, AV_LOG_ERROR,
+               "Could not copy the frame data to the internal image buffer\n");
+        return -1;
+    }
+
+#if OPENJPEG_MAJOR_VERSION == 2
+    if ((ret = ff_alloc_packet2(avctx, pkt, 1024, 0)) < 0) { /* initial size only; the packet is shrunk to writer.pos after encoding */
+        return ret;
+    }
+#endif // OPENJPEG_MAJOR_VERSION == 2
+
+    compress = opj_create_compress(ctx->format); /* created for this call and released at done: */
+    if (!compress) {
+        av_log(avctx, AV_LOG_ERROR, "Error creating the compressor\n");
+        ret = AVERROR(ENOMEM);
+        goto done;
+    }
+
+#if OPENJPEG_MAJOR_VERSION == 1
     opj_setup_encoder(compress, &ctx->enc_params, image);
     stream = opj_cio_open((opj_common_ptr) compress, NULL, 0);
+#else // OPENJPEG_MAJOR_VERSION == 2
+    if (!opj_set_error_handler(compress, error_callback, avctx) ||
+        !opj_set_warning_handler(compress, warning_callback, avctx) ||
+        !opj_set_info_handler(compress, info_callback, avctx)) {
+        av_log(avctx, AV_LOG_ERROR, "Error setting the compressor handlers\n");
+        ret = AVERROR_EXTERNAL;
+        goto done;
+    }
+
+    if (!opj_setup_encoder(compress, &ctx->enc_params, image)) {
+        av_log(avctx, AV_LOG_ERROR, "Error setting up the compressor\n");
+        ret = AVERROR_EXTERNAL;
+        goto done;
+    }
+    stream = opj_stream_default_create(OPJ_STREAM_WRITE);
+#endif // OPENJPEG_MAJOR_VERSION == 1
+
     if (!stream) {
         av_log(avctx, AV_LOG_ERROR, "Error creating the cio stream\n");
-        return AVERROR(ENOMEM);
+        ret = AVERROR(ENOMEM);
+        goto done;
     }
-
+#if OPENJPEG_MAJOR_VERSION == 1
+    memset(&ctx->event_mgr, 0, sizeof(ctx->event_mgr));
+    ctx->event_mgr.info_handler    = info_callback;
+    ctx->event_mgr.error_handler   = error_callback;
+    ctx->event_mgr.warning_handler = warning_callback;
+    opj_set_event_mgr((opj_common_ptr) compress, &ctx->event_mgr, avctx);
     if (!opj_encode(compress, stream, image, NULL)) {
-        opj_cio_close(stream);
         av_log(avctx, AV_LOG_ERROR, "Error during the opj encode\n");
-        return -1;
+        ret = AVERROR_EXTERNAL;
+        goto done;
     }
 
     len = cio_tell(stream);
-    if ((ret = ff_alloc_packet(pkt, len)) < 0) {
-        opj_cio_close(stream);
-        return ret;
+    if ((ret = ff_alloc_packet2(avctx, pkt, len, 0)) < 0) {
+        goto done;
     }
 
     memcpy(pkt->data, stream->buffer, len);
+#else // OPENJPEG_MAJOR_VERSION == 2
+    writer.packet = pkt; /* the stream callbacks receive &writer as their user data (set below) */
+    opj_stream_set_write_function(stream, stream_write);
+    opj_stream_set_skip_function(stream, stream_skip);
+    opj_stream_set_seek_function(stream, stream_seek);
+#if HAVE_OPENJPEG_2_1_OPENJPEG_H
+    opj_stream_set_user_data(stream, &writer, NULL);
+#elif HAVE_OPENJPEG_2_0_OPENJPEG_H
+    opj_stream_set_user_data(stream, &writer);
+#else
+#error Missing call to opj_stream_set_user_data
+#endif
+
+    if (!opj_start_compress(compress, ctx->image, stream) ||
+        !opj_encode(compress, stream) ||
+        !opj_end_compress(compress, stream)) {
+        av_log(avctx, AV_LOG_ERROR, "Error during the opj encode\n");
+        ret = AVERROR_EXTERNAL;
+        goto done;
+    }
+
+    av_shrink_packet(pkt, writer.pos); /* presumably writer.pos is the number of bytes the callbacks wrote -- confirm against stream_write */
+#endif // OPENJPEG_MAJOR_VERSION == 1
+
     pkt->flags |= AV_PKT_FLAG_KEY;
     *got_packet = 1;
+    ret = 0;
+
+done: /* shared cleanup for success and every goto-done error path; NOTE(review): stream and/or compress can still be NULL here -- confirm the destroy calls accept NULL */
+#if OPENJPEG_MAJOR_VERSION == 2
+    opj_stream_destroy(stream);
+    opj_destroy_codec(compress);
+#else
     opj_cio_close(stream);
-    return 0;
+    opj_destroy_compress(compress);
+#endif
+    return ret;
 }
 
 static av_cold int libopenjpeg_encode_close(AVCodecContext *avctx)
 {
     LibOpenJPEGContext *ctx = avctx->priv_data;
 
-    opj_destroy_compress(ctx->compress);
     opj_image_destroy(ctx->image);
+    ctx->image = NULL; /* do not leave a pointer to the destroyed image in the context */
     return 0;
 }
 
 #define OFFSET(x) offsetof(LibOpenJPEGContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
-    { "format",        "Codec Format",      OFFSET(format),        AV_OPT_TYPE_INT,   { .i64 = CODEC_JP2   }, CODEC_J2K, CODEC_JP2,   VE, "format"      },
-    { "j2k",           NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = CODEC_J2K   }, 0,         0,           VE, "format"      },
-    { "jp2",           NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = CODEC_JP2   }, 0,         0,           VE, "format"      },
-    { "profile",       NULL,                OFFSET(profile),       AV_OPT_TYPE_INT,   { .i64 = STD_RSIZ    }, STD_RSIZ,  CINEMA4K,    VE, "profile"     },
-    { "jpeg2000",      NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = STD_RSIZ    }, 0,         0,           VE, "profile"     },
-    { "cinema2k",      NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = CINEMA2K    }, 0,         0,           VE, "profile"     },
-    { "cinema4k",      NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = CINEMA4K    }, 0,         0,           VE, "profile"     },
-    { "cinema_mode",   "Digital Cinema",    OFFSET(cinema_mode),   AV_OPT_TYPE_INT,   { .i64 = OFF         }, OFF,       CINEMA4K_24, VE, "cinema_mode" },
-    { "off",           NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OFF         }, 0,         0,           VE, "cinema_mode" },
-    { "2k_24",         NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = CINEMA2K_24 }, 0,         0,           VE, "cinema_mode" },
-    { "2k_48",         NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = CINEMA2K_48 }, 0,         0,           VE, "cinema_mode" },
-    { "4k_24",         NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = CINEMA4K_24 }, 0,         0,           VE, "cinema_mode" },
-    { "prog_order",    "Progression Order", OFFSET(prog_order),    AV_OPT_TYPE_INT,   { .i64 = LRCP        }, LRCP,      CPRL,        VE, "prog_order"  },
-    { "lrcp",          NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = LRCP        }, 0,         0,           VE, "prog_order"  },
-    { "rlcp",          NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = RLCP        }, 0,         0,           VE, "prog_order"  },
-    { "rpcl",          NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = RPCL        }, 0,         0,           VE, "prog_order"  },
-    { "pcrl",          NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = PCRL        }, 0,         0,           VE, "prog_order"  },
-    { "cprl",          NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = CPRL        }, 0,         0,           VE, "prog_order"  },
-    { "numresolution", NULL,                OFFSET(numresolution), AV_OPT_TYPE_INT,   { .i64 = 6           }, 1,         10,          VE },
-    { "numlayers",     NULL,                OFFSET(numlayers),     AV_OPT_TYPE_INT,   { .i64 = 1           }, 1,         10,          VE },
-    { "disto_alloc",   NULL,                OFFSET(disto_alloc),   AV_OPT_TYPE_INT,   { .i64 = 1           }, 0,         1,           VE },
-    { "fixed_alloc",   NULL,                OFFSET(fixed_alloc),   AV_OPT_TYPE_INT,   { .i64 = 0           }, 0,         1,           VE },
-    { "fixed_quality", NULL,                OFFSET(fixed_quality), AV_OPT_TYPE_INT,   { .i64 = 0           }, 0,         1,           VE },
+    { "format",        "Codec Format",      OFFSET(format),        AV_OPT_TYPE_INT,   { .i64 = OPJ(CODEC_JP2)   }, OPJ(CODEC_J2K), OPJ(CODEC_JP2),   VE, "format"      }, /* OPJ() is defined outside this view; presumably it selects the version-specific library constant -- confirm */
+    { "j2k",           NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OPJ(CODEC_J2K)   }, 0,         0,           VE, "format"      },
+    { "jp2",           NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OPJ(CODEC_JP2)   }, 0,         0,           VE, "format"      },
+    { "profile",       NULL,                OFFSET(profile),       AV_OPT_TYPE_INT,   { .i64 = OPJ(STD_RSIZ)    }, OPJ(STD_RSIZ),  OPJ(CINEMA4K),    VE, "profile"     },
+    { "jpeg2000",      NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OPJ(STD_RSIZ)    }, 0,         0,           VE, "profile"     },
+    { "cinema2k",      NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OPJ(CINEMA2K)    }, 0,         0,           VE, "profile"     },
+    { "cinema4k",      NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OPJ(CINEMA4K)    }, 0,         0,           VE, "profile"     },
+    { "cinema_mode",   "Digital Cinema",    OFFSET(cinema_mode),   AV_OPT_TYPE_INT,   { .i64 = OPJ(OFF)         }, OPJ(OFF),       OPJ(CINEMA4K_24), VE, "cinema_mode" },
+    { "off",           NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OPJ(OFF)         }, 0,         0,           VE, "cinema_mode" },
+    { "2k_24",         NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OPJ(CINEMA2K_24) }, 0,         0,           VE, "cinema_mode" },
+    { "2k_48",         NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OPJ(CINEMA2K_48) }, 0,         0,           VE, "cinema_mode" },
+    { "4k_24",         NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OPJ(CINEMA4K_24) }, 0,         0,           VE, "cinema_mode" },
+    { "prog_order",    "Progression Order", OFFSET(prog_order),    AV_OPT_TYPE_INT,   { .i64 = OPJ(LRCP)    }, OPJ(LRCP),  OPJ(CPRL),    VE, "prog_order"  },
+    { "lrcp",          NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OPJ(LRCP)    }, 0,         0,           VE, "prog_order"  },
+    { "rlcp",          NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OPJ(RLCP)    }, 0,         0,           VE, "prog_order"  },
+    { "rpcl",          NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OPJ(RPCL)    }, 0,         0,           VE, "prog_order"  },
+    { "pcrl",          NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OPJ(PCRL)    }, 0,         0,           VE, "prog_order"  },
+    { "cprl",          NULL,                0,                     AV_OPT_TYPE_CONST, { .i64 = OPJ(CPRL)    }, 0,         0,           VE, "prog_order"  },
+    { "numresolution", NULL,                OFFSET(numresolution), AV_OPT_TYPE_INT,   { .i64 = 6           }, 1,         INT_MAX,     VE                }, /* NOTE(review): the option's maximum is INT_MAX -- presumably the usable range is validated elsewhere; confirm */
+    { "numlayers",     NULL,                OFFSET(numlayers),     AV_OPT_TYPE_INT,   { .i64 = 1           }, 1,         10,          VE                },
+    { "disto_alloc",   NULL,                OFFSET(disto_alloc),   AV_OPT_TYPE_INT,   { .i64 = 1           }, 0,         1,           VE                },
+    { "fixed_alloc",   NULL,                OFFSET(fixed_alloc),   AV_OPT_TYPE_INT,   { .i64 = 0           }, 0,         1,           VE                },
+    { "fixed_quality", NULL,                OFFSET(fixed_quality), AV_OPT_TYPE_INT,   { .i64 = 0           }, 0,         1,           VE                },
     { NULL },
 };
 
-static const AVClass class = {
+static const AVClass openjpeg_class = {
     .class_name = "libopenjpeg",
     .item_name  = av_default_item_name,
     .option     = options,
@@ -412,18 +839,25 @@ AVCodec ff_libopenjpeg_encoder = {
     .init           = libopenjpeg_encode_init,
     .encode2        = libopenjpeg_encode_frame,
     .close          = libopenjpeg_encode_close,
-    .capabilities   = 0,
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_RGB24, AV_PIX_FMT_RGBA, AV_PIX_FMT_RGB48,
-        AV_PIX_FMT_RGBA64,
-        AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAY16, AV_PIX_FMT_YA8,
+        AV_PIX_FMT_RGBA64, AV_PIX_FMT_GBR24P,
+        AV_PIX_FMT_GBRP9, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12, AV_PIX_FMT_GBRP14, AV_PIX_FMT_GBRP16,
+        AV_PIX_FMT_GRAY8, AV_PIX_FMT_YA8, AV_PIX_FMT_GRAY16, AV_PIX_FMT_YA16,
         AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUVA420P,
-        AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV444P,
-        AV_PIX_FMT_YUV411P, AV_PIX_FMT_YUV410P,
+        AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUVA422P,
+        AV_PIX_FMT_YUV411P, AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUVA444P,
         AV_PIX_FMT_YUV420P9, AV_PIX_FMT_YUV422P9, AV_PIX_FMT_YUV444P9,
+        AV_PIX_FMT_YUVA420P9, AV_PIX_FMT_YUVA422P9, AV_PIX_FMT_YUVA444P9,
         AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV444P10,
+        AV_PIX_FMT_YUVA420P10, AV_PIX_FMT_YUVA422P10, AV_PIX_FMT_YUVA444P10,
+        AV_PIX_FMT_YUV420P12, AV_PIX_FMT_YUV422P12, AV_PIX_FMT_YUV444P12,
+        AV_PIX_FMT_YUV420P14, AV_PIX_FMT_YUV422P14, AV_PIX_FMT_YUV444P14,
         AV_PIX_FMT_YUV420P16, AV_PIX_FMT_YUV422P16, AV_PIX_FMT_YUV444P16,
+        AV_PIX_FMT_YUVA420P16, AV_PIX_FMT_YUVA422P16, AV_PIX_FMT_YUVA444P16,
+        AV_PIX_FMT_XYZ12,
         AV_PIX_FMT_NONE
     },
-    .priv_class     = &class,
+    .priv_class     = &openjpeg_class,
 };
diff --git a/libavcodec/libopus.c b/libavcodec/libopus.c
index 9a0d5b0..16395c7 100644
--- a/libavcodec/libopus.c
+++ b/libavcodec/libopus.c
@@ -2,20 +2,20 @@
  * libopus encoder/decoder common code
  * Copyright (c) 2012 Nicolas George
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/libopus.h b/libavcodec/libopus.h
index 2334c84..a8223d1 100644
--- a/libavcodec/libopus.h
+++ b/libavcodec/libopus.h
@@ -2,20 +2,20 @@
  * libopus encoder/decoder common code
  * Copyright (c) 2012 Nicolas George
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/libopusdec.c b/libavcodec/libopusdec.c
index c6e1573..acc62f1 100644
--- a/libavcodec/libopusdec.c
+++ b/libavcodec/libopusdec.c
@@ -2,26 +2,27 @@
  * Opus decoder using libopus
  * Copyright (c) 2012 Nicolas George
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <opus.h>
 #include <opus_multistream.h>
 
+#include "libavutil/internal.h"
 #include "libavutil/intreadwrite.h"
 
 #include "avcodec.h"
@@ -32,6 +33,10 @@
 
 struct libopus_context {
     OpusMSDecoder *dec;
+    int pre_skip; /* read from extradata offset 10 in libopus_decode_init(); copied into skip_samples at init and on flush */
+#ifndef OPUS_SET_GAIN /* fallback storage, compiled only when the library lacks OPUS_SET_GAIN */
+    union { int i; double d; } gain; /* linear gain applied after decoding: .d for AV_SAMPLE_FMT_FLT, otherwise .i holding the gain scaled by 65536 */
+#endif
 };
 
 #define OPUS_HEAD_SIZE 19
@@ -49,6 +54,7 @@ static av_cold int libopus_decode_init(AVCodecContext *avc)
                           ff_vorbis_channel_layouts[avc->channels - 1];
 
     if (avc->extradata_size >= OPUS_HEAD_SIZE) {
+        opus->pre_skip = AV_RL16(avc->extradata + 10);
         gain_db     = sign_extend(AV_RL16(avc->extradata + 16), 16);
         channel_map = AV_RL8 (avc->extradata + 18);
     }
@@ -73,7 +79,7 @@ static av_cold int libopus_decode_init(AVCodecContext *avc)
         const uint8_t *vorbis_offset = ff_vorbis_channel_layout_offsets[avc->channels - 1];
         int ch;
 
-        /* Remap channels from Vorbis order to libav order */
+        /* Remap channels from Vorbis order to ffmpeg order */
         for (ch = 0; ch < avc->channels; ch++)
             mapping_arr[ch] = mapping[vorbis_offset[ch]];
         mapping = mapping_arr;
@@ -88,12 +94,23 @@ static av_cold int libopus_decode_init(AVCodecContext *avc)
         return ff_opus_error_to_averror(ret);
     }
 
+#ifdef OPUS_SET_GAIN
     ret = opus_multistream_decoder_ctl(opus->dec, OPUS_SET_GAIN(gain_db));
     if (ret != OPUS_OK)
         av_log(avc, AV_LOG_WARNING, "Failed to set gain: %s\n",
                opus_strerror(ret));
+#else
+    {
+        double gain_lin = ff_exp10(gain_db / (20.0 * 256));
+        if (avc->sample_fmt == AV_SAMPLE_FMT_FLT)
+            opus->gain.d = gain_lin;
+        else
+            opus->gain.i = FFMIN(gain_lin * 65536, INT_MAX);
+    }
+#endif
 
-    avc->delay = 3840;  /* Decoder delay (in samples) at 48kHz */
+    /* Decoder delay (in samples) at 48kHz */
+    avc->delay = avc->internal->skip_samples = opus->pre_skip;
 
     return 0;
 }
@@ -116,11 +133,8 @@ static int libopus_decode(AVCodecContext *avc, void *data,
     int ret, nb_samples;
 
     frame->nb_samples = MAX_FRAME_SIZE;
-    ret = ff_get_buffer(avc, frame, 0);
-    if (ret < 0) {
-        av_log(avc, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avc, frame, 0)) < 0)
         return ret;
-    }
 
     if (avc->sample_fmt == AV_SAMPLE_FMT_S16)
         nb_samples = opus_multistream_decode(opus->dec, pkt->data, pkt->size,
@@ -137,6 +151,21 @@ static int libopus_decode(AVCodecContext *avc, void *data,
         return ff_opus_error_to_averror(nb_samples);
     }
 
+#ifndef OPUS_SET_GAIN
+    {
+        int i = avc->channels * nb_samples;
+        if (avc->sample_fmt == AV_SAMPLE_FMT_FLT) {
+            float *pcm = (float *)frame->data[0];
+            for (; i > 0; i--, pcm++)
+                *pcm = av_clipf(*pcm * opus->gain.d, -1, 1);
+        } else {
+            int16_t *pcm = (int16_t *)frame->data[0];
+            for (; i > 0; i--, pcm++)
+                *pcm = av_clip_int16(((int64_t)opus->gain.i * *pcm) >> 16);
+        }
+    }
+#endif
+
     frame->nb_samples = nb_samples;
     *got_frame_ptr    = 1;
 
@@ -148,6 +177,9 @@ static void libopus_flush(AVCodecContext *avc)
     struct libopus_context *opus = avc->priv_data;
 
     opus_multistream_decoder_ctl(opus->dec, OPUS_RESET_STATE);
+    /* The stream can have been extracted by a tool that is not Opus-aware.
+       Therefore, any packet can become the first of the stream. */
+    avc->internal->skip_samples = opus->pre_skip;
 }
 
 AVCodec ff_libopus_decoder = {
diff --git a/libavcodec/libopusenc.c b/libavcodec/libopusenc.c
index 1fb597b..3f3e80d 100644
--- a/libavcodec/libopusenc.c
+++ b/libavcodec/libopusenc.c
@@ -2,20 +2,20 @@
  * Opus encoder using libopus
  * Copyright (c) 2012 Nathan Caldwell
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -65,8 +65,8 @@ static const uint8_t opus_vorbis_channel_map[8][8] = {
     { 0, 6, 1, 2, 3, 4, 5, 7 },
 };
 
-/* libav to libopus channel order mapping, passed to libopus */
-static const uint8_t libav_libopus_channel_map[8][8] = {
+/* libavcodec to libopus channel order mapping, passed to libopus */
+static const uint8_t libavcodec_libopus_channel_map[8][8] = {
     { 0 },
     { 0, 1 },
     { 0, 1, 2 },
@@ -107,6 +107,13 @@ static int libopus_configure_encoder(AVCodecContext *avctx, OpusMSEncoder *enc,
 {
     int ret;
 
+    if (avctx->global_quality) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Quality-based encoding not supported, "
+               "please specify a bitrate and VBR setting.\n");
+        return AVERROR(EINVAL);
+    }
+
     ret = opus_multistream_encoder_ctl(enc, OPUS_SET_BITRATE(avctx->bit_rate));
     if (ret != OPUS_OK) {
         av_log(avctx, AV_LOG_ERROR,
@@ -149,7 +156,7 @@ static int libopus_configure_encoder(AVCodecContext *avctx, OpusMSEncoder *enc,
     return OPUS_OK;
 }
 
-static int av_cold libopus_encode_init(AVCodecContext *avctx)
+static av_cold int libopus_encode_init(AVCodecContext *avctx)
 {
     LibopusEncContext *opus = avctx->priv_data;
     const uint8_t *channel_mapping;
@@ -159,7 +166,7 @@ static int av_cold libopus_encode_init(AVCodecContext *avctx)
 
     coupled_stream_count = opus_coupled_streams[avctx->channels - 1];
     opus->stream_count   = avctx->channels - coupled_stream_count;
-    channel_mapping      = libav_libopus_channel_map[avctx->channels - 1];
+    channel_mapping      = libavcodec_libopus_channel_map[avctx->channels - 1];
 
     /* FIXME: Opus can handle up to 255 channels. However, the mapping for
      * anything greater than 8 is undefined. */
@@ -173,12 +180,12 @@ static int av_cold libopus_encode_init(AVCodecContext *avctx)
         avctx->bit_rate = 64000 * opus->stream_count +
                           32000 * coupled_stream_count;
         av_log(avctx, AV_LOG_WARNING,
-               "No bit rate set. Defaulting to %d bps.\n", avctx->bit_rate);
+               "No bit rate set. Defaulting to %"PRId64" bps.\n", (int64_t)avctx->bit_rate);
     }
 
     if (avctx->bit_rate < 500 || avctx->bit_rate > 256000 * avctx->channels) {
-        av_log(avctx, AV_LOG_ERROR, "The bit rate %d bps is unsupported. "
-               "Please choose a value between 500 and %d.\n", avctx->bit_rate,
+        av_log(avctx, AV_LOG_ERROR, "The bit rate %"PRId64" bps is unsupported. "
+               "Please choose a value between 500 and %d.\n", (int64_t)avctx->bit_rate,
                256000 * avctx->channels);
         return AVERROR(EINVAL);
     }
@@ -270,7 +277,7 @@ static int av_cold libopus_encode_init(AVCodecContext *avctx)
     }
     avctx->extradata_size = header_size;
 
-    opus->samples = av_mallocz(frame_size * avctx->channels *
+    opus->samples = av_mallocz_array(frame_size, avctx->channels *
                                av_get_bytes_per_sample(avctx->sample_fmt));
     if (!opus->samples) {
         av_log(avctx, AV_LOG_ERROR, "Failed to allocate samples buffer.\n");
@@ -307,6 +314,7 @@ static int libopus_encode(AVCodecContext *avctx, AVPacket *avpkt,
                               av_get_bytes_per_sample(avctx->sample_fmt);
     uint8_t *audio;
     int ret;
+    int discard_padding;
 
     if (frame) {
         ret = ff_af_queue_add(&opus->afq, frame);
@@ -318,7 +326,7 @@ static int libopus_encode(AVCodecContext *avctx, AVPacket *avpkt,
         } else
             audio = frame->data[0];
     } else {
-        if (!opus->afq.remaining_samples)
+        if (!opus->afq.remaining_samples || (!opus->afq.frame_alloc && !opus->afq.frame_count))
             return 0;
         audio = opus->samples;
         memset(audio, 0, opus->opts.packet_size * sample_size);
@@ -327,10 +335,8 @@ static int libopus_encode(AVCodecContext *avctx, AVPacket *avpkt,
     /* Maximum packet size taken from opusenc in opus-tools. 60ms packets
      * consist of 3 frames in one packet. The maximum frame size is 1275
      * bytes along with the largest possible packet header of 7 bytes. */
-    if (ret = ff_alloc_packet(avpkt, (1275 * 3 + 7) * opus->stream_count)) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, (1275 * 3 + 7) * opus->stream_count, 0)) < 0)
         return ret;
-    }
 
     if (avctx->sample_fmt == AV_SAMPLE_FMT_FLT)
         ret = opus_multistream_encode_float(opus->enc, (float *)audio,
@@ -352,12 +358,31 @@ static int libopus_encode(AVCodecContext *avctx, AVPacket *avpkt,
     ff_af_queue_remove(&opus->afq, opus->opts.packet_size,
                        &avpkt->pts, &avpkt->duration);
 
+    discard_padding = opus->opts.packet_size - avpkt->duration;
+    // Check if subtraction resulted in an overflow
+    if ((discard_padding < opus->opts.packet_size) != (avpkt->duration > 0)) {
+        av_packet_unref(avpkt);
+        av_free(avpkt);
+        return AVERROR(EINVAL);
+    }
+    if (discard_padding > 0) {
+        uint8_t* side_data = av_packet_new_side_data(avpkt,
+                                                     AV_PKT_DATA_SKIP_SAMPLES,
+                                                     10);
+        if(!side_data) {
+            av_packet_unref(avpkt);
+            av_free(avpkt);
+            return AVERROR(ENOMEM);
+        }
+        AV_WL32(side_data + 4, discard_padding);
+    }
+
     *got_packet_ptr = 1;
 
     return 0;
 }
 
-static int av_cold libopus_encode_close(AVCodecContext *avctx)
+static av_cold int libopus_encode_close(AVCodecContext *avctx)
 {
     LibopusEncContext *opus = avctx->priv_data;
 
diff --git a/libavcodec/libschroedinger.c b/libavcodec/libschroedinger.c
index 16e0fe8..0b02b2c 100644
--- a/libavcodec/libschroedinger.c
+++ b/libavcodec/libschroedinger.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 BBC, Anuradha Suraparaju <asuraparaju at gmail dot com >
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -192,7 +192,6 @@ SchroFrame *ff_create_schro_frame(AVCodecContext *avctx,
 
     if (ff_get_buffer(avctx, p_pic, AV_GET_BUFFER_FLAG_REF) < 0) {
         av_frame_free(&p_pic);
-        av_log(avctx, AV_LOG_ERROR, "Unable to allocate buffer\n");
         return NULL;
     }
 
diff --git a/libavcodec/libschroedinger.h b/libavcodec/libschroedinger.h
index 5481f92..12fe57c 100644
--- a/libavcodec/libschroedinger.h
+++ b/libavcodec/libschroedinger.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 BBC, Anuradha Suraparaju <asuraparaju at gmail dot com >
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/libschroedingerdec.c b/libavcodec/libschroedingerdec.c
index fb0781e..152cbe7 100644
--- a/libavcodec/libschroedingerdec.c
+++ b/libavcodec/libschroedingerdec.c
@@ -2,20 +2,20 @@
  * Dirac decoder support via Schroedinger libraries
  * Copyright (c) 2008 BBC, Anuradha Suraparaju <asuraparaju at gmail dot com >
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -130,7 +130,7 @@ static SchroBuffer *find_next_parse_unit(SchroParseUnitContext *parse_ctx)
 }
 
 /**
-* Returns Libav chroma format.
+* Returns FFmpeg chroma format.
 */
 static enum AVPixelFormat get_chroma_format(SchroChromaFormat schro_pix_fmt)
 {
@@ -175,7 +175,7 @@ static void libschroedinger_handle_first_access_unit(AVCodecContext *avctx)
 
     p_schro_params->format = schro_decoder_get_video_format(decoder);
 
-    /* Tell Libav about sequence details. */
+    /* Tell FFmpeg about sequence details. */
     if (av_image_check_size(p_schro_params->format->width,
                             p_schro_params->format->height, 0, avctx) < 0) {
         av_log(avctx, AV_LOG_ERROR, "invalid dimensions (%dx%d)\n",
@@ -308,10 +308,10 @@ static int libschroedinger_decode_frame(AVCodecContext *avctx,
     framewithpts = ff_schro_queue_pop(&p_schro_params->dec_frame_queue);
 
     if (framewithpts && framewithpts->frame) {
-        if (ff_get_buffer(avctx, avframe, 0) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "Unable to allocate buffer\n");
-            return AVERROR(ENOMEM);
-        }
+        int ret;
+
+        if ((ret = ff_get_buffer(avctx, avframe, 0)) < 0)
+            return ret;
 
         memcpy(avframe->data[0],
                framewithpts->frame->components[0].data,
diff --git a/libavcodec/libschroedingerenc.c b/libavcodec/libschroedingerenc.c
index bf03cb7..cf4baa1 100644
--- a/libavcodec/libschroedingerenc.c
+++ b/libavcodec/libschroedingerenc.c
@@ -2,20 +2,20 @@
  * Dirac encoder support via Schroedinger libraries
  * Copyright (c) 2008 BBC, Anuradha Suraparaju <asuraparaju at gmail dot com >
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,7 @@
 #include <schroedinger/schrovideoformat.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/opt.h"
 
@@ -43,6 +44,8 @@
 
 /** libschroedinger encoder private data */
 typedef struct SchroEncoderParams {
+    AVClass        *class;
+
     /** Schroedinger video format */
     SchroVideoFormat *format;
 
@@ -390,10 +393,8 @@ static int libschroedinger_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     pkt_size = p_frame_output->size;
     if (last_frame_in_sequence && p_schro_params->enc_buf_size > 0)
         pkt_size += p_schro_params->enc_buf_size;
-    if ((ret = ff_alloc_packet(pkt, pkt_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n", pkt_size);
+    if ((ret = ff_alloc_packet2(avctx, pkt, pkt_size, 0)) < 0)
         goto error;
-    }
 
     memcpy(pkt->data, p_frame_output->p_encbuf, p_frame_output->size);
 #if FF_API_CODED_FRAME
diff --git a/libavcodec/libshine.c b/libavcodec/libshine.c
new file mode 100644
index 0000000..f4cf598
--- /dev/null
+++ b/libavcodec/libshine.c
@@ -0,0 +1,149 @@
+/*
+ * Interface to libshine for mp3 encoding
+ * Copyright (c) 2012 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <shine/layer3.h>
+
+#include "libavutil/intreadwrite.h"
+#include "audio_frame_queue.h"
+#include "avcodec.h"
+#include "internal.h"
+#include "mpegaudio.h"
+#include "mpegaudiodecheader.h"
+
+#define BUFFER_SIZE (4096 * 20)
+
+typedef struct SHINEContext {
+    shine_config_t  config;              ///< settings filled in by init and passed to shine_initialise()
+    shine_t         shine;               ///< encoder handle returned by shine_initialise()
+    uint8_t         buffer[BUFFER_SIZE]; ///< libshine output bytes not yet emitted as packets
+    int             buffer_index;        ///< number of valid bytes currently held in buffer
+    AudioFrameQueue afq;                 ///< queued input frames; supplies packet pts/duration
+} SHINEContext;
+
+/* Validate the channel count, fill in the libshine configuration and create the encoder. */
+static av_cold int libshine_encode_init(AVCodecContext *avctx)
+{
+    SHINEContext   *ctx = avctx->priv_data;
+    shine_config_t *cfg = &ctx->config;
+
+    if (avctx->channels < 1 || avctx->channels > 2) {
+        av_log(avctx, AV_LOG_ERROR, "only mono or stereo is supported\n");
+        return AVERROR(EINVAL);
+    }
+    shine_set_config_mpeg_defaults(&cfg->mpeg);
+    if (avctx->bit_rate) /* otherwise the value set by the defaults call above is kept */
+        cfg->mpeg.bitr = avctx->bit_rate / 1000;
+    cfg->mpeg.mode       = avctx->channels == 2 ? STEREO : MONO;
+    cfg->wave.channels   = avctx->channels == 2 ? PCM_STEREO : PCM_MONO;
+    cfg->wave.samplerate = avctx->sample_rate;
+    if (shine_check_config(cfg->wave.samplerate, cfg->mpeg.bitr) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "invalid configuration\n");
+        return AVERROR(EINVAL);
+    }
+    if (!(ctx->shine = shine_initialise(cfg)))
+        return AVERROR(ENOMEM);
+    avctx->frame_size = shine_samples_per_pass(ctx->shine);
+    ff_af_queue_init(avctx, &ctx->afq);
+    return 0;
+}
+
+/**
+ * Encode one frame (or flush when frame is NULL) and emit at most one MP3 packet.
+ * libshine output is accumulated in s->buffer and cut at the parsed frame size.
+ * @return 0 on success (check *got_packet_ptr), a negative AVERROR code on failure
+ */
+static int libshine_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
+                                 const AVFrame *frame, int *got_packet_ptr)
+{
+    SHINEContext *s = avctx->priv_data;
+    MPADecodeHeader hdr;
+    unsigned char *data;
+    int written, ret, len;
+
+    if (frame)
+        data = shine_encode_buffer(s->shine, (int16_t **)frame->data, &written);
+    else
+        data = shine_flush(s->shine, &written);
+    if (written < 0)
+        return AVERROR_EXTERNAL;
+    if (written > 0) {
+        if (s->buffer_index + written > BUFFER_SIZE) {
+            av_log(avctx, AV_LOG_ERROR, "internal buffer too small\n");
+            return AVERROR_BUG;
+        }
+        memcpy(s->buffer + s->buffer_index, data, written);
+        s->buffer_index += written;
+    }
+    if (frame && (ret = ff_af_queue_add(&s->afq, frame)) < 0)
+        return ret;
+
+    /* need a full 4-byte header and a queued input frame to take timing from */
+    if (s->buffer_index < 4 || !s->afq.frame_count)
+        return 0;
+    if (avpriv_mpegaudio_decode_header(&hdr, AV_RB32(s->buffer))) {
+        av_log(avctx, AV_LOG_ERROR, "free format output not supported\n");
+        return AVERROR_EXTERNAL;
+    }
+
+    len = hdr.frame_size;
+    if (len <= s->buffer_index) {
+        if ((ret = ff_alloc_packet2(avctx, avpkt, len, 0)) < 0)
+            return ret;
+        memcpy(avpkt->data, s->buffer, len);
+        s->buffer_index -= len;
+        memmove(s->buffer, s->buffer + len, s->buffer_index); /* keep leftover bytes */
+        ff_af_queue_remove(&s->afq, avctx->frame_size, &avpkt->pts, &avpkt->duration);
+        avpkt->size = len;
+        *got_packet_ptr = 1;
+    }
+    return 0;
+}
+
+/* Release the audio frame queue and the libshine encoder; always returns 0. */
+static av_cold int libshine_encode_close(AVCodecContext *avctx)
+{
+    SHINEContext *const ctx = avctx->priv_data;
+    ff_af_queue_close(&ctx->afq);
+    shine_close(ctx->shine);
+    return 0;
+}
+
+static const int libshine_sample_rates[] = { /* zero-terminated list used as .supported_samplerates */
+    44100, 48000, 32000, 0
+};
+
+AVCodec ff_libshine_encoder = { /* codec table entry wiring the libshine callbacks defined in this file */
+    .name                  = "libshine",
+    .long_name             = NULL_IF_CONFIG_SMALL("libshine MP3 (MPEG audio layer 3)"),
+    .type                  = AVMEDIA_TYPE_AUDIO,
+    .id                    = AV_CODEC_ID_MP3,
+    .priv_data_size        = sizeof(SHINEContext),
+    .init                  = libshine_encode_init,
+    .encode2               = libshine_encode_frame, /* handles a NULL frame by flushing libshine */
+    .close                 = libshine_encode_close,
+    .capabilities          = AV_CODEC_CAP_DELAY,
+    .sample_fmts           = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16P, /* frame->data goes to libshine as int16_t ** */
+                                                            AV_SAMPLE_FMT_NONE },
+    .supported_samplerates = libshine_sample_rates,
+    .channel_layouts       = (const uint64_t[]) { AV_CH_LAYOUT_MONO, /* init rejects anything but 1 or 2 channels */
+                                                  AV_CH_LAYOUT_STEREO,
+                                                  0 },
+};
diff --git a/libavcodec/libspeexdec.c b/libavcodec/libspeexdec.c
index 949a934..044883a 100644
--- a/libavcodec/libspeexdec.c
+++ b/libavcodec/libspeexdec.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2008 David Conrad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,6 +33,7 @@ typedef struct LibSpeexContext {
     SpeexStereoState stereo;
     void *dec_state;
     int frame_size;
+    int pktsize;
 } LibSpeexContext;
 
 
@@ -43,14 +44,30 @@ static av_cold int libspeex_decode_init(AVCodecContext *avctx)
     SpeexHeader *header = NULL;
     int spx_mode;
 
-    avctx->sample_fmt = AV_SAMPLE_FMT_S16;
     if (avctx->extradata && avctx->extradata_size >= 80) {
         header = speex_packet_to_header(avctx->extradata,
                                         avctx->extradata_size);
         if (!header)
             av_log(avctx, AV_LOG_WARNING, "Invalid Speex header\n");
     }
-    if (header) {
+    if (avctx->codec_tag == MKTAG('S', 'P', 'X', 'N')) {
+        int quality;
+        if (!avctx->extradata || avctx->extradata && avctx->extradata_size < 47) {
+            av_log(avctx, AV_LOG_ERROR, "Missing or invalid extradata.\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        quality = avctx->extradata[37];
+        if (quality > 10) {
+            av_log(avctx, AV_LOG_ERROR, "Unsupported quality mode %d.\n", quality);
+            return AVERROR_PATCHWELCOME;
+        }
+
+        s->pktsize = ((const int[]){5,10,15,20,20,28,28,38,38,46,62})[quality];
+
+        spx_mode           = 0;
+    } else if (header) {
+        avctx->sample_rate = header->rate;
         avctx->channels    = header->nb_channels;
         spx_mode           = header->mode;
         speex_header_free(header);
@@ -73,8 +90,9 @@ static av_cold int libspeex_decode_init(AVCodecContext *avctx)
         av_log(avctx, AV_LOG_ERROR, "Unknown Speex mode %d", spx_mode);
         return AVERROR_INVALIDDATA;
     }
-    avctx->sample_rate = 8000 << spx_mode;
     s->frame_size      =  160 << spx_mode;
+    if (!avctx->sample_rate)
+        avctx->sample_rate = 8000 << spx_mode;
 
     if (avctx->channels < 1 || avctx->channels > 2) {
         /* libspeex can handle mono or stereo if initialized as stereo */
@@ -113,13 +131,12 @@ static int libspeex_decode_frame(AVCodecContext *avctx, void *data,
     AVFrame *frame     = data;
     int16_t *output;
     int ret, consumed = 0;
+    avctx->sample_fmt = AV_SAMPLE_FMT_S16;
 
     /* get output buffer */
     frame->nb_samples = s->frame_size;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     output = (int16_t *)frame->data[0];
 
     /* if there is not enough data left for the smallest possible frame or the
@@ -133,9 +150,11 @@ static int libspeex_decode_frame(AVCodecContext *avctx, void *data,
             *got_frame_ptr = 0;
             return buf_size;
         }
+        if (s->pktsize && buf_size == 62)
+            buf_size = s->pktsize;
         /* set new buffer */
         speex_bits_read_from(&s->bits, buf, buf_size);
-        consumed = buf_size;
+        consumed = avpkt->size;
     }
 
     /* decode a single frame */
@@ -149,6 +168,8 @@ static int libspeex_decode_frame(AVCodecContext *avctx, void *data,
 
     *got_frame_ptr = 1;
 
+    if (!avctx->bit_rate)
+        speex_decoder_ctl(s->dec_state, SPEEX_GET_BITRATE, &avctx->bit_rate);
     return consumed;
 }
 
diff --git a/libavcodec/libspeexenc.c b/libavcodec/libspeexenc.c
index f3a31e9..4bdb961 100644
--- a/libavcodec/libspeexenc.c
+++ b/libavcodec/libspeexenc.c
@@ -2,20 +2,20 @@
  * Copyright (C) 2009 Justin Ruggles
  * Copyright (c) 2009 Xuggle Incorporated
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -76,7 +76,7 @@
  *     encodes them with just enough bits to reproduce the background noise.
  *
  * Discontinuous Transmission (DTX)
- *     DTX is an addition to VAD/VBR operation, that allows to stop transmitting
+ *     DTX is an addition to VAD/VBR operation, that makes it possible to stop transmitting
  *     completely when the background noise is stationary.
  *     In file-based operation only 5 bits are used for such frames.
  */
@@ -92,6 +92,7 @@
 #include "internal.h"
 #include "audio_frame_queue.h"
 
+/* TODO: Think about converting abr, vad, dtx and such flags to a bit field */
 typedef struct LibSpeexEncContext {
     AVClass *class;             ///< AVClass for private options
     SpeexBits bits;             ///< libspeex bitwriter context
@@ -124,10 +125,10 @@ static av_cold void print_enc_params(AVCodecContext *avctx,
         av_log(avctx, AV_LOG_DEBUG, "  quality: %f\n", s->vbr_quality);
     } else if (s->abr) {
         av_log(avctx, AV_LOG_DEBUG, "rate control: ABR\n");
-        av_log(avctx, AV_LOG_DEBUG, "  bitrate: %d bps\n", avctx->bit_rate);
+        av_log(avctx, AV_LOG_DEBUG, "  bitrate: %"PRId64" bps\n", (int64_t)avctx->bit_rate);
     } else {
         av_log(avctx, AV_LOG_DEBUG, "rate control: CBR\n");
-        av_log(avctx, AV_LOG_DEBUG, "  bitrate: %d bps\n", avctx->bit_rate);
+        av_log(avctx, AV_LOG_DEBUG, "  bitrate: %"PRId64" bps\n", (int64_t)avctx->bit_rate);
     }
     av_log(avctx, AV_LOG_DEBUG, "complexity: %d\n",
            avctx->compression_level);
@@ -293,10 +294,8 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     /* write output if all frames for the packet have been encoded */
     if (s->pkt_frame_count == s->frames_per_packet) {
         s->pkt_frame_count = 0;
-        if ((ret = ff_alloc_packet(avpkt, speex_bits_nbytes(&s->bits)))) {
-            av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+        if ((ret = ff_alloc_packet2(avctx, avpkt, speex_bits_nbytes(&s->bits), 0)) < 0)
             return ret;
-        }
         ret = speex_bits_write(&s->bits, avpkt->data, avpkt->size);
         speex_bits_reset(&s->bits);
 
@@ -335,7 +334,7 @@ static const AVOption options[] = {
     { NULL },
 };
 
-static const AVClass class = {
+static const AVClass speex_class = {
     .class_name = "libspeex",
     .item_name  = av_default_item_name,
     .option     = options,
@@ -364,6 +363,6 @@ AVCodec ff_libspeex_encoder = {
                                            AV_CH_LAYOUT_STEREO,
                                            0 },
     .supported_samplerates = (const int[]){ 8000, 16000, 32000, 0 },
-    .priv_class     = &class,
+    .priv_class     = &speex_class,
     .defaults       = defaults,
 };
diff --git a/libavcodec/libtheoraenc.c b/libavcodec/libtheoraenc.c
index b329ed3..fae55e8 100644
--- a/libavcodec/libtheoraenc.c
+++ b/libavcodec/libtheoraenc.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2006 Paul Richards <paul.richards@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,7 +30,7 @@
  * and o_ prefixes on variables which are libogg types.
  */
 
-/* Libav includes */
+/* FFmpeg includes */
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/pixdesc.h"
@@ -96,13 +96,14 @@ static int get_stats(AVCodecContext *avctx, int eos)
     bytes = th_encode_ctl(h->t_state, TH_ENCCTL_2PASS_OUT, &buf, sizeof(buf));
     if (bytes < 0) {
         av_log(avctx, AV_LOG_ERROR, "Error getting first pass stats\n");
-        return -1;
+        return AVERROR_EXTERNAL;
     }
     if (!eos) {
-        h->stats = av_fast_realloc(h->stats, &h->stats_size,
+        void *tmp = av_fast_realloc(h->stats, &h->stats_size,
                                    h->stats_offset + bytes);
-        if (!h->stats)
+        if (!tmp)
             return AVERROR(ENOMEM);
+        h->stats = tmp;
         memcpy(h->stats + h->stats_offset, buf, bytes);
         h->stats_offset += bytes;
     } else {
@@ -117,7 +118,7 @@ static int get_stats(AVCodecContext *avctx, int eos)
     return 0;
 #else
     av_log(avctx, AV_LOG_ERROR, "libtheora too old to support 2pass\n");
-    return -1;
+    return AVERROR(ENOSUP);
 #endif
 }
 
@@ -131,12 +132,14 @@ static int submit_stats(AVCodecContext *avctx)
     if (!h->stats) {
         if (!avctx->stats_in) {
             av_log(avctx, AV_LOG_ERROR, "No statsfile for second pass\n");
-            return -1;
+            return AVERROR(EINVAL);
         }
         h->stats_size = strlen(avctx->stats_in) * 3/4;
         h->stats      = av_malloc(h->stats_size);
-        if (!h->stats)
+        if (!h->stats) {
+            h->stats_size = 0;
             return AVERROR(ENOMEM);
+        }
         h->stats_size = av_base64_decode(h->stats, avctx->stats_in, h->stats_size);
     }
     while (h->stats_size - h->stats_offset > 0) {
@@ -145,7 +148,7 @@ static int submit_stats(AVCodecContext *avctx)
                               h->stats_size - h->stats_offset);
         if (bytes < 0) {
             av_log(avctx, AV_LOG_ERROR, "Error submitting stats\n");
-            return -1;
+            return AVERROR_EXTERNAL;
         }
         if (!bytes)
             return 0;
@@ -154,7 +157,7 @@ static int submit_stats(AVCodecContext *avctx)
     return 0;
 #else
     av_log(avctx, AV_LOG_ERROR, "libtheora too old to support 2pass\n");
-    return -1;
+    return AVERROR(ENOSUP);
 #endif
 }
 
@@ -166,6 +169,7 @@ static av_cold int encode_init(AVCodecContext* avc_context)
     unsigned int offset;
     TheoraContext *h = avc_context->priv_data;
     uint32_t gop_size = avc_context->gop_size;
+    int ret;
 
     /* Set up the theora_info struct */
     th_info_init(&t_info);
@@ -202,17 +206,16 @@ static av_cold int encode_init(AVCodecContext* avc_context)
         t_info.pixel_fmt = TH_PF_444;
     else {
         av_log(avc_context, AV_LOG_ERROR, "Unsupported pix_fmt\n");
-        return -1;
+        return AVERROR(EINVAL);
     }
-    av_pix_fmt_get_chroma_sub_sample(avc_context->pix_fmt,
-                                     &h->uv_hshift, &h->uv_vshift);
+    avcodec_get_chroma_sub_sample(avc_context->pix_fmt, &h->uv_hshift, &h->uv_vshift);
 
     if (avc_context->flags & AV_CODEC_FLAG_QSCALE) {
-        /* to be constant with the libvorbis implementation, clip global_quality to 0 - 10
-           Theora accepts a quality parameter p, which is:
-                * 0 <= p <=63
-                * an int value
-         */
+        /* Clip global_quality in QP units to the [0 - 10] range
+           to be consistent with the libvorbis implementation.
+           Theora accepts a quality parameter which is an int value in
+           the [0 - 63] range.
+        */
         t_info.quality        = av_clipf(avc_context->global_quality / (float)FF_QP2LAMBDA, 0, 10) * 6.3;
         t_info.target_bitrate = 0;
     } else {
@@ -224,7 +227,7 @@ static av_cold int encode_init(AVCodecContext* avc_context)
     h->t_state = th_encode_alloc(&t_info);
     if (!h->t_state) {
         av_log(avc_context, AV_LOG_ERROR, "theora_encode_init failed\n");
-        return -1;
+        return AVERROR_EXTERNAL;
     }
 
     h->keyframe_mask = (1 << t_info.keyframe_granule_shift) - 1;
@@ -234,16 +237,16 @@ static av_cold int encode_init(AVCodecContext* avc_context)
     if (th_encode_ctl(h->t_state, TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE,
                       &gop_size, sizeof(gop_size))) {
         av_log(avc_context, AV_LOG_ERROR, "Error setting GOP size\n");
-        return -1;
+        return AVERROR_EXTERNAL;
     }
 
     // need to enable 2 pass (via TH_ENCCTL_2PASS_) before encoding headers
     if (avc_context->flags & AV_CODEC_FLAG_PASS1) {
-        if (get_stats(avc_context, 0))
-            return -1;
+        if ((ret = get_stats(avc_context, 0)) < 0)
+            return ret;
     } else if (avc_context->flags & AV_CODEC_FLAG_PASS2) {
-        if (submit_stats(avc_context))
-            return -1;
+        if ((ret = submit_stats(avc_context)) < 0)
+            return ret;
     }
 
     /*
@@ -259,8 +262,8 @@ static av_cold int encode_init(AVCodecContext* avc_context)
     th_comment_init(&t_comment);
 
     while (th_encode_flushheader(h->t_state, &t_comment, &o_packet))
-        if (concatenate_packet(&offset, avc_context, &o_packet))
-            return -1;
+        if ((ret = concatenate_packet(&offset, avc_context, &o_packet)) < 0)
+            return ret;
 
     th_comment_clear(&t_comment);
 
@@ -279,8 +282,8 @@ static int encode_frame(AVCodecContext* avc_context, AVPacket *pkt,
     if (!frame) {
         th_encode_packetout(h->t_state, 1, &o_packet);
         if (avc_context->flags & AV_CODEC_FLAG_PASS1)
-            if (get_stats(avc_context, 1))
-                return -1;
+            if ((ret = get_stats(avc_context, 1)) < 0)
+                return ret;
         return 0;
     }
 
@@ -293,8 +296,8 @@ static int encode_frame(AVCodecContext* avc_context, AVPacket *pkt,
     }
 
     if (avc_context->flags & AV_CODEC_FLAG_PASS2)
-        if (submit_stats(avc_context))
-            return -1;
+        if ((ret = submit_stats(avc_context)) < 0)
+            return ret;
 
     /* Now call into theora_encode_YUVin */
     result = th_encode_ycbcr_in(h->t_state, t_yuv_buffer);
@@ -312,12 +315,12 @@ static int encode_frame(AVCodecContext* avc_context, AVPacket *pkt,
             break;
         }
         av_log(avc_context, AV_LOG_ERROR, "theora_encode_YUVin failed (%s) [%d]\n", message, result);
-        return -1;
+        return AVERROR_EXTERNAL;
     }
 
     if (avc_context->flags & AV_CODEC_FLAG_PASS1)
-        if (get_stats(avc_context, 0))
-            return -1;
+        if ((ret = get_stats(avc_context, 0)) < 0)
+            return ret;
 
     /* Pick up returned ogg_packet */
     result = th_encode_packetout(h->t_state, 0, &o_packet);
@@ -330,14 +333,12 @@ static int encode_frame(AVCodecContext* avc_context, AVPacket *pkt,
         break;
     default:
         av_log(avc_context, AV_LOG_ERROR, "theora_encode_packetout failed [%d]\n", result);
-        return -1;
+        return AVERROR_EXTERNAL;
     }
 
     /* Copy ogg_packet content out to buffer */
-    if ((ret = ff_alloc_packet(pkt, o_packet.bytes)) < 0) {
-        av_log(avc_context, AV_LOG_ERROR, "Error getting output packet of size %ld.\n", o_packet.bytes);
+    if ((ret = ff_alloc_packet2(avc_context, pkt, o_packet.bytes, 0)) < 0)
         return ret;
-    }
     memcpy(pkt->data, o_packet.packet, o_packet.bytes);
 
     // HACK: assumes no encoder delay, this is true until libtheora becomes
diff --git a/libavcodec/libtwolame.c b/libavcodec/libtwolame.c
index 714c30a..12d71e7 100644
--- a/libavcodec/libtwolame.c
+++ b/libavcodec/libtwolame.c
@@ -2,20 +2,20 @@
  * Interface to libtwolame for mp2 encoding
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -77,6 +77,10 @@ static av_cold int twolame_encode_init(AVCodecContext *avctx)
     twolame_set_num_channels(s->glopts, avctx->channels);
     twolame_set_in_samplerate(s->glopts, avctx->sample_rate);
     twolame_set_out_samplerate(s->glopts, avctx->sample_rate);
+
+    if (!avctx->bit_rate)
+        avctx->bit_rate = avctx->sample_rate < 28000 ? 160000 : 384000;
+
     if (avctx->flags & AV_CODEC_FLAG_QSCALE || !avctx->bit_rate) {
         twolame_set_VBR(s->glopts, TRUE);
         twolame_set_VBR_level(s->glopts,
@@ -102,7 +106,7 @@ static int twolame_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     TWOLAMEContext *s = avctx->priv_data;
     int ret;
 
-    if ((ret = ff_alloc_packet(avpkt, MPA_MAX_CODED_FRAME_SIZE)) < 0)
+    if ((ret = ff_alloc_packet2(avctx, avpkt, MPA_MAX_CODED_FRAME_SIZE, 0)) < 0)
         return ret;
 
     if (frame) {
@@ -190,7 +194,7 @@ static const AVClass twolame_class = {
 };
 
 static const AVCodecDefault twolame_defaults[] = {
-    { "b", "384000" },
+    { "b", "0" },
     { NULL },
 };
 
diff --git a/libavcodec/libutvideo.h b/libavcodec/libutvideo.h
new file mode 100644
index 0000000..0c03097
--- /dev/null
+++ b/libavcodec/libutvideo.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2011-2012 Derek Buitenhuis
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation;
+ * version 2 of the License.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Known FOURCCs:
+ *     'ULY0' (YCbCr 4:2:0), 'ULY2' (YCbCr 4:2:2), 'ULRG' (RGB), 'ULRA' (RGBA),
+ *     'ULH0' (YCbCr 4:2:0 BT.709), 'ULH2' (YCbCr 4:2:2 BT.709)
+ */
+
+#ifndef AVCODEC_LIBUTVIDEO_H
+#define AVCODEC_LIBUTVIDEO_H
+
+#include <stdlib.h>
+#include <utvideo/utvideo.h>
+#include <utvideo/Codec.h>
+
+/*
+ * Ut Video version 12.0.0 changed the RGB format names and removed
+ * the _WIN names, so if the new names are absent, define them
+ * against the old names so compatibility with pre-v12 versions
+ * is maintained.
+ */
+#if !defined(UTVF_NFCC_BGR_BU)
+#define UTVF_NFCC_BGR_BU UTVF_RGB24_WIN
+#endif
+
+#if !defined(UTVF_NFCC_BGRA_BU)
+#define UTVF_NFCC_BGRA_BU UTVF_RGB32_WIN
+#endif
+
+/*
+ * Ut Video version 13.0.1 introduced new BT.709 variants.
+ * Special-case these and only use them if v13 is detected.
+ */
+#if defined(UTVF_HDYC)
+#define UTV_BT709
+#endif
+
+typedef struct { /* Ut Video extradata: four 32-bit words */
+    uint32_t version;         /* extradata bytes 0-3 */
+    uint32_t original_format; /* extradata bytes 4-7 */
+    uint32_t frameinfo_size;  /* extradata bytes 8-11 */
+    uint32_t flags;           /* extradata bytes 12-15; the decoder tests 0x800 for interlacing */
+} UtVideoExtra;
+
+typedef struct { /* private context used as priv_data by the libutvideo wrappers */
+    const AVClass *c;
+    CCodec *codec;         /* Ut Video library instance */
+    unsigned int buf_size; /* size of buffer in bytes */
+    uint8_t *buffer;       /* intermediate picture buffer handed to the library */
+    int pred;              /* encoder "pred" option: 0 = left, 2 = median */
+} UtVideoContext;
+
+#endif /* AVCODEC_LIBUTVIDEO_H */
diff --git a/libavcodec/libutvideodec.cpp b/libavcodec/libutvideodec.cpp
new file mode 100644
index 0000000..29e6db1
--- /dev/null
+++ b/libavcodec/libutvideodec.cpp
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2011 Derek Buitenhuis
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation;
+ * version 2 of the License.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Known FOURCCs:
+ *     'ULY0' (YCbCr 4:2:0), 'ULY2' (YCbCr 4:2:2), 'ULRG' (RGB), 'ULRA' (RGBA),
+ *     'ULH0' (YCbCr 4:2:0 BT.709), 'ULH2' (YCbCr 4:2:2 BT.709), 'UQY2' (YCbCr 4:2:2 10-bit)
+ */
+
+extern "C" {
+#include "avcodec.h"
+#include "libavutil/imgutils.h"
+}
+
+#include "libutvideo.h"
+#include "get_bits.h"
+
+/* Parse extradata, pick the pixel format from the FOURCC, allocate the output
+ * buffer/frame and start the decoder. Returns 0 or a negative AVERROR code. */
+static av_cold int utvideo_decode_init(AVCodecContext *avctx)
+{
+    UtVideoContext *utv = (UtVideoContext *)avctx->priv_data;
+    UtVideoExtra info = { 0 };
+    int format, ret;
+
+    if (avctx->extradata_size != 16 && avctx->extradata_size != 8 ) {
+        av_log(avctx, AV_LOG_ERROR, "Extradata size (%d) mismatch.\n", avctx->extradata_size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    /* Read extradata; the last two words only exist in the 16-byte layout */
+    info.version = AV_RL32(avctx->extradata);
+    info.original_format = AV_RL32(avctx->extradata + 4);
+    if (avctx->extradata_size == 16) {
+        info.frameinfo_size = AV_RL32(avctx->extradata + 8);
+        info.flags = AV_RL32(avctx->extradata + 12);
+    }
+
+    /* Pick format based on FOURCC */
+    switch (avctx->codec_tag) {
+#ifdef UTV_BT709
+    case MKTAG('U', 'L', 'H', '0'):
+        avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+        avctx->colorspace = AVCOL_SPC_BT709;
+        format = UTVF_YV12;
+        break;
+    case MKTAG('U', 'L', 'H', '2'):
+        avctx->pix_fmt = AV_PIX_FMT_YUYV422;
+        avctx->colorspace = AVCOL_SPC_BT709;
+        format = UTVF_YUY2;
+        break;
+#endif
+    case MKTAG('U', 'L', 'Y', '0'):
+        avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+        format = UTVF_YV12;
+        break;
+    case MKTAG('U', 'L', 'Y', '2'):
+        avctx->pix_fmt = AV_PIX_FMT_YUYV422;
+        format = UTVF_YUY2;
+        break;
+    case MKTAG('U', 'L', 'R', 'G'):
+        avctx->pix_fmt = AV_PIX_FMT_BGR24;
+        format = UTVF_NFCC_BGR_BU;
+        break;
+    case MKTAG('U', 'L', 'R', 'A'):
+        avctx->pix_fmt = AV_PIX_FMT_RGB32;
+        format = UTVF_NFCC_BGRA_BU;
+        break;
+#ifdef UTVF_UQY2
+    case MKTAG('U', 'Q', 'Y', '2'):
+        avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
+        format = UTVF_v210;
+        break;
+#endif
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Not a Ut Video FOURCC: %X\n", avctx->codec_tag);
+        return AVERROR_INVALIDDATA;
+    }
+
+    /* Only allocate the buffer once */
+    ret = av_image_get_buffer_size(avctx->pix_fmt, avctx->width, avctx->height, 1);
+    if (ret < 0)
+        return ret;
+    utv->buf_size = ret;
+#ifdef UTVF_UQY2
+    if (format == UTVF_v210)
+        utv->buf_size += avctx->height * ((avctx->width + 47) / 48) * 128; // the linesize used by the decoder, this does not seem to be exported
+#endif
+    utv->buffer = (uint8_t *)av_malloc(utv->buf_size * sizeof(uint8_t));
+    if (utv->buffer == NULL) {
+        av_log(avctx, AV_LOG_ERROR, "Unable to allocate output buffer.\n");
+        return AVERROR(ENOMEM);
+    }
+
+    /* Allocate the output frame; release the buffer if that fails */
+    avctx->coded_frame = av_frame_alloc();
+    if (!avctx->coded_frame) {
+        av_freep(&utv->buffer);
+        return AVERROR(ENOMEM);
+    }
+
+    /* Is it interlaced? Apparently Ut Video doesn't store the field order... */
+    avctx->coded_frame->interlaced_frame = info.flags & 0x800 ? 1 : 0;
+    avctx->coded_frame->top_field_first = 1;
+
+    /* The library wants an "interface name" string: pass the name of the lib */
+    utv->codec = CCodec::CreateInstance(UNFCC(avctx->codec_tag), "libavcodec");
+    ret = utv->codec->DecodeBegin(format, avctx->width, avctx->height,
+                                  CBGROSSWIDTH_WINDOWS, &info, sizeof(UtVideoExtra));
+    if (ret != 0) {
+        av_log(avctx, AV_LOG_ERROR, "Could not initialize decoder: %d\n", ret);
+        CCodec::DeleteInstance(utv->codec);
+        av_frame_free(&avctx->coded_frame);
+        av_freep(&utv->buffer);
+        return AVERROR_EXTERNAL;
+    }
+    return 0;
+}
+
+static int utvideo_decode_frame(AVCodecContext *avctx, void *data,
+                                int *got_frame, AVPacket *avpkt)
+{
+    UtVideoContext *utv = (UtVideoContext *)avctx->priv_data;
+    AVFrame *pic = avctx->coded_frame;
+    int w = avctx->width, h = avctx->height;
+    /* Decodes one packet into utv->buffer and points the frame planes at it.
+     * Every frame is flagged as an intra-coded keyframe. */
+    pic->pict_type = AV_PICTURE_TYPE_I;
+    pic->key_frame = 1;
+
+    /* Decode the frame (the return value of DecodeFrame is not checked) */
+    utv->codec->DecodeFrame(utv->buffer, avpkt->data, true);
+
+    /* Set the output data depending on the pixel format */
+    switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_YUV420P:
+        pic->linesize[0] = w;
+        pic->linesize[1] = pic->linesize[2] = w / 2;
+        pic->data[0] = utv->buffer;
+        pic->data[2] = utv->buffer + (w * h); /* the plane stored right after luma is exposed as data[2] */
+        pic->data[1] = pic->data[2] + (w * h / 4);
+        break;
+    case AV_PIX_FMT_YUYV422:
+        pic->linesize[0] = w * 2; /* packed: 2 bytes per pixel */
+        pic->data[0] = utv->buffer;
+        break;
+    case AV_PIX_FMT_YUV422P10: {
+        uint16_t *y, *u, *v;
+        int i,j;
+        int linesize = ((w + 47) / 48) * 128; /* bytes per packed row written by the library */
+
+        pic->linesize[0] = w * 2;
+        pic->linesize[1] =
+        pic->linesize[2] = w;
+        pic->data[0] = utv->buffer + linesize * h; /* planar output lives after the packed rows */
+        pic->data[1] = pic->data[0] + h*pic->linesize[0];
+        pic->data[2] = pic->data[1] + h*pic->linesize[1];
+        y = (uint16_t*)pic->data[0];
+        u = (uint16_t*)pic->data[1];
+        v = (uint16_t*)pic->data[2];
+        for (j = 0; j < h; j++) {
+            const uint8_t *in = utv->buffer + j * linesize;
+
+            for (i = 0; i + 1 < w; i += 6, in += 4) { /* up to 6 luma samples from 4 words per pass */
+                unsigned a,b;
+                a = AV_RL32(in);
+                in += 4;
+                b = AV_RL32(in);
+                *u++ = (a    ) & 0x3FF; /* each word carries three 10-bit samples */
+                *y++ = (a>>10) & 0x3FF;
+                *v++ = (a>>20) & 0x3FF;
+                *y++ = (b    ) & 0x3FF;
+
+                if (i + 3 >= w) /* row ends after this pixel pair */
+                    break;
+
+                in += 4;
+                a = AV_RL32(in);
+                *u++ = (b>>10) & 0x3FF;
+                *y++ = (b>>20) & 0x3FF;
+                *v++ = (a    ) & 0x3FF;
+                *y++ = (a>>10) & 0x3FF;
+
+                if (i + 5 >= w) /* row ends after this pixel pair */
+                    break;
+
+                in += 4;
+                b = AV_RL32(in);
+                *u++ = (a>>20) & 0x3FF;
+                *y++ = (b    ) & 0x3FF;
+                *v++ = (b>>10) & 0x3FF;
+                *y++ = (b>>20) & 0x3FF;
+            }
+        }
+        break;
+    }
+    case AV_PIX_FMT_BGR24:
+    case AV_PIX_FMT_RGB32:
+        /* Make the linesize negative, since Ut Video uses bottom-up BGR */
+        pic->linesize[0] = -1 * w * (avctx->pix_fmt == AV_PIX_FMT_BGR24 ? 3 : 4);
+        pic->data[0] = utv->buffer + utv->buf_size + pic->linesize[0]; /* start of the last stored row */
+        break;
+    }
+    pic->width  = w;
+    pic->height = h;
+    pic->format = avctx->pix_fmt;
+
+    if (avctx->refcounted_frames) {
+        int ret = av_frame_ref((AVFrame*)data, pic);
+        if (ret < 0)
+             return ret;
+    } else {
+        av_frame_move_ref((AVFrame*)data, pic);
+    }
+
+    *got_frame = 1;
+
+    return avpkt->size; /* the whole packet is reported as consumed */
+}
+
+static av_cold int utvideo_decode_close(AVCodecContext *avctx)
+{
+    UtVideoContext *utv = (UtVideoContext *)avctx->priv_data;
+
+    /* Free the output frame and the intermediate decode buffer */
+    av_frame_free(&avctx->coded_frame);
+    av_freep(&utv->buffer);
+
+    /* Finish decoding and clean up the instance */
+    utv->codec->DecodeEnd();
+    CCodec::DeleteInstance(utv->codec);
+
+    return 0; /* unconditionally reports success */
+}
+
+AVCodec ff_libutvideo_decoder = {
+    "libutvideo",
+    NULL_IF_CONFIG_SMALL("Ut Video"),
+    AVMEDIA_TYPE_VIDEO,
+    AV_CODEC_ID_UTVIDEO,
+    0,    //capabilities
+    NULL, //supported_framerates
+    NULL, //pix_fmts
+    NULL, //supported_samplerates
+    NULL, //sample_fmts
+    NULL, //channel_layouts
+    0,    //max_lowres
+    NULL, //priv_class
+    NULL, //profiles
+    sizeof(UtVideoContext),
+    NULL, //next
+    NULL, //init_thread_copy
+    NULL, //update_thread_context
+    NULL, //defaults
+    NULL, //init_static_data
+    utvideo_decode_init,
+    NULL, //encode
+    NULL, //encode2
+    utvideo_decode_frame,
+    utvideo_decode_close,
+};
diff --git a/libavcodec/libutvideoenc.cpp b/libavcodec/libutvideoenc.cpp
new file mode 100644
index 0000000..d03d515
--- /dev/null
+++ b/libavcodec/libutvideoenc.cpp
@@ -0,0 +1,281 @@
+/*
+ * Copyright (c) 2012 Derek Buitenhuis
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation;
+ * version 2 of the License.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Known FOURCCs:
+ *     'ULY0' (YCbCr 4:2:0), 'ULY2' (YCbCr 4:2:2), 'ULRG' (RGB), 'ULRA' (RGBA),
+ *     'ULH0' (YCbCr 4:2:0 BT.709), 'ULH2' (YCbCr 4:2:2 BT.709)
+ */
+
+extern "C" {
+#include "libavutil/opt.h"
+#include "libavutil/avassert.h"
+#include "libavutil/imgutils.h"
+#include "avcodec.h"
+#include "internal.h"
+}
+
+#include "libutvideo.h"
+#include "put_bits.h"
+
+/*
+ * Pick the FOURCC for avctx->pix_fmt, allocate the staging buffer and the
+ * extradata, and start the Ut Video encoder.
+ * Returns 0 on success or a negative AVERROR code on failure.
+ */
+static av_cold int utvideo_encode_init(AVCodecContext *avctx)
+{
+    UtVideoContext *utv = (UtVideoContext *)avctx->priv_data;
+    UtVideoExtra *info;
+    uint32_t flags, in_format;
+    int ret;
+
+    switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_YUV420P:
+        in_format = UTVF_YV12;
+        avctx->bits_per_coded_sample = 12;
+        if (avctx->colorspace == AVCOL_SPC_BT709)
+            avctx->codec_tag = MKTAG('U', 'L', 'H', '0');
+        else
+            avctx->codec_tag = MKTAG('U', 'L', 'Y', '0');
+        break;
+    case AV_PIX_FMT_YUYV422:
+        in_format = UTVF_YUYV;
+        avctx->bits_per_coded_sample = 16;
+        if (avctx->colorspace == AVCOL_SPC_BT709)
+            avctx->codec_tag = MKTAG('U', 'L', 'H', '2');
+        else
+            avctx->codec_tag = MKTAG('U', 'L', 'Y', '2');
+        break;
+    case AV_PIX_FMT_BGR24:
+        in_format = UTVF_NFCC_BGR_BU;
+        avctx->bits_per_coded_sample = 24;
+        avctx->codec_tag = MKTAG('U', 'L', 'R', 'G');
+        break;
+    case AV_PIX_FMT_RGB32:
+        in_format = UTVF_NFCC_BGRA_BU;
+        avctx->bits_per_coded_sample = 32;
+        avctx->codec_tag = MKTAG('U', 'L', 'R', 'A');
+        break;
+    default:
+        return AVERROR(EINVAL);
+    }
+
+#if FF_API_PRIVATE_OPT
+FF_DISABLE_DEPRECATION_WARNINGS
+    if (avctx->prediction_method)
+        utv->pred = avctx->prediction_method;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+
+    /* Check before we alloc anything */
+    if (utv->pred != 0 && utv->pred != 2) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid prediction method.\n");
+        return AVERROR(EINVAL);
+    }
+
+    flags = ((utv->pred + 1) << 8) | (avctx->thread_count - 1);
+
+    /* Alloc extradata buffer, zeroed and with the padding libavcodec expects */
+    info = (UtVideoExtra *)av_mallocz(sizeof(*info) + AV_INPUT_BUFFER_PADDING_SIZE);
+
+    if (!info) {
+        av_log(avctx, AV_LOG_ERROR, "Could not allocate extradata buffer.\n");
+        return AVERROR(ENOMEM);
+    }
+
+    /*
+     * Staging buffer: the input picture is repacked into it, since
+     * EncodeFrame takes a single contiguous source buffer.
+     */
+    ret = av_image_get_buffer_size(avctx->pix_fmt, avctx->width, avctx->height, 1);
+    if (ret < 0) {
+        av_free(info);
+        return ret;
+    }
+    utv->buf_size = ret;
+
+    utv->buffer = (uint8_t *)av_malloc(utv->buf_size);
+    if (utv->buffer == NULL) {
+        av_log(avctx, AV_LOG_ERROR, "Could not allocate output buffer.\n");
+        av_free(info);
+        return AVERROR(ENOMEM);
+    }
+
+    /*
+     * Create a Ut Video instance. Since the function wants
+     * an "interface name" string, pass it the name of the lib.
+     */
+    utv->codec = CCodec::CreateInstance(UNFCC(avctx->codec_tag), "libavcodec");
+
+    /* Initialize encoder */
+    utv->codec->EncodeBegin(in_format, avctx->width, avctx->height,
+                            CBGROSSWIDTH_WINDOWS);
+
+    /* Get extradata from encoder */
+    avctx->extradata_size = utv->codec->EncodeGetExtraDataSize();
+    utv->codec->EncodeGetExtraData(info, avctx->extradata_size, in_format,
+                                   avctx->width, avctx->height,
+                                   CBGROSSWIDTH_WINDOWS);
+    avctx->extradata = (uint8_t *)info;
+
+    /* Set flags */
+    utv->codec->SetState(&flags, sizeof(flags));
+
+    return 0;
+}
+
+static int utvideo_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                                const AVFrame *pic, int *got_packet)
+{
+    UtVideoContext *utv = (UtVideoContext *)avctx->priv_data;
+    int w = avctx->width, h = avctx->height;
+    int ret, rgb_size, i;
+    bool keyframe;
+    uint8_t *y, *u, *v;
+    uint8_t *dst;
+    /* Encodes one picture: repack it into utv->buffer, then let Ut Video
+     * write the bitstream into a packet of up to utv->buf_size bytes. */
+    if ((ret = ff_alloc_packet2(avctx, pkt, utv->buf_size, 0)) < 0)
+        return ret;
+
+    dst = pkt->data;
+
+    /* Move the input data, where needed, into a Ut Video friendly buffer */
+    switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_YUV420P:
+        y = utv->buffer;
+        u = y + w * h;
+        v = u + w * h / 4;
+        for (i = 0; i < h; i++) {
+            memcpy(y, pic->data[0] + i * pic->linesize[0], w);
+            y += w;
+        }
+        for (i = 0; i < h / 2; i++) {
+            memcpy(u, pic->data[2] + i * pic->linesize[2], w >> 1); /* data[2] is stored right after luma */
+            memcpy(v, pic->data[1] + i * pic->linesize[1], w >> 1); /* data[1] is stored last */
+            u += w >> 1;
+            v += w >> 1;
+        }
+        break;
+    case AV_PIX_FMT_YUYV422:
+        for (i = 0; i < h; i++) /* drop the line padding: rows become w * 2 bytes */
+            memcpy(utv->buffer + i * (w << 1),
+                   pic->data[0] + i * pic->linesize[0], w << 1);
+        break;
+    case AV_PIX_FMT_BGR24:
+    case AV_PIX_FMT_RGB32:
+        /* Ut Video takes bottom-up BGR */
+        rgb_size = avctx->pix_fmt == AV_PIX_FMT_BGR24 ? 3 : 4;
+        for (i = 0; i < h; i++) /* source row i lands in row h - i - 1 */
+            memcpy(utv->buffer + (h - i - 1) * w * rgb_size,
+                   pic->data[0] + i * pic->linesize[0],
+                   w * rgb_size);
+        break;
+    default:
+        return AVERROR(EINVAL);
+    }
+
+    /* Encode frame; the value returned by the library becomes the packet size */
+    pkt->size = utv->codec->EncodeFrame(dst, &keyframe, utv->buffer);
+
+    if (!pkt->size) { /* a zero size is treated as failure */
+        av_log(avctx, AV_LOG_ERROR, "EncodeFrame failed!\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    /*
+     * Ut Video is intra-only and every frame is a keyframe,
+     * and the API always returns true. In case something
+     * drastic changes in the future, such as inter support,
+     * assert that this is true.
+     */
+    av_assert2(keyframe == true);
+
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+    return 0;
+}
+
+static av_cold int utvideo_encode_close(AVCodecContext *avctx)
+{
+    UtVideoContext *utv = (UtVideoContext *)avctx->priv_data;
+    /* Release the extradata and the staging buffer */
+    av_freep(&avctx->extradata);
+    av_freep(&utv->buffer);
+    /* Stop the encoder and destroy the Ut Video instance */
+    utv->codec->EncodeEnd();
+    CCodec::DeleteInstance(utv->codec);
+
+    return 0;
+}
+
+#define OFFSET(x) offsetof(UtVideoContext, x) /* byte offset of an option field in the private context */
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption options[] = {
+    { "pred", "Prediction method", OFFSET(pred), AV_OPT_TYPE_INT, 0, 0, 2, VE, "pred" }, /* range admits 1, but utvideo_encode_init rejects anything except 0 and 2 */
+    { "left",   NULL, 0, AV_OPT_TYPE_CONST, 0, INT_MIN, INT_MAX, VE, "pred" }, /* named constant for pred = 0 */
+    { "median",   NULL, 0, AV_OPT_TYPE_CONST, 2, INT_MIN, INT_MAX, VE, "pred" }, /* named constant for pred = 2 */
+    { NULL },
+};
+
+static const AVClass utvideo_class = {
+    "libutvideo",
+    av_default_item_name,
+    options,
+    LIBAVUTIL_VERSION_INT,
+    0,
+    0,
+    NULL,
+    NULL,
+    AV_CLASS_CATEGORY_NA,
+    NULL,
+    NULL,
+};
+
+AVCodec ff_libutvideo_encoder = {
+    "libutvideo",
+    NULL_IF_CONFIG_SMALL("Ut Video"),
+    AVMEDIA_TYPE_VIDEO,
+    AV_CODEC_ID_UTVIDEO,
+    AV_CODEC_CAP_AUTO_THREADS | (int)AV_CODEC_CAP_LOSSLESS | AV_CODEC_CAP_INTRA_ONLY,
+    NULL, /* supported_framerates */
+    (const enum AVPixelFormat[]) {
+        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUYV422, AV_PIX_FMT_BGR24,
+        AV_PIX_FMT_RGB32, AV_PIX_FMT_NONE
+    },
+    NULL, /* supported_samplerates */
+    NULL, /* sample_fmts */
+    NULL, /* channel_layouts */
+    0,    /* max_lowres */
+    &utvideo_class, /* priv_class */
+    NULL, /* profiles */
+    sizeof(UtVideoContext),
+    NULL, /* next */
+    NULL, /* init_thread_copy */
+    NULL, /* update_thread_context */
+    NULL, /* defaults */
+    NULL, /* init_static_data */
+    utvideo_encode_init,
+    NULL, /* encode */
+    utvideo_encode_frame,
+    NULL, /* decode */
+    utvideo_encode_close,
+    NULL, /* flush */
+};
diff --git a/libavcodec/libvo-aacenc.c b/libavcodec/libvo-aacenc.c
deleted file mode 100644
index 876ef4c..0000000
--- a/libavcodec/libvo-aacenc.c
+++ /dev/null
@@ -1,194 +0,0 @@
-/*
- * AAC encoder wrapper
- * Copyright (c) 2010 Martin Storsjo
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <vo-aacenc/voAAC.h>
-#include <vo-aacenc/cmnMemory.h>
-
-#include "avcodec.h"
-#include "audio_frame_queue.h"
-#include "internal.h"
-#include "mpeg4audio.h"
-
-#define FRAME_SIZE 1024
-#define ENC_DELAY  1600
-
-typedef struct AACContext {
-    VO_AUDIO_CODECAPI codec_api;
-    VO_HANDLE handle;
-    VO_MEM_OPERATOR mem_operator;
-    VO_CODEC_INIT_USERDATA user_data;
-    VO_PBYTE end_buffer;
-    AudioFrameQueue afq;
-    int last_frame;
-    int last_samples;
-} AACContext;
-
-
-static int aac_encode_close(AVCodecContext *avctx)
-{
-    AACContext *s = avctx->priv_data;
-
-    s->codec_api.Uninit(s->handle);
-    av_freep(&avctx->extradata);
-    ff_af_queue_close(&s->afq);
-    av_freep(&s->end_buffer);
-
-    return 0;
-}
-
-static av_cold int aac_encode_init(AVCodecContext *avctx)
-{
-    AACContext *s = avctx->priv_data;
-    AACENC_PARAM params = { 0 };
-    int index, ret;
-
-    avctx->frame_size = FRAME_SIZE;
-    avctx->initial_padding = ENC_DELAY;
-    s->last_frame     = 2;
-    ff_af_queue_init(avctx, &s->afq);
-
-    s->end_buffer = av_mallocz(avctx->frame_size * avctx->channels * 2);
-    if (!s->end_buffer) {
-        ret = AVERROR(ENOMEM);
-        goto error;
-    }
-
-    voGetAACEncAPI(&s->codec_api);
-
-    s->mem_operator.Alloc = cmnMemAlloc;
-    s->mem_operator.Copy = cmnMemCopy;
-    s->mem_operator.Free = cmnMemFree;
-    s->mem_operator.Set = cmnMemSet;
-    s->mem_operator.Check = cmnMemCheck;
-    s->user_data.memflag = VO_IMF_USERMEMOPERATOR;
-    s->user_data.memData = &s->mem_operator;
-    s->codec_api.Init(&s->handle, VO_AUDIO_CodingAAC, &s->user_data);
-
-    params.sampleRate = avctx->sample_rate;
-    params.bitRate    = avctx->bit_rate;
-    params.nChannels  = avctx->channels;
-    params.adtsUsed   = !(avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER);
-    if (s->codec_api.SetParam(s->handle, VO_PID_AAC_ENCPARAM, &params)
-        != VO_ERR_NONE) {
-        av_log(avctx, AV_LOG_ERROR, "Unable to set encoding parameters\n");
-        ret = AVERROR(EINVAL);
-        goto error;
-    }
-
-    for (index = 0; index < 16; index++)
-        if (avctx->sample_rate == avpriv_mpeg4audio_sample_rates[index])
-            break;
-    if (index == 16) {
-        av_log(avctx, AV_LOG_ERROR, "Unsupported sample rate %d\n",
-                                    avctx->sample_rate);
-        ret = AVERROR(ENOSYS);
-        goto error;
-    }
-    if (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) {
-        avctx->extradata_size = 2;
-        avctx->extradata      = av_mallocz(avctx->extradata_size +
-                                           AV_INPUT_BUFFER_PADDING_SIZE);
-        if (!avctx->extradata) {
-            ret = AVERROR(ENOMEM);
-            goto error;
-        }
-
-        avctx->extradata[0] = 0x02 << 3 | index >> 1;
-        avctx->extradata[1] = (index & 0x01) << 7 | avctx->channels << 3;
-    }
-    return 0;
-error:
-    aac_encode_close(avctx);
-    return ret;
-}
-
-static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
-                            const AVFrame *frame, int *got_packet_ptr)
-{
-    AACContext *s = avctx->priv_data;
-    VO_CODECBUFFER input = { 0 }, output = { 0 };
-    VO_AUDIO_OUTPUTINFO output_info = { { 0 } };
-    VO_PBYTE samples;
-    int ret;
-
-    /* handle end-of-stream small frame and flushing */
-    if (!frame) {
-        if (s->last_frame <= 0)
-            return 0;
-        if (s->last_samples > 0 && s->last_samples < ENC_DELAY - FRAME_SIZE) {
-            s->last_samples = 0;
-            s->last_frame--;
-        }
-        s->last_frame--;
-        memset(s->end_buffer, 0, 2 * avctx->channels * avctx->frame_size);
-        samples = s->end_buffer;
-    } else {
-        if (frame->nb_samples < avctx->frame_size) {
-            s->last_samples = frame->nb_samples;
-            memcpy(s->end_buffer, frame->data[0], 2 * avctx->channels * frame->nb_samples);
-            samples = s->end_buffer;
-        } else {
-            samples = (VO_PBYTE)frame->data[0];
-        }
-        /* add current frame to the queue */
-        if ((ret = ff_af_queue_add(&s->afq, frame)) < 0)
-            return ret;
-    }
-
-    if ((ret = ff_alloc_packet(avpkt, FFMAX(8192, 768 * avctx->channels)))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
-        return ret;
-    }
-
-    input.Buffer  = samples;
-    input.Length  = 2 * avctx->channels * avctx->frame_size;
-    output.Buffer = avpkt->data;
-    output.Length = avpkt->size;
-
-    s->codec_api.SetInputData(s->handle, &input);
-    if (s->codec_api.GetOutputData(s->handle, &output, &output_info)
-        != VO_ERR_NONE) {
-        av_log(avctx, AV_LOG_ERROR, "Unable to encode frame\n");
-        return AVERROR(EINVAL);
-    }
-
-    /* Get the next frame pts/duration */
-    ff_af_queue_remove(&s->afq, avctx->frame_size, &avpkt->pts,
-                       &avpkt->duration);
-
-    avpkt->size = output.Length;
-    *got_packet_ptr = 1;
-    return 0;
-}
-
-AVCodec ff_libvo_aacenc_encoder = {
-    .name           = "libvo_aacenc",
-    .long_name      = NULL_IF_CONFIG_SMALL("Android VisualOn AAC (Advanced Audio Coding)"),
-    .type           = AVMEDIA_TYPE_AUDIO,
-    .id             = AV_CODEC_ID_AAC,
-    .priv_data_size = sizeof(AACContext),
-    .init           = aac_encode_init,
-    .encode2        = aac_encode_frame,
-    .close          = aac_encode_close,
-    .capabilities   = AV_CODEC_CAP_SMALL_LAST_FRAME | AV_CODEC_CAP_DELAY,
-    .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
-                                                     AV_SAMPLE_FMT_NONE },
-};
diff --git a/libavcodec/libvo-amrwbenc.c b/libavcodec/libvo-amrwbenc.c
index 5fc904c..2a15650 100644
--- a/libavcodec/libvo-amrwbenc.c
+++ b/libavcodec/libvo-amrwbenc.c
@@ -2,20 +2,20 @@
  * AMR Audio encoder stub
  * Copyright (c) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -45,7 +45,7 @@ static const AVOption options[] = {
     { NULL }
 };
 
-static const AVClass class = {
+static const AVClass amrwb_class = {
     "libvo_amrwbenc", av_default_item_name, options, LIBAVUTIL_VERSION_INT
 };
 
@@ -79,7 +79,7 @@ static av_cold int amr_wb_encode_init(AVCodecContext *avctx)
 {
     AMRWBContext *s = avctx->priv_data;
 
-    if (avctx->sample_rate != 16000) {
+    if (avctx->sample_rate != 16000 && avctx->strict_std_compliance > FF_COMPLIANCE_UNOFFICIAL) {
         av_log(avctx, AV_LOG_ERROR, "Only 16000Hz sample rate supported\n");
         return AVERROR(ENOSYS);
     }
@@ -115,10 +115,8 @@ static int amr_wb_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     const int16_t *samples = (const int16_t *)frame->data[0];
     int size, ret;
 
-    if ((ret = ff_alloc_packet(avpkt, MAX_PACKET_SIZE))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, MAX_PACKET_SIZE, 0)) < 0)
         return ret;
-    }
 
     if (s->last_bitrate != avctx->bit_rate) {
         s->mode         = get_wb_bitrate_mode(avctx->bit_rate, avctx);
@@ -150,5 +148,5 @@ AVCodec ff_libvo_amrwbenc_encoder = {
     .close          = amr_wb_encode_close,
     .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
                                                      AV_SAMPLE_FMT_NONE },
-    .priv_class     = &class,
+    .priv_class     = &amrwb_class,
 };
diff --git a/libavcodec/libvorbisdec.c b/libavcodec/libvorbisdec.c
new file mode 100644
index 0000000..ecf690a
--- /dev/null
+++ b/libavcodec/libvorbisdec.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2002 Mark Hills <mark@pogo.org.uk>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <vorbis/vorbisenc.h>
+
+#include "avcodec.h"
+#include "bytestream.h"
+#include "internal.h"
+
+typedef struct OggVorbisDecContext {
+    vorbis_info vi;                     /**< stream info filled from the headers */
+    vorbis_dsp_state vd;                /**< DSP state used for synthesis   */
+    vorbis_block vb;                    /**< vorbis_block used for synthesis */
+    vorbis_comment vc;                  /**< VorbisComment info             */
+    ogg_packet op;                      /**< ogg packet passed to libvorbis */
+} OggVorbisDecContext;
+
+static int oggvorbis_decode_close(AVCodecContext *avccontext);
+
+/* Parse the three Vorbis headers from extradata and set up synthesis; 0 or AVERROR. */
+static int oggvorbis_decode_init(AVCodecContext *avccontext) {
+    OggVorbisDecContext *context = avccontext->priv_data ;
+    uint8_t *p= avccontext->extradata;
+    int i, hsizes[3], ret;
+    unsigned char *headers[3], *extradata = avccontext->extradata;
+
+    if(! avccontext->extradata_size || ! p) {
+        av_log(avccontext, AV_LOG_ERROR, "vorbis extradata absent\n");
+        return AVERROR(EINVAL);
+    }
+
+    vorbis_info_init(&context->vi) ;
+    vorbis_comment_init(&context->vc) ;
+
+    if(p[0] == 0 && p[1] == 30) { /* each header is preceded by a 16-bit BE size */
+        for(i = 0; i < 3; i++){
+            if(avccontext->extradata_size - (p - extradata) < 2)
+                goto bad_sizes;
+            hsizes[i] = bytestream_get_be16((const uint8_t **)&p);
+            headers[i] = p;
+            if(hsizes[i] > avccontext->extradata_size - (p - extradata))
+                goto bad_sizes;
+            p += hsizes[i];
+        }
+    } else if(*p == 2) { /* sizes of headers 0 and 1: runs of 0xFF plus a final byte */
+        unsigned int offset = 1;
+        p++;
+        for(i=0; i<2; i++) {
+            hsizes[i] = 0;
+            while((*p == 0xFF) && (offset < avccontext->extradata_size)) {
+                hsizes[i] += 0xFF;
+                offset++;
+                p++;
+            }
+            if(offset >= avccontext->extradata_size - 1)
+                goto bad_sizes;
+            hsizes[i] += *p;
+            offset++;
+            p++;
+        }
+        hsizes[2] = avccontext->extradata_size - hsizes[0]-hsizes[1]-(int)offset;
+        if(hsizes[2] < 0) /* headers 0 and 1 would run past the end of extradata */
+            goto bad_sizes;
+        headers[0] = extradata + offset;
+        headers[1] = extradata + offset + hsizes[0];
+        headers[2] = extradata + offset + hsizes[0] + hsizes[1];
+    } else {
+        av_log(avccontext, AV_LOG_ERROR, "vorbis initial header len is wrong: %d\n", *p);
+        ret = AVERROR_INVALIDDATA;
+        goto error;
+    }
+
+    for(i=0; i<3; i++){
+        context->op.b_o_s= i==0;
+        context->op.bytes = hsizes[i];
+        context->op.packet = headers[i];
+        if(vorbis_synthesis_headerin(&context->vi, &context->vc, &context->op)<0){
+            av_log(avccontext, AV_LOG_ERROR, "%d. vorbis header damaged\n", i+1);
+            ret = AVERROR_INVALIDDATA;
+            goto error;
+        }
+    }
+
+    avccontext->channels = context->vi.channels;
+    avccontext->sample_rate = context->vi.rate;
+    avccontext->sample_fmt = AV_SAMPLE_FMT_S16;
+    avccontext->time_base= (AVRational){1, avccontext->sample_rate};
+
+    vorbis_synthesis_init(&context->vd, &context->vi);
+    vorbis_block_init(&context->vd, &context->vb);
+
+    return 0 ;
+
+  bad_sizes:
+    av_log(avccontext, AV_LOG_ERROR, "vorbis header sizes damaged\n");
+    ret = AVERROR_INVALIDDATA;
+  error:
+    oggvorbis_decode_close(avccontext);
+    return ret;
+}
+
+/* Interleave 'samples' planar float values per channel from pcm into buf as
+ * signed 16-bit samples, scaling by 32767 and clipping. Always returns 0. */
+static inline int conv(int samples, float **pcm, char *buf, int channels) {
+    ogg_int16_t *out = (ogg_int16_t*)buf ;
+    int ch, n;
+
+    for(ch = 0 ; ch < channels ; ch++){
+        const float *plane = pcm[ch] ;
+        ogg_int16_t *dst = out + ch;
+
+        /* samples of one channel are 'channels' slots apart in the output */
+        for(n = 0 ; n < samples ; n++, dst += channels) {
+            *dst = av_clip_int16(plane[n] * 32767.f);
+        }
+    }
+
+    return 0 ;
+}
+
+/*
+ * Decode one packet: hand it to libvorbis and convert every PCM sample that
+ * becomes available into interleaved 16-bit samples in the output frame.
+ * Returns avpkt->size (0 for an empty packet) or the ff_get_buffer() error.
+ */
+static int oggvorbis_decode_frame(AVCodecContext *avccontext, void *data,
+                        int *got_frame_ptr, AVPacket *avpkt)
+{
+    OggVorbisDecContext *context = avccontext->priv_data ;
+    AVFrame *frame = data;
+    float **pcm ;
+    ogg_packet *op= &context->op;
+    int samples, total_samples, total_bytes;
+    int ret;
+    int16_t *output;
+
+    if(!avpkt->size){
+    //FIXME flush
+        return 0;
+    }
+
+    /* ask for room for 8192*4 samples; nb_samples is corrected after decoding */
+    frame->nb_samples = 8192*4;
+    if ((ret = ff_get_buffer(avccontext, frame, 0)) < 0)
+        return ret;
+    output = (int16_t *)frame->data[0];
+
+    op->packet = avpkt->data;
+    op->bytes  = avpkt->size;
+
+    if(vorbis_synthesis(&context->vb, op) == 0)
+        vorbis_synthesis_blockin(&context->vd, &context->vb) ;
+
+    total_samples = 0 ;
+    total_bytes = 0 ;
+
+    /* NOTE(review): nothing here checks total_samples against the 8192*4 requested */
+    while((samples = vorbis_synthesis_pcmout(&context->vd, &pcm)) > 0) {
+        conv(samples, pcm, (char*)output + total_bytes, context->vi.channels) ;
+        total_bytes += samples * 2 * context->vi.channels ;
+        total_samples += samples ;
+        vorbis_synthesis_read(&context->vd, samples) ;
+    }
+
+    frame->nb_samples = total_samples;
+    *got_frame_ptr   = total_samples > 0;
+    return avpkt->size;
+}
+
+/* Release all libvorbis state held in the private context; always returns 0. */
+static int oggvorbis_decode_close(AVCodecContext *avccontext)
+{
+    OggVorbisDecContext *const s = avccontext->priv_data;
+
+    vorbis_block_clear(&s->vb);
+    vorbis_dsp_clear(&s->vd);
+    vorbis_info_clear(&s->vi);
+    vorbis_comment_clear(&s->vc);
+    return 0;
+}
+
+/* Codec registration: Vorbis audio decoding through the external libvorbis. */
+AVCodec ff_libvorbis_decoder = {
+    .name           = "libvorbis",
+    .long_name      = NULL_IF_CONFIG_SMALL("libvorbis"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_VORBIS,
+    .priv_data_size = sizeof(OggVorbisDecContext),
+    .init           = oggvorbis_decode_init,
+    .decode         = oggvorbis_decode_frame,
+    .close          = oggvorbis_decode_close,
+    .capabilities   = AV_CODEC_CAP_DELAY,
+};
diff --git a/libavcodec/libvorbis.c b/libavcodec/libvorbisenc.c
index 86c1ed6..3ca5b55 100644
--- a/libavcodec/libvorbis.c
+++ b/libavcodec/libvorbisenc.c
@@ -1,42 +1,34 @@
 /*
- * copyright (c) 2002 Mark Hills <mark@pogo.org.uk>
+ * Copyright (c) 2002 Mark Hills <mark@pogo.org.uk>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-/**
- * @file
- * Vorbis encoding support via libvorbisenc.
- * @author Mark Hills <mark@pogo.org.uk>
- */
-
 #include <vorbis/vorbisenc.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/fifo.h"
 #include "libavutil/opt.h"
 #include "avcodec.h"
 #include "audio_frame_queue.h"
-#include "bytestream.h"
 #include "internal.h"
 #include "vorbis.h"
 #include "vorbis_parser.h"
 
-#undef NDEBUG
-#include <assert.h>
 
 /* Number of samples the user should send in each call.
  * This value is used because it is the LCD of all possible frame sizes, so
@@ -47,7 +39,7 @@
 
 #define BUFFER_SIZE (1024 * 64)
 
-typedef struct LibvorbisContext {
+typedef struct LibvorbisEncContext {
     AVClass *av_class;                  /**< class for AVOptions            */
     vorbis_info vi;                     /**< vorbis_info used during init   */
     vorbis_dsp_state vd;                /**< DSP state used for analysis    */
@@ -56,14 +48,13 @@ typedef struct LibvorbisContext {
     int eof;                            /**< end-of-file flag               */
     int dsp_initialized;                /**< vd has been initialized        */
     vorbis_comment vc;                  /**< VorbisComment info             */
-    ogg_packet op;                      /**< ogg packet                     */
     double iblock;                      /**< impulse block bias option      */
     AVVorbisParseContext *vp;           /**< parse context to get durations */
     AudioFrameQueue afq;                /**< frame queue for timestamps     */
-} LibvorbisContext;
+} LibvorbisEncContext;
 
 static const AVOption options[] = {
-    { "iblock", "Sets the impulse block bias", offsetof(LibvorbisContext, iblock), AV_OPT_TYPE_DOUBLE, { .dbl = 0 }, -15, 0, AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_ENCODING_PARAM },
+    { "iblock", "Sets the impulse block bias", offsetof(LibvorbisEncContext, iblock), AV_OPT_TYPE_DOUBLE, { .dbl = 0 }, -15, 0, AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_ENCODING_PARAM },
     { NULL }
 };
 
@@ -72,8 +63,12 @@ static const AVCodecDefault defaults[] = {
     { NULL },
 };
 
-static const AVClass class = { "libvorbis", av_default_item_name, options, LIBAVUTIL_VERSION_INT };
-
+static const AVClass vorbis_class = {
+    .class_name = "libvorbis",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
 
 static int vorbis_error_to_averror(int ov_err)
 {
@@ -87,7 +82,7 @@ static int vorbis_error_to_averror(int ov_err)
 
 static av_cold int libvorbis_setup(vorbis_info *vi, AVCodecContext *avctx)
 {
-    LibvorbisContext *s = avctx->priv_data;
+    LibvorbisEncContext *s = avctx->priv_data;
     double cfreq;
     int ret;
 
@@ -117,14 +112,14 @@ static av_cold int libvorbis_setup(vorbis_info *vi, AVCodecContext *avctx)
         /* variable bitrate by estimate, disable slow rate management */
         if (minrate == -1 && maxrate == -1)
             if ((ret = vorbis_encode_ctl(vi, OV_ECTL_RATEMANAGE2_SET, NULL)))
-                goto error;
+                goto error; /* should not happen */
     }
 
     /* cutoff frequency */
     if (avctx->cutoff > 0) {
         cfreq = avctx->cutoff / 1000.0;
         if ((ret = vorbis_encode_ctl(vi, OV_ECTL_LOWPASS_SET, &cfreq)))
-            goto error;
+            goto error; /* should not happen */
     }
 
     /* impulse block bias */
@@ -133,6 +128,35 @@ static av_cold int libvorbis_setup(vorbis_info *vi, AVCodecContext *avctx)
             goto error;
     }
 
+    if (avctx->channels == 3 &&
+            avctx->channel_layout != (AV_CH_LAYOUT_STEREO|AV_CH_FRONT_CENTER) ||
+        avctx->channels == 4 &&
+            avctx->channel_layout != AV_CH_LAYOUT_2_2 &&
+            avctx->channel_layout != AV_CH_LAYOUT_QUAD ||
+        avctx->channels == 5 &&
+            avctx->channel_layout != AV_CH_LAYOUT_5POINT0 &&
+            avctx->channel_layout != AV_CH_LAYOUT_5POINT0_BACK ||
+        avctx->channels == 6 &&
+            avctx->channel_layout != AV_CH_LAYOUT_5POINT1 &&
+            avctx->channel_layout != AV_CH_LAYOUT_5POINT1_BACK ||
+        avctx->channels == 7 &&
+            avctx->channel_layout != (AV_CH_LAYOUT_5POINT1|AV_CH_BACK_CENTER) ||
+        avctx->channels == 8 &&
+            avctx->channel_layout != AV_CH_LAYOUT_7POINT1) {
+        if (avctx->channel_layout) {
+            char name[32];
+            av_get_channel_layout_string(name, sizeof(name), avctx->channels,
+                                         avctx->channel_layout);
+            av_log(avctx, AV_LOG_ERROR, "%s not supported by Vorbis: "
+                                             "output stream will have incorrect "
+                                             "channel layout.\n", name);
+        } else {
+            av_log(avctx, AV_LOG_WARNING, "No channel layout specified. The encoder "
+                                               "will use Vorbis channel layout for "
+                                               "%d channels.\n", avctx->channels);
+        }
+    }
+
     if ((ret = vorbis_encode_setup_init(vi)))
         goto error;
 
@@ -149,7 +173,7 @@ static int xiph_len(int l)
 
 static av_cold int libvorbis_encode_close(AVCodecContext *avctx)
 {
-    LibvorbisContext *s = avctx->priv_data;
+    LibvorbisEncContext *s = avctx->priv_data;
 
     /* notify vorbisenc this is EOF */
     if (s->dsp_initialized)
@@ -159,7 +183,7 @@ static av_cold int libvorbis_encode_close(AVCodecContext *avctx)
     vorbis_dsp_clear(&s->vd);
     vorbis_info_clear(&s->vi);
 
-    av_fifo_free(s->pkt_fifo);
+    av_fifo_freep(&s->pkt_fifo);
     ff_af_queue_close(&s->afq);
     av_freep(&avctx->extradata);
 
@@ -170,7 +194,7 @@ static av_cold int libvorbis_encode_close(AVCodecContext *avctx)
 
 static av_cold int libvorbis_encode_init(AVCodecContext *avctx)
 {
-    LibvorbisContext *s = avctx->priv_data;
+    LibvorbisEncContext *s = avctx->priv_data;
     ogg_packet header, header_comm, header_code;
     uint8_t *p;
     unsigned int offset;
@@ -194,7 +218,8 @@ static av_cold int libvorbis_encode_init(AVCodecContext *avctx)
     }
 
     vorbis_comment_init(&s->vc);
-    vorbis_comment_add_tag(&s->vc, "encoder", LIBAVCODEC_IDENT);
+    if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT))
+        vorbis_comment_add_tag(&s->vc, "encoder", LIBAVCODEC_IDENT);
 
     if ((ret = vorbis_analysis_headerout(&s->vd, &s->vc, &header, &header_comm,
                                          &header_code))) {
@@ -221,7 +246,7 @@ static av_cold int libvorbis_encode_init(AVCodecContext *avctx)
     offset += header_comm.bytes;
     memcpy(&p[offset], header_code.packet, header_code.bytes);
     offset += header_code.bytes;
-    assert(offset == avctx->extradata_size);
+    av_assert0(offset == avctx->extradata_size);
 
     s->vp = av_vorbis_parse_init(avctx->extradata, avctx->extradata_size);
     if (!s->vp) {
@@ -249,7 +274,7 @@ error:
 static int libvorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
                                   const AVFrame *frame, int *got_packet_ptr)
 {
-    LibvorbisContext *s = avctx->priv_data;
+    LibvorbisEncContext *s = avctx->priv_data;
     ogg_packet op;
     int ret, duration;
 
@@ -273,7 +298,7 @@ static int libvorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         if ((ret = ff_af_queue_add(&s->afq, frame)) < 0)
             return ret;
     } else {
-        if (!s->eof)
+        if (!s->eof && s->afq.frame_alloc)
             if ((ret = vorbis_analysis_wrote(&s->vd, 0)) < 0) {
                 av_log(avctx, AV_LOG_ERROR, "error in vorbis_analysis_wrote()\n");
                 return vorbis_error_to_averror(ret);
@@ -291,7 +316,7 @@ static int libvorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         /* add any available packets to the output packet buffer */
         while ((ret = vorbis_bitrate_flushpacket(&s->vd, &op)) == 1) {
             if (av_fifo_space(s->pkt_fifo) < sizeof(ogg_packet) + op.bytes) {
-                av_log(avctx, AV_LOG_ERROR, "packet buffer is too small");
+                av_log(avctx, AV_LOG_ERROR, "packet buffer is too small\n");
                 return AVERROR_BUG;
             }
             av_fifo_generic_write(s->pkt_fifo, &op, sizeof(ogg_packet), NULL);
@@ -313,10 +338,8 @@ static int libvorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
 
     av_fifo_generic_read(s->pkt_fifo, &op, sizeof(ogg_packet), NULL);
 
-    if ((ret = ff_alloc_packet(avpkt, op.bytes))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, op.bytes, 0)) < 0)
         return ret;
-    }
     av_fifo_generic_read(s->pkt_fifo, avpkt->data, op.bytes, NULL);
 
     avpkt->pts = ff_samples_to_time_base(avctx, op.granulepos);
@@ -325,9 +348,12 @@ static int libvorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     if (duration > 0) {
         /* we do not know encoder delay until we get the first packet from
          * libvorbis, so we have to update the AudioFrameQueue counts */
-        if (!avctx->initial_padding) {
+        if (!avctx->initial_padding && s->afq.frames) {
             avctx->initial_padding    = duration;
-            s->afq.remaining_delay   += duration;
+            av_assert0(!s->afq.remaining_delay);
+            s->afq.frames->duration  += duration;
+            if (s->afq.frames->pts != AV_NOPTS_VALUE)
+                s->afq.frames->pts       -= duration;
             s->afq.remaining_samples += duration;
         }
         ff_af_queue_remove(&s->afq, duration, &avpkt->pts, &avpkt->duration);
@@ -339,16 +365,16 @@ static int libvorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
 
 AVCodec ff_libvorbis_encoder = {
     .name           = "libvorbis",
-    .long_name      = NULL_IF_CONFIG_SMALL("libvorbis Vorbis"),
+    .long_name      = NULL_IF_CONFIG_SMALL("libvorbis"),
     .type           = AVMEDIA_TYPE_AUDIO,
     .id             = AV_CODEC_ID_VORBIS,
-    .priv_data_size = sizeof(LibvorbisContext),
+    .priv_data_size = sizeof(LibvorbisEncContext),
     .init           = libvorbis_encode_init,
     .encode2        = libvorbis_encode_frame,
     .close          = libvorbis_encode_close,
-    .capabilities   = AV_CODEC_CAP_DELAY,
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_SMALL_LAST_FRAME,
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
                                                       AV_SAMPLE_FMT_NONE },
-    .priv_class     = &class,
+    .priv_class     = &vorbis_class,
     .defaults       = defaults,
 };
diff --git a/libavcodec/libvpx.c b/libavcodec/libvpx.c
index 49f966d..55edc7e 100644
--- a/libavcodec/libvpx.c
+++ b/libavcodec/libvpx.c
@@ -1,27 +1,91 @@
 /*
  * Copyright (c) 2013 Guillaume Martres <smarter@ubuntu.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <vpx/vpx_codec.h>
-
 #include "libvpx.h"
+#include "config.h"
+
+#if CONFIG_LIBVPX_VP9_ENCODER
+#include <vpx/vpx_encoder.h>
+#include <vpx/vp8cx.h>
+#endif
 
+static const enum AVPixelFormat vp9_pix_fmts_def[] = { /* default set by ff_vp9_init_static() */
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_NONE
+};
+
+#if CONFIG_LIBVPX_VP9_ENCODER
+static const enum AVPixelFormat vp9_pix_fmts_highcol[] = { /* libvpx 1.4+ when high bit depth is not reported */
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_YUV422P,
+    AV_PIX_FMT_YUV440P,
+    AV_PIX_FMT_YUV444P,
+#if VPX_IMAGE_ABI_VERSION >= 3
+    AV_PIX_FMT_GBRP,
+#endif
+    AV_PIX_FMT_NONE
+};
+
+static const enum AVPixelFormat vp9_pix_fmts_highbd[] = { /* used when libvpx reports VPX_CODEC_CAP_HIGHBITDEPTH */
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_YUV422P,
+    AV_PIX_FMT_YUV440P,
+    AV_PIX_FMT_YUV444P,
+    AV_PIX_FMT_YUV420P10,
+    AV_PIX_FMT_YUV422P10,
+    AV_PIX_FMT_YUV440P10,
+    AV_PIX_FMT_YUV444P10,
+    AV_PIX_FMT_YUV420P12,
+    AV_PIX_FMT_YUV422P12,
+    AV_PIX_FMT_YUV440P12,
+    AV_PIX_FMT_YUV444P12,
+#if VPX_IMAGE_ABI_VERSION >= 3
+    AV_PIX_FMT_GBRP,
+    AV_PIX_FMT_GBRP10,
+    AV_PIX_FMT_GBRP12,
+#endif
+    AV_PIX_FMT_NONE
+};
+#endif
+/* Set codec->pix_fmts (and the experimental flag) from the libvpx version at runtime. */
+av_cold void ff_vp9_init_static(AVCodec *codec)
+{
+    if (    vpx_codec_version_major() < 1
+        || (vpx_codec_version_major() == 1 && vpx_codec_version_minor() < 3))
+        codec->capabilities |= AV_CODEC_CAP_EXPERIMENTAL; /* libvpx older than 1.3 */
+    codec->pix_fmts = vp9_pix_fmts_def;
+#if CONFIG_LIBVPX_VP9_ENCODER
+    if (    vpx_codec_version_major() > 1
+        || (vpx_codec_version_major() == 1 && vpx_codec_version_minor() >= 4)) { /* 1.4 or newer */
+#ifdef VPX_CODEC_CAP_HIGHBITDEPTH
+        vpx_codec_caps_t codec_caps = vpx_codec_get_caps(vpx_codec_vp9_cx());
+        if (codec_caps & VPX_CODEC_CAP_HIGHBITDEPTH)
+            codec->pix_fmts = vp9_pix_fmts_highbd;
+        else
+#endif
+            codec->pix_fmts = vp9_pix_fmts_highcol; /* also the only choice without the #ifdef */
+    }
+#endif
+}
+#if 0
 enum AVPixelFormat ff_vpx_imgfmt_to_pixfmt(vpx_img_fmt_t img)
 {
     switch (img) {
@@ -77,3 +141,4 @@ vpx_img_fmt_t ff_vpx_pixfmt_to_imgfmt(enum AVPixelFormat pix)
     default:                      return VPX_IMG_FMT_NONE;
     }
 }
+#endif
diff --git a/libavcodec/libvpx.h b/libavcodec/libvpx.h
index b437f37..22b697f 100644
--- a/libavcodec/libvpx.h
+++ b/libavcodec/libvpx.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2013 Guillaume Martres <smarter@ubuntu.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,7 +25,10 @@
 
 #include "avcodec.h"
 
+void ff_vp9_init_static(AVCodec *codec);
+#if 0
 enum AVPixelFormat ff_vpx_imgfmt_to_pixfmt(vpx_img_fmt_t img);
 vpx_img_fmt_t ff_vpx_pixfmt_to_imgfmt(enum AVPixelFormat pix);
+#endif
 
 #endif /* AVCODEC_LIBVPX_H */
diff --git a/libavcodec/libvpxdec.c b/libavcodec/libvpxdec.c
index 28b7733..adbc6d0 100644
--- a/libavcodec/libvpxdec.c
+++ b/libavcodec/libvpxdec.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010, Google, Inc.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,7 @@
 #include "avcodec.h"
 #include "internal.h"
 #include "libvpx.h"
+#include "profiles.h"
 
 typedef struct VP8DecoderContext {
     struct vpx_codec_ctx decoder;
@@ -59,6 +60,114 @@ static av_cold int vpx_init(AVCodecContext *avctx,
     return 0;
 }
 
+/* Map a decoded vpx_image to avctx colorspace/range, profile and pix_fmt; 0 or AVERROR_INVALIDDATA. */
+static int set_pix_fmt(AVCodecContext *avctx, struct vpx_image *img)
+{
+#if VPX_IMAGE_ABI_VERSION >= 3
+    static const enum AVColorSpace colorspaces[8] = {
+        AVCOL_SPC_UNSPECIFIED, AVCOL_SPC_BT470BG, AVCOL_SPC_BT709, AVCOL_SPC_SMPTE170M,
+        AVCOL_SPC_SMPTE240M, AVCOL_SPC_BT2020_NCL, AVCOL_SPC_RESERVED, AVCOL_SPC_RGB,
+    };
+#if VPX_IMAGE_ABI_VERSION >= 4
+    static const enum AVColorRange color_ranges[] = {
+        AVCOL_RANGE_MPEG, AVCOL_RANGE_JPEG
+    };
+    avctx->color_range = color_ranges[img->range]; /* NOTE(review): index is not range-checked */
+#endif
+    avctx->colorspace = colorspaces[img->cs]; /* NOTE(review): assumes img->cs < 8 - confirm */
+#endif
+    if (avctx->codec_id == AV_CODEC_ID_VP8 && img->fmt != VPX_IMG_FMT_I420) /* VP8: only I420 accepted */
+        return AVERROR_INVALIDDATA;
+    switch (img->fmt) {
+    case VPX_IMG_FMT_I420:
+        if (avctx->codec_id == AV_CODEC_ID_VP9)
+            avctx->profile = FF_PROFILE_VP9_0;
+        avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+        return 0;
+#if CONFIG_LIBVPX_VP9_DECODER
+    case VPX_IMG_FMT_I422:
+        avctx->profile = FF_PROFILE_VP9_1;
+        avctx->pix_fmt = AV_PIX_FMT_YUV422P;
+        return 0;
+#if VPX_IMAGE_ABI_VERSION >= 3
+    case VPX_IMG_FMT_I440:
+        avctx->profile = FF_PROFILE_VP9_1;
+        avctx->pix_fmt = AV_PIX_FMT_YUV440P;
+        return 0;
+#endif
+    case VPX_IMG_FMT_I444:
+        avctx->profile = FF_PROFILE_VP9_1;
+#if VPX_IMAGE_ABI_VERSION >= 3
+        avctx->pix_fmt = avctx->colorspace == AVCOL_SPC_RGB ? /* colorspace was set from img->cs above */
+                         AV_PIX_FMT_GBRP : AV_PIX_FMT_YUV444P;
+#else
+        avctx->pix_fmt = AV_PIX_FMT_YUV444P;
+#endif
+        return 0;
+#ifdef VPX_IMG_FMT_HIGHBITDEPTH
+    case VPX_IMG_FMT_I42016: /* 16-bit containers: only bit_depth 10 and 12 are accepted */
+        avctx->profile = FF_PROFILE_VP9_2;
+        if (img->bit_depth == 10) {
+            avctx->pix_fmt = AV_PIX_FMT_YUV420P10;
+            return 0;
+        } else if (img->bit_depth == 12) {
+            avctx->pix_fmt = AV_PIX_FMT_YUV420P12;
+            return 0;
+        } else {
+            return AVERROR_INVALIDDATA;
+        }
+    case VPX_IMG_FMT_I42216:
+        avctx->profile = FF_PROFILE_VP9_3;
+        if (img->bit_depth == 10) {
+            avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
+            return 0;
+        } else if (img->bit_depth == 12) {
+            avctx->pix_fmt = AV_PIX_FMT_YUV422P12;
+            return 0;
+        } else {
+            return AVERROR_INVALIDDATA;
+        }
+#if VPX_IMAGE_ABI_VERSION >= 3
+    case VPX_IMG_FMT_I44016:
+        avctx->profile = FF_PROFILE_VP9_3;
+        if (img->bit_depth == 10) {
+            avctx->pix_fmt = AV_PIX_FMT_YUV440P10;
+            return 0;
+        } else if (img->bit_depth == 12) {
+            avctx->pix_fmt = AV_PIX_FMT_YUV440P12;
+            return 0;
+        } else {
+            return AVERROR_INVALIDDATA;
+        }
+#endif
+    case VPX_IMG_FMT_I44416:
+        avctx->profile = FF_PROFILE_VP9_3;
+        if (img->bit_depth == 10) {
+#if VPX_IMAGE_ABI_VERSION >= 3
+            avctx->pix_fmt = avctx->colorspace == AVCOL_SPC_RGB ?
+                             AV_PIX_FMT_GBRP10 : AV_PIX_FMT_YUV444P10;
+#else
+            avctx->pix_fmt = AV_PIX_FMT_YUV444P10;
+#endif
+            return 0;
+        } else if (img->bit_depth == 12) {
+#if VPX_IMAGE_ABI_VERSION >= 3
+            avctx->pix_fmt = avctx->colorspace == AVCOL_SPC_RGB ?
+                             AV_PIX_FMT_GBRP12 : AV_PIX_FMT_YUV444P12;
+#else
+            avctx->pix_fmt = AV_PIX_FMT_YUV444P12;
+#endif
+            return 0;
+        } else {
+            return AVERROR_INVALIDDATA;
+        }
+#endif /* VPX_IMG_FMT_HIGHBITDEPTH */
+#endif /* CONFIG_LIBVPX_VP9_DECODER */
+    default:
+        return AVERROR_INVALIDDATA;
+    }
+}
+
 static int vp8_decode(AVCodecContext *avctx,
                       void *data, int *got_frame, AVPacket *avpkt)
 {
@@ -81,11 +190,15 @@ static int vp8_decode(AVCodecContext *avctx,
     }
 
     if ((img = vpx_codec_get_frame(&ctx->decoder, &iter))) {
-        avctx->pix_fmt = ff_vpx_imgfmt_to_pixfmt(img->fmt);
-        if (avctx->pix_fmt == AV_PIX_FMT_NONE) {
-            av_log(avctx, AV_LOG_ERROR, "Unsupported output colorspace (%d)\n",
-                   img->fmt);
-            return AVERROR_INVALIDDATA;
+        if ((ret = set_pix_fmt(avctx, img)) < 0) {
+#ifdef VPX_IMG_FMT_HIGHBITDEPTH
+            av_log(avctx, AV_LOG_ERROR, "Unsupported output colorspace (%d) / bit_depth (%d)\n",
+                   img->fmt, img->bit_depth);
+#else
+            av_log(avctx, AV_LOG_ERROR, "Unsupported output colorspace (%d) / bit_depth (%d)\n",
+                   img->fmt, 8);
+#endif
+            return ret;
         }
 
         if ((int) img->d_w != avctx->width || (int) img->d_h != avctx->height) {
@@ -97,7 +210,7 @@ static int vp8_decode(AVCodecContext *avctx,
         }
         if ((ret = ff_get_buffer(avctx, picture, 0)) < 0)
             return ret;
-        av_image_copy(picture->data, picture->linesize, img->planes,
+        av_image_copy(picture->data, picture->linesize, (const uint8_t **)img->planes,
                       img->stride, avctx->pix_fmt, img->d_w, img->d_h);
         *got_frame           = 1;
     }
@@ -145,6 +258,8 @@ AVCodec ff_libvpx_vp9_decoder = {
     .init           = vp9_init,
     .close          = vp8_free,
     .decode         = vp8_decode,
-    .capabilities   = AV_CODEC_CAP_AUTO_THREADS,
+    .capabilities   = AV_CODEC_CAP_AUTO_THREADS | AV_CODEC_CAP_DR1,
+    .init_static_data = ff_vp9_init_static,
+    .profiles       = NULL_IF_CONFIG_SMALL(ff_vp9_profiles),
 };
 #endif /* CONFIG_LIBVPX_VP9_DECODER */
diff --git a/libavcodec/libvpxenc.c b/libavcodec/libvpxenc.c
index 26afaf5..4ea932d 100644
--- a/libavcodec/libvpxenc.c
+++ b/libavcodec/libvpxenc.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010, Google, Inc.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,9 +30,13 @@
 
 #include "avcodec.h"
 #include "internal.h"
+#include "libavutil/avassert.h"
 #include "libvpx.h"
+#include "profiles.h"
 #include "libavutil/base64.h"
 #include "libavutil/common.h"
+#include "libavutil/internal.h"
+#include "libavutil/intreadwrite.h"
 #include "libavutil/mathematics.h"
 #include "libavutil/opt.h"
 
@@ -43,11 +47,16 @@
 struct FrameListData {
     void *buf;                       /**< compressed data buffer */
     size_t sz;                       /**< length of compressed data */
+    void *buf_alpha;                 /**< compressed data from the alpha encoder, NULL if none */
+    size_t sz_alpha;                 /**< length of compressed alpha data, 0 if none */
     int64_t pts;                     /**< time stamp to show frame
                                           (in timebase units) */
     unsigned long duration;          /**< duration to show frame
                                           (in timebase units) */
     uint32_t flags;                  /**< flags for this frame */
+    uint64_t sse[4];                 /**< SSE copied from the PSNR packet; [1..3] feed error[0..2] */
+    int have_sse;                    /**< true if we have pending sse[] */
+    uint64_t frame_number;           /**< running count of visible frames; -1 for invisible ones */
     struct FrameListData *next;
 };
 
@@ -55,33 +64,77 @@ typedef struct VP8EncoderContext {
     AVClass *class;
     struct vpx_codec_ctx encoder;
     struct vpx_image rawimg;
+    struct vpx_codec_ctx encoder_alpha;
+    struct vpx_image rawimg_alpha;
+    uint8_t is_alpha;
     struct vpx_fixed_buf twopass_stats;
-    unsigned long deadline; //i.e., RT/GOOD/BEST
+    int deadline; //i.e., RT/GOOD/BEST
+    uint64_t sse[4];
+    int have_sse; /**< true if we have pending sse[] */
+    uint64_t frame_number;
     struct FrameListData *coded_frame_list;
+
     int cpu_used;
+    /**
+     * VP8 specific flags, see VP8F_* below.
+     */
+    int flags;
+#define VP8F_ERROR_RESILIENT 0x00000001 ///< Enable measures appropriate for streaming over lossy links
+#define VP8F_AUTO_ALT_REF    0x00000002 ///< Enable automatic alternate reference frame generation
+
     int auto_alt_ref;
+
     int arnr_max_frames;
     int arnr_strength;
     int arnr_type;
+
+    int tune;
+
     int lag_in_frames;
     int error_resilient;
     int crf;
     int static_thresh;
+    int max_intra_rate;
+    int rc_undershoot_pct;
+    int rc_overshoot_pct;
+
+    // VP9-only
+    int lossless;
+    int tile_columns;
+    int tile_rows;
+    int frame_parallel;
+    int aq_mode;
     int drop_threshold;
     int noise_sensitivity;
+    int vpx_cs;
 } VP8Context;
 
 /** String mappings for enum vp8e_enc_control_id */
 static const char *const ctlidstr[] = {
-    [VP8E_SET_ARNR_MAXFRAMES]    = "VP8E_SET_ARNR_MAXFRAMES",
-    [VP8E_SET_ARNR_STRENGTH]     = "VP8E_SET_ARNR_STRENGTH",
-    [VP8E_SET_ARNR_TYPE]         = "VP8E_SET_ARNR_TYPE",
     [VP8E_SET_CPUUSED]           = "VP8E_SET_CPUUSED",
-    [VP8E_SET_CQ_LEVEL]          = "VP8E_SET_CQ_LEVEL",
     [VP8E_SET_ENABLEAUTOALTREF]  = "VP8E_SET_ENABLEAUTOALTREF",
     [VP8E_SET_NOISE_SENSITIVITY] = "VP8E_SET_NOISE_SENSITIVITY",
     [VP8E_SET_STATIC_THRESHOLD]  = "VP8E_SET_STATIC_THRESHOLD",
     [VP8E_SET_TOKEN_PARTITIONS]  = "VP8E_SET_TOKEN_PARTITIONS",
+    [VP8E_SET_ARNR_MAXFRAMES]    = "VP8E_SET_ARNR_MAXFRAMES",
+    [VP8E_SET_ARNR_STRENGTH]     = "VP8E_SET_ARNR_STRENGTH",
+    [VP8E_SET_ARNR_TYPE]         = "VP8E_SET_ARNR_TYPE",
+    [VP8E_SET_TUNING]            = "VP8E_SET_TUNING",
+    [VP8E_SET_CQ_LEVEL]          = "VP8E_SET_CQ_LEVEL",
+    [VP8E_SET_MAX_INTRA_BITRATE_PCT] = "VP8E_SET_MAX_INTRA_BITRATE_PCT",
+#if CONFIG_LIBVPX_VP9_ENCODER /* controls only named when the VP9 encoder is built */
+    [VP9E_SET_LOSSLESS]                = "VP9E_SET_LOSSLESS",
+    [VP9E_SET_TILE_COLUMNS]            = "VP9E_SET_TILE_COLUMNS",
+    [VP9E_SET_TILE_ROWS]               = "VP9E_SET_TILE_ROWS",
+    [VP9E_SET_FRAME_PARALLEL_DECODING] = "VP9E_SET_FRAME_PARALLEL_DECODING",
+    [VP9E_SET_AQ_MODE]                 = "VP9E_SET_AQ_MODE",
+#if VPX_ENCODER_ABI_VERSION > 8
+    [VP9E_SET_COLOR_SPACE]             = "VP9E_SET_COLOR_SPACE",
+#endif /* VPX_ENCODER_ABI_VERSION > 8 */
+#if VPX_ENCODER_ABI_VERSION >= 11
+    [VP9E_SET_COLOR_RANGE]             = "VP9E_SET_COLOR_RANGE",
+#endif /* VPX_ENCODER_ABI_VERSION >= 11 */
+#endif /* CONFIG_LIBVPX_VP9_ENCODER */
 };
 
 static av_cold void log_encoder_error(AVCodecContext *avctx, const char *desc)
@@ -104,19 +157,26 @@ static av_cold void dump_enc_cfg(AVCodecContext *avctx,
     av_log(avctx, level, "vpx_codec_enc_cfg\n");
     av_log(avctx, level, "generic settings\n"
            "  %*s%u\n  %*s%u\n  %*s%u\n  %*s%u\n  %*s%u\n"
+#if CONFIG_LIBVPX_VP9_ENCODER && defined(VPX_IMG_FMT_HIGHBITDEPTH)
+           "  %*s%u\n  %*s%u\n"
+#endif
            "  %*s{%u/%u}\n  %*s%u\n  %*s%d\n  %*s%u\n",
            width, "g_usage:",           cfg->g_usage,
            width, "g_threads:",         cfg->g_threads,
            width, "g_profile:",         cfg->g_profile,
            width, "g_w:",               cfg->g_w,
            width, "g_h:",               cfg->g_h,
+#if CONFIG_LIBVPX_VP9_ENCODER && defined(VPX_IMG_FMT_HIGHBITDEPTH)
+           width, "g_bit_depth:",       cfg->g_bit_depth,
+           width, "g_input_bit_depth:", cfg->g_input_bit_depth,
+#endif
            width, "g_timebase:",        cfg->g_timebase.num, cfg->g_timebase.den,
            width, "g_error_resilient:", cfg->g_error_resilient,
            width, "g_pass:",            cfg->g_pass,
            width, "g_lag_in_frames:",   cfg->g_lag_in_frames);
     av_log(avctx, level, "rate control settings\n"
            "  %*s%u\n  %*s%u\n  %*s%u\n  %*s%u\n"
-           "  %*s%d\n  %*s%p(%zu)\n  %*s%u\n",
+           "  %*s%d\n  %*s%p(%"SIZE_SPECIFIER")\n  %*s%u\n",
            width, "rc_dropframe_thresh:",   cfg->rc_dropframe_thresh,
            width, "rc_resize_allowed:",     cfg->rc_resize_allowed,
            width, "rc_resize_up_thresh:",   cfg->rc_resize_up_thresh,
@@ -163,6 +223,8 @@ static void coded_frame_add(void *list, struct FrameListData *cx_frame)
 static av_cold void free_coded_frame(struct FrameListData *cx_frame)
 {
     av_freep(&cx_frame->buf);
+    /* av_freep() tolerates a NULL pointer, so the alpha buffer needs no guard */
+    av_freep(&cx_frame->buf_alpha);
     av_freep(&cx_frame);
 }
 
@@ -203,28 +265,189 @@ static av_cold int vp8_free(AVCodecContext *avctx)
     VP8Context *ctx = avctx->priv_data;
 
     vpx_codec_destroy(&ctx->encoder);
+    if (ctx->is_alpha)
+        vpx_codec_destroy(&ctx->encoder_alpha);
     av_freep(&ctx->twopass_stats.buf);
     av_freep(&avctx->stats_out);
     free_frame_list(ctx->coded_frame_list);
     return 0;
 }
 
+#if CONFIG_LIBVPX_VP9_ENCODER
+static int set_pix_fmt(AVCodecContext *avctx, vpx_codec_caps_t codec_caps,
+                       struct vpx_codec_enc_cfg *enccfg, vpx_codec_flags_t *flags,
+                       vpx_img_fmt_t *img_fmt)
+{ /* Map avctx->pix_fmt to g_profile, bit depth, *img_fmt and *flags; 0 on success, AVERROR_INVALIDDATA otherwise. */
+    VP8Context av_unused *ctx = avctx->priv_data; /* only written in the GBRP cases, hence av_unused */
+#ifdef VPX_IMG_FMT_HIGHBITDEPTH
+    enccfg->g_bit_depth = enccfg->g_input_bit_depth = 8; /* default; the 10/12-bit cases below override it */
+#endif
+    switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_YUV420P:
+        enccfg->g_profile = 0;
+        *img_fmt = VPX_IMG_FMT_I420;
+        return 0;
+    case AV_PIX_FMT_YUV422P:
+        enccfg->g_profile = 1;
+        *img_fmt = VPX_IMG_FMT_I422;
+        return 0;
+#if VPX_IMAGE_ABI_VERSION >= 3
+    case AV_PIX_FMT_YUV440P:
+        enccfg->g_profile = 1;
+        *img_fmt = VPX_IMG_FMT_I440;
+        return 0;
+    case AV_PIX_FMT_GBRP:
+        ctx->vpx_cs = VPX_CS_SRGB; /* fall through: GBRP is then configured like YUV444P */
+#endif
+    case AV_PIX_FMT_YUV444P:
+        enccfg->g_profile = 1;
+        *img_fmt = VPX_IMG_FMT_I444;
+        return 0;
+#ifdef VPX_IMG_FMT_HIGHBITDEPTH
+    case AV_PIX_FMT_YUV420P10:
+    case AV_PIX_FMT_YUV420P12:
+        if (codec_caps & VPX_CODEC_CAP_HIGHBITDEPTH) {
+            enccfg->g_bit_depth = enccfg->g_input_bit_depth =
+                avctx->pix_fmt == AV_PIX_FMT_YUV420P10 ? 10 : 12;
+            enccfg->g_profile = 2;
+            *img_fmt = VPX_IMG_FMT_I42016;
+            *flags |= VPX_CODEC_USE_HIGHBITDEPTH;
+            return 0;
+        }
+        break; /* capability missing: reported as unsupported after the switch */
+    case AV_PIX_FMT_YUV422P10:
+    case AV_PIX_FMT_YUV422P12:
+        if (codec_caps & VPX_CODEC_CAP_HIGHBITDEPTH) {
+            enccfg->g_bit_depth = enccfg->g_input_bit_depth =
+                avctx->pix_fmt == AV_PIX_FMT_YUV422P10 ? 10 : 12;
+            enccfg->g_profile = 3;
+            *img_fmt = VPX_IMG_FMT_I42216;
+            *flags |= VPX_CODEC_USE_HIGHBITDEPTH;
+            return 0;
+        }
+        break;
+#if VPX_IMAGE_ABI_VERSION >= 3
+    case AV_PIX_FMT_YUV440P10:
+    case AV_PIX_FMT_YUV440P12:
+        if (codec_caps & VPX_CODEC_CAP_HIGHBITDEPTH) {
+            enccfg->g_bit_depth = enccfg->g_input_bit_depth =
+                avctx->pix_fmt == AV_PIX_FMT_YUV440P10 ? 10 : 12;
+            enccfg->g_profile = 3;
+            *img_fmt = VPX_IMG_FMT_I44016;
+            *flags |= VPX_CODEC_USE_HIGHBITDEPTH;
+            return 0;
+        }
+        break;
+    case AV_PIX_FMT_GBRP10:
+    case AV_PIX_FMT_GBRP12:
+        ctx->vpx_cs = VPX_CS_SRGB; /* fall through: shares the 4:4:4 high bit depth setup */
+#endif
+    case AV_PIX_FMT_YUV444P10:
+    case AV_PIX_FMT_YUV444P12:
+        if (codec_caps & VPX_CODEC_CAP_HIGHBITDEPTH) {
+            enccfg->g_bit_depth = enccfg->g_input_bit_depth =
+                avctx->pix_fmt == AV_PIX_FMT_YUV444P10 ||
+                avctx->pix_fmt == AV_PIX_FMT_GBRP10 ? 10 : 12;
+            enccfg->g_profile = 3;
+            *img_fmt = VPX_IMG_FMT_I44416;
+            *flags |= VPX_CODEC_USE_HIGHBITDEPTH;
+            return 0;
+        }
+        break;
+#endif
+    default:
+        break;
+    }
+    av_log(avctx, AV_LOG_ERROR, "Unsupported pixel format.\n"); /* reached by every case that did not return 0 */
+    return AVERROR_INVALIDDATA;
+}
+
+#if VPX_ENCODER_ABI_VERSION > 8
+static void set_colorspace(AVCodecContext *avctx)
+{ /* Issue VP9E_SET_COLOR_SPACE using ctx->vpx_cs if non-zero, else a value mapped from avctx->colorspace. */
+    enum vpx_color_space vpx_cs;
+    VP8Context *ctx = avctx->priv_data;
+
+    if (ctx->vpx_cs) { /* non-zero value stored earlier (set_pix_fmt() stores VPX_CS_SRGB for GBRP input) */
+        vpx_cs = ctx->vpx_cs;
+    } else {
+        switch (avctx->colorspace) {
+        case AVCOL_SPC_RGB:         vpx_cs = VPX_CS_SRGB;      break;
+        case AVCOL_SPC_BT709:       vpx_cs = VPX_CS_BT_709;    break;
+        case AVCOL_SPC_UNSPECIFIED: vpx_cs = VPX_CS_UNKNOWN;   break;
+        case AVCOL_SPC_RESERVED:    vpx_cs = VPX_CS_RESERVED;  break;
+        case AVCOL_SPC_BT470BG:     vpx_cs = VPX_CS_BT_601;    break;
+        case AVCOL_SPC_SMPTE170M:   vpx_cs = VPX_CS_SMPTE_170; break;
+        case AVCOL_SPC_SMPTE240M:   vpx_cs = VPX_CS_SMPTE_240; break;
+        case AVCOL_SPC_BT2020_NCL:  vpx_cs = VPX_CS_BT_2020;   break;
+        default:
+            av_log(avctx, AV_LOG_WARNING, "Unsupported colorspace (%d)\n",
+                   avctx->colorspace);
+            return; /* warning only: no control is sent for unmapped values */
+        }
+    }
+    codecctl_int(avctx, VP9E_SET_COLOR_SPACE, vpx_cs);
+}
+#endif
+
+#if VPX_ENCODER_ABI_VERSION >= 11
+static void set_color_range(AVCodecContext *avctx)
+{ /* Issue VP9E_SET_COLOR_RANGE with a value mapped from avctx->color_range. */
+    enum vpx_color_range vpx_cr;
+    switch (avctx->color_range) {
+    case AVCOL_RANGE_UNSPECIFIED: /* shares the MPEG (studio range) mapping below */
+    case AVCOL_RANGE_MPEG:       vpx_cr = VPX_CR_STUDIO_RANGE; break;
+    case AVCOL_RANGE_JPEG:       vpx_cr = VPX_CR_FULL_RANGE;   break;
+    default:
+        av_log(avctx, AV_LOG_WARNING, "Unsupported color range (%d)\n",
+               avctx->color_range);
+        return; /* warning only: no control is sent for unmapped values */
+    }
+
+    codecctl_int(avctx, VP9E_SET_COLOR_RANGE, vpx_cr);
+}
+#endif
+#endif
+
 static av_cold int vpx_init(AVCodecContext *avctx,
                             const struct vpx_codec_iface *iface)
 {
     VP8Context *ctx = avctx->priv_data;
     struct vpx_codec_enc_cfg enccfg = { 0 };
+    struct vpx_codec_enc_cfg enccfg_alpha;
+    vpx_codec_flags_t flags = (avctx->flags & AV_CODEC_FLAG_PSNR) ? VPX_CODEC_USE_PSNR : 0;
     AVCPBProperties *cpb_props;
     int res;
+    vpx_img_fmt_t img_fmt = VPX_IMG_FMT_I420;
+#if CONFIG_LIBVPX_VP9_ENCODER
+    vpx_codec_caps_t codec_caps = vpx_codec_get_caps(iface);
+#endif
 
     av_log(avctx, AV_LOG_INFO, "%s\n", vpx_codec_version_str());
     av_log(avctx, AV_LOG_VERBOSE, "%s\n", vpx_codec_build_config());
 
+    if (avctx->pix_fmt == AV_PIX_FMT_YUVA420P)
+        ctx->is_alpha = 1;
+
     if ((res = vpx_codec_enc_config_default(iface, &enccfg, 0)) != VPX_CODEC_OK) {
         av_log(avctx, AV_LOG_ERROR, "Failed to get config: %s\n",
                vpx_codec_err_to_string(res));
         return AVERROR(EINVAL);
     }
+
+#if CONFIG_LIBVPX_VP9_ENCODER
+    if (avctx->codec_id == AV_CODEC_ID_VP9) {
+        if (set_pix_fmt(avctx, codec_caps, &enccfg, &flags, &img_fmt))
+            return AVERROR(EINVAL);
+    }
+#endif
+
+    if(!avctx->bit_rate)
+        if(avctx->rc_max_rate || avctx->rc_buffer_size || avctx->rc_initial_buffer_occupancy) {
+            av_log( avctx, AV_LOG_ERROR, "Rate control parameters set without a bitrate\n");
+            return AVERROR(EINVAL);
+        }
+
     dump_enc_cfg(avctx, &enccfg);
 
     enccfg.g_w            = avctx->width;
@@ -232,9 +455,7 @@ static av_cold int vpx_init(AVCodecContext *avctx,
     enccfg.g_timebase.num = avctx->time_base.num;
     enccfg.g_timebase.den = avctx->time_base.den;
     enccfg.g_threads      = avctx->thread_count;
-
-    if (ctx->lag_in_frames >= 0)
-        enccfg.g_lag_in_frames = ctx->lag_in_frames;
+    enccfg.g_lag_in_frames= ctx->lag_in_frames;
 
     if (avctx->flags & AV_CODEC_FLAG_PASS1)
         enccfg.g_pass = VPX_RC_FIRST_PASS;
@@ -243,22 +464,56 @@ static av_cold int vpx_init(AVCodecContext *avctx,
     else
         enccfg.g_pass = VPX_RC_ONE_PASS;
 
-    if (!avctx->bit_rate)
-        avctx->bit_rate = enccfg.rc_target_bitrate * 1000;
-    else
+    if (avctx->rc_min_rate == avctx->rc_max_rate &&
+        avctx->rc_min_rate == avctx->bit_rate && avctx->bit_rate) {
+        enccfg.rc_end_usage = VPX_CBR;
+    } else if (ctx->crf >= 0) {
+        enccfg.rc_end_usage = VPX_CQ;
+#if CONFIG_LIBVPX_VP9_ENCODER
+        if (!avctx->bit_rate && avctx->codec_id == AV_CODEC_ID_VP9)
+            enccfg.rc_end_usage = VPX_Q;
+#endif
+    }
+
+    if (avctx->bit_rate) {
         enccfg.rc_target_bitrate = av_rescale_rnd(avctx->bit_rate, 1, 1000,
-                                              AV_ROUND_NEAR_INF);
+                                                  AV_ROUND_NEAR_INF);
+#if CONFIG_LIBVPX_VP9_ENCODER
+    } else if (enccfg.rc_end_usage == VPX_Q) {
+#endif
+    } else {
+        if (enccfg.rc_end_usage == VPX_CQ) {
+            enccfg.rc_target_bitrate = 1000000;
+        } else {
+            avctx->bit_rate = enccfg.rc_target_bitrate * 1000;
+            av_log(avctx, AV_LOG_WARNING,
+                   "Neither bitrate nor constrained quality specified, using default bitrate of %dkbit/sec\n",
+                   enccfg.rc_target_bitrate);
+        }
+    }
 
-    if (ctx->crf)
-        enccfg.rc_end_usage = VPX_CQ;
-    else if (avctx->rc_min_rate == avctx->rc_max_rate &&
-             avctx->rc_min_rate == avctx->bit_rate)
-        enccfg.rc_end_usage = VPX_CBR;
+    if (avctx->codec_id == AV_CODEC_ID_VP9 && ctx->lossless == 1) {
+        enccfg.rc_min_quantizer =
+        enccfg.rc_max_quantizer = 0;
+    } else {
+        if (avctx->qmin >= 0)
+            enccfg.rc_min_quantizer = avctx->qmin;
+        if (avctx->qmax >= 0)
+            enccfg.rc_max_quantizer = avctx->qmax;
+    }
 
-    if (avctx->qmin > 0)
-        enccfg.rc_min_quantizer = avctx->qmin;
-    if (avctx->qmax > 0)
-        enccfg.rc_max_quantizer = avctx->qmax;
+    if (enccfg.rc_end_usage == VPX_CQ
+#if CONFIG_LIBVPX_VP9_ENCODER
+        || enccfg.rc_end_usage == VPX_Q
+#endif
+       ) {
+        if (ctx->crf < enccfg.rc_min_quantizer || ctx->crf > enccfg.rc_max_quantizer) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "CQ level %d must be between minimum and maximum quantizer value (%d-%d)\n",
+                   ctx->crf, enccfg.rc_min_quantizer, enccfg.rc_max_quantizer);
+            return AVERROR(EINVAL);
+        }
+    }
 
 #if FF_API_PRIVATE_OPT
 FF_DISABLE_DEPRECATION_WARNINGS
@@ -269,9 +524,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
     enccfg.rc_dropframe_thresh = ctx->drop_threshold;
 
     //0-100 (0 => CBR, 100 => VBR)
-    enccfg.rc_2pass_vbr_bias_pct           = round(avctx->qcompress * 100);
-    enccfg.rc_2pass_vbr_minsection_pct     =
-        avctx->rc_min_rate * 100LL / avctx->bit_rate;
+    enccfg.rc_2pass_vbr_bias_pct           = lrint(avctx->qcompress * 100);
+    if (avctx->bit_rate)
+        enccfg.rc_2pass_vbr_minsection_pct =
+            avctx->rc_min_rate * 100LL / avctx->bit_rate;
     if (avctx->rc_max_rate)
         enccfg.rc_2pass_vbr_maxsection_pct =
             avctx->rc_max_rate * 100LL / avctx->bit_rate;
@@ -283,6 +539,19 @@ FF_ENABLE_DEPRECATION_WARNINGS
         enccfg.rc_buf_initial_sz =
             avctx->rc_initial_buffer_occupancy * 1000LL / avctx->bit_rate;
     enccfg.rc_buf_optimal_sz     = enccfg.rc_buf_sz * 5 / 6;
+#if FF_API_MPV_OPT
+    FF_DISABLE_DEPRECATION_WARNINGS
+    if (avctx->rc_buffer_aggressivity != 1.0) {
+        av_log(avctx, AV_LOG_WARNING, "The rc_buffer_aggressivity option is "
+               "deprecated, use the undershoot-pct private option instead.\n");
+        enccfg.rc_undershoot_pct = lrint(avctx->rc_buffer_aggressivity * 100);
+    }
+    FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+    if (ctx->rc_undershoot_pct >= 0)
+        enccfg.rc_undershoot_pct = ctx->rc_undershoot_pct;
+    if (ctx->rc_overshoot_pct >= 0)
+        enccfg.rc_overshoot_pct = ctx->rc_overshoot_pct;
 
     //_enc_init() will balk if kf_min_dist differs from max w/VPX_KF_AUTO
     if (avctx->keyint_min >= 0 && avctx->keyint_min == avctx->gop_size)
@@ -304,8 +573,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
         ret = av_reallocp(&ctx->twopass_stats.buf, ctx->twopass_stats.sz);
         if (ret < 0) {
             av_log(avctx, AV_LOG_ERROR,
-                   "Stat buffer alloc (%zu bytes) failed\n",
+                   "Stat buffer alloc (%"SIZE_SPECIFIER" bytes) failed\n",
                    ctx->twopass_stats.sz);
+            ctx->twopass_stats.sz = 0;
             return ret;
         }
         decode_size = av_base64_decode(ctx->twopass_stats.buf, avctx->stats_in,
@@ -324,25 +594,31 @@ FF_ENABLE_DEPRECATION_WARNINGS
        quality. */
     if (avctx->profile != FF_PROFILE_UNKNOWN)
         enccfg.g_profile = avctx->profile;
-    else if (avctx->pix_fmt == AV_PIX_FMT_YUV420P)
-        avctx->profile = enccfg.g_profile = FF_PROFILE_VP9_0;
-    else
-        avctx->profile = enccfg.g_profile = FF_PROFILE_VP9_1;
 
-    enccfg.g_error_resilient = ctx->error_resilient;
+    enccfg.g_error_resilient = ctx->error_resilient || ctx->flags & VP8F_ERROR_RESILIENT;
 
     dump_enc_cfg(avctx, &enccfg);
     /* Construct Encoder Context */
-    res = vpx_codec_enc_init(&ctx->encoder, iface, &enccfg, 0);
+    res = vpx_codec_enc_init(&ctx->encoder, iface, &enccfg, flags);
     if (res != VPX_CODEC_OK) {
         log_encoder_error(avctx, "Failed to initialize encoder");
         return AVERROR(EINVAL);
     }
 
+    if (ctx->is_alpha) {
+        enccfg_alpha = enccfg;
+        res = vpx_codec_enc_init(&ctx->encoder_alpha, iface, &enccfg_alpha, flags);
+        if (res != VPX_CODEC_OK) {
+            log_encoder_error(avctx, "Failed to initialize alpha encoder");
+            return AVERROR(EINVAL);
+        }
+    }
+
     //codec control failures are currently treated only as warnings
     av_log(avctx, AV_LOG_DEBUG, "vpx_codec_control\n");
-    if (ctx->cpu_used != INT_MIN)
-        codecctl_int(avctx, VP8E_SET_CPUUSED,          ctx->cpu_used);
+    codecctl_int(avctx, VP8E_SET_CPUUSED,          ctx->cpu_used);
+    if (ctx->flags & VP8F_AUTO_ALT_REF)
+        ctx->auto_alt_ref = 1;
     if (ctx->auto_alt_ref >= 0)
         codecctl_int(avctx, VP8E_SET_ENABLEAUTOALTREF, ctx->auto_alt_ref);
     if (ctx->arnr_max_frames >= 0)
@@ -351,8 +627,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
         codecctl_int(avctx, VP8E_SET_ARNR_STRENGTH,    ctx->arnr_strength);
     if (ctx->arnr_type >= 0)
         codecctl_int(avctx, VP8E_SET_ARNR_TYPE,        ctx->arnr_type);
+    if (ctx->tune >= 0)
+        codecctl_int(avctx, VP8E_SET_TUNING,           ctx->tune);
 
-    if (CONFIG_LIBVPX_VP8_ENCODER && iface == &vpx_codec_vp8_cx_algo) {
+    if (CONFIG_LIBVPX_VP8_ENCODER && avctx->codec_id == AV_CODEC_ID_VP8) {
 #if FF_API_PRIVATE_OPT
 FF_DISABLE_DEPRECATION_WARNINGS
         if (avctx->noise_reduction)
@@ -372,11 +650,45 @@ FF_ENABLE_DEPRECATION_WARNINGS
     FF_ENABLE_DEPRECATION_WARNINGS
 #endif
     codecctl_int(avctx, VP8E_SET_STATIC_THRESHOLD,  ctx->static_thresh);
-    codecctl_int(avctx, VP8E_SET_CQ_LEVEL,          ctx->crf);
+    if (ctx->crf >= 0)
+        codecctl_int(avctx, VP8E_SET_CQ_LEVEL,          ctx->crf);
+    if (ctx->max_intra_rate >= 0)
+        codecctl_int(avctx, VP8E_SET_MAX_INTRA_BITRATE_PCT, ctx->max_intra_rate);
+
+#if CONFIG_LIBVPX_VP9_ENCODER
+    if (avctx->codec_id == AV_CODEC_ID_VP9) {
+        if (ctx->lossless >= 0)
+            codecctl_int(avctx, VP9E_SET_LOSSLESS, ctx->lossless);
+        if (ctx->tile_columns >= 0)
+            codecctl_int(avctx, VP9E_SET_TILE_COLUMNS, ctx->tile_columns);
+        if (ctx->tile_rows >= 0)
+            codecctl_int(avctx, VP9E_SET_TILE_ROWS, ctx->tile_rows);
+        if (ctx->frame_parallel >= 0)
+            codecctl_int(avctx, VP9E_SET_FRAME_PARALLEL_DECODING, ctx->frame_parallel);
+        if (ctx->aq_mode >= 0)
+            codecctl_int(avctx, VP9E_SET_AQ_MODE, ctx->aq_mode);
+#if VPX_ENCODER_ABI_VERSION > 8
+        set_colorspace(avctx);
+#endif
+#if VPX_ENCODER_ABI_VERSION >= 11
+        set_color_range(avctx);
+#endif
+    }
+#endif
+
+    av_log(avctx, AV_LOG_DEBUG, "Using deadline: %d\n", ctx->deadline);
 
     //provide dummy value to initialize wrapper, values will be updated each _encode()
-    vpx_img_wrap(&ctx->rawimg, ff_vpx_pixfmt_to_imgfmt(avctx->pix_fmt),
-                 avctx->width, avctx->height, 1, (unsigned char *)1);
+    vpx_img_wrap(&ctx->rawimg, img_fmt, avctx->width, avctx->height, 1,
+                 (unsigned char*)1);
+#if CONFIG_LIBVPX_VP9_ENCODER && defined(VPX_IMG_FMT_HIGHBITDEPTH)
+    if (avctx->codec_id == AV_CODEC_ID_VP9 && (codec_caps & VPX_CODEC_CAP_HIGHBITDEPTH))
+        ctx->rawimg.bit_depth = enccfg.g_bit_depth;
+#endif
+
+    if (ctx->is_alpha)
+        vpx_img_wrap(&ctx->rawimg_alpha, VPX_IMG_FMT_I420, avctx->width, avctx->height, 1,
+                     (unsigned char*)1);
 
     cpb_props = ff_add_cpb_side_data(avctx);
     if (!cpb_props)
@@ -394,13 +706,38 @@ FF_ENABLE_DEPRECATION_WARNINGS
 }
 
 static inline void cx_pktcpy(struct FrameListData *dst,
-                             const struct vpx_codec_cx_pkt *src)
+                             const struct vpx_codec_cx_pkt *src,
+                             const struct vpx_codec_cx_pkt *src_alpha, /* may be NULL */
+                             VP8Context *ctx) /* source of frame_number and of the pending SSE */
 {
     dst->pts      = src->data.frame.pts;
     dst->duration = src->data.frame.duration;
     dst->flags    = src->data.frame.flags;
     dst->sz       = src->data.frame.sz;
     dst->buf      = src->data.frame.buf;
+    dst->have_sse = 0; /* stays 0 for invisible frames */
+    /* For alt-ref frame, don't store PSNR or increment frame_number */
+    if (!(dst->flags & VPX_FRAME_IS_INVISIBLE)) {
+        dst->frame_number = ++ctx->frame_number;
+        dst->have_sse = ctx->have_sse;
+        if (ctx->have_sse) {
+            /* associate last-seen SSE to the frame. */
+            /* Transfers ownership from ctx to dst. */
+            /* WARNING! This makes the assumption that PSNR_PKT comes
+               just before the frame it refers to! */
+            memcpy(dst->sse, ctx->sse, sizeof(dst->sse));
+            ctx->have_sse = 0;
+        }
+    } else {
+        dst->frame_number = -1;   /* sanity marker */
+    }
+    if (src_alpha) { /* like dst->buf, the alpha pointer is only borrowed from the packet, not copied */
+        dst->buf_alpha = src_alpha->data.frame.buf;
+        dst->sz_alpha = src_alpha->data.frame.sz;
+    } else { /* no alpha packet accompanies this frame */
+        dst->buf_alpha = NULL;
+        dst->sz_alpha = 0;
+    }
 }
 
 /**
@@ -413,8 +750,10 @@ static inline void cx_pktcpy(struct FrameListData *dst,
 static int storeframe(AVCodecContext *avctx, struct FrameListData *cx_frame,
                       AVPacket *pkt)
 {
-    int ret = ff_alloc_packet(pkt, cx_frame->sz);
+    int ret = ff_alloc_packet2(avctx, pkt, cx_frame->sz, 0);
+    uint8_t *side_data;
     if (ret >= 0) {
+        int pict_type;
         memcpy(pkt->data, cx_frame->buf, pkt->size);
         pkt->pts = pkt->dts = cx_frame->pts;
 #if FF_API_CODED_FRAME
@@ -425,22 +764,54 @@ FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
         if (!!(cx_frame->flags & VPX_FRAME_IS_KEY)) {
+            pict_type = AV_PICTURE_TYPE_I;
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
-            avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
+            avctx->coded_frame->pict_type = pict_type;
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
             pkt->flags |= AV_PKT_FLAG_KEY;
         } else {
+            pict_type = AV_PICTURE_TYPE_P;
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
-            avctx->coded_frame->pict_type = AV_PICTURE_TYPE_P;
+            avctx->coded_frame->pict_type = pict_type;
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
         }
+
+        ff_side_data_set_encoder_stats(pkt, 0, cx_frame->sse + 1,
+                                       cx_frame->have_sse ? 3 : 0, pict_type);
+
+        if (cx_frame->have_sse) {
+            int i;
+            /* Beware of the Y/U/V/all order! */
+#if FF_API_CODED_FRAME
+FF_DISABLE_DEPRECATION_WARNINGS
+            avctx->coded_frame->error[0] = cx_frame->sse[1];
+            avctx->coded_frame->error[1] = cx_frame->sse[2];
+            avctx->coded_frame->error[2] = cx_frame->sse[3];
+            avctx->coded_frame->error[3] = 0;    // alpha
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+            for (i = 0; i < 3; ++i) {
+                avctx->error[i] += cx_frame->sse[i + 1];
+            }
+            cx_frame->have_sse = 0;
+        }
+        if (cx_frame->sz_alpha > 0) {
+            side_data = av_packet_new_side_data(pkt,
+                                                AV_PKT_DATA_MATROSKA_BLOCKADDITIONAL,
+                                                cx_frame->sz_alpha + 8);
+            if (!side_data) {
+                /* pkt is caller-owned: release its payload, never the struct itself */
+                av_packet_unref(pkt);
+                return AVERROR(ENOMEM);
+            }
+            AV_WB64(side_data, 1); /* 8-byte big-endian value 1, followed by the alpha payload */
+            memcpy(side_data + 8, cx_frame->buf_alpha, cx_frame->sz_alpha);
+        }
     } else {
-        av_log(avctx, AV_LOG_ERROR,
-               "Error getting output packet of size %zu.\n", cx_frame->sz);
         return ret;
     }
     return pkt->size;
@@ -458,7 +829,9 @@ static int queue_frames(AVCodecContext *avctx, AVPacket *pkt_out)
 {
     VP8Context *ctx = avctx->priv_data;
     const struct vpx_codec_cx_pkt *pkt;
+    const struct vpx_codec_cx_pkt *pkt_alpha = NULL;
     const void *iter = NULL;
+    const void *iter_alpha = NULL;
     int size = 0;
 
     if (ctx->coded_frame_list) {
@@ -473,7 +846,9 @@ static int queue_frames(AVCodecContext *avctx, AVPacket *pkt_out)
 
     /* consume all available output from the encoder before returning. buffers
        are only good through the next vpx_codec call */
-    while ((pkt = vpx_codec_get_cx_data(&ctx->encoder, &iter))) {
+    while ((pkt = vpx_codec_get_cx_data(&ctx->encoder, &iter)) &&
+           (!ctx->is_alpha ||
+            (ctx->is_alpha && (pkt_alpha = vpx_codec_get_cx_data(&ctx->encoder_alpha, &iter_alpha))))) {
         switch (pkt->kind) {
         case VPX_CODEC_CX_FRAME_PKT:
             if (!size) {
@@ -481,8 +856,8 @@ static int queue_frames(AVCodecContext *avctx, AVPacket *pkt_out)
 
                 /* avoid storing the frame when the list is empty and we haven't yet
                    provided a frame for output */
-                assert(!ctx->coded_frame_list);
-                cx_pktcpy(&cx_frame, pkt);
+                av_assert0(!ctx->coded_frame_list);
+                cx_pktcpy(&cx_frame, pkt, pkt_alpha, ctx);
                 size = storeframe(avctx, &cx_frame, pkt_out);
                 if (size < 0)
                     return size;
@@ -495,17 +870,28 @@ static int queue_frames(AVCodecContext *avctx, AVPacket *pkt_out)
                            "Frame queue element alloc failed\n");
                     return AVERROR(ENOMEM);
                 }
-                cx_pktcpy(cx_frame, pkt);
+                cx_pktcpy(cx_frame, pkt, pkt_alpha, ctx);
                 cx_frame->buf = av_malloc(cx_frame->sz);
 
                 if (!cx_frame->buf) {
                     av_log(avctx, AV_LOG_ERROR,
-                           "Data buffer alloc (%zu bytes) failed\n",
+                           "Data buffer alloc (%"SIZE_SPECIFIER" bytes) failed\n",
                            cx_frame->sz);
                     av_freep(&cx_frame);
                     return AVERROR(ENOMEM);
                 }
                 memcpy(cx_frame->buf, pkt->data.frame.buf, pkt->data.frame.sz);
+                if (ctx->is_alpha) { /* keep a private copy of the alpha packet as well */
+                    cx_frame->buf_alpha = av_malloc(cx_frame->sz_alpha);
+                    if (!cx_frame->buf_alpha) {
+                        av_log(avctx, AV_LOG_ERROR,
+                               "Data buffer alloc (%"SIZE_SPECIFIER" bytes) failed\n",
+                               cx_frame->sz_alpha);
+                        free_coded_frame(cx_frame); /* also releases the already copied cx_frame->buf */
+                        return AVERROR(ENOMEM);
+                    }
+                    memcpy(cx_frame->buf_alpha, pkt_alpha->data.frame.buf, pkt_alpha->data.frame.sz);
+                }
                 coded_frame_add(&ctx->coded_frame_list, cx_frame);
             }
             break;
@@ -524,7 +910,14 @@ static int queue_frames(AVCodecContext *avctx, AVPacket *pkt_out)
             stats->sz += pkt->data.twopass_stats.sz;
             break;
         }
-        case VPX_CODEC_PSNR_PKT: //FIXME add support for AV_CODEC_FLAG_PSNR
+        case VPX_CODEC_PSNR_PKT:
+            av_assert0(!ctx->have_sse);
+            ctx->sse[0] = pkt->data.psnr.sse[0];
+            ctx->sse[1] = pkt->data.psnr.sse[1];
+            ctx->sse[2] = pkt->data.psnr.sse[2];
+            ctx->sse[3] = pkt->data.psnr.sse[3];
+            ctx->have_sse = 1;
+            break;
         case VPX_CODEC_CUSTOM_PKT:
             //ignore unsupported/unrecognized packet types
             break;
@@ -539,6 +932,7 @@ static int vp8_encode(AVCodecContext *avctx, AVPacket *pkt,
 {
     VP8Context *ctx = avctx->priv_data;
     struct vpx_image *rawimg = NULL;
+    struct vpx_image *rawimg_alpha = NULL;
     int64_t timestamp = 0;
     int res, coded_size;
     vpx_enc_frame_flags_t flags = 0;
@@ -551,6 +945,25 @@ static int vp8_encode(AVCodecContext *avctx, AVPacket *pkt,
         rawimg->stride[VPX_PLANE_Y] = frame->linesize[0];
         rawimg->stride[VPX_PLANE_U] = frame->linesize[1];
         rawimg->stride[VPX_PLANE_V] = frame->linesize[2];
+        if (ctx->is_alpha) {
+            uint8_t *u_plane, *v_plane;
+            rawimg_alpha = &ctx->rawimg_alpha;
+            rawimg_alpha->planes[VPX_PLANE_Y] = frame->data[3];
+            u_plane = av_malloc(frame->linesize[1] * frame->height);
+            v_plane = av_malloc(frame->linesize[2] * frame->height);
+            if (!u_plane || !v_plane) {
+                av_free(u_plane);
+                av_free(v_plane);
+                return AVERROR(ENOMEM);
+            }
+            memset(u_plane, 0x80, frame->linesize[1] * frame->height);
+            rawimg_alpha->planes[VPX_PLANE_U] = u_plane;
+            memset(v_plane, 0x80, frame->linesize[2] * frame->height);
+            rawimg_alpha->planes[VPX_PLANE_V] = v_plane;
+            rawimg_alpha->stride[VPX_PLANE_Y] = frame->linesize[0];
+            rawimg_alpha->stride[VPX_PLANE_U] = frame->linesize[1];
+            rawimg_alpha->stride[VPX_PLANE_V] = frame->linesize[2];
+        }
         timestamp                   = frame->pts;
         if (frame->pict_type == AV_PICTURE_TYPE_I)
             flags |= VPX_EFLAG_FORCE_KF;
@@ -562,6 +975,16 @@ static int vp8_encode(AVCodecContext *avctx, AVPacket *pkt,
         log_encoder_error(avctx, "Error encoding frame");
         return AVERROR_INVALIDDATA;
     }
+
+    if (ctx->is_alpha) {
+        res = vpx_codec_encode(&ctx->encoder_alpha, rawimg_alpha, timestamp,
+                               avctx->ticks_per_frame, flags, ctx->deadline);
+        if (res != VPX_CODEC_OK) {
+            log_encoder_error(avctx, "Error encoding alpha frame");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
     coded_size = queue_frames(avctx, pkt);
 
     if (!frame && avctx->flags & AV_CODEC_FLAG_PASS1) {
@@ -577,42 +1000,95 @@ static int vp8_encode(AVCodecContext *avctx, AVPacket *pkt,
                          ctx->twopass_stats.sz);
     }
 
+    if (rawimg_alpha) {
+        av_freep(&rawimg_alpha->planes[VPX_PLANE_U]);
+        av_freep(&rawimg_alpha->planes[VPX_PLANE_V]);
+    }
+
     *got_packet = !!coded_size;
     return 0;
 }
 
 #define OFFSET(x) offsetof(VP8Context, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
-static const AVOption options[] = {
-    { "cpu-used",        "Quality/Speed ratio modifier",           OFFSET(cpu_used),        AV_OPT_TYPE_INT, {.i64 = 1}, INT_MIN, INT_MAX, VE},
-    { "auto-alt-ref",    "Enable use of alternate reference "
-                         "frames (2-pass only)",                   OFFSET(auto_alt_ref),    AV_OPT_TYPE_INT, {.i64 = -1},      -1,      1,       VE},
-    { "lag-in-frames",   "Number of frames to look ahead for "
-                         "alternate reference frame selection",    OFFSET(lag_in_frames),   AV_OPT_TYPE_INT, {.i64 = -1},      -1,      INT_MAX, VE},
-    { "arnr-maxframes",  "altref noise reduction max frame count", OFFSET(arnr_max_frames), AV_OPT_TYPE_INT, {.i64 = -1},      -1,      INT_MAX, VE},
-    { "arnr-strength",   "altref noise reduction filter strength", OFFSET(arnr_strength),   AV_OPT_TYPE_INT, {.i64 = -1},      -1,      INT_MAX, VE},
-    { "arnr-type",       "altref noise reduction filter type",     OFFSET(arnr_type),       AV_OPT_TYPE_INT, {.i64 = -1},      -1,      INT_MAX, VE, "arnr_type"},
-    { "backward",        NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 1}, 0, 0, VE, "arnr_type" },
-    { "forward",         NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 2}, 0, 0, VE, "arnr_type" },
-    { "centered",        NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 3}, 0, 0, VE, "arnr_type" },
-    { "deadline",        "Time to spend encoding, in microseconds.", OFFSET(deadline),      AV_OPT_TYPE_INT, {.i64 = VPX_DL_GOOD_QUALITY}, INT_MIN, INT_MAX, VE, "quality"},
-    { "best",            NULL, 0, AV_OPT_TYPE_CONST, {.i64 = VPX_DL_BEST_QUALITY}, 0, 0, VE, "quality"},
-    { "good",            NULL, 0, AV_OPT_TYPE_CONST, {.i64 = VPX_DL_GOOD_QUALITY}, 0, 0, VE, "quality"},
-    { "realtime",        NULL, 0, AV_OPT_TYPE_CONST, {.i64 = VPX_DL_REALTIME},     0, 0, VE, "quality"},
-    { "error-resilient", "Error resilience configuration", OFFSET(error_resilient), AV_OPT_TYPE_FLAGS, {.i64 = 0}, INT_MIN, INT_MAX, VE, "er"},
-#ifdef VPX_ERROR_RESILIENT_DEFAULT
-    { "default",         "Improve resiliency against losses of whole frames", 0, AV_OPT_TYPE_CONST, {.i64 = VPX_ERROR_RESILIENT_DEFAULT}, 0, 0, VE, "er"},
-    { "partitions",      "The frame partitions are independently decodable "
-                         "by the bool decoder, meaning that partitions can be decoded even "
-                         "though earlier partitions have been lost. Note that intra predicition"
-                         " is still done over the partition boundary.",       0, AV_OPT_TYPE_CONST, {.i64 = VPX_ERROR_RESILIENT_PARTITIONS}, 0, 0, VE, "er"},
-#endif
-    { "crf",              "Select the quality for constant quality mode", offsetof(VP8Context, crf), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 63, VE },
-    { "static-thresh",    "A change threshold on blocks below which they will be skipped by the encoder", OFFSET(static_thresh), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },
-    { "drop-threshold",   "Frame drop threshold", offsetof(VP8Context, drop_threshold), AV_OPT_TYPE_INT, {.i64 = 0 }, INT_MIN, INT_MAX, VE },
-    { "noise-sensitivity", "Noise sensitivity", OFFSET(noise_sensitivity), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, 4, VE},
+
+#ifndef VPX_ERROR_RESILIENT_DEFAULT
+#define VPX_ERROR_RESILIENT_DEFAULT 1
+#define VPX_ERROR_RESILIENT_PARTITIONS 2
+#endif
+
+#define COMMON_OPTIONS \
+    { "auto-alt-ref",    "Enable use of alternate reference " \
+                         "frames (2-pass only)",                   OFFSET(auto_alt_ref),    AV_OPT_TYPE_BOOL, {.i64 = -1},     -1,      1,       VE}, \
+    { "lag-in-frames",   "Number of frames to look ahead for " \
+                         "alternate reference frame selection",    OFFSET(lag_in_frames),   AV_OPT_TYPE_INT, {.i64 = -1},      -1,      INT_MAX, VE}, \
+    { "arnr-maxframes",  "altref noise reduction max frame count", OFFSET(arnr_max_frames), AV_OPT_TYPE_INT, {.i64 = -1},      -1,      INT_MAX, VE}, \
+    { "arnr-strength",   "altref noise reduction filter strength", OFFSET(arnr_strength),   AV_OPT_TYPE_INT, {.i64 = -1},      -1,      INT_MAX, VE}, \
+    { "arnr-type",       "altref noise reduction filter type",     OFFSET(arnr_type),       AV_OPT_TYPE_INT, {.i64 = -1},      -1,      INT_MAX, VE, "arnr_type"}, \
+    { "backward",        NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 1}, 0, 0, VE, "arnr_type" }, \
+    { "forward",         NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 2}, 0, 0, VE, "arnr_type" }, \
+    { "centered",        NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 3}, 0, 0, VE, "arnr_type" }, \
+    { "tune",            "Tune the encoding to a specific scenario", OFFSET(tune),          AV_OPT_TYPE_INT, {.i64 = -1},      -1,      INT_MAX, VE, "tune"}, \
+    { "psnr",            NULL, 0, AV_OPT_TYPE_CONST, {.i64 = VP8_TUNE_PSNR}, 0, 0, VE, "tune"}, \
+    { "ssim",            NULL, 0, AV_OPT_TYPE_CONST, {.i64 = VP8_TUNE_SSIM}, 0, 0, VE, "tune"}, \
+    { "deadline",        "Time to spend encoding, in microseconds.", OFFSET(deadline),      AV_OPT_TYPE_INT, {.i64 = VPX_DL_GOOD_QUALITY}, INT_MIN, INT_MAX, VE, "quality"}, \
+    { "best",            NULL, 0, AV_OPT_TYPE_CONST, {.i64 = VPX_DL_BEST_QUALITY}, 0, 0, VE, "quality"}, \
+    { "good",            NULL, 0, AV_OPT_TYPE_CONST, {.i64 = VPX_DL_GOOD_QUALITY}, 0, 0, VE, "quality"}, \
+    { "realtime",        NULL, 0, AV_OPT_TYPE_CONST, {.i64 = VPX_DL_REALTIME},     0, 0, VE, "quality"}, \
+    { "error-resilient", "Error resilience configuration", OFFSET(error_resilient), AV_OPT_TYPE_FLAGS, {.i64 = 0}, INT_MIN, INT_MAX, VE, "er"}, \
+    { "max-intra-rate",  "Maximum I-frame bitrate (pct) 0=unlimited",  OFFSET(max_intra_rate),  AV_OPT_TYPE_INT,  {.i64 = -1}, -1,      INT_MAX, VE}, \
+    { "default",         "Improve resiliency against losses of whole frames", 0, AV_OPT_TYPE_CONST, {.i64 = VPX_ERROR_RESILIENT_DEFAULT}, 0, 0, VE, "er"}, \
+    { "partitions",      "The frame partitions are independently decodable " \
+                         "by the bool decoder, meaning that partitions can be decoded even " \
+                         "though earlier partitions have been lost. Note that intra predicition" \
+                         " is still done over the partition boundary.",       0, AV_OPT_TYPE_CONST, {.i64 = VPX_ERROR_RESILIENT_PARTITIONS}, 0, 0, VE, "er"}, \
+    { "crf",              "Select the quality for constant quality mode", offsetof(VP8Context, crf), AV_OPT_TYPE_INT, {.i64 = -1}, -1, 63, VE }, \
+    { "static-thresh",    "A change threshold on blocks below which they will be skipped by the encoder", OFFSET(static_thresh), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE }, \
+    { "drop-threshold",   "Frame drop threshold", offsetof(VP8Context, drop_threshold), AV_OPT_TYPE_INT, {.i64 = 0 }, INT_MIN, INT_MAX, VE }, \
+    { "noise-sensitivity", "Noise sensitivity", OFFSET(noise_sensitivity), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, 4, VE}, \
+    { "undershoot-pct",  "Datarate undershoot (min) target (%)", OFFSET(rc_undershoot_pct), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 100, VE }, \
+    { "overshoot-pct",   "Datarate overshoot (max) target (%)", OFFSET(rc_overshoot_pct), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 1000, VE }, \
+
+#define LEGACY_OPTIONS \
+    {"speed", "", offsetof(VP8Context, cpu_used), AV_OPT_TYPE_INT, {.i64 = 1}, -16, 16, VE}, \
+    {"quality", "", offsetof(VP8Context, deadline), AV_OPT_TYPE_INT, {.i64 = VPX_DL_GOOD_QUALITY}, INT_MIN, INT_MAX, VE, "quality"}, \
+    {"vp8flags", "", offsetof(VP8Context, flags), AV_OPT_TYPE_FLAGS, {.i64 = 0}, 0, UINT_MAX, VE, "flags"}, \
+    {"error_resilient", "enable error resilience", 0, AV_OPT_TYPE_CONST, {.i64 = VP8F_ERROR_RESILIENT}, INT_MIN, INT_MAX, VE, "flags"}, \
+    {"altref", "enable use of alternate reference frames (VP8/2-pass only)", 0, AV_OPT_TYPE_CONST, {.i64 = VP8F_AUTO_ALT_REF}, INT_MIN, INT_MAX, VE, "flags"}, \
+    {"arnr_max_frames", "altref noise reduction max frame count", offsetof(VP8Context, arnr_max_frames), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 15, VE}, \
+    {"arnr_strength", "altref noise reduction filter strength", offsetof(VP8Context, arnr_strength), AV_OPT_TYPE_INT, {.i64 = 3}, 0, 6, VE}, \
+    {"arnr_type", "altref noise reduction filter type", offsetof(VP8Context, arnr_type), AV_OPT_TYPE_INT, {.i64 = 3}, 1, 3, VE}, \
+    {"rc_lookahead", "Number of frames to look ahead for alternate reference frame selection", offsetof(VP8Context, lag_in_frames), AV_OPT_TYPE_INT, {.i64 = 25}, 0, 25, VE}, \
+
+#if CONFIG_LIBVPX_VP8_ENCODER
+static const AVOption vp8_options[] = {
+    COMMON_OPTIONS
+    { "cpu-used",        "Quality/Speed ratio modifier",                OFFSET(cpu_used),        AV_OPT_TYPE_INT, {.i64 = 1}, -16, 16, VE},
+    LEGACY_OPTIONS
     { NULL }
 };
+#endif
+
+#if CONFIG_LIBVPX_VP9_ENCODER
+static const AVOption vp9_options[] = {
+    COMMON_OPTIONS
+    { "cpu-used",        "Quality/Speed ratio modifier",                OFFSET(cpu_used),        AV_OPT_TYPE_INT, {.i64 = 1},  -8, 8, VE},
+    { "lossless",        "Lossless mode",                               OFFSET(lossless),        AV_OPT_TYPE_INT, {.i64 = -1}, -1, 1, VE},
+    { "tile-columns",    "Number of tile columns to use, log2",         OFFSET(tile_columns),    AV_OPT_TYPE_INT, {.i64 = -1}, -1, 6, VE},
+    { "tile-rows",       "Number of tile rows to use, log2",            OFFSET(tile_rows),       AV_OPT_TYPE_INT, {.i64 = -1}, -1, 2, VE},
+    { "frame-parallel",  "Enable frame parallel decodability features", OFFSET(frame_parallel),  AV_OPT_TYPE_BOOL,{.i64 = -1}, -1, 1, VE},
+    { "aq-mode",         "adaptive quantization mode",                  OFFSET(aq_mode),         AV_OPT_TYPE_INT, {.i64 = -1}, -1, 3, VE, "aq_mode"},
+    { "none",            "Aq not used",         0, AV_OPT_TYPE_CONST, {.i64 = 0}, 0, 0, VE, "aq_mode" },
+    { "variance",        "Variance based Aq",   0, AV_OPT_TYPE_CONST, {.i64 = 1}, 0, 0, VE, "aq_mode" },
+    { "complexity",      "Complexity based Aq", 0, AV_OPT_TYPE_CONST, {.i64 = 2}, 0, 0, VE, "aq_mode" },
+    { "cyclic",          "Cyclic Refresh Aq",   0, AV_OPT_TYPE_CONST, {.i64 = 3}, 0, 0, VE, "aq_mode" },
+    LEGACY_OPTIONS
+    { NULL }
+};
+#endif
+
+#undef COMMON_OPTIONS
+#undef LEGACY_OPTIONS
 
 static const AVCodecDefault defaults[] = {
     { "qmin",             "-1" },
@@ -625,13 +1101,13 @@ static const AVCodecDefault defaults[] = {
 #if CONFIG_LIBVPX_VP8_ENCODER
 static av_cold int vp8_init(AVCodecContext *avctx)
 {
-    return vpx_init(avctx, &vpx_codec_vp8_cx_algo);
+    return vpx_init(avctx, vpx_codec_vp8_cx());
 }
 
 static const AVClass class_vp8 = {
-    .class_name = "libvpx encoder",
+    .class_name = "libvpx-vp8 encoder",
     .item_name  = av_default_item_name,
-    .option     = options,
+    .option     = vp8_options,
     .version    = LIBAVUTIL_VERSION_INT,
 };
 
@@ -645,7 +1121,7 @@ AVCodec ff_libvpx_vp8_encoder = {
     .encode2        = vp8_encode,
     .close          = vp8_free,
     .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AUTO_THREADS,
-    .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE },
+    .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUVA420P, AV_PIX_FMT_NONE },
     .priv_class     = &class_vp8,
     .defaults       = defaults,
 };
@@ -654,24 +1130,16 @@ AVCodec ff_libvpx_vp8_encoder = {
 #if CONFIG_LIBVPX_VP9_ENCODER
 static av_cold int vp9_init(AVCodecContext *avctx)
 {
-    return vpx_init(avctx, &vpx_codec_vp9_cx_algo);
+    return vpx_init(avctx, vpx_codec_vp9_cx());
 }
 
 static const AVClass class_vp9 = {
-    .class_name = "libvpx encoder",
+    .class_name = "libvpx-vp9 encoder",
     .item_name  = av_default_item_name,
-    .option     = options,
+    .option     = vp9_options,
     .version    = LIBAVUTIL_VERSION_INT,
 };
 
-static const AVProfile profiles[] = {
-    { FF_PROFILE_VP9_0, "Profile 0" },
-    { FF_PROFILE_VP9_1, "Profile 1" },
-    { FF_PROFILE_VP9_2, "Profile 2" },
-    { FF_PROFILE_VP9_3, "Profile 3" },
-    { FF_PROFILE_UNKNOWN },
-};
-
 AVCodec ff_libvpx_vp9_encoder = {
     .name           = "libvpx-vp9",
     .long_name      = NULL_IF_CONFIG_SMALL("libvpx VP9"),
@@ -682,17 +1150,9 @@ AVCodec ff_libvpx_vp9_encoder = {
     .encode2        = vp8_encode,
     .close          = vp8_free,
     .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AUTO_THREADS,
-    .pix_fmts       = (const enum AVPixelFormat[]) {
-        AV_PIX_FMT_YUV420P,
-#if VPX_IMAGE_ABI_VERSION >= 3
-        AV_PIX_FMT_YUV422P,
-        AV_PIX_FMT_YUV444P,
-        AV_PIX_FMT_YUV440P,
-#endif
-        AV_PIX_FMT_NONE,
-    },
-    .profiles       = NULL_IF_CONFIG_SMALL(profiles),
+    .profiles       = NULL_IF_CONFIG_SMALL(ff_vp9_profiles),
     .priv_class     = &class_vp9,
     .defaults       = defaults,
+    .init_static_data = ff_vp9_init_static,
 };
 #endif /* CONFIG_LIBVPX_VP9_ENCODER */
diff --git a/libavcodec/libwavpackenc.c b/libavcodec/libwavpackenc.c
index 1455d91..6d57089 100644
--- a/libavcodec/libwavpackenc.c
+++ b/libavcodec/libwavpackenc.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/libwebpenc.c b/libavcodec/libwebpenc.c
index ef311b7..0bcf628 100644
--- a/libavcodec/libwebpenc.c
+++ b/libavcodec/libwebpenc.c
@@ -2,213 +2,48 @@
  * WebP encoding support via libwebp
  * Copyright (c) 2013 Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
- * WebP encoder using libwebp
+ * WebP encoder using libwebp (WebPEncode API)
  */
 
-#include <webp/encode.h>
+#include "libwebpenc_common.h"
 
-#include "libavutil/common.h"
-#include "libavutil/frame.h"
-#include "libavutil/imgutils.h"
-#include "libavutil/opt.h"
-#include "avcodec.h"
-#include "internal.h"
-
-typedef struct LibWebPContext {
-    AVClass *class;         // class for AVOptions
-    float quality;          // lossy quality 0 - 100
-    int lossless;           // use lossless encoding
-    int preset;             // configuration preset
-    int chroma_warning;     // chroma linesize mismatch warning has been printed
-    int conversion_warning; // pixel format conversion warning has been printed
-    WebPConfig config;      // libwebp configuration
-} LibWebPContext;
-
-static int libwebp_error_to_averror(int err)
-{
-    switch (err) {
-    case VP8_ENC_ERROR_OUT_OF_MEMORY:
-    case VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY:
-        return AVERROR(ENOMEM);
-    case VP8_ENC_ERROR_NULL_PARAMETER:
-    case VP8_ENC_ERROR_INVALID_CONFIGURATION:
-    case VP8_ENC_ERROR_BAD_DIMENSION:
-        return AVERROR(EINVAL);
-    }
-    return AVERROR_UNKNOWN;
-}
+typedef LibWebPContextCommon LibWebPContext;
 
 static av_cold int libwebp_encode_init(AVCodecContext *avctx)
 {
-    LibWebPContext *s = avctx->priv_data;
-    int ret;
-
-    if (avctx->global_quality < 0)
-        avctx->global_quality = 75 * FF_QP2LAMBDA;
-    s->quality = av_clipf(avctx->global_quality / (float)FF_QP2LAMBDA,
-                          0.0f, 100.0f);
-
-    if (avctx->compression_level < 0 || avctx->compression_level > 6) {
-        av_log(avctx, AV_LOG_WARNING, "invalid compression level: %d\n",
-               avctx->compression_level);
-        avctx->compression_level = av_clip(avctx->compression_level, 0, 6);
-    }
-
-    if (s->preset >= WEBP_PRESET_DEFAULT) {
-        ret = WebPConfigPreset(&s->config, s->preset, s->quality);
-        if (!ret)
-            return AVERROR_UNKNOWN;
-        s->lossless              = s->config.lossless;
-        s->quality               = s->config.quality;
-        avctx->compression_level = s->config.method;
-    } else {
-        ret = WebPConfigInit(&s->config);
-        if (!ret)
-            return AVERROR_UNKNOWN;
-
-        s->config.lossless = s->lossless;
-        s->config.quality  = s->quality;
-        s->config.method   = avctx->compression_level;
-
-        ret = WebPValidateConfig(&s->config);
-        if (!ret)
-            return AVERROR(EINVAL);
-    }
-
-    av_log(avctx, AV_LOG_DEBUG, "%s - quality=%.1f method=%d\n",
-           s->lossless ? "Lossless" : "Lossy", s->quality,
-           avctx->compression_level);
-
-    return 0;
+    return ff_libwebp_encode_init_common(avctx);
 }
 
 static int libwebp_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                                 const AVFrame *frame, int *got_packet)
 {
     LibWebPContext *s  = avctx->priv_data;
-    AVFrame *alt_frame = NULL;
     WebPPicture *pic = NULL;
+    AVFrame *alt_frame = NULL;
     WebPMemoryWriter mw = { 0 };
-    int ret;
 
-    if (avctx->width > WEBP_MAX_DIMENSION || avctx->height > WEBP_MAX_DIMENSION) {
-        av_log(avctx, AV_LOG_ERROR, "Picture size is too large. Max is %dx%d.\n",
-               WEBP_MAX_DIMENSION, WEBP_MAX_DIMENSION);
-        return AVERROR(EINVAL);
-    }
-
-    pic = av_malloc(sizeof(*pic));
-    if (!pic)
-        return AVERROR(ENOMEM);
-
-    ret = WebPPictureInit(pic);
-    if (!ret) {
-        ret = AVERROR_UNKNOWN;
+    int ret = ff_libwebp_get_frame(avctx, s, frame, &alt_frame, &pic);
+    if (ret < 0)
         goto end;
-    }
-    pic->width  = avctx->width;
-    pic->height = avctx->height;
-
-    if (avctx->pix_fmt == AV_PIX_FMT_RGB32) {
-        if (!s->lossless) {
-            /* libwebp will automatically convert RGB input to YUV when
-               encoding lossy. */
-            if (!s->conversion_warning) {
-                av_log(avctx, AV_LOG_WARNING,
-                       "Using libwebp for RGB-to-YUV conversion. You may want "
-                       "to consider passing in YUV instead for lossy "
-                       "encoding.\n");
-                s->conversion_warning = 1;
-            }
-        }
-        pic->use_argb    = 1;
-        pic->argb        = (uint32_t *)frame->data[0];
-        pic->argb_stride = frame->linesize[0] / 4;
-    } else {
-        if (frame->linesize[1] != frame->linesize[2]) {
-            if (!s->chroma_warning) {
-                av_log(avctx, AV_LOG_WARNING,
-                       "Copying frame due to differing chroma linesizes.\n");
-                s->chroma_warning = 1;
-            }
-            alt_frame = av_frame_alloc();
-            if (!alt_frame) {
-                ret = AVERROR(ENOMEM);
-                goto end;
-            }
-            alt_frame->width  = frame->width;
-            alt_frame->height = frame->height;
-            alt_frame->format = frame->format;
-            ret = av_frame_get_buffer(alt_frame, 32);
-            if (ret < 0)
-                goto end;
-            av_frame_copy(alt_frame, frame);
-            frame = alt_frame;
-        }
-        pic->use_argb  = 0;
-        pic->y         = frame->data[0];
-        pic->u         = frame->data[1];
-        pic->v         = frame->data[2];
-        pic->y_stride  = frame->linesize[0];
-        pic->uv_stride = frame->linesize[1];
-        if (avctx->pix_fmt == AV_PIX_FMT_YUVA420P) {
-            pic->colorspace = WEBP_YUV420A;
-            pic->a          = frame->data[3];
-            pic->a_stride   = frame->linesize[3];
-        } else {
-            pic->colorspace = WEBP_YUV420;
-        }
-
-        if (s->lossless) {
-            /* We do not have a way to automatically prioritize RGB over YUV
-               in automatic pixel format conversion based on whether we're
-               encoding lossless or lossy, so we do conversion with libwebp as
-               a convenience. */
-            if (!s->conversion_warning) {
-                av_log(avctx, AV_LOG_WARNING,
-                       "Using libwebp for YUV-to-RGB conversion. You may want "
-                       "to consider passing in RGB instead for lossless "
-                       "encoding.\n");
-                s->conversion_warning = 1;
-            }
-
-#if (WEBP_ENCODER_ABI_VERSION <= 0x201)
-            /* libwebp should do the conversion automatically, but there is a
-               bug that causes it to return an error instead, so a work-around
-               is required.
-               See https://code.google.com/p/webp/issues/detail?id=178 */
-            pic->memory_ = (void*)1;  /* something non-null */
-            ret = WebPPictureYUVAToARGB(pic);
-            if (!ret) {
-                av_log(avctx, AV_LOG_ERROR,
-                       "WebPPictureYUVAToARGB() failed with error: %d\n",
-                       pic->error_code);
-                ret = libwebp_error_to_averror(pic->error_code);
-                goto end;
-            }
-            pic->memory_ = NULL;  /* restore pointer */
-#endif
-        }
-    }
 
     WebPMemoryWriterInit(&mw);
     pic->custom_ptr = &mw;
@@ -218,11 +53,11 @@ static int libwebp_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     if (!ret) {
         av_log(avctx, AV_LOG_ERROR, "WebPEncode() failed with error: %d\n",
                pic->error_code);
-        ret = libwebp_error_to_averror(pic->error_code);
+        ret = ff_libwebp_error_to_averror(pic->error_code);
         goto end;
     }
 
-    ret = ff_alloc_packet(pkt, mw.size);
+    ret = ff_alloc_packet2(avctx, pkt, mw.size, mw.size);
     if (ret < 0)
         goto end;
     memcpy(pkt->data, mw.mem, mw.size);
@@ -243,20 +78,13 @@ end:
     return ret;
 }
 
-#define OFFSET(x) offsetof(LibWebPContext, x)
-#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
-static const AVOption options[] = {
-    { "lossless",   "Use lossless mode",       OFFSET(lossless), AV_OPT_TYPE_INT,   { .i64 =  0 },  0, 1,                           VE           },
-    { "preset",     "Configuration preset",    OFFSET(preset),   AV_OPT_TYPE_INT,   { .i64 = -1 }, -1, WEBP_PRESET_TEXT,            VE, "preset" },
-    { "none",       "do not use a preset",                              0, AV_OPT_TYPE_CONST, { .i64 = -1                  }, 0, 0, VE, "preset" },
-    { "default",    "default preset",                                   0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_DEFAULT }, 0, 0, VE, "preset" },
-    { "picture",    "digital picture, like portrait, inner shot",       0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_PICTURE }, 0, 0, VE, "preset" },
-    { "photo",      "outdoor photograph, with natural lighting",        0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_PHOTO   }, 0, 0, VE, "preset" },
-    { "drawing",    "hand or line drawing, with high-contrast details", 0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_DRAWING }, 0, 0, VE, "preset" },
-    { "icon",       "small-sized colorful images",                      0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_ICON    }, 0, 0, VE, "preset" },
-    { "text",       "text-like",                                        0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_TEXT    }, 0, 0, VE, "preset" },
-    { NULL },
-};
+/* Encoder teardown: free the frame kept in the shared context's ref field. */
+static int libwebp_encode_close(AVCodecContext *avctx)
+{
+    LibWebPContext *ctx = avctx->priv_data;
+    av_frame_free(&ctx->ref);
+    return 0;
+}
 
 static const AVClass class = {
     .class_name = "libwebp",
@@ -265,12 +93,6 @@ static const AVClass class = {
     .version    = LIBAVUTIL_VERSION_INT,
 };
 
-static const AVCodecDefault libwebp_defaults[] = {
-    { "compression_level",  "4"  },
-    { "global_quality",     "-1" },
-    { NULL },
-};
-
 AVCodec ff_libwebp_encoder = {
     .name           = "libwebp",
     .long_name      = NULL_IF_CONFIG_SMALL("libwebp WebP image"),
@@ -279,6 +101,7 @@ AVCodec ff_libwebp_encoder = {
     .priv_data_size = sizeof(LibWebPContext),
     .init           = libwebp_encode_init,
     .encode2        = libwebp_encode_frame,
+    .close          = libwebp_encode_close,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_RGB32,
         AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUVA420P,
diff --git a/libavcodec/libwebpenc_animencoder.c b/libavcodec/libwebpenc_animencoder.c
new file mode 100644
index 0000000..91bf64c
--- /dev/null
+++ b/libavcodec/libwebpenc_animencoder.c
@@ -0,0 +1,151 @@
+/*
+ * WebP encoding support via libwebp
+ * Copyright (c) 2015 Urvang Joshi
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * WebP encoder using libwebp (WebPAnimEncoder API)
+ */
+
+#include "config.h"
+#include "libwebpenc_common.h"
+
+#include <webp/mux.h>
+
+typedef struct LibWebPAnimContext {
+    LibWebPContextCommon cc;  // shared state; the common init reads priv_data as this type
+    WebPAnimEncoder *enc;     // the main AnimEncoder object; created in init, deleted in close
+    int64_t prev_frame_pts;   // pts of the previously added frame; -1 before the first one
+    int done;                 // nonzero once the assembled bitstream has been returned
+} LibWebPAnimContext;
+
+static av_cold int libwebp_anim_encode_init(AVCodecContext *avctx)
+{
+    LibWebPAnimContext *anim = avctx->priv_data;
+    WebPAnimEncoderOptions anim_opts = { { 0 } };
+    int err = ff_libwebp_encode_init_common(avctx);
+    if (err)  // common libwebp setup failed; no encoder has been created yet
+        return err;
+    WebPAnimEncoderOptionsInit(&anim_opts);
+    anim_opts.verbose = av_log_get_level() >= AV_LOG_VERBOSE;
+    // TODO(urvang): Expose some options on command-line perhaps.
+    anim->enc = WebPAnimEncoderNew(avctx->width, avctx->height, &anim_opts);
+    if (!anim->enc)
+        return AVERROR(EINVAL);
+    anim->prev_frame_pts = -1;  // no frame has been added so far
+    anim->done = 0;             // assembled bitstream not yet emitted
+    return 0;
+}
+
+/* Add one frame, or on flush (NULL frame) emit the assembled bitstream once;
+ * a second flush reports no packet. Returns 0 on success or an AVERROR code. */
+static int libwebp_anim_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                                     const AVFrame *frame, int *got_packet) {
+    LibWebPAnimContext *s = avctx->priv_data;
+    int ret;
+
+    if (!frame) {
+        WebPData assembled_data = { 0 };
+        if (s->done) {  // Second flush: return empty package to denote finish.
+            *got_packet = 0;
+            return 0;
+        }
+        // First flush: assemble bitstream and return it.
+        if (!WebPAnimEncoderAssemble(s->enc, &assembled_data)) {
+            av_log(s, AV_LOG_ERROR,
+                   "WebPAnimEncoderAssemble() failed with error: %d\n",
+                   VP8_ENC_ERROR_OUT_OF_MEMORY);
+            return AVERROR(ENOMEM);
+        }
+        ret = ff_alloc_packet2(avctx, pkt, assembled_data.size, assembled_data.size);
+        if (ret >= 0)
+            memcpy(pkt->data, assembled_data.bytes, assembled_data.size);
+        // libwebp's buffer is not needed past this point on either path; free it.
+        WebPDataClear(&assembled_data);
+        if (ret < 0)
+            return ret;
+        s->done = 1;
+        pkt->flags |= AV_PKT_FLAG_KEY;
+        pkt->pts = pkt->dts = s->prev_frame_pts + 1;
+        *got_packet = 1;
+        return 0;
+    } else {
+        int timestamp_ms;
+        WebPPicture *pic = NULL;
+        AVFrame *alt_frame = NULL;
+        ret = ff_libwebp_get_frame(avctx, &s->cc, frame, &alt_frame, &pic);
+        if (ret < 0)
+            goto end;
+        timestamp_ms =
+            avctx->time_base.num * frame->pts * 1000 / avctx->time_base.den;
+        ret = WebPAnimEncoderAdd(s->enc, pic, timestamp_ms, &s->cc.config);
+        if (!ret) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Encoding WebP frame failed with error: %d\n", pic->error_code);
+            ret = ff_libwebp_error_to_averror(pic->error_code);
+            goto end;
+        }
+        pkt->pts = pkt->dts = frame->pts;
+        s->prev_frame_pts = frame->pts;  // Save for next frame.
+        ret = 0;
+        *got_packet = 1;
+
+end:
+        WebPPictureFree(pic);
+        av_freep(&pic);
+        av_frame_free(&alt_frame);
+        return ret;
+    }
+}
+
+/* Free the frame held in cc.ref, then destroy the animation encoder. */
+static int libwebp_anim_encode_close(AVCodecContext *avctx)
+{
+    LibWebPAnimContext *anim = avctx->priv_data;
+    av_frame_free(&anim->cc.ref);
+    WebPAnimEncoderDelete(anim->enc);
+    return 0;
+}
+
+static const AVClass class = {
+    .class_name = "libwebp_anim",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_libwebp_anim_encoder = {
+    .name           = "libwebp_anim",
+    .long_name      = NULL_IF_CONFIG_SMALL("libwebp WebP image"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_WEBP,
+    .priv_data_size = sizeof(LibWebPAnimContext),
+    .init           = libwebp_anim_encode_init,
+    .encode2        = libwebp_anim_encode_frame,
+    .close          = libwebp_anim_encode_close,
+    .capabilities   = AV_CODEC_CAP_DELAY,
+    .pix_fmts       = (const enum AVPixelFormat[]) {
+        AV_PIX_FMT_RGB32,
+        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUVA420P,
+        AV_PIX_FMT_NONE
+    },
+    .priv_class     = &class,
+    .defaults       = libwebp_defaults,
+};
diff --git a/libavcodec/libwebpenc_common.c b/libavcodec/libwebpenc_common.c
new file mode 100644
index 0000000..21d7ada
--- /dev/null
+++ b/libavcodec/libwebpenc_common.c
@@ -0,0 +1,254 @@
+/*
+ * WebP encoding support via libwebp
+ * Copyright (c) 2013 Justin Ruggles <justin.ruggles@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * WebP encoder using libwebp: common structs and methods.
+ */
+
+#include "libwebpenc_common.h"
+
+int ff_libwebp_error_to_averror(int err)
+{
+    /* Allocation failures reported by libwebp. */
+    if (err == VP8_ENC_ERROR_OUT_OF_MEMORY ||
+        err == VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY)
+        return AVERROR(ENOMEM);
+    /* Bad argument, configuration or picture dimension. */
+    if (err == VP8_ENC_ERROR_NULL_PARAMETER ||
+        err == VP8_ENC_ERROR_INVALID_CONFIGURATION ||
+        err == VP8_ENC_ERROR_BAD_DIMENSION)
+        return AVERROR(EINVAL);
+    return AVERROR_UNKNOWN; /* every other libwebp error code */
+}
+
+av_cold int ff_libwebp_encode_init_common(AVCodecContext *avctx)
+{
+    LibWebPContextCommon *ctx = avctx->priv_data;
+    WebPConfig *cfg           = &ctx->config;
+
+    /* A non-negative global_quality replaces the "quality" option value. */
+    if (avctx->global_quality >= 0)
+        ctx->quality = av_clipf(avctx->global_quality / (float)FF_QP2LAMBDA,
+                                0.0f, 100.0f);
+
+    /* Only levels 0..6 are accepted here; warn and clamp anything else. */
+    if (avctx->compression_level < 0 || avctx->compression_level > 6) {
+        av_log(avctx, AV_LOG_WARNING, "invalid compression level: %d\n",
+               avctx->compression_level);
+        avctx->compression_level = av_clip(avctx->compression_level, 0, 6);
+    }
+
+    if (ctx->preset < WEBP_PRESET_DEFAULT) {
+        /* No preset: start from libwebp defaults and apply our settings. */
+        if (!WebPConfigInit(cfg))
+            return AVERROR_UNKNOWN;
+
+        cfg->lossless = ctx->lossless;
+        cfg->quality  = ctx->quality;
+        cfg->method   = avctx->compression_level;
+
+        if (!WebPValidateConfig(cfg))
+            return AVERROR(EINVAL);
+    } else {
+        /* Preset: let libwebp fill the config, then mirror it back. */
+        if (!WebPConfigPreset(cfg, ctx->preset, ctx->quality))
+            return AVERROR_UNKNOWN;
+        ctx->lossless            = cfg->lossless;
+        ctx->quality             = cfg->quality;
+        avctx->compression_level = cfg->method;
+    }
+
+    av_log(avctx, AV_LOG_DEBUG, "%s - quality=%.1f method=%d\n",
+           ctx->lossless ? "Lossless" : "Lossy", ctx->quality,
+           avctx->compression_level);
+    return 0;
+}
+
+/**
+ * Describe the input frame to libwebp through a newly allocated WebPPicture.
+ *
+ * Once the size check has passed, *pic_ptr (and *alt_frame_ptr, if a copy of
+ * the frame is made) are assigned even on failure; nothing is freed here.
+ * @return a negative AVERROR code on error, a non-negative value otherwise
+ */
+int ff_libwebp_get_frame(AVCodecContext *avctx, LibWebPContextCommon *s,
+                         const AVFrame *frame, AVFrame **alt_frame_ptr,
+                         WebPPicture **pic_ptr) {
+    int ret;
+    WebPPicture *pic = NULL;
+    AVFrame *alt_frame = NULL;
+
+    if (avctx->width > WEBP_MAX_DIMENSION || avctx->height > WEBP_MAX_DIMENSION) {
+        av_log(avctx, AV_LOG_ERROR, "Picture size is too large. Max is %dx%d.\n",
+               WEBP_MAX_DIMENSION, WEBP_MAX_DIMENSION);
+        return AVERROR(EINVAL);
+    }
+
+    pic = *pic_ptr = av_malloc(sizeof(*pic));
+    if (!pic)
+        return AVERROR(ENOMEM);
+
+    ret = WebPPictureInit(pic); /* zero means failure for the libwebp calls below */
+    if (!ret) {
+        ret = AVERROR_UNKNOWN;
+        goto end;
+    }
+    pic->width  = avctx->width;
+    pic->height = avctx->height;
+
+    if (avctx->pix_fmt == AV_PIX_FMT_RGB32) {
+        if (!s->lossless) {
+            /* libwebp converts RGB input to YUV itself when encoding lossy. */
+            if (!s->conversion_warning) {
+                av_log(avctx, AV_LOG_WARNING,
+                       "Using libwebp for RGB-to-YUV conversion. You may want "
+                       "to consider passing in YUV instead for lossy "
+                       "encoding.\n");
+                s->conversion_warning = 1;
+            }
+        }
+        pic->use_argb    = 1;
+        pic->argb        = (uint32_t *)frame->data[0];
+        pic->argb_stride = frame->linesize[0] / 4; /* stride in pixels, not bytes */
+    } else {
+        if (frame->linesize[1] != frame->linesize[2] || s->cr_threshold) {
+            if (!s->chroma_warning && !s->cr_threshold) {
+                av_log(avctx, AV_LOG_WARNING,
+                       "Copying frame due to differing chroma linesizes.\n");
+                s->chroma_warning = 1;
+            }
+            alt_frame = *alt_frame_ptr = av_frame_alloc();
+            if (!alt_frame) {
+                ret = AVERROR(ENOMEM);
+                goto end;
+            }
+            alt_frame->width  = frame->width;
+            alt_frame->height = frame->height;
+            alt_frame->format = s->cr_threshold ? AV_PIX_FMT_YUVA420P : frame->format; /* alpha plane for the skip mask */
+            ret = av_frame_get_buffer(alt_frame, 32);
+            if (ret < 0)
+                goto end;
+            alt_frame->format = frame->format; /* copy using the source format */
+            av_frame_copy(alt_frame, frame);
+            frame = alt_frame;
+            if (s->cr_threshold) {
+                int x,y, x2, y2, p;
+                int bs = FFMAX(s->cr_size, 1); /* cr_size may be 0; a zero step would never advance the loops */
+
+                if (!s->ref) {
+                    s->ref = av_frame_clone(frame);
+                    if (!s->ref) {
+                        ret = AVERROR(ENOMEM);
+                        goto end;
+                    }
+                }
+
+                alt_frame->format = AV_PIX_FMT_YUVA420P;
+                for (y = 0; y < frame->height; y+= bs) {
+                    for (x = 0; x < frame->width; x+= bs) {
+                        int skip;
+                        int sse = 0; /* squared difference to ref over Y, U and V of this block */
+                        for (p = 0; p < 3; p++) {
+                            int bs2 = bs >> !!p; /* chroma planes (p != 0) are halved */
+                            int w = AV_CEIL_RSHIFT(frame->width , !!p);
+                            int h = AV_CEIL_RSHIFT(frame->height, !!p);
+                            int xs = x >> !!p;
+                            int ys = y >> !!p;
+                            for (y2 = ys; y2 < FFMIN(ys + bs2, h); y2++) {
+                                for (x2 = xs; x2 < FFMIN(xs + bs2, w); x2++) {
+                                    int diff =  frame->data[p][frame->linesize[p] * y2 + x2]
+                                              -s->ref->data[p][frame->linesize[p] * y2 + x2];
+                                    sse += diff*diff;
+                                }
+                            }
+                        }
+                        skip = sse < s->cr_threshold && frame->data[3] != s->ref->data[3]; /* equal pointers: presumably ref was just cloned from this frame */
+                        if (!skip)
+                            for (p = 0; p < 3; p++) { /* block changed: refresh it in ref */
+                                int bs2 = bs >> !!p;
+                                int w = AV_CEIL_RSHIFT(frame->width , !!p);
+                                int h = AV_CEIL_RSHIFT(frame->height, !!p);
+                                int xs = x >> !!p;
+                                int ys = y >> !!p;
+                                for (y2 = ys; y2 < FFMIN(ys + bs2, h); y2++) {
+                                    memcpy(&s->ref->data[p][frame->linesize[p] * y2 + xs],
+                                            & frame->data[p][frame->linesize[p] * y2 + xs], FFMIN(bs2, w-xs));
+                                }
+                            }
+                        for (y2 = y; y2 < FFMIN(y+bs, frame->height); y2++) { /* alpha: 0 for skipped blocks, 255 otherwise */
+                            memset(&frame->data[3][frame->linesize[3] * y2 + x],
+                                    skip ? 0 : 255,
+                                    FFMIN(bs, frame->width-x));
+                        }
+                    }
+                }
+            }
+        }
+
+        pic->use_argb  = 0;
+        pic->y         = frame->data[0];
+        pic->u         = frame->data[1];
+        pic->v         = frame->data[2];
+        pic->y_stride  = frame->linesize[0];
+        pic->uv_stride = frame->linesize[1]; /* one stride for U and V, hence the copy above when they differ */
+        if (frame->format == AV_PIX_FMT_YUVA420P) {
+            pic->colorspace = WEBP_YUV420A;
+            pic->a          = frame->data[3];
+            pic->a_stride   = frame->linesize[3];
+            if (alt_frame) /* only done on our own copy of the frame */
+                WebPCleanupTransparentArea(pic);
+        } else {
+            pic->colorspace = WEBP_YUV420;
+        }
+
+        if (s->lossless) {
+            /* Automatic pixel format selection cannot prefer RGB over YUV
+               depending on lossless/lossy mode, so the conversion is done
+               with libwebp as a convenience. */
+            if (!s->conversion_warning) {
+                av_log(avctx, AV_LOG_WARNING,
+                       "Using libwebp for YUV-to-RGB conversion. You may want "
+                       "to consider passing in RGB instead for lossless "
+                       "encoding.\n");
+                s->conversion_warning = 1;
+            }
+
+#if (WEBP_ENCODER_ABI_VERSION <= 0x201)
+            /* libwebp should convert automatically, but a bug makes it return
+               an error instead, so a work-around is required.
+               See https://code.google.com/p/webp/issues/detail?id=178 */
+            pic->memory_ = (void*)1;  /* something non-null */
+            ret = WebPPictureYUVAToARGB(pic);
+            pic->memory_ = NULL;  /* restore on every path: callers are expected to WebPPictureFree() pic */
+            if (!ret) {
+                av_log(avctx, AV_LOG_ERROR,
+                       "WebPPictureYUVAToARGB() failed with error: %d\n",
+                       pic->error_code);
+                ret = ff_libwebp_error_to_averror(pic->error_code);
+                goto end;
+            }
+#endif
+        }
+    }
+end:
+    return ret;
+}
diff --git a/libavcodec/libwebpenc_common.h b/libavcodec/libwebpenc_common.h
new file mode 100644
index 0000000..e74e579
--- /dev/null
+++ b/libavcodec/libwebpenc_common.h
@@ -0,0 +1,84 @@
+/*
+ * WebP encoding support via libwebp
+ * Copyright (c) 2013 Justin Ruggles <justin.ruggles@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * WebP encoder using libwebp: common structs and methods.
+ */
+
+#ifndef AVCODEC_LIBWEBPENC_COMMON_H
+#define AVCODEC_LIBWEBPENC_COMMON_H
+
+#include <webp/encode.h>
+
+#include "libavutil/common.h"
+#include "libavutil/frame.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/opt.h"
+#include "avcodec.h"
+#include "internal.h"
+
+typedef struct LibWebPContextCommon {
+    AVClass *class;         // class for AVOptions
+    float quality;          // lossy quality 0 - 100
+    int lossless;           // use lossless encoding
+    int preset;             // configuration preset
+    int chroma_warning;     // chroma linesize mismatch warning has been printed
+    int conversion_warning; // pixel format conversion warning has been printed
+    WebPConfig config;      // libwebp configuration
+    AVFrame *ref;           // reference frame for conditional replenishment
+    int cr_size;            // conditional replenishment block size
+    int cr_threshold;       // conditional replenishment threshold; 0 disables it
+} LibWebPContextCommon;
+
+int ff_libwebp_error_to_averror(int err); // map a libwebp VP8_ENC_ERROR_* code to an AVERROR value
+/* Apply quality/compression settings from avctx and set up the WebPConfig in the private context. */
+av_cold int ff_libwebp_encode_init_common(AVCodecContext *avctx);
+/* Allocate *pic_ptr (and *alt_frame_ptr when the frame has to be copied) describing frame for libwebp. */
+int ff_libwebp_get_frame(AVCodecContext *avctx, LibWebPContextCommon *s,
+                         const AVFrame *frame, AVFrame **alt_frame_ptr,
+                         WebPPicture **pic_ptr);
+
+#define OFFSET(x) offsetof(LibWebPContextCommon, x) /* byte offset of the field backing an option */
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM /* unparenthesized: use only as a whole flags initializer */
+static const AVOption options[] = { /* private options; rows in the "preset" unit are its named constants */
+    { "lossless",   "Use lossless mode",       OFFSET(lossless), AV_OPT_TYPE_INT,   { .i64 =  0 },  0, 1,                           VE           },
+    { "preset",     "Configuration preset",    OFFSET(preset),   AV_OPT_TYPE_INT,   { .i64 = -1 }, -1, WEBP_PRESET_TEXT,            VE, "preset" },
+    { "none",       "do not use a preset",                              0, AV_OPT_TYPE_CONST, { .i64 = -1                  }, 0, 0, VE, "preset" },
+    { "default",    "default preset",                                   0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_DEFAULT }, 0, 0, VE, "preset" },
+    { "picture",    "digital picture, like portrait, inner shot",       0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_PICTURE }, 0, 0, VE, "preset" },
+    { "photo",      "outdoor photograph, with natural lighting",        0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_PHOTO   }, 0, 0, VE, "preset" },
+    { "drawing",    "hand or line drawing, with high-contrast details", 0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_DRAWING }, 0, 0, VE, "preset" },
+    { "icon",       "small-sized colorful images",                      0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_ICON    }, 0, 0, VE, "preset" },
+    { "text",       "text-like",                                        0, AV_OPT_TYPE_CONST, { .i64 = WEBP_PRESET_TEXT    }, 0, 0, VE, "preset" },
+    { "cr_threshold","Conditional replenishment threshold",     OFFSET(cr_threshold), AV_OPT_TYPE_INT, { .i64 =  0  },  0, INT_MAX, VE           },
+    { "cr_size"     ,"Conditional replenishment block size",    OFFSET(cr_size)     , AV_OPT_TYPE_INT, { .i64 =  16 },  0, 256,     VE           },
+    { "quality"     ,"Quality",                OFFSET(quality),  AV_OPT_TYPE_FLOAT, { .dbl =  75 }, 0, 100,                         VE           },
+    { NULL }, /* table terminator */
+};
+
+static const AVCodecDefault libwebp_defaults[] = { /* encoder-specific defaults for generic codec options */
+    { "compression_level",  "4"  }, /* NOTE(review): appears to become the libwebp "method" -- confirm in init */
+    { "global_quality",     "-1" }, /* NOTE(review): negative presumably means "use the quality option" -- confirm in init */
+    { NULL }, /* table terminator */
+};
+
+#endif /* AVCODEC_LIBWEBPENC_COMMON_H */
diff --git a/libavcodec/libx264.c b/libavcodec/libx264.c
index fddf1b3..85f1996 100644
--- a/libavcodec/libx264.c
+++ b/libavcodec/libx264.c
@@ -2,28 +2,30 @@
  * H.264 encoding using the x264 library
  * Copyright (C) 2005  Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "libavutil/eval.h"
 #include "libavutil/internal.h"
 #include "libavutil/opt.h"
 #include "libavutil/mem.h"
 #include "libavutil/pixdesc.h"
 #include "libavutil/stereo3d.h"
+#include "libavutil/intreadwrite.h"
 #include "avcodec.h"
 #include "internal.h"
 
@@ -48,7 +50,10 @@ typedef struct X264Context {
     char *preset;
     char *tune;
     char *profile;
+    char *level;
     int fastfirstpass;
+    char *wpredp;
+    char *x264opts;
     float crf;
     float crf_max;
     int cqp;
@@ -76,9 +81,11 @@ typedef struct X264Context {
     int slice_max_size;
     char *stats;
     int nal_hrd;
+    int avcintra_class;
     int motion_est;
     int forced_idr;
     int coder;
+    int a53_cc;
     int b_frame_strategy;
     int chroma_offset;
     int scenechange_threshold;
@@ -104,7 +111,7 @@ static void X264_log(void *p, int level, const char *fmt, va_list args)
 
 
 static int encode_nals(AVCodecContext *ctx, AVPacket *pkt,
-                       x264_nal_t *nals, int nnal)
+                       const x264_nal_t *nals, int nnal)
 {
     X264Context *x4 = ctx->priv_data;
     uint8_t *p;
@@ -116,16 +123,21 @@ static int encode_nals(AVCodecContext *ctx, AVPacket *pkt,
     for (i = 0; i < nnal; i++)
         size += nals[i].i_payload;
 
-    if ((ret = ff_alloc_packet(pkt, size)) < 0)
+    if ((ret = ff_alloc_packet2(ctx, pkt, size, 0)) < 0)
         return ret;
 
     p = pkt->data;
 
     /* Write the SEI as part of the first frame. */
     if (x4->sei_size > 0 && nnal > 0) {
+        if (x4->sei_size > size) {
+            av_log(ctx, AV_LOG_ERROR, "Error: nal buffer is too small\n");
+            return -1;
+        }
         memcpy(p, x4->sei, x4->sei_size);
         p += x4->sei_size;
         x4->sei_size = 0;
+        av_freep(&x4->sei);
     }
 
     for (i = 0; i < nnal; i++){
@@ -136,18 +148,39 @@ static int encode_nals(AVCodecContext *ctx, AVPacket *pkt,
     return 1;
 }
 
+static int avfmt2_num_planes(int avfmt)
+{
+    /*
+     * Number of image planes reported for a given pixel format.
+     *
+     * The three RGB formats tested below are carried in a single plane.
+     * Everything else -- the planar YUV formats as well as any format
+     * that is not tested here -- is reported as three planes.
+     * The result is a plain count; no validation of avfmt is performed.
+     */
+    const int packed_rgb = avfmt == AV_PIX_FMT_BGR0  ||
+                           avfmt == AV_PIX_FMT_BGR24 ||
+                           avfmt == AV_PIX_FMT_RGB24;
+
+    if (packed_rgb)
+        return 1;
+
+    return 3;
+}
+
 static void reconfig_encoder(AVCodecContext *ctx, const AVFrame *frame)
 {
     X264Context *x4 = ctx->priv_data;
     AVFrameSideData *side_data;
 
 
-    if (x4->params.b_tff != frame->top_field_first) {
+  if (x4->avcintra_class < 0) {
+    if (x4->params.b_interlaced && x4->params.b_tff != frame->top_field_first) {
+
         x4->params.b_tff = frame->top_field_first;
         x264_encoder_reconfig(x4->enc, &x4->params);
     }
-    if (x4->params.vui.i_sar_height != ctx->sample_aspect_ratio.den ||
-        x4->params.vui.i_sar_width  != ctx->sample_aspect_ratio.num) {
+    if (x4->params.vui.i_sar_height*ctx->sample_aspect_ratio.num != ctx->sample_aspect_ratio.den * x4->params.vui.i_sar_width) {
         x4->params.vui.i_sar_height = ctx->sample_aspect_ratio.den;
         x4->params.vui.i_sar_width  = ctx->sample_aspect_ratio.num;
         x264_encoder_reconfig(x4->enc, &x4->params);
@@ -174,6 +207,7 @@ static void reconfig_encoder(AVCodecContext *ctx, const AVFrame *frame)
     }
 
     if (x4->params.rc.i_rc_method == X264_RC_CQP &&
+        x4->cqp >= 0 &&
         x4->params.rc.i_qp_constant != x4->cqp) {
         x4->params.rc.i_qp_constant = x4->cqp;
         x264_encoder_reconfig(x4->enc, &x4->params);
@@ -184,6 +218,7 @@ static void reconfig_encoder(AVCodecContext *ctx, const AVFrame *frame)
         x4->params.rc.f_rf_constant_max = x4->crf_max;
         x264_encoder_reconfig(x4->enc, &x4->params);
     }
+  }
 
     side_data = av_frame_get_side_data(frame, AV_FRAME_DATA_STEREO3D);
     if (side_data) {
@@ -239,16 +274,18 @@ static int X264_frame(AVCodecContext *ctx, AVPacket *pkt, const AVFrame *frame,
     X264Context *x4 = ctx->priv_data;
     x264_nal_t *nal;
     int nnal, i, ret;
-    x264_picture_t pic_out;
+    x264_picture_t pic_out = {0};
+    int pict_type;
+    AVFrameSideData *side_data;
 
     x264_picture_init( &x4->pic );
     x4->pic.img.i_csp   = x4->params.i_csp;
     if (x264_bit_depth > 8)
         x4->pic.img.i_csp |= X264_CSP_HIGH_DEPTH;
-    x4->pic.img.i_plane = 3;
+    x4->pic.img.i_plane = avfmt2_num_planes(ctx->pix_fmt);
 
     if (frame) {
-        for (i = 0; i < 3; i++) {
+        for (i = 0; i < x4->pic.img.i_plane; i++) {
             x4->pic.img.plane[i]    = frame->data[i];
             x4->pic.img.i_stride[i] = frame->linesize[i];
         }
@@ -257,8 +294,8 @@ static int X264_frame(AVCodecContext *ctx, AVPacket *pkt, const AVFrame *frame,
 
         switch (frame->pict_type) {
         case AV_PICTURE_TYPE_I:
-            x4->pic.i_type = x4->forced_idr ? X264_TYPE_IDR
-                                            : X264_TYPE_KEYFRAME;
+            x4->pic.i_type = x4->forced_idr >= 0 ? X264_TYPE_IDR
+                                                 : X264_TYPE_KEYFRAME;
             break;
         case AV_PICTURE_TYPE_P:
             x4->pic.i_type = X264_TYPE_P;
@@ -271,10 +308,34 @@ static int X264_frame(AVCodecContext *ctx, AVPacket *pkt, const AVFrame *frame,
             break;
         }
         reconfig_encoder(ctx, frame);
+
+        if (x4->a53_cc) {
+            void *sei_data;
+            size_t sei_size;
+
+            ret = ff_alloc_a53_sei(frame, 0, &sei_data, &sei_size);
+            if (ret < 0) {
+                av_log(ctx, AV_LOG_ERROR, "Not enough memory for closed captions, skipping\n");
+            } else if (sei_data) {
+                x4->pic.extra_sei.payloads = av_mallocz(sizeof(x4->pic.extra_sei.payloads[0]));
+                if (x4->pic.extra_sei.payloads == NULL) {
+                    av_log(ctx, AV_LOG_ERROR, "Not enough memory for closed captions, skipping\n");
+                    av_free(sei_data);
+                } else {
+                    x4->pic.extra_sei.sei_free = av_free;
+
+                    x4->pic.extra_sei.payloads[0].payload_size = sei_size;
+                    x4->pic.extra_sei.payloads[0].payload = sei_data;
+                    x4->pic.extra_sei.num_payloads = 1;
+                    x4->pic.extra_sei.payloads[0].payload_type = 4;
+                }
+            }
+        }
     }
+
     do {
         if (x264_encoder_encode(x4->enc, &nal, &nnal, frame? &x4->pic: NULL, &pic_out) < 0)
-            return AVERROR_UNKNOWN;
+            return AVERROR_EXTERNAL;
 
         ret = encode_nals(ctx, pkt, nal, nnal);
         if (ret < 0)
@@ -284,31 +345,31 @@ static int X264_frame(AVCodecContext *ctx, AVPacket *pkt, const AVFrame *frame,
     pkt->pts = pic_out.i_pts;
     pkt->dts = pic_out.i_dts;
 
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
+
     switch (pic_out.i_type) {
     case X264_TYPE_IDR:
     case X264_TYPE_I:
-        ctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
+        pict_type = AV_PICTURE_TYPE_I;
         break;
     case X264_TYPE_P:
-        ctx->coded_frame->pict_type = AV_PICTURE_TYPE_P;
+        pict_type = AV_PICTURE_TYPE_P;
         break;
     case X264_TYPE_B:
     case X264_TYPE_BREF:
-        ctx->coded_frame->pict_type = AV_PICTURE_TYPE_B;
+        pict_type = AV_PICTURE_TYPE_B;
         break;
+    default:
+        pict_type = AV_PICTURE_TYPE_NONE;
     }
+#if FF_API_CODED_FRAME
+FF_DISABLE_DEPRECATION_WARNINGS
+    ctx->coded_frame->pict_type = pict_type;
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
     pkt->flags |= AV_PKT_FLAG_KEY*pic_out.b_keyframe;
     if (ret) {
-        uint8_t *sd = av_packet_new_side_data(pkt, AV_PKT_DATA_QUALITY_FACTOR,
-                                              sizeof(int));
-        if (!sd)
-            return AVERROR(ENOMEM);
-        *(int *)sd = (pic_out.i_qpplus1 - 1) * FF_QP2LAMBDA;
+        ff_side_data_set_encoder_stats(pkt, (pic_out.i_qpplus1 - 1) * FF_QP2LAMBDA, NULL, 0, pict_type);
 
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
@@ -336,6 +397,20 @@ static av_cold int X264_close(AVCodecContext *avctx)
     return 0;
 }
 
+#define OPT_STR(opt, param) /* uses x4 and avctx from the calling scope */      \
+    do {                                                                      \
+        int ret; /* block-local; hides any ret declared by the caller */      \
+        if ((ret = x264_param_parse(&x4->params, opt, param)) < 0) { \
+            if(ret == X264_PARAM_BAD_NAME)                                    \
+                av_log(avctx, AV_LOG_ERROR,                                   \
+                        "bad option '%s': '%s'\n", opt, param);               \
+            else                                                              \
+                av_log(avctx, AV_LOG_ERROR,                                   \
+                        "bad value for '%s': '%s'\n", opt, param);            \
+            return -1; /* returns from the enclosing function */              \
+        }                                                                     \
+    } while (0)
+
 static int convert_pix_fmt(enum AVPixelFormat pix_fmt)
 {
     switch (pix_fmt) {
@@ -350,6 +425,15 @@ static int convert_pix_fmt(enum AVPixelFormat pix_fmt)
     case AV_PIX_FMT_YUVJ444P:
     case AV_PIX_FMT_YUV444P9:
     case AV_PIX_FMT_YUV444P10: return X264_CSP_I444;
+#if CONFIG_LIBX264RGB_ENCODER
+    case AV_PIX_FMT_BGR0:
+        return X264_CSP_BGRA;
+    case AV_PIX_FMT_BGR24:
+        return X264_CSP_BGR;
+
+    case AV_PIX_FMT_RGB24:
+        return X264_CSP_RGB;
+#endif
     case AV_PIX_FMT_NV12:      return X264_CSP_NV12;
     case AV_PIX_FMT_NV16:
     case AV_PIX_FMT_NV20:      return X264_CSP_NV16;
@@ -370,21 +454,33 @@ static av_cold int X264_init(AVCodecContext *avctx)
 {
     X264Context *x4 = avctx->priv_data;
     AVCPBProperties *cpb_props;
+    int sw,sh;
+
+    if (avctx->global_quality > 0)
+        av_log(avctx, AV_LOG_WARNING, "-qscale is ignored, -crf is recommended.\n");
 
 #if CONFIG_LIBX262_ENCODER
     if (avctx->codec_id == AV_CODEC_ID_MPEG2VIDEO) {
         x4->params.b_mpeg2 = 1;
         x264_param_default_mpeg2(&x4->params);
     } else
-#else
-    x264_param_default(&x4->params);
 #endif
+    x264_param_default(&x4->params);
 
     x4->params.b_deblocking_filter         = avctx->flags & AV_CODEC_FLAG_LOOP_FILTER;
 
     if (x4->preset || x4->tune)
         if (x264_param_default_preset(&x4->params, x4->preset, x4->tune) < 0) {
+            int i;
             av_log(avctx, AV_LOG_ERROR, "Error setting preset/tune %s/%s.\n", x4->preset, x4->tune);
+            av_log(avctx, AV_LOG_INFO, "Possible presets:");
+            for (i = 0; x264_preset_names[i]; i++)
+                av_log(avctx, AV_LOG_INFO, " %s", x264_preset_names[i]);
+            av_log(avctx, AV_LOG_INFO, "\n");
+            av_log(avctx, AV_LOG_INFO, "Possible tunes:");
+            for (i = 0; x264_tune_names[i]; i++)
+                av_log(avctx, AV_LOG_INFO, " %s", x264_tune_names[i]);
+            av_log(avctx, AV_LOG_INFO, "\n");
             return AVERROR(EINVAL);
         }
 
@@ -396,6 +492,8 @@ static av_cold int X264_init(AVCodecContext *avctx)
     x4->params.i_log_level          = X264_LOG_DEBUG;
     x4->params.i_csp                = convert_pix_fmt(avctx->pix_fmt);
 
+    PARSE_X264_OPT("weightp", wpredp);
+
     if (avctx->bit_rate) {
         x4->params.rc.i_bitrate   = avctx->bit_rate / 1000;
         x4->params.rc.i_rc_method = X264_RC_ABR;
@@ -424,9 +522,12 @@ static av_cold int X264_init(AVCodecContext *avctx)
             (float)avctx->rc_initial_buffer_occupancy / avctx->rc_buffer_size;
     }
 
+    PARSE_X264_OPT("level", level);
+
     if (avctx->i_quant_factor > 0)
         x4->params.rc.f_ip_factor         = 1 / fabs(avctx->i_quant_factor);
-    x4->params.rc.f_pb_factor             = avctx->b_quant_factor;
+    if (avctx->b_quant_factor > 0)
+        x4->params.rc.f_pb_factor         = avctx->b_quant_factor;
 
 #if FF_API_PRIVATE_OPT
 FF_DISABLE_DEPRECATION_WARNINGS
@@ -463,6 +564,28 @@ FF_ENABLE_DEPRECATION_WARNINGS
         x4->params.rc.f_qcompress       = avctx->qcompress; /* 0.0 => cbr, 1.0 => constant qp */
     if (avctx->refs >= 0)
         x4->params.i_frame_reference    = avctx->refs;
+    else if (x4->level) {
+        int i;
+        int mbn = AV_CEIL_RSHIFT(avctx->width, 4) * AV_CEIL_RSHIFT(avctx->height, 4);
+        int level_id = -1;
+        char *tail;
+        int scale = X264_BUILD < 129 ? 384 : 1;
+
+        if (!strcmp(x4->level, "1b")) {
+            level_id = 9;
+        } else if (strlen(x4->level) <= 3){
+            level_id = av_strtod(x4->level, &tail) * 10 + 0.5;
+            if (*tail)
+                level_id = -1;
+        }
+        if (level_id <= 0)
+            av_log(avctx, AV_LOG_WARNING, "Failed to parse level\n");
+
+        for (i = 0; i<x264_levels[i].level_idc; i++)
+            if (x264_levels[i].level_idc == level_id)
+                x4->params.i_frame_reference = av_clip(x264_levels[i].dpb / mbn / scale, 1, x4->params.i_frame_reference);
+    }
+
     if (avctx->trellis >= 0)
         x4->params.analyse.i_trellis    = avctx->trellis;
     if (avctx->me_range >= 0)
@@ -521,6 +644,13 @@ FF_ENABLE_DEPRECATION_WARNINGS
         x4->params.b_bluray_compat = x4->bluray_compat;
         x4->params.b_vfr_input = 0;
     }
+    if (x4->avcintra_class >= 0)
+#if X264_BUILD >= 142
+        x4->params.i_avcintra_class = x4->avcintra_class;
+#else
+        av_log(avctx, AV_LOG_ERROR,
+               "x264 too old for AVC Intra, at least version 142 needed\n");
+#endif
     if (x4->b_bias != INT_MIN)
         x4->params.i_bframe_bias              = x4->b_bias;
     if (x4->b_pyramid >= 0)
@@ -544,6 +674,31 @@ FF_ENABLE_DEPRECATION_WARNINGS
     if (x4->fastfirstpass)
         x264_param_apply_fastfirstpass(&x4->params);
 
+    /* Allow specifying the x264 profile through AVCodecContext. */
+    if (!x4->profile)
+        switch (avctx->profile) {
+        case FF_PROFILE_H264_BASELINE:
+            x4->profile = av_strdup("baseline");
+            break;
+        case FF_PROFILE_H264_HIGH:
+            x4->profile = av_strdup("high");
+            break;
+        case FF_PROFILE_H264_HIGH_10:
+            x4->profile = av_strdup("high10");
+            break;
+        case FF_PROFILE_H264_HIGH_422:
+            x4->profile = av_strdup("high422");
+            break;
+        case FF_PROFILE_H264_HIGH_444:
+            x4->profile = av_strdup("high444");
+            break;
+        case FF_PROFILE_H264_MAIN:
+            x4->profile = av_strdup("main");
+            break;
+        default:
+            break;
+        }
+
     if (x4->nal_hrd >= 0)
         x4->params.i_nal_hrd = x4->nal_hrd;
 
@@ -574,16 +729,24 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
     if (x4->profile)
         if (x264_param_apply_profile(&x4->params, x4->profile) < 0) {
+            int i;
             av_log(avctx, AV_LOG_ERROR, "Error setting profile %s.\n", x4->profile);
+            av_log(avctx, AV_LOG_INFO, "Possible profiles:");
+            for (i = 0; x264_profile_names[i]; i++)
+                av_log(avctx, AV_LOG_INFO, " %s", x264_profile_names[i]);
+            av_log(avctx, AV_LOG_INFO, "\n");
             return AVERROR(EINVAL);
         }
 
     x4->params.i_width          = avctx->width;
     x4->params.i_height         = avctx->height;
-    x4->params.vui.i_sar_width  = avctx->sample_aspect_ratio.num;
-    x4->params.vui.i_sar_height = avctx->sample_aspect_ratio.den;
-    x4->params.i_fps_num = x4->params.i_timebase_den = avctx->time_base.den;
-    x4->params.i_fps_den = x4->params.i_timebase_num = avctx->time_base.num;
+    av_reduce(&sw, &sh, avctx->sample_aspect_ratio.num, avctx->sample_aspect_ratio.den, 4096);
+    x4->params.vui.i_sar_width  = sw;
+    x4->params.vui.i_sar_height = sh;
+    x4->params.i_timebase_den = avctx->time_base.den;
+    x4->params.i_timebase_num = avctx->time_base.num;
+    x4->params.i_fps_num = avctx->time_base.den;
+    x4->params.i_fps_den = avctx->time_base.num * avctx->ticks_per_frame;
 
     x4->params.analyse.b_psnr = avctx->flags & AV_CODEC_FLAG_PSNR;
 
@@ -602,14 +765,29 @@ FF_ENABLE_DEPRECATION_WARNINGS
                                  avctx->pix_fmt == AV_PIX_FMT_YUVJ444P ||
                                  avctx->color_range == AVCOL_RANGE_JPEG;
 
-    // x264 validates the values internally
-    x4->params.vui.i_colorprim = avctx->color_primaries;
-    x4->params.vui.i_transfer  = avctx->color_trc;
-    x4->params.vui.i_colmatrix = avctx->colorspace;
+    if (avctx->colorspace != AVCOL_SPC_UNSPECIFIED)
+        x4->params.vui.i_colmatrix = avctx->colorspace;
+    if (avctx->color_primaries != AVCOL_PRI_UNSPECIFIED)
+        x4->params.vui.i_colorprim = avctx->color_primaries;
+    if (avctx->color_trc != AVCOL_TRC_UNSPECIFIED)
+        x4->params.vui.i_transfer  = avctx->color_trc;
 
     if (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER)
         x4->params.b_repeat_headers = 0;
 
+    if(x4->x264opts){
+        const char *p= x4->x264opts;
+        while(p){
+            char param[256]={0}, val[256]={0};
+            if(sscanf(p, "%255[^:=]=%255[^:]", param, val) == 1){
+                OPT_STR(param, "1");
+            }else
+                OPT_STR(param, val);
+            p= strchr(p, ':');
+            p+=!!p;
+        }
+    }
+
     if (x4->x264_params) {
         AVDictionary *dict    = NULL;
         AVDictionaryEntry *en = NULL;
@@ -636,7 +814,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
     x4->enc = x264_encoder_open(&x4->params);
     if (!x4->enc)
-        return AVERROR_UNKNOWN;
+        return AVERROR_EXTERNAL;
 
     if (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) {
         x264_nal_t *nal;
@@ -701,6 +879,14 @@ static const enum AVPixelFormat pix_fmts_10bit[] = {
     AV_PIX_FMT_NV20,
     AV_PIX_FMT_NONE
 };
+#if CONFIG_LIBX264RGB_ENCODER
+static const enum AVPixelFormat pix_fmts_8bit_rgb[] = { /* pixel formats advertised by ff_libx264rgb_encoder via .pix_fmts */
+    AV_PIX_FMT_BGR0,
+    AV_PIX_FMT_BGR24,
+    AV_PIX_FMT_RGB24,
+    AV_PIX_FMT_NONE /* end-of-list marker */
+};
+#endif
 
 static av_cold void X264_init_static(AVCodec *codec)
 {
@@ -718,36 +904,44 @@ static const AVOption options[] = {
     { "preset",        "Set the encoding preset (cf. x264 --fullhelp)",   OFFSET(preset),        AV_OPT_TYPE_STRING, { .str = "medium" }, 0, 0, VE},
     { "tune",          "Tune the encoding params (cf. x264 --fullhelp)",  OFFSET(tune),          AV_OPT_TYPE_STRING, { 0 }, 0, 0, VE},
     { "profile",       "Set profile restrictions (cf. x264 --fullhelp) ", OFFSET(profile),       AV_OPT_TYPE_STRING, { 0 }, 0, 0, VE},
-    { "fastfirstpass", "Use fast settings when encoding first pass",      OFFSET(fastfirstpass), AV_OPT_TYPE_INT,    { .i64 = 1 }, 0, 1, VE},
+    { "fastfirstpass", "Use fast settings when encoding first pass",      OFFSET(fastfirstpass), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, VE},
+    {"level", "Specify level (as defined by Annex A)", OFFSET(level), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, VE},
+    {"passlogfile", "Filename for 2 pass stats", OFFSET(stats), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, VE},
+    {"wpredp", "Weighted prediction for P-frames", OFFSET(wpredp), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, VE},
+    {"a53cc",          "Use A53 Closed Captions (if available)",          OFFSET(a53_cc),        AV_OPT_TYPE_BOOL,   {.i64 = 0}, 0, 1, VE},
+    {"x264opts", "x264 options", OFFSET(x264opts), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, VE},
     { "crf",           "Select the quality for constant quality mode",    OFFSET(crf),           AV_OPT_TYPE_FLOAT,  {.dbl = -1 }, -1, FLT_MAX, VE },
     { "crf_max",       "In CRF mode, prevents VBV from lowering quality beyond this point.",OFFSET(crf_max), AV_OPT_TYPE_FLOAT, {.dbl = -1 }, -1, FLT_MAX, VE },
     { "qp",            "Constant quantization parameter rate control method",OFFSET(cqp),        AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, INT_MAX, VE },
     { "aq-mode",       "AQ method",                                       OFFSET(aq_mode),       AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, INT_MAX, VE, "aq_mode"},
     { "none",          NULL,                              0, AV_OPT_TYPE_CONST, {.i64 = X264_AQ_NONE},         INT_MIN, INT_MAX, VE, "aq_mode" },
     { "variance",      "Variance AQ (complexity mask)",   0, AV_OPT_TYPE_CONST, {.i64 = X264_AQ_VARIANCE},     INT_MIN, INT_MAX, VE, "aq_mode" },
-    { "autovariance",  "Auto-variance AQ (experimental)", 0, AV_OPT_TYPE_CONST, {.i64 = X264_AQ_AUTOVARIANCE}, INT_MIN, INT_MAX, VE, "aq_mode" },
+    { "autovariance",  "Auto-variance AQ",                0, AV_OPT_TYPE_CONST, {.i64 = X264_AQ_AUTOVARIANCE}, INT_MIN, INT_MAX, VE, "aq_mode" },
+#if X264_BUILD >= 144
+    { "autovariance-biased", "Auto-variance AQ with bias to dark scenes", 0, AV_OPT_TYPE_CONST, {.i64 = X264_AQ_AUTOVARIANCE_BIASED}, INT_MIN, INT_MAX, VE, "aq_mode" },
+#endif
     { "aq-strength",   "AQ strength. Reduces blocking and blurring in flat and textured areas.", OFFSET(aq_strength), AV_OPT_TYPE_FLOAT, {.dbl = -1}, -1, FLT_MAX, VE},
-    { "psy",           "Use psychovisual optimizations.",                 OFFSET(psy),           AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, 1, VE },
+    { "psy",           "Use psychovisual optimizations.",                 OFFSET(psy),           AV_OPT_TYPE_BOOL,   { .i64 = -1 }, -1, 1, VE },
     { "psy-rd",        "Strength of psychovisual optimization, in <psy-rd>:<psy-trellis> format.", OFFSET(psy_rd), AV_OPT_TYPE_STRING,  {0 }, 0, 0, VE},
     { "rc-lookahead",  "Number of frames to look ahead for frametype and ratecontrol", OFFSET(rc_lookahead), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, INT_MAX, VE },
-    { "weightb",       "Weighted prediction for B-frames.",               OFFSET(weightb),       AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, 1, VE },
+    { "weightb",       "Weighted prediction for B-frames.",               OFFSET(weightb),       AV_OPT_TYPE_BOOL,   { .i64 = -1 }, -1, 1, VE },
     { "weightp",       "Weighted prediction analysis method.",            OFFSET(weightp),       AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, INT_MAX, VE, "weightp" },
     { "none",          NULL, 0, AV_OPT_TYPE_CONST, {.i64 = X264_WEIGHTP_NONE},   INT_MIN, INT_MAX, VE, "weightp" },
     { "simple",        NULL, 0, AV_OPT_TYPE_CONST, {.i64 = X264_WEIGHTP_SIMPLE}, INT_MIN, INT_MAX, VE, "weightp" },
     { "smart",         NULL, 0, AV_OPT_TYPE_CONST, {.i64 = X264_WEIGHTP_SMART},  INT_MIN, INT_MAX, VE, "weightp" },
-    { "ssim",          "Calculate and print SSIM stats.",                 OFFSET(ssim),          AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, 1, VE },
-    { "intra-refresh", "Use Periodic Intra Refresh instead of IDR frames.",OFFSET(intra_refresh),AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, 1, VE },
-    { "bluray-compat", "Bluray compatibility workarounds.",               OFFSET(bluray_compat) ,AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, 1, VE },
+    { "ssim",          "Calculate and print SSIM stats.",                 OFFSET(ssim),          AV_OPT_TYPE_BOOL,   { .i64 = -1 }, -1, 1, VE },
+    { "intra-refresh", "Use Periodic Intra Refresh instead of IDR frames.",OFFSET(intra_refresh),AV_OPT_TYPE_BOOL,   { .i64 = -1 }, -1, 1, VE },
+    { "bluray-compat", "Bluray compatibility workarounds.",               OFFSET(bluray_compat) ,AV_OPT_TYPE_BOOL,   { .i64 = -1 }, -1, 1, VE },
     { "b-bias",        "Influences how often B-frames are used",          OFFSET(b_bias),        AV_OPT_TYPE_INT,    { .i64 = INT_MIN}, INT_MIN, INT_MAX, VE },
     { "b-pyramid",     "Keep some B-frames as references.",               OFFSET(b_pyramid),     AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, INT_MAX, VE, "b_pyramid" },
     { "none",          NULL,                                  0, AV_OPT_TYPE_CONST, {.i64 = X264_B_PYRAMID_NONE},   INT_MIN, INT_MAX, VE, "b_pyramid" },
     { "strict",        "Strictly hierarchical pyramid",       0, AV_OPT_TYPE_CONST, {.i64 = X264_B_PYRAMID_STRICT}, INT_MIN, INT_MAX, VE, "b_pyramid" },
     { "normal",        "Non-strict (not Blu-ray compatible)", 0, AV_OPT_TYPE_CONST, {.i64 = X264_B_PYRAMID_NORMAL}, INT_MIN, INT_MAX, VE, "b_pyramid" },
-    { "mixed-refs",    "One reference per partition, as opposed to one reference per macroblock", OFFSET(mixed_refs), AV_OPT_TYPE_INT, { .i64 = -1}, -1, 1, VE },
-    { "8x8dct",        "High profile 8x8 transform.",                     OFFSET(dct8x8),        AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, 1, VE},
-    { "fast-pskip",    NULL,                                              OFFSET(fast_pskip),    AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, 1, VE},
-    { "aud",           "Use access unit delimiters.",                     OFFSET(aud),           AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, 1, VE},
-    { "mbtree",        "Use macroblock tree ratecontrol.",                OFFSET(mbtree),        AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, 1, VE},
+    { "mixed-refs",    "One reference per partition, as opposed to one reference per macroblock", OFFSET(mixed_refs), AV_OPT_TYPE_BOOL, { .i64 = -1}, -1, 1, VE },
+    { "8x8dct",        "High profile 8x8 transform.",                     OFFSET(dct8x8),        AV_OPT_TYPE_BOOL,   { .i64 = -1 }, -1, 1, VE},
+    { "fast-pskip",    NULL,                                              OFFSET(fast_pskip),    AV_OPT_TYPE_BOOL,   { .i64 = -1 }, -1, 1, VE},
+    { "aud",           "Use access unit delimiters.",                     OFFSET(aud),           AV_OPT_TYPE_BOOL,   { .i64 = -1 }, -1, 1, VE},
+    { "mbtree",        "Use macroblock tree ratecontrol.",                OFFSET(mbtree),        AV_OPT_TYPE_BOOL,   { .i64 = -1 }, -1, 1, VE},
     { "deblock",       "Loop filter parameters, in <alpha:beta> form.",   OFFSET(deblock),       AV_OPT_TYPE_STRING, { 0 },  0, 0, VE},
     { "cplxblur",      "Reduce fluctuations in QP (before curve compression)", OFFSET(cplxblur), AV_OPT_TYPE_FLOAT,  {.dbl = -1 }, -1, FLT_MAX, VE},
     { "partitions",    "A comma-separated list of partitions to consider. "
@@ -764,17 +958,20 @@ static const AVOption options[] = {
     { "none",          NULL, 0, AV_OPT_TYPE_CONST, {.i64 = X264_NAL_HRD_NONE}, INT_MIN, INT_MAX, VE, "nal-hrd" },
     { "vbr",           NULL, 0, AV_OPT_TYPE_CONST, {.i64 = X264_NAL_HRD_VBR},  INT_MIN, INT_MAX, VE, "nal-hrd" },
     { "cbr",           NULL, 0, AV_OPT_TYPE_CONST, {.i64 = X264_NAL_HRD_CBR},  INT_MIN, INT_MAX, VE, "nal-hrd" },
+    { "avcintra-class","AVC-Intra class 50/100/200",                      OFFSET(avcintra_class),AV_OPT_TYPE_INT,     { .i64 = -1 }, -1, 200   , VE},
     { "motion-est",   "Set motion estimation method",                     OFFSET(motion_est),    AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, X264_ME_TESA, VE, "motion-est"},
     { "dia",           NULL, 0, AV_OPT_TYPE_CONST, { .i64 = X264_ME_DIA },  INT_MIN, INT_MAX, VE, "motion-est" },
     { "hex",           NULL, 0, AV_OPT_TYPE_CONST, { .i64 = X264_ME_HEX },  INT_MIN, INT_MAX, VE, "motion-est" },
     { "umh",           NULL, 0, AV_OPT_TYPE_CONST, { .i64 = X264_ME_UMH },  INT_MIN, INT_MAX, VE, "motion-est" },
     { "esa",           NULL, 0, AV_OPT_TYPE_CONST, { .i64 = X264_ME_ESA },  INT_MIN, INT_MAX, VE, "motion-est" },
     { "tesa",          NULL, 0, AV_OPT_TYPE_CONST, { .i64 = X264_ME_TESA }, INT_MIN, INT_MAX, VE, "motion-est" },
-    { "forced-idr",   "If forwarding iframes, require them to be IDR frames.", OFFSET(forced_idr),  AV_OPT_TYPE_INT,    { .i64 = 0 }, 0, 1, VE },
+    { "forced-idr",   "If forcing keyframes, force them as IDR frames.",                                  OFFSET(forced_idr),  AV_OPT_TYPE_BOOL,   { .i64 = -1 }, -1, 1, VE },
     { "coder",    "Coder type",                                           OFFSET(coder), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 1, VE, "coder" },
     { "default",          NULL, 0, AV_OPT_TYPE_CONST, { .i64 = -1 }, INT_MIN, INT_MAX, VE, "coder" },
     { "cavlc",            NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 0 },  INT_MIN, INT_MAX, VE, "coder" },
     { "cabac",            NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 1 },  INT_MIN, INT_MAX, VE, "coder" },
+    { "vlc",              NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 0 },  INT_MIN, INT_MAX, VE, "coder" },
+    { "ac",               NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 1 },  INT_MIN, INT_MAX, VE, "coder" },
     { "b_strategy",   "Strategy to choose between I/P/B-frames",          OFFSET(b_frame_strategy), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 2, VE },
     { "chromaoffset", "QP difference between chroma and luma",            OFFSET(chroma_offset), AV_OPT_TYPE_INT, { .i64 = -1 }, INT_MIN, INT_MAX, VE },
     { "sc_threshold", "Scene change threshold",                           OFFSET(scenechange_threshold), AV_OPT_TYPE_INT, { .i64 = -1 }, INT_MIN, INT_MAX, VE },
@@ -787,13 +984,16 @@ static const AVOption options[] = {
 static const AVCodecDefault x264_defaults[] = {
     { "b",                "0" },
     { "bf",               "-1" },
+    { "flags2",           "0" },
     { "g",                "-1" },
     { "i_qfactor",        "-1" },
+    { "b_qfactor",        "-1" },
     { "qmin",             "-1" },
     { "qmax",             "-1" },
     { "qdiff",            "-1" },
     { "qblur",            "-1" },
     { "qcomp",            "-1" },
+//     { "rc_lookahead",     "-1" },
     { "refs",             "-1" },
 #if FF_API_PRIVATE_OPT
     { "sc_threshold",     "-1" },
@@ -823,7 +1023,7 @@ static const AVCodecDefault x264_defaults[] = {
 };
 
 #if CONFIG_LIBX264_ENCODER
-static const AVClass class = {
+static const AVClass x264_class = {
     .class_name = "libx264",
     .item_name  = av_default_item_name,
     .option     = options,
@@ -840,7 +1040,7 @@ AVCodec ff_libx264_encoder = {
     .encode2          = X264_frame,
     .close            = X264_close,
     .capabilities     = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AUTO_THREADS,
-    .priv_class       = &class,
+    .priv_class       = &x264_class,
     .defaults         = x264_defaults,
     .init_static_data = X264_init_static,
     .caps_internal    = FF_CODEC_CAP_INIT_THREADSAFE |
@@ -848,6 +1048,30 @@ AVCodec ff_libx264_encoder = {
 };
 #endif
 
+#if CONFIG_LIBX264RGB_ENCODER
+static const AVClass rgbclass = { /* AVClass used as priv_class of ff_libx264rgb_encoder */
+    .class_name = "libx264rgb",
+    .item_name  = av_default_item_name,
+    .option     = options, /* same options[] table that x264_class points at */
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_libx264rgb_encoder = { /* H.264 encoder entry whose input formats are limited to pix_fmts_8bit_rgb */
+    .name           = "libx264rgb",
+    .long_name      = NULL_IF_CONFIG_SMALL("libx264 H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10 RGB"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_H264,
+    .priv_data_size = sizeof(X264Context),
+    .init           = X264_init,
+    .encode2        = X264_frame, /* encode2/close callbacks are the ones ff_libx264_encoder uses */
+    .close          = X264_close,
+    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AUTO_THREADS,
+    .priv_class     = &rgbclass,
+    .defaults       = x264_defaults,
+    .pix_fmts       = pix_fmts_8bit_rgb, /* fixed list; no .init_static_data is set here */
+}; /* NOTE(review): unlike ff_libx264_encoder, no .caps_internal is set - confirm this is intentional */
+#endif
+
 #if CONFIG_LIBX262_ENCODER
 static const AVClass X262_class = {
     .class_name = "libx262",
diff --git a/libavcodec/libx265.c b/libavcodec/libx265.c
index f5d3d0f..11088b2 100644
--- a/libavcodec/libx265.c
+++ b/libavcodec/libx265.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2013-2014 Derek Buitenhuis
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -82,14 +82,6 @@ static av_cold int libx265_encode_init(AVCodecContext *avctx)
     if (!ctx->api)
         ctx->api = x265_api_get(0);
 
-    if (avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL &&
-        !av_pix_fmt_desc_get(avctx->pix_fmt)->log2_chroma_w) {
-        av_log(avctx, AV_LOG_ERROR,
-               "4:2:2 and 4:4:4 support is not fully defined for HEVC yet. "
-               "Set -strict experimental to encode anyway.\n");
-        return AVERROR(ENOSYS);
-    }
-
     ctx->params = ctx->api->param_alloc();
     if (!ctx->params) {
         av_log(avctx, AV_LOG_ERROR, "Could not allocate x265 param structure.\n");
@@ -154,14 +146,23 @@ static av_cold int libx265_encode_init(AVCodecContext *avctx)
     switch (avctx->pix_fmt) {
     case AV_PIX_FMT_YUV420P:
     case AV_PIX_FMT_YUV420P10:
+    case AV_PIX_FMT_YUV420P12:
         ctx->params->internalCsp = X265_CSP_I420;
         break;
     case AV_PIX_FMT_YUV422P:
     case AV_PIX_FMT_YUV422P10:
+    case AV_PIX_FMT_YUV422P12:
         ctx->params->internalCsp = X265_CSP_I422;
         break;
+    case AV_PIX_FMT_GBRP:
+    case AV_PIX_FMT_GBRP10:
+    case AV_PIX_FMT_GBRP12:
+        ctx->params->vui.matrixCoeffs = AVCOL_SPC_RGB;
+        ctx->params->vui.bEnableVideoSignalTypePresentFlag  = 1;
+        ctx->params->vui.bEnableColorDescriptionPresentFlag = 1;
     case AV_PIX_FMT_YUV444P:
     case AV_PIX_FMT_YUV444P10:
+    case AV_PIX_FMT_YUV444P12:
         ctx->params->internalCsp = X265_CSP_I444;
         break;
     }
@@ -272,7 +273,7 @@ static int libx265_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     ret = ctx->api->encoder_encode(ctx->encoder, &nal, &nnal,
                                    pic ? &x265pic : NULL, &x265pic_out);
     if (ret < 0)
-        return AVERROR_UNKNOWN;
+        return AVERROR_EXTERNAL;
 
     if (!nnal)
         return 0;
@@ -323,6 +324,19 @@ static const enum AVPixelFormat x265_csp_eight[] = {
     AV_PIX_FMT_YUV420P,
     AV_PIX_FMT_YUV422P,
     AV_PIX_FMT_YUV444P,
+    AV_PIX_FMT_GBRP,
+    AV_PIX_FMT_NONE
+};
+
+static const enum AVPixelFormat x265_csp_ten[] = { /* list chosen by libx265_encode_init_csp when x265_api_get(12) fails but x265_api_get(10) succeeds */
+    AV_PIX_FMT_YUV420P, /* the four 8-bit entries of x265_csp_eight come first */
+    AV_PIX_FMT_YUV422P,
+    AV_PIX_FMT_YUV444P,
+    AV_PIX_FMT_GBRP,
+    AV_PIX_FMT_YUV420P10, /* followed by their 10-bit counterparts */
+    AV_PIX_FMT_YUV422P10,
+    AV_PIX_FMT_YUV444P10,
+    AV_PIX_FMT_GBRP10,
     AV_PIX_FMT_NONE
 };
 
@@ -330,18 +344,26 @@ static const enum AVPixelFormat x265_csp_twelve[] = {
     AV_PIX_FMT_YUV420P,
     AV_PIX_FMT_YUV422P,
     AV_PIX_FMT_YUV444P,
+    AV_PIX_FMT_GBRP,
     AV_PIX_FMT_YUV420P10,
     AV_PIX_FMT_YUV422P10,
     AV_PIX_FMT_YUV444P10,
+    AV_PIX_FMT_GBRP10,
+    AV_PIX_FMT_YUV420P12,
+    AV_PIX_FMT_YUV422P12,
+    AV_PIX_FMT_YUV444P12,
+    AV_PIX_FMT_GBRP12,
     AV_PIX_FMT_NONE
 };
 
 static av_cold void libx265_encode_init_csp(AVCodec *codec)
 {
-    if (x265_max_bit_depth == 8)
-        codec->pix_fmts = x265_csp_eight;
-    else if (x265_max_bit_depth == 12)
+    if (x265_api_get(12)) /* checked in the order 12, 10, 8; the first call that returns non-NULL selects the list */
         codec->pix_fmts = x265_csp_twelve;
+    else if (x265_api_get(10)) /* x265_csp_ten holds the 8- and 10-bit formats */
+        codec->pix_fmts = x265_csp_ten;
+    else if (x265_api_get(8))
+        codec->pix_fmts = x265_csp_eight; /* NOTE(review): codec->pix_fmts is not assigned at all if all three calls fail - confirm that is acceptable */
 }
 
 #define OFFSET(x) offsetof(libx265Context, x)
diff --git a/libavcodec/libxavs.c b/libavcodec/libxavs.c
index b7a6c41..f257e55 100644
--- a/libavcodec/libxavs.c
+++ b/libavcodec/libxavs.c
@@ -2,20 +2,20 @@
  * AVS encoding using the xavs library
  * Copyright (C) 2010 Amanda, Y.N. Wu <amanda11192003@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -93,10 +93,8 @@ static int encode_nals(AVCodecContext *ctx, AVPacket *pkt,
     for (i = 0; i < nnal; i++)
         size += nals[i].i_payload;
 
-    if ((ret = ff_alloc_packet(pkt, size)) < 0) {
-        av_log(ctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n", size);
+    if ((ret = ff_alloc_packet2(ctx, pkt, size, 0)) < 0)
         return ret;
-    }
     p = pkt->data;
 
     /* Write the SEI as part of the first frame. */
@@ -124,7 +122,7 @@ static int XAVS_frame(AVCodecContext *avctx, AVPacket *pkt,
     xavs_nal_t *nal;
     int nnal, i, ret;
     xavs_picture_t pic_out;
-    uint8_t *sd;
+    int pict_type;
 
     x4->pic.img.i_csp   = XAVS_CSP_I420;
     x4->pic.img.i_plane = 3;
@@ -151,7 +149,7 @@ static int XAVS_frame(AVCodecContext *avctx, AVPacket *pkt,
 
     if (!ret) {
         if (!frame && !(x4->end_of_stream)) {
-            if ((ret = ff_alloc_packet(pkt, 4)) < 0)
+            if ((ret = ff_alloc_packet2(avctx, pkt, 4, 0)) < 0)
                 return ret;
 
             pkt->data[0] = 0x0;
@@ -159,7 +157,7 @@ static int XAVS_frame(AVCodecContext *avctx, AVPacket *pkt,
             pkt->data[2] = 0x01;
             pkt->data[3] = 0xb1;
             pkt->dts = 2*x4->pts_buffer[(x4->out_frame_count-1)%(avctx->max_b_frames+1)] -
-                       x4->pts_buffer[(x4->out_frame_count-2)%(avctx->max_b_frames+1)];
+                         x4->pts_buffer[(x4->out_frame_count-2)%(avctx->max_b_frames+1)];
             x4->end_of_stream = END_OF_STREAM;
             *got_packet = 1;
         }
@@ -180,21 +178,24 @@ FF_ENABLE_DEPRECATION_WARNINGS
     } else
         pkt->dts = pkt->pts;
 
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
     switch (pic_out.i_type) {
     case XAVS_TYPE_IDR:
     case XAVS_TYPE_I:
-        avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
+        pict_type = AV_PICTURE_TYPE_I;
         break;
     case XAVS_TYPE_P:
-        avctx->coded_frame->pict_type = AV_PICTURE_TYPE_P;
+        pict_type = AV_PICTURE_TYPE_P;
         break;
     case XAVS_TYPE_B:
     case XAVS_TYPE_BREF:
-        avctx->coded_frame->pict_type = AV_PICTURE_TYPE_B;
+        pict_type = AV_PICTURE_TYPE_B;
         break;
+    default:
+        pict_type = AV_PICTURE_TYPE_NONE;
     }
+#if FF_API_CODED_FRAME
+FF_DISABLE_DEPRECATION_WARNINGS
+    avctx->coded_frame->pict_type = pict_type;
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
@@ -215,10 +216,7 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
-    sd = av_packet_new_side_data(pkt, AV_PKT_DATA_QUALITY_FACTOR, sizeof(int));
-    if (!sd)
-        return AVERROR(ENOMEM);
-    *(int *)sd = (pic_out.i_qpplus1 - 1) * FF_QP2LAMBDA;
+    ff_side_data_set_encoder_stats(pkt, (pic_out.i_qpplus1 - 1) * FF_QP2LAMBDA, NULL, 0, pict_type);
 
     x4->out_frame_count++;
     *got_packet = ret;
@@ -230,7 +228,7 @@ static av_cold int XAVS_close(AVCodecContext *avctx)
     XavsContext *x4 = avctx->priv_data;
 
     av_freep(&avctx->extradata);
-    av_free(x4->sei);
+    av_freep(&x4->sei);
     av_freep(&x4->pts_buffer);
 
     if (x4->enc)
@@ -419,12 +417,12 @@ FF_ENABLE_DEPRECATION_WARNINGS
     if (!x4->enc)
         return -1;
 
-    if (!(x4->pts_buffer = av_mallocz((avctx->max_b_frames+1) * sizeof(*x4->pts_buffer))))
+    if (!(x4->pts_buffer = av_mallocz_array((avctx->max_b_frames+1), sizeof(*x4->pts_buffer))))
         return AVERROR(ENOMEM);
 
     /* TAG: Do we have GLOBAL HEADER in AVS */
     /* We Have PPS and SPS in AVS */
-    if (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) {
+    if (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER && 0) {
         xavs_nal_t *nal;
         int nnal, s, i, size;
         uint8_t *p;
@@ -454,19 +452,19 @@ FF_ENABLE_DEPRECATION_WARNINGS
 #define OFFSET(x) offsetof(XavsContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
-    { "crf",           "Select the quality for constant quality mode",    OFFSET(crf),           AV_OPT_TYPE_FLOAT,  {-1 }, -1, FLT_MAX, VE },
+    { "crf",           "Select the quality for constant quality mode",    OFFSET(crf),           AV_OPT_TYPE_FLOAT,  {.dbl = -1 }, -1, FLT_MAX, VE },
     { "qp",            "Constant quantization parameter rate control method",OFFSET(cqp),        AV_OPT_TYPE_INT,    {.i64 = -1 }, -1, INT_MAX, VE },
     { "b-bias",        "Influences how often B-frames are used",          OFFSET(b_bias),        AV_OPT_TYPE_INT,    {.i64 = INT_MIN}, INT_MIN, INT_MAX, VE },
-    { "cplxblur",      "Reduce fluctuations in QP (before curve compression)", OFFSET(cplxblur), AV_OPT_TYPE_FLOAT,  {-1 }, -1, FLT_MAX, VE},
+    { "cplxblur",      "Reduce fluctuations in QP (before curve compression)", OFFSET(cplxblur), AV_OPT_TYPE_FLOAT,  {.dbl = -1 }, -1, FLT_MAX, VE},
     { "direct-pred",   "Direct MV prediction mode",                       OFFSET(direct_pred),   AV_OPT_TYPE_INT,    {.i64 = -1 }, -1, INT_MAX, VE, "direct-pred" },
     { "none",          NULL,      0,    AV_OPT_TYPE_CONST, { .i64 = XAVS_DIRECT_PRED_NONE },     0, 0, VE, "direct-pred" },
     { "spatial",       NULL,      0,    AV_OPT_TYPE_CONST, { .i64 = XAVS_DIRECT_PRED_SPATIAL },  0, 0, VE, "direct-pred" },
     { "temporal",      NULL,      0,    AV_OPT_TYPE_CONST, { .i64 = XAVS_DIRECT_PRED_TEMPORAL }, 0, 0, VE, "direct-pred" },
     { "auto",          NULL,      0,    AV_OPT_TYPE_CONST, { .i64 = XAVS_DIRECT_PRED_AUTO },     0, 0, VE, "direct-pred" },
-    { "aud",           "Use access unit delimiters.",                     OFFSET(aud),           AV_OPT_TYPE_INT,    {.i64 = -1 }, -1, 1, VE},
-    { "mbtree",        "Use macroblock tree ratecontrol.",                OFFSET(mbtree),        AV_OPT_TYPE_INT,    {.i64 = -1 }, -1, 1, VE},
-    { "mixed-refs",    "One reference per partition, as opposed to one reference per macroblock", OFFSET(mixed_refs), AV_OPT_TYPE_INT, {.i64 = -1}, -1, 1, VE },
-    { "fast-pskip",    NULL,                                              OFFSET(fast_pskip),    AV_OPT_TYPE_INT,    {.i64 = -1 }, -1, 1, VE},
+    { "aud",           "Use access unit delimiters.",                     OFFSET(aud),           AV_OPT_TYPE_BOOL,    {.i64 = -1 }, -1, 1, VE},
+    { "mbtree",        "Use macroblock tree ratecontrol.",                OFFSET(mbtree),        AV_OPT_TYPE_BOOL,    {.i64 = -1 }, -1, 1, VE},
+    { "mixed-refs",    "One reference per partition, as opposed to one reference per macroblock", OFFSET(mixed_refs), AV_OPT_TYPE_BOOL, {.i64 = -1}, -1, 1, VE },
+    { "fast-pskip",    NULL,                                              OFFSET(fast_pskip),    AV_OPT_TYPE_BOOL,    {.i64 = -1 }, -1, 1, VE},
     { "motion-est",   "Set motion estimation method",                     OFFSET(motion_est),    AV_OPT_TYPE_INT,    { .i64 = -1 }, -1, XAVS_ME_TESA, VE, "motion-est"},
     { "dia",           NULL,      0,    AV_OPT_TYPE_CONST, { .i64 = XAVS_ME_DIA },               INT_MIN, INT_MAX, VE, "motion-est" },
     { "hex",           NULL,      0,    AV_OPT_TYPE_CONST, { .i64 = XAVS_ME_HEX },               INT_MIN, INT_MAX, VE, "motion-est" },
@@ -481,7 +479,7 @@ static const AVOption options[] = {
     { NULL },
 };
 
-static const AVClass class = {
+static const AVClass xavs_class = {
     .class_name = "libxavs",
     .item_name  = av_default_item_name,
     .option     = options,
@@ -504,6 +502,6 @@ AVCodec ff_libxavs_encoder = {
     .close          = XAVS_close,
     .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AUTO_THREADS,
     .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE },
-    .priv_class     = &class,
+    .priv_class     = &xavs_class,
     .defaults       = xavs_defaults,
 };
diff --git a/libavcodec/libxvid.c b/libavcodec/libxvid.c
index 9cbe9c1..d916f11 100644
--- a/libavcodec/libxvid.c
+++ b/libavcodec/libxvid.c
@@ -2,20 +2,20 @@
  * Interface to xvidcore for MPEG-4 encoding
  * Copyright (c) 2004 Adam Thayer <krevnik@comcast.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,10 +27,11 @@
 
 #include <stdio.h>
 #include <string.h>
-#include <unistd.h>
 #include <xvid.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/cpu.h"
+#include "libavutil/file.h"
 #include "libavutil/internal.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/mathematics.h"
@@ -42,6 +43,14 @@
 #include "libxvid.h"
 #include "mpegutils.h"
 
+#if HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#if HAVE_IO_H
+#include <io.h>
+#endif
+
 /**
  * Buffer management macros.
  */
@@ -54,7 +63,7 @@
  * This stores all the private context for the codec.
  */
 struct xvid_context {
-    AVClass *class;                /**< Handle for Xvid encoder */
+    AVClass *class;
     void *encoder_handle;          /**< Handle for Xvid encoder */
     int xsize;                     /**< Frame x size */
     int ysize;                     /**< Frame y size */
@@ -66,6 +75,7 @@ struct xvid_context {
     char *twopassbuffer;           /**< Character buffer for two-pass */
     char *old_twopassbuffer;       /**< Old character buffer (two-pass) */
     char *twopassfile;             /**< second pass temp file name */
+    int twopassfd;
     unsigned char *intra_matrix;   /**< P-Frame Quant Matrix */
     unsigned char *inter_matrix;   /**< I-Frame Quant Matrix */
     int lumi_aq;                   /**< Lumi masking as an aq method */
@@ -85,6 +95,7 @@ struct xvid_ff_pass1 {
     struct xvid_context *context;   /**< Pointer to private context */
 };
 
+static int xvid_encode_close(AVCodecContext *avctx);
 static int xvid_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                              const AVFrame *picture, int *got_packet);
 
@@ -119,7 +130,7 @@ static int xvid_ff_2pass_create(xvid_plg_create_t *param, void **handle)
     /* This is because we can safely prevent a buffer overflow */
     log[0] = 0;
     snprintf(log, BUFFER_REMAINING(log),
-             "# avconv 2-pass log file, using xvid codec\n");
+             "# ffmpeg 2-pass log file, using xvid codec\n");
     snprintf(BUFFER_CAT(log), BUFFER_REMAINING(log),
              "# Do not modify. libxvidcore version: %d.%d.%d\n\n",
              XVID_VERSION_MAJOR(XVID_VERSION),
@@ -361,32 +372,34 @@ static void xvid_correct_framerate(AVCodecContext *avctx)
 
 static av_cold int xvid_encode_init(AVCodecContext *avctx)
 {
-    int xerr, i;
+    int xerr, i, ret = -1;
     int xvid_flags = avctx->flags;
     struct xvid_context *x = avctx->priv_data;
     uint16_t *intra, *inter;
     int fd;
 
-    xvid_plugin_single_t single         = { 0 };
-    struct xvid_ff_pass1 rc2pass1       = { 0 };
-    xvid_plugin_2pass2_t rc2pass2       = { 0 };
-    xvid_plugin_lumimasking_t masking_l = { 0 }; /* For lumi masking */
-    xvid_plugin_lumimasking_t masking_v = { 0 }; /* For variance AQ */
-    xvid_plugin_ssim_t ssim             = { 0 };
-    xvid_gbl_init_t xvid_gbl_init       = { 0 };
-    xvid_enc_create_t xvid_enc_create   = { 0 };
-    xvid_enc_plugin_t plugins[7];
-
-    /* Bring in VOP flags from avconv command-line */
-    x->vop_flags = XVID_VOP_HALFPEL; /* Bare minimum quality */
+    xvid_plugin_single_t      single          = { 0 };
+    struct xvid_ff_pass1      rc2pass1        = { 0 };
+    xvid_plugin_2pass2_t      rc2pass2        = { 0 };
+    xvid_plugin_lumimasking_t masking_l       = { 0 }; /* For lumi masking */
+    xvid_plugin_lumimasking_t masking_v       = { 0 }; /* For variance AQ */
+    xvid_plugin_ssim_t        ssim            = { 0 };
+    xvid_gbl_init_t           xvid_gbl_init   = { 0 };
+    xvid_enc_create_t         xvid_enc_create = { 0 };
+    xvid_enc_plugin_t         plugins[4];
+
+    x->twopassfd = -1;
+
+    /* Bring in VOP flags from ffmpeg command-line */
+    x->vop_flags = XVID_VOP_HALFPEL;              /* Bare minimum quality */
     if (xvid_flags & AV_CODEC_FLAG_4MV)
-        x->vop_flags |= XVID_VOP_INTER4V; /* Level 3 */
+        x->vop_flags    |= XVID_VOP_INTER4V;      /* Level 3 */
     if (avctx->trellis)
-        x->vop_flags |= XVID_VOP_TRELLISQUANT; /* Level 5 */
+        x->vop_flags    |= XVID_VOP_TRELLISQUANT; /* Level 5 */
     if (xvid_flags & AV_CODEC_FLAG_AC_PRED)
-        x->vop_flags |= XVID_VOP_HQACPRED; /* Level 6 */
+        x->vop_flags    |= XVID_VOP_HQACPRED;     /* Level 6 */
     if (xvid_flags & AV_CODEC_FLAG_GRAY)
-        x->vop_flags |= XVID_VOP_GREYSCALE;
+        x->vop_flags    |= XVID_VOP_GREYSCALE;
 
     /* Decide which ME quality setting to use */
     x->me_flags = 0;
@@ -448,7 +461,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
         break;
     }
 
-    /* Bring in VOL flags from avconv command-line */
+    /* Bring in VOL flags from ffmpeg command-line */
 #if FF_API_GMC
     if (avctx->flags & CODEC_FLAG_GMC)
         x->gmc = 1;
@@ -490,6 +503,18 @@ FF_ENABLE_DEPRECATION_WARNINGS
     xvid_enc_create.num_zones = 0;
 
     xvid_enc_create.num_threads = avctx->thread_count;
+#if (XVID_VERSION <= 0x010303) && (XVID_VERSION >= 0x010300)
+    /* workaround for a bug in libxvidcore: with height <= 16, threading is turned off or a multi-thread request is rejected */
+    if (avctx->height <= 16) {
+        if (avctx->thread_count < 2) {
+            xvid_enc_create.num_threads = 0;
+        } else {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Too small height for threads > 1.\n");
+            return AVERROR(EINVAL);
+        }
+    }
+#endif
 
     xvid_enc_create.plugins     = plugins;
     xvid_enc_create.num_plugins = 0;
@@ -519,26 +544,29 @@ FF_ENABLE_DEPRECATION_WARNINGS
         rc2pass2.version = XVID_VERSION;
         rc2pass2.bitrate = avctx->bit_rate;
 
-        fd = ff_tempfile("xvidff.", &x->twopassfile);
+        fd = avpriv_tempfile("xvidff.", &x->twopassfile, 0, avctx);
         if (fd < 0) {
             av_log(avctx, AV_LOG_ERROR, "Xvid: Cannot write 2-pass pipe\n");
             return fd;
         }
+        x->twopassfd = fd;
 
         if (!avctx->stats_in) {
             av_log(avctx, AV_LOG_ERROR,
                    "Xvid: No 2-pass information loaded for second pass\n");
-            return AVERROR_INVALIDDATA;
+            return AVERROR(EINVAL);
         }
 
-        if (strlen(avctx->stats_in) >
-            write(fd, avctx->stats_in, strlen(avctx->stats_in))) {
-            close(fd);
+        ret = write(fd, avctx->stats_in, strlen(avctx->stats_in));
+        if (ret == -1)
+            ret = AVERROR(errno);
+        else if (strlen(avctx->stats_in) > ret) {
             av_log(avctx, AV_LOG_ERROR, "Xvid: Cannot write to 2-pass pipe\n");
-            return AVERROR(EIO);
+            ret = AVERROR(EIO);
         }
+        if (ret < 0)
+            return ret;
 
-        close(fd);
         rc2pass2.filename                          = x->twopassfile;
         plugins[xvid_enc_create.num_plugins].func  = xvid_plugin_2pass2;
         plugins[xvid_enc_create.num_plugins].param = &rc2pass2;
@@ -556,12 +584,6 @@ FF_ENABLE_DEPRECATION_WARNINGS
     if (avctx->lumi_masking != 0.0)
         x->lumi_aq = 1;
 
-    if (x->lumi_aq && x->variance_aq) {
-        x->variance_aq = 0;
-        av_log(avctx, AV_LOG_WARNING,
-               "variance_aq is ignored when lumi_aq is set.\n");
-    }
-
     /* Luminance Masking */
     if (x->lumi_aq) {
         masking_l.method                          = 0;
@@ -582,6 +604,11 @@ FF_ENABLE_DEPRECATION_WARNINGS
         xvid_enc_create.num_plugins++;
     }
 
+    if (x->lumi_aq && x->variance_aq) /* informational only: neither setting is changed here */
+        av_log(avctx, AV_LOG_INFO,
+               "Both lumi_aq and variance_aq are enabled. The resulting quality "
+               "will be the worse one of the two effects made by the AQ.\n");
+
     /* SSIM */
     if (x->ssim) {
         plugins[xvid_enc_create.num_plugins].func  = xvid_plugin_ssim;
@@ -681,11 +708,13 @@ FF_ENABLE_DEPRECATION_WARNINGS
     if (avctx->max_b_frames > 0 && !x->quicktime_format)
         xvid_enc_create.global |= XVID_GLOBAL_PACKED;
 
+    av_assert0(xvid_enc_create.num_plugins + (!!x->ssim) + (!!x->variance_aq) + (!!x->lumi_aq) <= FF_ARRAY_ELEMS(plugins));
+
     /* Encode a dummy frame to get the extradata immediately */
     if (x->quicktime_format) {
         AVFrame *picture;
-        AVPacket packet;
-        int got_packet, ret;
+        AVPacket packet = {0};
+        int size, got_packet, ret;
 
         av_init_packet(&packet);
 
@@ -694,26 +723,26 @@ FF_ENABLE_DEPRECATION_WARNINGS
             return AVERROR(ENOMEM);
 
         xerr = xvid_encore(NULL, XVID_ENC_CREATE, &xvid_enc_create, NULL);
-        if (xerr) {
+        if( xerr ) {
             av_frame_free(&picture);
             av_log(avctx, AV_LOG_ERROR, "Xvid: Could not create encoder reference\n");
-            return AVERROR_UNKNOWN;
+            return AVERROR_EXTERNAL;
         }
         x->encoder_handle = xvid_enc_create.handle;
-
-        picture->width  = avctx->width;
-        picture->height = avctx->height;
-        picture->format = avctx->pix_fmt;
-
-        if ((ret = av_frame_get_buffer(picture, 32)) < 0) {
-            xvid_encore(x->encoder_handle, XVID_ENC_DESTROY, NULL, NULL);
+        size = ((avctx->width + 1) & ~1) * ((avctx->height + 1) & ~1);
+        picture->data[0] = av_malloc(size + size / 2);
+        if (!picture->data[0]) {
             av_frame_free(&picture);
-            return ret;
+            return AVERROR(ENOMEM);
         }
-
+        picture->data[1] = picture->data[0] + size;
+        picture->data[2] = picture->data[1] + size / 4;
+        memset(picture->data[0], 0, size);
+        memset(picture->data[1], 128, size / 2);
         ret = xvid_encode_frame(avctx, &packet, picture, &got_packet);
         if (!ret && got_packet)
             av_packet_unref(&packet);
+        av_free(picture->data[0]);
         av_frame_free(&picture);
         xvid_encore(x->encoder_handle, XVID_ENC_DESTROY, NULL, NULL);
     }
@@ -722,7 +751,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
     xerr = xvid_encore(NULL, XVID_ENC_CREATE, &xvid_enc_create, NULL);
     if (xerr) {
         av_log(avctx, AV_LOG_ERROR, "Xvid: Could not create encoder reference\n");
-        return -1;
+        return AVERROR_EXTERNAL;
     }
 
     x->encoder_handle  = xvid_enc_create.handle;
@@ -742,11 +771,8 @@ static int xvid_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     xvid_enc_frame_t xvid_enc_frame = { 0 };
     xvid_enc_stats_t xvid_enc_stats = { 0 };
 
-    if (!user_packet &&
-        (ret = av_new_packet(pkt, mb_width * mb_height * MAX_MB_BYTES + AV_INPUT_BUFFER_MIN_SIZE)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, mb_width*(int64_t)mb_height*MAX_MB_BYTES + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
         return ret;
-    }
 
     /* Start setting up the frame */
     xvid_enc_frame.version = XVID_VERSION;
@@ -760,7 +786,7 @@ static int xvid_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     if (avctx->pix_fmt != AV_PIX_FMT_YUV420P) {
         av_log(avctx, AV_LOG_ERROR,
                "Xvid: Color spaces other than 420P not supported\n");
-        return -1;
+        return AVERROR(EINVAL);
     }
 
     xvid_enc_frame.input.csp = XVID_CSP_PLANAR; /* YUV420P */
@@ -781,11 +807,13 @@ static int xvid_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                                                   XVID_TYPE_AUTO;
 
     /* Pixel aspect ratio setting */
-    if (avctx->sample_aspect_ratio.num < 1 || avctx->sample_aspect_ratio.num > 255 ||
-        avctx->sample_aspect_ratio.den < 1 || avctx->sample_aspect_ratio.den > 255) {
-        av_log(avctx, AV_LOG_ERROR, "Invalid pixel aspect ratio %i/%i\n",
+    if (avctx->sample_aspect_ratio.num < 0 || avctx->sample_aspect_ratio.num > 255 ||
+        avctx->sample_aspect_ratio.den < 0 || avctx->sample_aspect_ratio.den > 255) {
+        av_log(avctx, AV_LOG_WARNING,
+               "Invalid pixel aspect ratio %i/%i, limit is 255/255 reducing\n",
                avctx->sample_aspect_ratio.num, avctx->sample_aspect_ratio.den);
-        return -1;
+        av_reduce(&avctx->sample_aspect_ratio.num, &avctx->sample_aspect_ratio.den,
+                   avctx->sample_aspect_ratio.num,  avctx->sample_aspect_ratio.den, 255);
     }
     xvid_enc_frame.par        = XVID_PAR_EXT;
     xvid_enc_frame.par_width  = avctx->sample_aspect_ratio.num;
@@ -818,27 +846,28 @@ static int xvid_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     }
 
     if (xerr > 0) {
-        uint8_t *sd = av_packet_new_side_data(pkt, AV_PKT_DATA_QUALITY_FACTOR,
-                                              sizeof(int));
-        if (!sd)
-            return AVERROR(ENOMEM);
-        *(int *)sd = xvid_enc_stats.quant * FF_QP2LAMBDA;
+        int pict_type;
 
         *got_packet = 1;
 
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-        avctx->coded_frame->quality = xvid_enc_stats.quant * FF_QP2LAMBDA;
         if (xvid_enc_stats.type == XVID_TYPE_PVOP)
-            avctx->coded_frame->pict_type = AV_PICTURE_TYPE_P;
+            pict_type = AV_PICTURE_TYPE_P;
         else if (xvid_enc_stats.type == XVID_TYPE_BVOP)
-            avctx->coded_frame->pict_type = AV_PICTURE_TYPE_B;
+            pict_type = AV_PICTURE_TYPE_B;
         else if (xvid_enc_stats.type == XVID_TYPE_SVOP)
-            avctx->coded_frame->pict_type = AV_PICTURE_TYPE_S;
+            pict_type = AV_PICTURE_TYPE_S;
         else
-            avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
+            pict_type = AV_PICTURE_TYPE_I;
+
+#if FF_API_CODED_FRAME
+FF_DISABLE_DEPRECATION_WARNINGS
+        avctx->coded_frame->pict_type = pict_type;
+        avctx->coded_frame->quality = xvid_enc_stats.quant * FF_QP2LAMBDA;
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
+
+        ff_side_data_set_encoder_stats(pkt, xvid_enc_stats.quant * FF_QP2LAMBDA, NULL, 0, pict_type);
+
         if (xvid_enc_frame.out_flags & XVID_KEYFRAME) {
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
@@ -867,7 +896,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
             return 0;
         av_log(avctx, AV_LOG_ERROR,
                "Xvid: Encoding Error Occurred: %i\n", xerr);
-        return xerr;
+        return AVERROR_EXTERNAL;
     }
 }
 
@@ -882,12 +911,18 @@ static av_cold int xvid_encode_close(AVCodecContext *avctx)
 
     av_freep(&avctx->extradata);
     if (x->twopassbuffer) {
-        av_free(x->twopassbuffer);
-        av_free(x->old_twopassbuffer);
+        av_freep(&x->twopassbuffer);
+        av_freep(&x->old_twopassbuffer);
+        avctx->stats_out = NULL;
+    }
+    if (x->twopassfd>=0) {
+        unlink(x->twopassfile);
+        close(x->twopassfd);
+        x->twopassfd = -1;
     }
-    av_free(x->twopassfile);
-    av_free(x->intra_matrix);
-    av_free(x->inter_matrix);
+    av_freep(&x->twopassfile);
+    av_freep(&x->intra_matrix);
+    av_freep(&x->inter_matrix);
 
     return 0;
 }
diff --git a/libavcodec/libxvid.h b/libavcodec/libxvid.h
index 15f908f..ef9a5a9 100644
--- a/libavcodec/libxvid.h
+++ b/libavcodec/libxvid.h
@@ -1,20 +1,20 @@
 /*
  * copyright (C) 2006 Corey Hickey
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,4 @@
  * common functions for use with the Xvid wrappers
  */
 
-int ff_tempfile(const char *prefix, char **filename);
-
 #endif /* AVCODEC_LIBXVID_H */
diff --git a/libavcodec/libxvid_rc.c b/libavcodec/libxvid_rc.c
index 26f3c49..0e25a07 100644
--- a/libavcodec/libxvid_rc.c
+++ b/libavcodec/libxvid_rc.c
@@ -3,29 +3,33 @@
  *
  * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "config.h"
 
-#if !HAVE_MKSTEMP
-#include <fcntl.h>
+#if HAVE_IO_H
+#include <io.h>
 #endif
+
+#if HAVE_UNISTD_H
 #include <unistd.h>
+#endif
+
 #include <xvid.h>
 
 #include "libavutil/attributes.h"
@@ -35,43 +39,6 @@
 #include "libxvid.h"
 #include "mpegvideo.h"
 
-/* Wrapper to work around the lack of mkstemp() on mingw.
- * Also, tries to create file in /tmp first, if possible.
- * *prefix can be a character constant; *filename will be allocated internally.
- * @return file descriptor of opened file (or -1 on error)
- * and opened file name in **filename. */
-int ff_tempfile(const char *prefix, char **filename)
-{
-    int fd = -1;
-#if !HAVE_MKSTEMP
-    *filename = tempnam(".", prefix);
-#else
-    size_t len = strlen(prefix) + 12; /* room for "/tmp/" and "XXXXXX\0" */
-    *filename  = av_malloc(len);
-#endif
-    /* -----common section-----*/
-    if (!(*filename)) {
-        av_log(NULL, AV_LOG_ERROR, "ff_tempfile: Cannot allocate file name\n");
-        return AVERROR(ENOMEM);
-    }
-#if !HAVE_MKSTEMP
-    fd = avpriv_open(*filename, O_RDWR | O_BINARY | O_CREAT, 0444);
-#else
-    snprintf(*filename, len, "/tmp/%sXXXXXX", prefix);
-    fd = mkstemp(*filename);
-    if (fd < 0) {
-        snprintf(*filename, len, "./%sXXXXXX", prefix);
-        fd = mkstemp(*filename);
-    }
-#endif
-    /* -----common section-----*/
-    if (fd < 0) {
-        av_log(NULL, AV_LOG_ERROR, "ff_tempfile: Cannot open temporary file %s\n", *filename);
-        return AVERROR(EIO);
-    }
-    return fd; /* success */
-}
-
 av_cold int ff_xvid_rate_control_init(MpegEncContext *s)
 {
     char *tmp_name;
@@ -79,7 +46,7 @@ av_cold int ff_xvid_rate_control_init(MpegEncContext *s)
     xvid_plg_create_t xvid_plg_create = { 0 };
     xvid_plugin_2pass2_t xvid_2pass2  = { 0 };
 
-    fd = ff_tempfile("xvidrc.", &tmp_name);
+    fd = avpriv_tempfile("xvidrc.", &tmp_name, 0, s->avctx);
     if (fd < 0) {
         av_log(NULL, AV_LOG_ERROR, "Can't create temporary pass2 file.\n");
         return fd;
@@ -100,7 +67,13 @@ av_cold int ff_xvid_rate_control_init(MpegEncContext *s)
                  (rce->i_tex_bits + rce->p_tex_bits + rce->misc_bits + 7) / 8,
                  (rce->header_bits + rce->mv_bits + 7) / 8);
 
-        write(fd, tmp, strlen(tmp));
+        if (write(fd, tmp, strlen(tmp)) < 0) {
+            int ret = AVERROR(errno);
+            av_log(NULL, AV_LOG_ERROR, "Error %s writing 2pass logfile\n", av_err2str(ret));
+            av_free(tmp_name);
+            close(fd);
+            return ret;
+        }
     }
 
     close(fd);
diff --git a/libavcodec/libzvbi-teletextdec.c b/libavcodec/libzvbi-teletextdec.c
new file mode 100644
index 0000000..d1f0a9f
--- /dev/null
+++ b/libavcodec/libzvbi-teletextdec.c
@@ -0,0 +1,576 @@
+/*
+ * Teletext decoding for ffmpeg
+ * Copyright (c) 2005-2010, 2012 Wolfram Gloger
+ * Copyright (c) 2013 Marton Balint
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "libavcodec/ass.h"
+#include "libavcodec/dvbtxt.h"
+#include "libavutil/opt.h"
+#include "libavutil/bprint.h"
+#include "libavutil/internal.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/log.h"
+
+#include <libzvbi.h>
+
+#define TEXT_MAXSZ    (25 * (56 + 1) * 4 + 2) /* size of the buffer handed to vbi_print_page_region() */
+#define VBI_NB_COLORS 40 /* number of page->color_map entries copied into the palette */
+#define VBI_TRANSPARENT_BLACK 8 /* palette index that is forced to RGBA(0, 0, 0, 0) */
+#define RGBA(r,g,b,a) (((unsigned)(a) << 24) | ((r) << 16) | ((g) << 8) | (b)) /* alpha is shifted as unsigned so values >= 128 do not overflow int */
+#define VBI_R(rgba)   (((rgba) >> 0) & 0xFF) /* color_map entries carry red in the lowest byte */
+#define VBI_G(rgba)   (((rgba) >> 8) & 0xFF)
+#define VBI_B(rgba)   (((rgba) >> 16) & 0xFF)
+#define VBI_A(rgba)   (((rgba) >> 24) & 0xFF) /* ... and alpha in the highest byte */
+#define MAX_BUFFERED_PAGES 25 /* upper bound for TeletextContext.nb_pages */
+#define BITMAP_CHAR_WIDTH  12 /* pixels per character cell, horizontally */
+#define BITMAP_CHAR_HEIGHT 10 /* pixels per character cell, vertically */
+#define MAX_SLICES 64 /* capacity of TeletextContext.sliced */
+
+typedef struct TeletextPage /* one rendered page queued in TeletextContext.pages */
+{
+    AVSubtitleRect *sub_rect; /* rendered page, allocated and filled in handler() */
+    int pgno;                 /* page number taken from the libzvbi page event */
+    int subno;                /* subpage number taken from the libzvbi page event */
+    int64_t pts;              /* value of TeletextContext.pts when the page was queued */
+} TeletextPage;
+
+typedef struct TeletextContext /* private decoder state */
+{
+    AVClass        *class;
+    char           *pgno; /* page filter: a lone asterisk accepts every page, otherwise handler() searches it for the 3-digit hex page number */
+    int             x_offset; /* x position assigned to bitmap rects */
+    int             y_offset; /* y position assigned to bitmap rects (before the chopped top row is added) */
+    int             format_id; /* 0 = bitmap, 1 = text/ass */
+    int             chop_top; /* when non-zero the first page row is always skipped */
+    int             sub_duration; /* in msec */
+    int             transparent_bg; /* when set, opaque cells are processed like semi-transparent ones in fix_transparency() */
+    int             opacity; /* alpha used for the second half of the bitmap palette */
+    int             chop_spaces; /* when set, text rows are stripped of leading/trailing spaces */
+
+    int             lines_processed; /* running total of lines passed to vbi_decode() */
+    TeletextPage    *pages; /* pages queued by handler(), at most MAX_BUFFERED_PAGES */
+    int             nb_pages; /* number of valid entries in pages */
+    int64_t         pts; /* packet pts rescaled to AV_TIME_BASE_Q; reset to AV_NOPTS_VALUE after each packet */
+    int             handler_ret; /* set to the packet size before decoding; handler() stores a negative AVERROR on failure */
+
+    vbi_decoder *   vbi; /* created on the first decode call */
+#ifdef DEBUG
+    vbi_export *    ex;
+#endif
+    vbi_sliced      sliced[MAX_SLICES]; /* filled by slice_to_vbi_lines() for each packet */
+
+    int             readorder; /* incremented for every ASS dialog built in create_ass_text() */
+} TeletextContext;
+
+static int chop_spaces_utf8(const unsigned char* t, int len) /* returns len with trailing spaces of t[0..len) removed */
+{
+    t += len; /* start one past the last byte and walk backwards */
+    while (len > 0) {
+        if (*--t != ' ' || (len-1 > 0 && *(t-1) & 0x80)) /* stop at a non-space, or at a space preceded by a byte with the high bit set */
+            break;
+        --len;
+    }
+    return len;
+}
+
+static void subtitle_rect_free(AVSubtitleRect **sub_rect) /* frees the rect's buffers, then the rect itself */
+{
+    av_freep(&(*sub_rect)->data[0]); /* *sub_rect is dereferenced unconditionally, so it must not be NULL */
+    av_freep(&(*sub_rect)->data[1]);
+    av_freep(&(*sub_rect)->ass);
+    av_freep(sub_rect); /* av_freep() also resets the caller's pointer to NULL */
+}
+
+static char *create_ass_text(TeletextContext *ctx, const char *text) /* builds an ASS dialog line from text; NULL if the buffer could not be filled */
+{
+    char *dialog;
+    AVBPrint buf;
+
+    av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
+    ff_ass_bprint_text_event(&buf, text, strlen(text), "", 0);
+    if (!av_bprint_is_complete(&buf)) { /* the escaped text was truncated */
+        av_bprint_finalize(&buf, NULL);
+        return NULL;
+    }
+    dialog = ff_ass_get_dialog(ctx->readorder++, 0, NULL, NULL, buf.str); /* each dialog consumes one readorder value */
+    av_bprint_finalize(&buf, NULL);
+    return dialog; /* callers check this for NULL */
+}
+
+/* Draw a page as text: fills sub_rect with an ASS event (or SUBTITLE_NONE when empty); returns 0 or a negative AVERROR */
+static int gen_sub_text(TeletextContext *ctx, AVSubtitleRect *sub_rect, vbi_page *page, int chop_top)
+{
+    const char *in;
+    AVBPrint buf;
+    char *vbi_text = av_malloc(TEXT_MAXSZ);
+    int sz;
+
+    if (!vbi_text)
+        return AVERROR(ENOMEM);
+
+    sz = vbi_print_page_region(page, vbi_text, TEXT_MAXSZ-1, "UTF-8",
+                                   /*table mode*/ TRUE, FALSE,
+                                   0,             chop_top,
+                                   page->columns, page->rows-chop_top);
+    if (sz <= 0) {
+        av_log(ctx, AV_LOG_ERROR, "vbi_print error\n");
+        av_free(vbi_text);
+        return AVERROR_EXTERNAL;
+    }
+    vbi_text[sz] = '\0'; /* one byte was held back above for this terminator */
+    in  = vbi_text;
+    av_bprint_init(&buf, 0, TEXT_MAXSZ);
+
+    if (ctx->chop_spaces) {
+        for (;;) {
+            int nl, row_len; /* row_len no longer shadows the outer sz */
+
+            // skip leading spaces and newlines
+            in += strspn(in, " \n");
+            // compute end of row
+            for (nl = 0; in[nl]; ++nl)
+                if (in[nl] == '\n' && (nl==0 || !(in[nl-1] & 0x80)))
+                    break;
+            if (!in[nl]) /* NOTE(review): a final row without a terminating newline is not appended - confirm this is intended */
+                break;
+            // skip trailing spaces
+            row_len = chop_spaces_utf8((const unsigned char *)in, nl); /* explicit cast: the helper takes unsigned bytes */
+            av_bprint_append_data(&buf, in, row_len);
+            av_bprintf(&buf, "\n");
+            in += nl;
+        }
+    } else {
+        av_bprintf(&buf, "%s\n", vbi_text);
+    }
+    av_free(vbi_text);
+
+    if (!av_bprint_is_complete(&buf)) { /* output did not fit into the bounded buffer */
+        av_bprint_finalize(&buf, NULL);
+        return AVERROR(ENOMEM);
+    }
+
+    if (buf.len) {
+        sub_rect->type = SUBTITLE_ASS;
+        sub_rect->ass = create_ass_text(ctx, buf.str);
+
+        if (!sub_rect->ass) {
+            av_bprint_finalize(&buf, NULL);
+            return AVERROR(ENOMEM);
+        }
+        av_log(ctx, AV_LOG_DEBUG, "subtext:%s:txetbus\n", sub_rect->ass);
+    } else {
+        sub_rect->type = SUBTITLE_NONE; /* nothing printable was left */
+    }
+    av_bprint_finalize(&buf, NULL);
+    return 0;
+}
+
+static void fix_transparency(TeletextContext *ctx, AVSubtitleRect *sub_rect, vbi_page *page,
+                             int chop_top, int resx, int resy) /* remaps pixels of sub_rect->data[0] according to each character's opacity; resx is not used */
+{
+    int iy;
+
+    // Hack for transparency, inspired by VLC code...
+    for (iy = 0; iy < resy; iy++) { /* one pass per pixel row */
+        uint8_t *pixel = sub_rect->data[0] + iy * sub_rect->linesize[0];
+        vbi_char *vc = page->text + (iy / BITMAP_CHAR_HEIGHT + chop_top) * page->columns; /* first character of the text row covering this pixel row */
+        vbi_char *vcnext = vc + page->columns;
+        for (; vc < vcnext; vc++) {
+            uint8_t *pixelnext = pixel + BITMAP_CHAR_WIDTH;
+            switch (vc->opacity) {
+                case VBI_TRANSPARENT_SPACE:
+                    memset(pixel, VBI_TRANSPARENT_BLACK, BITMAP_CHAR_WIDTH); /* the whole cell becomes transparent */
+                    break;
+                case VBI_OPAQUE:
+                    if (!ctx->transparent_bg)
+                        break; /* otherwise falls through to the semi-transparent handling */
+                case VBI_SEMI_TRANSPARENT:
+                    if (ctx->opacity > 0) {
+                        if (ctx->opacity < 255)
+                            for(; pixel < pixelnext; pixel++)
+                                if (*pixel == vc->background)
+                                    *pixel += VBI_NB_COLORS; /* move background pixels to the second palette half */
+                        break;
+                    } /* opacity <= 0: falls through to the fully transparent handling */
+                case VBI_TRANSPARENT_FULL:
+                    for(; pixel < pixelnext; pixel++)
+                        if (*pixel == vc->background)
+                            *pixel = VBI_TRANSPARENT_BLACK;
+                    break;
+            }
+            pixel = pixelnext; /* continue with the next character cell regardless of how far the case advanced */
+        }
+    }
+}
+
+/* Draw a page as a PAL8 bitmap with palette into sub_rect (SUBTITLE_NONE for an empty page); returns 0 or a negative AVERROR */
+static int gen_sub_bitmap(TeletextContext *ctx, AVSubtitleRect *sub_rect, vbi_page *page, int chop_top)
+{
+    int resx = page->columns * BITMAP_CHAR_WIDTH;
+    int resy = (page->rows - chop_top) * BITMAP_CHAR_HEIGHT;
+    uint8_t ci;
+    vbi_char *vc = page->text + (chop_top * page->columns);
+    vbi_char *vcend = page->text + (page->rows * page->columns);
+
+    for (; vc < vcend; vc++) { /* look for the first cell that is not a transparent space */
+        if (vc->opacity != VBI_TRANSPARENT_SPACE)
+            break;
+    }
+
+    if (vc >= vcend) { /* every remaining cell is a transparent space */
+        av_log(ctx, AV_LOG_DEBUG, "dropping empty page %3x\n", page->pgno);
+        sub_rect->type = SUBTITLE_NONE;
+        return 0;
+    }
+
+    sub_rect->data[0] = av_mallocz(resx * resy); /* one byte per pixel */
+    sub_rect->linesize[0] = resx;
+    if (!sub_rect->data[0])
+        return AVERROR(ENOMEM);
+
+    vbi_draw_vt_page_region(page, VBI_PIXFMT_PAL8,
+                            sub_rect->data[0], sub_rect->linesize[0],
+                            0, chop_top, page->columns, page->rows - chop_top,
+                            /*reveal*/ 1, /*flash*/ 1);
+
+    fix_transparency(ctx, sub_rect, page, chop_top, resx, resy);
+    sub_rect->x = ctx->x_offset;
+    sub_rect->y = ctx->y_offset + chop_top * BITMAP_CHAR_HEIGHT; /* shifted down by the chopped rows */
+    sub_rect->w = resx;
+    sub_rect->h = resy;
+    sub_rect->nb_colors = ctx->opacity > 0 && ctx->opacity < 255 ? 2 * VBI_NB_COLORS : VBI_NB_COLORS; /* second palette half is only counted for partial opacity */
+    sub_rect->data[1] = av_mallocz(AVPALETTE_SIZE);
+    if (!sub_rect->data[1]) {
+        av_freep(&sub_rect->data[0]);
+        return AVERROR(ENOMEM);
+    }
+    for (ci = 0; ci < VBI_NB_COLORS; ci++) {
+        int r, g, b, a;
+
+        r = VBI_R(page->color_map[ci]);
+        g = VBI_G(page->color_map[ci]);
+        b = VBI_B(page->color_map[ci]);
+        a = VBI_A(page->color_map[ci]);
+        ((uint32_t *)sub_rect->data[1])[ci] = RGBA(r, g, b, a); /* first half: colors with their own alpha */
+        ((uint32_t *)sub_rect->data[1])[ci + VBI_NB_COLORS] = RGBA(r, g, b, ctx->opacity); /* second half: same colors with the configured opacity */
+        ff_dlog(ctx, "palette %0x\n", ((uint32_t *)sub_rect->data[1])[ci]);
+    }
+    ((uint32_t *)sub_rect->data[1])[VBI_TRANSPARENT_BLACK] = RGBA(0, 0, 0, 0); /* the transparent index is cleared in both halves */
+    ((uint32_t *)sub_rect->data[1])[VBI_TRANSPARENT_BLACK + VBI_NB_COLORS] = RGBA(0, 0, 0, 0);
+    sub_rect->type = SUBTITLE_BITMAP;
+    return 0;
+}
+
+static void handler(vbi_event *ev, void *user_data) /* page event callback: renders the page and queues it in ctx->pages; errors go to ctx->handler_ret */
+{
+    TeletextContext *ctx = user_data;
+    TeletextPage *new_pages;
+    vbi_page page;
+    int res;
+    char pgno_str[12];
+    vbi_subno subno;
+    vbi_page_type vpt;
+    int chop_top;
+    char *lang;
+
+    snprintf(pgno_str, sizeof pgno_str, "%03x", ev->ev.ttx_page.pgno);
+    av_log(ctx, AV_LOG_DEBUG, "decoded page %s.%02x\n",
+           pgno_str, ev->ev.ttx_page.subno & 0xFF);
+
+    if (strcmp(ctx->pgno, "*") && !strstr(ctx->pgno, pgno_str)) /* page is not selected by ctx->pgno */
+        return;
+    if (ctx->handler_ret < 0) /* an earlier event already recorded an error */
+        return;
+
+    res = vbi_fetch_vt_page(ctx->vbi, &page,
+                            ev->ev.ttx_page.pgno,
+                            ev->ev.ttx_page.subno,
+                            VBI_WST_LEVEL_3p5, 25, TRUE);
+
+    if (!res) /* page could not be fetched; nothing to release */
+        return;
+
+#ifdef DEBUG
+    fprintf(stderr, "\nSaving res=%d dy0=%d dy1=%d...\n",
+            res, page.dirty.y0, page.dirty.y1);
+    fflush(stderr);
+
+    if (!vbi_export_stdio(ctx->ex, stderr, &page))
+        fprintf(stderr, "failed: %s\n", vbi_export_errstr(ctx->ex));
+#endif
+
+    vpt = vbi_classify_page(ctx->vbi, ev->ev.ttx_page.pgno, &subno, &lang);
+    chop_top = ctx->chop_top ||
+        ((page.rows > 1) && (vpt == VBI_SUBTITLE_PAGE)); /* always 0 or 1: the top row is dropped on request or for subtitle pages */
+
+    av_log(ctx, AV_LOG_DEBUG, "%d x %d page chop:%d\n",
+           page.columns, page.rows, chop_top);
+
+    if (ctx->nb_pages < MAX_BUFFERED_PAGES) {
+        if ((new_pages = av_realloc_array(ctx->pages, ctx->nb_pages + 1, sizeof(TeletextPage)))) {
+            TeletextPage *cur_page = new_pages + ctx->nb_pages; /* the freshly added slot */
+            ctx->pages = new_pages;
+            cur_page->sub_rect = av_mallocz(sizeof(*cur_page->sub_rect));
+            cur_page->pts = ctx->pts;
+            cur_page->pgno = ev->ev.ttx_page.pgno;
+            cur_page->subno = ev->ev.ttx_page.subno;
+            if (cur_page->sub_rect) {
+                res = (ctx->format_id == 0) ?
+                    gen_sub_bitmap(ctx, cur_page->sub_rect, &page, chop_top) :
+                    gen_sub_text  (ctx, cur_page->sub_rect, &page, chop_top);
+                if (res < 0) {
+                    av_freep(&cur_page->sub_rect); /* slot stays allocated but is not counted in nb_pages */
+                    ctx->handler_ret = res;
+                } else {
+                    ctx->pages[ctx->nb_pages++] = *cur_page; /* cur_page already is this slot; the effect is to bump nb_pages */
+                }
+            } else {
+                ctx->handler_ret = AVERROR(ENOMEM);
+            }
+        } else {
+            ctx->handler_ret = AVERROR(ENOMEM); /* ctx->pages is left untouched when the reallocation fails */
+        }
+    } else {
+        //TODO: If multiple packets contain more than one page, pages may got queued up, and this may happen...
+        av_log(ctx, AV_LOG_ERROR, "Buffered too many pages, dropping page %s.\n", pgno_str);
+        ctx->handler_ret = AVERROR(ENOSYS);
+    }
+
+    vbi_unref_page(&page); /* release the page obtained from vbi_fetch_vt_page() */
+}
+
+static int slice_to_vbi_lines(TeletextContext *ctx, uint8_t* buf, int size) /* fills ctx->sliced from the data units in buf; returns the line count or AVERROR_INVALIDDATA */
+{
+    int lines = 0;
+    while (size >= 2 && lines < MAX_SLICES) { /* every unit starts with an id byte and a length byte */
+        int data_unit_id     = buf[0];
+        int data_unit_length = buf[1];
+        if (data_unit_length + 2 > size) /* unit would run past the end of the buffer */
+            return AVERROR_INVALIDDATA;
+        if (ff_data_unit_id_is_teletext(data_unit_id)) {
+            if (data_unit_length != 0x2c) /* teletext units are required to be exactly 44 bytes long */
+                return AVERROR_INVALIDDATA;
+            else {
+                int line_offset  = buf[2] & 0x1f;
+                int field_parity = buf[2] & 0x20;
+                int i;
+                ctx->sliced[lines].id = VBI_SLICED_TELETEXT_B;
+                ctx->sliced[lines].line = (line_offset > 0 ? (line_offset + (field_parity ? 0 : 313)) : 0); /* 313 is added when the parity bit is clear; 0 when no offset is given */
+                for (i = 0; i < 42; i++)
+                    ctx->sliced[lines].data[i] = vbi_rev8(buf[4 + i]); /* 42 payload bytes starting at byte 4, each passed through vbi_rev8() */
+                lines++;
+            }
+        }
+        size -= data_unit_length + 2; /* advance to the next unit; non-teletext units are simply skipped */
+        buf += data_unit_length + 2;
+    }
+    if (size) /* leftover bytes, e.g. a truncated header or more than MAX_SLICES lines */
+        av_log(ctx, AV_LOG_WARNING, "%d bytes remained after slicing data\n", size);
+    return lines;
+}
+/* Decode one packet: slice its teletext lines, feed them to zvbi, then hand out the first queued page (if any) as an AVSubtitle. */
+static int teletext_decode_frame(AVCodecContext *avctx, void *data, int *data_size, AVPacket *pkt)
+{
+    TeletextContext *ctx = avctx->priv_data;
+    AVSubtitle      *sub = data;
+    int             ret = 0;
+    int j;
+    /* The zvbi decoder is created on first use; teletext_close_decoder() deletes it again. */
+    if (!ctx->vbi) {
+        if (!(ctx->vbi = vbi_decoder_new()))
+            return AVERROR(ENOMEM);
+        if (!vbi_event_handler_add(ctx->vbi, VBI_EVENT_TTX_PAGE, handler, ctx)) {
+            vbi_decoder_delete(ctx->vbi);
+            ctx->vbi = NULL;
+            return AVERROR(ENOMEM);
+        }
+    }
+
+    if (avctx->pkt_timebase.num && pkt->pts != AV_NOPTS_VALUE)
+        ctx->pts = av_rescale_q(pkt->pts, avctx->pkt_timebase, AV_TIME_BASE_Q); /* packet pts rescaled to AV_TIME_BASE_Q */
+
+    if (pkt->size) {
+        int lines;
+        const int full_pes_size = pkt->size + 45; /* PES header is 45 bytes */
+
+        // We allow unreasonably big packets, even if the standard only allows a max size of 1472
+        if (full_pes_size < 184 || full_pes_size > 65504 || full_pes_size % 184 != 0)
+            return AVERROR_INVALIDDATA;
+
+        ctx->handler_ret = pkt->size; /* default result: packet size; NOTE(review): presumably handler() overwrites this on failure - confirm */
+
+        if (ff_data_identifier_is_teletext(*pkt->data)) {
+            if ((lines = slice_to_vbi_lines(ctx, pkt->data + 1, pkt->size - 1)) < 0)
+                return lines;
+            ff_dlog(avctx, "ctx=%p buf_size=%d lines=%u pkt_pts=%7.3f\n",
+                    ctx, pkt->size, lines, (double)pkt->pts/90000.0);
+            if (lines > 0) {
+#ifdef DEBUG
+                int i;
+                av_log(avctx, AV_LOG_DEBUG, "line numbers:");
+                for(i = 0; i < lines; i++)
+                    av_log(avctx, AV_LOG_DEBUG, " %d", ctx->sliced[i].line);
+                av_log(avctx, AV_LOG_DEBUG, "\n");
+#endif
+                vbi_decode(ctx->vbi, ctx->sliced, lines, 0.0); /* presumably triggers handler() for completed pages - confirm */
+                ctx->lines_processed += lines;
+            }
+        }
+        ctx->pts = AV_NOPTS_VALUE; /* the stored pts is only valid while this packet is processed */
+        ret = ctx->handler_ret;
+    }
+
+    if (ret < 0)
+        return ret;
+
+    // is there a subtitle to pass?
+    if (ctx->nb_pages) {
+        int i;
+        sub->format = ctx->format_id;
+        sub->start_display_time = 0;
+        sub->end_display_time = ctx->sub_duration;
+        sub->num_rects = 0;
+        sub->pts = ctx->pages->pts; /* ctx->pages points at the first queued page */
+
+        if (ctx->pages->sub_rect->type != SUBTITLE_NONE) {
+            sub->rects = av_malloc(sizeof(*sub->rects)); /* one-entry array; the rect itself is handed over from the page */
+            if (sub->rects) {
+                sub->num_rects = 1;
+                sub->rects[0] = ctx->pages->sub_rect;
+#if FF_API_AVPICTURE
+FF_DISABLE_DEPRECATION_WARNINGS
+                for (j = 0; j < 4; j++) {
+                    sub->rects[0]->pict.data[j] = sub->rects[0]->data[j];
+                    sub->rects[0]->pict.linesize[j] = sub->rects[0]->linesize[j];
+                }
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+            } else {
+                ret = AVERROR(ENOMEM);
+            }
+        } else {
+            av_log(avctx, AV_LOG_DEBUG, "sending empty sub\n");
+            sub->rects = NULL;
+        }
+        if (!sub->rects) // no rect was passed
+            subtitle_rect_free(&ctx->pages->sub_rect);
+        /* Pop the head of the page queue by shifting the remaining entries down. */
+        for (i = 0; i < ctx->nb_pages - 1; i++)
+            ctx->pages[i] = ctx->pages[i + 1];
+        ctx->nb_pages--;
+
+        if (ret >= 0)
+            *data_size = 1; /* a subtitle is being returned */
+    } else
+        *data_size = 0;
+
+    return ret;
+}
+
+/* Codec init: requires zvbi >= 0.2.26, resets the decoder state, resolves the default opacity and sets
+ * the bitmap size. Returns 0, the ASS header result (text format) or AVERROR_EXTERNAL (zvbi too old). */
+static int teletext_init_decoder(AVCodecContext *avctx)
+{
+    TeletextContext *ctx = avctx->priv_data;
+    unsigned int ver_major, ver_minor, ver_micro;
+
+    vbi_version(&ver_major, &ver_minor, &ver_micro);
+    if (ver_major == 0 && (ver_minor < 2 || (ver_minor == 2 && ver_micro < 26))) {
+        av_log(avctx, AV_LOG_ERROR, "decoder needs zvbi version >= 0.2.26.\n");
+        return AVERROR_EXTERNAL;
+    }
+    ctx->vbi = NULL;
+    ctx->pts = AV_NOPTS_VALUE;
+    if (ctx->opacity == -1) /* -1 is the txt_opacity default: derive it from txt_transparent */
+        ctx->opacity = ctx->transparent_bg ? 0 : 255;
+    if (ctx->format_id == 0) { /* txt_format "bitmap" */
+        avctx->width  = 41 * BITMAP_CHAR_WIDTH;
+        avctx->height = 25 * BITMAP_CHAR_HEIGHT;
+    }
+#ifdef DEBUG
+    {
+        char *err;
+        ctx->ex = vbi_export_new("text", &err);
+    }
+#endif
+    av_log(avctx, AV_LOG_VERBOSE, "page filter: %s\n", ctx->pgno);
+    if (ctx->format_id != 1) /* only txt_format "text" needs an ASS header */
+        return 0;
+    return ff_ass_subtitle_header_default(avctx);
+}
+
+/* Free every queued page and the zvbi decoder and reset the stored pts; always returns 0.
+ * readorder is left untouched when AV_CODEC_FLAG2_RO_FLUSH_NOOP is set in avctx->flags2. */
+static int teletext_close_decoder(AVCodecContext *avctx)
+{
+    TeletextContext *ctx = avctx->priv_data;
+    ff_dlog(avctx, "lines_total=%u\n", ctx->lines_processed);
+    for (; ctx->nb_pages; ctx->nb_pages--) /* release rects from the back of the queue */
+        subtitle_rect_free(&ctx->pages[ctx->nb_pages - 1].sub_rect);
+    av_freep(&ctx->pages);
+    vbi_decoder_delete(ctx->vbi);
+    ctx->vbi = NULL;
+    ctx->pts = AV_NOPTS_VALUE;
+    if (!(avctx->flags2 & AV_CODEC_FLAG2_RO_FLUSH_NOOP))
+        ctx->readorder = 0;
+    return 0;
+}
+/* Flush callback: reuses the close path, which drops all queued pages and deletes the zvbi decoder. */
+static void teletext_flush(AVCodecContext *avctx)
+{
+    teletext_close_decoder(avctx); /* teletext_decode_frame() recreates ctx->vbi when it is NULL */
+}
+/* Decoder options: OFFSET() gives the TeletextContext field each option is stored in, SD is the common flag set. */
+#define OFFSET(x) offsetof(TeletextContext, x)
+#define SD AV_OPT_FLAG_SUBTITLE_PARAM | AV_OPT_FLAG_DECODING_PARAM
+static const AVOption options[] = {
+    {"txt_page",        "list of teletext page numbers to decode, * is all", OFFSET(pgno),           AV_OPT_TYPE_STRING, {.str = "*"},      0, 0,        SD},
+    {"txt_chop_top",    "discards the top teletext line",                    OFFSET(chop_top),       AV_OPT_TYPE_INT,    {.i64 = 1},        0, 1,        SD},
+    {"txt_format",      "format of the subtitles (bitmap or text)",          OFFSET(format_id),      AV_OPT_TYPE_INT,    {.i64 = 0},        0, 1,        SD,  "txt_format"},
+    {"bitmap",          NULL,                                                0,                      AV_OPT_TYPE_CONST,  {.i64 = 0},        0, 0,        SD,  "txt_format"},
+    {"text",            NULL,                                                0,                      AV_OPT_TYPE_CONST,  {.i64 = 1},        0, 0,        SD,  "txt_format"},
+    {"txt_left",        "x offset of generated bitmaps",                     OFFSET(x_offset),       AV_OPT_TYPE_INT,    {.i64 = 0},        0, 65535,    SD},
+    {"txt_top",         "y offset of generated bitmaps",                     OFFSET(y_offset),       AV_OPT_TYPE_INT,    {.i64 = 0},        0, 65535,    SD},
+    {"txt_chop_spaces", "chops leading and trailing spaces from text",       OFFSET(chop_spaces),    AV_OPT_TYPE_INT,    {.i64 = 1},        0, 1,        SD},
+    {"txt_duration",    "display duration of teletext pages in msecs",       OFFSET(sub_duration),   AV_OPT_TYPE_INT,    {.i64 = 30000},    0, 86400000, SD},
+    {"txt_transparent", "force transparent background of the teletext",      OFFSET(transparent_bg), AV_OPT_TYPE_INT,    {.i64 = 0},        0, 1,        SD},
+    {"txt_opacity",     "set opacity of the transparent background",         OFFSET(opacity),        AV_OPT_TYPE_INT,    {.i64 = -1},      -1, 255,      SD},
+    { NULL },
+};
+/* AVClass for the decoder; it publishes the options[] table and is referenced as priv_class by the codec entry. */
+static const AVClass teletext_class = {
+    .class_name = "libzvbi_teletextdec",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+/* Codec entry: subtitle decoder for AV_CODEC_ID_DVB_TELETEXT, wired to the init/close/decode/flush callbacks of this file. */
+AVCodec ff_libzvbi_teletext_decoder = {
+    .name      = "libzvbi_teletextdec",
+    .long_name = NULL_IF_CONFIG_SMALL("Libzvbi DVB teletext decoder"),
+    .type      = AVMEDIA_TYPE_SUBTITLE,
+    .id        = AV_CODEC_ID_DVB_TELETEXT,
+    .priv_data_size = sizeof(TeletextContext),
+    .init      = teletext_init_decoder,
+    .close     = teletext_close_decoder,
+    .decode    = teletext_decode_frame,
+    .capabilities = AV_CODEC_CAP_DELAY,
+    .flush     = teletext_flush, /* flush shares the teardown done by .close */
+    .priv_class= &teletext_class,
+};
diff --git a/libavcodec/ljpegenc.c b/libavcodec/ljpegenc.c
index b6d73a4..afaab05 100644
--- a/libavcodec/ljpegenc.c
+++ b/libavcodec/ljpegenc.c
@@ -8,20 +8,20 @@
  * aspecting, new decode_frame mechanism and apple mjpeg-b support
  *                                  by Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -48,8 +48,8 @@ typedef struct LJpegEncContext {
     ScanTable scantable;
     uint16_t matrix[64];
 
-    int vsample[3];
-    int hsample[3];
+    int vsample[4];
+    int hsample[4];
 
     uint16_t huff_code_dc_luminance[12];
     uint16_t huff_code_dc_chrominance[12];
@@ -68,7 +68,7 @@ static int ljpeg_encode_bgr(AVCodecContext *avctx, PutBitContext *pb,
     const int height      = frame->height;
     const int linesize    = frame->linesize[0];
     uint16_t (*buffer)[4] = s->scratch;
-    int left[3], top[3], topleft[3];
+    int left[4], top[4], topleft[4];
     int x, y, i;
 
 #if FF_API_PRIVATE_OPT
@@ -78,27 +78,35 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
-    for (i = 0; i < 3; i++)
+    for (i = 0; i < 4; i++)
         buffer[0][i] = 1 << (9 - 1);
 
     for (y = 0; y < height; y++) {
         const int modified_predictor = y ? s->pred : 1;
         uint8_t *ptr = frame->data[0] + (linesize * y);
 
-        if (pb->buf_end - pb->buf - (put_bits_count(pb) >> 3) < width * 3 * 3) {
+        if (pb->buf_end - pb->buf - (put_bits_count(pb) >> 3) < width * 4 * 4) {
             av_log(avctx, AV_LOG_ERROR, "encoded frame too large\n");
             return -1;
         }
 
-        for (i = 0; i < 3; i++)
+        for (i = 0; i < 4; i++)
             top[i]= left[i]= topleft[i]= buffer[0][i];
 
         for (x = 0; x < width; x++) {
-            buffer[x][1] =  ptr[3 * x + 0] -     ptr[3 * x + 1] + 0x100;
-            buffer[x][2] =  ptr[3 * x + 2] -     ptr[3 * x + 1] + 0x100;
-            buffer[x][0] = (ptr[3 * x + 0] + 2 * ptr[3 * x + 1] + ptr[3 * x + 2]) >> 2;
+            if(avctx->pix_fmt == AV_PIX_FMT_BGR24){
+                buffer[x][1] =  ptr[3 * x + 0] -     ptr[3 * x + 1] + 0x100;
+                buffer[x][2] =  ptr[3 * x + 2] -     ptr[3 * x + 1] + 0x100;
+                buffer[x][0] = (ptr[3 * x + 0] + 2 * ptr[3 * x + 1] + ptr[3 * x + 2]) >> 2;
+            }else{
+                buffer[x][1] =  ptr[4 * x + 0] -     ptr[4 * x + 1] + 0x100;
+                buffer[x][2] =  ptr[4 * x + 2] -     ptr[4 * x + 1] + 0x100;
+                buffer[x][0] = (ptr[4 * x + 0] + 2 * ptr[4 * x + 1] + ptr[4 * x + 2]) >> 2;
+                if (avctx->pix_fmt == AV_PIX_FMT_BGRA)
+                    buffer[x][3] =  ptr[4 * x + 3];
+            }
 
-            for (i = 0; i < 3; i++) {
+            for (i = 0; i < 3 + (avctx->pix_fmt == AV_PIX_FMT_BGRA); i++) {
                 int pred, diff;
 
                 PREDICT(pred, topleft[i], top[i], left[i], modified_predictor);
@@ -110,7 +118,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
                 diff       = ((left[i] - pred + 0x100) & 0x1FF) - 0x100;
 
-                if (i == 0)
+                if (i == 0 || i == 3)
                     ff_mjpeg_encode_dc(pb, diff, s->huff_size_dc_luminance, s->huff_code_dc_luminance); //FIXME ugly
                 else
                     ff_mjpeg_encode_dc(pb, diff, s->huff_size_dc_chrominance, s->huff_code_dc_chrominance);
@@ -227,25 +235,29 @@ static int ljpeg_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     int max_pkt_size = AV_INPUT_BUFFER_MIN_SIZE;
     int ret, header_bits;
 
-    if (avctx->pix_fmt == AV_PIX_FMT_BGR24)
-        max_pkt_size += width * height * 3 * 3;
+    if(    avctx->pix_fmt == AV_PIX_FMT_BGR0
+        || avctx->pix_fmt == AV_PIX_FMT_BGR24)
+        max_pkt_size += width * height * 3 * 4;
+    else if(avctx->pix_fmt == AV_PIX_FMT_BGRA)
+        max_pkt_size += width * height * 4 * 4;
     else {
         max_pkt_size += mb_width * mb_height * 3 * 4
                         * s->hsample[0] * s->vsample[0];
     }
-    if ((ret = ff_alloc_packet(pkt, max_pkt_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n", max_pkt_size);
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, max_pkt_size, 0)) < 0)
         return ret;
-    }
 
     init_put_bits(&pb, pkt->data, pkt->size);
 
     ff_mjpeg_encode_picture_header(avctx, &pb, &s->scantable,
-                                   s->pred, s->matrix);
+                                   s->pred, s->matrix, s->matrix);
 
     header_bits = put_bits_count(&pb);
 
-    if (avctx->pix_fmt == AV_PIX_FMT_BGR24)
+    if(    avctx->pix_fmt == AV_PIX_FMT_BGR0
+        || avctx->pix_fmt == AV_PIX_FMT_BGRA
+        || avctx->pix_fmt == AV_PIX_FMT_BGR24)
         ret = ljpeg_encode_bgr(avctx, &pb, pict);
     else
         ret = ljpeg_encode_yuv(avctx, &pb, pict);
@@ -254,6 +266,7 @@ static int ljpeg_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 
     emms_c();
 
+    ff_mjpeg_escape_FF(&pb, header_bits >> 3);
     ff_mjpeg_encode_picture_trailer(&pb, header_bits);
 
     flush_put_bits(&pb);
@@ -276,7 +289,6 @@ static av_cold int ljpeg_encode_close(AVCodecContext *avctx)
 static av_cold int ljpeg_encode_init(AVCodecContext *avctx)
 {
     LJpegEncContext *s = avctx->priv_data;
-    int chroma_v_shift, chroma_h_shift;
 
     if ((avctx->pix_fmt == AV_PIX_FMT_YUV420P ||
          avctx->pix_fmt == AV_PIX_FMT_YUV422P ||
@@ -297,26 +309,14 @@ FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
     s->scratch = av_malloc_array(avctx->width + 1, sizeof(*s->scratch));
+    if (!s->scratch)
+        goto fail;
 
     ff_idctdsp_init(&s->idsp, avctx);
     ff_init_scantable(s->idsp.idct_permutation, &s->scantable,
                       ff_zigzag_direct);
 
-    av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &chroma_h_shift,
-                                     &chroma_v_shift);
-
-    if (avctx->pix_fmt   == AV_PIX_FMT_BGR24) {
-        s->vsample[0] = s->hsample[0] =
-        s->vsample[1] = s->hsample[1] =
-        s->vsample[2] = s->hsample[2] = 1;
-    } else {
-        s->vsample[0] = 2;
-        s->vsample[1] = 2 >> chroma_v_shift;
-        s->vsample[2] = 2 >> chroma_v_shift;
-        s->hsample[0] = 2;
-        s->hsample[1] = 2 >> chroma_h_shift;
-        s->hsample[2] = 2 >> chroma_h_shift;
-    }
+    ff_mjpeg_init_hvsample(avctx, s->hsample, s->vsample);
 
     ff_mjpeg_build_huffman_codes(s->huff_size_dc_luminance,
                                  s->huff_code_dc_luminance,
@@ -328,6 +328,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
                                  avpriv_mjpeg_val_dc);
 
     return 0;
+fail:
+    ljpeg_encode_close(avctx);
+    return AVERROR(ENOMEM);
 }
 
 #define OFFSET(x) offsetof(LJpegEncContext, x)
@@ -358,12 +361,10 @@ AVCodec ff_ljpeg_encoder = {
     .init           = ljpeg_encode_init,
     .encode2        = ljpeg_encode_frame,
     .close          = ljpeg_encode_close,
-    .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUVJ420P,
-                                                    AV_PIX_FMT_YUVJ422P,
-                                                    AV_PIX_FMT_YUVJ444P,
-                                                    AV_PIX_FMT_BGR24,
-                                                    AV_PIX_FMT_YUV420P,
-                                                    AV_PIX_FMT_YUV422P,
-                                                    AV_PIX_FMT_YUV444P,
-                                                    AV_PIX_FMT_NONE },
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
+    .pix_fmts       = (const enum AVPixelFormat[]){
+        AV_PIX_FMT_BGR24   , AV_PIX_FMT_BGRA    , AV_PIX_FMT_BGR0,
+        AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_YUVJ422P,
+        AV_PIX_FMT_YUV420P , AV_PIX_FMT_YUV444P , AV_PIX_FMT_YUV422P,
+        AV_PIX_FMT_NONE},
 };
diff --git a/libavcodec/loco.c b/libavcodec/loco.c
index f25ef61..9d0f144 100644
--- a/libavcodec/loco.c
+++ b/libavcodec/loco.c
@@ -2,20 +2,20 @@
  * LOCO codec
  * Copyright (c) 2005 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -46,7 +46,7 @@ enum LOCO_MODE {
 typedef struct LOCOContext {
     AVCodecContext *avctx;
     int lossy;
-    int mode;
+    enum LOCO_MODE mode;
 } LOCOContext;
 
 typedef struct RICEContext {
@@ -130,9 +130,15 @@ static int loco_decode_plane(LOCOContext *l, uint8_t *data, int width, int heigh
 {
     RICEContext rc;
     int val;
+    int ret;
     int i, j;
 
-    init_get_bits(&rc.gb, buf, buf_size*8);
+    if(buf_size<=0)
+        return -1;
+
+    if ((ret = init_get_bits8(&rc.gb, buf, buf_size)) < 0)
+        return ret;
+
     rc.save  = 0;
     rc.run   = 0;
     rc.run2  = 0;
@@ -165,6 +171,23 @@ static int loco_decode_plane(LOCOContext *l, uint8_t *data, int width, int heigh
     return (get_bits_count(&rc.gb) + 7) >> 3;
 }
 
+/* For every row y >= 1 with y <= width: shift the row left by y pixels (step bytes each),
+ * then, unless it is the last row, refill the freed tail with the first y pixels of row y+1.
+ * stride is the byte distance between rows and may be negative. */
+static void rotate_faulty_loco(uint8_t *data, int width, int height, int stride, int step)
+{
+    int row;
+    for (row = 1; row < height; row++) {
+        if (width >= row) {
+            uint8_t *line  = data + row * stride;
+            const int keep = step * (width - row); /* bytes of this row that survive the shift */
+            memmove(line, line + row * step, keep);
+            if (row + 1 < height) /* tail comes from the start of the next row */
+                memmove(line + keep, data + (row + 1) * stride, step * row);
+        }
+    }
+}
+
 static int decode_frame(AVCodecContext *avctx,
                         void *data, int *got_frame,
                         AVPacket *avpkt)
@@ -175,88 +198,72 @@ static int decode_frame(AVCodecContext *avctx,
     AVFrame * const p     = data;
     int decoded, ret;
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
     p->key_frame = 1;
 
+#define ADVANCE_BY_DECODED do { \
+    if (decoded < 0 || decoded >= buf_size) goto buf_too_small; \
+    buf += decoded; buf_size -= decoded; \
+} while(0)
     switch(l->mode) {
     case LOCO_CYUY2: case LOCO_YUY2: case LOCO_UYVY:
         decoded = loco_decode_plane(l, p->data[0], avctx->width, avctx->height,
                                     p->linesize[0], buf, buf_size, 1);
-        if (decoded >= buf_size)
-            goto buf_too_small;
-        buf += decoded; buf_size -= decoded;
-
+        ADVANCE_BY_DECODED;
         decoded = loco_decode_plane(l, p->data[1], avctx->width / 2, avctx->height,
                                     p->linesize[1], buf, buf_size, 1);
-        if (decoded >= buf_size)
-            goto buf_too_small;
-        buf += decoded; buf_size -= decoded;
-
+        ADVANCE_BY_DECODED;
         decoded = loco_decode_plane(l, p->data[2], avctx->width / 2, avctx->height,
                                     p->linesize[2], buf, buf_size, 1);
         break;
     case LOCO_CYV12: case LOCO_YV12:
         decoded = loco_decode_plane(l, p->data[0], avctx->width, avctx->height,
                                     p->linesize[0], buf, buf_size, 1);
-        if (decoded >= buf_size)
-            goto buf_too_small;
-        buf += decoded; buf_size -= decoded;
-
+        ADVANCE_BY_DECODED;
         decoded = loco_decode_plane(l, p->data[2], avctx->width / 2, avctx->height / 2,
                                     p->linesize[2], buf, buf_size, 1);
-        if (decoded >= buf_size)
-            goto buf_too_small;
-        buf += decoded; buf_size -= decoded;
-
+        ADVANCE_BY_DECODED;
         decoded = loco_decode_plane(l, p->data[1], avctx->width / 2, avctx->height / 2,
                                     p->linesize[1], buf, buf_size, 1);
         break;
     case LOCO_CRGB: case LOCO_RGB:
         decoded = loco_decode_plane(l, p->data[0] + p->linesize[0]*(avctx->height-1), avctx->width, avctx->height,
                                     -p->linesize[0], buf, buf_size, 3);
-        if (decoded >= buf_size)
-            goto buf_too_small;
-        buf += decoded; buf_size -= decoded;
-
+        ADVANCE_BY_DECODED;
         decoded = loco_decode_plane(l, p->data[0] + p->linesize[0]*(avctx->height-1) + 1, avctx->width, avctx->height,
                                     -p->linesize[0], buf, buf_size, 3);
-        if (decoded >= buf_size)
-            goto buf_too_small;
-        buf += decoded; buf_size -= decoded;
-
+        ADVANCE_BY_DECODED;
         decoded = loco_decode_plane(l, p->data[0] + p->linesize[0]*(avctx->height-1) + 2, avctx->width, avctx->height,
                                     -p->linesize[0], buf, buf_size, 3);
+        if (avctx->width & 1)
+            rotate_faulty_loco(p->data[0] + p->linesize[0]*(avctx->height-1), avctx->width, avctx->height, -p->linesize[0], 3);
         break;
+    case LOCO_CRGBA:
     case LOCO_RGBA:
-        decoded = loco_decode_plane(l, p->data[0], avctx->width, avctx->height,
-                                    p->linesize[0], buf, buf_size, 4);
-        if (decoded >= buf_size)
-            goto buf_too_small;
-        buf += decoded; buf_size -= decoded;
-
-        decoded = loco_decode_plane(l, p->data[0] + 1, avctx->width, avctx->height,
-                                    p->linesize[0], buf, buf_size, 4);
-        if (decoded >= buf_size)
-            goto buf_too_small;
-        buf += decoded; buf_size -= decoded;
-
-        decoded = loco_decode_plane(l, p->data[0] + 2, avctx->width, avctx->height,
-                                    p->linesize[0], buf, buf_size, 4);
-        if (decoded >= buf_size)
-            goto buf_too_small;
-        buf += decoded; buf_size -= decoded;
-
-        decoded = loco_decode_plane(l, p->data[0] + 3, avctx->width, avctx->height,
-                                    p->linesize[0], buf, buf_size, 4);
+        decoded = loco_decode_plane(l, p->data[0] + p->linesize[0]*(avctx->height-1), avctx->width, avctx->height,
+                                    -p->linesize[0], buf, buf_size, 4);
+        ADVANCE_BY_DECODED;
+        decoded = loco_decode_plane(l, p->data[0] + p->linesize[0]*(avctx->height-1) + 1, avctx->width, avctx->height,
+                                    -p->linesize[0], buf, buf_size, 4);
+        ADVANCE_BY_DECODED;
+        decoded = loco_decode_plane(l, p->data[0] + p->linesize[0]*(avctx->height-1) + 2, avctx->width, avctx->height,
+                                    -p->linesize[0], buf, buf_size, 4);
+        ADVANCE_BY_DECODED;
+        decoded = loco_decode_plane(l, p->data[0] + p->linesize[0]*(avctx->height-1) + 3, avctx->width, avctx->height,
+                                    -p->linesize[0], buf, buf_size, 4);
         break;
+    default:
+        av_assert0(0);
     }
 
+    if (decoded < 0 || decoded > buf_size)
+        goto buf_too_small;
+    buf_size -= decoded;
+
     *got_frame      = 1;
 
-    return buf_size;
+    return avpkt->size - buf_size;
 buf_too_small:
     av_log(avctx, AV_LOG_ERROR, "Input data too small.\n");
     return AVERROR(EINVAL);
@@ -303,7 +310,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
         break;
     case LOCO_CRGBA:
     case LOCO_RGBA:
-        avctx->pix_fmt = AV_PIX_FMT_RGB32;
+        avctx->pix_fmt = AV_PIX_FMT_BGRA;
         break;
     default:
         av_log(avctx, AV_LOG_INFO, "Unknown colorspace, index = %i\n", l->mode);
diff --git a/libavcodec/lossless_audiodsp.c b/libavcodec/lossless_audiodsp.c
new file mode 100644
index 0000000..3a9f9b2
--- /dev/null
+++ b/libavcodec/lossless_audiodsp.c
@@ -0,0 +1,67 @@
+/*
+ * Monkey's Audio lossless audio decoder
+ * Copyright (c) 2007 Benjamin Zores <ben@geexbox.org>
+ *  based upon libdemac from Dave Chapman.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "lossless_audiodsp.h"
+
+/* Returns the scalar product of v1 (before its update) and v2 while doing v1[i] += mul * v3[i].
+ * Two elements per pass, so order must be a positive even number; the sum wraps modulo 2^32. */
+static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
+                                              const int16_t *v3, int order, int mul)
+{
+    unsigned res = 0; /* unsigned on purpose: accumulating in int is signed-overflow UB */
+    do {
+        res   += *v1 * *v2++;
+        *v1++ += mul * *v3++;
+        res   += *v1 * *v2++;
+        *v1++ += mul * *v3++;
+    } while (order-=2);
+    return res;
+}
+
+/* Scalar product of v1 (before its update) and the 32-bit v2, with v1[i] += mul * v3[i] applied on
+ * the way; products are formed in uint32_t. Two elements per pass: order must be positive and even. */
+static int32_t scalarproduct_and_madd_int32_c(int16_t *v1, const int32_t *v2,
+                                              const int16_t *v3, int order, int mul)
+{
+    int sum = 0, half;
+    do {
+        for (half = 0; half < 2; half++, v1++) {
+            sum += *v1 * (uint32_t)*v2++;
+            *v1 += mul * *v3++;
+        }
+    } while (order -= 2);
+    return sum;
+}
+/* Fill c with the C implementations, then hand it to the init function of each architecture whose ARCH_* flag is set. */
+av_cold void ff_llauddsp_init(LLAudDSPContext *c)
+{
+    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
+    c->scalarproduct_and_madd_int32 = scalarproduct_and_madd_int32_c;
+
+    if (ARCH_ARM) /* NOTE(review): the arch init functions presumably replace pointers with optimized versions - confirm */
+        ff_llauddsp_init_arm(c);
+    if (ARCH_PPC)
+        ff_llauddsp_init_ppc(c);
+    if (ARCH_X86)
+        ff_llauddsp_init_x86(c);
+}
diff --git a/libavcodec/apedsp.h b/libavcodec/lossless_audiodsp.h
index 64e2749..eea5d49 100644
--- a/libavcodec/apedsp.h
+++ b/libavcodec/lossless_audiodsp.h
@@ -3,42 +3,49 @@
  * Copyright (c) 2007 Benjamin Zores <ben@geexbox.org>
  *  based upon libdemac from Dave Chapman.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVCODEC_APEDSP_H
-#define AVCODEC_APEDSP_H
+#ifndef AVCODEC_LOSSLESS_AUDIODSP_H
+#define AVCODEC_LOSSLESS_AUDIODSP_H
 
 #include <stdint.h>
 
-typedef struct APEDSPContext {
+typedef struct LLAudDSPContext {
     /**
      * Calculate scalar product of v1 and v2,
      * and v1[i] += v3[i] * mul
-     * @param len length of vectors, should be multiple of 16
+     * @param len length of vectors, should be multiple of 16,
+     *            or pad v3 and v1 or v2 with zeros.
      */
     int32_t (*scalarproduct_and_madd_int16)(int16_t *v1 /* align 16 */,
                                             const int16_t *v2,
                                             const int16_t *v3,
                                             int len, int mul);
-} APEDSPContext;
 
-void ff_apedsp_init_arm(APEDSPContext *c);
-void ff_apedsp_init_ppc(APEDSPContext *c);
-void ff_apedsp_init_x86(APEDSPContext *c);
+    int32_t (*scalarproduct_and_madd_int32)(int16_t *v1 /* align 16 */,
+                                            const int32_t *v2,
+                                            const int16_t *v3,
+                                            int len, int mul);
+} LLAudDSPContext;
+
+void ff_llauddsp_init(LLAudDSPContext *c);
+void ff_llauddsp_init_arm(LLAudDSPContext *c);
+void ff_llauddsp_init_ppc(LLAudDSPContext *c);
+void ff_llauddsp_init_x86(LLAudDSPContext *c);
 
-#endif /* AVCODEC_APEDSP_H */
+#endif /* AVCODEC_LOSSLESS_AUDIODSP_H */
diff --git a/libavcodec/lossless_videodsp.c b/libavcodec/lossless_videodsp.c
new file mode 100644
index 0000000..3491621
--- /dev/null
+++ b/libavcodec/lossless_videodsp.c
@@ -0,0 +1,128 @@
+/*
+ * Lossless video DSP utils
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "avcodec.h"
+#include "lossless_videodsp.h"
+#include "libavcodec/mathops.h"
+/* In-place add: the tail loop computes dst[i] = (dst[i] + src[i]) & mask; the first loop handles one long of samples per pass. */
+static void add_int16_c(uint16_t *dst, const uint16_t *src, unsigned mask, int w){
+    long i;
+    unsigned long pw_lsb = (mask >> 1) * 0x0001000100010001ULL; /* (mask >> 1) replicated into every 16-bit lane */
+    unsigned long pw_msb = pw_lsb +  0x0001000100010001ULL; /* pw_lsb + 1 per lane: the top mask bit if mask is 2^n - 1 (assumed - TODO confirm) */
+    for (i = 0; i <= w - (int)sizeof(long)/2; i += sizeof(long)/2) { /* sizeof(long)/2 samples per iteration */
+        long a = *(long*)(src+i);
+        long b = *(long*)(dst+i);
+        *(long*)(dst+i) = ((a&pw_lsb) + (b&pw_lsb)) ^ ((a^b)&pw_msb); /* add the low bits, then xor in the top bit so no carry leaves a lane */
+    }
+    for(; i<w; i++) /* samples that did not fill a whole long */
+        dst[i] = (dst[i] + src[i]) & mask;
+}
+/* Difference: the scalar loops compute dst[i] = (src1[i] - src2[i]) & mask; the long path assumes mask is 2^n - 1 (TODO confirm). */
+static void diff_int16_c(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w){
+    long i;
+#if !HAVE_FAST_UNALIGNED
+    if((long)src2 & (sizeof(long)-1)){ /* src2 is not long-aligned: scalar loop, four samples per pass */
+        for(i=0; i+3<w; i+=4){
+            dst[i+0] = (src1[i+0]-src2[i+0]) & mask;
+            dst[i+1] = (src1[i+1]-src2[i+1]) & mask;
+            dst[i+2] = (src1[i+2]-src2[i+2]) & mask;
+            dst[i+3] = (src1[i+3]-src2[i+3]) & mask;
+        }
+    }else
+#endif
+    {
+        unsigned long pw_lsb = (mask >> 1) * 0x0001000100010001ULL; /* (mask >> 1) replicated into every 16-bit lane */
+        unsigned long pw_msb = pw_lsb +  0x0001000100010001ULL; /* pw_lsb + 1 per lane */
+        /* One long (sizeof(long)/2 samples) per iteration; setting pw_msb in a keeps borrows inside each lane. */
+        for (i = 0; i <= w - (int)sizeof(long)/2; i += sizeof(long)/2) {
+            long a = *(long*)(src1+i);
+            long b = *(long*)(src2+i);
+            *(long*)(dst+i) = ((a|pw_msb) - (b&pw_lsb)) ^ ((a^b^pw_msb)&pw_msb);
+        }
+    }
+    for (; i<w; i++) /* whatever the branch taken above left over */
+        dst[i] = (src1[i] - src2[i]) & mask;
+}
+
+/* Reconstruct a row: dst[i] = (mid_pred(left, top, (left + top - topleft) & mask) + diff[i]) & mask
+ * with top = src[i]. *left and *left_top seed the running state and receive it back on return. */
+static void add_hfyu_median_pred_int16_c(uint16_t *dst, const uint16_t *src, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top){
+    uint16_t cur     = *left;
+    uint16_t topleft = *left_top;
+    int x;
+
+    for (x = 0; x < w; x++) {
+        const uint16_t top = src[x];
+        cur     = (mid_pred(cur, top, (cur + top - topleft) & mask) + diff[x]) & mask;
+        topleft = top;
+        dst[x]  = cur;
+    }
+    *left     = cur;
+    *left_top = topleft;
+}
+
+/* Compute residuals: dst[i] = (src2[i] - mid_pred(left, top, (left + top - topleft) & mask)) & mask
+ * with top = src1[i] and left = the previous src2 sample. *left / *left_top seed and return the state. */
+static void sub_hfyu_median_pred_int16_c(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top){
+    uint16_t prev    = *left;
+    uint16_t topleft = *left_top;
+    int x;
+
+    for (x = 0; x < w; x++) {
+        const uint16_t top = src1[x];
+        const int guess = mid_pred(prev, top, (prev + top - topleft) & mask);
+        prev    = src2[x];
+        topleft = top;
+        dst[x]  = (prev - guess) & mask;
+    }
+    *left     = prev;
+    *left_top = topleft;
+}
+
+/*
+ * Left (running-sum) prediction for one row of 16-bit samples.
+ *
+ * Starting from acc, every src[i] (0 <= i < w) is added to the accumulator and
+ * the masked sum is stored in dst[i]. The accumulator itself is never masked
+ * and wraps as an unsigned value.
+ *
+ * Returns the final accumulator, converted to int.
+ */
+static int add_hfyu_left_pred_int16_c(uint16_t *dst, const uint16_t *src, unsigned mask, int w, unsigned acc){
+    int pos;
+
+    for (pos = 0; pos < w; pos++) {
+        acc     += src[pos];
+        dst[pos] = acc & mask;
+    }
+    return acc;
+}
+
+/* Fill c with the C implementations from this file; when ARCH_X86 is set, also call the x86 init with the same context. */
+void ff_llviddsp_init(LLVidDSPContext *c, AVCodecContext *avctx)
+{
+    c->add_int16 = add_int16_c;
+    c->diff_int16= diff_int16_c;
+    c->add_hfyu_left_pred_int16   = add_hfyu_left_pred_int16_c;
+    c->add_hfyu_median_pred_int16 = add_hfyu_median_pred_int16_c;
+    c->sub_hfyu_median_pred_int16 = sub_hfyu_median_pred_int16_c;
+
+    if (ARCH_X86) /* NOTE(review): presumably overrides some pointers with optimized versions - confirm */
+        ff_llviddsp_init_x86(c, avctx);
+}
diff --git a/libavcodec/lossless_videodsp.h b/libavcodec/lossless_videodsp.h
new file mode 100644
index 0000000..040902e
--- /dev/null
+++ b/libavcodec/lossless_videodsp.h
@@ -0,0 +1,40 @@
+/*
+ * Lossless video DSP utils
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#ifndef AVCODEC_LOSSLESS_VIDEODSP_H
+#define AVCODEC_LOSSLESS_VIDEODSP_H
+
+#include "avcodec.h"
+#include "libavutil/cpu.h"
+
+typedef struct LLVidDSPContext { /* function-pointer table; ff_llviddsp_init() assigns all five entries */
+    void (*add_int16)(uint16_t *dst/*align 16*/, const uint16_t *src/*align 16*/, unsigned mask, int w);
+    void (*diff_int16)(uint16_t *dst/*align 16*/, const uint16_t *src1/*align 16*/, const uint16_t *src2/*align 1*/, unsigned mask, int w);
+    /* Prediction helpers: the median variants take left/left_top by pointer, and their C versions write both back. */
+    void (*sub_hfyu_median_pred_int16)(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top);
+    void (*add_hfyu_median_pred_int16)(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top);
+    int  (*add_hfyu_left_pred_int16)(uint16_t *dst, const uint16_t *src, unsigned mask, int w, unsigned left);
+} LLVidDSPContext;
+
+void ff_llviddsp_init(LLVidDSPContext *llviddsp, AVCodecContext *avctx);
+void ff_llviddsp_init_x86(LLVidDSPContext *llviddsp, AVCodecContext *avctx);
+
+#endif //AVCODEC_LOSSLESS_VIDEODSP_H
diff --git a/libavcodec/lpc.c b/libavcodec/lpc.c
index 1482e57..052aeaa 100644
--- a/libavcodec/lpc.c
+++ b/libavcodec/lpc.c
@@ -2,20 +2,20 @@
  * LPC utility code
  * Copyright (c) 2006  Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@
 
 #define LPC_USE_DOUBLE
 #include "lpc.h"
+#include "libavutil/avassert.h"
 
 
 /**
@@ -36,13 +37,19 @@ static void lpc_apply_welch_window_c(const int32_t *data, int len,
     double w;
     double c;
 
-    /* The optimization in commit fa4ed8c does not support odd len.
-     * If someone wants odd len extend that change. */
-    assert(!(len & 1));
-
     n2 = (len >> 1);
     c = 2.0 / (len - 1.0);
 
+    if (len & 1) {
+        for(i=0; i<n2; i++) {
+            w = c - i - 1.0;
+            w = 1.0 - (w * w);
+            w_data[i] = data[i] * w;
+            w_data[len-1-i] = data[len-1-i] * w;
+        }
+        return;
+    }
+
     w_data+=n2;
       data+=n2;
     for(i=0; i<n2; i++) {
@@ -160,6 +167,29 @@ int ff_lpc_calc_ref_coefs(LPCContext *s,
     return order;
 }
 
+/* Windows samples into s->windowed_samples, autocorrelates them and hands ref to compute_ref_coefs().
+ * Returns autoc[0] / (running-halved mean of the per-order errors), or NAN when that mean is 0. */
+double ff_lpc_calc_ref_coefs_f(LPCContext *s, const float *samples, int len,
+                               int order, double *ref)
+{
+    int i;
+    double signal = 0.0f, avg_err = 0.0f;
+    double autoc[MAX_LPC_ORDER+1] = {0}, error[MAX_LPC_ORDER+1] = {0};
+    const double a = 0.5f, b = 1.0f - a;
+    /* Apply windowing: one weight serves the mirrored pair i and len-1-i */
+    for (i = 0; i <= len / 2; i++) {
+        double weight = a - b*cos((2*M_PI*i)/(len - 1));
+        s->windowed_samples[i] = weight*samples[i];
+        s->windowed_samples[len-1-i] = weight*samples[len-1-i];
+    }
+    s->lpc_compute_autocorr(s->windowed_samples, len, order, autoc);
+    signal = autoc[0];
+    compute_ref_coefs(autoc, order, ref, error);
+    for (i = 0; i < order; i++)
+        avg_err = (avg_err + error[i])/2.0f;
+    return avg_err ? signal/avg_err : NAN;
+}
+
 /**
  * Calculate LPC coefficients for multiple orders
  *
@@ -179,8 +209,9 @@ int ff_lpc_calc_coefs(LPCContext *s,
     int i, j, pass = 0;
     int opt_order;
 
-    assert(max_order >= MIN_LPC_ORDER && max_order <= MAX_LPC_ORDER &&
+    av_assert2(max_order >= MIN_LPC_ORDER && max_order <= MAX_LPC_ORDER &&
            lpc_type > FF_LPC_TYPE_FIXED);
+    av_assert0(lpc_type == FF_LPC_TYPE_CHOLESKY || lpc_type == FF_LPC_TYPE_LEVINSON);
 
     /* reinit LPC context if parameters have changed */
     if (blocksize != s->blocksize || max_order != s->max_order ||
@@ -189,6 +220,9 @@ int ff_lpc_calc_coefs(LPCContext *s,
         ff_lpc_init(s, blocksize, max_order, lpc_type);
     }
 
+    if(lpc_passes <= 0)
+        lpc_passes = 2;
+
     if (lpc_type == FF_LPC_TYPE_LEVINSON || (lpc_type == FF_LPC_TYPE_CHOLESKY && lpc_passes > 1)) {
         s->lpc_apply_welch_window(samples, blocksize, s->windowed_samples);
 
@@ -203,7 +237,7 @@ int ff_lpc_calc_coefs(LPCContext *s,
     }
 
     if (lpc_type == FF_LPC_TYPE_CHOLESKY) {
-        LLSModel m[2];
+        LLSModel *m = s->lls_models;
         LOCAL_ALIGNED(32, double, var, [FFALIGN(MAX_LPC_ORDER+1,4)]);
         double av_uninit(weight);
         memset(var, 0, FFALIGN(MAX_LPC_ORDER+1,4)*sizeof(*var));
@@ -244,6 +278,7 @@ int ff_lpc_calc_coefs(LPCContext *s,
         for(i=max_order-1; i>0; i--)
             ref[i] = ref[i-1] - ref[i];
     }
+
     opt_order = max_order;
 
     if(omethod == ORDER_METHOD_EST) {
diff --git a/libavcodec/lpc.h b/libavcodec/lpc.h
index 642854c..edb1a6b 100644
--- a/libavcodec/lpc.h
+++ b/libavcodec/lpc.h
@@ -2,20 +2,20 @@
  * LPC utility code
  * Copyright (c) 2006  Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,6 +23,9 @@
 #define AVCODEC_LPC_H
 
 #include <stdint.h>
+#include "libavutil/avassert.h"
+#include "libavutil/lls.h"
+#include "aac_defines.h"
 
 #define ORDER_METHOD_EST     0
 #define ORDER_METHOD_2LEVEL  1
@@ -66,7 +69,7 @@ typedef struct LPCContext {
     /**
      * Perform autocorrelation on input samples with delay of 0 to lag.
      * @param data  input samples.
-     *              constraints: no alignment needed, but must have have at
+     *              constraints: no alignment needed, but must have at
      *              least lag*sizeof(double) valid bytes preceding it, and
      *              size must be at least (len+1)*sizeof(double) if data is
      *              16-byte aligned or (len+2)*sizeof(double) if data is
@@ -78,6 +81,9 @@ typedef struct LPCContext {
      */
     void (*lpc_compute_autocorr)(const double *data, int len, int lag,
                                  double *autoc);
+
+    // TODO: these should be allocated to reduce ABI compatibility issues
+    LLSModel lls_models[2];
 } LPCContext;
 
 
@@ -94,6 +100,9 @@ int ff_lpc_calc_coefs(LPCContext *s,
 int ff_lpc_calc_ref_coefs(LPCContext *s,
                           const int32_t *samples, int order, double *ref);
 
+double ff_lpc_calc_ref_coefs_f(LPCContext *s, const float *samples, int len,
+                               int order, double *ref);
+
 /**
  * Initialize LPCContext.
  */
@@ -106,11 +115,15 @@ void ff_lpc_init_x86(LPCContext *s);
  */
 void ff_lpc_end(LPCContext *s);
 
+#if USE_FIXED
+typedef int LPC_TYPE;
+#else
 #ifdef LPC_USE_DOUBLE
-#define LPC_TYPE double
+typedef double LPC_TYPE;
 #else
-#define LPC_TYPE float
+typedef float LPC_TYPE;
 #endif
+#endif // USE_FIXED
 
 /**
  * Schur recursion.
@@ -147,7 +160,7 @@ static inline void compute_ref_coefs(const LPC_TYPE *autoc, int max_order,
  * Levinson-Durbin recursion.
  * Produce LPC coefficients from autocorrelation data.
  */
-static inline int compute_lpc_coefs(const LPC_TYPE *autoc, int max_order,
+static inline int AAC_RENAME(compute_lpc_coefs)(const LPC_TYPE *autoc, int max_order,
                                     LPC_TYPE *lpc, int lpc_stride, int fail,
                                     int normalize)
 {
@@ -155,6 +168,8 @@ static inline int compute_lpc_coefs(const LPC_TYPE *autoc, int max_order,
     LPC_TYPE err = 0;
     LPC_TYPE *lpc_last = lpc;
 
+    av_assert2(normalize || !fail);
+
     if (normalize)
         err = *autoc++;
 
@@ -162,14 +177,14 @@ static inline int compute_lpc_coefs(const LPC_TYPE *autoc, int max_order,
         return -1;
 
     for(i=0; i<max_order; i++) {
-        LPC_TYPE r = -autoc[i];
+        LPC_TYPE r = AAC_SRA_R(-autoc[i], 5);
 
         if (normalize) {
             for(j=0; j<i; j++)
                 r -= lpc_last[j] * autoc[i-j-1];
 
             r /= err;
-            err *= 1.0 - (r * r);
+            err *= FIXR(1.0) - (r * r);
         }
 
         lpc[i] = r;
@@ -177,8 +192,8 @@ static inline int compute_lpc_coefs(const LPC_TYPE *autoc, int max_order,
         for(j=0; j < (i+1)>>1; j++) {
             LPC_TYPE f = lpc_last[    j];
             LPC_TYPE b = lpc_last[i-1-j];
-            lpc[    j] = f + r * b;
-            lpc[i-1-j] = b + r * f;
+            lpc[    j] = f + AAC_MUL26(r, b);
+            lpc[i-1-j] = b + AAC_MUL26(r, f);
         }
 
         if (fail && err < 0)
diff --git a/libavcodec/lsp.c b/libavcodec/lsp.c
index 982c87e..9aba020 100644
--- a/libavcodec/lsp.c
+++ b/libavcodec/lsp.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2007 Reynaldo H. Verdejo Pinochet (QCELP decoder)
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,8 @@
 #define FRAC_BITS 14
 #include "mathops.h"
 #include "lsp.h"
+#include "libavcodec/mips/lsp_mips.h"
+#include "libavutil/avassert.h"
 
 void ff_acelp_reorder_lsf(int16_t* lsfq, int lsfq_min_distance, int lsfq_min, int lsfq_max, int lp_order)
 {
@@ -73,7 +75,7 @@ static int16_t ff_cos(uint16_t arg)
     uint8_t offset= arg;
     uint8_t ind = arg >> 8;
 
-    assert(arg <= 0x3fff);
+    av_assert2(arg <= 0x3fff);
 
     return tab_cos[ind] + (offset * (tab_cos[ind+1] - tab_cos[ind]) >> 8);
 }
@@ -173,7 +175,11 @@ void ff_acelp_lp_decode(int16_t* lp_1st, int16_t* lp_2nd, const int16_t* lsp_2nd
 
     /* LSP values for first subframe (3.2.5 of G.729, Equation 24)*/
     for(i=0; i<lp_order; i++)
+#ifdef G729_BITEXACT
+        lsp_1st[i] = (lsp_2nd[i] >> 1) + (lsp_prev[i] >> 1);
+#else
         lsp_1st[i] = (lsp_2nd[i] + lsp_prev[i]) >> 1;
+#endif
 
     ff_acelp_lsp2lpc(lp_1st, lsp_1st, lp_order >> 1);
 
@@ -181,6 +187,7 @@ void ff_acelp_lp_decode(int16_t* lp_1st, int16_t* lp_2nd, const int16_t* lsp_2nd
     ff_acelp_lsp2lpc(lp_2nd, lsp_2nd, lp_order >> 1);
 }
 
+#ifndef ff_lsp2polyf
 void ff_lsp2polyf(const double *lsp, double *f, int lp_half_order)
 {
     int i, j;
@@ -197,13 +204,14 @@ void ff_lsp2polyf(const double *lsp, double *f, int lp_half_order)
         f[1] += val;
     }
 }
+#endif /* ff_lsp2polyf */
 
 void ff_acelp_lspd2lpc(const double *lsp, float *lpc, int lp_half_order)
 {
     double pa[MAX_LP_HALF_ORDER+1], qa[MAX_LP_HALF_ORDER+1];
     float *lpc2 = lpc + (lp_half_order << 1) - 1;
 
-    assert(lp_half_order <= MAX_LP_HALF_ORDER);
+    av_assert2(lp_half_order <= MAX_LP_HALF_ORDER);
 
     ff_lsp2polyf(lsp,     pa, lp_half_order);
     ff_lsp2polyf(lsp + 1, qa, lp_half_order);
diff --git a/libavcodec/lsp.h b/libavcodec/lsp.h
index 1f9481c..621ebea 100644
--- a/libavcodec/lsp.h
+++ b/libavcodec/lsp.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2008 Vladimir Voroshilov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/lzf.c b/libavcodec/lzf.c
index 35b932b..409a7ff 100644
--- a/libavcodec/lzf.c
+++ b/libavcodec/lzf.c
@@ -2,20 +2,20 @@
  * lzf decompression algorithm
  * Copyright (c) 2015 Luca Barbato
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/lzf.h b/libavcodec/lzf.h
index 4951f25..0ad73d9 100644
--- a/libavcodec/lzf.h
+++ b/libavcodec/lzf.h
@@ -2,20 +2,20 @@
  * lzf decompression algorithm
  * Copyright (c) 2015 Luca Barbato
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/lzw.c b/libavcodec/lzw.c
index fae5687..b0b9a34 100644
--- a/libavcodec/lzw.c
+++ b/libavcodec/lzw.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -93,13 +93,7 @@ static int lzw_get_code(struct LZWState * s)
     return c & s->curmask;
 }
 
-int ff_lzw_size_read(LZWState *p)
-{
-    struct LZWState *s = p;
-    return bytestream2_tell(&s->gb);
-}
-
-void ff_lzw_decode_tail(LZWState *p)
+int ff_lzw_decode_tail(LZWState *p)
 {
     struct LZWState *s = (struct LZWState *)p;
 
@@ -110,6 +104,7 @@ void ff_lzw_decode_tail(LZWState *p)
         }
     }else
         bytestream2_skip(&s->gb, bytestream2_get_bytes_left(&s->gb));
+    return bytestream2_tell(&s->gb);
 }
 
 av_cold void ff_lzw_decode_open(LZWState **p)
diff --git a/libavcodec/lzw.h b/libavcodec/lzw.h
index d925d35..6af8a6b 100644
--- a/libavcodec/lzw.h
+++ b/libavcodec/lzw.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,8 +47,7 @@ void ff_lzw_decode_open(LZWState **p);
 void ff_lzw_decode_close(LZWState **p);
 int ff_lzw_decode_init(LZWState *s, int csize, const uint8_t *buf, int buf_size, int mode);
 int ff_lzw_decode(LZWState *s, uint8_t *buf, int len);
-int ff_lzw_size_read(LZWState *lzw);
-void ff_lzw_decode_tail(LZWState *lzw);
+int ff_lzw_decode_tail(LZWState *lzw);
 
 /** LZW encode state */
 struct LZWEncodeState;
diff --git a/libavcodec/lzwenc.c b/libavcodec/lzwenc.c
index 7c37bf2..03080ee 100644
--- a/libavcodec/lzwenc.c
+++ b/libavcodec/lzwenc.c
@@ -2,20 +2,20 @@
  * LZW encoder
  * Copyright (c) 2007 Bartlomiej Wolowiec
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -77,7 +77,7 @@ static inline int hash(int head, const int add)
     head ^= (add << LZW_HASH_SHIFT);
     if (head >= LZW_HASH_SIZE)
         head -= LZW_HASH_SIZE;
-    assert(head >= 0 && head < LZW_HASH_SIZE);
+    av_assert2(head >= 0 && head < LZW_HASH_SIZE);
     return head;
 }
 
@@ -112,7 +112,7 @@ static inline int hashOffset(const int head)
  */
 static inline void writeCode(LZWEncodeState * s, int c)
 {
-    assert(0 <= c && c < 1 << s->bits);
+    av_assert2(0 <= c && c < 1 << s->bits);
     s->put_bits(&s->pb, s->bits, c);
 }
 
@@ -208,7 +208,7 @@ void ff_lzw_encode_init(LZWEncodeState *s, uint8_t *outbuf, int outsize,
     s->maxbits = maxbits;
     init_put_bits(&s->pb, outbuf, outsize);
     s->bufsize = outsize;
-    assert(s->maxbits >= 9 && s->maxbits <= LZW_MAXBITS);
+    av_assert0(s->maxbits >= 9 && s->maxbits <= LZW_MAXBITS);
     s->maxcode = 1 << s->maxbits;
     s->output_bytes = 0;
     s->last_code = LZW_PREFIX_EMPTY;
@@ -263,6 +263,9 @@ int ff_lzw_encode_flush(LZWEncodeState *s,
     if (s->last_code != -1)
         writeCode(s, s->last_code);
     writeCode(s, s->end_code);
+    if (s->mode == FF_LZW_GIF)
+        s->put_bits(&s->pb, 1, 0);
+
     lzw_flush_put_bits(&s->pb);
     s->last_code = -1;
 
diff --git a/libavcodec/m101.c b/libavcodec/m101.c
new file mode 100644
index 0000000..939d337
--- /dev/null
+++ b/libavcodec/m101.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2016 Michael Niedermayer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intreadwrite.h"
+
+#include "avcodec.h"
+#include "internal.h"
+
+
+static av_cold int m101_decode_init(AVCodecContext *avctx)
+{
+    /* Needs at least 24 bytes of extradata; byte 8 selects the output pixel format. */
+    if (avctx->extradata_size < 6*4) {
+        avpriv_request_sample(avctx, "Missing or too small extradata (size %d)\n", avctx->extradata_size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    switch (avctx->extradata[2*4]) {
+    case 10: avctx->pix_fmt = AV_PIX_FMT_YUV422P10; break;
+    case  8: avctx->pix_fmt = AV_PIX_FMT_YUYV422;   break;
+    default: /* any other depth is reported and rejected */
+        avpriv_request_sample(avctx, "BPS %d\n", avctx->extradata[2*4]);
+        return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
+/* Decodes one M101 packet into data (an AVFrame): 8-bit lines are copied, 10-bit lines are
+ * unpacked 16 pixels at a time. Sets *got_frame and returns avpkt->size, or an error code. */
+static int m101_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
+                      AVPacket *avpkt)
+{
+    const uint8_t *buf = avpkt->data;
+    int stride, ret, x, y;
+    int min_stride = 2 * avctx->width; /* 8-bit lines: the memcpy below takes 2*width bytes */
+    int bits = avctx->extradata[2*4];
+    AVFrame *frame = data;
+
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+    frame->pict_type = AV_PICTURE_TYPE_I;
+    frame->key_frame = 1;
+
+    /* A 10-bit line is read in 40-byte groups of 16 pixels (offsets 0..31 hold the high
+     * bits, 32..39 the packed low bits), so a shorter stride would overread the packet. */
+    stride = AV_RL32(avctx->extradata + 5*4);
+    if (avctx->pix_fmt == AV_PIX_FMT_YUV422P10)
+        min_stride = (avctx->width + 15) / 16 * 40;
+    if (stride < min_stride || avpkt->size < stride * (uint64_t)avctx->height) {
+        av_log(avctx, AV_LOG_ERROR, "stride (%d) is invalid for packet sized %d\n",
+               stride, avpkt->size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    frame->interlaced_frame = ((avctx->extradata[3*4] & 3) != 3);
+    if (frame->interlaced_frame)
+        frame->top_field_first = avctx->extradata[3*4] & 1;
+    /* When interlaced, one field's lines come from the first half of the source rows, the other's from the second. */
+    for (y = 0; y < avctx->height; y++) {
+        int src_y = y;
+        if (frame->interlaced_frame)
+            src_y = ((y&1)^frame->top_field_first) ? y/2 : (y/2 + avctx->height/2);
+        if (bits == 8) {
+            uint8_t *line = frame->data[0] + y*frame->linesize[0];
+            memcpy(line, buf + src_y*stride, 2*avctx->width);
+        } else {
+            int block;
+            uint16_t *luma = (uint16_t*)&frame->data[0][y*frame->linesize[0]];
+            uint16_t *cb   = (uint16_t*)&frame->data[1][y*frame->linesize[1]];
+            uint16_t *cr   = (uint16_t*)&frame->data[2][y*frame->linesize[2]];
+            for (block = 0; 16*block < avctx->width; block ++) {
+                const uint8_t *buf_src = buf + src_y*stride + 40*block;
+                for (x = 0; x < 16 && x + 16*block < avctx->width; x++) {
+                    int xd = x + 16*block;
+                    if (x&1) {
+                        luma [xd] = (4*buf_src[2*x + 0]) + ((buf_src[32 + (x>>1)]>>4)&3);
+                    } else {
+                        luma [xd] = (4*buf_src[2*x + 0]) +  (buf_src[32 + (x>>1)]    &3);
+                        cb[xd>>1] = (4*buf_src[2*x + 1]) + ((buf_src[32 + (x>>1)]>>2)&3);
+                        cr[xd>>1] = (4*buf_src[2*x + 3]) +  (buf_src[32 + (x>>1)]>>6);
+                    }
+                }
+            }
+        }
+    }
+    *got_frame = 1;
+    return avpkt->size;
+}
+
+AVCodec ff_m101_decoder = { /* decoder entry binding the "m101" name to m101_decode_init / m101_decode_frame */
+    .name           = "m101",
+    .long_name      = NULL_IF_CONFIG_SMALL("Matrox Uncompressed SD"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_M101,
+    .init           = m101_decode_init,
+    .decode         = m101_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/mace.c b/libavcodec/mace.c
index c6eddc0..e332a72 100644
--- a/libavcodec/mace.c
+++ b/libavcodec/mace.c
@@ -2,20 +2,20 @@
  * MACE decoder
  * Copyright (c) 2002 Laszlo Torok <torokl@alpha.dfmk.hu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -244,12 +244,17 @@ static int mace_decode_frame(AVCodecContext *avctx, void *data,
     int i, j, k, l, ret;
     int is_mace3 = (avctx->codec_id == AV_CODEC_ID_MACE3);
 
+    if (buf_size % (avctx->channels << is_mace3)) {
+        av_log(avctx, AV_LOG_ERROR, "buffer size %d is odd\n", buf_size);
+        buf_size -= buf_size % (avctx->channels << is_mace3);
+        if (!buf_size)
+            return AVERROR_INVALIDDATA;
+    }
+
     /* get output buffer */
     frame->nb_samples = 3 * (buf_size << (1 - is_mace3)) / avctx->channels;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples = (int16_t **)frame->extended_data;
 
     for(i = 0; i < avctx->channels; i++) {
diff --git a/libavcodec/magicyuv.c b/libavcodec/magicyuv.c
new file mode 100644
index 0000000..3bb4c5a
--- /dev/null
+++ b/libavcodec/magicyuv.c
@@ -0,0 +1,479 @@
+/*
+ * MagicYUV decoder
+ * Copyright (c) 2016 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "libavutil/qsort.h"
+#include "avcodec.h"
+#include "bytestream.h"
+#include "get_bits.h"
+#include "huffyuvdsp.h"
+#include "internal.h"
+#include "thread.h"
+
+typedef struct Slice {
+    uint32_t start;     // byte offset of the slice data from the start of the packet
+    uint32_t size;      // size of the slice data in bytes
+} Slice;
+
+typedef enum Prediction {
+    LEFT = 1,       // left-neighbour prediction, see decode_slice()
+    GRADIENT,       // left + top - topleft prediction
+    MEDIAN,         // median prediction through add_hfyu_median_pred()
+} Prediction;
+
+typedef struct MagicYUVContext {
+    AVFrame            *p;              // frame being decoded, set by decode_frame()
+    int                 slice_height;   // slice height in unsubsampled rows, from the frame header
+    int                 nb_slices;      // number of slices per plane
+    int                 planes;         // number of encoded planes in bitstream
+    int                 decorrelate;    // postprocessing work
+    int                 interlaced;     // video is interlaced
+    uint8_t             *buf;           // pointer to AVPacket->data
+    int                 hshift[4];      // per-plane horizontal subsampling shift
+    int                 vshift[4];      // per-plane vertical subsampling shift
+    Slice               *slices[4];     // slice positions and size in bitstream for each plane
+    int                 slices_size[4]; // size bookkeeping for av_fast_malloc() of slices[]
+    uint8_t             len[4][256];    // table of code lengths for each plane
+    VLC                 vlc[4];         // VLC for each plane
+    HuffYUVDSPContext   hdsp;           // DSP routines used to undo prediction
+} MagicYUVContext;
+
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    /* the only state to prepare is the HuffYUV DSP function table */
+    ff_huffyuvdsp_init(&((MagicYUVContext *)avctx->priv_data)->hdsp);
+    return 0;
+}
+
+typedef struct HuffEntry {
+    uint8_t  sym;   // symbol value
+    uint8_t  len;   // code length in bits
+    uint32_t code;  // NOTE(review): neither written nor read by build_huff() in this file
+} HuffEntry;
+
+static int ff_magy_huff_cmp_len(const void *a, const void *b)
+{
+    const HuffEntry *lhs = a, *rhs = b; /* order by length, then by symbol */
+    return 256 * (lhs->len - rhs->len) + (lhs->sym - rhs->sym);
+}
+
+/* Build the VLC of one plane from its 256 code lengths; nonzero on failure. */
+static int build_huff(VLC *vlc, uint8_t *len)
+{
+    HuffEntry he[256];
+    uint32_t codes[256], code = 1;
+    uint8_t bits[256], syms[256];
+    int i;
+
+    for (i = 0; i < 256; i++) {
+        if (!len[i]) /* zero length => shifts by 32 and by -1 below, both undefined */
+            return AVERROR_INVALIDDATA;
+        he[i].sym = 255 - i;
+        he[i].len = len[i];
+    }
+    AV_QSORT(he, 256, HuffEntry, ff_magy_huff_cmp_len);
+
+    for (i = 255; i >= 0; i--) { /* longest code first */
+        codes[i] = code >> (32 - he[i].len);
+        bits[i]  = he[i].len;
+        syms[i]  = he[i].sym;
+        code += 0x80000000u >> (he[i].len - 1);
+    }
+
+    ff_free_vlc(vlc);
+    return ff_init_vlc_sparse(vlc, FFMIN(he[255].len, 12), 256,
+                              bits,  sizeof(*bits),  sizeof(*bits),
+                              codes, sizeof(*codes), sizeof(*codes),
+                              syms,  sizeof(*syms),  sizeof(*syms), 0);
+}
+
+/* Decode slice j of every plane into s->p and undo prediction/decorrelation. */
+static int decode_slice(AVCodecContext *avctx, void *tdata,
+                        int j, int threadnr)
+{
+    MagicYUVContext *s = avctx->priv_data;
+    int interlaced = s->interlaced;
+    AVFrame *p = s->p;
+    int i, k, x, ret;
+    GetBitContext b;
+    uint8_t *dst;
+
+    for (i = 0; i < s->planes; i++) {
+        int height = AV_CEIL_RSHIFT(FFMIN(s->slice_height, avctx->coded_height - j * s->slice_height), s->vshift[i]);
+        int width = AV_CEIL_RSHIFT(avctx->coded_width, s->hshift[i]);
+        int sheight = AV_CEIL_RSHIFT(s->slice_height, s->vshift[i]);
+        int fake_stride = p->linesize[i] * (1 + interlaced);
+        int stride = p->linesize[i];
+        int flags, pred;
+
+        if ((ret = init_get_bits8(&b, s->buf + s->slices[i][j].start, s->slices[i][j].size)) < 0)
+            return ret;
+
+        flags = get_bits(&b, 8); /* bit 0: samples are stored raw, 8 bits each */
+        pred  = get_bits(&b, 8); /* one of the Prediction values */
+
+        dst = p->data[i] + j * sheight * stride;
+        if (flags & 1) {
+            /* raw slice: refuse data too short to hold every sample */
+            if (get_bits_left(&b) < 8LL * width * height)
+                return AVERROR_INVALIDDATA;
+            for (k = 0; k < height; k++) {
+                for (x = 0; x < width; x++)
+                    dst[x] = get_bits(&b, 8);
+                dst += stride;
+            }
+        } else {
+            for (k = 0; k < height; k++) {
+                for (x = 0; x < width; x++) {
+                    int pix;
+                    if (get_bits_left(&b) <= 0)
+                        return AVERROR_INVALIDDATA;
+                    pix = get_vlc2(&b, s->vlc[i].table, s->vlc[i].bits, 3);
+                    if (pix < 0)
+                        return AVERROR_INVALIDDATA;
+                    dst[x] = 255 - pix;
+                }
+                dst += stride;
+            }
+        }
+
+        if (pred == LEFT) {
+            dst = p->data[i] + j * sheight * stride;
+            s->hdsp.add_hfyu_left_pred(dst, dst, width, 0);
+            dst += stride;
+            if (interlaced) {
+                s->hdsp.add_hfyu_left_pred(dst, dst, width, 0);
+                dst += stride;
+            }
+            for (k = 1 + interlaced; k < height; k++) {
+                s->hdsp.add_hfyu_left_pred(dst, dst, width, dst[-fake_stride]);
+                dst += stride;
+            }
+        } else if (pred == GRADIENT) {
+            int left, lefttop, top;
+
+            dst = p->data[i] + j * sheight * stride;
+            s->hdsp.add_hfyu_left_pred(dst, dst, width, 0);
+            left = lefttop = 0;
+            dst += stride;
+            if (interlaced) {
+                s->hdsp.add_hfyu_left_pred(dst, dst, width, 0);
+                dst += stride;
+            }
+            for (k = 1 + interlaced; k < height; k++) {
+                top = dst[-fake_stride];
+                left = top + dst[0];
+                dst[0] = left;
+                for (x = 1; x < width; x++) {
+                    top = dst[x - fake_stride];
+                    lefttop = dst[x - (fake_stride + 1)];
+                    left += top - lefttop + dst[x];
+                    dst[x] = left;
+                }
+                dst += stride;
+            }
+        } else if (pred == MEDIAN) {
+            int left, lefttop;
+
+            dst = p->data[i] + j * sheight * stride;
+            lefttop = left = dst[0];
+            s->hdsp.add_hfyu_left_pred(dst, dst, width, 0);
+            dst += stride;
+            if (interlaced) {
+                lefttop = left = dst[0];
+                s->hdsp.add_hfyu_left_pred(dst, dst, width, 0);
+                dst += stride;
+            }
+            for (k = 1 + interlaced; k < height; k++) {
+                s->hdsp.add_hfyu_median_pred(dst, dst - fake_stride, dst, width, &left, &lefttop);
+                lefttop = left = dst[0];
+                dst += stride;
+            }
+        } else {
+            avpriv_request_sample(avctx, "unknown prediction: %d", pred);
+        }
+    }
+
+    if (s->decorrelate) { /* planes 0 and 2 were coded relative to plane 1 */
+        int height = FFMIN(s->slice_height, avctx->coded_height - j * s->slice_height);
+        int width = avctx->coded_width;
+        uint8_t *b = p->data[0] + j * s->slice_height * p->linesize[0];
+        uint8_t *g = p->data[1] + j * s->slice_height * p->linesize[1];
+        uint8_t *r = p->data[2] + j * s->slice_height * p->linesize[2];
+
+        for (i = 0; i < height; i++) {
+            s->hdsp.add_bytes(b, g, width);
+            s->hdsp.add_bytes(r, g, width);
+            b += p->linesize[0];
+            g += p->linesize[1];
+            r += p->linesize[2];
+        }
+    }
+
+    return 0;
+}
+
+/* Parse one MAGY packet (header, slice offsets, Huffman tables) and decode it. */
+static int decode_frame(AVCodecContext *avctx,
+                        void *data, int *got_frame,
+                        AVPacket *avpkt)
+{
+    uint32_t first_offset, offset, next_offset, header_size, slice_width;
+    int ret, format, version, table_size;
+    MagicYUVContext *s = avctx->priv_data;
+    ThreadFrame frame = { .f = data };
+    AVFrame *p = data;
+    GetByteContext gb;
+    GetBitContext b;
+    int i, j, k, width, height;
+
+    bytestream2_init(&gb, avpkt->data, avpkt->size);
+    if (bytestream2_get_le32(&gb) != MKTAG('M','A','G','Y'))
+        return AVERROR_INVALIDDATA;
+
+    header_size = bytestream2_get_le32(&gb);
+    if (header_size < 32 || header_size >= avpkt->size)
+        return AVERROR_INVALIDDATA;
+
+    version = bytestream2_get_byte(&gb);
+    if (version != 7) {
+        avpriv_request_sample(avctx, "unsupported version: %d", version);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    s->hshift[1] = s->vshift[1] = 0;
+    s->hshift[2] = s->vshift[2] = 0;
+    s->decorrelate = 0;
+
+    format = bytestream2_get_byte(&gb);
+    switch (format) {
+    case 0x65:
+        avctx->pix_fmt = AV_PIX_FMT_GBRP;
+        s->decorrelate = 1;
+        s->planes = 3;
+        break;
+    case 0x66:
+        avctx->pix_fmt = AV_PIX_FMT_GBRAP;
+        s->decorrelate = 1;
+        s->planes = 4;
+        break;
+    case 0x67:
+        avctx->pix_fmt = AV_PIX_FMT_YUV444P;
+        s->planes = 3;
+        break;
+    case 0x68:
+        avctx->pix_fmt = AV_PIX_FMT_YUV422P;
+        s->planes = 3;
+        s->hshift[1] = s->hshift[2] = 1;
+        break;
+    case 0x69:
+        avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+        s->planes = 3;
+        s->hshift[1] = s->vshift[1] = 1;
+        s->hshift[2] = s->vshift[2] = 1;
+        break;
+    case 0x6a:
+        avctx->pix_fmt = AV_PIX_FMT_YUVA444P;
+        s->planes = 4;
+        break;
+    case 0x6b:
+        avctx->pix_fmt = AV_PIX_FMT_GRAY8;
+        s->planes = 1;
+        break;
+    default:
+        avpriv_request_sample(avctx, "unsupported format: 0x%X", format);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    bytestream2_skip(&gb, 2);
+    s->interlaced = !!(bytestream2_get_byte(&gb) & 2);
+    bytestream2_skip(&gb, 3);
+
+    width  = bytestream2_get_le32(&gb);
+    height = bytestream2_get_le32(&gb);
+    if ((ret = ff_set_dimensions(avctx, width, height)) < 0)
+        return ret;
+
+    slice_width = bytestream2_get_le32(&gb);
+    if (slice_width != avctx->coded_width) {
+        avpriv_request_sample(avctx, "unsupported slice width: %d", slice_width);
+        return AVERROR_PATCHWELCOME;
+    }
+    s->slice_height = bytestream2_get_le32(&gb);
+    if ((s->slice_height <= 0) || (s->slice_height > INT_MAX - avctx->coded_height)) {
+        av_log(avctx, AV_LOG_ERROR, "invalid slice height: %d\n", s->slice_height);
+        return AVERROR_INVALIDDATA;
+    }
+
+    bytestream2_skip(&gb, 4);
+
+    s->nb_slices = (avctx->coded_height + s->slice_height - 1) / s->slice_height;
+    if (s->nb_slices > INT_MAX / sizeof(Slice)) {
+        av_log(avctx, AV_LOG_ERROR, "invalid number of slices: %d\n", s->nb_slices);
+        return AVERROR_INVALIDDATA;
+    }
+
+    for (i = 0; i < s->planes; i++) {
+        av_fast_malloc(&s->slices[i], &s->slices_size[i], s->nb_slices * sizeof(Slice));
+        if (!s->slices[i])
+            return AVERROR(ENOMEM);
+
+        offset = bytestream2_get_le32(&gb);
+        if (offset >= avpkt->size - header_size)
+            return AVERROR_INVALIDDATA;
+
+        if (i == 0)
+            first_offset = offset;
+
+        for (j = 0; j < s->nb_slices - 1; j++) {
+            s->slices[i][j].start = offset + header_size;
+            next_offset = bytestream2_get_le32(&gb);
+            /* offsets must strictly increase, else the size below wraps around */
+            if (next_offset <= offset || next_offset >= avpkt->size - header_size)
+                return AVERROR_INVALIDDATA;
+            s->slices[i][j].size  = next_offset - offset;
+            offset = next_offset;
+        }
+
+        s->slices[i][j].start = offset + header_size;
+        s->slices[i][j].size  = avpkt->size - s->slices[i][j].start;
+    }
+
+    if (bytestream2_get_byte(&gb) != s->planes)
+        return AVERROR_INVALIDDATA;
+
+    bytestream2_skip(&gb, s->nb_slices * s->planes);
+
+    table_size = header_size + first_offset - bytestream2_tell(&gb);
+    if (table_size < 2)
+        return AVERROR_INVALIDDATA;
+
+    if ((ret = init_get_bits8(&b, avpkt->data + bytestream2_tell(&gb), table_size)) < 0)
+        return ret;
+
+    memset(s->len, 0, sizeof(s->len));
+    j = i = 0;
+    while (get_bits_left(&b) >= 8) {
+        int l = get_bits(&b, 4);
+        int x = get_bits(&b, 4);
+        int L = get_bitsz(&b, l) + 1;
+
+        for (k = 0; k < L; k++) {
+            if (j + k < 256)
+                s->len[i][j + k] = x;
+        }
+
+        j += L;
+        if (j == 256) {
+            j = 0;
+            if (build_huff(&s->vlc[i], s->len[i])) {
+                av_log(avctx, AV_LOG_ERROR, "Cannot build Huffman codes\n");
+                return AVERROR_INVALIDDATA;
+            }
+            i++;
+            if (i == s->planes)
+                break;
+        } else if (j > 256) {
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    if (i != s->planes) {
+        av_log(avctx, AV_LOG_ERROR, "Huffman tables too short\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    p->pict_type = AV_PICTURE_TYPE_I;
+    p->key_frame = 1;
+
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
+        return ret;
+
+    s->buf = avpkt->data;
+    s->p = p;
+    avctx->execute2(avctx, decode_slice, NULL, NULL, s->nb_slices);
+
+    if (avctx->pix_fmt == AV_PIX_FMT_GBRP ||
+        avctx->pix_fmt == AV_PIX_FMT_GBRAP) {
+        FFSWAP(uint8_t*, p->data[0], p->data[1]);
+        FFSWAP(int, p->linesize[0], p->linesize[1]);
+    }
+
+    *got_frame = 1;
+
+    if (ret < 0)
+        return ret;
+    return avpkt->size;
+}
+
+#if HAVE_THREADS
+static int decode_init_thread_copy(AVCodecContext *avctx)
+{
+    MagicYUVContext *s = avctx->priv_data;
+    int i;
+
+    /* Begin with no slice table and no recorded size for any plane;
+     * decode_frame() allocates the tables on demand through
+     * av_fast_malloc(). */
+    for (i = 0; i < 4; i++) {
+        s->slices[i]      = NULL;
+        s->slices_size[i] = 0;
+    }
+
+    return 0;
+}
+#endif
+
+/* Codec close callback (see ff_magicyuv_decoder): releases what decode_frame()
+ * allocated in the private context. */
+static av_cold int decode_end(AVCodecContext *avctx)
+{
+    MagicYUVContext * const s = avctx->priv_data;
+    int i;
+
+    /* drop the per-plane slice tables and their recorded sizes */
+    for (i = 0; i < 4; i++) {
+        av_freep(&s->slices[i]);
+        s->slices_size[i] = 0;
+    }
+
+    /* release the per-plane VLC tables */
+    for (i = 0; i < 4; i++)
+        ff_free_vlc(&s->vlc[i]);
+
+    return 0;
+}
+
+AVCodec ff_magicyuv_decoder = {
+    .name             = "magicyuv",
+    .long_name        = NULL_IF_CONFIG_SMALL("MagicYUV Lossless Video"),
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_MAGICYUV,
+    .priv_data_size   = sizeof(MagicYUVContext),
+    .init             = decode_init,             // sets up the HuffYUV DSP helpers
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(decode_init_thread_copy),
+    .close            = decode_end,              // frees slice tables and VLCs
+    .decode           = decode_frame,            // marks every decoded frame as a keyframe
+    .capabilities     = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_SLICE_THREADS,
+};
diff --git a/libavcodec/mathops-test.c b/libavcodec/mathops-test.c
new file mode 100644
index 0000000..d47f144
--- /dev/null
+++ b/libavcodec/mathops-test.c
@@ -0,0 +1,41 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "mathops.h"
+
+#include <stdlib.h>
+
+int main(void)
+{
+    unsigned u;
+    /* check ff_sqrt() on every perfect square u*u and on the value just below it */
+    for(u=0; u<65536; u++) {
+        unsigned s = u*u;
+        unsigned root = ff_sqrt(s);
+        unsigned root_m1 = ff_sqrt(s-1);
+        if (s && root != u) {
+            fprintf(stderr, "ff_sqrt failed at %u with %u\n", s, root);
+            return 1;
+        }
+        if (u && root_m1 != u - 1) {
+            fprintf(stderr, "ff_sqrt failed at %u with %u\n", s - 1, root_m1);
+            return 1;
+        }
+    }
+    return 0;
+}
diff --git a/libavcodec/mathops.h b/libavcodec/mathops.h
index bd85dd7..5168dc2 100644
--- a/libavcodec/mathops.h
+++ b/libavcodec/mathops.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2001, 2002 Fabrice Bellard
  * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #ifndef AVCODEC_MATHOPS_H
@@ -30,18 +30,15 @@
 #define MAX_NEG_CROP 1024
 
 extern const uint32_t ff_inverse[257];
-extern const uint8_t  ff_reverse[256];
 extern const uint8_t ff_sqrt_tab[256];
 extern const uint8_t ff_crop_tab[256 + 2 * MAX_NEG_CROP];
 extern const uint8_t ff_zigzag_direct[64];
-extern const uint8_t ff_zigzag_scan[16];
+extern const uint8_t ff_zigzag_scan[16+1];
 
 #if   ARCH_ARM
 #   include "arm/mathops.h"
 #elif ARCH_AVR32
 #   include "avr32/mathops.h"
-#elif ARCH_BFIN
-#   include "bfin/mathops.h"
 #elif ARCH_MIPS
 #   include "mips/mathops.h"
 #elif ARCH_PPC
@@ -124,6 +121,20 @@ static inline av_const int mid_pred(int a, int b, int c)
 }
 #endif
 
+#ifndef median4
+#define median4 median4
+static inline av_const int median4(int a, int b, int c, int d)
+{
+    /* Order each pair first; the two middle values of the four are then
+     * the larger of the pair minima and the smaller of the pair maxima. */
+    const int lo_ab = FFMIN(a, b), hi_ab = FFMAX(a, b);
+    const int lo_cd = FFMIN(c, d), hi_cd = FFMAX(c, d);
+
+    /* integer average of the two middle values */
+    return (FFMIN(hi_ab, hi_cd) + FFMAX(lo_ab, lo_cd)) / 2;
+}
+#endif
+
 #ifndef sign_extend
 static inline av_const int sign_extend(int val, unsigned bits)
 {
@@ -200,6 +211,8 @@ if ((y) < (x)) {\
 #   define FASTDIV(a,b) ((uint32_t)((((uint64_t)a) * ff_inverse[b]) >> 32))
 #endif /* FASTDIV */
 
+#ifndef ff_sqrt
+#define ff_sqrt ff_sqrt
 static inline av_const unsigned int ff_sqrt(unsigned int a)
 {
     unsigned int b;
@@ -219,6 +232,12 @@ static inline av_const unsigned int ff_sqrt(unsigned int a)
 
     return b - (a < b * b);
 }
+#endif
+
+static inline av_const float ff_sqrf(float a)
+{
+    return a*a; /* the square of a, not its square root */
+}
 
 static inline int8_t ff_u8_to_s8(uint8_t a)
 {
diff --git a/libavcodec/mathtables.c b/libavcodec/mathtables.c
index d198225..81eabc7 100644
--- a/libavcodec/mathtables.c
+++ b/libavcodec/mathtables.c
@@ -1,18 +1,20 @@
 /*
- * This file is part of Libav.
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -69,25 +71,6 @@ const uint8_t ff_sqrt_tab[256]={
 240,240,241,242,242,243,243,244,244,245,245,246,246,247,247,248,248,249,249,250,250,251,251,252,252,253,253,254,254,255,255,255
 };
 
-const uint8_t ff_reverse[256] = {
-0x00,0x80,0x40,0xC0,0x20,0xA0,0x60,0xE0,0x10,0x90,0x50,0xD0,0x30,0xB0,0x70,0xF0,
-0x08,0x88,0x48,0xC8,0x28,0xA8,0x68,0xE8,0x18,0x98,0x58,0xD8,0x38,0xB8,0x78,0xF8,
-0x04,0x84,0x44,0xC4,0x24,0xA4,0x64,0xE4,0x14,0x94,0x54,0xD4,0x34,0xB4,0x74,0xF4,
-0x0C,0x8C,0x4C,0xCC,0x2C,0xAC,0x6C,0xEC,0x1C,0x9C,0x5C,0xDC,0x3C,0xBC,0x7C,0xFC,
-0x02,0x82,0x42,0xC2,0x22,0xA2,0x62,0xE2,0x12,0x92,0x52,0xD2,0x32,0xB2,0x72,0xF2,
-0x0A,0x8A,0x4A,0xCA,0x2A,0xAA,0x6A,0xEA,0x1A,0x9A,0x5A,0xDA,0x3A,0xBA,0x7A,0xFA,
-0x06,0x86,0x46,0xC6,0x26,0xA6,0x66,0xE6,0x16,0x96,0x56,0xD6,0x36,0xB6,0x76,0xF6,
-0x0E,0x8E,0x4E,0xCE,0x2E,0xAE,0x6E,0xEE,0x1E,0x9E,0x5E,0xDE,0x3E,0xBE,0x7E,0xFE,
-0x01,0x81,0x41,0xC1,0x21,0xA1,0x61,0xE1,0x11,0x91,0x51,0xD1,0x31,0xB1,0x71,0xF1,
-0x09,0x89,0x49,0xC9,0x29,0xA9,0x69,0xE9,0x19,0x99,0x59,0xD9,0x39,0xB9,0x79,0xF9,
-0x05,0x85,0x45,0xC5,0x25,0xA5,0x65,0xE5,0x15,0x95,0x55,0xD5,0x35,0xB5,0x75,0xF5,
-0x0D,0x8D,0x4D,0xCD,0x2D,0xAD,0x6D,0xED,0x1D,0x9D,0x5D,0xDD,0x3D,0xBD,0x7D,0xFD,
-0x03,0x83,0x43,0xC3,0x23,0xA3,0x63,0xE3,0x13,0x93,0x53,0xD3,0x33,0xB3,0x73,0xF3,
-0x0B,0x8B,0x4B,0xCB,0x2B,0xAB,0x6B,0xEB,0x1B,0x9B,0x5B,0xDB,0x3B,0xBB,0x7B,0xFB,
-0x07,0x87,0x47,0xC7,0x27,0xA7,0x67,0xE7,0x17,0x97,0x57,0xD7,0x37,0xB7,0x77,0xF7,
-0x0F,0x8F,0x4F,0xCF,0x2F,0xAF,0x6F,0xEF,0x1F,0x9F,0x5F,0xDF,0x3F,0xBF,0x7F,0xFF,
-};
-
 #define times4(x) x, x, x, x
 #define times256(x) times4(times4(times4(times4(times4(x)))))
 
@@ -123,7 +106,7 @@ const uint8_t ff_zigzag_direct[64] = {
     53, 60, 61, 54, 47, 55, 62, 63
 };
 
-const uint8_t ff_zigzag_scan[16] = {
+const uint8_t ff_zigzag_scan[16+1] = {
     0 + 0 * 4, 1 + 0 * 4, 0 + 1 * 4, 0 + 2 * 4,
     1 + 1 * 4, 2 + 0 * 4, 3 + 0 * 4, 2 + 1 * 4,
     1 + 2 * 4, 0 + 3 * 4, 1 + 3 * 4, 2 + 2 * 4,
diff --git a/libavcodec/mdct_fixed.c b/libavcodec/mdct_fixed.c
index 9e06861..a32cb00 100644
--- a/libavcodec/mdct_fixed.c
+++ b/libavcodec/mdct_fixed.c
@@ -1,22 +1,23 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #define FFT_FLOAT 0
+#define FFT_FIXED_32 0
 #include "mdct_template.c"
 
 /* same as ff_mdct_calcw_c with double-width unscaled output */
diff --git a/libavcodec/mdct_fixed_32.c b/libavcodec/mdct_fixed_32.c
new file mode 100644
index 0000000..5a34dfe
--- /dev/null
+++ b/libavcodec/mdct_fixed_32.c
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Stanislav Ocovaj (socovaj@mips.com)
+ *           Goran Cordasic   (goran@mips.com)
+ *           Djordje Pesut    (djordje@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define FFT_FLOAT 0
+#define FFT_FIXED_32 1
+#include "mdct_template.c"
diff --git a/libavcodec/mdct_float.c b/libavcodec/mdct_float.c
index a0a62b3..cff2d21 100644
--- a/libavcodec/mdct_float.c
+++ b/libavcodec/mdct_float.c
@@ -1,20 +1,21 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #define FFT_FLOAT 1
+#define FFT_FIXED_32 0
 #include "mdct_template.c"
diff --git a/libavcodec/mdct_template.c b/libavcodec/mdct_template.c
index 5b3a6ff..04396b4 100644
--- a/libavcodec/mdct_template.c
+++ b/libavcodec/mdct_template.c
@@ -2,26 +2,27 @@
  * MDCT/IMDCT transforms
  * Copyright (c) 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <stdlib.h>
 #include <string.h>
 #include "libavutil/common.h"
+#include "libavutil/libm.h"
 #include "libavutil/mathematics.h"
 #include "fft.h"
 #include "fft-internal.h"
@@ -34,7 +35,11 @@
 #if FFT_FLOAT
 #   define RSCALE(x) (x)
 #else
+#if FFT_FIXED_32
+#   define RSCALE(x) (((x) + 32) >> 6)
+#else /* FFT_FIXED_32 */
 #   define RSCALE(x) ((x) >> 1)
+#endif /* FFT_FIXED_32 */
 #endif
 
 /**
@@ -56,27 +61,7 @@ av_cold int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale)
     if (ff_fft_init(s, s->mdct_bits - 2, inverse) < 0)
         goto fail;
 
-    s->imdct_calc  = ff_imdct_calc_c;
-    s->imdct_half  = ff_imdct_half_c;
-    s->mdct_calc   = ff_mdct_calc_c;
-
-#if FFT_FLOAT
-    if (ARCH_AARCH64)
-        ff_mdct_init_aarch64(s);
-    if (ARCH_ARM)
-        ff_mdct_init_arm(s);
-    if (ARCH_PPC)
-        ff_mdct_init_ppc(s);
-    if (ARCH_X86)
-        ff_mdct_init_x86(s);
-    s->mdct_calcw  = s->mdct_calc;
-#else
-    s->mdct_calcw  = ff_mdct_calcw_c;
-    if (ARCH_ARM)
-        ff_mdct_fixed_init_arm(s);
-#endif
-
-    s->tcos = av_malloc(n/2 * sizeof(FFTSample));
+    s->tcos = av_malloc_array(n/2, sizeof(FFTSample));
     if (!s->tcos)
         goto fail;
 
@@ -97,8 +82,13 @@ av_cold int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale)
     scale = sqrt(fabs(scale));
     for(i=0;i<n4;i++) {
         alpha = 2 * M_PI * (i + theta) / n;
+#if FFT_FIXED_32
+        s->tcos[i*tstep] = lrint(-cos(alpha) * 2147483648.0);
+        s->tsin[i*tstep] = lrint(-sin(alpha) * 2147483648.0);
+#else
         s->tcos[i*tstep] = FIX15(-cos(alpha) * scale);
         s->tsin[i*tstep] = FIX15(-sin(alpha) * scale);
+#endif
     }
     return 0;
  fail:
diff --git a/libavcodec/mdec.c b/libavcodec/mdec.c
index 39395e2..1cc4ca4 100644
--- a/libavcodec/mdec.c
+++ b/libavcodec/mdec.c
@@ -4,20 +4,20 @@
  *
  * based upon code from Sebastian Jedruszkiewicz <elf@frogger.rules.pl>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,6 +29,7 @@
 
 #include "avcodec.h"
 #include "blockdsp.h"
+#include "bswapdsp.h"
 #include "idctdsp.h"
 #include "mpeg12.h"
 #include "thread.h"
@@ -36,6 +37,7 @@
 typedef struct MDECContext {
     AVCodecContext *avctx;
     BlockDSPContext bdsp;
+    BswapDSPContext bbdsp;
     IDCTDSPContext idsp;
     ThreadFrame frame;
     GetBitContext gb;
@@ -129,7 +131,7 @@ static inline int mdec_decode_block_intra(MDECContext *a, int16_t *block, int n)
 static inline int decode_mb(MDECContext *a, int16_t block[6][64])
 {
     int i, ret;
-    const int block_index[6] = { 5, 4, 0, 1, 2, 3 };
+    static const int block_index[6] = { 5, 4, 0, 1, 2, 3 };
 
     a->bdsp.clear_blocks(block[0]);
 
@@ -171,23 +173,19 @@ static int decode_frame(AVCodecContext *avctx,
     const uint8_t *buf    = avpkt->data;
     int buf_size          = avpkt->size;
     ThreadFrame frame     = { .f = data };
-    int i, ret;
+    int ret;
 
-    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
         return ret;
-    }
     frame.f->pict_type = AV_PICTURE_TYPE_I;
     frame.f->key_frame = 1;
 
-    av_fast_malloc(&a->bitstream_buffer, &a->bitstream_buffer_size, buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
+    av_fast_padded_malloc(&a->bitstream_buffer, &a->bitstream_buffer_size, buf_size);
     if (!a->bitstream_buffer)
         return AVERROR(ENOMEM);
-    for (i = 0; i < buf_size; i += 2) {
-        a->bitstream_buffer[i]     = buf[i + 1];
-        a->bitstream_buffer[i + 1] = buf[i];
-    }
-    init_get_bits(&a->gb, a->bitstream_buffer, buf_size * 8);
+    a->bbdsp.bswap16_buf((uint16_t *)a->bitstream_buffer, (uint16_t *)buf, (buf_size + 1) / 2);
+    if ((ret = init_get_bits8(&a->gb, a->bitstream_buffer, buf_size)) < 0)
+        return ret;
 
     /* skip over 4 preamble bytes in stream (typically 0xXX 0xXX 0x00 0x38) */
     skip_bits(&a->gb, 32);
@@ -221,6 +219,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
     a->avctx           = avctx;
 
     ff_blockdsp_init(&a->bdsp, avctx);
+    ff_bswapdsp_init(&a->bbdsp);
     ff_idctdsp_init(&a->idsp, avctx);
     ff_mpeg12_init_vlcs();
     ff_init_scantable(a->idsp.idct_permutation, &a->scantable,
@@ -234,6 +233,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
     return 0;
 }
 
+#if HAVE_THREADS
 static av_cold int decode_init_thread_copy(AVCodecContext *avctx)
 {
     MDECContext * const a = avctx->priv_data;
@@ -242,6 +242,7 @@ static av_cold int decode_init_thread_copy(AVCodecContext *avctx)
 
     return 0;
 }
+#endif
 
 static av_cold int decode_end(AVCodecContext *avctx)
 {
diff --git a/libavcodec/me_cmp.c b/libavcodec/me_cmp.c
index eb98a72..dc76b07 100644
--- a/libavcodec/me_cmp.c
+++ b/libavcodec/me_cmp.c
@@ -1,22 +1,27 @@
 /*
- * This file is part of Libav.
+ * DSP utils
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/internal.h"
 #include "avcodec.h"
 #include "copy_block.h"
 #include "simple_idct.h"
@@ -103,8 +108,8 @@ static int sum_abs_dctelem_c(int16_t *block)
     return sum;
 }
 
-#define avg2(a, b) ((a + b + 1) >> 1)
-#define avg4(a, b, c, d) ((a + b + c + d + 2) >> 2)
+#define avg2(a, b) (((a) + (b) + 1) >> 1)
+#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
 
 static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                               ptrdiff_t stride, int h)
@@ -409,6 +414,14 @@ void ff_set_cmp(MECmpContext *c, me_cmp_func *cmp, int type)
         case FF_CMP_NSSE:
             cmp[i] = c->nsse[i];
             break;
+#if CONFIG_DWT
+        case FF_CMP_W53:
+            cmp[i]= c->w53[i];
+            break;
+        case FF_CMP_W97:
+            cmp[i]= c->w97[i];
+            break;
+#endif
         default:
             av_log(NULL, AV_LOG_ERROR,
                    "internal error in cmp function selection\n");
@@ -436,7 +449,7 @@ static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst,
 {
     int i, temp[64], sum = 0;
 
-    assert(h == 8);
+    av_assert2(h == 8);
 
     for (i = 0; i < 8; i++) {
         // FIXME: try pointer walks
@@ -488,7 +501,7 @@ static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src,
 {
     int i, temp[64], sum = 0;
 
-    assert(h == 8);
+    av_assert2(h == 8);
 
     for (i = 0; i < 8; i++) {
         // FIXME: try pointer walks
@@ -540,7 +553,7 @@ static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1,
 {
     LOCAL_ALIGNED_16(int16_t, temp, [64]);
 
-    assert(h == 8);
+    av_assert2(h == 8);
 
     s->pdsp.diff_pixels(temp, src1, src2, stride);
     s->fdsp.fdct(temp);
@@ -607,7 +620,7 @@ static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1,
     LOCAL_ALIGNED_16(int16_t, temp, [64]);
     int sum = 0, i;
 
-    assert(h == 8);
+    av_assert2(h == 8);
 
     s->pdsp.diff_pixels(temp, src1, src2, stride);
     s->fdsp.fdct(temp);
@@ -625,7 +638,7 @@ static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1,
     int16_t *const bak = temp + 64;
     int sum = 0, i;
 
-    assert(h == 8);
+    av_assert2(h == 8);
     s->mb_intra = 0;
 
     s->pdsp.diff_pixels(temp, src1, src2, stride);
@@ -654,7 +667,7 @@ static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
     const int esc_length = s->ac_esc_length;
     uint8_t *length, *last_length;
 
-    assert(h == 8);
+    av_assert2(h == 8);
 
     copy_block8(lsrc1, src1, 8, stride, 8);
     copy_block8(lsrc2, src2, 8, stride, 8);
@@ -698,7 +711,7 @@ static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
 
         level = temp[i] + 64;
 
-        assert(level - 64);
+        av_assert2(level - 64);
 
         if ((level & (~127)) == 0) {
             bits += last_length[UNI_AC_ENC_INDEX(run, level)];
@@ -729,7 +742,7 @@ static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
     const int esc_length = s->ac_esc_length;
     uint8_t *length, *last_length;
 
-    assert(h == 8);
+    av_assert2(h == 8);
 
     s->pdsp.diff_pixels(temp, src1, src2, stride);
 
@@ -770,7 +783,7 @@ static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
 
         level = temp[i] + 64;
 
-        assert(level - 64);
+        av_assert2(level - 64);
 
         if ((level & (~127)) == 0)
             bits += last_length[UNI_AC_ENC_INDEX(run, level)];
@@ -803,20 +816,24 @@ static int vsad_intra ## size ## _c(MpegEncContext *c,                  \
 VSAD_INTRA(8)
 VSAD_INTRA(16)
 
-static int vsad16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
-                    ptrdiff_t stride, int h)
-{
-    int score = 0, x, y;
-
-    for (y = 1; y < h; y++) {
-        for (x = 0; x < 16; x++)
-            score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
-        s1 += stride;
-        s2 += stride;
-    }
-
-    return score;
+#define VSAD(size) /* sum of |d[y] - d[y+1]| with d = s1 - s2 */               \
+static int vsad ## size ## _c(MpegEncContext *c,                               \
+                              uint8_t *s1, uint8_t *s2,                        \
+                              ptrdiff_t stride, int h)                               \
+{                                                                              \
+    int score = 0, x, y;                                                       \
+                                                                               \
+    for (y = 1; y < h; y++) {                                                  \
+        for (x = 0; x < size; x++)                                             \
+            score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);   \
+        s1 += stride;                                                          \
+        s2 += stride;                                                          \
+    }                                                                          \
+                                                                               \
+    return score;                                                              \
 }
+VSAD(8)  /* defines vsad8_c(): rows are 8 pixels wide */
+VSAD(16) /* defines vsad16_c(): rows are 16 pixels wide */
 
 #define SQ(a) ((a) * (a))
 #define VSSE_INTRA(size)                                                \
@@ -841,20 +858,23 @@ static int vsse_intra ## size ## _c(MpegEncContext *c,                  \
 VSSE_INTRA(8)
 VSSE_INTRA(16)
 
-static int vsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
-                    ptrdiff_t stride, int h)
-{
-    int score = 0, x, y;
-
-    for (y = 1; y < h; y++) {
-        for (x = 0; x < 16; x++)
-            score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
-        s1 += stride;
-        s2 += stride;
-    }
-
-    return score;
+#define VSSE(size) /* sum of (d[y] - d[y+1])^2 with d = s1 - s2 */             \
+static int vsse ## size ## _c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,     \
+                              ptrdiff_t stride, int h)                         \
+{                                                                              \
+    int score = 0, x, y;                                                       \
+                                                                               \
+    for (y = 1; y < h; y++) {                                                  \
+        for (x = 0; x < size; x++)                                             \
+            score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);      \
+        s1 += stride;                                                          \
+        s2 += stride;                                                          \
+    }                                                                          \
+                                                                               \
+    return score;                                                              \
 }
+VSSE(8)  /* defines vsse8_c(): rows are 8 pixels wide */
+VSSE(16) /* defines vsse16_c(): rows are 16 pixels wide */
 
 #define WRAPPER8_16_SQ(name8, name16)                                   \
 static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,        \
@@ -892,8 +912,31 @@ av_cold void ff_me_cmp_init_static(void)
         ff_square_tab[i] = (i - 256) * (i - 256);
 }
 
+int ff_check_alignment(void)
+{
+    static int did_fail = 0;
+    LOCAL_ALIGNED_16(int, aligned, [4]);
+    /* Returns 0 if the LOCAL_ALIGNED_16 buffer is 16-byte aligned, -1 if not. */
+    if (((intptr_t)aligned & 15) == 0)
+        return 0;
+    /* misaligned: did_fail makes sure the explanation is logged only once */
+    if (!did_fail) {
+#if HAVE_MMX || HAVE_ALTIVEC
+        av_log(NULL, AV_LOG_ERROR,
+            "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
+            "and may be very slow or crash. This is not a bug in libavcodec,\n"
+            "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
+            "Do not report crashes to FFmpeg developers.\n");
+#endif
+        did_fail = 1;
+    }
+    return -1;
+}
+
 av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
 {
+    ff_check_alignment();
+
     c->sum_abs_dctelem = sum_abs_dctelem_c;
 
     /* TODO [0] 16  [1] 8 */
@@ -927,18 +970,27 @@ av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
     SET_CMP_FUNC(rd)
     SET_CMP_FUNC(bit)
     c->vsad[0] = vsad16_c;
+    c->vsad[1] = vsad8_c;
     c->vsad[4] = vsad_intra16_c;
     c->vsad[5] = vsad_intra8_c;
     c->vsse[0] = vsse16_c;
+    c->vsse[1] = vsse8_c;
     c->vsse[4] = vsse_intra16_c;
     c->vsse[5] = vsse_intra8_c;
     c->nsse[0] = nsse16_c;
     c->nsse[1] = nsse8_c;
+#if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
+    ff_dsputil_init_dwt(c);
+#endif
 
+    if (ARCH_ALPHA)
+        ff_me_cmp_init_alpha(c, avctx);
     if (ARCH_ARM)
         ff_me_cmp_init_arm(c, avctx);
     if (ARCH_PPC)
         ff_me_cmp_init_ppc(c, avctx);
     if (ARCH_X86)
         ff_me_cmp_init_x86(c, avctx);
+    if (ARCH_MIPS)
+        ff_me_cmp_init_mips(c, avctx);
 }
diff --git a/libavcodec/me_cmp.h b/libavcodec/me_cmp.h
index 725f9b2..a3603ec 100644
--- a/libavcodec/me_cmp.h
+++ b/libavcodec/me_cmp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,20 @@
 
 extern uint32_t ff_square_tab[512];
 
+
+/* minimum alignment rules ;)
+ * If you notice errors in the align stuff, need more alignment for some ASM code
+ * for some CPU or need to use a function with less aligned data then send a mail
+ * to the ffmpeg-devel mailing list, ...
+ *
+ * !warning These alignments might not match reality, (missing attribute((align))
+ * stuff somewhere possible).
+ * I (Michael) did not check them, these are just the alignments which I think
+ * could be reached easily ...
+ *
+ * !future video codecs might need functions with less strict alignment
+ */
+
 struct MpegEncContext;
 /* Motion estimation:
  * h is limited to { width / 2, width, 2 * width },
@@ -49,6 +63,8 @@ typedef struct MECmpContext {
     me_cmp_func vsad[6];
     me_cmp_func vsse[6];
     me_cmp_func nsse[6];
+    me_cmp_func w53[6];
+    me_cmp_func w97[6];
     me_cmp_func dct_max[6];
     me_cmp_func dct264_sad[6];
 
@@ -64,11 +80,17 @@ typedef struct MECmpContext {
 
 void ff_me_cmp_init_static(void);
 
+int ff_check_alignment(void);
+
 void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx);
+void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx);
 void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx);
+void ff_me_cmp_init_mips(MECmpContext *c, AVCodecContext *avctx);
 
 void ff_set_cmp(MECmpContext *c, me_cmp_func *cmp, int type);
 
+void ff_dsputil_init_dwt(MECmpContext *c);
+
 #endif /* AVCODEC_ME_CMP_H */
diff --git a/libavcodec/mediacodec_sw_buffer.c b/libavcodec/mediacodec_sw_buffer.c
new file mode 100644
index 0000000..7baf120
--- /dev/null
+++ b/libavcodec/mediacodec_sw_buffer.c
@@ -0,0 +1,339 @@
+/*
+ * Android MediaCodec software buffer copy functions
+ *
+ * Copyright (c) 2015-2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <string.h>
+#include <sys/types.h>
+
+#include "libavutil/frame.h"
+#include "libavutil/mem.h"
+
+#include "avcodec.h"
+#include "mediacodecdec.h"
+#include "mediacodec_wrapper.h"
+#include "mediacodec_sw_buffer.h"
+
+#define QCOM_TILE_WIDTH 64
+#define QCOM_TILE_HEIGHT 32
+#define QCOM_TILE_SIZE (QCOM_TILE_WIDTH * QCOM_TILE_HEIGHT)
+#define QCOM_TILE_GROUP_SIZE (4 * QCOM_TILE_SIZE)
+
+/**
+ * The code handling the various YUV color formats is taken from the
+ * GStreamer project.
+ *
+ * Gstreamer reference:
+ * https://cgit.freedesktop.org/gstreamer/gst-plugins-bad/tree/sys/androidmedia/
+ *
+ * Copyright (C) 2012, Collabora Ltd.
+ *   Author: Sebastian Dröge <sebastian.droege@collabora.co.uk>
+ *
+ * Copyright (C) 2012, Rafaël Carré <funman@videolanorg>
+ *
+ * Copyright (C) 2015, Sebastian Dröge <sebastian@centricular.com>
+ *
+ * Copyright (C) 2014-2015, Collabora Ltd.
+ *   Author: Matthieu Bouron <matthieu.bouron@gcollabora.com>
+ *
+ * Copyright (C) 2015, Edward Hervey
+ *   Author: Edward Hervey <bilboed@gmail.com>
+ *
+ * Copyright (C) 2015, Matthew Waters <matthew@centricular.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+void ff_mediacodec_sw_buffer_copy_yuv420_planar(AVCodecContext *avctx,
+                                                MediaCodecDecContext *s,
+                                                uint8_t *data,
+                                                size_t size, /* not read by this function */
+                                                FFAMediaCodecBufferInfo *info,
+                                                AVFrame *frame)
+{
+    int i;
+    uint8_t *src = NULL;
+    /* Copy three planes (i = 0, 1, 2) from data into frame->data[i], one plane per pass. */
+    for (i = 0; i < 3; i++) {
+        int stride = s->stride;
+        int height;
+        /* every plane is addressed relative to the payload start inside data */
+        src = data + info->offset;
+        if (i == 0) {
+            height = avctx->height;
+            /* skip the cropped rows and columns of the first plane */
+            src += s->crop_top * s->stride;
+            src += s->crop_left;
+        } else {
+            height = avctx->height / 2;
+            stride = (s->stride + 1) / 2; /* half the first-plane stride, rounded up */
+            /* step over the first plane: slice_height rows of s->stride bytes */
+            src += s->slice_height * s->stride;
+            /* the third plane additionally starts after the second one */
+            if (i == 2) {
+                src += ((s->slice_height + 1) / 2) * stride;
+            }
+            /* NOTE(review): crop_top is applied unhalved here while crop_left is halved -- confirm intended */
+            src += s->crop_top * stride;
+            src += (s->crop_left / 2);
+        }
+        /* equal strides: the whole plane is moved with a single memcpy */
+        if (frame->linesize[i] == stride) {
+            memcpy(frame->data[i], src, height * stride);
+        } else {
+            int j, width;
+            uint8_t *dst = frame->data[i];
+            /* number of bytes copied per row */
+            if (i == 0) {
+                width = avctx->width;
+            } else if (i >= 1) {
+                width = FFMIN(frame->linesize[i], FFALIGN(avctx->width, 2) / 2);
+            }
+            /* row-by-row copy, each side advancing by its own stride */
+            for (j = 0; j < height; j++) {
+                memcpy(dst, src, width);
+                src += stride;
+                dst += frame->linesize[i];
+            }
+        }
+    }
+}
+
+void ff_mediacodec_sw_buffer_copy_yuv420_semi_planar(AVCodecContext *avctx,
+                                                     MediaCodecDecContext *s,
+                                                     uint8_t *data,
+                                                     size_t size,
+                                                     FFAMediaCodecBufferInfo *info,
+                                                     AVFrame *frame)
+{
+    /*
+     * Copies two planes out of the decoder buffer into frame->data[0..1].
+     * Both planes use s->stride as source pitch; the second plane starts
+     * s->slice_height rows after the first and has half as many rows.
+     * The size argument is not consulted.
+     */
+    int plane;
+
+    for (plane = 0; plane < 2; plane++) {
+        const int src_pitch = s->stride;
+        const int dst_pitch = frame->linesize[plane];
+        const int rows      = plane ? avctx->height / 2 : avctx->height;
+        uint8_t *src        = data + info->offset;
+        uint8_t *dst        = frame->data[plane];
+
+        /* second plane: step over the slice_height rows of the first one */
+        if (plane)
+            src += s->slice_height * src_pitch;
+        /* the same crop offset is applied to both planes */
+        src += s->crop_top * src_pitch;
+        src += s->crop_left;
+
+        if (dst_pitch == src_pitch) {
+            /* identical pitches: one bulk copy covers the whole plane */
+            memcpy(dst, src, rows * src_pitch);
+        } else {
+            /* differing pitches: copy row_bytes of every row */
+            int row;
+            int row_bytes = avctx->width;
+
+            if (plane)
+                row_bytes = FFMIN(dst_pitch, FFALIGN(avctx->width, 2));
+            for (row = 0; row < rows; row++) {
+                memcpy(dst, src, row_bytes);
+                src += src_pitch;
+                dst += dst_pitch;
+            }
+        }
+    }
+}
+
+
+
+void ff_mediacodec_sw_buffer_copy_yuv420_packed_semi_planar(AVCodecContext *avctx,
+                                                            MediaCodecDecContext *s,
+                                                            uint8_t *data,
+                                                            size_t size,
+                                                            FFAMediaCodecBufferInfo *info,
+                                                            AVFrame *frame)
+{
+    /*
+     * Copies two planes into frame->data[0..1] with s->stride as source pitch.
+     * The first plane is read from the payload start with no crop offset;
+     * only the second plane's start is adjusted by slice_height and the crop.
+     * The size argument is not consulted.
+     */
+    int plane;
+
+    for (plane = 0; plane < 2; plane++) {
+        const int src_pitch = s->stride;
+        const int dst_pitch = frame->linesize[plane];
+        const int rows      = plane ? avctx->height / 2 : avctx->height;
+        uint8_t *src        = data + info->offset;
+        uint8_t *dst        = frame->data[plane];
+
+        if (plane) {
+            src += (s->slice_height - s->crop_top / 2) * src_pitch;
+            src += s->crop_top * src_pitch;
+            src += s->crop_left;
+        }
+
+        if (dst_pitch == src_pitch) {
+            memcpy(dst, src, rows * src_pitch);
+        } else {
+            int row;
+            int row_bytes = avctx->width;
+
+            if (plane)
+                row_bytes = FFMIN(dst_pitch, FFALIGN(avctx->width, 2));
+            /* pitches differ: transfer row_bytes of every row */
+            for (row = 0; row < rows; row++) {
+                memcpy(dst, src, row_bytes);
+                src += src_pitch;
+                dst += dst_pitch;
+            }
+        }
+    }
+}
+
+/**
+ * The code handling the QCOM_FormatYUV420PackedSemiPlanar64x32Tile2m8ka
+ * color format is taken from the VLC project.
+ *
+ * VLC reference:
+ * http://git.videolan.org/?p=vlc.git;a=blob;f=modules/codec/omxil/qcom.c;hb=HEAD
+ *
+ * VLC copyright notice:
+ *
+ *****************************************************************************
+ * qcom.c : pixel format translation for Qualcomm tiled nv12
+ *****************************************************************************
+ * Copyright © 2012 Rafaël Carré
+ *
+ * Authors: Rafaël Carré <funman@videolanorg>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ *
+ */
+
+static size_t qcom_tile_pos(size_t x, size_t y, size_t w, size_t h)
+{
+    /* base index: column plus the even-aligned row times the row width w */
+    size_t pos = (y & ~1) * w + x;
+
+    if (y % 2 != 0)                       /* odd row */
+        pos += (x & ~3) + 2;
+    else if (h % 2 == 0 || y != h - 1)    /* even row, except the last row of an odd h */
+        pos += (x + 2) & ~3;
+
+    return pos;
+}
+
+void ff_mediacodec_sw_buffer_copy_yuv420_packed_semi_planar_64x32Tile2m8ka(AVCodecContext *avctx,
+                                                                           MediaCodecDecContext *s,
+                                                                           uint8_t *data,
+                                                                           size_t size,
+                                                                           FFAMediaCodecBufferInfo *info,
+                                                                           AVFrame *frame)
+{
+    size_t width = frame->width;
+    size_t linesize = frame->linesize[0]; /* also used as the row pitch of frame->data[1] below */
+    size_t height = frame->height;        /* rows still to be written; reduced after every tile row */
+    /* ceil-divide the picture into tiles; the tile count per row is rounded up to an even number */
+    const size_t tile_w = (width - 1) / QCOM_TILE_WIDTH + 1;
+    const size_t tile_w_align = (tile_w + 1) & ~1;
+    const size_t tile_h_luma = (height - 1) / QCOM_TILE_HEIGHT + 1;
+    const size_t tile_h_chroma = (height / 2 - 1) / QCOM_TILE_HEIGHT + 1;
+    /* bytes taken by the luma tiles, rounded up to whole tile groups; chroma is read from data + luma_size */
+    size_t luma_size = tile_w_align * tile_h_luma * QCOM_TILE_SIZE;
+    if((luma_size % QCOM_TILE_GROUP_SIZE) != 0)
+        luma_size = (((luma_size - 1) / QCOM_TILE_GROUP_SIZE) + 1) * QCOM_TILE_GROUP_SIZE;
+    /* NOTE(review): avctx, s, size and info (including info->offset) are not read in this function -- confirm */
+    for(size_t y = 0; y < tile_h_luma; y++) {
+        size_t row_width = width; /* columns still to be written in this tile row */
+        for(size_t x = 0; x < tile_w; x++) {
+            size_t tile_width = row_width;
+            size_t tile_height = height;
+            /* dest luma memory index for this tile */
+            size_t luma_idx = y * QCOM_TILE_HEIGHT * linesize + x * QCOM_TILE_WIDTH;
+            /* dest chroma memory index for this tile */
+            /* XXX: remove divisions */
+            size_t chroma_idx = (luma_idx / linesize) * linesize / 2 + (luma_idx % linesize);
+
+            /* luma source pointer for this tile */
+            const uint8_t *src_luma  = data
+                + qcom_tile_pos(x, y,tile_w_align, tile_h_luma) * QCOM_TILE_SIZE;
+
+            /* chroma source pointer for this tile */
+            const uint8_t *src_chroma = data + luma_size
+                + qcom_tile_pos(x, y/2, tile_w_align, tile_h_chroma) * QCOM_TILE_SIZE;
+            if (y & 1) /* odd luma tile rows use the second half of the shared chroma tile */
+                src_chroma += QCOM_TILE_SIZE/2;
+
+            /* account for right columns */
+            if (tile_width > QCOM_TILE_WIDTH)
+                tile_width = QCOM_TILE_WIDTH;
+
+            /* account for bottom rows */
+            if (tile_height > QCOM_TILE_HEIGHT)
+                tile_height = QCOM_TILE_HEIGHT;
+
+            tile_height /= 2; /* each pass below writes two luma rows and one chroma row */
+            while (tile_height--) {
+                memcpy(frame->data[0] + luma_idx, src_luma, tile_width);
+                src_luma += QCOM_TILE_WIDTH;
+                luma_idx += linesize;
+
+                memcpy(frame->data[0] + luma_idx, src_luma, tile_width);
+                src_luma += QCOM_TILE_WIDTH;
+                luma_idx += linesize;
+
+                memcpy(frame->data[1] + chroma_idx, src_chroma, tile_width);
+                src_chroma += QCOM_TILE_WIDTH;
+                chroma_idx += linesize;
+            }
+            row_width -= QCOM_TILE_WIDTH; /* may wrap past zero after the last tile; re-initialized per tile row */
+        }
+        height -= QCOM_TILE_HEIGHT;
+    }
+}
diff --git a/libavcodec/mediacodec_sw_buffer.h b/libavcodec/mediacodec_sw_buffer.h
new file mode 100644
index 0000000..c29de08
--- /dev/null
+++ b/libavcodec/mediacodec_sw_buffer.h
@@ -0,0 +1,62 @@
+/*
+ * Android MediaCodec software buffer copy functions
+ *
+ * Copyright (c) 2015-2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MEDIACODEC_SW_BUFFER_H
+#define AVCODEC_MEDIACODEC_SW_BUFFER_H
+
+#include <sys/types.h>
+
+#include "libavutil/frame.h"
+
+#include "avcodec.h"
+#include "mediacodecdec.h"
+#include "mediacodec_wrapper.h"
+
+void ff_mediacodec_sw_buffer_copy_yuv420_planar(AVCodecContext *avctx,
+                                                MediaCodecDecContext *s,
+                                                uint8_t *data,
+                                                size_t size,
+                                                FFAMediaCodecBufferInfo *info,
+                                                AVFrame *frame);
+
+void ff_mediacodec_sw_buffer_copy_yuv420_semi_planar(AVCodecContext *avctx,
+                                                     MediaCodecDecContext *s,
+                                                     uint8_t *data,
+                                                     size_t size,
+                                                     FFAMediaCodecBufferInfo *info,
+                                                     AVFrame *frame);
+
+void ff_mediacodec_sw_buffer_copy_yuv420_packed_semi_planar(AVCodecContext *avctx,
+                                                     MediaCodecDecContext *s,
+                                                     uint8_t *data,
+                                                     size_t size,
+                                                     FFAMediaCodecBufferInfo *info,
+                                                     AVFrame *frame);
+
+void ff_mediacodec_sw_buffer_copy_yuv420_packed_semi_planar_64x32Tile2m8ka(AVCodecContext *avctx,
+                                                     MediaCodecDecContext *s,
+                                                     uint8_t *data,
+                                                     size_t size,
+                                                     FFAMediaCodecBufferInfo *info,
+                                                     AVFrame *frame);
+
+#endif /* AVCODEC_MEDIACODEC_SW_BUFFER_H */
diff --git a/libavcodec/mediacodec_wrapper.c b/libavcodec/mediacodec_wrapper.c
new file mode 100644
index 0000000..546768f
--- /dev/null
+++ b/libavcodec/mediacodec_wrapper.c
@@ -0,0 +1,1808 @@
+/*
+ * Android MediaCodec Wrapper
+ *
+ * Copyright (c) 2015-2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <jni.h>
+
+#include "libavutil/avassert.h"
+#include "libavutil/mem.h"
+#include "libavutil/avstring.h"
+
+#include "avcodec.h"
+#include "ffjni.h"
+#include "version.h"
+#include "mediacodec_wrapper.h"
+
+/* Type only (no file-scope instance): JNI class refs and member IDs filled in via jni_amediacodeclist_mapping. */
+struct JNIAMediaCodecListFields {
+    jclass mediacodec_list_class;
+    jmethodID init_id;
+    jmethodID find_decoder_for_format_id;
+    /* static MediaCodecList.getCodecCount() / getCodecInfoAt() */
+    jmethodID get_codec_count_id;
+    jmethodID get_codec_info_at_id;
+    /* android.media.MediaCodecInfo */
+    jclass mediacodec_info_class;
+    jmethodID get_name_id;
+    jmethodID get_codec_capabilities_id;
+    jmethodID get_supported_types_id;
+    jmethodID is_encoder_id;
+    /* MediaCodecInfo.CodecCapabilities */
+    jclass codec_capabilities_class;
+    jfieldID color_formats_id;
+    jfieldID profile_levels_id;
+    /* MediaCodecInfo.CodecProfileLevel */
+    jclass codec_profile_level_class;
+    jfieldID profile_id;
+    jfieldID level_id;
+    /* static CodecProfileLevel.AVCProfile* constants */
+    jfieldID avc_profile_baseline_id;
+    jfieldID avc_profile_main_id;
+    jfieldID avc_profile_extended_id;
+    jfieldID avc_profile_high_id;
+    jfieldID avc_profile_high10_id;
+    jfieldID avc_profile_high422_id;
+    jfieldID avc_profile_high444_id;
+
+};
+/* Table handed to ff_jni_init_jfields(): each row gives a Java class, member name and signature plus the JNIAMediaCodecListFields slot to fill. NOTE(review): the trailing 0/1 is presumably FFJniField's "mandatory" flag - confirm in ffjni.h. */
+static const struct FFJniField jni_amediacodeclist_mapping[] = {
+    { "android/media/MediaCodecList", NULL, NULL, FF_JNI_CLASS, offsetof(struct JNIAMediaCodecListFields, mediacodec_list_class), 1 },
+        { "android/media/MediaCodecList", "<init>", "(I)V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecListFields, init_id), 0 },
+        { "android/media/MediaCodecList", "findDecoderForFormat", "(Landroid/media/MediaFormat;)Ljava/lang/String;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecListFields, find_decoder_for_format_id), 0 },
+
+        { "android/media/MediaCodecList", "getCodecCount", "()I", FF_JNI_STATIC_METHOD, offsetof(struct JNIAMediaCodecListFields, get_codec_count_id), 1 },
+        { "android/media/MediaCodecList", "getCodecInfoAt", "(I)Landroid/media/MediaCodecInfo;", FF_JNI_STATIC_METHOD, offsetof(struct JNIAMediaCodecListFields, get_codec_info_at_id), 1 },
+
+    { "android/media/MediaCodecInfo", NULL, NULL, FF_JNI_CLASS, offsetof(struct JNIAMediaCodecListFields, mediacodec_info_class), 1 },
+        { "android/media/MediaCodecInfo", "getName", "()Ljava/lang/String;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecListFields, get_name_id), 1 },
+        { "android/media/MediaCodecInfo", "getCapabilitiesForType", "(Ljava/lang/String;)Landroid/media/MediaCodecInfo$CodecCapabilities;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecListFields, get_codec_capabilities_id), 1 },
+        { "android/media/MediaCodecInfo", "getSupportedTypes", "()[Ljava/lang/String;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecListFields, get_supported_types_id), 1 },
+        { "android/media/MediaCodecInfo", "isEncoder", "()Z", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecListFields, is_encoder_id), 1 },
+
+    { "android/media/MediaCodecInfo$CodecCapabilities", NULL, NULL, FF_JNI_CLASS, offsetof(struct JNIAMediaCodecListFields, codec_capabilities_class), 1 },
+        { "android/media/MediaCodecInfo$CodecCapabilities", "colorFormats", "[I", FF_JNI_FIELD, offsetof(struct JNIAMediaCodecListFields, color_formats_id), 1 },
+        { "android/media/MediaCodecInfo$CodecCapabilities", "profileLevels", "[Landroid/media/MediaCodecInfo$CodecProfileLevel;", FF_JNI_FIELD, offsetof(struct JNIAMediaCodecListFields, profile_levels_id), 1 },
+
+    { "android/media/MediaCodecInfo$CodecProfileLevel", NULL, NULL, FF_JNI_CLASS, offsetof(struct JNIAMediaCodecListFields, codec_profile_level_class), 1 },
+        { "android/media/MediaCodecInfo$CodecProfileLevel", "profile", "I", FF_JNI_FIELD, offsetof(struct JNIAMediaCodecListFields, profile_id), 1 },
+        { "android/media/MediaCodecInfo$CodecProfileLevel", "level", "I", FF_JNI_FIELD, offsetof(struct JNIAMediaCodecListFields, level_id), 1 },
+
+        { "android/media/MediaCodecInfo$CodecProfileLevel", "AVCProfileBaseline", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecListFields, avc_profile_baseline_id), 1 },
+        { "android/media/MediaCodecInfo$CodecProfileLevel", "AVCProfileMain", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecListFields, avc_profile_main_id), 1 },
+        { "android/media/MediaCodecInfo$CodecProfileLevel", "AVCProfileExtended", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecListFields, avc_profile_extended_id), 1 },
+        { "android/media/MediaCodecInfo$CodecProfileLevel", "AVCProfileHigh", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecListFields, avc_profile_high_id), 1 },
+        { "android/media/MediaCodecInfo$CodecProfileLevel", "AVCProfileHigh10", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecListFields, avc_profile_high10_id), 1 },
+        { "android/media/MediaCodecInfo$CodecProfileLevel", "AVCProfileHigh422", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecListFields, avc_profile_high422_id), 1 },
+        { "android/media/MediaCodecInfo$CodecProfileLevel", "AVCProfileHigh444", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecListFields, avc_profile_high444_id), 1 },
+
+    { NULL }
+};
+
+/* Type only (no file-scope instance): MediaFormat class ref and method IDs filled in via jni_amediaformat_mapping. */
+struct JNIAMediaFormatFields {
+    jclass mediaformat_class;
+
+    jmethodID init_id;
+    /* typed getters, each taking a java.lang.String key */
+    jmethodID get_integer_id;
+    jmethodID get_long_id;
+    jmethodID get_float_id;
+    jmethodID get_bytebuffer_id;
+    jmethodID get_string_id;
+    /* typed setters, each taking a java.lang.String key and a value */
+    jmethodID set_integer_id;
+    jmethodID set_long_id;
+    jmethodID set_float_id;
+    jmethodID set_bytebuffer_id;
+    jmethodID set_string_id;
+
+    jmethodID to_string_id;
+
+};
+/* Table handed to ff_jni_init_jfields() for JNIAMediaFormatFields: the MediaFormat class, its no-argument constructor, the String-keyed getters/setters and toString(). */
+static const struct FFJniField jni_amediaformat_mapping[] = {
+    { "android/media/MediaFormat", NULL, NULL, FF_JNI_CLASS, offsetof(struct JNIAMediaFormatFields, mediaformat_class), 1 },
+
+        { "android/media/MediaFormat", "<init>", "()V", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, init_id), 1 },
+
+        { "android/media/MediaFormat", "getInteger", "(Ljava/lang/String;)I", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, get_integer_id), 1 },
+        { "android/media/MediaFormat", "getLong", "(Ljava/lang/String;)J", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, get_long_id), 1 },
+        { "android/media/MediaFormat", "getFloat", "(Ljava/lang/String;)F", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, get_float_id), 1 },
+        { "android/media/MediaFormat", "getByteBuffer", "(Ljava/lang/String;)Ljava/nio/ByteBuffer;", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, get_bytebuffer_id), 1 },
+        { "android/media/MediaFormat", "getString", "(Ljava/lang/String;)Ljava/lang/String;", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, get_string_id), 1 },
+
+        { "android/media/MediaFormat", "setInteger", "(Ljava/lang/String;I)V", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, set_integer_id), 1 },
+        { "android/media/MediaFormat", "setLong", "(Ljava/lang/String;J)V", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, set_long_id), 1 },
+        { "android/media/MediaFormat", "setFloat", "(Ljava/lang/String;F)V", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, set_float_id), 1 },
+        { "android/media/MediaFormat", "setByteBuffer", "(Ljava/lang/String;Ljava/nio/ByteBuffer;)V", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, set_bytebuffer_id), 1 },
+        { "android/media/MediaFormat", "setString", "(Ljava/lang/String;Ljava/lang/String;)V", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, set_string_id), 1 },
+
+        { "android/media/MediaFormat", "toString", "()Ljava/lang/String;", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, to_string_id), 1 },
+
+    { NULL }
+};
+/* AVClass stored in FFAMediaFormat.class by ff_AMediaFormat_new() and ff_AMediaFormat_newFromObject(). */
+static const AVClass amediaformat_class = {
+    .version    = LIBAVCODEC_VERSION_INT,
+    .item_name  = av_default_item_name,
+    .class_name = "amediaformat",
+};
+/* Wrapper pairing a Java MediaFormat object with the JNI IDs needed to call it. */
+struct FFAMediaFormat {
+
+    const AVClass *class; /* set to &amediaformat_class by the constructors; the struct is passed as log context to the ff_jni_* helpers */
+    struct JNIAMediaFormatFields jfields; /* filled by ff_jni_init_jfields() in the constructors, cleared by ff_jni_reset_jfields() */
+    jobject object; /* global reference (NewGlobalRef), released in ff_AMediaFormat_delete() */
+};
+
+/* Type only (no file-scope instance): MediaCodec / MediaCodec.BufferInfo class refs and member IDs filled in via jni_amediacodec_mapping. */
+struct JNIAMediaCodecFields {
+    jclass mediacodec_class;
+    /* static int constants of MediaCodec */
+    jfieldID info_try_again_later_id;
+    jfieldID info_output_buffers_changed_id;
+    jfieldID info_output_format_changed_id;
+
+    jfieldID buffer_flag_codec_config_id;
+    jfieldID buffer_flag_end_of_stream_id;
+    jfieldID buffer_flag_key_frame_id;
+
+    jfieldID configure_flag_encode_id;
+    /* static factory methods */
+    jmethodID create_by_codec_name_id;
+    jmethodID create_decoder_by_type_id;
+    jmethodID create_encoder_by_type_id;
+
+    jmethodID get_name_id;
+    /* configure(), start(), flush(), stop(), release() */
+    jmethodID configure_id;
+    jmethodID start_id;
+    jmethodID flush_id;
+    jmethodID stop_id;
+    jmethodID release_id;
+
+    jmethodID get_output_format_id;
+    /* input buffer methods */
+    jmethodID dequeue_input_buffer_id;
+    jmethodID queue_input_buffer_id;
+    jmethodID get_input_buffer_id;
+    jmethodID get_input_buffers_id;
+    /* output buffer methods */
+    jmethodID dequeue_output_buffer_id;
+    jmethodID get_output_buffer_id;
+    jmethodID get_output_buffers_id;
+    jmethodID release_output_buffer_id;
+    jmethodID release_output_buffer_at_time_id;
+    /* MediaCodec.BufferInfo: class, constructor and instance fields */
+    jclass mediainfo_class;
+
+    jmethodID init_id;
+
+    jfieldID flags_id;
+    jfieldID offset_id;
+    jfieldID presentation_time_us_id;
+    jfieldID size_id;
+
+};
+/* Table handed to ff_jni_init_jfields() for JNIAMediaCodecFields: MediaCodec constants and methods, then the MediaCodec$BufferInfo class (JNI binary name) and its members. */
+static const struct FFJniField jni_amediacodec_mapping[] = {
+    { "android/media/MediaCodec", NULL, NULL, FF_JNI_CLASS, offsetof(struct JNIAMediaCodecFields, mediacodec_class), 1 },
+
+        { "android/media/MediaCodec", "INFO_TRY_AGAIN_LATER", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecFields, info_try_again_later_id), 1 },
+        { "android/media/MediaCodec", "INFO_OUTPUT_BUFFERS_CHANGED", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecFields, info_output_buffers_changed_id), 1 },
+        { "android/media/MediaCodec", "INFO_OUTPUT_FORMAT_CHANGED", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecFields, info_output_format_changed_id), 1 },
+
+        { "android/media/MediaCodec", "BUFFER_FLAG_CODEC_CONFIG", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecFields, buffer_flag_codec_config_id), 1 },
+        { "android/media/MediaCodec", "BUFFER_FLAG_END_OF_STREAM", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecFields, buffer_flag_end_of_stream_id), 1 },
+        { "android/media/MediaCodec", "BUFFER_FLAG_KEY_FRAME", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecFields, buffer_flag_key_frame_id), 0 },
+
+        { "android/media/MediaCodec", "CONFIGURE_FLAG_ENCODE", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecFields, configure_flag_encode_id), 1 },
+
+        { "android/media/MediaCodec", "createByCodecName", "(Ljava/lang/String;)Landroid/media/MediaCodec;", FF_JNI_STATIC_METHOD, offsetof(struct JNIAMediaCodecFields, create_by_codec_name_id), 1 },
+        { "android/media/MediaCodec", "createDecoderByType", "(Ljava/lang/String;)Landroid/media/MediaCodec;", FF_JNI_STATIC_METHOD, offsetof(struct JNIAMediaCodecFields, create_decoder_by_type_id), 1 },
+        { "android/media/MediaCodec", "createEncoderByType", "(Ljava/lang/String;)Landroid/media/MediaCodec;", FF_JNI_STATIC_METHOD, offsetof(struct JNIAMediaCodecFields, create_encoder_by_type_id), 1 },
+
+        { "android/media/MediaCodec", "getName", "()Ljava/lang/String;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, get_name_id), 1 },
+
+        { "android/media/MediaCodec", "configure", "(Landroid/media/MediaFormat;Landroid/view/Surface;Landroid/media/MediaCrypto;I)V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, configure_id), 1 },
+        { "android/media/MediaCodec", "start", "()V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, start_id), 1 },
+        { "android/media/MediaCodec", "flush", "()V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, flush_id), 1 },
+        { "android/media/MediaCodec", "stop", "()V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, stop_id), 1 },
+        { "android/media/MediaCodec", "release", "()V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, release_id), 1 },
+
+        { "android/media/MediaCodec", "getOutputFormat", "()Landroid/media/MediaFormat;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, get_output_format_id), 1 },
+
+        { "android/media/MediaCodec", "dequeueInputBuffer", "(J)I", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, dequeue_input_buffer_id), 1 },
+        { "android/media/MediaCodec", "queueInputBuffer", "(IIIJI)V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, queue_input_buffer_id), 1 },
+        { "android/media/MediaCodec", "getInputBuffer", "(I)Ljava/nio/ByteBuffer;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, get_input_buffer_id), 0 },
+        { "android/media/MediaCodec", "getInputBuffers", "()[Ljava/nio/ByteBuffer;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, get_input_buffers_id), 1 },
+
+        { "android/media/MediaCodec", "dequeueOutputBuffer", "(Landroid/media/MediaCodec$BufferInfo;J)I", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, dequeue_output_buffer_id), 1 },
+        { "android/media/MediaCodec", "getOutputBuffer", "(I)Ljava/nio/ByteBuffer;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, get_output_buffer_id), 0 },
+        { "android/media/MediaCodec", "getOutputBuffers", "()[Ljava/nio/ByteBuffer;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, get_output_buffers_id), 1 },
+        { "android/media/MediaCodec", "releaseOutputBuffer", "(IZ)V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, release_output_buffer_id), 1 },
+        { "android/media/MediaCodec", "releaseOutputBuffer", "(IJ)V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, release_output_buffer_at_time_id), 0 },
+
+    { "android/media/MediaCodec$BufferInfo", NULL, NULL, FF_JNI_CLASS, offsetof(struct JNIAMediaCodecFields, mediainfo_class), 1 },
+
+        { "android/media/MediaCodec$BufferInfo", "<init>", "()V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, init_id), 1 },
+        { "android/media/MediaCodec$BufferInfo", "flags", "I", FF_JNI_FIELD, offsetof(struct JNIAMediaCodecFields, flags_id), 1 },
+        { "android/media/MediaCodec$BufferInfo", "offset", "I", FF_JNI_FIELD, offsetof(struct JNIAMediaCodecFields, offset_id), 1 },
+        { "android/media/MediaCodec$BufferInfo", "presentationTimeUs", "J", FF_JNI_FIELD, offsetof(struct JNIAMediaCodecFields, presentation_time_us_id), 1 },
+        { "android/media/MediaCodec$BufferInfo", "size", "I", FF_JNI_FIELD, offsetof(struct JNIAMediaCodecFields, size_id), 1 },
+
+    { NULL }
+};
+/* AVClass named "amediacodec"; presumably stored in FFAMediaCodec.class by code outside this chunk - TODO confirm. */
+static const AVClass amediacodec_class = {
+    .version    = LIBAVCODEC_VERSION_INT,
+    .item_name  = av_default_item_name,
+    .class_name = "amediacodec",
+};
+/* State for one wrapped MediaCodec. NOTE(review): every member is populated outside this chunk; the notes below are read off the names and types only. */
+struct FFAMediaCodec {
+
+    const AVClass *class;
+
+    struct JNIAMediaCodecFields jfields; /* class refs and member IDs described by jni_amediacodec_mapping */
+
+    jobject object; /* presumably the Java MediaCodec instance - TODO confirm */
+
+    jobject input_buffers;
+    jobject output_buffers;
+    /* presumably cached values of the static MediaCodec constants whose field IDs live in jfields - TODO confirm */
+    int INFO_TRY_AGAIN_LATER;
+    int INFO_OUTPUT_BUFFERS_CHANGED;
+    int INFO_OUTPUT_FORMAT_CHANGED;
+
+    int BUFFER_FLAG_CODEC_CONFIG;
+    int BUFFER_FLAG_END_OF_STREAM;
+    int BUFFER_FLAG_KEY_FRAME;
+
+    int CONFIGURE_FLAG_ENCODE;
+
+    int has_get_i_o_buffer;
+};
+/* Store the result of ff_jni_attach_env() in 'env'. If that is NULL, the
+ * enclosing function returns 'ret' immediately. */
+#define JNI_ATTACH_ENV_OR_RETURN(env, attached, log_ctx, ret) do {  \
+    if (!((env) = ff_jni_attach_env(attached, log_ctx))) {          \
+        return ret;                                                 \
+    }                                                               \
+} while (0)
+/* Store the result of ff_jni_attach_env() in 'env'. If that is NULL, the
+ * enclosing void function returns immediately. */
+#define JNI_ATTACH_ENV_OR_RETURN_VOID(env, attached, log_ctx) do {  \
+    if (!((env) = ff_jni_attach_env(attached, log_ctx))) {          \
+        return;                                                     \
+    }                                                               \
+} while (0)
+/* Call ff_jni_detach_env() only when 'attached' is non-zero. */
+#define JNI_DETACH_ENV(attached, log_ctx) do {  \
+    if ((attached) != 0)                        \
+        ff_jni_detach_env(log_ctx);             \
+} while (0)
+
+
+/* Translate avctx->profile (H.264 only) into the value of the matching CodecProfileLevel.AVCProfile* constant; returns -1 if there is none or a JNI step fails. */
+int ff_AMediaCodecProfile_getProfileFromAVCodecContext(AVCodecContext *avctx)
+{
+    struct JNIAMediaCodecListFields ids = { 0 };
+    jfieldID profile_field = 0;
+    JNIEnv *env = NULL;
+    int attached = 0;
+    int profile = -1;
+
+    JNI_ATTACH_ENV_OR_RETURN(env, &attached, avctx, -1);
+
+    if (ff_jni_init_jfields(env, &ids, jni_amediacodeclist_mapping, 0, avctx) < 0)
+        goto done;
+
+    /* Only H.264 profiles are translated; other codecs keep the -1 result. */
+    if (avctx->codec_id != AV_CODEC_ID_H264)
+        goto done;
+
+    /* Pick the static AVCProfile* field that corresponds to avctx->profile. */
+    switch (avctx->profile) {
+    case FF_PROFILE_H264_BASELINE:
+    case FF_PROFILE_H264_CONSTRAINED_BASELINE:
+        profile_field = ids.avc_profile_baseline_id;
+        break;
+    case FF_PROFILE_H264_MAIN:
+        profile_field = ids.avc_profile_main_id;
+        break;
+    case FF_PROFILE_H264_EXTENDED:
+        profile_field = ids.avc_profile_extended_id;
+        break;
+    case FF_PROFILE_H264_HIGH:
+        profile_field = ids.avc_profile_high_id;
+        break;
+    case FF_PROFILE_H264_HIGH_10:
+    case FF_PROFILE_H264_HIGH_10_INTRA:
+        profile_field = ids.avc_profile_high10_id;
+        break;
+    case FF_PROFILE_H264_HIGH_422:
+    case FF_PROFILE_H264_HIGH_422_INTRA:
+        profile_field = ids.avc_profile_high422_id;
+        break;
+    case FF_PROFILE_H264_HIGH_444:
+    case FF_PROFILE_H264_HIGH_444_INTRA:
+    case FF_PROFILE_H264_HIGH_444_PREDICTIVE:
+        profile_field = ids.avc_profile_high444_id;
+        break;
+    }
+
+    /* Profiles without a case above leave profile_field at 0 and the result at -1. */
+    if (!profile_field)
+        goto done;
+
+    profile = (*env)->GetStaticIntField(env, ids.codec_profile_level_class, profile_field);
+    if (ff_jni_exception_check(env, 1, avctx) < 0)
+        profile = -1;
+
+done:
+    ff_jni_reset_jfields(env, &ids, jni_amediacodeclist_mapping, 0, avctx);
+
+    JNI_DETACH_ENV(attached, avctx);
+
+    return profile;
+}
+/*
+ * Search android.media.MediaCodecList for a codec that handles 'mime'.
+ * Codecs are visited in list order; one is skipped when its isEncoder() value
+ * differs from 'encoder' or its name contains "OMX.google". The first remaining
+ * codec listing 'mime' (case-insensitive) that reports 'profile' in its
+ * profileLevels is chosen; a negative 'profile' accepts any such codec.
+ * Returns the codec name, to be released by the caller with av_free(), or NULL
+ * when nothing matched or a JNI call failed.
+ */
+char *ff_AMediaCodecList_getCodecNameByType(const char *mime, int profile, int encoder, void *log_ctx)
+{
+    int i;
+    int codec_count;
+    int found_codec = 0;
+    char *name = NULL;
+    char *supported_type = NULL;
+
+    int attached = 0;
+    JNIEnv *env = NULL;
+    struct JNIAMediaCodecListFields jfields = { 0 };
+    struct JNIAMediaFormatFields mediaformat_jfields = { 0 };
+
+    jobject codec_name = NULL;
+
+    jobject info = NULL;
+    jobject type = NULL;
+    jobjectArray types = NULL;
+
+    jobject capabilities = NULL;
+    jobject profile_level = NULL;
+    jobjectArray profile_levels = NULL;
+
+    JNI_ATTACH_ENV_OR_RETURN(env, &attached, log_ctx, NULL);
+
+    if (ff_jni_init_jfields(env, &jfields, jni_amediacodeclist_mapping, 0, log_ctx) < 0) {
+        goto done;
+    }
+
+    /* Not used below, but kept so that a failure to resolve MediaFormat still aborts the lookup as before. */
+    if (ff_jni_init_jfields(env, &mediaformat_jfields, jni_amediaformat_mapping, 0, log_ctx) < 0) {
+        goto done;
+    }
+
+    codec_count = (*env)->CallStaticIntMethod(env, jfields.mediacodec_list_class, jfields.get_codec_count_id);
+    if (ff_jni_exception_check(env, 1, log_ctx) < 0) {
+        goto done;
+    }
+
+    for(i = 0; i < codec_count; i++) {
+        int j;
+        int type_count;
+        int is_encoder;
+
+        info = (*env)->CallStaticObjectMethod(env, jfields.mediacodec_list_class, jfields.get_codec_info_at_id, i);
+        if (ff_jni_exception_check(env, 1, log_ctx) < 0) {
+            goto done;
+        }
+
+        types = (*env)->CallObjectMethod(env, info, jfields.get_supported_types_id);
+        if (ff_jni_exception_check(env, 1, log_ctx) < 0) {
+            goto done;
+        }
+
+        is_encoder = (*env)->CallBooleanMethod(env, info, jfields.is_encoder_id);
+        if (ff_jni_exception_check(env, 1, log_ctx) < 0) {
+            goto done;
+        }
+
+        if (is_encoder != encoder) {
+            goto done_with_info;
+        }
+
+        type_count = (*env)->GetArrayLength(env, types);
+        for (j = 0; j < type_count; j++) {
+            int k;
+            int profile_count;
+
+            type = (*env)->GetObjectArrayElement(env, types, j);
+            if (ff_jni_exception_check(env, 1, log_ctx) < 0) {
+                goto done;
+            }
+
+            supported_type = ff_jni_jstring_to_utf_chars(env, type, log_ctx);
+            if (!supported_type) {
+                goto done;
+            }
+
+            if (!av_strcasecmp(supported_type, mime)) {
+                codec_name = (*env)->CallObjectMethod(env, info, jfields.get_name_id);
+                if (ff_jni_exception_check(env, 1, log_ctx) < 0) {
+                    goto done;
+                }
+
+                name = ff_jni_jstring_to_utf_chars(env, codec_name, log_ctx);
+                if (!name) {
+                    goto done;
+                }
+
+                /* The Java string is no longer needed; release it here so its
+                 * local reference is not leaked once per matching type. */
+                if (codec_name) {
+                    (*env)->DeleteLocalRef(env, codec_name);
+                    codec_name = NULL;
+                }
+
+                if (strstr(name, "OMX.google")) {
+                    av_freep(&name);
+                    goto done_with_type;
+                }
+
+                capabilities = (*env)->CallObjectMethod(env, info, jfields.get_codec_capabilities_id, type);
+                if (ff_jni_exception_check(env, 1, log_ctx) < 0) {
+                    goto done;
+                }
+
+                profile_levels = (*env)->GetObjectField(env, capabilities, jfields.profile_levels_id);
+                if (ff_jni_exception_check(env, 1, log_ctx) < 0) {
+                    goto done;
+                }
+
+                /* A negative profile means "no constraint": accept the codec
+                 * even when its profileLevels array is empty. */
+                found_codec = profile < 0;
+
+                profile_count = (*env)->GetArrayLength(env, profile_levels);
+                for (k = 0; !found_codec && k < profile_count; k++) {
+                    int supported_profile = 0;
+
+                    profile_level = (*env)->GetObjectArrayElement(env, profile_levels, k);
+                    if (ff_jni_exception_check(env, 1, log_ctx) < 0) {
+                        goto done;
+                    }
+
+                    supported_profile = (*env)->GetIntField(env, profile_level, jfields.profile_id);
+                    if (ff_jni_exception_check(env, 1, log_ctx) < 0) {
+                        goto done;
+                    }
+
+                    found_codec = profile == supported_profile;
+
+                    if (profile_level) {
+                        (*env)->DeleteLocalRef(env, profile_level);
+                        profile_level = NULL;
+                    }
+                }
+            }
+
+done_with_type:
+            if (profile_levels) {
+                (*env)->DeleteLocalRef(env, profile_levels);
+                profile_levels = NULL;
+            }
+
+            if (capabilities) {
+                (*env)->DeleteLocalRef(env, capabilities);
+                capabilities = NULL;
+            }
+
+            if (type) {
+                (*env)->DeleteLocalRef(env, type);
+                type = NULL;
+            }
+
+            av_freep(&supported_type);
+
+            if (found_codec) {
+                break;
+            }
+
+            av_freep(&name);
+        }
+
+done_with_info:
+        if (info) {
+            (*env)->DeleteLocalRef(env, info);
+            info = NULL;
+        }
+
+        if (types) {
+            (*env)->DeleteLocalRef(env, types);
+            types = NULL;
+        }
+
+        if (found_codec) {
+            break;
+        }
+    }
+
+done:
+    if (codec_name) {
+        (*env)->DeleteLocalRef(env, codec_name);
+    }
+
+    if (info) {
+        (*env)->DeleteLocalRef(env, info);
+    }
+
+    if (type) {
+        (*env)->DeleteLocalRef(env, type);
+    }
+
+    if (types) {
+        (*env)->DeleteLocalRef(env, types);
+    }
+
+    if (capabilities) {
+        (*env)->DeleteLocalRef(env, capabilities);
+    }
+
+    if (profile_level) {
+        (*env)->DeleteLocalRef(env, profile_level);
+    }
+
+    if (profile_levels) {
+        (*env)->DeleteLocalRef(env, profile_levels);
+    }
+
+    av_freep(&supported_type);
+
+    ff_jni_reset_jfields(env, &jfields, jni_amediacodeclist_mapping, 0, log_ctx);
+    ff_jni_reset_jfields(env, &mediaformat_jfields, jni_amediaformat_mapping, 0, log_ctx);
+
+    JNI_DETACH_ENV(attached, log_ctx);
+
+    if (!found_codec) {
+        av_freep(&name);
+    }
+
+    return name;
+}
+/* Create a Java MediaFormat with its no-argument constructor and wrap it; returns NULL on allocation or JNI failure. */
+FFAMediaFormat *ff_AMediaFormat_new(void)
+{
+    int attached = 0;
+    JNIEnv *env = NULL;
+    FFAMediaFormat *format = NULL;
+    jobject object = NULL;
+
+    format = av_mallocz(sizeof(*format));
+    if (!format) {
+        return NULL;
+    }
+    format->class = &amediaformat_class;
+
+    env = ff_jni_attach_env(&attached, format);
+    if (!env) {
+        av_freep(&format);
+        return NULL;
+    }
+
+    if (ff_jni_init_jfields(env, &format->jfields, jni_amediaformat_mapping, 1, format) < 0) {
+        goto fail;
+    }
+
+    object = (*env)->NewObject(env, format->jfields.mediaformat_class, format->jfields.init_id);
+    if (!object) {
+        goto fail;
+    }
+
+    /* The wrapper keeps only a global reference; the local one is released whether or not that succeeds. */
+    format->object = (*env)->NewGlobalRef(env, object);
+    (*env)->DeleteLocalRef(env, object);
+    if (!format->object) {
+        goto fail;
+    }
+
+    JNI_DETACH_ENV(attached, format);
+
+    return format;
+fail:
+    ff_jni_reset_jfields(env, &format->jfields, jni_amediaformat_mapping, 1, format);
+    JNI_DETACH_ENV(attached, format);
+    av_freep(&format);
+    return NULL;
+}
+/* Wrap an existing Java MediaFormat: takes a new global reference to 'object' and
+ * returns a freshly allocated FFAMediaFormat, or NULL on any failure. */
+static FFAMediaFormat *ff_AMediaFormat_newFromObject(void *object)
+{
+    JNIEnv *env;
+    int attached = 0;
+    FFAMediaFormat *wrapper = av_mallocz(sizeof(FFAMediaFormat));
+
+    if (!wrapper)
+        return NULL;
+    wrapper->class = &amediaformat_class;
+
+    /* Without a JNIEnv nothing else has been set up, so freeing the struct is enough. */
+    env = ff_jni_attach_env(&attached, wrapper);
+    if (!env) {
+        av_freep(&wrapper);
+        return NULL;
+    }
+
+    if (ff_jni_init_jfields(env, &wrapper->jfields, jni_amediaformat_mapping, 1, wrapper) < 0)
+        goto fail;
+
+    /* Hold our own global reference to the caller's object. */
+    wrapper->object = (*env)->NewGlobalRef(env, object);
+    if (!wrapper->object)
+        goto fail;
+
+    JNI_DETACH_ENV(attached, wrapper);
+
+    return wrapper;
+
+fail:
+    ff_jni_reset_jfields(env, &wrapper->jfields, jni_amediaformat_mapping, 1, wrapper);
+
+    JNI_DETACH_ENV(attached, wrapper);
+
+    av_freep(&wrapper);
+
+    return NULL;
+}
+/* Release the global reference and JNI IDs held by 'format', then free the wrapper. Returns 0, or AVERROR_EXTERNAL if no JNIEnv is available. */
+int ff_AMediaFormat_delete(FFAMediaFormat* format)
+{
+    JNIEnv *env = NULL;
+    int attached = 0;
+
+    /* Deleting a NULL format is a successful no-op. */
+    if (format == NULL)
+        return 0;
+
+    /* If this returns early, the wrapper and its global reference are left untouched. */
+    JNI_ATTACH_ENV_OR_RETURN(env, &attached, format, AVERROR_EXTERNAL);
+
+    (*env)->DeleteGlobalRef(env, format->object);
+    format->object = NULL;
+
+    ff_jni_reset_jfields(env, &format->jfields, jni_amediaformat_mapping, 1, format);
+
+    JNI_DETACH_ENV(attached, format);
+
+    /* av_freep() clears only this function's copy of the pointer, not the caller's. */
+    av_freep(&format);
+
+    return 0;
+}
+/* Return MediaFormat.toString() as a C string from ff_jni_jstring_to_utf_chars(), or NULL if no JNIEnv is available or the call fails. */
+char* ff_AMediaFormat_toString(FFAMediaFormat* format)
+{
+    char *ret = NULL;
+
+    int attached = 0;
+    JNIEnv *env = NULL;
+    jstring description = NULL;
+
+    av_assert0(format != NULL);
+
+    JNI_ATTACH_ENV_OR_RETURN(env, &attached, format, NULL);
+
+    description = (*env)->CallObjectMethod(env, format->object, format->jfields.to_string_id);
+    /* Pass the format as log context, matching the other FFAMediaFormat accessors. */
+    if (ff_jni_exception_check(env, 1, format) < 0) {
+        goto fail;
+    }
+
+    ret = ff_jni_jstring_to_utf_chars(env, description, format);
+fail:
+    if (description) {
+        (*env)->DeleteLocalRef(env, description);
+    }
+
+    JNI_DETACH_ENV(attached, format);
+
+    return ret;
+}
+/*
+ * Fetch the value stored under 'name' with MediaFormat.getInteger() into *out.
+ * Returns 1 on success; 0 when no JNIEnv is available, the key string cannot
+ * be created, or ff_jni_exception_check() reports a failure.
+ */
+int ff_AMediaFormat_getInt32(FFAMediaFormat* format, const char *name, int32_t *out)
+{
+    JNIEnv *env = NULL;
+    jstring jkey = NULL;
+    int attached = 0;
+    int ok = 0;
+
+    av_assert0(format != NULL);
+
+    JNI_ATTACH_ENV_OR_RETURN(env, &attached, format, 0);
+
+    jkey = ff_jni_utf_chars_to_jstring(env, name, format);
+    if (!jkey)
+        goto done;
+
+    /* *out is written before the exception check, so it is not meaningful when 0 is returned. */
+    *out = (*env)->CallIntMethod(env, format->object, format->jfields.get_integer_id, jkey);
+    if (ff_jni_exception_check(env, 1, format) < 0)
+        goto done;
+
+    ok = 1;
+
+done:
+    if (jkey)
+        (*env)->DeleteLocalRef(env, jkey);
+
+    JNI_DETACH_ENV(attached, format);
+
+    return ok;
+}
+/*
+ * Fetch the value stored under 'name' with MediaFormat.getLong() into *out.
+ * Returns 1 on success; 0 when no JNIEnv is available, the key string cannot
+ * be created, or ff_jni_exception_check() reports a failure.
+ */
+int ff_AMediaFormat_getInt64(FFAMediaFormat* format, const char *name, int64_t *out)
+{
+    JNIEnv *env = NULL;
+    jstring jkey = NULL;
+    int attached = 0;
+    int ok = 0;
+
+    av_assert0(format != NULL);
+
+    JNI_ATTACH_ENV_OR_RETURN(env, &attached, format, 0);
+
+    jkey = ff_jni_utf_chars_to_jstring(env, name, format);
+    if (!jkey)
+        goto done;
+
+    /* *out is written before the exception check, so it is not meaningful when 0 is returned. */
+    *out = (*env)->CallLongMethod(env, format->object, format->jfields.get_long_id, jkey);
+    if (ff_jni_exception_check(env, 1, format) < 0)
+        goto done;
+
+    ok = 1;
+
+done:
+    if (jkey)
+        (*env)->DeleteLocalRef(env, jkey);
+
+    JNI_DETACH_ENV(attached, format);
+
+    return ok;
+}
+/*
+ * Fetch the value stored under 'name' with MediaFormat.getFloat() into *out.
+ * Returns 1 on success; 0 when no JNIEnv is available, the key string cannot
+ * be created, or ff_jni_exception_check() reports a failure.
+ */
+int ff_AMediaFormat_getFloat(FFAMediaFormat* format, const char *name, float *out)
+{
+    JNIEnv *env = NULL;
+    jstring jkey = NULL;
+    int attached = 0;
+    int ok = 0;
+
+    av_assert0(format != NULL);
+
+    JNI_ATTACH_ENV_OR_RETURN(env, &attached, format, 0);
+
+    jkey = ff_jni_utf_chars_to_jstring(env, name, format);
+    if (!jkey)
+        goto done;
+
+    /* *out is written before the exception check, so it is not meaningful when 0 is returned. */
+    *out = (*env)->CallFloatMethod(env, format->object, format->jfields.get_float_id, jkey);
+    if (ff_jni_exception_check(env, 1, format) < 0)
+        goto done;
+
+    ok = 1;
+
+done:
+    if (jkey)
+        (*env)->DeleteLocalRef(env, jkey);
+
+    JNI_DETACH_ENV(attached, format);
+
+    return ok;
+}
+/* Copy the ByteBuffer stored under 'name' (MediaFormat.getByteBuffer()) into an av_malloc()ed block.
+ * Returns 1 with the copy in *data and its length in *size (NULL and 0 for an empty buffer).
+ * Returns 0, leaving *data and *size untouched, if the key has no buffer or any step fails. */
+int ff_AMediaFormat_getBuffer(FFAMediaFormat* format, const char *name, void** data, size_t *size)
+{
+    int ret = 0;
+    int attached = 0;
+    JNIEnv *env = NULL;
+    jstring key = NULL;
+    jobject result = NULL;
+    void *src = NULL, *copy = NULL;
+    jlong capacity = 0;
+
+    av_assert0(format != NULL);
+
+    JNI_ATTACH_ENV_OR_RETURN(env, &attached, format, 0);
+
+    key = ff_jni_utf_chars_to_jstring(env, name, format);
+    if (!key)
+        goto fail;
+
+    result = (*env)->CallObjectMethod(env, format->object, format->jfields.get_bytebuffer_id, key);
+    if (ff_jni_exception_check(env, 1, format) < 0)
+        goto fail;
+    /* No buffer for this key: never hand a NULL reference to the direct-buffer accessors. */
+    if (!result)
+        goto fail;
+
+    src      = (*env)->GetDirectBufferAddress(env, result);
+    capacity = (*env)->GetDirectBufferCapacity(env, result);
+    /* A negative capacity or a NULL address means the memory cannot be read directly. */
+    if (capacity < 0 || (capacity > 0 && !src))
+        goto fail;
+
+    if (capacity > 0) {
+        copy = av_malloc(capacity);
+        if (!copy)
+            goto fail;
+        memcpy(copy, src, capacity);
+    }
+
+    *data = copy;
+    *size = capacity;
+    ret = 1;
+fail:
+    if (key)
+        (*env)->DeleteLocalRef(env, key);
+    if (result)
+        (*env)->DeleteLocalRef(env, result);
+
+    JNI_DETACH_ENV(attached, format);
+
+    return ret;
+}
+
+/**
+ * Read the string stored under @p name in a MediaFormat.
+ *
+ * @param format wrapper around the Java MediaFormat object; must not be NULL
+ * @param name   key to look up
+ * @param out    receives the string produced by ff_jni_jstring_to_utf_chars()
+ * @return 1 on success, 0 when attaching, key creation, the Java call or the
+ *         string conversion failed
+ */
+int ff_AMediaFormat_getString(FFAMediaFormat* format, const char *name, const char **out)
+{
+    int found = 0;
+    int attached = 0;
+    JNIEnv *env = NULL;
+    jstring jkey = NULL;
+    jstring jvalue = NULL;
+
+    av_assert0(format != NULL);
+
+    JNI_ATTACH_ENV_OR_RETURN(env, &attached, format, 0);
+
+    jkey = ff_jni_utf_chars_to_jstring(env, name, format);
+    if (!jkey) {
+        goto done;
+    }
+
+    jvalue = (*env)->CallObjectMethod(env, format->object, format->jfields.get_string_id, jkey);
+    if (ff_jni_exception_check(env, 1, format) < 0) {
+        goto done;
+    }
+
+    *out = ff_jni_jstring_to_utf_chars(env, jvalue, format);
+    found = *out != NULL;
+
+done:
+    if (jkey) {
+        (*env)->DeleteLocalRef(env, jkey);
+    }
+
+    if (jvalue) {
+        (*env)->DeleteLocalRef(env, jvalue);
+    }
+
+    JNI_DETACH_ENV(attached, format);
+
+    return found;
+}
+
+/**
+ * Store a 32-bit integer under @p name in a MediaFormat.
+ *
+ * Failures are not reported to the caller.
+ *
+ * @param format wrapper around the Java MediaFormat object; must not be NULL
+ * @param name   key to set
+ * @param value  value to store
+ */
+void ff_AMediaFormat_setInt32(FFAMediaFormat* format, const char* name, int32_t value)
+{
+    int attached = 0;
+    JNIEnv *env = NULL;
+    jstring jkey = NULL;
+
+    av_assert0(format != NULL);
+
+    JNI_ATTACH_ENV_OR_RETURN_VOID(env, &attached, format);
+
+    jkey = ff_jni_utf_chars_to_jstring(env, name, format);
+    if (jkey) {
+        (*env)->CallVoidMethod(env, format->object, format->jfields.set_integer_id, jkey, value);
+        /* void API: the outcome cannot be returned, but the check still runs */
+        ff_jni_exception_check(env, 1, format);
+
+        (*env)->DeleteLocalRef(env, jkey);
+    }
+
+    JNI_DETACH_ENV(attached, format);
+}
+
+/**
+ * Store a 64-bit integer under @p name in a MediaFormat.
+ *
+ * Failures are not reported to the caller.
+ *
+ * @param format wrapper around the Java MediaFormat object; must not be NULL
+ * @param name   key to set
+ * @param value  value to store
+ */
+void ff_AMediaFormat_setInt64(FFAMediaFormat* format, const char* name, int64_t value)
+{
+    int attached = 0;
+    JNIEnv *env = NULL;
+    jstring key = NULL;
+
+    av_assert0(format != NULL);
+
+    JNI_ATTACH_ENV_OR_RETURN_VOID(env, &attached, format);
+
+    key = ff_jni_utf_chars_to_jstring(env, name, format);
+    if (!key) {
+        goto fail;
+    }
+
+    /* explicit jlong so the variadic argument has the width Java expects */
+    (*env)->CallVoidMethod(env, format->object, format->jfields.set_long_id, key, (jlong)value);
+    if (ff_jni_exception_check(env, 1, format) < 0) {
+        goto fail;
+    }
+
+fail:
+    if (key) {
+        (*env)->DeleteLocalRef(env, key);
+    }
+
+    /* same log context as the attach call and the other setters */
+    JNI_DETACH_ENV(attached, format);
+}
+
+/**
+ * Store a float under @p name in a MediaFormat.
+ *
+ * Failures are not reported to the caller.
+ *
+ * @param format wrapper around the Java MediaFormat object; must not be NULL
+ * @param name   key to set
+ * @param value  value to store
+ */
+void ff_AMediaFormat_setFloat(FFAMediaFormat* format, const char* name, float value)
+{
+    int attached = 0;
+    JNIEnv *env = NULL;
+    jstring key = NULL;
+
+    av_assert0(format != NULL);
+
+    JNI_ATTACH_ENV_OR_RETURN_VOID(env, &attached, format);
+
+    key = ff_jni_utf_chars_to_jstring(env, name, format);
+    if (!key) {
+        goto fail;
+    }
+
+    (*env)->CallVoidMethod(env, format->object, format->jfields.set_float_id, key, value);
+    if (ff_jni_exception_check(env, 1, format) < 0) {
+        goto fail;
+    }
+
+fail:
+    if (key) {
+        (*env)->DeleteLocalRef(env, key);
+    }
+
+    /* same log context as the attach call and the other setters */
+    JNI_DETACH_ENV(attached, format);
+}
+
+/**
+ * Store a string under @p name in a MediaFormat.
+ *
+ * Failures are not reported to the caller.
+ *
+ * @param format wrapper around the Java MediaFormat object; must not be NULL
+ * @param name   key to set
+ * @param value  string to store
+ */
+void ff_AMediaFormat_setString(FFAMediaFormat* format, const char* name, const char* value)
+{
+    int attached = 0;
+    JNIEnv *env = NULL;
+    jstring jkey = NULL;
+    jstring jvalue = NULL;
+
+    av_assert0(format != NULL);
+
+    JNI_ATTACH_ENV_OR_RETURN_VOID(env, &attached, format);
+
+    jkey = ff_jni_utf_chars_to_jstring(env, name, format);
+    if (jkey) {
+        /* the value is only converted once the key exists */
+        jvalue = ff_jni_utf_chars_to_jstring(env, value, format);
+    }
+
+    if (jkey && jvalue) {
+        (*env)->CallVoidMethod(env, format->object, format->jfields.set_string_id, jkey, jvalue);
+        /* void API: the outcome cannot be returned, but the check still runs */
+        ff_jni_exception_check(env, 1, format);
+    }
+
+    if (jkey) {
+        (*env)->DeleteLocalRef(env, jkey);
+    }
+
+    if (jvalue) {
+        (*env)->DeleteLocalRef(env, jvalue);
+    }
+
+    JNI_DETACH_ENV(attached, format);
+}
+
+/**
+ * Store a copy of a byte buffer under @p name in a MediaFormat.
+ *
+ * The bytes are duplicated with av_malloc() and wrapped in a direct
+ * ByteBuffer. That copy is not released by this function once the ByteBuffer
+ * exists, because the Java object refers to the memory rather than copying it.
+ * Failures are not reported to the caller.
+ *
+ * @param format wrapper around the Java MediaFormat object; must not be NULL
+ * @param name   key to set
+ * @param data   bytes to store; nothing is stored when NULL
+ * @param size   number of bytes in @p data; nothing is stored when 0
+ */
+void ff_AMediaFormat_setBuffer(FFAMediaFormat* format, const char* name, void* data, size_t size)
+{
+    int attached = 0;
+    JNIEnv *env = NULL;
+    jstring key = NULL;
+    jobject buffer = NULL;
+    void *buffer_data = NULL;
+
+    av_assert0(format != NULL);
+
+    JNI_ATTACH_ENV_OR_RETURN_VOID(env, &attached, format);
+
+    key = ff_jni_utf_chars_to_jstring(env, name, format);
+    if (!key) {
+        goto fail;
+    }
+
+    if (!data || !size) {
+        goto fail;
+    }
+
+    buffer_data = av_malloc(size);
+    if (!buffer_data) {
+        goto fail;
+    }
+
+    memcpy(buffer_data, data, size);
+
+    buffer = (*env)->NewDirectByteBuffer(env, buffer_data, size);
+    if (!buffer) {
+        /* no Java object wraps the copy, so it would otherwise be lost */
+        av_freep(&buffer_data);
+        goto fail;
+    }
+
+    (*env)->CallVoidMethod(env, format->object, format->jfields.set_bytebuffer_id, key, buffer);
+    if (ff_jni_exception_check(env, 1, format) < 0) {
+        goto fail;
+    }
+
+fail:
+    if (key) {
+        (*env)->DeleteLocalRef(env, key);
+    }
+
+    if (buffer) {
+        (*env)->DeleteLocalRef(env, buffer);
+    }
+
+    JNI_DETACH_ENV(attached, format);
+}
+
+/**
+ * Cache the MediaCodec static integer constants in @p codec.
+ *
+ * Each constant is read with GetStaticIntField() and followed by an exception
+ * check; the first failing check aborts the sequence. BUFFER_FLAG_KEY_FRAME is
+ * only read when its field id is set in codec->jfields.
+ *
+ * @param codec wrapper whose jfields have already been initialised
+ * @return the result of the last exception check performed (negative on
+ *         failure), or AVERROR_EXTERNAL when JNI_ATTACH_ENV_OR_RETURN bails out
+ */
+static int codec_init_static_fields(FFAMediaCodec *codec)
+{
+    int ret = 0;
+    int attached = 0;
+    JNIEnv *env = NULL;
+
+    JNI_ATTACH_ENV_OR_RETURN(env, &attached, codec, AVERROR_EXTERNAL);
+
+    codec->INFO_TRY_AGAIN_LATER = (*env)->GetStaticIntField(env, codec->jfields.mediacodec_class, codec->jfields.info_try_again_later_id);
+    if ((ret = ff_jni_exception_check(env, 1, codec)) < 0) {
+        goto fail;
+    }
+
+    codec->BUFFER_FLAG_CODEC_CONFIG = (*env)->GetStaticIntField(env, codec->jfields.mediacodec_class, codec->jfields.buffer_flag_codec_config_id);
+    if ((ret = ff_jni_exception_check(env, 1, codec)) < 0) {
+        goto fail;
+    }
+
+    codec->BUFFER_FLAG_END_OF_STREAM = (*env)->GetStaticIntField(env, codec->jfields.mediacodec_class, codec->jfields.buffer_flag_end_of_stream_id);
+    if ((ret = ff_jni_exception_check(env, 1, codec)) < 0) {
+        goto fail;
+    }
+
+    if (codec->jfields.buffer_flag_key_frame_id) {
+        codec->BUFFER_FLAG_KEY_FRAME = (*env)->GetStaticIntField(env, codec->jfields.mediacodec_class, codec->jfields.buffer_flag_key_frame_id);
+        if ((ret = ff_jni_exception_check(env, 1, codec)) < 0) {
+            goto fail;
+        }
+    }
+
+    codec->CONFIGURE_FLAG_ENCODE = (*env)->GetStaticIntField(env, codec->jfields.mediacodec_class, codec->jfields.configure_flag_encode_id);
+    if ((ret = ff_jni_exception_check(env, 1, codec)) < 0) {
+        goto fail;
+    }
+
+    codec->INFO_OUTPUT_BUFFERS_CHANGED = (*env)->GetStaticIntField(env, codec->jfields.mediacodec_class, codec->jfields.info_output_buffers_changed_id);
+    if ((ret = ff_jni_exception_check(env, 1, codec)) < 0) {
+        goto fail;
+    }
+
+    codec->INFO_OUTPUT_FORMAT_CHANGED = (*env)->GetStaticIntField(env, codec->jfields.mediacodec_class, codec->jfields.info_output_format_changed_id);
+    if ((ret = ff_jni_exception_check(env, 1, codec)) < 0) {
+        goto fail;
+    }
+
+fail:
+    JNI_DETACH_ENV(attached, codec);
+
+    return ret;
+}
+
+/**
+ * Create a MediaCodec wrapper for the codec called @p name.
+ *
+ * The Java object is obtained through the static factory method identified by
+ * jfields.create_by_codec_name_id and kept as a global reference in
+ * codec->object. Every local reference created here is released before
+ * returning, and the global reference is dropped again if a later step fails.
+ *
+ * @param name codec name handed to the Java factory method
+ * @return a newly allocated wrapper, or NULL on failure
+ */
+FFAMediaCodec* ff_AMediaCodec_createCodecByName(const char *name)
+{
+    int ret = -1;
+    int attached = 0;
+    JNIEnv *env = NULL;
+    FFAMediaCodec *codec = NULL;
+    jstring codec_name = NULL;
+    jobject object = NULL;
+
+    codec = av_mallocz(sizeof(FFAMediaCodec));
+    if (!codec) {
+        return NULL;
+    }
+    codec->class = &amediacodec_class;
+
+    env = ff_jni_attach_env(&attached, codec);
+    if (!env) {
+        av_freep(&codec);
+        return NULL;
+    }
+
+    if (ff_jni_init_jfields(env, &codec->jfields, jni_amediacodec_mapping, 1, codec) < 0) {
+        goto fail;
+    }
+
+    codec_name = ff_jni_utf_chars_to_jstring(env, name, codec);
+    if (!codec_name) {
+        goto fail;
+    }
+
+    /* keep the local reference apart so codec->object only ever holds a
+     * global reference */
+    object = (*env)->CallStaticObjectMethod(env, codec->jfields.mediacodec_class, codec->jfields.create_by_codec_name_id, codec_name);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        goto fail;
+    }
+
+    codec->object = (*env)->NewGlobalRef(env, object);
+    if (!codec->object) {
+        goto fail;
+    }
+
+    if (codec_init_static_fields(codec) < 0) {
+        goto fail;
+    }
+
+    if (codec->jfields.get_input_buffer_id && codec->jfields.get_output_buffer_id) {
+        codec->has_get_i_o_buffer = 1;
+    }
+
+    ret = 0;
+fail:
+    if (codec_name) {
+        (*env)->DeleteLocalRef(env, codec_name);
+    }
+
+    if (object) {
+        (*env)->DeleteLocalRef(env, object);
+    }
+
+    if (ret < 0) {
+        if (codec->object) {
+            (*env)->DeleteGlobalRef(env, codec->object);
+            codec->object = NULL;
+        }
+
+        ff_jni_reset_jfields(env, &codec->jfields, jni_amediacodec_mapping, 1, codec);
+    }
+
+    JNI_DETACH_ENV(attached, codec);
+
+    if (ret < 0) {
+        av_freep(&codec);
+    }
+
+    return codec;
+}
+
+/**
+ * Create a MediaCodec decoder wrapper for the MIME type @p mime.
+ *
+ * The Java object is obtained through the static factory method identified by
+ * jfields.create_decoder_by_type_id and kept as a global reference in
+ * codec->object. Every local reference created here is released before
+ * returning, and the global reference is dropped again if a later step fails.
+ *
+ * @param mime MIME type handed to the Java factory method
+ * @return a newly allocated wrapper, or NULL on failure
+ */
+FFAMediaCodec* ff_AMediaCodec_createDecoderByType(const char *mime)
+{
+    int ret = -1;
+    int attached = 0;
+    JNIEnv *env = NULL;
+    FFAMediaCodec *codec = NULL;
+    jstring mime_type = NULL;
+    jobject object = NULL;
+
+    codec = av_mallocz(sizeof(FFAMediaCodec));
+    if (!codec) {
+        return NULL;
+    }
+    codec->class = &amediacodec_class;
+
+    env = ff_jni_attach_env(&attached, codec);
+    if (!env) {
+        av_freep(&codec);
+        return NULL;
+    }
+
+    if (ff_jni_init_jfields(env, &codec->jfields, jni_amediacodec_mapping, 1, codec) < 0) {
+        goto fail;
+    }
+
+    mime_type = ff_jni_utf_chars_to_jstring(env, mime, codec);
+    if (!mime_type) {
+        goto fail;
+    }
+
+    /* keep the local reference apart so codec->object only ever holds a
+     * global reference */
+    object = (*env)->CallStaticObjectMethod(env, codec->jfields.mediacodec_class, codec->jfields.create_decoder_by_type_id, mime_type);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        goto fail;
+    }
+
+    codec->object = (*env)->NewGlobalRef(env, object);
+    if (!codec->object) {
+        goto fail;
+    }
+
+    if (codec_init_static_fields(codec) < 0) {
+        goto fail;
+    }
+
+    if (codec->jfields.get_input_buffer_id && codec->jfields.get_output_buffer_id) {
+        codec->has_get_i_o_buffer = 1;
+    }
+
+    ret = 0;
+fail:
+    if (mime_type) {
+        (*env)->DeleteLocalRef(env, mime_type);
+    }
+
+    if (object) {
+        (*env)->DeleteLocalRef(env, object);
+    }
+
+    if (ret < 0) {
+        if (codec->object) {
+            (*env)->DeleteGlobalRef(env, codec->object);
+            codec->object = NULL;
+        }
+
+        ff_jni_reset_jfields(env, &codec->jfields, jni_amediacodec_mapping, 1, codec);
+    }
+
+    JNI_DETACH_ENV(attached, codec);
+
+    if (ret < 0) {
+        av_freep(&codec);
+    }
+
+    return codec;
+}
+
+/**
+ * Create a MediaCodec encoder wrapper for the MIME type @p mime.
+ *
+ * The Java object is obtained through the static factory method identified by
+ * jfields.create_encoder_by_type_id and kept as a global reference in
+ * codec->object. Every local reference created here is released before
+ * returning, and the global reference is dropped again if a later step fails.
+ *
+ * @param mime MIME type handed to the Java factory method
+ * @return a newly allocated wrapper, or NULL on failure
+ */
+FFAMediaCodec* ff_AMediaCodec_createEncoderByType(const char *mime)
+{
+    int ret = -1;
+    int attached = 0;
+    JNIEnv *env = NULL;
+    FFAMediaCodec *codec = NULL;
+    jstring mime_type = NULL;
+    jobject object = NULL;
+
+    codec = av_mallocz(sizeof(FFAMediaCodec));
+    if (!codec) {
+        return NULL;
+    }
+    codec->class = &amediacodec_class;
+
+    env = ff_jni_attach_env(&attached, codec);
+    if (!env) {
+        av_freep(&codec);
+        return NULL;
+    }
+
+    if (ff_jni_init_jfields(env, &codec->jfields, jni_amediacodec_mapping, 1, codec) < 0) {
+        goto fail;
+    }
+
+    mime_type = ff_jni_utf_chars_to_jstring(env, mime, codec);
+    if (!mime_type) {
+        goto fail;
+    }
+
+    /* keep the local reference apart so codec->object only ever holds a
+     * global reference */
+    object = (*env)->CallStaticObjectMethod(env, codec->jfields.mediacodec_class, codec->jfields.create_encoder_by_type_id, mime_type);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        goto fail;
+    }
+
+    codec->object = (*env)->NewGlobalRef(env, object);
+    if (!codec->object) {
+        goto fail;
+    }
+
+    if (codec_init_static_fields(codec) < 0) {
+        goto fail;
+    }
+
+    if (codec->jfields.get_input_buffer_id && codec->jfields.get_output_buffer_id) {
+        codec->has_get_i_o_buffer = 1;
+    }
+
+    ret = 0;
+fail:
+    if (mime_type) {
+        (*env)->DeleteLocalRef(env, mime_type);
+    }
+
+    if (object) {
+        (*env)->DeleteLocalRef(env, object);
+    }
+
+    if (ret < 0) {
+        if (codec->object) {
+            (*env)->DeleteGlobalRef(env, codec->object);
+            codec->object = NULL;
+        }
+
+        ff_jni_reset_jfields(env, &codec->jfields, jni_amediacodec_mapping, 1, codec);
+    }
+
+    JNI_DETACH_ENV(attached, codec);
+
+    if (ret < 0) {
+        av_freep(&codec);
+    }
+
+    return codec;
+}
+
+/**
+ * Release the Java MediaCodec and free the wrapper.
+ *
+ * Besides codec->object, the cached input/output buffer arrays (global
+ * references created by the buffer getters) are released as well.
+ *
+ * @param codec wrapper to destroy; NULL is accepted and does nothing
+ * @return 0 on success or when @p codec is NULL, AVERROR_EXTERNAL when the
+ *         Java release call fails its exception check (the wrapper is still
+ *         freed) or when JNI_ATTACH_ENV_OR_RETURN bails out
+ */
+int ff_AMediaCodec_delete(FFAMediaCodec* codec)
+{
+    int ret = 0;
+
+    int attached = 0;
+    JNIEnv *env = NULL;
+
+    if (!codec) {
+        return 0;
+    }
+
+    JNI_ATTACH_ENV_OR_RETURN(env, &attached, codec, AVERROR_EXTERNAL);
+
+    (*env)->CallVoidMethod(env, codec->object, codec->jfields.release_id);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        ret = AVERROR_EXTERNAL;
+    }
+
+    if (codec->input_buffers) {
+        (*env)->DeleteGlobalRef(env, codec->input_buffers);
+        codec->input_buffers = NULL;
+    }
+
+    if (codec->output_buffers) {
+        (*env)->DeleteGlobalRef(env, codec->output_buffers);
+        codec->output_buffers = NULL;
+    }
+
+    (*env)->DeleteGlobalRef(env, codec->object);
+    codec->object = NULL;
+
+    ff_jni_reset_jfields(env, &codec->jfields, jni_amediacodec_mapping, 1, codec);
+
+    JNI_DETACH_ENV(attached, codec);
+
+    av_freep(&codec);
+
+    return ret;
+}
+
+/**
+ * Query the name of the underlying MediaCodec.
+ *
+ * @param codec wrapper around the Java MediaCodec object
+ * @return the string produced by ff_jni_jstring_to_utf_chars(), or NULL on
+ *         failure
+ * NOTE(review): presumably the caller must free the result, as for the other
+ * string getters of this API - confirm against ff_jni_jstring_to_utf_chars().
+ */
+char *ff_AMediaCodec_getName(FFAMediaCodec *codec)
+{
+    char *ret = NULL;
+    int attached = 0;
+    JNIEnv *env = NULL;
+    jobject name = NULL;
+
+    JNI_ATTACH_ENV_OR_RETURN(env, &attached, codec, NULL);
+
+    name = (*env)->CallObjectMethod(env, codec->object, codec->jfields.get_name_id);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        goto fail;
+    }
+
+    ret = ff_jni_jstring_to_utf_chars(env, name, codec);
+
+fail:
+    /* the Java string is no longer needed once it has been converted */
+    if (name) {
+        (*env)->DeleteLocalRef(env, name);
+    }
+
+    JNI_DETACH_ENV(attached, codec);
+
+    return ret;
+}
+
+/**
+ * Configure the codec with a media format.
+ *
+ * @param codec   wrapper around the Java MediaCodec object
+ * @param format  format whose Java object is passed to the configure call
+ * @param surface must be NULL; surface handling is not implemented
+ * @param crypto  ignored, NULL is always passed to Java
+ * @param flags   configuration flags forwarded to Java
+ * @return 0 on success, AVERROR_EXTERNAL on failure
+ */
+int ff_AMediaCodec_configure(FFAMediaCodec* codec, const FFAMediaFormat* format, void* surface, void *crypto, uint32_t flags)
+{
+    int ret = 0;
+    int attached = 0;
+    JNIEnv *env = NULL;
+
+    /* TODO: implement surface handling */
+    av_assert0(surface == NULL);
+
+    JNI_ATTACH_ENV_OR_RETURN(env, &attached, codec, AVERROR_EXTERNAL);
+
+    /* variadic call: give the flags the JNI type the Java method takes */
+    (*env)->CallVoidMethod(env, codec->object, codec->jfields.configure_id, format->object, NULL, NULL, (jint)flags);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        ret = AVERROR_EXTERNAL;
+        goto fail;
+    }
+
+fail:
+    /* same log context as the attach call */
+    JNI_DETACH_ENV(attached, codec);
+
+    return ret;
+}
+
+/**
+ * Invoke the Java method identified by jfields.start_id on the codec.
+ *
+ * @param codec wrapper around the Java MediaCodec object
+ * @return 0 on success, AVERROR_EXTERNAL on failure
+ */
+int ff_AMediaCodec_start(FFAMediaCodec* codec)
+{
+    int status = 0;
+    int attached = 0;
+    JNIEnv *env = NULL;
+
+    JNI_ATTACH_ENV_OR_RETURN(env, &attached, codec, AVERROR_EXTERNAL);
+
+    (*env)->CallVoidMethod(env, codec->object, codec->jfields.start_id);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        status = AVERROR_EXTERNAL;
+    }
+
+    JNI_DETACH_ENV(attached, codec);
+
+    return status;
+}
+
+/**
+ * Invoke the Java method identified by jfields.stop_id on the codec.
+ *
+ * @param codec wrapper around the Java MediaCodec object
+ * @return 0 on success, AVERROR_EXTERNAL on failure
+ */
+int ff_AMediaCodec_stop(FFAMediaCodec* codec)
+{
+    int status = 0;
+    int attached = 0;
+    JNIEnv *env = NULL;
+
+    JNI_ATTACH_ENV_OR_RETURN(env, &attached, codec, AVERROR_EXTERNAL);
+
+    (*env)->CallVoidMethod(env, codec->object, codec->jfields.stop_id);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        status = AVERROR_EXTERNAL;
+    }
+
+    JNI_DETACH_ENV(attached, codec);
+
+    return status;
+}
+
+/**
+ * Invoke the Java method identified by jfields.flush_id on the codec.
+ *
+ * @param codec wrapper around the Java MediaCodec object
+ * @return 0 on success, AVERROR_EXTERNAL on failure
+ */
+int ff_AMediaCodec_flush(FFAMediaCodec* codec)
+{
+    int status = 0;
+    int attached = 0;
+    JNIEnv *env = NULL;
+
+    JNI_ATTACH_ENV_OR_RETURN(env, &attached, codec, AVERROR_EXTERNAL);
+
+    (*env)->CallVoidMethod(env, codec->object, codec->jfields.flush_id);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        status = AVERROR_EXTERNAL;
+    }
+
+    JNI_DETACH_ENV(attached, codec);
+
+    return status;
+}
+
+/**
+ * Hand an output buffer back to the codec.
+ *
+ * @param codec  wrapper around the Java MediaCodec object
+ * @param idx    index of the output buffer to release
+ * @param render non-zero to pass true as the Java boolean argument
+ * @return 0 on success, AVERROR_EXTERNAL on failure
+ */
+int ff_AMediaCodec_releaseOutputBuffer(FFAMediaCodec* codec, size_t idx, int render)
+{
+    int ret = 0;
+    int attached = 0;
+    JNIEnv *env = NULL;
+
+    JNI_ATTACH_ENV_OR_RETURN(env, &attached, codec, AVERROR_EXTERNAL);
+
+    /* variadic call: arguments must already have the JNI types of the Java
+     * parameters, a size_t may be wider than the jint that is read back */
+    (*env)->CallVoidMethod(env, codec->object, codec->jfields.release_output_buffer_id, (jint)idx, (jboolean)(render ? JNI_TRUE : JNI_FALSE));
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        ret = AVERROR_EXTERNAL;
+        goto fail;
+    }
+
+fail:
+    JNI_DETACH_ENV(attached, codec);
+
+    return ret;
+}
+
+/**
+ * Hand an output buffer back to the codec together with a timestamp.
+ *
+ * @param codec       wrapper around the Java MediaCodec object
+ * @param idx         index of the output buffer to release
+ * @param timestampNs timestamp forwarded to the Java method
+ * @return 0 on success, AVERROR_EXTERNAL on failure
+ */
+int ff_AMediaCodec_releaseOutputBufferAtTime(FFAMediaCodec *codec, size_t idx, int64_t timestampNs)
+{
+    int ret = 0;
+    int attached = 0;
+    JNIEnv *env = NULL;
+
+    JNI_ATTACH_ENV_OR_RETURN(env, &attached, codec, AVERROR_EXTERNAL);
+
+    /* variadic call: arguments must already have the JNI types of the Java
+     * parameters, a size_t may be wider than the jint that is read back */
+    (*env)->CallVoidMethod(env, codec->object, codec->jfields.release_output_buffer_at_time_id, (jint)idx, (jlong)timestampNs);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        ret = AVERROR_EXTERNAL;
+        goto fail;
+    }
+
+fail:
+    JNI_DETACH_ENV(attached, codec);
+
+    return ret;
+}
+
+/**
+ * Ask the codec for an input buffer.
+ *
+ * @param codec     wrapper around the Java MediaCodec object
+ * @param timeoutUs timeout forwarded to the Java method
+ * @return the integer returned by the Java method, or AVERROR_EXTERNAL on
+ *         failure
+ */
+ssize_t ff_AMediaCodec_dequeueInputBuffer(FFAMediaCodec* codec, int64_t timeoutUs)
+{
+    int buffer_index = 0;
+    int attached = 0;
+    JNIEnv *env = NULL;
+
+    JNI_ATTACH_ENV_OR_RETURN(env, &attached, codec, AVERROR_EXTERNAL);
+
+    buffer_index = (*env)->CallIntMethod(env, codec->object, codec->jfields.dequeue_input_buffer_id, timeoutUs);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        /* whatever the call returned is not trustworthy any more */
+        buffer_index = AVERROR_EXTERNAL;
+    }
+
+    JNI_DETACH_ENV(attached, codec);
+
+    return buffer_index;
+}
+
+/**
+ * Submit a filled input buffer to the codec.
+ *
+ * @param codec  wrapper around the Java MediaCodec object
+ * @param idx    index of the input buffer
+ * @param offset byte offset of the data inside the buffer
+ * @param size   number of valid bytes
+ * @param time   presentation timestamp forwarded to Java
+ * @param flags  buffer flags forwarded to Java
+ * @return the non-negative result of the exception check on success,
+ *         AVERROR_EXTERNAL on failure
+ */
+int ff_AMediaCodec_queueInputBuffer(FFAMediaCodec* codec, size_t idx, off_t offset, size_t size, uint64_t time, uint32_t flags)
+{
+    int ret = 0;
+    int attached = 0;
+    JNIEnv *env = NULL;
+
+    JNI_ATTACH_ENV_OR_RETURN(env, &attached, codec, AVERROR_EXTERNAL);
+
+    /* variadic call: size_t and off_t may be wider than the jint values the
+     * Java method takes, so convert every argument to its JNI type first */
+    (*env)->CallVoidMethod(env, codec->object, codec->jfields.queue_input_buffer_id, (jint)idx, (jint)offset, (jint)size, (jlong)time, (jint)flags);
+    if ((ret = ff_jni_exception_check(env, 1, codec)) < 0) {
+        ret = AVERROR_EXTERNAL;
+        goto fail;
+    }
+
+fail:
+    JNI_DETACH_ENV(attached, codec);
+
+    return ret;
+}
+
+/**
+ * Ask the codec for an output buffer and fetch its metadata.
+ *
+ * A temporary Java info object is created, passed to the dequeue call and
+ * then read back field by field into @p info.
+ *
+ * @param codec     wrapper around the Java MediaCodec object
+ * @param info      receives flags, offset, presentation time and size
+ * @param timeoutUs timeout forwarded to the Java method
+ * @return the integer returned by the Java dequeue method, or
+ *         AVERROR_EXTERNAL if any step fails
+ */
+ssize_t ff_AMediaCodec_dequeueOutputBuffer(FFAMediaCodec* codec, FFAMediaCodecBufferInfo *info, int64_t timeoutUs)
+{
+    int ret = 0;
+    int attached = 0;
+    JNIEnv *env = NULL;
+
+    jobject mediainfo = NULL;
+
+    JNI_ATTACH_ENV_OR_RETURN(env, &attached, codec, AVERROR_EXTERNAL);
+
+    mediainfo = (*env)->NewObject(env, codec->jfields.mediainfo_class, codec->jfields.init_id);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        ret = AVERROR_EXTERNAL;
+        goto fail;
+    }
+
+    ret = (*env)->CallIntMethod(env, codec->object, codec->jfields.dequeue_output_buffer_id, mediainfo, (jlong)timeoutUs);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        ret = AVERROR_EXTERNAL;
+        goto fail;
+    }
+
+    info->flags = (*env)->GetIntField(env, mediainfo, codec->jfields.flags_id);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        ret = AVERROR_EXTERNAL;
+        goto fail;
+    }
+
+    info->offset = (*env)->GetIntField(env, mediainfo, codec->jfields.offset_id);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        ret = AVERROR_EXTERNAL;
+        goto fail;
+    }
+
+    info->presentationTimeUs = (*env)->GetLongField(env, mediainfo, codec->jfields.presentation_time_us_id);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        ret = AVERROR_EXTERNAL;
+        goto fail;
+    }
+
+    info->size = (*env)->GetIntField(env, mediainfo, codec->jfields.size_id);
+    if (ff_jni_exception_check(env, 1, codec) < 0) {
+        ret = AVERROR_EXTERNAL;
+        goto fail;
+    }
+fail:
+    if (mediainfo) {
+        (*env)->DeleteLocalRef(env, mediainfo);
+    }
+
+    /* same log context as the attach call */
+    JNI_DETACH_ENV(attached, codec);
+
+    return ret;
+}
+
+/**
+ * Get the address and capacity of an input buffer.
+ *
+ * When codec->has_get_i_o_buffer is set the buffer is requested directly by
+ * index. Otherwise the whole buffer array is fetched once, cached in
+ * codec->input_buffers as a global reference, and indexed.
+ *
+ * @param codec    wrapper around the Java MediaCodec object
+ * @param idx      index of the input buffer
+ * @param out_size receives the direct buffer capacity; only written when a
+ *                 buffer object was obtained
+ * @return the direct buffer address, or NULL on failure
+ */
+uint8_t* ff_AMediaCodec_getInputBuffer(FFAMediaCodec* codec, size_t idx, size_t *out_size)
+{
+    uint8_t *ret = NULL;
+    int attached = 0;
+    JNIEnv *env = NULL;
+
+    jobject buffer = NULL;
+    jobject input_buffers = NULL;
+
+    JNI_ATTACH_ENV_OR_RETURN(env, &attached, codec, NULL);
+
+    if (codec->has_get_i_o_buffer) {
+        /* variadic call: pass the index with the JNI type Java expects */
+        buffer = (*env)->CallObjectMethod(env, codec->object, codec->jfields.get_input_buffer_id, (jint)idx);
+        if (ff_jni_exception_check(env, 1, codec) < 0) {
+            goto fail;
+        }
+    } else {
+        if (!codec->input_buffers) {
+            /* keep the local reference so it can be released below */
+            input_buffers = (*env)->CallObjectMethod(env, codec->object, codec->jfields.get_input_buffers_id);
+            if (ff_jni_exception_check(env, 1, codec) < 0) {
+                goto fail;
+            }
+
+            codec->input_buffers = (*env)->NewGlobalRef(env, input_buffers);
+            if (ff_jni_exception_check(env, 1, codec) < 0) {
+                goto fail;
+            }
+
+            if (!codec->input_buffers) {
+                goto fail;
+            }
+        }
+
+        buffer = (*env)->GetObjectArrayElement(env, codec->input_buffers, idx);
+        if (ff_jni_exception_check(env, 1, codec) < 0) {
+            goto fail;
+        }
+    }
+
+    /* a null reference must not reach the direct-buffer accessors */
+    if (!buffer) {
+        goto fail;
+    }
+
+    ret = (*env)->GetDirectBufferAddress(env, buffer);
+    *out_size = (*env)->GetDirectBufferCapacity(env, buffer);
+fail:
+    if (buffer) {
+        (*env)->DeleteLocalRef(env, buffer);
+    }
+
+    if (input_buffers) {
+        (*env)->DeleteLocalRef(env, input_buffers);
+    }
+
+    JNI_DETACH_ENV(attached, codec);
+
+    return ret;
+}
+
+/**
+ * Get the address and capacity of an output buffer.
+ *
+ * When codec->has_get_i_o_buffer is set the buffer is requested directly by
+ * index. Otherwise the whole buffer array is fetched once, cached in
+ * codec->output_buffers as a global reference, and indexed.
+ *
+ * @param codec    wrapper around the Java MediaCodec object
+ * @param idx      index of the output buffer
+ * @param out_size receives the direct buffer capacity; only written when a
+ *                 buffer object was obtained
+ * @return the direct buffer address, or NULL on failure
+ */
+uint8_t* ff_AMediaCodec_getOutputBuffer(FFAMediaCodec* codec, size_t idx, size_t *out_size)
+{
+    uint8_t *ret = NULL;
+    int attached = 0;
+    JNIEnv *env = NULL;
+
+    jobject buffer = NULL;
+    jobject output_buffers = NULL;
+
+    JNI_ATTACH_ENV_OR_RETURN(env, &attached, codec, NULL);
+
+    if (codec->has_get_i_o_buffer) {
+        /* variadic call: pass the index with the JNI type Java expects */
+        buffer = (*env)->CallObjectMethod(env, codec->object, codec->jfields.get_output_buffer_id, (jint)idx);
+        if (ff_jni_exception_check(env, 1, codec) < 0) {
+            goto fail;
+        }
+    } else {
+        if (!codec->output_buffers) {
+            /* keep the local reference so it can be released below */
+            output_buffers = (*env)->CallObjectMethod(env, codec->object, codec->jfields.get_output_buffers_id);
+            if (ff_jni_exception_check(env, 1, codec) < 0) {
+                goto fail;
+            }
+
+            codec->output_buffers = (*env)->NewGlobalRef(env, output_buffers);
+            if (ff_jni_exception_check(env, 1, codec) < 0) {
+                goto fail;
+            }
+
+            if (!codec->output_buffers) {
+                goto fail;
+            }
+        }
+
+        buffer = (*env)->GetObjectArrayElement(env, codec->output_buffers, idx);
+        if (ff_jni_exception_check(env, 1, codec) < 0) {
+            goto fail;
+        }
+    }
+
+    /* a null reference must not reach the direct-buffer accessors */
+    if (!buffer) {
+        goto fail;
+    }
+
+    ret = (*env)->GetDirectBufferAddress(env, buffer);
+    *out_size = (*env)->GetDirectBufferCapacity(env, buffer);
+fail:
+    if (buffer) {
+        (*env)->DeleteLocalRef(env, buffer);
+    }
+
+    if (output_buffers) {
+        (*env)->DeleteLocalRef(env, output_buffers);
+    }
+
+    JNI_DETACH_ENV(attached, codec);
+
+    return ret;
+}
+
+/**
+ * Query the codec's output format and wrap it.
+ *
+ * @param codec wrapper around the Java MediaCodec object
+ * @return the wrapper built by ff_AMediaFormat_newFromObject(), or NULL when
+ *         the Java call fails its exception check
+ */
+FFAMediaFormat* ff_AMediaCodec_getOutputFormat(FFAMediaCodec* codec)
+{
+    FFAMediaFormat *wrapped = NULL;
+    int attached = 0;
+    JNIEnv *env = NULL;
+    jobject jformat = NULL;
+
+    JNI_ATTACH_ENV_OR_RETURN(env, &attached, codec, NULL);
+
+    jformat = (*env)->CallObjectMethod(env, codec->object, codec->jfields.get_output_format_id);
+    if (ff_jni_exception_check(env, 1, codec) >= 0) {
+        wrapped = ff_AMediaFormat_newFromObject(jformat);
+    }
+
+    /* the local reference is dropped whether or not wrapping happened */
+    if (jformat) {
+        (*env)->DeleteLocalRef(env, jformat);
+    }
+
+    JNI_DETACH_ENV(attached, codec);
+
+    return wrapped;
+}
+
+/**
+ * Tell whether @p idx equals the INFO_TRY_AGAIN_LATER value cached in
+ * @p codec.
+ *
+ * @return 1 when the values match, 0 otherwise
+ */
+int ff_AMediaCodec_infoTryAgainLater(FFAMediaCodec *codec, ssize_t idx)
+{
+    if (idx == codec->INFO_TRY_AGAIN_LATER) {
+        return 1;
+    }
+
+    return 0;
+}
+
+/**
+ * Tell whether @p idx equals the INFO_OUTPUT_BUFFERS_CHANGED value cached in
+ * @p codec.
+ *
+ * @return 1 when the values match, 0 otherwise
+ */
+int ff_AMediaCodec_infoOutputBuffersChanged(FFAMediaCodec *codec, ssize_t idx)
+{
+    if (idx == codec->INFO_OUTPUT_BUFFERS_CHANGED) {
+        return 1;
+    }
+
+    return 0;
+}
+
+/**
+ * Tell whether @p idx equals the INFO_OUTPUT_FORMAT_CHANGED value cached in
+ * @p codec.
+ *
+ * @return 1 when the values match, 0 otherwise
+ */
+int ff_AMediaCodec_infoOutputFormatChanged(FFAMediaCodec *codec, ssize_t idx)
+{
+    if (idx == codec->INFO_OUTPUT_FORMAT_CHANGED) {
+        return 1;
+    }
+
+    return 0;
+}
+
+/**
+ * Return the BUFFER_FLAG_CODEC_CONFIG value cached in @p codec.
+ */
+int ff_AMediaCodec_getBufferFlagCodecConfig(FFAMediaCodec *codec)
+{
+    const int flag = codec->BUFFER_FLAG_CODEC_CONFIG;
+
+    return flag;
+}
+
+/**
+ * Return the BUFFER_FLAG_END_OF_STREAM value cached in @p codec.
+ */
+int ff_AMediaCodec_getBufferFlagEndOfStream(FFAMediaCodec *codec)
+{
+    const int flag = codec->BUFFER_FLAG_END_OF_STREAM;
+
+    return flag;
+}
+
+/**
+ * Return the BUFFER_FLAG_KEY_FRAME value cached in @p codec.
+ */
+int ff_AMediaCodec_getBufferFlagKeyFrame(FFAMediaCodec *codec)
+{
+    const int flag = codec->BUFFER_FLAG_KEY_FRAME;
+
+    return flag;
+}
+
+/**
+ * Return the CONFIGURE_FLAG_ENCODE value cached in @p codec.
+ */
+int ff_AMediaCodec_getConfigureFlagEncode(FFAMediaCodec *codec)
+{
+    const int flag = codec->CONFIGURE_FLAG_ENCODE;
+
+    return flag;
+}
+
+/**
+ * Drop the cached output buffer array, if there is one.
+ *
+ * Nothing is done when codec->has_get_i_o_buffer is set or when no array is
+ * cached in codec->output_buffers.
+ *
+ * @param codec wrapper around the Java MediaCodec object
+ * @return 0 on success or when there was nothing to do, AVERROR_EXTERNAL when
+ *         the JNI environment could not be attached
+ */
+int ff_AMediaCodec_cleanOutputBuffers(FFAMediaCodec *codec)
+{
+    int attached = 0;
+    JNIEnv *env = NULL;
+
+    if (codec->has_get_i_o_buffer || !codec->output_buffers) {
+        return 0;
+    }
+
+    env = ff_jni_attach_env(&attached, codec);
+    if (!env) {
+        return AVERROR_EXTERNAL;
+    }
+
+    (*env)->DeleteGlobalRef(env, codec->output_buffers);
+    codec->output_buffers = NULL;
+
+    JNI_DETACH_ENV(attached, codec);
+
+    return 0;
+}
diff --git a/libavcodec/mediacodec_wrapper.h b/libavcodec/mediacodec_wrapper.h
new file mode 100644
index 0000000..cddd420
--- /dev/null
+++ b/libavcodec/mediacodec_wrapper.h
@@ -0,0 +1,127 @@
+/*
+ * Android MediaCodec Wrapper
+ *
+ * Copyright (c) 2015-2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MEDIACODEC_WRAPPER_H
+#define AVCODEC_MEDIACODEC_WRAPPER_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+/**
+ * The following API around MediaCodec and MediaFormat is based on the
+ * NDK one provided by Google since Android 5.0.
+ *
+ * Differences from the NDK API:
+ *
+ * Buffers returned by ff_AMediaFormat_toString and ff_AMediaFormat_getString
+ * are newly allocated and must be freed by the user after use.
+ *
+ * The MediaCrypto API is not implemented.
+ *
+ * ff_AMediaCodec_infoTryAgainLater, ff_AMediaCodec_infoOutputBuffersChanged,
+ * ff_AMediaCodec_infoOutputFormatChanged, ff_AMediaCodec_cleanOutputBuffers,
+ * ff_AMediaCodec_getName and ff_AMediaCodec_getBufferFlagEndOfStream are not
+ * part of the original NDK API and are convenience functions to hide the JNI
+ * implementation.
+ *
+ * The API around MediaCodecList is not part of the NDK (and is lacking as
+ * we still need to retrieve the codec name to work around faulty decoders
+ * and encoders).
+ *
+ * For documentation, please refer to NdkMediaCodec.h, NdkMediaFormat.h and
+ * http://developer.android.com/reference/android/media/MediaCodec.html.
+ *
+ */
+
+int ff_AMediaCodecProfile_getProfileFromAVCodecContext(AVCodecContext *avctx);
+
+char *ff_AMediaCodecList_getCodecNameByType(const char *mime, int profile, int encoder, void *log_ctx);
+
+struct FFAMediaFormat;
+typedef struct FFAMediaFormat FFAMediaFormat;
+
+FFAMediaFormat *ff_AMediaFormat_new(void);
+int ff_AMediaFormat_delete(FFAMediaFormat* format);
+
+char* ff_AMediaFormat_toString(FFAMediaFormat* format);
+
+int ff_AMediaFormat_getInt32(FFAMediaFormat* format, const char *name, int32_t *out);
+int ff_AMediaFormat_getInt64(FFAMediaFormat* format, const char *name, int64_t *out);
+int ff_AMediaFormat_getFloat(FFAMediaFormat* format, const char *name, float *out);
+int ff_AMediaFormat_getBuffer(FFAMediaFormat* format, const char *name, void** data, size_t *size);
+int ff_AMediaFormat_getString(FFAMediaFormat* format, const char *name, const char **out);
+
+void ff_AMediaFormat_setInt32(FFAMediaFormat* format, const char* name, int32_t value);
+void ff_AMediaFormat_setInt64(FFAMediaFormat* format, const char* name, int64_t value);
+void ff_AMediaFormat_setFloat(FFAMediaFormat* format, const char* name, float value);
+void ff_AMediaFormat_setString(FFAMediaFormat* format, const char* name, const char* value);
+void ff_AMediaFormat_setBuffer(FFAMediaFormat* format, const char* name, void* data, size_t size);
+
+struct FFAMediaCodec;
+typedef struct FFAMediaCodec FFAMediaCodec;
+typedef struct FFAMediaCodecCryptoInfo FFAMediaCodecCryptoInfo;
+
+struct FFAMediaCodecBufferInfo { /* filled in by ff_AMediaCodec_dequeueOutputBuffer() */
+    int32_t offset;             /* int field read via jfields.offset_id */
+    int32_t size;               /* int field read via jfields.size_id */
+    int64_t presentationTimeUs; /* long field read via jfields.presentation_time_us_id */
+    uint32_t flags;             /* int field read via jfields.flags_id */
+};
+typedef struct FFAMediaCodecBufferInfo FFAMediaCodecBufferInfo;
+
+char *ff_AMediaCodec_getName(FFAMediaCodec *codec);
+
+FFAMediaCodec* ff_AMediaCodec_createCodecByName(const char *name);
+FFAMediaCodec* ff_AMediaCodec_createDecoderByType(const char *mime_type);
+FFAMediaCodec* ff_AMediaCodec_createEncoderByType(const char *mime_type);
+
+int ff_AMediaCodec_configure(FFAMediaCodec* codec, const FFAMediaFormat* format, void* surface, void *crypto, uint32_t flags);
+int ff_AMediaCodec_start(FFAMediaCodec* codec);
+int ff_AMediaCodec_stop(FFAMediaCodec* codec);
+int ff_AMediaCodec_flush(FFAMediaCodec* codec);
+int ff_AMediaCodec_delete(FFAMediaCodec* codec);
+
+uint8_t* ff_AMediaCodec_getInputBuffer(FFAMediaCodec* codec, size_t idx, size_t *out_size);
+uint8_t* ff_AMediaCodec_getOutputBuffer(FFAMediaCodec* codec, size_t idx, size_t *out_size);
+
+ssize_t ff_AMediaCodec_dequeueInputBuffer(FFAMediaCodec* codec, int64_t timeoutUs);
+int ff_AMediaCodec_queueInputBuffer(FFAMediaCodec* codec, size_t idx, off_t offset, size_t size, uint64_t time, uint32_t flags);
+
+ssize_t ff_AMediaCodec_dequeueOutputBuffer(FFAMediaCodec* codec, FFAMediaCodecBufferInfo *info, int64_t timeoutUs);
+FFAMediaFormat* ff_AMediaCodec_getOutputFormat(FFAMediaCodec* codec);
+
+int ff_AMediaCodec_releaseOutputBuffer(FFAMediaCodec* codec, size_t idx, int render);
+int ff_AMediaCodec_releaseOutputBufferAtTime(FFAMediaCodec *codec, size_t idx, int64_t timestampNs);
+
+int ff_AMediaCodec_infoTryAgainLater(FFAMediaCodec *codec, ssize_t idx);
+int ff_AMediaCodec_infoOutputBuffersChanged(FFAMediaCodec *codec, ssize_t idx);
+int ff_AMediaCodec_infoOutputFormatChanged(FFAMediaCodec *codec, ssize_t indx);
+
+int ff_AMediaCodec_getBufferFlagCodecConfig (FFAMediaCodec *codec);
+int ff_AMediaCodec_getBufferFlagEndOfStream(FFAMediaCodec *codec);
+int ff_AMediaCodec_getBufferFlagKeyFrame(FFAMediaCodec *codec);
+
+int ff_AMediaCodec_getConfigureFlagEncode(FFAMediaCodec *codec);
+
+int ff_AMediaCodec_cleanOutputBuffers(FFAMediaCodec *codec);
+
+#endif /* AVCODEC_MEDIACODEC_WRAPPER_H */
diff --git a/libavcodec/mediacodecdec.c b/libavcodec/mediacodecdec.c
new file mode 100644
index 0000000..68df885
--- /dev/null
+++ b/libavcodec/mediacodecdec.c
@@ -0,0 +1,574 @@
+/*
+ * Android MediaCodec decoder
+ *
+ * Copyright (c) 2015-2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <string.h>
+#include <sys/types.h>
+
+#include "libavutil/common.h"
+#include "libavutil/mem.h"
+#include "libavutil/log.h"
+#include "libavutil/pixfmt.h"
+#include "libavutil/time.h"
+#include "libavutil/timestamp.h"
+
+#include "avcodec.h"
+#include "internal.h"
+
+#include "mediacodec_sw_buffer.h"
+#include "mediacodec_wrapper.h"
+#include "mediacodecdec.h"
+
+/**
+ * OMX.k3.video.decoder.avc, OMX.NVIDIA.*, OMX.SEC.avc.dec and OMX.google
+ * codec workarounds used in various places are taken from the Gstreamer
+ * project.
+ *
+ * Gstreamer references:
+ * https://cgit.freedesktop.org/gstreamer/gst-plugins-bad/tree/sys/androidmedia/
+ *
+ * Gstreamer copyright notice:
+ *
+ * Copyright (C) 2012, Collabora Ltd.
+ *   Author: Sebastian Dröge <sebastian.droege@collabora.co.uk>
+ *
+ * Copyright (C) 2012, Rafaël Carré <funman@videolanorg>
+ *
+ * Copyright (C) 2015, Sebastian Dröge <sebastian@centricular.com>
+ *
+ * Copyright (C) 2014-2015, Collabora Ltd.
+ *   Author: Matthieu Bouron <matthieu.bouron@gcollabora.com>
+ *
+ * Copyright (C) 2015, Edward Hervey
+ *   Author: Edward Hervey <bilboed@gmail.com>
+ *
+ * Copyright (C) 2015, Matthew Waters <matthew@centricular.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#define INPUT_DEQUEUE_TIMEOUT_US 8000
+#define OUTPUT_DEQUEUE_TIMEOUT_US 8000
+#define OUTPUT_DEQUEUE_BLOCK_TIMEOUT_US 1000000
+
+enum {
+    COLOR_FormatYUV420Planar                              = 0x13,
+    COLOR_FormatYUV420SemiPlanar                          = 0x15,
+    COLOR_FormatYCbYCr                                    = 0x19,
+    COLOR_FormatAndroidOpaque                             = 0x7F000789,
+    COLOR_QCOM_FormatYUV420SemiPlanar                     = 0x7fa30c00,
+    COLOR_QCOM_FormatYUV420SemiPlanar32m                  = 0x7fa30c04,
+    COLOR_QCOM_FormatYUV420PackedSemiPlanar64x32Tile2m8ka = 0x7fa30c03,
+    COLOR_TI_FormatYUV420PackedSemiPlanar                 = 0x7f000100,
+    COLOR_TI_FormatYUV420PackedSemiPlanarInterlaced       = 0x7f000001,
+};
+
+static const struct {
+
+    int color_format;
+    enum AVPixelFormat pix_fmt;
+
+} color_formats[] = {
+
+    { COLOR_FormatYUV420Planar,                              AV_PIX_FMT_YUV420P },
+    { COLOR_FormatYUV420SemiPlanar,                          AV_PIX_FMT_NV12    },
+    { COLOR_QCOM_FormatYUV420SemiPlanar,                     AV_PIX_FMT_NV12    },
+    { COLOR_QCOM_FormatYUV420SemiPlanar32m,                  AV_PIX_FMT_NV12    },
+    { COLOR_QCOM_FormatYUV420PackedSemiPlanar64x32Tile2m8ka, AV_PIX_FMT_NV12    },
+    { COLOR_TI_FormatYUV420PackedSemiPlanar,                 AV_PIX_FMT_NV12    },
+    { COLOR_TI_FormatYUV420PackedSemiPlanarInterlaced,       AV_PIX_FMT_NV12    },
+    { 0 }
+};
+
+/* Map a MediaCodec color format to an AVPixelFormat (AV_PIX_FMT_NONE if unknown). */
+static enum AVPixelFormat mcdec_map_color_format(AVCodecContext *avctx,
+                                                 MediaCodecDecContext *s,
+                                                 int color_format)
+{
+    int i;
+
+    /* This decoder reports YCbYCr; it is handled as TI packed semi-planar. */
+    if (!strcmp(s->codec_name, "OMX.k3.video.decoder.avc") && color_format == COLOR_FormatYCbYCr) {
+        s->color_format = color_format = COLOR_TI_FormatYUV420PackedSemiPlanar;
+    }
+
+    /* Skip the { 0 } terminator: format 0 would match its zeroed pix_fmt. */
+    for (i = 0; i < FF_ARRAY_ELEMS(color_formats) - 1; i++) {
+        if (color_formats[i].color_format == color_format)
+            return color_formats[i].pix_fmt;
+    }
+
+    av_log(avctx, AV_LOG_ERROR, "Output color format 0x%x (value=%d) is not supported\n",
+        color_format, color_format);
+    return AV_PIX_FMT_NONE;
+}
+
+/* Copy MediaCodec output buffer `index` into a newly allocated frame, then release it. */
+static int mediacodec_wrap_buffer(AVCodecContext *avctx,
+                                  MediaCodecDecContext *s,
+                                  uint8_t *data,
+                                  size_t size,
+                                  ssize_t index,
+                                  FFAMediaCodecBufferInfo *info,
+                                  AVFrame *frame)
+{
+    int ret = 0;
+    int status = 0;
+
+    frame->width = avctx->width;
+    frame->height = avctx->height;
+    frame->format = avctx->pix_fmt;
+
+    /* MediaCodec buffers need to be copied to our own refcounted buffers
+     * because the flush command invalidates all input and output buffers. */
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Could not allocate buffer\n");
+        goto done;
+    }
+
+    /* Override frame->pkt_pts as ff_get_buffer will override its value based
+     * on the last avpacket received which is not in sync with the frame:
+     *   * N avpackets can be pushed before 1 frame is actually returned
+     *   * 0-sized avpackets are pushed to flush remaining frames at EOS */
+    frame->pkt_pts = info->presentationTimeUs;
+    frame->pkt_dts = AV_NOPTS_VALUE;
+
+    av_log(avctx, AV_LOG_DEBUG,
+            "Frame: width=%d stride=%d height=%d slice-height=%d "
+            "crop-top=%d crop-bottom=%d crop-left=%d crop-right=%d encoder=%s\n"
+            "destination linesizes=%d,%d,%d\n" ,
+            avctx->width, s->stride, avctx->height, s->slice_height,
+            s->crop_top, s->crop_bottom, s->crop_left, s->crop_right, s->codec_name,
+            frame->linesize[0], frame->linesize[1], frame->linesize[2]);
+
+    switch (s->color_format) {
+    case COLOR_FormatYUV420Planar:
+        ff_mediacodec_sw_buffer_copy_yuv420_planar(avctx, s, data, size, info, frame);
+        break;
+    case COLOR_FormatYUV420SemiPlanar:
+    case COLOR_QCOM_FormatYUV420SemiPlanar:
+    case COLOR_QCOM_FormatYUV420SemiPlanar32m:
+        ff_mediacodec_sw_buffer_copy_yuv420_semi_planar(avctx, s, data, size, info, frame);
+        break;
+    case COLOR_TI_FormatYUV420PackedSemiPlanar:
+    case COLOR_TI_FormatYUV420PackedSemiPlanarInterlaced:
+        ff_mediacodec_sw_buffer_copy_yuv420_packed_semi_planar(avctx, s, data, size, info, frame);
+        break;
+    case COLOR_QCOM_FormatYUV420PackedSemiPlanar64x32Tile2m8ka:
+        ff_mediacodec_sw_buffer_copy_yuv420_packed_semi_planar_64x32Tile2m8ka(avctx, s, data, size, info, frame);
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unsupported color format 0x%x (value=%d)\n",
+            s->color_format, s->color_format);
+        ret = AVERROR(EINVAL);
+        goto done;
+    }
+    /* Success and error paths both end with the buffer release below. */
+    ret = 0;
+done:
+    status = ff_AMediaCodec_releaseOutputBuffer(s->codec, index, 0);
+    if (status < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to release output buffer\n");
+        ret = AVERROR_EXTERNAL;
+    }
+
+    return ret;
+}
+
+/* Read geometry, color format and crop from s->format and apply them to avctx. */
+static int mediacodec_dec_parse_format(AVCodecContext *avctx, MediaCodecDecContext *s)
+{
+    int width, height;
+    int32_t value = 0;
+    char *format = NULL;
+
+    if (!s->format) {
+        av_log(avctx, AV_LOG_ERROR, "Output MediaFormat is not set\n");
+        return AVERROR(EINVAL);
+    }
+
+    format = ff_AMediaFormat_toString(s->format);
+    if (!format) {
+        return AVERROR_EXTERNAL;
+    }
+    av_log(avctx, AV_LOG_DEBUG, "Parsing MediaFormat %s\n", format);
+    av_freep(&format);
+
+    /* Mandatory fields */
+    if (!ff_AMediaFormat_getInt32(s->format, "width", &value)) {
+        format = ff_AMediaFormat_toString(s->format);
+        av_log(avctx, AV_LOG_ERROR, "Could not get %s from format %s\n", "width", format);
+        av_freep(&format);
+        return AVERROR_EXTERNAL;
+    }
+    s->width = value;
+
+    if (!ff_AMediaFormat_getInt32(s->format, "height", &value)) {
+        format = ff_AMediaFormat_toString(s->format);
+        av_log(avctx, AV_LOG_ERROR, "Could not get %s from format %s\n", "height", format);
+        av_freep(&format);
+        return AVERROR_EXTERNAL;
+    }
+    s->height = value;
+
+    if (!ff_AMediaFormat_getInt32(s->format, "stride", &value)) {
+        format = ff_AMediaFormat_toString(s->format);
+        av_log(avctx, AV_LOG_ERROR, "Could not get %s from format %s\n", "stride", format);
+        av_freep(&format);
+        return AVERROR_EXTERNAL;
+    }
+    s->stride = value > 0 ? value : s->width;
+
+    if (!ff_AMediaFormat_getInt32(s->format, "slice-height", &value)) {
+        format = ff_AMediaFormat_toString(s->format);
+        av_log(avctx, AV_LOG_ERROR, "Could not get %s from format %s\n", "slice-height", format);
+        av_freep(&format);
+        return AVERROR_EXTERNAL;
+    }
+    s->slice_height = value > 0 ? value : s->height;
+
+    if (strstr(s->codec_name, "OMX.Nvidia.")) {
+        s->slice_height = FFALIGN(s->height, 16);
+    } else if (strstr(s->codec_name, "OMX.SEC.avc.dec")) {
+        s->slice_height = avctx->height;
+        s->stride = avctx->width;
+    }
+
+    if (!ff_AMediaFormat_getInt32(s->format, "color-format", &value)) {
+        format = ff_AMediaFormat_toString(s->format);
+        av_log(avctx, AV_LOG_ERROR, "Could not get %s from format %s\n", "color-format", format);
+        av_freep(&format);
+        return AVERROR_EXTERNAL;
+    }
+    s->color_format = value;
+
+    s->pix_fmt = avctx->pix_fmt = mcdec_map_color_format(avctx, s, value);
+    if (avctx->pix_fmt == AV_PIX_FMT_NONE) {
+        av_log(avctx, AV_LOG_ERROR, "Output color format is not supported\n");
+        return AVERROR(EINVAL);
+    }
+
+    /* Optional fields */
+    if (ff_AMediaFormat_getInt32(s->format, "crop-top", &value))
+        s->crop_top = value;
+    if (ff_AMediaFormat_getInt32(s->format, "crop-bottom", &value))
+        s->crop_bottom = value;
+    if (ff_AMediaFormat_getInt32(s->format, "crop-left", &value))
+        s->crop_left = value;
+    if (ff_AMediaFormat_getInt32(s->format, "crop-right", &value))
+        s->crop_right = value;
+
+    /* Without a crop rectangle the crop formula would yield 1x1. */
+    if (s->crop_right && s->crop_bottom) {
+        width = s->crop_right + 1 - s->crop_left;
+        height = s->crop_bottom + 1 - s->crop_top;
+    } else {
+        width = s->width;
+        height = s->height;
+    }
+    av_log(avctx, AV_LOG_INFO,
+        "Output crop parameters top=%d bottom=%d left=%d right=%d, "
+        "resulting dimensions width=%d height=%d\n",
+        s->crop_top, s->crop_bottom, s->crop_left, s->crop_right, width, height);
+    return ff_set_dimensions(avctx, width, height);
+}
+
+/* Select, configure and start a MediaCodec decoder for mime; closes s on failure. */
+int ff_mediacodec_dec_init(AVCodecContext *avctx, MediaCodecDecContext *s,
+                           const char *mime, FFAMediaFormat *format)
+{
+    int ret = 0;
+    int status, profile;
+
+    s->first_buffer_at = av_gettime();
+
+    profile = ff_AMediaCodecProfile_getProfileFromAVCodecContext(avctx);
+    if (profile < 0) {
+        av_log(avctx, AV_LOG_WARNING, "Unsupported or unknown profile\n");
+    }
+
+    s->codec_name = ff_AMediaCodecList_getCodecNameByType(mime, profile, 0, avctx);
+    if (!s->codec_name) {
+        ret = AVERROR_EXTERNAL;
+        goto fail;
+    }
+
+    av_log(avctx, AV_LOG_DEBUG, "Found decoder %s\n", s->codec_name);
+    s->codec = ff_AMediaCodec_createCodecByName(s->codec_name);
+    if (!s->codec) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to create media decoder for type %s and name %s\n", mime, s->codec_name);
+        ret = AVERROR_EXTERNAL;
+        goto fail;
+    }
+
+    status = ff_AMediaCodec_configure(s->codec, format, NULL, NULL, 0);
+    if (status < 0) {
+        char *desc = ff_AMediaFormat_toString(format);
+        av_log(avctx, AV_LOG_ERROR,
+            "Failed to configure codec (status = %d) with format %s\n",
+            status, desc);
+        av_freep(&desc);
+
+        ret = AVERROR_EXTERNAL;
+        goto fail;
+    }
+
+    status = ff_AMediaCodec_start(s->codec);
+    if (status < 0) {
+        char *desc = ff_AMediaFormat_toString(format);
+        av_log(avctx, AV_LOG_ERROR,
+            "Failed to start codec (status = %d) with format %s\n",
+            status, desc);
+        av_freep(&desc);
+        ret = AVERROR_EXTERNAL;
+        goto fail;
+    }
+    /* A missing output format is not an error here; it is parsed only if present. */
+    s->format = ff_AMediaCodec_getOutputFormat(s->codec);
+    if (s->format) {
+        if ((ret = mediacodec_dec_parse_format(avctx, s)) < 0) {
+            av_log(avctx, AV_LOG_ERROR,
+                "Failed to configure context\n");
+            goto fail;
+        }
+    }
+
+    av_log(avctx, AV_LOG_DEBUG, "MediaCodec %p started successfully\n", s->codec);
+
+    return 0;
+
+fail:
+    av_log(avctx, AV_LOG_ERROR, "MediaCodec %p failed to start\n", s->codec);
+    ff_mediacodec_dec_close(avctx, s);
+    return ret;
+}
+
+/* Queue as much of pkt as the codec accepts, then try to dequeue one frame.
+ * Returns the number of bytes of pkt consumed, or a negative error code. */
+int ff_mediacodec_dec_decode(AVCodecContext *avctx, MediaCodecDecContext *s,
+                             AVFrame *frame, int *got_frame,
+                             AVPacket *pkt)
+{
+    int ret;
+    int offset = 0;
+    int need_flushing = 0;
+    uint8_t *data;
+    ssize_t index;
+    size_t size;
+    FFAMediaCodec *codec = s->codec;
+    FFAMediaCodecBufferInfo info = { 0 };
+
+    int status;
+
+    int64_t input_dequeue_timeout_us = INPUT_DEQUEUE_TIMEOUT_US;
+    int64_t output_dequeue_timeout_us = OUTPUT_DEQUEUE_TIMEOUT_US;
+
+    if (pkt->size == 0) {
+        need_flushing = 1;
+    }
+
+    if (s->flushing && need_flushing && s->queued_buffer_nb <= 0) {
+        return 0;
+    }
+
+    while (offset < pkt->size || (need_flushing && !s->flushing)) {
+        index = ff_AMediaCodec_dequeueInputBuffer(codec, input_dequeue_timeout_us);
+        if (ff_AMediaCodec_infoTryAgainLater(codec, index)) {
+            break;
+        }
+
+        if (index < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Failed to dequeue input buffer (status=%zd)\n", index);
+            return AVERROR_EXTERNAL;
+        }
+
+        /* size must be a size_t: it is written through the size_t *out_size parameter. */
+        data = ff_AMediaCodec_getInputBuffer(codec, index, &size);
+        if (!data) {
+            av_log(avctx, AV_LOG_ERROR, "Failed to get input buffer\n");
+            return AVERROR_EXTERNAL;
+        }
+
+        if (need_flushing) {
+            uint32_t flags = ff_AMediaCodec_getBufferFlagEndOfStream(codec);
+
+            av_log(avctx, AV_LOG_DEBUG, "Sending End Of Stream signal\n");
+
+            status = ff_AMediaCodec_queueInputBuffer(codec, index, 0, 0, pkt->pts, flags);
+            if (status < 0) {
+                av_log(avctx, AV_LOG_ERROR, "Failed to queue input empty buffer (status = %d)\n", status);
+                return AVERROR_EXTERNAL;
+            }
+
+            s->flushing = 1;
+            break;
+        } else {
+            /* Copy no more than what remains in pkt (presumably size is the buffer capacity). */
+            size = FFMIN(pkt->size - offset, size);
+
+            memcpy(data, pkt->data + offset, size);
+            offset += size;
+
+            status = ff_AMediaCodec_queueInputBuffer(codec, index, 0, size, pkt->pts, 0);
+            if (status < 0) {
+                av_log(avctx, AV_LOG_ERROR, "Failed to queue input buffer (status = %d)\n", status);
+                return AVERROR_EXTERNAL;
+            }
+
+            s->queued_buffer_nb++;
+            if (s->queued_buffer_nb > s->queued_buffer_max)
+                s->queued_buffer_max = s->queued_buffer_nb;
+        }
+    }
+
+    if (s->flushing) {
+        /* If the codec is flushing, block for a fair amount of time to
+        * ensure we got a frame */
+        output_dequeue_timeout_us = OUTPUT_DEQUEUE_BLOCK_TIMEOUT_US;
+    } else if (s->dequeued_buffer_nb == 0) {
+        /* If the codec hasn't produced any frames, do not block so we
+         * can push data to it as fast as possible, and get the first
+         * frame */
+        output_dequeue_timeout_us = 0;
+    }
+
+    index = ff_AMediaCodec_dequeueOutputBuffer(codec, &info, output_dequeue_timeout_us);
+    if (index >= 0) {
+        if (!s->first_buffer++) {
+            av_log(avctx, AV_LOG_DEBUG, "Got first buffer after %fms\n", (av_gettime() - s->first_buffer_at) / 1000);
+        }
+
+        av_log(avctx, AV_LOG_DEBUG, "Got output buffer %zd"
+                " offset=%" PRIi32 " size=%" PRIi32 " ts=%" PRIi64
+                " flags=%" PRIu32 "\n", index, info.offset, info.size,
+                info.presentationTimeUs, info.flags);
+
+        data = ff_AMediaCodec_getOutputBuffer(codec, index, &size);
+        if (!data) {
+            av_log(avctx, AV_LOG_ERROR, "Failed to get output buffer\n");
+            return AVERROR_EXTERNAL;
+        }
+
+        if ((ret = mediacodec_wrap_buffer(avctx, s, data, size, index, &info, frame)) < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Failed to wrap MediaCodec buffer\n");
+            return ret;
+        }
+
+        *got_frame = 1;
+        s->queued_buffer_nb--;
+        s->dequeued_buffer_nb++;
+
+    } else if (ff_AMediaCodec_infoOutputFormatChanged(codec, index)) {
+        char *format = NULL;
+
+        if (s->format) {
+            status = ff_AMediaFormat_delete(s->format);
+            if (status < 0) {
+                av_log(avctx, AV_LOG_ERROR, "Failed to delete MediaFormat %p\n", s->format);
+            }
+        }
+
+        s->format = ff_AMediaCodec_getOutputFormat(codec);
+        if (!s->format) {
+            av_log(avctx, AV_LOG_ERROR, "Failed to get output format\n");
+            return AVERROR_EXTERNAL;
+        }
+
+        format = ff_AMediaFormat_toString(s->format);
+        if (!format) {
+            return AVERROR_EXTERNAL;
+        }
+        av_log(avctx, AV_LOG_INFO, "Output MediaFormat changed to %s\n", format);
+        av_freep(&format);
+
+        if ((ret = mediacodec_dec_parse_format(avctx, s)) < 0) {
+            return ret;
+        }
+
+    } else if (ff_AMediaCodec_infoOutputBuffersChanged(codec, index)) {
+        ff_AMediaCodec_cleanOutputBuffers(codec);
+    } else if (ff_AMediaCodec_infoTryAgainLater(codec, index)) {
+        if (s->flushing) {
+            av_log(avctx, AV_LOG_ERROR, "Failed to dequeue output buffer within %" PRIi64 "ms "
+                                        "while flushing remaining frames, output will probably lack last %d frames\n",
+                                        output_dequeue_timeout_us / 1000, s->queued_buffer_nb);
+        } else {
+            av_log(avctx, AV_LOG_DEBUG, "No output buffer available, try again later\n");
+        }
+    } else {
+        av_log(avctx, AV_LOG_ERROR, "Failed to dequeue output buffer (status=%zd)\n", index);
+        return AVERROR_EXTERNAL;
+    }
+
+    return offset;
+}
+
+/* Reset the decoder bookkeeping and flush the underlying MediaCodec. */
+int ff_mediacodec_dec_flush(AVCodecContext *avctx, MediaCodecDecContext *s)
+{
+    int flush_status;
+
+    /* Buffer accounting and the end-of-stream state start over. */
+    s->queued_buffer_nb   = 0;
+    s->dequeued_buffer_nb = 0;
+    s->flushing           = 0;
+
+    flush_status = ff_AMediaCodec_flush(s->codec);
+    if (flush_status < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to flush codec\n");
+        return AVERROR_EXTERNAL;
+    }
+
+    /* Restart the "first buffer" timing reported by the decode loop. */
+    s->first_buffer    = 0;
+    s->first_buffer_at = av_gettime();
+    return 0;
+}
+
+/* Free the codec, its output format and the codec name owned by s. */
+int ff_mediacodec_dec_close(AVCodecContext *avctx, MediaCodecDecContext *s)
+{
+    FFAMediaCodec  *codec  = s->codec;
+    FFAMediaFormat *format = s->format;
+
+    /* Pointers are cleared so that closing the same context again is harmless. */
+    s->codec  = NULL;
+    s->format = NULL;
+    if (codec)
+        ff_AMediaCodec_delete(codec);
+    if (format)
+        ff_AMediaFormat_delete(format);
+    av_freep(&s->codec_name);
+    return 0;
+}
diff --git a/libavcodec/mediacodecdec.h b/libavcodec/mediacodecdec.h
new file mode 100644
index 0000000..36fdbf5
--- /dev/null
+++ b/libavcodec/mediacodecdec.h
@@ -0,0 +1,82 @@
+/*
+ * Android MediaCodec decoder
+ *
+ * Copyright (c) 2015-2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MEDIACODECDEC_H
+#define AVCODEC_MEDIACODECDEC_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "libavutil/frame.h"
+#include "libavutil/pixfmt.h"
+
+#include "avcodec.h"
+#include "mediacodec_wrapper.h"
+
+typedef struct MediaCodecDecContext {
+    /* State shared by the ff_mediacodec_dec_*() helpers. */
+    char *codec_name;
+    /* Codec instance and its output format; both freed by ff_mediacodec_dec_close(). */
+    FFAMediaCodec *codec;
+    FFAMediaFormat *format;
+    /* flushing is set once the end-of-stream input buffer has been queued. */
+    int started;
+    int flushing;
+    /* Values read from the output MediaFormat by mediacodec_dec_parse_format(). */
+    int width;
+    int height;
+    int stride;
+    int slice_height;
+    int color_format;
+    enum AVPixelFormat pix_fmt;
+    int crop_top;
+    int crop_bottom;
+    int crop_left;
+    int crop_right;
+    /* queued_buffer_nb goes up per queued input buffer and down per output frame. */
+    int queued_buffer_nb;
+    int queued_buffer_max;
+    uint64_t dequeued_buffer_nb;
+    /* Used to log how long the first output buffer took to arrive. */
+    int first_buffer;
+    double first_buffer_at;
+
+} MediaCodecDecContext;
+
+int ff_mediacodec_dec_init(AVCodecContext *avctx,
+                           MediaCodecDecContext *s,
+                           const char *mime,
+                           FFAMediaFormat *format);
+
+int ff_mediacodec_dec_decode(AVCodecContext *avctx,
+                             MediaCodecDecContext *s,
+                             AVFrame *frame,
+                             int *got_frame,
+                             AVPacket *pkt);
+
+int ff_mediacodec_dec_flush(AVCodecContext *avctx,
+                            MediaCodecDecContext *s);
+
+int ff_mediacodec_dec_close(AVCodecContext *avctx,
+                            MediaCodecDecContext *s);
+
+#endif /* AVCODEC_MEDIACODECDEC_H */
diff --git a/libavcodec/mediacodecdec_h264.c b/libavcodec/mediacodecdec_h264.c
new file mode 100644
index 0000000..eb63ab5
--- /dev/null
+++ b/libavcodec/mediacodecdec_h264.c
@@ -0,0 +1,272 @@
+/*
+ * Android MediaCodec H.264 decoder
+ *
+ * Copyright (c) 2015-2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+#include <string.h>
+
+#include "libavutil/avassert.h"
+#include "libavutil/common.h"
+#include "libavutil/fifo.h"
+#include "libavutil/opt.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/pixfmt.h"
+#include "libavutil/atomic.h"
+
+#include "avcodec.h"
+#include "h264.h"
+#include "internal.h"
+#include "mediacodecdec.h"
+#include "mediacodec_wrapper.h"
+
+#define CODEC_MIME "video/avc"
+
+typedef struct MediaCodecH264DecContext {
+    /* Shared MediaCodec decoder state. */
+    MediaCodecDecContext ctx;
+    /* h264_mp4toannexb bitstream filter that queued packets are sent through. */
+    AVBSFContext *bsf;
+    /* Queue of AVPacket references waiting to be filtered and decoded. */
+    AVFifoBuffer *fifo;
+    /* Filter output being fed to the codec; data/size advance as it is consumed. */
+    AVPacket filtered_pkt;
+
+} MediaCodecH264DecContext;
+
+/* Close callback; also called by mediacodec_decode_init() when it fails. */
+static av_cold int mediacodec_decode_close(AVCodecContext *avctx)
+{
+    MediaCodecH264DecContext *s = avctx->priv_data;
+    /* Releases the codec, its format and the codec name. */
+    ff_mediacodec_dec_close(avctx, &s->ctx);
+    /* NOTE(review): packets still queued in the fifo are not unreferenced here - confirm. */
+    av_fifo_free(s->fifo);
+    av_bsf_free(&s->bsf);
+    av_packet_unref(&s->filtered_pkt);
+
+    return 0;
+}
+
+/* Init callback: build the MediaFormat from extradata, start MediaCodec, set up the Annex B filter. */
+static av_cold int mediacodec_decode_init(AVCodecContext *avctx)
+{
+    int i, ret;
+    H264ParamSets ps;
+    const PPS *pps = NULL;
+    const SPS *sps = NULL;
+    int is_avc = 0;
+    int nal_length_size = 0;
+
+    FFAMediaFormat *format = NULL;
+    MediaCodecH264DecContext *s = avctx->priv_data;
+    const AVBitStreamFilter *bsf;
+
+    memset(&ps, 0, sizeof(ps));
+
+    format = ff_AMediaFormat_new();
+    if (!format) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to create media format\n");
+        ret = AVERROR_EXTERNAL;
+        goto done;
+    }
+
+    ff_AMediaFormat_setString(format, "mime", CODEC_MIME);
+    ff_AMediaFormat_setInt32(format, "width", avctx->width);
+    ff_AMediaFormat_setInt32(format, "height", avctx->height);
+
+    ret = ff_h264_decode_extradata(avctx->extradata, avctx->extradata_size,
+                                   &ps, &is_avc, &nal_length_size, 0, avctx);
+    if (ret < 0) {
+        goto done;
+    }
+
+    for (i = 0; i < MAX_PPS_COUNT; i++) {
+        if (ps.pps_list[i]) {
+            pps = (const PPS*)ps.pps_list[i]->data;
+            break;
+        }
+    }
+
+    if (pps) {
+        if (ps.sps_list[pps->sps_id]) {
+            sps = (const SPS*)ps.sps_list[pps->sps_id]->data;
+        }
+    }
+
+    if (pps && sps) {
+        ff_AMediaFormat_setBuffer(format, "csd-0", (void*)sps->data, sps->data_size);
+        ff_AMediaFormat_setBuffer(format, "csd-1", (void*)pps->data, pps->data_size);
+    } else {
+        av_log(avctx, AV_LOG_ERROR, "Could not extract PPS/SPS from extradata\n");
+        ret = AVERROR_INVALIDDATA;
+        goto done;
+    }
+
+    if ((ret = ff_mediacodec_dec_init(avctx, &s->ctx, CODEC_MIME, format)) < 0) {
+        goto done;
+    }
+
+    av_log(avctx, AV_LOG_INFO, "MediaCodec started successfully, ret = %d\n", ret);
+
+    s->fifo = av_fifo_alloc(sizeof(AVPacket));
+    if (!s->fifo) {
+        ret = AVERROR(ENOMEM);
+        goto done;
+    }
+
+    bsf = av_bsf_get_by_name("h264_mp4toannexb");
+    if (!bsf) {
+        ret = AVERROR_BSF_NOT_FOUND;
+        goto done;
+    }
+
+    if ((ret = av_bsf_alloc(bsf, &s->bsf))) {
+        goto done;
+    }
+
+    if (((ret = avcodec_parameters_from_context(s->bsf->par_in, avctx)) < 0) ||
+        ((ret = av_bsf_init(s->bsf)) < 0)) {
+        goto done;
+    }
+
+    av_init_packet(&s->filtered_pkt);
+
+done:
+    if (format) {
+        ff_AMediaFormat_delete(format);
+    }
+
+    if (ret < 0) {
+        mediacodec_decode_close(avctx);
+    }
+
+    ff_h264_ps_uninit(&ps);
+
+    return ret;
+}
+
+
+/* Forward pkt to the shared MediaCodec decode loop of this decoder. */
+static int mediacodec_process_data(AVCodecContext *avctx, AVFrame *frame,
+                                   int *got_frame, AVPacket *pkt)
+{
+    MediaCodecH264DecContext *h264 = avctx->priv_data;
+    return ff_mediacodec_dec_decode(avctx, &h264->ctx, frame, got_frame, pkt);
+}
+
+/* Decode callback: queue avpkt, then filter and decode queued packets until a frame is output. */
+static int mediacodec_decode_frame(AVCodecContext *avctx, void *data,
+                                   int *got_frame, AVPacket *avpkt)
+{
+    MediaCodecH264DecContext *s = avctx->priv_data;
+    AVFrame *frame    = data;
+    int ret;
+
+    /* buffer the input packet */
+    if (avpkt->size) {
+        AVPacket input_pkt = { 0 };
+        if (av_fifo_space(s->fifo) < sizeof(input_pkt)) {
+            ret = av_fifo_realloc2(s->fifo,
+                                   av_fifo_size(s->fifo) + sizeof(input_pkt));
+            if (ret < 0)
+                return ret;
+        }
+
+        ret = av_packet_ref(&input_pkt, avpkt);
+        if (ret < 0)
+            return ret;
+        av_fifo_generic_write(s->fifo, &input_pkt, sizeof(input_pkt), NULL);
+    }
+
+    /* process buffered data */
+    while (!*got_frame) {
+        /* prepare the input data -- convert to Annex B if needed */
+        if (s->filtered_pkt.size <= 0) {
+            AVPacket input_pkt = { 0 };
+            av_packet_unref(&s->filtered_pkt);
+
+            /* no more data */
+            if (av_fifo_size(s->fifo) < sizeof(AVPacket)) {
+                return avpkt->size ? avpkt->size :
+                    ff_mediacodec_dec_decode(avctx, &s->ctx, frame, got_frame, avpkt);
+            }
+
+            av_fifo_generic_read(s->fifo, &input_pkt, sizeof(input_pkt), NULL);
+            ret = av_bsf_send_packet(s->bsf, &input_pkt);
+            if (ret < 0) {
+                /* On failure the filter did not take ownership of the packet. */
+                av_packet_unref(&input_pkt);
+                return ret;
+            }
+
+            ret = av_bsf_receive_packet(s->bsf, &s->filtered_pkt);
+            if (ret == AVERROR(EAGAIN)) {
+                goto done;
+            }
+
+            /* h264_mp4toannexb is used here and does not requires flushing */
+            av_assert0(ret != AVERROR_EOF);
+
+            if (ret < 0) {
+                return ret;
+            }
+        }
+
+        ret = mediacodec_process_data(avctx, frame, got_frame, &s->filtered_pkt);
+        if (ret < 0)
+            return ret;
+
+        s->filtered_pkt.size -= ret;
+        s->filtered_pkt.data += ret;
+    }
+done:
+    return avpkt->size;
+}
+
+/* Flush callback: discard all buffered input, then flush the codec itself. */
+static void mediacodec_decode_flush(AVCodecContext *avctx)
+{
+    MediaCodecH264DecContext *s = avctx->priv_data;
+    AVPacket queued;
+
+    /* Every FIFO entry is a packet reference that has to be released. */
+    while (av_fifo_size(s->fifo)) {
+        av_fifo_generic_read(s->fifo, &queued, sizeof(queued), NULL);
+        av_packet_unref(&queued);
+    }
+    av_fifo_reset(s->fifo);
+    av_packet_unref(&s->filtered_pkt);
+    ff_mediacodec_dec_flush(avctx, &s->ctx);
+}
+
+AVCodec ff_h264_mediacodec_decoder = {
+    .name           = "h264_mediacodec",
+    .long_name      = NULL_IF_CONFIG_SMALL("H.264 Android MediaCodec decoder"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_H264,
+    .priv_data_size = sizeof(MediaCodecH264DecContext),
+    .init           = mediacodec_decode_init,
+    .decode         = mediacodec_decode_frame,
+    .flush          = mediacodec_decode_flush,
+    .close          = mediacodec_decode_close,
+    .capabilities   = AV_CODEC_CAP_DELAY, /* replaces the deprecated CODEC_CAP_DELAY */
+    .caps_internal  = FF_CODEC_CAP_SETS_PKT_DTS,
+};
diff --git a/libavcodec/metasound.c b/libavcodec/metasound.c
index dbb2a63..5a7f4c3 100644
--- a/libavcodec/metasound.c
+++ b/libavcodec/metasound.c
@@ -4,20 +4,20 @@
  * based on TwinVQ decoder
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -236,7 +236,7 @@ static int metasound_read_bitstream(AVCodecContext *avctx, TwinVQContext *tctx,
             skip_bits(&gb, 4 - (get_bits_count(&gb) & 3));
     }
 
-    return 0;
+    return (get_bits_count(&gb) + 7) / 8;
 }
 
 typedef struct MetasoundProps {
diff --git a/libavcodec/metasound_data.c b/libavcodec/metasound_data.c
index 8aa53e5..6d87117 100644
--- a/libavcodec/metasound_data.c
+++ b/libavcodec/metasound_data.c
@@ -2,20 +2,20 @@
  * MetaSound decoder
  * Copyright (c) 2013 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -11208,6 +11208,14 @@ static const int16_t fcb16m[] = {
       -688,   -209,    915,    622,  -1038,   -474,   -343,    -91,
       -173,   -104,    255,     96,   1547,    773,   -625,   2272,
        -90,   -509,   -527,   -247,   -147,   -234,    -45,    166,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0,
 };
 
 static const int16_t fcb16sl[] = {
@@ -12739,185 +12747,6 @@ static const float lsp16[] = {
     -0.0429, -0.0615, -0.0893, -0.0618, -0.0384, -0.0134, -0.0232, -0.0238,
 };
 
-static const float lsp16s[] = {
-     0.1813,  0.3911,  0.6301,  0.8012,  1.0057,  1.2041,  1.4271,  1.6943,
-     1.9402,  2.1733,  2.3521,  2.4989,  2.5839,  2.6846,  2.7634,  2.8950,
-     0.1311,  0.3183,  0.4659,  0.5601,  0.6658,  0.7828,  1.0065,  1.2717,
-     1.5185,  1.7339,  1.9530,  2.2189,  2.3739,  2.4991,  2.6984,  2.9256,
-     0.1627,  0.4519,  0.6323,  0.7012,  0.7848,  0.9801,  1.1810,  1.3222,
-     1.5413,  1.8129,  1.9338,  2.0809,  2.3180,  2.5189,  2.7066,  2.9514,
-     0.1475,  0.2447,  0.4240,  0.5669,  0.7872,  0.9838,  1.1823,  1.3814,
-     1.5358,  1.6820,  1.8794,  2.1419,  2.4132,  2.6112,  2.7911,  2.9511,
-     0.1224,  0.2876,  0.5013,  0.6985,  0.8902,  1.0901,  1.2835,  1.4768,
-     1.6596,  1.8538,  2.0467,  2.2304,  2.4124,  2.5942,  2.7729,  2.9531,
-     0.1741,  0.3034,  0.4677,  0.5879,  0.7258,  0.9648,  1.1417,  1.3220,
-     1.5081,  1.7151,  1.9212,  2.1286,  2.3208,  2.4938,  2.6765,  2.8891,
-     0.1657,  0.3174,  0.4907,  0.6559,  0.8295,  1.0254,  1.2071,  1.3880,
-     1.5737,  1.7845,  1.9027,  2.1139,  2.3323,  2.5157,  2.7323,  2.9015,
-     0.1592,  0.2758,  0.4417,  0.6315,  0.8257,  0.9873,  1.1277,  1.2830,
-     1.4337,  1.6315,  1.8899,  2.1356,  2.3572,  2.5632,  2.7468,  2.9420,
-     0.1524,  0.4325,  0.5931,  0.7036,  0.7696,  0.8923,  1.1739,  1.4773,
-     1.6609,  1.7911,  1.9666,  2.1972,  2.3754,  2.5045,  2.6613,  2.8882,
-     0.2130,  0.3013,  0.3721,  0.4257,  0.5079,  0.7015,  0.9815,  1.2554,
-     1.4648,  1.6966,  1.9138,  2.1075,  2.3318,  2.5292,  2.7453,  2.9347,
-     0.1142,  0.3748,  0.6205,  0.7642,  0.8121,  0.9022,  0.9843,  1.1558,
-     1.4467,  1.7422,  1.9574,  2.1302,  2.3812,  2.5898,  2.7720,  2.9583,
-     0.1255,  0.2339,  0.3570,  0.5323,  0.7458,  1.0003,  1.1729,  1.3567,
-     1.5217,  1.6977,  1.8924,  2.0942,  2.3145,  2.5408,  2.7553,  2.9337,
-     0.1316,  0.2289,  0.4327,  0.6663,  0.8509,  0.9994,  1.1697,  1.3804,
-     1.5609,  1.6903,  1.8572,  2.1019,  2.3687,  2.5789,  2.7715,  2.9472,
-     0.1502,  0.2546,  0.3883,  0.5333,  0.6976,  0.9163,  1.1071,  1.3364,
-     1.5420,  1.7525,  1.8948,  2.0839,  2.2819,  2.4651,  2.6875,  2.8987,
-     0.1593,  0.3014,  0.4573,  0.6354,  0.8157,  0.9805,  1.1783,  1.3747,
-     1.5678,  1.7326,  1.9286,  2.1340,  2.3253,  2.5280,  2.7180,  2.9298,
-     0.1811,  0.3167,  0.4655,  0.6507,  0.8198,  1.0075,  1.1892,  1.3743,
-     1.5227,  1.7090,  1.8849,  2.0743,  2.2750,  2.4830,  2.6896,  2.8953,
-     0.1846,  0.3577,  0.5315,  0.7290,  0.9176,  1.1016,  1.2654,  1.4525,
-     1.6315,  1.8268,  2.0238,  2.1934,  2.3868,  2.5753,  2.7682,  2.9469,
-     0.0876,  0.1439,  0.2048,  0.3654,  0.6281,  0.8853,  1.0907,  1.2992,
-     1.5227,  1.7373,  1.9395,  2.1419,  2.3488,  2.5486,  2.7466,  2.9348,
-     0.1391,  0.4170,  0.6561,  0.7953,  0.8734,  0.9986,  1.1870,  1.4520,
-     1.6042,  1.7910,  2.0135,  2.1870,  2.3358,  2.5066,  2.7409,  2.9955,
-     0.0804,  0.1355,  0.2599,  0.4998,  0.7408,  0.9474,  1.1276,  1.3428,
-     1.5556,  1.7712,  1.9699,  2.1535,  2.3605,  2.5548,  2.7489,  2.9325,
-     0.1304,  0.3087,  0.4979,  0.6584,  0.8414,  1.0329,  1.2244,  1.4189,
-     1.6118,  1.8200,  1.9985,  2.1893,  2.3915,  2.5794,  2.7647,  2.9344,
-     0.1895,  0.2849,  0.3705,  0.4126,  0.6265,  0.9207,  1.1774,  1.3762,
-     1.5757,  1.7728,  1.9568,  2.1662,  2.3615,  2.5575,  2.7561,  2.9416,
-     0.1800,  0.3078,  0.4805,  0.6796,  0.8503,  1.0046,  1.1703,  1.3269,
-     1.4862,  1.6502,  1.8454,  2.0873,  2.3175,  2.5356,  2.7516,  2.9469,
-     0.1950,  0.3233,  0.4568,  0.5940,  0.7589,  0.9978,  1.1701,  1.3383,
-     1.5017,  1.6565,  1.8243,  2.0605,  2.2938,  2.5147,  2.7419,  2.9396,
-     0.2531,  0.4391,  0.5790,  0.7170,  0.8998,  1.1430,  1.3577,  1.5326,
-     1.6328,  1.7627,  1.9726,  2.1762,  2.3563,  2.5478,  2.7385,  2.9067,
-     0.1805,  0.2788,  0.3591,  0.3881,  0.5441,  0.8055,  1.0766,  1.3165,
-     1.5316,  1.7508,  1.9477,  2.1374,  2.3438,  2.5484,  2.7501,  2.9410,
-     0.2044,  0.3671,  0.5396,  0.7042,  0.8582,  0.9831,  1.1261,  1.3194,
-     1.4769,  1.6979,  1.8717,  2.0463,  2.2620,  2.4739,  2.7054,  2.9208,
-     0.1048,  0.2175,  0.4206,  0.5923,  0.7483,  0.9400,  1.1356,  1.3799,
-     1.5958,  1.7320,  1.8984,  2.1296,  2.3594,  2.5492,  2.7387,  2.9305,
-     0.0842,  0.1729,  0.3951,  0.6447,  0.8688,  1.0605,  1.2472,  1.4330,
-     1.6232,  1.8144,  2.0216,  2.1915,  2.3878,  2.5763,  2.7685,  2.9464,
-     0.1461,  0.2593,  0.4105,  0.5677,  0.7328,  0.8919,  1.0484,  1.2302,
-     1.4386,  1.6635,  1.8873,  2.1024,  2.3116,  2.5268,  2.7273,  2.9269,
-     0.1503,  0.3108,  0.4756,  0.6731,  0.8600,  1.0233,  1.2115,  1.3971,
-     1.5915,  1.7892,  1.9517,  2.1603,  2.3487,  2.5460,  2.7308,  2.8998,
-     0.2163,  0.3669,  0.5125,  0.6709,  0.8143,  0.9930,  1.2095,  1.4205,
-     1.6176,  1.7112,  1.8398,  2.0896,  2.3513,  2.5290,  2.6667,  2.8960,
-     0.2133,  0.4382,  0.6287,  0.8702,  1.1088,  1.3749,  1.6062,  1.7446,
-     1.8333,  1.9122,  1.9614,  2.0669,  2.1789,  2.3449,  2.6038,  2.8849,
-     0.1598,  0.2719,  0.3877,  0.4815,  0.5926,  0.7795,  1.0449,  1.3045,
-     1.5210,  1.7391,  1.9462,  2.1397,  2.3553,  2.5458,  2.7540,  2.9392,
-     0.2918,  0.5607,  0.6801,  0.7404,  0.8285,  0.9431,  1.1579,  1.4080,
-     1.6332,  1.8472,  1.9738,  2.0771,  2.2890,  2.5178,  2.7445,  2.9830,
-     0.1664,  0.2842,  0.3965,  0.5463,  0.8162,  1.0346,  1.1849,  1.3446,
-     1.5122,  1.7563,  1.9960,  2.2002,  2.3796,  2.5689,  2.7712,  2.9550,
-     0.0911,  0.2397,  0.5052,  0.7868,  1.0299,  1.1311,  1.2244,  1.3333,
-     1.4395,  1.6790,  1.9369,  2.1717,  2.3689,  2.5538,  2.7340,  2.9326,
-     0.1647,  0.2931,  0.3836,  0.4978,  0.6255,  0.9243,  1.1339,  1.3001,
-     1.5269,  1.8010,  1.9715,  2.1419,  2.3784,  2.5503,  2.6719,  2.8745,
-     0.2440,  0.3802,  0.4756,  0.6613,  0.8627,  1.0292,  1.2291,  1.4060,
-     1.5198,  1.7354,  1.9044,  2.1010,  2.3147,  2.4996,  2.7171,  2.9041,
-     0.1590,  0.2876,  0.4572,  0.5996,  0.7713,  0.9490,  1.1205,  1.2815,
-     1.4516,  1.6385,  1.8179,  2.0457,  2.2759,  2.4785,  2.6861,  2.9080,
-     0.2297,  0.4309,  0.5712,  0.6717,  0.8138,  1.0463,  1.2492,  1.4560,
-     1.6796,  1.8458,  1.9642,  2.1452,  2.3636,  2.5395,  2.7456,  2.9495,
-     0.2975,  0.4678,  0.4996,  0.5809,  0.6279,  0.6884,  0.8606,  1.1386,
-     1.4412,  1.6876,  1.8760,  2.0932,  2.3178,  2.5166,  2.7345,  2.9280,
-     0.1278,  0.3737,  0.6004,  0.7069,  0.8147,  1.0180,  1.2581,  1.3812,
-     1.4855,  1.7268,  1.9970,  2.1258,  2.2936,  2.5702,  2.7563,  2.8983,
-     0.1314,  0.2508,  0.3999,  0.5680,  0.7424,  0.9367,  1.1286,  1.3175,
-     1.5336,  1.7404,  1.9317,  2.1404,  2.3514,  2.5562,  2.7510,  2.9402,
-     0.1043,  0.2367,  0.4293,  0.6376,  0.8160,  0.9836,  1.1779,  1.3850,
-     1.5835,  1.7875,  1.9765,  2.1593,  2.3654,  2.5577,  2.7465,  2.9398,
-     0.1529,  0.2515,  0.3454,  0.4374,  0.7011,  0.9015,  1.0744,  1.3532,
-     1.5699,  1.7545,  2.0021,  2.1259,  2.2278,  2.4546,  2.7264,  2.9425,
-     0.1429,  0.2808,  0.4395,  0.6334,  0.8069,  0.9705,  1.1520,  1.3250,
-     1.5109,  1.7285,  1.9356,  2.1469,  2.3479,  2.5554,  2.7512,  2.9348,
-     0.1625,  0.3022,  0.4756,  0.6315,  0.8032,  0.9924,  1.1596,  1.3204,
-     1.4994,  1.6929,  1.8955,  2.1090,  2.3025,  2.5018,  2.6908,  2.8980,
-     0.1692,  0.3427,  0.5228,  0.7756,  0.9688,  1.0950,  1.3056,  1.4360,
-     1.5675,  1.8049,  1.9376,  2.1151,  2.3407,  2.5012,  2.7192,  2.9258,
-     0.0474,  0.1251,  0.1939,  0.3841,  0.6501,  0.9231,  1.1153,  1.3240,
-     1.5478,  1.7599,  1.9651,  2.1510,  2.3645,  2.5552,  2.7542,  2.9393,
-     0.2196,  0.4656,  0.7492,  0.9922,  1.1678,  1.2489,  1.3112,  1.3657,
-     1.4223,  1.5302,  1.7212,  1.9996,  2.2523,  2.4844,  2.7036,  2.9145,
-     0.1128,  0.2368,  0.3704,  0.5476,  0.7723,  0.9968,  1.1930,  1.3992,
-     1.6013,  1.7957,  1.9888,  2.1857,  2.3825,  2.5705,  2.7616,  2.9434,
-     0.1341,  0.2768,  0.4510,  0.6359,  0.8332,  1.0335,  1.2004,  1.3952,
-     1.5762,  1.7681,  1.9815,  2.1735,  2.3657,  2.5552,  2.7514,  2.9498,
-     0.1247,  0.2559,  0.3516,  0.4726,  0.6861,  0.9483,  1.1852,  1.3858,
-     1.5851,  1.7815,  1.9778,  2.1737,  2.3729,  2.5664,  2.7620,  2.9429,
-     0.1988,  0.3320,  0.4777,  0.6737,  0.8425,  1.0265,  1.1694,  1.3655,
-     1.5463,  1.7135,  1.9385,  2.1650,  2.3529,  2.5367,  2.7545,  2.9585,
-     0.1376,  0.2620,  0.4273,  0.6169,  0.7755,  0.9441,  1.1169,  1.3157,
-     1.5179,  1.7020,  1.8931,  2.1059,  2.3112,  2.5136,  2.7169,  2.9198,
-     0.2112,  0.4385,  0.6091,  0.7618,  0.9553,  1.1543,  1.3445,  1.5396,
-     1.7153,  1.9192,  2.1263,  2.3593,  2.5958,  2.8171,  2.9394,  3.0409,
-     0.1347,  0.2099,  0.2646,  0.3453,  0.5266,  0.7869,  1.0513,  1.2795,
-     1.4880,  1.7181,  1.9294,  2.1332,  2.3362,  2.5442,  2.7433,  2.9362,
-     0.3141,  0.5935,  0.7517,  0.8313,  0.8568,  0.9570,  1.0250,  1.1275,
-     1.3422,  1.6303,  1.8577,  2.0705,  2.2957,  2.5095,  2.7244,  2.9262,
-     0.0962,  0.2116,  0.3961,  0.5641,  0.7122,  0.8883,  1.1023,  1.3481,
-     1.5623,  1.7554,  1.9618,  2.1675,  2.3706,  2.5556,  2.7430,  2.9337,
-     0.0898,  0.1510,  0.3060,  0.5820,  0.8221,  1.0388,  1.2261,  1.4289,
-     1.6054,  1.8103,  1.9941,  2.1844,  2.3742,  2.5711,  2.7632,  2.9474,
-     0.1326,  0.2316,  0.3761,  0.5177,  0.6782,  0.8761,  1.0952,  1.3175,
-     1.5078,  1.7034,  1.9051,  2.1245,  2.3424,  2.5484,  2.7444,  2.9389,
-     0.1740,  0.3293,  0.5174,  0.6824,  0.8394,  1.0372,  1.2046,  1.3723,
-     1.5656,  1.7444,  1.9442,  2.1386,  2.3139,  2.4960,  2.7071,  2.9297,
-     0.2304,  0.3775,  0.4865,  0.6182,  0.7842,  0.9208,  1.1151,  1.2843,
-     1.4641,  1.6988,  1.9209,  2.1260,  2.3099,  2.5229,  2.7414,  2.9276,
-     0.0094,  0.0261, -0.0037,  0.0041, -0.0092, -0.0044, -0.0232, -0.0073,
-    -0.0047, -0.0021,  0.0250, -0.0580, -0.0140, -0.0342, -0.0586,  0.0020,
-     0.0449,  0.0155, -0.0523, -0.0279,  0.0299, -0.0183, -0.0736, -0.0639,
-    -0.0017,  0.0336,  0.0209,  0.0046,  0.0077, -0.0148, -0.0114, -0.0120,
-     0.0115, -0.0050,  0.0445,  0.0048,  0.0188, -0.0137, -0.0080,  0.0239,
-    -0.0184, -0.0524, -0.0195, -0.0126,  0.0284,  0.0632,  0.0141, -0.0093,
-    -0.0096,  0.0196,  0.0230,  0.0379,  0.0308,  0.0237, -0.0224, -0.0600,
-    -0.0755, -0.1074, -0.0988, -0.0606, -0.1038, -0.1552, -0.1480, -0.0672,
-     0.0504,  0.0676,  0.0336, -0.0042,  0.0729,  0.1013,  0.0868,  0.0846,
-     0.0954,  0.0515, -0.0066, -0.0851, -0.0485,  0.0294,  0.0395,  0.0087,
-     0.0078,  0.0446,  0.0881,  0.0672, -0.0384, -0.0025,  0.0415,  0.0353,
-     0.0080,  0.0052,  0.0190,  0.0182,  0.0069,  0.0168,  0.0374,  0.0037,
-    -0.0292, -0.0429,  0.0302,  0.0681, -0.0233, -0.0238, -0.0003, -0.0043,
-     0.0054, -0.0029, -0.0149,  0.0642,  0.0622,  0.0341, -0.0232, -0.0461,
-    -0.0082, -0.0469, -0.0618, -0.0326, -0.0452, -0.0649, -0.0597, -0.0398,
-    -0.0318, -0.0116,  0.0011,  0.0009, -0.0384, -0.0384, -0.0156, -0.0260,
-    -0.0007,  0.0473,  0.0111, -0.0358, -0.0484, -0.0204, -0.0029, -0.0090,
-    -0.0285, -0.0495, -0.0376,  0.0917,  0.1192,  0.1026,  0.0745,  0.0397,
-     0.0463,  0.0253,  0.0025,  0.0465,  0.0100,  0.0488,  0.0416,  0.0223,
-     0.0263,  0.0072, -0.0053,  0.0595,  0.0060, -0.0518, -0.0316, -0.0043,
-    -0.0133, -0.0233, -0.0075, -0.0251,  0.0277, -0.0067, -0.0136, -0.0004,
-     0.0235,  0.0112, -0.0182, -0.0324, -0.0210, -0.0035, -0.0395, -0.0384,
-     0.0005, -0.0150, -0.0356,  0.0127, -0.0033, -0.0034,  0.0205,  0.0747,
-     0.1138,  0.1015,  0.0995, -0.0161, -0.0045,  0.0129,  0.0472,  0.0575,
-     0.0222,  0.0091,  0.0037, -0.0471,  0.0371,  0.0132,  0.0208,  0.0247,
-     0.0117,  0.0164,  0.0225,  0.0124, -0.0023,  0.0088, -0.0046,  0.0047,
-    -0.0393,  0.0018,  0.0148,  0.0020,  0.0044,  0.0165,  0.0229, -0.0208,
-    -0.0477, -0.0310, -0.0164, -0.0390, -0.0764, -0.0525, -0.0094,  0.0075,
-    -0.0102, -0.0045, -0.0504, -0.0709,  0.0822,  0.0710,  0.0426,  0.0014,
-    -0.0371, -0.0400, -0.0157, -0.0155, -0.0173, -0.0138, -0.0015,  0.0134,
-    -0.0418, -0.0682, -0.0256,  0.0050,  0.0360,  0.0354,  0.0074, -0.0396,
-    -0.0235,  0.0284,  0.0494,  0.0153,  0.0448,  0.0025, -0.0061,  0.0252,
-     0.1000,  0.2260,  0.2158,  0.2116,  0.2198,  0.2055,  0.2110,  0.1873,
-     0.1907,  0.2071,  0.2164,  0.2009,  0.2059,  0.2124,  0.2141,  0.2093,
-     0.0875,  0.0981,  0.1177,  0.1071,  0.1033,  0.1248,  0.1048,  0.1238,
-     0.1166,  0.1008,  0.1062,  0.0992,  0.0994,  0.1067,  0.0999,  0.1187,
-     0.0750,  0.0794,  0.0828,  0.0854,  0.0859,  0.0801,  0.0891,  0.0933,
-     0.0969,  0.0920,  0.0915,  0.0862,  0.0868,  0.0891,  0.0842,  0.0824,
-     0.0625,  0.0930,  0.0815,  0.0853,  0.0898,  0.0828,  0.0822,  0.0910,
-     0.0873,  0.0906,  0.0856,  0.0840,  0.0774,  0.0785,  0.0684,  0.0711,
-     0.3319,  0.4219,  0.4588,  0.4090,  0.4092,  0.4014,  0.3548,  0.3353,
-     0.3708,  0.3352,  0.3720,  0.3538,  0.4084,  0.4289,  0.4060,  0.4210,
-     0.0588,  0.0209, -0.0082, -0.0115, -0.0343, -0.0621, -0.0541, -0.0346,
-    -0.0346, -0.0366, -0.0220, -0.0265, -0.0102,  0.0374,  0.0306,  0.0404,
-     0.0306,  0.0090, -0.0054,  0.0333,  0.0047,  0.0238,  0.0141,  0.0165,
-     0.0306,  0.0420,  0.0159,  0.0124,  0.0414,  0.0158, -0.0237,  0.0141,
-     0.0765,  0.0057, -0.0260, -0.0426, -0.0395, -0.0126, -0.0579, -0.0417,
-    -0.0429, -0.0615, -0.0893, -0.0618, -0.0384, -0.0134, -0.0232, -0.0238,
-};
-
 static const float lsp22[] = {
      0.0664,  0.1875,  0.4300,  0.6730,  0.8793,  1.0640,  1.2563,  1.4433,
      1.6394,  1.8176,  2.0029,  2.1921,  2.3796,  2.5671,  2.7595,  2.9536,
@@ -15117,9 +14946,10 @@ static const uint16_t bark_tab_s16_128[] = {
     2, 2, 2, 3, 3, 5, 7, 12, 25, 67
 };
 
+/* unused
 static const uint16_t bark_tab_s16_64[] = {
     1, 1, 2, 2, 3, 6, 11, 38
-};
+}; */
 
 static const uint16_t bark_tab_l16s_1024[] = {
       9,   9,   8,   9,  10,   9,  10,  10,
diff --git a/libavcodec/metasound_data.h b/libavcodec/metasound_data.h
index 4925516..5c33411 100644
--- a/libavcodec/metasound_data.h
+++ b/libavcodec/metasound_data.h
@@ -2,20 +2,20 @@
  * MetaSound decoder
  * Copyright (c) 2013 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/microdvddec.c b/libavcodec/microdvddec.c
new file mode 100644
index 0000000..e8d2719
--- /dev/null
+++ b/libavcodec/microdvddec.c
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * MicroDVD subtitle decoder
+ *
+ * Based on the specifications found here:
+ * https://trac.videolan.org/vlc/ticket/1825#comment:6
+ */
+
+#include "libavutil/avstring.h"
+#include "libavutil/parseutils.h"
+#include "libavutil/bprint.h"
+#include "avcodec.h"
+#include "ass.h"
+
+static int indexof(const char *s, int c) /* offset of the first c in s, or -1 if absent */
+{
+    const char *f = c ? strchr(s, c) : NULL; /* strchr(s, 0) would match the terminator */
+    return f ? (int)(f - s) : -1;
+}
+
+struct microdvd_tag {
+    char key;               /* tag letter from MICRODVD_TAGS; 0 means no tag stored */
+    int persistent;         /* one of the MICRODVD_PERSISTENT_* states */
+    uint32_t data1;         /* style bitmask, color, font size, position flag or first coordinate */
+    uint32_t data2;         /* second coordinate, only set by the 'o' tag */
+    char *data_string;      /* points into the parsed input (font name / charset), not NUL-terminated */
+    int data_string_len;    /* number of valid bytes at data_string */
+};
+
+#define MICRODVD_PERSISTENT_OFF     0
+#define MICRODVD_PERSISTENT_ON      1
+#define MICRODVD_PERSISTENT_OPENED  2
+
+// Color, Font, Size, cHarset, stYle (y: non-persistent, Y: persistent), Position, cOordinate
+#define MICRODVD_TAGS "cfshyYpo"
+
+/* Store tag in the tags[] slot matching its key; keys not in MICRODVD_TAGS are dropped. */
+static void microdvd_set_tag(struct microdvd_tag *tags, struct microdvd_tag tag)
+{
+    const int slot = indexof(MICRODVD_TAGS, tag.key);
+
+    if (slot >= 0)
+        tags[slot] = tag;
+}
+
+// italic, bold, underline, strike-through
+#define MICRODVD_STYLES "ibus"
+
+/* Some samples start a line with '/' as a non-persistent italic marker.
+ * When s begins with one, the italic bit is merged into the 'y' slot of tags[]
+ * and the text after the slash is returned; otherwise s is returned as is. */
+static char *check_for_italic_slash_marker(struct microdvd_tag *tags, char *s)
+{
+    struct microdvd_tag *style = &tags[indexof(MICRODVD_TAGS, 'y')];
+
+    if (*s != '/')
+        return s;
+    style->key    = 'y';
+    style->data1 |= 1 << 0; /* 'i' position in MICRODVD_STYLES */
+    return s + 1;
+}
+
+static char *microdvd_load_tags(struct microdvd_tag *tags, char *s)
+{
+    s = check_for_italic_slash_marker(tags, s);
+    /* Parse consecutive "{X:...}" tags into tags[]; returns where plain text resumes. */
+    while (*s == '{') {
+        char *start = s;
+        char tag_char = *(s + 1);
+        struct microdvd_tag tag = {0};
+
+        if (!tag_char || *(s + 2) != ':') /* s + 2 is only read when s + 1 is not the NUL */
+            break;
+        s += 3;
+
+        switch (tag_char) {
+
+        /* Style */
+        case 'Y':
+            tag.persistent = MICRODVD_PERSISTENT_ON; /* fall through */
+        case 'y':
+            while (*s && *s != '}') { /* characters outside MICRODVD_STYLES are skipped */
+                int style_index = indexof(MICRODVD_STYLES, *s);
+
+                if (style_index >= 0)
+                    tag.data1 |= (1 << style_index);
+                s++;
+            }
+            if (*s != '}')
+                break;
+            /* We must distinguish persistent and non-persistent styles
+             * to handle this kind of style tags: {y:ib}{Y:us} */
+            tag.key = tag_char;
+            break;
+
+        /* Color */
+        case 'C':
+            tag.persistent = MICRODVD_PERSISTENT_ON; /* fall through */
+        case 'c':
+            while (*s == '$' || *s == '#') /* optional prefix characters before the hex digits */
+                s++;
+            tag.data1 = strtol(s, &s, 16) & 0x00ffffff; /* only the low 24 bits are kept */
+            if (*s != '}')
+                break;
+            tag.key = 'c';
+            break;
+
+        /* Font name */
+        case 'F':
+            tag.persistent = MICRODVD_PERSISTENT_ON; /* fall through */
+        case 'f': {
+            int len = indexof(s, '}');
+            if (len < 0)
+                break;
+            tag.data_string = s; /* borrowed pointer into the input, nothing is copied */
+            tag.data_string_len = len;
+            s += len;
+            tag.key = 'f';
+            break;
+        }
+
+        /* Font size */
+        case 'S':
+            tag.persistent = MICRODVD_PERSISTENT_ON; /* fall through */
+        case 's':
+            tag.data1 = strtol(s, &s, 10);
+            if (*s != '}')
+                break;
+            tag.key = 's';
+            break;
+
+        /* Charset */
+        case 'H': {
+            //TODO: not yet handled, just parsed.
+            int len = indexof(s, '}');
+            if (len < 0)
+                break;
+            tag.data_string = s;
+            tag.data_string_len = len;
+            s += len;
+            tag.key = 'h';
+            break;
+        }
+
+        /* Position */
+        case 'P':
+            if (!*s)
+                break;
+            tag.persistent = MICRODVD_PERSISTENT_ON;
+            tag.data1 = (*s++ == '1'); /* 1 only for '1'; any other character yields 0 */
+            if (*s != '}')
+                break;
+            tag.key = 'p';
+            break;
+
+        /* Coordinates */
+        case 'o':
+            tag.persistent = MICRODVD_PERSISTENT_ON;
+            tag.data1 = strtol(s, &s, 10); /* first number, before the comma */
+            if (*s != ',')
+                break;
+            s++;
+            tag.data2 = strtol(s, &s, 10); /* second number, after the comma */
+            if (*s != '}')
+                break;
+            tag.key = 'o';
+            break;
+
+        default:    /* Unknown tag, we consider it's text */
+            break;
+        }
+
+        if (tag.key == 0) /* unknown or malformed tag: rewind to its '{' */
+            return start;
+
+        microdvd_set_tag(tags, tag);
+        s++; /* every successful case above leaves s on the closing '}' */
+    }
+    return check_for_italic_slash_marker(tags, s); /* a '/' marker may also follow the tags */
+}
+
+/* Append the ASS override block for every tag in tags[] that is not already
+ * OPENED, then mark the persistent ones OPENED so they are written only once. */
+static void microdvd_open_tags(AVBPrint *new_line, struct microdvd_tag *tags)
+{
+    struct microdvd_tag *t   = tags;
+    struct microdvd_tag *end = tags + (sizeof(MICRODVD_TAGS) - 1);
+    int bit;
+
+    for (; t < end; t++) {
+        if (t->persistent == MICRODVD_PERSISTENT_OPENED)
+            continue;
+
+        switch (t->key) {
+        case 'Y':
+        case 'y':   /* one override per style bit set in data1 */
+            for (bit = 0; bit < sizeof(MICRODVD_STYLES) - 1; bit++)
+                if (t->data1 & (1 << bit))
+                    av_bprintf(new_line, "{\\%c1}", MICRODVD_STYLES[bit]);
+            break;
+        case 'c':
+            av_bprintf(new_line, "{\\c&H%06X&}", t->data1);
+            break;
+        case 'f':
+            av_bprintf(new_line, "{\\fn%.*s}", t->data_string_len, t->data_string);
+            break;
+        case 's':
+            av_bprintf(new_line, "{\\fs%d}", t->data1);
+            break;
+        case 'p':   /* only data1 == 0 produces output */
+            if (!t->data1)
+                av_bprintf(new_line, "{\\an8}");
+            break;
+        case 'o':
+            av_bprintf(new_line, "{\\pos(%d,%d)}", t->data1, t->data2);
+            break;
+        }
+
+        if (t->persistent == MICRODVD_PERSISTENT_ON)
+            t->persistent = MICRODVD_PERSISTENT_OPENED;
+    }
+}
+
+/* Emit the ASS reset for each non-persistent tag, walking tags[] and the style
+ * bits in reverse order, and clear the key of every non-persistent slot. */
+static void microdvd_close_no_persistent_tags(AVBPrint *new_line,
+                                              struct microdvd_tag *tags)
+{
+    int slot = sizeof(MICRODVD_TAGS) - 1;
+
+    while (slot-- > 0) {
+        struct microdvd_tag *t = &tags[slot];
+        int bit;
+        if (t->persistent != MICRODVD_PERSISTENT_OFF)
+            continue;
+        switch (t->key) {
+        case 'y':
+            for (bit = sizeof(MICRODVD_STYLES) - 2; bit >= 0; bit--)
+                if (t->data1 & (1 << bit))
+                    av_bprintf(new_line, "{\\%c0}", MICRODVD_STYLES[bit]);
+            break;
+        case 'c':
+            av_bprintf(new_line, "{\\c}");
+            break;
+        case 'f':
+            av_bprintf(new_line, "{\\fn}");
+            break;
+        case 's':
+            av_bprintf(new_line, "{\\fs}");
+            break;
+        }
+        t->key = 0;
+    }
+}
+
+/* Convert one MicroDVD packet into a single ASS rectangle added to the AVSubtitle
+ * in data. Returns avpkt->size, or the negative result of ff_ass_add_rect(). */
+static int microdvd_decode_frame(AVCodecContext *avctx,
+                                 void *data, int *got_sub_ptr, AVPacket *avpkt)
+{
+    FFASSDecoderContext *s = avctx->priv_data;
+    AVSubtitle *sub = data;
+    struct microdvd_tag tags[sizeof(MICRODVD_TAGS) - 1] = {{0}};
+    char *line = avpkt->data;
+    char *end  = avpkt->data + avpkt->size;
+    AVBPrint new_line;
+
+    if (avpkt->size <= 0)
+        return avpkt->size;
+
+    av_bprint_init(&new_line, 0, 2048);
+
+    // subtitle content: one iteration per '|'-separated line
+    while (line < end && *line) {
+        // parse MicroDVD tags, and open them in ASS
+        line = microdvd_load_tags(tags, line);
+        microdvd_open_tags(&new_line, tags);
+
+        // simple copy until EOL or forced carriage return
+        for (; line < end && *line && *line != '|'; line++)
+            av_bprint_chars(&new_line, *line, 1);
+
+        // line split: close the non-persistent tags, then emit the ASS break
+        if (line >= end || *line != '|')
+            continue;
+        microdvd_close_no_persistent_tags(&new_line, tags);
+        av_bprintf(&new_line, "\\N");
+        line++;
+    }
+
+    if (new_line.len) {
+        int ret = ff_ass_add_rect(sub, new_line.str, s->readorder++, 0, NULL, NULL);
+        av_bprint_finalize(&new_line, NULL);
+        if (ret < 0)
+            return ret;
+    }
+
+    *got_sub_ptr = sub->num_rects > 0;
+    return avpkt->size;
+}
+
+/* Decoder init: build the ASS header from the ASS defaults, overridden by the
+ * MicroDVD tags that microdvd_load_tags() finds in avctx->extradata, if any. */
+static int microdvd_init(AVCodecContext *avctx)
+{
+    struct microdvd_tag tags[sizeof(MICRODVD_TAGS) - 1] = {{0}};
+    AVBPrint font_buf;
+    int font_size = ASS_DEFAULT_FONT_SIZE, color     = ASS_DEFAULT_COLOR;
+    int bold      = ASS_DEFAULT_BOLD,      italic    = ASS_DEFAULT_ITALIC;
+    int underline = ASS_DEFAULT_UNDERLINE, alignment = ASS_DEFAULT_ALIGNMENT;
+    int i, sidx;
+
+    av_bprint_init(&font_buf, 0, AV_BPRINT_SIZE_AUTOMATIC);
+    av_bprintf(&font_buf, "%s", ASS_DEFAULT_FONT);
+
+    if (avctx->extradata) {
+        microdvd_load_tags(tags, avctx->extradata);
+        for (i = 0; i < sizeof(MICRODVD_TAGS) - 1; i++) {
+            const struct microdvd_tag *t = &tags[i];
+
+            switch (av_tolower(t->key)) {
+            case 'y':
+                for (sidx = 0; sidx < sizeof(MICRODVD_STYLES) - 1; sidx++) {
+                    const char style = MICRODVD_STYLES[sidx];
+                    if (!(t->data1 & (1 << sidx)))
+                        continue;
+                    if (style == 'i')
+                        italic = 1;
+                    else if (style == 'b')
+                        bold = 1;
+                    else if (style == 'u')
+                        underline = 1;
+                }
+                break;
+            case 'c': color     = t->data1; break;
+            case 's': font_size = t->data1; break;
+            case 'p': alignment = 8;        break;
+            case 'f':
+                av_bprint_clear(&font_buf);
+                av_bprintf(&font_buf, "%.*s", t->data_string_len, t->data_string);
+                break;
+            }
+        }
+    }
+    return ff_ass_subtitle_header(avctx, font_buf.str, font_size, color,
+                                  ASS_DEFAULT_BACK_COLOR, bold, italic,
+                                  underline, ASS_DEFAULT_BORDERSTYLE,
+                                  alignment);
+}
+
+AVCodec ff_microdvd_decoder = { /* codec table entry for the MicroDVD subtitle decoder */
+    .name         = "microdvd",
+    .long_name    = NULL_IF_CONFIG_SMALL("MicroDVD subtitle"),
+    .type         = AVMEDIA_TYPE_SUBTITLE,
+    .id           = AV_CODEC_ID_MICRODVD,
+    .init         = microdvd_init,          /* builds the ASS header, using extradata tags if present */
+    .decode       = microdvd_decode_frame,  /* turns one packet into one ASS rectangle */
+    .flush        = ff_ass_decoder_flush,
+    .priv_data_size = sizeof(FFASSDecoderContext), /* the decode callback reads its readorder counter */
+};
diff --git a/libavcodec/mimic.c b/libavcodec/mimic.c
index 6f43723..06fb393 100644
--- a/libavcodec/mimic.c
+++ b/libavcodec/mimic.c
@@ -2,20 +2,20 @@
  * Copyright (C) 2005  Ole André Vadla Ravnås <oleavr@gmail.com>
  * Copyright (C) 2008  Ramiro Polla
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -119,7 +119,8 @@ static av_cold int mimic_decode_end(AVCodecContext *avctx)
     MimicContext *ctx = avctx->priv_data;
     int i;
 
-    av_free(ctx->swap_buf);
+    av_freep(&ctx->swap_buf);
+    ctx->swap_buf_size = 0;
 
     for (i = 0; i < FF_ARRAY_ELEMS(ctx->frames); i++) {
         if (ctx->frames[i].f)
@@ -165,6 +166,7 @@ static av_cold int mimic_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
+#if HAVE_THREADS
 static int mimic_decode_update_thread_context(AVCodecContext *avctx, const AVCodecContext *avctx_from)
 {
     MimicContext *dst = avctx->priv_data, *src = avctx_from->priv_data;
@@ -178,7 +180,7 @@ static int mimic_decode_update_thread_context(AVCodecContext *avctx, const AVCod
 
     for (i = 0; i < FF_ARRAY_ELEMS(dst->frames); i++) {
         ff_thread_release_buffer(avctx, &dst->frames[i]);
-        if (src->frames[i].f->data[0]) {
+        if (i != src->next_cur_index && src->frames[i].f->data[0]) {
             ret = ff_thread_ref_frame(&dst->frames[i], &src->frames[i]);
             if (ret < 0)
                 return ret;
@@ -187,6 +189,7 @@ static int mimic_decode_update_thread_context(AVCodecContext *avctx, const AVCod
 
     return 0;
 }
+#endif
 
 static const int8_t vlcdec_lookup[9][64] = {
     {    0, },
@@ -254,7 +257,7 @@ static int vlc_decode_block(MimicContext *ctx, int num_coeffs, int qscale)
 
         value = get_bits(&ctx->gb, num_bits);
 
-        /* Libav's IDCT behaves somewhat different from the original code, so
+        /* FFmpeg's IDCT behaves somewhat different from the original code, so
          * a factor of 4 was added to the input */
 
         coeff = vlcdec_lookup[num_bits][value];
@@ -409,10 +412,8 @@ static int mimic_decode_frame(AVCodecContext *avctx, void *data,
     ctx->frames[ctx->cur_index].f->pict_type = is_pframe ? AV_PICTURE_TYPE_P :
                                                            AV_PICTURE_TYPE_I;
     if ((res = ff_thread_get_buffer(avctx, &ctx->frames[ctx->cur_index],
-                                    AV_GET_BUFFER_FLAG_REF)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+                                    AV_GET_BUFFER_FLAG_REF)) < 0)
         return res;
-    }
 
     ctx->next_prev_index = ctx->cur_index;
     ctx->next_cur_index  = (ctx->cur_index - 1) & 15;
@@ -451,6 +452,7 @@ static int mimic_decode_frame(AVCodecContext *avctx, void *data,
     return buf_size;
 }
 
+#if HAVE_THREADS
 static av_cold int mimic_init_thread_copy(AVCodecContext *avctx)
 {
     MimicContext *ctx = avctx->priv_data;
@@ -466,6 +468,7 @@ static av_cold int mimic_init_thread_copy(AVCodecContext *avctx)
 
     return 0;
 }
+#endif
 
 AVCodec ff_mimic_decoder = {
     .name                  = "mimic",
diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
new file mode 100644
index 0000000..3c43600
--- /dev/null
+++ b/libavcodec/mips/Makefile
@@ -0,0 +1,80 @@
+MIPSFPU-OBJS-$(CONFIG_AMRNB_DECODER)      += mips/acelp_filters_mips.o     \
+                                             mips/celp_filters_mips.o      \
+                                             mips/celp_math_mips.o         \
+                                             mips/acelp_vectors_mips.o
+MIPSFPU-OBJS-$(CONFIG_AMRWB_DECODER)      += mips/acelp_filters_mips.o     \
+                                             mips/celp_filters_mips.o      \
+                                             mips/amrwbdec_mips.o          \
+                                             mips/celp_math_mips.o         \
+                                             mips/acelp_vectors_mips.o
+MIPSFPU-OBJS-$(CONFIG_MPEGAUDIODSP)       += mips/mpegaudiodsp_mips_float.o
+MIPSDSP-OBJS-$(CONFIG_MPEGAUDIODSP)       += mips/mpegaudiodsp_mips_fixed.o
+MIPSFPU-OBJS-$(CONFIG_FFT)                += mips/fft_mips.o
+MIPSFPU-OBJS-$(CONFIG_FMTCONVERT)         += mips/fmtconvert_mips.o
+OBJS-$(CONFIG_AC3DSP)                     += mips/ac3dsp_mips.o
+OBJS-$(CONFIG_AAC_DECODER)                += mips/aacdec_mips.o            \
+                                             mips/aacsbr_mips.o            \
+                                             mips/sbrdsp_mips.o            \
+                                             mips/aacpsdsp_mips.o
+MIPSDSP-OBJS-$(CONFIG_AAC_ENCODER)        += mips/aaccoder_mips.o
+MIPSFPU-OBJS-$(CONFIG_AAC_ENCODER)        += mips/iirfilter_mips.o
+OBJS-$(CONFIG_HEVC_DECODER)               += mips/hevcdsp_init_mips.o      \
+                                             mips/hevcpred_init_mips.o
+OBJS-$(CONFIG_VP9_DECODER)                += mips/vp9dsp_init_mips.o
+OBJS-$(CONFIG_VP8_DECODER)                += mips/vp8dsp_init_mips.o
+OBJS-$(CONFIG_H264DSP)                    += mips/h264dsp_init_mips.o
+OBJS-$(CONFIG_H264QPEL)                   += mips/h264qpel_init_mips.o
+OBJS-$(CONFIG_H264CHROMA)                 += mips/h264chroma_init_mips.o
+OBJS-$(CONFIG_H264PRED)                   += mips/h264pred_init_mips.o
+OBJS-$(CONFIG_H263DSP)                    += mips/h263dsp_init_mips.o
+OBJS-$(CONFIG_QPELDSP)                    += mips/qpeldsp_init_mips.o
+OBJS-$(CONFIG_HPELDSP)                    += mips/hpeldsp_init_mips.o
+OBJS-$(CONFIG_BLOCKDSP)                   += mips/blockdsp_init_mips.o
+OBJS-$(CONFIG_PIXBLOCKDSP)                += mips/pixblockdsp_init_mips.o
+OBJS-$(CONFIG_IDCTDSP)                    += mips/idctdsp_init_mips.o
+OBJS-$(CONFIG_MPEGVIDEO)                  += mips/mpegvideo_init_mips.o
+OBJS-$(CONFIG_MPEGVIDEOENC)               += mips/mpegvideoencdsp_init_mips.o
+OBJS-$(CONFIG_ME_CMP)                     += mips/me_cmp_init_mips.o
+OBJS-$(CONFIG_MPEG4_DECODER)              += mips/xvididct_init_mips.o
+MSA-OBJS-$(CONFIG_HEVC_DECODER)           += mips/hevcdsp_msa.o            \
+                                             mips/hevc_mc_uni_msa.o        \
+                                             mips/hevc_mc_uniw_msa.o       \
+                                             mips/hevc_mc_bi_msa.o         \
+                                             mips/hevc_mc_biw_msa.o        \
+                                             mips/hevc_idct_msa.o          \
+                                             mips/hevc_lpf_sao_msa.o       \
+                                             mips/hevcpred_msa.o
+MSA-OBJS-$(CONFIG_VP9_DECODER)            += mips/vp9_mc_msa.o             \
+                                             mips/vp9_lpf_msa.o            \
+                                             mips/vp9_idct_msa.o           \
+                                             mips/vp9_intra_msa.o
+MSA-OBJS-$(CONFIG_VP8_DECODER)            += mips/vp8_mc_msa.o             \
+                                             mips/vp8_idct_msa.o           \
+                                             mips/vp8_lpf_msa.o
+MSA-OBJS-$(CONFIG_H264DSP)                += mips/h264dsp_msa.o            \
+                                             mips/h264idct_msa.o
+MSA-OBJS-$(CONFIG_H264QPEL)               += mips/h264qpel_msa.o
+MSA-OBJS-$(CONFIG_H264CHROMA)             += mips/h264chroma_msa.o
+MSA-OBJS-$(CONFIG_H264PRED)               += mips/h264pred_msa.o
+MSA-OBJS-$(CONFIG_H263DSP)                += mips/h263dsp_msa.o
+MSA-OBJS-$(CONFIG_QPELDSP)                += mips/qpeldsp_msa.o
+MSA-OBJS-$(CONFIG_HPELDSP)                += mips/hpeldsp_msa.o
+MSA-OBJS-$(CONFIG_BLOCKDSP)               += mips/blockdsp_msa.o
+MSA-OBJS-$(CONFIG_PIXBLOCKDSP)            += mips/pixblockdsp_msa.o
+MSA-OBJS-$(CONFIG_IDCTDSP)                += mips/idctdsp_msa.o           \
+                                             mips/simple_idct_msa.o
+MSA-OBJS-$(CONFIG_MPEGVIDEO)              += mips/mpegvideo_msa.o
+MSA-OBJS-$(CONFIG_MPEGVIDEOENC)           += mips/mpegvideoencdsp_msa.o
+MSA-OBJS-$(CONFIG_ME_CMP)                 += mips/me_cmp_msa.o
+MMI-OBJS                                  += mips/constants.o
+MMI-OBJS-$(CONFIG_H264DSP)                += mips/h264dsp_mmi.o
+MMI-OBJS-$(CONFIG_H264CHROMA)             += mips/h264chroma_mmi.o
+MMI-OBJS-$(CONFIG_H264PRED)               += mips/h264pred_mmi.o
+MMI-OBJS-$(CONFIG_MPEGVIDEO)              += mips/mpegvideo_mmi.o
+MMI-OBJS-$(CONFIG_IDCTDSP)                += mips/idctdsp_mmi.o           \
+                                             mips/simple_idct_mmi.o
+MMI-OBJS-$(CONFIG_MPEG4_DECODER)          += mips/xvid_idct_mmi.o
+MMI-OBJS-$(CONFIG_BLOCKDSP)               += mips/blockdsp_mmi.o
+MMI-OBJS-$(CONFIG_PIXBLOCKDSP)            += mips/pixblockdsp_mmi.o
+MMI-OBJS-$(CONFIG_H264QPEL)               += mips/h264qpel_mmi.o
+MMI-OBJS-$(CONFIG_HPELDSP)                += mips/hpeldsp_mmi.o
diff --git a/libavcodec/mips/aaccoder_mips.c b/libavcodec/mips/aaccoder_mips.c
new file mode 100644
index 0000000..d690c8c
--- /dev/null
+++ b/libavcodec/mips/aaccoder_mips.c
@@ -0,0 +1,2502 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Stanislav Ocovaj (socovaj@mips.com)
+ *          Szabolcs Pal     (sabolc@mips.com)
+ *
+ * AAC coefficients encoder optimized for MIPS floating-point architecture
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/aaccoder.c
+ */
+
+#include "libavutil/libm.h"
+
+#include <float.h>
+#include "libavutil/mathematics.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/put_bits.h"
+#include "libavcodec/aac.h"
+#include "libavcodec/aacenc.h"
+#include "libavcodec/aactab.h"
+#include "libavcodec/aacenctab.h"
+#include "libavcodec/aacenc_utils.h"
+
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+typedef struct BandCodingPath { /* not used in the visible part of this file; field notes are inferred from names - TODO confirm */
+    int prev_idx; /* presumably the index of the preceding path entry */
+    float cost; /* presumably the accumulated cost of this path */
+    int run; /* presumably a run length */
+} BandCodingPath;
+
+static const uint8_t uquad_sign_bits[81] = { /* entry k = number of non-zero digits of k written as four base-3 digits (81 = 3^4); presumably the sign-bit count of an unsigned quad - confirm at the use site */
+    0, 1, 1, 1, 2, 2, 1, 2, 2,
+    1, 2, 2, 2, 3, 3, 2, 3, 3,
+    1, 2, 2, 2, 3, 3, 2, 3, 3,
+    1, 2, 2, 2, 3, 3, 2, 3, 3,
+    2, 3, 3, 3, 4, 4, 3, 4, 4,
+    2, 3, 3, 3, 4, 4, 3, 4, 4,
+    1, 2, 2, 2, 3, 3, 2, 3, 3,
+    2, 3, 3, 3, 4, 4, 3, 4, 4,
+    2, 3, 3, 3, 4, 4, 3, 4, 4
+};
+
+static const uint8_t upair7_sign_bits[64] = { /* 8x8 table: entry 8*a + b = (a != 0) + (b != 0) for a, b in 0..7; presumably the sign-bit count of an unsigned pair - confirm at the use site */
+    0, 1, 1, 1, 1, 1, 1, 1,
+    1, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2,
+};
+
+static const uint8_t upair12_sign_bits[169] = { /* 13x13 table: entry 13*a + b = (a != 0) + (b != 0) for a, b in 0..12; presumably the sign-bit count of an unsigned pair - confirm at the use site */
+    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+};
+
+static const uint8_t esc_sign_bits[289] = { /* 17x17 table: entry 17*a + b = (a != 0) + (b != 0) for a, b in 0..16; presumably the sign-bit count of an escape-codebook pair - confirm at the use site */
+    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+};
+
+/**
+ * Functions derived from the template function, each optimized for quantizing and encoding a band
+ */
+static void quantize_and_encode_band_cost_SQUAD_mips(struct AACEncContext *s, /* s->scoefs receives the abs_pow34_v() output */
+                                                     PutBitContext *pb, const float *in, float *out, /* pb: bit writer; in: source coefficients; out: dequantized values, may be NULL */
+                                                     const float *scaled, int size, int scale_idx, /* scaled: overwritten with s->scoefs; size: coefficient count, consumed four per iteration */
+                                                     int cb, const float lambda, const float uplim, /* cb: codebook number, tables are indexed with cb-1; lambda, uplim: not referenced */
+                                                     int *bits, float *energy, const float ROUNDING) /* bits, ROUNDING: not referenced; energy: receives the sum of squared outputs, may be NULL */
+{ /* Signed-quad encoder: quantizes each group of four coefficients to values in -1..1 and writes one codeword per group to pb. */
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; /* factor applied to scaled[] before rounding */
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512]; /* factor applied to codebook vectors to form the output values */
+    int i;
+    int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;
+
+    uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1]; /* codeword lengths */
+    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1]; /* codeword values */
+    float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1]; /* four floats per codebook index (see vec below) */
+
+    abs_pow34_v(s->scoefs, in, size); /* presumably stores |in[k]|^(3/4) in s->scoefs (helper defined elsewhere) - confirm */
+    scaled = s->scoefs; /* the caller-supplied scaled pointer is discarded from here on */
+    for (i = 0; i < size; i += 4) { /* one codeword per group of four coefficients */
+        int curidx;
+        int *in_int = (int *)&in[i]; /* raw words of in[i..i+3], read only by the asm below */
+        int t0, t1, t2, t3, t4, t5, t6, t7;
+        const float *vec;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD; /* float result is truncated by the conversion to int */
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                      \n\t"
+            ".set noreorder                 \n\t"
+
+            "slt    %[qc1], $zero,  %[qc1]  \n\t" /* qcN = (0 < qcN): magnitude limited to 0 or 1 */
+            "slt    %[qc2], $zero,  %[qc2]  \n\t"
+            "slt    %[qc3], $zero,  %[qc3]  \n\t"
+            "slt    %[qc4], $zero,  %[qc4]  \n\t"
+            "lw     %[t0],  0(%[in_int])    \n\t" /* t0..t3 = raw words of in[i..i+3] */
+            "lw     %[t1],  4(%[in_int])    \n\t"
+            "lw     %[t2],  8(%[in_int])    \n\t"
+            "lw     %[t3],  12(%[in_int])   \n\t"
+            "srl    %[t0],  %[t0],  31      \n\t" /* keep bit 31 only: 1 when the input word has its sign bit set */
+            "srl    %[t1],  %[t1],  31      \n\t"
+            "srl    %[t2],  %[t2],  31      \n\t"
+            "srl    %[t3],  %[t3],  31      \n\t"
+            "subu   %[t4],  $zero,  %[qc1]  \n\t" /* t4..t7 = -qcN */
+            "subu   %[t5],  $zero,  %[qc2]  \n\t"
+            "subu   %[t6],  $zero,  %[qc3]  \n\t"
+            "subu   %[t7],  $zero,  %[qc4]  \n\t"
+            "movn   %[qc1], %[t4],  %[t0]   \n\t" /* qcN = -qcN where the sign bit was set */
+            "movn   %[qc2], %[t5],  %[t1]   \n\t"
+            "movn   %[qc3], %[t6],  %[t2]   \n\t"
+            "movn   %[qc4], %[t7],  %[t3]   \n\t"
+
+            ".set pop                       \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+
+        curidx = qc1;
+        curidx *= 3;
+        curidx += qc2;
+        curidx *= 3;
+        curidx += qc3;
+        curidx *= 3;
+        curidx += qc4;
+        curidx += 40; /* curidx = 27*qc1 + 9*qc2 + 3*qc3 + qc4 + 40, i.e. 0..80 for qcN in -1..1 */
+
+        put_bits(pb, p_bits[curidx], p_codes[curidx]); /* emit the codeword for this group */
+
+        if (out || energy) { /* dequantized values are only computed when a consumer exists */
+            float e1,e2,e3,e4;
+            vec = &p_vec[curidx*4];
+            e1 = vec[0] * IQ;
+            e2 = vec[1] * IQ;
+            e3 = vec[2] * IQ;
+            e4 = vec[3] * IQ;
+            if (out) {
+                out[i+0] = e1;
+                out[i+1] = e2;
+                out[i+2] = e3;
+                out[i+3] = e4;
+            }
+            if (energy)
+                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
+        }
+    }
+    if (energy)
+        *energy = qenergy;
+}
+
+static void quantize_and_encode_band_cost_UQUAD_mips(struct AACEncContext *s, /* s->scoefs receives the abs_pow34_v() output */
+                                                     PutBitContext *pb, const float *in, float *out, /* pb: bit writer; in: source coefficients; out: dequantized values, may be NULL */
+                                                     const float *scaled, int size, int scale_idx, /* scaled: overwritten with s->scoefs; size: coefficient count, consumed four per iteration */
+                                                     int cb, const float lambda, const float uplim, /* cb: codebook number, tables are indexed with cb-1; lambda, uplim: not referenced */
+                                                     int *bits, float *energy, const float ROUNDING) /* bits, ROUNDING: not referenced; energy: receives the sum of squared outputs, may be NULL */
+{ /* Unsigned-quad encoder: quantizes each group of four magnitudes to 0..2 and writes codeword plus one sign bit per non-zero value to pb. */
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; /* factor applied to scaled[] before rounding */
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512]; /* factor applied to codebook vectors to form the output values */
+    int i;
+    int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;
+
+    uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1]; /* codeword lengths */
+    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1]; /* codeword values */
+    float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1]; /* four floats per codebook index (see vec below) */
+
+    abs_pow34_v(s->scoefs, in, size); /* presumably stores |in[k]|^(3/4) in s->scoefs (helper defined elsewhere) - confirm */
+    scaled = s->scoefs; /* the caller-supplied scaled pointer is discarded from here on */
+    for (i = 0; i < size; i += 4) { /* one codeword per group of four coefficients */
+        int curidx, sign, count;
+        int *in_int = (int *)&in[i]; /* raw words of in[i..i+3], read only by the asm below */
+        uint8_t v_bits;
+        unsigned int v_codes;
+        int t0, t1, t2, t3, t4;
+        const float *vec;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD; /* float result is truncated by the conversion to int */
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                              \n\t"
+            ".set noreorder                         \n\t"
+
+            "ori    %[t4],      $zero,      2       \n\t" /* t4 = 2, the largest magnitude kept */
+            "ori    %[sign],    $zero,      0       \n\t" /* sign = 0 */
+            "slt    %[t0],      %[t4],      %[qc1]  \n\t" /* tN = (2 < qcN) */
+            "slt    %[t1],      %[t4],      %[qc2]  \n\t"
+            "slt    %[t2],      %[t4],      %[qc3]  \n\t"
+            "slt    %[t3],      %[t4],      %[qc4]  \n\t"
+            "movn   %[qc1],     %[t4],      %[t0]   \n\t" /* qcN = 2 where it exceeded 2 */
+            "movn   %[qc2],     %[t4],      %[t1]   \n\t"
+            "movn   %[qc3],     %[t4],      %[t2]   \n\t"
+            "movn   %[qc4],     %[t4],      %[t3]   \n\t"
+            "lw     %[t0],      0(%[in_int])        \n\t" /* t0..t3 = raw words of in[i..i+3] */
+            "lw     %[t1],      4(%[in_int])        \n\t"
+            "lw     %[t2],      8(%[in_int])        \n\t"
+            "lw     %[t3],      12(%[in_int])       \n\t"
+            "slt    %[t0],      %[t0],      $zero   \n\t" /* t0 = 1 when the first input word is negative */
+            "movn   %[sign],    %[t0],      %[qc1]  \n\t" /* take that bit only if qc1 != 0 */
+            "slt    %[t1],      %[t1],      $zero   \n\t"
+            "slt    %[t2],      %[t2],      $zero   \n\t"
+            "slt    %[t3],      %[t3],      $zero   \n\t"
+            "sll    %[t0],      %[sign],    1       \n\t" /* t0 = (sign << 1) | next sign bit ... */
+            "or     %[t0],      %[t0],      %[t1]   \n\t"
+            "movn   %[sign],    %[t0],      %[qc2]  \n\t" /* ... committed only if qc2 != 0 */
+            "slt    %[t4],      $zero,      %[qc1]  \n\t" /* t4, t1, count = (qcN > 0) for N = 1, 2, 3 */
+            "slt    %[t1],      $zero,      %[qc2]  \n\t"
+            "slt    %[count],   $zero,      %[qc3]  \n\t"
+            "sll    %[t0],      %[sign],    1       \n\t"
+            "or     %[t0],      %[t0],      %[t2]   \n\t"
+            "movn   %[sign],    %[t0],      %[qc3]  \n\t" /* append third sign bit if qc3 != 0 */
+            "slt    %[t2],      $zero,      %[qc4]  \n\t"
+            "addu   %[count],   %[count],   %[t4]   \n\t"
+            "addu   %[count],   %[count],   %[t1]   \n\t"
+            "sll    %[t0],      %[sign],    1       \n\t"
+            "or     %[t0],      %[t0],      %[t3]   \n\t"
+            "movn   %[sign],    %[t0],      %[qc4]  \n\t" /* append fourth sign bit if qc4 != 0 */
+            "addu   %[count],   %[count],   %[t2]   \n\t" /* count = number of qcN greater than zero */
+
+            ".set pop                               \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [sign]"=&r"(sign), [count]"=&r"(count),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+
+        curidx = qc1;
+        curidx *= 3;
+        curidx += qc2;
+        curidx *= 3;
+        curidx += qc3;
+        curidx *= 3;
+        curidx += qc4; /* curidx = 27*qc1 + 9*qc2 + 3*qc3 + qc4 */
+
+        v_codes = (p_codes[curidx] << count) | (sign & ((1 << count) - 1)); /* codeword followed by the low count bits of sign */
+        v_bits  = p_bits[curidx] + count;
+        put_bits(pb, v_bits, v_codes);
+
+        if (out || energy) { /* dequantized values are only computed when a consumer exists */
+            float e1,e2,e3,e4;
+            vec = &p_vec[curidx*4];
+            e1 = copysignf(vec[0] * IQ, in[i+0]); /* output takes the sign of the corresponding input */
+            e2 = copysignf(vec[1] * IQ, in[i+1]);
+            e3 = copysignf(vec[2] * IQ, in[i+2]);
+            e4 = copysignf(vec[3] * IQ, in[i+3]);
+            if (out) {
+                out[i+0] = e1;
+                out[i+1] = e2;
+                out[i+2] = e3;
+                out[i+3] = e4;
+            }
+            if (energy)
+                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
+        }
+    }
+    if (energy)
+        *energy = qenergy;
+}
+
+static void quantize_and_encode_band_cost_SPAIR_mips(struct AACEncContext *s, /* s->scoefs receives the abs_pow34_v() output */
+                                                     PutBitContext *pb, const float *in, float *out, /* pb: bit writer; in: source coefficients; out: dequantized values, may be NULL */
+                                                     const float *scaled, int size, int scale_idx, /* scaled: overwritten with s->scoefs; size: coefficient count, consumed four per iteration */
+                                                     int cb, const float lambda, const float uplim, /* cb: codebook number, tables are indexed with cb-1; lambda, uplim: not referenced */
+                                                     int *bits, float *energy, const float ROUNDING) /* bits, ROUNDING: not referenced; energy: receives the sum of squared outputs, may be NULL */
+{ /* Signed-pair encoder: quantizes coefficients to -4..4 and writes the codewords of two consecutive pairs to pb with a single put_bits() call. */
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; /* factor applied to scaled[] before rounding */
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512]; /* factor applied to codebook vectors to form the output values */
+    int i;
+    int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;
+
+    uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1]; /* codeword lengths */
+    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1]; /* codeword values */
+    float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1]; /* two floats per codebook index (see vec1/vec2 below) */
+
+    abs_pow34_v(s->scoefs, in, size); /* presumably stores |in[k]|^(3/4) in s->scoefs (helper defined elsewhere) - confirm */
+    scaled = s->scoefs; /* the caller-supplied scaled pointer is discarded from here on */
+    for (i = 0; i < size; i += 4) { /* two pairs, i.e. four coefficients, per iteration */
+        int curidx, curidx2;
+        int *in_int = (int *)&in[i]; /* raw words of in[i..i+3], read only by the asm below */
+        uint8_t v_bits;
+        unsigned int v_codes;
+        int t0, t1, t2, t3, t4, t5, t6, t7;
+        const float *vec1, *vec2;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD; /* float result is truncated by the conversion to int */
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                      \n\t"
+            ".set noreorder                 \n\t"
+
+            "ori    %[t4],  $zero,  4       \n\t" /* t4 = 4, the largest magnitude kept */
+            "slt    %[t0],  %[t4],  %[qc1]  \n\t" /* tN = (4 < qcN) */
+            "slt    %[t1],  %[t4],  %[qc2]  \n\t"
+            "slt    %[t2],  %[t4],  %[qc3]  \n\t"
+            "slt    %[t3],  %[t4],  %[qc4]  \n\t"
+            "movn   %[qc1], %[t4],  %[t0]   \n\t" /* qcN = 4 where it exceeded 4 */
+            "movn   %[qc2], %[t4],  %[t1]   \n\t"
+            "movn   %[qc3], %[t4],  %[t2]   \n\t"
+            "movn   %[qc4], %[t4],  %[t3]   \n\t"
+            "lw     %[t0],  0(%[in_int])    \n\t" /* t0..t3 = raw words of in[i..i+3] */
+            "lw     %[t1],  4(%[in_int])    \n\t"
+            "lw     %[t2],  8(%[in_int])    \n\t"
+            "lw     %[t3],  12(%[in_int])   \n\t"
+            "srl    %[t0],  %[t0],  31      \n\t" /* keep bit 31 only: 1 when the input word has its sign bit set */
+            "srl    %[t1],  %[t1],  31      \n\t"
+            "srl    %[t2],  %[t2],  31      \n\t"
+            "srl    %[t3],  %[t3],  31      \n\t"
+            "subu   %[t4],  $zero,  %[qc1]  \n\t" /* t4..t7 = -qcN */
+            "subu   %[t5],  $zero,  %[qc2]  \n\t"
+            "subu   %[t6],  $zero,  %[qc3]  \n\t"
+            "subu   %[t7],  $zero,  %[qc4]  \n\t"
+            "movn   %[qc1], %[t4],  %[t0]   \n\t" /* qcN = -qcN where the sign bit was set */
+            "movn   %[qc2], %[t5],  %[t1]   \n\t"
+            "movn   %[qc3], %[t6],  %[t2]   \n\t"
+            "movn   %[qc4], %[t7],  %[t3]   \n\t"
+
+            ".set pop                       \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+
+        curidx = 9 * qc1;
+        curidx += qc2 + 40; /* first pair: 9*qc1 + qc2 + 40, i.e. 0..80 for qcN in -4..4 */
+
+        curidx2 = 9 * qc3;
+        curidx2 += qc4 + 40; /* second pair, same mapping */
+
+        v_codes = (p_codes[curidx] << p_bits[curidx2]) | (p_codes[curidx2]); /* first codeword in the high bits, second below it */
+        v_bits  = p_bits[curidx] + p_bits[curidx2];
+        put_bits(pb, v_bits, v_codes);
+
+        if (out || energy) { /* dequantized values are only computed when a consumer exists */
+            float e1,e2,e3,e4;
+            vec1 = &p_vec[curidx*2 ];
+            vec2 = &p_vec[curidx2*2];
+            e1 = vec1[0] * IQ;
+            e2 = vec1[1] * IQ;
+            e3 = vec2[0] * IQ;
+            e4 = vec2[1] * IQ;
+            if (out) {
+                out[i+0] = e1;
+                out[i+1] = e2;
+                out[i+2] = e3;
+                out[i+3] = e4;
+            }
+            if (energy)
+                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
+        }
+    }
+    if (energy)
+        *energy = qenergy;
+}
+
+static void quantize_and_encode_band_cost_UPAIR7_mips(struct AACEncContext *s, /* s->scoefs receives the abs_pow34_v() output */
+                                                      PutBitContext *pb, const float *in, float *out, /* pb: bit writer; in: source coefficients; out: dequantized values, may be NULL */
+                                                      const float *scaled, int size, int scale_idx, /* scaled: overwritten with s->scoefs; size: coefficient count, consumed four per iteration */
+                                                      int cb, const float lambda, const float uplim, /* cb: codebook number, tables are indexed with cb-1; lambda, uplim: not referenced */
+                                                      int *bits, float *energy, const float ROUNDING) /* bits, ROUNDING: not referenced; energy: receives the sum of squared outputs, may be NULL */
+{ /* Unsigned-pair encoder: quantizes magnitudes to 0..7 and writes, for each of two pairs per iteration, codeword plus one sign bit per non-zero value to pb. */
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; /* factor applied to scaled[] before rounding */
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512]; /* factor applied to codebook vectors to form the output values */
+    int i;
+    int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;
+
+    uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1]; /* codeword lengths */
+    uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1]; /* codeword values */
+    float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1]; /* two floats per codebook index (see vec1/vec2 below) */
+
+    abs_pow34_v(s->scoefs, in, size); /* presumably stores |in[k]|^(3/4) in s->scoefs (helper defined elsewhere) - confirm */
+    scaled = s->scoefs; /* the caller-supplied scaled pointer is discarded from here on */
+    for (i = 0; i < size; i += 4) { /* two pairs, i.e. four coefficients, per iteration */
+        int curidx1, curidx2, sign1, count1, sign2, count2;
+        int *in_int = (int *)&in[i]; /* raw words of in[i..i+3], read only by the asm below */
+        uint8_t v_bits;
+        unsigned int v_codes;
+        int t0, t1, t2, t3, t4;
+        const float *vec1, *vec2;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD; /* float result is truncated by the conversion to int */
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                              \n\t"
+            ".set noreorder                         \n\t"
+
+            "ori    %[t4],      $zero,      7       \n\t" /* t4 = 7, the largest magnitude kept */
+            "ori    %[sign1],   $zero,      0       \n\t" /* sign1 = sign2 = 0 */
+            "ori    %[sign2],   $zero,      0       \n\t"
+            "slt    %[t0],      %[t4],      %[qc1]  \n\t" /* tN = (7 < qcN) */
+            "slt    %[t1],      %[t4],      %[qc2]  \n\t"
+            "slt    %[t2],      %[t4],      %[qc3]  \n\t"
+            "slt    %[t3],      %[t4],      %[qc4]  \n\t"
+            "movn   %[qc1],     %[t4],      %[t0]   \n\t" /* qcN = 7 where it exceeded 7 */
+            "movn   %[qc2],     %[t4],      %[t1]   \n\t"
+            "movn   %[qc3],     %[t4],      %[t2]   \n\t"
+            "movn   %[qc4],     %[t4],      %[t3]   \n\t"
+            "lw     %[t0],      0(%[in_int])        \n\t" /* t0..t3 = raw words of in[i..i+3] */
+            "lw     %[t1],      4(%[in_int])        \n\t"
+            "lw     %[t2],      8(%[in_int])        \n\t"
+            "lw     %[t3],      12(%[in_int])       \n\t"
+            "slt    %[t0],      %[t0],      $zero   \n\t" /* t0 = 1 when the first input word is negative */
+            "movn   %[sign1],   %[t0],      %[qc1]  \n\t" /* take that bit only if qc1 != 0 */
+            "slt    %[t2],      %[t2],      $zero   \n\t"
+            "movn   %[sign2],   %[t2],      %[qc3]  \n\t" /* same for the second pair's first value */
+            "slt    %[t1],      %[t1],      $zero   \n\t"
+            "sll    %[t0],      %[sign1],   1       \n\t" /* t0 = (sign1 << 1) | next sign bit ... */
+            "or     %[t0],      %[t0],      %[t1]   \n\t"
+            "movn   %[sign1],   %[t0],      %[qc2]  \n\t" /* ... committed only if qc2 != 0 */
+            "slt    %[t3],      %[t3],      $zero   \n\t"
+            "sll    %[t0],      %[sign2],   1       \n\t"
+            "or     %[t0],      %[t0],      %[t3]   \n\t"
+            "movn   %[sign2],   %[t0],      %[qc4]  \n\t" /* append second sign bit of pair two if qc4 != 0 */
+            "slt    %[count1],  $zero,      %[qc1]  \n\t" /* countN = number of values greater than zero in pair N */
+            "slt    %[t1],      $zero,      %[qc2]  \n\t"
+            "slt    %[count2],  $zero,      %[qc3]  \n\t"
+            "slt    %[t2],      $zero,      %[qc4]  \n\t"
+            "addu   %[count1],  %[count1],  %[t1]   \n\t"
+            "addu   %[count2],  %[count2],  %[t2]   \n\t"
+
+            ".set pop                               \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
+              [sign2]"=&r"(sign2), [count2]"=&r"(count2),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4)
+            : [in_int]"r"(in_int)
+            /* no hard-register clobbers: every scratch register is a named output operand */
+            : "memory"
+        );
+
+        curidx1  = 8 * qc1;
+        curidx1 += qc2; /* first pair: 8*qc1 + qc2 */
+
+        v_codes = (p_codes[curidx1] << count1) | sign1; /* codeword followed by count1 sign bits */
+        v_bits  = p_bits[curidx1] + count1;
+        put_bits(pb, v_bits, v_codes);
+
+        curidx2  = 8 * qc3;
+        curidx2 += qc4; /* second pair: 8*qc3 + qc4 */
+
+        v_codes = (p_codes[curidx2] << count2) | sign2; /* codeword followed by count2 sign bits */
+        v_bits  = p_bits[curidx2] + count2;
+        put_bits(pb, v_bits, v_codes);
+
+        if (out || energy) { /* dequantized values are only computed when a consumer exists */
+            float e1,e2,e3,e4;
+            vec1 = &p_vec[curidx1*2];
+            vec2 = &p_vec[curidx2*2];
+            e1 = copysignf(vec1[0] * IQ, in[i+0]); /* output takes the sign of the corresponding input */
+            e2 = copysignf(vec1[1] * IQ, in[i+1]);
+            e3 = copysignf(vec2[0] * IQ, in[i+2]);
+            e4 = copysignf(vec2[1] * IQ, in[i+3]);
+            if (out) {
+                out[i+0] = e1;
+                out[i+1] = e2;
+                out[i+2] = e3;
+                out[i+3] = e4;
+            }
+            if (energy)
+                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
+        }
+    }
+    if (energy)
+        *energy = qenergy;
+}
+
+/**
+ * Quantize a band with an unsigned-pair codebook whose largest index value
+ * is 12 and write the codewords plus sign bits to the bitstream.
+ * The dispatch table in this file routes cb 9 and cb 10 here.
+ *
+ * Four coefficients (two pairs) are handled per iteration, so @p size is
+ * expected to be a multiple of 4.
+ *
+ * @param s         encoder context; s->scoefs receives the abs_pow34_v() output
+ * @param pb        bit writer handed to put_bits()
+ * @param in        input coefficients; their sign bits are read as raw words
+ * @param out       if non-NULL, receives the reconstructed coefficients
+ * @param scaled    ignored on entry; replaced by s->scoefs
+ * @param size      number of coefficients in the band
+ * @param scale_idx scalefactor index selecting the Q34 / IQ table entries
+ * @param cb        codebook number; tables are indexed with cb-1
+ * @param lambda    unused in this function
+ * @param uplim     unused in this function
+ * @param bits      unused in this function
+ * @param energy    if non-NULL, receives the sum of squares of the
+ *                  reconstructed coefficients
+ * @param ROUNDING  offset added to the scaled value before it is truncated
+ *                  to an integer (e.g. ROUND_STANDARD or ROUND_TO_ZERO)
+ */
+static void quantize_and_encode_band_cost_UPAIR12_mips(struct AACEncContext *s,
+                                                       PutBitContext *pb, const float *in, float *out,
+                                                       const float *scaled, int size, int scale_idx,
+                                                       int cb, const float lambda, const float uplim,
+                                                       int *bits, float *energy, const float ROUNDING)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;
+
+    uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1];
+    uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
+    float    *p_vec   = (float   *)ff_aac_codebook_vectors[cb-1];
+
+    /* The caller's scaled pointer is not used; the values are recomputed
+     * into the context scratch buffer. */
+    abs_pow34_v(s->scoefs, in, size);
+    scaled = s->scoefs;
+    for (i = 0; i < size; i += 4) {
+        int curidx1, curidx2, sign1, count1, sign2, count2;
+        int *in_int = (int *)&in[i];
+        uint8_t v_bits;
+        unsigned int v_codes;
+        int t0, t1, t2, t3, t4;
+        const float *vec1, *vec2;
+
+        /* Honour the rounding mode requested by the caller instead of always
+         * using ROUND_STANDARD, so a round-to-zero request is not dropped. */
+        qc1 = scaled[i  ] * Q34 + ROUNDING;
+        qc2 = scaled[i+1] * Q34 + ROUNDING;
+        qc3 = scaled[i+2] * Q34 + ROUNDING;
+        qc4 = scaled[i+3] * Q34 + ROUNDING;
+
+        __asm__ volatile (
+            ".set push                              \n\t"
+            ".set noreorder                         \n\t"
+
+            /* t4 = 12 (clamp value); sign words start empty */
+            "ori    %[t4],      $zero,      12      \n\t"
+            "ori    %[sign1],   $zero,      0       \n\t"
+            "ori    %[sign2],   $zero,      0       \n\t"
+            /* qc = min(qc, 12) */
+            "slt    %[t0],      %[t4],      %[qc1]  \n\t"
+            "slt    %[t1],      %[t4],      %[qc2]  \n\t"
+            "slt    %[t2],      %[t4],      %[qc3]  \n\t"
+            "slt    %[t3],      %[t4],      %[qc4]  \n\t"
+            "movn   %[qc1],     %[t4],      %[t0]   \n\t"
+            "movn   %[qc2],     %[t4],      %[t1]   \n\t"
+            "movn   %[qc3],     %[t4],      %[t2]   \n\t"
+            "movn   %[qc4],     %[t4],      %[t3]   \n\t"
+            /* load the raw words of in[i..i+3]; a negative word means the
+             * float sign bit is set */
+            "lw     %[t0],      0(%[in_int])        \n\t"
+            "lw     %[t1],      4(%[in_int])        \n\t"
+            "lw     %[t2],      8(%[in_int])        \n\t"
+            "lw     %[t3],      12(%[in_int])       \n\t"
+            /* pack one sign bit per non-zero quantized value, the first
+             * value of each pair ending up in the higher bit */
+            "slt    %[t0],      %[t0],      $zero   \n\t"
+            "movn   %[sign1],   %[t0],      %[qc1]  \n\t"
+            "slt    %[t2],      %[t2],      $zero   \n\t"
+            "movn   %[sign2],   %[t2],      %[qc3]  \n\t"
+            "slt    %[t1],      %[t1],      $zero   \n\t"
+            "sll    %[t0],      %[sign1],   1       \n\t"
+            "or     %[t0],      %[t0],      %[t1]   \n\t"
+            "movn   %[sign1],   %[t0],      %[qc2]  \n\t"
+            "slt    %[t3],      %[t3],      $zero   \n\t"
+            "sll    %[t0],      %[sign2],   1       \n\t"
+            "or     %[t0],      %[t0],      %[t3]   \n\t"
+            "movn   %[sign2],   %[t0],      %[qc4]  \n\t"
+            /* count = number of positive quantized values in the pair,
+             * i.e. the number of sign bits packed above */
+            "slt    %[count1],  $zero,      %[qc1]  \n\t"
+            "slt    %[t1],      $zero,      %[qc2]  \n\t"
+            "slt    %[count2],  $zero,      %[qc3]  \n\t"
+            "slt    %[t2],      $zero,      %[qc4]  \n\t"
+            "addu   %[count1],  %[count1],  %[t1]   \n\t"
+            "addu   %[count2],  %[count2],  %[t2]   \n\t"
+
+            ".set pop                               \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
+              [sign2]"=&r"(sign2), [count2]"=&r"(count2),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+
+        /* First pair: 13 entries per row because values run 0..12. */
+        curidx1  = 13 * qc1;
+        curidx1 += qc2;
+
+        /* Codeword followed by the sign bits of the non-zero values. */
+        v_codes = (p_codes[curidx1] << count1) | sign1;
+        v_bits  = p_bits[curidx1] + count1;
+        put_bits(pb, v_bits, v_codes);
+
+        /* Second pair. */
+        curidx2  = 13 * qc3;
+        curidx2 += qc4;
+
+        v_codes = (p_codes[curidx2] << count2) | sign2;
+        v_bits  = p_bits[curidx2] + count2;
+        put_bits(pb, v_bits, v_codes);
+
+        if (out || energy) {
+            float e1,e2,e3,e4;
+            vec1 = &p_vec[curidx1*2];
+            vec2 = &p_vec[curidx2*2];
+            /* Reconstructed value: codebook vector entry scaled by IQ,
+             * carrying the sign of the corresponding input. */
+            e1 = copysignf(vec1[0] * IQ, in[i+0]);
+            e2 = copysignf(vec1[1] * IQ, in[i+1]);
+            e3 = copysignf(vec2[0] * IQ, in[i+2]);
+            e4 = copysignf(vec2[1] * IQ, in[i+3]);
+            if (out) {
+                out[i+0] = e1;
+                out[i+1] = e2;
+                out[i+2] = e3;
+                out[i+3] = e4;
+            }
+            if (energy)
+                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
+        }
+    }
+    if (energy)
+        *energy = qenergy;
+}
+
+/**
+ * Quantize a band for the escape codebook and write it to the bitstream.
+ *
+ * Four coefficients (two pairs) are handled per iteration, so @p size is
+ * expected to be a multiple of 4. Two code paths exist: for cb < 11 the
+ * quantized values are clamped to 16 and reconstructed from the codebook
+ * vectors; otherwise values whose codebook vector entry equals 64.0f are
+ * additionally written as an escape sequence and reconstructed as
+ * c * cbrtf(c) * IQ.
+ *
+ * NOTE(review): the dispatch table in this file only maps index 11 to this
+ * function, so the cb < 11 branch looks unreachable through that table -
+ * confirm whether other callers exist.
+ *
+ * @param s         encoder context; s->scoefs receives the abs_pow34_v() output
+ * @param pb        bit writer handed to put_bits()
+ * @param in        input coefficients; their sign bits are read as raw words
+ * @param out       if non-NULL, receives the reconstructed coefficients
+ * @param scaled    ignored on entry; replaced by s->scoefs
+ * @param size      number of coefficients in the band
+ * @param scale_idx scalefactor index selecting the Q34 / IQ table entries
+ * @param cb        codebook number; tables are indexed with cb-1
+ * @param lambda    unused in this function
+ * @param uplim     unused in this function
+ * @param bits      unused in this function
+ * @param energy    if non-NULL, receives the sum of squares of the
+ *                  reconstructed coefficients
+ * @param ROUNDING  offset added to the scaled value before it is truncated
+ *                  to an integer
+ */
+static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
+                                                   PutBitContext *pb, const float *in, float *out,
+                                                   const float *scaled, int size, int scale_idx,
+                                                   int cb, const float lambda, const float uplim,
+                                                   int *bits, float *energy, const float ROUNDING)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;
+
+    uint8_t  *p_bits    = (uint8_t* )ff_aac_spectral_bits[cb-1];
+    uint16_t *p_codes   = (uint16_t*)ff_aac_spectral_codes[cb-1];
+    float    *p_vectors = (float*   )ff_aac_codebook_vectors[cb-1];
+
+    /* The caller's scaled pointer is not used; the values are recomputed
+     * into the context scratch buffer. */
+    abs_pow34_v(s->scoefs, in, size);
+    scaled = s->scoefs;
+
+    if (cb < 11) {
+        for (i = 0; i < size; i += 4) {
+            int curidx, curidx2, sign1, count1, sign2, count2;
+            int *in_int = (int *)&in[i];
+            uint8_t v_bits;
+            unsigned int v_codes;
+            int t0, t1, t2, t3, t4;
+            const float *vec1, *vec2;
+
+            qc1 = scaled[i  ] * Q34 + ROUNDING;
+            qc2 = scaled[i+1] * Q34 + ROUNDING;
+            qc3 = scaled[i+2] * Q34 + ROUNDING;
+            qc4 = scaled[i+3] * Q34 + ROUNDING;
+
+            __asm__ volatile (
+                ".set push                                  \n\t"
+                ".set noreorder                             \n\t"
+
+                /* t4 = 16 (clamp value); sign words start empty */
+                "ori        %[t4],      $zero,      16      \n\t"
+                "ori        %[sign1],   $zero,      0       \n\t"
+                "ori        %[sign2],   $zero,      0       \n\t"
+                /* qc = min(qc, 16) */
+                "slt        %[t0],      %[t4],      %[qc1]  \n\t"
+                "slt        %[t1],      %[t4],      %[qc2]  \n\t"
+                "slt        %[t2],      %[t4],      %[qc3]  \n\t"
+                "slt        %[t3],      %[t4],      %[qc4]  \n\t"
+                "movn       %[qc1],     %[t4],      %[t0]   \n\t"
+                "movn       %[qc2],     %[t4],      %[t1]   \n\t"
+                "movn       %[qc3],     %[t4],      %[t2]   \n\t"
+                "movn       %[qc4],     %[t4],      %[t3]   \n\t"
+                /* load the raw words of in[i..i+3]; a negative word means
+                 * the float sign bit is set */
+                "lw         %[t0],      0(%[in_int])        \n\t"
+                "lw         %[t1],      4(%[in_int])        \n\t"
+                "lw         %[t2],      8(%[in_int])        \n\t"
+                "lw         %[t3],      12(%[in_int])       \n\t"
+                /* pack one sign bit per non-zero quantized value, the first
+                 * value of each pair ending up in the higher bit */
+                "slt        %[t0],      %[t0],      $zero   \n\t"
+                "movn       %[sign1],   %[t0],      %[qc1]  \n\t"
+                "slt        %[t2],      %[t2],      $zero   \n\t"
+                "movn       %[sign2],   %[t2],      %[qc3]  \n\t"
+                "slt        %[t1],      %[t1],      $zero   \n\t"
+                "sll        %[t0],      %[sign1],   1       \n\t"
+                "or         %[t0],      %[t0],      %[t1]   \n\t"
+                "movn       %[sign1],   %[t0],      %[qc2]  \n\t"
+                "slt        %[t3],      %[t3],      $zero   \n\t"
+                "sll        %[t0],      %[sign2],   1       \n\t"
+                "or         %[t0],      %[t0],      %[t3]   \n\t"
+                "movn       %[sign2],   %[t0],      %[qc4]  \n\t"
+                /* count = number of positive quantized values in the pair */
+                "slt        %[count1],  $zero,      %[qc1]  \n\t"
+                "slt        %[t1],      $zero,      %[qc2]  \n\t"
+                "slt        %[count2],  $zero,      %[qc3]  \n\t"
+                "slt        %[t2],      $zero,      %[qc4]  \n\t"
+                "addu       %[count1],  %[count1],  %[t1]   \n\t"
+                "addu       %[count2],  %[count2],  %[t2]   \n\t"
+
+                ".set pop                                   \n\t"
+
+                : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+                  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+                  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
+                  [sign2]"=&r"(sign2), [count2]"=&r"(count2),
+                  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+                  [t4]"=&r"(t4)
+                : [in_int]"r"(in_int)
+                : "memory"
+            );
+
+            /* 17 entries per row because values run 0..16. */
+            curidx = 17 * qc1;
+            curidx += qc2;
+            curidx2 = 17 * qc3;
+            curidx2 += qc4;
+
+            /* Codeword followed by the sign bits of the non-zero values. */
+            v_codes = (p_codes[curidx] << count1) | sign1;
+            v_bits  = p_bits[curidx] + count1;
+            put_bits(pb, v_bits, v_codes);
+
+            v_codes = (p_codes[curidx2] << count2) | sign2;
+            v_bits  = p_bits[curidx2] + count2;
+            put_bits(pb, v_bits, v_codes);
+
+            if (out || energy) {
+                float e1,e2,e3,e4;
+                vec1 = &p_vectors[curidx*2 ];
+                vec2 = &p_vectors[curidx2*2];
+                /* Reconstructed value: codebook vector entry scaled by IQ,
+                 * carrying the sign of the corresponding input. */
+                e1 = copysignf(vec1[0] * IQ, in[i+0]);
+                e2 = copysignf(vec1[1] * IQ, in[i+1]);
+                e3 = copysignf(vec2[0] * IQ, in[i+2]);
+                e4 = copysignf(vec2[1] * IQ, in[i+3]);
+                if (out) {
+                    out[i+0] = e1;
+                    out[i+1] = e2;
+                    out[i+2] = e3;
+                    out[i+3] = e4;
+                }
+                if (energy)
+                    qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
+            }
+        }
+    } else {
+        for (i = 0; i < size; i += 4) {
+            int curidx, curidx2, sign1, count1, sign2, count2;
+            int *in_int = (int *)&in[i];
+            uint8_t v_bits;
+            unsigned int v_codes;
+            int c1, c2, c3, c4;
+            int t0, t1, t2, t3, t4;
+
+            qc1 = scaled[i  ] * Q34 + ROUNDING;
+            qc2 = scaled[i+1] * Q34 + ROUNDING;
+            qc3 = scaled[i+2] * Q34 + ROUNDING;
+            qc4 = scaled[i+3] * Q34 + ROUNDING;
+
+            __asm__ volatile (
+                ".set push                                  \n\t"
+                ".set noreorder                             \n\t"
+
+                /* t4 = 16 (clamp value); sign words start empty */
+                "ori        %[t4],      $zero,      16      \n\t"
+                "ori        %[sign1],   $zero,      0       \n\t"
+                "ori        %[sign2],   $zero,      0       \n\t"
+                /* c = qc before clamping, limited by a saturating left
+                 * shift by 18 followed by a logical right shift by 18
+                 * (at most 8191 for non-negative qc) */
+                "shll_s.w   %[c1],      %[qc1],     18      \n\t"
+                "shll_s.w   %[c2],      %[qc2],     18      \n\t"
+                "shll_s.w   %[c3],      %[qc3],     18      \n\t"
+                "shll_s.w   %[c4],      %[qc4],     18      \n\t"
+                "srl        %[c1],      %[c1],      18      \n\t"
+                "srl        %[c2],      %[c2],      18      \n\t"
+                "srl        %[c3],      %[c3],      18      \n\t"
+                "srl        %[c4],      %[c4],      18      \n\t"
+                /* qc = min(qc, 16) */
+                "slt        %[t0],      %[t4],      %[qc1]  \n\t"
+                "slt        %[t1],      %[t4],      %[qc2]  \n\t"
+                "slt        %[t2],      %[t4],      %[qc3]  \n\t"
+                "slt        %[t3],      %[t4],      %[qc4]  \n\t"
+                "movn       %[qc1],     %[t4],      %[t0]   \n\t"
+                "movn       %[qc2],     %[t4],      %[t1]   \n\t"
+                "movn       %[qc3],     %[t4],      %[t2]   \n\t"
+                "movn       %[qc4],     %[t4],      %[t3]   \n\t"
+                /* load the raw words of in[i..i+3]; a negative word means
+                 * the float sign bit is set */
+                "lw         %[t0],      0(%[in_int])        \n\t"
+                "lw         %[t1],      4(%[in_int])        \n\t"
+                "lw         %[t2],      8(%[in_int])        \n\t"
+                "lw         %[t3],      12(%[in_int])       \n\t"
+                /* pack one sign bit per non-zero quantized value, the first
+                 * value of each pair ending up in the higher bit */
+                "slt        %[t0],      %[t0],      $zero   \n\t"
+                "movn       %[sign1],   %[t0],      %[qc1]  \n\t"
+                "slt        %[t2],      %[t2],      $zero   \n\t"
+                "movn       %[sign2],   %[t2],      %[qc3]  \n\t"
+                "slt        %[t1],      %[t1],      $zero   \n\t"
+                "sll        %[t0],      %[sign1],   1       \n\t"
+                "or         %[t0],      %[t0],      %[t1]   \n\t"
+                "movn       %[sign1],   %[t0],      %[qc2]  \n\t"
+                "slt        %[t3],      %[t3],      $zero   \n\t"
+                "sll        %[t0],      %[sign2],   1       \n\t"
+                "or         %[t0],      %[t0],      %[t3]   \n\t"
+                "movn       %[sign2],   %[t0],      %[qc4]  \n\t"
+                /* count = number of positive quantized values in the pair */
+                "slt        %[count1],  $zero,      %[qc1]  \n\t"
+                "slt        %[t1],      $zero,      %[qc2]  \n\t"
+                "slt        %[count2],  $zero,      %[qc3]  \n\t"
+                "slt        %[t2],      $zero,      %[qc4]  \n\t"
+                "addu       %[count1],  %[count1],  %[t1]   \n\t"
+                "addu       %[count2],  %[count2],  %[t2]   \n\t"
+
+                ".set pop                                   \n\t"
+
+                : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+                  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+                  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
+                  [sign2]"=&r"(sign2), [count2]"=&r"(count2),
+                  [c1]"=&r"(c1), [c2]"=&r"(c2),
+                  [c3]"=&r"(c3), [c4]"=&r"(c4),
+                  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+                  [t4]"=&r"(t4)
+                : [in_int]"r"(in_int)
+                : "memory"
+            );
+
+            /* 17 entries per row because clamped values run 0..16. */
+            curidx = 17 * qc1;
+            curidx += qc2;
+
+            curidx2 = 17 * qc3;
+            curidx2 += qc4;
+
+            /* First pair: codeword followed by its sign bits. */
+            v_codes = (p_codes[curidx] << count1) | sign1;
+            v_bits  = p_bits[curidx] + count1;
+            put_bits(pb, v_bits, v_codes);
+
+            /* A vector entry of 64.0f marks an escaped value. The escape
+             * sequence is (len - 4) one bits, a zero bit, then the low len
+             * bits of the magnitude: 2 * len - 3 bits in total. */
+            if (p_vectors[curidx*2  ] == 64.0f) {
+                int len = av_log2(c1);
+                v_codes = (((1 << (len - 3)) - 2) << len) | (c1 & ((1 << len) - 1));
+                put_bits(pb, len * 2 - 3, v_codes);
+            }
+            if (p_vectors[curidx*2+1] == 64.0f) {
+                int len = av_log2(c2);
+                v_codes = (((1 << (len - 3)) - 2) << len) | (c2 & ((1 << len) - 1));
+                put_bits(pb, len*2-3, v_codes);
+            }
+
+            /* Second pair: codeword, sign bits, then any escape sequences. */
+            v_codes = (p_codes[curidx2] << count2) | sign2;
+            v_bits  = p_bits[curidx2] + count2;
+            put_bits(pb, v_bits, v_codes);
+
+            if (p_vectors[curidx2*2  ] == 64.0f) {
+                int len = av_log2(c3);
+                v_codes = (((1 << (len - 3)) - 2) << len) | (c3 & ((1 << len) - 1));
+                put_bits(pb, len* 2 - 3, v_codes);
+            }
+            if (p_vectors[curidx2*2+1] == 64.0f) {
+                int len = av_log2(c4);
+                v_codes = (((1 << (len - 3)) - 2) << len) | (c4 & ((1 << len) - 1));
+                put_bits(pb, len * 2 - 3, v_codes);
+            }
+
+            if (out || energy) {
+                float e1, e2, e3, e4;
+                /* Reconstruct from the unclamped magnitude: c^(4/3) * IQ,
+                 * carrying the sign of the corresponding input. */
+                e1 = copysignf(c1 * cbrtf(c1) * IQ, in[i+0]);
+                e2 = copysignf(c2 * cbrtf(c2) * IQ, in[i+1]);
+                e3 = copysignf(c3 * cbrtf(c3) * IQ, in[i+2]);
+                e4 = copysignf(c4 * cbrtf(c4) * IQ, in[i+3]);
+                if (out) {
+                    out[i+0] = e1;
+                    out[i+1] = e2;
+                    out[i+2] = e3;
+                    out[i+3] = e4;
+                }
+                if (energy)
+                    qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
+            }
+        }
+    }
+    if (energy)
+        *energy = qenergy;
+}
+
+/**
+ * Placeholder handler for a codebook number that must never be selected.
+ * Reaching it trips an unconditional assertion; no argument is used.
+ */
+static void quantize_and_encode_band_cost_NONE_mips(struct AACEncContext *s,
+                                                         PutBitContext *pb, const float *in, float *out,
+                                                         const float *scaled, int size, int scale_idx,
+                                                         int cb, const float lambda, const float uplim,
+                                                         int *bits, float *energy, const float ROUNDING)
+{
+    /* Always fails: this entry only fills a hole in the dispatch table. */
+    av_assert0(0);
+}
+
+/**
+ * Handler for codebooks that carry no spectral data: nothing is written to
+ * the bitstream and every optional output is cleared.
+ *
+ * @param out    if non-NULL, cleared in groups of four floats starting at
+ *               each multiple of 4 below @p size
+ * @param size   number of coefficients in the band
+ * @param bits   if non-NULL, set to 0
+ * @param energy if non-NULL, set to 0.0f
+ *
+ * The remaining parameters are unused.
+ */
+static void quantize_and_encode_band_cost_ZERO_mips(struct AACEncContext *s,
+                                                         PutBitContext *pb, const float *in, float *out,
+                                                         const float *scaled, int size, int scale_idx,
+                                                         int cb, const float lambda, const float uplim,
+                                                         int *bits, float *energy, const float ROUNDING)
+{
+    int pos;
+
+    if (bits)
+        *bits = 0;
+    if (out) {
+        /* Whole groups of four are written, matching the stride used by
+         * the other codebook handlers. */
+        for (pos = 0; pos < size; pos += 4) {
+            float *quad = out + pos;
+
+            quad[0] = 0.0f;
+            quad[1] = 0.0f;
+            quad[2] = 0.0f;
+            quad[3] = 0.0f;
+        }
+    }
+    if (energy)
+        *energy = 0.0f;
+}
+
+/**
+ * Quantize-and-encode handlers, indexed by codebook number (0..15).
+ */
+static void (*const quantize_and_encode_band_cost_arr[])(struct AACEncContext *s,
+                                                         PutBitContext *pb, const float *in, float *out,
+                                                         const float *scaled, int size, int scale_idx,
+                                                         int cb, const float lambda, const float uplim,
+                                                         int *bits, float *energy, const float ROUNDING) = {
+    [ 0] = quantize_and_encode_band_cost_ZERO_mips,
+    [ 1] = quantize_and_encode_band_cost_SQUAD_mips,
+    [ 2] = quantize_and_encode_band_cost_SQUAD_mips,
+    [ 3] = quantize_and_encode_band_cost_UQUAD_mips,
+    [ 4] = quantize_and_encode_band_cost_UQUAD_mips,
+    [ 5] = quantize_and_encode_band_cost_SPAIR_mips,
+    [ 6] = quantize_and_encode_band_cost_SPAIR_mips,
+    [ 7] = quantize_and_encode_band_cost_UPAIR7_mips,
+    [ 8] = quantize_and_encode_band_cost_UPAIR7_mips,
+    [ 9] = quantize_and_encode_band_cost_UPAIR12_mips,
+    [10] = quantize_and_encode_band_cost_UPAIR12_mips,
+    [11] = quantize_and_encode_band_cost_ESC_mips,
+    [12] = quantize_and_encode_band_cost_NONE_mips, /* cb 12 doesn't exist */
+    [13] = quantize_and_encode_band_cost_ZERO_mips,
+    [14] = quantize_and_encode_band_cost_ZERO_mips,
+    [15] = quantize_and_encode_band_cost_ZERO_mips,
+};
+
+/*
+ * Dispatch to the handler registered for codebook cb in
+ * quantize_and_encode_band_cost_arr, forwarding every argument unchanged.
+ * cb appears twice in the expansion (table index and argument), so it must
+ * be free of side effects.
+ */
+#define quantize_and_encode_band_cost(                                       \
+                                s, pb, in, out, scaled, size, scale_idx, cb, \
+                                lambda, uplim, bits, energy, ROUNDING)       \
+    quantize_and_encode_band_cost_arr[cb](                                   \
+                                s, pb, in, out, scaled, size, scale_idx, cb, \
+                                lambda, uplim, bits, energy, ROUNDING)
+
+/**
+ * Quantize a band and write it to the bitstream using the handler for
+ * codebook @p cb.
+ *
+ * No pre-scaled input, bit count or energy output is supplied to the
+ * handler (all passed as NULL) and the upper limit is INFINITY.
+ *
+ * @param rtz non-zero selects ROUND_TO_ZERO, zero selects ROUND_STANDARD
+ */
+static void quantize_and_encode_band_mips(struct AACEncContext *s, PutBitContext *pb,
+                                          const float *in, float *out, int size, int scale_idx,
+                                          int cb, const float lambda, int rtz)
+{
+    const float rounding = (rtz) ? ROUND_TO_ZERO : ROUND_STANDARD;
+
+    quantize_and_encode_band_cost(s, pb, in, out, NULL, size, scale_idx, cb,
+                                  lambda, INFINITY, NULL, NULL, rounding);
+}
+
+/**
+ * Functions developed from template function and optimized for getting the number of bits
+ */
+/**
+ * Bit count for codebooks that carry no spectral data.
+ *
+ * @return always 0; no argument is used
+ */
+static float get_band_numbits_ZERO_mips(struct AACEncContext *s,
+                                        PutBitContext *pb, const float *in,
+                                        const float *scaled, int size, int scale_idx,
+                                        int cb, const float lambda, const float uplim,
+                                        int *bits)
+{
+    return 0.0f;
+}
+
+/**
+ * Placeholder bit counter for a codebook number that must never be
+ * selected; reaching it trips an unconditional assertion.
+ *
+ * @return 0 if the assertion does not stop execution
+ */
+static float get_band_numbits_NONE_mips(struct AACEncContext *s,
+                                        PutBitContext *pb, const float *in,
+                                        const float *scaled, int size, int scale_idx,
+                                        int cb, const float lambda, const float uplim,
+                                        int *bits)
+{
+    av_assert0(0);
+    return 0.0f;
+}
+
+/**
+ * Count the codeword bits needed for a band with a signed-quad codebook.
+ * Nothing is written to the bitstream.
+ *
+ * Four coefficients form one codeword, so @p size is expected to be a
+ * multiple of 4.
+ *
+ * @param in        input coefficients; only their sign bits are read
+ * @param scaled    values multiplied by Q34 and offset by ROUND_STANDARD
+ *                  before truncation to int
+ *                  (NOTE(review): presumably |in|^(3/4) - confirm with callers)
+ * @param size      number of coefficients in the band
+ * @param scale_idx scalefactor index selecting the Q34 table entry
+ * @param cb        codebook number; the bit-length table is indexed with cb-1
+ * @return accumulated bit count (summed as int, returned as float)
+ *
+ * s, pb, lambda, uplim and bits are unused in this function.
+ */
+static float get_band_numbits_SQUAD_mips(struct AACEncContext *s,
+                                         PutBitContext *pb, const float *in,
+                                         const float *scaled, int size, int scale_idx,
+                                         int cb, const float lambda, const float uplim,
+                                         int *bits)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+
+    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        int curidx;
+        int *in_int = (int *)&in[i];
+        int t0, t1, t2, t3, t4, t5, t6, t7;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                      \n\t"
+            ".set noreorder                 \n\t"
+
+            /* qc = (qc > 0): every positive value becomes 1 */
+            "slt    %[qc1], $zero,  %[qc1]  \n\t"
+            "slt    %[qc2], $zero,  %[qc2]  \n\t"
+            "slt    %[qc3], $zero,  %[qc3]  \n\t"
+            "slt    %[qc4], $zero,  %[qc4]  \n\t"
+            /* t0..t3 = top (sign) bit of the raw words of in[i..i+3] */
+            "lw     %[t0],  0(%[in_int])    \n\t"
+            "lw     %[t1],  4(%[in_int])    \n\t"
+            "lw     %[t2],  8(%[in_int])    \n\t"
+            "lw     %[t3],  12(%[in_int])   \n\t"
+            "srl    %[t0],  %[t0],  31      \n\t"
+            "srl    %[t1],  %[t1],  31      \n\t"
+            "srl    %[t2],  %[t2],  31      \n\t"
+            "srl    %[t3],  %[t3],  31      \n\t"
+            /* negate qc where the input's sign bit is set */
+            "subu   %[t4],  $zero,  %[qc1]  \n\t"
+            "subu   %[t5],  $zero,  %[qc2]  \n\t"
+            "subu   %[t6],  $zero,  %[qc3]  \n\t"
+            "subu   %[t7],  $zero,  %[qc4]  \n\t"
+            "movn   %[qc1], %[t4],  %[t0]   \n\t"
+            "movn   %[qc2], %[t5],  %[t1]   \n\t"
+            "movn   %[qc3], %[t6],  %[t2]   \n\t"
+            "movn   %[qc4], %[t7],  %[t3]   \n\t"
+
+            ".set pop                       \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+
+        /* Base-3 index from four values in -1..1; the +40 offset moves the
+         * range -40..40 to 0..80. */
+        curidx = qc1;
+        curidx *= 3;
+        curidx += qc2;
+        curidx *= 3;
+        curidx += qc3;
+        curidx *= 3;
+        curidx += qc4;
+        curidx += 40;
+
+        curbits += p_bits[curidx];
+    }
+    return curbits;
+}
+
+/**
+ * Count the bits needed for a band with an unsigned-quad codebook.
+ * Nothing is written to the bitstream.
+ *
+ * Four coefficients form one codeword, so @p size is expected to be a
+ * multiple of 4.
+ *
+ * @param scaled    values multiplied by Q34 and offset by ROUND_STANDARD
+ *                  before truncation to int
+ *                  (NOTE(review): presumably |in|^(3/4) - confirm with callers)
+ * @param size      number of coefficients in the band
+ * @param scale_idx scalefactor index selecting the Q34 table entry
+ * @param cb        codebook number; the bit-length table is indexed with cb-1
+ * @return accumulated bit count (summed as int, returned as float)
+ *
+ * s, pb, in, lambda, uplim and bits are unused in this function.
+ */
+static float get_band_numbits_UQUAD_mips(struct AACEncContext *s,
+                                         PutBitContext *pb, const float *in,
+                                         const float *scaled, int size, int scale_idx,
+                                         int cb, const float lambda, const float uplim,
+                                         int *bits)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    int i;
+    int curbits = 0;
+    int qc1, qc2, qc3, qc4;
+
+    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        int curidx;
+        int t0, t1, t2, t3, t4;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                      \n\t"
+            ".set noreorder                 \n\t"
+
+            /* qc = min(qc, 2) */
+            "ori    %[t4],  $zero,  2       \n\t"
+            "slt    %[t0],  %[t4],  %[qc1]  \n\t"
+            "slt    %[t1],  %[t4],  %[qc2]  \n\t"
+            "slt    %[t2],  %[t4],  %[qc3]  \n\t"
+            "slt    %[t3],  %[t4],  %[qc4]  \n\t"
+            "movn   %[qc1], %[t4],  %[t0]   \n\t"
+            "movn   %[qc2], %[t4],  %[t1]   \n\t"
+            "movn   %[qc3], %[t4],  %[t2]   \n\t"
+            "movn   %[qc4], %[t4],  %[t3]   \n\t"
+
+            ".set pop                       \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4)
+        );
+
+        /* Base-3 index built from the four clamped values. */
+        curidx = qc1;
+        curidx *= 3;
+        curidx += qc2;
+        curidx *= 3;
+        curidx += qc3;
+        curidx *= 3;
+        curidx += qc4;
+
+        curbits += p_bits[curidx];
+        /* NOTE(review): uquad_sign_bits is defined elsewhere; presumably the
+         * number of sign bits that follow this codeword - confirm. */
+        curbits += uquad_sign_bits[curidx];
+    }
+    return curbits;
+}
+
+/**
+ * Count the codeword bits needed for a band with a signed-pair codebook.
+ * Nothing is written to the bitstream.
+ *
+ * Two pairs (four coefficients) are handled per iteration, so @p size is
+ * expected to be a multiple of 4.
+ *
+ * @param in        input coefficients; only their sign bits are read
+ * @param scaled    values multiplied by Q34 and offset by ROUND_STANDARD
+ *                  before truncation to int
+ *                  (NOTE(review): presumably |in|^(3/4) - confirm with callers)
+ * @param size      number of coefficients in the band
+ * @param scale_idx scalefactor index selecting the Q34 table entry
+ * @param cb        codebook number; the bit-length table is indexed with cb-1
+ * @return accumulated bit count (summed as int, returned as float)
+ *
+ * s, pb, lambda, uplim and bits are unused in this function.
+ */
+static float get_band_numbits_SPAIR_mips(struct AACEncContext *s,
+                                         PutBitContext *pb, const float *in,
+                                         const float *scaled, int size, int scale_idx,
+                                         int cb, const float lambda, const float uplim,
+                                         int *bits)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+
+    uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        int curidx, curidx2;
+        int *in_int = (int *)&in[i];
+        int t0, t1, t2, t3, t4, t5, t6, t7;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                      \n\t"
+            ".set noreorder                 \n\t"
+
+            /* qc = min(qc, 4) */
+            "ori    %[t4],  $zero,  4       \n\t"
+            "slt    %[t0],  %[t4],  %[qc1]  \n\t"
+            "slt    %[t1],  %[t4],  %[qc2]  \n\t"
+            "slt    %[t2],  %[t4],  %[qc3]  \n\t"
+            "slt    %[t3],  %[t4],  %[qc4]  \n\t"
+            "movn   %[qc1], %[t4],  %[t0]   \n\t"
+            "movn   %[qc2], %[t4],  %[t1]   \n\t"
+            "movn   %[qc3], %[t4],  %[t2]   \n\t"
+            "movn   %[qc4], %[t4],  %[t3]   \n\t"
+            /* t0..t3 = top (sign) bit of the raw words of in[i..i+3] */
+            "lw     %[t0],  0(%[in_int])    \n\t"
+            "lw     %[t1],  4(%[in_int])    \n\t"
+            "lw     %[t2],  8(%[in_int])    \n\t"
+            "lw     %[t3],  12(%[in_int])   \n\t"
+            "srl    %[t0],  %[t0],  31      \n\t"
+            "srl    %[t1],  %[t1],  31      \n\t"
+            "srl    %[t2],  %[t2],  31      \n\t"
+            "srl    %[t3],  %[t3],  31      \n\t"
+            /* negate qc where the input's sign bit is set; t4 is reused as
+             * a temporary now that the clamp is done */
+            "subu   %[t4],  $zero,  %[qc1]  \n\t"
+            "subu   %[t5],  $zero,  %[qc2]  \n\t"
+            "subu   %[t6],  $zero,  %[qc3]  \n\t"
+            "subu   %[t7],  $zero,  %[qc4]  \n\t"
+            "movn   %[qc1], %[t4],  %[t0]   \n\t"
+            "movn   %[qc2], %[t5],  %[t1]   \n\t"
+            "movn   %[qc3], %[t6],  %[t2]   \n\t"
+            "movn   %[qc4], %[t7],  %[t3]   \n\t"
+
+            ".set pop                       \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+
+        /* Base-9 index per pair from values in -4..4; the +40 offset moves
+         * the range -40..40 to 0..80. */
+        curidx  = 9 * qc1;
+        curidx += qc2 + 40;
+
+        curidx2  = 9 * qc3;
+        curidx2 += qc4 + 40;
+
+        curbits += p_bits[curidx] + p_bits[curidx2];
+    }
+    return curbits;
+}
+
+/**
+ * Count the bits needed for a band with an unsigned-pair codebook whose
+ * values are clamped to 7. Nothing is written to the bitstream.
+ *
+ * Two pairs (four coefficients) are handled per iteration, so @p size is
+ * expected to be a multiple of 4.
+ *
+ * @param scaled    values multiplied by Q34 and offset by ROUND_STANDARD
+ *                  before truncation to int
+ *                  (NOTE(review): presumably |in|^(3/4) - confirm with callers)
+ * @param size      number of coefficients in the band
+ * @param scale_idx scalefactor index selecting the Q34 table entry
+ * @param cb        codebook number; the bit-length table is indexed with cb-1
+ * @return accumulated bit count (summed as int, returned as float)
+ *
+ * s, pb, in, lambda, uplim and bits are unused in this function.
+ */
+static float get_band_numbits_UPAIR7_mips(struct AACEncContext *s,
+                                          PutBitContext *pb, const float *in,
+                                          const float *scaled, int size, int scale_idx,
+                                          int cb, const float lambda, const float uplim,
+                                          int *bits)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+
+    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        int curidx, curidx2;
+        int t0, t1, t2, t3, t4;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                      \n\t"
+            ".set noreorder                 \n\t"
+
+            /* qc = min(qc, 7) */
+            "ori    %[t4],  $zero,  7       \n\t"
+            "slt    %[t0],  %[t4],  %[qc1]  \n\t"
+            "slt    %[t1],  %[t4],  %[qc2]  \n\t"
+            "slt    %[t2],  %[t4],  %[qc3]  \n\t"
+            "slt    %[t3],  %[t4],  %[qc4]  \n\t"
+            "movn   %[qc1], %[t4],  %[t0]   \n\t"
+            "movn   %[qc2], %[t4],  %[t1]   \n\t"
+            "movn   %[qc3], %[t4],  %[t2]   \n\t"
+            "movn   %[qc4], %[t4],  %[t3]   \n\t"
+
+            ".set pop                       \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4)
+        );
+
+        /* Base-8 index per pair because clamped values run up to 7. */
+        curidx  = 8 * qc1;
+        curidx += qc2;
+
+        curidx2  = 8 * qc3;
+        curidx2 += qc4;
+
+        /* NOTE(review): upair7_sign_bits is defined elsewhere; presumably
+         * the number of sign bits that follow each codeword - confirm. */
+        curbits += p_bits[curidx] +
+                   upair7_sign_bits[curidx] +
+                   p_bits[curidx2] +
+                   upair7_sign_bits[curidx2];
+    }
+    return curbits;
+}
+
+/**
+ * Count the bits needed for a band with an unsigned-pair codebook whose
+ * values are clamped to 12. Nothing is written to the bitstream.
+ *
+ * Two pairs (four coefficients) are handled per iteration, so @p size is
+ * expected to be a multiple of 4.
+ *
+ * @param scaled    values multiplied by Q34 and offset by ROUND_STANDARD
+ *                  before truncation to int
+ *                  (NOTE(review): presumably |in|^(3/4) - confirm with callers)
+ * @param size      number of coefficients in the band
+ * @param scale_idx scalefactor index selecting the Q34 table entry
+ * @param cb        codebook number; the bit-length table is indexed with cb-1
+ * @return accumulated bit count (summed as int, returned as float)
+ *
+ * s, pb, in, lambda, uplim and bits are unused in this function.
+ */
+static float get_band_numbits_UPAIR12_mips(struct AACEncContext *s,
+                                           PutBitContext *pb, const float *in,
+                                           const float *scaled, int size, int scale_idx,
+                                           int cb, const float lambda, const float uplim,
+                                           int *bits)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+
+    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        int curidx, curidx2;
+        int t0, t1, t2, t3, t4;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                      \n\t"
+            ".set noreorder                 \n\t"
+
+            /* qc = min(qc, 12) */
+            "ori    %[t4],  $zero,  12      \n\t"
+            "slt    %[t0],  %[t4],  %[qc1]  \n\t"
+            "slt    %[t1],  %[t4],  %[qc2]  \n\t"
+            "slt    %[t2],  %[t4],  %[qc3]  \n\t"
+            "slt    %[t3],  %[t4],  %[qc4]  \n\t"
+            "movn   %[qc1], %[t4],  %[t0]   \n\t"
+            "movn   %[qc2], %[t4],  %[t1]   \n\t"
+            "movn   %[qc3], %[t4],  %[t2]   \n\t"
+            "movn   %[qc4], %[t4],  %[t3]   \n\t"
+
+            ".set pop                       \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4)
+        );
+
+        /* Base-13 index per pair because clamped values run up to 12. */
+        curidx  = 13 * qc1;
+        curidx += qc2;
+
+        curidx2  = 13 * qc3;
+        curidx2 += qc4;
+
+        /* NOTE(review): upair12_sign_bits is defined elsewhere; presumably
+         * the number of sign bits that follow each codeword - confirm. */
+        curbits += p_bits[curidx] +
+                   p_bits[curidx2] +
+                   upair12_sign_bits[curidx] +
+                   upair12_sign_bits[curidx2];
+    }
+    return curbits;
+}
+
+static float get_band_numbits_ESC_mips(struct AACEncContext *s,
+                                       PutBitContext *pb, const float *in,
+                                       const float *scaled, int size, int scale_idx,
+                                       int cb, const float lambda, const float uplim,
+                                       int *bits)
+{   /* Returns the summed bit count for the band; s, pb, in, lambda, uplim and bits are not referenced. */
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    int i;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+    /* p_bits: row of ff_aac_spectral_bits selected by cb, indexed by curidx/curidx2 below. */
+    uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
+
+    for (i = 0; i < size; i += 4) { /* four coefficients per pass, as the pairs (qc1,qc2) and (qc3,qc4) */
+        int curidx, curidx2;
+        int cond0, cond1, cond2, cond3;
+        int c1, c2, c3, c4;
+        int t4, t5;
+        /* Multiply by Q34, add ROUND_STANDARD, then convert to int. */
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                                  \n\t"
+            ".set noreorder                             \n\t"
+            /* Per lane: cN = 2 * (31 - clz(sat(qcN))) - 3 if qcN > 15, else 0; qcN itself becomes 16 when > 15. */
+            "ori        %[t4],      $zero,  15          \n\t" /* t4 = 15 */
+            "ori        %[t5],      $zero,  16          \n\t" /* t5 = 16 */
+            "shll_s.w   %[c1],      %[qc1], 18          \n\t" /* DSP ASE saturating shift left by 18 ... */
+            "shll_s.w   %[c2],      %[qc2], 18          \n\t"
+            "shll_s.w   %[c3],      %[qc3], 18          \n\t"
+            "shll_s.w   %[c4],      %[qc4], 18          \n\t"
+            "srl        %[c1],      %[c1],  18          \n\t" /* ... and logical shift back: caps a non-negative qc at 0x1FFF */
+            "srl        %[c2],      %[c2],  18          \n\t"
+            "srl        %[c3],      %[c3],  18          \n\t"
+            "srl        %[c4],      %[c4],  18          \n\t"
+            "slt        %[cond0],   %[t4],  %[qc1]      \n\t" /* cond = (qc > 15) */
+            "slt        %[cond1],   %[t4],  %[qc2]      \n\t"
+            "slt        %[cond2],   %[t4],  %[qc3]      \n\t"
+            "slt        %[cond3],   %[t4],  %[qc4]      \n\t"
+            "movn       %[qc1],     %[t5],  %[cond0]    \n\t" /* qc = cond ? 16 : qc */
+            "movn       %[qc2],     %[t5],  %[cond1]    \n\t"
+            "movn       %[qc3],     %[t5],  %[cond2]    \n\t"
+            "movn       %[qc4],     %[t5],  %[cond3]    \n\t"
+            "ori        %[t5],      $zero,  31          \n\t" /* t5 = 31 */
+            "clz        %[c1],      %[c1]               \n\t" /* c = count of leading zeros of c */
+            "clz        %[c2],      %[c2]               \n\t"
+            "clz        %[c3],      %[c3]               \n\t"
+            "clz        %[c4],      %[c4]               \n\t"
+            "subu       %[c1],      %[t5],  %[c1]       \n\t" /* c = 31 - c */
+            "subu       %[c2],      %[t5],  %[c2]       \n\t"
+            "subu       %[c3],      %[t5],  %[c3]       \n\t"
+            "subu       %[c4],      %[t5],  %[c4]       \n\t"
+            "sll        %[c1],      %[c1],  1           \n\t" /* c *= 2 */
+            "sll        %[c2],      %[c2],  1           \n\t"
+            "sll        %[c3],      %[c3],  1           \n\t"
+            "sll        %[c4],      %[c4],  1           \n\t"
+            "addiu      %[c1],      %[c1],  -3          \n\t" /* c -= 3 */
+            "addiu      %[c2],      %[c2],  -3          \n\t"
+            "addiu      %[c3],      %[c3],  -3          \n\t"
+            "addiu      %[c4],      %[c4],  -3          \n\t"
+            "subu       %[cond0],   $zero,  %[cond0]    \n\t" /* cond: 1 -> all ones, 0 stays 0 */
+            "subu       %[cond1],   $zero,  %[cond1]    \n\t"
+            "subu       %[cond2],   $zero,  %[cond2]    \n\t"
+            "subu       %[cond3],   $zero,  %[cond3]    \n\t"
+            "and        %[c1],      %[c1],  %[cond0]    \n\t" /* c is kept only where qc exceeded 15 */
+            "and        %[c2],      %[c2],  %[cond1]    \n\t"
+            "and        %[c3],      %[c3],  %[cond2]    \n\t"
+            "and        %[c4],      %[c4],  %[cond3]    \n\t"
+
+            ".set pop                                   \n\t"
+            /* No input operands: qc1..qc4 are read-write, everything else is an early-clobber output. */
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
+              [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
+              [c1]"=&r"(c1), [c2]"=&r"(c2),
+              [c3]"=&r"(c3), [c4]"=&r"(c4),
+              [t4]"=&r"(t4), [t5]"=&r"(t5)
+        );
+        /* Pair indices: 17 * first + second, with each qc now in 0..16 for non-negative input. */
+        curidx = 17 * qc1;
+        curidx += qc2;
+
+        curidx2 = 17 * qc3;
+        curidx2 += qc4;
+
+        curbits += p_bits[curidx];
+        curbits += esc_sign_bits[curidx];
+        curbits += p_bits[curidx2];
+        curbits += esc_sign_bits[curidx2];
+        /* Extra per-coefficient bits produced by the asm above (zero where qc was <= 15). */
+        curbits += c1;
+        curbits += c2;
+        curbits += c3;
+        curbits += c4;
+    }
+    return curbits; /* int total converted to the float return type */
+}
+
+static float (*const get_band_numbits_arr[])(struct AACEncContext *s,
+                                             PutBitContext *pb, const float *in,
+                                             const float *scaled, int size, int scale_idx,
+                                             int cb, const float lambda, const float uplim,
+                                             int *bits) = { /* bit-count routine per codebook index 0..15 */
+    [ 0] = get_band_numbits_ZERO_mips,
+    [ 1] = get_band_numbits_SQUAD_mips,
+    [ 2] = get_band_numbits_SQUAD_mips,
+    [ 3] = get_band_numbits_UQUAD_mips,
+    [ 4] = get_band_numbits_UQUAD_mips,
+    [ 5] = get_band_numbits_SPAIR_mips,
+    [ 6] = get_band_numbits_SPAIR_mips,
+    [ 7] = get_band_numbits_UPAIR7_mips,
+    [ 8] = get_band_numbits_UPAIR7_mips,
+    [ 9] = get_band_numbits_UPAIR12_mips,
+    [10] = get_band_numbits_UPAIR12_mips,
+    [11] = get_band_numbits_ESC_mips,
+    [12] = get_band_numbits_NONE_mips, /* cb 12 doesn't exist */
+    [13] = get_band_numbits_ZERO_mips,
+    [14] = get_band_numbits_ZERO_mips,
+    [15] = get_band_numbits_ZERO_mips,
+};
+
+/* Route a bit-count request to the routine that get_band_numbits_arr
+ * holds for codebook cb; every argument is forwarded unchanged. */
+#define get_band_numbits(s, pb, in, scaled, size, scale_idx, cb,            \
+                         lambda, uplim, bits)                               \
+    get_band_numbits_arr[cb](s, pb, in, scaled, size, scale_idx, cb,        \
+                             lambda, uplim, bits)
+
+static float quantize_band_cost_bits(struct AACEncContext *s, const float *in,
+                                     const float *scaled, int size, int scale_idx,
+                                     int cb, const float lambda, const float uplim,
+                                     int *bits, float *energy, int rtz)
+{   /* Bit-count only: dispatches on cb with NULL for pb; energy and rtz are not used here. */
+    return get_band_numbits_arr[cb](s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
+}
+
+/**
+ * Functions developed from template function and optimized for getting the band cost
+ */
+#if HAVE_MIPSFPU
+static float get_band_cost_ZERO_mips(struct AACEncContext *s,
+                                     PutBitContext *pb, const float *in,
+                                     const float *scaled, int size, int scale_idx,
+                                     int cb, const float lambda, const float uplim,
+                                     int *bits, float *energy)
+{
+    /* Cost is lambda times the sum of squares of in[]; zero bits and zero energy are reported. */
+    float sum_sq = 0;
+    int k, lane;
+
+    /* Groups of four, accumulated one coefficient at a time in index order. */
+    for (k = 0; k < size; k += 4)
+        for (lane = 0; lane < 4; lane++)
+            sum_sq += in[k + lane] * in[k + lane];
+
+    if (bits)       /* optional output */
+        *bits = 0;
+    if (energy)     /* optional output */
+        *energy = 0.0f;
+    return sum_sq * lambda;
+}
+
+static float get_band_cost_NONE_mips(struct AACEncContext *s,
+                                     PutBitContext *pb, const float *in,
+                                     const float *scaled, int size, int scale_idx,
+                                     int cb, const float lambda, const float uplim,
+                                     int *bits, float *energy)
+{
+    av_assert0(0);  /* unconditional assertion: this handler is not meant to be called */
+    return 0.0f;    /* keeps the float return type satisfied */
+}
+
+static float get_band_cost_SQUAD_mips(struct AACEncContext *s,
+                                      PutBitContext *pb, const float *in,
+                                      const float *scaled, int size, int scale_idx,
+                                      int cb, const float lambda, const float uplim,
+                                      int *bits, float *energy)
+{   /* Returns lambda * squared error + bit count for the band; s, pb and uplim are not referenced. */
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    float cost = 0;
+    float qenergy = 0.0f;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+    /* Rows selected by cb: p_bits[idx] is added to the bit total, p_codes holds four floats per idx. */
+    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
+    float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
+
+    for (i = 0; i < size; i += 4) { /* one four-coefficient vector per pass */
+        const float *vec;
+        int curidx;
+        int   *in_int = (int   *)&in[i]; /* only dereferenced by lw inside the asm */
+        float *in_pos = (float *)&in[i];
+        float di0, di1, di2, di3;
+        int t0, t1, t2, t3, t4, t5, t6, t7;
+        /* Multiply by Q34, add ROUND_STANDARD, then convert to int. */
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                                  \n\t"
+            ".set noreorder                             \n\t"
+            /* Reduce each qc to 0 or 1 (qc > 0), then negate it where in[] has its sign bit set. */
+            "slt        %[qc1], $zero,  %[qc1]          \n\t" /* qc = (qc > 0) */
+            "slt        %[qc2], $zero,  %[qc2]          \n\t"
+            "slt        %[qc3], $zero,  %[qc3]          \n\t"
+            "slt        %[qc4], $zero,  %[qc4]          \n\t"
+            "lw         %[t0],  0(%[in_int])            \n\t" /* raw bits of in[i..i+3] */
+            "lw         %[t1],  4(%[in_int])            \n\t"
+            "lw         %[t2],  8(%[in_int])            \n\t"
+            "lw         %[t3],  12(%[in_int])           \n\t"
+            "srl        %[t0],  %[t0],  31              \n\t" /* t0..t3 = sign bit */
+            "srl        %[t1],  %[t1],  31              \n\t"
+            "srl        %[t2],  %[t2],  31              \n\t"
+            "srl        %[t3],  %[t3],  31              \n\t"
+            "subu       %[t4],  $zero,  %[qc1]          \n\t" /* t4..t7 = -qc */
+            "subu       %[t5],  $zero,  %[qc2]          \n\t"
+            "subu       %[t6],  $zero,  %[qc3]          \n\t"
+            "subu       %[t7],  $zero,  %[qc4]          \n\t"
+            "movn       %[qc1], %[t4],  %[t0]           \n\t" /* qc = sign ? -qc : qc */
+            "movn       %[qc2], %[t5],  %[t1]           \n\t"
+            "movn       %[qc3], %[t6],  %[t2]           \n\t"
+            "movn       %[qc4], %[t7],  %[t3]           \n\t"
+
+            ".set pop                                   \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+        /* Base-3 index over the four digits in -1..1; +40 (27+9+3+1) makes it non-negative. */
+        curidx = qc1;
+        curidx *= 3;
+        curidx += qc2;
+        curidx *= 3;
+        curidx += qc3;
+        curidx *= 3;
+        curidx += qc4;
+        curidx += 40;
+
+        curbits += p_bits[curidx];
+        vec     = &p_codes[curidx*4];
+
+        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
+                +  vec[2]*vec[2] + vec[3]*vec[3];
+
+        __asm__ volatile (
+            ".set push                                  \n\t"
+            ".set noreorder                             \n\t"
+            /* diN = in[i+N] - vec[N] * IQ via nmsub.s; $f0-$f7 are scratch and listed as clobbers. */
+            "lwc1       $f0,    0(%[in_pos])            \n\t"
+            "lwc1       $f1,    0(%[vec])               \n\t"
+            "lwc1       $f2,    4(%[in_pos])            \n\t"
+            "lwc1       $f3,    4(%[vec])               \n\t"
+            "lwc1       $f4,    8(%[in_pos])            \n\t"
+            "lwc1       $f5,    8(%[vec])               \n\t"
+            "lwc1       $f6,    12(%[in_pos])           \n\t"
+            "lwc1       $f7,    12(%[vec])              \n\t"
+            "nmsub.s    %[di0], $f0,    $f1,    %[IQ]   \n\t"
+            "nmsub.s    %[di1], $f2,    $f3,    %[IQ]   \n\t"
+            "nmsub.s    %[di2], $f4,    $f5,    %[IQ]   \n\t"
+            "nmsub.s    %[di3], $f6,    $f7,    %[IQ]   \n\t"
+
+            ".set pop                                   \n\t"
+
+            : [di0]"=&f"(di0), [di1]"=&f"(di1),
+              [di2]"=&f"(di2), [di3]"=&f"(di3)
+            : [in_pos]"r"(in_pos), [vec]"r"(vec),
+              [IQ]"f"(IQ)
+            : "$f0", "$f1", "$f2", "$f3",
+              "$f4", "$f5", "$f6", "$f7",
+              "memory"
+        );
+
+        cost += di0 * di0 + di1 * di1
+                + di2 * di2 + di3 * di3;
+    }
+    /* Optional outputs: total bits, and the summed squares of the chosen vectors scaled by IQ^2. */
+    if (bits)
+        *bits = curbits;
+    if (energy)
+        *energy = qenergy * (IQ*IQ);
+    return cost * lambda + curbits;
+}
+
+static float get_band_cost_UQUAD_mips(struct AACEncContext *s,
+                                      PutBitContext *pb, const float *in,
+                                      const float *scaled, int size, int scale_idx,
+                                      int cb, const float lambda, const float uplim,
+                                      int *bits, float *energy)
+{   /* Returns lambda * squared error + bit count for the band; s, pb and uplim are not referenced. */
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    float cost = 0;
+    float qenergy = 0.0f;
+    int curbits = 0;
+    int qc1, qc2, qc3, qc4;
+    /* Rows selected by cb: p_bits[idx] is added to the bit total, p_codes holds four floats per idx. */
+    uint8_t *p_bits  = (uint8_t*)ff_aac_spectral_bits[cb-1];
+    float   *p_codes = (float  *)ff_aac_codebook_vectors[cb-1];
+
+    for (i = 0; i < size; i += 4) { /* one four-coefficient vector per pass */
+        const float *vec;
+        int curidx;
+        float *in_pos = (float *)&in[i];
+        float di0, di1, di2, di3;
+        int t0, t1, t2, t3, t4;
+        /* Multiply by Q34, add ROUND_STANDARD, then convert to int. */
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                                  \n\t"
+            ".set noreorder                             \n\t"
+            /* Clamp each qc to at most 2. */
+            "ori        %[t4],  $zero,  2               \n\t" /* t4 = 2 */
+            "slt        %[t0],  %[t4],  %[qc1]          \n\t" /* t = (qc > 2) */
+            "slt        %[t1],  %[t4],  %[qc2]          \n\t"
+            "slt        %[t2],  %[t4],  %[qc3]          \n\t"
+            "slt        %[t3],  %[t4],  %[qc4]          \n\t"
+            "movn       %[qc1], %[t4],  %[t0]           \n\t" /* qc = t ? 2 : qc */
+            "movn       %[qc2], %[t4],  %[t1]           \n\t"
+            "movn       %[qc3], %[t4],  %[t2]           \n\t"
+            "movn       %[qc4], %[t4],  %[t3]           \n\t"
+
+            ".set pop                                   \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4)
+        );
+        /* Base-3 index over the four clamped values. */
+        curidx = qc1;
+        curidx *= 3;
+        curidx += qc2;
+        curidx *= 3;
+        curidx += qc3;
+        curidx *= 3;
+        curidx += qc4;
+
+        curbits += p_bits[curidx];
+        curbits += uquad_sign_bits[curidx]; /* additional bits looked up per index */
+        vec     = &p_codes[curidx*4];
+
+        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
+                +  vec[2]*vec[2] + vec[3]*vec[3];
+
+        __asm__ volatile (
+            ".set push                                  \n\t"
+            ".set noreorder                             \n\t"
+            /* diN = |in[i+N]| - vec[N] * IQ (abs.s, then nmsub.s); $f0-$f3 hold vec[0..3]. */
+            "lwc1       %[di0], 0(%[in_pos])            \n\t"
+            "lwc1       %[di1], 4(%[in_pos])            \n\t"
+            "lwc1       %[di2], 8(%[in_pos])            \n\t"
+            "lwc1       %[di3], 12(%[in_pos])           \n\t"
+            "abs.s      %[di0], %[di0]                  \n\t"
+            "abs.s      %[di1], %[di1]                  \n\t"
+            "abs.s      %[di2], %[di2]                  \n\t"
+            "abs.s      %[di3], %[di3]                  \n\t"
+            "lwc1       $f0,    0(%[vec])               \n\t"
+            "lwc1       $f1,    4(%[vec])               \n\t"
+            "lwc1       $f2,    8(%[vec])               \n\t"
+            "lwc1       $f3,    12(%[vec])              \n\t"
+            "nmsub.s    %[di0], %[di0], $f0,    %[IQ]   \n\t"
+            "nmsub.s    %[di1], %[di1], $f1,    %[IQ]   \n\t"
+            "nmsub.s    %[di2], %[di2], $f2,    %[IQ]   \n\t"
+            "nmsub.s    %[di3], %[di3], $f3,    %[IQ]   \n\t"
+
+            ".set pop                                   \n\t"
+
+            : [di0]"=&f"(di0), [di1]"=&f"(di1),
+              [di2]"=&f"(di2), [di3]"=&f"(di3)
+            : [in_pos]"r"(in_pos), [vec]"r"(vec),
+              [IQ]"f"(IQ)
+            : "$f0", "$f1", "$f2", "$f3",
+              "memory"
+        );
+
+        cost += di0 * di0 + di1 * di1
+                + di2 * di2 + di3 * di3;
+    }
+    /* Optional outputs: total bits, and the summed squares of the chosen vectors scaled by IQ^2. */
+    if (bits)
+        *bits = curbits;
+    if (energy)
+        *energy = qenergy * (IQ*IQ);
+    return cost * lambda + curbits;
+}
+
+static float get_band_cost_SPAIR_mips(struct AACEncContext *s,
+                                      PutBitContext *pb, const float *in,
+                                      const float *scaled, int size, int scale_idx,
+                                      int cb, const float lambda, const float uplim,
+                                      int *bits, float *energy)
+{   /* Returns lambda * squared error + bit count for the band; s, pb and uplim are not referenced. */
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    float cost = 0;
+    float qenergy = 0.0f;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+    /* Rows selected by cb: p_bits[idx] is added to the bit total, p_codes holds two floats per idx. */
+    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
+    float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
+
+    for (i = 0; i < size; i += 4) { /* two coefficient pairs per pass */
+        const float *vec, *vec2;
+        int curidx, curidx2;
+        int   *in_int = (int   *)&in[i]; /* only dereferenced by lw inside the asm */
+        float *in_pos = (float *)&in[i];
+        float di0, di1, di2, di3;
+        int t0, t1, t2, t3, t4, t5, t6, t7;
+        /* Multiply by Q34, add ROUND_STANDARD, then convert to int. */
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                                  \n\t"
+            ".set noreorder                             \n\t"
+            /* Clamp each qc to at most 4, then negate it where in[] has its sign bit set. */
+            "ori        %[t4],  $zero,  4               \n\t" /* t4 = 4 */
+            "slt        %[t0],  %[t4],  %[qc1]          \n\t" /* t = (qc > 4) */
+            "slt        %[t1],  %[t4],  %[qc2]          \n\t"
+            "slt        %[t2],  %[t4],  %[qc3]          \n\t"
+            "slt        %[t3],  %[t4],  %[qc4]          \n\t"
+            "movn       %[qc1], %[t4],  %[t0]           \n\t" /* qc = t ? 4 : qc */
+            "movn       %[qc2], %[t4],  %[t1]           \n\t"
+            "movn       %[qc3], %[t4],  %[t2]           \n\t"
+            "movn       %[qc4], %[t4],  %[t3]           \n\t"
+            "lw         %[t0],  0(%[in_int])            \n\t" /* raw bits of in[i..i+3] */
+            "lw         %[t1],  4(%[in_int])            \n\t"
+            "lw         %[t2],  8(%[in_int])            \n\t"
+            "lw         %[t3],  12(%[in_int])           \n\t"
+            "srl        %[t0],  %[t0],  31              \n\t" /* t0..t3 = sign bit */
+            "srl        %[t1],  %[t1],  31              \n\t"
+            "srl        %[t2],  %[t2],  31              \n\t"
+            "srl        %[t3],  %[t3],  31              \n\t"
+            "subu       %[t4],  $zero,  %[qc1]          \n\t" /* t4..t7 = -qc (t4 is reused) */
+            "subu       %[t5],  $zero,  %[qc2]          \n\t"
+            "subu       %[t6],  $zero,  %[qc3]          \n\t"
+            "subu       %[t7],  $zero,  %[qc4]          \n\t"
+            "movn       %[qc1], %[t4],  %[t0]           \n\t" /* qc = sign ? -qc : qc */
+            "movn       %[qc2], %[t5],  %[t1]           \n\t"
+            "movn       %[qc3], %[t6],  %[t2]           \n\t"
+            "movn       %[qc4], %[t7],  %[t3]           \n\t"
+
+            ".set pop                                   \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+        /* Pair indices: 9 * first + second; +40 (4*9+4) offsets values up to 4 in magnitude. */
+        curidx = 9 * qc1;
+        curidx += qc2 + 40;
+
+        curidx2 = 9 * qc3;
+        curidx2 += qc4 + 40;
+
+        curbits += p_bits[curidx];
+        curbits += p_bits[curidx2];
+
+        vec     = &p_codes[curidx*2];
+        vec2    = &p_codes[curidx2*2];
+
+        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
+                +  vec2[0]*vec2[0] + vec2[1]*vec2[1];
+
+        __asm__ volatile (
+            ".set push                                  \n\t"
+            ".set noreorder                             \n\t"
+            /* di0,di1 = in[i],in[i+1] minus vec[0],vec[1] * IQ; di2,di3 = in[i+2],in[i+3] minus vec2[0],vec2[1] * IQ. */
+            "lwc1       $f0,    0(%[in_pos])            \n\t"
+            "lwc1       $f1,    0(%[vec])               \n\t"
+            "lwc1       $f2,    4(%[in_pos])            \n\t"
+            "lwc1       $f3,    4(%[vec])               \n\t"
+            "lwc1       $f4,    8(%[in_pos])            \n\t"
+            "lwc1       $f5,    0(%[vec2])              \n\t"
+            "lwc1       $f6,    12(%[in_pos])           \n\t"
+            "lwc1       $f7,    4(%[vec2])              \n\t"
+            "nmsub.s    %[di0], $f0,    $f1,    %[IQ]   \n\t"
+            "nmsub.s    %[di1], $f2,    $f3,    %[IQ]   \n\t"
+            "nmsub.s    %[di2], $f4,    $f5,    %[IQ]   \n\t"
+            "nmsub.s    %[di3], $f6,    $f7,    %[IQ]   \n\t"
+
+            ".set pop                                   \n\t"
+
+            : [di0]"=&f"(di0), [di1]"=&f"(di1),
+              [di2]"=&f"(di2), [di3]"=&f"(di3)
+            : [in_pos]"r"(in_pos), [vec]"r"(vec),
+              [vec2]"r"(vec2), [IQ]"f"(IQ)
+            : "$f0", "$f1", "$f2", "$f3",
+              "$f4", "$f5", "$f6", "$f7",
+              "memory"
+        );
+
+        cost += di0 * di0 + di1 * di1
+                + di2 * di2 + di3 * di3;
+    }
+    /* Optional outputs: total bits, and the summed squares of the chosen vectors scaled by IQ^2. */
+    if (bits)
+        *bits = curbits;
+    if (energy)
+        *energy = qenergy * (IQ*IQ);
+    return cost * lambda + curbits;
+}
+
+static float get_band_cost_UPAIR7_mips(struct AACEncContext *s,
+                                       PutBitContext *pb, const float *in,
+                                       const float *scaled, int size, int scale_idx,
+                                       int cb, const float lambda, const float uplim,
+                                       int *bits, float *energy)
+{   /* Returns lambda * squared error + bit count for the band; s, pb and uplim are not referenced. */
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    float cost = 0;
+    float qenergy = 0.0f;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+    /* Rows selected by cb: p_bits[idx] is added to the bit total, p_codes holds two floats per idx. */
+    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
+    float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
+
+    for (i = 0; i < size; i += 4) { /* two coefficient pairs per pass */
+        const float *vec, *vec2;
+        int curidx, curidx2, sign1, count1, sign2, count2; /* sign/count are written by the asm but not read afterwards here */
+        int   *in_int = (int   *)&in[i]; /* only dereferenced by lw inside the asm */
+        float *in_pos = (float *)&in[i];
+        float di0, di1, di2, di3;
+        int t0, t1, t2, t3, t4;
+        /* Multiply by Q34, add ROUND_STANDARD, then convert to int. */
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                                          \n\t"
+            ".set noreorder                                     \n\t"
+            /* Clamp each qc to at most 7; then pack sign bits of in[] for nonzero qc and count qc > 0 per pair. */
+            "ori        %[t4],      $zero,      7               \n\t" /* t4 = 7 */
+            "ori        %[sign1],   $zero,      0               \n\t" /* sign1 = 0 */
+            "ori        %[sign2],   $zero,      0               \n\t" /* sign2 = 0 */
+            "slt        %[t0],      %[t4],      %[qc1]          \n\t" /* t = (qc > 7) */
+            "slt        %[t1],      %[t4],      %[qc2]          \n\t"
+            "slt        %[t2],      %[t4],      %[qc3]          \n\t"
+            "slt        %[t3],      %[t4],      %[qc4]          \n\t"
+            "movn       %[qc1],     %[t4],      %[t0]           \n\t" /* qc = t ? 7 : qc */
+            "movn       %[qc2],     %[t4],      %[t1]           \n\t"
+            "movn       %[qc3],     %[t4],      %[t2]           \n\t"
+            "movn       %[qc4],     %[t4],      %[t3]           \n\t"
+            "lw         %[t0],      0(%[in_int])                \n\t" /* raw bits of in[i..i+3] */
+            "lw         %[t1],      4(%[in_int])                \n\t"
+            "lw         %[t2],      8(%[in_int])                \n\t"
+            "lw         %[t3],      12(%[in_int])               \n\t"
+            "slt        %[t0],      %[t0],      $zero           \n\t" /* t0 = sign bit of in[i] */
+            "movn       %[sign1],   %[t0],      %[qc1]          \n\t" /* sign1 = qc1 ? t0 : sign1 */
+            "slt        %[t2],      %[t2],      $zero           \n\t" /* t2 = sign bit of in[i+2] */
+            "movn       %[sign2],   %[t2],      %[qc3]          \n\t" /* sign2 = qc3 ? t2 : sign2 */
+            "slt        %[t1],      %[t1],      $zero           \n\t" /* t1 = sign bit of in[i+1] */
+            "sll        %[t0],      %[sign1],   1               \n\t"
+            "or         %[t0],      %[t0],      %[t1]           \n\t"
+            "movn       %[sign1],   %[t0],      %[qc2]          \n\t" /* sign1 = qc2 ? (sign1 << 1 | t1) : sign1 */
+            "slt        %[t3],      %[t3],      $zero           \n\t" /* t3 = sign bit of in[i+3] */
+            "sll        %[t0],      %[sign2],   1               \n\t"
+            "or         %[t0],      %[t0],      %[t3]           \n\t"
+            "movn       %[sign2],   %[t0],      %[qc4]          \n\t" /* sign2 = qc4 ? (sign2 << 1 | t3) : sign2 */
+            "slt        %[count1],  $zero,      %[qc1]          \n\t" /* count1 = (qc1 > 0) */
+            "slt        %[t1],      $zero,      %[qc2]          \n\t"
+            "slt        %[count2],  $zero,      %[qc3]          \n\t" /* count2 = (qc3 > 0) */
+            "slt        %[t2],      $zero,      %[qc4]          \n\t"
+            "addu       %[count1],  %[count1],  %[t1]           \n\t" /* count1 += (qc2 > 0) */
+            "addu       %[count2],  %[count2],  %[t2]           \n\t" /* count2 += (qc4 > 0) */
+
+            ".set pop                                           \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
+              [sign2]"=&r"(sign2), [count2]"=&r"(count2),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+        /* Pair indices: 8 * first + second. */
+        curidx = 8 * qc1;
+        curidx += qc2;
+
+        curidx2 = 8 * qc3;
+        curidx2 += qc4;
+
+        curbits += p_bits[curidx];
+        curbits += upair7_sign_bits[curidx]; /* additional bits looked up per index */
+        vec     = &p_codes[curidx*2];
+
+        curbits += p_bits[curidx2];
+        curbits += upair7_sign_bits[curidx2];
+        vec2    = &p_codes[curidx2*2];
+
+        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
+                +  vec2[0]*vec2[0] + vec2[1]*vec2[1];
+
+        __asm__ volatile (
+            ".set push                                          \n\t"
+            ".set noreorder                                     \n\t"
+            /* diN = |in[i+N]| - v * IQ with v = vec[0], vec[1], vec2[0], vec2[1] for N = 0..3. */
+            "lwc1       %[di0],     0(%[in_pos])                \n\t"
+            "lwc1       %[di1],     4(%[in_pos])                \n\t"
+            "lwc1       %[di2],     8(%[in_pos])                \n\t"
+            "lwc1       %[di3],     12(%[in_pos])               \n\t"
+            "abs.s      %[di0],     %[di0]                      \n\t"
+            "abs.s      %[di1],     %[di1]                      \n\t"
+            "abs.s      %[di2],     %[di2]                      \n\t"
+            "abs.s      %[di3],     %[di3]                      \n\t"
+            "lwc1       $f0,        0(%[vec])                   \n\t"
+            "lwc1       $f1,        4(%[vec])                   \n\t"
+            "lwc1       $f2,        0(%[vec2])                  \n\t"
+            "lwc1       $f3,        4(%[vec2])                  \n\t"
+            "nmsub.s    %[di0],     %[di0],     $f0,    %[IQ]   \n\t"
+            "nmsub.s    %[di1],     %[di1],     $f1,    %[IQ]   \n\t"
+            "nmsub.s    %[di2],     %[di2],     $f2,    %[IQ]   \n\t"
+            "nmsub.s    %[di3],     %[di3],     $f3,    %[IQ]   \n\t"
+
+            ".set pop                                           \n\t"
+
+            : [di0]"=&f"(di0), [di1]"=&f"(di1),
+              [di2]"=&f"(di2), [di3]"=&f"(di3)
+            : [in_pos]"r"(in_pos), [vec]"r"(vec),
+              [vec2]"r"(vec2), [IQ]"f"(IQ)
+            : "$f0", "$f1", "$f2", "$f3",
+              "memory"
+        );
+
+        cost += di0 * di0 + di1 * di1
+                + di2 * di2 + di3 * di3;
+    }
+    /* Optional outputs: total bits, and the summed squares of the chosen vectors scaled by IQ^2. */
+    if (bits)
+        *bits = curbits;
+    if (energy)
+        *energy = qenergy * (IQ*IQ);
+    return cost * lambda + curbits;
+}
+
+static float get_band_cost_UPAIR12_mips(struct AACEncContext *s,
+                                        PutBitContext *pb, const float *in,
+                                        const float *scaled, int size, int scale_idx,
+                                        int cb, const float lambda, const float uplim,
+                                        int *bits, float *energy)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    int i;
+    float cost = 0;
+    float qenergy = 0.0f;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+
+    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
+    float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
+
+    for (i = 0; i < size; i += 4) {
+        const float *vec, *vec2;
+        int curidx, curidx2;
+        int sign1, count1, sign2, count2;
+        int   *in_int = (int   *)&in[i];
+        float *in_pos = (float *)&in[i];
+        float di0, di1, di2, di3;
+        int t0, t1, t2, t3, t4;
+
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+
+        __asm__ volatile (
+            ".set push                                          \n\t"
+            ".set noreorder                                     \n\t"
+
+            "ori        %[t4],      $zero,      12              \n\t"
+            "ori        %[sign1],   $zero,      0               \n\t"
+            "ori        %[sign2],   $zero,      0               \n\t"
+            "slt        %[t0],      %[t4],      %[qc1]          \n\t"
+            "slt        %[t1],      %[t4],      %[qc2]          \n\t"
+            "slt        %[t2],      %[t4],      %[qc3]          \n\t"
+            "slt        %[t3],      %[t4],      %[qc4]          \n\t"
+            "movn       %[qc1],     %[t4],      %[t0]           \n\t"
+            "movn       %[qc2],     %[t4],      %[t1]           \n\t"
+            "movn       %[qc3],     %[t4],      %[t2]           \n\t"
+            "movn       %[qc4],     %[t4],      %[t3]           \n\t"
+            "lw         %[t0],      0(%[in_int])                \n\t"
+            "lw         %[t1],      4(%[in_int])                \n\t"
+            "lw         %[t2],      8(%[in_int])                \n\t"
+            "lw         %[t3],      12(%[in_int])               \n\t"
+            "slt        %[t0],      %[t0],      $zero           \n\t"
+            "movn       %[sign1],   %[t0],      %[qc1]          \n\t"
+            "slt        %[t2],      %[t2],      $zero           \n\t"
+            "movn       %[sign2],   %[t2],      %[qc3]          \n\t"
+            "slt        %[t1],      %[t1],      $zero           \n\t"
+            "sll        %[t0],      %[sign1],   1               \n\t"
+            "or         %[t0],      %[t0],      %[t1]           \n\t"
+            "movn       %[sign1],   %[t0],      %[qc2]          \n\t"
+            "slt        %[t3],      %[t3],      $zero           \n\t"
+            "sll        %[t0],      %[sign2],   1               \n\t"
+            "or         %[t0],      %[t0],      %[t3]           \n\t"
+            "movn       %[sign2],   %[t0],      %[qc4]          \n\t"
+            "slt        %[count1],  $zero,      %[qc1]          \n\t"
+            "slt        %[t1],      $zero,      %[qc2]          \n\t"
+            "slt        %[count2],  $zero,      %[qc3]          \n\t"
+            "slt        %[t2],      $zero,      %[qc4]          \n\t"
+            "addu       %[count1],  %[count1],  %[t1]           \n\t"
+            "addu       %[count2],  %[count2],  %[t2]           \n\t"
+
+            ".set pop                                           \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
+              [sign2]"=&r"(sign2), [count2]"=&r"(count2),
+              [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
+              [t4]"=&r"(t4)
+            : [in_int]"r"(in_int)
+            : "memory"
+        );
+
+        curidx = 13 * qc1;
+        curidx += qc2;
+
+        curidx2 = 13 * qc3;
+        curidx2 += qc4;
+
+        curbits += p_bits[curidx];
+        curbits += p_bits[curidx2];
+        curbits += upair12_sign_bits[curidx];
+        curbits += upair12_sign_bits[curidx2];
+        vec     = &p_codes[curidx*2];
+        vec2    = &p_codes[curidx2*2];
+
+        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
+                +  vec2[0]*vec2[0] + vec2[1]*vec2[1];
+
+        __asm__ volatile (
+            ".set push                                          \n\t"
+            ".set noreorder                                     \n\t"
+
+            "lwc1       %[di0],     0(%[in_pos])                \n\t"
+            "lwc1       %[di1],     4(%[in_pos])                \n\t"
+            "lwc1       %[di2],     8(%[in_pos])                \n\t"
+            "lwc1       %[di3],     12(%[in_pos])               \n\t"
+            "abs.s      %[di0],     %[di0]                      \n\t"
+            "abs.s      %[di1],     %[di1]                      \n\t"
+            "abs.s      %[di2],     %[di2]                      \n\t"
+            "abs.s      %[di3],     %[di3]                      \n\t"
+            "lwc1       $f0,        0(%[vec])                   \n\t"
+            "lwc1       $f1,        4(%[vec])                   \n\t"
+            "lwc1       $f2,        0(%[vec2])                  \n\t"
+            "lwc1       $f3,        4(%[vec2])                  \n\t"
+            "nmsub.s    %[di0],     %[di0],     $f0,    %[IQ]   \n\t"
+            "nmsub.s    %[di1],     %[di1],     $f1,    %[IQ]   \n\t"
+            "nmsub.s    %[di2],     %[di2],     $f2,    %[IQ]   \n\t"
+            "nmsub.s    %[di3],     %[di3],     $f3,    %[IQ]   \n\t"
+
+            ".set pop                                           \n\t"
+
+            : [di0]"=&f"(di0), [di1]"=&f"(di1),
+              [di2]"=&f"(di2), [di3]"=&f"(di3)
+            : [in_pos]"r"(in_pos), [vec]"r"(vec),
+              [vec2]"r"(vec2), [IQ]"f"(IQ)
+            : "$f0", "$f1", "$f2", "$f3",
+              "memory"
+        );
+
+        cost += di0 * di0 + di1 * di1
+                + di2 * di2 + di3 * di3;
+    }
+
+    if (bits)
+        *bits = curbits;
+    if (energy)
+        *energy = qenergy * (IQ*IQ);
+    return cost * lambda + curbits;
+}
+
+/* Band cost for the escape codebook: returns distortion * lambda + bit count, and stores
+ * the bit count in *bits and the quantized energy in *energy when those are non-NULL. */
+static float get_band_cost_ESC_mips(struct AACEncContext *s, PutBitContext *pb, const float *in,
+                                    const float *scaled, int size, int scale_idx, int cb,
+                                    const float lambda, const float uplim, int *bits, float *energy)
+{
+    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+    const float CLIPPED_ESCAPE = 165140.0f * IQ;
+    int i;
+    float cost = 0;
+    float qenergy = 0.0f;
+    int qc1, qc2, qc3, qc4;
+    int curbits = 0;
+    /* s, pb and uplim are not referenced by this routine. */
+    uint8_t *p_bits  = (uint8_t*)ff_aac_spectral_bits[cb-1];
+    float   *p_codes = (float*  )ff_aac_codebook_vectors[cb-1];
+    /* Four coefficients per pass, coded as two pairs; i advances in steps of 4. */
+    for (i = 0; i < size; i += 4) {
+        const float *vec, *vec2;
+        int curidx, curidx2;
+        float t1, t2, t3, t4, V;
+        float di1, di2, di3, di4;
+        int cond0, cond1, cond2, cond3;
+        int c1, c2, c3, c4;
+        int t6, t7;
+        /* Quantize the pre-scaled magnitudes; the float-to-int conversion truncates. */
+        qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
+        qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
+        qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
+        qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
+        /* cN = (qcN saturating-shifted left by 18) >> 18; condN = (qcN > 15); qcN > 15 is replaced by 16. */
+        __asm__ volatile (
+            ".set push                                  \n\t"
+            ".set noreorder                             \n\t"
+
+            "ori        %[t6],      $zero,  15          \n\t"
+            "ori        %[t7],      $zero,  16          \n\t"
+            "shll_s.w   %[c1],      %[qc1], 18          \n\t"
+            "shll_s.w   %[c2],      %[qc2], 18          \n\t"
+            "shll_s.w   %[c3],      %[qc3], 18          \n\t"
+            "shll_s.w   %[c4],      %[qc4], 18          \n\t"
+            "srl        %[c1],      %[c1],  18          \n\t"
+            "srl        %[c2],      %[c2],  18          \n\t"
+            "srl        %[c3],      %[c3],  18          \n\t"
+            "srl        %[c4],      %[c4],  18          \n\t"
+            "slt        %[cond0],   %[t6],  %[qc1]      \n\t"
+            "slt        %[cond1],   %[t6],  %[qc2]      \n\t"
+            "slt        %[cond2],   %[t6],  %[qc3]      \n\t"
+            "slt        %[cond3],   %[t6],  %[qc4]      \n\t"
+            "movn       %[qc1],     %[t7],  %[cond0]    \n\t"
+            "movn       %[qc2],     %[t7],  %[cond1]    \n\t"
+            "movn       %[qc3],     %[t7],  %[cond2]    \n\t"
+            "movn       %[qc4],     %[t7],  %[cond3]    \n\t"
+
+            ".set pop                                   \n\t"
+
+            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
+              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
+              [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
+              [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
+              [c1]"=&r"(c1), [c2]"=&r"(c2),
+              [c3]"=&r"(c3), [c4]"=&r"(c4),
+              [t6]"=&r"(t6), [t7]"=&r"(t7)
+        );
+
+        /* Pair index: 17 * first + second, each component being at most 16 at this point. */
+        curidx  = 17 * qc1 + qc2;
+        curidx2 = 17 * qc3 + qc4;
+
+        curbits += p_bits[curidx];
+        curbits += esc_sign_bits[curidx];
+        vec     = &p_codes[curidx*2];
+
+        curbits += p_bits[curidx2];
+        curbits += esc_sign_bits[curidx2];
+        vec2     = &p_codes[curidx2*2];
+        /* An escaped component adds 2 * log2(cN) - 3 bits; "& -condN" zeroes the term otherwise. */
+        curbits += (av_log2(c1) * 2 - 3) & (-cond0);
+        curbits += (av_log2(c2) * 2 - 3) & (-cond1);
+        curbits += (av_log2(c3) * 2 - 3) & (-cond2);
+        curbits += (av_log2(c4) * 2 - 3) & (-cond3);
+
+        t1 = fabsf(in[i  ]);
+        t2 = fabsf(in[i+1]);
+        t3 = fabsf(in[i+2]);
+        t4 = fabsf(in[i+3]);
+        /* Per component: error against the clipped constant, cN^(4/3) * IQ, or the table value * IQ. */
+        if (cond0) {
+            if (t1 >= CLIPPED_ESCAPE) {
+                di1 = t1 - CLIPPED_ESCAPE;
+                qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
+            } else {
+                di1 = t1 - (V = c1 * cbrtf(c1) * IQ);
+                qenergy += V*V;
+            }
+        } else {
+            di1 = t1 - (V = vec[0] * IQ);
+            qenergy += V*V;
+        }
+
+        if (cond1) {
+            if (t2 >= CLIPPED_ESCAPE) {
+                di2 = t2 - CLIPPED_ESCAPE;
+                qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
+            } else {
+                di2 = t2 - (V = c2 * cbrtf(c2) * IQ);
+                qenergy += V*V;
+            }
+        } else {
+            di2 = t2 - (V = vec[1] * IQ);
+            qenergy += V*V;
+        }
+
+        if (cond2) {
+            if (t3 >= CLIPPED_ESCAPE) {
+                di3 = t3 - CLIPPED_ESCAPE;
+                qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
+            } else {
+                di3 = t3 - (V = c3 * cbrtf(c3) * IQ);
+                qenergy += V*V;
+            }
+        } else {
+            di3 = t3 - (V = vec2[0] * IQ);
+            qenergy += V*V;
+        }
+
+        if (cond3) {
+            if (t4 >= CLIPPED_ESCAPE) {
+                di4 = t4 - CLIPPED_ESCAPE;
+                qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
+            } else {
+                di4 = t4 - (V = c4 * cbrtf(c4) * IQ);
+                qenergy += V*V;
+            }
+        } else {
+            di4 = t4 - (V = vec2[1]*IQ);
+            qenergy += V*V;
+        }
+
+        cost += di1 * di1 + di2 * di2
+                + di3 * di3 + di4 * di4;
+    }
+
+    if (bits)
+        *bits = curbits;
+    if (energy)
+        *energy = qenergy; /* every V above already carries IQ, so no IQ*IQ rescale is applied */
+    return cost * lambda + curbits;
+}
+
+static float (*const get_band_cost_arr[])(struct AACEncContext *s,
+                                          PutBitContext *pb, const float *in,
+                                          const float *scaled, int size, int scale_idx,
+                                          int cb, const float lambda, const float uplim,
+                                          int *bits, float *energy) = { /* indexed by codebook number cb */
+    get_band_cost_ZERO_mips, /* cb 0 */
+    get_band_cost_SQUAD_mips, /* cb 1 */
+    get_band_cost_SQUAD_mips, /* cb 2 */
+    get_band_cost_UQUAD_mips, /* cb 3 */
+    get_band_cost_UQUAD_mips, /* cb 4 */
+    get_band_cost_SPAIR_mips, /* cb 5 */
+    get_band_cost_SPAIR_mips, /* cb 6 */
+    get_band_cost_UPAIR7_mips, /* cb 7 */
+    get_band_cost_UPAIR7_mips, /* cb 8 */
+    get_band_cost_UPAIR12_mips, /* cb 9 */
+    get_band_cost_UPAIR12_mips, /* cb 10 */
+    get_band_cost_ESC_mips, /* cb 11 */
+    get_band_cost_NONE_mips, /* cb 12 doesn't exist */
+    get_band_cost_ZERO_mips, /* cb 13 */
+    get_band_cost_ZERO_mips, /* cb 14 */
+    get_band_cost_ZERO_mips, /* cb 15 */
+};
+
+/* Route a band-cost query to the routine registered for codebook cb in get_band_cost_arr.
+ * Note that cb is expanded twice: once as the table index and once as an argument. */
+#define get_band_cost(s, pb, in, scaled, size, scale_idx, cb,           \
+                      lambda, uplim, bits, energy)                      \
+    get_band_cost_arr[cb](s, pb, in, scaled, size, scale_idx, cb,       \
+                          lambda, uplim, bits, energy)
+
+/* Cost-only wrapper: dispatches on cb with a NULL PutBitContext; rtz is accepted but unused. */
+static float quantize_band_cost(struct AACEncContext *s, const float *in, const float *scaled,
+                                int size, int scale_idx, int cb, const float lambda,
+                                const float uplim, int *bits, float *energy, int rtz)
+{
+    return get_band_cost_arr[cb](s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits, energy);
+}
+
+#include "libavcodec/aacenc_quantization_misc.h"
+
+#include "libavcodec/aaccoder_twoloop.h"
+
+static void search_for_ms_mips(AACEncContext *s, ChannelElement *cpe)
+{ /* Per-band choice between L/R and mid/side coding; writes cpe->ms_mask and, on M/S, sf_idx/band_type. */
+    int start = 0, i, w, w2, g, sid_sf_boost, prev_mid, prev_side;
+    uint8_t nextband0[128], nextband1[128];
+    float M[128], S[128];
+    float *L34 = s->scoefs, *R34 = s->scoefs + 128, *M34 = s->scoefs + 128*2, *S34 = s->scoefs + 128*3;
+    const float lambda = s->lambda;
+    const float mslambda = FFMIN(1.0f, lambda / 120.f);
+    SingleChannelElement *sce0 = &cpe->ch[0];
+    SingleChannelElement *sce1 = &cpe->ch[1];
+    if (!cpe->common_window)
+        return;
+
+    /** Scout out next nonzero bands */
+    ff_init_nextband_map(sce0, nextband0);
+    ff_init_nextband_map(sce1, nextband1);
+    /* prev_mid/prev_side: sf_idx of the latest band per channel that passed the checks at the loop tail. */
+    prev_mid = sce0->sf_idx[0];
+    prev_side = sce1->sf_idx[0];
+    for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
+        start = 0;
+        for (g = 0;  g < sce0->ics.num_swb; g++) {
+            float bmax = bval2bmax(g * 17.0f / sce0->ics.num_swb) / 0.0045f;
+            if (!cpe->is_mask[w*16+g])
+                cpe->ms_mask[w*16+g] = 0;
+            if (!sce0->zeroes[w*16+g] && !sce1->zeroes[w*16+g] && !cpe->is_mask[w*16+g]) {
+                float Mmax = 0.0f, Smax = 0.0f;
+
+                /* Must compute mid/side SF and book for the whole window group */
+                for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
+                    for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
+                        M[i] = (sce0->coeffs[start+(w+w2)*128+i]
+                              + sce1->coeffs[start+(w+w2)*128+i]) * 0.5;
+                        S[i] =  M[i]
+                              - sce1->coeffs[start+(w+w2)*128+i]; /* == (L - R) / 2 */
+                    }
+                    abs_pow34_v(M34, M, sce0->ics.swb_sizes[g]);
+                    abs_pow34_v(S34, S, sce0->ics.swb_sizes[g]);
+                    for (i = 0; i < sce0->ics.swb_sizes[g]; i++ ) {
+                        Mmax = FFMAX(Mmax, M34[i]);
+                        Smax = FFMAX(Smax, S34[i]);
+                    }
+                }
+                /* Try the side scalefactor at minidx, minidx - 3, minidx - 6 and minidx - 9 (before clipping). */
+                for (sid_sf_boost = 0; sid_sf_boost < 4; sid_sf_boost++) {
+                    float dist1 = 0.0f, dist2 = 0.0f;
+                    int B0 = 0, B1 = 0;
+                    int minidx;
+                    int mididx, sididx;
+                    int midcb, sidcb;
+
+                    minidx = FFMIN(sce0->sf_idx[w*16+g], sce1->sf_idx[w*16+g]);
+                    mididx = av_clip(minidx, 0, SCALE_MAX_POS - SCALE_DIV_512);
+                    sididx = av_clip(minidx - sid_sf_boost * 3, 0, SCALE_MAX_POS - SCALE_DIV_512);
+                    if (sce0->band_type[w*16+g] != NOISE_BT && sce1->band_type[w*16+g] != NOISE_BT
+                        && (   !ff_sfdelta_can_replace(sce0, nextband0, prev_mid, mididx, w*16+g)
+                            || !ff_sfdelta_can_replace(sce1, nextband1, prev_side, sididx, w*16+g))) {
+                        /* scalefactor range violation, bad stuff, will decrease quality unacceptably */
+                        continue;
+                    }
+
+                    midcb = find_min_book(Mmax, mididx);
+                    sidcb = find_min_book(Smax, sididx);
+
+                    /* No CB can be zero */
+                    midcb = FFMAX(1,midcb);
+                    sidcb = FFMAX(1,sidcb);
+
+                    for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
+                        FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
+                        FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
+                        float minthr = FFMIN(band0->threshold, band1->threshold);
+                        int b1,b2,b3,b4;
+                        for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
+                            M[i] = (sce0->coeffs[start+(w+w2)*128+i]
+                                  + sce1->coeffs[start+(w+w2)*128+i]) * 0.5;
+                            S[i] =  M[i]
+                                  - sce1->coeffs[start+(w+w2)*128+i];
+                        }
+                        /* abs_pow34_v() fills the four scratch buffers passed as "scaled" to the cost calls below. */
+                        abs_pow34_v(L34, sce0->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
+                        abs_pow34_v(R34, sce1->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
+                        abs_pow34_v(M34, M,                         sce0->ics.swb_sizes[g]);
+                        abs_pow34_v(S34, S,                         sce0->ics.swb_sizes[g]);
+                        dist1 += quantize_band_cost(s, &sce0->coeffs[start + (w+w2)*128],
+                                                    L34,
+                                                    sce0->ics.swb_sizes[g],
+                                                    sce0->sf_idx[w*16+g],
+                                                    sce0->band_type[w*16+g],
+                                                    lambda / band0->threshold, INFINITY, &b1, NULL, 0);
+                        dist1 += quantize_band_cost(s, &sce1->coeffs[start + (w+w2)*128],
+                                                    R34,
+                                                    sce1->ics.swb_sizes[g],
+                                                    sce1->sf_idx[w*16+g],
+                                                    sce1->band_type[w*16+g],
+                                                    lambda / band1->threshold, INFINITY, &b2, NULL, 0);
+                        dist2 += quantize_band_cost(s, M,
+                                                    M34,
+                                                    sce0->ics.swb_sizes[g],
+                                                    mididx,
+                                                    midcb,
+                                                    lambda / minthr, INFINITY, &b3, NULL, 0);
+                        dist2 += quantize_band_cost(s, S,
+                                                    S34,
+                                                    sce1->ics.swb_sizes[g],
+                                                    sididx,
+                                                    sidcb,
+                                                    mslambda / (minthr * bmax), INFINITY, &b4, NULL, 0);
+                        B0 += b1+b2;
+                        B1 += b3+b4;
+                        dist1 -= b1+b2; /* drop the bit counts that the cost functions add to their return value */
+                        dist2 -= b3+b4;
+                    }
+                    cpe->ms_mask[w*16+g] = dist2 <= dist1 && B1 < B0; /* M/S needs no higher cost and strictly fewer bits */
+                    if (cpe->ms_mask[w*16+g]) {
+                        if (sce0->band_type[w*16+g] != NOISE_BT && sce1->band_type[w*16+g] != NOISE_BT) {
+                            sce0->sf_idx[w*16+g] = mididx;
+                            sce1->sf_idx[w*16+g] = sididx;
+                            sce0->band_type[w*16+g] = midcb;
+                            sce1->band_type[w*16+g] = sidcb;
+                        } else if ((sce0->band_type[w*16+g] != NOISE_BT) ^ (sce1->band_type[w*16+g] != NOISE_BT)) {
+                            /* ms_mask unneeded, and it confuses some decoders */
+                            cpe->ms_mask[w*16+g] = 0;
+                        }
+                        break;
+                    } else if (B1 > B0) {
+                        /* More boost won't fix this */
+                        break;
+                    }
+                }
+            }
+            if (!sce0->zeroes[w*16+g] && sce0->band_type[w*16+g] < RESERVED_BT)
+                prev_mid = sce0->sf_idx[w*16+g];
+            if (!sce1->zeroes[w*16+g] && !cpe->is_mask[w*16+g] && sce1->band_type[w*16+g] < RESERVED_BT)
+                prev_side = sce1->sf_idx[w*16+g];
+            start += sce0->ics.swb_sizes[g];
+        }
+    }
+}
+#endif /*HAVE_MIPSFPU */
+
+#include "libavcodec/aaccoder_trellis.h"
+
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM */
+
+void ff_aac_coder_init_mips(AACEncContext *c) { /* hook the MIPS routines into c->coder */
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+    AACCoefficientsEncoder *const coder = c->coder;
+    const int coder_is_2 = c->options.coder == 2;
+    /* Quantizer hooks are replaced only for coder option 2; the M/S search for any option (FPU builds). */
+    if (coder_is_2) {
+        coder->quantize_and_encode_band = quantize_and_encode_band_mips;
+        coder->encode_window_bands_info = codebook_trellis_rate;
+#if HAVE_MIPSFPU
+        coder->search_for_quantizers    = search_for_quantizers_twoloop;
+#endif /* HAVE_MIPSFPU */
+    }
+#if HAVE_MIPSFPU
+    coder->search_for_ms            = search_for_ms_mips;
+#endif /* HAVE_MIPSFPU */
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/libavcodec/mips/aacdec_mips.c b/libavcodec/mips/aacdec_mips.c
new file mode 100644
index 0000000..253cdeb
--- /dev/null
+++ b/libavcodec/mips/aacdec_mips.c
@@ -0,0 +1,442 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Darko Laus      (darko@mips.com)
+ *           Djordje Pesut   (djordje@mips.com)
+ *           Mirjana Vulin   (mvulin@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/aacdec.c
+ */
+
+#include "libavcodec/aac.h"
+#include "aacdec_mips.h"
+#include "libavcodec/aactab.h"
+#include "libavcodec/sinewin.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+static av_always_inline void float_copy(float *dst, const float *src, int count)
+{
+    // Copy 'count' floats from src to dst as raw 32-bit words (lw/sw), eight per loop pass
+    const float *loop_end = src + count;
+    int temp[8];
+    // temp[] only supplies eight integer scratch registers to the asm block
+    // count must be a multiple of 8, and non-zero: the loop copies first and tests src afterwards
+    av_assert2(count % 8 == 0);
+
+    // loop unrolled 8 times
+    __asm__ volatile (
+        ".set push                               \n\t"
+        ".set noreorder                          \n\t"
+    "1:                                          \n\t"
+        "lw      %[temp0],    0(%[src])          \n\t"
+        "lw      %[temp1],    4(%[src])          \n\t"
+        "lw      %[temp2],    8(%[src])          \n\t"
+        "lw      %[temp3],    12(%[src])         \n\t"
+        "lw      %[temp4],    16(%[src])         \n\t"
+        "lw      %[temp5],    20(%[src])         \n\t"
+        "lw      %[temp6],    24(%[src])         \n\t"
+        "lw      %[temp7],    28(%[src])         \n\t"
+        PTR_ADDIU "%[src],    %[src],      32    \n\t"
+        "sw      %[temp0],    0(%[dst])          \n\t"
+        "sw      %[temp1],    4(%[dst])          \n\t"
+        "sw      %[temp2],    8(%[dst])          \n\t"
+        "sw      %[temp3],    12(%[dst])         \n\t"
+        "sw      %[temp4],    16(%[dst])         \n\t"
+        "sw      %[temp5],    20(%[dst])         \n\t"
+        "sw      %[temp6],    24(%[dst])         \n\t"
+        "sw      %[temp7],    28(%[dst])         \n\t"
+        "bne     %[src],      %[loop_end], 1b    \n\t" /* loop until src reaches src + count */
+        PTR_ADDIU "%[dst],    %[dst],      32    \n\t" /* sits in the bne delay slot (noreorder) */
+        ".set pop                                \n\t"
+
+        : [temp0]"=&r"(temp[0]), [temp1]"=&r"(temp[1]),
+          [temp2]"=&r"(temp[2]), [temp3]"=&r"(temp[3]),
+          [temp4]"=&r"(temp[4]), [temp5]"=&r"(temp[5]),
+          [temp6]"=&r"(temp[6]), [temp7]"=&r"(temp[7]),
+          [src]"+r"(src), [dst]"+r"(dst)
+        : [loop_end]"r"(loop_end)
+        : "memory"
+    );
+}
+
+/* One step of a linear congruential generator; the unsigned result is returned as int bits. */
+static av_always_inline int lcg_random(unsigned previous_val)
+{
+    return (union { unsigned u; int s; }){ 1664525u * previous_val + 1013904223 }.s;
+}
+
+static void imdct_and_windowing_mips(AACContext *ac, SingleChannelElement *sce)
+{ /* IMDCT sce->coeffs into ac->buf_mdct, overlap with sce->saved into sce->ret, then refresh sce->saved. */
+    IndividualChannelStream *ics = &sce->ics;
+    float *in    = sce->coeffs;
+    float *out   = sce->ret;
+    float *saved = sce->saved;
+    const float *swindow      = ics->use_kb_window[0] ? ff_aac_kbd_short_128 : ff_sine_128;
+    const float *lwindow_prev = ics->use_kb_window[1] ? ff_aac_kbd_long_1024 : ff_sine_1024;
+    const float *swindow_prev = ics->use_kb_window[1] ? ff_aac_kbd_short_128 : ff_sine_128;
+    float *buf  = ac->buf_mdct;
+    int i;
+    /* Half-IMDCT: eight calls on 128-float strides for EIGHT_SHORT_SEQUENCE, a single call otherwise. */
+    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        for (i = 0; i < 1024; i += 128)
+            ac->mdct_small.imdct_half(&ac->mdct_small, buf + i, in + i);
+    } else
+        ac->mdct.imdct_half(&ac->mdct, buf, in);
+
+    /* window overlapping
+     * NOTE: To simplify the overlapping code, all 'meaningless' short to long
+     * and long to short transitions are considered to be short to short
+     * transitions. This leaves just two cases (long to long and short to short)
+     * with a little special sauce for EIGHT_SHORT_SEQUENCE.
+     */
+    if ((ics->window_sequence[1] == ONLY_LONG_SEQUENCE || ics->window_sequence[1] == LONG_STOP_SEQUENCE) &&
+            (ics->window_sequence[0] == ONLY_LONG_SEQUENCE || ics->window_sequence[0] == LONG_START_SEQUENCE)) {
+        ac->fdsp->vector_fmul_window(    out,               saved,            buf,         lwindow_prev, 512);
+    } else {
+        float_copy(out, saved, 448); /* out[0..447] is a straight copy of the saved samples */
+
+        if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+            {
+                float wi;
+                float wj;
+                int i; /* shadows the function-level i */
+                float temp0, temp1, temp2, temp3;
+                float *dst0 = out + 448 + 0*128;
+                float *dst1 = dst0 + 64 + 63;
+                float *dst2 = saved + 63;
+                float *win0 = (float*)swindow;
+                float *win1 = win0 + 64 + 63;
+                float *win0_prev = (float*)swindow_prev;
+                float *win1_prev = win0_prev + 64 + 63;
+                float *src0_prev = saved + 448;
+                float *src1_prev = buf + 0*128 + 63;
+                float *src0 = buf + 0*128 + 64;
+                float *src1 = buf + 1*128 + 63;
+                /* Each pass handles tap i of five 64-tap overlaps: the first uses the *_prev window, the rest swindow. */
+                for(i = 0; i < 64; i++)
+                {
+                    temp0 = src0_prev[0];
+                    temp1 = src1_prev[0];
+                    wi = *win0_prev;
+                    wj = *win1_prev;
+                    temp2 = src0[0];
+                    temp3 = src1[0];
+                    dst0[0] = temp0 * wj - temp1 * wi;
+                    dst1[0] = temp0 * wi + temp1 * wj;
+                    /* from here on the current short window is used */
+                    wi = *win0;
+                    wj = *win1;
+
+                    temp0 = src0[128];
+                    temp1 = src1[128];
+                    dst0[128] = temp2 * wj - temp3 * wi;
+                    dst1[128] = temp2 * wi + temp3 * wj;
+
+                    temp2 = src0[256];
+                    temp3 = src1[256];
+                    dst0[256] = temp0 * wj - temp1 * wi;
+                    dst1[256] = temp0 * wi + temp1 * wj;
+                    dst0[384] = temp2 * wj - temp3 * wi;
+                    dst1[384] = temp2 * wi + temp3 * wj;
+
+                    temp0 = src0[384];
+                    temp1 = src1[384];
+                    dst0[512] = temp0 * wj - temp1 * wi;
+                    dst2[0] = temp0 * wi + temp1 * wj; /* mirrored half of the fifth overlap goes to saved[63 - i] */
+
+                    src0++;
+                    src1--;
+                    src0_prev++;
+                    src1_prev--;
+                    win0++;
+                    win1--;
+                    win0_prev++;
+                    win1_prev--;
+                    dst0++;
+                    dst1--;
+                    dst2--;
+                }
+            }
+        } else {
+            ac->fdsp->vector_fmul_window(out + 448,         saved + 448,      buf,         swindow_prev, 64);
+            float_copy(out + 576, buf + 64, 448);
+        }
+    }
+
+    // buffer update
+    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        ac->fdsp->vector_fmul_window(saved + 64,  buf + 4*128 + 64, buf + 5*128, swindow, 64);
+        ac->fdsp->vector_fmul_window(saved + 192, buf + 5*128 + 64, buf + 6*128, swindow, 64);
+        ac->fdsp->vector_fmul_window(saved + 320, buf + 6*128 + 64, buf + 7*128, swindow, 64);
+        float_copy(saved + 448, buf + 7*128 + 64, 64);
+    } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) {
+        float_copy(saved, buf + 512, 448);
+        float_copy(saved + 448, buf + 7*128 + 64, 64);
+    } else { // LONG_STOP or ONLY_LONG
+        float_copy(saved, buf + 512, 512);
+    }
+}
+
+/* Long-term prediction for non-short windows: rebuilds the predicted time signal from
+ * sce->ltp_state, transforms it and adds it to sce->coeffs for the bands set in ltp->used. */
+static void apply_ltp_mips(AACContext *ac, SingleChannelElement *sce)
+{
+    const LongTermPrediction *ltp = &sce->ics.ltp;
+    const uint16_t *offsets = sce->ics.swb_offset;
+    int i, j, k, sfb;
+
+    if (sce->ics.window_sequence[0] != EIGHT_SHORT_SEQUENCE) {
+        float *predTime = sce->ret;
+        float *predFreq = ac->buf_mdct;
+        float *p_predTime;
+        int16_t num_samples = 2048;
+
+        if (ltp->lag < 1024)
+            num_samples = ltp->lag + 1024;
+        j = (2048 - num_samples) >> 2; /* groups of four floats left to clear */
+        k = (2048 - num_samples) & 3;  /* leftover floats after those groups */
+        p_predTime = &predTime[num_samples];
+        /* Scale the history into predTime, then clear predTime[num_samples..2047] by storing $0 words. */
+        for (i = 0; i < num_samples; i++)
+            predTime[i] = sce->ltp_state[i + 2048 - ltp->lag] * ltp->coef;
+        for (i = 0; i < j; i++) {
+            /* loop unrolled 4 times */
+            __asm__ volatile (
+                "sw      $0,              0(%[p_predTime])        \n\t"
+                "sw      $0,              4(%[p_predTime])        \n\t"
+                "sw      $0,              8(%[p_predTime])        \n\t"
+                "sw      $0,              12(%[p_predTime])       \n\t"
+                PTR_ADDIU "%[p_predTime], %[p_predTime],     16   \n\t"
+
+                : [p_predTime]"+r"(p_predTime)
+                :
+                : "memory"
+            );
+        }
+        for (i = 0; i < k; i++) {
+            /* leftover floats, one word store each */
+            __asm__ volatile (
+                "sw      $0,              0(%[p_predTime])        \n\t"
+                PTR_ADDIU "%[p_predTime], %[p_predTime],     4    \n\t"
+
+                : [p_predTime]"+r"(p_predTime)
+                :
+                : "memory"
+            );
+        }
+
+        ac->windowing_and_mdct_ltp(ac, predFreq, predTime, &sce->ics);
+
+        if (sce->tns.present)
+            ac->apply_tns(predFreq, &sce->tns, &sce->ics, 0);
+
+        for (sfb = 0; sfb < FFMIN(sce->ics.max_sfb, MAX_LTP_LONG_SFB); sfb++)
+            if (ltp->used[sfb])
+                for (i = offsets[sfb]; i < offsets[sfb + 1]; i++)
+                    sce->coeffs[i] += predFreq[i];
+    }
+}
+
+#if HAVE_MIPSFPU
+static av_always_inline void fmul_and_reverse(float *dst, const float *src0, const float *src1, int count)
+{
+    /* Multiply 'count' floats in src0 by src1 and store the results in dst in reverse */
+    /* This should be equivalent to a normal fmul, followed by reversing dst */
+
+    // count must be a multiple of 4
+    av_assert2(count % 4 == 0);
+
+    // move src0 and src1 to the last element of their arrays
+    src0 += count - 1;
+    src1 += count - 1;
+
+    for (; count > 0; count -= 4){
+        float temp[12];
+        /* temp[0..3]: src0 read downwards, temp[4..7]: src1 read downwards, temp[8..11]: their products */
+        /* loop unrolled 4 times */
+        __asm__ volatile (
+            "lwc1    %[temp0],    0(%[ptr2])                \n\t"
+            "lwc1    %[temp1],    -4(%[ptr2])               \n\t"
+            "lwc1    %[temp2],    -8(%[ptr2])               \n\t"
+            "lwc1    %[temp3],    -12(%[ptr2])              \n\t"
+            "lwc1    %[temp4],    0(%[ptr3])                \n\t"
+            "lwc1    %[temp5],    -4(%[ptr3])               \n\t"
+            "lwc1    %[temp6],    -8(%[ptr3])               \n\t"
+            "lwc1    %[temp7],    -12(%[ptr3])              \n\t"
+            "mul.s   %[temp8],    %[temp0],     %[temp4]    \n\t"
+            "mul.s   %[temp9],    %[temp1],     %[temp5]    \n\t"
+            "mul.s   %[temp10],   %[temp2],     %[temp6]    \n\t"
+            "mul.s   %[temp11],   %[temp3],     %[temp7]    \n\t"
+            "swc1    %[temp8],    0(%[ptr1])                \n\t"
+            "swc1    %[temp9],    4(%[ptr1])                \n\t"
+            "swc1    %[temp10],   8(%[ptr1])                \n\t"
+            "swc1    %[temp11],   12(%[ptr1])               \n\t"
+            PTR_ADDIU "%[ptr1],   %[ptr1],      16          \n\t"
+            PTR_ADDIU "%[ptr2],   %[ptr2],      -16         \n\t"
+            PTR_ADDIU "%[ptr3],   %[ptr3],      -16         \n\t"
+            /* dst moves up 16 bytes per pass while src0 and src1 move down 16 bytes */
+            : [temp0]"=&f"(temp[0]), [temp1]"=&f"(temp[1]),
+              [temp2]"=&f"(temp[2]), [temp3]"=&f"(temp[3]),
+              [temp4]"=&f"(temp[4]), [temp5]"=&f"(temp[5]),
+              [temp6]"=&f"(temp[6]), [temp7]"=&f"(temp[7]),
+              [temp8]"=&f"(temp[8]), [temp9]"=&f"(temp[9]),
+              [temp10]"=&f"(temp[10]), [temp11]"=&f"(temp[11]),
+              [ptr1]"+r"(dst), [ptr2]"+r"(src0), [ptr3]"+r"(src1)
+            :
+            : "memory"
+        );
+    }
+}
+
+static void update_ltp_mips(AACContext *ac, SingleChannelElement *sce)
+{
+    IndividualChannelStream *ics = &sce->ics;
+    float *saved     = sce->saved;
+    float *saved_ltp = sce->coeffs;
+    const float *lwindow = ics->use_kb_window[0] ? ff_aac_kbd_long_1024 : ff_sine_1024;
+    const float *swindow = ics->use_kb_window[0] ? ff_aac_kbd_short_128 : ff_sine_128;
+    float temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    /* Fill saved_ltp (sce->coeffs, used as scratch) according to the window sequence, then push it into sce->ltp_state */
+    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
+        float *p_saved_ltp = saved_ltp + 576;
+        float *loop_end1 = p_saved_ltp + 448;
+
+        float_copy(saved_ltp, saved, 512);
+
+        /* zero saved_ltp[576..1023]; loop unrolled 8 times */
+        __asm__ volatile (
+        "1:                                                   \n\t"
+            "sw     $0,              0(%[p_saved_ltp])        \n\t"
+            "sw     $0,              4(%[p_saved_ltp])        \n\t"
+            "sw     $0,              8(%[p_saved_ltp])        \n\t"
+            "sw     $0,              12(%[p_saved_ltp])       \n\t"
+            "sw     $0,              16(%[p_saved_ltp])       \n\t"
+            "sw     $0,              20(%[p_saved_ltp])       \n\t"
+            "sw     $0,              24(%[p_saved_ltp])       \n\t"
+            "sw     $0,              28(%[p_saved_ltp])       \n\t"
+            PTR_ADDIU "%[p_saved_ltp],%[p_saved_ltp],    32   \n\t"
+            "bne    %[p_saved_ltp],  %[loop_end1],       1b   \n\t"
+
+            : [p_saved_ltp]"+r"(p_saved_ltp)
+            : [loop_end1]"r"(loop_end1)
+            : "memory"
+        );
+
+        ac->fdsp->vector_fmul_reverse(saved_ltp + 448, ac->buf_mdct + 960,     &swindow[64],      64);
+        fmul_and_reverse(saved_ltp + 512, ac->buf_mdct + 960, swindow, 64);
+    } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) {
+        float *buff0 = saved;
+        float *buff1 = saved_ltp;
+        float *loop_end = saved + 448;
+
+        /* copy saved[0..447] to saved_ltp[0..447] and zero saved_ltp[576..1023] (2304 bytes = 576 floats); unrolled 8 times */
+        __asm__ volatile (
+            ".set push                                  \n\t"
+            ".set noreorder                             \n\t"
+        "1:                                             \n\t"
+            "lw      %[temp0],    0(%[src])             \n\t"
+            "lw      %[temp1],    4(%[src])             \n\t"
+            "lw      %[temp2],    8(%[src])             \n\t"
+            "lw      %[temp3],    12(%[src])            \n\t"
+            "lw      %[temp4],    16(%[src])            \n\t"
+            "lw      %[temp5],    20(%[src])            \n\t"
+            "lw      %[temp6],    24(%[src])            \n\t"
+            "lw      %[temp7],    28(%[src])            \n\t"
+            PTR_ADDIU "%[src],    %[src],         32    \n\t"
+            "sw      %[temp0],    0(%[dst])             \n\t"
+            "sw      %[temp1],    4(%[dst])             \n\t"
+            "sw      %[temp2],    8(%[dst])             \n\t"
+            "sw      %[temp3],    12(%[dst])            \n\t"
+            "sw      %[temp4],    16(%[dst])            \n\t"
+            "sw      %[temp5],    20(%[dst])            \n\t"
+            "sw      %[temp6],    24(%[dst])            \n\t"
+            "sw      %[temp7],    28(%[dst])            \n\t"
+            "sw      $0,          2304(%[dst])          \n\t"
+            "sw      $0,          2308(%[dst])          \n\t"
+            "sw      $0,          2312(%[dst])          \n\t"
+            "sw      $0,          2316(%[dst])          \n\t"
+            "sw      $0,          2320(%[dst])          \n\t"
+            "sw      $0,          2324(%[dst])          \n\t"
+            "sw      $0,          2328(%[dst])          \n\t"
+            "sw      $0,          2332(%[dst])          \n\t"
+            "bne     %[src],      %[loop_end],    1b    \n\t"
+            PTR_ADDIU "%[dst],    %[dst],         32    \n\t" /* branch delay slot: executed on every pass */
+            ".set pop                                   \n\t"
+
+            : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+              [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+              [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+              [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
+              [src]"+r"(buff0), [dst]"+r"(buff1)
+            : [loop_end]"r"(loop_end)
+            : "memory"
+        );
+        ac->fdsp->vector_fmul_reverse(saved_ltp + 448, ac->buf_mdct + 960,     &swindow[64],      64);
+        fmul_and_reverse(saved_ltp + 512, ac->buf_mdct + 960, swindow, 64);
+    } else { // LONG_STOP or ONLY_LONG
+        ac->fdsp->vector_fmul_reverse(saved_ltp,       ac->buf_mdct + 512,     &lwindow[512],     512);
+        fmul_and_reverse(saved_ltp + 512, ac->buf_mdct + 512, lwindow, 512);
+    }
+    /* slide ltp_state down by 1024 floats, then append sce->ret and saved_ltp (presumably float_copy(dst, src, n) copies n floats; verify) */
+    float_copy(sce->ltp_state, sce->ltp_state + 1024, 1024);
+    float_copy(sce->ltp_state + 1024, sce->ret, 1024);
+    float_copy(sce->ltp_state + 2048, saved_ltp, 1024);
+}
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+
+void ff_aacdec_init_mips(AACContext *c)
+{   /* Point the context's function pointers at the MIPS versions; a no-op without inline asm support */
+#if HAVE_INLINE_ASM
+    c->imdct_and_windowing         = imdct_and_windowing_mips;
+    c->apply_ltp                   = apply_ltp_mips;
+#if HAVE_MIPSFPU
+    c->update_ltp                  = update_ltp_mips; /* only defined when an FPU is available */
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/libavcodec/mips/aacdec_mips.h b/libavcodec/mips/aacdec_mips.h
new file mode 100644
index 0000000..758266f
--- /dev/null
+++ b/libavcodec/mips/aacdec_mips.h
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Darko Laus      (darko@mips.com)
+ *           Djordje Pesut   (djordje@mips.com)
+ *           Mirjana Vulin   (mvulin@mips.com)
+ *
+ * AAC decoding functions optimized for MIPS
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/aacdec.c
+ */
+
+#ifndef AVCODEC_MIPS_AACDEC_MIPS_H
+#define AVCODEC_MIPS_AACDEC_MIPS_H
+
+#include "libavcodec/aac.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM && HAVE_MIPSFPU
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+static inline float *VMUL2_mips(float *dst, const float *v, unsigned idx,
+                           const float *scale)
+{
+    float temp0, temp1, temp2;
+    int temp3, temp4;
+    float *ret;
+    /* dst[0] = v[idx & 15] * *scale; dst[1] = v[(idx >> 4) & 15] * *scale; returns dst + 2 */
+    __asm__ volatile(
+        "andi    %[temp3],  %[idx],       0x0F         \n\t"
+        "andi    %[temp4],  %[idx],       0xF0         \n\t"
+        "sll     %[temp3],  %[temp3],     2            \n\t" /* low nibble  -> byte offset into v */
+        "srl     %[temp4],  %[temp4],     2            \n\t" /* high nibble -> byte offset into v */
+        "lwc1    %[temp2],  0(%[scale])                \n\t"
+        "lwxc1   %[temp0],  %[temp3](%[v])             \n\t"
+        "lwxc1   %[temp1],  %[temp4](%[v])             \n\t"
+        "mul.s   %[temp0],  %[temp0],     %[temp2]     \n\t"
+        "mul.s   %[temp1],  %[temp1],     %[temp2]     \n\t"
+        PTR_ADDIU "%[ret],  %[dst],       8            \n\t"
+        "swc1    %[temp0],  0(%[dst])                  \n\t"
+        "swc1    %[temp1],  4(%[dst])                  \n\t"
+
+        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1),
+          [temp2]"=&f"(temp2), [temp3]"=&r"(temp3),
+          [temp4]"=&r"(temp4), [ret]"=&r"(ret)
+        : [idx]"r"(idx), [scale]"r"(scale), [v]"r"(v),
+          [dst]"r"(dst)
+        : "memory"
+    );
+    return ret;
+}
+
+static inline float *VMUL4_mips(float *dst, const float *v, unsigned idx,
+                           const float *scale)
+{
+    int temp0, temp1, temp2, temp3;
+    float temp4, temp5, temp6, temp7, temp8;
+    float *ret;
+    /* dst[k] = v[(idx >> (2 * k)) & 3] * *scale for k = 0..3; returns dst + 4 */
+    __asm__ volatile(
+        "andi    %[temp0],  %[idx],       0x03        \n\t"
+        "andi    %[temp1],  %[idx],       0x0C        \n\t" /* already ((idx >> 2) & 3) * 4, so no shift is needed */
+        "andi    %[temp2],  %[idx],       0x30        \n\t"
+        "andi    %[temp3],  %[idx],       0xC0        \n\t"
+        "sll     %[temp0],  %[temp0],     2           \n\t"
+        "srl     %[temp2],  %[temp2],     2           \n\t"
+        "srl     %[temp3],  %[temp3],     4           \n\t"
+        "lwc1    %[temp4],  0(%[scale])               \n\t"
+        "lwxc1   %[temp5],  %[temp0](%[v])            \n\t"
+        "lwxc1   %[temp6],  %[temp1](%[v])            \n\t"
+        "lwxc1   %[temp7],  %[temp2](%[v])            \n\t"
+        "lwxc1   %[temp8],  %[temp3](%[v])            \n\t"
+        "mul.s   %[temp5],  %[temp5],     %[temp4]    \n\t"
+        "mul.s   %[temp6],  %[temp6],     %[temp4]    \n\t"
+        "mul.s   %[temp7],  %[temp7],     %[temp4]    \n\t"
+        "mul.s   %[temp8],  %[temp8],     %[temp4]    \n\t"
+        PTR_ADDIU "%[ret],  %[dst],       16          \n\t"
+        "swc1    %[temp5],  0(%[dst])                 \n\t"
+        "swc1    %[temp6],  4(%[dst])                 \n\t"
+        "swc1    %[temp7],  8(%[dst])                 \n\t"
+        "swc1    %[temp8],  12(%[dst])                \n\t"
+
+        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+          [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+          [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+          [temp6]"=&f"(temp6), [temp7]"=&f"(temp7),
+          [temp8]"=&f"(temp8), [ret]"=&r"(ret)
+        : [idx]"r"(idx), [scale]"r"(scale), [v]"r"(v),
+          [dst]"r"(dst)
+        : "memory"
+    );
+    return ret;
+}
+
+static inline float *VMUL2S_mips(float *dst, const float *v, unsigned idx,
+                            unsigned sign, const float *scale)
+{
+    int temp0, temp1, temp2, temp3, temp4, temp5;
+    float temp6, temp7, temp8, temp9;
+    float *ret;
+    /* dst[0] = v[idx & 15] * s0; dst[1] = v[(idx >> 4) & 15] * s1; s0/s1 are *scale with the sign bit XORed by bit 1 / bit 0 of 'sign'; returns dst + 2 */
+    __asm__ volatile(
+        "andi    %[temp0],  %[idx],       0x0F       \n\t"
+        "andi    %[temp1],  %[idx],       0xF0       \n\t"
+        "lw      %[temp4],  0(%[scale])              \n\t" /* raw bits of *scale, loaded into an integer register */
+        "srl     %[temp2],  %[sign],      1          \n\t"
+        "sll     %[temp3],  %[sign],      31         \n\t" /* bit 0 of sign -> bit 31 */
+        "sll     %[temp2],  %[temp2],     31         \n\t" /* bit 1 of sign -> bit 31 */
+        "sll     %[temp0],  %[temp0],     2          \n\t"
+        "srl     %[temp1],  %[temp1],     2          \n\t"
+        "lwxc1   %[temp8],  %[temp0](%[v])           \n\t"
+        "lwxc1   %[temp9],  %[temp1](%[v])           \n\t"
+        "xor     %[temp5],  %[temp4],     %[temp2]   \n\t"
+        "xor     %[temp4],  %[temp4],     %[temp3]   \n\t"
+        "mtc1    %[temp5],  %[temp6]                 \n\t"
+        "mtc1    %[temp4],  %[temp7]                 \n\t"
+        "mul.s   %[temp8],  %[temp8],     %[temp6]   \n\t"
+        "mul.s   %[temp9],  %[temp9],     %[temp7]   \n\t"
+        PTR_ADDIU "%[ret],  %[dst],       8          \n\t"
+        "swc1    %[temp8],  0(%[dst])                \n\t"
+        "swc1    %[temp9],  4(%[dst])                \n\t"
+
+        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+          [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+          [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+          [temp6]"=&f"(temp6), [temp7]"=&f"(temp7),
+          [temp8]"=&f"(temp8), [temp9]"=&f"(temp9),
+          [ret]"=&r"(ret)
+        : [idx]"r"(idx), [scale]"r"(scale), [v]"r"(v),
+          [dst]"r"(dst), [sign]"r"(sign)
+        : "memory"
+    );
+    return ret;
+}
+
+static inline float *VMUL4S_mips(float *dst, const float *v, unsigned idx,
+                            unsigned sign, const float *scale)
+{
+    int temp0, temp1, temp2, temp3, temp4;
+    float temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;
+    float *ret;
+    unsigned int mask = 1U << 31; /* selects the IEEE-754 sign bit */
+    /* dst[k] = v[(idx >> (2 * k)) & 3] * s_k, k = 0..3; s_k is *scale with its sign bit XORed by bit 31 of 'sign', which is shifted left by bits 12, 13, 14 of idx before outputs 1, 2, 3; returns dst + 4 */
+    __asm__ volatile(
+        "lw      %[temp0],   0(%[scale])               \n\t"
+        "andi    %[temp1],  %[idx],       0x03         \n\t"
+        "andi    %[temp2],  %[idx],       0x0C         \n\t"
+        "andi    %[temp3],  %[idx],       0x30         \n\t"
+        "andi    %[temp4],  %[idx],       0xC0         \n\t"
+        "sll     %[temp1],  %[temp1],     2            \n\t"
+        "srl     %[temp3],  %[temp3],     2            \n\t"
+        "srl     %[temp4],  %[temp4],     4            \n\t"
+        "lwxc1   %[temp10],  %[temp1](%[v])            \n\t"
+        "lwxc1   %[temp11],  %[temp2](%[v])            \n\t"
+        "lwxc1   %[temp12],  %[temp3](%[v])            \n\t"
+        "lwxc1   %[temp13],  %[temp4](%[v])            \n\t"
+        "and     %[temp1],   %[sign],      %[mask]     \n\t"
+        "srl     %[temp2],   %[idx],       12          \n\t"
+        "srl     %[temp3],   %[idx],       13          \n\t"
+        "srl     %[temp4],   %[idx],       14          \n\t"
+        "andi    %[temp2],   %[temp2],     1           \n\t"
+        "andi    %[temp3],   %[temp3],     1           \n\t"
+        "andi    %[temp4],   %[temp4],     1           \n\t"
+        "sllv    %[sign],    %[sign],      %[temp2]    \n\t"
+        "xor     %[temp1],   %[temp0],     %[temp1]    \n\t"
+        "and     %[temp2],   %[sign],      %[mask]     \n\t"
+        "mtc1    %[temp1],   %[temp14]                 \n\t"
+        "xor     %[temp2],   %[temp0],     %[temp2]    \n\t"
+        "sllv    %[sign],    %[sign],      %[temp3]    \n\t"
+        "mtc1    %[temp2],   %[temp15]                 \n\t"
+        "and     %[temp3],   %[sign],      %[mask]     \n\t"
+        "sllv    %[sign],    %[sign],      %[temp4]    \n\t"
+        "xor     %[temp3],   %[temp0],     %[temp3]    \n\t"
+        "and     %[temp4],   %[sign],      %[mask]     \n\t"
+        "mtc1    %[temp3],   %[temp16]                 \n\t"
+        "xor     %[temp4],   %[temp0],     %[temp4]    \n\t"
+        "mtc1    %[temp4],   %[temp17]                 \n\t"
+        "mul.s   %[temp10],  %[temp10],    %[temp14]   \n\t"
+        "mul.s   %[temp11],  %[temp11],    %[temp15]   \n\t"
+        "mul.s   %[temp12],  %[temp12],    %[temp16]   \n\t"
+        "mul.s   %[temp13],  %[temp13],    %[temp17]   \n\t"
+        PTR_ADDIU "%[ret],   %[dst],       16          \n\t"
+        "swc1    %[temp10],  0(%[dst])                 \n\t"
+        "swc1    %[temp11],  4(%[dst])                 \n\t"
+        "swc1    %[temp12],  8(%[dst])                 \n\t"
+        "swc1    %[temp13],  12(%[dst])                \n\t"
+
+        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+          [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+          [temp4]"=&r"(temp4), [temp10]"=&f"(temp10),
+          [temp11]"=&f"(temp11), [temp12]"=&f"(temp12),
+          [temp13]"=&f"(temp13), [temp14]"=&f"(temp14),
+          [temp15]"=&f"(temp15), [temp16]"=&f"(temp16),
+          [temp17]"=&f"(temp17), [ret]"=&r"(ret),
+          [sign]"+r"(sign)
+        : [idx]"r"(idx), [scale]"r"(scale), [v]"r"(v),
+          [dst]"r"(dst), [mask]"r"(mask)
+        : "memory"
+    );
+    return ret;
+}
+
+#define VMUL2 VMUL2_mips
+#define VMUL4 VMUL4_mips
+#define VMUL2S VMUL2S_mips
+#define VMUL4S VMUL4S_mips
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM && HAVE_MIPSFPU */
+
+#endif /* AVCODEC_MIPS_AACDEC_MIPS_H */
diff --git a/libavcodec/mips/aacpsdsp_mips.c b/libavcodec/mips/aacpsdsp_mips.c
new file mode 100644
index 0000000..83fdc2f
--- /dev/null
+++ b/libavcodec/mips/aacpsdsp_mips.c
@@ -0,0 +1,464 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Darko Laus      (darko@mips.com)
+ *           Djordje Pesut   (djordje@mips.com)
+ *           Mirjana Vulin   (mvulin@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/aacpsdsp.c
+ */
+
+#include "config.h"
+#include "libavcodec/aacpsdsp.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+static void ps_hybrid_analysis_ileave_mips(float (*out)[32][2], float L[2][38][64],
+                                        int i, int len)
+{
+    int temp0, temp1, temp2, temp3;
+    int temp4, temp5, temp6, temp7;
+    float *out1=&out[i][0][0];
+    float *L1=&L[0][0][i];
+    float *j=out1+ len*2;   /* end marker: one past the len pairs of the current out row */
+    /* out[i][n][0] = L[0][n][i], out[i][n][1] = L[1][n][i] for i in [i, 64), n in [0, len); len must be a positive multiple of 4 */
+    for (; i < 64; i++) {
+
+        /* 4 time slots per pass: 256-byte steps walk L[0][n][i]; +9728 bytes (38 * 64 floats) selects L[1] */
+        __asm__ volatile (
+        "1:                                          \n\t"
+            "lw      %[temp0],   0(%[L1])            \n\t"
+            "lw      %[temp1],   9728(%[L1])         \n\t"
+            "lw      %[temp2],   256(%[L1])          \n\t"
+            "lw      %[temp3],   9984(%[L1])         \n\t"
+            "lw      %[temp4],   512(%[L1])          \n\t"
+            "lw      %[temp5],   10240(%[L1])        \n\t"
+            "lw      %[temp6],   768(%[L1])          \n\t"
+            "lw      %[temp7],   10496(%[L1])        \n\t"
+            "sw      %[temp0],   0(%[out1])          \n\t"
+            "sw      %[temp1],   4(%[out1])          \n\t"
+            "sw      %[temp2],   8(%[out1])          \n\t"
+            "sw      %[temp3],   12(%[out1])         \n\t"
+            "sw      %[temp4],   16(%[out1])         \n\t"
+            "sw      %[temp5],   20(%[out1])         \n\t"
+            "sw      %[temp6],   24(%[out1])         \n\t"
+            "sw      %[temp7],   28(%[out1])         \n\t"
+            PTR_ADDIU "%[out1],  %[out1],      32    \n\t"
+            PTR_ADDIU "%[L1],    %[L1],        1024  \n\t"
+            "bne     %[out1],    %[j],         1b    \n\t"
+
+            : [out1]"+r"(out1), [L1]"+r"(L1), [j]"+r"(j),
+              [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+              [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+              [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+              [temp6]"=&r"(temp6), [temp7]"=&r"(temp7)
+            : [len]"r"(len)
+            : "memory"
+        );
+        out1-=(len<<1)-64;  /* undo the 2*len floats just written, then step one 64-float row forward */
+        L1-=(len<<6)-1;     /* undo len rows of 64 floats, then step to the next column */
+        j=out1+len*2;       /* follow the new row; the former j+=len*2 only matched out1's 64-float step when len == 32 */
+    }
+}
+
+static void ps_hybrid_synthesis_deint_mips(float out[2][38][64],
+                                        float (*in)[32][2],
+                                        int i, int len)
+{
+    int n;
+    int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    float *out1 = (float*)out + i;
+    float *out2 = (float*)out + 2432 + i; /* 2432 = 38 * 64 floats, i.e. the start of out[1] */
+    float *in1 = (float*)in + 64 * i;
+    float *in2 = (float*)in + 64 * i + 1;
+    /* out[0][n][i] = in[i][n][0], out[1][n][i] = in[i][n][1] for i in [i, 64). NOTE(review): len is never read; 7 * 4 + 4 = 32 values of n are always copied */
+    for (; i < 64; i++) {
+        for (n = 0; n < 7; n++) {
+
+            /* 4 time slots per pass (8 loads/stores): inputs are 8 bytes apart, output rows 256 bytes apart */
+            __asm__ volatile (
+                 "lw      %[temp0],   0(%[in1])               \n\t"
+                 "lw      %[temp1],   0(%[in2])               \n\t"
+                 "lw      %[temp2],   8(%[in1])               \n\t"
+                 "lw      %[temp3],   8(%[in2])               \n\t"
+                 "lw      %[temp4],   16(%[in1])              \n\t"
+                 "lw      %[temp5],   16(%[in2])              \n\t"
+                 "lw      %[temp6],   24(%[in1])              \n\t"
+                 "lw      %[temp7],   24(%[in2])              \n\t"
+                 PTR_ADDIU "%[out1],  %[out1],         1024   \n\t"
+                 PTR_ADDIU "%[out2],  %[out2],         1024   \n\t"
+                 PTR_ADDIU "%[in1],   %[in1],          32     \n\t"
+                 PTR_ADDIU "%[in2],   %[in2],          32     \n\t"
+                 "sw      %[temp0],   -1024(%[out1])          \n\t"
+                 "sw      %[temp1],   -1024(%[out2])          \n\t"
+                 "sw      %[temp2],   -768(%[out1])           \n\t"
+                 "sw      %[temp3],   -768(%[out2])           \n\t"
+                 "sw      %[temp4],   -512(%[out1])           \n\t"
+                 "sw      %[temp5],   -512(%[out2])           \n\t"
+                 "sw      %[temp6],   -256(%[out1])           \n\t"
+                 "sw      %[temp7],   -256(%[out2])           \n\t"
+
+                 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+                   [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+                   [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+                   [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
+                   [out1]"+r"(out1), [out2]"+r"(out2),
+                   [in1]"+r"(in1), [in2]"+r"(in2)
+                 :
+                 : "memory"
+            );
+        }
+        /* last 4 time slots; the -7164 step undoes the 7 * 1024 bytes above and leaves a net +4 bytes, i.e. the next i */
+        __asm__ volatile (
+            "lw      %[temp0],   0(%[in1])               \n\t"
+            "lw      %[temp1],   0(%[in2])               \n\t"
+            "lw      %[temp2],   8(%[in1])               \n\t"
+            "lw      %[temp3],   8(%[in2])               \n\t"
+            "lw      %[temp4],   16(%[in1])              \n\t"
+            "lw      %[temp5],   16(%[in2])              \n\t"
+            "lw      %[temp6],   24(%[in1])              \n\t"
+            "lw      %[temp7],   24(%[in2])              \n\t"
+            PTR_ADDIU "%[out1],  %[out1],        -7164   \n\t"
+            PTR_ADDIU "%[out2],  %[out2],        -7164   \n\t"
+            PTR_ADDIU "%[in1],   %[in1],         32      \n\t"
+            PTR_ADDIU "%[in2],   %[in2],         32      \n\t"
+            "sw      %[temp0],   7164(%[out1])           \n\t"
+            "sw      %[temp1],   7164(%[out2])           \n\t"
+            "sw      %[temp2],   7420(%[out1])           \n\t"
+            "sw      %[temp3],   7420(%[out2])           \n\t"
+            "sw      %[temp4],   7676(%[out1])           \n\t"
+            "sw      %[temp5],   7676(%[out2])           \n\t"
+            "sw      %[temp6],   7932(%[out1])           \n\t"
+            "sw      %[temp7],   7932(%[out2])           \n\t"
+
+            : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+              [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+              [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+              [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
+              [out1]"+r"(out1), [out2]"+r"(out2),
+              [in1]"+r"(in1), [in2]"+r"(in2)
+            :
+            : "memory"
+        );
+    }
+}
+
+#if HAVE_MIPSFPU
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+static void ps_add_squares_mips(float *dst, const float (*src)[2], int n)
+{
+    int i;
+    float temp0, temp1, temp2, temp3, temp4, temp5;
+    float temp6, temp7, temp8, temp9, temp10, temp11;
+    float *src0 = (float*)&src[0][0];
+    float *dst0 = &dst[0];
+    /* dst[k] += src[k][0]^2 + src[k][1]^2. NOTE(review): n is never read; the loop always processes 8 * 4 = 32 elements */
+    for (i = 0; i < 8; i++) {
+        /* loop unrolled 4 times */
+        __asm__ volatile (
+            "lwc1     %[temp0],    0(%[src0])                          \n\t"
+            "lwc1     %[temp1],    4(%[src0])                          \n\t"
+            "lwc1     %[temp2],    8(%[src0])                          \n\t"
+            "lwc1     %[temp3],    12(%[src0])                         \n\t"
+            "lwc1     %[temp4],    16(%[src0])                         \n\t"
+            "lwc1     %[temp5],    20(%[src0])                         \n\t"
+            "lwc1     %[temp6],    24(%[src0])                         \n\t"
+            "lwc1     %[temp7],    28(%[src0])                         \n\t"
+            "lwc1     %[temp8],    0(%[dst0])                          \n\t"
+            "lwc1     %[temp9],    4(%[dst0])                          \n\t"
+            "lwc1     %[temp10],   8(%[dst0])                          \n\t"
+            "lwc1     %[temp11],   12(%[dst0])                         \n\t"
+            "mul.s    %[temp1],    %[temp1],    %[temp1]               \n\t"
+            "mul.s    %[temp3],    %[temp3],    %[temp3]               \n\t"
+            "mul.s    %[temp5],    %[temp5],    %[temp5]               \n\t"
+            "mul.s    %[temp7],    %[temp7],    %[temp7]               \n\t"
+            "madd.s   %[temp0],    %[temp1],    %[temp0],   %[temp0]   \n\t" /* temp0 = temp0 * temp0 + temp1 */
+            "madd.s   %[temp2],    %[temp3],    %[temp2],   %[temp2]   \n\t"
+            "madd.s   %[temp4],    %[temp5],    %[temp4],   %[temp4]   \n\t"
+            "madd.s   %[temp6],    %[temp7],    %[temp6],   %[temp6]   \n\t"
+            "add.s    %[temp0],    %[temp8],    %[temp0]               \n\t"
+            "add.s    %[temp2],    %[temp9],    %[temp2]               \n\t"
+            "add.s    %[temp4],    %[temp10],   %[temp4]               \n\t"
+            "add.s    %[temp6],    %[temp11],   %[temp6]               \n\t"
+            "swc1     %[temp0],    0(%[dst0])                          \n\t"
+            "swc1     %[temp2],    4(%[dst0])                          \n\t"
+            "swc1     %[temp4],    8(%[dst0])                          \n\t"
+            "swc1     %[temp6],    12(%[dst0])                         \n\t"
+            PTR_ADDIU "%[dst0],    %[dst0],     16                     \n\t"
+            PTR_ADDIU "%[src0],    %[src0],     32                     \n\t"
+
+            : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+              [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
+              [temp9]"=&f"(temp9), [dst0]"+r"(dst0), [src0]"+r"(src0),
+              [temp10]"=&f"(temp10), [temp11]"=&f"(temp11)
+            :
+            : "memory"
+        );
+   }
+}
+
+static void ps_mul_pair_single_mips(float (*dst)[2], float (*src0)[2], float *src1,
+                                 int n)
+{   /* dst[i][0..1] = src0[i][0..1] * src1[i] for 0 <= i < n; n must be >= 1 because the loop tests at the bottom */
+    float temp0, temp1, temp2;
+    float *p_d, *p_s0, *p_s1, *end;
+    p_d = &dst[0][0];
+    p_s0 = &src0[0][0];
+    p_s1 = &src1[0];
+    end = p_s1 + n;
+    /* p_s1 must be advanced BEFORE the bne so the loop stops after exactly n elements; the last store fills the delay slot */
+    __asm__ volatile(
+        ".set push                                      \n\t"
+        ".set noreorder                                 \n\t"
+        "1:                                             \n\t"
+        "lwc1     %[temp2],   0(%[p_s1])                \n\t"
+        "lwc1     %[temp0],   0(%[p_s0])                \n\t"
+        "lwc1     %[temp1],   4(%[p_s0])                \n\t"
+        PTR_ADDIU "%[p_d],    %[p_d],       8           \n\t"
+        "mul.s    %[temp0],   %[temp0],     %[temp2]    \n\t"
+        "mul.s    %[temp1],   %[temp1],     %[temp2]    \n\t"
+        PTR_ADDIU "%[p_s0],   %[p_s0],      8           \n\t"
+        "swc1     %[temp0],   -8(%[p_d])                \n\t"
+        PTR_ADDIU "%[p_s1],   %[p_s1],      4           \n\t"
+        "bne      %[p_s1],    %[end],       1b          \n\t"
+        " swc1    %[temp1],   -4(%[p_d])                \n\t"
+        ".set pop                                       \n\t"
+
+        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1),
+          [temp2]"=&f"(temp2), [p_d]"+r"(p_d),
+          [p_s0]"+r"(p_s0), [p_s1]"+r"(p_s1)
+        : [end]"r"(end)
+        : "memory"
+    );
+}
+
+/**
+ * Run len complex samples through a phase rotation followed by three
+ * cascaded all-pass links, then scale the result by a per-sample gain.
+ *
+ * Per sample n (all values are {re, im} pairs):
+ *   - in = delay[n] rotated by phi_fract
+ *         (re = d_re * phi0 - d_im * phi1, im = d_re * phi1 + d_im * phi0);
+ *   - for each link m = 0..2, with ag[m] = a[m] * g_decay_slope:
+ *         prev       = in
+ *         in         = (stored link sample rotated by Q_fract[m]) - ag[m] * prev
+ *         link store = prev + ag[m] * in
+ *   - out[n] = transient_gain[n] * in.
+ *
+ * The link buffers are addressed with fixed byte offsets from p_ap_delay
+ * (reads at 16/304/592, writes at 40/336 and, after the pointer bump, 624).
+ * NOTE(review): these offsets presumably assume a row stride of 296 bytes,
+ * i.e. PS_QMF_TIME_SLOTS + PS_MAX_AP_DELAY == 37 -- confirm.
+ *
+ * @param out             output, len {re, im} pairs
+ * @param delay           input, len {re, im} pairs
+ * @param ap_delay        all-pass link state, read and updated in place
+ * @param phi_fract       {re, im} rotation applied to the input sample
+ * @param Q_fract         three {re, im} rotations, one per link; not advanced
+ *                        between samples
+ * @param transient_gain  len real gains
+ * @param g_decay_slope   common factor applied to the three link coefficients
+ * @param len             number of samples
+ *
+ * NOTE(review): the loop is bottom-tested on pointer equality, so it appears
+ * to require len >= 1 -- confirm that no caller passes len <= 0.
+ */
+static void ps_decorrelate_mips(float (*out)[2], float (*delay)[2],
+                             float (*ap_delay)[PS_QMF_TIME_SLOTS + PS_MAX_AP_DELAY][2],
+                             const float phi_fract[2], const float (*Q_fract)[2],
+                             const float *transient_gain,
+                             float g_decay_slope,
+                             int len)
+{
+    float *p_delay = &delay[0][0];
+    float *p_out = &out[0][0];
+    float *p_ap_delay = &ap_delay[0][0][0];
+    const float *p_t_gain = transient_gain;
+    const float *p_Q_fract = &Q_fract[0][0];
+    float ag0, ag1, ag2;
+    float phi_fract0 = phi_fract[0];
+    float phi_fract1 = phi_fract[1];
+    float temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+
+    /* two floats per sample */
+    float *p_delay_end = (p_delay + (len << 1));
+
+    /* merged 2 loops */
+    __asm__ volatile(
+        ".set    push                                                    \n\t"
+        ".set    noreorder                                               \n\t"
+        /* link coefficients, scaled once by g_decay_slope */
+        "li.s    %[ag0],        0.65143905753106                         \n\t"
+        "li.s    %[ag1],        0.56471812200776                         \n\t"
+        "li.s    %[ag2],        0.48954165955695                         \n\t"
+        "mul.s   %[ag0],        %[ag0],        %[g_decay_slope]          \n\t"
+        "mul.s   %[ag1],        %[ag1],        %[g_decay_slope]          \n\t"
+        "mul.s   %[ag2],        %[ag2],        %[g_decay_slope]          \n\t"
+    "1:                                                                  \n\t"
+        "lwc1    %[temp0],      0(%[p_delay])                            \n\t"
+        "lwc1    %[temp1],      4(%[p_delay])                            \n\t"
+        "lwc1    %[temp4],      16(%[p_ap_delay])                        \n\t"
+        "lwc1    %[temp5],      20(%[p_ap_delay])                        \n\t"
+        "mul.s   %[temp3],      %[temp0],      %[phi_fract1]             \n\t"
+        "lwc1    %[temp6],      0(%[p_Q_fract])                          \n\t"
+        "mul.s   %[temp2],      %[temp1],      %[phi_fract1]             \n\t"
+        "lwc1    %[temp7],      4(%[p_Q_fract])                          \n\t"
+        "madd.s  %[temp3],      %[temp3],      %[temp1], %[phi_fract0]   \n\t"
+        "msub.s  %[temp2],      %[temp2],      %[temp0], %[phi_fract0]   \n\t"
+        "mul.s   %[temp8],      %[temp5],      %[temp7]                  \n\t"
+        "mul.s   %[temp9],      %[temp4],      %[temp7]                  \n\t"
+        "lwc1    %[temp7],      12(%[p_Q_fract])                         \n\t"
+        "mul.s   %[temp0],      %[ag0],        %[temp2]                  \n\t"
+        "mul.s   %[temp1],      %[ag0],        %[temp3]                  \n\t"
+        "msub.s  %[temp8],      %[temp8],      %[temp4], %[temp6]        \n\t"
+        "lwc1    %[temp4],      304(%[p_ap_delay])                       \n\t"
+        "madd.s  %[temp9],      %[temp9],      %[temp5], %[temp6]        \n\t"
+        "lwc1    %[temp5],      308(%[p_ap_delay])                       \n\t"
+        "sub.s   %[temp0],      %[temp8],      %[temp0]                  \n\t"
+        "sub.s   %[temp1],      %[temp9],      %[temp1]                  \n\t"
+        "madd.s  %[temp2],      %[temp2],      %[ag0],   %[temp0]        \n\t"
+        "lwc1    %[temp6],      8(%[p_Q_fract])                          \n\t"
+        "madd.s  %[temp3],      %[temp3],      %[ag0],   %[temp1]        \n\t"
+        "mul.s   %[temp8],      %[temp5],      %[temp7]                  \n\t"
+        "mul.s   %[temp9],      %[temp4],      %[temp7]                  \n\t"
+        "lwc1    %[temp7],      20(%[p_Q_fract])                         \n\t"
+        "msub.s  %[temp8],      %[temp8],      %[temp4], %[temp6]        \n\t"
+        "swc1    %[temp2],      40(%[p_ap_delay])                        \n\t"
+        "mul.s   %[temp2],      %[ag1],        %[temp0]                  \n\t"
+        "swc1    %[temp3],      44(%[p_ap_delay])                        \n\t"
+        "mul.s   %[temp3],      %[ag1],        %[temp1]                  \n\t"
+        "lwc1    %[temp4],      592(%[p_ap_delay])                       \n\t"
+        "madd.s  %[temp9],      %[temp9],      %[temp5], %[temp6]        \n\t"
+        "lwc1    %[temp5],      596(%[p_ap_delay])                       \n\t"
+        "sub.s   %[temp2],      %[temp8],      %[temp2]                  \n\t"
+        "sub.s   %[temp3],      %[temp9],      %[temp3]                  \n\t"
+        "lwc1    %[temp6],      16(%[p_Q_fract])                         \n\t"
+        "madd.s  %[temp0],      %[temp0],      %[ag1],   %[temp2]        \n\t"
+        "madd.s  %[temp1],      %[temp1],      %[ag1],   %[temp3]        \n\t"
+        "mul.s   %[temp8],      %[temp5],      %[temp7]                  \n\t"
+        "mul.s   %[temp9],      %[temp4],      %[temp7]                  \n\t"
+        "msub.s  %[temp8],      %[temp8],      %[temp4], %[temp6]        \n\t"
+        "madd.s  %[temp9],      %[temp9],      %[temp5], %[temp6]        \n\t"
+        "swc1    %[temp0],      336(%[p_ap_delay])                       \n\t"
+        "mul.s   %[temp0],      %[ag2],        %[temp2]                  \n\t"
+        "swc1    %[temp1],      340(%[p_ap_delay])                       \n\t"
+        "mul.s   %[temp1],      %[ag2],        %[temp3]                  \n\t"
+        "lwc1    %[temp4],      0(%[p_t_gain])                           \n\t"
+        "sub.s   %[temp0],      %[temp8],      %[temp0]                  \n\t"
+        PTR_ADDIU "%[p_ap_delay], %[p_ap_delay], 8                       \n\t"
+        "sub.s   %[temp1],      %[temp9],      %[temp1]                  \n\t"
+        PTR_ADDIU "%[p_t_gain], %[p_t_gain],   4                         \n\t"
+        "madd.s  %[temp2],      %[temp2],      %[ag2],   %[temp0]        \n\t"
+        PTR_ADDIU "%[p_delay],  %[p_delay],    8                         \n\t"
+        "madd.s  %[temp3],      %[temp3],      %[ag2],   %[temp1]        \n\t"
+        PTR_ADDIU "%[p_out],    %[p_out],      8                         \n\t"
+        "mul.s   %[temp5],      %[temp4],      %[temp0]                  \n\t"
+        "mul.s   %[temp6],      %[temp4],      %[temp1]                  \n\t"
+        /* p_ap_delay was already advanced by 8, hence 624 instead of 632 */
+        "swc1    %[temp2],      624(%[p_ap_delay])                       \n\t"
+        "swc1    %[temp3],      628(%[p_ap_delay])                       \n\t"
+        "swc1    %[temp5],      -8(%[p_out])                             \n\t"
+        "bne     %[p_delay],    %[p_delay_end],1b                        \n\t"
+        /* branch delay slot: the imaginary output is stored here only; the
+         * identical store that used to precede the branch was redundant */
+        " swc1   %[temp6],      -4(%[p_out])                             \n\t"
+        ".set    pop                                                     \n\t"
+
+        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+          [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+          [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
+          [temp9]"=&f"(temp9), [p_delay]"+r"(p_delay), [p_ap_delay]"+r"(p_ap_delay),
+          [p_Q_fract]"+r"(p_Q_fract), [p_t_gain]"+r"(p_t_gain), [p_out]"+r"(p_out),
+          [ag0]"=&f"(ag0), [ag1]"=&f"(ag1), [ag2]"=&f"(ag2)
+        : [phi_fract0]"f"(phi_fract0), [phi_fract1]"f"(phi_fract1),
+          [p_delay_end]"r"(p_delay_end), [g_decay_slope]"f"(g_decay_slope)
+        : "memory"
+    );
+}
+
+/**
+ * Mix the l and r complex sample streams in place through a 2x2 real matrix
+ * whose coefficients are advanced by a fixed step before every sample.
+ *
+ * For each of the len samples, h0..h3 are first incremented by hs0..hs3 and
+ * then, for the real and the imaginary part alike,
+ *     l' = h0 * l + h2 * r
+ *     r' = h1 * l + h3 * r
+ *
+ * Only row 0 of h and of h_step is read, and the stepped coefficients are
+ * kept in locals: nothing is written back to h.
+ *
+ * @param l       len {re, im} pairs, overwritten with l'
+ * @param r       len {re, im} pairs, overwritten with r'
+ * @param h       mixing coefficients; h[0][0..3] are the start values
+ * @param h_step  per-sample increments; h_step[0][0..3] are used
+ * @param len     number of samples
+ *
+ * NOTE(review): the loop is bottom-tested on pointer equality, so it appears
+ * to require len >= 1 -- confirm that no caller passes len <= 0.
+ */
+static void ps_stereo_interpolate_mips(float (*l)[2], float (*r)[2],
+                                    float h[2][4], float h_step[2][4],
+                                    int len)
+{
+    float h0 = h[0][0];
+    float h1 = h[0][1];
+    float h2 = h[0][2];
+    float h3 = h[0][3];
+    float hs0 = h_step[0][0];
+    float hs1 = h_step[0][1];
+    float hs2 = h_step[0][2];
+    float hs3 = h_step[0][3];
+    float temp0, temp1, temp2, temp3;
+    float l_re, l_im, r_re, r_im;
+
+    /* two floats per sample; the loop stops when l reaches this address */
+    float *l_end = ((float *)l + (len << 1));
+
+    __asm__ volatile(
+        ".set    push                                     \n\t"
+        ".set    noreorder                                \n\t"
+    "1:                                                   \n\t"
+        /* coefficient updates are interleaved with the sample loads */
+        "add.s   %[h0],     %[h0],     %[hs0]             \n\t"
+        "lwc1    %[l_re],   0(%[l])                       \n\t"
+        "add.s   %[h1],     %[h1],     %[hs1]             \n\t"
+        "lwc1    %[r_re],   0(%[r])                       \n\t"
+        "add.s   %[h2],     %[h2],     %[hs2]             \n\t"
+        "lwc1    %[l_im],   4(%[l])                       \n\t"
+        "add.s   %[h3],     %[h3],     %[hs3]             \n\t"
+        "lwc1    %[r_im],   4(%[r])                       \n\t"
+        "mul.s   %[temp0],  %[h0],     %[l_re]            \n\t"
+        PTR_ADDIU "%[l],    %[l],      8                  \n\t"
+        "mul.s   %[temp2],  %[h1],     %[l_re]            \n\t"
+        PTR_ADDIU "%[r],    %[r],      8                  \n\t"
+        "madd.s  %[temp0],  %[temp0],  %[h2],   %[r_re]   \n\t"
+        "madd.s  %[temp2],  %[temp2],  %[h3],   %[r_re]   \n\t"
+        "mul.s   %[temp1],  %[h0],     %[l_im]            \n\t"
+        "mul.s   %[temp3],  %[h1],     %[l_im]            \n\t"
+        "madd.s  %[temp1],  %[temp1],  %[h2],   %[r_im]   \n\t"
+        "madd.s  %[temp3],  %[temp3],  %[h3],   %[r_im]   \n\t"
+        /* l and r were already advanced, hence the negative offsets */
+        "swc1    %[temp0],  -8(%[l])                      \n\t"
+        "swc1    %[temp2],  -8(%[r])                      \n\t"
+        "swc1    %[temp1],  -4(%[l])                      \n\t"
+        "bne     %[l],      %[l_end],  1b                 \n\t"
+        /* branch delay slot: last store of the sample */
+        " swc1   %[temp3],  -4(%[r])                      \n\t"
+        ".set    pop                                      \n\t"
+
+        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1),
+          [temp2]"=&f"(temp2), [temp3]"=&f"(temp3),
+          [h0]"+f"(h0), [h1]"+f"(h1), [h2]"+f"(h2),
+          [h3]"+f"(h3), [l]"+r"(l), [r]"+r"(r),
+          [l_re]"=&f"(l_re), [l_im]"=&f"(l_im),
+          [r_re]"=&f"(r_re), [r_im]"=&f"(r_im)
+        : [hs0]"f"(hs0), [hs1]"f"(hs1), [hs2]"f"(hs2),
+          [hs3]"f"(hs3), [l_end]"r"(l_end)
+        : "memory"
+    );
+}
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+
+/**
+ * Point the entries of a PSDSPContext at the MIPS routines of this file.
+ *
+ * Which entries are replaced depends on the build configuration; entries
+ * that are not assigned here are left untouched.
+ *
+ * @param s  context whose function pointers are updated
+ */
+void ff_psdsp_init_mips(PSDSPContext *s)
+{
+#if HAVE_INLINE_ASM
+#if HAVE_MIPSFPU && !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+    /* Routines that additionally need an FPU and a pre-R6 core. */
+    s->stereo_interpolate[0]  = ps_stereo_interpolate_mips;
+    s->decorrelate            = ps_decorrelate_mips;
+    s->mul_pair_single        = ps_mul_pair_single_mips;
+    s->add_squares            = ps_add_squares_mips;
+#endif /* HAVE_MIPSFPU && !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+    /* Routines installed whenever inline assembly is available. */
+    s->hybrid_synthesis_deint = ps_hybrid_synthesis_deint_mips;
+    s->hybrid_analysis_ileave = ps_hybrid_analysis_ileave_mips;
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/libavcodec/mips/aacpsy_mips.h b/libavcodec/mips/aacpsy_mips.h
new file mode 100644
index 0000000..a1fe5cc
--- /dev/null
+++ b/libavcodec/mips/aacpsy_mips.h
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Bojan Zivkovic   (bojan@mips.com)
+ *
+ * AAC encoder psychoacoustic model routines optimized
+ * for MIPS floating-point architecture
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/aacpsy.c
+ */
+
+#ifndef AVCODEC_MIPS_AACPSY_MIPS_H
+#define AVCODEC_MIPS_AACPSY_MIPS_H
+
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM && HAVE_MIPSFPU && ( PSY_LAME_FIR_LEN == 21 )
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+/**
+ * Compute energy, threshold and nz_lines for every band of every window.
+ *
+ * For window w (bands are stored 16 per window in pch->band) and band g:
+ *   - energy      = sum of squared coefficients of the band,
+ *   - form_factor = sum of sqrt(|coefficient|) over the band,
+ *   - thr         = energy * 0.001258925,
+ *   - nz_lines    = form_factor * (band_size / energy)^(1/4).
+ * Bands that start at or beyond cutoff inside their window are not
+ * accumulated and keep an energy of 0.
+ *
+ * A band with zero energy gets nz_lines = 0: the division is skipped, since
+ * band_size / 0 would otherwise turn into +inf and then 0 * inf = NaN.
+ *
+ * @param wi          window info; wi->num_windows windows are processed
+ * @param num_bands   number of bands per window
+ * @param pch         channel state whose band[] entries are filled in
+ * @param band_sizes  number of coefficients in each band
+ * @param coefs       coefficients, bands of all windows laid out back to back
+ * @param cutoff      first coefficient index, relative to the window start,
+ *                    that is ignored
+ *
+ * NOTE(review): the inner loop consumes four coefficients per pass, so it
+ * presumably relies on every band size being a multiple of 4 -- confirm.
+ */
+static void calc_thr_3gpp_mips(const FFPsyWindowInfo *wi, const int num_bands,
+                               AacPsyChannel *pch, const uint8_t *band_sizes,
+                               const float *coefs, const int cutoff)
+{
+    int i, w, g;
+    int start = 0, wstart = 0;
+    for (w = 0; w < wi->num_windows*16; w += 16) {
+        /* wstart restarts per window, start keeps running across windows */
+        wstart = 0;
+        for (g = 0; g < num_bands; g++) {
+            AacPsyBand *band = &pch->band[w+g];
+
+            float form_factor = 0.0f;
+            float Temp;
+            band->energy = 0.0f;
+            if (wstart < cutoff) {
+                for (i = 0; i < band_sizes[g]; i+=4) {
+                    float a, b, c, d;
+                    float ax, bx, cx, dx;
+                    float *cf = (float *)&coefs[start+i];
+
+                    /* four coefficients: e += x * x, f += sqrt(|x|) */
+                    __asm__ volatile (
+                        "lwc1   %[a],   0(%[cf])                \n\t"
+                        "lwc1   %[b],   4(%[cf])                \n\t"
+                        "lwc1   %[c],   8(%[cf])                \n\t"
+                        "lwc1   %[d],   12(%[cf])               \n\t"
+                        "abs.s  %[a],   %[a]                    \n\t"
+                        "abs.s  %[b],   %[b]                    \n\t"
+                        "abs.s  %[c],   %[c]                    \n\t"
+                        "abs.s  %[d],   %[d]                    \n\t"
+                        "sqrt.s %[ax],  %[a]                    \n\t"
+                        "sqrt.s %[bx],  %[b]                    \n\t"
+                        "sqrt.s %[cx],  %[c]                    \n\t"
+                        "sqrt.s %[dx],  %[d]                    \n\t"
+                        "madd.s %[e],   %[e],   %[a],   %[a]    \n\t"
+                        "madd.s %[e],   %[e],   %[b],   %[b]    \n\t"
+                        "madd.s %[e],   %[e],   %[c],   %[c]    \n\t"
+                        "madd.s %[e],   %[e],   %[d],   %[d]    \n\t"
+                        "add.s  %[f],   %[f],   %[ax]           \n\t"
+                        "add.s  %[f],   %[f],   %[bx]           \n\t"
+                        "add.s  %[f],   %[f],   %[cx]           \n\t"
+                        "add.s  %[f],   %[f],   %[dx]           \n\t"
+
+                        : [a]"=&f"(a), [b]"=&f"(b),
+                          [c]"=&f"(c), [d]"=&f"(d),
+                          [e]"+f"(band->energy), [f]"+f"(form_factor),
+                          [ax]"=&f"(ax), [bx]"=&f"(bx),
+                          [cx]"=&f"(cx), [dx]"=&f"(dx)
+                        : [cf]"r"(cf)
+                        : "memory"
+                    );
+                }
+            }
+
+            /* energy is exactly 0 for skipped or silent bands; dividing by it
+             * would poison nz_lines with NaN */
+            Temp = band->energy > 0.0f ? sqrtf((float)band_sizes[g] / band->energy)
+                                       : 0.0f;
+            band->thr      = band->energy * 0.001258925f;
+            band->nz_lines = form_factor * sqrtf(Temp);
+            start += band_sizes[g];
+            wstart += band_sizes[g];
+        }
+    }
+}
+
+/**
+ * Symmetric FIR filter over firbuf producing AAC_BLOCK_SIZE_LONG samples,
+ * four per loop pass.
+ *
+ * With c0..c4 = psy_fir_coeffs[1], [3], [5], [7], [9], output sample i is
+ *     hpfsmpl[i] = 32768 * ( firbuf[i + 10]
+ *                          + c0 * (firbuf[i + 1] + firbuf[i + 20])
+ *                          + c1 * (firbuf[i + 3] + firbuf[i + 18])
+ *                          + c2 * (firbuf[i + 5] + firbuf[i + 16])
+ *                          + c3 * (firbuf[i + 7] + firbuf[i + 14])
+ *                          + c4 * (firbuf[i + 9] + firbuf[i + 12]) )
+ * The even-indexed entries of psy_fir_coeffs are never read.
+ *
+ * @param firbuf          input samples; the last pass reads up to index
+ *                        AAC_BLOCK_SIZE_LONG + 19
+ * @param hpfsmpl         output, AAC_BLOCK_SIZE_LONG samples
+ * @param psy_fir_coeffs  filter coefficients (odd indices 1..9 are used)
+ *
+ * NOTE(review): the loop advances four samples at a time and stops on
+ * pointer equality, so AAC_BLOCK_SIZE_LONG is presumably a non-zero multiple
+ * of 4 -- confirm.
+ */
+static void psy_hp_filter_mips(const float *firbuf, float *hpfsmpl, const float * psy_fir_coeffs)
+{
+    /* sum1..sum4 accumulate output samples i, i+1, i+3 and i+2 respectively */
+    float sum1, sum2, sum3, sum4;
+    float *fb = (float*)firbuf;
+    float *fb_end = fb + AAC_BLOCK_SIZE_LONG;
+    float *hp = hpfsmpl;
+
+    float coeff0 = psy_fir_coeffs[1];
+    float coeff1 = psy_fir_coeffs[3];
+    float coeff2 = psy_fir_coeffs[5];
+    float coeff3 = psy_fir_coeffs[7];
+    float coeff4 = psy_fir_coeffs[9];
+
+    __asm__ volatile (
+        ".set push                                          \n\t"
+        ".set noreorder                                     \n\t"
+
+        /* output scale factor, loaded once */
+        "li.s   $f12,       32768                           \n\t"
+        "1:                                                 \n\t"
+        /* $f0..$f11 are reused for several taps within one pass; the
+         * register roles change as the loads below overwrite them */
+        "lwc1   $f0,        40(%[fb])                       \n\t"
+        "lwc1   $f1,        4(%[fb])                        \n\t"
+        "lwc1   $f2,        80(%[fb])                       \n\t"
+        "lwc1   $f3,        44(%[fb])                       \n\t"
+        "lwc1   $f4,        8(%[fb])                        \n\t"
+        "madd.s %[sum1],    $f0,        $f1,    %[coeff0]   \n\t"
+        "lwc1   $f5,        84(%[fb])                       \n\t"
+        "lwc1   $f6,        48(%[fb])                       \n\t"
+        "madd.s %[sum2],    $f3,        $f4,    %[coeff0]   \n\t"
+        "lwc1   $f7,        12(%[fb])                       \n\t"
+        "madd.s %[sum1],    %[sum1],    $f2,    %[coeff0]   \n\t"
+        "lwc1   $f8,        88(%[fb])                       \n\t"
+        "lwc1   $f9,        52(%[fb])                       \n\t"
+        "madd.s %[sum2],    %[sum2],    $f5,    %[coeff0]   \n\t"
+        "madd.s %[sum3],    $f6,        $f7,    %[coeff0]   \n\t"
+        "lwc1   $f10,       16(%[fb])                       \n\t"
+        "lwc1   $f11,       92(%[fb])                       \n\t"
+        "madd.s %[sum1],    %[sum1],    $f7,    %[coeff1]   \n\t"
+        "lwc1   $f1,        72(%[fb])                       \n\t"
+        "madd.s %[sum3],    %[sum3],    $f8,    %[coeff0]   \n\t"
+        "madd.s %[sum4],    $f9,        $f10,   %[coeff0]   \n\t"
+        "madd.s %[sum2],    %[sum2],    $f10,   %[coeff1]   \n\t"
+        "madd.s %[sum1],    %[sum1],    $f1,    %[coeff1]   \n\t"
+        "lwc1   $f4,        76(%[fb])                       \n\t"
+        "lwc1   $f8,        20(%[fb])                       \n\t"
+        "madd.s %[sum4],    %[sum4],    $f11,   %[coeff0]   \n\t"
+        "lwc1   $f11,       24(%[fb])                       \n\t"
+        "madd.s %[sum2],    %[sum2],    $f4,    %[coeff1]   \n\t"
+        "madd.s %[sum1],    %[sum1],    $f8,    %[coeff2]   \n\t"
+        "madd.s %[sum3],    %[sum3],    $f8,    %[coeff1]   \n\t"
+        "madd.s %[sum4],    %[sum4],    $f11,   %[coeff1]   \n\t"
+        "lwc1   $f7,        64(%[fb])                       \n\t"
+        "madd.s %[sum2],    %[sum2],    $f11,   %[coeff2]   \n\t"
+        "lwc1   $f10,       68(%[fb])                       \n\t"
+        "madd.s %[sum3],    %[sum3],    $f2,    %[coeff1]   \n\t"
+        "madd.s %[sum4],    %[sum4],    $f5,    %[coeff1]   \n\t"
+        "madd.s %[sum1],    %[sum1],    $f7,    %[coeff2]   \n\t"
+        "madd.s %[sum2],    %[sum2],    $f10,   %[coeff2]   \n\t"
+        "lwc1   $f2,        28(%[fb])                       \n\t"
+        "lwc1   $f5,        32(%[fb])                       \n\t"
+        "lwc1   $f8,        56(%[fb])                       \n\t"
+        "lwc1   $f11,       60(%[fb])                       \n\t"
+        "madd.s %[sum3],    %[sum3],    $f2,    %[coeff2]   \n\t"
+        "madd.s %[sum4],    %[sum4],    $f5,    %[coeff2]   \n\t"
+        "madd.s %[sum1],    %[sum1],    $f2,    %[coeff3]   \n\t"
+        "madd.s %[sum2],    %[sum2],    $f5,    %[coeff3]   \n\t"
+        "madd.s %[sum3],    %[sum3],    $f1,    %[coeff2]   \n\t"
+        "madd.s %[sum4],    %[sum4],    $f4,    %[coeff2]   \n\t"
+        "madd.s %[sum1],    %[sum1],    $f8,    %[coeff3]   \n\t"
+        "madd.s %[sum2],    %[sum2],    $f11,   %[coeff3]   \n\t"
+        "lwc1   $f1,        36(%[fb])                       \n\t"
+        /* fb moves on here; every remaining tap is already in a register */
+        PTR_ADDIU "%[fb],   %[fb],      16                  \n\t"
+        "madd.s %[sum4],    %[sum4],    $f0,    %[coeff3]   \n\t"
+        "madd.s %[sum3],    %[sum3],    $f1,    %[coeff3]   \n\t"
+        "madd.s %[sum1],    %[sum1],    $f1,    %[coeff4]   \n\t"
+        "madd.s %[sum2],    %[sum2],    $f0,    %[coeff4]   \n\t"
+        "madd.s %[sum4],    %[sum4],    $f10,   %[coeff3]   \n\t"
+        "madd.s %[sum3],    %[sum3],    $f7,    %[coeff3]   \n\t"
+        "madd.s %[sum1],    %[sum1],    $f6,    %[coeff4]   \n\t"
+        "madd.s %[sum2],    %[sum2],    $f9,    %[coeff4]   \n\t"
+        "madd.s %[sum4],    %[sum4],    $f6,    %[coeff4]   \n\t"
+        "madd.s %[sum3],    %[sum3],    $f3,    %[coeff4]   \n\t"
+        "mul.s  %[sum1],    %[sum1],    $f12                \n\t"
+        "mul.s  %[sum2],    %[sum2],    $f12                \n\t"
+        "madd.s %[sum4],    %[sum4],    $f11,   %[coeff4]   \n\t"
+        "madd.s %[sum3],    %[sum3],    $f8,    %[coeff4]   \n\t"
+        "swc1   %[sum1],    0(%[hp])                        \n\t"
+        "swc1   %[sum2],    4(%[hp])                        \n\t"
+        "mul.s  %[sum4],    %[sum4],    $f12                \n\t"
+        "mul.s  %[sum3],    %[sum3],    $f12                \n\t"
+        /* note the swapped slots: sum4 is output i+3, sum3 is output i+2 */
+        "swc1   %[sum4],    12(%[hp])                       \n\t"
+        "swc1   %[sum3],    8(%[hp])                        \n\t"
+        "bne    %[fb],      %[fb_end],  1b                  \n\t"
+        /* branch delay slot: hp is not part of the loop test, so it can
+         * safely be advanced here */
+        PTR_ADDIU "%[hp],   %[hp],      16                  \n\t"
+
+        ".set pop                                           \n\t"
+
+        : [sum1]"=&f"(sum1), [sum2]"=&f"(sum2),
+          [sum3]"=&f"(sum3), [sum4]"=&f"(sum4),
+          [fb]"+r"(fb), [hp]"+r"(hp)
+        : [coeff0]"f"(coeff0), [coeff1]"f"(coeff1),
+          [coeff2]"f"(coeff2), [coeff3]"f"(coeff3),
+          [coeff4]"f"(coeff4), [fb_end]"r"(fb_end)
+        : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6",
+          "$f7", "$f8", "$f9", "$f10", "$f11", "$f12",
+          "memory"
+    );
+}
+
+/* Map the generic routine names onto the MIPS implementations defined in
+ * this header. NOTE(review): presumably picked up by the file that includes
+ * this header (see the @file reference) -- confirm. */
+#define calc_thr_3gpp calc_thr_3gpp_mips
+#define psy_hp_filter psy_hp_filter_mips
+
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM && HAVE_MIPSFPU && ( PSY_LAME_FIR_LEN == 21 ) */
+#endif /* AVCODEC_MIPS_AACPSY_MIPS_H */
diff --git a/libavcodec/mips/aacsbr_mips.c b/libavcodec/mips/aacsbr_mips.c
new file mode 100644
index 0000000..56aa4e8
--- /dev/null
+++ b/libavcodec/mips/aacsbr_mips.c
@@ -0,0 +1,623 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Djordje Pesut   (djordje@mips.com)
+ *           Mirjana Vulin   (mvulin@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/aacsbr.c
+ */
+
+#include "libavcodec/aac.h"
+#include "libavcodec/aacsbr.h"
+#include "libavutil/mips/asmdefs.h"
+
+#define ENVELOPE_ADJUSTMENT_OFFSET 2
+
+#if HAVE_INLINE_ASM
+/**
+ * Rebuild X_low from two W buffers.
+ *
+ * X_low is first cleared completely. Then
+ *   - for k < sbr->kx[1]: X_low[k][8 + i] = W[buf_idx][i][k],      i = 0..31
+ *   - for k < sbr->kx[0]: X_low[k][i]     = W[1 - buf_idx][24 + i][k], i = 0..7
+ * Values are moved as raw 32-bit words through integer registers.
+ *
+ * @param ac       not referenced by this implementation
+ * @param sbr      supplies the band counts kx[0] and kx[1]
+ * @param X_low    destination, 32 rows of 40 {re, im} pairs
+ * @param W        source buffers, indexed [buffer][slot][band][re/im]
+ * @param buf_idx  selects the W buffer used for slots 8..39; the other
+ *                 buffer (1 - buf_idx) feeds slots 0..7
+ * @return always 0
+ */
+static int sbr_lf_gen_mips(AACContext *ac, SpectralBandReplication *sbr,
+                      float X_low[32][40][2], const float W[2][32][32][2],
+                      int buf_idx)
+{
+    int i, k;
+    int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    float *p_x_low = &X_low[0][8][0];
+    float *p_w = (float*)&W[buf_idx][0][0][0];
+    float *p_x1_low = &X_low[0][0][0];
+    float *p_w1 = (float*)&W[1-buf_idx][24][0][0];
+
+    /* 32 * 40 * 2 = 2560 floats: one past the end of X_low */
+    float *loop_end=p_x1_low + 2560;
+
+    /* Clear all of X_low, eight words per pass (loop unrolled 8 times).
+     * There is no .set noreorder here, so presumably the assembler fills the
+     * branch delay slot itself and the final PTR_ADDIU runs once after the
+     * loop, rewinding p_x1_low by 10240 bytes (2560 floats) to the start. */
+    __asm__ volatile (
+    "1:                                                 \n\t"
+        "sw     $0,            0(%[p_x1_low])           \n\t"
+        "sw     $0,            4(%[p_x1_low])           \n\t"
+        "sw     $0,            8(%[p_x1_low])           \n\t"
+        "sw     $0,            12(%[p_x1_low])          \n\t"
+        "sw     $0,            16(%[p_x1_low])          \n\t"
+        "sw     $0,            20(%[p_x1_low])          \n\t"
+        "sw     $0,            24(%[p_x1_low])          \n\t"
+        "sw     $0,            28(%[p_x1_low])          \n\t"
+        PTR_ADDIU "%[p_x1_low],%[p_x1_low],      32     \n\t"
+        "bne    %[p_x1_low],   %[loop_end],      1b     \n\t"
+        PTR_ADDIU "%[p_x1_low],%[p_x1_low],      -10240 \n\t"
+
+        : [p_x1_low]"+r"(p_x1_low)
+        : [loop_end]"r"(loop_end)
+        : "memory"
+    );
+
+    for (k = 0; k < sbr->kx[1]; k++) {
+        for (i = 0; i < 32; i+=4) {
+            /* Copy four consecutive slots of band k (loop unrolled 4 times).
+             * Consecutive slots of W are 32 * 2 floats = 256 bytes apart. */
+            __asm__ volatile (
+                "lw     %[temp0],   0(%[p_w])               \n\t"
+                "lw     %[temp1],   4(%[p_w])               \n\t"
+                "lw     %[temp2],   256(%[p_w])             \n\t"
+                "lw     %[temp3],   260(%[p_w])             \n\t"
+                "lw     %[temp4],   512(%[p_w])             \n\t"
+                "lw     %[temp5],   516(%[p_w])             \n\t"
+                "lw     %[temp6],   768(%[p_w])             \n\t"
+                "lw     %[temp7],   772(%[p_w])             \n\t"
+                "sw     %[temp0],   0(%[p_x_low])           \n\t"
+                "sw     %[temp1],   4(%[p_x_low])           \n\t"
+                "sw     %[temp2],   8(%[p_x_low])           \n\t"
+                "sw     %[temp3],   12(%[p_x_low])          \n\t"
+                "sw     %[temp4],   16(%[p_x_low])          \n\t"
+                "sw     %[temp5],   20(%[p_x_low])          \n\t"
+                "sw     %[temp6],   24(%[p_x_low])          \n\t"
+                "sw     %[temp7],   28(%[p_x_low])          \n\t"
+                PTR_ADDIU "%[p_x_low], %[p_x_low],  32      \n\t"
+                PTR_ADDIU "%[p_w],     %[p_w],      1024    \n\t"
+
+                : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+                  [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+                  [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+                  [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
+                  [p_w]"+r"(p_w), [p_x_low]"+r"(p_x_low)
+                :
+                : "memory"
+            );
+        }
+        /* 64 floats were written; skip 16 more to reach slot 8 of the next
+         * 80-float row */
+        p_x_low += 16;
+        /* 2048 floats were consumed; net +2 floats selects the next band */
+        p_w -= 2046;
+    }
+
+    for (k = 0; k < sbr->kx[0]; k++) {
+        for (i = 0; i < 2; i++) {
+
+            /* Same four-slot copy as above (loop unrolled 4 times), filling
+             * slots 0..7 of row k from slots 24..31 of the other W buffer. */
+            __asm__ volatile (
+                "lw     %[temp0],    0(%[p_w1])             \n\t"
+                "lw     %[temp1],    4(%[p_w1])             \n\t"
+                "lw     %[temp2],    256(%[p_w1])           \n\t"
+                "lw     %[temp3],    260(%[p_w1])           \n\t"
+                "lw     %[temp4],    512(%[p_w1])           \n\t"
+                "lw     %[temp5],    516(%[p_w1])           \n\t"
+                "lw     %[temp6],    768(%[p_w1])           \n\t"
+                "lw     %[temp7],    772(%[p_w1])           \n\t"
+                "sw     %[temp0],    0(%[p_x1_low])         \n\t"
+                "sw     %[temp1],    4(%[p_x1_low])         \n\t"
+                "sw     %[temp2],    8(%[p_x1_low])         \n\t"
+                "sw     %[temp3],    12(%[p_x1_low])        \n\t"
+                "sw     %[temp4],    16(%[p_x1_low])        \n\t"
+                "sw     %[temp5],    20(%[p_x1_low])        \n\t"
+                "sw     %[temp6],    24(%[p_x1_low])        \n\t"
+                "sw     %[temp7],    28(%[p_x1_low])        \n\t"
+                PTR_ADDIU "%[p_x1_low], %[p_x1_low], 32     \n\t"
+                PTR_ADDIU "%[p_w1],     %[p_w1],     1024   \n\t"
+
+                : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+                  [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+                  [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+                  [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
+                  [p_w1]"+r"(p_w1), [p_x1_low]"+r"(p_x1_low)
+                :
+                : "memory"
+            );
+        }
+        /* 16 floats were written; 64 more reach the start of the next row */
+        p_x1_low += 64;
+        /* 512 floats were consumed; net +2 floats selects the next band */
+        p_w1 -= 510;
+    }
+    return 0;
+}
+
+static int sbr_x_gen_mips(SpectralBandReplication *sbr, float X[2][38][64],
+                     const float Y0[38][64][2], const float Y1[38][64][2],
+                     const float X_low[32][40][2], int ch)
+{   /*
+     * MIPS inline-assembly routine that fills X from X_low, Y0 and Y1.
+     *
+     * X is zeroed first. Then, walking one column k of X at a time:
+     *  - rows [0, i_Temp) come from X_low[k][row + 2] for k < kx[0] and
+     *    from Y0[row + 32][k] for the following m[0] columns;
+     *  - rows [i_Temp, 38) come from X_low[k][row + 2] for k < kx[1];
+     *  - rows [i_Temp, 32) come from Y1[row][k] for the following m[1]
+     *    columns.
+     * In every copy the first float of the source pair is stored in X[0]
+     * and the second one 9728 bytes (38 * 64 floats) further on, i.e. at
+     * the same row/column of X[1].
+     *
+     * The copy loops are bottom-tested in assembly, so each one runs its
+     * body at least once. Always returns 0.
+     */
+    int k, i;
+    const int i_f = 32;
+    int temp0, temp1, temp2, temp3;
+    const float *X_low1, *Y01, *Y11;
+    float *x1=&X[0][0][0];
+    float *j=x1+4864; /* one past the end of X: 2 * 38 * 64 floats */
+    const int i_Temp = FFMAX(2*sbr->data[ch].t_env_num_env_old - i_f, 0);
+
+    /* zero the whole of X, eight words per iteration (loop unrolled 8 times) */
+    __asm__ volatile (
+    "1:                                       \n\t"
+        "sw     $0,      0(%[x1])             \n\t" /* $0 is the hard-wired zero register */
+        "sw     $0,      4(%[x1])             \n\t"
+        "sw     $0,      8(%[x1])             \n\t"
+        "sw     $0,      12(%[x1])            \n\t"
+        "sw     $0,      16(%[x1])            \n\t"
+        "sw     $0,      20(%[x1])            \n\t"
+        "sw     $0,      24(%[x1])            \n\t"
+        "sw     $0,      28(%[x1])            \n\t"
+        PTR_ADDIU "%[x1],%[x1],      32       \n\t"
+        "bne    %[x1],   %[j],       1b       \n\t" /* repeat until x1 reaches the end marker j */
+        PTR_ADDIU "%[x1],%[x1],      -19456   \n\t" /* 19456 bytes == 4864 floats: rewinds x1 to &X[0][0][0]. NOTE(review): no ".set noreorder" is used here, so this presumably executes once after the loop, not in the branch delay slot -- confirm */
+
+        : [x1]"+r"(x1)
+        : [j]"r"(j)
+        : "memory"
+    );
+
+    if (i_Temp != 0) { /* the two loops below are bottom-tested and must not run with zero rows */
+
+        X_low1=&X_low[0][2][0]; /* source starts at second-index 2 of band 0 */
+
+        for (k = 0; k < sbr->kx[0]; k++) {
+
+            __asm__ volatile (
+                "move    %[i],        $zero                  \n\t"
+            "2:                                              \n\t"
+                "lw      %[temp0],    0(%[X_low1])           \n\t" /* first float of the source pair  */
+                "lw      %[temp1],    4(%[X_low1])           \n\t" /* second float of the source pair */
+                "sw      %[temp0],    0(%[x1])               \n\t" /* -> X[0][i][k] */
+                "sw      %[temp1],    9728(%[x1])            \n\t" /* -> X[1][i][k] */
+                PTR_ADDIU "%[x1],     %[x1],         256     \n\t" /* 256 bytes == 64 floats: next row of X */
+                PTR_ADDIU "%[X_low1], %[X_low1],     8       \n\t" /* next pair within the same band */
+                "addiu   %[i],        %[i],          1       \n\t"
+                "bne     %[i],        %[i_Temp],     2b      \n\t"
+
+                : [x1]"+r"(x1), [X_low1]"+r"(X_low1), [i]"=&r"(i),
+                  [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
+                : [i_Temp]"r"(i_Temp)
+                : "memory"
+            );
+            x1-=(i_Temp<<6)-1; /* back up i_Temp rows (64 floats each), then step to the next column */
+            X_low1-=(i_Temp<<1)-80; /* back up i_Temp pairs, then step 80 floats (40 * 2) to the next band */
+        }
+
+        x1=&X[0][0][k];
+        Y01=(float*)&Y0[32][k][0]; /* Y0 is read starting at first-index 32 (the value of i_f) */
+
+        for (; k < sbr->kx[0] + sbr->m[0]; k++) {
+            __asm__ volatile (
+                "move    %[i],       $zero               \n\t"
+            "3:                                          \n\t"
+                "lw      %[temp0],   0(%[Y01])           \n\t"
+                "lw      %[temp1],   4(%[Y01])           \n\t"
+                "sw      %[temp0],   0(%[x1])            \n\t" /* -> X[0][i][k] */
+                "sw      %[temp1],   9728(%[x1])         \n\t" /* -> X[1][i][k] */
+                PTR_ADDIU "%[x1],    %[x1],      256     \n\t" /* next row of X */
+                PTR_ADDIU "%[Y01],   %[Y01],     512     \n\t" /* 512 bytes == 128 floats (64 * 2): next first-index of Y0 */
+                "addiu   %[i],       %[i],       1       \n\t"
+                "bne     %[i],       %[i_Temp],  3b      \n\t"
+
+                : [x1]"+r"(x1), [Y01]"+r"(Y01), [i]"=&r"(i),
+                  [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
+                : [i_Temp]"r"(i_Temp)
+                : "memory"
+            );
+            x1 -=(i_Temp<<6)-1; /* back up i_Temp rows, then step to the next column */
+            Y01 -=(i_Temp<<7)-2; /* back up i_Temp * 128 floats, then step one pair (next k) */
+        }
+    }
+
+    x1=&X[0][i_Temp][0];
+    X_low1=&X_low[0][i_Temp+2][0];
+    temp3=38; /* loop bound for the X_low copy below */
+
+    for (k = 0; k < sbr->kx[1]; k++) {
+
+        __asm__ volatile (
+            "move    %[i],       %[i_Temp]              \n\t" /* rows start at i_Temp */
+        "4:                                             \n\t"
+            "lw      %[temp0],   0(%[X_low1])           \n\t"
+            "lw      %[temp1],   4(%[X_low1])           \n\t"
+            "sw      %[temp0],   0(%[x1])               \n\t" /* -> X[0][i][k] */
+            "sw      %[temp1],   9728(%[x1])            \n\t" /* -> X[1][i][k] */
+            PTR_ADDIU "%[x1],    %[x1],         256     \n\t"
+            PTR_ADDIU "%[X_low1],%[X_low1],     8       \n\t"
+            "addiu   %[i],       %[i],          1       \n\t"
+            "bne     %[i],       %[temp3],      4b      \n\t" /* until i == 38 */
+
+            : [x1]"+r"(x1), [X_low1]"+r"(X_low1), [i]"=&r"(i),
+              [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+              [temp2]"=&r"(temp2) /* NOTE(review): temp2 is listed as an output but never referenced in this template */
+            : [i_Temp]"r"(i_Temp), [temp3]"r"(temp3)
+            : "memory"
+        );
+        x1 -= ((38-i_Temp)<<6)-1; /* back up the (38 - i_Temp) rows just written, then next column */
+        X_low1 -= ((38-i_Temp)<<1)- 80; /* back up the pairs just read, then next band */
+    }
+
+    x1=&X[0][i_Temp][k];
+    Y11=&Y1[i_Temp][k][0];
+    temp2=32; /* loop bound for the Y1 copy below (same value as i_f) */
+
+    for (; k < sbr->kx[1] + sbr->m[1]; k++) {
+
+        __asm__ volatile (
+           "move    %[i],       %[i_Temp]               \n\t"
+        "5:                                             \n\t"
+           "lw      %[temp0],   0(%[Y11])               \n\t"
+           "lw      %[temp1],   4(%[Y11])               \n\t"
+           "sw      %[temp0],   0(%[x1])                \n\t" /* -> X[0][i][k] */
+           "sw      %[temp1],   9728(%[x1])             \n\t" /* -> X[1][i][k] */
+           PTR_ADDIU "%[x1],    %[x1],          256     \n\t"
+           PTR_ADDIU "%[Y11],   %[Y11],         512     \n\t" /* next first-index of Y1 */
+           "addiu   %[i],       %[i],           1       \n\t"
+           "bne     %[i],       %[temp2],       5b      \n\t" /* until i == 32. NOTE(review): bottom-tested with an equality exit; presumably i_Temp < 32 always holds here -- confirm against callers */
+
+           : [x1]"+r"(x1), [Y11]"+r"(Y11), [i]"=&r"(i),
+             [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
+           : [i_Temp]"r"(i_Temp), [temp3]"r"(temp3), /* NOTE(review): temp3 is passed in but not referenced in this template */
+             [temp2]"r"(temp2)
+           : "memory"
+        );
+
+        x1 -= ((32-i_Temp)<<6)-1; /* back up the (32 - i_Temp) rows just written, then next column */
+        Y11 -= ((32-i_Temp)<<7)-2; /* back up the rows just read, then step one pair (next k) */
+   }
+      return 0;
+}
+
+#if HAVE_MIPSFPU
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+/**
+ * MIPS version of the SBR high-frequency assembly step.
+ *
+ * The function
+ *  1. seeds / carries over the first rows of the per-channel gain history
+ *     (ch_data->g_temp) and noise history (ch_data->q_temp),
+ *  2. copies sbr->gain[e] and sbr->q_m[e] into the history rows of every
+ *     time slot of every envelope e,
+ *  3. for every time slot: optionally smooths gain/noise over h_SL + 1
+ *     history rows with h_smooth[], applies the gain through
+ *     sbr->dsp.hf_g_filt(), and then either adds noise through
+ *     sbr->dsp.hf_apply_noise[]() or, for envelopes listed in e_a[], adds
+ *     the sinusoid levels sbr->s_m[e] with alternating signs.
+ *
+ * @param Y1       output; rows [kx, kx + m_max) of each time slot are updated
+ * @param X_high   input handed to hf_g_filt(), offset by kx
+ * @param sbr      SBR context (reads bs_smoothing_mode, kx[1], m[1], reset,
+ *                 gain, q_m, s_m and the dsp function table)
+ * @param ch_data  per-channel state; g_temp, q_temp, f_indexnoise and
+ *                 f_indexsine are updated
+ * @param e_a      two envelope indices that take the sinusoid path and are
+ *                 excluded from smoothing
+ */
+static void sbr_hf_assemble_mips(float Y1[38][64][2],
+                            const float X_high[64][40][2],
+                            SpectralBandReplication *sbr, SBRData *ch_data,
+                            const int e_a[2])
+{
+    int e, i, j, m;
+    const int h_SL = 4 * !sbr->bs_smoothing_mode; /* 4 when smoothing mode is 0, otherwise 0 */
+    const int kx = sbr->kx[1];
+    const int m_max = sbr->m[1];
+    static const float h_smooth[5] = {
+        0.33333333333333,
+        0.30150283239582,
+        0.21816949906249,
+        0.11516383427084,
+        0.03183050093751,
+    };
+
+    float (*g_temp)[48] = ch_data->g_temp, (*q_temp)[48] = ch_data->q_temp;
+    int indexnoise = ch_data->f_indexnoise;
+    int indexsine  = ch_data->f_indexsine;
+    float *g_temp1, *q_temp1, *pok, *pok1;
+    float temp1, temp2, temp3, temp4;
+    int size = m_max;
+
+    if (sbr->reset) {
+        /* after a reset, seed the h_SL history rows from envelope 0 */
+        for (i = 0; i < h_SL; i++) {
+            memcpy(g_temp[i + 2*ch_data->t_env[0]], sbr->gain[0], m_max * sizeof(sbr->gain[0][0]));
+            memcpy(q_temp[i + 2*ch_data->t_env[0]], sbr->q_m[0],  m_max * sizeof(sbr->q_m[0][0]));
+        }
+    } else if (h_SL) {
+        /* carry the last four history rows of the previous frame over.
+         * NOTE(review): source and destination are in the same array; this
+         * presumably relies on the two 4-row ranges never overlapping --
+         * confirm, since memcpy() does not allow overlap. */
+        memcpy(g_temp[2*ch_data->t_env[0]], g_temp[2*ch_data->t_env_num_env_old], 4*sizeof(g_temp[0]));
+        memcpy(q_temp[2*ch_data->t_env[0]], q_temp[2*ch_data->t_env_num_env_old], 4*sizeof(q_temp[0]));
+    }
+
+    /* copy gain[e] / q_m[e] (m_max floats each) into the history row of
+     * every time slot i belonging to envelope e */
+    for (e = 0; e < ch_data->bs_num_env; e++) {
+        for (i = 2 * ch_data->t_env[e]; i < 2 * ch_data->t_env[e + 1]; i++) {
+            g_temp1 = g_temp[h_SL + i];
+            pok = sbr->gain[e];
+            q_temp1 = q_temp[h_SL + i];
+            pok1 = sbr->q_m[e];
+
+            /* loop unrolled 4 times */
+            for (j=0; j<(size>>2); j++) {
+                __asm__ volatile (
+                    "lw      %[temp1],   0(%[pok])               \n\t"
+                    "lw      %[temp2],   4(%[pok])               \n\t"
+                    "lw      %[temp3],   8(%[pok])               \n\t"
+                    "lw      %[temp4],   12(%[pok])              \n\t"
+                    "sw      %[temp1],   0(%[g_temp1])           \n\t"
+                    "sw      %[temp2],   4(%[g_temp1])           \n\t"
+                    "sw      %[temp3],   8(%[g_temp1])           \n\t"
+                    "sw      %[temp4],   12(%[g_temp1])          \n\t"
+                    "lw      %[temp1],   0(%[pok1])              \n\t"
+                    "lw      %[temp2],   4(%[pok1])              \n\t"
+                    "lw      %[temp3],   8(%[pok1])              \n\t"
+                    "lw      %[temp4],   12(%[pok1])             \n\t"
+                    "sw      %[temp1],   0(%[q_temp1])           \n\t"
+                    "sw      %[temp2],   4(%[q_temp1])           \n\t"
+                    "sw      %[temp3],   8(%[q_temp1])           \n\t"
+                    "sw      %[temp4],   12(%[q_temp1])          \n\t"
+                    PTR_ADDIU "%[pok],     %[pok],         16    \n\t"
+                    PTR_ADDIU "%[g_temp1], %[g_temp1],     16    \n\t"
+                    PTR_ADDIU "%[pok1],    %[pok1],        16    \n\t"
+                    PTR_ADDIU "%[q_temp1], %[q_temp1],     16    \n\t"
+
+                    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+                      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
+                      [pok]"+r"(pok), [g_temp1]"+r"(g_temp1),
+                      [pok1]"+r"(pok1), [q_temp1]"+r"(q_temp1)
+                    :
+                    : "memory"
+                );
+            }
+
+            /* remaining (size & 3) elements, one float at a time */
+            for (j=0; j<(size&3); j++) {
+                __asm__ volatile (
+                    "lw      %[temp1],   0(%[pok])              \n\t"
+                    "lw      %[temp2],   0(%[pok1])             \n\t"
+                    "sw      %[temp1],   0(%[g_temp1])          \n\t"
+                    "sw      %[temp2],   0(%[q_temp1])          \n\t"
+                    PTR_ADDIU "%[pok],     %[pok],        4     \n\t"
+                    PTR_ADDIU "%[g_temp1], %[g_temp1],    4     \n\t"
+                    PTR_ADDIU "%[pok1],    %[pok1],       4     \n\t"
+                    PTR_ADDIU "%[q_temp1], %[q_temp1],    4     \n\t"
+
+                    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+                      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
+                      [pok]"+r"(pok), [g_temp1]"+r"(g_temp1),
+                      [pok1]"+r"(pok1), [q_temp1]"+r"(q_temp1)
+                    :
+                    : "memory"
+                );
+            }
+        }
+    }
+
+    for (e = 0; e < ch_data->bs_num_env; e++) {
+        for (i = 2 * ch_data->t_env[e]; i < 2 * ch_data->t_env[e + 1]; i++) {
+            LOCAL_ALIGNED_16(float, g_filt_tab, [48]);
+            LOCAL_ALIGNED_16(float, q_filt_tab, [48]);
+            float *g_filt, *q_filt;
+
+            if (h_SL && e != e_a[0] && e != e_a[1]) {
+                /* smooth gain and noise over the current and the h_SL
+                 * previous history rows */
+                g_filt = g_filt_tab;
+                q_filt = q_filt_tab;
+
+                for (m = 0; m < m_max; m++) {
+                    const int idx1 = i + h_SL;
+                    g_filt[m] = 0.0f;
+                    q_filt[m] = 0.0f;
+
+                    for (j = 0; j <= h_SL; j++) {
+                        g_filt[m] += g_temp[idx1 - j][m] * h_smooth[j];
+                        q_filt[m] += q_temp[idx1 - j][m] * h_smooth[j];
+                    }
+                }
+            } else {
+                /* no smoothing: use the history rows directly */
+                g_filt = g_temp[i + h_SL];
+                q_filt = q_temp[i];
+            }
+
+            sbr->dsp.hf_g_filt(Y1[i] + kx, X_high + kx, g_filt, m_max,
+                               i + ENVELOPE_ADJUSTMENT_OFFSET);
+
+            if (e != e_a[0] && e != e_a[1]) {
+                sbr->dsp.hf_apply_noise[indexsine](Y1[i] + kx, sbr->s_m[e],
+                                                   q_filt, indexnoise,
+                                                   kx, m_max);
+            } else {
+                /* sinusoid path: add s_m[e][m] to component idx of
+                 * Y1[i][kx + m], with sign A for even m and B for odd m */
+                int idx = indexsine&1;
+                int A = (1-((indexsine+(kx & 1))&2));
+                int B = (A^(-idx)) + idx;
+                float *out = &Y1[i][kx][idx];
+                float *in  = sbr->s_m[e];
+                float temp0, temp1, temp2, temp3, temp4, temp5;
+                float A_f = (float)A;
+                float B_f = (float)B;
+
+                /* two bands per iteration; the asm advances `in` by two
+                 * floats and `out` by four floats (two complex bands) */
+                for (m = 0; m+1 < m_max; m+=2) {
+
+                    temp2 = out[0];
+                    temp3 = out[2];
+
+                    __asm__ volatile(
+                        "lwc1    %[temp0],  0(%[in])                     \n\t"
+                        "lwc1    %[temp1],  4(%[in])                     \n\t"
+                        "madd.s  %[temp4],  %[temp2],  %[temp0], %[A_f]  \n\t"
+                        "madd.s  %[temp5],  %[temp3],  %[temp1], %[B_f]  \n\t"
+                        "swc1    %[temp4],  0(%[out])                    \n\t"
+                        "swc1    %[temp5],  8(%[out])                    \n\t"
+                        PTR_ADDIU "%[in],   %[in],     8                 \n\t"
+                        PTR_ADDIU "%[out],  %[out],    16                \n\t"
+
+                        : [temp0]"=&f" (temp0), [temp1]"=&f"(temp1),
+                          [temp4]"=&f" (temp4), [temp5]"=&f"(temp5),
+                          [in]"+r"(in), [out]"+r"(out)
+                        : [A_f]"f"(A_f), [B_f]"f"(B_f), [temp2]"f"(temp2),
+                          [temp3]"f"(temp3)
+                        : "memory"
+                    );
+                }
+                /* Odd m_max: one band is left. `in` and `out` were already
+                 * advanced by the asm above and now point at exactly that
+                 * band, so they must not be indexed by m again (doing so
+                 * would touch out[4*m] / in[2*m] relative to the start). */
+                if(m_max&1)
+                    out[0] += in[0] * A;
+            }
+            indexnoise = (indexnoise + m_max) & 0x1ff;
+            indexsine = (indexsine + 1) & 3;
+        }
+    }
+    ch_data->f_indexnoise = indexnoise;
+    ch_data->f_indexsine  = indexsine;
+}
+
+static void sbr_hf_inverse_filter_mips(SBRDSPContext *dsp,
+                                  float (*alpha0)[2], float (*alpha1)[2],
+                                  const float X_low[32][40][2], int k0)
+{   /*
+     * For each k in [0, k0): runs dsp->autocorrelate() on X_low[k] and
+     * derives the two-float coefficients alpha1[k] and alpha0[k] from the
+     * resulting phi table using single-precision FPU assembly.
+     * A coefficient pair is set to 0 when its divisor is zero (dk for
+     * alpha1, phi1[4] for alpha0); both pairs are cleared when the sum of
+     * squares of either one is >= 16.
+     *
+     * Instruction semantics relied on below (MIPS ISA, not defined in
+     * this file):
+     *   madd.s  fd, fr, fs, ft   ->  fd = fr + fs * ft
+     *   nmsub.s fd, fr, fs, ft   ->  fd = fr - fs * ft
+     * NOTE(review): the enclosing !HAVE_MIPS32R6 && !HAVE_MIPS64R6 guard is
+     * presumably there because these fused instructions are unavailable
+     * on R6 -- confirm.
+     */
+    int k;
+    float temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, c;
+    float *phi1, *alpha_1, *alpha_0, res1, res2, temp_real, temp_im;
+
+    c = 1.000001f; /* divisor applied to the squared-magnitude term of dk */
+
+    for (k = 0; k < k0; k++) {
+        LOCAL_ALIGNED_16(float, phi, [3], [2][2]);
+        float dk;
+        phi1 = &phi[0][0][0]; /* flat view of phi; byte offset / 4 == float index */
+        alpha_1 = &alpha1[k][0];
+        alpha_0 = &alpha0[k][0];
+        dsp->autocorrelate(X_low[k], phi);
+
+        /*
+         * Loads seven phi entries and computes
+         *   dk = phi1[10] * phi1[4] - (phi1[6]^2 + phi1[7]^2) / c
+         * On exit: temp0 = phi1[3], temp1 = phi1[4], temp2 = phi1[6],
+         * temp3 = phi1[7], temp4 = phi1[0], temp5 = phi1[1],
+         * temp6 = phi1[2]. The later asm statements consume these values.
+         */
+        __asm__ volatile (
+            "lwc1    %[temp0],  40(%[phi1])                       \n\t"
+            "lwc1    %[temp1],  16(%[phi1])                       \n\t"
+            "lwc1    %[temp2],  24(%[phi1])                       \n\t"
+            "lwc1    %[temp3],  28(%[phi1])                       \n\t"
+            "mul.s   %[dk],     %[temp0],    %[temp1]             \n\t"
+            "lwc1    %[temp4],  0(%[phi1])                        \n\t"
+            "mul.s   %[res2],   %[temp2],    %[temp2]             \n\t"
+            "lwc1    %[temp5],  4(%[phi1])                        \n\t"
+            "madd.s  %[res2],   %[res2],     %[temp3],  %[temp3]  \n\t"
+            "lwc1    %[temp6],  8(%[phi1])                        \n\t"
+            "div.s   %[res2],   %[res2],     %[c]                 \n\t"
+            "lwc1    %[temp0],  12(%[phi1])                       \n\t" /* temp0 is reused: it now holds phi1[3] */
+            "sub.s   %[dk],     %[dk],       %[res2]              \n\t"
+
+            : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+              [temp6]"=&f"(temp6), [res2]"=&f"(res2), [dk]"=&f"(dk)
+            : [phi1]"r"(phi1), [c]"f"(c)
+            : "memory"
+        );
+
+        if (!dk) {
+            alpha_1[0] = 0;
+            alpha_1[1] = 0;
+        } else {
+            /*
+             * alpha_1[0] = (temp4*temp2 - temp5*temp3 - temp6*temp1) / dk
+             * alpha_1[1] = (temp4*temp3 + temp5*temp2 - temp0*temp1) / dk
+             */
+            __asm__ volatile (
+                "mul.s   %[temp_real], %[temp4],     %[temp2]            \n\t"
+                "nmsub.s %[temp_real], %[temp_real], %[temp5], %[temp3]  \n\t"
+                "nmsub.s %[temp_real], %[temp_real], %[temp6], %[temp1]  \n\t"
+                "mul.s   %[temp_im],   %[temp4],     %[temp3]            \n\t"
+                "madd.s  %[temp_im],   %[temp_im],   %[temp5], %[temp2]  \n\t"
+                "nmsub.s %[temp_im],   %[temp_im],   %[temp0], %[temp1]  \n\t"
+                "div.s   %[temp_real], %[temp_real], %[dk]               \n\t"
+                "div.s   %[temp_im],   %[temp_im],   %[dk]               \n\t"
+                "swc1    %[temp_real], 0(%[alpha_1])                     \n\t"
+                "swc1    %[temp_im],   4(%[alpha_1])                     \n\t"
+
+                : [temp_real]"=&f" (temp_real), [temp_im]"=&f"(temp_im)
+                : [phi1]"r"(phi1), [temp0]"f"(temp0), [temp1]"f"(temp1),
+                  [temp2]"f"(temp2), [temp3]"f"(temp3), [temp4]"f"(temp4),
+                  [temp5]"f"(temp5), [temp6]"f"(temp6),
+                  [alpha_1]"r"(alpha_1), [dk]"f"(dk)
+                : "memory"
+            );
+        }
+
+        if (!phi1[4]) { /* phi1[4] is phi[1][0][0], the divisor used for alpha_0 */
+            alpha_0[0] = 0;
+            alpha_0[1] = 0;
+        } else {
+            /*
+             * With a0 = alpha_1[0] and a1 = alpha_1[1] reloaded from memory:
+             * alpha_0[0] = -(a0*temp2 + temp4 + a1*temp3) / temp1
+             * alpha_0[1] = -(a1*temp2 + temp5 - a0*temp3) / temp1
+             * temp6 is overwritten here (it receives a0).
+             */
+            __asm__ volatile (
+                "lwc1    %[temp6],     0(%[alpha_1])                     \n\t"
+                "lwc1    %[temp7],     4(%[alpha_1])                     \n\t"
+                "mul.s   %[temp_real], %[temp6],     %[temp2]            \n\t"
+                "add.s   %[temp_real], %[temp_real], %[temp4]            \n\t"
+                "madd.s  %[temp_real], %[temp_real], %[temp7], %[temp3]  \n\t"
+                "mul.s   %[temp_im],   %[temp7],     %[temp2]            \n\t"
+                "add.s   %[temp_im],   %[temp_im],   %[temp5]            \n\t"
+                "nmsub.s %[temp_im],   %[temp_im],   %[temp6], %[temp3]  \n\t"
+                "div.s   %[temp_real], %[temp_real], %[temp1]            \n\t"
+                "div.s   %[temp_im],   %[temp_im],   %[temp1]            \n\t"
+                "neg.s   %[temp_real], %[temp_real]                      \n\t"
+                "neg.s   %[temp_im],   %[temp_im]                        \n\t"
+                "swc1    %[temp_real], 0(%[alpha_0])                     \n\t"
+                "swc1    %[temp_im],   4(%[alpha_0])                     \n\t"
+
+                : [temp_real]"=&f"(temp_real), [temp_im]"=&f"(temp_im),
+                  [temp6]"=&f"(temp6), [temp7]"=&f"(temp7),
+                  [res1]"=&f"(res1), [res2]"=&f"(res2) /* NOTE(review): res1/res2 are listed as outputs but not referenced in this template */
+                : [alpha_1]"r"(alpha_1), [alpha_0]"r"(alpha_0),
+                  [temp0]"f"(temp0), [temp1]"f"(temp1), [temp2]"f"(temp2),
+                  [temp3]"f"(temp3), [temp4]"f"(temp4), [temp5]"f"(temp5)
+                : "memory"
+            );
+        }
+
+        /*
+         * Reload both coefficient pairs from memory and compute
+         *   res1 = alpha_1[0]^2 + alpha_1[1]^2
+         *   res2 = alpha_0[0]^2 + alpha_0[1]^2
+         */
+        __asm__ volatile (
+            "lwc1    %[temp1],      0(%[alpha_1])                           \n\t"
+            "lwc1    %[temp2],      4(%[alpha_1])                           \n\t"
+            "lwc1    %[temp_real],  0(%[alpha_0])                           \n\t"
+            "lwc1    %[temp_im],    4(%[alpha_0])                           \n\t"
+            "mul.s   %[res1],       %[temp1],      %[temp1]                 \n\t"
+            "madd.s  %[res1],       %[res1],       %[temp2],    %[temp2]    \n\t"
+            "mul.s   %[res2],       %[temp_real],  %[temp_real]             \n\t"
+            "madd.s  %[res2],       %[res2],       %[temp_im],  %[temp_im]  \n\t"
+
+            : [temp_real]"=&f"(temp_real), [temp_im]"=&f"(temp_im),
+              [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [res1]"=&f"(res1), [res2]"=&f"(res2)
+            : [alpha_1]"r"(alpha_1), [alpha_0]"r"(alpha_0)
+            : "memory"
+        );
+
+        if (res1 >= 16.0f || res2 >= 16.0f) { /* either squared magnitude too large: clear both pairs */
+            alpha_1[0] = 0;
+            alpha_1[1] = 0;
+            alpha_0[0] = 0;
+            alpha_0[1] = 0;
+        }
+    }
+}
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+
+/**
+ * Point the SBR function table of @p c at the MIPS implementations
+ * defined in this file.
+ *
+ * Nothing is changed unless inline assembly is enabled. The two routines
+ * that use the FPU are only installed on FPU builds that do not target
+ * MIPS32R6 / MIPS64R6, matching the conditions under which they are
+ * compiled.
+ */
+void ff_aacsbr_func_ptr_init_mips(AACSBRContext *c)
+{
+#if HAVE_INLINE_ASM
+#if HAVE_MIPSFPU && !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+    /* floating-point variants */
+    c->sbr_hf_assemble       = sbr_hf_assemble_mips;
+    c->sbr_hf_inverse_filter = sbr_hf_inverse_filter_mips;
+#endif /* HAVE_MIPSFPU && !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+    /* integer-register copy routines */
+    c->sbr_x_gen             = sbr_x_gen_mips;
+    c->sbr_lf_gen            = sbr_lf_gen_mips;
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/libavcodec/mips/aacsbr_mips.h b/libavcodec/mips/aacsbr_mips.h
new file mode 100644
index 0000000..4461e76
--- /dev/null
+++ b/libavcodec/mips/aacsbr_mips.h
@@ -0,0 +1,496 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Djordje Pesut   (djordje@mips.com)
+ *           Mirjana Vulin   (mvulin@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/aacsbr.c
+ */
+
+#ifndef AVCODEC_MIPS_AACSBR_MIPS_H
+#define AVCODEC_MIPS_AACSBR_MIPS_H
+
+#include "libavcodec/aac.h"
+#include "libavcodec/sbr.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+static void sbr_qmf_analysis_mips(AVFloatDSPContext *fdsp, FFTContext *mdct,
+                             SBRDSPContext *sbrdsp, const float *in, float *x,
+                             float z[320], float W[2][32][32][2], int buf_idx)
+{   /*
+     * Updates the buffer x and produces 32 outputs in W[buf_idx].
+     *
+     * First the 288 floats at x + 1024 are moved to the start of x and
+     * 1024 floats from `in` are placed at x + 288 (both copies are done
+     * with word loads/stores in inline assembly). Then, 32 times, a
+     * 320-float window starting at x is processed into W[buf_idx][i]
+     * through vector_fmul_reverse, sum64x5, qmf_pre_shuffle, imdct_half
+     * and qmf_post_shuffle, with x advancing by 32 floats per iteration.
+     * z is used as scratch space.
+     */
+    int i;
+    float *w0;
+    float *w1;
+    int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+    w0 = x;
+    w1 = x + 1024;
+    for(i = 0; i < 36; i++) /* 36 iterations * 8 words == 288 floats */
+    {
+        /* loop unrolled 8 times */
+        __asm__ volatile(
+            "lw      %[temp0],   0(%[w1])         \n\t"
+            "lw      %[temp1],   4(%[w1])         \n\t"
+            "lw      %[temp2],   8(%[w1])         \n\t"
+            "lw      %[temp3],   12(%[w1])        \n\t"
+            "lw      %[temp4],   16(%[w1])        \n\t"
+            "lw      %[temp5],   20(%[w1])        \n\t"
+            "lw      %[temp6],   24(%[w1])        \n\t"
+            "lw      %[temp7],   28(%[w1])        \n\t"
+            "sw      %[temp0],   0(%[w0])         \n\t"
+            "sw      %[temp1],   4(%[w0])         \n\t"
+            "sw      %[temp2],   8(%[w0])         \n\t"
+            "sw      %[temp3],   12(%[w0])        \n\t"
+            "sw      %[temp4],   16(%[w0])        \n\t"
+            "sw      %[temp5],   20(%[w0])        \n\t"
+            "sw      %[temp6],   24(%[w0])        \n\t"
+            "sw      %[temp7],   28(%[w0])        \n\t"
+            PTR_ADDIU " %[w0],      %[w0],     32 \n\t" /* both pointers advance 32 bytes (8 floats) */
+            PTR_ADDIU " %[w1],      %[w1],     32 \n\t"
+
+            : [w0]"+r"(w0), [w1]"+r"(w1),
+              [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+              [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+              [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+              [temp6]"=&r"(temp6), [temp7]"=&r"(temp7)
+            :
+            : "memory"
+        );
+    }
+
+    w0 = x + 288; /* destination: right after the 288 floats moved above */
+    w1 = (float*)in; /* const is cast away, but w1 is only read through */
+    for(i = 0; i < 128; i++) /* 128 iterations * 8 words == 1024 floats */
+    {
+        /* loop unrolled 8 times */
+        __asm__ volatile(
+            "lw       %[temp0],    0(%[w1])        \n\t"
+            "lw       %[temp1],    4(%[w1])        \n\t"
+            "lw       %[temp2],    8(%[w1])        \n\t"
+            "lw       %[temp3],    12(%[w1])       \n\t"
+            "lw       %[temp4],    16(%[w1])       \n\t"
+            "lw       %[temp5],    20(%[w1])       \n\t"
+            "lw       %[temp6],    24(%[w1])       \n\t"
+            "lw       %[temp7],    28(%[w1])       \n\t"
+            "sw       %[temp0],    0(%[w0])        \n\t"
+            "sw       %[temp1],    4(%[w0])        \n\t"
+            "sw       %[temp2],    8(%[w0])        \n\t"
+            "sw       %[temp3],    12(%[w0])       \n\t"
+            "sw       %[temp4],    16(%[w0])       \n\t"
+            "sw       %[temp5],    20(%[w0])       \n\t"
+            "sw       %[temp6],    24(%[w0])       \n\t"
+            "sw       %[temp7],    28(%[w0])       \n\t"
+            PTR_ADDIU "  %[w0],       %[w0],    32 \n\t"
+            PTR_ADDIU "  %[w1],       %[w1],    32 \n\t"
+
+            : [w0]"+r"(w0), [w1]"+r"(w1),
+              [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+              [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+              [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+              [temp6]"=&r"(temp6), [temp7]"=&r"(temp7)
+            :
+            : "memory"
+        );
+    }
+
+    for (i = 0; i < 32; i++) { // numTimeSlots*RATE = 16*2 as 960 sample frames
+                               // are not supported
+        fdsp->vector_fmul_reverse(z, sbr_qmf_window_ds, x, 320); /* z = x window multiplied with sbr_qmf_window_ds, 320 floats */
+        sbrdsp->sum64x5(z);
+        sbrdsp->qmf_pre_shuffle(z);
+        mdct->imdct_half(mdct, z, z+64); /* reads from z + 64, writes to z */
+        sbrdsp->qmf_post_shuffle(W[buf_idx][i], z);
+        x += 32; /* slide the 320-float window by 32 floats */
+    }
+}
+
+#if HAVE_MIPSFPU
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+static void sbr_qmf_synthesis_mips(FFTContext *mdct,
+                              SBRDSPContext *sbrdsp, AVFloatDSPContext *fdsp,
+                              float *out, float X[2][38][64],
+                              float mdct_buf[2][64],
+                              float *v0, int *v_off, const unsigned int div)
+{
+    int i, n;
+    const float *sbr_qmf_window = div ? sbr_qmf_window_ds : sbr_qmf_window_us;
+    const int step = 128 >> div;
+    float *v;
+    float temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12, temp13;
+    float temp14, temp15, temp16, temp17, temp18, temp19;
+    float *vv0, *s0, *dst;
+    dst = out;
+
+    for (i = 0; i < 32; i++) {
+        if (*v_off < step) {
+            int saved_samples = (1280 - 128) >> div;
+            memcpy(&v0[SBR_SYNTHESIS_BUF_SIZE - saved_samples], v0, saved_samples * sizeof(float));
+            *v_off = SBR_SYNTHESIS_BUF_SIZE - saved_samples - step;
+        } else {
+            *v_off -= step;
+        }
+        v = v0 + *v_off;
+        if (div) {
+            for (n = 0; n < 32; n++) {
+                X[0][i][   n] = -X[0][i][n];
+                X[0][i][32+n] =  X[1][i][31-n];
+            }
+            mdct->imdct_half(mdct, mdct_buf[0], X[0][i]);
+            sbrdsp->qmf_deint_neg(v, mdct_buf[0]);
+        } else {
+            sbrdsp->neg_odd_64(X[1][i]);
+            mdct->imdct_half(mdct, mdct_buf[0], X[0][i]);
+            mdct->imdct_half(mdct, mdct_buf[1], X[1][i]);
+            sbrdsp->qmf_deint_bfly(v, mdct_buf[1], mdct_buf[0]);
+        }
+
+        if(div == 0)
+        {
+            float *v0_end;
+            vv0 = v;
+            v0_end = v + 60;
+            s0 = (float*)sbr_qmf_window;
+
+            /* 10 calls of function vector_fmul_add merged into one loop
+               and loop unrolled 4 times */
+            __asm__ volatile(
+                ".set    push                                           \n\t"
+                ".set    noreorder                                      \n\t"
+                "lwc1    %[temp4],   0(%[v0])                           \n\t"
+                "lwc1    %[temp5],   0(%[s0])                           \n\t"
+                "lwc1    %[temp6],   4(%[v0])                           \n\t"
+                "lwc1    %[temp7],   4(%[s0])                           \n\t"
+                "lwc1    %[temp8],   8(%[v0])                           \n\t"
+                "lwc1    %[temp9],   8(%[s0])                           \n\t"
+                "lwc1    %[temp10],  12(%[v0])                          \n\t"
+                "lwc1    %[temp11],  12(%[s0])                          \n\t"
+                "lwc1    %[temp12],  768(%[v0])                         \n\t"
+                "lwc1    %[temp13],  256(%[s0])                         \n\t"
+                "lwc1    %[temp14],  772(%[v0])                         \n\t"
+                "lwc1    %[temp15],  260(%[s0])                         \n\t"
+                "lwc1    %[temp16],  776(%[v0])                         \n\t"
+                "lwc1    %[temp17],  264(%[s0])                         \n\t"
+                "lwc1    %[temp18],  780(%[v0])                         \n\t"
+                "lwc1    %[temp19],  268(%[s0])                         \n\t"
+            "1:                                                         \n\t"
+                "mul.s   %[temp0],   %[temp4],   %[temp5]               \n\t"
+                "lwc1    %[temp4],   1024(%[v0])                        \n\t"
+                "mul.s   %[temp1],   %[temp6],   %[temp7]               \n\t"
+                "lwc1    %[temp5],   512(%[s0])                         \n\t"
+                "mul.s   %[temp2],   %[temp8],   %[temp9]               \n\t"
+                "lwc1    %[temp6],   1028(%[v0])                        \n\t"
+                "mul.s   %[temp3],   %[temp10],  %[temp11]              \n\t"
+                "lwc1    %[temp7],   516(%[s0])                         \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                "lwc1    %[temp8],   1032(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "lwc1    %[temp9],   520(%[s0])                         \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "lwc1    %[temp10],  1036(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "lwc1    %[temp11],  524(%[s0])                         \n\t"
+                "lwc1    %[temp12],  1792(%[v0])                        \n\t"
+                "lwc1    %[temp13],  768(%[s0])                         \n\t"
+                "lwc1    %[temp14],  1796(%[v0])                        \n\t"
+                "lwc1    %[temp15],  772(%[s0])                         \n\t"
+                "lwc1    %[temp16],  1800(%[v0])                        \n\t"
+                "lwc1    %[temp17],  776(%[s0])                         \n\t"
+                "lwc1    %[temp18],  1804(%[v0])                        \n\t"
+                "lwc1    %[temp19],  780(%[s0])                         \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp4],   %[temp5]   \n\t"
+                "lwc1    %[temp4],   2048(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp6],   %[temp7]   \n\t"
+                "lwc1    %[temp5],   1024(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp8],   %[temp9]   \n\t"
+                "lwc1    %[temp6],   2052(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp10],  %[temp11]  \n\t"
+                "lwc1    %[temp7],   1028(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                "lwc1    %[temp8],   2056(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "lwc1    %[temp9],   1032(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "lwc1    %[temp10],  2060(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "lwc1    %[temp11],  1036(%[s0])                        \n\t"
+                "lwc1    %[temp12],  2816(%[v0])                        \n\t"
+                "lwc1    %[temp13],  1280(%[s0])                        \n\t"
+                "lwc1    %[temp14],  2820(%[v0])                        \n\t"
+                "lwc1    %[temp15],  1284(%[s0])                        \n\t"
+                "lwc1    %[temp16],  2824(%[v0])                        \n\t"
+                "lwc1    %[temp17],  1288(%[s0])                        \n\t"
+                "lwc1    %[temp18],  2828(%[v0])                        \n\t"
+                "lwc1    %[temp19],  1292(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp4],   %[temp5]   \n\t"
+                "lwc1    %[temp4],   3072(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp6],   %[temp7]   \n\t"
+                "lwc1    %[temp5],   1536(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp8],   %[temp9]   \n\t"
+                "lwc1    %[temp6],   3076(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp10],  %[temp11]  \n\t"
+                "lwc1    %[temp7],   1540(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                "lwc1    %[temp8],   3080(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "lwc1    %[temp9],   1544(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "lwc1    %[temp10],  3084(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "lwc1    %[temp11],  1548(%[s0])                        \n\t"
+                "lwc1    %[temp12],  3840(%[v0])                        \n\t"
+                "lwc1    %[temp13],  1792(%[s0])                        \n\t"
+                "lwc1    %[temp14],  3844(%[v0])                        \n\t"
+                "lwc1    %[temp15],  1796(%[s0])                        \n\t"
+                "lwc1    %[temp16],  3848(%[v0])                        \n\t"
+                "lwc1    %[temp17],  1800(%[s0])                        \n\t"
+                "lwc1    %[temp18],  3852(%[v0])                        \n\t"
+                "lwc1    %[temp19],  1804(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp4],   %[temp5]   \n\t"
+                "lwc1    %[temp4],   4096(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp6],   %[temp7]   \n\t"
+                "lwc1    %[temp5],   2048(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp8],   %[temp9]   \n\t"
+                "lwc1    %[temp6],   4100(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp10],  %[temp11]  \n\t"
+                "lwc1    %[temp7],   2052(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                "lwc1    %[temp8],   4104(%[v0])                        \n\t"
+                PTR_ADDIU "%[dst],     %[dst],      16                  \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "lwc1    %[temp9],   2056(%[s0])                        \n\t"
+                PTR_ADDIU " %[s0],      %[s0],      16                  \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "lwc1    %[temp10],  4108(%[v0])                        \n\t"
+                PTR_ADDIU " %[v0],      %[v0],      16                  \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "lwc1    %[temp11],  2044(%[s0])                        \n\t"
+                "lwc1    %[temp12],  4848(%[v0])                        \n\t"
+                "lwc1    %[temp13],  2288(%[s0])                        \n\t"
+                "lwc1    %[temp14],  4852(%[v0])                        \n\t"
+                "lwc1    %[temp15],  2292(%[s0])                        \n\t"
+                "lwc1    %[temp16],  4856(%[v0])                        \n\t"
+                "lwc1    %[temp17],  2296(%[s0])                        \n\t"
+                "lwc1    %[temp18],  4860(%[v0])                        \n\t"
+                "lwc1    %[temp19],  2300(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp4],   %[temp5]   \n\t"
+                "lwc1    %[temp4],   0(%[v0])                           \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp6],   %[temp7]   \n\t"
+                "lwc1    %[temp5],   0(%[s0])                           \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp8],   %[temp9]   \n\t"
+                "lwc1    %[temp6],   4(%[v0])                           \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp10],  %[temp11]  \n\t"
+                "lwc1    %[temp7],   4(%[s0])                           \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                "lwc1    %[temp8],   8(%[v0])                           \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "lwc1    %[temp9],   8(%[s0])                           \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "lwc1    %[temp10],  12(%[v0])                          \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "lwc1    %[temp11],  12(%[s0])                          \n\t"
+                "lwc1    %[temp12],  768(%[v0])                         \n\t"
+                "lwc1    %[temp13],  256(%[s0])                         \n\t"
+                "lwc1    %[temp14],  772(%[v0])                         \n\t"
+                "lwc1    %[temp15],  260(%[s0])                         \n\t"
+                "lwc1    %[temp16],  776(%[v0])                         \n\t"
+                "lwc1    %[temp17],  264(%[s0])                         \n\t"
+                "lwc1    %[temp18],  780(%[v0])                         \n\t"
+                "lwc1    %[temp19],  268(%[s0])                         \n\t"
+                "swc1    %[temp0],   -16(%[dst])                        \n\t"
+                "swc1    %[temp1],   -12(%[dst])                        \n\t"
+                "swc1    %[temp2],   -8(%[dst])                         \n\t"
+                "bne     %[v0],      %[v0_end],  1b                     \n\t"
+                " swc1   %[temp3],   -4(%[dst])                         \n\t"
+                "mul.s   %[temp0],   %[temp4],   %[temp5]               \n\t"
+                "lwc1    %[temp4],   1024(%[v0])                        \n\t"
+                "mul.s   %[temp1],   %[temp6],   %[temp7]               \n\t"
+                "lwc1    %[temp5],   512(%[s0])                         \n\t"
+                "mul.s   %[temp2],   %[temp8],   %[temp9]               \n\t"
+                "lwc1    %[temp6],   1028(%[v0])                        \n\t"
+                "mul.s   %[temp3],   %[temp10],  %[temp11]              \n\t"
+                "lwc1    %[temp7],   516(%[s0])                         \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                "lwc1    %[temp8],   1032(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "lwc1    %[temp9],   520(%[s0])                         \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "lwc1    %[temp10],  1036(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "lwc1    %[temp11],  524(%[s0])                         \n\t"
+                "lwc1    %[temp12],  1792(%[v0])                        \n\t"
+                "lwc1    %[temp13],  768(%[s0])                         \n\t"
+                "lwc1    %[temp14],  1796(%[v0])                        \n\t"
+                "lwc1    %[temp15],  772(%[s0])                         \n\t"
+                "lwc1    %[temp16],  1800(%[v0])                        \n\t"
+                "lwc1    %[temp17],  776(%[s0])                         \n\t"
+                "lwc1    %[temp18],  1804(%[v0])                        \n\t"
+                "lwc1    %[temp19],  780(%[s0])                         \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp4],   %[temp5]   \n\t"
+                "lwc1    %[temp4],   2048(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp6],   %[temp7]   \n\t"
+                "lwc1    %[temp5],   1024(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp8],   %[temp9]   \n\t"
+                "lwc1    %[temp6],   2052(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp10],  %[temp11]  \n\t"
+                "lwc1    %[temp7],   1028(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                "lwc1    %[temp8],   2056(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "lwc1    %[temp9],   1032(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "lwc1    %[temp10],  2060(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "lwc1    %[temp11],  1036(%[s0])                        \n\t"
+                "lwc1    %[temp12],  2816(%[v0])                        \n\t"
+                "lwc1    %[temp13],  1280(%[s0])                        \n\t"
+                "lwc1    %[temp14],  2820(%[v0])                        \n\t"
+                "lwc1    %[temp15],  1284(%[s0])                        \n\t"
+                "lwc1    %[temp16],  2824(%[v0])                        \n\t"
+                "lwc1    %[temp17],  1288(%[s0])                        \n\t"
+                "lwc1    %[temp18],  2828(%[v0])                        \n\t"
+                "lwc1    %[temp19],  1292(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp4],   %[temp5]   \n\t"
+                "lwc1    %[temp4],   3072(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp6],   %[temp7]   \n\t"
+                "lwc1    %[temp5],   1536(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp8],   %[temp9]   \n\t"
+                "lwc1    %[temp6],   3076(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp10],  %[temp11]  \n\t"
+                "lwc1    %[temp7],   1540(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                "lwc1    %[temp8],   3080(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "lwc1    %[temp9],   1544(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "lwc1    %[temp10],  3084(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "lwc1    %[temp11],  1548(%[s0])                        \n\t"
+                "lwc1    %[temp12],  3840(%[v0])                        \n\t"
+                "lwc1    %[temp13],  1792(%[s0])                        \n\t"
+                "lwc1    %[temp14],  3844(%[v0])                        \n\t"
+                "lwc1    %[temp15],  1796(%[s0])                        \n\t"
+                "lwc1    %[temp16],  3848(%[v0])                        \n\t"
+                "lwc1    %[temp17],  1800(%[s0])                        \n\t"
+                "lwc1    %[temp18],  3852(%[v0])                        \n\t"
+                "lwc1    %[temp19],  1804(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp4],   %[temp5]   \n\t"
+                "lwc1    %[temp4],   4096(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp6],   %[temp7]   \n\t"
+                "lwc1    %[temp5],   2048(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp8],   %[temp9]   \n\t"
+                "lwc1    %[temp6],   4100(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp10],  %[temp11]  \n\t"
+                "lwc1    %[temp7],   2052(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                "lwc1    %[temp8],   4104(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "lwc1    %[temp9],   2056(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "lwc1    %[temp10],  4108(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "lwc1    %[temp11],  2060(%[s0])                        \n\t"
+                "lwc1    %[temp12],  4864(%[v0])                        \n\t"
+                "lwc1    %[temp13],  2304(%[s0])                        \n\t"
+                "lwc1    %[temp14],  4868(%[v0])                        \n\t"
+                "lwc1    %[temp15],  2308(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp4],   %[temp5]   \n\t"
+                "lwc1    %[temp16],  4872(%[v0])                        \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp6],   %[temp7]   \n\t"
+                "lwc1    %[temp17],  2312(%[s0])                        \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp8],   %[temp9]   \n\t"
+                "lwc1    %[temp18],  4876(%[v0])                        \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp10],  %[temp11]  \n\t"
+                "lwc1    %[temp19],  2316(%[s0])                        \n\t"
+                "madd.s  %[temp0],   %[temp0],   %[temp12],  %[temp13]  \n\t"
+                PTR_ADDIU "%[dst],     %[dst],     16                   \n\t"
+                "madd.s  %[temp1],   %[temp1],   %[temp14],  %[temp15]  \n\t"
+                "madd.s  %[temp2],   %[temp2],   %[temp16],  %[temp17]  \n\t"
+                "madd.s  %[temp3],   %[temp3],   %[temp18],  %[temp19]  \n\t"
+                "swc1    %[temp0],   -16(%[dst])                        \n\t"
+                "swc1    %[temp1],   -12(%[dst])                        \n\t"
+                "swc1    %[temp2],   -8(%[dst])                         \n\t"
+                "swc1    %[temp3],   -4(%[dst])                         \n\t"
+                ".set    pop                                            \n\t"
+
+                : [dst]"+r"(dst), [v0]"+r"(vv0), [s0]"+r"(s0),
+                  [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+                  [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+                  [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
+                  [temp9]"=&f"(temp9), [temp10]"=&f"(temp10), [temp11]"=&f"(temp11),
+                  [temp12]"=&f"(temp12), [temp13]"=&f"(temp13), [temp14]"=&f"(temp14),
+                  [temp15]"=&f"(temp15), [temp16]"=&f"(temp16), [temp17]"=&f"(temp17),
+                  [temp18]"=&f"(temp18), [temp19]"=&f"(temp19)
+                : [v0_end]"r"(v0_end)
+                : "memory"
+            );
+        }
+        else
+        {
+            fdsp->vector_fmul   (out, v                , sbr_qmf_window                       , 64 >> div);
+            fdsp->vector_fmul_add(out, v + ( 192 >> div), sbr_qmf_window + ( 64 >> div), out   , 64 >> div);
+            fdsp->vector_fmul_add(out, v + ( 256 >> div), sbr_qmf_window + (128 >> div), out   , 64 >> div);
+            fdsp->vector_fmul_add(out, v + ( 448 >> div), sbr_qmf_window + (192 >> div), out   , 64 >> div);
+            fdsp->vector_fmul_add(out, v + ( 512 >> div), sbr_qmf_window + (256 >> div), out   , 64 >> div);
+            fdsp->vector_fmul_add(out, v + ( 704 >> div), sbr_qmf_window + (320 >> div), out   , 64 >> div);
+            fdsp->vector_fmul_add(out, v + ( 768 >> div), sbr_qmf_window + (384 >> div), out   , 64 >> div);
+            fdsp->vector_fmul_add(out, v + ( 960 >> div), sbr_qmf_window + (448 >> div), out   , 64 >> div);
+            fdsp->vector_fmul_add(out, v + (1024 >> div), sbr_qmf_window + (512 >> div), out   , 64 >> div);
+            fdsp->vector_fmul_add(out, v + (1216 >> div), sbr_qmf_window + (576 >> div), out   , 64 >> div);
+            out += 64 >> div;
+        }
+    }
+}
+
+#define sbr_qmf_analysis sbr_qmf_analysis_mips /* from here on the generic name expands to the _mips routine (only inside the enclosing HAVE_INLINE_ASM / HAVE_MIPSFPU / non-R6 guards) */
+#define sbr_qmf_synthesis sbr_qmf_synthesis_mips /* same remapping for the synthesis routine defined above */
+
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+
+#endif /* AVCODEC_MIPS_AACSBR_MIPS_H */
diff --git a/libavcodec/mips/ac3dsp_mips.c b/libavcodec/mips/ac3dsp_mips.c
new file mode 100644
index 0000000..f9aaf15
--- /dev/null
+++ b/libavcodec/mips/ac3dsp_mips.c
@@ -0,0 +1,417 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Branimir Vasic (bvasic@mips.com)
+ *           Nedeljko Babic (nbabic@mips.com)
+ *
+ * Various AC-3 DSP Utils optimized for MIPS
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/ac3dsp.c
+ */
+
+#include "config.h"
+#include "libavcodec/ac3dsp.h"
+#include "libavcodec/ac3.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+#if HAVE_MIPSDSP
+static void ac3_bit_alloc_calc_bap_mips(int16_t *mask, int16_t *psd, /* MIPS DSP ASE routine: fills bap[start..end) from bap_tab, one band at a time */
+                                        int start, int end, /* start: first bin processed; end: one past the last bin processed */
+                                        int snr_offset, int floor, /* snr_offset == -960 is a special case (below); both feed the per-band value m */
+                                        const uint8_t *bap_tab, uint8_t *bap) /* bap_tab: lookup table read at indices 0..63; bap: output bytes */
+{
+    int band, band_end, cond;
+    int m, address1, address2;
+    int16_t *psd1, *psd_end;
+    uint8_t *bap1;
+
+    if (snr_offset == -960) { /* special case: zero AC3_MAX_COEFS bytes of bap and return without touching psd/mask */
+        memset(bap, 0, AC3_MAX_COEFS);
+        return;
+    }
+
+    psd1 = &psd[start]; /* read cursor, advanced by the asm */
+    bap1 = &bap[start]; /* write cursor, advanced by the asm */
+    band = ff_ac3_bin_to_band_tab[start]; /* band index looked up for the first bin */
+
+    do {
+        m = (FFMAX(mask[band] - snr_offset - floor, 0) & 0x1FE0) + floor; /* per-band value subtracted from every psd sample of this band */
+        band_end = ff_ac3_band_start_tab[++band]; /* presumably the first bin of the next band (per the table name) */
+        band_end = FFMIN(band_end, end); /* never process past end */
+        psd_end = psd + band_end - 1; /* last sample of this range, so "psd1 < psd_end" means at least two samples remain */
+
+        __asm__ volatile (
+            "slt        %[cond],        %[psd1],        %[psd_end]  \n\t" /* cond = at least two samples left? */
+            "beqz       %[cond],        1f                          \n\t" /* no: go to the single-sample tail. NOTE(review): the tail then runs unconditionally, so an empty range would still process one element - confirm callers guarantee start < end */
+            "2:                                                     \n\t" /* main loop: two samples per iteration, the two computations are interleaved */
+            "lh         %[address1],    0(%[psd1])                  \n\t" /* address1 = psd1[0] */
+            "lh         %[address2],    2(%[psd1])                  \n\t" /* address2 = psd1[1] */
+            PTR_ADDIU " %[psd1],        %[psd1],        4           \n\t" /* advance by two int16_t */
+            "subu       %[address1],    %[address1],    %[m]        \n\t" /* psd - m */
+            "sra        %[address1],    %[address1],    5           \n\t" /* >> 5 (arithmetic) */
+            "addiu      %[address1],    %[address1],    -32         \n\t" /* bias by -32 so the wanted 0..63 range maps onto -32..31 */
+            "shll_s.w   %[address1],    %[address1],    26          \n\t" /* saturating left shift by 26 ... */
+            "subu       %[address2],    %[address2],    %[m]        \n\t"
+            "sra        %[address2],    %[address2],    5           \n\t"
+            "sra        %[address1],    %[address1],    26          \n\t" /* ... then arithmetic shift back: value is clamped to -32..31 */
+            "addiu      %[address1],    %[address1],    32          \n\t" /* undo the bias: index now in 0..63 */
+            "lbux       %[address1],    %[address1](%[bap_tab])     \n\t" /* address1 = bap_tab[index] (unsigned byte) */
+            "addiu      %[address2],    %[address2],    -32         \n\t"
+            "shll_s.w   %[address2],    %[address2],    26          \n\t"
+            "sb         %[address1],    0(%[bap1])                  \n\t" /* bap1[0] = first result */
+            "slt        %[cond],        %[psd1],        %[psd_end]  \n\t" /* another full pair available? */
+            "sra        %[address2],    %[address2],    26          \n\t"
+            "addiu      %[address2],    %[address2],    32          \n\t"
+            "lbux       %[address2],    %[address2](%[bap_tab])     \n\t"
+            "sb         %[address2],    1(%[bap1])                  \n\t" /* bap1[1] = second result */
+            PTR_ADDIU " %[bap1],        %[bap1],        2           \n\t" /* advance output by two bytes */
+            "bnez       %[cond],        2b                          \n\t"
+            PTR_ADDIU " %[psd_end],     %[psd_end],     2           \n\t" /* psd_end += one int16_t: now one past the last sample of the range */
+            "slt        %[cond],        %[psd1],        %[psd_end]  \n\t" /* is one odd sample left over? */
+            "beqz       %[cond],        3f                          \n\t" /* no: this band is finished */
+            "1:                                                     \n\t" /* tail: same computation for a single sample */
+            "lh         %[address1],    0(%[psd1])                  \n\t"
+            PTR_ADDIU " %[psd1],        %[psd1],        2           \n\t"
+            "subu       %[address1],    %[address1],    %[m]        \n\t"
+            "sra        %[address1],    %[address1],    5           \n\t"
+            "addiu      %[address1],    %[address1],    -32         \n\t"
+            "shll_s.w   %[address1],    %[address1],    26          \n\t"
+            "sra        %[address1],    %[address1],    26          \n\t"
+            "addiu      %[address1],    %[address1],    32          \n\t"
+            "lbux       %[address1],    %[address1](%[bap_tab])     \n\t"
+            "sb         %[address1],    0(%[bap1])                  \n\t"
+            PTR_ADDIU " %[bap1],        %[bap1],        1           \n\t"
+            "3:                                                     \n\t"
+
+            : [address1]"=&r"(address1), [address2]"=&r"(address2),
+              [cond]"=&r"(cond), [bap1]"+r"(bap1),
+              [psd1]"+r"(psd1), [psd_end]"+r"(psd_end) /* the cursors are read-write so their advanced values carry into the next band */
+            : [m]"r"(m), [bap_tab]"r"(bap_tab)
+            : "memory" /* the asm reads psd/bap_tab and writes bap through pointers */
+        );
+    } while (end > band_end); /* stop once the range that reaches end has been processed */
+}
+
+static void ac3_update_bap_counts_mips(uint16_t mant_cnt[16], uint8_t *bap, /* For each of the len bytes b in bap, increments mant_cnt[b]; bap is walked from its end towards its start */
+                                       int len) /* len: number of bytes of bap to count */
+{
+    void *temp0, *temp2, *temp4, *temp5, *temp6, *temp7; /* pointer-sized scratch: also briefly hold the loaded byte values before they become addresses */
+    int temp1, temp3; /* temp1: counter value being incremented; temp3: len & 3 */
+
+    __asm__ volatile (
+        "andi   %[temp3],   %[len],         3               \n\t" /* temp3 = bytes left over after groups of four */
+        PTR_ADDU "%[temp2], %[bap],         %[len]          \n\t" /* temp2 = bap + len: one past the last byte, moves downwards */
+        PTR_ADDU "%[temp4], %[bap],         %[temp3]        \n\t" /* temp4 = bap + (len & 3): where the unrolled loop stops */
+        "beq    %[temp2],   %[temp4],       4f              \n\t" /* no full group of four: skip the unrolled loop */
+        "1:                                                 \n\t" /* unrolled loop: four bytes per iteration */
+        "lbu    %[temp0],   -1(%[temp2])                    \n\t"
+        "lbu    %[temp5],   -2(%[temp2])                    \n\t"
+        "lbu    %[temp6],   -3(%[temp2])                    \n\t"
+        "sll    %[temp0],   %[temp0],       1               \n\t" /* scale the byte value by 2 (16-bit counters) */
+        PTR_ADDU "%[temp0], %[mant_cnt],    %[temp0]        \n\t" /* temp0 = address of that byte's counter */
+        "sll    %[temp5],   %[temp5],       1               \n\t"
+        PTR_ADDU "%[temp5], %[mant_cnt],    %[temp5]        \n\t"
+        "lhu    %[temp1],   0(%[temp0])                     \n\t" /* load / +1 / store: each counter is written back before the next one is loaded, */
+        "sll    %[temp6],   %[temp6],       1               \n\t"
+        PTR_ADDU "%[temp6], %[mant_cnt],    %[temp6]        \n\t"
+        "addiu  %[temp1],   %[temp1],       1               \n\t"
+        "sh     %[temp1],   0(%[temp0])                     \n\t" /* so equal byte values inside one group are still counted separately */
+        "lhu    %[temp1],   0(%[temp5])                     \n\t"
+        "lbu    %[temp7],   -4(%[temp2])                    \n\t" /* fourth byte of the group */
+        PTR_ADDIU "%[temp2],%[temp2],       -4              \n\t" /* step the cursor down by four bytes */
+        "addiu  %[temp1],   %[temp1],       1               \n\t"
+        "sh     %[temp1],   0(%[temp5])                     \n\t"
+        "lhu    %[temp1],   0(%[temp6])                     \n\t"
+        "sll    %[temp7],   %[temp7],       1               \n\t"
+        PTR_ADDU "%[temp7], %[mant_cnt],    %[temp7]        \n\t"
+        "addiu  %[temp1],   %[temp1],1                      \n\t"
+        "sh     %[temp1],   0(%[temp6])                     \n\t"
+        "lhu    %[temp1],   0(%[temp7])                     \n\t"
+        "addiu  %[temp1],   %[temp1],       1               \n\t"
+        "sh     %[temp1],   0(%[temp7])                     \n\t"
+        "bne    %[temp2],   %[temp4],       1b              \n\t" /* repeat until only the leftover bytes remain */
+        "4:                                                 \n\t"
+        "beqz   %[temp3],   2f                              \n\t" /* no leftover bytes: done */
+        "3:                                                 \n\t" /* leftover loop: one byte per iteration, temp3 counts down */
+        "addiu  %[temp3],   %[temp3],       -1              \n\t"
+        "lbu    %[temp0],   -1(%[temp2])                    \n\t"
+        PTR_ADDIU "%[temp2],%[temp2],       -1              \n\t"
+        "sll    %[temp0],   %[temp0],       1               \n\t"
+        PTR_ADDU "%[temp0], %[mant_cnt],    %[temp0]        \n\t"
+        "lhu    %[temp1],   0(%[temp0])                     \n\t"
+        "addiu  %[temp1],   %[temp1],       1               \n\t"
+        "sh     %[temp1],   0(%[temp0])                     \n\t"
+        "bgtz   %[temp3],   3b                              \n\t"
+        "2:                                                 \n\t"
+
+        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+          [temp4] "=&r" (temp4), [temp5] "=&r" (temp5),
+          [temp6] "=&r" (temp6), [temp7] "=&r" (temp7) /* all scratch registers are early-clobber outputs */
+        : [len] "r" (len), [bap] "r" (bap),
+          [mant_cnt] "r" (mant_cnt)
+        : "memory" /* counters in mant_cnt are modified through computed addresses */
+    );
+}
+#endif
+
+#if HAVE_MIPSFPU
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+static void float_to_fixed24_mips(int32_t *dst, const float *src, unsigned int len) /* Writes dst[i] = (int32_t)(src[i] * 2^24) for len values, eight per loop iteration, using the FPU */
+{
+    const float scale = 1 << 24; /* 2^24 */
+    float src0, src1, src2, src3, src4, src5, src6, src7; /* FP registers: hold the loaded, scaled and converted values */
+    int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; /* integer registers: hold the converted words for the stores */
+
+    do { /* the body runs before len is tested, so eight values are always converted at least once */
+        __asm__ volatile (
+            "lwc1       %[src0],    0(%[src])               \n\t" /* load src[0..7] into FP registers */
+            "lwc1       %[src1],    4(%[src])               \n\t"
+            "lwc1       %[src2],    8(%[src])               \n\t"
+            "lwc1       %[src3],    12(%[src])              \n\t"
+            "lwc1       %[src4],    16(%[src])              \n\t"
+            "lwc1       %[src5],    20(%[src])              \n\t"
+            "lwc1       %[src6],    24(%[src])              \n\t"
+            "lwc1       %[src7],    28(%[src])              \n\t"
+            "mul.s      %[src0],    %[src0],    %[scale]    \n\t" /* multiply each value by scale */
+            "mul.s      %[src1],    %[src1],    %[scale]    \n\t"
+            "mul.s      %[src2],    %[src2],    %[scale]    \n\t"
+            "mul.s      %[src3],    %[src3],    %[scale]    \n\t"
+            "mul.s      %[src4],    %[src4],    %[scale]    \n\t"
+            "mul.s      %[src5],    %[src5],    %[scale]    \n\t"
+            "mul.s      %[src6],    %[src6],    %[scale]    \n\t"
+            "mul.s      %[src7],    %[src7],    %[scale]    \n\t"
+            "cvt.w.s    %[src0],    %[src0]                 \n\t" /* convert single -> 32-bit integer word, in place in the FP register (presumably rounds per the current FPU mode - TODO confirm) */
+            "cvt.w.s    %[src1],    %[src1]                 \n\t"
+            "cvt.w.s    %[src2],    %[src2]                 \n\t"
+            "cvt.w.s    %[src3],    %[src3]                 \n\t"
+            "cvt.w.s    %[src4],    %[src4]                 \n\t"
+            "cvt.w.s    %[src5],    %[src5]                 \n\t"
+            "cvt.w.s    %[src6],    %[src6]                 \n\t"
+            "cvt.w.s    %[src7],    %[src7]                 \n\t"
+            "mfc1       %[temp0],   %[src0]                 \n\t" /* move the integer words from FP to general registers */
+            "mfc1       %[temp1],   %[src1]                 \n\t"
+            "mfc1       %[temp2],   %[src2]                 \n\t"
+            "mfc1       %[temp3],   %[src3]                 \n\t"
+            "mfc1       %[temp4],   %[src4]                 \n\t"
+            "mfc1       %[temp5],   %[src5]                 \n\t"
+            "mfc1       %[temp6],   %[src6]                 \n\t"
+            "mfc1       %[temp7],   %[src7]                 \n\t"
+            "sw         %[temp0],   0(%[dst])               \n\t" /* store to dst[0..7] */
+            "sw         %[temp1],   4(%[dst])               \n\t"
+            "sw         %[temp2],   8(%[dst])               \n\t"
+            "sw         %[temp3],   12(%[dst])              \n\t"
+            "sw         %[temp4],   16(%[dst])              \n\t"
+            "sw         %[temp5],   20(%[dst])              \n\t"
+            "sw         %[temp6],   24(%[dst])              \n\t"
+            "sw         %[temp7],   28(%[dst])              \n\t"
+
+            : [dst] "+r" (dst), [src] "+r" (src), /* the asm itself does not advance these; that happens in C below */
+              [src0] "=&f" (src0), [src1] "=&f" (src1),
+              [src2] "=&f" (src2), [src3] "=&f" (src3),
+              [src4] "=&f" (src4), [src5] "=&f" (src5),
+              [src6] "=&f" (src6), [src7] "=&f" (src7),
+              [temp0] "=r" (temp0), [temp1] "=r" (temp1),
+              [temp2] "=r" (temp2), [temp3] "=r" (temp3),
+              [temp4] "=r" (temp4), [temp5] "=r" (temp5),
+              [temp6] "=r" (temp6), [temp7] "=r" (temp7)
+            : [scale] "f" (scale)
+            : "memory" /* dst is written through a pointer */
+        );
+        src = src + 8; /* advance both cursors by the eight values just handled */
+        dst = dst + 8;
+        len -= 8; /* NOTE(review): len is unsigned, so the loop stops after exactly len values only when len is a non-zero multiple of 8 - confirm the caller contract */
+    } while (len > 0);
+}
+
+/**
+ * Downmix planar float channels in place with a coefficient matrix, written
+ * as a single MIPS FPU inline-assembly block.
+ *
+ * Four consecutive samples are handled per outer-loop pass. For each group
+ * the inner loop walks the first in_ch channel pointers of samples[] and
+ * accumulates sample * matrix[j][0] and, when out_ch == 2, additionally
+ * sample * matrix[j][1]. The sums are then stored to samples[0] (and to
+ * samples[1] when out_ch == 2). Any other out_ch value falls through both
+ * tests and leaves the data untouched.
+ *
+ * @param samples array of channel pointers; the first in_ch entries are read,
+ *                samples[0] (and samples[1] for out_ch == 2) are overwritten
+ * @param matrix  one coefficient pair per input channel
+ * @param out_ch  number of output channels; only 1 and 2 are acted upon
+ * @param in_ch   number of input channels walked by the inner loops
+ * @param len     number of samples per channel
+ *
+ * NOTE(review): both outer loops leave only when the byte offset i, which
+ * advances in steps of 16, becomes exactly equal to len * 4, and both inner
+ * loops execute their body before testing for the end. The code therefore
+ * relies on len being a positive multiple of 4 and on in_ch >= 1; neither
+ * is checked here.
+ */
+static void ac3_downmix_mips(float **samples, float (*matrix)[2],
+                          int out_ch, int in_ch, int len)
+{
+    int i, j, i1, i2, i3;
+    float v0, v1, v2, v3;
+    float v4, v5, v6, v7;
+    float samples0, samples1, samples2, samples3, matrix_j, matrix_j2;
+    float *samples_p, *samples_sw, *matrix_p, **samples_x, **samples_end;
+
+    /* noreorder is in effect for the whole block: the instruction written
+     * right after each branch sits in its delay slot and is executed whether
+     * or not the branch is taken. */
+    __asm__ volatile(
+        ".set   push                                                \n\t"
+        ".set   noreorder                                           \n\t"
+
+        "li     %[i1],          2                                   \n\t"   // i1 = 2, constant for the out_ch test below
+        "sll    %[len],         2                                   \n\t"   // len becomes a byte count (len * 4)
+        "move   %[i],           $zero                               \n\t"   // i = byte offset of the current sample group
+        "sll    %[j],           %[in_ch],             " PTRLOG "    \n\t"   // j = in_ch << PTRLOG, byte length of the pointer list
+
+        "bne    %[out_ch],      %[i1],                  3f          \n\t"   // if (out_ch == 2)
+        " li    %[i2],          1                                   \n\t"   // delay slot: i2 = 1 for the mono test at label 3
+
+        "2:                                                         \n\t"   // start of the for loop (for (i = 0; i < len; i+=4))
+        "move   %[matrix_p],    %[matrix]                           \n\t"
+        "move   %[samples_x],   %[samples]                          \n\t"
+        "mtc1   $zero,          %[v0]                               \n\t"   // v0..v3: sums for samples[0]
+        "mtc1   $zero,          %[v1]                               \n\t"
+        "mtc1   $zero,          %[v2]                               \n\t"
+        "mtc1   $zero,          %[v3]                               \n\t"
+        "mtc1   $zero,          %[v4]                               \n\t"   // v4..v7: sums for samples[1]
+        "mtc1   $zero,          %[v5]                               \n\t"
+        "mtc1   $zero,          %[v6]                               \n\t"
+        "mtc1   $zero,          %[v7]                               \n\t"
+        "addiu  %[i1],          %[i],                  4            \n\t"   // i1, i2, i3: byte offsets of the next three samples
+        "addiu  %[i2],          %[i],                  8            \n\t"
+        PTR_L " %[samples_p],   0(%[samples_x])                     \n\t"
+        "addiu  %[i3],          %[i],                  12           \n\t"
+        PTR_ADDU "%[samples_end],%[samples_x],         %[j]         \n\t"
+        "move   %[samples_sw],  %[samples_p]                        \n\t"   // samples_sw = samples[0], the store target
+
+        "1:                                                         \n\t"   // start of the inner for loop (for (j = 0; j < in_ch; j++))
+        "lwc1   %[matrix_j],    0(%[matrix_p])                      \n\t"
+        "lwc1   %[matrix_j2],   4(%[matrix_p])                      \n\t"
+        "lwxc1  %[samples0],    %[i](%[samples_p])                  \n\t"
+        "lwxc1  %[samples1],    %[i1](%[samples_p])                 \n\t"
+        "lwxc1  %[samples2],    %[i2](%[samples_p])                 \n\t"
+        "lwxc1  %[samples3],    %[i3](%[samples_p])                 \n\t"
+        PTR_ADDIU "%[matrix_p], 8                                   \n\t"
+        PTR_ADDIU "%[samples_x]," PTRSIZE "                         \n\t"
+        "madd.s %[v0],          %[v0],  %[samples0],    %[matrix_j] \n\t"
+        "madd.s %[v1],          %[v1],  %[samples1],    %[matrix_j] \n\t"
+        "madd.s %[v2],          %[v2],  %[samples2],    %[matrix_j] \n\t"
+        "madd.s %[v3],          %[v3],  %[samples3],    %[matrix_j] \n\t"
+        "madd.s %[v4],          %[v4],  %[samples0],    %[matrix_j2]\n\t"
+        "madd.s %[v5],          %[v5],  %[samples1],    %[matrix_j2]\n\t"
+        "madd.s %[v6],          %[v6],  %[samples2],    %[matrix_j2]\n\t"
+        "madd.s %[v7],          %[v7],  %[samples3],    %[matrix_j2]\n\t"
+        "bne    %[samples_x],   %[samples_end],         1b          \n\t"
+        PTR_L " %[samples_p],   0(%[samples_x])                     \n\t"   // delay slot: on loop exit this loads samples[in_ch]; the value is replaced before use
+
+        PTR_L " %[samples_p],  " PTRSIZE "(%[samples])              \n\t"   // samples_p = samples[1]
+        "swxc1  %[v0],          %[i](%[samples_sw])                 \n\t"
+        "swxc1  %[v1],          %[i1](%[samples_sw])                \n\t"
+        "swxc1  %[v2],          %[i2](%[samples_sw])                \n\t"
+        "swxc1  %[v3],          %[i3](%[samples_sw])                \n\t"
+        "swxc1  %[v4],          %[i](%[samples_p])                  \n\t"
+        "addiu  %[i],           16                                  \n\t"   // advance to the next group of four floats
+        "swxc1  %[v5],          %[i1](%[samples_p])                 \n\t"
+        "swxc1  %[v6],          %[i2](%[samples_p])                 \n\t"
+        "bne    %[i],           %[len],                 2b          \n\t"
+        " swxc1 %[v7],          %[i3](%[samples_p])                 \n\t"   // delay slot: last store of the group
+
+        "3:                                                         \n\t"
+        "bne    %[out_ch],      %[i2],                  6f          \n\t"   // if (out_ch == 1); after the stereo loop i2 holds a byte offset of the form 16k + 8, so the stereo path skips to 6
+        " nop                                                       \n\t"
+
+        "5:                                                         \n\t"   // start of the outer for loop (for (i = 0; i < len; i+=4))
+        "move   %[matrix_p],    %[matrix]                           \n\t"
+        "move   %[samples_x],   %[samples]                          \n\t"
+        "mtc1   $zero,          %[v0]                               \n\t"
+        "mtc1   $zero,          %[v1]                               \n\t"
+        "mtc1   $zero,          %[v2]                               \n\t"
+        "mtc1   $zero,          %[v3]                               \n\t"
+        "addiu  %[i1],          %[i],                  4            \n\t"
+        "addiu  %[i2],          %[i],                  8            \n\t"
+        PTR_L " %[samples_p],   0(%[samples_x])                     \n\t"
+        "addiu  %[i3],          %[i],                  12           \n\t"
+        PTR_ADDU "%[samples_end],%[samples_x],         %[j]         \n\t"
+        "move   %[samples_sw],  %[samples_p]                        \n\t"
+
+        "4:                                                         \n\t"   // start of the inner for loop (for (j = 0; j < in_ch; j++))
+        "lwc1   %[matrix_j],    0(%[matrix_p])                      \n\t"   // only the first coefficient of each pair is used here
+        "lwxc1  %[samples0],    %[i](%[samples_p])                  \n\t"
+        "lwxc1  %[samples1],    %[i1](%[samples_p])                 \n\t"
+        "lwxc1  %[samples2],    %[i2](%[samples_p])                 \n\t"
+        "lwxc1  %[samples3],    %[i3](%[samples_p])                 \n\t"
+        PTR_ADDIU "%[matrix_p], 8                                   \n\t"
+        PTR_ADDIU "%[samples_x]," PTRSIZE "                         \n\t"
+        "madd.s %[v0],          %[v0],  %[samples0],    %[matrix_j] \n\t"
+        "madd.s %[v1],          %[v1],  %[samples1],    %[matrix_j] \n\t"
+        "madd.s %[v2],          %[v2],  %[samples2],    %[matrix_j] \n\t"
+        "madd.s %[v3],          %[v3],  %[samples3],    %[matrix_j] \n\t"
+        "bne    %[samples_x],   %[samples_end],         4b          \n\t"
+        PTR_L " %[samples_p],   0(%[samples_x])                     \n\t"   // delay slot: on loop exit this loads samples[in_ch]; the value is not used
+
+        "swxc1  %[v0],          %[i](%[samples_sw])                 \n\t"
+        "addiu  %[i],           16                                  \n\t"
+        "swxc1  %[v1],          %[i1](%[samples_sw])                \n\t"
+        "swxc1  %[v2],          %[i2](%[samples_sw])                \n\t"
+        "bne    %[i],           %[len],                 5b          \n\t"
+        " swxc1 %[v3],          %[i3](%[samples_sw])                \n\t"   // delay slot: last store of the group
+        "6:                                                         \n\t"
+
+        ".set   pop"
+        :[samples_p]"=&r"(samples_p), [matrix_j]"=&f"(matrix_j), [matrix_j2]"=&f"(matrix_j2),
+         [samples0]"=&f"(samples0), [samples1]"=&f"(samples1),
+         [samples2]"=&f"(samples2), [samples3]"=&f"(samples3),
+         [v0]"=&f"(v0), [v1]"=&f"(v1), [v2]"=&f"(v2), [v3]"=&f"(v3),
+         [v4]"=&f"(v4), [v5]"=&f"(v5), [v6]"=&f"(v6), [v7]"=&f"(v7),
+         [samples_x]"=&r"(samples_x), [matrix_p]"=&r"(matrix_p),
+         [samples_end]"=&r"(samples_end), [samples_sw]"=&r"(samples_sw),
+         [i1]"=&r"(i1), [i2]"=&r"(i2), [i3]"=&r"(i3), [i]"=&r"(i),
+         [j]"=&r"(j), [len]"+r"(len)
+        :[samples]"r"(samples), [matrix]"r"(matrix),
+         [in_ch]"r"(in_ch), [out_ch]"r"(out_ch)
+        :"memory"
+    );
+}
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+
+/**
+ * Install the MIPS-specific AC-3 DSP routines into a DSP context.
+ *
+ * Which function pointers are replaced depends on the build configuration:
+ * the float routines are used when HAVE_MIPSFPU is set on a pre-R6 target,
+ * the bit-allocation routines when HAVE_MIPSDSP is set. Without
+ * HAVE_INLINE_ASM the context is left unchanged.
+ *
+ * @param c         context whose function pointers are overridden
+ * @param bit_exact not consulted by this implementation
+ */
+void ff_ac3dsp_init_mips(AC3DSPContext *c, int bit_exact)
+{
+#if HAVE_INLINE_ASM && HAVE_MIPSFPU && !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+    c->downmix          = ac3_downmix_mips;
+    c->float_to_fixed24 = float_to_fixed24_mips;
+#endif
+#if HAVE_INLINE_ASM && HAVE_MIPSDSP
+    c->update_bap_counts  = ac3_update_bap_counts_mips;
+    c->bit_alloc_calc_bap = ac3_bit_alloc_calc_bap_mips;
+#endif
+}
diff --git a/libavcodec/mips/acelp_filters_mips.c b/libavcodec/mips/acelp_filters_mips.c
new file mode 100644
index 0000000..478db85
--- /dev/null
+++ b/libavcodec/mips/acelp_filters_mips.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Nedeljko Babic (nbabic@mips.com)
+ *
+ * various filters for ACELP-based codecs optimized for MIPS
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/acelp_filters.c
+ */
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/acelp_filters.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+/**
+ * Floating-point fractional interpolation filter, MIPS FPU version.
+ *
+ * For every output index n the filter sums filter_length tap pairs. Tap i
+ * contributes
+ *     in[n + i]     * filter_coeffs[frac_pos + i * precision]
+ *   + in[n - 1 - i] * filter_coeffs[(precision - frac_pos) + i * precision]
+ * and the total is stored in out[n].
+ *
+ * @param out           receives length interpolated values
+ * @param in            input signal; samples on both sides of each index
+ *                      are read
+ * @param filter_coeffs interpolation filter coefficients
+ * @param precision     distance, in coefficients, between successive taps
+ * @param frac_pos      offset of the first forward coefficient
+ * @param filter_length number of tap pairs accumulated per output value
+ * @param length        number of output values to produce
+ */
+static void ff_acelp_interpolatef_mips(float *out, const float *in,
+                           const float *filter_coeffs, int precision,
+                           int frac_pos, int filter_length, int length)
+{
+    /* scratch values written by the loads inside the assembly */
+    float s_fwd, s_bwd, c_fwd, c_bwd;
+    /* distance in bytes between two successive taps on either side */
+    int tap_stride = precision * 4;
+    int pos;
+
+    for (pos = 0; pos < length; pos++) {
+        /*
+         * One running pointer per stream, so that the assembly only has to
+         * load, multiply-accumulate and step.
+         */
+        const float *fwd_in = in + pos;
+        const float *bwd_in = in + pos - 1;
+        const float *fwd_fc = filter_coeffs + frac_pos;
+        const float *bwd_fc = filter_coeffs + (precision - frac_pos);
+        float acc = 0;
+        int taps_left;
+
+        for (taps_left = filter_length; taps_left > 0; taps_left--) {
+            __asm__ volatile (
+                "lwc1   %[in_val_p],           0(%[p_in_p])                    \n\t"
+                "lwc1   %[fc_val_p],           0(%[p_filter_coeffs_p])         \n\t"
+                "lwc1   %[in_val_m],           0(%[p_in_m])                    \n\t"
+                "lwc1   %[fc_val_m],           0(%[p_filter_coeffs_m])         \n\t"
+                PTR_ADDIU "%[p_in_p],          %[p_in_p],              4       \n\t"
+                "madd.s %[v],%[v],             %[in_val_p],%[fc_val_p]         \n\t"
+                PTR_ADDIU "%[p_in_m],          %[p_in_m],              -4      \n\t"
+                PTR_ADDU "%[p_filter_coeffs_p],%[p_filter_coeffs_p],   %[prec] \n\t"
+                PTR_ADDU "%[p_filter_coeffs_m],%[p_filter_coeffs_m],   %[prec] \n\t"
+                "madd.s %[v],%[v],%[in_val_m], %[fc_val_m]                     \n\t"
+
+                : [v] "+&f" (acc),[p_in_p] "+r" (fwd_in), [p_in_m] "+r" (bwd_in),
+                  [p_filter_coeffs_p] "+r" (fwd_fc),
+                  [in_val_p] "=&f" (s_fwd), [in_val_m] "=&f" (s_bwd),
+                  [fc_val_p] "=&f" (c_fwd), [fc_val_m] "=&f" (c_bwd),
+                  [p_filter_coeffs_m] "+r" (bwd_fc)
+                : [prec] "r" (tap_stride)
+                : "memory"
+            );
+        }
+        out[pos] = acc;
+    }
+}
+
+/**
+ * Apply a second-order pole/zero filter to n samples, MIPS FPU version.
+ *
+ * For every sample the assembly evaluates
+ *     tmp    = gain * in[i] - pole_coeffs[0] * mem[0] - pole_coeffs[1] * mem[1]
+ *     out[i] = tmp + zero_coeffs[0] * mem[0] + zero_coeffs[1] * mem[1]
+ * and then shifts the filter state (mem[1] = mem[0], mem[0] = tmp). The
+ * state is kept in registers during the loop and written back to mem[] once
+ * after the last sample.
+ *
+ * @param out         receives n filtered samples
+ * @param in          n input samples
+ * @param zero_coeffs the two coefficients applied to the state on output
+ * @param pole_coeffs the two feedback coefficients
+ * @param gain        factor applied to every input sample
+ * @param mem         two-element filter state, read on entry and updated
+ *                    on exit
+ * @param n           number of samples; n <= 0 jumps straight to the end
+ *                    label and leaves out[] and mem[] unwritten
+ *
+ * NOTE(review): each pass of the loop produces eight samples and the loop
+ * is left only when n, decremented by 8, reaches exactly zero. A positive n
+ * that is not a multiple of 8 is therefore not handled by this code.
+ * NOTE(review): $f10 is listed as clobbered although the template does not
+ * reference it, and gain is declared read-write although it is only read.
+ */
+static void ff_acelp_apply_order_2_transfer_function_mips(float *out, const float *in,
+                                              const float zero_coeffs[2],
+                                              const float pole_coeffs[2],
+                                              float gain, float mem[2], int n)
+{
+    /**
+    * The loop body is unrolled eight times.
+    *
+    * Register roles: $f0/$f1 hold mem[0]/mem[1] on loop entry and receive
+    * the two newest state values at the end of every pass; $f2/$f3 hold the
+    * pole coefficients, $f4/$f5 the zero coefficients. The remaining
+    * registers are temporaries for inputs, products and intermediate state.
+    */
+
+    __asm__ volatile (
+        "lwc1   $f0,    0(%[mem])                                              \n\t"
+        "blez   %[n],   ff_acelp_apply_order_2_transfer_function_end%=         \n\t"
+        "lwc1   $f1,    4(%[mem])                                              \n\t"
+        "lwc1   $f2,    0(%[pole_coeffs])                                      \n\t"
+        "lwc1   $f3,    4(%[pole_coeffs])                                      \n\t"
+        "lwc1   $f4,    0(%[zero_coeffs])                                      \n\t"
+        "lwc1   $f5,    4(%[zero_coeffs])                                      \n\t"
+
+        "ff_acelp_apply_order_2_transfer_function_madd%=:                      \n\t"
+
+        "lwc1   $f6,    0(%[in])                                               \n\t"
+        "mul.s  $f9,    $f3,      $f1                                          \n\t"
+        "mul.s  $f7,    $f2,      $f0                                          \n\t"
+        "msub.s $f7,    $f7,      %[gain], $f6                                 \n\t"
+        "sub.s  $f7,    $f7,      $f9                                          \n\t"
+        "madd.s $f8,    $f7,      $f4,     $f0                                 \n\t"
+        "madd.s $f8,    $f8,      $f5,     $f1                                 \n\t"
+        "lwc1   $f11,   4(%[in])                                               \n\t"
+        "mul.s  $f12,   $f3,      $f0                                          \n\t"
+        "mul.s  $f13,   $f2,      $f7                                          \n\t"
+        "msub.s $f13,   $f13,     %[gain], $f11                                \n\t"
+        "sub.s  $f13,   $f13,     $f12                                         \n\t"
+        "madd.s $f14,   $f13,     $f4,     $f7                                 \n\t"
+        "madd.s $f14,   $f14,     $f5,     $f0                                 \n\t"
+        "swc1   $f8,    0(%[out])                                              \n\t"
+        "lwc1   $f6,    8(%[in])                                               \n\t"
+        "mul.s  $f9,    $f3,      $f7                                          \n\t"
+        "mul.s  $f15,   $f2,      $f13                                         \n\t"
+        "msub.s $f15,   $f15,     %[gain], $f6                                 \n\t"
+        "sub.s  $f15,   $f15,     $f9                                          \n\t"
+        "madd.s $f8,    $f15,     $f4,     $f13                                \n\t"
+        "madd.s $f8,    $f8,      $f5,     $f7                                 \n\t"
+        "swc1   $f14,   4(%[out])                                              \n\t"
+        "lwc1   $f11,   12(%[in])                                              \n\t"
+        "mul.s  $f12,   $f3,      $f13                                         \n\t"
+        "mul.s  $f16,   $f2,      $f15                                         \n\t"
+        "msub.s $f16,   $f16,     %[gain], $f11                                \n\t"
+        "sub.s  $f16,   $f16,     $f12                                         \n\t"
+        "madd.s $f14,   $f16,     $f4,     $f15                                \n\t"
+        "madd.s $f14,   $f14,     $f5,     $f13                                \n\t"
+        "swc1   $f8,    8(%[out])                                              \n\t"
+        "lwc1   $f6,    16(%[in])                                              \n\t"
+        "mul.s  $f9,    $f3,      $f15                                         \n\t"
+        "mul.s  $f7,    $f2,      $f16                                         \n\t"
+        "msub.s $f7,    $f7,      %[gain], $f6                                 \n\t"
+        "sub.s  $f7,    $f7,      $f9                                          \n\t"
+        "madd.s $f8,    $f7,      $f4,     $f16                                \n\t"
+        "madd.s $f8,    $f8,      $f5,     $f15                                \n\t"
+        "swc1   $f14,   12(%[out])                                             \n\t"
+        "lwc1   $f11,   20(%[in])                                              \n\t"
+        "mul.s  $f12,   $f3,      $f16                                         \n\t"
+        "mul.s  $f13,   $f2,      $f7                                          \n\t"
+        "msub.s $f13,   $f13,     %[gain], $f11                                \n\t"
+        "sub.s  $f13,   $f13,     $f12                                         \n\t"
+        "madd.s $f14,   $f13,     $f4,     $f7                                 \n\t"
+        "madd.s $f14,   $f14,     $f5,     $f16                                \n\t"
+        "swc1   $f8,    16(%[out])                                             \n\t"
+        "lwc1   $f6,    24(%[in])                                              \n\t"
+        "mul.s  $f9,    $f3,      $f7                                          \n\t"
+        "mul.s  $f15,   $f2,      $f13                                         \n\t"
+        "msub.s $f15,   $f15,     %[gain], $f6                                 \n\t"
+        "sub.s  $f1,    $f15,     $f9                                          \n\t"
+        "madd.s $f8,    $f1,      $f4,     $f13                                \n\t"
+        "madd.s $f8,    $f8,      $f5,     $f7                                 \n\t"
+        "swc1   $f14,   20(%[out])                                             \n\t"
+        "lwc1   $f11,   28(%[in])                                              \n\t"
+        "mul.s  $f12,   $f3,      $f13                                         \n\t"
+        "mul.s  $f16,   $f2,      $f1                                          \n\t"
+        "msub.s $f16,   $f16,     %[gain], $f11                                \n\t"
+        "sub.s  $f0,    $f16,     $f12                                         \n\t"
+        "madd.s $f14,   $f0,      $f4,     $f1                                 \n\t"
+        "madd.s $f14,   $f14,     $f5,     $f13                                \n\t"
+        "swc1   $f8,    24(%[out])                                             \n\t"
+        PTR_ADDIU "%[out], 32                                                  \n\t"
+        PTR_ADDIU "%[in],  32                                                  \n\t"
+        "addiu  %[n],   -8                                                     \n\t"
+        "swc1   $f14,   -4(%[out])                                             \n\t"
+        "bnez   %[n],   ff_acelp_apply_order_2_transfer_function_madd%=        \n\t"
+        "swc1   $f1,    4(%[mem])                                              \n\t"
+        "swc1   $f0,    0(%[mem])                                              \n\t"
+
+        "ff_acelp_apply_order_2_transfer_function_end%=:                       \n\t"
+
+         : [out] "+r" (out),
+           [in] "+r" (in), [gain] "+f" (gain),
+           [n] "+r" (n), [mem] "+r" (mem)
+         : [zero_coeffs] "r" (zero_coeffs),
+           [pole_coeffs] "r" (pole_coeffs)
+         : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5",
+           "$f6", "$f7",  "$f8", "$f9", "$f10", "$f11",
+           "$f12", "$f13", "$f14", "$f15", "$f16", "memory"
+    );
+}
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM */
+
+/**
+ * Hook the MIPS assembly ACELP filter routines into a filter context.
+ *
+ * The function pointers are replaced only when inline assembly is available
+ * and the target is not a MIPS32R6/MIPS64R6 build; otherwise the context is
+ * left as the caller set it up.
+ *
+ * @param c context whose function pointers are overridden
+ */
+void ff_acelp_filter_init_mips(ACELPFContext *c)
+{
+#if HAVE_INLINE_ASM && !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+    c->acelp_apply_order_2_transfer_function   = ff_acelp_apply_order_2_transfer_function_mips;
+    c->acelp_interpolatef                      = ff_acelp_interpolatef_mips;
+#endif
+}
diff --git a/libavcodec/mips/acelp_vectors_mips.c b/libavcodec/mips/acelp_vectors_mips.c
new file mode 100644
index 0000000..0ab2b6a
--- /dev/null
+++ b/libavcodec/mips/acelp_vectors_mips.c
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Nedeljko Babic (nbabic@mips.com)
+ *
+ * adaptive and fixed codebook vector operations for ACELP-based codecs
+ * optimized for MIPS
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/acelp_vectors.c
+ */
+#include "config.h"
+#include "libavcodec/acelp_vectors.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+/**
+ * Weighted sum of two float vectors, MIPS FPU version:
+ *     out[i] = weight_coeff_a * in_a[i] + weight_coeff_b * in_b[i]
+ *
+ * The assembly loop produces two elements per pass; a trailing element of
+ * an odd-length vector is computed in C afterwards.
+ *
+ * @param out            receives length results
+ * @param in_a           first input vector
+ * @param in_b           second input vector
+ * @param weight_coeff_a weight applied to in_a
+ * @param weight_coeff_b weight applied to in_b
+ * @param length         number of elements; values <= 0 do nothing
+ */
+static void ff_weighted_vector_sumf_mips(
+                  float *out, const float *in_a, const float *in_b,
+                  float weight_coeff_a, float weight_coeff_b, int length)
+{
+    /*
+     * The loop below advances in_a by two floats per pass and leaves only
+     * when in_a is exactly equal to a_end. It is therefore given the even
+     * part of length, so that an odd length cannot step over the end marker.
+     */
+    const int even_len = length > 0 ? (length & ~1) : 0;
+    const float *a_end = in_a + even_len;
+
+    /* loop unrolled two times; %[length] is bound to even_len */
+    __asm__ volatile (
+        "blez   %[length], ff_weighted_vector_sumf_end%=                     \n\t"
+
+        "ff_weighted_vector_sumf_madd%=:                                     \n\t"
+        "lwc1   $f0,       0(%[in_a])                                        \n\t"
+        "lwc1   $f3,       4(%[in_a])                                        \n\t"
+        "lwc1   $f1,       0(%[in_b])                                        \n\t"
+        "lwc1   $f4,       4(%[in_b])                                        \n\t"
+        "mul.s  $f2,       %[weight_coeff_a], $f0                            \n\t"
+        "mul.s  $f5,       %[weight_coeff_a], $f3                            \n\t"
+        "madd.s $f2,       $f2,               %[weight_coeff_b], $f1         \n\t"
+        "madd.s $f5,       $f5,               %[weight_coeff_b], $f4         \n\t"
+        PTR_ADDIU "%[in_a],8                                                 \n\t"
+        PTR_ADDIU "%[in_b],8                                                 \n\t"
+        "swc1   $f2,       0(%[out])                                         \n\t"
+        "swc1   $f5,       4(%[out])                                         \n\t"
+        PTR_ADDIU "%[out], 8                                                 \n\t"
+        "bne   %[in_a],    %[a_end],          ff_weighted_vector_sumf_madd%= \n\t"
+
+        "ff_weighted_vector_sumf_end%=:                                      \n\t"
+
+        : [out] "+r" (out), [in_a] "+r" (in_a),   [in_b] "+r" (in_b)
+        : [weight_coeff_a] "f" (weight_coeff_a),
+          [weight_coeff_b] "f" (weight_coeff_b),
+          [length] "r" (even_len), [a_end]"r"(a_end)
+        : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "memory"
+    );
+
+    /*
+     * out, in_a and in_b are read-write operands, so after the assembly they
+     * point just past the processed pairs, i.e. at the leftover element.
+     */
+    if (length > 0 && (length & 1))
+        *out = weight_coeff_a * *in_a + weight_coeff_b * *in_b;
+}
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM */
+
+/**
+ * Hook the MIPS assembly ACELP vector routine into a vector context.
+ *
+ * The weighted_vector_sumf pointer is replaced only when inline assembly is
+ * available and the target is not a MIPS32R6/MIPS64R6 build.
+ *
+ * @param c context whose function pointer is overridden
+ */
+void ff_acelp_vectors_init_mips(ACELPVContext *c)
+{
+#if HAVE_INLINE_ASM && !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+    c->weighted_vector_sumf = ff_weighted_vector_sumf_mips;
+#endif
+}
diff --git a/libavcodec/mips/amrwbdec_mips.c b/libavcodec/mips/amrwbdec_mips.c
new file mode 100644
index 0000000..5dc0543
--- /dev/null
+++ b/libavcodec/mips/amrwbdec_mips.c
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Nedeljko Babic (nbabic@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/amrwbdec.c
+ */
+#include "libavutil/avutil.h"
+#include "libavcodec/amrwbdata.h"
+#include "amrwbdec_mips.h"
+
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+void ff_hb_fir_filter_mips(float *out, const float fir_coef[HB_FIR_SIZE + 1],
+                          float mem[HB_FIR_SIZE], const float *in)
+{
+    int i;
+    float data[AMRWB_SFR_SIZE_16k + HB_FIR_SIZE]; // past and current samples
+    /* data = [HB_FIR_SIZE samples saved in mem | AMRWB_SFR_SIZE_16k new samples from in] */
+    memcpy(data, mem, HB_FIR_SIZE * sizeof(float));
+    memcpy(data + HB_FIR_SIZE, in, AMRWB_SFR_SIZE_16k * sizeof(float));
+    /* out[i] = sum of fir_coef[k] * data[i + k] for k = 0..30 (byte offsets 0..120) */
+    for (i = 0; i < AMRWB_SFR_SIZE_16k; i++) {
+        float output;
+        float * p_data = (data+i);
+
+        /**
+        * 31 taps fully unrolled (presumably HB_FIR_SIZE + 1, TODO confirm);
+        * loads are interleaved with the madd.s ops to minimize pipeline stalls
+        */
+        __asm__ volatile(
+            "mtc1       $zero,     %[output]                      \n\t" /* accumulator starts at zero */
+            "lwc1       $f0,       0(%[p_data])                   \n\t"
+            "lwc1       $f1,       0(%[fir_coef])                 \n\t"
+            "lwc1       $f2,       4(%[p_data])                   \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f3,       4(%[fir_coef])                 \n\t"
+            "lwc1       $f4,       8(%[p_data])                   \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+            "lwc1       $f5,       8(%[fir_coef])                 \n\t"
+
+            "lwc1       $f0,       12(%[p_data])                  \n\t"
+            "lwc1       $f1,       12(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+            "lwc1       $f2,       16(%[p_data])                  \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f3,       16(%[fir_coef])                \n\t"
+            "lwc1       $f4,       20(%[p_data])                  \n\t"
+            "lwc1       $f5,       20(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+
+            "lwc1       $f0,       24(%[p_data])                  \n\t"
+            "lwc1       $f1,       24(%[fir_coef])                \n\t"
+            "lwc1       $f2,       28(%[p_data])                  \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+            "lwc1       $f3,       28(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f4,       32(%[p_data])                  \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+            "lwc1       $f5,       32(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+
+            "lwc1       $f0,       36(%[p_data])                  \n\t"
+            "lwc1       $f1,       36(%[fir_coef])                \n\t"
+            "lwc1       $f2,       40(%[p_data])                  \n\t"
+            "lwc1       $f3,       40(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f4,       44(%[p_data])                  \n\t"
+            "lwc1       $f5,       44(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+
+            "lwc1       $f0,       48(%[p_data])                  \n\t"
+            "lwc1       $f1,       48(%[fir_coef])                \n\t"
+            "lwc1       $f2,       52(%[p_data])                  \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+            "lwc1       $f3,       52(%[fir_coef])                \n\t"
+            "lwc1       $f4,       56(%[p_data])                  \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f5,       56(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+
+            "lwc1       $f0,       60(%[p_data])                  \n\t"
+            "lwc1       $f1,       60(%[fir_coef])                \n\t"
+            "lwc1       $f2,       64(%[p_data])                  \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+            "lwc1       $f3,       64(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f4,       68(%[p_data])                  \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+            "lwc1       $f5,       68(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+
+            "lwc1       $f0,       72(%[p_data])                  \n\t"
+            "lwc1       $f1,       72(%[fir_coef])                \n\t"
+            "lwc1       $f2,       76(%[p_data])                  \n\t"
+            "lwc1       $f3,       76(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f4,       80(%[p_data])                  \n\t"
+            "lwc1       $f5,       80(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+
+            "lwc1       $f0,       84(%[p_data])                  \n\t"
+            "lwc1       $f1,       84(%[fir_coef])                \n\t"
+            "lwc1       $f2,       88(%[p_data])                  \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+            "lwc1       $f3,       88(%[fir_coef])                \n\t"
+            "lwc1       $f4,       92(%[p_data])                  \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f5,       92(%[fir_coef])                \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+
+            "lwc1       $f0,       96(%[p_data])                  \n\t"
+            "lwc1       $f1,       96(%[fir_coef])                \n\t"
+            "lwc1       $f2,       100(%[p_data])                 \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+            "lwc1       $f3,       100(%[fir_coef])               \n\t"
+            "lwc1       $f4,       104(%[p_data])                 \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f5,       104(%[fir_coef])               \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+
+            "lwc1       $f0,       108(%[p_data])                 \n\t"
+            "lwc1       $f1,       108(%[fir_coef])               \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+            "lwc1       $f2,       112(%[p_data])                 \n\t"
+            "lwc1       $f3,       112(%[fir_coef])               \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+            "lwc1       $f4,       116(%[p_data])                 \n\t"
+            "lwc1       $f5,       116(%[fir_coef])               \n\t"
+            "lwc1       $f0,       120(%[p_data])                 \n\t"
+            "madd.s     %[output], %[output],       $f2, $f3      \n\t"
+            "lwc1       $f1,       120(%[fir_coef])               \n\t"
+            "madd.s     %[output], %[output],       $f4, $f5      \n\t"
+            "madd.s     %[output], %[output],       $f0, $f1      \n\t"
+
+            : [output]"=&f"(output)
+            : [fir_coef]"r"(fir_coef), [p_data]"r"(p_data)
+            : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "memory"
+        );
+        out[i] = output;
+    }
+    memcpy(mem, data + AMRWB_SFR_SIZE_16k, HB_FIR_SIZE * sizeof(float)); // save the last HB_FIR_SIZE samples back into mem
+}
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM */
diff --git a/libavcodec/mips/amrwbdec_mips.h b/libavcodec/mips/amrwbdec_mips.h
new file mode 100644
index 0000000..a9f66fe
--- /dev/null
+++ b/libavcodec/mips/amrwbdec_mips.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Nedeljko Babic (nbabic@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/amrwbdec.c
+ */
+#ifndef AVCODEC_MIPS_AMRWBDEC_MIPS_H
+#define AVCODEC_MIPS_AMRWBDEC_MIPS_H
+#include "config.h"
+
+#if HAVE_MIPSFPU && HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+void ff_hb_fir_filter_mips(float *out, const float fir_coef[],
+                          float mem[], const float *in);
+#define hb_fir_filter ff_hb_fir_filter_mips /* routes hb_fir_filter to the MIPS version */
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_MIPSFPU && HAVE_INLINE_ASM */
+
+#endif /* AVCODEC_MIPS_AMRWBDEC_MIPS_H  */
diff --git a/libavcodec/mips/blockdsp_init_mips.c b/libavcodec/mips/blockdsp_init_mips.c
new file mode 100644
index 0000000..30ae95f
--- /dev/null
+++ b/libavcodec/mips/blockdsp_init_mips.c
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (parag.salasakar@imgtec.com)
+ *                    Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "blockdsp_mips.h"
+
+#if HAVE_MSA
+static av_cold void blockdsp_init_msa(BlockDSPContext *c)
+{
+    /* Install the MSA block fill (16- and 8-wide) and clear routines. */
+    c->fill_block_tab[0] = ff_fill_block16_msa;
+    c->fill_block_tab[1] = ff_fill_block8_msa;
+    c->clear_blocks      = ff_clear_blocks_msa;
+    c->clear_block       = ff_clear_block_msa;
+}
+#endif  // #if HAVE_MSA
+
+#if HAVE_MMI
+static av_cold void blockdsp_init_mmi(BlockDSPContext *c)
+{
+    /* Install the Loongson MMI block fill (16- and 8-wide) and clear routines. */
+    c->fill_block_tab[0] = ff_fill_block16_mmi;
+    c->fill_block_tab[1] = ff_fill_block8_mmi;
+    c->clear_blocks      = ff_clear_blocks_mmi;
+    c->clear_block       = ff_clear_block_mmi;
+}
+#endif /* HAVE_MMI */
+
+void ff_blockdsp_init_mips(BlockDSPContext *c)
+{
+#if HAVE_MSA
+    blockdsp_init_msa(c);
+#endif /* HAVE_MSA */
+#if HAVE_MMI
+    blockdsp_init_mmi(c); /* runs last: replaces the MSA pointers when both are enabled */
+#endif /* HAVE_MMI */
+}
diff --git a/libavcodec/mips/blockdsp_mips.h b/libavcodec/mips/blockdsp_mips.h
new file mode 100644
index 0000000..9559d40
--- /dev/null
+++ b/libavcodec/mips/blockdsp_mips.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (parag.salasakar@imgtec.com)
+ *                    Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_BLOCKDSP_MIPS_H
+#define AVCODEC_MIPS_BLOCKDSP_MIPS_H
+
+#include "../mpegvideo.h"
+
+void ff_fill_block16_msa(uint8_t *src, uint8_t val, int stride, int height);
+void ff_fill_block8_msa(uint8_t *src, uint8_t val, int stride, int height);
+void ff_clear_block_msa(int16_t *block);
+void ff_clear_blocks_msa(int16_t *block);
+/* Loongson MMI implementations; parameter types match the MSA set above. */
+void ff_fill_block16_mmi(uint8_t *block, uint8_t value, int line_size, int h);
+void ff_fill_block8_mmi(uint8_t *block, uint8_t value, int line_size, int h);
+void ff_clear_block_mmi(int16_t *block);
+void ff_clear_blocks_mmi(int16_t *block);
+
+#endif  // #ifndef AVCODEC_MIPS_BLOCKDSP_MIPS_H
diff --git a/libavcodec/mips/blockdsp_mmi.c b/libavcodec/mips/blockdsp_mmi.c
new file mode 100644
index 0000000..6eb2bd7
--- /dev/null
+++ b/libavcodec/mips/blockdsp_mmi.c
@@ -0,0 +1,158 @@
+/*
+ * Loongson SIMD optimized blockdsp
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "blockdsp_mips.h"
+#include "libavutil/mips/asmdefs.h"
+
+void ff_fill_block16_mmi(uint8_t *block, uint8_t value, int line_size, int h)
+{
+    double ftmp[1];
+    /* Replicate value into ftmp0, then store it twice (16 bytes) per row. */
+    __asm__ volatile (
+        "mtc1       %[value],   %[ftmp0]                                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "1:                                                             \n\t" /* h is tested after the row is written: needs h >= 1 */
+        "gssdlc1    %[ftmp0],   0x07(%[block])                          \n\t"
+        "gssdrc1    %[ftmp0],   0x00(%[block])                          \n\t"
+        PTR_ADDI    "%[h],      %[h],           -0x01                   \n\t"
+        "gssdlc1    %[ftmp0],   0x0f(%[block])                          \n\t"
+        "gssdrc1    %[ftmp0],   0x08(%[block])                          \n\t"
+        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [block]"+&r"(block),              [h]"+&r"(h),
+          [ftmp0]"=&f"(ftmp[0])
+        : [value]"r"(value),                [line_size]"r"((mips_reg)line_size)
+        : "memory"
+    );
+}
+
+void ff_fill_block8_mmi(uint8_t *block, uint8_t value, int line_size, int h)
+{
+    double ftmp0;
+    /* Replicate value into ftmp0, then store it once (8 bytes) per row. */
+    __asm__ volatile (
+        "mtc1       %[value],   %[ftmp0]                                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "1:                                                             \n\t" /* h is tested after the row is written: needs h >= 1 */
+        "gssdlc1    %[ftmp0],   0x07(%[block])                          \n\t"
+        "gssdrc1    %[ftmp0],   0x00(%[block])                          \n\t"
+        PTR_ADDI   "%[h],       %[h],           -0x01                   \n\t"
+        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [block]"+&r"(block),              [h]"+&r"(h),
+          [ftmp0]"=&f"(ftmp0)
+        : [value]"r"(value),                [line_size]"r"((mips_reg)line_size)
+        : "memory"
+    );
+}
+
+void ff_clear_block_mmi(int16_t *block)
+{
+    double ftmp[2];
+    /* Zero ftmp0/ftmp1 (xor with self), then 8 stores at 0x10 steps clear 128 bytes. */
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "xor        %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x00(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x10(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x20(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x30(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x40(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x50(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x60(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x70(%[block])          \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1])
+        : [block]"r"(block)
+        : "memory"
+    );
+}
+
+void ff_clear_blocks_mmi(int16_t *block)
+{
+    double ftmp[2];
+    /* Zero ftmp0/ftmp1, then 48 stores at 0x10 steps clear 0x300 = 768 bytes (6 x 128). */
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "xor        %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x00(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x10(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x20(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x30(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x40(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x50(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x60(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x70(%[block])          \n\t"
+        /* second 128-byte group: 0x080..0x0ff */
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x80(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x90(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0xa0(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0xb0(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0xc0(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0xd0(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0xe0(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0xf0(%[block])          \n\t"
+        /* third 128-byte group: 0x100..0x17f */
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x100(%[block])         \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x110(%[block])         \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x120(%[block])         \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x130(%[block])         \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x140(%[block])         \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x150(%[block])         \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x160(%[block])         \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x170(%[block])         \n\t"
+        /* fourth 128-byte group: 0x180..0x1ff */
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x180(%[block])         \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x190(%[block])         \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x1a0(%[block])         \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x1b0(%[block])         \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x1c0(%[block])         \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x1d0(%[block])         \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x1e0(%[block])         \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x1f0(%[block])         \n\t"
+        /* fifth 128-byte group: 0x200..0x27f */
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x200(%[block])         \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x210(%[block])         \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x220(%[block])         \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x230(%[block])         \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x240(%[block])         \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x250(%[block])         \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x260(%[block])         \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x270(%[block])         \n\t"
+        /* sixth 128-byte group: 0x280..0x2ff */
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x280(%[block])         \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x290(%[block])         \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x2a0(%[block])         \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x2b0(%[block])         \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x2c0(%[block])         \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x2d0(%[block])         \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x2e0(%[block])         \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp1],       0x2f0(%[block])         \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1])
+        : [block]"r"((mips_reg)block)
+        : "memory"
+    );
+}
diff --git a/libavcodec/mips/blockdsp_msa.c b/libavcodec/mips/blockdsp_msa.c
new file mode 100644
index 0000000..32ac858
--- /dev/null
+++ b/libavcodec/mips/blockdsp_msa.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (parag.salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "blockdsp_mips.h"
+
+static void copy_8bit_value_width8_msa(uint8_t *src, uint8_t val,
+                                       int32_t src_stride, int32_t height)
+{
+    /* Fill a vector with val and take 64-bit element 0: one 8-byte row. */
+    v16u8 splat = (v16u8) __msa_fill_b(val);
+    uint64_t row = __msa_copy_u_d((v2i64) splat, 0);
+    int32_t groups = height >> 2;
+
+    /* Four rows are stored per pass, so height is effectively rounded
+     * down to a multiple of 4. */
+    while (groups--) {
+        SD4(row, row, row, row, src, src_stride);
+        src += 4 * src_stride;
+    }
+}
+
+static void copy_8bit_value_width16_msa(uint8_t *src, uint8_t val,
+                                        int32_t src_stride, int32_t height)
+{
+    /* A vector filled with val is one complete 16-byte row. */
+    v16u8 row = (v16u8) __msa_fill_b(val);
+    int32_t groups = height >> 3;
+
+    /* Eight rows per pass: height is rounded down to a multiple of 8. */
+    while (groups--) {
+        ST_UB8(row, row, row, row, row, row, row, row, src, src_stride);
+        src += 8 * src_stride;
+    }
+}
+
+static void memset_zero_16width_msa(uint8_t *src, int32_t stride,
+                                    int32_t height)
+{
+    int32_t cnt; /* was int8_t: height / 2 above 127 would not fit */
+    v16u8 zero = { 0 };
+    /* Clear two 16-byte rows per pass; an odd height drops the last row. */
+    for (cnt = (height / 2); cnt--;) {
+        ST_UB(zero, src);
+        src += stride;
+        ST_UB(zero, src);
+        src += stride;
+    }
+}
+
+void ff_fill_block16_msa(uint8_t *src, uint8_t val, int stride, int height)
+{
+    copy_8bit_value_width16_msa(src, val, stride, height); /* 16-byte rows, eight per pass */
+}
+/* 8-byte-wide fill; the helper writes rows in groups of four. */
+void ff_fill_block8_msa(uint8_t *src, uint8_t val, int stride, int height)
+{
+    copy_8bit_value_width8_msa(src, val, stride, height);
+}
+/* Zero one block: 8 rows of 16 bytes = 128 bytes (64 int16_t). */
+void ff_clear_block_msa(int16_t *block)
+{
+    memset_zero_16width_msa((uint8_t *) block, 16, 8);
+}
+/* Zero six consecutive blocks: 48 rows of 16 bytes = 768 bytes. */
+void ff_clear_blocks_msa(int16_t *block)
+{
+    memset_zero_16width_msa((uint8_t *) block, 16, 8 * 6);
+}
diff --git a/libavcodec/mips/celp_filters_mips.c b/libavcodec/mips/celp_filters_mips.c
new file mode 100644
index 0000000..926f1cb
--- /dev/null
+++ b/libavcodec/mips/celp_filters_mips.c
@@ -0,0 +1,293 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Nedeljko Babic (nbabic@mips.com)
+ *
+ * various filters for CELP-based codecs optimized for MIPS
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/celp_filters.c
+ */
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/common.h"
+#include "libavcodec/celp_filters.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+/**
+ * LP synthesis filter, MIPS FPU inline-assembly version.
+ *
+ * As the scalar tail loop at the end shows, each output is
+ *   out[n] = in[n] - sum(filter_coeffs[i-1] * out[n-i]), i = 1..filter_length
+ * The main loop produces four outputs per iteration; the remaining
+ * buffer_length % 4 samples are handled one at a time.
+ *
+ * nmsub.s d, r, s, t computes d = r - s * t.
+ *
+ * @param out           output buffer; previously written samples before
+ *                      out[0] (out[-1], out[-2], ...) are read as history
+ * @param filter_coeffs filter coefficients
+ * @param in            input samples
+ * @param buffer_length number of samples to produce
+ * @param filter_length number of filter taps
+ *
+ * NOTE(review): the unrolled path always uses filter_coeffs[0..3] and then
+ * consumes coefficients in pairs, so it appears to assume filter_length is
+ * even and at least 4 -- confirm against callers.
+ */
+static void ff_celp_lp_synthesis_filterf_mips(float *out,
+                                  const float *filter_coeffs,
+                                  const float* in, int buffer_length,
+                                  int filter_length)
+{
+    int i,n;
+
+    float out0, out1, out2, out3;
+    float old_out0, old_out1, old_out2, old_out3;
+    float a,b,c;
+    const float *p_filter_coeffs;
+    float *p_out;
+
+    /* Combined coefficients used by the last asm block of the main loop to
+     * fold out0..out2 of the current group into out1..out3:
+     *   a = fc[0]
+     *   b = fc[1] - fc[0] * fc[0]
+     *   c = fc[2] - fc[1] * fc[0] - fc[0] * b */
+    a = filter_coeffs[0];
+    b = filter_coeffs[1];
+    c = filter_coeffs[2];
+    b -= filter_coeffs[0] * filter_coeffs[0];
+    c -= filter_coeffs[1] * filter_coeffs[0];
+    c -= filter_coeffs[0] * b;
+
+    /* Seed the history registers with the four samples preceding out[0]. */
+    old_out0 = out[-4];
+    old_out1 = out[-3];
+    old_out2 = out[-2];
+    old_out3 = out[-1];
+    /* Main loop: four output samples per iteration. */
+    for (n = 0; n <= buffer_length - 4; n+=4) {
+        p_filter_coeffs = filter_coeffs;
+        p_out = out;
+
+        out0 = in[0];
+        out1 = in[1];
+        out2 = in[2];
+        out3 = in[3];
+
+        /* Apply filter_coeffs[0..3] ($f0..$f3) to the four previous outputs:
+         * out0 gets all four taps, out1 three, out2 two and out3 one; the
+         * taps that depend on outputs of this group are added later. */
+        __asm__ volatile(
+            "lwc1       $f2,     8(%[filter_coeffs])                        \n\t"
+            "lwc1       $f1,     4(%[filter_coeffs])                        \n\t"
+            "lwc1       $f0,     0(%[filter_coeffs])                        \n\t"
+            "nmsub.s    %[out0], %[out0],             $f2, %[old_out1]      \n\t"
+            "nmsub.s    %[out1], %[out1],             $f2, %[old_out2]      \n\t"
+            "nmsub.s    %[out2], %[out2],             $f2, %[old_out3]      \n\t"
+            "lwc1       $f3,     12(%[filter_coeffs])                       \n\t"
+            "nmsub.s    %[out0], %[out0],             $f1, %[old_out2]      \n\t"
+            "nmsub.s    %[out1], %[out1],             $f1, %[old_out3]      \n\t"
+            "nmsub.s    %[out2], %[out2],             $f3, %[old_out2]      \n\t"
+            "nmsub.s    %[out0], %[out0],             $f0, %[old_out3]      \n\t"
+            "nmsub.s    %[out3], %[out3],             $f3, %[old_out3]      \n\t"
+            "nmsub.s    %[out1], %[out1],             $f3, %[old_out1]      \n\t"
+            "nmsub.s    %[out0], %[out0],             $f3, %[old_out0]      \n\t"
+
+            : [out0]"+f"(out0), [out1]"+f"(out1),
+              [out2]"+f"(out2), [out3]"+f"(out3)
+            : [old_out0]"f"(old_out0), [old_out1]"f"(old_out1),
+              [old_out2]"f"(old_out2), [old_out3]"f"(old_out3),
+              [filter_coeffs]"r"(filter_coeffs)
+            : "$f0", "$f1", "$f2", "$f3", "$f4", "memory"
+        );
+
+        /* Remaining taps: each iteration loads two further coefficients
+         * ($f5, $f4) and two older output samples (starting at out[-5]),
+         * walking p_filter_coeffs forward and p_out backward by two floats,
+         * and accumulates them into all four outputs.  The history
+         * registers are rotated in the asm and by the FFSWAP below. */
+        for (i = 5; i <= filter_length; i += 2) {
+            __asm__ volatile(
+                "lwc1    %[old_out3], -20(%[p_out])                         \n\t"
+                "lwc1    $f5,         16(%[p_filter_coeffs])                \n\t"
+                PTR_ADDIU "%[p_out],  -8                                    \n\t"
+                PTR_ADDIU "%[p_filter_coeffs], 8                            \n\t"
+                "nmsub.s %[out1],     %[out1],      $f5, %[old_out0]        \n\t"
+                "nmsub.s %[out3],     %[out3],      $f5, %[old_out2]        \n\t"
+                "lwc1    $f4,         12(%[p_filter_coeffs])                \n\t"
+                "lwc1    %[old_out2], -16(%[p_out])                         \n\t"
+                "nmsub.s %[out0],     %[out0],      $f5, %[old_out3]        \n\t"
+                "nmsub.s %[out2],     %[out2],      $f5, %[old_out1]        \n\t"
+                "nmsub.s %[out1],     %[out1],      $f4, %[old_out3]        \n\t"
+                "nmsub.s %[out3],     %[out3],      $f4, %[old_out1]        \n\t"
+                "mov.s   %[old_out1], %[old_out3]                           \n\t"
+                "nmsub.s %[out0],     %[out0],      $f4, %[old_out2]        \n\t"
+                "nmsub.s %[out2],     %[out2],      $f4, %[old_out0]        \n\t"
+
+                : [out0]"+f"(out0), [out1]"+f"(out1),
+                  [out2]"+f"(out2), [out3]"+f"(out3), [old_out0]"+f"(old_out0),
+                  [old_out1]"+f"(old_out1), [old_out2]"+f"(old_out2),
+                  [old_out3]"+f"(old_out3),[p_filter_coeffs]"+r"(p_filter_coeffs),
+                  [p_out]"+r"(p_out)
+                :
+                : "$f4", "$f5", "memory"
+            );
+            FFSWAP(float, old_out0, old_out2);
+        }
+
+        /* Fold the outputs of this group into each other using a, b, c.
+         * The statement order matters: each line reads the operands as
+         * left by the preceding lines. */
+        __asm__ volatile(
+            "nmsub.s    %[out3], %[out3], %[a], %[out2]                     \n\t"
+            "nmsub.s    %[out2], %[out2], %[a], %[out1]                     \n\t"
+            "nmsub.s    %[out3], %[out3], %[b], %[out1]                     \n\t"
+            "nmsub.s    %[out1], %[out1], %[a], %[out0]                     \n\t"
+            "nmsub.s    %[out2], %[out2], %[b], %[out0]                     \n\t"
+            "nmsub.s    %[out3], %[out3], %[c], %[out0]                     \n\t"
+
+            : [out0]"+f"(out0), [out1]"+f"(out1),
+              [out2]"+f"(out2), [out3]"+f"(out3)
+            : [a]"f"(a), [b]"f"(b), [c]"f"(c)
+        );
+
+        out[0] = out0;
+        out[1] = out1;
+        out[2] = out2;
+        out[3] = out3;
+
+        /* The four fresh outputs become the history of the next group. */
+        old_out0 = out0;
+        old_out1 = out1;
+        old_out2 = out2;
+        old_out3 = out3;
+
+        out += 4;
+        in  += 4;
+    }
+
+    /* Rewind both pointers to the start of the buffers so the tail loop can
+     * index them with n. */
+    out -= n;
+    in -= n;
+    /* Scalar tail: one output per iteration, one tap per inner iteration. */
+    for (; n < buffer_length; n++) {
+        float out_val, out_val_i, fc_val;
+        p_filter_coeffs = filter_coeffs;
+        p_out = &out[n];
+        out_val = in[n];
+        for (i = 1; i <= filter_length; i++) {
+            /* out_val -= filter_coeffs[i-1] * out[n-i] */
+            __asm__ volatile(
+                "lwc1    %[fc_val],          0(%[p_filter_coeffs])                        \n\t"
+                "lwc1    %[out_val_i],       -4(%[p_out])                                 \n\t"
+                PTR_ADDIU "%[p_filter_coeffs], 4                                          \n\t"
+                PTR_ADDIU "%[p_out],         -4                                           \n\t"
+                "nmsub.s %[out_val],         %[out_val],          %[fc_val], %[out_val_i] \n\t"
+
+                : [fc_val]"=&f"(fc_val), [out_val]"+f"(out_val),
+                  [out_val_i]"=&f"(out_val_i), [p_out]"+r"(p_out),
+                  [p_filter_coeffs]"+r"(p_filter_coeffs)
+                :
+                : "memory"
+            );
+        }
+        out[n] = out_val;
+    }
+}
+
+/**
+ * LP zero synthesis filter, MIPS FPU inline-assembly version.
+ *
+ * Each output is
+ *   out[n] = in[n] + sum(filter_coeffs[i-1] * in[n-i]), i = 1..filter_length
+ * computed for eight consecutive samples at a time.
+ *
+ * madd.s d, r, s, t computes d = r + s * t.
+ *
+ * @param out           output buffer
+ * @param filter_coeffs filter coefficients
+ * @param in            input samples; samples before in[0] are read as
+ *                      history
+ * @param buffer_length number of samples to produce
+ * @param filter_length number of filter taps
+ *
+ * NOTE(review): there is no tail handling -- every outer iteration reads
+ * in[n..n+7] and writes out[n..n+7], and the inner asm loop always runs at
+ * least once and consumes two taps per pass.  This appears to assume that
+ * buffer_length is a multiple of 8 and filter_length is even and positive;
+ * confirm against callers.
+ */
+static void ff_celp_lp_zero_synthesis_filterf_mips(float *out,
+                                       const float *filter_coeffs,
+                                       const float *in, int buffer_length,
+                                       int filter_length)
+{
+    int i,n;
+    float sum_out8, sum_out7, sum_out6, sum_out5, sum_out4, fc_val;
+    float sum_out3, sum_out2, sum_out1;
+    const float *p_filter_coeffs, *p_in;
+
+    for (n = 0; n < buffer_length; n+=8) {
+        p_in = &in[n];
+        p_filter_coeffs = filter_coeffs;
+        /* Accumulators start from the input samples; sum_outK belongs to
+         * sample n + K - 1. */
+        sum_out8 = in[n+7];
+        sum_out7 = in[n+6];
+        sum_out6 = in[n+5];
+        sum_out5 = in[n+4];
+        sum_out4 = in[n+3];
+        sum_out3 = in[n+2];
+        sum_out2 = in[n+1];
+        sum_out1 = in[n];
+        i = filter_length;
+
+        /* i is expected to be greater than 0 on entry.
+         * The outer loop is unrolled eight times to reduce memory accesses;
+         * the inner loop is unrolled twice: each pass applies two
+         * coefficients, moves p_filter_coeffs forward and p_in backward by
+         * two floats, and decrements i by 2 until it is no longer positive.
+         */
+        __asm__ volatile(
+            "filt_lp_inner%=:                                               \n\t"
+            "lwc1   %[fc_val],   0(%[p_filter_coeffs])                      \n\t"
+            "lwc1   $f7,         6*4(%[p_in])                               \n\t"
+            "lwc1   $f6,         5*4(%[p_in])                               \n\t"
+            "lwc1   $f5,         4*4(%[p_in])                               \n\t"
+            "lwc1   $f4,         3*4(%[p_in])                               \n\t"
+            "lwc1   $f3,         2*4(%[p_in])                               \n\t"
+            "lwc1   $f2,         4(%[p_in])                                 \n\t"
+            "lwc1   $f1,         0(%[p_in])                                 \n\t"
+            "lwc1   $f0,         -4(%[p_in])                                \n\t"
+            "addiu  %[i],        -2                                         \n\t"
+            "madd.s %[sum_out8], %[sum_out8],          %[fc_val], $f7       \n\t"
+            "madd.s %[sum_out7], %[sum_out7],          %[fc_val], $f6       \n\t"
+            "madd.s %[sum_out6], %[sum_out6],          %[fc_val], $f5       \n\t"
+            "madd.s %[sum_out5], %[sum_out5],          %[fc_val], $f4       \n\t"
+            "madd.s %[sum_out4], %[sum_out4],          %[fc_val], $f3       \n\t"
+            "madd.s %[sum_out3], %[sum_out3],          %[fc_val], $f2       \n\t"
+            "madd.s %[sum_out2], %[sum_out2],          %[fc_val], $f1       \n\t"
+            "madd.s %[sum_out1], %[sum_out1],          %[fc_val], $f0       \n\t"
+            "lwc1   %[fc_val],   4(%[p_filter_coeffs])                      \n\t"
+            "lwc1   $f7,         -8(%[p_in])                                \n\t"
+            PTR_ADDIU "%[p_filter_coeffs], 8                                \n\t"
+            PTR_ADDIU "%[p_in],  -8                                         \n\t"
+            "madd.s %[sum_out8], %[sum_out8],          %[fc_val], $f6       \n\t"
+            "madd.s %[sum_out7], %[sum_out7],          %[fc_val], $f5       \n\t"
+            "madd.s %[sum_out6], %[sum_out6],          %[fc_val], $f4       \n\t"
+            "madd.s %[sum_out5], %[sum_out5],          %[fc_val], $f3       \n\t"
+            "madd.s %[sum_out4], %[sum_out4],          %[fc_val], $f2       \n\t"
+            "madd.s %[sum_out3], %[sum_out3],          %[fc_val], $f1       \n\t"
+            "madd.s %[sum_out2], %[sum_out2],          %[fc_val], $f0       \n\t"
+            "madd.s %[sum_out1], %[sum_out1],          %[fc_val], $f7       \n\t"
+            "bgtz   %[i],        filt_lp_inner%=                            \n\t"
+
+            : [sum_out8]"+f"(sum_out8), [sum_out7]"+f"(sum_out7),
+              [sum_out6]"+f"(sum_out6), [sum_out5]"+f"(sum_out5),
+              [sum_out4]"+f"(sum_out4), [sum_out3]"+f"(sum_out3),
+              [sum_out2]"+f"(sum_out2), [sum_out1]"+f"(sum_out1),
+              [fc_val]"=&f"(fc_val), [p_filter_coeffs]"+r"(p_filter_coeffs),
+              [p_in]"+r"(p_in), [i]"+r"(i)
+            :
+            : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6", "$f7", "memory"
+        );
+
+        out[n+7] = sum_out8;
+        out[n+6] = sum_out7;
+        out[n+5] = sum_out6;
+        out[n+4] = sum_out5;
+        out[n+3] = sum_out4;
+        out[n+2] = sum_out3;
+        out[n+1] = sum_out2;
+        out[n] = sum_out1;
+    }
+}
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM */
+
+/* Install the MIPS inline-assembly float filters into the CELP filter
+ * context.  The context is left untouched when inline asm is unavailable
+ * or when building for MIPS32R6 / MIPS64R6. */
+void ff_celp_filter_init_mips(CELPFContext *c)
+{
+#if HAVE_INLINE_ASM && !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+    c->celp_lp_zero_synthesis_filterf   = ff_celp_lp_zero_synthesis_filterf_mips;
+    c->celp_lp_synthesis_filterf        = ff_celp_lp_synthesis_filterf_mips;
+#endif /* HAVE_INLINE_ASM && !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+}
diff --git a/libavcodec/mips/celp_math_mips.c b/libavcodec/mips/celp_math_mips.c
new file mode 100644
index 0000000..ce711bd
--- /dev/null
+++ b/libavcodec/mips/celp_math_mips.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Nedeljko Babic (nbabic@mips.com)
+ *
+ * Math operations optimized for MIPS
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/celp_math.c
+ */
+#include "config.h"
+#include "libavcodec/celp_math.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+/**
+ * Dot product of two float vectors, MIPS FPU inline-assembly version.
+ *
+ * @param a      first vector
+ * @param b      second vector
+ * @param length number of elements; for length <= 0 the loop is skipped
+ * @return sum of a[i] * b[i] for i in [0, length), 0 when length <= 0
+ */
+static float ff_dot_productf_mips(const float* a, const float* b,
+                                              int length)
+{
+    float sum;
+    /* Loop terminates when the advancing 'a' pointer reaches this address. */
+    const float* a_end = a + length;
+
+    /* mtc1 clears the accumulator; each pass loads one element from each
+     * vector, advances both pointers by one float and accumulates the
+     * product with madd.s (sum = sum + $f1 * $f2). */
+    __asm__ volatile (
+        "mtc1   $zero,      %[sum]                              \n\t"
+        "blez   %[length],  ff_dot_productf_end%=               \n\t"
+        "ff_dot_productf_madd%=:                                \n\t"
+        "lwc1   $f2,        0(%[a])                             \n\t"
+        "lwc1   $f1,        0(%[b])                             \n\t"
+        PTR_ADDIU "%[a],    %[a],      4                        \n\t"
+        PTR_ADDIU "%[b],    %[b],      4                        \n\t"
+        "madd.s %[sum],     %[sum],    $f1, $f2                 \n\t"
+        "bne   %[a],        %[a_end],  ff_dot_productf_madd%=   \n\t"
+        "ff_dot_productf_end%=:                                 \n\t"
+
+        : [sum] "=&f" (sum), [a] "+r" (a), [b] "+r" (b)
+        : [a_end]"r"(a_end), [length] "r" (length)
+        : "$f1", "$f2", "memory"
+    );
+    return sum;
+}
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM */
+
+/* Install the MIPS inline-assembly dot product into the CELP math context.
+ * The context is left untouched when inline asm is unavailable or when
+ * building for MIPS32R6 / MIPS64R6. */
+void ff_celp_math_init_mips(CELPMContext *c)
+{
+#if HAVE_INLINE_ASM && !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+    c->dot_productf = ff_dot_productf_mips;
+#endif /* HAVE_INLINE_ASM && !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+}
diff --git a/libavcodec/mips/compute_antialias_fixed.h b/libavcodec/mips/compute_antialias_fixed.h
new file mode 100644
index 0000000..a967f67
--- /dev/null
+++ b/libavcodec/mips/compute_antialias_fixed.h
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Bojan Zivkovic (bojan@mips.com)
+ *
+ * Compute antialias function optimised for MIPS fixed-point architecture
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/mpegaudiodec.c
+ */
+
+#ifndef AVCODEC_MIPS_COMPUTE_ANTIALIAS_FIXED_H
+#define AVCODEC_MIPS_COMPUTE_ANTIALIAS_FIXED_H
+
+#if HAVE_INLINE_ASM
+/**
+ * Fixed-point antialias pass over the hybrid samples of one granule,
+ * implemented with MIPS multiply-accumulate instructions on the four
+ * accumulators $ac0..$ac3 ($ac1..$ac3 are provided by the MIPS DSP ASE).
+ *
+ * Starting at g->sb_hybrid + 18 and advancing by 18 samples per iteration,
+ * each iteration updates in place the eight sample pairs
+ * (ptr[-1-j], ptr[j]), j = 0..7, using the coefficients
+ * csa[4*j], csa[4*j+2] and csa[4*j+3] read from csa_table.
+ *
+ * @param s decoder context (not referenced by this implementation)
+ * @param g granule whose sb_hybrid samples are modified in place
+ */
+static void compute_antialias_mips_fixed(MPADecodeContext *s,
+                                        GranuleDef *g)
+{
+    int32_t *ptr, *csa;
+    int n, i;
+    /* All-ones pattern written into the LO half of $ac0/$ac2 (via mtlo)
+     * before the msub step; $ac1/$ac3 get zero instead. */
+    int MAX_lo = 0xffffffff;
+
+    /* we antialias only "long" bands */
+    if (g->block_type == 2) {
+        if (!g->switch_point)
+            return;
+        /* XXX: check this for 8000Hz case */
+        n = 1;
+    } else {
+        n = SBLIMIT - 1;
+    }
+
+
+    ptr = g->sb_hybrid + 18;
+
+    for(i = n;i > 0;i--) {
+        int tmp0, tmp1, tmp2, tmp00, tmp11;
+        int temp_reg1, temp_reg2, temp_reg3, temp_reg4, temp_reg5, temp_reg6;
+        csa = &csa_table[0][0];
+
+        /**
+         * instructions are scheduled to minimize pipeline stall.
+         *
+         * For every pair (lo = ptr[-1-j], hi = ptr[j]) the sequence is:
+         *   sum   = lo + hi
+         *   accA  = sum * csa[4*j];  LO(accA) = MAX_lo;  accA -= hi * csa[4*j+2]
+         *   accB  = sum * csa[4*j];  LO(accB) = 0;       accB += lo * csa[4*j+3]
+         *   ptr[-1-j] = HI(accA) << 2
+         *   ptr[j]    = HI(accB) << 2
+         * Even pairs use $ac0/$ac1, odd pairs use $ac2/$ac3, and the work
+         * of neighbouring pairs is interleaved.
+         */
+        __asm__ volatile (
+            "lw   %[tmp0],      -1*4(%[ptr])                            \n\t"
+            "lw   %[tmp1],      0*4(%[ptr])                             \n\t"
+            "lw   %[temp_reg1], 0*4(%[csa])                             \n\t"
+            "lw   %[temp_reg2], 2*4(%[csa])                             \n\t"
+            "add  %[tmp2],      %[tmp0],      %[tmp1]                   \n\t"
+            "lw   %[temp_reg3], 3*4(%[csa])                             \n\t"
+            "mult $ac0,         %[tmp2],      %[temp_reg1]              \n\t"
+            "mult $ac1,         %[tmp2],      %[temp_reg1]              \n\t"
+            "lw   %[tmp00],     -2*4(%[ptr])                            \n\t"
+            "lw   %[tmp11],     1*4(%[ptr])                             \n\t"
+            "lw   %[temp_reg4], 4*4(%[csa])                             \n\t"
+            "mtlo %[MAX_lo],    $ac0                                    \n\t"
+            "mtlo $zero,        $ac1                                    \n\t"
+            "msub $ac0,         %[tmp1],      %[temp_reg2]              \n\t"
+            "madd $ac1,         %[tmp0],      %[temp_reg3]              \n\t"
+            "add  %[tmp2],      %[tmp00],     %[tmp11]                  \n\t"
+            "lw   %[temp_reg5], 6*4(%[csa])                             \n\t"
+            "mult $ac2,         %[tmp2],      %[temp_reg4]              \n\t"
+            "mult $ac3,         %[tmp2],      %[temp_reg4]              \n\t"
+            "mfhi %[temp_reg1], $ac0                                    \n\t"
+            "mfhi %[temp_reg2], $ac1                                    \n\t"
+            "lw   %[temp_reg6], 7*4(%[csa])                             \n\t"
+            "mtlo %[MAX_lo],    $ac2                                    \n\t"
+            "msub $ac2,         %[tmp11],     %[temp_reg5]              \n\t"
+            "mtlo $zero,        $ac3                                    \n\t"
+            "madd $ac3,         %[tmp00],     %[temp_reg6]              \n\t"
+            "sll  %[temp_reg1], %[temp_reg1], 2                         \n\t"
+            "sw   %[temp_reg1], -1*4(%[ptr])                            \n\t"
+            "mfhi %[temp_reg4], $ac2                                    \n\t"
+            "sll  %[temp_reg2], %[temp_reg2], 2                         \n\t"
+            "mfhi %[temp_reg5], $ac3                                    \n\t"
+            "sw   %[temp_reg2], 0*4(%[ptr])                             \n\t"
+            "lw   %[tmp0],      -3*4(%[ptr])                            \n\t"
+            "lw   %[tmp1],      2*4(%[ptr])                             \n\t"
+            "lw   %[temp_reg1], 8*4(%[csa])                             \n\t"
+            "sll  %[temp_reg4], %[temp_reg4], 2                         \n\t"
+            "add  %[tmp2],      %[tmp0],      %[tmp1]                   \n\t"
+            "sll  %[temp_reg5], %[temp_reg5], 2                         \n\t"
+            "mult $ac0,         %[tmp2],      %[temp_reg1]              \n\t"
+            "mult $ac1,         %[tmp2],      %[temp_reg1]              \n\t"
+            "sw   %[temp_reg4], -2*4(%[ptr])                            \n\t"
+            "sw   %[temp_reg5], 1*4(%[ptr])                             \n\t"
+            "lw   %[temp_reg2], 10*4(%[csa])                            \n\t"
+            "mtlo %[MAX_lo],    $ac0                                    \n\t"
+            "lw   %[temp_reg3], 11*4(%[csa])                            \n\t"
+            "msub $ac0,         %[tmp1],      %[temp_reg2]              \n\t"
+            "mtlo $zero,        $ac1                                    \n\t"
+            "madd $ac1,         %[tmp0],      %[temp_reg3]              \n\t"
+            "lw   %[tmp00],     -4*4(%[ptr])                            \n\t"
+            "lw   %[tmp11],     3*4(%[ptr])                             \n\t"
+            "mfhi %[temp_reg1], $ac0                                    \n\t"
+            "lw   %[temp_reg4], 12*4(%[csa])                            \n\t"
+            "mfhi %[temp_reg2], $ac1                                    \n\t"
+            "add  %[tmp2],      %[tmp00],     %[tmp11]                  \n\t"
+            "mult $ac2,         %[tmp2],      %[temp_reg4]              \n\t"
+            "mult $ac3,         %[tmp2],      %[temp_reg4]              \n\t"
+            "lw   %[temp_reg5], 14*4(%[csa])                            \n\t"
+            "lw   %[temp_reg6], 15*4(%[csa])                            \n\t"
+            "sll  %[temp_reg1], %[temp_reg1], 2                         \n\t"
+            "mtlo %[MAX_lo],    $ac2                                    \n\t"
+            "msub $ac2,         %[tmp11],     %[temp_reg5]              \n\t"
+            "mtlo $zero,        $ac3                                    \n\t"
+            "madd $ac3,         %[tmp00],     %[temp_reg6]              \n\t"
+            "sll  %[temp_reg2], %[temp_reg2], 2                         \n\t"
+            "sw   %[temp_reg1], -3*4(%[ptr])                            \n\t"
+            "mfhi %[temp_reg4], $ac2                                    \n\t"
+            "sw   %[temp_reg2], 2*4(%[ptr])                             \n\t"
+            "mfhi %[temp_reg5], $ac3                                    \n\t"
+            "lw   %[tmp0],      -5*4(%[ptr])                            \n\t"
+            "lw   %[tmp1],      4*4(%[ptr])                             \n\t"
+            "lw   %[temp_reg1], 16*4(%[csa])                            \n\t"
+            "lw   %[temp_reg2], 18*4(%[csa])                            \n\t"
+            "add  %[tmp2],      %[tmp0],      %[tmp1]                   \n\t"
+            "lw   %[temp_reg3], 19*4(%[csa])                            \n\t"
+            "mult $ac0,         %[tmp2],      %[temp_reg1]              \n\t"
+            "mult $ac1,         %[tmp2],      %[temp_reg1]              \n\t"
+            "sll  %[temp_reg4], %[temp_reg4], 2                         \n\t"
+            "sll  %[temp_reg5], %[temp_reg5], 2                         \n\t"
+            "sw   %[temp_reg4], -4*4(%[ptr])                            \n\t"
+            "mtlo %[MAX_lo],    $ac0                                    \n\t"
+            "msub $ac0,         %[tmp1],      %[temp_reg2]              \n\t"
+            "mtlo $zero,        $ac1                                    \n\t"
+            "madd $ac1,         %[tmp0],      %[temp_reg3]              \n\t"
+            "sw   %[temp_reg5], 3*4(%[ptr])                             \n\t"
+            "lw   %[tmp00],     -6*4(%[ptr])                            \n\t"
+            "mfhi %[temp_reg1], $ac0                                    \n\t"
+            "lw   %[tmp11],     5*4(%[ptr])                             \n\t"
+            "mfhi %[temp_reg2], $ac1                                    \n\t"
+            "lw   %[temp_reg4], 20*4(%[csa])                            \n\t"
+            "add  %[tmp2],      %[tmp00],     %[tmp11]                  \n\t"
+            "lw   %[temp_reg5], 22*4(%[csa])                            \n\t"
+            "mult $ac2,         %[tmp2],      %[temp_reg4]              \n\t"
+            "mult $ac3,         %[tmp2],      %[temp_reg4]              \n\t"
+            "lw   %[temp_reg6], 23*4(%[csa])                            \n\t"
+            "sll  %[temp_reg1], %[temp_reg1], 2                         \n\t"
+            "sll  %[temp_reg2], %[temp_reg2], 2                         \n\t"
+            "mtlo %[MAX_lo],    $ac2                                    \n\t"
+            "msub $ac2,         %[tmp11],     %[temp_reg5]              \n\t"
+            "mtlo $zero,        $ac3                                    \n\t"
+            "madd $ac3,         %[tmp00],     %[temp_reg6]              \n\t"
+            "sw   %[temp_reg1], -5*4(%[ptr])                            \n\t"
+            "sw   %[temp_reg2], 4*4(%[ptr])                             \n\t"
+            "mfhi %[temp_reg4], $ac2                                    \n\t"
+            "lw   %[tmp0],      -7*4(%[ptr])                            \n\t"
+            "mfhi %[temp_reg5], $ac3                                    \n\t"
+            "lw   %[tmp1],      6*4(%[ptr])                             \n\t"
+            "lw   %[temp_reg1], 24*4(%[csa])                            \n\t"
+            "lw   %[temp_reg2], 26*4(%[csa])                            \n\t"
+            "add  %[tmp2],      %[tmp0],      %[tmp1]                   \n\t"
+            "lw   %[temp_reg3], 27*4(%[csa])                            \n\t"
+            "mult $ac0,         %[tmp2],      %[temp_reg1]              \n\t"
+            "mult $ac1,         %[tmp2],      %[temp_reg1]              \n\t"
+            "sll  %[temp_reg4], %[temp_reg4], 2                         \n\t"
+            "sll  %[temp_reg5], %[temp_reg5], 2                         \n\t"
+            "sw   %[temp_reg4], -6*4(%[ptr])                            \n\t"
+            "mtlo %[MAX_lo],    $ac0                                    \n\t"
+            "msub $ac0,         %[tmp1],      %[temp_reg2]              \n\t"
+            "mtlo $zero,        $ac1                                    \n\t"
+            "madd $ac1,         %[tmp0],      %[temp_reg3]              \n\t"
+            "sw   %[temp_reg5], 5*4(%[ptr])                             \n\t"
+            "lw   %[tmp00],     -8*4(%[ptr])                            \n\t"
+            "mfhi %[temp_reg1], $ac0                                    \n\t"
+            "lw   %[tmp11],     7*4(%[ptr])                             \n\t"
+            "mfhi %[temp_reg2], $ac1                                    \n\t"
+            "lw   %[temp_reg4], 28*4(%[csa])                            \n\t"
+            "add  %[tmp2],      %[tmp00],     %[tmp11]                  \n\t"
+            "lw   %[temp_reg5], 30*4(%[csa])                            \n\t"
+            "mult $ac2,         %[tmp2],      %[temp_reg4]              \n\t"
+            "mult $ac3,         %[tmp2],      %[temp_reg4]              \n\t"
+            "lw   %[temp_reg6], 31*4(%[csa])                            \n\t"
+            "sll  %[temp_reg1], %[temp_reg1], 2                         \n\t"
+            "sll  %[temp_reg2], %[temp_reg2], 2                         \n\t"
+            "mtlo %[MAX_lo],    $ac2                                    \n\t"
+            "msub $ac2,         %[tmp11],     %[temp_reg5]              \n\t"
+            "mtlo $zero,        $ac3                                    \n\t"
+            "madd $ac3,         %[tmp00],     %[temp_reg6]              \n\t"
+            "sw   %[temp_reg1], -7*4(%[ptr])                            \n\t"
+            "sw   %[temp_reg2], 6*4(%[ptr])                             \n\t"
+            "mfhi %[temp_reg4], $ac2                                    \n\t"
+            "mfhi %[temp_reg5], $ac3                                    \n\t"
+            "sll  %[temp_reg4], %[temp_reg4], 2                         \n\t"
+            "sll  %[temp_reg5], %[temp_reg5], 2                         \n\t"
+            "sw   %[temp_reg4], -8*4(%[ptr])                            \n\t"
+            "sw   %[temp_reg5], 7*4(%[ptr])                             \n\t"
+
+            : [tmp0] "=&r" (tmp0), [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2),
+              [tmp00] "=&r" (tmp00), [tmp11] "=&r" (tmp11),
+              [temp_reg1] "=&r" (temp_reg1), [temp_reg2] "=&r" (temp_reg2),
+              [temp_reg3] "=&r" (temp_reg3), [temp_reg4] "=&r" (temp_reg4),
+              [temp_reg5] "=&r" (temp_reg5), [temp_reg6] "=&r" (temp_reg6)
+            : [csa] "r" (csa), [ptr] "r" (ptr),
+              [MAX_lo] "r" (MAX_lo)
+            : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
+              "$ac3hi", "$ac3lo"
+         );
+
+        /* Move on to the next group of 18 hybrid samples. */
+        ptr += 18;
+    }
+}
+#define compute_antialias compute_antialias_mips_fixed
+#endif /* HAVE_INLINE_ASM */
+
+#endif /* AVCODEC_MIPS_COMPUTE_ANTIALIAS_FIXED_H */
diff --git a/libavcodec/mips/compute_antialias_float.h b/libavcodec/mips/compute_antialias_float.h
new file mode 100644
index 0000000..e2b4f29
--- /dev/null
+++ b/libavcodec/mips/compute_antialias_float.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Bojan Zivkovic (bojan@mips.com)
+ *
+ * Compute antialias function optimised for MIPS floating-point architecture
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/mpegaudiodec.c
+ */
+
+#ifndef AVCODEC_MIPS_COMPUTE_ANTIALIAS_FLOAT_H
+#define AVCODEC_MIPS_COMPUTE_ANTIALIAS_FLOAT_H
+
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+static void compute_antialias_mips_float(MPADecodeContext *s,
+                                        GranuleDef *g)
+{ /* In-place anti-alias butterflies across the 18-float block boundaries of g->sb_hybrid; s is not referenced here. */
+    float *ptr, *ptr_end; /* ptr: boundary being processed; ptr_end: value of ptr at which the loop stops */
+    float *csa = &csa_table[0][0]; /* flat float view of csa_table */
+    /* asm scratch: in1..in8 receive loaded samples/coefficients, out1..out4 the butterfly results */
+    float in1, in2, in3, in4, in5, in6, in7, in8;
+    float out1, out2, out3, out4;
+
+    ptr = g->sb_hybrid + 18; /* first boundary: between the first and second 18-float block */
+    /* we antialias only "long" bands */
+    if (g->block_type == 2) {
+        if (!g->switch_point)
+            return; /* block_type 2 without switch_point: leave the data untouched */
+        /* XXX: check this for 8000Hz case */
+        ptr_end = ptr + 18; /* exactly one pass (one boundary) */
+    } else {
+        ptr_end = ptr + 558; /* 558 = 31 * 18: 31 passes */
+    }
+
+    /* Each pass runs 8 butterflies, j = 0..7, with a = ptr[-1-j], b = ptr[j],
+     * c0 = csa[4*j], c1 = csa[4*j+1]:  ptr[-1-j] = a*c0 - b*c1,  ptr[j] = a*c1 + b*c0.
+     * They are interleaved because instructions are scheduled to minimize pipeline stall. */
+
+    __asm__ volatile (
+        "compute_antialias_float_loop%=:                                \t\n" /* loop head; the body always runs at least once */
+        "lwc1    %[in1],  -1*4(%[ptr])                                  \t\n" /* j=0: a  = ptr[-1] */
+        "lwc1    %[in2],  0(%[csa])                                     \t\n" /* j=0: c0 = csa[0] */
+        "lwc1    %[in3],  1*4(%[csa])                                   \t\n" /* j=0: c1 = csa[1] */
+        "lwc1    %[in4],  0(%[ptr])                                     \t\n" /* j=0: b  = ptr[0] */
+        "lwc1    %[in5],  -2*4(%[ptr])                                  \t\n" /* j=1: a  = ptr[-2] */
+        "lwc1    %[in6],  4*4(%[csa])                                   \t\n" /* j=1: c0 = csa[4] */
+        "mul.s   %[out1], %[in1],  %[in2]                               \t\n" /* j=0: out1 = a*c0 */
+        "mul.s   %[out2], %[in1],  %[in3]                               \t\n" /* j=0: out2 = a*c1 */
+        "lwc1    %[in7],  5*4(%[csa])                                   \t\n" /* j=1: c1 = csa[5] */
+        "lwc1    %[in8],  1*4(%[ptr])                                   \t\n" /* j=1: b  = ptr[1] */
+        "nmsub.s %[out1], %[out1], %[in3], %[in4]                       \t\n" /* j=0: out1 = out1 - c1*b */
+        "madd.s  %[out2], %[out2], %[in2], %[in4]                       \t\n" /* j=0: out2 = out2 + c0*b */
+        "mul.s   %[out3], %[in5],  %[in6]                               \t\n" /* j=1: out3 = a*c0 */
+        "mul.s   %[out4], %[in5],  %[in7]                               \t\n" /* j=1: out4 = a*c1 */
+        "lwc1    %[in1],  -3*4(%[ptr])                                  \t\n" /* j=2: a  = ptr[-3] */
+        "swc1    %[out1], -1*4(%[ptr])                                  \t\n" /* j=0: ptr[-1] = out1 */
+        "swc1    %[out2], 0(%[ptr])                                     \t\n" /* j=0: ptr[0]  = out2 */
+        "nmsub.s %[out3], %[out3], %[in7], %[in8]                       \t\n" /* j=1: out3 = out3 - c1*b */
+        "madd.s  %[out4], %[out4], %[in6], %[in8]                       \t\n" /* j=1: out4 = out4 + c0*b */
+        "lwc1    %[in2],  8*4(%[csa])                                   \t\n" /* j=2: c0 = csa[8] */
+        "swc1    %[out3], -2*4(%[ptr])                                  \t\n" /* j=1: ptr[-2] = out3 */
+        "swc1    %[out4], 1*4(%[ptr])                                   \t\n" /* j=1: ptr[1]  = out4 */
+        "lwc1    %[in3],  9*4(%[csa])                                   \t\n" /* j=2: c1 = csa[9] */
+        "lwc1    %[in4],  2*4(%[ptr])                                   \t\n" /* j=2: b  = ptr[2] */
+        "mul.s   %[out1], %[in1],  %[in2]                               \t\n" /* j=2: out1 = a*c0 */
+        "lwc1    %[in5],  -4*4(%[ptr])                                  \t\n" /* j=3: a  = ptr[-4] */
+        "lwc1    %[in6],  12*4(%[csa])                                  \t\n" /* j=3: c0 = csa[12] */
+        "mul.s   %[out2], %[in1],  %[in3]                               \t\n" /* j=2: out2 = a*c1 */
+        "lwc1    %[in7],  13*4(%[csa])                                  \t\n" /* j=3: c1 = csa[13] */
+        "nmsub.s %[out1], %[out1], %[in3], %[in4]                       \t\n" /* j=2: out1 = out1 - c1*b */
+        "lwc1    %[in8],  3*4(%[ptr])                                   \t\n" /* j=3: b  = ptr[3] */
+        "mul.s   %[out3], %[in5],  %[in6]                               \t\n" /* j=3: out3 = a*c0 */
+        "madd.s  %[out2], %[out2], %[in2], %[in4]                       \t\n" /* j=2: out2 = out2 + c0*b */
+        "mul.s   %[out4], %[in5],  %[in7]                               \t\n" /* j=3: out4 = a*c1 */
+        "swc1    %[out1], -3*4(%[ptr])                                  \t\n" /* j=2: ptr[-3] = out1 */
+        "lwc1    %[in1],  -5*4(%[ptr])                                  \t\n" /* j=4: a  = ptr[-5] */
+        "nmsub.s %[out3], %[out3], %[in7], %[in8]                       \t\n" /* j=3: out3 = out3 - c1*b */
+        "swc1    %[out2], 2*4(%[ptr])                                   \t\n" /* j=2: ptr[2]  = out2 */
+        "madd.s  %[out4], %[out4], %[in6], %[in8]                       \t\n" /* j=3: out4 = out4 + c0*b */
+        "lwc1    %[in2],  16*4(%[csa])                                  \t\n" /* j=4: c0 = csa[16] */
+        "lwc1    %[in3],  17*4(%[csa])                                  \t\n" /* j=4: c1 = csa[17] */
+        "swc1    %[out3], -4*4(%[ptr])                                  \t\n" /* j=3: ptr[-4] = out3 */
+        "lwc1    %[in4],  4*4(%[ptr])                                   \t\n" /* j=4: b  = ptr[4] */
+        "swc1    %[out4], 3*4(%[ptr])                                   \t\n" /* j=3: ptr[3]  = out4 */
+        "mul.s   %[out1], %[in1],  %[in2]                               \t\n" /* j=4: out1 = a*c0 */
+        "mul.s   %[out2], %[in1],  %[in3]                               \t\n" /* j=4: out2 = a*c1 */
+        "lwc1    %[in5],  -6*4(%[ptr])                                  \t\n" /* j=5: a  = ptr[-6] */
+        "lwc1    %[in6],  20*4(%[csa])                                  \t\n" /* j=5: c0 = csa[20] */
+        "lwc1    %[in7],  21*4(%[csa])                                  \t\n" /* j=5: c1 = csa[21] */
+        "nmsub.s %[out1], %[out1], %[in3], %[in4]                       \t\n" /* j=4: out1 = out1 - c1*b */
+        "madd.s  %[out2], %[out2], %[in2], %[in4]                       \t\n" /* j=4: out2 = out2 + c0*b */
+        "lwc1    %[in8],  5*4(%[ptr])                                   \t\n" /* j=5: b  = ptr[5] */
+        "mul.s   %[out3], %[in5],  %[in6]                               \t\n" /* j=5: out3 = a*c0 */
+        "mul.s   %[out4], %[in5],  %[in7]                               \t\n" /* j=5: out4 = a*c1 */
+        "swc1    %[out1], -5*4(%[ptr])                                  \t\n" /* j=4: ptr[-5] = out1 */
+        "swc1    %[out2], 4*4(%[ptr])                                   \t\n" /* j=4: ptr[4]  = out2 */
+        "lwc1    %[in1],  -7*4(%[ptr])                                  \t\n" /* j=6: a  = ptr[-7] */
+        "nmsub.s %[out3], %[out3], %[in7], %[in8]                       \t\n" /* j=5: out3 = out3 - c1*b */
+        "madd.s  %[out4], %[out4], %[in6], %[in8]                       \t\n" /* j=5: out4 = out4 + c0*b */
+        "lwc1    %[in2],  24*4(%[csa])                                  \t\n" /* j=6: c0 = csa[24] */
+        "lwc1    %[in3],  25*4(%[csa])                                  \t\n" /* j=6: c1 = csa[25] */
+        "lwc1    %[in4],  6*4(%[ptr])                                   \t\n" /* j=6: b  = ptr[6] */
+        "swc1    %[out3], -6*4(%[ptr])                                  \t\n" /* j=5: ptr[-6] = out3 */
+        "swc1    %[out4], 5*4(%[ptr])                                   \t\n" /* j=5: ptr[5]  = out4 */
+        "mul.s   %[out1], %[in1],  %[in2]                               \t\n" /* j=6: out1 = a*c0 */
+        "lwc1    %[in5],  -8*4(%[ptr])                                  \t\n" /* j=7: a  = ptr[-8] */
+        "mul.s   %[out2], %[in1],  %[in3]                               \t\n" /* j=6: out2 = a*c1 */
+        "lwc1    %[in6],  28*4(%[csa])                                  \t\n" /* j=7: c0 = csa[28] */
+        "lwc1    %[in7],  29*4(%[csa])                                  \t\n" /* j=7: c1 = csa[29] */
+        "nmsub.s %[out1], %[out1], %[in3], %[in4]                       \t\n" /* j=6: out1 = out1 - c1*b */
+        "lwc1    %[in8],  7*4(%[ptr])                                   \t\n" /* j=7: b  = ptr[7] */
+        "madd.s  %[out2], %[out2], %[in2], %[in4]                       \t\n" /* j=6: out2 = out2 + c0*b */
+        "mul.s   %[out3], %[in5],  %[in6]                               \t\n" /* j=7: out3 = a*c0 */
+        "mul.s   %[out4], %[in5],  %[in7]                               \t\n" /* j=7: out4 = a*c1 */
+        "swc1    %[out1], -7*4(%[ptr])                                  \t\n" /* j=6: ptr[-7] = out1 */
+        "swc1    %[out2], 6*4(%[ptr])                                   \t\n" /* j=6: ptr[6]  = out2 */
+        PTR_ADDIU "%[ptr],%[ptr],  72                                   \t\n" /* ptr += 18 floats (72 bytes): next boundary */
+        "nmsub.s %[out3], %[out3], %[in7], %[in8]                       \t\n" /* j=7: out3 = out3 - c1*b */
+        "madd.s  %[out4], %[out4], %[in6], %[in8]                       \t\n" /* j=7: out4 = out4 + c0*b */
+        "swc1    %[out3], -26*4(%[ptr])                                 \t\n" /* j=7: old ptr[-8]; -26 = -8 - 18 after the advance */
+        "swc1    %[out4], -11*4(%[ptr])                                 \t\n" /* j=7: old ptr[7];  -11 =  7 - 18 after the advance */
+        "bne     %[ptr],  %[ptr_end],  compute_antialias_float_loop%=   \t\n" /* repeat until ptr == ptr_end */
+
+        : [ptr] "+r" (ptr), /* read-write: advanced inside the asm */
+          [in1] "=&f" (in1), [in2] "=&f" (in2), /* all FP operands are early-clobber outputs used as scratch */
+          [in3] "=&f" (in3), [in4] "=&f" (in4),
+          [in5] "=&f" (in5), [in6] "=&f" (in6),
+          [in7] "=&f" (in7), [in8] "=&f" (in8),
+          [out1] "=&f" (out1), [out2] "=&f" (out2),
+          [out3] "=&f" (out3), [out4] "=&f" (out4)
+        : [csa] "r" (csa), [ptr_end] "r" (ptr_end) /* read-only inputs */
+        : "memory" /* the asm loads and stores through ptr and loads through csa */
+    );
+}
+#define compute_antialias compute_antialias_mips_float
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM */
+
+#endif /* AVCODEC_MIPS_COMPUTE_ANTIALIAS_FLOAT_H */
diff --git a/libavcodec/mips/constants.c b/libavcodec/mips/constants.c
new file mode 100644
index 0000000..3503fad
--- /dev/null
+++ b/libavcodec/mips/constants.c
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/mem.h"
+#include "constants.h"
+
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_1)      = 0x0001000100010001ULL; /* 4 x 16-bit lanes, each 1 */
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_2)      = 0x0002000200020002ULL; /* 4 x 16-bit lanes, each 2 */
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_3)      = 0x0003000300030003ULL; /* 4 x 16-bit lanes, each 3 */
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_4)      = 0x0004000400040004ULL; /* 4 x 16-bit lanes, each 4 */
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_5)      = 0x0005000500050005ULL; /* 4 x 16-bit lanes, each 5 */
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_8)      = 0x0008000800080008ULL; /* 4 x 16-bit lanes, each 8 */
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_9)      = 0x0009000900090009ULL; /* 4 x 16-bit lanes, each 9 */
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_10)     = 0x000A000A000A000AULL; /* 4 x 16-bit lanes, each 10 */
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_16)     = 0x0010001000100010ULL; /* 4 x 16-bit lanes, each 16 */
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_18)     = 0x0012001200120012ULL; /* 4 x 16-bit lanes, each 18 */
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_20)     = 0x0014001400140014ULL; /* 4 x 16-bit lanes, each 20 */
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_28)     = 0x001C001C001C001CULL; /* 4 x 16-bit lanes, each 28 */
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_32)     = 0x0020002000200020ULL; /* 4 x 16-bit lanes, each 32 */
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_53)     = 0x0035003500350035ULL; /* 4 x 16-bit lanes, each 53 */
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_64)     = 0x0040004000400040ULL; /* 4 x 16-bit lanes, each 64 */
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_128)    = 0x0080008000800080ULL; /* 4 x 16-bit lanes, each 128 */
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_512)    = 0x0200020002000200ULL; /* 4 x 16-bit lanes, each 512 */
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_m8tom5) = 0xFFFBFFFAFFF9FFF8ULL; /* 16-bit lanes, low to high: -8, -7, -6, -5 */
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_m4tom1) = 0xFFFFFFFEFFFDFFFCULL; /* 16-bit lanes, low to high: -4, -3, -2, -1 */
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_1to4)   = 0x0004000300020001ULL; /* 16-bit lanes, low to high: 1, 2, 3, 4 */
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_5to8)   = 0x0008000700060005ULL; /* 16-bit lanes, low to high: 5, 6, 7, 8 */
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_0to3)   = 0x0003000200010000ULL; /* 16-bit lanes, low to high: 0, 1, 2, 3 */
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_4to7)   = 0x0007000600050004ULL; /* 16-bit lanes, low to high: 4, 5, 6, 7 */
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_8tob)   = 0x000B000A00090008ULL; /* 16-bit lanes, low to high: 8, 9, 10, 11 */
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_ctof)   = 0x000F000E000D000CULL; /* 16-bit lanes, low to high: 12, 13, 14, 15 */
+
+DECLARE_ALIGNED(8, const uint64_t, ff_pb_1)      = 0x0101010101010101ULL; /* 8 x 8-bit lanes, each 0x01 */
+DECLARE_ALIGNED(8, const uint64_t, ff_pb_3)      = 0x0303030303030303ULL; /* 8 x 8-bit lanes, each 0x03 */
+DECLARE_ALIGNED(8, const uint64_t, ff_pb_80)     = 0x8080808080808080ULL; /* 8 x 8-bit lanes, each 0x80 */
+DECLARE_ALIGNED(8, const uint64_t, ff_pb_A1)     = 0xA1A1A1A1A1A1A1A1ULL; /* 8 x 8-bit lanes, each 0xA1 */
+DECLARE_ALIGNED(8, const uint64_t, ff_pb_FE)     = 0xFEFEFEFEFEFEFEFEULL; /* 8 x 8-bit lanes, each 0xFE */
+
+DECLARE_ALIGNED(8, const uint64_t, ff_rnd)       = 0x0004000400040004ULL; /* 4 x 16-bit lanes, each 4 (same bits as ff_pw_4) */
+DECLARE_ALIGNED(8, const uint64_t, ff_rnd2)      = 0x0040004000400040ULL; /* 4 x 16-bit lanes, each 64 (same bits as ff_pw_64) */
+DECLARE_ALIGNED(8, const uint64_t, ff_rnd3)      = 0x0020002000200020ULL; /* 4 x 16-bit lanes, each 32 (same bits as ff_pw_32) */
+
+DECLARE_ALIGNED(8, const uint64_t, ff_wm1010)    = 0xFFFF0000FFFF0000ULL; /* 16-bit lanes, low to high: 0x0000, 0xFFFF, 0x0000, 0xFFFF */
+DECLARE_ALIGNED(8, const uint64_t, ff_d40000)    = 0x0000000000040000ULL; /* 0x00040000 in the low 32 bits, upper 32 bits zero */
diff --git a/libavcodec/mips/constants.h b/libavcodec/mips/constants.h
new file mode 100644
index 0000000..19d2d73
--- /dev/null
+++ b/libavcodec/mips/constants.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_CONSTANTS_H
+#define AVCODEC_MIPS_CONSTANTS_H
+
+#include <stdint.h>
+
+extern const uint64_t ff_pw_1;      /* ff_pw_N: four 16-bit lanes, each holding N (decimal) */
+extern const uint64_t ff_pw_2;
+extern const uint64_t ff_pw_3;
+extern const uint64_t ff_pw_4;
+extern const uint64_t ff_pw_5;
+extern const uint64_t ff_pw_8;
+extern const uint64_t ff_pw_9;
+extern const uint64_t ff_pw_10;
+extern const uint64_t ff_pw_16;
+extern const uint64_t ff_pw_18;
+extern const uint64_t ff_pw_20;
+extern const uint64_t ff_pw_28;
+extern const uint64_t ff_pw_32;
+extern const uint64_t ff_pw_53;
+extern const uint64_t ff_pw_64;
+extern const uint64_t ff_pw_128;
+extern const uint64_t ff_pw_512;
+extern const uint64_t ff_pw_m8tom5; /* 16-bit lanes, low to high: -8, -7, -6, -5 */
+extern const uint64_t ff_pw_m4tom1; /* 16-bit lanes, low to high: -4, -3, -2, -1 */
+extern const uint64_t ff_pw_1to4;   /* 16-bit lanes, low to high: 1, 2, 3, 4 */
+extern const uint64_t ff_pw_5to8;   /* 16-bit lanes, low to high: 5, 6, 7, 8 */
+extern const uint64_t ff_pw_0to3;   /* 16-bit lanes, low to high: 0, 1, 2, 3 */
+extern const uint64_t ff_pw_4to7;   /* 16-bit lanes, low to high: 4, 5, 6, 7 */
+extern const uint64_t ff_pw_8tob;   /* 16-bit lanes, low to high: 8, 9, 10, 11 */
+extern const uint64_t ff_pw_ctof;   /* 16-bit lanes, low to high: 12, 13, 14, 15 */
+
+extern const uint64_t ff_pb_1;      /* ff_pb_XX: eight 8-bit lanes, each holding 0xXX (hex) */
+extern const uint64_t ff_pb_3;
+extern const uint64_t ff_pb_80;
+extern const uint64_t ff_pb_A1;
+extern const uint64_t ff_pb_FE;
+
+extern const uint64_t ff_rnd;       /* four 16-bit lanes, each 4 */
+extern const uint64_t ff_rnd2;      /* four 16-bit lanes, each 64 */
+extern const uint64_t ff_rnd3;      /* four 16-bit lanes, each 32 */
+
+extern const uint64_t ff_wm1010;    /* 0xFFFF0000FFFF0000: alternating all-zero / all-one 16-bit lanes */
+extern const uint64_t ff_d40000;    /* 0x0000000000040000: 0x00040000 in the low 32 bits */
+
+#endif /* AVCODEC_MIPS_CONSTANTS_H */
diff --git a/libavcodec/mips/fft_mips.c b/libavcodec/mips/fft_mips.c
new file mode 100644
index 0000000..03dcbad
--- /dev/null
+++ b/libavcodec/mips/fft_mips.c
@@ -0,0 +1,517 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Stanislav Ocovaj (socovaj@mips.com)
+ * Author:  Zoran Lukic (zoranl@mips.com)
+ *
+ * Optimized MDCT/IMDCT and FFT transforms
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "config.h"
+#include "libavcodec/fft.h"
+#include "libavcodec/fft_table.h"
+#include "libavutil/mips/asmdefs.h"
+
+/**
+ * FFT transform
+ */
+
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+static void ff_fft_calc_mips(FFTContext *s, FFTComplex *z)
+{
+    int nbits, i, n, num_transforms, offset, step;
+    int n4, n2, n34;
+    FFTSample tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+    FFTComplex *tmpz;
+    float w_re, w_im;
+    float *w_re_ptr, *w_im_ptr;
+    const int fft_size = (1 << s->nbits);
+    float pom,  pom1,  pom2,  pom3;
+    float temp, temp1, temp3, temp4;
+    FFTComplex * tmpz_n2, * tmpz_n34, * tmpz_n4;
+    FFTComplex * tmpz_n2_i, * tmpz_n34_i, * tmpz_n4_i, * tmpz_i;
+
+    num_transforms = (21845 >> (17 - s->nbits)) | 1;
+
+    for (n=0; n<num_transforms; n++) {
+        offset = ff_fft_offsets_lut[n] << 2;
+        tmpz = z + offset;
+
+        tmp1 = tmpz[0].re + tmpz[1].re;
+        tmp5 = tmpz[2].re + tmpz[3].re;
+        tmp2 = tmpz[0].im + tmpz[1].im;
+        tmp6 = tmpz[2].im + tmpz[3].im;
+        tmp3 = tmpz[0].re - tmpz[1].re;
+        tmp8 = tmpz[2].im - tmpz[3].im;
+        tmp4 = tmpz[0].im - tmpz[1].im;
+        tmp7 = tmpz[2].re - tmpz[3].re;
+
+        tmpz[0].re = tmp1 + tmp5;
+        tmpz[2].re = tmp1 - tmp5;
+        tmpz[0].im = tmp2 + tmp6;
+        tmpz[2].im = tmp2 - tmp6;
+        tmpz[1].re = tmp3 + tmp8;
+        tmpz[3].re = tmp3 - tmp8;
+        tmpz[1].im = tmp4 - tmp7;
+        tmpz[3].im = tmp4 + tmp7;
+
+    }
+
+    if (fft_size < 8)
+        return;
+
+    num_transforms = (num_transforms >> 1) | 1;
+
+    for (n=0; n<num_transforms; n++) {
+        offset = ff_fft_offsets_lut[n] << 3;
+        tmpz = z + offset;
+
+        __asm__ volatile (
+            "lwc1  %[tmp1], 32(%[tmpz])                     \n\t"
+            "lwc1  %[pom],  40(%[tmpz])                     \n\t"
+            "lwc1  %[tmp3], 48(%[tmpz])                     \n\t"
+            "lwc1  %[pom1], 56(%[tmpz])                     \n\t"
+            "lwc1  %[tmp2], 36(%[tmpz])                     \n\t"
+            "lwc1  %[pom2], 44(%[tmpz])                     \n\t"
+            "lwc1  %[pom3], 60(%[tmpz])                     \n\t"
+            "lwc1  %[tmp4], 52(%[tmpz])                     \n\t"
+            "add.s %[tmp1], %[tmp1],    %[pom]              \n\t"  // tmp1 = tmpz[4].re + tmpz[5].re;
+            "add.s %[tmp3], %[tmp3],    %[pom1]             \n\t"  // tmp3 = tmpz[6].re + tmpz[7].re;
+            "add.s %[tmp2], %[tmp2],    %[pom2]             \n\t"  // tmp2 = tmpz[4].im + tmpz[5].im;
+            "lwc1  %[pom],  40(%[tmpz])                     \n\t"
+            "add.s %[tmp4], %[tmp4],    %[pom3]             \n\t"  // tmp4 = tmpz[6].im + tmpz[7].im;
+            "add.s %[tmp5], %[tmp1],    %[tmp3]             \n\t"  // tmp5 = tmp1 + tmp3;
+            "sub.s %[tmp7], %[tmp1],    %[tmp3]             \n\t"  // tmp7 = tmp1 - tmp3;
+            "lwc1  %[tmp1], 32(%[tmpz])                     \n\t"
+            "lwc1  %[pom1], 44(%[tmpz])                     \n\t"
+            "add.s %[tmp6], %[tmp2],    %[tmp4]             \n\t"  // tmp6 = tmp2 + tmp4;
+            "sub.s %[tmp8], %[tmp2],    %[tmp4]             \n\t"  // tmp8 = tmp2 - tmp4;
+            "lwc1  %[tmp2], 36(%[tmpz])                     \n\t"
+            "lwc1  %[pom2], 56(%[tmpz])                     \n\t"
+            "lwc1  %[pom3], 60(%[tmpz])                     \n\t"
+            "lwc1  %[tmp3], 48(%[tmpz])                     \n\t"
+            "lwc1  %[tmp4], 52(%[tmpz])                     \n\t"
+            "sub.s %[tmp1], %[tmp1],    %[pom]              \n\t"  // tmp1 = tmpz[4].re - tmpz[5].re;
+            "lwc1  %[pom],  0(%[tmpz])                      \n\t"
+            "sub.s %[tmp2], %[tmp2],    %[pom1]             \n\t"  // tmp2 = tmpz[4].im - tmpz[5].im;
+            "sub.s %[tmp3], %[tmp3],    %[pom2]             \n\t"  // tmp3 = tmpz[6].re - tmpz[7].re;
+            "lwc1  %[pom2], 4(%[tmpz])                      \n\t"
+            "sub.s %[pom1], %[pom],     %[tmp5]             \n\t"
+            "sub.s %[tmp4], %[tmp4],    %[pom3]             \n\t"  // tmp4 = tmpz[6].im - tmpz[7].im;
+            "add.s %[pom3], %[pom],     %[tmp5]             \n\t"
+            "sub.s %[pom],  %[pom2],    %[tmp6]             \n\t"
+            "add.s %[pom2], %[pom2],    %[tmp6]             \n\t"
+            "swc1  %[pom1], 32(%[tmpz])                     \n\t"  // tmpz[4].re = tmpz[0].re - tmp5;
+            "swc1  %[pom3], 0(%[tmpz])                      \n\t"  // tmpz[0].re = tmpz[0].re + tmp5;
+            "swc1  %[pom],  36(%[tmpz])                     \n\t"  // tmpz[4].im = tmpz[0].im - tmp6;
+            "swc1  %[pom2], 4(%[tmpz])                      \n\t"  // tmpz[0].im = tmpz[0].im + tmp6;
+            "lwc1  %[pom1], 16(%[tmpz])                     \n\t"
+            "lwc1  %[pom3], 20(%[tmpz])                     \n\t"
+            "li.s  %[pom],  0.7071067812                    \n\t"  // float pom = 0.7071067812f;
+            "add.s %[temp1],%[tmp1],    %[tmp2]             \n\t"
+            "sub.s %[temp], %[pom1],    %[tmp8]             \n\t"
+            "add.s %[pom2], %[pom3],    %[tmp7]             \n\t"
+            "sub.s %[temp3],%[tmp3],    %[tmp4]             \n\t"
+            "sub.s %[temp4],%[tmp2],    %[tmp1]             \n\t"
+            "swc1  %[temp], 48(%[tmpz])                     \n\t"  // tmpz[6].re = tmpz[2].re - tmp8;
+            "swc1  %[pom2], 52(%[tmpz])                     \n\t"  // tmpz[6].im = tmpz[2].im + tmp7;
+            "add.s %[pom1], %[pom1],    %[tmp8]             \n\t"
+            "sub.s %[pom3], %[pom3],    %[tmp7]             \n\t"
+            "add.s %[tmp3], %[tmp3],    %[tmp4]             \n\t"
+            "mul.s %[tmp5], %[pom],     %[temp1]            \n\t"  // tmp5 = pom * (tmp1 + tmp2);
+            "mul.s %[tmp7], %[pom],     %[temp3]            \n\t"  // tmp7 = pom * (tmp3 - tmp4);
+            "mul.s %[tmp6], %[pom],     %[temp4]            \n\t"  // tmp6 = pom * (tmp2 - tmp1);
+            "mul.s %[tmp8], %[pom],     %[tmp3]             \n\t"  // tmp8 = pom * (tmp3 + tmp4);
+            "swc1  %[pom1], 16(%[tmpz])                     \n\t"  // tmpz[2].re = tmpz[2].re + tmp8;
+            "swc1  %[pom3], 20(%[tmpz])                     \n\t"  // tmpz[2].im = tmpz[2].im - tmp7;
+            "add.s %[tmp1], %[tmp5],    %[tmp7]             \n\t"  // tmp1 = tmp5 + tmp7;
+            "sub.s %[tmp3], %[tmp5],    %[tmp7]             \n\t"  // tmp3 = tmp5 - tmp7;
+            "add.s %[tmp2], %[tmp6],    %[tmp8]             \n\t"  // tmp2 = tmp6 + tmp8;
+            "sub.s %[tmp4], %[tmp6],    %[tmp8]             \n\t"  // tmp4 = tmp6 - tmp8;
+            "lwc1  %[temp], 8(%[tmpz])                      \n\t"
+            "lwc1  %[temp1],12(%[tmpz])                     \n\t"
+            "lwc1  %[pom],  24(%[tmpz])                     \n\t"
+            "lwc1  %[pom2], 28(%[tmpz])                     \n\t"
+            "sub.s %[temp4],%[temp],    %[tmp1]             \n\t"
+            "sub.s %[temp3],%[temp1],   %[tmp2]             \n\t"
+            "add.s %[temp], %[temp],    %[tmp1]             \n\t"
+            "add.s %[temp1],%[temp1],   %[tmp2]             \n\t"
+            "sub.s %[pom1], %[pom],     %[tmp4]             \n\t"
+            "add.s %[pom3], %[pom2],    %[tmp3]             \n\t"
+            "add.s %[pom],  %[pom],     %[tmp4]             \n\t"
+            "sub.s %[pom2], %[pom2],    %[tmp3]             \n\t"
+            "swc1  %[temp4],40(%[tmpz])                     \n\t"  // tmpz[5].re = tmpz[1].re - tmp1;
+            "swc1  %[temp3],44(%[tmpz])                     \n\t"  // tmpz[5].im = tmpz[1].im - tmp2;
+            "swc1  %[temp], 8(%[tmpz])                      \n\t"  // tmpz[1].re = tmpz[1].re + tmp1;
+            "swc1  %[temp1],12(%[tmpz])                     \n\t"  // tmpz[1].im = tmpz[1].im + tmp2;
+            "swc1  %[pom1], 56(%[tmpz])                     \n\t"  // tmpz[7].re = tmpz[3].re - tmp4;
+            "swc1  %[pom3], 60(%[tmpz])                     \n\t"  // tmpz[7].im = tmpz[3].im + tmp3;
+            "swc1  %[pom],  24(%[tmpz])                     \n\t"  // tmpz[3].re = tmpz[3].re + tmp4;
+            "swc1  %[pom2], 28(%[tmpz])                     \n\t"  // tmpz[3].im = tmpz[3].im - tmp3;
+            : [tmp1]"=&f"(tmp1), [pom]"=&f"(pom),   [pom1]"=&f"(pom1), [pom2]"=&f"(pom2),
+              [tmp3]"=&f"(tmp3), [tmp2]"=&f"(tmp2), [tmp4]"=&f"(tmp4), [tmp5]"=&f"(tmp5),  [tmp7]"=&f"(tmp7),
+              [tmp6]"=&f"(tmp6), [tmp8]"=&f"(tmp8), [pom3]"=&f"(pom3),[temp]"=&f"(temp), [temp1]"=&f"(temp1),
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4)
+            : [tmpz]"r"(tmpz)
+            : "memory"
+        );
+    }
+
+    step = 1 << (MAX_LOG2_NFFT - 4);
+    n4 = 4;
+
+    for (nbits=4; nbits<=s->nbits; nbits++) {
+        num_transforms = (num_transforms >> 1) | 1;
+        n2  = 2 * n4;
+        n34 = 3 * n4;
+
+        for (n=0; n<num_transforms; n++) {
+            offset = ff_fft_offsets_lut[n] << nbits;
+            tmpz = z + offset;
+
+            tmpz_n2  = tmpz +  n2;
+            tmpz_n4  = tmpz +  n4;
+            tmpz_n34 = tmpz +  n34;
+
+            __asm__ volatile (
+                "lwc1  %[pom1], 0(%[tmpz_n2])            \n\t"
+                "lwc1  %[pom],  0(%[tmpz_n34])           \n\t"
+                "lwc1  %[pom2], 4(%[tmpz_n2])            \n\t"
+                "lwc1  %[pom3], 4(%[tmpz_n34])           \n\t"
+                "lwc1  %[temp1],0(%[tmpz])               \n\t"
+                "lwc1  %[temp3],4(%[tmpz])               \n\t"
+                "add.s %[tmp5], %[pom1],      %[pom]     \n\t"   //  tmp5 = tmpz[ n2].re + tmpz[n34].re;
+                "sub.s %[tmp1], %[pom1],      %[pom]     \n\t"   //  tmp1 = tmpz[ n2].re - tmpz[n34].re;
+                "add.s %[tmp6], %[pom2],      %[pom3]    \n\t"   //  tmp6 = tmpz[ n2].im + tmpz[n34].im;
+                "sub.s %[tmp2], %[pom2],      %[pom3]    \n\t"   //  tmp2 = tmpz[ n2].im - tmpz[n34].im;
+                "sub.s %[temp], %[temp1],     %[tmp5]    \n\t"
+                "add.s %[temp1],%[temp1],     %[tmp5]    \n\t"
+                "sub.s %[temp4],%[temp3],     %[tmp6]    \n\t"
+                "add.s %[temp3],%[temp3],     %[tmp6]    \n\t"
+                "swc1  %[temp], 0(%[tmpz_n2])            \n\t"   //  tmpz[ n2].re = tmpz[ 0].re - tmp5;
+                "swc1  %[temp1],0(%[tmpz])               \n\t"   //  tmpz[  0].re = tmpz[ 0].re + tmp5;
+                "lwc1  %[pom1], 0(%[tmpz_n4])            \n\t"
+                "swc1  %[temp4],4(%[tmpz_n2])            \n\t"   //  tmpz[ n2].im = tmpz[ 0].im - tmp6;
+                "lwc1  %[temp], 4(%[tmpz_n4])            \n\t"
+                "swc1  %[temp3],4(%[tmpz])               \n\t"   //  tmpz[  0].im = tmpz[ 0].im + tmp6;
+                "sub.s %[pom],  %[pom1],      %[tmp2]    \n\t"
+                "add.s %[pom1], %[pom1],      %[tmp2]    \n\t"
+                "add.s %[temp1],%[temp],      %[tmp1]    \n\t"
+                "sub.s %[temp], %[temp],      %[tmp1]    \n\t"
+                "swc1  %[pom],  0(%[tmpz_n34])           \n\t"   //  tmpz[n34].re = tmpz[n4].re - tmp2;
+                "swc1  %[pom1], 0(%[tmpz_n4])            \n\t"   //  tmpz[ n4].re = tmpz[n4].re + tmp2;
+                "swc1  %[temp1],4(%[tmpz_n34])           \n\t"   //  tmpz[n34].im = tmpz[n4].im + tmp1;
+                "swc1  %[temp], 4(%[tmpz_n4])            \n\t"   //  tmpz[ n4].im = tmpz[n4].im - tmp1;
+                : [tmp5]"=&f"(tmp5),
+                  [tmp1]"=&f"(tmp1), [pom]"=&f"(pom),        [pom1]"=&f"(pom1),        [pom2]"=&f"(pom2),
+                  [tmp2]"=&f"(tmp2), [tmp6]"=&f"(tmp6),          [pom3]"=&f"(pom3),
+                  [temp]"=&f"(temp), [temp1]"=&f"(temp1),     [temp3]"=&f"(temp3),       [temp4]"=&f"(temp4)
+                : [tmpz]"r"(tmpz), [tmpz_n2]"r"(tmpz_n2), [tmpz_n34]"r"(tmpz_n34), [tmpz_n4]"r"(tmpz_n4)
+                : "memory"
+            );
+
+            w_re_ptr = (float*)(ff_cos_131072 + step);
+            w_im_ptr = (float*)(ff_cos_131072 + MAX_FFT_SIZE/4 - step);
+
+            for (i=1; i<n4; i++) {
+                w_re = w_re_ptr[0];
+                w_im = w_im_ptr[0];
+                tmpz_n2_i = tmpz_n2  + i;
+                tmpz_n4_i = tmpz_n4  + i;
+                tmpz_n34_i= tmpz_n34 + i;
+                tmpz_i    = tmpz     + i;
+
+                __asm__ volatile (
+                    "lwc1     %[temp],  0(%[tmpz_n2_i])               \n\t"
+                    "lwc1     %[temp1], 4(%[tmpz_n2_i])               \n\t"
+                    "lwc1     %[pom],   0(%[tmpz_n34_i])              \n\t"
+                    "lwc1     %[pom1],  4(%[tmpz_n34_i])              \n\t"
+                    "mul.s    %[temp3], %[w_im],    %[temp]           \n\t"
+                    "mul.s    %[temp4], %[w_im],    %[temp1]          \n\t"
+                    "mul.s    %[pom2],  %[w_im],    %[pom1]           \n\t"
+                    "mul.s    %[pom3],  %[w_im],    %[pom]            \n\t"
+                    "msub.s   %[tmp2],  %[temp3],   %[w_re], %[temp1] \n\t"  // tmp2 = w_re * tmpz[ n2+i].im - w_im * tmpz[ n2+i].re;
+                    "madd.s   %[tmp1],  %[temp4],   %[w_re], %[temp]  \n\t"  // tmp1 = w_re * tmpz[ n2+i].re + w_im * tmpz[ n2+i].im;
+                    "msub.s   %[tmp3],  %[pom2],    %[w_re], %[pom]   \n\t"  // tmp3 = w_re * tmpz[n34+i].re - w_im * tmpz[n34+i].im;
+                    "madd.s   %[tmp4],  %[pom3],    %[w_re], %[pom1]  \n\t"  // tmp4 = w_re * tmpz[n34+i].im + w_im * tmpz[n34+i].re;
+                    "lwc1     %[temp],  0(%[tmpz_i])                  \n\t"
+                    "lwc1     %[pom],   4(%[tmpz_i])                  \n\t"
+                    "add.s    %[tmp5],  %[tmp1],    %[tmp3]           \n\t"  // tmp5 = tmp1 + tmp3;
+                    "sub.s    %[tmp1],  %[tmp1],    %[tmp3]           \n\t"  // tmp1 = tmp1 - tmp3;
+                    "add.s    %[tmp6],  %[tmp2],    %[tmp4]           \n\t"  // tmp6 = tmp2 + tmp4;
+                    "sub.s    %[tmp2],  %[tmp2],    %[tmp4]           \n\t"  // tmp2 = tmp2 - tmp4;
+                    "sub.s    %[temp1], %[temp],    %[tmp5]           \n\t"
+                    "add.s    %[temp],  %[temp],    %[tmp5]           \n\t"
+                    "sub.s    %[pom1],  %[pom],     %[tmp6]           \n\t"
+                    "add.s    %[pom],   %[pom],     %[tmp6]           \n\t"
+                    "lwc1     %[temp3], 0(%[tmpz_n4_i])               \n\t"
+                    "lwc1     %[pom2],  4(%[tmpz_n4_i])               \n\t"
+                    "swc1     %[temp1], 0(%[tmpz_n2_i])               \n\t"  // tmpz[ n2+i].re = tmpz[   i].re - tmp5;
+                    "swc1     %[temp],  0(%[tmpz_i])                  \n\t"  // tmpz[    i].re = tmpz[   i].re + tmp5;
+                    "swc1     %[pom1],  4(%[tmpz_n2_i])               \n\t"  // tmpz[ n2+i].im = tmpz[   i].im - tmp6;
+                    "swc1     %[pom] ,  4(%[tmpz_i])                  \n\t"  // tmpz[    i].im = tmpz[   i].im + tmp6;
+                    "sub.s    %[temp4], %[temp3],   %[tmp2]           \n\t"
+                    "add.s    %[pom3],  %[pom2],    %[tmp1]           \n\t"
+                    "add.s    %[temp3], %[temp3],   %[tmp2]           \n\t"
+                    "sub.s    %[pom2],  %[pom2],    %[tmp1]           \n\t"
+                    "swc1     %[temp4], 0(%[tmpz_n34_i])              \n\t"  // tmpz[n34+i].re = tmpz[n4+i].re - tmp2;
+                    "swc1     %[pom3],  4(%[tmpz_n34_i])              \n\t"  // tmpz[n34+i].im = tmpz[n4+i].im + tmp1;
+                    "swc1     %[temp3], 0(%[tmpz_n4_i])               \n\t"  // tmpz[ n4+i].re = tmpz[n4+i].re + tmp2;
+                    "swc1     %[pom2],  4(%[tmpz_n4_i])               \n\t"  // tmpz[ n4+i].im = tmpz[n4+i].im - tmp1;
+                    : [tmp1]"=&f"(tmp1), [tmp2]"=&f" (tmp2), [temp]"=&f"(temp), [tmp3]"=&f"(tmp3),
+                      [tmp4]"=&f"(tmp4), [tmp5]"=&f"(tmp5), [tmp6]"=&f"(tmp6),
+                      [temp1]"=&f"(temp1), [temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
+                      [pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2), [pom3]"=&f"(pom3)
+                    : [w_re]"f"(w_re), [w_im]"f"(w_im),
+                      [tmpz_i]"r"(tmpz_i),[tmpz_n2_i]"r"(tmpz_n2_i),
+                      [tmpz_n34_i]"r"(tmpz_n34_i), [tmpz_n4_i]"r"(tmpz_n4_i)
+                    : "memory"
+                );
+                w_re_ptr += step;
+                w_im_ptr -= step;
+            }
+        }
+        step >>= 1;
+        n4   <<= 1;
+    }
+}
+
+/**
+ * MDCT/IMDCT transforms.
+ */
+
+static void ff_imdct_half_mips(FFTContext *s, FFTSample *output, const FFTSample *input)
+{   /* IMDCT half transform: pre-rotate input into z, run s->fft_calc(s, z), then post-rotate z in place. */
+    int k, n8, n4, n2, n, j;
+    const uint16_t *revtab = s->revtab;
+    const FFTSample *tcos = s->tcos;
+    const FFTSample *tsin = s->tsin;
+    const FFTSample *in1, *in2, *in3, *in4;
+    FFTComplex *z = (FFTComplex *)output; /* the output buffer is used as the complex work array */
+
+    int j1;
+    const float *tcos1, *tsin1, *tcos2, *tsin2;
+    float temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
+        temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
+    FFTComplex *z1, *z2;
+
+    n = 1 << s->mdct_bits;
+    n2 = n >> 1;
+    n4 = n >> 2;
+    n8 = n >> 3;
+
+    /* pre rotation: two rotated pairs per iteration, scattered to z[revtab[k]] and z[revtab[k + 1]] */
+    in1 = input;              /* in1 and in3 move forward, 16 bytes per iteration (stepped inside the asm) */
+    in2 = input + n2 - 1;     /* in2 and in4 move backward by the same amount */
+    in3 = input + 2;
+    in4 = input + n2 - 3;
+
+    tcos1 = tcos;             /* equals tcos + k at the top of every iteration */
+    tsin1 = tsin;             /* equals tsin + k at the top of every iteration */
+
+    /* n4 = 64 or 128 */
+    for(k = 0; k < n4; k += 2) {
+        j  = revtab[k    ];   /* destination index in z for output k */
+        j1 = revtab[k + 1];   /* destination index in z for output k + 1 */
+
+        __asm__ volatile (
+            "lwc1           %[temp1],       0(%[in2])                           \t\n"
+            "lwc1           %[temp2],       0(%[tcos1])                         \t\n"
+            "lwc1           %[temp3],       0(%[tsin1])                         \t\n"
+            "lwc1           %[temp4],       0(%[in1])                           \t\n"
+            "lwc1           %[temp5],       0(%[in4])                           \t\n"
+            "mul.s          %[temp9],       %[temp1],   %[temp2]                \t\n"
+            "mul.s          %[temp10],      %[temp1],   %[temp3]                \t\n"
+            "lwc1           %[temp6],       4(%[tcos1])                         \t\n"
+            "lwc1           %[temp7],       4(%[tsin1])                         \t\n"
+            "nmsub.s        %[temp9],       %[temp9],   %[temp4],   %[temp3]    \t\n"  // temp9  = *in2 * tcos[k]   - *in1 * tsin[k]
+            "madd.s         %[temp10],      %[temp10],  %[temp4],   %[temp2]    \t\n"  // temp10 = *in2 * tsin[k]   + *in1 * tcos[k]
+            "mul.s          %[temp11],      %[temp5],   %[temp6]                \t\n"
+            "mul.s          %[temp12],      %[temp5],   %[temp7]                \t\n"
+            "lwc1           %[temp8],       0(%[in3])                           \t\n"
+            PTR_ADDIU "     %[tcos1],       %[tcos1],   8                       \t\n"  // tables: two entries consumed
+            PTR_ADDIU "     %[tsin1],       %[tsin1],   8                       \t\n"
+            PTR_ADDIU "     %[in1],         %[in1],     16                      \t\n"
+            "nmsub.s        %[temp11],      %[temp11],  %[temp8],   %[temp7]    \t\n"  // temp11 = *in4 * tcos[k+1] - *in3 * tsin[k+1]
+            "madd.s         %[temp12],      %[temp12],  %[temp8],   %[temp6]    \t\n"  // temp12 = *in4 * tsin[k+1] + *in3 * tcos[k+1]
+            PTR_ADDIU "     %[in2],         %[in2],     -16                     \t\n"
+            PTR_ADDIU "     %[in3],         %[in3],     16                      \t\n"
+            PTR_ADDIU "     %[in4],         %[in4],     -16                     \t\n"
+
+            : [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
+              [temp5]"=&f"(temp5), [temp6]"=&f"(temp6),
+              [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
+              [temp9]"=&f"(temp9), [temp10]"=&f"(temp10),
+              [temp11]"=&f"(temp11), [temp12]"=&f"(temp12),
+              [tsin1]"+r"(tsin1), [tcos1]"+r"(tcos1), /* read-write: the asm advances these pointers */
+              [in1]"+r"(in1), [in2]"+r"(in2),
+              [in3]"+r"(in3), [in4]"+r"(in4)
+            :
+            : "memory"
+        );
+
+        z[j ].re = temp9;
+        z[j ].im = temp10;
+        z[j1].re = temp11;
+        z[j1].im = temp12;
+    }
+
+    s->fft_calc(s, z);
+
+    /* post rotation + reordering: works outward from z[n8], two elements on each side per iteration */
+    /* n8 = 32 or 64 */
+    for(k = 0; k < n8; k += 2) {
+        tcos1 = &tcos[n8 - k - 2];
+        tsin1 = &tsin[n8 - k - 2];
+        tcos2 = &tcos[n8 + k];
+        tsin2 = &tsin[n8 + k];
+        z1 = &z[n8 - k - 2];  /* lower pair, moving toward z[0] */
+        z2 = &z[n8 + k    ];  /* upper pair, moving toward z[n4 - 1] */
+
+        __asm__ volatile (
+            "lwc1       %[temp1],   12(%[z1])                           \t\n"  // z1[1].im (assumes FFTComplex is two 32-bit floats, re then im)
+            "lwc1       %[temp2],   4(%[tsin1])                         \t\n"
+            "lwc1       %[temp3],   4(%[tcos1])                         \t\n"
+            "lwc1       %[temp4],   8(%[z1])                            \t\n"  // z1[1].re
+            "lwc1       %[temp5],   4(%[z1])                            \t\n"  // z1[0].im
+            "mul.s      %[temp9],   %[temp1],   %[temp2]                \t\n"
+            "mul.s      %[temp10],  %[temp1],   %[temp3]                \t\n"
+            "lwc1       %[temp6],   0(%[tsin1])                         \t\n"
+            "lwc1       %[temp7],   0(%[tcos1])                         \t\n"
+            "nmsub.s    %[temp9],   %[temp9],   %[temp4],   %[temp3]    \t\n"  // temp9  = z1[1].im * tsin1[1] - z1[1].re * tcos1[1]
+            "madd.s     %[temp10],  %[temp10],  %[temp4],   %[temp2]    \t\n"  // temp10 = z1[1].im * tcos1[1] + z1[1].re * tsin1[1]
+            "mul.s      %[temp11],  %[temp5],   %[temp6]                \t\n"
+            "mul.s      %[temp12],  %[temp5],   %[temp7]                \t\n"
+            "lwc1       %[temp8],   0(%[z1])                            \t\n"  // z1[0].re
+            "lwc1       %[temp1],   4(%[z2])                            \t\n"  // z2[0].im (temp1..temp3 are reused from here on)
+            "lwc1       %[temp2],   0(%[tsin2])                         \t\n"
+            "lwc1       %[temp3],   0(%[tcos2])                         \t\n"
+            "nmsub.s    %[temp11],  %[temp11],  %[temp8],   %[temp7]    \t\n"  // temp11 = z1[0].im * tsin1[0] - z1[0].re * tcos1[0]
+            "madd.s     %[temp12],  %[temp12],  %[temp8],   %[temp6]    \t\n"  // temp12 = z1[0].im * tcos1[0] + z1[0].re * tsin1[0]
+            "mul.s      %[temp13],  %[temp1],   %[temp2]                \t\n"
+            "mul.s      %[temp14],  %[temp1],   %[temp3]                \t\n"
+            "lwc1       %[temp4],   0(%[z2])                            \t\n"  // z2[0].re
+            "lwc1       %[temp5],   12(%[z2])                           \t\n"  // z2[1].im
+            "lwc1       %[temp6],   4(%[tsin2])                         \t\n"
+            "lwc1       %[temp7],   4(%[tcos2])                         \t\n"
+            "nmsub.s    %[temp13],  %[temp13],  %[temp4],   %[temp3]    \t\n"  // temp13 = z2[0].im * tsin2[0] - z2[0].re * tcos2[0]
+            "madd.s     %[temp14],  %[temp14],  %[temp4],   %[temp2]    \t\n"  // temp14 = z2[0].im * tcos2[0] + z2[0].re * tsin2[0]
+            "mul.s      %[temp15],  %[temp5],   %[temp6]                \t\n"
+            "mul.s      %[temp16],  %[temp5],   %[temp7]                \t\n"
+            "lwc1       %[temp8],   8(%[z2])                            \t\n"  // z2[1].re
+            "nmsub.s    %[temp15],  %[temp15],  %[temp8],   %[temp7]    \t\n"  // temp15 = z2[1].im * tsin2[1] - z2[1].re * tcos2[1]
+            "madd.s     %[temp16],  %[temp16],  %[temp8],   %[temp6]    \t\n"  // temp16 = z2[1].im * tcos2[1] + z2[1].re * tsin2[1]
+            : [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
+              [temp5]"=&f"(temp5), [temp6]"=&f"(temp6),
+              [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
+              [temp9]"=&f"(temp9), [temp10]"=&f"(temp10),
+              [temp11]"=&f"(temp11), [temp12]"=&f"(temp12),
+              [temp13]"=&f"(temp13), [temp14]"=&f"(temp14),
+              [temp15]"=&f"(temp15), [temp16]"=&f"(temp16)
+            : [z1]"r"(z1), [z2]"r"(z2),
+              [tsin1]"r"(tsin1), [tcos1]"r"(tcos1),
+              [tsin2]"r"(tsin2), [tcos2]"r"(tcos2)
+            : "memory"
+        );
+
+        z1[1].re = temp9;
+        z1[1].im = temp14;    /* imaginary part comes from the value computed out of z2[0] */
+        z2[0].re = temp13;
+        z2[0].im = temp10;    /* ... and z2[0] takes the one computed out of z1[1] */
+
+        z1[0].re = temp11;
+        z1[0].im = temp16;    /* likewise z1[0] and z2[1] exchange imaginary parts */
+        z2[1].re = temp15;
+        z2[1].im = temp12;
+    }
+}
+
+/**
+ * Compute inverse MDCT of size N = 2^nbits.
+ *
+ * ff_imdct_half_mips() is run on output + N/4; the first and last quarters
+ * are then filled from output[N/4 .. 3N/4) by mirroring (first one negated).
+ *
+ * @param output N samples
+ * @param input N/2 samples
+ */
+static void ff_imdct_calc_mips(FFTContext *s, FFTSample *output, const FFTSample *input)
+{
+    const int n       = 1 << s->mdct_bits;
+    const int half    = n >> 1;
+    const int quarter = n >> 2;
+    int base, i;
+
+    ff_imdct_half_mips(s, output + quarter, input);
+
+    /* four samples per outer step at each end of the buffer */
+    for (base = 0; base < quarter; base += 4) {
+        for (i = 0; i < 4; i++)
+            output[base + i] = -output[half - base - i - 1];
+        for (i = 0; i < 4; i++)
+            output[n - base - i - 1] = output[half + base + i];
+    }
+}
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM */
+
+av_cold void ff_fft_init_mips(FFTContext *s)
+{
+    int lut_pos = 0;
+
+    /* Build the offset LUT (size 1 << 17) and the cosine tables up to index 17. */
+    ff_fft_lut_init(ff_fft_offsets_lut, 0, 1 << 17, &lut_pos);
+    ff_init_ff_cos_tabs(17);
+
+    /* The asm kernels are only compiled with inline asm on non-R6 cores. */
+#if HAVE_INLINE_ASM && !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+    s->fft_calc     = ff_fft_calc_mips;
+#if CONFIG_MDCT
+    s->imdct_calc   = ff_imdct_calc_mips;
+    s->imdct_half   = ff_imdct_half_mips;
+#endif /* CONFIG_MDCT */
+#endif /* HAVE_INLINE_ASM && !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+}
diff --git a/libavcodec/mips/fmtconvert_mips.c b/libavcodec/mips/fmtconvert_mips.c
new file mode 100644
index 0000000..9909584
--- /dev/null
+++ b/libavcodec/mips/fmtconvert_mips.c
@@ -0,0 +1,141 @@
+/*
+ * Format Conversion Utils for MIPS
+ *
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of is
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Zoran Lukic (zoranl@mips.com)
+ * Author:  Nedeljko Babic (nbabic@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "config.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/fmtconvert.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+static void int32_to_float_fmul_scalar_mips(float *dst, const int *src,
+        float mul, int len)
+{
+    /* dst[i] = src[i] * mul for 0 <= i < len; len must be a multiple of 8 */
+    float temp1, temp3, temp5, temp7, temp9, temp11, temp13, temp15;
+    int rpom1, rpom2, rpom11, rpom21, rpom12, rpom22, rpom13, rpom23;
+    const int *src_end = src + len;
+
+    /* The loop below is 8 times unrolled and compares src with src_end only
+     * after each pass, so it must not be entered when nothing is to be done. */
+    if (len <= 0)
+        return;
+
+    __asm__ volatile (
+        "i32tf_lp%=:                                    \n\t"
+        "lw       %[rpom11],     0(%[src])              \n\t"
+        "lw       %[rpom21],     4(%[src])              \n\t"
+        "lw       %[rpom1],      8(%[src])              \n\t"
+        "lw       %[rpom2],      12(%[src])             \n\t"
+        "mtc1     %[rpom11],     %[temp1]               \n\t"
+        "mtc1     %[rpom21],     %[temp3]               \n\t"
+        "mtc1     %[rpom1],      %[temp5]               \n\t"
+        "mtc1     %[rpom2],      %[temp7]               \n\t"
+
+        "lw       %[rpom13],     16(%[src])             \n\t"
+        "lw       %[rpom23],     20(%[src])             \n\t"
+        "lw       %[rpom12],     24(%[src])             \n\t"
+        "lw       %[rpom22],     28(%[src])             \n\t"
+        "mtc1     %[rpom13],     %[temp9]               \n\t"
+        "mtc1     %[rpom23],     %[temp11]              \n\t"
+        "mtc1     %[rpom12],     %[temp13]              \n\t"
+        "mtc1     %[rpom22],     %[temp15]              \n\t"
+
+        PTR_ADDIU "%[src],       32                     \n\t"
+        "cvt.s.w  %[temp1],      %[temp1]               \n\t"
+        "cvt.s.w  %[temp3],      %[temp3]               \n\t"
+        "cvt.s.w  %[temp5],      %[temp5]               \n\t"
+        "cvt.s.w  %[temp7],      %[temp7]               \n\t"
+
+        "cvt.s.w  %[temp9],      %[temp9]               \n\t"
+        "cvt.s.w  %[temp11],     %[temp11]              \n\t"
+        "cvt.s.w  %[temp13],     %[temp13]              \n\t"
+        "cvt.s.w  %[temp15],     %[temp15]              \n\t"
+
+        "mul.s   %[temp1],       %[temp1],    %[mul]    \n\t"
+        "mul.s   %[temp3],       %[temp3],    %[mul]    \n\t"
+        "mul.s   %[temp5],       %[temp5],    %[mul]    \n\t"
+        "mul.s   %[temp7],       %[temp7],    %[mul]    \n\t"
+
+        "mul.s   %[temp9],       %[temp9],    %[mul]    \n\t"
+        "mul.s   %[temp11],      %[temp11],   %[mul]    \n\t"
+        "mul.s   %[temp13],      %[temp13],   %[mul]    \n\t"
+        "mul.s   %[temp15],      %[temp15],   %[mul]    \n\t"
+
+        "swc1    %[temp1],       0(%[dst])              \n\t" /*dst[i] = src[i] * mul;    */
+        "swc1    %[temp3],       4(%[dst])              \n\t" /*dst[i+1] = src[i+1] * mul;*/
+        "swc1    %[temp5],       8(%[dst])              \n\t" /*dst[i+2] = src[i+2] * mul;*/
+        "swc1    %[temp7],       12(%[dst])             \n\t" /*dst[i+3] = src[i+3] * mul;*/
+
+        "swc1    %[temp9],       16(%[dst])             \n\t" /*dst[i+4] = src[i+4] * mul;*/
+        "swc1    %[temp11],      20(%[dst])             \n\t" /*dst[i+5] = src[i+5] * mul;*/
+        "swc1    %[temp13],      24(%[dst])             \n\t" /*dst[i+6] = src[i+6] * mul;*/
+        "swc1    %[temp15],      28(%[dst])             \n\t" /*dst[i+7] = src[i+7] * mul;*/
+        PTR_ADDIU "%[dst],       32                     \n\t"
+        "bne     %[src],        %[src_end], i32tf_lp%=  \n\t"
+        : [temp1]"=&f"(temp1),   [temp11]"=&f"(temp11),
+          [temp13]"=&f"(temp13), [temp15]"=&f"(temp15),
+          [temp3]"=&f"(temp3),   [temp5]"=&f"(temp5),
+          [temp7]"=&f"(temp7),   [temp9]"=&f"(temp9),
+          [rpom1]"=&r"(rpom1),   [rpom2]"=&r"(rpom2),
+          [rpom11]"=&r"(rpom11), [rpom21]"=&r"(rpom21),
+          [rpom12]"=&r"(rpom12), [rpom22]"=&r"(rpom22),
+          [rpom13]"=&r"(rpom13), [rpom23]"=&r"(rpom23),
+          [dst]"+r"(dst),       [src]"+r"(src)
+        : [mul]"f"(mul),        [src_end]"r"(src_end)
+        : "memory"
+    );
+}
+#endif /* HAVE_INLINE_ASM */
+
+av_cold void ff_fmt_convert_init_mips(FmtConvertContext *c)
+{   /* Installs the MIPS inline-asm converter into c; does nothing when inline asm is unavailable. */
+#if HAVE_INLINE_ASM
+    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_mips;
+#endif
+}
diff --git a/libavcodec/mips/h263dsp_init_mips.c b/libavcodec/mips/h263dsp_init_mips.c
new file mode 100644
index 0000000..09bd937
--- /dev/null
+++ b/libavcodec/mips/h263dsp_init_mips.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h263dsp_mips.h"
+
+#if HAVE_MSA
+static av_cold void h263dsp_init_msa(H263DSPContext *c)
+{   /* Points both H.263 loop-filter hooks of c at the MSA implementations. */
+    c->h263_h_loop_filter = ff_h263_h_loop_filter_msa;
+    c->h263_v_loop_filter = ff_h263_v_loop_filter_msa;
+}
+#endif  // #if HAVE_MSA
+
+av_cold void ff_h263dsp_init_mips(H263DSPContext *c)
+{   /* MIPS hook for H263DSPContext setup; c is left untouched unless HAVE_MSA is set. */
+#if HAVE_MSA
+    h263dsp_init_msa(c);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/h263dsp_mips.h b/libavcodec/mips/h263dsp_mips.h
new file mode 100644
index 0000000..99a43cd
--- /dev/null
+++ b/libavcodec/mips/h263dsp_mips.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_H263DSP_MIPS_H
+#define AVCODEC_MIPS_H263DSP_MIPS_H
+
+#include "libavcodec/mpegvideo.h"
+
+void ff_h263_h_loop_filter_msa(uint8_t *src, int stride, int q_scale); /* q_scale is used as an index into a 32-entry strength table */
+void ff_h263_v_loop_filter_msa(uint8_t *src, int stride, int q_scale); /* same parameters as the h variant */
+void ff_dct_unquantize_mpeg2_inter_msa(MpegEncContext *s, int16_t *block, /* this and the functions below are not defined in h263dsp_msa.c */
+                                       int32_t index, int32_t q_scale);
+void ff_dct_unquantize_h263_inter_msa(MpegEncContext *s, int16_t *block,
+                                      int32_t index, int32_t q_scale);
+void ff_dct_unquantize_h263_intra_msa(MpegEncContext *s, int16_t *block,
+                                      int32_t index, int32_t q_scale);
+int ff_pix_sum_msa(uint8_t *pix, int line_size);
+
+#endif  // #ifndef AVCODEC_MIPS_H263DSP_MIPS_H
diff --git a/libavcodec/mips/h263dsp_msa.c b/libavcodec/mips/h263dsp_msa.c
new file mode 100644
index 0000000..472bcbd
--- /dev/null
+++ b/libavcodec/mips/h263dsp_msa.c
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "h263dsp_mips.h"
+
+static const uint8_t h263_loop_filter_strength_msa[32] = { /* filter strength, looked up by qscale in both loop filters */
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 7, 7,
+    7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12
+};
+
+static void h263_h_loop_filter_msa(uint8_t *src, int32_t stride, int32_t qscale)
+{   /* H.263 loop filter, MSA version: reads eight vectors starting at src - 2 and writes back through two ST4x4_UB calls. */
+    int32_t strength = h263_loop_filter_strength_msa[qscale]; /* no range check: qscale must be 0..31 */
+    v16u8 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 temp0, temp1, temp2;
+    v8i16 diff0, diff2, diff4, diff6, diff8;
+    v8i16 d0, a_d0, str_x2, str;
+
+    src -= 2; /* start two bytes before src */
+    LD_UB8(src, stride, in0, in1, in2, in3, in4, in5, in6, in7); /* presumably eight rows, stride apart -- confirm against the macro */
+    TRANSPOSE8x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in3, in2, in1); /* note the output order: in0, in3, in2, in1 */
+
+    temp0 = (v8i16) __msa_ilvr_b((v16i8) in0, (v16i8) in1);
+    a_d0 = __msa_hsub_u_h((v16u8) temp0, (v16u8) temp0); /* 16-bit difference of the in0/in1 pair */
+    temp2 = (v8i16) __msa_ilvr_b((v16i8) in2, (v16i8) in3);
+    temp2 = __msa_hsub_u_h((v16u8) temp2, (v16u8) temp2); /* 16-bit difference of the in2/in3 pair */
+    temp2 <<= 2;
+    diff0 = a_d0 + temp2; /* a_d0 plus four times the in2/in3 difference */
+    diff2 = -(-diff0 >> 3); /* candidate result for lanes with diff0 < 0 (selected below) */
+    str_x2 = __msa_fill_h(-(strength << 1));
+    temp0 = (str_x2 <= diff2);
+    diff2 = (v8i16) __msa_bmz_v((v16u8) diff2, (v16u8) temp0, (v16u8) temp0); /* zero the lanes where the comparison failed */
+    temp2 = str_x2 - diff2;
+    str = __msa_fill_h(-strength);
+    temp0 = (diff2 < str);
+    diff2 = (v8i16) __msa_bmnz_v((v16u8) diff2, (v16u8) temp2, (v16u8) temp0); /* beyond -strength: use -2 * strength - diff2 */
+    diff4 = diff0 >> 3; /* candidate result for the remaining lanes */
+    str_x2 = __msa_fill_h(strength << 1);
+    temp0 = (diff4 <= str_x2);
+    diff4 = (v8i16) __msa_bmz_v((v16u8) diff4, (v16u8) temp0, (v16u8) temp0); /* zero the lanes where the comparison failed */
+    temp2 = str_x2 - diff4;
+    str = __msa_fill_h(strength);
+    temp0 = (str < diff4);
+    diff4 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) temp2, (v16u8) temp0); /* beyond strength: use 2 * strength - diff4 */
+    temp0 = __msa_clti_s_h(diff0, 0); /* lane mask: diff0 < 0 */
+    d0 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) diff2, (v16u8) temp0); /* diff2 where the mask is set, diff4 elsewhere */
+    diff2 = -diff2 >> 1;
+    diff4 >>= 1;
+    diff8 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) diff2, (v16u8) temp0); /* same per-lane selection on the halved values */
+    diff6 = (-a_d0) >> 2;
+    diff6 = -(diff6);
+    temp2 = -diff8;
+    temp0 = (diff6 < temp2);
+    diff6 = (v8i16) __msa_bmnz_v((v16u8) diff6, (v16u8) temp2, (v16u8) temp0); /* lower bound: diff6 = max(diff6, -diff8) */
+    diff2 = a_d0 >> 2;
+    temp0 = (diff2 <= diff8);
+    diff2 = (v8i16) __msa_bmz_v((v16u8) diff2, (v16u8) diff8, (v16u8) temp0); /* upper bound: diff2 = min(diff2, diff8) */
+    temp0 = __msa_clti_s_h(a_d0, 0); /* lane mask: a_d0 < 0 */
+    diff6 = (v8i16) __msa_bmz_v((v16u8) diff6, (v16u8) diff2, (v16u8) temp0); /* keep diff6 where a_d0 < 0, take diff2 elsewhere */
+    PCKEV_B2_SH(a_d0, diff6, a_d0, d0, diff6, d0);
+    in0 = (v16u8) ((v16i8) in0 - (v16i8) diff6);
+    in1 = (v16u8) ((v16i8) in1 + (v16i8) diff6);
+    in3 = __msa_xori_b(in3, 128); /* flip bit 7 around the signed saturating add; flipped back two lines below */
+    in3 = (v16u8) __msa_adds_s_b((v16i8) in3, (v16i8) d0);
+    in3 = __msa_xori_b(in3, 128);
+    in2 = __msa_subsus_u_b(in2, (v16i8) d0);
+    ILVR_B2_SH(in3, in0, in1, in2, temp0, temp1);
+    in0 = (v16u8) __msa_ilvr_h(temp1, temp0);
+    in3 = (v16u8) __msa_ilvl_h(temp1, temp0);
+    ST4x4_UB(in0, in0, 0, 1, 2, 3, src, stride);
+    src += 4 * stride;
+    ST4x4_UB(in3, in3, 0, 1, 2, 3, src, stride);
+    src += 4 * stride; /* NOTE(review): src is not used after this update */
+}
+
+static void h263_v_loop_filter_msa(uint8_t *src, int32_t stride, int32_t qscale)
+{   /* H.263 loop filter, MSA version: reads four vectors starting two rows above src and stores the low 64 bits of each back. */
+    int32_t strength = h263_loop_filter_strength_msa[qscale]; /* no range check: qscale must be 0..31 */
+    uint64_t res0, res1, res2, res3;
+    v16u8 in0, in1, in2, in3;
+    v8i16 temp0, temp2, diff0, diff2, diff4, diff6, diff8;
+    v8i16 d0, a_d0, str_x2, str;
+
+    src -= 2 * stride; /* start two rows above src */
+    LD_UB4(src, stride, in0, in3, in2, in1); /* note the destination order: in0, in3, in2, in1 */
+    temp0 = (v8i16) __msa_ilvr_b((v16i8) in0, (v16i8) in1);
+    a_d0 = __msa_hsub_u_h((v16u8) temp0, (v16u8) temp0); /* 16-bit difference of the in0/in1 pair */
+    temp2 = (v8i16) __msa_ilvr_b((v16i8) in2, (v16i8) in3);
+    temp2 = __msa_hsub_u_h((v16u8) temp2, (v16u8) temp2); /* 16-bit difference of the in2/in3 pair */
+    temp2 <<= 2;
+    diff0 = a_d0 + temp2; /* a_d0 plus four times the in2/in3 difference */
+    diff2 = -(-diff0 >> 3); /* candidate result for lanes with diff0 < 0 (selected below) */
+    str_x2 = __msa_fill_h(-(strength << 1));
+    temp0 = (str_x2 <= diff2);
+    diff2 = (v8i16) __msa_bmz_v((v16u8) diff2, (v16u8) temp0, (v16u8) temp0); /* zero the lanes where the comparison failed */
+    temp2 = str_x2 - diff2;
+    str = __msa_fill_h(-strength);
+    temp0 = (diff2 < str);
+    diff2 = (v8i16) __msa_bmnz_v((v16u8) diff2, (v16u8) temp2, (v16u8) temp0); /* beyond -strength: use -2 * strength - diff2 */
+    diff4 = diff0 >> 3; /* candidate result for the remaining lanes */
+    str_x2 = __msa_fill_h(strength << 1);
+    temp0 = (diff4 <= str_x2);
+    diff4 = (v8i16) __msa_bmz_v((v16u8) diff4, (v16u8) temp0, (v16u8) temp0); /* zero the lanes where the comparison failed */
+    temp2 = str_x2 - diff4;
+    str = __msa_fill_h(strength);
+    temp0 = (str < diff4);
+    diff4 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) temp2, (v16u8) temp0); /* beyond strength: use 2 * strength - diff4 */
+    temp0 = __msa_clti_s_h(diff0, 0); /* lane mask: diff0 < 0 */
+    d0 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) diff2, (v16u8) temp0); /* diff2 where the mask is set, diff4 elsewhere */
+    diff2 = -diff2 >> 1;
+    diff4 >>= 1;
+    diff8 = (v8i16) __msa_bmnz_v((v16u8) diff4, (v16u8) diff2, (v16u8) temp0); /* same per-lane selection on the halved values */
+    diff6 = (-a_d0) >> 2;
+    diff6 = -(diff6);
+    temp2 = -diff8;
+    temp0 = (diff6 < temp2);
+    diff6 = (v8i16) __msa_bmnz_v((v16u8) diff6, (v16u8) temp2, (v16u8) temp0); /* lower bound: diff6 = max(diff6, -diff8) */
+    diff2 = a_d0 >> 2;
+    temp0 = (diff2 <= diff8);
+    diff2 = (v8i16) __msa_bmz_v((v16u8) diff2, (v16u8) diff8, (v16u8) temp0); /* upper bound: diff2 = min(diff2, diff8) */
+    temp0 = __msa_clti_s_h(a_d0, 0); /* lane mask: a_d0 < 0 */
+    diff6 = (v8i16) __msa_bmz_v((v16u8) diff6, (v16u8) diff2, (v16u8) temp0); /* keep diff6 where a_d0 < 0, take diff2 elsewhere */
+    PCKEV_B2_SH(a_d0, diff6, a_d0, d0, diff6, d0);
+    in0 = (v16u8) ((v16i8) in0 - (v16i8) diff6);
+    in1 = (v16u8) ((v16i8) in1 + (v16i8) diff6);
+    in3 = __msa_xori_b(in3, 128); /* flip bit 7 around the signed saturating add; flipped back two lines below */
+    in3 = (v16u8) __msa_adds_s_b((v16i8) in3, (v16i8) d0);
+    in3 = __msa_xori_b(in3, 128);
+    in2 = __msa_subsus_u_b(in2, (v16i8) d0);
+    res0 = __msa_copy_u_d((v2i64) in0, 0); /* low 64 bits of each vector, in the same in0, in3, in2, in1 order as the load */
+    res1 = __msa_copy_u_d((v2i64) in3, 0);
+    res2 = __msa_copy_u_d((v2i64) in2, 0);
+    res3 = __msa_copy_u_d((v2i64) in1, 0);
+    SD4(res0, res1, res2, res3, src, stride); /* src still points two rows above the original position */
+}
+
+void ff_h263_h_loop_filter_msa(uint8_t *src, int32_t stride, int32_t q_scale)
+{   /* Exported entry point; forwards its arguments unchanged to the static implementation in this file. */
+    h263_h_loop_filter_msa(src, stride, q_scale);
+}
+
+void ff_h263_v_loop_filter_msa(uint8_t *src, int32_t stride, int32_t q_scale)
+{   /* Exported entry point; forwards its arguments unchanged to the static implementation in this file. */
+    h263_v_loop_filter_msa(src, stride, q_scale);
+}
diff --git a/libavcodec/mips/h264chroma_init_mips.c b/libavcodec/mips/h264chroma_init_mips.c
new file mode 100644
index 0000000..122148d
--- /dev/null
+++ b/libavcodec/mips/h264chroma_init_mips.c
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264chroma_mips.h"
+
+#if HAVE_MSA
+static av_cold void h264chroma_init_msa(H264ChromaContext *c, int bit_depth)
+{
+    /* Bit depths above 8 are left alone: the table entries already in c
+     * stay in place and nothing below is installed. */
+    if (bit_depth > 8)
+        return;
+
+    c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_msa;
+    c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_msa;
+    c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_msa;
+    c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_msa;
+    c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_msa;
+    c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_msa;
+}
+#endif  // #if HAVE_MSA
+
+#if HAVE_MMI
+static av_cold void h264chroma_init_mmi(H264ChromaContext *c, int bit_depth)
+{
+    /* bit depths above 8 keep whatever handlers are already installed */
+    if (bit_depth > 8)
+        return;
+
+    c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_mmi;
+    c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmi;
+    c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_mmi;
+    c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmi;
+}
+#endif /* HAVE_MMI */
+
+av_cold void ff_h264chroma_init_mips(H264ChromaContext *c, int bit_depth)
+{   /* Runs each enabled MIPS initialiser in turn; MMI runs last, so with both enabled its mc8/mc4 entries replace the MSA ones. */
+#if HAVE_MSA
+    h264chroma_init_msa(c, bit_depth);
+#endif  // #if HAVE_MSA
+#if HAVE_MMI
+    h264chroma_init_mmi(c, bit_depth);
+#endif /* HAVE_MMI */
+}
diff --git a/libavcodec/mips/h264chroma_mips.h b/libavcodec/mips/h264chroma_mips.h
new file mode 100644
index 0000000..0ef6c74
--- /dev/null
+++ b/libavcodec/mips/h264chroma_mips.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_H264CHROMA_MIPS_H
+#define AVCODEC_MIPS_H264CHROMA_MIPS_H
+/* MIPS chroma MC kernels (MSA group first); all take the same parameter types. */
+#include "libavcodec/h264.h"
+void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, int stride,
+                                int height, int x, int y);
+void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, int stride,
+                                int height, int x, int y);
+void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, int stride,
+                                int height, int x, int y);
+void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, int stride,
+                                int height, int x, int y);
+void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, int stride,
+                                int height, int x, int y);
+void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, int stride,
+                                int height, int x, int y);
+/* Loongson MMI kernels: only mc8 and mc4 variants are declared (no mc2). */
+void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
+        int h, int x, int y);
+void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
+        int h, int x, int y);
+void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
+        int h, int x, int y);
+void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
+        int h, int x, int y);
+
+#endif /* AVCODEC_MIPS_H264CHROMA_MIPS_H */
diff --git a/libavcodec/mips/h264chroma_mmi.c b/libavcodec/mips/h264chroma_mmi.c
new file mode 100644
index 0000000..3dd123d
--- /dev/null
+++ b/libavcodec/mips/h264chroma_mmi.c
@@ -0,0 +1,717 @@
+/*
+ * Loongson SIMD optimized h264chroma
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *                    Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264chroma_mips.h"
+#include "constants.h"
+#include "libavutil/mips/asmdefs.h"
+
+void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
+        int h, int x, int y)
+{ /* "put" kernel: writes h rows of 8 filtered bytes to dst, stride apart */
+    const int A = (8 - x) * (8 - y); /* weight of src[i] */
+    const int B = x * (8 - y);       /* weight of src[i + 1] */
+    const int C = (8 - x) * y;       /* weight of src[i + stride] */
+    const int D = x * y;             /* weight of src[i + stride + 1] */
+    const int E = B + C;             /* when D == 0: whichever of B, C is set */
+    double ftmp[10];                 /* homes for the asm's FP-register outputs */
+    uint64_t tmp[1];
+    mips_reg addr[1];
+    /* NOTE(review): the asm below writes to its input operands A..E (pshufh). */
+    if (D) { /* x and y both non-zero: blend all four neighbours */
+        __asm__ volatile (
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+            "dli        %[tmp0],    0x06                                \n\t"
+            "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
+            "pshufh     %[B],       %[B],           %[ftmp0]            \n\t"
+            "mtc1       %[tmp0],    %[ftmp9]                            \n\t"
+            "pshufh     %[C],       %[C],           %[ftmp0]            \n\t"
+            "pshufh     %[D],       %[D],           %[ftmp0]            \n\t"
+            "1:                                                         \n\t"
+            PTR_ADDU   "%[addr0],   %[src],         %[stride]           \n\t"
+            "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t"
+            "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t"
+            "gsldlc1    %[ftmp2],   0x08(%[src])                        \n\t"
+            "gsldrc1    %[ftmp2],   0x01(%[src])                        \n\t"
+            "gsldlc1    %[ftmp3],   0x07(%[addr0])                      \n\t"
+            "gsldrc1    %[ftmp3],   0x00(%[addr0])                      \n\t"
+            "gsldlc1    %[ftmp4],   0x08(%[addr0])                      \n\t"
+            "gsldrc1    %[ftmp4],   0x01(%[addr0])                      \n\t"
+            /* row 0: zero-extend bytes, scale src by A and src+1 by B */
+            "punpcklbh  %[ftmp5],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp6],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp7],   %[ftmp2],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp8],   %[ftmp2],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp5],   %[ftmp5],       %[A]                \n\t"
+            "pmullh     %[ftmp7],   %[ftmp7],       %[B]                \n\t"
+            "paddh      %[ftmp1],   %[ftmp5],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[A]                \n\t"
+            "pmullh     %[ftmp8],   %[ftmp8],       %[B]                \n\t"
+            "paddh      %[ftmp2],   %[ftmp6],       %[ftmp8]            \n\t"
+            /* row 1 (src + stride): same, scaled by C and D */
+            "punpcklbh  %[ftmp5],   %[ftmp3],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp6],   %[ftmp3],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp7],   %[ftmp4],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp5],   %[ftmp5],       %[C]                \n\t"
+            "pmullh     %[ftmp7],   %[ftmp7],       %[D]                \n\t"
+            "paddh      %[ftmp3],   %[ftmp5],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[C]                \n\t"
+            "pmullh     %[ftmp8],   %[ftmp8],       %[D]                \n\t"
+            "paddh      %[ftmp4],   %[ftmp6],       %[ftmp8]            \n\t"
+            /* sum rows, add ff_pw_32, >> 6, pack to bytes, store one row */
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_32]         \n\t"
+            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp9]            \n\t"
+            "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "addi       %[h],       %[h],           -0x01               \n\t"
+            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
+            "bnez       %[h],       1b                                  \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
+              [tmp0]"=&r"(tmp[0]),
+              [addr0]"=&r"(addr[0]),
+              [dst]"+&r"(dst),              [src]"+&r"(src),
+              [h]"+&r"(h)
+            : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
+              [A]"f"(A),                    [B]"f"(B),
+              [C]"f"(C),                    [D]"f"(D)
+            : "memory"
+        );
+    } else if (E) { /* only one of x, y non-zero: two-tap blend */
+        const int step = C ? stride : 1;
+        /* second tap: the row below when C != 0, else the next byte */
+        __asm__ volatile (
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+            "dli        %[tmp0],    0x06                                \n\t"
+            "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
+            "pshufh     %[E],       %[E],           %[ftmp0]            \n\t"
+            "mtc1       %[tmp0],    %[ftmp7]                            \n\t"
+            "1:                                                         \n\t"
+            PTR_ADDU   "%[addr0],   %[src],         %[step]             \n\t"
+            "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t"
+            "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t"
+            "gsldlc1    %[ftmp2],   0x07(%[addr0])                      \n\t"
+            "gsldrc1    %[ftmp2],   0x00(%[addr0])                      \n\t"
+            /* zero-extend both loads, scale src by A and src+step by E */
+            "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp5],   %[ftmp2],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp3],   %[ftmp3],       %[A]                \n\t"
+            "pmullh     %[ftmp5],   %[ftmp5],       %[E]                \n\t"
+            "paddh      %[ftmp1],   %[ftmp3],       %[ftmp5]            \n\t"
+            "pmullh     %[ftmp4],   %[ftmp4],       %[A]                \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[E]                \n\t"
+            "paddh      %[ftmp2],   %[ftmp4],       %[ftmp6]            \n\t"
+            /* add ff_pw_32, >> 6, pack to bytes, store one row */
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_32]         \n\t"
+            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "addi       %[h],       %[h],           -0x01               \n\t"
+            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
+            "bnez       %[h],       1b                                  \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [tmp0]"=&r"(tmp[0]),
+              [addr0]"=&r"(addr[0]),
+              [dst]"+&r"(dst),              [src]"+&r"(src),
+              [h]"+&r"(h)
+            : [stride]"r"((mips_reg)stride),[step]"r"((mips_reg)step),
+              [ff_pw_32]"f"(ff_pw_32),
+              [A]"f"(A),                    [E]"f"(E)
+            : "memory"
+        );
+    } else { /* x == y == 0: src-only path, two rows per loop iteration */
+        __asm__ volatile (
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+            "dli        %[tmp0],    0x06                                \n\t"
+            "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
+            "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
+            "1:                                                         \n\t"
+            "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t"
+            "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp1],   %[ftmp2],       %[A]                \n\t"
+            "pmullh     %[ftmp2],   %[ftmp3],       %[A]                \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_32]         \n\t"
+            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp4]            \n\t"
+            "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
+            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            /* second row of this iteration: same steps on the next line */
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
+            "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t"
+            "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp1],   %[ftmp2],       %[A]                \n\t"
+            "pmullh     %[ftmp2],   %[ftmp3],       %[A]                \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_32]         \n\t"
+            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp4]            \n\t"
+            "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "addi       %[h],       %[h],           -0x02               \n\t"
+            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            /* h drops by 2 per pass; bnez only exits at 0, so h must be even */
+            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
+            "bnez       %[h],       1b                                  \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),
+              [tmp0]"=&r"(tmp[0]),
+              [dst]"+&r"(dst),              [src]"+&r"(src),
+              [h]"+&r"(h)
+            : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
+              [A]"f"(A)
+            : "memory"
+        );
+    }
+}
+
+void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
+        int h, int x, int y)
+{ /* "avg" kernel: filters h rows of 8 bytes and pavgb-merges them into dst */
+    const int A = (8 - x) * (8 - y); /* weight of src[i] */
+    const int B = x * (8 - y);       /* weight of src[i + 1] */
+    const int C = (8 - x) * y;       /* weight of src[i + stride] */
+    const int D = x * y;             /* weight of src[i + stride + 1] */
+    const int E = B + C;             /* when D == 0: whichever of B, C is set */
+    double ftmp[10];                 /* homes for the asm's FP-register outputs */
+    uint64_t tmp[1];
+    mips_reg addr[1];
+    /* NOTE(review): the asm below writes to its input operands A..E (pshufh). */
+    if (D) { /* x and y both non-zero: blend all four neighbours */
+        __asm__ volatile (
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+            "dli        %[tmp0],    0x06                                \n\t"
+            "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
+            "pshufh     %[B],       %[B],           %[ftmp0]            \n\t"
+            "mtc1       %[tmp0],    %[ftmp9]                            \n\t"
+            "pshufh     %[C],       %[C],           %[ftmp0]            \n\t"
+            "pshufh     %[D],       %[D],           %[ftmp0]            \n\t"
+            "1:                                                         \n\t"
+            PTR_ADDU   "%[addr0],   %[src],         %[stride]           \n\t"
+            "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t"
+            "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t"
+            "gsldlc1    %[ftmp2],   0x08(%[src])                        \n\t"
+            "gsldrc1    %[ftmp2],   0x01(%[src])                        \n\t"
+            "gsldlc1    %[ftmp3],   0x07(%[addr0])                      \n\t"
+            "gsldrc1    %[ftmp3],   0x00(%[addr0])                      \n\t"
+            "gsldlc1    %[ftmp4],   0x08(%[addr0])                      \n\t"
+            "gsldrc1    %[ftmp4],   0x01(%[addr0])                      \n\t"
+            /* row 0: zero-extend bytes, scale src by A and src+1 by B */
+            "punpcklbh  %[ftmp5],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp6],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp7],   %[ftmp2],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp8],   %[ftmp2],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp5],   %[ftmp5],       %[A]                \n\t"
+            "pmullh     %[ftmp7],   %[ftmp7],       %[B]                \n\t"
+            "paddh      %[ftmp1],   %[ftmp5],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[A]                \n\t"
+            "pmullh     %[ftmp8],   %[ftmp8],       %[B]                \n\t"
+            "paddh      %[ftmp2],   %[ftmp6],       %[ftmp8]            \n\t"
+            /* row 1 (src + stride): same, scaled by C and D */
+            "punpcklbh  %[ftmp5],   %[ftmp3],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp6],   %[ftmp3],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp7],   %[ftmp4],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp5],   %[ftmp5],       %[C]                \n\t"
+            "pmullh     %[ftmp7],   %[ftmp7],       %[D]                \n\t"
+            "paddh      %[ftmp3],   %[ftmp5],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[C]                \n\t"
+            "pmullh     %[ftmp8],   %[ftmp8],       %[D]                \n\t"
+            "paddh      %[ftmp4],   %[ftmp6],       %[ftmp8]            \n\t"
+            /* sum rows, add ff_pw_32, >> 6, pack; pavgb with dst's 8 bytes */
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_32]         \n\t"
+            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp9]            \n\t"
+            "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "ldc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "addi       %[h],       %[h],           -0x01               \n\t"
+            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
+            "bnez       %[h],       1b                                  \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
+              [tmp0]"=&r"(tmp[0]),
+              [addr0]"=&r"(addr[0]),
+              [dst]"+&r"(dst),              [src]"+&r"(src),
+              [h]"+&r"(h)
+            : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
+              [A]"f"(A),                    [B]"f"(B),
+              [C]"f"(C),                    [D]"f"(D)
+            : "memory"
+        );
+    } else if (E) { /* only one of x, y non-zero: two-tap blend */
+        const int step = C ? stride : 1;
+        /* second tap: the row below when C != 0, else the next byte */
+        __asm__ volatile (
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+            "dli        %[tmp0],    0x06                                \n\t"
+            "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
+            "pshufh     %[E],       %[E],           %[ftmp0]            \n\t"
+            "mtc1       %[tmp0],    %[ftmp7]                            \n\t"
+            "1:                                                         \n\t"
+            PTR_ADDU   "%[addr0],   %[src],         %[step]             \n\t"
+            "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t"
+            "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t"
+            "gsldlc1    %[ftmp2],   0x07(%[addr0])                      \n\t"
+            "gsldrc1    %[ftmp2],   0x00(%[addr0])                      \n\t"
+            /* zero-extend both loads, scale src by A and src+step by E */
+            "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp5],   %[ftmp2],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp3],   %[ftmp3],       %[A]                \n\t"
+            "pmullh     %[ftmp5],   %[ftmp5],       %[E]                \n\t"
+            "paddh      %[ftmp1],   %[ftmp3],       %[ftmp5]            \n\t"
+            "pmullh     %[ftmp4],   %[ftmp4],       %[A]                \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[E]                \n\t"
+            "paddh      %[ftmp2],   %[ftmp4],       %[ftmp6]            \n\t"
+            /* add ff_pw_32, >> 6, pack; pavgb with dst's 8 bytes, store */
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_32]         \n\t"
+            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "ldc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "addi       %[h],       %[h],           -0x01               \n\t"
+            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
+            "bnez       %[h],       1b                                  \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [tmp0]"=&r"(tmp[0]),
+              [addr0]"=&r"(addr[0]),
+              [dst]"+&r"(dst),              [src]"+&r"(src),
+              [h]"+&r"(h)
+            : [stride]"r"((mips_reg)stride),[step]"r"((mips_reg)step),
+              [ff_pw_32]"f"(ff_pw_32),
+              [A]"f"(A),                    [E]"f"(E)
+            : "memory"
+        );
+    } else { /* x == y == 0: src-only path, two rows per loop iteration */
+        __asm__ volatile (
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+            "dli        %[tmp0],    0x06                                \n\t"
+            "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
+            "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
+            "1:                                                         \n\t"
+            "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t"
+            "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp1],   %[ftmp2],       %[A]                \n\t"
+            "pmullh     %[ftmp2],   %[ftmp3],       %[A]                \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_32]         \n\t"
+            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp4]            \n\t"
+            "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "ldc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
+            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
+            /* second row of this iteration: same steps on the next line */
+            "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t"
+            "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp1],   %[ftmp2],       %[A]                \n\t"
+            "pmullh     %[ftmp2],   %[ftmp3],       %[A]                \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_32]         \n\t"
+            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp4]            \n\t"
+            "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "ldc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "addi       %[h],       %[h],           -0x02               \n\t"
+            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            /* h drops by 2 per pass; bnez only exits at 0, so h must be even */
+            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
+            "bnez       %[h],       1b                                  \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),
+              [tmp0]"=&r"(tmp[0]),
+              [dst]"+&r"(dst),              [src]"+&r"(src),
+              [h]"+&r"(h)
+            : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
+              [A]"f"(A)
+            : "memory"
+        );
+    }
+}
+
+void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
+        int h, int x, int y)
+{
+    const int A = (8 - x) * (8 - y);
+    const int B = x * (8 - y);
+    const int C = (8 - x) *  y;
+    const int D = x *  y;
+    const int E = B + C;
+    double ftmp[8];
+    uint64_t tmp[1];
+    mips_reg addr[1];
+    uint64_t low32;
+
+    if (D) {
+        __asm__ volatile (
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+            "dli        %[tmp0],    0x06                                \n\t"
+            "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
+            "pshufh     %[B],       %[B],           %[ftmp0]            \n\t"
+            "mtc1       %[tmp0],    %[ftmp7]                            \n\t"
+            "pshufh     %[C],       %[C],           %[ftmp0]            \n\t"
+            "pshufh     %[D],       %[D],           %[ftmp0]            \n\t"
+            "1:                                                         \n\t"
+            PTR_ADDU   "%[addr0],   %[src],         %[stride]           \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            "uld        %[low32],   0x01(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            "uld        %[low32],   0x00(%[addr0])                      \n\t"
+            "mtc1       %[low32],   %[ftmp3]                            \n\t"
+            "uld        %[low32],   0x01(%[addr0])                      \n\t"
+            "mtc1       %[low32],   %[ftmp4]                            \n\t"
+
+            "punpcklbh  %[ftmp5],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp6],   %[ftmp2],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp5],   %[ftmp5],       %[A]                \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[B]                \n\t"
+            "paddh      %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"
+
+            "punpcklbh  %[ftmp5],   %[ftmp3],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp6],   %[ftmp4],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp5],   %[ftmp5],       %[C]                \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[D]                \n\t"
+            "paddh      %[ftmp2],   %[ftmp5],       %[ftmp6]            \n\t"
+
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
+            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "addi       %[h],       %[h],           -0x01               \n\t"
+            "swc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
+            "bnez       %[h],       1b                                  \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [tmp0]"=&r"(tmp[0]),
+              [addr0]"=&r"(addr[0]),
+              [dst]"+&r"(dst),              [src]"+&r"(src),
+              [h]"+&r"(h),
+              [low32]"=&r"(low32)
+            : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
+              [A]"f"(A),                    [B]"f"(B),
+              [C]"f"(C),                    [D]"f"(D)
+            : "memory"
+        );
+    } else if (E) {
+        const int step = C ? stride : 1;
+
+        __asm__ volatile (
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+            "dli        %[tmp0],    0x06                                \n\t"
+            "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
+            "pshufh     %[E],       %[E],           %[ftmp0]            \n\t"
+            "mtc1       %[tmp0],    %[ftmp5]                            \n\t"
+            "1:                                                         \n\t"
+            PTR_ADDU   "%[addr0],   %[src],         %[step]             \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            "uld        %[low32],   0x00(%[addr0])                      \n\t"
+            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+
+            "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp4],   %[ftmp2],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp3],   %[ftmp3],       %[A]                \n\t"
+            "pmullh     %[ftmp4],   %[ftmp4],       %[E]                \n\t"
+            "paddh      %[ftmp1],   %[ftmp3],       %[ftmp4]            \n\t"
+
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
+            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "addi       %[h],       %[h],           -0x01               \n\t"
+            "swc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
+            "bnez       %[h],       1b                                  \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [tmp0]"=&r"(tmp[0]),
+              [addr0]"=&r"(addr[0]),
+              [dst]"+&r"(dst),              [src]"+&r"(src),
+              [h]"+&r"(h),
+              [low32]"=&r"(low32)
+            : [stride]"r"((mips_reg)stride),[step]"r"((mips_reg)step),
+              [ff_pw_32]"f"(ff_pw_32),
+              [A]"f"(A),                    [E]"f"(E)
+            : "memory"
+        );
+    } else {
+        __asm__ volatile (
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+            "dli        %[tmp0],    0x06                                \n\t"
+            "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
+            "mtc1       %[tmp0],    %[ftmp3]                            \n\t"
+            "1:                                                         \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp1],   %[ftmp2],       %[A]                \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
+            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
+            "swc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
+
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp1],   %[ftmp2],       %[A]                \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
+            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "addi       %[h],       %[h],           -0x02               \n\t"
+            "swc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+
+            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
+            "bnez       %[h],       1b                                  \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [tmp0]"=&r"(tmp[0]),
+              [dst]"+&r"(dst),              [src]"+&r"(src),
+              [h]"+&r"(h),
+              [low32]"=&r"(low32)
+            : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
+              [A]"f"(A)
+            : "memory"
+        );
+    }
+}
+
+void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
+        int h, int x, int y)
+{
+    const int A = (8 - x) *(8 - y); /* weight of src[i]              */
+    const int B = x * (8 - y);      /* weight of src[i + 1]          */
+    const int C = (8 - x) * y;      /* weight of src[i + stride]     */
+    const int D = x * y;            /* weight of src[i + stride + 1] */
+    const int E = B + C;            /* single second tap when D == 0 */
+    double ftmp[8];
+    uint64_t tmp[1];
+    mips_reg addr[1];
+    uint64_t low32;
+    /*
+     * 4-pixel-wide chroma interpolation whose result is averaged (pavgb)
+     * with the bytes already stored in dst, row by row.  A + B + C + D is
+     * always 64, which matches the "+ ff_pw_32, >> 6" normalisation used
+     * below (ff_pw_32 is defined elsewhere; presumably 32 per halfword).
+     *
+     * Three code paths:
+     *   D != 0    : all four neighbours contribute (A, B, C, D).
+     *   E != 0    : D == 0, so a two-tap filter with weights A and E is
+     *               used; the second tap is one row below (step = stride)
+     *               when C != 0, otherwise one byte to the right (step = 1).
+     *   otherwise : only A is applied; two rows are produced per loop
+     *               iteration and h is decremented by 2.
+     *
+     * Every loop ends on "bnez h", so it only terminates when h reaches
+     * exactly 0: h must be positive, and even for the last path.
+     *
+     * NOTE(review): pshufh writes %[A]..%[E] although they are declared as
+     * input-only operands, and each "uld" appears to fetch 8 bytes while
+     * only the low word is moved to the FPU register - confirm both are
+     * intended.
+     */
+    if (D) {
+        __asm__ volatile (
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+            "dli        %[tmp0],    0x06                                \n\t"
+            "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
+            "pshufh     %[B],       %[B],           %[ftmp0]            \n\t"
+            "mtc1       %[tmp0],    %[ftmp7]                            \n\t"
+            "pshufh     %[C],       %[C],           %[ftmp0]            \n\t"
+            "pshufh     %[D],       %[D],           %[ftmp0]            \n\t"
+            "1:                                                         \n\t"
+            PTR_ADDU   "%[addr0],   %[src],         %[stride]           \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            "uld        %[low32],   0x01(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            "uld        %[low32],   0x00(%[addr0])                      \n\t"
+            "mtc1       %[low32],   %[ftmp3]                            \n\t"
+            "uld        %[low32],   0x01(%[addr0])                      \n\t"
+            "mtc1       %[low32],   %[ftmp4]                            \n\t"
+            /* current row: A * src[i] + B * src[i + 1] on halfwords */
+            "punpcklbh  %[ftmp5],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp6],   %[ftmp2],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp5],   %[ftmp5],       %[A]                \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[B]                \n\t"
+            "paddh      %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"
+            /* next row: C * src[i + stride] + D * src[i + stride + 1] */
+            "punpcklbh  %[ftmp5],   %[ftmp3],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp6],   %[ftmp4],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp5],   %[ftmp5],       %[C]                \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[D]                \n\t"
+            "paddh      %[ftmp2],   %[ftmp5],       %[ftmp6]            \n\t"
+            /* sum, add ff_pw_32, >> 6, pack, average with dst, store */
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
+            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "lwc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "addi       %[h],       %[h],           -0x01               \n\t"
+            "swc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
+            "bnez       %[h],       1b                                  \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [tmp0]"=&r"(tmp[0]),
+              [addr0]"=&r"(addr[0]),
+              [dst]"+&r"(dst),              [src]"+&r"(src),
+              [h]"+&r"(h),
+              [low32]"=&r"(low32)
+            : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
+              [A]"f"(A),                    [B]"f"(B),
+              [C]"f"(C),                    [D]"f"(D)
+            : "memory"
+        );
+    } else if (E) {
+        const int step = C ? stride : 1;
+        /* two-tap filter: taps are src[i] and src[i + step] */
+        __asm__ volatile (
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+            "dli        %[tmp0],    0x06                                \n\t"
+            "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
+            "pshufh     %[E],       %[E],           %[ftmp0]            \n\t"
+            "mtc1       %[tmp0],    %[ftmp5]                            \n\t"
+            "1:                                                         \n\t"
+            PTR_ADDU   "%[addr0],   %[src],         %[step]             \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            "uld        %[low32],   0x00(%[addr0])                      \n\t"
+            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            /* A * src[i] + E * src[i + step] on halfwords */
+            "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp4],   %[ftmp2],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp3],   %[ftmp3],       %[A]                \n\t"
+            "pmullh     %[ftmp4],   %[ftmp4],       %[E]                \n\t"
+            "paddh      %[ftmp1],   %[ftmp3],       %[ftmp4]            \n\t"
+            /* add ff_pw_32, >> 6, pack, average with dst, store */
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
+            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "lwc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "addi       %[h],       %[h],           -0x01               \n\t"
+            "swc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
+            "bnez       %[h],       1b                                  \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [tmp0]"=&r"(tmp[0]),
+              [addr0]"=&r"(addr[0]),
+              [dst]"+&r"(dst),              [src]"+&r"(src),
+              [h]"+&r"(h),
+              [low32]"=&r"(low32)
+            : [stride]"r"((mips_reg)stride),[step]"r"((mips_reg)step),
+              [ff_pw_32]"f"(ff_pw_32),
+              [A]"f"(A),                    [E]"f"(E)
+            : "memory"
+        );
+    } else {
+        __asm__ volatile (
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+            "dli        %[tmp0],    0x06                                \n\t"
+            "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
+            "mtc1       %[tmp0],    %[ftmp3]                            \n\t"
+            "1:                                                         \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp1],   %[ftmp2],       %[A]                \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
+            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "lwc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
+            "swc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
+            /* second row of this iteration, same sequence as above */
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp1],   %[ftmp2],       %[A]                \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
+            "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "lwc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "addi       %[h],       %[h],           -0x02               \n\t"
+            "swc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            /* step both pointers to the next pair of rows */
+            PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
+            "bnez       %[h],       1b                                  \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [tmp0]"=&r"(tmp[0]),
+              [dst]"+&r"(dst),              [src]"+&r"(src),
+              [h]"+&r"(h),
+              [low32]"=&r"(low32)
+            : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
+              [A]"f"(A)
+            : "memory"
+        );
+    }
+}
diff --git a/libavcodec/mips/h264chroma_msa.c b/libavcodec/mips/h264chroma_msa.c
new file mode 100644
index 0000000..67d0bc1
--- /dev/null
+++ b/libavcodec/mips/h264chroma_msa.c
@@ -0,0 +1,2003 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "h264chroma_mips.h"
+
+static const uint8_t chroma_mask_arr[16 * 5] = { /* five 16-byte shuffle controls, passed to __msa_vshf_b / VSHF_B2_UB */
+    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, /* [0]:  loaded by hz_2x2, hz_4x2, hz_4x4multiple */
+    0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24, /* [16] */
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, /* [32]: loaded by hz_8w (adjacent-byte pairs of one row) */
+    0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8, /* [48]: loaded by hv_2x2 */
+    0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20 /* [64]: loaded by hz_2x4, hz_2x8 */
+}; /* NOTE(review): indices >= 16 presumably address the second shuffle source vector - confirm against the MSA vshf.b definition */
+
+/*
+ * Horizontal-only chroma interpolation for a 2-pixel-wide, 2-row block.
+ * Neighbouring source bytes of both rows are paired up, weighted with
+ * coeff0/coeff1, multiplied by 8, rounding-shifted right by 6 and written
+ * as one 16-bit store per destination row.
+ */
+static void avc_chroma_hz_2x2_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    v16i8 row0, row1, pairs;
+    v8u16 acc;
+    v8i16 packed;
+    uint16_t px_row0, px_row1;
+    uint8_t *dst_row1 = dst + dst_stride;
+    v16i8 shuffle = LD_SB(&chroma_mask_arr[0]);
+    /* every 16-bit lane carries one coeff0/coeff1 byte pair */
+    v16u8 weights = (v16u8) __msa_ilvr_b(__msa_fill_b(coeff0),
+                                         __msa_fill_b(coeff1));
+
+    LD_SB2(src, src_stride, row0, row1);
+
+    /* gather the neighbouring-byte pairs of both rows into one vector */
+    pairs = __msa_vshf_b(shuffle, row1, row0);
+
+    acc = __msa_dotp_u_h((v16u8) pairs, weights);
+    acc = acc << 3;
+    acc = (v8u16) __msa_srari_h((v8i16) acc, 6);
+    acc = __msa_sat_u_h(acc, 7);
+    packed = (v8i16) __msa_pckev_b((v16i8) acc, (v16i8) acc);
+
+    /* halfword 0 is stored to the first row, halfword 2 to the second */
+    px_row0 = __msa_copy_u_h(packed, 0);
+    px_row1 = __msa_copy_u_h(packed, 2);
+
+    SH(px_row0, dst);
+    SH(px_row1, dst_row1);
+}
+
+/*
+ * Horizontal-only chroma interpolation for a 2-pixel-wide, 4-row block.
+ * The four rows are shuffled into a single vector so that one dot product
+ * with the coeff0/coeff1 pairs filters the whole block.
+ */
+static void avc_chroma_hz_2x4_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    v16u8 row0, row1, row2, row3;
+    v8u16 acc;
+    v8i16 packed;
+    v16i8 shuffle = LD_SB(&chroma_mask_arr[64]);
+    v16u8 weights = (v16u8) __msa_ilvr_b(__msa_fill_b(coeff0),
+                                         __msa_fill_b(coeff1));
+
+    LD_UB4(src, src_stride, row0, row1, row2, row3);
+
+    /* rows 0/1 end up in row0, rows 2/3 in row2 */
+    VSHF_B2_UB(row0, row1, row2, row3, shuffle, shuffle, row0, row2);
+
+    /* join the low doublewords of both shuffle results */
+    row0 = (v16u8) __msa_ilvr_d((v2i64) row2, (v2i64) row0);
+
+    acc = __msa_dotp_u_h(row0, weights);
+    acc = acc << 3;
+    acc = (v8u16) __msa_srari_h((v8i16) acc, 6);
+    acc = __msa_sat_u_h(acc, 7);
+    packed = (v8i16) __msa_pckev_b((v16i8) acc, (v16i8) acc);
+
+    ST2x4_UB(packed, 0, dst, dst_stride);
+}
+
+/*
+ * Horizontal-only chroma interpolation for a 2-pixel-wide, 8-row block.
+ * All eight rows are loaded up front; rows 0-3 and rows 4-7 are then each
+ * filtered and stored as a group of four.
+ */
+static void avc_chroma_hz_2x8_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+    v8u16 acc;
+    v8i16 packed;
+    v16i8 shuffle = LD_SB(&chroma_mask_arr[64]);
+    v16u8 weights = (v16u8) __msa_ilvr_b(__msa_fill_b(coeff0),
+                                         __msa_fill_b(coeff1));
+
+    LD_UB8(src, src_stride, row0, row1, row2, row3, row4, row5, row6, row7);
+
+    VSHF_B2_UB(row0, row1, row2, row3, shuffle, shuffle, row0, row2);
+    VSHF_B2_UB(row4, row5, row6, row7, shuffle, shuffle, row4, row6);
+
+    /* row0 now carries rows 0-3, row4 carries rows 4-7 */
+    ILVR_D2_UB(row2, row0, row6, row4, row0, row4);
+
+    /* first group of four output rows */
+    acc = __msa_dotp_u_h(row0, weights);
+    acc = acc << 3;
+    acc = (v8u16) __msa_srari_h((v8i16) acc, 6);
+    acc = __msa_sat_u_h(acc, 7);
+    packed = (v8i16) __msa_pckev_b((v16i8) acc, (v16i8) acc);
+
+    ST2x4_UB(packed, 0, dst, dst_stride);
+
+    /* second group of four output rows */
+    dst += (4 * dst_stride);
+
+    acc = __msa_dotp_u_h(row4, weights);
+    acc = acc << 3;
+    acc = (v8u16) __msa_srari_h((v8i16) acc, 6);
+    acc = __msa_sat_u_h(acc, 7);
+    packed = (v8i16) __msa_pckev_b((v16i8) acc, (v16i8) acc);
+
+    ST2x4_UB(packed, 0, dst, dst_stride);
+}
+
+/*
+ * Pick the 2-pixel-wide horizontal helper that matches the block height.
+ * Heights other than 2, 4 and 8 are ignored: nothing is written.
+ */
+static void avc_chroma_hz_2w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint32_t coeff0, uint32_t coeff1,
+                                 int32_t height)
+{
+    switch (height) {
+    case 2:
+        avc_chroma_hz_2x2_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+        break;
+    case 4:
+        avc_chroma_hz_2x4_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+        break;
+    case 8:
+        avc_chroma_hz_2x8_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+        break;
+    default:
+        break;
+    }
+}
+
+/*
+ * Horizontal-only chroma interpolation for a 4-pixel-wide, 2-row block.
+ * Both rows are shuffled into one vector of neighbouring-byte pairs,
+ * filtered with coeff0/coeff1 and written through ST4x2_UB.
+ */
+static void avc_chroma_hz_4x2_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    v16i8 row0, row1, pairs;
+    v8u16 acc;
+    v4i32 packed;
+    v16i8 shuffle = LD_SB(&chroma_mask_arr[0]);
+    v16u8 weights = (v16u8) __msa_ilvr_b(__msa_fill_b(coeff0),
+                                         __msa_fill_b(coeff1));
+
+    LD_SB2(src, src_stride, row0, row1);
+
+    pairs = __msa_vshf_b(shuffle, row1, row0);
+
+    /* weighted sum, * 8, rounding >> 6, saturate, narrow to bytes */
+    acc = __msa_dotp_u_h((v16u8) pairs, weights);
+    acc = acc << 3;
+    acc = (v8u16) __msa_srari_h((v8i16) acc, 6);
+    acc = __msa_sat_u_h(acc, 7);
+    packed = (v4i32) __msa_pckev_b((v16i8) acc, (v16i8) acc);
+
+    ST4x2_UB(packed, dst, dst_stride);
+}
+
+/*
+ * Horizontal-only chroma interpolation for a 4-pixel-wide block, four rows
+ * per iteration.  The loop runs height >> 2 times, so rows beyond a
+ * multiple of four are not processed.
+ */
+static void avc_chroma_hz_4x4multiple_msa(uint8_t *src, int32_t src_stride,
+                                          uint8_t *dst, int32_t dst_stride,
+                                          uint32_t coeff0, uint32_t coeff1,
+                                          int32_t height)
+{
+    uint32_t quads = (height >> 2);
+    v16u8 row0, row1, row2, row3;
+    v8u16 sum01, sum23;
+    v4i32 out01, out23;
+    v16i8 shuffle = LD_SB(&chroma_mask_arr[0]);
+    v16u8 weights = (v16u8) __msa_ilvr_b(__msa_fill_b(coeff0),
+                                         __msa_fill_b(coeff1));
+
+    while (quads--) {
+        LD_UB4(src, src_stride, row0, row1, row2, row3);
+        src += (4 * src_stride);
+
+        /* rows 0/1 are combined in row0, rows 2/3 in row2 */
+        VSHF_B2_UB(row0, row1, row2, row3, shuffle, shuffle, row0, row2);
+        DOTP_UB2_UH(row0, row2, weights, weights, sum01, sum23);
+
+        sum01 = sum01 << 3;
+        sum23 = sum23 << 3;
+
+        SRARI_H2_UH(sum01, sum23, 6);
+        SAT_UH2_UH(sum01, sum23, 7);
+        PCKEV_B2_SW(sum01, sum01, sum23, sum23, out01, out23);
+
+        ST4x4_UB(out01, out23, 0, 1, 0, 1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+/*
+ * 4-pixel-wide horizontal dispatcher: a height of 2 uses the dedicated
+ * 4x2 helper, every other height is handed to the four-rows-per-iteration
+ * variant.
+ */
+static void avc_chroma_hz_4w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint32_t coeff0, uint32_t coeff1,
+                                 int32_t height)
+{
+    if (2 != height) {
+        avc_chroma_hz_4x4multiple_msa(src, src_stride, dst, dst_stride, coeff0,
+                                      coeff1, height);
+        return;
+    }
+
+    avc_chroma_hz_4x2_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+}
+
+/*
+ * Horizontal-only chroma interpolation for an 8-pixel-wide block.
+ * Rows are handled four at a time; the height % 4 rows left over are
+ * then filtered one by one.
+ */
+static void avc_chroma_hz_8w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint32_t coeff0, uint32_t coeff1,
+                                 int32_t height)
+{
+    uint32_t quads = height >> 2;
+    uint32_t tail = (height % 4);
+    v16u8 row0, row1, row2, row3, out0, out1;
+    v8u16 sum0, sum1, sum2, sum3;
+    v16i8 shuffle = LD_SB(&chroma_mask_arr[32]);
+    v16u8 weights = (v16u8) __msa_ilvr_b(__msa_fill_b(coeff0),
+                                         __msa_fill_b(coeff1));
+
+    /* main part: four rows per iteration */
+    while (quads--) {
+        LD_UB4(src, src_stride, row0, row1, row2, row3);
+        src += (4 * src_stride);
+
+        VSHF_B2_UB(row0, row0, row1, row1, shuffle, shuffle, row0, row1);
+        VSHF_B2_UB(row2, row2, row3, row3, shuffle, shuffle, row2, row3);
+        DOTP_UB4_UH(row0, row1, row2, row3, weights, weights, weights,
+                    weights, sum0, sum1, sum2, sum3);
+        SLLI_4V(sum0, sum1, sum2, sum3, 3);
+        SRARI_H4_UH(sum0, sum1, sum2, sum3, 6);
+        SAT_UH4_UH(sum0, sum1, sum2, sum3, 7);
+        PCKEV_B2_UB(sum1, sum0, sum3, sum2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+
+    /* leftover rows (none when height is a multiple of 4) */
+    while (tail--) {
+        row0 = LD_UB(src);
+        src += src_stride;
+
+        row0 = (v16u8) __msa_vshf_b(shuffle, (v16i8) row0, (v16i8) row0);
+
+        sum0 = __msa_dotp_u_h(row0, weights);
+        sum0 = sum0 << 3;
+        sum0 = (v8u16) __msa_srari_h((v8i16) sum0, 6);
+        sum0 = __msa_sat_u_h(sum0, 7);
+        sum0 = (v8u16) __msa_pckev_b((v16i8) sum0, (v16i8) sum0);
+
+        ST8x1_UB(sum0, dst);
+        dst += dst_stride;
+    }
+}
+
+/*
+ * Vertical-only chroma interpolation for a 2-pixel-wide, 2-row block.
+ * Three source rows are loaded; each row is byte-interleaved with the row
+ * below it so that a dot product with the coeff0/coeff1 pairs filters
+ * vertically.
+ */
+static void avc_chroma_vt_2x2_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    v16i8 row0, row1, row2;
+    v16u8 pair01, pair12;
+    v8u16 acc;
+    v8i16 packed;
+    uint16_t px_row0, px_row1;
+    uint8_t *dst_row1 = dst + dst_stride;
+    v16u8 weights = (v16u8) __msa_ilvr_b(__msa_fill_b(coeff0),
+                                         __msa_fill_b(coeff1));
+
+    LD_SB3(src, src_stride, row0, row1, row2);
+
+    ILVR_B2_UB(row1, row0, row2, row1, pair01, pair12);
+
+    /* put both interleaved row pairs into a single vector */
+    pair01 = (v16u8) __msa_ilvr_d((v2i64) pair12, (v2i64) pair01);
+
+    acc = __msa_dotp_u_h(pair01, weights);
+    acc = acc << 3;
+    acc = (v8u16) __msa_srari_h((v8i16) acc, 6);
+    acc = __msa_sat_u_h(acc, 7);
+    packed = (v8i16) __msa_pckev_b((v16i8) acc, (v16i8) acc);
+
+    /* halfword 0 is stored to the first row, halfword 2 to the second */
+    px_row0 = __msa_copy_u_h(packed, 0);
+    px_row1 = __msa_copy_u_h(packed, 2);
+
+    SH(px_row0, dst);
+    SH(px_row1, dst_row1);
+}
+
+/*
+ * Vertical-only chroma interpolation for a 2-pixel-wide, 4-row block.
+ * Five source rows are loaded and each one is interleaved with its
+ * successor before the coeff0/coeff1 dot product.
+ */
+static void avc_chroma_vt_2x4_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    v16u8 row0, row1, row2, row3, row4;
+    v16u8 il0, il1, il2, il3;
+    v8u16 acc;
+    v8i16 packed;
+    v16u8 weights = (v16u8) __msa_ilvr_b(__msa_fill_b(coeff0),
+                                         __msa_fill_b(coeff1));
+
+    LD_UB5(src, src_stride, row0, row1, row2, row3, row4);
+
+    ILVR_B4_UB(row1, row0, row2, row1, row3, row2, row4, row3,
+               il0, il1, il2, il3);
+
+    /* squeeze the four interleaved row pairs into one vector */
+    ILVR_W2_UB(il1, il0, il3, il2, il0, il2);
+    il0 = (v16u8) __msa_ilvr_d((v2i64) il2, (v2i64) il0);
+
+    acc = __msa_dotp_u_h(il0, weights);
+    acc = acc << 3;
+    acc = (v8u16) __msa_srari_h((v8i16) acc, 6);
+    acc = __msa_sat_u_h(acc, 7);
+    packed = (v8i16) __msa_pckev_b((v16i8) acc, (v16i8) acc);
+
+    ST2x4_UB(packed, 0, dst, dst_stride);
+}
+
+/*
+ * Vertical-only chroma interpolation for a 2-pixel-wide, 8-row block.
+ * Nine source rows are loaded (five, then four more); the output is
+ * produced as two groups of four rows, the second group reusing the last
+ * row of the first load as its top neighbour.
+ */
+static void avc_chroma_vt_2x8_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 res;
+    v8u16 res_r;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+    LD_UB4(src, src_stride, src5, src6, src7, src8);
+
+    /* output rows 0-3: interleave rows 0..4 with their successors */
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+               tmp0, tmp1, tmp2, tmp3);
+    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+
+    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
+
+    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+
+    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+    ST2x4_UB(res, 0, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    /* output rows 4-7: same steps on rows 4..8 */
+    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
+               tmp0, tmp1, tmp2, tmp3);
+    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+
+    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
+
+    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+
+    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+
+    /* last store: dst is not advanced any further (it was a dead store) */
+    ST2x4_UB(res, 0, dst, dst_stride);
+}
+
+/*
+ * Pick the 2-pixel-wide vertical helper that matches the block height.
+ * Heights other than 2, 4 and 8 are ignored: nothing is written.
+ */
+static void avc_chroma_vt_2w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint32_t coeff0, uint32_t coeff1,
+                                 int32_t height)
+{
+    switch (height) {
+    case 2:
+        avc_chroma_vt_2x2_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+        break;
+    case 4:
+        avc_chroma_vt_2x4_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+        break;
+    case 8:
+        avc_chroma_vt_2x8_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+        break;
+    default:
+        break;
+    }
+}
+
+/*
+ * Vertical-only chroma interpolation for a 4-pixel-wide, 2-row block.
+ * Three source rows are loaded and interleaved pairwise (row with the row
+ * below) before the coeff0/coeff1 dot product.
+ */
+static void avc_chroma_vt_4x2_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint32_t coeff0, uint32_t coeff1)
+{
+    v16u8 row0, row1, row2;
+    v16u8 pair01, pair12;
+    v8u16 acc;
+    v4i32 packed;
+    v16u8 weights = (v16u8) __msa_ilvr_b(__msa_fill_b(coeff0),
+                                         __msa_fill_b(coeff1));
+
+    LD_UB3(src, src_stride, row0, row1, row2);
+    ILVR_B2_UB(row1, row0, row2, row1, pair01, pair12);
+
+    /* both interleaved row pairs share one vector from here on */
+    pair01 = (v16u8) __msa_ilvr_d((v2i64) pair12, (v2i64) pair01);
+
+    acc = __msa_dotp_u_h(pair01, weights);
+    acc = acc << 3;
+    acc = (v8u16) __msa_srari_h((v8i16) acc, 6);
+    acc = __msa_sat_u_h(acc, 7);
+    packed = (v4i32) __msa_pckev_b((v16i8) acc, (v16i8) acc);
+
+    ST4x2_UB(packed, dst, dst_stride);
+}
+
+/*
+ * Vertical-only chroma interpolation for a 4-pixel-wide block, four rows
+ * per iteration.  The last row loaded in one iteration is carried over as
+ * the top row of the next.  The loop runs height >> 2 times, so rows
+ * beyond a multiple of four are not processed.
+ */
+static void avc_chroma_vt_4x4multiple_msa(uint8_t *src, int32_t src_stride,
+                                          uint8_t *dst, int32_t dst_stride,
+                                          uint32_t coeff0, uint32_t coeff1,
+                                          int32_t height)
+{
+    uint32_t quads = (height >> 2);
+    v16u8 top, row1, row2, row3, row4;
+    v16u8 il0, il1, il2, il3;
+    v8u16 sum01, sum23;
+    v4i32 out01, out23;
+    v16u8 weights = (v16u8) __msa_ilvr_b(__msa_fill_b(coeff0),
+                                         __msa_fill_b(coeff1));
+
+    top = LD_UB(src);
+    src += src_stride;
+
+    while (quads--) {
+        LD_UB4(src, src_stride, row1, row2, row3, row4);
+        src += (4 * src_stride);
+
+        ILVR_B4_UB(row1, top, row2, row1, row3, row2, row4, row3,
+                   il0, il1, il2, il3);
+        ILVR_D2_UB(il1, il0, il3, il2, il0, il2);
+        DOTP_UB2_UH(il0, il2, weights, weights, sum01, sum23);
+
+        sum01 = sum01 << 3;
+        sum23 = sum23 << 3;
+
+        SRARI_H2_UH(sum01, sum23, 6);
+        SAT_UH2_UH(sum01, sum23, 7);
+        PCKEV_B2_SW(sum01, sum01, sum23, sum23, out01, out23);
+
+        ST4x4_UB(out01, out23, 0, 1, 0, 1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        /* the bottom row of this group is the top row of the next one */
+        top = row4;
+    }
+}
+
+/*
+ * 4-pixel-wide vertical dispatcher: a height of 2 uses the dedicated 4x2
+ * helper, every other height is handed to the four-rows-per-iteration
+ * variant.
+ */
+static void avc_chroma_vt_4w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint32_t coeff0, uint32_t coeff1,
+                                 int32_t height)
+{
+    if (2 != height) {
+        avc_chroma_vt_4x4multiple_msa(src, src_stride, dst, dst_stride, coeff0,
+                                      coeff1, height);
+        return;
+    }
+
+    avc_chroma_vt_4x2_msa(src, src_stride, dst, dst_stride, coeff0, coeff1);
+}
+
+/*
+ * Vertical-only chroma interpolation for an 8-pixel-wide block, four rows
+ * per iteration.  The loop runs height >> 2 times and there is no tail
+ * handling, so rows beyond a multiple of four are not processed.
+ */
+static void avc_chroma_vt_8w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint32_t coeff0, uint32_t coeff1,
+                                 int32_t height)
+{
+    uint32_t quads = height >> 2;
+    v16u8 line0, line1, line2, line3, line4, out0, out1;
+    v8u16 sum0, sum1, sum2, sum3;
+    v16u8 weights = (v16u8) __msa_ilvr_b(__msa_fill_b(coeff0),
+                                         __msa_fill_b(coeff1));
+
+    line0 = LD_UB(src);
+    src += src_stride;
+
+    while (quads--) {
+        LD_UB4(src, src_stride, line1, line2, line3, line4);
+        src += (4 * src_stride);
+
+        /* line0..line3 are overwritten with the interleaved row pairs */
+        ILVR_B4_UB(line1, line0, line2, line1, line3, line2, line4, line3,
+                   line0, line1, line2, line3);
+        DOTP_UB4_UH(line0, line1, line2, line3, weights, weights, weights,
+                    weights, sum0, sum1, sum2, sum3);
+        SLLI_4V(sum0, sum1, sum2, sum3, 3);
+        SRARI_H4_UH(sum0, sum1, sum2, sum3, 6);
+        SAT_UH4_UH(sum0, sum1, sum2, sum3, 7);
+        PCKEV_B2_UB(sum1, sum0, sum3, sum2, out0, out1);
+
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        /* the untouched bottom row becomes the next group's top row */
+        line0 = line4;
+    }
+}
+/* Horizontal+vertical chroma filter, '2x2' variant: two SH stores to dst. */
+static void avc_chroma_hv_2x2_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint32_t coef_hor0, uint32_t coef_hor1,
+                                  uint32_t coef_ver0, uint32_t coef_ver1)
+{
+    uint16_t out0, out1;
+    v16u8 src0, src1, src2;
+    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+    v8i16 res_vert;
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[48]);
+    /* Three vectors are loaded from src for the two halfwords written out. */
+    LD_UB3(src, src_stride, src0, src1, src2);
+    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+    /* Sum both vertical products, then apply SRARI by 6 and __msa_sat_u_h. */
+    res_vt0 += res_vt1;
+    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+    res_vt0 = __msa_sat_u_h(res_vt0, 7);
+    res_vert = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+
+    out0 = __msa_copy_u_h(res_vert, 0);
+    out1 = __msa_copy_u_h(res_vert, 1);
+
+    SH(out0, dst);
+    dst += dst_stride;
+    SH(out1, dst);
+}
+/* Horizontal+vertical chroma filter, '2x4' variant (single ST2x4_UB store). */
+static void avc_chroma_hv_2x4_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint32_t coef_hor0, uint32_t coef_hor1,
+                                  uint32_t coef_ver0, uint32_t coef_ver1)
+{
+    v16u8 src0, src1, src2, src3, src4;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+    v8i16 res;
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[48]);
+    /* Five vectors are loaded from src, one per src_stride step. */
+    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
+    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
+    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+    /* Sum both vertical products, then apply SRARI by 6 and __msa_sat_u_h. */
+    res_vt0 += res_vt1;
+    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+    res_vt0 = __msa_sat_u_h(res_vt0, 7);
+    res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+
+    ST2x4_UB(res, 0, dst, dst_stride);
+}
+/* Horizontal+vertical chroma filter, '2x8' variant: two ST2x4_UB stores. */
+static void avc_chroma_hv_2x8_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint32_t coef_hor0, uint32_t coef_hor1,
+                                  uint32_t coef_ver0, uint32_t coef_ver1)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+    v8i16 res;
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[48]);
+    /* Nine vectors in total: five here, four more after advancing src. */
+    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+    LD_UB4(src, src_stride, src5, src6, src7, src8);
+    /* src0/src1 and src4/src5 are reused to hold the combined shuffles. */
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
+    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
+    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, tmp0, tmp1);
+    VSHF_B2_UB(src5, src6, src7, src8, mask, mask, tmp2, tmp3);
+    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src4, src5);
+    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+    res_vt0 += res_vt1;
+    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+    res_vt0 = __msa_sat_u_h(res_vt0, 7);
+
+    res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+
+    ST2x4_UB(res, 0, dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* Second half: same reduction on src4/src5, stored four rows further. */
+    DOTP_UB2_UH(src4, src5, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+    res_vt0 += res_vt1;
+    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+    res_vt0 = __msa_sat_u_h(res_vt0, 7);
+
+    res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+
+    ST2x4_UB(res, 0, dst, dst_stride);
+}
+/* Entry point of the hv '2w' kernels; heights other than 2/4/8 do nothing. */
+static void avc_chroma_hv_2w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint32_t coef_hor0, uint32_t coef_hor1,
+                                 uint32_t coef_ver0, uint32_t coef_ver1,
+                                 int32_t height)
+{
+    if (height == 8) {
+        avc_chroma_hv_2x8_msa(src, src_stride, dst, dst_stride,
+                              coef_hor0, coef_hor1, coef_ver0, coef_ver1);
+    } else if (height == 4) {
+        avc_chroma_hv_2x4_msa(src, src_stride, dst, dst_stride,
+                              coef_hor0, coef_hor1, coef_ver0, coef_ver1);
+    } else if (height == 2) {
+        avc_chroma_hv_2x2_msa(src, src_stride, dst, dst_stride,
+                              coef_hor0, coef_hor1, coef_ver0, coef_ver1);
+    }
+}
+/* Horizontal+vertical chroma filter, '4x2' variant (single ST4x2_UB store). */
+static void avc_chroma_hv_4x2_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint32_t coef_hor0, uint32_t coef_hor1,
+                                  uint32_t coef_ver0, uint32_t coef_ver1)
+{
+    v16u8 src0, src1, src2;
+    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+    v16i8 mask;
+    v4i32 res;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+    /* Shuffle mask is taken from chroma_mask_arr[0]; src0/src1 are reused. */
+    mask = LD_SB(&chroma_mask_arr[0]);
+    LD_UB3(src, src_stride, src0, src1, src2);
+    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+    /* Sum both vertical products, then apply SRARI by 6 and __msa_sat_u_h. */
+    res_vt0 += res_vt1;
+    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+    res_vt0 = __msa_sat_u_h(res_vt0, 7);
+    res = (v4i32) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+
+    ST4x2_UB(res, dst, dst_stride);
+}
+/* Horizontal+vertical chroma filter, 4-wide; four rows stored per pass. */
+static void avc_chroma_hv_4x4multiple_msa(uint8_t *src, int32_t src_stride,
+                                          uint8_t *dst, int32_t dst_stride,
+                                          uint32_t coef_hor0,
+                                          uint32_t coef_hor1,
+                                          uint32_t coef_ver0,
+                                          uint32_t coef_ver1,
+                                          int32_t height)
+{
+    uint32_t row;
+    v16u8 src0, src1, src2, src3, src4;
+    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
+    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+    v4i32 res0, res1;
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+    /* src0 is primed from src once, ahead of the loop. */
+    src0 = LD_UB(src);
+    src += src_stride;
+    /* height >> 2 passes: a remainder of 1..3 rows is never processed. */
+    for (row = (height >> 2); row--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+        /* src0..src3 appear to be reused as shuffle outputs; src4 is kept. */
+        VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+        VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
+        DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
+                    coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
+                    res_hz3);
+        MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
+             coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
+             res_vt3);
+        ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
+        SRARI_H2_UH(res_vt0, res_vt1, 6);
+        SAT_UH2_UH(res_vt0, res_vt1, 7);
+        PCKEV_B2_SW(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
+
+        ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        src0 = src4; /* carry the last loaded row into the next pass */
+    }
+}
+/* Entry point of the hv '4w' chroma filter: picks a kernel by height. */
+static void avc_chroma_hv_4w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint32_t coef_hor0, uint32_t coef_hor1,
+                                 uint32_t coef_ver0, uint32_t coef_ver1,
+                                 int32_t height)
+{
+    if (height != 2) {  /* every height other than 2 takes the looped kernel */
+        avc_chroma_hv_4x4multiple_msa(src, src_stride, dst, dst_stride,
+                                      coef_hor0, coef_hor1,
+                                      coef_ver0, coef_ver1, height);
+    } else {
+        avc_chroma_hv_4x2_msa(src, src_stride, dst, dst_stride,
+                              coef_hor0, coef_hor1, coef_ver0, coef_ver1);
+    }
+}
+/* Horizontal+vertical chroma filter, '8w' variant; four rows per pass. */
+static void avc_chroma_hv_8w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint32_t coef_hor0, uint32_t coef_hor1,
+                                 uint32_t coef_ver0, uint32_t coef_ver1,
+                                 int32_t height)
+{
+    uint32_t row;
+    v16u8 src0, src1, src2, src3, src4, out0, out1;
+    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
+    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[32]);
+    /* The first row's horizontal result (res_hz0) is computed before the loop. */
+    src0 = LD_UB(src);
+    src += src_stride;
+
+    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
+    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
+    /* height >> 2 passes: a remainder of 1..3 rows is never processed. */
+    for (row = (height >> 2); row--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
+        VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
+        DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
+                    coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
+                    res_hz4);
+        MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
+             coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
+             res_vt3);
+        /* Add the preceding row's horizontal result times coeff_vt_vec1. */
+        res_vt0 += (res_hz0 * coeff_vt_vec1);
+        res_vt1 += (res_hz1 * coeff_vt_vec1);
+        res_vt2 += (res_hz2 * coeff_vt_vec1);
+        res_vt3 += (res_hz3 * coeff_vt_vec1);
+
+        SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
+        SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
+        PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+
+        dst += (4 * dst_stride);
+
+        res_hz0 = res_hz4; /* last row's result seeds the next pass */
+    }
+}
+/* Horizontal chroma filter, '2x2' variant, averaged with dst contents. */
+static void avc_chroma_hz_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint32_t coeff0, uint32_t coeff1)
+{
+    uint16_t out0, out1;
+    uint32_t load0, load1;
+    v16i8 src0, src1;
+    v16u8 dst_data = { 0 };
+    v8u16 res_r;
+    v16u8 res;
+    v16i8 mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+
+    LD_SB2(src, src_stride, src0, src1);
+    /* NOTE(review): LW presumably reads 4 bytes per row; SH stores only 2. */
+    load0 = LW(dst);
+    load1 = LW(dst + dst_stride);
+
+    INSERT_W2_UB(load0, load1, dst_data);
+
+    src0 = __msa_vshf_b(mask, src1, src0);
+    /* Dot product, shift left by 3, then SRARI by 6 and __msa_sat_u_h. */
+    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+
+    res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+    dst_data = __msa_aver_u_b(res, dst_data);
+    /* Halfwords 0 and 2 of the average are written back, one per row. */
+    out0 = __msa_copy_u_h((v8i16) dst_data, 0);
+    out1 = __msa_copy_u_h((v8i16) dst_data, 2);
+
+    SH(out0, dst);
+    dst += dst_stride;
+    SH(out1, dst);
+}
+/* Horizontal chroma filter, '2x4' variant, averaged with dst contents. */
+static void avc_chroma_hz_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint32_t coeff0, uint32_t coeff1)
+{
+    v16u8 src0, src1, src2, src3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v8u16 res_r;
+    v16i8 res, mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[64]);
+    /* NOTE(review): dst is read with the vector load LD_UB4; confirm bounds. */
+    LD_UB4(src, src_stride, src0, src1, src2, src3);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    /* Presumably gathers the leading halfword of dst1..dst3 into dst0. */
+    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
+    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
+    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
+
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
+
+    src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
+    /* Dot product, shift left by 3, then SRARI by 6 and __msa_sat_u_h. */
+    res_r = __msa_dotp_u_h(src0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+
+    res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+    dst0 = __msa_aver_u_b((v16u8) res, dst0);
+
+    ST2x4_UB(dst0, 0, dst, dst_stride);
+}
+/* Horizontal chroma filter, '2x8' variant, averaged with dst contents. */
+static void avc_chroma_hz_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint32_t coeff0, uint32_t coeff1)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8u16 res0_r, res1_r;
+    v16u8 res0, res1, mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_UB(&chroma_mask_arr[64]);
+    /* NOTE(review): dst is read with the vector load LD_UB8; confirm bounds. */
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+    /* Presumably gathers leading halfwords: dst1..3 -> dst0, dst5..7 -> dst4. */
+    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
+    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
+    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
+
+    dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 1, (v8i16) dst5);
+    dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 2, (v8i16) dst6);
+    dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 3, (v8i16) dst7);
+
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
+    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
+    ILVR_D2_UB(src2, src0, src6, src4, src0, src4);
+    DOTP_UB2_UH(src0, src4, coeff_vec, coeff_vec, res0_r, res1_r);
+    /* Shift left by 3, then SRARI by 6 and saturate via SAT_UH2_UH. */
+    res0_r <<= 3;
+    res1_r <<= 3;
+
+    SRARI_H2_UH(res0_r, res1_r, 6);
+    SAT_UH2_UH(res0_r, res1_r, 7);
+    PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst4, dst0, dst4);
+
+    ST2x4_UB(dst0, 0, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST2x4_UB(dst4, 0, dst, dst_stride);
+}
+/* Entry point of the averaging hz '2w' kernels; only heights 2/4/8 act. */
+static void avc_chroma_hz_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint32_t coeff0, uint32_t coeff1,
+                                              int32_t height)
+{
+    if (height == 8) {
+        avc_chroma_hz_and_aver_dst_2x8_msa(src, src_stride, dst, dst_stride,
+                                           coeff0, coeff1);
+    } else if (height == 4) {
+        avc_chroma_hz_and_aver_dst_2x4_msa(src, src_stride, dst, dst_stride,
+                                           coeff0, coeff1);
+    } else if (height == 2) {
+        avc_chroma_hz_and_aver_dst_2x2_msa(src, src_stride, dst, dst_stride,
+                                           coeff0, coeff1);
+    }
+}
+/* Horizontal chroma filter, '4x2' variant, averaged with dst contents. */
+static void avc_chroma_hz_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint32_t coeff0, uint32_t coeff1)
+{
+    uint32_t load0, load1;
+    v16i8 src0, src1;
+    v16u8 dst_data = { 0 };
+    v8u16 res_r;
+    v16i8 res, mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+
+    LD_SB2(src, src_stride, src0, src1);
+    /* One 32-bit word per dst row is fetched and packed into dst_data. */
+    load0 = LW(dst);
+    load1 = LW(dst + dst_stride);
+
+    INSERT_W2_UB(load0, load1, dst_data);
+
+    src0 = __msa_vshf_b(mask, src1, src0);
+    /* Dot product, shift left by 3, then SRARI by 6 and __msa_sat_u_h. */
+    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+    res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+    dst_data = __msa_aver_u_b((v16u8) res, dst_data);
+
+    ST4x2_UB(dst_data, dst, dst_stride);
+}
+/* Horizontal chroma filter, 4-wide, averaged with dst; 4 rows per pass. */
+static void avc_chroma_hz_and_aver_dst_4x4multiple_msa(uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride,
+                                                       uint32_t coeff0,
+                                                       uint32_t coeff1,
+                                                       int32_t height)
+{
+    uint32_t load0, load1;
+    uint32_t row;
+    v16u8 src0, src1, src2, src3;
+    v16u8 dst0 = { 0 };
+    v16u8 dst1 = { 0 };
+    v8u16 res0_r, res1_r;
+    v16u8 res0, res1, mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_UB(&chroma_mask_arr[0]);
+    /* height >> 2 passes: a remainder of 1..3 rows is never processed. */
+    for (row = (height >> 2); row--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        /* dst rows 0/1 are packed into dst0 and rows 2/3 into dst1. */
+        load0 = LW(dst);
+        load1 = LW(dst + dst_stride);
+
+        INSERT_W2_UB(load0, load1, dst0);
+
+        load0 = LW(dst + 2 * dst_stride);
+        load1 = LW(dst + 3 * dst_stride);
+
+        INSERT_W2_UB(load0, load1, dst1);
+
+        VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
+        DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);
+        /* Shift left by 3, then SRARI by 6 and saturate via SAT_UH2_UH. */
+        res0_r <<= 3;
+        res1_r <<= 3;
+
+        SRARI_H2_UH(res0_r, res1_r, 6);
+        SAT_UH2_UH(res0_r, res1_r, 7);
+        PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);
+        AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
+
+        ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+/* Entry point of the averaging hz '4w' filter: picks a kernel by height. */
+static void avc_chroma_hz_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint32_t coeff0, uint32_t coeff1,
+                                              int32_t height)
+{
+    if (height != 2) {  /* every height other than 2 takes the looped kernel */
+        avc_chroma_hz_and_aver_dst_4x4multiple_msa(src, src_stride,
+                                                   dst, dst_stride,
+                                                   coeff0, coeff1, height);
+    } else {
+        avc_chroma_hz_and_aver_dst_4x2_msa(src, src_stride, dst, dst_stride,
+                                           coeff0, coeff1);
+    }
+}
+/* Horizontal chroma filter, '8w' variant, averaged with dst; 4 rows/pass. */
+static void avc_chroma_hz_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint32_t coeff0, uint32_t coeff1,
+                                              int32_t height)
+{
+    uint32_t row;
+    v16u8 src0, src1, src2, src3, out0, out1;
+    v8u16 res0, res1, res2, res3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 mask;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    mask = LD_SB(&chroma_mask_arr[32]);
+    /* height >> 2 passes: a remainder of 1..3 rows is never processed. */
+    for (row = height >> 2; row--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
+        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
+        DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
+                    coeff_vec, res0, res1, res2, res3);
+        SLLI_4V(res0, res1, res2, res3, 3);
+        SRARI_H4_UH(res0, res1, res2, res3, 6);
+        SAT_UH4_UH(res0, res1, res2, res3, 7);
+        PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
+        PCKEV_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1); /* dst0/dst1 reused */
+        AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+/* Vertical chroma filter, '2x2' variant, averaged with dst contents. */
+static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint32_t coeff0, uint32_t coeff1)
+{
+    uint16_t out0, out1;
+    uint32_t load0, load1;
+    v16i8 src0, src1, src2, tmp0, tmp1, res;
+    v16u8 dst_data = { 0 };
+    v8u16 res_r;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+    /* NOTE(review): LW presumably reads 4 bytes per row; SH stores only 2. */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    load0 = LW(dst);
+    load1 = LW(dst + dst_stride);
+
+    INSERT_W2_UB(load0, load1, dst_data);
+
+    ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);
+    /* Dot product, shift left by 3, then SRARI by 6 and __msa_sat_u_h. */
+    tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
+    res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+    res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+    dst_data = __msa_aver_u_b((v16u8) res, dst_data);
+    out0 = __msa_copy_u_h((v8i16) dst_data, 0);
+    out1 = __msa_copy_u_h((v8i16) dst_data, 2);
+    /* Halfwords 0 and 2 of the average are written back, one per row. */
+    SH(out0, dst);
+    dst += dst_stride;
+    SH(out1, dst);
+}
+/* Vertical chroma filter, '2x4' variant, averaged with dst contents. */
+static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint32_t coeff0, uint32_t coeff1)
+{
+    uint32_t load0, load1;
+    v16i8 src0, src1, src2, src3, src4;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8u16 res_r;
+    v8i16 res;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+    v16u8 dst_data = { 0 };
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+
+    /* NOTE(review): LW presumably reads 4 bytes; one halfword is inserted. */
+    load0 = LW(dst);
+    load1 = LW(dst + dst_stride);
+
+    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, load0);
+    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, load1);
+
+    load0 = LW(dst + 2 * dst_stride);
+    load1 = LW(dst + 3 * dst_stride);
+
+    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, load0);
+    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, load1);
+
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+               tmp0, tmp1, tmp2, tmp3);
+    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+
+    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
+
+    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+
+    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+    res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
+
+    ST2x4_UB(res, 0, dst, dst_stride);
+}
+/* Vertical chroma filter, '2x8' variant, averaged with dst contents. */
+static void avc_chroma_vt_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint32_t coeff0, uint32_t coeff1)
+{
+    uint32_t load0, load1, load2, load3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 res;
+    v8u16 res_r;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+    v16u8 dst_data0 = { 0 };
+    v16u8 dst_data1 = { 0 };
+    /* Nine vectors in total: five here, four more after advancing src. */
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+    LD_SB4(src, src_stride, src5, src6, src7, src8);
+    /* NOTE(review): LW4 fills 32-bit words; one halfword of each is inserted. */
+    LW4(dst, dst_stride, load0, load1, load2, load3);
+
+    dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 0, load0);
+    dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 1, load1);
+    dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 2, load2);
+    dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 3, load3);
+
+    LW4(dst + 4 * dst_stride, dst_stride, load0, load1, load2, load3);
+
+    dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 0, load0);
+    dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 1, load1);
+    dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 2, load2);
+    dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 3, load3);
+    /* First half: src0..src4 are filtered and averaged with dst_data0. */
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+               tmp0, tmp1, tmp2, tmp3);
+
+    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+
+    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
+
+    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+
+    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+    res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data0);
+
+    ST2x4_UB(res, 0, dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* Second half: src4..src8 are filtered and averaged with dst_data1. */
+    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
+               tmp0, tmp1, tmp2, tmp3);
+
+    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+
+    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
+
+    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+
+    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+    res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data1);
+
+    ST2x4_UB(res, 0, dst, dst_stride);
+}
+/* Entry point of the averaging vt '2w' kernels; only heights 2/4/8 act. */
+static void avc_chroma_vt_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint32_t coeff0, uint32_t coeff1,
+                                              int32_t height)
+{
+    if (height == 8) {
+        avc_chroma_vt_and_aver_dst_2x8_msa(src, src_stride, dst, dst_stride,
+                                           coeff0, coeff1);
+    } else if (height == 4) {
+        avc_chroma_vt_and_aver_dst_2x4_msa(src, src_stride, dst, dst_stride,
+                                           coeff0, coeff1);
+    } else if (height == 2) {
+        avc_chroma_vt_and_aver_dst_2x2_msa(src, src_stride, dst, dst_stride,
+                                           coeff0, coeff1);
+    }
+}
+/* Vertical chroma filter, '4x2' variant, averaged with dst contents. */
+static void avc_chroma_vt_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint32_t coeff0, uint32_t coeff1)
+{
+    uint32_t load0, load1;
+    v16i8 src0, src1, src2, tmp0, tmp1;
+    v16u8 dst_data = { 0 };
+    v8u16 res_r;
+    v16u8 res;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    /* One 32-bit word per dst row is fetched and packed into dst_data. */
+    load0 = LW(dst);
+    load1 = LW(dst + dst_stride);
+
+    INSERT_W2_UB(load0, load1, dst_data);
+    ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);
+
+    tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
+    /* Dot product, shift left by 3, then SRARI by 6 and __msa_sat_u_h. */
+    res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec);
+    res_r <<= 3;
+    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
+    res_r = __msa_sat_u_h(res_r, 7);
+    res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
+    res = __msa_aver_u_b(res, dst_data);
+
+    ST4x2_UB(res, dst, dst_stride);
+}
+/* Vertical chroma filter, 4-wide, averaged with dst; 4 rows per pass. */
+static void avc_chroma_vt_and_aver_dst_4x4mul_msa(uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  uint32_t coeff0,
+                                                  uint32_t coeff1,
+                                                  int32_t height)
+{
+    uint32_t load0, load1, row;
+    v16i8 src0, src1, src2, src3, src4;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v16u8 dst0 = { 0 };
+    v16u8 dst1 = { 0 };
+    v8u16 res0_r, res1_r;
+    v16u8 res0, res1;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+    /* src0 is primed from src once, ahead of the loop. */
+    src0 = LD_SB(src);
+    src += src_stride;
+    /* height >> 2 passes: a remainder of 1..3 rows is never processed. */
+    for (row = (height >> 2); row--;) {
+        LD_SB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        load0 = LW(dst);
+        load1 = LW(dst + dst_stride);
+        /* dst rows 0/1 are packed into dst0 and rows 2/3 into dst1. */
+        INSERT_W2_UB(load0, load1, dst0);
+        load0 = LW(dst + 2 * dst_stride);
+        load1 = LW(dst + 3 * dst_stride);
+        INSERT_W2_UB(load0, load1, dst1);
+
+        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+                   tmp0, tmp1, tmp2, tmp3);
+        ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+        DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
+        /* Shift left by 3, then SRARI by 6 and saturate via SAT_UH2_UH. */
+        res0_r <<= 3;
+        res1_r <<= 3;
+
+        SRARI_H2_UH(res0_r, res1_r, 6);
+        SAT_UH2_UH(res0_r, res1_r, 7);
+        PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);
+        AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+
+        ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        src0 = src4; /* carry the last loaded row into the next pass */
+    }
+}
+/* Entry point of the averaging vt '4w' filter: picks a kernel by height. */
+static void avc_chroma_vt_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint32_t coeff0, uint32_t coeff1,
+                                              int32_t height)
+{
+    if (height != 2) {  /* every height other than 2 takes the looped kernel */
+        avc_chroma_vt_and_aver_dst_4x4mul_msa(src, src_stride, dst, dst_stride,
+                                              coeff0, coeff1, height);
+    } else {
+        avc_chroma_vt_and_aver_dst_4x2_msa(src, src_stride, dst, dst_stride,
+                                           coeff0, coeff1);
+    }
+}
+
+static void avc_chroma_vt_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint32_t coeff0, uint32_t coeff1,
+                                              int32_t height)
+{   /* 8-wide vertical chroma filter whose result is averaged with dst. */
+    uint32_t row;
+    v16u8 src0, src1, src2, src3, src4;
+    v16u8 out0, out1;
+    v8u16 res0, res1, res2, res3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
+    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
+    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
+    /* coeff_vec interleaves the two coefficients byte by byte. */
+    src0 = LD_UB(src);
+    src += src_stride;
+    /* Four rows per pass; rows past the last full group of 4 are skipped. */
+    for (row = height >> 2; row--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+                   src0, src1, src2, src3);  /* each row paired with the next */
+        DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
+                    coeff_vec, res0, res1, res2, res3);
+        SLLI_4V(res0, res1, res2, res3, 3);      /* presumably << 3 ...    */
+        SRARI_H4_UH(res0, res1, res2, res3, 6);  /* ... then rounding >> 6 */
+        SAT_UH4_UH(res0, res1, res2, res3, 7);
+        PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
+        PCKEV_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+        AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+
+        dst += (4 * dst_stride);
+        src0 = src4;  /* last loaded row opens the next pass */
+    }
+}
+
+static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint32_t coef_hor0,
+                                               uint32_t coef_hor1,
+                                               uint32_t coef_ver0,
+                                               uint32_t coef_ver1)
+{   /* 2x2 block: horizontal + vertical filter, averaged with dst. */
+    uint16_t out0, out1;
+    v16u8 dst0, dst1;
+    v16u8 src0, src1, src2;
+    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+    v16i8 res, mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+    /* Shuffle pattern is read from chroma_mask_arr, defined outside this block. */
+    mask = LD_SB(&chroma_mask_arr[48]);
+    /* Three source rows for two output rows; dst is read for the average. */
+    LD_UB3(src, src_stride, src0, src1, src2);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+    /* Add both vertical taps, rounding >> 6, saturate, pack to bytes. */
+    res_vt0 += res_vt1;
+    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+    res_vt0 = __msa_sat_u_h(res_vt0, 7);
+    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
+    dst0 = __msa_aver_u_b((v16u8) res, dst0);
+    out0 = __msa_copy_u_h((v8i16) dst0, 0);
+    out1 = __msa_copy_u_h((v8i16) dst0, 1);
+    /* One 16-bit value is stored per output row. */
+    SH(out0, dst);
+    dst += dst_stride;
+    SH(out1, dst);
+}
+
+static void avc_chroma_hv_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint32_t coef_hor0,
+                                               uint32_t coef_hor1,
+                                               uint32_t coef_ver0,
+                                               uint32_t coef_ver1)
+{   /* 2x4 block: horizontal + vertical filter, averaged with dst. */
+    v16u8 src0, src1, src2, src3, src4;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+    v16i8 res, mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[48]);
+    /* Five source rows (0..4) are loaded for the four output rows. */
+    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
+    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
+    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+    /* Add both vertical taps, rounding >> 6, saturate, pack to bytes. */
+    res_vt0 += res_vt1;
+    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+    res_vt0 = __msa_sat_u_h(res_vt0, 7);
+    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+    /* Gather halfword 0 of dst rows 1..3 into halfwords 1..3 of dst0. */
+    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
+    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
+    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
+    dst0 = __msa_aver_u_b((v16u8) res, dst0);
+
+    ST2x4_UB(dst0, 0, dst, dst_stride);
+}
+
+static void avc_chroma_hv_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint32_t coef_hor0,
+                                               uint32_t coef_hor1,
+                                               uint32_t coef_ver0,
+                                               uint32_t coef_ver1)
+{   /* 2x8 block: horizontal + vertical filter, averaged with dst. */
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+    v16i8 res, mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[48]);
+    /* Nine source rows in total: rows 0..4 here, rows 5..8 after the step. */
+    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+    LD_UB4(src, src_stride, src5, src6, src7, src8);
+
+    LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+    /* dst0 gathers halfword 0 of dst rows 0..3 ... */
+    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
+    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
+    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
+    /* ... and dst4 does the same for dst rows 4..7. */
+    dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 1, (v8i16) dst5);
+    dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 2, (v8i16) dst6);
+    dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 3, (v8i16) dst7);
+    /* src0/src1 are rebuilt from rows 0..4, src4/src5 from rows 4..8. */
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
+    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
+    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, tmp0, tmp1);
+    VSHF_B2_UB(src5, src6, src7, src8, mask, mask, tmp2, tmp3);
+    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src4, src5);
+    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+    /* First four output rows: sum taps, round, saturate, pack, average. */
+    res_vt0 += res_vt1;
+    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+    res_vt0 = __msa_sat_u_h(res_vt0, 7);
+    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+    dst0 = __msa_aver_u_b((v16u8) res, dst0);
+
+    ST2x4_UB(dst0, 0, dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* Last four output rows: the same arithmetic on src4/src5. */
+    DOTP_UB2_UH(src4, src5, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+
+    res_vt0 += res_vt1;
+    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+    res_vt0 = __msa_sat_u_h(res_vt0, 7);
+    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+    dst4 = __msa_aver_u_b((v16u8) res, dst4);
+
+    ST2x4_UB(dst4, 0, dst, dst_stride);
+}
+
+static void avc_chroma_hv_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint32_t coef_hor0,
+                                              uint32_t coef_hor1,
+                                              uint32_t coef_ver0,
+                                              uint32_t coef_ver1,
+                                              int32_t height)
+{
+    /* 2-wide horizontal + vertical filter averaged into dst.  Dispatches on
+     * the block height; only 8, 4 and 2 have kernels, and any other height
+     * returns without touching dst. */
+    if (height == 8) {
+        avc_chroma_hv_and_aver_dst_2x8_msa(src, src_stride, dst, dst_stride,
+            coef_hor0, coef_hor1, coef_ver0, coef_ver1);
+    } else if (height == 4) {
+        avc_chroma_hv_and_aver_dst_2x4_msa(src, src_stride, dst, dst_stride,
+            coef_hor0, coef_hor1, coef_ver0, coef_ver1);
+    } else if (height == 2) {
+        avc_chroma_hv_and_aver_dst_2x2_msa(src, src_stride, dst, dst_stride,
+            coef_hor0, coef_hor1, coef_ver0, coef_ver1);
+    }
+}
+
+static void avc_chroma_hv_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint32_t coef_hor0,
+                                               uint32_t coef_hor1,
+                                               uint32_t coef_ver0,
+                                               uint32_t coef_ver1)
+{   /* 4x2 block: horizontal + vertical filter, averaged with dst. */
+    v16u8 src0, src1, src2;
+    v16u8 dst0, dst1;
+    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
+    v16i8 res, mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+    /* Three source rows for two output rows; dst is read for the average. */
+    LD_UB3(src, src_stride, src0, src1, src2);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
+    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+    /* Add both vertical taps, rounding >> 6, saturate, pack to bytes. */
+    res_vt0 += res_vt1;
+    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
+    res_vt0 = __msa_sat_u_h(res_vt0, 7);
+    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+    dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
+    dst0 = __msa_aver_u_b((v16u8) res, dst0);
+
+    ST4x2_UB(dst0, dst, dst_stride);
+}
+
+static void avc_chroma_hv_and_aver_dst_4x4mul_msa(uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  uint32_t coef_hor0,
+                                                  uint32_t coef_hor1,
+                                                  uint32_t coef_ver0,
+                                                  uint32_t coef_ver1,
+                                                  int32_t height)
+{   /* 4-wide horizontal + vertical filter, averaged with dst, 4 rows a pass. */
+    uint32_t row;
+    v16u8 src0, src1, src2, src3, src4;
+    v16u8 dst0, dst1, dst2, dst3;
+    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
+    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+    v16u8 res0, res1;
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+    /* The first source row is loaded once, ahead of the loop. */
+    src0 = LD_UB(src);
+    src += src_stride;
+    /* Rows past the last full group of four are not written. */
+    for (row = (height >> 2); row--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+        VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+        VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
+        DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
+                    coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
+                    res_hz3);
+        MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
+             coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
+             res_vt3);
+        ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
+        SRARI_H2_UH(res_vt0, res_vt1, 6);
+        SAT_UH2_UH(res_vt0, res_vt1, 7);
+        PCKEV_B2_UB(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
+        /* Word 0 of dst rows 1 and 3 goes into word 1 of rows 0 and 2. */
+        dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
+        dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);
+
+        AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
+
+        ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        src0 = src4;  /* last loaded row opens the next pass */
+    }
+}
+
+static void avc_chroma_hv_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint32_t coef_hor0,
+                                              uint32_t coef_hor1,
+                                              uint32_t coef_ver0,
+                                              uint32_t coef_ver1,
+                                              int32_t height)
+{
+    /* 4-wide horizontal + vertical filter averaged into dst.  Height 2 has
+     * a dedicated kernel; every other height goes to the 4-row loop. */
+    if (height != 2) {
+        avc_chroma_hv_and_aver_dst_4x4mul_msa(src, src_stride, dst, dst_stride,
+            coef_hor0, coef_hor1, coef_ver0, coef_ver1, height);
+    } else {
+        avc_chroma_hv_and_aver_dst_4x2_msa(src, src_stride, dst, dst_stride,
+            coef_hor0, coef_hor1, coef_ver0, coef_ver1);
+    }
+}
+
+static void avc_chroma_hv_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint32_t coef_hor0,
+                                              uint32_t coef_hor1,
+                                              uint32_t coef_ver0,
+                                              uint32_t coef_ver1,
+                                              int32_t height)
+{   /* 8-wide horizontal + vertical filter, averaged with dst, 4 rows a pass. */
+    uint32_t row;
+    v16u8 src0, src1, src2, src3, src4, out0, out1;
+    v8u16 res_hz0, res_hz1, res_hz2;
+    v8u16 res_hz3, res_hz4;
+    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[32]);
+
+    src0 = LD_UB(src);
+    src += src_stride;
+    /* Horizontal pass over the first row; res_hz0 is carried between passes. */
+    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
+    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
+    /* Rows past the last full group of four are not written. */
+    for (row = (height >> 2); row--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
+        VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
+        DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
+                    coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
+                    res_hz4);
+        MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
+             coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
+             res_vt3);
+        /* Second vertical tap: the previous row's horizontal result. */
+        res_vt0 += (res_hz0 * coeff_vt_vec1);
+        res_vt1 += (res_hz1 * coeff_vt_vec1);
+        res_vt2 += (res_hz2 * coeff_vt_vec1);
+        res_vt3 += (res_hz3 * coeff_vt_vec1);
+
+        SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
+        SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
+
+        PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
+        PCKEV_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+        AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* The last row's horizontal result seeds the next pass. */
+        res_hz0 = res_hz4;
+    }
+}
+
+static void copy_width8_msa(uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            int32_t height)
+{   /* Copy an 8-byte-wide block from src to dst, row by row. */
+    int32_t cnt;
+    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    /* Unroll = first of 12/8/4/2 dividing height; odd heights copy nothing. */
+    if (0 == height % 12) {
+        for (cnt = (height / 12); cnt--;) {
+            LD_UB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+            /* Doubleword 0 of each loaded vector is what gets stored. */
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+            out4 = __msa_copy_u_d((v2i64) src4, 0);
+            out5 = __msa_copy_u_d((v2i64) src5, 0);
+            out6 = __msa_copy_u_d((v2i64) src6, 0);
+            out7 = __msa_copy_u_d((v2i64) src7, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+            SD4(out4, out5, out6, out7, dst, dst_stride);
+            dst += (4 * dst_stride);
+            /* Remaining four rows of the 12-row group. */
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 8) {
+        for (cnt = height >> 3; cnt--;) {
+            LD_UB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+            out4 = __msa_copy_u_d((v2i64) src4, 0);
+            out5 = __msa_copy_u_d((v2i64) src5, 0);
+            out6 = __msa_copy_u_d((v2i64) src6, 0);
+            out7 = __msa_copy_u_d((v2i64) src7, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+            SD4(out4, out5, out6, out7, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 4) {
+        for (cnt = (height / 4); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 2) {
+        for (cnt = (height / 2); cnt--;) {
+            LD_UB2(src, src_stride, src0, src1);
+            src += (2 * src_stride);
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+
+            SD(out0, dst);
+            dst += dst_stride;
+            SD(out1, dst);
+            dst += dst_stride;
+        }
+    }
+}
+
+static void avg_width4_msa(uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride,
+                           int32_t height)
+{   /* Average a 4-byte-wide block of src into dst, row by row. */
+    int32_t cnt;
+    uint32_t out0, out1, out2, out3;
+    v16u8 src0, src1, src2, src3;
+    v16u8 dst0, dst1, dst2, dst3;
+    /* 4 rows per pass if height % 4 == 0, else 2 if even; odd: no-op. */
+    if (0 == (height % 4)) {
+        for (cnt = (height / 4); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+
+            LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                        dst0, dst1, dst2, dst3);
+            /* Only word 0 of each averaged vector is written back. */
+            out0 = __msa_copy_u_w((v4i32) dst0, 0);
+            out1 = __msa_copy_u_w((v4i32) dst1, 0);
+            out2 = __msa_copy_u_w((v4i32) dst2, 0);
+            out3 = __msa_copy_u_w((v4i32) dst3, 0);
+            SW4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == (height % 2)) {
+        for (cnt = (height / 2); cnt--;) {
+            LD_UB2(src, src_stride, src0, src1);
+            src += (2 * src_stride);
+
+            LD_UB2(dst, dst_stride, dst0, dst1);
+
+            AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
+
+            out0 = __msa_copy_u_w((v4i32) dst0, 0);
+            out1 = __msa_copy_u_w((v4i32) dst1, 0);
+            SW(out0, dst);
+            dst += dst_stride;
+            SW(out1, dst);
+            dst += dst_stride;
+        }
+    }
+}
+
+static void avg_width8_msa(uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride,
+                           int32_t height)
+{   /* Average an 8-byte-wide block of src into dst, four rows per pass. */
+    int32_t cnt;
+    uint64_t out0, out1, out2, out3;
+    v16u8 src0, src1, src2, src3;
+    v16u8 dst0, dst1, dst2, dst3;
+    /* height / 4 passes: rows past the last full group of 4 are skipped. */
+    for (cnt = (height / 4); cnt--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                    dst0, dst1, dst2, dst3);
+        /* Only doubleword 0 of each averaged vector is written back. */
+        out0 = __msa_copy_u_d((v2i64) dst0, 0);
+        out1 = __msa_copy_u_d((v2i64) dst1, 0);
+        out2 = __msa_copy_u_d((v2i64) dst2, 0);
+        out3 = __msa_copy_u_d((v2i64) dst3, 0);
+        SD4(out0, out1, out2, out3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
+                                int stride, int height, int x, int y)
+{
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+    /* 8-wide block: which of x and y are non-zero picks the code path. */
+    if (!x && !y) {
+        copy_width8_msa(src, stride, dst, stride, height);
+    } else if (!y) {
+        avc_chroma_hz_8w_msa(src, stride, dst, stride, x, 8 - x, height);
+    } else if (!x) {
+        avc_chroma_vt_8w_msa(src, stride, dst, stride, y, 8 - y, height);
+    } else {
+        avc_chroma_hv_8w_msa(src, stride, dst, stride,
+                             x, 8 - x, y, 8 - y, height);
+    }
+}
+
+void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
+                                int stride, int height, int x, int y)
+{
+    int32_t cnt;
+
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+    /* Non-zero x and/or y select a filter kernel; x == y == 0 is a copy. */
+    if (x && y) {
+        avc_chroma_hv_4w_msa(src, stride, dst,
+                             stride, x, (8 - x), y, (8 - y), height);
+    } else if (x) {
+        avc_chroma_hz_4w_msa(src, stride, dst, stride, x, (8 - x), height);
+    } else if (y) {
+        avc_chroma_vt_4w_msa(src, stride, dst, stride, y, (8 - y), height);
+    } else {    /* bytewise: a uint32_t access would need 4-byte alignment */
+        for (cnt = height; cnt--; src += stride, dst += stride) {
+            dst[0] = src[0];
+            dst[1] = src[1];
+            dst[2] = src[2];
+            dst[3] = src[3];
+        }
+    }
+}
+
+void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
+                                int stride, int height, int x, int y)
+{
+    int32_t cnt;
+
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+    /* Non-zero x and/or y select a filter kernel; x == y == 0 is a copy. */
+    if (x && y) {
+        avc_chroma_hv_2w_msa(src, stride, dst,
+                             stride, x, (8 - x), y, (8 - y), height);
+    } else if (x) {
+        avc_chroma_hz_2w_msa(src, stride, dst, stride, x, (8 - x), height);
+    } else if (y) {
+        avc_chroma_vt_2w_msa(src, stride, dst, stride, y, (8 - y), height);
+    } else {
+        /* Bytewise copy: a uint16_t access would need 2-byte alignment. */
+        for (cnt = height; cnt--; src += stride, dst += stride) {
+            dst[0] = src[0];
+            dst[1] = src[1];
+        }
+    }
+}
+
+void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
+                                int stride, int height, int x, int y)
+{
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+    /* 8-wide variant that averages into dst.  Which of x and y are
+     * non-zero decides between plain average, 1-D and 2-D filtering. */
+    if (!x && !y) {
+        avg_width8_msa(src, stride, dst, stride, height);
+    } else if (!y) {
+        avc_chroma_hz_and_aver_dst_8w_msa(src, stride, dst, stride,
+                                          x, 8 - x, height);
+    } else if (!x) {
+        avc_chroma_vt_and_aver_dst_8w_msa(src, stride, dst, stride,
+                                          y, 8 - y, height);
+    } else {
+        avc_chroma_hv_and_aver_dst_8w_msa(src, stride, dst, stride,
+                                          x, 8 - x, y, 8 - y, height);
+    }
+}
+
+void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
+                                int stride, int height, int x, int y)
+{
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+    /* 4-wide variant that averages into dst; x and y pick the path. */
+    if (!x && !y) {
+        avg_width4_msa(src, stride, dst, stride, height);
+    } else if (!y) {
+        avc_chroma_hz_and_aver_dst_4w_msa(src, stride, dst, stride,
+                                          x, 8 - x, height);
+    } else if (!x) {
+        avc_chroma_vt_and_aver_dst_4w_msa(src, stride, dst, stride,
+                                          y, 8 - y, height);
+    } else {
+        avc_chroma_hv_and_aver_dst_4w_msa(src, stride, dst, stride,
+                                          x, 8 - x, y, 8 - y, height);
+    }
+}
+
+void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
+                                int stride, int height, int x, int y)
+{
+    int32_t cnt;
+
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+    /* 2-wide variant that averages into dst.  With x == y == 0 no filter
+     * is involved and the two bytes of each row are averaged in plain C. */
+    if (!x && !y) {
+        /* (a + b + 1) >> 1 per byte, i.e. the average rounded upwards;
+         * both pointers advance by stride after every row. */
+        for (cnt = height; cnt--; src += stride, dst += stride) {
+            dst[0] = (dst[0] + src[0] + 1) >> 1;
+            dst[1] = (dst[1] + src[1] + 1) >> 1;
+        }
+    } else if (!y) {
+        avc_chroma_hz_and_aver_dst_2w_msa(src, stride, dst, stride,
+                                          x, 8 - x, height);
+    } else if (!x) {
+        avc_chroma_vt_and_aver_dst_2w_msa(src, stride, dst, stride,
+                                          y, 8 - y, height);
+    } else {
+        avc_chroma_hv_and_aver_dst_2w_msa(src, stride, dst, stride,
+                                          x, 8 - x, y, 8 - y, height);
+    }
+}
diff --git a/libavcodec/mips/h264dsp_init_mips.c b/libavcodec/mips/h264dsp_init_mips.c
new file mode 100644
index 0000000..1fe7f84
--- /dev/null
+++ b/libavcodec/mips/h264dsp_init_mips.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264dsp_mips.h"
+
+#if HAVE_MSA
+static av_cold void h264dsp_init_msa(H264DSPContext *c,
+                                     const int bit_depth,
+                                     const int chroma_format_idc)
+{
+    /* Fill c with the MSA implementations.  Only 8-bit content is handled;
+     * for every other bit depth c is returned unchanged.  Each assignment
+     * targets a different field, so they are grouped here by purpose. */
+    if (bit_depth != 8)
+        return;
+
+    /* Luma deblocking */
+    c->h264_v_loop_filter_luma       = ff_h264_v_lpf_luma_inter_msa;
+    c->h264_h_loop_filter_luma       = ff_h264_h_lpf_luma_inter_msa;
+    c->h264_v_loop_filter_luma_intra = ff_h264_v_lpf_luma_intra_msa;
+    c->h264_h_loop_filter_luma_intra = ff_h264_h_lpf_luma_intra_msa;
+    c->h264_h_loop_filter_luma_mbaff =
+        ff_h264_h_loop_filter_luma_mbaff_msa;
+    c->h264_h_loop_filter_luma_mbaff_intra =
+        ff_h264_h_loop_filter_luma_mbaff_intra_msa;
+
+    /* Chroma deblocking that is set for every chroma format */
+    c->h264_v_loop_filter_chroma       = ff_h264_v_lpf_chroma_inter_msa;
+    c->h264_v_loop_filter_chroma_intra = ff_h264_v_lpf_chroma_intra_msa;
+
+    /* Entries that depend on chroma_format_idc.  Nothing is assigned to
+     * h264_h_loop_filter_chroma_mbaff when it is <= 1, nor to
+     * h264_h_loop_filter_chroma_intra when it is > 1. */
+    if (chroma_format_idc <= 1) {
+        c->h264_h_loop_filter_chroma       = ff_h264_h_lpf_chroma_inter_msa;
+        c->h264_h_loop_filter_chroma_intra = ff_h264_h_lpf_chroma_intra_msa;
+        c->h264_idct_add8                  = ff_h264_idct_add8_msa;
+    } else {
+        c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma422_msa;
+        c->h264_h_loop_filter_chroma_mbaff =
+            ff_h264_h_loop_filter_chroma422_mbaff_msa;
+        c->h264_idct_add8 = ff_h264_idct_add8_422_msa;
+    }
+
+    /* Weighted MC */
+    c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_8_msa;
+    c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_8_msa;
+    c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels4_8_msa;
+
+    c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_8_msa;
+    c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_8_msa;
+    c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels4_8_msa;
+
+    /* IDCT and luma DC dequantisation */
+    c->h264_idct_add             = ff_h264_idct_add_msa;
+    c->h264_idct8_add            = ff_h264_idct8_addblk_msa;
+    c->h264_idct_dc_add          = ff_h264_idct4x4_addblk_dc_msa;
+    c->h264_idct8_dc_add         = ff_h264_idct8_dc_addblk_msa;
+    c->h264_idct_add16           = ff_h264_idct_add16_msa;
+    c->h264_idct_add16intra      = ff_h264_idct_add16_intra_msa;
+    c->h264_idct8_add4           = ff_h264_idct8_add4_msa;
+    c->h264_luma_dc_dequant_idct = ff_h264_deq_idct_luma_dc_msa;
+}
+#endif  // #if HAVE_MSA
+
+#if HAVE_MMI
+static av_cold void h264dsp_init_mmi(H264DSPContext *c, const int bit_depth,
+                                     const int chroma_format_idc)
+{
+    /* Fill c with the MMI implementations.  Only 8-bit content is handled;
+     * for every other bit depth c is returned unchanged. */
+    if (bit_depth != 8)
+        return;
+
+    /* IDCT, residual add and luma DC dequantisation */
+    c->h264_add_pixels4_clear    = ff_h264_add_pixels4_8_mmi;
+    c->h264_idct_add             = ff_h264_idct_add_8_mmi;
+    c->h264_idct8_add            = ff_h264_idct8_add_8_mmi;
+    c->h264_idct_dc_add          = ff_h264_idct_dc_add_8_mmi;
+    c->h264_idct8_dc_add         = ff_h264_idct8_dc_add_8_mmi;
+    c->h264_idct_add16           = ff_h264_idct_add16_8_mmi;
+    c->h264_idct_add16intra      = ff_h264_idct_add16intra_8_mmi;
+    c->h264_idct8_add4           = ff_h264_idct8_add4_8_mmi;
+    c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_8_mmi;
+
+    /* Weighted and bi-weighted prediction tables */
+    c->weight_h264_pixels_tab[0] = ff_h264_weight_pixels16_8_mmi;
+    c->weight_h264_pixels_tab[1] = ff_h264_weight_pixels8_8_mmi;
+    c->weight_h264_pixels_tab[2] = ff_h264_weight_pixels4_8_mmi;
+
+    c->biweight_h264_pixels_tab[0] = ff_h264_biweight_pixels16_8_mmi;
+    c->biweight_h264_pixels_tab[1] = ff_h264_biweight_pixels8_8_mmi;
+    c->biweight_h264_pixels_tab[2] = ff_h264_biweight_pixels4_8_mmi;
+
+    /* Deblocking that is set for every chroma format */
+    c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_8_mmi;
+    c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmi;
+
+    c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_mmi;
+    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmi;
+    c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_mmi;
+    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmi;
+
+    /* Entries that depend on chroma_format_idc.  When it is > 1 nothing is
+     * assigned here to h264_h_loop_filter_chroma or
+     * h264_h_loop_filter_chroma_intra. */
+    if (chroma_format_idc <= 1) {
+        c->h264_idct_add8 = ff_h264_idct_add8_8_mmi;
+        c->h264_chroma_dc_dequant_idct = ff_h264_chroma_dc_dequant_idct_8_mmi;
+        c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_mmi;
+        c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmi;
+    } else {
+        c->h264_idct_add8 = ff_h264_idct_add8_422_8_mmi;
+        c->h264_chroma_dc_dequant_idct =
+            ff_h264_chroma422_dc_dequant_idct_8_mmi;
+    }
+}
+#endif /* HAVE_MMI */
+
+av_cold void ff_h264dsp_init_mips(H264DSPContext *c, const int bit_depth,
+                                  const int chroma_format_idc)
+{   /* Calls every MIPS initialiser that was enabled at build time. */
+#if HAVE_MSA
+    h264dsp_init_msa(c, bit_depth, chroma_format_idc);
+#endif  /* HAVE_MSA */
+#if HAVE_MMI
+    h264dsp_init_mmi(c, bit_depth, chroma_format_idc);
+#endif /* HAVE_MMI -- runs last, so its assignments win over MSA's */
+}
diff --git a/libavcodec/mips/h264dsp_mips.h b/libavcodec/mips/h264dsp_mips.h
new file mode 100644
index 0000000..2fdfd11
--- /dev/null
+++ b/libavcodec/mips/h264dsp_mips.h
@@ -0,0 +1,577 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *                    Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_H264DSP_MIPS_H
+#define AVCODEC_MIPS_H264DSP_MIPS_H
+
+#include "libavcodec/h264.h"
+#include "constants.h"
+
+void ff_h264_h_lpf_luma_inter_msa(uint8_t *src, int stride,
+                                  int alpha, int beta, int8_t *tc0);
+void ff_h264_v_lpf_luma_inter_msa(uint8_t *src, int stride,
+                                  int alpha, int beta, int8_t *tc0);
+void ff_h264_h_lpf_chroma_inter_msa(uint8_t *src, int stride,
+                                    int alpha, int beta, int8_t *tc0);
+void ff_h264_v_lpf_chroma_inter_msa(uint8_t *src, int stride,
+                                    int alpha, int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_chroma422_msa(uint8_t *src, int32_t stride,
+                                         int32_t alpha, int32_t beta,
+                                         int8_t *tc0);
+void ff_h264_h_loop_filter_chroma422_mbaff_msa(uint8_t *src, int32_t stride,
+                                               int32_t alpha, int32_t beta,
+                                               int8_t *tc0);
+void ff_h264_h_loop_filter_luma_mbaff_msa(uint8_t *src, int32_t stride,
+                                          int32_t alpha, int32_t beta,
+                                          int8_t *tc0);
+
+void ff_h264_idct_add_msa(uint8_t *dst, int16_t *src, int32_t dst_stride);
+void ff_h264_idct4x4_addblk_dc_msa(uint8_t *dst, int16_t *src,
+                                   int32_t dst_stride);
+void ff_h264_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src,
+                                  int32_t de_q_val);
+void ff_h264_idct_add16_msa(uint8_t *dst, const int32_t *blk_offset,
+                            int16_t *block, int32_t stride,
+                            const uint8_t nnzc[15 * 8]);
+void ff_h264_idct_add16_intra_msa(uint8_t *dst, const int32_t *blk_offset,
+                                  int16_t *block, int32_t dst_stride,
+                                  const uint8_t nnzc[15 * 8]);
+void ff_h264_idct_add8_msa(uint8_t **dst, const int32_t *blk_offset,
+                           int16_t *block, int32_t dst_stride,
+                           const uint8_t nnzc[15 * 8]);
+void ff_h264_idct_add8_422_msa(uint8_t **dst, const int32_t *blk_offset,
+                               int16_t *block, int32_t dst_stride,
+                               const uint8_t nnzc[15 * 8]);
+void ff_h264_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride);
+void ff_h264_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
+                                 int32_t dst_stride);
+void ff_h264_idct8_add4_msa(uint8_t *dst, const int *blk_offset,
+                            int16_t *blk, int dst_stride,
+                            const uint8_t nnzc[15 * 8]);
+
+void ff_h264_h_lpf_luma_intra_msa(uint8_t *src, int stride,
+                                  int alpha, int beta);
+void ff_h264_v_lpf_luma_intra_msa(uint8_t *src, int stride,
+                                  int alpha, int beta);
+void ff_h264_h_lpf_chroma_intra_msa(uint8_t *src, int stride,
+                                    int alpha, int beta);
+void ff_h264_v_lpf_chroma_intra_msa(uint8_t *src, int stride,
+                                    int alpha, int beta);
+void ff_h264_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, int stride,
+                                                int alpha, int beta);
+
+void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src,
+                                     int stride, int height, int log2_denom,
+                                     int weightd, int weights, int offset);
+void ff_biweight_h264_pixels8_8_msa(uint8_t *dst, uint8_t *src,
+                                    int stride, int height, int log2_denom,
+                                    int weightd, int weights, int offset);
+void ff_biweight_h264_pixels4_8_msa(uint8_t *dst, uint8_t *src,
+                                    int stride, int height, int log2_denom,
+                                    int weightd, int weights, int offset);
+void ff_weight_h264_pixels16_8_msa(uint8_t *src, int stride, int height,
+                                   int log2_denom, int weight, int offset);
+void ff_weight_h264_pixels8_8_msa(uint8_t *src, int stride, int height,
+                                  int log2_denom, int weight, int offset);
+void ff_weight_h264_pixels4_8_msa(uint8_t *src, int stride, int height,
+                                  int log2_denom, int weight, int offset);
+
+void ff_put_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+
+void ff_put_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+
+void ff_put_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+
+void ff_avg_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+
+void ff_avg_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+
+void ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t dst_stride);
+
+void ff_h264_intra_predict_plane_8x8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_predict_dc_4blk_8x8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_predict_hor_dc_8x8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_predict_vert_dc_8x8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_predict_mad_cow_dc_l0t_8x8_msa(uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_h264_intra_predict_mad_cow_dc_0lt_8x8_msa(uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_h264_intra_predict_mad_cow_dc_l00_8x8_msa(uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_h264_intra_predict_mad_cow_dc_0l0_8x8_msa(uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_h264_intra_predict_plane_16x16_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_pred_vert_8x8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_pred_horiz_8x8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_pred_dc_16x16_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_pred_vert_16x16_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_pred_horiz_16x16_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_pred_dc_left_16x16_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_pred_dc_top_16x16_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_pred_dc_128_8x8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_h264_intra_pred_dc_128_16x16_msa(uint8_t *src, ptrdiff_t stride);
+void ff_vp8_pred8x8_127_dc_8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_vp8_pred8x8_129_dc_8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_vp8_pred16x16_127_dc_8_msa(uint8_t *src, ptrdiff_t stride);
+void ff_vp8_pred16x16_129_dc_8_msa(uint8_t *src, ptrdiff_t stride);
+/* Defined in h264dsp_mmi.c; parameter names mirror that definition. */
+void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride);
+void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct_add16_8_mmi(uint8_t *dst, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8]);
+void ff_h264_idct_add16intra_8_mmi(uint8_t *dst, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8]);
+void ff_h264_idct8_add4_8_mmi(uint8_t *dst, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8]);
+void ff_h264_idct_add8_8_mmi(uint8_t **dest, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8]);
+void ff_h264_idct_add8_422_8_mmi(uint8_t **dest, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8]);
+void ff_h264_luma_dc_dequant_idct_8_mmi(int16_t *output, int16_t *input,
+        int qmul);
+void ff_h264_chroma_dc_dequant_idct_8_mmi(int16_t *block, int qmul);
+void ff_h264_chroma422_dc_dequant_idct_8_mmi(int16_t *block, int qmul);
+
+void ff_h264_weight_pixels16_8_mmi(uint8_t *block, int stride, int height,
+        int log2_denom, int weight, int offset);
+void ff_h264_biweight_pixels16_8_mmi(uint8_t *dst, uint8_t *src,
+        int stride, int height, int log2_denom, int weightd, int weights,
+        int offset);
+void ff_h264_weight_pixels8_8_mmi(uint8_t *block, int stride, int height,
+        int log2_denom, int weight, int offset);
+void ff_h264_biweight_pixels8_8_mmi(uint8_t *dst, uint8_t *src,
+        int stride, int height, int log2_denom, int weightd, int weights,
+        int offset);
+void ff_h264_weight_pixels4_8_mmi(uint8_t *block, int stride, int height,
+        int log2_denom, int weight, int offset);
+void ff_h264_biweight_pixels4_8_mmi(uint8_t *dst, uint8_t *src,
+        int stride, int height, int log2_denom, int weightd, int weights,
+        int offset);
+
+void ff_deblock_v_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
+        int8_t *tc0);
+void ff_deblock_v_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta);
+void ff_deblock_h_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
+        int8_t *tc0);
+void ff_deblock_h_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta);
+void ff_deblock_v_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
+        int8_t *tc0);
+void ff_deblock_v_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta);
+void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
+        int8_t *tc0);
+void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta);
+void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
+        int8_t *tc0);
+void ff_deblock_v8_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta);
+
+void ff_put_h264_qpel16_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+
+void ff_put_h264_qpel8_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel8_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+
+void ff_put_h264_qpel4_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_put_h264_qpel4_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+
+void ff_avg_h264_qpel16_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+
+void ff_avg_h264_qpel8_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+
+void ff_avg_h264_qpel4_mc00_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc10_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc20_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc30_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc01_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc11_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc21_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc31_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc02_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc22_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc03_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc13_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+void ff_avg_h264_qpel4_mc33_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t dst_stride);
+
+#endif  // #ifndef AVCODEC_MIPS_H264DSP_MIPS_H
diff --git a/libavcodec/mips/h264dsp_mmi.c b/libavcodec/mips/h264dsp_mmi.c
new file mode 100644
index 0000000..a62bbab
--- /dev/null
+++ b/libavcodec/mips/h264dsp_mmi.c
@@ -0,0 +1,2824 @@
+/*
+ * Loongson SIMD optimized h264dsp
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *                    Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
+ *                    Heiher <r@hev.cc>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/bit_depth_template.c"
+#include "h264dsp_mips.h"
+#include "libavutil/mips/asmdefs.h"
+
+/* Add the 4x4 int16_t residuals at src to the 4x4 pixels at dst (rows stride
+ * bytes apart), saturating each sum to 8 bits, then zero the 32-byte src
+ * block.  dst rows are fetched with ulw so only their 4 pixels are read. */
+void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
+{
+    double ftmp[9];
+    uint64_t low32;
+
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "ldc1       %[ftmp1],   0x00(%[src])                            \n\t"
+        "ldc1       %[ftmp2],   0x08(%[src])                            \n\t"
+        "ldc1       %[ftmp3],   0x10(%[src])                            \n\t"
+        "ldc1       %[ftmp4],   0x18(%[src])                            \n\t"
+        "ulw        %[low32],   0x00(%[dst0])                           \n\t"
+        "mtc1       %[low32],   %[ftmp5]                                \n\t"
+        "ulw        %[low32],   0x00(%[dst1])                           \n\t"
+        "mtc1       %[low32],   %[ftmp6]                                \n\t"
+        "ulw        %[low32],   0x00(%[dst2])                           \n\t"
+        "mtc1       %[low32],   %[ftmp7]                                \n\t"
+        "ulw        %[low32],   0x00(%[dst3])                           \n\t"
+        "mtc1       %[low32],   %[ftmp8]                                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
+        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "gsswlc1    %[ftmp1],   0x03(%[dst0])                           \n\t"
+        "gsswrc1    %[ftmp1],   0x00(%[dst0])                           \n\t"
+        "gsswlc1    %[ftmp2],   0x03(%[dst1])                           \n\t"
+        "gsswrc1    %[ftmp2],   0x00(%[dst1])                           \n\t"
+        "gsswlc1    %[ftmp3],   0x03(%[dst2])                           \n\t"
+        "gsswrc1    %[ftmp3],   0x00(%[dst2])                           \n\t"
+        "gsswlc1    %[ftmp4],   0x03(%[dst3])                           \n\t"
+        "gsswrc1    %[ftmp4],   0x00(%[dst3])                           \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [low32]"=&r"(low32)
+        : [dst0]"r"(dst),                   [dst1]"r"(dst+stride),
+          [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
+          [src]"r"(src)
+        : "memory");
+    memset(src, 0, 32);
+}
+
+/* H.264 4x4 inverse transform of the int16_t coefficients at block, added to
+ * the 4x4 pixels at dst (rows stride bytes apart) with saturation to 8 bits.
+ * The asm also zeroes the 32-byte block; dst rows are read 4 bytes at a time. */
+void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
+{
+    double ftmp[12];
+    uint64_t tmp[1], low32;
+
+    __asm__ volatile (
+        "dli        %[tmp0],    0x01                                    \n\t"
+        "ldc1       %[ftmp0],   0x00(%[block])                          \n\t"
+        "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
+        "ldc1       %[ftmp1],   0x08(%[block])                          \n\t"
+        "dli        %[tmp0],    0x06                                    \n\t"
+        "ldc1       %[ftmp2],   0x10(%[block])                          \n\t"
+        "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+        "psrah      %[ftmp4],   %[ftmp1],       %[ftmp8]                \n\t"
+        "ldc1       %[ftmp3],   0x18(%[block])                          \n\t"
+        "psrah      %[ftmp5],   %[ftmp3],       %[ftmp8]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+        "paddh      %[ftmp10],  %[ftmp2],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp11],  %[ftmp5],       %[ftmp10]               \n\t"
+        "psubh      %[ftmp2],   %[ftmp10],      %[ftmp5]                \n\t"
+        "paddh      %[ftmp10],  %[ftmp4],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        /* transpose the 4x4 intermediate, then run the second 1-D pass */
+        "punpckhhw  %[ftmp1],   %[ftmp11],      %[ftmp10]               \n\t"
+        "punpcklhw  %[ftmp5],   %[ftmp11],      %[ftmp10]               \n\t"
+        "punpckhhw  %[ftmp4],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpckhwd  %[ftmp2],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpcklwd  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpcklwd  %[ftmp10],  %[ftmp1],       %[ftmp4]                \n\t"
+        "punpckhwd  %[ftmp0],   %[ftmp1],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_32]             \n\t"
+        "psrah      %[ftmp4],   %[ftmp2],       %[ftmp8]                \n\t"
+        "psrah      %[ftmp3],   %[ftmp0],       %[ftmp8]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp10],      %[ftmp5]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp10]               \n\t"
+        "paddh      %[ftmp10],  %[ftmp3],       %[ftmp1]                \n\t"
+        "psubh      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp11],  %[ftmp4],       %[ftmp5]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
+        /* zero the coefficient block, then add (result >> 6) to the dst rows */
+        "sdc1       %[ftmp7],   0x00(%[block])                          \n\t"
+        "sdc1       %[ftmp7],   0x08(%[block])                          \n\t"
+        "sdc1       %[ftmp7],   0x10(%[block])                          \n\t"
+        "sdc1       %[ftmp7],   0x18(%[block])                          \n\t"
+        "ulw        %[low32],   0x00(%[dst])                            \n\t"
+        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        "psrah      %[ftmp3],   %[ftmp10],      %[ftmp9]                \n\t"
+        "gslwxc1    %[ftmp0],   0x00(%[dst],    %[stride])              \n\t"
+        "psrah      %[ftmp4],   %[ftmp11],      %[ftmp9]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "packushb   %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "gsswlc1    %[ftmp2],   0x03(%[dst])                            \n\t"
+        "gsswrc1    %[ftmp2],   0x00(%[dst])                            \n\t"
+        "gsswxc1    %[ftmp0],   0x00(%[dst],    %[stride])              \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
+        "ulw        %[low32],   0x00(%[dst])                            \n\t"
+        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp9]                \n\t"
+        "gslwxc1    %[ftmp0],   0x00(%[dst],    %[stride])              \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp9]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "gsswlc1    %[ftmp2],   0x03(%[dst])                            \n\t"
+        "gsswrc1    %[ftmp2],   0x00(%[dst])                            \n\t"
+        "packushb   %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "gsswxc1    %[ftmp0],   0x00(%[dst],    %[stride])              \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp7]"=&f"(ftmp[7]),            [ftmp8]"=&f"(ftmp[8]),
+          [ftmp9]"=&f"(ftmp[9]),            [ftmp10]"=&f"(ftmp[10]),
+          [ftmp11]"=&f"(ftmp[11]),          [tmp0]"=&r"(tmp[0]),
+          [low32]"=&r"(low32),              [dst]"+&r"(dst) /* advanced in asm */
+        : [block]"r"(block),                [stride]"r"((mips_reg)stride),
+          [ff_pw_32]"f"(ff_pw_32)
+        : "memory");
+}
+
+void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
+{
+    double ftmp[16];
+    uint64_t tmp[8];
+    mips_reg addr[1];
+    uint64_t low32;
+
+    __asm__ volatile (
+        "lhu       %[tmp0],     0x00(%[block])                          \n\t"
+        PTR_ADDI  "$29,         $29,            -0x20                   \n\t"
+        PTR_ADDIU "%[tmp0],     %[tmp0],        0x20                    \n\t"
+        "ldc1      %[ftmp1],    0x10(%[block])                          \n\t"
+        "sh        %[tmp0],     0x00(%[block])                          \n\t"
+        "ldc1      %[ftmp2],    0x20(%[block])                          \n\t"
+        "dli       %[tmp0],     0x01                                    \n\t"
+        "ldc1      %[ftmp3],    0x30(%[block])                          \n\t"
+        "mtc1      %[tmp0],     %[ftmp8]                                \n\t"
+        "ldc1      %[ftmp5],    0x50(%[block])                          \n\t"
+        "ldc1      %[ftmp6],    0x60(%[block])                          \n\t"
+        "ldc1      %[ftmp7],    0x70(%[block])                          \n\t"
+        "mov.d     %[ftmp0],    %[ftmp1]                                \n\t"
+        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp8]                \n\t"
+        "psrah     %[ftmp4],    %[ftmp5],       %[ftmp8]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp0]                \n\t"
+        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp5]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp5]                \n\t"
+        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp7]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp3]                \n\t"
+        "psubh     %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
+        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp3]                \n\t"
+        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp3]                \n\t"
+        "psrah     %[ftmp3],    %[ftmp3],       %[ftmp8]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp7]                \n\t"
+        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
+        "psrah     %[ftmp7],    %[ftmp7],       %[ftmp8]                \n\t"
+        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp3]                \n\t"
+        "dli       %[tmp0],     0x02                                    \n\t"
+        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
+        "mtc1      %[tmp0],     %[ftmp9]                                \n\t"
+        "mov.d     %[ftmp7],    %[ftmp1]                                \n\t"
+        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp9]                \n\t"
+        "psrah     %[ftmp3],    %[ftmp4],       %[ftmp9]                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp0]                \n\t"
+        "psrah     %[ftmp0],    %[ftmp0],       %[ftmp9]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp5]                \n\t"
+        "psrah     %[ftmp5],    %[ftmp5],       %[ftmp9]                \n\t"
+        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
+        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
+        "mov.d     %[ftmp5],    %[ftmp6]                                \n\t"
+        "psrah     %[ftmp6],    %[ftmp6],       %[ftmp8]                \n\t"
+        "psrah     %[ftmp4],    %[ftmp2],       %[ftmp8]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp2]                \n\t"
+        "psubh     %[ftmp4],    %[ftmp4],       %[ftmp5]                \n\t"
+        "ldc1      %[ftmp2],    0x00(%[block])                          \n\t"
+        "ldc1      %[ftmp5],    0x40(%[block])                          \n\t"
+        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp2]                \n\t"
+        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp2]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp5]                \n\t"
+        "psubh     %[ftmp2],    %[ftmp2],       %[ftmp5]                \n\t"
+        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp5]                \n\t"
+        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp2]                \n\t"
+        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp6]                \n\t"
+        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp2]                \n\t"
+        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp6]                \n\t"
+        "psubh     %[ftmp2],    %[ftmp2],       %[ftmp4]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
+        "psubh     %[ftmp6],    %[ftmp6],       %[ftmp7]                \n\t"
+        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp4]                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
+        "psubh     %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
+        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp2]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp5]                \n\t"
+        "psubh     %[ftmp2],    %[ftmp2],       %[ftmp3]                \n\t"
+        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp5]                \n\t"
+        "sdc1      %[ftmp6],    0x00(%[block])                          \n\t"
+        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp1]                \n\t"
+        "punpckhhw %[ftmp6],    %[ftmp7],       %[ftmp0]                \n\t"
+        "punpcklhw %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
+        "punpckhhw %[ftmp0],    %[ftmp3],       %[ftmp1]                \n\t"
+        "punpcklhw %[ftmp3],    %[ftmp3],       %[ftmp1]                \n\t"
+        "punpckhwd %[ftmp1],    %[ftmp7],       %[ftmp3]                \n\t"
+        "punpcklwd %[ftmp7],    %[ftmp7],       %[ftmp3]                \n\t"
+        "punpckhwd %[ftmp3],    %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklwd %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
+        "ldc1      %[ftmp0],    0x00(%[block])                          \n\t"
+        "sdc1      %[ftmp7],    0x00($29)                               \n\t"
+        "sdc1      %[ftmp1],    0x10($29)                               \n\t"
+        "dmfc1     %[tmp1],     %[ftmp6]                                \n\t"
+        "dmfc1     %[tmp3],     %[ftmp3]                                \n\t"
+        "punpckhhw %[ftmp3],    %[ftmp5],       %[ftmp2]                \n\t"
+        "punpcklhw %[ftmp5],    %[ftmp5],       %[ftmp2]                \n\t"
+        "punpckhhw %[ftmp2],    %[ftmp4],       %[ftmp0]                \n\t"
+        "punpcklhw %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
+        "punpckhwd %[ftmp0],    %[ftmp5],       %[ftmp4]                \n\t"
+        "punpcklwd %[ftmp5],    %[ftmp5],       %[ftmp4]                \n\t"
+        "punpckhwd %[ftmp4],    %[ftmp3],       %[ftmp2]                \n\t"
+        "punpcklwd %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
+        "sdc1      %[ftmp5],    0x08($29)                               \n\t"
+        "sdc1      %[ftmp0],    0x18($29)                               \n\t"
+        "dmfc1     %[tmp2],     %[ftmp3]                                \n\t"
+        "dmfc1     %[tmp4],     %[ftmp4]                                \n\t"
+        "ldc1      %[ftmp1],    0x18(%[block])                          \n\t"
+        "ldc1      %[ftmp6],    0x28(%[block])                          \n\t"
+        "ldc1      %[ftmp2],    0x38(%[block])                          \n\t"
+        "ldc1      %[ftmp0],    0x58(%[block])                          \n\t"
+        "ldc1      %[ftmp3],    0x68(%[block])                          \n\t"
+        "ldc1      %[ftmp4],    0x78(%[block])                          \n\t"
+        "mov.d     %[ftmp7],    %[ftmp1]                                \n\t"
+        "psrah     %[ftmp5],    %[ftmp0],       %[ftmp8]                \n\t"
+        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp8]                \n\t"
+        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp0]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp7]                \n\t"
+        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp4]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp0]                \n\t"
+        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp2]                \n\t"
+        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp2]                \n\t"
+        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
+        "psrah     %[ftmp2],    %[ftmp2],       %[ftmp8]                \n\t"
+        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp4]                \n\t"
+        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
+        "psrah     %[ftmp4],    %[ftmp4],       %[ftmp8]                \n\t"
+        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp2]                \n\t"
+        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
+        "mov.d     %[ftmp4],    %[ftmp1]                                \n\t"
+        "psrah     %[ftmp2],    %[ftmp5],       %[ftmp9]                \n\t"
+        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp9]                \n\t"
+        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp7]                \n\t"
+        "psrah     %[ftmp7],    %[ftmp7],       %[ftmp9]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp0]                \n\t"
+        "psrah     %[ftmp0],    %[ftmp0],       %[ftmp9]                \n\t"
+        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
+        "psubh     %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
+        "mov.d     %[ftmp0],    %[ftmp3]                                \n\t"
+        "psrah     %[ftmp3],    %[ftmp3],       %[ftmp8]                \n\t"
+        "psrah     %[ftmp5],    %[ftmp6],       %[ftmp8]                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp6]                \n\t"
+        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp0]                \n\t"
+        "ldc1      %[ftmp6],    0x08(%[block])                          \n\t"
+        "ldc1      %[ftmp0],    0x48(%[block])                          \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp6]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp0]                \n\t"
+        "psubh     %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp0]                \n\t"
+        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp6]                \n\t"
+        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp3]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp3]                \n\t"
+        "psubh     %[ftmp6],    %[ftmp6],       %[ftmp5]                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
+        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
+        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp4]                \n\t"
+        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp5]                \n\t"
+        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp6]                \n\t"
+        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp0]                \n\t"
+        "psubh     %[ftmp6],    %[ftmp6],       %[ftmp2]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp0]                \n\t"
+        "sdc1      %[ftmp3],    0x08(%[block])                          \n\t"
+        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp1]                \n\t"
+        "punpckhhw %[ftmp3],    %[ftmp4],       %[ftmp7]                \n\t"
+        "punpcklhw %[ftmp4],    %[ftmp4],       %[ftmp7]                \n\t"
+        "punpckhhw %[ftmp7],    %[ftmp2],       %[ftmp1]                \n\t"
+        "punpcklhw %[ftmp2],    %[ftmp2],       %[ftmp1]                \n\t"
+        "punpckhwd %[ftmp1],    %[ftmp4],       %[ftmp2]                \n\t"
+        "punpcklwd %[ftmp4],    %[ftmp4],       %[ftmp2]                \n\t"
+        "punpckhwd %[ftmp2],    %[ftmp3],       %[ftmp7]                \n\t"
+        "punpcklwd %[ftmp3],    %[ftmp3],       %[ftmp7]                \n\t"
+        "ldc1      %[ftmp7],    0x08(%[block])                          \n\t"
+        "dmfc1     %[tmp5],     %[ftmp4]                                \n\t"
+        "dmfc1     %[tmp7],     %[ftmp1]                                \n\t"
+        "mov.d     %[ftmp12],   %[ftmp3]                                \n\t"
+        "mov.d     %[ftmp14],   %[ftmp2]                                \n\t"
+        "punpckhhw %[ftmp2],    %[ftmp0],       %[ftmp6]                \n\t"
+        "punpcklhw %[ftmp0],    %[ftmp0],       %[ftmp6]                \n\t"
+        "punpckhhw %[ftmp6],    %[ftmp5],       %[ftmp7]                \n\t"
+        "punpcklhw %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
+        "punpckhwd %[ftmp7],    %[ftmp0],       %[ftmp5]                \n\t"
+        "punpcklwd %[ftmp0],    %[ftmp0],       %[ftmp5]                \n\t"
+        "punpckhwd %[ftmp5],    %[ftmp2],       %[ftmp6]                \n\t"
+        "punpcklwd %[ftmp2],    %[ftmp2],       %[ftmp6]                \n\t"
+        "dmfc1     %[tmp6],     %[ftmp0]                                \n\t"
+        "mov.d     %[ftmp11],   %[ftmp7]                                \n\t"
+        "mov.d     %[ftmp13],   %[ftmp2]                                \n\t"
+        "mov.d     %[ftmp15],   %[ftmp5]                                \n\t"
+        PTR_ADDIU "%[addr0],    %[dst],         0x04                    \n\t"
+        "dmtc1     %[tmp7],     %[ftmp7]                                \n\t"
+        "dmtc1     %[tmp3],     %[ftmp6]                                \n\t"
+        "ldc1      %[ftmp1],    0x10($29)                               \n\t"
+        "dmtc1     %[tmp1],     %[ftmp3]                                \n\t"
+        "mov.d     %[ftmp4],    %[ftmp1]                                \n\t"
+        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp8]                \n\t"
+        "psrah     %[ftmp0],    %[ftmp7],       %[ftmp8]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp4]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp7]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp7]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp14]               \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp6]                \n\t"
+        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
+        "psubh     %[ftmp4],    %[ftmp4],       %[ftmp6]                \n\t"
+        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp6]                \n\t"
+        "psrah     %[ftmp6],    %[ftmp6],       %[ftmp8]                \n\t"
+        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp14]               \n\t"
+        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp14]               \n\t"
+        "psrah     %[ftmp5],    %[ftmp14],      %[ftmp8]                \n\t"
+        "psubh     %[ftmp4],    %[ftmp4],       %[ftmp6]                \n\t"
+        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
+        "mov.d     %[ftmp5],    %[ftmp1]                                \n\t"
+        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp9]                \n\t"
+        "psrah     %[ftmp6],    %[ftmp0],       %[ftmp9]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp7]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp4]                \n\t"
+        "psrah     %[ftmp4],    %[ftmp4],       %[ftmp9]                \n\t"
+        "psrah     %[ftmp7],    %[ftmp7],       %[ftmp9]                \n\t"
+        "psubh     %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
+        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
+        "mov.d     %[ftmp7],    %[ftmp12]                               \n\t"
+        "psrah     %[ftmp2],    %[ftmp12],      %[ftmp8]                \n\t"
+        "psrah     %[ftmp0],    %[ftmp3],       %[ftmp8]                \n\t"
+        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp3]                \n\t"
+        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp7]                \n\t"
+        "ldc1      %[ftmp3],    0x00($29)                               \n\t"
+        "dmtc1     %[tmp5],     %[ftmp7]                                \n\t"
+        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp3]                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
+        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp7]                \n\t"
+        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp7]                \n\t"
+        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp7]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp3]                \n\t"
+        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp2]                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
+        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp2]                \n\t"
+        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp0]                \n\t"
+        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp2]                \n\t"
+        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
+        "psubh     %[ftmp2],    %[ftmp2],       %[ftmp5]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp0]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp3]                \n\t"
+        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp7]                \n\t"
+        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp6]                \n\t"
+        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp7]                \n\t"
+        "sdc1      %[ftmp3],    0x00($29)                               \n\t"
+        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp1]                \n\t"
+        "sdc1      %[ftmp0],    0x10($29)                               \n\t"
+        "dmfc1     %[tmp1],     %[ftmp2]                                \n\t"
+        "xor       %[ftmp2],    %[ftmp2],       %[ftmp2]                \n\t"
+        "sdc1      %[ftmp2],    0x00(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x08(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x10(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x18(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x20(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x28(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x30(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x38(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x40(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x48(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x50(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x58(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x60(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x68(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x70(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x78(%[block])                          \n\t"
+        "dli       %[tmp3],     0x06                                    \n\t"
+        "uld       %[low32],    0x00(%[dst])                            \n\t"
+        "mtc1      %[low32],    %[ftmp3]                                \n\t"
+        "mtc1      %[tmp3],     %[ftmp10]                               \n\t"
+        "gslwxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
+        "psrah     %[ftmp5],    %[ftmp5],       %[ftmp10]               \n\t"
+        "psrah     %[ftmp4],    %[ftmp4],       %[ftmp10]               \n\t"
+        "punpcklbh %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
+        "punpcklbh %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp5]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
+        "packushb  %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
+        "packushb  %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
+        "gsswlc1   %[ftmp3],    0x03(%[dst])                            \n\t"
+        "gsswrc1   %[ftmp3],    0x00(%[dst])                            \n\t"
+        "gsswxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
+        PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
+        PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
+        "uld       %[low32],    0x00(%[dst])                            \n\t"
+        "mtc1      %[low32],    %[ftmp3]                                \n\t"
+        "gslwxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
+        "psrah     %[ftmp6],    %[ftmp6],       %[ftmp10]               \n\t"
+        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp10]               \n\t"
+        "punpcklbh %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
+        "punpcklbh %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp6]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp1]                \n\t"
+        "packushb  %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
+        "packushb  %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
+        "gsswlc1   %[ftmp3],    0x03(%[dst])                            \n\t"
+        "gsswrc1   %[ftmp3],    0x00(%[dst])                            \n\t"
+        "gsswxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
+        "ldc1      %[ftmp5],    0x00($29)                               \n\t"
+        "ldc1      %[ftmp4],    0x10($29)                               \n\t"
+        "dmtc1     %[tmp1],     %[ftmp6]                                \n\t"
+        PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
+        PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
+        "uld       %[low32],    0x00(%[dst])                            \n\t"
+        "mtc1      %[low32],    %[ftmp3]                                \n\t"
+        "gslwxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
+        "psrah     %[ftmp7],    %[ftmp7],       %[ftmp10]               \n\t"
+        "psrah     %[ftmp5],    %[ftmp5],       %[ftmp10]               \n\t"
+        "punpcklbh %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
+        "punpcklbh %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp7]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp5]                \n\t"
+        "packushb  %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
+        "packushb  %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
+        "gsswlc1   %[ftmp3],    0x03(%[dst])                            \n\t"
+        "gsswrc1   %[ftmp3],    0x00(%[dst])                            \n\t"
+        "gsswxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
+        PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
+        PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
+        "uld       %[low32],    0x00(%[dst])                            \n\t"
+        "mtc1      %[low32],    %[ftmp3]                                \n\t"
+        "gslwxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
+        "psrah     %[ftmp4],    %[ftmp4],       %[ftmp10]               \n\t"
+        "psrah     %[ftmp6],    %[ftmp6],       %[ftmp10]               \n\t"
+        "punpcklbh %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
+        "punpcklbh %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp4]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp6]                \n\t"
+        "packushb  %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
+        "packushb  %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
+        "gsswlc1   %[ftmp3],    0x03(%[dst])                            \n\t"
+        "gsswrc1   %[ftmp3],    0x00(%[dst])                            \n\t"
+        "gsswxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
+        "dmtc1     %[tmp4],     %[ftmp1]                                \n\t"
+        "dmtc1     %[tmp2],     %[ftmp6]                                \n\t"
+        "ldc1      %[ftmp4],    0x18($29)                               \n\t"
+        "mov.d     %[ftmp5],    %[ftmp4]                                \n\t"
+        "psrah     %[ftmp4],    %[ftmp4],       %[ftmp8]                \n\t"
+        "psrah     %[ftmp7],    %[ftmp11],      %[ftmp8]                \n\t"
+        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp11]               \n\t"
+        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp5]                \n\t"
+        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp15]               \n\t"
+        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp11]               \n\t"
+        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
+        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp1]                \n\t"
+        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp1]                \n\t"
+        "psubh     %[ftmp3],    %[ftmp11],      %[ftmp1]                \n\t"
+        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp8]                \n\t"
+        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp15]               \n\t"
+        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp15]               \n\t"
+        "psrah     %[ftmp2],    %[ftmp15],      %[ftmp8]                \n\t"
+        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp1]                \n\t"
+        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
+        "mov.d     %[ftmp2],    %[ftmp4]                                \n\t"
+        "psrah     %[ftmp4],    %[ftmp4],       %[ftmp9]                \n\t"
+        "psrah     %[ftmp1],    %[ftmp7],       %[ftmp9]                \n\t"
+        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp3]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp5]                \n\t"
+        "psrah     %[ftmp5],    %[ftmp5],       %[ftmp9]                \n\t"
+        "psrah     %[ftmp3],    %[ftmp3],       %[ftmp9]                \n\t"
+        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
+        "psubh     %[ftmp2],    %[ftmp2],       %[ftmp3]                \n\t"
+        "mov.d     %[ftmp3],    %[ftmp13]                               \n\t"
+        "psrah     %[ftmp0],    %[ftmp13],      %[ftmp8]                \n\t"
+        "psrah     %[ftmp7],    %[ftmp6],       %[ftmp8]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp6]                \n\t"
+        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp3]                \n\t"
+        "ldc1      %[ftmp6],    0x08($29)                               \n\t"
+        "dmtc1     %[tmp6],     %[ftmp3]                                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp6]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp3]                \n\t"
+        "psubh     %[ftmp6],    %[ftmp6],       %[ftmp3]                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
+        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp6]                \n\t"
+        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp0]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp0]                \n\t"
+        "psubh     %[ftmp6],    %[ftmp6],       %[ftmp7]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp0]                \n\t"
+        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
+        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
+        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp7]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp6]                \n\t"
+        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp3]                \n\t"
+        "psubh     %[ftmp6],    %[ftmp6],       %[ftmp1]                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
+        "sdc1      %[ftmp6],    0x08($29)                               \n\t"
+        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp4]                \n\t"
+        "sdc1      %[ftmp7],    0x18($29)                               \n\t"
+        "dmfc1     %[tmp2],     %[ftmp0]                                \n\t"
+        "xor       %[ftmp0],    %[ftmp0],       %[ftmp0]                \n\t"
+        "uld       %[low32],    0x00(%[addr0])                          \n\t"
+        "mtc1      %[low32],    %[ftmp6]                                \n\t"
+        "gslwxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
+        "psrah     %[ftmp2],    %[ftmp2],       %[ftmp10]               \n\t"
+        "psrah     %[ftmp5],    %[ftmp5],       %[ftmp10]               \n\t"
+        "punpcklbh %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklbh %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp2]                \n\t"
+        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
+        "packushb  %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb  %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
+        "gsswlc1   %[ftmp6],    0x03(%[addr0])                          \n\t"
+        "gsswrc1   %[ftmp6],    0x00(%[addr0])                          \n\t"
+        "gsswxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
+        PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
+        PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
+        "uld       %[low32],    0x00(%[addr0])                          \n\t"
+        "mtc1      %[low32],    %[ftmp6]                                \n\t"
+        "gslwxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
+        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp10]               \n\t"
+        "psrah     %[ftmp4],    %[ftmp4],       %[ftmp10]               \n\t"
+        "punpcklbh %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklbh %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp1]                \n\t"
+        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp4]                \n\t"
+        "packushb  %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb  %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
+        "gsswlc1   %[ftmp6],    0x03(%[addr0])                          \n\t"
+        "gsswrc1   %[ftmp6],    0x00(%[addr0])                          \n\t"
+        "gsswxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
+        "ldc1      %[ftmp2],    0x08($29)                               \n\t"
+        "ldc1      %[ftmp5],    0x18($29)                               \n\t"
+        PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
+        "dmtc1     %[tmp2],     %[ftmp1]                                \n\t"
+        PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
+        "uld       %[low32],    0x00(%[addr0])                          \n\t"
+        "mtc1      %[low32],    %[ftmp6]                                \n\t"
+        "gslwxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
+        "psrah     %[ftmp3],    %[ftmp3],       %[ftmp10]               \n\t"
+        "psrah     %[ftmp2],    %[ftmp2],       %[ftmp10]               \n\t"
+        "punpcklbh %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklbh %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp3]                \n\t"
+        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp2]                \n\t"
+        "packushb  %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb  %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
+        "gsswlc1   %[ftmp6],    0x03(%[addr0])                          \n\t"
+        "gsswrc1   %[ftmp6],    0x00(%[addr0])                          \n\t"
+        "gsswxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
+        PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
+        PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
+        "uld       %[low32],    0x00(%[addr0])                          \n\t"
+        "mtc1      %[low32],    %[ftmp6]                                \n\t"
+        "gslwxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
+        "psrah     %[ftmp5],    %[ftmp5],       %[ftmp10]               \n\t"
+        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp10]               \n\t"
+        "punpcklbh %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklbh %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp5]                \n\t"
+        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp1]                \n\t"
+        "packushb  %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb  %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
+        "gsswlc1   %[ftmp6],    0x03(%[addr0])                          \n\t"
+        "gsswrc1   %[ftmp6],    0x00(%[addr0])                          \n\t"
+        "gsswxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
+        PTR_ADDIU "$29,         $29,            0x20                    \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
+          [ftmp12]"=&f"(ftmp[12]),          [ftmp13]"=&f"(ftmp[13]),
+          [ftmp14]"=&f"(ftmp[14]),          [ftmp15]"=&f"(ftmp[15]),
+          [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          [tmp2]"=&r"(tmp[2]),              [tmp3]"=&r"(tmp[3]),
+          [tmp4]"=&r"(tmp[4]),              [tmp5]"=&r"(tmp[5]),
+          [tmp6]"=&r"(tmp[6]),              [tmp7]"=&r"(tmp[7]),
+          [addr0]"=&r"(addr[0]),
+          [low32]"=&r"(low32)
+        : [dst]"r"(dst),                    [block]"r"(block),
+          [stride]"r"((mips_reg)stride)
+        : "$29","memory"
+    );
+
+    memset(block, 0, 128);
+}
+
+void ff_h264_idct_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
+{ /* DC-only case: adds one rounded value derived from block[0] to 4 bytes in each of 4 rows of dst. */
+    int dc = (block[0] + 32) >> 6; /* rounded DC term: add 32, then >> 6 */
+    double ftmp[6]; /* bound to "f" (floating-point) register operands used as asm scratch */
+    uint64_t low32; /* general-purpose scratch for the uld loads below */
+
+    block[0] = 0; /* the coefficient is now held in dc; clear it in the block */
+
+    __asm__ volatile (
+        "mtc1       %[dc],      %[ftmp5]                                \n\t" /* ftmp5 <- dc */
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t" /* ftmp0 = 0 (zero for unpack/pack, selector for pshufh) */
+        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t" /* presumably replicates halfword 0 (dc) into every halfword lane - confirm against the MMI manual */
+        "uld        %[low32],   0x00(%[dst0])                           \n\t" /* NOTE(review): uld is a doubleword load but only the low 32 bits are consumed by mtc1 - confirm the extra bytes are always readable */
+        "mtc1       %[low32],   %[ftmp1]                                \n\t" /* ftmp1 <- row 0 pixels */
+        "uld        %[low32],   0x00(%[dst1])                           \n\t"
+        "mtc1       %[low32],   %[ftmp2]                                \n\t" /* ftmp2 <- row 1 pixels */
+        "uld        %[low32],   0x00(%[dst2])                           \n\t"
+        "mtc1       %[low32],   %[ftmp3]                                \n\t" /* ftmp3 <- row 2 pixels */
+        "uld        %[low32],   0x00(%[dst3])                           \n\t"
+        "mtc1       %[low32],   %[ftmp4]                                \n\t" /* ftmp4 <- row 3 pixels */
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t" /* interleave low bytes with zero: widen pixels to halfwords */
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t" /* add dc to every halfword (signed saturating add) */
+        "paddsh     %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t" /* narrow back to bytes with unsigned saturation */
+        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "gsswlc1    %[ftmp1],   0x03(%[dst0])                           \n\t" /* left/right pair: unaligned 32-bit store of row 0 */
+        "gsswrc1    %[ftmp1],   0x00(%[dst0])                           \n\t"
+        "gsswlc1    %[ftmp2],   0x03(%[dst1])                           \n\t" /* row 1 */
+        "gsswrc1    %[ftmp2],   0x00(%[dst1])                           \n\t"
+        "gsswlc1    %[ftmp3],   0x03(%[dst2])                           \n\t" /* row 2 */
+        "gsswrc1    %[ftmp3],   0x00(%[dst2])                           \n\t"
+        "gsswlc1    %[ftmp4],   0x03(%[dst3])                           \n\t" /* row 3 */
+        "gsswrc1    %[ftmp4],   0x00(%[dst3])                           \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]), /* outputs: early-clobber scratch only, no result is read back in C */
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [low32]"=&r"(low32)
+        : [dst0]"r"(dst),                   [dst1]"r"(dst+stride), /* inputs: one pointer per destination row, plus dc */
+          [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
+          [dc]"r"(dc)
+        : "memory" /* pixels are read and written through the row pointers */
+    );
+}
+
+void ff_h264_idct8_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
+{ /* DC-only case: adds one rounded value derived from block[0] to 8 bytes in each of 8 rows of dst. */
+    int dc = (block[0] + 32) >> 6; /* rounded DC term: add 32, then >> 6 */
+    double ftmp[10]; /* bound to "f" (floating-point) register operands used as asm scratch */
+
+    block[0] = 0; /* the coefficient is now held in dc; clear it in the block */
+
+    __asm__ volatile (
+        "mtc1       %[dc],      %[ftmp5]                                \n\t" /* ftmp5 <- dc */
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t" /* ftmp0 = 0 (zero for unpack, selector for pshufh) */
+        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t" /* presumably replicates halfword 0 (dc) into every halfword lane - confirm against the MMI manual */
+        "ldc1       %[ftmp1],   0x00(%[dst0])                           \n\t" /* rows 0-3: load 8 pixels each. NOTE(review): ldc1/sdc1 presumably need 8-byte-aligned addresses - confirm callers guarantee this */
+        "ldc1       %[ftmp2],   0x00(%[dst1])                           \n\t"
+        "ldc1       %[ftmp3],   0x00(%[dst2])                           \n\t"
+        "ldc1       %[ftmp4],   0x00(%[dst3])                           \n\t"
+        "punpckhbh  %[ftmp6],   %[ftmp1],       %[ftmp0]                \n\t" /* high 4 pixels of row 0 widened to halfwords */
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t" /* low 4 pixels of row 0 widened to halfwords */
+        "punpckhbh  %[ftmp7],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp8],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp9],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t" /* add dc to every halfword (signed saturating add) */
+        "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp9],   %[ftmp9],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t" /* recombine low and high halves into 8 bytes with unsigned saturation */
+        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
+        "sdc1       %[ftmp1],   0x00(%[dst0])                           \n\t" /* write rows 0-3 back */
+        "sdc1       %[ftmp2],   0x00(%[dst1])                           \n\t"
+        "sdc1       %[ftmp3],   0x00(%[dst2])                           \n\t"
+        "sdc1       %[ftmp4],   0x00(%[dst3])                           \n\t"
+
+        "ldc1       %[ftmp1],   0x00(%[dst4])                           \n\t" /* rows 4-7: same load / widen / add / pack / store sequence */
+        "ldc1       %[ftmp2],   0x00(%[dst5])                           \n\t"
+        "ldc1       %[ftmp3],   0x00(%[dst6])                           \n\t"
+        "ldc1       %[ftmp4],   0x00(%[dst7])                           \n\t"
+        "punpckhbh  %[ftmp6],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp7],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp8],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp9],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp9],   %[ftmp9],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
+        "sdc1       %[ftmp1],   0x00(%[dst4])                           \n\t" /* write rows 4-7 back */
+        "sdc1       %[ftmp2],   0x00(%[dst5])                           \n\t"
+        "sdc1       %[ftmp3],   0x00(%[dst6])                           \n\t"
+        "sdc1       %[ftmp4],   0x00(%[dst7])                           \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]), /* outputs: early-clobber scratch only, no result is read back in C */
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9])
+        : [dst0]"r"(dst),                   [dst1]"r"(dst+stride), /* inputs: one pointer per destination row, plus dc */
+          [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
+          [dst4]"r"(dst+4*stride),          [dst5]"r"(dst+5*stride),
+          [dst6]"r"(dst+6*stride),          [dst7]"r"(dst+7*stride),
+          [dc]"r"(dc)
+        : "memory" /* pixels are read and written through the row pointers */
+    );
+}
+
+/* For each of 16 blocks (16 coefficients apiece): skip, DC-only add, or full IDCT add. */
+void ff_h264_idct_add16_8_mmi(uint8_t *dst, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8])
+{
+    int idx;
+    for (idx = 0; idx < 16; idx++) {
+        int16_t *coeffs = block + idx * 16;
+        const int count = nnzc[scan8[idx]];
+        if (!count)
+            continue; /* nnzc entry is zero: leave this block untouched */
+        if (count == 1 && coeffs[0]) /* count of 1 and a non-zero first coefficient */
+            ff_h264_idct_dc_add_8_mmi(dst + block_offset[idx], coeffs, stride);
+        else
+            ff_h264_idct_add_8_mmi(dst + block_offset[idx], coeffs, stride);
+    }
+}
+
+/* Per block: full IDCT add if its nnzc entry is set, else DC-only add if coefficient 0 is non-zero. */
+void ff_h264_idct_add16intra_8_mmi(uint8_t *dst, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8])
+{
+    int idx;
+    for (idx = 0; idx < 16; idx++, block += 16) { /* block walks 16 coefficients at a time */
+        if (nnzc[scan8[idx]])
+            ff_h264_idct_add_8_mmi(dst + block_offset[idx], block, stride);
+        else if (block[0])
+            ff_h264_idct_dc_add_8_mmi(dst + block_offset[idx], block, stride);
+    }
+}
+
+/* Visits block indices 0, 4, 8 and 12: skip, idct8 DC-only add, or full idct8 add. */
+void ff_h264_idct8_add4_8_mmi(uint8_t *dst, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8])
+{
+    int idx;
+    for (idx = 0; idx < 16; idx += 4) {
+        int16_t *coeffs = block + idx * 16;
+        const int count = nnzc[scan8[idx]];
+        if (!count)
+            continue; /* nnzc entry is zero: leave this block untouched */
+        if (count == 1 && coeffs[0]) /* count of 1 and a non-zero first coefficient */
+            ff_h264_idct8_dc_add_8_mmi(dst + block_offset[idx], coeffs, stride);
+        else
+            ff_h264_idct8_add_8_mmi(dst + block_offset[idx], coeffs, stride);
+    }
+}
+
+/* Blocks 16..19 are added into dest[0] and blocks 32..35 into dest[1]. */
+void ff_h264_idct_add8_8_mmi(uint8_t **dest, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8])
+{
+    int plane, k;
+    for (plane = 0; plane < 2; plane++) {
+        for (k = 16 * (plane + 1); k < 16 * (plane + 1) + 4; k++) {
+            int16_t *coeffs = block + k * 16;
+            if (nnzc[scan8[k]])
+                ff_h264_idct_add_8_mmi(dest[plane] + block_offset[k], coeffs, stride);
+            else if (coeffs[0]) /* only the first coefficient may be set: DC-only add */
+                ff_h264_idct_dc_add_8_mmi(dest[plane] + block_offset[k], coeffs, stride);
+        }
+    }
+}
+
+/* Two passes over dest[0] and dest[1], four blocks per destination in each pass. */
+void ff_h264_idct_add8_422_8_mmi(uint8_t **dest, const int *block_offset,
+        int16_t *block, int stride, const uint8_t nnzc[15*8])
+{
+    int plane, k;
+
+    /* Pass 1: blocks 16..19 go to dest[0], blocks 32..35 to dest[1]. */
+    for (plane = 0; plane < 2; plane++) {
+        for (k = 16 * (plane + 1); k < 16 * (plane + 1) + 4; k++) {
+            int16_t *coeffs = block + k * 16;
+            if (nnzc[scan8[k]])
+                ff_h264_idct_add_8_mmi(dest[plane] + block_offset[k], coeffs, stride);
+            else if (coeffs[0])
+                ff_h264_idct_dc_add_8_mmi(dest[plane] + block_offset[k], coeffs, stride);
+        }
+    }
+    /* Pass 2: blocks 20..23 and 36..39; nnzc and block_offset are looked up at k + 4. */
+    for (plane = 0; plane < 2; plane++) {
+        for (k = 16 * (plane + 1) + 4; k < 16 * (plane + 1) + 8; k++) {
+            int16_t *coeffs = block + k * 16;
+            if (nnzc[scan8[k + 4]])
+                ff_h264_idct_add_8_mmi(dest[plane] + block_offset[k + 4], coeffs, stride);
+            else if (coeffs[0])
+                ff_h264_idct_dc_add_8_mmi(dest[plane] + block_offset[k + 4], coeffs, stride);
+        }
+    }
+}
+
+void ff_h264_luma_dc_dequant_idct_8_mmi(int16_t *output, int16_t *input,
+        int qmul)
+{
+    double ftmp[10];
+    uint64_t tmp[2];
+
+    __asm__ volatile (
+        ".set       noreorder                                           \n\t"
+        "dli        %[tmp0],    0x08                                    \n\t"
+        "ldc1       %[ftmp3],   0x18(%[input])                          \n\t"
+        "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
+        "ldc1       %[ftmp2],   0x10(%[input])                          \n\t"
+        "dli        %[tmp0],    0x20                                    \n\t"
+        "ldc1       %[ftmp1],   0x08(%[input])                          \n\t"
+        "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+        "ldc1       %[ftmp0],   0x00(%[input])                          \n\t"
+        "mov.d      %[ftmp4],   %[ftmp3]                                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "mov.d      %[ftmp4],   %[ftmp1]                                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "mov.d      %[ftmp4],   %[ftmp3]                                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "psubh      %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
+        "mov.d      %[ftmp4],   %[ftmp2]                                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "mov.d      %[ftmp4],   %[ftmp3]                                \n\t"
+        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "punpckhhw  %[ftmp4],   %[ftmp4],       %[ftmp1]                \n\t"
+        "punpckhhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpckhwd  %[ftmp2],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "mov.d      %[ftmp0],   %[ftmp4]                                \n\t"
+        "punpcklwd  %[ftmp4],   %[ftmp4],       %[ftmp1]                \n\t"
+        "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "mov.d      %[ftmp1],   %[ftmp0]                                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp1]                \n\t"
+        "mov.d      %[ftmp1],   %[ftmp2]                                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "mov.d      %[ftmp1],   %[ftmp0]                                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp1]                \n\t"
+        "mov.d      %[ftmp1],   %[ftmp4]                                \n\t"
+        "daddi      %[tmp0],    %[qmul],        -0x7fff                 \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
+        "bgtz       %[tmp0],    1f                                      \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "ori        %[tmp0],    $0,             0x80                    \n\t"
+        "dsll       %[tmp0],    %[tmp0],        0x10                    \n\t"
+        "punpckhhw  %[ftmp1],   %[ftmp0],       %[ff_pw_1]              \n\t"
+        "daddu      %[qmul],    %[qmul],        %[tmp0]                 \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ff_pw_1]              \n\t"
+        "punpckhhw  %[ftmp5],   %[ftmp2],       %[ff_pw_1]              \n\t"
+        "punpcklhw  %[ftmp2],   %[ftmp2],       %[ff_pw_1]              \n\t"
+        "mtc1       %[qmul],    %[ftmp7]                                \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psraw      %[ftmp0],   %[ftmp0],       %[ftmp8]                \n\t"
+        "psraw      %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "psraw      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
+        "psraw      %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
+        "packsswh   %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "packsswh   %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "dmfc1      %[tmp1],    %[ftmp0]                                \n\t"
+        "dsrl       %[ftmp0],   %[ftmp0],       %[ftmp9]                \n\t"
+        "mfc1       %[input],   %[ftmp0]                                \n\t"
+        "sh         %[tmp1],    0x00(%[output])                         \n\t"
+        "sh         %[input],   0x80(%[output])                         \n\t"
+        "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
+        PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
+        "sh         %[tmp1],    0x20(%[output])                         \n\t"
+        "sh         %[input],   0xa0(%[output])                         \n\t"
+        "dmfc1      %[tmp1],    %[ftmp2]                                \n\t"
+        "dsrl       %[ftmp2],   %[ftmp2],       %[ftmp9]                \n\t"
+        "mfc1       %[input],   %[ftmp2]                                \n\t"
+        "sh         %[tmp1],    0x40(%[output])                         \n\t"
+        "sh         %[input],   0xc0(%[output])                         \n\t"
+        "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
+        PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
+        "sh         %[tmp1],    0x60(%[output])                         \n\t"
+        "sh         %[input],   0xe0(%[output])                         \n\t"
+        "punpckhhw  %[ftmp1],   %[ftmp3],       %[ff_pw_1]              \n\t"
+        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ff_pw_1]              \n\t"
+        "punpckhhw  %[ftmp5],   %[ftmp4],       %[ff_pw_1]              \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ff_pw_1]              \n\t"
+        "mtc1       %[qmul],    %[ftmp7]                                \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psraw      %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
+        "psraw      %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
+        "psraw      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
+        "psraw      %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
+        "packsswh   %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "packsswh   %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "dmfc1      %[tmp1],    %[ftmp3]                                \n\t"
+        "dsrl       %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
+        "mfc1       %[input],   %[ftmp3]                                \n\t"
+        "sh         %[tmp1],    0x100(%[output])                        \n\t"
+        "sh         %[input],   0x180(%[output])                        \n\t"
+        "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
+        PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
+        "sh         %[tmp1],    0x120(%[output])                        \n\t"
+        "sh         %[input],   0x1a0(%[output])                        \n\t"
+        "dmfc1      %[tmp1],    %[ftmp4]                                \n\t"
+        "dsrl       %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
+        "mfc1       %[input],   %[ftmp4]                                \n\t"
+        "sh         %[tmp1],    0x140(%[output])                        \n\t"
+        "sh         %[input],   0x1c0(%[output])                        \n\t"
+        "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
+        PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
+        "sh         %[tmp1],    0x160(%[output])                        \n\t"
+        "j          2f                                                  \n\t"
+        "sh         %[input],   0x1e0(%[output])                        \n\t"
+        "1:                                                             \n\t"
+        "ori        %[tmp0],    $0,             0x1f                    \n\t"
+        "clz        %[tmp1],    %[qmul]                                 \n\t"
+        "ori        %[input],   $0,             0x07                    \n\t"
+        "dsubu      %[tmp1],    %[tmp0],        %[tmp1]                 \n\t"
+        "ori        %[tmp0],    $0,             0x80                    \n\t"
+        "dsll       %[tmp0],    %[tmp0],        0x10                    \n\t"
+        "daddu      %[qmul],    %[qmul],        %[tmp0]                 \n\t"
+        "dsubu      %[tmp0],    %[tmp1],        %[input]                \n\t"
+        "movn       %[tmp1],    %[input],       %[tmp0]                 \n\t"
+        PTR_ADDIU  "%[input],   %[input],       0x01                    \n\t"
+        "andi       %[tmp0],    %[tmp1],        0xff                    \n\t"
+        "srlv       %[qmul],    %[qmul],        %[tmp0]                 \n\t"
+        PTR_SUBU   "%[input],   %[input],       %[tmp1]                 \n\t"
+        "mtc1       %[input],   %[ftmp6]                                \n\t"
+        "punpckhhw  %[ftmp1],   %[ftmp0],       %[ff_pw_1]              \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ff_pw_1]              \n\t"
+        "punpckhhw  %[ftmp5],   %[ftmp2],       %[ff_pw_1]              \n\t"
+        "punpcklhw  %[ftmp2],   %[ftmp2],       %[ff_pw_1]              \n\t"
+        "mtc1       %[qmul],    %[ftmp7]                                \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psraw      %[ftmp0],   %[ftmp0],       %[ftmp6]                \n\t"
+        "psraw      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "psraw      %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
+        "psraw      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "packsswh   %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "packsswh   %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "dmfc1      %[tmp1],    %[ftmp0]                                \n\t"
+        "dsrl       %[ftmp0],   %[ftmp0],       %[ftmp9]                \n\t"
+        "sh         %[tmp1],    0x00(%[output])                         \n\t"
+        "mfc1       %[input],   %[ftmp0]                                \n\t"
+        "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
+        "sh         %[input],   0x80(%[output])                         \n\t"
+        "sh         %[tmp1],    0x20(%[output])                         \n\t"
+        PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
+        "dmfc1      %[tmp1],    %[ftmp2]                                \n\t"
+        "sh         %[input],   0xa0(%[output])                         \n\t"
+        "dsrl       %[ftmp2],   %[ftmp2],       %[ftmp9]                \n\t"
+        "sh         %[tmp1],    0x40(%[output])                         \n\t"
+        "mfc1       %[input],   %[ftmp2]                                \n\t"
+        "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
+        "sh         %[input],   0xc0(%[output])                         \n\t"
+        "sh         %[tmp1],    0x60(%[output])                         \n\t"
+        PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
+        "sh         %[input],   0xe0(%[output])                         \n\t"
+        "punpckhhw  %[ftmp1],   %[ftmp3],       %[ff_pw_1]              \n\t"
+        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ff_pw_1]              \n\t"
+        "punpckhhw  %[ftmp5],   %[ftmp4],       %[ff_pw_1]              \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ff_pw_1]              \n\t"
+        "mtc1       %[qmul],    %[ftmp7]                                \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psraw      %[ftmp3],   %[ftmp3],       %[ftmp6]                \n\t"
+        "psraw      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "psraw      %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
+        "psraw      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "packsswh   %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "packsswh   %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "dmfc1      %[tmp1],    %[ftmp3]                                \n\t"
+        "dsrl       %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
+        "mfc1       %[input],   %[ftmp3]                                \n\t"
+        "sh         %[tmp1],    0x100(%[output])                        \n\t"
+        "sh         %[input],   0x180(%[output])                        \n\t"
+        "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
+        PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
+        "sh         %[tmp1],    0x120(%[output])                        \n\t"
+        "sh         %[input],   0x1a0(%[output])                        \n\t"
+        "dmfc1      %[tmp1],    %[ftmp4]                                \n\t"
+        "dsrl       %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
+        "mfc1       %[input],   %[ftmp4]                                \n\t"
+        "sh         %[tmp1],    0x140(%[output])                        \n\t"
+        "sh         %[input],   0x1c0(%[output])                        \n\t"
+        "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
+        PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
+        "sh         %[tmp1],    0x160(%[output])                        \n\t"
+        "sh         %[input],   0x1e0(%[output])                        \n\t"
+        "2:                                                             \n\t"
+        ".set       reorder                                             \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          [output]"+&r"(output),            [input]"+&r"(input),
+          [qmul]"+&r"(qmul)
+        : [ff_pw_1]"f"(ff_pw_1)
+        : "memory"
+    );
+}
+
+/**
+ * In-place DC transform and dequantisation over eight coefficients spaced
+ * 16 entries apart (block[0], block[16], ..., block[112]).
+ *
+ * Stage 1 reduces each pair (block[32 * r], block[32 * r + 16]), r = 0..3,
+ * to a sum and a difference.  Stage 2 runs a 4-point butterfly over the four
+ * sums and, separately, over the four differences.  Every result is scaled
+ * as (value * qmul + 128) >> 8 and written back: results derived from the
+ * sums go to block[32 * r], those derived from the differences go to
+ * block[32 * r + 16].
+ *
+ * @param block coefficient buffer, read and then overwritten at the eight
+ *              positions listed above
+ * @param qmul  dequantisation multiplier
+ */
+void ff_h264_chroma422_dc_dequant_idct_8_mmi(int16_t *block, int qmul)
+{
+    enum { PAIR_STEP = 16, ROW_STEP = 32, ROWS = 4 };
+    int pair[2][ROWS];  /* pair[0][r]: sum of pair r, pair[1][r]: difference */
+    int r, k;
+
+    /* Fetch every input before the first store: outputs reuse the slots. */
+    for (r = 0; r < ROWS; r++) {
+        const int first  = block[ROW_STEP * r];
+        const int second = block[ROW_STEP * r + PAIR_STEP];
+
+        pair[0][r] = first + second;
+        pair[1][r] = first - second;
+    }
+
+    for (k = 0; k < 2; k++) {
+        int16_t *out = block + PAIR_STEP * k;
+        const int even_sum  = pair[k][0] + pair[k][2];
+        const int even_diff = pair[k][0] - pair[k][2];
+        const int odd_sum   = pair[k][1] + pair[k][3];
+        const int odd_diff  = pair[k][1] - pair[k][3];
+
+        out[ROW_STEP * 0] = ((even_sum  + odd_sum)  * qmul + 128) >> 8;
+        out[ROW_STEP * 1] = ((even_diff + odd_diff) * qmul + 128) >> 8;
+        out[ROW_STEP * 2] = ((even_diff - odd_diff) * qmul + 128) >> 8;
+        out[ROW_STEP * 3] = ((even_sum  - odd_sum)  * qmul + 128) >> 8;
+    }
+}
+
+/**
+ * In-place 2x2 DC butterfly and dequantisation over block[0], block[16],
+ * block[32] and block[48].
+ *
+ * The pairs (block[0], block[16]) and (block[32], block[48]) are each turned
+ * into a sum and a difference; these are combined once more and every result
+ * is scaled as (value * qmul) >> 7.
+ *
+ * @param block coefficient buffer, read and then overwritten at the four
+ *              positions listed above
+ * @param qmul  dequantisation multiplier
+ */
+void ff_h264_chroma_dc_dequant_idct_8_mmi(int16_t *block, int qmul)
+{
+    /* All four inputs are consumed here, before any slot is overwritten. */
+    const int sum_a  = block[0]  + block[16];
+    const int diff_a = block[0]  - block[16];
+    const int sum_b  = block[32] + block[48];
+    const int diff_b = block[32] - block[48];
+
+    block[0]  = ((sum_a  + sum_b)  * qmul) >> 7;
+    block[16] = ((diff_a + diff_b) * qmul) >> 7;
+    block[32] = ((sum_a  - sum_b)  * qmul) >> 7;
+    block[48] = ((diff_a - diff_b) * qmul) >> 7;
+}
+
+/**
+ * In-place weighting of a 16-byte-wide block of 8-bit samples, implemented
+ * with Loongson MMI inline assembly.
+ *
+ * Each row is processed as two 8-byte halves (block and block + 8).  Per
+ * sample the assembly evaluates, in 16-bit lanes,
+ *     (sample * weight + offset') >> log2_denom
+ * and packs the result back to a byte with unsigned saturation.  offset' is
+ * the value prepared by the C prologue: offset << log2_denom, plus the
+ * rounding term 1 << (log2_denom - 1) when log2_denom is non-zero.
+ *
+ * @param block      first sample of the block; overwritten
+ * @param stride     byte distance between successive rows
+ * @param height     number of rows to process
+ * @param log2_denom right-shift applied after weighting
+ * @param weight     multiplier; halfword 0 is broadcast to the lanes
+ * @param offset     additive term (before the pre-shift)
+ *
+ * NOTE(review): ldc1/sdc1 presumably require 8-byte-aligned addresses --
+ * confirm that callers guarantee this for block and stride.
+ * NOTE(review): "offset <<= log2_denom" shifts a signed int; if a negative
+ * offset can be passed this is undefined in ISO C -- confirm.
+ */
+void ff_h264_weight_pixels16_8_mmi(uint8_t *block, int stride, int height,
+        int log2_denom, int weight, int offset)
+{
+    int y;
+    double ftmp[8];
+
+    offset <<= log2_denom;
+
+    /* Rounding term; skipped when no shift is applied. */
+    if (log2_denom)
+        offset += 1 << (log2_denom - 1);
+
+    for (y=0; y<height; y++, block+=stride) {
+        __asm__ volatile (
+            /* ftmp0 = 0: zero source for unpacking and pshufh selector. */
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+            /* Load both 8-byte halves of the row. */
+            "ldc1       %[ftmp1],   0x00(%[block0])                     \n\t"
+            "ldc1       %[ftmp2],   0x00(%[block1])                     \n\t"
+            /* Move the scalars into FP/MMI registers. */
+            "mtc1       %[weight],  %[ftmp3]                            \n\t"
+            "mtc1       %[offset],  %[ftmp4]                            \n\t"
+            "mtc1       %[log2_denom],              %[ftmp5]            \n\t"
+            /* Broadcast halfword 0 of weight/offset to all four lanes. */
+            "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
+            "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"
+            /* Zero-extend bytes to halfwords: high four, then low four. */
+            "punpckhbh  %[ftmp6],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp7],   %[ftmp2],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
+            /* sample * weight (low 16 bits of each product). */
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "pmullh     %[ftmp7],   %[ftmp7],       %[ftmp3]            \n\t"
+            "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp3]            \n\t"
+            /* + offset', with signed saturation. */
+            "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp4]            \n\t"
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp4]            \n\t"
+            "paddsh     %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
+            /* Arithmetic right shift by log2_denom. */
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "psrah      %[ftmp7],   %[ftmp7],       %[ftmp5]            \n\t"
+            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
+            "psrah      %[ftmp2],   %[ftmp2],       %[ftmp5]            \n\t"
+            /* Re-pack to bytes (unsigned saturation) and store in place. */
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
+            "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "sdc1       %[ftmp1],   0x00(%[block0])                     \n\t"
+            "sdc1       %[ftmp2],   0x00(%[block1])                     \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7])
+            : [block0]"r"(block),           [block1]"r"(block+8),
+              [weight]"r"(weight),          [offset]"r"(offset),
+              [log2_denom]"r"(log2_denom)
+            : "memory"
+        );
+    }
+}
+
+/**
+ * Bi-directional weighting of a 16-byte-wide block of 8-bit samples,
+ * implemented with Loongson MMI inline assembly.  The result replaces dst.
+ *
+ * Each row is processed as two 8-byte halves.  Per sample the assembly
+ * evaluates, in 16-bit lanes with signed-saturating additions,
+ *     (src * weights + offset' + dst * weightd) >> (log2_denom + 1)
+ * and packs the result to a byte with unsigned saturation, where
+ * offset' = ((offset + 1) | 1) << log2_denom is computed in C below.
+ *
+ * @param dst        first destination sample; read and overwritten
+ * @param src        first source sample; only read
+ * @param stride     byte distance between successive rows (both buffers)
+ * @param height     number of rows to process
+ * @param log2_denom the applied right-shift is log2_denom + 1
+ * @param weightd    multiplier for dst samples
+ * @param weights    multiplier for src samples
+ * @param offset     additive term (before the adjustment above)
+ *
+ * NOTE(review): ldc1/sdc1 presumably require 8-byte-aligned addresses --
+ * confirm that callers guarantee this for dst, src and stride.
+ */
+void ff_h264_biweight_pixels16_8_mmi(uint8_t *dst, uint8_t *src, int stride,
+        int height, int log2_denom, int weightd, int weights, int offset)
+{
+    int y;
+    double ftmp[9];
+
+    offset = ((offset + 1) | 1) << log2_denom;
+
+    for (y=0; y<height; y++, dst+=stride, src+=stride) {
+        __asm__ volatile (
+            /* ftmp0 = 0: zero source for unpacking and pshufh selector. */
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+            /* ---- first 8 bytes of the row ---- */
+            "ldc1       %[ftmp1],   0x00(%[src0])                       \n\t"
+            "ldc1       %[ftmp2],   0x00(%[dst0])                       \n\t"
+            /* Scalars into FP/MMI registers; the operand named
+             * log2_denom already carries log2_denom + 1. */
+            "mtc1       %[weights], %[ftmp3]                            \n\t"
+            "mtc1       %[weightd], %[ftmp4]                            \n\t"
+            "mtc1       %[offset],  %[ftmp5]                            \n\t"
+            "mtc1       %[log2_denom],              %[ftmp6]            \n\t"
+            /* Broadcast halfword 0 of each scalar to all four lanes. */
+            "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
+            "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"
+            "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]            \n\t"
+            /* Zero-extend src (ftmp7:ftmp1) and dst (ftmp8:ftmp2). */
+            "punpckhbh  %[ftmp7],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp8],   %[ftmp2],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
+            /* src * weights and dst * weightd. */
+            "pmullh     %[ftmp7],   %[ftmp7],       %[ftmp3]            \n\t"
+            "pmullh     %[ftmp8],   %[ftmp8],       %[ftmp4]            \n\t"
+            "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
+            /* Add offset', then the dst term (signed saturation). */
+            "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp5]            \n\t"
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
+            "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp8]            \n\t"
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            /* Shift, re-pack to bytes and store the first half. */
+            "psrah      %[ftmp7],   %[ftmp7],       %[ftmp6]            \n\t"
+            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "sdc1       %[ftmp1],   0x00(%[dst0])                       \n\t"
+            /* ---- second 8 bytes: same sequence, weights/offset/shift
+             * registers (ftmp3..ftmp6) are reused ---- */
+            "ldc1       %[ftmp1],   0x00(%[src1])                       \n\t"
+            "ldc1       %[ftmp2],   0x00(%[dst1])                       \n\t"
+            "punpckhbh  %[ftmp7],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp8],   %[ftmp2],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp7],   %[ftmp7],       %[ftmp3]            \n\t"
+            "pmullh     %[ftmp8],   %[ftmp8],       %[ftmp4]            \n\t"
+            "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
+            "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp5]            \n\t"
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
+            "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp8]            \n\t"
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "psrah      %[ftmp7],   %[ftmp7],       %[ftmp6]            \n\t"
+            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "sdc1       %[ftmp1],   0x00(%[dst1])                       \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [ftmp8]"=&f"(ftmp[8])
+            : [dst0]"r"(dst),               [dst1]"r"(dst+8),
+              [src0]"r"(src),               [src1]"r"(src+8),
+              [weights]"r"(weights),        [weightd]"r"(weightd),
+              [offset]"r"(offset),          [log2_denom]"r"(log2_denom+1)
+            : "memory"
+        );
+    }
+}
+
+/**
+ * In-place weighting of an 8-byte-wide block of 8-bit samples, implemented
+ * with Loongson MMI inline assembly.
+ *
+ * Per sample the assembly evaluates, in 16-bit lanes,
+ *     (sample * weight + offset') >> log2_denom
+ * and packs the result back to a byte with unsigned saturation.  offset' is
+ * the value prepared by the C prologue: offset << log2_denom, plus the
+ * rounding term 1 << (log2_denom - 1) when log2_denom is non-zero.
+ *
+ * @param block      first sample of the block; overwritten
+ * @param stride     byte distance between successive rows
+ * @param height     number of rows to process
+ * @param log2_denom right-shift applied after weighting
+ * @param weight     multiplier; halfword 0 is broadcast to the lanes
+ * @param offset     additive term (before the pre-shift)
+ *
+ * NOTE(review): ldc1/sdc1 presumably require 8-byte-aligned addresses --
+ * confirm that callers guarantee this for block and stride.
+ * NOTE(review): "offset <<= log2_denom" shifts a signed int; if a negative
+ * offset can be passed this is undefined in ISO C -- confirm.
+ */
+void ff_h264_weight_pixels8_8_mmi(uint8_t *block, int stride, int height,
+        int log2_denom, int weight, int offset)
+{
+    int y;
+    double ftmp[6];
+
+    offset <<= log2_denom;
+
+    /* Rounding term; skipped when no shift is applied. */
+    if (log2_denom)
+        offset += 1 << (log2_denom - 1);
+
+    for (y=0; y<height; y++, block+=stride) {
+        __asm__ volatile (
+            /* ftmp0 = 0: zero source for unpacking and pshufh selector. */
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+            /* Load the 8 samples of this row and the scalar operands. */
+            "ldc1       %[ftmp1],   0x00(%[block])                      \n\t"
+            "mtc1       %[weight],  %[ftmp2]                            \n\t"
+            "mtc1       %[offset],  %[ftmp3]                            \n\t"
+            "mtc1       %[log2_denom],              %[ftmp5]            \n\t"
+            /* Broadcast halfword 0 of weight/offset to all four lanes. */
+            "pshufh     %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
+            "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
+            /* Zero-extend bytes to halfwords: high four in ftmp4, low
+             * four in ftmp1. */
+            "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            /* sample * weight, + offset' (signed saturation), >> shift. */
+            "pmullh     %[ftmp4],   %[ftmp4],       %[ftmp2]            \n\t"
+            "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            "psrah      %[ftmp4],   %[ftmp4],       %[ftmp5]            \n\t"
+            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
+            /* Re-pack to bytes (unsigned saturation) and store in place. */
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp4]            \n\t"
+            "sdc1       %[ftmp1],   0x00(%[block])                      \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5])
+            : [block]"r"(block),            [weight]"r"(weight),
+              [offset]"r"(offset),          [log2_denom]"r"(log2_denom)
+            : "memory"
+        );
+    }
+}
+
+/**
+ * Bi-directional weighting of an 8-byte-wide block of 8-bit samples,
+ * implemented with Loongson MMI inline assembly.  The result replaces dst.
+ *
+ * Per sample the assembly evaluates, in 16-bit lanes with signed-saturating
+ * additions,
+ *     (src * weights + offset' + dst * weightd) >> (log2_denom + 1)
+ * and packs the result to a byte with unsigned saturation, where
+ * offset' = ((offset + 1) | 1) << log2_denom is computed in C below.
+ *
+ * @param dst        first destination sample; read and overwritten
+ * @param src        first source sample; only read
+ * @param stride     byte distance between successive rows (both buffers)
+ * @param height     number of rows to process
+ * @param log2_denom the applied right-shift is log2_denom + 1
+ * @param weightd    multiplier for dst samples
+ * @param weights    multiplier for src samples
+ * @param offset     additive term (before the adjustment above)
+ *
+ * NOTE(review): ftmp is declared with 9 elements and all of ftmp[0..8] are
+ * bound as asm outputs; ldc1/sdc1 presumably require 8-byte-aligned
+ * addresses -- confirm that callers guarantee this for dst, src and stride.
+ */
+void ff_h264_biweight_pixels8_8_mmi(uint8_t *dst, uint8_t *src, int stride,
+        int height, int log2_denom, int weightd, int weights, int offset)
+{
+    int y;
+    double ftmp[9];
+
+    offset = ((offset + 1) | 1) << log2_denom;
+
+    for (y=0; y<height; y++, dst+=stride, src+=stride) {
+        __asm__ volatile (
+            /* ftmp0 = 0: zero source for unpacking and pshufh selector. */
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+            /* Load 8 src and 8 dst samples. */
+            "ldc1       %[ftmp1],   0x00(%[src])                        \n\t"
+            "ldc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            /* Scalars into FP/MMI registers; the operand named
+             * log2_denom already carries log2_denom + 1. */
+            "mtc1       %[weights], %[ftmp3]                            \n\t"
+            "mtc1       %[weightd], %[ftmp4]                            \n\t"
+            "mtc1       %[offset],  %[ftmp5]                            \n\t"
+            "mtc1       %[log2_denom],              %[ftmp6]            \n\t"
+            /* Broadcast halfword 0 of each scalar to all four lanes. */
+            "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
+            "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"
+            "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]            \n\t"
+            /* Zero-extend src (ftmp7:ftmp1) and dst (ftmp8:ftmp2). */
+            "punpckhbh  %[ftmp7],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp8],   %[ftmp2],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
+            /* src * weights and dst * weightd. */
+            "pmullh     %[ftmp7],   %[ftmp7],       %[ftmp3]            \n\t"
+            "pmullh     %[ftmp8],   %[ftmp8],       %[ftmp4]            \n\t"
+            "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
+            /* Add offset', then the dst term (signed saturation). */
+            "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp5]            \n\t"
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
+            "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp8]            \n\t"
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            /* Shift, re-pack to bytes (unsigned saturation) and store. */
+            "psrah      %[ftmp7],   %[ftmp7],       %[ftmp6]            \n\t"
+            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [ftmp8]"=&f"(ftmp[8])
+            : [dst]"r"(dst),                [src]"r"(src),
+              [weights]"r"(weights),        [weightd]"r"(weightd),
+              [offset]"r"(offset),          [log2_denom]"r"(log2_denom+1)
+            : "memory"
+        );
+    }
+}
+
+/**
+ * In-place weighting of a 4-byte-wide block of 8-bit samples, implemented
+ * with Loongson MMI inline assembly.
+ *
+ * Per sample the assembly evaluates, in 16-bit lanes,
+ *     (sample * weight + offset') >> log2_denom
+ * and packs the result back to a byte with unsigned saturation.  offset' is
+ * the value prepared by the C prologue: offset << log2_denom, plus the
+ * rounding term 1 << (log2_denom - 1) when log2_denom is non-zero.
+ *
+ * Exactly four bytes per row are read and written; both accesses use
+ * unaligned-capable instructions.
+ *
+ * @param block      first sample of the block; overwritten
+ * @param stride     byte distance between successive rows
+ * @param height     number of rows to process
+ * @param log2_denom right-shift applied after weighting
+ * @param weight     multiplier; halfword 0 is broadcast to the lanes
+ * @param offset     additive term (before the pre-shift)
+ */
+void ff_h264_weight_pixels4_8_mmi(uint8_t *block, int stride, int height,
+        int log2_denom, int weight, int offset)
+{
+    int y;
+    double ftmp[5];
+    uint32_t low32;     /* GPR scratch for the 4-byte row load */
+
+    offset <<= log2_denom;
+
+    /* Rounding term; skipped when no shift is applied. */
+    if (log2_denom)
+        offset += 1 << (log2_denom - 1);
+
+    for (y=0; y<height; y++, block+=stride) {
+        __asm__ volatile (
+            /* ftmp0 = 0: zero source for unpacking and pshufh selector. */
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+            /* ulw, not uld: the row is only four bytes wide, so an
+             * 8-byte load would touch the four bytes that follow it. */
+            "ulw        %[low32],   0x00(%[block])                      \n\t"
+            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            "mtc1       %[weight],  %[ftmp2]                            \n\t"
+            "mtc1       %[offset],  %[ftmp3]                            \n\t"
+            "mtc1       %[log2_denom],              %[ftmp4]            \n\t"
+            /* Broadcast halfword 0 of weight/offset to all four lanes. */
+            "pshufh     %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
+            "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
+            /* Zero-extend the four samples to halfwords. */
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            /* sample * weight, + offset' (signed saturation), >> shift. */
+            "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp4]            \n\t"
+            /* Re-pack to bytes; the low word holds the four results. */
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            /* Unaligned 32-bit store of the low word. */
+            "gsswlc1    %[ftmp1],   0x03(%[block])                      \n\t"
+            "gsswrc1    %[ftmp1],   0x00(%[block])                      \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),
+              [low32]"=&r"(low32)
+            : [block]"r"(block),            [weight]"r"(weight),
+              [offset]"r"(offset),          [log2_denom]"r"(log2_denom)
+            : "memory"
+        );
+    }
+}
+
+/**
+ * Bi-directional weighting of a 4-byte-wide block of 8-bit samples,
+ * implemented with Loongson MMI inline assembly.  The result replaces dst.
+ *
+ * Per sample the assembly evaluates, in 16-bit lanes with signed-saturating
+ * additions,
+ *     (src * weights + offset' + dst * weightd) >> (log2_denom + 1)
+ * and packs the result to a byte with unsigned saturation, where
+ * offset' = ((offset + 1) | 1) << log2_denom is computed in C below.
+ *
+ * Exactly four bytes per row are read from src and dst and written to dst;
+ * all accesses use unaligned-capable instructions.
+ *
+ * @param dst        first destination sample; read and overwritten
+ * @param src        first source sample; only read
+ * @param stride     byte distance between successive rows (both buffers)
+ * @param height     number of rows to process
+ * @param log2_denom the applied right-shift is log2_denom + 1
+ * @param weightd    multiplier for dst samples
+ * @param weights    multiplier for src samples
+ * @param offset     additive term (before the adjustment above)
+ */
+void ff_h264_biweight_pixels4_8_mmi(uint8_t *dst, uint8_t *src, int stride,
+        int height, int log2_denom, int weightd, int weights, int offset)
+{
+    int y;
+    double ftmp[7];
+    uint32_t low32;     /* GPR scratch for the 4-byte row loads */
+
+    offset = ((offset + 1) | 1) << log2_denom;
+
+    for (y=0; y<height; y++, dst+=stride, src+=stride) {
+        __asm__ volatile (
+            /* ftmp0 = 0: zero source for unpacking and pshufh selector. */
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+            /* ulw, not uld: each row is only four bytes wide, so an
+             * 8-byte load would touch the four bytes that follow it. */
+            "ulw        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            "ulw        %[low32],   0x00(%[dst])                        \n\t"
+            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            /* Scalars into FP/MMI registers; the operand named
+             * log2_denom already carries log2_denom + 1. */
+            "mtc1       %[weights], %[ftmp3]                            \n\t"
+            "mtc1       %[weightd], %[ftmp4]                            \n\t"
+            "mtc1       %[offset],  %[ftmp5]                            \n\t"
+            "mtc1       %[log2_denom],              %[ftmp6]            \n\t"
+            /* Broadcast halfword 0 of each scalar to all four lanes. */
+            "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
+            "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"
+            "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]            \n\t"
+            /* Zero-extend the four src and four dst samples. */
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
+            /* src * weights and dst * weightd. */
+            "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
+            /* Add offset', then the dst term (signed saturation). */
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
+            /* Re-pack to bytes; the low word holds the four results. */
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            /* Unaligned 32-bit store of the low word. */
+            "gsswlc1    %[ftmp1],   0x03(%[dst])                        \n\t"
+            "gsswrc1    %[ftmp1],   0x00(%[dst])                        \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),
+              [low32]"=&r"(low32)
+            : [dst]"r"(dst),                [src]"r"(src),
+              [weights]"r"(weights),        [weightd]"r"(weightd),
+              [offset]"r"(offset),          [log2_denom]"r"(log2_denom+1)
+            : "memory"
+        );
+    }
+}
+
+void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
+        int8_t *tc0)            /* only the first 4 bytes at tc0 are loaded */
+{   /* Filters 8 bytes per row across the edge between rows pix-stride and pix. */
+    double ftmp[12];            /* C homes for the FP/MMI operands ftmp0..ftmp11 */
+    mips_reg addr[2];           /* addr0 = 2*stride, addr1 = pix - 3*stride */
+    uint64_t low32;             /* GPR staging for the tc0 word */
+
+    __asm__ volatile (
+        PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t" /* addr0 = 2*stride */
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t" /* ftmp0 = 0 */
+        PTR_ADDU   "%[addr1],   %[stride],      %[addr0]                \n\t" /* addr1 = 3*stride */
+        "addi       %[alpha],   %[alpha],       -0x01                   \n\t" /* thresholds are applied as value-1 */
+        PTR_SUBU   "%[addr1],   $0,             %[addr1]                \n\t" /* addr1 = -3*stride */
+        "addi       %[beta],    %[beta],        -0x01                   \n\t"
+        PTR_ADDU   "%[addr1],   %[addr1],       %[pix]                  \n\t" /* addr1 = pix - 3*stride */
+        "ldc1       %[ftmp3],   0x00(%[pix])                            \n\t" /* q0 = row pix */
+        "gsldxc1    %[ftmp1],   0x00(%[addr1],  %[stride])              \n\t" /* p1 = row pix-2*stride */
+        "gsldxc1    %[ftmp2],   0x00(%[addr1],  %[addr0])               \n\t" /* p0 = row pix-stride */
+        "gsldxc1    %[ftmp4],   0x00(%[pix],    %[stride])              \n\t" /* q1 = row pix+stride */
+        "mtc1       %[alpha],   %[ftmp5]                                \n\t"
+        "mtc1       %[beta],    %[ftmp6]                                \n\t"
+        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t" /* replicate halfword 0 (selector ftmp0 = 0) */
+        "pshufh     %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t" /* alpha-1 in all 8 bytes */
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t" /* beta-1 in all 8 bytes */
+        "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp2]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp2],       %[ftmp3]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t" /* ftmp8 = |p0 - q0| */
+        "psubusb    %[ftmp7],   %[ftmp2],       %[ftmp1]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t" /* nonzero where |p0-q0| > alpha-1 */
+        "psubusb    %[ftmp5],   %[ftmp1],       %[ftmp2]                \n\t"
+        "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t" /* ftmp5 = |p1 - p0| */
+        "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp4],       %[ftmp3]                \n\t"
+        "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t" /* ftmp5 = |q1 - q0| */
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "pcmpeqb    %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t" /* 0xff where all three differences pass */
+        "pcmpeqb    %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t" /* ftmp4 = 0xff in every byte */
+        "ulw        %[low32],   0x00(%[tc0])                            \n\t" /* 4-byte load: mtc1 consumes 32 bits only */
+        "mtc1       %[low32],   %[ftmp5]                                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "punpcklbh  %[ftmp9],   %[ftmp5],       %[ftmp5]                \n\t" /* ftmp9 = tc0[0] x4, tc0[1] x4 */
+        "pcmpgtb    %[ftmp5],   %[ftmp9],       %[ftmp4]                \n\t" /* 0xff where tc0 > -1 */
+        "ldc1       %[ftmp4],   0x00(%[addr1])                          \n\t" /* p2 = row pix-3*stride */
+        "and        %[ftmp10],  %[ftmp5],       %[ftmp8]                \n\t" /* ftmp10 = filter-enable mask */
+        "psubusb    %[ftmp8],   %[ftmp4],       %[ftmp2]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp2],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "pcmpeqb    %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t" /* 0xff where |p2-p0| <= beta-1 */
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        "and        %[ftmp5],   %[ftmp10],      %[ftmp9]                \n\t" /* tc0 on enabled bytes, 0 elsewhere */
+        "psubb      %[ftmp8],   %[ftmp5],       %[ftmp7]                \n\t" /* subtracting 0xff adds 1 to the p0/q0 limit */
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t" /* clip range for the p1 update */
+        "pavgb      %[ftmp5],   %[ftmp2],       %[ftmp3]                \n\t"
+        "ldc1       %[ftmp11],  0x00(%[addr1])                          \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "xor        %[ftmp5],   %[ftmp5],       %[ftmp11]               \n\t"
+        "and        %[ftmp5],   %[ftmp5],       %[ff_pb_1]              \n\t"
+        "psubusb    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t" /* undo the second pavgb round-up */
+        "psubusb    %[ftmp5],   %[ftmp1],       %[ftmp7]                \n\t"
+        "paddusb    %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        "pmaxub     %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "pminub     %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t" /* clamp to [p1 - tc0, p1 + tc0] */
+        "gssdxc1    %[ftmp4],   0x00(%[addr1],  %[stride])              \n\t" /* store new p1 row */
+        "gsldxc1    %[ftmp5],   0x00(%[pix],    %[addr0])               \n\t" /* q2 = row pix+2*stride */
+        "psubusb    %[ftmp4],   %[ftmp5],       %[ftmp3]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "pcmpeqb    %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t" /* 0xff where |q2-q0| <= beta-1 */
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        "psubb      %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t" /* another +1 on the p0/q0 limit */
+        "and        %[ftmp6],   %[ftmp9],       %[ftmp7]                \n\t" /* clip range for the q1 update */
+        "gsldxc1    %[ftmp4],   0x00(%[pix],    %[stride])              \n\t" /* reload q1 */
+        "pavgb      %[ftmp7],   %[ftmp2],       %[ftmp3]                \n\t"
+        "gsldxc1    %[ftmp11],  0x00(%[pix],    %[addr0])               \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ff_pb_1]              \n\t"
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp4],       %[ftmp6]                \n\t"
+        "paddusb    %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
+        "pmaxub     %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "pminub     %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t" /* clamp to [q1 - tc0, q1 + tc0] */
+        "gssdxc1    %[ftmp5],   0x00(%[pix],    %[stride])              \n\t" /* store new q1 row */
+        "xor        %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
+        "pcmpeqb    %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "and        %[ftmp6],   %[ftmp6],       %[ff_pb_1]              \n\t"
+        "xor        %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "xor        %[ftmp5],   %[ftmp5],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp1]                \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ff_pb_3]              \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp3]                \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "paddusb    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ff_pb_A1],    %[ftmp4]                \n\t" /* part of the delta below ff_pb_A1 */
+        "psubusb    %[ftmp4],   %[ftmp4],       %[ff_pb_A1]             \n\t" /* part of the delta above ff_pb_A1 */
+        "pminub     %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t" /* limit both parts to ftmp8 */
+        "pminub     %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
+        "psubusb    %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "paddusb    %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "paddusb    %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "gssdxc1    %[ftmp2],   0x00(%[addr1],  %[addr0])               \n\t" /* store new p0 row */
+        "sdc1       %[ftmp3],   0x00(%[pix])                            \n\t" /* store new q0 row */
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [low32]"=&r"(low32),
+          [alpha]"+&r"(alpha),              [beta]"+&r"(beta) /* decremented by the asm, hence read-write */
+        : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
+          [tc0]"r"(tc0),                    [ff_pb_1]"f"(ff_pb_1),
+          [ff_pb_3]"f"(ff_pb_3),            [ff_pb_A1]"f"(ff_pb_A1)
+        : "memory"
+    );
+}
+
+static void deblock_v8_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta)               /* alpha/beta: thresholds, applied as value-1 */
+{   /* Intra filter of 8 bytes per row across the edge between pix-stride and pix; rewrites rows pix-3*stride..pix+2*stride. */
+    DECLARE_ALIGNED(8, const uint64_t, stack[0x0a]); /* 80-byte spill area, asm uses offsets 0x00..0x40 */
+    double ftmp[16];            /* C homes for the FP/MMI operands ftmp0..ftmp15 */
+    uint64_t tmp[1];            /* GPR scratch, holds the constant 1 */
+    mips_reg addr[3];           /* addr0 = pix-4*stride, addr1 = 3*stride, addr2 = 2*stride */
+
+__asm__ volatile (
+"ori        %[tmp0],    $0,             0x01                    \n\t" /* tmp0 = 1 */
+"xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t" /* ftmp0 = 0 */
+"mtc1       %[tmp0],    %[ftmp9]                                \n\t" /* ftmp9 = 1, shift count for psrlh */
+PTR_SLL    "%[addr0],   %[stride],      0x02                    \n\t" /* addr0 = 4*stride */
+PTR_ADDU   "%[addr2],   %[stride],      %[stride]               \n\t" /* addr2 = 2*stride */
+PTR_ADDIU  "%[alpha],   %[alpha],       -0x01                   \n\t"
+PTR_SLL    "%[ftmp11],  %[ftmp9],       %[ftmp9]                \n\t" /* NOTE(review): shift macro on FP operands; presumably ftmp11 = 1 << 1 = 2 - confirm */
+"bltz       %[alpha],   1f                                      \n\t" /* nothing to do when alpha-1 < 0 */
+PTR_ADDU   "%[addr1],   %[addr2],       %[stride]               \n\t" /* addr1 = 3*stride */
+PTR_ADDIU  "%[beta],    %[beta],        -0x01                   \n\t"
+"bltz       %[beta],    1f                                      \n\t" /* nothing to do when beta-1 < 0 */
+PTR_SUBU   "%[addr0],   $0,             %[addr0]                \n\t"
+PTR_ADDU   "%[addr0],   %[addr0],       %[pix]                  \n\t" /* addr0 = pix - 4*stride */
+"ldc1       %[ftmp3],   0x00(%[pix])                            \n\t" /* q0 = row pix */
+"gsldxc1    %[ftmp1],   0x00(%[addr0],  %[addr2])               \n\t" /* p1 = row pix-2*stride */
+"gsldxc1    %[ftmp2],   0x00(%[addr0],  %[addr1])               \n\t" /* p0 = row pix-stride */
+"gsldxc1    %[ftmp4],   0x00(%[pix],    %[stride])              \n\t" /* q1 = row pix+stride */
+"mtc1       %[alpha],   %[ftmp5]                                \n\t"
+"mtc1       %[beta],    %[ftmp6]                                \n\t"
+"pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t" /* replicate halfword 0 (selector ftmp0 = 0) */
+"pshufh     %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+"packushb   %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t" /* alpha-1 in all 8 bytes */
+"psubusb    %[ftmp7],   %[ftmp3],       %[ftmp2]                \n\t"
+"psubusb    %[ftmp8],   %[ftmp2],       %[ftmp3]                \n\t"
+"packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t" /* beta-1 in all 8 bytes */
+"or         %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t" /* ftmp8 = |p0 - q0| */
+"sdc1       %[ftmp5],   0x10+%[stack]                           \n\t" /* spill alpha-1 bytes */
+"psubusb    %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+"psubusb    %[ftmp7],   %[ftmp2],       %[ftmp1]                \n\t"
+"psubusb    %[ftmp5],   %[ftmp1],       %[ftmp2]                \n\t"
+"or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t" /* ftmp5 = |p1 - p0| */
+"psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+"or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+"psubusb    %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
+"psubusb    %[ftmp5],   %[ftmp4],       %[ftmp3]                \n\t"
+"or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t" /* ftmp5 = |q1 - q0| */
+"psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+"or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+"xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+"ldc1       %[ftmp5],   0x10+%[stack]                           \n\t" /* reload alpha-1 bytes */
+"pcmpeqb    %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t" /* 0xff where all three differences pass */
+"ldc1       %[ftmp10],  %[ff_pb_1]                              \n\t" /* ftmp10 = ff_pb_1, kept for the whole routine */
+"sdc1       %[ftmp8],   0x20+%[stack]                           \n\t" /* spill base filter mask */
+"pavgb      %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+"psubusb    %[ftmp8],   %[ftmp3],       %[ftmp2]                \n\t"
+"pavgb      %[ftmp5],   %[ftmp5],       %[ftmp10]               \n\t" /* tighter threshold: avg(avg(alpha-1, 0), ff_pb_1) */
+"psubusb    %[ftmp7],   %[ftmp2],       %[ftmp3]                \n\t"
+"psubusb    %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+"psubusb    %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+"ldc1       %[ftmp15],  0x20+%[stack]                           \n\t"
+"pcmpeqb    %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t" /* 0xff where |p0-q0| passes the tighter threshold */
+"and        %[ftmp7],   %[ftmp7],       %[ftmp15]               \n\t" /* ... and the base mask is set */
+"gsldxc1    %[ftmp15],  0x00(%[addr0],  %[stride])              \n\t" /* p2 = row pix-3*stride */
+"psubusb    %[ftmp8],   %[ftmp15],      %[ftmp2]                \n\t"
+"psubusb    %[ftmp5],   %[ftmp2],       %[ftmp15]               \n\t"
+"psubusb    %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
+"psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+"pcmpeqb    %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t" /* 0xff where |p2-p0| <= beta-1 */
+"and        %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+"gsldxc1    %[ftmp14],  0x00(%[pix],    %[addr2])               \n\t" /* q2 = row pix+2*stride */
+"sdc1       %[ftmp5],   0x30+%[stack]                           \n\t" /* spill p-side mask */
+"psubusb    %[ftmp8],   %[ftmp14],      %[ftmp3]                \n\t"
+"psubusb    %[ftmp5],   %[ftmp3],       %[ftmp14]               \n\t"
+"psubusb    %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
+"psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+"pcmpeqb    %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t" /* 0xff where |q2-q0| <= beta-1 */
+"and        %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+"sdc1       %[ftmp5],   0x40+%[stack]                           \n\t" /* spill q-side mask */
+"pavgb      %[ftmp5],   %[ftmp15],      %[ftmp1]                \n\t" /* --- p side: new p0, p1, p2 --- */
+"pavgb      %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
+"pavgb      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+"sdc1       %[ftmp6],   0x10+%[stack]                           \n\t" /* stack+0x10 now holds avg(p0, q0) */
+"paddb      %[ftmp7],   %[ftmp15],      %[ftmp1]                \n\t"
+"paddb      %[ftmp8],   %[ftmp2],       %[ftmp3]                \n\t"
+"paddb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+"mov.d      %[ftmp8],   %[ftmp7]                                \n\t"
+"sdc1       %[ftmp7],   0x00+%[stack]                           \n\t" /* stack+0x00 = p2+p1+p0+q0 (paddb, wraps per byte) */
+"psrlh      %[ftmp7],   %[ftmp7],       %[ftmp9]                \n\t"
+"pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+"xor        %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+"and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+"psubb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t" /* ftmp5 = candidate for the p1 row */
+"pavgb      %[ftmp6],   %[ftmp15],      %[ftmp4]                \n\t"
+"psubb      %[ftmp7],   %[ftmp15],      %[ftmp4]                \n\t"
+"paddb      %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
+"psubb      %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+"and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+"psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+"ldc1       %[ftmp13],  0x10+%[stack]                           \n\t"
+"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+"psrlh      %[ftmp8],   %[ftmp8],       %[ftmp11]               \n\t"
+"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp13]               \n\t"
+"pavgb      %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
+"xor        %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
+"and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
+"psubb      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+"xor        %[ftmp8],   %[ftmp2],       %[ftmp4]                \n\t"
+"pavgb      %[ftmp7],   %[ftmp2],       %[ftmp4]                \n\t"
+"and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
+"psubb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+"ldc1       %[ftmp13],  0x30+%[stack]                           \n\t" /* p-side mask */
+"pavgb      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+"ldc1       %[ftmp12],  0x20+%[stack]                           \n\t" /* base mask */
+"xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+"xor        %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+"and        %[ftmp6],   %[ftmp6],       %[ftmp13]               \n\t"
+"and        %[ftmp7],   %[ftmp7],       %[ftmp12]               \n\t"
+"xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+"xor        %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t" /* xor/and/xor = per-byte select by the masks */
+"gssdxc1    %[ftmp6],   0x00(%[addr0],  %[addr1])               \n\t" /* store new p0 row (pix-stride) */
+"ldc1       %[ftmp6],   0x00(%[addr0])                          \n\t" /* p3 = row pix-4*stride */
+"paddb      %[ftmp7],   %[ftmp15],      %[ftmp6]                \n\t"
+"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp15]               \n\t"
+"ldc1       %[ftmp12],  0x00+%[stack]                           \n\t"
+"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+"paddb      %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+"paddb      %[ftmp7],   %[ftmp7],       %[ftmp12]               \n\t"
+"psrlh      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
+"pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+"xor        %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+"and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+"ldc1       %[ftmp12],  0x30+%[stack]                           \n\t" /* p-side mask */
+"psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+"xor        %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+"xor        %[ftmp6],   %[ftmp6],       %[ftmp15]               \n\t"
+"and        %[ftmp5],   %[ftmp5],       %[ftmp12]               \n\t"
+"and        %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
+"xor        %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+"xor        %[ftmp6],   %[ftmp6],       %[ftmp15]               \n\t"
+"gssdxc1    %[ftmp5],   0x00(%[addr0],  %[addr2])               \n\t" /* store new p1 row (pix-2*stride) */
+"gssdxc1    %[ftmp6],   0x00(%[addr0],  %[stride])              \n\t" /* store new p2 row (pix-3*stride) */
+"pavgb      %[ftmp5],   %[ftmp14],      %[ftmp4]                \n\t" /* --- q side: same steps with p/q roles swapped --- */
+"pavgb      %[ftmp6],   %[ftmp3],       %[ftmp2]                \n\t"
+"pavgb      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+"sdc1       %[ftmp6],   0x10+%[stack]                           \n\t"
+"paddb      %[ftmp7],   %[ftmp14],      %[ftmp4]                \n\t"
+"paddb      %[ftmp8],   %[ftmp3],       %[ftmp2]                \n\t"
+"paddb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+"mov.d      %[ftmp8],   %[ftmp7]                                \n\t"
+"sdc1       %[ftmp7],   0x00+%[stack]                           \n\t" /* stack+0x00 = q2+q1+q0+p0 (paddb, wraps per byte) */
+"psrlh      %[ftmp7],   %[ftmp7],       %[ftmp9]                \n\t"
+"pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+"xor        %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+"and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+"psubb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t" /* ftmp5 = candidate for the q1 row */
+"pavgb      %[ftmp6],   %[ftmp14],      %[ftmp1]                \n\t"
+"paddb      %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
+"psubb      %[ftmp7],   %[ftmp14],      %[ftmp1]                \n\t"
+"psubb      %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+"and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+"psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+"ldc1       %[ftmp12],  0x10+%[stack]                           \n\t"
+"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
+"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
+"psrlh      %[ftmp8],   %[ftmp8],       %[ftmp11]               \n\t"
+"pavgb      %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
+"xor        %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
+"and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
+"psubb      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+"xor        %[ftmp8],   %[ftmp3],       %[ftmp1]                \n\t"
+"pavgb      %[ftmp7],   %[ftmp3],       %[ftmp1]                \n\t"
+"and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
+"ldc1       %[ftmp12],  0x40+%[stack]                           \n\t" /* q-side mask */
+"psubb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+"ldc1       %[ftmp13],  0x20+%[stack]                           \n\t" /* base mask */
+"pavgb      %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+"xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+"xor        %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
+"and        %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
+"and        %[ftmp7],   %[ftmp7],       %[ftmp13]               \n\t"
+"xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+"xor        %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+"sdc1       %[ftmp6],   0x00(%[pix])                            \n\t" /* store new q0 row (pix) */
+"gsldxc1    %[ftmp6],   0x00(%[pix],    %[addr1])               \n\t" /* q3 = row pix+3*stride */
+"paddb      %[ftmp7],   %[ftmp14],      %[ftmp6]                \n\t"
+"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp14]               \n\t"
+"ldc1       %[ftmp12],  0x00+%[stack]                           \n\t"
+"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+"paddb      %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+"paddb      %[ftmp7],   %[ftmp7],       %[ftmp12]               \n\t"
+"psrlh      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
+"pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+"xor        %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+"and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+"ldc1       %[ftmp12],  0x40+%[stack]                           \n\t" /* q-side mask */
+"psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+"xor        %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
+"xor        %[ftmp6],   %[ftmp6],       %[ftmp14]               \n\t"
+"and        %[ftmp5],   %[ftmp5],       %[ftmp12]               \n\t"
+"and        %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
+"xor        %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
+"xor        %[ftmp6],   %[ftmp6],       %[ftmp14]               \n\t"
+"gssdxc1    %[ftmp5],   0x00(%[pix],    %[stride])              \n\t" /* store new q1 row (pix+stride) */
+"gssdxc1    %[ftmp6],   0x00(%[pix],    %[addr2])               \n\t" /* store new q2 row (pix+2*stride) */
+"1:                                                             \n\t" /* target of the two early-out branches */
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
+          [ftmp12]"=&f"(ftmp[12]),          [ftmp13]"=&f"(ftmp[13]),
+          [ftmp14]"=&f"(ftmp[14]),          [ftmp15]"=&f"(ftmp[15]),
+  [tmp0]"=&r"(tmp[0]),
+  [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+  [addr2]"=&r"(addr[2]),
+  [alpha]"+&r"(alpha),              [beta]"+&r"(beta) /* read-write: the asm decrements both */
+        : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
+  [stack]"m"(stack[0]),             [ff_pb_1]"m"(ff_pb_1) /* NOTE(review): stack is const and an input here, yet sdc1 stores into it - confirm */
+: "memory"
+);
+}
+
+void ff_deblock_v_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
+        int8_t *tc0)            /* exactly 4 bytes are read from tc0 */
+{   /* Filters 8 bytes per row across the edge between pix-stride and pix; only those two rows are rewritten. */
+    double ftmp[9];             /* C homes for the FP/MMI operands ftmp0..ftmp8 */
+    mips_reg addr[1];           /* addr0 = pix - 2*stride */
+    uint64_t low32;             /* GPR staging for the tc0 word */
+
+    __asm__ volatile (
+        "addi       %[alpha],   %[alpha],       -0x01                   \n\t" /* thresholds are applied as value-1 */
+        "addi       %[beta],    %[beta],        -0x01                   \n\t"
+        "or         %[addr0],   $0,             %[pix]                  \n\t"
+        PTR_SUBU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        PTR_SUBU   "%[addr0],   %[addr0],       %[stride]               \n\t" /* addr0 = pix - 2*stride */
+        "ldc1       %[ftmp1],   0x00(%[addr0])                          \n\t" /* p1 = row pix-2*stride */
+        "gsldxc1    %[ftmp2],   0x00(%[addr0],  %[stride])              \n\t" /* p0 = row pix-stride */
+        "ldc1       %[ftmp3],   0x00(%[pix])                            \n\t" /* q0 = row pix */
+        "gsldxc1    %[ftmp4],   0x00(%[pix],    %[stride])              \n\t" /* q1 = row pix+stride */
+
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t" /* ftmp0 = 0 */
+        "mtc1       %[alpha],   %[ftmp5]                                \n\t"
+        "mtc1       %[beta],    %[ftmp6]                                \n\t"
+        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t" /* replicate halfword 0 (selector ftmp0 = 0) */
+        "pshufh     %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t" /* alpha-1 in all 8 bytes */
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t" /* beta-1 in all 8 bytes */
+        "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp2]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp2],       %[ftmp3]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t" /* ftmp8 = |p0 - q0| */
+        "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp2],       %[ftmp1]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp1],       %[ftmp2]                \n\t"
+        "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t" /* ftmp5 = |p1 - p0| */
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp4],       %[ftmp3]                \n\t"
+        "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t" /* ftmp5 = |q1 - q0| */
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "pcmpeqb    %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t" /* 0xff where all three differences pass */
+        "ulw        %[low32],   0x00(%[tc0])                            \n\t" /* 4-byte load: mtc1 consumes 32 bits only */
+        "mtc1       %[low32],   %[ftmp7]                                \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t" /* each tc0 byte duplicated: 4 values -> 8 bytes */
+        "and        %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t" /* tc0 on enabled bytes, 0 elsewhere */
+        "pcmpeqb    %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t" /* ftmp5 = 0xff in every byte */
+        "xor        %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
+        "xor        %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "and        %[ftmp6],   %[ftmp6],       %[ff_pb_1]              \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp1]                \n\t"
+        "xor        %[ftmp5],   %[ftmp5],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ff_pb_3]              \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp3]                \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "paddusb    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ff_pb_A1],    %[ftmp4]                \n\t" /* part of the delta below ff_pb_A1 */
+        "psubusb    %[ftmp4],   %[ftmp4],       %[ff_pb_A1]             \n\t" /* part of the delta above ff_pb_A1 */
+        "pminub     %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t" /* limit both parts to the masked tc0 */
+        "pminub     %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
+        "psubusb    %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "paddusb    %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "paddusb    %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+
+        "gssdxc1    %[ftmp2],   0x00(%[addr0],  %[stride])              \n\t" /* store new p0 row */
+        "sdc1       %[ftmp3],   0x00(%[pix])                            \n\t" /* store new q0 row */
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),
+          [addr0]"=&r"(addr[0]),
+          [low32]"=&r"(low32),
+          [alpha]"+&r"(alpha),              [beta]"+&r"(beta) /* decremented by the asm, hence read-write */
+        : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
+          [tc0]"r"(tc0),                    [ff_pb_1]"f"(ff_pb_1),
+          [ff_pb_3]"f"(ff_pb_3),            [ff_pb_A1]"f"(ff_pb_A1)
+        : "memory"
+    );
+}
+
+void ff_deblock_v_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta)               /* alpha/beta: thresholds, applied as value-1 */
+{   /* Intra filter of 8 bytes per row across the edge between pix-stride and pix; only those two rows are rewritten. */
+    double ftmp[9];             /* C homes for the FP/MMI operands ftmp0..ftmp8 */
+    mips_reg addr[1];           /* addr0 = pix - 2*stride */
+
+    __asm__ volatile (
+        "addi       %[alpha],   %[alpha],       -0x01                   \n\t"
+        "addi       %[beta],    %[beta],        -0x01                   \n\t"
+        "or         %[addr0],   $0,             %[pix]                  \n\t"
+        PTR_SUBU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        PTR_SUBU   "%[addr0],   %[addr0],       %[stride]               \n\t" /* addr0 = pix - 2*stride */
+        "ldc1       %[ftmp1],   0x00(%[addr0])                          \n\t" /* p1 = row pix-2*stride */
+        "gsldxc1    %[ftmp2],   0x00(%[addr0],  %[stride])              \n\t" /* p0 = row pix-stride */
+        "ldc1       %[ftmp3],   0x00(%[pix])                            \n\t" /* q0 = row pix */
+        "gsldxc1    %[ftmp4],   0x00(%[pix],    %[stride])              \n\t" /* q1 = row pix+stride */
+
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t" /* ftmp0 = 0 */
+        "mtc1       %[alpha],   %[ftmp5]                                \n\t"
+        "mtc1       %[beta],    %[ftmp6]                                \n\t"
+        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t" /* replicate halfword 0 (selector ftmp0 = 0) */
+        "pshufh     %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t" /* alpha-1 in all 8 bytes */
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t" /* beta-1 in all 8 bytes */
+        "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp2]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp2],       %[ftmp3]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t" /* ftmp8 = |p0 - q0| */
+        "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp2],       %[ftmp1]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp1],       %[ftmp2]                \n\t"
+        "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t" /* ftmp5 = |p1 - p0| */
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp4],       %[ftmp3]                \n\t"
+        "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t" /* ftmp5 = |q1 - q0| */
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "pcmpeqb    %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t" /* 0xff where all three differences pass */
+        "mov.d      %[ftmp6],   %[ftmp2]                                \n\t" /* keep original p0 */
+        "mov.d      %[ftmp7],   %[ftmp3]                                \n\t" /* keep original q0 */
+        "xor        %[ftmp5],   %[ftmp2],       %[ftmp4]                \n\t"
+        "and        %[ftmp5],   %[ftmp5],       %[ff_pb_1]              \n\t"
+        "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t" /* (p0 + q1) >> 1, round-up undone */
+        "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp1]                \n\t" /* candidate p0 = avg of that with p1 */
+        "xor        %[ftmp5],   %[ftmp3],       %[ftmp1]                \n\t"
+        "and        %[ftmp5],   %[ftmp5],       %[ff_pb_1]              \n\t"
+        "pavgb      %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "psubusb    %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t" /* (q0 + p1) >> 1, round-up undone */
+        "pavgb      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t" /* candidate q0 = avg of that with q1 */
+        "psubb      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "psubb      %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "and        %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t" /* keep the change only on enabled bytes */
+        "and        %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
+        "paddb      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "paddb      %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+
+        "gssdxc1    %[ftmp2],   0x00(%[addr0],  %[stride])              \n\t" /* store new p0 row */
+        "sdc1       %[ftmp3],   0x00(%[pix])                            \n\t" /* store new q0 row */
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),
+          [addr0]"=&r"(addr[0]),
+          [alpha]"+&r"(alpha),              [beta]"+&r"(beta) /* decremented by the asm, hence read-write */
+        : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
+          [ff_pb_1]"f"(ff_pb_1)
+        : "memory"
+    );
+}
+
+void ff_deblock_h_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
+        int8_t *tc0)
+{   /* Loads 4 bytes from pix-2 on each of 8 rows, filters the middle two columns, stores all 4 back. */
+    double ftmp[11];    /* scratch bound to the "f" operands */
+    mips_reg addr[6];   /* scratch for stride multiples and row addresses */
+    uint64_t low32;     /* integer register that stages each 32-bit load for mtc1 */
+
+    __asm__ volatile (
+        "addi       %[alpha],   %[alpha],       -0x01                   \n\t" /* thresholds are applied as alpha-1 */
+        "addi       %[beta],    %[beta],        -0x01                   \n\t" /* ... and beta-1 */
+        PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t" /* addr0 = 2*stride */
+        PTR_ADDI   "%[pix],     %[pix],         -0x02                   \n\t" /* start two bytes left of pix */
+        PTR_ADDU   "%[addr1],   %[addr0],       %[stride]               \n\t" /* addr1 = 3*stride */
+        PTR_ADDU   "%[addr2],   %[addr0],       %[addr0]                \n\t" /* addr2 = 4*stride */
+        "or         %[addr5],   $0,             %[pix]                  \n\t" /* addr5 = row 0 */
+        PTR_ADDU   "%[pix],     %[pix],         %[addr1]                \n\t" /* pix   = row 3 */
+        "ulw        %[low32],   0x00(%[addr5])                          \n\t" /* row 0; only 32 bits reach the FPR */
+        "mtc1       %[low32],   %[ftmp0]                                \n\t"
+        PTR_ADDU   "%[addr3],   %[addr5],       %[stride]               \n\t"
+        "ulw        %[low32],   0x00(%[addr3])                          \n\t" /* row 1 */
+        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        PTR_ADDU   "%[addr4],   %[addr5],       %[addr0]                \n\t"
+        "ulw        %[low32],   0x00(%[addr4])                          \n\t" /* row 2 */
+        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        "ulw        %[low32],   0x00(%[pix])                            \n\t" /* row 3 */
+        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t" /* interleave rows 0/1 */
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t" /* interleave rows 2/3 */
+        PTR_ADDU   "%[addr3],   %[pix],         %[stride]               \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "ulw        %[low32],   0x00(%[addr3])                          \n\t" /* row 4 */
+        "mtc1       %[low32],   %[ftmp4]                                \n\t"
+        PTR_ADDU   "%[addr4],   %[pix],         %[addr0]                \n\t"
+        "ulw        %[low32],   0x00(%[addr4])                          \n\t" /* row 5 */
+        "mtc1       %[low32],   %[ftmp6]                                \n\t"
+        PTR_ADDU   "%[addr3],   %[pix],         %[addr1]                \n\t"
+        "ulw        %[low32],   0x00(%[addr3])                          \n\t" /* row 6 */
+        "mtc1       %[low32],   %[ftmp5]                                \n\t"
+        PTR_ADDU   "%[addr4],   %[pix],         %[addr2]                \n\t"
+        "ulw        %[low32],   0x00(%[addr4])                          \n\t" /* row 7 */
+        "mtc1       %[low32],   %[ftmp7]                                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t" /* interleave rows 4/5 */
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t" /* interleave rows 6/7 */
+        "mov.d      %[ftmp6],   %[ftmp4]                                \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+        "punpckhwd  %[ftmp1],   %[ftmp0],       %[ftmp4]                \n\t" /* ftmp1 = column 1 (byte at pix-1), rows 0..7 */
+        "punpckhwd  %[ftmp3],   %[ftmp2],       %[ftmp6]                \n\t" /* ftmp3 = column 3 (byte at pix+1) */
+        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t" /* ftmp0 = column 0 (byte at pix-2) */
+        "punpcklwd  %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t" /* ftmp2 = column 2 (byte at pix+0) */
+        "mov.d      %[ftmp9],   %[ftmp0]                                \n\t" /* keep column 0 for the write-back */
+        "mov.d      %[ftmp10],  %[ftmp3]                                \n\t" /* keep column 3: ftmp3 is reused below */
+        /* Build the per-row mask: 0xFF where all three differences are within alpha-1 / beta-1. */
+        "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t" /* ftmp8 = 0 (shuffle selector) */
+        "mtc1       %[alpha],   %[ftmp4]                                \n\t"
+        "mtc1       %[beta],    %[ftmp5]                                \n\t"
+        "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t" /* replicate halfword 0 */
+        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t" /* alpha-1 in every byte */
+        "packushb   %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t" /* beta-1 in every byte */
+        "psubusb    %[ftmp6],   %[ftmp2],       %[ftmp1]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp1],       %[ftmp2]                \n\t"
+        "or         %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t" /* |col1 - col2| */
+        "psubusb    %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t" /* non-zero where it exceeds alpha-1 */
+        "psubusb    %[ftmp6],   %[ftmp1],       %[ftmp0]                \n\t"
+        "psubusb    %[ftmp4],   %[ftmp0],       %[ftmp1]                \n\t"
+        "or         %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t" /* |col0 - col1| */
+        "psubusb    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t" /* non-zero where it exceeds beta-1 */
+        "or         %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
+        "psubusb    %[ftmp4],   %[ftmp3],       %[ftmp2]                \n\t"
+        "or         %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t" /* |col2 - col3| */
+        "psubusb    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t" /* non-zero where it exceeds beta-1 */
+        "or         %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "pcmpeqb    %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t" /* 0xFF where no limit was exceeded */
+        "ulw        %[low32],   0x00(%[tc0])                            \n\t" /* exactly the 4 tc0 bytes that are used */
+        "mtc1       %[low32],   %[ftmp6]                                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t" /* each tc0 byte covers two rows */
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t" /* ftmp7 = mask & tc: per-row clamp */
+        "pcmpeqb    %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t" /* all ones */
+        "xor        %[ftmp5],   %[ftmp1],       %[ftmp2]                \n\t"
+        "xor        %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t" /* ~col3 */
+        "and        %[ftmp5],   %[ftmp5],       %[ff_pb_1]              \n\t" /* (col1 ^ col2) & ff_pb_1 */
+        "pavgb      %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t" /* avg(~col3, col0) */
+        "xor        %[ftmp4],   %[ftmp4],       %[ftmp1]                \n\t" /* ~col1 */
+        "pavgb      %[ftmp3],   %[ftmp3],       %[ff_pb_3]              \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp2]                \n\t" /* avg(~col1, col2) */
+        "pavgb      %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
+        "paddusb    %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t" /* biased delta */
+        "psubusb    %[ftmp6],   %[ff_pb_A1],    %[ftmp3]                \n\t" /* part of the delta below the ff_pb_A1 bias */
+        "psubusb    %[ftmp3],   %[ftmp3],       %[ff_pb_A1]             \n\t" /* part of the delta above the ff_pb_A1 bias */
+        "pminub     %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t" /* clamp both parts to mask & tc */
+        "pminub     %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t" /* col1 = col1 - ftmp6 + ftmp3 (saturating) */
+        "psubusb    %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t" /* col2 = col2 - ftmp3 + ftmp6 (saturating) */
+        "paddusb    %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "paddusb    %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        /* Transpose back to rows and store 4 bytes per row with gsswlc1/gsswrc1 pairs. */
+        "punpckhwd  %[ftmp4],   %[ftmp9],       %[ftmp9]                \n\t" /* rows 4..7 of column 0 */
+        "punpckhwd  %[ftmp5],   %[ftmp1],       %[ftmp1]                \n\t" /* rows 4..7 of column 1 */
+        "punpckhwd  %[ftmp6],   %[ftmp2],       %[ftmp2]                \n\t" /* rows 4..7 of column 2 */
+        "punpcklbh  %[ftmp0],   %[ftmp9],       %[ftmp1]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp10]               \n\t"
+        "punpcklhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t" /* rows 0 and 1 */
+        "punpckhhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t" /* rows 2 and 3 */
+        "gsswlc1    %[ftmp1],   0x03(%[addr5])                          \n\t" /* row 0 */
+        "gsswrc1    %[ftmp1],   0x00(%[addr5])                          \n\t"
+        PTR_ADDU   "%[addr3],   %[addr5],       %[stride]               \n\t"
+        "punpckhwd  %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
+        "gsswlc1    %[ftmp1],   0x03(%[addr3])                          \n\t" /* row 1 */
+        PTR_ADDU   "%[addr4],   %[addr5],       %[addr0]                \n\t"
+        "gsswrc1    %[ftmp1],   0x00(%[addr3])                          \n\t"
+        "gsswlc1    %[ftmp0],   0x03(%[addr4])                          \n\t" /* row 2 */
+        "gsswrc1    %[ftmp0],   0x00(%[addr4])                          \n\t"
+        "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "punpckhwd  %[ftmp3],   %[ftmp10],      %[ftmp10]               \n\t" /* rows 4..7 of column 3 */
+        "gsswlc1    %[ftmp0],   0x03(%[pix])                            \n\t" /* row 3 */
+        "gsswrc1    %[ftmp0],   0x00(%[pix])                            \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+        PTR_ADDU   "%[addr3],   %[pix],         %[stride]               \n\t"
+        "punpcklhw  %[ftmp5],   %[ftmp4],       %[ftmp6]                \n\t" /* rows 4 and 5 */
+        "punpckhhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t" /* rows 6 and 7 */
+        "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t" /* row 4 */
+        "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        PTR_ADDU   "%[addr3],   %[pix],         %[addr0]                \n\t"
+        PTR_ADDU   "%[addr4],   %[pix],         %[addr1]                \n\t"
+        "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t" /* row 5 */
+        "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
+        "gsswlc1    %[ftmp4],   0x03(%[addr4])                          \n\t" /* row 6 */
+        PTR_ADDU   "%[addr3],   %[pix],         %[addr2]                \n\t"
+        "punpckhwd  %[ftmp9],   %[ftmp4],       %[ftmp4]                \n\t"
+        "gsswrc1    %[ftmp4],   0x00(%[addr4])                          \n\t"
+        "gsswlc1    %[ftmp9],   0x03(%[addr3])                          \n\t" /* row 7 */
+        "gsswrc1    %[ftmp9],   0x00(%[addr3])                          \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
+          [pix]"+&r"(pix),                  [low32]"=&r"(low32),
+          /* alpha and beta are decremented by the template, so they must be read-write operands */
+          [alpha]"+&r"(alpha),              [beta]"+&r"(beta)
+        : [stride]"r"((mips_reg)stride),    [tc0]"r"(tc0),
+          [ff_pb_1]"f"(ff_pb_1),            [ff_pb_3]"f"(ff_pb_3),
+          [ff_pb_A1]"f"(ff_pb_A1)
+        : "memory"
+    );
+}
+
+void ff_deblock_h_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta)
+{   /* Loads 4 bytes from pix-2 on each of 8 rows, filters the middle two columns, stores all 4 back. */
+    double ftmp[11];    /* scratch bound to the "f" operands (ftmp10 is bound but unused by the template) */
+    mips_reg addr[6];   /* scratch for stride multiples and row addresses */
+    uint64_t low32;     /* integer register that stages each 32-bit load for mtc1 */
+
+    __asm__ volatile (
+        "addi       %[alpha],   %[alpha],       -0x01                   \n\t" /* thresholds are applied as alpha-1 */
+        "addi       %[beta],    %[beta],        -0x01                   \n\t" /* ... and beta-1 */
+        PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t" /* addr0 = 2*stride */
+        PTR_ADDI   "%[pix],     %[pix],         -0x02                   \n\t" /* start two bytes left of pix */
+        PTR_ADDU   "%[addr1],   %[addr0],       %[stride]               \n\t" /* addr1 = 3*stride */
+        PTR_ADDU   "%[addr2],   %[addr0],       %[addr0]                \n\t" /* addr2 = 4*stride */
+        "or         %[addr5],   $0,             %[pix]                  \n\t" /* addr5 = row 0 */
+        PTR_ADDU   "%[pix],     %[pix],         %[addr1]                \n\t" /* pix   = row 3 */
+        "ulw        %[low32],   0x00(%[addr5])                          \n\t" /* row 0; only 32 bits reach the FPR */
+        "mtc1       %[low32],   %[ftmp0]                                \n\t"
+        PTR_ADDU   "%[addr3],   %[addr5],       %[stride]               \n\t"
+        "ulw        %[low32],   0x00(%[addr3])                          \n\t" /* row 1 */
+        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        PTR_ADDU   "%[addr4],   %[addr5],       %[addr0]                \n\t"
+        "ulw        %[low32],   0x00(%[addr4])                          \n\t" /* row 2 */
+        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        "ulw        %[low32],   0x00(%[pix])                            \n\t" /* row 3 */
+        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t" /* interleave rows 0/1 */
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t" /* interleave rows 2/3 */
+        PTR_ADDU   "%[addr3],   %[pix],         %[stride]               \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "ulw        %[low32],   0x00(%[addr3])                          \n\t" /* row 4 */
+        "mtc1       %[low32],   %[ftmp4]                                \n\t"
+        PTR_ADDU   "%[addr4],   %[pix],         %[addr0]                \n\t"
+        "ulw        %[low32],   0x00(%[addr4])                          \n\t" /* row 5 */
+        "mtc1       %[low32],   %[ftmp6]                                \n\t"
+        PTR_ADDU   "%[addr3],   %[pix],         %[addr1]                \n\t"
+        "ulw        %[low32],   0x00(%[addr3])                          \n\t" /* row 6 */
+        "mtc1       %[low32],   %[ftmp5]                                \n\t"
+        PTR_ADDU   "%[addr4],   %[pix],         %[addr2]                \n\t"
+        "ulw        %[low32],   0x00(%[addr4])                          \n\t" /* row 7 */
+        "mtc1       %[low32],   %[ftmp7]                                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t" /* interleave rows 4/5 */
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t" /* interleave rows 6/7 */
+        "mov.d      %[ftmp6],   %[ftmp4]                                \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+        "punpckhwd  %[ftmp1],   %[ftmp0],       %[ftmp4]                \n\t" /* ftmp1 = column 1 (byte at pix-1), rows 0..7 */
+        "punpckhwd  %[ftmp3],   %[ftmp2],       %[ftmp6]                \n\t" /* ftmp3 = column 3 (byte at pix+1) */
+        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t" /* ftmp0 = column 0 (byte at pix-2) */
+        "punpcklwd  %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t" /* ftmp2 = column 2 (byte at pix+0) */
+        /* Build the per-row mask: 0xFF where all three differences are within alpha-1 / beta-1. */
+        "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t" /* ftmp8 = 0 (shuffle selector) */
+        "mtc1       %[alpha],   %[ftmp4]                                \n\t"
+        "mtc1       %[beta],    %[ftmp5]                                \n\t"
+        "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t" /* replicate halfword 0 */
+        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t" /* alpha-1 in every byte */
+        "packushb   %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t" /* beta-1 in every byte */
+        "psubusb    %[ftmp6],   %[ftmp2],       %[ftmp1]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp1],       %[ftmp2]                \n\t"
+        "or         %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t" /* |col1 - col2| */
+        "psubusb    %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t" /* non-zero where it exceeds alpha-1 */
+        "psubusb    %[ftmp6],   %[ftmp1],       %[ftmp0]                \n\t"
+        "psubusb    %[ftmp4],   %[ftmp0],       %[ftmp1]                \n\t"
+        "or         %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t" /* |col0 - col1| */
+        "psubusb    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t" /* non-zero where it exceeds beta-1 */
+        "or         %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
+        "psubusb    %[ftmp4],   %[ftmp3],       %[ftmp2]                \n\t"
+        "or         %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t" /* |col2 - col3| */
+        "psubusb    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t" /* non-zero where it exceeds beta-1 */
+        "or         %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "pcmpeqb    %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t" /* 0xFF where no limit was exceeded */
+        "mov.d      %[ftmp5],   %[ftmp1]                                \n\t" /* keep unfiltered column 1 */
+        "mov.d      %[ftmp6],   %[ftmp2]                                \n\t" /* keep unfiltered column 2 */
+        "xor        %[ftmp4],   %[ftmp1],       %[ftmp3]                \n\t"
+        "and        %[ftmp4],   %[ftmp4],       %[ff_pb_1]              \n\t" /* (col1 ^ col3) & ff_pb_1 */
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "psubusb    %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t" /* avg(col1, col3) minus the masked xor */
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t" /* filtered column 1 */
+        "xor        %[ftmp4],   %[ftmp2],       %[ftmp0]                \n\t"
+        "and        %[ftmp4],   %[ftmp4],       %[ff_pb_1]              \n\t" /* (col2 ^ col0) & ff_pb_1 */
+        "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "psubusb    %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t" /* avg(col2, col0) minus the masked xor */
+        "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t" /* filtered column 2 */
+        "psubb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t" /* filtered - original ... */
+        "psubb      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "and        %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t" /* ... kept only where the mask is set ... */
+        "and        %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "paddb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t" /* ... and added back to the original */
+        "paddb      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        /* Transpose back to rows and store 4 bytes per row with gsswlc1/gsswrc1 pairs. */
+        "punpckhwd  %[ftmp4],   %[ftmp0],       %[ftmp0]                \n\t" /* rows 4..7 of column 0 */
+        "punpckhwd  %[ftmp5],   %[ftmp1],       %[ftmp1]                \n\t" /* rows 4..7 of column 1 */
+        "punpckhwd  %[ftmp6],   %[ftmp2],       %[ftmp2]                \n\t" /* rows 4..7 of column 2 */
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpcklhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t" /* rows 0 and 1 */
+        "punpckhhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t" /* rows 2 and 3 */
+        "gsswlc1    %[ftmp1],   0x03(%[addr5])                          \n\t" /* row 0 */
+        "gsswrc1    %[ftmp1],   0x00(%[addr5])                          \n\t"
+        PTR_ADDU   "%[addr3],   %[addr5],       %[stride]               \n\t"
+        "punpckhwd  %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
+        "gsswlc1    %[ftmp1],   0x03(%[addr3])                          \n\t" /* row 1 */
+        PTR_ADDU   "%[addr4],   %[addr5],       %[addr0]                \n\t"
+        "gsswrc1    %[ftmp1],   0x00(%[addr3])                          \n\t"
+        "gsswlc1    %[ftmp0],   0x03(%[addr4])                          \n\t" /* row 2 */
+        "gsswrc1    %[ftmp0],   0x00(%[addr4])                          \n\t"
+        "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "punpckhwd  %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t" /* rows 4..7 of column 3 */
+        "gsswlc1    %[ftmp0],   0x03(%[pix])                            \n\t" /* row 3 */
+        "gsswrc1    %[ftmp0],   0x00(%[pix])                            \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+        PTR_ADDU   "%[addr3],   %[pix],         %[stride]               \n\t"
+        "punpcklhw  %[ftmp5],   %[ftmp4],       %[ftmp6]                \n\t" /* rows 4 and 5 */
+        "punpckhhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t" /* rows 6 and 7 */
+        "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t" /* row 4 */
+        "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        PTR_ADDU   "%[addr3],   %[pix],         %[addr0]                \n\t"
+        PTR_ADDU   "%[addr4],   %[pix],         %[addr1]                \n\t"
+        "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t" /* row 5 */
+        "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
+        "gsswlc1    %[ftmp4],   0x03(%[addr4])                          \n\t" /* row 6 */
+        PTR_ADDU   "%[addr3],   %[pix],         %[addr2]                \n\t"
+        "punpckhwd  %[ftmp9],   %[ftmp4],       %[ftmp4]                \n\t"
+        "gsswrc1    %[ftmp4],   0x00(%[addr4])                          \n\t"
+        "gsswlc1    %[ftmp9],   0x03(%[addr3])                          \n\t" /* row 7 */
+        "gsswrc1    %[ftmp9],   0x00(%[addr3])                          \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
+          [pix]"+&r"(pix),                  [low32]"=&r"(low32),
+          /* alpha and beta are decremented by the template, so they must be read-write operands */
+          [alpha]"+&r"(alpha),              [beta]"+&r"(beta)
+        : [stride]"r"((mips_reg)stride),    [ff_pb_1]"f"(ff_pb_1)
+        : "memory"
+    );
+}
+
+void ff_deblock_v_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
+        int8_t *tc0)
+{   /* Two halves at pix + 0 and pix + 8; a half is skipped when both of its tc0 bytes are negative. */
+    int i;
+    for (i = 0; i < 2; i++)
+        if ((tc0[2 * i] & tc0[2 * i + 1]) >= 0)
+            ff_deblock_v8_luma_8_mmi(pix + 8 * i, stride, alpha, beta, tc0 + 2 * i);
+}
+
+void ff_deblock_v_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta)
+{   /* Run the 8-wide intra filter at pix, then again 8 bytes further along. */
+    deblock_v8_luma_intra_8_mmi(&pix[0], stride, alpha, beta);
+    deblock_v8_luma_intra_8_mmi(&pix[8], stride, alpha, beta);
+}
+
+void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
+        int8_t *tc0)
+{
+    uint64_t stack[0xd];
+    double ftmp[9];
+    mips_reg addr[8];
+
+    __asm__ volatile (
+        PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
+        PTR_ADDI   "%[addr1],   %[pix],         -0x4                    \n\t"
+        PTR_ADDU   "%[addr2],   %[stride],      %[addr0]                \n\t"
+        "gsldlc1    %[ftmp0],   0x07(%[addr1])                          \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
+        PTR_ADDU   "%[addr3],   %[addr1],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr4],   %[addr1],       %[addr2]                \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr3])                          \n\t"
+        PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr3])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[addr5])                          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[addr5])                          \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr4])                          \n\t"
+        PTR_ADDU   "%[addr3],   %[addr4],       %[stride]               \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[addr4])                          \n\t"
+        "gsldlc1    %[ftmp4],   0x07(%[addr3])                          \n\t"
+        PTR_ADDU   "%[addr5],   %[addr4],       %[addr0]                \n\t"
+        "gsldrc1    %[ftmp4],   0x00(%[addr3])                          \n\t"
+        "gsldlc1    %[ftmp5],   0x07(%[addr5])                          \n\t"
+        PTR_ADDU   "%[addr3],   %[addr4],       %[addr2]                \n\t"
+        "gsldrc1    %[ftmp5],   0x00(%[addr5])                          \n\t"
+        "gsldlc1    %[ftmp6],   0x07(%[addr3])                          \n\t"
+        "gsldrc1    %[ftmp6],   0x00(%[addr3])                          \n\t"
+        PTR_ADDU   "%[addr6],   %[addr0],       %[addr0]                \n\t"
+        "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpckhbh  %[ftmp1],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        PTR_ADDU   "%[addr3],   %[addr4],       %[addr6]                \n\t"
+        "sdc1       %[ftmp1],   0x10(%[stack])                          \n\t"
+        "gsldlc1    %[ftmp8],   0x07(%[addr3])                          \n\t"
+        "gsldrc1    %[ftmp8],   0x00(%[addr3])                          \n\t"
+        PTR_ADDU   "%[addr7],   %[addr6],       %[addr6]                \n\t"
+        "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "punpckhhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "ldc1       %[ftmp8],   0x10(%[stack])                          \n\t"
+        "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "sdc1       %[ftmp0],   0x00(%[stack])                          \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp8]                \n\t"
+        "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "punpckhhw  %[ftmp0],   %[ftmp3],       %[ftmp5]                \n\t"
+        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
+        "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp7],       %[ftmp3]                \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
+        "punpckhwd  %[ftmp3],   %[ftmp1],       %[ftmp2]                \n\t"
+        "punpcklwd  %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
+        "sdc1       %[ftmp1],   0x10(%[stack])                          \n\t"
+        "sdc1       %[ftmp3],   0x20(%[stack])                          \n\t"
+        "sdc1       %[ftmp7],   0x30(%[stack])                          \n\t"
+        "sdc1       %[ftmp5],   0x40(%[stack])                          \n\t"
+        "sdc1       %[ftmp6],   0x50(%[stack])                          \n\t"
+        PTR_ADDU   "%[addr1],   %[addr1],       %[addr7]                \n\t"
+        PTR_ADDU   "%[addr4],   %[addr4],       %[addr7]                \n\t"
+        "gsldlc1    %[ftmp0],   0x07(%[addr1])                          \n\t"
+        PTR_ADDU   "%[addr3],   %[addr1],       %[stride]               \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr3])                          \n\t"
+        PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr3])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[addr5])                          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[addr5])                          \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr4])                          \n\t"
+        PTR_ADDU   "%[addr3],   %[addr4],       %[stride]               \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[addr4])                          \n\t"
+        "gsldlc1    %[ftmp4],   0x07(%[addr3])                          \n\t"
+        PTR_ADDU   "%[addr5],   %[addr4],       %[addr0]                \n\t"
+        "gsldrc1    %[ftmp4],   0x00(%[addr3])                          \n\t"
+        "gsldlc1    %[ftmp5],   0x07(%[addr5])                          \n\t"
+        PTR_ADDU   "%[addr3],   %[addr4],       %[addr2]                \n\t"
+        "gsldrc1    %[ftmp5],   0x00(%[addr5])                          \n\t"
+        "gsldlc1    %[ftmp6],   0x07(%[addr3])                          \n\t"
+        "gsldrc1    %[ftmp6],   0x00(%[addr3])                          \n\t"
+        "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpckhbh  %[ftmp1],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        PTR_ADDU   "%[addr3],   %[addr4],       %[addr6]                \n\t"
+        "sdc1       %[ftmp1],   0x18(%[stack])                          \n\t"
+        "gsldlc1    %[ftmp8],   0x07(%[addr3])                          \n\t"
+        "gsldrc1    %[ftmp8],   0x00(%[addr3])                          \n\t"
+        "punpckhhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "ldc1       %[ftmp8],   0x18(%[stack])                          \n\t"
+        "sdc1       %[ftmp0],   0x08(%[stack])                          \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp8]                \n\t"
+        "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "punpckhhw  %[ftmp0],   %[ftmp3],       %[ftmp5]                \n\t"
+        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp7],       %[ftmp3]                \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
+        "punpckhwd  %[ftmp3],   %[ftmp1],       %[ftmp2]                \n\t"
+        "punpcklwd  %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
+        "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "sdc1       %[ftmp1],   0x18(%[stack])                          \n\t"
+        "sdc1       %[ftmp3],   0x28(%[stack])                          \n\t"
+        "sdc1       %[ftmp7],   0x38(%[stack])                          \n\t"
+        "sdc1       %[ftmp5],   0x48(%[stack])                          \n\t"
+        "sdc1       %[ftmp6],   0x58(%[stack])                          \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
+          [addr6]"=&r"(addr[6]),            [addr7]"=&r"(addr[7])
+        : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
+          [stack]"r"(stack)
+        : "memory"
+    );
+
+    ff_deblock_v_luma_8_mmi((uint8_t *) &stack[6], 0x10, alpha, beta, tc0);
+
+    __asm__ volatile (
+        PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
+        PTR_ADDI   "%[addr1],   %[pix],          -0x02                  \n\t"
+        PTR_ADDU   "%[addr6],   %[addr0],       %[addr0]                \n\t"
+        PTR_ADDU   "%[addr2],   %[addr0],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr7],   %[addr6],       %[addr6]                \n\t"
+        PTR_ADDU   "%[addr4],   %[addr1],       %[addr2]                \n\t"
+        "ldc1       %[ftmp0],   0x10(%[stack])                          \n\t"
+        "ldc1       %[ftmp1],   0x20(%[stack])                          \n\t"
+        "ldc1       %[ftmp2],   0x30(%[stack])                          \n\t"
+        "ldc1       %[ftmp3],   0x40(%[stack])                          \n\t"
+        "punpckhwd  %[ftmp4],   %[ftmp0],       %[ftmp0]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp1],       %[ftmp1]                \n\t"
+        "punpckhwd  %[ftmp6],   %[ftmp2],       %[ftmp2]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpcklhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpckhhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "gsswlc1    %[ftmp1],   0x03(%[addr1])                          \n\t"
+        "gsswrc1    %[ftmp1],   0x00(%[addr1])                          \n\t"
+        PTR_ADDU   "%[addr3],   %[addr1],       %[stride]               \n\t"
+        "punpckhwd  %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
+        PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
+        "gsswlc1    %[ftmp1],   0x03(%[addr3])                          \n\t"
+        "gsswrc1    %[ftmp1],   0x00(%[addr3])                          \n\t"
+        "gsswlc1    %[ftmp0],   0x03(%[addr5])                          \n\t"
+        "gsswrc1    %[ftmp0],   0x00(%[addr5])                          \n\t"
+        "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "punpckhwd  %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
+        "gsswlc1    %[ftmp0],   0x03(%[addr4])                          \n\t"
+        "gsswrc1    %[ftmp0],   0x00(%[addr4])                          \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+        "punpcklhw  %[ftmp5],   %[ftmp4],       %[ftmp6]                \n\t"
+        PTR_ADDU   "%[addr3],   %[addr4],       %[stride]               \n\t"
+        "punpckhhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
+        "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
+        PTR_ADDU   "%[addr3],   %[addr4],       %[addr0]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
+        "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
+        "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
+        "gsswlc1    %[ftmp4],   0x03(%[addr5])                          \n\t"
+        "gsswrc1    %[ftmp4],   0x00(%[addr5])                          \n\t"
+        PTR_ADDU   "%[addr3],   %[addr4],       %[addr6]                \n\t"
+        "punpckhwd  %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
+        PTR_ADDU   "%[addr1],   %[addr1],       %[addr7]                \n\t"
+        "gsswlc1    %[ftmp4],   0x03(%[addr3])                          \n\t"
+        "gsswrc1    %[ftmp4],   0x00(%[addr3])                          \n\t"
+        PTR_ADDU   "%[addr4],   %[addr4],       %[addr7]                \n\t"
+        "ldc1       %[ftmp0],   0x18(%[stack])                          \n\t"
+        "ldc1       %[ftmp1],   0x28(%[stack])                          \n\t"
+        "ldc1       %[ftmp2],   0x38(%[stack])                          \n\t"
+        "ldc1       %[ftmp3],   0x48(%[stack])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
+        "punpckhwd  %[ftmp4],   %[ftmp0],       %[ftmp0]                \n\t"
+        PTR_ADDU   "%[addr6],   %[addr0],       %[addr0]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp1],       %[ftmp1]                \n\t"
+        "punpckhwd  %[ftmp6],   %[ftmp2],       %[ftmp2]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        PTR_ADDU   "%[addr3],   %[addr1],       %[stride]               \n\t"
+        "punpcklhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpckhhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "gsswlc1    %[ftmp1],   0x03(%[addr1])                          \n\t"
+        "gsswrc1    %[ftmp1],   0x00(%[addr1])                          \n\t"
+        "punpckhwd  %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
+        PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
+        "gsswlc1    %[ftmp1],   0x03(%[addr3])                          \n\t"
+        "gsswrc1    %[ftmp1],   0x00(%[addr3])                          \n\t"
+        "gsswlc1    %[ftmp0],   0x03(%[addr5])                          \n\t"
+        "gsswrc1    %[ftmp0],   0x00(%[addr5])                          \n\t"
+        "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "punpckhwd  %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
+        "gsswlc1    %[ftmp0],   0x03(%[addr4])                          \n\t"
+        "gsswrc1    %[ftmp0],   0x00(%[addr4])                          \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+        PTR_ADDU   "%[addr3],   %[addr4],       %[stride]               \n\t"
+        "punpcklhw  %[ftmp5],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpckhhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
+        "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
+        PTR_ADDU   "%[addr3],   %[addr4],       %[addr0]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
+        "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
+        "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
+        "gsswlc1    %[ftmp4],   0x03(%[addr5])                          \n\t"
+        "gsswrc1    %[ftmp4],   0x00(%[addr5])                          \n\t"
+        PTR_ADDU   "%[addr3],   %[addr4],       %[addr6]                \n\t"
+        "punpckhwd  %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
+        "gsswlc1    %[ftmp4],   0x03(%[addr3])                          \n\t"
+        "gsswrc1    %[ftmp4],   0x00(%[addr3])                          \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
+          [addr6]"=&r"(addr[6]),            [addr7]"=&r"(addr[7])
+        : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
+          [stack]"r"(stack)
+        : "memory"
+    );
+}
+
+void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+        int beta) /* Transposes the 16 rows x 8 bytes starting at pix - 4 into ptmp, calls */
+{ /* ff_deblock_v_luma_intra_8_mmi() on that buffer, then transposes it back over the same picture bytes. */
+    uint64_t ptmp[0x11]; /* transposed pixels, 0x10 bytes per row; this function's own loads/stores touch offsets 0x00-0x7f */
+    uint64_t pdat[4]; /* carries addr1, addr2, addr0, addr3 from the first asm statement to the second */
+    double ftmp[9]; /* bound to the "f" register operands ftmp0-ftmp8 */
+    mips_reg addr[7]; /* bound to the "r" register operands addr0-addr6 */
+
+    __asm__ volatile ( /* pass 1: load 16 picture rows of 8 bytes and transpose them into ptmp */
+        PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t" /* addr0 = 2 * stride */
+        PTR_ADDI   "%[addr1],   %[pix],         -0x04                   \n\t" /* addr1 = pix - 4 (row 0) */
+        PTR_ADDU   "%[addr2],   %[addr0],       %[stride]               \n\t" /* addr2 = 3 * stride */
+        PTR_ADDU   "%[addr3],   %[addr0],       %[addr0]                \n\t" /* addr3 = 4 * stride */
+        PTR_ADDU   "%[addr4],   %[addr1],       %[addr2]                \n\t" /* addr4 = row 3 */
+        PTR_ADDU   "%[addr5],   %[addr1],       %[stride]               \n\t"
+        "gsldlc1    %[ftmp0],   0x07(%[addr1])                          \n\t" /* gsldlc1/gsldrc1 pair: 8-byte load with no alignment requirement */
+        "gsldrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
+        PTR_ADDU   "%[addr6],   %[addr1],       %[addr0]                \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr5])                          \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr5])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[addr6])                          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[addr6])                          \n\t"
+        PTR_ADDU   "%[addr5],   %[addr4],       %[stride]               \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr4])                          \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[addr4])                          \n\t"
+        PTR_ADDU   "%[addr6],   %[addr4],       %[addr0]                \n\t"
+        "gsldlc1    %[ftmp4],   0x07(%[addr5])                          \n\t"
+        "gsldrc1    %[ftmp4],   0x00(%[addr5])                          \n\t"
+        PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
+        "gsldlc1    %[ftmp5],   0x07(%[addr6])                          \n\t"
+        "gsldrc1    %[ftmp5],   0x00(%[addr6])                          \n\t"
+        "gsldlc1    %[ftmp6],   0x07(%[addr5])                          \n\t"
+        "gsldrc1    %[ftmp6],   0x00(%[addr5])                          \n\t"
+        PTR_ADDU   "%[addr5],   %[addr4],       %[addr3]                \n\t" /* addr5 = row 7 */
+        "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t" /* byte, halfword, then word interleaves transpose rows 0-7 */
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpckhbh  %[ftmp1],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "gsldlc1    %[ftmp8],   0x07(%[addr5])                          \n\t"
+        "gsldrc1    %[ftmp8],   0x00(%[addr5])                          \n\t"
+        "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "sdc1       %[ftmp3],   0x00(%[ptmp])                           \n\t" /* ptmp 0x00 and 0x20 serve as spill slots until their final stores below */
+        "punpckhhw  %[ftmp3],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp1]                \n\t"
+        "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        "sdc1       %[ftmp2],   0x20(%[ptmp])                           \n\t"
+        "ldc1       %[ftmp2],   0x00(%[ptmp])                           \n\t"
+        "punpckhhw  %[ftmp1],   %[ftmp2],       %[ftmp5]                \n\t"
+        "punpcklhw  %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpckhwd  %[ftmp4],   %[ftmp7],       %[ftmp2]                \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        "sdc1       %[ftmp0],   0x00(%[ptmp])                           \n\t" /* rows 0-7 end up in the first 8 bytes of each 0x10-byte ptmp row (0x00, 0x10, ..., 0x70) */
+        "sdc1       %[ftmp5],   0x10(%[ptmp])                           \n\t"
+        "sdc1       %[ftmp7],   0x40(%[ptmp])                           \n\t"
+        "sdc1       %[ftmp4],   0x50(%[ptmp])                           \n\t"
+        "ldc1       %[ftmp8],   0x20(%[ptmp])                           \n\t"
+        "punpckhwd  %[ftmp0],   %[ftmp3],       %[ftmp8]                \n\t"
+        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp6],       %[ftmp1]                \n\t"
+        "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        PTR_ADDU   "%[addr5],   %[addr3],       %[addr3]                \n\t" /* addr5 = 8 * stride */
+        "sdc1       %[ftmp3],   0x20(%[ptmp])                           \n\t"
+        "sdc1       %[ftmp0],   0x30(%[ptmp])                           \n\t"
+        "sdc1       %[ftmp6],   0x60(%[ptmp])                           \n\t"
+        "sdc1       %[ftmp5],   0x70(%[ptmp])                           \n\t"
+        PTR_ADDU   "%[addr1],   %[addr1],       %[addr5]                \n\t" /* addr1 = row 8 */
+        PTR_ADDU   "%[addr4],   %[addr4],       %[addr5]                \n\t" /* addr4 = row 11 */
+        PTR_ADDU   "%[addr5],   %[addr1],       %[stride]               \n\t"
+        "gsldlc1    %[ftmp0],   0x07(%[addr1])                          \n\t" /* same load + transpose sequence, now for rows 8-15 */
+        "gsldrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
+        PTR_ADDU   "%[addr6],   %[addr1],       %[addr0]                \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr5])                          \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr5])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[addr6])                          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[addr6])                          \n\t"
+        PTR_ADDU   "%[addr5],   %[addr4],       %[stride]               \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr4])                          \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[addr4])                          \n\t"
+        PTR_ADDU   "%[addr6],   %[addr4],       %[addr0]                \n\t"
+        "gsldlc1    %[ftmp4],   0x07(%[addr5])                          \n\t"
+        "gsldrc1    %[ftmp4],   0x00(%[addr5])                          \n\t"
+        PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
+        "gsldlc1    %[ftmp5],   0x07(%[addr6])                          \n\t"
+        "gsldrc1    %[ftmp5],   0x00(%[addr6])                          \n\t"
+        "gsldlc1    %[ftmp6],   0x07(%[addr5])                          \n\t"
+        "gsldrc1    %[ftmp6],   0x00(%[addr5])                          \n\t"
+        PTR_ADDU   "%[addr5],   %[addr4],       %[addr3]                \n\t"
+        "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpckhbh  %[ftmp1],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "gsldlc1    %[ftmp8],   0x07(%[addr5])                          \n\t"
+        "gsldrc1    %[ftmp8],   0x00(%[addr5])                          \n\t"
+        "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "sdc1       %[ftmp3],   0x08(%[ptmp])                           \n\t" /* spill slots for this half are ptmp 0x08 and 0x28 */
+        "punpckhhw  %[ftmp3],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp1]                \n\t"
+        "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        "sdc1       %[ftmp2],   0x28(%[ptmp])                           \n\t"
+        "ldc1       %[ftmp2],   0x08(%[ptmp])                           \n\t"
+        "punpckhhw  %[ftmp1],   %[ftmp2],       %[ftmp5]                \n\t"
+        "punpcklhw  %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpckhwd  %[ftmp4],   %[ftmp7],       %[ftmp2]                \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        "sdc1       %[ftmp0],   0x08(%[ptmp])                           \n\t" /* rows 8-15 fill the second 8 bytes of each ptmp row (0x08, 0x18, ..., 0x78) */
+        "sdc1       %[ftmp5],   0x18(%[ptmp])                           \n\t"
+        "sdc1       %[ftmp7],   0x48(%[ptmp])                           \n\t"
+        "sdc1       %[ftmp4],   0x58(%[ptmp])                           \n\t"
+        "ldc1       %[ftmp8],   0x28(%[ptmp])                           \n\t"
+        "punpckhwd  %[ftmp0],   %[ftmp3],       %[ftmp8]                \n\t"
+        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp6],       %[ftmp1]                \n\t"
+        "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        "sdc1       %[ftmp3],   0x28(%[ptmp])                           \n\t"
+        "sdc1       %[ftmp0],   0x38(%[ptmp])                           \n\t"
+        "sdc1       %[ftmp6],   0x68(%[ptmp])                           \n\t"
+        "sdc1       %[ftmp5],   0x78(%[ptmp])                           \n\t"
+        PTR_S      "%[addr1],   0x00(%[pdat])                           \n\t" /* save row-8 pointer and the 3x/2x/4x stride values for pass 2 */
+        PTR_S      "%[addr2],   0x08(%[pdat])                           \n\t"
+        PTR_S      "%[addr0],   0x10(%[pdat])                           \n\t"
+        PTR_S      "%[addr3],   0x18(%[pdat])                           \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
+          [addr6]"=&r"(addr[6])
+        : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
+          [ptmp]"r"(ptmp),                  [pdat]"r"(pdat)
+        : "memory" /* ptmp and pdat are written through plain register operands */
+    );
+
+    ff_deblock_v_luma_intra_8_mmi((uint8_t *) &ptmp[8], 0x10, alpha, beta); /* &ptmp[8] is byte offset 0x40, the transposed row holding column pix[0]; 0x10 matches the ptmp row pitch */
+
+    __asm__ volatile ( /* pass 2: transpose ptmp back and store it over the 16 picture rows */
+        PTR_L      "%[addr1],   0x00(%[pdat])                           \n\t" /* restore the values saved at the end of pass 1 */
+        PTR_L      "%[addr2],   0x08(%[pdat])                           \n\t"
+        PTR_L      "%[addr0],   0x10(%[pdat])                           \n\t"
+        PTR_L      "%[addr3],   0x18(%[pdat])                           \n\t"
+        PTR_ADDU   "%[addr4],   %[addr1],       %[addr2]                \n\t" /* addr4 = row 11 */
+        "ldc1       %[ftmp0],   0x08(%[ptmp])                           \n\t" /* second halves of the ptmp rows first: they become picture rows 8-15 */
+        "ldc1       %[ftmp1],   0x18(%[ptmp])                           \n\t"
+        "ldc1       %[ftmp2],   0x28(%[ptmp])                           \n\t"
+        "ldc1       %[ftmp3],   0x38(%[ptmp])                           \n\t"
+        "ldc1       %[ftmp4],   0x48(%[ptmp])                           \n\t"
+        "ldc1       %[ftmp5],   0x58(%[ptmp])                           \n\t"
+        "ldc1       %[ftmp6],   0x68(%[ptmp])                           \n\t"
+        "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpckhbh  %[ftmp1],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "ldc1       %[ftmp8],   0x78(%[ptmp])                           \n\t"
+        "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "gssdlc1    %[ftmp3],   0x07(%[addr1])                          \n\t" /* spill: the picture row at addr1 is scratch here, reloaded below, then overwritten with final data */
+        "gssdrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
+        "punpckhhw  %[ftmp3],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp1]                \n\t"
+        "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        "gssdlc1    %[ftmp2],   0x07(%[addr5])                          \n\t" /* second spill slot: the row at addr1 + 2 * stride */
+        "gssdrc1    %[ftmp2],   0x00(%[addr5])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[addr1])                          \n\t" /* reload the first spill */
+        "gsldrc1    %[ftmp2],   0x00(%[addr1])                          \n\t"
+        "punpckhhw  %[ftmp1],   %[ftmp2],       %[ftmp5]                \n\t"
+        "punpcklhw  %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpckhwd  %[ftmp4],   %[ftmp7],       %[ftmp2]                \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        PTR_ADDU   "%[addr5],   %[addr1],       %[stride]               \n\t"
+        "gssdlc1    %[ftmp0],   0x07(%[addr1])                          \n\t" /* final data for the row at addr1 replaces the first spill */
+        "gssdrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
+        PTR_ADDU   "%[addr6],   %[addr4],       %[stride]               \n\t"
+        "gssdlc1    %[ftmp5],   0x07(%[addr5])                          \n\t"
+        "gssdrc1    %[ftmp5],   0x00(%[addr5])                          \n\t"
+        PTR_ADDU   "%[addr5],   %[addr4],       %[addr0]                \n\t"
+        "gssdlc1    %[ftmp7],   0x07(%[addr6])                          \n\t"
+        "gssdrc1    %[ftmp7],   0x00(%[addr6])                          \n\t"
+        PTR_ADDU   "%[addr6],   %[addr1],       %[addr0]                \n\t"
+        "gssdlc1    %[ftmp4],   0x07(%[addr5])                          \n\t"
+        "gssdrc1    %[ftmp4],   0x00(%[addr5])                          \n\t"
+        "gsldlc1    %[ftmp8],   0x07(%[addr6])                          \n\t" /* reload the second spill */
+        "gsldrc1    %[ftmp8],   0x00(%[addr6])                          \n\t"
+        PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
+        "punpckhwd  %[ftmp0],   %[ftmp3],       %[ftmp8]                \n\t"
+        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp6],       %[ftmp1]                \n\t"
+        "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        "gssdlc1    %[ftmp3],   0x07(%[addr5])                          \n\t" /* final data replaces the second spill */
+        "gssdrc1    %[ftmp3],   0x00(%[addr5])                          \n\t"
+        PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
+        "gssdlc1    %[ftmp0],   0x07(%[addr4])                          \n\t"
+        "gssdrc1    %[ftmp0],   0x00(%[addr4])                          \n\t"
+        PTR_ADDU   "%[addr6],   %[addr4],       %[addr3]                \n\t"
+        "gssdlc1    %[ftmp6],   0x07(%[addr5])                          \n\t"
+        "gssdrc1    %[ftmp6],   0x00(%[addr5])                          \n\t"
+        PTR_ADDU   "%[addr5],   %[addr3],       %[addr3]                \n\t" /* addr5 = 8 * stride */
+        "gssdlc1    %[ftmp5],   0x07(%[addr6])                          \n\t"
+        "gssdrc1    %[ftmp5],   0x00(%[addr6])                          \n\t"
+        PTR_SUBU   "%[addr1],   %[addr1],       %[addr5]                \n\t" /* step back 8 rows: addr1 = row 0 */
+        PTR_SUBU   "%[addr4],   %[addr4],       %[addr5]                \n\t" /* addr4 = row 3 */
+        "ldc1       %[ftmp0],   0x00(%[ptmp])                           \n\t" /* first halves of the ptmp rows: they become picture rows 0-7 */
+        "ldc1       %[ftmp1],   0x10(%[ptmp])                           \n\t"
+        "ldc1       %[ftmp2],   0x20(%[ptmp])                           \n\t"
+        "ldc1       %[ftmp3],   0x30(%[ptmp])                           \n\t"
+        "ldc1       %[ftmp4],   0x40(%[ptmp])                           \n\t"
+        "ldc1       %[ftmp5],   0x50(%[ptmp])                           \n\t"
+        "ldc1       %[ftmp6],   0x60(%[ptmp])                           \n\t"
+        "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpckhbh  %[ftmp1],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "ldc1       %[ftmp8],   0x70(%[ptmp])                           \n\t"
+        "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "gssdlc1    %[ftmp3],   0x07(%[addr1])                          \n\t" /* same spill-into-picture scheme as for rows 8-15 */
+        "gssdrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
+        "punpckhhw  %[ftmp3],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp1]                \n\t"
+        "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        "gssdlc1    %[ftmp2],   0x07(%[addr5])                          \n\t"
+        "gssdrc1    %[ftmp2],   0x00(%[addr5])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[addr1])                          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[addr1])                          \n\t"
+        "punpckhhw  %[ftmp1],   %[ftmp2],       %[ftmp5]                \n\t"
+        "punpcklhw  %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpckhwd  %[ftmp4],   %[ftmp7],       %[ftmp2]                \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        PTR_ADDU   "%[addr5],   %[addr1],       %[stride]               \n\t"
+        "gssdlc1    %[ftmp0],   0x07(%[addr1])                          \n\t"
+        "gssdrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
+        PTR_ADDU   "%[addr6],   %[addr4],       %[stride]               \n\t"
+        "gssdlc1    %[ftmp5],   0x07(%[addr5])                          \n\t"
+        "gssdrc1    %[ftmp5],   0x00(%[addr5])                          \n\t"
+        PTR_ADDU   "%[addr5],   %[addr4],       %[addr0]                \n\t"
+        "gssdlc1    %[ftmp7],   0x07(%[addr6])                          \n\t"
+        "gssdrc1    %[ftmp7],   0x00(%[addr6])                          \n\t"
+        PTR_ADDU   "%[addr6],   %[addr1],       %[addr0]                \n\t"
+        "gssdlc1    %[ftmp4],   0x07(%[addr5])                          \n\t"
+        "gssdrc1    %[ftmp4],   0x00(%[addr5])                          \n\t"
+        "gsldlc1    %[ftmp8],   0x07(%[addr6])                          \n\t"
+        "gsldrc1    %[ftmp8],   0x00(%[addr6])                          \n\t"
+        PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
+        "punpckhwd  %[ftmp0],   %[ftmp3],       %[ftmp8]                \n\t"
+        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp6],       %[ftmp1]                \n\t"
+        "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        "gssdlc1    %[ftmp3],   0x07(%[addr5])                          \n\t"
+        "gssdrc1    %[ftmp3],   0x00(%[addr5])                          \n\t"
+        PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
+        "gssdlc1    %[ftmp0],   0x07(%[addr4])                          \n\t"
+        "gssdrc1    %[ftmp0],   0x00(%[addr4])                          \n\t"
+        PTR_ADDU   "%[addr6],   %[addr4],       %[addr3]                \n\t"
+        "gssdlc1    %[ftmp6],   0x07(%[addr5])                          \n\t"
+        "gssdrc1    %[ftmp6],   0x00(%[addr5])                          \n\t"
+        "gssdlc1    %[ftmp5],   0x07(%[addr6])                          \n\t"
+        "gssdrc1    %[ftmp5],   0x00(%[addr6])                          \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
+          [addr6]"=&r"(addr[6])
+        : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride), /* %[pix] is not referenced in this statement; the row pointer comes from pdat */
+          [ptmp]"r"(ptmp),                  [pdat]"r"(pdat)
+        : "memory" /* the picture rows are written through computed addresses */
+    );
+}
diff --git a/libavcodec/mips/h264dsp_msa.c b/libavcodec/mips/h264dsp_msa.c
new file mode 100644
index 0000000..fce01ac
--- /dev/null
+++ b/libavcodec/mips/h264dsp_msa.c
@@ -0,0 +1,2544 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "h264dsp_mips.h"
+/* Weight a 4x2 block in place: sat(max(pix * wgt + off, 0) >> log2_denom). */
+static void avc_wgt_4x2_msa(uint8_t *data, int32_t stride,
+                            int32_t log2_denom, int32_t src_weight,
+                            int32_t offset_in)
+{
+    uint32_t data0, data1;
+    v16u8 zero = { 0 };
+    v16u8 src0, src1;
+    v4i32 res0, res1;
+    v8i16 temp0, temp1, vec0, vec1, wgt, denom, offset;
+    v8u16 out0, out1;
+    /* Shift as unsigned: left-shifting a negative offset is undefined in C. */
+    offset_in = (int32_t) ((uint32_t) offset_in << log2_denom);
+
+    if (log2_denom) {
+        offset_in += (1 << (log2_denom - 1)); /* round to nearest */
+    }
+    /* Broadcast the scalars to every 16-bit lane. */
+    wgt = __msa_fill_h(src_weight);
+    offset = __msa_fill_h(offset_in);
+    denom = __msa_fill_h(log2_denom);
+
+    data0 = LW(data);
+    data1 = LW(data + stride);
+
+    src0 = (v16u8) __msa_fill_w(data0);
+    src1 = (v16u8) __msa_fill_w(data1);
+    /* Widen to 16 bit, multiply, add the offset, clamp at zero, then shift. */
+    ILVR_B2_SH(zero, src0, zero, src1, vec0, vec1);
+    MUL2(wgt, vec0, wgt, vec1, temp0, temp1);
+    ADDS_SH2_SH(temp0, offset, temp1, offset, temp0, temp1);
+    MAXI_SH2_SH(temp0, temp1, 0);
+
+    out0 = (v8u16) __msa_srl_h(temp0, denom);
+    out1 = (v8u16) __msa_srl_h(temp1, denom);
+
+    SAT_UH2_UH(out0, out1, 7);
+    PCKEV_B2_SW(out0, out0, out1, out1, res0, res1);
+
+    data0 = __msa_copy_u_w(res0, 0);
+    data1 = __msa_copy_u_w(res1, 0);
+    SW(data0, data);
+    data += stride;
+    SW(data1, data);
+}
+/* Weight a 4-wide block in place, four rows per loop iteration. */
+static void avc_wgt_4x4multiple_msa(uint8_t *data, int32_t stride,
+                                    int32_t height, int32_t log2_denom,
+                                    int32_t src_weight, int32_t offset_in)
+{
+    uint8_t cnt;
+    uint32_t data0, data1, data2, data3;
+    v16u8 zero = { 0 };
+    v16u8 src0, src1, src2, src3;
+    v8u16 temp0, temp1, temp2, temp3, wgt;
+    v8i16 denom, offset;
+    /* Shift as unsigned: left-shifting a negative offset is undefined in C. */
+    offset_in = (int32_t) ((uint32_t) offset_in << log2_denom);
+
+    if (log2_denom) {
+        offset_in += (1 << (log2_denom - 1)); /* round to nearest */
+    }
+
+    wgt = (v8u16) __msa_fill_h(src_weight);
+    offset = __msa_fill_h(offset_in);
+    denom = __msa_fill_h(log2_denom);
+    /* height / 4 iterations; rows past a multiple of 4 are left untouched. */
+    for (cnt = height / 4; cnt--;) {
+        LW4(data, stride, data0, data1, data2, data3);
+
+        src0 = (v16u8) __msa_fill_w(data0);
+        src1 = (v16u8) __msa_fill_w(data1);
+        src2 = (v16u8) __msa_fill_w(data2);
+        src3 = (v16u8) __msa_fill_w(data3);
+        /* Widen, weight, add offset, clamp at 0, shift, saturate, store. */
+        ILVR_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   temp0, temp1, temp2, temp3);
+        MUL4(wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3,
+             temp0, temp1, temp2, temp3);
+        ADDS_SH4_UH(temp0, offset, temp1, offset, temp2, offset, temp3, offset,
+                    temp0, temp1, temp2, temp3);
+        MAXI_SH4_UH(temp0, temp1, temp2, temp3, 0);
+        SRL_H4_UH(temp0, temp1, temp2, temp3, denom);
+        SAT_UH4_UH(temp0, temp1, temp2, temp3, 7);
+        PCKEV_ST4x4_UB(temp0, temp1, temp2, temp3, data, stride);
+        data += (4 * stride);
+    }
+}
+/* 4-wide weighting: height 2 has its own kernel, others go four rows a time. */
+static void avc_wgt_4width_msa(uint8_t *data, int32_t stride,
+                               int32_t height, int32_t log2_denom,
+                               int32_t src_weight, int32_t offset_in)
+{
+    if (height != 2) {
+        avc_wgt_4x4multiple_msa(data, stride, height, log2_denom, src_weight,
+                                offset_in);
+        return;
+    }
+    avc_wgt_4x2_msa(data, stride, log2_denom, src_weight, offset_in);
+}
+/* Weight an 8-wide block in place, four rows per loop iteration. */
+static void avc_wgt_8width_msa(uint8_t *data, int32_t stride,
+                               int32_t height, int32_t log2_denom,
+                               int32_t src_weight, int32_t offset_in)
+{
+    uint8_t cnt;
+    v16u8 zero = { 0 };
+    v16u8 src0, src1, src2, src3;
+    v8u16 src0_r, src1_r, src2_r, src3_r;
+    v8u16 temp0, temp1, temp2, temp3;
+    v8u16 wgt, denom, offset;
+    v16i8 out0, out1;
+    /* Shift as unsigned: left-shifting a negative offset is undefined in C. */
+    offset_in = (int32_t) ((uint32_t) offset_in << log2_denom);
+
+    if (log2_denom) {
+        offset_in += (1 << (log2_denom - 1)); /* round to nearest */
+    }
+
+    wgt = (v8u16) __msa_fill_h(src_weight);
+    offset = (v8u16) __msa_fill_h(offset_in);
+    denom = (v8u16) __msa_fill_h(log2_denom);
+    /* height / 4 iterations; rows past a multiple of 4 are left untouched. */
+    for (cnt = height / 4; cnt--;) {
+        LD_UB4(data, stride, src0, src1, src2, src3);
+        ILVR_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   src0_r, src1_r, src2_r, src3_r);
+        MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r,
+             temp0, temp1, temp2, temp3);
+        ADDS_SH4_UH(temp0, offset, temp1, offset, temp2, offset, temp3, offset,
+                    temp0, temp1, temp2, temp3);
+        MAXI_SH4_UH(temp0, temp1, temp2, temp3, 0);
+        SRL_H4_UH(temp0, temp1, temp2, temp3, denom);
+        SAT_UH4_UH(temp0, temp1, temp2, temp3, 7);
+        PCKEV_B2_SB(temp1, temp0, temp3, temp2, out0, out1);
+        ST8x4_UB(out0, out1, data, stride);
+        data += (4 * stride);
+    }
+}
+/* Weight a 16-wide block in place, four rows per loop iteration. */
+static void avc_wgt_16width_msa(uint8_t *data, int32_t stride,
+                                int32_t height, int32_t log2_denom,
+                                int32_t src_weight, int32_t offset_in)
+{
+    uint8_t cnt;
+    v16i8 zero = { 0 };
+    v16u8 src0, src1, src2, src3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v8u16 src0_l, src1_l, src2_l, src3_l, src0_r, src1_r, src2_r, src3_r;
+    v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    v8u16 wgt, denom, offset;
+    /* Shift as unsigned: left-shifting a negative offset is undefined in C. */
+    offset_in = (int32_t) ((uint32_t) offset_in << log2_denom);
+
+    if (log2_denom) {
+        offset_in += (1 << (log2_denom - 1)); /* round to nearest */
+    }
+
+    wgt = (v8u16) __msa_fill_h(src_weight);
+    offset = (v8u16) __msa_fill_h(offset_in);
+    denom = (v8u16) __msa_fill_h(log2_denom);
+    /* height / 4 iterations; rows past a multiple of 4 are left untouched. */
+    for (cnt = height / 4; cnt--;) {
+        LD_UB4(data, stride, src0, src1, src2, src3);
+        ILVR_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   src0_r, src1_r, src2_r, src3_r);
+        ILVL_B4_UH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   src0_l, src1_l, src2_l, src3_l);
+        MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l,
+             temp0, temp1, temp2, temp3);
+        MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l,
+             temp4, temp5, temp6, temp7);
+        ADDS_SH4_UH(temp0, offset, temp1, offset, temp2, offset, temp3, offset,
+                    temp0, temp1, temp2, temp3);
+        ADDS_SH4_UH(temp4, offset, temp5, offset, temp6, offset, temp7, offset,
+                    temp4, temp5, temp6, temp7);
+        MAXI_SH4_UH(temp0, temp1, temp2, temp3, 0);
+        MAXI_SH4_UH(temp4, temp5, temp6, temp7, 0);
+        SRL_H4_UH(temp0, temp1, temp2, temp3, denom);
+        SRL_H4_UH(temp4, temp5, temp6, temp7, denom);
+        SAT_UH4_UH(temp0, temp1, temp2, temp3, 7);
+        SAT_UH4_UH(temp4, temp5, temp6, temp7, 7);
+        PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
+                    dst0, dst1, dst2, dst3);
+        ST_UB4(dst0, dst1, dst2, dst3, data, stride);
+        data += 4 * stride;
+    }
+}
+/* 4x2 bi-weighted prediction: blend src into dst using two weights. */
+static void avc_biwgt_4x2_msa(uint8_t *src, int32_t src_stride,
+                              uint8_t *dst, int32_t dst_stride,
+                              int32_t log2_denom, int32_t src_weight,
+                              int32_t dst_weight, int32_t offset_in)
+{
+    uint32_t load0, load1, out0, out1;
+    v16i8 src_wgt, dst_wgt, wgt;
+    v16i8 src0, src1, dst0, dst1;
+    v8i16 temp0, temp1, denom, offset, add_val;
+    int32_t val = 128 * (src_weight + dst_weight);
+    /* Shift as unsigned: left-shifting a negative offset is undefined in C. */
+    offset_in = (int32_t) ((uint32_t) ((offset_in + 1) | 1) << log2_denom);
+
+    src_wgt = __msa_fill_b(src_weight);
+    dst_wgt = __msa_fill_b(dst_weight);
+    offset = __msa_fill_h(offset_in);
+    denom = __msa_fill_h(log2_denom + 1);
+    add_val = __msa_fill_h(val);
+    offset += add_val;
+    /* Pair the weights bytewise to match the interleaved src/dst pixels. */
+    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
+
+    load0 = LW(src);
+    src += src_stride;
+    load1 = LW(src);
+
+    src0 = (v16i8) __msa_fill_w(load0);
+    src1 = (v16i8) __msa_fill_w(load1);
+
+    load0 = LW(dst);
+    load1 = LW(dst + dst_stride);
+
+    dst0 = (v16i8) __msa_fill_w(load0);
+    dst1 = (v16i8) __msa_fill_w(load1);
+    /* Pixels are biased by 128 (XOR); val re-adds 128 * (ws + wd). */
+    XORI_B4_128_SB(src0, src1, dst0, dst1);
+    ILVR_B2_SH(dst0, src0, dst1, src1, temp0, temp1);
+
+    temp0 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp0);
+    temp1 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp1);
+
+    temp0 >>= denom;
+    temp1 >>= denom;
+
+    CLIP_SH2_0_255(temp0, temp1);
+    PCKEV_B2_SB(temp0, temp0, temp1, temp1, dst0, dst1);
+
+    out0 = __msa_copy_u_w((v4i32) dst0, 0);
+    out1 = __msa_copy_u_w((v4i32) dst1, 0);
+    SW(out0, dst);
+    dst += dst_stride;
+    SW(out1, dst);
+}
+/* 4-wide bi-weighted prediction into dst, four rows per loop iteration. */
+static void avc_biwgt_4x4multiple_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int32_t height, int32_t log2_denom,
+                                      int32_t src_weight, int32_t dst_weight,
+                                      int32_t offset_in)
+{
+    uint8_t cnt;
+    uint32_t load0, load1, load2, load3;
+    v16i8 src_wgt, dst_wgt, wgt;
+    v16i8 src0, src1, src2, src3;
+    v16i8 dst0, dst1, dst2, dst3;
+    v8i16 temp0, temp1, temp2, temp3;
+    v8i16 denom, offset, add_val;
+    int32_t val = 128 * (src_weight + dst_weight);
+    /* Shift as unsigned: left-shifting a negative offset is undefined in C. */
+    offset_in = (int32_t) ((uint32_t) ((offset_in + 1) | 1) << log2_denom);
+
+    src_wgt = __msa_fill_b(src_weight);
+    dst_wgt = __msa_fill_b(dst_weight);
+    offset = __msa_fill_h(offset_in);
+    denom = __msa_fill_h(log2_denom + 1);
+    add_val = __msa_fill_h(val);
+    offset += add_val;
+    /* Pair the weights bytewise to match the interleaved src/dst pixels. */
+    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
+    /* height / 4 iterations; rows past a multiple of 4 are left untouched. */
+    for (cnt = height / 4; cnt--;) {
+        LW4(src, src_stride, load0, load1, load2, load3);
+        src += (4 * src_stride);
+
+        src0 = (v16i8) __msa_fill_w(load0);
+        src1 = (v16i8) __msa_fill_w(load1);
+        src2 = (v16i8) __msa_fill_w(load2);
+        src3 = (v16i8) __msa_fill_w(load3);
+
+        LW4(dst, dst_stride, load0, load1, load2, load3);
+
+        dst0 = (v16i8) __msa_fill_w(load0);
+        dst1 = (v16i8) __msa_fill_w(load1);
+        dst2 = (v16i8) __msa_fill_w(load2);
+        dst3 = (v16i8) __msa_fill_w(load3);
+        /* Pixels are biased by 128 (XOR); val re-adds 128 * (ws + wd). */
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        XORI_B4_128_SB(dst0, dst1, dst2, dst3);
+        ILVR_B4_SH(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
+                   temp0, temp1, temp2, temp3);
+
+        temp0 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp0);
+        temp1 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp1);
+        temp2 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp2);
+        temp3 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp3);
+
+        SRA_4V(temp0, temp1, temp2, temp3, denom);
+        CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
+        PCKEV_ST4x4_UB(temp0, temp1, temp2, temp3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+/* 4-wide bi-weighting: height 2 has its own kernel, others go four rows a time. */
+static void avc_biwgt_4width_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int32_t height, int32_t log2_denom,
+                                 int32_t src_weight, int32_t dst_weight,
+                                 int32_t offset_in)
+{
+    if (height != 2) {
+        avc_biwgt_4x4multiple_msa(src, src_stride, dst, dst_stride, height,
+                                  log2_denom, src_weight, dst_weight,
+                                  offset_in);
+        return;
+    }
+    avc_biwgt_4x2_msa(src, src_stride, dst, dst_stride, log2_denom,
+                      src_weight, dst_weight, offset_in);
+}
+/* 8-wide bi-weighted prediction into dst, four rows per loop iteration. */
+static void avc_biwgt_8width_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int32_t height, int32_t log2_denom,
+                                 int32_t src_weight, int32_t dst_weight,
+                                 int32_t offset_in)
+{
+    uint8_t cnt;
+    v16i8 src_wgt, dst_wgt, wgt;
+    v16i8 src0, src1, src2, src3;
+    v16i8 dst0, dst1, dst2, dst3;
+    v16i8 out0, out1;
+    v8i16 temp0, temp1, temp2, temp3;
+    v8i16 denom, offset, add_val;
+    int32_t val = 128 * (src_weight + dst_weight);
+    /* Shift as unsigned: left-shifting a negative offset is undefined in C. */
+    offset_in = (int32_t) ((uint32_t) ((offset_in + 1) | 1) << log2_denom);
+
+    src_wgt = __msa_fill_b(src_weight);
+    dst_wgt = __msa_fill_b(dst_weight);
+    offset = __msa_fill_h(offset_in);
+    denom = __msa_fill_h(log2_denom + 1);
+    add_val = __msa_fill_h(val);
+    offset += add_val;
+    /* Pair the weights bytewise to match the interleaved src/dst pixels. */
+    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
+    /* height / 4 iterations; rows past a multiple of 4 are left untouched. */
+    for (cnt = height / 4; cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        /* Pixels are biased by 128 (XOR); val re-adds 128 * (ws + wd). */
+        LD_SB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        XORI_B4_128_SB(dst0, dst1, dst2, dst3);
+        ILVR_B4_SH(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
+                   temp0, temp1, temp2, temp3);
+
+        temp0 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp0);
+        temp1 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp1);
+        temp2 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp2);
+        temp3 = __msa_dpadd_s_h(offset, wgt, (v16i8) temp3);
+
+        SRA_4V(temp0, temp1, temp2, temp3, denom);
+        CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
+        PCKEV_B2_SB(temp1, temp0, temp3, temp2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += 4 * dst_stride;
+    }
+}
+/* 16-wide bi-weighted prediction into dst, four rows per loop iteration. */
+static void avc_biwgt_16width_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  int32_t height, int32_t log2_denom,
+                                  int32_t src_weight, int32_t dst_weight,
+                                  int32_t offset_in)
+{
+    uint8_t cnt;
+    v16i8 src_wgt, dst_wgt, wgt;
+    v16i8 src0, src1, src2, src3;
+    v16i8 dst0, dst1, dst2, dst3;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    v8i16 denom, offset, add_val;
+    int32_t val = 128 * (src_weight + dst_weight);
+    /* Shift as unsigned: left-shifting a negative offset is undefined in C. */
+    offset_in = (int32_t) ((uint32_t) ((offset_in + 1) | 1) << log2_denom);
+
+    src_wgt = __msa_fill_b(src_weight);
+    dst_wgt = __msa_fill_b(dst_weight);
+    offset = __msa_fill_h(offset_in);
+    denom = __msa_fill_h(log2_denom + 1);
+    add_val = __msa_fill_h(val);
+    offset += add_val;
+    /* Pair the weights bytewise to match the interleaved src/dst pixels. */
+    wgt = __msa_ilvev_b(dst_wgt, src_wgt);
+    /* height / 4 iterations; rows past a multiple of 4 are left untouched. */
+    for (cnt = height / 4; cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        /* Pixels are biased by 128 (XOR); val re-adds 128 * (ws + wd). */
+        LD_SB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        XORI_B4_128_SB(dst0, dst1, dst2, dst3);
+        ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
+                   vec0, vec2, vec4, vec6);
+        ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
+                   vec1, vec3, vec5, vec7);
+
+        temp0 = __msa_dpadd_s_h(offset, wgt, vec0);
+        temp1 = __msa_dpadd_s_h(offset, wgt, vec1);
+        temp2 = __msa_dpadd_s_h(offset, wgt, vec2);
+        temp3 = __msa_dpadd_s_h(offset, wgt, vec3);
+        temp4 = __msa_dpadd_s_h(offset, wgt, vec4);
+        temp5 = __msa_dpadd_s_h(offset, wgt, vec5);
+        temp6 = __msa_dpadd_s_h(offset, wgt, vec6);
+        temp7 = __msa_dpadd_s_h(offset, wgt, vec7);
+
+        SRA_4V(temp0, temp1, temp2, temp3, denom);
+        SRA_4V(temp4, temp5, temp6, temp7, denom);
+        CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
+        CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
+        PCKEV_B4_SB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
+                    dst0, dst1, dst2, dst3);
+        ST_SB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+        dst += 4 * dst_stride;
+    }
+}
+/* Filtered p0, p1, p2 (or q0, q1, q2) for the luma intra edge filter. */
+#define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in,          \
+                                 q3_or_p3_org_in, p1_or_q1_org_in,          \
+                                 p2_or_q2_org_in, q1_or_p1_org_in,          \
+                                 p0_or_q0_out, p1_or_q1_out, p2_or_q2_out)  \
+{                                                                           \
+    v8i16 threshold;                                                        \
+    v8i16 const3 = __msa_ldi_h(3);                                          \
+    /* NOTE: q3_or_p3_org_in receives q0 in the visible p-side call. */     \
+    threshold = (p0_or_q0_org_in) + (q3_or_p3_org_in);                      \
+    threshold += (p1_or_q1_org_in);                                         \
+    /* p0/q0 out = 2 * threshold + p2/q2 + q1/p1, srari by 3 */             \
+    (p0_or_q0_out) = threshold << 1;                                        \
+    (p0_or_q0_out) += (p2_or_q2_org_in);                                    \
+    (p0_or_q0_out) += (q1_or_p1_org_in);                                    \
+    (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 3);                      \
+    /* p1/q1 out = p2/q2 + threshold, srari by 2 */                         \
+    (p1_or_q1_out) = (p2_or_q2_org_in) + threshold;                         \
+    (p1_or_q1_out) = __msa_srari_h((p1_or_q1_out), 2);                      \
+    /* p2/q2 out = 3 * p2/q2 + 2 * p3/q3 + threshold, srari by 3 */         \
+    (p2_or_q2_out) = (p2_or_q2_org_in) * const3;                            \
+    (p2_or_q2_out) += (p3_or_q3_org_in);                                    \
+    (p2_or_q2_out) += (p3_or_q3_org_in);                                    \
+    (p2_or_q2_out) += threshold;                                            \
+    (p2_or_q2_out) = __msa_srari_h((p2_or_q2_out), 3);                      \
+}
+/* out = p0 + q1 + 2 * p1 (or the q-side mirror), then __msa_srari_h by 2. */
+/* data[-u32_img_width] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2); */
+#define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in,   \
+                         p1_or_q1_org_in, p0_or_q0_out)      \
+{                                                            \
+    (p0_or_q0_out) = (p0_or_q0_org_in) + (q1_or_p1_org_in);  \
+    (p0_or_q0_out) += (p1_or_q1_org_in);                     \
+    (p0_or_q0_out) += (p1_or_q1_org_in);                     \
+    (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 2);       \
+}
+/* New p1 (or q1): p1 plus a term clipped to [negate_tc_in, tc_in]. */
+#define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in,    \
+                         p1_or_q1_org_in, p2_or_q2_org_in,    \
+                         negate_tc_in, tc_in, p1_or_q1_out)   \
+{                                                             \
+    v8i16 clip3, temp;                                        \
+    /* clip3 = clip((p2 + avg(p0, q0) - 2 * p1) >> 1) */      \
+    clip3 = (v8i16) __msa_aver_u_h((v8u16) p0_or_q0_org_in,   \
+                                   (v8u16) q0_or_p0_org_in);  \
+    temp = p1_or_q1_org_in << 1;                              \
+    clip3 = clip3 - temp;                                     \
+    clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3);            \
+    clip3 = CLIP_SH(clip3, negate_tc_in, tc_in);              \
+    p1_or_q1_out = p1_or_q1_org_in + clip3;                   \
+}
+/* New p0 and q0 from delta = clip((4 * (q0 - p0) + (p1 - q1) + 4) >> 3). */
+#define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in,          \
+                     p1_or_q1_org_in, q1_or_p1_org_in,          \
+                     negate_threshold_in, threshold_in,         \
+                     p0_or_q0_out, q0_or_p0_out)                \
+{                                                               \
+    v8i16 q0_sub_p0, p1_sub_q1, delta;                          \
+                                                                \
+    q0_sub_p0 = q0_or_p0_org_in - p0_or_q0_org_in;              \
+    p1_sub_q1 = p1_or_q1_org_in - q1_or_p1_org_in;              \
+    q0_sub_p0 <<= 2;                                            \
+    p1_sub_q1 += 4;                                             \
+    delta = q0_sub_p0 + p1_sub_q1;                              \
+    delta >>= 3;                                                \
+                                                                \
+    delta = CLIP_SH(delta, negate_threshold_in, threshold_in);  \
+                                                                \
+    p0_or_q0_out = p0_or_q0_org_in + delta;                     \
+    q0_or_p0_out = q0_or_p0_org_in - delta;                     \
+    /* keep both results in the 0..255 pixel range */           \
+    CLIP_SH2_0_255(p0_or_q0_out, q0_or_p0_out);                 \
+}
+/* Filter p0/q0 across the edge at src for 4 rows; res = p0/q0 byte pairs. */
+#define AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res)      \
+{                                                                        \
+    uint32_t load0, load1, load2, load3;                                 \
+    v16u8 src0 = { 0 };                                                  \
+    v16u8 src1 = { 0 };                                                  \
+    v16u8 src2 = { 0 };                                                  \
+    v16u8 src3 = { 0 };                                                  \
+    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;                            \
+    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;           \
+    v8i16 tc, q0_sub_p0, p1_sub_q1, delta;                               \
+    v8i16 res0_r, res1_r;                                                \
+    v16i8 zeros = { 0 };                                                 \
+    v16u8 res0, res1;                                                    \
+    /* Load 4 rows of 4 bytes, starting 2 bytes before src. */           \
+    LW4((src - 2), stride, load0, load1, load2, load3);                  \
+    src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0);               \
+    src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1);               \
+    src2 = (v16u8) __msa_insert_w((v4i32) src2, 0, load2);               \
+    src3 = (v16u8) __msa_insert_w((v4i32) src3, 0, load3);               \
+    /* After the transpose: src0..src3 act as p1, p0, q0, q1. */         \
+    TRANSPOSE4x4_UB_UB(src0, src1, src2, src3, src0, src1, src2, src3);  \
+                                                                         \
+    p0_asub_q0 = __msa_asub_u_b(src2, src1);                             \
+    p1_asub_p0 = __msa_asub_u_b(src1, src0);                             \
+    q1_asub_q0 = __msa_asub_u_b(src2, src3);                             \
+                                                                         \
+    tc = __msa_fill_h(tc_val);                                           \
+    /* Mask: |p0 - q0| < alpha, |p1 - p0| < beta, |q1 - q0| < beta. */   \
+    is_less_than_alpha = (p0_asub_q0 < alpha);                           \
+    is_less_than_beta = (p1_asub_p0 < beta);                             \
+    is_less_than = is_less_than_alpha & is_less_than_beta;               \
+    is_less_than_beta = (q1_asub_q0 < beta);                             \
+    is_less_than = is_less_than_beta & is_less_than;                     \
+    /* delta = clip(srari(4 * (q0 - p0) + (p1 - q1), 3), -tc, tc) */     \
+    ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1);            \
+    HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1);             \
+                                                                         \
+    q0_sub_p0 <<= 2;                                                     \
+    delta = q0_sub_p0 + p1_sub_q1;                                       \
+    delta = __msa_srari_h(delta, 3);                                     \
+                                                                         \
+    delta = CLIP_SH(delta, -tc, tc);                                     \
+                                                                         \
+    ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);                \
+                                                                         \
+    res0_r += delta;                                                     \
+    res1_r -= delta;                                                     \
+                                                                         \
+    CLIP_SH2_0_255(res0_r, res1_r);                                      \
+    PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);             \
+    /* Take the filtered bytes only where the mask is set. */            \
+    res0 = __msa_bmnz_v(src1, res0, is_less_than);                       \
+    res1 = __msa_bmnz_v(src2, res1, is_less_than);                       \
+                                                                         \
+    res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0);              \
+}
+/* Interleave bytes of in0/in1; out1..out3 presumably come from 2-byte slides. */
+#define TRANSPOSE2x4_B_UB(in0, in1, out0, out1, out2, out3)  \
+{                                                            \
+    v16i8 zero_m = { 0 };                                    \
+                                                             \
+    out0 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in0);   \
+    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 2);    \
+    SLDI_B2_0_UB(out1, out2, out2, out3, 2);                 \
+}
+/* Filter p0/q0 across the edge at src for 2 rows; res = p0/q0 byte pairs. */
+#define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res)  \
+{                                                                          \
+    uint32_t load0, load1;                                                 \
+    v16u8 src0 = { 0 };                                                    \
+    v16u8 src1 = { 0 };                                                    \
+    v16u8 src2 = { 0 };                                                    \
+    v16u8 src3 = { 0 };                                                    \
+    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;                              \
+    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;             \
+    v8i16 tc, q0_sub_p0, p1_sub_q1, delta, res0_r, res1_r;                 \
+    v16i8 zeros = { 0 };                                                   \
+    v16u8 res0, res1;                                                      \
+    /* Load 2 rows of 4 bytes, starting 2 bytes before src. */             \
+    load0 = LW(src - 2);                                                   \
+    load1 = LW(src - 2 + stride);                                          \
+                                                                           \
+    src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0);                 \
+    src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1);                 \
+    /* After the transpose: src0..src3 act as p1, p0, q0, q1. */           \
+    TRANSPOSE2x4_B_UB(src0, src1, src0, src1, src2, src3);                 \
+                                                                           \
+    p0_asub_q0 = __msa_asub_u_b(src2, src1);                               \
+    p1_asub_p0 = __msa_asub_u_b(src1, src0);                               \
+    q1_asub_q0 = __msa_asub_u_b(src2, src3);                               \
+                                                                           \
+    tc = __msa_fill_h(tc_val);                                             \
+    /* Mask: |p0 - q0| < alpha, |p1 - p0| < beta, |q1 - q0| < beta. */     \
+    is_less_than_alpha = (p0_asub_q0 < alpha);                             \
+    is_less_than_beta = (p1_asub_p0 < beta);                               \
+    is_less_than = is_less_than_alpha & is_less_than_beta;                 \
+    is_less_than_beta = (q1_asub_q0 < beta);                               \
+    is_less_than = is_less_than_beta & is_less_than;                       \
+    /* delta = clip(srari(4 * (q0 - p0) + (p1 - q1), 3), -tc, tc) */       \
+    ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1);              \
+    HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1);               \
+                                                                           \
+    q0_sub_p0 <<= 2;                                                       \
+    delta = q0_sub_p0 + p1_sub_q1;                                         \
+    delta = __msa_srari_h(delta, 3);                                       \
+    delta = CLIP_SH(delta, -tc, tc);                                       \
+                                                                           \
+    ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);                  \
+                                                                           \
+    res0_r += delta;                                                       \
+    res1_r -= delta;                                                       \
+                                                                           \
+    CLIP_SH2_0_255(res0_r, res1_r);                                        \
+    PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);               \
+    /* Take the filtered bytes only where the mask is set. */              \
+    res0 = __msa_bmnz_v(src1, res0, is_less_than);                         \
+    res1 = __msa_bmnz_v(src2, res1, is_less_than);                         \
+                                                                           \
+    res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0);                \
+}
+
+static void avc_loopfilter_luma_intra_edge_hor_msa(uint8_t *data,
+                                                   uint8_t alpha_in,
+                                                   uint8_t beta_in,
+                                                   uint32_t img_width)
+{
+    v16u8 p2_asub_p0, q2_asub_q0, p0_asub_q0;
+    v16u8 alpha, beta;
+    v16u8 is_less_than, is_less_than_beta, negate_is_less_than_beta;
+    v16u8 p2, p1, p0, q0, q1, q2;
+    v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
+    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
+    v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
+    v8i16 p2_r = { 0 }; /* filter outputs start zeroed: each _r/_l half is */
+    v8i16 p1_r = { 0 }; /* only computed when its half of the mask is set, */
+    v8i16 p0_r = { 0 }; /* yet both halves are always packed together      */
+    v8i16 q0_r = { 0 };
+    v8i16 q1_r = { 0 };
+    v8i16 q2_r = { 0 };
+    v8i16 p2_l = { 0 };
+    v8i16 p1_l = { 0 };
+    v8i16 p0_l = { 0 };
+    v8i16 q0_l = { 0 };
+    v8i16 q1_l = { 0 };
+    v8i16 q2_l = { 0 };
+    v16u8 tmp_flag;
+    v16i8 zero = { 0 };
+    /* Intra luma deblocking across a horizontal edge, 16 pixels at a time.
+     * `data` addresses the q0 row; rows p0..p3 sit 1..4 * img_width bytes
+     * before it and q1..q3 sit 1..3 * img_width bytes after it.
+     * alpha_in / beta_in are the edge thresholds, replicated to all lanes.
+     * Nothing is written when no lane passes the enable test below. */
+    alpha = (v16u8) __msa_fill_b(alpha_in);
+    beta = (v16u8) __msa_fill_b(beta_in);
+    /* NOTE(review): LD_UB4 is presumed to load four vectors img_width apart */
+    LD_UB4(data - (img_width << 1), img_width, p1_org, p0_org, q0_org, q1_org);
+    /* enable mask: |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta */
+    {
+        v16u8 p1_asub_p0, q1_asub_q0, is_less_than_alpha;
+
+        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
+        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
+        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
+
+        is_less_than_alpha = (p0_asub_q0 < alpha);
+        is_less_than_beta = (p1_asub_p0 < beta);
+        is_less_than = is_less_than_beta & is_less_than_alpha;
+        is_less_than_beta = (q1_asub_q0 < beta);
+        is_less_than = is_less_than_beta & is_less_than;
+    }
+    /* skip all further loads and every store if the mask is entirely zero */
+    if (!__msa_test_bz_v(is_less_than)) {
+        q2_org = LD_UB(data + (2 * img_width));
+        p3_org = LD_UB(data - (img_width << 2));
+        p2_org = LD_UB(data - (3 * img_width));
+        /* widen to 16 bit; _r/_l are the two 8-lane halves (q1 is widened
+         * further down with explicit zero interleaves) */
+        UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
+        UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
+        UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
+        /* tmp_flag: |p0-q0| < (alpha >> 2) + 2, shared by the p and q side */
+        tmp_flag = alpha >> 2;
+        tmp_flag = tmp_flag + 2;
+        tmp_flag = (p0_asub_q0 < tmp_flag);
+        /* p side: split the enabled lanes into
+         *   is_less_than_beta        - tmp_flag && |p2-p0| < beta
+         *                              (p0, p1 and p2 are rewritten)
+         *   negate_is_less_than_beta - every other enabled lane
+         *                              (only p0 is rewritten) */
+        p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
+        is_less_than_beta = (p2_asub_p0 < beta);
+        is_less_than_beta = is_less_than_beta & tmp_flag;
+        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
+        is_less_than_beta = is_less_than_beta & is_less_than;
+        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
+        {
+            v8u16 is_less_than_beta_l, is_less_than_beta_r;
+
+            q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
+            /* slide the mask against zero so only the _r half survives */
+            is_less_than_beta_r =
+                (v8u16) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
+            if (!__msa_test_bz_v((v16u8) is_less_than_beta_r)) {
+                v8i16 p3_org_r;
+
+                ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
+                AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
+                                         p2_r, q1_org_r, p0_r, p1_r, p2_r);
+            }
+
+            q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
+            /* same test for the _l half */
+            is_less_than_beta_l =
+                (v8u16) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
+
+            if (!__msa_test_bz_v((v16u8) is_less_than_beta_l)) {
+                v8i16 p3_org_l;
+
+                ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
+                AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
+                                         p2_l, q1_org_l, p0_l, p1_l, p2_l);
+            }
+        }
+        /* pack both halves back to bytes, merge only the masked lanes into
+         * the original rows and store p1 / p2 (p0 is stored further down) */
+        if (!__msa_test_bz_v(is_less_than_beta)) {
+            PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
+
+            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
+            p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
+            p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
+
+            ST_UB(p1_org, data - (2 * img_width));
+            ST_UB(p2_org, data - (3 * img_width));
+        }
+        {
+            v8u16 negate_is_less_than_beta_r, negate_is_less_than_beta_l;
+            /* p0-only filter for the remaining enabled lanes; it reuses
+             * p0_r / p0_l, whose earlier contents were already merged */
+            negate_is_less_than_beta_r =
+                (v8u16) __msa_sldi_b((v16i8) negate_is_less_than_beta, zero, 8);
+            if (!__msa_test_bz_v((v16u8) negate_is_less_than_beta_r)) {
+                AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
+            }
+
+            negate_is_less_than_beta_l =
+                (v8u16) __msa_sldi_b(zero, (v16i8) negate_is_less_than_beta, 8);
+            if (!__msa_test_bz_v((v16u8) negate_is_less_than_beta_l)) {
+                AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);
+            }
+        }
+        /* pack and merge the p0-only result into the lanes that used it */
+        if (!__msa_test_bz_v(negate_is_less_than_beta)) {
+            p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
+            p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
+        }
+        /* p0 row is written whenever at least one lane was enabled */
+        ST_UB(p0_org, data - img_width);
+
+        /* q side, mirror image of the p side: the three-pixel filter is
+         * used where tmp_flag && |q2-q0| < beta.  The q side still reads
+         * the widened copies of the unfiltered p0 / p1 taken above. */
+        q3_org = LD_UB(data + (3 * img_width));
+        q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
+        is_less_than_beta = (q2_asub_q0 < beta);
+        is_less_than_beta = is_less_than_beta & tmp_flag;
+        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
+        is_less_than_beta = is_less_than_beta & is_less_than;
+        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
+
+        {
+            v8u16 is_less_than_beta_l, is_less_than_beta_r;
+            is_less_than_beta_r =
+                (v8u16) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
+            if (!__msa_test_bz_v((v16u8) is_less_than_beta_r)) {
+                v8i16 q3_org_r;
+
+                ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
+                AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
+                                         q2_r, p1_org_r, q0_r, q1_r, q2_r);
+            }
+            is_less_than_beta_l =
+                (v8u16) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
+            if (!__msa_test_bz_v((v16u8) is_less_than_beta_l)) {
+                v8i16 q3_org_l;
+
+                ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
+                AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
+                                         q2_l, p1_org_l, q0_l, q1_l, q2_l);
+            }
+        }
+
+        /* pack, merge the masked lanes and store q1 / q2 */
+        if (!__msa_test_bz_v(is_less_than_beta)) {
+            PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
+            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
+            q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
+            q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
+
+            ST_UB(q1_org, data + img_width);
+            ST_UB(q2_org, data + 2 * img_width);
+        }
+        {
+            v8u16 negate_is_less_than_beta_r, negate_is_less_than_beta_l;
+            negate_is_less_than_beta_r =
+                (v8u16) __msa_sldi_b((v16i8) negate_is_less_than_beta, zero, 8);
+            if (!__msa_test_bz_v((v16u8) negate_is_less_than_beta_r)) {
+                AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
+            }
+
+            negate_is_less_than_beta_l =
+                (v8u16) __msa_sldi_b(zero, (v16i8) negate_is_less_than_beta, 8);
+            if (!__msa_test_bz_v((v16u8) negate_is_less_than_beta_l)) {
+                AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);
+            }
+        }
+        /* pack and merge the q0-only result, then write the q0 row */
+        if (!__msa_test_bz_v(negate_is_less_than_beta)) {
+            q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
+            q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
+        }
+        ST_UB(q0_org, data);
+    }
+}
+
+static void avc_loopfilter_luma_intra_edge_ver_msa(uint8_t *data,
+                                                   uint8_t alpha_in,
+                                                   uint8_t beta_in,
+                                                   uint32_t img_width)
+{
+    uint8_t *src;
+    v16u8 alpha, beta, p0_asub_q0;
+    v16u8 is_less_than_alpha, is_less_than;
+    v16u8 is_less_than_beta, negate_is_less_than_beta;
+    v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
+    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
+    v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
+    v8i16 p2_r = { 0 }; /* filter outputs start zeroed: each _r/_l half is */
+    v8i16 p1_r = { 0 }; /* only computed when its half of the mask is set, */
+    v8i16 p0_r = { 0 }; /* yet both halves are always packed together      */
+    v8i16 q0_r = { 0 };
+    v8i16 q1_r = { 0 };
+    v8i16 q2_r = { 0 };
+    v8i16 p2_l = { 0 };
+    v8i16 p1_l = { 0 };
+    v8i16 p0_l = { 0 };
+    v8i16 q0_l = { 0 };
+    v8i16 q1_l = { 0 };
+    v8i16 q2_l = { 0 };
+    v16i8 zero = { 0 };
+    v16u8 tmp_flag;
+    /* Intra luma deblocking across a vertical edge, 16 rows at a time.
+     * `data` addresses q0 of the first row, rows are img_width bytes apart.
+     * The pixels around the edge are read starting at data - 4, transposed
+     * so that each of p3..q3 becomes one 16-lane vector, filtered, and
+     * p2..q2 are transposed back and written starting at data - 3.
+     * The write-back at the end is unconditional: it also runs when no
+     * lane passed the enable test. */
+    src = data - 4;
+    {
+        v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+        v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+        /* NOTE(review): LD_UB8 is presumed to load eight vectors img_width
+         * apart, and TRANSPOSE16x8_UB_UB to turn the first 8 bytes of the
+         * 16 rows into the 8 column vectors listed as outputs - confirm */
+        LD_UB8(src, img_width, row0, row1, row2, row3, row4, row5, row6, row7);
+        LD_UB8(src + (8 * img_width), img_width,
+               row8, row9, row10, row11, row12, row13, row14, row15);
+
+        TRANSPOSE16x8_UB_UB(row0, row1, row2, row3,
+                            row4, row5, row6, row7,
+                            row8, row9, row10, row11,
+                            row12, row13, row14, row15,
+                            p3_org, p2_org, p1_org, p0_org,
+                            q0_org, q1_org, q2_org, q3_org);
+    }
+    UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
+    UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
+    UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
+    UNPCK_UB_SH(q1_org, q1_org_r, q1_org_l);
+
+    /* enable mask, per lane:
+     *   |p0 - q0| < alpha_in && |p1 - p0| < beta_in && |q1 - q0| < beta_in */
+    {
+        v16u8 p1_asub_p0, q1_asub_q0;
+
+        p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
+        p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
+        q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
+
+        alpha = (v16u8) __msa_fill_b(alpha_in);
+        beta = (v16u8) __msa_fill_b(beta_in);
+
+        is_less_than_alpha = (p0_asub_q0 < alpha);
+        is_less_than_beta = (p1_asub_p0 < beta);
+        is_less_than = is_less_than_beta & is_less_than_alpha;
+        is_less_than_beta = (q1_asub_q0 < beta);
+        is_less_than = is_less_than_beta & is_less_than;
+    }
+    /* filtering is skipped (but not the write-back) when the mask is zero */
+    if (!__msa_test_bz_v(is_less_than)) {
+        tmp_flag = alpha >> 2; /* tmp_flag: |p0-q0| < (alpha >> 2) + 2 */
+        tmp_flag = tmp_flag + 2;
+        tmp_flag = (p0_asub_q0 < tmp_flag);
+        /* p side: is_less_than_beta selects the lanes where p0, p1 and p2
+         * are rewritten (tmp_flag && |p2-p0| < beta);
+         * negate_is_less_than_beta the other enabled lanes (p0 only) */
+        {
+            v16u8 p2_asub_p0;
+
+            p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
+            is_less_than_beta = (p2_asub_p0 < beta);
+        }
+        is_less_than_beta = tmp_flag & is_less_than_beta;
+        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
+        is_less_than_beta = is_less_than_beta & is_less_than;
+        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
+
+        /* right half: the mask is slid against zero so that only the bytes
+         * belonging to the _r half remain before testing it */
+        {
+            v16u8 is_less_than_beta_r;
+
+            is_less_than_beta_r =
+                (v16u8) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
+            if (!__msa_test_bz_v(is_less_than_beta_r)) {
+                v8i16 p3_org_r;
+
+                ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
+                AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
+                                         p2_r, q1_org_r, p0_r, p1_r, p2_r);
+            }
+        }
+        /* left half */
+        {
+            v16u8 is_less_than_beta_l;
+
+            is_less_than_beta_l =
+                (v16u8) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
+            if (!__msa_test_bz_v(is_less_than_beta_l)) {
+                v8i16 p3_org_l;
+
+                ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
+                AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
+                                         p2_l, q1_org_l, p0_l, p1_l, p2_l);
+            }
+        }
+        /* pack both halves to bytes and merge only the masked lanes into the
+         * column vectors (memory is written once, at the end) */
+        if (!__msa_test_bz_v(is_less_than_beta)) {
+            v16u8 p0, p2, p1;
+
+            PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
+            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
+            p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
+            p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
+        }
+        /* right half of the p0-only filter; reuses p0_r, whose previous
+         * contents were already merged above */
+        {
+            v16u8 negate_is_less_than_beta_r;
+
+            negate_is_less_than_beta_r =
+                (v16u8) __msa_sldi_b((v16i8) negate_is_less_than_beta, zero, 8);
+
+            if (!__msa_test_bz_v(negate_is_less_than_beta_r)) {
+                AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
+            }
+        }
+        /* left half of the p0-only filter */
+        {
+            v16u8 negate_is_less_than_beta_l;
+
+            negate_is_less_than_beta_l =
+                (v16u8) __msa_sldi_b(zero, (v16i8) negate_is_less_than_beta, 8);
+            if (!__msa_test_bz_v(negate_is_less_than_beta_l)) {
+                AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);
+            }
+        }
+        /* merge the p0-only result into the lanes that used it */
+        if (!__msa_test_bz_v(negate_is_less_than_beta)) {
+            v16u8 p0;
+
+            p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
+            p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
+        }
+        /* q side, mirror image of the p side: the three-pixel filter is
+         * used where tmp_flag && |q2-q0| < beta.  It still reads the
+         * widened copies of the unfiltered p0 / p1 taken before filtering. */
+        {
+            v16u8 q2_asub_q0;
+
+            q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
+            is_less_than_beta = (q2_asub_q0 < beta);
+        }
+
+        is_less_than_beta = is_less_than_beta & tmp_flag;
+        negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
+
+        is_less_than_beta = is_less_than_beta & is_less_than;
+        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
+
+        /* right half */
+        {
+            v16u8 is_less_than_beta_r;
+
+            is_less_than_beta_r =
+                (v16u8) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
+            if (!__msa_test_bz_v(is_less_than_beta_r)) {
+                v8i16 q3_org_r;
+
+                ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
+                AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
+                                         q2_r, p1_org_r, q0_r, q1_r, q2_r);
+            }
+        }
+        /* left half */
+        {
+            v16u8 is_less_than_beta_l;
+
+            is_less_than_beta_l =
+                (v16u8) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
+            if (!__msa_test_bz_v(is_less_than_beta_l)) {
+                v8i16 q3_org_l;
+
+                ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
+                AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
+                                         q2_l, p1_org_l, q0_l, q1_l, q2_l);
+            }
+        }
+        /* pack and merge the masked lanes into q0 / q1 / q2 */
+        if (!__msa_test_bz_v(is_less_than_beta)) {
+            v16u8 q0, q1, q2;
+
+            PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
+            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
+            q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
+            q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
+        }
+
+        /* right half of the q0-only filter */
+        {
+            v16u8 negate_is_less_than_beta_r;
+
+            negate_is_less_than_beta_r =
+                (v16u8) __msa_sldi_b((v16i8) negate_is_less_than_beta, zero, 8);
+            if (!__msa_test_bz_v(negate_is_less_than_beta_r)) {
+                AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
+            }
+        }
+        /* left half of the q0-only filter */
+        {
+            v16u8 negate_is_less_than_beta_l;
+
+            negate_is_less_than_beta_l =
+                (v16u8) __msa_sldi_b(zero, (v16i8) negate_is_less_than_beta, 8);
+            if (!__msa_test_bz_v(negate_is_less_than_beta_l)) {
+                AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);
+            }
+        }
+        if (!__msa_test_bz_v(negate_is_less_than_beta)) {
+            v16u8 q0;
+
+            q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
+            q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
+        }
+    }
+    {
+        v8i16 tp0, tp1, tp2, tp3, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+        /* interleave the six columns p2..q2 back into row order:
+         * tmp3/tmp4/tmp6/tmp7 carry (p2,p1,p0,q0) and tmp2/tmp5 carry
+         * (q1,q2) for four rows each */
+        ILVRL_B2_SH(p1_org, p2_org, tp0, tp2);
+        ILVRL_B2_SH(q0_org, p0_org, tp1, tp3);
+        ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5);
+
+        ILVRL_H2_SH(tp1, tp0, tmp3, tmp4);
+        ILVRL_H2_SH(tp3, tp2, tmp6, tmp7);
+        /* NOTE(review): ST4x4_UB / ST2x4_UB are presumed to store 4 resp.
+         * 2 bytes on each of four rows, i.e. 6 bytes per row from data - 3 */
+        src = data - 3;
+        ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, src, img_width);
+        ST2x4_UB(tmp2, 0, src + 4, img_width);
+        src += 4 * img_width;
+        ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, src, img_width);
+        ST2x4_UB(tmp2, 4, src + 4, img_width);
+        src += 4 * img_width;
+
+        ST4x4_UB(tmp6, tmp6, 0, 1, 2, 3, src, img_width);
+        ST2x4_UB(tmp5, 0, src + 4, img_width);
+        src += 4 * img_width;
+        ST4x4_UB(tmp7, tmp7, 0, 1, 2, 3, src, img_width);
+        ST2x4_UB(tmp5, 4, src + 4, img_width);
+    }
+}
+
+static void avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, int32_t stride,
+                                                   int32_t alpha_in,
+                                                   int32_t beta_in)
+{
+    uint64_t load0, load1;
+    uint32_t out0, out2;
+    uint16_t out1, out3;
+    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+    v8u16 dst0_r, dst1_r, dst4_r, dst5_r;
+    v8u16 dst2_x_r, dst2_y_r, dst3_x_r, dst3_y_r;
+    v16u8 dst0, dst1, dst4, dst5, dst2_x, dst2_y, dst3_x, dst3_y;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v16u8 alpha, beta;
+    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
+    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
+    v16u8 is_less_than_beta1, is_less_than_beta2;
+    v16i8 src0 = { 0 };
+    v16i8 src1 = { 0 };
+    v16i8 src2 = { 0 };
+    v16i8 src3 = { 0 };
+    v16i8 src4 = { 0 };
+    v16i8 src5 = { 0 };
+    v16i8 src6 = { 0 };
+    v16i8 src7 = { 0 };
+    v16i8 zeros = { 0 };
+    /* Intra luma deblocking across a vertical edge for 8 rows.
+     * `src` addresses q0 of the first row, rows are `stride` bytes apart.
+     * Eight bytes per row are read from src - 4, all filter candidates are
+     * computed branch-free and blended by masks, and six bytes per row are
+     * written back starting at src - 3.  The stores are unconditional.
+     * NOTE(review): LD / SW / SH are presumed to be 64-bit load and
+     * 32- / 16-bit store helpers - confirm against their definitions. */
+    load0 = LD(src - 4);
+    load1 = LD(src + stride - 4);
+    src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, load0);
+    src1 = (v16i8) __msa_insert_d((v2i64) src1, 0, load1);
+
+    load0 = LD(src + (2 * stride) - 4);
+    load1 = LD(src + (3 * stride) - 4);
+    src2 = (v16i8) __msa_insert_d((v2i64) src2, 0, load0);
+    src3 = (v16i8) __msa_insert_d((v2i64) src3, 0, load1);
+
+    load0 = LD(src + (4 * stride) - 4);
+    load1 = LD(src + (5 * stride) - 4);
+    src4 = (v16i8) __msa_insert_d((v2i64) src4, 0, load0);
+    src5 = (v16i8) __msa_insert_d((v2i64) src5, 0, load1);
+
+    load0 = LD(src + (6 * stride) - 4);
+    load1 = LD(src + (7 * stride) - 4);
+    src6 = (v16i8) __msa_insert_d((v2i64) src6, 0, load0);
+    src7 = (v16i8) __msa_insert_d((v2i64) src7, 0, load1);
+    /* transpose the 8x8 byte tile with byte / halfword / word interleaves;
+     * from here on src0..src7 name columns, not rows */
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
+               src0, src1, src2, src3);
+
+    ILVR_H2_SH(src1, src0, src3, src2, tmp0, tmp2);
+    ILVL_H2_SH(src1, src0, src3, src2, tmp1, tmp3);
+
+    ILVR_W2_SB(tmp2, tmp0, tmp3, tmp1, src6, src3);
+    ILVL_W2_SB(tmp2, tmp0, tmp3, tmp1, src1, src5);
+    SLDI_B4_0_SB(src6, src1, src3, src5, src0, src2, src4, src7, 8);
+    /* column roles as used by the tests below:
+     *   src0 = p2, src1 = p1, src2 = p0, src3 = q0, src4 = q1, src5 = q2;
+     *   src6 / src7 appear only in the p2 / q2 outputs (presumably p3 / q3) */
+    p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
+    p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
+    q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
+
+    alpha = (v16u8) __msa_fill_b(alpha_in);
+    beta = (v16u8) __msa_fill_b(beta_in);
+    /* enable mask: |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta */
+    is_less_than_alpha = (p0_asub_q0 < alpha);
+    is_less_than_beta = (p1_asub_p0 < beta);
+    is_less_than = is_less_than_alpha & is_less_than_beta;
+    is_less_than_beta = (q1_asub_q0 < beta);
+    is_less_than = is_less_than & is_less_than_beta;
+    /* alpha and is_less_than_alpha are reused for the second threshold:
+     * |p0-q0| < (alpha >> 2) + 2 */
+    alpha >>= 2;
+    alpha += 2;
+
+    is_less_than_alpha = (p0_asub_q0 < alpha);
+    /* beta1: |p2-p0| < beta (p side), beta2: |q2-q0| < beta (q side) */
+    p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
+    is_less_than_beta1 = (p2_asub_p0 < beta);
+    q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
+    is_less_than_beta2 = (q2_asub_q0 < beta);
+    /* widen the low eight bytes of every column to 16 bit */
+    ILVR_B4_UH(zeros, src0, zeros, src1, zeros, src2, zeros, src3,
+               src0_r, src1_r, src2_r, src3_r);
+    ILVR_B4_UH(zeros, src4, zeros, src5, zeros, src6, zeros, src7,
+               src4_r, src5_r, src6_r, src7_r);
+    /* p side candidates (all shifts are rounding shifts):
+     *   dst2_x = (p2 + 2*(p1 + p0 + q0) + q1) >> 3      new p0, 5-tap
+     *   dst1   = (p2 + p1 + p0 + q0) >> 2               new p1
+     *   dst0   = (2*src6 + 3*p2 + p1 + p0 + q0) >> 3    new p2
+     *   dst2_y = (2*p1 + p0 + q1) >> 2                  new p0, 3-tap */
+    dst2_x_r = src1_r + src2_r + src3_r;
+    dst2_x_r = src0_r + (2 * (dst2_x_r)) + src4_r;
+    dst2_x_r = (v8u16) __msa_srari_h((v8i16) dst2_x_r, 3);
+    dst1_r = src0_r + src1_r + src2_r + src3_r;
+    dst1_r = (v8u16) __msa_srari_h((v8i16) dst1_r, 2);
+
+    dst0_r = (2 * src6_r) + (3 * src0_r);
+    dst0_r += src1_r + src2_r + src3_r;
+    dst0_r = (v8u16) __msa_srari_h((v8i16) dst0_r, 3);
+    dst2_y_r = (2 * src1_r) + src2_r + src4_r;
+    dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
+    /* new p0: 5-tap value where beta1 holds, 3-tap value elsewhere */
+    PCKEV_B2_UB(dst2_x_r, dst2_x_r, dst2_y_r, dst2_y_r, dst2_x, dst2_y);
+    dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_beta1);
+    /* q side candidates, mirror image of the p side:
+     *   dst3_x = (p1 + 2*(p0 + q0 + q1) + q2) >> 3      new q0, 5-tap
+     *   dst4   = (p0 + q0 + q1 + q2) >> 2               new q1
+     *   dst5   = (2*src7 + 3*q2 + q1 + q0 + p0) >> 3    new q2
+     *   dst3_y = (2*q1 + q0 + p1) >> 2                  new q0, 3-tap */
+    dst3_x_r = src2_r + src3_r + src4_r;
+    dst3_x_r = src1_r + (2 * dst3_x_r) + src5_r;
+    dst3_x_r = (v8u16) __msa_srari_h((v8i16) dst3_x_r, 3);
+    dst4_r = src2_r + src3_r + src4_r + src5_r;
+    dst4_r = (v8u16) __msa_srari_h((v8i16) dst4_r, 2);
+
+    dst5_r = (2 * src7_r) + (3 * src5_r);
+    dst5_r += src4_r + src3_r + src2_r;
+    dst5_r = (v8u16) __msa_srari_h((v8i16) dst5_r, 3);
+    dst3_y_r = (2 * src4_r) + src3_r + src1_r;
+    dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
+    /* new q0: 5-tap value where beta2 holds, 3-tap value elsewhere */
+    PCKEV_B2_UB(dst3_x_r, dst3_x_r, dst3_y_r, dst3_y_r, dst3_x, dst3_y);
+    dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_beta2);
+    /* the 3-tap values are computed a second time with the very same
+     * expressions as above and repacked into dst2_y / dst3_y */
+    dst2_y_r = (2 * src1_r) + src2_r + src4_r;
+    dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
+    dst3_y_r = (2 * src4_r) + src3_r + src1_r;
+    dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
+
+    PCKEV_B2_UB(dst2_y_r, dst2_y_r, dst3_y_r, dst3_y_r, dst2_y, dst3_y);
+    /* p0 / q0 cascade: fall back to the 3-tap value where the second alpha
+     * test fails, and to the unfiltered pixel where the lane is disabled */
+    dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_alpha);
+    dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_alpha);
+    dst2_x = __msa_bmnz_v((v16u8) src2, dst2_x, is_less_than);
+    dst3_x = __msa_bmnz_v((v16u8) src3, dst3_x, is_less_than);
+    /* p1/p2 and q1/q2 change only where the lane is enabled, the second
+     * alpha test holds and the matching beta1 / beta2 test holds */
+    is_less_than = is_less_than_alpha & is_less_than;
+    dst1 = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst1_r);
+    is_less_than_beta1 = is_less_than_beta1 & is_less_than;
+    dst1 = __msa_bmnz_v((v16u8) src1, dst1, is_less_than_beta1);
+
+    dst0 = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
+    dst0 = __msa_bmnz_v((v16u8) src0, dst0, is_less_than_beta1);
+    dst4 = (v16u8) __msa_pckev_b((v16i8) dst4_r, (v16i8) dst4_r);
+    is_less_than_beta2 = is_less_than_beta2 & is_less_than;
+    dst4 = __msa_bmnz_v((v16u8) src4, dst4, is_less_than_beta2);
+    dst5 = (v16u8) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst5_r);
+    dst5 = __msa_bmnz_v((v16u8) src5, dst5, is_less_than_beta2);
+    /* transpose the six result columns (p2 p1 p0 q0 q1 q2) back to rows;
+     * afterwards the dst* names hold rows and no longer match their roles */
+    ILVR_B2_UB(dst1, dst0, dst3_x, dst2_x, dst0, dst1);
+    dst2_x = (v16u8) __msa_ilvr_b((v16i8) dst5, (v16i8) dst4);
+    ILVRL_H2_SH(dst1, dst0, tmp0, tmp1);
+    ILVRL_H2_SH(zeros, dst2_x, tmp2, tmp3);
+
+    ILVR_W2_UB(tmp2, tmp0, tmp3, tmp1, dst0, dst4);
+    SLDI_B2_0_UB(dst0, dst4, dst1, dst5, 8);
+    dst2_x = (v16u8) __msa_ilvl_w((v4i32) tmp2, (v4i32) tmp0);
+    dst2_y = (v16u8) __msa_ilvl_w((v4i32) tmp3, (v4i32) tmp1);
+    SLDI_B2_0_UB(dst2_x, dst2_y, dst3_x, dst3_y, 8);
+    /* rows 0 and 1: word 0 goes to src - 3, halfword 2 goes to src + 1 */
+    out0 = __msa_copy_u_w((v4i32) dst0, 0);
+    out1 = __msa_copy_u_h((v8i16) dst0, 2);
+    out2 = __msa_copy_u_w((v4i32) dst1, 0);
+    out3 = __msa_copy_u_h((v8i16) dst1, 2);
+
+    SW(out0, (src - 3));
+    SH(out1, (src + 1));
+    src += stride;
+    SW(out2, (src - 3));
+    SH(out3, (src + 1));
+    src += stride;
+    /* rows 2 and 3 */
+    out0 = __msa_copy_u_w((v4i32) dst2_x, 0);
+    out1 = __msa_copy_u_h((v8i16) dst2_x, 2);
+    out2 = __msa_copy_u_w((v4i32) dst3_x, 0);
+    out3 = __msa_copy_u_h((v8i16) dst3_x, 2);
+
+    SW(out0, (src - 3));
+    SH(out1, (src + 1));
+    src += stride;
+    SW(out2, (src - 3));
+    SH(out3, (src + 1));
+    src += stride;
+    /* rows 4 and 5 */
+    out0 = __msa_copy_u_w((v4i32) dst4, 0);
+    out1 = __msa_copy_u_h((v8i16) dst4, 2);
+    out2 = __msa_copy_u_w((v4i32) dst5, 0);
+    out3 = __msa_copy_u_h((v8i16) dst5, 2);
+
+    SW(out0, (src - 3));
+    SH(out1, (src + 1));
+    src += stride;
+    SW(out2, (src - 3));
+    SH(out3, (src + 1));
+    src += stride;
+    /* rows 6 and 7 */
+    out0 = __msa_copy_u_w((v4i32) dst2_y, 0);
+    out1 = __msa_copy_u_h((v8i16) dst2_y, 2);
+    out2 = __msa_copy_u_w((v4i32) dst3_y, 0);
+    out3 = __msa_copy_u_h((v8i16) dst3_y, 2);
+
+    SW(out0, (src - 3));
+    SH(out1, (src + 1));
+    src += stride;
+    SW(out2, (src - 3));
+    SH(out3, (src + 1));
+}
+
+/*
+ * Intra deblocking of one chroma plane (Cb or Cr) across a horizontal edge.
+ *
+ * data_cb_or_cr addresses the q0 row; p1 and p0 are the two rows before it
+ * and q1 the row after it, img_width bytes apart.  A lane is filtered when
+ * |p0-q0| < alpha_in, |p1-p0| < beta_in and |q1-q0| < beta_in.  The mask is
+ * cut down to its low 64 bits, so only the first eight pixels can change;
+ * p0 and q0 are then written back as whole vectors.  Nothing is stored
+ * when no lane qualifies.
+ */
+static void avc_loopfilter_cb_or_cr_intra_edge_hor_msa(uint8_t *data_cb_or_cr,
+                                                       uint8_t alpha_in,
+                                                       uint8_t beta_in,
+                                                       uint32_t img_width)
+{
+    v16i8 zero = { 0 };
+    v16u8 row_p1, row_p0, row_q0, row_q1;
+    v16u8 alpha_vec, beta_vec;
+    v16u8 diff_p0q0, diff_p1p0, diff_q1q0;
+    v16u8 below_alpha, below_beta, filter_mask;
+    v8i16 p1_h, p0_h, q0_h, q1_h;
+    v8i16 p0_new, q0_new;
+
+    /* two rows on either side of the edge */
+    LD_UB4(data_cb_or_cr - (img_width << 1), img_width,
+           row_p1, row_p0, row_q0, row_q1);
+
+    alpha_vec = (v16u8) __msa_fill_b(alpha_in);
+    beta_vec = (v16u8) __msa_fill_b(beta_in);
+
+    diff_p0q0 = __msa_asub_u_b(row_p0, row_q0);
+    diff_p1p0 = __msa_asub_u_b(row_p1, row_p0);
+    diff_q1q0 = __msa_asub_u_b(row_q1, row_q0);
+
+    /* per-lane enable mask */
+    below_alpha = (diff_p0q0 < alpha_vec);
+    below_beta = (diff_p1p0 < beta_vec);
+    filter_mask = below_beta & below_alpha;
+    below_beta = (diff_q1q0 < beta_vec);
+    filter_mask = below_beta & filter_mask;
+
+    /* keep the low doubleword of the mask, clear the high one */
+    filter_mask = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) filter_mask);
+
+    if (__msa_test_bz_v(filter_mask)) {
+        return;
+    }
+
+    /* widen the low eight pixels of each row to 16 bit and filter p0 / q0 */
+    ILVR_B4_SH(zero, row_p1, zero, row_p0, zero, row_q0, zero, row_q1,
+               p1_h, p0_h, q0_h, q1_h);
+    AVC_LPF_P0_OR_Q0(p0_h, q1_h, p1_h, p0_new);
+    AVC_LPF_P0_OR_Q0(q0_h, p1_h, q1_h, q0_new);
+    PCKEV_B2_SH(zero, p0_new, zero, q0_new, p0_new, q0_new);
+
+    /* take the filtered byte only in the enabled lanes */
+    row_p0 = __msa_bmnz_v(row_p0, (v16u8) p0_new, filter_mask);
+    row_q0 = __msa_bmnz_v(row_q0, (v16u8) q0_new, filter_mask);
+
+    ST_UB(row_q0, data_cb_or_cr);
+    ST_UB(row_p0, data_cb_or_cr - img_width);
+}
+
+/*
+ * Intra deblocking of one chroma plane (Cb or Cr) across a vertical edge.
+ *
+ * data_cb_or_cr addresses q0 of the first row; rows are img_width bytes
+ * apart.  Eight rows are read starting two bytes before the edge and
+ * transposed into the p1, p0, q0 and q1 columns.  A lane is filtered when
+ * |p0-q0| < alpha_in, |p1-p0| < beta_in and |q1-q0| < beta_in.  The
+ * resulting (p0, q0) byte pairs are written back one byte before the edge,
+ * in two groups of four rows.  Nothing is stored when no lane qualifies.
+ */
+static void avc_loopfilter_cb_or_cr_intra_edge_ver_msa(uint8_t *data_cb_or_cr,
+                                                       uint8_t alpha_in,
+                                                       uint8_t beta_in,
+                                                       uint32_t img_width)
+{
+    uint8_t *dst;
+    v16i8 zero = { 0 };
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+    v16u8 col_p1, col_p0, col_q0, col_q1;
+    v16u8 alpha_vec, beta_vec;
+    v16u8 diff_p0q0, diff_p1p0, diff_q1q0;
+    v16u8 below_alpha, below_beta, filter_mask;
+    v8i16 p1_h, p0_h, q0_h, q1_h;
+    v8i16 p0_new, q0_new;
+    v8i16 p0q0_pairs;
+
+    /* gather the four columns around the edge */
+    LD_UB8((data_cb_or_cr - 2), img_width,
+           row0, row1, row2, row3, row4, row5, row6, row7);
+    TRANSPOSE8x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                       col_p1, col_p0,
+                       col_q0, col_q1);
+
+    alpha_vec = (v16u8) __msa_fill_b(alpha_in);
+    beta_vec = (v16u8) __msa_fill_b(beta_in);
+
+    diff_p0q0 = __msa_asub_u_b(col_p0, col_q0);
+    diff_p1p0 = __msa_asub_u_b(col_p1, col_p0);
+    diff_q1q0 = __msa_asub_u_b(col_q1, col_q0);
+
+    /* per-lane enable mask, limited to the low doubleword */
+    below_alpha = (diff_p0q0 < alpha_vec);
+    below_beta = (diff_p1p0 < beta_vec);
+    filter_mask = below_beta & below_alpha;
+    below_beta = (diff_q1q0 < beta_vec);
+    filter_mask = below_beta & filter_mask;
+    filter_mask = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) filter_mask);
+
+    if (__msa_test_bz_v(filter_mask)) {
+        return;
+    }
+
+    /* widen to 16 bit and filter p0 / q0 */
+    ILVR_B4_SH(zero, col_p1, zero, col_p0, zero, col_q0, zero, col_q1,
+               p1_h, p0_h, q0_h, q1_h);
+    AVC_LPF_P0_OR_Q0(p0_h, q1_h, p1_h, p0_new);
+    AVC_LPF_P0_OR_Q0(q0_h, p1_h, q1_h, q0_new);
+
+    /* back to 8 bit */
+    PCKEV_B2_SH(zero, p0_new, zero, q0_new, p0_new, q0_new);
+
+    /* take the filtered byte only in the enabled lanes */
+    col_p0 = __msa_bmnz_v(col_p0, (v16u8) p0_new, filter_mask);
+    col_q0 = __msa_bmnz_v(col_q0, (v16u8) q0_new, filter_mask);
+
+    /* interleave into per-row (p0, q0) pairs and store two bytes per row */
+    p0q0_pairs = (v8i16) __msa_ilvr_b((v16i8) col_q0, (v16i8) col_p0);
+
+    dst = data_cb_or_cr - 1;
+    ST2x4_UB(p0q0_pairs, 0, dst, img_width);
+    dst += 4 * img_width;
+    ST2x4_UB(p0q0_pairs, 4, dst, img_width);
+}
+
+static void avc_loopfilter_luma_inter_edge_ver_msa(uint8_t *data,
+                                                   uint8_t bs0, uint8_t bs1,
+                                                   uint8_t bs2, uint8_t bs3,
+                                                   uint8_t tc0, uint8_t tc1,
+                                                   uint8_t tc2, uint8_t tc3,
+                                                   uint8_t alpha_in,
+                                                   uint8_t beta_in,
+                                                   uint32_t img_width)
+{
+    uint8_t *src;
+    v16u8 beta, tmp_vec, bs = { 0 };
+    v16u8 tc = { 0 };
+    v16u8 is_less_than, is_less_than_beta;
+    v16u8 p1, p0, q0, q1;
+    v8i16 p0_r, q0_r, p1_r = { 0 };
+    v8i16 q1_r = { 0 };
+    v8i16 p0_l, q0_l, p1_l = { 0 };
+    v8i16 q1_l = { 0 };
+    v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
+    v8i16 p2_org_r, p1_org_r, p0_org_r, q0_org_r, q1_org_r, q2_org_r;
+    v8i16 p2_org_l, p1_org_l, p0_org_l, q0_org_l, q1_org_l, q2_org_l;
+    v8i16 tc_r, tc_l;
+    v16i8 zero = { 0 };
+    v16u8 is_bs_greater_than0;
+
+    tmp_vec = (v16u8) __msa_fill_b(bs0);
+    bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
+    tmp_vec = (v16u8) __msa_fill_b(bs1);
+    bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
+    tmp_vec = (v16u8) __msa_fill_b(bs2);
+    bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
+    tmp_vec = (v16u8) __msa_fill_b(bs3);
+    bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);
+
+    if (!__msa_test_bz_v(bs)) {
+        tmp_vec = (v16u8) __msa_fill_b(tc0);
+        tc = (v16u8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
+        tmp_vec = (v16u8) __msa_fill_b(tc1);
+        tc = (v16u8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
+        tmp_vec = (v16u8) __msa_fill_b(tc2);
+        tc = (v16u8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
+        tmp_vec = (v16u8) __msa_fill_b(tc3);
+        tc = (v16u8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);
+
+        is_bs_greater_than0 = (zero < bs);
+
+        {
+            v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+            v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+
+            src = data;
+            src -= 4;
+
+            LD_UB8(src, img_width,
+                   row0, row1, row2, row3, row4, row5, row6, row7);
+            src += (8 * img_width);
+            LD_UB8(src, img_width,
+                   row8, row9, row10, row11, row12, row13, row14, row15);
+
+            TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                                row8, row9, row10, row11,
+                                row12, row13, row14, row15,
+                                p3_org, p2_org, p1_org, p0_org,
+                                q0_org, q1_org, q2_org, q3_org);
+        }
+        {
+            v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha;
+            v16u8 is_less_than_alpha;
+
+            p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
+            p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
+            q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
+
+            alpha = (v16u8) __msa_fill_b(alpha_in);
+            beta = (v16u8) __msa_fill_b(beta_in);
+
+            is_less_than_alpha = (p0_asub_q0 < alpha);
+            is_less_than_beta = (p1_asub_p0 < beta);
+            is_less_than = is_less_than_beta & is_less_than_alpha;
+            is_less_than_beta = (q1_asub_q0 < beta);
+            is_less_than = is_less_than_beta & is_less_than;
+            is_less_than = is_less_than & is_bs_greater_than0;
+        }
+        if (!__msa_test_bz_v(is_less_than)) {
+            v16i8 negate_tc, sign_negate_tc;
+            v8i16 negate_tc_r, i16_negatetc_l;
+
+            negate_tc = zero - (v16i8) tc;
+            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
+
+            ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);
+
+            UNPCK_UB_SH(tc, tc_r, tc_l);
+            UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
+            UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
+            UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
+
+            {
+                v16u8 p2_asub_p0;
+                v16u8 is_less_than_beta_r, is_less_than_beta_l;
+
+                p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
+                is_less_than_beta = (p2_asub_p0 < beta);
+                is_less_than_beta = is_less_than_beta & is_less_than;
+
+                is_less_than_beta_r =
+                    (v16u8) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
+                if (!__msa_test_bz_v(is_less_than_beta_r)) {
+                    p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
+
+                    AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r,
+                                     negate_tc_r, tc_r, p1_r);
+                }
+
+                is_less_than_beta_l =
+                    (v16u8) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
+                if (!__msa_test_bz_v(is_less_than_beta_l)) {
+                    p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);
+
+                    AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l,
+                                     i16_negatetc_l, tc_l, p1_l);
+                }
+            }
+
+            if (!__msa_test_bz_v(is_less_than_beta)) {
+                p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
+                p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
+
+                is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
+                tc = tc + is_less_than_beta;
+            }
+
+            {
+                v16u8 u8_q2asub_q0;
+                v16u8 is_less_than_beta_l, is_less_than_beta_r;
+
+                u8_q2asub_q0 = __msa_asub_u_b(q2_org, q0_org);
+                is_less_than_beta = (u8_q2asub_q0 < beta);
+                is_less_than_beta = is_less_than_beta & is_less_than;
+
+                q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
+
+                is_less_than_beta_r =
+                    (v16u8) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
+                if (!__msa_test_bz_v(is_less_than_beta_r)) {
+                    q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
+                    AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r,
+                                     negate_tc_r, tc_r, q1_r);
+                }
+
+                q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
+
+                is_less_than_beta_l =
+                    (v16u8) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
+                if (!__msa_test_bz_v(is_less_than_beta_l)) {
+                    q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);
+                    AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l,
+                                     i16_negatetc_l, tc_l, q1_l);
+                }
+            }
+
+            if (!__msa_test_bz_v(is_less_than_beta)) {
+                q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
+                q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
+
+                is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
+                tc = tc + is_less_than_beta;
+            }
+
+            {
+                v8i16 threshold_r, negate_thresh_r;
+                v8i16 threshold_l, negate_thresh_l;
+                v16i8 negate_thresh, sign_negate_thresh;
+
+                negate_thresh = zero - (v16i8) tc;
+                sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);
+
+                ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh,
+                           threshold_r, negate_thresh_r);
+
+                AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r,
+                             negate_thresh_r, threshold_r, p0_r, q0_r);
+
+                threshold_l = (v8i16) __msa_ilvl_b(zero, (v16i8) tc);
+                negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
+                                                       negate_thresh);
+
+                AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l,
+                             negate_thresh_l, threshold_l, p0_l, q0_l);
+            }
+
+            PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0);
+
+            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
+            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
+        }
+        {
+            v16i8 tp0, tp1, tp2, tp3;
+            v8i16 tmp2, tmp5;
+            v4i32 tmp3, tmp4, tmp6, tmp7;
+            uint32_t out0, out2;
+            uint16_t out1, out3;
+
+            src = data - 3;
+
+            ILVRL_B2_SB(p1_org, p2_org, tp0, tp2);
+            ILVRL_B2_SB(q0_org, p0_org, tp1, tp3);
+            ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5);
+
+            ILVRL_H2_SW(tp1, tp0, tmp3, tmp4);
+            ILVRL_H2_SW(tp3, tp2, tmp6, tmp7);
+
+            out0 = __msa_copy_u_w(tmp3, 0);
+            out1 = __msa_copy_u_h(tmp2, 0);
+            out2 = __msa_copy_u_w(tmp3, 1);
+            out3 = __msa_copy_u_h(tmp2, 1);
+
+            SW(out0, src);
+            SH(out1, (src + 4));
+            src += img_width;
+            SW(out2, src);
+            SH(out3, (src + 4));
+
+            out0 = __msa_copy_u_w(tmp3, 2);
+            out1 = __msa_copy_u_h(tmp2, 2);
+            out2 = __msa_copy_u_w(tmp3, 3);
+            out3 = __msa_copy_u_h(tmp2, 3);
+
+            src += img_width;
+            SW(out0, src);
+            SH(out1, (src + 4));
+            src += img_width;
+            SW(out2, src);
+            SH(out3, (src + 4));
+
+            out0 = __msa_copy_u_w(tmp4, 0);
+            out1 = __msa_copy_u_h(tmp2, 4);
+            out2 = __msa_copy_u_w(tmp4, 1);
+            out3 = __msa_copy_u_h(tmp2, 5);
+
+            src += img_width;
+            SW(out0, src);
+            SH(out1, (src + 4));
+            src += img_width;
+            SW(out2, src);
+            SH(out3, (src + 4));
+
+            out0 = __msa_copy_u_w(tmp4, 2);
+            out1 = __msa_copy_u_h(tmp2, 6);
+            out2 = __msa_copy_u_w(tmp4, 3);
+            out3 = __msa_copy_u_h(tmp2, 7);
+
+            src += img_width;
+            SW(out0, src);
+            SH(out1, (src + 4));
+            src += img_width;
+            SW(out2, src);
+            SH(out3, (src + 4));
+
+            out0 = __msa_copy_u_w(tmp6, 0);
+            out1 = __msa_copy_u_h(tmp5, 0);
+            out2 = __msa_copy_u_w(tmp6, 1);
+            out3 = __msa_copy_u_h(tmp5, 1);
+
+            src += img_width;
+            SW(out0, src);
+            SH(out1, (src + 4));
+            src += img_width;
+            SW(out2, src);
+            SH(out3, (src + 4));
+
+            out0 = __msa_copy_u_w(tmp6, 2);
+            out1 = __msa_copy_u_h(tmp5, 2);
+            out2 = __msa_copy_u_w(tmp6, 3);
+            out3 = __msa_copy_u_h(tmp5, 3);
+
+            src += img_width;
+            SW(out0, src);
+            SH(out1, (src + 4));
+            src += img_width;
+            SW(out2, src);
+            SH(out3, (src + 4));
+
+            out0 = __msa_copy_u_w(tmp7, 0);
+            out1 = __msa_copy_u_h(tmp5, 4);
+            out2 = __msa_copy_u_w(tmp7, 1);
+            out3 = __msa_copy_u_h(tmp5, 5);
+
+            src += img_width;
+            SW(out0, src);
+            SH(out1, (src + 4));
+            src += img_width;
+            SW(out2, src);
+            SH(out3, (src + 4));
+
+            out0 = __msa_copy_u_w(tmp7, 2);
+            out1 = __msa_copy_u_h(tmp5, 6);
+            out2 = __msa_copy_u_w(tmp7, 3);
+            out3 = __msa_copy_u_h(tmp5, 7);
+
+            src += img_width;
+            SW(out0, src);
+            SH(out1, (src + 4));
+            src += img_width;
+            SW(out2, src);
+            SH(out3, (src + 4));
+        }
+    }
+}
+
+/**
+ * Deblock one horizontal luma edge with the tc-clipped (non-intra) filter.
+ *
+ * Reads the rows from data - 3 * image_width up to data + 2 * image_width
+ * (p2, p1, p0, q0, q1, q2) and may rewrite the four rows
+ * data - 2 * image_width .. data + image_width (p1, p0, q0, q1), each one
+ * v16u8 vector wide.
+ *
+ * @param data        pointer to the first row on the q side of the edge (q0)
+ * @param bs0         strength for the first 32-bit word (four adjacent bytes)
+ *                    of the vector; 0 disables filtering for that word.
+ *                    bs1, bs2 and bs3 govern the following words.
+ * @param tc0         clipping threshold for the first word; tc1, tc2 and tc3
+ *                    are laid out like bs1..bs3
+ * @param alpha_in    threshold compared against |p0 - q0|
+ * @param beta_in     threshold compared against |p1 - p0|, |q1 - q0|,
+ *                    |p2 - p0| and |q2 - q0|
+ * @param image_width distance in bytes between consecutive rows
+ */
+static void avc_loopfilter_luma_inter_edge_hor_msa(uint8_t *data,
+                                                   uint8_t bs0, uint8_t bs1,
+                                                   uint8_t bs2, uint8_t bs3,
+                                                   uint8_t tc0, uint8_t tc1,
+                                                   uint8_t tc2, uint8_t tc3,
+                                                   uint8_t alpha_in,
+                                                   uint8_t beta_in,
+                                                   uint32_t image_width)
+{
+    v16u8 p2_asub_p0, u8_q2asub_q0;
+    v16u8 alpha, beta, is_less_than, is_less_than_beta;
+    v16u8 p1, p0, q0, q1;
+    /* p1/q1 halves start at zero because each half is only computed when its
+     * part of the mask is non-empty, yet both halves are packed together. */
+    v8i16 p1_r = { 0 };
+    v8i16 p0_r, q0_r, q1_r = { 0 };
+    v8i16 p1_l = { 0 };
+    v8i16 p0_l, q0_l, q1_l = { 0 };
+    v16u8 p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
+    v8i16 p2_org_r, p1_org_r, p0_org_r, q0_org_r, q1_org_r, q2_org_r;
+    v8i16 p2_org_l, p1_org_l, p0_org_l, q0_org_l, q1_org_l, q2_org_l;
+    v16i8 zero = { 0 };
+    v16u8 tmp_vec;
+    v16u8 bs = { 0 };
+    v16i8 tc = { 0 };
+
+    /* Broadcast each strength to all bytes, then copy one 32-bit word of it
+     * into bs, so that bsK ends up in word K. */
+    tmp_vec = (v16u8) __msa_fill_b(bs0);
+    bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
+    tmp_vec = (v16u8) __msa_fill_b(bs1);
+    bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
+    tmp_vec = (v16u8) __msa_fill_b(bs2);
+    bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
+    tmp_vec = (v16u8) __msa_fill_b(bs3);
+    bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);
+
+    /* Nothing to do when all four strengths are zero. */
+    if (!__msa_test_bz_v(bs)) {
+        /* Same word-wise layout for the clipping thresholds. */
+        tmp_vec = (v16u8) __msa_fill_b(tc0);
+        tc = (v16i8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
+        tmp_vec = (v16u8) __msa_fill_b(tc1);
+        tc = (v16i8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
+        tmp_vec = (v16u8) __msa_fill_b(tc2);
+        tc = (v16i8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
+        tmp_vec = (v16u8) __msa_fill_b(tc3);
+        tc = (v16i8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);
+
+        alpha = (v16u8) __msa_fill_b(alpha_in);
+        beta = (v16u8) __msa_fill_b(beta_in);
+
+        /* Five rows starting three rows before the edge; q2 is fetched later,
+         * only if some byte actually needs filtering. */
+        LD_UB5(data - (3 * image_width), image_width,
+               p2_org, p1_org, p0_org, q0_org, q1_org);
+
+        {
+            v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
+            v16u8 is_less_than_alpha, is_bs_greater_than0;
+
+            is_bs_greater_than0 = ((v16u8) zero < bs);
+            p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
+            p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
+            q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
+
+            /* Per-byte filter mask:
+             * |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta && bs > 0 */
+            is_less_than_alpha = (p0_asub_q0 < alpha);
+            is_less_than_beta = (p1_asub_p0 < beta);
+            is_less_than = is_less_than_beta & is_less_than_alpha;
+            is_less_than_beta = (q1_asub_q0 < beta);
+            is_less_than = is_less_than_beta & is_less_than;
+            is_less_than = is_less_than & is_bs_greater_than0;
+        }
+
+        if (!__msa_test_bz_v(is_less_than)) {
+            v16i8 sign_negate_tc, negate_tc;
+            v8i16 negate_tc_r, i16_negatetc_l, tc_l, tc_r;
+
+            q2_org = LD_UB(data + (2 * image_width));
+            /* -tc per byte, plus a byte mask of its sign; interleaving the two
+             * presumably widens -tc to signed 16-bit lanes. */
+            negate_tc = zero - tc;
+            sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
+
+            ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);
+
+            /* 16-bit copies (right and left halves) of tc and of p1, p0, q0. */
+            UNPCK_UB_SH(tc, tc_r, tc_l);
+            UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
+            UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
+            UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
+
+            /* p1 is filtered only where additionally |p2 - p0| < beta. */
+            p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
+            is_less_than_beta = (p2_asub_p0 < beta);
+            is_less_than_beta = is_less_than_beta & is_less_than;
+            {
+                v8u16 is_less_than_beta_r, is_less_than_beta_l;
+
+                /* The shifts presumably isolate one 8-byte half of the mask so
+                 * that each half's work can be skipped when it is empty. */
+                is_less_than_beta_r =
+                    (v8u16) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
+                if (!__msa_test_bz_v((v16u8) is_less_than_beta_r)) {
+                    p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
+
+                    AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r,
+                                     negate_tc_r, tc_r, p1_r);
+                }
+
+                is_less_than_beta_l =
+                    (v8u16) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
+                if (!__msa_test_bz_v((v16u8) is_less_than_beta_l)) {
+                    p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);
+
+                    AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l,
+                                     i16_negatetc_l, tc_l, p1_l);
+                }
+            }
+            if (!__msa_test_bz_v(is_less_than_beta)) {
+                /* Pack both halves back to bytes and take the new p1 only in
+                 * the masked bytes, then write the p1 row. */
+                p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
+                p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
+                ST_UB(p1_org, data - (2 * image_width));
+
+                /* tc grows by one in every byte whose p1 was filtered. */
+                is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
+                tc = tc + (v16i8) is_less_than_beta;
+            }
+
+            /* Same procedure for q1, gated by |q2 - q0| < beta. */
+            u8_q2asub_q0 = __msa_asub_u_b(q2_org, q0_org);
+            is_less_than_beta = (u8_q2asub_q0 < beta);
+            is_less_than_beta = is_less_than_beta & is_less_than;
+
+            {
+                v8u16 is_less_than_beta_r, is_less_than_beta_l;
+                is_less_than_beta_r =
+                    (v8u16) __msa_sldi_b((v16i8) is_less_than_beta, zero, 8);
+
+                /* q1 is widened unconditionally: the p0/q0 filter below needs
+                 * it even when q1 itself is not modified. */
+                q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
+                if (!__msa_test_bz_v((v16u8) is_less_than_beta_r)) {
+                    q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
+
+                    AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r,
+                                     negate_tc_r, tc_r, q1_r);
+                }
+                is_less_than_beta_l =
+                    (v8u16) __msa_sldi_b(zero, (v16i8) is_less_than_beta, 8);
+
+                q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
+                if (!__msa_test_bz_v((v16u8) is_less_than_beta_l)) {
+                    q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);
+
+                    AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l,
+                                     i16_negatetc_l, tc_l, q1_l);
+                }
+            }
+            if (!__msa_test_bz_v(is_less_than_beta)) {
+                q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
+                q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
+                ST_UB(q1_org, data + image_width);
+
+                /* Second possible tc increment, for bytes whose q1 changed. */
+                is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
+                tc = tc + (v16i8) is_less_than_beta;
+            }
+            {
+                v16i8 negate_thresh, sign_negate_thresh;
+                v8i16 threshold_r, threshold_l;
+                v8i16 negate_thresh_l, negate_thresh_r;
+
+                /* Rebuild +/- thresholds from the (possibly incremented) tc.
+                 * The p0/q0 filter takes the original, widened p1/q1 values. */
+                negate_thresh = zero - tc;
+                sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);
+
+                ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh,
+                           threshold_r, negate_thresh_r);
+                AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r,
+                             negate_thresh_r, threshold_r, p0_r, q0_r);
+
+                threshold_l = (v8i16) __msa_ilvl_b(zero, tc);
+                negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
+                                                       negate_thresh);
+                AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l,
+                             negate_thresh_l, threshold_l, p0_l, q0_l);
+            }
+
+            PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0);
+
+            /* Keep the original p0/q0 wherever the filter mask is clear. */
+            p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
+            q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
+
+            ST_UB(p0_org, (data - image_width));
+            ST_UB(q0_org, data);
+        }
+    }
+}
+
+/**
+ * MBAFF variant of the luma filter across a vertical edge: handles eight
+ * rows, two consecutive rows per tc0 entry.
+ *
+ * For every tc0[k] >= 0 the two rows of pair k are read as 8 bytes starting
+ * three bytes before the row pointer and, after filtering, 4 bytes starting
+ * two bytes before it (p1 p0 q0 q1) are written back. Row pairs with
+ * tc0[k] < 0 are neither loaded nor stored.
+ *
+ * @param in       pointer to the q0 sample of the first row
+ * @param stride   distance in bytes between consecutive rows
+ * @param alpha_in threshold compared against |p0 - q0|
+ * @param beta_in  threshold compared against |p1 - p0|, |q1 - q0|,
+ *                 |p2 - p0| and |q2 - q0|
+ * @param tc0      four clipping values, one per pair of rows; a negative
+ *                 value skips that pair
+ */
+static void avc_h_loop_filter_luma_mbaff_msa(uint8_t *in, int32_t stride,
+                                             int32_t alpha_in, int32_t beta_in,
+                                             int8_t *tc0)
+{
+    uint8_t *data = in;
+    uint32_t out0, out1, out2, out3;
+    uint64_t load;
+    uint32_t tc_val;
+    v16u8 alpha, beta;
+    /* Row vectors stay zero for skipped pairs; their results are never
+     * written back. */
+    v16i8 inp0 = { 0 };
+    v16i8 inp1 = { 0 };
+    v16i8 inp2 = { 0 };
+    v16i8 inp3 = { 0 };
+    v16i8 inp4 = { 0 };
+    v16i8 inp5 = { 0 };
+    v16i8 inp6 = { 0 };
+    v16i8 inp7 = { 0 };
+    v16i8 src0, src1, src2, src3;
+    v8i16 src4, src5, src6, src7;
+    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
+    v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
+    v16u8 is_less_than_beta1, is_less_than_beta2;
+    v8i16 tc, tc_orig_r, tc_plus1;
+    v16u8 is_tc_orig1, is_tc_orig2, tc_orig = { 0 };
+    v8i16 p0_ilvr_q0, p0_add_q0, q0_sub_p0, p1_sub_q1;
+    v8u16 src2_r, src3_r;
+    v8i16 p2_r, p1_r, q2_r, q1_r;
+    v16u8 p2, q2, p0, q0;
+    v4i32 dst0, dst1;
+    v16i8 zeros = { 0 };
+
+    alpha = (v16u8) __msa_fill_b(alpha_in);
+    beta = (v16u8) __msa_fill_b(beta_in);
+
+    /* Gather phase: 8 bytes per row (p2 p1 p0 q0 q1 q2 plus two unused),
+     * two rows per tc0 entry. The row pointer advances either way. */
+    if (tc0[0] < 0) {
+        data += (2 * stride);
+    } else {
+        load = LD(data - 3);
+        inp0 = (v16i8) __msa_insert_d((v2i64) inp0, 0, load);
+        load = LD(data - 3 + stride);
+        inp1 = (v16i8) __msa_insert_d((v2i64) inp1, 0, load);
+        data += (2 * stride);
+    }
+
+    if (tc0[1] < 0) {
+        data += (2 * stride);
+    } else {
+        load = LD(data - 3);
+        inp2 = (v16i8) __msa_insert_d((v2i64) inp2, 0, load);
+        load = LD(data - 3 + stride);
+        inp3 = (v16i8) __msa_insert_d((v2i64) inp3, 0, load);
+        data += (2 * stride);
+    }
+
+    if (tc0[2] < 0) {
+        data += (2 * stride);
+    } else {
+        load = LD(data - 3);
+        inp4 = (v16i8) __msa_insert_d((v2i64) inp4, 0, load);
+        load = LD(data - 3 + stride);
+        inp5 = (v16i8) __msa_insert_d((v2i64) inp5, 0, load);
+        data += (2 * stride);
+    }
+
+    if (tc0[3] < 0) {
+        data += (2 * stride);
+    } else {
+        load = LD(data - 3);
+        inp6 = (v16i8) __msa_insert_d((v2i64) inp6, 0, load);
+        load = LD(data - 3 + stride);
+        inp7 = (v16i8) __msa_insert_d((v2i64) inp7, 0, load);
+        data += (2 * stride);
+    }
+
+    /* Transpose by successive byte/halfword/word interleaves so that each
+     * vector holds one pixel column for all eight rows. As used below:
+     * src0..src5 = p2, p1, p0, q0, q1, q2. */
+    ILVR_B4_SB(inp1, inp0, inp3, inp2, inp5, inp4, inp7, inp6,
+               src0, src1, src2, src3);
+
+    ILVR_H2_SH(src1, src0, src3, src2, src4, src6);
+    ILVL_H2_SH(src1, src0, src3, src2, src5, src7);
+
+    src0 = (v16i8) __msa_ilvr_w((v4i32) src6, (v4i32) src4);
+    src1 = __msa_sldi_b(zeros, (v16i8) src0, 8);
+    src2 = (v16i8) __msa_ilvl_w((v4i32) src6, (v4i32) src4);
+    src3 = __msa_sldi_b(zeros, (v16i8) src2, 8);
+    src4 = (v8i16) __msa_ilvr_w((v4i32) src7, (v4i32) src5);
+    src5 = (v8i16) __msa_sldi_b(zeros, (v16i8) src4, 8);
+
+    p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
+    p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
+    q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
+    p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
+    q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
+
+    /* Edge mask: |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta.
+     * The sign of tc0 is not folded in here; skipped pairs are simply not
+     * stored at the end. */
+    is_less_than_alpha = (p0_asub_q0 < alpha);
+    is_less_than_beta = (p1_asub_p0 < beta);
+    is_less_than = is_less_than_alpha & is_less_than_beta;
+    is_less_than_beta = (q1_asub_q0 < beta);
+    is_less_than = is_less_than_beta & is_less_than;
+
+    /* Extra conditions that enable the p1 and q1 updates respectively. */
+    is_less_than_beta1 = (p2_asub_p0 < beta);
+    is_less_than_beta2 = (q2_asub_q0 < beta);
+
+    /* Interleave p0 with q0 and add neighbouring bytes, then halve with
+     * rounding: presumably (p0 + q0 + 1) >> 1 in 16-bit lanes. */
+    p0_ilvr_q0 = (v8i16) __msa_ilvr_b((v16i8) src3, (v16i8) src2);
+    p0_add_q0 = (v8i16) __msa_hadd_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
+    p0_add_q0 = __msa_srari_h(p0_add_q0, 1);
+
+    /* Unclipped p1 delta: ((p2 + avg) >> 1) - p1. Note that from here on
+     * p2_r / q2_r carry the p1 / q1 correction, not the p2 / q2 samples. */
+    ILVR_B2_SH(zeros, src0, zeros, src1, p2_r, p1_r);
+    p2_r += p0_add_q0;
+    p2_r >>= 1;
+    p2_r -= p1_r;
+    /* Same for q1: ((q2 + avg) >> 1) - q1. */
+    ILVR_B2_SH(zeros, src5, zeros, src4, q2_r, q1_r);
+    q2_r += p0_add_q0;
+    q2_r >>= 1;
+    q2_r -= q1_r;
+
+    /* Fetch the four tc0 bytes at once and duplicate each byte so that both
+     * rows of a pair share it, then widen to 16-bit lanes. */
+    tc_val = LW(tc0);
+    tc_orig = (v16u8) __msa_insert_w((v4i32) tc_orig, 0, tc_val);
+    tc_orig = (v16u8) __msa_ilvr_b((v16i8) tc_orig, (v16i8) tc_orig);
+    is_tc_orig1 = tc_orig;
+    is_tc_orig2 = tc_orig;
+    tc_orig_r = (v8i16) __msa_ilvr_b(zeros, (v16i8) tc_orig);
+    tc = tc_orig_r;
+
+    /* Limit both deltas to [-tc0, tc0] (CLIP_SH is defined elsewhere;
+     * presumably a clamp) and add them back onto p1 / q1. */
+    p2_r = CLIP_SH(p2_r, -tc_orig_r, tc_orig_r);
+    q2_r = CLIP_SH(q2_r, -tc_orig_r, tc_orig_r);
+
+    p2_r += p1_r;
+    q2_r += q1_r;
+
+    PCKEV_B2_UB(p2_r, p2_r, q2_r, q2_r, p2, q2);
+
+    /* p1 / q1 are replaced only where tc0 is non-zero, the respective
+     * p2 / q2 condition holds and the edge mask is set. */
+    is_tc_orig1 = (zeros < is_tc_orig1);
+    is_tc_orig2 = is_tc_orig1;
+    is_tc_orig1 = is_less_than_beta1 & is_tc_orig1;
+    is_tc_orig2 = is_less_than_beta2 & is_tc_orig2;
+    is_tc_orig1 = is_less_than & is_tc_orig1;
+    is_tc_orig2 = is_less_than & is_tc_orig2;
+
+    /* p2 / q2 now hold the final p1 / q1 bytes. */
+    p2 = __msa_bmnz_v((v16u8) src1, p2, is_tc_orig1);
+    q2 = __msa_bmnz_v((v16u8) src4, q2, is_tc_orig2);
+
+    /* p0/q0 delta: (((q0 - p0) << 2) + (p1 - q1)) shifted right by 3 with
+     * rounding. */
+    q0_sub_p0 = __msa_hsub_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
+    q0_sub_p0 <<= 2;
+    p1_sub_q1 = p1_r - q1_r;
+    q0_sub_p0 += p1_sub_q1;
+    q0_sub_p0 = __msa_srari_h(q0_sub_p0, 3);
+
+    /* tc = tc0 + (|p2-p0| < beta) + (|q2-q0| < beta). The byte masks are
+     * interleaved with themselves to cover whole 16-bit lanes. */
+    tc_plus1 = tc + 1;
+    is_less_than_beta1 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta1,
+                                              (v16i8) is_less_than_beta1);
+    tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta1);
+    tc_plus1 = tc + 1;
+    is_less_than_beta2 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta2,
+                                              (v16i8) is_less_than_beta2);
+    tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta2);
+
+    q0_sub_p0 = CLIP_SH(q0_sub_p0, -tc, tc);
+
+    /* Apply the delta with opposite signs to p0 and q0, then bring the
+     * results back into byte range (CLIP_SH_0_255 is defined elsewhere). */
+    ILVR_B2_UH(zeros, src2, zeros, src3, src2_r, src3_r);
+    src2_r += q0_sub_p0;
+    src3_r -= q0_sub_p0;
+
+    src2_r = (v8u16) CLIP_SH_0_255(src2_r);
+    src3_r = (v8u16) CLIP_SH_0_255(src3_r);
+
+    PCKEV_B2_UB(src2_r, src2_r, src3_r, src3_r, p0, q0);
+
+    /* Keep the original p0/q0 where the edge mask is clear. */
+    p0 = __msa_bmnz_v((v16u8) src2, p0, is_less_than);
+    q0 = __msa_bmnz_v((v16u8) src3, q0, is_less_than);
+
+    /* Transpose back: pair up (p1, p0) and (q0, q1) bytes, then interleave
+     * halfwords so each 32-bit word presumably holds p1 p0 q0 q1 of one row. */
+    ILVR_B2_UB(p0, p2, q2, q0, p2, q2);
+
+    ILVRL_H2_SW(q2, p2, dst0, dst1);
+
+    /* Scatter phase: walk the rows again from the top. */
+    data = in;
+
+    out0 = __msa_copy_u_w(dst0, 0);
+    out1 = __msa_copy_u_w(dst0, 1);
+    out2 = __msa_copy_u_w(dst0, 2);
+    out3 = __msa_copy_u_w(dst0, 3);
+
+    if (tc0[0] < 0) {
+        data += (2 * stride);
+    } else {
+        SW(out0, (data - 2));
+        data += stride;
+        SW(out1, (data - 2));
+        data += stride;
+    }
+
+    if (tc0[1] < 0) {
+        data += (2 * stride);
+    } else {
+        SW(out2, (data - 2));
+        data += stride;
+        SW(out3, (data - 2));
+        data += stride;
+    }
+
+    out0 = __msa_copy_u_w(dst1, 0);
+    out1 = __msa_copy_u_w(dst1, 1);
+    out2 = __msa_copy_u_w(dst1, 2);
+    out3 = __msa_copy_u_w(dst1, 3);
+
+    if (tc0[2] < 0) {
+        data += (2 * stride);
+    } else {
+        SW(out0, (data - 2));
+        data += stride;
+        SW(out1, (data - 2));
+        data += stride;
+    }
+
+    /* Last pair: no pointer bookkeeping needed when it is skipped. */
+    if (tc0[3] >= 0) {
+        SW(out2, (data - 2));
+        data += stride;
+        SW(out3, (data - 2));
+    }
+}
+
+/**
+ * Deblock one horizontal chroma (Cb or Cr) edge with the tc-clipped filter.
+ *
+ * Four rows are read starting two rows before data (p1, p0, q0, q1); only
+ * the p0 and q0 rows are written back, and only their low 8 bytes can change.
+ *
+ * @param data      pointer to the first row on the q side of the edge (q0)
+ * @param bs0       strength for the first halfword (two adjacent bytes);
+ *                  bs1, bs2 and bs3 govern the following halfwords,
+ *                  0 disables filtering
+ * @param tc0       clipping threshold for the first halfword; tc1, tc2 and
+ *                  tc3 are laid out like bs1..bs3
+ * @param alpha_in  threshold compared against |p0 - q0|
+ * @param beta_in   threshold compared against |p1 - p0| and |q1 - q0|
+ * @param img_width distance in bytes between consecutive rows
+ */
+static void avc_loopfilter_cb_or_cr_inter_edge_hor_msa(uint8_t *data,
+                                                       uint8_t bs0, uint8_t bs1,
+                                                       uint8_t bs2, uint8_t bs3,
+                                                       uint8_t tc0, uint8_t tc1,
+                                                       uint8_t tc2, uint8_t tc3,
+                                                       uint8_t alpha_in,
+                                                       uint8_t beta_in,
+                                                       uint32_t img_width)
+{
+    v16i8 zero = { 0 };
+    v8i16 bs = { 0 };
+    v8i16 tc = { 0 };
+    v16u8 alpha, beta;
+    v16u8 filter_mask;
+    v16u8 p1_org, p0_org, q0_org, q1_org;
+    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
+    v16i8 negate_tc, sign_negate_tc;
+    v8i16 tc_r, negate_tc_r;
+    v8i16 p0_r, q0_r;
+    v16u8 p0, q0;
+
+    /* Strength K is replicated into both bytes of halfword K. */
+    bs = __msa_insve_h(bs, 0, (v8i16) __msa_fill_b(bs0));
+    bs = __msa_insve_h(bs, 1, (v8i16) __msa_fill_b(bs1));
+    bs = __msa_insve_h(bs, 2, (v8i16) __msa_fill_b(bs2));
+    bs = __msa_insve_h(bs, 3, (v8i16) __msa_fill_b(bs3));
+
+    /* All strengths zero: the edge is left untouched. */
+    if (__msa_test_bz_v((v16u8) bs))
+        return;
+
+    /* Clipping thresholds use the same halfword layout. */
+    tc = __msa_insve_h(tc, 0, (v8i16) __msa_fill_b(tc0));
+    tc = __msa_insve_h(tc, 1, (v8i16) __msa_fill_b(tc1));
+    tc = __msa_insve_h(tc, 2, (v8i16) __msa_fill_b(tc2));
+    tc = __msa_insve_h(tc, 3, (v8i16) __msa_fill_b(tc3));
+
+    alpha = (v16u8) __msa_fill_b(alpha_in);
+    beta  = (v16u8) __msa_fill_b(beta_in);
+
+    LD_UB4(data - (img_width << 1), img_width,
+           p1_org, p0_org, q0_org, q1_org);
+
+    /* bs > 0 && |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta */
+    filter_mask  = (v16u8) (zero < (v16i8) bs);
+    filter_mask &= (v16u8) (__msa_asub_u_b(p0_org, q0_org) < alpha);
+    filter_mask &= (v16u8) (__msa_asub_u_b(p1_org, p0_org) < beta);
+    filter_mask &= (v16u8) (__msa_asub_u_b(q1_org, q0_org) < beta);
+
+    /* Only the low doubleword of the mask is kept; the rest is zeroed. */
+    filter_mask = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) filter_mask);
+
+    if (__msa_test_bz_v(filter_mask))
+        return;
+
+    /* -tc and its sign bytes, used to build signed 16-bit limits. */
+    negate_tc      = zero - (v16i8) tc;
+    sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
+
+    ILVR_B2_SH(zero, tc, sign_negate_tc, negate_tc, tc_r, negate_tc_r);
+
+    ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
+               p1_org_r, p0_org_r, q0_org_r, q1_org_r);
+
+    AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
+                 tc_r, p0_r, q0_r);
+
+    PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);
+
+    /* Filtered bytes replace the originals only where the mask is set. */
+    p0_org = __msa_bmnz_v(p0_org, p0, filter_mask);
+    q0_org = __msa_bmnz_v(q0_org, q0, filter_mask);
+
+    ST_UB(q0_org, data);
+    ST_UB(p0_org, (data - img_width));
+}
+
+/**
+ * Deblock one vertical chroma (Cb or Cr) edge with the tc-clipped filter.
+ *
+ * Eight rows are read starting two bytes before data, transposed into
+ * p1/p0/q0/q1 column vectors, filtered, and the resulting p0/q0 byte pairs
+ * are written back one byte before data on each of the eight rows.
+ *
+ * @param data      pointer to the q0 sample of the first row
+ * @param bs0       strength for the first halfword (two adjacent rows after
+ *                  the transpose); bs1, bs2 and bs3 govern the following
+ *                  halfwords, 0 disables filtering
+ * @param tc0       clipping threshold for the first halfword; tc1, tc2 and
+ *                  tc3 are laid out like bs1..bs3
+ * @param alpha_in  threshold compared against |p0 - q0|
+ * @param beta_in   threshold compared against |p1 - p0| and |q1 - q0|
+ * @param img_width distance in bytes between consecutive rows
+ */
+static void avc_loopfilter_cb_or_cr_inter_edge_ver_msa(uint8_t *data,
+                                                       uint8_t bs0, uint8_t bs1,
+                                                       uint8_t bs2, uint8_t bs3,
+                                                       uint8_t tc0, uint8_t tc1,
+                                                       uint8_t tc2, uint8_t tc3,
+                                                       uint8_t alpha_in,
+                                                       uint8_t beta_in,
+                                                       uint32_t img_width)
+{
+    v16i8 zero = { 0 };
+    v8i16 bs = { 0 };
+    v8i16 tc = { 0 };
+    v8i16 p0_r = { 0 };
+    v8i16 q0_r = { 0 };
+    uint8_t *src;
+    v16u8 alpha, beta;
+    v16u8 filter_mask;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+    v16u8 p1_org, p0_org, q0_org, q1_org;
+    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
+    v16i8 negate_tc, sign_negate_tc;
+    v8i16 tc_r, negate_tc_r;
+    v16u8 p0, q0;
+    v8i16 tmp1;
+
+    /* Strength K is replicated into both bytes of halfword K. */
+    bs = __msa_insve_h(bs, 0, (v8i16) __msa_fill_b(bs0));
+    bs = __msa_insve_h(bs, 1, (v8i16) __msa_fill_b(bs1));
+    bs = __msa_insve_h(bs, 2, (v8i16) __msa_fill_b(bs2));
+    bs = __msa_insve_h(bs, 3, (v8i16) __msa_fill_b(bs3));
+
+    /* All strengths zero: the edge is left untouched. */
+    if (__msa_test_bz_v((v16u8) bs))
+        return;
+
+    /* Clipping thresholds use the same halfword layout. */
+    tc = __msa_insve_h(tc, 0, (v8i16) __msa_fill_b(tc0));
+    tc = __msa_insve_h(tc, 1, (v8i16) __msa_fill_b(tc1));
+    tc = __msa_insve_h(tc, 2, (v8i16) __msa_fill_b(tc2));
+    tc = __msa_insve_h(tc, 3, (v8i16) __msa_fill_b(tc3));
+
+    alpha = (v16u8) __msa_fill_b(alpha_in);
+    beta  = (v16u8) __msa_fill_b(beta_in);
+
+    /* Fetch eight rows around the edge and turn columns into vectors. */
+    LD_UB8((data - 2), img_width,
+           row0, row1, row2, row3, row4, row5, row6, row7);
+
+    TRANSPOSE8x4_UB_UB(row0, row1, row2, row3,
+                       row4, row5, row6, row7,
+                       p1_org, p0_org, q0_org, q1_org);
+
+    /* bs > 0 && |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta */
+    filter_mask  = (v16u8) (zero < (v16i8) bs);
+    filter_mask &= (v16u8) (__msa_asub_u_b(p0_org, q0_org) < alpha);
+    filter_mask &= (v16u8) (__msa_asub_u_b(p1_org, p0_org) < beta);
+    filter_mask &= (v16u8) (__msa_asub_u_b(q1_org, q0_org) < beta);
+
+    /* Only the low doubleword of the mask is kept; the rest is zeroed. */
+    filter_mask = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) filter_mask);
+
+    if (__msa_test_bz_v(filter_mask))
+        return;
+
+    ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
+               p1_org_r, p0_org_r, q0_org_r, q1_org_r);
+
+    /* -tc and its sign bytes, used to build signed 16-bit limits. */
+    negate_tc      = zero - (v16i8) tc;
+    sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
+
+    ILVR_B2_SH(sign_negate_tc, negate_tc, zero, tc, negate_tc_r, tc_r);
+
+    AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
+                 tc_r, p0_r, q0_r);
+
+    PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);
+
+    /* Filtered bytes replace the originals only where the mask is set. */
+    p0_org = __msa_bmnz_v(p0_org, p0, filter_mask);
+    q0_org = __msa_bmnz_v(q0_org, q0, filter_mask);
+
+    /* Pair p0 with q0 per row and emit two bytes on each of eight rows. */
+    tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_org, (v16i8) p0_org);
+
+    src = data - 1;
+    ST2x4_UB(tmp1, 0, src, img_width);
+    src += 4 * img_width;
+    ST2x4_UB(tmp1, 4, src, img_width);
+}
+
+/**
+ * Filter a vertical chroma edge for 4:2:2 content, four rows per tc0 entry
+ * (sixteen rows in total).
+ *
+ * @param src      pointer to the q0 sample of the first row
+ * @param stride   distance in bytes between consecutive rows
+ * @param alpha_in value broadcast into the alpha threshold vector
+ * @param beta_in  value broadcast into the beta threshold vector
+ * @param tc0      four clipping values; groups whose value is <= 0 are
+ *                 left untouched
+ */
+static void avc_h_loop_filter_chroma422_msa(uint8_t *src, int32_t stride,
+                                            int32_t alpha_in, int32_t beta_in,
+                                            int8_t *tc0)
+{
+    int32_t col, tc_val;
+    v16u8 res;
+    v16u8 alpha = (v16u8) __msa_fill_b(alpha_in);
+    v16u8 beta = (v16u8) __msa_fill_b(beta_in);
+
+    /* Every group spans four rows whether or not it gets filtered. */
+    for (col = 0; col < 4; col++, src += (4 * stride)) {
+        /* numerically the same as (tc0[col] - 1) + 1 */
+        tc_val = tc0[col];
+
+        if (tc_val > 0) {
+            AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res);
+            /* two bytes (p0, q0) on each of the four rows */
+            ST2x4_UB(res, 0, (src - 1), stride);
+        }
+    }
+}
+
+/**
+ * MBAFF variant of the 4:2:2 vertical chroma edge filter: two rows per
+ * tc0 entry (eight rows in total).
+ *
+ * @param src      pointer to the q0 sample of the first row
+ * @param stride   distance in bytes between consecutive rows
+ * @param alpha_in value broadcast into the alpha threshold vector
+ * @param beta_in  value broadcast into the beta threshold vector
+ * @param tc0      four clipping values; row pairs whose value is <= 0 are
+ *                 left untouched
+ */
+static void avc_h_loop_filter_chroma422_mbaff_msa(uint8_t *src, int32_t stride,
+                                                  int32_t alpha_in,
+                                                  int32_t beta_in,
+                                                  int8_t *tc0)
+{
+    int32_t col, tc_val;
+    int16_t out0, out1;
+    v16u8 alpha, beta, res;
+
+    alpha = (v16u8) __msa_fill_b(alpha_in);
+    beta = (v16u8) __msa_fill_b(beta_in);
+
+    for (col = 0; col < 4; col++) {
+        tc_val = (tc0[col] - 1) + 1;
+
+        if (tc_val <= 0) {
+            /* A skipped entry must step over exactly the two rows it owns,
+             * matching the advance of the filtering path below; stepping
+             * over four rows would misalign every later entry. */
+            src += 2 * stride;
+            continue;
+        }
+
+        AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res);
+
+        /* One halfword (p0, q0) per row. */
+        out0 = __msa_copy_s_h((v8i16) res, 0);
+        out1 = __msa_copy_s_h((v8i16) res, 1);
+
+        SH(out0, (src - 1));
+        src += stride;
+        SH(out1, (src - 1));
+        src += stride;
+    }
+}
+
+/**
+ * Luma loop-filter entry point that forwards to
+ * avc_loopfilter_luma_inter_edge_ver_msa().
+ *
+ * @param data      pointer handed through to the kernel unchanged
+ * @param img_width row stride handed through to the kernel
+ * @param alpha     alpha threshold
+ * @param beta      beta threshold
+ * @param tc        four clipping values; a negative entry switches the
+ *                  corresponding group off (strength 0), any other entry
+ *                  yields strength 1
+ */
+void ff_h264_h_lpf_luma_inter_msa(uint8_t *data, int img_width,
+                                  int alpha, int beta, int8_t *tc)
+{
+    uint8_t bs[4];
+    int i;
+
+    /* strength is 1 unless the matching tc entry is negative */
+    for (i = 0; i < 4; i++)
+        bs[i] = !(tc[i] < 0);
+
+    avc_loopfilter_luma_inter_edge_ver_msa(data, bs[0], bs[1], bs[2], bs[3],
+                                           tc[0], tc[1], tc[2], tc[3],
+                                           alpha, beta, img_width);
+}
+
+/*
+ * Inter luma deblocking entry point; forwards to the *_edge_hor_msa kernel
+ * (the "v" entry point dispatches to the "hor" edge routine).
+ *
+ * data, img_width, alpha and beta are passed through unchanged.  The four
+ * tc[] entries are passed as-is, each accompanied by a bs flag derived from
+ * its sign.
+ */
+void ff_h264_v_lpf_luma_inter_msa(uint8_t *data, int img_width,
+                                  int alpha, int beta, int8_t *tc)
+{
+    uint8_t bs[4];
+    int seg;
+
+    /* bs is 0 for a negative tc entry and 1 otherwise. */
+    for (seg = 0; seg < 4; seg++)
+        bs[seg] = (tc[seg] < 0) ? 0 : 1;
+
+    avc_loopfilter_luma_inter_edge_hor_msa(data, bs[0], bs[1], bs[2], bs[3],
+                                           tc[0], tc[1], tc[2], tc[3],
+                                           alpha, beta, img_width);
+}
+
+/*
+ * Inter chroma deblocking entry point; forwards to the cb_or_cr
+ * *_edge_ver_msa kernel.
+ *
+ * data, img_width, alpha and beta are passed through unchanged.  The four
+ * tc[] entries are passed as-is, each with a bs flag derived from its sign.
+ */
+void ff_h264_h_lpf_chroma_inter_msa(uint8_t *data, int img_width,
+                                    int alpha, int beta, int8_t *tc)
+{
+    uint8_t bs[4];
+    int seg;
+
+    /* bs is 0 for a negative tc entry and 1 otherwise. */
+    for (seg = 0; seg < 4; seg++)
+        bs[seg] = (tc[seg] < 0) ? 0 : 1;
+
+    avc_loopfilter_cb_or_cr_inter_edge_ver_msa(data, bs[0], bs[1], bs[2], bs[3],
+                                               tc[0], tc[1], tc[2], tc[3],
+                                               alpha, beta, img_width);
+}
+
+/*
+ * Inter chroma deblocking entry point; forwards to the cb_or_cr
+ * *_edge_hor_msa kernel.
+ *
+ * data, img_width, alpha and beta are passed through unchanged.  The four
+ * tc[] entries are passed as-is, each with a bs flag derived from its sign.
+ */
+void ff_h264_v_lpf_chroma_inter_msa(uint8_t *data, int img_width,
+                                    int alpha, int beta, int8_t *tc)
+{
+    uint8_t bs[4];
+    int seg;
+
+    /* bs is 0 for a negative tc entry and 1 otherwise. */
+    for (seg = 0; seg < 4; seg++)
+        bs[seg] = (tc[seg] < 0) ? 0 : 1;
+
+    avc_loopfilter_cb_or_cr_inter_edge_hor_msa(data, bs[0], bs[1], bs[2], bs[3],
+                                               tc[0], tc[1], tc[2], tc[3],
+                                               alpha, beta, img_width);
+}
+
+void ff_h264_h_lpf_luma_intra_msa(uint8_t *data, int img_width,
+                                  int alpha, int beta)
+{
+    /* alpha/beta are cast to uint8_t and img_width to unsigned int here. */
+    avc_loopfilter_luma_intra_edge_ver_msa(data,
+        (uint8_t) alpha, (uint8_t) beta, (unsigned int) img_width);
+}
+
+void ff_h264_v_lpf_luma_intra_msa(uint8_t *data, int img_width,
+                                  int alpha, int beta)
+{
+    /* alpha/beta are cast to uint8_t and img_width to unsigned int here. */
+    avc_loopfilter_luma_intra_edge_hor_msa(data,
+        (uint8_t) alpha, (uint8_t) beta, (unsigned int) img_width);
+}
+
+void ff_h264_h_lpf_chroma_intra_msa(uint8_t *data, int img_width,
+                                    int alpha, int beta)
+{
+    /* alpha/beta are cast to uint8_t and img_width to unsigned int here. */
+    avc_loopfilter_cb_or_cr_intra_edge_ver_msa(data,
+        (uint8_t) alpha, (uint8_t) beta, (unsigned int) img_width);
+}
+
+void ff_h264_v_lpf_chroma_intra_msa(uint8_t *data, int img_width,
+                                    int alpha, int beta)
+{
+    /* alpha/beta are cast to uint8_t and img_width to unsigned int here. */
+    avc_loopfilter_cb_or_cr_intra_edge_hor_msa(data,
+        (uint8_t) alpha, (uint8_t) beta, (unsigned int) img_width);
+}
+
+/* Exported wrapper: forwards every argument to the static 4:2:2 filter. */
+void ff_h264_h_loop_filter_chroma422_msa(uint8_t *src, int32_t ystride,
+                                         int32_t alpha, int32_t beta,
+                                         int8_t *tc0)
+{
+    avc_h_loop_filter_chroma422_msa(src, ystride, alpha, beta, tc0);
+}
+
+/* Exported wrapper: forwards every argument, unchanged and in order, to the
+ * static MBAFF 4:2:2 chroma filter. */
+void ff_h264_h_loop_filter_chroma422_mbaff_msa(uint8_t *src, int32_t ystride,
+                                               int32_t alpha, int32_t beta,
+                                               int8_t *tc0)
+{
+    avc_h_loop_filter_chroma422_mbaff_msa(src, ystride, alpha, beta, tc0);
+}
+
+/* Exported wrapper: forwards every argument, unchanged and in order, to the
+ * static MBAFF luma filter. */
+void ff_h264_h_loop_filter_luma_mbaff_msa(uint8_t *src, int32_t ystride,
+                                          int32_t alpha, int32_t beta,
+                                          int8_t *tc0)
+{
+    avc_h_loop_filter_luma_mbaff_msa(src, ystride, alpha, beta, tc0);
+}
+
+/* Exported wrapper: forwards every argument, unchanged and in order, to the
+ * static MBAFF intra luma filter (this variant takes no tc0 table). */
+void ff_h264_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, int32_t ystride,
+                                                int32_t alpha, int32_t beta)
+{
+    avc_h_loop_filter_luma_mbaff_intra_msa(src, ystride, alpha, beta);
+}
+
+/* Exported wrapper: arguments go unchanged to the 16-wide weighting kernel. */
+void ff_weight_h264_pixels16_8_msa(uint8_t *src, int stride, int height,
+                                   int log2_denom, int weight_src, int offset)
+{
+    avc_wgt_16width_msa(src, stride, height, log2_denom, weight_src, offset);
+}
+
+/* Exported wrapper: arguments go unchanged to the 8-wide weighting kernel. */
+void ff_weight_h264_pixels8_8_msa(uint8_t *src, int stride, int height,
+                                  int log2_denom, int weight_src, int offset)
+{
+    avc_wgt_8width_msa(src, stride, height, log2_denom, weight_src, offset);
+}
+
+/* Exported wrapper: arguments go unchanged to the 4-wide weighting kernel. */
+void ff_weight_h264_pixels4_8_msa(uint8_t *src, int stride, int height,
+                                  int log2_denom, int weight_src, int offset)
+{
+    avc_wgt_4width_msa(src, stride, height, log2_denom, weight_src, offset);
+}
+
+/* The kernel takes src before dst (same stride twice), weight_src first. */
+void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src, int stride,
+                                     int height, int log2_denom,
+                                     int weight_dst, int weight_src, int offset)
+{
+    avc_biwgt_16width_msa(src, stride, dst, stride, height, log2_denom,
+                          weight_src, weight_dst, offset);
+}
+
+/* The kernel takes src before dst (same stride twice), weight_src first. */
+void ff_biweight_h264_pixels8_8_msa(uint8_t *dst, uint8_t *src, int stride,
+                                    int height, int log2_denom,
+                                    int weight_dst, int weight_src, int offset)
+{
+    avc_biwgt_8width_msa(src, stride, dst, stride, height, log2_denom,
+                         weight_src, weight_dst, offset);
+}
+
+/* The kernel takes src before dst (same stride twice), weight_src first. */
+void ff_biweight_h264_pixels4_8_msa(uint8_t *dst, uint8_t *src, int stride,
+                                    int height, int log2_denom,
+                                    int weight_dst, int weight_src, int offset)
+{
+    avc_biwgt_4width_msa(src, stride, dst, stride, height, log2_denom,
+                         weight_src, weight_dst, offset);
+}
diff --git a/libavcodec/mips/h264idct_msa.c b/libavcodec/mips/h264idct_msa.c
new file mode 100644
index 0000000..fac1e7a
--- /dev/null
+++ b/libavcodec/mips/h264idct_msa.c
@@ -0,0 +1,469 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "h264dsp_mips.h"
+#include "libavcodec/bit_depth_template.c"
+
+/* One 1-D pass of the 4-point inverse transform used by the 4x4 IDCT:
+ * even terms in0 +/- in2, odd terms from in1/in3, then BUTTERFLY_4. */
+#define AVC_ITRANS_H(in0, in1, in2, in3, out0, out1, out2, out3)          \
+{                                                                         \
+    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                 \
+                                                                          \
+    tmp0_m = in0 + in2;                                                   \
+    tmp1_m = in0 - in2;                                                   \
+    tmp2_m = (in1 >> 1) - in3;                                            \
+    tmp3_m = in1 + (in3 >> 1);                                            \
+                                                                          \
+    BUTTERFLY_4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, out0, out1, out2, out3);  \
+}
+
+static void avc_idct4x4_addblk_msa(uint8_t *dst, int16_t *src,
+                                   int32_t dst_stride)
+{
+    v8i16 src0, src1, src2, src3;
+    v8i16 hres0, hres1, hres2, hres3;
+    v8i16 vres0, vres1, vres2, vres3;
+    v8i16 zeros = { 0 };
+    /* Two AVC_ITRANS_H passes around a transpose, shift by 6, add into dst. */
+    LD4x4_SH(src, src0, src1, src2, src3);
+    AVC_ITRANS_H(src0, src1, src2, src3, hres0, hres1, hres2, hres3);
+    TRANSPOSE4x4_SH_SH(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3);
+    AVC_ITRANS_H(hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3);
+    SRARI_H4_SH(vres0, vres1, vres2, vres3, 6);
+    ADDBLK_ST4x4_UB(vres0, vres1, vres2, vres3, dst, dst_stride);
+    ST_SH2(zeros, zeros, src, 8); /* presumably clears the 16 coefficients */
+}
+
+static void avc_idct4x4_addblk_dc_msa(uint8_t *dst, int16_t *src,
+                                      int32_t dst_stride)
+{
+    int16_t dc;
+    uint32_t src0, src1, src2, src3;
+    v16u8 pred = { 0 };
+    v16i8 out;
+    v8i16 input_dc, pred_r, pred_l;
+    /* DC-only 4x4 add: round src[0] with (x + 32) >> 6, then clear it. */
+    dc = (src[0] + 32) >> 6;
+    input_dc = __msa_fill_h(dc);
+    src[0] = 0;
+    /* Four words read from dst rows are packed into pred and widened. */
+    LW4(dst, dst_stride, src0, src1, src2, src3);
+    INSERT_W4_UB(src0, src1, src2, src3, pred);
+    UNPCK_UB_SH(pred, pred_r, pred_l);
+
+    pred_r += input_dc;
+    pred_l += input_dc;
+    /* Clip to 0..255, pack back to bytes and store to dst. */
+    CLIP_SH2_0_255(pred_r, pred_l);
+    out = __msa_pckev_b((v16i8) pred_l, (v16i8) pred_r);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void avc_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src,
+                                     int32_t de_q_val)
+{
+#define DC_DEST_STRIDE 16
+    int16_t out0, out1, out2, out3;
+    v8i16 src0, src1, src2, src3;
+    v8i16 vec0, vec1, vec2, vec3;
+    v8i16 hres0, hres1, hres2, hres3;
+    v8i16 vres0, vres1, vres2, vres3;
+    v4i32 vres0_r, vres1_r, vres2_r, vres3_r;
+    v4i32 de_q_vec = __msa_fill_w(de_q_val);
+    /* 4x4 transform of src: butterflies applied around two transposes. */
+    LD4x4_SH(src, src0, src1, src2, src3);
+    TRANSPOSE4x4_SH_SH(src0, src1, src2, src3, src0, src1, src2, src3);
+    BUTTERFLY_4(src0, src2, src3, src1, vec0, vec3, vec2, vec1);
+    BUTTERFLY_4(vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1);
+    TRANSPOSE4x4_SH_SH(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3);
+    BUTTERFLY_4(hres0, hres1, hres3, hres2, vec0, vec3, vec2, vec1);
+    BUTTERFLY_4(vec0, vec1, vec2, vec3, vres0, vres1, vres2, vres3);
+    UNPCK_R_SH_SW(vres0, vres0_r);
+    UNPCK_R_SH_SW(vres1, vres1_r);
+    UNPCK_R_SH_SW(vres2, vres2_r);
+    UNPCK_R_SH_SW(vres3, vres3_r);
+    /* Scale in 32-bit lanes by de_q_val; SRARI presumably rounds the >> 8. */
+    vres0_r *= de_q_vec;
+    vres1_r *= de_q_vec;
+    vres2_r *= de_q_vec;
+    vres3_r *= de_q_vec;
+
+    SRARI_W4_SW(vres0_r, vres1_r, vres2_r, vres3_r, 8);
+    PCKEV_H2_SH(vres1_r, vres0_r, vres3_r, vres2_r, vec0, vec1);
+    /* Four results per group go to dst + {0, 2, 8, 10} * DC_DEST_STRIDE. */
+    out0 = __msa_copy_s_h(vec0, 0);
+    out1 = __msa_copy_s_h(vec0, 1);
+    out2 = __msa_copy_s_h(vec0, 2);
+    out3 = __msa_copy_s_h(vec0, 3);
+    SH(out0, dst);
+    SH(out1, (dst + 2 * DC_DEST_STRIDE));
+    SH(out2, (dst + 8 * DC_DEST_STRIDE));
+    SH(out3, (dst + 10 * DC_DEST_STRIDE));
+    dst += DC_DEST_STRIDE;
+
+    out0 = __msa_copy_s_h(vec0, 4);
+    out1 = __msa_copy_s_h(vec0, 5);
+    out2 = __msa_copy_s_h(vec0, 6);
+    out3 = __msa_copy_s_h(vec0, 7);
+    SH(out0, dst);
+    SH(out1, (dst + 2 * DC_DEST_STRIDE));
+    SH(out2, (dst + 8 * DC_DEST_STRIDE));
+    SH(out3, (dst + 10 * DC_DEST_STRIDE));
+    dst += (3 * DC_DEST_STRIDE);
+
+    out0 = __msa_copy_s_h(vec1, 0);
+    out1 = __msa_copy_s_h(vec1, 1);
+    out2 = __msa_copy_s_h(vec1, 2);
+    out3 = __msa_copy_s_h(vec1, 3);
+    SH(out0, dst);
+    SH(out1, (dst + 2 * DC_DEST_STRIDE));
+    SH(out2, (dst + 8 * DC_DEST_STRIDE));
+    SH(out3, (dst + 10 * DC_DEST_STRIDE));
+    dst += DC_DEST_STRIDE;
+
+    out0 = __msa_copy_s_h(vec1, 4);
+    out1 = __msa_copy_s_h(vec1, 5);
+    out2 = __msa_copy_s_h(vec1, 6);
+    out3 = __msa_copy_s_h(vec1, 7);
+    SH(out0, dst);
+    SH(out1, (dst + 2 * DC_DEST_STRIDE));
+    SH(out2, (dst + 8 * DC_DEST_STRIDE));
+    SH(out3, (dst + 10 * DC_DEST_STRIDE));
+
+#undef DC_DEST_STRIDE
+}
+
+static void avc_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
+{
+    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 vec0, vec1, vec2, vec3;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r;
+    v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l;
+    v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l;
+    v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r;
+    v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l;
+    v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16i8 zeros = { 0 };
+    /* Pre-add the rounding term for the final >> 6 to the first coefficient. */
+    src[0] += 32;
+
+    LD_SH8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
+    /* First pass, 16-bit lanes: terms built from src0, src2, src4, src6. */
+    vec0 = src0 + src4;
+    vec1 = src0 - src4;
+    vec2 = src2 >> 1;
+    vec2 = vec2 - src6;
+    vec3 = src6 >> 1;
+    vec3 = src2 + vec3;
+
+    BUTTERFLY_4(vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3);
+    /* First pass: terms built from src1, src3, src5, src7. */
+    vec0 = src7 >> 1;
+    vec0 = src5 - vec0 - src3 - src7;
+    vec1 = src3 >> 1;
+    vec1 = src1 - vec1 + src7 - src3;
+    vec2 = src5 >> 1;
+    vec2 = vec2 - src1 + src7 + src5;
+    vec3 = src1 >> 1;
+    vec3 = vec3 + src3 + src5 + src1;
+    tmp4 = vec3 >> 2;
+    tmp4 += vec0;
+    tmp5 = vec2 >> 2;
+    tmp5 += vec1;
+    tmp6 = vec1 >> 2;
+    tmp6 -= vec2;
+    tmp7 = vec0 >> 2;
+    tmp7 = vec3 - tmp7;
+    /* Combine, transpose, then split each row into two v4i32 halves. */
+    BUTTERFLY_8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+                res0, res1, res2, res3, res4, res5, res6, res7);
+    TRANSPOSE8x8_SH_SH(res0, res1, res2, res3, res4, res5, res6, res7,
+                       res0, res1, res2, res3, res4, res5, res6, res7);
+    UNPCK_SH_SW(res0, tmp0_r, tmp0_l);
+    UNPCK_SH_SW(res1, tmp1_r, tmp1_l);
+    UNPCK_SH_SW(res2, tmp2_r, tmp2_l);
+    UNPCK_SH_SW(res3, tmp3_r, tmp3_l);
+    UNPCK_SH_SW(res4, tmp4_r, tmp4_l);
+    UNPCK_SH_SW(res5, tmp5_r, tmp5_l);
+    UNPCK_SH_SW(res6, tmp6_r, tmp6_l);
+    UNPCK_SH_SW(res7, tmp7_r, tmp7_l);
+    BUTTERFLY_4(tmp0_r, tmp0_l, tmp4_l, tmp4_r, vec0_r, vec0_l, vec1_l, vec1_r);
+
+    vec2_r = tmp2_r >> 1;
+    vec2_l = tmp2_l >> 1;
+    vec2_r -= tmp6_r;
+    vec2_l -= tmp6_l;
+    vec3_r = tmp6_r >> 1;
+    vec3_l = tmp6_l >> 1;
+    vec3_r += tmp2_r;
+    vec3_l += tmp2_l;
+
+    BUTTERFLY_4(vec0_r, vec1_r, vec2_r, vec3_r, tmp0_r, tmp2_r, tmp4_r, tmp6_r);
+    BUTTERFLY_4(vec0_l, vec1_l, vec2_l, vec3_l, tmp0_l, tmp2_l, tmp4_l, tmp6_l);
+
+    vec0_r = tmp7_r >> 1;
+    vec0_l = tmp7_l >> 1;
+    vec0_r = tmp5_r - vec0_r - tmp3_r - tmp7_r;
+    vec0_l = tmp5_l - vec0_l - tmp3_l - tmp7_l;
+    vec1_r = tmp3_r >> 1;
+    vec1_l = tmp3_l >> 1;
+    vec1_r = tmp1_r - vec1_r + tmp7_r - tmp3_r;
+    vec1_l = tmp1_l - vec1_l + tmp7_l - tmp3_l;
+    vec2_r = tmp5_r >> 1;
+    vec2_l = tmp5_l >> 1;
+    vec2_r = vec2_r - tmp1_r + tmp7_r + tmp5_r;
+    vec2_l = vec2_l - tmp1_l + tmp7_l + tmp5_l;
+    vec3_r = tmp1_r >> 1;
+    vec3_l = tmp1_l >> 1;
+    vec3_r = vec3_r + tmp3_r + tmp5_r + tmp1_r;
+    vec3_l = vec3_l + tmp3_l + tmp5_l + tmp1_l;
+    tmp1_r = vec3_r >> 2;
+    tmp1_l = vec3_l >> 2;
+    tmp1_r += vec0_r;
+    tmp1_l += vec0_l;
+    tmp3_r = vec2_r >> 2;
+    tmp3_l = vec2_l >> 2;
+    tmp3_r += vec1_r;
+    tmp3_l += vec1_l;
+    tmp5_r = vec1_r >> 2;
+    tmp5_l = vec1_l >> 2;
+    tmp5_r -= vec2_r;
+    tmp5_l -= vec2_l;
+    tmp7_r = vec0_r >> 2;
+    tmp7_l = vec0_l >> 2;
+    tmp7_r = vec3_r - tmp7_r;
+    tmp7_l = vec3_l - tmp7_l;
+    /* Final butterflies, >> 6, pack to 16 bit, add dst rows, clip, store. */
+    BUTTERFLY_4(tmp0_r, tmp0_l, tmp7_l, tmp7_r, res0_r, res0_l, res7_l, res7_r);
+    BUTTERFLY_4(tmp2_r, tmp2_l, tmp5_l, tmp5_r, res1_r, res1_l, res6_l, res6_r);
+    BUTTERFLY_4(tmp4_r, tmp4_l, tmp3_l, tmp3_r, res2_r, res2_l, res5_l, res5_r);
+    BUTTERFLY_4(tmp6_r, tmp6_l, tmp1_l, tmp1_r, res3_r, res3_l, res4_l, res4_r);
+    SRA_4V(res0_r, res0_l, res1_r, res1_l, 6);
+    SRA_4V(res2_r, res2_l, res3_r, res3_l, 6);
+    SRA_4V(res4_r, res4_l, res5_r, res5_l, 6);
+    SRA_4V(res6_r, res6_l, res7_r, res7_l, 6);
+    PCKEV_H4_SH(res0_l, res0_r, res1_l, res1_r, res2_l, res2_r, res3_l, res3_r,
+                res0, res1, res2, res3);
+    PCKEV_H4_SH(res4_l, res4_r, res5_l, res5_r, res6_l, res6_r, res7_l, res7_r,
+                res4, res5, res6, res7);
+    LD_SB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+    ILVR_B4_SH(zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3,
+               tmp0, tmp1, tmp2, tmp3);
+    ILVR_B4_SH(zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,
+               tmp4, tmp5, tmp6, tmp7);
+    ADD4(res0, tmp0, res1, tmp1, res2, tmp2, res3, tmp3,
+         res0, res1, res2, res3);
+    ADD4(res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7,
+         res4, res5, res6, res7);
+    CLIP_SH4_0_255(res0, res1, res2, res3);
+    CLIP_SH4_0_255(res4, res5, res6, res7);
+    PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
+                dst0, dst1, dst2, dst3);
+    ST8x4_UB(dst0, dst1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x4_UB(dst2, dst3, dst, dst_stride);
+}
+
+static void avc_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
+                                    int32_t dst_stride)
+{
+    int32_t dc_val;
+    v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
+    v8i16 dc;
+    v16i8 zeros = { 0 };
+    /* DC-only 8x8 add: round src[0] with (x + 32) >> 6, then clear it. */
+    dc_val = (src[0] + 32) >> 6;
+    dc = __msa_fill_h(dc_val);
+
+    src[0] = 0;
+    /* Widen eight dst rows, add dc to each, clip to 0..255, store back. */
+    LD_SB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+    ILVR_B4_SH(zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3,
+               dst0_r, dst1_r, dst2_r, dst3_r);
+    ILVR_B4_SH(zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,
+               dst4_r, dst5_r, dst6_r, dst7_r);
+    ADD4(dst0_r, dc, dst1_r, dc, dst2_r, dc, dst3_r, dc,
+         dst0_r, dst1_r, dst2_r, dst3_r);
+    ADD4(dst4_r, dc, dst5_r, dc, dst6_r, dc, dst7_r, dc,
+         dst4_r, dst5_r, dst6_r, dst7_r);
+    CLIP_SH4_0_255(dst0_r, dst1_r, dst2_r, dst3_r);
+    CLIP_SH4_0_255(dst4_r, dst5_r, dst6_r, dst7_r);
+    PCKEV_B4_SB(dst1_r, dst0_r, dst3_r, dst2_r, dst5_r, dst4_r, dst7_r, dst6_r,
+                dst0, dst1, dst2, dst3);
+    ST8x4_UB(dst0, dst1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x4_UB(dst2, dst3, dst, dst_stride);
+}
+
+/* 4x4 transform-and-add into dst, then clear 16 dctcoef entries of src. */
+void ff_h264_idct_add_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
+{
+    avc_idct4x4_addblk_msa(dst, src, dst_stride);
+    memset(src, 0, sizeof(dctcoef) * 16);
+}
+
+/* 8x8 transform-and-add into dst, then clear 64 dctcoef entries of src. */
+void ff_h264_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
+{
+    avc_idct8_addblk_msa(dst, src, dst_stride);
+    memset(src, 0, sizeof(dctcoef) * 64);
+}
+
+void ff_h264_idct4x4_addblk_dc_msa(uint8_t *dst, int16_t *src,
+                                   int32_t dst_stride)
+{
+    avc_idct4x4_addblk_dc_msa(dst, src, dst_stride); /* plain forwarder */
+}
+
+void ff_h264_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
+                                 int32_t dst_stride)
+{
+    avc_idct8_dc_addblk_msa(dst, src, dst_stride); /* plain forwarder */
+}
+
+/*
+ * Add the inverse transform of every 4x4 block (0..15) whose nzc[] entry
+ * is non-zero; entry 1 with a non-zero first coefficient uses the DC path.
+ */
+void ff_h264_idct_add16_msa(uint8_t *dst, const int32_t *blk_offset,
+                            int16_t *block, int32_t dst_stride,
+                            const uint8_t nzc[15 * 8])
+{
+    int32_t idx;
+
+    for (idx = 0; idx < 16; idx++) {
+        const int32_t nnz = nzc[scan8[idx]];
+        int16_t *coef = block + idx * 16 * sizeof(pixel);
+        if (!nnz)
+            continue;
+        if (nnz == 1 && ((dctcoef *) block)[idx * 16])
+            ff_h264_idct4x4_addblk_dc_msa(dst + blk_offset[idx], coef,
+                                          dst_stride);
+        else
+            ff_h264_idct_add_msa(dst + blk_offset[idx], coef, dst_stride);
+    }
+}
+
+/* Add the inverse transform of each 8x8 block (4x4 indices 0, 4, 8, 12)
+ * whose nzc[] entry is non-zero; an entry of 1 with a non-zero first
+ * coefficient takes the DC-only routine. */
+void ff_h264_idct8_add4_msa(uint8_t *dst, const int32_t *blk_offset,
+                            int16_t *block, int32_t dst_stride,
+                            const uint8_t nzc[15 * 8])
+{
+    int32_t blk;
+
+    for (blk = 0; blk < 16; blk += 4) {
+        const int32_t nnz = nzc[scan8[blk]];
+        int16_t *coef = block + blk * 16 * sizeof(pixel);
+        if (!nnz)
+            continue;
+        if (nnz == 1 && ((dctcoef *) block)[blk * 16])
+            ff_h264_idct8_dc_addblk_msa(dst + blk_offset[blk], coef,
+                                        dst_stride);
+        else
+            ff_h264_idct8_addblk_msa(dst + blk_offset[blk], coef, dst_stride);
+    }
+}
+
+/* For j = 1, 2, blocks 16 * j .. 16 * j + 3 are added into dst[j - 1]: full
+ * transform if the nzc[] entry is set, else DC-only add if coefficient 0 is. */
+void ff_h264_idct_add8_msa(uint8_t **dst, const int32_t *blk_offset,
+                           int16_t *block, int32_t dst_stride,
+                           const uint8_t nzc[15 * 8])
+{
+    int32_t pl, idx;
+
+    for (pl = 1; pl < 3; pl++) {
+        for (idx = pl * 16; idx < pl * 16 + 4; idx++) {
+            int16_t *coef = block + idx * 16 * sizeof(pixel);
+            if (nzc[scan8[idx]])
+                ff_h264_idct_add_msa(dst[pl - 1] + blk_offset[idx], coef,
+                                     dst_stride);
+            else if (((dctcoef *) block)[idx * 16])
+                ff_h264_idct4x4_addblk_dc_msa(dst[pl - 1] + blk_offset[idx],
+                                              coef, dst_stride);
+        }
+    }
+}
+
+/* 4:2:2 variant.  Pass one: blocks 16 * j .. 16 * j + 3 for j = 1, 2.  Pass
+ * two: coefficient blocks 16 * j + 4 .. 16 * j + 7, with nzc[] and
+ * blk_offset[] looked up at the block index plus 4. */
+void ff_h264_idct_add8_422_msa(uint8_t **dst, const int32_t *blk_offset,
+                               int16_t *block, int32_t dst_stride,
+                               const uint8_t nzc[15 * 8])
+{
+    int32_t pl, idx;
+
+    for (pl = 1; pl < 3; pl++) {
+        for (idx = pl * 16; idx < pl * 16 + 4; idx++) {
+            int16_t *coef = block + idx * 16 * sizeof(pixel);
+            if (nzc[scan8[idx]])
+                ff_h264_idct_add_msa(dst[pl - 1] + blk_offset[idx], coef,
+                                     dst_stride);
+            else if (((dctcoef *) block)[idx * 16])
+                ff_h264_idct4x4_addblk_dc_msa(dst[pl - 1] + blk_offset[idx],
+                                              coef, dst_stride);
+        }
+    }
+
+    for (pl = 1; pl < 3; pl++) {
+        for (idx = pl * 16 + 4; idx < pl * 16 + 8; idx++) {
+            int16_t *coef = block + idx * 16 * sizeof(pixel);
+            if (nzc[scan8[idx + 4]])
+                ff_h264_idct_add_msa(dst[pl - 1] + blk_offset[idx + 4], coef,
+                                     dst_stride);
+            else if (((dctcoef *) block)[idx * 16])
+                ff_h264_idct4x4_addblk_dc_msa(dst[pl - 1] + blk_offset[idx + 4],
+                                              coef, dst_stride);
+        }
+    }
+}
+
+/* For blocks 0..15: a set nzc[] entry selects the full 4x4 transform,
+ * otherwise a non-zero first coefficient selects the DC-only add. */
+void ff_h264_idct_add16_intra_msa(uint8_t *dst, const int32_t *blk_offset,
+                                  int16_t *block, int32_t dst_stride,
+                                  const uint8_t nzc[15 * 8])
+{
+    int32_t idx;
+
+    for (idx = 0; idx < 16; idx++) {
+        int16_t *coef = block + idx * 16 * sizeof(pixel);
+
+        if (nzc[scan8[idx]])
+            ff_h264_idct_add_msa(dst + blk_offset[idx], coef, dst_stride);
+        else if (((dctcoef *) block)[idx * 16])
+            ff_h264_idct4x4_addblk_dc_msa(dst + blk_offset[idx], coef,
+                                          dst_stride);
+    }
+}
+
+/* Exported wrapper around the static luma-DC dequant/transform routine. */
+void ff_h264_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src, int32_t de_qval)
+{
+    avc_deq_idct_luma_dc_msa(dst, src, de_qval);
+}
diff --git a/libavcodec/mips/h264pred_init_mips.c b/libavcodec/mips/h264pred_init_mips.c
new file mode 100644
index 0000000..c33d8f7
--- /dev/null
+++ b/libavcodec/mips/h264pred_init_mips.c
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *                    Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "h264dsp_mips.h"
+#include "h264pred_mips.h"
+
+#if HAVE_MSA
+/*
+ * Install the MSA intra predictors into h.  Nothing is changed unless
+ * bit_depth is 8, and the named pred8x8 slots are only replaced when
+ * chroma_format_idc is 1.
+ */
+static av_cold void h264_pred_init_msa(H264PredContext *h, int codec_id,
+                                       const int bit_depth,
+                                       const int chroma_format_idc)
+{
+    const int is_vp = codec_id == AV_CODEC_ID_VP7 ||
+                      codec_id == AV_CODEC_ID_VP8;
+    const int is_rv40 = codec_id == AV_CODEC_ID_RV40;
+    const int chroma_idc1 = chroma_format_idc == 1;
+
+    if (bit_depth != 8)
+        return;
+
+    if (chroma_idc1) {
+        h->pred8x8[VERT_PRED8x8] = ff_h264_intra_pred_vert_8x8_msa;
+        h->pred8x8[HOR_PRED8x8] = ff_h264_intra_pred_horiz_8x8_msa;
+
+        /* The plane predictor is not installed for VP7/VP8. */
+        if (!is_vp)
+            h->pred8x8[PLANE_PRED8x8] = ff_h264_intra_predict_plane_8x8_msa;
+
+        /* The DC family is not installed for RV40, VP7 or VP8. */
+        if (!is_rv40 && !is_vp) {
+            h->pred8x8[DC_PRED8x8] = ff_h264_intra_predict_dc_4blk_8x8_msa;
+            h->pred8x8[LEFT_DC_PRED8x8] =
+                ff_h264_intra_predict_hor_dc_8x8_msa;
+            h->pred8x8[TOP_DC_PRED8x8] =
+                ff_h264_intra_predict_vert_dc_8x8_msa;
+            h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] =
+                ff_h264_intra_predict_mad_cow_dc_l0t_8x8_msa;
+            h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] =
+                ff_h264_intra_predict_mad_cow_dc_0lt_8x8_msa;
+            h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] =
+                ff_h264_intra_predict_mad_cow_dc_l00_8x8_msa;
+            h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] =
+                ff_h264_intra_predict_mad_cow_dc_0l0_8x8_msa;
+        }
+    }
+
+    /* VP7/VP8 use numeric slots 7 and 8 regardless of chroma_format_idc. */
+    if (is_vp) {
+        h->pred8x8[7] = ff_vp8_pred8x8_127_dc_8_msa;
+        h->pred8x8[8] = ff_vp8_pred8x8_129_dc_8_msa;
+    }
+
+    if (chroma_idc1)
+        h->pred8x8[DC_128_PRED8x8] = ff_h264_intra_pred_dc_128_8x8_msa;
+
+    h->pred16x16[DC_PRED8x8] = ff_h264_intra_pred_dc_16x16_msa;
+    h->pred16x16[VERT_PRED8x8] = ff_h264_intra_pred_vert_16x16_msa;
+    h->pred16x16[HOR_PRED8x8] = ff_h264_intra_pred_horiz_16x16_msa;
+
+    /* 16x16: VP7/VP8 get slots 7/8, SVQ3 and RV40 nothing extra, every
+     * other codec the plane predictor. */
+    if (is_vp) {
+        h->pred16x16[7] = ff_vp8_pred16x16_127_dc_8_msa;
+        h->pred16x16[8] = ff_vp8_pred16x16_129_dc_8_msa;
+    } else if (codec_id != AV_CODEC_ID_SVQ3 && !is_rv40) {
+        h->pred16x16[PLANE_PRED8x8] =
+            ff_h264_intra_predict_plane_16x16_msa;
+    }
+
+    h->pred16x16[LEFT_DC_PRED8x8] = ff_h264_intra_pred_dc_left_16x16_msa;
+    h->pred16x16[TOP_DC_PRED8x8] = ff_h264_intra_pred_dc_top_16x16_msa;
+    h->pred16x16[DC_128_PRED8x8] = ff_h264_intra_pred_dc_128_16x16_msa;
+}
+#endif  // #if HAVE_MSA
+
+#if HAVE_MMI
+/*
+ * Install the MMI intra predictors into h.  Nothing is changed unless
+ * bit_depth is 8; the 16x16 plane predictors are compiled in only when
+ * ARCH_MIPS64 is set.
+ */
+static av_cold void h264_pred_init_mmi(H264PredContext *h, int codec_id,
+        const int bit_depth, const int chroma_format_idc)
+{
+    if (bit_depth != 8)
+        return;
+
+    /* idc 1 selects the 8x8 routines, any other value the 8x16 ones. */
+    if (chroma_format_idc == 1) {
+        h->pred8x8[VERT_PRED8x8] = ff_pred8x8_vertical_8_mmi;
+        h->pred8x8[HOR_PRED8x8]  = ff_pred8x8_horizontal_8_mmi;
+    } else {
+        h->pred8x8[VERT_PRED8x8] = ff_pred8x16_vertical_8_mmi;
+        h->pred8x8[HOR_PRED8x8]  = ff_pred8x16_horizontal_8_mmi;
+    }
+
+    h->pred16x16[DC_PRED8x8]   = ff_pred16x16_dc_8_mmi;
+    h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_8_mmi;
+    h->pred16x16[HOR_PRED8x8]  = ff_pred16x16_horizontal_8_mmi;
+    h->pred8x8l[TOP_DC_PRED]   = ff_pred8x8l_top_dc_8_mmi;
+    h->pred8x8l[DC_PRED]       = ff_pred8x8l_dc_8_mmi;
+
+#if ARCH_MIPS64
+    /* Codec-specific 16x16 plane predictor; none is set for VP7/VP8. */
+    if (codec_id == AV_CODEC_ID_SVQ3)
+        h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_mmi;
+    else if (codec_id == AV_CODEC_ID_RV40)
+        h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_mmi;
+    else if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
+        h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_mmi;
+#endif
+
+    /* 8x8 DC predictors: SVQ3 and H.264 only, and only for idc 1. */
+    if (chroma_format_idc == 1 &&
+        (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264)) {
+        h->pred8x8[TOP_DC_PRED8x8] = ff_pred8x8_top_dc_8_mmi;
+        h->pred8x8[DC_PRED8x8]     = ff_pred8x8_dc_8_mmi;
+    }
+}
+#endif /* HAVE_MMI */
+
+/* MSA init runs first, so MMI overrides any slot both of them set. */
+av_cold void ff_h264_pred_init_mips(H264PredContext *h, int codec_id,
+                                    int bit_depth, const int chroma_format_idc)
+{
+#if HAVE_MSA
+    h264_pred_init_msa(h, codec_id, bit_depth, chroma_format_idc);
+#endif  // #if HAVE_MSA
+#if HAVE_MMI
+    h264_pred_init_mmi(h, codec_id, bit_depth, chroma_format_idc);
+#endif /* HAVE_MMI */
+}
diff --git a/libavcodec/mips/h264pred_mips.h b/libavcodec/mips/h264pred_mips.h
new file mode 100644
index 0000000..136e291
--- /dev/null
+++ b/libavcodec/mips/h264pred_mips.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_H264PRED_MIPS_H
+#define AVCODEC_MIPS_H264PRED_MIPS_H
+
+#include "constants.h"
+#include "libavcodec/h264pred.h"
+/* Prototypes for the 8-bit Loongson MMI intra prediction routines (see h264pred_mmi.c). */
+void ff_pred16x16_vertical_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_dc_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8l_top_dc_8_mmi(uint8_t *src, int has_topleft, int has_topright,
+        ptrdiff_t stride);
+void ff_pred8x8l_dc_8_mmi(uint8_t *src, int has_topleft, int has_topright,
+        ptrdiff_t stride);
+void ff_pred8x8l_vertical_8_mmi(uint8_t *src, int has_topleft,
+        int has_topright, ptrdiff_t stride);
+void ff_pred4x4_dc_8_mmi(uint8_t *src, const uint8_t *topright,
+        ptrdiff_t stride);
+void ff_pred8x8_vertical_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_plane_svq3_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_plane_rv40_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_plane_h264_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_top_dc_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_dc_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x16_vertical_8_mmi(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x16_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride);
+
+#endif  /* AVCODEC_MIPS_H264PRED_MIPS_H */
diff --git a/libavcodec/mips/h264pred_mmi.c b/libavcodec/mips/h264pred_mmi.c
new file mode 100644
index 0000000..bb795a1
--- /dev/null
+++ b/libavcodec/mips/h264pred_mmi.c
@@ -0,0 +1,992 @@
+/*
+ * Loongson SIMD optimized h264pred
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *                    Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264pred_mips.h"
+#include "libavcodec/bit_depth_template.c"
+#include "libavutil/mips/asmdefs.h"
+#include "constants.h"
+/* Vertical 16x16 prediction: copy the 16 bytes of the row above src into each of the 16 rows. */
+void ff_pred16x16_vertical_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    double ftmp[2];
+    uint64_t tmp[1];
+    /* ftmp0/ftmp1 hold the row at src - stride; tmp0 counts 8 iterations of two rows each. */
+    __asm__ volatile (
+        "dli        %[tmp0],    0x08                                    \n\t"
+        "gsldlc1    %[ftmp0],   0x07(%[srcA])                           \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[srcA])                           \n\t"
+        "gsldlc1    %[ftmp1],   0x0f(%[srcA])                           \n\t"
+        "gsldrc1    %[ftmp1],   0x08(%[srcA])                           \n\t"
+        "1:                                                             \n\t"
+        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
+        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
+        "gssdlc1    %[ftmp1],   0x0f(%[src])                            \n\t"
+        "gssdrc1    %[ftmp1],   0x08(%[src])                            \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
+        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
+        "gssdlc1    %[ftmp1],   0x0f(%[src])                            \n\t"
+        "gssdrc1    %[ftmp1],   0x08(%[src])                            \n\t"
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        "bnez       %[tmp0],    1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [tmp0]"=&r"(tmp[0]),
+          [src]"+&r"(src)
+        : [stride]"r"((mips_reg)stride),    [srcA]"r"((mips_reg)(src-stride))
+        : "memory"
+    );
+}
+/* Horizontal 16x16 prediction: fill each of the 16 rows with the byte just left of it (ff_pb_1 multiply replicates it). */
+void ff_pred16x16_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    uint64_t tmp[3];
+    mips_reg addr[2];
+    /* NOTE(review): swl/swr are 32-bit stores; the 0x07/0x00 pairs look like they need 4-byte aligned rows - confirm. */
+    __asm__ volatile (
+        PTR_ADDI   "%[addr0],   %[src],         -0x01                   \n\t"
+        PTR_ADDU   "%[addr1],   %[src],         $0                      \n\t"
+        "dli        %[tmp2],    0x08                                    \n\t"
+        "1:                                                             \n\t"
+        "lbu        %[tmp0],    0x00(%[addr0])                          \n\t"
+        "dmul       %[tmp1],    %[tmp0],        %[ff_pb_1]              \n\t"
+        "swl        %[tmp1],    0x07(%[addr1])                          \n\t"
+        "swr        %[tmp1],    0x00(%[addr1])                          \n\t"
+        "swl        %[tmp1],    0x0f(%[addr1])                          \n\t"
+        "swr        %[tmp1],    0x08(%[addr1])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr1],   %[addr1],       %[stride]               \n\t"
+        "lbu        %[tmp0],    0x00(%[addr0])                          \n\t"
+        "dmul       %[tmp1],    %[tmp0],        %[ff_pb_1]              \n\t"
+        "swl        %[tmp1],    0x07(%[addr1])                          \n\t"
+        "swr        %[tmp1],    0x00(%[addr1])                          \n\t"
+        "swl        %[tmp1],    0x0f(%[addr1])                          \n\t"
+        "swr        %[tmp1],    0x08(%[addr1])                          \n\t"
+        "daddi      %[tmp2],    %[tmp2],        -0x01                   \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr1],   %[addr1],       %[stride]               \n\t"
+        "bnez       %[tmp2],    1b                                      \n\t"
+        : [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          [tmp2]"=&r"(tmp[2]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1])
+        : [src]"r"((mips_reg)src),          [stride]"r"((mips_reg)stride),
+          [ff_pb_1]"r"(ff_pb_1)
+        : "memory"
+    );
+}
+/* DC 16x16 prediction: fill the block with (sum of 16 left + 16 top neighbours + 16) >> 5. */
+void ff_pred16x16_dc_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    uint64_t tmp[4];
+    mips_reg addr[2];
+    /* tmp3 accumulates the sum, first over the left column; addr1 is declared below but no instruction uses it. */
+    __asm__ volatile (
+        PTR_ADDI   "%[addr0],   %[src],         -0x01                   \n\t"
+        "dli        %[tmp0],    0x08                                    \n\t"
+        "xor        %[tmp3],    %[tmp3],        %[tmp3]                 \n\t"
+        "1:                                                             \n\t"
+        "lbu        %[tmp1],    0x00(%[addr0])                          \n\t"
+        "daddu      %[tmp3],    %[tmp3],        %[tmp1]                 \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp1],    0x00(%[addr0])                          \n\t"
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        "daddu      %[tmp3],    %[tmp3],        %[tmp1]                 \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "bnez       %[tmp0],    1b                                      \n\t"
+        /* second pass: add the 16 bytes of the row above (starting at src - stride) */
+        "dli        %[tmp0],    0x08                                    \n\t"
+        PTR_SUBU   "%[addr0],   %[src],         %[stride]               \n\t"
+        "2:                                                             \n\t"
+        "lbu        %[tmp1],    0x00(%[addr0])                          \n\t"
+        "daddu      %[tmp3],    %[tmp3],        %[tmp1]                 \n\t"
+        PTR_ADDIU  "%[addr0],   %[addr0],       0x01                    \n\t"
+        "lbu        %[tmp1],    0x00(%[addr0])                          \n\t"
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        "daddu      %[tmp3],    %[tmp3],        %[tmp1]                 \n\t"
+        PTR_ADDIU  "%[addr0],   %[addr0],       0x01                    \n\t"
+        "bnez       %[tmp0],    2b                                      \n\t"
+        /* add 16, shift right by 5, multiply by ff_pb_1, then write 16 rows (8 iterations x 2 rows) */
+        "daddiu     %[tmp3],    %[tmp3],        0x10                    \n\t"
+        "dsra       %[tmp3],    0x05                                    \n\t"
+        "dmul       %[tmp2],    %[tmp3],        %[ff_pb_1]              \n\t"
+        PTR_ADDU   "%[addr0],   %[src],         $0                      \n\t"
+        "dli        %[tmp0],    0x08                                    \n\t"
+        "3:                                                             \n\t"
+        "swl        %[tmp2],    0x07(%[addr0])                          \n\t"
+        "swr        %[tmp2],    0x00(%[addr0])                          \n\t"
+        "swl        %[tmp2],    0x0f(%[addr0])                          \n\t"
+        "swr        %[tmp2],    0x08(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "swl        %[tmp2],    0x07(%[addr0])                          \n\t"
+        "swr        %[tmp2],    0x00(%[addr0])                          \n\t"
+        "swl        %[tmp2],    0x0f(%[addr0])                          \n\t"
+        "swr        %[tmp2],    0x08(%[addr0])                          \n\t"
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "bnez       %[tmp0],    3b                                      \n\t"
+        : [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          [tmp2]"=&r"(tmp[2]),              [tmp3]"=&r"(tmp[3]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1])
+        : [src]"r"((mips_reg)src),          [stride]"r"((mips_reg)stride),
+          [ff_pb_1]"r"(ff_pb_1)
+        : "memory"
+    );
+}
+/* Top-only DC 8x8 luma prediction: dc = (sum of the 8 filtered top neighbours + 4) >> 3, written to all 64 pixels. */
+void ff_pred8x8l_top_dc_8_mmi(uint8_t *src, int has_topleft,
+        int has_topright, ptrdiff_t stride)
+{
+    uint32_t dc;
+    double ftmp[11];
+    mips_reg tmp[3];
+    /* No top-right: the last right tap must be top[7]. pinsrh_3 would insert halfword 0 of its source (top[4]). */
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "gsldlc1    %[ftmp10],  0x07(%[srcA])                           \n\t"
+        "gsldrc1    %[ftmp10],  0x00(%[srcA])                           \n\t"
+        "gsldlc1    %[ftmp9],   0x07(%[src0])                           \n\t"
+        "gsldrc1    %[ftmp9],   0x00(%[src0])                           \n\t"
+        "gsldlc1    %[ftmp8],   0x07(%[src1])                           \n\t"
+        "gsldrc1    %[ftmp8],   0x00(%[src1])                           \n\t"
+        /* widen to halfwords: ftmp7/6 = left taps, ftmp5/4 = centre taps, ftmp3/2 = right taps (low/high) */
+        "punpcklbh  %[ftmp7],   %[ftmp10],      %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp6],   %[ftmp10],      %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp9],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp4],   %[ftmp9],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp8],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp2],   %[ftmp8],       %[ftmp0]                \n\t"
+        "bnez       %[has_topleft],             1f                      \n\t"
+        "pinsrh_0   %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "1:                                                             \n\t"
+        "bnez       %[has_topright],            2f                      \n\t"
+        "dli        %[tmp0],    0xa4                                    \n\t"
+        "mtc1       %[tmp0],    %[ftmp1]                                \n\t"
+        "pshufh     %[ftmp2],   %[ftmp2],       %[ftmp1]                \n\t"
+        "2:                                                             \n\t"
+        "dli        %[tmp0],    0x02                                    \n\t"
+        "mtc1       %[tmp0],    %[ftmp1]                                \n\t"
+        "pmullh     %[ftmp5],   %[ftmp5],       %[ff_pw_2]              \n\t"
+        "pmullh     %[ftmp4],   %[ftmp4],       %[ff_pw_2]              \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ff_pw_2]              \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ff_pw_2]              \n\t"
+        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        "packushb   %[ftmp9],   %[ftmp7],       %[ftmp6]                \n\t"
+        "biadd      %[ftmp10],  %[ftmp9]                                \n\t"
+        "mfc1       %[tmp1],    %[ftmp10]                               \n\t"
+        "addiu      %[tmp1],    %[tmp1],        0x04                    \n\t"
+        "srl        %[tmp1],    %[tmp1],        0x03                    \n\t"
+        "mul        %[dc],      %[tmp1],        %[ff_pb_1]              \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),
+          [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          [dc]"=r"(dc)
+        : [srcA]"r"((mips_reg)(src-stride-1)),
+          [src0]"r"((mips_reg)(src-stride)),
+          [src1]"r"((mips_reg)(src-stride+1)),
+          [has_topleft]"r"(has_topleft),    [has_topright]"r"(has_topright),
+          [ff_pb_1]"r"(ff_pb_1),            [ff_pw_2]"f"(ff_pw_2)
+        : "memory"
+    );
+    /* Replicate the 32-bit dc to 8 bytes and write it to all 8 rows (2 iterations x 4 rows). */
+    __asm__ volatile (
+        "dli        %[tmp0],    0x02                                    \n\t"
+        "punpcklwd  %[ftmp0],   %[dc],          %[dc]                   \n\t"
+        "1:                                                             \n\t"
+        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
+        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
+        "gssdxc1    %[ftmp0],   0x00(%[src],    %[stride])              \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
+        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
+        "gssdxc1    %[ftmp0],   0x00(%[src],    %[stride])              \n\t"
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        "bnez       %[tmp0],    1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [tmp0]"=&r"(tmp[0]),
+          [src]"+&r"(src)
+        : [dc]"f"(dc),                      [stride]"r"((mips_reg)stride)
+        : "memory"
+    );
+}
+/* DC 8x8 luma prediction: fill the block with (sum of 8 filtered left + 8 filtered top neighbours + 8) >> 4. */
+void ff_pred8x8l_dc_8_mmi(uint8_t *src, int has_topleft, int has_topright,
+        ptrdiff_t stride)
+{
+    uint32_t dc, dc1, dc2;
+    double ftmp[14];
+    mips_reg tmp[1];
+    /* l0..l7: left column after the (a + 2*b + c + 2) >> 2 filter; l0 and l7 reuse the nearest pixel at the ends. */
+    const int l0 = ((has_topleft ? src[-1+-1*stride] : src[-1+0*stride]) + 2*src[-1+0*stride] + src[-1+1*stride] + 2) >> 2;
+    const int l1 = (src[-1+0*stride] + 2*src[-1+1*stride] + src[-1+2*stride] + 2) >> 2;
+    const int l2 = (src[-1+1*stride] + 2*src[-1+2*stride] + src[-1+3*stride] + 2) >> 2;
+    const int l3 = (src[-1+2*stride] + 2*src[-1+3*stride] + src[-1+4*stride] + 2) >> 2;
+    const int l4 = (src[-1+3*stride] + 2*src[-1+4*stride] + src[-1+5*stride] + 2) >> 2;
+    const int l5 = (src[-1+4*stride] + 2*src[-1+5*stride] + src[-1+6*stride] + 2) >> 2;
+    const int l6 = (src[-1+5*stride] + 2*src[-1+6*stride] + src[-1+7*stride] + 2) >> 2;
+    const int l7 = (src[-1+6*stride] + 2*src[-1+7*stride] + src[-1+7*stride] + 2) >> 2;
+    /* The top row gets the same filter in SIMD; dc2 receives the biadd result of the packed filtered row. */
+    __asm__ volatile (
+        "gsldlc1    %[ftmp4],   0x07(%[srcA])                           \n\t"
+        "gsldrc1    %[ftmp4],   0x00(%[srcA])                           \n\t"
+        "gsldlc1    %[ftmp5],   0x07(%[src0])                           \n\t"
+        "gsldrc1    %[ftmp5],   0x00(%[src0])                           \n\t"
+        "gsldlc1    %[ftmp6],   0x07(%[src1])                           \n\t"
+        "gsldrc1    %[ftmp6],   0x00(%[src1])                           \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "dli        %[tmp0],    0x03                                    \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
+        "mtc1       %[tmp0],    %[ftmp1]                                \n\t"
+        "punpcklbh  %[ftmp9],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp10],  %[ftmp5],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp11],  %[ftmp6],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp12],  %[ftmp6],       %[ftmp0]                \n\t"
+        "pshufh     %[ftmp3],   %[ftmp8],       %[ftmp1]                \n\t"
+        "pshufh     %[ftmp13],  %[ftmp12],      %[ftmp1]                \n\t"
+        "pinsrh_3   %[ftmp8],   %[ftmp8],       %[ftmp13]               \n\t"
+        "pinsrh_3   %[ftmp12],  %[ftmp12],      %[ftmp3]                \n\t"
+        "bnez       %[has_topleft],             1f                      \n\t"
+        "pinsrh_0   %[ftmp7],   %[ftmp7],       %[ftmp9]                \n\t"
+
+        "1:                                                             \n\t"
+        "bnez       %[has_topright],            2f                      \n\t"
+        "pshufh     %[ftmp13],  %[ftmp10],      %[ftmp1]                \n\t"
+        "pinsrh_3   %[ftmp8],   %[ftmp8],       %[ftmp13]               \n\t"
+        /* from here ftmp1 = 2 is the shift count; ftmp2 (built from it by pshufh) is the multiplier and rounding term */
+        "2:                                                             \n\t"
+        "dli        %[tmp0],    0x02                                    \n\t"
+        "mtc1       %[tmp0],    %[ftmp1]                                \n\t"
+        "pshufh     %[ftmp2],   %[ftmp1],       %[ftmp0]                \n\t"
+        "pmullh     %[ftmp9],   %[ftmp9],       %[ftmp2]                \n\t"
+        "pmullh     %[ftmp10],  %[ftmp10],      %[ftmp2]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp9]                \n\t"
+        "paddh      %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
+        "paddh      %[ftmp8],   %[ftmp8],       %[ftmp12]               \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp8],   %[ftmp8],       %[ftmp2]                \n\t"
+        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        "psrah      %[ftmp8],   %[ftmp8],       %[ftmp1]                \n\t"
+        "packushb   %[ftmp5],   %[ftmp7],       %[ftmp8]                \n\t"
+        "biadd      %[ftmp4],   %[ftmp5]                                \n\t"
+        "mfc1       %[dc2],     %[ftmp4]                                \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
+          [ftmp12]"=&f"(ftmp[12]),          [ftmp13]"=&f"(ftmp[13]),
+          [tmp0]"=&r"(tmp[0]),              [dc2]"=r"(dc2)
+        : [srcA]"r"((mips_reg)(src-stride-1)),
+          [src0]"r"((mips_reg)(src-stride)),
+          [src1]"r"((mips_reg)(src-stride+1)),
+          [has_topleft]"r"(has_topleft),    [has_topright]"r"(has_topright)
+        : "memory"
+    );
+    /* Combine the left (dc1) and top (dc2) sums, round, and replicate the result into all four bytes of dc. */
+    dc1 = l0+l1+l2+l3+l4+l5+l6+l7;
+    dc = ((dc1+dc2+8)>>4)*0x01010101U;
+    /* Write the same 8 bytes to all 8 rows: two loop iterations of four rows each. */
+    __asm__ volatile (
+        "dli        %[tmp0],    0x02                                    \n\t"
+        "punpcklwd  %[ftmp0],   %[dc],          %[dc]                   \n\t"
+        "1:                                                             \n\t"
+        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
+        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
+        "gssdxc1    %[ftmp0],   0x00(%[src],    %[stride])              \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
+        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
+        "gssdxc1    %[ftmp0],   0x00(%[src],    %[stride])              \n\t"
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        "bnez       %[tmp0],    1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [tmp0]"=&r"(tmp[0]),
+          [src]"+&r"(src)
+        : [dc]"f"(dc),                      [stride]"r"((mips_reg)stride)
+        : "memory"
+    );
+}
+/* Vertical 8x8 luma prediction: every row of the block becomes the (l + 2*c + r + 2) >> 2 filtered row above it. */
+void ff_pred8x8l_vertical_8_mmi(uint8_t *src, int has_topleft,
+        int has_topright, ptrdiff_t stride)
+{
+    double ftmp[12];
+    mips_reg tmp[1];
+
+    /*
+     * Filtering and storing share one asm statement: registers do not survive
+     * between statements, and src is in/out because the loop reads and advances it.
+     */
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[srcA])                           \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[srcA])                           \n\t"
+        "gsldlc1    %[ftmp4],   0x07(%[src0])                           \n\t"
+        "gsldrc1    %[ftmp4],   0x00(%[src0])                           \n\t"
+        "gsldlc1    %[ftmp5],   0x07(%[src1])                           \n\t"
+        "gsldrc1    %[ftmp5],   0x00(%[src1])                           \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp9],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp10],  %[ftmp5],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp11],  %[ftmp5],       %[ftmp0]                \n\t"
+        "bnez       %[has_topleft],             1f                      \n\t"
+        "pinsrh_0   %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        /* no top-right: control 0xa4 copies halfword 2 of ftmp11 (top[7]) over halfword 3 (top[8]) */
+        "1:                                                             \n\t"
+        "bnez       %[has_topright],            2f                      \n\t"
+        "dli        %[tmp0],    0xa4                                    \n\t"
+        "mtc1       %[tmp0],    %[ftmp1]                                \n\t"
+        "pshufh     %[ftmp11],  %[ftmp11],      %[ftmp1]                \n\t"
+
+        "2:                                                             \n\t"
+        "dli        %[tmp0],    0x02                                    \n\t"
+        "mtc1       %[tmp0],    %[ftmp1]                                \n\t"
+        "pshufh     %[ftmp2],   %[ftmp1],       %[ftmp0]                \n\t"
+        "pmullh     %[ftmp8],   %[ftmp8],       %[ftmp2]                \n\t"
+        "pmullh     %[ftmp9],   %[ftmp9],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp9]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp10]               \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp6],       %[ftmp7]                \n\t"
+        /* replicate the filtered row (ftmp4) into all 8 rows: 2 iterations x 4 rows */
+        "dli        %[tmp0],    0x02                                    \n\t"
+        "3:                                                             \n\t"
+        "gssdlc1    %[ftmp4],   0x07(%[src])                            \n\t"
+        "gssdrc1    %[ftmp4],   0x00(%[src])                            \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        "gssdlc1    %[ftmp4],   0x07(%[src])                            \n\t"
+        "gssdrc1    %[ftmp4],   0x00(%[src])                            \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        "gssdlc1    %[ftmp4],   0x07(%[src])                            \n\t"
+        "gssdrc1    %[ftmp4],   0x00(%[src])                            \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        "gssdlc1    %[ftmp4],   0x07(%[src])                            \n\t"
+        "gssdrc1    %[ftmp4],   0x00(%[src])                            \n\t"
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        "bnez       %[tmp0],    3b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
+          [tmp0]"=&r"(tmp[0]),
+          [src]"+&r"(src)
+        : [srcA]"r"((mips_reg)(src-stride-1)),
+          [src0]"r"((mips_reg)(src-stride)),
+          [src1]"r"((mips_reg)(src-stride+1)),
+          [has_topleft]"r"(has_topleft),    [has_topright]"r"(has_topright),
+          [stride]"r"((mips_reg)stride)
+        : "memory"
+    );
+}
+/* DC 4x4 prediction: fill the block with (4 top + 4 left neighbours + 4) >> 3; topright is not used. */
+void ff_pred4x4_dc_8_mmi(uint8_t *src, const uint8_t *topright,
+        ptrdiff_t stride)
+{
+    const int dc = (src[-stride] + src[1-stride] + src[2-stride]
+                 + src[3-stride] + src[-1+0*stride] + src[-1+1*stride]
+                 + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
+    uint64_t tmp[2];
+    mips_reg addr[1];
+    /* tmp1 = dc * ff_pb_1 is stored with gsswx to four rows; addr0 is the row offset (0, stride, 2*stride, 3*stride). */
+    __asm__ volatile (
+        PTR_ADDU   "%[tmp0],    %[dc],          $0                      \n\t"
+        "dmul       %[tmp1],    %[tmp0],        %[ff_pb_1]              \n\t"
+        "xor        %[addr0],   %[addr0],       %[addr0]                \n\t"
+        "gsswx      %[tmp1],    0x00(%[src],    %[addr0])               \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "gsswx      %[tmp1],    0x00(%[src],    %[addr0])               \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "gsswx      %[tmp1],    0x00(%[src],    %[addr0])               \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "gsswx      %[tmp1],    0x00(%[src],    %[addr0])               \n\t"
+        : [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          [addr0]"=&r"(addr[0])
+        : [src]"r"((mips_reg)src),          [stride]"r"((mips_reg)stride),
+          [dc]"r"(dc),                      [ff_pb_1]"r"(ff_pb_1)
+        : "memory"
+    );
+}
+/* Vertical 8x8 prediction: copy the 8 bytes of the row above src into each of the 8 rows of the block. */
+void ff_pred8x8_vertical_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    uint64_t tmp[2];
+    mips_reg addr[2];
+    /* tmp0 holds the row at src - stride (ldl/ldr pair); tmp1 counts 4 iterations of two rows each. */
+    __asm__ volatile (
+        PTR_SUBU   "%[addr0],   %[src],         %[stride]               \n\t"
+        PTR_ADDU   "%[addr1],   %[src],         $0                      \n\t"
+        "ldl        %[tmp0],    0x07(%[addr0])                          \n\t"
+        "ldr        %[tmp0],    0x00(%[addr0])                          \n\t"
+        "dli        %[tmp1],    0x04                                    \n\t"
+        "1:                                                             \n\t"
+        "sdl        %[tmp0],    0x07(%[addr1])                          \n\t"
+        "sdr        %[tmp0],    0x00(%[addr1])                          \n\t"
+        PTR_ADDU   "%[addr1],   %[stride]                               \n\t"
+        "sdl        %[tmp0],    0x07(%[addr1])                          \n\t"
+        "sdr        %[tmp0],    0x00(%[addr1])                          \n\t"
+        "daddi      %[tmp1],    -0x01                                   \n\t"
+        PTR_ADDU   "%[addr1],   %[stride]                               \n\t"
+        "bnez       %[tmp1],    1b                                      \n\t"
+        : [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1])
+        : [src]"r"((mips_reg)src),          [stride]"r"((mips_reg)stride)
+        : "memory"
+    );
+}
+/* Horizontal 8x8 prediction: fill each of the 8 rows with the byte just left of it (ff_pb_1 multiply replicates it). */
+void ff_pred8x8_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    uint64_t tmp[3];
+    mips_reg addr[2];
+    /* NOTE(review): swl/swr are 32-bit stores; the 0x07/0x00 pair looks like it needs 4-byte aligned rows - confirm. */
+    __asm__ volatile (
+        PTR_ADDI   "%[addr0],   %[src],         -0x01                   \n\t"
+        PTR_ADDU   "%[addr1],   %[src],         $0                      \n\t"
+        "dli        %[tmp0],    0x04                                    \n\t"
+        "1:                                                             \n\t"
+        "lbu        %[tmp1],    0x00(%[addr0])                          \n\t"
+        "dmul       %[tmp2],    %[tmp1],        %[ff_pb_1]              \n\t"
+        "swl        %[tmp2],    0x07(%[addr1])                          \n\t"
+        "swr        %[tmp2],    0x00(%[addr1])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr1],   %[addr1],       %[stride]               \n\t"
+        "lbu        %[tmp1],    0x00(%[addr0])                          \n\t"
+        "dmul       %[tmp2],    %[tmp1],        %[ff_pb_1]              \n\t"
+        "swl        %[tmp2],    0x07(%[addr1])                          \n\t"
+        "swr        %[tmp2],    0x00(%[addr1])                          \n\t"
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr1],   %[addr1],       %[stride]               \n\t"
+        "bnez       %[tmp0],    1b                                      \n\t"
+        : [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          [tmp2]"=&r"(tmp[2]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1])
+        : [src]"r"((mips_reg)src),          [stride]"r"((mips_reg)stride),
+          [ff_pb_1]"r"(ff_pb_1)
+        : "memory"
+    );
+}
+
+void ff_pred8x8_top_dc_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    double ftmp[4];
+    uint64_t tmp[1];
+    mips_reg addr[1];
+    /* Fills 8 rows at src with one 8-byte pattern derived from the 8 bytes of the row above (src - stride): each half is reduced, 2 is added, the result is shifted right by 2 and packed back to bytes. */
+    __asm__ volatile (
+        "dli        %[tmp0],    0x02                                    \n\t" /* 2 is used below both as the added term and as the shift count */
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t" /* ftmp0 = 0 */
+        PTR_SUBU   "%[addr0],   %[src],         %[stride]               \n\t" /* addr0 = row above the block */
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t" /* ftmp1 = 8 bytes at addr0 (left/right load pair) */
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]                \n\t" /* presumably widens the low 4 bytes to 16-bit lanes */
+        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]                \n\t" /* presumably widens the high 4 bytes to 16-bit lanes */
+        "biadd      %[ftmp2],   %[ftmp2]                                \n\t" /* presumably a horizontal sum of each half */
+        "biadd      %[ftmp3],   %[ftmp3]                                \n\t"
+        "mtc1       %[tmp0],    %[ftmp1]                                \n\t" /* ftmp1 = 2 */
+        "pshufh     %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t" /* selector ftmp0 == 0: presumably replicates lane 0 */
+        "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "pshufh     %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "paddush    %[ftmp2],   %[ftmp2],       %[ftmp1]                \n\t" /* sum + 2 */
+        "paddush    %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "mtc1       %[tmp0],    %[ftmp1]                                \n\t" /* reload 2, this time as the shift amount */
+        "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp1]                \n\t" /* (sum + 2) >> 2 */
+        "psrlh      %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "packushb   %[ftmp1],   %[ftmp2],       %[ftmp3]                \n\t" /* ftmp1 = the 8-byte row pattern */
+        "gssdlc1    %[ftmp1],   0x07(%[src])                            \n\t" /* row 0 */
+        "gssdrc1    %[ftmp1],   0x00(%[src])                            \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        "gssdlc1    %[ftmp1],   0x07(%[src])                            \n\t" /* row 1 */
+        "gssdrc1    %[ftmp1],   0x00(%[src])                            \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        "gssdlc1    %[ftmp1],   0x07(%[src])                            \n\t" /* row 2 */
+        "gssdrc1    %[ftmp1],   0x00(%[src])                            \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        "gssdlc1    %[ftmp1],   0x07(%[src])                            \n\t" /* row 3 */
+        "gssdrc1    %[ftmp1],   0x00(%[src])                            \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        "gssdlc1    %[ftmp1],   0x07(%[src])                            \n\t" /* row 4 */
+        "gssdrc1    %[ftmp1],   0x00(%[src])                            \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        "gssdlc1    %[ftmp1],   0x07(%[src])                            \n\t" /* row 5 */
+        "gssdrc1    %[ftmp1],   0x00(%[src])                            \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        "gssdlc1    %[ftmp1],   0x07(%[src])                            \n\t" /* row 6 */
+        "gssdrc1    %[ftmp1],   0x00(%[src])                            \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        "gssdlc1    %[ftmp1],   0x07(%[src])                            \n\t" /* row 7 */
+        "gssdrc1    %[ftmp1],   0x00(%[src])                            \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [tmp0]"=&r"(tmp[0]),
+          [addr0]"=&r"(addr[0]),
+          [src]"+&r"(src) /* read-write: the asm advances src row by row */
+        : [stride]"r"((mips_reg)stride)
+        : "memory"
+    );
+}
+
+void ff_pred8x8_dc_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    double ftmp[5];
+    mips_reg addr[7];
+    /* DC fill of an 8x8 block as four 4x4 quadrants, computed from the 8 bytes above (row src - stride) and the 8 bytes to the left (column src - 1); the addr[] slots double as plain integer scratch here. */
+    __asm__ volatile (
+        "negu       %[addr0],   %[stride]                               \n\t" /* addr0 = -stride */
+        PTR_ADDU   "%[addr0],   %[addr0],       %[src]                  \n\t" /* addr0 = src - stride: top row, columns 0..3 */
+        PTR_ADDIU  "%[addr1],   %[addr0],       0x04                    \n\t" /* addr1 = top row, columns 4..7 */
+        "lbu        %[addr2],   0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr3],   $0,             %[addr2]                \n\t" /* addr3 = running sum of top columns 0..3 */
+        PTR_ADDIU  "%[addr0],   0x01                                    \n\t"
+        "lbu        %[addr2],   0x00(%[addr1])                          \n\t"
+        PTR_ADDU   "%[addr4],   $0,             %[addr2]                \n\t" /* addr4 = running sum of top columns 4..7 */
+        PTR_ADDIU  "%[addr1],   0x01                                    \n\t"
+        "lbu        %[addr2],   0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr3],   %[addr3],       %[addr2]                \n\t"
+        PTR_ADDIU  "%[addr0],   0x01                                    \n\t"
+        "lbu        %[addr2],   0x00(%[addr1])                          \n\t"
+        PTR_ADDU   "%[addr4],   %[addr4],       %[addr2]                \n\t"
+        PTR_ADDIU  "%[addr1],   0x01                                    \n\t"
+        "lbu        %[addr2],   0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr3],   %[addr3],       %[addr2]                \n\t"
+        PTR_ADDIU  "%[addr0],   0x01                                    \n\t"
+        "lbu        %[addr2],   0x00(%[addr1])                          \n\t"
+        PTR_ADDU   "%[addr4],   %[addr4],       %[addr2]                \n\t"
+        PTR_ADDIU  "%[addr1],   0x01                                    \n\t"
+        "lbu        %[addr2],   0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr3],   %[addr3],       %[addr2]                \n\t"
+        PTR_ADDIU  "%[addr0],   0x01                                    \n\t"
+        "lbu        %[addr2],   0x00(%[addr1])                          \n\t"
+        PTR_ADDU   "%[addr4],   %[addr4],       %[addr2]                \n\t"
+        PTR_ADDIU  "%[addr1],   0x01                                    \n\t"
+        "dli        %[addr2],  -0x01                                    \n\t"
+        PTR_ADDU   "%[addr2],   %[addr2],       %[src]                  \n\t" /* addr2 = src - 1: left column, row 0 */
+        "lbu        %[addr1],   0x00(%[addr2])                          \n\t"
+        PTR_ADDU   "%[addr5],   $0,             %[addr1]                \n\t" /* addr5 = running sum of left rows 0..3 */
+        PTR_ADDU   "%[addr2],   %[addr2],       %[stride]               \n\t"
+        "lbu        %[addr1],   0x00(%[addr2])                          \n\t"
+        PTR_ADDU   "%[addr5],   %[addr5],       %[addr1]                \n\t"
+        PTR_ADDU   "%[addr2],   %[addr2],       %[stride]               \n\t"
+        "lbu        %[addr1],   0x00(%[addr2])                          \n\t"
+        PTR_ADDU   "%[addr5],   %[addr5],       %[addr1]                \n\t"
+        PTR_ADDU   "%[addr2],   %[addr2],       %[stride]               \n\t"
+        "lbu        %[addr1],   0x00(%[addr2])                          \n\t"
+        PTR_ADDU   "%[addr5],   %[addr5],       %[addr1]                \n\t"
+        PTR_ADDU   "%[addr2],   %[addr2],       %[stride]               \n\t"
+        "lbu        %[addr1],   0x00(%[addr2])                          \n\t"
+        PTR_ADDU   "%[addr6],   $0,             %[addr1]                \n\t" /* addr6 = running sum of left rows 4..7 */
+        PTR_ADDU   "%[addr2],   %[addr2],       %[stride]               \n\t"
+        "lbu        %[addr1],   0x00(%[addr2])                          \n\t"
+        PTR_ADDU   "%[addr6],   %[addr6],       %[addr1]                \n\t"
+        PTR_ADDU   "%[addr2],   %[addr2],       %[stride]               \n\t"
+        "lbu        %[addr1],   0x00(%[addr2])                          \n\t"
+        PTR_ADDU   "%[addr6],   %[addr6],       %[addr1]                \n\t"
+        PTR_ADDU   "%[addr2],   %[addr2],       %[stride]               \n\t"
+        "lbu        %[addr1],   0x00(%[addr2])                          \n\t"
+        PTR_ADDU   "%[addr6],   %[addr6],       %[addr1]                \n\t"
+        PTR_ADDU   "%[addr3],   %[addr3],       %[addr5]                \n\t" /* addr3 = top(0..3) + left(0..3) */
+        PTR_ADDIU  "%[addr3],   %[addr3],       0x04                    \n\t" /* +4 before the >> 3 below */
+        PTR_ADDIU  "%[addr4],   %[addr4],       0x02                    \n\t" /* addr4 = top(4..7) + 2 */
+        PTR_ADDIU  "%[addr1],   %[addr6],       0x02                    \n\t" /* addr1 = left(4..7) + 2 */
+        PTR_ADDU   "%[addr2],   %[addr4],       %[addr1]                \n\t" /* addr2 = top(4..7) + left(4..7) + 4 */
+        PTR_SRL    "%[addr3],   0x03                                    \n\t" /* value for the top-left quadrant */
+        PTR_SRL    "%[addr4],   0x02                                    \n\t" /* value for the top-right quadrant */
+        PTR_SRL    "%[addr1],   0x02                                    \n\t" /* value for the bottom-left quadrant */
+        PTR_SRL    "%[addr2],   0x03                                    \n\t" /* value for the bottom-right quadrant */
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t" /* ftmp0 = 0, used as the pshufh selector */
+        "dmtc1      %[addr3],   %[ftmp1]                                \n\t"
+        "pshufh     %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t" /* presumably replicates lane 0 across the register */
+        "dmtc1      %[addr4],   %[ftmp2]                                \n\t"
+        "pshufh     %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "dmtc1      %[addr1],   %[ftmp3]                                \n\t"
+        "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "dmtc1      %[addr2],   %[ftmp4]                                \n\t"
+        "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t" /* pattern for rows 0..3: presumably 4 x top-left value then 4 x top-right value */
+        "packushb   %[ftmp2],   %[ftmp3],       %[ftmp4]                \n\t" /* pattern for rows 4..7: presumably 4 x bottom-left value then 4 x bottom-right value */
+        PTR_ADDU   "%[addr0],   $0,             %[src]                  \n\t" /* addr0 = output row pointer */
+        "sdc1       %[ftmp1],   0x00(%[addr0])                          \n\t" /* NOTE(review): plain sdc1, unlike the gssdlc1/gssdrc1 pairs in neighbouring functions -- presumably relies on 8-byte aligned rows; confirm */
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "sdc1       %[ftmp1],   0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "sdc1       %[ftmp1],   0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "sdc1       %[ftmp1],   0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "sdc1       %[ftmp2],   0x00(%[addr0])                          \n\t" /* rows 4..7 use the second pattern */
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "sdc1       %[ftmp2],   0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "sdc1       %[ftmp2],   0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "sdc1       %[ftmp2],   0x00(%[addr0])                          \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
+          [addr6]"=&r"(addr[6])
+        : [src]"r"((mips_reg)src),          [stride]"r"((mips_reg)stride)
+        : "memory"
+    );
+}
+
+void ff_pred8x16_vertical_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    double ftmp[1];
+    uint64_t tmp[1];
+    /* Copies the 8 bytes of the row above the block (src - stride) into 16 consecutive rows starting at src. */
+    __asm__ volatile (
+        "gsldlc1    %[ftmp0],   0x07(%[srcA])                           \n\t" /* ftmp0 = 8 bytes at srcA (left/right load pair) */
+        "gsldrc1    %[ftmp0],   0x00(%[srcA])                           \n\t"
+        "dli        %[tmp0],    0x04                                    \n\t" /* loop counter: 4 passes x 4 rows = 16 rows */
+        "1:                                                             \n\t"
+        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
+        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
+        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
+        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
+        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
+        "bnez       %[tmp0],    1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),
+          [tmp0]"=&r"(tmp[0]),
+          [src]"+&r"(src) /* read-write: the asm advances src row by row */
+        : [stride]"r"((mips_reg)stride),    [srcA]"r"((mips_reg)(src-stride)) /* srcA: address of the row above, computed in C */
+        : "memory"
+    );
+}
+
+void ff_pred8x16_horizontal_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    uint64_t tmp[3];
+    mips_reg addr[2];
+    /* Writes 16 rows starting at src: each row receives the byte found at column -1 of that row multiplied by ff_pb_1 (presumably 0x01 in every byte, i.e. a byte broadcast -- confirm). */
+    __asm__ volatile (
+        PTR_ADDI   "%[addr0],   %[src],         -0x01                   \n\t" /* addr0 = src - 1: the byte left of the current row */
+        PTR_ADDU   "%[addr1],   %[src],         $0                      \n\t" /* addr1 = src: start of the current output row */
+        "dli        %[tmp0],    0x08                                    \n\t" /* loop counter: 8 passes x 2 rows per pass */
+        "1:                                                             \n\t"
+        "lbu        %[tmp1],    0x00(%[addr0])                          \n\t" /* tmp1 = left neighbour of this row */
+        "dmul       %[tmp2],    %[tmp1],        %[ff_pb_1]              \n\t"
+        "swl        %[tmp2],    0x07(%[addr1])                          \n\t" /* NOTE(review): swl/swr are word stores; confirm this 0x07/0x00 pair covers all 8 bytes for every alignment callers use */
+        "swr        %[tmp2],    0x00(%[addr1])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t" /* step both pointers to the next row */
+        PTR_ADDU   "%[addr1],   %[addr1],       %[stride]               \n\t"
+        "lbu        %[tmp1],    0x00(%[addr0])                          \n\t" /* second row of this pass, same sequence */
+        "dmul       %[tmp2],    %[tmp1],        %[ff_pb_1]              \n\t"
+        "swl        %[tmp2],    0x07(%[addr1])                          \n\t"
+        "swr        %[tmp2],    0x00(%[addr1])                          \n\t"
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr1],   %[addr1],       %[stride]               \n\t"
+        "bnez       %[tmp0],    1b                                      \n\t"
+        : [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          [tmp2]"=&r"(tmp[2]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1])
+        : [src]"r"((mips_reg)src),          [stride]"r"((mips_reg)stride),
+          [ff_pb_1]"r"(ff_pb_1)
+        : "memory"
+    );
+}
+
+static inline void pred16x16_plane_compat_mmi(uint8_t *src, int stride, /* note: stride is int here but ptrdiff_t in the public wrappers */
+        const int svq3, const int rv40) /* svq3 / rv40: non-zero selects that scaling branch; both zero selects the third one */
+{
+    double ftmp[11]; /* only ftmp[0]..ftmp[9] are bound as asm operands below */
+    uint64_t tmp[7];
+    mips_reg addr[1];
+    /* 16x16 plane fill: a weighted sum over the row above and one over the column to the left are scaled (three variants), then 16 rows are produced as sat_u8((column term + row base) >> 5) with the row base stepped once per row. */
+    __asm__ volatile(
+        PTR_SUBU   "%[addr0],   %[src],         %[stride]               \n\t" /* addr0 = row above the block */
+        "dli        %[tmp2],    0x20                                    \n\t"
+        "dmtc1      %[tmp2],    %[ftmp4]                                \n\t" /* ftmp4 = 32, shift count for the dsrl pair below */
+        "gsldlc1    %[ftmp0],   0x06(%[addr0])                          \n\t" /* ftmp0 = bytes at addr0-1 .. addr0+6 */
+        "gsldlc1    %[ftmp2],   0x0f(%[addr0])                          \n\t" /* ftmp2 = bytes at addr0+8 .. addr0+15 */
+        "gsldrc1    %[ftmp0],   -0x01(%[addr0])                         \n\t"
+        "gsldrc1    %[ftmp2],   0x08(%[addr0])                          \n\t"
+        "dsrl       %[ftmp1],   %[ftmp0],       %[ftmp4]                \n\t" /* ftmp1 / ftmp3 = upper 4 bytes of ftmp0 / ftmp2 */
+        "dsrl       %[ftmp3],   %[ftmp2],       %[ftmp4]                \n\t"
+        "xor        %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t" /* presumably widens 4 bytes to 16-bit lanes */
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "pmullh     %[ftmp0],   %[ftmp0],       %[ff_pw_m8tom5]         \n\t" /* per-lane weights; going by the constant names presumably -8..-5, -4..-1, 1..4, 5..8 -- confirm */
+        "pmullh     %[ftmp1],   %[ftmp1],       %[ff_pw_m4tom1]         \n\t"
+        "pmullh     %[ftmp2],   %[ftmp2],       %[ff_pw_1to4]           \n\t"
+        "pmullh     %[ftmp3],   %[ftmp3],       %[ff_pw_5to8]           \n\t"
+        "paddsh     %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "paddsh     %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "dli        %[tmp2],    0x0e                                    \n\t"
+        "dmtc1      %[tmp2],    %[ftmp4]                                \n\t"
+        "pshufh     %[ftmp1],   %[ftmp0],       %[ftmp4]                \n\t" /* two shuffle+add steps: presumably fold the four lanes into lane 0 */
+        "paddsh     %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "dli        %[tmp2],    0x01                                    \n\t"
+        "dmtc1      %[tmp2],    %[ftmp4]                                \n\t"
+        "pshufh     %[ftmp1],   %[ftmp0],       %[ftmp4]                \n\t"
+        "paddsh     %[ftmp5],   %[ftmp0],       %[ftmp1]                \n\t" /* ftmp5 = weighted sum over the top row */
+        /* Left column: gather the bytes at column -1 for rows -1..2 into ftmp0, one byte per 16-bit lane. */
+        PTR_ADDIU  "%[addr0],   %[src],         -0x01                   \n\t"
+        PTR_SUBU   "%[addr0],   %[addr0],       %[stride]               \n\t" /* addr0 = src - 1 - stride */
+        "lbu        %[tmp2],    0x00(%[addr0])                          \n\t"
+        "lbu        %[tmp6],    0x10(%[addr0])                          \n\t" /* tmp6 = byte at column 15 of the row above */
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp3],    0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp4],    0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp5],    0x00(%[addr0])                          \n\t"
+        "dsll       %[tmp3],    %[tmp3],        0x10                    \n\t"
+        "dsll       %[tmp4],    %[tmp4],        0x20                    \n\t"
+        "dsll       %[tmp5],    %[tmp5],        0x30                    \n\t"
+        "or         %[tmp4],    %[tmp4],        %[tmp5]                 \n\t"
+        "or         %[tmp2],    %[tmp2],        %[tmp3]                 \n\t"
+        "or         %[tmp2],    %[tmp2],        %[tmp4]                 \n\t"
+        "dmtc1      %[tmp2],    %[ftmp0]                                \n\t"
+        /* Rows 3..6 of the left column into ftmp1. */
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp2],    0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp3],    0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp4],    0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp5],    0x00(%[addr0])                          \n\t"
+        "dsll       %[tmp3],    %[tmp3],        0x10                    \n\t"
+        "dsll       %[tmp4],    %[tmp4],        0x20                    \n\t"
+        "dsll       %[tmp5],    %[tmp5],        0x30                    \n\t"
+        "or         %[tmp4],    %[tmp4],        %[tmp5]                 \n\t"
+        "or         %[tmp2],    %[tmp2],        %[tmp3]                 \n\t"
+        "or         %[tmp2],    %[tmp2],        %[tmp4]                 \n\t"
+        "dmtc1      %[tmp2],    %[ftmp1]                                \n\t"
+        /* Row 7 is stepped over without a load (two pointer bumps); rows 8..11 go into ftmp2. */
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp2],    0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp3],    0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp4],    0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp5],    0x00(%[addr0])                          \n\t"
+        "dsll       %[tmp3],    %[tmp3],        0x10                    \n\t"
+        "dsll       %[tmp4],    %[tmp4],        0x20                    \n\t"
+        "dsll       %[tmp5],    %[tmp5],        0x30                    \n\t"
+        "or         %[tmp4],    %[tmp4],        %[tmp5]                 \n\t"
+        "or         %[tmp2],    %[tmp2],        %[tmp3]                 \n\t"
+        "or         %[tmp2],    %[tmp2],        %[tmp4]                 \n\t"
+        "dmtc1      %[tmp2],    %[ftmp2]                                \n\t"
+        /* Rows 12..15 of the left column into ftmp3; the row-15 byte also feeds the base term in tmp6. */
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp2],    0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp3],    0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp4],    0x00(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "lbu        %[tmp5],    0x00(%[addr0])                          \n\t"
+        "daddu      %[tmp6],    %[tmp6],        %[tmp5]                 \n\t" /* tmp6 = (top byte at column 15 + left byte at row 15 + 1) << 4 */
+        "daddiu     %[tmp6],    %[tmp6],        0x01                    \n\t"
+        "dsll       %[tmp6],    %[tmp6],        0x04                    \n\t"
+        /* Finish packing rows 12..15. */
+        "dsll       %[tmp3],    %[tmp3],        0x10                    \n\t"
+        "dsll       %[tmp4],    %[tmp4],        0x20                    \n\t"
+        "dsll       %[tmp5],    %[tmp5],        0x30                    \n\t"
+        "or         %[tmp4],    %[tmp4],        %[tmp5]                 \n\t"
+        "or         %[tmp2],    %[tmp2],        %[tmp3]                 \n\t"
+        "or         %[tmp2],    %[tmp2],        %[tmp4]                 \n\t"
+        "dmtc1      %[tmp2],    %[ftmp3]                                \n\t"
+        /* Same weights and lane folding as for the top row, now applied to the left column. */
+        "pmullh     %[ftmp0],   %[ftmp0],       %[ff_pw_m8tom5]         \n\t"
+        "pmullh     %[ftmp1],   %[ftmp1],       %[ff_pw_m4tom1]         \n\t"
+        "pmullh     %[ftmp2],   %[ftmp2],       %[ff_pw_1to4]           \n\t"
+        "pmullh     %[ftmp3],   %[ftmp3],       %[ff_pw_5to8]           \n\t"
+        "paddsh     %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "paddsh     %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "dli        %[tmp2],    0x0e                                    \n\t"
+        "dmtc1      %[tmp2],    %[ftmp4]                                \n\t"
+        "pshufh     %[ftmp1],   %[ftmp0],       %[ftmp4]                \n\t"
+        "paddsh     %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        /* Second folding step. */
+        "dli        %[tmp2],    0x01                                    \n\t"
+        "dmtc1      %[tmp2],    %[ftmp4]                                \n\t"
+        "pshufh     %[ftmp1],   %[ftmp0],       %[ftmp4]                \n\t"
+        "paddsh     %[ftmp6],   %[ftmp0],       %[ftmp1]                \n\t" /* ftmp6 = weighted sum over the left column */
+        /* Move both sums to integer registers, keeping the low 16 bits sign-extended (shift left 48, arithmetic shift right 48). */
+        "dmfc1      %[tmp0],    %[ftmp5]                                \n\t" /* tmp0 = top-row sum */
+        "dsll       %[tmp0],    %[tmp0],        0x30                    \n\t"
+        "dsra       %[tmp0],    %[tmp0],        0x30                    \n\t"
+        "dmfc1      %[tmp1],    %[ftmp6]                                \n\t" /* tmp1 = left-column sum */
+        "dsll       %[tmp1],    %[tmp1],        0x30                    \n\t"
+        "dsra       %[tmp1],    %[tmp1],        0x30                    \n\t"
+        /* svq3 branch: each sum becomes ((sum / 4) * 5) / 16, then tmp0 and tmp1 are swapped. */
+        "beqz       %[svq3],    1f                                      \n\t"
+        "dli        %[tmp2],    0x04                                    \n\t"
+        "ddiv       %[tmp0],    %[tmp0],        %[tmp2]                 \n\t"
+        "ddiv       %[tmp1],    %[tmp1],        %[tmp2]                 \n\t"
+        "dli        %[tmp2],    0x05                                    \n\t"
+        "dmul       %[tmp0],    %[tmp0],        %[tmp2]                 \n\t"
+        "dmul       %[tmp1],    %[tmp1],        %[tmp2]                 \n\t"
+        "dli        %[tmp2],    0x10                                    \n\t"
+        "ddiv       %[tmp0],    %[tmp0],        %[tmp2]                 \n\t"
+        "ddiv       %[tmp1],    %[tmp1],        %[tmp2]                 \n\t"
+        "daddu      %[tmp2],    %[tmp0],        $0                      \n\t" /* swap tmp0 <-> tmp1 through tmp2 */
+        "daddu      %[tmp0],    %[tmp1],        $0                      \n\t"
+        "daddu      %[tmp1],    %[tmp2],        $0                      \n\t"
+        "b          2f                                                  \n\t"
+        /* rv40 branch: each sum becomes (sum + (sum >> 2)) >> 4. */
+        "1:                                                             \n\t"
+        "beqz       %[rv40],    1f                                      \n\t"
+        "dsra       %[tmp2],    %[tmp0],        0x02                    \n\t"
+        "daddu      %[tmp0],    %[tmp0],        %[tmp2]                 \n\t"
+        "dsra       %[tmp2],    %[tmp1],        0x02                    \n\t"
+        "daddu      %[tmp1],    %[tmp1],        %[tmp2]                 \n\t"
+        "dsra       %[tmp0],    %[tmp0],        0x04                    \n\t"
+        "dsra       %[tmp1],    %[tmp1],        0x04                    \n\t"
+        "b          2f                                                  \n\t"
+        /* Remaining branch (svq3 == 0 and rv40 == 0): each sum becomes (5 * sum + 32) >> 6. */
+        "1:                                                             \n\t"
+        "dli        %[tmp2],    0x05                                    \n\t"
+        "dmul       %[tmp0],    %[tmp0],        %[tmp2]                 \n\t"
+        "dmul       %[tmp1],    %[tmp1],        %[tmp2]                 \n\t"
+        "daddiu     %[tmp0],    %[tmp0],        0x20                    \n\t"
+        "daddiu     %[tmp1],    %[tmp1],        0x20                    \n\t"
+        "dsra       %[tmp0],    %[tmp0],        0x06                    \n\t"
+        "dsra       %[tmp1],    %[tmp1],        0x06                    \n\t"
+        /* Common tail: base term tmp6 -= 7 * (tmp0 + tmp1). */
+        "2:                                                             \n\t"
+        "daddu      %[tmp3],    %[tmp0],        %[tmp1]                 \n\t"
+        "dli        %[tmp2],    0x07                                    \n\t"
+        "dmul       %[tmp3],    %[tmp3],        %[tmp2]                 \n\t"
+        "dsubu      %[tmp6],    %[tmp6],        %[tmp3]                 \n\t"
+        /* Replicate tmp0 / tmp1 / tmp6 across lanes (pshufh with a zero selector) and build the four per-column terms. */
+        "xor        %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
+        "dmtc1      %[tmp0],    %[ftmp0]                                \n\t"
+        "pshufh     %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t" /* ftmp0 = tmp0 in every lane (per-column slope) */
+        "dmtc1      %[tmp1],    %[ftmp5]                                \n\t"
+        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t" /* ftmp5 = tmp1 in every lane (per-row step) */
+        "dmtc1      %[tmp6],    %[ftmp6]                                \n\t"
+        "pshufh     %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t" /* ftmp6 = base term for the current row */
+        "dli        %[tmp2],    0x05                                    \n\t"
+        "dmtc1      %[tmp2],    %[ftmp7]                                \n\t" /* ftmp7 = 5, shift count for psrah in the loop */
+        "pmullh     %[ftmp1],   %[ff_pw_0to3],  %[ftmp0]                \n\t" /* column terms; going by the constant names presumably for columns 0..3, 4..7, 8..11, 12..15 */
+        "dmtc1      %[ff_pw_4to7],              %[ftmp2]                \n\t"
+        "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "dmtc1      %[ff_pw_8tob],              %[ftmp3]                \n\t"
+        "pmullh     %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "dmtc1      %[ff_pw_ctof],              %[ftmp4]                \n\t"
+        "pmullh     %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        /* Output loop: 16 rows, 16 bytes per row. */
+        "dli        %[tmp0],    0x10                                    \n\t"
+        PTR_ADDU   "%[addr0],   %[src],         $0                      \n\t"
+        "1:                                                             \n\t"
+        "paddsh     %[ftmp8],   %[ftmp1],       %[ftmp6]                \n\t"
+        "psrah      %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        "paddsh     %[ftmp9],   %[ftmp2],       %[ftmp6]                \n\t"
+        "psrah      %[ftmp9],   %[ftmp9],       %[ftmp7]                \n\t"
+        "packushb   %[ftmp0],   %[ftmp8],       %[ftmp9]                \n\t" /* pack to bytes (presumably with unsigned saturation) */
+        "gssdlc1    %[ftmp0],   0x07(%[addr0])                          \n\t" /* bytes 0..7 of the row */
+        "gssdrc1    %[ftmp0],   0x00(%[addr0])                          \n\t"
+        /* Second half of the row. */
+        "paddsh     %[ftmp8],   %[ftmp3],       %[ftmp6]                \n\t"
+        "psrah      %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        "paddsh     %[ftmp9],   %[ftmp4],       %[ftmp6]                \n\t"
+        "psrah      %[ftmp9],   %[ftmp9],       %[ftmp7]                \n\t"
+        "packushb   %[ftmp0],   %[ftmp8],       %[ftmp9]                \n\t"
+        "gssdlc1    %[ftmp0],   0x0f(%[addr0])                          \n\t" /* bytes 8..15 of the row */
+        "gssdrc1    %[ftmp0],   0x08(%[addr0])                          \n\t"
+        /* Advance to the next row: base term += per-row step, pointer += stride. */
+        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "daddiu     %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        "bnez       %[tmp0],    1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          [tmp2]"=&r"(tmp[2]),              [tmp3]"=&r"(tmp[3]),
+          [tmp4]"=&r"(tmp[4]),              [tmp5]"=&r"(tmp[5]),
+          [tmp6]"=&r"(tmp[6]),
+          [addr0]"=&r"(addr[0])
+        : [src]"r"(src),                    [stride]"r"((mips_reg)stride),
+          [svq3]"r"(svq3),                  [rv40]"r"(rv40),
+          [ff_pw_m8tom5]"f"(ff_pw_m8tom5),  [ff_pw_m4tom1]"f"(ff_pw_m4tom1),
+          [ff_pw_1to4]"f"(ff_pw_1to4),      [ff_pw_5to8]"f"(ff_pw_5to8),
+          [ff_pw_0to3]"f"(ff_pw_0to3),      [ff_pw_4to7]"r"(ff_pw_4to7), /* the last three weight constants arrive in integer registers and are moved with dmtc1 above */
+          [ff_pw_8tob]"r"(ff_pw_8tob),      [ff_pw_ctof]"r"(ff_pw_ctof)
+        : "memory"
+    );
+}
+
+void ff_pred16x16_plane_h264_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    pred16x16_plane_compat_mmi(src, stride, 0, 0); /* svq3 = 0, rv40 = 0: the shared helper takes its third (non-svq3, non-rv40) scaling branch */
+}
+
+void ff_pred16x16_plane_svq3_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    pred16x16_plane_compat_mmi(src, stride, 1, 0); /* svq3 = 1: the shared helper takes its svq3 scaling branch */
+}
+
+void ff_pred16x16_plane_rv40_8_mmi(uint8_t *src, ptrdiff_t stride)
+{
+    pred16x16_plane_compat_mmi(src, stride, 0, 1); /* rv40 = 1: the shared helper takes its rv40 scaling branch */
+}
diff --git a/libavcodec/mips/h264pred_msa.c b/libavcodec/mips/h264pred_msa.c
new file mode 100644
index 0000000..cddcd2e
--- /dev/null
+++ b/libavcodec/mips/h264pred_msa.c
@@ -0,0 +1,723 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "h264dsp_mips.h"
+
+/* Fill an 8x8 block: every one of the 8 rows at dst receives the same
+ * eight bytes, read once from src as two 32-bit words. */
+static void intra_predict_vert_8x8_msa(uint8_t *src, uint8_t *dst,
+                                       int32_t dst_stride)
+{
+    const uint32_t left_half  = LW(src);
+    const uint32_t right_half = LW(src + 4);
+    uint32_t rows_left = 8;
+
+    while (rows_left--) {
+        SW(left_half, dst);
+        SW(right_half, dst + 4);
+        dst += dst_stride;
+    }
+}
+
+/* Replicate the 16 bytes at src into 16 consecutive rows starting at dst. */
+static void intra_predict_vert_16x16_msa(uint8_t *src, uint8_t *dst,
+                                         int32_t dst_stride)
+{
+    const v16u8 top_row = LD_UB(src);
+    uint32_t rows_left;
+
+    /* rows are dst_stride bytes apart */
+    for (rows_left = 16; rows_left > 0; rows_left--) {
+        ST_UB(top_row, dst);
+        dst += dst_stride;
+    }
+}
+
+/* Horizontal 8x8: row r of dst becomes 8 copies of src[r * src_stride].
+ * The factor is unsigned so bytes >= 128 cannot overflow a signed product. */
+static void intra_predict_horiz_8x8_msa(uint8_t *src, int32_t src_stride,
+                                        uint8_t *dst, int32_t dst_stride)
+{
+    const uint64_t out0 = src[0 * src_stride] * 0x0101010101010101ULL;
+    const uint64_t out1 = src[1 * src_stride] * 0x0101010101010101ULL;
+    const uint64_t out2 = src[2 * src_stride] * 0x0101010101010101ULL;
+    const uint64_t out3 = src[3 * src_stride] * 0x0101010101010101ULL;
+    const uint64_t out4 = src[4 * src_stride] * 0x0101010101010101ULL;
+    const uint64_t out5 = src[5 * src_stride] * 0x0101010101010101ULL;
+    const uint64_t out6 = src[6 * src_stride] * 0x0101010101010101ULL;
+    const uint64_t out7 = src[7 * src_stride] * 0x0101010101010101ULL;
+
+    SD4(out0, out1, out2, out3, dst, dst_stride);
+    dst += 4 * dst_stride;
+    SD4(out4, out5, out6, out7, dst, dst_stride);
+}
+
+/* Horizontal 16x16: row r of dst is filled with the byte src[r * src_stride].
+ * Each pass of the loop handles four rows. */
+static void intra_predict_horiz_16x16_msa(uint8_t *src, int32_t src_stride,
+                                          uint8_t *dst, int32_t dst_stride)
+{
+    uint32_t group;
+
+    for (group = 0; group < 4; group++) {
+        /* one source byte per output row */
+        const uint8_t left0 = src[0 * src_stride];
+        const uint8_t left1 = src[1 * src_stride];
+        const uint8_t left2 = src[2 * src_stride];
+        const uint8_t left3 = src[3 * src_stride];
+
+        /* broadcast each byte across a full 16-byte vector */
+        const v16u8 row0 = (v16u8) __msa_fill_b(left0);
+        const v16u8 row1 = (v16u8) __msa_fill_b(left1);
+        const v16u8 row2 = (v16u8) __msa_fill_b(left2);
+        const v16u8 row3 = (v16u8) __msa_fill_b(left3);
+
+        ST_UB4(row0, row1, row2, row3, dst, dst_stride);
+
+        src += 4 * src_stride;
+        dst += 4 * dst_stride;
+    }
+}
+
+/* 8x8 DC prediction: the block at dst is filled with a single byte value.
+ *   is_above && is_left: (8 top bytes + 8 left bytes + 8) >> 4
+ *   is_left only:        (8 left bytes + 4) >> 3
+ *   is_above only:       (8 top bytes + 4) >> 3
+ *   neither:             128
+ * Left bytes are read at src_left[i * src_stride_left]; dst rows are
+ * dst_stride bytes apart. */
+static void intra_predict_dc_8x8_msa(uint8_t *src_top, uint8_t *src_left,
+                                     int32_t src_stride_left,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     uint8_t is_above, uint8_t is_left)
+{
+    uint32_t i, dc_word, acc = 0;
+    v16u8 top, dc_vec;
+    v8u16 pair_sums;
+    v4u32 quad_sums;
+    v2u64 top_sum;
+
+    if (is_above) {
+        /* widening pairwise adds: lane 0 ends up as the sum of bytes 0..7 */
+        top = LD_UB(src_top);
+        pair_sums = __msa_hadd_u_h(top, top);
+        quad_sums = __msa_hadd_u_w(pair_sums, pair_sums);
+        top_sum = __msa_hadd_u_d(quad_sums, quad_sums);
+    }
+
+    if (is_left) {
+        if (is_above)
+            acc = __msa_copy_u_w((v4i32) top_sum, 0);
+
+        for (i = 0; i < 8; i++)
+            acc += src_left[i * src_stride_left];
+
+        /* 16 samples when the top row took part, otherwise 8 */
+        acc = is_above ? (acc + 8) >> 4 : (acc + 4) >> 3;
+        dc_vec = (v16u8) __msa_fill_b(acc);
+    } else if (is_above) {
+        /* rounding shift: (sum + 4) >> 3 */
+        top_sum = (v2u64) __msa_srari_d((v2i64) top_sum, 3);
+        dc_vec = (v16u8) __msa_splati_b((v16i8) top_sum, 0);
+    } else {
+        dc_vec = (v16u8) __msa_ldi_b(128);
+    }
+
+    /* all bytes of dc_vec are equal, so one 32-bit word serves both halves */
+    dc_word = __msa_copy_u_w((v4i32) dc_vec, 0);
+
+    for (i = 0; i < 8; i++) {
+        SW(dc_word, dst);
+        SW(dc_word, dst + 4);
+        dst += dst_stride;
+    }
+}
+
+/* 16x16 DC prediction: all 16 rows at dst receive one byte value.
+ *   is_above && is_left: (16 top bytes + 16 left bytes + 16) >> 5
+ *   is_left only:        (16 left bytes + 8) >> 4
+ *   is_above only:       (16 top bytes + 8) >> 4
+ *   neither:             128
+ * Left bytes are read at src_left[i * src_stride_left]; dst rows are
+ * dst_stride bytes apart. */
+static void intra_predict_dc_16x16_msa(uint8_t *src_top, uint8_t *src_left,
+                                       int32_t src_stride_left,
+                                       uint8_t *dst, int32_t dst_stride,
+                                       uint8_t is_above, uint8_t is_left)
+{
+    uint32_t i, acc = 0;
+    v16u8 top, dc_vec;
+    v8u16 pair_sums;
+    v4u32 quad_sums;
+    v2u64 top_sum;
+
+    if (is_above) {
+        /* widening pairwise adds give one partial sum per 8-byte half */
+        top = LD_UB(src_top);
+        pair_sums = __msa_hadd_u_h(top, top);
+        quad_sums = __msa_hadd_u_w(pair_sums, pair_sums);
+        top_sum = __msa_hadd_u_d(quad_sums, quad_sums);
+
+        /* fold the two partial sums together into lane 0 */
+        quad_sums = (v4u32) __msa_pckev_w((v4i32) top_sum, (v4i32) top_sum);
+        top_sum = __msa_hadd_u_d(quad_sums, quad_sums);
+    }
+
+    if (is_left) {
+        if (is_above)
+            acc = __msa_copy_u_w((v4i32) top_sum, 0);
+
+        for (i = 0; i < 16; i++)
+            acc += src_left[i * src_stride_left];
+
+        /* 32 samples when the top row took part, otherwise 16 */
+        acc = is_above ? (acc + 16) >> 5 : (acc + 8) >> 4;
+        dc_vec = (v16u8) __msa_fill_b(acc);
+    } else if (is_above) {
+        /* rounding shift: (sum + 8) >> 4 */
+        top_sum = (v2u64) __msa_srari_d((v2i64) top_sum, 4);
+        dc_vec = (v16u8) __msa_splati_b((v16i8) top_sum, 0);
+    } else {
+        dc_vec = (v16u8) __msa_ldi_b(128);
+    }
+
+    /* one full 16-byte store per row */
+    for (i = 0; i < 16; i++) {
+        ST_UB(dc_vec, dst);
+        dst += dst_stride;
+    }
+}
+
+/* Defines intra_predict_<val>dc_8x8_msa(): fills an 8x8 block at dst with
+ * the constant byte val. No ';' after the uses: they expand to functions. */
+#define INTRA_PREDICT_VALDC_8X8_MSA(val)                               \
+static void intra_predict_##val##dc_8x8_msa(uint8_t *dst,              \
+                                            int32_t dst_stride)        \
+{                                                                      \
+    const v16i8 fill_vec = __msa_ldi_b(val);                           \
+    const uint32_t fill_word = __msa_copy_u_w((v4i32) fill_vec, 0);    \
+    uint32_t rows_left = 8;                                            \
+                                                                       \
+    while (rows_left--) {                                              \
+        SW(fill_word, dst);                                            \
+        SW(fill_word, dst + 4);                                        \
+        dst += dst_stride;                                             \
+    }                                                                  \
+}
+
+INTRA_PREDICT_VALDC_8X8_MSA(127)
+INTRA_PREDICT_VALDC_8X8_MSA(129)
+
+/* Defines intra_predict_<val>dc_16x16_msa(): fills 16 rows of 16 bytes at
+ * dst with the byte val. No ';' after the uses: they expand to functions. */
+#define INTRA_PREDICT_VALDC_16X16_MSA(val)                         \
+static void intra_predict_##val##dc_16x16_msa(uint8_t *dst,        \
+                                              int32_t dst_stride)  \
+{                                                                  \
+    const v16u8 fill_vec = (v16u8) __msa_ldi_b(val);               \
+    uint32_t rows_left = 16;                                       \
+                                                                   \
+    while (rows_left--) {                                          \
+        ST_UB(fill_vec, dst);                                      \
+        dst += dst_stride;                                         \
+    }                                                              \
+}
+
+INTRA_PREDICT_VALDC_16X16_MSA(127)
+INTRA_PREDICT_VALDC_16X16_MSA(129)
+
+static void intra_predict_plane_8x8_msa(uint8_t *src, int32_t stride)
+{   /* in-place 8x8 plane prediction from the row above / column left */
+    uint8_t lpcnt;
+    int32_t res, res0, res1, res2, res3;
+    uint64_t out0, out1;
+    v16i8 shf_mask = { 3, 5, 2, 6, 1, 7, 0, 8, 3, 5, 2, 6, 1, 7, 0, 8 };
+    v8i16 short_multiplier = { 1, 2, 3, 4, 1, 2, 3, 4 };
+    v4i32 int_multiplier = { 0, 1, 2, 3 };
+    v16u8 src_top;
+    v8i16 vec9, vec10, vec11;
+    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8;
+    v2i64 sum;
+    /* 16 bytes of the row above, starting one column left of src */
+    src_top = LD_UB(src - (stride + 1));
+    src_top = (v16u8) __msa_vshf_b(shf_mask, (v16i8) src_top, (v16i8) src_top);
+    /* differences of the byte pairs chosen by shf_mask, weighted 1..4 */
+    vec9 = __msa_hsub_u_h(src_top, src_top);
+    vec9 *= short_multiplier;
+    vec8 = __msa_hadd_s_w(vec9, vec9);
+    sum = __msa_hadd_s_d(vec8, vec8);
+    /* res0: the reduced horizontal sum, taken from lane 0 */
+    res0 = __msa_copy_s_w((v4i32) sum, 0);
+    /* res1: the same weighted differences down the column at src[-1] */
+    res1 = (src[4 * stride - 1] - src[2 * stride - 1]) +
+        2 * (src[5 * stride - 1] - src[stride - 1]) +
+        3 * (src[6 * stride - 1] - src[-1]) +
+        4 * (src[7 * stride - 1] - src[-stride - 1]);
+    /* scale both gradients: (17 * g + 16) >> 5 */
+    res0 *= 17;
+    res1 *= 17;
+    res0 = (res0 + 16) >> 5;
+    res1 = (res1 + 16) >> 5;
+    /* res: 16 * (two edge bytes + 1) minus 3 * (res0 + res1) */
+    res3 = 3 * (res0 + res1);
+    res2 = 16 * (src[7 * stride - 1] + src[-stride + 7] + 1);
+    res = res2 - res3;
+    /* broadcast; vec5 = res0 * {0,1,2,3}, vec3 = res0 * 4 (columns 4..7) */
+    vec8 = __msa_fill_w(res0);
+    vec4 = __msa_fill_w(res);
+    vec2 = __msa_fill_w(res1);
+    vec5 = vec8 * int_multiplier;
+    vec3 = vec8 * 4;
+    /* two output rows per iteration; vec4 advances by res1 per row */
+    for (lpcnt = 4; lpcnt--;) {
+        vec0 = vec5;
+        vec0 += vec4;
+        vec1 = vec0 + vec3;
+        vec6 = vec5;
+        vec4 += vec2;
+        vec6 += vec4;
+        vec7 = vec6 + vec3;
+        /* >> 5, then narrow; CLIP_SH2_0_255 presumably clamps to 0..255 */
+        SRA_4V(vec0, vec1, vec6, vec7, 5);
+        PCKEV_H2_SH(vec1, vec0, vec7, vec6, vec10, vec11);
+        CLIP_SH2_0_255(vec10, vec11);
+        PCKEV_B2_SH(vec10, vec10, vec11, vec11, vec10, vec11);
+        /* one 64-bit store per row */
+        out0 = __msa_copy_s_d((v2i64) vec10, 0);
+        out1 = __msa_copy_s_d((v2i64) vec11, 0);
+        SD(out0, src);
+        src += stride;
+        SD(out1, src);
+        src += stride;
+
+        vec4 += vec2;
+    }
+}
+
+static void intra_predict_plane_16x16_msa(uint8_t *src, int32_t stride)
+{   /* in-place 16x16 plane prediction from the row above / column left */
+    uint8_t lpcnt;
+    int32_t res0, res1, res2, res3;
+    uint64_t load0, load1;
+    v16i8 shf_mask = { 7, 8, 6, 9, 5, 10, 4, 11, 3, 12, 2, 13, 1, 14, 0, 15 };
+    v8i16 short_multiplier = { 1, 2, 3, 4, 5, 6, 7, 8 };
+    v4i32 int_multiplier = { 0, 1, 2, 3 };
+    v16u8 src_top = { 0 };
+    v8i16 vec9, vec10;
+    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, res_add;
+    /* row above, from src - stride - 1: 64-bit loads at offsets 0 and 9 */
+    load0 = LD(src - (stride + 1));
+    load1 = LD(src - (stride + 1) + 9);
+    /* presumably packs both loads into src_top; offset 8 is not loaded */
+    INSERT_D2_UB(load0, load1, src_top);
+
+    src_top = (v16u8) __msa_vshf_b(shf_mask, (v16i8) src_top, (v16i8) src_top);
+    /* differences of the byte pairs chosen by shf_mask, weighted 1..8 */
+    vec9 = __msa_hsub_u_h(src_top, src_top);
+    vec9 *= short_multiplier;
+    vec8 = __msa_hadd_s_w(vec9, vec9);
+    res_add = (v4i32) __msa_hadd_s_d(vec8, vec8);
+    /* res0: lanes 0 and 2 of the reduction added together */
+    res0 = __msa_copy_s_w(res_add, 0) + __msa_copy_s_w(res_add, 2);
+    /* res1: the same weighted differences down the column at src[-1] */
+    res1 = (src[8 * stride - 1] - src[6 * stride - 1]) +
+        2 * (src[9 * stride - 1] - src[5 * stride - 1]) +
+        3 * (src[10 * stride - 1] - src[4 * stride - 1]) +
+        4 * (src[11 * stride - 1] - src[3 * stride - 1]) +
+        5 * (src[12 * stride - 1] - src[2 * stride - 1]) +
+        6 * (src[13 * stride - 1] - src[stride - 1]) +
+        7 * (src[14 * stride - 1] - src[-1]) +
+        8 * (src[15 * stride - 1] - src[-1 * stride - 1]);
+    /* scale both gradients: (5 * g + 32) >> 6 */
+    res0 *= 5;
+    res1 *= 5;
+    res0 = (res0 + 32) >> 6;
+    res1 = (res1 + 32) >> 6;
+    /* res2: 16 * (two edge bytes + 1) minus 7 * (res0 + res1) */
+    res3 = 7 * (res0 + res1);
+    res2 = 16 * (src[15 * stride - 1] + src[-stride + 15] + 1);
+    res2 -= res3;
+    /* broadcast; vec7 = res0 * {0,1,2,3}, vec6 = res0 * 4 per column group */
+    vec8 = __msa_fill_w(res0);
+    vec4 = __msa_fill_w(res2);
+    vec5 = __msa_fill_w(res1);
+    vec6 = vec8 * 4;
+    vec7 = vec8 * int_multiplier;
+    /* one output row per iteration; vec4 advances by res1 per row */
+    for (lpcnt = 16; lpcnt--;) {
+        vec0 = vec7;
+        vec0 += vec4;
+        vec1 = vec0 + vec6;
+        vec2 = vec1 + vec6;
+        vec3 = vec2 + vec6;
+        /* >> 5, then narrow; CLIP_SH2_0_255 presumably clamps to 0..255 */
+        SRA_4V(vec0, vec1, vec2, vec3, 5);
+        PCKEV_H2_SH(vec1, vec0, vec3, vec2, vec9, vec10);
+        CLIP_SH2_0_255(vec9, vec10);
+        PCKEV_ST_SB(vec9, vec10, src);
+        src += stride;
+
+        vec4 += vec5;
+    }
+}
+
+/* 8x8 DC prediction per 4x4 quadrant, written in place at src:
+ *   top-left     = (top 0-3 + left 0-3 + 4) >> 3
+ *   top-right    = (top 4-7 + 2) >> 2
+ *   bottom-left  = (left 4-7 + 2) >> 2
+ *   bottom-right = (top 4-7 + left 4-7 + 4) >> 3
+ * "top" is the row at src - stride, "left" the column at src[-1]. */
+static void intra_predict_dc_4blk_8x8_msa(uint8_t *src, int32_t stride)
+{
+    int i;
+    uint32_t top_l, top_r, left_u = 0, left_d = 0;
+    uint32_t dc_tl, dc_tr, dc_bl, dc_br;
+    v16u8 top = LD_UB(src - stride);
+    v8u16 pair_sums = __msa_hadd_u_h(top, top);
+    v4u32 quad_sums = __msa_hadd_u_w(pair_sums, pair_sums);
+
+    top_l = __msa_copy_u_w((v4i32) quad_sums, 0);
+    top_r = __msa_copy_u_w((v4i32) quad_sums, 1);
+
+    for (i = 0; i < 4; i++) {
+        left_u += src[i * stride - 1];
+        left_d += src[(4 + i) * stride - 1];
+    }
+
+    /* replicate each quadrant's DC byte into a 32-bit word */
+    dc_tl = ((top_l + left_u + 4) >> 3) * 0x01010101;
+    dc_tr = ((top_r + 2) >> 2) * 0x01010101;
+    dc_bl = ((left_d + 2) >> 2) * 0x01010101;
+    dc_br = ((top_r + left_d + 4) >> 3) * 0x01010101;
+
+    for (i = 0; i < 4; i++) {
+        SW(dc_tl, src);
+        SW(dc_tr, src + 4);
+        SW(dc_bl, src + 4 * stride);
+        SW(dc_br, src + 4 * stride + 4);
+        src += stride;
+    }
+}
+
+/* Left-only DC for an 8x8 block, written in place at src: rows 0-3 get the
+ * rounded mean of left bytes 0-3, rows 4-7 that of left bytes 4-7. */
+static void intra_predict_hor_dc_8x8_msa(uint8_t *src, int32_t stride)
+{
+    uint8_t lp_cnt;
+    uint32_t sum_upper = 0, sum_lower = 0;
+    uint64_t fill_upper, fill_lower;
+
+    for (lp_cnt = 0; lp_cnt < 4; lp_cnt++) {
+        sum_upper += src[lp_cnt * stride - 1];
+        sum_lower += src[(4 + lp_cnt) * stride - 1];
+    }
+    /* ULL: the signed 64-bit factor overflowed (UB) for means >= 128 */
+    fill_upper = ((sum_upper + 2) >> 2) * 0x0101010101010101ULL;
+    fill_lower = ((sum_lower + 2) >> 2) * 0x0101010101010101ULL;
+
+    for (lp_cnt = 4; lp_cnt--;) {
+        SD(fill_upper, src);
+        SD(fill_lower, src + 4 * stride);
+        src += stride;
+    }
+}
+
+/* Top-only DC for an 8x8 block, written in place at src: columns 0-3 get
+ * the rounded mean of top bytes 0-3, columns 4-7 that of top bytes 4-7. */
+static void intra_predict_vert_dc_8x8_msa(uint8_t *src, int32_t stride)
+{
+    uint32_t row, word_l, word_r;
+    v16u8 top = LD_UB(src - stride);
+    v8u16 pair_sums = __msa_hadd_u_h(top, top);
+    v4u32 quad_sums = __msa_hadd_u_w(pair_sums, pair_sums);
+    v4i32 means, bytes_l, bytes_r;
+
+    /* rounding shift: (sum + 2) >> 2 in every 32-bit lane */
+    means = __msa_srari_w((v4i32) quad_sums, 2);
+    /* splat byte 0 and byte 4: one byte from each of the first two lanes */
+    bytes_l = (v4i32) __msa_splati_b((v16i8) means, 0);
+    bytes_r = (v4i32) __msa_splati_b((v16i8) means, 4);
+    word_l = __msa_copy_u_w(bytes_l, 0);
+    word_r = __msa_copy_u_w(bytes_r, 0);
+
+    for (row = 0; row < 8; row++) {
+        SW(word_l, src);
+        SW(word_r, src + 4);
+        src += stride;
+    }
+}
+
+/* Partial-neighbour 8x8 DC, written in place at src. Uses the top row and
+ * left bytes 0-3 only:
+ *   top-left quadrant       = (top 0-3 + left 0-3 + 4) >> 3
+ *   bottom-left quadrant    = (top 0-3 + 2) >> 2
+ *   top- and bottom-right   = (top 4-7 + 2) >> 2 */
+static void intra_predict_mad_cow_dc_l0t_8x8_msa(uint8_t *src, int32_t stride)
+{
+    int i;
+    uint32_t top_l, top_r, left_u = 0;
+    uint32_t dc_tl, dc_bl, dc_right;
+    v16u8 top = LD_UB(src - stride);
+    v8u16 pair_sums = __msa_hadd_u_h(top, top);
+    v4u32 quad_sums = __msa_hadd_u_w(pair_sums, pair_sums);
+
+    top_l = __msa_copy_u_w((v4i32) quad_sums, 0);
+    top_r = __msa_copy_u_w((v4i32) quad_sums, 1);
+
+    for (i = 0; i < 4; i++)
+        left_u += src[i * stride - 1];
+
+    /* one replicated 32-bit word per distinct quadrant value */
+    dc_tl    = ((top_l + left_u + 4) >> 3) * 0x01010101;
+    dc_bl    = ((top_l + 2) >> 2) * 0x01010101;
+    dc_right = ((top_r + 2) >> 2) * 0x01010101;
+
+    for (i = 0; i < 4; i++) {
+        SW(dc_tl, src);
+        SW(dc_right, src + 4);
+        SW(dc_bl, src + stride * 4);
+        SW(dc_right, src + stride * 4 + 4);
+        src += stride;
+    }
+}
+
+/* Partial-neighbour 8x8 DC, written in place at src. Uses the top row and
+ * left bytes 4-7 only:
+ *   top-left     = (top 0-3 + 2) >> 2
+ *   top-right    = (top 4-7 + 2) >> 2
+ *   bottom-left  = (left 4-7 + 2) >> 2
+ *   bottom-right = (top 4-7 + left 4-7 + 4) >> 3 */
+static void intra_predict_mad_cow_dc_0lt_8x8_msa(uint8_t *src, int32_t stride)
+{
+    int i;
+    uint32_t top_l, top_r, left_d = 0;
+    uint32_t dc_tl, dc_tr, dc_bl, dc_br;
+    v16u8 top = LD_UB(src - stride);
+    v8u16 pair_sums = __msa_hadd_u_h(top, top);
+    v4u32 quad_sums = __msa_hadd_u_w(pair_sums, pair_sums);
+
+    top_l = __msa_copy_u_w((v4i32) quad_sums, 0);
+    top_r = __msa_copy_u_w((v4i32) quad_sums, 1);
+
+    for (i = 0; i < 4; i++) {
+        left_d += src[(4 + i) * stride - 1];
+    }
+
+    /* replicate each quadrant's DC byte into a 32-bit word */
+    dc_tl = ((top_l + 2) >> 2) * 0x01010101;
+    dc_tr = ((top_r + 2) >> 2) * 0x01010101;
+    dc_bl = ((left_d + 2) >> 2) * 0x01010101;
+    dc_br = ((top_r + left_d + 4) >> 3) * 0x01010101;
+
+    /* rows 0-3 and rows 4-7 are written in the same pass */
+    for (i = 0; i < 4; i++) {
+        SW(dc_tl, src);
+        SW(dc_tr, src + 4);
+        SW(dc_bl, src + stride * 4);
+        SW(dc_br, src + stride * 4 + 4);
+        src += stride;
+    }
+}
+
+/* 8x8 DC using left bytes 0-3 only, in place at src: rows 0-3 get their
+ * rounded mean, rows 4-7 get the constant byte 0x80. */
+static void intra_predict_mad_cow_dc_l00_8x8_msa(uint8_t *src, int32_t stride)
+{
+    uint8_t lp_cnt;
+    uint32_t left_sum = 0;
+    uint64_t fill_upper, fill_lower = 0x8080808080808080ULL;
+
+    for (lp_cnt = 0; lp_cnt < 4; lp_cnt++)
+        left_sum += src[lp_cnt * stride - 1];
+
+    /* ULL: the signed 64-bit factor overflowed (UB) for means >= 128 */
+    fill_upper = ((left_sum + 2) >> 2) * 0x0101010101010101ULL;
+
+    for (lp_cnt = 4; lp_cnt--;) {
+        SD(fill_upper, src);
+        SD(fill_lower, src + stride * 4);
+        src += stride;
+    }
+}
+
+/* 8x8 DC using left bytes 4-7 only, in place at src: rows 0-3 get the
+ * constant byte 0x80, rows 4-7 get the rounded mean of those left bytes. */
+static void intra_predict_mad_cow_dc_0l0_8x8_msa(uint8_t *src, int32_t stride)
+{
+    uint8_t lp_cnt;
+    uint32_t left_sum = 0;
+    uint64_t fill_lower, fill_upper = 0x8080808080808080ULL;
+
+    for (lp_cnt = 0; lp_cnt < 4; lp_cnt++)
+        left_sum += src[(4 + lp_cnt) * stride - 1];
+
+    /* ULL keeps the byte replication unsigned: the signed 64-bit factor
+     * overflowed (undefined behaviour) for any mean >= 128 */
+    fill_lower = ((left_sum + 2) >> 2) * 0x0101010101010101ULL;
+
+    for (lp_cnt = 4; lp_cnt--;) {
+        SD(fill_upper, src);
+        SD(fill_lower, src + stride * 4);
+        src += stride;
+    }
+}
+
+void ff_h264_intra_predict_plane_8x8_msa(uint8_t *src, ptrdiff_t stride)
+{   /* exported wrapper; stride is passed on to an int32_t parameter */
+    intra_predict_plane_8x8_msa(src, stride);
+}
+
+void ff_h264_intra_predict_dc_4blk_8x8_msa(uint8_t *src, ptrdiff_t stride)
+{   /* exported wrapper for the per-quadrant 8x8 DC helper */
+    intra_predict_dc_4blk_8x8_msa(src, stride);
+}
+
+void ff_h264_intra_predict_hor_dc_8x8_msa(uint8_t *src, ptrdiff_t stride)
+{   /* exported wrapper for the left-column-only 8x8 DC helper */
+    intra_predict_hor_dc_8x8_msa(src, stride);
+}
+
+void ff_h264_intra_predict_vert_dc_8x8_msa(uint8_t *src, ptrdiff_t stride)
+{   /* exported wrapper for the top-row-only 8x8 DC helper */
+    intra_predict_vert_dc_8x8_msa(src, stride);
+}
+
+void ff_h264_intra_predict_mad_cow_dc_l0t_8x8_msa(uint8_t *src,
+                                                  ptrdiff_t stride)
+{   /* exported wrapper; the helper reads the top row and left bytes 0-3 */
+    intra_predict_mad_cow_dc_l0t_8x8_msa(src, stride);
+}
+
+void ff_h264_intra_predict_mad_cow_dc_0lt_8x8_msa(uint8_t *src,
+                                                  ptrdiff_t stride)
+{   /* exported wrapper; the helper reads the top row and left bytes 4-7 */
+    intra_predict_mad_cow_dc_0lt_8x8_msa(src, stride);
+}
+
+void ff_h264_intra_predict_mad_cow_dc_l00_8x8_msa(uint8_t *src,
+                                                  ptrdiff_t stride)
+{   /* exported wrapper; helper uses left bytes 0-3, rows 4-7 become 0x80 */
+    intra_predict_mad_cow_dc_l00_8x8_msa(src, stride);
+}
+
+void ff_h264_intra_predict_mad_cow_dc_0l0_8x8_msa(uint8_t *src,
+                                                  ptrdiff_t stride)
+{   /* exported wrapper; helper uses left bytes 4-7, rows 0-3 become 0x80 */
+    intra_predict_mad_cow_dc_0l0_8x8_msa(src, stride);
+}
+
+void ff_h264_intra_predict_plane_16x16_msa(uint8_t *src, ptrdiff_t stride)
+{   /* exported wrapper; stride is passed on to an int32_t parameter */
+    intra_predict_plane_16x16_msa(src, stride);
+}
+
+/* Vertical 8x8: the row directly above src is copied into all 8 rows. */
+void ff_h264_intra_pred_vert_8x8_msa(uint8_t *src, ptrdiff_t stride)
+{
+    /* source = row above, destination = src itself (in-place prediction) */
+    intra_predict_vert_8x8_msa(src - stride, src, stride);
+}
+
+/* Horizontal 8x8: each row is filled with the byte just left of it. */
+void ff_h264_intra_pred_horiz_8x8_msa(uint8_t *src, ptrdiff_t stride)
+{
+    /* the left column starts at src - 1 and shares the block's stride */
+    intra_predict_horiz_8x8_msa(src - 1, stride, src, stride);
+}
+
+/* 16x16 DC prediction using both the top row and the left column. */
+void ff_h264_intra_pred_dc_16x16_msa(uint8_t *src, ptrdiff_t stride)
+{
+    /* top row at src - stride, left column at src - 1, written in place;
+     * the last two arguments are is_above and is_left */
+    intra_predict_dc_16x16_msa(src - stride, src - 1, stride,
+                               src, stride, 1, 1);
+}
+
+/* Vertical 16x16: the row directly above src is copied into all 16 rows. */
+void ff_h264_intra_pred_vert_16x16_msa(uint8_t *src, ptrdiff_t stride)
+{
+    /* source = row above, destination = src itself (in-place prediction) */
+    intra_predict_vert_16x16_msa(src - stride, src, stride);
+}
+
+/* Horizontal 16x16: each row is filled with the byte just left of it. */
+void ff_h264_intra_pred_horiz_16x16_msa(uint8_t *src, ptrdiff_t stride)
+{
+    /* the left column starts at src - 1 and shares the block's stride */
+    intra_predict_horiz_16x16_msa(src - 1, stride, src, stride);
+}
+
+/* 16x16 DC from the left column only (is_above = 0, is_left = 1). */
+void ff_h264_intra_pred_dc_left_16x16_msa(uint8_t *src, ptrdiff_t stride)
+{
+    /* the top pointer is still formed and passed, but the helper does not
+     * read it when is_above is 0 */
+    intra_predict_dc_16x16_msa(src - stride, src - 1, stride,
+                               src, stride, 0, 1);
+}
+
+/* 16x16 DC from the top row only (is_above = 1, is_left = 0). */
+void ff_h264_intra_pred_dc_top_16x16_msa(uint8_t *src, ptrdiff_t stride)
+{
+    /* the left pointer is still formed and passed, but the helper does not
+     * read it when is_left is 0 */
+    intra_predict_dc_16x16_msa(src - stride, src - 1, stride,
+                               src, stride, 1, 0);
+}
+
+/* 8x8 DC with neither neighbour used: the helper fills the block with 128. */
+void ff_h264_intra_pred_dc_128_8x8_msa(uint8_t *src, ptrdiff_t stride)
+{
+    /* is_above = is_left = 0; the top and left pointers are still formed
+     * and passed, but the helper reads neither */
+    intra_predict_dc_8x8_msa(src - stride, src - 1, stride,
+                             src, stride, 0, 0);
+}
+
+/* 16x16 DC with neither neighbour used: the helper fills the block with 128. */
+void ff_h264_intra_pred_dc_128_16x16_msa(uint8_t *src, ptrdiff_t stride)
+{
+    /* is_above = is_left = 0; the top and left pointers are still formed
+     * and passed, but the helper reads neither */
+    intra_predict_dc_16x16_msa(src - stride, src - 1, stride,
+                               src, stride, 0, 0);
+}
+
+void ff_vp8_pred8x8_127_dc_8_msa(uint8_t *src, ptrdiff_t stride)
+{   /* fills the 8x8 block at src with the byte value 127 */
+    intra_predict_127dc_8x8_msa(src, stride);
+}
+
+void ff_vp8_pred8x8_129_dc_8_msa(uint8_t *src, ptrdiff_t stride)
+{   /* fills the 8x8 block at src with the byte value 129 */
+    intra_predict_129dc_8x8_msa(src, stride);
+}
+
+void ff_vp8_pred16x16_127_dc_8_msa(uint8_t *src, ptrdiff_t stride)
+{   /* fills the 16x16 block at src with the byte value 127 */
+    intra_predict_127dc_16x16_msa(src, stride);
+}
+
+void ff_vp8_pred16x16_129_dc_8_msa(uint8_t *src, ptrdiff_t stride)
+{   /* fills the 16x16 block at src with the byte value 129 */
+    intra_predict_129dc_16x16_msa(src, stride);
+}
diff --git a/libavcodec/mips/h264qpel_init_mips.c b/libavcodec/mips/h264qpel_init_mips.c
new file mode 100644
index 0000000..92219f8
--- /dev/null
+++ b/libavcodec/mips/h264qpel_init_mips.c
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *                    Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264dsp_mips.h"
+
+#if HAVE_MSA
+/* Fill c with the MSA qpel functions; does nothing unless bit_depth is 8. */
+static av_cold void h264qpel_init_msa(H264QpelContext *c, int bit_depth)
+{
+    if (bit_depth != 8) return;
+    c->put_h264_qpel_pixels_tab[0][0] = ff_put_h264_qpel16_mc00_msa;
+    c->put_h264_qpel_pixels_tab[0][1] = ff_put_h264_qpel16_mc10_msa;
+    c->put_h264_qpel_pixels_tab[0][2] = ff_put_h264_qpel16_mc20_msa;
+    c->put_h264_qpel_pixels_tab[0][3] = ff_put_h264_qpel16_mc30_msa;
+    c->put_h264_qpel_pixels_tab[0][4] = ff_put_h264_qpel16_mc01_msa;
+    c->put_h264_qpel_pixels_tab[0][5] = ff_put_h264_qpel16_mc11_msa;
+    c->put_h264_qpel_pixels_tab[0][6] = ff_put_h264_qpel16_mc21_msa;
+    c->put_h264_qpel_pixels_tab[0][7] = ff_put_h264_qpel16_mc31_msa;
+    c->put_h264_qpel_pixels_tab[0][8] = ff_put_h264_qpel16_mc02_msa;
+    c->put_h264_qpel_pixels_tab[0][9] = ff_put_h264_qpel16_mc12_msa;
+    c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_msa;
+    c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_msa;
+    c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_msa;
+    c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_msa;
+    c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_msa;
+    c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_msa;
+    /* put table, row 1: the qpel8 set */
+    c->put_h264_qpel_pixels_tab[1][0] = ff_put_h264_qpel8_mc00_msa;
+    c->put_h264_qpel_pixels_tab[1][1] = ff_put_h264_qpel8_mc10_msa;
+    c->put_h264_qpel_pixels_tab[1][2] = ff_put_h264_qpel8_mc20_msa;
+    c->put_h264_qpel_pixels_tab[1][3] = ff_put_h264_qpel8_mc30_msa;
+    c->put_h264_qpel_pixels_tab[1][4] = ff_put_h264_qpel8_mc01_msa;
+    c->put_h264_qpel_pixels_tab[1][5] = ff_put_h264_qpel8_mc11_msa;
+    c->put_h264_qpel_pixels_tab[1][6] = ff_put_h264_qpel8_mc21_msa;
+    c->put_h264_qpel_pixels_tab[1][7] = ff_put_h264_qpel8_mc31_msa;
+    c->put_h264_qpel_pixels_tab[1][8] = ff_put_h264_qpel8_mc02_msa;
+    c->put_h264_qpel_pixels_tab[1][9] = ff_put_h264_qpel8_mc12_msa;
+    c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_msa;
+    c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_msa;
+    c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_msa;
+    c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_msa;
+    c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_msa;
+    c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_msa;
+    /* put table, row 2: the qpel4 set; entry [2][0] is not assigned here */
+    c->put_h264_qpel_pixels_tab[2][1] = ff_put_h264_qpel4_mc10_msa;
+    c->put_h264_qpel_pixels_tab[2][2] = ff_put_h264_qpel4_mc20_msa;
+    c->put_h264_qpel_pixels_tab[2][3] = ff_put_h264_qpel4_mc30_msa;
+    c->put_h264_qpel_pixels_tab[2][4] = ff_put_h264_qpel4_mc01_msa;
+    c->put_h264_qpel_pixels_tab[2][5] = ff_put_h264_qpel4_mc11_msa;
+    c->put_h264_qpel_pixels_tab[2][6] = ff_put_h264_qpel4_mc21_msa;
+    c->put_h264_qpel_pixels_tab[2][7] = ff_put_h264_qpel4_mc31_msa;
+    c->put_h264_qpel_pixels_tab[2][8] = ff_put_h264_qpel4_mc02_msa;
+    c->put_h264_qpel_pixels_tab[2][9] = ff_put_h264_qpel4_mc12_msa;
+    c->put_h264_qpel_pixels_tab[2][10] = ff_put_h264_qpel4_mc22_msa;
+    c->put_h264_qpel_pixels_tab[2][11] = ff_put_h264_qpel4_mc32_msa;
+    c->put_h264_qpel_pixels_tab[2][12] = ff_put_h264_qpel4_mc03_msa;
+    c->put_h264_qpel_pixels_tab[2][13] = ff_put_h264_qpel4_mc13_msa;
+    c->put_h264_qpel_pixels_tab[2][14] = ff_put_h264_qpel4_mc23_msa;
+    c->put_h264_qpel_pixels_tab[2][15] = ff_put_h264_qpel4_mc33_msa;
+    /* avg table, row 0: the qpel16 set */
+    c->avg_h264_qpel_pixels_tab[0][0] = ff_avg_h264_qpel16_mc00_msa;
+    c->avg_h264_qpel_pixels_tab[0][1] = ff_avg_h264_qpel16_mc10_msa;
+    c->avg_h264_qpel_pixels_tab[0][2] = ff_avg_h264_qpel16_mc20_msa;
+    c->avg_h264_qpel_pixels_tab[0][3] = ff_avg_h264_qpel16_mc30_msa;
+    c->avg_h264_qpel_pixels_tab[0][4] = ff_avg_h264_qpel16_mc01_msa;
+    c->avg_h264_qpel_pixels_tab[0][5] = ff_avg_h264_qpel16_mc11_msa;
+    c->avg_h264_qpel_pixels_tab[0][6] = ff_avg_h264_qpel16_mc21_msa;
+    c->avg_h264_qpel_pixels_tab[0][7] = ff_avg_h264_qpel16_mc31_msa;
+    c->avg_h264_qpel_pixels_tab[0][8] = ff_avg_h264_qpel16_mc02_msa;
+    c->avg_h264_qpel_pixels_tab[0][9] = ff_avg_h264_qpel16_mc12_msa;
+    c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_msa;
+    c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_msa;
+    c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_msa;
+    c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_msa;
+    c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_msa;
+    c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_msa;
+    /* avg table, row 1: the qpel8 set */
+    c->avg_h264_qpel_pixels_tab[1][0] = ff_avg_h264_qpel8_mc00_msa;
+    c->avg_h264_qpel_pixels_tab[1][1] = ff_avg_h264_qpel8_mc10_msa;
+    c->avg_h264_qpel_pixels_tab[1][2] = ff_avg_h264_qpel8_mc20_msa;
+    c->avg_h264_qpel_pixels_tab[1][3] = ff_avg_h264_qpel8_mc30_msa;
+    c->avg_h264_qpel_pixels_tab[1][4] = ff_avg_h264_qpel8_mc01_msa;
+    c->avg_h264_qpel_pixels_tab[1][5] = ff_avg_h264_qpel8_mc11_msa;
+    c->avg_h264_qpel_pixels_tab[1][6] = ff_avg_h264_qpel8_mc21_msa;
+    c->avg_h264_qpel_pixels_tab[1][7] = ff_avg_h264_qpel8_mc31_msa;
+    c->avg_h264_qpel_pixels_tab[1][8] = ff_avg_h264_qpel8_mc02_msa;
+    c->avg_h264_qpel_pixels_tab[1][9] = ff_avg_h264_qpel8_mc12_msa;
+    c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_msa;
+    c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_msa;
+    c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_msa;
+    c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_msa;
+    c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_msa;
+    c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_msa;
+    /* avg table, row 2: the qpel4 set */
+    c->avg_h264_qpel_pixels_tab[2][0] = ff_avg_h264_qpel4_mc00_msa;
+    c->avg_h264_qpel_pixels_tab[2][1] = ff_avg_h264_qpel4_mc10_msa;
+    c->avg_h264_qpel_pixels_tab[2][2] = ff_avg_h264_qpel4_mc20_msa;
+    c->avg_h264_qpel_pixels_tab[2][3] = ff_avg_h264_qpel4_mc30_msa;
+    c->avg_h264_qpel_pixels_tab[2][4] = ff_avg_h264_qpel4_mc01_msa;
+    c->avg_h264_qpel_pixels_tab[2][5] = ff_avg_h264_qpel4_mc11_msa;
+    c->avg_h264_qpel_pixels_tab[2][6] = ff_avg_h264_qpel4_mc21_msa;
+    c->avg_h264_qpel_pixels_tab[2][7] = ff_avg_h264_qpel4_mc31_msa;
+    c->avg_h264_qpel_pixels_tab[2][8] = ff_avg_h264_qpel4_mc02_msa;
+    c->avg_h264_qpel_pixels_tab[2][9] = ff_avg_h264_qpel4_mc12_msa;
+    c->avg_h264_qpel_pixels_tab[2][10] = ff_avg_h264_qpel4_mc22_msa;
+    c->avg_h264_qpel_pixels_tab[2][11] = ff_avg_h264_qpel4_mc32_msa;
+    c->avg_h264_qpel_pixels_tab[2][12] = ff_avg_h264_qpel4_mc03_msa;
+    c->avg_h264_qpel_pixels_tab[2][13] = ff_avg_h264_qpel4_mc13_msa;
+    c->avg_h264_qpel_pixels_tab[2][14] = ff_avg_h264_qpel4_mc23_msa;
+    c->avg_h264_qpel_pixels_tab[2][15] = ff_avg_h264_qpel4_mc33_msa;
+}
+#endif  // #if HAVE_MSA
+
+#if HAVE_MMI
+static av_cold void h264qpel_init_mmi(H264QpelContext *c, int bit_depth)
+{   /* Fill both qpel function tables of c with the MMI routines; a bit_depth other than 8 leaves c untouched. */
+    if (8 == bit_depth) { /* rows 0/1/2 take the qpel16/qpel8/qpel4 variants; entry X + 4*Y takes mcXY */
+        c->put_h264_qpel_pixels_tab[0][0] = ff_put_h264_qpel16_mc00_mmi;
+        c->put_h264_qpel_pixels_tab[0][1] = ff_put_h264_qpel16_mc10_mmi;
+        c->put_h264_qpel_pixels_tab[0][2] = ff_put_h264_qpel16_mc20_mmi;
+        c->put_h264_qpel_pixels_tab[0][3] = ff_put_h264_qpel16_mc30_mmi;
+        c->put_h264_qpel_pixels_tab[0][4] = ff_put_h264_qpel16_mc01_mmi;
+        c->put_h264_qpel_pixels_tab[0][5] = ff_put_h264_qpel16_mc11_mmi;
+        c->put_h264_qpel_pixels_tab[0][6] = ff_put_h264_qpel16_mc21_mmi;
+        c->put_h264_qpel_pixels_tab[0][7] = ff_put_h264_qpel16_mc31_mmi;
+        c->put_h264_qpel_pixels_tab[0][8] = ff_put_h264_qpel16_mc02_mmi;
+        c->put_h264_qpel_pixels_tab[0][9] = ff_put_h264_qpel16_mc12_mmi;
+        c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_mmi;
+        c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_mmi;
+        c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_mmi;
+        c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_mmi;
+        c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_mmi;
+        c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_mmi;
+        /* put table, row 1: qpel8 variants */
+        c->put_h264_qpel_pixels_tab[1][0] = ff_put_h264_qpel8_mc00_mmi;
+        c->put_h264_qpel_pixels_tab[1][1] = ff_put_h264_qpel8_mc10_mmi;
+        c->put_h264_qpel_pixels_tab[1][2] = ff_put_h264_qpel8_mc20_mmi;
+        c->put_h264_qpel_pixels_tab[1][3] = ff_put_h264_qpel8_mc30_mmi;
+        c->put_h264_qpel_pixels_tab[1][4] = ff_put_h264_qpel8_mc01_mmi;
+        c->put_h264_qpel_pixels_tab[1][5] = ff_put_h264_qpel8_mc11_mmi;
+        c->put_h264_qpel_pixels_tab[1][6] = ff_put_h264_qpel8_mc21_mmi;
+        c->put_h264_qpel_pixels_tab[1][7] = ff_put_h264_qpel8_mc31_mmi;
+        c->put_h264_qpel_pixels_tab[1][8] = ff_put_h264_qpel8_mc02_mmi;
+        c->put_h264_qpel_pixels_tab[1][9] = ff_put_h264_qpel8_mc12_mmi;
+        c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_mmi;
+        c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_mmi;
+        c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_mmi;
+        c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_mmi;
+        c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_mmi;
+        c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_mmi;
+        /* put table, row 2: qpel4 variants */
+        c->put_h264_qpel_pixels_tab[2][0] = ff_put_h264_qpel4_mc00_mmi;
+        c->put_h264_qpel_pixels_tab[2][1] = ff_put_h264_qpel4_mc10_mmi;
+        c->put_h264_qpel_pixels_tab[2][2] = ff_put_h264_qpel4_mc20_mmi;
+        c->put_h264_qpel_pixels_tab[2][3] = ff_put_h264_qpel4_mc30_mmi;
+        c->put_h264_qpel_pixels_tab[2][4] = ff_put_h264_qpel4_mc01_mmi;
+        c->put_h264_qpel_pixels_tab[2][5] = ff_put_h264_qpel4_mc11_mmi;
+        c->put_h264_qpel_pixels_tab[2][6] = ff_put_h264_qpel4_mc21_mmi;
+        c->put_h264_qpel_pixels_tab[2][7] = ff_put_h264_qpel4_mc31_mmi;
+        c->put_h264_qpel_pixels_tab[2][8] = ff_put_h264_qpel4_mc02_mmi;
+        c->put_h264_qpel_pixels_tab[2][9] = ff_put_h264_qpel4_mc12_mmi;
+        c->put_h264_qpel_pixels_tab[2][10] = ff_put_h264_qpel4_mc22_mmi;
+        c->put_h264_qpel_pixels_tab[2][11] = ff_put_h264_qpel4_mc32_mmi;
+        c->put_h264_qpel_pixels_tab[2][12] = ff_put_h264_qpel4_mc03_mmi;
+        c->put_h264_qpel_pixels_tab[2][13] = ff_put_h264_qpel4_mc13_mmi;
+        c->put_h264_qpel_pixels_tab[2][14] = ff_put_h264_qpel4_mc23_mmi;
+        c->put_h264_qpel_pixels_tab[2][15] = ff_put_h264_qpel4_mc33_mmi;
+        /* avg table, row 0: qpel16 variants */
+        c->avg_h264_qpel_pixels_tab[0][0] = ff_avg_h264_qpel16_mc00_mmi;
+        c->avg_h264_qpel_pixels_tab[0][1] = ff_avg_h264_qpel16_mc10_mmi;
+        c->avg_h264_qpel_pixels_tab[0][2] = ff_avg_h264_qpel16_mc20_mmi;
+        c->avg_h264_qpel_pixels_tab[0][3] = ff_avg_h264_qpel16_mc30_mmi;
+        c->avg_h264_qpel_pixels_tab[0][4] = ff_avg_h264_qpel16_mc01_mmi;
+        c->avg_h264_qpel_pixels_tab[0][5] = ff_avg_h264_qpel16_mc11_mmi;
+        c->avg_h264_qpel_pixels_tab[0][6] = ff_avg_h264_qpel16_mc21_mmi;
+        c->avg_h264_qpel_pixels_tab[0][7] = ff_avg_h264_qpel16_mc31_mmi;
+        c->avg_h264_qpel_pixels_tab[0][8] = ff_avg_h264_qpel16_mc02_mmi;
+        c->avg_h264_qpel_pixels_tab[0][9] = ff_avg_h264_qpel16_mc12_mmi;
+        c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_mmi;
+        c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_mmi;
+        c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_mmi;
+        c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_mmi;
+        c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_mmi;
+        c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_mmi;
+        /* avg table, row 1: qpel8 variants */
+        c->avg_h264_qpel_pixels_tab[1][0] = ff_avg_h264_qpel8_mc00_mmi;
+        c->avg_h264_qpel_pixels_tab[1][1] = ff_avg_h264_qpel8_mc10_mmi;
+        c->avg_h264_qpel_pixels_tab[1][2] = ff_avg_h264_qpel8_mc20_mmi;
+        c->avg_h264_qpel_pixels_tab[1][3] = ff_avg_h264_qpel8_mc30_mmi;
+        c->avg_h264_qpel_pixels_tab[1][4] = ff_avg_h264_qpel8_mc01_mmi;
+        c->avg_h264_qpel_pixels_tab[1][5] = ff_avg_h264_qpel8_mc11_mmi;
+        c->avg_h264_qpel_pixels_tab[1][6] = ff_avg_h264_qpel8_mc21_mmi;
+        c->avg_h264_qpel_pixels_tab[1][7] = ff_avg_h264_qpel8_mc31_mmi;
+        c->avg_h264_qpel_pixels_tab[1][8] = ff_avg_h264_qpel8_mc02_mmi;
+        c->avg_h264_qpel_pixels_tab[1][9] = ff_avg_h264_qpel8_mc12_mmi;
+        c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_mmi;
+        c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_mmi;
+        c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_mmi;
+        c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_mmi;
+        c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_mmi;
+        c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_mmi;
+        /* avg table, row 2: qpel4 variants */
+        c->avg_h264_qpel_pixels_tab[2][0] = ff_avg_h264_qpel4_mc00_mmi;
+        c->avg_h264_qpel_pixels_tab[2][1] = ff_avg_h264_qpel4_mc10_mmi;
+        c->avg_h264_qpel_pixels_tab[2][2] = ff_avg_h264_qpel4_mc20_mmi;
+        c->avg_h264_qpel_pixels_tab[2][3] = ff_avg_h264_qpel4_mc30_mmi;
+        c->avg_h264_qpel_pixels_tab[2][4] = ff_avg_h264_qpel4_mc01_mmi;
+        c->avg_h264_qpel_pixels_tab[2][5] = ff_avg_h264_qpel4_mc11_mmi;
+        c->avg_h264_qpel_pixels_tab[2][6] = ff_avg_h264_qpel4_mc21_mmi;
+        c->avg_h264_qpel_pixels_tab[2][7] = ff_avg_h264_qpel4_mc31_mmi;
+        c->avg_h264_qpel_pixels_tab[2][8] = ff_avg_h264_qpel4_mc02_mmi;
+        c->avg_h264_qpel_pixels_tab[2][9] = ff_avg_h264_qpel4_mc12_mmi;
+        c->avg_h264_qpel_pixels_tab[2][10] = ff_avg_h264_qpel4_mc22_mmi;
+        c->avg_h264_qpel_pixels_tab[2][11] = ff_avg_h264_qpel4_mc32_mmi;
+        c->avg_h264_qpel_pixels_tab[2][12] = ff_avg_h264_qpel4_mc03_mmi;
+        c->avg_h264_qpel_pixels_tab[2][13] = ff_avg_h264_qpel4_mc13_mmi;
+        c->avg_h264_qpel_pixels_tab[2][14] = ff_avg_h264_qpel4_mc23_mmi;
+        c->avg_h264_qpel_pixels_tab[2][15] = ff_avg_h264_qpel4_mc33_mmi;
+    }
+}
+#endif /* HAVE_MMI */
+
+av_cold void ff_h264qpel_init_mips(H264QpelContext *c, int bit_depth)
+{   /* Run each MIPS initializer that is compiled in; MMI runs after MSA, so entries set by both end up MMI. */
+#if HAVE_MSA
+    h264qpel_init_msa(c, bit_depth);
+#endif  // #if HAVE_MSA
+#if HAVE_MMI
+    h264qpel_init_mmi(c, bit_depth);
+#endif /* HAVE_MMI */
+}
diff --git a/libavcodec/mips/h264qpel_mmi.c b/libavcodec/mips/h264qpel_mmi.c
new file mode 100644
index 0000000..b4e83e4
--- /dev/null
+++ b/libavcodec/mips/h264qpel_mmi.c
@@ -0,0 +1,3263 @@
+/*
+ * Loongson SIMD optimized h264qpel
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264dsp_mips.h"
+#include "hpeldsp_mips.h"
+#include "libavcodec/bit_depth_template.c"
+#include "libavutil/mips/asmdefs.h"
+
+static inline void copy_block4_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride, int h)
+{   /* Copy h rows of 4 bytes from src to dst; h must be nonzero on entry (it is tested only after a row is copied). */
+    double ftmp[1];
+    uint64_t low32;
+    /* Load with the 4-byte ulw rather than the 8-byte uld: only 4 bytes per row are used, so reading 8 over-reads src. */
+    __asm__ volatile (
+        "1:                                                             \n\t"
+        "ulw        %[low32],   0x00(%[src])                            \n\t" /* unaligned 32-bit load */
+        "mtc1       %[low32],   %[ftmp0]                                \n\t"
+        "gsswlc1    %[ftmp0],   0x03(%[dst])                            \n\t" /* left/right pair: unaligned 32-bit store */
+        "gsswrc1    %[ftmp0],   0x00(%[dst])                            \n\t"
+        "addi       %[h],       %[h],           -0x01                   \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t" /* strides are added to the raw addresses */
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),
+          [dst]"+&r"(dst),                  [src]"+&r"(src),
+          [h]"+&r"(h),
+          [low32]"=&r"(low32)
+        : [dstStride]"r"((mips_reg)dstStride),
+          [srcStride]"r"((mips_reg)srcStride)
+        : "memory"
+    );
+}
+
+static inline void copy_block8_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride, int h)
+{   /* Copy h rows of 8 bytes from src to dst; h must be nonzero on entry (it is tested only after a row is copied). */
+    double ftmp[1];
+    /* Each row travels through one FP register using left/right (unaligned) 64-bit load and store pairs. */
+    __asm__ volatile (
+        "1:                                                             \n\t"
+        "gsldlc1    %[ftmp0],   0x07(%[src])                            \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[src])                            \n\t"
+        "gssdlc1    %[ftmp0],   0x07(%[dst])                            \n\t"
+        "gssdrc1    %[ftmp0],   0x00(%[dst])                            \n\t"
+        "addi       %[h],       %[h],           -0x01                   \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t" /* strides are added to the raw addresses */
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),
+          [dst]"+&r"(dst),                  [src]"+&r"(src),
+          [h]"+&r"(h)
+        : [dstStride]"r"((mips_reg)dstStride),
+          [srcStride]"r"((mips_reg)srcStride)
+        : "memory"
+    );
+}
+
+static inline void copy_block16_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride, int h)
+{   /* Copy h rows of 16 bytes from src to dst; h must be nonzero on entry (it is tested only after a row is copied). */
+    double ftmp[1];
+    uint64_t tmp[1];
+    /* Bytes 0-7 of a row go through an FP register, bytes 8-15 through a general register; all accesses are left/right pairs. */
+    __asm__ volatile (
+        "1:                                                             \n\t"
+        "gsldlc1    %[ftmp0],   0x07(%[src])                            \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[src])                            \n\t"
+        "ldl        %[tmp0],    0x0f(%[src])                            \n\t"
+        "ldr        %[tmp0],    0x08(%[src])                            \n\t"
+        "gssdlc1    %[ftmp0],   0x07(%[dst])                            \n\t"
+        "gssdrc1    %[ftmp0],   0x00(%[dst])                            \n\t"
+        "sdl        %[tmp0],    0x0f(%[dst])                            \n\t"
+        "sdr        %[tmp0],    0x08(%[dst])                            \n\t"
+        "addi       %[h],       %[h],           -0x01                   \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t" /* strides are added to the raw addresses */
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),
+          [tmp0]"=&r"(tmp[0]),
+          [dst]"+&r"(dst),                  [src]"+&r"(src),
+          [h]"+&r"(h)
+        : [dstStride]"r"((mips_reg)dstStride),
+          [srcStride]"r"((mips_reg)srcStride)
+        : "memory"
+    );
+}
+
+#define op2_avg(a, b)  a = (((a)+CLIP(((b) + 512)>>10)+1)>>1) /* a = (a + CLIP((b + 512) >> 10) + 1) >> 1, i.e. the op2_put value averaged into a */
+#define op2_put(a, b)  a = CLIP(((b) + 512)>>10) /* a = CLIP((b + 512) >> 10); CLIP is defined outside this section */
+static void put_h264_qpel4_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{   /* Horizontal 6-tap filter over 4 rows of 4 pixels; each row reads src[-2..6] and stores 4 bytes to dst. */
+    double ftmp[10];
+    uint64_t tmp[1];
+    uint64_t low32;
+    /* Taps are fetched with the 4-byte ulw: only 4 pixels per tap are used, so the 8-byte uld read past src[6]. */
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t" /* ftmp0 = 0, used to widen bytes */
+        "dli        %[tmp0],    0x04                                    \n\t" /* row counter */
+        "1:                                                             \n\t"
+        "ulw        %[low32],   -0x02(%[src])                           \n\t" /* ftmp1..ftmp6 = pixels at src-2 .. src+3 */
+        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        "ulw        %[low32],   -0x01(%[src])                           \n\t"
+        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        "ulw        %[low32],   0x00(%[src])                            \n\t"
+        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        "ulw        %[low32],   0x01(%[src])                            \n\t"
+        "mtc1       %[low32],   %[ftmp4]                                \n\t"
+        "ulw        %[low32],   0x02(%[src])                            \n\t"
+        "mtc1       %[low32],   %[ftmp5]                                \n\t"
+        "ulw        %[low32],   0x03(%[src])                            \n\t"
+        "mtc1       %[low32],   %[ftmp6]                                \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t" /* widen the low 4 bytes to 16-bit lanes */
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t" /* s[0] + s[1] */
+        "paddsh     %[ftmp8],   %[ftmp2],       %[ftmp5]                \n\t" /* s[-1] + s[2] */
+        "paddsh     %[ftmp9],   %[ftmp1],       %[ftmp6]                \n\t" /* s[-2] + s[3] */
+        "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_20]             \n\t" /* assumes ff_pw_N holds N in each 16-bit lane - TODO confirm */
+        "pmullh     %[ftmp8],   %[ftmp8],       %[ff_pw_5]              \n\t"
+        "psubsh     %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "paddsh     %[ftmp9],   %[ftmp7],       %[ftmp9]                \n\t"
+        "paddsh     %[ftmp9],   %[ftmp9],       %[ff_pw_16]             \n\t" /* rounding term added before the shift */
+        "psrah      %[ftmp9],   %[ftmp9],       %[ff_pw_5]              \n\t" /* shift count is taken from the ff_pw_5 register */
+        "packushb   %[ftmp9],   %[ftmp9],       %[ftmp0]                \n\t" /* back to bytes with unsigned saturation */
+        "gsswlc1    %[ftmp9],   0x03(%[dst])                            \n\t" /* left/right pair: unaligned 32-bit store */
+        "gsswrc1    %[ftmp9],   0x00(%[dst])                            \n\t"
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "bnez       %[tmp0],    1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [tmp0]"=&r"(tmp[0]),
+          [dst]"+&r"(dst),                  [src]"+&r"(src),
+          [low32]"=&r"(low32)
+        : [dstStride]"r"((mips_reg)dstStride),
+          [srcStride]"r"((mips_reg)srcStride),
+          [ff_pw_20]"f"(ff_pw_20),          [ff_pw_5]"f"(ff_pw_5),
+          [ff_pw_16]"f"(ff_pw_16)
+        : "memory"
+    );
+}
+
+static void put_h264_qpel8_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{   /* Horizontal 6-tap filter over 8 rows of 8 pixels; each row reads src[-2..10] and stores 8 bytes to dst. */
+    double ftmp[11];
+    uint64_t tmp[1];
+    /* Each 8-byte tap is split into low/high halves of four 16-bit lanes, filtered separately and re-packed. */
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t" /* ftmp0 = 0, used to widen bytes */
+        "dli        %[tmp0],    0x08                                    \n\t" /* row counter */
+        "1:                                                             \n\t"
+        "gsldlc1    %[ftmp1],   0x05(%[src])                            \n\t" /* ftmp1..ftmp6 = 8 bytes at src-2 .. src+3 */
+        "gsldrc1    %[ftmp1],   -0x02(%[src])                           \n\t"
+        "gsldlc1    %[ftmp2],   0x06(%[src])                            \n\t"
+        "gsldrc1    %[ftmp2],   -0x01(%[src])                           \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[src])                            \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[src])                            \n\t"
+        "gsldlc1    %[ftmp4],   0x08(%[src])                            \n\t"
+        "gsldrc1    %[ftmp4],   0x01(%[src])                            \n\t"
+        "gsldlc1    %[ftmp5],   0x09(%[src])                            \n\t"
+        "gsldrc1    %[ftmp5],   0x02(%[src])                            \n\t"
+        "gsldlc1    %[ftmp6],   0x0a(%[src])                            \n\t"
+        "gsldrc1    %[ftmp6],   0x03(%[src])                            \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp8],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp9],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp10],  %[ftmp4],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp3],   %[ftmp7],       %[ftmp9]                \n\t" /* ftmp3/ftmp4 = s[0] + s[1], low/high half */
+        "paddsh     %[ftmp4],   %[ftmp8],       %[ftmp10]               \n\t"
+        "pmullh     %[ftmp3],   %[ftmp3],       %[ff_pw_20]             \n\t" /* assumes ff_pw_N holds N in each 16-bit lane - TODO confirm */
+        "pmullh     %[ftmp4],   %[ftmp4],       %[ff_pw_20]             \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp8],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp9],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp10],  %[ftmp5],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp2],   %[ftmp7],       %[ftmp9]                \n\t" /* ftmp2/ftmp5 = s[-1] + s[2], low/high half */
+        "paddsh     %[ftmp5],   %[ftmp8],       %[ftmp10]               \n\t"
+        "pmullh     %[ftmp2],   %[ftmp2],       %[ff_pw_5]              \n\t"
+        "pmullh     %[ftmp5],   %[ftmp5],       %[ff_pw_5]              \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp8],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp9],   %[ftmp6],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp10],  %[ftmp6],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp1],   %[ftmp7],       %[ftmp9]                \n\t" /* ftmp1/ftmp6 = s[-2] + s[3], low/high half */
+        "paddsh     %[ftmp6],   %[ftmp8],       %[ftmp10]               \n\t"
+        "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "psubsh     %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_16]             \n\t" /* rounding term added before the shift */
+        "paddsh     %[ftmp4],   %[ftmp4],       %[ff_pw_16]             \n\t"
+        "psrah      %[ftmp3],   %[ftmp3],       %[ff_pw_5]              \n\t" /* shift count is taken from the ff_pw_5 register */
+        "psrah      %[ftmp4],   %[ftmp4],       %[ff_pw_5]              \n\t"
+        "packushb   %[ftmp9],   %[ftmp3],       %[ftmp4]                \n\t" /* both halves back to 8 bytes, unsigned saturation */
+        "gssdlc1    %[ftmp9],   0x07(%[dst])                            \n\t" /* left/right pair: unaligned 64-bit store */
+        "gssdrc1    %[ftmp9],   0x00(%[dst])                            \n\t"
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        "bnez       %[tmp0],    1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),
+          [tmp0]"=&r"(tmp[0]),
+          [dst]"+&r"(dst),                  [src]"+&r"(src)
+        : [dstStride]"r"((mips_reg)dstStride),
+          [srcStride]"r"((mips_reg)srcStride),
+          [ff_pw_20]"f"(ff_pw_20),          [ff_pw_5]"f"(ff_pw_5),
+          [ff_pw_16]"f"(ff_pw_16)
+        : "memory"
+    );
+}
+
+static void put_h264_qpel16_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{   /* Filter a 16x16 area as four 8x8 quadrants: top-left, top-right, bottom-left, bottom-right. */
+    uint8_t *dst_lower = dst + 8 * dstStride;
+    const uint8_t *src_lower = src + 8 * srcStride;
+    put_h264_qpel8_h_lowpass_mmi(dst, src, dstStride, srcStride);
+    put_h264_qpel8_h_lowpass_mmi(dst + 8, src + 8, dstStride, srcStride);
+    put_h264_qpel8_h_lowpass_mmi(dst_lower, src_lower, dstStride, srcStride);
+    put_h264_qpel8_h_lowpass_mmi(dst_lower + 8, src_lower + 8, dstStride, srcStride);
+}
+
+static void avg_h264_qpel4_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{   /* Horizontal 6-tap filter over 4 rows of 4 pixels, averaged into dst; each row reads src[-2..6]. */
+    double ftmp[11];
+    uint64_t tmp[1];
+    uint64_t low32;
+    /* Taps are fetched with the 4-byte ulw: only 4 pixels per tap are used, so the 8-byte uld read past src[6]. */
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t" /* ftmp0 = 0, used to widen bytes */
+        "dli        %[tmp0],    0x04                                    \n\t" /* row counter */
+        "1:                                                             \n\t"
+        "ulw        %[low32],   -0x02(%[src])                           \n\t" /* ftmp1..ftmp6 = pixels at src-2 .. src+3 */
+        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        "ulw        %[low32],   -0x01(%[src])                           \n\t"
+        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        "ulw        %[low32],   0x00(%[src])                            \n\t"
+        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        "ulw        %[low32],   0x01(%[src])                            \n\t"
+        "mtc1       %[low32],   %[ftmp4]                                \n\t"
+        "ulw        %[low32],   0x02(%[src])                            \n\t"
+        "mtc1       %[low32],   %[ftmp5]                                \n\t"
+        "ulw        %[low32],   0x03(%[src])                            \n\t"
+        "mtc1       %[low32],   %[ftmp6]                                \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t" /* widen the low 4 bytes to 16-bit lanes */
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t" /* s[0] + s[1] */
+        "paddsh     %[ftmp8],   %[ftmp2],       %[ftmp5]                \n\t" /* s[-1] + s[2] */
+        "paddsh     %[ftmp9],   %[ftmp1],       %[ftmp6]                \n\t" /* s[-2] + s[3] */
+        "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_20]             \n\t" /* assumes ff_pw_N holds N in each 16-bit lane - TODO confirm */
+        "pmullh     %[ftmp8],   %[ftmp8],       %[ff_pw_5]              \n\t"
+        "psubsh     %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "paddsh     %[ftmp9],   %[ftmp7],       %[ftmp9]                \n\t"
+        "paddsh     %[ftmp9],   %[ftmp9],       %[ff_pw_16]             \n\t" /* rounding term added before the shift */
+        "psrah      %[ftmp9],   %[ftmp9],       %[ff_pw_5]              \n\t" /* shift count is taken from the ff_pw_5 register */
+        "packushb   %[ftmp9],   %[ftmp9],       %[ftmp0]                \n\t" /* back to bytes with unsigned saturation */
+        "lwc1       %[ftmp10],  0x00(%[dst])                            \n\t" /* plain (aligned) load of the 4 bytes already in dst */
+        "pavgb      %[ftmp9],   %[ftmp9],       %[ftmp10]               \n\t" /* byte-wise average of filter output and dst */
+        "gsswlc1    %[ftmp9],   0x03(%[dst])                            \n\t"
+        "gsswrc1    %[ftmp9],   0x00(%[dst])                            \n\t"
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        "bnez       %[tmp0],    1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),
+          [tmp0]"=&r"(tmp[0]),
+          [dst]"+&r"(dst),                  [src]"+&r"(src),
+          [low32]"=&r"(low32)
+        : [dstStride]"r"((mips_reg)dstStride),
+          [srcStride]"r"((mips_reg)srcStride),
+          [ff_pw_20]"f"(ff_pw_20),          [ff_pw_5]"f"(ff_pw_5),
+          [ff_pw_16]"f"(ff_pw_16)
+        : "memory"
+    );
+}
+
+static void avg_h264_qpel8_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{   /* Horizontal 6-tap filter over 8 rows of 8 pixels, averaged into dst; each row reads src[-2..10]. */
+    double ftmp[11];
+    uint64_t tmp[1];
+    /* Each 8-byte tap is split into low/high halves of four 16-bit lanes, filtered separately and re-packed. */
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t" /* ftmp0 = 0, used to widen bytes */
+        "dli        %[tmp0],    0x08                                    \n\t" /* row counter */
+        "1:                                                             \n\t"
+        "gsldlc1    %[ftmp1],   0x05(%[src])                            \n\t" /* ftmp1..ftmp6 = 8 bytes at src-2 .. src+3 */
+        "gsldrc1    %[ftmp1],   -0x02(%[src])                           \n\t"
+        "gsldlc1    %[ftmp2],   0x06(%[src])                            \n\t"
+        "gsldrc1    %[ftmp2],   -0x01(%[src])                           \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[src])                            \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[src])                            \n\t"
+        "gsldlc1    %[ftmp4],   0x08(%[src])                            \n\t"
+        "gsldrc1    %[ftmp4],   0x01(%[src])                            \n\t"
+        "gsldlc1    %[ftmp5],   0x09(%[src])                            \n\t"
+        "gsldrc1    %[ftmp5],   0x02(%[src])                            \n\t"
+        "gsldlc1    %[ftmp6],   0x0a(%[src])                            \n\t"
+        "gsldrc1    %[ftmp6],   0x03(%[src])                            \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp8],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp9],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp10],  %[ftmp4],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp3],   %[ftmp7],       %[ftmp9]                \n\t" /* ftmp3/ftmp4 = s[0] + s[1], low/high half */
+        "paddsh     %[ftmp4],   %[ftmp8],       %[ftmp10]               \n\t"
+        "pmullh     %[ftmp3],   %[ftmp3],       %[ff_pw_20]             \n\t" /* assumes ff_pw_N holds N in each 16-bit lane - TODO confirm */
+        "pmullh     %[ftmp4],   %[ftmp4],       %[ff_pw_20]             \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp8],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp9],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp10],  %[ftmp5],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp2],   %[ftmp7],       %[ftmp9]                \n\t" /* ftmp2/ftmp5 = s[-1] + s[2], low/high half */
+        "paddsh     %[ftmp5],   %[ftmp8],       %[ftmp10]               \n\t"
+        "pmullh     %[ftmp2],   %[ftmp2],       %[ff_pw_5]              \n\t"
+        "pmullh     %[ftmp5],   %[ftmp5],       %[ff_pw_5]              \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp8],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp9],   %[ftmp6],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp10],  %[ftmp6],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp1],   %[ftmp7],       %[ftmp9]                \n\t" /* ftmp1/ftmp6 = s[-2] + s[3], low/high half */
+        "paddsh     %[ftmp6],   %[ftmp8],       %[ftmp10]               \n\t"
+        "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "psubsh     %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_16]             \n\t" /* rounding term added before the shift */
+        "paddsh     %[ftmp4],   %[ftmp4],       %[ff_pw_16]             \n\t"
+        "psrah      %[ftmp3],   %[ftmp3],       %[ff_pw_5]              \n\t" /* shift count is taken from the ff_pw_5 register */
+        "psrah      %[ftmp4],   %[ftmp4],       %[ff_pw_5]              \n\t"
+        "packushb   %[ftmp9],   %[ftmp3],       %[ftmp4]                \n\t" /* both halves back to 8 bytes, unsigned saturation */
+        "ldc1       %[ftmp10],  0x00(%[dst])                            \n\t" /* plain ldc1/sdc1 on dst, unlike the left/right pairs used on src */
+        "pavgb      %[ftmp9],   %[ftmp9],       %[ftmp10]               \n\t" /* byte-wise average of filter output and dst */
+        "sdc1       %[ftmp9],   0x00(%[dst])                            \n\t"
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        "bnez       %[tmp0],    1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),
+          [tmp0]"=&r"(tmp[0]),
+          [dst]"+&r"(dst),                  [src]"+&r"(src)
+        : [dstStride]"r"((mips_reg)dstStride),
+          [srcStride]"r"((mips_reg)srcStride),
+          [ff_pw_20]"f"(ff_pw_20),          [ff_pw_5]"f"(ff_pw_5),
+          [ff_pw_16]"f"(ff_pw_16)
+        : "memory"
+    );
+}
+
+static void avg_h264_qpel16_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{   /* Filter-and-average a 16x16 area as four 8x8 quadrants: top-left, top-right, bottom-left, bottom-right. */
+    uint8_t *dst_lower = dst + 8 * dstStride;
+    const uint8_t *src_lower = src + 8 * srcStride;
+    avg_h264_qpel8_h_lowpass_mmi(dst, src, dstStride, srcStride);
+    avg_h264_qpel8_h_lowpass_mmi(dst + 8, src + 8, dstStride, srcStride);
+    avg_h264_qpel8_h_lowpass_mmi(dst_lower, src_lower, dstStride, srcStride);
+    avg_h264_qpel8_h_lowpass_mmi(dst_lower + 8, src_lower + 8, dstStride, srcStride);
+}
+
+static void put_h264_qpel4_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{   /* Vertical 6-tap lowpass, 4 columns wide: reads 9 src rows, stores 4 dst rows. */
+    double ftmp[12];    /* FPR scratch. NOTE(review): ftmp[8]/ftmp[9] are bound below but unused by the asm. */
+    uint64_t tmp[1];    /* GPR scratch for building the two shift counts */
+    uint64_t low32;     /* GPR staging for a source row; mtc1 forwards only its low 32 bits */
+    /* Per row, on 16-bit lanes: (((C + D) << 2) - B - E) * ff_pw_5 + A + F + ff_pw_16, then >> 5 and pack. */
+    src -= 2 * srcStride;   /* row A, the first tap, lies two rows above the first output row */
+    /* NOTE(review): every uld fetches 8 bytes while only 4 are consumed; confirm the over-read is safe. */
+    __asm__ volatile (
+        ".set       push                                                \n\t"
+        ".set       noreorder                                           \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"  /* ftmp0 = 0: zero operand for the byte unpacks */
+        "dli        %[tmp0],    0x02                                    \n\t"
+        "uld        %[low32],   0x00(%[src])                            \n\t"  /* row A */
+        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        "mtc1       %[tmp0],    %[ftmp10]                               \n\t"  /* ftmp10 = 2: left-shift count */
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "dli        %[tmp0],    0x05                                    \n\t"
+        "uld        %[low32],   0x00(%[src])                            \n\t"  /* row B */
+        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        "mtc1       %[tmp0],    %[ftmp11]                               \n\t"  /* ftmp11 = 5: final right-shift count */
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "uld        %[low32],   0x00(%[src])                            \n\t"  /* row C */
+        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "uld        %[low32],   0x00(%[src])                            \n\t"  /* row D */
+        "mtc1       %[low32],   %[ftmp4]                                \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "uld        %[low32],   0x00(%[src])                            \n\t"  /* row E */
+        "mtc1       %[low32],   %[ftmp5]                                \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"  /* interleave rows A..E with zero: bytes -> halfwords */
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "uld        %[low32],   0x00(%[src])                            \n\t"  /* row F */
+        "mtc1       %[low32],   %[ftmp6]                                \n\t"
+        "paddh      %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"  /* C + D */
+        "psllh      %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"  /* (C + D) << 2 */
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"  /* - B */
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"  /* - E */
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_5]              \n\t"  /* * ff_pw_5 */
+        "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]             \n\t"  /* A + ff_pw_16 */
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"  /* + F */
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"  /* arithmetic >> 5 */
+        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"  /* halfwords -> bytes (unsigned saturation) */
+        "swc1       %[ftmp7],   0x00(%[dst])                            \n\t"  /* output row 0: 4 bytes */
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        "uld        %[low32],   0x00(%[src])                            \n\t"  /* output row 1: next source row replaces ftmp1 */
+        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        "paddh      %[ftmp7],   %[ftmp4],       %[ftmp5]                \n\t"
+        "psllh      %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_5]              \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]             \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
+        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "swc1       %[ftmp7],   0x00(%[dst])                            \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        "uld        %[low32],   0x00(%[src])                            \n\t"  /* output row 2: next source row replaces ftmp2 */
+        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        "paddh      %[ftmp7],   %[ftmp5],       %[ftmp6]                \n\t"
+        "psllh      %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_5]              \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]             \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
+        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
+        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "swc1       %[ftmp7],   0x00(%[dst])                            \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        "uld        %[low32],   0x00(%[src])                            \n\t"  /* output row 3: next source row replaces ftmp3 */
+        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        "paddh      %[ftmp7],   %[ftmp6],       %[ftmp1]                \n\t"
+        "psllh      %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_5]              \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]             \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
+        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "swc1       %[ftmp7],   0x00(%[dst])                            \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        ".set       pop                                                 \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
+          [tmp0]"=&r"(tmp[0]),
+          [dst]"+&r"(dst),                  [src]"+&r"(src),
+          [low32]"=&r"(low32)
+        : [dstStride]"r"((mips_reg)dstStride),
+          [srcStride]"r"((mips_reg)srcStride),
+          [ff_pw_5]"f"(ff_pw_5),            [ff_pw_16]"f"(ff_pw_16)
+        : "memory"
+    );
+}
+
+static void put_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{   /* Vertical 6-tap lowpass for an 8-row, 8-column area, done as two 4-column passes. */
+    int w = 2;          /* number of 4-column passes */
+    int h = 8;          /* rows per pass; the asm skips its rows 8..15 section unless h == 0x10 */
+    double ftmp[10];    /* FPR scratch: ftmp0..5 source rows, ftmp6 accumulator, ftmp7 zero, ftmp8/9 shift counts */
+    uint64_t tmp[1];    /* GPR scratch for building the two shift counts */
+    uint64_t low32;     /* GPR staging for a source row; mtc1 forwards only its low 32 bits */
+    /* Per row, on 16-bit lanes: (((C + D) << 2) - B - E) * ff_pw_5 + A + F + ff_pw_16, then >> 5 and pack. */
+    src -= 2 * srcStride;   /* row A, the first tap, lies two rows above the first output row */
+    /* NOTE(review): every uld fetches 8 bytes while only 4 are consumed; confirm the over-read is safe. */
+    while (w--) {
+        __asm__ volatile (
+            ".set       push                                            \n\t"
+            ".set       noreorder                                       \n\t"
+            "dli        %[tmp0],    0x02                                \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"  /* row A */
+            "mtc1       %[low32],   %[ftmp0]                            \n\t"
+            "mtc1       %[tmp0],    %[ftmp8]                            \n\t"  /* ftmp8 = 2: left-shift count */
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "dli        %[tmp0],    0x05                                \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"  /* row B */
+            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            "mtc1       %[tmp0],    %[ftmp9]                            \n\t"  /* ftmp9 = 5: final right-shift count */
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"  /* row C */
+            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t"  /* ftmp7 = 0: zero operand for the byte unpacks */
+            "uld        %[low32],   0x00(%[src])                        \n\t"  /* row D */
+            "mtc1       %[low32],   %[ftmp3]                            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"  /* row E */
+            "mtc1       %[low32],   %[ftmp4]                            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"  /* interleave with zero: bytes -> halfwords */
+            "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"  /* row F */
+            "mtc1       %[low32],   %[ftmp5]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"  /* C + D */
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"  /* (C + D) << 2 */
+            "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"  /* - B */
+            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"  /* - E */
+            "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"  /* * ff_pw_5 */
+            "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"  /* A + ff_pw_16 */
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"  /* + F */
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"  /* arithmetic >> 5 */
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"  /* halfwords -> bytes (unsigned saturation) */
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"  /* output row 0: 4 bytes */
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"  /* from here each row reloads the oldest row register */
+            "mtc1       %[low32],   %[ftmp0]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"  /* output row 1 */
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"  /* output row 2 */
+            "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"  /* output row 3 */
+            "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp3]                            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "punpcklbh  %[ftmp3] ,  %[ftmp3],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"  /* output row 4 */
+            "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp4]                            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"  /* output row 5 */
+            "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp5]                            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"  /* output row 6 */
+            "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp0]                            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"  /* output row 7 */
+            "bne        %[h],       0x10,           2f                  \n\t"  /* h != 16: skip rows 8..15 (h is 8 in this function) */
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"  /* branch delay slot (noreorder): runs on both paths */
+            "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"  /* output row 8 */
+            "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"  /* output row 9 */
+            "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp3]                            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"  /* output row 10 */
+            "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp4]                            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"  /* output row 11 */
+            "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp5]                            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"  /* output row 12 */
+            "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp0]                            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"  /* output row 13 */
+            "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"  /* output row 14 */
+            "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"  /* output row 15 */
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "2:                                                         \n\t"
+            ".set       pop                                             \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
+              [tmp0]"=&r"(tmp[0]),
+              [src]"+&r"(src),              [dst]"+&r"(dst),
+              [h]"+&r"(h),
+              [low32]"=&r"(low32)
+            : [dstStride]"r"((mips_reg)dstStride),
+              [srcStride]"r"((mips_reg)srcStride),
+              [ff_pw_5]"f"(ff_pw_5),        [ff_pw_16]"f"(ff_pw_16)
+            : "memory"
+        );
+
+        src += 4 - (h + 5) * srcStride;     /* undo the h + 5 row steps the asm applied to src; move 4 columns right */
+        dst += 4 - h * dstStride;           /* undo the h row steps the asm applied to dst; move 4 columns right */
+    }
+}
+
+static void put_h264_qpel16_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{   /* 16-wide, 16-row vertical lowpass assembled from four 8x8 quadrant calls. */
+    const uint8_t *src_lower = src + 8 * srcStride;    /* source position for rows 8..15 */
+    uint8_t *dst_lower       = dst + 8 * dstStride;    /* output position for rows 8..15 */
+    put_h264_qpel8_v_lowpass_mmi(dst,           src,           dstStride, srcStride);
+    put_h264_qpel8_v_lowpass_mmi(dst + 8,       src + 8,       dstStride, srcStride);
+    put_h264_qpel8_v_lowpass_mmi(dst_lower,     src_lower,     dstStride, srcStride);
+    put_h264_qpel8_v_lowpass_mmi(dst_lower + 8, src_lower + 8, dstStride, srcStride);
+}
+/*
+ * Vertical 6-tap lowpass for a 4x4 block, averaged into dst.
+ *
+ * For each of the 4 output rows and each of the 4 columns the asm computes
+ *     (a + f - 5 * (b + e) + 20 * (c + d) + 16) >> 5
+ * where a..f are six vertically consecutive source bytes starting two rows
+ * above the output row, saturates the result to a byte (packushb), combines
+ * it with the byte already in dst using pavgb and stores it back.
+ *
+ * dst       destination; 4 bytes are read and written per row, 4 rows
+ * src       source; rows src - 2 * srcStride .. src + 6 * srcStride are read
+ * dstStride byte distance between consecutive dst rows
+ * srcStride byte distance between consecutive src rows
+ *
+ * ff_pw_5 and ff_pw_16 are defined outside this block; presumably they hold
+ * 5 and 16 replicated in every 16-bit lane -- TODO confirm.
+ * NOTE(review): src and dst are accessed with lwc1/swc1 here, while the
+ * 8-wide variant loads src through an unaligned uld; confirm callers
+ * guarantee whatever alignment lwc1 needs for src.
+ */
+static void avg_h264_qpel4_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    double ftmp[10]; /* backing storage for the floating-point asm operands */
+    uint64_t tmp[1]; /* general-purpose scratch used to build shift counts */
+
+    src -= 2 * srcStride; /* the filter window starts two rows above */
+
+    __asm__ volatile (
+        ".set       push                                                \n\t"
+        ".set       noreorder                                           \n\t"
+        "dli        %[tmp0],    0x02                                    \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t" /* ftmp7 = 0 */
+        "mtc1       %[tmp0],    %[ftmp9]                                \n\t" /* ftmp9 = 2, psllh count */
+        "dli        %[tmp0],    0x05                                    \n\t"
+        "lwc1       %[ftmp0],   0x00(%[src])                            \n\t" /* first of five preloaded rows */
+        "mtc1       %[tmp0],    %[ftmp8]                                \n\t" /* ftmp8 = 5, psrah count */
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "lwc1       %[ftmp1],   0x00(%[src])                            \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "lwc1       %[ftmp2],   0x00(%[src])                            \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "lwc1       %[ftmp3],   0x00(%[src])                            \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "lwc1       %[ftmp4],   0x00(%[src])                            \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t" /* interleave bytes with zero */
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
+        "lwc1       %[ftmp5],   0x00(%[src])                            \n\t" /* output row 0: sixth tap row */
+        "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
+        "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]              \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]             \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "lwc1       %[ftmp0],   0x00(%[dst])                            \n\t" /* ftmp0 is free again: reuse for dst */
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "swc1       %[ftmp6],   0x00(%[dst])                            \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        "lwc1       %[ftmp0],   0x00(%[src])                            \n\t" /* output row 1: window shifted by one register */
+        "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]              \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]             \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "lwc1       %[ftmp1],   0x00(%[dst])                            \n\t"
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        "swc1       %[ftmp6],   0x00(%[dst])                            \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        "lwc1       %[ftmp1],   0x00(%[src])                            \n\t" /* output row 2 */
+        "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]                \n\t"
+        "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]              \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]             \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "lwc1       %[ftmp2],   0x00(%[dst])                            \n\t"
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
+        "swc1       %[ftmp6],   0x00(%[dst])                            \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        "lwc1       %[ftmp2],   0x00(%[src])                            \n\t" /* output row 3 (last) */
+        "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]                \n\t"
+        "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]              \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]             \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "lwc1       %[ftmp3],   0x00(%[dst])                            \n\t"
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+        "swc1       %[ftmp6],   0x00(%[dst])                            \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        ".set       pop                                                 \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
+          [tmp0]"=&r"(tmp[0]),
+          [src]"+&r"(src),              [dst]"+&r"(dst)
+        : [dstStride]"r"((mips_reg)dstStride),
+          [srcStride]"r"((mips_reg)srcStride),
+          [ff_pw_5]"f"(ff_pw_5),        [ff_pw_16]"f"(ff_pw_16)
+        : "memory"
+    );
+}
+/*
+ * Vertical 6-tap lowpass for an 8x8 block, averaged into dst.
+ *
+ * The block is processed as two 4-byte-wide column strips (w counts them).
+ * For every output byte the asm computes
+ *     (a + f - 5 * (b + e) + 20 * (c + d) + 16) >> 5
+ * from six vertically consecutive source bytes starting two rows above the
+ * output row, saturates it with packushb, combines it with the existing dst
+ * byte using pavgb and stores the result.
+ *
+ * dst       destination; 8 bytes are read and written per row, 8 rows
+ * src       source; rows src - 2 * srcStride .. src + 10 * srcStride are read
+ * dstStride byte distance between consecutive dst rows
+ * srcStride byte distance between consecutive src rows
+ *
+ * h is fixed at 8 and no asm instruction writes %[h], so the "bne" below is
+ * always taken and the second group of eight rows after it never runs in
+ * this function.
+ * NOTE(review): each "uld" fetches 8 bytes but "mtc1" forwards only 32 bits
+ * of them, so the second strip appears to read up to 4 bytes beyond the
+ * 8 columns that are filtered -- confirm src padding makes this safe.
+ * NOTE(review): ff_pw_5 / ff_pw_16 are defined outside this block;
+ * presumably 5 and 16 in every 16-bit lane -- TODO confirm.
+ */
+static void avg_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    int w = 2; /* number of 4-byte-wide column strips */
+    int h = 8; /* rows produced per strip; also tested by the bne in the asm */
+    double ftmp[10];
+    uint64_t tmp[1];
+    uint64_t low32; /* general-purpose staging register for uld + mtc1 */
+
+    src -= 2 * srcStride; /* the filter window starts two rows above */
+
+    while (w--) {
+        __asm__ volatile (
+            ".set       push                                            \n\t"
+            ".set       noreorder                                       \n\t"
+            "dli        %[tmp0],    0x02                                \n\t"
+            "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t" /* ftmp7 = 0 */
+            "mtc1       %[tmp0],    %[ftmp9]                            \n\t" /* ftmp9 = 2, psllh count */
+            "dli        %[tmp0],    0x05                                \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* first of five preloaded rows */
+            "mtc1       %[low32],   %[ftmp0]                            \n\t"
+            "mtc1       %[tmp0],    %[ftmp8]                            \n\t" /* ftmp8 = 5, psrah count */
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp3]                            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp4]                            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t" /* interleave bytes with zero */
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
+            "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* output row 0: sixth tap row */
+            "mtc1       %[low32],   %[ftmp5]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "lwc1       %[ftmp0],   0x00(%[dst])                        \n\t" /* ftmp0 is free again: reuse for dst */
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* output row 1 */
+            "mtc1       %[low32],   %[ftmp0]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "lwc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* output row 2 */
+            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "lwc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* output row 3 */
+            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "lwc1       %[ftmp3],   0x00(%[dst])                        \n\t"
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* output row 4 */
+            "mtc1       %[low32],   %[ftmp3]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "lwc1       %[ftmp4],   0x00(%[dst])                        \n\t"
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* output row 5 */
+            "mtc1       %[low32],   %[ftmp4]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "lwc1       %[ftmp5],   0x00(%[dst])                        \n\t"
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* output row 6 */
+            "mtc1       %[low32],   %[ftmp5]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "lwc1       %[ftmp0],   0x00(%[dst])                        \n\t"
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* output row 7 */
+            "mtc1       %[low32],   %[ftmp0]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "lwc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            "bne        %[h],       0x10,           2f                  \n\t" /* h != 16: skip the rows below */
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t" /* fills the delay slot (.set noreorder): dst advances on both paths */
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* rows 8..15: only reached when h == 16 */
+            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "lwc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "lwc1       %[ftmp3],   0x00(%[dst])                        \n\t"
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp3]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "lwc1       %[ftmp4],   0x00(%[dst])                        \n\t"
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp4]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "lwc1       %[ftmp5],   0x00(%[dst])                        \n\t"
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp5]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "lwc1       %[ftmp0],   0x00(%[dst])                        \n\t"
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp0]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "lwc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "lwc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t"
+            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
+            "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+            "lwc1       %[ftmp3],   0x00(%[dst])                        \n\t"
+            "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "2:                                                         \n\t"
+            ".set       pop                                             \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
+              [tmp0]"=&r"(tmp[0]),
+              [src]"+&r"(src),              [dst]"+&r"(dst),
+              [h]"+&r"(h),
+              [low32]"=&r"(low32)
+            : [dstStride]"r"((mips_reg)dstStride),
+              [srcStride]"r"((mips_reg)srcStride),
+              [ff_pw_5]"f"(ff_pw_5),        [ff_pw_16]"f"(ff_pw_16)
+            : "memory"
+        );
+
+        src += 4 - (h + 5) * srcStride; /* rewind the h + 5 rows consumed, step 4 columns right */
+        dst += 4 - h * dstStride; /* rewind the h rows written, step 4 columns right */
+    }
+}
+
+static void avg_h264_qpel16_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{   /* 16-wide, 16-row case built from four calls to the qpel8 helper: upper left/right, then lower left/right. */
+    uint8_t *const dst_lower       = dst + 8 * dstStride;   /* 8 rows further down */
+    const uint8_t *const src_lower = src + 8 * srcStride;
+    avg_h264_qpel8_v_lowpass_mmi(dst,           src,           dstStride, srcStride);
+    avg_h264_qpel8_v_lowpass_mmi(dst + 8,       src + 8,       dstStride, srcStride);
+    avg_h264_qpel8_v_lowpass_mmi(dst_lower,     src_lower,     dstStride, srcStride);
+    avg_h264_qpel8_v_lowpass_mmi(dst_lower + 8, src_lower + 8, dstStride, srcStride);
+}
+
+static void put_h264_qpel4_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{   /* 4x4 "hv" 6-tap filter: horizontal pass in MMI asm into a 16-bit scratch block, vertical pass in C. */
+    INIT_CLIP   /* macro defined outside this block; presumably prepares the clipping used by op2_put -- confirm */
+    int i;
+    int16_t _tmp[36];       /* 9 rows x 4 columns of horizontal results */
+    int16_t *tmp = _tmp;
+    double ftmp[10];        /* storage bound to the FP/MMI register operands of the asm */
+    uint64_t tmp0;          /* asm row counter (shadowed by the tmp0 sample inside the loop below) */
+    uint64_t low32;         /* integer staging register for the uld/mtc1 pairs */
+
+    src -= 2*srcStride;     /* start two rows above the block; 9 rows are filtered in total */
+
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t" /* ftmp0 = 0 */
+        "dli        %[tmp0],    0x09                                    \n\t" /* 9 rows to process */
+        "1:                                                             \n\t"
+        "uld        %[low32],   -0x02(%[src])                           \n\t" /* six 4-pixel groups at x-2 .. x+3 -> ftmp1 .. ftmp6 */
+        "mtc1       %[low32],   %[ftmp1]                                \n\t" /* mtc1 transfers only the low 32 bits of the 8 bytes fetched by uld */
+        "uld        %[low32],   -0x01(%[src])                           \n\t"
+        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        "uld        %[low32],   0x00(%[src])                            \n\t"
+        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        "uld        %[low32],   0x01(%[src])                            \n\t"
+        "mtc1       %[low32],   %[ftmp4]                                \n\t"
+        "uld        %[low32],   0x02(%[src])                            \n\t"
+        "mtc1       %[low32],   %[ftmp5]                                \n\t"
+        "uld        %[low32],   0x03(%[src])                            \n\t"
+        "mtc1       %[low32],   %[ftmp6]                                \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t" /* interleave with zero: widen 4 pixels to 16-bit lanes */
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t" /* centre pair: p[0] + p[1] */
+        "paddsh     %[ftmp8],   %[ftmp2],       %[ftmp5]                \n\t" /* p[-1] + p[2] */
+        "paddsh     %[ftmp9],   %[ftmp1],       %[ftmp6]                \n\t" /* p[-2] + p[3] */
+        "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_20]             \n\t" /* centre pair * ff_pw_20 */
+        "pmullh     %[ftmp8],   %[ftmp8],       %[ff_pw_5]              \n\t" /* inner pair * ff_pw_5 */
+        "psubsh     %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "paddsh     %[ftmp9],   %[ftmp7],       %[ftmp9]                \n\t" /* ff_pw_20*(p0+p1) - ff_pw_5*(p-1+p2) + (p-2+p3), no rounding term */
+        "sdc1       %[ftmp9],   0x00(%[tmp])                            \n\t" /* store the four 16-bit results of this row */
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t" /* one row done */
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t" /* next source row */
+        PTR_ADDU   "%[tmp],     %[tmp],         %[tmpStride]            \n\t" /* next scratch row */
+        "bnez       %[tmp0],    1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [tmp0]"=&r"(tmp0),
+          [tmp]"+&r"(tmp),                  [src]"+&r"(src),
+          [low32]"=&r"(low32)
+        : [tmpStride]"r"(8),                /* bytes per scratch row: 4 x int16_t */
+          [srcStride]"r"((mips_reg)srcStride),
+          [ff_pw_20]"f"(ff_pw_20),          [ff_pw_5]"f"(ff_pw_5)
+        : "memory"
+    );
+
+    tmp -= 28;  /* the asm left tmp at _tmp + 36; step back to _tmp + 8, the third scratch row */
+
+    for (i=0; i<4; i++) {   /* one output column per iteration */
+        const int16_t tmpB= tmp[-8];    /* nine vertically adjacent scratch samples of this column (row pitch 4) */
+        const int16_t tmpA= tmp[-4];
+        const int16_t tmp0= tmp[ 0];
+        const int16_t tmp1= tmp[ 4];
+        const int16_t tmp2= tmp[ 8];
+        const int16_t tmp3= tmp[12];
+        const int16_t tmp4= tmp[16];
+        const int16_t tmp5= tmp[20];
+        const int16_t tmp6= tmp[24];
+        op2_put(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));    /* vertical 6-tap sum handed to op2_put (defined outside this block) */
+        op2_put(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));
+        op2_put(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));
+        op2_put(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));
+        dst++;  /* next output column */
+        tmp++;  /* next scratch column */
+    }
+}
+
+static void put_h264_qpel8or16_hv1_lowpass_mmi(int16_t *tmp,
+        const uint8_t *src, ptrdiff_t tmpStride, ptrdiff_t srcStride, int size)
+{   /* Vertical 6-tap pass of the 8/16 "hv" filter: writes 16-bit intermediates to tmp, one 4-column strip per asm invocation. */
+    int w = (size + 8) >> 2;    /* number of 4-column strips, i.e. size + 8 columns in total */
+    double ftmp[11];            /* storage bound to the FP/MMI operands (ftmp[8] and ftmp[9] are bound but never referenced by the asm) */
+    uint64_t tmp0;              /* integer scratch used to build the shift count */
+    uint64_t low32;             /* integer staging register for the uld/mtc1 pairs */
+    /* tmpStride is not used here: rows of tmp are addressed with the fixed 0x30-byte pitch in the stores below. */
+    src -= 2 * srcStride + 2;   /* start two rows above and two columns left of src */
+
+    while (w--) {
+        __asm__ volatile (
+            "dli        %[tmp0],    0x02                                \n\t" /* shift count 2, moved into ftmp10 below */
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* input row 0 -> ftmp0 */
+            "mtc1       %[low32],   %[ftmp0]                            \n\t" /* mtc1 transfers only the low 32 bits of the 8 bytes fetched by uld */
+            "mtc1       %[tmp0],    %[ftmp10]                           \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t" /* ftmp7 = 0, zero operand for punpcklbh */
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* input row 1 -> ftmp1 */
+            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* input row 2 -> ftmp2 */
+            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* input row 3 -> ftmp3 */
+            "mtc1       %[low32],   %[ftmp3]                            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* input row 4 -> ftmp4 */
+            "mtc1       %[low32],   %[ftmp4]                            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t" /* interleave with zero: widen 4 pixels to 16-bit lanes */
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
+            "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* output row 0: sixth input row -> ftmp5; rows a..f now sit in ftmp0..ftmp5 */
+            "mtc1       %[low32],   %[ftmp5]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t" /* c + d */
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t" /* 4 * (c + d) */
+            "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t" /* a + ff_pw_16 */
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t" /* - b */
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t" /* - e */
+            "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t" /* ff_pw_5 * (4*(c+d) - b - e) */
+            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t" /* a + f + ff_pw_16 */
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t" /* sum; equals 20*(c+d) - 5*(b+e) + (a+f) + 16 if ff_pw_5/ff_pw_16 hold 5/16 per lane (defined outside this block) */
+            "sdc1       %[ftmp6],   0x00(%[tmp])                        \n\t" /* four 16-bit results -> tmp row 0 */
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* output row 1: same computation, register roles rotated by one */
+            "mtc1       %[low32],   %[ftmp0]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "sdc1       %[ftmp6],   0x30(%[tmp])                        \n\t" /* tmp rows are 0x30 bytes (24 int16_t) apart */
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* output row 2 */
+            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "sdc1       %[ftmp6],   0x60(%[tmp])                        \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* output row 3 */
+            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "sdc1       %[ftmp6],   0x90(%[tmp])                        \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* output row 4 */
+            "mtc1       %[low32],   %[ftmp3]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "sdc1       %[ftmp6],   0xc0(%[tmp])                        \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* output row 5 */
+            "mtc1       %[low32],   %[ftmp4]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "sdc1       %[ftmp6],   0xf0(%[tmp])                        \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* output row 6 */
+            "mtc1       %[low32],   %[ftmp5]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "sdc1       %[ftmp6],   0x120(%[tmp])                       \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* output row 7 */
+            "mtc1       %[low32],   %[ftmp0]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "sdc1       %[ftmp6],   0x150(%[tmp])                       \n\t"
+            "bne        %[size],    0x10,           2f                  \n\t" /* size != 16: stop after 8 output rows */
+            /* size == 16: eight more output rows follow, continuing the same register rotation. */
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* output row 8 */
+            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "sdc1       %[ftmp6],   0x180(%[tmp])                       \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* output row 9 */
+            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "sdc1       %[ftmp6],   0x1b0(%[tmp])                       \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* output row 10 */
+            "mtc1       %[low32],   %[ftmp3]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "sdc1       %[ftmp6],   0x1e0(%[tmp])                       \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* output row 11 */
+            "mtc1       %[low32],   %[ftmp4]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "sdc1       %[ftmp6],   0x210(%[tmp])                       \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* output row 12 */
+            "mtc1       %[low32],   %[ftmp5]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "sdc1       %[ftmp6],   0x240(%[tmp])                       \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* output row 13 */
+            "mtc1       %[low32],   %[ftmp0]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
+            "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "sdc1       %[ftmp6],   0x270(%[tmp])                       \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* output row 14 */
+            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
+            "sdc1       %[ftmp6],   0x2a0(%[tmp])                       \n\t"
+            "uld        %[low32],   0x00(%[src])                        \n\t" /* output row 15 */
+            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
+            "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
+            "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ff_pw_5]          \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
+            PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
+            "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
+            "sdc1       %[ftmp6],   0x2d0(%[tmp])                       \n\t"
+            "2:                                                         \n\t" /* common exit */
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
+              [ftmp10]"=&f"(ftmp[10]),
+              [tmp0]"=&r"(tmp0),
+              [src]"+&r"(src),
+              [low32]"=&r"(low32)
+            : [tmp]"r"(tmp),                [size]"r"(size),
+              [srcStride]"r"((mips_reg)srcStride),
+              [ff_pw_5]"f"(ff_pw_5),        [ff_pw_16]"f"(ff_pw_16)
+            : "memory"
+        );
+
+        tmp += 4;                               /* next strip: 4 int16_t columns to the right */
+        src += 4 - (size + 5) * srcStride;      /* undo the size + 5 row advances made in the asm, then step 4 pixels right */
+    }
+}
+
+static void put_h264_qpel8or16_hv2_lowpass_mmi(uint8_t *dst,
+        int16_t *tmp, ptrdiff_t dstStride, ptrdiff_t tmpStride, int size)
+{   /* Horizontal pass over the 16-bit rows in tmp: produces 8 packed pixels per row into dst, one 8-column strip at a time. */
+    int w = size >> 4;      /* extra strips after the first: 0 for size 8, 1 for size 16 */
+    double ftmp[10];        /* storage bound to the FP/MMI register operands of the asm */
+    uint64_t tmp0;          /* integer scratch used to build the two shift counts */
+    /* tmpStride is not used here: the asm advances tmp by a fixed 0x30 bytes per row. */
+    do {
+        int h = size;       /* rows left in this strip; counted down inside the asm */
+
+        __asm__ volatile (
+            "dli        %[tmp0],    0x02                                \n\t"
+            "mtc1       %[tmp0],    %[ftmp8]                            \n\t" /* ftmp8 = shift count 2 */
+            "dli        %[tmp0],    0x06                                \n\t"
+            "mtc1       %[tmp0],    %[ftmp9]                            \n\t" /* ftmp9 = shift count 6 */
+            "1:                                                         \n\t" /* per row; t[k] below = k-th int16_t of the current tmp row */
+            "ldc1       %[ftmp0],   0x00(%[tmp])                        \n\t" /* t[0..3] */
+            "ldc1       %[ftmp3],   0x08(%[tmp])                        \n\t" /* t[4..7] */
+            "ldc1       %[ftmp6],   0x10(%[tmp])                        \n\t" /* t[8..11] */
+            "gsldlc1    %[ftmp1],   0x09(%[tmp])                        \n\t" /* left/right pair = unaligned 8-byte load: t[1..4] */
+            "gsldrc1    %[ftmp1],   0x02(%[tmp])                        \n\t"
+            "gsldlc1    %[ftmp4],   0x11(%[tmp])                        \n\t" /* t[5..8] */
+            "gsldrc1    %[ftmp4],   0x0a(%[tmp])                        \n\t"
+            "gsldlc1    %[ftmp5],   0x19(%[tmp])                        \n\t" /* t[9..12] */
+            "gsldrc1    %[ftmp5],   0x12(%[tmp])                        \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp4]            \n\t" /* A_lo = t[0..3] + t[5..8]   (outer taps, pixels 0..3) */
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" /* B_lo = t[1..4] + t[4..7] */
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t" /* A_hi = t[4..7] + t[9..12]  (outer taps, pixels 4..7) */
+            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp6]            \n\t" /* B_hi = t[5..8] + t[8..11] */
+            "gsldlc1    %[ftmp2],   0x0b(%[tmp])                        \n\t" /* t[2..5] */
+            "gsldrc1    %[ftmp2],   0x04(%[tmp])                        \n\t"
+            "gsldlc1    %[ftmp6],   0x0d(%[tmp])                        \n\t" /* t[3..6] */
+            "gsldrc1    %[ftmp6],   0x06(%[tmp])                        \n\t"
+            "gsldlc1    %[ftmp5],   0x13(%[tmp])                        \n\t" /* t[6..9] */
+            "gsldrc1    %[ftmp5],   0x0c(%[tmp])                        \n\t"
+            "gsldlc1    %[ftmp7],   0x15(%[tmp])                        \n\t" /* t[7..10] */
+            "gsldrc1    %[ftmp7],   0x0e(%[tmp])                        \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t" /* C_lo = t[2..5] + t[3..6]   (centre taps) */
+            "paddh      %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t" /* C_hi = t[6..9] + t[7..10] */
+            "psubh      %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t" /* A - B */
+            "psubh      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
+            "psrah      %[ftmp0],   %[ftmp0],       %[ftmp8]            \n\t" /* (A - B) >> 2 */
+            "psrah      %[ftmp3],   %[ftmp3],       %[ftmp8]            \n\t"
+            "psubh      %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t" /* ... - B */
+            "psubh      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
+            "paddsh     %[ftmp0],   %[ftmp0],       %[ftmp2]            \n\t" /* ... + C (saturating add) */
+            "paddsh     %[ftmp3] ,  %[ftmp3],       %[ftmp5]            \n\t"
+            "psrah      %[ftmp0],   %[ftmp0],       %[ftmp8]            \n\t" /* ... >> 2 */
+            "psrah      %[ftmp3],   %[ftmp3],       %[ftmp8]            \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp2]            \n\t" /* ... + C */
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
+            "psrah      %[ftmp0],   %[ftmp0],       %[ftmp9]            \n\t" /* ... >> 6; overall approximates (A - 5*B + 20*C) >> 10 in 16-bit steps */
+            "psrah      %[ftmp3],   %[ftmp3],       %[ftmp9]            \n\t"
+            "packushb   %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t" /* pack both halves into 8 bytes with unsigned saturation */
+            "addi       %[h],       %[h],           -0x01               \n\t" /* one row done */
+            "gssdlc1    %[ftmp0],   0x07(%[dst])                        \n\t" /* left/right pair = unaligned 8-byte store to dst */
+            "gssdrc1    %[ftmp0],   0x00(%[dst])                        \n\t"
+            PTR_ADDIU  "%[tmp],     %[tmp],         0x30                \n\t" /* next tmp row: 0x30 bytes = 24 int16_t */
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t" /* next dst row */
+            "bnez       %[h],       1b                                  \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
+              [tmp0]"=&r"(tmp0),
+              [tmp]"+&r"(tmp),              [dst]"+&r"(dst),
+              [h]"+&r"(h)
+            : [dstStride]"r"((mips_reg)dstStride)
+            : "memory"
+        );
+
+        tmp += 8 - size * 24;           /* rewind the size rows walked by the asm (24 int16_t each), then move 8 columns right */
+        dst += 8 - size * dstStride;    /* same for dst: back to the first row, 8 pixels to the right */
+    } while (w--);                      /* body runs once for size 8, twice for size 16 */
+}
+
+static void put_h264_qpel8or16_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp,
+        const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t tmpStride,
+        ptrdiff_t srcStride, int size)
+{   /* Two-pass "hv" filter for a size x size block (callers in view pass 8 or 16); tmp is the 16-bit scratch shared by both passes. */
+    put_h264_qpel8or16_hv1_lowpass_mmi(tmp, src, tmpStride, srcStride, size);   /* pass 1: src -> tmp */
+    put_h264_qpel8or16_hv2_lowpass_mmi(dst, tmp, dstStride, tmpStride, size);   /* pass 2: tmp -> dst; must run after pass 1 */
+}
+
+static void put_h264_qpel8_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp,
+        const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t tmpStride,
+        ptrdiff_t srcStride)
+{   /* 8x8 entry point: forwards everything to the shared 8/16 routine with the size fixed to 8. */
+    const int size = 8;
+    put_h264_qpel8or16_hv_lowpass_mmi(dst, tmp, src, dstStride, tmpStride, srcStride, size);
+}
+
+static void put_h264_qpel16_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp,
+        const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t tmpStride,
+        ptrdiff_t srcStride)
+{   /* 16x16 entry point: forwards everything to the shared 8/16 routine with the size fixed to 16. */
+    const int size = 16;
+    put_h264_qpel8or16_hv_lowpass_mmi(dst, tmp, src, dstStride, tmpStride, srcStride, size);
+}
+
+static void put_h264_qpel8_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src,
+        const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride)
+{
+    int h = 8; /* rows left; decremented once per loop pass, loop repeats while h > 0 */
+    double ftmp[9];
+    uint64_t tmp[1];
+    uint64_t low32;
+    /* Per 8-pixel row, as read from the asm: (20*(s0+s1) - 5*(s-1+s2) + (s-2+s3) + 16) >> 5, packed, pavgb with src2 (assumes ff_pw_5/ff_pw_16 hold packed 5s/16s). */
+    __asm__ volatile (
+        "dli        %[tmp0],    0x02                                    \n\t"
+        "mtc1       %[tmp0],    %[ftmp7]                                \n\t" /* ftmp7 = 2: psllh count */
+        "dli        %[tmp0],    0x05                                    \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t" /* ftmp0 = 0: second operand of the unpacks */
+        "mtc1       %[tmp0],    %[ftmp8]                                \n\t" /* ftmp8 = 5: psrah count */
+        "1:                                                             \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[src])                            \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[src])                            \n\t"
+        "gsldlc1    %[ftmp3],   0x08(%[src])                            \n\t"
+        "gsldrc1    %[ftmp3],   0x01(%[src])                            \n\t"
+        "punpckhbh  %[ftmp2],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp4],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "psllh      %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "psllh      %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "gsldlc1    %[ftmp3],   0x06(%[src])                            \n\t"
+        "gsldrc1    %[ftmp3],   -0x01(%[src])                           \n\t"
+        "gsldlc1    %[ftmp5],   0x09(%[src])                            \n\t"
+        "gsldrc1    %[ftmp5],   0x02(%[src])                            \n\t"
+        "punpckhbh  %[ftmp4],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp6],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "pmullh     %[ftmp2],   %[ftmp2],       %[ff_pw_5]              \n\t"
+        "pmullh     %[ftmp1],   %[ftmp1],       %[ff_pw_5]              \n\t"
+        "uld        %[low32],   -0x02(%[src])                           \n\t" /* 8-byte load; only the word moved by mtc1 is unpacked below */
+        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        "uld        %[low32],   0x07(%[src])                            \n\t"
+        "mtc1       %[low32],   %[ftmp6]                                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]             \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]             \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
+        "psrah      %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "gsldlc1    %[ftmp5],   0x07(%[src2])                           \n\t"
+        "gsldrc1    %[ftmp5],   0x00(%[src2])                           \n\t"
+        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[dstStride]            \n\t" /* src steps by dstStride; there is no separate srcStride parameter */
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        PTR_ADDU   "%[h],       %[h],           -0x01                   \n\t"
+        "sdc1       %[ftmp1],   0x00(%[dst])                            \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[src2Stride]           \n\t"
+        "bgtz       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),
+          [tmp0]"=&r"(tmp[0]),
+          [src]"+&r"(src),                  [dst]"+&r"(dst),
+          [src2]"+&r"(src2),                [h]"+&r"(h),
+          [low32]"=&r"(low32)
+        : [src2Stride]"r"((mips_reg)src2Stride),
+          [dstStride]"r"((mips_reg)dstStride),
+          [ff_pw_5]"f"(ff_pw_5),            [ff_pw_16]"f"(ff_pw_16)
+        : "memory"
+    );
+}
+
+static void put_pixels8_l2_shift5_mmi(uint8_t *dst, int16_t *src16,
+        const uint8_t *src8, ptrdiff_t dstStride, ptrdiff_t src8Stride, int h)
+{
+    double ftmp[7];
+    uint64_t tmp0;
+    /* Two 8-pixel rows per pass: src16 values are shifted right by 5, packed to bytes, pavgb'ed with src8 and stored to dst. */
+    do {
+        __asm__ volatile (
+            "dli        %[tmp0],    0x05                                \n\t"
+            "gsldlc1    %[ftmp0],   0x07(%[src16])                      \n\t"
+            "gsldrc1    %[ftmp0],   0x00(%[src16])                      \n\t"
+            "mtc1       %[tmp0],    %[ftmp6]                            \n\t" /* ftmp6 = 5: psrah count */
+            "gsldlc1    %[ftmp1],   0x0f(%[src16])                      \n\t"
+            "gsldrc1    %[ftmp1],   0x08(%[src16])                      \n\t"
+            "gsldlc1    %[ftmp2],   0x37(%[src16])                      \n\t" /* second row starts 0x30 bytes after the first */
+            "gsldrc1    %[ftmp2],   0x30(%[src16])                      \n\t"
+            "gsldlc1    %[ftmp3],   0x3f(%[src16])                      \n\t"
+            "gsldrc1    %[ftmp3],   0x38(%[src16])                      \n\t"
+            "psrah      %[ftmp0],   %[ftmp0],       %[ftmp6]            \n\t"
+            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
+            "psrah      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"
+            "psrah      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
+            "packushb   %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"
+            "packushb   %[ftmp2],   %[ftmp2],       %[ftmp3]            \n\t"
+            "ldc1       %[ftmp5],   0x00(%[src8])                       \n\t"
+            "gsldxc1    %[ftmp4],   0x00(%[src8],   %[src8Stride])      \n\t"
+            "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
+            "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
+            "sdc1       %[ftmp0],   0x00(%[dst])                        \n\t"
+            "gssdxc1    %[ftmp2],   0x00(%[dst],    %[dstStride])       \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),
+              [tmp0]"=&r"(tmp0)
+            : [src8]"r"(src8),              [src16]"r"(src16),
+              [dst]"r"(dst),
+              [src8Stride]"r"((mips_reg)src8Stride),
+              [dstStride]"r"((mips_reg)dstStride)
+            : "memory"
+        );
+        /* The asm leaves the pointers untouched; step two rows here (48 int16_t = 2 * 0x30 bytes). */
+        src8  += 2 * src8Stride;
+        src16 += 48;
+        dst   += 2 * dstStride;
+    } while (h -= 2); /* ends only when h hits exactly 0, so h must be positive and even */
+}
+
+/* 16x16 version: runs the 8x8 l2 kernel on each quadrant (rows 0-7, then rows 8-15). */
+static void put_h264_qpel16_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src,
+        const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride)
+{
+    int band;
+
+    for (band = 0; band < 2; band++) {
+        /* src is offset with dstStride, the same stride the 8-wide kernel applies to it */
+        const uint8_t *s  = src  + band * 8 * dstStride;
+        const uint8_t *s2 = src2 + band * 8 * src2Stride;
+        uint8_t       *d  = dst  + band * 8 * dstStride;
+        put_h264_qpel8_h_lowpass_l2_mmi(d,     s,     s2,     dstStride, src2Stride);
+        put_h264_qpel8_h_lowpass_l2_mmi(d + 8, s + 8, s2 + 8, dstStride, src2Stride);
+    }
+}
+
+static void put_pixels16_l2_shift5_mmi(uint8_t *dst, int16_t *src16,
+        const uint8_t *src8, ptrdiff_t dstStride, ptrdiff_t src8Stride, int h)
+{
+    int x; /* element offset of the 8-wide half being processed: 0, then 8 */
+    for (x = 0; x < 16; x += 8)
+        put_pixels8_l2_shift5_mmi(dst + x, src16 + x, src8 + x, dstStride, src8Stride, h);
+}
+
+static void avg_h264_qpel4_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src,
+        int dstStride, int srcStride)
+{
+    INIT_CLIP
+    int i;
+    int16_t _tmp[36]; /* 9 rows of 4 intermediate values written by the asm loop */
+    int16_t *tmp = _tmp;
+    double ftmp[10];
+    uint64_t tmp0; /* asm row counter, loaded with 9 */
+    uint64_t low32;
+    /* Start two rows above the block; the asm loop then walks 9 consecutive source rows. */
+    src -= 2*srcStride;
+    /* Horizontal pass, per row and for 4 pixels: 20*(s0+s1) - 5*(s-1+s2) + (s-2+s3), stored as int16 (assumes ff_pw_20/ff_pw_5 hold packed 20s/5s). */
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "dli        %[tmp0],    0x09                                    \n\t"
+        "1:                                                             \n\t"
+        "uld        %[low32],   -0x02(%[src])                           \n\t"
+        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        "uld        %[low32],   -0x01(%[src])                           \n\t"
+        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        "uld        %[low32],   0x00(%[src])                            \n\t"
+        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        "uld        %[low32],   0x01(%[src])                            \n\t"
+        "mtc1       %[low32],   %[ftmp4]                                \n\t"
+        "uld        %[low32],   0x02(%[src])                            \n\t"
+        "mtc1       %[low32],   %[ftmp5]                                \n\t"
+        "uld        %[low32],   0x03(%[src])                            \n\t"
+        "mtc1       %[low32],   %[ftmp6]                                \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
+        "paddsh     %[ftmp8],   %[ftmp2],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp9],   %[ftmp1],       %[ftmp6]                \n\t"
+        "pmullh     %[ftmp7],   %[ftmp7],       %[ff_pw_20]             \n\t"
+        "pmullh     %[ftmp8],   %[ftmp8],       %[ff_pw_5]              \n\t"
+        "psubsh     %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "paddsh     %[ftmp9],   %[ftmp7],       %[ftmp9]                \n\t"
+        "sdc1       %[ftmp9],   0x00(%[tmp])                            \n\t"
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
+        PTR_ADDU   "%[tmp],     %[tmp],         %[tmpStride]            \n\t" /* tmpStride is 8 bytes = one row of 4 int16_t */
+        "bnez       %[tmp0],    1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [tmp0]"=&r"(tmp0),
+          [tmp]"+&r"(tmp),                  [src]"+&r"(src),
+          [low32]"=&r"(low32)
+        : [tmpStride]"r"(8),
+          [srcStride]"r"((mips_reg)srcStride),
+          [ff_pw_20]"f"(ff_pw_20),          [ff_pw_5]"f"(ff_pw_5)
+        : "memory"
+    );
+    /* The asm left tmp at _tmp + 36; rewind to _tmp + 8 so that tmp[-8] .. tmp[24] span all 9 rows. */
+    tmp -= 28;
+    /* Vertical pass, one column per iteration. NOTE(review): the local tmp0 below shadows the asm counter tmp0 declared above. */
+    for (i=0; i<4; i++) {
+        const int16_t tmpB= tmp[-8];
+        const int16_t tmpA= tmp[-4];
+        const int16_t tmp0= tmp[ 0];
+        const int16_t tmp1= tmp[ 4];
+        const int16_t tmp2= tmp[ 8];
+        const int16_t tmp3= tmp[12];
+        const int16_t tmp4= tmp[16];
+        const int16_t tmp5= tmp[20];
+        const int16_t tmp6= tmp[24];
+        op2_avg(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));
+        op2_avg(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));
+        op2_avg(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));
+        op2_avg(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));
+        dst++;
+        tmp++;
+    }
+}
+
+static void avg_h264_qpel8or16_hv2_lowpass_mmi(uint8_t *dst,
+        int16_t *tmp, ptrdiff_t dstStride, ptrdiff_t tmpStride, int size)
+{
+    int w = size >> 4; /* additional 8-column passes after the first: 0 for size 8, 1 for size 16 */
+    double ftmp[11];
+    uint64_t tmp0;
+    /* Reads int16 rows from tmp, reduces them to 8 bytes per row and pavgb's them into dst. tmpStride is never read; tmp rows are stepped by a fixed 0x30 bytes. */
+    do {
+        int h = size; /* rows handled by one run of the asm loop */
+        __asm__ volatile (
+            "dli        %[tmp0],    0x02                                \n\t"
+            "mtc1       %[tmp0],    %[ftmp9]                            \n\t" /* ftmp9 = 2: count for the intermediate psrah steps */
+            "dli        %[tmp0],    0x06                                \n\t"
+            "mtc1       %[tmp0],    %[ftmp10]                           \n\t" /* ftmp10 = 6: count for the final psrah */
+            "1:                                                         \n\t"
+            "ldc1       %[ftmp0],   0x00(%[tmp])                        \n\t"
+            "ldc1       %[ftmp3],   0x08(%[tmp])                        \n\t"
+            "gsldlc1    %[ftmp1],   0x09(%[tmp])                        \n\t"
+            "gsldrc1    %[ftmp1],   0x02(%[tmp])                        \n\t"
+            "gsldlc1    %[ftmp4],   0x11(%[tmp])                        \n\t"
+            "gsldrc1    %[ftmp4],   0x0a(%[tmp])                        \n\t"
+            "ldc1       %[ftmp7],   0x10(%[tmp])                        \n\t"
+            "gsldlc1    %[ftmp8],   0x19(%[tmp])                        \n\t"
+            "gsldrc1    %[ftmp8],   0x12(%[tmp])                        \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp4]            \n\t"
+            "paddh      %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp8]            \n\t"
+            "paddh      %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
+            "gsldlc1    %[ftmp2],   0x0b(%[tmp])                        \n\t"
+            "gsldrc1    %[ftmp2],   0x04(%[tmp])                        \n\t"
+            "gsldlc1    %[ftmp5],   0x13(%[tmp])                        \n\t"
+            "gsldrc1    %[ftmp5],   0x0c(%[tmp])                        \n\t"
+            "gsldlc1    %[ftmp7],   0x0d(%[tmp])                        \n\t"
+            "gsldrc1    %[ftmp7],   0x06(%[tmp])                        \n\t"
+            "gsldlc1    %[ftmp8],   0x15(%[tmp])                        \n\t"
+            "gsldrc1    %[ftmp8],   0x0e(%[tmp])                        \n\t"
+            "paddh      %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "paddh      %[ftmp5],   %[ftmp5],       %[ftmp8]            \n\t"
+            "psubh      %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"
+            "psubh      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
+            "psrah      %[ftmp0],   %[ftmp0],       %[ftmp9]            \n\t"
+            "psrah      %[ftmp3],   %[ftmp3],       %[ftmp9]            \n\t"
+            "psubh      %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"
+            "psubh      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
+            "paddsh     %[ftmp0],   %[ftmp0],       %[ftmp2]            \n\t"
+            "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
+            "psrah      %[ftmp0],   %[ftmp0],       %[ftmp9]            \n\t"
+            "psrah      %[ftmp3],   %[ftmp3],       %[ftmp9]            \n\t"
+            "paddh      %[ftmp0],   %[ftmp0],       %[ftmp2]            \n\t"
+            "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
+            "psrah      %[ftmp0],   %[ftmp0],       %[ftmp10]           \n\t"
+            "psrah      %[ftmp3],   %[ftmp3],       %[ftmp10]           \n\t"
+            "packushb   %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
+            "ldc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp6]            \n\t" /* the avg step: combine with the bytes already in dst */
+            "sdc1       %[ftmp0],   0x00(%[dst])                        \n\t"
+            "addi       %[h],       %[h],           -0x01               \n\t"
+            PTR_ADDI   "%[tmp],     %[tmp],         0x30                \n\t" /* next tmp row: 0x30 bytes = 24 int16_t */
+            PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
+            "bnez       %[h],       1b                                  \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
+              [ftmp10]"=&f"(ftmp[10]),
+              [tmp0]"=&r"(tmp0),
+              [tmp]"+&r"(tmp),              [dst]"+&r"(dst),
+              [h]"+&r"(h)
+            : [dstStride]"r"((mips_reg)dstStride)
+            : "memory"
+        );
+        /* The asm advanced both pointers by size rows; rewind them and move 8 elements right for the next pass. */
+        tmp += 8 - size * 24;
+        dst += 8 - size * dstStride;
+    } while (w--);
+}
+
+/* Two-stage avg hv lowpass: the shared put hv1 stage (src, tmp), then the avg hv2 stage (tmp into dst). */
+static void avg_h264_qpel8or16_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp, const uint8_t *src,
+        ptrdiff_t dstStride, ptrdiff_t tmpStride, ptrdiff_t srcStride, int size)
+{
+    put_h264_qpel8or16_hv1_lowpass_mmi(tmp, src, tmpStride, srcStride, size);
+    avg_h264_qpel8or16_hv2_lowpass_mmi(dst, tmp, dstStride, tmpStride, size);
+}
+
+/* 8x8 avg variant: forwards to the size-generic avg hv lowpass with size = 8. */
+static void avg_h264_qpel8_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp,
+        const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t tmpStride,
+        ptrdiff_t srcStride)
+{
+    avg_h264_qpel8or16_hv_lowpass_mmi(dst, tmp, src, dstStride, tmpStride, srcStride, 8);
+}
+
+/* 16x16 avg variant: forwards to the size-generic avg hv lowpass with size = 16. */
+static void avg_h264_qpel16_hv_lowpass_mmi(uint8_t *dst, int16_t *tmp,
+        const uint8_t *src, ptrdiff_t dstStride, ptrdiff_t tmpStride,
+        ptrdiff_t srcStride)
+{
+    avg_h264_qpel8or16_hv_lowpass_mmi(dst, tmp, src, dstStride, tmpStride, srcStride, 16);
+}
+
+static void avg_h264_qpel8_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src,
+        const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride)
+{
+    double ftmp[10];
+    uint64_t tmp[2];
+    uint64_t low32;
+    /* Per 8-pixel row, 8 rows: (20*(s0+s1) - 5*(s-1+s2) + (s-2+s3) + 16) >> 5, packed, pavgb with src2, then pavgb with dst (assumes ff_pw_5/ff_pw_16 hold packed 5s/16s). */
+    __asm__ volatile (
+        "dli        %[tmp1],    0x02                                    \n\t"
+        "ori        %[tmp0],    $0,             0x8                     \n\t" /* tmp0 = 8: row counter */
+        "mtc1       %[tmp1],    %[ftmp7]                                \n\t" /* ftmp7 = 2: psllh count */
+        "dli        %[tmp1],    0x05                                    \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t" /* ftmp0 = 0: second operand of the unpacks */
+        "mtc1       %[tmp1],    %[ftmp8]                                \n\t" /* ftmp8 = 5: psrah count */
+        "1:                                                             \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[src])                            \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[src])                            \n\t"
+        "gsldlc1    %[ftmp2],   0x08(%[src])                            \n\t"
+        "gsldrc1    %[ftmp2],   0x01(%[src])                            \n\t"
+        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp4],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psllh      %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "psllh      %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "gsldlc1    %[ftmp2],   0x06(%[src])                            \n\t"
+        "gsldrc1    %[ftmp2],   -0x01(%[src])                           \n\t"
+        "gsldlc1    %[ftmp5],   0x09(%[src])                            \n\t"
+        "gsldrc1    %[ftmp5],   0x02(%[src])                            \n\t"
+        "punpckhbh  %[ftmp4],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp6],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp6]                \n\t"
+        "pmullh     %[ftmp1],   %[ftmp1],       %[ff_pw_5]              \n\t"
+        "pmullh     %[ftmp3],   %[ftmp3],       %[ff_pw_5]              \n\t"
+        "uld        %[low32],   -0x02(%[src])                           \n\t" /* 8-byte load; only the word moved by mtc1 is unpacked below */
+        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        "uld        %[low32],   0x07(%[src])                            \n\t"
+        "mtc1       %[low32],   %[ftmp6]                                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]             \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]             \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
+        "gsldlc1    %[ftmp5],   0x07(%[src2])                           \n\t"
+        "gsldrc1    %[ftmp5],   0x00(%[src2])                           \n\t"
+        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "ldc1       %[ftmp9],   0x00(%[dst])                            \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp9]                \n\t" /* second average, with the bytes already in dst */
+        PTR_ADDU   "%[src],     %[src],         %[dstStride]            \n\t" /* src steps by dstStride; there is no separate srcStride parameter */
+        "sdc1       %[ftmp1],   0x00(%[dst])                            \n\t"
+        "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[src2Stride]           \n\t"
+        "bgtz       %[tmp0],    1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          [dst]"+&r"(dst),                  [src]"+&r"(src),
+          [src2]"+&r"(src2),
+          [low32]"=&r"(low32)
+        : [dstStride]"r"((mips_reg)dstStride),
+          [src2Stride]"r"((mips_reg)src2Stride),
+          [ff_pw_5]"f"(ff_pw_5),            [ff_pw_16]"f"(ff_pw_16)
+        : "memory"
+    );
+}
+
+/* 16x16 version: runs the 8x8 avg l2 kernel on each quadrant (rows 0-7, then rows 8-15). */
+static void avg_h264_qpel16_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src,
+        const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src2Stride)
+{
+    int band;
+
+    for (band = 0; band < 2; band++) {
+        /* src is offset with dstStride, the same stride the 8-wide kernel applies to it */
+        const uint8_t *s  = src  + band * 8 * dstStride;
+        const uint8_t *s2 = src2 + band * 8 * src2Stride;
+        uint8_t       *d  = dst  + band * 8 * dstStride;
+        avg_h264_qpel8_h_lowpass_l2_mmi(d,     s,     s2,     dstStride, src2Stride);
+        avg_h264_qpel8_h_lowpass_l2_mmi(d + 8, s + 8, s2 + 8, dstStride, src2Stride);
+    }
+}
+
+static void avg_pixels8_l2_shift5_mmi(uint8_t *dst, int16_t *src16,
+        const uint8_t *src8, ptrdiff_t dstStride, ptrdiff_t src8Stride, int b)
+{
+    double ftmp[8];
+    uint64_t tmp0;
+    /* Two 8-pixel rows per pass: src16 values shifted right by 5 and packed, pavgb with src8, then pavgb with the bytes already in dst. */
+    do {
+        __asm__ volatile (
+            "dli        %[tmp0],    0x05                                \n\t"
+            "gsldlc1    %[ftmp0],   0x07(%[src16])                      \n\t"
+            "gsldrc1    %[ftmp0],   0x00(%[src16])                      \n\t"
+            "mtc1       %[tmp0],    %[ftmp6]                            \n\t" /* ftmp6 = 5: psrah count */
+            "gsldlc1    %[ftmp1],   0x0f(%[src16])                      \n\t"
+            "gsldrc1    %[ftmp1],   0x08(%[src16])                      \n\t"
+            "gsldlc1    %[ftmp2],   0x37(%[src16])                      \n\t" /* second row starts 0x30 bytes after the first */
+            "gsldrc1    %[ftmp2],   0x30(%[src16])                      \n\t"
+            "gsldlc1    %[ftmp3],   0x3f(%[src16])                      \n\t"
+            "gsldrc1    %[ftmp3],   0x38(%[src16])                      \n\t"
+            "psrah      %[ftmp0],   %[ftmp0],       %[ftmp6]            \n\t"
+            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
+            "psrah      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"
+            "psrah      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
+            "packushb   %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"
+            "ldc1       %[ftmp4],   0x00(%[src8])                       \n\t"
+            "gsldxc1    %[ftmp5],   0x00(%[src8],   %[src8Stride])      \n\t"
+            "packushb   %[ftmp2],   %[ftmp2],       %[ftmp3]            \n\t"
+            "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]            \n\t"
+            "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp5]            \n\t"
+            "ldc1       %[ftmp7],   0x00(%[dst])                        \n\t" /* current dst row 0, for the second average */
+            "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
+            "sdc1       %[ftmp0],   0x00(%[dst])                        \n\t"
+            "gsldxc1    %[ftmp7],   0x00(%[dst],    %[dstStride])       \n\t" /* current dst row 1 */
+            "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "gssdxc1    %[ftmp2],   0x00(%[dst],    %[dstStride])       \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [tmp0]"=&r"(tmp0)
+            : [src8]"r"(src8),              [src16]"r"(src16),
+              [dst]"r"(dst),
+              [src8Stride]"r"((mips_reg)src8Stride),
+              [dstStride]"r"((mips_reg)dstStride)
+            : "memory"
+        );
+        /* The asm leaves the pointers untouched; step two rows here (48 int16_t = 2 * 0x30 bytes). */
+        src8  += 2 * src8Stride;
+        src16 += 48;
+        dst   += 2 * dstStride;
+    } while (b -= 2); /* b is the remaining row count; ends only when it hits exactly 0, so it must be positive and even */
+}
+
+static void avg_pixels16_l2_shift5_mmi(uint8_t *dst, int16_t *src16,
+        const uint8_t *src8, ptrdiff_t dstStride, ptrdiff_t src8Stride, int b)
+{
+    int x; /* element offset of the 8-wide half being processed: 0, then 8 */
+    for (x = 0; x < 16; x += 8)
+        avg_pixels8_l2_shift5_mmi(dst + x, src16 + x, src8 + x, dstStride, src8Stride, b);
+}
+
+//DEF_H264_MC_MMI(put_, 4)
+/* mc00: no lowpass stage, a direct call into ff_put_pixels4_8_mmi. */
+void ff_put_h264_qpel4_mc00_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    ff_put_pixels4_8_mmi(dst, src, stride, 4);
+}
+
+/* mc10: h_lowpass of src into a 16-byte scratch (stride 4), then l2-combined with src. */
+void ff_put_h264_qpel4_mc10_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t hbuf[4 * 4];
+    put_h264_qpel4_h_lowpass_mmi(hbuf, src, 4, stride);
+    ff_put_pixels4_l2_8_mmi(dst, src, hbuf, stride, stride, 4, 4);
+}
+
+/* mc20: the h_lowpass result is written straight to dst, no scratch buffer. */
+void ff_put_h264_qpel4_mc20_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    put_h264_qpel4_h_lowpass_mmi(dst, src, stride, stride);
+}
+
+/* mc30: like mc10, but the l2 partner is src + 1 instead of src. */
+void ff_put_h264_qpel4_mc30_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t hbuf[4 * 4];
+    put_h264_qpel4_h_lowpass_mmi(hbuf, src, 4, stride);
+    ff_put_pixels4_l2_8_mmi(dst, src + 1, hbuf, stride, stride, 4, 4);
+}
+
+/* mc01: v_lowpass over a local copy taken from two rows above src, l2-combined with the copy at offset 8. */
+void ff_put_h264_qpel4_mc01_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t rows[9 * 4];                  /* filled by copy_block4_mmi; presumably 9 rows of 4 bytes */
+    uint8_t *const centre = rows + 2 * 4; /* 8 bytes in: presumably the row that came from src itself */
+    uint8_t vbuf[4 * 4];
+    copy_block4_mmi(rows, src - 2 * stride, 4, stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(vbuf, centre, 4, 4);
+    ff_put_pixels4_l2_8_mmi(dst, centre, vbuf, stride, 4, 4, 4);
+}
+
+/* mc02: v_lowpass over a local copy taken from two rows above src, written straight to dst. */
+void ff_put_h264_qpel4_mc02_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t rows[9 * 4];                  /* filled by copy_block4_mmi; presumably 9 rows of 4 bytes */
+    uint8_t *const centre = rows + 2 * 4; /* 8 bytes in: presumably the row that came from src itself */
+    copy_block4_mmi(rows, src - 2 * stride, 4, stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(dst, centre, stride, 4);
+}
+
+/* mc03: like mc01, but the l2 partner starts 4 bytes further into the copy (centre + 4). */
+void ff_put_h264_qpel4_mc03_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t rows[9 * 4];                  /* filled by copy_block4_mmi; presumably 9 rows of 4 bytes */
+    uint8_t *const centre = rows + 2 * 4; /* 8 bytes in: presumably the row that came from src itself */
+    uint8_t vbuf[4 * 4];
+    copy_block4_mmi(rows, src - 2 * stride, 4, stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(vbuf, centre, 4, 4);
+    ff_put_pixels4_l2_8_mmi(dst, centre + 4, vbuf, stride, 4, 4, 4);
+}
+
+/* mc11: h_lowpass of src and v_lowpass of the copy from src - 2*stride, l2-combined into dst. */
+void ff_put_h264_qpel4_mc11_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t rows[9 * 4];                  /* filled by copy_block4_mmi; presumably 9 rows of 4 bytes */
+    uint8_t *const centre = rows + 2 * 4; /* 8 bytes into the copy */
+    uint8_t hbuf[4 * 4];
+    uint8_t vbuf[4 * 4];
+    put_h264_qpel4_h_lowpass_mmi(hbuf, src, 4, stride);
+    copy_block4_mmi(rows, src - 2 * stride, 4, stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(vbuf, centre, 4, 4);
+    ff_put_pixels4_l2_8_mmi(dst, hbuf, vbuf, stride, 4, 4, 4);
+}
+
+/* mc31: like mc11, but the copy for the v_lowpass starts one byte to the right (+ 1). */
+void ff_put_h264_qpel4_mc31_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t rows[9 * 4];                  /* filled by copy_block4_mmi; presumably 9 rows of 4 bytes */
+    uint8_t *const centre = rows + 2 * 4; /* 8 bytes into the copy */
+    uint8_t hbuf[4 * 4];
+    uint8_t vbuf[4 * 4];
+    put_h264_qpel4_h_lowpass_mmi(hbuf, src, 4, stride);
+    copy_block4_mmi(rows, src - 2 * stride + 1, 4, stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(vbuf, centre, 4, 4);
+    ff_put_pixels4_l2_8_mmi(dst, hbuf, vbuf, stride, 4, 4, 4);
+}
+
+/* mc13: like mc11, but the h_lowpass reads from one row down (src + stride). */
+void ff_put_h264_qpel4_mc13_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t rows[9 * 4];                  /* filled by copy_block4_mmi; presumably 9 rows of 4 bytes */
+    uint8_t *const centre = rows + 2 * 4; /* 8 bytes into the copy */
+    uint8_t hbuf[4 * 4];
+    uint8_t vbuf[4 * 4];
+    put_h264_qpel4_h_lowpass_mmi(hbuf, src + stride, 4, stride);
+    copy_block4_mmi(rows, src - 2 * stride, 4, stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(vbuf, centre, 4, 4);
+    ff_put_pixels4_l2_8_mmi(dst, hbuf, vbuf, stride, 4, 4, 4);
+}
+
+/* mc33: h_lowpass from src + stride, v_lowpass from the copy shifted by + 1, l2-combined into dst. */
+void ff_put_h264_qpel4_mc33_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t rows[9 * 4];                  /* filled by copy_block4_mmi; presumably 9 rows of 4 bytes */
+    uint8_t *const centre = rows + 2 * 4; /* 8 bytes into the copy */
+    uint8_t hbuf[4 * 4];
+    uint8_t vbuf[4 * 4];
+    put_h264_qpel4_h_lowpass_mmi(hbuf, src + stride, 4, stride);
+    copy_block4_mmi(rows, src - 2 * stride + 1, 4, stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(vbuf, centre, 4, 4);
+    ff_put_pixels4_l2_8_mmi(dst, hbuf, vbuf, stride, 4, 4, 4);
+}
+
+/* mc22: the hv_lowpass result is written straight to dst. */
+void ff_put_h264_qpel4_mc22_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    put_h264_qpel4_hv_lowpass_mmi(dst, src, stride, stride);
+}
+
+/* mc21: h_lowpass and hv_lowpass of src, each into a 16-byte scratch, l2-combined into dst. */
+void ff_put_h264_qpel4_mc21_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t hbuf[4 * 4];
+    uint8_t hvbuf[4 * 4];
+    put_h264_qpel4_h_lowpass_mmi(hbuf, src, 4, stride);
+    put_h264_qpel4_hv_lowpass_mmi(hvbuf, src, 4, stride);
+    ff_put_pixels4_l2_8_mmi(dst, hbuf, hvbuf, stride, 4, 4, 4);
+}
+
+/* mc23: like mc21, but the h_lowpass reads from one row down (src + stride). */
+void ff_put_h264_qpel4_mc23_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t hbuf[4 * 4];
+    uint8_t hvbuf[4 * 4];
+    put_h264_qpel4_h_lowpass_mmi(hbuf, src + stride, 4, stride);
+    put_h264_qpel4_hv_lowpass_mmi(hvbuf, src, 4, stride);
+    ff_put_pixels4_l2_8_mmi(dst, hbuf, hvbuf, stride, 4, 4, 4);
+}
+
+/* mc12: v_lowpass of the copy from src - 2*stride and hv_lowpass of src, l2-combined into dst. */
+void ff_put_h264_qpel4_mc12_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t rows[9 * 4];                  /* filled by copy_block4_mmi; presumably 9 rows of 4 bytes */
+    uint8_t *const centre = rows + 2 * 4; /* 8 bytes into the copy */
+    uint8_t vbuf[4 * 4];
+    uint8_t hvbuf[4 * 4];
+    copy_block4_mmi(rows, src - 2 * stride, 4, stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(vbuf, centre, 4, 4);
+    put_h264_qpel4_hv_lowpass_mmi(hvbuf, src, 4, stride);
+    ff_put_pixels4_l2_8_mmi(dst, vbuf, hvbuf, stride, 4, 4, 4);
+}
+
+/* mc32: l2 of v-lowpass(padded copy taken at src + 1) and hv-lowpass(src). */
+void ff_put_h264_qpel4_mc32_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[9 * 4];
+    uint8_t *const centre = padded + 2 * 4; /* two stride-4 rows in */
+    uint8_t v_filt[4 * 4];
+    uint8_t hv_filt[4 * 4];
+    copy_block4_mmi(padded, src - 2 * stride + 1, 4, stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(v_filt, centre, 4, 4);
+    put_h264_qpel4_hv_lowpass_mmi(hv_filt, src, 4, stride);
+    ff_put_pixels4_l2_8_mmi(dst, v_filt, hv_filt, stride, 4, 4, 4);
+}
+
+//DEF_H264_MC_MMI(avg_, 4)
+/* mc00: no lowpass call; src goes straight to ff_avg_pixels4_8_mmi. */
+void ff_avg_h264_qpel4_mc00_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    ff_avg_pixels4_8_mmi(dst, src, stride, 4);
+}
+
+/* mc10: h-lowpass of src into a stride-4 scratch, then l2 of src and the scratch. */
+void ff_avg_h264_qpel4_mc10_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t h_filt[4 * 4];
+    put_h264_qpel4_h_lowpass_mmi(h_filt, src, 4, stride);
+    ff_avg_pixels4_l2_8_mmi(dst, src, h_filt, stride, stride, 4, 4);
+}
+
+/* mc20: avg h-lowpass helper run from src directly into dst. */
+void ff_avg_h264_qpel4_mc20_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avg_h264_qpel4_h_lowpass_mmi(dst, src, stride, stride);
+}
+
+/* mc30: h-lowpass of src into a stride-4 scratch, then l2 of src + 1 and the scratch. */
+void ff_avg_h264_qpel4_mc30_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t h_filt[4 * 4];
+    put_h264_qpel4_h_lowpass_mmi(h_filt, src, 4, stride);
+    ff_avg_pixels4_l2_8_mmi(dst, src + 1, h_filt, stride, stride, 4, 4);
+}
+
+/* mc01: v-lowpass of a padded copy of src, then l2 of the copy's centre and the result. */
+void ff_avg_h264_qpel4_mc01_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[9 * 4];
+    uint8_t *const centre = padded + 2 * 4; /* two stride-4 rows in */
+    uint8_t v_filt[4 * 4];
+    copy_block4_mmi(padded, src - 2 * stride, 4, stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(v_filt, centre, 4, 4);
+    ff_avg_pixels4_l2_8_mmi(dst, centre, v_filt, stride, 4, 4, 4);
+}
+
+/* mc02: avg v-lowpass of a padded copy of src, written directly into dst. */
+void ff_avg_h264_qpel4_mc02_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[9 * 4];
+    uint8_t *const centre = padded + 2 * 4; /* two stride-4 rows in */
+    copy_block4_mmi(padded, src - 2 * stride, 4, stride, 9);
+    avg_h264_qpel4_v_lowpass_mmi(dst, centre, stride, 4);
+}
+
+/* mc03: v-lowpass of a padded copy of src, then l2 of centre + 4 (next row) and the result. */
+void ff_avg_h264_qpel4_mc03_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[9 * 4];
+    uint8_t *const centre = padded + 2 * 4; /* two stride-4 rows in */
+    uint8_t v_filt[4 * 4];
+    copy_block4_mmi(padded, src - 2 * stride, 4, stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(v_filt, centre, 4, 4);
+    ff_avg_pixels4_l2_8_mmi(dst, centre + 4, v_filt, stride, 4, 4, 4);
+}
+
+/* mc11: l2 of h-lowpass(src) and v-lowpass(padded copy taken at src). */
+void ff_avg_h264_qpel4_mc11_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[9 * 4];
+    uint8_t *const centre = padded + 2 * 4; /* two stride-4 rows in */
+    uint8_t h_filt[4 * 4];
+    uint8_t v_filt[4 * 4];
+    put_h264_qpel4_h_lowpass_mmi(h_filt, src, 4, stride);
+    copy_block4_mmi(padded, src - 2 * stride, 4, stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(v_filt, centre, 4, 4);
+    ff_avg_pixels4_l2_8_mmi(dst, h_filt, v_filt, stride, 4, 4, 4);
+}
+
+/* mc31: l2 of h-lowpass(src) and v-lowpass(padded copy taken at src + 1). */
+void ff_avg_h264_qpel4_mc31_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[9 * 4];
+    uint8_t *const centre = padded + 2 * 4; /* two stride-4 rows in */
+    uint8_t h_filt[4 * 4];
+    uint8_t v_filt[4 * 4];
+    put_h264_qpel4_h_lowpass_mmi(h_filt, src, 4, stride);
+    copy_block4_mmi(padded, src - 2 * stride + 1, 4, stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(v_filt, centre, 4, 4);
+    ff_avg_pixels4_l2_8_mmi(dst, h_filt, v_filt, stride, 4, 4, 4);
+}
+
+/* mc13: l2 of h-lowpass(src + stride) and v-lowpass(padded copy taken at src). */
+void ff_avg_h264_qpel4_mc13_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[9 * 4];
+    uint8_t *const centre = padded + 2 * 4; /* two stride-4 rows in */
+    uint8_t h_filt[4 * 4];
+    uint8_t v_filt[4 * 4];
+    put_h264_qpel4_h_lowpass_mmi(h_filt, src + stride, 4, stride);
+    copy_block4_mmi(padded, src - 2 * stride, 4, stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(v_filt, centre, 4, 4);
+    ff_avg_pixels4_l2_8_mmi(dst, h_filt, v_filt, stride, 4, 4, 4);
+}
+
+/* mc33: l2 of h-lowpass(src + stride) and v-lowpass(padded copy taken at src + 1). */
+void ff_avg_h264_qpel4_mc33_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[9 * 4];
+    uint8_t *const centre = padded + 2 * 4; /* two stride-4 rows in */
+    uint8_t h_filt[4 * 4];
+    uint8_t v_filt[4 * 4];
+    put_h264_qpel4_h_lowpass_mmi(h_filt, src + stride, 4, stride);
+    copy_block4_mmi(padded, src - 2 * stride + 1, 4, stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(v_filt, centre, 4, 4);
+    ff_avg_pixels4_l2_8_mmi(dst, h_filt, v_filt, stride, 4, 4, 4);
+}
+
+/* mc22: avg hv-lowpass helper run from src directly into dst. */
+void ff_avg_h264_qpel4_mc22_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avg_h264_qpel4_hv_lowpass_mmi(dst, src, stride, stride);
+}
+
+/* mc21: l2 of h-lowpass(src) and hv-lowpass(src), both held in stride-4 scratch. */
+void ff_avg_h264_qpel4_mc21_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t h_filt[4 * 4];
+    uint8_t hv_filt[4 * 4];
+    put_h264_qpel4_h_lowpass_mmi(h_filt, src, 4, stride);
+    put_h264_qpel4_hv_lowpass_mmi(hv_filt, src, 4, stride);
+    ff_avg_pixels4_l2_8_mmi(dst, h_filt, hv_filt, stride, 4, 4, 4);
+}
+
+/* mc23: l2 of h-lowpass(src + stride) and hv-lowpass(src), both in stride-4 scratch. */
+void ff_avg_h264_qpel4_mc23_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t h_filt[4 * 4];
+    uint8_t hv_filt[4 * 4];
+    put_h264_qpel4_h_lowpass_mmi(h_filt, src + stride, 4, stride);
+    put_h264_qpel4_hv_lowpass_mmi(hv_filt, src, 4, stride);
+    ff_avg_pixels4_l2_8_mmi(dst, h_filt, hv_filt, stride, 4, 4, 4);
+}
+
+/* mc12: l2 of v-lowpass(padded copy taken at src) and hv-lowpass(src). */
+void ff_avg_h264_qpel4_mc12_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[9 * 4];
+    uint8_t *const centre = padded + 2 * 4; /* two stride-4 rows in */
+    uint8_t v_filt[4 * 4];
+    uint8_t hv_filt[4 * 4];
+    copy_block4_mmi(padded, src - 2 * stride, 4, stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(v_filt, centre, 4, 4);
+    put_h264_qpel4_hv_lowpass_mmi(hv_filt, src, 4, stride);
+    ff_avg_pixels4_l2_8_mmi(dst, v_filt, hv_filt, stride, 4, 4, 4);
+}
+
+/* mc32: l2 of v-lowpass(padded copy taken at src + 1) and hv-lowpass(src). */
+void ff_avg_h264_qpel4_mc32_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[9 * 4];
+    uint8_t *const centre = padded + 2 * 4; /* two stride-4 rows in */
+    uint8_t v_filt[4 * 4];
+    uint8_t hv_filt[4 * 4];
+    copy_block4_mmi(padded, src - 2 * stride + 1, 4, stride, 9);
+    put_h264_qpel4_v_lowpass_mmi(v_filt, centre, 4, 4);
+    put_h264_qpel4_hv_lowpass_mmi(hv_filt, src, 4, stride);
+    ff_avg_pixels4_l2_8_mmi(dst, v_filt, hv_filt, stride, 4, 4, 4);
+}
+
+//DEF_H264_MC_MMI(put_, 8)
+/* mc00: no lowpass call; src goes straight to ff_put_pixels8_8_mmi. */
+void ff_put_h264_qpel8_mc00_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    ff_put_pixels8_8_mmi(dst, src, stride, 8);
+}
+
+/* mc10: h-lowpass of src into a stride-8 scratch, then l2 of src and the scratch. */
+void ff_put_h264_qpel8_mc10_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t h_filt[8 * 8];
+    put_h264_qpel8_h_lowpass_mmi(h_filt, src, 8, stride);
+    ff_put_pixels8_l2_8_mmi(dst, src, h_filt, stride, stride, 8, 8);
+}
+
+/* mc20: h-lowpass helper run from src directly into dst. */
+void ff_put_h264_qpel8_mc20_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    put_h264_qpel8_h_lowpass_mmi(dst, src, stride, stride);
+}
+
+/* mc30: h-lowpass of src into a stride-8 scratch, then l2 of src + 1 and the scratch. */
+void ff_put_h264_qpel8_mc30_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t h_filt[8 * 8];
+    put_h264_qpel8_h_lowpass_mmi(h_filt, src, 8, stride);
+    ff_put_pixels8_l2_8_mmi(dst, src + 1, h_filt, stride, stride, 8, 8);
+}
+
+/* mc01: v-lowpass of a padded copy of src, then l2 of the copy's centre and the result. */
+void ff_put_h264_qpel8_mc01_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[13 * 8];
+    uint8_t *const centre = padded + 2 * 8; /* two stride-8 rows in */
+    uint8_t v_filt[8 * 8];
+    copy_block8_mmi(padded, src - 2 * stride, 8, stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(v_filt, centre, 8, 8);
+    ff_put_pixels8_l2_8_mmi(dst, centre, v_filt, stride, 8, 8, 8);
+}
+
+/* mc02: v-lowpass of a padded copy of src, written directly into dst. */
+void ff_put_h264_qpel8_mc02_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[13 * 8];
+    uint8_t *const centre = padded + 2 * 8; /* two stride-8 rows in */
+    copy_block8_mmi(padded, src - 2 * stride, 8, stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(dst, centre, stride, 8);
+}
+
+/* mc03: v-lowpass of a padded copy of src, then l2 of centre + 8 (next row) and the result. */
+void ff_put_h264_qpel8_mc03_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[13 * 8];
+    uint8_t *const centre = padded + 2 * 8; /* two stride-8 rows in */
+    uint8_t v_filt[8 * 8];
+    copy_block8_mmi(padded, src - 2 * stride, 8, stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(v_filt, centre, 8, 8);
+    ff_put_pixels8_l2_8_mmi(dst, centre + 8, v_filt, stride, 8, 8, 8);
+}
+
+/* mc11: l2 of h-lowpass(src) and v-lowpass(padded copy taken at src). */
+void ff_put_h264_qpel8_mc11_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[13 * 8];
+    uint8_t *const centre = padded + 2 * 8; /* two stride-8 rows in */
+    uint8_t h_filt[8 * 8];
+    uint8_t v_filt[8 * 8];
+    put_h264_qpel8_h_lowpass_mmi(h_filt, src, 8, stride);
+    copy_block8_mmi(padded, src - 2 * stride, 8, stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(v_filt, centre, 8, 8);
+    ff_put_pixels8_l2_8_mmi(dst, h_filt, v_filt, stride, 8, 8, 8);
+}
+
+/* mc31: l2 of h-lowpass(src) and v-lowpass(padded copy taken at src + 1). */
+void ff_put_h264_qpel8_mc31_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[13 * 8];
+    uint8_t *const centre = padded + 2 * 8; /* two stride-8 rows in */
+    uint8_t h_filt[8 * 8];
+    uint8_t v_filt[8 * 8];
+    put_h264_qpel8_h_lowpass_mmi(h_filt, src, 8, stride);
+    copy_block8_mmi(padded, src - 2 * stride + 1, 8, stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(v_filt, centre, 8, 8);
+    ff_put_pixels8_l2_8_mmi(dst, h_filt, v_filt, stride, 8, 8, 8);
+}
+
+/* mc13: l2 of h-lowpass(src + stride) and v-lowpass(padded copy taken at src). */
+void ff_put_h264_qpel8_mc13_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[13 * 8];
+    uint8_t *const centre = padded + 2 * 8; /* two stride-8 rows in */
+    uint8_t h_filt[8 * 8];
+    uint8_t v_filt[8 * 8];
+    put_h264_qpel8_h_lowpass_mmi(h_filt, src + stride, 8, stride);
+    copy_block8_mmi(padded, src - 2 * stride, 8, stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(v_filt, centre, 8, 8);
+    ff_put_pixels8_l2_8_mmi(dst, h_filt, v_filt, stride, 8, 8, 8);
+}
+
+/* mc33: l2 of h-lowpass(src + stride) and v-lowpass(padded copy taken at src + 1). */
+void ff_put_h264_qpel8_mc33_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[13 * 8];
+    uint8_t *const centre = padded + 2 * 8; /* two stride-8 rows in */
+    uint8_t h_filt[8 * 8];
+    uint8_t v_filt[8 * 8];
+    put_h264_qpel8_h_lowpass_mmi(h_filt, src + stride, 8, stride);
+    copy_block8_mmi(padded, src - 2 * stride + 1, 8, stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(v_filt, centre, 8, 8);
+    ff_put_pixels8_l2_8_mmi(dst, h_filt, v_filt, stride, 8, 8, 8);
+}
+
+/* mc22: hv-lowpass from src into dst, given an 8-byte-aligned 16-bit work buffer. */
+void ff_put_h264_qpel8_mc22_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint16_t __attribute__ ((aligned(8))) scratch[192];
+
+    put_h264_qpel8_hv_lowpass_mmi(dst, scratch, src, stride, 8, stride);
+}
+
+/* mc21: hv-lowpass into scratch, then h_lowpass_l2 of src against that hv result. */
+void ff_put_h264_qpel8_mc21_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) scratch[64 + 384];
+    uint8_t *const hv_filt = scratch;                  /* bytes 0..63 */
+    int16_t *const v_tmp = (int16_t *) (scratch + 64); /* bytes 64..447 */
+
+    put_h264_qpel8_hv_lowpass_mmi(hv_filt, v_tmp, src, 8, 8, stride);
+    put_h264_qpel8_h_lowpass_l2_mmi(dst, src, hv_filt, stride, 8);
+}
+
+/* mc23: hv-lowpass into scratch, then h_lowpass_l2 of src + stride against that hv result. */
+void ff_put_h264_qpel8_mc23_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) scratch[64 + 384];
+    uint8_t *const hv_filt = scratch;                  /* bytes 0..63 */
+    int16_t *const v_tmp = (int16_t *) (scratch + 64); /* bytes 64..447 */
+
+    put_h264_qpel8_hv_lowpass_mmi(hv_filt, v_tmp, src, 8, 8, stride);
+    put_h264_qpel8_h_lowpass_l2_mmi(dst, src + stride, hv_filt, stride, 8);
+}
+
+/* mc12: hv-lowpass into scratch, then l2_shift5 of the 16-bit data at v_tmp + 2 and the hv result. */
+void ff_put_h264_qpel8_mc12_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) scratch[64 + 384];
+    uint8_t *const hv_filt = scratch;                  /* bytes 0..63 */
+    int16_t *const v_tmp = (int16_t *) (scratch + 64); /* bytes 64..447 */
+
+    put_h264_qpel8_hv_lowpass_mmi(hv_filt, v_tmp, src, 8, 8, stride);
+    put_pixels8_l2_shift5_mmi(dst, v_tmp + 2, hv_filt, stride, 8, 8);
+}
+
+/* mc32: hv-lowpass into scratch, then l2_shift5 of the 16-bit data at v_tmp + 3 and the hv result. */
+void ff_put_h264_qpel8_mc32_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) scratch[64 + 384];
+    uint8_t *const hv_filt = scratch;                  /* bytes 0..63 */
+    int16_t *const v_tmp = (int16_t *) (scratch + 64); /* bytes 64..447 */
+
+    put_h264_qpel8_hv_lowpass_mmi(hv_filt, v_tmp, src, 8, 8, stride);
+    put_pixels8_l2_shift5_mmi(dst, v_tmp + 3, hv_filt, stride, 8, 8);
+}
+
+//DEF_H264_MC_MMI(avg_, 8)
+/* mc00: no lowpass call; src goes straight to ff_avg_pixels8_8_mmi. */
+void ff_avg_h264_qpel8_mc00_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    ff_avg_pixels8_8_mmi(dst, src, stride, 8);
+}
+
+/* mc10: h-lowpass of src into a stride-8 scratch, then l2 of src and the scratch. */
+void ff_avg_h264_qpel8_mc10_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t h_filt[8 * 8];
+    put_h264_qpel8_h_lowpass_mmi(h_filt, src, 8, stride);
+    ff_avg_pixels8_l2_8_mmi(dst, src, h_filt, stride, stride, 8, 8);
+}
+
+/* mc20: avg h-lowpass helper run from src directly into dst. */
+void ff_avg_h264_qpel8_mc20_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avg_h264_qpel8_h_lowpass_mmi(dst, src, stride, stride);
+}
+
+/* mc30: h-lowpass of src into a stride-8 scratch, then l2 of src + 1 and the scratch. */
+void ff_avg_h264_qpel8_mc30_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t h_filt[8 * 8];
+    put_h264_qpel8_h_lowpass_mmi(h_filt, src, 8, stride);
+    ff_avg_pixels8_l2_8_mmi(dst, src + 1, h_filt, stride, stride, 8, 8);
+}
+
+/* mc01: v-lowpass of a padded copy of src, then l2 of the copy's centre and the result. */
+void ff_avg_h264_qpel8_mc01_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[13 * 8];
+    uint8_t *const centre = padded + 2 * 8; /* two stride-8 rows in */
+    uint8_t v_filt[8 * 8];
+    copy_block8_mmi(padded, src - 2 * stride, 8, stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(v_filt, centre, 8, 8);
+    ff_avg_pixels8_l2_8_mmi(dst, centre, v_filt, stride, 8, 8, 8);
+}
+
+/* mc02: avg v-lowpass of a padded copy of src, written directly into dst. */
+void ff_avg_h264_qpel8_mc02_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[13 * 8];
+    uint8_t *const centre = padded + 2 * 8; /* two stride-8 rows in */
+    copy_block8_mmi(padded, src - 2 * stride, 8, stride, 13);
+    avg_h264_qpel8_v_lowpass_mmi(dst, centre, stride, 8);
+}
+
+/* mc03: v-lowpass of a padded copy of src, then l2 of centre + 8 (next row) and the result. */
+void ff_avg_h264_qpel8_mc03_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[13 * 8];
+    uint8_t *const centre = padded + 2 * 8; /* two stride-8 rows in */
+    uint8_t v_filt[8 * 8];
+    copy_block8_mmi(padded, src - 2 * stride, 8, stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(v_filt, centre, 8, 8);
+    ff_avg_pixels8_l2_8_mmi(dst, centre + 8, v_filt, stride, 8, 8, 8);
+}
+
+/* mc11: l2 of h-lowpass(src) and v-lowpass(padded copy taken at src). */
+void ff_avg_h264_qpel8_mc11_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[13 * 8];
+    uint8_t *const centre = padded + 2 * 8; /* two stride-8 rows in */
+    uint8_t h_filt[8 * 8];
+    uint8_t v_filt[8 * 8];
+    put_h264_qpel8_h_lowpass_mmi(h_filt, src, 8, stride);
+    copy_block8_mmi(padded, src - 2 * stride, 8, stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(v_filt, centre, 8, 8);
+    ff_avg_pixels8_l2_8_mmi(dst, h_filt, v_filt, stride, 8, 8, 8);
+}
+
+/* mc31: l2 of h-lowpass(src) and v-lowpass(padded copy taken at src + 1). */
+void ff_avg_h264_qpel8_mc31_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[13 * 8];
+    uint8_t *const centre = padded + 2 * 8; /* two stride-8 rows in */
+    uint8_t h_filt[8 * 8];
+    uint8_t v_filt[8 * 8];
+    put_h264_qpel8_h_lowpass_mmi(h_filt, src, 8, stride);
+    copy_block8_mmi(padded, src - 2 * stride + 1, 8, stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(v_filt, centre, 8, 8);
+    ff_avg_pixels8_l2_8_mmi(dst, h_filt, v_filt, stride, 8, 8, 8);
+}
+
+/* mc13: l2 of h-lowpass(src + stride) and v-lowpass(padded copy taken at src). */
+void ff_avg_h264_qpel8_mc13_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[13 * 8];
+    uint8_t *const centre = padded + 2 * 8; /* two stride-8 rows in */
+    uint8_t h_filt[8 * 8];
+    uint8_t v_filt[8 * 8];
+    put_h264_qpel8_h_lowpass_mmi(h_filt, src + stride, 8, stride);
+    copy_block8_mmi(padded, src - 2 * stride, 8, stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(v_filt, centre, 8, 8);
+    ff_avg_pixels8_l2_8_mmi(dst, h_filt, v_filt, stride, 8, 8, 8);
+}
+
+/* mc33: l2 of h-lowpass(src + stride) and v-lowpass(padded copy taken at src + 1). */
+void ff_avg_h264_qpel8_mc33_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[13 * 8];
+    uint8_t *const centre = padded + 2 * 8; /* two stride-8 rows in */
+    uint8_t h_filt[8 * 8];
+    uint8_t v_filt[8 * 8];
+    put_h264_qpel8_h_lowpass_mmi(h_filt, src + stride, 8, stride);
+    copy_block8_mmi(padded, src - 2 * stride + 1, 8, stride, 13);
+    put_h264_qpel8_v_lowpass_mmi(v_filt, centre, 8, 8);
+    ff_avg_pixels8_l2_8_mmi(dst, h_filt, v_filt, stride, 8, 8, 8);
+}
+
+/* mc22: avg hv-lowpass from src into dst, given an 8-byte-aligned 16-bit work buffer. */
+void ff_avg_h264_qpel8_mc22_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint16_t __attribute__ ((aligned(8))) scratch[192];
+
+    avg_h264_qpel8_hv_lowpass_mmi(dst, scratch, src, stride, 8, stride);
+}
+
+/* mc21: hv-lowpass into scratch, then avg h_lowpass_l2 of src against that hv result. */
+void ff_avg_h264_qpel8_mc21_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) scratch[64 + 384];
+    uint8_t *const hv_filt = scratch;                  /* bytes 0..63 */
+    int16_t *const v_tmp = (int16_t *) (scratch + 64); /* bytes 64..447 */
+
+    put_h264_qpel8_hv_lowpass_mmi(hv_filt, v_tmp, src, 8, 8, stride);
+    avg_h264_qpel8_h_lowpass_l2_mmi(dst, src, hv_filt, stride, 8);
+}
+
+/* mc23: hv-lowpass into scratch, then avg h_lowpass_l2 of src + stride against that hv result. */
+void ff_avg_h264_qpel8_mc23_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) scratch[64 + 384];
+    uint8_t *const hv_filt = scratch;                  /* bytes 0..63 */
+    int16_t *const v_tmp = (int16_t *) (scratch + 64); /* bytes 64..447 */
+
+    put_h264_qpel8_hv_lowpass_mmi(hv_filt, v_tmp, src, 8, 8, stride);
+    avg_h264_qpel8_h_lowpass_l2_mmi(dst, src + stride, hv_filt, stride, 8);
+}
+
+/* mc12: hv-lowpass into scratch, then avg l2_shift5 of the 16-bit data at v_tmp + 2 and the hv result. */
+void ff_avg_h264_qpel8_mc12_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) scratch[64 + 384];
+    uint8_t *const hv_filt = scratch;                  /* bytes 0..63 */
+    int16_t *const v_tmp = (int16_t *) (scratch + 64); /* bytes 64..447 */
+
+    put_h264_qpel8_hv_lowpass_mmi(hv_filt, v_tmp, src, 8, 8, stride);
+    avg_pixels8_l2_shift5_mmi(dst, v_tmp + 2, hv_filt, stride, 8, 8);
+}
+
+/* mc32: hv-lowpass into scratch, then avg l2_shift5 of the 16-bit data at v_tmp + 3 and the hv result. */
+void ff_avg_h264_qpel8_mc32_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) scratch[64 + 384];
+    uint8_t *const hv_filt = scratch;                  /* bytes 0..63 */
+    int16_t *const v_tmp = (int16_t *) (scratch + 64); /* bytes 64..447 */
+
+    put_h264_qpel8_hv_lowpass_mmi(hv_filt, v_tmp, src, 8, 8, stride);
+    avg_pixels8_l2_shift5_mmi(dst, v_tmp + 3, hv_filt, stride, 8, 8);
+}
+
+//DEF_H264_MC_MMI(put_, 16)
+/* mc00: no lowpass call; src goes straight to ff_put_pixels16_8_mmi. */
+void ff_put_h264_qpel16_mc00_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    ff_put_pixels16_8_mmi(dst, src, stride, 16);
+}
+
+/* mc10: h-lowpass of src into a stride-16 scratch, then l2 of src and the scratch. */
+void ff_put_h264_qpel16_mc10_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t h_filt[16 * 16];
+    put_h264_qpel16_h_lowpass_mmi(h_filt, src, 16, stride);
+    ff_put_pixels16_l2_8_mmi(dst, src, h_filt, stride, stride, 16, 16);
+}
+
+/* mc20: h-lowpass helper run from src directly into dst. */
+void ff_put_h264_qpel16_mc20_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    put_h264_qpel16_h_lowpass_mmi(dst, src, stride, stride);
+}
+
+/* mc30: h-lowpass of src into a stride-16 scratch, then l2 of src + 1 and the scratch. */
+void ff_put_h264_qpel16_mc30_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t h_filt[16 * 16];
+    put_h264_qpel16_h_lowpass_mmi(h_filt, src, 16, stride);
+    ff_put_pixels16_l2_8_mmi(dst, src + 1, h_filt, stride, stride, 16, 16);
+}
+
+/* mc01: v-lowpass of a padded copy of src, then l2 of the copy's centre and the result. */
+void ff_put_h264_qpel16_mc01_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[21 * 16];
+    uint8_t *const centre = padded + 2 * 16; /* two stride-16 rows in */
+    uint8_t v_filt[16 * 16];
+    copy_block16_mmi(padded, src - 2 * stride, 16, stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(v_filt, centre, 16, 16);
+    ff_put_pixels16_l2_8_mmi(dst, centre, v_filt, stride, 16, 16, 16);
+}
+
+/* mc02: v-lowpass of a padded copy of src, written directly into dst. */
+void ff_put_h264_qpel16_mc02_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[21 * 16];
+    uint8_t *const centre = padded + 2 * 16; /* two stride-16 rows in */
+    copy_block16_mmi(padded, src - 2 * stride, 16, stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(dst, centre, stride, 16);
+}
+
+/* mc03: v-lowpass of a padded copy of src, then l2 of centre + 16 (next row) and the result. */
+void ff_put_h264_qpel16_mc03_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[21 * 16];
+    uint8_t *const centre = padded + 2 * 16; /* two stride-16 rows in */
+    uint8_t v_filt[16 * 16];
+    copy_block16_mmi(padded, src - 2 * stride, 16, stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(v_filt, centre, 16, 16);
+    ff_put_pixels16_l2_8_mmi(dst, centre + 16, v_filt, stride, 16, 16, 16);
+}
+
+/* mc11: l2 of h-lowpass(src) and v-lowpass(padded copy taken at src). */
+void ff_put_h264_qpel16_mc11_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[21 * 16];
+    uint8_t *const centre = padded + 2 * 16; /* two stride-16 rows in */
+    uint8_t h_filt[16 * 16];
+    uint8_t v_filt[16 * 16];
+    put_h264_qpel16_h_lowpass_mmi(h_filt, src, 16, stride);
+    copy_block16_mmi(padded, src - 2 * stride, 16, stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(v_filt, centre, 16, 16);
+    ff_put_pixels16_l2_8_mmi(dst, h_filt, v_filt, stride, 16, 16, 16);
+}
+
+/* mc31: l2 of h-lowpass(src) and v-lowpass(padded copy taken at src + 1). */
+void ff_put_h264_qpel16_mc31_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[21 * 16];
+    uint8_t *const centre = padded + 2 * 16; /* two stride-16 rows in */
+    uint8_t h_filt[16 * 16];
+    uint8_t v_filt[16 * 16];
+    put_h264_qpel16_h_lowpass_mmi(h_filt, src, 16, stride);
+    copy_block16_mmi(padded, src - 2 * stride + 1, 16, stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(v_filt, centre, 16, 16);
+    ff_put_pixels16_l2_8_mmi(dst, h_filt, v_filt, stride, 16, 16, 16);
+}
+
+/* mc13: l2 of h-lowpass(src + stride) and v-lowpass(padded copy taken at src). */
+void ff_put_h264_qpel16_mc13_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[21 * 16];
+    uint8_t *const centre = padded + 2 * 16; /* two stride-16 rows in */
+    uint8_t h_filt[16 * 16];
+    uint8_t v_filt[16 * 16];
+    put_h264_qpel16_h_lowpass_mmi(h_filt, src + stride, 16, stride);
+    copy_block16_mmi(padded, src - 2 * stride, 16, stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(v_filt, centre, 16, 16);
+    ff_put_pixels16_l2_8_mmi(dst, h_filt, v_filt, stride, 16, 16, 16);
+}
+
+/* mc33: l2 of h-lowpass(src + stride) and v-lowpass(padded copy taken at src + 1). */
+void ff_put_h264_qpel16_mc33_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[21 * 16];
+    uint8_t *const centre = padded + 2 * 16; /* two stride-16 rows in */
+    uint8_t h_filt[16 * 16];
+    uint8_t v_filt[16 * 16];
+    put_h264_qpel16_h_lowpass_mmi(h_filt, src + stride, 16, stride);
+    copy_block16_mmi(padded, src - 2 * stride + 1, 16, stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(v_filt, centre, 16, 16);
+    ff_put_pixels16_l2_8_mmi(dst, h_filt, v_filt, stride, 16, 16, 16);
+}
+
+/* mc22: hv-lowpass from src into dst, given an 8-byte-aligned 16-bit work buffer. */
+void ff_put_h264_qpel16_mc22_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint16_t __attribute__ ((aligned(8))) scratch[384];
+
+    put_h264_qpel16_hv_lowpass_mmi(dst, scratch, src, stride, 16, stride);
+}
+
+/* mc21: hv-lowpass into scratch, then h_lowpass_l2 of src against that hv result. */
+void ff_put_h264_qpel16_mc21_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) scratch[256 + 768];
+    uint8_t *const hv_filt = scratch;                   /* bytes 0..255 */
+    int16_t *const v_tmp = (int16_t *) (scratch + 256); /* bytes 256..1023 */
+
+    put_h264_qpel16_hv_lowpass_mmi(hv_filt, v_tmp, src, 16, 16, stride);
+    put_h264_qpel16_h_lowpass_l2_mmi(dst, src, hv_filt, stride, 16);
+}
+
+/* mc23: hv-lowpass into scratch, then h_lowpass_l2 of src + stride against that hv result. */
+void ff_put_h264_qpel16_mc23_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) scratch[256 + 768];
+    uint8_t *const hv_filt = scratch;                   /* bytes 0..255 */
+    int16_t *const v_tmp = (int16_t *) (scratch + 256); /* bytes 256..1023 */
+
+    put_h264_qpel16_hv_lowpass_mmi(hv_filt, v_tmp, src, 16, 16, stride);
+    put_h264_qpel16_h_lowpass_l2_mmi(dst, src + stride, hv_filt, stride, 16);
+}
+
+/* mc12: hv-lowpass into scratch, then l2_shift5 of the 16-bit data at v_tmp + 2 and the hv result. */
+void ff_put_h264_qpel16_mc12_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) scratch[256 + 768];
+    uint8_t *const hv_filt = scratch;                   /* bytes 0..255 */
+    int16_t *const v_tmp = (int16_t *) (scratch + 256); /* bytes 256..1023 */
+
+    put_h264_qpel16_hv_lowpass_mmi(hv_filt, v_tmp, src, 16, 16, stride);
+    put_pixels16_l2_shift5_mmi(dst, v_tmp + 2, hv_filt, stride, 16, 16);
+}
+
+/* mc32: hv-lowpass into scratch, then l2_shift5 of the 16-bit data at v_tmp + 3 and the hv result. */
+void ff_put_h264_qpel16_mc32_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) scratch[256 + 768];
+    uint8_t *const hv_filt = scratch;                   /* bytes 0..255 */
+    int16_t *const v_tmp = (int16_t *) (scratch + 256); /* bytes 256..1023 */
+
+    put_h264_qpel16_hv_lowpass_mmi(hv_filt, v_tmp, src, 16, 16, stride);
+    put_pixels16_l2_shift5_mmi(dst, v_tmp + 3, hv_filt, stride, 16, 16);
+}
+
+//DEF_H264_MC_MMI(avg_, 16)
+/* mc00: no lowpass call; src goes straight to ff_avg_pixels16_8_mmi. */
+void ff_avg_h264_qpel16_mc00_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    ff_avg_pixels16_8_mmi(dst, src, stride, 16);
+}
+
+/* mc10: h-lowpass of src into a stride-16 scratch, then l2 of src and the scratch. */
+void ff_avg_h264_qpel16_mc10_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t h_filt[16 * 16];
+    put_h264_qpel16_h_lowpass_mmi(h_filt, src, 16, stride);
+    ff_avg_pixels16_l2_8_mmi(dst, src, h_filt, stride, stride, 16, 16);
+}
+
+/* mc20: avg h-lowpass helper run from src directly into dst. */
+void ff_avg_h264_qpel16_mc20_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avg_h264_qpel16_h_lowpass_mmi(dst, src, stride, stride);
+}
+
+/* mc30: h-lowpass of src into a stride-16 scratch, then l2 of src + 1 and the scratch. */
+void ff_avg_h264_qpel16_mc30_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t h_filt[16 * 16];
+    put_h264_qpel16_h_lowpass_mmi(h_filt, src, 16, stride);
+    ff_avg_pixels16_l2_8_mmi(dst, src + 1, h_filt, stride, stride, 16, 16);
+}
+
+/* mc01: v-lowpass of a padded copy of src, then l2 of the copy's centre and the result. */
+void ff_avg_h264_qpel16_mc01_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[21 * 16];
+    uint8_t *const centre = padded + 2 * 16; /* two stride-16 rows in */
+    uint8_t v_filt[16 * 16];
+    copy_block16_mmi(padded, src - 2 * stride, 16, stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(v_filt, centre, 16, 16);
+    ff_avg_pixels16_l2_8_mmi(dst, centre, v_filt, stride, 16, 16, 16);
+}
+
+/* mc02: avg v-lowpass of a padded copy of src, written directly into dst. */
+void ff_avg_h264_qpel16_mc02_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[21 * 16];
+    uint8_t *const centre = padded + 2 * 16; /* two stride-16 rows in */
+    copy_block16_mmi(padded, src - 2 * stride, 16, stride, 21);
+    avg_h264_qpel16_v_lowpass_mmi(dst, centre, stride, 16);
+}
+
+/* mc03: v-lowpass of a padded copy of src, then l2 of centre + 16 (next row) and the result. */
+void ff_avg_h264_qpel16_mc03_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[21 * 16];
+    uint8_t *const centre = padded + 2 * 16; /* two stride-16 rows in */
+    uint8_t v_filt[16 * 16];
+    copy_block16_mmi(padded, src - 2 * stride, 16, stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(v_filt, centre, 16, 16);
+    ff_avg_pixels16_l2_8_mmi(dst, centre + 16, v_filt, stride, 16, 16, 16);
+}
+
+/* mc11: l2 of h-lowpass(src) and v-lowpass(padded copy taken at src). */
+void ff_avg_h264_qpel16_mc11_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[21 * 16];
+    uint8_t *const centre = padded + 2 * 16; /* two stride-16 rows in */
+    uint8_t h_filt[16 * 16];
+    uint8_t v_filt[16 * 16];
+    put_h264_qpel16_h_lowpass_mmi(h_filt, src, 16, stride);
+    copy_block16_mmi(padded, src - 2 * stride, 16, stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(v_filt, centre, 16, 16);
+    ff_avg_pixels16_l2_8_mmi(dst, h_filt, v_filt, stride, 16, 16, 16);
+}
+
+/* mc31: l2 of h-lowpass(src) and v-lowpass(padded copy taken at src + 1). */
+void ff_avg_h264_qpel16_mc31_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[21 * 16];
+    uint8_t *const centre = padded + 2 * 16; /* two stride-16 rows in */
+    uint8_t h_filt[16 * 16];
+    uint8_t v_filt[16 * 16];
+    put_h264_qpel16_h_lowpass_mmi(h_filt, src, 16, stride);
+    copy_block16_mmi(padded, src - 2 * stride + 1, 16, stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(v_filt, centre, 16, 16);
+    ff_avg_pixels16_l2_8_mmi(dst, h_filt, v_filt, stride, 16, 16, 16);
+}
+
+/* mc13: l2 of h-lowpass(src + stride) and v-lowpass(padded copy taken at src). */
+void ff_avg_h264_qpel16_mc13_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[21 * 16];
+    uint8_t *const centre = padded + 2 * 16; /* two stride-16 rows in */
+    uint8_t h_filt[16 * 16];
+    uint8_t v_filt[16 * 16];
+    put_h264_qpel16_h_lowpass_mmi(h_filt, src + stride, 16, stride);
+    copy_block16_mmi(padded, src - 2 * stride, 16, stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(v_filt, centre, 16, 16);
+    ff_avg_pixels16_l2_8_mmi(dst, h_filt, v_filt, stride, 16, 16, 16);
+}
+
+/* mc33: l2 of h-lowpass(src + stride) and v-lowpass(padded copy taken at src + 1). */
+void ff_avg_h264_qpel16_mc33_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t padded[21 * 16];
+    uint8_t *const centre = padded + 2 * 16; /* two stride-16 rows in */
+    uint8_t h_filt[16 * 16];
+    uint8_t v_filt[16 * 16];
+    put_h264_qpel16_h_lowpass_mmi(h_filt, src + stride, 16, stride);
+    copy_block16_mmi(padded, src - 2 * stride + 1, 16, stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(v_filt, centre, 16, 16);
+    ff_avg_pixels16_l2_8_mmi(dst, h_filt, v_filt, stride, 16, 16, 16);
+}
+
+/* mc22: avg hv-lowpass from src into dst, given an 8-byte-aligned 16-bit work buffer. */
+void ff_avg_h264_qpel16_mc22_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint16_t __attribute__ ((aligned(8))) scratch[384];
+
+    avg_h264_qpel16_hv_lowpass_mmi(dst, scratch, src, stride, 16, stride);
+}
+
+/* mc21: hv-lowpass into scratch, then avg h_lowpass_l2 of src against that hv result. */
+void ff_avg_h264_qpel16_mc21_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) scratch[256 + 768];
+    uint8_t *const hv_filt = scratch;                   /* bytes 0..255 */
+    int16_t *const v_tmp = (int16_t *) (scratch + 256); /* bytes 256..1023 */
+
+    put_h264_qpel16_hv_lowpass_mmi(hv_filt, v_tmp, src, 16, 16, stride);
+    avg_h264_qpel16_h_lowpass_l2_mmi(dst, src, hv_filt, stride, 16);
+}
+
+/* mc23: hv-lowpass into scratch, then avg h_lowpass_l2 of src + stride against that hv result. */
+void ff_avg_h264_qpel16_mc23_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) scratch[256 + 768];
+    uint8_t *const hv_filt = scratch;                   /* bytes 0..255 */
+    int16_t *const v_tmp = (int16_t *) (scratch + 256); /* bytes 256..1023 */
+
+    put_h264_qpel16_hv_lowpass_mmi(hv_filt, v_tmp, src, 16, 16, stride);
+    avg_h264_qpel16_h_lowpass_l2_mmi(dst, src + stride, hv_filt, stride, 16);
+}
+
+/* mc12: hv-lowpass into scratch, then avg l2_shift5 of the 16-bit data at v_tmp + 2 and the hv result. */
+void ff_avg_h264_qpel16_mc12_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) scratch[256 + 768];
+    uint8_t *const hv_filt = scratch;                   /* bytes 0..255 */
+    int16_t *const v_tmp = (int16_t *) (scratch + 256); /* bytes 256..1023 */
+
+    put_h264_qpel16_hv_lowpass_mmi(hv_filt, v_tmp, src, 16, 16, stride);
+    avg_pixels16_l2_shift5_mmi(dst, v_tmp + 2, hv_filt, stride, 16, 16);
+}
+
+/* mc32: hv-lowpass into scratch, then avg l2_shift5 of the 16-bit data at v_tmp + 3 and the hv result. */
+void ff_avg_h264_qpel16_mc32_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    uint8_t __attribute__ ((aligned(8))) scratch[256 + 768];
+    uint8_t *const hv_filt = scratch;                   /* bytes 0..255 */
+    int16_t *const v_tmp = (int16_t *) (scratch + 256); /* bytes 256..1023 */
+
+    put_h264_qpel16_hv_lowpass_mmi(hv_filt, v_tmp, src, 16, 16, stride);
+    avg_pixels16_l2_shift5_mmi(dst, v_tmp + 3, hv_filt, stride, 16, 16);
+}
+
+#undef op2_avg
+#undef op2_put
diff --git a/libavcodec/mips/h264qpel_msa.c b/libavcodec/mips/h264qpel_msa.c
new file mode 100644
index 0000000..c38f1f7
--- /dev/null
+++ b/libavcodec/mips/h264qpel_msa.c
@@ -0,0 +1,3600 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "h264dsp_mips.h"
+
+#define AVC_CALC_DPADD_H_6PIX_2COEFF_SH(in0, in1, in2, in3, in4, in5)    \
+( {  /* 6-input filter on halfword vectors: (in0 + in5) - 5 * (in1 + in4) + 20 * (in2 + in3), accumulated in words; yields a v8i16 */  \
+    v4i32 tmp0_m, tmp1_m;                                                \
+    v8i16 out0_m, out1_m, out2_m, out3_m;                                \
+    v8i16 minus5h_m = __msa_ldi_h(-5);                                   \
+    v8i16 plus20h_m = __msa_ldi_h(20);                                   \
+                                                                         \
+    ILVRL_H2_SW(in5, in0, tmp0_m, tmp1_m);  /* pair up the outer inputs, right and left halves */  \
+                                                                         \
+    tmp0_m = __msa_hadd_s_w((v8i16) tmp0_m, (v8i16) tmp0_m);  /* sum each pair into a word */  \
+    tmp1_m = __msa_hadd_s_w((v8i16) tmp1_m, (v8i16) tmp1_m);             \
+                                                                         \
+    ILVRL_H2_SH(in1, in4, out0_m, out1_m);                               \
+    DPADD_SH2_SW(out0_m, out1_m, minus5h_m, minus5h_m, tmp0_m, tmp1_m);  \
+    ILVRL_H2_SH(in2, in3, out2_m, out3_m);                               \
+    DPADD_SH2_SW(out2_m, out3_m, plus20h_m, plus20h_m, tmp0_m, tmp1_m);  \
+                                                                         \
+    SRARI_W2_SW(tmp0_m, tmp1_m, 10);  /* rounding shift right by 10 */   \
+    SAT_SW2_SW(tmp0_m, tmp1_m, 7);  /* saturate with argument 7 (presumably to the signed 8-bit range) */  \
+    out0_m = __msa_pckev_h((v8i16) tmp1_m, (v8i16) tmp0_m);  /* narrow both word halves back into one halfword vector */  \
+                                                                         \
+    out0_m;                                                              \
+} )
+
+#define AVC_HORZ_FILTER_SH(in, mask0, mask1, mask2)     \
+( {  /* Horizontal 6-tap sum on one byte vector: mask0 pairs get weight 1, mask1 pairs -5, mask2 pairs 20; yields v8i16, no rounding or saturation here */  \
+    v8i16 out0_m, out1_m;                               \
+    v16i8 tmp0_m, tmp1_m;                               \
+    v16i8 minus5b = __msa_ldi_b(-5);                    \
+    v16i8 plus20b = __msa_ldi_b(20);                    \
+                                                        \
+    tmp0_m = __msa_vshf_b((v16i8) mask0, in, in);       \
+    out0_m = __msa_hadd_s_h(tmp0_m, tmp0_m);  /* sum of each shuffled byte pair */  \
+                                                        \
+    tmp0_m = __msa_vshf_b((v16i8) mask1, in, in);       \
+    out0_m = __msa_dpadd_s_h(out0_m, minus5b, tmp0_m);  \
+                                                        \
+    tmp1_m = __msa_vshf_b((v16i8) (mask2), in, in);     \
+    out1_m = __msa_dpadd_s_h(out0_m, plus20b, tmp1_m);  \
+                                                        \
+    out1_m;                                             \
+} )
+
+static const uint8_t luma_mask_arr[16 * 8] = {   /* byte-shuffle index rows, 16 entries each; loaded by offset (0, 48, 96, 112) in the functions below */
+    /* 8 width cases */
+    0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,   /* pairs (i, i + 5): outer taps */
+    1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,   /* pairs (i + 1, i + 4): taps weighted -5 */
+    2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,   /* pairs (i + 2, i + 3): taps weighted 20 */
+
+    /* 4 width cases */
+    0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24,   /* indices >= 16 presumably select from the other shuffle source (second row) - confirm against vshf.b */
+    1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23,
+    2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22,
+
+    2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25,   /* offset 96: used by avc_luma_hz_qrt_16w_msa when hor_offset == 0 */
+    3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26   /* offset 112: used by avc_luma_hz_qrt_16w_msa when hor_offset != 0 */
+};
+
+#define AVC_CALC_DPADD_B_6PIX_2COEFF_SH(vec0, vec1, vec2, vec3, vec4, vec5,  \
+                                        out1, out2)                          \
+{  /* Statement macro: 6-input byte filter (vec0 + vec5) - 5 * (vec1 + vec4) + 20 * (vec2 + vec3); right half goes to out1, left half to out2 */  \
+    v16i8 tmp0_m, tmp1_m;                                                    \
+    v16i8 minus5b_m = __msa_ldi_b(-5);                                       \
+    v16i8 plus20b_m = __msa_ldi_b(20);                                       \
+                                                                             \
+    ILVRL_B2_SB(vec5, vec0, tmp0_m, tmp1_m);                                 \
+    HADD_SB2_SH(tmp0_m, tmp1_m, out1, out2);  /* out1/out2 are overwritten here, then accumulated into */  \
+    ILVRL_B2_SB(vec4, vec1, tmp0_m, tmp1_m);                                 \
+    DPADD_SB2_SH(tmp0_m, tmp1_m, minus5b_m, minus5b_m, out1, out2);          \
+    ILVRL_B2_SB(vec3, vec2, tmp0_m, tmp1_m);                                 \
+    DPADD_SB2_SH(tmp0_m, tmp1_m, plus20b_m, plus20b_m, out1, out2);  /* no rounding or saturation is applied inside this macro */  \
+}
+
+#define AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(vec0, vec1, vec2, vec3, vec4, vec5)  \
+( {  /* Right-half-only variant of the 6-input byte filter: (vec0 + vec5) - 5 * (vec1 + vec4) + 20 * (vec2 + vec3); yields one v8i16 */  \
+    v8i16 tmp1_m;                                                              \
+    v16i8 tmp0_m, tmp2_m;                                                      \
+    v16i8 minus5b_m = __msa_ldi_b(-5);                                         \
+    v16i8 plus20b_m = __msa_ldi_b(20);                                         \
+                                                                               \
+    tmp1_m = (v8i16) __msa_ilvr_b((v16i8) vec5, (v16i8) vec0);                 \
+    tmp1_m = __msa_hadd_s_h((v16i8) tmp1_m, (v16i8) tmp1_m);                   \
+                                                                               \
+    ILVR_B2_SB(vec4, vec1, vec3, vec2, tmp0_m, tmp2_m);                        \
+    DPADD_SB2_SH(tmp0_m, tmp2_m, minus5b_m, plus20b_m, tmp1_m, tmp1_m);  /* both dot products accumulate into the same tmp1_m */  \
+                                                                               \
+    tmp1_m;                                                                    \
+} )
+
+#define AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(vec0, vec1, vec2, vec3, vec4, vec5)  \
+( {  /* Right-half-only variant of the 6-input halfword filter, word accumulation, rounding shift by 10; yields v8i16 */  \
+    v4i32 tmp1_m;                                                              \
+    v8i16 tmp2_m, tmp3_m;                                                      \
+    v8i16 minus5h_m = __msa_ldi_h(-5);                                         \
+    v8i16 plus20h_m = __msa_ldi_h(20);                                         \
+                                                                               \
+    tmp1_m = (v4i32) __msa_ilvr_h((v8i16) vec5, (v8i16) vec0);                 \
+    tmp1_m = __msa_hadd_s_w((v8i16) tmp1_m, (v8i16) tmp1_m);                   \
+                                                                               \
+    ILVR_H2_SH(vec1, vec4, vec2, vec3, tmp2_m, tmp3_m);                        \
+    DPADD_SH2_SW(tmp2_m, tmp3_m, minus5h_m, plus20h_m, tmp1_m, tmp1_m);  /* both dot products accumulate into the same tmp1_m */  \
+                                                                               \
+    tmp1_m = __msa_srari_w(tmp1_m, 10);                                        \
+    tmp1_m = __msa_sat_s_w(tmp1_m, 7);                                         \
+                                                                               \
+    tmp2_m = __msa_pckev_h((v8i16) tmp1_m, (v8i16) tmp1_m);  /* same vector on both sides: the packed result appears in both halves */  \
+                                                                               \
+    tmp2_m;                                                                    \
+} )
+
+#define AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,              \
+                                                    mask0, mask1, mask2)     \
+( {  /* Shuffle src0/src1 with three masks and apply weights 1 / -5 / 20; yields v8i16. NOTE: no XOR happens in this body despite the name */  \
+    v8i16 hz_out_m;                                                          \
+    v16i8 vec0_m, vec1_m, vec2_m;                                            \
+    v16i8 minus5b_m = __msa_ldi_b(-5);                                       \
+    v16i8 plus20b_m = __msa_ldi_b(20);                                       \
+                                                                             \
+    vec0_m = __msa_vshf_b((v16i8) mask0, (v16i8) src1, (v16i8) src0);        \
+    hz_out_m = __msa_hadd_s_h(vec0_m, vec0_m);                               \
+                                                                             \
+    VSHF_B2_SB(src0, src1, src0, src1, mask1, mask2, vec1_m, vec2_m);        \
+    DPADD_SB2_SH(vec1_m, vec2_m, minus5b_m, plus20b_m, hz_out_m, hz_out_m);  /* no rounding or saturation is applied inside this macro */  \
+                                                                             \
+    hz_out_m;                                                                \
+} )
+
+static void avc_luma_hz_4w_msa(const uint8_t *src, int32_t src_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               int32_t height)
+{   /* Horizontal 6-tap luma filter for 4-wide blocks: reads rows from src, writes 4x4 tiles to dst. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 res0, res1;
+    v16u8 out;
+    v16i8 mask0, mask1, mask2;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);   /* the three "4 width" shuffle rows */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {   /* 4 rows per pass; rows beyond a multiple of 4 are not processed */
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);   /* presumably flips bit 7 so the bytes can be treated as signed - per macro name */
+        VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);   /* each shuffle draws on two rows at once */
+        HADD_SB2_SH(vec0, vec1, res0, res1);   /* outer-tap pair sums */
+        VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
+        DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
+        VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
+        DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
+        SRARI_H2_SH(res0, res1, 5);   /* rounding shift right by 5 */
+        SAT_SH2_SH(res0, res1, 7);
+        out = PCKEV_XORI128_UB(res0, res1);   /* presumably packs to bytes and undoes the 128 bias - per macro name */
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avc_luma_hz_8w_msa(const uint8_t *src, int32_t src_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               int32_t height)
+{   /* Horizontal 6-tap luma filter for 8-wide blocks: reads rows from src, writes 8x4 tiles to dst. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 res0, res1, res2, res3;
+    v16i8 mask0, mask1, mask2;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+    v16u8 out0, out1;
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);   /* the three "8 width" shuffle rows */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {   /* 4 rows per pass; rows beyond a multiple of 4 are not processed */
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);   /* presumably flips bit 7 so the bytes can be treated as signed - per macro name */
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);   /* one row per shuffle (same vector passed twice) */
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
+        HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);   /* outer-tap pair sums */
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
+                     res0, res1, res2, res3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res0, res1, res2, res3);
+        SRARI_H4_SH(res0, res1, res2, res3, 5);   /* rounding shift right by 5 */
+        SAT_SH4_SH(res0, res1, res2, res3, 7);
+        out0 = PCKEV_XORI128_UB(res0, res1);   /* rows 0-1; presumably packs to bytes and undoes the 128 bias */
+        out1 = PCKEV_XORI128_UB(res2, res3);   /* rows 2-3 */
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avc_luma_hz_16w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int32_t height)
+{   /* Horizontal 6-tap luma filter for 16-wide blocks: each row is handled as two 8-wide halves; 4 rows stored per pass. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v16i8 mask0, mask1, mask2;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);   /* the three "8 width" shuffle rows */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {   /* 4 rows per pass; rows beyond a multiple of 4 are not processed */
+        LD_SB2(src, 8, src0, src1);   /* same row, second load presumably 8 bytes further on (stride argument 8) */
+        src += src_stride;
+        LD_SB2(src, 8, src2, src3);
+        src += src_stride;
+
+        XORI_B4_128_SB(src0, src1, src2, src3);   /* presumably flips bit 7 so the bytes can be treated as signed - per macro name */
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
+        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
+        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);   /* rows 0-1: outer-tap pair sums */
+        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
+                     minus5b, res0, res1, res2, res3);
+        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res0, res1, res2, res3);
+
+        LD_SB2(src, 8, src4, src5);   /* rows 2-3 follow the same sequence, reusing vec0..vec11 */
+        src += src_stride;
+        LD_SB2(src, 8, src6, src7);
+        src += src_stride;
+
+        XORI_B4_128_SB(src4, src5, src6, src7);
+        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
+        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
+        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
+        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
+        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
+        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
+        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
+        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
+                     minus5b, res4, res5, res6, res7);
+        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res4, res5, res6, res7);
+        SRARI_H4_SH(res0, res1, res2, res3, 5);   /* rounding shift right by 5 */
+        SRARI_H4_SH(res4, res5, res6, res7, 5);
+        SAT_SH4_SH(res0, res1, res2, res3, 7);
+        SAT_SH4_SH(res4, res5, res6, res7, 7);
+        PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
+                    vec0, vec1, vec2, vec3);   /* vec0..vec3 are reused as the four packed output rows */
+        XORI_B4_128_SB(vec0, vec1, vec2, vec3);   /* presumably undoes the 128 bias applied on load */
+
+        ST_SB4(vec0, vec1, vec2, vec3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avc_luma_hz_qrt_4w_msa(const uint8_t *src, int32_t src_stride,
+                                   uint8_t *dst, int32_t dst_stride,
+                                   int32_t height, uint8_t hor_offset)
+{   /* 4-wide horizontal 6-tap filter whose result is averaged with source bytes taken at offset 2 + hor_offset. */
+    uint8_t slide;
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 res0, res1;
+    v16i8 res, mask0, mask1, mask2;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);   /* the three "4 width" shuffle rows */
+    slide = 2 + hor_offset;   /* byte offset of the source samples that get averaged in */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {   /* 4 rows per pass; rows beyond a multiple of 4 are not processed */
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);   /* presumably flips bit 7 so the bytes can be treated as signed - per macro name */
+        VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
+        HADD_SB2_SH(vec0, vec1, res0, res1);
+        VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
+        DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
+        VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
+        DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
+        SRARI_H2_SH(res0, res1, 5);   /* rounding shift right by 5 */
+        SAT_SH2_SH(res0, res1, 7);
+
+        res = __msa_pckev_b((v16i8) res1, (v16i8) res0);   /* filtered result for all 4 rows, still in the biased domain */
+        src0 = __msa_sld_b(src0, src0, slide);   /* move each row's bytes at offset `slide` to the front */
+        src1 = __msa_sld_b(src1, src1, slide);
+        src2 = __msa_sld_b(src2, src2, slide);
+        src3 = __msa_sld_b(src3, src3, slide);
+        src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);   /* gather one word per row into a single vector */
+        src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
+        src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
+        res = __msa_aver_s_b(res, src0);   /* signed average of filtered and source samples */
+        res = (v16i8) __msa_xori_b((v16u8) res, 128);   /* flip bit 7 again before storing */
+
+        ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avc_luma_hz_qrt_8w_msa(const uint8_t *src, int32_t src_stride,
+                                   uint8_t *dst, int32_t dst_stride,
+                                   int32_t height, uint8_t hor_offset)
+{   /* 8-wide horizontal 6-tap filter whose result is averaged with source bytes taken at offset 2 + hor_offset. */
+    uint8_t slide;
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v16i8 tmp0, tmp1;
+    v8i16 res0, res1, res2, res3;
+    v16i8 mask0, mask1, mask2;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);   /* the three "8 width" shuffle rows */
+    slide = 2 + hor_offset;   /* byte offset of the source samples that get averaged in */
+
+    for (loop_cnt = height >> 2; loop_cnt--;) {   /* 4 rows per pass; rows beyond a multiple of 4 are not processed */
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);   /* presumably flips bit 7 so the bytes can be treated as signed - per macro name */
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
+        HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
+                     res0, res1, res2, res3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res0, res1, res2, res3);
+
+        src0 = __msa_sld_b(src0, src0, slide);   /* move each row's bytes at offset `slide` to the front */
+        src1 = __msa_sld_b(src1, src1, slide);
+        src2 = __msa_sld_b(src2, src2, slide);
+        src3 = __msa_sld_b(src3, src3, slide);
+
+        SRARI_H4_SH(res0, res1, res2, res3, 5);   /* rounding shift right by 5 */
+        SAT_SH4_SH(res0, res1, res2, res3, 7);
+        PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);   /* filtered rows 0-1 and 2-3 */
+        PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);   /* matching source rows, paired the same way; src0/src1 are overwritten */
+
+        tmp0 = __msa_aver_s_b(tmp0, src0);   /* signed average of filtered and source samples */
+        tmp1 = __msa_aver_s_b(tmp1, src1);
+
+        XORI_B2_128_SB(tmp0, tmp1);   /* presumably undoes the 128 bias applied on load */
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avc_luma_hz_qrt_16w_msa(const uint8_t *src, int32_t src_stride,
+                                    uint8_t *dst, int32_t dst_stride,
+                                    int32_t height, uint8_t hor_offset)
+{   /* 16-wide horizontal 6-tap filter averaged with shuffled source bytes; unlike the 4/8-wide variants it handles 2 rows per pass. */
+    uint32_t loop_cnt;
+    v16i8 dst0, dst1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask0, mask1, mask2, vshf;
+    v8i16 res0, res1, res2, res3;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);   /* the three "8 width" shuffle rows */
+
+    if (hor_offset) {   /* only zero / non-zero matters here, not the actual value */
+        vshf = LD_SB(&luma_mask_arr[16 + 96]);   /* index row that starts at byte 3 */
+    } else {
+        vshf = LD_SB(&luma_mask_arr[96]);   /* index row that starts at byte 2 */
+    }
+
+    for (loop_cnt = height >> 1; loop_cnt--;) {   /* 2 rows per pass; an odd trailing row is not processed */
+        LD_SB2(src, 8, src0, src1);   /* same row, second load presumably 8 bytes further on (stride argument 8) */
+        src += src_stride;
+        LD_SB2(src, 8, src2, src3);
+        src += src_stride;
+
+        XORI_B4_128_SB(src0, src1, src2, src3);   /* presumably flips bit 7 so the bytes can be treated as signed - per macro name */
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
+        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
+        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
+        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
+                     minus5b, res0, res1, res2, res3);
+        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res0, res1, res2, res3);
+        VSHF_B2_SB(src0, src1, src2, src3, vshf, vshf, src0, src2);   /* build the 16 source bytes to average in; overwrites src0 and src2 */
+        SRARI_H4_SH(res0, res1, res2, res3, 5);   /* rounding shift right by 5 */
+        SAT_SH4_SH(res0, res1, res2, res3, 7);
+        PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1);   /* one packed 16-byte result per row */
+
+        dst0 = __msa_aver_s_b(dst0, src0);   /* signed average of filtered and source samples */
+        dst1 = __msa_aver_s_b(dst1, src2);
+
+        XORI_B2_128_SB(dst0, dst1);   /* presumably undoes the 128 bias applied on load */
+
+        ST_SB2(dst0, dst1, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void avc_luma_vt_4w_msa(const uint8_t *src, int32_t src_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               int32_t height)
+{   /* Vertical 6-tap luma filter for 4-wide blocks: 5 rows are preloaded, then each pass loads 4 more and stores a 4x4 tile. */
+    int32_t loop_cnt;
+    int16_t filt_const0 = 0xfb01;   /* byte pair 0x01 / 0xfb (0xfb is -5 as a signed byte) */
+    int16_t filt_const1 = 0x1414;   /* byte pair 20 / 20 */
+    int16_t filt_const2 = 0x1fb;   /* byte pair 0xfb (-5) / 0x01 */
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src2110, src4332, src6554, src8776;
+    v16i8 filt0, filt1, filt2;
+    v8i16 out10, out32;
+    v16u8 out;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);   /* replicate each coefficient pair across the vector */
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);   /* the 5 rows preceding the first loop load */
+    src += (5 * src_stride);
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);   /* interleave vertically adjacent rows */
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);   /* two row pairs per vector */
+    XORI_B2_128_SB(src2110, src4332);   /* presumably flips bit 7 so the bytes can be treated as signed - per macro name */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {   /* 4 rows per pass; rows beyond a multiple of 4 are not processed */
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   src54_r, src65_r, src76_r, src87_r);
+        ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
+        XORI_B2_128_SB(src6554, src8776);
+        out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);   /* output rows 0 and 1 */
+        out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);   /* output rows 2 and 3 */
+        SRARI_H2_SH(out10, out32, 5);   /* rounding shift right by 5 */
+        SAT_SH2_SH(out10, out32, 7);
+        out = PCKEV_XORI128_UB(out10, out32);   /* presumably packs to bytes and undoes the 128 bias */
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+
+        dst += (4 * dst_stride);
+        src2110 = src6554;   /* slide the row window down by 4 for the next pass */
+        src4332 = src8776;
+        src4 = src8;
+    }
+}
+
+static void avc_luma_vt_8w_msa(const uint8_t *src, int32_t src_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               int32_t height)
+{   /* Vertical 6-tap luma filter for 8-wide blocks: 5 rows are preloaded, then each pass loads 4 more and stores an 8x4 tile. */
+    int32_t loop_cnt;
+    int16_t filt_const0 = 0xfb01;   /* byte pair 0x01 / 0xfb (0xfb is -5 as a signed byte) */
+    int16_t filt_const1 = 0x1414;   /* byte pair 20 / 20 */
+    int16_t filt_const2 = 0x1fb;   /* byte pair 0xfb (-5) / 0x01 */
+    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 out0_r, out1_r, out2_r, out3_r;
+    v16i8 filt0, filt1, filt2;
+    v16u8 out0, out1;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);   /* replicate each coefficient pair across the vector */
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);   /* the 5 rows preceding the first loop load */
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);   /* presumably flips bit 7 so the bytes can be treated as signed - per macro name */
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);   /* interleave vertically adjacent rows */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {   /* 4 rows per pass; rows beyond a multiple of 4 are not processed */
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);   /* note: src76_r actually interleaves src7 with src4 */
+        out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
+        out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
+        out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
+        out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);   /* rounding shift right by 5 */
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        out0 = PCKEV_XORI128_UB(out0_r, out1_r);   /* rows 0-1; presumably packs to bytes and undoes the 128 bias */
+        out1 = PCKEV_XORI128_UB(out2_r, out3_r);   /* rows 2-3 */
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src76_r;   /* slide the row window down by 4 for the next pass */
+        src32_r = src98_r;
+        src21_r = src87_r;
+        src43_r = src109_r;
+        src4 = src10;
+    }
+}
+
+static void avc_luma_vt_16w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int32_t height)
+{   /* Vertical 6-tap luma filter for 16-wide blocks: right (_r) and left (_l) vector halves are filtered separately; 4 rows per pass. */
+    int32_t loop_cnt;
+    int16_t filt_const0 = 0xfb01;   /* byte pair 0x01 / 0xfb (0xfb is -5 as a signed byte) */
+    int16_t filt_const1 = 0x1414;   /* byte pair 20 / 20 */
+    int16_t filt_const2 = 0x1fb;   /* byte pair 0xfb (-5) / 0x01 */
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
+    v16i8 src65_l, src87_l;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+    v16u8 res0, res1, res2, res3;
+    v16i8 filt0, filt1, filt2;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);   /* replicate each coefficient pair across the vector */
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);   /* the 5 rows preceding the first loop load */
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);   /* presumably flips bit 7 so the bytes can be treated as signed - per macro name */
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);   /* adjacent-row interleave, right halves */
+    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_l, src21_l, src32_l, src43_l);   /* adjacent-row interleave, left halves */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {   /* 4 rows per pass; rows beyond a multiple of 4 are not processed */
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   src54_r, src65_r, src76_r, src87_r);
+        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   src54_l, src65_l, src76_l, src87_l);
+        out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
+        out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
+        out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
+        out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
+        out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
+        out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
+        out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
+        out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);   /* rounding shift right by 5 */
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, res0, res1, res2, res3);   /* rejoin left and right halves into one 16-byte row each */
+        XORI_B4_128_UB(res0, res1, res2, res3);   /* presumably undoes the 128 bias applied on load */
+
+        ST_UB4(res0, res1, res2, res3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;   /* slide the row window down by 4 for the next pass */
+        src32_r = src76_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src10_l = src54_l;
+        src32_l = src76_l;
+        src21_l = src65_l;
+        src43_l = src87_l;
+        src4 = src8;
+    }
+}
+
+static void avc_luma_vt_qrt_4w_msa(const uint8_t *src, int32_t src_stride,
+                                   uint8_t *dst, int32_t dst_stride,
+                                   int32_t height, uint8_t ver_offset)
+{   /* 4-wide vertical 6-tap filter whose result is averaged with unfiltered source rows; ver_offset picks which rows. */
+    int32_t loop_cnt;
+    int16_t filt_const0 = 0xfb01;   /* byte pair 0x01 / 0xfb (0xfb is -5 as a signed byte) */
+    int16_t filt_const1 = 0x1414;   /* byte pair 20 / 20 */
+    int16_t filt_const2 = 0x1fb;   /* byte pair 0xfb (-5) / 0x01 */
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src2110, src4332, src6554, src8776;
+    v8i16 out10, out32;
+    v16i8 filt0, filt1, filt2;
+    v16u8 out;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);   /* replicate each coefficient pair across the vector */
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);   /* the 5 rows preceding the first loop load */
+    src += (5 * src_stride);
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    XORI_B2_128_SB(src2110, src4332);   /* only the interleaved copies are biased; src0..src8 stay as loaded */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {   /* 4 rows per pass; rows beyond a multiple of 4 are not processed */
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   src54_r, src65_r, src76_r, src87_r);
+        ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
+        XORI_B2_128_SB(src6554, src8776);
+        out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
+        out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
+        SRARI_H2_SH(out10, out32, 5);   /* rounding shift right by 5 */
+        SAT_SH2_SH(out10, out32, 7);
+
+        out = PCKEV_XORI128_UB(out10, out32);   /* presumably packs to bytes and undoes the 128 bias */
+
+        if (ver_offset) {   /* non-zero: average with rows 3..6 of the current window */
+            src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
+            src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
+        } else {   /* zero: average with rows 2..5 of the current window */
+            src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
+            src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
+        }
+
+        src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);   /* src32_r / src54_r are reused as scratch here */
+        out = __msa_aver_u_b(out, (v16u8) src32_r);   /* unsigned average of filtered and source samples */
+
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+        src2110 = src6554;   /* slide the row window down by 4 for the next pass */
+        src4332 = src8776;
+        src2 = src6;
+        src3 = src7;
+        src4 = src8;
+    }
+}
+
+static void avc_luma_vt_qrt_8w_msa(const uint8_t *src, int32_t src_stride,
+                                   uint8_t *dst, int32_t dst_stride,
+                                   int32_t height, uint8_t ver_offset)
+{   /* 8-wide vertical 6-tap filter whose result is averaged with source rows; ver_offset picks which rows. */
+    int32_t loop_cnt;
+    int16_t filt_const0 = 0xfb01;   /* byte pair 0x01 / 0xfb (0xfb is -5 as a signed byte) */
+    int16_t filt_const1 = 0x1414;   /* byte pair 20 / 20 */
+    int16_t filt_const2 = 0x1fb;   /* byte pair 0xfb (-5) / 0x01 */
+    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 out0_r, out1_r, out2_r, out3_r;
+    v16i8 res0, res1;
+    v16i8 filt0, filt1, filt2;
+
+    filt0 = (v16i8) __msa_fill_h(filt_const0);   /* replicate each coefficient pair across the vector */
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);   /* the 5 rows preceding the first loop load */
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);   /* here the source rows themselves are biased, so the later average is signed */
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {   /* 4 rows per pass; rows beyond a multiple of 4 are not processed */
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);   /* note: src76_r actually interleaves src7 with src4 */
+        out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
+        out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
+        out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
+        out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);   /* rounding shift right by 5 */
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, res0, res1);   /* filtered rows 0-1 and 2-3 */
+
+        if (ver_offset) {   /* src10_r / src32_r are reused as scratch for the rows to average in */
+            PCKEV_D2_SB(src4, src3, src8, src7, src10_r, src32_r);   /* rows src3, src4, src7, src8 */
+        } else {
+            PCKEV_D2_SB(src3, src2, src7, src4, src10_r, src32_r);   /* rows src2, src3, src4, src7 */
+        }
+
+        res0 = __msa_aver_s_b(res0, (v16i8) src10_r);   /* signed average of filtered and source samples */
+        res1 = __msa_aver_s_b(res1, (v16i8) src32_r);
+
+        XORI_B2_128_SB(res0, res1);   /* presumably undoes the 128 bias applied on load */
+        ST8x4_UB(res0, res1, dst, dst_stride);
+
+        dst += (4 * dst_stride);
+        src10_r = src76_r;   /* slide the row window down by 4; also restores the scratch-clobbered src10_r / src32_r */
+        src32_r = src98_r;
+        src21_r = src87_r;
+        src43_r = src109_r;
+        src2 = src8;
+        src3 = src9;
+        src4 = src10;
+    }
+}
+/*
+ * 16-pixel-wide counterpart of the 8-wide vertical quarter-sample kernel
+ * (purpose inferred from the name; TODO confirm against the callers).
+ *
+ * Both the ILVR (_r) and ILVL (_l) interleaves of each row pair are kept,
+ * giving eight filter results per four-row iteration.  The loop runs
+ * height >> 2 times and advances src and dst by four rows each time.  The
+ * packed results are averaged (__msa_aver_s_b) with src3..src6 when
+ * ver_offset is non-zero and with src2..src5 otherwise.
+ */
+static void avc_luma_vt_qrt_16w_msa(const uint8_t *src, int32_t src_stride,
+                                    uint8_t *dst, int32_t dst_stride,
+                                    int32_t height, uint8_t ver_offset)
+{
+    int32_t loop_cnt;
+    int16_t filt_const0 = 0xfb01; /* low byte 0x01 = 1, high byte 0xfb = -5 */
+    int16_t filt_const1 = 0x1414; /* both bytes 0x14 = 20 */
+    int16_t filt_const2 = 0x1fb; /* low byte 0xfb = -5, high byte 0x01 = 1 */
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
+    v16i8 src65_l, src87_l;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+    v16u8 res0, res1, res2, res3;
+    v16i8 filt0, filt1, filt2;
+    /* replicate each 16-bit constant into every halfword lane */
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_l, src21_l, src32_l, src43_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   src54_r, src65_r, src76_r, src87_r);
+        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   src54_l, src65_l, src76_l, src87_l);
+        out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
+        out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
+        out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
+        out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
+        out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
+        out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
+        out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
+        out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, res0, res1, res2, res3);
+        /*
+         * Signed average with whole source rows; the rows were xor-ed with
+         * 128 above and the result is xor-ed again before the store
+         * (presumably to move between unsigned and signed pixel ranges).
+         */
+        if (ver_offset) {
+            res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
+            res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
+            res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
+            res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
+        } else {
+            res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2);
+            res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
+            res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
+            res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
+        }
+
+        XORI_B4_128_UB(res0, res1, res2, res3);
+        ST_UB4(res0, res1, res2, res3, dst, dst_stride);
+
+        dst += (4 * dst_stride);
+        /* carry the interleaved row pairs and the last three rows forward */
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src10_l = src54_l;
+        src32_l = src76_l;
+        src21_l = src65_l;
+        src43_l = src87_l;
+        src2 = src6;
+        src3 = src7;
+        src4 = src8;
+    }
+}
+/*
+ * 4-pixel-wide kernel; by its name the "mid" (horizontal followed by
+ * vertical filtering) luma case - TODO confirm against the callers.
+ *
+ * Horizontal filter results (hz_out*) are computed first, then
+ * AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH combines six consecutive hz_out values
+ * into one output row.  The loop runs height >> 2 times and advances src
+ * and dst by four rows per iteration.
+ */
+static void avc_luma_mid_4w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
+    v8i16 dst0, dst1, dst2, dst3;
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+
+    hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
+                                                          mask0, mask1, mask2);
+    hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
+                                                          mask0, mask1, mask2);
+    /*
+     * hz_out1/hz_out3 are taken from the odd doubleword of hz_out0/hz_out2;
+     * presumably each call above filters two 4-pixel rows at once and the
+     * upper half holds the second row - TODO confirm against the macro.
+     */
+    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
+
+    hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
+                                                              mask0, mask1,
+                                                              mask2);
+        hz_out7 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
+                                                              mask0, mask1,
+                                                              mask2);
+
+        PCKOD_D2_SH(hz_out5, hz_out5, hz_out7, hz_out7, hz_out6, hz_out8);
+        /* each output row uses a window of six consecutive hz_out values */
+        dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out0, hz_out1, hz_out2,
+                                                 hz_out3, hz_out4, hz_out5);
+        dst1 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out1, hz_out2, hz_out3,
+                                                 hz_out4, hz_out5, hz_out6);
+        dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out2, hz_out3, hz_out4,
+                                                 hz_out5, hz_out6, hz_out7);
+        dst3 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out3, hz_out4, hz_out5,
+                                                 hz_out6, hz_out7, hz_out8);
+
+        PCKEV_B2_SB(dst1, dst0, dst3, dst2, src0, src1); /* src0/src1 reused as output scratch */
+        XORI_B2_128_SB(src0, src1);
+
+        ST4x4_UB(src0, src1, 0, 2, 0, 2, dst, dst_stride);
+
+        dst += (4 * dst_stride);
+        /* slide the window: the last five results start the next iteration */
+        hz_out0 = hz_out4;
+        hz_out1 = hz_out5;
+        hz_out2 = hz_out6;
+        hz_out3 = hz_out7;
+        hz_out4 = hz_out8;
+    }
+}
+
+/*
+ * 8-pixel-wide kernel; by its name the "mid" (horizontal followed by
+ * vertical filtering) luma case - TODO confirm against the callers.
+ *
+ * Every source row is passed through AVC_HORZ_FILTER_SH, and each output
+ * row is then built by AVC_CALC_DPADD_H_6PIX_2COEFF_SH from a window of six
+ * consecutive horizontal results.  Five rows are primed before the loop;
+ * the loop runs height >> 2 times, advancing src and dst by four rows per
+ * iteration.
+ */
+static void avc_luma_mid_8w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
+    v8i16 dst0, dst1, dst2, dst3;
+    v16u8 out0, out1;
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    hz_out0 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
+    hz_out1 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
+    hz_out2 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
+    hz_out3 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
+    hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        hz_out5 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
+        hz_out6 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
+        hz_out7 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
+        hz_out8 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
+        dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
+                                               hz_out3, hz_out4, hz_out5);
+        dst1 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
+                                               hz_out4, hz_out5, hz_out6);
+        dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
+                                               hz_out5, hz_out6, hz_out7);
+        dst3 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
+                                               hz_out6, hz_out7, hz_out8);
+        out0 = PCKEV_XORI128_UB(dst0, dst1);
+        out1 = PCKEV_XORI128_UB(dst2, dst3);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+
+        dst += (4 * dst_stride);
+
+        /*
+         * Slide the window down four rows: the last five horizontal results
+         * become the first five of the next iteration.  hz_out5..hz_out8
+         * are recomputed at the top of the loop, so nothing else needs to
+         * be preserved.
+         */
+        hz_out0 = hz_out4;
+        hz_out1 = hz_out5;
+        hz_out2 = hz_out6;
+        hz_out3 = hz_out7;
+        hz_out4 = hz_out8;
+    }
+}
+
+/*
+ * 16-pixel-wide variant: runs the 8-wide kernel on the left 8 columns and
+ * then on the right 8 columns of the block.
+ */
+static void avc_luma_mid_16w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int32_t height)
+{
+    int32_t col;
+
+    for (col = 0; col < 16; col += 8) {
+        avc_luma_mid_8w_msa(src + col, src_stride, dst + col, dst_stride,
+                            height);
+    }
+}
+/*
+ * 4-pixel-wide kernel; by its name a "mid" position with a horizontal
+ * quarter-sample offset (TODO confirm against the callers).
+ *
+ * Rows are combined vertically first (AVC_CALC_DPADD_B_6PIX_2COEFF_SH),
+ * then the halfword results are shuffled into tap pairs and combined with
+ * weights 1 (pairwise add), -5 and 20, followed by a rounding shift by 10.
+ * That result is averaged with the vertical-only value (shf_vec2/shf_vec5
+ * rounded by 5); horiz_offset selects its odd (non-zero) or even (zero)
+ * halfwords.  Two rows are produced per iteration (height >> 1 iterations).
+ */
+static void avc_luma_midh_qrt_4w_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int32_t height, uint8_t horiz_offset)
+{
+    uint32_t row;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 vt_res0, vt_res1, vt_res2, vt_res3;
+    v4i32 hz_res0, hz_res1;
+    v8i16 dst0, dst1;
+    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5;
+    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 }; /* index pairs (i, i + 5) */
+    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 }; /* index pairs (i + 1, i + 4) */
+    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 }; /* index pairs (i + 2, i + 3) */
+    v8i16 minus5h = __msa_ldi_h(-5);
+    v8i16 plus20h = __msa_ldi_h(20);
+    v8i16 zeros = { 0 };
+    v16u8 out;
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+
+    for (row = (height >> 1); row--;) {
+        LD_SB2(src, src_stride, src5, src6);
+        src += (2 * src_stride);
+
+        XORI_B2_128_SB(src5, src6);
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
+                                        vt_res0, vt_res1);
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
+                                        vt_res2, vt_res3);
+        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1,
+                   mask0, mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
+        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3,
+                   mask0, mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
+        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0); /* outer pair, weight 1 */
+        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
+        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3); /* outer pair, weight 1 */
+        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
+
+        SRARI_W2_SW(hz_res0, hz_res1, 10);
+        SAT_SW2_SW(hz_res0, hz_res1, 7);
+        /* vertical-only values taken from the (i + 2, i + 3) pairs */
+        dst0 = __msa_srari_h(shf_vec2, 5);
+        dst1 = __msa_srari_h(shf_vec5, 5);
+
+        SAT_SH2_SH(dst0, dst1, 7);
+        /* keep one halfword of each pair, widened with zeros to word lanes */
+        if (horiz_offset) {
+            dst0 = __msa_ilvod_h(zeros, dst0);
+            dst1 = __msa_ilvod_h(zeros, dst1);
+        } else {
+            ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1);
+        }
+
+        hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
+        hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
+        dst0 = __msa_pckev_h((v8i16) hz_res1, (v8i16) hz_res0);
+
+        out = PCKEV_XORI128_UB(dst0, dst0);
+        ST4x2_UB(out, dst, dst_stride);
+
+        dst += (2 * dst_stride);
+        /* slide the seven-row window down by two rows */
+        src0 = src2;
+        src1 = src3;
+        src2 = src4;
+        src3 = src5;
+        src4 = src6;
+    }
+}
+
+/*
+ * 8-pixel-wide variant: applies the 4-wide kernel to two adjacent
+ * 4-column strips of the block.
+ */
+static void avc_luma_midh_qrt_8w_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int32_t height, uint8_t horiz_offset)
+{
+    int32_t col;
+
+    for (col = 0; col < 8; col += 4) {
+        avc_luma_midh_qrt_4w_msa(src + col, src_stride, dst + col,
+                                 dst_stride, height, horiz_offset);
+    }
+}
+
+/*
+ * 16-pixel-wide variant: applies the 4-wide kernel to four adjacent
+ * 4-column strips of the block.
+ */
+static void avc_luma_midh_qrt_16w_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int32_t height, uint8_t horiz_offset)
+{
+    int32_t col;
+
+    for (col = 0; col < 16; col += 4) {
+        avc_luma_midh_qrt_4w_msa(src + col, src_stride, dst + col,
+                                 dst_stride, height, horiz_offset);
+    }
+}
+/*
+ * 4-pixel-wide kernel; by its name a "mid" position with a vertical
+ * quarter-sample offset (TODO confirm against the callers).
+ *
+ * Horizontal filter results (hz_out*) are combined six at a time into the
+ * two-stage value, which is then averaged (__msa_aver_s_h) with a single
+ * horizontal result rounded by 5: hz_out3..hz_out6 when ver_offset is
+ * non-zero, hz_out2..hz_out5 otherwise.  The loop runs height >> 2 times
+ * and advances src and dst by four rows per iteration.
+ *
+ * NOTE(review): the 4-wide "mid" kernel in this file uses
+ * AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH for the same step while this function
+ * uses the non-_R_ macro; only words 0 and 2 of the packed result are
+ * stored either way.  Confirm whether the _R_ form was intended here.
+ */
+static void avc_luma_midv_qrt_4w_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int32_t height, uint8_t ver_offset)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+
+    hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
+                                                          mask0, mask1, mask2);
+    hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
+                                                          mask0, mask1, mask2);
+    /* odd doubleword of hz_out0/hz_out2 supplies hz_out1/hz_out3 */
+    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
+
+    hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
+                                                              mask0, mask1,
+                                                              mask2);
+        hz_out7 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
+                                                              mask0, mask1,
+                                                              mask2);
+
+        PCKOD_D2_SH(hz_out5, hz_out5, hz_out7, hz_out7, hz_out6, hz_out8);
+        /* even-numbered dst hold the two-stage results */
+        dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
+                                               hz_out3, hz_out4, hz_out5);
+        dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
+                                               hz_out4, hz_out5, hz_out6);
+        dst4 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
+                                               hz_out5, hz_out6, hz_out7);
+        dst6 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
+                                               hz_out6, hz_out7, hz_out8);
+        /* odd-numbered dst hold the horizontal-only rows to average with */
+        if (ver_offset) {
+            dst1 = __msa_srari_h(hz_out3, 5);
+            dst3 = __msa_srari_h(hz_out4, 5);
+            dst5 = __msa_srari_h(hz_out5, 5);
+            dst7 = __msa_srari_h(hz_out6, 5);
+        } else {
+            dst1 = __msa_srari_h(hz_out2, 5);
+            dst3 = __msa_srari_h(hz_out3, 5);
+            dst5 = __msa_srari_h(hz_out4, 5);
+            dst7 = __msa_srari_h(hz_out5, 5);
+        }
+
+        SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);
+
+        dst0 = __msa_aver_s_h(dst0, dst1);
+        dst1 = __msa_aver_s_h(dst2, dst3);
+        dst2 = __msa_aver_s_h(dst4, dst5);
+        dst3 = __msa_aver_s_h(dst6, dst7);
+
+        PCKEV_B2_SB(dst1, dst0, dst3, dst2, src0, src1); /* src0/src1 reused as output scratch */
+        XORI_B2_128_SB(src0, src1);
+
+        ST4x4_UB(src0, src1, 0, 2, 0, 2, dst, dst_stride);
+
+        dst += (4 * dst_stride);
+        hz_out0 = hz_out4; /* slide the window down four rows */
+        hz_out1 = hz_out5;
+        hz_out2 = hz_out6;
+        hz_out3 = hz_out7;
+        hz_out4 = hz_out8;
+    }
+}
+/*
+ * 8-pixel-wide kernel; by its name a "mid" position with a vertical
+ * quarter-sample offset (TODO confirm against the callers).
+ *
+ * Each source row goes through AVC_HORZ_FILTER_SH; six consecutive results
+ * are combined into the two-stage value, which is averaged
+ * (__msa_aver_s_h) with one horizontal result rounded by 5: hz_out3..hz_out6
+ * when ver_offset is non-zero, hz_out2..hz_out5 otherwise.  The loop runs
+ * height >> 2 times and writes four rows per iteration, one ST8x1_UB each.
+ */
+static void avc_luma_midv_qrt_8w_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int32_t height, uint8_t ver_offset)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16u8 out;
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    hz_out0 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
+    hz_out1 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
+    hz_out2 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
+    hz_out3 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
+    hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        hz_out5 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
+        hz_out6 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
+        hz_out7 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
+        hz_out8 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
+        /* even-numbered dst hold the two-stage results */
+        dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
+                                               hz_out3, hz_out4, hz_out5);
+        dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
+                                               hz_out4, hz_out5, hz_out6);
+        dst4 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
+                                               hz_out5, hz_out6, hz_out7);
+        dst6 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
+                                               hz_out6, hz_out7, hz_out8);
+        /* odd-numbered dst hold the horizontal-only rows to average with */
+        if (ver_offset) {
+            dst1 = __msa_srari_h(hz_out3, 5);
+            dst3 = __msa_srari_h(hz_out4, 5);
+            dst5 = __msa_srari_h(hz_out5, 5);
+            dst7 = __msa_srari_h(hz_out6, 5);
+        } else {
+            dst1 = __msa_srari_h(hz_out2, 5);
+            dst3 = __msa_srari_h(hz_out3, 5);
+            dst5 = __msa_srari_h(hz_out4, 5);
+            dst7 = __msa_srari_h(hz_out5, 5);
+        }
+
+        SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);
+
+        dst0 = __msa_aver_s_h(dst0, dst1);
+        dst1 = __msa_aver_s_h(dst2, dst3);
+        dst2 = __msa_aver_s_h(dst4, dst5);
+        dst3 = __msa_aver_s_h(dst6, dst7);
+        /* pack and store the four rows one at a time */
+        out = PCKEV_XORI128_UB(dst0, dst0);
+        ST8x1_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(dst1, dst1);
+        ST8x1_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(dst2, dst2);
+        ST8x1_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(dst3, dst3);
+        ST8x1_UB(out, dst);
+        dst += dst_stride;
+        /* slide the window: the last five results start the next iteration */
+        hz_out0 = hz_out4;
+        hz_out1 = hz_out5;
+        hz_out2 = hz_out6;
+        hz_out3 = hz_out7;
+        hz_out4 = hz_out8;
+    }
+}
+
+/*
+ * 16-pixel-wide variant: runs the 8-wide kernel on the left 8 columns and
+ * then on the right 8 columns, passing vert_offset through unchanged.
+ */
+static void avc_luma_midv_qrt_16w_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int32_t height, uint8_t vert_offset)
+{
+    int32_t col;
+
+    for (col = 0; col < 16; col += 8) {
+        avc_luma_midv_qrt_8w_msa(src + col, src_stride, dst + col,
+                                 dst_stride, height, vert_offset);
+    }
+}
+/*
+ * 4-pixel-wide kernel that averages a horizontally filtered value read
+ * through src_x with a vertically filtered value read through src_y (by
+ * its name a diagonal quarter-sample case; TODO confirm against callers).
+ *
+ * Both partial results are rounded by 5 and saturated, then summed and
+ * rounded by 1.  The loop runs height >> 2 times; src_x, src_y and dst all
+ * advance by four rows per iteration, and src_y is primed with five rows.
+ */
+static void avc_luma_hv_qrt_4w_msa(const uint8_t *src_x, const uint8_t *src_y,
+                                   int32_t src_stride, uint8_t *dst,
+                                   int32_t dst_stride, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
+    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
+    v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, vert_out0, vert_out1;
+    v8i16 out0, out1;
+    v16u8 out;
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+
+    LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
+    src_y += (5 * src_stride);
+    /*
+     * Word 0 of the following row is inserted as word 1, so src_vtN carries
+     * row N in word 0 and row N + 1 in word 1.
+     */
+    src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
+    src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
+    src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
+    src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
+
+    XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
+        src_x += (4 * src_stride);
+
+        XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
+
+        hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz0,
+                                                              src_hz1, mask0,
+                                                              mask1, mask2);
+        hz_out1 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz2,
+                                                              src_hz3, mask0,
+                                                              mask1, mask2);
+
+        SRARI_H2_SH(hz_out0, hz_out1, 5);
+        SAT_SH2_SH(hz_out0, hz_out1, 7);
+
+        LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
+        src_y += (4 * src_stride);
+        /* src_vt4 is still the raw row here; it is paired and xor-ed now */
+        src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
+        src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
+        src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
+        src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
+
+        XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
+
+        /* filter calc */
+        vert_out0 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt0, src_vt1,
+                                                      src_vt2, src_vt3,
+                                                      src_vt4, src_vt5);
+        vert_out1 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt2, src_vt3,
+                                                      src_vt4, src_vt5,
+                                                      src_vt6, src_vt7);
+
+        SRARI_H2_SH(vert_out0, vert_out1, 5);
+        SAT_SH2_SH(vert_out0, vert_out1, 7);
+        /* rounded average of the horizontal and vertical results */
+        out0 = __msa_srari_h((hz_out0 + vert_out0), 1);
+        out1 = __msa_srari_h((hz_out1 + vert_out1), 1);
+
+        SAT_SH2_SH(out0, out1, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* slide the window down four rows; src_vt4 restarts as the raw last row */
+        src_vt3 = src_vt7;
+        src_vt1 = src_vt5;
+        src_vt0 = src_vt4;
+        src_vt4 = src_vt8;
+        src_vt2 = src_vt6;
+    }
+}
+
+/*
+ * 8-pixel-wide kernel that averages a horizontally filtered value read
+ * through src_x with a vertically filtered value read through src_y (by
+ * its name a diagonal quarter-sample case; TODO confirm against callers).
+ *
+ * Both partial results are rounded by 5 and saturated, then summed and
+ * rounded by 1.  src_y is primed with five rows; the loop runs height >> 2
+ * times and src_x, src_y and dst all advance by four rows per iteration.
+ */
+static void avc_luma_hv_qrt_8w_msa(const uint8_t *src_x, const uint8_t *src_y,
+                                   int32_t src_stride, uint8_t *dst,
+                                   int32_t dst_stride, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
+    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
+    v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 vert_out0, vert_out1, vert_out2, vert_out3;
+    v8i16 out0, out1, out2, out3;
+    v16u8 tmp0, tmp1;
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+    LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
+    src_y += (5 * src_stride);
+
+    /*
+     * Doubleword 0 of the following row is inserted as doubleword 1, so
+     * src_vtN carries row N in its low half and row N + 1 in its high half.
+     */
+    src_vt0 = (v16i8) __msa_insve_d((v2i64) src_vt0, 1, (v2i64) src_vt1);
+    src_vt1 = (v16i8) __msa_insve_d((v2i64) src_vt1, 1, (v2i64) src_vt2);
+    src_vt2 = (v16i8) __msa_insve_d((v2i64) src_vt2, 1, (v2i64) src_vt3);
+    src_vt3 = (v16i8) __msa_insve_d((v2i64) src_vt3, 1, (v2i64) src_vt4);
+
+    XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
+        XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
+        src_x += (4 * src_stride);
+
+        hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, mask0, mask1, mask2);
+        hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, mask0, mask1, mask2);
+        hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, mask0, mask1, mask2);
+        hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, mask0, mask1, mask2);
+
+        SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
+        SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
+
+        LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
+        src_y += (4 * src_stride);
+
+        /* src_vt4 is still the raw row here; it is paired and xor-ed now */
+        src_vt4 = (v16i8) __msa_insve_d((v2i64) src_vt4, 1, (v2i64) src_vt5);
+        src_vt5 = (v16i8) __msa_insve_d((v2i64) src_vt5, 1, (v2i64) src_vt6);
+        src_vt6 = (v16i8) __msa_insve_d((v2i64) src_vt6, 1, (v2i64) src_vt7);
+        src_vt7 = (v16i8) __msa_insve_d((v2i64) src_vt7, 1, (v2i64) src_vt8);
+
+        XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
+
+        /* filter calc */
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt0, src_vt1, src_vt2, src_vt3,
+                                        src_vt4, src_vt5, vert_out0, vert_out1);
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt2, src_vt3, src_vt4, src_vt5,
+                                        src_vt6, src_vt7, vert_out2, vert_out3);
+
+        SRARI_H4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 5);
+        SAT_SH4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 7);
+
+        /* rounded average of the horizontal and vertical results */
+        out0 = __msa_srari_h((hz_out0 + vert_out0), 1);
+        out1 = __msa_srari_h((hz_out1 + vert_out1), 1);
+        out2 = __msa_srari_h((hz_out2 + vert_out2), 1);
+        out3 = __msa_srari_h((hz_out3 + vert_out3), 1);
+
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        tmp1 = PCKEV_XORI128_UB(out2, out3);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+
+        dst += (4 * dst_stride);
+
+        /*
+         * Slide the window down four rows.  src_vt0 must take the paired
+         * src_vt4 before src_vt4 is replaced by the raw last row; src_vt5
+         * to src_vt8 are reloaded at the top of the loop.
+         */
+        src_vt0 = src_vt4;
+        src_vt1 = src_vt5;
+        src_vt2 = src_vt6;
+        src_vt3 = src_vt7;
+        src_vt4 = src_vt8;
+    }
+}
+
+/*
+ * 16-pixel-wide variant: runs the 8-wide kernel on the left 8 columns and
+ * then on the right 8 columns, offsetting both source pointers and dst.
+ */
+static void avc_luma_hv_qrt_16w_msa(const uint8_t *src_x, const uint8_t *src_y,
+                                    int32_t src_stride, uint8_t *dst,
+                                    int32_t dst_stride, int32_t height)
+{
+    int32_t col;
+
+    for (col = 0; col < 16; col += 8) {
+        avc_luma_hv_qrt_8w_msa(src_x + col, src_y + col, src_stride,
+                               dst + col, dst_stride, height);
+    }
+}
+/*
+ * Filters a 4x4 block horizontally and averages the result with what is
+ * already stored at dst, writing the average back to dst.
+ *
+ * The filter sum is built from three shuffles of the source rows: a
+ * pairwise add (weight 1), then accumulations weighted -5 and 20, followed
+ * by a rounding shift by 5 and saturation.
+ */
+static void avc_luma_hz_and_aver_dst_4x4_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst, int32_t dst_stride)
+{
+    v16i8 src0, src1, src2, src3;
+    v16u8 dst0, dst1, dst2, dst3, res;
+    v8i16 res0, res1;
+    v16i8 mask0, mask1, mask2;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    /* current destination rows, needed for the final average */
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
+    HADD_SB2_SH(vec0, vec1, res0, res1);
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
+    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
+    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
+    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
+    SRARI_H2_SH(res0, res1, 5);
+    SAT_SH2_SH(res0, res1, 7);
+    res = PCKEV_XORI128_UB(res0, res1);
+    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+    /* gather the four destination rows into one vector, then average */
+    dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
+    res = __msa_aver_u_b(res, dst0);
+
+    ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
+}
+/*
+ * Filters an 8x8 block horizontally and averages the result with what is
+ * already stored at dst, writing the average back to dst.
+ *
+ * The block is processed in two iterations of four rows.  Per row the
+ * filter sum is a pairwise add (weight 1) plus accumulations weighted -5
+ * and 20, followed by a rounding shift by 5 and saturation; the averaging
+ * with dst and the store are done by CONVERT_UB_AVG_ST8x4_UB.
+ */
+static void avc_luma_hz_and_aver_dst_8x8_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst, int32_t dst_stride)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v8i16 res0, res1, res2, res3;
+    v16i8 mask0, mask1, mask2;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
+    v16i8 minus5b = __msa_ldi_b(-5);
+    v16i8 plus20b = __msa_ldi_b(20);
+
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    for (loop_cnt = 2; loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        /* current destination rows, needed for the final average */
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
+        HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
+                     res0, res1, res2, res3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res0, res1, res2, res3);
+        SRARI_H4_SH(res0, res1, res2, res3, 5);
+        SAT_SH4_SH(res0, res1, res2, res3, 7);
+        CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1, dst2, dst3,
+                                dst, dst_stride);
+
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avc_luma_hz_and_aver_dst_16x16_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride)
+{   /* Horizontally filters 16 rows of src and averages them into dst. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 mask0, mask1, mask2;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
+    v16i8 minus5b = __msa_ldi_b(-5); /* -5 in every byte lane */
+    v16i8 plus20b = __msa_ldi_b(20); /* 20 in every byte lane */
+    /* Three shuffle masks, 16 bytes apart, from the start of the table. */
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+    /* Four passes of four rows; every row is two loads 8 bytes apart. */
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB2(src, 8, src0, src1);
+        src += src_stride;
+        LD_SB2(src, 8, src2, src3);
+        src += src_stride;
+        /* Current dst rows, needed for the average before the store. */
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        /* First two rows of this pass are accumulated into res0..res3. */
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
+        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
+        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
+        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
+                     minus5b, res0, res1, res2, res3);
+        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res0, res1, res2, res3);
+        LD_SB2(src, 8, src4, src5); /* next two rows -> res4..res7 */
+        src += src_stride;
+        LD_SB2(src, 8, src6, src7);
+        src += src_stride;
+        XORI_B4_128_SB(src4, src5, src6, src7);
+        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
+        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
+        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
+        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
+        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
+        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
+        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
+        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
+                     minus5b, res4, res5, res6, res7);
+        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res4, res5, res6, res7);
+        SRARI_H4_SH(res0, res1, res2, res3, 5); /* presumably round, >> 5 */
+        SRARI_H4_SH(res4, res5, res6, res7, 5);
+        SAT_SH4_SH(res0, res1, res2, res3, 7); /* presumably clamp to s8 */
+        SAT_SH4_SH(res4, res5, res6, res7, 7);
+        PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
+                    vec0, vec1, vec2, vec3);
+        XORI_B4_128_SB(vec0, vec1, vec2, vec3); /* presumably back to u8 */
+        AVER_UB4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
+                    dst0, dst1, dst2, dst3);
+        ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avc_luma_hz_qrt_and_aver_dst_4x4_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 uint8_t hor_offset)
+{   /* 4 rows: horizontal filter, blend with src pixels, average with dst. */
+    uint8_t slide; /* byte distance the src rows are slid before blending */
+    v16i8 src0, src1, src2, src3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 mask0, mask1, mask2;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 out0, out1;
+    v16i8 minus5b = __msa_ldi_b(-5); /* -5 in every byte lane */
+    v16i8 plus20b = __msa_ldi_b(20); /* 20 in every byte lane */
+    v16u8 res0, res1;
+    /* This width uses the three masks starting at byte 48 of the table. */
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+    /* Non-zero hor_offset slides by 3 bytes, otherwise by 2. */
+    if (hor_offset) {
+        slide = 3;
+    } else {
+        slide = 2;
+    }
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    /* NOTE(review): looks like taps 1,-5,20,20,-5,1 on biased s8 data. */
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
+    HADD_SB2_SH(vec0, vec1, out0, out1);
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
+    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
+    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
+    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
+    SRARI_H2_SH(out0, out1, 5); /* presumably round, >> 5 */
+    SAT_SH2_SH(out0, out1, 7); /* presumably clamp to s8 */
+
+    PCKEV_B2_UB(out0, out0, out1, out1, res0, res1);
+    /* Slide every row by `slide` bytes, then pair rows via word-1 inserts. */
+    src0 = __msa_sld_b(src0, src0, slide);
+    src1 = __msa_sld_b(src1, src1, slide);
+    src2 = __msa_sld_b(src2, src2, slide);
+    src3 = __msa_sld_b(src3, src3, slide);
+    src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
+    src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
+    res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src0);
+    res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src1);
+    /* NOTE(review): XORI presumably removes the signed bias again. */
+    XORI_B2_128_UB(res0, res1);
+    /* Pair the dst rows the same way: dst1 -> dst0 word 1, dst3 -> dst2. */
+    dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
+    dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);
+
+    AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
+
+    ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void avc_luma_hz_qrt_and_aver_dst_8x8_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 uint8_t hor_offset)
+{   /* 8 rows: horizontal filter, blend with src pixels, average with dst. */
+    uint8_t slide; /* byte distance the src rows are slid before blending */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask0, mask1, mask2;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
+    v8i16 out0, out1, out2, out3;
+    v16i8 minus5b = __msa_ldi_b(-5); /* -5 in every byte lane */
+    v16i8 plus20b = __msa_ldi_b(20); /* 20 in every byte lane */
+    v16i8 res0, res1, res2, res3;
+    /* Three shuffle masks, 16 bytes apart, from the start of the table. */
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+    /* Non-zero hor_offset slides by 3 bytes, otherwise by 2. */
+    if (hor_offset) {
+        slide = 3;
+    } else {
+        slide = 2;
+    }
+    /* Two passes; each consumes four src rows and updates four dst rows. */
+    for (loop_cnt = 2; loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        /* NOTE(review): looks like taps 1,-5,20,20,-5,1 on biased s8 data. */
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
+        HADD_SB4_SH(vec0, vec1, vec2, vec3, out0, out1, out2, out3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
+                     out0, out1, out2, out3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
+                     plus20b, out0, out1, out2, out3);
+        /* All taps are extracted, so the src rows can be slid in place. */
+        src0 = __msa_sld_b(src0, src0, slide);
+        src1 = __msa_sld_b(src1, src1, slide);
+        src2 = __msa_sld_b(src2, src2, slide);
+        src3 = __msa_sld_b(src3, src3, slide);
+
+        SRARI_H4_SH(out0, out1, out2, out3, 5); /* presumably round, >> 5 */
+        SAT_SH4_SH(out0, out1, out2, out3, 7); /* presumably clamp to s8 */
+
+        PCKEV_B4_SB(out0, out0, out1, out1, out2, out2, out3, out3,
+                    res0, res1, res2, res3);
+        /* Signed average: both operands still carry the XOR-128 bias. */
+        res0 = __msa_aver_s_b(res0, src0);
+        res1 = __msa_aver_s_b(res1, src1);
+        res2 = __msa_aver_s_b(res2, src2);
+        res3 = __msa_aver_s_b(res3, src3);
+
+        XORI_B4_128_SB(res0, res1, res2, res3);
+        AVER_ST8x4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3,
+                      dst, dst_stride);
+
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avc_luma_hz_qrt_and_aver_dst_16x16_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   uint8_t hor_offset)
+{   /* 16 rows: horizontal filter, blend with src pixels, average with dst. */
+    uint32_t loop_cnt;
+    v16i8 out0, out1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask0, mask1, mask2, vshf; /* vshf gathers the src pixels to blend */
+    v16u8 dst0, dst1;
+    v8i16 res0, res1, res2, res3;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
+    v16i8 minus5b = __msa_ldi_b(-5); /* -5 in every byte lane */
+    v16i8 plus20b = __msa_ldi_b(20); /* 20 in every byte lane */
+    /* Three filter-tap masks, 16 bytes apart, from the start of the table. */
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+    /* Blend mask: table byte 112 when hor_offset is set, otherwise byte 96. */
+    if (hor_offset) {
+        vshf = LD_SB(&luma_mask_arr[16 + 96]);
+    } else {
+        vshf = LD_SB(&luma_mask_arr[96]);
+    }
+    /* Eight passes of two rows; every row is two loads 8 bytes apart. */
+    for (loop_cnt = 8; loop_cnt--;) {
+        LD_SB2(src, 8, src0, src1);
+        src += src_stride;
+        LD_SB2(src, 8, src2, src3);
+        src += src_stride;
+
+        LD_UB2(dst, dst_stride, dst0, dst1);
+        /* NOTE(review): looks like taps 1,-5,20,20,-5,1 on biased s8 data. */
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
+        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
+        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
+        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
+                     minus5b, res0, res1, res2, res3);
+        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
+                     plus20b, res0, res1, res2, res3);
+        VSHF_B2_SB(src0, src1, src2, src3, vshf, vshf, src0, src2);
+        SRARI_H4_SH(res0, res1, res2, res3, 5); /* presumably round, >> 5 */
+        SAT_SH4_SH(res0, res1, res2, res3, 7); /* presumably clamp to s8 */
+        PCKEV_B2_SB(res1, res0, res3, res2, out0, out1);
+        /* src0/src2 were overwritten above with the vshf-gathered pixels. */
+        out0 = __msa_aver_s_b(out0, src0);
+        out1 = __msa_aver_s_b(out1, src2);
+
+        XORI_B2_128_SB(out0, out1);
+        AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
+        ST_UB2(dst0, dst1, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void avc_luma_vt_and_aver_dst_4x4_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst, int32_t dst_stride)
+{   /* Vertically filters nine src rows into four rows averaged with dst. */
+    int16_t filt_const0 = 0xfb01; /* low byte 1, high byte 0xfb (-5 as s8) */
+    int16_t filt_const1 = 0x1414; /* both bytes 0x14 (20) */
+    int16_t filt_const2 = 0x1fb; /* low byte 0xfb (-5 as s8), high byte 1 */
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src2110, src4332, src6554, src8776;
+    v8i16 out10, out32;
+    v16i8 filt0, filt1, filt2;
+    v16u8 res;
+    /* Replicate each coefficient pair into every halfword lane. */
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+    /* First five rows; the remaining four are loaded further down. */
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+    /* NOTE(review): ILVR_* presumably pair adjacent rows byte by byte. */
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    XORI_B2_128_SB(src2110, src4332);
+    LD_SB4(src, src_stride, src5, src6, src7, src8);
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
+               src54_r, src65_r, src76_r, src87_r);
+    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
+    XORI_B2_128_SB(src6554, src8776);
+    out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
+    out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
+    SRARI_H2_SH(out10, out32, 5); /* presumably round, >> 5 */
+    SAT_SH2_SH(out10, out32, 7); /* presumably clamp to s8 */
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    res = PCKEV_XORI128_UB(out10, out32);
+    /* Gather the four dst rows into one vector to match the layout of res. */
+    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+
+    dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
+    dst0 = __msa_aver_u_b(res, dst0);
+
+    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void avc_luma_vt_and_aver_dst_8x8_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst, int32_t dst_stride)
+{   /* Vertically filters src into eight rows that are averaged with dst. */
+    int32_t loop_cnt;
+    int16_t filt_const0 = 0xfb01; /* low byte 1, high byte 0xfb (-5 as s8) */
+    int16_t filt_const1 = 0x1414; /* both bytes 0x14 (20) */
+    int16_t filt_const2 = 0x1fb; /* low byte 0xfb (-5 as s8), high byte 1 */
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 out0, out1, out2, out3;
+    v16i8 filt0, filt1, filt2;
+    /* Replicate each coefficient pair into every halfword lane. */
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+    /* Prime the window with five rows before the loop starts. */
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    /* Two passes; each loads four new rows and writes four dst rows. */
+    for (loop_cnt = 2; loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+        out0 = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
+        out1 = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
+        out2 = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
+        out3 = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
+        SRARI_H4_SH(out0, out1, out2, out3, 5); /* presumably round, >> 5 */
+        SAT_SH4_SH(out0, out1, out2, out3, 7); /* presumably clamp to s8 */
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+        CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
+                                dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* Carry the newest row pairs and last row over to the next pass. */
+        src10_r = src76_r;
+        src32_r = src98_r;
+        src21_r = src87_r;
+        src43_r = src109_r;
+        src4 = src10;
+    }
+}
+
+static void avc_luma_vt_and_aver_dst_16x16_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride)
+{   /* Vertically filters src into 16 rows that are averaged with dst. */
+    int32_t loop_cnt;
+    int16_t filt_const0 = 0xfb01; /* low byte 1, high byte 0xfb (-5 as s8) */
+    int16_t filt_const1 = 0x1414; /* both bytes 0x14 (20) */
+    int16_t filt_const2 = 0x1fb; /* low byte 0xfb (-5 as s8), high byte 1 */
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
+    v16i8 src65_l, src87_l;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+    v16i8 filt0, filt1, filt2;
+    v16u8 res0, res1, res2, res3;
+    /* Replicate each coefficient pair into every halfword lane. */
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+    /* Prime the window with five rows before the loop starts. */
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+    /* NOTE(review): _r / _l presumably hold the two 8-byte vector halves. */
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_l, src21_l, src32_l, src43_l);
+    /* Four passes; each loads four new rows and writes four dst rows. */
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   src54_r, src65_r, src76_r, src87_r);
+        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   src54_l, src65_l, src76_l, src87_l);
+        out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
+        out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
+        out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
+        out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
+        out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
+        out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
+        out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
+        out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, res0, res1, res2, res3);
+        XORI_B4_128_UB(res0, res1, res2, res3); /* presumably back to u8 */
+        AVER_UB4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3,
+                    res0, res1, res2, res3);
+        ST_UB4(res0, res1, res2, res3, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* Carry the newest row pairs and last row over to the next pass. */
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src10_l = src54_l;
+        src32_l = src76_l;
+        src21_l = src65_l;
+        src43_l = src87_l;
+        src4 = src8;
+    }
+}
+
+static void avc_luma_vt_qrt_and_aver_dst_4x4_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 uint8_t ver_offset)
+{   /* 4 rows: vertical filter, blend with src rows, then average with dst. */
+    int16_t filt_const0 = 0xfb01; /* low byte 1, high byte 0xfb (-5 as s8) */
+    int16_t filt_const1 = 0x1414; /* both bytes 0x14 (20) */
+    int16_t filt_const2 = 0x1fb; /* low byte 0xfb (-5 as s8), high byte 1 */
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src2110, src4332, src6554, src8776;
+    v8i16 out10, out32;
+    v16i8 filt0, filt1, filt2;
+    v16u8 res;
+    /* Replicate each coefficient pair into every halfword lane. */
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+    /* First five rows; the remaining four are loaded further down. */
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+    /* Only the interleaved copies get the XOR-128 bias; srcN stay raw. */
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    XORI_B2_128_SB(src2110, src4332);
+    LD_SB4(src, src_stride, src5, src6, src7, src8);
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
+               src54_r, src65_r, src76_r, src87_r);
+    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
+    XORI_B2_128_SB(src6554, src8776);
+    out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
+    out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
+    SRARI_H2_SH(out10, out32, 5); /* presumably round, >> 5 */
+    SAT_SH2_SH(out10, out32, 7); /* presumably clamp to s8 */
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    res = PCKEV_XORI128_UB(out10, out32);
+    /* Rows blended in: src3..src6 if ver_offset is set, else src2..src5. */
+    if (ver_offset) {
+        src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
+        src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
+    } else {
+        src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
+        src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
+    }
+    /* src32_r / src54_r are reused as scratch for the four packed rows. */
+    src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
+    res = __msa_aver_u_b(res, (v16u8) src32_r);
+    /* Gather the four dst rows into one vector to match the layout of res. */
+    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+
+    dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
+    dst0 = __msa_aver_u_b(res, dst0);
+
+    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void avc_luma_vt_qrt_and_aver_dst_8x8_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 uint8_t ver_offset)
+{   /* 8 rows: vertical filter, blend with src rows, then average with dst. */
+    int32_t loop_cnt;
+    int16_t filt_const0 = 0xfb01; /* low byte 1, high byte 0xfb (-5 as s8) */
+    int16_t filt_const1 = 0x1414; /* both bytes 0x14 (20) */
+    int16_t filt_const2 = 0x1fb; /* low byte 0xfb (-5 as s8), high byte 1 */
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 out0_r, out1_r, out2_r, out3_r;
+    v16i8 res0, res1;
+    v16u8 vec0, vec1;
+    v16i8 filt0, filt1, filt2;
+    /* Replicate each coefficient pair into every halfword lane. */
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+    /* Prime the window with five rows before the loop starts. */
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    /* Two passes; each loads four new rows and writes four dst rows. */
+    for (loop_cnt = 2; loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+        out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
+        out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
+        out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
+        out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, res0, res1);
+        /* src10_r / src32_r become scratch here; they are reset below. */
+        if (ver_offset) {
+            PCKEV_D2_SB(src4, src3, src8, src7, src10_r, src32_r);
+        } else {
+            PCKEV_D2_SB(src3, src2, src7, src4, src10_r, src32_r);
+        }
+
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+        /* Signed average: both operands still carry the XOR-128 bias. */
+        vec0 = (v16u8) __msa_aver_s_b(res0, src10_r);
+        vec1 = (v16u8) __msa_aver_s_b(res1, src32_r);
+
+        XORI_B2_128_UB(vec0, vec1);
+        AVER_UB2_UB(vec0, dst0, vec1, dst1, vec0, vec1);
+        ST8x4_UB(vec0, vec1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* Carry row pairs and the last three rows over to the next pass. */
+        src10_r = src76_r;
+        src32_r = src98_r;
+        src21_r = src87_r;
+        src43_r = src109_r;
+        src2 = src8;
+        src3 = src9;
+        src4 = src10;
+    }
+}
+
+static void avc_luma_vt_qrt_and_aver_dst_16x16_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   uint8_t ver_offset)
+{   /* 16 rows: vertical filter, blend with src rows, then average with dst. */
+    int32_t loop_cnt;
+    int16_t filt_const0 = 0xfb01; /* low byte 1, high byte 0xfb (-5 as s8) */
+    int16_t filt_const1 = 0x1414; /* both bytes 0x14 (20) */
+    int16_t filt_const2 = 0x1fb; /* low byte 0xfb (-5 as s8), high byte 1 */
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
+    v16i8 src65_l, src87_l;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+    v16i8 out0, out1, out2, out3;
+    v16i8 filt0, filt1, filt2;
+    v16u8 res0, res1, res2, res3;
+    /* Replicate each coefficient pair into every halfword lane. */
+    filt0 = (v16i8) __msa_fill_h(filt_const0);
+    filt1 = (v16i8) __msa_fill_h(filt_const1);
+    filt2 = (v16i8) __msa_fill_h(filt_const2);
+    /* Prime the window with five rows before the loop starts. */
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+    /* NOTE(review): _r / _l presumably hold the two 8-byte vector halves. */
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_l, src21_l, src32_l, src43_l);
+    /* Four passes; each loads four new rows and writes four dst rows. */
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   src54_r, src65_r, src76_r, src87_r);
+        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   src54_l, src65_l, src76_l, src87_l);
+        out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
+        out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
+        out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
+        out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
+        out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
+        out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
+        out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
+        out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_SB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, out0, out1, out2, out3);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        /* Rows blended in: src3..src6 if ver_offset is set, else src2..src5. */
+        if (ver_offset) {
+            res0 = (v16u8) __msa_aver_s_b(out0, src3);
+            res1 = (v16u8) __msa_aver_s_b(out1, src4);
+            res2 = (v16u8) __msa_aver_s_b(out2, src5);
+            res3 = (v16u8) __msa_aver_s_b(out3, src6);
+        } else {
+            res0 = (v16u8) __msa_aver_s_b(out0, src2);
+            res1 = (v16u8) __msa_aver_s_b(out1, src3);
+            res2 = (v16u8) __msa_aver_s_b(out2, src4);
+            res3 = (v16u8) __msa_aver_s_b(out3, src5);
+        }
+
+        XORI_B4_128_UB(res0, res1, res2, res3); /* presumably back to u8 */
+        AVER_UB4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3,
+                    dst0, dst1, dst2, dst3);
+        ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* Carry row pairs and the last three rows over to the next pass. */
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src10_l = src54_l;
+        src32_l = src76_l;
+        src21_l = src65_l;
+        src43_l = src87_l;
+        src2 = src6;
+        src3 = src7;
+        src4 = src8;
+    }
+}
+
+static void avc_luma_mid_and_aver_dst_4x4_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride)
+{   /* 4 rows: horizontal then vertical filter stage, averaged with dst. */
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
+    v8i16 res0, res1, res2, res3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    /* This width uses the three masks starting at byte 48 of the table. */
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    /* NOTE(review): each two-row call presumably yields both rows at once. */
+    hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
+                                                          mask0, mask1, mask2);
+    hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
+                                                          mask0, mask1, mask2);
+    /* NOTE(review): PCKOD presumably splits off the second row of a pair. */
+    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
+
+    hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
+    /* Four more rows, reusing src0..src3, give hz_out5..hz_out8. */
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+
+    hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
+                                                          mask0, mask1, mask2);
+    hz_out7 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
+                                                          mask0, mask1, mask2);
+
+    PCKOD_D2_SH(hz_out5, hz_out5, hz_out7, hz_out7, hz_out6, hz_out8);
+    /* Each output row combines six consecutive horizontal results. */
+    res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out0, hz_out1, hz_out2,
+                                             hz_out3, hz_out4, hz_out5);
+    res1 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out1, hz_out2, hz_out3,
+                                             hz_out4, hz_out5, hz_out6);
+    res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out2, hz_out3, hz_out4,
+                                             hz_out5, hz_out6, hz_out7);
+    res3 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out3, hz_out4, hz_out5,
+                                             hz_out6, hz_out7, hz_out8);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    tmp0 = PCKEV_XORI128_UB(res0, res1);
+    tmp1 = PCKEV_XORI128_UB(res2, res3);
+    PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2, tmp3);
+    AVER_UB2_UB(tmp0, tmp2, tmp1, tmp3, tmp0, tmp1);
+
+    ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
+}
+
+static void avc_luma_mid_and_aver_dst_8w_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst, int32_t dst_stride,
+                                             int32_t height)
+{   /* Horizontal then vertical filter stage, `height` rows, avg with dst. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
+    v16u8 dst0, dst1, dst2, dst3;
+    v8i16 res0, res1, res2, res3;
+    /* Three shuffle masks, 16 bytes apart, from the start of the table. */
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+    /* Prime the window: horizontal results for the first five rows. */
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    hz_out0 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
+    hz_out1 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
+    hz_out2 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
+    hz_out3 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
+    hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
+    /* Four rows per pass; a height remainder below four is not processed. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        hz_out5 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
+        hz_out6 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
+        hz_out7 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
+        hz_out8 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
+        /* Each output row combines six consecutive horizontal results. */
+        res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
+                                               hz_out3, hz_out4, hz_out5);
+        res1 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
+                                               hz_out4, hz_out5, hz_out6);
+        res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
+                                               hz_out5, hz_out6, hz_out7);
+        res3 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
+                                               hz_out6, hz_out7, hz_out8);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1, dst2, dst3,
+                                dst, dst_stride);
+        /* Shift hz_out4..8 down to hz_out0..4; hz_out5 doubles as a temp. */
+        dst += (4 * dst_stride);
+        hz_out3 = hz_out7;
+        hz_out1 = hz_out5;
+        hz_out5 = hz_out4; /* stash old hz_out4 before it is overwritten */
+        hz_out4 = hz_out8;
+        hz_out2 = hz_out6;
+        hz_out0 = hz_out5; /* old hz_out4 becomes the new first row */
+    }
+}
+
+/*
+ * 16-column, 16-row variant assembled from the 8-column kernel: the left
+ * eight columns are processed first, then the right eight.
+ */
+static void avc_luma_mid_and_aver_dst_16x16_msa(const uint8_t *src,
+                                                int32_t src_stride,
+                                                uint8_t *dst,
+                                                int32_t dst_stride)
+{
+    int32_t col;
+
+    for (col = 0; col < 16; col += 8) {
+        avc_luma_mid_and_aver_dst_8w_msa(src + col, src_stride,
+                                         dst + col, dst_stride, 16);
+    }
+}
+
+/*
+ * 4-column kernel that filters vertically first and horizontally second,
+ * blends the result with a coarser intermediate sample and finally averages
+ * with what is already stored in dst.
+ *
+ * src, src_stride: source rows; five rows are read up front and two more on
+ *                  every loop iteration.
+ * dst, dst_stride: destination rows; loaded for the final average and then
+ *                  written back, two rows per iteration.
+ * height:          output row count; the loop runs (height >> 1) times, so a
+ *                  trailing odd row is never produced.
+ * horiz_offset:    non-zero takes the __msa_ilvod_h path, zero takes the
+ *                  ILVEV_H2_SH path when picking the second average operand.
+ *
+ * NOTE(review): the LD_/XORI_/AVC_/VSHF_/DPADD_/SAT_/ST helpers are macros
+ * defined outside this chunk; what the comments below say about them is
+ * inferred from their names and call shapes -- confirm against the macro
+ * definitions.
+ */
+static void avc_luma_midh_qrt_and_aver_dst_4w_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  int32_t height,
+                                                  uint8_t horiz_offset)
+{
+    uint32_t row;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16u8 dst0, dst1, res;
+    v8i16 vt_res0, vt_res1, vt_res2, vt_res3;
+    v4i32 hz_res0, hz_res1;
+    v8i16 res0, res1;
+    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5;
+    /* halfword shuffle patterns handed to VSHF_H3_SH below */
+    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
+    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
+    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
+    /* weights passed to DPADD_SH2_SW for the shf_vec1/4 and shf_vec2/5 terms */
+    v8i16 minus5h = __msa_ldi_h(-5);
+    v8i16 plus20h = __msa_ldi_h(20);
+    v8i16 zeros = { 0 };
+
+    /* prime the row window with the first five source rows */
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    /* presumably xor with 128 to re-bias unsigned bytes to signed -- confirm */
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+
+    for (row = (height >> 1); row--;) {
+        LD_SB2(src, src_stride, src5, src6);
+        src += (2 * src_stride);
+
+        XORI_B2_128_SB(src5, src6);
+        LD_UB2(dst, dst_stride, dst0, dst1);
+
+        /* interleave the low words of both dst rows into a single vector */
+        dst0 = (v16u8) __msa_ilvr_w((v4i32) dst1, (v4i32) dst0);
+
+        /* vertical pass: rows 0..5 feed the first output row, rows 1..6 the
+         * second */
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
+                                        vt_res0, vt_res1);
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
+                                        vt_res2, vt_res3);
+        /* regroup the 16-bit vertical results with the three masks */
+        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1,
+                   mask0, mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
+        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3,
+                   mask0, mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
+
+        /* horizontal pass in 32 bits: pairwise add of shf_vec0, then
+         * (presumably) accumulate shf_vec1 * -5 and shf_vec2 * 20 */
+        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
+        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
+
+        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
+        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
+
+        /* rounding shift by 10, then saturate (bit-width argument 7) */
+        SRARI_W2_SW(hz_res0, hz_res1, 10);
+        SAT_SW2_SW(hz_res0, hz_res1, 7);
+
+        /* second average operand: vertical-only values, rounding shift by 5 */
+        res0 = __msa_srari_h(shf_vec2, 5);
+        res1 = __msa_srari_h(shf_vec5, 5);
+
+        SAT_SH2_SH(res0, res1, 7);
+
+        /* keep either the odd or the even halfwords of that operand */
+        if (horiz_offset) {
+            res0 = __msa_ilvod_h(zeros, res0);
+            res1 = __msa_ilvod_h(zeros, res1);
+        } else {
+            ILVEV_H2_SH(res0, zeros, res1, zeros, res0, res1);
+        }
+        /* signed average of the two estimates, then narrow back to halfwords */
+        hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) res0);
+        hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) res1);
+        res0 = __msa_pckev_h((v8i16) hz_res1, (v8i16) hz_res0);
+
+        res = PCKEV_XORI128_UB(res0, res0);
+
+        /* unsigned byte average with the bytes previously held in dst */
+        dst0 = __msa_aver_u_b(res, dst0);
+
+        ST4x2_UB(dst0, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        /* slide the row window down by two rows */
+        src0 = src2;
+        src1 = src3;
+        src2 = src4;
+        src3 = src5;
+        src4 = src6;
+    }
+}
+
+/*
+ * 8-column variant: runs the 4-column kernel on columns 0..3 and then on
+ * columns 4..7, forwarding height and horiz_offset unchanged.
+ */
+static void avc_luma_midh_qrt_and_aver_dst_8w_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  int32_t height,
+                                                  uint8_t horiz_offset)
+{
+    int32_t col;
+
+    for (col = 0; col < 8; col += 4) {
+        avc_luma_midh_qrt_and_aver_dst_4w_msa(src + col, src_stride,
+                                              dst + col, dst_stride,
+                                              height, horiz_offset);
+    }
+}
+
+/*
+ * 16-column variant: runs the 4-column kernel four times, left to right,
+ * forwarding height and horiz_offset unchanged.
+ */
+static void avc_luma_midh_qrt_and_aver_dst_16w_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int32_t height,
+                                                   uint8_t horiz_offset)
+{
+    int32_t col;
+
+    for (col = 0; col < 16; col += 4) {
+        avc_luma_midh_qrt_and_aver_dst_4w_msa(src + col, src_stride,
+                                              dst + col, dst_stride,
+                                              height, horiz_offset);
+    }
+}
+
+/*
+ * 4-column kernel that filters horizontally first and vertically second,
+ * averages the result with one of the rounded horizontal intermediates and
+ * then with the bytes already stored in dst.
+ *
+ * src, src_stride: source rows; five rows are read up front and two more on
+ *                  every loop iteration.
+ * dst, dst_stride: destination rows; loaded for the final average and then
+ *                  written back as one 32-bit word per row.
+ * height:          output row count; the loop runs (height >> 1) times.
+ * ver_offset:      non-zero uses hz_out3/hz_out4 as the second average
+ *                  operand, zero uses hz_out2/hz_out3.
+ *
+ * NOTE(review): luma_mask_arr and the LD_/XORI_/AVC_/PCKOD_/SAT_/AVER_/SW
+ * helpers are defined outside this chunk; statements about them below are
+ * inferred from names and call shapes -- confirm against their definitions.
+ */
+static void avc_luma_midv_qrt_and_aver_dst_4w_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  int32_t height,
+                                                  uint8_t ver_offset)
+{
+    int32_t loop_cnt;
+    int32_t out0, out1;
+    v16i8 src0, src1, src2, src3, src4;
+    v16u8 dst0, dst1;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 hz_out4, hz_out5, hz_out6;
+    v8i16 res0, res1, res2, res3;
+    v16u8 vec0, vec1;
+
+    /* three shuffle masks read 16 bytes apart, starting at table offset 48 */
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+    /* prime the window with the first five source rows */
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+
+    /* each call takes a pair of rows and yields one vector of results */
+    hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
+                                                          mask0, mask1, mask2);
+    hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
+                                                          mask0, mask1, mask2);
+
+    /* presumably moves the upper doubleword (second row of each pair) of
+     * hz_out0/hz_out2 into hz_out1/hz_out3 -- confirm */
+    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
+
+    /* fifth row is filtered on its own */
+    hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, src_stride, src0, src1);
+        src += (2 * src_stride);
+
+        XORI_B2_128_SB(src0, src1);
+        LD_UB2(dst, dst_stride, dst0, dst1);
+        hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
+                                                              mask0, mask1,
+                                                              mask2);
+        /* upper doubleword of hz_out5 replicated into hz_out6 */
+        hz_out6 = (v8i16) __msa_pckod_d((v2i64) hz_out5, (v2i64) hz_out5);
+        /* vertical pass over six consecutive horizontal results per row */
+        res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out0, hz_out1, hz_out2,
+                                                 hz_out3, hz_out4, hz_out5);
+        res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out1, hz_out2, hz_out3,
+                                                 hz_out4, hz_out5, hz_out6);
+
+        /* second average operand: horizontal-only rows, rounding shift by 5;
+         * ver_offset moves the pick one row further down */
+        if (ver_offset) {
+            res1 = __msa_srari_h(hz_out3, 5);
+            res3 = __msa_srari_h(hz_out4, 5);
+        } else {
+            res1 = __msa_srari_h(hz_out2, 5);
+            res3 = __msa_srari_h(hz_out3, 5);
+        }
+
+        SAT_SH2_SH(res1, res3, 7);
+
+        /* signed halfword average of the two estimates (res1 is reused as
+         * the output for the second row) */
+        res0 = __msa_aver_s_h(res0, res1);
+        res1 = __msa_aver_s_h(res2, res3);
+
+        vec0 = PCKEV_XORI128_UB(res0, res0);
+        vec1 = PCKEV_XORI128_UB(res1, res1);
+
+        /* average with the bytes previously held in dst */
+        AVER_UB2_UB(vec0, dst0, vec1, dst1, dst0, dst1);
+
+        /* only word 0 (four bytes) of each result row is stored */
+        out0 = __msa_copy_u_w((v4i32) dst0, 0);
+        out1 = __msa_copy_u_w((v4i32) dst1, 0);
+        SW(out0, dst);
+        dst += dst_stride;
+        SW(out1, dst);
+        dst += dst_stride;
+
+        /* slide the window of horizontal results down by two rows */
+        hz_out0 = hz_out2;
+        hz_out1 = hz_out3;
+        hz_out2 = hz_out4;
+        hz_out3 = hz_out5;
+        hz_out4 = hz_out6;
+    }
+}
+
+/*
+ * 8-column kernel that filters horizontally first and vertically second,
+ * averages the result with one of the rounded horizontal intermediates and
+ * then (via CONVERT_UB_AVG_ST8x4_UB) with the rows loaded from dst.
+ *
+ * src, src_stride: source rows; five rows are read up front and four more on
+ *                  every loop iteration.
+ * dst, dst_stride: destination rows; four rows are loaded and handed to the
+ *                  store macro per iteration.
+ * height:          output row count; the loop runs (height >> 2) times.
+ * vert_offset:     non-zero uses hz_out3..hz_out6 as the second average
+ *                  operands, zero uses hz_out2..hz_out5.
+ *
+ * NOTE(review): luma_mask_arr and the LD_/XORI_/AVC_/SAT_/CONVERT_ helpers
+ * are defined outside this chunk; statements about them are inferred from
+ * names and call shapes -- confirm against their definitions.
+ */
+static void avc_luma_midv_qrt_and_aver_dst_8w_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  int32_t height,
+                                                  uint8_t vert_offset)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
+    v8i16 res0, res1, res2, res3;
+    v8i16 res4, res5, res6, res7;
+
+    /* three shuffle masks read 16 bytes apart from the start of the table */
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    /* prime the window with the first five source rows */
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    /* one horizontal result vector per source row */
+    hz_out0 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
+    hz_out1 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
+    hz_out2 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
+    hz_out3 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
+    hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+        hz_out5 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
+        hz_out6 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
+        hz_out7 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
+        hz_out8 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
+
+        /* vertical pass: each output row consumes six consecutive
+         * horizontal results, the window moving down one row at a time */
+        res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
+                                               hz_out3, hz_out4, hz_out5);
+        res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
+                                               hz_out4, hz_out5, hz_out6);
+        res4 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
+                                               hz_out5, hz_out6, hz_out7);
+        res6 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
+                                               hz_out6, hz_out7, hz_out8);
+
+        /* second average operands: horizontal-only rows, rounding shift by
+         * 5; vert_offset moves the pick one row further down */
+        if (vert_offset) {
+            res1 = __msa_srari_h(hz_out3, 5);
+            res3 = __msa_srari_h(hz_out4, 5);
+            res5 = __msa_srari_h(hz_out5, 5);
+            res7 = __msa_srari_h(hz_out6, 5);
+        } else {
+            res1 = __msa_srari_h(hz_out2, 5);
+            res3 = __msa_srari_h(hz_out3, 5);
+            res5 = __msa_srari_h(hz_out4, 5);
+            res7 = __msa_srari_h(hz_out5, 5);
+        }
+
+        SAT_SH4_SH(res1, res3, res5, res7, 7);
+
+        /* signed halfword averages; res0..res3 are reused as the outputs */
+        res0 = __msa_aver_s_h(res0, res1);
+        res1 = __msa_aver_s_h(res2, res3);
+        res2 = __msa_aver_s_h(res4, res5);
+        res3 = __msa_aver_s_h(res6, res7);
+
+        /* presumably narrows to bytes, averages with dst0..dst3 and stores
+         * four 8-byte rows -- confirm against the macro */
+        CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1, dst2, dst3,
+                                dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        /* slide the window of horizontal results down by four rows */
+        hz_out0 = hz_out4;
+        hz_out1 = hz_out5;
+        hz_out2 = hz_out6;
+        hz_out3 = hz_out7;
+        hz_out4 = hz_out8;
+    }
+}
+
+/*
+ * 16-column variant: runs the 8-column kernel on the left half and then on
+ * the right half, forwarding height and vert_offset unchanged.
+ */
+static void avc_luma_midv_qrt_and_aver_dst_16w_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int32_t height,
+                                                   uint8_t vert_offset)
+{
+    int32_t col;
+
+    for (col = 0; col < 16; col += 8) {
+        avc_luma_midv_qrt_and_aver_dst_8w_msa(src + col, src_stride,
+                                              dst + col, dst_stride,
+                                              height, vert_offset);
+    }
+}
+
+/*
+ * 4x4 kernel that combines a horizontally filtered estimate (rows read from
+ * src_x) with a vertically filtered estimate (rows read from src_y), then
+ * averages the combination with the bytes already stored in dst.
+ *
+ * src_x:      start of the four rows fed to the horizontal filter.
+ * src_y:      start of the nine rows (five, then four) fed to the vertical
+ *             filter.
+ * src_stride: row pitch used for both src_x and src_y.
+ * dst, dst_stride: destination; four rows are loaded for averaging and four
+ *             rows are stored.
+ *
+ * NOTE(review): luma_mask_arr and the LD_/XORI_/AVC_/SRARI_/SAT_/ST helpers
+ * are defined outside this chunk; statements about them are inferred from
+ * names and call shapes -- confirm against their definitions.
+ */
+static void avc_luma_hv_qrt_and_aver_dst_4x4_msa(const uint8_t *src_x,
+                                                 const uint8_t *src_y,
+                                                 int32_t src_stride,
+                                                 uint8_t *dst,
+                                                 int32_t dst_stride)
+{
+    v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
+    v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, vert_out0, vert_out1;
+    v8i16 res0, res1;
+    v16u8 res;
+
+    /* three shuffle masks read 16 bytes apart, starting at table offset 48 */
+    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
+    LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
+    src_y += (5 * src_stride);
+
+    /* put word 0 of the following row into word 1 of each row vector, so
+     * every vector carries two consecutive rows */
+    src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
+    src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
+    src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
+    src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
+
+    XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
+    LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
+    /* horizontal estimate: two rows per call, then round (>> 5) and clamp */
+    hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz0, src_hz1,
+                                                          mask0, mask1, mask2);
+    hz_out1 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz2, src_hz3,
+                                                          mask0, mask1, mask2);
+    SRARI_H2_SH(hz_out0, hz_out1, 5);
+    SAT_SH2_SH(hz_out0, hz_out1, 7);
+    /* remaining four rows for the vertical filter */
+    LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
+
+    /* same two-rows-per-vector packing as above; src_vt8 is only used as the
+     * partner of src_vt7 */
+    src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
+    src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
+    src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
+    src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
+
+    XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
+
+    /* vertical estimate: six packed row vectors per call, then round (>> 5)
+     * and clamp */
+    vert_out0 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt0, src_vt1, src_vt2,
+                                                  src_vt3, src_vt4, src_vt5);
+    vert_out1 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt2, src_vt3, src_vt4,
+                                                  src_vt5, src_vt6, src_vt7);
+    SRARI_H2_SH(vert_out0, vert_out1, 5);
+    SAT_SH2_SH(vert_out0, vert_out1, 7);
+
+    /* rounded mean of the horizontal and vertical estimates */
+    res1 = __msa_srari_h((hz_out1 + vert_out1), 1);
+    res0 = __msa_srari_h((hz_out0 + vert_out0), 1);
+
+    SAT_SH2_SH(res0, res1, 7);
+    res = PCKEV_XORI128_UB(res0, res1);
+
+    /* gather word 0 of the four dst rows into one vector, then average */
+    dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
+    dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);
+    dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
+    dst0 = __msa_aver_u_b(res, dst0);
+
+    /* presumably stores words 0..3 of dst0 to four consecutive rows */
+    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+}
+
+/*
+ * 8x8 kernel that combines a horizontally filtered estimate (rows read from
+ * src_x) with a vertically filtered estimate (rows read from src_y), then
+ * averages the combination with the rows loaded from dst. The block is
+ * produced in two passes of four rows each.
+ *
+ * src_x:      start of the rows fed to the horizontal filter (four per pass).
+ * src_y:      start of the rows fed to the vertical filter (five up front,
+ *             four more per pass).
+ * src_stride: row pitch used for both src_x and src_y.
+ * dst, dst_stride: destination; four rows are loaded and handed to the store
+ *             macro per pass.
+ *
+ * NOTE(review): luma_mask_arr and the LD_/XORI_/AVC_/SRARI_/SAT_/CONVERT_
+ * helpers are defined outside this chunk; statements about them are inferred
+ * from names and call shapes -- confirm against their definitions.
+ */
+static void avc_luma_hv_qrt_and_aver_dst_8x8_msa(const uint8_t *src_x,
+                                                 const uint8_t *src_y,
+                                                 int32_t src_stride,
+                                                 uint8_t *dst,
+                                                 int32_t dst_stride)
+{
+    uint32_t loop_cnt;
+    v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 src_vt0, src_vt1, src_vt2, src_vt3;
+    v16i8 src_vt4, src_vt5, src_vt6, src_vt7, src_vt8;
+    v16i8 mask0, mask1, mask2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 vert_out0, vert_out1, vert_out2, vert_out3;
+    v8i16 out0, out1, out2, out3;
+
+    /* three shuffle masks read 16 bytes apart from the start of the table */
+    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
+
+    LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
+    src_y += (5 * src_stride);
+
+    /* put doubleword 0 of the following row into doubleword 1 of each row
+     * vector, so every vector carries two consecutive 8-byte rows */
+    src_vt0 = (v16i8) __msa_insve_d((v2i64) src_vt0, 1, (v2i64) src_vt1);
+    src_vt1 = (v16i8) __msa_insve_d((v2i64) src_vt1, 1, (v2i64) src_vt2);
+    src_vt2 = (v16i8) __msa_insve_d((v2i64) src_vt2, 1, (v2i64) src_vt3);
+    src_vt3 = (v16i8) __msa_insve_d((v2i64) src_vt3, 1, (v2i64) src_vt4);
+
+    /* src_vt4 stays raw here; it is paired and re-biased inside the loop */
+    XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
+
+    for (loop_cnt = 2; loop_cnt--;) {
+        LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
+        XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
+        src_x += (4 * src_stride);
+
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        /* horizontal estimate: one vector per row, round (>> 5) and clamp */
+        hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, mask0, mask1, mask2);
+        hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, mask0, mask1, mask2);
+        hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, mask0, mask1, mask2);
+        hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, mask0, mask1, mask2);
+        SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
+        SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
+        LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
+        src_y += (4 * src_stride);
+
+        /* same two-rows-per-vector packing for the newly loaded rows */
+        src_vt4 = (v16i8) __msa_insve_d((v2i64) src_vt4, 1, (v2i64) src_vt5);
+        src_vt5 = (v16i8) __msa_insve_d((v2i64) src_vt5, 1, (v2i64) src_vt6);
+        src_vt6 = (v16i8) __msa_insve_d((v2i64) src_vt6, 1, (v2i64) src_vt7);
+        src_vt7 = (v16i8) __msa_insve_d((v2i64) src_vt7, 1, (v2i64) src_vt8);
+
+        XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
+        /* vertical estimate: each call consumes six packed vectors and
+         * fills two outputs; then round (>> 5) and clamp */
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt0, src_vt1, src_vt2, src_vt3,
+                                        src_vt4, src_vt5, vert_out0, vert_out1);
+        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt2, src_vt3, src_vt4, src_vt5,
+                                        src_vt6, src_vt7, vert_out2, vert_out3);
+        SRARI_H4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 5);
+        SAT_SH4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 7);
+
+        /* rounded mean of the horizontal and vertical estimates */
+        out0 = __msa_srari_h((hz_out0 + vert_out0), 1);
+        out1 = __msa_srari_h((hz_out1 + vert_out1), 1);
+        out2 = __msa_srari_h((hz_out2 + vert_out2), 1);
+        out3 = __msa_srari_h((hz_out3 + vert_out3), 1);
+
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        /* presumably narrows to bytes, averages with dst0..dst3 and stores
+         * four 8-byte rows -- confirm against the macro */
+        CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
+                                dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        /* slide the vertical window down four rows; src_vt8 is carried over
+         * raw and gets paired/re-biased at the top of the next pass */
+        src_vt0 = src_vt4;
+        src_vt1 = src_vt5;
+        src_vt2 = src_vt6;
+        src_vt3 = src_vt7;
+        src_vt4 = src_vt8;
+    }
+}
+
+/*
+ * 16x16 variant tiled from four 8x8 kernel calls, visited in raster order:
+ * top-left, top-right, bottom-left, bottom-right.
+ */
+static void avc_luma_hv_qrt_and_aver_dst_16x16_msa(const uint8_t *src_x,
+                                                   const uint8_t *src_y,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride)
+{
+    int32_t band, col;
+
+    for (band = 0; band < 2; band++) {
+        /* each band is eight rows tall */
+        const uint8_t *hz_row = src_x + band * (8 * src_stride);
+        const uint8_t *vt_row = src_y + band * (8 * src_stride);
+        uint8_t *out_row = dst + band * (8 * dst_stride);
+
+        for (col = 0; col < 16; col += 8) {
+            avc_luma_hv_qrt_and_aver_dst_8x8_msa(hz_row + col, vt_row + col,
+                                                 src_stride,
+                                                 out_row + col, dst_stride);
+        }
+    }
+}
+
+/*
+ * Copy the low 8 bytes of every source row to dst.
+ *
+ * Rows are moved in groups; the group size is picked by testing whether
+ * height is a multiple of 12, 8, 4 or 2 (in that order). Within a group all
+ * rows are loaded before any is stored (the 12-row case is handled as an
+ * 8-row group followed by a 4-row group). An odd height copies nothing.
+ */
+static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            int32_t height)
+{
+    int32_t groups;
+    uint64_t lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+
+    if (!(height % 12)) {
+        groups = height / 12;
+        while (groups--) {
+            /* first eight rows of the dozen */
+            LD_UB8(src, src_stride,
+                   row0, row1, row2, row3, row4, row5, row6, row7);
+            src += (8 * src_stride);
+
+            lo0 = __msa_copy_u_d((v2i64) row0, 0);
+            lo1 = __msa_copy_u_d((v2i64) row1, 0);
+            lo2 = __msa_copy_u_d((v2i64) row2, 0);
+            lo3 = __msa_copy_u_d((v2i64) row3, 0);
+            lo4 = __msa_copy_u_d((v2i64) row4, 0);
+            lo5 = __msa_copy_u_d((v2i64) row5, 0);
+            lo6 = __msa_copy_u_d((v2i64) row6, 0);
+            lo7 = __msa_copy_u_d((v2i64) row7, 0);
+
+            SD4(lo0, lo1, lo2, lo3, dst, dst_stride);
+            dst += (4 * dst_stride);
+            SD4(lo4, lo5, lo6, lo7, dst, dst_stride);
+            dst += (4 * dst_stride);
+
+            /* last four rows of the dozen */
+            LD_UB4(src, src_stride, row0, row1, row2, row3);
+            src += (4 * src_stride);
+
+            lo0 = __msa_copy_u_d((v2i64) row0, 0);
+            lo1 = __msa_copy_u_d((v2i64) row1, 0);
+            lo2 = __msa_copy_u_d((v2i64) row2, 0);
+            lo3 = __msa_copy_u_d((v2i64) row3, 0);
+
+            SD4(lo0, lo1, lo2, lo3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+        return;
+    }
+
+    if (!(height % 8)) {
+        groups = height >> 3;
+        while (groups--) {
+            LD_UB8(src, src_stride,
+                   row0, row1, row2, row3, row4, row5, row6, row7);
+            src += (8 * src_stride);
+
+            lo0 = __msa_copy_u_d((v2i64) row0, 0);
+            lo1 = __msa_copy_u_d((v2i64) row1, 0);
+            lo2 = __msa_copy_u_d((v2i64) row2, 0);
+            lo3 = __msa_copy_u_d((v2i64) row3, 0);
+            lo4 = __msa_copy_u_d((v2i64) row4, 0);
+            lo5 = __msa_copy_u_d((v2i64) row5, 0);
+            lo6 = __msa_copy_u_d((v2i64) row6, 0);
+            lo7 = __msa_copy_u_d((v2i64) row7, 0);
+
+            SD4(lo0, lo1, lo2, lo3, dst, dst_stride);
+            dst += (4 * dst_stride);
+            SD4(lo4, lo5, lo6, lo7, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+        return;
+    }
+
+    if (!(height % 4)) {
+        groups = height / 4;
+        while (groups--) {
+            LD_UB4(src, src_stride, row0, row1, row2, row3);
+            src += (4 * src_stride);
+            lo0 = __msa_copy_u_d((v2i64) row0, 0);
+            lo1 = __msa_copy_u_d((v2i64) row1, 0);
+            lo2 = __msa_copy_u_d((v2i64) row2, 0);
+            lo3 = __msa_copy_u_d((v2i64) row3, 0);
+
+            SD4(lo0, lo1, lo2, lo3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+        return;
+    }
+
+    if (!(height % 2)) {
+        groups = height / 2;
+        while (groups--) {
+            LD_UB2(src, src_stride, row0, row1);
+            src += (2 * src_stride);
+            lo0 = __msa_copy_u_d((v2i64) row0, 0);
+            lo1 = __msa_copy_u_d((v2i64) row1, 0);
+
+            SD(lo0, dst);
+            dst += dst_stride;
+            SD(lo1, dst);
+            dst += dst_stride;
+        }
+    }
+}
+
+/*
+ * Copy a region in 16-byte-wide strips, eight rows at a time.
+ *
+ * (width >> 4) strips are processed left to right; inside each strip
+ * (height >> 3) groups of eight rows are loaded and then stored. Any
+ * remainder of width below 16 or of height below 8 is not copied.
+ */
+static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  int32_t height, int32_t width)
+{
+    int32_t strips = width >> 4;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+
+    while (strips--) {
+        const uint8_t *in = src;
+        uint8_t *out = dst;
+        int32_t groups = height >> 3;
+
+        while (groups--) {
+            LD_UB8(in, src_stride,
+                   row0, row1, row2, row3, row4, row5, row6, row7);
+            in += (8 * src_stride);
+
+            ST_UB8(row0, row1, row2, row3, row4, row5, row6, row7,
+                   out, dst_stride);
+            out += (8 * dst_stride);
+        }
+
+        /* step to the next 16-byte strip */
+        src += 16;
+        dst += 16;
+    }
+}
+
+/*
+ * Copy 16-byte rows from src to dst.
+ *
+ * Heights that are multiples of 12 move 8 + 4 rows per round, multiples of 8
+ * are delegated to copy_16multx8mult_msa(), and multiples of 4 move four rows
+ * per round. Any other height copies nothing.
+ */
+static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    int32_t groups;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+
+    if (!(height % 12)) {
+        groups = height / 12;
+        while (groups--) {
+            /* eight rows ... */
+            LD_UB8(src, src_stride,
+                   row0, row1, row2, row3, row4, row5, row6, row7);
+            src += (8 * src_stride);
+            ST_UB8(row0, row1, row2, row3, row4, row5, row6, row7,
+                   dst, dst_stride);
+            dst += (8 * dst_stride);
+
+            /* ... followed by four more */
+            LD_UB4(src, src_stride, row0, row1, row2, row3);
+            src += (4 * src_stride);
+            ST_UB4(row0, row1, row2, row3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+        return;
+    }
+
+    if (!(height % 8)) {
+        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+        return;
+    }
+
+    if (!(height % 4)) {
+        groups = height >> 2;
+        while (groups--) {
+            LD_UB4(src, src_stride, row0, row1, row2, row3);
+            src += (4 * src_stride);
+
+            ST_UB4(row0, row1, row2, row3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+
+/*
+ * Average 4-byte rows of src into dst.
+ *
+ * Each row of src is averaged with the matching row of dst and word 0 of the
+ * result is stored back. Heights that are multiples of 4 move four rows per
+ * round, other even heights two rows per round; odd heights do nothing.
+ */
+static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride,
+                           int32_t height)
+{
+    int32_t groups;
+    uint32_t w0, w1, w2, w3;
+    v16u8 in0, in1, in2, in3;
+    v16u8 cur0, cur1, cur2, cur3;
+
+    if (!(height % 4)) {
+        groups = height / 4;
+        while (groups--) {
+            LD_UB4(src, src_stride, in0, in1, in2, in3);
+            src += (4 * src_stride);
+
+            LD_UB4(dst, dst_stride, cur0, cur1, cur2, cur3);
+
+            AVER_UB4_UB(in0, cur0, in1, cur1, in2, cur2, in3, cur3,
+                        cur0, cur1, cur2, cur3);
+
+            w0 = __msa_copy_u_w((v4i32) cur0, 0);
+            w1 = __msa_copy_u_w((v4i32) cur1, 0);
+            w2 = __msa_copy_u_w((v4i32) cur2, 0);
+            w3 = __msa_copy_u_w((v4i32) cur3, 0);
+            SW4(w0, w1, w2, w3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+        return;
+    }
+
+    if (!(height % 2)) {
+        groups = height / 2;
+        while (groups--) {
+            LD_UB2(src, src_stride, in0, in1);
+            src += (2 * src_stride);
+
+            LD_UB2(dst, dst_stride, cur0, cur1);
+
+            AVER_UB2_UB(in0, cur0, in1, cur1, cur0, cur1);
+
+            w0 = __msa_copy_u_w((v4i32) cur0, 0);
+            w1 = __msa_copy_u_w((v4i32) cur1, 0);
+            SW(w0, dst);
+            dst += dst_stride;
+            SW(w1, dst);
+            dst += dst_stride;
+        }
+    }
+}
+
+/*
+ * Average 8-byte rows of src into dst, four rows per round.
+ *
+ * Each row of src is averaged with the matching row of dst and doubleword 0
+ * of the result is stored back. (height / 4) rounds are executed, so rows
+ * beyond the last multiple of four are left untouched.
+ */
+static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride,
+                           int32_t height)
+{
+    int32_t groups = height / 4;
+    uint64_t d0, d1, d2, d3;
+    v16u8 in0, in1, in2, in3;
+    v16u8 cur0, cur1, cur2, cur3;
+
+    while (groups--) {
+        LD_UB4(src, src_stride, in0, in1, in2, in3);
+        src += (4 * src_stride);
+        LD_UB4(dst, dst_stride, cur0, cur1, cur2, cur3);
+
+        AVER_UB4_UB(in0, cur0, in1, cur1, in2, cur2, in3, cur3,
+                    cur0, cur1, cur2, cur3);
+
+        d0 = __msa_copy_u_d((v2i64) cur0, 0);
+        d1 = __msa_copy_u_d((v2i64) cur1, 0);
+        d2 = __msa_copy_u_d((v2i64) cur2, 0);
+        d3 = __msa_copy_u_d((v2i64) cur3, 0);
+        SD4(d0, d1, d2, d3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+/*
+ * Average 16-byte rows of src into dst, eight rows per round.
+ *
+ * (height / 8) rounds are executed, so rows beyond the last multiple of
+ * eight are left untouched.
+ */
+static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            int32_t height)
+{
+    int32_t groups = height / 8;
+    v16u8 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16u8 cur0, cur1, cur2, cur3, cur4, cur5, cur6, cur7;
+
+    while (groups--) {
+        LD_UB8(src, src_stride, in0, in1, in2, in3, in4, in5, in6, in7);
+        src += (8 * src_stride);
+        LD_UB8(dst, dst_stride, cur0, cur1, cur2, cur3, cur4, cur5, cur6, cur7);
+
+        /* upper four rows, then lower four rows */
+        AVER_UB4_UB(in0, cur0, in1, cur1, in2, cur2, in3, cur3,
+                    cur0, cur1, cur2, cur3);
+        AVER_UB4_UB(in4, cur4, in5, cur5, in6, cur6, in7, cur7,
+                    cur4, cur5, cur6, cur7);
+        ST_UB8(cur0, cur1, cur2, cur3, cur4, cur5, cur6, cur7, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+/* put, 16-wide, mc00: plain copy of 16 rows; src and dst share one stride. */
+void ff_put_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    const int32_t rows = 16;
+
+    copy_width16_msa(src, stride, dst, stride, rows);
+}
+
+/* put, 8-wide, mc00: plain copy of 8 rows; src and dst share one stride. */
+void ff_put_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    const int32_t rows = 8;
+
+    copy_width8_msa(src, stride, dst, stride, rows);
+}
+
+/* avg, 16-wide, mc00: averages 16 rows of src into dst; shared stride. */
+void ff_avg_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    const int32_t rows = 16;
+
+    avg_width16_msa(src, stride, dst, stride, rows);
+}
+
+/* avg, 8-wide, mc00: averages 8 rows of src into dst; shared stride. */
+void ff_avg_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    const int32_t rows = 8;
+
+    avg_width8_msa(src, stride, dst, stride, rows);
+}
+
+/* avg, 4-wide, mc00: averages 4 rows of src into dst; shared stride. */
+void ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    const int32_t rows = 4;
+
+    avg_width4_msa(src, stride, dst, stride, rows);
+}
+
+/*
+ * put, 16-wide, mc10: horizontal quarter kernel over 16 rows, reading from
+ * two bytes left of src, with the offset flag cleared.
+ */
+void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    const uint8_t *row_start = src - 2;
+
+    avc_luma_hz_qrt_16w_msa(row_start, stride, dst, stride, 16, 0);
+}
+
+/*
+ * put, 16-wide, mc30: horizontal quarter kernel over 16 rows, reading from
+ * two bytes left of src, with the offset flag set.
+ */
+void ff_put_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    const uint8_t *row_start = src - 2;
+
+    avc_luma_hz_qrt_16w_msa(row_start, stride, dst, stride, 16, 1);
+}
+
+/*
+ * put, 8-wide, mc10: horizontal quarter kernel over 8 rows, reading from
+ * two bytes left of src, with the offset flag cleared.
+ */
+void ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    const uint8_t *row_start = src - 2;
+
+    avc_luma_hz_qrt_8w_msa(row_start, stride, dst, stride, 8, 0);
+}
+
+/*
+ * put, 8-wide, mc30: horizontal quarter kernel over 8 rows, reading from
+ * two bytes left of src, with the offset flag set.
+ */
+void ff_put_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    const uint8_t *row_start = src - 2;
+
+    avc_luma_hz_qrt_8w_msa(row_start, stride, dst, stride, 8, 1);
+}
+
+/*
+ * put, 4-wide, mc10: horizontal quarter kernel over 4 rows, reading from
+ * two bytes left of src, with the offset flag cleared.
+ */
+void ff_put_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    const uint8_t *row_start = src - 2;
+
+    avc_luma_hz_qrt_4w_msa(row_start, stride, dst, stride, 4, 0);
+}
+
+/*
+ * put, 4-wide, mc30: horizontal quarter kernel over 4 rows, reading from
+ * two bytes left of src, with the offset flag set.
+ */
+void ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    const uint8_t *row_start = src - 2;
+
+    avc_luma_hz_qrt_4w_msa(row_start, stride, dst, stride, 4, 1);
+}
+
+/*
+ * put, 16-wide, mc20: horizontal kernel over 16 rows, reading from two bytes
+ * left of src.
+ */
+void ff_put_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    const uint8_t *row_start = src - 2;
+
+    avc_luma_hz_16w_msa(row_start, stride, dst, stride, 16);
+}
+
+/*
+ * put, 8-wide, mc20: horizontal kernel over 8 rows, reading from two bytes
+ * left of src.
+ */
+void ff_put_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    const uint8_t *row_start = src - 2;
+
+    avc_luma_hz_8w_msa(row_start, stride, dst, stride, 8);
+}
+
+/*
+ * put, 4-wide, mc20: horizontal kernel over 4 rows, reading from two bytes
+ * left of src.
+ */
+void ff_put_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    const uint8_t *row_start = src - 2;
+
+    avc_luma_hz_4w_msa(row_start, stride, dst, stride, 4);
+}
+
+/*
+ * put, 16-wide, mc01: vertical quarter kernel over 16 rows, reading from two
+ * rows above src, with the offset flag cleared.
+ */
+void ff_put_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    const uint8_t *top_row = src - (stride * 2);
+
+    avc_luma_vt_qrt_16w_msa(top_row, stride, dst, stride, 16, 0);
+}
+
+/*
+ * put, 16-wide, mc03: vertical quarter kernel over 16 rows, reading from two
+ * rows above src, with the offset flag set.
+ */
+void ff_put_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    const uint8_t *top_row = src - (stride * 2);
+
+    avc_luma_vt_qrt_16w_msa(top_row, stride, dst, stride, 16, 1);
+}
+
+/*
+ * put, 8-wide, mc01: vertical quarter kernel over 8 rows, reading from two
+ * rows above src, with the offset flag cleared.
+ */
+void ff_put_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    const uint8_t *top_row = src - (stride * 2);
+
+    avc_luma_vt_qrt_8w_msa(top_row, stride, dst, stride, 8, 0);
+}
+
+/* put qpel8 mc03: vt_qrt on src - 2 * stride, last arg 1. */
+void ff_put_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_vt_qrt_8w_msa(src - 2 * stride, stride, dst, stride, 8, 1);
+}
+
+/* put qpel4 mc01: vt_qrt on src - 2 * stride, last arg 0. */
+void ff_put_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_vt_qrt_4w_msa(src - 2 * stride, stride, dst, stride, 4, 0);
+}
+
+/* put qpel4 mc03: vt_qrt on src - 2 * stride, last arg 1. */
+void ff_put_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_vt_qrt_4w_msa(src - 2 * stride, stride, dst, stride, 4, 1);
+}
+
+/* put qpel16 mc11: hv_qrt on src - 2 and src - 2 * stride. */
+void ff_put_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_16w_msa(src - 2, src - 2 * stride,
+                            stride, dst, stride, 16);
+}
+
+/* put qpel16 mc31: hv_qrt on src - 2 and src - 2 * stride + 1. */
+void ff_put_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_16w_msa(src - 2,
+                            src - 2 * stride + 1,
+                            stride, dst, stride, 16);
+}
+
+/* put qpel16 mc13: hv_qrt on src + stride - 2 and src - 2 * stride. */
+void ff_put_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_16w_msa(src + stride - 2, src - 2 * stride,
+                            stride, dst, stride, 16);
+}
+
+/* put qpel16 mc33: hv_qrt on src + stride - 2 and src - 2 * stride + 1. */
+void ff_put_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_16w_msa(src + stride - 2,
+                            src - 2 * stride + 1,
+                            stride, dst, stride, 16);
+}
+
+/* put qpel8 mc11: hv_qrt on src - 2 and src - 2 * stride. */
+void ff_put_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_8w_msa(src - 2, src - 2 * stride, stride, dst, stride, 8);
+}
+
+/* put qpel8 mc31: hv_qrt on src - 2 and src - 2 * stride + 1. */
+void ff_put_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_8w_msa(src - 2,
+                           src - 2 * stride + 1,
+                           stride, dst, stride, 8);
+}
+
+/* put qpel8 mc13: hv_qrt on src + stride - 2 and src - 2 * stride. */
+void ff_put_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_8w_msa(src + stride - 2, src - 2 * stride,
+                           stride, dst, stride, 8);
+}
+
+/* put qpel8 mc33: hv_qrt on src + stride - 2 and src - 2 * stride + 1. */
+void ff_put_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_8w_msa(src + stride - 2,
+                           src - 2 * stride + 1,
+                           stride, dst, stride, 8);
+}
+
+
+/* put qpel4 mc11: hv_qrt on src - 2 and src - 2 * stride. */
+void ff_put_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_4w_msa(src - 2, src - 2 * stride, stride, dst, stride, 4);
+}
+
+/* put qpel4 mc31: hv_qrt on src - 2 and src - 2 * stride + 1. */
+void ff_put_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_4w_msa(src - 2,
+                           src - 2 * stride + 1,
+                           stride, dst, stride, 4);
+}
+
+/* put qpel4 mc13: hv_qrt on src + stride - 2 and src - 2 * stride. */
+void ff_put_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_4w_msa(src + stride - 2, src - 2 * stride,
+                           stride, dst, stride, 4);
+}
+
+/* put qpel4 mc33: hv_qrt on src + stride - 2 and src - 2 * stride + 1. */
+void ff_put_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_4w_msa(src + stride - 2,
+                           src - 2 * stride + 1,
+                           stride, dst, stride, 4);
+}
+
+/* put qpel16 mc21: midv_qrt on src - 2 * stride - 2, last arg 0. */
+void ff_put_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_midv_qrt_16w_msa(src - 2 * stride - 2,
+                              stride, dst, stride, 16, 0);
+}
+
+/* put qpel16 mc23: midv_qrt on src - 2 * stride - 2, last arg 1. */
+void ff_put_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_midv_qrt_16w_msa(src - 2 * stride - 2,
+                              stride, dst, stride, 16, 1);
+}
+
+/* put qpel8 mc21: midv_qrt on src - 2 * stride - 2, last arg 0. */
+void ff_put_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_midv_qrt_8w_msa(src - 2 * stride - 2, stride, dst, stride, 8, 0);
+}
+
+/* put qpel8 mc23: midv_qrt on src - 2 * stride - 2, last arg 1. */
+void ff_put_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_midv_qrt_8w_msa(src - 2 * stride - 2, stride, dst, stride, 8, 1);
+}
+
+/* put qpel4 mc21: midv_qrt on src - 2 * stride - 2, last arg 0. */
+void ff_put_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_midv_qrt_4w_msa(src - 2 * stride - 2, stride, dst, stride, 4, 0);
+}
+
+/* put qpel4 mc23: midv_qrt on src - 2 * stride - 2, last arg 1. */
+void ff_put_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_midv_qrt_4w_msa(src - 2 * stride - 2, stride, dst, stride, 4, 1);
+}
+
+/* put qpel16 mc02: vt on src - 2 * stride. */
+void ff_put_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_vt_16w_msa(src - 2 * stride, stride, dst, stride, 16);
+}
+
+/* put qpel8 mc02: vt on src - 2 * stride. */
+void ff_put_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_vt_8w_msa(src - 2 * stride, stride, dst, stride, 8);
+}
+
+/* put qpel4 mc02: vt on src - 2 * stride. */
+void ff_put_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_vt_4w_msa(src - 2 * stride, stride, dst, stride, 4);
+}
+
+/* put qpel16 mc12: midh_qrt on src - 2 * stride - 2, last arg 0. */
+void ff_put_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_midh_qrt_16w_msa(src - 2 * stride - 2,
+                              stride, dst, stride, 16, 0);
+}
+
+/* put qpel16 mc32: midh_qrt on src - 2 * stride - 2, last arg 1. */
+void ff_put_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_midh_qrt_16w_msa(src - 2 * stride - 2,
+                              stride, dst, stride, 16, 1);
+}
+
+/* put qpel8 mc12: midh_qrt on src - 2 * stride - 2, last arg 0. */
+void ff_put_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_midh_qrt_8w_msa(src - 2 * stride - 2, stride, dst, stride, 8, 0);
+}
+
+/* put qpel8 mc32: midh_qrt on src - 2 * stride - 2, last arg 1. */
+void ff_put_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_midh_qrt_8w_msa(src - 2 * stride - 2, stride, dst, stride, 8, 1);
+}
+
+/* put qpel4 mc12: midh_qrt on src - 2 * stride - 2, last arg 0. */
+void ff_put_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_midh_qrt_4w_msa(src - 2 * stride - 2, stride, dst, stride, 4, 0);
+}
+
+/* put qpel4 mc32: midh_qrt on src - 2 * stride - 2, last arg 1. */
+void ff_put_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_midh_qrt_4w_msa(src - 2 * stride - 2, stride, dst, stride, 4, 1);
+}
+
+/* put qpel16 mc22: mid on src - 2 * stride - 2. */
+void ff_put_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_mid_16w_msa(src - 2 * stride - 2, stride, dst, stride, 16);
+}
+
+/* put qpel8 mc22: mid on src - 2 * stride - 2. */
+void ff_put_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_mid_8w_msa(src - 2 * stride - 2, stride, dst, stride, 8);
+}
+
+/* put qpel4 mc22: mid on src - 2 * stride - 2. */
+void ff_put_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_mid_4w_msa(src - 2 * stride - 2, stride, dst, stride, 4);
+}
+
+/* avg qpel16 mc10: hz_qrt_and_aver_dst on src - 2, last arg 0. */
+void ff_avg_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hz_qrt_and_aver_dst_16x16_msa(src - 2, stride, dst, stride, 0);
+}
+
+/* avg qpel16 mc30: hz_qrt_and_aver_dst on src - 2, last arg 1. */
+void ff_avg_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hz_qrt_and_aver_dst_16x16_msa(src - 2, stride, dst, stride, 1);
+}
+
+/* avg qpel8 mc10: hz_qrt_and_aver_dst on src - 2, last arg 0. */
+void ff_avg_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hz_qrt_and_aver_dst_8x8_msa(src - 2, stride, dst, stride, 0);
+}
+
+/* avg qpel8 mc30: hz_qrt_and_aver_dst on src - 2, last arg 1. */
+void ff_avg_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hz_qrt_and_aver_dst_8x8_msa(src - 2, stride, dst, stride, 1);
+}
+
+/* avg qpel4 mc10: hz_qrt_and_aver_dst on src - 2, last arg 0. */
+void ff_avg_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hz_qrt_and_aver_dst_4x4_msa(src - 2, stride, dst, stride, 0);
+}
+
+/* avg qpel4 mc30: hz_qrt_and_aver_dst on src - 2, last arg 1. */
+void ff_avg_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hz_qrt_and_aver_dst_4x4_msa(src - 2, stride, dst, stride, 1);
+}
+
+/* avg qpel16 mc20: hz_and_aver_dst on src - 2. */
+void ff_avg_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hz_and_aver_dst_16x16_msa(src - 2, stride, dst, stride);
+}
+
+/* avg qpel8 mc20: hz_and_aver_dst on src - 2. */
+void ff_avg_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hz_and_aver_dst_8x8_msa(src - 2, stride, dst, stride);
+}
+
+/* avg qpel4 mc20: hz_and_aver_dst on src - 2. */
+void ff_avg_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hz_and_aver_dst_4x4_msa(src - 2, stride, dst, stride);
+}
+
+/* avg qpel16 mc01: vt_qrt_and_aver_dst on src - 2 * stride, last arg 0. */
+void ff_avg_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_vt_qrt_and_aver_dst_16x16_msa(src - 2 * stride,
+                                           stride, dst, stride, 0);
+}
+
+/* avg qpel16 mc03: vt_qrt_and_aver_dst on src - 2 * stride, last arg 1. */
+void ff_avg_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_vt_qrt_and_aver_dst_16x16_msa(src - 2 * stride,
+                                           stride, dst, stride, 1);
+}
+
+/* avg qpel8 mc01: vt_qrt_and_aver_dst on src - 2 * stride, last arg 0. */
+void ff_avg_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_vt_qrt_and_aver_dst_8x8_msa(src - 2 * stride,
+                                         stride, dst, stride, 0);
+}
+
+/* avg qpel8 mc03: vt_qrt_and_aver_dst on src - 2 * stride, last arg 1. */
+void ff_avg_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_vt_qrt_and_aver_dst_8x8_msa(src - 2 * stride,
+                                         stride, dst, stride, 1);
+}
+
+/* avg qpel4 mc01: vt_qrt_and_aver_dst on src - 2 * stride, last arg 0. */
+void ff_avg_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_vt_qrt_and_aver_dst_4x4_msa(src - 2 * stride,
+                                         stride, dst, stride, 0);
+}
+
+/* avg qpel4 mc03: vt_qrt_and_aver_dst on src - 2 * stride, last arg 1. */
+void ff_avg_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_vt_qrt_and_aver_dst_4x4_msa(src - 2 * stride,
+                                         stride, dst, stride, 1);
+}
+
+/* avg qpel16 mc11: hv_qrt_and_aver_dst on src - 2
+ * and src - 2 * stride. */
+void ff_avg_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_16x16_msa(src - 2, src - 2 * stride,
+                                           stride, dst, stride);
+}
+
+/* avg qpel16 mc31: hv_qrt_and_aver_dst on src - 2
+ * and src - 2 * stride + 1. */
+void ff_avg_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_16x16_msa(src - 2,
+                                           src - 2 * stride + 1,
+                                           stride, dst, stride);
+}
+
+/* avg qpel16 mc13: hv_qrt_and_aver_dst on src + stride - 2
+ * and src - 2 * stride. */
+void ff_avg_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2, src - 2 * stride,
+                                           stride, dst, stride);
+}
+
+/* avg qpel16 mc33: hv_qrt_and_aver_dst on src + stride - 2
+ * and src - 2 * stride + 1. */
+void ff_avg_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2,
+                                           src - 2 * stride + 1,
+                                           stride, dst, stride);
+}
+
+/* avg qpel8 mc11: hv_qrt_and_aver_dst on src - 2
+ * and src - 2 * stride. */
+void ff_avg_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_8x8_msa(src - 2, src - 2 * stride,
+                                         stride, dst, stride);
+}
+
+/* avg qpel8 mc31: hv_qrt_and_aver_dst on src - 2
+ * and src - 2 * stride + 1. */
+void ff_avg_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_8x8_msa(src - 2, src - 2 * stride + 1,
+                                         stride, dst, stride);
+}
+
+/* avg qpel8 mc13: hv_qrt_and_aver_dst on src + stride - 2
+ * and src - 2 * stride. */
+void ff_avg_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2, src - 2 * stride,
+                                         stride, dst, stride);
+}
+
+/* avg qpel8 mc33: hv_qrt_and_aver_dst on src + stride - 2
+ * and src - 2 * stride + 1. */
+void ff_avg_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2, src - 2 * stride + 1,
+                                         stride, dst, stride);
+}
+
+
+/* avg qpel4 mc11: hv_qrt_and_aver_dst on src - 2
+ * and src - 2 * stride. */
+void ff_avg_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_4x4_msa(src - 2, src - 2 * stride,
+                                         stride, dst, stride);
+}
+
+/* avg qpel4 mc31: hv_qrt_and_aver_dst on src - 2
+ * and src - 2 * stride + 1. */
+void ff_avg_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_4x4_msa(src - 2, src - 2 * stride + 1,
+                                         stride, dst, stride);
+}
+
+/* avg qpel4 mc13: hv_qrt_and_aver_dst on src + stride - 2
+ * and src - 2 * stride. */
+void ff_avg_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2, src - 2 * stride,
+                                         stride, dst, stride);
+}
+
+/* avg qpel4 mc33: hv_qrt_and_aver_dst on src + stride - 2
+ * and src - 2 * stride + 1. */
+void ff_avg_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2, src - 2 * stride + 1,
+                                         stride, dst, stride);
+}
+
+/* avg qpel16 mc21: midv_qrt_and_aver_dst on src - 2 * stride - 2, last arg 0. */
+void ff_avg_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_midv_qrt_and_aver_dst_16w_msa(src - 2 * stride - 2,
+                                           stride, dst, stride, 16, 0);
+}
+
+/* avg qpel16 mc23: midv_qrt_and_aver_dst on src - 2 * stride - 2, last arg 1. */
+void ff_avg_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_midv_qrt_and_aver_dst_16w_msa(src - 2 * stride - 2,
+                                           stride, dst, stride, 16, 1);
+}
+
+/* avg qpel8 mc21: midv_qrt_and_aver_dst on src - 2 * stride - 2, last arg 0. */
+void ff_avg_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_midv_qrt_and_aver_dst_8w_msa(src - 2 * stride - 2,
+                                          stride, dst, stride, 8, 0);
+}
+
+/* avg qpel8 mc23: midv_qrt_and_aver_dst on src - 2 * stride - 2, last arg 1. */
+void ff_avg_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_midv_qrt_and_aver_dst_8w_msa(src - 2 * stride - 2,
+                                          stride, dst, stride, 8, 1);
+}
+
+/* avg qpel4 mc21: midv_qrt_and_aver_dst on src - 2 * stride - 2, last arg 0. */
+void ff_avg_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_midv_qrt_and_aver_dst_4w_msa(src - 2 * stride - 2,
+                                          stride, dst, stride, 4, 0);
+}
+
+/* avg qpel4 mc23: midv_qrt_and_aver_dst on src - 2 * stride - 2, last arg 1. */
+void ff_avg_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_midv_qrt_and_aver_dst_4w_msa(src - 2 * stride - 2,
+                                          stride, dst, stride, 4, 1);
+}
+
+/* avg qpel16 mc02: vt_and_aver_dst on src - 2 * stride. */
+void ff_avg_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_vt_and_aver_dst_16x16_msa(src - 2 * stride, stride, dst, stride);
+}
+
+/* avg qpel8 mc02: vt_and_aver_dst on src - 2 * stride. */
+void ff_avg_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_vt_and_aver_dst_8x8_msa(src - 2 * stride, stride, dst, stride);
+}
+
+/* avg qpel4 mc02: vt_and_aver_dst on src - 2 * stride. */
+void ff_avg_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_vt_and_aver_dst_4x4_msa(src - 2 * stride, stride, dst, stride);
+}
+
+/* avg qpel16 mc12: midh_qrt_and_aver_dst on src - 2 * stride - 2, last arg 0. */
+void ff_avg_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_midh_qrt_and_aver_dst_16w_msa(src - 2 * stride - 2,
+                                           stride, dst, stride, 16, 0);
+}
+
+/* avg qpel16 mc32: midh_qrt_and_aver_dst on src - 2 * stride - 2, last arg 1. */
+void ff_avg_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_midh_qrt_and_aver_dst_16w_msa(src - 2 * stride - 2,
+                                           stride, dst, stride, 16, 1);
+}
+
+/* avg qpel8 mc12: midh_qrt_and_aver_dst on src - 2 * stride - 2, last arg 0. */
+void ff_avg_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_midh_qrt_and_aver_dst_8w_msa(src - 2 * stride - 2,
+                                          stride, dst, stride, 8, 0);
+}
+
+/* avg qpel8 mc32: midh_qrt_and_aver_dst on src - 2 * stride - 2, last arg 1. */
+void ff_avg_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_midh_qrt_and_aver_dst_8w_msa(src - 2 * stride - 2,
+                                          stride, dst, stride, 8, 1);
+}
+
+/* avg qpel4 mc12: midh_qrt_and_aver_dst on src - 2 * stride - 2, last arg 0. */
+void ff_avg_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_midh_qrt_and_aver_dst_4w_msa(src - 2 * stride - 2,
+                                          stride, dst, stride, 4, 0);
+}
+
+/* avg qpel4 mc32: midh_qrt_and_aver_dst on src - 2 * stride - 2, last arg 1. */
+void ff_avg_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_midh_qrt_and_aver_dst_4w_msa(src - 2 * stride - 2,
+                                          stride, dst, stride, 4, 1);
+}
+
+/* avg qpel16 mc22: mid_and_aver_dst on src - 2 * stride - 2. */
+void ff_avg_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_mid_and_aver_dst_16x16_msa(src - 2 * stride - 2,
+                                        stride, dst, stride);
+}
+
+/* avg qpel8 mc22: mid_and_aver_dst (8w variant) on src - 2 * stride - 2, last arg 8. */
+void ff_avg_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_mid_and_aver_dst_8w_msa(src - 2 * stride - 2,
+                                     stride, dst, stride, 8);
+}
+
+/* avg qpel4 mc22: mid_and_aver_dst on src - 2 * stride - 2. */
+void ff_avg_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    avc_luma_mid_and_aver_dst_4x4_msa(src - 2 * stride - 2,
+                                      stride, dst, stride);
+}
diff --git a/libavcodec/mips/hevc_idct_msa.c b/libavcodec/mips/hevc_idct_msa.c
new file mode 100644
index 0000000..975d91f
--- /dev/null
+++ b/libavcodec/mips/hevc_idct_msa.c
@@ -0,0 +1,939 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "libavcodec/mips/hevcdsp_mips.h"
+
+static const int16_t gt8x8_cnst[16] = { /* presumably 8x8 IDCT factors; confirm */
+    64, 64, 83, 36, 89, 50, 18, 75, 64, -64, 36, -83, 75, -89, -50, -18
+};
+
+static const int16_t gt16x16_cnst[64] = { /* 4 rows of 16; presumably 16x16 IDCT factors, confirm */
+    64, 83, 64, 36, 89, 75, 50, 18, 90, 80, 57, 25, 70, 87, 9, 43,
+    64, 36, -64, -83, 75, -18, -89, -50, 87, 9, -80, -70, -43, 57, -25, -90,
+    64, -36, -64, 83, 50, -89, 18, 75, 80, -70, -25, 90, -87, 9, 43, 57,
+    64, -83, 64, -36, 18, -50, 75, -89, 70, -87, 90, -80, 9, -43, -57, 25
+};
+
+static const int16_t gt32x32_cnst0[256] = { /* 16 rows of 16; presumably 32x32 IDCT factors, confirm */
+    90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4,
+    90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13,
+    88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22,
+    85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31,
+    82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38,
+    78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46,
+    73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54,
+    67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61,
+    61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67,
+    54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73,
+    46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78,
+    38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82,
+    31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85,
+    22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88,
+    13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90,
+    4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
+};
+
+static const int16_t gt32x32_cnst1[64] = { /* 4 rows of 16; presumably 32x32 IDCT factors, confirm */
+    90, 87, 80, 70, 57, 43, 25, 9, 87, 57, 9, -43, -80, -90, -70, -25,
+    80, 9, -70, -87, -25, 57, 90, 43, 70, -43, -87, 9, 90, 25, -80, -57,
+    57, -80, -25, 90, -9, -87, 43, 70, 43, -90, 57, 25, -87, 70, 9, -80,
+    25, -70, 90, -80, 43, 9, -57, 87, 9, -25, 43, -57, 70, -80, 87, -90
+};
+
+static const int16_t gt32x32_cnst2[16] = { /* presumably 32x32 IDCT factors; confirm */
+    89, 75, 50, 18, 75, -18, -89, -50, 50, -89, 18, 75, 18, -50, 75, -89
+};
+
+static const int16_t gt32x32_cnst3[16] = { /* presumably 32x32 IDCT factors; confirm */
+    64, 64, 64, 64, 83, 36, -36, -83, 64, -64, -64, 64, 36, -83, 83, -36
+};
+
+#define HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1,          \
+                         sum0, sum1, sum2, sum3, shift)       \
+{                                                             \
+    v4i32 vec0, vec1, vec2, vec3, vec4, vec5;                 \
+    v4i32 cnst64 = __msa_ldi_w(64);                           \
+    v4i32 cnst83 = __msa_ldi_w(83);                           \
+    v4i32 cnst36 = __msa_ldi_w(36);                           \
+    /* Results land in the sum0..sum3 macro arguments. */     \
+    DOTP_SH4_SW(in_r0, in_r1, in_l0, in_l1, cnst64, cnst64,   \
+                cnst83, cnst36, vec0, vec2, vec1, vec3);      \
+    DOTP_SH2_SW(in_l0, in_l1, cnst36, cnst83, vec4, vec5);    \
+    /* sum0,sum3 <- vec0 + vec2; sum1,sum2 <- vec0 - vec2. */ \
+    sum0 = vec0 + vec2;                                       \
+    sum1 = vec0 - vec2;                                       \
+    sum3 = sum0;                                              \
+    sum2 = sum1;                                              \
+    /* Collapse vec1,vec3 and vec4,vec5 into two terms. */    \
+    vec1 += vec3;                                             \
+    vec4 -= vec5;                                             \
+    /* Apply each term with + to one sum, - to the other. */  \
+    sum0 += vec1;                                             \
+    sum1 += vec4;                                             \
+    sum2 -= vec4;                                             \
+    sum3 -= vec1;                                             \
+    /* Presumably round-shift by `shift`, clamp (confirm). */ \
+    SRARI_W4_SW(sum0, sum1, sum2, sum3, shift);               \
+    SAT_SW4_SW(sum0, sum1, sum2, sum3, 15);                   \
+}
+
+#define HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, shift)  \
+{                                                                        \
+    v8i16 src0_r, src1_r, src2_r, src3_r;                                \
+    v8i16 src0_l, src1_l, src2_l, src3_l;                                \
+    v8i16 filt0, filter0, filter1, filter2, filter3;                     \
+    v4i32 temp0_r, temp1_r, temp2_r, temp3_r, temp4_r, temp5_r;          \
+    v4i32 temp0_l, temp1_l, temp2_l, temp3_l, temp4_l, temp5_l;          \
+    v4i32 sum0_r, sum1_r, sum2_r, sum3_r;                                \
+    v4i32 sum0_l, sum1_l, sum2_l, sum3_l;                                \
+    /* In-place: in0..in7 are read, then overwritten with results. */    \
+    ILVR_H4_SH(in4, in0, in6, in2, in5, in1, in3, in7,                   \
+               src0_r, src1_r, src2_r, src3_r);                          \
+    ILVL_H4_SH(in4, in0, in6, in2, in5, in1, in3, in7,                   \
+               src0_l, src1_l, src2_l, src3_l);                          \
+    /* `filter` is not a macro argument; the caller must provide it. */  \
+    filt0 = LD_SH(filter);                                               \
+    SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);             \
+    DOTP_SH4_SW(src0_r, src0_l, src1_r, src1_l, filter0, filter0,        \
+                filter1, filter1, temp0_r, temp0_l, temp1_r, temp1_l);   \
+    /* Shared base terms; sum2/sum3 start as copies of sum1/sum0. */     \
+    BUTTERFLY_4(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l,      \
+                sum1_l, sum1_r);                                         \
+    sum2_r = sum1_r;                                                     \
+    sum2_l = sum1_l;                                                     \
+    sum3_r = sum0_r;                                                     \
+    sum3_l = sum0_l;                                                     \
+    /* Terms from src2/src3 with filter2/filter3. */                     \
+    DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l,  filter2, filter2,       \
+                filter3, filter3, temp2_r, temp2_l, temp3_r, temp3_l);   \
+    /* Their sum is added for in0 and subtracted for in7. */             \
+    temp2_r += temp3_r;                                                  \
+    temp2_l += temp3_l;                                                  \
+    sum0_r += temp2_r;                                                   \
+    sum0_l += temp2_l;                                                   \
+    sum3_r -= temp2_r;                                                   \
+    sum3_l -= temp2_l;                                                   \
+    /* Presumably round-shift, clamp and pack to 16 bit (confirm). */    \
+    SRARI_W4_SW(sum0_r, sum0_l, sum3_r, sum3_l, shift);                  \
+    SAT_SW4_SW(sum0_r, sum0_l, sum3_r, sum3_l, 15);                      \
+    PCKEV_H2_SH(sum0_l, sum0_r, sum3_l, sum3_r, in0, in7);               \
+    DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l,  filter3, filter3,       \
+                filter2, filter2, temp4_r, temp4_l, temp5_r, temp5_l);   \
+    /* Filters swapped; temp4 - temp5 gives in3 (+) and in4 (-). */      \
+    temp4_r -= temp5_r;                                                  \
+    temp4_l -= temp5_l;                                                  \
+    sum1_r += temp4_r;                                                   \
+    sum1_l += temp4_l;                                                   \
+    sum2_r -= temp4_r;                                                   \
+    sum2_l -= temp4_l;                                                   \
+    /* Same sequence as above, results go to in3 and in4. */             \
+    SRARI_W4_SW(sum1_r, sum1_l, sum2_r, sum2_l, shift);                  \
+    SAT_SW4_SW(sum1_r, sum1_l, sum2_r, sum2_l, 15);                      \
+    PCKEV_H2_SH(sum1_l, sum1_r, sum2_l, sum2_r, in3, in4);               \
+    /* Second pass: constants reloaded from filter + 8. */               \
+    filt0 = LD_SH(filter + 8);                                           \
+    SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);             \
+    DOTP_SH4_SW(src0_r, src0_l, src1_r, src1_l,  filter0, filter0,       \
+                filter1, filter1, temp0_r, temp0_l, temp1_r, temp1_l);   \
+    /* Base terms rebuilt as in the first pass. */                       \
+    BUTTERFLY_4(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l,      \
+                sum1_l, sum1_r);                                         \
+    sum2_r = sum1_r;                                                     \
+    sum2_l = sum1_l;                                                     \
+    sum3_r = sum0_r;                                                     \
+    sum3_l = sum0_l;                                                     \
+    /* Terms from src2/src3 with the reloaded filter2/filter3. */        \
+    DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter2, filter2,        \
+                filter3, filter3, temp2_r, temp2_l, temp3_r, temp3_l);   \
+    /* Their sum is added for in1 and subtracted for in6. */             \
+    temp2_r += temp3_r;                                                  \
+    temp2_l += temp3_l;                                                  \
+    sum0_r += temp2_r;                                                   \
+    sum0_l += temp2_l;                                                   \
+    sum3_r -= temp2_r;                                                   \
+    sum3_l -= temp2_l;                                                   \
+    /* Same sequence as above, results go to in1 and in6. */             \
+    SRARI_W4_SW(sum0_r, sum0_l, sum3_r, sum3_l, shift);                  \
+    SAT_SW4_SW(sum0_r, sum0_l, sum3_r, sum3_l, 15);                      \
+    PCKEV_H2_SH(sum0_l, sum0_r, sum3_l, sum3_r, in1, in6);               \
+    DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter3, filter3,        \
+                filter2, filter2, temp4_r, temp4_l, temp5_r, temp5_l);   \
+    /* Signs flip versus pass one: in2 takes (-), in5 takes (+). */      \
+    temp4_r -= temp5_r;                                                  \
+    temp4_l -= temp5_l;                                                  \
+    sum1_r -= temp4_r;                                                   \
+    sum1_l -= temp4_l;                                                   \
+    sum2_r += temp4_r;                                                   \
+    sum2_l += temp4_l;                                                   \
+    /* Same sequence as above, results go to in2 and in5. */             \
+    SRARI_W4_SW(sum1_r, sum1_l, sum2_r, sum2_l, shift);                  \
+    SAT_SW4_SW(sum1_r, sum1_l, sum2_r, sum2_l, 15);                      \
+    PCKEV_H2_SH(sum1_l, sum1_r, sum2_l, sum2_r, in2, in5);               \
+}
+
+#define HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r,                \
+                           src4_r, src5_r, src6_r, src7_r,                \
+                           src0_l, src1_l, src2_l, src3_l,                \
+                           src4_l, src5_l, src6_l, src7_l, shift)         \
+{                                                                         \
+    int16_t *ptr0, *ptr1;                                                 \
+    v8i16 filt0, filt1, dst0, dst1;                                       \
+    v8i16 filter0, filter1, filter2, filter3;                             \
+    v4i32 temp0_r, temp1_r, temp0_l, temp1_l;                             \
+    v4i32 sum0_r, sum1_r, sum2_r, sum3_r, sum0_l, sum1_l, sum2_l;         \
+    v4i32 sum3_l, res0_r, res1_r, res0_l, res1_l;                         \
+    /* Uses buf_ptr, filter, j and k from the enclosing function. */      \
+    ptr0 = (buf_ptr + 112);                                               \
+    ptr1 = (buf_ptr + 128);                                               \
+    k = -1;                                                               \
+    /* 4 passes; each one stores 4 rows (a row is 16 int16). */           \
+    for (j = 0; j < 4; j++)                                               \
+    {                                                                     \
+        LD_SH2(filter, 8, filt0, filt1)                                   \
+        filter += 16;                                                     \
+        SPLATI_W2_SH(filt0, 0, filter0, filter1);                         \
+        SPLATI_W2_SH(filt1, 0, filter2, filter3);                         \
+        DOTP_SH4_SW(src0_r, src0_l, src4_r, src4_l,  filter0, filter0,    \
+                    filter2, filter2, sum0_r, sum0_l, sum2_r, sum2_l);    \
+        DOTP_SH2_SW(src7_r, src7_l, filter2, filter2, sum3_r, sum3_l);    \
+        DPADD_SH4_SW(src1_r, src1_l, src5_r, src5_l,  filter1, filter1,   \
+                     filter3, filter3, sum0_r, sum0_l, sum2_r, sum2_l);   \
+        DPADD_SH2_SW(src6_r, src6_l, filter3, filter3, sum3_r, sum3_l);   \
+        /* sum1 forks from sum0; temp0 is later added/subtracted */       \
+        sum1_r = sum0_r;                                                  \
+        sum1_l = sum0_l;                                                  \
+                                                                          \
+        SPLATI_W2_SH(filt0, 2, filter0, filter1);                         \
+        SPLATI_W2_SH(filt1, 2, filter2, filter3);                         \
+        DOTP_SH2_SW(src2_r, src2_l, filter0, filter0, temp0_r, temp0_l);  \
+        DPADD_SH2_SW(src6_r, src6_l, filter2, filter2, sum2_r, sum2_l);   \
+        DOTP_SH2_SW(src5_r, src5_l, filter2, filter2, temp1_r, temp1_l);  \
+                                                                          \
+        sum0_r += temp0_r;                                                \
+        sum0_l += temp0_l;                                                \
+        sum1_r -= temp0_r;                                                \
+        sum1_l -= temp0_l;                                                \
+                                                                          \
+        sum3_r = temp1_r - sum3_r;                                        \
+        sum3_l = temp1_l - sum3_l;                                        \
+                                                                          \
+        DOTP_SH2_SW(src3_r, src3_l, filter1, filter1, temp0_r, temp0_l);  \
+        DPADD_SH4_SW(src7_r, src7_l, src4_r, src4_l, filter3, filter3,    \
+                     filter3, filter3, sum2_r, sum2_l, sum3_r, sum3_l);   \
+                                                                          \
+        sum0_r += temp0_r;                                                \
+        sum0_l += temp0_l;                                                \
+        sum1_r -= temp0_r;                                                \
+        sum1_l -= temp0_l;                                                \
+        /* these two stores land on rows j and 15 - j */                  \
+        BUTTERFLY_4(sum0_r, sum0_l, sum2_l, sum2_r, res0_r, res0_l,       \
+                    res1_l, res1_r);                                      \
+        SRARI_W4_SW(res0_r, res0_l, res1_r, res1_l, shift);               \
+        SAT_SW4_SW(res0_r, res0_l, res1_r, res1_l, 15);                   \
+        PCKEV_H2_SH(res0_l, res0_r, res1_l, res1_r, dst0, dst1);          \
+        ST_SH(dst0, buf_ptr);                                             \
+        ST_SH(dst1, (buf_ptr + ((15 - (j * 2)) * 16)));                   \
+        /* and these on rows 7/8, 9/6, 5/10, 11/4 for j = 0..3 */         \
+        BUTTERFLY_4(sum1_r, sum1_l, sum3_l, sum3_r, res0_r, res0_l,       \
+                    res1_l, res1_r);                                      \
+        SRARI_W4_SW(res0_r, res0_l, res1_r, res1_l, shift);               \
+        SAT_SW4_SW(res0_r, res0_l, res1_r, res1_l, 15);                   \
+        PCKEV_H2_SH(res0_l, res0_r, res1_l, res1_r, dst0, dst1);          \
+        ST_SH(dst0, (ptr0 + (((j / 2 + j % 2) * 2 * k) * 16)));           \
+        ST_SH(dst1, (ptr1 - (((j / 2 + j % 2) * 2 * k) * 16)));           \
+        /* k flips sign every pass; buf_ptr moves on one row */           \
+        k *= -1;                                                          \
+        buf_ptr += 16;                                                    \
+    }                                                                     \
+}
+
+#define HEVC_EVEN16_CALC(input, sum0_r, sum0_l, load_idx, store_idx)  \
+{   /* slot[load] += sum; slot[store] = sum - old slot[load] */       \
+    LD_SW2(input + load_idx * 8, 4, tmp0_r, tmp0_l);                  \
+    tmp1_r = sum0_r;    /* tmp0_*, tmp1_* live in the caller */       \
+    tmp1_l = sum0_l;                                                  \
+    sum0_r += tmp0_r;   /* the sum arguments are updated too */       \
+    sum0_l += tmp0_l;                                                 \
+    ST_SW2(sum0_r, sum0_l, (input + load_idx * 8), 4);                \
+    tmp1_r -= tmp0_r;                                                 \
+    tmp1_l -= tmp0_l;                                                 \
+    ST_SW2(tmp1_r, tmp1_l, (input + store_idx * 8), 4);               \
+}
+
+#define HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1,     \
+                              res0, res1, res2, res3, shift)  \
+{                                                             \
+    v4i32 vec0, vec1, vec2, vec3;                             \
+    v4i32 cnst74 = __msa_ldi_w(74);                           \
+    v4i32 cnst55 = __msa_ldi_w(55);                           \
+    v4i32 cnst29 = __msa_ldi_w(29);                           \
+    /* with c0..c3 = in_r0, in_l0, in_r1, in_l1: */           \
+    vec0 = in_r0 + in_r1;                                     \
+    vec2 = in_r0 - in_l1;                                     \
+    res0 = vec0 * cnst29;                                     \
+    res1 = vec2 * cnst55;                                     \
+    res2 = in_r0 - in_r1;                                     \
+    vec1 = in_r1 + in_l1;                                     \
+    res2 += in_l1;                                            \
+    vec3 = in_l0 * cnst74;                                    \
+    res3 = vec0 * cnst55;                                     \
+    /* res2 becomes 74 * (c0 - c2 + c3) */                    \
+    res0 += vec1 * cnst55;                                    \
+    res1 -= vec1 * cnst29;                                    \
+    res2 *= cnst74;                                           \
+    res3 += vec2 * cnst29;                                    \
+    /* 74 * c1 goes into res0 and res1, out of res3 */        \
+    res0 += vec3;                                             \
+    res1 += vec3;                                             \
+    res3 -= vec3;                                             \
+    /* SRARI/SAT: presumably round-shift, then saturate */    \
+    SRARI_W4_SW(res0, res1, res2, res3, shift);               \
+    SAT_SW4_SW(res0, res1, res2, res3, 15);                   \
+}
+
+static void hevc_idct_4x4_msa(int16_t *coeffs)
+{
+    v8i16 in0, in1;
+    v4i32 in_r0, in_l0, in_r1, in_l1;
+    v4i32 sum0, sum1, sum2, sum3;
+    v8i16 zeros = { 0 };
+    /* coeffs (16 values) is read as two vectors and rewritten in place */
+    LD_SH2(coeffs, 8, in0, in1);
+    ILVRL_H2_SW(zeros, in0, in_r0, in_l0);
+    ILVRL_H2_SW(zeros, in1, in_r1, in_l1);
+    /* HEVC_IDCT4x4_COL runs twice (shift 7, then 12), transposing between */
+    HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 7);
+    TRANSPOSE4x4_SW_SW(sum0, sum1, sum2, sum3, in_r0, in_l0, in_r1, in_l1);
+    HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 12);
+    TRANSPOSE4x4_SW_SW(sum0, sum1, sum2, sum3, sum0, sum1, sum2, sum3);
+    PCKEV_H2_SH(sum1, sum0, sum3, sum2, in0, in1);
+    ST_SH2(in0, in1, coeffs, 8);
+}
+
+static void hevc_idct_8x8_msa(int16_t *coeffs)
+{
+    int16_t *filter = &gt8x8_cnst[0];
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    /* filter is presumably read by HEVC_IDCT8x8_COL; verify before removal */
+    LD_SH8(coeffs, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 7);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+    HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 12);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+    ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, coeffs, 8);
+}
+
+static void hevc_idct_16x16_msa(int16_t *coeffs)
+{
+    int16_t i, j, k;    /* j and k are used inside HEVC_IDCT16x16_COL */
+    int16_t buf[256];
+    int16_t *buf_ptr = &buf[0];
+    int16_t *src = coeffs;
+    int16_t *filter = &gt16x16_cnst[0];
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+    v8i16 src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;
+    /* pass 1: coeffs -> buf, 8 columns per iteration, shift 7 */
+    for (i = 2; i--;) {
+        LD_SH16(src, 16, in0, in1, in2, in3, in4, in5, in6, in7,
+                in8, in9, in10, in11, in12, in13, in14, in15);
+
+        ILVR_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
+                   src0_r, src1_r, src2_r, src3_r);
+        ILVR_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
+                   src4_r, src5_r, src6_r, src7_r);
+        ILVL_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
+                   src0_l, src1_l, src2_l, src3_l);
+        ILVL_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
+                   src4_l, src5_l, src6_l, src7_l);
+        HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
+                           src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
+                           src4_l, src5_l, src6_l, src7_l, 7);
+        /* the macro advanced buf_ptr and filter; set up the next 8 columns */
+        src += 8;
+        buf_ptr = (&buf[0] + 8);
+        filter = &gt16x16_cnst[0];
+    }
+    /* pass 2: buf -> coeffs, 128 values of buf per iteration, shift 12 */
+    src = &buf[0];
+    buf_ptr = coeffs;
+    filter = &gt16x16_cnst[0];
+
+    for (i = 2; i--;) {
+        LD_SH16(src, 8, in0, in8, in1, in9, in2, in10, in3, in11,
+                in4, in12, in5, in13, in6, in14, in7, in15);
+        TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                           in0, in1, in2, in3, in4, in5, in6, in7);
+        TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
+                           in8, in9, in10, in11, in12, in13, in14, in15);
+        ILVR_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
+                   src0_r, src1_r, src2_r, src3_r);
+        ILVR_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
+                   src4_r, src5_r, src6_r, src7_r);
+        ILVL_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
+                   src0_l, src1_l, src2_l, src3_l);
+        ILVL_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
+                   src4_l, src5_l, src6_l, src7_l);
+        HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
+                           src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
+                           src4_l, src5_l, src6_l, src7_l, 12);
+        /* second half of buf; output goes to the other 8 columns of coeffs */
+        src += 128;
+        buf_ptr = coeffs + 8;
+        filter = &gt16x16_cnst[0];
+    }
+    /* last step: transpose coeffs by 8x8 quadrants; top-left in place */
+    LD_SH8(coeffs, 16, in0, in1, in2, in3, in4, in5, in6, in7);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, coeffs, 16);
+    /* off-diagonal quadrants: each is transposed and they swap places */
+    LD_SH8((coeffs + 8), 16, in0, in1, in2, in3, in4, in5, in6, in7);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    LD_SH8((coeffs + 128), 16, in8, in9, in10, in11, in12, in13, in14, in15);
+    ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 128), 16);
+    TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
+                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 8), 16);
+    /* bottom-right quadrant, transposed in place */
+    LD_SH8((coeffs + 136), 16, in0, in1, in2, in3, in4, in5, in6, in7);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 136), 16);
+}
+
+static void hevc_idct_8x32_column_msa(int16_t *coeffs, uint8_t buf_pitch,
+                                      uint8_t round)
+{
+    uint8_t i;
+    int16_t *filter_ptr0 = &gt32x32_cnst0[0];
+    int16_t *filter_ptr1 = &gt32x32_cnst1[0];
+    int16_t *filter_ptr2 = &gt32x32_cnst2[0];
+    int16_t *filter_ptr3 = &gt32x32_cnst3[0];
+    int16_t *src0 = (coeffs + buf_pitch);
+    int16_t *src1 = (coeffs + 2 * buf_pitch);
+    int16_t *src2 = (coeffs + 4 * buf_pitch);
+    int16_t *src3 = (coeffs);
+    int32_t cnst0, cnst1;
+    int32_t tmp_buf[8 * 32];    /* slots 0..15 (8 int32 each) are used */
+    int32_t *tmp_buf_ptr = &tmp_buf[0];
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+    v8i16 src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;
+    v8i16 filt0, filter0, filter1, filter2, filter3;
+    v4i32 sum0_r, sum0_l, sum1_r, sum1_l, tmp0_r, tmp0_l, tmp1_r, tmp1_l;
+    /* coeffs: 32 rows of 8 values, buf_pitch apart; transformed in place */
+    /* process coeff 4, 12, 20, 28 */
+    LD_SH4(src2, 8 * buf_pitch, in0, in1, in2, in3);
+    ILVR_H2_SH(in1, in0, in3, in2, src0_r, src1_r);
+    ILVL_H2_SH(in1, in0, in3, in2, src0_l, src1_l);
+
+    /* loop for all columns of constants */
+    for (i = 0; i < 4; i++) {
+        /* processing single column of constants */
+        cnst0 = LW(filter_ptr2);
+        cnst1 = LW(filter_ptr2 + 2);
+
+        filter0 = (v8i16) __msa_fill_w(cnst0);
+        filter1 = (v8i16) __msa_fill_w(cnst1);
+
+        DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
+        DPADD_SH2_SW(src1_r, src1_l, filter1, filter1, sum0_r, sum0_l);
+        ST_SW2(sum0_r, sum0_l, (tmp_buf_ptr + i * 8), 4);
+
+        filter_ptr2 += 4;
+    }
+
+    /* process coeff 0, 8, 16, 24 */
+    LD_SH2(src3, 16 * buf_pitch, in0, in2);
+    LD_SH2((src3 + 8 * buf_pitch), 16 * buf_pitch, in1, in3);
+
+    ILVR_H2_SH(in2, in0, in3, in1, src0_r, src1_r);
+    ILVL_H2_SH(in2, in0, in3, in1, src0_l, src1_l);
+
+    /* loop for all columns of constants */
+    for (i = 0; i < 2; i++) {
+        /* processing first column of filter constants */
+        cnst0 = LW(filter_ptr3);
+        cnst1 = LW(filter_ptr3 + 4);
+
+        filter0 = (v8i16) __msa_fill_w(cnst0);
+        filter1 = (v8i16) __msa_fill_w(cnst1);
+
+        DOTP_SH4_SW(src0_r, src0_l, src1_r, src1_l, filter0, filter0, filter1,
+                    filter1, sum0_r, sum0_l, tmp1_r, tmp1_l);
+
+        sum1_r = sum0_r;
+        sum1_l = sum0_l;
+        sum0_r += tmp1_r;
+        sum0_l += tmp1_l;
+
+        sum1_r -= tmp1_r;
+        sum1_l -= tmp1_l;
+        /* updates tmp_buf slots i and 3 - i, fills slots 7 - i and 4 + i */
+        HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, i, (7 - i));
+        HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, (3 - i), (4 + i));
+
+        filter_ptr3 += 8;
+    }
+
+    /* process coeff 2 6 10 14 18 22 26 30 */
+    LD_SH8(src1, 4 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7);
+    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
+               src0_r, src1_r, src2_r, src3_r);
+    ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
+               src0_l, src1_l, src2_l, src3_l);
+
+    /* loop for all columns of constants */
+    for (i = 0; i < 8; i++) {
+        /* processing single column of constants */
+        filt0 = LD_SH(filter_ptr1);
+        SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);
+        DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
+        DPADD_SH4_SW(src1_r, src1_l, src2_r, src2_l, filter1, filter1, filter2,
+                     filter2, sum0_r, sum0_l, sum0_r, sum0_l);
+        DPADD_SH2_SW(src3_r, src3_l, filter3, filter3, sum0_r, sum0_l);
+        /* slot i += sum; slot 15 - i = old slot i - sum */
+        LD_SW2(tmp_buf_ptr + i * 8, 4, tmp0_r, tmp0_l);
+        tmp1_r = tmp0_r;
+        tmp1_l = tmp0_l;
+        tmp0_r += sum0_r;
+        tmp0_l += sum0_l;
+        ST_SW2(tmp0_r, tmp0_l, (tmp_buf_ptr + i * 8), 4);
+        tmp1_r -= sum0_r;
+        tmp1_l -= sum0_l;
+        ST_SW2(tmp1_r, tmp1_l, (tmp_buf_ptr + (15 - i) * 8), 4);
+
+        filter_ptr1 += 8;
+    }
+
+    /* process coeff 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 */
+    LD_SH8(src0, 2 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7);
+    src0 += 16 * buf_pitch;
+    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
+               src0_r, src1_r, src2_r, src3_r);
+    ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
+               src0_l, src1_l, src2_l, src3_l);
+
+    LD_SH8(src0, 2 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7);
+    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
+               src4_r, src5_r, src6_r, src7_r);
+    ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
+               src4_l, src5_l, src6_l, src7_l);
+
+    /* loop for all columns of filter constants */
+    for (i = 0; i < 16; i++) {
+        /* processing single column of constants */
+        filt0 = LD_SH(filter_ptr0);
+        SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);
+        DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
+        DPADD_SH4_SW(src1_r, src1_l, src2_r, src2_l, filter1, filter1, filter2,
+                     filter2, sum0_r, sum0_l, sum0_r, sum0_l);
+        DPADD_SH2_SW(src3_r, src3_l, filter3, filter3, sum0_r, sum0_l);
+
+        tmp1_r = sum0_r;
+        tmp1_l = sum0_l;
+
+        filt0 = LD_SH(filter_ptr0 + 8);
+        SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);
+        DOTP_SH2_SW(src4_r, src4_l, filter0, filter0, sum0_r, sum0_l);
+        DPADD_SH4_SW(src5_r, src5_l, src6_r, src6_l, filter1, filter1, filter2,
+                     filter2, sum0_r, sum0_l, sum0_r, sum0_l);
+        DPADD_SH2_SW(src7_r, src7_l, filter3, filter3, sum0_r, sum0_l);
+
+        sum0_r += tmp1_r;
+        sum0_l += tmp1_l;
+        /* output rows i and 31 - i = slot i +/- sum, shifted via SRAR */
+        LD_SW2(tmp_buf_ptr + i * 8, 4, tmp0_r, tmp0_l);
+        tmp1_r = tmp0_r;
+        tmp1_l = tmp0_l;
+        tmp0_r += sum0_r;
+        tmp0_l += sum0_l;
+        sum1_r = __msa_fill_w(round);   /* loop-invariant shift count */
+        SRAR_W2_SW(tmp0_r, tmp0_l, sum1_r);
+        SAT_SW2_SW(tmp0_r, tmp0_l, 15);
+        in0 = __msa_pckev_h((v8i16) tmp0_l, (v8i16) tmp0_r);
+        ST_SH(in0, (coeffs + i * buf_pitch));
+        tmp1_r -= sum0_r;
+        tmp1_l -= sum0_l;
+        SRAR_W2_SW(tmp1_r, tmp1_l, sum1_r);
+        SAT_SW2_SW(tmp1_r, tmp1_l, 15);
+        in0 = __msa_pckev_h((v8i16) tmp1_l, (v8i16) tmp1_r);
+        ST_SH(in0, (coeffs + (31 - i) * buf_pitch));
+
+        filter_ptr0 += 16;
+    }
+}
+
+static void hevc_idct_transpose_32x8_to_8x32(int16_t *coeffs, int16_t *tmp_buf)
+{
+    /* Four 8x8 tiles: read at pitch 32, written transposed at pitch 8. */
+    v8i16 t0, t1, t2, t3, t4, t5, t6, t7;
+    uint8_t tile;
+    for (tile = 0; tile < 4; ++tile) {
+        LD_SH8(coeffs + tile * 8, 32, t0, t1, t2, t3, t4, t5, t6, t7);
+        TRANSPOSE8x8_SH_SH(t0, t1, t2, t3, t4, t5, t6, t7,
+                           t0, t1, t2, t3, t4, t5, t6, t7);
+        ST_SH8(t0, t1, t2, t3, t4, t5, t6, t7, tmp_buf + tile * 64, 8);
+    }
+}
+
+static void hevc_idct_transpose_8x32_to_32x8(int16_t *tmp_buf, int16_t *coeffs)
+{
+    /* Four 8x8 tiles: read at pitch 8, written transposed at pitch 32. */
+    v8i16 t0, t1, t2, t3, t4, t5, t6, t7;
+    uint8_t tile;
+    for (tile = 0; tile < 4; ++tile) {
+        LD_SH8(tmp_buf + tile * 64, 8, t0, t1, t2, t3, t4, t5, t6, t7);
+        TRANSPOSE8x8_SH_SH(t0, t1, t2, t3, t4, t5, t6, t7,
+                           t0, t1, t2, t3, t4, t5, t6, t7);
+        ST_SH8(t0, t1, t2, t3, t4, t5, t6, t7, coeffs + tile * 8, 32);
+    }
+}
+
+static void hevc_idct_32x32_msa(int16_t *coeffs)
+{
+    /*
+     * Both passes go through the 8x32 column kernel.  Pass 1 hands it
+     * the four 8-column strips of coeffs directly.  Pass 2 takes each
+     * band of 8 rows, copies it transposed into a scratch block, runs
+     * the kernel on the scratch block and transposes the result back
+     * over the band it came from.
+     */
+    const uint8_t first_shift = 7;
+    const uint8_t second_shift = 12;
+    int16_t scratch[8 * 32];
+    int16_t *band;
+    uint8_t strip;
+
+    /* pass 1: rows of coeffs are 32 values apart */
+    for (strip = 0; strip < 4; ++strip) {
+        hevc_idct_8x32_column_msa(coeffs + strip * 8, 32, first_shift);
+    }
+
+    /* pass 2: rows of the scratch block are 8 values apart */
+    for (strip = 0; strip < 4; ++strip) {
+        band = coeffs + strip * (32 * 8);
+
+        hevc_idct_transpose_32x8_to_8x32(band, scratch);
+        hevc_idct_8x32_column_msa(scratch, 8, second_shift);
+        hevc_idct_transpose_8x32_to_32x8(scratch, band);
+    }
+}
+
+static void hevc_idct_dc_4x4_msa(int16_t *coeffs)
+{
+    /* DC only: two rounding shifts on coeffs[0], then fill all 16 slots */
+    int32_t dc = (coeffs[0] + 1) >> 1;
+    v8i16 fill;
+
+    dc = (dc + 32) >> 6;
+    fill = __msa_fill_h(dc);
+
+    ST_SH2(fill, fill, coeffs, 8);
+}
+
+static void hevc_idct_dc_8x8_msa(int16_t *coeffs)
+{
+    /* DC only: two rounding shifts on coeffs[0], then fill all 64 slots */
+    int32_t dc = (coeffs[0] + 1) >> 1;
+    v8i16 fill;
+
+    dc = (dc + 32) >> 6;
+    fill = __msa_fill_h(dc);
+
+    ST_SH8(fill, fill, fill, fill, fill, fill, fill, fill, coeffs, 8);
+}
+
+static void hevc_idct_dc_16x16_msa(int16_t *coeffs)
+{
+    /* DC only: 256 slots, filled as 4 chunks of eight 8-lane vectors */
+    int16_t *const end = coeffs + 4 * 64;
+    int32_t dc = (coeffs[0] + 1) >> 1;
+    v8i16 fill;
+
+    dc = (dc + 32) >> 6;
+    fill = __msa_fill_h(dc);
+
+    while (coeffs != end) {
+        ST_SH8(fill, fill, fill, fill, fill, fill, fill, fill, coeffs, 8);
+        coeffs += 64;
+    }
+}
+
+static void hevc_idct_dc_32x32_msa(int16_t *coeffs)
+{
+    /* DC only: 1024 slots, filled as 16 chunks of eight 8-lane vectors */
+    int16_t *const end = coeffs + 16 * 64;
+    int32_t dc = (coeffs[0] + 1) >> 1;
+    v8i16 fill;
+
+    dc = (dc + 32) >> 6;
+    fill = __msa_fill_h(dc);
+
+    while (coeffs != end) {
+        ST_SH8(fill, fill, fill, fill, fill, fill, fill, fill, coeffs, 8);
+        coeffs += 64;
+    }
+}
+
+static void hevc_addblk_4x4_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
+{
+    uint32_t dst0, dst1, dst2, dst3;
+    v8i16 dst_r0, dst_l0, in0, in1;
+    v4i32 dst_vec = { 0 };
+    v16u8 zeros = { 0 };
+    /* dst (4 rows of 4 bytes) += coeffs; clipped per CLIP_SH2_0_255 */
+    LD_SH2(coeffs, 8, in0, in1);
+    LW4(dst, stride, dst0, dst1, dst2, dst3);
+    INSERT_W4_SW(dst0, dst1, dst2, dst3, dst_vec);
+    ILVRL_B2_SH(zeros, dst_vec, dst_r0, dst_l0);
+    ADD2(dst_r0, in0, dst_l0, in1, dst_r0, dst_l0);
+    CLIP_SH2_0_255(dst_r0, dst_l0);
+    dst_vec = (v4i32) __msa_pckev_b((v16i8) dst_l0, (v16i8) dst_r0);
+    ST4x4_UB(dst_vec, dst_vec, 0, 1, 2, 3, dst, stride);
+}
+
+static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
+{
+    uint8_t *temp_dst = dst;
+    uint64_t dst0, dst1, dst2, dst3;
+    v2i64 dst_vec0 = { 0 };
+    v2i64 dst_vec1 = { 0 };
+    v8i16 dst_r0, dst_l0, dst_r1, dst_l1;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16u8 zeros = { 0 };
+    /* dst (8 rows of 8 bytes) += coeffs; temp_dst reads, dst writes */
+    LD_SH8(coeffs, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
+    temp_dst += (4 * stride);
+    /* rows 0-3, added to in0..in3 */
+    INSERT_D2_SD(dst0, dst1, dst_vec0);
+    INSERT_D2_SD(dst2, dst3, dst_vec1);
+    ILVRL_B2_SH(zeros, dst_vec0, dst_r0, dst_l0);
+    ILVRL_B2_SH(zeros, dst_vec1, dst_r1, dst_l1);
+    ADD4(dst_r0, in0, dst_l0, in1, dst_r1, in2, dst_l1, in3,
+         dst_r0, dst_l0, dst_r1, dst_l1);
+    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
+    PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
+    ST8x4_UB(dst_r0, dst_r1, dst, stride);
+    dst += (4 * stride);
+    /* rows 4-7; UNPCK_UB_SH presumably widens like ILVRL with zeros above */
+    LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
+    INSERT_D2_SD(dst0, dst1, dst_vec0);
+    INSERT_D2_SD(dst2, dst3, dst_vec1);
+    UNPCK_UB_SH(dst_vec0, dst_r0, dst_l0);
+    UNPCK_UB_SH(dst_vec1, dst_r1, dst_l1);
+    ADD4(dst_r0, in4, dst_l0, in5, dst_r1, in6, dst_l1, in7,
+         dst_r0, dst_l0, dst_r1, dst_l1);
+    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
+    PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
+    ST8x4_UB(dst_r0, dst_r1, dst, stride);
+}
+
+static void hevc_addblk_16x16_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
+{
+    uint8_t loop_cnt;
+    uint8_t *temp_dst = dst;
+    v16u8 dst0, dst1, dst2, dst3;
+    v8i16 dst_r0, dst_l0, dst_r1, dst_l1, dst_r2, dst_l2, dst_r3, dst_l3;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    /* 4 iterations of 4 rows; a row is 16 bytes of dst, 16 int16 of coeffs */
+    for (loop_cnt = 4; loop_cnt--;) {
+        LD_SH4(coeffs, 16, in0, in2, in4, in6);
+        LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
+        coeffs += 64;
+        LD_UB4(temp_dst, stride, dst0, dst1, dst2, dst3);
+        temp_dst += (4 * stride);
+        /* split each row into two 8-lane halves (_r and _l) */
+        UNPCK_UB_SH(dst0, dst_r0, dst_l0);
+        UNPCK_UB_SH(dst1, dst_r1, dst_l1);
+        UNPCK_UB_SH(dst2, dst_r2, dst_l2);
+        UNPCK_UB_SH(dst3, dst_r3, dst_l3);
+
+        dst_r0 += in0;
+        dst_l0 += in1;
+        dst_r1 += in2;
+        dst_l1 += in3;
+        dst_r2 += in4;
+        dst_l2 += in5;
+        dst_r3 += in6;
+        dst_l3 += in7;
+        /* dst0..dst3 are reused for the packed results */
+        CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
+        CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
+        PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
+                    dst_r3, dst0, dst1, dst2, dst3);
+        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
+        dst += (4 * stride);
+    }
+}
+
+static void hevc_addblk_32x32_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
+{
+    uint8_t loop_cnt;
+    uint8_t *temp_dst = dst;
+    v16u8 dst0, dst1, dst2, dst3;
+    v8i16 dst_r0, dst_l0, dst_r1, dst_l1, dst_r2, dst_l2, dst_r3, dst_l3;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    /* 8 iterations of 4 rows; bytes 0..15 of each row first, then 16..31 */
+    for (loop_cnt = 8; loop_cnt--;) {
+        LD_SH4(coeffs, 32, in0, in2, in4, in6);
+        LD_SH4((coeffs + 8), 32, in1, in3, in5, in7);
+        LD_UB4(temp_dst, stride, dst0, dst1, dst2, dst3);
+
+        UNPCK_UB_SH(dst0, dst_r0, dst_l0);
+        UNPCK_UB_SH(dst1, dst_r1, dst_l1);
+        UNPCK_UB_SH(dst2, dst_r2, dst_l2);
+        UNPCK_UB_SH(dst3, dst_r3, dst_l3);
+
+        dst_r0 += in0;
+        dst_l0 += in1;
+        dst_r1 += in2;
+        dst_l1 += in3;
+        dst_r2 += in4;
+        dst_l2 += in5;
+        dst_r3 += in6;
+        dst_l3 += in7;
+
+        CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
+        CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
+        PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
+                    dst_r3, dst0, dst1, dst2, dst3);
+        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
+        /* bytes 16..31 of the same four rows; cursors advance afterwards */
+        LD_SH4((coeffs + 16), 32, in0, in2, in4, in6);
+        LD_SH4((coeffs + 24), 32, in1, in3, in5, in7);
+        coeffs += 128;
+        LD_UB4((temp_dst + 16), stride, dst0, dst1, dst2, dst3);
+        temp_dst += (4 * stride);
+
+        UNPCK_UB_SH(dst0, dst_r0, dst_l0);
+        UNPCK_UB_SH(dst1, dst_r1, dst_l1);
+        UNPCK_UB_SH(dst2, dst_r2, dst_l2);
+        UNPCK_UB_SH(dst3, dst_r3, dst_l3);
+
+        dst_r0 += in0;
+        dst_l0 += in1;
+        dst_r1 += in2;
+        dst_l1 += in3;
+        dst_r2 += in4;
+        dst_l2 += in5;
+        dst_r3 += in6;
+        dst_l3 += in7;
+
+        CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
+        CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
+        PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
+                    dst_r3, dst0, dst1, dst2, dst3);
+
+        ST_UB4(dst0, dst1, dst2, dst3, (dst + 16), stride);
+        dst += (4 * stride);
+    }
+}
+
+static void hevc_idct_luma_4x4_msa(int16_t *coeffs)
+{
+    v8i16 in0, in1, dst0, dst1;
+    v4i32 in_r0, in_l0, in_r1, in_l1, res0, res1, res2, res3;
+    /* in place: HEVC_IDCT_LUMA4x4_COL twice (shift 7, then 12) */
+    LD_SH2(coeffs, 8, in0, in1);
+    UNPCK_SH_SW(in0, in_r0, in_l0);
+    UNPCK_SH_SW(in1, in_r1, in_l1);
+    HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1, res0, res1, res2, res3,
+                          7);
+    TRANSPOSE4x4_SW_SW(res0, res1, res2, res3, in_r0, in_l0, in_r1, in_l1);
+    HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1, res0, res1, res2, res3,
+                          12);
+    TRANSPOSE4x4_SW_SW(res0, res1, res2, res3, res0, res1, res2, res3);
+    PCKEV_H2_SH(res1, res0, res3, res2, dst0, dst1);
+    ST_SH2(dst0, dst1, coeffs, 8);
+}
+
+void ff_hevc_idct_4x4_msa(int16_t *coeffs, int col_limit)
+{   /* col_limit is accepted but not used */
+    hevc_idct_4x4_msa(coeffs);
+}
+
+void ff_hevc_idct_8x8_msa(int16_t *coeffs, int col_limit)
+{   /* col_limit is accepted but not used */
+    hevc_idct_8x8_msa(coeffs);
+}
+
+void ff_hevc_idct_16x16_msa(int16_t *coeffs, int col_limit)
+{   /* col_limit is accepted but not used */
+    hevc_idct_16x16_msa(coeffs);
+}
+
+void ff_hevc_idct_32x32_msa(int16_t *coeffs, int col_limit)
+{   /* col_limit is accepted but not used */
+    hevc_idct_32x32_msa(coeffs);
+}
+
+void ff_hevc_addblk_4x4_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+{   /* helper takes (coeffs, dst, stride) and an int32_t stride */
+    hevc_addblk_4x4_msa(coeffs, dst, stride);
+}
+
+void ff_hevc_addblk_8x8_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+{   /* helper takes (coeffs, dst, stride) and an int32_t stride */
+    hevc_addblk_8x8_msa(coeffs, dst, stride);
+}
+
+void ff_hevc_addblk_16x16_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+{   /* helper takes (coeffs, dst, stride) and an int32_t stride */
+    hevc_addblk_16x16_msa(coeffs, dst, stride);
+}
+
+void ff_hevc_addblk_32x32_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+{   /* helper takes (coeffs, dst, stride) and an int32_t stride */
+    hevc_addblk_32x32_msa(coeffs, dst, stride);
+}
+
+void ff_hevc_idct_dc_4x4_msa(int16_t *coeffs)
+{   /* non-static entry point; forwards to the file-local helper */
+    hevc_idct_dc_4x4_msa(coeffs);
+}
+
+void ff_hevc_idct_dc_8x8_msa(int16_t *coeffs)
+{   /* non-static entry point; forwards to the file-local helper */
+    hevc_idct_dc_8x8_msa(coeffs);
+}
+
+void ff_hevc_idct_dc_16x16_msa(int16_t *coeffs)
+{   /* non-static entry point; forwards to the file-local helper */
+    hevc_idct_dc_16x16_msa(coeffs);
+}
+
+void ff_hevc_idct_dc_32x32_msa(int16_t *coeffs)
+{   /* non-static entry point; forwards to the file-local helper */
+    hevc_idct_dc_32x32_msa(coeffs);
+}
+
+void ff_hevc_idct_luma_4x4_msa(int16_t *coeffs)
+{   /* non-static entry point; forwards to the file-local helper */
+    hevc_idct_luma_4x4_msa(coeffs);
+}
diff --git a/libavcodec/mips/hevc_lpf_sao_msa.c b/libavcodec/mips/hevc_lpf_sao_msa.c
new file mode 100644
index 0000000..da1db51
--- /dev/null
+++ b/libavcodec/mips/hevc_lpf_sao_msa.c
@@ -0,0 +1,2088 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "libavcodec/mips/hevcdsp_mips.h"
+
+static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
+                                         int32_t beta, int32_t *tc,
+                                         uint8_t *p_is_pcm, uint8_t *q_is_pcm)
+{   /* Deblocks 8 luma columns across the horizontal edge just above src. */
+    uint8_t *p3 = src - (stride << 2);  /* p3..p0: 4 rows above the edge */
+    uint8_t *p2 = src - ((stride << 1) + stride);
+    uint8_t *p1 = src - (stride << 1);
+    uint8_t *p0 = src - stride;
+    uint8_t *q0 = src;                  /* q0..q3: rows src + 0..3 * stride */
+    uint8_t *q1 = src + stride;
+    uint8_t *q2 = src + (stride << 1);
+    uint8_t *q3 = src + (stride << 1) + stride;
+    uint8_t flag0, flag1;               /* per-half strong-filter decision */
+    int32_t dp00, dq00, dp30, dq30, d00, d30;  /* half 0: columns 0 and 3 */
+    int32_t dp04, dq04, dp34, dq34, d04, d34;  /* half 1: columns 4 and 7 */
+    int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
+    int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
+    uint64_t dst_val0, dst_val1;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5;  /* new p2, p1, p0, q0, q1, q2 */
+    v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
+    v8u16 temp0, temp1;
+    v8i16 temp2;
+    v8i16 tc_pos, tc_neg;
+    v8i16 diff0, diff1, delta0, delta1, delta2, abs_delta0;
+    v16i8 zero = { 0 };
+    v8u16 p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
+    /* Activity |x2 - 2 * x1 + x0| on each side, sampled at 4 columns. */
+    dp00 = abs(p2[0] - (p1[0] << 1) + p0[0]);
+    dq00 = abs(q2[0] - (q1[0] << 1) + q0[0]);
+    dp30 = abs(p2[3] - (p1[3] << 1) + p0[3]);
+    dq30 = abs(q2[3] - (q1[3] << 1) + q0[3]);
+    d00 = dp00 + dq00;
+    d30 = dp30 + dq30;
+    p_is_pcm0 = p_is_pcm[0];
+    q_is_pcm0 = q_is_pcm[0];
+    dp04 = abs(p2[4] - (p1[4] << 1) + p0[4]);
+    dq04 = abs(q2[4] - (q1[4] << 1) + q0[4]);
+    dp34 = abs(p2[7] - (p1[7] << 1) + p0[7]);
+    dq34 = abs(q2[7] - (q1[7] << 1) + q0[7]);
+    d04 = dp04 + dq04;
+    d34 = dp34 + dq34;
+    p_is_pcm4 = p_is_pcm[1];
+    q_is_pcm4 = q_is_pcm[1];
+    /* Nothing to do if every PCM flag is set or both halves reach beta. */
+    if (!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) {
+        if (!(d00 + d30 >= beta) || !(d04 + d34 >= beta)) {
+            p3_src = LD_UH(p3);  /* presumably one full vector load per row */
+            p2_src = LD_UH(p2);
+            p1_src = LD_UH(p1);
+            p0_src = LD_UH(p0);
+            q0_src = LD_UH(q0);
+            q1_src = LD_UH(q1);
+            q2_src = LD_UH(q2);
+            q3_src = LD_UH(q3);
+            /* tc[0] / tc[1] apply to columns 0-3 / 4-7 respectively. */
+            tc0 = tc[0];
+            beta30 = beta >> 3;
+            beta20 = beta >> 2;
+            tc250 = ((tc0 * 5 + 1) >> 1);  /* rounded 2.5 * tc0 */
+            tc4 = tc[1];
+            tc254 = ((tc4 * 5 + 1) >> 1);  /* rounded 2.5 * tc4 */
+            /* flagN != 0: half N passes every strong-filter condition. */
+            flag0 = (abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 &&
+                     abs(p0[0] - q0[0]) < tc250 &&
+                     abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 &&
+                     abs(p0[3] - q0[3]) < tc250 &&
+                     (d00 << 1) < beta20 && (d30 << 1) < beta20);
+            cmp0 = __msa_fill_d(flag0);
+
+            flag1 = (abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 &&
+                     abs(p0[4] - q0[4]) < tc254 &&
+                     abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 &&
+                     abs(p0[7] - q0[7]) < tc254 &&
+                     (d04 << 1) < beta20 && (d34 << 1) < beta20);
+            cmp1 = __msa_fill_d(flag1);
+            cmp2 = __msa_ilvev_d(cmp1, cmp0);
+            cmp2 = __msa_ceqi_d(cmp2, 0);  /* all-ones where flag == 0 */
+            /* Presumably zero-extends the low 8 bytes of each row to u16. */
+            ILVR_B8_UH(zero, p3_src, zero, p2_src, zero, p1_src, zero, p0_src,
+                       zero, q0_src, zero, q1_src, zero, q2_src, zero, q3_src,
+                       p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src,
+                       q3_src);
+            /* tc_pos = 2 * tc per half; bounds the strong-filter change. */
+            cmp0 = (v2i64) __msa_fill_h(tc0);
+            cmp1 = (v2i64) __msa_fill_h(tc4);
+            tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
+            tc_pos <<= 1;
+            tc_neg = -tc_pos;
+            /* Strong filter, p side: dst0 / dst1 / dst2 = new p2 / p1 / p0. */
+            temp0 = (p1_src + p0_src + q0_src);
+            temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - p2_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst0 = (v16u8) (temp2 + (v8i16) p2_src);
+
+            temp1 = temp0 + p2_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
+            temp2 = (v8i16) (temp1 - p1_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst1 = (v16u8) (temp2 + (v8i16) p1_src);
+
+            temp1 = (temp0 << 1) + p2_src + q1_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - p0_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst2 = (v16u8) (temp2 + (v8i16) p0_src);
+            /* Keep the input p samples in halves whose p_is_pcm is set. */
+            cmp0 = __msa_fill_d(p_is_pcm0);
+            cmp1 = __msa_fill_d(p_is_pcm4);
+            p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+            p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
+
+            dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
+            dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
+            dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);
+            /* Strong filter, q side: dst5 / dst4 / dst3 = new q2 / q1 / q0. */
+            temp0 = (q1_src + p0_src + q0_src);
+
+            temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - q2_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst5 = (v16u8) (temp2 + (v8i16) q2_src);
+
+            temp1 = temp0 + q2_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
+            temp2 = (v8i16) (temp1 - q1_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst4 = (v16u8) (temp2 + (v8i16) q1_src);
+
+            temp1 = (temp0 << 1) + p1_src + q2_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - q0_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst3 = (v16u8) (temp2 + (v8i16) q0_src);
+            /* Keep the input q samples in halves whose q_is_pcm is set. */
+            cmp0 = __msa_fill_d(q_is_pcm0);
+            cmp1 = __msa_fill_d(q_is_pcm4);
+            q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+            q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
+
+            dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
+            dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
+            dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
+            /* Weak filter: delta0 = (9*(q0-p0) - 3*(q1-p1) + 8) >> 4. */
+            tc_pos >>= 1;  /* back to tc */
+            tc_neg = -tc_pos;
+
+            diff0 = (v8i16) (q0_src - p0_src);
+            diff1 = (v8i16) (q1_src - p1_src);
+            diff0 = (diff0 << 3) + diff0;
+            diff1 = (diff1 << 1) + diff1;
+            delta0 = diff0 - diff1;
+            delta0 = __msa_srari_h(delta0, 4);
+            /* abs_delta0 becomes a lane mask: |delta0| < 10 * tc. */
+            temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
+            abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
+            abs_delta0 = (v8u16) abs_delta0 < temp1;
+
+            delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
+            /* temp0 / temp2 = weak p0 / q0; PCM halves keep the input. */
+            temp0 = (v8u16) (delta0 + p0_src);
+            temp0 = (v8u16) CLIP_SH_0_255(temp0);
+            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+                                        (v16u8) p_is_pcm_vec);
+
+            temp2 = (v8i16) (q0_src - delta0);
+            temp2 = CLIP_SH_0_255(temp2);
+            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
+                                        (v16u8) q_is_pcm_vec);
+            /* Masks reused: all-ones where p1 (resp. q1) must not change. */
+            tmp = (beta + (beta >> 1)) >> 3;
+            cmp0 = __msa_fill_d(!p_is_pcm0 && ((dp00 + dp30) < tmp));
+            cmp1 = __msa_fill_d(!p_is_pcm4 && ((dp04 + dp34) < tmp));
+            p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+            p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
+
+            cmp0 = (v2i64) __msa_fill_h((!q_is_pcm0) && (dq00 + dq30 < tmp));
+            cmp1 = (v2i64) __msa_fill_h((!q_is_pcm4) && (dq04 + dq34 < tmp));
+            q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+            q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
+
+            tc_pos >>= 1;  /* tc / 2 bounds the p1 / q1 change */
+            tc_neg = -tc_pos;
+            /* delta1 / delta2 end up holding the weak-filtered p1 / q1. */
+            delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
+            delta1 -= (v8i16) p1_src;
+            delta1 += delta0;
+            delta1 >>= 1;
+            delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
+            delta1 = (v8i16) p1_src + (v8i16) delta1;
+            delta1 = CLIP_SH_0_255(delta1);
+            delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
+                                          (v16u8) p_is_pcm_vec);
+
+            delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
+            delta2 = delta2 - (v8i16) q1_src;
+            delta2 = delta2 - delta0;
+            delta2 = delta2 >> 1;
+            delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
+            delta2 = (v8i16) q1_src + (v8i16) delta2;
+            delta2 = CLIP_SH_0_255(delta2);
+            delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
+                                          (v16u8) q_is_pcm_vec);
+            /* Lanes failing |delta0| < 10 * tc get their input back. */
+            delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
+                                         (v16u8) abs_delta0);
+            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+                                        (v16u8) abs_delta0);
+            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
+                                        (v16u8) abs_delta0);
+            delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
+                                         (v16u8) abs_delta0);
+            /* Where the strong test failed (cmp2 set) use the weak results. */
+            dst2 = __msa_bmnz_v(dst2, (v16u8) temp0, (v16u8) cmp2);
+            dst3 = __msa_bmnz_v(dst3, (v16u8) temp2, (v16u8) cmp2);
+            dst1 = __msa_bmnz_v(dst1, (v16u8) delta1, (v16u8) cmp2);
+            dst4 = __msa_bmnz_v(dst4, (v16u8) delta2, (v16u8) cmp2);
+            dst0 = __msa_bmnz_v(dst0, (v16u8) p2_src, (v16u8) cmp2);
+            dst5 = __msa_bmnz_v(dst5, (v16u8) q2_src, (v16u8) cmp2);
+            /* Finally restore the input in halves whose d sum reaches beta. */
+            cmp0 = __msa_fill_d(d00 + d30 >= beta);
+            cmp1 = __msa_fill_d(d04 + d34 >= beta);
+            cmp0 = __msa_ilvev_d(cmp1, cmp0);
+            cmp0 = __msa_ceqi_d(cmp0, 0);
+
+            dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) cmp0);
+            dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) cmp0);
+            dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) cmp0);
+            dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) cmp0);
+            dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) cmp0);
+            dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) cmp0);
+            /* Pack back to bytes; rows p2..q2 are written, 8 pixels each. */
+            PCKEV_B2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+            dst2 = (v16u8) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
+
+            dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
+            dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);
+
+            ST8x4_UB(dst0, dst1, p2, stride);  /* presumably rows p2..q0 */
+            p2 += (4 * stride);
+            SD(dst_val0, p2);                  /* row q1 */
+            p2 += stride;
+            SD(dst_val1, p2);                  /* row q2 */
+        }
+    }
+}
+
+static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
+                                         int32_t beta, int32_t *tc,
+                                         uint8_t *p_is_pcm, uint8_t *q_is_pcm)
+{   /* Deblocks 8 luma rows across the vertical edge left of src[0]. */
+    uint8_t *p3 = src;                  /* row 0; names denote rows here */
+    uint8_t *p2 = src + 3 * stride;     /* row 3 */
+    uint8_t *p1 = src + (stride << 2);  /* row 4 */
+    uint8_t *p0 = src + 7 * stride;     /* row 7 */
+    uint8_t flag0, flag1;               /* per-half strong-filter decision */
+    uint16_t tmp0, tmp1;
+    uint32_t tmp2, tmp3;
+    int32_t dp00, dq00, dp30, dq30, d00, d30;  /* half 0: rows 0 and 3 */
+    int32_t dp04, dq04, dp34, dq34, d04, d34;  /* half 1: rows 4 and 7 */
+    int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
+    int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
+    v8u16 temp0, temp1;
+    v8i16 temp2;
+    v8i16 tc_pos, tc_neg;
+    v8i16 diff0, diff1, delta0, delta1, delta2, abs_delta0;
+    v16i8 zero = { 0 };
+    v8u16 p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
+    /* Activity per side: x[-3..-1] is the p side, x[0..2] the q side. */
+    dp00 = abs(p3[-3] - (p3[-2] << 1) + p3[-1]);
+    dq00 = abs(p3[2] - (p3[1] << 1) + p3[0]);
+    dp30 = abs(p2[-3] - (p2[-2] << 1) + p2[-1]);
+    dq30 = abs(p2[2] - (p2[1] << 1) + p2[0]);
+    d00 = dp00 + dq00;
+    d30 = dp30 + dq30;
+    p_is_pcm0 = p_is_pcm[0];
+    q_is_pcm0 = q_is_pcm[0];
+
+    dp04 = abs(p1[-3] - (p1[-2] << 1) + p1[-1]);
+    dq04 = abs(p1[2] - (p1[1] << 1) + p1[0]);
+    dp34 = abs(p0[-3] - (p0[-2] << 1) + p0[-1]);
+    dq34 = abs(p0[2] - (p0[1] << 1) + p0[0]);
+    d04 = dp04 + dq04;
+    d34 = dp34 + dq34;
+    p_is_pcm4 = p_is_pcm[1];
+    q_is_pcm4 = q_is_pcm[1];
+    /* Nothing to do if every PCM flag is set or both halves reach beta. */
+    if (!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) {
+        if (!(d00 + d30 >= beta) || !(d04 + d34 >= beta)) {
+            src -= 4;  /* first pixel of the p3..q3 window in row 0 */
+            LD_UH8(src, stride,
+                   p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src,
+                   q3_src);
+            /* tc[0] / tc[1] apply to rows 0-3 / 4-7 respectively. */
+            tc0 = tc[0];
+            beta30 = beta >> 3;
+            beta20 = beta >> 2;
+            tc250 = ((tc0 * 5 + 1) >> 1);  /* rounded 2.5 * tc0 */
+
+            tc4 = tc[1];
+            tc254 = ((tc4 * 5 + 1) >> 1);  /* rounded 2.5 * tc4 */
+            /* Presumably leaves one tap (p3..q3) per vector, a byte per row. */
+            TRANSPOSE8x8_UB_UH(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src,
+                               q2_src, q3_src, p3_src, p2_src, p1_src, p0_src,
+                               q0_src, q1_src, q2_src, q3_src);
+            /* flagN != 0: half N passes every strong-filter condition. */
+            flag0 = (abs(p3[-4] - p3[-1]) + abs(p3[3] - p3[0]) < beta30 &&
+                     abs(p3[-1] - p3[0]) < tc250 &&
+                     abs(p2[-4] - p2[-1]) + abs(p2[3] - p2[0]) < beta30 &&
+                     abs(p2[-1] - p2[0]) < tc250 &&
+                     (d00 << 1) < beta20 && (d30 << 1) < beta20);
+            cmp0 = __msa_fill_d(flag0);
+
+            flag1 = (abs(p1[-4] - p1[-1]) + abs(p1[3] - p1[0]) < beta30 &&
+                     abs(p1[-1] - p1[0]) < tc254 &&
+                     abs(p0[-4] - p0[-1]) + abs(p0[3] - p0[0]) < beta30 &&
+                     abs(p0[-1] - p0[0]) < tc254 &&
+                     (d04 << 1) < beta20 && (d34 << 1) < beta20);
+            cmp1 = __msa_fill_d(flag1);
+            cmp2 = __msa_ilvev_d(cmp1, cmp0);
+            cmp2 = __msa_ceqi_d(cmp2, 0);  /* all-ones where flag == 0 */
+            /* Presumably zero-extends the low 8 bytes of each tap to u16. */
+            ILVR_B8_UH(zero, p3_src, zero, p2_src, zero, p1_src, zero, p0_src,
+                       zero, q0_src, zero, q1_src, zero, q2_src, zero, q3_src,
+                       p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src,
+                       q3_src);
+            /* tc_pos = 2 * tc per half; bounds the strong-filter change. */
+            cmp0 = (v2i64) __msa_fill_h(tc0 << 1);
+            cmp1 = (v2i64) __msa_fill_h(tc4 << 1);
+            tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
+            tc_neg = -tc_pos;
+            /* Strong filter, p side: dst0 / dst1 / dst2 = new p2 / p1 / p0. */
+            temp0 = (p1_src + p0_src + q0_src);
+
+            temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - p2_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst0 = (v16u8) (temp2 + (v8i16) p2_src);
+
+            temp1 = temp0 + p2_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
+            temp2 = (v8i16) (temp1 - p1_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst1 = (v16u8) (temp2 + (v8i16) p1_src);
+
+            temp1 = (temp0 << 1) + p2_src + q1_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - p0_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst2 = (v16u8) (temp2 + (v8i16) p0_src);
+            /* Keep the input p samples in halves whose p_is_pcm is set. */
+            cmp0 = __msa_fill_d(p_is_pcm0);
+            cmp1 = __msa_fill_d(p_is_pcm4);
+            p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+            p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
+
+            dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
+            dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
+            dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);
+            /* Strong filter, q side: dst5 / dst4 / dst3 = new q2 / q1 / q0. */
+            temp0 = (q1_src + p0_src + q0_src);
+            temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - q2_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst5 = (v16u8) (temp2 + (v8i16) q2_src);
+
+            temp1 = temp0 + q2_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
+            temp2 = (v8i16) (temp1 - q1_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst4 = (v16u8) (temp2 + (v8i16) q1_src);
+
+            temp1 = (temp0 << 1) + p1_src + q2_src;
+            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+            temp2 = (v8i16) (temp1 - q0_src);
+            temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+            dst3 = (v16u8) (temp2 + (v8i16) q0_src);
+            /* Keep the input q samples in halves whose q_is_pcm is set. */
+            cmp0 = __msa_fill_d(q_is_pcm0);
+            cmp1 = __msa_fill_d(q_is_pcm4);
+            q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+            q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
+
+            dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
+            dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
+            dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
+            /* Weak filter: delta0 = (9*(q0-p0) - 3*(q1-p1) + 8) >> 4. */
+            tc_pos >>= 1;  /* back to tc */
+            tc_neg = -tc_pos;
+
+            diff0 = (v8i16) (q0_src - p0_src);
+            diff1 = (v8i16) (q1_src - p1_src);
+            diff0 = (v8i16) (diff0 << 3) + diff0;
+            diff1 = (v8i16) (diff1 << 1) + diff1;
+            delta0 = diff0 - diff1;
+            delta0 = __msa_srari_h(delta0, 4);
+            /* abs_delta0 becomes a lane mask: |delta0| < 10 * tc. */
+            temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
+            abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
+            abs_delta0 = (v8u16) abs_delta0 < temp1;
+            /* temp0 / temp2 = weak p0 / q0; PCM halves keep the input. */
+            delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
+            temp0 = (v8u16) delta0 + p0_src;
+            temp0 = (v8u16) CLIP_SH_0_255(temp0);
+            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+                                        (v16u8) p_is_pcm_vec);
+
+            temp2 = (v8i16) q0_src - delta0;
+            temp2 = CLIP_SH_0_255(temp2);
+            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
+                                        (v16u8) q_is_pcm_vec);
+            /* Masks reused: all-ones where p1 (resp. q1) must not change. */
+            tmp = ((beta + (beta >> 1)) >> 3);
+            cmp0 = __msa_fill_d(!p_is_pcm0 && (dp00 + dp30 < tmp));
+            cmp1 = __msa_fill_d(!p_is_pcm4 && (dp04 + dp34 < tmp));
+            p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+            p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
+
+            cmp0 = (v2i64) __msa_fill_h((!q_is_pcm0) && (dq00 + dq30 < tmp));
+            cmp1 = (v2i64) __msa_fill_h((!q_is_pcm4) && (dq04 + dq34 < tmp));
+            q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+            q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
+
+            tc_pos >>= 1;  /* tc / 2 bounds the p1 / q1 change */
+            tc_neg = -tc_pos;
+            /* delta1 / delta2 end up holding the weak-filtered p1 / q1. */
+            delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
+            delta1 -= (v8i16) p1_src;
+            delta1 += delta0;
+            delta1 >>= 1;
+            delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
+            delta1 = (v8i16) p1_src + (v8i16) delta1;
+            delta1 = CLIP_SH_0_255(delta1);
+            delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
+                                          (v16u8) p_is_pcm_vec);
+
+            delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
+            delta2 = delta2 - (v8i16) q1_src;
+            delta2 = delta2 - delta0;
+            delta2 = delta2 >> 1;
+            delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
+            delta2 = (v8i16) q1_src + (v8i16) delta2;
+            delta2 = CLIP_SH_0_255(delta2);
+            delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
+                                          (v16u8) q_is_pcm_vec);
+            delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
+                                         (v16u8) abs_delta0);
+            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+                                        (v16u8) abs_delta0);
+            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
+                                        (v16u8) abs_delta0);
+            delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
+                                         (v16u8) abs_delta0);
+            /* Where the strong test failed (cmp2 set) use the weak results. */
+            dst2 = __msa_bmnz_v(dst2, (v16u8) temp0, (v16u8) cmp2);
+            dst3 = __msa_bmnz_v(dst3, (v16u8) temp2, (v16u8) cmp2);
+            dst1 = __msa_bmnz_v(dst1, (v16u8) delta1, (v16u8) cmp2);
+            dst4 = __msa_bmnz_v(dst4, (v16u8) delta2, (v16u8) cmp2);
+            dst0 = __msa_bmnz_v(dst0, (v16u8) p2_src, (v16u8) cmp2);
+            dst5 = __msa_bmnz_v(dst5, (v16u8) q2_src, (v16u8) cmp2);
+            /* Restore input where d reaches beta; dst6 / dst7 are scratch. */
+            cmp0 = __msa_fill_d(d00 + d30 >= beta);
+            dst7 = (v16u8) __msa_fill_d(d04 + d34 >= beta);
+            cmp0 = __msa_ilvev_d((v2i64) dst7, cmp0);
+            dst6 = (v16u8) __msa_ceqi_d(cmp0, 0);
+
+            dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, dst6);
+            dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, dst6);
+            dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, dst6);
+            dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, dst6);
+            dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, dst6);
+            dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, dst6);
+            /* Narrow the six result taps back to one byte per row. */
+            PCKEV_B4_UB(dst0, dst0, dst1, dst1, dst2, dst2, dst3, dst3,
+                        dst0, dst1, dst2, dst3);
+            PCKEV_B2_UB(dst4, dst4, dst5, dst5, dst4, dst5);
+            /* NOTE(review): dst6 / dst7 presumably only feed unstored bytes. */
+            TRANSPOSE8x8_UB_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7,
+                               dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+            src += 1;  /* now 3 pixels left of the edge, i.e. at p2 */
+            /* Each row gets 4 + 2 bytes, covering the taps p2..q2. */
+            tmp2 = __msa_copy_u_w((v4i32) dst0, 0);
+            tmp0 = __msa_copy_u_h((v8i16) dst0, 2);
+            tmp3 = __msa_copy_u_w((v4i32) dst1, 0);
+            tmp1 = __msa_copy_u_h((v8i16) dst1, 2);
+            SW(tmp2, src);
+            SH(tmp0, src + 4);
+            src += stride;
+            SW(tmp3, src);
+            SH(tmp1, src + 4);
+            src += stride;
+
+            tmp2 = __msa_copy_u_w((v4i32) dst2, 0);
+            tmp0 = __msa_copy_u_h((v8i16) dst2, 2);
+            tmp3 = __msa_copy_u_w((v4i32) dst3, 0);
+            tmp1 = __msa_copy_u_h((v8i16) dst3, 2);
+            SW(tmp2, src);
+            SH(tmp0, src + 4);
+            src += stride;
+            SW(tmp3, src);
+            SH(tmp1, src + 4);
+            src += stride;
+
+            tmp2 = __msa_copy_u_w((v4i32) dst4, 0);
+            tmp0 = __msa_copy_u_h((v8i16) dst4, 2);
+            tmp3 = __msa_copy_u_w((v4i32) dst5, 0);
+            tmp1 = __msa_copy_u_h((v8i16) dst5, 2);
+            SW(tmp2, src);
+            SH(tmp0, src + 4);
+            src += stride;
+            SW(tmp3, src);
+            SH(tmp1, src + 4);
+            src += stride;
+
+            tmp2 = __msa_copy_u_w((v4i32) dst6, 0);
+            tmp0 = __msa_copy_u_h((v8i16) dst6, 2);
+            tmp3 = __msa_copy_u_w((v4i32) dst7, 0);
+            tmp1 = __msa_copy_u_h((v8i16) dst7, 2);
+            SW(tmp2, src);
+            SH(tmp0, src + 4);
+            src += stride;
+            SW(tmp3, src);
+            SH(tmp1, src + 4);
+        }
+    }
+}
+
+static void hevc_loopfilter_chroma_hor_msa(uint8_t *src, int32_t stride,
+                                           int32_t *tc, uint8_t *p_is_pcm,
+                                           uint8_t *q_is_pcm)
+{   /* Deblocks 8 chroma columns across the horizontal edge above src. */
+    uint8_t *p1_ptr = src - (stride << 1);  /* two rows above the edge */
+    uint8_t *p0_ptr = src - stride;
+    uint8_t *q0_ptr = src;                  /* two rows starting at src */
+    uint8_t *q1_ptr = src + stride;
+    v2i64 cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
+    v8u16 p1, p0, q0, q1;
+    v8i16 tc_pos, tc_neg;
+    v16i8 zero = { 0 };
+    v8i16 temp0, temp1, delta;
+    /* Skip entirely when neither half has a positive tc. */
+    if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
+        cmp0 = (v2i64) __msa_fill_h(tc[0]);  /* tc[0]: columns 0-3 */
+        cmp1 = (v2i64) __msa_fill_h(tc[1]);  /* tc[1]: columns 4-7 */
+        tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
+        tc_neg = -tc_pos;
+        /* Masks are all-ones in the halves whose PCM flag is zero. */
+        cmp0 = __msa_fill_d(p_is_pcm[0]);
+        cmp1 = __msa_fill_d(p_is_pcm[1]);
+        p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+        p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
+
+        cmp0 = __msa_fill_d(q_is_pcm[0]);
+        cmp1 = __msa_fill_d(q_is_pcm[1]);
+        q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+        q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
+        /* Only p0 and q0 get rewritten; p1 / q1 are read for the delta. */
+        p1 = LD_UH(p1_ptr);
+        p0 = LD_UH(p0_ptr);
+        q0 = LD_UH(q0_ptr);
+        q1 = LD_UH(q1_ptr);
+
+        ILVR_B4_UH(zero, p1, zero, p0, zero, q0, zero, q1, p1, p0, q0, q1);
+        /* delta = clip((4 * (q0 - p0) + p1 - q1 + 4) >> 3, -tc, tc). */
+        temp0 = (v8i16) (q0 - p0);
+        temp1 = (v8i16) (p1 - q1);
+        temp0 <<= 2;
+        temp0 += temp1;
+        delta = __msa_srari_h((v8i16) temp0, 3);
+        delta = CLIP_SH(delta, tc_neg, tc_pos);
+        /* New p0 = p0 + delta clamped to 0..255; PCM halves keep input. */
+        temp0 = (v8i16) ((v8i16) p0 + delta);
+        temp0 = CLIP_SH_0_255(temp0);
+        temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
+                                    (v16u8) p_is_pcm_vec);
+        /* New q0 = q0 - delta, with the same clamping and PCM handling. */
+        temp1 = (v8i16) ((v8i16) q0 - delta);
+        temp1 = CLIP_SH_0_255(temp1);
+        temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0,
+                                    (v16u8) q_is_pcm_vec);
+        /* Halves with tc <= 0 also keep their input samples. */
+        tc_pos = (v8i16) __msa_clei_s_d((v2i64) tc_pos, 0);
+        temp0 = (v8i16) __msa_bmnz_v((v16u8) temp0, (v16u8) p0, (v16u8) tc_pos);
+        temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8) q0, (v16u8) tc_pos);
+        /* Low 8 bytes = new p0 row, high 8 bytes = new q0 row. */
+        temp0 = (v8i16) __msa_pckev_b((v16i8) temp1, (v16i8) temp0);
+        ST8x2_UB(temp0, p0_ptr, stride);
+    }
+}
+
+static void hevc_loopfilter_chroma_ver_msa(uint8_t *src, int32_t stride,
+                                           int32_t *tc, uint8_t *p_is_pcm,
+                                           uint8_t *q_is_pcm)
+{   /* Deblocks 8 chroma rows across the vertical edge left of src[0]. */
+    v2i64 cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8u16 p1, p0, q0, q1;
+    v8i16 tc_pos, tc_neg;
+    v16i8 zero = { 0 };
+    v8i16 temp0, temp1, delta;
+    /* Skip entirely when neither half has a positive tc. */
+    if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
+        cmp0 = (v2i64) __msa_fill_h(tc[0]);  /* tc[0]: rows 0-3 */
+        cmp1 = (v2i64) __msa_fill_h(tc[1]);  /* tc[1]: rows 4-7 */
+        tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
+        tc_neg = -tc_pos;
+        /* Masks are all-ones in the halves whose PCM flag is zero. */
+        cmp0 = __msa_fill_d(p_is_pcm[0]);
+        cmp1 = __msa_fill_d(p_is_pcm[1]);
+        p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+        p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
+
+        cmp0 = __msa_fill_d(q_is_pcm[0]);
+        cmp1 = __msa_fill_d(q_is_pcm[1]);
+        q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+        q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
+        /* Load 8 rows from 2 pixels left of the edge: p1 p0 | q0 q1. */
+        src -= 2;
+        LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        TRANSPOSE8x4_UB_UH(src0, src1, src2, src3, src4, src5, src6, src7,
+                           p1, p0, q0, q1);
+        ILVR_B4_UH(zero, p1, zero, p0, zero, q0, zero, q1, p1, p0, q0, q1);
+        /* delta = clip((4 * (q0 - p0) + p1 - q1 + 4) >> 3, -tc, tc). */
+        temp0 = (v8i16) (q0 - p0);
+        temp1 = (v8i16) (p1 - q1);
+        temp0 <<= 2;
+        temp0 += temp1;
+        delta = __msa_srari_h((v8i16) temp0, 3);
+        delta = CLIP_SH(delta, tc_neg, tc_pos);
+        /* New p0 = p0 + delta clamped to 0..255; PCM halves keep input. */
+        temp0 = (v8i16) ((v8i16) p0 + delta);
+        temp0 = CLIP_SH_0_255(temp0);
+        temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
+                                    (v16u8) p_is_pcm_vec);
+        /* New q0 = q0 - delta, with the same clamping and PCM handling. */
+        temp1 = (v8i16) ((v8i16) q0 - delta);
+        temp1 = CLIP_SH_0_255(temp1);
+        temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0,
+                                    (v16u8) q_is_pcm_vec);
+        /* Halves with tc <= 0 also keep their input samples. */
+        tc_pos = (v8i16) __msa_clei_s_d((v2i64) tc_pos, 0);
+        temp0 = (v8i16) __msa_bmnz_v((v16u8) temp0, (v16u8) p0, (v16u8) tc_pos);
+        temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8) q0, (v16u8) tc_pos);
+        /* Interleave so each 16-bit lane holds one row's (p0, q0) bytes. */
+        temp0 = (v8i16) __msa_ilvev_b((v16i8) temp1, (v16i8) temp0);
+        /* Presumably 2 bytes per row at column -1: rows 0-3, then 4-7. */
+        src += 1;
+        ST2x4_UB(temp0, 0, src, stride);
+        src += (4 * stride);
+        ST2x4_UB(temp0, 4, src, stride);
+    }
+}
+
+static void hevc_sao_band_filter_4width_msa(uint8_t *dst, int32_t dst_stride,
+                                            uint8_t *src, int32_t src_stride,
+                                            int32_t sao_left_class,
+                                            int16_t *sao_offset_val,
+                                            int32_t height)
+{   /* SAO band offset for a 4-pixel-wide block, src -> dst, 4 rows a pass. */
+    int32_t h_cnt;
+    v16u8 src0, src1, src2, src3;
+    v16i8 src0_r, src1_r;
+    v16i8 offset, offset_val, mask;
+    v16i8 offset0 = { 0 };
+    v16i8 offset1 = { 0 };
+    v16i8 zero = { 0 };
+    v8i16 temp0, temp1, dst0, dst1;
+    /* Take the 4 offsets starting at sao_offset_val[1], narrowed to bytes. */
+    offset_val = LD_SB(sao_offset_val + 1);
+    offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
+    /* Presumably builds a 32-byte band -> offset table in offset0/offset1. */
+    offset_val = __msa_pckev_b(offset_val, offset_val);
+    offset1 = (v16i8) __msa_insve_w((v4i32) offset1, 3, (v4i32) offset_val);
+    offset0 = __msa_sld_b(offset1, offset0, 28 - ((sao_left_class) & 31));
+    offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));
+    /* The two table halves are swapped unless 12 < sao_left_class < 29. */
+    if (!((sao_left_class > 12) & (sao_left_class < 29))) {
+        SWAP(offset0, offset1);
+    }
+    /* Runs height >> 2 times; a height remainder below 4 is not handled. */
+    for (h_cnt = height >> 2; h_cnt--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        /* Presumably gathers 4 bytes from each of the 4 rows into src0_r. */
+        ILVEV_D2_SB(src0, src1, src2, src3, src0_r, src1_r);
+
+        src0_r = (v16i8) __msa_pckev_w((v4i32) src1_r, (v4i32) src0_r);
+        mask = __msa_srli_b(src0_r, 3);  /* band index = pixel >> 3 */
+        offset = __msa_vshf_b(mask, offset1, offset0);  /* table lookup */
+        /* Add the offsets in 16 bits, clamp to 0..255, repack and store. */
+        UNPCK_SB_SH(offset, temp0, temp1);
+        ILVRL_B2_SH(zero, src0_r, dst0, dst1);
+        ADD2(dst0, temp0, dst1, temp1, dst0, dst1);
+        CLIP_SH2_0_255(dst0, dst1);
+        dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
+        ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_sao_band_filter_8width_msa(uint8_t *dst, int32_t dst_stride,
+                                            uint8_t *src, int32_t src_stride,
+                                            int32_t sao_left_class,
+                                            int16_t *sao_offset_val,
+                                            int32_t height)
+{   /* SAO band offset for an 8-pixel-wide block, src -> dst, 4 rows a pass. */
+    int32_t h_cnt;
+    v16u8 src0, src1, src2, src3;
+    v16i8 src0_r, src1_r, mask0, mask1;
+    v16i8 offset, offset_val;
+    v16i8 offset0 = { 0 };
+    v16i8 offset1 = { 0 };
+    v16i8 zero = { 0 };
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 temp0, temp1, temp2, temp3;
+    /* 4 offsets from sao_offset_val[1]; presumably a 32-byte band table. */
+    offset_val = LD_SB(sao_offset_val + 1);
+    offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
+    offset_val = __msa_pckev_b(offset_val, offset_val);
+    offset1 = (v16i8) __msa_insve_w((v4i32) offset1, 3, (v4i32) offset_val);
+    offset0 = __msa_sld_b(offset1, offset0, 28 - ((sao_left_class) & 31));
+    offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));
+    /* The two table halves are swapped unless 12 < sao_left_class < 29. */
+    if (!((sao_left_class > 12) & (sao_left_class < 29))) {
+        SWAP(offset0, offset1);
+    }
+    /* Runs height >> 2 times; a height remainder below 4 is not handled. */
+    for (h_cnt = height >> 2; h_cnt--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        /* Presumably pairs the low 8 bytes of two rows in each vector. */
+        ILVR_D2_SB(src1, src0, src3, src2, src0_r, src1_r);
+
+        mask0 = __msa_srli_b(src0_r, 3);  /* band index = pixel >> 3 */
+        mask1 = __msa_srli_b(src1_r, 3);
+        /* Per-pixel offset lookup, then sign-extension to 16 bits. */
+        offset = __msa_vshf_b(mask0, offset1, offset0);
+        UNPCK_SB_SH(offset, temp0, temp1);
+
+        offset = __msa_vshf_b(mask1, offset1, offset0);
+        UNPCK_SB_SH(offset, temp2, temp3);
+        /* Add the offsets in 16 bits, clamp to 0..255, repack and store. */
+        UNPCK_UB_SH(src0_r, dst0, dst1);
+        UNPCK_UB_SH(src1_r, dst2, dst3);
+        ADD4(dst0, temp0, dst1, temp1, dst2, temp2, dst3, temp3,
+             dst0, dst1, dst2, dst3);
+        CLIP_SH4_0_255(dst0, dst1, dst2, dst3);
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst2);
+        ST8x4_UB(dst0, dst2, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_sao_band_filter_16multiple_msa(uint8_t *dst,
+                                                int32_t dst_stride,
+                                                uint8_t *src,
+                                                int32_t src_stride,
+                                                int32_t sao_left_class,
+                                                int16_t *sao_offset_val,
+                                                int32_t width, int32_t height)
+{ /* SAO band filter processed in 16-byte columns (width >> 4 of them), four rows per outer pass. */
+    int32_t h_cnt, w_cnt;
+    v16u8 src0, src1, src2, src3;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16i8 out0, out1, out2, out3;
+    v16i8 mask0, mask1, mask2, mask3;
+    v16i8 tmp0, tmp1, tmp2, tmp3, offset_val;
+    v16i8 offset0 = { 0 };
+    v16i8 offset1 = { 0 };
+    v16i8 zero = { 0 };
+    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    /* Build the byte lookup vectors offset0/offset1 from the values starting at sao_offset_val[1]. */
+    offset_val = LD_SB(sao_offset_val + 1);
+    offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
+    offset_val = __msa_pckev_b(offset_val, offset_val); /* keep the even (low) byte of each 16-bit value */
+    offset1 = (v16i8) __msa_insve_w((v4i32) offset1, 3, (v4i32) offset_val);
+    offset0 = __msa_sld_b(offset1, offset0, 28 - ((sao_left_class) & 31)); /* slide amount depends on sao_left_class */
+    offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));
+    /* SWAP() is applied to the two lookup vectors unless 12 < sao_left_class < 29. */
+    if (!((sao_left_class > 12) & (sao_left_class < 29))) {
+        SWAP(offset0, offset1);
+    }
+    /* Outer loop: groups of four rows (height >> 2); inner loop: 16-byte columns. */
+    for (h_cnt = height >> 2; h_cnt--;) {
+        for (w_cnt = 0; w_cnt < (width >> 4); w_cnt++) {
+            LD_UB4(src + w_cnt * 16, src_stride, src0, src1, src2, src3);
+            /* Shuffle index per byte: source value shifted right by 3. */
+            mask0 = __msa_srli_b((v16i8) src0, 3);
+            mask1 = __msa_srli_b((v16i8) src1, 3);
+            mask2 = __msa_srli_b((v16i8) src2, 3);
+            mask3 = __msa_srli_b((v16i8) src3, 3);
+            /* Fetch one offset per pixel, widen offsets and pixels to 16 bit, add, clip, repack. */
+            VSHF_B2_SB(offset0, offset1, offset0, offset1, mask0, mask1,
+                       tmp0, tmp1);
+            VSHF_B2_SB(offset0, offset1, offset0, offset1, mask2, mask3,
+                       tmp2, tmp3);
+            UNPCK_SB_SH(tmp0, temp0, temp1);
+            UNPCK_SB_SH(tmp1, temp2, temp3);
+            UNPCK_SB_SH(tmp2, temp4, temp5);
+            UNPCK_SB_SH(tmp3, temp6, temp7);
+            ILVRL_B2_SH(zero, src0, dst0, dst1);
+            ILVRL_B2_SH(zero, src1, dst2, dst3);
+            ILVRL_B2_SH(zero, src2, dst4, dst5);
+            ILVRL_B2_SH(zero, src3, dst6, dst7);
+            ADD4(dst0, temp0, dst1, temp1, dst2, temp2, dst3, temp3,
+                 dst0, dst1, dst2, dst3);
+            ADD4(dst4, temp4, dst5, temp5, dst6, temp6, dst7, temp7,
+                 dst4, dst5, dst6, dst7);
+            CLIP_SH4_0_255(dst0, dst1, dst2, dst3);
+            CLIP_SH4_0_255(dst4, dst5, dst6, dst7);
+            PCKEV_B4_SB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
+                        out0, out1, out2, out3);
+            ST_SB4(out0, out1, out2, out3, dst + w_cnt * 16, dst_stride);
+        }
+        /* Step both pointers past the four rows just handled. */
+        src += src_stride << 2;
+        dst += dst_stride << 2;
+    }
+}
+
+static void hevc_sao_edge_filter_0degree_4width_msa(uint8_t *dst,
+                                                    int32_t dst_stride,
+                                                    uint8_t *src,
+                                                    int32_t src_stride,
+                                                    int16_t *sao_offset_val,
+                                                    int32_t height)
+{ /* SAO edge filter comparing each pixel with neighbours in the same row; 4 bytes stored per row, two rows per pass. */
+    int32_t h_cnt;
+    uint32_t dst_val0, dst_val1;
+    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+    v16u8 src_minus10, src_minus11;
+    v16i8 zero = { 0 };
+    v16i8 src_zero0, src_zero1, src_plus10, src_plus11, dst0;
+    v8i16 offset_mask0, offset_mask1;
+    v8i16 sao_offset, src00, src01;
+    /* sao_offset is read as one v8i16 vector starting at sao_offset_val[0]. */
+    sao_offset = LD_SH(sao_offset_val);
+    src -= 1; /* loads begin one byte before the pixel being filtered */
+
+    for (h_cnt = (height >> 1); h_cnt--;) {
+        LD_UB2(src, src_stride, src_minus10, src_minus11);
+        src += (2 * src_stride);
+        /* Presumably slides the rows by 1 byte (centre) and 2 bytes (following neighbour); SLDI_B2_0_SB is defined elsewhere. */
+        SLDI_B2_0_SB(src_minus10, src_minus11, src_zero0, src_zero1, 1);
+        SLDI_B2_0_SB(src_minus10, src_minus11, src_plus10, src_plus11, 2);
+        ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11,
+                   src_minus10, src_minus11);
+        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
+                   src_zero1);
+        /* Per byte: diff = 0 if neighbour == centre, 1 if neighbour < centre, 0xFF otherwise. */
+        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+        /* Same classification for the second row. */
+        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+        /* Pairwise-add adjacent diff bytes and add 2; the sum is used as a shuffle index below. */
+        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+        /* Appears to be a two-stage lookup: index -> edge_idx -> sao_offset (VSHF_H2_SH is defined elsewhere). */
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
+                   offset_mask0, offset_mask0, offset_mask0);
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
+                   offset_mask1, offset_mask1, offset_mask1);
+        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
+        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
+             offset_mask1);
+        CLIP_SH2_0_255(offset_mask0, offset_mask1);
+        /* Pack both rows to bytes; words 0 and 2 hold the two output rows, one 32-bit store each. */
+        dst0 = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
+        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
+        SW(dst_val0, dst);
+        dst += dst_stride;
+        SW(dst_val1, dst);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_sao_edge_filter_0degree_8width_msa(uint8_t *dst,
+                                                    int32_t dst_stride,
+                                                    uint8_t *src,
+                                                    int32_t src_stride,
+                                                    int16_t *sao_offset_val,
+                                                    int32_t height)
+{ /* SAO edge filter comparing each pixel with neighbours in the same row; 8 bytes stored per row, two rows per pass. */
+    uint8_t *src_minus1;
+    int32_t h_cnt;
+    uint64_t dst_val0, dst_val1;
+    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16i8 dst0, dst1;
+    v16i8 zero = { 0 };
+    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+    v16u8 src_minus10, src_minus11;
+    v16i8 src_zero0, src_plus10, src_zero1, src_plus11;
+    v8i16 sao_offset, src00, offset_mask0, src01, offset_mask1;
+    /* sao_offset is read as one v8i16 vector starting at sao_offset_val[0]. */
+    sao_offset = LD_SH(sao_offset_val);
+
+    for (h_cnt = (height >> 1); h_cnt--;) {
+        src_minus1 = src - 1; /* loads begin one byte before the pixel being filtered */
+        LD_UB2(src_minus1, src_stride, src_minus10, src_minus11);
+        /* Presumably slides the rows by 1 byte (centre) and 2 bytes (following neighbour); SLDI_B2_0_SB is defined elsewhere. */
+        SLDI_B2_0_SB(src_minus10, src_minus11, src_zero0, src_zero1, 1);
+        SLDI_B2_0_SB(src_minus10, src_minus11, src_plus10, src_plus11, 2);
+        ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11,
+                   src_minus10, src_minus11);
+        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1,
+                   src_zero0, src_zero1);
+        /* Per byte: diff = 0 if neighbour == centre, 1 if neighbour < centre, 0xFF otherwise. */
+        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+        /* Same classification for the second row. */
+        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+        /* Pairwise-add adjacent diff bytes and add 2; the sum is used as a shuffle index below. */
+        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+        /* Appears to be a two-stage lookup: index -> edge_idx -> sao_offset (VSHF_H2_SH is defined elsewhere). */
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
+                   offset_mask0, offset_mask0, offset_mask0);
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
+                   offset_mask1, offset_mask1, offset_mask1);
+        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
+
+        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
+             offset_mask1);
+        CLIP_SH2_0_255(offset_mask0, offset_mask1);
+        PCKEV_B2_SB(offset_mask0, offset_mask0, offset_mask1, offset_mask1,
+                    dst0, dst1);
+        /* One 64-bit store per output row, taken from doubleword 0 of each packed vector. */
+        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
+        dst_val1 = __msa_copy_u_d((v2i64) dst1, 0);
+        SD(dst_val0, dst);
+        dst += dst_stride;
+        SD(dst_val1, dst);
+        dst += dst_stride;
+        src += (src_stride << 1);
+    }
+}
+
+static void hevc_sao_edge_filter_0degree_16multiple_msa(uint8_t *dst,
+                                                        int32_t dst_stride,
+                                                        uint8_t *src,
+                                                        int32_t src_stride,
+                                                        int16_t *sao_offset_val,
+                                                        int32_t width,
+                                                        int32_t height)
+{ /* SAO edge filter with same-row neighbours, done in 16-byte columns (width >> 4), four rows per outer pass. */
+    uint8_t *dst_ptr, *src_minus1;
+    int32_t h_cnt, v_cnt;
+    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16i8 sao_offset;
+    v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
+    v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
+    v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
+    v16u8 diff_plus13;
+    v16u8 src10, src11, src12, src13, dst0, dst1, dst2, dst3;
+    v16u8 src_minus10, src_minus11, src_minus12, src_minus13;
+    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3;
+    v16i8 src_zero0, src_zero1, src_zero2, src_zero3;
+    v16i8 src_plus10, src_plus11, src_plus12, src_plus13;
+    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    /* Load the 16-bit offsets and keep only their even (low) bytes so they can be byte-shuffled. */
+    sao_offset = LD_SB(sao_offset_val);
+    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
+
+    for (h_cnt = (height >> 2); h_cnt--;) {
+        src_minus1 = src - 1; /* loads begin one byte before the first pixel of the row */
+        LD_UB4(src_minus1, src_stride,
+               src_minus10, src_minus11, src_minus12, src_minus13);
+        /* src_minus1x hold the previously loaded 16 bytes; src1x receive the next 16 bytes of each row. */
+        for (v_cnt = 0; v_cnt < (width >> 4); v_cnt++) {
+            src_minus1 += 16;
+            dst_ptr = dst + (v_cnt << 4);
+            LD_UB4(src_minus1, src_stride, src10, src11, src12, src13);
+            /* Presumably slides across previous/current vectors by 1 (centre) and 2 (following neighbour) bytes. */
+            SLDI_B2_SB(src10, src11, src_minus10, src_minus11, src_zero0,
+                       src_zero1, 1);
+            SLDI_B2_SB(src12, src13, src_minus12, src_minus13, src_zero2,
+                       src_zero3, 1);
+            SLDI_B2_SB(src10, src11, src_minus10, src_minus11, src_plus10,
+                       src_plus11, 2);
+            SLDI_B2_SB(src12, src13, src_minus12, src_minus13, src_plus12,
+                       src_plus13, 2);
+            /* Equality masks of the centre against the minus and plus neighbours. */
+            cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+            cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10);
+            cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+            cmp_plus11 = ((v16u8) src_zero1 == (v16u8) src_plus11);
+            cmp_minus12 = ((v16u8) src_zero2 == src_minus12);
+            cmp_plus12 = ((v16u8) src_zero2 == (v16u8) src_plus12);
+            cmp_minus13 = ((v16u8) src_zero3 == src_minus13);
+            cmp_plus13 = ((v16u8) src_zero3 == (v16u8) src_plus13);
+            /* Invert the masks: all bits set where the neighbour differs from the centre. */
+            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
+            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
+            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
+            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
+            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
+            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
+            /* Masks where the neighbour is smaller than the centre (unsigned byte compare). */
+            cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+            cmp_plus10 = ((v16u8) src_plus10 < (v16u8) src_zero0);
+            cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+            cmp_plus11 = ((v16u8) src_plus11 < (v16u8) src_zero1);
+            cmp_minus12 = (src_minus12 < (v16u8) src_zero2);
+            cmp_plus12 = ((v16u8) src_plus12 < (v16u8) src_zero2);
+            cmp_minus13 = (src_minus13 < (v16u8) src_zero3);
+            cmp_plus13 = ((v16u8) src_plus13 < (v16u8) src_zero3);
+            /* Where smaller, replace with 1: each diff byte ends up 0 (equal), 1 (smaller) or 0xFF (larger). */
+            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
+            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
+            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
+            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
+            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
+            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
+            /* index = 2 + both diffs; appears to be mapped through edge_idx and then sao_offset. */
+            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
+                       offset_mask0, offset_mask0, offset_mask0);
+            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
+                       offset_mask1, offset_mask1, offset_mask1);
+            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask2,
+                       offset_mask2, offset_mask2, offset_mask2);
+            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask3,
+                       offset_mask3, offset_mask3, offset_mask3);
+            /* Widen centre pixels and offsets to 16 bit, add, clip to 0..255 (per the macro name), repack. */
+            UNPCK_UB_SH(src_zero0, src0, src1);
+            UNPCK_SB_SH(offset_mask0, temp0, temp1);
+            UNPCK_UB_SH(src_zero1, src2, src3);
+            UNPCK_SB_SH(offset_mask1, temp2, temp3);
+            UNPCK_UB_SH(src_zero2, src4, src5);
+            UNPCK_SB_SH(offset_mask2, temp4, temp5);
+            UNPCK_UB_SH(src_zero3, src6, src7);
+            UNPCK_SB_SH(offset_mask3, temp6, temp7);
+            ADD4(temp0, src0, temp1, src1, temp2, src2, temp3, src3, temp0,
+                 temp1, temp2, temp3);
+            ADD4(temp4, src4, temp5, src5, temp6, src6, temp7, src7, temp4,
+                 temp5, temp6, temp7);
+            CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
+            CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
+            PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
+                        dst0, dst1, dst2, dst3);
+            /* Store the four rows; the vectors just loaded become the previous ones for the next column. */
+            src_minus10 = src10;
+            ST_UB(dst0, dst_ptr);
+            src_minus11 = src11;
+            ST_UB(dst1, dst_ptr + dst_stride);
+            src_minus12 = src12;
+            ST_UB(dst2, dst_ptr + (dst_stride << 1));
+            src_minus13 = src13;
+            ST_UB(dst3, dst_ptr + (dst_stride * 3));
+        }
+        /* Step both pointers past the four rows just handled. */
+        src += (src_stride << 2);
+        dst += (dst_stride << 2);
+    }
+}
+
+static void hevc_sao_edge_filter_90degree_4width_msa(uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     uint8_t *src,
+                                                     int32_t src_stride,
+                                                     int16_t *sao_offset_val,
+                                                     int32_t height)
+{ /* SAO edge filter comparing each pixel with the rows before and after it; 4 bytes stored per row, two rows per pass. */
+    int32_t h_cnt;
+    uint32_t dst_val0, dst_val1;
+    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16i8 dst0;
+    v16i8 zero = { 0 };
+    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+    v16u8 src_minus10, src_minus11, src10, src11;
+    v16i8 src_zero0, src_zero1;
+    v8i16 sao_offset, src00, src01, offset_mask0, offset_mask1;
+    /* sao_offset is read as one v8i16 vector starting at sao_offset_val[0]. */
+    sao_offset = LD_SH(sao_offset_val);
+    /* Prime with two rows starting at src - src_stride (presumably the row above and the first row). */
+    LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11);
+
+    for (h_cnt = (height >> 1); h_cnt--;) {
+        LD_UB2(src + src_stride, src_stride, src10, src11);
+        /* Interleave the rows on either side of each centre row; duplicate the centre bytes to line up with them. */
+        src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
+        src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
+        src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
+        src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
+        /* Per byte: diff = 0 if neighbour == centre, 1 if neighbour < centre, 0xFF otherwise. */
+        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+        /* Same classification for the second row. */
+        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+        /* Pairwise-add adjacent diff bytes and add 2; the sum is used as a shuffle index below. */
+        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+        /* Appears to be a two-stage lookup: index -> edge_idx -> sao_offset (VSHF_H2_SH is defined elsewhere). */
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
+                   offset_mask0, offset_mask0, offset_mask0);
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
+                   offset_mask1, offset_mask1, offset_mask1);
+        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
+        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
+             offset_mask1);
+        CLIP_SH2_0_255(offset_mask0, offset_mask1);
+        dst0 = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+        /* The two rows just loaded become the leading rows of the next pass. */
+        src_minus10 = src10;
+        src_minus11 = src11;
+        /* Words 0 and 2 of dst0 hold the two output rows; one 32-bit store each. */
+        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
+        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
+        SW(dst_val0, dst);
+        dst += dst_stride;
+        SW(dst_val1, dst);
+
+        dst += dst_stride;
+        src += (src_stride << 1);
+    }
+}
+
+static void hevc_sao_edge_filter_90degree_8width_msa(uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     uint8_t *src,
+                                                     int32_t src_stride,
+                                                     int16_t *sao_offset_val,
+                                                     int32_t height)
+{ /* SAO edge filter comparing each pixel with the rows before and after it; 8 bytes stored per row, two rows per pass. */
+    int32_t h_cnt;
+    uint64_t dst_val0, dst_val1;
+    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16i8 zero = { 0 };
+    v16i8 src_zero0, src_zero1, dst0, dst1;
+    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+    v16u8 src_minus10, src_minus11, src10, src11;
+    v8i16 sao_offset, src00, offset_mask0, src01, offset_mask1;
+    /* sao_offset is read as one v8i16 vector starting at sao_offset_val[0]. */
+    sao_offset = LD_SH(sao_offset_val);
+    /* Prime with two rows starting at src - src_stride (presumably the row above and the first row). */
+    LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11);
+
+    for (h_cnt = (height >> 1); h_cnt--;) {
+        LD_UB2(src + src_stride, src_stride, src10, src11);
+        /* Interleave the rows on either side of each centre row; duplicate the centre bytes to line up with them. */
+        src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
+        src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
+        src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
+        src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
+        /* Per byte: diff = 0 if neighbour == centre, 1 if neighbour < centre, 0xFF otherwise. */
+        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+        /* Same classification for the second row. */
+        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+        /* Pairwise-add adjacent diff bytes and add 2; the sum is used as a shuffle index below. */
+        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+        /* Appears to be a two-stage lookup: index -> edge_idx -> sao_offset (VSHF_H2_SH is defined elsewhere). */
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
+                   offset_mask0, offset_mask0, offset_mask0);
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
+                   offset_mask1, offset_mask1, offset_mask1);
+        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
+        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
+             offset_mask1);
+        CLIP_SH2_0_255(offset_mask0, offset_mask1);
+        PCKEV_B2_SB(offset_mask0, offset_mask0, offset_mask1, offset_mask1,
+                    dst0, dst1);
+        /* The two rows just loaded become the leading rows of the next pass. */
+        src_minus10 = src10;
+        src_minus11 = src11;
+        /* One 64-bit store per output row, taken from doubleword 0 of each packed vector. */
+        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
+        dst_val1 = __msa_copy_u_d((v2i64) dst1, 0);
+        SD(dst_val0, dst);
+        dst += dst_stride;
+        SD(dst_val1, dst);
+        dst += dst_stride;
+        src += (src_stride << 1);
+    }
+}
+
+static void hevc_sao_edge_filter_90degree_16multiple_msa(uint8_t *dst,
+                                                         int32_t dst_stride,
+                                                         uint8_t *src,
+                                                         int32_t src_stride,
+                                                         int16_t *
+                                                         sao_offset_val,
+                                                         int32_t width,
+                                                         int32_t height)
+{ /* SAO edge filter with neighbours in the rows before/after; walks 16-byte columns, four rows per inner pass. */
+    uint8_t *src_orig = src;
+    uint8_t *dst_orig = dst;
+    int32_t h_cnt, v_cnt;
+    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
+    v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
+    v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
+    v16u8 diff_plus13;
+    v16u8 src10, src_minus10, dst0, src11, src_minus11, dst1;
+    v16u8 src12, dst2, src13, dst3;
+    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
+    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    /* Load the 16-bit offsets and keep only their even (low) bytes so they can be byte-shuffled. */
+    sao_offset = LD_SB(sao_offset_val);
+    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
+    /* Columns are the outer loop here; src/dst are re-derived from the saved origins for each column. */
+    for (v_cnt = 0; v_cnt < (width >> 4); v_cnt++) {
+        src = src_orig + (v_cnt << 4);
+        dst = dst_orig + (v_cnt << 4);
+        /* Prime with two rows starting at src - src_stride (presumably the row above and the first row). */
+        LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11);
+
+        for (h_cnt = (height >> 2); h_cnt--;) {
+            LD_UB4(src + src_stride, src_stride, src10, src11, src12, src13);
+            /* Centres are src_minus11, src10, src11, src12; each is compared with the vector before and after it. */
+            cmp_minus10 = (src_minus11 == src_minus10);
+            cmp_plus10 = (src_minus11 == src10);
+            cmp_minus11 = (src10 == src_minus11);
+            cmp_plus11 = (src10 == src11);
+            cmp_minus12 = (src11 == src10);
+            cmp_plus12 = (src11 == src12);
+            cmp_minus13 = (src12 == src11);
+            cmp_plus13 = (src12 == src13);
+            /* Invert the masks: all bits set where the neighbour differs from the centre. */
+            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
+            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
+            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
+            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
+            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
+            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
+            /* Masks where the neighbour is smaller than the centre (unsigned byte compare). */
+            cmp_minus10 = (src_minus10 < src_minus11);
+            cmp_plus10 = (src10 < src_minus11);
+            cmp_minus11 = (src_minus11 < src10);
+            cmp_plus11 = (src11 < src10);
+            cmp_minus12 = (src10 < src11);
+            cmp_plus12 = (src12 < src11);
+            cmp_minus13 = (src11 < src12);
+            cmp_plus13 = (src13 < src12);
+            /* Where smaller, replace with 1: each diff byte ends up 0 (equal), 1 (smaller) or 0xFF (larger). */
+            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
+            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
+            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
+            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
+            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
+            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
+            /* index = 2 + both diffs; appears to be mapped through edge_idx and then sao_offset. */
+            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask0, offset_mask0, offset_mask0, offset_mask0);
+            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask1, offset_mask1, offset_mask1, offset_mask1);
+            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask2, offset_mask2, offset_mask2, offset_mask2);
+            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask3, offset_mask3, offset_mask3, offset_mask3);
+            /* Widen centre pixels and offsets to 16 bit, add, clip to 0..255 (per the macro name), repack. */
+            UNPCK_UB_SH(src_minus11, src0, src1);
+            UNPCK_SB_SH(offset_mask0, temp0, temp1);
+            UNPCK_UB_SH(src10, src2, src3);
+            UNPCK_SB_SH(offset_mask1, temp2, temp3);
+            UNPCK_UB_SH(src11, src4, src5);
+            UNPCK_SB_SH(offset_mask2, temp4, temp5);
+            UNPCK_UB_SH(src12, src6, src7);
+            UNPCK_SB_SH(offset_mask3, temp6, temp7);
+            ADD4(temp0, src0, temp1, src1, temp2, src2, temp3, src3, temp0,
+                 temp1, temp2, temp3);
+            ADD4(temp4, src4, temp5, src5, temp6, src6, temp7, src7, temp4,
+                 temp5, temp6, temp7);
+            CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
+            CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
+            PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
+                        dst0, dst1, dst2, dst3);
+            /* The last two vectors loaded become the leading pair for the next four rows. */
+            src_minus10 = src12;
+            src_minus11 = src13;
+
+            ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+
+            src += (src_stride << 2);
+            dst += (dst_stride << 2);
+        }
+    }
+}
+
+static void hevc_sao_edge_filter_45degree_4width_msa(uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     uint8_t *src,
+                                                     int32_t src_stride,
+                                                     int16_t *sao_offset_val,
+                                                     int32_t height)
+{ /* SAO edge filter with diagonal neighbours (row before unshifted, row after slid by 2); 4 bytes per row, two rows per pass. */
+    uint8_t *src_orig;
+    int32_t h_cnt;
+    uint32_t dst_val0, dst_val1;
+    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16i8 zero = { 0 };
+    v16u8 cmp_minus10, diff_minus10, src_minus10, cmp_minus11, diff_minus11;
+    v16u8 src_minus11, src10, src11;
+    v16i8 src_plus0, src_zero0, src_plus1, src_zero1, dst0;
+    v8i16 sao_offset, src00, src01, offset_mask0, offset_mask1;
+    /* sao_offset is read as one v8i16 vector starting at sao_offset_val[0]. */
+    sao_offset = LD_SH(sao_offset_val);
+
+    src_orig = src - 1; /* loads begin one byte before the pixel being filtered */
+    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
+
+    for (h_cnt = (height >> 1); h_cnt--;) {
+        LD_UB2(src_orig + src_stride, src_stride, src10, src11);
+        /* Presumably: centres are the rows slid by 1 byte, the trailing neighbours are the next rows slid by 2. */
+        SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
+        SLDI_B2_0_SB(src10, src11, src_plus0, src_plus1, 2);
+
+        ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10,
+                   src_minus11);
+        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
+                   src_zero1);
+        /* Per byte: diff = 0 if neighbour == centre, 1 if neighbour < centre, 0xFF otherwise. */
+        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+        /* Same classification for the second row. */
+        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+        /* Pairwise-add adjacent diff bytes and add 2; the sum is used as a shuffle index below. */
+        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+        /* Appears to be a two-stage lookup: index -> edge_idx -> sao_offset (VSHF_H2_SH is defined elsewhere). */
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
+                   offset_mask0, offset_mask0, offset_mask0);
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
+                   offset_mask1, offset_mask1, offset_mask1);
+        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
+        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
+             offset_mask1);
+        CLIP_SH2_0_255(offset_mask0, offset_mask1);
+
+        dst0 = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+        /* The two rows just loaded become the leading rows of the next pass. */
+        src_minus10 = src10;
+        src_minus11 = src11;
+        /* Words 0 and 2 of dst0 hold the two output rows; one 32-bit store each. */
+        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
+        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
+        SW(dst_val0, dst);
+        dst += dst_stride;
+        SW(dst_val1, dst);
+
+        dst += dst_stride;
+        src_orig += (src_stride << 1);
+    }
+}
+
+static void hevc_sao_edge_filter_45degree_8width_msa(uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     uint8_t *src,
+                                                     int32_t src_stride,
+                                                     int16_t *sao_offset_val,
+                                                     int32_t height)
+{   /* SAO edge filter, 45-degree class: 8 pixels wide, 2 rows per pass. */
+    uint8_t *src_orig;
+    int32_t h_cnt;
+    uint64_t dst_val0, dst_val1;
+    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16i8 zero = { 0 };
+    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+    v16u8 src_minus10, src10, src_minus11, src11;
+    v16i8 src_zero0, src_plus10, src_zero1, src_plus11, dst0, dst1;
+    v8i16 sao_offset, src00, offset_mask0, src01, offset_mask1;
+    /* Loads start one byte left of src, so lane 0 is the left-hand column. */
+    sao_offset = LD_SH(sao_offset_val);
+    src_orig = src - 1;
+    /* Presumably loads the row above and row 0 (two rows, one stride apart). */
+    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
+    /* Each pass filters two rows. */
+    for (h_cnt = (height >> 1); h_cnt--;) {
+        LD_UB2(src_orig + src_stride, src_stride, src10, src11);
+        /* Centre = rows slid by 1 byte; lower rows slid by 2 (lower-right). */
+        SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
+        SLDI_B2_0_SB(src10, src11, src_plus10, src_plus11, 2);
+        /* Pair the lower-right with the upper-left; duplicate the centre. */
+        ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11,
+                   src_minus10, src_minus11);
+        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1,
+                   src_zero0, src_zero1);
+        /* Per byte: 0 if equal, 1 if neighbour < centre, 0xFF otherwise. */
+        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+        /* Same comparison for the second row. */
+        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+        /* Pairwise sum of the two neighbour results, biased by 2. */
+        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+        /* Presumably a chained lookup: index -> edge_idx -> sao_offset. */
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
+                   offset_mask0, offset_mask0, offset_mask0);
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
+                   offset_mask1, offset_mask1, offset_mask1);
+        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
+        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
+             offset_mask1);
+        CLIP_SH2_0_255(offset_mask0, offset_mask1);
+        PCKEV_B2_SB(offset_mask0, offset_mask0, offset_mask1, offset_mask1,
+                    dst0, dst1);
+        /* Carry the two newest rows over as history for the next pass. */
+        src_minus10 = src10;
+        src_minus11 = src11;
+        /* Doubleword 0 of each result vector is stored, one per row. */
+        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
+        dst_val1 = __msa_copy_u_d((v2i64) dst1, 0);
+        SD(dst_val0, dst);
+        dst += dst_stride;
+        SD(dst_val1, dst);
+
+        dst += dst_stride;
+        src_orig += (src_stride << 1);
+    }
+}
+
+static void hevc_sao_edge_filter_45degree_16multiple_msa(uint8_t *dst,
+                                                         int32_t dst_stride,
+                                                         uint8_t *src,
+                                                         int32_t src_stride,
+                                                         int16_t *
+                                                         sao_offset_val,
+                                                         int32_t width,
+                                                         int32_t height)
+{   /* SAO edge filter, 45-degree class: 16-column tiles, 4 rows per pass. */
+    uint8_t *src_orig = src;
+    uint8_t *dst_orig = dst;
+    int32_t h_cnt, v_cnt;
+    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
+    v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
+    v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
+    v16u8 diff_plus13, src_minus14, src_plus13;
+    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3;
+    v16u8 src10, src_minus10, dst0, src11, src_minus11, dst1;
+    v16u8 src12, src_minus12, dst2, src13, src_minus13, dst3;
+    v16i8 src_zero0, src_plus10, src_zero1, src_plus11, src_zero2, src_plus12;
+    v16i8 src_zero3, sao_offset;
+    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    /* Offsets are narrowed to bytes by keeping the even bytes (pckev.b). */
+    sao_offset = LD_SB(sao_offset_val);
+    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
+    /* Outer loop: bands of 4 rows. Inner loop: tiles of 16 columns. */
+    for (h_cnt = (height >> 2); h_cnt--;) {
+        src_orig = src - 1;
+        dst_orig = dst;
+        LD_UB4(src_orig, src_stride,
+               src_minus11, src_minus12, src_minus13, src_minus14);
+        /* The four rows above were loaded from src - 1 (one byte left). */
+        for (v_cnt = 0; v_cnt < (width >> 4); v_cnt++) {
+            src_minus10 = LD_UB(src_orig - src_stride);
+            LD_UB4(src_orig + 16, src_stride, src10, src11, src12, src13);
+            src_plus13 = LD_UB(src + 1 + (v_cnt << 4) + (src_stride << 2));
+            src_orig += 16;
+            /* Centre rows = loaded rows slid by 1 byte; lower-right by 2. */
+            SLDI_B2_SB(src10, src11, src_minus11, src_minus12, src_zero0,
+                       src_zero1, 1);
+            SLDI_B2_SB(src12, src13, src_minus13, src_minus14, src_zero2,
+                       src_zero3, 1);
+            SLDI_B2_SB(src11, src12, src_minus12, src_minus13, src_plus10,
+                       src_plus11, 2);
+
+            src_plus12 = __msa_sldi_b((v16i8) src13, (v16i8) src_minus14, 2);
+            /* Equality of each centre with its two diagonal neighbours. */
+            cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+            cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10);
+            cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+            cmp_plus11 = ((v16u8) src_zero1 == (v16u8) src_plus11);
+            cmp_minus12 = ((v16u8) src_zero2 == src_minus12);
+            cmp_plus12 = ((v16u8) src_zero2 == (v16u8) src_plus12);
+            cmp_minus13 = ((v16u8) src_zero3 == src_minus13);
+            cmp_plus13 = ((v16u8) src_zero3 == src_plus13);
+            /* Invert: 0 where equal, all ones where different. */
+            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
+            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
+            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
+            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
+            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
+            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
+            /* Lanes where the neighbour is smaller than the centre ... */
+            cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+            cmp_plus10 = ((v16u8) src_plus10 < (v16u8) src_zero0);
+            cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+            cmp_plus11 = ((v16u8) src_plus11 < (v16u8) src_zero1);
+            cmp_minus12 = (src_minus12 < (v16u8) src_zero2);
+            cmp_plus12 = ((v16u8) src_plus12 < (v16u8) src_zero2);
+            cmp_minus13 = (src_minus13 < (v16u8) src_zero3);
+            cmp_plus13 = (src_plus13 < (v16u8) src_zero3);
+            /* ... are overwritten with 1, giving 0 / 1 / 0xFF per byte. */
+            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
+            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
+            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
+            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
+            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
+            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
+            /* Lookup index: 2 plus both neighbour results. */
+            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
+            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
+            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
+            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
+            /* Presumably a chained lookup: index -> edge_idx -> sao_offset. */
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask0, offset_mask0, offset_mask0, offset_mask0);
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask1, offset_mask1, offset_mask1, offset_mask1);
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask2, offset_mask2, offset_mask2, offset_mask2);
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask3, offset_mask3, offset_mask3, offset_mask3);
+            /* Widen pixels and offsets to 16 bits, add, clip, repack. */
+            UNPCK_UB_SH(src_zero0, src0, src1);
+            UNPCK_SB_SH(offset_mask0, temp0, temp1);
+            UNPCK_UB_SH(src_zero1, src2, src3);
+            UNPCK_SB_SH(offset_mask1, temp2, temp3);
+            UNPCK_UB_SH(src_zero2, src4, src5);
+            UNPCK_SB_SH(offset_mask2, temp4, temp5);
+            UNPCK_UB_SH(src_zero3, src6, src7);
+            UNPCK_SB_SH(offset_mask3, temp6, temp7);
+            ADD4(temp0, src0, temp1, src1, temp2, src2, temp3, src3, temp0,
+                 temp1, temp2, temp3);
+            ADD4(temp4, src4, temp5, src5, temp6, src6, temp7, src7, temp4,
+                 temp5, temp6, temp7);
+            CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
+            CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
+            PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4,
+                        temp7, temp6, dst0, dst1, dst2, dst3);
+            /* The right-hand loads seed the next tile's left-hand rows. */
+            src_minus11 = src10;
+            src_minus12 = src11;
+            src_minus13 = src12;
+            src_minus14 = src13;
+            /* Four result vectors go out, presumably one per row. */
+            ST_UB4(dst0, dst1, dst2, dst3, dst_orig, dst_stride);
+            dst_orig += 16;
+        }
+
+        src += (src_stride << 2);
+        dst += (dst_stride << 2);
+    }
+}
+
+static void hevc_sao_edge_filter_135degree_4width_msa(uint8_t *dst,
+                                                      int32_t dst_stride,
+                                                      uint8_t *src,
+                                                      int32_t src_stride,
+                                                      int16_t *sao_offset_val,
+                                                      int32_t height)
+{   /* SAO edge filter, 135-degree class: 4 pixels wide, 2 rows per pass. */
+    uint8_t *src_orig;
+    int32_t h_cnt;
+    uint32_t dst_val0, dst_val1;
+    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16i8 zero = { 0 };
+    v16i8 src_zero0, src_zero1, dst0;
+    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+    v16u8 src_minus10, src10, src_minus11, src11;
+    v8i16 offset_mask0, offset_mask1, sao_offset, src00, src01;
+    /* Loads start one byte left of src, so lane 0 is the left-hand column. */
+    sao_offset = LD_SH(sao_offset_val);
+    src_orig = src - 1;
+    /* Presumably loads the row above and row 0 (two rows, one stride apart). */
+    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
+    /* Each pass filters two rows. */
+    for (h_cnt = (height >> 1); h_cnt--;) {
+        LD_UB2(src_orig + src_stride, src_stride, src10, src11);
+        /* Centre = rows slid by 1 byte; upper rows slid by 2 (upper-right). */
+        SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
+        SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2);
+        /* Pair the unshifted next row (lower-left) with the upper-right. */
+        ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
+                   src_minus11);
+        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
+                   src_zero1);
+        /* Per byte: 0 if equal, 1 if neighbour < centre, 0xFF otherwise. */
+        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+        /* Same comparison for the second row. */
+        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+        /* Pairwise sum of the two neighbour results, biased by 2. */
+        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+        /* Presumably a chained lookup: index -> edge_idx -> sao_offset. */
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
+                   offset_mask0, offset_mask0, offset_mask0);
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
+                   offset_mask1, offset_mask1, offset_mask1);
+        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
+        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
+             offset_mask1);
+        CLIP_SH2_0_255(offset_mask0, offset_mask1);
+        dst0 = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
+        /* Carry the two newest rows over as history for the next pass. */
+        src_minus10 = src10;
+        src_minus11 = src11;
+        /* Words 0 and 2 of dst0 are stored, one per output row. */
+        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
+        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
+
+        SW(dst_val0, dst);
+        dst += dst_stride;
+        SW(dst_val1, dst);
+
+        dst += dst_stride;
+        src_orig += (src_stride << 1);
+    }
+}
+
+static void hevc_sao_edge_filter_135degree_8width_msa(uint8_t *dst,
+                                                      int32_t dst_stride,
+                                                      uint8_t *src,
+                                                      int32_t src_stride,
+                                                      int16_t *sao_offset_val,
+                                                      int32_t height)
+{   /* SAO edge filter, 135-degree class: 8 pixels wide, 2 rows per pass. */
+    uint8_t *src_orig;
+    int32_t h_cnt;
+    uint64_t dst_val0, dst_val1;
+    v8i16 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16i8 zero = { 0 };
+    v16i8 src_zero0, src_zero1, dst0, dst1;
+    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+    v16u8 src_minus10, src10, src_minus11, src11;
+    v8i16 sao_offset, src00, offset_mask0, src01, offset_mask1;
+    /* Loads start one byte left of src, so lane 0 is the left-hand column. */
+    sao_offset = LD_SH(sao_offset_val);
+    src_orig = src - 1;
+    /* Presumably loads the row above and row 0 (two rows, one stride apart). */
+    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
+    /* Each pass filters two rows. */
+    for (h_cnt = (height >> 1); h_cnt--;) {
+        LD_UB2(src_orig + src_stride, src_stride, src10, src11);
+        /* Centre = rows slid by 1 byte; upper rows slid by 2 (upper-right). */
+        SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
+        SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2);
+        ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
+                   src_minus11);
+        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
+                   src_zero1);
+        /* Per byte: 0 if equal, 1 if neighbour < centre, 0xFF otherwise. */
+        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+        /* Same comparison for the second row. */
+        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+        /* Pairwise sum of the two neighbour results, biased by 2. */
+        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
+        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
+        /* Presumably a chained lookup: index -> edge_idx -> sao_offset. */
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
+                   offset_mask0, offset_mask0, offset_mask0);
+        VSHF_H2_SH(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
+                   offset_mask1, offset_mask1, offset_mask1);
+        ILVEV_B2_SH(src_zero0, zero, src_zero1, zero, src00, src01);
+        ADD2(offset_mask0, src00, offset_mask1, src01, offset_mask0,
+             offset_mask1);
+        CLIP_SH2_0_255(offset_mask0, offset_mask1);
+        PCKEV_B2_SB(offset_mask0, offset_mask0, offset_mask1, offset_mask1,
+                    dst0, dst1);
+        /* Carry the two newest rows over as history for the next pass. */
+        src_minus10 = src10;
+        src_minus11 = src11;
+        /* Doubleword 0 of each result vector is stored, one per row. */
+        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
+        dst_val1 = __msa_copy_u_d((v2i64) dst1, 0);
+
+        SD(dst_val0, dst);
+        dst += dst_stride;
+        SD(dst_val1, dst);
+        dst += dst_stride;
+
+        src_orig += (src_stride << 1);
+    }
+}
+
+static void hevc_sao_edge_filter_135degree_16multiple_msa(uint8_t *dst,
+                                                          int32_t dst_stride,
+                                                          uint8_t *src,
+                                                          int32_t src_stride,
+                                                          int16_t *
+                                                          sao_offset_val,
+                                                          int32_t width,
+                                                          int32_t height)
+{   /* SAO edge filter, 135-degree class: 16-column tiles, 4 rows per pass. */
+    uint8_t *src_orig, *dst_orig;
+    int32_t h_cnt, v_cnt;
+    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    v16u8 const1 = (v16u8) __msa_ldi_b(1);
+    v16u8 dst0, dst1, dst2, dst3;
+    v16u8 cmp_minus10, cmp_minus11, cmp_minus12, cmp_minus13, cmp_plus10;
+    v16u8 cmp_plus11, cmp_plus12, cmp_plus13, diff_minus10, diff_minus11;
+    v16u8 diff_minus12, diff_minus13, diff_plus10, diff_plus11, diff_plus12;
+    v16u8 diff_plus13, src10, src11, src12, src13, src_minus10, src_minus11;
+    v16u8 src_plus10, src_plus11, src_plus12, src_plus13;
+    v16i8 src_minus12, src_minus13, src_zero0, src_zero1, src_zero2, src_zero3;
+    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
+    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    /* Offsets are narrowed to bytes by keeping the even bytes (pckev.b). */
+    sao_offset = LD_SB(sao_offset_val);
+    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
+    /* Outer loop: bands of 4 rows. Inner loop: tiles of 16 columns. */
+    for (h_cnt = (height >> 2); h_cnt--;) {
+        src_orig = src - 1;
+        dst_orig = dst;
+        /* Loaded from src - 1; each row is compared as the one above's "plus". */
+        LD_UB4(src_orig, src_stride,
+               src_minus11, src_plus10, src_plus11, src_plus12);
+
+        for (v_cnt = 0; v_cnt < (width >> 4); v_cnt++) {
+            src_minus10 = LD_UB(src_orig + 2 - src_stride);
+            LD_UB4(src_orig + 16, src_stride, src10, src11, src12, src13);
+            src_plus13 = LD_UB(src_orig + (src_stride << 2));
+            src_orig += 16;
+            /* Row 0: centre, then equality with both diagonal neighbours. */
+            src_zero0 = __msa_sldi_b((v16i8) src10, (v16i8) src_minus11, 1);
+            cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
+            cmp_plus10 = ((v16u8) src_zero0 == src_plus10);
+            /* Row 1: its upper neighbour is row 0 slid by 2 bytes. */
+            src_zero1 = __msa_sldi_b((v16i8) src11, (v16i8) src_plus10, 1);
+            src_minus11 = (v16u8) __msa_sldi_b((v16i8) src10,
+                                               (v16i8) src_minus11, 2);
+            cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
+            cmp_plus11 = ((v16u8) src_zero1 == src_plus11);
+            /* Row 2: same pattern, one row further down. */
+            src_zero2 = __msa_sldi_b((v16i8) src12, (v16i8) src_plus11, 1);
+            src_minus12 = __msa_sldi_b((v16i8) src11, (v16i8) src_plus10, 2);
+            cmp_minus12 = ((v16u8) src_zero2 == (v16u8) src_minus12);
+            cmp_plus12 = ((v16u8) src_zero2 == src_plus12);
+            /* Row 3: src_plus13 comes from the row just below the band. */
+            src_zero3 = __msa_sldi_b((v16i8) src13, (v16i8) src_plus12, 1);
+            src_minus13 = __msa_sldi_b((v16i8) src12, (v16i8) src_plus11, 2);
+            cmp_minus13 = ((v16u8) src_zero3 == (v16u8) src_minus13);
+            cmp_plus13 = ((v16u8) src_zero3 == src_plus13);
+            /* Invert: 0 where equal, all ones where different. */
+            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
+            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
+            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
+            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
+            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
+            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
+            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
+            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
+            /* Lanes where the neighbour is smaller than the centre ... */
+            cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
+            cmp_plus10 = (src_plus10 < (v16u8) src_zero0);
+            cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
+            cmp_plus11 = (src_plus11 < (v16u8) src_zero1);
+            cmp_minus12 = ((v16u8) src_minus12 < (v16u8) src_zero2);
+            cmp_plus12 = (src_plus12 < (v16u8) src_zero2);
+            cmp_minus13 = ((v16u8) src_minus13 < (v16u8) src_zero3);
+            cmp_plus13 = (src_plus13 < (v16u8) src_zero3);
+            /* ... are overwritten with 1, giving 0 / 1 / 0xFF per byte. */
+            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
+            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
+            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
+            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
+            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
+            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
+            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
+            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
+            /* Lookup index: 2 plus both neighbour results. */
+            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
+            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
+            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
+            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
+            /* Presumably a chained lookup: index -> edge_idx -> sao_offset. */
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask0, offset_mask0, offset_mask0, offset_mask0);
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask1, offset_mask1, offset_mask1, offset_mask1);
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask2, offset_mask2, offset_mask2, offset_mask2);
+            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
+                       offset_mask3, offset_mask3, offset_mask3, offset_mask3);
+            /* Widen centre pixels and looked-up offsets to 16 bits. */
+            UNPCK_UB_SH(src_zero0, src0, src1);
+            UNPCK_SB_SH(offset_mask0, temp0, temp1);
+            UNPCK_UB_SH(src_zero1, src2, src3);
+            UNPCK_SB_SH(offset_mask1, temp2, temp3);
+            UNPCK_UB_SH(src_zero2, src4, src5);
+            UNPCK_SB_SH(offset_mask2, temp4, temp5);
+            UNPCK_UB_SH(src_zero3, src6, src7);
+            UNPCK_SB_SH(offset_mask3, temp6, temp7);
+            /* Add offsets to pixels, clip to 0..255, repack to bytes. */
+            ADD4(temp0, src0, temp1, src1, temp2, src2, temp3, src3, temp0,
+                 temp1, temp2, temp3);
+            ADD4(temp4, src4, temp5, src5, temp6, src6, temp7, src7, temp4,
+                 temp5, temp6, temp7);
+            CLIP_SH4_0_255(temp0, temp1, temp2, temp3);
+            CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
+            PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
+                        dst0, dst1, dst2, dst3);
+            /* The right-hand loads seed the next tile's left-hand rows. */
+            src_minus11 = src10;
+            src_plus10 = src11;
+            src_plus11 = src12;
+            src_plus12 = src13;
+            /* Four result vectors go out, presumably one per row. */
+            ST_UB4(dst0, dst1, dst2, dst3, dst_orig, dst_stride);
+            dst_orig += 16;
+        }
+
+        src += (src_stride << 2);
+        dst += (dst_stride << 2);
+    }
+}
+
+void ff_hevc_loop_filter_luma_h_8_msa(uint8_t *src,
+                                      ptrdiff_t src_stride,
+                                      int32_t beta, int32_t *tc,
+                                      uint8_t *no_p, uint8_t *no_q)
+{   /* Exported wrapper: forwards every argument unchanged. */
+    hevc_loopfilter_luma_hor_msa(src, src_stride, beta, tc, no_p, no_q);
+}
+
+void ff_hevc_loop_filter_luma_v_8_msa(uint8_t *src,
+                                      ptrdiff_t src_stride,
+                                      int32_t beta, int32_t *tc,
+                                      uint8_t *no_p, uint8_t *no_q)
+{   /* Exported wrapper: forwards every argument unchanged. */
+    hevc_loopfilter_luma_ver_msa(src, src_stride, beta, tc, no_p, no_q);
+}
+
+void ff_hevc_loop_filter_chroma_h_8_msa(uint8_t *src,
+                                        ptrdiff_t src_stride,
+                                        int32_t *tc, uint8_t *no_p,
+                                        uint8_t *no_q)
+{   /* Exported wrapper: forwards every argument unchanged. */
+    hevc_loopfilter_chroma_hor_msa(src, src_stride, tc, no_p, no_q);
+}
+
+void ff_hevc_loop_filter_chroma_v_8_msa(uint8_t *src,
+                                        ptrdiff_t src_stride,
+                                        int32_t *tc, uint8_t *no_p,
+                                        uint8_t *no_q)
+{   /* Exported wrapper: forwards every argument unchanged. */
+    hevc_loopfilter_chroma_ver_msa(src, src_stride, tc, no_p, no_q);
+}
+
+void ff_hevc_sao_band_filter_0_8_msa(uint8_t *dst, uint8_t *src,
+                                     ptrdiff_t stride_dst, ptrdiff_t stride_src,
+                                     int16_t *sao_offset_val, int sao_left_class,
+                                     int width, int height)
+{   /* Splits the width into a 16-multiple part, an 8 column and a rest. */
+    if (width >> 4) {
+        hevc_sao_band_filter_16multiple_msa(dst, stride_dst, src, stride_src,
+                                            sao_left_class, sao_offset_val,
+                                            width - (width % 16), height);
+        dst += width - (width % 16);
+        src += width - (width % 16);
+        width %= 16;
+    }
+    /* At least 8 columns left: one call to the 8-wide kernel. */
+    if (width >> 3) {
+        hevc_sao_band_filter_8width_msa(dst, stride_dst, src, stride_src,
+                                        sao_left_class, sao_offset_val, height);
+        dst += 8;
+        src += 8;
+        width %= 8;
+    }
+    /* NOTE(review): any rest of 1..7 columns takes the 4-wide kernel. */
+    if (width) {
+        hevc_sao_band_filter_4width_msa(dst, stride_dst, src, stride_src,
+                                        sao_left_class, sao_offset_val, height);
+    }
+}
+
+void ff_hevc_sao_edge_filter_8_msa(uint8_t *dst, uint8_t *src,
+                                   ptrdiff_t stride_dst,
+                                   int16_t *sao_offset_val,
+                                   int eo, int width, int height)
+{   /* Picks the kernel family from eo, then splits the width 16n / 8 / 4. */
+    ptrdiff_t stride_src = (2 * 64 + 32) / sizeof(uint8_t);
+    /* Source stride is fixed at 160 bytes; it is not passed by the caller. */
+    switch (eo) {   /* no default: any other eo value does nothing */
+    case 0:   /* 0-degree kernels */
+        if (width >> 4) {
+            hevc_sao_edge_filter_0degree_16multiple_msa(dst, stride_dst,
+                                                        src, stride_src,
+                                                        sao_offset_val,
+                                                        width - (width % 16),
+                                                        height);
+            dst += width - (width % 16);
+            src += width - (width % 16);
+            width %= 16;
+        }
+
+        if (width >> 3) {
+            hevc_sao_edge_filter_0degree_8width_msa(dst, stride_dst,
+                                                    src, stride_src,
+                                                    sao_offset_val, height);
+            dst += 8;
+            src += 8;
+            width %= 8;
+        }
+        /* NOTE(review): any rest of 1..7 columns takes the 4-wide kernel. */
+        if (width) {
+            hevc_sao_edge_filter_0degree_4width_msa(dst, stride_dst,
+                                                    src, stride_src,
+                                                    sao_offset_val, height);
+        }
+        break;
+
+    case 1:   /* 90-degree kernels, same width split */
+        if (width >> 4) {
+            hevc_sao_edge_filter_90degree_16multiple_msa(dst, stride_dst,
+                                                         src, stride_src,
+                                                         sao_offset_val,
+                                                         width - (width % 16),
+                                                         height);
+            dst += width - (width % 16);
+            src += width - (width % 16);
+            width %= 16;
+        }
+
+        if (width >> 3) {
+            hevc_sao_edge_filter_90degree_8width_msa(dst, stride_dst,
+                                                     src, stride_src,
+                                                     sao_offset_val, height);
+            dst += 8;
+            src += 8;
+            width %= 8;
+        }
+
+        if (width) {
+            hevc_sao_edge_filter_90degree_4width_msa(dst, stride_dst,
+                                                     src, stride_src,
+                                                     sao_offset_val, height);
+        }
+        break;
+
+    case 2:   /* 45-degree kernels, same width split */
+        if (width >> 4) {
+            hevc_sao_edge_filter_45degree_16multiple_msa(dst, stride_dst,
+                                                         src, stride_src,
+                                                         sao_offset_val,
+                                                         width - (width % 16),
+                                                         height);
+            dst += width - (width % 16);
+            src += width - (width % 16);
+            width %= 16;
+        }
+
+        if (width >> 3) {
+            hevc_sao_edge_filter_45degree_8width_msa(dst, stride_dst,
+                                                     src, stride_src,
+                                                     sao_offset_val, height);
+            dst += 8;
+            src += 8;
+            width %= 8;
+        }
+
+        if (width) {
+            hevc_sao_edge_filter_45degree_4width_msa(dst, stride_dst,
+                                                     src, stride_src,
+                                                     sao_offset_val, height);
+        }
+        break;
+
+    case 3:   /* 135-degree kernels, same width split */
+        if (width >> 4) {
+            hevc_sao_edge_filter_135degree_16multiple_msa(dst, stride_dst,
+                                                          src, stride_src,
+                                                          sao_offset_val,
+                                                          width - (width % 16),
+                                                          height);
+            dst += width - (width % 16);
+            src += width - (width % 16);
+            width %= 16;
+        }
+
+        if (width >> 3) {
+            hevc_sao_edge_filter_135degree_8width_msa(dst, stride_dst,
+                                                      src, stride_src,
+                                                      sao_offset_val, height);
+            dst += 8;
+            src += 8;
+            width %= 8;
+        }
+
+        if (width) {
+            hevc_sao_edge_filter_135degree_4width_msa(dst, stride_dst,
+                                                      src, stride_src,
+                                                      sao_offset_val, height);
+        }
+        break;
+    }
+}
diff --git a/libavcodec/mips/hevc_macros_msa.h b/libavcodec/mips/hevc_macros_msa.h
new file mode 100644
index 0000000..b06c5ad
--- /dev/null
+++ b/libavcodec/mips/hevc_macros_msa.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_HEVC_MACROS_MSA_H
+#define AVCODEC_MIPS_HEVC_MACROS_MSA_H
+
+#define HEVC_PCK_SW_SB2(in0, in1, out)                            \
+{                                                                 \
+    v8i16 tmp0_m;                                                 \
+    /* Pack even halfwords of in0/in1, then the even bytes. */    \
+    tmp0_m = __msa_pckev_h((v8i16) in0, (v8i16) in1);             \
+    out = (v4i32) __msa_pckev_b((v16i8) tmp0_m, (v16i8) tmp0_m);  \
+}
+
+#define HEVC_PCK_SW_SB4(in0, in1, in2, in3, out)                  \
+{                                                                 \
+    v8i16 tmp0_m, tmp1_m;                                         \
+    /* Four inputs -> even halfwords (x2) -> even bytes. */       \
+    PCKEV_H2_SH(in0, in1, in2, in3, tmp0_m, tmp1_m);              \
+    out = (v4i32) __msa_pckev_b((v16i8) tmp1_m, (v16i8) tmp0_m);  \
+}
+
+#define HEVC_PCK_SW_SB8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1)  \
+{                                                                            \
+    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
+    /* Eight inputs -> four even-halfword packs -> two even-byte packs. */   \
+    PCKEV_H4_SH(in0, in1, in2, in3, in4, in5, in6, in7,                      \
+                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                             \
+    PCKEV_B2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1);                 \
+}
+
+#define HEVC_PCK_SW_SB12(in0, in1, in2, in3, in4, in5, in6, in7,   \
+                         in8, in9, in10, in11, out0, out1, out2)   \
+{                                                                  \
+    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m, tmp4_m, tmp5_m;          \
+    /* Twelve inputs -> six halfword packs -> three byte packs. */ \
+    PCKEV_H4_SH(in0, in1, in2, in3, in4, in5, in6, in7,            \
+                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                   \
+    PCKEV_H2_SH(in8, in9, in10, in11, tmp4_m, tmp5_m);             \
+    PCKEV_B2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1);       \
+    out2 = (v4i32) __msa_pckev_b((v16i8) tmp5_m, (v16i8) tmp4_m);  \
+}
+
+#define HEVC_FILT_8TAP(in0, in1, in2, in3,                       \
+                       filt0, filt1, filt2, filt3)               \
+( {                                                              \
+    v4i32 out_m;                                                 \
+    /* Yields the v4i32 sum of the in/filt dot products. */      \
+    out_m = __msa_dotp_s_w((v8i16) in0, (v8i16) filt0);          \
+    out_m = __msa_dpadd_s_w(out_m, (v8i16) in1, (v8i16) filt1);  \
+    DPADD_SH2_SW(in2, in3, filt2, filt3, out_m, out_m);          \
+    out_m;                                                       \
+} )
+
+#define HEVC_FILT_4TAP(in0, in1, filt0, filt1)           \
+( {   /* yields dot(in0,filt0) + dot(in1,filt1) */       \
+    v4i32 out_m;                                         \
+    /* in0/in1 are passed uncast, unlike filt0/filt1. */ \
+    out_m = __msa_dotp_s_w(in0, (v8i16) filt0);          \
+    out_m = __msa_dpadd_s_w(out_m, in1, (v8i16) filt1);  \
+    out_m;                                               \
+} )
+
+#endif  /* AVCODEC_MIPS_HEVC_MACROS_MSA_H */
diff --git a/libavcodec/mips/hevc_mc_bi_msa.c b/libavcodec/mips/hevc_mc_bi_msa.c
new file mode 100644
index 0000000..8208be3
--- /dev/null
+++ b/libavcodec/mips/hevc_mc_bi_msa.c
@@ -0,0 +1,4462 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "libavcodec/mips/hevcdsp_mips.h"
+#include "libavcodec/mips/hevc_macros_msa.h"
+/* outN = CLIP_0_255(SRARI(vecN + inN, rnd_val)) for N = 0, 1 (add done by ADDS_SH2_SH). */
+#define HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1)  \
+{                                                                     \
+    ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1);                    \
+    SRARI_H2_SH(out0, out1, rnd_val);                                 \
+    CLIP_SH2_0_255(out0, out1);                                       \
+}
+/* Four-vector variant: applies HEVC_BI_RND_CLIP2 to pairs (0, 1) and (2, 3). */
+#define HEVC_BI_RND_CLIP4(in0, in1, in2, in3,                      \
+                          vec0, vec1, vec2, vec3, rnd_val,         \
+                          out0, out1, out2, out3)                  \
+{                                                                  \
+    HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1);  \
+    HEVC_BI_RND_CLIP2(in2, in3, vec2, vec3, rnd_val, out2, out3);  \
+}
+/* Bi-pred copy, 4 columns: dst = clip_0_255(rnd_shift((src0 << 6) + src1, 7)). */
+static void hevc_bi_copy_4w_msa(uint8_t *src0_ptr,
+                                int32_t src_stride,
+                                int16_t *src1_ptr,
+                                int32_t src2_stride,
+                                uint8_t *dst,
+                                int32_t dst_stride,
+                                int32_t height)
+{
+    v16i8 zero = { 0 };
+    /* NOTE(review): heights other than 2, 4 or a multiple of 8 fall through and write nothing */
+    if (2 == height) {
+        v16i8 src0, src1;
+        v8i16 dst0, in0, in1;
+
+        LD_SB2(src0_ptr, src_stride, src0, src1);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        /* merge both rows: low word of each src0 row, low doubleword of each src1 row */
+        src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0);
+        in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
+        /* zero-extend the pixels to halfwords (interleave with zero) before the << 6 scaling */
+        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
+        dst0 <<= 6;
+        dst0 += in0;
+        dst0 = __msa_srari_h(dst0, 7);
+        dst0 = CLIP_SH_0_255(dst0);
+        /* narrow back to bytes; ST4x2_UB presumably stores 4 bytes on each of 2 rows */
+        dst0 = (v8i16) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
+        ST4x2_UB(dst0, dst, dst_stride);
+    } else if (4 == height) {
+        v16i8 src0, src1, src2, src3;
+        v8i16 dst0, dst1;
+        v8i16 in0, in1, in2, in3;
+
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        ILVR_W2_SB(src1, src0, src3, src2, src0, src1);
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+        ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1);
+        /* same arithmetic as the 2-row case, two row pairs at a time */
+        dst0 <<= 6;
+        dst1 <<= 6;
+        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
+
+        dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
+        ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+    } else if (0 == height % 8) {
+        uint32_t loop_cnt;
+        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+        v8i16 dst0, dst1, dst2, dst3;
+        v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+        /* one pass per 8 rows; all three pointers advance by 8 rows each pass */
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD_SB8(src0_ptr, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src0_ptr += (8 * src_stride);
+
+            LD_SH8(src1_ptr, src2_stride,
+                   in0, in1, in2, in3, in4, in5, in6, in7);
+            src1_ptr += (8 * src2_stride);
+            /* pair up adjacent rows so each vector carries two 4-sample rows */
+            ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+            ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+
+            ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
+                       src0, src1, src2, src3);
+            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       dst0, dst1, dst2, dst3);
+
+            SLLI_4V(dst0, dst1, dst2, dst3, 6);
+            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                              dst0, dst1, dst2, dst3, 7,
+                              dst0, dst1, dst2, dst3);
+
+            PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+            ST4x8_UB(dst0, dst1, dst, dst_stride);
+            dst += (8 * dst_stride);
+        }
+    }
+}
+/* Bi-pred copy, 6 columns: dst = clip_0_255(rnd_shift((src0 << 6) + src1, 7)), 8 rows per pass. */
+static void hevc_bi_copy_6w_msa(uint8_t *src0_ptr,
+                                int32_t src_stride,
+                                int16_t *src1_ptr,
+                                int32_t src2_stride,
+                                uint8_t *dst,
+                                int32_t dst_stride,
+                                int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 zero = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    /* NOTE(review): height >> 3 passes, so any rows beyond a multiple of 8 are not processed */
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src0_ptr, src_stride,
+               src0, src1, src2, src3, src4, src5, src6, src7);
+        src0_ptr += (8 * src_stride);
+        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
+        src1_ptr += (8 * src2_stride);
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   dst0, dst1, dst2, dst3);
+        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
+                   dst4, dst5, dst6, dst7);
+        /* scale the widened pixels by 64 before adding the 16-bit src1 samples */
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        SLLI_4V(dst4, dst5, dst6, dst7, 6);
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+        /* rows 0-3 of this pass */
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST6x4_UB(dst0, dst1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* rows 4-7 of this pass */
+        HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
+                          dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);
+
+        PCKEV_B2_SH(dst5, dst4, dst7, dst6, dst4, dst5);
+        ST6x4_UB(dst4, dst5, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+/* Bi-pred copy, 8 columns: dst = clip_0_255(rnd_shift((src0 << 6) + src1, 7)). */
+static void hevc_bi_copy_8w_msa(uint8_t *src0_ptr,
+                                int32_t src_stride,
+                                int16_t *src1_ptr,
+                                int32_t src2_stride,
+                                uint8_t *dst,
+                                int32_t dst_stride,
+                                int32_t height)
+{
+    v16i8 zero = { 0 };
+    /* NOTE(review): heights other than 2, 4, 6 or a multiple of 8 fall through and write nothing */
+    if (2 == height) {
+        v16i8 src0, src1;
+        v8i16 in0, in1;
+        v8i16 dst0, dst1;
+
+        LD_SB2(src0_ptr, src_stride, src0, src1);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1);
+        /* scale widened pixels by 64, add src1, round-shift by 7 and clip */
+        dst0 <<= 6;
+        dst1 <<= 6;
+        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
+
+        dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
+        ST8x2_UB(dst0, dst, dst_stride);
+    } else if (4 == height) {
+        v16i8 src0, src1, src2, src3;
+        v8i16 in0, in1, in2, in3;
+        v8i16 dst0, dst1, dst2, dst3;
+
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   dst0, dst1, dst2, dst3);
+
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST8x4_UB(dst0, dst1, dst, dst_stride);
+    } else if (6 == height) {
+        v16i8 src0, src1, src2, src3, src4, src5;
+        v8i16 in0, in1, in2, in3, in4, in5;
+        v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+
+        LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
+        LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   dst0, dst1, dst2, dst3);
+        ILVR_B2_SH(zero, src4, zero, src5, dst4, dst5);
+
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        dst4 <<= 6;
+        dst5 <<= 6;
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+        HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
+        /* rows 0-3 go out as an 8x4 store, rows 4-5 as an 8x2 store */
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
+        ST8x4_UB(dst0, dst1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        ST8x2_UB(dst2, dst, dst_stride);
+    } else if (0 == height % 8) {
+        v16i8 src0, src1, src2, src3;
+        v8i16 in0, in1, in2, in3;
+        v8i16 dst0, dst1, dst2, dst3;
+        uint32_t loop_cnt;
+        /* each pass handles 8 rows as two identical 4-row halves */
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+            src0_ptr += (4 * src_stride);
+            LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+            src1_ptr += (4 * src2_stride);
+            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       dst0, dst1, dst2, dst3);
+
+            SLLI_4V(dst0, dst1, dst2, dst3, 6);
+            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                              dst0, dst1, dst2, dst3, 7,
+                              dst0, dst1, dst2, dst3);
+
+            PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+            ST8x4_UB(dst0, dst1, dst, dst_stride);
+            dst += (4 * dst_stride);
+            /* second 4-row half of this pass */
+            LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+            src0_ptr += (4 * src_stride);
+            LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+            src1_ptr += (4 * src2_stride);
+            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       dst0, dst1, dst2, dst3);
+
+            SLLI_4V(dst0, dst1, dst2, dst3, 6);
+            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                              dst0, dst1, dst2, dst3, 7,
+                              dst0, dst1, dst2, dst3);
+
+            PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+            ST8x4_UB(dst0, dst1, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+/* Bi-pred copy, 12 columns: dst = clip_0_255(rnd_shift((src0 << 6) + src1, 7)), 4 rows per pass. */
+static void hevc_bi_copy_12w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v16i8 zero = { 0 };
+    /* honour the caller's height (a multiple of 4 is expected) instead of a fixed 16 rows */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        /* src1: columns 0-7 in in0..in3, columns 8.. in in4..in7 (paired up two rows per vector below) */
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
+        src1_ptr += (4 * src2_stride);
+        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   dst0, dst1, dst2, dst3);
+        /* dst0..dst3: widened columns 0-7; dst4/dst5: remaining columns taken from the upper vector half */
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
+        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
+        dst4 <<= 6;
+        dst5 <<= 6;
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+        HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
+        /* narrow to bytes and store the 12x4 tile */
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
+        ST12x4_UB(dst0, dst1, dst2, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+/* Bi-pred copy for widths that are multiples of 16: walks 16-column strips, 4 rows per inner pass. */
+static void hevc_bi_copy_16multx4mult_msa(uint8_t *src0_ptr,
+                                          int32_t src_stride,
+                                          int16_t *src1_ptr,
+                                          int32_t src2_stride,
+                                          uint8_t *dst,
+                                          int32_t dst_stride,
+                                          int32_t height,
+                                          int32_t width)
+{
+    uint32_t loop_cnt;
+    uint32_t cnt;
+    uint8_t *src0_ptr_tmp;
+    int16_t *src1_ptr_tmp;
+    uint8_t *dst_tmp;
+    v16i8 zero = { 0 };
+    /* outer loop: one 16-column strip per iteration (width >> 4 strips) */
+    for (cnt = (width >> 4); cnt--;) {
+        src0_ptr_tmp = src0_ptr;
+        src1_ptr_tmp = src1_ptr;
+        dst_tmp = dst;
+        /* inner loop: height >> 2 passes of 4 rows down the current strip */
+        for (loop_cnt = (height >> 2); loop_cnt--;) {
+            v16i8 src0, src1, src2, src3;
+            v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+            v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+            v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
+
+            LD_SB4(src0_ptr_tmp, src_stride, src0, src1, src2, src3);
+            src0_ptr_tmp += (4 * src_stride);
+            LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
+            LD_SH4(src1_ptr_tmp + 8, src2_stride, in4, in5, in6, in7);
+            src1_ptr_tmp += (4 * src2_stride);
+            /* widen each 16-byte row into two halfword vectors (_r via ILVR, _l via ILVL) */
+            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       dst0_r, dst1_r, dst2_r, dst3_r);
+            ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       dst0_l, dst1_l, dst2_l, dst3_l);
+
+            SLLI_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
+            SLLI_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
+            HEVC_BI_RND_CLIP4(in0, in1, in4, in5,
+                              dst0_r, dst1_r, dst0_l, dst1_l, 7,
+                              dst0_r, dst1_r, dst0_l, dst1_l);
+            /* rows 0-1: recombine the _r/_l halves into full 16-byte rows and store */
+            PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
+            ST_SH2(dst0_r, dst1_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+            /* rows 2-3 */
+            HEVC_BI_RND_CLIP4(in2, in3, in6, in7,
+                              dst2_r, dst3_r, dst2_l, dst3_l, 7,
+                              dst2_r, dst3_r, dst2_l, dst3_l);
+
+            PCKEV_B2_SH(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);
+            ST_SH2(dst2_r, dst3_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+        }
+        /* step all three base pointers to the next 16-column strip */
+        src0_ptr += 16;
+        src1_ptr += 16;
+        dst += 16;
+    }
+}
+/* 16-column bi-pred copy: forwards to hevc_bi_copy_16multx4mult_msa with width 16. */
+static void hevc_bi_copy_16w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 int32_t height)
+{
+    hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, height, 16);
+}
+/* 24-column bi-pred copy: 16 columns via the 16-multiple helper, then 8 via hevc_bi_copy_8w_msa. */
+static void hevc_bi_copy_24w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 int32_t height)
+{
+    hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, height, 16);
+    /* remaining columns 16-23; all three pointers are offset by 16 elements */
+    hevc_bi_copy_8w_msa(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
+                        dst + 16, dst_stride, height);
+}
+/* 32-column bi-pred copy: forwards to hevc_bi_copy_16multx4mult_msa with width 32. */
+static void hevc_bi_copy_32w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 int32_t height)
+{
+    hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, height, 32);
+}
+/* 48-column bi-pred copy: forwards to hevc_bi_copy_16multx4mult_msa with width 48. */
+static void hevc_bi_copy_48w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 int32_t height)
+{
+    hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, height, 48);
+}
+/* 64-column bi-pred copy: forwards to hevc_bi_copy_16multx4mult_msa with width 64. */
+static void hevc_bi_copy_64w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 int32_t height)
+{
+    hevc_bi_copy_16multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, height, 64);
+}
+/* Horizontal 8-tap filter of src0 combined with src1 (add, round-shift 7, clip); 4 columns, 8 rows per pass. */
+static void hevc_hz_bi_8t_4w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter,
+                                 int32_t height)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+    /* start 3 samples left of the output column; the masks below reach bytes k..k+7 for output k */
+    src0_ptr -= 3;
+
+    /* rearranging filter */
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* mask0 pairs bytes (k, k+1); indices 16..20 select from the second shuffle input (a second row) */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    /* 128 << 6 per halfword; presumably offsets the bias from XORI_B8_128_SB - TODO confirm */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    /* NOTE(review): height >> 3 passes, so rows beyond a multiple of 8 are not processed */
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src0_ptr, src_stride, src0, src1, src2, src3,
+               src4, src5, src6, src7);
+        src0_ptr += (8 * src_stride);
+        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
+        src1_ptr += (8 * src2_stride);
+
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+        /* each dstN accumulates the four tap pairs for two source rows, starting from const_vec */
+        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+        /* add src1, round-shift by 7, clip to 0..255 */
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST4x8_UB(dst0, dst1, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+/* Horizontal 8-tap filter of src0 combined with src1 (add, round-shift 7, clip); 8 columns, 4 rows per pass. */
+static void hevc_hz_bi_8t_8w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter,
+                                 int32_t height)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    /* start 3 samples left of the output column; the masks below reach bytes k..k+7 for output k */
+    src0_ptr -= 3;
+    /* 128 << 6 per halfword; presumably offsets the bias from XORI_B4_128_SB - TODO confirm */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    /* filter taps are loaded as halfwords (two int8 taps each) and splatted into filt0..filt3 */
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    /* NOTE(review): height >> 2 passes, so rows beyond a multiple of 4 are not processed */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        /* one source row per dstN: shuffle into tap pairs, accumulate on top of const_vec */
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+        /* add src1, round-shift by 7, clip to 0..255 */
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST8x4_UB(dst0, dst1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+/* 12 columns = 8-column helper + 4-column helper at offset 8; the 4w helper only covers multiples of 8 rows. */
+static void hevc_hz_bi_8t_12w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    hevc_hz_bi_8t_8w_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                         dst, dst_stride, filter, height);
+    hevc_hz_bi_8t_4w_msa(src0_ptr + 8, src_stride, src1_ptr + 8, src2_stride,
+                         dst + 8, dst_stride, filter, height);
+}
+/* Horizontal 8-tap filter of src0 combined with src1 (add, round-shift 7, clip); 16 columns, 2 rows per pass. */
+static void hevc_hz_bi_8t_16w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    /* start 3 samples left; 128 << 6 presumably offsets the XORI_B4_128_SB bias - TODO confirm */
+    src0_ptr -= 3;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    /* NOTE(review): height >> 1 passes of 2 rows; an odd trailing row is not processed */
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src0_ptr, 8, src0, src1);
+        src0_ptr += src_stride;
+        LD_SB2(src0_ptr, 8, src2, src3);
+        src0_ptr += src_stride;
+        LD_SH2(src1_ptr, 8, in0, in1);
+        src1_ptr += src2_stride;
+        LD_SH2(src1_ptr, 8, in2, in3);
+        src1_ptr += src2_stride;
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        /* src0/src1 feed row 0 (loaded 8 apart, presumably columns 0-7 / 8-15); src2/src3 feed row 1 */
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+        /* add src1, round-shift by 7, clip to 0..255 */
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST_SH2(dst0, dst1, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+/* Horizontal 8-tap filter of src0 combined with src1 (add, round-shift 7, clip); 24 columns, 1 row per pass. */
+static void hevc_hz_bi_8t_24w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    uint64_t dst_val0;
+    v16i8 src0, src1, tmp0, tmp1;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2;
+    v8i16 in0, in1, in2;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    /* start 3 samples left; 128 << 6 presumably offsets the XORI_B2_128_SB bias - TODO confirm */
+    src0_ptr = src0_ptr - 3;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* mask4..mask7 are mask0..mask3 shifted by 8, so their indices run past 15 into the second shuffle input */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB2(src0_ptr, 16, src0, src1);
+        src0_ptr += src_stride;
+        LD_SH2(src1_ptr, 8, in0, in1);
+        in2 = LD_SH(src1_ptr + 16);
+        src1_ptr += src2_stride;
+        XORI_B2_128_SB(src0, src1);
+        /* dst0: from src0 alone; dst1: window straddling src0/src1 (mask4..7); dst2: from src1 alone */
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        /* the third vector gets the same add / round-shift / clip sequence spelled out by hand */
+        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
+        dst2 = __msa_adds_s_h(dst2, in2);
+        dst2 = __msa_srari_h(dst2, 7);
+        dst2 = CLIP_SH_0_255(dst2);
+        /* store 16 bytes from tmp0, then doubleword 0 of tmp1 (8 bytes) at dst + 16 */
+        PCKEV_B2_SB(dst1, dst0, dst2, dst2, tmp0, tmp1);
+        dst_val0 = __msa_copy_u_d((v2i64) tmp1, 0);
+        ST_SB(tmp0, dst);
+        SD(dst_val0, dst + 16);
+        dst += dst_stride;
+    }
+}
+/* Horizontal 8-tap filter of src0 combined with src1 (add, round-shift 7, clip); 32 columns, 1 row per pass. */
+static void hevc_hz_bi_8t_32w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, tmp0, tmp1;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    /* start 3 samples left; 128 << 6 presumably offsets the XORI_B3_128_SB bias - TODO confirm */
+    src0_ptr -= 3;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* mask4..mask7 are mask0..mask3 shifted by 8, so their indices run past 15 into the second shuffle input */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB2(src0_ptr, 16, src0, src1);
+        src2 = LD_SB(src0_ptr + 24);
+        src0_ptr += src_stride;
+        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
+        src1_ptr += src2_stride;
+        XORI_B3_128_SB(src0, src1, src2);
+        /* dst0: src0; dst1: window straddling src0/src1 (mask4..7); dst2: src1; dst3: src2 (loaded at +24) */
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+        /* add src1, round-shift by 7, clip to 0..255 */
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+        /* pack to bytes and store the two 16-byte halves of the row */
+        PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
+        ST_SB2(tmp0, tmp1, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_bi_8t_48w_msa(uint8_t *src0_ptr,    /* 8-bit source rows */
+                                  int32_t src_stride,   /* added to src0_ptr once per row */
+                                  int16_t *src1_ptr,    /* 16-bit second input; six v8i16 loads per row */
+                                  int32_t src2_stride,  /* added to src1_ptr once per row (int16_t elements) */
+                                  uint8_t *dst,         /* output; three 16-byte stores per row */
+                                  int32_t dst_stride,   /* added to dst once per row */
+                                  const int8_t *filter, /* filter taps, fetched once with LD_SH */
+                                  int32_t height)       /* number of rows to produce */
+{   /* Horizontal 8-tap pass over 48 columns per row; results are merged with src1_ptr by HEVC_BI_RND_CLIP2. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v16i8 tmp0, tmp1, tmp2;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; /* byte pairs (n, n+1) for n = 0..7 */
+
+    src0_ptr -= 3; /* reading starts 3 bytes left of the position passed in */
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* 128 << 6 = 8192 per halfword; every accumulator below starts from this value */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); /* halfword lanes 0..3 -> filt0..filt3 (presumably one tap pair each) */
+
+    mask1 = mask0 + 2;  /* same pairing, shifted 2 bytes per step */
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;  /* indices 8..16; only used below with two different source vectors */
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = height; loop_cnt--;) { /* one iteration per row */
+        LD_SB2(src0_ptr, 16, src0, src1); /* source vectors at byte offsets 0 and 16 */
+        XORI_B2_128_SB(src0, src1); /* NOTE(review): presumably XORs each byte with 128 (unsigned -> signed range) - confirm in macro header */
+        LD_SH2(src1_ptr, 8, in0, in1); /* second input, element offsets 0 and 8 */
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0); /* four accumulations into dst0 */
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3); /* window straddling src0 and src1 */
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+
+        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1); /* combine with in0/in1; 7 is presumably the rounding shift */
+
+        tmp0 = __msa_pckev_b((v16i8) dst1, (v16i8) dst0); /* even bytes of dst0 and dst1 -> one 16-byte vector */
+        ST_SB(tmp0, dst); /* output bytes 0..15 */
+
+        LD_SB2(src0_ptr + 32, 8, src2, src3); /* byte offsets 32 and 40: src3 overlaps src2 by 8 bytes */
+        XORI_B2_128_SB(src2, src3);
+        src0_ptr += src_stride; /* all source loads for this row are done */
+
+        LD_SH2(src1_ptr + 16, 8, in2, in3); /* element offsets 16 and 24 */
+
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3); /* window straddling src1 and src2 */
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        HEVC_BI_RND_CLIP2(in2, in3, dst2, dst3, 7, dst2, dst3);
+
+        tmp1 = __msa_pckev_b((v16i8) dst3, (v16i8) dst2);
+        ST_SB(tmp1, dst + 16); /* output bytes 16..31 */
+
+        LD_SH2(src1_ptr + 32, 8, in4, in5); /* element offsets 32 and 40 */
+        src1_ptr += src2_stride; /* all second-input loads for this row are done */
+
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst4 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst4, dst4, dst4, dst4);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3); /* src3 starts at offset 40, so no cross-vector masks are needed here */
+        dst5 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst5, dst5, dst5, dst5);
+
+        HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
+
+        tmp2 = __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
+        ST_SB(tmp2, dst + 32); /* output bytes 32..47 */
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_bi_8t_64w_msa(uint8_t *src0_ptr,    /* 8-bit source rows */
+                                  int32_t src_stride,   /* added to src0_ptr once per row */
+                                  int16_t *src1_ptr,    /* 16-bit second input; 2 x four v8i16 loads per row */
+                                  int32_t src2_stride,  /* added to src1_ptr once per row (int16_t elements) */
+                                  uint8_t *dst,         /* output; 2 x 32 bytes stored per row */
+                                  int32_t dst_stride,   /* added to dst once per row */
+                                  const int8_t *filter, /* filter taps, fetched once with LD_SH */
+                                  int32_t height)       /* number of rows to produce */
+{   /* Horizontal 8-tap pass over 64 columns per row, done as two 32-column halves; merged with src1_ptr by HEVC_BI_RND_CLIP4. */
+    uint8_t *src0_ptr_tmp;
+    uint8_t *dst_tmp;
+    int16_t *src1_ptr_tmp;
+    uint32_t loop_cnt;
+    uint32_t cnt;
+    v16i8 src0, src1, src2, tmp0, tmp1;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; /* byte pairs (n, n+1) for n = 0..7 */
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 3; /* reading starts 3 bytes left of the position passed in */
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* 128 << 6 = 8192 per halfword; every accumulator below starts from this value */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); /* halfword lanes 0..3 -> filt0..filt3 (presumably one tap pair each) */
+
+    mask1 = mask0 + 2;  /* same pairing, shifted 2 bytes per step */
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;  /* indices 8..16; only used below with two different source vectors */
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = height; loop_cnt--;) { /* one iteration per row */
+        src0_ptr_tmp = src0_ptr; /* per-row cursors; the row pointers themselves advance after the inner loop */
+        dst_tmp = dst;
+        src1_ptr_tmp = src1_ptr;
+
+        for (cnt = 2; cnt--;) { /* two 32-column halves */
+            LD_SB2(src0_ptr_tmp, 16, src0, src1); /* byte offsets 0 and 16 within this half */
+            src2 = LD_SB(src0_ptr_tmp + 24); /* byte offset 24: overlaps src1 by 8 bytes */
+            src0_ptr_tmp += 32;
+            LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3); /* element offsets 0, 8, 16, 24 */
+            src1_ptr_tmp += 32;
+            XORI_B3_128_SB(src0, src1, src2); /* NOTE(review): presumably XORs each byte with 128 - confirm in macro header */
+
+            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst0 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst0, dst0, dst0, dst0);
+            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                       vec0, vec1, vec2, vec3); /* window straddling src0 and src1 */
+            dst1 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst1, dst1, dst1, dst1);
+            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst2 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst2, dst2, dst2, dst2);
+            VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3); /* src2 starts at offset 24, so no cross-vector masks are needed here */
+            dst3 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst3, dst3, dst3, dst3);
+
+            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                              dst0, dst1, dst2, dst3, 7, /* 7 is presumably the rounding shift */
+                              dst0, dst1, dst2, dst3);
+
+            PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1); /* (dst0,dst1) -> tmp0, (dst2,dst3) -> tmp1 */
+            ST_SB2(tmp0, tmp1, dst_tmp, 16); /* 32 output bytes for this half */
+            dst_tmp += 32;
+        }
+
+        src1_ptr += src2_stride;
+        src0_ptr += src_stride;
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_bi_8t_4w_msa(uint8_t *src0_ptr,    /* 8-bit source rows */
+                                 int32_t src_stride,   /* row pitch used for every src0_ptr load/advance */
+                                 int16_t *src1_ptr,    /* 16-bit second input, 8 rows read per iteration */
+                                 int32_t src2_stride,  /* row pitch of src1_ptr (int16_t elements) */
+                                 uint8_t *dst,         /* output, written by ST4x8_UB */
+                                 int32_t dst_stride,   /* row pitch of dst */
+                                 const int8_t *filter, /* filter taps, fetched once with LD_SH */
+                                 int32_t height)       /* rows requested; only height >> 3 groups of 8 are processed */
+{   /* Vertical 8-tap pass for a 4-column block, 8 output rows per loop iteration. */
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src6, src7, src8, src9, src10;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src11, src12, src13, src14;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;          /* srcAB_r: ILVR_B of rows A and B */
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
+    v16i8 src2110, src4332, src6554, src8776, src10998;         /* ILVR_D of two adjacent row-pair vectors */
+    v16i8 src12111110, src14131312;
+    v8i16 dst10, dst32, dst54, dst76;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= (3 * src_stride); /* reading starts 3 rows above the position passed in */
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* 128 << 6 = 8192 per halfword; every accumulator below starts from this value */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); /* halfword lanes 0..3 -> filt0..filt3 (presumably one tap pair each) */
+
+    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); /* prologue: the first 7 rows */
+    src0_ptr += (7 * src_stride);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+               src2110, src4332, src6554); /* two neighbouring row pairs share one vector */
+    XORI_B3_128_SB(src2110, src4332, src6554); /* applied after interleaving here; NOTE(review): presumably XORs each byte with 128 - confirm */
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) { /* 8 rows per iteration; no tail handling for height % 8 */
+        LD_SB8(src0_ptr, src_stride,
+               src7, src8, src9, src10, src11, src12, src13, src14); /* next 8 source rows */
+        src0_ptr += (8 * src_stride);
+        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
+        src1_ptr += (8 * src2_stride);
+
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1); /* pack the 8 second-input rows pairwise into in0..in3 */
+        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r); /* src6 is the last row of the previous group */
+        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
+                   src1110_r, src1211_r, src1312_r, src1413_r);
+        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
+                   src1413_r, src1312_r,
+                   src8776, src10998, src12111110, src14131312);
+        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
+
+        dst10 = const_vec;
+        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
+                     filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10); /* each dstNM uses a window one step further down */
+        dst32 = const_vec;
+        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
+                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
+        dst54 = const_vec;
+        DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
+                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
+        dst76 = const_vec;
+        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
+                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst10, dst32, dst54, dst76, 7, /* 7 is presumably the rounding shift */
+                          dst10, dst32, dst54, dst76);
+
+        PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54); /* results land in dst10 and dst54 */
+        ST4x8_UB(dst10, dst54, dst, dst_stride);
+        dst += (8 * dst_stride);
+
+        src2110 = src10998;     /* keep the last three row-pair vectors as history for the next group */
+        src4332 = src12111110;
+        src6554 = src14131312;
+        src6 = src14;           /* last loaded row becomes the "previous row" of the next ILVR_B4_SB */
+    }
+}
+
+static void hevc_vt_bi_8t_8w_msa(uint8_t *src0_ptr,    /* 8-bit source rows */
+                                 int32_t src_stride,   /* row pitch used for every src0_ptr load/advance */
+                                 int16_t *src1_ptr,    /* 16-bit second input, 4 rows read per iteration */
+                                 int32_t src2_stride,  /* row pitch of src1_ptr (int16_t elements) */
+                                 uint8_t *dst,         /* output, written by ST8x4_UB */
+                                 int32_t dst_stride,   /* row pitch of dst */
+                                 const int8_t *filter, /* filter taps, fetched once with LD_SH */
+                                 int32_t height)       /* rows requested; only height >> 2 groups of 4 are processed */
+{   /* Vertical 8-tap pass for an 8-column block, 4 output rows per loop iteration. */
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src6, src7, src8, src9, src10;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;   /* srcAB_r: ILVR_B of rows A and B */
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= (3 * src_stride); /* reading starts 3 rows above the position passed in */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* 128 << 6 = 8192 per halfword; every accumulator below starts from this value */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); /* halfword lanes 0..3 -> filt0..filt3 (presumably one tap pair each) */
+
+    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); /* prologue: the first 7 rows */
+    src0_ptr += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); /* NOTE(review): presumably XORs each byte with 128 - confirm in macro header */
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* 4 rows per iteration; no tail handling for height % 4 */
+        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10); /* next 4 source rows */
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r); /* src6 is the last row of the previous group */
+
+        dst0_r = const_vec;
+        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
+                     filt0, filt1, filt2, filt3,
+                     dst0_r, dst0_r, dst0_r, dst0_r); /* window: rows 0..7 */
+        dst1_r = const_vec;
+        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
+                     filt0, filt1, filt2, filt3,
+                     dst1_r, dst1_r, dst1_r, dst1_r); /* window: rows 1..8 */
+        dst2_r = const_vec;
+        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
+                     filt0, filt1, filt2, filt3,
+                     dst2_r, dst2_r, dst2_r, dst2_r); /* window: rows 2..9 */
+        dst3_r = const_vec;
+        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
+                     filt0, filt1, filt2, filt3,
+                     dst3_r, dst3_r, dst3_r, dst3_r); /* window: rows 3..10 */
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0_r, dst1_r, dst2_r, dst3_r, 7, /* 7 is presumably the rounding shift */
+                          dst0_r, dst1_r, dst2_r, dst3_r);
+
+        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r); /* results land in dst0_r and dst1_r */
+        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;  /* slide the row-pair history down by 4 rows */
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+
+        src6 = src10;       /* last loaded row becomes the "previous row" of the next ILVR_B4_SB */
+    }
+}
+
+static void hevc_vt_bi_8t_12w_msa(uint8_t *src0_ptr,    /* 8-bit source rows */
+                                  int32_t src_stride,   /* row pitch used for every src0_ptr load/advance */
+                                  int16_t *src1_ptr,    /* 16-bit second input, read at element offsets 0 and 8 of each row */
+                                  int32_t src2_stride,  /* row pitch of src1_ptr (int16_t elements) */
+                                  uint8_t *dst,         /* output, written by ST12x4_UB */
+                                  int32_t dst_stride,   /* row pitch of dst */
+                                  const int8_t *filter, /* filter taps, fetched once with LD_SH */
+                                  int32_t height)       /* rows requested; only height >> 2 groups of 4 are processed */
+{   /* Vertical 8-tap pass for a 12-column block: the *_r vectors and the packed *_l vectors are filtered separately, 4 rows per iteration. */
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;   /* srcAB_r: ILVR_B of rows A and B */
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;   /* srcAB_l: ILVL_B of rows A and B */
+    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
+    v16i8 src2110, src4332, src6554, src8776, src10998;  /* ILVR_D of two adjacent *_l vectors */
+    v8i16 dst0_l, dst1_l;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= (3 * src_stride); /* reading starts 3 rows above the position passed in */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* 128 << 6 = 8192 per halfword; every accumulator below starts from this value */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); /* halfword lanes 0..3 -> filt0..filt3 (presumably one tap pair each) */
+
+    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); /* prologue: the first 7 rows */
+    src0_ptr += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); /* NOTE(review): presumably XORs each byte with 128 - confirm in macro header */
+
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_l, src32_l, src54_l, src21_l);
+    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
+               src2110, src4332, src6554); /* two neighbouring *_l row pairs share one vector */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* 4 rows per iteration; no tail handling for height % 4 */
+        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10); /* next 4 source rows */
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7); /* same 4 rows, 8 elements further right */
+        src1_ptr += (4 * src2_stride);
+
+        ILVR_D2_SH(in5, in4, in7, in6, in4, in5); /* pack in4..in7 pairwise into in4, in5 */
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r); /* src6 is the last row of the previous group */
+        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_l, src87_l, src98_l, src109_l);
+        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
+
+        dst0_r = const_vec;
+        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
+                     filt0, filt1, filt2, filt3,
+                     dst0_r, dst0_r, dst0_r, dst0_r);
+        dst1_r = const_vec;
+        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
+                     filt0, filt1, filt2, filt3,
+                     dst1_r, dst1_r, dst1_r, dst1_r);
+        dst2_r = const_vec;
+        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
+                     filt0, filt1, filt2, filt3,
+                     dst2_r, dst2_r, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
+                     filt0, filt1, filt2, filt3,
+                     dst3_r, dst3_r, dst3_r, dst3_r);
+        dst0_l = const_vec;
+        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
+                     filt0, filt1, filt2, filt3,
+                     dst0_l, dst0_l, dst0_l, dst0_l); /* packed *_l part, first half of this group */
+        dst1_l = const_vec;
+        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
+                     filt0, filt1, filt2, filt3,
+                     dst1_l, dst1_l, dst1_l, dst1_l); /* packed *_l part, second half of this group */
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0_r, dst1_r, dst2_r, dst3_r, 7, /* 7 is presumably the rounding shift */
+                          dst0_r, dst1_r, dst2_r, dst3_r);
+        HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l);
+
+
+        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r); /* results land in dst0_r and dst1_r */
+        dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
+        ST12x4_UB(dst0_r, dst1_r, dst0_l, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r;  /* slide the row-pair history down by 4 rows */
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src2110 = src6554;  /* same shift for the packed *_l history */
+        src4332 = src8776;
+        src6554 = src10998;
+        src6 = src10;       /* last loaded row becomes the "previous row" of the next interleave */
+    }
+}
+
+static void hevc_vt_bi_8t_16multx2mult_msa(uint8_t *src0_ptr,    /* 8-bit source rows */
+                                           int32_t src_stride,   /* row pitch used for every source load/advance */
+                                           int16_t *src1_ptr,    /* 16-bit second input, read at element offsets 0 and 8 of each strip */
+                                           int32_t src2_stride,  /* row pitch of src1_ptr (int16_t elements) */
+                                           uint8_t *dst,         /* output, 16 bytes stored per row per strip */
+                                           int32_t dst_stride,   /* row pitch of dst */
+                                           const int8_t *filter, /* filter taps, fetched once with LD_SH */
+                                           int32_t height, int32_t width) /* height >> 1 row pairs, width >> 4 strips are processed */
+{   /* Vertical 8-tap pass done strip by strip: each strip is 16 columns wide and is walked 2 rows at a time. */
+    uint8_t *src0_ptr_tmp;
+    int16_t *src1_ptr_tmp;
+    uint8_t *dst_tmp;
+    uint32_t loop_cnt;
+    uint32_t cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src54_r, src76_r;   /* srcAB_r: ILVR_B of rows A and B */
+    v16i8 src21_r, src43_r, src65_r, src87_r;
+    v8i16 dst0_r, dst1_r;
+    v16i8 src10_l, src32_l, src54_l, src76_l;   /* srcAB_l: ILVL_B of rows A and B */
+    v16i8 src21_l, src43_l, src65_l, src87_l;
+    v8i16 dst0_l, dst1_l;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= (3 * src_stride); /* reading starts 3 rows above the position passed in */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* 128 << 6 = 8192 per halfword; every accumulator below starts from this value */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); /* halfword lanes 0..3 -> filt0..filt3 (presumably one tap pair each) */
+
+    for (cnt = (width >> 4); cnt--;) { /* one iteration per 16-column strip */
+        src0_ptr_tmp = src0_ptr; /* per-strip cursors; the base pointers move right after the inner loop */
+        src1_ptr_tmp = src1_ptr;
+        dst_tmp = dst;
+
+        LD_SB7(src0_ptr_tmp, src_stride,
+               src0, src1, src2, src3, src4, src5, src6); /* prologue: the first 7 rows of this strip */
+        src0_ptr_tmp += (7 * src_stride);
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); /* NOTE(review): presumably XORs each byte with 128 - confirm in macro header */
+
+        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+                   src10_r, src32_r, src54_r, src21_r);
+        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+                   src10_l, src32_l, src54_l, src21_l);
+        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+        for (loop_cnt = (height >> 1); loop_cnt--;) { /* 2 rows per iteration; an odd last row is not processed */
+            LD_SB2(src0_ptr_tmp, src_stride, src7, src8); /* next 2 source rows */
+            src0_ptr_tmp += (2 * src_stride);
+            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);       /* two rows at element offset 0 */
+            LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3); /* same two rows at element offset 8 */
+            src1_ptr_tmp += (2 * src2_stride);
+            XORI_B2_128_SB(src7, src8);
+
+            ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); /* src6 is the last row of the previous pair */
+            ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
+
+            dst0_r = const_vec;
+            DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
+                         filt0, filt1, filt2, filt3,
+                         dst0_r, dst0_r, dst0_r, dst0_r); /* first row, *_r half */
+            dst1_r = const_vec;
+            DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
+                         filt0, filt1, filt2, filt3,
+                         dst1_r, dst1_r, dst1_r, dst1_r); /* second row, *_r half */
+            dst0_l = const_vec;
+            DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
+                         filt0, filt1, filt2, filt3,
+                         dst0_l, dst0_l, dst0_l, dst0_l); /* first row, *_l half */
+            dst1_l = const_vec;
+            DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
+                         filt0, filt1, filt2, filt3,
+                         dst1_l, dst1_l, dst1_l, dst1_l); /* second row, *_l half */
+
+            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                              dst0_r, dst1_r, dst0_l, dst1_l, 7, /* 7 is presumably the rounding shift */
+                              dst0_r, dst1_r, dst0_l, dst1_l);
+
+            PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r); /* one 16-byte vector per output row */
+            ST_SH2(dst0_r, dst1_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+
+            src10_r = src32_r;  /* slide the row-pair history down by 2 rows */
+            src32_r = src54_r;
+            src54_r = src76_r;
+            src21_r = src43_r;
+            src43_r = src65_r;
+            src65_r = src87_r;
+            src10_l = src32_l;
+            src32_l = src54_l;
+            src54_l = src76_l;
+            src21_l = src43_l;
+            src43_l = src65_l;
+            src65_l = src87_l;
+            src6 = src8;        /* last loaded row becomes the "previous row" of the next interleave */
+        }
+
+        src0_ptr += 16; /* step right to the next strip: 16 bytes / 16 int16_t elements / 16 bytes */
+        src1_ptr += 16;
+        dst += 16;
+    }
+}
+
+static void hevc_vt_bi_8t_16w_msa(uint8_t *src0_ptr,    /* forwarded unchanged */
+                                  int32_t src_stride,   /* forwarded unchanged */
+                                  int16_t *src1_ptr,    /* forwarded unchanged */
+                                  int32_t src2_stride,  /* forwarded unchanged */
+                                  uint8_t *dst,         /* forwarded unchanged */
+                                  int32_t dst_stride,   /* forwarded unchanged */
+                                  const int8_t *filter, /* forwarded unchanged */
+                                  int32_t height)       /* forwarded unchanged */
+{   /* Fixed-width entry point: delegates to the generic strip routine with width = 16. */
+    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                   dst, dst_stride, filter, height, 16);
+}
+
+static void hevc_vt_bi_8t_24w_msa(uint8_t *src0_ptr,    /* 8-bit source; second call uses src0_ptr + 16 */
+                                  int32_t src_stride,   /* forwarded unchanged to both calls */
+                                  int16_t *src1_ptr,    /* 16-bit second input; second call uses src1_ptr + 16 */
+                                  int32_t src2_stride,  /* forwarded unchanged to both calls */
+                                  uint8_t *dst,         /* output; second call uses dst + 16 */
+                                  int32_t dst_stride,   /* forwarded unchanged to both calls */
+                                  const int8_t *filter, /* forwarded unchanged to both calls */
+                                  int32_t height)       /* forwarded unchanged to both calls */
+{   /* 24 columns = one 16-wide strip (generic routine) + the 8-wide routine on the columns starting at offset 16. */
+    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                   dst, dst_stride, filter, height, 16); /* this call handles rows in pairs (height >> 1) */
+    hevc_vt_bi_8t_8w_msa(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
+                         dst + 16, dst_stride, filter, height); /* this call handles rows in fours (height >> 2); both cover the same rows only if height % 4 == 0 */
+}
+
+static void hevc_vt_bi_8t_32w_msa(uint8_t *src0_ptr,    /* forwarded unchanged */
+                                  int32_t src_stride,   /* forwarded unchanged */
+                                  int16_t *src1_ptr,    /* forwarded unchanged */
+                                  int32_t src2_stride,  /* forwarded unchanged */
+                                  uint8_t *dst,         /* forwarded unchanged */
+                                  int32_t dst_stride,   /* forwarded unchanged */
+                                  const int8_t *filter, /* forwarded unchanged */
+                                  int32_t height)       /* forwarded unchanged */
+{   /* Fixed-width entry point: delegates to the generic strip routine with width = 32 (two 16-column strips). */
+    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                   dst, dst_stride, filter, height, 32);
+}
+
+static void hevc_vt_bi_8t_48w_msa(uint8_t *src0_ptr,    /* forwarded unchanged */
+                                  int32_t src_stride,   /* forwarded unchanged */
+                                  int16_t *src1_ptr,    /* forwarded unchanged */
+                                  int32_t src2_stride,  /* forwarded unchanged */
+                                  uint8_t *dst,         /* forwarded unchanged */
+                                  int32_t dst_stride,   /* forwarded unchanged */
+                                  const int8_t *filter, /* forwarded unchanged */
+                                  int32_t height)       /* forwarded unchanged */
+{   /* Fixed-width entry point: delegates to the generic strip routine with width = 48 (three 16-column strips). */
+    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                   dst, dst_stride, filter, height, 48);
+}
+
+static void hevc_vt_bi_8t_64w_msa(uint8_t *src0_ptr,    /* forwarded unchanged */
+                                  int32_t src_stride,   /* forwarded unchanged */
+                                  int16_t *src1_ptr,    /* forwarded unchanged */
+                                  int32_t src2_stride,  /* forwarded unchanged */
+                                  uint8_t *dst,         /* forwarded unchanged */
+                                  int32_t dst_stride,   /* forwarded unchanged */
+                                  const int8_t *filter, /* forwarded unchanged */
+                                  int32_t height)       /* forwarded unchanged */
+{   /* Fixed-width entry point: delegates to the generic strip routine with width = 64 (four 16-column strips). */
+    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                   dst, dst_stride, filter, height, 64);
+}
+
+static void hevc_hv_bi_8t_4w_msa(uint8_t *src0_ptr,      /* 8-bit source rows */
+                                 int32_t src_stride,     /* row pitch used for every src0_ptr load/advance */
+                                 int16_t *src1_ptr,      /* 16-bit second input, 2 rows read per iteration */
+                                 int32_t src2_stride,    /* row pitch of src1_ptr (int16_t elements) */
+                                 uint8_t *dst,           /* output, written by ST4x2_UB */
+                                 int32_t dst_stride,     /* row pitch of dst */
+                                 const int8_t *filter_x, /* taps for the horizontal stage */
+                                 const int8_t *filter_y, /* taps for the vertical stage (widened to 16 bit below) */
+                                 int32_t height)         /* rows requested; only height >> 1 pairs are processed */
+{   /* Two-stage (horizontal then vertical) 8-tap filter for a 4-column block, 2 output rows per loop iteration. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 in0, in1;
+    v8i16 filt0, filt1, filt2, filt3;
+    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst30, dst41, dst52, dst63, dst66, dst87;     /* dstAB: horizontal results built from source rows B and A together */
+    v4i32 dst0_r, dst1_r, in0_r, in0_l;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; /* indices 0..4 and 16..20: presumably first and second shuffle operand */
+    v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 }; /* halfword order alternating lanes 0..3 with lanes 4..7 */
+
+    src0_ptr -= ((3 * src_stride) + 3); /* reading starts 3 rows above and 3 bytes left of the position passed in */
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); /* halfword lanes 0..3 -> filt0..filt3 (presumably one tap pair each) */
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);                 /* per-byte "is negative" mask of the vertical taps */
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);  /* interleave tap bytes with that mask: 8-bit -> 16-bit sign extension */
+
+    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3); /* presumably word lanes 0..3 -> filt_h0..filt_h3 - confirm in macro header */
+
+    mask1 = mask0 + 2; /* same pairing, shifted 2 bytes per step */
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* 128 << 6 = 8192 per halfword; every horizontal accumulator starts from this value */
+
+    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6); /* prologue: the first 7 rows */
+    src0_ptr += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); /* NOTE(review): presumably XORs each byte with 128 - confirm in macro header */
+
+    /* row 0 row 1 row 2 row 3 */
+    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3); /* rows 0 and 3 shuffled together */
+    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7); /* rows 1 and 4 */
+    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
+               vec8, vec9, vec10, vec11); /* rows 2 and 5 */
+    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
+               vec12, vec13, vec14, vec15); /* rows 3 and 6 */
+
+    dst30 = const_vec;
+    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                 dst30, dst30, dst30, dst30);
+    dst41 = const_vec;
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                 dst41, dst41, dst41, dst41);
+    dst52 = const_vec;
+    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                 dst52, dst52, dst52, dst52);
+    dst63 = const_vec;
+    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
+                 dst63, dst63, dst63, dst63);
+
+    ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
+               dst10_r, dst21_r, dst32_r); /* pairs built with the macro's "right" interleave */
+    dst43_r = __msa_ilvl_h(dst41, dst30);  /* pairs built from the other vector halves via ilvl */
+    dst54_r = __msa_ilvl_h(dst52, dst41);
+    dst65_r = __msa_ilvl_h(dst63, dst52);
+    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1); /* doubleword 1 of dst63 copied to both halves */
+
+    for (loop_cnt = height >> 1; loop_cnt--;) { /* 2 rows per iteration; an odd last row is not processed */
+        LD_SB2(src0_ptr, src_stride, src7, src8); /* next 2 source rows */
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        src1_ptr += (2 * src2_stride);
+
+        in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0); /* pack both second-input rows into in0 */
+        XORI_B2_128_SB(src7, src8);
+
+        VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3); /* rows 7 and 8 shuffled together */
+        dst87 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst87, dst87, dst87, dst87);
+        dst76_r = __msa_ilvr_h(dst87, dst66); /* pair the new result with the one carried in dst66 */
+        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3); /* vertical stage, first output row */
+        dst87_r = __msa_vshf_h((v8i16) mask4, dst87, dst87); /* interleave dst87's two halves with each other */
+        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3); /* vertical stage, second output row */
+
+        dst0_r >>= 6; /* scale the 32-bit vertical results down by 6 bits */
+        dst1_r >>= 6;
+        UNPCK_SH_SW(in0, in0_r, in0_l); /* presumably widens in0 to two 32-bit vectors - confirm in macro header */
+        dst0_r = __msa_adds_s_w(dst0_r, in0_r); /* saturating add of the second input */
+        dst1_r = __msa_adds_s_w(dst1_r, in0_l);
+        SRARI_W2_SW(dst0_r, dst1_r, 7); /* presumably a rounding right shift by 7 - confirm in macro header */
+        dst0_r = CLIP_SW_0_255(dst0_r);
+        dst1_r = CLIP_SW_0_255(dst1_r);
+
+        HEVC_PCK_SW_SB2(dst1_r, dst0_r, dst0_r); /* both rows packed into dst0_r */
+        ST4x2_UB(dst0_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        dst10_r = dst32_r;  /* slide the vertical history down by 2 rows */
+        dst32_r = dst54_r;
+        dst54_r = dst76_r;
+        dst21_r = dst43_r;
+        dst43_r = dst65_r;
+        dst65_r = dst87_r;
+        dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1); /* carry doubleword 1 of dst87 into the next iteration */
+    }
+}
+
+static void hevc_hv_bi_8t_8multx2mult_msa(uint8_t *src0_ptr, /* 8-bit input rows */
+                                          int32_t src_stride, /* src0_ptr row step, in bytes */
+                                          int16_t *src1_ptr, /* 16-bit rows added to the filtered result */
+                                          int32_t src2_stride, /* src1_ptr row step, in int16_t elements */
+                                          uint8_t *dst, /* 8-bit output rows */
+                                          int32_t dst_stride, /* dst row step, in bytes */
+                                          const int8_t *filter_x, /* taps applied along each row */
+                                          const int8_t *filter_y, /* taps applied across rows */
+                                          int32_t height, int32_t width) /* rows (taken in pairs), columns (taken in eights) */
+{ /* Walks the block in 8-column strips and writes two output rows per inner-loop pass. */
+    uint32_t loop_cnt;
+    uint32_t cnt;
+    uint8_t *src0_ptr_tmp;
+    int16_t *src1_ptr_tmp;
+    uint8_t *dst_tmp;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 in0, in1;
+    v4i32 in0_r, in0_l, in1_r, in1_l;
+    v8i16 filt0, filt1, filt2, filt3;
+    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; /* pairs each byte with its right-hand neighbour */
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
+
+    src0_ptr -= ((3 * src_stride) + 3); /* step back 3 rows and 3 columns (presumably the leading 8-tap support) */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* every halfword lane now holds 128 << 6 */
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); /* presumably one halfword (two taps) broadcast per filtN */
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); /* all-ones byte wherever a tap is negative */
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); /* tap byte + sign byte: taps widened to 16 bits */
+    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask1 = mask0 + 2; /* same pairing pattern shifted by 2, 4 and 6 bytes */
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (cnt = width >> 3; cnt--;) { /* one pass per 8-column strip */
+        src0_ptr_tmp = src0_ptr;
+        dst_tmp = dst;
+        src1_ptr_tmp = src1_ptr;
+
+        LD_SB7(src0_ptr_tmp, src_stride,
+               src0, src1, src2, src3, src4, src5, src6);
+        src0_ptr_tmp += (7 * src_stride); /* seven rows are consumed before the row loop starts */
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+        /* row 0 row 1 row 2 row 3 */
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec12, vec13, vec14, vec15);
+        dst0 = const_vec; /* each row accumulator starts from const_vec */
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, /* same treatment for rows 4, 5 and 6 */
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        dst4 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst4, dst4, dst4, dst4);
+        dst5 = const_vec;
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                     dst5, dst5, dst5, dst5);
+        dst6 = const_vec;
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                     dst6, dst6, dst6, dst6);
+
+        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, /* pair up neighbouring filtered rows for the vertical pass */
+                   dst10_r, dst32_r, dst54_r, dst21_r);
+        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
+        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
+                   dst10_l, dst32_l, dst54_l, dst21_l);
+        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
+
+        for (loop_cnt = height >> 1; loop_cnt--;) { /* two output rows per pass */
+            /* row 7 */
+            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
+            XORI_B2_128_SB(src7, src8);
+            src0_ptr_tmp += 2 * src_stride;
+
+            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1); /* the two matching rows of the 16-bit input */
+            src1_ptr_tmp += (2 * src2_stride);
+
+            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst7 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst7, dst7, dst7, dst7);
+
+            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, /* vertical pass for the first output row */
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_r >>= 6; /* scale the 32-bit vertical sums down by 6 bits */
+            dst0_l >>= 6;
+
+            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst8 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst8, dst8, dst8, dst8);
+
+            ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
+            dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, /* vertical pass for the second output row */
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst1_r >>= 6;
+            dst1_l >>= 6;
+
+            UNPCK_SH_SW(in0, in0_r, in0_l); /* presumably widens the 16-bit input to 32-bit lanes */
+            UNPCK_SH_SW(in1, in1_r, in1_l);
+            in0_r = __msa_adds_s_w(in0_r, dst0_r); /* saturating signed add of input and filtered value */
+            in0_l = __msa_adds_s_w(in0_l, dst0_l);
+            in1_r = __msa_adds_s_w(in1_r, dst1_r);
+            in1_l = __msa_adds_s_w(in1_l, dst1_l);
+            SRARI_W4_SW(in0_r, in0_l, in1_r, in1_l, 7); /* presumably a rounding right shift by 7 */
+            in0_r = CLIP_SW_0_255(in0_r);
+            in0_l = CLIP_SW_0_255(in0_l);
+            in1_r = CLIP_SW_0_255(in1_r);
+            in1_l = CLIP_SW_0_255(in1_l);
+
+            HEVC_PCK_SW_SB4(in0_l, in0_r, in1_l, in1_r, dst0_r);
+            ST8x2_UB(dst0_r, dst_tmp, dst_stride); /* presumably writes 8 bytes to each of two rows */
+            dst_tmp += (2 * dst_stride);
+
+            dst10_r = dst32_r; /* slide the row-pair history forward by two rows */
+            dst32_r = dst54_r;
+            dst54_r = dst76_r;
+            dst10_l = dst32_l;
+            dst32_l = dst54_l;
+            dst54_l = dst76_l;
+            dst21_r = dst43_r;
+            dst43_r = dst65_r;
+            dst65_r = dst87_r;
+            dst21_l = dst43_l;
+            dst43_l = dst65_l;
+            dst65_l = dst87_l;
+            dst6 = dst8; /* newest filtered row is paired with the next loaded row */
+        }
+
+        src0_ptr += 8; /* move all three base pointers to the next 8-column strip */
+        dst += 8;
+        src1_ptr += 8;
+    }
+}
+
+static void hevc_hv_bi_8t_8w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter_x,
+                                 const int8_t *filter_y,
+                                 int32_t height)
+{ /* Fixed-width entry point: forwards every argument to the strip kernel with width 8. */
+    hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y,
+                                  height, 8); /* 8 >> 3 = one 8-column strip */
+}
+
+static void hevc_hv_bi_8t_12w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{ /* Width 12 is split into an 8-column call followed by a 4-column call. */
+    hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y,
+                                  height, 8); /* columns 0..7 */
+
+    hevc_hv_bi_8t_4w_msa(src0_ptr + 8, src_stride, src1_ptr + 8, src2_stride, /* all three pointers offset by 8 columns */
+                         dst + 8, dst_stride, filter_x, filter_y, height);
+}
+
+static void hevc_hv_bi_8t_16w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{ /* Fixed-width entry point: forwards every argument to the strip kernel with width 16. */
+    hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y,
+                                  height, 16); /* 16 >> 3 = two 8-column strips */
+}
+
+static void hevc_hv_bi_8t_24w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{ /* Fixed-width entry point: forwards every argument to the strip kernel with width 24. */
+    hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y,
+                                  height, 24); /* 24 >> 3 = three 8-column strips */
+}
+
+static void hevc_hv_bi_8t_32w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{ /* Fixed-width entry point: forwards every argument to the strip kernel with width 32. */
+    hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y,
+                                  height, 32); /* 32 >> 3 = four 8-column strips */
+}
+
+static void hevc_hv_bi_8t_48w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{ /* Fixed-width entry point: forwards every argument to the strip kernel with width 48. */
+    hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y,
+                                  height, 48); /* 48 >> 3 = six 8-column strips */
+}
+
+static void hevc_hv_bi_8t_64w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{ /* Fixed-width entry point: forwards every argument to the strip kernel with width 64. */
+    hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y,
+                                  height, 64); /* 64 >> 3 = eight 8-column strips */
+}
+
+static void hevc_hz_bi_4t_4x2_msa(uint8_t *src0_ptr, /* 8-bit input rows */
+                                  int32_t src_stride, /* src0_ptr row step */
+                                  int16_t *src1_ptr, /* 16-bit rows added to the filtered result */
+                                  int32_t src2_stride, /* src1_ptr row step */
+                                  uint8_t *dst, /* 8-bit output rows */
+                                  int32_t dst_stride, /* dst row step */
+                                  const int8_t *filter, /* taps applied along each row */
+                                  int32_t height) /* not read in this function */
+{ /* Straight-line kernel: two input rows in, one ST4x2_UB store out. */
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, dst0, vec0, vec1;
+    v8i16 in0, in1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; /* indices >= 16 presumably select from the other shuffle source */
+    v16i8 mask1;
+    v8i16 tmp0;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1; /* start one column to the left */
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* every halfword lane now holds 128 << 6 */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); /* presumably halfwords 0 and 1 broadcast into filt0 / filt1 */
+
+    mask1 = mask0 + 2; /* same pattern shifted by 2 bytes */
+
+    LD_SB2(src0_ptr, src_stride, src0, src1);
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0); /* low doublewords of both rows share one vector */
+    XORI_B2_128_SB(src0, src1);
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); /* both rows gathered into each shuffle result */
+    tmp0 = const_vec; /* accumulator starts from const_vec */
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0);
+
+    tmp0 = __msa_adds_s_h(tmp0, in0); /* saturating signed add of the 16-bit input */
+    tmp0 = __msa_srari_h(tmp0, 7); /* rounding arithmetic right shift by 7 */
+    tmp0 = CLIP_SH_0_255(tmp0);
+    dst0 = __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0); /* keep the even (low) byte of every halfword */
+
+    ST4x2_UB(dst0, dst, dst_stride); /* presumably writes 4 bytes to each of two rows */
+}
+
+static void hevc_hz_bi_4t_4x4_msa(uint8_t *src0_ptr, /* 8-bit input rows */
+                                  int32_t src_stride, /* src0_ptr row step */
+                                  int16_t *src1_ptr, /* 16-bit rows added to the filtered result */
+                                  int32_t src2_stride, /* src1_ptr row step */
+                                  uint8_t *dst, /* 8-bit output rows */
+                                  int32_t dst_stride, /* dst row step */
+                                  const int8_t *filter, /* taps applied along each row */
+                                  int32_t height) /* not read in this function */
+{ /* Straight-line kernel: four input rows in, one ST4x4_UB store out. */
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3, dst0, vec0, vec1;
+    v8i16 in0, in1, in2, in3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; /* indices >= 16 presumably select from the other shuffle source */
+    v16i8 mask1;
+    v8i16 tmp0, tmp1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1; /* start one column to the left */
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* every halfword lane now holds 128 << 6 */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2; /* same pattern shifted by 2 bytes */
+
+    LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+
+    ILVR_D2_SH(in1, in0, in3, in2, in0, in1); /* presumably packs rows 0+1 into in0 and rows 2+3 into in1 */
+    XORI_B4_128_SB(src0, src1, src2, src3);
+
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); /* rows 0 and 1 processed together */
+    tmp0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0);
+    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1); /* rows 2 and 3 processed together */
+    tmp1 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp1, tmp1);
+    HEVC_BI_RND_CLIP2(in0, in1, tmp0, tmp1, 7, tmp0, tmp1); /* presumably add, round by 7 bits and clip, as the 4x2 kernel spells out */
+    dst0 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); /* keep the even (low) byte of every halfword */
+
+    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void hevc_hz_bi_4t_4x8multiple_msa(uint8_t *src0_ptr, /* 8-bit input rows */
+                                          int32_t src_stride, /* src0_ptr row step, in bytes */
+                                          int16_t *src1_ptr, /* 16-bit rows added to the filtered result */
+                                          int32_t src2_stride, /* src1_ptr row step, in int16_t elements */
+                                          uint8_t *dst, /* 8-bit output rows */
+                                          int32_t dst_stride, /* dst row step, in bytes */
+                                          const int8_t *filter, /* taps applied along each row */
+                                          int32_t height) /* rows; only height >> 3 groups of eight are processed */
+{ /* Loop kernel: eight rows are loaded, filtered and stored per pass. */
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 dst0, dst1;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; /* indices >= 16 presumably select from the other shuffle source */
+    v16i8 mask1, vec0, vec1;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1; /* start one column to the left */
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* every halfword lane now holds 128 << 6 */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2; /* same pattern shifted by 2 bytes */
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) { /* one pass per group of eight rows */
+        LD_SB8(src0_ptr, src_stride,
+               src0, src1, src2, src3, src4, src5, src6, src7);
+        src0_ptr += (8 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); /* 16-bit input is read as two batches of four rows */
+        src1_ptr += (4 * src2_stride);
+        LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
+        src1_ptr += (4 * src2_stride);
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1); /* presumably two rows packed per vector: in0..in3 now cover all eight rows */
+        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); /* each shuffle handles a pair of rows */
+        tmp0 = const_vec; /* accumulators start from const_vec */
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0);
+        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp1, tmp1);
+        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
+        tmp2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp2, tmp2);
+        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp3, tmp3);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3, /* presumably add the 16-bit input, round by 7 bits and clip */
+                          tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
+
+        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
+        ST4x8_UB(dst0, dst1, dst, dst_stride); /* presumably writes 4 bytes to each of eight rows */
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_hz_bi_4t_4w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter,
+                                 int32_t height) /* selects which 4-column kernel runs */
+{ /* Dispatcher: arguments are forwarded unchanged to the kernel chosen by height. */
+    if (2 == height) {
+        hevc_hz_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter, height);
+    } else if (4 == height) {
+        hevc_hz_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter, height);
+    } else if (8 == height || 16 == height) { /* one or two passes of the 8-row loop kernel */
+        hevc_hz_bi_4t_4x8multiple_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride, filter, height);
+    } /* any other height makes no call, so dst is left unwritten */
+}
+
+static void hevc_hz_bi_4t_6w_msa(uint8_t *src0_ptr, /* 8-bit input rows */
+                                 int32_t src_stride, /* src0_ptr row step, in bytes */
+                                 int16_t *src1_ptr, /* 16-bit rows added to the filtered result */
+                                 int32_t src2_stride, /* src1_ptr row step, in int16_t elements */
+                                 uint8_t *dst, /* 8-bit output rows */
+                                 int32_t dst_stride, /* dst row step, in bytes */
+                                 const int8_t *filter, /* taps applied along each row */
+                                 int32_t height) /* rows; only height >> 2 groups of four are processed */
+{ /* Loop kernel: four rows per pass, stored with ST6x4_UB. */
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; /* pairs each byte with its right-hand neighbour */
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1; /* start one column to the left */
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* every halfword lane now holds 128 << 6 */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2; /* same pairing pattern shifted by 2 bytes */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* one pass per group of four rows */
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); /* one row per shuffle; a full 8-lane result is computed */
+        dst0 = const_vec; /* accumulators start from const_vec */
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3, /* presumably add the 16-bit input, round by 7 bits and clip */
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST6x4_UB(dst0, dst1, dst, dst_stride); /* presumably writes 6 bytes to each of four rows */
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_bi_4t_8x2_msa(uint8_t *src0_ptr, /* 8-bit input rows */
+                                  int32_t src_stride, /* src0_ptr row step */
+                                  int16_t *src1_ptr, /* 16-bit rows added to the filtered result */
+                                  int32_t src2_stride, /* src1_ptr row step */
+                                  uint8_t *dst, /* 8-bit output rows */
+                                  int32_t dst_stride, /* dst row step */
+                                  const int8_t *filter, /* taps applied along each row */
+                                  int32_t height) /* not read in this function */
+{ /* Straight-line kernel: two input rows in, one ST8x2_UB store out. */
+    v8i16 filt0, filt1;
+    v16i8 src0, src1;
+    v8i16 in0, in1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; /* pairs each byte with its right-hand neighbour */
+    v16i8 mask1, vec0, vec1;
+    v8i16 dst0, dst1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1; /* start one column to the left */
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* every halfword lane now holds 128 << 6 */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2; /* same pairing pattern shifted by 2 bytes */
+
+    LD_SB2(src0_ptr, src_stride, src0, src1);
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    XORI_B2_128_SB(src0, src1);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); /* row 0 */
+    dst0 = const_vec; /* accumulators start from const_vec */
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1); /* row 1 */
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+    HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1); /* presumably add the 16-bit input, round by 7 bits and clip */
+
+    dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); /* low bytes of both rows packed into one vector */
+    ST8x2_UB(dst0, dst, dst_stride); /* presumably writes 8 bytes to each of two rows */
+}
+
+static void hevc_hz_bi_4t_8x6_msa(uint8_t *src0_ptr, /* 8-bit input rows */
+                                  int32_t src_stride, /* src0_ptr row step */
+                                  int16_t *src1_ptr, /* 16-bit rows added to the filtered result */
+                                  int32_t src2_stride, /* src1_ptr row step, in int16_t elements */
+                                  uint8_t *dst, /* 8-bit output rows */
+                                  int32_t dst_stride, /* dst row step, in bytes */
+                                  const int8_t *filter, /* taps applied along each row */
+                                  int32_t height) /* not read in this function */
+{ /* Straight-line kernel: six input rows in, stored as a 4-row group followed by a 2-row group. */
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; /* pairs each byte with its right-hand neighbour */
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1; /* start one column to the left */
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* every halfword lane now holds 128 << 6 */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2; /* same pairing pattern shifted by 2 bytes */
+
+    LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
+    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); /* 16-bit rows 0..3 */
+    src1_ptr += (4 * src2_stride);
+    LD_SH2(src1_ptr, src2_stride, in4, in5); /* 16-bit rows 4..5 */
+    XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); /* one row per shuffle / accumulate pair */
+    dst0 = const_vec; /* accumulators start from const_vec */
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+    dst5 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+    HEVC_BI_RND_CLIP4(in0, in1, in2, in3, /* presumably add the 16-bit input, round by 7 bits and clip */
+                      dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+    HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
+
+    PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+    dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4); /* low bytes of rows 4 and 5 packed into one vector */
+    ST8x4_UB(dst0, dst1, dst, dst_stride); /* rows 0..3 */
+    dst += (4 * dst_stride);
+    ST8x2_UB(dst2, dst, dst_stride); /* rows 4..5 */
+}
+
+static void hevc_hz_bi_4t_8x4multiple_msa(uint8_t *src0_ptr, /* 8-bit input rows */
+                                          int32_t src_stride, /* src0_ptr row step, in bytes */
+                                          int16_t *src1_ptr, /* 16-bit rows added to the filtered result */
+                                          int32_t src2_stride, /* src1_ptr row step, in int16_t elements */
+                                          uint8_t *dst, /* 8-bit output rows */
+                                          int32_t dst_stride, /* dst row step, in bytes */
+                                          const int8_t *filter, /* taps applied along each row */
+                                          int32_t height) /* rows; only height >> 2 groups of four are processed */
+{ /* Loop kernel: four rows per pass, stored with ST8x4_UB. */
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; /* pairs each byte with its right-hand neighbour */
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1; /* start one column to the left */
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* every halfword lane now holds 128 << 6 */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2; /* same pairing pattern shifted by 2 bytes */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* one pass per group of four rows */
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); /* one row per shuffle / accumulate pair */
+        dst0 = const_vec; /* accumulators start from const_vec */
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3, /* presumably add the 16-bit input, round by 7 bits and clip */
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST8x4_UB(dst0, dst1, dst, dst_stride); /* presumably writes 8 bytes to each of four rows */
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_bi_4t_8w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter,
+                                 int32_t height) /* selects which 8-column kernel runs */
+{ /* Dispatcher: arguments are forwarded unchanged to the kernel chosen by height. */
+    if (2 == height) {
+        hevc_hz_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter, height);
+    } else if (6 == height) {
+        hevc_hz_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter, height);
+    } else if (0 == (height % 4)) { /* every multiple of four goes to the 4-row loop kernel */
+        hevc_hz_bi_4t_8x4multiple_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride, filter, height);
+    } /* any other height makes no call, so dst is left unwritten */
+}
+
+static void hevc_hz_bi_4t_12w_msa(uint8_t *src0_ptr, /* 8-bit input rows */
+                                  int32_t src_stride, /* src0_ptr row step, in bytes */
+                                  int16_t *src1_ptr, /* 16-bit rows added to the filtered result */
+                                  int32_t src2_stride, /* src1_ptr row step, in int16_t elements */
+                                  uint8_t *dst, /* 8-bit output rows */
+                                  int32_t dst_stride, /* dst row step, in bytes */
+                                  const int8_t *filter, /* taps applied along each row */
+                                  int32_t height) /* rows; only height >> 2 groups of four are processed */
+{ /* Loop kernel: four rows per pass, stored with ST12x4_UB. */
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; /* pairs each byte with its right-hand neighbour */
+    v16i8 mask2 = { /* same pairing starting at byte 8; indices >= 24 presumably select from the other shuffle source */
+        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+    };
+    v16i8 mask1, mask3;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1; /* start one column to the left */
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* every halfword lane now holds 128 << 6 */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2; /* both base masks get a copy shifted by 2 bytes */
+    mask3 = mask2 + 2;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* one pass per group of four rows */
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3); /* 16-bit input for columns 0..7 */
+        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7); /* 16-bit input starting at column 8 */
+        src1_ptr += (4 * src2_stride);
+
+        ILVR_D2_SH(in5, in4, in7, in6, in4, in5); /* presumably two rows of the column-8 input packed per vector */
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); /* left 8 columns: one row per shuffle */
+        dst0 = const_vec; /* accumulators start from const_vec */
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); /* remaining columns: rows 0 and 1 share one shuffle */
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1); /* remaining columns: rows 2 and 3 share one shuffle */
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3, /* presumably add the 16-bit input, round by 7 bits and clip */
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+        HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4); /* low bytes of the column-8 results packed into one vector */
+        ST12x4_UB(dst0, dst1, dst2, dst, dst_stride); /* presumably writes 12 bytes to each of four rows */
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_bi_4t_16w_msa(uint8_t *src0_ptr, /* 8-bit input rows */
+                                  int32_t src_stride, /* src0_ptr row step, in bytes */
+                                  int16_t *src1_ptr, /* 16-bit rows added to the filtered result */
+                                  int32_t src2_stride, /* src1_ptr row step, in int16_t elements */
+                                  uint8_t *dst, /* 8-bit output rows */
+                                  int32_t dst_stride, /* dst row step, in bytes */
+                                  const int8_t *filter, /* taps applied along each row */
+                                  int32_t height) /* rows; only height >> 2 groups of four are processed */
+{ /* Loop kernel: four rows per pass, each row handled as two 8-column halves. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; /* pairs each byte with its right-hand neighbour */
+    v16i8 mask1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16i8 vec0, vec1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1; /* start one column to the left */
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* every halfword lane now holds 128 << 6 */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2; /* same pairing pattern shifted by 2 bytes */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* one pass per group of four rows */
+        LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6); /* even-numbered srcN: loads from src0_ptr */
+        LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7); /* odd-numbered srcN: loads from src0_ptr + 8 */
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6); /* inN follows the same even/odd split */
+        LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
+        src1_ptr += (4 * src2_stride);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); /* one half-row per shuffle / accumulate pair */
+        dst0 = const_vec; /* accumulators start from const_vec */
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst6 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+        dst7 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3, /* presumably add the 16-bit input, round by 7 bits and clip */
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+        HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
+                          dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);
+
+        PCKEV_B4_SH(dst1, dst0, dst3, dst2, /* presumably both halves of a row packed into one 16-byte vector */
+                    dst5, dst4, dst7, dst6, dst0, dst1, dst2, dst3);
+        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride); /* presumably one full-vector store per row */
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_bi_4t_24w_msa(uint8_t *src0_ptr,     /* 8-bit input rows */
+                                  int32_t src_stride,    /* src0_ptr row step, in bytes */
+                                  int16_t *src1_ptr,     /* 16-bit input rows */
+                                  int32_t src2_stride,   /* src1_ptr row step, in int16_t units */
+                                  uint8_t *dst,          /* 8-bit output rows */
+                                  int32_t dst_stride,    /* dst row step, in bytes */
+                                  const int8_t *filter,  /* filter taps, fetched with LD_SH */
+                                  int32_t height)        /* row count; four rows per loop pass */
+{   /* Horizontal 4-tap filter of src0_ptr merged with src1_ptr; 24 output columns per row. */
+    int16_t *src1_ptr_tmp;    /* src1_ptr cursor for columns 16..23 */
+    uint8_t *dst_tmp;         /* dst cursor for columns 16..23 */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1, mask2, mask3;    /* offset copies of mask0, whose entries pair bytes (i, i+1) */
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1;    /* begin reading one byte before the first output column */
+
+    const_vec = __msa_ldi_h(128);    /* 128 in every 16-bit lane ... */
+    const_vec <<= 6;                 /* ... becomes 128 << 6 = 8192; seeds each accumulator */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);    /* filt0/filt1 from halfword lanes 0, 1 */
+
+    mask1 = mask0 + 2;     /* byte pairs (i+2, i+3) */
+    mask2 = mask0 + 8;     /* byte pairs (i+8, i+9); indices run up to 16 */
+    mask3 = mask0 + 10;    /* byte pairs (i+10, i+11); indices run up to 18 */
+
+    dst_tmp = dst + 16;
+    src1_ptr_tmp = src1_ptr + 16;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {    /* height & 3 trailing rows are skipped */
+        LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);         /* 4 rows, from byte 0 */
+        LD_SB4(src0_ptr + 16, src_stride, src1, src3, src5, src7);    /* 4 rows, from byte 16 */
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);        /* src1 columns 0..7 */
+        LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);    /* src1 columns 8..15 */
+        src1_ptr += (4 * src2_stride);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);    /* presumably ^128 */
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;    /* row 0, columns 0..7 */
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);    /* presumably dst0 += taps * bytes */
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+        dst1 = const_vec;    /* row 0, columns 8..15 (bytes drawn from src0 and src1) */
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;    /* row 1, columns 0..7 */
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
+        dst3 = const_vec;    /* row 1, columns 8..15 */
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;    /* row 2, columns 0..7 */
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+        VSHF_B2_SB(src4, src5, src4, src5, mask2, mask3, vec0, vec1);
+        dst5 = const_vec;    /* row 2, columns 8..15 */
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst6 = const_vec;    /* row 3, columns 0..7 */
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+        VSHF_B2_SB(src6, src7, src6, src7, mask2, mask3, vec0, vec1);
+        dst7 = const_vec;    /* row 3, columns 8..15 */
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,    /* presumably: + inN, round >> 7, clip */
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+        HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
+                          dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);
+
+        PCKEV_B4_SH(dst1, dst0, dst3, dst2,    /* presumably narrows to one byte vector per row */
+                    dst5, dst4, dst7, dst6, dst0, dst1, dst2, dst3);
+        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);    /* columns 0..15 of the four rows */
+        dst += (4 * dst_stride);
+
+        LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);    /* src1 columns 16..23 */
+        src1_ptr_tmp += (4 * src2_stride);
+
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;    /* row 0, columns 16..23 */
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;    /* row 1, columns 16..23 */
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;    /* row 2, columns 16..23 */
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;    /* row 3, columns 16..23 */
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST8x4_UB(dst0, dst1, dst_tmp, dst_stride);    /* presumably 8 bytes to each of 4 rows */
+        dst_tmp += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_bi_4t_32w_msa(uint8_t *src0_ptr,     /* 8-bit input rows */
+                                  int32_t src_stride,    /* src0_ptr row step, in bytes */
+                                  int16_t *src1_ptr,     /* 16-bit input rows */
+                                  int32_t src2_stride,   /* src1_ptr row step, in int16_t units */
+                                  uint8_t *dst,          /* 8-bit output rows */
+                                  int32_t dst_stride,    /* dst row step, in bytes */
+                                  const int8_t *filter,  /* filter taps, fetched with LD_SH */
+                                  int32_t height)        /* row count; two rows per loop pass */
+{   /* Horizontal 4-tap filter of src0_ptr merged with src1_ptr; 32 output columns per row. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1, mask2, mask3;    /* offset copies of mask0, whose entries pair bytes (i, i+1) */
+    v8i16 dst0, dst1, dst2, dst3;
+    v16i8 vec0, vec1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= 1;    /* begin reading one byte before the first output column */
+
+    const_vec = __msa_ldi_h(128);    /* 128 in every 16-bit lane ... */
+    const_vec <<= 6;                 /* ... becomes 128 << 6 = 8192; seeds each accumulator */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);    /* filt0/filt1 from halfword lanes 0, 1 */
+
+    mask1 = mask0 + 2;     /* byte pairs (i+2, i+3) */
+    mask2 = mask0 + 8;     /* byte pairs (i+8, i+9); indices run up to 16 */
+    mask3 = mask0 + 10;    /* byte pairs (i+10, i+11); indices run up to 18 */
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {    /* an odd trailing row is skipped */
+        LD_SB2(src0_ptr, 16, src0, src1);    /* same row: vectors at byte 0 and byte 16 */
+        src2 = LD_SB(src0_ptr + 24);         /* vector at byte 24; overlaps upper half of src1 */
+        src0_ptr += src_stride;
+        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);    /* same row: src1 columns 0, 8, 16, 24 */
+        src1_ptr += src2_stride;
+        XORI_B3_128_SB(src0, src1, src2);    /* presumably XORs every byte with 128 */
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;    /* columns 0..7 */
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);    /* presumably dst0 += taps * bytes */
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+        dst1 = const_vec;    /* columns 8..15 (bytes drawn from src0 and src1) */
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;    /* columns 16..23 */
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;    /* columns 24..31 */
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,    /* presumably: + inN, round >> 7, clip */
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST_SH2(dst0, dst1, dst, 16);    /* two vectors, 16 bytes apart, in the same row */
+        dst += dst_stride;
+
+        LD_SB2(src0_ptr, 16, src0, src1);    /* second row of the pair: same steps as above */
+        src2 = LD_SB(src0_ptr + 24);
+        src0_ptr += src_stride;
+        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
+        src1_ptr += src2_stride;
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;    /* columns 0..7 */
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+        dst1 = const_vec;    /* columns 8..15 */
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;    /* columns 16..23 */
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;    /* columns 24..31 */
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
+
+        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
+        ST_SH2(dst0, dst1, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_bi_4t_4x2_msa(uint8_t *src0_ptr,     /* 8-bit input rows */
+                                  int32_t src_stride,    /* src0_ptr row step, in bytes */
+                                  int16_t *src1_ptr,     /* 16-bit input rows */
+                                  int32_t src2_stride,   /* src1_ptr row step (given to LD_SH2) */
+                                  uint8_t *dst,          /* 8-bit output rows */
+                                  int32_t dst_stride,    /* dst row step (given to ST4x2_UB) */
+                                  const int8_t *filter,  /* filter taps, fetched with LD_SH */
+                                  int32_t height)        /* not read by this body */
+{   /* Vertical 4-tap filter of src0_ptr merged with src1_ptr; fixed 4-column, 2-row block. */
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 in0, in1;
+    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
+    v8i16 dst10;    /* accumulator holding both output rows */
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= src_stride;    /* begin reading one row above src0_ptr */
+
+    const_vec = __msa_ldi_h(128);    /* 128 in every 16-bit lane ... */
+    const_vec <<= 6;                 /* ... becomes 128 << 6 = 8192; seeds the accumulator */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);    /* filt0/filt1 from halfword lanes 0, 1 */
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);    /* first three source rows */
+    src0_ptr += (3 * src_stride);
+
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);    /* presumably interleaves row pairs */
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);    /* low halves: 10 then 21 */
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);    /* flip bit 7 of every byte */
+
+    LD_SB2(src0_ptr, src_stride, src3, src4);    /* remaining two source rows */
+    LD_SH2(src1_ptr, src2_stride, in0, in1);     /* two rows of the 16-bit input */
+    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);    /* low 4 halfwords of each row */
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);    /* low halves: 32 then 43 */
+    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);    /* flip bit 7 of every byte */
+
+    dst10 = const_vec;
+    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);    /* presumably += taps * bytes */
+    dst10 = __msa_adds_s_h(dst10, in0);    /* saturating signed add of the 16-bit input */
+    dst10 = __msa_srari_h(dst10, 7);       /* rounding arithmetic shift right by 7 */
+    dst10 = CLIP_SH_0_255(dst10);          /* presumably clamps each lane to 0..255 */
+
+    dst10 = (v8i16) __msa_pckev_b((v16i8) dst10, (v16i8) dst10);    /* keep even-indexed bytes */
+    ST4x2_UB(dst10, dst, dst_stride);    /* presumably 4 bytes to each of 2 rows */
+}
+
+static void hevc_vt_bi_4t_4x4_msa(uint8_t *src0_ptr,     /* 8-bit input rows */
+                                  int32_t src_stride,    /* src0_ptr row step, in bytes */
+                                  int16_t *src1_ptr,     /* 16-bit input rows */
+                                  int32_t src2_stride,   /* src1_ptr row step (given to LD_SH4) */
+                                  uint8_t *dst,          /* 8-bit output rows */
+                                  int32_t dst_stride,    /* dst row step (given to ST4x4_UB) */
+                                  const int8_t *filter,  /* filter taps, fetched with LD_SH */
+                                  int32_t height)        /* not read by this body */
+{   /* Vertical 4-tap filter of src0_ptr merged with src1_ptr; fixed 4-column, 4-row block. */
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
+    v16i8 src2110, src4332, src6554;
+    v8i16 dst10, dst32;    /* each accumulator holds two output rows */
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= src_stride;    /* begin reading one row above src0_ptr */
+
+    const_vec = __msa_ldi_h(128);    /* 128 in every 16-bit lane ... */
+    const_vec <<= 6;                 /* ... becomes 128 << 6 = 8192; seeds each accumulator */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);    /* filt0/filt1 from halfword lanes 0, 1 */
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);    /* first three source rows */
+    src0_ptr += (3 * src_stride);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);    /* presumably interleaves row pairs */
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);    /* low halves: 10 then 21 */
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);    /* flip bit 7 of every byte */
+
+    LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);    /* remaining four source rows */
+    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);       /* four rows of the 16-bit input */
+    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);    /* presumably packs rows 0+1 -> in0, 2+3 -> in1 */
+    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+               src32_r, src43_r, src54_r, src65_r);
+    ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
+    XORI_B2_128_SB(src4332, src6554);    /* presumably XORs every byte with 128 */
+
+    dst10 = const_vec;    /* output rows 0 and 1 */
+    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);    /* presumably += taps * bytes */
+    dst32 = const_vec;    /* output rows 2 and 3 */
+    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
+    HEVC_BI_RND_CLIP2(in0, in1, dst10, dst32, 7, dst10, dst32);    /* presumably +in, >>7, clip */
+
+    dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);    /* keep even-indexed bytes */
+    ST4x4_UB(dst10, dst10, 0, 1, 2, 3, dst, dst_stride);    /* presumably 4 bytes to each of 4 rows */
+}
+
+static void hevc_vt_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,     /* 8-bit input rows */
+                                          int32_t src_stride,    /* src0_ptr row step, bytes */
+                                          int16_t *src1_ptr,     /* 16-bit input rows */
+                                          int32_t src2_stride,   /* src1_ptr row step, int16_t */
+                                          uint8_t *dst,          /* 8-bit output rows */
+                                          int32_t dst_stride,    /* dst row step, bytes */
+                                          const int8_t *filter,  /* taps, fetched with LD_SH */
+                                          int32_t height)        /* rows; eight per loop pass */
+{   /* Vertical 4-tap filter of src0_ptr merged with src1_ptr; 4 columns, 8 rows per pass. */
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src6, src7, src8, src9;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v16i8 src2110, src4332, src6554, src8776;
+    v8i16 dst10, dst32, dst54, dst76;    /* each accumulator holds two output rows */
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= src_stride;    /* begin reading one row above src0_ptr */
+
+    const_vec = __msa_ldi_h(128);    /* 128 in every 16-bit lane ... */
+    const_vec <<= 6;                 /* ... becomes 128 << 6 = 8192; seeds each accumulator */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);    /* filt0/filt1 from halfword lanes 0, 1 */
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);    /* three rows loaded before the loop */
+    src0_ptr += (3 * src_stride);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);    /* presumably interleaves row pairs */
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);    /* low halves: 10 then 21 */
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);    /* flip bit 7 of every byte */
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {    /* height & 7 trailing rows are skipped */
+        LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);    /* next six rows */
+        src0_ptr += (6 * src_stride);
+        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
+        src1_ptr += (8 * src2_stride);
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);    /* presumably rows 0+1 -> in0, 2+3 -> in1 */
+        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);    /* presumably rows 4+5 -> in2, 6+7 -> in3 */
+        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+                   src32_r, src43_r, src54_r, src65_r);
+        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+        ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
+                   src4332, src6554, src8776);
+        XORI_B3_128_SB(src4332, src6554, src8776);    /* presumably XORs every byte with 128 */
+
+        dst10 = const_vec;    /* output rows 0 and 1 */
+        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
+        dst32 = const_vec;    /* output rows 2 and 3 */
+        DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
+        dst54 = const_vec;    /* output rows 4 and 5 */
+        DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
+
+        LD_SB2(src0_ptr, src_stride, src9, src2);    /* two more rows; src2 is reused next pass */
+        src0_ptr += (2 * src_stride);
+        ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
+        src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);    /* carried to next pass */
+        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);    /* flip bit 7 of every byte */
+        dst76 = const_vec;    /* output rows 6 and 7 */
+        DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,    /* presumably: + inN, round >> 7, clip */
+                          dst10, dst32, dst54, dst76, 7,
+                          dst10, dst32, dst54, dst76);
+
+        PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
+        ST4x8_UB(dst10, dst54, dst, dst_stride);    /* presumably 4 bytes to each of 8 rows */
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_vt_bi_4t_4w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter,
+                                 int32_t height)
+{   /* 4-column vertical 4-tap bi filter: forward all arguments to the kernel chosen by height. */
+    if (height != 2 && height != 4) {    /* every other height: loop-based kernel */
+        hevc_vt_bi_4t_4x8multiple_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride, filter, height);
+    } else if (height == 4) {            /* fixed 4x4 kernel */
+        hevc_vt_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter, height);
+    } else {                             /* height == 2: fixed 4x2 kernel */
+        hevc_vt_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter, height);
+    }
+}
+
+static void hevc_vt_bi_4t_6w_msa(uint8_t *src0_ptr,     /* 8-bit input rows */
+                                 int32_t src_stride,    /* src0_ptr row step, in bytes */
+                                 int16_t *src1_ptr,     /* 16-bit input rows */
+                                 int32_t src2_stride,   /* src1_ptr row step, in int16_t units */
+                                 uint8_t *dst,          /* 8-bit output rows */
+                                 int32_t dst_stride,    /* dst row step, in bytes */
+                                 const int8_t *filter,  /* filter taps, fetched with LD_SH */
+                                 int32_t height)        /* row count; four rows per loop pass */
+{   /* Vertical 4-tap filter of src0_ptr merged with src1_ptr; stores through ST6x4_UB. */
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;    /* one accumulator per output row */
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= src_stride;    /* begin reading one row above src0_ptr */
+
+    const_vec = __msa_ldi_h(128);    /* 128 in every 16-bit lane ... */
+    const_vec <<= 6;                 /* ... becomes 128 << 6 = 8192; seeds each accumulator */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);    /* filt0/filt1 from halfword lanes 0, 1 */
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);    /* three rows loaded before the loop */
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);    /* presumably XORs every byte with 128 */
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);    /* presumably interleaves row pairs */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {    /* height & 3 trailing rows are skipped */
+        LD_SB2(src0_ptr, src_stride, src3, src4);    /* next two source rows */
+        src0_ptr += (2 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);    /* four rows of 16-bit input */
+        src1_ptr += (4 * src2_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+
+        dst0_r = const_vec;    /* output row 0 */
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst1_r = const_vec;    /* output row 1 */
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+
+        LD_SB2(src0_ptr, src_stride, src5, src2);    /* two more rows; src2 is reused next pass */
+        src0_ptr += (2 * src_stride);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);    /* also carried to next pass */
+
+        dst2_r = const_vec;    /* output row 2 */
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
+        dst3_r = const_vec;    /* output row 3 */
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,    /* presumably: + inN, round >> 7, clip */
+                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
+                          dst0_r, dst1_r, dst2_r, dst3_r);
+
+        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
+        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);    /* presumably 6 bytes to each of 4 rows */
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_vt_bi_4t_8x2_msa(uint8_t *src0_ptr,     /* 8-bit input rows */
+                                  int32_t src_stride,    /* src0_ptr row step, in bytes */
+                                  int16_t *src1_ptr,     /* 16-bit input rows */
+                                  int32_t src2_stride,   /* src1_ptr row step (given to LD_SH2) */
+                                  uint8_t *dst,          /* 8-bit output rows */
+                                  int32_t dst_stride,    /* dst row step (given to ST8x2_UB) */
+                                  const int8_t *filter,  /* filter taps, fetched with LD_SH */
+                                  int32_t height)        /* not read by this body */
+{   /* Vertical 4-tap filter of src0_ptr merged with src1_ptr; fixed 8-column, 2-row block. */
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 in0, in1, dst0_r, dst1_r;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= src_stride;    /* begin reading one row above src0_ptr */
+
+    const_vec = __msa_ldi_h(128);    /* 128 in every 16-bit lane ... */
+    const_vec <<= 6;                 /* ... becomes 128 << 6 = 8192; seeds each accumulator */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);    /* filt0/filt1 from halfword lanes 0, 1 */
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);    /* first three source rows */
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);    /* presumably XORs every byte with 128 */
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);    /* presumably interleaves row pairs */
+
+    LD_SB2(src0_ptr, src_stride, src3, src4);    /* remaining two source rows */
+    LD_SH2(src1_ptr, src2_stride, in0, in1);     /* two rows of the 16-bit input */
+    XORI_B2_128_SB(src3, src4);
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+
+    dst0_r = const_vec;    /* output row 0 */
+    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);    /* presumably += taps * bytes */
+    dst1_r = const_vec;    /* output row 1 */
+    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+
+    HEVC_BI_RND_CLIP2(in0, in1, dst0_r, dst1_r, 7, dst0_r, dst1_r);    /* presumably +in, >>7, clip */
+    dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);    /* keep even-indexed bytes */
+
+    ST8x2_UB(dst0_r, dst, dst_stride);    /* presumably 8 bytes to each of 2 rows */
+}
+
+static void hevc_vt_bi_4t_8x6_msa(uint8_t *src0_ptr,     /* 8-bit input rows */
+                                  int32_t src_stride,    /* src0_ptr row step, in bytes */
+                                  int16_t *src1_ptr,     /* 16-bit input rows */
+                                  int32_t src2_stride,   /* src1_ptr row step (given to LD_SH6) */
+                                  uint8_t *dst,          /* 8-bit output rows */
+                                  int32_t dst_stride,    /* dst row step, in bytes */
+                                  const int8_t *filter,  /* filter taps, fetched with LD_SH */
+                                  int32_t height)        /* not read by this body */
+{   /* Vertical 4-tap filter of src0_ptr merged with src1_ptr; fixed 8-column, 6-row block. */
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v16i8 src10_r, src32_r, src54_r, src76_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;    /* one accumulator per output row */
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= src_stride;    /* begin reading one row above src0_ptr */
+
+    const_vec = __msa_ldi_h(128);    /* 128 in every 16-bit lane ... */
+    const_vec <<= 6;                 /* ... becomes 128 << 6 = 8192; seeds each accumulator */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);    /* filt0/filt1 from halfword lanes 0, 1 */
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);    /* first three source rows */
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);    /* presumably XORs every byte with 128 */
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);    /* presumably interleaves row pairs */
+
+    LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);    /* remaining six rows */
+    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);    /* six rows of 16-bit input */
+    XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
+    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+               src32_r, src43_r, src54_r, src65_r);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+
+    dst0_r = const_vec;    /* output row 0 */
+    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);    /* presumably += taps * bytes */
+    dst1_r = const_vec;    /* output row 1 */
+    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+    dst2_r = const_vec;    /* output row 2 */
+    DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
+    dst3_r = const_vec;    /* output row 3 */
+    DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
+    dst4_r = const_vec;    /* output row 4 */
+    DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, dst4_r, dst4_r);
+    dst5_r = const_vec;    /* output row 5 */
+    DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, dst5_r, dst5_r);
+    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,    /* presumably: + inN, round >> 7, clip */
+                      dst0_r, dst1_r, dst2_r, dst3_r, 7,
+                      dst0_r, dst1_r, dst2_r, dst3_r);
+    HEVC_BI_RND_CLIP2(in4, in5, dst4_r, dst5_r, 7, dst4_r, dst5_r);
+
+    PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
+    dst2_r = (v8i16) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst4_r);    /* keep even-indexed bytes */
+    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);    /* presumably rows 0..3, 8 bytes each */
+    dst += (4 * dst_stride);
+    ST8x2_UB(dst2_r, dst, dst_stride);            /* presumably rows 4..5, 8 bytes each */
+}
+
+static void hevc_vt_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,     /* 8-bit input rows */
+                                          int32_t src_stride,    /* src0_ptr row step, bytes */
+                                          int16_t *src1_ptr,     /* 16-bit input rows */
+                                          int32_t src2_stride,   /* src1_ptr row step, int16_t */
+                                          uint8_t *dst,          /* 8-bit output rows */
+                                          int32_t dst_stride,    /* dst row step, bytes */
+                                          const int8_t *filter,  /* taps, fetched with LD_SH */
+                                          int32_t height)        /* rows; four per loop pass */
+{   /* Vertical 4-tap filter of src0_ptr merged with src1_ptr; 8 columns, 4 rows per pass. */
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;    /* one accumulator per output row */
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= src_stride;    /* begin reading one row above src0_ptr */
+
+    const_vec = __msa_ldi_h(128);    /* 128 in every 16-bit lane ... */
+    const_vec <<= 6;                 /* ... becomes 128 << 6 = 8192; seeds each accumulator */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);    /* filt0/filt1 from halfword lanes 0, 1 */
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);    /* three rows loaded before the loop */
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);    /* presumably XORs every byte with 128 */
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);    /* presumably interleaves row pairs */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {    /* height & 3 trailing rows are skipped */
+        LD_SB2(src0_ptr, src_stride, src3, src4);    /* next two source rows */
+        src0_ptr += (2 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);    /* four rows of 16-bit input */
+        src1_ptr += (4 * src2_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+
+        dst0_r = const_vec;    /* output row 0 */
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst1_r = const_vec;    /* output row 1 */
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+
+        LD_SB2(src0_ptr, src_stride, src5, src2);    /* two more rows; src2 is reused next pass */
+        src0_ptr += (2 * src_stride);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);    /* also carried to next pass */
+
+        dst2_r = const_vec;    /* output row 2 */
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
+        dst3_r = const_vec;    /* output row 3 */
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,    /* presumably: + inN, round >> 7, clip */
+                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
+                          dst0_r, dst1_r, dst2_r, dst3_r);
+
+        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
+        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);    /* presumably 8 bytes to each of 4 rows */
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_vt_bi_4t_8w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter,
+                                 int32_t height)
+{   /* 8-column vertical 4-tap bi filter: forward all arguments to the kernel chosen by height. */
+    if (height != 2 && height != 6) {    /* every other height: loop-based kernel */
+        hevc_vt_bi_4t_8x4multiple_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride, filter, height);
+    } else if (height == 6) {            /* fixed 8x6 kernel */
+        hevc_vt_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter, height);
+    } else {                             /* height == 2: fixed 8x2 kernel */
+        hevc_vt_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter, height);
+    }
+}
+
+static void hevc_vt_bi_4t_12w_msa(uint8_t *src0_ptr,     /* 8-bit input rows */
+                                  int32_t src_stride,    /* src0_ptr row step, in bytes */
+                                  int16_t *src1_ptr,     /* 16-bit input rows */
+                                  int32_t src2_stride,   /* src1_ptr row step, in int16_t units */
+                                  uint8_t *dst,          /* 8-bit output rows */
+                                  int32_t dst_stride,    /* dst row step, in bytes */
+                                  const int8_t *filter,  /* filter taps, fetched with LD_SH */
+                                  int32_t height)        /* row count; four rows per loop pass */
+{   /* Vertical 4-tap filter of src0_ptr merged with src1_ptr; stores through ST12x4_UB. */
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;    /* one accumulator per row, built from the _r vectors */
+    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
+    v16i8 src2110, src4332;    /* low halves of two _l vectors packed together */
+    v8i16 dst0_l, dst1_l, filt0, filt1;    /* dst*_l: two rows each, built from the packed _l data */
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= (1 * src_stride);    /* begin reading one row above src0_ptr */
+
+    const_vec = __msa_ldi_h(128);    /* 128 in every 16-bit lane ... */
+    const_vec <<= 6;                 /* ... becomes 128 << 6 = 8192; seeds each accumulator */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);    /* filt0/filt1 from halfword lanes 0, 1 */
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);    /* three rows loaded before the loop */
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);    /* presumably XORs every byte with 128 */
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);    /* presumably low-half interleave */
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);    /* presumably high-half interleave */
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);    /* low halves: 10 then 21 */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {    /* height & 3 trailing rows are skipped */
+        LD_SB2(src0_ptr, src_stride, src3, src4);    /* next two source rows */
+        src0_ptr += (2 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);          /* src1 from column 0 */
+        LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);    /* src1 from column 8 */
+        src1_ptr += (4 * src2_stride);
+        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);    /* presumably rows 0+1 -> in4, 2+3 -> in5 */
+        XORI_B2_128_SB(src3, src4);
+
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);    /* low halves: 32, 43 */
+
+        dst0_r = const_vec;    /* output row 0, from the _r vectors */
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst1_r = const_vec;    /* output row 1, from the _r vectors */
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+        dst0_l = const_vec;    /* output rows 0 and 1, from the packed _l data */
+        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);
+
+        LD_SB2(src0_ptr, src_stride, src5, src2);    /* two more rows; src2 is reused next pass */
+        src0_ptr += (2 * src_stride);
+        XORI_B2_128_SB(src5, src2);
+
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);    /* also carried to next pass */
+        ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
+        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);    /* carried to next pass */
+
+        dst2_r = const_vec;    /* output row 2, from the _r vectors */
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
+        dst3_r = const_vec;    /* output row 3, from the _r vectors */
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
+        dst1_l = const_vec;    /* output rows 2 and 3, from the packed _l data */
+        DPADD_SB2_SH(src4332, src2110, filt0, filt1, dst1_l, dst1_l);
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,    /* presumably: + inN, round >> 7, clip */
+                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
+                          dst0_r, dst1_r, dst2_r, dst3_r);
+        HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l);
+
+        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
+        dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);    /* keep even bytes */
+        ST12x4_UB(dst0_r, dst1_r, dst0_l, dst, dst_stride);    /* presumably 12 bytes x 4 rows */
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_vt_bi_4t_16w_msa(uint8_t *src0_ptr,     /* 8-bit input rows */
+                                  int32_t src_stride,    /* src0_ptr row step, in bytes */
+                                  int16_t *src1_ptr,     /* 16-bit input rows */
+                                  int32_t src2_stride,   /* src1_ptr row step, in int16_t units */
+                                  uint8_t *dst,          /* 8-bit output rows */
+                                  int32_t dst_stride,    /* dst row step, in bytes */
+                                  const int8_t *filter,  /* filter taps, fetched with LD_SH */
+                                  int32_t height)        /* row count; four rows per loop pass */
+{   /* Vertical 4-tap filter of src0_ptr merged with src1_ptr; one 16-byte vector per output row. */
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src21_r, src43_r;    /* presumably low-half row interleaves */
+    v16i8 src10_l, src32_l, src21_l, src43_l;    /* presumably high-half row interleaves */
+    v8i16 dst0_r, dst1_r, dst0_l, dst1_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src0_ptr -= src_stride;    /* begin reading one row above src0_ptr */
+
+    const_vec = __msa_ldi_h(128);    /* 128 in every 16-bit lane ... */
+    const_vec <<= 6;                 /* ... becomes 128 << 6 = 8192; seeds each accumulator */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);    /* filt0/filt1 from halfword lanes 0, 1 */
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);    /* three rows loaded before the loop */
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);    /* presumably XORs every byte with 128 */
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {    /* height & 3 trailing rows are skipped */
+        LD_SB2(src0_ptr, src_stride, src3, src4);    /* next two source rows */
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);          /* src1 from column 0, two rows */
+        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);    /* src1 from column 8, two rows */
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+
+        dst0_r = const_vec;    /* output row 0, from the _r vectors */
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst1_r = const_vec;    /* output row 1, from the _r vectors */
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+        dst0_l = const_vec;    /* output row 0, from the _l vectors */
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_l = const_vec;    /* output row 1, from the _l vectors */
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,    /* presumably: + inN, round >> 7, clip */
+                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
+                          dst0_r, dst1_r, dst0_l, dst1_l);
+
+        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);    /* one vector per row */
+        ST_SH2(dst0_r, dst1_r, dst, dst_stride);    /* two output rows */
+        dst += (2 * dst_stride);
+
+        LD_SB2(src0_ptr, src_stride, src5, src2);    /* two more rows; src2 is reused next pass */
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);    /* also carried to next pass */
+        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);    /* also carried to next pass */
+
+        dst0_r = const_vec;    /* output row 2, from the _r vectors */
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;    /* output row 2, from the _l vectors */
+        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;    /* output row 3, from the _r vectors */
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;    /* output row 3, from the _l vectors */
+        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
+                          dst0_r, dst1_r, dst0_l, dst1_l);
+
+        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
+        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+/*
+ * Vertical 4-tap bi-prediction filter for a 24-column block.
+ *
+ * src0_ptr, src_stride   8-bit source rows; the pointer is moved up by one
+ *                        row before the first load.
+ * src1_ptr, src2_stride  16-bit samples that are merged with the filtered
+ *                        result by HEVC_BI_RND_CLIP*; the stride is counted
+ *                        in int16_t elements.
+ * dst, dst_stride        8-bit output rows.
+ * filter                 filter taps, read with LD_SH and splatted into
+ *                        filt0 / filt1.
+ * height                 the loop runs (height >> 2) times and writes four
+ *                        rows per pass, so height % 4 trailing rows are not
+ *                        produced.
+ *
+ * Each row is handled as a 16-column part (right and left interleaves,
+ * whole-vector stores) and an 8-column part at byte offset 16 (right
+ * interleave only, stored with ST8x2_UB).
+ *
+ * NOTE(review): LD_*, ILV*, DPADD_*, HEVC_BI_RND_CLIP*, PCKEV_* and ST* are
+ * macros defined outside this function; what is said about them here is
+ * inferred from their names and arguments - confirm against their
+ * definitions.
+ */
+static void hevc_vt_bi_4t_24w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src6, src7, src8, src9, src10, src11;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v16i8 src10_l, src32_l, src21_l, src43_l;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v8i16 dst0_l, dst1_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    /* step back one source row before the first load */
+    src0_ptr -= src_stride;
+    /* 128 << 6 in every halfword: start value of each accumulator */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    /* 16width: prime with three rows, interleaved as the row pairs
+     * (src1, src0) and (src2, src1) */
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+    /* 8width: same three rows, 16 bytes further right */
+    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src6, src7, src8);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        /* 16width: two new source rows and the 16-bit samples of two rows
+         * for all 24 columns (offsets 0, 8 and 16 in int16_t units) */
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
+        LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+        /* 8width */
+        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
+        src0_ptr += (2 * src_stride);
+        XORI_B2_128_SB(src9, src10);
+        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+        /* 16width: older row pair with filt0, newer row pair with filt1 */
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
+        /* 8width */
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
+        /* 16width: merge with the 16-bit samples (shift argument 7) */
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
+                          dst0_r, dst1_r, dst0_l, dst1_l);
+
+        HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r);
+
+        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
+        dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
+        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
+        ST8x2_UB(dst2_r, dst + 16, dst_stride);
+        dst += (2 * dst_stride);
+
+        /* 16width: second pair of rows in this pass. The new interleaves
+         * are written back into src10_x / src21_x (and src76_r / src87_r),
+         * so the register roles swap instead of being copied. */
+        LD_SB2(src0_ptr, src_stride, src5, src2);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
+        LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
+        /* 8width */
+        LD_SB2(src0_ptr + 16, src_stride, src11, src8);
+        src0_ptr += (2 * src_stride);
+        XORI_B2_128_SB(src11, src8);
+        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
+        /* 16width: src32 / src43 are now the older row pairs */
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
+        /* 8width */
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
+
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
+                          dst0_r, dst1_r, dst0_l, dst1_l);
+        HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r);
+
+        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
+        dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
+        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
+        ST8x2_UB(dst2_r, dst + 16, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+/*
+ * Vertical 4-tap bi-prediction filter for a 32-column block.
+ *
+ * src0_ptr, src_stride   8-bit source rows; the pointer is moved up by one
+ *                        row before the first load.
+ * src1_ptr, src2_stride  16-bit samples that are merged with the filtered
+ *                        result by HEVC_BI_RND_CLIP4; the stride is counted
+ *                        in int16_t elements.
+ * dst, dst_stride        8-bit output rows.
+ * filter                 filter taps, read with LD_SH and splatted into
+ *                        filt0 / filt1.
+ * height                 the loop runs (height >> 1) times and writes two
+ *                        rows per pass, so an odd trailing row is not
+ *                        produced.
+ *
+ * The left 16 columns are stored through dst and the right 16 columns
+ * through dst_tmp; each half keeps its own window of interleaved rows that
+ * is slid forward with explicit copies at the end of its section.
+ *
+ * NOTE(review): LD_*, ILV*, DPADD_*, HEVC_BI_RND_CLIP4, PCKEV_* and ST* are
+ * macros defined outside this function; what is said about them here is
+ * inferred from their names and arguments - confirm against their
+ * definitions.
+ */
+static void hevc_vt_bi_4t_32w_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    uint8_t *dst_tmp = dst + 16;   /* output pointer for the right half */
+    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v16i8 src10_l, src32_l, src76_l, src98_l;
+    v16i8 src21_l, src43_l, src87_l, src109_l;
+    v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    /* step back one source row before the first load */
+    src0_ptr -= src_stride;
+    /* 128 << 6 in every halfword: start value of each accumulator */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    /* 16width: prime with three rows, interleaved as the row pairs
+     * (src1, src0) and (src2, src1) */
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    /* next 16width: same three rows, 16 bytes further right */
+    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src6, src7, src8);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        /* 16width: two new source rows and the 16-bit samples of two rows
+         * for all 32 columns (offsets 0, 8, 16 and 24 in int16_t units) */
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
+        LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
+        LD_SH2((src1_ptr + 24), src2_stride, in6, in7);
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+        /* 16width: older row pair with filt0, newer row pair with filt1 */
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
+        /* 16width: merge with the 16-bit samples (shift argument 7) */
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
+                          dst0_r, dst1_r, dst0_l, dst1_l);
+        /* slide the left-half row window for the next pass */
+        src10_r = src32_r;
+        src21_r = src43_r;
+        src10_l = src32_l;
+        src21_l = src43_l;
+        src2 = src4;
+
+        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
+        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        /* next 16width: the same two new rows, 16 bytes further right */
+        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
+        src0_ptr += (2 * src_stride);
+        XORI_B2_128_SB(src9, src10);
+        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
+        /* next 16width */
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
+        dst2_l = const_vec;
+        DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
+        dst3_l = const_vec;
+        DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);
+        /* next 16width */
+        HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
+                          dst2_r, dst3_r, dst2_l, dst3_l, 7,
+                          dst2_r, dst3_r, dst2_l, dst3_l);
+
+        PCKEV_B2_SH(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);
+        ST_SH2(dst2_r, dst3_r, dst_tmp, dst_stride);
+        dst_tmp += (2 * dst_stride);
+        /* slide the right-half row window for the next pass */
+        src76_r = src98_r;
+        src87_r = src109_r;
+        src76_l = src98_l;
+        src87_l = src109_l;
+        src8 = src10;
+    }
+}
+/*
+ * Horizontal + vertical 4-tap bi-prediction filter for one 4x2 block.
+ *
+ * src0_ptr, src_stride   8-bit source; the pointer is moved one row up and
+ *                        one byte left before the first load. Five rows are
+ *                        read in total (three, then two).
+ * src1_ptr, src2_stride  two rows of 16-bit samples added to the filtered
+ *                        result; the stride is counted in int16_t elements.
+ * dst, dst_stride        8-bit output, written with ST4x2_UB.
+ * filter_x               taps of the first pass, applied to bytes within a
+ *                        row.
+ * filter_y               taps of the second pass, applied across the
+ *                        16-bit row results.
+ * height                 not read by this function.
+ *
+ * NOTE(review): LD_*, VSHF_*, DPADD_*, HEVC_FILT_4TAP, CLIP_SH_0_255 and
+ * ST4x2_UB are macros defined outside this function; what is said about
+ * them here is inferred from their names and arguments - confirm against
+ * their definitions.
+ */
+static void hevc_hv_bi_4t_4x2_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    v8i16 in0, in1;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4;
+    v4i32 dst0_r, dst1_r;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    /* start one row above and one byte left of the given position */
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* filter_y: pair every tap byte with its "< 0" mask byte, which widens
+     * the taps to signed 16 bits, then splat words 0 and 1 */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+    /* mask1 picks the same neighbouring-byte pairs as mask0, two bytes on */
+    mask1 = mask0 + 2;
+    /* 128 << 6 in every halfword: start value of each first-pass sum */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    /* first pass over rows 0..2; results interleaved as row pairs */
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+    /* in0 ends up holding the low 64 bits of both 16-bit sample rows */
+    LD_SB2(src0_ptr, src_stride, src3, src4);
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
+    XORI_B2_128_SB(src3, src4);
+    /* row 3: first pass, then second pass over rows 0..3 */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+    dst32_r = __msa_ilvr_h(dst3, dst2);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_r >>= 6;
+    /* row 4: first pass, then second pass over rows 1..4 */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+    dst43_r = __msa_ilvr_h(dst4, dst3);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_r >>= 6;  /* below: pack to 16 bit, add in0, round by 7, clip */
+    dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
+    dst0_r = (v4i32) __msa_adds_s_h((v8i16) dst0_r, in0);
+    dst0_r = (v4i32) __msa_srari_h((v8i16) dst0_r, 7);
+    dst0_r = (v4i32) CLIP_SH_0_255(dst0_r);
+
+    dst0_r = (v4i32) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
+    ST4x2_UB(dst0_r, dst, dst_stride);
+}
+
+/*
+ * Horizontal + vertical 4-tap bi-prediction filter for one 4x4 block.
+ *
+ * src0_ptr, src_stride   8-bit source; the pointer is moved one row up and
+ *                        one byte left before the first load. Seven rows
+ *                        are read in total (three, then four).
+ * src1_ptr, src2_stride  four rows of 16-bit samples merged with the
+ *                        filtered result by HEVC_BI_RND_CLIP2; the stride
+ *                        is counted in int16_t elements.
+ * dst, dst_stride        8-bit output, written with ST4x4_UB.
+ * filter_x               taps of the first pass, applied to bytes within a
+ *                        row.
+ * filter_y               taps of the second pass, applied across the
+ *                        16-bit row results.
+ * height                 not read by this function.
+ *
+ * NOTE(review): LD_*, VSHF_*, DPADD_*, HEVC_FILT_4TAP, HEVC_BI_RND_CLIP2 and
+ * ST4x4_UB are macros defined outside this function; what is said about
+ * them here is inferred from their names and arguments - confirm against
+ * their definitions.
+ */
+static void hevc_hv_bi_4t_4x4_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    v8i16 in0, in1, in2, in3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 dst0_r, dst1_r;
+    v4i32 tmp0, tmp1, tmp2, tmp3;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+
+    /* start one row above and one byte left of the given position */
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    /* filter_y: pair every tap byte with its "< 0" mask byte, which widens
+     * the taps to signed 16 bits, then splat words 0 and 1 */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    /* mask1 picks the same neighbouring-byte pairs as mask0, two bytes on */
+    mask1 = mask0 + 2;
+
+    /* 128 << 6 in every halfword: start value of each first-pass sum */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    /* first pass over rows 0..2; results interleaved as row pairs */
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+
+    /* in0 / in1 end up holding the low 64 bits of sample rows 0+1 / 2+3 */
+    LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
+    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+    XORI_B4_128_SB(src3, src4, src5, src6);
+    /* row 3 */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+    dst32_r = __msa_ilvr_h(dst3, dst2);
+    tmp0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    tmp0 >>= 6;
+    /* row 4 */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+    dst43_r = __msa_ilvr_h(dst4, dst3);
+    tmp1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    tmp1 >>= 6;
+    /* row 5: dst10_r is reused for the row pair (5, 4) */
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+    dst5 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+    dst10_r = __msa_ilvr_h(dst5, dst4);
+    tmp2 = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+    tmp2 >>= 6;
+    /* row 6: dst2 / dst21_r are reused for row 6 and the pair (6, 5) */
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+    dst21_r = __msa_ilvr_h(dst2, dst5);
+    tmp3 = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+    tmp3 >>= 6;
+    PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r);
+    HEVC_BI_RND_CLIP2(in0, in1, dst0_r, dst1_r, 7, dst0_r, dst1_r);
+
+    /* dst is not advanced after the store: it is a by-value parameter and
+     * nothing reads it afterwards */
+    dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
+    ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
+}
+/*
+ * Horizontal + vertical 4-tap bi-prediction filter for 4-column blocks,
+ * eight rows per loop pass.
+ *
+ * src0_ptr, src_stride   8-bit source; the pointer is moved one row up and
+ *                        one byte left before the first load.
+ * src1_ptr, src2_stride  16-bit samples merged with the filtered result by
+ *                        HEVC_BI_RND_CLIP4; the stride is counted in
+ *                        int16_t elements.
+ * dst, dst_stride        8-bit output, written with ST4x8_UB.
+ * filter_x               taps of the first pass, applied to bytes within a
+ *                        row.
+ * filter_y               taps of the second pass, applied across the
+ *                        16-bit row results.
+ * height                 the loop runs (height >> 3) times, so height % 8
+ *                        trailing rows are not produced.
+ *
+ * The last three first-pass rows of a pass are carried into the next one
+ * through dst10_r, dst21_r and dst2.
+ *
+ * NOTE(review): LD_*, VSHF_*, DPADD_*, HEVC_FILT_4TAP, HEVC_BI_RND_CLIP4,
+ * PCKEV_* and ST4x8_UB are macros defined outside this function; what is
+ * said about them here is inferred from their names and arguments -
+ * confirm against their definitions.
+ */
+static void hevc_hv_bi_4t_4multx8mult_msa(uint8_t *src0_ptr,
+                                          int32_t src_stride,
+                                          int16_t *src1_ptr,
+                                          int32_t src2_stride,
+                                          uint8_t *dst,
+                                          int32_t dst_stride,
+                                          const int8_t *filter_x,
+                                          const int8_t *filter_y,
+                                          int32_t height)
+{
+    uint32_t loop_cnt;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
+    /* start one row above and one byte left of the given position */
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* filter_y: pair every tap byte with its "< 0" mask byte, which widens
+     * the taps to signed 16 bits, then splat words 0 and 1 */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+    /* mask1 picks the same neighbouring-byte pairs as mask0, two bytes on */
+    mask1 = mask0 + 2;
+    /* 128 << 6 in every halfword: start value of each first-pass sum */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    /* first pass over rows 0..2; results interleaved as row pairs */
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+
+    for (loop_cnt = height >> 3; loop_cnt--;) {
+        LD_SB8(src0_ptr, src_stride,
+               src3, src4, src5, src6, src7, src8, src9, src10);
+        src0_ptr += (8 * src_stride);
+        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
+        src1_ptr += (8 * src2_stride);
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);  /* two sample rows per */
+        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);  /* vector: in0..in3    */
+        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+        /* row 3 */
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        dst32_r = __msa_ilvr_h(dst3, dst2);
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_r >>= 6;
+        /* row 4 */
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+        dst43_r = __msa_ilvr_h(dst4, dst3);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_r >>= 6;
+        /* row 5 */
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+        dst54_r = __msa_ilvr_h(dst5, dst4);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst2_r >>= 6;
+        /* row 6 */
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst6 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+        dst65_r = __msa_ilvr_h(dst6, dst5);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst3_r >>= 6;
+        /* row 7 */
+        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+        dst7 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+        dst76_r = __msa_ilvr_h(dst7, dst6);
+        dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+        dst4_r >>= 6;
+        /* row 8 */
+        VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
+        dst8 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
+        dst87_r = __msa_ilvr_h(dst8, dst7);
+        dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+        dst5_r >>= 6;
+        /* row 9: the pair (9, 8) goes into dst10_r, which the next pass
+         * reads as its oldest row pair */
+        VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
+        dst9 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9);
+        dst10_r = __msa_ilvr_h(dst9, dst8);
+        dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1);
+        dst6_r >>= 6;
+        /* row 10: stored in dst2 and, paired with row 9, in dst21_r; both
+         * are read again at the start of the next pass */
+        VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        dst21_r = __msa_ilvr_h(dst2, dst9);
+        dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1);
+        dst7_r >>= 6;
+        PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
+                    dst5_r, dst4_r, dst7_r, dst6_r, tmp0, tmp1, tmp2, tmp3);
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
+
+        PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+        ST4x8_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+/*
+ * Height dispatcher for the 4-column horizontal + vertical 4-tap
+ * bi-prediction filter.
+ *
+ * A height of 2 goes to the 4x2 kernel, a height of 4 to the 4x4 kernel and
+ * any other height that is a multiple of 8 to the 8-rows-per-pass kernel.
+ * All remaining heights return without writing anything to dst.
+ * Every argument is forwarded unchanged.
+ */
+static void hevc_hv_bi_4t_4w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter_x,
+                                 const int8_t *filter_y,
+                                 int32_t height)
+{
+    switch (height) {
+    case 2:
+        hevc_hv_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter_x, filter_y, height);
+        break;
+    case 4:
+        hevc_hv_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter_x, filter_y, height);
+        break;
+    default:
+        /* heights that are not a multiple of 8 are left untouched */
+        if (height % 8)
+            break;
+        hevc_hv_bi_4t_4multx8mult_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride,
+                                      filter_x, filter_y, height);
+        break;
+    }
+}
+/*
+ * Horizontal + vertical 4-tap bi-prediction filter for 6-column blocks,
+ * four rows per loop pass.
+ *
+ * src0_ptr, src_stride   8-bit source; the pointer is moved one row up and
+ *                        one byte left before the first load.
+ * src1_ptr, src2_stride  16-bit samples merged with the filtered result by
+ *                        HEVC_BI_RND_CLIP4; the stride is counted in
+ *                        int16_t elements.
+ * dst, dst_stride        8-bit output, written with ST6x4_UB.
+ * filter_x               taps of the first pass, applied to bytes within a
+ *                        row.
+ * filter_y               taps of the second pass, applied across the
+ *                        16-bit row results.
+ * height                 the loop runs (height >> 2) times, so height % 4
+ *                        trailing rows are not produced.
+ *
+ * Both the right and the left interleave of every row pair are filtered,
+ * i.e. eight columns are computed per row; ST6x4_UB presumably stores six
+ * of them - TODO confirm.
+ *
+ * NOTE(review): LD_*, VSHF_*, DPADD_*, ILVRL_*, HEVC_FILT_4TAP,
+ * HEVC_BI_RND_CLIP4, PCKEV_* and ST6x4_UB are macros defined outside this
+ * function; what is said about them here is inferred from their names and
+ * arguments - confirm against their definitions.
+ */
+static void hevc_hv_bi_4t_6w_msa(uint8_t *src0_ptr,
+                                 int32_t src_stride,
+                                 int16_t *src1_ptr,
+                                 int32_t src2_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride,
+                                 const int8_t *filter_x,
+                                 const int8_t *filter_y,
+                                 int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    /* start one row above and one byte left of the given position */
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* filter_y: pair every tap byte with its "< 0" mask byte, which widens
+     * the taps to signed 16 bits, then splat words 0 and 1 */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+    /* mask1 picks the same neighbouring-byte pairs as mask0, two bytes on */
+    mask1 = mask0 + 2;
+    /* 128 << 6 in every halfword: start value of each first-pass sum */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    /* first pass over rows 0..2 */
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    /* row pairs (1, 0) and (2, 1), right and left halves */
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+    for (loop_cnt = height >> 2; loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B4_128_SB(src3, src4, src5, src6);
+        /* row 3 */
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+        dst0_r >>= 6;
+        dst0_l >>= 6;
+        /* row 4 */
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+        dst1_r >>= 6;
+        dst1_l >>= 6;
+        /* row 5: dst10_x is reused for the row pair (5, 4) */
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+        ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+        dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
+        dst2_r >>= 6;
+        dst2_l >>= 6;
+        /* row 6: dst2 / dst21_x are reused and carried into the next pass */
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+        dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
+        dst3_r >>= 6;
+        dst3_l >>= 6;
+        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
+                    dst2_l, dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
+        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                          tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
+
+        PCKEV_B2_SW(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r);
+        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+/*
+ * Horizontal + vertical 4-tap bi-prediction filter for one 8x2 block.
+ *
+ * src0_ptr, src_stride   8-bit source; the pointer is moved one row up and
+ *                        one byte left before the first load. Five rows are
+ *                        read in total (three, then two).
+ * src1_ptr, src2_stride  two rows of 16-bit samples merged with the
+ *                        filtered result by HEVC_BI_RND_CLIP2; the stride
+ *                        is counted in int16_t elements.
+ * dst, dst_stride        8-bit output, written with ST8x2_UB.
+ * filter_x               taps of the first pass, applied to bytes within a
+ *                        row.
+ * filter_y               taps of the second pass, applied across the
+ *                        16-bit row results.
+ * height                 not read by this function.
+ *
+ * NOTE(review): LD_*, VSHF_*, DPADD_*, ILVRL_*, HEVC_FILT_4TAP,
+ * HEVC_BI_RND_CLIP2, PCKEV_* and ST8x2_UB are macros defined outside this
+ * function; what is said about them here is inferred from their names and
+ * arguments - confirm against their definitions.
+ */
+static void hevc_hv_bi_4t_8x2_msa(uint8_t *src0_ptr,
+                                  int32_t src_stride,
+                                  int16_t *src1_ptr,
+                                  int32_t src2_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    v8i16 tmp0, tmp1;
+    v8i16 in0, in1;
+    /* start one row above and one byte left of the given position */
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* filter_y: pair every tap byte with its "< 0" mask byte, which widens
+     * the taps to signed 16 bits, then splat words 0 and 1 */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+    /* mask1 picks the same neighbouring-byte pairs as mask0, two bytes on */
+    mask1 = mask0 + 2;
+    /* 128 << 6 in every halfword: start value of each first-pass sum */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    /* first pass over rows 0..2 */
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    /* row pairs (1, 0) and (2, 1), right and left halves */
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+    /* rows 3 and 4 plus the two rows of 16-bit samples */
+    LD_SB2(src0_ptr, src_stride, src3, src4);
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    XORI_B2_128_SB(src3, src4);
+    /* row 3: first pass, then second pass over rows 0..3 */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+    dst0_r >>= 6;
+    dst0_l >>= 6;
+    /* row 4: first pass, then second pass over rows 1..4 */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    dst1_r >>= 6;
+    dst1_l >>= 6;
+    /* narrow to 16 bits, merge with the samples (shift argument 7) */
+    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
+    HEVC_BI_RND_CLIP2(in0, in1, tmp0, tmp1, 7, tmp0, tmp1);
+
+    dst0_r = (v4i32) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+    ST8x2_UB(dst0_r, dst, dst_stride);
+}
+
+static void hevc_hv_bi_4t_8x6_msa(uint8_t *src0_ptr,      /* 8-bit source */
+                                  int32_t src_stride,     /* src0 row step */
+                                  int16_t *src1_ptr,      /* 16-bit source */
+                                  int32_t src2_stride,    /* src1 row step */
+                                  uint8_t *dst,           /* 8-bit output */
+                                  int32_t dst_stride,     /* dst row step */
+                                  const int8_t *filter_x, /* horizontal taps */
+                                  const int8_t *filter_y, /* vertical taps */
+                                  int32_t height)         /* not read here */
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
+    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
+    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
+    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
+    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
+    /* Fixed 8x6 block: 9 source rows are filtered into 6 output rows. */
+    src0_ptr -= (src_stride + 1);
+    /* filt0/filt1: horizontal taps splatted from filter_x (indices 0, 1) */
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* widen filter_y bytes by interleaving with their (byte < 0) mask */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+    /* filt_h0/filt_h1: vertical taps consumed by HEVC_FILT_4TAP below */
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+    /* mask1 picks the same byte pairs as mask0, two bytes further on */
+    mask1 = mask0 + 2;
+    /* const_vec lanes = 128 << 6; presumably offsets the XOR-128 bias */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    /* rows 0-2: load, XOR with 128, horizontal filter into dst0..dst2 */
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+    /* rows 3-4, plus six rows of the 16-bit second source (in0..in5) */
+    LD_SB2(src0_ptr, src_stride, src3, src4);
+    src0_ptr += (2 * src_stride);
+    XORI_B2_128_SB(src3, src4);
+    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+    dst0_r >>= 6;
+    dst0_l >>= 6;
+    tmp0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
+    /* row 4 */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    dst1_r >>= 6;
+    dst1_l >>= 6;
+    tmp1 = __msa_pckev_h((v8i16) dst1_l, (v8i16) dst1_r);
+
+    LD_SB2(src0_ptr, src_stride, src5, src6);
+    src0_ptr += (2 * src_stride);
+    XORI_B2_128_SB(src5, src6);
+    /* row 5 */
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+    dst5 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
+    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+    dst2_r >>= 6;
+    dst2_l >>= 6;
+    tmp2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
+
+    /* row 6 */
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+    dst6 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+
+    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
+    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+    dst3_r >>= 6;
+    dst3_l >>= 6;
+    tmp3 = __msa_pckev_h((v8i16) dst3_l, (v8i16) dst3_r);
+
+    LD_SB2(src0_ptr, src_stride, src7, src8);
+    XORI_B2_128_SB(src7, src8);
+    /* row 7 */
+    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+    dst7 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+
+    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
+
+    dst4_r >>= 6;
+    dst4_l >>= 6;
+    tmp4 = __msa_pckev_h((v8i16) dst4_l, (v8i16) dst4_r);
+    /* row 8 */
+    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
+    dst8 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
+
+    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
+    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
+    dst5_r >>= 6;
+    dst5_l >>= 6;
+    tmp5 = __msa_pckev_h((v8i16) dst5_l, (v8i16) dst5_r);
+    /* combine with in0..in5 (rounding value 7), then pack and store 4+2 rows */
+    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                      tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
+    HEVC_BI_RND_CLIP2(in4, in5, tmp4, tmp5, 7, tmp4, tmp5);
+
+    PCKEV_B2_SW(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r);
+    dst2_r = (v4i32) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
+    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x2_UB(dst2_r, dst, dst_stride);
+}
+
+static void hevc_hv_bi_4t_8multx4mult_msa(uint8_t *src0_ptr,
+                                          int32_t src_stride,
+                                          int16_t *src1_ptr,
+                                          int32_t src2_stride,
+                                          uint8_t *dst,
+                                          int32_t dst_stride,
+                                          const int8_t *filter_x,
+                                          const int8_t *filter_y,
+                                          int32_t height, /* 4 rows per pass */
+                                          int32_t width)  /* 8 cols per strip */
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src0_ptr_tmp;
+    int16_t *src1_ptr_tmp;
+    uint8_t *dst_tmp;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    /* Generic kernel: 8-column strips outside, 4-row groups inside. */
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* widen filter_y bytes by interleaving with their (byte < 0) mask */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    /* width >> 3 strips; any remainder below 8 columns is not processed */
+    for (cnt = width >> 3; cnt--;) {
+        src0_ptr_tmp = src0_ptr;
+        dst_tmp = dst;
+        src1_ptr_tmp = src1_ptr;
+        /* prime the vertical window with three filtered rows */
+        LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
+        src0_ptr_tmp += (3 * src_stride);
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+        /* height >> 2 passes, each producing four output rows */
+        for (loop_cnt = height >> 2; loop_cnt--;) {
+            LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
+            src0_ptr_tmp += (4 * src_stride);
+            LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
+            src1_ptr_tmp += (4 * src2_stride);
+            XORI_B4_128_SB(src3, src4, src5, src6);
+
+            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+            dst3 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+            dst0_r >>= 6;
+            dst0_l >>= 6;
+
+            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+            dst4 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+            dst1_r >>= 6;
+            dst1_l >>= 6;
+            /* dst10/dst21/dst2 are refilled below for the next pass */
+            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+            dst5 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+            ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
+            dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+            dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
+            dst2_r >>= 6;
+            dst2_l >>= 6;
+
+            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+            dst2 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+            ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
+            dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+            dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
+            dst3_r >>= 6;
+            dst3_l >>= 6;
+
+            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
+            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
+                              tmp0, tmp1, tmp2, tmp3, 7,
+                              tmp0, tmp1, tmp2, tmp3);
+
+            PCKEV_B2_SW(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r);
+            ST8x4_UB(dst0_r, dst1_r, dst_tmp, dst_stride);
+            dst_tmp += (4 * dst_stride);
+        }
+        /* step all three pointers to the next 8-column strip */
+        src0_ptr += 8;
+        dst += 8;
+        src1_ptr += 8;
+    }
+}
+
+/* 8-wide 4-tap HV bi-pred: heights 2 and 6 have own kernels, rest loop. */
+static void hevc_hv_bi_4t_8w_msa(uint8_t *src0_ptr, int32_t src_stride,
+                                 int16_t *src1_ptr, int32_t src2_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter_x,
+                                 const int8_t *filter_y, int32_t height)
+{
+    switch (height) {
+    case 2:
+        hevc_hv_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter_x, filter_y, height);
+        break;
+    case 6:
+        hevc_hv_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                              dst, dst_stride, filter_x, filter_y, height);
+        break;
+    default:
+        hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr,
+                                      src2_stride, dst, dst_stride,
+                                      filter_x, filter_y, height, 8);
+        break;
+    }
+}
+
+/* 12 columns: 8 via the 8-multiple kernel, the last 4 via the 4-wide one. */
+static void hevc_hv_bi_4t_12w_msa(uint8_t *src0_ptr, int32_t src_stride,
+                                  int16_t *src1_ptr, int32_t src2_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y, int32_t height)
+{
+    const int32_t split = 8; /* column where the 4-wide kernel takes over */
+
+    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                  dst, dst_stride, filter_x, filter_y,
+                                  height, split);
+    hevc_hv_bi_4t_4w_msa(src0_ptr + split, src_stride, src1_ptr + split,
+                         src2_stride, dst + split, dst_stride,
+                         filter_x, filter_y, height);
+}
+
+/*
+ * 16-wide 4-tap HV bi-pred: two 8-column strips via the 8-multiple kernel.
+ */
+static void hevc_hv_bi_4t_16w_msa(uint8_t *src0_ptr, int32_t src_stride,
+                                  int16_t *src1_ptr, int32_t src2_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y, int32_t height)
+{
+    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride,
+                                  src1_ptr, src2_stride,
+                                  dst, dst_stride,
+                                  filter_x, filter_y, height, 16);
+}
+
+/*
+ * 24-wide 4-tap HV bi-pred: three 8-column strips via the 8-multiple kernel.
+ */
+static void hevc_hv_bi_4t_24w_msa(uint8_t *src0_ptr, int32_t src_stride,
+                                  int16_t *src1_ptr, int32_t src2_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y, int32_t height)
+{
+    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride,
+                                  src1_ptr, src2_stride,
+                                  dst, dst_stride,
+                                  filter_x, filter_y, height, 24);
+}
+
+/*
+ * 32-wide 4-tap HV bi-pred: four 8-column strips via the 8-multiple kernel.
+ */
+static void hevc_hv_bi_4t_32w_msa(uint8_t *src0_ptr, int32_t src_stride,
+                                  int16_t *src1_ptr, int32_t src2_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y, int32_t height)
+{
+    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride,
+                                  src1_ptr, src2_stride,
+                                  dst, dst_stride,
+                                  filter_x, filter_y, height, 32);
+}
+
+#define BI_MC_COPY(WIDTH)                                                 \
+void ff_hevc_put_hevc_bi_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
+                                                   ptrdiff_t dst_stride,  \
+                                                   uint8_t *src,          \
+                                                   ptrdiff_t src_stride,  \
+                                                   int16_t *src_16bit,    \
+                                                   int height,            \
+                                                   intptr_t mx,           \
+                                                   intptr_t my,           \
+                                                   int width)             \
+{   /* mx, my and width are not forwarded to the kernel */                \
+    hevc_bi_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE,  \
+                                dst, dst_stride, height);                 \
+}
+/* Instantiate one exported wrapper per supported block width. */
+BI_MC_COPY(4);
+BI_MC_COPY(6);
+BI_MC_COPY(8);
+BI_MC_COPY(12);
+BI_MC_COPY(16);
+BI_MC_COPY(24);
+BI_MC_COPY(32);
+BI_MC_COPY(48);
+BI_MC_COPY(64);
+
+#undef BI_MC_COPY
+
+#define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                            \
+void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,            \
+                                                        ptrdiff_t dst_stride,  \
+                                                        uint8_t *src,          \
+                                                        ptrdiff_t src_stride,  \
+                                                        int16_t *src_16bit,    \
+                                                        int height,            \
+                                                        intptr_t mx,           \
+                                                        intptr_t my,           \
+                                                        int width)             \
+{                                                                              \
+    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];              \
+    /* FILT_DIR: mx or my; (FILT_DIR - 1) selects the filter table row */      \
+    hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,       \
+                                             MAX_PB_SIZE, dst, dst_stride,     \
+                                             filter, height);                  \
+}
+/* 8-tap qpel, horizontal kernels: filter row chosen by mx */
+BI_MC(qpel, h, 4, 8, hz, mx);
+BI_MC(qpel, h, 8, 8, hz, mx);
+BI_MC(qpel, h, 12, 8, hz, mx);
+BI_MC(qpel, h, 16, 8, hz, mx);
+BI_MC(qpel, h, 24, 8, hz, mx);
+BI_MC(qpel, h, 32, 8, hz, mx);
+BI_MC(qpel, h, 48, 8, hz, mx);
+BI_MC(qpel, h, 64, 8, hz, mx);
+/* 8-tap qpel, vertical kernels: filter row chosen by my */
+BI_MC(qpel, v, 4, 8, vt, my);
+BI_MC(qpel, v, 8, 8, vt, my);
+BI_MC(qpel, v, 12, 8, vt, my);
+BI_MC(qpel, v, 16, 8, vt, my);
+BI_MC(qpel, v, 24, 8, vt, my);
+BI_MC(qpel, v, 32, 8, vt, my);
+BI_MC(qpel, v, 48, 8, vt, my);
+BI_MC(qpel, v, 64, 8, vt, my);
+/* 4-tap epel, horizontal kernels: filter row chosen by mx */
+BI_MC(epel, h, 4, 4, hz, mx);
+BI_MC(epel, h, 8, 4, hz, mx);
+BI_MC(epel, h, 6, 4, hz, mx);
+BI_MC(epel, h, 12, 4, hz, mx);
+BI_MC(epel, h, 16, 4, hz, mx);
+BI_MC(epel, h, 24, 4, hz, mx);
+BI_MC(epel, h, 32, 4, hz, mx);
+/* 4-tap epel, vertical kernels: filter row chosen by my */
+BI_MC(epel, v, 4, 4, vt, my);
+BI_MC(epel, v, 8, 4, vt, my);
+BI_MC(epel, v, 6, 4, vt, my);
+BI_MC(epel, v, 12, 4, vt, my);
+BI_MC(epel, v, 16, 4, vt, my);
+BI_MC(epel, v, 24, 4, vt, my);
+BI_MC(epel, v, 32, 4, vt, my);
+
+#undef BI_MC
+
+#define BI_MC_HV(PEL, DIR, WIDTH, TAP, DIR1)                                   \
+void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,            \
+                                                        ptrdiff_t dst_stride,  \
+                                                        uint8_t *src,          \
+                                                        ptrdiff_t src_stride,  \
+                                                        int16_t *src_16bit,    \
+                                                        int height,            \
+                                                        intptr_t mx,           \
+                                                        intptr_t my,           \
+                                                        int width)             \
+{                                                                              \
+    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];                  \
+    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];                  \
+    /* mx - 1 and my - 1 select the horizontal and vertical filters */         \
+    hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,       \
+                                             MAX_PB_SIZE, dst, dst_stride,     \
+                                             filter_x, filter_y,               \
+                                             height);                          \
+}
+/* 8-tap qpel, combined horizontal + vertical kernels */
+BI_MC_HV(qpel, hv, 4, 8, hv);
+BI_MC_HV(qpel, hv, 8, 8, hv);
+BI_MC_HV(qpel, hv, 12, 8, hv);
+BI_MC_HV(qpel, hv, 16, 8, hv);
+BI_MC_HV(qpel, hv, 24, 8, hv);
+BI_MC_HV(qpel, hv, 32, 8, hv);
+BI_MC_HV(qpel, hv, 48, 8, hv);
+BI_MC_HV(qpel, hv, 64, 8, hv);
+/* 4-tap epel, combined horizontal + vertical kernels */
+BI_MC_HV(epel, hv, 4, 4, hv);
+BI_MC_HV(epel, hv, 8, 4, hv);
+BI_MC_HV(epel, hv, 6, 4, hv);
+BI_MC_HV(epel, hv, 12, 4, hv);
+BI_MC_HV(epel, hv, 16, 4, hv);
+BI_MC_HV(epel, hv, 24, 4, hv);
+BI_MC_HV(epel, hv, 32, 4, hv);
+
+#undef BI_MC_HV
diff --git a/libavcodec/mips/hevc_mc_biw_msa.c b/libavcodec/mips/hevc_mc_biw_msa.c
new file mode 100644
index 0000000..05a28ec
--- /dev/null
+++ b/libavcodec/mips/hevc_mc_biw_msa.c
@@ -0,0 +1,5572 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "libavcodec/mips/hevcdsp_mips.h"
+#include "libavcodec/mips/hevc_macros_msa.h"
+
+#define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset,  \
+                           out0_r, out1_r, out0_l, out1_l)          \
+{                                                                   \
+    ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r);               \
+    ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l);               \
+    /* offset + dot product of each interleaved pair with wgt */    \
+    out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt);  \
+    out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt);  \
+    out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt);  \
+    out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt);  \
+    /* rounding arithmetic right shift by the rnd vector */         \
+    SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                \
+    /* clamp every word lane to 0..255 */                           \
+    out0_r = CLIP_SW_0_255(out0_r);                                 \
+    out1_r = CLIP_SW_0_255(out1_r);                                 \
+    out0_l = CLIP_SW_0_255(out0_l);                                 \
+    out1_l = CLIP_SW_0_255(out1_l);                                 \
+}
+
+#define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3,  \
+                           wgt, rnd, offset,                            \
+                           out0_r, out1_r, out2_r, out3_r,              \
+                           out0_l, out1_l, out2_l, out3_l)              \
+{   /* HEVC_BIW_RND_CLIP2 applied to pairs 0-1, then pairs 2-3 */       \
+    HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset,          \
+                       out0_r, out1_r, out0_l, out1_l)                  \
+    HEVC_BIW_RND_CLIP2(in2, in3, vec2, vec3, wgt, rnd, offset,          \
+                       out2_r, out3_r, out2_l, out3_l)                  \
+}
+
+#define HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1)  \
+{   /* add in/vec pairs, round-shift by rnd_val, clip to 0..255 */    \
+    ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1);                    \
+    SRARI_H2_SH(out0, out1, rnd_val);                                 \
+    CLIP_SH2_0_255(out0, out1);                                       \
+}
+
+#define HEVC_BI_RND_CLIP4(in0, in1, in2, in3,                      \
+                          vec0, vec1, vec2, vec3, rnd_val,         \
+                          out0, out1, out2, out3)                  \
+{   /* HEVC_BI_RND_CLIP2 on pairs 0-1, then on pairs 2-3 */        \
+    HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1);  \
+    HEVC_BI_RND_CLIP2(in2, in3, vec2, vec3, rnd_val, out2, out3);  \
+}
+
+static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr,   /* 8-bit source */
+                                   int32_t src_stride,  /* src0 row step */
+                                   int16_t *src1_ptr,   /* 16-bit source */
+                                   int32_t src2_stride, /* src1 row step */
+                                   uint8_t *dst,        /* 8-bit output */
+                                   int32_t dst_stride,  /* dst row step */
+                                   int32_t height,      /* 2, 4 or 8 * k */
+                                   int32_t weight0,     /* low 16 bits of pair */
+                                   int32_t weight1,     /* high 16 bits of pair */
+                                   int32_t offset0,
+                                   int32_t offset1,     /* added to offset0 */
+                                   int32_t rnd_val)     /* shift = rnd_val + 1 */
+{
+    int32_t offset, weight;
+    v16i8 zero = { 0 };
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    /* 4-wide weighted bi-pred copy; any other height writes nothing. */
+    offset = (offset0 + offset1) * (1 << rnd_val);
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (int32_t) ((uint32_t) weight1 << 16);
+    /* multiply / unsigned shift: left-shifting a negative int is undefined */
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    if (2 == height) {
+        v16i8 src0, src1;
+        v8i16 in0, in1, dst0;
+        v4i32 dst0_r, dst0_l;
+        /* two rows: both rows of each source are merged into one vector */
+        LD_SB2(src0_ptr, src_stride, src0, src1);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
+        src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0);
+        /* widen bytes to halfwords (interleave with zero), scale by 64 */
+        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
+        dst0 <<= 6;
+
+        ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
+        dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r,
+                                 (v8i16) weight_vec);
+        dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l,
+                                 (v8i16) weight_vec);
+        SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
+        dst0_r = CLIP_SW_0_255(dst0_r);
+        dst0_l = CLIP_SW_0_255(dst0_l);
+
+        HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r);
+        ST4x2_UB(dst0_r, dst, dst_stride);
+    } else if (4 == height) {
+        v16i8 src0, src1, src2, src3;
+        v8i16 in0, in1, in2, in3;
+        v8i16 dst0, dst1;
+        v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
+        /* four rows: row pairs are merged before widening */
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+        ILVR_W2_SB(src1, src0, src3, src2, src0, src1);
+        ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1);
+        dst0 <<= 6;
+        dst1 <<= 6;
+        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst0_l, dst1_l);
+
+        HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+        ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
+    } else if (0 == height % 8) {
+        uint32_t loop_cnt;
+        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+        v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+        v8i16 dst0, dst1, dst2, dst3;
+        v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+        /* eight rows are consumed and stored per iteration */
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD_SB8(src0_ptr, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src0_ptr += (8 * src_stride);
+            LD_SH8(src1_ptr, src2_stride,
+                   in0, in1, in2, in3, in4, in5, in6, in7);
+            src1_ptr += (8 * src2_stride);
+
+            ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+            ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+            ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
+                       src0, src1, src2, src3);
+            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       dst0, dst1, dst2, dst3);
+
+            SLLI_4V(dst0, dst1, dst2, dst3, 6);
+            HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                               in0, in1, in2, in3,
+                               weight_vec, rnd_vec, offset_vec,
+                               dst0_r, dst1_r, dst2_r, dst3_r,
+                               dst0_l, dst1_l, dst2_l, dst3_l);
+
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                            dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+            ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
+            dst += (8 * dst_stride);
+        }
+    }
+}
+
+static void hevc_biwgt_copy_6w_msa(uint8_t *src0_ptr,   /* 8-bit source */
+                                   int32_t src_stride,  /* src0 row step */
+                                   int16_t *src1_ptr,   /* 16-bit source */
+                                   int32_t src2_stride, /* src1 row step */
+                                   uint8_t *dst,        /* 8-bit output */
+                                   int32_t dst_stride,  /* dst row step */
+                                   int32_t height,      /* 4 rows per pass */
+                                   int32_t weight0,     /* low 16 bits of pair */
+                                   int32_t weight1,     /* high 16 bits of pair */
+                                   int32_t offset0,
+                                   int32_t offset1,     /* added to offset0 */
+                                   int32_t rnd_val)     /* shift = rnd_val + 1 */
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 zero = { 0 };
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 offset_vec, weight_vec, rnd_vec;
+    /* 6-wide weighted bi-pred copy, processed in height >> 2 row groups. */
+    offset = (offset0 + offset1) * (1 << rnd_val);
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (int32_t) ((uint32_t) weight1 << 16);
+    /* multiply / unsigned shift: left-shifting a negative int is undefined */
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   dst0, dst1, dst2, dst3);
+        /* scale the widened src0 samples by 64 before weighting */
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr,   /* 8-bit input rows */
+                                   int32_t src_stride,  /* src0_ptr row step, in uint8_t elements */
+                                   int16_t *src1_ptr,   /* 16-bit input rows */
+                                   int32_t src2_stride, /* src1_ptr row step, in int16_t elements */
+                                   uint8_t *dst,        /* 8-bit output rows */
+                                   int32_t dst_stride,  /* dst row step, in uint8_t elements */
+                                   int32_t height,      /* rows; handled values: 2, 6, multiples of 4 */
+                                   int32_t weight0,     /* only the low 16 bits are kept */
+                                   int32_t weight1,     /* goes into the high 16 bits of the packed weight */
+                                   int32_t offset0,     /* added to offset1 */
+                                   int32_t offset1,     /* added to offset0 */
+                                   int32_t rnd_val)     /* offset pre-shift; rounding vector holds rnd_val + 1 */
+{ /* Bi-weighted blend for an 8-column block: 8-bit rows scaled by 64, mixed with 16-bit rows. */
+    int32_t offset, weight;
+    v16i8 zero = { 0 };
+    v4i32 offset_vec, weight_vec, rnd_vec;
+    /* Fold both offsets into one term and pack the two weights into a single 32-bit word. */
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    /* Replicate the scalars into every 32-bit lane. */
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    /* Dispatch on height: dedicated paths for 2 and 6 rows, a 4-row loop otherwise. */
+    if (2 == height) {
+        v16i8 src0, src1;
+        v8i16 in0, in1, dst0, dst1;
+        v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
+        /* Two rows from each input (LD_* helpers are defined elsewhere). */
+        LD_SB2(src0_ptr, src_stride, src0, src1);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        /* Interleave with zero bytes - presumably widens each pixel to 16 bits. */
+        ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1);
+        /* Scale the widened pixels by 64 before blending. */
+        dst0 <<= 6;
+        dst1 <<= 6;
+        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst0_l, dst1_l);
+        /* Narrow the 32-bit results and write two 8-byte rows (helpers defined elsewhere). */
+        HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+        ST8x2_UB(dst0_r, dst, dst_stride);
+    } else if (6 == height) {
+        v16i8 src0, src1, src2, src3, src4, src5;
+        v8i16 in0, in1, in2, in3, in4, in5;
+        v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+        v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+        v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+        /* All six rows are loaded up front, then handled as a group of 4 plus a group of 2. */
+        LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
+        LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   dst0, dst1, dst2, dst3);
+        ILVR_B2_SH(zero, src4, zero, src5, dst4, dst5);
+        /* Scale all six widened rows by 64. */
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        dst4 <<= 6;
+        dst5 <<= 6;
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+        HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst4_r, dst5_r, dst4_l, dst5_l);
+        /* Narrow to bytes; rows 0-3 are stored first, then rows 4-5 after advancing dst. */
+        HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                         dst2_l, dst2_r, dst3_l, dst3_r,
+                         dst4_l, dst4_r, dst5_l, dst5_r,
+                         dst0_r, dst1_r, dst2_r);
+        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+        ST8x2_UB(dst2_r, dst, dst_stride);
+    } else if (0 == height % 4) {
+        uint32_t loop_cnt;
+        v16i8 src0, src1, src2, src3;
+        v8i16 in0, in1, in2, in3;
+        v8i16 dst0, dst1, dst2, dst3;
+        v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+        /* Four rows per pass; all three pointers advance by four rows each time. */
+        for (loop_cnt = (height >> 2); loop_cnt--;) {
+            LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+            src0_ptr += (4 * src_stride);
+            LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+            src1_ptr += (4 * src2_stride);
+            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       dst0, dst1, dst2, dst3);
+            /* Scale by 64, then blend with the 16-bit rows. */
+            SLLI_4V(dst0, dst1, dst2, dst3, 6);
+            HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                               in0, in1, in2, in3,
+                               weight_vec, rnd_vec, offset_vec,
+                               dst0_r, dst1_r, dst2_r, dst3_r,
+                               dst0_l, dst1_l, dst2_l, dst3_l);
+            /* Narrow to bytes and store four 8-byte rows. */
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                            dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+            ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } /* any other height matches no branch, so nothing is written */
+}
+
+static void hevc_biwgt_copy_12w_msa(uint8_t *src0_ptr,   /* 8-bit input rows */
+                                    int32_t src_stride,  /* src0_ptr row step, in uint8_t elements */
+                                    int16_t *src1_ptr,   /* 16-bit input rows */
+                                    int32_t src2_stride, /* src1_ptr row step, in int16_t elements */
+                                    uint8_t *dst,        /* 8-bit output rows */
+                                    int32_t dst_stride,  /* dst row step, in uint8_t elements */
+                                    int32_t height,      /* rows; processed four at a time */
+                                    int32_t weight0,     /* only the low 16 bits are kept */
+                                    int32_t weight1,     /* goes into the high 16 bits of the packed weight */
+                                    int32_t offset0,     /* added to offset1 */
+                                    int32_t offset1,     /* added to offset0 */
+                                    int32_t rnd_val)     /* offset pre-shift; rounding vector holds rnd_val + 1 */
+{ /* Bi-weighted blend for a 12-column block: 8-bit rows scaled by 64, mixed with 16-bit rows. */
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 zero = { 0 };
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+    v4i32 offset_vec, weight_vec, rnd_vec;
+    /* Fold both offsets into one term and pack the two weights into a single 32-bit word. */
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    /* Replicate the scalars into every 32-bit lane. */
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    /* Four rows per pass, driven by the caller's height instead of a fixed 16 rows. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
+        src1_ptr += (4 * src2_stride);
+        /* in4..in7 start at column 8; presumably two rows' columns 8-11 are paired per vector. */
+        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   dst0, dst1, dst2, dst3);
+        /* Scale columns 0-7 by 64, then build the columns 8-11 part from the upper source bytes. */
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
+        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
+        /* Scale the columns 8-11 part by 64 as well. */
+        dst4 <<= 6;
+        dst5 <<= 6;
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+        HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst4_r, dst5_r, dst4_l, dst5_l);
+        /* Narrow to bytes and store four 12-byte rows (helpers defined elsewhere). */
+        HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                         dst2_l, dst2_r, dst3_l, dst3_r,
+                         dst4_l, dst4_r, dst5_l, dst5_r,
+                         dst0_r, dst1_r, dst2_r);
+        ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_biwgt_copy_16multx4mult_msa(uint8_t *src0_ptr,   /* 8-bit input rows */
+                                             int32_t src_stride,  /* src0_ptr row step, in uint8_t elements */
+                                             int16_t *src1_ptr,   /* 16-bit input rows */
+                                             int32_t src2_stride, /* src1_ptr row step, in int16_t elements */
+                                             uint8_t *dst,        /* 8-bit output rows */
+                                             int32_t dst_stride,  /* dst row step, in uint8_t elements */
+                                             int32_t height,      /* rows; processed four at a time */
+                                             int32_t weight0,     /* only the low 16 bits are kept */
+                                             int32_t weight1,     /* goes into the high 16 bits of the packed weight */
+                                             int32_t offset0,     /* added to offset1 */
+                                             int32_t offset1,     /* added to offset0 */
+                                             int32_t rnd_val,     /* offset pre-shift; rounding vector holds rnd_val + 1 */
+                                             int32_t width)       /* columns; processed in strips of 16 */
+{ /* Bi-weighted blend for blocks walked as 16-column strips, four rows per inner pass. */
+    uint32_t loop_cnt, cnt;
+    uint8_t *src0_ptr_tmp;
+    int16_t *src1_ptr_tmp;
+    uint8_t *dst_tmp;
+    int32_t offset, weight;
+    v16i8 zero = { 0 };
+    v4i32 offset_vec, weight_vec, rnd_vec;
+    /* Fold both offsets into one term and pack the two weights into a single 32-bit word. */
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    /* Replicate the scalars into every 32-bit lane. */
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    /* Outer loop: one 16-column strip per pass; working pointers restart at the strip top. */
+    for (cnt = (width >> 4); cnt--;) {
+        src0_ptr_tmp = src0_ptr;
+        src1_ptr_tmp = src1_ptr;
+        dst_tmp = dst;
+        /* Inner loop: four rows of the current strip per pass. */
+        for (loop_cnt = (height >> 2); loop_cnt--;) {
+            v16i8 src0, src1, src2, src3;
+            v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+            v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+            v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
+            v4i32 dst0_l, dst1_l, dst2_l, dst3_l;
+            /* in0..in3 start at column 0 and in4..in7 at column 8 of the 16-bit rows. */
+            LD_SB4(src0_ptr_tmp, src_stride, src0, src1, src2, src3);
+            src0_ptr_tmp += (4 * src_stride);
+            LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
+            LD_SH4(src1_ptr_tmp + 8, src2_stride, in4, in5, in6, in7);
+            src1_ptr_tmp += (4 * src2_stride);
+            /* tmp0..3 / tmp4..7: presumably the right / left byte halves of each row, zero-interleaved. */
+            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       tmp0, tmp1, tmp2, tmp3);
+            ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       tmp4, tmp5, tmp6, tmp7);
+            /* Scale everything by 64, then blend rows 0-1 (both halves). */
+            SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
+            SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
+            HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
+                               in0, in1, in4, in5,
+                               weight_vec, rnd_vec, offset_vec,
+                               dst0_r, dst1_r, dst2_r, dst3_r,
+                               dst0_l, dst1_l, dst2_l, dst3_l);
+            /* Narrow rows 0-1 (argument order pairs each row's two halves) and store them. */
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                            dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+            ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+            /* Same sequence for rows 2-3. */
+            HEVC_BIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7,
+                               in2, in3, in6, in7,
+                               weight_vec, rnd_vec, offset_vec,
+                               dst0_r, dst1_r, dst2_r, dst3_r,
+                               dst0_l, dst1_l, dst2_l, dst3_l);
+            /* Narrow rows 2-3 and store them. */
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                            dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+            ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+        }
+        /* Step all three base pointers 16 elements right to the next strip. */
+        src0_ptr += 16;
+        src1_ptr += 16;
+        dst += 16;
+    }
+}
+
+static void hevc_biwgt_copy_16w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{ /* 16-column bi-weighted blend: forwards every argument to the strip kernel. */
+    hevc_biwgt_copy_16multx4mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, height, weight0,
+                                     weight1, offset0, offset1, rnd_val, 16); /* width 16: one strip */
+}
+
+static void hevc_biwgt_copy_24w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{ /* 24-column bi-weighted blend, done as a 16-column strip plus an 8-column remainder. */
+    hevc_biwgt_copy_16multx4mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, height, weight0,
+                                     weight1, offset0, offset1, rnd_val, 16); /* columns 0-15 */
+    hevc_biwgt_copy_8w_msa(src0_ptr + 16, src_stride,
+                           src1_ptr + 16, src2_stride,
+                           dst + 16, dst_stride, height, weight0,
+                           weight1, offset0, offset1, rnd_val); /* columns 16-23: all pointers offset by 16 */
+}
+
+static void hevc_biwgt_copy_32w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{ /* 32-column bi-weighted blend: forwards every argument to the strip kernel. */
+    hevc_biwgt_copy_16multx4mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, height, weight0,
+                                     weight1, offset0, offset1, rnd_val, 32); /* width 32: two strips */
+}
+
+static void hevc_biwgt_copy_48w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{ /* 48-column bi-weighted blend: forwards every argument to the strip kernel. */
+    hevc_biwgt_copy_16multx4mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, height, weight0,
+                                     weight1, offset0, offset1, rnd_val, 48); /* width 48: three strips */
+}
+
+static void hevc_biwgt_copy_64w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{ /* 64-column bi-weighted blend: forwards every argument to the strip kernel. */
+    hevc_biwgt_copy_16multx4mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, height, weight0,
+                                     weight1, offset0, offset1, rnd_val, 64); /* width 64: four strips */
+}
+
+static void hevc_hz_biwgt_8t_4w_msa(uint8_t *src0_ptr,    /* 8-bit input rows (filtered horizontally) */
+                                    int32_t src_stride,   /* src0_ptr row step, in uint8_t elements */
+                                    int16_t *src1_ptr,    /* 16-bit input rows */
+                                    int32_t src2_stride,  /* src1_ptr row step, in int16_t elements */
+                                    uint8_t *dst,         /* 8-bit output rows */
+                                    int32_t dst_stride,   /* dst row step, in uint8_t elements */
+                                    const int8_t *filter, /* filter taps; halfword lanes 0-3 of the loaded vector are used */
+                                    int32_t height,       /* rows; processed four at a time */
+                                    int32_t weight0,      /* only the low 16 bits are kept */
+                                    int32_t weight1,      /* goes into the high 16 bits of the packed weight */
+                                    int32_t offset0,      /* added to offset1 */
+                                    int32_t offset1,      /* added to offset0 */
+                                    int32_t rnd_val)      /* offset pre-shift; rounding vector holds rnd_val + 1 */
+{ /* Horizontal 8-tap filter on the 8-bit rows, then bi-weighted blend with the 16-bit rows; 4 columns. */
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1;
+    v8i16 in0, in1, in2, in3;
+    v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+    /* Start three pixels left of the block; spread filter halfwords 0-3 into filt0..filt3. */
+    src0_ptr -= 3;
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* Each further mask selects the same byte pairs shifted right by two more bytes. */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    /* Fold both offsets into one term and pack the two weights into a single 32-bit word. */
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    /* const_vec = 128 << 6 per halfword; presumably undoes the XOR-128 bias (assumes taps sum to 64) - confirm. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    /* Four rows per pass; leftover rows (height % 4) are not processed. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        /* Rows are filtered in pairs: mask indices 0-4 and 16-20 presumably pick from the two operands. */
+        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        /* Blend the filtered rows with the 16-bit rows (macro defined elsewhere). */
+        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst0_l, dst1_l);
+        /* Narrow to bytes and store four 4-byte rows. */
+        HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+        ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_biwgt_8t_8w_msa(uint8_t *src0_ptr,    /* 8-bit input rows (filtered horizontally) */
+                                    int32_t src_stride,   /* src0_ptr row step, in uint8_t elements */
+                                    int16_t *src1_ptr,    /* 16-bit input rows */
+                                    int32_t src2_stride,  /* src1_ptr row step, in int16_t elements */
+                                    uint8_t *dst,         /* 8-bit output rows */
+                                    int32_t dst_stride,   /* dst row step, in uint8_t elements */
+                                    const int8_t *filter, /* filter taps; halfword lanes 0-3 of the loaded vector are used */
+                                    int32_t height,       /* rows; processed four at a time */
+                                    int32_t weight0,      /* only the low 16 bits are kept */
+                                    int32_t weight1,      /* goes into the high 16 bits of the packed weight */
+                                    int32_t offset0,      /* added to offset1 */
+                                    int32_t offset1,      /* added to offset0 */
+                                    int32_t rnd_val)      /* offset pre-shift; rounding vector holds rnd_val + 1 */
+{ /* Horizontal 8-tap filter on the 8-bit rows, then bi-weighted blend with the 16-bit rows; 8 columns. */
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 in0, in1, in2, in3;
+    v4i32 dst0_r, dst1_r, dst0_l, dst1_l, dst2_r, dst3_r, dst2_l, dst3_l;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    /* Start three pixels left of the block; fold the offsets and pack the two weights. */
+    src0_ptr -= 3;
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    /* const_vec = 128 << 6 per halfword; presumably undoes the XOR-128 bias (assumes taps sum to 64) - confirm. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    /* Spread filter halfwords 0-3 into filt0..filt3. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* Each further mask selects the same adjacent byte pairs shifted right by two more bytes. */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    /* Four rows per pass; leftover rows (height % 4) are not processed. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        /* One row per shuffle + accumulate group; each accumulator starts from const_vec. */
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+        /* Blend the four filtered rows with the 16-bit rows (macro defined elsewhere). */
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+        /* Narrow to bytes and store four 8-byte rows. */
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_biwgt_8t_12w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{ /* 12-column horizontal 8-tap bi-weighted blend, done as an 8-column call plus a 4-column call. */
+    hevc_hz_biwgt_8t_8w_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                            dst, dst_stride, filter, height,
+                            weight0, weight1, offset0, offset1, rnd_val); /* columns 0-7 */
+    hevc_hz_biwgt_8t_4w_msa(src0_ptr + 8, src_stride, src1_ptr + 8, src2_stride,
+                            dst + 8, dst_stride, filter, height,
+                            weight0, weight1, offset0, offset1, rnd_val); /* columns 8-11: all pointers offset by 8 */
+}
+
+static void hevc_hz_biwgt_8t_16w_msa(uint8_t *src0_ptr,    /* 8-bit input rows (filtered horizontally) */
+                                     int32_t src_stride,   /* src0_ptr row step, in uint8_t elements */
+                                     int16_t *src1_ptr,    /* 16-bit input rows */
+                                     int32_t src2_stride,  /* src1_ptr row step, in int16_t elements */
+                                     uint8_t *dst,         /* 8-bit output rows */
+                                     int32_t dst_stride,   /* dst row step, in uint8_t elements */
+                                     const int8_t *filter, /* filter taps; halfword lanes 0-3 of the loaded vector are used */
+                                     int32_t height,       /* rows; processed two at a time */
+                                     int32_t weight0,      /* only the low 16 bits are kept */
+                                     int32_t weight1,      /* goes into the high 16 bits of the packed weight */
+                                     int32_t offset0,      /* added to offset1 */
+                                     int32_t offset1,      /* added to offset0 */
+                                     int32_t rnd_val)      /* offset pre-shift; rounding vector holds rnd_val + 1 */
+{ /* Horizontal 8-tap filter on the 8-bit rows, then bi-weighted blend with the 16-bit rows; 16 columns. */
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    /* Start three pixels left of the block; fold the offsets and pack the two weights. */
+    src0_ptr -= 3;
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    /* const_vec = 128 << 6 per halfword; presumably undoes the XOR-128 bias (assumes taps sum to 64) - confirm. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    /* Spread filter halfwords 0-3 into filt0..filt3. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* Each further mask selects the same adjacent byte pairs shifted right by two more bytes. */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    /* Two rows per pass; each row is read as two vectors spaced 8 elements apart. */
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src0_ptr, 8, src0, src1);
+        src0_ptr += src_stride;
+        LD_SB2(src0_ptr, 8, src2, src3);
+        src0_ptr += src_stride;
+        LD_SH2(src1_ptr, 8, in0, in1);
+        src1_ptr += src2_stride;
+        LD_SH2(src1_ptr, 8, in2, in3);
+        src1_ptr += src2_stride;
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        /* dst0/dst1 come from the first row's two vectors, dst2/dst3 from the second row's. */
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+        /* Blend the filtered data with the 16-bit rows (macro defined elsewhere). */
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+        /* Narrow to bytes and store one full vector per row for the two rows. */
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_hz_biwgt_8t_24w_msa(uint8_t *src0_ptr,    /* 8-bit input rows (filtered horizontally) */
+                                     int32_t src_stride,   /* src0_ptr row step, in uint8_t elements */
+                                     int16_t *src1_ptr,    /* 16-bit input rows */
+                                     int32_t src2_stride,  /* src1_ptr row step, in int16_t elements */
+                                     uint8_t *dst,         /* 8-bit output rows */
+                                     int32_t dst_stride,   /* dst row step, in uint8_t elements */
+                                     const int8_t *filter, /* filter taps; halfword lanes 0-3 of the loaded vector are used */
+                                     int32_t height,       /* rows; one per loop pass */
+                                     int32_t weight0,      /* only the low 16 bits are kept */
+                                     int32_t weight1,      /* goes into the high 16 bits of the packed weight */
+                                     int32_t offset0,      /* added to offset1 */
+                                     int32_t offset1,      /* added to offset0 */
+                                     int32_t rnd_val)      /* offset pre-shift; rounding vector holds rnd_val + 1 */
+{ /* Horizontal 8-tap filter on the 8-bit rows, then bi-weighted blend with the 16-bit rows; 24 columns. */
+    uint32_t loop_cnt;
+    uint64_t dst_val0;
+    int32_t offset, weight;
+    v16i8 src0, src1;
+    v8i16 in0, in1, in2;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2;
+    v4i32 dst0_r, dst1_r, dst2_r, dst0_l, dst1_l, dst2_l;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    /* Start three pixels left of the block; fold the offsets and pack the two weights. */
+    src0_ptr = src0_ptr - 3;
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    /* const_vec = 128 << 6 per halfword; presumably undoes the XOR-128 bias (assumes taps sum to 64) - confirm. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    /* Spread filter halfwords 0-3 into filt0..filt3. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* mask1..3 shift the byte pairs by 2/4/6; mask4..7 (+8..+14) reach indices >= 16, i.e. past the first operand. */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+    /* One row per pass: two source vectors 16 bytes apart, three 8-element vectors of 16-bit input. */
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB2(src0_ptr, 16, src0, src1);
+        src0_ptr += src_stride;
+        LD_SH2(src1_ptr, 8, in0, in1);
+        in2 = LD_SH(src1_ptr + 16);
+        src1_ptr += src2_stride;
+        XORI_B2_128_SB(src0, src1);
+        /* dst0 from src0 alone, dst1 from the src0/src1 boundary (mask4..7), dst2 from src1 alone. */
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        /* Blend the first two result vectors with in0/in1 via the shared macro. */
+        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst0_l, dst1_l);
+        /* Third vector, spelled out: interleave with in2, dot-product-accumulate onto the offset, round, clip. */
+        ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
+        dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
+                                 (v8i16) weight_vec);
+        dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
+                                 (v8i16) weight_vec);
+        SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
+        dst2_r = CLIP_SW_0_255(dst2_r);
+        dst2_l = CLIP_SW_0_255(dst2_l);
+        /* Narrow to bytes; a full vector is stored at dst and one 64-bit value at dst + 16. */
+        HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+        HEVC_PCK_SW_SB2(dst2_l, dst2_r, dst2_r);
+        dst_val0 = __msa_copy_u_d((v2i64) dst2_r, 0);
+        ST_SW(dst0_r, dst);
+        SD(dst_val0, dst + 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_biwgt_8t_32w_msa(uint8_t *src0_ptr,    /* 8-bit input rows (filtered horizontally) */
+                                     int32_t src_stride,   /* src0_ptr row step, in uint8_t elements */
+                                     int16_t *src1_ptr,    /* 16-bit input rows */
+                                     int32_t src2_stride,  /* src1_ptr row step, in int16_t elements */
+                                     uint8_t *dst,         /* 8-bit output rows */
+                                     int32_t dst_stride,   /* dst row step, in uint8_t elements */
+                                     const int8_t *filter, /* filter taps; halfword lanes 0-3 of the loaded vector are used */
+                                     int32_t height,       /* rows; one per loop pass */
+                                     int32_t weight0,      /* only the low 16 bits are kept */
+                                     int32_t weight1,      /* goes into the high 16 bits of the packed weight */
+                                     int32_t offset0,      /* added to offset1 */
+                                     int32_t offset1,      /* added to offset0 */
+                                     int32_t rnd_val)      /* offset pre-shift; rounding vector holds rnd_val + 1 */
+{ /* Horizontal 8-tap filter on the 8-bit rows, then bi-weighted blend with the 16-bit rows; 32 columns. */
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    /* Start three pixels left of the block; fold the offsets and pack the two weights. */
+    src0_ptr -= 3;
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    /* const_vec = 128 << 6 per halfword; presumably undoes the XOR-128 bias (assumes taps sum to 64) - confirm. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    /* Spread filter halfwords 0-3 into filt0..filt3. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* mask1..3 shift the byte pairs by 2/4/6; mask4..7 (+8..+14) reach indices >= 16, i.e. past the first operand. */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+    /* One row per pass: source vectors at byte offsets 0, 16 and 24; four 8-element vectors of 16-bit input. */
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB2(src0_ptr, 16, src0, src1);
+        src2 = LD_SB(src0_ptr + 24);
+        src0_ptr += src_stride;
+        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
+        src1_ptr += src2_stride;
+        /* Flip the top bit of every source byte (presumably unsigned -> signed range). */
+        XORI_B3_128_SB(src0, src1, src2);
+        /* dst0: src0; dst1: src0/src1 boundary (mask4..7); dst2: src1; dst3: src2. */
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+        /* Blend the four filtered vectors with the 16-bit row (macro defined elsewhere). */
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+        /* Narrow to bytes and store two vectors 16 bytes apart within the same output row. */
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_biwgt_8t_48w_msa(uint8_t *src0_ptr,  /* 8-bit samples to filter */
+                                     int32_t src_stride,  /* src0_ptr step per row */
+                                     int16_t *src1_ptr,  /* 16-bit samples to blend in */
+                                     int32_t src2_stride,  /* src1_ptr step per row */
+                                     uint8_t *dst,  /* 8-bit output */
+                                     int32_t dst_stride,  /* dst step per row */
+                                     const int8_t *filter,  /* 8 int8 taps */
+                                     int32_t height,  /* rows to produce */
+                                     int32_t weight0,  /* low 16 bits of packed weight */
+                                     int32_t weight1,  /* high 16 bits of packed weight */
+                                     int32_t offset0,  /* summed with offset1 */
+                                     int32_t offset1,  /* summed with offset0 */
+                                     int32_t rnd_val)  /* rounding shift is rnd_val + 1 */
+{   /* 8-tap horizontal filter plus bi-weighted blend; 48 bytes per row. */
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    uint64_t dst_val0;
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    /* Output x reads source bytes x-3 .. x+4, hence the 3-byte rewind. */
+    src0_ptr -= 3;
+    offset = (offset0 + offset1) << rnd_val;  /* pre-shifted sum of offsets */
+    weight0 = weight0 & 0x0000FFFF;  /* keep the low 16 bits only */
+    weight = weight0 | (weight1 << 16);  /* weight1 in the high half */
+    /* Accumulators start at 128 << 6; presumably offsets the XOR-128 bias. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);  /* final right-shift amount */
+    /* The first four halfwords of filter hold the eight int8 taps in pairs. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* mask1..3 step by one tap pair; mask4..7 index 8 bytes further on. */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+    /* One iteration produces one 48-byte output row. */
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB3(src0_ptr, 16, src0, src1, src2);
+        src3 = LD_SB(src0_ptr + 40);  /* overlaps src2 by 8 bytes */
+        src0_ptr += src_stride;
+        LD_SH2(src1_ptr, 8, in0, in1);
+        in2 = LD_SH(src1_ptr + 16);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        /* dst0..dst5: filtered columns 0-7, 8-15, 16-23, 24-31, 32-39, 40-47. */
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst4 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst4, dst4, dst4, dst4);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst5 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst5, dst5, dst5, dst5);
+        /* First half: columns 0-23 combined with in0..in2. */
+        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst0_l, dst1_l);
+        /* dst2/in2: interleave, weighted dot product + offset, round, clip. */
+        ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
+        dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
+                                 (v8i16) weight_vec);
+        dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
+                                 (v8i16) weight_vec);
+        SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
+        dst2_r = CLIP_SW_0_255(dst2_r);
+        dst2_l = CLIP_SW_0_255(dst2_l);
+        /* Pack to bytes: 16 from dst0/dst1 at +0, 8 from dst2 at +16. */
+        HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+        HEVC_PCK_SW_SB2(dst2_l, dst2_r, dst2_r);
+        dst_val0 = __msa_copy_u_d((v2i64) dst2_r, 0);
+        ST_SW(dst0_r, dst);
+        SD(dst_val0, dst + 16);
+        /* Second half: columns 24-47 combined with in3..in5. */
+        LD_SH2(src1_ptr + 24, 8, in3, in4);
+        in5 = LD_SH(src1_ptr + 40);
+        src1_ptr += src2_stride;
+        /* dst3 pairs with in3 (columns 24-31), dst4 with in4 (32-39). */
+        HEVC_BIW_RND_CLIP2(dst3, dst4, in3, in4,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst3_r, dst4_r, dst3_l, dst4_l);
+        /* dst5/in5 (columns 40-47): same open-coded steps as for dst2/in2. */
+        ILVRL_H2_SW(dst5, in5, dst5_r, dst5_l);
+        dst5_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst5_r,
+                                 (v8i16) weight_vec);
+        dst5_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst5_l,
+                                 (v8i16) weight_vec);
+        SRAR_W2_SW(dst5_r, dst5_l, rnd_vec);
+        dst5_r = CLIP_SW_0_255(dst5_r);
+        dst5_l = CLIP_SW_0_255(dst5_l);
+        /* Store order: 8 bytes from dst3 at +24, 16 from dst4/dst5 at +32. */
+        HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r);
+        HEVC_PCK_SW_SB2(dst3_l, dst3_r, dst3_r);
+        dst_val0 = __msa_copy_u_d((v2i64) dst3_r, 0);
+        SD(dst_val0, dst + 24);
+        ST_SW(dst4_r, dst + 32);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_biwgt_8t_64w_msa(uint8_t *src0_ptr,  /* 8-bit samples to filter */
+                                     int32_t src_stride,  /* src0_ptr step per row */
+                                     int16_t *src1_ptr,  /* 16-bit samples to blend in */
+                                     int32_t src2_stride,  /* src1_ptr step per row */
+                                     uint8_t *dst,  /* 8-bit output */
+                                     int32_t dst_stride,  /* dst step per row */
+                                     const int8_t *filter,  /* 8 int8 taps */
+                                     int32_t height,  /* rows to produce */
+                                     int32_t weight0,  /* low 16 bits of packed weight */
+                                     int32_t weight1,  /* high 16 bits of packed weight */
+                                     int32_t offset0,  /* summed with offset1 */
+                                     int32_t offset1,  /* summed with offset0 */
+                                     int32_t rnd_val)  /* rounding shift is rnd_val + 1 */
+{   /* 8-tap horizontal filter plus bi-weighted blend; 64 bytes per row. */
+    uint8_t *src0_ptr_tmp;
+    uint8_t *dst_tmp;
+    int16_t *src1_ptr_tmp;
+    uint32_t loop_cnt, cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    /* Output x reads source bytes x-3 .. x+4, hence the 3-byte rewind. */
+    src0_ptr -= 3;
+    offset = (offset0 + offset1) << rnd_val;  /* pre-shifted sum of offsets */
+    weight0 = weight0 & 0x0000FFFF;  /* keep the low 16 bits only */
+    weight = weight0 | (weight1 << 16);  /* weight1 in the high half */
+    /* Accumulators start at 128 << 6; presumably offsets the XOR-128 bias. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);  /* passed to HEVC_BIW_RND_CLIP4 */
+    /* The first four halfwords of filter hold the eight int8 taps in pairs. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* mask1..3 step by one tap pair; mask4..7 index 8 bytes further on. */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+    /* Each row is handled as two independent 32-column halves. */
+    for (loop_cnt = height; loop_cnt--;) {
+        src0_ptr_tmp = src0_ptr;
+        dst_tmp = dst;
+        src1_ptr_tmp = src1_ptr;
+        /* Two passes; each advances all three cursors by 32 elements. */
+        for (cnt = 2; cnt--;) {
+            LD_SB2(src0_ptr_tmp, 16, src0, src1);
+            src2 = LD_SB(src0_ptr_tmp + 24);  /* overlaps src1 by 8 bytes */
+            src0_ptr_tmp += 32;
+            LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3);
+            src1_ptr_tmp += 32;
+            XORI_B3_128_SB(src0, src1, src2);
+            /* dst0..dst3: filtered columns 0-7, 8-15, 16-23, 24-31 of the half. */
+            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst0 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst0, dst0, dst0, dst0);
+            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                       vec0, vec1, vec2, vec3);
+            dst1 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst1, dst1, dst1, dst1);
+            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst2 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst2, dst2, dst2, dst2);
+            VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst3 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst3, dst3, dst3, dst3);
+            /* Combine with in0..in3 via HEVC_BIW_RND_CLIP4 (defined elsewhere). */
+            HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                               in0, in1, in2, in3,
+                               weight_vec, rnd_vec, offset_vec,
+                               dst0_r, dst1_r, dst2_r, dst3_r,
+                               dst0_l, dst1_l, dst2_l, dst3_l);
+            /* Pack the eight word vectors into two byte vectors and store. */
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                            dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+            ST_SW2(dst0_r, dst1_r, dst_tmp, 16);
+            dst_tmp += 32;
+        }
+        /* Step every base pointer to the next row. */
+        src0_ptr += src_stride;
+        src1_ptr += src2_stride;
+        dst += dst_stride;
+
+    }
+}
+
+static void hevc_vt_biwgt_8t_4w_msa(uint8_t *src0_ptr,  /* 8-bit samples to filter */
+                                    int32_t src_stride,  /* src0_ptr step per row */
+                                    int16_t *src1_ptr,  /* 16-bit samples to blend in */
+                                    int32_t src2_stride,  /* src1_ptr step per row */
+                                    uint8_t *dst,  /* 8-bit output */
+                                    int32_t dst_stride,  /* dst step per row */
+                                    const int8_t *filter,  /* 8 int8 taps */
+                                    int32_t height,  /* rows; used as height >> 3 */
+                                    int32_t weight0,  /* low 16 bits of packed weight */
+                                    int32_t weight1,  /* high 16 bits of packed weight */
+                                    int32_t offset0,  /* summed with offset1 */
+                                    int32_t offset1,  /* summed with offset0 */
+                                    int32_t rnd_val)  /* rnd_val + 1 goes into rnd_vec */
+{   /* 8-tap vertical filter plus bi-weighted blend; 4 columns, 8 rows/pass. */
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src11, src12, src13, src14;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
+    v16i8 src2110, src4332, src6554, src8776, src10998;
+    v16i8 src12111110, src14131312;
+    v8i16 dst10, dst32, dst54, dst76;
+    v4i32 dst10_r, dst32_r, dst54_r, dst76_r;
+    v4i32 dst10_l, dst32_l, dst54_l, dst76_l;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    /* Output row y reads source rows y-3 .. y+4, hence the 3-row rewind. */
+    src0_ptr -= (3 * src_stride);
+    offset = (offset0 + offset1) << rnd_val;  /* pre-shifted sum of offsets */
+    weight0 = weight0 & 0x0000FFFF;  /* keep the low 16 bits only */
+    weight = weight0 | (weight1 << 16);  /* weight1 in the high half */
+    /* Accumulators start at 128 << 6; presumably offsets the XOR-128 bias. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    /* The first four halfwords of filter hold the eight int8 taps in pairs. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* Preload the first 7 source rows of the filter window. */
+    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src0_ptr += (7 * src_stride);
+    /* srcBA_r interleaves rows A and B; srcDCBA joins two such row pairs. */
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+               src2110, src4332, src6554);
+    XORI_B3_128_SB(src2110, src4332, src6554);
+    /* Only whole groups of 8 rows are processed (height >> 3). */
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src0_ptr, src_stride,
+               src7, src8, src9, src10, src11, src12, src13, src14);
+        src0_ptr += (8 * src_stride);
+        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
+        src1_ptr += (8 * src2_stride);
+        /* Join pairs of src1 rows so in0..in3 each carry two 4-sample rows. */
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
+                   src1110_r, src1211_r, src1312_r, src1413_r);
+        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
+                   src1413_r, src1312_r,
+                   src8776, src10998, src12111110, src14131312);
+        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
+        /* dst10/32/54/76: filtered output row pairs (0,1) (2,3) (4,5) (6,7). */
+        dst10 = const_vec;
+        DPADD_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt1,
+                     filt2, filt3, dst10, dst10, dst10, dst10);
+        dst32 = const_vec;
+        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
+                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
+        dst54 = const_vec;
+        DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
+                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
+        dst76 = const_vec;
+        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
+                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
+        /* Combine with in0..in3 via HEVC_BIW_RND_CLIP4 (defined elsewhere). */
+        HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst10_r, dst32_r, dst54_r, dst76_r,
+                           dst10_l, dst32_l, dst54_l, dst76_l);
+        /* Pack to bytes and write the 8 output rows of this group. */
+        HEVC_PCK_SW_SB8(dst10_l, dst10_r, dst32_l, dst32_r,
+                        dst54_l, dst54_r, dst76_l, dst76_r, dst10_r, dst54_r);
+        ST4x8_UB(dst10_r, dst54_r, dst, dst_stride);
+        dst += (8 * dst_stride);
+        /* Carry rows 8..14 forward as rows 0..6 of the next group. */
+        src2110 = src10998;
+        src4332 = src12111110;
+        src6554 = src14131312;
+        src6 = src14;
+    }
+}
+
+static void hevc_vt_biwgt_8t_8w_msa(uint8_t *src0_ptr,  /* 8-bit samples to filter */
+                                    int32_t src_stride,  /* src0_ptr step per row */
+                                    int16_t *src1_ptr,  /* 16-bit samples to blend in */
+                                    int32_t src2_stride,  /* src1_ptr step per row */
+                                    uint8_t *dst,  /* 8-bit output */
+                                    int32_t dst_stride,  /* dst step per row */
+                                    const int8_t *filter,  /* 8 int8 taps */
+                                    int32_t height,  /* rows; used as height >> 2 */
+                                    int32_t weight0,  /* low 16 bits of packed weight */
+                                    int32_t weight1,  /* high 16 bits of packed weight */
+                                    int32_t offset0,  /* summed with offset1 */
+                                    int32_t offset1,  /* summed with offset0 */
+                                    int32_t rnd_val)  /* rnd_val + 1 goes into rnd_vec */
+{   /* 8-tap vertical filter plus bi-weighted blend; 8 columns, 4 rows/pass. */
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src6, src7, src8, src9, src10;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    /* Output row y reads source rows y-3 .. y+4, hence the 3-row rewind. */
+    src0_ptr -= (3 * src_stride);
+    offset = (offset0 + offset1) << rnd_val;  /* pre-shifted sum of offsets */
+    weight0 = weight0 & 0x0000FFFF;  /* keep the low 16 bits only */
+    weight = weight0 | (weight1 << 16);  /* weight1 in the high half */
+    /* Accumulators start at 128 << 6; presumably offsets the XOR-128 bias. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    /* The first four halfwords of filter hold the eight int8 taps in pairs. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* Preload the first 7 source rows of the filter window. */
+    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src0_ptr += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    /* srcBA_r interleaves the bytes of rows A and B. */
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    /* Only whole groups of 4 rows are processed (height >> 2). */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        /* New rows get the same XOR-128 treatment as the preloaded ones. */
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+        /* tmp0..tmp3: filtered output rows 0..3 of this group. */
+        tmp0 = const_vec;
+        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
+                     filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
+        tmp1 = const_vec;
+        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
+                     filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
+        tmp2 = const_vec;
+        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
+                     filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
+                     filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3);
+        /* Combine with in0..in3 via HEVC_BIW_RND_CLIP4 (defined elsewhere). */
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+        /* Pack to bytes and write the 4 output rows of this group. */
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* Slide the window down 4 rows: rows 4..10 become rows 0..6. */
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src6 = src10;
+    }
+}
+
+static void hevc_vt_biwgt_8t_12w_msa(uint8_t *src0_ptr,  /* 8-bit samples to filter */
+                                     int32_t src_stride,  /* src0_ptr step per row */
+                                     int16_t *src1_ptr,  /* 16-bit samples to blend in */
+                                     int32_t src2_stride,  /* src1_ptr step per row */
+                                     uint8_t *dst,  /* 8-bit output */
+                                     int32_t dst_stride,  /* dst step per row */
+                                     const int8_t *filter,  /* 8 int8 taps */
+                                     int32_t height,  /* rows; used as height >> 1 */
+                                     int32_t weight0,  /* low 16 bits of packed weight */
+                                     int32_t weight1,  /* high 16 bits of packed weight */
+                                     int32_t offset0,  /* summed with offset1 */
+                                     int32_t offset1,  /* summed with offset0 */
+                                     int32_t rnd_val)  /* rounding shift is rnd_val + 1 */
+{   /* 8-tap vertical filter plus bi-weighted blend; 12 columns, 2 rows/pass. */
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src54_r, src76_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r;
+    v8i16 tmp0, tmp1, tmp2;
+    v16i8 src10_l, src32_l, src54_l, src76_l;
+    v16i8 src21_l, src43_l, src65_l, src87_l;
+    v16i8 src2110, src4332, src6554, src8776;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst0_l, dst1_l, dst2_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    /* Output row y reads source rows y-3 .. y+4, hence the 3-row rewind. */
+    src0_ptr -= (3 * src_stride);
+    offset = (offset0 + offset1) << rnd_val;  /* pre-shifted sum of offsets */
+    weight0 = weight0 & 0x0000FFFF;  /* keep the low 16 bits only */
+    weight = weight0 | (weight1 << 16);  /* weight1 in the high half */
+    /* Accumulators start at 128 << 6; presumably offsets the XOR-128 bias. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);  /* final right-shift amount */
+    /* The first four halfwords of filter hold the eight int8 taps in pairs. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* Preload the first 7 source rows of the filter window. */
+    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src0_ptr += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    /* _r: columns 0-7; the _l halves of two row pairs merge into srcDCBA. */
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_l, src32_l, src54_l, src21_l);
+    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
+               src2110, src4332, src6554);
+    /* Only whole pairs of rows are processed (height >> 1). */
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src0_ptr, src_stride, src7, src8);
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
+        src1_ptr += (2 * src2_stride);
+        in2 = (v8i16) __msa_ilvr_d((v2i64) in3, (v2i64) in2);
+        XORI_B2_128_SB(src7, src8);
+        /* Interleave the two new rows with their predecessors. */
+        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+        ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
+        src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);
+        /* tmp0/tmp1: columns 0-7 of rows 0/1; tmp2: columns 8-11 of both. */
+        tmp0 = const_vec;
+        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
+                     filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
+        tmp1 = const_vec;
+        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
+                     filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
+        tmp2 = const_vec;
+        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
+                     filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
+        /* Combine tmp0/tmp1 with in0/in1 via HEVC_BIW_RND_CLIP2. */
+        HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst0_l, dst1_l);
+        /* tmp2/in2: interleave, weighted dot product + offset, round, clip. */
+        ILVRL_H2_SW(tmp2, in2, dst2_r, dst2_l);
+        dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
+                                 (v8i16) weight_vec);
+        dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
+                                 (v8i16) weight_vec);
+        SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
+        dst2_r = CLIP_SW_0_255(dst2_r);
+        dst2_l = CLIP_SW_0_255(dst2_l);
+        /* Pack to bytes; columns 0-7 go to dst, the remainder to dst + 8. */
+        HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+        HEVC_PCK_SW_SB2(dst2_l, dst2_r, dst2_r);
+        ST8x2_UB(dst0_r, dst, dst_stride);
+        ST4x2_UB(dst2_r, dst + 8, dst_stride);
+        dst += (2 * dst_stride);
+        /* Slide the window down 2 rows: rows 2..8 become rows 0..6. */
+        src10_r = src32_r;
+        src32_r = src54_r;
+        src54_r = src76_r;
+        src21_r = src43_r;
+        src43_r = src65_r;
+        src65_r = src87_r;
+        src2110 = src4332;
+        src4332 = src6554;
+        src6554 = src8776;
+        src6 = src8;
+    }
+}
+
+static void hevc_vt_biwgt_8t_16multx2mult_msa(uint8_t *src0_ptr,  /* 8-bit samples to filter */
+                                              int32_t src_stride,  /* src0_ptr step per row */
+                                              int16_t *src1_ptr,  /* 16-bit samples to blend in */
+                                              int32_t src2_stride,  /* src1_ptr step per row */
+                                              uint8_t *dst,  /* 8-bit output */
+                                              int32_t dst_stride,  /* dst step per row */
+                                              const int8_t *filter,  /* 8 int8 taps */
+                                              int32_t height,  /* rows; used as height >> 1 */
+                                              int32_t weight0,  /* low 16 bits of packed weight */
+                                              int32_t weight1,  /* high 16 bits of packed weight */
+                                              int32_t offset0,  /* summed with offset1 */
+                                              int32_t offset1,  /* summed with offset0 */
+                                              int32_t rnd_val,  /* rnd_val + 1 goes into rnd_vec */
+                                              int32_t width)  /* columns; used as width >> 4 */
+{   /* 8-tap vertical filter plus bi-weighted blend over 16-column strips. */
+    uint8_t *src0_ptr_tmp;
+    int16_t *src1_ptr_tmp;
+    uint8_t *dst_tmp;
+    uint32_t loop_cnt, cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src54_r, src76_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r;
+    v16i8 src10_l, src32_l, src54_l, src76_l;
+    v16i8 src21_l, src43_l, src65_l, src87_l;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    /* Output row y reads source rows y-3 .. y+4, hence the 3-row rewind. */
+    src0_ptr -= (3 * src_stride);
+
+    offset = (offset0 + offset1) << rnd_val;  /* pre-shifted sum of offsets */
+    weight0 = weight0 & 0x0000FFFF;  /* keep the low 16 bits only */
+    weight = weight0 | (weight1 << 16);  /* weight1 in the high half */
+    /* Accumulators start at 128 << 6; presumably offsets the XOR-128 bias. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    /* The first four halfwords of filter hold the eight int8 taps in pairs. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* Walk 16-column strips; only whole strips are processed (width >> 4). */
+    for (cnt = (width >> 4); cnt--;) {
+        src0_ptr_tmp = src0_ptr;
+        src1_ptr_tmp = src1_ptr;
+        dst_tmp = dst;
+        /* Preload the first 7 source rows of this strip. */
+        LD_SB7(src0_ptr_tmp, src_stride,
+               src0, src1, src2, src3, src4, src5, src6);
+        src0_ptr_tmp += (7 * src_stride);
+        /* _r interleaves cover columns 0-7 of the strip, _l columns 8-15. */
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+                   src10_r, src32_r, src54_r, src21_r);
+        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+                   src10_l, src32_l, src54_l, src21_l);
+        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+        /* Only whole pairs of rows are processed (height >> 1). */
+        for (loop_cnt = (height >> 1); loop_cnt--;) {
+            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
+            src0_ptr_tmp += (2 * src_stride);
+            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
+            LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
+            src1_ptr_tmp += (2 * src2_stride);
+            /* Interleave the two new rows with their predecessors. */
+            XORI_B2_128_SB(src7, src8);
+            ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+            ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
+            /* tmp0/tmp1: cols 0-7 of rows 0/1; tmp2/tmp3: cols 8-15 of rows 0/1. */
+            tmp0 = const_vec;
+            DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
+                         filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
+            tmp1 = const_vec;
+            DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
+                         filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
+            tmp2 = const_vec;
+            DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
+                         filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
+            tmp3 = const_vec;
+            DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
+                         filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3);
+            /* Combine with in0..in3 via HEVC_BIW_RND_CLIP4 (defined elsewhere). */
+            HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                               in0, in1, in2, in3,
+                               weight_vec, rnd_vec, offset_vec,
+                               dst0_r, dst1_r, dst2_r, dst3_r,
+                               dst0_l, dst1_l, dst2_l, dst3_l);
+            /* Pack order regroups per row: (0, 2) -> row 0, (1, 3) -> row 1. */
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                            dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+            ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+            /* Slide the window down 2 rows: rows 2..8 become rows 0..6. */
+            src10_r = src32_r;
+            src32_r = src54_r;
+            src54_r = src76_r;
+            src21_r = src43_r;
+            src43_r = src65_r;
+            src65_r = src87_r;
+            src10_l = src32_l;
+            src32_l = src54_l;
+            src54_l = src76_l;
+            src21_l = src43_l;
+            src43_l = src65_l;
+            src65_l = src87_l;
+            src6 = src8;
+        }
+        /* Move all three base pointers to the next 16-column strip. */
+        src0_ptr += 16;
+        src1_ptr += 16;
+        dst += 16;
+    }
+}
+
+static void hevc_vt_biwgt_8t_16w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{   /* Width 16: forwards every argument to the 16-column strip kernel. */
+    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride, filter, height,
+                                      weight0, weight1, offset0, offset1,
+                                      rnd_val, 16);  /* width = 16: one strip */
+}
+
+static void hevc_vt_biwgt_8t_24w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{   /* Width 24: a 16-column strip, then the 8-column kernel at offset 16. */
+    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride, filter, height,
+                                      weight0, weight1, offset0, offset1,
+                                      rnd_val, 16);  /* columns 0-15 */
+    hevc_vt_biwgt_8t_8w_msa(src0_ptr + 16, src_stride,
+                            src1_ptr + 16, src2_stride,
+                            dst + 16, dst_stride, filter, height,
+                            weight0, weight1, offset0, offset1, rnd_val);  /* columns 16-23 */
+}
+
+static void hevc_vt_biwgt_8t_32w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{   /* Width 32: forwards every argument to the 16-column strip kernel. */
+    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride, filter, height,
+                                      weight0, weight1, offset0, offset1,
+                                      rnd_val, 32);  /* width = 32: two strips */
+}
+
+static void hevc_vt_biwgt_8t_48w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{   /* Width 48: forwards every argument to the 16-column strip kernel. */
+    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride, filter, height,
+                                      weight0, weight1, offset0, offset1,
+                                      rnd_val, 48);  /* width = 48: three strips */
+}
+
+static void hevc_vt_biwgt_8t_64w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{   /* 64-wide entry point: hands every argument, unmodified, to the 16multx2mult helper. */
+    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
+                                      src1_ptr, src2_stride,
+                                      dst, dst_stride, filter, height,
+                                      weight0, weight1, offset0, offset1,
+                                      rnd_val, 64); /* last argument: presumably the width in columns (helper defined elsewhere) */
+}
+
+static void hevc_hv_biwgt_8t_4w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter_x,
+                                    const int8_t *filter_y,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{   /* 4-wide case: filter src0_ptr with filter_x then filter_y, blend with src1_ptr, store 8-bit rows to dst. */
+    uint32_t loop_cnt;
+    int32_t offset;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 in0, in1;
+    v8i16 filt0, filt1, filt2, filt3;
+    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
+    v4i32 dst0_r, dst1_r;
+    v4i32 tmp1, tmp2;
+    v4i32 weight_vec0, weight_vec1, offset_vec, rnd_vec;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; /* lanes 8-15 (16..20) presumably address the second shuffle source */
+    v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 }; /* alternates halfwords 0-3 with halfwords 4-7 */
+    /* Step back 3 rows and 3 columns from the position passed in. */
+    src0_ptr -= ((3 * src_stride) + 3);
+    /* Horizontal taps: presumably halfwords 0..3 of the filter_x vector, each splatted. */
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* Vertical taps: filter_y bytes widened to halfwords by interleaving with their sign mask. */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    /* Summed offset is pre-shifted by rnd_val; weight0 is cut to 16 bits so each weight_vec0 lane is {weight0, 0}. */
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    /* const_vec holds 128 << 6 in every halfword; it seeds each horizontal accumulator below. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec0 = __msa_fill_w(weight0);
+    weight_vec1 = __msa_fill_w(weight1);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    /* Prime the vertical window with the first 7 source rows. */
+    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src0_ptr += (7 * src_stride);
+    /* XORI_*_128: presumably flips bit 7 of every byte (unsigned -> signed range); macro not visible here. */
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    /* Horizontal pass on the row pairs (0,3), (1,4), (2,5), (3,6); each result presumably carries two 4-sample rows. */
+    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
+    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
+    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
+               vec8, vec9, vec10, vec11);
+    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
+               vec12, vec13, vec14, vec15);
+
+    dst30 = const_vec;
+    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                 dst30, dst30, dst30, dst30);
+    dst41 = const_vec;
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                 dst41, dst41, dst41, dst41);
+    dst52 = const_vec;
+    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                 dst52, dst52, dst52, dst52);
+    dst63 = const_vec;
+    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
+                 dst63, dst63, dst63, dst63);
+    /* Interleave neighbouring rows into halfword pairs (row n, row n+1) for the vertical dot products. */
+    ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
+               dst10_r, dst21_r, dst32_r);
+    dst43_r = __msa_ilvl_h(dst41, dst30);
+    dst54_r = __msa_ilvl_h(dst52, dst41);
+    dst65_r = __msa_ilvl_h(dst63, dst52);
+    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1); /* upper half of dst63 copied into both halves */
+    /* Each pass reads two more rows from src0_ptr and src1_ptr and writes two rows to dst. */
+    for (loop_cnt = height >> 1; loop_cnt--;) { /* height >> 1 passes: an odd final row is never produced */
+        LD_SB2(src0_ptr, src_stride, src7, src8);
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        src1_ptr += (2 * src2_stride);
+
+        in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0); /* low halves of in0 and in1 packed into one vector */
+        XORI_B2_128_SB(src7, src8);
+
+        VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst87 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst87, dst87, dst87, dst87);
+        dst76_r = __msa_ilvr_h(dst87, dst66);
+        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+        dst87_r = __msa_vshf_h((v8i16) mask4, dst87, dst87);
+        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+        /* Drop 6 bits from the two-pass filter output before weighting. */
+        dst0_r >>= 6;
+        dst1_r >>= 6;
+        /* tmp = offset_vec + in * weight0 (dot product against {weight0, 0}), then += filtered * weight1. */
+        ILVRL_H2_SW(in0, in0, tmp1, tmp2);
+        tmp1 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp1, (v8i16) weight_vec0);
+        tmp2 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp2, (v8i16) weight_vec0);
+        tmp1 += dst0_r * weight_vec1;
+        tmp2 += dst1_r * weight_vec1;
+        SRAR_W2_SW(tmp1, tmp2, rnd_vec); /* shift amount is rnd_val + 1; presumably a rounding arithmetic shift */
+        tmp1 = CLIP_SW_0_255(tmp1);
+        tmp2 = CLIP_SW_0_255(tmp2);
+
+        HEVC_PCK_SW_SB2(tmp2, tmp1, tmp1);
+        ST4x2_UB(tmp1, dst, dst_stride);
+        dst += (2 * dst_stride);
+        /* Slide the vertical window down by two rows for the next pass. */
+        dst10_r = dst32_r;
+        dst32_r = dst54_r;
+        dst54_r = dst76_r;
+        dst21_r = dst43_r;
+        dst43_r = dst65_r;
+        dst65_r = dst87_r;
+        dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
+    }
+}
+
+static void hevc_hv_biwgt_8t_8multx2mult_msa(uint8_t *src0_ptr,
+                                             int32_t src_stride,
+                                             int16_t *src1_ptr,
+                                             int32_t src2_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const int8_t *filter_x,
+                                             const int8_t *filter_y,
+                                             int32_t height,
+                                             int32_t weight0,
+                                             int32_t weight1,
+                                             int32_t offset0,
+                                             int32_t offset1,
+                                             int32_t rnd_val,
+                                             int32_t width)
+{   /* Shared worker: walks the block in 8-column strips, two rows per pass, filtering with filter_x/filter_y and blending with src1_ptr. */
+    uint32_t loop_cnt, cnt;
+    int32_t offset;
+    uint8_t *src0_ptr_tmp;
+    int16_t *src1_ptr_tmp;
+    uint8_t *dst_tmp;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 in0, in1;
+    v8i16 filt0, filt1, filt2, filt3;
+    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; /* overlapping byte pairs (k, k+1) for k = 0..7 */
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v4i32 tmp0, tmp1, tmp2, tmp3;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
+    v4i32 weight_vec0, weight_vec1, offset_vec, rnd_vec;
+    /* Step back 3 rows and 3 columns from the position passed in. */
+    src0_ptr -= ((3 * src_stride) + 3);
+    /* Summed offset is pre-shifted by rnd_val; weight0 is cut to 16 bits so each weight_vec0 lane is {weight0, 0}. */
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    /* const_vec holds 128 << 6 in every halfword; it seeds each horizontal accumulator below. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec0 = __msa_fill_w(weight0);
+    weight_vec1 = __msa_fill_w(weight1);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    /* Horizontal taps: presumably halfwords 0..3 of the filter_x vector, each splatted. */
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* Vertical taps: filter_y bytes widened to halfwords by interleaving with their sign mask. */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    /* One outer pass per 8-column strip; the base pointers move right by 8 elements after each strip. */
+    for (cnt = width >> 3; cnt--;) { /* width >> 3 strips: columns past a multiple of 8 are not handled here */
+        src0_ptr_tmp = src0_ptr;
+        src1_ptr_tmp = src1_ptr;
+        dst_tmp = dst;
+        /* Prime the vertical window with the first 7 source rows of this strip. */
+        LD_SB7(src0_ptr_tmp, src_stride,
+               src0, src1, src2, src3, src4, src5, src6);
+        src0_ptr_tmp += (7 * src_stride);
+        /* XORI_*_128: presumably flips bit 7 of every byte (unsigned -> signed range); macro not visible here. */
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+        /* row 0 row 1 row 2 row 3 */
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec12, vec13, vec14, vec15);
+
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        /* row 4 row 5 row 6 */
+        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+
+        dst4 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst4, dst4, dst4, dst4);
+        dst5 = const_vec;
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                     dst5, dst5, dst5, dst5);
+        dst6 = const_vec;
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                     dst6, dst6, dst6, dst6);
+        /* Interleave neighbouring rows into (row n, row n+1) halfword pairs; _r / _l are the two halves of the strip. */
+        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
+                   dst10_r, dst32_r, dst54_r, dst21_r);
+        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
+        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
+                   dst10_l, dst32_l, dst54_l, dst21_l);
+        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
+        /* Each pass reads two more rows from both sources and writes two 8-byte rows to dst_tmp. */
+        for (loop_cnt = height >> 1; loop_cnt--;) { /* height >> 1 passes: an odd final row is never produced */
+            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
+            XORI_B2_128_SB(src7, src8);
+            src0_ptr_tmp += 2 * src_stride;
+
+            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
+            src1_ptr_tmp += (2 * src2_stride);
+            /* Horizontal pass on the first new row (src7), then the vertical filter for the first output row. */
+            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst7 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst7, dst7, dst7, dst7);
+
+            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+
+            dst0_r >>= 6;
+            dst0_l >>= 6;
+
+            /* row 8 */
+            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+
+            dst8 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst8, dst8, dst8, dst8);
+
+            ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
+            dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+
+            dst1_r >>= 6;
+            dst1_l >>= 6;
+            /* tmp = offset_vec + in * weight0 (dot product against {weight0, 0}), then += filtered * weight1. */
+            ILVRL_H2_SW(in0, in0, tmp0, tmp1);
+            ILVRL_H2_SW(in1, in1, tmp2, tmp3);
+            tmp0 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp0,
+                                   (v8i16) weight_vec0);
+            tmp1 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp1,
+                                   (v8i16) weight_vec0);
+            tmp2 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp2,
+                                   (v8i16) weight_vec0);
+            tmp3 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp3,
+                                   (v8i16) weight_vec0);
+
+            tmp0 += (dst0_r * weight_vec1);
+            tmp1 += (dst0_l * weight_vec1);
+            tmp2 += (dst1_r * weight_vec1);
+            tmp3 += (dst1_l * weight_vec1);
+
+            SRAR_W4_SW(tmp0, tmp1, tmp2, tmp3, rnd_vec); /* shift amount is rnd_val + 1; presumably a rounding arithmetic shift */
+            tmp0 = CLIP_SW_0_255(tmp0);
+            tmp1 = CLIP_SW_0_255(tmp1);
+            tmp2 = CLIP_SW_0_255(tmp2);
+            tmp3 = CLIP_SW_0_255(tmp3);
+            HEVC_PCK_SW_SB4(tmp1, tmp0, tmp3, tmp2, dst0_r);
+            ST8x2_UB(dst0_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+            /* Slide the vertical window down by two rows; dst8 becomes the new "row 6". */
+            dst10_r = dst32_r;
+            dst32_r = dst54_r;
+            dst54_r = dst76_r;
+            dst10_l = dst32_l;
+            dst32_l = dst54_l;
+            dst54_l = dst76_l;
+            dst21_r = dst43_r;
+            dst43_r = dst65_r;
+            dst65_r = dst87_r;
+            dst21_l = dst43_l;
+            dst43_l = dst65_l;
+            dst65_l = dst87_l;
+            dst6 = dst8;
+        }
+        /* Advance all three base pointers to the next 8-column strip. */
+        src0_ptr += 8;
+        src1_ptr += 8;
+        dst += 8;
+    }
+}
+
+static void hevc_hv_biwgt_8t_8w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter_x,
+                                    const int8_t *filter_y,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{   /* 8-wide entry point: passes every argument through to the 8multx2mult worker. */
+    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, filter_x, filter_y,
+                                     height, weight0, weight1, offset0,
+                                     offset1, rnd_val, 8); /* width = 8: a single 8-column strip */
+}
+
+static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{   /* 12-wide entry point: columns 0-7 via the 8-column worker, columns 8-11 via the 4-wide routine. */
+    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, filter_x, filter_y,
+                                     height, weight0, weight1, offset0,
+                                     offset1, rnd_val, 8); /* width = 8: a single 8-column strip */
+    hevc_hv_biwgt_8t_4w_msa(src0_ptr + 8, src_stride, /* all three pointers start 8 elements to the right */
+                            src1_ptr + 8, src2_stride,
+                            dst + 8, dst_stride, filter_x, filter_y,
+                            height, weight0, weight1, offset0, offset1,
+                            rnd_val);
+}
+
+static void hevc_hv_biwgt_8t_16w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{   /* 16-wide entry point: passes every argument through to the 8multx2mult worker. */
+    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, filter_x, filter_y,
+                                     height, weight0, weight1, offset0,
+                                     offset1, rnd_val, 16); /* width = 16: two 8-column strips */
+}
+
+static void hevc_hv_biwgt_8t_24w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{   /* 24-wide entry point: passes every argument through to the 8multx2mult worker. */
+    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, filter_x, filter_y,
+                                     height, weight0, weight1, offset0,
+                                     offset1, rnd_val, 24); /* width = 24: three 8-column strips */
+}
+
+static void hevc_hv_biwgt_8t_32w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{   /* 32-wide entry point: passes every argument through to the 8multx2mult worker. */
+    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, filter_x, filter_y,
+                                     height, weight0, weight1, offset0,
+                                     offset1, rnd_val, 32); /* width = 32: four 8-column strips */
+}
+
+static void hevc_hv_biwgt_8t_48w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{   /* 48-wide entry point: passes every argument through to the 8multx2mult worker. */
+    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, filter_x, filter_y,
+                                     height, weight0, weight1, offset0,
+                                     offset1, rnd_val, 48); /* width = 48: six 8-column strips */
+}
+
+static void hevc_hv_biwgt_8t_64w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{   /* 64-wide entry point: passes every argument through to the 8multx2mult worker. */
+    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
+                                     src1_ptr, src2_stride,
+                                     dst, dst_stride, filter_x, filter_y,
+                                     height, weight0, weight1, offset0,
+                                     offset1, rnd_val, 64); /* width = 64: eight 8-column strips */
+}
+
+static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{   /* Fixed 4x2 block: 4-tap horizontal filter of two src0_ptr rows, blended with two src1_ptr rows; height is not read. */
+    int32_t offset, weight;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1;
+    v8i16 in0, in1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; /* lanes 8-15 (16..20) presumably address the second shuffle source */
+    v16i8 mask1, vec0, vec1;
+    v8i16 dst0;
+    v4i32 dst0_r, dst0_l;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    /* Start one column to the left of the position passed in. */
+    src0_ptr -= 1;
+    /* Taps: presumably halfwords 0 and 1 of the filter vector, each splatted. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+    /* Both weights share one 32-bit lane: weight0 in the low halfword, weight1 in the high halfword. */
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    /* const_vec holds 128 << 6 in every halfword; it seeds the filter accumulator below. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    LD_SB2(src0_ptr, src_stride, src0, src1);
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0); /* low halves of in0 and in1 packed into one vector */
+    XORI_B2_128_SB(src0, src1);
+    /* Filter both rows at once: one shuffle gathers byte pairs from src0 and src1. */
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    /* Presumably pairs each in0 sample with its filtered sample so one dot product gives in * weight0 + filtered * weight1. */
+    ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
+    dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec);
+    dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);
+    SRAR_W2_SW(dst0_r, dst0_l, rnd_vec); /* shift amount is rnd_val + 1; presumably a rounding arithmetic shift */
+    dst0_r = CLIP_SW_0_255(dst0_r);
+    dst0_l = CLIP_SW_0_255(dst0_l);
+
+    HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r);
+    ST4x2_UB(dst0_r, dst, dst_stride);
+}
+
+static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{   /* Fixed 4x4 block: 4-tap horizontal filter of four src0_ptr rows, blended with four src1_ptr rows; height is not read. */
+    int32_t offset, weight;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; /* lanes 8-15 (16..20) presumably address the second shuffle source */
+    v16i8 mask1;
+    v8i16 dst0, dst1;
+    v16i8 vec0, vec1;
+    v8i16 in0, in1, in2, in3;
+    v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    /* Start one column to the left of the position passed in. */
+    src0_ptr -= 1;
+
+    /* rearranging filter */
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+    /* Both weights share one 32-bit lane: weight0 in the low halfword, weight1 in the high halfword. */
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    /* const_vec holds 128 << 6 in every halfword; it seeds the filter accumulators below. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+    ILVR_D2_SH(in1, in0, in3, in2, in0, in1); /* presumably packs rows 0+1 into in0 and rows 2+3 into in1 */
+    /* Filter the rows two at a time: (src0, src1) into dst0, (src2, src3) into dst1. */
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+    HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, /* macro defined elsewhere; presumably weights, rounds and clips */
+                       weight_vec, rnd_vec, offset_vec,
+                       dst0_r, dst1_r, dst0_l, dst1_l);
+
+    HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+    ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void hevc_hz_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
+                                             int32_t src_stride,
+                                             int16_t *src1_ptr,
+                                             int32_t src2_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height,
+                                             int32_t weight0,
+                                             int32_t weight1,
+                                             int32_t offset0,
+                                             int32_t offset1,
+                                             int32_t rnd_val)
+{   /* 4-wide blocks in batches of 8 rows: 4-tap horizontal filter of src0_ptr, blended with src1_ptr, stored to dst. */
+    uint32_t loop_cnt;
+    int32_t weight, offset;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; /* lanes 8-15 (16..20) presumably address the second shuffle source */
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    /* Start one column to the left of the position passed in. */
+    src0_ptr -= 1;
+    /* Taps: presumably halfwords 0 and 1 of the filter vector, each splatted. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* Both weights share one 32-bit lane: weight0 in the low halfword, weight1 in the high halfword. */
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+    /* const_vec holds 128 << 6 in every halfword; it seeds the filter accumulators below. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    mask1 = mask0 + 2;
+    /* Each pass reads 8 rows from both sources and writes 8 rows to dst. */
+    for (loop_cnt = (height >> 3); loop_cnt--;) { /* height >> 3 passes: rows past a multiple of 8 are not produced */
+        LD_SB8(src0_ptr, src_stride,
+               src0, src1, src2, src3, src4, src5, src6, src7);
+        src0_ptr += (8 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
+        src1_ptr += (4 * src2_stride);
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1); /* presumably packs two 4-sample rows per vector */
+        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+        /* Filter the rows two at a time: row pairs (0,1), (2,3), (4,5), (6,7) into dst0..dst3. */
+        VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, /* macro defined elsewhere; presumably weights, rounds and clips */
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_hz_biwgt_4t_4w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    /* Forward all arguments to the kernel matching height; other heights do nothing. */
+    if (height == 2)
+        hevc_hz_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter, height,
+                                 weight0, weight1, offset0, offset1, rnd_val);
+    else if (height == 4)
+        hevc_hz_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter, height,
+                                 weight0, weight1, offset0, offset1, rnd_val);
+    else if ((height % 8) == 0)
+        hevc_hz_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride,
+                                         src1_ptr, src2_stride,
+                                         dst, dst_stride, filter, height,
+                                         weight0, weight1, offset0, offset1,
+                                         rnd_val);
+}
+
+static void hevc_hz_biwgt_4t_6w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,   /* bytes between src0_ptr rows */
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,  /* int16_t elements between src1_ptr rows */
+                                    uint8_t *dst,
+                                    int32_t dst_stride,   /* bytes between dst rows */
+                                    const int8_t *filter,
+                                    int32_t height,       /* rows; consumed in groups of 4 */
+                                    int32_t weight0,      /* becomes the low 16 bits of the packed weight */
+                                    int32_t weight1,      /* becomes the high 16 bits of the packed weight */
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{   /* Horizontal filter plus bi-weighted blend (per the name) for 6-wide blocks. */
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };  /* byte pairs (i, i + 1), i = 0..7 */
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 in0, in1, in2, in3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;  /* loads start one byte left of the given position */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);  /* presumably splats halfwords 0 and 1 (two tap pairs) */
+
+    offset = (offset0 + offset1) << rnd_val;  /* pre-scaled by 2^rnd_val; NOTE(review): UB in ISO C if the sum is negative */
+    weight0 = weight0 & 0x0000FFFF;  /* keep only the low 16 bits */
+    weight = weight0 | (weight1 << 16);  /* weight1 in the high half, weight0 in the low half */
+
+    const_vec = __msa_ldi_h(128);  /* presumably cancels the XOR-128 bias applied below -- TODO confirm */
+    const_vec <<= 6;  /* every halfword is now 128 << 6 */
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);  /* handed to HEVC_BIW_RND_CLIP4 as the rounding amount */
+
+    mask1 = mask0 + 2;  /* same pair pattern, two bytes further right */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {  /* 4 rows per pass; height % 4 leftover rows are not processed */
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);  /* presumably XORs every byte with 128 */
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);  /* row 0 */
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);  /* row 1 */
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);  /* row 2 */
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);  /* row 3 */
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);  /* packed result lands in dst0_r, dst1_r */
+        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);  /* presumably writes 6 bytes on each of 4 rows */
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,   /* bytes between src0_ptr rows */
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,  /* int16_t elements between src1_ptr rows */
+                                     uint8_t *dst,
+                                     int32_t dst_stride,   /* bytes between dst rows */
+                                     const int8_t *filter,
+                                     int32_t height,       /* not read here; exactly two rows are handled */
+                                     int32_t weight0,      /* becomes the low 16 bits of the packed weight */
+                                     int32_t weight1,      /* becomes the high 16 bits of the packed weight */
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{   /* Horizontal filter plus bi-weighted blend (per the name) for one 8x2 block; no loop. */
+    int32_t offset, weight;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1;
+    v8i16 in0, in1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };  /* byte pairs (i, i + 1), i = 0..7 */
+    v16i8 mask1, vec0, vec1;
+    v8i16 dst0, dst1;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;  /* loads start one byte left of the given position */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);  /* presumably splats halfwords 0 and 1 (two tap pairs) */
+
+    offset = (offset0 + offset1) << rnd_val;  /* pre-scaled by 2^rnd_val; NOTE(review): UB in ISO C if the sum is negative */
+    weight0 = weight0 & 0x0000FFFF;  /* keep only the low 16 bits */
+    weight = weight0 | (weight1 << 16);  /* weight1 in the high half, weight0 in the low half */
+
+    const_vec = __msa_ldi_h(128);  /* presumably cancels the XOR-128 bias applied below -- TODO confirm */
+    const_vec <<= 6;  /* every halfword is now 128 << 6 */
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);  /* handed to HEVC_BIW_RND_CLIP2 as the rounding amount */
+
+    mask1 = mask0 + 2;  /* same pair pattern, two bytes further right */
+
+    LD_SB2(src0_ptr, src_stride, src0, src1);
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    XORI_B2_128_SB(src0, src1);  /* presumably XORs every byte with 128 */
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);  /* row 0 */
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);  /* row 1 */
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+    HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
+                       weight_vec, rnd_vec, offset_vec,
+                       dst0_r, dst1_r, dst0_l, dst1_l);
+
+    HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);  /* packed result lands in dst0_r */
+    ST8x2_UB(dst0_r, dst, dst_stride);  /* presumably writes 8 bytes on each of 2 rows */
+}
+
+static void hevc_hz_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,   /* bytes between src0_ptr rows */
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,  /* int16_t elements between src1_ptr rows */
+                                     uint8_t *dst,
+                                     int32_t dst_stride,   /* bytes between dst rows */
+                                     const int8_t *filter,
+                                     int32_t height,       /* not read here; exactly six rows are handled */
+                                     int32_t weight0,      /* becomes the low 16 bits of the packed weight */
+                                     int32_t weight1,      /* becomes the high 16 bits of the packed weight */
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{   /* Horizontal filter plus bi-weighted blend (per the name) for one 8x6 block; no loop. */
+    int32_t weight, offset;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };  /* byte pairs (i, i + 1), i = 0..7 */
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;  /* loads start one byte left of the given position */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);  /* presumably splats halfwords 0 and 1 (two tap pairs) */
+
+    offset = (offset0 + offset1) << rnd_val;  /* pre-scaled by 2^rnd_val; NOTE(review): UB in ISO C if the sum is negative */
+    weight0 = weight0 & 0x0000FFFF;  /* keep only the low 16 bits */
+    weight = weight0 | (weight1 << 16);  /* weight1 in the high half, weight0 in the low half */
+
+    const_vec = __msa_ldi_h(128);  /* presumably cancels the XOR-128 bias applied below -- TODO confirm */
+    const_vec <<= 6;  /* every halfword is now 128 << 6 */
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);  /* handed to HEVC_BIW_RND_CLIP* as the rounding amount */
+
+    mask1 = mask0 + 2;  /* same pair pattern, two bytes further right */
+
+    LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
+
+    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);  /* second input, rows 0..3 */
+    src1_ptr += (4 * src2_stride);
+    LD_SH2(src1_ptr, src2_stride, in4, in5);  /* second input, rows 4..5 */
+    XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);  /* presumably XORs every byte with 128 */
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);  /* row 0 */
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);  /* row 1 */
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);  /* row 2 */
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);  /* row 3 */
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);  /* row 4 */
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);  /* row 5 */
+    dst5 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+    HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                       in0, in1, in2, in3,
+                       weight_vec, rnd_vec, offset_vec,
+                       dst0_r, dst1_r, dst2_r, dst3_r,
+                       dst0_l, dst1_l, dst2_l, dst3_l);
+    HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
+                       weight_vec, rnd_vec, offset_vec,
+                       dst4_r, dst5_r, dst4_l, dst5_l);
+
+    HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                     dst2_l, dst2_r, dst3_l, dst3_r,
+                     dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);  /* packed result lands in dst0_r..dst2_r */
+    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);  /* presumably rows 0..3 */
+    dst += (4 * dst_stride);
+    ST8x2_UB(dst2_r, dst, dst_stride);  /* presumably rows 4..5 */
+}
+
+static void hevc_hz_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
+                                             int32_t src_stride,   /* bytes between src0_ptr rows */
+                                             int16_t *src1_ptr,
+                                             int32_t src2_stride,  /* int16_t elements between src1_ptr rows */
+                                             uint8_t *dst,
+                                             int32_t dst_stride,   /* bytes between dst rows */
+                                             const int8_t *filter,
+                                             int32_t height,       /* rows; consumed in groups of 4 */
+                                             int32_t weight0,      /* becomes the low 16 bits of the packed weight */
+                                             int32_t weight1,      /* becomes the high 16 bits of the packed weight */
+                                             int32_t offset0,
+                                             int32_t offset1,
+                                             int32_t rnd_val)
+{   /* Horizontal filter plus bi-weighted blend (per the name) for 8-wide blocks, 4 rows per pass. */
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };  /* byte pairs (i, i + 1), i = 0..7 */
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 in0, in1, in2, in3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;  /* loads start one byte left of the given position */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);  /* presumably splats halfwords 0 and 1 (two tap pairs) */
+
+    offset = (offset0 + offset1) << rnd_val;  /* pre-scaled by 2^rnd_val; NOTE(review): UB in ISO C if the sum is negative */
+    weight0 = weight0 & 0x0000FFFF;  /* keep only the low 16 bits */
+    weight = weight0 | (weight1 << 16);  /* weight1 in the high half, weight0 in the low half */
+
+    const_vec = __msa_ldi_h(128);  /* presumably cancels the XOR-128 bias applied below -- TODO confirm */
+    const_vec <<= 6;  /* every halfword is now 128 << 6 */
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);  /* handed to HEVC_BIW_RND_CLIP4 as the rounding amount */
+
+    mask1 = mask0 + 2;  /* same pair pattern, two bytes further right */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {  /* 4 rows per pass; height % 4 leftover rows are not processed */
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);  /* presumably XORs every byte with 128 */
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);  /* row 0 */
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);  /* row 1 */
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);  /* row 2 */
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);  /* row 3 */
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);  /* packed result lands in dst0_r, dst1_r */
+        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);  /* presumably writes 8 bytes on each of 4 rows */
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_biwgt_4t_8w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    /* Forward all arguments to the kernel matching height; other heights do nothing. */
+    if (height == 2)
+        hevc_hz_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter, height,
+                                 weight0, weight1, offset0, offset1, rnd_val);
+    else if (height == 6)
+        hevc_hz_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter, height,
+                                 weight0, weight1, offset0, offset1, rnd_val);
+    else if ((height % 4) == 0)
+        hevc_hz_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride,
+                                         src1_ptr, src2_stride,
+                                         dst, dst_stride, filter, height,
+                                         weight0, weight1, offset0, offset1,
+                                         rnd_val);
+}
+
+static void hevc_hz_biwgt_4t_12w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,   /* bytes between src0_ptr rows */
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,  /* int16_t elements between src1_ptr rows */
+                                     uint8_t *dst,
+                                     int32_t dst_stride,   /* bytes between dst rows */
+                                     const int8_t *filter,
+                                     int32_t height,       /* rows; consumed in groups of 4 */
+                                     int32_t weight0,      /* becomes the low 16 bits of the packed weight */
+                                     int32_t weight1,      /* becomes the high 16 bits of the packed weight */
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{   /* Horizontal filter plus bi-weighted blend (per the name) for 12-wide blocks, 4 rows per pass. */
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };  /* byte pairs (i, i + 1), i = 0..7 */
+    v16i8 mask2 = {  /* pairs from bytes 8..12, then the same pairs with 16 added (presumably the other shuffle source) */
+        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+    };
+    v16i8 mask1, mask3;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;  /* loads start one byte left of the given position */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);  /* presumably splats halfwords 0 and 1 (two tap pairs) */
+
+    offset = (offset0 + offset1) << rnd_val;  /* pre-scaled by 2^rnd_val; NOTE(review): UB in ISO C if the sum is negative */
+    weight0 = weight0 & 0x0000FFFF;  /* keep only the low 16 bits */
+    weight = weight0 | (weight1 << 16);  /* weight1 in the high half, weight0 in the low half */
+
+    const_vec = __msa_ldi_h(128);  /* presumably cancels the XOR-128 bias applied below -- TODO confirm */
+    const_vec <<= 6;  /* every halfword is now 128 << 6 */
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);  /* handed to HEVC_BIW_RND_CLIP* as the rounding amount */
+
+    mask1 = mask0 + 2;  /* mask0 pattern, two bytes further right */
+    mask3 = mask2 + 2;  /* mask2 pattern, two bytes further right */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {  /* 4 rows per pass; height % 4 leftover rows are not processed */
+        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);  /* second input, first 8 columns */
+        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);  /* second input, from column 8 */
+        src1_ptr += (4 * src2_stride);
+        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);  /* presumably merges rows 0+1 into in4 and rows 2+3 into in5 */
+        XORI_B4_128_SB(src0, src1, src2, src3);  /* presumably XORs every byte with 128 */
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);  /* row 0, first 8 columns */
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);  /* row 1, first 8 columns */
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);  /* row 2, first 8 columns */
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);  /* row 3, first 8 columns */
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);  /* rows 0 and 1 together, columns from 8 */
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);  /* rows 2 and 3 together, columns from 8 */
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+        HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst4_r, dst5_r, dst4_l, dst5_l);
+
+        HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                         dst2_l, dst2_r, dst3_l, dst3_r,
+                         dst4_l, dst4_r, dst5_l, dst5_r,
+                         dst0_r, dst1_r, dst2_r);  /* packed result lands in dst0_r..dst2_r */
+        ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);  /* presumably writes 12 bytes on each of 4 rows */
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_biwgt_4t_16w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,   /* bytes between src0_ptr rows */
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,  /* int16_t elements between src1_ptr rows */
+                                     uint8_t *dst,
+                                     int32_t dst_stride,   /* bytes between dst rows */
+                                     const int8_t *filter,
+                                     int32_t height,       /* rows; consumed in groups of 4 */
+                                     int32_t weight0,      /* becomes the low 16 bits of the packed weight */
+                                     int32_t weight1,      /* becomes the high 16 bits of the packed weight */
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{   /* Horizontal filter plus bi-weighted blend (per the name) for 16-wide blocks, 4 rows per pass. */
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };  /* byte pairs (i, i + 1), i = 0..7 */
+    v16i8 mask1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16i8 vec0, vec1;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;  /* loads start one byte left of the given position */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);  /* presumably splats halfwords 0 and 1 (two tap pairs) */
+
+    offset = (offset0 + offset1) << rnd_val;  /* pre-scaled by 2^rnd_val; NOTE(review): UB in ISO C if the sum is negative */
+    weight0 = weight0 & 0x0000FFFF;  /* keep only the low 16 bits */
+    weight = weight0 | (weight1 << 16);  /* weight1 in the high half, weight0 in the low half */
+
+    const_vec = __msa_ldi_h(128);  /* presumably cancels the XOR-128 bias applied below -- TODO confirm */
+    const_vec <<= 6;  /* every halfword is now 128 << 6 */
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);  /* handed to HEVC_BIW_RND_CLIP4 as the rounding amount */
+
+    mask1 = mask0 + 2;  /* same pair pattern, two bytes further right */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {  /* 4 rows per pass; height % 4 leftover rows are not processed */
+        LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);  /* even-numbered vectors: rows 0..3 from the row start */
+        LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7);  /* odd-numbered vectors: same rows, 8 bytes right */
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);  /* second input, first 8 columns */
+        LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);  /* second input, columns from 8 */
+        src1_ptr += (4 * src2_stride);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);  /* presumably XORs every byte with 128 */
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);  /* row 0, left half */
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);  /* row 0, right half */
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);  /* row 1, left half */
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);  /* row 1, right half */
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);  /* row 2, left half */
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);  /* row 2, right half */
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);  /* row 3, left half */
+        dst6 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);  /* row 3, right half */
+        dst7 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);  /* rows 0 and 1 */
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);  /* packed result lands in dst0_r, dst1_r */
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);  /* one full vector per row, two rows */
+        dst += (2 * dst_stride);
+
+        HEVC_BIW_RND_CLIP4(dst4, dst5, dst6, dst7,
+                           in4, in5, in6, in7,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);  /* rows 2 and 3; the _r/_l temporaries are reused */
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_hz_biwgt_4t_24w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,   /* bytes between src0_ptr rows */
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,  /* int16_t elements between src1_ptr rows */
+                                     uint8_t *dst,
+                                     int32_t dst_stride,   /* bytes between dst rows */
+                                     const int8_t *filter,
+                                     int32_t height,       /* rows; consumed in groups of 2 */
+                                     int32_t weight0,      /* becomes the low 16 bits of the packed weight */
+                                     int32_t weight1,      /* becomes the high 16 bits of the packed weight */
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{   /* Horizontal filter plus bi-weighted blend (per the name) for 24-wide blocks, 2 rows per pass. */
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    uint8_t *dst_tmp = dst + 16;  /* separate output cursor for the last 8 columns */
+    v16i8 src0, src1, src2, src3;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };  /* byte pairs (i, i + 1), i = 0..7 */
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;  /* loads start one byte left of the given position */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);  /* presumably splats halfwords 0 and 1 (two tap pairs) */
+
+    offset = (offset0 + offset1) << rnd_val;  /* pre-scaled by 2^rnd_val; NOTE(review): UB in ISO C if the sum is negative */
+    weight0 = weight0 & 0x0000FFFF;  /* keep only the low 16 bits */
+    weight = weight0 | (weight1 << 16);  /* weight1 in the high half, weight0 in the low half */
+
+    const_vec = __msa_ldi_h(128);  /* presumably cancels the XOR-128 bias applied below -- TODO confirm */
+    const_vec <<= 6;  /* every halfword is now 128 << 6 */
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);  /* handed to HEVC_BIW_RND_CLIP* as the rounding amount */
+
+    mask1 = mask0 + 2;  /* mask0 pattern, two bytes further right */
+    mask2 = mask0 + 8;  /* pairs starting at byte 8; indices >= 16 presumably select the second shuffle source */
+    mask3 = mask0 + 10;  /* mask2 pattern, two bytes further right */
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {  /* 2 rows per pass; an odd trailing row is not processed */
+        LD_SB2(src0_ptr, src_stride, src0, src2);  /* rows 0 and 1 from the row start */
+        LD_SB2(src0_ptr + 16, src_stride, src1, src3);  /* same rows, 16 bytes right */
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr, src2_stride, in0, in2);  /* second input, columns from 0 */
+        LD_SH2(src1_ptr + 8, src2_stride, in1, in3);  /* second input, columns from 8 */
+        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);  /* second input, columns from 16 */
+        src1_ptr += (2 * src2_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);  /* presumably XORs every byte with 128 */
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);  /* row 0, columns from 0 */
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);  /* row 0, columns from 8 (spans both loads) */
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);  /* row 1, columns from 0 */
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);  /* row 1, columns from 8 (spans both loads) */
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);  /* packed result lands in dst0_r, dst1_r */
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);  /* one full vector per row, two rows */
+        dst += (2 * dst_stride);
+        /* remaining 8 columns of both rows, written through dst_tmp */
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);  /* row 0, columns from 16 */
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);  /* row 1, columns from 16 */
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        HEVC_BIW_RND_CLIP2(dst0, dst1, in4, in5,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst0_l, dst1_l);
+
+        HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);  /* packed result lands in dst0_r */
+        ST8x2_UB(dst0_r, dst_tmp, dst_stride);  /* presumably writes 8 bytes on each of 2 rows */
+        dst_tmp += (2 * dst_stride);
+    }
+}
+
+static void hevc_hz_biwgt_4t_32w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,   /* bytes between src0_ptr rows */
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,  /* int16_t elements between src1_ptr rows */
+                                     uint8_t *dst,
+                                     int32_t dst_stride,   /* bytes between dst rows */
+                                     const int8_t *filter,
+                                     int32_t height,       /* rows; one row per loop pass */
+                                     int32_t weight0,      /* becomes the low 16 bits of the packed weight */
+                                     int32_t weight1,      /* becomes the high 16 bits of the packed weight */
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{   /* Horizontal filter plus bi-weighted blend (per the name) for 32-wide blocks, 1 row per pass. */
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };  /* byte pairs (i, i + 1), i = 0..7 */
+    v16i8 mask1, mask2, mask3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v16i8 vec0, vec1;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= 1;  /* loads start one byte left of the given position */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);  /* presumably splats halfwords 0 and 1 (two tap pairs) */
+
+    offset = (offset0 + offset1) << rnd_val;  /* pre-scaled by 2^rnd_val; NOTE(review): UB in ISO C if the sum is negative */
+    weight0 = weight0 & 0x0000FFFF;  /* keep only the low 16 bits */
+    weight = weight0 | (weight1 << 16);  /* weight1 in the high half, weight0 in the low half */
+
+    const_vec = __msa_ldi_h(128);  /* presumably cancels the XOR-128 bias applied below -- TODO confirm */
+    const_vec <<= 6;  /* every halfword is now 128 << 6 */
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);  /* handed to HEVC_BIW_RND_CLIP4 as the rounding amount */
+
+    mask1 = mask0 + 2;  /* mask0 pattern, two bytes further right */
+    mask2 = mask0 + 8;  /* pairs starting at byte 8; indices >= 16 presumably select the second shuffle source */
+    mask3 = mask0 + 10;  /* mask2 pattern, two bytes further right */
+
+    for (loop_cnt = height; loop_cnt--;) {  /* one row per pass */
+        LD_SB2(src0_ptr, 16, src0, src1);  /* step is 16 bytes, not a row stride: two loads along the same row */
+        src2 = LD_SB(src0_ptr + 24);  /* third load at +24 overlaps src1 by 8 bytes */
+        src0_ptr += src_stride;
+        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);  /* four loads, 8 elements apart, along one row */
+        src1_ptr += src2_stride;
+        XORI_B3_128_SB(src0, src1, src2);  /* presumably XORs every byte with 128 */
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);  /* columns from 0 */
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);  /* columns from 8 (spans src0 and src1) */
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);  /* columns from 16 */
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);  /* columns from 24 */
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);  /* packed result lands in dst0_r, dst1_r */
+        ST_SW2(dst0_r, dst1_r, dst, 16);  /* step is 16 bytes: both vectors go to the same output row */
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,   /* bytes between src0_ptr rows */
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,  /* int16_t elements between src1_ptr rows */
+                                     uint8_t *dst,
+                                     int32_t dst_stride,   /* bytes between dst rows */
+                                     const int8_t *filter,
+                                     int32_t height,       /* not read here; exactly two output rows are produced */
+                                     int32_t weight0,      /* becomes the low 16 bits of the packed weight */
+                                     int32_t weight1,      /* becomes the high 16 bits of the packed weight */
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{   /* Vertical filter plus bi-weighted blend (per the name) for one 4x2 block; reads 5 source rows. */
+    int32_t weight, offset;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 in0, in1, dst10;
+    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
+    v4i32 dst10_r, dst10_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= src_stride;  /* loads start one row above the given position */
+
+    offset = (offset0 + offset1) << rnd_val;  /* pre-scaled by 2^rnd_val; NOTE(review): UB in ISO C if the sum is negative */
+    weight0 = weight0 & 0x0000FFFF;  /* keep only the low 16 bits */
+    weight = weight0 | (weight1 << 16);  /* weight1 in the high half, weight0 in the low half */
+
+    const_vec = __msa_ldi_h(128);  /* presumably cancels the XOR-128 bias applied below -- TODO confirm */
+    const_vec <<= 6;  /* every halfword is now 128 << 6 */
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);  /* shift count for the rounding shift below */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);  /* presumably splats halfwords 0 and 1 (two tap pairs) */
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);  /* source rows 0..2 */
+    src0_ptr += (3 * src_stride);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);  /* presumably byte-interleaves row pairs (0,1) and (1,2) */
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);  /* low 64 bits of both pairs in one vector */
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);  /* XOR every byte with 128 */
+    LD_SB2(src0_ptr, src_stride, src3, src4);  /* source rows 3..4 */
+    src0_ptr += (2 * src_stride);  /* src0_ptr is not read again after this */
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    src1_ptr += (2 * src2_stride);  /* src1_ptr is not read again after this */
+
+    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);  /* low 64 bits of both second-input rows in one vector */
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);  /* presumably byte-interleaves row pairs (2,3) and (3,4) */
+    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
+    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);  /* XOR every byte with 128 */
+
+    dst10 = const_vec;
+    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);  /* both output rows accumulate into dst10 */
+
+    ILVRL_H2_SW(dst10, in0, dst10_r, dst10_l);  /* presumably interleaves filtered and second-input halfwords */
+    dst10_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_r, (v8i16) weight_vec);  /* offset + halfword-pair dot product with packed weights */
+    dst10_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_l, (v8i16) weight_vec);
+    SRAR_W2_SW(dst10_r, dst10_l, rnd_vec);  /* presumably a rounding arithmetic right shift by rnd_val + 1 */
+    dst10_r = CLIP_SW_0_255(dst10_r);  /* presumably clamps each word to 0..255 */
+    dst10_l = CLIP_SW_0_255(dst10_l);
+
+    HEVC_PCK_SW_SB2(dst10_l, dst10_r, dst10_r);  /* packed result lands in dst10_r */
+    ST4x2_UB(dst10_r, dst, dst_stride);  /* presumably writes 4 bytes on each of 2 rows */
+}
+
+static void hevc_vt_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,   /* not read here */
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    int32_t weight, offset;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
+    v16i8 src2110, src4332, src6554;
+    v8i16 dst10, dst32;
+    v4i32 dst10_r, dst32_r, dst10_l, dst32_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    /* 4x4 block: 7 rows of src0_ptr are filtered, 4 rows of src1_ptr blended. */
+    src0_ptr -= src_stride;  /* the first row loaded is one above the input */
+    /* Shift as unsigned: '<<' on a negative int is undefined behaviour in C. */
+    offset = (int32_t) ((uint32_t) (offset0 + offset1) << rnd_val);
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (int32_t) ((uint32_t) weight1 << 16);
+    /* Each 32-bit lane of weight_vec: weight0 in the low half, weight1 high. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;         /* 128 << 6 in every halfword */
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    /* filt0/filt1: halfwords 0 and 1 of the filter, i.e. tap pairs 0-1, 2-3. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);  /* flip sign bit */
+
+    LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
+    src0_ptr += (4 * src_stride);
+    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+    src1_ptr += (4 * src2_stride);
+    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+               src32_r, src43_r, src54_r, src65_r);
+    ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
+    XORI_B2_128_SB(src4332, src6554);
+    /* dst10 and dst32 each start from const_vec and take two tap-pair terms. */
+    dst10 = const_vec;
+    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
+    dst32 = const_vec;
+    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
+    /* HEVC_BIW_RND_CLIP2 is defined elsewhere; presumably weight/round/clip. */
+    HEVC_BIW_RND_CLIP2(dst10, dst32, in0, in1,
+                       weight_vec, rnd_vec, offset_vec,
+                       dst10_r, dst32_r, dst10_l, dst32_l);
+
+    HEVC_PCK_SW_SB4(dst10_l, dst10_r, dst32_l, dst32_r, dst10_r);
+    ST4x4_UB(dst10_r, dst10_r, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+}
+
+static void hevc_vt_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
+                                             int32_t src_stride,
+                                             int16_t *src1_ptr,
+                                             int32_t src2_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height,
+                                             int32_t weight0,
+                                             int32_t weight1,
+                                             int32_t offset0,
+                                             int32_t offset1,
+                                             int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t weight, offset;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v16i8 src2110, src4332, src6554, src8776;
+    v8i16 dst10, dst32, dst54, dst76;
+    v4i32 dst10_r, dst32_r, dst54_r, dst76_r;
+    v4i32 dst10_l, dst32_l, dst54_l, dst76_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    /* 4-wide column: 3 rows are preloaded, then 8 more per loop iteration. */
+    src0_ptr -= src_stride;  /* the first row loaded is one above the input */
+    /* Shift as unsigned: '<<' on a negative int is undefined behaviour in C. */
+    offset = (int32_t) ((uint32_t) (offset0 + offset1) << rnd_val);
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (int32_t) ((uint32_t) weight1 << 16);
+    /* Each 32-bit lane of weight_vec: weight0 in the low half, weight1 high. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;         /* 128 << 6 in every halfword */
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    /* filt0/filt1: halfwords 0 and 1 of the filter, i.e. tap pairs 0-1, 2-3. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);  /* flip sign bit */
+    /* Each pass stores 8 rows; the loop body runs height >> 3 times. */
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
+        src0_ptr += (6 * src_stride);
+        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
+        src1_ptr += (8 * src2_stride);
+
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+
+        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+                   src32_r, src43_r, src54_r, src65_r);
+        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+        ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
+                   src4332, src6554, src8776);
+        XORI_B3_128_SB(src4332, src6554, src8776);
+
+        dst10 = const_vec;
+        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
+        dst32 = const_vec;
+        DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
+        dst54 = const_vec;
+        DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
+        /* Last two rows go to src9/src2; src2110 is rebuilt for the next pass. */
+        LD_SB2(src0_ptr, src_stride, src9, src2);
+        src0_ptr += (2 * src_stride);
+        ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
+        src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
+        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+
+        dst76 = const_vec;
+        DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);
+        HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst10_r, dst32_r, dst54_r, dst76_r,
+                           dst10_l, dst32_l, dst54_l, dst76_l);
+
+        HEVC_PCK_SW_SB8(dst10_l, dst10_r, dst32_l, dst32_r,
+                        dst54_l, dst54_r, dst76_l, dst76_r, dst10_r, dst54_r);
+        ST4x8_UB(dst10_r, dst54_r, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_vt_biwgt_4t_4w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    /* Width-4 entry point: hand off to the kernel matching the height. */
+    if (!(height % 8))
+        hevc_vt_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride, src1_ptr,
+                                         src2_stride, dst, dst_stride, filter,
+                                         height, weight0, weight1, offset0,
+                                         offset1, rnd_val);
+    else if (height == 4)
+        hevc_vt_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter, height, weight0,
+                                 weight1, offset0, offset1, rnd_val);
+    else if (height == 2)
+        hevc_vt_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter, height, weight0,
+                                 weight1, offset0, offset1, rnd_val);
+    /* Any other height matches no kernel and no call is made. */
+}
+
+static void hevc_vt_biwgt_4t_6w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    /* 6-wide column: 3 rows are preloaded, then 4 more per loop iteration. */
+    src0_ptr -= src_stride;  /* the first row loaded is one above the input */
+    /* Shift as unsigned: '<<' on a negative int is undefined behaviour in C. */
+    offset = (int32_t) ((uint32_t) (offset0 + offset1) << rnd_val);
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (int32_t) ((uint32_t) weight1 << 16);
+    /* Each 32-bit lane of weight_vec: weight0 in the low half, weight1 high. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;         /* 128 << 6 in every halfword */
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    /* filt0/filt1: halfwords 0 and 1 of the filter, i.e. tap pairs 0-1, 2-3. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    /* Each pass stores 4 rows; the loop body runs height >> 2 times. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        src0_ptr += (2 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+        /* src2, src10_r and src21_r are refreshed here and reused next pass. */
+        LD_SB2(src0_ptr, src_stride, src1, src2);
+        src0_ptr += (2 * src_stride);
+        XORI_B2_128_SB(src1, src2);
+        ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
+
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_vt_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,   /* not read here */
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 in0, in1, tmp0, tmp1;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
+    /* 8x2 block: 5 rows of src0_ptr are filtered, 2 rows of src1_ptr blended. */
+    src0_ptr -= src_stride;  /* the first row loaded is one above the input */
+    /* Shift as unsigned: '<<' on a negative int is undefined behaviour in C. */
+    offset = (int32_t) ((uint32_t) (offset0 + offset1) << rnd_val);
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (int32_t) ((uint32_t) weight1 << 16);
+    /* Each 32-bit lane of weight_vec: weight0 in the low half, weight1 high. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;         /* 128 << 6 in every halfword */
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    /* filt0/filt1: halfwords 0 and 1 of the filter, i.e. tap pairs 0-1, 2-3. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    LD_SB2(src0_ptr, src_stride, src3, src4);
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    XORI_B2_128_SB(src3, src4);
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+    /* One accumulator per output row, each seeded with const_vec. */
+    tmp0 = const_vec;
+    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+    tmp1 = const_vec;
+    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+    HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
+                       weight_vec, rnd_vec, offset_vec,
+                       dst0_r, dst1_r, dst0_l, dst1_l);
+
+    HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+    ST8x2_UB(dst0_r, dst, dst_stride);
+}
+
+static void hevc_vt_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,   /* not read here */
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v16i8 src10_r, src32_r, src54_r, src76_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+    /* 8x6 block: 9 rows of src0_ptr are filtered, 6 rows of src1_ptr blended. */
+    src0_ptr -= src_stride;  /* the first row loaded is one above the input */
+    /* Shift as unsigned: '<<' on a negative int is undefined behaviour in C. */
+    offset = (int32_t) ((uint32_t) (offset0 + offset1) << rnd_val);
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (int32_t) ((uint32_t) weight1 << 16);
+    /* Each 32-bit lane of weight_vec: weight0 in the low half, weight1 high. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;         /* 128 << 6 in every halfword */
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    /* filt0/filt1: halfwords 0 and 1 of the filter, i.e. tap pairs 0-1, 2-3. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
+    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
+    XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
+    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+               src32_r, src43_r, src54_r, src65_r);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+    /* One accumulator per output row (tmp0..tmp5), each seeded with const_vec. */
+    tmp0 = const_vec;
+    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+    tmp1 = const_vec;
+    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+    tmp2 = const_vec;
+    DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, tmp2, tmp2);
+    tmp3 = const_vec;
+    DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, tmp3, tmp3);
+    tmp4 = const_vec;
+    DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, tmp4, tmp4);
+    tmp5 = const_vec;
+    DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, tmp5, tmp5);
+    HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                       in0, in1, in2, in3,
+                       weight_vec, rnd_vec, offset_vec,
+                       dst0_r, dst1_r, dst2_r, dst3_r,
+                       dst0_l, dst1_l, dst2_l, dst3_l);
+    HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
+                       weight_vec, rnd_vec, offset_vec,
+                       dst4_r, dst5_r, dst4_l, dst5_l);
+    /* Rows are stored as a group of 4 followed by a group of 2. */
+    HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                     dst2_l, dst2_r, dst3_l, dst3_r,
+                     dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
+    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x2_UB(dst2_r, dst, dst_stride);
+}
+
+static void hevc_vt_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
+                                             int32_t src_stride,
+                                             int16_t *src1_ptr,
+                                             int32_t src2_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height,
+                                             int32_t weight0,
+                                             int32_t weight1,
+                                             int32_t offset0,
+                                             int32_t offset1,
+                                             int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    /* 8-wide column: 3 rows are preloaded, then 4 more per loop iteration. */
+    src0_ptr -= src_stride;  /* the first row loaded is one above the input */
+    /* Shift as unsigned: '<<' on a negative int is undefined behaviour in C. */
+    offset = (int32_t) ((uint32_t) (offset0 + offset1) << rnd_val);
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (int32_t) ((uint32_t) weight1 << 16);
+    /* Each 32-bit lane of weight_vec: weight0 in the low half, weight1 high. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;         /* 128 << 6 in every halfword */
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    /* filt0/filt1: halfwords 0 and 1 of the filter, i.e. tap pairs 0-1, 2-3. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    /* Each pass stores 4 rows; the loop body runs height >> 2 times. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        src0_ptr += (2 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+        /* src2, src10_r and src21_r are refreshed here and reused next pass. */
+        LD_SB2(src0_ptr, src_stride, src1, src2);
+        src0_ptr += (2 * src_stride);
+        XORI_B2_128_SB(src1, src2);
+        ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
+
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_vt_biwgt_4t_8w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    /* Width-8 entry point: heights 2 and 6 have dedicated kernels, */
+    /* every other height is passed to the 8x4multiple kernel. */
+    if (height != 2 && height != 6)
+        hevc_vt_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride, src1_ptr,
+                                         src2_stride, dst, dst_stride, filter,
+                                         height, weight0, weight1, offset0,
+                                         offset1, rnd_val);
+    else if (height == 2)
+        hevc_vt_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter, height, weight0,
+                                 weight1, offset0, offset1, rnd_val);
+    else
+        hevc_vt_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter, height, weight0,
+                                 weight1, offset0, offset1, rnd_val);
+}
+
+static void hevc_vt_biwgt_4t_12w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
+    v16i8 src2110, src4332;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+    /* 12-wide column: 3 rows are preloaded, then 4 more per loop iteration. */
+    src0_ptr -= (1 * src_stride);  /* first row loaded is one above the input */
+    /* Shift as unsigned: '<<' on a negative int is undefined behaviour in C. */
+    offset = (int32_t) ((uint32_t) (offset0 + offset1) << rnd_val);
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (int32_t) ((uint32_t) weight1 << 16);
+    /* Each 32-bit lane of weight_vec: weight0 in the low half, weight1 high. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;         /* 128 << 6 in every halfword */
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    /* filt0/filt1: halfwords 0 and 1 of the filter, i.e. tap pairs 0-1, 2-3. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
+    /* The *_l interleaves of two row pairs share one vector (src2110/src4332). */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        src0_ptr += (2 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
+        src1_ptr += (4 * src2_stride);
+        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
+        XORI_B2_128_SB(src3, src4);
+
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
+
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+        tmp4 = const_vec;
+        DPADD_SB2_SH(src2110, src4332, filt0, filt1, tmp4, tmp4);
+        /* src2, src10_r, src21_r and src2110 are refreshed for the next pass. */
+        LD_SB2(src0_ptr, src_stride, src5, src2);
+        src0_ptr += (2 * src_stride);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
+        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
+
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
+        tmp5 = const_vec;
+        DPADD_SB2_SH(src4332, src2110, filt0, filt1, tmp5, tmp5);
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+        HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst4_r, dst5_r, dst4_l, dst5_l);
+
+        HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                         dst2_l, dst2_r, dst3_l, dst3_r,
+                         dst4_l, dst4_r, dst5_l, dst5_r,
+                         dst0_r, dst1_r, dst2_r);
+        ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_vt_biwgt_4t_16w_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v16i8 src10_l, src32_l, src21_l, src43_l;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    /* 16-wide column: 3 rows are preloaded, then 4 more per loop iteration. */
+    src0_ptr -= src_stride;  /* the first row loaded is one above the input */
+    /* Shift as unsigned: '<<' on a negative int is undefined behaviour in C. */
+    offset = (int32_t) ((uint32_t) (offset0 + offset1) << rnd_val);
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (int32_t) ((uint32_t) weight1 << 16);
+    /* Each 32-bit lane of weight_vec: weight0 in the low half, weight1 high. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;         /* 128 << 6 in every halfword */
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    /* filt0/filt1: halfwords 0 and 1 of the filter, i.e. tap pairs 0-1, 2-3. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+    /* Each pass stores 4 rows as two 2-row halves (height >> 2 passes). */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+        /* tmp0/tmp1 use the *_r interleaves, tmp2/tmp3 the *_l interleaves. */
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp3, tmp3);
+
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+        LD_SB2(src0_ptr, src_stride, src5, src2);
+        src0_ptr += (2 * src_stride);
+        /* Second half: src2 and the src10/src21 interleaves are refreshed. */
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
+
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp0, tmp0);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp1, tmp1);
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, tmp3, tmp3);
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_vt_biwgt_4t_24w_msa(uint8_t *src0_ptr, /* 8-bit rows that get the vertical filter */
+                                     int32_t src_stride, /* src0_ptr row pitch, in bytes */
+                                     int16_t *src1_ptr, /* 16-bit rows fed to the weighting step */
+                                     int32_t src2_stride, /* src1_ptr row pitch, in int16_t elements */
+                                     uint8_t *dst, /* 8-bit output rows */
+                                     int32_t dst_stride, /* dst row pitch, in bytes */
+                                     const int8_t *filter, /* tap table; halfwords 0 and 1 are splatted below */
+                                     int32_t height, /* row count; consumed four rows per loop pass */
+                                     int32_t weight0, /* ends up in the low 16 bits of every weight lane */
+                                     int32_t weight1, /* ends up in the high 16 bits of every weight lane */
+                                     int32_t offset0, /* added to offset1 */
+                                     int32_t offset1, /* added to offset0 */
+                                     int32_t rnd_val) /* scales the offset; rnd_val + 1 fills rnd_vec */
+{ /* Vertical 4-tap bi-weighted kernel for 24-column blocks, handled as 16 + 8 columns. */
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src6, src7, src8, src9, src10, src11;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src10_l, src32_l, src21_l, src43_l;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+
+    src0_ptr -= src_stride; /* step back one row; three rows are preloaded before the loop */
+
+    offset = (offset0 + offset1) << rnd_val; /* combined offset, scaled up by rnd_val bits */
+    weight0 = weight0 & 0x0000FFFF; /* keep only the low 16 bits so the OR below leaves weight1 intact */
+    weight = weight0 | (weight1 << 16); /* both weights packed into one 32-bit word */
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* 128 << 6 in every halfword; seeds each filter accumulator */
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); /* presumably one byte-pair of taps per vector */
+
+    /* 16width: columns 0..15, first three rows */
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+    /* 8width: columns 16..23, first three rows (only the right halves are interleaved) */
+    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src6, src7, src8);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* four output rows per pass, stored two at a time */
+        /* 16width */
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2(src1_ptr + 8, src2_stride, in2, in3); /* columns 8..15 of the 16-bit input */
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+
+        /* 8width */
+        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr + 16, src2_stride, in4, in5); /* columns 16..23 of the 16-bit input */
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src9, src10);
+        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+        /* 16width: filter accumulators start from const_vec */
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+        tmp4 = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp4, tmp4);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+        tmp5 = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp5, tmp5);
+        /* 8width */
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, tmp3, tmp3);
+        /* 16width: weighting step on the filtered rows and in0..in3 */
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+        /* 8width: weighting step on the filtered rows and in4/in5 */
+        HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst4_r, dst5_r, dst4_l, dst5_l);
+        /* 16width */
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        /* 8width */
+        HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        ST8x2_UB(dst4_r, dst + 16, dst_stride); /* remaining columns start at byte 16 of each row */
+        dst += (2 * dst_stride);
+
+        /* 16width: second half of the pass; src2 is reloaded here and carried into the next pass */
+        LD_SB2(src0_ptr, src_stride, src5, src2);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); /* src10/src21 names now hold the newest row pairs */
+        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
+        /* 8width: src8 is reloaded here and carried into the next pass */
+        LD_SB2(src0_ptr + 16, src_stride, src11, src8);
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src11, src8);
+        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
+        /* 16width: operand order is swapped because src32/src43 are now the older pairs */
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp0, tmp0);
+        tmp4 = const_vec;
+        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, tmp4, tmp4);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp1, tmp1);
+        tmp5 = const_vec;
+        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, tmp5, tmp5);
+        /* 8width */
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, tmp3, tmp3);
+        /* 16width */
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+        /* 8width */
+        HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst4_r, dst5_r, dst4_l, dst5_l);
+        /* 16width */
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+
+        /* 8width */
+        HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        ST8x2_UB(dst4_r, dst + 16, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_vt_biwgt_4t_32w_msa(uint8_t *src0_ptr, /* 8-bit rows that get the vertical filter */
+                                     int32_t src_stride, /* src0_ptr row pitch, in bytes */
+                                     int16_t *src1_ptr, /* 16-bit rows fed to the weighting step */
+                                     int32_t src2_stride, /* src1_ptr row pitch, in int16_t elements */
+                                     uint8_t *dst, /* 8-bit output rows */
+                                     int32_t dst_stride, /* dst row pitch, in bytes */
+                                     const int8_t *filter, /* tap table; halfwords 0 and 1 are splatted below */
+                                     int32_t height, /* row count; consumed two rows per loop pass */
+                                     int32_t weight0, /* ends up in the low 16 bits of every weight lane */
+                                     int32_t weight1, /* ends up in the high 16 bits of every weight lane */
+                                     int32_t offset0, /* added to offset1 */
+                                     int32_t offset1, /* added to offset0 */
+                                     int32_t rnd_val) /* scales the offset; rnd_val + 1 fills rnd_vec */
+{ /* Vertical 4-tap bi-weighted kernel for 32-column blocks, handled as two 16-column halves. */
+    uint32_t loop_cnt;
+    uint8_t *dst_tmp = dst + 16; /* output cursor for the second 16 columns */
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v16i8 src10_l, src32_l, src76_l, src98_l;
+    v16i8 src21_l, src43_l, src87_l, src109_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l, dst6_l, dst7_l;
+
+    src0_ptr -= src_stride; /* step back one row; three rows are preloaded before the loop */
+
+    offset = (offset0 + offset1) << rnd_val; /* combined offset, scaled up by rnd_val bits */
+    weight0 = weight0 & 0x0000FFFF; /* keep only the low 16 bits so the OR below leaves weight1 intact */
+    weight = weight0 | (weight1 << 16); /* both weights packed into one 32-bit word */
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* 128 << 6 in every halfword; seeds each filter accumulator */
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); /* presumably one byte-pair of taps per vector */
+
+    /* 16width: columns 0..15, first three rows */
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+    /* next 16width: columns 16..31, first three rows */
+    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src6, src7, src8);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) { /* two output rows per pass */
+        /* 16width */
+        LD_SB2(src0_ptr, src_stride, src3, src4);
+        LD_SH2(src1_ptr, src2_stride, in0, in1);
+        LD_SH2(src1_ptr + 8, src2_stride, in2, in3); /* columns 8..15 of the 16-bit input */
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+
+        /* 16width: filter accumulators start from const_vec */
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+        tmp4 = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp4, tmp4);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+        tmp5 = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp5, tmp5);
+        /* 16width: weighting step on the filtered rows and in0..in3 */
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+        /* 16width */
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        src10_r = src32_r; /* slide the row window down by two rows for the next pass */
+        src21_r = src43_r;
+        src10_l = src32_l;
+        src21_l = src43_l;
+        src2 = src4;
+
+        /* next 16width */
+        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
+        src0_ptr += (2 * src_stride);
+        LD_SH2(src1_ptr + 16, src2_stride, in4, in5); /* columns 16..23 of the 16-bit input */
+        LD_SH2(src1_ptr + 24, src2_stride, in6, in7); /* columns 24..31 of the 16-bit input */
+        src1_ptr += (2 * src2_stride);
+        XORI_B2_128_SB(src9, src10);
+        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
+        /* next 16width */
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, tmp2, tmp2);
+        tmp6 = const_vec;
+        DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, tmp6, tmp6);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, tmp3, tmp3);
+        tmp7 = const_vec;
+        DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, tmp7, tmp7);
+        /* next 16width: weighting step on the filtered rows and in4..in7 */
+        HEVC_BIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7,
+                           in4, in5, in6, in7,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst4_r, dst5_r, dst6_r, dst7_r,
+                           dst4_l, dst5_l, dst6_l, dst7_l);
+
+        /* next 16width */
+        HEVC_PCK_SW_SB8(dst4_l, dst4_r, dst6_l, dst6_r,
+                        dst5_l, dst5_r, dst7_l, dst7_r, dst4_r, dst5_r);
+        ST_SW2(dst4_r, dst5_r, dst_tmp, dst_stride);
+        dst_tmp += (2 * dst_stride);
+
+        src76_r = src98_r; /* slide the second half's row window as well */
+        src87_r = src109_r;
+        src76_l = src98_l;
+        src87_l = src109_l;
+        src8 = src10;
+    }
+}
+
+static void hevc_hv_biwgt_4t_4x2_msa(uint8_t *src0_ptr, /* 8-bit rows filtered horizontally, then vertically */
+                                     int32_t src_stride, /* src0_ptr row pitch, in bytes */
+                                     int16_t *src1_ptr, /* 16-bit rows fed to the weighting step */
+                                     int32_t src2_stride, /* src1_ptr row pitch, in int16_t elements */
+                                     uint8_t *dst, /* 8-bit output rows */
+                                     int32_t dst_stride, /* dst row pitch, in bytes */
+                                     const int8_t *filter_x, /* horizontal taps; halfwords 0 and 1 are splatted */
+                                     const int8_t *filter_y, /* vertical taps; widened to 16 bits below */
+                                     int32_t height, /* not read by this function */
+                                     int32_t weight0, /* ends up in the low 16 bits of every weight lane */
+                                     int32_t weight1, /* ends up in the high 16 bits of every weight lane */
+                                     int32_t offset0, /* added to offset1 */
+                                     int32_t offset1, /* added to offset0 */
+                                     int32_t rnd_val) /* scales the offset; rnd_val + 1 is the final shift */
+{ /* Horizontal + vertical 4-tap bi-weighted kernel producing one 4x2 block (no loop). */
+    int32_t offset, weight;
+    v8i16 in0, in1;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; /* neighbouring byte pairs (0,1) (1,2) ... (7,8) */
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4;
+    v4i32 dst0_r, dst1_r, dst0_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= (src_stride + 1); /* step back one row and one byte */
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); /* all-ones in every byte that is negative */
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); /* interleaving with that mask sign-extends the taps to 16 bits */
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); /* presumably one 16-bit tap pair per vector */
+
+    mask1 = mask0 + 2; /* same pair pattern, two bytes further along */
+
+    offset = (offset0 + offset1) << rnd_val; /* combined offset, scaled up by rnd_val bits */
+    weight0 = weight0 & 0x0000FFFF; /* keep only the low 16 bits so the OR below leaves weight1 intact */
+    weight = weight0 | (weight1 << 16); /* both weights packed into one 32-bit word */
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* 128 << 6 in every halfword; seeds each horizontal accumulator */
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); /* rows 0..2: horizontal pass */
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r); /* pair up adjacent rows for the vertical pass */
+
+    LD_SB2(src0_ptr, src_stride, src3, src4);
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0); /* low 64 bits of both rows packed into one vector */
+    XORI_B2_128_SB(src3, src4);
+    /* row 3: horizontal pass, then vertical filter over rows 0..3 */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+    dst32_r = __msa_ilvr_h(dst3, dst2);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_r >>= 6;
+    /* row 4: horizontal pass, then vertical filter over rows 1..4 */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+    dst43_r = __msa_ilvr_h(dst4, dst3);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_r >>= 6;
+    dst1_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r); /* both output rows narrowed into one halfword vector */
+
+    ILVRL_H2_SW(dst1_r, in0, dst0_r, dst0_l); /* interleave filtered samples with in0 so each word holds one pair */
+    dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec); /* offset + dot product with the packed weights */
+    dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);
+    SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
+    dst0_r = CLIP_SW_0_255(dst0_r);
+    dst0_l = CLIP_SW_0_255(dst0_l);
+
+    HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r);
+    ST4x2_UB(dst0_r, dst, dst_stride);
+}
+
+static void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr, /* 8-bit rows filtered horizontally, then vertically */
+                                     int32_t src_stride, /* src0_ptr row pitch, in bytes */
+                                     int16_t *src1_ptr, /* 16-bit rows fed to the weighting step */
+                                     int32_t src2_stride, /* src1_ptr row pitch, in int16_t elements */
+                                     uint8_t *dst, /* 8-bit output rows */
+                                     int32_t dst_stride, /* dst row pitch, in bytes */
+                                     const int8_t *filter_x, /* horizontal taps; halfwords 0 and 1 are splatted */
+                                     const int8_t *filter_y, /* vertical taps; widened to 16 bits below */
+                                     int32_t height, /* not read by this function */
+                                     int32_t weight0, /* ends up in the low 16 bits of every weight lane */
+                                     int32_t weight1, /* ends up in the high 16 bits of every weight lane */
+                                     int32_t offset0, /* added to offset1 */
+                                     int32_t offset1, /* added to offset0 */
+                                     int32_t rnd_val) /* scales the offset; rnd_val + 1 fills rnd_vec */
+{ /* Horizontal + vertical 4-tap bi-weighted kernel producing one 4x4 block (no loop). */
+    int32_t offset, weight;
+    v8i16 in0, in1, in2, in3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; /* neighbouring byte pairs (0,1) (1,2) ... (7,8) */
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 tmp0, tmp1;
+    v4i32 dst0_l, dst1_l;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= (src_stride + 1); /* step back one row and one byte */
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); /* all-ones in every byte that is negative */
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); /* interleaving with that mask sign-extends the taps to 16 bits */
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); /* presumably one 16-bit tap pair per vector */
+
+    mask1 = mask0 + 2; /* same pair pattern, two bytes further along */
+
+    offset = (offset0 + offset1) << rnd_val; /* combined offset, scaled up by rnd_val bits */
+    weight0 = weight0 & 0x0000FFFF; /* keep only the low 16 bits so the OR below leaves weight1 intact */
+    weight = weight0 | (weight1 << 16); /* both weights packed into one 32-bit word */
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* 128 << 6 in every halfword; seeds each horizontal accumulator */
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); /* rows 0..2: horizontal pass */
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r); /* pair up adjacent rows for the vertical pass */
+
+    LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
+    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+    ILVR_D2_SH(in1, in0, in3, in2, in0, in1); /* four 16-bit input rows packed two per vector */
+    XORI_B4_128_SB(src3, src4, src5, src6);
+    /* row 3: horizontal pass, then vertical filter over rows 0..3 */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+    dst32_r = __msa_ilvr_h(dst3, dst2);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_r >>= 6;
+    /* row 4: horizontal pass, then vertical filter over rows 1..4 */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+    dst43_r = __msa_ilvr_h(dst4, dst3);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_r >>= 6;
+    /* row 5: the dst10_r name is reused for the row 4/5 pair */
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+    dst5 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+    dst10_r = __msa_ilvr_h(dst5, dst4);
+    dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+    dst2_r >>= 6;
+    /* row 6: the dst2 and dst21_r names are reused for row 6 and the row 5/6 pair */
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+    dst21_r = __msa_ilvr_h(dst2, dst5);
+    dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+    dst3_r >>= 6;
+    PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1); /* four output rows narrowed to halfwords, two per vector */
+    HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
+                       weight_vec, rnd_vec, offset_vec,
+                       dst0_r, dst1_r, dst0_l, dst1_l);
+
+    HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+    ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr, /* 8-bit rows filtered horizontally, then vertically */
+                                             int32_t src_stride, /* src0_ptr row pitch, in bytes */
+                                             int16_t *src1_ptr, /* 16-bit rows fed to the weighting step */
+                                             int32_t src2_stride, /* src1_ptr row pitch, in int16_t elements */
+                                             uint8_t *dst, /* 8-bit output rows */
+                                             int32_t dst_stride, /* dst row pitch, in bytes */
+                                             const int8_t *filter_x, /* horizontal taps; halfwords 0 and 1 are splatted */
+                                             const int8_t *filter_y, /* vertical taps; widened to 16 bits below */
+                                             int32_t height, /* row count; consumed eight rows per loop pass */
+                                             int32_t weight0, /* ends up in the low 16 bits of every weight lane */
+                                             int32_t weight1, /* ends up in the high 16 bits of every weight lane */
+                                             int32_t offset0, /* added to offset1 */
+                                             int32_t offset1, /* added to offset0 */
+                                             int32_t rnd_val) /* scales the offset; rnd_val + 1 fills rnd_vec */
+{ /* Horizontal + vertical 4-tap bi-weighted kernel for 4-column blocks, eight rows per pass. */
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; /* neighbouring byte pairs (0,1) (1,2) ... (7,8) */
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= (src_stride + 1); /* step back one row and one byte */
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); /* all-ones in every byte that is negative */
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); /* interleaving with that mask sign-extends the taps to 16 bits */
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); /* presumably one 16-bit tap pair per vector */
+
+    mask1 = mask0 + 2; /* same pair pattern, two bytes further along */
+
+    offset = (offset0 + offset1) << rnd_val; /* combined offset, scaled up by rnd_val bits */
+    weight0 = weight0 & 0x0000FFFF; /* keep only the low 16 bits so the OR below leaves weight1 intact */
+    weight = weight0 | (weight1 << 16); /* both weights packed into one 32-bit word */
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* 128 << 6 in every halfword; seeds each horizontal accumulator */
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1); /* rows 0..2: horizontal pass */
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r); /* pair up adjacent rows for the vertical pass */
+
+    for (loop_cnt = height >> 3; loop_cnt--;) { /* eight output rows per pass */
+        LD_SB8(src0_ptr, src_stride,
+               src3, src4, src5, src6, src7, src8, src9, src10);
+        src0_ptr += (8 * src_stride);
+        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
+        src1_ptr += (8 * src2_stride);
+        ILVR_D2_SH(in1, in0, in3, in2, in0, in1); /* eight 16-bit input rows packed two per vector */
+        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+        /* row 3 */
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        dst32_r = __msa_ilvr_h(dst3, dst2);
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_r >>= 6;
+        /* row 4 */
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+        dst43_r = __msa_ilvr_h(dst4, dst3);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_r >>= 6;
+        /* row 5 */
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+        dst54_r = __msa_ilvr_h(dst5, dst4);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst2_r >>= 6;
+        /* row 6 */
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst6 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+        dst65_r = __msa_ilvr_h(dst6, dst5);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst3_r >>= 6;
+        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1); /* row 7 */
+        dst7 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+        dst76_r = __msa_ilvr_h(dst7, dst6);
+        dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+        dst4_r >>= 6;
+        VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1); /* row 8 */
+        dst8 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
+        dst87_r = __msa_ilvr_h(dst8, dst7);
+        dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+        dst5_r >>= 6;
+        VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1); /* row 9 */
+        dst9 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9);
+        dst10_r = __msa_ilvr_h(dst9, dst8); /* row 8/9 pair kept under the dst10_r name for the next pass */
+        dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1);
+        dst6_r >>= 6;
+        VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1); /* row 10 */
+        dst2 = const_vec; /* row 10 becomes dst2 of the next pass */
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        dst21_r = __msa_ilvr_h(dst2, dst9); /* row 9/10 pair kept under the dst21_r name for the next pass */
+        dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1);
+        dst7_r >>= 6;
+        PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
+                    dst5_r, dst4_r, dst7_r, dst6_r, tmp0, tmp1, tmp2, tmp3);
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_hv_biwgt_4t_4w_msa(uint8_t *src0_ptr, /* all arguments are forwarded unchanged */
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter_x,
+                                    const int8_t *filter_y,
+                                    int32_t height, /* selects the kernel: 2, 4, or a multiple of 8 */
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{ /* Height-based dispatcher for the 4-column hv bi-weighted 4-tap kernels. */
+    if (2 == height) { /* single 4x2 block */
+        hevc_hv_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter_x, filter_y,
+                                 height, weight0, weight1, offset0, offset1,
+                                 rnd_val);
+    } else if (4 == height) { /* single 4x4 block */
+        hevc_hv_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter_x, filter_y,
+                                 height, weight0, weight1, offset0, offset1,
+                                 rnd_val);
+    } else if (0 == (height % 8)) { /* looped kernel, eight rows per pass */
+        hevc_hv_biwgt_4t_4multx8mult_msa(src0_ptr, src_stride,
+                                         src1_ptr, src2_stride,
+                                         dst, dst_stride, filter_x, filter_y,
+                                         height, weight0, weight1,
+                                         offset0, offset1, rnd_val);
+    } /* any other height matches no branch, so nothing is written to dst */
+}
+
+static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr,
+                                    int32_t src_stride,
+                                    int16_t *src1_ptr,
+                                    int32_t src2_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    const int8_t *filter_x,
+                                    const int8_t *filter_y,
+                                    int32_t height,
+                                    int32_t weight0,
+                                    int32_t weight1,
+                                    int32_t offset0,
+                                    int32_t offset1,
+                                    int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    int32_t offset, weight;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16);
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+    for (loop_cnt = height >> 2; loop_cnt--;) {
+        LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
+        src0_ptr += (4 * src_stride);
+        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
+        src1_ptr += (4 * src2_stride);
+        XORI_B4_128_SB(src3, src4, src5, src6);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+        dst0_r >>= 6;
+        dst0_l >>= 6;
+
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+        dst1_r >>= 6;
+        dst1_l >>= 6;
+
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+        ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+        dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
+        dst2_r >>= 6;
+        dst2_l >>= 6;
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+        dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
+        dst3_r >>= 6;
+        dst3_l >>= 6;
+        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
+                    dst2_l, dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
+        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                           in0, in1, in2, in3,
+                           weight_vec, rnd_vec, offset_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r,
+                           dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
+                                     int32_t src_stride,
+                                     int16_t *src1_ptr,
+                                     int32_t src2_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height, /* not read: fixed 8x2 */
+                                     int32_t weight0,
+                                     int32_t weight1,
+                                     int32_t offset0,
+                                     int32_t offset1,
+                                     int32_t rnd_val)
+{
+    int32_t weight, offset;
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4;
+    v8i16 in0, in1;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    v8i16 tmp0, tmp1;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    /* 8x2 block: 4-tap H+V filter on src0_ptr, bi-weighted with src1_ptr. */
+    src0_ptr -= (src_stride + 1); /* one row up, one column left */
+    /* filter_x feeds the in-row pass, filter_y the across-row pass. */
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* Sign-extend filter_y bytes to 16 bit via the "< 0" byte mask. */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+    /* mask0 pairs bytes (n, n + 1); mask1 pairs bytes (n + 2, n + 3). */
+    mask1 = mask0 + 2;
+    /* NOTE(review): "<<" of a negative value is undefined in ISO C. */
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16); /* weight1 in the high half */
+    /* 128 << 6 per halfword; presumably cancels the XORI-128 bias - confirm. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    /* In-row pass over the first three rows primes the across-row window. */
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    /* Pair up neighbouring filtered rows for the across-row taps. */
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+    /* Two more source rows plus the two rows read from src1_ptr. */
+    LD_SB2(src0_ptr, src_stride, src3, src4);
+
+    LD_SH2(src1_ptr, src2_stride, in0, in1);
+    XORI_B2_128_SB(src3, src4);
+    /* First output row (tmp0). */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+    dst0_r >>= 6;
+    dst0_l >>= 6;
+    tmp0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
+    /* Second output row (tmp1). */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    dst1_r >>= 6;
+    dst1_l >>= 6;
+    tmp1 = __msa_pckev_h((v8i16) dst1_l, (v8i16) dst1_r);
+    /* HEVC_BIW_RND_CLIP2 combines tmp0/tmp1 with in0/in1; then pack, store. */
+    HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
+                       weight_vec, rnd_vec, offset_vec,
+                       dst0_r, dst1_r, dst0_l, dst1_l);
+    HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+    ST8x2_UB(dst0_r, dst, dst_stride);
+}
+
+/*
+ * 4-tap horizontal + vertical filtering of an 8x6 block from src0_ptr,
+ * bi-weighted against the six 16-bit rows at src1_ptr and stored to dst
+ * as 8-bit pixels.  Nine source rows are read; 'height' is not used
+ * because the block size is fixed.
+ */
+static void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr, int32_t src_stride,
+                                     int16_t *src1_ptr, int32_t src2_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height, int32_t weight0,
+                                     int32_t weight1, int32_t offset0,
+                                     int32_t offset1, int32_t rnd_val)
+{
+    int32_t offset, weight; /* signed: matches the sibling kernels */
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
+    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
+    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
+    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
+    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
+    v8i16 in0, in1, in2, in3, in4, in5;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src0_ptr -= (src_stride + 1); /* one row up, one column left */
+    /* filter_x feeds the in-row pass, filter_y the across-row pass. */
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* Sign-extend filter_y bytes to 16 bit via the "< 0" byte mask. */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+    /* mask0 pairs bytes (n, n + 1); mask1 pairs bytes (n + 2, n + 3). */
+    mask1 = mask0 + 2;
+    /* NOTE(review): "<<" of a negative value is undefined in ISO C. */
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16); /* weight1 in the high half */
+    /* 128 << 6 per halfword; presumably cancels the XORI-128 bias - confirm. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    /* In-row pass over the first three rows primes the across-row window. */
+    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
+    src0_ptr += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    /* Pair up neighbouring filtered rows for the across-row taps. */
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+    /* Source rows 3 and 4 yield tmp0 and tmp1; all six src1 rows load now. */
+    LD_SB2(src0_ptr, src_stride, src3, src4);
+    src0_ptr += (2 * src_stride);
+    XORI_B2_128_SB(src3, src4);
+    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+    dst0_r >>= 6;
+    dst0_l >>= 6;
+    tmp0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
+
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    dst1_r >>= 6;
+    dst1_l >>= 6;
+    tmp1 = __msa_pckev_h((v8i16) dst1_l, (v8i16) dst1_r);
+    /* Source rows 5 and 6 yield tmp2 and tmp3. */
+    LD_SB2(src0_ptr, src_stride, src5, src6);
+    src0_ptr += (2 * src_stride);
+    XORI_B2_128_SB(src5, src6);
+
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+    dst5 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
+    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+    dst2_r >>= 6;
+    dst2_l >>= 6;
+    tmp2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
+
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+    dst6 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+
+    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
+    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+    dst3_r >>= 6;
+    dst3_l >>= 6;
+    tmp3 = __msa_pckev_h((v8i16) dst3_l, (v8i16) dst3_r);
+    /* HEVC_BIW_RND_CLIP4 combines tmp0..3 with in0..3; pack, store 4 rows. */
+    HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                       in0, in1, in2, in3,
+                       weight_vec, rnd_vec, offset_vec,
+                       dst0_r, dst1_r, dst2_r, dst3_r,
+                       dst0_l, dst1_l, dst2_l, dst3_l);
+
+    HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                    dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* Source rows 7 and 8 yield tmp4 and tmp5. */
+    LD_SB2(src0_ptr, src_stride, src7, src8);
+    XORI_B2_128_SB(src7, src8);
+
+    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+    dst7 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+
+    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
+    dst4_r >>= 6;
+    dst4_l >>= 6;
+    tmp4 = __msa_pckev_h((v8i16) dst4_l, (v8i16) dst4_r);
+
+    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
+    dst8 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
+
+    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
+    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
+    dst5_r >>= 6;
+    dst5_l >>= 6;
+    tmp5 = __msa_pckev_h((v8i16) dst5_l, (v8i16) dst5_r);
+    /* HEVC_BIW_RND_CLIP2 combines tmp4/tmp5 with in4/in5; pack, store 2 rows. */
+    HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
+                       weight_vec, rnd_vec, offset_vec,
+                       dst4_r, dst5_r, dst4_l, dst5_l);
+
+    HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst2_r);
+    ST8x2_UB(dst2_r, dst, dst_stride);
+}
+
+static void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr,
+                                             int32_t src_stride,
+                                             int16_t *src1_ptr,
+                                             int32_t src2_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const int8_t *filter_x,
+                                             const int8_t *filter_y,
+                                             int32_t height,
+                                             int32_t weight0,
+                                             int32_t weight1,
+                                             int32_t offset0,
+                                             int32_t offset1,
+                                             int32_t rnd_val,
+                                             int32_t width)
+{
+    uint32_t loop_cnt;
+    uint32_t cnt;
+    int32_t offset, weight;
+    uint8_t *src0_ptr_tmp;
+    int16_t *src1_ptr_tmp;
+    uint8_t *dst_tmp;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 in0, in1, in2, in3;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    /* Walks width >> 3 strips of 8 columns, each in groups of 4 rows. */
+    src0_ptr -= (src_stride + 1); /* one row up, one column left */
+    /* filter_x feeds the in-row pass, filter_y the across-row pass. */
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* Sign-extend filter_y bytes to 16 bit via the "< 0" byte mask. */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+    /* mask0 pairs bytes (n, n + 1); mask1 pairs bytes (n + 2, n + 3). */
+    mask1 = mask0 + 2;
+    /* NOTE(review): "<<" of a negative value is undefined in ISO C. */
+    offset = (offset0 + offset1) << rnd_val;
+    weight0 = weight0 & 0x0000FFFF;
+    weight = weight0 | (weight1 << 16); /* weight1 in the high half */
+    /* 128 << 6 per halfword; presumably cancels the XORI-128 bias - confirm. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    offset_vec = __msa_fill_w(offset);
+    weight_vec = __msa_fill_w(weight);
+    rnd_vec = __msa_fill_w(rnd_val + 1);
+    /* One pass per 8-column strip; width % 8 leftover columns are skipped. */
+    for (cnt = width >> 3; cnt--;) {
+        src0_ptr_tmp = src0_ptr;
+        src1_ptr_tmp = src1_ptr;
+        dst_tmp = dst;
+        /* In-row pass over the strip's first three rows. */
+        LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
+        src0_ptr_tmp += (3 * src_stride);
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+        /* Pair up neighbouring filtered rows for the across-row taps. */
+        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+        /* Four output rows per pass; height % 4 leftover rows are skipped. */
+        for (loop_cnt = height >> 2; loop_cnt--;) {
+            LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
+            src0_ptr_tmp += (4 * src_stride);
+            LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
+            src1_ptr_tmp += (4 * src2_stride);
+            XORI_B4_128_SB(src3, src4, src5, src6);
+
+            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+            dst3 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+            dst0_r >>= 6;
+            dst0_l >>= 6;
+
+            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+            dst4 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+            dst1_r >>= 6;
+            dst1_l >>= 6;
+
+            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+            dst5 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+            /* dst10_* is reused for the (dst4, dst5) pair; seeds next pass. */
+            ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
+            dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+            dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
+            dst2_r >>= 6;
+            dst2_l >>= 6;
+
+            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+            dst2 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+            /* dst2 and dst21_* likewise carry the newest rows forward. */
+            ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
+            dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+            dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
+            dst3_r >>= 6;
+            dst3_l >>= 6;
+
+            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
+            HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                               in0, in1, in2, in3,
+                               weight_vec, rnd_vec, offset_vec,
+                               dst0_r, dst1_r, dst2_r, dst3_r,
+                               dst0_l, dst1_l, dst2_l, dst3_l);
+
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                            dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+            ST8x4_UB(dst0_r, dst1_r, dst_tmp, dst_stride);
+            dst_tmp += (4 * dst_stride);
+        }
+        /* Move all three pointers to the next 8-column strip. */
+        src0_ptr += 8;
+        dst += 8;
+        src1_ptr += 8;
+    }
+}
+
+/*
+ * 8-wide 4-tap HV bi-weighted prediction.  Heights 2 and 6 use fixed-size
+ * kernels, multiples of 4 the strip kernel; other heights write nothing.
+ */
+static void hevc_hv_biwgt_4t_8w_msa(uint8_t *src0_ptr, int32_t src_stride,
+                                    int16_t *src1_ptr, int32_t src2_stride,
+                                    uint8_t *dst, int32_t dst_stride,
+                                    const int8_t *filter_x,
+                                    const int8_t *filter_y,
+                                    int32_t height, int32_t weight0,
+                                    int32_t weight1, int32_t offset0,
+                                    int32_t offset1, int32_t rnd_val)
+{
+    switch (height) {
+    case 2:
+        hevc_hv_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter_x, filter_y, height,
+                                 weight0, weight1, offset0, offset1, rnd_val);
+        break;
+    case 6:
+        hevc_hv_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                 dst, dst_stride, filter_x, filter_y, height,
+                                 weight0, weight1, offset0, offset1, rnd_val);
+        break;
+    default:
+        if (0 == (height % 4)) {
+            hevc_hv_biwgt_4t_8multx4mult_msa(
+                src0_ptr, src_stride, src1_ptr, src2_stride, dst, dst_stride,
+                filter_x, filter_y, height, weight0, weight1, offset0,
+                offset1, rnd_val, 8);
+        }
+    }
+}
+
+/*
+ * 12-wide 4-tap HV bi-weighted prediction: columns 0..7 go through the
+ * 8-column strip kernel, the part starting at column 8 through the 4w one.
+ */
+static void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr, int32_t src_stride,
+                                     int16_t *src1_ptr, int32_t src2_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height, int32_t weight0,
+                                     int32_t weight1, int32_t offset0,
+                                     int32_t offset1, int32_t rnd_val)
+{
+    uint8_t *src0_right = src0_ptr + 8;
+    int16_t *src1_right = src1_ptr + 8;
+    uint8_t *dst_right = dst + 8;
+
+    hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr,
+                                     src2_stride, dst, dst_stride,
+                                     filter_x, filter_y, height, weight0,
+                                     weight1, offset0, offset1, rnd_val, 8);
+
+    hevc_hv_biwgt_4t_4w_msa(src0_right, src_stride, src1_right, src2_stride,
+                            dst_right, dst_stride, filter_x, filter_y, height,
+                            weight0, weight1, offset0, offset1, rnd_val);
+}
+
+/*
+ * 16-wide 4-tap HV bi-weighted prediction: forwards to the 8-column strip
+ * kernel with a width of 16.
+ */
+static void hevc_hv_biwgt_4t_16w_msa(uint8_t *src0_ptr, int32_t src_stride,
+                                     int16_t *src1_ptr, int32_t src2_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height, int32_t weight0,
+                                     int32_t weight1, int32_t offset0,
+                                     int32_t offset1, int32_t rnd_val)
+{
+    const int32_t block_width = 16;
+
+    hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr,
+                                     src2_stride, dst, dst_stride,
+                                     filter_x, filter_y, height, weight0,
+                                     weight1, offset0, offset1, rnd_val,
+                                     block_width);
+}
+
+/*
+ * 24-wide 4-tap HV bi-weighted prediction: forwards to the 8-column strip
+ * kernel with a width of 24.
+ */
+static void hevc_hv_biwgt_4t_24w_msa(uint8_t *src0_ptr, int32_t src_stride,
+                                     int16_t *src1_ptr, int32_t src2_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height, int32_t weight0,
+                                     int32_t weight1, int32_t offset0,
+                                     int32_t offset1, int32_t rnd_val)
+{
+    const int32_t block_width = 24;
+
+    hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr,
+                                     src2_stride, dst, dst_stride,
+                                     filter_x, filter_y, height, weight0,
+                                     weight1, offset0, offset1, rnd_val,
+                                     block_width);
+}
+
+/*
+ * 32-wide 4-tap HV bi-weighted prediction: forwards to the 8-column strip
+ * kernel with a width of 32.
+ */
+static void hevc_hv_biwgt_4t_32w_msa(uint8_t *src0_ptr, int32_t src_stride,
+                                     int16_t *src1_ptr, int32_t src2_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height, int32_t weight0,
+                                     int32_t weight1, int32_t offset0,
+                                     int32_t offset1, int32_t rnd_val)
+{
+    const int32_t block_width = 32;
+
+    hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr,
+                                     src2_stride, dst, dst_stride,
+                                     filter_x, filter_y, height, weight0,
+                                     weight1, offset0, offset1, rnd_val,
+                                     block_width);
+}
+
+/*
+ * Generates ff_hevc_put_hevc_bi_w_pel_pixels<WIDTH>_8_msa(), the exported
+ * bi-weighted wrapper for one block width.
+ *
+ * The wrapper computes log2Wd = denom + (14 + 1 - 8) - 1 and forwards to
+ * hevc_biwgt_copy_<WIDTH>w_msa(), passing MAX_PB_SIZE in the argument slot
+ * after src_16bit.  mx, my and width are accepted but not used.
+ */
+#define BI_W_MC_COPY(WIDTH)                                                  \
+void ff_hevc_put_hevc_bi_w_pel_pixels##WIDTH##_8_msa(                        \
+    uint8_t *dst, ptrdiff_t dst_stride,                                      \
+    uint8_t *src, ptrdiff_t src_stride,                                      \
+    int16_t *src_16bit, int height,                                          \
+    int denom,                                                               \
+    int weight0, int weight1,                                                \
+    int offset0, int offset1,                                                \
+    intptr_t mx, intptr_t my, int width)                                     \
+{                                                                            \
+    const int log2Wd = denom + (14 + 1 - 8) - 1;                             \
+                                                                             \
+    hevc_biwgt_copy_##WIDTH##w_msa(                                          \
+        src, src_stride, src_16bit, MAX_PB_SIZE, dst, dst_stride,            \
+        height, weight0, weight1, offset0, offset1, log2Wd);                 \
+}
+
+BI_W_MC_COPY(4);
+BI_W_MC_COPY(6);
+BI_W_MC_COPY(8);
+BI_W_MC_COPY(12);
+BI_W_MC_COPY(16);
+BI_W_MC_COPY(24);
+BI_W_MC_COPY(32);
+BI_W_MC_COPY(48);
+BI_W_MC_COPY(64);
+
+#undef BI_W_MC_COPY
+
+/*
+ * Generates ff_hevc_put_hevc_bi_w_<PEL>_<DIR><WIDTH>_8_msa(), an exported
+ * bi-weighted wrapper around a kernel that takes a single filter.
+ *
+ *   PEL      - selects the filter table ff_hevc_<PEL>_filters
+ *   DIR      - direction tag in the exported name
+ *   WIDTH    - block width, part of both the exported and the kernel name
+ *   TAP      - tap count in the kernel name
+ *   DIR1     - direction prefix in the kernel name
+ *   FILT_DIR - wrapper argument (mx or my); FILT_DIR - 1 is the table row
+ *
+ * log2Wd is denom + (14 + 1 - 8) - 1; width is accepted but not used.
+ */
+#define BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
+void ff_hevc_put_hevc_bi_w_##PEL##_##DIR##WIDTH##_8_msa(                       \
+    uint8_t *dst, ptrdiff_t dst_stride,                                        \
+    uint8_t *src, ptrdiff_t src_stride,                                        \
+    int16_t *src_16bit, int height,                                            \
+    int denom,                                                                 \
+    int weight0, int weight1,                                                  \
+    int offset0, int offset1,                                                  \
+    intptr_t mx, intptr_t my, int width)                                       \
+{                                                                              \
+    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];              \
+    const int log2Wd = denom + (14 + 1 - 8) - 1;                               \
+                                                                               \
+    hevc_##DIR1##_biwgt_##TAP##t_##WIDTH##w_msa(                               \
+        src, src_stride, src_16bit, MAX_PB_SIZE, dst, dst_stride,              \
+        filter, height, weight0, weight1, offset0, offset1, log2Wd);           \
+}
+
+BI_W_MC(qpel, h, 4, 8, hz, mx);
+BI_W_MC(qpel, h, 8, 8, hz, mx);
+BI_W_MC(qpel, h, 12, 8, hz, mx);
+BI_W_MC(qpel, h, 16, 8, hz, mx);
+BI_W_MC(qpel, h, 24, 8, hz, mx);
+BI_W_MC(qpel, h, 32, 8, hz, mx);
+BI_W_MC(qpel, h, 48, 8, hz, mx);
+BI_W_MC(qpel, h, 64, 8, hz, mx);
+
+BI_W_MC(qpel, v, 4, 8, vt, my);
+BI_W_MC(qpel, v, 8, 8, vt, my);
+BI_W_MC(qpel, v, 12, 8, vt, my);
+BI_W_MC(qpel, v, 16, 8, vt, my);
+BI_W_MC(qpel, v, 24, 8, vt, my);
+BI_W_MC(qpel, v, 32, 8, vt, my);
+BI_W_MC(qpel, v, 48, 8, vt, my);
+BI_W_MC(qpel, v, 64, 8, vt, my);
+
+BI_W_MC(epel, h, 4, 4, hz, mx);
+BI_W_MC(epel, h, 8, 4, hz, mx);
+BI_W_MC(epel, h, 6, 4, hz, mx);
+BI_W_MC(epel, h, 12, 4, hz, mx);
+BI_W_MC(epel, h, 16, 4, hz, mx);
+BI_W_MC(epel, h, 24, 4, hz, mx);
+BI_W_MC(epel, h, 32, 4, hz, mx);
+
+BI_W_MC(epel, v, 4, 4, vt, my);
+BI_W_MC(epel, v, 8, 4, vt, my);
+BI_W_MC(epel, v, 6, 4, vt, my);
+BI_W_MC(epel, v, 12, 4, vt, my);
+BI_W_MC(epel, v, 16, 4, vt, my);
+BI_W_MC(epel, v, 24, 4, vt, my);
+BI_W_MC(epel, v, 32, 4, vt, my);
+
+#undef BI_W_MC
+
+/*
+ * Generates ff_hevc_put_hevc_bi_w_<PEL>_<DIR><WIDTH>_8_msa() for kernels
+ * that take separate filter_x and filter_y arguments.
+ *
+ *   PEL   - selects the filter table ff_hevc_<PEL>_filters
+ *   DIR   - direction tag in the exported name
+ *   WIDTH - block width, part of both the exported and the kernel name
+ *   TAP   - tap count in the kernel name
+ *   DIR1  - direction prefix in the kernel name
+ *
+ * filter_x is table row mx - 1, filter_y is table row my - 1.
+ */
+#define BI_W_MC_HV(PEL, DIR, WIDTH, TAP, DIR1)                                 \
+void ff_hevc_put_hevc_bi_w_##PEL##_##DIR##WIDTH##_8_msa(                       \
+    uint8_t *dst, ptrdiff_t dst_stride,                                        \
+    uint8_t *src, ptrdiff_t src_stride,                                        \
+    int16_t *src_16bit, int height,                                            \
+    int denom,                                                                 \
+    int weight0, int weight1,                                                  \
+    int offset0, int offset1,                                                  \
+    intptr_t mx, intptr_t my, int width)                                       \
+{                                                                              \
+    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];                  \
+    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];                  \
+    const int log2Wd = denom + (14 + 1 - 8) - 1;                               \
+                                                                               \
+    hevc_##DIR1##_biwgt_##TAP##t_##WIDTH##w_msa(                               \
+        src, src_stride, src_16bit, MAX_PB_SIZE, dst, dst_stride,              \
+        filter_x, filter_y, height,                                            \
+        weight0, weight1, offset0, offset1, log2Wd);                           \
+}
+
+BI_W_MC_HV(qpel, hv, 4, 8, hv);
+BI_W_MC_HV(qpel, hv, 8, 8, hv);
+BI_W_MC_HV(qpel, hv, 12, 8, hv);
+BI_W_MC_HV(qpel, hv, 16, 8, hv);
+BI_W_MC_HV(qpel, hv, 24, 8, hv);
+BI_W_MC_HV(qpel, hv, 32, 8, hv);
+BI_W_MC_HV(qpel, hv, 48, 8, hv);
+BI_W_MC_HV(qpel, hv, 64, 8, hv);
+
+BI_W_MC_HV(epel, hv, 4, 4, hv);
+BI_W_MC_HV(epel, hv, 8, 4, hv);
+BI_W_MC_HV(epel, hv, 6, 4, hv);
+BI_W_MC_HV(epel, hv, 12, 4, hv);
+BI_W_MC_HV(epel, hv, 16, 4, hv);
+BI_W_MC_HV(epel, hv, 24, 4, hv);
+BI_W_MC_HV(epel, hv, 32, 4, hv);
+
+#undef BI_W_MC_HV
diff --git a/libavcodec/mips/hevc_mc_uni_msa.c b/libavcodec/mips/hevc_mc_uni_msa.c
new file mode 100644
index 0000000..754fbdb
--- /dev/null
+++ b/libavcodec/mips/hevc_mc_uni_msa.c
@@ -0,0 +1,3964 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "libavcodec/mips/hevcdsp_mips.h"
+#include "libavcodec/mips/hevc_macros_msa.h"
+/* Copy an 8-byte-wide column of `height` rows from src to dst. */
+static void copy_width8_msa(uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            int32_t height)
+{
+    int32_t cnt;
+    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    /* Use the first unroll of 12, 8, 4 or 2 rows that divides height. */
+    if (0 == height % 12) {
+        for (cnt = (height / 12); cnt--;) {
+            LD_UB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+            /* Only doubleword element 0 of each loaded vector is kept. */
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+            out4 = __msa_copy_u_d((v2i64) src4, 0);
+            out5 = __msa_copy_u_d((v2i64) src5, 0);
+            out6 = __msa_copy_u_d((v2i64) src6, 0);
+            out7 = __msa_copy_u_d((v2i64) src7, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+            SD4(out4, out5, out6, out7, dst, dst_stride);
+            dst += (4 * dst_stride);
+            /* Last 4 rows of this 12-row group. */
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 8) {
+        for (cnt = height >> 3; cnt--;) {
+            LD_UB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+            out4 = __msa_copy_u_d((v2i64) src4, 0);
+            out5 = __msa_copy_u_d((v2i64) src5, 0);
+            out6 = __msa_copy_u_d((v2i64) src6, 0);
+            out7 = __msa_copy_u_d((v2i64) src7, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+            SD4(out4, out5, out6, out7, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 4) {
+        for (cnt = (height / 4); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+            out2 = __msa_copy_u_d((v2i64) src2, 0);
+            out3 = __msa_copy_u_d((v2i64) src3, 0);
+
+            SD4(out0, out1, out2, out3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 2) {
+        for (cnt = (height / 2); cnt--;) {
+            LD_UB2(src, src_stride, src0, src1);
+            src += (2 * src_stride);
+            out0 = __msa_copy_u_d((v2i64) src0, 0);
+            out1 = __msa_copy_u_d((v2i64) src1, 0);
+
+            SD(out0, dst);
+            dst += dst_stride;
+            SD(out1, dst);
+            dst += dst_stride;
+        }
+    } /* odd heights match no branch: nothing is copied */
+}
+/* Copy 16 rows in two LD_UB8/ST12x8_UB passes; `height` is never read. */
+static void copy_width12_msa(uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    /* Rows 0..7. */
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
+    dst += (8 * dst_stride); /* rows 8..15 follow */
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
+}
+/* Copy a block as 16-byte-wide column strips, 8 rows per inner iteration. */
+static void copy_16multx8mult_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  int32_t height, int32_t width)
+{
+    int32_t cnt, loop_cnt;
+    uint8_t *src_tmp, *dst_tmp;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    /* width >> 4 strips: a width remainder below 16 is not copied. */
+    for (cnt = (width >> 4); cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+        /* height >> 3 groups: a height remainder below 8 is not copied. */
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD_UB8(src_tmp, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src_tmp += (8 * src_stride);
+
+            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
+                   dst_tmp, dst_stride);
+            dst_tmp += (8 * dst_stride);
+        }
+
+        src += 16; /* next 16-byte strip */
+        dst += 16;
+    }
+}
+/* Copy a 16-byte-wide block; only heights divisible by 4 are copied. */
+static void copy_width16_msa(uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    /* % 12 is tested before % 8, so a height of 24 takes the 8 + 4 row path. */
+    if (0 == height % 12) {
+        for (cnt = (height / 12); cnt--;) {
+            LD_UB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
+                   dst, dst_stride);
+            dst += (8 * dst_stride);
+            /* Last 4 rows of this 12-row group. */
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 8) {
+        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+    } else if (0 == height % 4) {
+        for (cnt = (height >> 2); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } /* any other height matches no branch: nothing is copied */
+}
+/* Copy 24 columns: 16 via the 8-row strip helper, then 8 via copy_width8. */
+static void copy_width24_msa(uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{   /* the 16-wide helper only copies whole groups of 8 rows (height >> 3) */
+    copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+    copy_width8_msa(src + 16, src_stride, dst + 16, dst_stride, height);
+}
+/* Copy a 32-byte-wide block, each row handled as two 16-byte halves. */
+static void copy_width32_msa(uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    /* Dispatch on height: % 12 first, then % 8, then % 4; else nothing. */
+    if (0 == height % 12) {
+        for (cnt = (height / 12); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+            src += (4 * src_stride);
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+            dst += (4 * dst_stride);
+            /* Rows 4..7 of this 12-row group. */
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+            src += (4 * src_stride);
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+            dst += (4 * dst_stride);
+            /* Rows 8..11 of this 12-row group. */
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+            src += (4 * src_stride);
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else if (0 == height % 8) {
+        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
+    } else if (0 == height % 4) {
+        for (cnt = (height >> 2); cnt--;) {
+            LD_UB4(src, src_stride, src0, src1, src2, src3);
+            LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+            src += (4 * src_stride);
+            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+            ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } /* any other height matches no branch: nothing is copied */
+}
+/* Copy a 48-byte-wide block through the 16-wide, 8-row strip helper. */
+static void copy_width48_msa(uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{   /* width 48 gives three 16-byte strips (48 >> 4) in the helper */
+    copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 48);
+}
+/* Copy a 64-byte-wide block through the 16-wide, 8-row strip helper. */
+static void copy_width64_msa(uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{   /* width 64 gives four 16-byte strips (64 >> 4) in the helper */
+    copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
+}
+/* Three 16-entry index tables, loaded with LD_UB as shuffle masks below. */
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+    /* 8 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+    /* 4 width cases; 16..20 presumably index the 2nd shuffle source */
+    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+    /* 4 width cases, every index of the previous table raised by 8 */
+    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+/* 8-tap sum: two dotp/dpadd partial sums merged by a saturating add. */
+#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,             \
+                            filt0, filt1, filt2, filt3)         \
+( {                                                             \
+    v8i16 tmp0, tmp1;                                           \
+    /* tmp0 = vec0.filt0 + vec1.filt1; tmp1 likewise (2, 3) */  \
+    tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0);         \
+    tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1);  \
+    tmp1 = __msa_dotp_s_h((v16i8) vec2, (v16i8) filt2);         \
+    tmp1 = __msa_dpadd_s_h(tmp1, (v16i8) vec3, (v16i8) filt3);  \
+    tmp0 = __msa_adds_s_h(tmp0, tmp1);                          \
+    /* value yielded by the statement expression */             \
+    tmp0;                                                       \
+} )
+/* Horizontal 8-tap step for 4-wide data: 4 source vectors, 2 results. */
+#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                  \
+                                   mask0, mask1, mask2, mask3,              \
+                                   filt0, filt1, filt2, filt3,              \
+                                   out0, out1)                              \
+{                                                                           \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m,  vec4_m, vec5_m, vec6_m, vec7_m;  \
+    v8i16 res0_m, res1_m, res2_m, res3_m;                                   \
+    /* filt0/filt1 accumulate into res0/res1, filt2/filt3 into res2/res3 */ \
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);       \
+    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m);              \
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);       \
+    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m);             \
+    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);       \
+    DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m);              \
+    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);       \
+    DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m);             \
+    ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1);                \
+}
+/* Horizontal 8-tap step for 8-wide data: one result per source vector. */
+#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
+                                   mask0, mask1, mask2, mask3,                \
+                                   filt0, filt1, filt2, filt3,                \
+                                   out0, out1, out2, out3)                    \
+{                                                                             \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;     \
+    v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m;     \
+    /* filt0/filt1 accumulate into res0-3, filt2/filt3 into res4-7 */         \
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
+    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
+                res0_m, res1_m, res2_m, res3_m);                              \
+    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);         \
+    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,   \
+                res4_m, res5_m, res6_m, res7_m);                              \
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);         \
+    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,  \
+                 res0_m, res1_m, res2_m, res3_m);                             \
+    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);         \
+    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,  \
+                 res4_m, res5_m, res6_m, res7_m);                             \
+    ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m,       \
+                res7_m, out0, out1, out2, out3);                              \
+}
+/* 4-tap sum: signed dot product with filt0 plus dot product with filt1. */
+#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)           \
+( {                                                             \
+    v8i16 tmp0;                                                 \
+    /* tmp0 = vec0.filt0 + vec1.filt1 (signed dot products) */  \
+    tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0);         \
+    tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1);  \
+    /* value yielded by the statement expression */             \
+    tmp0;                                                       \
+} )
+/* Horizontal 4-tap step for 4-wide data: 4 source vectors, 2 results. */
+#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,             \
+                                   mask0, mask1, filt0, filt1,         \
+                                   out0, out1)                         \
+{                                                                      \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                              \
+    /* filt0 initialises out0/out1, filt1 accumulates on top */        \
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);  \
+    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);             \
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);  \
+    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);            \
+}
+/* Horizontal 4-tap step for 8-wide data: one result per source vector. */
+#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
+                                   mask0, mask1, filt0, filt1,                \
+                                   out0, out1, out2, out3)                    \
+{                                                                             \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                     \
+    /* filt0 initialises out0-out3, filt1 accumulates on top */               \
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
+    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
+                out0, out1, out2, out3);                                      \
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);         \
+    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,  \
+                 out0, out1, out2, out3);                                     \
+}
+/* Horizontal 8-tap filter of a single 4-row group, stored via ST4x4_UB. */
+static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, uint8_t rnd_val)
+{
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v8i16 filt, out0, out1;
+    v8i16 rnd_vec;
+    /* Second 16-entry table of mc_filt_mask_arr (first "4 width" entry). */
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= 3; /* start the tap window 3 bytes before src */
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter: halfword k of the load is splatted into filtk */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* mask1..mask3: mask0 with every index raised by 2, 4 and 6. */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    /* NOTE(review): XORI_*_128 / PCKEV_XORI128 presumably map u8 <-> s8. */
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1);
+    SRAR_H2_SH(out0, out1, rnd_vec); /* presumably rounding shift by rnd_val */
+    SAT_SH2_SH(out0, out1, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+/* Horizontal 8-tap filter of two 4-row groups (8 rows), 4-wide stores. */
+static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, uint8_t rnd_val)
+{
+    v16i8 filt0, filt1, filt2, filt3;
+    v16i8 src0, src1, src2, src3;
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 rnd_vec;
+    /* Second 16-entry table of mc_filt_mask_arr (first "4 width" entry). */
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= 3; /* start the tap window 3 bytes before src */
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter: halfword k of the load is splatted into filtk */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* mask1..mask3: mask0 with every index raised by 2, 4 and 6. */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    /* Rows 0..3 go to out0/out1, rows 4..7 to out2/out3. */
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out2, out3);
+    SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride); /* second group of 4 output rows */
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+/* Horizontal 8-tap filter of four 4-row groups (16 rows), 4-wide stores. */
+static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  const int8_t *filter, uint8_t rnd_val)
+{
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 rnd_vec;
+    /* Second 16-entry table of mc_filt_mask_arr (first "4 width" entry). */
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= 3; /* start the tap window 3 bytes before src */
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter: halfword k of the load is splatted into filtk */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* mask1..mask3: mask0 with every index raised by 2, 4 and 6. */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    /* First half: rows 0..3 go to out0/out1, rows 4..7 to out2/out3. */
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out2, out3);
+    SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* Second half: the same sequence repeated for rows 8..15. */
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out2, out3);
+
+    SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+/* Dispatch the 4-wide horizontal 8-tap filter on height (4, 8 or 16). */
+static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height, uint8_t rnd_val)
+{
+    if (4 == height) {
+        common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
+    } else if (8 == height) {
+        common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
+    } else if (16 == height) {
+        common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter,
+                              rnd_val);
+    } /* any other height matches no branch: nothing is written */
+}
+/* Horizontal 8-tap filter of a single 4-row group, stored via ST8x4_UB. */
+static void common_hz_8t_8x4_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, uint8_t rnd_val)
+{
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 rnd_vec;
+    /* First 16-entry table of mc_filt_mask_arr (the "8 width" entry). */
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3; /* start the tap window 3 bytes before src */
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter: halfword k of the load is splatted into filtk */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* mask1..mask3: mask0 with every index raised by 2, 4 and 6. */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    /* One result vector (out0..out3) per loaded row. */
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1,
+                               out2, out3);
+    SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    tmp0 = PCKEV_XORI128_UB(out0, out1);
+    tmp1 = PCKEV_XORI128_UB(out2, out3);
+    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+}
+/* Horizontal 8-tap filter, 8-wide stores, processed 4 rows per iteration. */
+static void common_hz_8t_8x8mult_msa(uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height,
+                                     uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 rnd_vec;
+    /* First 16-entry table of mc_filt_mask_arr (the "8 width" entry). */
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3; /* start the tap window 3 bytes before src */
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter: halfword k of the load is splatted into filtk */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* mask1..mask3: mask0 with every index raised by 2, 4 and 6. */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    /* height >> 2 iterations: rows past a multiple of 4 are not processed. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (4 * src_stride);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        tmp1 = PCKEV_XORI128_UB(out2, out3);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+/* 8-wide horizontal 8-tap filter: one-shot for height 4, loop otherwise. */
+static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height,
+                                uint8_t rnd_val)
+{
+    if (4 == height) {
+        common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
+    } else { /* the looping variant handles height in steps of 4 rows */
+        common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
+                                 height, rnd_val);
+    }
+}
+/* Horizontal 8-tap filter, 12 columns: an 8-wide part plus a 4-wide part. */
+static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 uint8_t rnd_val)
+{
+    uint8_t *src1_ptr, *dst1;
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v8i16 filt, out0, out1, out2, out3;
+    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00, tmp0, tmp1;
+    v8i16 rnd_vec;
+    /* mask00 drives the 8-wide part, mask0 the 4-wide part. */
+    mask00 = LD_UB(&mc_filt_mask_arr[0]);
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    rnd_vec = __msa_fill_h(rnd_val);
+    /* 8-wide part: window starts 3 bytes before src, output at dst. */
+    src1_ptr = src - 3;
+    dst1 = dst;
+    /* 4-wide part: 8 bytes further along in both source and destination. */
+    dst = dst1 + 8;
+    src = src1_ptr + 8;
+
+    /* rearranging filter: halfword k of the load is splatted into filtk */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* mask1..3 derive from mask00, mask4..6 from mask0 (indices +2/+4/+6). */
+    mask1 = mask00 + 2;
+    mask2 = mask00 + 4;
+    mask3 = mask00 + 6;
+    mask4 = mask0 + 2;
+    mask5 = mask0 + 4;
+    mask6 = mask0 + 6;
+    /* height >> 2 iterations: rows past a multiple of 4 are not processed. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        /* 8 width */
+        LD_SB4(src1_ptr, src_stride, src0, src1, src2, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src1_ptr += (4 * src_stride);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask00, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        tmp1 = PCKEV_XORI128_UB(out2, out3);
+        ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
+        dst1 += (4 * dst_stride);
+
+        /* 4 width */
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (4 * src_stride);
+        HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask4, mask5,
+                                   mask6, filt0, filt1, filt2, filt3, out0,
+                                   out1);
+        SRAR_H2_SH(out0, out1, rnd_vec);
+        SAT_SH2_SH(out0, out1, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+/* Horizontal 8-tap filter, 16-byte stores, processed 2 rows per iteration. */
+static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 rnd_vec;
+    /* First 16-entry table of mc_filt_mask_arr (the "8 width" entry). */
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3; /* start the tap window 3 bytes before src */
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter: halfword k of the load is splatted into filtk */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* mask1..mask3: mask0 with every index raised by 2, 4 and 6. */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    /* Rows are loaded at offsets 0 and 8; an odd last row is not processed. */
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, src_stride, src0, src2);
+        LD_SB2(src + 8, src_stride, src1, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (2 * src_stride);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        out = PCKEV_XORI128_UB(out0, out1); /* from src0/src1 */
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out2, out3); /* from src2/src3 */
+        ST_UB(out, dst);
+        dst += dst_stride;
+    }
+}
+/* Horizontal 8-tap filter, 24 columns wide, processed 2 rows per iteration. */
+static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
+    v16i8 vec11;
+    v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9, out10;
+    v8i16 out11, filt;
+    v8i16 rnd_vec;
+    /* First 16-entry table of mc_filt_mask_arr (the "8 width" entry). */
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3; /* start the tap window 3 bytes before src */
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* rearranging filter: halfword k of the load is splatted into filtk */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* mask4..7 (mask0 + 8..14) drive the shuffles that span src0|src1. */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+    /* Two vectors per row (offsets 0 and 16); an odd last row is skipped. */
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, src_stride, src0, src2);
+        LD_SB2(src + 16, src_stride, src1, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (2 * src_stride);
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
+        VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
+        DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
+                    out8, out2, out9);
+        DOTP_SB2_SH(vec1, vec3, filt0, filt0, out1, out3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
+        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
+        VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
+        DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2, out4,
+                    out10, out6, out11);
+        DOTP_SB2_SH(vec1, vec3, filt2, filt2, out5, out7);
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
+        VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
+        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
+                     out0, out8, out2, out9);
+        DPADD_SB2_SH(vec5, vec7, filt1, filt1, out1, out3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
+        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
+        VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
+        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
+                     out4, out10, out6, out11);
+        DPADD_SB2_SH(vec5, vec7, filt3, filt3, out5, out7);
+        ADDS_SH4_SH(out0, out4, out8, out10, out2, out6, out9, out11, out0,
+                    out8, out2, out9);
+        ADDS_SH2_SH(out1, out5, out3, out7, out1, out3);
+        SRAR_H4_SH(out0, out8, out2, out9, rnd_vec);
+        SRAR_H2_SH(out1, out3, rnd_vec);
+        SAT_SH4_SH(out0, out8, out2, out9, 7);
+        SAT_SH2_SH(out1, out3, 7);
+        out = PCKEV_XORI128_UB(out8, out9); /* results from src1 and src3 */
+        ST8x2_UB(out, dst + 16, dst_stride); /* written at byte offset 16 */
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst); /* first 16 bytes of the first row */
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst); /* first 16 bytes of the second row */
+        dst += dst_stride;
+    }
+}
+
+static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 uint8_t rnd_val)
+{ /* 8-tap horizontal filter: 32 output bytes per row, two rows per pass */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 rnd_vec;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3; /* start 3 bytes to the left of the first output column */
+    rnd_vec = __msa_fill_h(rnd_val); /* rnd_val in every halfword lane */
+
+    /* rearranging filter: lanes 0..3 of filt are handed to filt0..filt3 */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2; /* mask1..3: mask0 with each index raised by 2/4/6 */
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) { /* odd trailing row unused */
+        src0 = LD_SB(src);
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 24);
+        src1 = __msa_sldi_b(src2, src0, 8); /* presumably row bytes 8..23 */
+        src += src_stride;
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+
+        src0 = LD_SB(src); /* second row is fetched before the first is stored */
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 24);
+        src1 = __msa_sldi_b(src2, src0, 8);
+        src += src_stride;
+
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst); /* first row, first vector */
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst + 16); /* first row, second vector */
+        dst += dst_stride;
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst); /* second row, first vector */
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst + 16); /* second row, second vector */
+        dst += dst_stride;
+    }
+}
+
+static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 uint8_t rnd_val)
+{ /* 8-tap horizontal filter: 48 output bytes per row, one row per pass */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
+    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
+    v8i16 filt, out0, out1, out2, out3, out4, out5, out6;
+    v8i16 rnd_vec;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3; /* start 3 bytes to the left of the first output column */
+    rnd_vec = __msa_fill_h(rnd_val); /* rnd_val in every halfword lane */
+
+    /* rearranging filter: lanes 0..3 of filt are handed to filt0..filt3 */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2; /* mask1..3 shuffle a vector against itself below */
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8; /* mask4..7 are used on the (src2, src3) pair below */
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB3(src, 16, src0, src2, src3); /* three loads, step argument 16 */
+        src1 = __msa_sldi_b(src2, src0, 8); /* presumably row bytes 8..23 */
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
+                   vec0, vec1, vec2);
+        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
+        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
+                   vec0, vec1, vec2);
+        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
+        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
+        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
+                   vec0, vec1, vec2);
+        DOTP_SB3_SH(vec0, vec1, vec2, filt2, filt2, filt2, out3, out4, out5);
+        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
+                   vec0, vec1, vec2);
+        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out3, out4);
+        out5 = __msa_dpadd_s_h(out5, vec2, filt3);
+        ADDS_SH2_SH(out0, out3, out1, out4, out0, out1);
+        out2 = __msa_adds_s_h(out2, out5);
+        SRAR_H2_SH(out0, out1, rnd_vec);
+        out6 = __msa_srar_h(out2, rnd_vec); /* held until packed with out3 */
+        SAT_SH3_SH(out0, out1, out6, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst); /* first of three stores for this row */
+
+        src1 = LD_SB(src + 40); /* src1 is reused for the tail of the row */
+        src += src_stride;
+        src1 = (v16i8) __msa_xori_b((v16u8) src1, 128);
+
+        VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask4, mask0, mask0,
+                   vec0, vec1, vec2);
+        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
+        VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask5, mask1, mask1,
+                   vec0, vec1, vec2);
+        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
+        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
+        VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask6, mask2, mask2,
+                   vec0, vec1, vec2);
+        DOTP_SB3_SH(vec0, vec1, vec2, filt2, filt2, filt2, out3, out4, out5);
+        VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask7, mask3, mask3,
+                   vec0, vec1, vec2);
+        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out3, out4);
+        out5 = __msa_dpadd_s_h(out5, vec2, filt3);
+        ADDS_SH2_SH(out0, out3, out1, out4, out3, out4); /* sums -> out3/out4 */
+        out5 = __msa_adds_s_h(out2, out5);
+        SRAR_H3_SH(out3, out4, out5, rnd_vec);
+        SAT_SH3_SH(out3, out4, out5, 7);
+        out = PCKEV_XORI128_UB(out6, out3); /* out6 comes from the first half */
+        ST_UB(out, dst + 16);
+        out = PCKEV_XORI128_UB(out4, out5);
+        ST_UB(out, dst + 32);
+        dst += dst_stride;
+    }
+}
+
+static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 uint8_t rnd_val)
+{ /* 8-tap horizontal filter: 64 output bytes per row, one row per pass */
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 rnd_vec;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3; /* start 3 bytes to the left of the first output column */
+    rnd_vec = __msa_fill_h(rnd_val); /* rnd_val in every halfword lane */
+
+    /* rearranging filter: lanes 0..3 of filt are handed to filt0..filt3 */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2; /* mask1..3: mask0 with each index raised by 2/4/6 */
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        src0 = LD_SB(src); /* first half of the row: dst + 0 and dst + 16 */
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 24);
+        src1 = __msa_sldi_b(src2, src0, 8); /* presumably row bytes 8..23 */
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                                   mask2, mask3, filt0, filt1, filt2, filt3,
+                                   out0, out1, out2, out3);
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst + 16);
+
+        src0 = LD_SB(src + 32); /* second half: same steps, offset by 32 */
+        src2 = LD_SB(src + 48);
+        src3 = LD_SB(src + 56);
+        src1 = __msa_sldi_b(src2, src0, 8);
+        src += src_stride;
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                                   mask2, mask3, filt0, filt1, filt2, filt3,
+                                   out0, out1, out2, out3);
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst + 32);
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst + 48);
+        dst += dst_stride;
+    }
+}
+
+static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height,
+                                uint8_t rnd_val)
+{ /* 8-tap vertical filter for the 4-column case, four rows per pass */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
+    v16i8 src10998, filt0, filt1, filt2, filt3;
+    v16u8 out;
+    v8i16 filt, out10, out32;
+    v8i16 rnd_vec;
+
+    src -= (3 * src_stride); /* start 3 rows above the first output row */
+    rnd_vec = __msa_fill_h(rnd_val); /* rnd_val in every halfword lane */
+
+    filt = LD_SH(filter); /* lanes 0..3 of filt are handed to filt0..filt3 */
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride); /* the 7 rows above are the initial history */
+
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+               src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
+               src4332, src6554);
+    XORI_B3_128_SB(src2110, src4332, src6554);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* height % 4 rows unused */
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
+        XORI_B2_128_SB(src8776, src10998);
+        out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
+                                    filt1, filt2, filt3);
+        out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
+                                    filt1, filt2, filt3);
+        SRAR_H2_SH(out10, out32, rnd_vec);
+        SAT_SH2_SH(out10, out32, 7);
+        out = PCKEV_XORI128_UB(out10, out32);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src2110 = src6554; /* newest combined rows become next pass history */
+        src4332 = src8776;
+        src6554 = src10998;
+        src6 = src10; /* last loaded row pairs with src7 in the next pass */
+    }
+}
+
+static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height,
+                                uint8_t rnd_val)
+{ /* 8-tap vertical filter for the 8-column case, four rows per pass */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+    v8i16 rnd_vec;
+
+    src -= (3 * src_stride); /* start 3 rows above the first output row */
+    rnd_vec = __msa_fill_h(rnd_val); /* rnd_val in every halfword lane */
+
+    filt = LD_SH(filter); /* lanes 0..3 of filt are handed to filt0..filt3 */
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride); /* the 7 rows above are the initial history */
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+               src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* height % 4 rows unused */
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                     filt1, filt2, filt3);
+        out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                     filt1, filt2, filt3);
+        out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                     filt1, filt2, filt3);
+        out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                     filt1, filt2, filt3);
+        SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r; /* newest row pairs become next pass history */
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src6 = src10; /* last loaded row pairs with src7 in the next pass */
+    }
+}
+
+static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 uint8_t rnd_val)
+{ /* 8-tap vertical filter writing 8 + 4 bytes per row, two rows per pass */
+    int32_t loop_cnt;
+    uint32_t out2, out3;
+    uint64_t out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
+    v16i8 res2, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 vec01, vec23, vec45, vec67, tmp0, tmp1, tmp2;
+    v8i16 filt, filt0, filt1, filt2, filt3;
+    v8i16 rnd_vec;
+    v4i32 mask = { 2, 6, 2, 6 }; /* presumably word 2 of each shuffle input */
+
+    src -= (3 * src_stride); /* start 3 rows above the first output row */
+    rnd_vec = __msa_fill_h(rnd_val); /* rnd_val in every halfword lane */
+
+    /* rearranging filter_y: lanes 0..3 of filt are handed to filt0..filt3 */
+    filt = LD_SH(filter);
+    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride); /* the 7 rows above are the initial history */
+
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    /* 4 width: adjacent row pairs are combined through the word mask */
+    VSHF_W2_SB(src0, src1, src1, src2, mask, mask, vec0, vec1);
+    VSHF_W2_SB(src2, src3, src3, src4, mask, mask, vec2, vec3);
+    VSHF_W2_SB(src4, src5, src5, src6, mask, mask, vec4, vec5);
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) { /* odd trailing row unused */
+        LD_SB2(src, src_stride, src7, src8);
+        XORI_B2_128_SB(src7, src8);
+        src += (2 * src_stride);
+
+        ILVR_B4_SH(src1, src0, src3, src2, src5, src4, src7, src6,
+                   vec01, vec23, vec45, vec67);
+        tmp0 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1,
+                                   filt2, filt3);
+        ILVR_B4_SH(src2, src1, src4, src3, src6, src5, src8, src7, vec01, vec23,
+                   vec45, vec67);
+        tmp1 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1,
+                                   filt2, filt3);
+
+        /* 4 width: tmp2 carries the 4-byte part of both output rows */
+        VSHF_W2_SB(src6, src7, src7, src8, mask, mask, vec6, vec7);
+        ILVR_B4_SH(vec1, vec0, vec3, vec2, vec5, vec4, vec7, vec6, vec01, vec23,
+                   vec45, vec67);
+        tmp2 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1,
+                                   filt2, filt3);
+        SRAR_H3_SH(tmp0, tmp1, tmp2, rnd_vec);
+        SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
+        PCKEV_B3_SB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, res0, res1, res2);
+        XORI_B3_128_SB(res0, res1, res2);
+
+        out0 = __msa_copy_u_d((v2i64) res0, 0); /* 8 bytes for the first row */
+        out1 = __msa_copy_u_d((v2i64) res1, 0); /* 8 bytes for the second row */
+        out2 = __msa_copy_u_w((v4i32) res2, 0); /* 4 bytes for the first row */
+        out3 = __msa_copy_u_w((v4i32) res2, 1); /* 4 bytes for the second row */
+        SD(out0, dst);
+        SW(out2, (dst + 8));
+        dst += dst_stride;
+        SD(out1, dst);
+        SW(out3, (dst + 8));
+        dst += dst_stride;
+
+        src0 = src2; /* shift the row history down by the two rows consumed */
+        src1 = src3;
+        src2 = src4;
+        src3 = src5;
+        src4 = src6;
+        src5 = src7;
+        src6 = src8;
+        vec0 = vec2; /* same shift for the shuffled 4-width history */
+        vec1 = vec3;
+        vec2 = vec4;
+        vec3 = vec5;
+        vec4 = vec6;
+        vec5 = vec7;
+    }
+}
+
+static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 uint8_t rnd_val)
+{ /* 8-tap vertical filter for the 16-column case, four rows per pass */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 filt0, filt1, filt2, filt3;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+    v8i16 rnd_vec;
+
+    src -= (3 * src_stride); /* start 3 rows above the first output row */
+    rnd_vec = __msa_fill_h(rnd_val); /* rnd_val in every halfword lane */
+
+    filt = LD_SH(filter); /* lanes 0..3 of filt are handed to filt0..filt3 */
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride); /* the 7 rows above are the initial history */
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+               src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
+               src54_l, src21_l);
+    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* height % 4 rows unused */
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+                   src87_l, src98_l, src109_l);
+        out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                     filt1, filt2, filt3);
+        out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                     filt1, filt2, filt3);
+        out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                     filt1, filt2, filt3);
+        out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                     filt1, filt2, filt3);
+        out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
+                                     filt1, filt2, filt3);
+        out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
+                                     filt1, filt2, filt3);
+        out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
+                                     filt1, filt2, filt3);
+        out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
+                                     filt1, filt2, filt3);
+        SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
+        SRAR_H4_SH(out0_l, out1_l, out2_l, out3_l, rnd_vec);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, tmp0, tmp1, tmp2, tmp3);
+        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r; /* newest ILVR row pairs become next pass history */
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src10_l = src54_l; /* same rotation for the ILVL row pairs */
+        src32_l = src76_l;
+        src54_l = src98_l;
+        src21_l = src65_l;
+        src43_l = src87_l;
+        src65_l = src109_l;
+        src6 = src10; /* last loaded row pairs with src7 in the next pass */
+    }
+}
+
+static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter, int32_t height,
+                                      uint8_t rnd_val, int32_t width)
+{ /* 8-tap vertical filter run over width >> 4 strips of 16 columns each */
+    uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    uint32_t loop_cnt, cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 filt0, filt1, filt2, filt3;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+    v8i16 rnd_vec;
+
+    src -= (3 * src_stride); /* start 3 rows above the first output row */
+    rnd_vec = __msa_fill_h(rnd_val); /* rnd_val in every halfword lane */
+
+    filt = LD_SH(filter); /* lanes 0..3 of filt are handed to filt0..filt3 */
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    for (cnt = (width >> 4); cnt--;) { /* width % 16 columns are not covered */
+        src_tmp = src; /* per-strip cursors; src/dst advance by 16 per strip */
+        dst_tmp = dst;
+
+        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+        src_tmp += (7 * src_stride);
+        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
+                   src32_r, src54_r, src21_r);
+        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
+                   src32_l, src54_l, src21_l);
+        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+        for (loop_cnt = (height >> 2); loop_cnt--;) { /* 4 rows per pass */
+            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
+            XORI_B4_128_SB(src7, src8, src9, src10);
+            src_tmp += (4 * src_stride);
+            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                       src87_r, src98_r, src109_r);
+            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+                       src87_l, src98_l, src109_l);
+            out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
+                                         filt0, filt1, filt2, filt3);
+            out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
+                                         filt0, filt1, filt2, filt3);
+            out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
+                                         filt0, filt1, filt2, filt3);
+            out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
+                                         filt0, filt1, filt2, filt3);
+            out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
+                                         filt0, filt1, filt2, filt3);
+            out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
+                                         filt0, filt1, filt2, filt3);
+            out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
+                                         filt0, filt1, filt2, filt3);
+            out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
+                                         filt0, filt1, filt2, filt3);
+            SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
+            SRAR_H4_SH(out0_l, out1_l, out2_l, out3_l, rnd_vec);
+            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                        out3_r, tmp0, tmp1, tmp2, tmp3);
+            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+            ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
+            dst_tmp += (4 * dst_stride);
+
+            src10_r = src54_r; /* newest ILVR row pairs become history */
+            src32_r = src76_r;
+            src54_r = src98_r;
+            src21_r = src65_r;
+            src43_r = src87_r;
+            src65_r = src109_r;
+            src10_l = src54_l; /* same rotation for the ILVL row pairs */
+            src32_l = src76_l;
+            src54_l = src98_l;
+            src21_l = src65_l;
+            src43_l = src87_l;
+            src65_l = src109_l;
+            src6 = src10; /* last loaded row pairs with src7 next pass */
+        }
+
+        src += 16; /* next 16-column strip */
+        dst += 16;
+    }
+}
+
+static void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                                 int32_t dst_stride, const int8_t *filter,
+                                 int32_t height, uint8_t rnd_val)
+{ /* 24 columns: one 16-column strip followed by an 8-column remainder */
+    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride,
+                              filter, height, rnd_val, 16);
+    /* columns 16 onwards go through the 8-column routine */
+    common_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride,
+                        filter, height, rnd_val);
+}
+
+static void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                                 int32_t dst_stride, const int8_t *filter,
+                                 int32_t height, uint8_t rnd_val)
+{ /* 32 columns: the 16-column strip routine is asked for width 32 */
+    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride,
+                              filter, height, rnd_val, 32);
+}
+
+static void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                                 int32_t dst_stride, const int8_t *filter,
+                                 int32_t height, uint8_t rnd_val)
+{ /* 48 columns: the 16-column strip routine is asked for width 48 */
+    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride,
+                              filter, height, rnd_val, 48);
+}
+
+static void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                                 int32_t dst_stride, const int8_t *filter,
+                                 int32_t height, uint8_t rnd_val)
+{ /* 64 columns: the 16-column strip routine is asked for width 64 */
+    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride,
+                              filter, height, rnd_val, 64);
+}
+
+static void hevc_hv_uni_8t_4w_msa(uint8_t *src,
+                                  int32_t src_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{ /* 8-tap filter_x pass, then 8-tap filter_y pass; two rows per iteration */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 filt0, filt1, filt2, filt3;
+    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
+    v4i32 dst0_r, dst1_r;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+    v8i16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
+
+    src -= ((3 * src_stride) + 3); /* back up 3 rows and 3 bytes */
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* widen the int8 filter_y taps to 16 bits: sign mask, then interleave */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask1 = mask0 + 2; /* mask1..3: mask0 with each index raised by 2/4/6 */
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* 128 << 6 in every halfword lane */
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride); /* the 7 rows above are the initial history */
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    /* rows are shuffled in pairs: (0,3) (1,4) (2,5) (3,6) -> dst30..dst63 */
+    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
+    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
+    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
+               vec8, vec9, vec10, vec11);
+    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
+               vec12, vec13, vec14, vec15);
+
+    dst30 = const_vec; /* each accumulator starts from const_vec */
+    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                 dst30, dst30, dst30, dst30);
+    dst41 = const_vec;
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                 dst41, dst41, dst41, dst41);
+    dst52 = const_vec;
+    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                 dst52, dst52, dst52, dst52);
+    dst63 = const_vec;
+    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
+                 dst63, dst63, dst63, dst63);
+
+    ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
+               dst10_r, dst21_r, dst32_r);
+    dst43_r = __msa_ilvl_h(dst41, dst30);
+    dst54_r = __msa_ilvl_h(dst52, dst41);
+    dst65_r = __msa_ilvl_h(dst63, dst52);
+    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1); /* doubleword 1 copied */
+
+    for (loop_cnt = height >> 1; loop_cnt--;) { /* odd trailing row unused */
+        LD_SB2(src, src_stride, src7, src8);
+        src += 2 * src_stride;
+        XORI_B2_128_SB(src7, src8);
+
+        VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst87 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst87, dst87, dst87, dst87);
+
+        dst76_r = __msa_ilvr_h(dst87, dst66);
+        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+        dst87_r = __msa_vshf_h(mask4, dst87, dst87);
+        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+
+        dst0_r >>= 6; /* plain shift by 6, then SRARI_W2_SW with a further 6 */
+        dst1_r >>= 6;
+        SRARI_W2_SW(dst0_r, dst1_r, 6);
+        dst0_r = CLIP_SW_0_255(dst0_r);
+        dst1_r = CLIP_SW_0_255(dst1_r);
+
+        HEVC_PCK_SW_SB2(dst1_r, dst0_r, dst0_r);
+        ST4x2_UB(dst0_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        dst10_r = dst32_r; /* rotate the history of horizontal results */
+        dst32_r = dst54_r;
+        dst54_r = dst76_r;
+        dst21_r = dst43_r;
+        dst43_r = dst65_r;
+        dst65_r = dst87_r;
+        dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
+    }
+}
+
+static void hevc_hv_uni_8t_8multx2mult_msa(uint8_t *src, /* 8-tap horizontal-then-vertical filter; 8-column strips, 2 rows per pass */
+                                           int32_t src_stride, /* step between source rows */
+                                           uint8_t *dst, /* receives 8 bytes x 2 rows per inner pass */
+                                           int32_t dst_stride, /* step between output rows */
+                                           const int8_t *filter_x, /* horizontal taps */
+                                           const int8_t *filter_y, /* vertical taps */
+                                           int32_t height, int32_t width) /* loops run height >> 1 and width >> 3 times */
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 filt0, filt1, filt2, filt3;
+    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; /* horizontally filtered rows 0..8 */
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l; /* vertical results for the two output rows */
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r; /* dstNM_r / dstNM_l: right / left interleave of rows N and M */
+    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; /* byte pairs (i, i + 1) for i = 0..7 */
+
+    src -= ((3 * src_stride) + 3); /* back up 3 rows and 3 columns */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* 128 << 6 in every lane; seeds each horizontal accumulator below */
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3); /* halfwords 0..3 of filter_x, one per filtN */
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); /* per-byte "tap < 0" mask */
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); /* interleave taps with that mask: presumably sign-extends them to 16 bits */
+
+    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask1 = mask0 + 2; /* same pair pattern shifted by 2, 4 and 6 bytes */
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (cnt = width >> 3; cnt--;) { /* one iteration per 8-column strip */
+        src_tmp = src;
+        dst_tmp = dst;
+
+        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); /* prime the pipeline with 7 rows */
+        src_tmp += (7 * src_stride);
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); /* XOR with 128 per macro name; presumably makes the bytes signed - confirm */
+
+        /* row 0 row 1 row 2 row 3 */
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec12, vec13, vec14, vec15);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0); /* accumulate all four tap pairs into dst0 */
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3); /* rows 4, 5 and 6 follow the same pattern */
+        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        dst4 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst4, dst4, dst4, dst4);
+        dst5 = const_vec;
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                     dst5, dst5, dst5, dst5);
+        dst6 = const_vec;
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                     dst6, dst6, dst6, dst6);
+
+        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
+                   dst10_r, dst32_r, dst54_r, dst21_r); /* pair up adjacent filtered rows for the vertical pass */
+        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
+        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
+                   dst10_l, dst32_l, dst54_l, dst21_l);
+        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
+
+        for (loop_cnt = height >> 1; loop_cnt--;) { /* two output rows per iteration */
+            LD_SB2(src_tmp, src_stride, src7, src8);
+            XORI_B2_128_SB(src7, src8);
+            src_tmp += 2 * src_stride;
+
+            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst7 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst7, dst7, dst7, dst7);
+
+            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
+                                    filt_h0, filt_h1, filt_h2, filt_h3); /* first output row: filtered rows 0..7 */
+            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_r >>= 6;
+            dst0_l >>= 6;
+
+            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst8 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst8, dst8, dst8, dst8);
+
+            ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
+            dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
+                                    filt_h0, filt_h1, filt_h2, filt_h3); /* second output row: filtered rows 1..8 */
+            dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst1_r >>= 6;
+            dst1_l >>= 6;
+            SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6); /* further shift by 6; rounding per macro name - confirm */
+            dst0_r = CLIP_SW_0_255(dst0_r);
+            dst0_l = CLIP_SW_0_255(dst0_l);
+            dst1_r = CLIP_SW_0_255(dst1_r);
+            dst1_l = CLIP_SW_0_255(dst1_l);
+
+            HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r); /* both rows end up packed in dst0_r */
+            ST8x2_UB(dst0_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+
+            dst10_r = dst32_r; /* slide the row-pair window down by two rows */
+            dst32_r = dst54_r;
+            dst54_r = dst76_r;
+            dst10_l = dst32_l;
+            dst32_l = dst54_l;
+            dst54_l = dst76_l;
+            dst21_r = dst43_r;
+            dst43_r = dst65_r;
+            dst65_r = dst87_r;
+            dst21_l = dst43_l;
+            dst43_l = dst65_l;
+            dst65_l = dst87_l;
+            dst6 = dst8; /* newest filtered row is paired with the next one loaded */
+        }
+
+        src += 8; /* advance to the next 8-column strip */
+        dst += 8;
+    }
+}
+
+/* 8-tap HV uni filter for an 8-pixel-wide block: a single 8-column strip. */
+static void hevc_hv_uni_8t_8w_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y, int32_t height)
+{
+    const int32_t width = 8;
+
+    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, width);
+}
+
+/* 12 wide: columns 0-7 via the 8-column strip filter, 8-11 via the 4w one. */
+static void hevc_hv_uni_8t_12w_msa(uint8_t *src, int32_t src_stride,
+                                   uint8_t *dst, int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y, int32_t height)
+{
+    uint8_t *src_right = src + 8;
+    uint8_t *dst_right = dst + 8;
+
+    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, 8);
+    hevc_hv_uni_8t_4w_msa(src_right, src_stride, dst_right, dst_stride,
+                          filter_x, filter_y, height);
+}
+
+/* 8-tap HV uni filter for a 16-pixel-wide block: two 8-column strips. */
+static void hevc_hv_uni_8t_16w_msa(uint8_t *src, int32_t src_stride,
+                                   uint8_t *dst, int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y, int32_t height)
+{
+    const int32_t width = 16;
+
+    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, width);
+}
+
+/* 8-tap HV uni filter for a 24-pixel-wide block: three 8-column strips. */
+static void hevc_hv_uni_8t_24w_msa(uint8_t *src, int32_t src_stride,
+                                   uint8_t *dst, int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y, int32_t height)
+{
+    const int32_t width = 24;
+
+    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, width);
+}
+
+/* 8-tap HV uni filter for a 32-pixel-wide block: four 8-column strips. */
+static void hevc_hv_uni_8t_32w_msa(uint8_t *src, int32_t src_stride,
+                                   uint8_t *dst, int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y, int32_t height)
+{
+    const int32_t width = 32;
+
+    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, width);
+}
+
+/* 8-tap HV uni filter for a 48-pixel-wide block: six 8-column strips. */
+static void hevc_hv_uni_8t_48w_msa(uint8_t *src, int32_t src_stride,
+                                   uint8_t *dst, int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y, int32_t height)
+{
+    const int32_t width = 48;
+
+    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, width);
+}
+
+/* 8-tap HV uni filter for a 64-pixel-wide block: eight 8-column strips. */
+static void hevc_hv_uni_8t_64w_msa(uint8_t *src, int32_t src_stride,
+                                   uint8_t *dst, int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y, int32_t height)
+{
+    const int32_t width = 64;
+
+    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, width);
+}
+
+static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride, /* horizontal 4-tap filter over a 4-wide, 2-row block */
+                                 uint8_t *dst, int32_t dst_stride, /* output stored as 4 bytes x 2 rows */
+                                 const int8_t *filter, uint8_t rnd_val) /* taps; rnd_val is the final shift amount */
+{
+    v16i8 filt0, filt1, src0, src1, mask0, mask1, vec0, vec1;
+    v16u8 out;
+    v8i16 filt, res0;
+    v8i16 rnd_vec;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[16]); /* shuffle mask from offset 16 of the shared mask table */
+    src -= 1; /* start one byte to the left of src */
+    rnd_vec = __msa_fill_h(rnd_val); /* rnd_val replicated into every halfword */
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1); /* halfwords 0 and 1 of the filter, splatted */
+
+    mask1 = mask0 + 2; /* same shuffle pattern, two bytes further right */
+
+    LD_SB2(src, src_stride, src0, src1); /* the two input rows */
+    XORI_B2_128_SB(src0, src1); /* XOR with 128 per macro name; presumably makes the bytes signed - confirm */
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1); /* both rows feed each shuffle */
+    res0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1);
+    res0 = __msa_srar_h(res0, rnd_vec); /* shift right by rnd_val with rounding */
+    res0 = __msa_sat_s_h(res0, 7); /* saturate to the signed 8-bit range */
+    out = PCKEV_XORI128_UB(res0, res0); /* pack to bytes and XOR 128 back (per macro name) */
+    ST4x2_UB(out, dst, dst_stride);
+}
+
+static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride, /* horizontal 4-tap filter over a 4-wide, 4-row block */
+                                 uint8_t *dst, int32_t dst_stride, /* output stored as 4 bytes x 4 rows */
+                                 const int8_t *filter, uint8_t rnd_val) /* taps; rnd_val is the final shift amount */
+{
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+    v8i16 filt, out0, out1;
+    v16u8 out;
+    v8i16 rnd_vec;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[16]); /* shuffle mask from offset 16 of the shared mask table */
+    src -= 1; /* start one byte to the left of src */
+    rnd_vec = __msa_fill_h(rnd_val); /* rnd_val replicated into every halfword */
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1); /* halfwords 0 and 1 of the filter, splatted */
+
+    mask1 = mask0 + 2; /* same shuffle pattern, two bytes further right */
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3); /* the four input rows */
+    XORI_B4_128_SB(src0, src1, src2, src3); /* XOR with 128 per macro name; presumably makes the bytes signed - confirm */
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out0, out1); /* four rows in, two result vectors out */
+    SRAR_H2_SH(out0, out1, rnd_vec); /* shift right by rnd_val; rounding per macro name */
+    SAT_SH2_SH(out0, out1, 7);
+    out = PCKEV_XORI128_UB(out0, out1); /* pack to bytes and XOR 128 back (per macro name) */
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride, /* horizontal 4-tap filter over a 4-wide, 8-row block */
+                                 uint8_t *dst, int32_t dst_stride, /* output stored as two 4x4 groups */
+                                 const int8_t *filter, uint8_t rnd_val) /* taps; rnd_val is the final shift amount */
+{
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+    v16u8 out;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 rnd_vec;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[16]); /* shuffle mask from offset 16 of the shared mask table */
+    src -= 1; /* start one byte to the left of src */
+    rnd_vec = __msa_fill_h(rnd_val); /* rnd_val replicated into every halfword */
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1); /* halfwords 0 and 1 of the filter, splatted */
+
+    mask1 = mask0 + 2; /* same shuffle pattern, two bytes further right */
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3); /* rows 0-3 */
+    src += (4 * src_stride);
+
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3); /* rows 4-7 reuse the same source registers */
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out2, out3);
+    SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); /* shift right by rnd_val; rounding per macro name */
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); /* first four output rows */
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); /* last four output rows */
+}
+
+static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride, /* horizontal 4-tap filter over a 4-wide, 16-row block */
+                                  uint8_t *dst, int32_t dst_stride, /* output stored as four 4x4 groups */
+                                  const int8_t *filter, uint8_t rnd_val) /* taps; rnd_val is the final shift amount */
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 filt0, filt1, mask0, mask1;
+    v16u8 out;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 rnd_vec;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[16]); /* shuffle mask from offset 16 of the shared mask table */
+    src -= 1; /* start one byte to the left of src */
+    rnd_vec = __msa_fill_h(rnd_val); /* rnd_val replicated into every halfword */
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1); /* halfwords 0 and 1 of the filter, splatted */
+
+    mask1 = mask0 + 2; /* same shuffle pattern, two bytes further right */
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); /* rows 0-7 */
+    src += (8 * src_stride);
+    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out0, out1);
+    HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
+                               filt0, filt1, out2, out3);
+    SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); /* shift right by rnd_val; rounding per macro name */
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); /* rows 8-15, same sequence again */
+    src += (8 * src_stride); /* NOTE(review): src is not read after this point */
+    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out0, out1);
+    HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
+                               filt0, filt1, out2, out3);
+    SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+/*
+ * Horizontal 4-tap filter for 4-pixel-wide blocks.  Dispatches on height to
+ * the fixed-size 4x2, 4x4, 4x8 and 4x16 kernels; any other height is a no-op.
+ */
+static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height,
+                                uint8_t rnd_val)
+{
+    switch (height) {
+    case 2:
+        common_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
+        break;
+    case 4:
+        common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
+        break;
+    case 8:
+        common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
+        break;
+    case 16:
+        common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter,
+                              rnd_val);
+        break;
+    default:
+        /* no kernel for this height; nothing is written */
+        break;
+    }
+}
+
+static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride, /* horizontal 4-tap filter for 6-wide blocks, 4 rows per pass */
+                                uint8_t *dst, int32_t dst_stride, /* output stored through ST6x4_UB */
+                                const int8_t *filter, int32_t height, /* loop runs height >> 2 times */
+                                uint8_t rnd_val) /* final shift amount */
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+    v16u8 out4, out5;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 rnd_vec;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[0]); /* shuffle mask from the start of the shared mask table */
+    src -= 1; /* start one byte to the left of src */
+    rnd_vec = __msa_fill_h(rnd_val); /* rnd_val replicated into every halfword */
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1); /* halfwords 0 and 1 of the filter, splatted */
+
+    mask1 = mask0 + 2; /* same shuffle pattern, two bytes further right */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* four rows per iteration */
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+                                   filt1, out0, out1, out2, out3); /* uses the 8-wide helper; the store below is 6x4 */
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); /* shift right by rnd_val; rounding per macro name */
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+
+        out4 = PCKEV_XORI128_UB(out0, out1);
+        out5 = PCKEV_XORI128_UB(out2, out3);
+        ST6x4_UB(out4, out5, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride, /* horizontal 4-tap filter for 8-wide blocks, 2 rows per pass */
+                                     uint8_t *dst, int32_t dst_stride, /* output stored as 8 bytes x 2 rows per pass */
+                                     const int8_t *filter, int32_t height, /* loop runs height >> 1 times */
+                                     uint8_t rnd_val) /* final shift amount */
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, filt0, filt1, mask0, mask1;
+    v16u8 out;
+    v8i16 filt, vec0, vec1, vec2, vec3;
+    v8i16 rnd_vec;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[0]); /* shuffle mask from the start of the shared mask table */
+    src -= 1; /* start one byte to the left of src */
+    rnd_vec = __msa_fill_h(rnd_val); /* rnd_val replicated into every halfword */
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1); /* halfwords 0 and 1 of the filter, splatted */
+
+    mask1 = mask0 + 2; /* same shuffle pattern, two bytes further right */
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) { /* two rows per iteration */
+        LD_SB2(src, src_stride, src0, src1);
+        src += (2 * src_stride);
+
+        XORI_B2_128_SB(src0, src1);
+        VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1); /* each row shuffled against itself */
+        DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1); /* filt0 products overwrite vec0/vec1 */
+        VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
+        DPADD_SB2_SH(vec2, vec3, filt1, filt1, vec0, vec1); /* filt1 products accumulate on top */
+        SRAR_H2_SH(vec0, vec1, rnd_vec); /* shift right by rnd_val; rounding per macro name */
+        SAT_SH2_SH(vec0, vec1, 7);
+        out = PCKEV_XORI128_UB(vec0, vec1);
+        ST8x2_UB(out, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride, /* horizontal 4-tap filter for 8-wide blocks, 4 rows per pass */
+                                     uint8_t *dst, int32_t dst_stride, /* output stored as 8 bytes x 4 rows per pass */
+                                     const int8_t *filter, int32_t height, /* loop runs height >> 2 times */
+                                     uint8_t rnd_val) /* final shift amount */
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 rnd_vec;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[0]); /* shuffle mask from the start of the shared mask table */
+    src -= 1; /* start one byte to the left of src */
+    rnd_vec = __msa_fill_h(rnd_val); /* rnd_val replicated into every halfword */
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1); /* halfwords 0 and 1 of the filter, splatted */
+
+    mask1 = mask0 + 2; /* same shuffle pattern, two bytes further right */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* four rows per iteration */
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+                                   filt1, out0, out1, out2, out3); /* one result vector per row */
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); /* shift right by rnd_val; rounding per macro name */
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        tmp1 = PCKEV_XORI128_UB(out2, out3);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+/*
+ * Horizontal 4-tap filter for 8-pixel-wide blocks.  Heights 2 and 6 go to
+ * the 2-rows-per-pass kernel; every other height goes to the 4-rows-per-pass
+ * kernel.
+ */
+static void common_hz_4t_8w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height,
+                                uint8_t rnd_val)
+{
+    if (height != 2 && height != 6) {
+        common_hz_4t_8x4mult_msa(src, src_stride, dst, dst_stride, filter,
+                                 height, rnd_val);
+        return;
+    }
+
+    common_hz_4t_8x2mult_msa(src, src_stride, dst, dst_stride, filter,
+                             height, rnd_val);
+}
+
+static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride, /* horizontal 4-tap filter for 12-wide blocks, 4 rows per pass */
+                                 uint8_t *dst, int32_t dst_stride, /* each pass stores 8x4 at dst and 4x4 at dst + 8 */
+                                 const int8_t *filter, int32_t height, /* loop runs height >> 2 times */
+                                 uint8_t rnd_val) /* final shift amount */
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+    v16i8 vec10, vec11;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0, out1, out2, out3, out4, out5;
+    v8i16 rnd_vec;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[0]); /* mask for the one-row-per-vector shuffles (columns 0-7) */
+    mask2 = LD_SB(&mc_filt_mask_arr[32]); /* mask for the two-rows-per-vector shuffles (columns 8-11) */
+
+    src -= 1; /* start one byte to the left of src */
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1); /* halfwords 0 and 1 of the filter, splatted */
+
+    mask1 = mask0 + 2; /* both masks also used two bytes further right */
+    mask3 = mask2 + 2;
+
+    rnd_vec = __msa_fill_h(rnd_val); /* rnd_val replicated into every halfword */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* four rows per iteration */
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5); /* each row against itself */
+        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
+        VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1); /* row pairs (0,1) and (2,3) share a vector */
+        DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    out2, out3, out4, out5); /* out2..out5 end up in the 8x4 store */
+        DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1); /* out0/out1 end up in the 4x4 store */
+        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9);
+        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
+        VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3);
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
+                     out2, out3, out4, out5); /* filt1 products accumulate onto the filt0 ones */
+        DPADD_SB2_SH(vec2, vec3, filt1, filt1, out0, out1);
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); /* shift right by rnd_val; rounding per macro name */
+        SRAR_H2_SH(out4, out5, rnd_vec);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        SAT_SH2_SH(out4, out5, 7);
+        tmp0 = PCKEV_XORI128_UB(out2, out3);
+        tmp1 = PCKEV_XORI128_UB(out4, out5);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride); /* columns 0-7 */
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst + 8, dst_stride); /* columns 8-11 */
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride, /* horizontal 4-tap filter for 16-wide blocks, 4 rows per pass */
+                                 uint8_t *dst, int32_t dst_stride, /* one 16-byte store per output row */
+                                 const int8_t *filter, int32_t height, /* loop runs height >> 2 times */
+                                 uint8_t rnd_val) /* final shift amount */
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 filt0, filt1, mask0, mask1;
+    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
+    v16u8 out;
+    v8i16 rnd_vec;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[0]); /* shuffle mask from the start of the shared mask table */
+    src -= 1; /* start one byte to the left of src */
+    rnd_vec = __msa_fill_h(rnd_val); /* rnd_val replicated into every halfword */
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1); /* halfwords 0 and 1 of the filter, splatted */
+
+    mask1 = mask0 + 2; /* same shuffle pattern, two bytes further right */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* four rows per iteration */
+        LD_SB4(src, src_stride, src0, src2, src4, src6); /* even registers: the four rows at src */
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7); /* odd registers: the same rows at src + 8 */
+        src += (4 * src_stride);
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+                                   filt1, out0, out1, out2, out3);
+        HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0,
+                                   filt1, out4, out5, out6, out7);
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); /* shift right by rnd_val; rounding per macro name */
+        SRAR_H4_SH(out4, out5, out6, out7, rnd_vec);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out4, out5, out6, out7, 7);
+        out = PCKEV_XORI128_UB(out0, out1); /* two result vectors pack into one 16-byte row */
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out4, out5);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out6, out7);
+        ST_UB(out, dst);
+        dst += dst_stride;
+    }
+}
+
+static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride, /* horizontal 4-tap filter for 24-wide blocks, 4 rows per pass */
+                                 uint8_t *dst, int32_t dst_stride, /* 16-byte rows at dst, 8x4 groups at dst + 16 */
+                                 const int8_t *filter, int32_t height, /* loop runs height >> 2 times */
+                                 uint8_t rnd_val) /* final shift amount */
+{
+    uint8_t *dst1 = dst + 16; /* separate write cursor for columns 16-23 */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
+    v8i16 filt, out0, out1, out2, out3;
+    v16u8 tmp0, tmp1;
+    v8i16 rnd_vec;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[0]); /* shuffle mask from the start of the shared mask table */
+    src -= 1; /* start one byte to the left of src */
+    rnd_vec = __msa_fill_h(rnd_val); /* rnd_val replicated into every halfword */
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1); /* halfwords 0 and 1 of the filter, splatted */
+
+    mask1 = mask0 + 2;
+    mask00 = mask0 + 8; /* mask0 and mask1 patterns offset by 8 bytes; used on (srcN, srcN+1) pairs */
+    mask11 = mask0 + 10;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* four rows per iteration */
+        LD_SB4(src, src_stride, src0, src2, src4, src6); /* even registers: the four rows at src */
+        LD_SB4(src + 16, src_stride, src1, src3, src5, src7); /* odd registers: the same rows at src + 16 */
+        src += (4 * src_stride);
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1); /* first two rows, stored at dst */
+        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
+        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5);
+        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
+        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
+                     out0, out1, out2, out3);
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); /* shift right by rnd_val; rounding per macro name */
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(tmp0, dst);
+        dst += dst_stride;
+        tmp0 = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(tmp0, dst);
+        dst += dst_stride;
+
+        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1); /* last two rows, stored at dst */
+        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
+        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
+        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
+        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
+                     out0, out1, out2, out3);
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(tmp0, dst);
+        dst += dst_stride;
+        tmp0 = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(tmp0, dst);
+        dst += dst_stride;
+
+        /* 8 width */
+        VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1); /* only the src + 16 loads are used here */
+        VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
+        VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5);
+        VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);
+
+        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
+                     out0, out1, out2, out3);
+
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        tmp1 = PCKEV_XORI128_UB(out2, out3);
+        ST8x4_UB(tmp0, tmp1, dst1, dst_stride); /* all four rows of the right-hand 8 columns at once */
+        dst1 += (4 * dst_stride);
+    }
+}
+
+static void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride, /* horizontal 4-tap filter for 32-wide blocks, 2 rows per pass */
+                                 uint8_t *dst, int32_t dst_stride, /* two 16-byte stores per output row */
+                                 const int8_t *filter, int32_t height, /* loop runs height >> 1 times */
+                                 uint8_t rnd_val) /* final shift amount */
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 filt0, filt1, mask0, mask1;
+    v16u8 out;
+    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
+    v8i16 rnd_vec;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[0]); /* shuffle mask from the start of the shared mask table */
+    src -= 1; /* start one byte to the left of src */
+    rnd_vec = __msa_fill_h(rnd_val); /* rnd_val replicated into every halfword */
+
+    /* rearranging filter */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1); /* halfwords 0 and 1 of the filter, splatted */
+
+    mask1 = mask0 + 2; /* same shuffle pattern, two bytes further right */
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) { /* two rows per iteration */
+        src0 = LD_SB(src); /* first row: loads at offsets 0, 16 and 24 */
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 24);
+        src += src_stride;
+        src4 = LD_SB(src); /* second row: same three offsets */
+        src6 = LD_SB(src + 16);
+        src7 = LD_SB(src + 24);
+        SLDI_B2_SB(src2, src6, src0, src4, src1, src5, 8); /* src1/src5 built from the offset-0 and offset-16 loads; presumably the data at offset 8 - confirm */
+        src += src_stride;
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                                   filt0, filt1, out0, out1, out2, out3); /* first row */
+        HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
+                                   filt0, filt1, out4, out5, out6, out7); /* second row */
+        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec); /* shift right by rnd_val; rounding per macro name */
+        SRAR_H4_SH(out4, out5, out6, out7, rnd_vec);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out4, out5, out6, out7, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst + 16);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out4, out5);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(out6, out7);
+        ST_UB(out, dst + 16);
+        dst += dst_stride;
+    }
+}
+
+static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride, /* vertical 4-tap filter over a 4-wide, 2-row block */
+                                 uint8_t *dst, int32_t dst_stride, /* output stored as 4 bytes x 2 rows */
+                                 const int8_t *filter, uint8_t rnd_val) /* taps; rnd_val is the final shift amount */
+{
+    v16i8 src0, src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
+    v16i8 src2110, src4332, filt0, filt1;
+    v16u8 out;
+    v8i16 filt, out10;
+    v8i16 rnd_vec;
+
+    src -= src_stride; /* start one row above src */
+    rnd_vec = __msa_fill_h(rnd_val); /* rnd_val replicated into every halfword */
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1); /* halfwords 0 and 1 of the filter, splatted */
+
+    LD_SB3(src, src_stride, src0, src1, src2); /* rows 0-2 */
+    src += (3 * src_stride);
+
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); /* interleave adjacent rows: (1,0) and (2,1) */
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r); /* both row pairs in one vector */
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+    LD_SB2(src, src_stride, src3, src4); /* rows 3-4 */
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r); /* row pairs (3,2) and (4,3) */
+    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
+    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
+    out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);
+    out10 = __msa_srar_h(out10, rnd_vec); /* shift right by rnd_val with rounding */
+    out10 = __msa_sat_s_h(out10, 7); /* saturate to the signed 8-bit range */
+    out = PCKEV_XORI128_UB(out10, out10);
+    ST4x2_UB(out, dst, dst_stride);
+}
+
+static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride, /* vertical 4-tap filter for 4-wide blocks, 4 rows per pass */
+                                         uint8_t *dst, int32_t dst_stride, /* output stored as 4 bytes x 4 rows per pass */
+                                         const int8_t *filter, int32_t height, /* loop runs height >> 2 times */
+                                         uint8_t rnd_val) /* final shift amount */
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
+    v16i8 src2110, src4332, filt0, filt1;
+    v8i16 filt, out10, out32;
+    v16u8 out;
+    v8i16 rnd_vec;
+
+    src -= src_stride; /* start one row above src */
+    rnd_vec = __msa_fill_h(rnd_val); /* rnd_val replicated into every halfword */
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1); /* halfwords 0 and 1 of the filter, splatted */
+
+    LD_SB3(src, src_stride, src0, src1, src2); /* three rows loaded before the loop */
+    src += (3 * src_stride);
+
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); /* interleave adjacent rows: (1,0) and (2,1) */
+
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* four new rows in, four output rows out */
+        LD_SB3(src, src_stride, src3, src4, src5);
+        src += (3 * src_stride);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
+        src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
+        out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1); /* first two output rows */
+
+        src2 = LD_SB(src); /* fourth new row; src2 is the newest row for the next iteration too */
+        src += (src_stride);
+        ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
+        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r); /* src2110 reused: carries into the next iteration */
+        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+        out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1); /* last two output rows */
+        SRAR_H2_SH(out10, out32, rnd_vec); /* shift right by rnd_val; rounding per macro name */
+        SAT_SH2_SH(out10, out32, 7);
+        out = PCKEV_XORI128_UB(out10, out32);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_vt_4t_4w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height,
+                                uint8_t rnd_val)
+{
+    if (height != 2) { /* every other height: four-rows-per-pass kernel */
+        common_vt_4t_4x4multiple_msa(src, src_stride, dst, dst_stride, filter,
+                                     height, rnd_val);
+        return;
+    }
+    common_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
+}
+
+static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height,
+                                uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, out0, out1;
+    v8i16 vec01, vec12, vec23, vec30, tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt, filt0, filt1;
+    v8i16 rnd_vec;
+    /* vertical 4-tap filter, 4 output rows per loop pass (see ST6x4_UB) */
+    src -= src_stride; /* first row read is the one above src */
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* load the taps; SPLATI_H2_SH is given halfword indices 0 and 1 */
+    filt = LD_SH(filter);
+    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
+
+    LD_UB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    /* vecN hold the rows with the top bit of every byte flipped (xor 128) */
+    vec0 = (v16u8) __msa_xori_b((v16u8) src0, 128);
+    vec1 = (v16u8) __msa_xori_b((v16u8) src1, 128);
+    vec2 = (v16u8) __msa_xori_b((v16u8) src2, 128);
+    /* runs height >> 2 times; vec0..vec2 carry the last three rows over */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_UB4(src, src_stride, src3, src0, src1, src2);
+        src += (4 * src_stride);
+        /* output row 0: the three carried rows plus the first new row */
+        vec3 = (v16u8) __msa_xori_b((v16u8) src3, 128);
+        ILVR_B2_SH(vec1, vec0, vec3, vec2, vec01, vec23);
+        tmp0 = FILT_4TAP_DPADD_S_H(vec01, vec23, filt0, filt1);
+        /* output row 1: window advanced by one row, vec0 is overwritten */
+        vec0 = __msa_xori_b((v16u8) src0, 128);
+        ILVR_B2_SH(vec2, vec1, vec0, vec3, vec12, vec30);
+        tmp1 = FILT_4TAP_DPADD_S_H(vec12, vec30, filt0, filt1);
+        /* output row 2: reuses vec23 and pairs the two newest rows */
+        vec1 = __msa_xori_b((v16u8) src1, 128);
+        vec01 = (v8i16) __msa_ilvr_b((v16i8) vec1, (v16i8) vec0);
+        tmp2 = FILT_4TAP_DPADD_S_H(vec23, vec01, filt0, filt1);
+        /* output row 3: reuses vec30 and pairs the two newest rows */
+        vec2 = __msa_xori_b((v16u8) src2, 128);
+        vec12 = (v8i16) __msa_ilvr_b((v16i8) vec2, (v16i8) vec1);
+        tmp3 = FILT_4TAP_DPADD_S_H(vec30, vec12, filt0, filt1);
+
+        SRAR_H4_SH(tmp0, tmp1, tmp2, tmp3, rnd_vec);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST6x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, uint8_t rnd_val)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 src01, src12, src23, src34, tmp0, tmp1, filt, filt0, filt1;
+    v16u8 out;
+    v8i16 rnd_vec;
+    /* vertical 4-tap filter over 5 source rows; result stored by ST8x2_UB */
+    src -= src_stride; /* first row read is the one above src */
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* load the taps; SPLATI_H2_SH is given halfword indices 0 and 1 */
+    filt = LD_SH(filter);
+    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B2_SH(src1, src0, src3, src2, src01, src23);
+    tmp0 = FILT_4TAP_DPADD_S_H(src01, src23, filt0, filt1); /* rows 0..3 */
+    ILVR_B2_SH(src2, src1, src4, src3, src12, src34);
+    tmp1 = FILT_4TAP_DPADD_S_H(src12, src34, filt0, filt1); /* rows 1..4 */
+    SRAR_H2_SH(tmp0, tmp1, rnd_vec);
+    SAT_SH2_SH(tmp0, tmp1, 7);
+    out = PCKEV_XORI128_UB(tmp0, tmp1);
+    ST8x2_UB(out, dst, dst_stride);
+}
+
+static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    uint64_t out0, out1, out2;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
+    v8i16 filt, filt0, filt1;
+    v8i16 rnd_vec;
+    /* vertical 4-tap filter for 6 output rows: 2 passes of 3 rows each */
+    src -= src_stride; /* first row read is the one above src */
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* load the taps; SPLATI_H2_SH is given halfword indices 0 and 1 */
+    filt = LD_SH(filter);
+    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2);
+    /* vec0/vec2 enter each pass holding row pairs (0,1) and (1,2) */
+    for (loop_cnt = 2; loop_cnt--;) {
+        LD_SB3(src, src_stride, src3, src4, src5);
+        src += (3 * src_stride);
+
+        XORI_B3_128_SB(src3, src4, src5);
+        ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
+        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1);
+        tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt0, filt1);
+        tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt0, filt1);
+        SRAR_H3_SH(tmp0, tmp1, tmp2, rnd_vec);
+        SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
+        PCKEV_B2_SH(tmp1, tmp0, tmp2, tmp2, tmp0, tmp2);
+        XORI_B2_128_SH(tmp0, tmp2);
+        /* each output row is extracted as one 64-bit value and passed to SD */
+        out0 = __msa_copy_u_d((v2i64) tmp0, 0);
+        out1 = __msa_copy_u_d((v2i64) tmp0, 1);
+        out2 = __msa_copy_u_d((v2i64) tmp2, 0);
+        SD(out0, dst);
+        dst += dst_stride;
+        SD(out1, dst);
+        dst += dst_stride;
+        SD(out2, dst);
+        dst += dst_stride;
+        /* keep the last row and the two newest row pairs for the next pass */
+        src2 = src5;
+        vec0 = vec3;
+        vec2 = vec4;
+    }
+}
+
+static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height,
+                                     uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src7, src8, src9, src10;
+    v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+    v8i16 rnd_vec;
+    /* vertical 4-tap filter, 4 output rows per loop pass (see ST8x4_UB) */
+    src -= src_stride; /* first row read is the one above src */
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+    /* prime the window with the first three rows */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    /* runs height >> 2 times; src7..src10 are the four new rows of a pass */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
+                   src72_r, src87_r, src98_r, src109_r);
+        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src72_r, filt0, filt1);
+        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1);
+        out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1);
+        out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
+        SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* the two newest row pairs and the last row seed the next pass */
+        src10_r = src98_r;
+        src21_r = src109_r;
+        src2 = src10;
+    }
+}
+
+static void common_vt_4t_8w_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height,
+                                uint8_t rnd_val)
+{
+    if (height != 2 && height != 6) { /* generic four-rows-per-pass kernel */
+        common_vt_4t_8x4mult_msa(src, src_stride, dst, dst_stride,
+                                 filter, height, rnd_val);
+    } else if (height == 2) {
+        common_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
+    } else {
+        common_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
+    }
+}
+
+static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v16u8 out0, out1;
+    v8i16 src10, src21, src32, src43, src54, src65, src87, src109, src1211;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, filt, filt0, filt1;
+    v4u32 mask = { 2, 6, 2, 6 };
+    v8i16 rnd_vec;
+    /* vertical 4-tap filter, 4 output rows per pass: 8 columns + 4 at dst+8 */
+    /* load the taps; SPLATI_H2_SH is given halfword indices 0 and 1 */
+    filt = LD_SH(filter);
+    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
+
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    src -= src_stride; /* first row read is the one above src */
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    VSHF_W2_SB(src0, src1, src1, src2, mask, mask, vec0, vec1);
+    /* runs height >> 2 times; vecN feed the 4 columns stored at dst + 8 */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src3, src4, src5, src6);
+        ILVR_B2_SH(src1, src0, src3, src2, src10, src32);
+        VSHF_W2_SB(src2, src3, src3, src4, mask, mask, vec2, vec3);
+        VSHF_W2_SB(src4, src5, src5, src6, mask, mask, vec4, vec5);
+        tmp0 = FILT_4TAP_DPADD_S_H(src10, src32, filt0, filt1);
+        ILVR_B4_SH(src2, src1, src4, src3, src5, src4, src6, src5,
+                   src21, src43, src54, src65);
+        tmp1 = FILT_4TAP_DPADD_S_H(src21, src43, filt0, filt1);
+        tmp2 = FILT_4TAP_DPADD_S_H(src32, src54, filt0, filt1);
+        tmp3 = FILT_4TAP_DPADD_S_H(src43, src65, filt0, filt1);
+        ILVR_B3_SH(vec1, vec0, vec3, vec2, vec5, vec4, src87, src109, src1211);
+        tmp4 = FILT_4TAP_DPADD_S_H(src87, src109, filt0, filt1);
+        tmp5 = FILT_4TAP_DPADD_S_H(src109, src1211, filt0, filt1);
+        SRAR_H4_SH(tmp0, tmp1, tmp2, tmp3, rnd_vec);
+        SRAR_H2_SH(tmp4, tmp5, rnd_vec);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_SH2_SH(tmp4, tmp5, 7);
+        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        out0 = PCKEV_XORI128_UB(tmp4, tmp5);
+        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride);
+        dst += (4 * dst_stride);
+
+        /* carry the last three rows and their word-shuffled pairs over */
+        src0 = src4;
+        src1 = src5;
+        src2 = src6;
+        vec0 = vec4;
+        vec1 = vec5;
+    }
+}
+
+static void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
+    v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+    v8i16 rnd_vec;
+    /* vertical 4-tap filter, 4 full 16-byte output rows per loop pass */
+    src -= src_stride; /* first row read is the one above src */
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+    /* prime the window with the first three rows */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+    /* _r/_l come from ILVR/ILVL: presumably the two 8-column halves */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src3, src4, src5, src6);
+        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+                   src32_r, src43_r, src54_r, src65_r);
+        ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+                   src32_l, src43_l, src54_l, src65_l);
+        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
+        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
+        out2_r = FILT_4TAP_DPADD_S_H(src32_r, src54_r, filt0, filt1);
+        out3_r = FILT_4TAP_DPADD_S_H(src43_r, src65_r, filt0, filt1);
+        out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
+        out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
+        out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1);
+        out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1);
+        SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
+        SRAR_H4_SH(out0_l, out1_l, out2_l, out3_l, rnd_vec);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, tmp0, tmp1, tmp2, tmp3);
+        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* the two newest row pairs and the last row seed the next pass */
+        src10_r = src54_r;
+        src21_r = src65_r;
+        src10_l = src54_l;
+        src21_l = src65_l;
+        src2 = src6;
+    }
+}
+
+static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 uint8_t rnd_val)
+{
+    uint32_t loop_cnt;
+    uint64_t out0, out1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src11, filt0, filt1;
+    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
+    v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
+    v16u8 out;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
+    v8i16 rnd_vec;
+    /* vertical 4-tap filter: 16 bytes at dst plus 8 at dst + 16 per row */
+    src -= src_stride; /* first row read is the one above src */
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    /* 16 width: first three rows read at src */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    /* 8 width: first three rows read at src + 16 (only ILVR is used) */
+    LD_SB3(src + 16, src_stride, src6, src7, src8);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src6, src7, src8);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+    /* runs height >> 2 times; each pass is two halves of two rows each */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        /* 16 width */
+        LD_SB2(src, src_stride, src3, src4);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+
+        /* 8 width */
+        LD_SB2(src + 16, src_stride, src9, src10);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src9, src10);
+        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+
+        /* 16 width */
+        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
+        out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
+        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
+        out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
+
+        /* 8 width */
+        out2_r = FILT_4TAP_DPADD_S_H(src76_r, src98_r, filt0, filt1);
+        out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
+
+        /* 16 + 8 width: round, saturate and store the first two rows */
+        SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
+        SRAR_H2_SH(out0_l, out1_l, rnd_vec);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH2_SH(out0_l, out1_l, 7);
+        out = PCKEV_XORI128_UB(out0_r, out0_l);
+        ST_UB(out, dst);
+        PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);
+        XORI_B2_128_SH(out2_r, out3_r);
+        out0 = __msa_copy_u_d((v2i64) out2_r, 0);
+        out1 = __msa_copy_u_d((v2i64) out3_r, 0);
+        SD(out0, dst + 16);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out1_r, out1_l);
+        ST_UB(out, dst);
+        SD(out1, dst + 16);
+        dst += dst_stride;
+
+        /* 16 width: next two rows; src2 is reloaded here for the next pass */
+        LD_SB2(src, src_stride, src5, src2);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
+
+        /* 8 width: next two rows; src8 is reloaded here for the next pass */
+        LD_SB2(src + 16, src_stride, src11, src8);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src11, src8);
+        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
+
+        /* 16 width */
+        out0_r = FILT_4TAP_DPADD_S_H(src32_r, src10_r, filt0, filt1);
+        out0_l = FILT_4TAP_DPADD_S_H(src32_l, src10_l, filt0, filt1);
+        out1_r = FILT_4TAP_DPADD_S_H(src43_r, src21_r, filt0, filt1);
+        out1_l = FILT_4TAP_DPADD_S_H(src43_l, src21_l, filt0, filt1);
+
+        /* 8 width */
+        out2_r = FILT_4TAP_DPADD_S_H(src98_r, src76_r, filt0, filt1);
+        out3_r = FILT_4TAP_DPADD_S_H(src109_r, src87_r, filt0, filt1);
+
+        /* 16 + 8 width: same as above, 8-wide part stored via ST8x1_UB */
+        SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
+        SRAR_H2_SH(out0_l, out1_l, rnd_vec);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH2_SH(out0_l, out1_l, 7);
+        out = PCKEV_XORI128_UB(out0_r, out0_l);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(out2_r, out2_r);
+        ST8x1_UB(out, dst + 16);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out1_r, out1_l);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(out3_r, out3_r);
+        ST8x1_UB(out, dst + 16);
+        dst += dst_stride;
+    }
+}
+
+static void common_vt_4t_32w_mult_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter, int32_t height,
+                                      uint8_t rnd_val, int32_t width)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *dst_tmp, *src_tmp;
+    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+    v16i8 src10_l, src32_l, src76_l, src98_l;
+    v16i8 src21_l, src43_l, src87_l, src109_l;
+    v8i16 filt;
+    v16i8 filt0, filt1;
+    v8i16 rnd_vec;
+    v16u8 out;
+    /* vertical 4-tap filter over width >> 5 strips of 32 columns each */
+    src -= src_stride; /* first row read is the one above src */
+    rnd_vec = __msa_fill_h(rnd_val);
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    for (cnt = (width >> 5); cnt--;) {
+        dst_tmp = dst;
+        src_tmp = src;
+
+        /* 16 width: first three rows of this strip */
+        LD_SB3(src_tmp, src_stride, src0, src1, src2);
+        XORI_B3_128_SB(src0, src1, src2);
+
+        ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+        ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+        /* next 16 width: same three rows at src_tmp + 16 */
+        LD_SB3(src_tmp + 16, src_stride, src6, src7, src8);
+        src_tmp += (3 * src_stride);
+
+        XORI_B3_128_SB(src6, src7, src8);
+        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+        ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
+        /* runs height >> 1 times; two output rows per pass for both halves */
+        for (loop_cnt = (height >> 1); loop_cnt--;) {
+            /* 16 width */
+            LD_SB2(src_tmp, src_stride, src3, src4);
+            XORI_B2_128_SB(src3, src4);
+            ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+            ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+
+            /* 16 width */
+            out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
+            out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
+            out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
+            out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
+
+            /* 16 width */
+            SRAR_H4_SH(out0_r, out1_r, out0_l, out1_l, rnd_vec);
+            SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);
+            out = PCKEV_XORI128_UB(out0_r, out0_l);
+            ST_UB(out, dst_tmp);
+            out = PCKEV_XORI128_UB(out1_r, out1_l);
+            ST_UB(out, dst_tmp + dst_stride);
+            /* newest row pairs and last row become the window start */
+            src10_r = src32_r;
+            src21_r = src43_r;
+            src10_l = src32_l;
+            src21_l = src43_l;
+            src2 = src4;
+
+            /* next 16 width */
+            LD_SB2(src_tmp + 16, src_stride, src9, src10);
+            src_tmp += (2 * src_stride);
+            XORI_B2_128_SB(src9, src10);
+            ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+            ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
+
+            /* next 16 width */
+            out2_r = FILT_4TAP_DPADD_S_H(src76_r, src98_r, filt0, filt1);
+            out2_l = FILT_4TAP_DPADD_S_H(src76_l, src98_l, filt0, filt1);
+            out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
+            out3_l = FILT_4TAP_DPADD_S_H(src87_l, src109_l, filt0, filt1);
+
+            /* next 16 width */
+            SRAR_H4_SH(out2_r, out3_r, out2_l, out3_l, rnd_vec);
+            SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);
+            out = PCKEV_XORI128_UB(out2_r, out2_l);
+            ST_UB(out, dst_tmp + 16);
+            out = PCKEV_XORI128_UB(out3_r, out3_l);
+            ST_UB(out, dst_tmp + 16 + dst_stride);
+
+            dst_tmp += 2 * dst_stride;
+
+            src76_r = src98_r;
+            src87_r = src109_r;
+            src76_l = src98_l;
+            src87_l = src109_l;
+            src8 = src10;
+        }
+        /* advance to the next 32-column strip */
+        src += 32;
+        dst += 32;
+    }
+}
+
+static void common_vt_4t_32w_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 uint8_t rnd_val)
+{
+    common_vt_4t_32w_mult_msa(src, src_stride, dst, dst_stride,
+                              filter, height, rnd_val, 32); /* width = 32 */
+}
+
+static void hevc_hv_uni_4t_4x2_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y,
+                                   int32_t height)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4;
+    v4i32 dst0_r, dst1_r;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    /* 2-D 4-tap filter: filter_x per row, then filter_y; height is not read */
+    src -= (src_stride + 1); /* start one row up and one column left */
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* widen filter_y bytes to halfwords using a per-byte "negative" mask */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2; /* same shuffle pattern, two bytes further on */
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* 128 << 6 per lane; initial value of each row sum */
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    /* rows 0..2: horizontal pass accumulated on top of const_vec */
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+    LD_SB2(src, src_stride, src3, src4);
+    XORI_B2_128_SB(src3, src4);
+
+    /* row 3 */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+    /* first output row: vertical pass over rows 0..3 */
+    dst32_r = __msa_ilvr_h(dst3, dst2);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_r >>= 6;
+
+    /* row 4 */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+    /* second output row: vertical pass over rows 1..4 */
+    dst43_r = __msa_ilvr_h(dst4, dst3);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_r >>= 6;
+    /* narrow to halfwords, round by 6 bits, clip, then pack to bytes */
+    dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
+    dst0_r = (v4i32) __msa_srari_h((v8i16) dst0_r, 6);
+    dst0_r = (v4i32) CLIP_SH_0_255(dst0_r);
+    dst0_r = (v4i32) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
+
+    ST4x2_UB(dst0_r, dst, dst_stride);
+}
+
+static void hevc_hv_uni_4t_4x4_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y,
+                                   int32_t height)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
+    v8i16 out0_r, out1_r;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    /* 2-D 4-tap filter: filter_x per row, then filter_y; height is not read */
+    src -= (src_stride + 1); /* start one row up and one column left */
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* widen filter_y bytes to halfwords using a per-byte "negative" mask */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2; /* same shuffle pattern, two bytes further on */
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* 128 << 6 per lane; initial value of each row sum */
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    /* rows 0..2: horizontal pass accumulated on top of const_vec */
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+    LD_SB4(src, src_stride, src3, src4, src5, src6);
+    XORI_B4_128_SB(src3, src4, src5, src6);
+
+    /* row 3 */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+    /* first output row: vertical pass over rows 0..3 */
+    dst32_r = __msa_ilvr_h(dst3, dst2);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_r >>= 6;
+
+    /* row 4 */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+    /* second output row: vertical pass over rows 1..4 */
+    dst43_r = __msa_ilvr_h(dst4, dst3);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_r >>= 6;
+
+    /* row 5 */
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+    dst5 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+    /* third output row: dst10_r is reused to hold the (4,5) row pair */
+    dst10_r = __msa_ilvr_h(dst5, dst4);
+    dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+    dst2_r >>= 6;
+
+    /* row 6 */
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+    /* fourth output row: dst2/dst21_r are reused for row 6 and pair (5,6) */
+    dst21_r = __msa_ilvr_h(dst2, dst5);
+    dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+    dst3_r >>= 6;
+    /* narrow to halfwords, round by 6 bits, clip, then pack to bytes */
+    PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, out0_r, out1_r);
+    SRARI_H2_SH(out0_r, out1_r, 6);
+    CLIP_SH2_0_255(out0_r, out1_r);
+    out0_r = (v8i16) __msa_pckev_b((v16i8) out1_r, (v16i8) out0_r);
+
+    ST4x4_UB(out0_r, out0_r, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst,
+                                           int32_t dst_stride,
+                                           const int8_t *filter_x,
+                                           const int8_t *filter_y,
+                                           int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src6, src7, src8, src9, src10;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v8i16 out0_r, out1_r, out2_r, out3_r;
+
+    src -= (src_stride + 1);
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+
+    for (loop_cnt = height >> 3; loop_cnt--;) {
+        LD_SB8(src, src_stride,
+               src3, src4, src5, src6, src7, src8, src9, src10);
+        src += (8 * src_stride);
+
+        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+
+        /* row 3 */
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        dst32_r = __msa_ilvr_h(dst3, dst2);
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_r >>= 6;
+
+        /* row 4 */
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+        dst43_r = __msa_ilvr_h(dst4, dst3);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_r >>= 6;
+
+        /* row 5 */
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+        dst54_r = __msa_ilvr_h(dst5, dst4);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst2_r >>= 6;
+
+        /* row 6 */
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst6 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+
+        dst65_r = __msa_ilvr_h(dst6, dst5);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst3_r >>= 6;
+
+        /* row 7 */
+        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+        dst7 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+
+        dst76_r = __msa_ilvr_h(dst7, dst6);
+        dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+        dst4_r >>= 6;
+
+        /* row 8 */
+        VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
+        dst8 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
+
+        dst87_r = __msa_ilvr_h(dst8, dst7);
+        dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+        dst5_r >>= 6;
+
+        /* row 9 */
+        VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
+        dst9 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9);
+
+        dst10_r = __msa_ilvr_h(dst9, dst8);
+        dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1);
+        dst6_r >>= 6;
+
+        /* row 10 */
+        VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        dst21_r = __msa_ilvr_h(dst2, dst9);
+        dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1);
+        dst7_r >>= 6;
+
+        PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
+                    dst5_r, dst4_r, dst7_r, dst6_r,
+                    out0_r, out1_r, out2_r, out3_r);
+
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
+        CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);
+
+        PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
+        ST4x8_UB(out0_r, out1_r, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+/*
+ * 4-column 4-tap HV uni-prediction: pick the kernel that matches the block
+ * height (2, 4, or any multiple of 8).  Any other height is a silent no-op.
+ */
+static void hevc_hv_uni_4t_4w_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y, int32_t height)
+{
+    if (height == 2)
+        hevc_hv_uni_4t_4x2_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height);
+    else if (height == 4)
+        hevc_hv_uni_4t_4x4_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height);
+    else if (height % 8 == 0)
+        hevc_hv_uni_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
+                                       filter_x, filter_y, height);
+}
+/* 6-column 4-tap horizontal+vertical uni-prediction; four output rows per pass. */
+static void hevc_hv_uni_4t_6w_msa(uint8_t *src,
+                                  int32_t src_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y,
+                                  int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    v8i16 out0_r, out1_r, out2_r, out3_r;
+    /* Start one row above and one byte to the left of the block origin. */
+    src -= (src_stride + 1);
+    /* Horizontal taps: filt0/filt1 are taken from halfword lanes 0 and 1. */
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* Vertical taps: bytes widened to halfwords with a "< 0" mask as high byte. */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;   /* same byte pairs as mask0, two bytes further on */
+    /* const_vec = 128 << 6 per halfword; it seeds every horizontal sum below. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;   /* NOTE(review): presumably cancels the XOR-128 bias */
+    /* Prime the vertical window with the first three source rows. */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    /* Pair up neighbouring rows (1/0 and 2/1) for the vertical filter. */
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+    /* height >> 2 passes; a height that is not a multiple of 4 loses its tail. */
+    for (loop_cnt = height >> 2; loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src3, src4, src5, src6);
+
+        /* row 3 */
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+        dst0_r >>= 6;
+        dst0_l >>= 6;
+
+        /* row 4 */
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+        dst1_r >>= 6;
+        dst1_l >>= 6;
+
+        /* row 5 */
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+        /* dst10_* is recycled for the row 5/4 pair, ready for the next pass. */
+        ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+        dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
+
+        dst2_r >>= 6;
+        dst2_l >>= 6;
+
+        /* row 6 */
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        /* Likewise dst2 and dst21_* carry row 6 and the 6/5 pair forward. */
+        ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+        dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
+
+        dst3_r >>= 6;
+        dst3_l >>= 6;
+        /* Narrow to halfwords, SRARI by 6, clip to 0..255, pack to bytes. */
+        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
+                    dst2_l, dst2_r, dst3_l, dst3_r,
+                    out0_r, out1_r, out2_r, out3_r);
+
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
+        CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);
+
+        PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
+        ST6x4_UB(out0_r, out1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+/* 8x2 4-tap horizontal+vertical uni-prediction; the height argument is not read. */
+static void hevc_hv_uni_4t_8x2_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y,
+                                   int32_t height)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    v8i16 out0_r, out1_r;
+    /* Start one row above and one byte to the left of the block origin. */
+    src -= (src_stride + 1);
+    /* Horizontal taps: filt0/filt1 are taken from halfword lanes 0 and 1. */
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* Vertical taps: bytes widened to halfwords with a "< 0" mask as high byte. */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;   /* same byte pairs as mask0, two bytes further on */
+    /* const_vec = 128 << 6 per halfword; it seeds every horizontal sum below. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;   /* NOTE(review): presumably cancels the XOR-128 bias */
+    /* Rows 0..2 of the source window. */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+    /* Two more rows: five source rows in total feed the two output rows. */
+    LD_SB2(src, src_stride, src3, src4);
+    XORI_B2_128_SB(src3, src4);
+
+    /* row 3 */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+    dst0_r >>= 6;
+    dst0_l >>= 6;
+
+    /* row 4 */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    dst1_r >>= 6;
+    dst1_l >>= 6;
+    /* Narrow to halfwords, SRARI by 6, clip to 0..255, pack both rows to bytes. */
+    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
+    SRARI_H2_SH(out0_r, out1_r, 6);
+    CLIP_SH2_0_255(out0_r, out1_r);
+    out0_r = (v8i16) __msa_pckev_b((v16i8) out1_r, (v16i8) out0_r);
+
+    ST8x2_UB(out0_r, dst, dst_stride);
+}
+/* 8x6 4-tap horizontal+vertical uni-prediction, unrolled; height is not read. */
+static void hevc_hv_uni_4t_8x6_msa(uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y,
+                                   int32_t height)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
+    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
+    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
+    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
+    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
+    /* Start one row above and one byte to the left of the block origin. */
+    src -= (src_stride + 1);
+    /* Horizontal taps: filt0/filt1 are taken from halfword lanes 0 and 1. */
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* Vertical taps: bytes widened to halfwords with a "< 0" mask as high byte. */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;   /* same byte pairs as mask0, two bytes further on */
+    /* const_vec = 128 << 6 per halfword; it seeds every horizontal sum below. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;   /* NOTE(review): presumably cancels the XOR-128 bias */
+    /* Rows 0..2 of the source window; nine source rows are read in total. */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+    LD_SB2(src, src_stride, src3, src4);
+    src += (2 * src_stride);
+
+    XORI_B2_128_SB(src3, src4);
+
+    /* row 3 */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+
+    dst0_r >>= 6;
+    dst0_l >>= 6;
+
+    /* row 4 */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    dst1_r >>= 6;
+    dst1_l >>= 6;
+
+    LD_SB2(src, src_stride, src5, src6);
+    src += (2 * src_stride);
+
+    XORI_B2_128_SB(src5, src6);
+
+    /* row 5 */
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+    dst5 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
+    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+    dst2_r >>= 6;
+    dst2_l >>= 6;
+
+    /* row 6 */
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+    dst6 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+
+    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
+    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+    dst3_r >>= 6;
+    dst3_l >>= 6;
+
+    LD_SB2(src, src_stride, src7, src8);
+    src += (2 * src_stride);   /* src is not read again after this point */
+
+    XORI_B2_128_SB(src7, src8);
+
+    /* row 7 */
+    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+    dst7 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+
+    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
+
+    dst4_r >>= 6;
+    dst4_l >>= 6;
+
+    /* row 8 */
+    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
+    dst8 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
+
+    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
+    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
+    dst5_r >>= 6;
+    dst5_l >>= 6;
+    /* Narrow to halfwords, SRARI by 6, clip to 0..255, then pack to bytes. */
+    PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
+                dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
+    PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
+    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
+    SRARI_H2_SH(out4_r, out5_r, 6);
+    CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);
+    CLIP_SH2_0_255(out4_r, out5_r);
+
+    PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
+    out2_r = (v8i16) __msa_pckev_b((v16i8) out5_r, (v16i8) out4_r);
+    /* Output rows 0-3 come from out0_r/out1_r, rows 4-5 from out2_r. */
+    ST8x4_UB(out0_r, out1_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x2_UB(out2_r, dst, dst_stride);
+}
+/* 4-tap HV uni-prediction over width >> 3 strips of 8 columns, 4 rows per pass. */
+static void hevc_hv_uni_4t_8w_mult_msa(uint8_t *src,
+                                       int32_t src_stride,
+                                       uint8_t *dst,
+                                       int32_t dst_stride,
+                                       const int8_t *filter_x,
+                                       const int8_t *filter_y,
+                                       int32_t height,
+                                       int32_t width)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    v8i16 out0_r, out1_r, out2_r, out3_r;
+    /* Start one row above and one byte to the left of the block origin. */
+    src -= (src_stride + 1);
+    /* Horizontal taps: filt0/filt1 are taken from halfword lanes 0 and 1. */
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* Vertical taps: bytes widened to halfwords with a "< 0" mask as high byte. */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+
+    mask1 = mask0 + 2;   /* same byte pairs as mask0, two bytes further on */
+    /* const_vec = 128 << 6 per halfword; it seeds every horizontal sum below. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;   /* NOTE(review): presumably cancels the XOR-128 bias */
+    /* One outer pass per 8-column strip; columns beyond width & ~7 are skipped. */
+    for (cnt = width >> 3; cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+        /* Prime the vertical window with the strip's first three rows. */
+        LD_SB3(src_tmp, src_stride, src0, src1, src2);
+        src_tmp += (3 * src_stride);
+
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+        /* height >> 2 passes; a height not a multiple of 4 loses its tail. */
+        for (loop_cnt = height >> 2; loop_cnt--;) {
+            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
+            src_tmp += (4 * src_stride);
+
+            XORI_B4_128_SB(src3, src4, src5, src6);
+
+            /* row 3 */
+            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+            dst3 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+
+            dst0_r >>= 6;
+            dst0_l >>= 6;
+
+            /* row 4 */
+            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+            dst4 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+            dst1_r >>= 6;
+            dst1_l >>= 6;
+
+            /* row 5 */
+            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+            dst5 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+            /* dst10_* is recycled for the row 5/4 pair, ready for next pass. */
+            ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
+            dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+            dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
+
+            dst2_r >>= 6;
+            dst2_l >>= 6;
+
+            /* row 6 */
+            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+            dst2 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+            /* Likewise dst2 and dst21_* carry row 6 and the 6/5 pair forward. */
+            ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
+            dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+            dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
+
+            dst3_r >>= 6;
+            dst3_l >>= 6;
+            /* Narrow to halfwords, SRARI by 6, clip to 0..255, pack to bytes. */
+            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r,
+                        out0_r, out1_r, out2_r, out3_r);
+
+            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
+            CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);
+
+            PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
+            ST8x4_UB(out0_r, out1_r, dst_tmp, dst_stride);
+            dst_tmp += (4 * dst_stride);
+        }
+        /* Advance both pointers to the next 8-column strip. */
+        src += 8;
+        dst += 8;
+    }
+}
+
+/*
+ * 8-column 4-tap HV uni-prediction: heights 2 and 6 have dedicated kernels,
+ * multiples of 4 use the generic strip kernel.  Other heights do nothing.
+ */
+static void hevc_hv_uni_4t_8w_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  const int8_t *filter_x,
+                                  const int8_t *filter_y, int32_t height)
+{
+    if (height == 2)
+        hevc_hv_uni_4t_8x2_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height);
+    else if (height == 6)
+        hevc_hv_uni_4t_8x6_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height);
+    else if (height % 4 == 0)
+        hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, 8);
+}
+
+/* 12 columns = one 8-column strip followed by a 4-column strip at offset 8. */
+static void hevc_hv_uni_4t_12w_msa(uint8_t *src, int32_t src_stride,
+                                   uint8_t *dst, int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y, int32_t height)
+{
+    const int32_t left_w = 8;
+
+    hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, left_w);
+
+    hevc_hv_uni_4t_4w_msa(src + left_w, src_stride, dst + left_w, dst_stride,
+                          filter_x, filter_y, height);
+}
+
+/* 16 columns: the strip kernel walks the block in 8-column steps. */
+static void hevc_hv_uni_4t_16w_msa(uint8_t *src, int32_t src_stride,
+                                   uint8_t *dst, int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y, int32_t height)
+{
+    const int32_t width = 16;
+
+    hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, width);
+}
+
+/* 24 columns: the strip kernel walks the block in 8-column steps. */
+static void hevc_hv_uni_4t_24w_msa(uint8_t *src, int32_t src_stride,
+                                   uint8_t *dst, int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y, int32_t height)
+{
+    const int32_t width = 24;
+
+    hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, width);
+}
+
+/* 32 columns: the strip kernel walks the block in 8-column steps. */
+static void hevc_hv_uni_4t_32w_msa(uint8_t *src, int32_t src_stride,
+                                   uint8_t *dst, int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y, int32_t height)
+{
+    const int32_t width = 32;
+
+    hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, width);
+}
+/* Full-pel uni copy entry points; mx, my and width are accepted but unused. */
+#define UNI_MC_COPY(WIDTH)                                                 \
+void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
+                                                    ptrdiff_t dst_stride,  \
+                                                    uint8_t *src,          \
+                                                    ptrdiff_t src_stride,  \
+                                                    int height,            \
+                                                    intptr_t mx,           \
+                                                    intptr_t my,           \
+                                                    int width)             \
+{                                                                          \
+    copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height);     \
+}
+/* One exported wrapper per instantiated block width (8 through 64). */
+UNI_MC_COPY(8);
+UNI_MC_COPY(12);
+UNI_MC_COPY(16);
+UNI_MC_COPY(24);
+UNI_MC_COPY(32);
+UNI_MC_COPY(48);
+UNI_MC_COPY(64);
+
+#undef UNI_MC_COPY
+/* One-direction (h or v) uni MC wrappers around common_<DIR1>_<TAP>t_<W>w_msa. */
+#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                           \
+void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,           \
+                                                         ptrdiff_t             \
+                                                         dst_stride,           \
+                                                         uint8_t *src,         \
+                                                         ptrdiff_t             \
+                                                         src_stride,           \
+                                                         int height,           \
+                                                         intptr_t mx,          \
+                                                         intptr_t my,          \
+                                                         int width)            \
+{                                                                              \
+    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];              \
+                                                                               \
+    common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride,  \
+                                            filter, height, 6);                \
+}
+/* NOTE(review): the table index is FILT_DIR - 1, so mx/my must be >= 1 here. */
+UNI_MC(qpel, h, 4, 8, hz, mx);
+UNI_MC(qpel, h, 8, 8, hz, mx);
+UNI_MC(qpel, h, 12, 8, hz, mx);
+UNI_MC(qpel, h, 16, 8, hz, mx);
+UNI_MC(qpel, h, 24, 8, hz, mx);
+UNI_MC(qpel, h, 32, 8, hz, mx);
+UNI_MC(qpel, h, 48, 8, hz, mx);
+UNI_MC(qpel, h, 64, 8, hz, mx);
+
+UNI_MC(qpel, v, 4, 8, vt, my);
+UNI_MC(qpel, v, 8, 8, vt, my);
+UNI_MC(qpel, v, 12, 8, vt, my);
+UNI_MC(qpel, v, 16, 8, vt, my);
+UNI_MC(qpel, v, 24, 8, vt, my);
+UNI_MC(qpel, v, 32, 8, vt, my);
+UNI_MC(qpel, v, 48, 8, vt, my);
+UNI_MC(qpel, v, 64, 8, vt, my);
+
+UNI_MC(epel, h, 4, 4, hz, mx);
+UNI_MC(epel, h, 6, 4, hz, mx);
+UNI_MC(epel, h, 8, 4, hz, mx);
+UNI_MC(epel, h, 12, 4, hz, mx);
+UNI_MC(epel, h, 16, 4, hz, mx);
+UNI_MC(epel, h, 24, 4, hz, mx);
+UNI_MC(epel, h, 32, 4, hz, mx);
+
+UNI_MC(epel, v, 4, 4, vt, my);
+UNI_MC(epel, v, 6, 4, vt, my);
+UNI_MC(epel, v, 8, 4, vt, my);
+UNI_MC(epel, v, 12, 4, vt, my);
+UNI_MC(epel, v, 16, 4, vt, my);
+UNI_MC(epel, v, 24, 4, vt, my);
+UNI_MC(epel, v, 32, 4, vt, my);
+
+#undef UNI_MC
+/* Two-direction (hv) uni MC wrappers: x taps from mx - 1, y taps from my - 1. */
+#define UNI_MC_HV(PEL, DIR, WIDTH, TAP, DIR1)                           \
+void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,    \
+                                                         ptrdiff_t      \
+                                                         dst_stride,    \
+                                                         uint8_t *src,  \
+                                                         ptrdiff_t      \
+                                                         src_stride,    \
+                                                         int height,    \
+                                                         intptr_t mx,   \
+                                                         intptr_t my,   \
+                                                         int width)     \
+{                                                                       \
+    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];           \
+    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];           \
+                                                                        \
+    hevc_##DIR1##_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,     \
+                                              dst_stride, filter_x,     \
+                                              filter_y, height);        \
+}
+/* qpel (8-tap) for widths 4..64, then epel (4-tap) for widths 4..32. */
+UNI_MC_HV(qpel, hv, 4, 8, hv);
+UNI_MC_HV(qpel, hv, 8, 8, hv);
+UNI_MC_HV(qpel, hv, 12, 8, hv);
+UNI_MC_HV(qpel, hv, 16, 8, hv);
+UNI_MC_HV(qpel, hv, 24, 8, hv);
+UNI_MC_HV(qpel, hv, 32, 8, hv);
+UNI_MC_HV(qpel, hv, 48, 8, hv);
+UNI_MC_HV(qpel, hv, 64, 8, hv);
+
+UNI_MC_HV(epel, hv, 4, 4, hv);
+UNI_MC_HV(epel, hv, 6, 4, hv);
+UNI_MC_HV(epel, hv, 8, 4, hv);
+UNI_MC_HV(epel, hv, 12, 4, hv);
+UNI_MC_HV(epel, hv, 16, 4, hv);
+UNI_MC_HV(epel, hv, 24, 4, hv);
+UNI_MC_HV(epel, hv, 32, 4, hv);
+
+#undef UNI_MC_HV
diff --git a/libavcodec/mips/hevc_mc_uniw_msa.c b/libavcodec/mips/hevc_mc_uniw_msa.c
new file mode 100644
index 0000000..ce10f41
--- /dev/null
+++ b/libavcodec/mips/hevc_mc_uniw_msa.c
@@ -0,0 +1,4790 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "libavcodec/mips/hevcdsp_mips.h"
+#include "libavcodec/mips/hevc_macros_msa.h"
+
+#define HEVC_HV_UNIW_RND_CLIP4(in0, in1, in2, in3, wgt, offset, rnd,       \
+                               out0, out1, out2, out3)                     \
+{   /* outN = CLIP_SW_0_255(SRAR(inN * wgt, rnd) + offset), N = 0..3 */     \
+    MUL4(in0, wgt, in1, wgt, in2, wgt, in3, wgt, out0, out1, out2, out3);  \
+    SRAR_W4_SW(out0, out1, out2, out3, rnd); /* presumably rounding shift */ \
+    ADD4(out0, offset, out1, offset, out2, offset, out3, offset,           \
+         out0, out1, out2, out3);                                          \
+    out0 = CLIP_SW_0_255(out0);  /* name suggests a clamp to 0..255 */      \
+    out1 = CLIP_SW_0_255(out1);                                            \
+    out2 = CLIP_SW_0_255(out2);                                            \
+    out3 = CLIP_SW_0_255(out3);                                            \
+}
+
+#define HEVC_UNIW_RND_CLIP2(in0, in1, wgt, offset, rnd,              \
+                            out0_r, out1_r, out0_l, out1_l)          \
+{   /* two halfword vectors in -> four word vectors (_r/_l) out */    \
+    ILVR_H2_SW(in0, in0, in1, in1, out0_r, out1_r); /* self-pairing */ \
+    ILVL_H2_SW(in0, in0, in1, in1, out0_l, out1_l);                  \
+    DOTP_SH4_SW(out0_r, out1_r, out0_l, out1_l, wgt, wgt, wgt, wgt,  \
+                out0_r, out1_r, out0_l, out1_l);                     \
+    SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                 \
+    ADD4(out0_r, offset, out1_r, offset,                             \
+         out0_l, offset, out1_l, offset,                             \
+         out0_r, out1_r, out0_l, out1_l);                            \
+    out0_r = CLIP_SW_0_255(out0_r); /* name suggests clamp 0..255 */  \
+    out1_r = CLIP_SW_0_255(out1_r);                                  \
+    out0_l = CLIP_SW_0_255(out0_l);                                  \
+    out1_l = CLIP_SW_0_255(out1_l);                                  \
+}  /* NOTE(review): callers mask wgt to 16 bits, so DOTP of (x, x) with (w, 0) presumably gives x * w - confirm */
+
+#define HEVC_UNIW_RND_CLIP4(in0, in1, in2, in3, wgt, offset, rnd,  \
+                            out0_r, out1_r, out2_r, out3_r,        \
+                            out0_l, out1_l, out2_l, out3_l)        \
+{   /* HEVC_UNIW_RND_CLIP2 applied to (in0, in1), then (in2, in3) */ \
+    HEVC_UNIW_RND_CLIP2(in0, in1, wgt, offset, rnd,                \
+                        out0_r, out1_r, out0_l, out1_l);           \
+    HEVC_UNIW_RND_CLIP2(in2, in3, wgt, offset, rnd,                \
+                        out2_r, out3_r, out2_l, out3_l);           \
+}
+
+static void hevc_uniwgt_copy_4w_msa(uint8_t *src,
+                                    int32_t src_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    int32_t height,
+                                    int32_t weight,
+                                    int32_t offset,
+                                    int32_t rnd_val)
+{   /* 4-wide weighted copy (per the name): acts only when height is 2, 4 or a multiple of 8; otherwise nothing is stored */
+    v16i8 zero = { 0 };  /* paired with source bytes in the ILVR_B steps below */
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    weight = weight & 0x0000FFFF;  /* only the low 16 bits of weight are kept */
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    if (2 == height) {  /* two rows, handled in a single vector */
+        v16i8 src0, src1;
+        v8i16 dst0;
+        v4i32 dst0_r, dst0_l;
+
+        LD_SB2(src, src_stride, src0, src1);
+        src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0);  /* merge the first word of each row */
+        dst0 = (v8i16) __msa_ilvr_b(zero, src0);  /* bytes paired with zero -> halfwords */
+        dst0 <<= 6;  /* scale samples by 64 before weighting */
+
+        ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
+        DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
+        SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
+        ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
+        dst0_r = CLIP_SW_0_255(dst0_r);
+        dst0_l = CLIP_SW_0_255(dst0_l);
+
+        HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r);  /* result is written back into dst0_r */
+        ST4x2_UB(dst0_r, dst, dst_stride);
+    } else if (4 == height) {  /* four rows, two rows per vector */
+        v16i8 src0, src1, src2, src3;
+        v8i16 dst0, dst1;
+        v4i32 dst0_r, dst1_r;
+        v4i32 dst0_l, dst1_l;
+
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        ILVR_W2_SB(src1, src0, src3, src2, src0, src1);  /* src0/src1 are reused for the merged row pairs */
+        ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1);
+        dst0 <<= 6;
+        dst1 <<= 6;
+
+        HEVC_UNIW_RND_CLIP2(dst0, dst1, weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst0_l, dst1_l);
+
+        HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+        ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
+    } else if (0 == height % 8) {  /* eight rows per loop iteration */
+        uint32_t loop_cnt;
+        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+        v8i16 dst0, dst1, dst2, dst3;
+        v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
+        v4i32 dst0_l, dst1_l, dst2_l, dst3_l;
+
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD_SB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+            ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
+                       src0, src1, src2, src3);  /* eight rows folded into four vectors */
+            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       dst0, dst1, dst2, dst3);
+
+            SLLI_4V(dst0, dst1, dst2, dst3, 6);  /* same shift by 6 as the branches above */
+            HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                                weight_vec, offset_vec, rnd_vec,
+                                dst0_r, dst1_r, dst2_r, dst3_r,
+                                dst0_l, dst1_l, dst2_l, dst3_l);
+
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                            dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+            ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
+            dst += (8 * dst_stride);
+        }
+    }
+}
+
+static void hevc_uniwgt_copy_6w_msa(uint8_t *src,
+                                    int32_t src_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    int32_t height,
+                                    int32_t weight,
+                                    int32_t offset,
+                                    int32_t rnd_val)
+{   /* 6-wide weighted copy (per the name): rows are consumed in groups of 8, so height % 8 trailing rows are not processed */
+    uint32_t loop_cnt;
+    v16i8 zero = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    weight = weight & 0x0000FFFF;  /* only the low 16 bits of weight are kept */
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   dst0, dst1, dst2, dst3);  /* rows 0-3 paired with zero -> halfwords */
+        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
+                   dst4, dst5, dst6, dst7);  /* rows 4-7 */
+
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);  /* shift by 6 (x64) before weighting */
+        SLLI_4V(dst4, dst5, dst6, dst7, 6);
+        HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);  /* first four rows of the group */
+        dst += (4 * dst_stride);
+
+        HEVC_UNIW_RND_CLIP4(dst4, dst5, dst6, dst7,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);  /* the word temporaries are reused for rows 4-7 */
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);  /* last four rows of the group */
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_uniwgt_copy_8w_msa(uint8_t *src,
+                                    int32_t src_stride,
+                                    uint8_t *dst,
+                                    int32_t dst_stride,
+                                    int32_t height,
+                                    int32_t weight,
+                                    int32_t offset,
+                                    int32_t rnd_val)
+{   /* 8-wide weighted copy (per the name): acts only when height is 2, 6 or a multiple of 4; otherwise nothing is stored */
+    v16i8 zero = { 0 };
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    weight = weight & 0x0000FFFF;  /* only the low 16 bits of weight are kept */
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    if (2 == height) {  /* two rows, one vector each */
+        v16i8 src0, src1;
+        v8i16 dst0, dst1;
+        v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
+
+        LD_SB2(src, src_stride, src0, src1);
+        ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1);  /* bytes paired with zero -> halfwords */
+
+        dst0 <<= 6;  /* x64 before weighting */
+        dst1 <<= 6;
+        HEVC_UNIW_RND_CLIP2(dst0, dst1, weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst0_l, dst1_l);
+
+        HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+        ST8x2_UB(dst0_r, dst, dst_stride);
+    } else if (6 == height) {  /* six rows: stored as a 4-row batch plus a 2-row batch */
+        v16i8 src0, src1, src2, src3, src4, src5;
+        v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+        v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+        v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+
+        LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   dst0, dst1, dst2, dst3);
+        ILVR_B2_SH(zero, src4, zero, src5, dst4, dst5);
+
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        dst4 <<= 6;
+        dst5 <<= 6;
+        HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+        HEVC_UNIW_RND_CLIP2(dst4, dst5, weight_vec, offset_vec, rnd_vec,
+                            dst4_r, dst5_r, dst4_l, dst5_l);
+
+        HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                         dst2_l, dst2_r, dst3_l, dst3_r,
+                         dst4_l, dst4_r, dst5_l, dst5_r,
+                         dst0_r, dst1_r, dst2_r);  /* three packed result vectors */
+        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+        ST8x2_UB(dst2_r, dst, dst_stride);  /* remaining two rows */
+    } else if (0 == height % 4) {  /* four rows per loop iteration */
+        uint32_t loop_cnt;
+        v16i8 src0, src1, src2, src3;
+        v8i16 dst0, dst1, dst2, dst3;
+        v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+
+        for (loop_cnt = (height >> 2); loop_cnt--;) {
+            LD_SB4(src, src_stride, src0, src1, src2, src3);
+            src += (4 * src_stride);
+            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       dst0, dst1, dst2, dst3);
+
+            SLLI_4V(dst0, dst1, dst2, dst3, 6);
+            HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                                weight_vec, offset_vec, rnd_vec,
+                                dst0_r, dst1_r, dst2_r, dst3_r,
+                                dst0_l, dst1_l, dst2_l, dst3_l);
+
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                            dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+            ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+
+static void hevc_uniwgt_copy_12w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{   /* 12-wide weighted copy (per the name): rows are consumed in groups of 4, so height % 4 trailing rows are not processed */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+    v16i8 zero = { 0 };
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    weight = weight & 0x0000FFFF;  /* only the low 16 bits of weight are kept */
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   dst0, dst1, dst2, dst3);  /* right-half bytes of each row -> halfwords */
+
+        SLLI_4V(dst0, dst1, dst2, dst3, 6);
+        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);  /* presumably gathers the columns past the first 8 of two rows each - confirm */
+        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
+        dst4 <<= 6;  /* same x64 scaling as dst0..dst3 */
+        dst5 <<= 6;
+        HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+        HEVC_UNIW_RND_CLIP2(dst4, dst5, weight_vec, offset_vec, rnd_vec,
+                            dst4_r, dst5_r, dst4_l, dst5_l);
+
+        HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                         dst2_l, dst2_r, dst3_l, dst3_r,
+                         dst4_l, dst4_r, dst5_l, dst5_r,
+                         dst0_r, dst1_r, dst2_r);  /* three packed result vectors */
+        ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_uniwgt_copy_16multx4mult_msa(uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int32_t height,
+                                              int32_t weight,
+                                              int32_t offset,
+                                              int32_t rnd_val,
+                                              int32_t width)
+{   /* weighted copy over width >> 4 strips of 16 bytes, each walked in groups of 4 rows; remainders of width or height are not processed */
+    uint32_t loop_cnt, cnt;
+    uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    v16i8 src0, src1, src2, src3;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v16i8 zero = { 0 };
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    weight = weight & 0x0000FFFF;  /* only the low 16 bits of weight are kept */
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    for (cnt = width >> 4; cnt--;) {  /* one 16-byte-wide strip per pass */
+        src_tmp = src;  /* per-strip row cursors; src/dst keep the strip origin */
+        dst_tmp = dst;
+
+        for (loop_cnt = height >> 2; loop_cnt--;) {
+            LD_SB4(src_tmp, src_stride, src0, src1, src2, src3);
+            src_tmp += (4 * src_stride);
+            ILVR_B2_SH(zero, src0, zero, src1, tmp0, tmp1);  /* right halves of src0, src1 */
+            ILVL_B2_SH(zero, src0, zero, src1, tmp2, tmp3);  /* left halves of src0, src1 */
+
+            SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
+            HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                                weight_vec, offset_vec, rnd_vec,
+                                dst0_r, dst1_r, dst2_r, dst3_r,
+                                dst0_l, dst1_l, dst2_l, dst3_l);
+
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                            dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);  /* 0,2 then 1,3: regroups both halves of each row */
+
+            ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+
+            ILVR_B2_SH(zero, src2, zero, src3, tmp0, tmp1);  /* same steps for src2, src3 */
+            ILVL_B2_SH(zero, src2, zero, src3, tmp2, tmp3);
+
+            SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
+            HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                                weight_vec, offset_vec, rnd_vec,
+                                dst0_r, dst1_r, dst2_r, dst3_r,
+                                dst0_l, dst1_l, dst2_l, dst3_l);
+
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                            dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+
+            ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+        }
+
+        src += 16;  /* advance to the next 16-byte strip */
+        dst += 16;
+    }
+}
+
+static void hevc_uniwgt_copy_16w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{   /* thin wrapper: forwards every argument to the strip helper with width fixed at 16 */
+    hevc_uniwgt_copy_16multx4mult_msa(src, src_stride, dst, dst_stride,
+                                      height, weight, offset, rnd_val, 16);
+}
+
+static void hevc_uniwgt_copy_24w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{   /* 24 = 16 + 8: the strip helper covers bytes 0..15, the 8-wide helper starts at byte 16 */
+    hevc_uniwgt_copy_16multx4mult_msa(src, src_stride, dst, dst_stride,
+                                      height, weight, offset, rnd_val, 16);
+
+    hevc_uniwgt_copy_8w_msa(src + 16, src_stride, dst + 16, dst_stride,
+                            height, weight, offset, rnd_val);  /* same height, weight, offset and rnd_val as above */
+}
+
+static void hevc_uniwgt_copy_32w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{   /* thin wrapper: forwards every argument to the strip helper with width fixed at 32 (two 16-byte strips) */
+    hevc_uniwgt_copy_16multx4mult_msa(src, src_stride, dst, dst_stride,
+                                      height, weight, offset, rnd_val, 32);
+}
+
+static void hevc_uniwgt_copy_48w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{   /* thin wrapper: forwards every argument to the strip helper with width fixed at 48 (three 16-byte strips) */
+    hevc_uniwgt_copy_16multx4mult_msa(src, src_stride, dst, dst_stride,
+                                      height, weight, offset, rnd_val, 48);
+}
+
+static void hevc_uniwgt_copy_64w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{   /* thin wrapper: forwards every argument to the strip helper with width fixed at 64 (four 16-byte strips) */
+    hevc_uniwgt_copy_16multx4mult_msa(src, src_stride, dst, dst_stride,
+                                      height, weight, offset, rnd_val, 64);
+}
+
+static void hevc_hz_uniwgt_8t_4w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{   /* horizontal 8-tap filter + weighting, 4-wide (per the name): rows are consumed in groups of 8, so height % 8 trailing rows are not processed */
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };  /* adjacent-byte pairs; indices >= 16 presumably pick from the second VSHF operand - confirm */
+
+    src -= 3;  /* start reading 3 bytes before the given pointer */
+    weight = weight & 0x0000FFFF;  /* only the low 16 bits of weight are kept */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;  /* 128 << 6 seeds every accumulator; presumably undoes the bias of the XOR-128 step - confirm */
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);  /* halfwords 0..3 of the loaded filter, one per filtN */
+
+    mask1 = mask0 + 2;  /* each mask is mask0 advanced by a further 2 byte indices */
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);  /* two source vectors per shuffle: src0 + src1 feed dst0 */
+
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);  /* weight, round-shift, offset, clip */
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_hz_uniwgt_8t_8w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{   /* horizontal 8-tap filter + weighting, 8-wide (per the name): rows are consumed in groups of 4, so height % 4 trailing rows are not processed */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };  /* adjacent-byte pairs (0,1), (1,2), ... (7,8) */
+
+    src -= 3;  /* start reading 3 bytes before the given pointer */
+    weight = weight & 0x0000FFFF;  /* only the low 16 bits of weight are kept */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;  /* 128 << 6 seeds every accumulator; presumably undoes the bias of the XOR-128 step - confirm */
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);  /* halfwords 0..3 of the loaded filter, one per filtN */
+
+    mask1 = mask0 + 2;  /* each mask is mask0 advanced by a further 2 byte indices */
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);  /* same vector passed twice: one source vector per dstN */
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);  /* weight, round-shift, offset, clip */
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_uniwgt_8t_12w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{   /* 12 = 8 + 4: the 8-wide helper covers bytes 0..7, the 4-wide helper starts at byte 8 */
+    hevc_hz_uniwgt_8t_8w_msa(src, src_stride, dst, dst_stride,
+                             filter, height, weight, offset, rnd_val);
+    hevc_hz_uniwgt_8t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
+                             filter, height, weight, offset, rnd_val);  /* NOTE(review): this helper steps 8 rows at a time, the 8-wide one 4; presumably height is always a multiple of 8 here - confirm */
+}
+
+static void hevc_hz_uniwgt_8t_16w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{   /* horizontal 8-tap filter + weighting, 16-wide (per the name): rows are consumed in pairs, so an odd trailing row is not processed */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };  /* adjacent-byte pairs (0,1), (1,2), ... (7,8) */
+
+    src -= 3;  /* start reading 3 bytes before the given pointer */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;  /* 128 << 6 seeds every accumulator; presumably undoes the bias of the XOR-128 step - confirm */
+
+    weight = weight & 0x0000FFFF;  /* only the low 16 bits of weight are kept */
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);  /* halfwords 0..3 of the loaded filter, one per filtN */
+
+    mask1 = mask0 + 2;  /* each mask is mask0 advanced by a further 2 byte indices */
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, src_stride, src0, src2);  /* presumably src0 = first row, src2 = next row - confirm LD_SB2 order */
+        LD_SB2(src + 8, src_stride, src1, src3);  /* same two rows, read 8 bytes further right */
+        src += (2 * src_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);  /* same vector passed twice: one source vector per dstN */
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);  /* weight, round-shift, offset, clip */
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);  /* two full vectors stored, dst_stride apart */
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_hz_uniwgt_8t_24w_msa(uint8_t *src,  /* source bytes; rewound by 3 before loading */
+                                      int32_t src_stride,  /* added to src after each row is loaded */
+                                      uint8_t *dst,  /* destination bytes */
+                                      int32_t dst_stride,  /* step between stored rows */
+                                      const int8_t *filter,  /* taps; loaded once with LD_SH */
+                                      int32_t height,  /* row count; height >> 1 passes of 2 rows */
+                                      int32_t weight,  /* only the low 16 bits are kept */
+                                      int32_t offset,  /* broadcast into offset_vec */
+                                      int32_t rnd_val)  /* broadcast into rnd_vec */
+{  /* Per its name: horizontal 8-tap uni-weighted filter for a 24-column block. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };  /* adjacent byte pairs (0,1) .. (7,8) */
+    /* Read window starts 3 bytes to the left; accumulators are seeded with 128 << 6. */
+    src -= 3;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;  /* NOTE(review): presumably undoes the XORI-by-128 bias for taps summing to 64 - confirm */
+    /* weight is truncated to 16 bits, then weight/offset/rnd_val are broadcast to word lanes. */
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+    /* NOTE(review): presumably each splatted halfword holds two adjacent int8 taps - confirm. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* mask1..mask7 are mask0 advanced by 2, 4, ..., 14 byte positions. */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {  /* two rows per pass; an odd last row is not processed */
+        LD_SB2(src, 16, src0, src1);  /* first row: presumably vectors at src and src + 16 */
+        src += src_stride;
+        LD_SB2(src, 16, src2, src3);  /* second row, same layout */
+        src += src_stride;
+        XORI_B4_128_SB(src0, src1, src2, src3);  /* presumably XORs every byte with 128 (macro defined elsewhere) */
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        /* dst0..dst2 belong to the first row, dst3..dst5 to the second. */
+        dst0 = const_vec;  /* first row, taps drawn from src0 only */
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;  /* first row, window straddling src0 and src1 */
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;  /* first row, taps drawn from src1 only */
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;  /* second row, taps drawn from src2 only */
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst4 = const_vec;  /* second row, window straddling src2 and src3 */
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst4, dst4, dst4, dst4);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst5 = const_vec;  /* second row, taps drawn from src3 only */
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst5, dst5, dst5, dst5);
+        /* Presumably weight, offset, round and clip each accumulator into _r/_l word halves. */
+        HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+        HEVC_UNIW_RND_CLIP2(dst4, dst5, weight_vec, offset_vec, rnd_vec,
+                            dst4_r, dst5_r, dst4_l, dst5_l);
+        /* dst0_r <- dst0|dst1 (first row), dst1_r <- dst3|dst4 (second row), dst2_r <- dst2|dst5. */
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst3_l, dst3_r, dst4_l, dst4_r, dst0_r, dst1_r);
+        HEVC_PCK_SW_SB4(dst2_l, dst2_r, dst5_l, dst5_r, dst2_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);  /* presumably one vector per row at dst, dst + dst_stride */
+        ST8x2_UB(dst2_r, dst + 16, dst_stride);  /* presumably 8 bytes per row, starting at byte 16 */
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_hz_uniwgt_8t_32w_msa(uint8_t *src,  /* source bytes; rewound by 3 before loading */
+                                      int32_t src_stride,  /* added to src once per row */
+                                      uint8_t *dst,  /* destination bytes */
+                                      int32_t dst_stride,  /* added to dst once per row */
+                                      const int8_t *filter,  /* taps; loaded once with LD_SH */
+                                      int32_t height,  /* number of loop passes (one row each) */
+                                      int32_t weight,  /* only the low 16 bits are kept */
+                                      int32_t offset,  /* broadcast into offset_vec */
+                                      int32_t rnd_val)  /* broadcast into rnd_vec */
+{  /* Per its name: horizontal 8-tap uni-weighted filter for a 32-column block. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };  /* adjacent byte pairs (0,1) .. (7,8) */
+    /* Read window starts 3 bytes to the left; accumulators are seeded with 128 << 6. */
+    src -= 3;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;  /* NOTE(review): presumably undoes the XORI-by-128 bias for taps summing to 64 - confirm */
+    /* weight is truncated to 16 bits, then weight/offset/rnd_val are broadcast to word lanes. */
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+    /* NOTE(review): presumably each splatted halfword holds two adjacent int8 taps - confirm. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* mask1..mask7 are mask0 advanced by 2, 4, ..., 14 byte positions. */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = height; loop_cnt--;) {  /* one row per pass */
+        LD_SB2(src, 16, src0, src1);  /* presumably vectors at src and src + 16 */
+        src2 = LD_SB(src + 24);  /* extra vector starting at byte 24 feeds the last accumulator */
+        src += src_stride;
+        XORI_B3_128_SB(src0, src1, src2);  /* presumably XORs every byte with 128 (macro defined elsewhere) */
+        /* Four accumulators dst0..dst3, packed and stored below in that order. */
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;  /* taps drawn from src0 only */
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;  /* window straddling src0 and src1 */
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;  /* taps drawn from src1 only */
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;  /* taps drawn from the vector loaded at byte 24 */
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+        /* Presumably weight, offset, round and clip each accumulator into _r/_l word halves. */
+        HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+        /* dst0_r <- dst0|dst1, dst1_r <- dst2|dst3. */
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, 16);  /* step is 16 bytes, not dst_stride: both vectors land in one row */
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_uniwgt_8t_48w_msa(uint8_t *src,  /* source bytes; rewound by 3 before loading */
+                                      int32_t src_stride,  /* added to src once per row */
+                                      uint8_t *dst,  /* destination bytes */
+                                      int32_t dst_stride,  /* added to dst once per row */
+                                      const int8_t *filter,  /* taps; loaded once with LD_SH */
+                                      int32_t height,  /* number of loop passes (one row each) */
+                                      int32_t weight,  /* only the low 16 bits are kept */
+                                      int32_t offset,  /* broadcast into offset_vec */
+                                      int32_t rnd_val)  /* broadcast into rnd_vec */
+{  /* Per its name: horizontal 8-tap uni-weighted filter for a 48-column block. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };  /* adjacent byte pairs (0,1) .. (7,8) */
+    /* Read window starts 3 bytes to the left; accumulators are seeded with 128 << 6. */
+    src -= 3;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;  /* NOTE(review): presumably undoes the XORI-by-128 bias for taps summing to 64 - confirm */
+    /* weight is truncated to 16 bits, then weight/offset/rnd_val are broadcast to word lanes. */
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+    /* NOTE(review): presumably each splatted halfword holds two adjacent int8 taps - confirm. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* mask1..mask7 are mask0 advanced by 2, 4, ..., 14 byte positions. */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = height; loop_cnt--;) {  /* one row per pass */
+        LD_SB3(src, 16, src0, src1, src2);  /* presumably vectors at src, src + 16 and src + 32 */
+        src3 = LD_SB(src + 40);  /* extra vector starting at byte 40 feeds the last accumulator */
+        src += src_stride;
+        XORI_B4_128_SB(src0, src1, src2, src3);  /* presumably XORs every byte with 128 (macro defined elsewhere) */
+        /* Six accumulators dst0..dst5, packed and stored below in that order. */
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;  /* taps drawn from src0 only */
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;  /* window straddling src0 and src1 */
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;  /* taps drawn from src1 only */
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;  /* window straddling src1 and src2 */
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst4 = const_vec;  /* taps drawn from src2 only */
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst4, dst4, dst4, dst4);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst5 = const_vec;  /* taps drawn from the vector loaded at byte 40 */
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst5, dst5, dst5, dst5);
+        /* Presumably weight, offset, round and clip each accumulator into _r/_l word halves. */
+        HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_UNIW_RND_CLIP2(dst4, dst5, weight_vec, offset_vec, rnd_vec,
+                            dst4_r, dst5_r, dst4_l, dst5_l);
+        /* dst0_r <- dst0|dst1, dst1_r <- dst2|dst3, dst2_r <- dst4|dst5. */
+        HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                         dst2_l, dst2_r, dst3_l, dst3_r,
+                         dst4_l, dst4_r, dst5_l, dst5_r,
+                         dst0_r, dst1_r, dst2_r);
+        ST_SW2(dst0_r, dst1_r, dst, 16);  /* step is 16 bytes: both vectors land in the same row */
+        ST_SW(dst2_r, dst + 32);  /* third vector of the row at byte 32 */
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_uniwgt_8t_64w_msa(uint8_t *src,  /* source bytes; rewound by 3 before loading */
+                                      int32_t src_stride,  /* added to src once per row */
+                                      uint8_t *dst,  /* destination bytes */
+                                      int32_t dst_stride,  /* added to dst once per row */
+                                      const int8_t *filter,  /* taps; loaded once with LD_SH */
+                                      int32_t height,  /* number of outer passes (one row each) */
+                                      int32_t weight,  /* only the low 16 bits are kept */
+                                      int32_t offset,  /* broadcast into offset_vec */
+                                      int32_t rnd_val)  /* broadcast into rnd_vec */
+{  /* Per its name: horizontal 8-tap uni-weighted filter for a 64-column block, done as 2 x 32 per row. */
+    uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    uint32_t loop_cnt, cnt;
+    v16i8 src0, src1, src2;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };  /* adjacent byte pairs (0,1) .. (7,8) */
+    /* Read window starts 3 bytes to the left; accumulators are seeded with 128 << 6. */
+    src -= 3;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;  /* NOTE(review): presumably undoes the XORI-by-128 bias for taps summing to 64 - confirm */
+    /* weight is truncated to 16 bits, then weight/offset/rnd_val are broadcast to word lanes. */
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+    /* NOTE(review): presumably each splatted halfword holds two adjacent int8 taps - confirm. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* mask1..mask7 are mask0 advanced by 2, 4, ..., 14 byte positions. */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    mask4 = mask0 + 8;
+    mask5 = mask0 + 10;
+    mask6 = mask0 + 12;
+    mask7 = mask0 + 14;
+
+    for (loop_cnt = height; loop_cnt--;) {  /* one row per outer pass */
+        src_tmp = src;  /* per-row cursors; src/dst themselves only move by the strides */
+        dst_tmp = dst;
+
+        for (cnt = 2; cnt--;) {  /* two 32-byte halves of the row */
+            LD_SB2(src_tmp, 16, src0, src1);  /* presumably vectors at src_tmp and src_tmp + 16 */
+            src2 = LD_SB(src_tmp + 24);  /* extra vector starting at byte 24 of this half */
+            src_tmp += 32;
+            XORI_B3_128_SB(src0, src1, src2);  /* presumably XORs every byte with 128 (macro defined elsewhere) */
+            /* Four accumulators dst0..dst3, packed and stored below in that order. */
+            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst0 = const_vec;  /* taps drawn from src0 only */
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst0, dst0, dst0, dst0);
+            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
+                       vec0, vec1, vec2, vec3);
+            dst1 = const_vec;  /* window straddling src0 and src1 */
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst1, dst1, dst1, dst1);
+            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst2 = const_vec;  /* taps drawn from src1 only */
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst2, dst2, dst2, dst2);
+            VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst3 = const_vec;  /* taps drawn from the vector loaded at byte 24 */
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst3, dst3, dst3, dst3);
+            /* Presumably weight, offset, round and clip each accumulator into _r/_l word halves. */
+            HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                                weight_vec, offset_vec, rnd_vec,
+                                dst0_r, dst1_r, dst2_r, dst3_r,
+                                dst0_l, dst1_l, dst2_l, dst3_l);
+            /* dst0_r <- dst0|dst1, dst1_r <- dst2|dst3. */
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                            dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+            ST_SW2(dst0_r, dst1_r, dst_tmp, 16);  /* step is 16 bytes: both vectors land in the same row */
+            dst_tmp += 32;
+        }
+
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_uniwgt_8t_4w_msa(uint8_t *src,  /* source bytes; rewound by 3 rows before loading */
+                                     int32_t src_stride,  /* step between source rows */
+                                     uint8_t *dst,  /* destination bytes */
+                                     int32_t dst_stride,  /* step between stored rows */
+                                     const int8_t *filter,  /* taps; loaded once with LD_SH */
+                                     int32_t height,  /* row count; height >> 3 passes of 8 rows */
+                                     int32_t weight,  /* only the low 16 bits are kept */
+                                     int32_t offset,  /* broadcast into offset_vec */
+                                     int32_t rnd_val)  /* broadcast into rnd_vec */
+{  /* Per its name: vertical 8-tap uni-weighted filter for a 4-column block. */
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src9, src10, src11, src12, src13, src14;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
+    v16i8 src2110, src4332, src6554, src8776, src10998;
+    v16i8 src12111110, src14131312;
+    v8i16 dst10, dst32, dst54, dst76;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    /* Read window starts 3 rows above; accumulators are seeded with 128 << 6. */
+    src -= (3 * src_stride);
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;  /* NOTE(review): presumably undoes the XORI-by-128 bias for taps summing to 64 - confirm */
+    /* weight is truncated to 16 bits, then weight/offset/rnd_val are broadcast to word lanes. */
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+    /* NOTE(review): presumably each splatted halfword holds two adjacent int8 taps - confirm. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* Prime the filter with seven rows (src0..src6) before the main loop. */
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    /* srcAB_r: presumably the low bytes of rows A and B interleaved (names encode the row pair). */
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    /* Two row-pair vectors are merged into one, e.g. src2110 from src21_r and src10_r. */
+    ILVR_D3_SB(src21_r, src10_r, src43_r,
+               src32_r, src65_r, src54_r, src2110, src4332, src6554);
+    /* The sign flip is applied to the merged vectors here, not to the raw rows. */
+    XORI_B3_128_SB(src2110, src4332, src6554);
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {  /* 8 rows per pass; rows past a multiple of 8 are not processed */
+        LD_SB8(src, src_stride,
+               src7, src8, src9, src10, src11, src12, src13, src14);
+        src += (8 * src_stride);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
+                   src1110_r, src1211_r, src1312_r, src1413_r);
+        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
+                   src1413_r, src1312_r,
+                   src8776, src10998, src12111110, src14131312);
+        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
+        /* Each accumulator consumes four consecutive merged vectors, sliding by one per accumulator. */
+        dst10 = const_vec;
+        DPADD_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt1,
+                     filt2, filt3, dst10, dst10, dst10, dst10);
+        dst32 = const_vec;
+        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
+                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
+        dst54 = const_vec;
+        DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
+                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
+        dst76 = const_vec;
+        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
+                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
+        /* Presumably weight, offset, round and clip each accumulator into _r/_l word halves. */
+        HEVC_UNIW_RND_CLIP4(dst10, dst32, dst54, dst76,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+        /* dst0_r <- dst10|dst32, dst1_r <- dst54|dst76. */
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);  /* presumably 4 bytes to each of 8 rows */
+        dst += (8 * dst_stride);
+        /* Carry the last three merged vectors and the last raw row into the next pass. */
+        src2110 = src10998;
+        src4332 = src12111110;
+        src6554 = src14131312;
+        src6 = src14;  /* un-flipped row, re-interleaved with src7 next pass */
+    }
+}
+
+static void hevc_vt_uniwgt_8t_8w_msa(uint8_t *src,  /* source bytes; rewound by 3 rows before loading */
+                                     int32_t src_stride,  /* step between source rows */
+                                     uint8_t *dst,  /* destination bytes */
+                                     int32_t dst_stride,  /* step between stored rows */
+                                     const int8_t *filter,  /* taps; loaded once with LD_SH */
+                                     int32_t height,  /* row count; height >> 2 passes of 4 rows */
+                                     int32_t weight,  /* only the low 16 bits are kept */
+                                     int32_t offset,  /* broadcast into offset_vec */
+                                     int32_t rnd_val)  /* broadcast into rnd_vec */
+{  /* Per its name: vertical 8-tap uni-weighted filter for an 8-column block. */
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    /* Read window starts 3 rows above; accumulators are seeded with 128 << 6. */
+    src -= (3 * src_stride);
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;  /* NOTE(review): presumably undoes the XORI-by-128 bias for taps summing to 64 - confirm */
+    /* weight is truncated to 16 bits, then weight/offset/rnd_val are broadcast to word lanes. */
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+    /* NOTE(review): presumably each splatted halfword holds two adjacent int8 taps - confirm. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* Prime the filter with seven rows (src0..src6); here the raw rows are sign-flipped. */
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    /* srcAB_r: presumably the low bytes of rows A and B interleaved (names encode the row pair). */
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {  /* 4 rows per pass; rows past a multiple of 4 are not processed */
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+        /* tmp0..tmp3: one accumulator per output row, each fed four row-pair vectors. */
+        tmp0 = const_vec;
+        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
+                     filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
+        tmp1 = const_vec;
+        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
+                     filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
+        tmp2 = const_vec;
+        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
+                     filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
+                     filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3);
+        /* Presumably weight, offset, round and clip each accumulator into _r/_l word halves. */
+        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+        /* dst0_r <- tmp0|tmp1, dst1_r <- tmp2|tmp3. */
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);  /* presumably 8 bytes to each of 4 rows */
+        dst += (4 * dst_stride);
+        /* Slide the row-pair history down by four rows for the next pass. */
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src6 = src10;  /* last loaded row pairs with src7 next pass */
+    }
+}
+
+static void hevc_vt_uniwgt_8t_12w_msa(uint8_t *src,  /* source bytes; rewound by 3 rows before loading */
+                                      int32_t src_stride,  /* step between source rows */
+                                      uint8_t *dst,  /* destination bytes */
+                                      int32_t dst_stride,  /* step between stored rows */
+                                      const int8_t *filter,  /* taps; loaded once with LD_SH */
+                                      int32_t height,  /* row count; height >> 2 passes of 4 rows */
+                                      int32_t weight,  /* only the low 16 bits are kept */
+                                      int32_t offset,  /* broadcast into offset_vec */
+                                      int32_t rnd_val)  /* broadcast into rnd_vec */
+{  /* Per its name: vertical 8-tap uni-weighted filter for a 12-column block. */
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
+    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
+    v16i8 src2110, src4332, src6554, src8776, src10998;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    /* Read window starts 3 rows above; accumulators are seeded with 128 << 6. */
+    src -= (3 * src_stride);
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;  /* NOTE(review): presumably undoes the XORI-by-128 bias for taps summing to 64 - confirm */
+    /* weight is truncated to 16 bits, then weight/offset/rnd_val are broadcast to word lanes. */
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+    /* NOTE(review): presumably each splatted halfword holds two adjacent int8 taps - confirm. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* Prime the filter with seven sign-flipped rows (src0..src6). */
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    /* _r vectors: presumably low-half interleaves; _l: high-half interleaves, merged pairwise below. */
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_r, src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+               src10_l, src32_l, src54_l, src21_l);
+    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
+               src2110, src4332, src6554);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {  /* 4 rows per pass; rows past a multiple of 4 are not processed */
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_r, src87_r, src98_r, src109_r);
+        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
+                   src76_l, src87_l, src98_l, src109_l);
+        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
+        /* tmp0..tmp3 use the _r history (one per row); tmp4/tmp5 use the merged _l vectors. */
+        tmp0 = const_vec;
+        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
+                     filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
+        tmp1 = const_vec;
+        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
+                     filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
+        tmp2 = const_vec;
+        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
+                     filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
+                     filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3);
+        tmp4 = const_vec;
+        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
+                     filt0, filt1, filt2, filt3, tmp4, tmp4, tmp4, tmp4);
+        tmp5 = const_vec;
+        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
+                     filt0, filt1, filt2, filt3, tmp5, tmp5, tmp5, tmp5);
+        /* Presumably weight, offset, round and clip each accumulator into _r/_l word halves. */
+        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+        HEVC_UNIW_RND_CLIP2(tmp4, tmp5, weight_vec, offset_vec, rnd_vec,
+                            dst4_r, dst5_r, dst4_l, dst5_l);
+        /* dst0_r <- tmp0|tmp1, dst1_r <- tmp2|tmp3, dst2_r <- tmp4|tmp5. */
+        HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                         dst2_l, dst2_r, dst3_l, dst3_r,
+                         dst4_l, dst4_r, dst5_l, dst5_r,
+                         dst0_r, dst1_r, dst2_r);
+        ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);  /* presumably 12 bytes to each of 4 rows */
+        dst += (4 * dst_stride);
+        /* Slide both histories down by four rows for the next pass. */
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src2110 = src6554;
+        src4332 = src8776;
+        src6554 = src10998;
+        src6 = src10;  /* last loaded row pairs with src7 next pass */
+    }
+}
+
+static void hevc_vt_uniwgt_8t_16multx2mult_msa(uint8_t *src,  /* source bytes; rewound by 3 rows before loading */
+                                               int32_t src_stride,  /* step between source rows */
+                                               uint8_t *dst,  /* destination bytes */
+                                               int32_t dst_stride,  /* step between stored rows */
+                                               const int8_t *filter,  /* taps; loaded once with LD_SH */
+                                               int32_t height,  /* row count; height >> 1 passes of 2 rows */
+                                               int32_t weight,  /* only the low 16 bits are kept */
+                                               int32_t offset,  /* broadcast into offset_vec */
+                                               int32_t rnd_val,  /* broadcast into rnd_vec */
+                                               int32_t width)  /* column count; width >> 4 strips of 16 */
+{  /* Per its name: vertical 8-tap uni-weighted filter over 16-column strips, two rows at a time. */
+    uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    int32_t loop_cnt, cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v16i8 src10_l, src32_l, src54_l, src76_l;
+    v16i8 src21_l, src43_l, src65_l, src87_l;
+    v8i16 filt0, filt1, filt2, filt3;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    /* Read window starts 3 rows above; accumulators are seeded with 128 << 6. */
+    src -= (3 * src_stride);
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;  /* NOTE(review): presumably undoes the XORI-by-128 bias for taps summing to 64 - confirm */
+    /* weight is truncated to 16 bits, then weight/offset/rnd_val are broadcast to word lanes. */
+    weight = weight & 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+    /* NOTE(review): presumably each splatted halfword holds two adjacent int8 taps - confirm. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    for (cnt = (width >> 4); cnt--;) {  /* one 16-column strip per pass; columns past a multiple of 16 are skipped */
+        src_tmp = src;  /* per-strip row cursors */
+        dst_tmp = dst;
+        /* Prime the strip with seven sign-flipped rows and their low/high interleaves. */
+        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+        src_tmp += (7 * src_stride);
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+                   src10_r, src32_r, src54_r, src21_r);
+        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
+                   src10_l, src32_l, src54_l, src21_l);
+        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+        for (loop_cnt = (height >> 1); loop_cnt--;) {  /* 2 rows per pass; an odd last row is not processed */
+            LD_SB2(src_tmp, src_stride, src7, src8);
+            src_tmp += (2 * src_stride);
+            XORI_B2_128_SB(src7, src8);
+            ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+            ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
+            /* tmp0/tmp1: _r half of rows 0/1; tmp2/tmp3: _l half of rows 0/1. */
+            tmp0 = const_vec;
+            DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
+                         filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
+            tmp1 = const_vec;
+            DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
+                         filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
+            tmp2 = const_vec;
+            DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
+                         filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
+            tmp3 = const_vec;
+            DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
+                         filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3);
+            /* Presumably weight, offset, round and clip each accumulator into _r/_l word halves. */
+            HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                                weight_vec, offset_vec, rnd_vec,
+                                dst0_r, dst1_r, dst2_r, dst3_r,
+                                dst0_l, dst1_l, dst2_l, dst3_l);
+            /* Argument order pairs tmp0 with tmp2 (row 0) and tmp1 with tmp3 (row 1). */
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                            dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+            ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);  /* presumably one vector per row */
+            dst_tmp += (2 * dst_stride);
+            /* Slide both interleave histories down by two rows for the next pass. */
+            src10_r = src32_r;
+            src32_r = src54_r;
+            src54_r = src76_r;
+            src21_r = src43_r;
+            src43_r = src65_r;
+            src65_r = src87_r;
+            src10_l = src32_l;
+            src32_l = src54_l;
+            src54_l = src76_l;
+            src21_l = src43_l;
+            src43_l = src65_l;
+            src65_l = src87_l;
+            src6 = src8;  /* last loaded row pairs with src7 next pass */
+        }
+        /* Advance to the next 16-column strip. */
+        src += 16;
+        dst += 16;
+    }
+}
+
+static void hevc_vt_uniwgt_8t_16w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{  /* Thin wrapper: forwards every argument to the 16-column strip helper with width fixed at 16. */
+    hevc_vt_uniwgt_8t_16multx2mult_msa(src, src_stride, dst, dst_stride,
+                                       filter, height, weight,
+                                       offset, rnd_val, 16);  /* the helper runs width >> 4 = 1 strip */
+}
+
+static void hevc_vt_uniwgt_8t_24w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{  /* 24 columns = one 16-column strip plus an 8-column block starting at byte 16. */
+    hevc_vt_uniwgt_8t_16multx2mult_msa(src, src_stride, dst, dst_stride,
+                                       filter, height, weight,
+                                       offset, rnd_val, 16);  /* columns 0..15 */
+    /* NOTE(review): the strip helper steps 2 rows per pass, the 8-wide kernel 4 - confirm height is a multiple of 4. */
+    hevc_vt_uniwgt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride,
+                             filter, height, weight, offset, rnd_val);  /* columns 16..23 */
+}
+
+static void hevc_vt_uniwgt_8t_32w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{  /* Thin wrapper: forwards every argument to the 16-column strip helper with width fixed at 32. */
+    hevc_vt_uniwgt_8t_16multx2mult_msa(src, src_stride, dst, dst_stride,
+                                       filter, height, weight,
+                                       offset, rnd_val, 32);  /* the helper runs width >> 4 = 2 strips */
+}
+
+static void hevc_vt_uniwgt_8t_48w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{  /* Thin wrapper: forwards every argument to the 16-column strip helper with width fixed at 48. */
+    hevc_vt_uniwgt_8t_16multx2mult_msa(src, src_stride, dst, dst_stride,
+                                       filter, height, weight,
+                                       offset, rnd_val, 48);  /* the helper runs width >> 4 = 3 strips */
+}
+
+/*
+ * 64-wide variant: passes every argument through unchanged to
+ * hevc_vt_uniwgt_8t_16multx2mult_msa() and appends 64 as the last argument
+ * (presumably the block width in pixels; the callee is defined elsewhere in
+ * this file, confirm against its signature).
+ */
+static void hevc_vt_uniwgt_8t_64w_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter, int32_t height,
+                                      int32_t weight, int32_t offset,
+                                      int32_t rnd_val)
+{
+    const int32_t blk_w = 64;
+
+    hevc_vt_uniwgt_8t_16multx2mult_msa(src, src_stride, dst, dst_stride,
+                                       filter, height, weight, offset,
+                                       rnd_val, blk_w);
+}
+
+/*
+ * Two-pass (filter_x, then filter_y) 8-tap filtering followed by
+ * uni-directional weighting, storing 4 bytes x 2 rows per loop iteration
+ * (ST4x2_UB).
+ *
+ * src, src_stride : source samples and row pitch. Loading starts 3 rows
+ *                   above and 3 bytes to the left of src.
+ * dst, dst_stride : destination and row pitch.
+ * filter_x        : taps used by the first (per-row) pass.
+ * filter_y        : taps used by the second (across-rows) pass.
+ * height          : number of rows; the loop runs height >> 1 times, so a
+ *                   trailing odd row is never written.
+ * weight, offset, rnd_val :
+ *                   each filtered value is shifted right by 6, multiplied by
+ *                   weight, shifted right by rnd_val through SRAR_W2_SW,
+ *                   increased by offset and passed to CLIP_SW_0_255.
+ */
+static void hevc_hv_uniwgt_8t_4w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 filt0, filt1, filt2, filt3;
+    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
+    v4i32 dst0_r, dst1_r, weight_vec, offset_vec, rnd_vec;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    /* Two-operand shuffle pattern: indices 0..4 take bytes from one operand
+     * and 16..20 from the other, so a single shuffle yields four
+     * adjacent-byte pairs for each of two different rows. */
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+    /* Halfword order 0,4,1,5,2,6,3,7: interleaves elements 0..3 with
+     * elements 4..7 of a vector (applied to dst87 in the loop). */
+    v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
+
+    /* Step back 3 rows and 3 columns from the caller's position. */
+    src -= ((3 * src_stride) + 3);
+    /* First-pass taps: halfword elements 0..3 of the loaded vector go to
+     * filt0..filt3 (presumably broadcast by SPLATI_H4_SH, each halfword
+     * carrying two adjacent 8-bit taps; confirm against the macro). */
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    /* Second-pass taps: clti_s_b flags the negative bytes, and interleaving
+     * that flag byte with each tap byte sign-extends the taps to 16 bits. */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    /* Presumably splats 32-bit elements 0..3, i.e. one pair of widened taps
+     * per filt_hN; confirm against SPLATI_W4_SW. */
+    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    /* Same shuffle pattern advanced by 2, 4 and 6 bytes. */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    /* Every halfword = 128 << 6; this is the initial accumulator value of
+     * each first-pass dot product.
+     * NOTE(review): presumably cancels the bias introduced by the
+     * XORI_B*_128 macros on the source bytes; confirm. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    /* Prologue: load the first 7 rows needed before any output row. */
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+    /* First pass over the 7 preloaded rows, two rows per vector:
+     * (0,3), (1,4), (2,5) and (3,6), hence the dstXY names. */
+    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
+    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
+    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
+               vec8, vec9, vec10, vec11);
+    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
+               vec12, vec13, vec14, vec15);
+    dst30 = const_vec;
+    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                 dst30, dst30, dst30, dst30);
+    dst41 = const_vec;
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                 dst41, dst41, dst41, dst41);
+    dst52 = const_vec;
+    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                 dst52, dst52, dst52, dst52);
+    dst63 = const_vec;
+    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
+                 dst63, dst63, dst63, dst63);
+
+    /* Interleave neighbouring rows into the row-pair vectors consumed by
+     * the second pass: low halves give pairs 10/21/32 ... */
+    ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
+               dst10_r, dst21_r, dst32_r);
+
+    /* ... and high halves give pairs 43/54/65. */
+    dst43_r = __msa_ilvl_h(dst41, dst30);
+    dst54_r = __msa_ilvl_h(dst52, dst41);
+    dst65_r = __msa_ilvl_h(dst63, dst52);
+
+    /* Replicate 64-bit element 1 of dst63 into both halves. */
+    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
+
+    /* Two output rows per iteration. */
+    for (loop_cnt = height >> 1; loop_cnt--;) {
+        LD_SB2(src, src_stride, src7, src8);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src7, src8);
+
+        /* First pass for the two new rows, packed together in dst87. */
+        VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst87 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst87, dst87, dst87, dst87);
+        /* Second pass, first output row of this iteration. */
+        dst76_r = __msa_ilvr_h(dst87, dst66);
+        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+        /* Second pass, second output row: mask4 interleaves the two halves
+         * of dst87 to form the last row pair. */
+        dst87_r = __msa_vshf_h((v8i16) mask4, dst87, dst87);
+        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+
+        /* Weighting: drop 6 bits, scale by weight, shift right by rnd_val
+         * through SRAR_W2_SW (presumably with rounding), add offset, then
+         * CLIP_SW_0_255 (presumably clamps to 0..255). */
+        dst0_r >>= 6;
+        dst1_r >>= 6;
+        MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
+        SRAR_W2_SW(dst0_r, dst1_r, rnd_vec);
+        ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
+        dst0_r = CLIP_SW_0_255(dst0_r);
+        dst1_r = CLIP_SW_0_255(dst1_r);
+
+        /* Pack both rows into dst0_r and store 4 bytes on each of 2 rows. */
+        HEVC_PCK_SW_SB2(dst1_r, dst0_r, dst0_r);
+        ST4x2_UB(dst0_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        /* Slide the row-pair window down by two rows for the next pass. */
+        dst10_r = dst32_r;
+        dst32_r = dst54_r;
+        dst54_r = dst76_r;
+        dst21_r = dst43_r;
+        dst43_r = dst65_r;
+        dst65_r = dst87_r;
+        dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
+    }
+}
+
+/*
+ * Two-pass (filter_x, then filter_y) 8-tap filtering followed by
+ * uni-directional weighting for blocks handled as 8-column strips.
+ *
+ * The outer loop runs width >> 3 times, advancing src and dst by 8 bytes
+ * per strip; the inner loop runs height >> 1 times and stores 8 bytes on
+ * each of 2 rows per iteration (ST8x2_UB). Columns beyond a multiple of 8
+ * and a trailing odd row are therefore not processed.
+ *
+ * src, src_stride : source samples and row pitch. Loading starts 3 rows
+ *                   above and 3 bytes to the left of src.
+ * dst, dst_stride : destination and row pitch.
+ * filter_x        : taps used by the first (per-row) pass.
+ * filter_y        : taps used by the second (across-rows) pass.
+ * height, width   : block dimensions, consumed as described above.
+ * weight, offset, rnd_val :
+ *                   forwarded as splatted vectors to HEVC_HV_UNIW_RND_CLIP4
+ *                   after the filtered values have been shifted right by 6.
+ */
+static void hevc_hv_uniwgt_8t_8multx2mult_msa(uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter_x,
+                                              const int8_t *filter_y,
+                                              int32_t height,
+                                              int32_t weight,
+                                              int32_t offset,
+                                              int32_t rnd_val,
+                                              int32_t width)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 filt0, filt1, filt2, filt3;
+    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    /* Single-row shuffle pattern: eight adjacent-byte pairs (0,1) .. (7,8). */
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    /* Step back 3 rows and 3 columns from the caller's position. */
+    src -= ((3 * src_stride) + 3);
+    /* Every halfword = 128 << 6; initial accumulator value of each
+     * first-pass dot product.
+     * NOTE(review): presumably cancels the bias introduced by the
+     * XORI_B*_128 macros on the source bytes; confirm. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    /* First-pass taps: halfword elements 0..3 of the loaded vector go to
+     * filt0..filt3 (presumably broadcast by SPLATI_H4_SH; confirm). */
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    /* Second-pass taps: clti_s_b flags the negative bytes, and interleaving
+     * that flag byte with each tap byte sign-extends the taps to 16 bits. */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    /* Same shuffle pattern advanced by 2, 4 and 6 bytes. */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    /* One pass per 8-column strip. */
+    for (cnt = width >> 3; cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        /* Prologue for this strip: the first 7 rows are loaded and run
+         * through the first pass before any output row can be produced. */
+        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+        src_tmp += (7 * src_stride);
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+        /* First pass, rows 0..3 (each row is both shuffle operands). */
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec12, vec13, vec14, vec15);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        /* First pass, rows 4..6. */
+        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        dst4 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst4, dst4, dst4, dst4);
+        dst5 = const_vec;
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                     dst5, dst5, dst5, dst5);
+        dst6 = const_vec;
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                     dst6, dst6, dst6, dst6);
+
+        /* Interleave neighbouring rows into row-pair vectors for the second
+         * pass; the _r set comes from the low halves, the _l set from the
+         * high halves of the 8-element rows. */
+        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
+                   dst10_r, dst32_r, dst54_r, dst21_r);
+        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
+        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
+                   dst10_l, dst32_l, dst54_l, dst21_l);
+        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
+
+        /* Two output rows per iteration. */
+        for (loop_cnt = height >> 1; loop_cnt--;) {
+            LD_SB2(src_tmp, src_stride, src7, src8);
+            src_tmp += 2 * src_stride;
+            XORI_B2_128_SB(src7, src8);
+
+            /* First pass for row 7 of the current window. */
+            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst7 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst7, dst7, dst7, dst7);
+
+            /* Second pass, first output row (both halves), then >> 6. */
+            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_r >>= 6;
+            dst0_l >>= 6;
+
+            /* First pass for row 8 of the current window. */
+            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst8 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst8, dst8, dst8, dst8);
+
+            /* Second pass, second output row (both halves), then >> 6. */
+            ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
+            dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst1_r >>= 6;
+            dst1_l >>= 6;
+
+            /* Weight, round, offset and clip all four vectors in place
+             * (details live in HEVC_HV_UNIW_RND_CLIP4, defined elsewhere). */
+            HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l,
+                                   weight_vec, offset_vec, rnd_vec,
+                                   dst0_r, dst1_r, dst0_l, dst1_l);
+
+            /* Pack both rows into dst0_r and store 8 bytes on each of 2
+             * rows. */
+            HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+            ST8x2_UB(dst0_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+
+            /* Slide the row-pair window down by two rows; dst8 becomes the
+             * "row 6" of the next iteration. */
+            dst10_r = dst32_r;
+            dst32_r = dst54_r;
+            dst54_r = dst76_r;
+            dst10_l = dst32_l;
+            dst32_l = dst54_l;
+            dst54_l = dst76_l;
+            dst21_r = dst43_r;
+            dst43_r = dst65_r;
+            dst65_r = dst87_r;
+            dst21_l = dst43_l;
+            dst43_l = dst65_l;
+            dst65_l = dst87_l;
+            dst6 = dst8;
+        }
+
+        /* Next 8-column strip. */
+        src += 8;
+        dst += 8;
+    }
+}
+
+/*
+ * 8-wide case: one call to hevc_hv_uniwgt_8t_8multx2mult_msa() with
+ * width = 8. That routine iterates width >> 3 times over 8-column strips,
+ * so a single strip is processed.
+ */
+static void hevc_hv_uniwgt_8t_8w_msa(uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height, int32_t weight,
+                                     int32_t offset, int32_t rnd_val)
+{
+    const int32_t blk_w = 8;
+
+    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height,
+                                      weight, offset, rnd_val, blk_w);
+}
+
+/*
+ * 12-wide case, built from two calls: the 8-column strip routine with
+ * width = 8 for columns 0..7, then hevc_hv_uniwgt_8t_4w_msa() on src + 8 /
+ * dst + 8 for the remaining 4 columns. All other arguments are identical
+ * for both calls.
+ */
+static void hevc_hv_uniwgt_8t_12w_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height, int32_t weight,
+                                      int32_t offset, int32_t rnd_val)
+{
+    uint8_t *src_rest = src + 8;
+    uint8_t *dst_rest = dst + 8;
+
+    /* Columns 0..7. */
+    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height,
+                                      weight, offset, rnd_val, 8);
+
+    /* Columns 8..11. */
+    hevc_hv_uniwgt_8t_4w_msa(src_rest, src_stride, dst_rest, dst_stride,
+                             filter_x, filter_y, height,
+                             weight, offset, rnd_val);
+}
+
+/*
+ * 16-wide case: one call to hevc_hv_uniwgt_8t_8multx2mult_msa() with
+ * width = 16, i.e. two 8-column strips (the callee loops width >> 3 times).
+ */
+static void hevc_hv_uniwgt_8t_16w_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height, int32_t weight,
+                                      int32_t offset, int32_t rnd_val)
+{
+    const int32_t blk_w = 16;
+
+    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height,
+                                      weight, offset, rnd_val, blk_w);
+}
+
+/*
+ * 24-wide case: one call to hevc_hv_uniwgt_8t_8multx2mult_msa() with
+ * width = 24, i.e. three 8-column strips (the callee loops width >> 3
+ * times).
+ */
+static void hevc_hv_uniwgt_8t_24w_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height, int32_t weight,
+                                      int32_t offset, int32_t rnd_val)
+{
+    const int32_t blk_w = 24;
+
+    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height,
+                                      weight, offset, rnd_val, blk_w);
+}
+
+/*
+ * 32-wide case: one call to hevc_hv_uniwgt_8t_8multx2mult_msa() with
+ * width = 32, i.e. four 8-column strips (the callee loops width >> 3
+ * times).
+ */
+static void hevc_hv_uniwgt_8t_32w_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height, int32_t weight,
+                                      int32_t offset, int32_t rnd_val)
+{
+    const int32_t blk_w = 32;
+
+    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height,
+                                      weight, offset, rnd_val, blk_w);
+}
+
+/*
+ * 48-wide case: one call to hevc_hv_uniwgt_8t_8multx2mult_msa() with
+ * width = 48, i.e. six 8-column strips (the callee loops width >> 3 times).
+ */
+static void hevc_hv_uniwgt_8t_48w_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height, int32_t weight,
+                                      int32_t offset, int32_t rnd_val)
+{
+    const int32_t blk_w = 48;
+
+    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height,
+                                      weight, offset, rnd_val, blk_w);
+}
+
+/*
+ * 64-wide case: one call to hevc_hv_uniwgt_8t_8multx2mult_msa() with
+ * width = 64, i.e. eight 8-column strips (the callee loops width >> 3
+ * times).
+ */
+static void hevc_hv_uniwgt_8t_64w_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height, int32_t weight,
+                                      int32_t offset, int32_t rnd_val)
+{
+    const int32_t blk_w = 64;
+
+    hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height,
+                                      weight, offset, rnd_val, blk_w);
+}
+
+/*
+ * 4-tap filtering along a row plus uni-directional weighting for a single
+ * 4x2 block: two rows are loaded and ST4x2_UB stores 4 bytes on each of
+ * 2 rows.
+ *
+ * src, src_stride : source samples and row pitch; loading starts one byte
+ *                   to the left of src.
+ * dst, dst_stride : destination and row pitch.
+ * filter          : taps, read with LD_SH.
+ * height          : not read here; the 4-wide dispatcher in this file only
+ *                   selects this routine for height == 2.
+ * weight          : only the low 16 bits are used (see below).
+ * offset          : added after the rnd_val shift.
+ * rnd_val         : shift amount handed to SRAR_W2_SW.
+ */
+static void hevc_hz_uniwgt_4t_4x2_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, vec0, vec1;
+    v16i8 mask1;
+    v8i16 dst0;
+    v4i32 dst0_r, dst0_l;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    /* Two-operand shuffle pattern: indices 0..4 take bytes from one operand
+     * and 16..20 from the other, so both rows share one vector. */
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+
+    /* Start one sample to the left of the block. */
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    /* Keep only the low 16 bits so each 32-bit lane of weight_vec, viewed
+     * as two halfwords, is (weight, 0).
+     * NOTE(review): presumably this makes the halfword dot product below
+     * yield sample * weight; confirm against DOTP_SH2_SW. */
+    weight = weight & 0x0000FFFF;
+
+    /* Every halfword = 128 << 6; initial accumulator value for the filter. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    LD_SB2(src, src_stride, src0, src1);
+    XORI_B2_128_SB(src0, src1);
+
+    /* Filter both rows at once. */
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+    /* Weight, shift by rnd_val, add offset, clip. */
+    ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
+    DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
+    SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
+    ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
+    dst0_r = CLIP_SW_0_255(dst0_r);
+    dst0_l = CLIP_SW_0_255(dst0_l);
+
+    /* Pack and store; nothing follows the store, so dst is not advanced. */
+    HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r);
+    ST4x2_UB(dst0_r, dst, dst_stride);
+}
+
+/*
+ * 4-tap filtering along a row plus uni-directional weighting for a single
+ * 4x4 block: four rows are loaded and ST4x4_UB stores the result.
+ *
+ * src, src_stride : source samples and row pitch; loading starts one byte
+ *                   to the left of src.
+ * dst, dst_stride : destination and row pitch.
+ * filter          : taps, read with LD_SH.
+ * height          : not read here; the 4-wide dispatcher in this file only
+ *                   selects this routine for height == 4.
+ * weight          : only the low 16 bits are used.
+ * offset, rnd_val : forwarded as splatted vectors to HEVC_UNIW_RND_CLIP2.
+ */
+static void hevc_hz_uniwgt_4t_4x4_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask1, vec0, vec1;
+    v8i16 dst0, dst1;
+    v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    /* Two-operand shuffle pattern: indices 0..4 take bytes from one operand
+     * and 16..20 from the other, so two rows share one vector. */
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+
+    /* Start one sample to the left of the block. */
+    src -= 1;
+
+    /* rearranging filter */
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    /* Keep only the low 16 bits so each 32-bit lane of weight_vec, viewed
+     * as two halfwords, is (weight, 0). */
+    weight = weight & 0x0000FFFF;
+
+    /* Every halfword = 128 << 6; initial accumulator value for the filter. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+
+    /* Rows 0 and 1. */
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+    /* Rows 2 and 3. */
+    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+    HEVC_UNIW_RND_CLIP2(dst0, dst1, weight_vec, offset_vec, rnd_vec,
+                        dst0_r, dst1_r, dst0_l, dst1_l);
+
+    /* Pack and store; nothing follows the store, so dst is not advanced. */
+    HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+    ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
+}
+
+/*
+ * 4-tap filtering along a row plus uni-directional weighting for 4-wide
+ * blocks, processed 8 rows at a time: the loop runs height >> 3 times and
+ * each pass loads 8 rows and stores them with ST4x8_UB. Rows left over when
+ * height is not a multiple of 8 are not processed.
+ *
+ * Loading starts one byte to the left of src; only the low 16 bits of
+ * weight are used.
+ */
+static void hevc_hz_uniwgt_4t_4x8multiple_msa(uint8_t *src, int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height, int32_t weight,
+                                              int32_t offset, int32_t rnd_val)
+{
+    /* Two-operand shuffle pattern: indices 0..4 take bytes from one operand
+     * and 16..20 from the other, so two rows share one vector. */
+    v16i8 shuf0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+    v16i8 shuf1;
+    v16i8 row0, row1, row2, row3, row4, row5, row6, row7;
+    v16i8 pk0, pk1;
+    v8i16 coef_v, coef0, coef1, bias_v;
+    v8i16 acc0, acc1, acc2, acc3;
+    v4i32 res0_r, res1_r, res2_r, res3_r;
+    v4i32 res0_l, res1_l, res2_l, res3_l;
+    v4i32 wgt_v, off_v, rnd_v;
+    uint32_t iter;
+
+    src -= 1;
+
+    coef_v = LD_SH(filter);
+    SPLATI_H2_SH(coef_v, 0, 1, coef0, coef1);
+    shuf1 = shuf0 + 2;
+
+    /* Every halfword = 128 << 6; initial accumulator value for the filter. */
+    bias_v = __msa_ldi_h(128);
+    bias_v <<= 6;
+
+    weight &= 0x0000FFFF;
+    wgt_v = __msa_fill_w(weight);
+    off_v = __msa_fill_w(offset);
+    rnd_v = __msa_fill_w(rnd_val);
+
+    iter = (height >> 3);
+    while (iter--) {
+        LD_SB8(src, src_stride, row0, row1, row2, row3, row4, row5, row6, row7);
+        XORI_B8_128_SB(row0, row1, row2, row3, row4, row5, row6, row7);
+        src += (8 * src_stride);
+
+        /* Rows 0/1. */
+        VSHF_B2_SB(row0, row1, row0, row1, shuf0, shuf1, pk0, pk1);
+        acc0 = bias_v;
+        DPADD_SB2_SH(pk0, pk1, coef0, coef1, acc0, acc0);
+
+        /* Rows 2/3. */
+        VSHF_B2_SB(row2, row3, row2, row3, shuf0, shuf1, pk0, pk1);
+        acc1 = bias_v;
+        DPADD_SB2_SH(pk0, pk1, coef0, coef1, acc1, acc1);
+
+        /* Rows 4/5. */
+        VSHF_B2_SB(row4, row5, row4, row5, shuf0, shuf1, pk0, pk1);
+        acc2 = bias_v;
+        DPADD_SB2_SH(pk0, pk1, coef0, coef1, acc2, acc2);
+
+        /* Rows 6/7. */
+        VSHF_B2_SB(row6, row7, row6, row7, shuf0, shuf1, pk0, pk1);
+        acc3 = bias_v;
+        DPADD_SB2_SH(pk0, pk1, coef0, coef1, acc3, acc3);
+
+        HEVC_UNIW_RND_CLIP4(acc0, acc1, acc2, acc3,
+                            wgt_v, off_v, rnd_v,
+                            res0_r, res1_r, res2_r, res3_r,
+                            res0_l, res1_l, res2_l, res3_l);
+
+        /* Packed output lands in res0_r / res1_r. */
+        HEVC_PCK_SW_SB8(res0_l, res0_r, res1_l, res1_r,
+                        res2_l, res2_r, res3_l, res3_r, res0_r, res1_r);
+        ST4x8_UB(res0_r, res1_r, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+/*
+ * Height dispatcher for the 4-wide hz/4t uni-weighted routines.
+ *
+ *   height == 2        -> hevc_hz_uniwgt_4t_4x2_msa()
+ *   height == 4        -> hevc_hz_uniwgt_4t_4x4_msa()
+ *   height == 8 or 16  -> hevc_hz_uniwgt_4t_4x8multiple_msa()
+ *
+ * Any other height matches no case: the function returns without calling
+ * anything, so dst is left untouched.
+ */
+static void hevc_hz_uniwgt_4t_4w_msa(uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height,
+                                     int32_t weight, int32_t offset,
+                                     int32_t rnd_val)
+{
+    switch (height) {
+    case 2:
+        hevc_hz_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
+                                  filter, height, weight, offset, rnd_val);
+        break;
+    case 4:
+        hevc_hz_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
+                                  filter, height, weight, offset, rnd_val);
+        break;
+    case 8:
+    case 16:
+        hevc_hz_uniwgt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
+                                          filter, height, weight,
+                                          offset, rnd_val);
+        break;
+    default:
+        break;
+    }
+}
+
+/*
+ * 4-tap filtering along a row plus uni-directional weighting for 6-wide
+ * blocks, processed 4 rows at a time: the loop runs height >> 2 times and
+ * each pass loads 4 rows and stores them with ST6x4_UB. Rows left over when
+ * height is not a multiple of 4 are not processed.
+ *
+ * Loading starts one byte to the left of src; only the low 16 bits of
+ * weight are used.
+ */
+static void hevc_hz_uniwgt_4t_6w_msa(uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height,
+                                     int32_t weight, int32_t offset,
+                                     int32_t rnd_val)
+{
+    /* Single-row shuffle pattern: adjacent-byte pairs (0,1) .. (7,8). */
+    v16i8 shuf0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 shuf1;
+    v16i8 row0, row1, row2, row3;
+    v16i8 pk0, pk1;
+    v8i16 coef_v, coef0, coef1, bias_v;
+    v8i16 acc0, acc1, acc2, acc3;
+    v4i32 res0_r, res1_r, res2_r, res3_r;
+    v4i32 res0_l, res1_l, res2_l, res3_l;
+    v4i32 wgt_v, off_v, rnd_v;
+    uint32_t iter;
+
+    src -= 1;
+
+    coef_v = LD_SH(filter);
+    SPLATI_H2_SH(coef_v, 0, 1, coef0, coef1);
+    shuf1 = shuf0 + 2;
+
+    /* Every halfword = 128 << 6; initial accumulator value for the filter. */
+    bias_v = __msa_ldi_h(128);
+    bias_v <<= 6;
+
+    weight &= 0x0000FFFF;
+    wgt_v = __msa_fill_w(weight);
+    off_v = __msa_fill_w(offset);
+    rnd_v = __msa_fill_w(rnd_val);
+
+    iter = (height >> 2);
+    while (iter--) {
+        LD_SB4(src, src_stride, row0, row1, row2, row3);
+        XORI_B4_128_SB(row0, row1, row2, row3);
+        src += (4 * src_stride);
+
+        /* One filtered vector per row. */
+        VSHF_B2_SB(row0, row0, row0, row0, shuf0, shuf1, pk0, pk1);
+        acc0 = bias_v;
+        DPADD_SB2_SH(pk0, pk1, coef0, coef1, acc0, acc0);
+
+        VSHF_B2_SB(row1, row1, row1, row1, shuf0, shuf1, pk0, pk1);
+        acc1 = bias_v;
+        DPADD_SB2_SH(pk0, pk1, coef0, coef1, acc1, acc1);
+
+        VSHF_B2_SB(row2, row2, row2, row2, shuf0, shuf1, pk0, pk1);
+        acc2 = bias_v;
+        DPADD_SB2_SH(pk0, pk1, coef0, coef1, acc2, acc2);
+
+        VSHF_B2_SB(row3, row3, row3, row3, shuf0, shuf1, pk0, pk1);
+        acc3 = bias_v;
+        DPADD_SB2_SH(pk0, pk1, coef0, coef1, acc3, acc3);
+
+        HEVC_UNIW_RND_CLIP4(acc0, acc1, acc2, acc3,
+                            wgt_v, off_v, rnd_v,
+                            res0_r, res1_r, res2_r, res3_r,
+                            res0_l, res1_l, res2_l, res3_l);
+
+        /* Packed output lands in res0_r / res1_r. */
+        HEVC_PCK_SW_SB8(res0_l, res0_r, res1_l, res1_r,
+                        res2_l, res2_r, res3_l, res3_r, res0_r, res1_r);
+
+        ST6x4_UB(res0_r, res1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+/*
+ * 4-tap filtering along a row plus uni-directional weighting for a single
+ * 8x2 block: two rows are loaded and ST8x2_UB stores 8 bytes on each of
+ * 2 rows. height is not read by this routine.
+ *
+ * Loading starts one byte to the left of src; only the low 16 bits of
+ * weight are used.
+ */
+static void hevc_hz_uniwgt_4t_8x2_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter, int32_t height,
+                                      int32_t weight, int32_t offset,
+                                      int32_t rnd_val)
+{
+    /* Single-row shuffle pattern: adjacent-byte pairs (0,1) .. (7,8). */
+    v16i8 shuf0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 shuf1;
+    v16i8 row0, row1;
+    v16i8 pk0, pk1;
+    v8i16 coef_v, coef0, coef1, bias_v;
+    v8i16 acc0, acc1;
+    v4i32 res0_r, res1_r, res0_l, res1_l;
+    v4i32 wgt_v, off_v, rnd_v;
+
+    src -= 1;
+
+    coef_v = LD_SH(filter);
+    SPLATI_H2_SH(coef_v, 0, 1, coef0, coef1);
+    shuf1 = shuf0 + 2;
+
+    /* Every halfword = 128 << 6; initial accumulator value for the filter. */
+    bias_v = __msa_ldi_h(128);
+    bias_v <<= 6;
+
+    weight &= 0x0000FFFF;
+    wgt_v = __msa_fill_w(weight);
+    off_v = __msa_fill_w(offset);
+    rnd_v = __msa_fill_w(rnd_val);
+
+    LD_SB2(src, src_stride, row0, row1);
+    XORI_B2_128_SB(row0, row1);
+
+    /* Row 0. */
+    VSHF_B2_SB(row0, row0, row0, row0, shuf0, shuf1, pk0, pk1);
+    acc0 = bias_v;
+    DPADD_SB2_SH(pk0, pk1, coef0, coef1, acc0, acc0);
+
+    /* Row 1. */
+    VSHF_B2_SB(row1, row1, row1, row1, shuf0, shuf1, pk0, pk1);
+    acc1 = bias_v;
+    DPADD_SB2_SH(pk0, pk1, coef0, coef1, acc1, acc1);
+
+    HEVC_UNIW_RND_CLIP2(acc0, acc1, wgt_v, off_v, rnd_v,
+                        res0_r, res1_r, res0_l, res1_l);
+
+    /* Packed output lands in res0_r. */
+    HEVC_PCK_SW_SB4(res0_l, res0_r, res1_l, res1_r, res0_r);
+    ST8x2_UB(res0_r, dst, dst_stride);
+}
+
+/*
+ * 4-tap filtering along a row plus uni-directional weighting for a single
+ * 8x6 block: six rows are loaded once, then stored as 8x4 (ST8x4_UB)
+ * followed by 8x2 (ST8x2_UB).
+ *
+ * src, src_stride : source samples and row pitch; loading starts one byte
+ *                   to the left of src.
+ * dst, dst_stride : destination and row pitch.
+ * filter          : taps, read with LD_SH.
+ * height          : not read by this routine.
+ * weight          : only the low 16 bits are used.
+ * offset, rnd_val : forwarded as splatted vectors to the
+ *                   HEVC_UNIW_RND_CLIP* macros.
+ */
+static void hevc_hz_uniwgt_4t_8x6_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    /* Single-row shuffle pattern: adjacent-byte pairs (0,1) .. (7,8). */
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, const_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    /* Start one sample to the left of the block. */
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    /* Keep only the low 16 bits so each 32-bit lane of weight_vec, viewed
+     * as two halfwords, is (weight, 0). */
+    weight = weight & 0x0000FFFF;
+    /* Every halfword = 128 << 6; initial accumulator value for the filter. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    mask1 = mask0 + 2;
+
+    /* All six rows are loaded with a single LD_SB6. */
+    LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);
+    XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
+
+    /* One filtered vector per row. */
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+    dst5 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+    /* Weighting: rows 0..3 in one macro call, rows 4..5 in another. */
+    HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
+                        weight_vec, offset_vec, rnd_vec,
+                        dst0_r, dst1_r, dst2_r, dst3_r,
+                        dst0_l, dst1_l, dst2_l, dst3_l);
+
+    HEVC_UNIW_RND_CLIP2(dst4, dst5, weight_vec, offset_vec, rnd_vec,
+                        dst4_r, dst5_r, dst4_l, dst5_l);
+
+    /* Packed output lands in dst0_r, dst1_r and dst2_r. */
+    HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                     dst2_l, dst2_r, dst3_l, dst3_r,
+                     dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
+
+    /* Rows 0..3, then rows 4..5. */
+    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x2_UB(dst2_r, dst, dst_stride);
+}
+
+/*
+ * Horizontal 4-tap weighted filter for 8-byte-wide blocks.
+ *
+ * Handles four rows per iteration and runs (height >> 2) iterations, so
+ * rows beyond a multiple of four are not processed.
+ */
+static void hevc_hz_uniwgt_4t_8x4multiple_msa(uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height,
+                                              int32_t weight,
+                                              int32_t offset,
+                                              int32_t rnd_val)
+{
+    uint32_t quads_left = height >> 2;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v16i8 row0, row1, row2, row3;
+    v16i8 pairs01, pairs23;
+    v8i16 coeffs, bias;
+    v8i16 filt0, filt1;
+    v8i16 acc0, acc1, acc2, acc3;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l;
+
+    /* the filter window begins one byte left of the output position */
+    src--;
+
+    coeffs = LD_SH(filter);
+    SPLATI_H2_SH(coeffs, 0, 1, filt0, filt1);
+    mask1 = mask0 + 2;
+
+    /* every accumulator starts from 128 << 6 */
+    bias = __msa_ldi_h(128);
+    bias <<= 6;
+
+    weight &= 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec    = __msa_fill_w(rnd_val);
+
+    while (quads_left--) {
+        LD_SB4(src, src_stride, row0, row1, row2, row3);
+        XORI_B4_128_SB(row0, row1, row2, row3);
+        src += 4 * src_stride;
+
+        acc0 = bias;
+        VSHF_B2_SB(row0, row0, row0, row0, mask0, mask1, pairs01, pairs23);
+        DPADD_SB2_SH(pairs01, pairs23, filt0, filt1, acc0, acc0);
+
+        acc1 = bias;
+        VSHF_B2_SB(row1, row1, row1, row1, mask0, mask1, pairs01, pairs23);
+        DPADD_SB2_SH(pairs01, pairs23, filt0, filt1, acc1, acc1);
+
+        acc2 = bias;
+        VSHF_B2_SB(row2, row2, row2, row2, mask0, mask1, pairs01, pairs23);
+        DPADD_SB2_SH(pairs01, pairs23, filt0, filt1, acc2, acc2);
+
+        acc3 = bias;
+        VSHF_B2_SB(row3, row3, row3, row3, mask0, mask1, pairs01, pairs23);
+        DPADD_SB2_SH(pairs01, pairs23, filt0, filt1, acc3, acc3);
+
+        HEVC_UNIW_RND_CLIP4(acc0, acc1, acc2, acc3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        /* packed bytes land in dst0_r / dst1_r */
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+
+        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += 4 * dst_stride;
+    }
+}
+
+/*
+ * Dispatch the 8-wide horizontal 4-tap weighted filter by block height:
+ * dedicated routines for heights 2 and 6, the four-rows-per-iteration
+ * routine for every other height.
+ */
+static void hevc_hz_uniwgt_4t_8w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    switch (height) {
+    case 2:
+        hevc_hz_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
+                                  filter, height, weight, offset, rnd_val);
+        break;
+    case 6:
+        hevc_hz_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
+                                  filter, height, weight, offset, rnd_val);
+        break;
+    default:
+        hevc_hz_uniwgt_4t_8x4multiple_msa(src, src_stride, dst, dst_stride,
+                                          filter, height, weight, offset,
+                                          rnd_val);
+        break;
+    }
+}
+
+/*
+ * Horizontal 4-tap weighted filter for 12-byte-wide blocks.
+ *
+ * Four rows per iteration, (height >> 2) iterations.  The first eight
+ * columns of each row are filtered on their own; columns 8..11 of two rows
+ * are gathered into one vector through mask2/mask3 and filtered together.
+ */
+static void hevc_hz_uniwgt_4t_12w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    uint32_t quads_left = height >> 2;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    /* indices of 16 and above select from the other shuffle operand */
+    v16i8 mask2 = { 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+    };
+    v16i8 mask1, mask3;
+    v16i8 row0, row1, row2, row3;
+    v16i8 pairs01, pairs23;
+    v8i16 coeffs, bias;
+    v8i16 filt0, filt1;
+    v8i16 acc0, acc1, acc2, acc3, acc4, acc5;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+
+    /* the filter window begins one byte left of the output position */
+    src--;
+
+    coeffs = LD_SH(filter);
+    SPLATI_H2_SH(coeffs, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+    mask3 = mask2 + 2;
+
+    /* every accumulator starts from 128 << 6 */
+    bias = __msa_ldi_h(128);
+    bias <<= 6;
+
+    weight &= 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec    = __msa_fill_w(rnd_val);
+
+    while (quads_left--) {
+        LD_SB4(src, src_stride, row0, row1, row2, row3);
+        XORI_B4_128_SB(row0, row1, row2, row3);
+        src += 4 * src_stride;
+
+        /* columns 0..7 of each of the four rows */
+        acc0 = bias;
+        VSHF_B2_SB(row0, row0, row0, row0, mask0, mask1, pairs01, pairs23);
+        DPADD_SB2_SH(pairs01, pairs23, filt0, filt1, acc0, acc0);
+
+        acc1 = bias;
+        VSHF_B2_SB(row1, row1, row1, row1, mask0, mask1, pairs01, pairs23);
+        DPADD_SB2_SH(pairs01, pairs23, filt0, filt1, acc1, acc1);
+
+        acc2 = bias;
+        VSHF_B2_SB(row2, row2, row2, row2, mask0, mask1, pairs01, pairs23);
+        DPADD_SB2_SH(pairs01, pairs23, filt0, filt1, acc2, acc2);
+
+        acc3 = bias;
+        VSHF_B2_SB(row3, row3, row3, row3, mask0, mask1, pairs01, pairs23);
+        DPADD_SB2_SH(pairs01, pairs23, filt0, filt1, acc3, acc3);
+
+        /* columns 8..11, two rows per vector */
+        acc4 = bias;
+        VSHF_B2_SB(row0, row1, row0, row1, mask2, mask3, pairs01, pairs23);
+        DPADD_SB2_SH(pairs01, pairs23, filt0, filt1, acc4, acc4);
+
+        acc5 = bias;
+        VSHF_B2_SB(row2, row3, row2, row3, mask2, mask3, pairs01, pairs23);
+        DPADD_SB2_SH(pairs01, pairs23, filt0, filt1, acc5, acc5);
+
+        HEVC_UNIW_RND_CLIP4(acc0, acc1, acc2, acc3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_UNIW_RND_CLIP2(acc4, acc5, weight_vec, offset_vec, rnd_vec,
+                            dst4_r, dst5_r, dst4_l, dst5_l);
+
+        /* packed bytes land in dst0_r / dst1_r / dst2_r */
+        HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                         dst2_l, dst2_r, dst3_l, dst3_r,
+                         dst4_l, dst4_r, dst5_l, dst5_r,
+                         dst0_r, dst1_r, dst2_r);
+
+        ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
+        dst += 4 * dst_stride;
+    }
+}
+
+/*
+ * Horizontal 4-tap weighted filter for 16-byte-wide blocks.
+ *
+ * Four rows per iteration, (height >> 2) iterations.  Each row is loaded
+ * as two vectors, one at src and one at src + 8, and each vector yields
+ * eight output columns.  Results are stored two rows at a time.
+ */
+static void hevc_hz_uniwgt_4t_16w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    uint32_t quads_left = height >> 2;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    /* lN: row N loaded at src, rN: row N loaded at src + 8 */
+    v16i8 l0, r0, l1, r1, l2, r2, l3, r3;
+    v16i8 pairs01, pairs23;
+    v8i16 coeffs, bias;
+    v8i16 filt0, filt1;
+    v8i16 acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l;
+
+    /* the filter window begins one byte left of the output position */
+    src--;
+
+    coeffs = LD_SH(filter);
+    SPLATI_H2_SH(coeffs, 0, 1, filt0, filt1);
+    mask1 = mask0 + 2;
+
+    /* every accumulator starts from 128 << 6 */
+    bias = __msa_ldi_h(128);
+    bias <<= 6;
+
+    weight &= 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec    = __msa_fill_w(rnd_val);
+
+    while (quads_left--) {
+        LD_SB4(src, src_stride, l0, l1, l2, l3);
+        LD_SB4(src + 8, src_stride, r0, r1, r2, r3);
+        XORI_B8_128_SB(l0, r0, l1, r1, l2, r2, l3, r3);
+        src += 4 * src_stride;
+
+        acc0 = bias;
+        VSHF_B2_SB(l0, l0, l0, l0, mask0, mask1, pairs01, pairs23);
+        DPADD_SB2_SH(pairs01, pairs23, filt0, filt1, acc0, acc0);
+
+        acc1 = bias;
+        VSHF_B2_SB(r0, r0, r0, r0, mask0, mask1, pairs01, pairs23);
+        DPADD_SB2_SH(pairs01, pairs23, filt0, filt1, acc1, acc1);
+
+        acc2 = bias;
+        VSHF_B2_SB(l1, l1, l1, l1, mask0, mask1, pairs01, pairs23);
+        DPADD_SB2_SH(pairs01, pairs23, filt0, filt1, acc2, acc2);
+
+        acc3 = bias;
+        VSHF_B2_SB(r1, r1, r1, r1, mask0, mask1, pairs01, pairs23);
+        DPADD_SB2_SH(pairs01, pairs23, filt0, filt1, acc3, acc3);
+
+        acc4 = bias;
+        VSHF_B2_SB(l2, l2, l2, l2, mask0, mask1, pairs01, pairs23);
+        DPADD_SB2_SH(pairs01, pairs23, filt0, filt1, acc4, acc4);
+
+        acc5 = bias;
+        VSHF_B2_SB(r2, r2, r2, r2, mask0, mask1, pairs01, pairs23);
+        DPADD_SB2_SH(pairs01, pairs23, filt0, filt1, acc5, acc5);
+
+        acc6 = bias;
+        VSHF_B2_SB(l3, l3, l3, l3, mask0, mask1, pairs01, pairs23);
+        DPADD_SB2_SH(pairs01, pairs23, filt0, filt1, acc6, acc6);
+
+        acc7 = bias;
+        VSHF_B2_SB(r3, r3, r3, r3, mask0, mask1, pairs01, pairs23);
+        DPADD_SB2_SH(pairs01, pairs23, filt0, filt1, acc7, acc7);
+
+        /* rows 0 and 1 */
+        HEVC_UNIW_RND_CLIP4(acc0, acc1, acc2, acc3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        dst += 2 * dst_stride;
+
+        /* rows 2 and 3 */
+        HEVC_UNIW_RND_CLIP4(acc4, acc5, acc6, acc7,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        dst += 2 * dst_stride;
+    }
+}
+
+/*
+ * Horizontal 4-tap weighted filter for 24-byte-wide blocks.
+ *
+ * Two rows per iteration, (height >> 1) iterations.  The left 16 columns
+ * are written through dst, the remaining 8 columns through a second
+ * pointer that starts at dst + 16.
+ */
+static void hevc_hz_uniwgt_4t_24w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    uint32_t pairs_left = height >> 1;
+    uint8_t *dst_right = dst + 16;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1, mask2, mask3;
+    /* rowN_a: row N loaded at src, rowN_b: row N loaded at src + 16 */
+    v16i8 row0_a, row0_b, row1_a, row1_b;
+    v16i8 pairs01, pairs23;
+    v8i16 coeffs, bias;
+    v8i16 filt0, filt1;
+    v8i16 acc0, acc1, acc2, acc3;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l;
+
+    /* the filter window begins one byte left of the output position */
+    src--;
+
+    coeffs = LD_SH(filter);
+    SPLATI_H2_SH(coeffs, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 8;
+    mask3 = mask0 + 10;
+
+    /* every accumulator starts from 128 << 6 */
+    bias = __msa_ldi_h(128);
+    bias <<= 6;
+
+    weight &= 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec    = __msa_fill_w(rnd_val);
+
+    while (pairs_left--) {
+        LD_SB2(src, src_stride, row0_a, row1_a);
+        LD_SB2(src + 16, src_stride, row0_b, row1_b);
+        XORI_B4_128_SB(row0_a, row0_b, row1_a, row1_b);
+        src += 2 * src_stride;
+
+        /* left 16 columns of both rows */
+        acc0 = bias;
+        VSHF_B2_SB(row0_a, row0_a, row0_a, row0_a, mask0, mask1,
+                   pairs01, pairs23);
+        DPADD_SB2_SH(pairs01, pairs23, filt0, filt1, acc0, acc0);
+
+        acc1 = bias;
+        VSHF_B2_SB(row0_a, row0_b, row0_a, row0_b, mask2, mask3,
+                   pairs01, pairs23);
+        DPADD_SB2_SH(pairs01, pairs23, filt0, filt1, acc1, acc1);
+
+        acc2 = bias;
+        VSHF_B2_SB(row1_a, row1_a, row1_a, row1_a, mask0, mask1,
+                   pairs01, pairs23);
+        DPADD_SB2_SH(pairs01, pairs23, filt0, filt1, acc2, acc2);
+
+        acc3 = bias;
+        VSHF_B2_SB(row1_a, row1_b, row1_a, row1_b, mask2, mask3,
+                   pairs01, pairs23);
+        DPADD_SB2_SH(pairs01, pairs23, filt0, filt1, acc3, acc3);
+
+        HEVC_UNIW_RND_CLIP4(acc0, acc1, acc2, acc3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        dst += 2 * dst_stride;
+
+        /* remaining 8 columns of both rows */
+        acc0 = bias;
+        VSHF_B2_SB(row0_b, row0_b, row0_b, row0_b, mask0, mask1,
+                   pairs01, pairs23);
+        DPADD_SB2_SH(pairs01, pairs23, filt0, filt1, acc0, acc0);
+
+        acc1 = bias;
+        VSHF_B2_SB(row1_b, row1_b, row1_b, row1_b, mask0, mask1,
+                   pairs01, pairs23);
+        DPADD_SB2_SH(pairs01, pairs23, filt0, filt1, acc1, acc1);
+
+        HEVC_UNIW_RND_CLIP2(acc0, acc1, weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst0_l, dst1_l);
+
+        HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+        ST8x2_UB(dst0_r, dst_right, dst_stride);
+        dst_right += 2 * dst_stride;
+    }
+}
+
+/*
+ * Horizontal 4-tap weighted filter for 32-byte-wide blocks.
+ *
+ * Rows are handled in pairs, (height >> 1) pairs in total.  Each row is
+ * read as three vectors (at src, src + 16 and src + 24) and written as two
+ * 16-byte stores at dst and dst + 16.
+ */
+static void hevc_hz_uniwgt_4t_32w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    uint32_t pairs_left = height >> 1;
+    int32_t row;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1, mask2, mask3;
+    /* segN: 16 source bytes loaded at src + N */
+    v16i8 seg0, seg16, seg24;
+    v16i8 pairs01, pairs23;
+    v8i16 coeffs, bias;
+    v8i16 filt0, filt1;
+    v8i16 acc0, acc1, acc2, acc3;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l;
+
+    /* the filter window begins one byte left of the output position */
+    src--;
+
+    coeffs = LD_SH(filter);
+    SPLATI_H2_SH(coeffs, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 8;
+    mask3 = mask0 + 10;
+
+    /* every accumulator starts from 128 << 6 */
+    bias = __msa_ldi_h(128);
+    bias <<= 6;
+
+    weight &= 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec    = __msa_fill_w(rnd_val);
+
+    while (pairs_left--) {
+        /* the same single-row sequence runs for both rows of the pair */
+        for (row = 0; row < 2; row++) {
+            LD_SB2(src, 16, seg0, seg16);
+            seg24 = LD_SB(src + 24);
+            src += src_stride;
+
+            XORI_B3_128_SB(seg0, seg16, seg24);
+
+            acc0 = bias;
+            VSHF_B2_SB(seg0, seg0, seg0, seg0, mask0, mask1,
+                       pairs01, pairs23);
+            DPADD_SB2_SH(pairs01, pairs23, filt0, filt1, acc0, acc0);
+
+            acc1 = bias;
+            VSHF_B2_SB(seg0, seg16, seg0, seg16, mask2, mask3,
+                       pairs01, pairs23);
+            DPADD_SB2_SH(pairs01, pairs23, filt0, filt1, acc1, acc1);
+
+            acc2 = bias;
+            VSHF_B2_SB(seg16, seg16, seg16, seg16, mask0, mask1,
+                       pairs01, pairs23);
+            DPADD_SB2_SH(pairs01, pairs23, filt0, filt1, acc2, acc2);
+
+            acc3 = bias;
+            VSHF_B2_SB(seg24, seg24, seg24, seg24, mask0, mask1,
+                       pairs01, pairs23);
+            DPADD_SB2_SH(pairs01, pairs23, filt0, filt1, acc3, acc3);
+
+            HEVC_UNIW_RND_CLIP4(acc0, acc1, acc2, acc3,
+                                weight_vec, offset_vec, rnd_vec,
+                                dst0_r, dst1_r, dst2_r, dst3_r,
+                                dst0_l, dst1_l, dst2_l, dst3_l);
+
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                            dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+            /* a stride of 16 places the two vectors side by side in one row */
+            ST_SW2(dst0_r, dst1_r, dst, 16);
+            dst += dst_stride;
+        }
+    }
+}
+
+/*
+ * Vertical 4-tap weighted filter for one 4x2 block.
+ *
+ * Five source rows are read, starting one row above src.  Neighbouring
+ * rows are byte-interleaved and two interleaved pairs share one vector, so
+ * a single accumulator holds both output rows.  The sums are widened to
+ * 32 bits, multiplied by the weight, rounding-shifted by rnd_val, offset,
+ * clipped to 0..255 and stored as 4 bytes per row.
+ *
+ * height is not read.
+ */
+static void hevc_vt_uniwgt_4t_4x2_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 il10, il21, il32, il43;
+    v16i8 grp2110, grp4332;
+    v8i16 coeffs, bias;
+    v8i16 filt0, filt1;
+    v8i16 acc;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 w_r, w_l;
+
+    /* the filter window begins one row above the output position */
+    src -= src_stride;
+
+    coeffs = LD_SH(filter);
+    SPLATI_H2_SH(coeffs, 0, 1, filt0, filt1);
+
+    /* the accumulator starts from 128 << 6 */
+    bias = __msa_ldi_h(128);
+    bias <<= 6;
+
+    weight &= 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec    = __msa_fill_w(rnd_val);
+
+    /* rows 0..2, then rows 3..4 */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    LD_SB2(src + 3 * src_stride, src_stride, src3, src4);
+
+    ILVR_B2_SB(src1, src0, src2, src1, il10, il21);
+    grp2110 = (v16i8) __msa_ilvr_d((v2i64) il21, (v2i64) il10);
+    grp2110 = (v16i8) __msa_xori_b((v16u8) grp2110, 128);
+
+    ILVR_B2_SB(src3, src2, src4, src3, il32, il43);
+    grp4332 = (v16i8) __msa_ilvr_d((v2i64) il43, (v2i64) il32);
+    grp4332 = (v16i8) __msa_xori_b((v16u8) grp4332, 128);
+
+    acc = bias;
+    DPADD_SB2_SH(grp2110, grp4332, filt0, filt1, acc, acc);
+
+    /* weight, round, offset, clip */
+    ILVRL_H2_SW(acc, acc, w_r, w_l);
+    DOTP_SH2_SW(w_r, w_l, weight_vec, weight_vec, w_r, w_l);
+    SRAR_W2_SW(w_r, w_l, rnd_vec);
+    ADD2(w_r, offset_vec, w_l, offset_vec, w_r, w_l);
+    w_r = CLIP_SW_0_255(w_r);
+    w_l = CLIP_SW_0_255(w_l);
+
+    HEVC_PCK_SW_SB2(w_l, w_r, w_r);
+    ST4x2_UB(w_r, dst, dst_stride);
+}
+
+/*
+ * Vertical 4-tap weighted filter for one 4x4 block.
+ *
+ * Seven source rows are read, starting one row above src.  Neighbouring
+ * rows are byte-interleaved and two interleaved pairs share one vector, so
+ * each accumulator (dst10, dst32) holds two output rows.  The sums go
+ * through HEVC_UNIW_RND_CLIP2 with the broadcast weight, offset and
+ * rnd_val, are packed to bytes and stored as 4 bytes per row.
+ *
+ * height is not read; the row count is fixed at four.
+ */
+static void hevc_vt_uniwgt_4t_4x4_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
+    v16i8 src2110, src4332, src6554;
+    v8i16 dst10, dst32;
+    v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    /* the filter window begins one row above the output position */
+    src -= src_stride;
+
+    /* every accumulator starts from 128 << 6 */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    /* only the low 16 bits of weight are kept before broadcasting */
+    weight = weight & 0x0000FFFF;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+
+    LD_SB4(src, src_stride, src3, src4, src5, src6);
+    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+               src32_r, src43_r, src54_r, src65_r);
+    ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
+    XORI_B2_128_SB(src4332, src6554);
+
+    dst10 = const_vec;
+    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
+    dst32 = const_vec;
+    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
+    HEVC_UNIW_RND_CLIP2(dst10, dst32, weight_vec, offset_vec, rnd_vec,
+                        dst0_r, dst1_r, dst0_l, dst1_l);
+
+    /* the packed bytes reuse dst0_r as output register */
+    HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+    ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
+}
+
+/*
+ * Vertical 4-tap weighted filter for 4-byte-wide blocks.
+ *
+ * Eight output rows per iteration, (height >> 3) iterations.  Three rows
+ * are loaded before the loop; every iteration loads eight more and leaves
+ * the last interleaved group in src2110 (and the last row in src2) for the
+ * next iteration.
+ */
+static void hevc_vt_uniwgt_4t_4x8multiple_msa(uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height,
+                                              int32_t weight,
+                                              int32_t offset,
+                                              int32_t rnd_val)
+{
+    int32_t octets_left = height >> 3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+    v16i8 src10_r, src21_r, src32_r, src43_r, src54_r;
+    v16i8 src65_r, src76_r, src87_r, src98_r, src109_r;
+    v16i8 src2110, src4332, src6554, src8776;
+    v8i16 coeffs, bias;
+    v8i16 filt0, filt1;
+    v8i16 acc10, acc32, acc54, acc76;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l;
+
+    /* the filter window begins one row above the output position */
+    src -= src_stride;
+
+    coeffs = LD_SH(filter);
+    SPLATI_H2_SH(coeffs, 0, 1, filt0, filt1);
+
+    /* every accumulator starts from 128 << 6 */
+    bias = __msa_ldi_h(128);
+    bias <<= 6;
+
+    weight &= 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec    = __msa_fill_w(rnd_val);
+
+    /* prime the window with the first three rows */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += 3 * src_stride;
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+
+    while (octets_left--) {
+        /* next six rows */
+        LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);
+        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+                   src32_r, src43_r, src54_r, src65_r);
+        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+        ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
+                   src4332, src6554, src8776);
+        XORI_B3_128_SB(src4332, src6554, src8776);
+
+        acc10 = bias;
+        DPADD_SB2_SH(src2110, src4332, filt0, filt1, acc10, acc10);
+        acc32 = bias;
+        DPADD_SB2_SH(src4332, src6554, filt0, filt1, acc32, acc32);
+        acc54 = bias;
+        DPADD_SB2_SH(src6554, src8776, filt0, filt1, acc54, acc54);
+
+        /* two further rows; src2 and src2110 carry over to the next pass */
+        LD_SB2(src + 6 * src_stride, src_stride, src9, src2);
+        src += 8 * src_stride;
+        ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
+        src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
+        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+
+        acc76 = bias;
+        DPADD_SB2_SH(src8776, src2110, filt0, filt1, acc76, acc76);
+
+        HEVC_UNIW_RND_CLIP4(acc10, acc32, acc54, acc76,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        /* packed bytes land in dst0_r / dst1_r */
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += 8 * dst_stride;
+    }
+}
+
+/*
+ * Dispatch the 4-wide vertical 4-tap weighted filter by block height:
+ * heights 2 and 4 have dedicated routines, multiples of 8 use the
+ * eight-rows-per-iteration routine.  Any other height does nothing.
+ */
+static void hevc_vt_uniwgt_4t_4w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    if (height == 2) {
+        hevc_vt_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
+                                  filter, height, weight, offset, rnd_val);
+        return;
+    }
+
+    if (height == 4) {
+        hevc_vt_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
+                                  filter, height, weight, offset, rnd_val);
+        return;
+    }
+
+    if ((height % 8) == 0) {
+        hevc_vt_uniwgt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
+                                          filter, height, weight, offset,
+                                          rnd_val);
+    }
+}
+
+/*
+ * Vertical 4-tap weighted filter for 6-byte-wide blocks.
+ *
+ * Four output rows per iteration, (height >> 2) iterations.  Three rows
+ * are loaded before the loop; each iteration loads four more and leaves
+ * src2, src10_r and src21_r set up for the next iteration.
+ */
+static void hevc_vt_uniwgt_4t_6w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    int32_t quads_left = height >> 2;
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src21_r, src32_r, src43_r;
+    v8i16 coeffs, bias;
+    v8i16 filt0, filt1;
+    v8i16 acc0, acc1, acc2, acc3;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l;
+
+    /* the filter window begins one row above the output position */
+    src -= src_stride;
+
+    coeffs = LD_SH(filter);
+    SPLATI_H2_SH(coeffs, 0, 1, filt0, filt1);
+
+    /* every accumulator starts from 128 << 6 */
+    bias = __msa_ldi_h(128);
+    bias <<= 6;
+
+    weight &= 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec    = __msa_fill_w(rnd_val);
+
+    /* prime the window with the first three rows */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    XORI_B3_128_SB(src0, src1, src2);
+    src += 3 * src_stride;
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    while (quads_left--) {
+        /* output rows 0 and 1 of this group */
+        LD_SB2(src, src_stride, src3, src4);
+        XORI_B2_128_SB(src3, src4);
+        src += 2 * src_stride;
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+
+        acc0 = bias;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, acc0, acc0);
+        acc1 = bias;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, acc1, acc1);
+
+        /* output rows 2 and 3; src1/src2 are reloaded with the newest rows */
+        LD_SB2(src, src_stride, src1, src2);
+        XORI_B2_128_SB(src1, src2);
+        src += 2 * src_stride;
+        ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
+
+        acc2 = bias;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, acc2, acc2);
+        acc3 = bias;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, acc3, acc3);
+
+        HEVC_UNIW_RND_CLIP4(acc0, acc1, acc2, acc3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        /* packed bytes land in dst0_r / dst1_r */
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+
+        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += 4 * dst_stride;
+    }
+}
+
+/*
+ * Vertical 4-tap weighted filter for one 8x2 block.
+ *
+ * Five source rows are read, starting one row above src; neighbouring rows
+ * are byte-interleaved and accumulated against the two coefficient pairs.
+ * height is not read.
+ */
+static void hevc_vt_uniwgt_4t_8x2_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src21_r, src32_r, src43_r;
+    v8i16 coeffs, bias;
+    v8i16 filt0, filt1;
+    v8i16 acc0, acc1;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r;
+    v4i32 dst0_l, dst1_l;
+
+    /* the filter window begins one row above the output position */
+    src -= src_stride;
+
+    coeffs = LD_SH(filter);
+    SPLATI_H2_SH(coeffs, 0, 1, filt0, filt1);
+
+    /* every accumulator starts from 128 << 6 */
+    bias = __msa_ldi_h(128);
+    bias <<= 6;
+
+    weight &= 0x0000FFFF;
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec    = __msa_fill_w(rnd_val);
+
+    /* rows 0..2, then rows 3..4 */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    LD_SB2(src + 3 * src_stride, src_stride, src3, src4);
+    XORI_B3_128_SB(src0, src1, src2);
+    XORI_B2_128_SB(src3, src4);
+
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+
+    acc0 = bias;
+    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, acc0, acc0);
+    acc1 = bias;
+    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, acc1, acc1);
+
+    HEVC_UNIW_RND_CLIP2(acc0, acc1, weight_vec, offset_vec, rnd_vec,
+                        dst0_r, dst1_r, dst0_l, dst1_l);
+
+    /* packed bytes land in dst0_r */
+    HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+    ST8x2_UB(dst0_r, dst, dst_stride);
+}
+
+static void hevc_vt_uniwgt_4t_8x6_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{   /* Vertical 4-tap filter with uni-directional weighting, 8x6 block. */
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+    /* 'height' is never read here; nine source rows are loaded, no loop. */
+    src -= src_stride;              /* start one row above the block */
+
+    const_vec = __msa_ldi_h(128);   /* 128 << 6 seeds every accumulator */
+    const_vec <<= 6;
+    weight = weight & 0x0000FFFF;   /* only the low 16 bits are kept */
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* Rows 0..2, then rows 3..8; srcBA_r names the pairing of rows B and A. */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);
+    XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
+    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+               src32_r, src43_r, src54_r, src65_r);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+    /* tmpN (output row N) combines row pairs (N+1,N) and (N+3,N+2). */
+    tmp0 = const_vec;
+    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+    tmp1 = const_vec;
+    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+    tmp2 = const_vec;
+    DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, tmp2, tmp2);
+    tmp3 = const_vec;
+    DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, tmp3, tmp3);
+    tmp4 = const_vec;
+    DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, tmp4, tmp4);
+    tmp5 = const_vec;
+    DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, tmp5, tmp5);
+    HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                        weight_vec, offset_vec, rnd_vec,
+                        dst0_r, dst1_r, dst2_r, dst3_r,
+                        dst0_l, dst1_l, dst2_l, dst3_l);
+    HEVC_UNIW_RND_CLIP2(tmp4, tmp5, weight_vec, offset_vec, rnd_vec,
+                        dst4_r, dst5_r, dst4_l, dst5_l);
+    /* Pack six rows into three vectors; store them as 4 rows + 2 rows. */
+    HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                     dst2_l, dst2_r, dst3_l, dst3_r,
+                     dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
+    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x2_UB(dst2_r, dst, dst_stride);
+}
+
+static void hevc_vt_uniwgt_4t_8x4multiple_msa(uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height,
+                                              int32_t weight,
+                                              int32_t offset,
+                                              int32_t rnd_val)
+{   /* Vertical 4-tap filter with uni-weighting, 8 wide, 4 rows per pass. */
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+
+    src -= src_stride;              /* start one row above the block */
+
+    const_vec = __msa_ldi_h(128);   /* 128 << 6 seeds every accumulator */
+    const_vec <<= 6;
+    weight = weight & 0x0000FFFF;   /* only the low 16 bits are kept */
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* Prime the filter window with the three rows preceding the loop. */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    /* height >> 2 passes of four rows; a height % 4 remainder is not done. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src, src_stride, src3, src4);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        /* output rows 0 and 1 of this pass */
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+        /* src1/src2 and src10_r/src21_r are reused for the next two rows. */
+        LD_SB2(src, src_stride, src1, src2);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src1, src2);
+        ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
+        /* output rows 2 and 3; src10_r/src21_r also feed the next pass */
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
+        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_vt_uniwgt_4t_8w_msa(uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height,
+                                     int32_t weight, int32_t offset,
+                                     int32_t rnd_val)
+{
+    /* 8-wide dispatcher: heights 2 and 6 have dedicated kernels. */
+    switch (height) {
+    case 2:
+        hevc_vt_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
+                                  filter, height, weight, offset, rnd_val);
+        break;
+    case 6:
+        hevc_vt_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
+                                  filter, height, weight, offset, rnd_val);
+        break;
+    default: /* everything else goes to the four-rows-per-pass kernel */
+        hevc_vt_uniwgt_4t_8x4multiple_msa(src, src_stride, dst, dst_stride,
+                                          filter, height, weight, offset,
+                                          rnd_val);
+    }
+}
+
+static void hevc_vt_uniwgt_4t_12w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{   /* Vertical 4-tap filter with uni-weighting, 12 wide, 4 rows per pass. */
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
+    v16i8 src2110, src4332;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+
+    src -= (1 * src_stride);        /* start one row above the block */
+
+    const_vec = __msa_ldi_h(128);   /* 128 << 6 seeds every accumulator */
+    const_vec <<= 6;
+    weight = weight & 0x0000FFFF;   /* only the low 16 bits are kept */
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* *_r: right-half interleaves; the *_l halves of two pairs share a vector. */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
+    /* height >> 2 passes of four rows; a height % 4 remainder is not done. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src, src_stride, src3, src4);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
+        /* tmp0/tmp1: rows 0,1 from *_r; tmp4: the same rows from the *_l data */
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+        tmp4 = const_vec;
+        DPADD_SB2_SH(src2110, src4332, filt0, filt1, tmp4, tmp4);
+        /* Next two rows: src10_r/src21_r/src2110 are overwritten and reused. */
+        LD_SB2(src, src_stride, src5, src2);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
+        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
+        /* tmp2/tmp3: rows 2,3 from *_r; tmp5: the same rows from the *_l data */
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
+        tmp5 = const_vec;
+        DPADD_SB2_SH(src4332, src2110, filt0, filt1, tmp5, tmp5);
+        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+        HEVC_UNIW_RND_CLIP2(tmp4, tmp5, weight_vec, offset_vec, rnd_vec,
+                            dst4_r, dst5_r, dst4_l, dst5_l);
+
+        HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                         dst2_l, dst2_r, dst3_l, dst3_r,
+                         dst4_l, dst4_r, dst5_l, dst5_r,
+                         dst0_r, dst1_r, dst2_r);
+        ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_vt_uniwgt_4t_16w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{   /* Vertical 4-tap filter with uni-weighting, 16 wide, 4 rows per pass. */
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v16i8 src10_l, src32_l, src21_l, src43_l;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+
+    src -= src_stride;              /* start one row above the block */
+
+    const_vec = __msa_ldi_h(128);   /* 128 << 6 seeds every accumulator */
+    const_vec <<= 6;
+    weight = weight & 0x0000FFFF;   /* only the low 16 bits are kept */
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* Prime the window with three rows; both *_r and *_l halves are kept. */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+    /* height >> 2 passes; each pass stores two rows, then two more rows. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src, src_stride, src3, src4);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+        /* tmp0/tmp1: rows 0,1 from *_r; tmp2/tmp3: the same rows from *_l */
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp3, tmp3);
+        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+        /* dst0 is packed with dst2 and dst1 with dst3: same row, both halves */
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+        /* Second half: src10/src21 names are reused for the two newest rows. */
+        LD_SB2(src, src_stride, src5, src2);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
+
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp0, tmp0);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp1, tmp1);
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, tmp3, tmp3);
+        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_vt_uniwgt_4t_24w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{   /* Vertical 4-tap filter with uni-weighting, 24 wide, 4 rows per pass. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src6, src7, src8, src9, src10, src11;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    v16i8 src10_l, src32_l, src21_l, src43_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
+
+    src -= src_stride;              /* start one row above the block */
+
+    const_vec = __msa_ldi_h(128);   /* 128 << 6 seeds every accumulator */
+    const_vec <<= 6;
+    weight = weight & 0x0000FFFF;   /* only the low 16 bits are kept */
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* src0..src5 come from 'src' (both halves used), src6..src11 from src+16. */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+    /* Only the *_r interleave is computed for the vectors read at src + 16. */
+    LD_SB3(src + 16, src_stride, src6, src7, src8);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src6, src7, src8);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+    /* height >> 2 passes; each pass stores two rows, then two more rows. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB2(src, src_stride, src3, src4);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+        LD_SB2(src + 16, src_stride, src9, src10);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src9, src10);
+        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+        /* tmp0/tmp1: *_r, tmp4/tmp5: *_l of 'src'; tmp2/tmp3: the src+16 part */
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+        tmp4 = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp4, tmp4);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+        tmp5 = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp5, tmp5);
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, tmp3, tmp3);
+
+        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+        HEVC_UNIW_RND_CLIP2(tmp2, tmp3, weight_vec, offset_vec, rnd_vec,
+                            dst4_r, dst5_r, dst4_l, dst5_l);
+        /* Two full vectors go to dst, the 8-wide remainder to dst + 16. */
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        ST8x2_UB(dst4_r, dst + 16, dst_stride);
+        dst += (2 * dst_stride);
+        /* Second half: src10/src21/src76/src87 names are reused for new rows. */
+        LD_SB2(src, src_stride, src5, src2);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
+        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
+        LD_SB2(src + 16, src_stride, src11, src8);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src11, src8);
+        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
+
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp0, tmp0);
+        tmp4 = const_vec;
+        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, tmp4, tmp4);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp1, tmp1);
+        tmp5 = const_vec;
+        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, tmp5, tmp5);
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, tmp2, tmp2);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, tmp3, tmp3);
+
+        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+        HEVC_UNIW_RND_CLIP2(tmp2, tmp3, weight_vec, offset_vec, rnd_vec,
+                            dst4_r, dst5_r, dst4_l, dst5_l);
+
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        ST8x2_UB(dst4_r, dst + 16, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_vt_uniwgt_4t_32w_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{   /* Vertical 4-tap filter with uni-weighting, 32 wide, 2 rows per pass. */
+    int32_t loop_cnt;
+    uint8_t *dst_tmp = dst + 16;    /* store pointer for the src + 16 half */
+    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v16i8 src10_l, src32_l, src76_l, src98_l;
+    v16i8 src21_l, src43_l, src87_l, src109_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
+    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l, dst6_l, dst7_l;
+
+    src -= src_stride;              /* start one row above the block */
+
+    const_vec = __msa_ldi_h(128);   /* 128 << 6 seeds every accumulator */
+    const_vec <<= 6;
+    weight = weight & 0x0000FFFF;   /* only the low 16 bits are kept */
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* Prime the window for the vectors read at 'src' (src0..src2). */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    XORI_B3_128_SB(src0, src1, src2);
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+    /* Prime the window for the vectors read at 'src + 16' (src6..src8). */
+    LD_SB3(src + 16, src_stride, src6, src7, src8);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src6, src7, src8);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
+    /* height >> 1 passes of two rows; an odd final row is not processed. */
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, src_stride, src3, src4);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+
+        tmp0 = const_vec;
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
+        tmp4 = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp4, tmp4);
+        tmp1 = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
+        tmp5 = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp5, tmp5);
+
+        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst0_r, dst1_r, dst2_r, dst3_r,
+                            dst0_l, dst1_l, dst2_l, dst3_l);
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
+                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+        dst += (2 * dst_stride);
+        /* Slide the window: the newest pairs become the oldest next pass. */
+        src10_r = src32_r;
+        src21_r = src43_r;
+        src10_l = src32_l;
+        src21_l = src43_l;
+        src2 = src4;
+        /* Same steps for the vectors at src + 16; results go to dst_tmp. */
+        LD_SB2(src + 16, src_stride, src9, src10);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src9, src10);
+        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
+
+        tmp2 = const_vec;
+        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, tmp2, tmp2);
+        tmp6 = const_vec;
+        DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, tmp6, tmp6);
+        tmp3 = const_vec;
+        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, tmp3, tmp3);
+        tmp7 = const_vec;
+        DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, tmp7, tmp7);
+
+        HEVC_UNIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7,
+                            weight_vec, offset_vec, rnd_vec,
+                            dst4_r, dst5_r, dst6_r, dst7_r,
+                            dst4_l, dst5_l, dst6_l, dst7_l);
+
+        HEVC_PCK_SW_SB8(dst4_l, dst4_r, dst6_l, dst6_r,
+                        dst5_l, dst5_r, dst7_l, dst7_r, dst4_r, dst5_r);
+        ST_SW2(dst4_r, dst5_r, dst_tmp, dst_stride);
+        dst_tmp += (2 * dst_stride);
+        /* Slide the src + 16 window in the same way. */
+        src76_r = src98_r;
+        src87_r = src109_r;
+        src76_l = src98_l;
+        src87_l = src109_l;
+        src8 = src10;
+    }
+}
+
+static void hevc_hv_uniwgt_4t_4x2_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{   /* 2-D 4-tap filter (filter_x, then filter_y) with uni-weighting, 4x2. */
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v4i32 dst0_r, dst1_r;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    /* 'height' is never read here; exactly five source rows are loaded. */
+    src -= (src_stride + 1);        /* one row up and one column left */
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* Widen the int8 filter_y taps to int16: the clti mask gives sign bytes. */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+    /* mask0 pairs bytes (0,1),(1,2),...; mask1 is the same two bytes on. */
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);   /* 128 << 6 seeds every accumulator */
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    /* filter_x pass over source rows 0..2, giving dst0..dst2 */
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+    LD_SB2(src, src_stride, src3, src4);
+    XORI_B2_128_SB(src3, src4);
+
+    /* row 3 */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+    /* filter_y pass for output row 0, then shift right by 6 */
+    dst32_r = __msa_ilvr_h(dst3, dst2);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_r >>= 6;
+    /* row 4: filter_x pass, then filter_y pass for output row 1 */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+    dst43_r = __msa_ilvr_h(dst4, dst3);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_r >>= 6;
+    /* Per macro names: multiply by weight, rounding shift, add offset, clip. */
+    MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
+    SRAR_W2_SW(dst0_r, dst1_r, rnd_vec);
+    ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
+    dst0_r = CLIP_SW_0_255(dst0_r);
+    dst1_r = CLIP_SW_0_255(dst1_r);
+    /* Pack both rows into one vector and store 4x2 (per macro names). */
+    HEVC_PCK_SW_SB2(dst1_r, dst0_r, dst0_r);
+    ST4x2_UB(dst0_r, dst, dst_stride);
+}
+
+static void hevc_hv_uniwgt_4t_4x4_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{   /* 2-D 4-tap filter (filter_x, then filter_y) with uni-weighting, 4x4. */
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    /* 'height' is never read here; exactly seven source rows are loaded. */
+    src -= (src_stride + 1);        /* one row up and one column left */
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* Widen the int8 filter_y taps to int16: the clti mask gives sign bytes. */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+    /* mask0 pairs bytes (0,1),(1,2),...; mask1 is the same two bytes on. */
+    mask1 = mask0 + 2;
+
+    const_vec = __msa_ldi_h(128);   /* 128 << 6 seeds every accumulator */
+    const_vec <<= 6;
+
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    /* filter_x pass over source rows 0..2, giving dst0..dst2 */
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+
+    LD_SB4(src, src_stride, src3, src4, src5, src6);
+    XORI_B4_128_SB(src3, src4, src5, src6);
+
+    /* row 3 */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+    dst32_r = __msa_ilvr_h(dst3, dst2);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_r >>= 6;
+
+    /* row 4 */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+    dst43_r = __msa_ilvr_h(dst4, dst3);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_r >>= 6;
+
+    /* row 5 */
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+    dst5 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+    dst10_r = __msa_ilvr_h(dst5, dst4);     /* name reused: rows 5 and 4 */
+    dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+    dst2_r >>= 6;
+
+    /* row 6 */
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+    dst2 = const_vec;                       /* dst2 reused for row 6 */
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+    dst21_r = __msa_ilvr_h(dst2, dst5);     /* name reused: rows 6 and 5 */
+    dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+    dst3_r >>= 6;
+    /* Weight/round/offset/clip all four rows (macro defined elsewhere). */
+    HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst2_r, dst3_r,
+                           weight_vec, offset_vec, rnd_vec,
+                           dst0_r, dst1_r, dst2_r, dst3_r);
+    HEVC_PCK_SW_SB4(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r);
+    ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void hevc_hv_uniwgt_4t_4multx8mult_msa(uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter_x,
+                                              const int8_t *filter_y,
+                                              int32_t height,
+                                              int32_t weight,
+                                              int32_t offset,
+                                              int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    /* H+V 4-tap filter with uni-weighting: 4 columns, 8 rows per pass. */
+    src -= (src_stride + 1);
+    /* src now sits one row above and one byte left of its entry value. */
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* Vertical taps: bytes widened to halfwords via their sign mask. */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+    /* mask1 is mask0 with every index advanced by two bytes. */
+    mask1 = mask0 + 2;
+    /* const_vec: 128 << 6 in every halfword lane. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    /* Broadcast weight, offset and rounding value to all word lanes. */
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+    /* Prime the vertical filter with the first three source rows. */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    /* Horizontal pass on those rows, accumulated on top of const_vec. */
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+    /* height >> 3 passes: a remainder of height % 8 rows is not written. */
+    for (loop_cnt = height >> 3; loop_cnt--;) {
+        LD_SB8(src, src_stride,
+               src3, src4, src5, src6, src7, src8, src9, src10);
+        src += (8 * src_stride);
+        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+        /* Per row: horizontal pass, pair with previous row, 4-tap, >> 6. */
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        dst32_r = __msa_ilvr_h(dst3, dst2);
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_r >>= 6;
+
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+        dst43_r = __msa_ilvr_h(dst4, dst3);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_r >>= 6;
+
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+        dst54_r = __msa_ilvr_h(dst5, dst4);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst2_r >>= 6;
+
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst6 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+        dst65_r = __msa_ilvr_h(dst6, dst5);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst3_r >>= 6;
+
+        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+        dst7 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+        dst76_r = __msa_ilvr_h(dst7, dst6);
+        dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+        dst4_r >>= 6;
+
+        VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
+        dst8 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
+        dst87_r = __msa_ilvr_h(dst8, dst7);
+        dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+        dst5_r >>= 6;
+        /* dst10_r is recycled for the dst8/dst9 pair (seeds next pass). */
+        VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
+        dst9 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9);
+        dst10_r = __msa_ilvr_h(dst9, dst8);
+        dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1);
+        dst6_r >>= 6;
+        /* dst2 and dst21_r are likewise refreshed for the next pass. */
+        VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        dst21_r = __msa_ilvr_h(dst2, dst9);
+        dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1);
+        dst7_r >>= 6;
+        /* First four output rows: HEVC_HV_UNIW_RND_CLIP4, pack, store. */
+        HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst2_r, dst3_r,
+                               weight_vec, offset_vec, rnd_vec,
+                               dst0_r, dst1_r, dst2_r, dst3_r);
+        HEVC_PCK_SW_SB4(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r);
+        ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* Last four output rows, same sequence. */
+        HEVC_HV_UNIW_RND_CLIP4(dst4_r, dst5_r, dst6_r, dst7_r,
+                               weight_vec, offset_vec, rnd_vec,
+                               dst4_r, dst5_r, dst6_r, dst7_r);
+        HEVC_PCK_SW_SB4(dst5_r, dst4_r, dst7_r, dst6_r, dst0_r);
+        ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+/* 4-column front end of the uni-weighted 4-tap H+V filter: picks a kernel by
+ * height (2, 4 or any multiple of 8); every other height is left untouched. */
+static void hevc_hv_uniwgt_4t_4w_msa(uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height, int32_t weight,
+                                     int32_t offset, int32_t rnd_val)
+{
+    switch (height) {
+    case 2:
+        hevc_hv_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter_x,
+                                  filter_y, height, weight, offset, rnd_val);
+        break;
+    case 4:
+        hevc_hv_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride, filter_x,
+                                  filter_y, height, weight, offset, rnd_val);
+        break;
+    default:
+        if (!(height % 8))
+            hevc_hv_uniwgt_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
+                                              filter_x, filter_y, height,
+                                              weight, offset, rnd_val);
+    }
+}
+
+static void hevc_hv_uniwgt_4t_6w_msa(uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height,
+                                     int32_t weight,
+                                     int32_t offset,
+                                     int32_t rnd_val)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst2_r, dst2_l, dst3_r, dst3_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    /* H+V 4-tap filter with uni-weighting: 6 columns, 4 rows per pass. */
+    src -= (src_stride + 1);
+    /* src now sits one row above and one byte left of its entry value. */
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* Vertical taps: bytes widened to halfwords via their sign mask. */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+    /* mask1 is mask0 with every index advanced by two bytes. */
+    mask1 = mask0 + 2;
+    /* const_vec: 128 << 6 in every halfword lane. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    /* Broadcast weight, offset and rounding value to all word lanes. */
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+    /* Prime the vertical filter with the first three source rows. */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    /* Horizontal pass on those rows, accumulated on top of const_vec. */
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    /* ILVRL_H2_SH yields an _r and an _l vector per adjacent row pair. */
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+    /* height >> 2 passes: a remainder of height % 4 rows is not written. */
+    for (loop_cnt = height >> 2; loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src3, src4, src5, src6);
+
+        /* row 3 */
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+        dst0_r >>= 6;
+        dst0_l >>= 6;
+
+        /* row 4 */
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+        dst1_r >>= 6;
+        dst1_l >>= 6;
+        /* Below, dst10_* / dst2 / dst21_* are recycled to seed the next pass. */
+        /* row 5 */
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+        ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+        dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
+        dst2_r >>= 6;
+        dst2_l >>= 6;
+
+        /* row 6 */
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+        dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
+        dst3_r >>= 6;
+        dst3_l >>= 6;
+        /* HEVC_HV_UNIW_RND_CLIP4 on all four rows, then pack and store 6x4. */
+        HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l,
+                               weight_vec, offset_vec, rnd_vec,
+                               dst0_r, dst1_r, dst0_l, dst1_l);
+        HEVC_HV_UNIW_RND_CLIP4(dst2_r, dst3_r, dst2_l, dst3_l,
+                               weight_vec, offset_vec, rnd_vec,
+                               dst2_r, dst3_r, dst2_l, dst3_l);
+        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+/* Uni-weighted 4-tap H+V filter for a single 8x2 block. 'height' is unused:
+ * exactly two output rows are produced. */
+static void hevc_hv_uniwgt_4t_8x2_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src -= (src_stride + 1);
+    /* src now sits one row above and one byte left of its entry value. */
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* Vertical taps: bytes widened to halfwords via their sign mask. */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+    /* mask1 is mask0 with every index advanced by two bytes. */
+    mask1 = mask0 + 2;
+    /* const_vec: 128 << 6 in every halfword lane. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    /* Broadcast weight, offset and rounding value to all word lanes. */
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+    /* Prime the vertical filter with the first three source rows. */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+    /* Horizontal pass on those rows, accumulated on top of const_vec. */
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+    /* Last two source rows; src is not read again, so it is not advanced. */
+    LD_SB2(src, src_stride, src3, src4);
+    XORI_B2_128_SB(src3, src4);
+    /* First output row. */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+    dst0_r >>= 6;
+    dst0_l >>= 6;
+    /* Second output row. */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    dst1_r >>= 6;
+    dst1_l >>= 6;
+    /* HEVC_HV_UNIW_RND_CLIP4 on both rows, then pack and store 8x2. */
+    HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l,
+                           weight_vec, offset_vec, rnd_vec,
+                           dst0_r, dst1_r, dst0_l, dst1_l);
+    HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
+    ST8x2_UB(dst0_r, dst, dst_stride);
+}
+
+/* Uni-weighted 4-tap H+V filter for a single 8x6 block ('height' is unused). */
+static void hevc_hv_uniwgt_4t_8x6_msa(uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height,
+                                      int32_t weight,
+                                      int32_t offset,
+                                      int32_t rnd_val)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
+    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
+    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
+    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
+    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+
+    src -= (src_stride + 1);
+    /* src now sits one row above and one byte left of its entry value. */
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* Vertical taps: bytes widened to halfwords via their sign mask. */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+    /* mask1 is mask0 with every index advanced by two bytes. */
+    mask1 = mask0 + 2;
+    /* const_vec: 128 << 6 in every halfword lane. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    /* Broadcast weight, offset and rounding value to all word lanes. */
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+    /* Prime the vertical filter with the first three source rows. */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    /* Horizontal pass on those rows, accumulated on top of const_vec. */
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+    LD_SB2(src, src_stride, src3, src4);
+    src += (2 * src_stride);
+    XORI_B2_128_SB(src3, src4);
+
+    /* row 3 */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+    dst0_r >>= 6;
+    dst0_l >>= 6;
+
+    /* row 4 */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    dst1_r >>= 6;
+    dst1_l >>= 6;
+
+    LD_SB2(src, src_stride, src5, src6);
+    src += (2 * src_stride);
+    XORI_B2_128_SB(src5, src6);
+
+    /* row 5 */
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+    dst5 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
+    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+    dst2_r >>= 6;
+    dst2_l >>= 6;
+
+    /* row 6 */
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+    dst6 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
+    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+    dst3_r >>= 6;
+    dst3_l >>= 6;
+    /* Last two source rows; src is not read again, so it is not advanced. */
+    LD_SB2(src, src_stride, src7, src8);
+    XORI_B2_128_SB(src7, src8);
+
+    /* row 7 */
+    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+    dst7 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
+
+    dst4_r >>= 6;
+    dst4_l >>= 6;
+
+    /* row 8 */
+    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
+    dst8 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
+    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
+    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
+    dst5_r >>= 6;
+    dst5_l >>= 6;
+    /* HEVC_HV_UNIW_RND_CLIP4 on all six rows, then pack; store 8x4 + 8x2. */
+    HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l,
+                           weight_vec, offset_vec, rnd_vec,
+                           dst0_r, dst1_r, dst0_l, dst1_l);
+    HEVC_HV_UNIW_RND_CLIP4(dst2_r, dst3_r, dst2_l, dst3_l,
+                           weight_vec, offset_vec, rnd_vec,
+                           dst2_r, dst3_r, dst2_l, dst3_l);
+    HEVC_HV_UNIW_RND_CLIP4(dst4_r, dst5_r, dst4_l, dst5_l,
+                           weight_vec, offset_vec, rnd_vec,
+                           dst4_r, dst5_r, dst4_l, dst5_l);
+    HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
+                     dst2_l, dst2_r, dst3_l, dst3_r,
+                     dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
+    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST8x2_UB(dst2_r, dst, dst_stride);
+}
+
+static void hevc_hv_uniwgt_4t_8multx4mult_msa(uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter_x,
+                                              const int8_t *filter_y,
+                                              int32_t height,
+                                              int32_t weight,
+                                              int32_t offset,
+                                              int32_t rnd_val,
+                                              int32_t width)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v4i32 weight_vec, offset_vec, rnd_vec;
+    v4i32 dst2_r, dst2_l, dst3_r, dst3_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    /* H+V 4-tap filter with uni-weighting, done in 8-column strips. */
+    src -= (src_stride + 1);
+    /* src now sits one row above and one byte left of its entry value. */
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* Vertical taps: bytes widened to halfwords via their sign mask. */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+    /* mask1 is mask0 with every index advanced by two bytes. */
+    mask1 = mask0 + 2;
+    /* const_vec: 128 << 6 in every halfword lane. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    /* Broadcast weight, offset and rounding value to all word lanes. */
+    weight_vec = __msa_fill_w(weight);
+    offset_vec = __msa_fill_w(offset);
+    rnd_vec = __msa_fill_w(rnd_val);
+    /* width >> 3 strips of 8 columns; width % 8 columns are not covered. */
+    for (cnt = width >> 3; cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+        /* Prime the vertical filter with the strip's first three rows. */
+        LD_SB3(src_tmp, src_stride, src0, src1, src2);
+        src_tmp += (3 * src_stride);
+        XORI_B3_128_SB(src0, src1, src2);
+        /* Horizontal pass on those rows, accumulated on top of const_vec. */
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+        /* height >> 2 passes of 4 rows; height % 4 rows are not written. */
+        for (loop_cnt = height >> 2; loop_cnt--;) {
+            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
+            src_tmp += (4 * src_stride);
+            XORI_B4_128_SB(src3, src4, src5, src6);
+            /* Per row: horizontal pass, pair with previous row, 4-tap, >> 6. */
+            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+            dst3 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+            dst0_r >>= 6;
+            dst0_l >>= 6;
+
+            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+            dst4 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+            dst1_r >>= 6;
+            dst1_l >>= 6;
+            /* Below, dst10_* / dst2 / dst21_* are recycled for the next pass. */
+            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+            dst5 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+            ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
+            dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+            dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
+            dst2_r >>= 6;
+            dst2_l >>= 6;
+
+            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+            dst2 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+            ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
+            dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+            dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
+            dst3_r >>= 6;
+            dst3_l >>= 6;
+            /* HEVC_HV_UNIW_RND_CLIP4 on all rows, then pack and store 8x4. */
+            HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l,
+                                   weight_vec, offset_vec, rnd_vec,
+                                   dst0_r, dst1_r, dst0_l, dst1_l);
+            HEVC_HV_UNIW_RND_CLIP4(dst2_r, dst3_r, dst2_l, dst3_l,
+                                   weight_vec, offset_vec, rnd_vec,
+                                   dst2_r, dst3_r, dst2_l, dst3_l);
+            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
+                            dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
+            ST8x4_UB(dst0_r, dst1_r, dst_tmp, dst_stride);
+            dst_tmp += (4 * dst_stride);
+        }
+        /* Advance to the next 8-column strip. */
+        src += 8;
+        dst += 8;
+    }
+}
+
+/* 8-column front end of the uni-weighted 4-tap H+V filter. Heights 2 and 6
+ * have dedicated kernels, multiples of 4 use the strip kernel with width 8;
+ * any other height is left untouched. */
+static void hevc_hv_uniwgt_4t_8w_msa(uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter_x,
+                                     const int8_t *filter_y,
+                                     int32_t height, int32_t weight,
+                                     int32_t offset, int32_t rnd_val)
+{
+    switch (height) {
+    case 2:
+        hevc_hv_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter_x,
+                                  filter_y, height, weight, offset, rnd_val);
+        break;
+    case 6:
+        hevc_hv_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter_x,
+                                  filter_y, height, weight, offset, rnd_val);
+        break;
+    default:
+        if (!(height % 4))
+            hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                                              filter_x, filter_y, height,
+                                              weight, offset, rnd_val, 8);
+    }
+}
+
+/* 12-column uni-weighted 4-tap H+V filter, built from one 8-column strip
+ * followed by the 4-column dispatcher on the remaining columns. */
+static void hevc_hv_uniwgt_4t_12w_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height, int32_t weight,
+                                      int32_t offset, int32_t rnd_val)
+{
+    uint8_t *src_tail = src + 8, *dst_tail = dst + 8; /* columns 8..11 */
+
+    hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height, weight,
+                                      offset, rnd_val, 8);
+    hevc_hv_uniwgt_4t_4w_msa(src_tail, src_stride, dst_tail, dst_stride,
+                             filter_x, filter_y, height, weight,
+                             offset, rnd_val);
+}
+
+/* 16-column uni-weighted 4-tap H+V filter: forwards to the 8-column strip
+ * kernel with the block width fixed at 16. */
+static void hevc_hv_uniwgt_4t_16w_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height, int32_t weight,
+                                      int32_t offset, int32_t rnd_val)
+{
+    const int32_t block_width = 16;
+
+    hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height, weight,
+                                      offset, rnd_val, block_width);
+}
+
+/* 24-column uni-weighted 4-tap H+V filter: forwards to the 8-column strip
+ * kernel with the block width fixed at 24. */
+static void hevc_hv_uniwgt_4t_24w_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height, int32_t weight,
+                                      int32_t offset, int32_t rnd_val)
+{
+    const int32_t block_width = 24;
+
+    hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height, weight,
+                                      offset, rnd_val, block_width);
+}
+
+/* 32-column uni-weighted 4-tap H+V filter: forwards to the 8-column strip
+ * kernel with the block width fixed at 32. */
+static void hevc_hv_uniwgt_4t_32w_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter_x,
+                                      const int8_t *filter_y,
+                                      int32_t height, int32_t weight,
+                                      int32_t offset, int32_t rnd_val)
+{
+    const int32_t block_width = 32;
+
+    hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_x, filter_y, height, weight,
+                                      offset, rnd_val, block_width);
+}
+
+#define UNIWGT_MC_COPY(WIDTH)                                                \
+void ff_hevc_put_hevc_uni_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
+                                                      ptrdiff_t dst_stride,  \
+                                                      uint8_t *src,          \
+                                                      ptrdiff_t src_stride,  \
+                                                      int height,            \
+                                                      int denom,             \
+                                                      int weight,            \
+                                                      int offset,            \
+                                                      intptr_t mx,           \
+                                                      intptr_t my,           \
+                                                      int width)             \
+{ /* mx, my and width are accepted but not used by this wrapper */           \
+    int shift = denom + 14 - 8; /* forwarded as the last kernel argument */  \
+    hevc_uniwgt_copy_##WIDTH##w_msa(src, src_stride, dst, dst_stride,        \
+                                    height, weight, offset, shift);          \
+}
+/* Emit ff_hevc_put_hevc_uni_w_pel_pixels<WIDTH>_8_msa for each block width. */
+UNIWGT_MC_COPY(4);
+UNIWGT_MC_COPY(6);
+UNIWGT_MC_COPY(8);
+UNIWGT_MC_COPY(12);
+UNIWGT_MC_COPY(16);
+UNIWGT_MC_COPY(24);
+UNIWGT_MC_COPY(32);
+UNIWGT_MC_COPY(48);
+UNIWGT_MC_COPY(64);
+
+#undef UNIWGT_MC_COPY
+
+/*
+ * Emit ff_hevc_put_hevc_uni_w_<PEL>_<DIR><WIDTH>_8_msa(): look up the filter
+ * row at index FILT_DIR - 1 (FILT_DIR being mx or my), derive the shift from
+ * denom and forward to hevc_<DIR1>_uniwgt_<TAP>t_<WIDTH>w_msa().  width and
+ * the other of mx/my are not read by the generated body.
+ */
+#define UNI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                        \
+void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(                     \
+    uint8_t *dst, ptrdiff_t dst_stride,                                       \
+    uint8_t *src, ptrdiff_t src_stride,                                       \
+    int height, int denom, int weight, int offset,                            \
+    intptr_t mx, intptr_t my, int width)                                      \
+{                                                                             \
+    /* NOTE(review): 14 - 8 presumably ties to the 8-bit sample depth */      \
+    const int rnd_shift = denom + 14 - 8;                                     \
+    const int8_t *coeffs = ff_hevc_##PEL##_filters[FILT_DIR - 1];             \
+                                                                              \
+    hevc_##DIR1##_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride,             \
+                                                 dst, dst_stride,             \
+                                                 coeffs, height,              \
+                                                 weight, offset, rnd_shift);  \
+}
+/* One wrapper per filter/width; no ';' since each use is a full definition. */
+UNI_W_MC(qpel, h, 4, 8, hz, mx)
+UNI_W_MC(qpel, h, 8, 8, hz, mx)
+UNI_W_MC(qpel, h, 12, 8, hz, mx)
+UNI_W_MC(qpel, h, 16, 8, hz, mx)
+UNI_W_MC(qpel, h, 24, 8, hz, mx)
+UNI_W_MC(qpel, h, 32, 8, hz, mx)
+UNI_W_MC(qpel, h, 48, 8, hz, mx)
+UNI_W_MC(qpel, h, 64, 8, hz, mx)
+
+UNI_W_MC(qpel, v, 4, 8, vt, my)
+UNI_W_MC(qpel, v, 8, 8, vt, my)
+UNI_W_MC(qpel, v, 12, 8, vt, my)
+UNI_W_MC(qpel, v, 16, 8, vt, my)
+UNI_W_MC(qpel, v, 24, 8, vt, my)
+UNI_W_MC(qpel, v, 32, 8, vt, my)
+UNI_W_MC(qpel, v, 48, 8, vt, my)
+UNI_W_MC(qpel, v, 64, 8, vt, my)
+
+UNI_W_MC(epel, h, 4, 4, hz, mx)
+UNI_W_MC(epel, h, 6, 4, hz, mx)
+UNI_W_MC(epel, h, 8, 4, hz, mx)
+UNI_W_MC(epel, h, 12, 4, hz, mx)
+UNI_W_MC(epel, h, 16, 4, hz, mx)
+UNI_W_MC(epel, h, 24, 4, hz, mx)
+UNI_W_MC(epel, h, 32, 4, hz, mx)
+
+UNI_W_MC(epel, v, 4, 4, vt, my)
+UNI_W_MC(epel, v, 6, 4, vt, my)
+UNI_W_MC(epel, v, 8, 4, vt, my)
+UNI_W_MC(epel, v, 12, 4, vt, my)
+UNI_W_MC(epel, v, 16, 4, vt, my)
+UNI_W_MC(epel, v, 24, 4, vt, my)
+UNI_W_MC(epel, v, 32, 4, vt, my)
+
+#undef UNI_W_MC
+
+/*
+ * Emit ff_hevc_put_hevc_uni_w_<PEL>_<DIR><WIDTH>_8_msa() with one filter row
+ * per axis: rows mx - 1 (x) and my - 1 (y) of ff_hevc_<PEL>_filters are
+ * handed, together with the shift derived from denom, to
+ * hevc_<DIR1>_uniwgt_<TAP>t_<WIDTH>w_msa().  width is not read by the body.
+ */
+#define UNI_W_MC_HV(PEL, DIR, WIDTH, TAP, DIR1)                              \
+void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(                    \
+    uint8_t *dst, ptrdiff_t dst_stride,                                      \
+    uint8_t *src, ptrdiff_t src_stride,                                      \
+    int height, int denom, int weight, int offset,                           \
+    intptr_t mx, intptr_t my, int width)                                     \
+{                                                                            \
+    /* NOTE(review): 14 - 8 presumably ties to the 8-bit sample depth */     \
+    const int rnd_shift = denom + 14 - 8;                                    \
+    const int8_t *coeffs_x = ff_hevc_##PEL##_filters[mx - 1];                \
+    const int8_t *coeffs_y = ff_hevc_##PEL##_filters[my - 1];                \
+                                                                             \
+    hevc_##DIR1##_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride,            \
+                                                 dst, dst_stride,            \
+                                                 coeffs_x, coeffs_y,         \
+                                                 height, weight,             \
+                                                 offset, rnd_shift);         \
+}
+/* One wrapper per block width; no ';' since each use is a full definition. */
+UNI_W_MC_HV(qpel, hv, 4, 8, hv)
+UNI_W_MC_HV(qpel, hv, 8, 8, hv)
+UNI_W_MC_HV(qpel, hv, 12, 8, hv)
+UNI_W_MC_HV(qpel, hv, 16, 8, hv)
+UNI_W_MC_HV(qpel, hv, 24, 8, hv)
+UNI_W_MC_HV(qpel, hv, 32, 8, hv)
+UNI_W_MC_HV(qpel, hv, 48, 8, hv)
+UNI_W_MC_HV(qpel, hv, 64, 8, hv)
+
+UNI_W_MC_HV(epel, hv, 4, 4, hv)
+UNI_W_MC_HV(epel, hv, 6, 4, hv)
+UNI_W_MC_HV(epel, hv, 8, 4, hv)
+UNI_W_MC_HV(epel, hv, 12, 4, hv)
+UNI_W_MC_HV(epel, hv, 16, 4, hv)
+UNI_W_MC_HV(epel, hv, 24, 4, hv)
+UNI_W_MC_HV(epel, hv, 32, 4, hv)
+
+#undef UNI_W_MC_HV
diff --git a/libavcodec/mips/hevcdsp_init_mips.c b/libavcodec/mips/hevcdsp_init_mips.c
new file mode 100644
index 0000000..3675b93
--- /dev/null
+++ b/libavcodec/mips/hevcdsp_init_mips.c
@@ -0,0 +1,454 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/mips/hevcdsp_mips.h"
+
+#if HAVE_MSA
+/*
+ * Install the 8-bit MSA routines into the HEVC DSP context; any bit depth
+ * other than 8 leaves the context untouched.  In the put_hevc_* tables the
+ * first index 1..9 pairs with routine widths 4/6/8/12/16/24/32/48/64.
+ */
+static av_cold void hevc_dsp_init_msa(HEVCDSPContext *c, const int bit_depth)
+{
+    if (bit_depth != 8)
+        return;
+
+    /* last two indices: [0][0] pel_pixels, [0][1] h, [1][0] v, [1][1] hv */
+    c->put_hevc_qpel[1][0][0] = ff_hevc_put_hevc_pel_pixels4_8_msa;
+    c->put_hevc_qpel[2][0][0] = ff_hevc_put_hevc_pel_pixels6_8_msa;
+    c->put_hevc_qpel[3][0][0] = ff_hevc_put_hevc_pel_pixels8_8_msa;
+    c->put_hevc_qpel[4][0][0] = ff_hevc_put_hevc_pel_pixels12_8_msa;
+    c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_8_msa;
+    c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_8_msa;
+    c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_msa;
+    c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_msa;
+    c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_msa;
+
+    c->put_hevc_qpel[1][0][1] = ff_hevc_put_hevc_qpel_h4_8_msa;
+    c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_msa;
+    c->put_hevc_qpel[4][0][1] = ff_hevc_put_hevc_qpel_h12_8_msa;
+    c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_8_msa;
+    c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_8_msa;
+    c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_msa;
+    c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_msa;
+    c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_msa;
+
+    c->put_hevc_qpel[1][1][0] = ff_hevc_put_hevc_qpel_v4_8_msa;
+    c->put_hevc_qpel[3][1][0] = ff_hevc_put_hevc_qpel_v8_8_msa;
+    c->put_hevc_qpel[4][1][0] = ff_hevc_put_hevc_qpel_v12_8_msa;
+    c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_8_msa;
+    c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_8_msa;
+    c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_msa;
+    c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_msa;
+    c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_msa;
+
+    c->put_hevc_qpel[1][1][1] = ff_hevc_put_hevc_qpel_hv4_8_msa;
+    c->put_hevc_qpel[3][1][1] = ff_hevc_put_hevc_qpel_hv8_8_msa;
+    c->put_hevc_qpel[4][1][1] = ff_hevc_put_hevc_qpel_hv12_8_msa;
+    c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_8_msa;
+    c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_8_msa;
+    c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_8_msa;
+    c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_8_msa;
+    c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_8_msa;
+
+    c->put_hevc_epel[1][0][0] = ff_hevc_put_hevc_pel_pixels4_8_msa;
+    c->put_hevc_epel[2][0][0] = ff_hevc_put_hevc_pel_pixels6_8_msa;
+    c->put_hevc_epel[3][0][0] = ff_hevc_put_hevc_pel_pixels8_8_msa;
+    c->put_hevc_epel[4][0][0] = ff_hevc_put_hevc_pel_pixels12_8_msa;
+    c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_8_msa;
+    c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_8_msa;
+    c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_msa;
+
+    c->put_hevc_epel[1][0][1] = ff_hevc_put_hevc_epel_h4_8_msa;
+    c->put_hevc_epel[2][0][1] = ff_hevc_put_hevc_epel_h6_8_msa;
+    c->put_hevc_epel[3][0][1] = ff_hevc_put_hevc_epel_h8_8_msa;
+    c->put_hevc_epel[4][0][1] = ff_hevc_put_hevc_epel_h12_8_msa;
+    c->put_hevc_epel[5][0][1] = ff_hevc_put_hevc_epel_h16_8_msa;
+    c->put_hevc_epel[6][0][1] = ff_hevc_put_hevc_epel_h24_8_msa;
+    c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_msa;
+
+    c->put_hevc_epel[1][1][0] = ff_hevc_put_hevc_epel_v4_8_msa;
+    c->put_hevc_epel[2][1][0] = ff_hevc_put_hevc_epel_v6_8_msa;
+    c->put_hevc_epel[3][1][0] = ff_hevc_put_hevc_epel_v8_8_msa;
+    c->put_hevc_epel[4][1][0] = ff_hevc_put_hevc_epel_v12_8_msa;
+    c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_8_msa;
+    c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_8_msa;
+    c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_msa;
+
+    c->put_hevc_epel[1][1][1] = ff_hevc_put_hevc_epel_hv4_8_msa;
+    c->put_hevc_epel[2][1][1] = ff_hevc_put_hevc_epel_hv6_8_msa;
+    c->put_hevc_epel[3][1][1] = ff_hevc_put_hevc_epel_hv8_8_msa;
+    c->put_hevc_epel[4][1][1] = ff_hevc_put_hevc_epel_hv12_8_msa;
+    c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_8_msa;
+    c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_8_msa;
+    c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_msa;
+
+    /* uni tables; note that slots [1][0][0] and [2][0][0] are not set here */
+    c->put_hevc_qpel_uni[3][0][0] = ff_hevc_put_hevc_uni_pel_pixels8_8_msa;
+    c->put_hevc_qpel_uni[4][0][0] = ff_hevc_put_hevc_uni_pel_pixels12_8_msa;
+    c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels16_8_msa;
+    c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels24_8_msa;
+    c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_msa;
+    c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_msa;
+    c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_msa;
+
+    c->put_hevc_qpel_uni[1][0][1] = ff_hevc_put_hevc_uni_qpel_h4_8_msa;
+    c->put_hevc_qpel_uni[3][0][1] = ff_hevc_put_hevc_uni_qpel_h8_8_msa;
+    c->put_hevc_qpel_uni[4][0][1] = ff_hevc_put_hevc_uni_qpel_h12_8_msa;
+    c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_8_msa;
+    c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_8_msa;
+    c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_msa;
+    c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_msa;
+    c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_msa;
+
+    c->put_hevc_qpel_uni[1][1][0] = ff_hevc_put_hevc_uni_qpel_v4_8_msa;
+    c->put_hevc_qpel_uni[3][1][0] = ff_hevc_put_hevc_uni_qpel_v8_8_msa;
+    c->put_hevc_qpel_uni[4][1][0] = ff_hevc_put_hevc_uni_qpel_v12_8_msa;
+    c->put_hevc_qpel_uni[5][1][0] = ff_hevc_put_hevc_uni_qpel_v16_8_msa;
+    c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_8_msa;
+    c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_msa;
+    c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_msa;
+    c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_msa;
+
+    c->put_hevc_qpel_uni[1][1][1] = ff_hevc_put_hevc_uni_qpel_hv4_8_msa;
+    c->put_hevc_qpel_uni[3][1][1] = ff_hevc_put_hevc_uni_qpel_hv8_8_msa;
+    c->put_hevc_qpel_uni[4][1][1] = ff_hevc_put_hevc_uni_qpel_hv12_8_msa;
+    c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_8_msa;
+    c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_8_msa;
+    c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_8_msa;
+    c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_8_msa;
+    c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_8_msa;
+
+    c->put_hevc_epel_uni[3][0][0] = ff_hevc_put_hevc_uni_pel_pixels8_8_msa;
+    c->put_hevc_epel_uni[4][0][0] = ff_hevc_put_hevc_uni_pel_pixels12_8_msa;
+    c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels16_8_msa;
+    c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels24_8_msa;
+    c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_msa;
+
+    c->put_hevc_epel_uni[1][0][1] = ff_hevc_put_hevc_uni_epel_h4_8_msa;
+    c->put_hevc_epel_uni[2][0][1] = ff_hevc_put_hevc_uni_epel_h6_8_msa;
+    c->put_hevc_epel_uni[3][0][1] = ff_hevc_put_hevc_uni_epel_h8_8_msa;
+    c->put_hevc_epel_uni[4][0][1] = ff_hevc_put_hevc_uni_epel_h12_8_msa;
+    c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_8_msa;
+    c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_8_msa;
+    c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_msa;
+
+    c->put_hevc_epel_uni[1][1][0] = ff_hevc_put_hevc_uni_epel_v4_8_msa;
+    c->put_hevc_epel_uni[2][1][0] = ff_hevc_put_hevc_uni_epel_v6_8_msa;
+    c->put_hevc_epel_uni[3][1][0] = ff_hevc_put_hevc_uni_epel_v8_8_msa;
+    c->put_hevc_epel_uni[4][1][0] = ff_hevc_put_hevc_uni_epel_v12_8_msa;
+    c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_8_msa;
+    c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_8_msa;
+    c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_msa;
+
+    c->put_hevc_epel_uni[1][1][1] = ff_hevc_put_hevc_uni_epel_hv4_8_msa;
+    c->put_hevc_epel_uni[2][1][1] = ff_hevc_put_hevc_uni_epel_hv6_8_msa;
+    c->put_hevc_epel_uni[3][1][1] = ff_hevc_put_hevc_uni_epel_hv8_8_msa;
+    c->put_hevc_epel_uni[4][1][1] = ff_hevc_put_hevc_uni_epel_hv12_8_msa;
+    c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_8_msa;
+    c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_8_msa;
+    c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_msa;
+
+    /* put_hevc_{qpel,epel}_uni_w tables */
+    c->put_hevc_qpel_uni_w[1][0][0] =
+        ff_hevc_put_hevc_uni_w_pel_pixels4_8_msa;
+    c->put_hevc_qpel_uni_w[3][0][0] =
+        ff_hevc_put_hevc_uni_w_pel_pixels8_8_msa;
+    c->put_hevc_qpel_uni_w[4][0][0] =
+        ff_hevc_put_hevc_uni_w_pel_pixels12_8_msa;
+    c->put_hevc_qpel_uni_w[5][0][0] =
+        ff_hevc_put_hevc_uni_w_pel_pixels16_8_msa;
+    c->put_hevc_qpel_uni_w[6][0][0] =
+        ff_hevc_put_hevc_uni_w_pel_pixels24_8_msa;
+    c->put_hevc_qpel_uni_w[7][0][0] =
+        ff_hevc_put_hevc_uni_w_pel_pixels32_8_msa;
+    c->put_hevc_qpel_uni_w[8][0][0] =
+        ff_hevc_put_hevc_uni_w_pel_pixels48_8_msa;
+    c->put_hevc_qpel_uni_w[9][0][0] =
+        ff_hevc_put_hevc_uni_w_pel_pixels64_8_msa;
+
+    c->put_hevc_qpel_uni_w[1][0][1] = ff_hevc_put_hevc_uni_w_qpel_h4_8_msa;
+    c->put_hevc_qpel_uni_w[3][0][1] = ff_hevc_put_hevc_uni_w_qpel_h8_8_msa;
+    c->put_hevc_qpel_uni_w[4][0][1] = ff_hevc_put_hevc_uni_w_qpel_h12_8_msa;
+    c->put_hevc_qpel_uni_w[5][0][1] = ff_hevc_put_hevc_uni_w_qpel_h16_8_msa;
+    c->put_hevc_qpel_uni_w[6][0][1] = ff_hevc_put_hevc_uni_w_qpel_h24_8_msa;
+    c->put_hevc_qpel_uni_w[7][0][1] = ff_hevc_put_hevc_uni_w_qpel_h32_8_msa;
+    c->put_hevc_qpel_uni_w[8][0][1] = ff_hevc_put_hevc_uni_w_qpel_h48_8_msa;
+    c->put_hevc_qpel_uni_w[9][0][1] = ff_hevc_put_hevc_uni_w_qpel_h64_8_msa;
+
+    c->put_hevc_qpel_uni_w[1][1][0] = ff_hevc_put_hevc_uni_w_qpel_v4_8_msa;
+    c->put_hevc_qpel_uni_w[3][1][0] = ff_hevc_put_hevc_uni_w_qpel_v8_8_msa;
+    c->put_hevc_qpel_uni_w[4][1][0] = ff_hevc_put_hevc_uni_w_qpel_v12_8_msa;
+    c->put_hevc_qpel_uni_w[5][1][0] = ff_hevc_put_hevc_uni_w_qpel_v16_8_msa;
+    c->put_hevc_qpel_uni_w[6][1][0] = ff_hevc_put_hevc_uni_w_qpel_v24_8_msa;
+    c->put_hevc_qpel_uni_w[7][1][0] = ff_hevc_put_hevc_uni_w_qpel_v32_8_msa;
+    c->put_hevc_qpel_uni_w[8][1][0] = ff_hevc_put_hevc_uni_w_qpel_v48_8_msa;
+    c->put_hevc_qpel_uni_w[9][1][0] = ff_hevc_put_hevc_uni_w_qpel_v64_8_msa;
+
+    c->put_hevc_qpel_uni_w[1][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv4_8_msa;
+    c->put_hevc_qpel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv8_8_msa;
+    c->put_hevc_qpel_uni_w[4][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv12_8_msa;
+    c->put_hevc_qpel_uni_w[5][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv16_8_msa;
+    c->put_hevc_qpel_uni_w[6][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv24_8_msa;
+    c->put_hevc_qpel_uni_w[7][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv32_8_msa;
+    c->put_hevc_qpel_uni_w[8][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv48_8_msa;
+    c->put_hevc_qpel_uni_w[9][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv64_8_msa;
+
+    c->put_hevc_epel_uni_w[1][0][0] =
+        ff_hevc_put_hevc_uni_w_pel_pixels4_8_msa;
+    c->put_hevc_epel_uni_w[2][0][0] =
+        ff_hevc_put_hevc_uni_w_pel_pixels6_8_msa;
+    c->put_hevc_epel_uni_w[3][0][0] =
+        ff_hevc_put_hevc_uni_w_pel_pixels8_8_msa;
+    c->put_hevc_epel_uni_w[4][0][0] =
+        ff_hevc_put_hevc_uni_w_pel_pixels12_8_msa;
+    c->put_hevc_epel_uni_w[5][0][0] =
+        ff_hevc_put_hevc_uni_w_pel_pixels16_8_msa;
+    c->put_hevc_epel_uni_w[6][0][0] =
+        ff_hevc_put_hevc_uni_w_pel_pixels24_8_msa;
+    c->put_hevc_epel_uni_w[7][0][0] =
+        ff_hevc_put_hevc_uni_w_pel_pixels32_8_msa;
+
+    c->put_hevc_epel_uni_w[1][0][1] = ff_hevc_put_hevc_uni_w_epel_h4_8_msa;
+    c->put_hevc_epel_uni_w[2][0][1] = ff_hevc_put_hevc_uni_w_epel_h6_8_msa;
+    c->put_hevc_epel_uni_w[3][0][1] = ff_hevc_put_hevc_uni_w_epel_h8_8_msa;
+    c->put_hevc_epel_uni_w[4][0][1] = ff_hevc_put_hevc_uni_w_epel_h12_8_msa;
+    c->put_hevc_epel_uni_w[5][0][1] = ff_hevc_put_hevc_uni_w_epel_h16_8_msa;
+    c->put_hevc_epel_uni_w[6][0][1] = ff_hevc_put_hevc_uni_w_epel_h24_8_msa;
+    c->put_hevc_epel_uni_w[7][0][1] = ff_hevc_put_hevc_uni_w_epel_h32_8_msa;
+
+    c->put_hevc_epel_uni_w[1][1][0] = ff_hevc_put_hevc_uni_w_epel_v4_8_msa;
+    c->put_hevc_epel_uni_w[2][1][0] = ff_hevc_put_hevc_uni_w_epel_v6_8_msa;
+    c->put_hevc_epel_uni_w[3][1][0] = ff_hevc_put_hevc_uni_w_epel_v8_8_msa;
+    c->put_hevc_epel_uni_w[4][1][0] = ff_hevc_put_hevc_uni_w_epel_v12_8_msa;
+    c->put_hevc_epel_uni_w[5][1][0] = ff_hevc_put_hevc_uni_w_epel_v16_8_msa;
+    c->put_hevc_epel_uni_w[6][1][0] = ff_hevc_put_hevc_uni_w_epel_v24_8_msa;
+    c->put_hevc_epel_uni_w[7][1][0] = ff_hevc_put_hevc_uni_w_epel_v32_8_msa;
+
+    c->put_hevc_epel_uni_w[1][1][1] = ff_hevc_put_hevc_uni_w_epel_hv4_8_msa;
+    c->put_hevc_epel_uni_w[2][1][1] = ff_hevc_put_hevc_uni_w_epel_hv6_8_msa;
+    c->put_hevc_epel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_epel_hv8_8_msa;
+    c->put_hevc_epel_uni_w[4][1][1] = ff_hevc_put_hevc_uni_w_epel_hv12_8_msa;
+    c->put_hevc_epel_uni_w[5][1][1] = ff_hevc_put_hevc_uni_w_epel_hv16_8_msa;
+    c->put_hevc_epel_uni_w[6][1][1] = ff_hevc_put_hevc_uni_w_epel_hv24_8_msa;
+    c->put_hevc_epel_uni_w[7][1][1] = ff_hevc_put_hevc_uni_w_epel_hv32_8_msa;
+
+    /* put_hevc_{qpel,epel}_bi tables */
+    c->put_hevc_qpel_bi[1][0][0] = ff_hevc_put_hevc_bi_pel_pixels4_8_msa;
+    c->put_hevc_qpel_bi[3][0][0] = ff_hevc_put_hevc_bi_pel_pixels8_8_msa;
+    c->put_hevc_qpel_bi[4][0][0] = ff_hevc_put_hevc_bi_pel_pixels12_8_msa;
+    c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_8_msa;
+    c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_8_msa;
+    c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_msa;
+    c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_msa;
+    c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_msa;
+
+    c->put_hevc_qpel_bi[1][0][1] = ff_hevc_put_hevc_bi_qpel_h4_8_msa;
+    c->put_hevc_qpel_bi[3][0][1] = ff_hevc_put_hevc_bi_qpel_h8_8_msa;
+    c->put_hevc_qpel_bi[4][0][1] = ff_hevc_put_hevc_bi_qpel_h12_8_msa;
+    c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_8_msa;
+    c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_8_msa;
+    c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_msa;
+    c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_msa;
+    c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_msa;
+
+    c->put_hevc_qpel_bi[1][1][0] = ff_hevc_put_hevc_bi_qpel_v4_8_msa;
+    c->put_hevc_qpel_bi[3][1][0] = ff_hevc_put_hevc_bi_qpel_v8_8_msa;
+    c->put_hevc_qpel_bi[4][1][0] = ff_hevc_put_hevc_bi_qpel_v12_8_msa;
+    c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_8_msa;
+    c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_8_msa;
+    c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_msa;
+    c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_msa;
+    c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_msa;
+
+    c->put_hevc_qpel_bi[1][1][1] = ff_hevc_put_hevc_bi_qpel_hv4_8_msa;
+    c->put_hevc_qpel_bi[3][1][1] = ff_hevc_put_hevc_bi_qpel_hv8_8_msa;
+    c->put_hevc_qpel_bi[4][1][1] = ff_hevc_put_hevc_bi_qpel_hv12_8_msa;
+    c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_8_msa;
+    c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_8_msa;
+    c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_8_msa;
+    c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_8_msa;
+    c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_8_msa;
+
+    c->put_hevc_epel_bi[1][0][0] = ff_hevc_put_hevc_bi_pel_pixels4_8_msa;
+    c->put_hevc_epel_bi[2][0][0] = ff_hevc_put_hevc_bi_pel_pixels6_8_msa;
+    c->put_hevc_epel_bi[3][0][0] = ff_hevc_put_hevc_bi_pel_pixels8_8_msa;
+    c->put_hevc_epel_bi[4][0][0] = ff_hevc_put_hevc_bi_pel_pixels12_8_msa;
+    c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_8_msa;
+    c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_8_msa;
+    c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_msa;
+
+    c->put_hevc_epel_bi[1][0][1] = ff_hevc_put_hevc_bi_epel_h4_8_msa;
+    c->put_hevc_epel_bi[2][0][1] = ff_hevc_put_hevc_bi_epel_h6_8_msa;
+    c->put_hevc_epel_bi[3][0][1] = ff_hevc_put_hevc_bi_epel_h8_8_msa;
+    c->put_hevc_epel_bi[4][0][1] = ff_hevc_put_hevc_bi_epel_h12_8_msa;
+    c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_8_msa;
+    c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_8_msa;
+    c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_msa;
+
+    c->put_hevc_epel_bi[1][1][0] = ff_hevc_put_hevc_bi_epel_v4_8_msa;
+    c->put_hevc_epel_bi[2][1][0] = ff_hevc_put_hevc_bi_epel_v6_8_msa;
+    c->put_hevc_epel_bi[3][1][0] = ff_hevc_put_hevc_bi_epel_v8_8_msa;
+    c->put_hevc_epel_bi[4][1][0] = ff_hevc_put_hevc_bi_epel_v12_8_msa;
+    c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_8_msa;
+    c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_8_msa;
+    c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_msa;
+
+    c->put_hevc_epel_bi[1][1][1] = ff_hevc_put_hevc_bi_epel_hv4_8_msa;
+    c->put_hevc_epel_bi[2][1][1] = ff_hevc_put_hevc_bi_epel_hv6_8_msa;
+    c->put_hevc_epel_bi[3][1][1] = ff_hevc_put_hevc_bi_epel_hv8_8_msa;
+    c->put_hevc_epel_bi[4][1][1] = ff_hevc_put_hevc_bi_epel_hv12_8_msa;
+    c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_8_msa;
+    c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_8_msa;
+    c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_msa;
+
+    /* put_hevc_{qpel,epel}_bi_w tables */
+    c->put_hevc_qpel_bi_w[1][0][0] =
+        ff_hevc_put_hevc_bi_w_pel_pixels4_8_msa;
+    c->put_hevc_qpel_bi_w[3][0][0] =
+        ff_hevc_put_hevc_bi_w_pel_pixels8_8_msa;
+    c->put_hevc_qpel_bi_w[4][0][0] =
+        ff_hevc_put_hevc_bi_w_pel_pixels12_8_msa;
+    c->put_hevc_qpel_bi_w[5][0][0] =
+        ff_hevc_put_hevc_bi_w_pel_pixels16_8_msa;
+    c->put_hevc_qpel_bi_w[6][0][0] =
+        ff_hevc_put_hevc_bi_w_pel_pixels24_8_msa;
+    c->put_hevc_qpel_bi_w[7][0][0] =
+        ff_hevc_put_hevc_bi_w_pel_pixels32_8_msa;
+    c->put_hevc_qpel_bi_w[8][0][0] =
+        ff_hevc_put_hevc_bi_w_pel_pixels48_8_msa;
+    c->put_hevc_qpel_bi_w[9][0][0] =
+        ff_hevc_put_hevc_bi_w_pel_pixels64_8_msa;
+
+    c->put_hevc_qpel_bi_w[1][0][1] = ff_hevc_put_hevc_bi_w_qpel_h4_8_msa;
+    c->put_hevc_qpel_bi_w[3][0][1] = ff_hevc_put_hevc_bi_w_qpel_h8_8_msa;
+    c->put_hevc_qpel_bi_w[4][0][1] = ff_hevc_put_hevc_bi_w_qpel_h12_8_msa;
+    c->put_hevc_qpel_bi_w[5][0][1] = ff_hevc_put_hevc_bi_w_qpel_h16_8_msa;
+    c->put_hevc_qpel_bi_w[6][0][1] = ff_hevc_put_hevc_bi_w_qpel_h24_8_msa;
+    c->put_hevc_qpel_bi_w[7][0][1] = ff_hevc_put_hevc_bi_w_qpel_h32_8_msa;
+    c->put_hevc_qpel_bi_w[8][0][1] = ff_hevc_put_hevc_bi_w_qpel_h48_8_msa;
+    c->put_hevc_qpel_bi_w[9][0][1] = ff_hevc_put_hevc_bi_w_qpel_h64_8_msa;
+
+    c->put_hevc_qpel_bi_w[1][1][0] = ff_hevc_put_hevc_bi_w_qpel_v4_8_msa;
+    c->put_hevc_qpel_bi_w[3][1][0] = ff_hevc_put_hevc_bi_w_qpel_v8_8_msa;
+    c->put_hevc_qpel_bi_w[4][1][0] = ff_hevc_put_hevc_bi_w_qpel_v12_8_msa;
+    c->put_hevc_qpel_bi_w[5][1][0] = ff_hevc_put_hevc_bi_w_qpel_v16_8_msa;
+    c->put_hevc_qpel_bi_w[6][1][0] = ff_hevc_put_hevc_bi_w_qpel_v24_8_msa;
+    c->put_hevc_qpel_bi_w[7][1][0] = ff_hevc_put_hevc_bi_w_qpel_v32_8_msa;
+    c->put_hevc_qpel_bi_w[8][1][0] = ff_hevc_put_hevc_bi_w_qpel_v48_8_msa;
+    c->put_hevc_qpel_bi_w[9][1][0] = ff_hevc_put_hevc_bi_w_qpel_v64_8_msa;
+
+    c->put_hevc_qpel_bi_w[1][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv4_8_msa;
+    c->put_hevc_qpel_bi_w[3][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv8_8_msa;
+    c->put_hevc_qpel_bi_w[4][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv12_8_msa;
+    c->put_hevc_qpel_bi_w[5][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv16_8_msa;
+    c->put_hevc_qpel_bi_w[6][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv24_8_msa;
+    c->put_hevc_qpel_bi_w[7][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv32_8_msa;
+    c->put_hevc_qpel_bi_w[8][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv48_8_msa;
+    c->put_hevc_qpel_bi_w[9][1][1] = ff_hevc_put_hevc_bi_w_qpel_hv64_8_msa;
+
+    c->put_hevc_epel_bi_w[1][0][0] =
+        ff_hevc_put_hevc_bi_w_pel_pixels4_8_msa;
+    c->put_hevc_epel_bi_w[2][0][0] =
+        ff_hevc_put_hevc_bi_w_pel_pixels6_8_msa;
+    c->put_hevc_epel_bi_w[3][0][0] =
+        ff_hevc_put_hevc_bi_w_pel_pixels8_8_msa;
+    c->put_hevc_epel_bi_w[4][0][0] =
+        ff_hevc_put_hevc_bi_w_pel_pixels12_8_msa;
+    c->put_hevc_epel_bi_w[5][0][0] =
+        ff_hevc_put_hevc_bi_w_pel_pixels16_8_msa;
+    c->put_hevc_epel_bi_w[6][0][0] =
+        ff_hevc_put_hevc_bi_w_pel_pixels24_8_msa;
+    c->put_hevc_epel_bi_w[7][0][0] =
+        ff_hevc_put_hevc_bi_w_pel_pixels32_8_msa;
+
+    c->put_hevc_epel_bi_w[1][0][1] = ff_hevc_put_hevc_bi_w_epel_h4_8_msa;
+    c->put_hevc_epel_bi_w[2][0][1] = ff_hevc_put_hevc_bi_w_epel_h6_8_msa;
+    c->put_hevc_epel_bi_w[3][0][1] = ff_hevc_put_hevc_bi_w_epel_h8_8_msa;
+    c->put_hevc_epel_bi_w[4][0][1] = ff_hevc_put_hevc_bi_w_epel_h12_8_msa;
+    c->put_hevc_epel_bi_w[5][0][1] = ff_hevc_put_hevc_bi_w_epel_h16_8_msa;
+    c->put_hevc_epel_bi_w[6][0][1] = ff_hevc_put_hevc_bi_w_epel_h24_8_msa;
+    c->put_hevc_epel_bi_w[7][0][1] = ff_hevc_put_hevc_bi_w_epel_h32_8_msa;
+
+    c->put_hevc_epel_bi_w[1][1][0] = ff_hevc_put_hevc_bi_w_epel_v4_8_msa;
+    c->put_hevc_epel_bi_w[2][1][0] = ff_hevc_put_hevc_bi_w_epel_v6_8_msa;
+    c->put_hevc_epel_bi_w[3][1][0] = ff_hevc_put_hevc_bi_w_epel_v8_8_msa;
+    c->put_hevc_epel_bi_w[4][1][0] = ff_hevc_put_hevc_bi_w_epel_v12_8_msa;
+    c->put_hevc_epel_bi_w[5][1][0] = ff_hevc_put_hevc_bi_w_epel_v16_8_msa;
+    c->put_hevc_epel_bi_w[6][1][0] = ff_hevc_put_hevc_bi_w_epel_v24_8_msa;
+    c->put_hevc_epel_bi_w[7][1][0] = ff_hevc_put_hevc_bi_w_epel_v32_8_msa;
+
+    c->put_hevc_epel_bi_w[1][1][1] = ff_hevc_put_hevc_bi_w_epel_hv4_8_msa;
+    c->put_hevc_epel_bi_w[2][1][1] = ff_hevc_put_hevc_bi_w_epel_hv6_8_msa;
+    c->put_hevc_epel_bi_w[3][1][1] = ff_hevc_put_hevc_bi_w_epel_hv8_8_msa;
+    c->put_hevc_epel_bi_w[4][1][1] = ff_hevc_put_hevc_bi_w_epel_hv12_8_msa;
+    c->put_hevc_epel_bi_w[5][1][1] = ff_hevc_put_hevc_bi_w_epel_hv16_8_msa;
+    c->put_hevc_epel_bi_w[6][1][1] = ff_hevc_put_hevc_bi_w_epel_hv24_8_msa;
+    c->put_hevc_epel_bi_w[7][1][1] = ff_hevc_put_hevc_bi_w_epel_hv32_8_msa;
+
+    /* all five sao_band_filter slots share one routine; same for the edges */
+    c->sao_band_filter[0] = ff_hevc_sao_band_filter_0_8_msa;
+    c->sao_band_filter[1] = ff_hevc_sao_band_filter_0_8_msa;
+    c->sao_band_filter[2] = ff_hevc_sao_band_filter_0_8_msa;
+    c->sao_band_filter[3] = ff_hevc_sao_band_filter_0_8_msa;
+    c->sao_band_filter[4] = ff_hevc_sao_band_filter_0_8_msa;
+
+    c->sao_edge_filter[0] = ff_hevc_sao_edge_filter_8_msa;
+    c->sao_edge_filter[1] = ff_hevc_sao_edge_filter_8_msa;
+    c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_8_msa;
+    c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_8_msa;
+    c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_8_msa;
+
+    /* the *_c loop-filter slots get the same routines as the plain slots */
+    c->hevc_h_loop_filter_luma = ff_hevc_loop_filter_luma_h_8_msa;
+    c->hevc_v_loop_filter_luma = ff_hevc_loop_filter_luma_v_8_msa;
+
+    c->hevc_h_loop_filter_chroma = ff_hevc_loop_filter_chroma_h_8_msa;
+    c->hevc_v_loop_filter_chroma = ff_hevc_loop_filter_chroma_v_8_msa;
+
+    c->hevc_h_loop_filter_luma_c = ff_hevc_loop_filter_luma_h_8_msa;
+    c->hevc_v_loop_filter_luma_c = ff_hevc_loop_filter_luma_v_8_msa;
+
+    c->hevc_h_loop_filter_chroma_c = ff_hevc_loop_filter_chroma_h_8_msa;
+    c->hevc_v_loop_filter_chroma_c = ff_hevc_loop_filter_chroma_v_8_msa;
+
+    c->idct[0] = ff_hevc_idct_4x4_msa;
+    c->idct[1] = ff_hevc_idct_8x8_msa;
+    c->idct[2] = ff_hevc_idct_16x16_msa;
+    c->idct[3] = ff_hevc_idct_32x32_msa;
+    c->idct_dc[0] = ff_hevc_idct_dc_4x4_msa;
+    c->idct_dc[1] = ff_hevc_idct_dc_8x8_msa;
+    c->idct_dc[2] = ff_hevc_idct_dc_16x16_msa;
+    c->idct_dc[3] = ff_hevc_idct_dc_32x32_msa;
+    c->transform_add[0] = ff_hevc_addblk_4x4_msa;
+    c->transform_add[1] = ff_hevc_addblk_8x8_msa;
+    c->transform_add[2] = ff_hevc_addblk_16x16_msa;
+    c->transform_add[3] = ff_hevc_addblk_32x32_msa;
+    c->idct_4x4_luma = ff_hevc_idct_luma_4x4_msa;
+}
+#endif  /* HAVE_MSA */
+/* MIPS hook for HEVC DSP setup; it does nothing when HAVE_MSA is 0. */
+void ff_hevc_dsp_init_mips(HEVCDSPContext *c, const int bit_depth)
+{
+#if HAVE_MSA
+    hevc_dsp_init_msa(c, bit_depth);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/hevcdsp_mips.h b/libavcodec/mips/hevcdsp_mips.h
new file mode 100644
index 0000000..1573d1c
--- /dev/null
+++ b/libavcodec/mips/hevcdsp_mips.h
@@ -0,0 +1,482 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_HEVCDSP_MIPS_H
+#define AVCODEC_MIPS_HEVCDSP_MIPS_H
+
+#include "libavcodec/hevcdsp.h"
+/* MC(PEL, DIR, WIDTH) declares ff_hevc_put_hevc_<PEL>_<DIR><WIDTH>_8_msa(int16_t *dst, uint8_t *src, ...). */
+#define MC(PEL, DIR, WIDTH)                                                 \
+void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_msa(int16_t *dst,            \
+                                                     uint8_t *src,          \
+                                                     ptrdiff_t src_stride,  \
+                                                     int height,            \
+                                                     intptr_t mx,           \
+                                                     intptr_t my,           \
+                                                     int width)
+/* One prototype per (PEL, DIR, WIDTH); the qpel groups below have no width-6 entry. */
+MC(pel, pixels, 4);
+MC(pel, pixels, 6);
+MC(pel, pixels, 8);
+MC(pel, pixels, 12);
+MC(pel, pixels, 16);
+MC(pel, pixels, 24);
+MC(pel, pixels, 32);
+MC(pel, pixels, 48);
+MC(pel, pixels, 64);
+
+MC(qpel, h, 4);
+MC(qpel, h, 8);
+MC(qpel, h, 12);
+MC(qpel, h, 16);
+MC(qpel, h, 24);
+MC(qpel, h, 32);
+MC(qpel, h, 48);
+MC(qpel, h, 64);
+
+MC(qpel, v, 4);
+MC(qpel, v, 8);
+MC(qpel, v, 12);
+MC(qpel, v, 16);
+MC(qpel, v, 24);
+MC(qpel, v, 32);
+MC(qpel, v, 48);
+MC(qpel, v, 64);
+
+MC(qpel, hv, 4);
+MC(qpel, hv, 8);
+MC(qpel, hv, 12);
+MC(qpel, hv, 16);
+MC(qpel, hv, 24);
+MC(qpel, hv, 32);
+MC(qpel, hv, 48);
+MC(qpel, hv, 64);
+
+MC(epel, h, 4);
+MC(epel, h, 6);
+MC(epel, h, 8);
+MC(epel, h, 12);
+MC(epel, h, 16);
+MC(epel, h, 24);
+MC(epel, h, 32);
+MC(epel, h, 48);
+MC(epel, h, 64);
+
+MC(epel, v, 4);
+MC(epel, v, 6);
+MC(epel, v, 8);
+MC(epel, v, 12);
+MC(epel, v, 16);
+MC(epel, v, 24);
+MC(epel, v, 32);
+MC(epel, v, 48);
+MC(epel, v, 64);
+
+MC(epel, hv, 4);
+MC(epel, hv, 6);
+MC(epel, hv, 8);
+MC(epel, hv, 12);
+MC(epel, hv, 16);
+MC(epel, hv, 24);
+MC(epel, hv, 32);
+MC(epel, hv, 48);
+MC(epel, hv, 64);
+
+#undef MC
+/* UNI_MC declares ff_hevc_put_hevc_uni_<PEL>_<DIR><WIDTH>_8_msa(): uint8_t dst and src, each with a stride. */
+#define UNI_MC(PEL, DIR, WIDTH)                                                \
+void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,           \
+                                                         ptrdiff_t dst_stride, \
+                                                         uint8_t *src,         \
+                                                         ptrdiff_t src_stride, \
+                                                         int height,           \
+                                                         intptr_t mx,          \
+                                                         intptr_t my,          \
+                                                         int width)
+/* One prototype per (PEL, DIR, WIDTH); the qpel groups below have no width-6 entry. */
+UNI_MC(pel, pixels, 4);
+UNI_MC(pel, pixels, 6);
+UNI_MC(pel, pixels, 8);
+UNI_MC(pel, pixels, 12);
+UNI_MC(pel, pixels, 16);
+UNI_MC(pel, pixels, 24);
+UNI_MC(pel, pixels, 32);
+UNI_MC(pel, pixels, 48);
+UNI_MC(pel, pixels, 64);
+
+UNI_MC(qpel, h, 4);
+UNI_MC(qpel, h, 8);
+UNI_MC(qpel, h, 12);
+UNI_MC(qpel, h, 16);
+UNI_MC(qpel, h, 24);
+UNI_MC(qpel, h, 32);
+UNI_MC(qpel, h, 48);
+UNI_MC(qpel, h, 64);
+
+UNI_MC(qpel, v, 4);
+UNI_MC(qpel, v, 8);
+UNI_MC(qpel, v, 12);
+UNI_MC(qpel, v, 16);
+UNI_MC(qpel, v, 24);
+UNI_MC(qpel, v, 32);
+UNI_MC(qpel, v, 48);
+UNI_MC(qpel, v, 64);
+
+UNI_MC(qpel, hv, 4);
+UNI_MC(qpel, hv, 8);
+UNI_MC(qpel, hv, 12);
+UNI_MC(qpel, hv, 16);
+UNI_MC(qpel, hv, 24);
+UNI_MC(qpel, hv, 32);
+UNI_MC(qpel, hv, 48);
+UNI_MC(qpel, hv, 64);
+
+UNI_MC(epel, h, 4);
+UNI_MC(epel, h, 6);
+UNI_MC(epel, h, 8);
+UNI_MC(epel, h, 12);
+UNI_MC(epel, h, 16);
+UNI_MC(epel, h, 24);
+UNI_MC(epel, h, 32);
+UNI_MC(epel, h, 48);
+UNI_MC(epel, h, 64);
+
+UNI_MC(epel, v, 4);
+UNI_MC(epel, v, 6);
+UNI_MC(epel, v, 8);
+UNI_MC(epel, v, 12);
+UNI_MC(epel, v, 16);
+UNI_MC(epel, v, 24);
+UNI_MC(epel, v, 32);
+UNI_MC(epel, v, 48);
+UNI_MC(epel, v, 64);
+
+UNI_MC(epel, hv, 4);
+UNI_MC(epel, hv, 6);
+UNI_MC(epel, hv, 8);
+UNI_MC(epel, hv, 12);
+UNI_MC(epel, hv, 16);
+UNI_MC(epel, hv, 24);
+UNI_MC(epel, hv, 32);
+UNI_MC(epel, hv, 48);
+UNI_MC(epel, hv, 64);
+
+#undef UNI_MC
+/* UNI_W_MC declares ff_hevc_put_hevc_uni_w_<PEL>_<DIR><WIDTH>_8_msa(); adds denom, weight, offset arguments. */
+#define UNI_W_MC(PEL, DIR, WIDTH)                                         \
+void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,    \
+                                                           ptrdiff_t      \
+                                                           dst_stride,    \
+                                                           uint8_t *src,  \
+                                                           ptrdiff_t      \
+                                                           src_stride,    \
+                                                           int height,    \
+                                                           int denom,     \
+                                                           int weight,    \
+                                                           int offset,    \
+                                                           intptr_t mx,   \
+                                                           intptr_t my,   \
+                                                           int width)
+
+UNI_W_MC(pel, pixels, 4);
+UNI_W_MC(pel, pixels, 6);
+UNI_W_MC(pel, pixels, 8);
+UNI_W_MC(pel, pixels, 12);
+UNI_W_MC(pel, pixels, 16);
+UNI_W_MC(pel, pixels, 24);
+UNI_W_MC(pel, pixels, 32);
+UNI_W_MC(pel, pixels, 48);
+UNI_W_MC(pel, pixels, 64);
+
+UNI_W_MC(qpel, h, 4);
+UNI_W_MC(qpel, h, 8);
+UNI_W_MC(qpel, h, 12);
+UNI_W_MC(qpel, h, 16);
+UNI_W_MC(qpel, h, 24);
+UNI_W_MC(qpel, h, 32);
+UNI_W_MC(qpel, h, 48);
+UNI_W_MC(qpel, h, 64);
+
+UNI_W_MC(qpel, v, 4);
+UNI_W_MC(qpel, v, 8);
+UNI_W_MC(qpel, v, 12);
+UNI_W_MC(qpel, v, 16);
+UNI_W_MC(qpel, v, 24);
+UNI_W_MC(qpel, v, 32);
+UNI_W_MC(qpel, v, 48);
+UNI_W_MC(qpel, v, 64);
+
+UNI_W_MC(qpel, hv, 4);
+UNI_W_MC(qpel, hv, 8);
+UNI_W_MC(qpel, hv, 12);
+UNI_W_MC(qpel, hv, 16);
+UNI_W_MC(qpel, hv, 24);
+UNI_W_MC(qpel, hv, 32);
+UNI_W_MC(qpel, hv, 48);
+UNI_W_MC(qpel, hv, 64);
+
+UNI_W_MC(epel, h, 4);
+UNI_W_MC(epel, h, 6);
+UNI_W_MC(epel, h, 8);
+UNI_W_MC(epel, h, 12);
+UNI_W_MC(epel, h, 16);
+UNI_W_MC(epel, h, 24);
+UNI_W_MC(epel, h, 32);
+UNI_W_MC(epel, h, 48);
+UNI_W_MC(epel, h, 64);
+
+UNI_W_MC(epel, v, 4);
+UNI_W_MC(epel, v, 6);
+UNI_W_MC(epel, v, 8);
+UNI_W_MC(epel, v, 12);
+UNI_W_MC(epel, v, 16);
+UNI_W_MC(epel, v, 24);
+UNI_W_MC(epel, v, 32);
+UNI_W_MC(epel, v, 48);
+UNI_W_MC(epel, v, 64);
+
+UNI_W_MC(epel, hv, 4);
+UNI_W_MC(epel, hv, 6);
+UNI_W_MC(epel, hv, 8);
+UNI_W_MC(epel, hv, 12);
+UNI_W_MC(epel, hv, 16);
+UNI_W_MC(epel, hv, 24);
+UNI_W_MC(epel, hv, 32);
+UNI_W_MC(epel, hv, 48);
+UNI_W_MC(epel, hv, 64);
+
+#undef UNI_W_MC
+/* BI_MC declares ff_hevc_put_hevc_bi_<PEL>_<DIR><WIDTH>_8_msa(); adds an int16_t *src_16bit second input. */
+#define BI_MC(PEL, DIR, WIDTH)                                                 \
+void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,            \
+                                                        ptrdiff_t dst_stride,  \
+                                                        uint8_t *src,          \
+                                                        ptrdiff_t src_stride,  \
+                                                        int16_t *src_16bit,    \
+                                                        int height,            \
+                                                        intptr_t mx,           \
+                                                        intptr_t my,           \
+                                                        int width)
+
+BI_MC(pel, pixels, 4);
+BI_MC(pel, pixels, 6);
+BI_MC(pel, pixels, 8);
+BI_MC(pel, pixels, 12);
+BI_MC(pel, pixels, 16);
+BI_MC(pel, pixels, 24);
+BI_MC(pel, pixels, 32);
+BI_MC(pel, pixels, 48);
+BI_MC(pel, pixels, 64);
+
+BI_MC(qpel, h, 4);
+BI_MC(qpel, h, 8);
+BI_MC(qpel, h, 12);
+BI_MC(qpel, h, 16);
+BI_MC(qpel, h, 24);
+BI_MC(qpel, h, 32);
+BI_MC(qpel, h, 48);
+BI_MC(qpel, h, 64);
+
+BI_MC(qpel, v, 4);
+BI_MC(qpel, v, 8);
+BI_MC(qpel, v, 12);
+BI_MC(qpel, v, 16);
+BI_MC(qpel, v, 24);
+BI_MC(qpel, v, 32);
+BI_MC(qpel, v, 48);
+BI_MC(qpel, v, 64);
+
+BI_MC(qpel, hv, 4);
+BI_MC(qpel, hv, 8);
+BI_MC(qpel, hv, 12);
+BI_MC(qpel, hv, 16);
+BI_MC(qpel, hv, 24);
+BI_MC(qpel, hv, 32);
+BI_MC(qpel, hv, 48);
+BI_MC(qpel, hv, 64);
+
+BI_MC(epel, h, 4);
+BI_MC(epel, h, 6);
+BI_MC(epel, h, 8);
+BI_MC(epel, h, 12);
+BI_MC(epel, h, 16);
+BI_MC(epel, h, 24);
+BI_MC(epel, h, 32);
+BI_MC(epel, h, 48);
+BI_MC(epel, h, 64);
+
+BI_MC(epel, v, 4);
+BI_MC(epel, v, 6);
+BI_MC(epel, v, 8);
+BI_MC(epel, v, 12);
+BI_MC(epel, v, 16);
+BI_MC(epel, v, 24);
+BI_MC(epel, v, 32);
+BI_MC(epel, v, 48);
+BI_MC(epel, v, 64);
+
+BI_MC(epel, hv, 4);
+BI_MC(epel, hv, 6);
+BI_MC(epel, hv, 8);
+BI_MC(epel, hv, 12);
+BI_MC(epel, hv, 16);
+BI_MC(epel, hv, 24);
+BI_MC(epel, hv, 32);
+BI_MC(epel, hv, 48);
+BI_MC(epel, hv, 64);
+
+#undef BI_MC
+/* BI_W_MC declares ff_hevc_put_hevc_bi_w_<PEL>_<DIR><WIDTH>_8_msa(); adds src_16bit, denom, weight0/1, offset0/1. */
+#define BI_W_MC(PEL, DIR, WIDTH)                                               \
+void ff_hevc_put_hevc_bi_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,          \
+                                                          ptrdiff_t            \
+                                                          dst_stride,          \
+                                                          uint8_t *src,        \
+                                                          ptrdiff_t            \
+                                                          src_stride,          \
+                                                          int16_t *src_16bit,  \
+                                                          int height,          \
+                                                          int denom,           \
+                                                          int weight0,         \
+                                                          int weight1,         \
+                                                          int offset0,         \
+                                                          int offset1,         \
+                                                          intptr_t mx,         \
+                                                          intptr_t my,         \
+                                                          int width)
+
+BI_W_MC(pel, pixels, 4);
+BI_W_MC(pel, pixels, 6);
+BI_W_MC(pel, pixels, 8);
+BI_W_MC(pel, pixels, 12);
+BI_W_MC(pel, pixels, 16);
+BI_W_MC(pel, pixels, 24);
+BI_W_MC(pel, pixels, 32);
+BI_W_MC(pel, pixels, 48);
+BI_W_MC(pel, pixels, 64);
+
+BI_W_MC(qpel, h, 4);
+BI_W_MC(qpel, h, 8);
+BI_W_MC(qpel, h, 12);
+BI_W_MC(qpel, h, 16);
+BI_W_MC(qpel, h, 24);
+BI_W_MC(qpel, h, 32);
+BI_W_MC(qpel, h, 48);
+BI_W_MC(qpel, h, 64);
+
+BI_W_MC(qpel, v, 4);
+BI_W_MC(qpel, v, 8);
+BI_W_MC(qpel, v, 12);
+BI_W_MC(qpel, v, 16);
+BI_W_MC(qpel, v, 24);
+BI_W_MC(qpel, v, 32);
+BI_W_MC(qpel, v, 48);
+BI_W_MC(qpel, v, 64);
+
+BI_W_MC(qpel, hv, 4);
+BI_W_MC(qpel, hv, 8);
+BI_W_MC(qpel, hv, 12);
+BI_W_MC(qpel, hv, 16);
+BI_W_MC(qpel, hv, 24);
+BI_W_MC(qpel, hv, 32);
+BI_W_MC(qpel, hv, 48);
+BI_W_MC(qpel, hv, 64);
+
+BI_W_MC(epel, h, 4);
+BI_W_MC(epel, h, 6);
+BI_W_MC(epel, h, 8);
+BI_W_MC(epel, h, 12);
+BI_W_MC(epel, h, 16);
+BI_W_MC(epel, h, 24);
+BI_W_MC(epel, h, 32);
+BI_W_MC(epel, h, 48);
+BI_W_MC(epel, h, 64);
+
+BI_W_MC(epel, v, 4);
+BI_W_MC(epel, v, 6);
+BI_W_MC(epel, v, 8);
+BI_W_MC(epel, v, 12);
+BI_W_MC(epel, v, 16);
+BI_W_MC(epel, v, 24);
+BI_W_MC(epel, v, 32);
+BI_W_MC(epel, v, 48);
+BI_W_MC(epel, v, 64);
+
+BI_W_MC(epel, hv, 4);
+BI_W_MC(epel, hv, 6);
+BI_W_MC(epel, hv, 8);
+BI_W_MC(epel, hv, 12);
+BI_W_MC(epel, hv, 16);
+BI_W_MC(epel, hv, 24);
+BI_W_MC(epel, hv, 32);
+BI_W_MC(epel, hv, 48);
+BI_W_MC(epel, hv, 64);
+
+#undef BI_W_MC
+/* Prototypes for the 8-bit MSA loop-filter, SAO, IDCT and addblk routines declared in this header. */
+void ff_hevc_loop_filter_luma_h_8_msa(uint8_t *src,
+                                      ptrdiff_t src_stride,
+                                      int32_t beta, int32_t *tc,
+                                      uint8_t *no_p, uint8_t *no_q);
+
+void ff_hevc_loop_filter_luma_v_8_msa(uint8_t *src,
+                                      ptrdiff_t src_stride,
+                                      int32_t beta, int32_t *tc,
+                                      uint8_t *no_p, uint8_t *no_q);
+
+void ff_hevc_loop_filter_chroma_h_8_msa(uint8_t *src,
+                                        ptrdiff_t src_stride,
+                                        int32_t *tc, uint8_t *no_p,
+                                        uint8_t *no_q);
+
+void ff_hevc_loop_filter_chroma_v_8_msa(uint8_t *src,
+                                        ptrdiff_t src_stride,
+                                        int32_t *tc, uint8_t *no_p,
+                                        uint8_t *no_q);
+
+void ff_hevc_sao_band_filter_0_8_msa(uint8_t *dst, uint8_t *src,
+                                     ptrdiff_t stride_dst, ptrdiff_t stride_src,
+                                     int16_t *sao_offset_val, int sao_left_class,
+                                     int width, int height);
+
+void ff_hevc_sao_edge_filter_8_msa(uint8_t *dst, uint8_t *src,
+                                   ptrdiff_t stride_dst,
+                                   int16_t *sao_offset_val,
+                                   int eo, int width, int height);
+/* The idct/idct_dc/idct_luma routines take only an int16_t coefficient buffer; addblk also takes dst and stride. */
+void ff_hevc_idct_4x4_msa(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_8x8_msa(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_16x16_msa(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_32x32_msa(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_dc_4x4_msa(int16_t *coeffs);
+void ff_hevc_idct_dc_8x8_msa(int16_t *coeffs);
+void ff_hevc_idct_dc_16x16_msa(int16_t *coeffs);
+void ff_hevc_idct_dc_32x32_msa(int16_t *coeffs);
+void ff_hevc_addblk_4x4_msa(uint8_t *dst, int16_t *pi16Coeffs,
+                            ptrdiff_t stride);
+void ff_hevc_addblk_8x8_msa(uint8_t *dst, int16_t *pi16Coeffs,
+                            ptrdiff_t stride);
+void ff_hevc_addblk_16x16_msa(uint8_t *dst, int16_t *pi16Coeffs,
+                              ptrdiff_t stride);
+void ff_hevc_addblk_32x32_msa(uint8_t *dst, int16_t *pi16Coeffs,
+                              ptrdiff_t stride);
+void ff_hevc_idct_luma_4x4_msa(int16_t *pi16Coeffs);
+
+#endif  // #ifndef AVCODEC_MIPS_HEVCDSP_MIPS_H
diff --git a/libavcodec/mips/hevcdsp_msa.c b/libavcodec/mips/hevcdsp_msa.c
new file mode 100644
index 0000000..f2bc748
--- /dev/null
+++ b/libavcodec/mips/hevcdsp_msa.c
@@ -0,0 +1,3878 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "libavcodec/mips/hevcdsp_mips.h"
+#include "libavcodec/mips/hevc_macros_msa.h"
+/* 4-wide copy (per the name): interleaves src bytes with zero, shifts left by 6, stores to int16_t dst. */
+static void hevc_copy_4w_msa(uint8_t *src, int32_t src_stride,
+                             int16_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    v16i8 zero = { 0 };
+    /* Only height 2, 4 or a multiple of 8 is handled; any other height writes nothing. */
+    if (2 == height) {
+        v16i8 src0, src1;
+        v8i16 in0;
+
+        LD_SB2(src, src_stride, src0, src1);
+        /* Word-interleave the two rows into one vector, then byte-interleave with zero (presumably zero-extends). */
+        src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0);
+        in0 = (v8i16) __msa_ilvr_b(zero, src0);
+        in0 <<= 6;
+        ST8x2_UB(in0, dst, 2 * dst_stride); /* stride doubled: presumably bytes, as dst is int16_t * */
+    } else if (4 == height) {
+        v16i8 src0, src1, src2, src3;
+        v8i16 in0, in1;
+
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+
+        ILVR_W2_SB(src1, src0, src3, src2, src0, src1);
+        ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
+        in0 <<= 6;
+        in1 <<= 6;
+        ST8x4_UB(in0, in1, dst, 2 * dst_stride);
+    } else if (0 == height % 8) {
+        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+        v8i16 in0, in1, in2, in3;
+        uint32_t loop_cnt;
+        /* 8 rows per pass; src and dst each advance by 8 rows of their own stride. */
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD_SB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+
+            ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
+                       src0, src1, src2, src3);
+            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       in0, in1, in2, in3);
+            SLLI_4V(in0, in1, in2, in3, 6);
+            ST8x8_UB(in0, in1, in2, in3, dst, 2 * dst_stride);
+            dst += (8 * dst_stride);
+        }
+    }
+}
+/* 6-wide copy (per the name): runs height >> 3 passes of 8 rows; a remainder of height % 8 rows is not written. */
+static void hevc_copy_6w_msa(uint8_t *src, int32_t src_stride,
+                             int16_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 zero = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    /* Per pass: load 8 rows, interleave with zero, shift left by 6, store through ST12x8_UB. */
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   in0, in1, in2, in3);
+        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
+                   in4, in5, in6, in7);
+        SLLI_4V(in0, in1, in2, in3, 6);
+        SLLI_4V(in4, in5, in6, in7, 6);
+        ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, dst, 2 * dst_stride); /* presumably a byte stride */
+        dst += (8 * dst_stride);
+    }
+}
+/* 8-wide copy (per the name): handles height 2, 4, 6 or a multiple of 8; any other height writes nothing. */
+static void hevc_copy_8w_msa(uint8_t *src, int32_t src_stride,
+                             int16_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    v16i8 zero = { 0 };
+    /* NOTE(review): ST_SH* gets dst_stride unscaled, unlike the 2 * dst_stride given to ST8x*_UB; confirm units. */
+    if (2 == height) {
+        v16i8 src0, src1;
+        v8i16 in0, in1;
+
+        LD_SB2(src, src_stride, src0, src1);
+        /* One output vector per row: byte-interleave with zero, then shift left by 6. */
+        ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
+        in0 <<= 6;
+        in1 <<= 6;
+        ST_SH2(in0, in1, dst, dst_stride);
+    } else if (4 == height) {
+        v16i8 src0, src1, src2, src3;
+        v8i16 in0, in1, in2, in3;
+
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   in0, in1, in2, in3);
+        SLLI_4V(in0, in1, in2, in3, 6);
+        ST_SH4(in0, in1, in2, in3, dst, dst_stride);
+    } else if (6 == height) {
+        v16i8 src0, src1, src2, src3, src4, src5;
+        v8i16 in0, in1, in2, in3, in4, in5;
+
+        LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);
+
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   in0, in1, in2, in3);
+        ILVR_B2_SH(zero, src4, zero, src5, in4, in5);
+        SLLI_4V(in0, in1, in2, in3, 6);
+        in4 <<= 6;
+        in5 <<= 6;
+        ST_SH6(in0, in1, in2, in3, in4, in5, dst, dst_stride);
+    } else if (0 == height % 8) {
+        uint32_t loop_cnt;
+        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+        v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+        /* 8 rows per pass; src and dst each advance by 8 rows of their own stride. */
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD_SB8(src, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src += (8 * src_stride);
+
+            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       in0, in1, in2, in3);
+            ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
+                       in4, in5, in6, in7);
+            SLLI_4V(in0, in1, in2, in3, 6);
+            SLLI_4V(in4, in5, in6, in7, 6);
+            ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, dst, dst_stride);
+            dst += (8 * dst_stride);
+        }
+    }
+}
+/* 12-wide copy (per the name): 8 rows per pass, stored as two 4-row halves; height % 8 rows are not written. */
+static void hevc_copy_12w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 zero = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 in0, in1, in0_r, in1_r, in2_r, in3_r;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+        /* Rows 0-3: ILVR_B4_SH results go to dst via ST_SH4; the ILVL_W2_SB-derived part goes to dst + 8. */
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   in0_r, in1_r, in2_r, in3_r);
+        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
+        ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
+        in0 <<= 6;
+        in1 <<= 6;
+        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
+        ST8x4_UB(in0, in1, dst + 8, 2 * dst_stride);
+        dst += (4 * dst_stride);
+        /* Rows 4-7: same steps; src0/src1 are reused to hold the ILVL_W2_SB results. */
+        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
+                   in0_r, in1_r, in2_r, in3_r);
+        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+        ILVL_W2_SB(src5, src4, src7, src6, src0, src1);
+        ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
+        in0 <<= 6;
+        in1 <<= 6;
+        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
+        ST8x4_UB(in0, in1, dst + 8, 2 * dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+/* Strip copy: walks width >> 4 strips of 16 columns, each in height >> 3 groups of 8 rows; remainders are skipped. */
+static void hevc_copy_16multx8mult_msa(uint8_t *src,
+                                       int32_t src_stride,
+                                       int16_t *dst,
+                                       int32_t dst_stride,
+                                       int32_t height,
+                                       int32_t width)
+{
+    uint8_t *src_tmp;
+    int16_t *dst_tmp;
+    uint32_t loop_cnt, cnt;
+    v16i8 zero = { 0 };
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 in0_r, in1_r, in2_r, in3_r;
+    v8i16 in0_l, in1_l, in2_l, in3_l;
+    /* Outer loop: one 16-column strip per pass; src and dst move right by 16 elements afterwards. */
+    for (cnt = (width >> 4); cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+        /* Inner loop: 8 rows per pass, stored as two 4-row batches (ILVR results at +0, ILVL results at +8). */
+        for (loop_cnt = (height >> 3); loop_cnt--;) {
+            LD_SB8(src_tmp, src_stride,
+                   src0, src1, src2, src3, src4, src5, src6, src7);
+            src_tmp += (8 * src_stride);
+
+            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       in0_r, in1_r, in2_r, in3_r);
+            ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                       in0_l, in1_l, in2_l, in3_l);
+            SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+            SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+            ST_SH4(in0_r, in1_r, in2_r, in3_r, dst_tmp, dst_stride);
+            ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst_tmp + 8), dst_stride);
+            dst_tmp += (4 * dst_stride);
+
+            ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
+                       in0_r, in1_r, in2_r, in3_r);
+            ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
+                       in0_l, in1_l, in2_l, in3_l);
+            SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+            SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+            ST_SH4(in0_r, in1_r, in2_r, in3_r, dst_tmp, dst_stride);
+            ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst_tmp + 8), dst_stride);
+            dst_tmp += (4 * dst_stride);
+        }
+
+        src += 16;
+        dst += 16;
+    }
+}
+/* 16-wide copy: unrolled paths for height 4 and 12, the strip routine for multiples of 8; else writes nothing. */
+static void hevc_copy_16w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              int32_t height)
+{
+    v16i8 zero = { 0 };
+    /* In every path ILVR_* results are stored at dst and ILVL_* results at dst + 8, both shifted left by 6. */
+    if (4 == height) {
+        v16i8 src0, src1, src2, src3;
+        v8i16 in0_r, in1_r, in2_r, in3_r;
+        v8i16 in0_l, in1_l, in2_l, in3_l;
+
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   in0_r, in1_r, in2_r, in3_r);
+        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   in0_l, in1_l, in2_l, in3_l);
+        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
+        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
+    } else if (12 == height) {
+        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+        v16i8 src8, src9, src10, src11;
+        v8i16 in0_r, in1_r, in2_r, in3_r;
+        v8i16 in0_l, in1_l, in2_l, in3_l;
+
+        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+        LD_SB4(src, src_stride, src8, src9, src10, src11);
+        /* All 12 rows are loaded up front, then written as three batches of 4 rows. */
+        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   in0_r, in1_r, in2_r, in3_r);
+        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
+                   in0_l, in1_l, in2_l, in3_l);
+        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
+        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
+        dst += (4 * dst_stride);
+
+        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
+                   in0_r, in1_r, in2_r, in3_r);
+        ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
+                   in0_l, in1_l, in2_l, in3_l);
+        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
+        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
+        dst += (4 * dst_stride);
+
+        ILVR_B4_SH(zero, src8, zero, src9, zero, src10, zero, src11,
+                   in0_r, in1_r, in2_r, in3_r);
+        ILVL_B4_SH(zero, src8, zero, src9, zero, src10, zero, src11,
+                   in0_l, in1_l, in2_l, in3_l);
+        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
+        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
+        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
+        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
+    } else if (0 == (height % 8)) {
+        hevc_copy_16multx8mult_msa(src, src_stride, dst, dst_stride,
+                                   height, 16);
+    }
+}
+/* 24-wide copy: columns 0-15 via hevc_copy_16multx8mult_msa(), columns 16-23 via hevc_copy_8w_msa(). */
+static void hevc_copy_24w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              int32_t height)
+{
+    hevc_copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+    hevc_copy_8w_msa(src + 16, src_stride, dst + 16, dst_stride, height);
+}
+/* 32-wide copy: delegates to hevc_copy_16multx8mult_msa() with width 32. */
+static void hevc_copy_32w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              int32_t height)
+{
+    hevc_copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
+}
+/* 48-wide copy: delegates to hevc_copy_16multx8mult_msa() with width 48. */
+static void hevc_copy_48w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              int32_t height)
+{
+    hevc_copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 48);
+}
+/* 64-wide copy: delegates to hevc_copy_16multx8mult_msa() with width 64. */
+static void hevc_copy_64w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              int32_t height)
+{
+    hevc_copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
+}
+/* Horizontal filter, 4 wide (per the name): 8 rows per pass into int16_t dst; height % 8 rows are not written. */
+static void hevc_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+    /* Reads start 3 bytes left of src; const_vec (128 << 6) seeds every accumulator below. */
+    src -= 3;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    /* filt0..filt3 each replicate one 16-bit lane of the filter load, i.e. two int8_t entries of filter. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* mask1..mask3 are mask0 with every index advanced by 2, 4 and 6. */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    /* Rows are consumed in pairs (src0/src1, src2/src3, ...), producing one result vector per pair. */
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); /* presumably paired with the const_vec seed */
+
+        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        ST8x8_UB(dst0, dst1, dst2, dst3, dst, 2 * dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+/* Horizontal filter, 8 wide (per the name): 4 rows per pass, one result vector per row; height % 4 rows skipped. */
+static void hevc_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v8i16 filt0, filt1, filt2, filt3;
+    v16i8 mask1, mask2, mask3;
+    v16i8 vec0, vec1, vec2, vec3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    /* Reads start 3 bytes left of src; const_vec (128 << 6) seeds every accumulator below. */
+    src -= 3;
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    /* filt0..filt3 each replicate one 16-bit lane of the filter load, i.e. two int8_t entries of filter. */
+    filter_vec = LD_SH(filter);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* mask1..mask3 are mask0 with every index advanced by 2, 4 and 6. */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    /* Each row is shuffled against itself (same vector passed twice to VSHF_B4_SB). */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3); /* presumably paired with the const_vec seed */
+
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+/* 12-column horizontal 8-tap: 8-wide pass, then a 4-wide pass at column 8. */
+static void hevc_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
+        int16_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
+{
+    hevc_hz_8t_8w_msa(src, src_stride, dst, dst_stride, filter, height);
+    hevc_hz_8t_4w_msa(&src[8], src_stride, &dst[8], dst_stride, filter, height);
+}
+
+/* Horizontal 8-tap filter, 16 columns wide, four rows per iteration.
+ * Each row is handled as two 8-column halves loaded from src and src + 8
+ * (after src is moved 3 bytes left) and stored to dst and dst + 8. */
+static void hevc_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    uint32_t iter;
+    v16i8 lft0, rgt0, lft1, rgt1, lft2, rgt2, lft3, rgt3;
+    v16i8 pair0, pair1, pair2, pair3;
+    v16i8 sel1, sel2, sel3;
+    v8i16 coef0, coef1, coef2, coef3, coef_vec, offset_vec;
+    v8i16 out_l0, out_r0, out_l1, out_r1, out_l2, out_r2, out_l3, out_r3;
+    v16i8 sel0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src -= 3;
+    offset_vec = __msa_ldi_h(128) << 6; /* seed for every accumulator */
+    coef_vec = LD_SH(filter);
+    SPLATI_H4_SH(coef_vec, 0, 1, 2, 3, coef0, coef1, coef2, coef3);
+
+    sel1 = sel0 + 2;
+    sel2 = sel0 + 4;
+    sel3 = sel0 + 6;
+
+    for (iter = height >> 2; iter--; ) {
+        LD_SB4(src, src_stride, lft0, lft1, lft2, lft3);
+        LD_SB4(src + 8, src_stride, rgt0, rgt1, rgt2, rgt3);
+        src += 4 * src_stride;
+        XORI_B8_128_SB(lft0, rgt0, lft1, rgt1, lft2, rgt2, lft3, rgt3);
+
+        VSHF_B4_SB(lft0, lft0, sel0, sel1, sel2, sel3,
+                   pair0, pair1, pair2, pair3);
+        out_l0 = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     out_l0, out_l0, out_l0, out_l0);
+        VSHF_B4_SB(rgt0, rgt0, sel0, sel1, sel2, sel3,
+                   pair0, pair1, pair2, pair3);
+        out_r0 = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     out_r0, out_r0, out_r0, out_r0);
+        VSHF_B4_SB(lft1, lft1, sel0, sel1, sel2, sel3,
+                   pair0, pair1, pair2, pair3);
+        out_l1 = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     out_l1, out_l1, out_l1, out_l1);
+        VSHF_B4_SB(rgt1, rgt1, sel0, sel1, sel2, sel3,
+                   pair0, pair1, pair2, pair3);
+        out_r1 = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     out_r1, out_r1, out_r1, out_r1);
+        VSHF_B4_SB(lft2, lft2, sel0, sel1, sel2, sel3,
+                   pair0, pair1, pair2, pair3);
+        out_l2 = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     out_l2, out_l2, out_l2, out_l2);
+        VSHF_B4_SB(rgt2, rgt2, sel0, sel1, sel2, sel3,
+                   pair0, pair1, pair2, pair3);
+        out_r2 = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     out_r2, out_r2, out_r2, out_r2);
+        VSHF_B4_SB(lft3, lft3, sel0, sel1, sel2, sel3,
+                   pair0, pair1, pair2, pair3);
+        out_l3 = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     out_l3, out_l3, out_l3, out_l3);
+        VSHF_B4_SB(rgt3, rgt3, sel0, sel1, sel2, sel3,
+                   pair0, pair1, pair2, pair3);
+        out_r3 = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     out_r3, out_r3, out_r3, out_r3);
+
+        ST_SH4(out_l0, out_l1, out_l2, out_l3, dst, dst_stride);
+        ST_SH4(out_r0, out_r1, out_r2, out_r3, dst + 8, dst_stride);
+        dst += 4 * dst_stride;
+    }
+}
+
+/* Horizontal 8-tap filter, 24 columns wide, two rows per iteration.
+ * Per row, two 16-byte loads (at src - 3 and 16 bytes further) feed three
+ * 8-column results; the middle one shuffles across both loads. */
+static void hevc_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    uint32_t iter;
+    v16i8 a_lo, a_hi, b_lo, b_hi;
+    v16i8 pair0, pair1, pair2, pair3;
+    v16i8 sel1, sel2, sel3, xsel0, xsel1, xsel2, xsel3;
+    v8i16 coef0, coef1, coef2, coef3, coef_vec, offset_vec;
+    v8i16 a_out0, a_out8, a_out16, b_out0, b_out8, b_out16;
+    v16i8 sel0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src -= 3;
+    offset_vec = __msa_ldi_h(128) << 6; /* seed for every accumulator */
+    coef_vec = LD_SH(filter);
+    SPLATI_H4_SH(coef_vec, 0, 1, 2, 3, coef0, coef1, coef2, coef3);
+
+    sel1 = sel0 + 2;
+    sel2 = sel0 + 4;
+    sel3 = sel0 + 6;
+    xsel0 = sel0 + 8;
+    xsel1 = sel0 + 10;
+    xsel2 = sel0 + 12;
+    xsel3 = sel0 + 14;
+
+    for (iter = height >> 1; iter--; ) {
+        LD_SB2(src, 16, a_lo, a_hi);
+        src += src_stride;
+        LD_SB2(src, 16, b_lo, b_hi);
+        src += src_stride;
+        XORI_B4_128_SB(a_lo, a_hi, b_lo, b_hi);
+
+        VSHF_B4_SB(a_lo, a_lo, sel0, sel1, sel2, sel3,
+                   pair0, pair1, pair2, pair3);
+        a_out0 = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     a_out0, a_out0, a_out0, a_out0);
+        VSHF_B4_SB(a_lo, a_hi, xsel0, xsel1, xsel2, xsel3,
+                   pair0, pair1, pair2, pair3);
+        a_out8 = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     a_out8, a_out8, a_out8, a_out8);
+        VSHF_B4_SB(a_hi, a_hi, sel0, sel1, sel2, sel3,
+                   pair0, pair1, pair2, pair3);
+        a_out16 = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     a_out16, a_out16, a_out16, a_out16);
+        VSHF_B4_SB(b_lo, b_lo, sel0, sel1, sel2, sel3,
+                   pair0, pair1, pair2, pair3);
+        b_out0 = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     b_out0, b_out0, b_out0, b_out0);
+        VSHF_B4_SB(b_lo, b_hi, xsel0, xsel1, xsel2, xsel3,
+                   pair0, pair1, pair2, pair3);
+        b_out8 = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     b_out8, b_out8, b_out8, b_out8);
+        VSHF_B4_SB(b_hi, b_hi, sel0, sel1, sel2, sel3,
+                   pair0, pair1, pair2, pair3);
+        b_out16 = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     b_out16, b_out16, b_out16, b_out16);
+
+        ST_SH2(a_out0, a_out8, dst, 8);
+        ST_SH(a_out16, dst + 16);
+        dst += dst_stride;
+        ST_SH2(b_out0, b_out8, dst, 8);
+        ST_SH(b_out16, dst + 16);
+        dst += dst_stride;
+    }
+}
+
+/* Horizontal 8-tap filter, 32 columns wide, one row per iteration.
+ * inN is the 16-byte load at byte offset N from (src - 3); outN is the
+ * N-th column group handed to ST_SH4, which uses a pitch of 8 elements. */
+static void hevc_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    uint32_t iter;
+    v16i8 in0, in16, in24;
+    v16i8 pair0, pair1, pair2, pair3;
+    v16i8 sel1, sel2, sel3, xsel0, xsel1, xsel2, xsel3;
+    v8i16 coef0, coef1, coef2, coef3, coef_vec, offset_vec;
+    v8i16 out0, out8, out16, out24;
+    v16i8 sel0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src -= 3;
+    offset_vec = __msa_ldi_h(128) << 6; /* seed for every accumulator */
+    coef_vec = LD_SH(filter);
+    SPLATI_H4_SH(coef_vec, 0, 1, 2, 3, coef0, coef1, coef2, coef3);
+
+    sel1 = sel0 + 2;
+    sel2 = sel0 + 4;
+    sel3 = sel0 + 6;
+    xsel0 = sel0 + 8;
+    xsel1 = sel0 + 10;
+    xsel2 = sel0 + 12;
+    xsel3 = sel0 + 14;
+
+    for (iter = height; iter--; ) {
+        LD_SB2(src, 16, in0, in16);
+        in24 = LD_SB(src + 24);
+        src += src_stride;
+        XORI_B3_128_SB(in0, in16, in24);
+
+        VSHF_B4_SB(in0, in0, sel0, sel1, sel2, sel3,
+                   pair0, pair1, pair2, pair3);
+        out0 = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     out0, out0, out0, out0);
+        VSHF_B4_SB(in0, in16, xsel0, xsel1, xsel2, xsel3,
+                   pair0, pair1, pair2, pair3);
+        out8 = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     out8, out8, out8, out8);
+        VSHF_B4_SB(in16, in16, sel0, sel1, sel2, sel3,
+                   pair0, pair1, pair2, pair3);
+        out16 = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     out16, out16, out16, out16);
+        VSHF_B4_SB(in24, in24, sel0, sel1, sel2, sel3,
+                   pair0, pair1, pair2, pair3);
+        out24 = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     out24, out24, out24, out24);
+
+        ST_SH4(out0, out8, out16, out24, dst, 8);
+        dst += dst_stride;
+    }
+}
+
+/* Horizontal 8-tap filter, 48 columns wide, one row per iteration.
+ * inN is the 16-byte load at byte offset N from (src - 3); outN is the
+ * N-th column group handed to ST_SH6, which uses a pitch of 8 elements. */
+static void hevc_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    uint32_t iter;
+    v16i8 in0, in16, in32, in40;
+    v16i8 pair0, pair1, pair2, pair3;
+    v16i8 sel1, sel2, sel3, xsel0, xsel1, xsel2, xsel3;
+    v8i16 coef0, coef1, coef2, coef3, coef_vec, offset_vec;
+    v8i16 out0, out8, out16, out24, out32, out40;
+    v16i8 sel0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src -= 3;
+    offset_vec = __msa_ldi_h(128) << 6; /* seed for every accumulator */
+    coef_vec = LD_SH(filter);
+    SPLATI_H4_SH(coef_vec, 0, 1, 2, 3, coef0, coef1, coef2, coef3);
+
+    sel1 = sel0 + 2;
+    sel2 = sel0 + 4;
+    sel3 = sel0 + 6;
+    xsel0 = sel0 + 8;
+    xsel1 = sel0 + 10;
+    xsel2 = sel0 + 12;
+    xsel3 = sel0 + 14;
+
+    for (iter = height; iter--; ) {
+        LD_SB3(src, 16, in0, in16, in32);
+        in40 = LD_SB(src + 40);
+        src += src_stride;
+        XORI_B4_128_SB(in0, in16, in32, in40);
+
+        VSHF_B4_SB(in0, in0, sel0, sel1, sel2, sel3,
+                   pair0, pair1, pair2, pair3);
+        out0 = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     out0, out0, out0, out0);
+        VSHF_B4_SB(in0, in16, xsel0, xsel1, xsel2, xsel3,
+                   pair0, pair1, pair2, pair3);
+        out8 = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     out8, out8, out8, out8);
+        VSHF_B4_SB(in16, in16, sel0, sel1, sel2, sel3,
+                   pair0, pair1, pair2, pair3);
+        out16 = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     out16, out16, out16, out16);
+        VSHF_B4_SB(in16, in32, xsel0, xsel1, xsel2, xsel3,
+                   pair0, pair1, pair2, pair3);
+        out24 = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     out24, out24, out24, out24);
+        VSHF_B4_SB(in32, in32, sel0, sel1, sel2, sel3,
+                   pair0, pair1, pair2, pair3);
+        out32 = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     out32, out32, out32, out32);
+        VSHF_B4_SB(in40, in40, sel0, sel1, sel2, sel3,
+                   pair0, pair1, pair2, pair3);
+        out40 = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     out40, out40, out40, out40);
+
+        ST_SH6(out0, out8, out16, out24, out32, out40, dst, 8);
+        dst += dst_stride;
+    }
+}
+
+/* Horizontal 8-tap filter, 64 columns wide, one row per iteration.
+ * inN is the 16-byte load at byte offset N from (src - 3). Each group of
+ * eight results is stored as soon as it is computed, so one accumulator
+ * is reused for all eight groups. */
+static void hevc_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    uint32_t iter;
+    v16i8 in0, in16, in32, in48, in56;
+    v16i8 pair0, pair1, pair2, pair3;
+    v16i8 sel1, sel2, sel3, xsel0, xsel1, xsel2, xsel3;
+    v8i16 coef0, coef1, coef2, coef3, coef_vec, offset_vec;
+    v8i16 acc;
+    v16i8 sel0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    src -= 3;
+    offset_vec = __msa_ldi_h(128) << 6; /* seed for every accumulator */
+    coef_vec = LD_SH(filter);
+    SPLATI_H4_SH(coef_vec, 0, 1, 2, 3, coef0, coef1, coef2, coef3);
+
+    sel1 = sel0 + 2;
+    sel2 = sel0 + 4;
+    sel3 = sel0 + 6;
+    xsel0 = sel0 + 8;
+    xsel1 = sel0 + 10;
+    xsel2 = sel0 + 12;
+    xsel3 = sel0 + 14;
+
+    for (iter = height; iter--; ) {
+        LD_SB4(src, 16, in0, in16, in32, in48);
+        in56 = LD_SB(src + 56);
+        src += src_stride;
+        XORI_B5_128_SB(in0, in16, in32, in48, in56);
+
+        VSHF_B4_SB(in0, in0, sel0, sel1, sel2, sel3,
+                   pair0, pair1, pair2, pair3);
+        acc = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     acc, acc, acc, acc);
+        ST_SH(acc, dst);
+
+        VSHF_B4_SB(in0, in16, xsel0, xsel1, xsel2, xsel3,
+                   pair0, pair1, pair2, pair3);
+        acc = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     acc, acc, acc, acc);
+        ST_SH(acc, dst + 8);
+
+        VSHF_B4_SB(in16, in16, sel0, sel1, sel2, sel3,
+                   pair0, pair1, pair2, pair3);
+        acc = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     acc, acc, acc, acc);
+        ST_SH(acc, dst + 16);
+
+        VSHF_B4_SB(in16, in32, xsel0, xsel1, xsel2, xsel3,
+                   pair0, pair1, pair2, pair3);
+        acc = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     acc, acc, acc, acc);
+        ST_SH(acc, dst + 24);
+
+        VSHF_B4_SB(in32, in32, sel0, sel1, sel2, sel3,
+                   pair0, pair1, pair2, pair3);
+        acc = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     acc, acc, acc, acc);
+        ST_SH(acc, dst + 32);
+
+        VSHF_B4_SB(in32, in48, xsel0, xsel1, xsel2, xsel3,
+                   pair0, pair1, pair2, pair3);
+        acc = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     acc, acc, acc, acc);
+        ST_SH(acc, dst + 40);
+
+        VSHF_B4_SB(in48, in48, sel0, sel1, sel2, sel3,
+                   pair0, pair1, pair2, pair3);
+        acc = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     acc, acc, acc, acc);
+        ST_SH(acc, dst + 48);
+
+        VSHF_B4_SB(in56, in56, sel0, sel1, sel2, sel3,
+                   pair0, pair1, pair2, pair3);
+        acc = offset_vec;
+        DPADD_SB4_SH(pair0, pair1, pair2, pair3, coef0, coef1, coef2, coef3,
+                     acc, acc, acc, acc);
+        ST_SH(acc, dst + 56);
+        dst += dst_stride;
+    }
+}
+
+/* Vertical 8-tap filter for a 4-column block, eight output rows per
+ * iteration (height >> 3 iterations). Reading starts 3 rows above src.
+ * A name such as row2110 lists the source rows packed into that vector. */
+static void hevc_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter, int32_t height)
+{
+    int32_t iter;
+    v16i8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    v16i8 row9, row10, row11, row12, row13, row14;
+    v16i8 row10_r, row32_r, row54_r, row76_r, row98_r;
+    v16i8 row21_r, row43_r, row65_r, row87_r, row109_r;
+    v16i8 row1110_r, row1211_r, row1312_r, row1413_r;
+    v16i8 row2110, row4332, row6554, row8776, row10998;
+    v16i8 row12111110, row14131312;
+    v8i16 out10, out32, out54, out76;
+    v8i16 coef0, coef1, coef2, coef3, coef_vec, offset_vec;
+
+    src -= 3 * src_stride;
+    offset_vec = __msa_ldi_h(128) << 6; /* seed for every accumulator */
+
+    coef_vec = LD_SH(filter);
+    SPLATI_H4_SH(coef_vec, 0, 1, 2, 3, coef0, coef1, coef2, coef3);
+
+    LD_SB7(src, src_stride, row0, row1, row2, row3, row4, row5, row6);
+    src += 7 * src_stride;
+    ILVR_B4_SB(row1, row0, row3, row2, row5, row4, row2, row1,
+               row10_r, row32_r, row54_r, row21_r);
+    ILVR_B2_SB(row4, row3, row6, row5, row43_r, row65_r);
+    ILVR_D3_SB(row21_r, row10_r, row43_r, row32_r, row65_r, row54_r,
+               row2110, row4332, row6554);
+    XORI_B3_128_SB(row2110, row4332, row6554);
+
+    for (iter = height >> 3; iter--; ) {
+        LD_SB8(src, src_stride,
+               row7, row8, row9, row10, row11, row12, row13, row14);
+        src += 8 * src_stride;
+
+        ILVR_B4_SB(row7, row6, row8, row7, row9, row8, row10, row9,
+                   row76_r, row87_r, row98_r, row109_r);
+        ILVR_B4_SB(row11, row10, row12, row11, row13, row12, row14, row13,
+                   row1110_r, row1211_r, row1312_r, row1413_r);
+        ILVR_D4_SB(row87_r, row76_r, row109_r, row98_r,
+                   row1211_r, row1110_r, row1413_r, row1312_r,
+                   row8776, row10998, row12111110, row14131312);
+        XORI_B4_128_SB(row8776, row10998, row12111110, row14131312);
+
+        out10 = offset_vec;
+        DPADD_SB4_SH(row2110, row4332, row6554, row8776,
+                     coef0, coef1, coef2, coef3, out10, out10, out10, out10);
+        out32 = offset_vec;
+        DPADD_SB4_SH(row4332, row6554, row8776, row10998,
+                     coef0, coef1, coef2, coef3, out32, out32, out32, out32);
+        out54 = offset_vec;
+        DPADD_SB4_SH(row6554, row8776, row10998, row12111110,
+                     coef0, coef1, coef2, coef3, out54, out54, out54, out54);
+        out76 = offset_vec;
+        DPADD_SB4_SH(row8776, row10998, row12111110, row14131312,
+                     coef0, coef1, coef2, coef3, out76, out76, out76, out76);
+
+        ST8x8_UB(out10, out32, out54, out76, dst, 2 * dst_stride);
+        dst += 8 * dst_stride;
+
+        row2110 = row10998;
+        row4332 = row12111110;
+        row6554 = row14131312;
+        row6 = row14;
+    }
+}
+
+/* Vertical 8-tap filter for an 8-column block, four output rows per
+ * iteration (height >> 2 iterations). Reading starts 3 rows above src.
+ * rowAB_r is the ILVR_B interleave of rows A and B; dst rows are
+ * dst_stride int16_t elements apart. */
+static void hevc_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter, int32_t height)
+{
+    int32_t iter;
+    v16i8 row0, row1, row2, row3, row4, row5, row6, row7, row8, row9, row10;
+    v16i8 row10_r, row32_r, row54_r, row76_r, row98_r;
+    v16i8 row21_r, row43_r, row65_r, row87_r, row109_r;
+    v8i16 out0_r, out1_r, out2_r, out3_r;
+    v8i16 coef0, coef1, coef2, coef3, coef_vec, offset_vec;
+
+    src -= 3 * src_stride;
+    offset_vec = __msa_ldi_h(128) << 6; /* seed for every accumulator */
+
+    coef_vec = LD_SH(filter);
+    SPLATI_H4_SH(coef_vec, 0, 1, 2, 3, coef0, coef1, coef2, coef3);
+
+    /* seven rows of history are loaded once, ahead of the loop */
+    LD_SB7(src, src_stride, row0, row1, row2, row3, row4, row5, row6);
+    src += 7 * src_stride;
+    XORI_B7_128_SB(row0, row1, row2, row3, row4, row5, row6);
+    ILVR_B4_SB(row1, row0, row3, row2, row5, row4, row2, row1,
+               row10_r, row32_r, row54_r, row21_r);
+    ILVR_B2_SB(row4, row3, row6, row5, row43_r, row65_r);
+
+    for (iter = height >> 2; iter--; ) {
+        LD_SB4(src, src_stride, row7, row8, row9, row10);
+        src += 4 * src_stride;
+        XORI_B4_128_SB(row7, row8, row9, row10);
+        ILVR_B4_SB(row7, row6, row8, row7, row9, row8, row10, row9,
+                   row76_r, row87_r, row98_r, row109_r);
+
+        out0_r = offset_vec;
+        DPADD_SB4_SH(row10_r, row32_r, row54_r, row76_r,
+                     coef0, coef1, coef2, coef3, out0_r, out0_r, out0_r, out0_r);
+        out1_r = offset_vec;
+        DPADD_SB4_SH(row21_r, row43_r, row65_r, row87_r,
+                     coef0, coef1, coef2, coef3, out1_r, out1_r, out1_r, out1_r);
+        out2_r = offset_vec;
+        DPADD_SB4_SH(row32_r, row54_r, row76_r, row98_r,
+                     coef0, coef1, coef2, coef3, out2_r, out2_r, out2_r, out2_r);
+        out3_r = offset_vec;
+        DPADD_SB4_SH(row43_r, row65_r, row87_r, row109_r,
+                     coef0, coef1, coef2, coef3, out3_r, out3_r, out3_r, out3_r);
+
+        ST_SH4(out0_r, out1_r, out2_r, out3_r, dst, dst_stride);
+        dst += 4 * dst_stride;
+
+        /* the newest interleaves become the history of the next pass */
+        row10_r = row54_r;
+        row32_r = row76_r;
+        row54_r = row98_r;
+        row21_r = row65_r;
+        row43_r = row87_r;
+        row65_r = row109_r;
+        row6 = row10;
+    }
+}
+
+/* Vertical 8-tap filter for a 12-column block, four output rows per
+ * iteration (height >> 2 iterations). Reading starts 3 rows above src.
+ * Results built from the ILVR_B (_r) interleaves are stored at dst; those
+ * built from the ILVL_B (_l) interleaves, paired by ILVR_D, go to dst + 8. */
+static void hevc_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    int32_t iter;
+    v16i8 row0, row1, row2, row3, row4, row5, row6, row7, row8, row9, row10;
+    v16i8 row10_r, row32_r, row54_r, row76_r, row98_r;
+    v16i8 row21_r, row43_r, row65_r, row87_r, row109_r;
+    v8i16 out0_r, out1_r, out2_r, out3_r;
+    v16i8 row10_l, row32_l, row54_l, row76_l, row98_l;
+    v16i8 row21_l, row43_l, row65_l, row87_l, row109_l;
+    v16i8 row2110, row4332, row6554, row8776, row10998;
+    v8i16 out0_l, out1_l;
+    v8i16 coef0, coef1, coef2, coef3, coef_vec, offset_vec;
+
+    src -= 3 * src_stride;
+    offset_vec = __msa_ldi_h(128) << 6; /* seed for every accumulator */
+
+    coef_vec = LD_SH(filter);
+    SPLATI_H4_SH(coef_vec, 0, 1, 2, 3, coef0, coef1, coef2, coef3);
+
+    /* seven rows of history are loaded once, ahead of the loop */
+    LD_SB7(src, src_stride, row0, row1, row2, row3, row4, row5, row6);
+    src += 7 * src_stride;
+    XORI_B7_128_SB(row0, row1, row2, row3, row4, row5, row6);
+    ILVR_B4_SB(row1, row0, row3, row2, row5, row4, row2, row1,
+               row10_r, row32_r, row54_r, row21_r);
+    ILVR_B2_SB(row4, row3, row6, row5, row43_r, row65_r);
+    ILVL_B4_SB(row1, row0, row3, row2, row5, row4, row2, row1,
+               row10_l, row32_l, row54_l, row21_l);
+    ILVL_B2_SB(row4, row3, row6, row5, row43_l, row65_l);
+    ILVR_D3_SB(row21_l, row10_l, row43_l, row32_l, row65_l, row54_l,
+               row2110, row4332, row6554);
+
+    for (iter = height >> 2; iter--; ) {
+        LD_SB4(src, src_stride, row7, row8, row9, row10);
+        src += 4 * src_stride;
+        XORI_B4_128_SB(row7, row8, row9, row10);
+        ILVR_B4_SB(row7, row6, row8, row7, row9, row8, row10, row9,
+                   row76_r, row87_r, row98_r, row109_r);
+        ILVL_B4_SB(row7, row6, row8, row7, row9, row8, row10, row9,
+                   row76_l, row87_l, row98_l, row109_l);
+        ILVR_D2_SB(row87_l, row76_l, row109_l, row98_l, row8776, row10998);
+
+        /* results stored at dst */
+        out0_r = offset_vec;
+        DPADD_SB4_SH(row10_r, row32_r, row54_r, row76_r,
+                     coef0, coef1, coef2, coef3, out0_r, out0_r, out0_r, out0_r);
+        out1_r = offset_vec;
+        DPADD_SB4_SH(row21_r, row43_r, row65_r, row87_r,
+                     coef0, coef1, coef2, coef3, out1_r, out1_r, out1_r, out1_r);
+        out2_r = offset_vec;
+        DPADD_SB4_SH(row32_r, row54_r, row76_r, row98_r,
+                     coef0, coef1, coef2, coef3, out2_r, out2_r, out2_r, out2_r);
+        out3_r = offset_vec;
+        DPADD_SB4_SH(row43_r, row65_r, row87_r, row109_r,
+                     coef0, coef1, coef2, coef3, out3_r, out3_r, out3_r, out3_r);
+        /* results stored at dst + 8 */
+        out0_l = offset_vec;
+        DPADD_SB4_SH(row2110, row4332, row6554, row8776,
+                     coef0, coef1, coef2, coef3, out0_l, out0_l, out0_l, out0_l);
+        out1_l = offset_vec;
+        DPADD_SB4_SH(row4332, row6554, row8776, row10998,
+                     coef0, coef1, coef2, coef3, out1_l, out1_l, out1_l, out1_l);
+
+        ST_SH4(out0_r, out1_r, out2_r, out3_r, dst, dst_stride);
+        ST8x4_UB(out0_l, out1_l, dst + 8, 2 * dst_stride);
+        dst += 4 * dst_stride;
+
+        /* the newest interleaves become the history of the next pass */
+        row10_r = row54_r;
+        row32_r = row76_r;
+        row54_r = row98_r;
+        row21_r = row65_r;
+        row43_r = row87_r;
+        row65_r = row109_r;
+        row2110 = row6554;
+        row4332 = row8776;
+        row6554 = row10998;
+        row6 = row10;
+    }
+}
+
+/* Vertical 8-tap filter walked in 16-column strips: width >> 4 strips,
+ * each running height >> 2 inner iterations of four output rows
+ * (remainders of either dimension are not processed).
+ * Reading starts 3 rows above src; _r/_l are the ILVR_B/ILVL_B halves. */
+static void hevc_vt_8t_16multx4mult_msa(uint8_t *src, int32_t src_stride,
+                                        int16_t *dst, int32_t dst_stride,
+                                        const int8_t *filter,
+                                        int32_t height, int32_t width)
+{
+    uint8_t *src_col;
+    int16_t *dst_col;
+    int32_t iter, strip;
+    v16i8 row0, row1, row2, row3, row4, row5, row6, row7, row8, row9, row10;
+    v16i8 row10_r, row32_r, row54_r, row76_r, row98_r;
+    v16i8 row21_r, row43_r, row65_r, row87_r, row109_r;
+    v8i16 out0_r, out1_r, out2_r, out3_r;
+    v16i8 row10_l, row32_l, row54_l, row76_l, row98_l;
+    v16i8 row21_l, row43_l, row65_l, row87_l, row109_l;
+    v8i16 out0_l, out1_l, out2_l, out3_l;
+    v8i16 coef0, coef1, coef2, coef3, coef_vec, offset_vec;
+
+    src -= 3 * src_stride;
+    offset_vec = __msa_ldi_h(128) << 6; /* seed for every accumulator */
+
+    coef_vec = LD_SH(filter);
+    SPLATI_H4_SH(coef_vec, 0, 1, 2, 3, coef0, coef1, coef2, coef3);
+
+    for (strip = width >> 4; strip--; ) {
+        src_col = src;
+        dst_col = dst;
+
+        LD_SB7(src_col, src_stride, row0, row1, row2, row3, row4, row5, row6);
+        src_col += 7 * src_stride;
+        XORI_B7_128_SB(row0, row1, row2, row3, row4, row5, row6);
+        ILVR_B4_SB(row1, row0, row3, row2, row5, row4, row2, row1,
+                   row10_r, row32_r, row54_r, row21_r);
+        ILVR_B2_SB(row4, row3, row6, row5, row43_r, row65_r);
+        ILVL_B4_SB(row1, row0, row3, row2, row5, row4, row2, row1,
+                   row10_l, row32_l, row54_l, row21_l);
+        ILVL_B2_SB(row4, row3, row6, row5, row43_l, row65_l);
+
+        for (iter = height >> 2; iter--; ) {
+            LD_SB4(src_col, src_stride, row7, row8, row9, row10);
+            src_col += 4 * src_stride;
+            XORI_B4_128_SB(row7, row8, row9, row10);
+            ILVR_B4_SB(row7, row6, row8, row7, row9, row8, row10, row9,
+                       row76_r, row87_r, row98_r, row109_r);
+            ILVL_B4_SB(row7, row6, row8, row7, row9, row8, row10, row9,
+                       row76_l, row87_l, row98_l, row109_l);
+
+            out0_r = offset_vec;
+            DPADD_SB4_SH(row10_r, row32_r, row54_r, row76_r,
+                         coef0, coef1, coef2, coef3,
+                         out0_r, out0_r, out0_r, out0_r);
+            out1_r = offset_vec;
+            DPADD_SB4_SH(row21_r, row43_r, row65_r, row87_r,
+                         coef0, coef1, coef2, coef3,
+                         out1_r, out1_r, out1_r, out1_r);
+            out2_r = offset_vec;
+            DPADD_SB4_SH(row32_r, row54_r, row76_r, row98_r,
+                         coef0, coef1, coef2, coef3,
+                         out2_r, out2_r, out2_r, out2_r);
+            out3_r = offset_vec;
+            DPADD_SB4_SH(row43_r, row65_r, row87_r, row109_r,
+                         coef0, coef1, coef2, coef3,
+                         out3_r, out3_r, out3_r, out3_r);
+            out0_l = offset_vec;
+            DPADD_SB4_SH(row10_l, row32_l, row54_l, row76_l,
+                         coef0, coef1, coef2, coef3,
+                         out0_l, out0_l, out0_l, out0_l);
+            out1_l = offset_vec;
+            DPADD_SB4_SH(row21_l, row43_l, row65_l, row87_l,
+                         coef0, coef1, coef2, coef3,
+                         out1_l, out1_l, out1_l, out1_l);
+            out2_l = offset_vec;
+            DPADD_SB4_SH(row32_l, row54_l, row76_l, row98_l,
+                         coef0, coef1, coef2, coef3,
+                         out2_l, out2_l, out2_l, out2_l);
+            out3_l = offset_vec;
+            DPADD_SB4_SH(row43_l, row65_l, row87_l, row109_l,
+                         coef0, coef1, coef2, coef3,
+                         out3_l, out3_l, out3_l, out3_l);
+
+            ST_SH4(out0_r, out1_r, out2_r, out3_r, dst_col, dst_stride);
+            ST_SH4(out0_l, out1_l, out2_l, out3_l, dst_col + 8, dst_stride);
+            dst_col += 4 * dst_stride;
+
+            /* the newest interleaves become the history of the next pass */
+            row10_r = row54_r;
+            row32_r = row76_r;
+            row54_r = row98_r;
+            row21_r = row65_r;
+            row43_r = row87_r;
+            row65_r = row109_r;
+            row10_l = row54_l;
+            row32_l = row76_l;
+            row54_l = row98_l;
+            row21_l = row65_l;
+            row43_l = row87_l;
+            row65_l = row109_l;
+            row6 = row10;
+        }
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+/* 16-column vertical 8-tap filter: the strip helper run with width 16. */
+static void hevc_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
+        int16_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
+{
+    hevc_vt_8t_16multx4mult_msa(src, src_stride,
+                                dst, dst_stride, filter, height, 16);
+}
+
+/* 24-column vertical 8-tap: a 16-wide strip, then an 8-wide pass at column 16. */
+static void hevc_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
+        int16_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
+{
+    hevc_vt_8t_16multx4mult_msa(src, src_stride,
+                                dst, dst_stride, filter, height, 16);
+    hevc_vt_8t_8w_msa(&src[16], src_stride,
+                      &dst[16], dst_stride, filter, height);
+}
+
+/* 32-column vertical 8-tap filter: the strip helper run with width 32. */
+static void hevc_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
+        int16_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
+{
+    hevc_vt_8t_16multx4mult_msa(src, src_stride,
+                                dst, dst_stride, filter, height, 32);
+}
+
+/* 48-column vertical 8-tap filter: the strip helper run with width 48. */
+static void hevc_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
+        int16_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
+{
+    hevc_vt_8t_16multx4mult_msa(src, src_stride,
+                                dst, dst_stride, filter, height, 48);
+}
+
+/* 64-column vertical 8-tap filter: the strip helper run with width 64. */
+static void hevc_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
+        int16_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
+{
+    hevc_vt_8t_16multx4mult_msa(src, src_stride,
+                                dst, dst_stride, filter, height, 64);
+}
+/* 4-column block: 8-tap filter_x along rows, then 8-tap filter_y down them. */
+static void hevc_hv_8t_4w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter_x, const int8_t *filter_y,
+                              int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 filt0, filt1, filt2, filt3;
+    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
+    v4i32 dst0_r, dst1_r;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v16i8 mask0 = {
+        0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
+    };
+    v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
+    /* Step back 3 rows and 3 bytes; 7 rows are preloaded ahead of the loop. */
+    src -= ((3 * src_stride) + 3);
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* Sign-extend the filter_y bytes to halfwords (sign mask interleaved). */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+    /* mask1..3 address the byte pairs 2, 4 and 6 positions further along. */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    /* Every halfword lane of const_vec becomes 128 << 6. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    /* NOTE(review): mask indices 16+ presumably pick from the 2nd operand. */
+    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
+    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
+    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
+               vec8, vec9, vec10, vec11);
+    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
+               vec12, vec13, vec14, vec15);
+    dst30 = const_vec;
+    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                 dst30, dst30, dst30, dst30);
+    dst41 = const_vec;
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                 dst41, dst41, dst41, dst41);
+    dst52 = const_vec;
+    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                 dst52, dst52, dst52, dst52);
+    dst63 = const_vec;
+    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
+                 dst63, dst63, dst63, dst63);
+    /* NOTE(review): dstAB presumably = row B (low half), row A (high half). */
+    ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
+               dst10_r, dst21_r, dst32_r);
+    dst43_r = __msa_ilvl_h(dst41, dst30);
+    dst54_r = __msa_ilvl_h(dst52, dst41);
+    dst65_r = __msa_ilvl_h(dst63, dst52);
+    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
+
+    for (loop_cnt = height >> 1; loop_cnt--;) { /* two output rows per pass */
+        LD_SB2(src, src_stride, src7, src8);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src7, src8);
+
+        VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        dst87 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst87, dst87, dst87, dst87);
+        dst76_r = __msa_ilvr_h(dst87, dst66);
+        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+        dst87_r = __msa_vshf_h((v8i16) mask4, dst87, dst87);
+        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
+                                filt_h0, filt_h1, filt_h2, filt_h3);
+        dst0_r >>= 6;
+        dst1_r >>= 6;
+        /* NOTE(review): store stride is 2 * dst_stride, presumably in bytes. */
+        dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
+        ST8x2_UB(dst0_r, dst, (2 * dst_stride));
+        dst += (2 * dst_stride);
+        /* Slide the row-pair history down by two rows for the next pass. */
+        dst10_r = dst32_r;
+        dst32_r = dst54_r;
+        dst54_r = dst76_r;
+        dst21_r = dst43_r;
+        dst43_r = dst65_r;
+        dst65_r = dst87_r;
+        dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
+    }
+}
+/* 8-tap filter_x along rows then filter_y down them, in 8-column strips. */
+static void hevc_hv_8t_8multx2mult_msa(uint8_t *src,
+                                       int32_t src_stride,
+                                       int16_t *dst,
+                                       int32_t dst_stride,
+                                       const int8_t *filter_x,
+                                       const int8_t *filter_y,
+                                       int32_t height, int32_t width)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src_tmp;
+    int16_t *dst_tmp;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 filt0, filt1, filt2, filt3;
+    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
+    v16i8 mask1, mask2, mask3;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v4i32 dst0_r, dst0_l;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    /* Step back 3 rows and 3 bytes; 7 rows are preloaded for each strip. */
+    src -= ((3 * src_stride) + 3);
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* Sign-extend the filter_y bytes to halfwords (sign mask interleaved). */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+    /* mask1..3 address the byte pairs 2, 4 and 6 positions further along. */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    /* Every halfword lane of const_vec becomes 128 << 6. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    /* Walk the block in 8-column strips: width >> 3 of them. */
+    for (cnt = width >> 3; cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+        src_tmp += (7 * src_stride);
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+
+        /* row 0 row 1 row 2 row 3 */
+        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
+                   vec12, vec13, vec14, vec15);
+        dst0 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst0, dst0, dst0, dst0);
+        dst1 = const_vec;
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                     dst1, dst1, dst1, dst1);
+        dst2 = const_vec;
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                     dst2, dst2, dst2, dst2);
+        dst3 = const_vec;
+        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
+                     dst3, dst3, dst3, dst3);
+
+        /* row 4 row 5 row 6 */
+        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
+                   vec0, vec1, vec2, vec3);
+        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
+                   vec4, vec5, vec6, vec7);
+        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
+                   vec8, vec9, vec10, vec11);
+        dst4 = const_vec;
+        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                     dst4, dst4, dst4, dst4);
+        dst5 = const_vec;
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
+                     dst5, dst5, dst5, dst5);
+        dst6 = const_vec;
+        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
+                     dst6, dst6, dst6, dst6);
+        /* Interleave neighbouring rows into right (_r) and left (_l) halves. */
+        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
+                   dst10_r, dst32_r, dst54_r, dst21_r);
+        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
+        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
+                   dst10_l, dst32_l, dst54_l, dst21_l);
+        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
+        /* Two output rows per pass; src7/src8 are the two new input rows. */
+        for (loop_cnt = height >> 1; loop_cnt--;) {
+            LD_SB2(src_tmp, src_stride, src7, src8);
+            XORI_B2_128_SB(src7, src8);
+            src_tmp += 2 * src_stride;
+
+            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst7 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst7, dst7, dst7, dst7);
+
+            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_r >>= 6;
+            dst0_l >>= 6;
+
+            dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
+            ST_SW(dst0_r, dst_tmp);
+            dst_tmp += dst_stride;
+            /* Second row of the pair: same steps on src8 and the 21..87 pairs. */
+            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
+                       vec0, vec1, vec2, vec3);
+            dst8 = const_vec;
+            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
+                         dst8, dst8, dst8, dst8);
+
+            ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
+            dst6 = dst8; /* newest row pairs with dst7 on the next pass */
+            dst0_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
+                                    filt_h0, filt_h1, filt_h2, filt_h3);
+            dst0_r >>= 6;
+            dst0_l >>= 6;
+
+            dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
+            ST_SW(dst0_r, dst_tmp);
+            dst_tmp += dst_stride;
+            /* Shift the interleaved history by two rows for the next pass. */
+            dst10_r = dst32_r;
+            dst32_r = dst54_r;
+            dst54_r = dst76_r;
+            dst10_l = dst32_l;
+            dst32_l = dst54_l;
+            dst54_l = dst76_l;
+            dst21_r = dst43_r;
+            dst43_r = dst65_r;
+            dst65_r = dst87_r;
+            dst21_l = dst43_l;
+            dst43_l = dst65_l;
+            dst65_l = dst87_l;
+        }
+        /* Advance to the next 8-column strip. */
+        src += 8;
+        dst += 8;
+    }
+}
+/* 8w variant: a single 8-column strip of hevc_hv_8t_8multx2mult_msa. */
+static void hevc_hv_8t_8w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter_x, const int8_t *filter_y,
+                              int32_t height)
+{
+    hevc_hv_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 8); /* width = 8 */
+}
+/* 12w variant: an 8-column strip followed by a 4-column tail at offset 8. */
+static void hevc_hv_8t_12w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 8); /* width = 8 */
+    /* Remaining columns 8..11 use the 4-column kernel. */
+    hevc_hv_8t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
+                      filter_x, filter_y, height);
+}
+/* 16w variant: two 8-column strips of hevc_hv_8t_8multx2mult_msa. */
+static void hevc_hv_8t_16w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 16); /* width = 16 */
+}
+/* 24w variant: three 8-column strips of hevc_hv_8t_8multx2mult_msa. */
+static void hevc_hv_8t_24w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 24); /* width = 24 */
+}
+/* 32w variant: four 8-column strips of hevc_hv_8t_8multx2mult_msa. */
+static void hevc_hv_8t_32w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 32); /* width = 32 */
+}
+/* 48w variant: six 8-column strips of hevc_hv_8t_8multx2mult_msa. */
+static void hevc_hv_8t_48w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 48); /* width = 48 */
+}
+/* 64w variant: eight 8-column strips of hevc_hv_8t_8multx2mult_msa. */
+static void hevc_hv_8t_64w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 64); /* width = 64 */
+}
+/* 4-tap filter along rows for a 4x2 block; results are stored as int16_t. */
+static void hevc_hz_4t_4x2_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter)
+{
+    v8i16 filt0, filt1;
+    v16i8 src0, src1;
+    v16i8 mask1, vec0, vec1;
+    v8i16 dst0;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+    /* Start reading one byte to the left of src. */
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* mask1 addresses the byte pairs two positions further along. */
+    mask1 = mask0 + 2;
+    /* Every halfword lane of const_vec becomes 128 << 6. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB2(src, src_stride, src0, src1);
+    XORI_B2_128_SB(src0, src1);
+    /* NOTE(review): one shuffle presumably packs both rows (indices 16+). */
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    /* NOTE(review): stride is 2 * dst_stride, presumably a byte count. */
+    ST8x2_UB(dst0, dst, 2 * dst_stride);
+}
+/* 4-tap filter along rows for a 4x4 block; results are stored as int16_t. */
+static void hevc_hz_4t_4x4_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter)
+{
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask1, vec0, vec1;
+    v8i16 dst0, dst1;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+    /* Start reading one byte to the left of src. */
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* mask1 addresses the byte pairs two positions further along. */
+    mask1 = mask0 + 2;
+    /* Every halfword lane of const_vec becomes 128 << 6. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    /* NOTE(review): rows presumably go two per vector: (0,1) then (2,3). */
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+    /* NOTE(review): stride is 2 * dst_stride, presumably a byte count. */
+    ST8x4_UB(dst0, dst1, dst, 2 * dst_stride);
+}
+/* 4-tap filter along rows, 4 columns wide, eight rows per loop pass. */
+static void hevc_hz_4t_4x8multiple_msa(uint8_t *src,
+                                       int32_t src_stride,
+                                       int16_t *dst,
+                                       int32_t dst_stride,
+                                       const int8_t *filter,
+                                       int32_t height)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 mask1, vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+    /* Start reading one byte to the left of src. */
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* mask1 addresses the byte pairs two positions further along. */
+    mask1 = mask0 + 2;
+    /* Every halfword lane of const_vec becomes 128 << 6. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    /* height >> 3 passes; any remainder below 8 rows is not processed. */
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+        /* NOTE(review): each dstN presumably holds two consecutive rows. */
+        VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        /* NOTE(review): stride is 2 * dst_stride, presumably a byte count. */
+        ST8x8_UB(dst0, dst1, dst2, dst3, dst, 2 * dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+/* 4-column, 4-tap row filter: selects the kernel that matches the row count.
+ * Only heights of 2, 4 or a multiple of 8 are handled; others write nothing. */
+static void hevc_hz_4t_4w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter, int32_t height)
+{
+    if (height != 2 && height != 4 && (height % 8) != 0)
+        return; /* no kernel exists for this row count */
+
+    if (height == 2)
+        hevc_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
+    else if (height == 4)
+        hevc_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+    else
+        hevc_hz_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
+                                   filter, height);
+}
+/* 4-tap filter along rows, 6 columns wide, four rows per loop pass. */
+static void hevc_hz_4t_6w_msa(uint8_t *src,
+                              int32_t src_stride,
+                              int16_t *dst,
+                              int32_t dst_stride,
+                              const int8_t *filter,
+                              int32_t height)
+{
+    uint32_t loop_cnt;
+    uint64_t dst_val0, dst_val1, dst_val2, dst_val3;
+    uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
+    v8i16 filt0, filt1, dst0, dst1, dst2, dst3;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 filter_vec, const_vec;
+    /* Start reading one byte to the left of src. */
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* mask1 addresses the byte pairs two positions further along. */
+    mask1 = mask0 + 2;
+    /* Every halfword lane of const_vec becomes 128 << 6. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        /* 64-bit element 0 of each result: halfword lanes 0..3 of that row. */
+        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
+        dst_val1 = __msa_copy_u_d((v2i64) dst1, 0);
+        dst_val2 = __msa_copy_u_d((v2i64) dst2, 0);
+        dst_val3 = __msa_copy_u_d((v2i64) dst3, 0);
+        /* 32-bit element 2 of each result: halfword lanes 4..5 of that row. */
+        dst_val_int0 = __msa_copy_u_w((v4i32) dst0, 2);
+        dst_val_int1 = __msa_copy_u_w((v4i32) dst1, 2);
+        dst_val_int2 = __msa_copy_u_w((v4i32) dst2, 2);
+        dst_val_int3 = __msa_copy_u_w((v4i32) dst3, 2);
+        /* Per row: SD at dst, SW at dst + 4 (presumably 8 + 4 bytes stored). */
+        SD(dst_val0, dst);
+        SW(dst_val_int0, dst + 4);
+        dst += dst_stride;
+        SD(dst_val1, dst);
+        SW(dst_val_int1, dst + 4);
+        dst += dst_stride;
+        SD(dst_val2, dst);
+        SW(dst_val_int2, dst + 4);
+        dst += dst_stride;
+        SD(dst_val3, dst);
+        SW(dst_val_int3, dst + 4);
+        dst += dst_stride;
+    }
+}
+/* 4-tap filter along rows, 8 columns wide, two rows per loop pass. */
+static void hevc_hz_4t_8x2multiple_msa(uint8_t *src,
+                                       int32_t src_stride,
+                                       int16_t *dst,
+                                       int32_t dst_stride,
+                                       const int8_t *filter,
+                                       int32_t height)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1, dst0, dst1;
+    v16i8 src0, src1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 filter_vec, const_vec;
+    /* Start reading one byte to the left of src. */
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* mask1 addresses the byte pairs two positions further along. */
+    mask1 = mask0 + 2;
+    /* Every halfword lane of const_vec becomes 128 << 6. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    /* height >> 1 passes, each consuming and producing two rows. */
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_SB2(src, src_stride, src0, src1);
+        src += (2 * src_stride);
+
+        XORI_B2_128_SB(src0, src1);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        ST_SH2(dst0, dst1, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+/* 4-tap filter along rows, 8 columns wide, four rows per loop pass. */
+static void hevc_hz_4t_8x4multiple_msa(uint8_t *src,
+                                       int32_t src_stride,
+                                       int16_t *dst,
+                                       int32_t dst_stride,
+                                       const int8_t *filter,
+                                       int32_t height)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+    /* Start reading one byte to the left of src. */
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* mask1 addresses the byte pairs two positions further along. */
+    mask1 = mask0 + 2;
+    /* Every halfword lane of const_vec becomes 128 << 6. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    /* height >> 2 passes; any remainder below 4 rows is not processed. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+/* 8-column, 4-tap row filter.  Heights 2 and 6 are routed to the kernel that
+ * steps two rows at a time; every other height uses the four-row kernel. */
+static void hevc_hz_4t_8w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter, int32_t height)
+{
+    const int two_row_case = (height == 2) || (height == 6);
+
+    if (!two_row_case) {
+        hevc_hz_4t_8x4multiple_msa(src, src_stride, dst, dst_stride,
+                                   filter, height);
+        return;
+    }
+    hevc_hz_4t_8x2multiple_msa(src, src_stride, dst, dst_stride,
+                               filter, height);
+}
+/* 4-tap filter along rows, 12 columns wide, four rows per loop pass. */
+static void hevc_hz_4t_12w_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter,
+                               int32_t height)
+{
+    uint32_t loop_cnt;
+    v8i16 filt0, filt1;
+    v16i8 src0, src1, src2, src3;
+    v16i8 mask1;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v8i16 filter_vec, const_vec;
+    v16i8 mask3;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask2 = {
+        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+    };
+    /* Start reading one byte to the left of src. */
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* mask1/mask3: the mask0/mask2 byte pairs two positions further along. */
+    mask1 = mask0 + 2;
+    mask3 = mask2 + 2;
+    /* Every halfword lane of const_vec becomes 128 << 6. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        /* dst0..3: one row each via mask0; dst4/5 presumably pack two rows. */
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+        /* Full vectors go to dst; the remaining columns go to dst + 8. */
+        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
+        ST8x4_UB(dst4, dst5, dst + 8, 2 * dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+/* 4-tap filter along rows, 16 columns wide, four rows per loop pass. */
+static void hevc_hz_4t_16w_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter,
+                               int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3;
+    v16i8 src4, src5, src6, src7;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16i8 vec0, vec1;
+    v8i16 filter_vec, const_vec;
+    /* Start reading one byte to the left of src. */
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* mask1 addresses the byte pairs two positions further along. */
+    mask1 = mask0 + 2;
+    /* Every halfword lane of const_vec becomes 128 << 6. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+        /* Even-numbered srcN were loaded at src, odd-numbered ones at src + 8. */
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst6 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+
+        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+        dst7 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+        /* Even-numbered results go to dst, odd-numbered ones to dst + 8. */
+        ST_SH4(dst0, dst2, dst4, dst6, dst, dst_stride);
+        ST_SH4(dst1, dst3, dst5, dst7, dst + 8, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+/* 4-tap filter along rows, 24 columns wide, four rows per loop pass. */
+static void hevc_hz_4t_24w_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter,
+                               int32_t height)
+{
+    uint32_t loop_cnt;
+    int16_t *dst_tmp = dst + 16; /* separate cursor for the "8 width" part */
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1, mask00, mask11;
+    v16i8 vec0, vec1;
+    v8i16 dst0, dst1, dst2, dst3;
+    v8i16 filter_vec, const_vec;
+    /* Start reading one byte to the left of src. */
+    src -= 1;
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* mask1, mask00, mask11: the mask0 byte pairs moved on by 2, 8 and 10. */
+    mask1 = mask0 + 2;
+    mask00 = mask0 + 8;
+    mask11 = mask0 + 10;
+    /* Every halfword lane of const_vec becomes 128 << 6. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        /* 16 width */
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src0, src1, src0, src1, mask00, mask11, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        VSHF_B2_SB(src2, src3, src2, src3, mask00, mask11, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        /* NOTE(review): ST_SH2 with stride 8 presumably writes dst and dst + 8. */
+        ST_SH2(dst0, dst1, dst, 8);
+        dst += dst_stride;
+        ST_SH2(dst2, dst3, dst, 8);
+        dst += dst_stride;
+
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src4, src5, src4, src5, mask00, mask11, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        VSHF_B2_SB(src6, src7, src6, src7, mask00, mask11, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        ST_SH2(dst0, dst1, dst, 8);
+        dst += dst_stride;
+        ST_SH2(dst2, dst3, dst, 8);
+        dst += dst_stride;
+
+        /* 8 width */
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+        /* These four rows land at dst_tmp, i.e. 16 elements right of dst. */
+        ST_SH4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
+        dst_tmp += (4 * dst_stride);
+    }
+}
+
+static void hevc_hz_4t_32w_msa(uint8_t *src, /* 8-bit source pixels */
+                               int32_t src_stride, /* src pitch in bytes */
+                               int16_t *dst, /* 16-bit output */
+                               int32_t dst_stride, /* dst pitch in int16_t */
+                               const int8_t *filter, /* int8 filter taps */
+                               int32_t height) /* row count */
+{   /* Horizontal 4-tap filter over a 32-pixel-wide block, two rows per pass. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2;
+    v8i16 filt0, filt1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; /* byte pairs (i, i+1) */
+    v16i8 mask1, mask2, mask3;
+    v8i16 dst0, dst1, dst2, dst3;
+    v16i8 vec0, vec1;
+    v8i16 filter_vec, const_vec;
+
+    src -= 1; /* start one pixel to the left of the first output column */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); /* presumably tap pairs 0-1 and 2-3 */
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* every 16-bit lane now holds 128 << 6 = 8192 */
+
+    mask1 = mask0 + 2; /* pairs (i + 2, i + 3) */
+    mask2 = mask0 + 8; /* pairs starting at byte 8; indices reach 16 */
+    mask3 = mask0 + 10; /* pairs starting at byte 10; indices reach 18 */
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) { /* an odd last row is not processed */
+        LD_SB2(src, 16, src0, src1); /* presumably bytes 0..15 and 16..31 */
+        src2 = LD_SB(src + 24); /* overlapping vector starting at byte 24 */
+        src += src_stride;
+
+        XORI_B3_128_SB(src0, src1, src2); /* presumably flips bit 7: unsigned -> signed */
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec; /* accumulator starts from the bias */
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1); /* spans src0 and src1 */
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        ST_SH4(dst0, dst1, dst2, dst3, dst, 8); /* four vectors, 8 int16_t apart */
+        dst += dst_stride;
+
+        LD_SB2(src, 16, src0, src1); /* second row of the pair: same steps again */
+        src2 = LD_SB(src + 24);
+        src += src_stride;
+
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+
+        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
+
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_4t_4x2_msa(uint8_t *src, /* 8-bit source pixels */
+                               int32_t src_stride, /* src pitch in bytes */
+                               int16_t *dst, /* 16-bit output */
+                               int32_t dst_stride, /* dst pitch in int16_t */
+                               const int8_t *filter) /* int8 filter taps */
+{   /* Vertical 4-tap filter for one 4-wide, 2-row block (no loop). */
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v16i8 src2110, src4332;
+    v8i16 dst10;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride; /* start one row above the first output row */
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* every 16-bit lane now holds 128 << 6 = 8192 */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); /* presumably tap pairs 0-1 and 2-3 */
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4); /* presumably 5 consecutive rows */
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r); /* adjacent row pairs */
+
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); /* two pairs per vector */
+    XORI_B2_128_SB(src2110, src4332); /* presumably flips bit 7: unsigned -> signed */
+    dst10 = const_vec; /* accumulator starts from the bias */
+    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
+
+    ST8x2_UB(dst10, dst, 2 * dst_stride); /* 2 * int16_t pitch: presumably a byte pitch */
+}
+
+static void hevc_vt_4t_4x4_msa(uint8_t *src, /* 8-bit source pixels */
+                               int32_t src_stride, /* src pitch in bytes */
+                               int16_t *dst, /* 16-bit output */
+                               int32_t dst_stride, /* dst pitch in int16_t */
+                               const int8_t *filter, /* int8 filter taps */
+                               int32_t height) /* not read in this function */
+{   /* Vertical 4-tap filter for one 4-wide, 4-row block (no loop). */
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
+    v16i8 src2110, src4332, src6554;
+    v8i16 dst10, dst32;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride; /* start one row above the first output row */
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* every 16-bit lane now holds 128 << 6 = 8192 */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); /* presumably tap pairs 0-1 and 2-3 */
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); /* presumably 7 rows */
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r); /* adjacent row pairs */
+    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
+    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+               src2110, src4332, src6554); /* two row pairs per vector */
+    XORI_B3_128_SB(src2110, src4332, src6554); /* presumably unsigned -> signed */
+    dst10 = const_vec; /* output rows 0 and 1 */
+    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
+    dst32 = const_vec; /* output rows 2 and 3 */
+    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
+
+    ST8x4_UB(dst10, dst32, dst, 2 * dst_stride); /* 2 * int16_t pitch: presumably bytes */
+}
+
+static void hevc_vt_4t_4x8multiple_msa(uint8_t *src, /* 8-bit source pixels */
+                                       int32_t src_stride, /* src pitch in bytes */
+                                       int16_t *dst, /* 16-bit output */
+                                       int32_t dst_stride, /* dst pitch in int16_t */
+                                       const int8_t *filter, /* int8 filter taps */
+                                       int32_t height) /* row count */
+{   /* Vertical 4-tap filter for a 4-wide block, eight output rows per pass. */
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+    v16i8 src2110, src4332, src6554, src8776;
+    v8i16 dst10, dst32, dst54, dst76;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride; /* start one row above the first output row */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* every 16-bit lane now holds 128 << 6 = 8192 */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); /* presumably tap pairs 0-1 and 2-3 */
+
+    LD_SB3(src, src_stride, src0, src1, src2); /* presumably the first three rows */
+    src += (3 * src_stride);
+
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128); /* flip bit 7 of every byte */
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) { /* rows beyond a multiple of 8 are skipped */
+        LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);
+        src += (6 * src_stride);
+
+        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+                   src32_r, src43_r, src54_r, src65_r);
+        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+        ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
+                   src4332, src6554, src8776);
+        XORI_B3_128_SB(src4332, src6554, src8776);
+
+        dst10 = const_vec; /* output rows 0 and 1 of this pass */
+        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
+        dst32 = const_vec; /* output rows 2 and 3 */
+        DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
+        dst54 = const_vec; /* output rows 4 and 5 */
+        DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
+
+        LD_SB2(src, src_stride, src9, src2); /* src2 is read again by the next pass */
+        src += (2 * src_stride);
+        ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
+        src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r); /* carried over */
+        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+        dst76 = const_vec; /* output rows 6 and 7 */
+        DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);
+
+        ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride); /* presumably byte pitch */
+        dst += (8 * dst_stride);
+    }
+}
+
+static void hevc_vt_4t_4w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter, int32_t height)
+{
+    if (height == 2) {      /* 4-wide vertical filter: 2-row kernel */
+        hevc_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
+        return;
+    }
+    if (height == 4) {      /* 4-row kernel */
+        hevc_vt_4t_4x4_msa(src, src_stride, dst, dst_stride, filter, height);
+        return;
+    }
+    if (height % 8 == 0)    /* heights not matched here do nothing */
+        hevc_vt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
+                                   filter, height);
+}
+
+static void hevc_vt_4t_6w_msa(uint8_t *src, /* 8-bit source pixels */
+                              int32_t src_stride, /* src pitch in bytes */
+                              int16_t *dst, /* 16-bit output */
+                              int32_t dst_stride, /* dst pitch in int16_t */
+                              const int8_t *filter, /* int8 filter taps */
+                              int32_t height) /* row count */
+{   /* Vertical 4-tap filter for a 6-wide block, four output rows per pass. */
+    int32_t loop_cnt;
+    uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
+    uint64_t dst_val0, dst_val1, dst_val2, dst_val3;
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride; /* start one row above the first output row */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* every 16-bit lane now holds 128 << 6 = 8192 */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); /* presumably tap pairs 0-1 and 2-3 */
+
+    LD_SB3(src, src_stride, src0, src1, src2); /* presumably the first three rows */
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2); /* presumably flips bit 7: unsigned -> signed */
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* rows beyond a multiple of 4 are skipped */
+        LD_SB2(src, src_stride, src3, src4);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+
+        dst0_r = const_vec; /* accumulator starts from the bias */
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+
+        LD_SB2(src, src_stride, src1, src2); /* src2 is read again by the next pass */
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src1, src2);
+        ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r); /* oldest pairs for next pass */
+
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
+
+        dst_val0 = __msa_copy_u_d((v2i64) dst0_r, 0); /* low 64 bits: 4 int16_t results */
+        dst_val1 = __msa_copy_u_d((v2i64) dst1_r, 0);
+        dst_val2 = __msa_copy_u_d((v2i64) dst2_r, 0);
+        dst_val3 = __msa_copy_u_d((v2i64) dst3_r, 0);
+
+        dst_val_int0 = __msa_copy_u_w((v4i32) dst0_r, 2); /* word 2: results 4 and 5 */
+        dst_val_int1 = __msa_copy_u_w((v4i32) dst1_r, 2);
+        dst_val_int2 = __msa_copy_u_w((v4i32) dst2_r, 2);
+        dst_val_int3 = __msa_copy_u_w((v4i32) dst3_r, 2);
+
+        SD(dst_val0, dst); /* columns 0..3 */
+        SW(dst_val_int0, dst + 4); /* columns 4..5 */
+        dst += dst_stride;
+        SD(dst_val1, dst);
+        SW(dst_val_int1, dst + 4);
+        dst += dst_stride;
+        SD(dst_val2, dst);
+        SW(dst_val_int2, dst + 4);
+        dst += dst_stride;
+        SD(dst_val3, dst);
+        SW(dst_val_int3, dst + 4);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_4t_8x2_msa(uint8_t *src, /* 8-bit source pixels */
+                               int32_t src_stride, /* src pitch in bytes */
+                               int16_t *dst, /* 16-bit output */
+                               int32_t dst_stride, /* dst pitch in int16_t */
+                               const int8_t *filter) /* int8 filter taps */
+{   /* Vertical 4-tap filter for one 8-wide, 2-row block (no loop). */
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 dst0_r, dst1_r;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride; /* start one row above the first output row */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* every 16-bit lane now holds 128 << 6 = 8192 */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); /* presumably tap pairs 0-1 and 2-3 */
+
+    LD_SB3(src, src_stride, src0, src1, src2); /* presumably the first three rows */
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2); /* presumably flips bit 7: unsigned -> signed */
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    LD_SB2(src, src_stride, src3, src4); /* the two remaining rows */
+    XORI_B2_128_SB(src3, src4);
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+    dst0_r = const_vec; /* output row 0 */
+    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+    dst1_r = const_vec; /* output row 1 */
+    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+
+    ST_SH2(dst0_r, dst1_r, dst, dst_stride); /* one vector per output row */
+}
+
+static void hevc_vt_4t_8x6_msa(uint8_t *src, /* 8-bit source pixels */
+                               int32_t src_stride, /* src pitch in bytes */
+                               int16_t *dst, /* 16-bit output */
+                               int32_t dst_stride, /* dst pitch in int16_t */
+                               const int8_t *filter) /* int8 filter taps */
+{   /* Vertical 4-tap filter for one 8-wide, 6-row block, unrolled as 3 x 2 rows. */
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 dst0_r, dst1_r;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride; /* start one row above the first output row */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* every 16-bit lane now holds 128 << 6 = 8192 */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); /* presumably tap pairs 0-1 and 2-3 */
+
+    LD_SB3(src, src_stride, src0, src1, src2); /* presumably the first three rows */
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2); /* presumably flips bit 7: unsigned -> signed */
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    LD_SB2(src, src_stride, src3, src4); /* rows for output rows 0 and 1 */
+    src += (2 * src_stride);
+    XORI_B2_128_SB(src3, src4);
+
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+    dst0_r = const_vec;
+    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+    dst1_r = const_vec;
+    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+
+    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_SB2(src, src_stride, src1, src2); /* rows for output rows 2 and 3 */
+    src += (2 * src_stride);
+    XORI_B2_128_SB(src1, src2);
+
+    ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r); /* src10_r/src21_r now newest */
+    dst0_r = const_vec;
+    DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
+    dst1_r = const_vec;
+    DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
+
+    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_SB2(src, src_stride, src3, src4); /* rows for output rows 4 and 5 */
+    XORI_B2_128_SB(src3, src4);
+
+    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+    dst0_r = const_vec;
+    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+    dst1_r = const_vec;
+    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+
+    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
+}
+
+static void hevc_vt_4t_8x4multiple_msa(uint8_t *src, /* 8-bit source pixels */
+                                       int32_t src_stride, /* src pitch in bytes */
+                                       int16_t *dst, /* 16-bit output */
+                                       int32_t dst_stride, /* dst pitch in int16_t */
+                                       const int8_t *filter, /* int8 filter taps */
+                                       int32_t height) /* row count */
+{   /* Vertical 4-tap filter for an 8-wide block, four output rows per pass. */
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 dst0_r, dst1_r;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride; /* start one row above the first output row */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* every 16-bit lane now holds 128 << 6 = 8192 */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); /* presumably tap pairs 0-1 and 2-3 */
+
+    LD_SB3(src, src_stride, src0, src1, src2); /* presumably the first three rows */
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2); /* presumably flips bit 7: unsigned -> signed */
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* rows beyond a multiple of 4 are skipped */
+        LD_SB2(src, src_stride, src3, src4);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        dst0_r = const_vec; /* accumulator starts from the bias */
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+
+        ST_SH2(dst0_r, dst1_r, dst, dst_stride); /* first two rows of this pass */
+        dst += (2 * dst_stride);
+
+        LD_SB2(src, src_stride, src5, src2); /* src2 is read again by the next pass */
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); /* oldest pairs for next pass */
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
+
+        ST_SH2(dst0_r, dst1_r, dst, dst_stride); /* last two rows of this pass */
+        dst += (2 * dst_stride);
+    }
+}
+
+static void hevc_vt_4t_8w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter, int32_t height)
+{
+    switch (height) {   /* 8-wide vertical filter: pick kernel by row count */
+    case 2:
+        hevc_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter);
+        break;
+    case 6:
+        hevc_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter);
+        break;
+    default:            /* every other height uses the 4-rows-per-pass loop */
+        hevc_vt_4t_8x4multiple_msa(src, src_stride, dst, dst_stride,
+                                   filter, height);
+    }
+}
+
+static void hevc_vt_4t_12w_msa(uint8_t *src, /* 8-bit source pixels */
+                               int32_t src_stride, /* src pitch in bytes */
+                               int16_t *dst, /* 16-bit output */
+                               int32_t dst_stride, /* dst pitch in int16_t */
+                               const int8_t *filter, /* int8 filter taps */
+                               int32_t height) /* row count */
+{   /* Vertical 4-tap filter for a 12-wide block, four output rows per pass. */
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
+    v16i8 src2110, src4332;
+    v8i16 dst0_l, dst1_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= (1 * src_stride); /* start one row above the first output row */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* every 16-bit lane now holds 128 << 6 = 8192 */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); /* presumably tap pairs 0-1 and 2-3 */
+
+    LD_SB3(src, src_stride, src0, src1, src2); /* presumably the first three rows */
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2); /* presumably flips bit 7: unsigned -> signed */
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); /* presumably columns 0..7 */
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); /* presumably columns 8..15 */
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l); /* low halves of both */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* rows beyond a multiple of 4 are skipped */
+        LD_SB2(src, src_stride, src3, src4);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
+        dst0_r = const_vec; /* accumulator starts from the bias */
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+        dst0_l = const_vec; /* extra columns for the first two rows of the pass */
+        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);
+
+        LD_SB2(src, src_stride, src5, src2); /* src2 is read again by the next pass */
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); /* oldest pairs for next pass */
+        ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
+        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l); /* carried over */
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
+        dst1_l = const_vec; /* extra columns for the last two rows of the pass */
+        DPADD_SB2_SH(src4332, src2110, filt0, filt1, dst1_l, dst1_l);
+
+        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride); /* one vector per row */
+        ST8x4_UB(dst0_l, dst1_l, dst + 8, (2 * dst_stride)); /* stored from column 8 */
+        dst += (4 * dst_stride);
+    }
+}
+
+static void hevc_vt_4t_16w_msa(uint8_t *src, /* 8-bit source pixels */
+                               int32_t src_stride, /* src pitch in bytes */
+                               int16_t *dst, /* 16-bit output */
+                               int32_t dst_stride, /* dst pitch in int16_t */
+                               const int8_t *filter, /* int8 filter taps */
+                               int32_t height) /* row count */
+{   /* Vertical 4-tap filter for a 16-wide block, four output rows per pass. */
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v16i8 src10_l, src32_l, src21_l, src43_l;
+    v8i16 dst0_r, dst1_r, dst0_l, dst1_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride; /* start one row above the first output row */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* every 16-bit lane now holds 128 << 6 = 8192 */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); /* presumably tap pairs 0-1 and 2-3 */
+
+    LD_SB3(src, src_stride, src0, src1, src2); /* presumably the first three rows */
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2); /* presumably flips bit 7: unsigned -> signed */
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); /* presumably columns 0..7 */
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); /* presumably columns 8..15 */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* rows beyond a multiple of 4 are skipped */
+        LD_SB2(src, src_stride, src3, src4);
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+        dst0_r = const_vec; /* accumulator starts from the bias */
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
+        ST_SH2(dst0_r, dst0_l, dst, 8); /* two vectors, 8 int16_t apart, in one row */
+        dst += dst_stride;
+        ST_SH2(dst1_r, dst1_l, dst, 8);
+        dst += dst_stride;
+
+        LD_SB2(src, src_stride, src5, src2); /* src2 is read again by the next pass */
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); /* oldest pairs for next pass */
+        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
+        ST_SH2(dst0_r, dst0_l, dst, 8);
+        dst += dst_stride;
+        ST_SH2(dst1_r, dst1_l, dst, 8);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_4t_24w_msa(uint8_t *src, /* 8-bit source pixels */
+                               int32_t src_stride, /* src pitch in bytes */
+                               int16_t *dst, /* 16-bit output */
+                               int32_t dst_stride, /* dst pitch in int16_t */
+                               const int8_t *filter, /* int8 filter taps */
+                               int32_t height) /* row count */
+{   /* Vertical 4-tap filter for a 24-wide block, four output rows per pass. */
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src6, src7, src8, src9, src10, src11;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v16i8 src10_l, src32_l, src21_l, src43_l;
+    v8i16 dst0_l, dst1_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride; /* start one row above the first output row */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* every 16-bit lane now holds 128 << 6 = 8192 */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); /* presumably tap pairs 0-1 and 2-3 */
+
+    LD_SB3(src, src_stride, src0, src1, src2); /* first 16 columns, three rows */
+    XORI_B3_128_SB(src0, src1, src2); /* presumably flips bit 7: unsigned -> signed */
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    LD_SB3(src + 16, src_stride, src6, src7, src8); /* same rows, from column 16 */
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src6, src7, src8);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r); /* only the _r half is used */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* rows beyond a multiple of 4 are skipped */
+        LD_SB2(src, src_stride, src3, src4);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+
+        LD_SB2(src + 16, src_stride, src9, src10); /* same two rows, from column 16 */
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src9, src10);
+        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+
+        dst0_r = const_vec; /* accumulator starts from the bias */
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
+
+        ST_SH2(dst0_r, dst0_l, dst, 8); /* stored at columns 0 and 8 */
+        ST_SH(dst2_r, dst + 16); /* stored at column 16 */
+        dst += dst_stride;
+        ST_SH2(dst1_r, dst1_l, dst, 8);
+        ST_SH(dst3_r, dst + 16);
+        dst += dst_stride;
+
+        LD_SB2(src, src_stride, src5, src2); /* src2 is read again by the next pass */
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); /* oldest pairs for next pass */
+        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
+
+        LD_SB2(src + 16, src_stride, src11, src8); /* src8 is read again by the next pass */
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src11, src8);
+        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
+
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
+
+        ST_SH2(dst0_r, dst0_l, dst, 8);
+        ST_SH(dst2_r, dst + 16);
+        dst += dst_stride;
+        ST_SH2(dst1_r, dst1_l, dst, 8);
+        ST_SH(dst3_r, dst + 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_4t_32w_msa(uint8_t *src, /* 8-bit source pixels */
+                               int32_t src_stride, /* src pitch in bytes */
+                               int16_t *dst, /* 16-bit output */
+                               int32_t dst_stride, /* dst pitch in int16_t */
+                               const int8_t *filter, /* int8 filter taps */
+                               int32_t height) /* row count */
+{   /* Vertical 4-tap filter for a 32-wide block, four output rows per pass. */
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src6, src7, src8, src9, src10, src11;
+    v16i8 src10_r, src32_r, src76_r, src98_r;
+    v16i8 src21_r, src43_r, src87_r, src109_r;
+    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
+    v16i8 src10_l, src32_l, src76_l, src98_l;
+    v16i8 src21_l, src43_l, src87_l, src109_l;
+    v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
+    v8i16 filt0, filt1;
+    v8i16 filter_vec, const_vec;
+
+    src -= src_stride; /* start one row above the first output row */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* every 16-bit lane now holds 128 << 6 = 8192 */
+
+    filter_vec = LD_SH(filter);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); /* presumably tap pairs 0-1 and 2-3 */
+
+    LD_SB3(src, src_stride, src0, src1, src2); /* first 16 columns, three rows */
+    XORI_B3_128_SB(src0, src1, src2); /* presumably flips bit 7: unsigned -> signed */
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
+
+    LD_SB3(src + 16, src_stride, src6, src7, src8); /* same rows, from column 16 */
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src6, src7, src8);
+    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
+    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* rows beyond a multiple of 4 are skipped */
+        LD_SB2(src, src_stride, src3, src4);
+        XORI_B2_128_SB(src3, src4);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
+
+        LD_SB2(src + 16, src_stride, src9, src10); /* same two rows, from column 16 */
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src9, src10);
+        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
+        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
+
+        dst0_r = const_vec; /* accumulator starts from the bias */
+        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
+        dst2_l = const_vec;
+        DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
+        dst3_l = const_vec;
+        DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);
+
+        ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8); /* four vectors, 8 int16_t apart */
+        dst += dst_stride;
+        ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8);
+        dst += dst_stride;
+
+        LD_SB2(src, src_stride, src5, src2); /* src2 is read again by the next pass */
+        XORI_B2_128_SB(src5, src2);
+        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r); /* oldest pairs for next pass */
+        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
+
+        LD_SB2(src + 16, src_stride, src11, src8); /* src8 is read again by the next pass */
+        src += (2 * src_stride);
+        XORI_B2_128_SB(src11, src8);
+        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
+        ILVL_B2_SB(src11, src10, src8, src11, src76_l, src87_l);
+
+        dst0_r = const_vec;
+        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
+        dst0_l = const_vec;
+        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
+        dst1_r = const_vec;
+        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
+        dst1_l = const_vec;
+        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
+        dst2_r = const_vec;
+        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
+        dst2_l = const_vec;
+        DPADD_SB2_SH(src98_l, src76_l, filt0, filt1, dst2_l, dst2_l);
+        dst3_r = const_vec;
+        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
+        dst3_l = const_vec;
+        DPADD_SB2_SH(src109_l, src87_l, filt0, filt1, dst3_l, dst3_l);
+
+        ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8);
+        dst += dst_stride;
+        ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hv_4t_4x2_msa(uint8_t *src, /* 8-bit source pixels */
+                               int32_t src_stride, /* src pitch in bytes */
+                               int16_t *dst, /* 16-bit output */
+                               int32_t dst_stride, /* dst pitch in int16_t */
+                               const int8_t *filter_x, /* int8 taps, row pass */
+                               const int8_t *filter_y) /* int8 taps, column pass */
+{   /* Horizontal then vertical 4-tap filter for one 4-wide, 2-row block. */
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; /* byte pairs (i, i+1) */
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4;
+    v4i32 dst0_r, dst1_r;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+
+    src -= (src_stride + 1); /* one row up and one pixel left of the block */
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1); /* presumably tap pairs 0-1 and 2-3 */
+
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0); /* all-ones byte where a tap is negative */
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec); /* taps widened to 16 bits */
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1); /* presumably widened tap pairs 0-1, 2-3 */
+
+    mask1 = mask0 + 2; /* pairs (i + 2, i + 3) */
+
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6; /* every 16-bit lane now holds 128 << 6 = 8192 */
+
+    LD_SB3(src, src_stride, src0, src1, src2); /* presumably the first three rows */
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2); /* presumably flips bit 7: unsigned -> signed */
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dst0 = const_vec; /* row-pass result for row 0 */
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec; /* row-pass result for row 1 */
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec; /* row-pass result for row 2 */
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r); /* adjacent row pairs */
+
+    LD_SB2(src, src_stride, src3, src4); /* the two remaining rows */
+    XORI_B2_128_SB(src3, src4);
+
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+    dst32_r = __msa_ilvr_h(dst3, dst2);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1); /* presumably column pass */
+    dst0_r >>= 6; /* per-lane right shift by 6 on the 32-bit results */
+
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+    dst43_r = __msa_ilvr_h(dst4, dst3);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_r >>= 6;
+
+    dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r); /* keep low 16 bits of each lane */
+    ST8x2_UB(dst0_r, dst, 2 * dst_stride); /* 2 * int16_t pitch: presumably a byte pitch */
+}
+
+static void hevc_hv_4t_4x4_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter_x,
+                               const int8_t *filter_y)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    /* filter_x runs within each row, filter_y across rows: 7 rows in, 4 out. */
+    src -= (src_stride + 1);    /* start one row up and one byte left */
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* clti_s_b gives per-byte sign masks; ilvr_b uses them to widen filter_y. */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+    /* mask0 pairs bytes (i, i + 1); mask1 pairs bytes (i + 2, i + 3). */
+    mask1 = mask0 + 2;
+    /* Every halfword of const_vec is 128 << 6; it seeds each row sum below. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    /* Rows 0..2 get the horizontal pass only; they prime the vertical taps. */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    /* NOTE(review): ^128 presumably recenters pixels, offset by const_vec. */
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    /* Interleave rows (0, 1) and (1, 2) as inputs for HEVC_FILT_4TAP. */
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+
+    LD_SB4(src, src_stride, src3, src4, src5, src6);
+    XORI_B4_128_SB(src3, src4, src5, src6);
+    /* Result 0 combines row pairs (0, 1) and (2, 3). */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+    dst32_r = __msa_ilvr_h(dst3, dst2);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_r >>= 6;
+    /* Result 1 combines row pairs (1, 2) and (3, 4). */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+    dst43_r = __msa_ilvr_h(dst4, dst3);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_r >>= 6;
+    /* Result 2 combines (2, 3) and (4, 5); dst10_r is reused for (4, 5). */
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+    dst5 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+    dst10_r = __msa_ilvr_h(dst5, dst4);
+    dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+    dst2_r >>= 6;
+    /* Result 3 combines (3, 4) and (5, 6); dst2 is reused for row 6. */
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+    dst21_r = __msa_ilvr_h(dst2, dst5);
+    dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+    dst3_r >>= 6;
+    /* PCKEV presumably narrows the 32-bit results to halfwords for the store. */
+    PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
+    ST8x4_UB(dst0_r, dst1_r, dst, 2 * dst_stride);
+}
+
+
+static void hevc_hv_4t_4multx8mult_msa(uint8_t *src,
+                                       int32_t src_stride,
+                                       int16_t *dst,
+                                       int32_t dst_stride,
+                                       const int8_t *filter_x,
+                                       const int8_t *filter_y,
+                                       int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 src7, src8, src9, src10;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
+    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
+    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+    /* filter_x within rows, filter_y across rows; 8 result rows per pass. */
+    src -= (src_stride + 1);    /* start one row up and one byte left */
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* clti_s_b gives per-byte sign masks; ilvr_b uses them to widen filter_y. */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+    /* mask0 pairs bytes (i, i + 1); mask1 pairs bytes (i + 2, i + 3). */
+    mask1 = mask0 + 2;
+    /* Every halfword of const_vec is 128 << 6; it seeds each row sum below. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    /* Rows 0..2 get the horizontal pass only; they prime the vertical taps. */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    /* Interleave rows (0, 1) and (1, 2) as inputs for HEVC_FILT_4TAP. */
+    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+    /* height >> 3 passes; each one loads 8 further rows (src3..src10). */
+    for (loop_cnt = height >> 3; loop_cnt--;) {
+        LD_SB8(src, src_stride,
+               src3, src4, src5, src6, src7, src8, src9, src10);
+        src += (8 * src_stride);
+        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        dst32_r = __msa_ilvr_h(dst3, dst2);
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_r >>= 6;
+
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+        dst43_r = __msa_ilvr_h(dst4, dst3);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_r >>= 6;
+
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+        dst54_r = __msa_ilvr_h(dst5, dst4);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+        dst2_r >>= 6;
+
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst6 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+
+        dst65_r = __msa_ilvr_h(dst6, dst5);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+        dst3_r >>= 6;
+
+        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+        dst7 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+
+        dst76_r = __msa_ilvr_h(dst7, dst6);
+        dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+        dst4_r >>= 6;
+
+        VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
+        dst8 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
+
+        dst87_r = __msa_ilvr_h(dst8, dst7);
+        dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+        dst5_r >>= 6;
+        /* dst10_r is overwritten with pair (8, 9): the next pass starts there. */
+        VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
+        dst9 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9);
+
+        dst10_r = __msa_ilvr_h(dst9, dst8);
+        dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1);
+        dst6_r >>= 6;
+        /* dst2 and dst21_r are refreshed from row 10 for the next pass. */
+        VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        dst21_r = __msa_ilvr_h(dst2, dst9);
+        dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1);
+        dst7_r >>= 6;
+        /* PCKEV presumably narrows the 32-bit results to halfwords for storing. */
+        PCKEV_H4_SW(dst1_r, dst0_r, dst3_r, dst2_r,
+                    dst5_r, dst4_r, dst7_r, dst6_r,
+                    dst0_r, dst1_r, dst2_r, dst3_r);
+        ST8x8_UB(dst0_r, dst1_r, dst2_r, dst3_r, dst, 2 * dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+/*
+ * Width-4 hv 4-tap entry point: forwards to the kernel matching height.
+ * Heights other than 2, 4 or a multiple of 8 fall through without any work.
+ */
+static void hevc_hv_4t_4w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter_x, const int8_t *filter_y,
+                              int32_t height)
+{
+    if (height == 2)
+        hevc_hv_4t_4x2_msa(src, src_stride, dst, dst_stride,
+                           filter_x, filter_y);
+    else if (height == 4)
+        hevc_hv_4t_4x4_msa(src, src_stride, dst, dst_stride,
+                           filter_x, filter_y);
+    else if (!(height % 8))
+        hevc_hv_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height);
+}
+
+static void hevc_hv_4t_6w_msa(uint8_t *src,
+                              int32_t src_stride,
+                              int16_t *dst,
+                              int32_t dst_stride,
+                              const int8_t *filter_x,
+                              const int8_t *filter_y,
+                              int32_t height)
+{
+    uint32_t loop_cnt;
+    uint64_t dst_val0, dst_val1, dst_val2, dst_val3;
+    uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    /* Writes six int16_t per row (64-bit + 32-bit store), 4 rows per pass. */
+    src -= (src_stride + 1);    /* start one row up and one byte left */
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* clti_s_b gives per-byte sign masks; ilvr_b uses them to widen filter_y. */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+    /* mask0 pairs bytes (i, i + 1); mask1 pairs bytes (i + 2, i + 3). */
+    mask1 = mask0 + 2;
+    /* Every halfword of const_vec is 128 << 6; it seeds each row sum below. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    /* Rows 0..2 get the horizontal pass only; they prime the vertical taps. */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    /* ILVRL_H2_SH: presumably ilvr into the _r and ilvl into the _l output. */
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+    /* height >> 2 passes; dst10_*, dst21_* and dst2 carry over between them. */
+    for (loop_cnt = height >> 2; loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+        XORI_B4_128_SB(src3, src4, src5, src6);
+
+        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+        dst3 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+        dst0_r >>= 6;
+        dst0_l >>= 6;
+
+        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+        dst4 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+        dst1_r >>= 6;
+        dst1_l >>= 6;
+
+        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+        dst5 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+        ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
+        dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+        dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
+        dst2_r >>= 6;
+        dst2_l >>= 6;
+
+        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+        ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
+        dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+        dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
+        dst3_r >>= 6;
+        dst3_l >>= 6;
+
+        PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
+                    dst2_l, dst2_r, dst3_l, dst3_r,
+                    dst0_r, dst1_r, dst2_r, dst3_r);
+        /* Doubleword 0 of each packed row: the first four int16_t values. */
+        dst_val0 = __msa_copy_u_d((v2i64) dst0_r, 0);
+        dst_val1 = __msa_copy_u_d((v2i64) dst1_r, 0);
+        dst_val2 = __msa_copy_u_d((v2i64) dst2_r, 0);
+        dst_val3 = __msa_copy_u_d((v2i64) dst3_r, 0);
+        /* Word 2 of each packed row: the fifth and sixth int16_t values. */
+        dst_val_int0 = __msa_copy_u_w((v4i32) dst0_r, 2);
+        dst_val_int1 = __msa_copy_u_w((v4i32) dst1_r, 2);
+        dst_val_int2 = __msa_copy_u_w((v4i32) dst2_r, 2);
+        dst_val_int3 = __msa_copy_u_w((v4i32) dst3_r, 2);
+        /* dst is int16_t *, so dst + 4 addresses element 4 of the same row. */
+        SD(dst_val0, dst);
+        SW(dst_val_int0, dst + 4);
+        dst += dst_stride;
+        SD(dst_val1, dst);
+        SW(dst_val_int1, dst + 4);
+        dst += dst_stride;
+        SD(dst_val2, dst);
+        SW(dst_val_int2, dst + 4);
+        dst += dst_stride;
+        SD(dst_val3, dst);
+        SW(dst_val_int3, dst + 4);
+        dst += dst_stride;
+
+    }
+}
+
+static void hevc_hv_4t_8x2_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter_x,
+                               const int8_t *filter_y,
+                               int32_t height)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    /* Five rows in, two result rows out; the height argument is never read. */
+    src -= (src_stride + 1);    /* start one row up and one byte left */
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* clti_s_b gives per-byte sign masks; ilvr_b uses them to widen filter_y. */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+    /* mask0 pairs bytes (i, i + 1); mask1 pairs bytes (i + 2, i + 3). */
+    mask1 = mask0 + 2;
+    /* Every halfword of const_vec is 128 << 6; it seeds each row sum below. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    /* Rows 0..2 get the horizontal pass only; they prime the vertical taps. */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    /* ILVRL_H2_SH: presumably ilvr into the _r and ilvl into the _l output. */
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+    LD_SB2(src, src_stride, src3, src4);
+    src += (2 * src_stride);    /* src is not read again after this */
+    XORI_B2_128_SB(src3, src4);
+    /* Result 0 combines row pairs (0, 1) and (2, 3). */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+    dst0_r >>= 6;
+    dst0_l >>= 6;
+    /* Result 1 combines row pairs (1, 2) and (3, 4). */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    dst1_r >>= 6;
+    dst1_l >>= 6;
+    /* Each (_r, _l) pair is packed into one vector, then both are stored. */
+    PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
+    ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+}
+
+static void hevc_hv_4t_8x6_msa(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter_x,
+                               const int8_t *filter_y,
+                               int32_t height)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
+    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
+    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
+    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
+    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
+    /* Nine rows in, six result rows out; the height argument is never read. */
+    src -= (src_stride + 1);    /* start one row up and one byte left */
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* clti_s_b gives per-byte sign masks; ilvr_b uses them to widen filter_y. */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+    /* mask0 pairs bytes (i, i + 1); mask1 pairs bytes (i + 2, i + 3). */
+    mask1 = mask0 + 2;
+    /* Every halfword of const_vec is 128 << 6; it seeds each row sum below. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    /* Rows 0..2 get the horizontal pass only; they prime the vertical taps. */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+
+    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+    dst0 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+    dst1 = const_vec;
+    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+    dst2 = const_vec;
+    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+    /* ILVRL_H2_SH: presumably ilvr into the _r and ilvl into the _l output. */
+    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+
+    LD_SB2(src, src_stride, src3, src4);
+    src += (2 * src_stride);
+
+    XORI_B2_128_SB(src3, src4);
+
+    /* row 3: combined with rows 0..2 it yields result row 0 */
+    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+    dst3 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+
+    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+
+    dst0_r >>= 6;
+    dst0_l >>= 6;
+
+    /* row 4: combined with rows 1..3 it yields result row 1 */
+    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+    dst4 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+    dst1_r >>= 6;
+    dst1_l >>= 6;
+
+    LD_SB2(src, src_stride, src5, src6);
+    src += (2 * src_stride);
+
+    XORI_B2_128_SB(src5, src6);
+
+    /* row 5: combined with rows 2..4 it yields result row 2 */
+    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+    dst5 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
+    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+    dst2_r >>= 6;
+    dst2_l >>= 6;
+
+    /* row 6: combined with rows 3..5 it yields result row 3 */
+    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+    dst6 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
+
+    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
+    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+    dst3_r >>= 6;
+    dst3_l >>= 6;
+
+    LD_SB2(src, src_stride, src7, src8);
+
+    XORI_B2_128_SB(src7, src8);
+
+    /* row 7: combined with rows 4..6 it yields result row 4 */
+    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
+    dst7 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
+
+    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
+    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
+    dst4_r >>= 6;
+    dst4_l >>= 6;
+
+    /* row 8: combined with rows 5..7 it yields result row 5 */
+    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
+    dst8 = const_vec;
+    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
+
+    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
+    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
+    dst5_r >>= 6;
+    dst5_l >>= 6;
+    /* Each (_r, _l) pair is packed into one vector per result row. */
+    PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
+                dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
+    PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r, dst5_r);
+    /* Three ST_SW2 calls, two vectors each, dst advancing 2 rows in between. */
+    ST_SW2(dst0_r, dst1_r, dst, dst_stride);
+    dst += (2 * dst_stride);
+    ST_SW2(dst2_r, dst3_r, dst, dst_stride);
+    dst += (2 * dst_stride);
+    ST_SW2(dst4_r, dst5_r, dst, dst_stride);
+}
+
+static void hevc_hv_4t_8multx4mult_msa(uint8_t *src,
+                                       int32_t src_stride,
+                                       int16_t *dst,
+                                       int32_t dst_stride,
+                                       const int8_t *filter_x,
+                                       const int8_t *filter_y,
+                                       int32_t height,
+                                       int32_t width)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src_tmp;
+    int16_t *dst_tmp;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v8i16 filt0, filt1;
+    v4i32 filt_h0, filt_h1;
+    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16i8 mask1;
+    v8i16 filter_vec, const_vec;
+    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
+    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
+    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
+    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
+    /* Walks width >> 3 strips of 8 columns, height >> 2 passes per strip. */
+    src -= (src_stride + 1);    /* start one row up and one byte left */
+
+    filter_vec = LD_SH(filter_x);
+    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+    /* clti_s_b gives per-byte sign masks; ilvr_b uses them to widen filter_y. */
+    filter_vec = LD_SH(filter_y);
+    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
+    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+
+    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
+    /* mask0 pairs bytes (i, i + 1); mask1 pairs bytes (i + 2, i + 3). */
+    mask1 = mask0 + 2;
+    /* Every halfword of const_vec is 128 << 6; it seeds each row sum below. */
+    const_vec = __msa_ldi_h(128);
+    const_vec <<= 6;
+    /* One 8-column strip per iteration; src and dst move by 8 at its end. */
+    for (cnt = width >> 3; cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        LD_SB3(src_tmp, src_stride, src0, src1, src2);
+        src_tmp += (3 * src_stride);
+
+        XORI_B3_128_SB(src0, src1, src2);
+
+        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+        dst0 = const_vec;
+        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
+        dst1 = const_vec;
+        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
+        dst2 = const_vec;
+        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
+
+        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
+        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
+        /* Four result rows per pass; dst10_*, dst21_* and dst2 carry over. */
+        for (loop_cnt = height >> 2; loop_cnt--;) {
+            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
+            src_tmp += (4 * src_stride);
+            XORI_B4_128_SB(src3, src4, src5, src6);
+            /* row 3 */
+            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+            dst3 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
+
+            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
+            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+
+            dst0_r >>= 6;
+            dst0_l >>= 6;
+
+            /* row 4 */
+            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
+            dst4 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
+
+            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
+            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+            dst1_r >>= 6;
+            dst1_l >>= 6;
+
+            /* row 5: dst10_* is overwritten with pair (4, 5) for the next pass */
+            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
+            dst5 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
+
+            ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
+            dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
+            dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
+
+            dst2_r >>= 6;
+            dst2_l >>= 6;
+
+            /* row 6: dst2 and dst21_* are refreshed for the next pass */
+            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
+            dst2 = const_vec;
+            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
+
+            ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
+            dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
+            dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
+
+            dst3_r >>= 6;
+            dst3_l >>= 6;
+
+            PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
+                        dst2_l, dst2_r, dst3_l, dst3_r,
+                        dst0_r, dst1_r, dst2_r, dst3_r);
+
+            ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+            ST_SW2(dst2_r, dst3_r, dst_tmp, dst_stride);
+            dst_tmp += (2 * dst_stride);
+        }
+        /* Next strip: 8 source bytes and 8 int16_t outputs further along. */
+        src += 8;
+        dst += 8;
+    }
+}
+
+/*
+ * Width-8 hv 4-tap entry point: heights 2 and 6 have dedicated kernels,
+ * every multiple of 4 goes to the strip kernel with a single 8-wide
+ * strip. Remaining heights fall through without any work.
+ */
+static void hevc_hv_4t_8w_msa(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter_x, const int8_t *filter_y,
+                              int32_t height)
+{
+    if (height == 2)
+        hevc_hv_4t_8x2_msa(src, src_stride, dst, dst_stride,
+                           filter_x, filter_y, height);
+    else if (height == 6)
+        hevc_hv_4t_8x6_msa(src, src_stride, dst, dst_stride,
+                           filter_x, filter_y, height);
+    else if (!(height % 4))
+        hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, 8);
+}
+
+/* Width-12 hv 4-tap: columns 0..7 via the 8-wide strip kernel, the
+ * remaining columns (from offset 8) via the width-4 dispatcher. */
+static void hevc_hv_4t_12w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    uint8_t *src_right = src + 8;
+    int16_t *dst_right = dst + 8;
+
+    hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 8);
+    hevc_hv_4t_4w_msa(src_right, src_stride, dst_right, dst_stride,
+                      filter_x, filter_y, height);
+}
+
+/* Width-16 hv 4-tap: the strip kernel is asked for a width of 16. */
+static void hevc_hv_4t_16w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    const int32_t width = 16;
+
+    hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, width);
+}
+
+/* Width-24 hv 4-tap: the strip kernel is asked for a width of 24. */
+static void hevc_hv_4t_24w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    const int32_t width = 24;
+
+    hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, width);
+}
+
+/* Width-32 hv 4-tap: the strip kernel is asked for a width of 32. */
+static void hevc_hv_4t_32w_msa(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    const int32_t width = 32;
+
+    hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, width);
+}
+
+#define MC_COPY(WIDTH)                                                    \
+void ff_hevc_put_hevc_pel_pixels##WIDTH##_8_msa(int16_t *dst,             \
+                                                uint8_t *src,             \
+                                                ptrdiff_t src_stride,     \
+                                                int height,               \
+                                                intptr_t mx,              \
+                                                intptr_t my,              \
+                                                int width)                \
+{                                                                         \
+    hevc_copy_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE, height);  \
+}
+/* One pel_pixels entry point per width; mx, my and width are not used. */
+MC_COPY(4);
+MC_COPY(6);
+MC_COPY(8);
+MC_COPY(12);
+MC_COPY(16);
+MC_COPY(24);
+MC_COPY(32);
+MC_COPY(48);
+MC_COPY(64);
+
+#undef MC_COPY
+
+#define MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                            \
+void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_msa(int16_t *dst,            \
+                                                     uint8_t *src,          \
+                                                     ptrdiff_t src_stride,  \
+                                                     int height,            \
+                                                     intptr_t mx,           \
+                                                     intptr_t my,           \
+                                                     int width)             \
+{                                                                           \
+    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];           \
+                                                                            \
+    hevc_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,             \
+                                          MAX_PB_SIZE, filter, height);     \
+}
+/* FILT_DIR names the argument (mx or my) whose value - 1 picks the filter. */
+MC(qpel, h, 4, 8, hz, mx);
+MC(qpel, h, 8, 8, hz, mx);
+MC(qpel, h, 12, 8, hz, mx);
+MC(qpel, h, 16, 8, hz, mx);
+MC(qpel, h, 24, 8, hz, mx);
+MC(qpel, h, 32, 8, hz, mx);
+MC(qpel, h, 48, 8, hz, mx);
+MC(qpel, h, 64, 8, hz, mx);
+/* Same qpel widths for the v direction: 8t "vt" kernels indexed by my. */
+MC(qpel, v, 4, 8, vt, my);
+MC(qpel, v, 8, 8, vt, my);
+MC(qpel, v, 12, 8, vt, my);
+MC(qpel, v, 16, 8, vt, my);
+MC(qpel, v, 24, 8, vt, my);
+MC(qpel, v, 32, 8, vt, my);
+MC(qpel, v, 48, 8, vt, my);
+MC(qpel, v, 64, 8, vt, my);
+/* epel entry points map to the 4t kernels; widths are 4..32 including 6. */
+MC(epel, h, 4, 4, hz, mx);
+MC(epel, h, 6, 4, hz, mx);
+MC(epel, h, 8, 4, hz, mx);
+MC(epel, h, 12, 4, hz, mx);
+MC(epel, h, 16, 4, hz, mx);
+MC(epel, h, 24, 4, hz, mx);
+MC(epel, h, 32, 4, hz, mx);
+/* Same epel widths for the v direction: 4t "vt" kernels indexed by my. */
+MC(epel, v, 4, 4, vt, my);
+MC(epel, v, 6, 4, vt, my);
+MC(epel, v, 8, 4, vt, my);
+MC(epel, v, 12, 4, vt, my);
+MC(epel, v, 16, 4, vt, my);
+MC(epel, v, 24, 4, vt, my);
+MC(epel, v, 32, 4, vt, my);
+
+#undef MC
+
+#define MC_HV(PEL, DIR, WIDTH, TAP, DIR1)                                     \
+void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_msa(int16_t *dst,              \
+                                                     uint8_t *src,            \
+                                                     ptrdiff_t src_stride,    \
+                                                     int height,              \
+                                                     intptr_t mx,             \
+                                                     intptr_t my,             \
+                                                     int width)               \
+{                                                                             \
+    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];                 \
+    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];                 \
+                                                                              \
+    hevc_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE,  \
+                                          filter_x, filter_y, height);        \
+}
+/* hv entry points: filter_x is picked by mx - 1 and filter_y by my - 1. */
+MC_HV(qpel, hv, 4, 8, hv);
+MC_HV(qpel, hv, 8, 8, hv);
+MC_HV(qpel, hv, 12, 8, hv);
+MC_HV(qpel, hv, 16, 8, hv);
+MC_HV(qpel, hv, 24, 8, hv);
+MC_HV(qpel, hv, 32, 8, hv);
+MC_HV(qpel, hv, 48, 8, hv);
+MC_HV(qpel, hv, 64, 8, hv);
+/* epel hv entry points map to the hevc_hv_4t_<W>w_msa kernels, W = 4..32. */
+MC_HV(epel, hv, 4, 4, hv);
+MC_HV(epel, hv, 6, 4, hv);
+MC_HV(epel, hv, 8, 4, hv);
+MC_HV(epel, hv, 12, 4, hv);
+MC_HV(epel, hv, 16, 4, hv);
+MC_HV(epel, hv, 24, 4, hv);
+MC_HV(epel, hv, 32, 4, hv);
+
+#undef MC_HV
diff --git a/libavcodec/mips/hevcpred_init_mips.c b/libavcodec/mips/hevcpred_init_mips.c
new file mode 100644
index 0000000..331cfac
--- /dev/null
+++ b/libavcodec/mips/hevcpred_init_mips.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/hevc.h"
+#include "libavcodec/mips/hevcpred_mips.h"
+
+#if HAVE_MSA
+static av_cold void hevc_pred_init_msa(HEVCPredContext *c, const int bit_depth)
+{
+    if (bit_depth != 8)
+        return;  /* nothing is installed unless bit_depth is 8 */
+    c->pred_dc         = ff_hevc_intra_pred_dc_msa;
+    c->pred_planar[0]  = ff_hevc_intra_pred_planar_0_msa;
+    c->pred_planar[1]  = ff_hevc_intra_pred_planar_1_msa;
+    c->pred_planar[2]  = ff_hevc_intra_pred_planar_2_msa;
+    c->pred_planar[3]  = ff_hevc_intra_pred_planar_3_msa;
+    c->pred_angular[0] = ff_pred_intra_pred_angular_0_msa;
+    c->pred_angular[1] = ff_pred_intra_pred_angular_1_msa;
+    c->pred_angular[2] = ff_pred_intra_pred_angular_2_msa;
+    c->pred_angular[3] = ff_pred_intra_pred_angular_3_msa;
+    c->intra_pred[2]   = ff_intra_pred_8_16x16_msa;
+    c->intra_pred[3]   = ff_intra_pred_8_32x32_msa;
+}
+#endif  // #if HAVE_MSA
+
+void ff_hevc_pred_init_mips(HEVCPredContext *c, const int bit_depth)
+{   /* MIPS entry point: with HAVE_MSA unset the body is empty. */
+#if HAVE_MSA
+    hevc_pred_init_msa(c, bit_depth);  /* installs pointers only for 8 bit */
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/hevcpred_mips.h b/libavcodec/mips/hevcpred_mips.h
new file mode 100644
index 0000000..12f57a2
--- /dev/null
+++ b/libavcodec/mips/hevcpred_mips.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_HEVCPRED_MIPS_H
+#define AVCODEC_MIPS_HEVCPRED_MIPS_H
+
+#include "libavcodec/hevcdsp.h"
+/* Planar predictors; installed as pred_planar[0..3] by the MIPS init code. */
+void ff_hevc_intra_pred_planar_0_msa(uint8_t *dst,
+                                     const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     ptrdiff_t stride);
+
+void ff_hevc_intra_pred_planar_1_msa(uint8_t *dst,
+                                     const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     ptrdiff_t stride);
+
+void ff_hevc_intra_pred_planar_2_msa(uint8_t *dst,
+                                     const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     ptrdiff_t stride);
+
+void ff_hevc_intra_pred_planar_3_msa(uint8_t *dst,
+                                     const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     ptrdiff_t stride);
+/* DC predictor; installed as pred_dc (log2: presumably log2 block size). */
+void ff_hevc_intra_pred_dc_msa(uint8_t *dst, const uint8_t *src_top,
+                               const uint8_t *src_left,
+                               ptrdiff_t stride, int log2, int c_idx);
+/* Angular predictors; installed as pred_angular[0..3]. */
+void ff_pred_intra_pred_angular_0_msa(uint8_t *dst,
+                                      const uint8_t *src_top,
+                                      const uint8_t *src_left,
+                                      ptrdiff_t stride, int c_idx, int mode);
+
+void ff_pred_intra_pred_angular_1_msa(uint8_t *dst,
+                                      const uint8_t *src_top,
+                                      const uint8_t *src_left,
+                                      ptrdiff_t stride, int c_idx, int mode);
+
+void ff_pred_intra_pred_angular_2_msa(uint8_t *dst,
+                                      const uint8_t *src_top,
+                                      const uint8_t *src_left,
+                                      ptrdiff_t stride, int c_idx, int mode);
+
+void ff_pred_intra_pred_angular_3_msa(uint8_t *dst,
+                                      const uint8_t *src_top,
+                                      const uint8_t *src_left,
+                                      ptrdiff_t stride, int c_idx, int mode);
+/* Context-level entry points; installed as intra_pred[2] and intra_pred[3]. */
+void ff_intra_pred_8_16x16_msa(HEVCContext *s, int x0, int y0, int c_idx);
+void ff_intra_pred_8_32x32_msa(HEVCContext *s, int x0, int y0, int c_idx);
+
+#endif  // #ifndef AVCODEC_MIPS_HEVCPRED_MIPS_H
diff --git a/libavcodec/mips/hevcpred_msa.c b/libavcodec/mips/hevcpred_msa.c
new file mode 100644
index 0000000..6a3b281
--- /dev/null
+++ b/libavcodec/mips/hevcpred_msa.c
@@ -0,0 +1,3084 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/hevc.h"
+#include "libavutil/mips/generic_macros_msa.h"
+#include "hevcpred_mips.h"
+
+static const int8_t intra_pred_angle_up[17] = {  /* presumably modes 18..34 */
+    -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
+};
+
+static const int8_t intra_pred_angle_low[16] = {  /* presumably modes 2..17 */
+    32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26
+};
+
+#define HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,          \
+                              mul_val_h0, mul_val_h1, mul_val_h2, mul_val_h3,  \
+                              res0, res1, mul_val_b0, mul_val_b1, round)       \
+{                                                                              \
+    v8i16 res0_m, res1_m, res2_m, res3_m;                                      \
+    /* Two output rows: res0_m/res1_m from vec0, res2_m/res3_m from vec1. */   \
+    MUL4(mul_val_h0, vec0, mul_val_h2, vec0, mul_val_h0, vec1,                 \
+         mul_val_h2, vec1, res0_m, res1_m, res2_m, res3_m);                    \
+    /* Add the mul_val_h1 / mul_val_h3 weighted tmp0 term to both rows. */     \
+    res0_m += mul_val_h1 * tmp0;                                               \
+    res1_m += mul_val_h3 * tmp0;                                               \
+    res2_m += mul_val_h1 * tmp0;                                               \
+    res3_m += mul_val_h3 * tmp0;                                               \
+    /* Second row weights the src0 halves with (mul_val_b0 - 1). */            \
+    res0_m += mul_val_b0 * src0_r;                                             \
+    res1_m += mul_val_b0 * src0_l;                                             \
+    res2_m += (mul_val_b0 - 1) * src0_r;                                       \
+    res3_m += (mul_val_b0 - 1) * src0_l;                                       \
+    /* ... and weights tmp1 with (mul_val_b1 + 1). */                          \
+    res0_m += mul_val_b1 * tmp1;                                               \
+    res1_m += mul_val_b1 * tmp1;                                               \
+    res2_m += (mul_val_b1 + 1) * tmp1;                                         \
+    res3_m += (mul_val_b1 + 1) * tmp1;                                         \
+    /* Rounding shift by round, then pack the even bytes of each row. */       \
+    SRARI_H4_SH(res0_m, res1_m, res2_m, res3_m, round);                        \
+    PCKEV_B2_SH(res1_m, res0_m, res3_m, res2_m, res0, res1);                   \
+}
+
+static void hevc_intra_pred_vert_4x4_msa(const uint8_t *src_top,
+                                         const uint8_t *src_left,
+                                         uint8_t *dst, int32_t stride,
+                                         int32_t flag)
+{
+    uint32_t col;
+    uint32_t src_data;
+    v8i16 vec0, vec1, vec2;
+    v16i8 zero = { 0 };
+    /* Vertical prediction: replicate the 4 top bytes into 4 rows. */
+    src_data = LW(src_top);
+    SW4(src_data, src_data, src_data, src_data, dst, stride);
+    /* flag == 0: recompute column 0 from the left neighbours. */
+    if (0 == flag) {
+        src_data = LW(src_left);
+        /* Insert into a zeroed vector so no lane is read uninitialized. */
+        vec2 = (v8i16) __msa_insert_w((v4i32) zero, 0, src_data);
+
+        vec0 = __msa_fill_h(src_left[-1]);
+        vec1 = __msa_fill_h(src_top[0]);
+        /* col0[y] = clip(top[0] + ((left[y] - left[-1]) >> 1)) */
+        vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
+        vec2 -= vec0;
+        vec2 >>= 1;
+        vec2 += vec1;
+        vec2 = CLIP_SH_0_255(vec2);
+
+        for (col = 0; col < 4; col++) {
+            dst[stride * col] = (uint8_t) vec2[col];
+        }
+    }
+}
+
+static void hevc_intra_pred_vert_8x8_msa(const uint8_t *src_top,
+                                         const uint8_t *src_left,
+                                         uint8_t *dst, int32_t stride,
+                                         int32_t flag)
+{
+    uint8_t *tmp_dst = dst;
+    uint32_t row;
+    uint16_t val0, val1, val2, val3;
+    uint64_t src_data1;
+    v8i16 vec0, vec1, vec2;
+    v16i8 zero = { 0 };
+    /* Vertical prediction: copy the 8 top bytes into each of 8 rows. */
+    src_data1 = LD(src_top);
+
+    for (row = 8; row--;) {
+        SD(src_data1, tmp_dst);
+        tmp_dst += stride;
+    }
+    /* flag == 0: recompute column 0 from the left neighbours. */
+    if (0 == flag) {
+        src_data1 = LD(src_left);
+
+        vec2 = (v8i16) __msa_insert_d((v2i64) zero, 0, src_data1);
+
+        vec0 = __msa_fill_h(src_left[-1]);
+        vec1 = __msa_fill_h(src_top[0]);
+        /* col0[y] = clip(top[0] + ((left[y] - left[-1]) >> 1)) */
+        vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
+        vec2 -= vec0;
+        vec2 >>= 1;
+        vec2 += vec1;
+        vec2 = CLIP_SH_0_255(vec2);
+        /* Scatter the 8 lanes down column 0, four at a time. */
+        val0 = vec2[0];
+        val1 = vec2[1];
+        val2 = vec2[2];
+        val3 = vec2[3];
+
+        dst[0] = val0;
+        dst[stride] = val1;
+        dst[2 * stride] = val2;
+        dst[3 * stride] = val3;
+
+        val0 = vec2[4];
+        val1 = vec2[5];
+        val2 = vec2[6];
+        val3 = vec2[7];
+
+        dst[4 * stride] = val0;
+        dst[5 * stride] = val1;
+        dst[6 * stride] = val2;
+        dst[7 * stride] = val3;
+    }
+}
+
+static void hevc_intra_pred_vert_16x16_msa(const uint8_t *src_top,
+                                           const uint8_t *src_left,
+                                           uint8_t *dst, int32_t stride,
+                                           int32_t flag)
+{
+    int32_t col;
+    uint8_t *tmp_dst = dst;
+    uint32_t row;
+    v16u8 src;
+    v8i16 vec0, vec1, vec2, vec3;
+    /* Vertical prediction: copy the 16 top bytes into each of 16 rows. */
+    src = LD_UB(src_top);
+
+    for (row = 16; row--;) {
+        ST_UB(src, tmp_dst);
+        tmp_dst += stride;
+    }
+    /* flag == 0: recompute column 0 from the left neighbours. */
+    if (0 == flag) {
+        src = LD_UB(src_left);
+
+        vec0 = __msa_fill_h(src_left[-1]);
+        vec1 = __msa_fill_h(src_top[0]);
+        /* col0[y] = clip(top[0] + ((left[y] - left[-1]) >> 1)) */
+        UNPCK_UB_SH(src, vec2, vec3);
+        SUB2(vec2, vec0, vec3, vec0, vec2, vec3);
+
+        vec2 >>= 1;
+        vec3 >>= 1;
+
+        ADD2(vec2, vec1, vec3, vec1, vec2, vec3);
+        CLIP_SH2_0_255(vec2, vec3);
+        /* Narrow back to bytes, then scatter one byte per row. */
+        src = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);
+
+        for (col = 0; col < 16; col++) {
+            dst[stride * col] = src[col];
+        }
+    }
+}
+
+static void hevc_intra_pred_horiz_4x4_msa(const uint8_t *src_top,
+                                          const uint8_t *src_left,
+                                          uint8_t *dst, int32_t stride,
+                                          int32_t flag)
+{
+    uint32_t val0, val1, val2, val3;
+    v16i8 src0;
+    v8i16 src0_r, src_top_val, src_left_val;
+    v16i8 zero = { 0 };
+    /* Row y = left[y] in all 4 bytes; U suffix avoids signed overflow. */
+    val0 = src_left[0] * 0x01010101U;
+    val1 = src_left[1] * 0x01010101U;
+    val2 = src_left[2] * 0x01010101U;
+    val3 = src_left[3] * 0x01010101U;
+    SW4(val0, val1, val2, val3, dst, stride);
+    /* flag == 0: recompute row 0 from the top neighbours. */
+    if (0 == flag) {
+        val0 = LW(src_top);
+        src0 = (v16i8) __msa_insert_w((v4i32) zero, 0, val0);  /* zeroed base */
+        src_top_val = __msa_fill_h(src_top[-1]);
+        src_left_val = __msa_fill_h(src_left[0]);
+        /* row0[x] = clip(left[0] + ((top[x] - top[-1]) >> 1)) */
+        src0_r = (v8i16) __msa_ilvr_b(zero, src0);
+
+        src0_r -= src_top_val;
+        src0_r >>= 1;
+        src0_r += src_left_val;
+        src0_r = CLIP_SH_0_255(src0_r);
+        src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
+        val0 = __msa_copy_s_w((v4i32) src0, 0);
+        SW(val0, dst);
+    }
+}
+
+static void hevc_intra_pred_horiz_8x8_msa(const uint8_t *src_top,
+                                          const uint8_t *src_left,
+                                          uint8_t *dst, int32_t stride,
+                                          int32_t flag)
+{
+    uint64_t val0, val1, val2, val3;
+    v16i8 src0;
+    v8i16 src0_r, src_top_val, src_left_val;
+    v16i8 zero = { 0 };
+    /* Row y = left[y] in all 8 bytes; ULL suffix avoids signed overflow. */
+    val0 = src_left[0] * 0x0101010101010101ULL;
+    val1 = src_left[1] * 0x0101010101010101ULL;
+    val2 = src_left[2] * 0x0101010101010101ULL;
+    val3 = src_left[3] * 0x0101010101010101ULL;
+    SD4(val0, val1, val2, val3, dst, stride);
+
+    val0 = src_left[4] * 0x0101010101010101ULL;
+    val1 = src_left[5] * 0x0101010101010101ULL;
+    val2 = src_left[6] * 0x0101010101010101ULL;
+    val3 = src_left[7] * 0x0101010101010101ULL;
+    SD4(val0, val1, val2, val3, dst + 4 * stride, stride);
+    /* flag == 0: recompute row 0 from the top neighbours. */
+    if (0 == flag) {
+        val0 = LD(src_top);
+        src0 = (v16i8) __msa_insert_d((v2i64) zero, 0, val0);  /* zeroed base */
+        src_top_val = __msa_fill_h(src_top[-1]);
+        src_left_val = __msa_fill_h(src_left[0]);
+        /* row0[x] = clip(left[0] + ((top[x] - top[-1]) >> 1)) */
+        src0_r = (v8i16) __msa_ilvr_b(zero, src0);
+
+        src0_r -= src_top_val;
+        src0_r >>= 1;
+        src0_r += src_left_val;
+        src0_r = CLIP_SH_0_255(src0_r);
+        src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
+        val0 = __msa_copy_s_d((v2i64) src0, 0);
+        SD(val0, dst);
+    }
+}
+
+static void hevc_intra_pred_horiz_16x16_msa(const uint8_t *src_top,
+                                            const uint8_t *src_left,
+                                            uint8_t *dst, int32_t stride,
+                                            int32_t flag)
+{
+    uint8_t *tmp_dst = dst;
+    uint32_t row;
+    uint8_t inp0, inp1, inp2, inp3;
+    v16i8 src0, src1, src2, src3;
+    v8i16 src0_r, src0_l, src_left_val, src_top_val;
+    /* Captured here because the loop below advances src_left. */
+    src_left_val = __msa_fill_h(src_left[0]);
+    /* Horizontal prediction: fill row y with left[y], 4 rows per pass. */
+    for (row = 4; row--;) {
+        inp0 = src_left[0];
+        inp1 = src_left[1];
+        inp2 = src_left[2];
+        inp3 = src_left[3];
+        src_left += 4;
+
+        src0 = __msa_fill_b(inp0);
+        src1 = __msa_fill_b(inp1);
+        src2 = __msa_fill_b(inp2);
+        src3 = __msa_fill_b(inp3);
+
+        ST_SB4(src0, src1, src2, src3, tmp_dst, stride);
+        tmp_dst += (4 * stride);
+    }
+    /* flag == 0: recompute row 0 from the top neighbours. */
+    if (0 == flag) {
+        src0 = LD_SB(src_top);
+        src_top_val = __msa_fill_h(src_top[-1]);
+        /* row0[x] = clip(left[0] + ((top[x] - top[-1]) >> 1)) */
+        UNPCK_UB_SH(src0, src0_r, src0_l);
+        SUB2(src0_r, src_top_val, src0_l, src_top_val, src0_r, src0_l);
+
+        src0_r >>= 1;
+        src0_l >>= 1;
+
+        ADD2(src0_r, src_left_val, src0_l, src_left_val, src0_r, src0_l);
+        CLIP_SH2_0_255(src0_r, src0_l);
+        src0 = __msa_pckev_b((v16i8) src0_l, (v16i8) src0_r);
+        ST_SB(src0, dst);
+    }
+}
+
+static void hevc_intra_pred_horiz_32x32_msa(const uint8_t *src_top,
+                                            const uint8_t *src_left,
+                                            uint8_t *dst, int32_t stride)
+{
+    uint32_t row;
+    uint8_t inp0, inp1, inp2, inp3;
+    v16i8 src0, src1, src2, src3;
+    /* Fill row y with left[y] for 32 rows; src_top is not used. */
+    for (row = 0; row < 8; row++) {
+        inp0 = src_left[row * 4];
+        inp1 = src_left[row * 4 + 1];
+        inp2 = src_left[row * 4 + 2];
+        inp3 = src_left[row * 4 + 3];
+
+        src0 = __msa_fill_b(inp0);
+        src1 = __msa_fill_b(inp1);
+        src2 = __msa_fill_b(inp2);
+        src3 = __msa_fill_b(inp3);
+        /* ST_SB2 presumably stores at dst and dst + 16 -- confirm. */
+        ST_SB2(src0, src0, dst, 16);
+        dst += stride;
+        ST_SB2(src1, src1, dst, 16);
+        dst += stride;
+        ST_SB2(src2, src2, dst, 16);
+        dst += stride;
+        ST_SB2(src3, src3, dst, 16);
+        dst += stride;
+    }
+}
+
+static void hevc_intra_pred_dc_4x4_msa(const uint8_t *src_top,
+                                       const uint8_t *src_left,
+                                       uint8_t *dst, int32_t stride,
+                                       int32_t flag)
+{
+    uint8_t *tmp_dst = dst;
+    uint32_t addition = 0;
+    uint32_t val0, val1, val2;
+    v16i8 src = { 0 };
+    v16u8 store;
+    v16i8 zero = { 0 };
+    v8u16 sum, vec0, vec1;
+    /* DC = (4 top + 4 left samples + 4) >> 3, stored to every row. */
+    val0 = LW(src_top);
+    val1 = LW(src_left);
+    INSERT_W2_SB(val0, val1, src);
+    sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
+    sum = (v8u16) __msa_hadd_u_w(sum, sum);
+    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
+    sum = (v8u16) __msa_srari_w((v4i32) sum, 3);
+    addition = __msa_copy_u_w((v4i32) sum, 0);
+    store = (v16u8) __msa_fill_b(addition);
+    val0 = __msa_copy_u_w((v4i32) store, 0);
+    SW4(val0, val0, val0, val0, dst, stride);
+    /* flag == 0: smooth row 0 and column 0 against the neighbours. */
+    if (0 == flag) {
+        ILVR_B2_UH(zero, store, zero, src, vec0, vec1);
+        /* vec1 = neighbour + 3 * DC, rounded and shifted right by 2 below. */
+        vec1 += vec0;
+        vec0 += vec0;
+        vec1 += vec0;
+
+        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
+        store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
+        val1 = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
+        store = (v16u8) __msa_insert_b((v16i8) store, 0, val1);
+        val0 = __msa_copy_u_w((v4i32) store, 0);
+        SW(val0, tmp_dst);
+        /* Column 0, rows 1..3: (left[y] + 3 * DC + 2) >> 2 */
+        val0 = src_left[1];
+        val1 = src_left[2];
+        val2 = src_left[3];
+
+        addition *= 3;
+
+        ADD2(val0, addition, val1, addition, val0, val1);
+        val2 += addition;
+
+        val0 += 2;
+        val1 += 2;
+        val2 += 2;
+        val0 >>= 2;
+        val1 >>= 2;
+        val2 >>= 2;
+
+        tmp_dst[stride * 1] = val0;
+        tmp_dst[stride * 2] = val1;
+        tmp_dst[stride * 3] = val2;
+    }
+}
+
+static void hevc_intra_pred_dc_8x8_msa(const uint8_t *src_top,
+                                       const uint8_t *src_left,
+                                       uint8_t *dst, int32_t stride,
+                                       int32_t flag)
+{
+    uint8_t *tmp_dst = dst;
+    uint32_t row, col, val;
+    uint32_t addition = 0;
+    uint64_t val0, val1;
+    v16u8 src = { 0 };
+    v16u8 store;
+    v8u16 sum, vec0, vec1;
+    v16i8 zero = { 0 };
+    /* DC = (8 top + 8 left samples + 8) >> 4, stored to every row. */
+    val0 = LD(src_top);
+    val1 = LD(src_left);
+    INSERT_D2_UB(val0, val1, src);
+    sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
+    sum = (v8u16) __msa_hadd_u_w(sum, sum);
+    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
+    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
+    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
+    sum = (v8u16) __msa_srari_w((v4i32) sum, 4);
+    addition = __msa_copy_u_w((v4i32) sum, 0);
+    store = (v16u8) __msa_fill_b(addition);
+    val0 = __msa_copy_u_d((v2i64) store, 0);
+
+    for (row = 8; row--;) {
+        SD(val0, dst);
+        dst += stride;
+    }
+    /* flag == 0: smooth row 0 and column 0 against the neighbours. */
+    if (0 == flag) {
+        ILVR_B2_UH(zero, store, zero, src, vec0, vec1);
+        /* Row 0: (top[x] + 3 * DC + 2) >> 2; the corner is patched below. */
+        vec1 += vec0;
+        vec0 += vec0;
+        vec1 += vec0;
+        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
+        store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
+        val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
+        store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
+        val0 = __msa_copy_u_d((v2i64) store, 0);
+        SD(val0, tmp_dst);
+        /* Column 0, rows 1..7: (left[y] + 3 * DC + 2) >> 2 */
+        val0 = LD(src_left);
+        src = (v16u8) __msa_insert_d((v2i64) src, 0, val0);
+        vec1 = (v8u16) __msa_ilvr_b(zero, (v16i8) src);
+        vec0 = (v8u16) __msa_fill_h(addition);
+        vec0 *= 3;
+        vec1 += vec0;
+        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
+
+        for (col = 1; col < 8; col++) {
+            tmp_dst[stride * col] = vec1[col];
+        }
+    }
+}
+
+static void hevc_intra_pred_dc_16x16_msa(const uint8_t *src_top,
+                                         const uint8_t *src_left,
+                                         uint8_t *dst, int32_t stride,
+                                         int32_t flag)
+{
+    uint8_t *tmp_dst = dst;
+    uint32_t row, col, val;
+    uint32_t addition = 0;
+    v16u8 src_above1, store, src_left1;
+    v8u16 sum, sum_above, sum_left;
+    v8u16 vec0, vec1, vec2;
+    v16i8 zero = { 0 };
+    /* DC = (16 top + 16 left samples + 16) >> 5, stored to every row. */
+    src_above1 = LD_UB(src_top);
+    src_left1 = LD_UB(src_left);
+
+    HADD_UB2_UH(src_above1, src_left1, sum_above, sum_left);
+    sum = sum_above + sum_left;
+    sum = (v8u16) __msa_hadd_u_w(sum, sum);
+    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
+    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
+    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
+    sum = (v8u16) __msa_srari_w((v4i32) sum, 5);
+    addition = __msa_copy_u_w((v4i32) sum, 0);
+    store = (v16u8) __msa_fill_b(addition);
+
+    for (row = 16; row--;) {
+        ST_UB(store, dst);
+        dst += stride;
+    }
+    /* flag == 0: smooth row 0 and column 0 against the neighbours. */
+    if (0 == flag) {
+        vec0 = (v8u16) __msa_ilvr_b(zero, (v16i8) store);
+        ILVRL_B2_UH(zero, src_above1, vec1, vec2);
+        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
+        vec0 += vec0;
+        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
+        SRARI_H2_UH(vec1, vec2, 2);
+        store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
+        val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
+        store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
+        ST_UB(store, tmp_dst);
+        /* Column 0, rows 1..15: (left[y] + 3 * DC + 2) >> 2 */
+        ILVRL_B2_UH(zero, src_left1, vec1, vec2);
+        vec0 = (v8u16) __msa_fill_h(addition);
+        vec0 *= 3;
+        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
+        SRARI_H2_UH(vec1, vec2, 2);
+        store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
+
+        for (col = 1; col < 16; col++) {
+            tmp_dst[stride * col] = store[col];
+        }
+    }
+}
+
+static void hevc_intra_pred_dc_32x32_msa(const uint8_t *src_top,
+                                         const uint8_t *src_left,
+                                         uint8_t *dst, int32_t stride)
+{
+    uint32_t row;
+    v16u8 src_above1, src_above2, store, src_left1, src_left2;
+    v8u16 sum_above1, sum_above2;
+    v8u16 sum_left1, sum_left2;
+    v8u16 sum, sum_above, sum_left;
+    /* DC = (32 top + 32 left samples + 32) >> 6; no edge smoothing here. */
+    LD_UB2(src_top, 16, src_above1, src_above2);
+    LD_UB2(src_left, 16, src_left1, src_left2);
+    HADD_UB2_UH(src_above1, src_above2, sum_above1, sum_above2);
+    HADD_UB2_UH(src_left1, src_left2, sum_left1, sum_left2);
+    sum_above = sum_above1 + sum_above2;
+    sum_left = sum_left1 + sum_left2;
+    sum = sum_above + sum_left;
+    sum = (v8u16) __msa_hadd_u_w(sum, sum);
+    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
+    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
+    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
+    sum = (v8u16) __msa_srari_w((v4i32) sum, 6);
+    store = (v16u8) __msa_splati_b((v16i8) sum, 0);
+    /* 16 iterations, two rows each; ST_UB2 presumably hits dst, dst + 16. */
+    for (row = 16; row--;) {
+        ST_UB2(store, store, dst, 16);
+        dst += stride;
+        ST_UB2(store, store, dst, 16);
+        dst += stride;
+    }
+}
+
+static void hevc_intra_pred_plane_4x4_msa(const uint8_t *src_top,
+                                          const uint8_t *src_left,
+                                          uint8_t *dst, int32_t stride)
+{
+    uint32_t src0, src1;
+    v16i8 src_vec0, src_vec1;
+    v8i16 src_vec0_r, src1_r, tmp0, tmp1, mul_val1;
+    v8i16 vec0, vec1, vec2, vec3, res0, res1, res2, res3;
+    v8i16 mul_val0 = { 3, 2, 1, 0, 1, 2, 3, 4 };
+    v16i8 zero = { 0 };
+    /* Planar prediction for a 4x4 block; x = column, y = row below. */
+    src0 = LW(src_top);
+    src1 = LW(src_left);
+    /* mul_val1 = high half of mul_val0 in both halves: { 1, 2, 3, 4, ... } */
+    mul_val1 = (v8i16) __msa_pckod_d((v2i64) mul_val0, (v2i64) mul_val0);
+
+    src_vec0 = (v16i8) __msa_insert_w((v4i32) zero, 0, src0);
+    src_vec1 = (v16i8) __msa_insert_w((v4i32) zero, 0, src1);
+
+    ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src1_r);
+    SPLATI_H4_SH(src1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
+
+    tmp0 = __msa_fill_h(src_top[4]);
+    tmp1 = __msa_fill_h(src_left[4]);
+    /* res_y = (3-x) * src_left[y]; only the low 4 lanes are kept later. */
+    MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
+         res0, res1, res2, res3);
+    /* + (x+1) * src_top[4] */
+    res0 += mul_val1 * tmp0;
+    res1 += mul_val1 * tmp0;
+    res2 += mul_val1 * tmp0;
+    res3 += mul_val1 * tmp0;
+    /* + (3-y) * src_top[x] + (y+1) * src_left[4] */
+    res0 += 3 * src_vec0_r;
+    res1 += 2 * src_vec0_r;
+    res2 += src_vec0_r;
+    res0 += tmp1;
+    res1 += 2 * tmp1;
+    res2 += 3 * tmp1;
+    res3 += 4 * tmp1;
+    /* Pack rows pairwise, rounding shift by 3, narrow to bytes, store. */
+    PCKEV_D2_SH(res1, res0, res3, res2, res0, res1);
+    SRARI_H2_SH(res0, res1, 3);
+    src_vec0 = __msa_pckev_b((v16i8) res1, (v16i8) res0);
+    ST4x4_UB(src_vec0, src_vec0, 0, 1, 2, 3, dst, stride);
+}
+
+static void hevc_intra_pred_plane_8x8_msa(const uint8_t *src_top,
+                                          const uint8_t *src_left,
+                                          uint8_t *dst, int32_t stride)
+{
+    uint64_t src0, src1;
+    v16i8 src_vec0, src_vec1, src_vec2, src_vec3;
+    v8i16 src_vec0_r, src_vec1_r;
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v8i16 tmp0, tmp1, tmp2;
+    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
+    v8i16 mul_val0 = { 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16i8 zero = { 0 };
+    /* Planar prediction for an 8x8 block, one v8i16 vector per row. */
+    src0 = LD(src_top);
+    src1 = LD(src_left);
+
+    src_vec0 = (v16i8) __msa_insert_d((v2i64) zero, 0, src0);
+    src_vec1 = (v16i8) __msa_insert_d((v2i64) zero, 0, src1);
+
+    ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src_vec1_r);
+    SPLATI_H4_SH(src_vec1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
+    SPLATI_H4_SH(src_vec1_r, 4, 5, 6, 7, vec4, vec5, vec6, vec7);
+
+    tmp0 = __msa_fill_h(src_top[8]);
+    tmp1 = __msa_fill_h(src_left[8]);
+    /* res_y = (7-x) * src_left[y] */
+    MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
+         res0, res1, res2, res3);
+    MUL4(mul_val0, vec4, mul_val0, vec5, mul_val0, vec6, mul_val0, vec7,
+         res4, res5, res6, res7);
+    /* + (x+1) * src_top[8] */
+    tmp2 = mul_val1 * tmp0;
+    res0 += tmp2;
+    res1 += tmp2;
+    res2 += tmp2;
+    res3 += tmp2;
+    res4 += tmp2;
+    res5 += tmp2;
+    res6 += tmp2;
+    res7 += tmp2;
+    /* + (7-y) * src_top[x]; the weight is 0 for the last row (res7). */
+    res0 += 7 * src_vec0_r;
+    res1 += 6 * src_vec0_r;
+    res2 += 5 * src_vec0_r;
+    res3 += 4 * src_vec0_r;
+    res4 += 3 * src_vec0_r;
+    res5 += 2 * src_vec0_r;
+    res6 += src_vec0_r;
+    /* + (y+1) * src_left[8] */
+    res0 += tmp1;
+    res1 += 2 * tmp1;
+    res2 += 3 * tmp1;
+    res3 += 4 * tmp1;
+    res4 += 5 * tmp1;
+    res5 += 6 * tmp1;
+    res6 += 7 * tmp1;
+    res7 += 8 * tmp1;
+    /* Rounding shift by 4, narrow to bytes, store the 8 rows. */
+    SRARI_H4_SH(res0, res1, res2, res3, 4);
+    SRARI_H4_SH(res4, res5, res6, res7, 4);
+    PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
+                src_vec0, src_vec1, src_vec2, src_vec3);
+
+    ST8x8_UB(src_vec0, src_vec1, src_vec2, src_vec3, dst, stride);
+}
+
+static void hevc_intra_pred_plane_16x16_msa(const uint8_t *src_top,
+                                            const uint8_t *src_left,
+                                            uint8_t *dst, int32_t stride)
+{
+    v16u8 src0, src1;
+    v8i16 src0_r, src1_r, src0_l, src1_l;
+    v8i16 vec0, vec1;
+    v8i16 res0, res1, tmp0, tmp1;
+    v8i16 mul_val2, mul_val3;
+    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
+    v8i16 mul_val0 = { 15, 14, 13, 12, 11, 10, 9, 8 };
+    /* Planar prediction for a 16x16 block, two rows per macro call. */
+    src0 = LD_UB(src_top);
+    src1 = LD_UB(src_left);
+
+    UNPCK_UB_SH(src0, src0_r, src0_l);
+    UNPCK_UB_SH(src1, src1_r, src1_l);
+    /* Weights for the second 8 columns: { 7..0 } and { 9..16 }. */
+    mul_val2 = mul_val0 - 8;
+    mul_val3 = mul_val1 + 8;
+
+    tmp0 = __msa_fill_h(src_top[16]);
+    tmp1 = __msa_fill_h(src_left[16]);
+    /* Per pair: splat left[y], left[y+1]; the literals are 15 - y, y + 1. */
+    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 15, 1, 5);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 13, 3, 5);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 11, 5, 5);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 9, 7, 5);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+    /* Rows 8..15 take their left samples from the second half, src1_l. */
+    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 7, 9, 5);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 5, 11, 5);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 3, 13, 5);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 1, 15, 5);
+    ST_SH2(res0, res1, dst, stride);
+}
+
+static void process_intra_upper_16x16_msa(const uint8_t *src_top,
+                                          const uint8_t *src_left,
+                                          uint8_t *dst, int32_t stride,
+                                          uint8_t offset)
+{
+    v16i8 src0, src1;
+    v8i16 src0_r, src1_r, src0_l, src1_l;
+    v8i16 vec0, vec1, res0, res1;
+    v8i16 tmp0, tmp1;
+    v8i16 mul_val2, mul_val3;
+    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
+    v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };
+    /* Planar-predicts a 16x16 tile within rows 0..15 of a 32x32 block. */
+    tmp0 = __msa_fill_h(src_top[32 - offset]);  /* broadcast to all lanes */
+    tmp1 = __msa_fill_h(src_left[32]);
+    /* Edge samples of this tile (LD_SB presumably loads 16 bytes each). */
+    src0 = LD_SB(src_top);
+    src1 = LD_SB(src_left);
+
+    UNPCK_UB_SH(src0, src0_r, src0_l);
+    UNPCK_UB_SH(src1, src1_r, src1_l);
+    /* Bias the per-column weights by the tile's horizontal offset. */
+    mul_val1 += offset;
+    mul_val0 -= offset;
+    mul_val2 = mul_val0 - 8;
+    mul_val3 = mul_val1 + 8;
+    /* Eight two-row steps; trailing literals look like row weights, shift. */
+    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 31, 1, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 29, 3, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 27, 5, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 25, 7, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+    /* Left samples 8..15 come from the second unpacked half (src1_l). */
+    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 23, 9, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 21, 11, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 19, 13, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 17, 15, 6);
+    ST_SH2(res0, res1, dst, stride);
+}
+
+static void process_intra_lower_16x16_msa(const uint8_t *src_top,
+                                          const uint8_t *src_left,
+                                          uint8_t *dst, int32_t stride,
+                                          uint8_t offset)
+{
+    v16i8 src0, src1;
+    v8i16 src0_r, src1_r, src0_l, src1_l;
+    v8i16 vec0, vec1, res0, res1, tmp0, tmp1;
+    v8i16 mul_val2, mul_val3;
+    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
+    v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };
+    /* Planar-predicts a 16x16 tile within rows 16..31 of a 32x32 block. */
+    tmp0 = __msa_fill_h(src_top[32 - offset]);  /* broadcast to all lanes */
+    tmp1 = __msa_fill_h(src_left[16]);  /* caller pre-advanced src_left by 16 */
+    /* Edge samples of this tile (LD_SB presumably loads 16 bytes each). */
+    src0 = LD_SB(src_top);
+    src1 = LD_SB(src_left);
+
+    UNPCK_UB_SH(src0, src0_r, src0_l);
+    UNPCK_UB_SH(src1, src1_r, src1_l);
+    /* Bias the per-column weights by the tile's horizontal offset. */
+    mul_val1 += offset;
+    mul_val0 -= offset;
+    mul_val2 = mul_val0 - 8;
+    mul_val3 = mul_val1 + 8;
+    /* Eight two-row steps; trailing literals look like row weights, shift. */
+    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 15, 17, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 13, 19, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 11, 21, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 9, 23, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+    /* Left samples 8..15 come from the second unpacked half (src1_l). */
+    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 7, 25, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 5, 27, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 3, 29, 6);
+    ST_SH2(res0, res1, dst, stride);
+    dst += (2 * stride);
+
+    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
+    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
+                          mul_val0, mul_val1, mul_val2, mul_val3,
+                          res0, res1, 1, 31, 6);
+    ST_SH2(res0, res1, dst, stride);
+}
+
+/* Planar prediction of a 32x32 block, assembled from four 16x16 tiles. */
+static void hevc_intra_pred_plane_32x32_msa(const uint8_t *src_top,
+                                            const uint8_t *src_left,
+                                            uint8_t *dst, int32_t stride)
+{
+    const uint8_t *top_r = src_top + 16, *left_lo = src_left + 16;
+    uint8_t *dst_lo = dst + 16 * stride;     /* first row of the lower half */
+
+    process_intra_upper_16x16_msa(src_top, src_left, dst, stride, 0);
+    process_intra_upper_16x16_msa(top_r, src_left, dst + 16, stride, 16);
+
+    process_intra_lower_16x16_msa(src_top, left_lo, dst_lo, stride, 0);
+    process_intra_lower_16x16_msa(top_r, left_lo, dst_lo + 16, stride, 16);
+}
+
+/*
+ * Angular intra prediction of a 4x4 block for the modes looked up through
+ * intra_pred_angle_up[] (mode - 18); src_top is the main reference edge.
+ */
+static void hevc_intra_pred_angular_upper_4width_msa(const uint8_t *src_top,
+                                                     const uint8_t *src_left,
+                                                     uint8_t *dst,
+                                                     int32_t stride,
+                                                     int32_t mode)
+{
+    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
+    uint8_t ref_array[3 * 32 + 4];
+    uint8_t *ref_tmp = ref_array + 4;
+    const uint8_t *ref;
+    int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
+    int32_t idx2, fact_val2, idx3, fact_val3;
+    int32_t last, angle, angle_loop, inv_angle_val, offset;
+    uint64_t tmp0;
+    v16i8 top0, top1, top2, top3, dst_val0;
+    v16i8 zero = { 0 };
+    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
+
+    angle = intra_pred_angle_up[mode - 18];
+    last = (angle) >> 3;
+    angle_loop = angle;
+
+    ref = src_top - 1;
+    if (angle < 0 && last < -1) {
+        /* Index the 8-entry table only here, where the value is consumed. */
+        inv_angle_val = inv_angle[mode - 18];
+        tmp0 = LD(ref);
+        SD(tmp0, ref_tmp);
+        /* Fill the negative indices of the copy with src_left samples. */
+        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
+            offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
+            ref_tmp[h_cnt] = src_left[offset];
+        }
+
+        ref = ref_tmp;
+    }
+    /* Per row: idx is the whole-sample offset, fact_val the 1/32 fraction. */
+    idx0 = angle_loop >> 5;
+    fact_val0 = angle_loop & 31;
+    angle_loop += angle;
+
+    idx1 = angle_loop >> 5;
+    fact_val1 = angle_loop & 31;
+    angle_loop += angle;
+
+    idx2 = angle_loop >> 5;
+    fact_val2 = angle_loop & 31;
+    angle_loop += angle;
+
+    idx3 = angle_loop >> 5;
+    fact_val3 = angle_loop & 31;
+
+    top0 = LD_SB(ref + idx0 + 1);
+    top1 = LD_SB(ref + idx1 + 1);
+    top2 = LD_SB(ref + idx2 + 1);
+    top3 = LD_SB(ref + idx3 + 1);
+
+    fact0 = __msa_fill_h(fact_val0);
+    fact1 = __msa_fill_h(32 - fact_val0);
+
+    fact2 = __msa_fill_h(fact_val1);
+    fact3 = __msa_fill_h(32 - fact_val1);
+
+    fact4 = __msa_fill_h(fact_val2);
+    fact5 = __msa_fill_h(32 - fact_val2);
+
+    fact6 = __msa_fill_h(fact_val3);
+    fact7 = __msa_fill_h(32 - fact_val3);
+    /* The ILVR_D2_SH steps presumably pair two rows into each vector. */
+    ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
+    ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
+    ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
+               diff0, diff2, diff4, diff6);
+    SLDI_B4_0_SH(diff0, diff2, diff4, diff6, diff1, diff3, diff5, diff7, 2);
+    ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
+    ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
+    MUL2(diff1, fact0, diff3, fact2, diff1, diff3);
+    /* Two-tap blend: add the (32 - fact) term, then round and shift by 5. */
+    diff1 += diff0 * fact1;
+    diff3 += diff2 * fact3;
+
+    SRARI_H2_SH(diff1, diff3, 5);
+    dst_val0 = __msa_pckev_b((v16i8) diff3, (v16i8) diff1);
+    ST4x4_UB(dst_val0, dst_val0, 0, 1, 2, 3, dst, stride);
+}
+
+/*
+ * Angular intra prediction of an 8x8 block for the modes looked up through
+ * intra_pred_angle_up[] (mode - 18); src_top is the main reference edge.
+ */
+static void hevc_intra_pred_angular_upper_8width_msa(const uint8_t *src_top,
+                                                     const uint8_t *src_left,
+                                                     uint8_t *dst,
+                                                     int32_t stride,
+                                                     int32_t mode)
+{
+    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
+    uint8_t ref_array[3 * 32 + 4];
+    uint8_t *ref_tmp = ref_array + 8;
+    const uint8_t *ref;
+    const uint8_t *src_left_tmp = src_left - 1;
+    int32_t last, offset, angle, angle_loop;
+    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
+    int32_t idx2, fact_val2, idx3, fact_val3;
+    int32_t inv_angle_val, inv_angle_val_loop, tmp0, tmp1, tmp2;
+    v16i8 top0, top1, top2, top3;
+    v16u8 dst_val0, dst_val1;
+    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
+    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+
+    angle = intra_pred_angle_up[mode - 18];
+    last = (angle) >> 2;
+    angle_loop = angle;
+
+    ref = src_top - 1;
+    if (last < -1) {
+        inv_angle_val = inv_angle[mode - 18]; /* 8 entries: index only here */
+        inv_angle_val_loop = inv_angle_val * last;
+        tmp0 = LW(ref);
+        tmp1 = LW(ref + 4);
+        tmp2 = LW(ref + 8);
+        SW(tmp0, ref_tmp);
+        SW(tmp1, ref_tmp + 4);
+        SW(tmp2, ref_tmp + 8);
+        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
+            offset = (inv_angle_val_loop + 128) >> 8;
+            ref_tmp[h_cnt] = src_left_tmp[offset];
+            inv_angle_val_loop += inv_angle_val;
+        }
+        ref = ref_tmp;
+    }
+    /* Two passes of four rows each; dst advances by 4 * stride per pass. */
+    for (v_cnt = 0; v_cnt < 2; v_cnt++) {
+        idx0 = (angle_loop) >> 5;
+        fact_val0 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        idx1 = (angle_loop) >> 5;
+        fact_val1 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        idx2 = (angle_loop) >> 5;
+        fact_val2 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        idx3 = (angle_loop) >> 5;
+        fact_val3 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        top0 = LD_SB(ref + idx0 + 1);
+        top1 = LD_SB(ref + idx1 + 1);
+        top2 = LD_SB(ref + idx2 + 1);
+        top3 = LD_SB(ref + idx3 + 1);
+
+        fact0 = __msa_fill_h(fact_val0);
+        fact1 = __msa_fill_h(32 - fact_val0);
+        fact2 = __msa_fill_h(fact_val1);
+        fact3 = __msa_fill_h(32 - fact_val1);
+        fact4 = __msa_fill_h(fact_val2);
+        fact5 = __msa_fill_h(32 - fact_val2);
+        fact6 = __msa_fill_h(fact_val3);
+        fact7 = __msa_fill_h(32 - fact_val3);
+
+        UNPCK_UB_SH(top0, diff0, diff1);
+        UNPCK_UB_SH(top1, diff2, diff3);
+        UNPCK_UB_SH(top2, diff4, diff5);
+        UNPCK_UB_SH(top3, diff6, diff7);
+        /* Odd diffs presumably become the reference shifted by one sample. */
+        SLDI_B2_SH(diff1, diff3, diff0, diff2, diff1, diff3, 2);
+        SLDI_B2_SH(diff5, diff7, diff4, diff6, diff5, diff7, 2);
+        MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
+             diff1, diff3, diff5, diff7);
+        /* Two-tap blend: add the (32 - fact) term, then round, shift by 5. */
+        diff1 += diff0 * fact1;
+        diff3 += diff2 * fact3;
+        diff5 += diff4 * fact5;
+        diff7 += diff6 * fact7;
+
+        SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
+        PCKEV_B2_UB(diff3, diff1, diff7, diff5, dst_val0, dst_val1);
+        ST8x4_UB(dst_val0, dst_val1, dst, stride);
+        dst += (4 * stride);
+    }
+}
+
+/*
+ * Angular intra prediction of a 16x16 block for the modes looked up through
+ * intra_pred_angle_up[] (mode - 18); src_top is the main reference edge.
+ */
+static void hevc_intra_pred_angular_upper_16width_msa(const uint8_t *src_top,
+                                                      const uint8_t *src_left,
+                                                      uint8_t *dst,
+                                                      int32_t stride,
+                                                      int32_t mode)
+{
+    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
+    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
+    int32_t idx2, fact_val2, idx3, fact_val3;
+    int32_t tmp0, last, angle, angle_loop, offset;
+    int32_t inv_angle_val, inv_angle_val_loop;
+    uint8_t ref_array[3 * 32 + 4];
+    uint8_t *ref_tmp = ref_array + 16;
+    const uint8_t *ref;
+    const uint8_t *src_left_tmp = src_left - 1;
+    v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
+    v16i8 dst0, dst1, dst2, dst3;
+    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
+    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
+
+    angle = intra_pred_angle_up[mode - 18];
+    last = angle >> 1;
+    angle_loop = angle;
+
+    ref = src_top - 1;
+    if (last < -1) {
+        inv_angle_val = inv_angle[mode - 18]; /* 8 entries: index only here */
+        inv_angle_val_loop = inv_angle_val * last;
+        top0 = LD_UB(ref);
+        tmp0 = LW(ref + 16);
+        ST_UB(top0, ref_tmp);
+        SW(tmp0, ref_tmp + 16);
+        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
+            offset = (inv_angle_val_loop + 128) >> 8;
+            ref_tmp[h_cnt] = src_left_tmp[offset];
+            inv_angle_val_loop += inv_angle_val;
+        }
+        ref = ref_tmp;
+    }
+    /* Four passes of four rows each; dst advances by 4 * stride per pass. */
+    for (v_cnt = 4; v_cnt--;) {
+        idx0 = (angle_loop) >> 5;
+        fact_val0 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        idx1 = (angle_loop) >> 5;
+        fact_val1 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        idx2 = (angle_loop) >> 5;
+        fact_val2 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        idx3 = (angle_loop) >> 5;
+        fact_val3 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        LD_UB2(ref + idx0 + 1, 16, top0, top1);
+        LD_UB2(ref + idx1 + 1, 16, top2, top3);
+        LD_UB2(ref + idx2 + 1, 16, top4, top5);
+        LD_UB2(ref + idx3 + 1, 16, top6, top7);
+
+        fact0 = __msa_fill_h(fact_val0);
+        fact1 = __msa_fill_h(32 - fact_val0);
+        fact2 = __msa_fill_h(fact_val1);
+        fact3 = __msa_fill_h(32 - fact_val1);
+        fact4 = __msa_fill_h(fact_val2);
+        fact5 = __msa_fill_h(32 - fact_val2);
+        fact6 = __msa_fill_h(fact_val3);
+        fact7 = __msa_fill_h(32 - fact_val3);
+        /* Odd tops presumably become the reference shifted by one sample. */
+        SLDI_B2_UB(top1, top3, top0, top2, top1, top3, 1);
+        SLDI_B2_UB(top5, top7, top4, top6, top5, top7, 1);
+        UNPCK_UB_SH(top0, diff0, diff1);
+        UNPCK_UB_SH(top1, diff2, diff3);
+        UNPCK_UB_SH(top2, diff4, diff5);
+        UNPCK_UB_SH(top3, diff6, diff7);
+        UNPCK_UB_SH(top4, diff8, diff9);
+        UNPCK_UB_SH(top5, diff10, diff11);
+        UNPCK_UB_SH(top6, diff12, diff13);
+        UNPCK_UB_SH(top7, diff14, diff15);
+
+        MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
+             diff2, diff3, diff6, diff7);
+        MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
+             diff10, diff11, diff14, diff15);
+        /* Two-tap blend: add the (32 - fact) term, then round, shift by 5. */
+        diff2 += diff0 * fact1;
+        diff3 += diff1 * fact1;
+        diff6 += diff4 * fact3;
+        diff7 += diff5 * fact3;
+        diff10 += diff8 * fact5;
+        diff11 += diff9 * fact5;
+        diff14 += diff12 * fact7;
+        diff15 += diff13 * fact7;
+
+        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
+        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
+        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
+                    dst0, dst1, dst2, dst3);
+        ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
+        dst += (4 * stride);
+    }
+}
+
+/*
+ * Angular intra prediction of a 32x32 block for the modes looked up through
+ * intra_pred_angle_up[] (mode - 18); src_top is the main reference edge.
+ */
+static void hevc_intra_pred_angular_upper_32width_msa(const uint8_t *src_top,
+                                                      const uint8_t *src_left,
+                                                      uint8_t *dst,
+                                                      int32_t stride,
+                                                      int32_t mode)
+{
+    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
+    uint8_t ref_array[3 * 32 + 4];
+    uint8_t *ref_tmp = ref_array + 32;
+    const uint8_t *ref;
+    const uint8_t *src_left_tmp = src_left - 1;
+    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
+    int32_t tmp0, tmp1, tmp2, tmp3, inv_angle_val, inv_angle_val_loop;
+    int32_t last, offset, angle, angle_loop;
+    v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
+    v16i8 dst0, dst1, dst2, dst3;
+    v8i16 fact0, fact1, fact2, fact3;
+    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
+
+    angle = intra_pred_angle_up[mode - 18];
+    last = angle;
+    angle_loop = angle;
+
+    ref = src_top - 1;
+    if (last < -1) {
+        inv_angle_val = inv_angle[mode - 18]; /* 8 entries: index only here */
+        inv_angle_val_loop = inv_angle_val * last;
+        LD_UB2(ref, 16, top0, top1);
+        tmp0 = ref[32];
+        tmp1 = ref[33];
+        tmp2 = ref[34];
+        tmp3 = ref[35];
+
+        ST_UB2(top0, top1, ref_tmp, 16);
+        ref_tmp[32] = tmp0;
+        ref_tmp[33] = tmp1;
+        ref_tmp[34] = tmp2;
+        ref_tmp[35] = tmp3;
+        /* Fill the negative indices of the copy with left-edge samples. */
+        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
+            offset = (inv_angle_val_loop + 128) >> 8;
+            ref_tmp[h_cnt] = src_left_tmp[offset];
+            inv_angle_val_loop += inv_angle_val;
+        }
+
+        ref = ref_tmp;
+    }
+    /* Sixteen passes of two rows each; dst advances one stride per row. */
+    for (v_cnt = 16; v_cnt--;) {
+        idx0 = (angle_loop) >> 5;
+        fact_val0 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        idx1 = (angle_loop) >> 5;
+        fact_val1 = (angle_loop) & 31;
+        angle_loop += angle;
+
+        top0 = LD_UB(ref + idx0 + 1);
+        top4 = LD_UB(ref + idx1 + 1);
+        top1 = LD_UB(ref + idx0 + 17);
+        top5 = LD_UB(ref + idx1 + 17);
+        top3 = LD_UB(ref + idx0 + 33);
+        top7 = LD_UB(ref + idx1 + 33);
+
+        fact0 = __msa_fill_h(fact_val0);
+        fact1 = __msa_fill_h(32 - fact_val0);
+        fact2 = __msa_fill_h(fact_val1);
+        fact3 = __msa_fill_h(32 - fact_val1);
+        /* Keep the unshifted second halves before SLDI overwrites top1/5. */
+        top2 = top1;
+        top6 = top5;
+
+        SLDI_B2_UB(top1, top3, top0, top2, top1, top3, 1);
+        SLDI_B2_UB(top5, top7, top4, top6, top5, top7, 1);
+        UNPCK_UB_SH(top0, diff0, diff1);
+        UNPCK_UB_SH(top1, diff2, diff3);
+        UNPCK_UB_SH(top2, diff4, diff5);
+        UNPCK_UB_SH(top3, diff6, diff7);
+        UNPCK_UB_SH(top4, diff8, diff9);
+        UNPCK_UB_SH(top5, diff10, diff11);
+        UNPCK_UB_SH(top6, diff12, diff13);
+        UNPCK_UB_SH(top7, diff14, diff15);
+
+        MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
+             diff2, diff3, diff6, diff7);
+        MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
+             diff10, diff11, diff14, diff15);
+        /* Two-tap blend: add the (32 - fact) term, then round, shift by 5. */
+        diff2 += diff0 * fact1;
+        diff3 += diff1 * fact1;
+        diff6 += diff4 * fact1;
+        diff7 += diff5 * fact1;
+        diff10 += diff8 * fact3;
+        diff11 += diff9 * fact3;
+        diff14 += diff12 * fact3;
+        diff15 += diff13 * fact3;
+
+        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
+        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
+        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
+                    dst0, dst1, dst2, dst3);
+
+        ST_SB2(dst0, dst1, dst, 16);
+        dst += stride;
+        ST_SB2(dst2, dst3, dst, 16);
+        dst += stride;
+    }
+}
+
+static void hevc_intra_pred_angular_lower_4width_msa(const uint8_t *src_top,
+                                                     const uint8_t *src_left,
+                                                     uint8_t *dst,
+                                                     int32_t stride,
+                                                     int32_t mode)
+{
+    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
+    uint8_t ref_array[3 * 32 + 4];
+    uint8_t *ref_tmp = ref_array + 4;
+    const uint8_t *ref;
+    int32_t last, offset;
+    int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
+    int32_t idx2, fact_val2, idx3, fact_val3;
+    int32_t angle, angle_loop, inv_angle_val;
+    uint64_t tmp0;
+    v16i8 dst_val0, dst_val1;
+    v16u8 top0, top1, top2, top3;
+    v16u8 zero = { 0 };
+    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
+    /* 4-wide angular prediction; angle comes from intra_pred_angle_low[]. */
+    angle = intra_pred_angle_low[mode - 2];
+    last = angle >> 3;
+    angle_loop = angle;
+    /* The left edge is the main reference for this group of modes. */
+    ref = src_left - 1;
+    if (last < -1) {
+        inv_angle_val = inv_angle[mode - 11];
+        /* Copy the reference (64-bit load/store) into the scratch array. */
+        tmp0 = LD(ref);
+        SD(tmp0, ref_tmp);
+        /* Fill the negative indices of the copy with src_top samples. */
+        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
+            offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
+            ref_tmp[h_cnt] = src_top[offset];
+        }
+
+        ref = ref_tmp;
+    }
+    /* Per line: idx is the whole-sample offset, fact_val the 1/32 fraction. */
+    idx0 = angle_loop >> 5;
+    fact_val0 = angle_loop & 31;
+    angle_loop += angle;
+
+    idx1 = angle_loop >> 5;
+    fact_val1 = angle_loop & 31;
+    angle_loop += angle;
+
+    idx2 = angle_loop >> 5;
+    fact_val2 = angle_loop & 31;
+    angle_loop += angle;
+
+    idx3 = angle_loop >> 5;
+    fact_val3 = angle_loop & 31;
+
+    top0 = LD_UB(ref + idx0 + 1);
+    top1 = LD_UB(ref + idx1 + 1);
+    top2 = LD_UB(ref + idx2 + 1);
+    top3 = LD_UB(ref + idx3 + 1);
+
+    fact0 = __msa_fill_h(fact_val0);
+    fact1 = __msa_fill_h(32 - fact_val0);
+    fact2 = __msa_fill_h(fact_val1);
+    fact3 = __msa_fill_h(32 - fact_val1);
+    fact4 = __msa_fill_h(fact_val2);
+    fact5 = __msa_fill_h(32 - fact_val2);
+    fact6 = __msa_fill_h(fact_val3);
+    fact7 = __msa_fill_h(32 - fact_val3);
+    /* The ILVR_D2_SH steps presumably pair two lines into each vector. */
+    ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
+    ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
+    ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
+               diff0, diff2, diff4, diff6);
+    SLDI_B4_0_SH(diff0, diff2, diff4, diff6, diff1, diff3, diff5, diff7, 2);
+    ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
+    ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
+    MUL2(diff1, fact0, diff3, fact2, diff1, diff3);
+    /* Two-tap blend: add the (32 - fact) term, then round and shift by 5. */
+    diff1 += diff0 * fact1;
+    diff3 += diff2 * fact3;
+
+    SRARI_H2_SH(diff1, diff3, 5);
+    PCKEV_B2_SB(diff1, diff1, diff3, diff3, dst_val0, dst_val1);
+    /* Byte shuffles; appear to transpose the 4x4 result (TODO confirm). */
+    diff0 = (v8i16) __msa_pckev_b(dst_val1, dst_val0);
+    diff1 = (v8i16) __msa_pckod_b(dst_val1, dst_val0);
+
+    diff2 = (v8i16) __msa_pckev_w((v4i32) diff1, (v4i32) diff0);
+
+    dst_val0 = __msa_pckev_b((v16i8) diff2, (v16i8) diff2);
+    dst_val1 = __msa_pckod_b((v16i8) diff2, (v16i8) diff2);
+
+    ST4x2_UB(dst_val0, dst, stride);
+    dst += (2 * stride);
+    ST4x2_UB(dst_val1, dst, stride);
+}
+
+static void hevc_intra_pred_angular_lower_8width_msa(const uint8_t *src_top,
+                                                     const uint8_t *src_left,
+                                                     uint8_t *dst,
+                                                     int32_t stride,
+                                                     int32_t mode)
+{
+    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
+    uint8_t ref_array[3 * 32 + 4];
+    uint8_t *ref_tmp = ref_array + 8;
+    const uint8_t *ref;
+    const uint8_t *src_top_tmp = src_top - 1;
+    uint8_t *dst_org;
+    int32_t last, offset, tmp0, tmp1, tmp2;
+    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
+    int32_t idx2, fact_val2, idx3, fact_val3;
+    int32_t angle, angle_loop, inv_angle_val;
+    v16i8 top0, top1, top2, top3;
+    v16i8 dst_val0, dst_val1, dst_val2, dst_val3;
+    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
+    /* 8-wide angular prediction; angle comes from intra_pred_angle_low[]. */
+    angle = intra_pred_angle_low[mode - 2];
+    last = (angle) >> 2;
+    angle_loop = angle;
+    /* The left edge is the main reference for this group of modes. */
+    ref = src_left - 1;
+    if (last < -1) {
+        inv_angle_val = inv_angle[mode - 11];
+        /* Copy the reference (three word loads/stores) into the scratch. */
+        tmp0 = LW(ref);
+        tmp1 = LW(ref + 4);
+        tmp2 = LW(ref + 8);
+        SW(tmp0, ref_tmp);
+        SW(tmp1, ref_tmp + 4);
+        SW(tmp2, ref_tmp + 8);
+        /* Fill the negative indices of the copy with top-edge samples. */
+        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
+            offset = (h_cnt * inv_angle_val + 128) >> 8;
+            ref_tmp[h_cnt] = src_top_tmp[offset];
+        }
+
+        ref = ref_tmp;
+    }
+    /* Each pass emits a 4-byte-wide strip: dst moves right by 4, not down. */
+    for (v_cnt = 0; v_cnt < 2; v_cnt++) {
+        dst_org = dst;
+
+        idx0 = angle_loop >> 5;
+        fact_val0 = angle_loop & 31;
+        angle_loop += angle;
+
+        idx1 = angle_loop >> 5;
+        fact_val1 = angle_loop & 31;
+        angle_loop += angle;
+
+        idx2 = angle_loop >> 5;
+        fact_val2 = angle_loop & 31;
+        angle_loop += angle;
+
+        idx3 = angle_loop >> 5;
+        fact_val3 = angle_loop & 31;
+        angle_loop += angle;
+
+        top0 = LD_SB(ref + idx0 + 1);
+        top1 = LD_SB(ref + idx1 + 1);
+        top2 = LD_SB(ref + idx2 + 1);
+        top3 = LD_SB(ref + idx3 + 1);
+
+        fact0 = __msa_fill_h(fact_val0);
+        fact1 = __msa_fill_h(32 - fact_val0);
+        fact2 = __msa_fill_h(fact_val1);
+        fact3 = __msa_fill_h(32 - fact_val1);
+        fact4 = __msa_fill_h(fact_val2);
+        fact5 = __msa_fill_h(32 - fact_val2);
+        fact6 = __msa_fill_h(fact_val3);
+        fact7 = __msa_fill_h(32 - fact_val3);
+
+        UNPCK_UB_SH(top0, diff0, diff1);
+        UNPCK_UB_SH(top1, diff2, diff3);
+        UNPCK_UB_SH(top2, diff4, diff5);
+        UNPCK_UB_SH(top3, diff6, diff7);
+        SLDI_B2_SH(diff1, diff3, diff0, diff2, diff1, diff3, 2);
+        SLDI_B2_SH(diff5, diff7, diff4, diff6, diff5, diff7, 2);
+        MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
+             diff1, diff3, diff5, diff7);
+        /* Two-tap blend: add the (32 - fact) term, then round, shift by 5. */
+        diff1 += diff0 * fact1;
+        diff3 += diff2 * fact3;
+        diff5 += diff4 * fact5;
+        diff7 += diff6 * fact7;
+
+        SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
+        PCKEV_B4_SB(diff1, diff1, diff3, diff3, diff5, diff5, diff7, diff7,
+                    dst_val0, dst_val1, dst_val2, dst_val3);
+        ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
+        ILVRL_H2_SH(diff1, diff0, diff3, diff4);
+        ST4x8_UB(diff3, diff4, dst_org, stride);
+        dst += 4;
+    }
+}
+
+static void hevc_intra_pred_angular_lower_16width_msa(const uint8_t *src_top,
+                                                      const uint8_t *src_left,
+                                                      uint8_t *dst,
+                                                      int32_t stride,
+                                                      int32_t mode)
+{
+    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
+    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
+    int32_t idx2, fact_val2, idx3, fact_val3, tmp0;
+    v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
+    v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
+    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
+    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
+    int32_t angle, angle_loop, inv_angle_val, offset;
+    uint8_t ref_array[3 * 32 + 4];
+    uint8_t *ref_tmp = ref_array + 16;
+    const uint8_t *ref, *src_top_tmp = src_top - 1;
+    uint8_t *dst_org;
+    int32_t last;
+    /* 16-wide angular prediction; angle comes from intra_pred_angle_low[]. */
+    angle = intra_pred_angle_low[mode - 2];
+    last = (angle) >> 1;
+    angle_loop = angle;
+    /* The left edge is the main reference for this group of modes. */
+    ref = src_left - 1;
+    if (last < -1) {
+        inv_angle_val = inv_angle[mode - 11];
+        /* Copy the reference (one vector plus one word) into the scratch. */
+        top0 = LD_SB(ref);
+        tmp0 = LW(ref + 16);
+        ST_SB(top0, ref_tmp);
+        SW(tmp0, ref_tmp + 16);
+        /* Fill the negative indices of the copy with top-edge samples. */
+        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
+            offset = (h_cnt * inv_angle_val + 128) >> 8;
+            ref_tmp[h_cnt] = src_top_tmp[offset];
+        }
+
+        ref = ref_tmp;
+    }
+    /* Each pass emits a 4-byte-wide strip: dst moves right by 4, not down. */
+    for (v_cnt = 0; v_cnt < 4; v_cnt++) {
+        dst_org = dst;
+
+        idx0 = angle_loop >> 5;
+        fact_val0 = angle_loop & 31;
+        angle_loop += angle;
+
+        idx1 = angle_loop >> 5;
+        fact_val1 = angle_loop & 31;
+        angle_loop += angle;
+
+        idx2 = angle_loop >> 5;
+        fact_val2 = angle_loop & 31;
+        angle_loop += angle;
+
+        idx3 = angle_loop >> 5;
+        fact_val3 = angle_loop & 31;
+        angle_loop += angle;
+
+        LD_SB2(ref + idx0 + 1, 16, top0, top1);
+        LD_SB2(ref + idx1 + 1, 16, top2, top3);
+        LD_SB2(ref + idx2 + 1, 16, top4, top5);
+        LD_SB2(ref + idx3 + 1, 16, top6, top7);
+
+        fact0 = __msa_fill_h(fact_val0);
+        fact1 = __msa_fill_h(32 - fact_val0);
+        fact2 = __msa_fill_h(fact_val1);
+        fact3 = __msa_fill_h(32 - fact_val1);
+        fact4 = __msa_fill_h(fact_val2);
+        fact5 = __msa_fill_h(32 - fact_val2);
+        fact6 = __msa_fill_h(fact_val3);
+        fact7 = __msa_fill_h(32 - fact_val3);
+        /* Odd tops presumably become the reference shifted by one sample. */
+        SLDI_B2_SB(top1, top3, top0, top2, top1, top3, 1);
+        SLDI_B2_SB(top5, top7, top4, top6, top5, top7, 1);
+
+        UNPCK_UB_SH(top0, diff0, diff1);
+        UNPCK_UB_SH(top1, diff2, diff3);
+        UNPCK_UB_SH(top2, diff4, diff5);
+        UNPCK_UB_SH(top3, diff6, diff7);
+        UNPCK_UB_SH(top4, diff8, diff9);
+        UNPCK_UB_SH(top5, diff10, diff11);
+        UNPCK_UB_SH(top6, diff12, diff13);
+        UNPCK_UB_SH(top7, diff14, diff15);
+
+        MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
+             diff2, diff3, diff6, diff7);
+        MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
+             diff10, diff11, diff14, diff15);
+        /* Two-tap blend: add the (32 - fact) term, then round, shift by 5. */
+        diff2 += diff0 * fact1;
+        diff3 += diff1 * fact1;
+        diff6 += diff4 * fact3;
+        diff7 += diff5 * fact3;
+        diff10 += diff8 * fact5;
+        diff11 += diff9 * fact5;
+        diff14 += diff12 * fact7;
+        diff15 += diff13 * fact7;
+
+        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
+        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
+        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
+                    dst_val0, dst_val1, dst_val2, dst_val3);
+        ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
+        ILVL_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff2, diff3);
+        ILVRL_H2_SH(diff1, diff0, diff4, diff5);
+        ILVRL_H2_SH(diff3, diff2, diff6, diff7);
+        ST4x8_UB(diff4, diff5, dst_org, stride);
+        dst_org += (8 * stride);
+        ST4x8_UB(diff6, diff7, dst_org, stride);
+        dst += 4;
+    }
+}
+
+static void hevc_intra_pred_angular_lower_32width_msa(const uint8_t *src_top, /* 32-wide angular prediction; the 32x32 dispatcher sends modes below 18 (other than 10) here */
+                                                      const uint8_t *src_left, /* main reference, read from src_left - 1 onwards */
+                                                      uint8_t *dst, /* output; advanced by 2 bytes per loop pass */
+                                                      int32_t stride, /* added to the output pointer between groups of rows */
+                                                      int32_t mode) /* indexes intra_pred_angle_low[mode - 2] and inv_angle[mode - 11] */
+{
+    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 }; /* seven entries, looked up as inv_angle[mode - 11] */
+    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1, tmp0;
+    v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
+    v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
+    v8i16 fact0, fact1, fact2, fact3;
+    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
+    int32_t angle, angle_loop, inv_angle_val, offset;
+    uint8_t ref_array[3 * 32 + 4]; /* scratch buffer for the extended reference */
+    uint8_t *ref_tmp = ref_array + 32; /* 32 bytes of headroom so negative indices stay inside ref_array */
+    const uint8_t *ref, *src_top_tmp = src_top - 1; /* offset 0 of src_top_tmp reads src_top[-1] */
+    uint8_t *dst_org;
+    int32_t last;
+    /* angle: per-step increment; last: lowest ref_tmp index that gets filled below */
+    angle = intra_pred_angle_low[mode - 2];
+    last = angle;
+    angle_loop = angle;
+    /* default reference: the left samples, starting one byte before src_left */
+    ref = src_left - 1;
+    if (last < -1) { /* sufficiently negative angle: build an extended reference in ref_tmp */
+        inv_angle_val = inv_angle[mode - 11];
+        /* copy 36 reference bytes (two vectors plus one 32-bit word, assuming LD_SB2/LW/ST_SB2/SW are plain loads and stores) */
+        LD_SB2(ref, 16, top0, top1);
+        tmp0 = LW(ref + 32);
+        ST_SB2(top0, top1, ref_tmp, 16);
+        SW(tmp0, ref_tmp + 32);
+        /* fill ref_tmp[last..-1] from the top samples; offset uses 8 fractional bits with rounding (+128, >> 8) */
+        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
+            offset = (h_cnt * inv_angle_val + 128) >> 8;
+            ref_tmp[h_cnt] = src_top_tmp[offset];
+        }
+
+        ref = ref_tmp;
+    }
+    /* 16 passes; each consumes two angle steps and moves dst forward by 2 bytes */
+    for (v_cnt = 0; v_cnt < 16; v_cnt++) {
+        dst_org = dst;
+        idx0 = angle_loop >> 5; /* whole-sample part of the accumulated angle */
+        fact_val0 = angle_loop & 31; /* 5-bit fractional part, used as blend weight */
+        angle_loop += angle;
+        /* second angle step of this pass */
+        idx1 = angle_loop >> 5;
+        fact_val1 = angle_loop & 31;
+        angle_loop += angle;
+        /* three loads per step at byte offsets +1, +17 and +33 from ref + idx */
+        top0 = LD_SB(ref + idx0 + 1);
+        top4 = LD_SB(ref + idx1 + 1);
+        top1 = LD_SB(ref + idx0 + 17);
+        top5 = LD_SB(ref + idx1 + 17);
+        top3 = LD_SB(ref + idx0 + 33);
+        top7 = LD_SB(ref + idx1 + 33);
+        /* fact0/fact2 hold the fractional weight, fact1/fact3 its complement to 32 */
+        fact0 = __msa_fill_h(fact_val0);
+        fact1 = __msa_fill_h(32 - fact_val0);
+        fact2 = __msa_fill_h(fact_val1);
+        fact3 = __msa_fill_h(32 - fact_val1);
+        /* keep unshifted copies of the second vectors before SLDI overwrites top1/top5 */
+        top2 = top1;
+        top6 = top5;
+        /* SLDI_B2_SB(..., 1): presumably produces the reference shifted by one byte in top1/top3 and top5/top7 */
+        SLDI_B2_SB(top1, top3, top0, top2, top1, top3, 1);
+        SLDI_B2_SB(top5, top7, top4, top6, top5, top7, 1);
+        /* widen each byte vector into two 16-bit vectors (UNPCK_UB_SH presumably zero-extends) */
+        UNPCK_UB_SH(top0, diff0, diff1);
+        UNPCK_UB_SH(top1, diff2, diff3);
+        UNPCK_UB_SH(top2, diff4, diff5);
+        UNPCK_UB_SH(top3, diff6, diff7);
+        UNPCK_UB_SH(top4, diff8, diff9);
+        UNPCK_UB_SH(top5, diff10, diff11);
+        UNPCK_UB_SH(top6, diff12, diff13);
+        UNPCK_UB_SH(top7, diff14, diff15);
+        /* blend, part 1: vectors derived from top1/top3/top5/top7 times the fractional weight */
+        MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
+             diff2, diff3, diff6, diff7);
+        MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
+             diff10, diff11, diff14, diff15);
+        /* blend, part 2: add vectors derived from top0/top2/top4/top6 times (32 - weight) */
+        diff2 += diff0 * fact1;
+        diff3 += diff1 * fact1;
+        diff6 += diff4 * fact1;
+        diff7 += diff5 * fact1;
+        diff10 += diff8 * fact3;
+        diff11 += diff9 * fact3;
+        diff14 += diff12 * fact3;
+        diff15 += diff13 * fact3;
+        /* SRARI by 5 presumably performs the rounded division by 32; results are then packed and interleaved */
+        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
+        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
+        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
+                    dst_val0, dst_val1, dst_val2, dst_val3);
+        ILVRL_B2_SH(dst_val2, dst_val0, diff0, diff1);
+        ILVRL_B2_SH(dst_val3, dst_val1, diff2, diff3);
+        /* eight ST2x4_UB stores, each followed by a 4 * stride advance: 32 rows in total */
+        ST2x4_UB(diff0, 0, dst_org, stride);
+        dst_org += (4 * stride);
+        ST2x4_UB(diff0, 4, dst_org, stride);
+        dst_org += (4 * stride);
+        ST2x4_UB(diff1, 0, dst_org, stride);
+        dst_org += (4 * stride);
+        ST2x4_UB(diff1, 4, dst_org, stride);
+        dst_org += (4 * stride);
+
+        ST2x4_UB(diff2, 0, dst_org, stride);
+        dst_org += (4 * stride);
+        ST2x4_UB(diff2, 4, dst_org, stride);
+        dst_org += (4 * stride);
+        ST2x4_UB(diff3, 0, dst_org, stride);
+        dst_org += (4 * stride);
+        ST2x4_UB(diff3, 4, dst_org, stride);
+        dst_org += (4 * stride);
+
+        dst += 2;
+    }
+}
+
+static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
+                                         int32_t dst_stride)
+{
+    /* Vertical prediction: the two vectors read at src are written to 32 rows. */
+    v16u8 row_lo = LD_UB(src);      /* vector loaded at src      */
+    v16u8 row_hi = LD_UB(src + 16); /* vector loaded at src + 16 */
+    uint8_t *out = dst;
+    int32_t y;
+
+    for (y = 0; y < 32; y++) {
+        ST_UB2(row_lo, row_hi, out, 16); /* same two vectors for every row */
+        out += dst_stride;
+    }
+}
+
+void ff_hevc_intra_pred_planar_0_msa(uint8_t *dst, /* planar prediction, 4x4 variant: the only writable buffer */
+                                     const uint8_t *src_top, /* read-only reference, passed as the kernel's 1st argument */
+                                     const uint8_t *src_left, /* read-only reference, passed as the kernel's 2nd argument */
+                                     ptrdiff_t stride) /* forwarded unchanged; presumably the dst row pitch */
+{ /* pure forwarder: hevc_intra_pred_plane_4x4_msa takes (top, left, dst, stride) */
+    hevc_intra_pred_plane_4x4_msa(src_top, src_left, dst, stride);
+}
+
+void ff_hevc_intra_pred_planar_1_msa(uint8_t *dst, /* planar prediction, 8x8 variant: the only writable buffer */
+                                     const uint8_t *src_top, /* read-only reference, passed as the kernel's 1st argument */
+                                     const uint8_t *src_left, /* read-only reference, passed as the kernel's 2nd argument */
+                                     ptrdiff_t stride) /* forwarded unchanged; presumably the dst row pitch */
+{ /* pure forwarder: hevc_intra_pred_plane_8x8_msa takes (top, left, dst, stride) */
+    hevc_intra_pred_plane_8x8_msa(src_top, src_left, dst, stride);
+}
+
+void ff_hevc_intra_pred_planar_2_msa(uint8_t *dst, /* planar prediction, 16x16 variant: the only writable buffer */
+                                     const uint8_t *src_top, /* read-only reference, passed as the kernel's 1st argument */
+                                     const uint8_t *src_left, /* read-only reference, passed as the kernel's 2nd argument */
+                                     ptrdiff_t stride) /* forwarded unchanged; presumably the dst row pitch */
+{ /* pure forwarder: hevc_intra_pred_plane_16x16_msa takes (top, left, dst, stride) */
+    hevc_intra_pred_plane_16x16_msa(src_top, src_left, dst, stride);
+}
+
+void ff_hevc_intra_pred_planar_3_msa(uint8_t *dst, /* planar prediction, 32x32 variant: the only writable buffer */
+                                     const uint8_t *src_top, /* read-only reference, passed as the kernel's 1st argument */
+                                     const uint8_t *src_left, /* read-only reference, passed as the kernel's 2nd argument */
+                                     ptrdiff_t stride) /* forwarded unchanged; presumably the dst row pitch */
+{ /* pure forwarder: hevc_intra_pred_plane_32x32_msa takes (top, left, dst, stride) */
+    hevc_intra_pred_plane_32x32_msa(src_top, src_left, dst, stride);
+}
+
+void ff_hevc_intra_pred_dc_msa(uint8_t *dst, const uint8_t *src_top,
+                               const uint8_t *src_left,
+                               ptrdiff_t stride, int log2, int c_idx)
+{
+    /*
+     * DC intra prediction dispatcher.
+     *
+     * log2 chooses the kernel: 2 -> 4x4, 3 -> 8x8, 4 -> 16x16, 5 -> 32x32.
+     * Any other value calls nothing.  c_idx is forwarded to every kernel
+     * except the 32x32 one.
+     */
+    if (log2 == 2) {
+        hevc_intra_pred_dc_4x4_msa(src_top, src_left, dst, stride, c_idx);
+    } else if (log2 == 3) {
+        hevc_intra_pred_dc_8x8_msa(src_top, src_left, dst, stride, c_idx);
+    } else if (log2 == 4) {
+        hevc_intra_pred_dc_16x16_msa(src_top, src_left, dst, stride, c_idx);
+    } else if (log2 == 5) {
+        /* called with four arguments only */
+        hevc_intra_pred_dc_32x32_msa(src_top, src_left, dst, stride);
+    }
+}
+
+void ff_pred_intra_pred_angular_0_msa(uint8_t *dst,
+                                      const uint8_t *src_top,
+                                      const uint8_t *src_left,
+                                      ptrdiff_t stride, int c_idx, int mode)
+{
+    /* 4x4 angular dispatch; c_idx only reaches the mode 10 / 26 kernels. */
+    if (mode == 10)
+        hevc_intra_pred_horiz_4x4_msa(src_top, src_left, dst, stride, c_idx);
+    else if (mode == 26)
+        hevc_intra_pred_vert_4x4_msa(src_top, src_left, dst, stride, c_idx);
+    else if (mode < 18)
+        hevc_intra_pred_angular_lower_4width_msa(src_top, src_left,
+                                                 dst, stride, mode);
+    else
+        hevc_intra_pred_angular_upper_4width_msa(src_top, src_left,
+                                                 dst, stride, mode);
+}
+
+void ff_pred_intra_pred_angular_1_msa(uint8_t *dst,
+                                      const uint8_t *src_top,
+                                      const uint8_t *src_left,
+                                      ptrdiff_t stride, int c_idx, int mode)
+{
+    /* 8x8 angular dispatch; c_idx only reaches the mode 10 / 26 kernels. */
+    if (mode == 10)
+        hevc_intra_pred_horiz_8x8_msa(src_top, src_left, dst, stride, c_idx);
+    else if (mode == 26)
+        hevc_intra_pred_vert_8x8_msa(src_top, src_left, dst, stride, c_idx);
+    else if (mode < 18)
+        hevc_intra_pred_angular_lower_8width_msa(src_top, src_left,
+                                                 dst, stride, mode);
+    else
+        hevc_intra_pred_angular_upper_8width_msa(src_top, src_left,
+                                                 dst, stride, mode);
+}
+
+void ff_pred_intra_pred_angular_2_msa(uint8_t *dst,
+                                      const uint8_t *src_top,
+                                      const uint8_t *src_left,
+                                      ptrdiff_t stride, int c_idx, int mode)
+{
+    /* 16x16 angular dispatch; c_idx only reaches the mode 10 / 26 kernels. */
+    if (mode == 10)
+        hevc_intra_pred_horiz_16x16_msa(src_top, src_left, dst, stride, c_idx);
+    else if (mode == 26)
+        hevc_intra_pred_vert_16x16_msa(src_top, src_left, dst, stride, c_idx);
+    else if (mode < 18)
+        hevc_intra_pred_angular_lower_16width_msa(src_top, src_left,
+                                                  dst, stride, mode);
+    else
+        hevc_intra_pred_angular_upper_16width_msa(src_top, src_left,
+                                                  dst, stride, mode);
+}
+
+void ff_pred_intra_pred_angular_3_msa(uint8_t *dst,
+                                      const uint8_t *src_top,
+                                      const uint8_t *src_left,
+                                      ptrdiff_t stride, int c_idx, int mode)
+{
+    /* 32x32 angular dispatch; none of the kernels called here takes c_idx. */
+    if (mode == 10)
+        hevc_intra_pred_horiz_32x32_msa(src_top, src_left, dst, stride);
+    else if (mode == 26)
+        intra_predict_vert_32x32_msa(src_top, dst, stride);
+    else if (mode < 18)
+        hevc_intra_pred_angular_lower_32width_msa(src_top, src_left,
+                                                  dst, stride, mode);
+    else
+        hevc_intra_pred_angular_upper_32width_msa(src_top, src_left,
+                                                  dst, stride, mode);
+}
+
+void ff_intra_pred_8_16x16_msa(HEVCContext *s, int x0, int y0, int c_idx) /* builds the top/left reference arrays for the 16x16 block of plane c_idx at (x0, y0), optionally smooths them, then predicts through s->hpc */
+{
+    v16u8 vec0;
+    HEVCLocalContext *lc = s->HEVClc;
+    int i;
+    int hshift = s->ps.sps->hshift[c_idx]; /* per-plane shifts taken from the SPS */
+    int vshift = s->ps.sps->vshift[c_idx];
+    int size_in_luma_h = 16 << hshift; /* block extent expressed in the units of x0 / y0 */
+    int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
+    int size_in_luma_v = 16 << vshift;
+    int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
+    int x = x0 >> hshift; /* block position in samples of this plane */
+    int y = y0 >> vshift;
+    int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
+    int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
+    /* entry of the current transform block in min_tb_addr_zs (row pitch tb_mask + 2) */
+    int cur_tb_addr =
+        s->ps.pps->min_tb_addr_zs[(y_tb) * (s->ps.sps->tb_mask + 2) + (x_tb)];
+    /* linesize counted in uint8_t elements; src points at the block's top-left sample */
+    ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t);
+    uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride;
+
+    int min_pu_width = s->ps.sps->min_pu_width;
+    /* planes with c_idx != 0 use the chroma prediction mode */
+    enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
+        lc->tu.intra_pred_mode;
+    uint32_t a; /* one sample replicated into four bytes, used by the substitution loops */
+    uint8_t left_array[2 * 32 + 1];
+    uint8_t filtered_left_array[2 * 32 + 1];
+    uint8_t top_array[2 * 32 + 1];
+    uint8_t filtered_top_array[2 * 32 + 1];
+    /* working pointers start at element 1 so that index -1 is a valid slot */
+    uint8_t *left = left_array + 1;
+    uint8_t *top = top_array + 1;
+    uint8_t *filtered_left = filtered_left_array + 1;
+    uint8_t *filtered_top = filtered_top_array + 1;
+    int cand_bottom_left = lc->na.cand_bottom_left /* neighbour flags; bottom-left and up-right also require a smaller min_tb_addr_zs entry than the current block */
+        && cur_tb_addr >
+        s->ps.pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->ps.sps->tb_mask) *
+                               (s->ps.sps->tb_mask + 2) + (x_tb - 1)];
+    int cand_left = lc->na.cand_left;
+    int cand_up_left = lc->na.cand_up_left;
+    int cand_up = lc->na.cand_up;
+    int cand_up_right = lc->na.cand_up_right
+        && cur_tb_addr >
+        s->ps.pps->min_tb_addr_zs[(y_tb - 1) * (s->ps.sps->tb_mask + 2) +
+                               ((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask)];
+    /* bottom-left rows / top-right columns that fall inside the picture, in plane samples */
+    int bottom_left_size =
+        (((y0 + 2 * size_in_luma_v) >
+          (s->ps.sps->height) ? (s->ps.sps->height) : (y0 +
+                                                 2 * size_in_luma_v)) -
+         (y0 + size_in_luma_v)) >> vshift;
+    int top_right_size =
+        (((x0 + 2 * size_in_luma_h) >
+          (s->ps.sps->width) ? (s->ps.sps->width) : (x0 + 2 * size_in_luma_h)) -
+         (x0 + size_in_luma_h)) >> hshift;
+    /* constrained intra prediction: a neighbour stays usable only if one of the sampled PUs has pred_flag == PF_INTRA */
+    if (s->ps.pps->constrained_intra_pred_flag == 1) {
+        int size_in_luma_pu_v = ((size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
+        int size_in_luma_pu_h = ((size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
+        int on_pu_edge_x = !(x0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
+        int on_pu_edge_y = !(y0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
+        if (!size_in_luma_pu_h) /* inspect at least one PU horizontally */
+            size_in_luma_pu_h++;
+        if (cand_bottom_left == 1 && on_pu_edge_x) {
+            int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
+            int y_bottom_pu =
+                ((y0 + size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
+            int max =
+                ((size_in_luma_pu_v) >
+                 (s->ps.sps->min_pu_height -
+                  y_bottom_pu) ? (s->ps.sps->min_pu_height -
+                                  y_bottom_pu) : (size_in_luma_pu_v));
+            cand_bottom_left = 0;
+            for (i = 0; i < max; i += 2) /* every second PU is checked */
+                cand_bottom_left |=
+                    ((s->ref->tab_mvf[(x_left_pu) +
+                                      (y_bottom_pu +
+                                       i) * min_pu_width]).pred_flag ==
+                     PF_INTRA);
+        }
+        if (cand_left == 1 && on_pu_edge_x) {
+            int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
+            int y_left_pu = ((y0) >> s->ps.sps->log2_min_pu_size);
+            int max =
+                ((size_in_luma_pu_v) >
+                 (s->ps.sps->min_pu_height -
+                  y_left_pu) ? (s->ps.sps->min_pu_height -
+                                y_left_pu) : (size_in_luma_pu_v));
+            cand_left = 0;
+            for (i = 0; i < max; i += 2)
+                cand_left |=
+                    ((s->ref->tab_mvf[(x_left_pu) +
+                                      (y_left_pu +
+                                       i) * min_pu_width]).pred_flag ==
+                     PF_INTRA);
+        }
+        if (cand_up_left == 1) {
+            int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
+            int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
+            cand_up_left =
+                (s->ref->tab_mvf[(x_left_pu) +
+                                 (y_top_pu) * min_pu_width]).pred_flag ==
+                PF_INTRA;
+        }
+        if (cand_up == 1 && on_pu_edge_y) {
+            int x_top_pu = ((x0) >> s->ps.sps->log2_min_pu_size);
+            int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
+            int max =
+                ((size_in_luma_pu_h) >
+                 (s->ps.sps->min_pu_width -
+                  x_top_pu) ? (s->ps.sps->min_pu_width -
+                               x_top_pu) : (size_in_luma_pu_h));
+            cand_up = 0;
+            for (i = 0; i < max; i += 2)
+                cand_up |=
+                    ((s->ref->tab_mvf[(x_top_pu + i) +
+                                      (y_top_pu) *
+                                      min_pu_width]).pred_flag == PF_INTRA);
+        }
+        if (cand_up_right == 1 && on_pu_edge_y) {
+            int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
+            int x_right_pu =
+                ((x0 + size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
+            int max =
+                ((size_in_luma_pu_h) >
+                 (s->ps.sps->min_pu_width -
+                  x_right_pu) ? (s->ps.sps->min_pu_width -
+                                 x_right_pu) : (size_in_luma_pu_h));
+            cand_up_right = 0;
+            for (i = 0; i < max; i += 2)
+                cand_up_right |=
+                    ((s->ref->tab_mvf[(x_right_pu + i) +
+                                      (y_top_pu) *
+                                      min_pu_width]).pred_flag == PF_INTRA);
+        }
+        /* pre-fill both arrays and top[-1] with 128 (ST_UB4 presumably writes four 16-byte vectors) */
+        vec0 = (v16u8) __msa_ldi_b(128);
+
+        ST_UB4(vec0, vec0, vec0, vec0, left, 16);
+
+        ST_UB4(vec0, vec0, vec0, vec0, top, 16);
+
+        top[-1] = 128;
+    }
+    if (cand_up_left) { /* corner sample, stored in both arrays */
+        left[-1] = src[(-1) + stride * (-1)];
+        top[-1] = left[-1];
+    }
+    if (cand_up) { /* one vector from the row above the block */
+        vec0 = LD_UB(src - stride);
+        ST_UB(vec0, top);
+    }
+    if (cand_up_right) { /* the next vector of the row above */
+        vec0 = LD_UB(src - stride + 16);
+        ST_UB(vec0, (top + 16));
+        /* replicate the sample at index 16 + top_right_size - 1 over the remaining 16 - top_right_size entries, four bytes per write */
+        do {
+            uint32_t pix =
+                ((src[(16 + top_right_size - 1) + stride * (-1)]) *
+                 0x01010101U);
+            for (i = 0; i < (16 - top_right_size); i += 4)
+                ((((union unaligned_32 *) (top + 16 + top_right_size +
+                                           i))->l) = (pix));
+        } while (0);
+    }
+    if (cand_left) /* left column, gathered one row at a time */
+        for (i = 0; i < 16; i++)
+            left[i] = src[(-1) + stride * (i)];
+    if (cand_bottom_left) { /* rows 16 and below of the left column, padded like the top-right part */
+        for (i = 16; i < 16 + bottom_left_size; i++)
+            left[i] = src[(-1) + stride * (i)];
+        do {
+            uint32_t pix =
+                ((src[(-1) + stride * (16 + bottom_left_size - 1)]) *
+                 0x01010101U);
+            for (i = 0; i < (16 - bottom_left_size); i += 4)
+                ((((union unaligned_32 *) (left + 16 + bottom_left_size +
+                                           i))->l) = (pix));
+        } while (0);
+    }
+    /* constrained intra prediction, second part: overwrite reference samples whose PU is not PF_INTRA */
+    if (s->ps.pps->constrained_intra_pred_flag == 1) {
+        if (cand_bottom_left || cand_left || cand_up_left || cand_up
+            || cand_up_right) {
+            int size_max_x = /* reference lengths, clipped against the picture width / height */
+                x0 + ((2 * 16) << hshift) <
+                s->ps.sps->width ? 2 * 16 : (s->ps.sps->width - x0) >> hshift;
+            int size_max_y =
+                y0 + ((2 * 16) << vshift) <
+                s->ps.sps->height ? 2 * 16 : (s->ps.sps->height - y0) >> vshift;
+            int j = 16 + (cand_bottom_left ? bottom_left_size : 0) - 1; /* last left-column index to inspect */
+            if (!cand_up_right) {
+                size_max_x = x0 + ((16) << hshift) < s->ps.sps->width ?
+                    16 : (s->ps.sps->width - x0) >> hshift;
+            }
+            if (!cand_bottom_left) {
+                size_max_y = y0 + ((16) << vshift) < s->ps.sps->height ?
+                    16 : (s->ps.sps->height - y0) >> vshift;
+            }
+            if (cand_bottom_left || cand_left || cand_up_left) {
+                while (j > -1 /* walk up the left column until a PF_INTRA PU is met */
+                       &&
+                       !((s->ref->tab_mvf[(((x0 +
+                                             ((-1) << hshift)) >> s->ps.sps->
+                                            log2_min_pu_size)) + (((y0 +
+                                                                    ((j) <<
+                                                                     vshift))
+                                                                   >> s->ps.sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                    j--;
+                if (! /* PU at the final j is still not intra: search along the top row instead */
+                    ((s->ref->tab_mvf[(((x0 +
+                                         ((-1) << hshift)) >> s->ps.sps->
+                                        log2_min_pu_size)) + (((y0 + ((j)
+                                                                      <<
+                                                                      vshift))
+                                                               >> s->ps.sps->
+                                                               log2_min_pu_size))
+                                      * min_pu_width]).pred_flag == PF_INTRA)) {
+                    j = 0;
+                    while (j < size_max_x /* first top-row index whose PU is PF_INTRA */
+                           &&
+                           !((s->ref->tab_mvf[(((x0 +
+                                                 ((j) << hshift)) >> s->ps.sps->
+                                                log2_min_pu_size)) + (((y0 +
+                                                                        ((-1) <<
+                                                                         vshift))
+                                                                       >> s->
+                                                                       ps.sps->
+                                                                       log2_min_pu_size))
+                                              * min_pu_width]).pred_flag ==
+                             PF_INTRA))
+                        j++;
+                    for (i = j; i > (j) - (j + 1); i--) /* copy leftwards from top[j], down to top[-1] */
+                        if (!
+                            ((s->ref->tab_mvf[(((x0 +
+                                                 ((i -
+                                                   1) << hshift)) >> s->ps.sps->
+                                                log2_min_pu_size)) + (((y0 +
+                                                                        ((-1) <<
+                                                                         vshift))
+                                                                       >> s->
+                                                                       ps.sps->
+                                                                       log2_min_pu_size))
+                                              * min_pu_width]).pred_flag ==
+                             PF_INTRA))
+                            top[i - 1] = top[i];
+                    left[-1] = top[-1];
+                }
+            } else { /* no left-side candidate at all: only the top row is examined */
+                j = 0;
+                while (j < size_max_x
+                       &&
+                       !((s->ref->tab_mvf[(((x0 +
+                                             ((j) << hshift)) >> s->ps.sps->
+                                            log2_min_pu_size)) + (((y0 + ((-1)
+                                                                          <<
+                                                                          vshift))
+                                                                   >> s->ps.sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                    j++;
+                if (j > 0)
+                    if (x0 > 0) {
+                        for (i = j; i > (j) - (j + 1); i--)
+                            if (!
+                                ((s->ref->tab_mvf[(((x0 +
+                                                     ((i -
+                                                       1) << hshift)) >>
+                                                    s->ps.sps->log2_min_pu_size))
+                                                  + (((y0 + ((-1)
+                                                             << vshift))
+                                                      >>
+                                                      s->ps.sps->log2_min_pu_size))
+                                                  *
+                                                  min_pu_width]).pred_flag ==
+                                 PF_INTRA))
+                                top[i - 1] = top[i];
+                    } else {
+                        for (i = j; i > (j) - (j); i--) /* stops at top[0]; top[-1] is assigned right after the loop */
+                            if (!
+                                ((s->ref->tab_mvf[(((x0 +
+                                                     ((i -
+                                                       1) << hshift)) >>
+                                                    s->ps.sps->log2_min_pu_size))
+                                                  + (((y0 + ((-1)
+                                                             << vshift))
+                                                      >>
+                                                      s->ps.sps->log2_min_pu_size))
+                                                  *
+                                                  min_pu_width]).pred_flag ==
+                                 PF_INTRA))
+                                top[i - 1] = top[i];
+                        top[-1] = top[0];
+                    }
+                left[-1] = top[-1];
+            }
+            left[-1] = top[-1];
+            if (cand_bottom_left || cand_left) { /* top-down pass over the left column in groups of four samples */
+                a = ((left[-1]) * 0x01010101U);
+                for (i = 0; i < (0) + (size_max_y); i += 4)
+                    if (!
+                        ((s->ref->tab_mvf[(((x0 +
+                                             ((-1) << hshift)) >> s->ps.sps->
+                                            log2_min_pu_size)) + (((y0 +
+                                                                    ((i) <<
+                                                                     vshift))
+                                                                   >> s->ps.sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                        ((((union unaligned_32 *) (&left[i]))->l) = (a)); /* non-intra group: overwrite with the remembered sample */
+                    else
+                        a = ((left[i + 3]) * 0x01010101U); /* intra group: remember its last sample */
+            }
+            if (!cand_left) {
+                vec0 = (v16u8) __msa_fill_b(left[-1]);
+
+                ST_UB(vec0, left);
+            }
+            if (!cand_bottom_left) {
+
+                vec0 = (v16u8) __msa_fill_b(left[15]);
+
+                ST_UB(vec0, (left + 16));
+            }
+            if (x0 != 0 && y0 != 0) { /* bottom-up pass over the left column, then the corner */
+                a = ((left[size_max_y - 1]) * 0x01010101U);
+                for (i = (size_max_y - 1);
+                     i > (size_max_y - 1) - (size_max_y); i -= 4)
+                    if (!
+                        ((s->ref->tab_mvf[(((x0 +
+                                             ((-1) << hshift)) >> s->ps.sps->
+                                            log2_min_pu_size)) + (((y0 +
+                                                                    ((i -
+                                                                      3) <<
+                                                                     vshift))
+                                                                   >> s->ps.sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                        ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
+                    else
+                        a = ((left[i - 3]) * 0x01010101U);
+                if (!
+                    ((s->ref->tab_mvf[(((x0 +
+                                         ((-1) << hshift)) >> s->ps.sps->
+                                        log2_min_pu_size)) + (((y0 + ((-1)
+                                                                      <<
+                                                                      vshift))
+                                                               >> s->ps.sps->
+                                                               log2_min_pu_size))
+                                      * min_pu_width]).pred_flag == PF_INTRA))
+                    left[-1] = left[0];
+            } else if (x0 == 0) { /* x0 == 0: the left column is filled with 0 */
+                do {
+                    uint32_t pix = ((0) * 0x01010101U);
+                    for (i = 0; i < (size_max_y); i += 4)
+                        ((((union unaligned_32 *) (left + i))->l) = (pix));
+                } while (0);
+            } else { /* x0 != 0 and y0 == 0: same bottom-up pass, corner left untouched */
+                a = ((left[size_max_y - 1]) * 0x01010101U);
+                for (i = (size_max_y - 1);
+                     i > (size_max_y - 1) - (size_max_y); i -= 4)
+                    if (!
+                        ((s->ref->tab_mvf[(((x0 +
+                                             ((-1) << hshift)) >> s->ps.sps->
+                                            log2_min_pu_size)) + (((y0 +
+                                                                    ((i -
+                                                                      3) <<
+                                                                     vshift))
+                                                                   >> s->ps.sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                        ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
+                    else
+                        a = ((left[i - 3]) * 0x01010101U);
+            }
+            top[-1] = left[-1];
+            if (y0 != 0) { /* left-to-right pass over the top row in groups of four samples */
+                a = ((left[-1]) * 0x01010101U);
+                for (i = 0; i < (0) + (size_max_x); i += 4)
+                    if (!
+                        ((s->ref->tab_mvf[(((x0 +
+                                             ((i) << hshift)) >> s->ps.sps->
+                                            log2_min_pu_size)) + (((y0 + ((-1)
+                                                                          <<
+                                                                          vshift))
+                                                                   >> s->ps.sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                        ((((union unaligned_32 *) (&top[i]))->l) = (a));
+                    else
+                        a = ((top[i + 3]) * 0x01010101U);
+            }
+        }
+    }
+    /* generic fallback: derive missing neighbours from the nearest available one */
+    if (!cand_bottom_left) {
+        if (cand_left) {
+            vec0 = (v16u8) __msa_fill_b(left[15]);
+
+            ST_UB(vec0, (left + 16));
+
+        } else if (cand_up_left) {
+            vec0 = (v16u8) __msa_fill_b(left[-1]);
+
+            ST_UB2(vec0, vec0, left, 16);
+
+            cand_left = 1;
+        } else if (cand_up) {
+            left[-1] = top[0];
+
+            vec0 = (v16u8) __msa_fill_b(left[-1]);
+
+            ST_UB2(vec0, vec0, left, 16);
+
+            cand_up_left = 1;
+            cand_left = 1;
+        } else if (cand_up_right) {
+            vec0 = (v16u8) __msa_fill_b(top[16]);
+
+            ST_UB(vec0, top);
+
+            left[-1] = top[16];
+
+            ST_UB2(vec0, vec0, left, 16);
+
+            cand_up = 1;
+            cand_up_left = 1;
+            cand_left = 1;
+        } else { /* no neighbour available at all: corner, top and left all become 128 */
+            left[-1] = 128;
+            vec0 = (v16u8) __msa_ldi_b(128);
+
+            ST_UB2(vec0, vec0, top, 16);
+            ST_UB2(vec0, vec0, left, 16);
+        }
+    }
+
+    if (!cand_left) {
+        vec0 = (v16u8) __msa_fill_b(left[16]);
+        ST_UB(vec0, left);
+    }
+    if (!cand_up_left) {
+        left[-1] = left[0];
+    }
+    if (!cand_up) {
+        vec0 = (v16u8) __msa_fill_b(left[-1]);
+        ST_UB(vec0, top);
+    }
+    if (!cand_up_right) {
+        vec0 = (v16u8) __msa_fill_b(top[15]);
+        ST_UB(vec0, (top + 16));
+    }
+
+    top[-1] = left[-1];
+
+    /* optional (a + 2b + c + 2) >> 2 smoothing of both reference arrays */
+    if (!s->ps.sps->intra_smoothing_disabled_flag
+        && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) {
+        if (mode != INTRA_DC && 16 != 4) { /* 16 != 4 is a constant-true term */
+            int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
+            int min_dist_vert_hor = /* evaluates to min(|mode - 26|, |mode - 10|) */
+                (((((int) (mode - 26U)) >=
+                   0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) >
+                 ((((int) (mode - 10U)) >=
+                   0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
+                 ? ((((int) (mode - 10U)) >=
+                     0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
+                 : ((((int) (mode - 26U)) >=
+                     0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))));
+            if (min_dist_vert_hor > intra_hor_ver_dist_thresh[4 - 3]) { /* index 4 - 3 selects the threshold 1 */
+                filtered_left[2 * 16 - 1] = left[2 * 16 - 1]; /* the last sample of each array is copied unfiltered */
+                filtered_top[2 * 16 - 1] = top[2 * 16 - 1];
+                for (i = 2 * 16 - 2; i >= 0; i--)
+                    filtered_left[i] = (left[i + 1] + 2 * left[i] +
+                                        left[i - 1] + 2) >> 2;
+                filtered_top[-1] =
+                    filtered_left[-1] =
+                    (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
+                for (i = 2 * 16 - 2; i >= 0; i--)
+                    filtered_top[i] = (top[i + 1] + 2 * top[i] +
+                                       top[i - 1] + 2) >> 2;
+                left = filtered_left; /* prediction below reads the smoothed copies */
+                top = filtered_top;
+            }
+        }
+    }
+    /* dispatch through s->hpc: table index 4 - 2 for planar/angular, literal 4 (presumably log2 of the block size) for DC */
+    switch (mode) {
+    case INTRA_PLANAR:
+        s->hpc.pred_planar[4 - 2] ((uint8_t *) src, (uint8_t *) top,
+                                   (uint8_t *) left, stride);
+        break;
+    case INTRA_DC:
+        s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top,
+                       (uint8_t *) left, stride, 4, c_idx);
+        break;
+    default:
+        s->hpc.pred_angular[4 - 2] ((uint8_t *) src, (uint8_t *) top,
+                                    (uint8_t *) left, stride, c_idx, mode);
+        break;
+    }
+}
+/* 8-bit 32x32 HEVC intra prediction (MSA) for plane c_idx at (x0, y0). */
+void ff_intra_pred_8_32x32_msa(HEVCContext *s, int x0, int y0, int c_idx)
+{
+    v16u8 vec0, vec1;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v8i16 res0, res1, res2, res3;
+    v8i16 mul_val0 = { 63, 62, 61, 60, 59, 58, 57, 56 };  /* 63 - i, i = 0..7 */
+    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };          /* i + 1,  i = 0..7 */
+    HEVCLocalContext *lc = s->HEVClc;
+    int i;
+    int hshift = s->ps.sps->hshift[c_idx];
+    int vshift = s->ps.sps->vshift[c_idx];
+    int size_in_luma_h = 32 << hshift;
+    int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
+    int size_in_luma_v = 32 << vshift;
+    int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
+    int x = x0 >> hshift;  /* (x, y): x0/y0 scaled by the plane's shifts */
+    int y = y0 >> vshift;
+    int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
+    int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
+    /* Table address of the current block; compared against neighbours below. */
+    int cur_tb_addr =
+        s->ps.pps->min_tb_addr_zs[(y_tb) * (s->ps.sps->tb_mask + 2) + (x_tb)];
+    /* Plane stride in samples and pointer to the block's top-left sample. */
+    ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t);
+    uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride;
+
+    int min_pu_width = s->ps.sps->min_pu_width;
+    /* Planes with c_idx != 0 use the chroma prediction mode. */
+    enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
+        lc->tu.intra_pred_mode;
+    uint32_t a;
+    uint8_t left_array[2 * 32 + 1];
+    uint8_t filtered_left_array[2 * 32 + 1];
+    uint8_t top_array[2 * 32 + 1];
+    uint8_t filtered_top_array[2 * 32 + 1];
+    /* Pointers start at element 1 so that index -1 (the corner) is valid. */
+    uint8_t *left = left_array + 1;
+    uint8_t *top = top_array + 1;
+    uint8_t *filtered_left = filtered_left_array + 1;
+    uint8_t *filtered_top = filtered_top_array + 1;
+    int cand_bottom_left = lc->na.cand_bottom_left  /* and neighbour addr < cur */
+        && cur_tb_addr >
+        s->ps.pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->ps.sps->tb_mask) *
+                               (s->ps.sps->tb_mask + 2) + (x_tb - 1)];
+    int cand_left = lc->na.cand_left;
+    int cand_up_left = lc->na.cand_up_left;
+    int cand_up = lc->na.cand_up;
+    int cand_up_right = lc->na.cand_up_right  /* and neighbour addr < cur */
+        && cur_tb_addr >
+        s->ps.pps->min_tb_addr_zs[(y_tb - 1) * (s->ps.sps->tb_mask + 2) +
+                               ((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask)];
+    /* Bottom-left / top-right sample counts, clipped to sps height / width. */
+    int bottom_left_size =
+        (((y0 + 2 * size_in_luma_v) >
+          (s->ps.sps->height) ? (s->ps.sps->height) : (y0 +
+                                                 2 * size_in_luma_v)) -
+         (y0 + size_in_luma_v)) >> vshift;
+    int top_right_size =
+        (((x0 + 2 * size_in_luma_h) >
+          (s->ps.sps->width) ? (s->ps.sps->width) : (x0 + 2 * size_in_luma_h)) -
+         (x0 + size_in_luma_h)) >> hshift;
+    /* Constrained mode: a neighbour group stays a candidate only if a PF_INTRA PU is found in it. */
+    if (s->ps.pps->constrained_intra_pred_flag == 1) {
+        int size_in_luma_pu_v = ((size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
+        int size_in_luma_pu_h = ((size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
+        int on_pu_edge_x = !(x0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
+        int on_pu_edge_y = !(y0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
+        if (!size_in_luma_pu_h)
+            size_in_luma_pu_h++;  /* scan at least one PU horizontally */
+        if (cand_bottom_left == 1 && on_pu_edge_x) {
+            int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
+            int y_bottom_pu =
+                ((y0 + size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
+            int max =  /* min(size_in_luma_pu_v, PU rows left in the picture) */
+                ((size_in_luma_pu_v) >
+                 (s->ps.sps->min_pu_height -
+                  y_bottom_pu) ? (s->ps.sps->min_pu_height -
+                                  y_bottom_pu) : (size_in_luma_pu_v));
+            cand_bottom_left = 0;
+            for (i = 0; i < max; i += 2)  /* checks every second PU */
+                cand_bottom_left |=
+                    ((s->ref->tab_mvf[(x_left_pu) +
+                                      (y_bottom_pu +
+                                       i) * min_pu_width]).pred_flag ==
+                     PF_INTRA);
+        }
+        if (cand_left == 1 && on_pu_edge_x) {
+            int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
+            int y_left_pu = ((y0) >> s->ps.sps->log2_min_pu_size);
+            int max =
+                ((size_in_luma_pu_v) >
+                 (s->ps.sps->min_pu_height -
+                  y_left_pu) ? (s->ps.sps->min_pu_height -
+                                y_left_pu) : (size_in_luma_pu_v));
+            cand_left = 0;
+            for (i = 0; i < max; i += 2)
+                cand_left |=
+                    ((s->ref->tab_mvf[(x_left_pu) +
+                                      (y_left_pu +
+                                       i) * min_pu_width]).pred_flag ==
+                     PF_INTRA);
+        }
+        if (cand_up_left == 1) {
+            int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
+            int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
+            cand_up_left =
+                (s->ref->tab_mvf[(x_left_pu) +
+                                 (y_top_pu) * min_pu_width]).pred_flag ==
+                PF_INTRA;
+        }
+        if (cand_up == 1 && on_pu_edge_y) {
+            int x_top_pu = ((x0) >> s->ps.sps->log2_min_pu_size);
+            int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
+            int max =  /* min(size_in_luma_pu_h, PU columns left in the picture) */
+                ((size_in_luma_pu_h) >
+                 (s->ps.sps->min_pu_width -
+                  x_top_pu) ? (s->ps.sps->min_pu_width -
+                               x_top_pu) : (size_in_luma_pu_h));
+            cand_up = 0;
+            for (i = 0; i < max; i += 2)
+                cand_up |=
+                    ((s->ref->tab_mvf[(x_top_pu + i) +
+                                      (y_top_pu) *
+                                      min_pu_width]).pred_flag == PF_INTRA);
+        }
+        if (cand_up_right == 1 && on_pu_edge_y) {
+            int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
+            int x_right_pu =
+                ((x0 + size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
+            int max =
+                ((size_in_luma_pu_h) >
+                 (s->ps.sps->min_pu_width -
+                  x_right_pu) ? (s->ps.sps->min_pu_width -
+                                 x_right_pu) : (size_in_luma_pu_h));
+            cand_up_right = 0;
+            for (i = 0; i < max; i += 2)
+                cand_up_right |=
+                    ((s->ref->tab_mvf[(x_right_pu + i) +
+                                      (y_top_pu) *
+                                      min_pu_width]).pred_flag == PF_INTRA);
+        }
+        vec0 = (v16u8) __msa_ldi_b(128);
+        /* Preset all 64 left and 64 top samples (and the corner) to 128. */
+        ST_UB4(vec0, vec0, vec0, vec0, left, 16);
+        ST_UB4(vec0, vec0, vec0, vec0, top, 16);
+
+        top[-1] = 128;
+    }
+    if (cand_up_left) {  /* corner sample */
+        left[-1] = src[(-1) + stride * (-1)];
+        top[-1] = left[-1];
+    }
+    if (cand_up) {  /* 32 samples of the row above the block */
+        LD_UB2(src - stride, 16, vec0, vec1);
+        ST_UB2(vec0, vec1, top, 16);
+    }
+    /* Top-right: copy 32 samples, then overwrite those past top_right_size. */
+    if (cand_up_right) {
+        LD_UB2(src - stride + 32, 16, vec0, vec1);
+        ST_UB2(vec0, vec1, (top + 32), 16);
+        do {
+            uint32_t pix =  /* last in-range top-right sample, splat x4 */
+                ((src[(32 + top_right_size - 1) + stride * (-1)]) *
+                 0x01010101U);
+            for (i = 0; i < (32 - top_right_size); i += 4)
+                ((((union unaligned_32 *) (top + 32 + top_right_size +
+                                           i))->l) = (pix));
+        } while (0);
+    }
+    if (cand_left)  /* 32 samples of the column left of the block */
+        for (i = 0; i < 32; i++)
+            left[i] = src[(-1) + stride * (i)];
+    if (cand_bottom_left) {  /* bottom_left_size samples, then replicate */
+        for (i = 32; i < 32 + bottom_left_size; i++)
+            left[i] = src[(-1) + stride * (i)];
+        do {
+            uint32_t pix =
+                ((src[(-1) + stride * (32 + bottom_left_size - 1)]) *
+                 0x01010101U);
+            for (i = 0; i < (32 - bottom_left_size); i += 4)
+                ((((union unaligned_32 *) (left + 32 + bottom_left_size +
+                                           i))->l) = (pix));
+        } while (0);
+    }
+    /* Constrained mode: overwrite samples whose PU is not PF_INTRA with nearby ones. */
+    if (s->ps.pps->constrained_intra_pred_flag == 1) {
+        if (cand_bottom_left || cand_left || cand_up_left || cand_up
+            || cand_up_right) {
+            int size_max_x =  /* top samples to process, clipped to sps width */
+                x0 + ((2 * 32) << hshift) <
+                s->ps.sps->width ? 2 * 32 : (s->ps.sps->width - x0) >> hshift;
+            int size_max_y =  /* left samples to process, clipped to sps height */
+                y0 + ((2 * 32) << vshift) <
+                s->ps.sps->height ? 2 * 32 : (s->ps.sps->height - y0) >> vshift;
+            int j = 32 + (cand_bottom_left ? bottom_left_size : 0) - 1;
+            if (!cand_up_right) {
+                size_max_x = x0 + ((32) << hshift) < s->ps.sps->width ?
+                    32 : (s->ps.sps->width - x0) >> hshift;
+            }
+            if (!cand_bottom_left) {
+                size_max_y = y0 + ((32) << vshift) < s->ps.sps->height ?
+                    32 : (s->ps.sps->height - y0) >> vshift;
+            }
+            if (cand_bottom_left || cand_left || cand_up_left) {
+                while (j > -1  /* walk up the left column to an intra PU */
+                       &&
+                       !((s->ref->tab_mvf[(((x0 +
+                                             ((-1) << hshift)) >> s->ps.sps->
+                                            log2_min_pu_size)) + (((y0 +
+                                                                    ((j) <<
+                                                                     vshift))
+                                                                   >> s->ps.sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                    j--;
+                if (!  /* none found down to the corner: search the top row */
+                    ((s->ref->tab_mvf[(((x0 +
+                                         ((-1) << hshift)) >> s->ps.sps->
+                                        log2_min_pu_size)) + (((y0 + ((j)
+                                                                      <<
+                                                                      vshift))
+                                                               >> s->ps.sps->
+                                                               log2_min_pu_size))
+                                      * min_pu_width]).pred_flag == PF_INTRA)) {
+                    j = 0;
+                    while (j < size_max_x
+                           &&
+                           !((s->ref->tab_mvf[(((x0 +
+                                                 ((j) << hshift)) >> s->ps.sps->
+                                                log2_min_pu_size)) + (((y0 +
+                                                                        ((-1) <<
+                                                                         vshift))
+                                                                       >> s->
+                                                                       ps.sps->
+                                                                       log2_min_pu_size))
+                                              * min_pu_width]).pred_flag ==
+                             PF_INTRA))
+                        j++;
+                    for (i = j; i > (j) - (j + 1); i--)  /* i = j .. 0 */
+                        if (!
+                            ((s->ref->tab_mvf[(((x0 +
+                                                 ((i -
+                                                   1) << hshift)) >> s->ps.sps->
+                                                log2_min_pu_size)) + (((y0 +
+                                                                        ((-1) <<
+                                                                         vshift))
+                                                                       >> s->
+                                                                       ps.sps->
+                                                                       log2_min_pu_size))
+                                              * min_pu_width]).pred_flag ==
+                             PF_INTRA))
+                            top[i - 1] = top[i];
+                    left[-1] = top[-1];
+                }
+            } else {  /* only top / top-right candidates remain */
+                j = 0;
+                while (j < size_max_x
+                       &&
+                       !((s->ref->tab_mvf[(((x0 +
+                                             ((j) << hshift)) >> s->ps.sps->
+                                            log2_min_pu_size)) + (((y0 + ((-1)
+                                                                          <<
+                                                                          vshift))
+                                                                   >> s->ps.sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                    j++;
+                if (j > 0)
+                    if (x0 > 0) {
+                        for (i = j; i > (j) - (j + 1); i--)  /* i = j .. 0 */
+                            if (!
+                                ((s->ref->tab_mvf[(((x0 +
+                                                     ((i -
+                                                       1) << hshift)) >>
+                                                    s->ps.sps->log2_min_pu_size))
+                                                  + (((y0 + ((-1)
+                                                             << vshift))
+                                                      >>
+                                                      s->ps.sps->log2_min_pu_size))
+                                                  *
+                                                  min_pu_width]).pred_flag ==
+                                 PF_INTRA))
+                                top[i - 1] = top[i];
+                    } else {
+                        for (i = j; i > (j) - (j); i--)  /* i = j .. 1 */
+                            if (!
+                                ((s->ref->tab_mvf[(((x0 +
+                                                     ((i -
+                                                       1) << hshift)) >>
+                                                    s->ps.sps->log2_min_pu_size))
+                                                  + (((y0 + ((-1)
+                                                             << vshift))
+                                                      >>
+                                                      s->ps.sps->log2_min_pu_size))
+                                                  *
+                                                  min_pu_width]).pred_flag ==
+                                 PF_INTRA))
+                                top[i - 1] = top[i];
+                        top[-1] = top[0];
+                    }
+                left[-1] = top[-1];
+            }
+            left[-1] = top[-1];
+            if (cand_bottom_left || cand_left) {
+                a = ((left[-1]) * 0x01010101U);
+                for (i = 0; i < (0) + (size_max_y); i += 4)  /* top to bottom */
+                    if (!
+                        ((s->ref->tab_mvf[(((x0 +
+                                             ((-1) << hshift)) >> s->ps.sps->
+                                            log2_min_pu_size)) + (((y0 +
+                                                                    ((i) <<
+                                                                     vshift))
+                                                                   >> s->ps.sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                        ((((union unaligned_32 *) (&left[i]))->l) = (a));
+                    else
+                        a = ((left[i + 3]) * 0x01010101U);
+            }
+            if (!cand_left) {
+                vec0 = (v16u8) __msa_fill_b(left[-1]);
+
+                ST_UB2(vec0, vec0, left, 16);
+            }
+            if (!cand_bottom_left) {
+                vec0 = (v16u8) __msa_fill_b(left[31]);
+
+                ST_UB2(vec0, vec0, (left + 32), 16);
+            }
+            if (x0 != 0 && y0 != 0) {
+                a = ((left[size_max_y - 1]) * 0x01010101U);
+                for (i = (size_max_y - 1);  /* bottom to top, 4 samples a step */
+                     i > (size_max_y - 1) - (size_max_y); i -= 4)
+                    if (!
+                        ((s->ref->tab_mvf[(((x0 +
+                                             ((-1) << hshift)) >> s->ps.sps->
+                                            log2_min_pu_size)) + (((y0 +
+                                                                    ((i -
+                                                                      3) <<
+                                                                     vshift))
+                                                                   >> s->ps.sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                        ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
+                    else
+                        a = ((left[i - 3]) * 0x01010101U);
+                if (!
+                    ((s->ref->tab_mvf[(((x0 +
+                                         ((-1) << hshift)) >> s->ps.sps->
+                                        log2_min_pu_size)) + (((y0 + ((-1)
+                                                                      <<
+                                                                      vshift))
+                                                               >> s->ps.sps->
+                                                               log2_min_pu_size))
+                                      * min_pu_width]).pred_flag == PF_INTRA))
+                    left[-1] = left[0];
+            } else if (x0 == 0) {  /* zero the first size_max_y left samples */
+                do {
+                    uint32_t pix = ((0) * 0x01010101U);
+                    for (i = 0; i < (size_max_y); i += 4)
+                        ((((union unaligned_32 *) (left + i))->l) = (pix));
+                } while (0);
+            } else {
+                a = ((left[size_max_y - 1]) * 0x01010101U);
+                for (i = (size_max_y - 1);
+                     i > (size_max_y - 1) - (size_max_y); i -= 4)
+                    if (!
+                        ((s->ref->tab_mvf[(((x0 +
+                                             ((-1) << hshift)) >> s->ps.sps->
+                                            log2_min_pu_size)) + (((y0 +
+                                                                    ((i -
+                                                                      3) <<
+                                                                     vshift))
+                                                                   >> s->ps.sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                        ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
+                    else
+                        a = ((left[i - 3]) * 0x01010101U);
+            }
+            top[-1] = left[-1];
+            if (y0 != 0) {
+                a = ((left[-1]) * 0x01010101U);
+                for (i = 0; i < (0) + (size_max_x); i += 4)  /* left to right */
+                    if (!
+                        ((s->ref->tab_mvf[(((x0 +
+                                             ((i) << hshift)) >> s->ps.sps->
+                                            log2_min_pu_size)) + (((y0 + ((-1)
+                                                                          <<
+                                                                          vshift))
+                                                                   >> s->ps.sps->
+                                                                   log2_min_pu_size))
+                                          * min_pu_width]).pred_flag ==
+                         PF_INTRA))
+                        ((((union unaligned_32 *) (&top[i]))->l) = (a));
+                    else
+                        a = ((top[i + 3]) * 0x01010101U);
+            }
+        }
+    }
+    /* Fill neighbour groups that are not candidates from the nearest one that is. */
+    if (!cand_bottom_left) {
+        if (cand_left) {
+            vec0 = (v16u8) __msa_fill_b(left[31]);
+
+            ST_UB2(vec0, vec0, (left + 32), 16);
+        } else if (cand_up_left) {
+            vec0 = (v16u8) __msa_fill_b(left[-1]);
+
+            ST_UB4(vec0, vec0, vec0, vec0, left, 16);
+
+            cand_left = 1;
+        } else if (cand_up) {
+            left[-1] = top[0];
+
+            vec0 = (v16u8) __msa_fill_b(left[-1]);
+
+            ST_UB4(vec0, vec0, vec0, vec0, left, 16);
+
+            cand_up_left = 1;
+            cand_left = 1;
+        } else if (cand_up_right) {
+            vec0 = (v16u8) __msa_fill_b(top[32]);
+
+            ST_UB2(vec0, vec0, top, 16);
+
+            left[-1] = top[32];
+
+            ST_UB4(vec0, vec0, vec0, vec0, left, 16);
+
+            cand_up = 1;
+            cand_up_left = 1;
+            cand_left = 1;
+        } else {  /* nothing is a candidate: every sample becomes 128 */
+            left[-1] = 128;
+
+            vec0 = (v16u8) __msa_ldi_b(128);
+
+            ST_UB4(vec0, vec0, vec0, vec0, top, 16);
+            ST_UB4(vec0, vec0, vec0, vec0, left, 16);
+        }
+    }
+
+    if (!cand_left) {
+        vec0 = (v16u8) __msa_fill_b(left[32]);
+
+        ST_UB2(vec0, vec0, left, 16);
+    }
+    if (!cand_up_left) {
+        left[-1] = left[0];
+    }
+    if (!cand_up) {
+        vec0 = (v16u8) __msa_fill_b(left[-1]);
+
+        ST_UB2(vec0, vec0, top, 16);
+    }
+    if (!cand_up_right) {
+        vec0 = (v16u8) __msa_fill_b(top[31]);
+
+        ST_UB2(vec0, vec0, (top + 32), 16);
+    }
+
+    top[-1] = left[-1];
+
+    /* Optional smoothing of the reference samples before prediction. */
+    if (!s->ps.sps->intra_smoothing_disabled_flag
+        && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) {
+        if (mode != INTRA_DC && 32 != 4) {
+            int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
+            int min_dist_vert_hor =  /* min(|mode - 26|, |mode - 10|) */
+                (((((int) (mode - 26U)) >=
+                   0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) >
+                 ((((int) (mode - 10U)) >=
+                   0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
+                 ? ((((int) (mode - 10U)) >=
+                     0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
+                 : ((((int) (mode - 26U)) >=
+                     0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))));
+            if (min_dist_vert_hor > intra_hor_ver_dist_thresh[5 - 3]) {
+                int threshold = 1 << (8 - 5);
+                if (s->ps.sps->sps_strong_intra_smoothing_enable_flag
+                    && c_idx == 0  /* and both edges deviate < threshold */
+                    && ((top[-1] + top[63] - 2 * top[31]) >=
+                        0 ? (top[-1] + top[63] -
+                             2 * top[31]) : (-(top[-1] + top[63] -
+                                               2 * top[31]))) < threshold
+                    && ((left[-1] + left[63] - 2 * left[31]) >=
+                        0 ? (left[-1] + left[63] -
+                             2 * left[31]) : (-(left[-1] + left[63] -
+                                                2 * left[31]))) < threshold) {
+                    /* Strong filter: interpolate linearly between the end samples. */
+
+                    filtered_top[-1] = top[-1];
+                    filtered_top[63] = top[63];
+
+
+                    /* No scalar loop over filtered_top[0..62] here: the
+                     * vector code below stores the same values,
+                     * ((63 - i) * top[-1] + (i + 1) * top[63] + 32) >> 6,
+                     * to all of filtered_top[0..63]. */
+
+                    tmp0 = __msa_fill_h(top[-1]);
+                    tmp1 = __msa_fill_h(top[63]);
+
+                    tmp2 = mul_val0 - 8;   /* weights for i = 8..31 */
+                    tmp3 = mul_val0 - 16;
+                    tmp4 = mul_val0 - 24;
+                    tmp5 = mul_val1 + 8;
+                    tmp6 = mul_val1 + 16;
+                    tmp7 = mul_val1 + 24;
+
+                    res0 = mul_val0 * tmp0;
+                    res1 = tmp2 * tmp0;
+                    res2 = tmp3 * tmp0;
+                    res3 = tmp4 * tmp0;
+                    res0 += mul_val1 * tmp1;
+                    res1 += tmp5 * tmp1;
+                    res2 += tmp6 * tmp1;
+                    res3 += tmp7 * tmp1;
+
+                    res0 = __msa_srari_h(res0, 6);  /* rounding shift by 6 */
+                    res1 = __msa_srari_h(res1, 6);
+                    res2 = __msa_srari_h(res2, 6);
+                    res3 = __msa_srari_h(res3, 6);
+
+                    vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
+                    vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
+
+                    ST_UB2(vec0, vec1, filtered_top, 16);
+
+                    res0 = mul_val0 - 32;  /* weights for i = 32..63 */
+                    tmp2 = mul_val0 - 40;
+                    tmp3 = mul_val0 - 48;
+                    tmp4 = mul_val0 - 56;
+                    res3 = mul_val1 + 32;
+                    tmp5 = mul_val1 + 40;
+                    tmp6 = mul_val1 + 48;
+                    tmp7 = mul_val1 + 56;
+
+                    res0 = res0 * tmp0;
+                    res1 = tmp2 * tmp0;
+                    res2 = tmp3 * tmp0;
+                    res0 += res3 * tmp1;
+                    res3 = tmp4 * tmp0;
+                    res1 += tmp5 * tmp1;
+                    res2 += tmp6 * tmp1;
+                    res3 += tmp7 * tmp1;
+
+                    res0 = __msa_srari_h(res0, 6);
+                    res1 = __msa_srari_h(res1, 6);
+                    res2 = __msa_srari_h(res2, 6);
+                    res3 = __msa_srari_h(res3, 6);
+
+                    vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
+                    vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
+
+                    ST_UB2(vec0, vec1, (filtered_top + 32), 16);
+
+                    filtered_top[63] = top[63];
+                    /* Same interpolation for the left column, written in place. */
+                    tmp0 = __msa_fill_h(left[-1]);
+                    tmp1 = __msa_fill_h(left[63]);
+
+                    tmp2 = mul_val0 - 8;
+                    tmp3 = mul_val0 - 16;
+                    tmp4 = mul_val0 - 24;
+                    tmp5 = mul_val1 + 8;
+                    tmp6 = mul_val1 + 16;
+                    tmp7 = mul_val1 + 24;
+
+                    res0 = mul_val0 * tmp0;
+                    res1 = tmp2 * tmp0;
+                    res2 = tmp3 * tmp0;
+                    res3 = tmp4 * tmp0;
+                    res0 += mul_val1 * tmp1;
+                    res1 += tmp5 * tmp1;
+                    res2 += tmp6 * tmp1;
+                    res3 += tmp7 * tmp1;
+
+                    res0 = __msa_srari_h(res0, 6);
+                    res1 = __msa_srari_h(res1, 6);
+                    res2 = __msa_srari_h(res2, 6);
+                    res3 = __msa_srari_h(res3, 6);
+
+                    vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
+                    vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
+
+                    ST_UB2(vec0, vec1, left, 16);
+
+                    res0 = mul_val0 - 32;
+                    tmp2 = mul_val0 - 40;
+                    tmp3 = mul_val0 - 48;
+                    tmp4 = mul_val0 - 56;
+                    res3 = mul_val1 + 32;
+                    tmp5 = mul_val1 + 40;
+                    tmp6 = mul_val1 + 48;
+                    tmp7 = mul_val1 + 56;
+
+                    res0 = res0 * tmp0;
+                    res1 = tmp2 * tmp0;
+                    res2 = tmp3 * tmp0;
+                    res0 += res3 * tmp1;
+                    res3 = tmp4 * tmp0;
+                    res1 += tmp5 * tmp1;
+                    res2 += tmp6 * tmp1;
+                    res3 += tmp7 * tmp1;
+
+                    res0 = __msa_srari_h(res0, 6);
+                    res1 = __msa_srari_h(res1, 6);
+                    res2 = __msa_srari_h(res2, 6);
+                    res3 = __msa_srari_h(res3, 6);
+
+                    vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
+                    vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
+
+                    ST_UB2(vec0, vec1, (left + 32), 16);
+
+                    left[63] = tmp1[0];  /* tmp1 holds the original left[63] */
+
+                    top = filtered_top;  /* left[] was filtered in place */
+                } else {  /* 3-tap (1, 2, 1) filter into the filtered_* arrays */
+                    filtered_left[2 * 32 - 1] = left[2 * 32 - 1];
+                    filtered_top[2 * 32 - 1] = top[2 * 32 - 1];
+                    for (i = 2 * 32 - 2; i >= 0; i--)
+                        filtered_left[i] = (left[i + 1] + 2 * left[i] +
+                                            left[i - 1] + 2) >> 2;
+                    filtered_top[-1] =
+                        filtered_left[-1] =
+                        (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
+                    for (i = 2 * 32 - 2; i >= 0; i--)
+                        filtered_top[i] = (top[i + 1] + 2 * top[i] +
+                                           top[i - 1] + 2) >> 2;
+                    left = filtered_left;
+                    top = filtered_top;
+                }
+            }
+        }
+    }
+    /* Run the predictor selected by mode on the prepared reference samples. */
+    switch (mode) {
+    case INTRA_PLANAR:
+        s->hpc.pred_planar[3] ((uint8_t *) src, (uint8_t *) top,
+                               (uint8_t *) left, stride);
+        break;
+    case INTRA_DC:
+        s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top,
+                       (uint8_t *) left, stride, 5, c_idx);
+        break;
+    default:
+        s->hpc.pred_angular[3] ((uint8_t *) src, (uint8_t *) top,
+                                (uint8_t *) left, stride, c_idx, mode);
+        break;
+    }
+}
diff --git a/libavcodec/mips/hpeldsp_init_mips.c b/libavcodec/mips/hpeldsp_init_mips.c
new file mode 100644
index 0000000..363a045
--- /dev/null
+++ b/libavcodec/mips/hpeldsp_init_mips.c
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "../hpeldsp.h"
+#include "libavcodec/mips/hpeldsp_mips.h"
+
+#if HAVE_MSA
+static void ff_hpeldsp_init_msa(HpelDSPContext *c, int flags)
+{
+    /* Table layout: the first index is the block width (0: 16, 1: 8,
+     * 2: 4 pixels), the second the kernel variant (0: plain, 1: x2, 2: y2,
+     * 3: xy2).  Entries are filled width by width; flags is not consulted. */
+    c->put_pixels_tab[0][0]        = ff_put_pixels16_msa;
+    c->put_pixels_tab[0][1]        = ff_put_pixels16_x2_msa;
+    c->put_pixels_tab[0][2]        = ff_put_pixels16_y2_msa;
+    c->put_pixels_tab[0][3]        = ff_put_pixels16_xy2_msa;
+    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_msa;
+    c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_msa;
+    c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_msa;
+    c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_msa;
+    c->avg_pixels_tab[0][0]        = ff_avg_pixels16_msa;
+    c->avg_pixels_tab[0][1]        = ff_avg_pixels16_x2_msa;
+    c->avg_pixels_tab[0][2]        = ff_avg_pixels16_y2_msa;
+    c->avg_pixels_tab[0][3]        = ff_avg_pixels16_xy2_msa;
+
+    c->put_pixels_tab[1][0]        = ff_put_pixels8_msa;
+    c->put_pixels_tab[1][1]        = ff_put_pixels8_x2_msa;
+    c->put_pixels_tab[1][2]        = ff_put_pixels8_y2_msa;
+    c->put_pixels_tab[1][3]        = ff_put_pixels8_xy2_msa;
+    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_msa;
+    c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_msa;
+    c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_msa;
+    c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_msa;
+    c->avg_pixels_tab[1][0]        = ff_avg_pixels8_msa;
+    c->avg_pixels_tab[1][1]        = ff_avg_pixels8_x2_msa;
+    c->avg_pixels_tab[1][2]        = ff_avg_pixels8_y2_msa;
+    c->avg_pixels_tab[1][3]        = ff_avg_pixels8_xy2_msa;
+
+    /* 4 pixels wide: put_pixels_tab[2][0] is not assigned here, so that slot
+     * keeps whatever value it held before this call. */
+    c->put_pixels_tab[2][1]        = ff_put_pixels4_x2_msa;
+    c->put_pixels_tab[2][2]        = ff_put_pixels4_y2_msa;
+    c->put_pixels_tab[2][3]        = ff_put_pixels4_xy2_msa;
+    c->avg_pixels_tab[2][0]        = ff_avg_pixels4_msa;
+    c->avg_pixels_tab[2][1]        = ff_avg_pixels4_x2_msa;
+    c->avg_pixels_tab[2][2]        = ff_avg_pixels4_y2_msa;
+    c->avg_pixels_tab[2][3]        = ff_avg_pixels4_xy2_msa;
+}
+#endif  // #if HAVE_MSA
+
+#if HAVE_MMI
+static void ff_hpeldsp_init_mmi(HpelDSPContext *c, int flags)
+{
+    /* Table layout: the first index is the block width (0: 16, 1: 8,
+     * 2: 4 pixels), the second the kernel variant (0: plain, 1: x2, 2: y2,
+     * 3: xy2).  Entries are filled width by width; flags is not consulted. */
+    c->put_pixels_tab[0][0]        = ff_put_pixels16_8_mmi;
+    c->put_pixels_tab[0][1]        = ff_put_pixels16_x2_8_mmi;
+    c->put_pixels_tab[0][2]        = ff_put_pixels16_y2_8_mmi;
+    c->put_pixels_tab[0][3]        = ff_put_pixels16_xy2_8_mmi;
+    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_8_mmi;
+    c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_8_mmi;
+    c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_8_mmi;
+    c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_8_mmi;
+    c->avg_pixels_tab[0][0]        = ff_avg_pixels16_8_mmi;
+    c->avg_pixels_tab[0][1]        = ff_avg_pixels16_x2_8_mmi;
+    c->avg_pixels_tab[0][2]        = ff_avg_pixels16_y2_8_mmi;
+    c->avg_pixels_tab[0][3]        = ff_avg_pixels16_xy2_8_mmi;
+
+    c->put_pixels_tab[1][0]        = ff_put_pixels8_8_mmi;
+    c->put_pixels_tab[1][1]        = ff_put_pixels8_x2_8_mmi;
+    c->put_pixels_tab[1][2]        = ff_put_pixels8_y2_8_mmi;
+    c->put_pixels_tab[1][3]        = ff_put_pixels8_xy2_8_mmi;
+    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_8_mmi;
+    c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_8_mmi;
+    c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_8_mmi;
+    c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_8_mmi;
+    c->avg_pixels_tab[1][0]        = ff_avg_pixels8_8_mmi;
+    c->avg_pixels_tab[1][1]        = ff_avg_pixels8_x2_8_mmi;
+    c->avg_pixels_tab[1][2]        = ff_avg_pixels8_y2_8_mmi;
+    c->avg_pixels_tab[1][3]        = ff_avg_pixels8_xy2_8_mmi;
+
+    /* 4 pixels wide: only put and avg entries are installed; this function
+     * assigns no put_no_rnd_pixels_tab[2][*] slot. */
+    c->put_pixels_tab[2][0]        = ff_put_pixels4_8_mmi;
+    c->put_pixels_tab[2][1]        = ff_put_pixels4_x2_8_mmi;
+    c->put_pixels_tab[2][2]        = ff_put_pixels4_y2_8_mmi;
+    c->put_pixels_tab[2][3]        = ff_put_pixels4_xy2_8_mmi;
+    c->avg_pixels_tab[2][0]        = ff_avg_pixels4_8_mmi;
+    c->avg_pixels_tab[2][1]        = ff_avg_pixels4_x2_8_mmi;
+    c->avg_pixels_tab[2][2]        = ff_avg_pixels4_y2_8_mmi;
+    c->avg_pixels_tab[2][3]        = ff_avg_pixels4_xy2_8_mmi;
+}
+#endif  // #if HAVE_MMI
+
+void ff_hpeldsp_init_mips(HpelDSPContext *c, int flags)
+{                                    /* MIPS entry point: fill c per build config */
+#if HAVE_MSA
+    ff_hpeldsp_init_msa(c, flags);   /* MSA kernels are installed first */
+#endif  // #if HAVE_MSA
+#if HAVE_MMI
+    ff_hpeldsp_init_mmi(c, flags);   /* runs last: wins over MSA on shared slots */
+#endif  // #if HAVE_MMI
+}
diff --git a/libavcodec/mips/hpeldsp_mips.h b/libavcodec/mips/hpeldsp_mips.h
new file mode 100644
index 0000000..f527c1d
--- /dev/null
+++ b/libavcodec/mips/hpeldsp_mips.h
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_HPELDSP_MIPS_H
+#define AVCODEC_MIPS_HPELDSP_MIPS_H
+
+#include "libavcodec/bit_depth_template.c"
+/* MSA kernels for the HpelDSPContext tables (see hpeldsp_init_mips.c). */
+void ff_put_pixels16_msa(uint8_t *block, const uint8_t *pixels,
+                         ptrdiff_t line_size, int32_t h);
+void ff_put_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+void ff_put_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+void ff_put_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_msa(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+void ff_put_pixels4_msa(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int32_t h);
+void ff_put_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_put_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_put_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
+                                   ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
+                                   ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                                    ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
+                                  ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
+                                  ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                                   ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels16_msa(uint8_t *block, const uint8_t *pixels,
+                         ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels8_msa(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels4_msa(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int32_t h);
+/* MMI two-source (src1, src2) kernels; dst and each source have their own stride. */
+void ff_put_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h);
+void ff_put_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h);
+void ff_put_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h);
+void ff_avg_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h);
+void ff_avg_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h);
+void ff_avg_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h);
+void ff_put_no_rnd_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h);
+void ff_put_no_rnd_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h);
+/* MMI kernels for the HpelDSPContext tables (see hpeldsp_init_mips.c). */
+void ff_put_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+
+#endif  // #ifndef AVCODEC_MIPS_HPELDSP_MIPS_H
diff --git a/libavcodec/mips/hpeldsp_mmi.c b/libavcodec/mips/hpeldsp_mmi.c
new file mode 100644
index 0000000..4c46f00
--- /dev/null
+++ b/libavcodec/mips/hpeldsp_mmi.c
@@ -0,0 +1,1257 @@
+/*
+ * Loongson SIMD optimized hpeldsp
+ *
+ * Copyright (c) 2016 Loongson Technology Corporation Limited
+ * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "hpeldsp_mips.h"
+#include "libavcodec/bit_depth_template.c"
+#include "libavutil/mips/asmdefs.h"
+#include "constants.h"
+
+void ff_put_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    double ftmp[2];         /* FP registers carrying the two rows in flight */
+    mips_reg addr[2];       /* addr0: pixels + line_size, addr1: 2 * line_size */
+    uint64_t low32;         /* GPR scratch; mtc1 forwards its low 32 bits */
+    /* Copy h rows of 4 bytes from pixels to block, both stepping by line_size. */
+    __asm__ volatile (
+        PTR_ADDU   "%[addr1],   %[line_size],   %[line_size]            \n\t"
+        "1:                                                             \n\t"
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
+        "ulw        %[low32],   0x00(%[pixels])                         \n\t"
+        "mtc1       %[low32],   %[ftmp0]                                \n\t"
+        "ulw        %[low32],   0x00(%[addr0])                          \n\t"
+        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        "swc1       %[ftmp0],   0x00(%[block])                          \n\t"
+        "gsswxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
+        PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
+        /* ulw, not uld: a row is 4 bytes, so never fetch 8 bytes from memory. */
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
+        "ulw        %[low32],   0x00(%[pixels])                         \n\t"
+        "mtc1       %[low32],   %[ftmp0]                                \n\t"
+        "ulw        %[low32],   0x00(%[addr0])                          \n\t"
+        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        "swc1       %[ftmp0],   0x00(%[block])                          \n\t"
+        "gsswxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
+        PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
+        /* Four rows per pass: h must be a positive multiple of 4 to terminate. */
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [low32]"=&r"(low32),
+          [block]"+&r"(block),              [pixels]"+&r"(pixels),
+          [h]"+&r"(h)
+        : [line_size]"r"((mips_reg)line_size)
+        : "memory"
+    );
+}
+
+void ff_put_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    double ftmp[2];         /* FP registers carrying the two rows in flight */
+    mips_reg addr[2];       /* addr0: pixels + line_size, addr1: 2 * line_size */
+    /* Copy h rows of 8 bytes from pixels to block, both stepping by line_size. */
+    __asm__ volatile (
+        PTR_ADDU   "%[addr1],   %[line_size],   %[line_size]            \n\t"
+        "1:                                                             \n\t"
+        "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t"
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "sdc1       %[ftmp0],   0x00(%[block])                          \n\t"
+        "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
+        PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
+        /* Rows 2-3, same sequence.  NOTE(review): sdc1 presumably needs block aligned. */
+        "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t"
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "sdc1       %[ftmp0],   0x00(%[block])                          \n\t"
+        "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
+        PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
+        /* Four rows per pass: h must be a positive multiple of 4 to terminate. */
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [block]"+&r"(block),              [pixels]"+&r"(pixels),
+          [h]"+&r"(h)
+        : [line_size]"r"((mips_reg)line_size)
+        : "memory"
+    );
+}
+
+void ff_put_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    double ftmp[4];         /* ftmp0/2: row n bytes 0-7/8-15, ftmp1/3: row n+1 */
+    mips_reg addr[2];       /* addr0: pixels + line_size, addr1: 2 * line_size */
+    /* Copy h rows of 16 bytes from pixels to block, both stepping by line_size. */
+    __asm__ volatile (
+        PTR_ADDU   "%[addr1],   %[line_size],   %[line_size]            \n\t"
+        "1:                                                             \n\t"
+        "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t"
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
+        "gsldlc1    %[ftmp2],   0x0f(%[pixels])                         \n\t"
+        "gsldrc1    %[ftmp2],   0x08(%[pixels])                         \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp3],   0x0f(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp3],   0x08(%[addr0])                          \n\t"
+        "sdc1       %[ftmp0],   0x00(%[block])                          \n\t"
+        "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
+        "sdc1       %[ftmp2],   0x08(%[block])                          \n\t"
+        "gssdxc1    %[ftmp3],   0x08(%[block],  %[line_size])           \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
+        PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
+        /* Rows 2-3, same sequence.  NOTE(review): sdc1 presumably needs block aligned. */
+        "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t"
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
+        "gsldlc1    %[ftmp2],   0x0f(%[pixels])                         \n\t"
+        "gsldrc1    %[ftmp2],   0x08(%[pixels])                         \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp3],   0x0f(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp3],   0x08(%[addr0])                          \n\t"
+        "sdc1       %[ftmp0],   0x00(%[block])                          \n\t"
+        "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
+        "sdc1       %[ftmp2],   0x08(%[block])                          \n\t"
+        "gssdxc1    %[ftmp3],   0x08(%[block],  %[line_size])           \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
+        PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
+        /* Four rows per pass: h must be a positive multiple of 4 to terminate. */
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [block]"+&r"(block),              [pixels]"+&r"(pixels),
+          [h]"+&r"(h)
+        : [line_size]"r"((mips_reg)line_size)
+        : "memory"
+    );
+}
+
+void ff_avg_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    double ftmp[4];         /* ftmp0/1: source rows, ftmp2/3: destination rows */
+    mips_reg addr[3];       /* addr0/1: next row of pixels/block, addr2: 2 * line_size */
+    uint64_t low32;         /* GPR scratch; mtc1 forwards its low 32 bits */
+    /* block = pavgb(pixels, block) for h rows of 4 bytes, stepping by line_size. */
+    __asm__ volatile (
+        PTR_ADDU   "%[addr2],   %[line_size],   %[line_size]            \n\t"
+        "1:                                                             \n\t"
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
+        "ulw        %[low32],   0x00(%[pixels])                         \n\t"
+        "mtc1       %[low32],   %[ftmp0]                                \n\t"
+        "ulw        %[low32],   0x00(%[addr0])                          \n\t"
+        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
+        "ulw        %[low32],   0x00(%[block])                          \n\t"
+        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        "ulw        %[low32],   0x00(%[addr1])                          \n\t"
+        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "swc1       %[ftmp0],   0x00(%[block])                          \n\t"
+        "gsswxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
+        PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"
+        /* ulw, not uld: a row is 4 bytes, so never fetch 8 bytes from memory. */
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
+        "ulw        %[low32],   0x00(%[pixels])                         \n\t"
+        "mtc1       %[low32],   %[ftmp0]                                \n\t"
+        "ulw        %[low32],   0x00(%[addr0])                          \n\t"
+        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
+        "ulw        %[low32],   0x00(%[block])                          \n\t"
+        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        "ulw        %[low32],   0x00(%[addr1])                          \n\t"
+        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "swc1       %[ftmp0],   0x00(%[block])                          \n\t"
+        "gsswxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
+        PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"
+        /* Four rows per pass: h must be a positive multiple of 4 to terminate. */
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),
+          [low32]"=&r"(low32),
+          [block]"+&r"(block),              [pixels]"+&r"(pixels),
+          [h]"+&r"(h)
+        : [line_size]"r"((mips_reg)line_size)
+        : "memory"
+    );
+}
+
+void ff_avg_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    double ftmp[4];         /* ftmp0/1: source rows, ftmp2/3: destination rows */
+    mips_reg addr[3];       /* addr0/1: next row of pixels/block, addr2: 2 * line_size */
+    /* block = pavgb(pixels, block) for h rows of 8 bytes, stepping by line_size. */
+    __asm__ volatile (
+        PTR_ADDU   "%[addr2],   %[line_size],   %[line_size]            \n\t"
+        "1:                                                             \n\t"
+        "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t"
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[block])                          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[block])                          \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "sdc1       %[ftmp0],   0x00(%[block])                          \n\t"
+        "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
+        PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"
+        /* Rows 2-3, same sequence.  NOTE(review): sdc1 presumably needs block aligned. */
+        "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t"
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[block])                          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[block])                          \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "sdc1       %[ftmp0],   0x00(%[block])                          \n\t"
+        "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
+        PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"
+        /* Four rows per pass: h must be a positive multiple of 4 to terminate. */
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),
+          [block]"+&r"(block),              [pixels]"+&r"(pixels),
+          [h]"+&r"(h)
+        : [line_size]"r"((mips_reg)line_size)
+        : "memory"
+    );
+}
+
+void ff_avg_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    double ftmp[8];     /* backs the %[ftmp0..7] FP-register operands */
+    mips_reg addr[3];   /* addr0/addr1: pixels/block + line_size; addr2: 2 * line_size */
+    /* block = pavgb(block, pixels) on 16-byte rows; four rows per loop pass. */
+    __asm__ volatile (
+        PTR_ADDU   "%[addr2],   %[line_size],   %[line_size]            \n\t"
+        "1:                                                             \n\t"
+        "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t"
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
+        "gsldlc1    %[ftmp4],   0x0f(%[pixels])                         \n\t"
+        PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
+        "gsldrc1    %[ftmp4],   0x08(%[pixels])                         \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp5],   0x0f(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp5],   0x08(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[block])                          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[block])                          \n\t"
+        "gsldlc1    %[ftmp6],   0x0f(%[block])                          \n\t"
+        "gsldrc1    %[ftmp6],   0x08(%[block])                          \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        "gsldlc1    %[ftmp7],   0x0f(%[addr1])                          \n\t"
+        "gsldrc1    %[ftmp7],   0x08(%[addr1])                          \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "sdc1       %[ftmp0],   0x00(%[block])                          \n\t"
+        "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
+        "sdc1       %[ftmp4],   0x08(%[block])                          \n\t"
+        "gssdxc1    %[ftmp5],   0x08(%[block],  %[line_size])           \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
+        PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"
+        /* Next two rows: same load / pavgb / store sequence (unrolled). */
+        "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t"
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
+        "gsldlc1    %[ftmp4],   0x0f(%[pixels])                         \n\t"
+        PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
+        "gsldrc1    %[ftmp4],   0x08(%[pixels])                         \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp5],   0x0f(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp5],   0x08(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[block])                          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[block])                          \n\t"
+        "gsldlc1    %[ftmp6],   0x0f(%[block])                          \n\t"
+        "gsldrc1    %[ftmp6],   0x08(%[block])                          \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        "gsldlc1    %[ftmp7],   0x0f(%[addr1])                          \n\t"
+        "gsldrc1    %[ftmp7],   0x08(%[addr1])                          \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "sdc1       %[ftmp0],   0x00(%[block])                          \n\t"
+        "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
+        "sdc1       %[ftmp4],   0x08(%[block])                          \n\t"
+        "gssdxc1    %[ftmp5],   0x08(%[block],  %[line_size])           \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
+        PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"
+        /* Loop exits only when h reaches 0: h must be a positive multiple of 4. */
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),
+          [block]"+&r"(block),              [pixels]"+&r"(pixels),
+          [h]"+&r"(h)
+        : [line_size]"r"((mips_reg)line_size)
+        : "memory"                          /* block[] is updated in place */
+    );
+}
+
+inline void ff_put_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h)
+{
+    double ftmp[4];     /* backs the %[ftmp0..3] FP-register operands */
+    mips_reg addr[5];   /* addr0/1: next src1/src2 row; addr2-4: doubled strides */
+    int32_t low32;      /* GPR staging for one 4-byte row before mtc1 */
+    /* dst = pavgb(src1, src2) on 4-byte rows; four rows per loop pass. */
+    __asm__ volatile (
+        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
+        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
+        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
+        "1:                                                             \n\t"
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        "ulw        %[low32],   0x00(%[src1])                           \n\t"
+        "mtc1       %[low32],   %[ftmp0]                                \n\t"
+        "ulw        %[low32],   0x00(%[addr0])                          \n\t"
+        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        "ulw        %[low32],   0x00(%[src2])                           \n\t"
+        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        "ulw        %[low32],   0x00(%[addr1])                          \n\t"
+        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "swc1       %[ftmp0],   0x00(%[dst])                            \n\t"
+        "gsswxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+        /* Rows 2-3 (unrolled). 32-bit ulw keeps reads inside the 4-byte row. */
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        "ulw        %[low32],   0x00(%[src1])                           \n\t"
+        "mtc1       %[low32],   %[ftmp0]                                \n\t"
+        "ulw        %[low32],   0x00(%[addr0])                          \n\t"
+        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        "ulw        %[low32],   0x00(%[src2])                           \n\t"
+        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        "ulw        %[low32],   0x00(%[addr1])                          \n\t"
+        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "swc1       %[ftmp0],   0x00(%[dst])                            \n\t"
+        "gsswxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+        /* Loop exits only when h reaches 0: h must be a positive multiple of 4. */
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),
+          [low32]"=&r"(low32),
+          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
+          [src2]"+&r"(src2),                [h]"+&r"(h)
+        : [dst_stride]"r"((mips_reg)dst_stride),
+          [src_stride1]"r"((mips_reg)src_stride1),
+          [src_stride2]"r"((mips_reg)src_stride2)
+        : "memory"                          /* dst[] is written through a pointer */
+    );
+}
+
+inline void ff_put_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h)
+{
+    double ftmp[4];     /* backs the %[ftmp0..3] FP-register operands */
+    mips_reg addr[5];   /* addr0/1: next src1/src2 row; addr2-4: doubled strides */
+    /* dst = pavgb(src1, src2) on 8-byte rows; four rows per loop pass. */
+    __asm__ volatile (
+        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
+        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
+        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
+        "1:                                                             \n\t"
+        "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
+        "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+        /* Next two rows: same load / pavgb / store sequence (unrolled). */
+        "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
+        "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+        /* Loop exits only when h reaches 0: h must be a positive multiple of 4. */
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),
+          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
+          [src2]"+&r"(src2),                [h]"+&r"(h)
+        : [dst_stride]"r"((mips_reg)dst_stride),
+          [src_stride1]"r"((mips_reg)src_stride1),
+          [src_stride2]"r"((mips_reg)src_stride2)
+        : "memory"                          /* dst[] is written through a pointer */
+    );
+}
+
+inline void ff_put_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h)
+{
+    double ftmp[8];     /* backs the %[ftmp0..7] FP-register operands */
+    mips_reg addr[5];   /* addr0/1: next src1/src2 row; addr2-4: doubled strides */
+    /* dst = pavgb(src1, src2) on 16-byte rows; four rows per loop pass. */
+    __asm__ volatile (
+        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
+        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
+        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
+        "1:                                                             \n\t"
+        "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
+        "gsldlc1    %[ftmp4],   0x0f(%[src1])                           \n\t"
+        "gsldrc1    %[ftmp4],   0x08(%[src1])                           \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp5],   0x0f(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp5],   0x08(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
+        "gsldlc1    %[ftmp6],   0x0f(%[src2])                           \n\t"
+        "gsldrc1    %[ftmp6],   0x08(%[src2])                           \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        "gsldlc1    %[ftmp7],   0x0f(%[addr1])                          \n\t"
+        "gsldrc1    %[ftmp7],   0x08(%[addr1])                          \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
+        "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        "sdc1       %[ftmp4],   0x08(%[dst])                            \n\t"
+        "gssdxc1    %[ftmp5],   0x08(%[dst],    %[dst_stride])          \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+        /* Next two rows: same load / pavgb / store sequence (unrolled). */
+        "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
+        "gsldlc1    %[ftmp4],   0x0f(%[src1])                           \n\t"
+        "gsldrc1    %[ftmp4],   0x08(%[src1])                           \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp5],   0x0f(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp5],   0x08(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
+        "gsldlc1    %[ftmp6],   0x0f(%[src2])                           \n\t"
+        "gsldrc1    %[ftmp6],   0x08(%[src2])                           \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        "gsldlc1    %[ftmp7],   0x0f(%[addr1])                          \n\t"
+        "gsldrc1    %[ftmp7],   0x08(%[addr1])                          \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
+        "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        "sdc1       %[ftmp4],   0x08(%[dst])                            \n\t"
+        "gssdxc1    %[ftmp5],   0x08(%[dst],    %[dst_stride])          \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+        /* Loop exits only when h reaches 0: h must be a positive multiple of 4. */
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),
+          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
+          [src2]"+&r"(src2),                [h]"+&r"(h)
+        : [dst_stride]"r"((mips_reg)dst_stride),
+          [src_stride1]"r"((mips_reg)src_stride1),
+          [src_stride2]"r"((mips_reg)src_stride2)
+        : "memory"                          /* dst[] is written through a pointer */
+    );
+}
+
+inline void ff_avg_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h)
+{
+    double ftmp[6];     /* backs the %[ftmp0..5] FP-register operands */
+    mips_reg addr[6];   /* addr0/1/5: next src1/src2/dst row; addr2-4: doubled strides */
+    int32_t low32;      /* GPR staging for one 4-byte row before mtc1 */
+    /* dst = pavgb(dst, pavgb(src1, src2)) on 4-byte rows; four rows per pass. */
+    __asm__ volatile (
+        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
+        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
+        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
+        "1:                                                             \n\t"
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        "ulw        %[low32],   0x00(%[src1])                           \n\t"
+        "mtc1       %[low32],   %[ftmp0]                                \n\t"
+        "ulw        %[low32],   0x00(%[addr0])                          \n\t"
+        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        "ulw        %[low32],   0x00(%[src2])                           \n\t"
+        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        "ulw        %[low32],   0x00(%[addr1])                          \n\t"
+        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        PTR_ADDU   "%[addr5],   %[dst],         %[dst_stride]           \n\t"
+        "ulw        %[low32],   0x00(%[dst])                            \n\t"
+        "mtc1       %[low32],   %[ftmp4]                                \n\t"
+        "ulw        %[low32],   0x00(%[addr5])                          \n\t"
+        "mtc1       %[low32],   %[ftmp5]                                \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "swc1       %[ftmp0],   0x00(%[dst])                            \n\t"
+        "gsswxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+        /* Rows 2-3 (unrolled). 32-bit ulw keeps reads inside the 4-byte row. */
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        "ulw        %[low32],   0x00(%[src1])                           \n\t"
+        "mtc1       %[low32],   %[ftmp0]                                \n\t"
+        "ulw        %[low32],   0x00(%[addr0])                          \n\t"
+        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        "ulw        %[low32],   0x00(%[src2])                           \n\t"
+        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        "ulw        %[low32],   0x00(%[addr1])                          \n\t"
+        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        PTR_ADDU   "%[addr5],   %[dst],         %[dst_stride]           \n\t"
+        "ulw        %[low32],   0x00(%[dst])                            \n\t"
+        "mtc1       %[low32],   %[ftmp4]                                \n\t"
+        "ulw        %[low32],   0x00(%[addr5])                          \n\t"
+        "mtc1       %[low32],   %[ftmp5]                                \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "swc1       %[ftmp0],   0x00(%[dst])                            \n\t"
+        "gsswxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+        /* Loop exits only when h reaches 0: h must be a positive multiple of 4. */
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
+          [low32]"=&r"(low32),
+          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
+          [src2]"+&r"(src2),                [h]"+&r"(h)
+        : [dst_stride]"r"((mips_reg)dst_stride),
+          [src_stride1]"r"((mips_reg)src_stride1),
+          [src_stride2]"r"((mips_reg)src_stride2)
+        : "memory"                          /* dst[] is updated in place */
+    );
+}
+
+inline void ff_avg_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h)
+{
+    double ftmp[6];     /* backs the %[ftmp0..5] FP-register operands */
+    mips_reg addr[6];   /* addr0/1/5: next src1/src2/dst row; addr2-4: doubled strides */
+    /* dst = pavgb(dst, pavgb(src1, src2)) on 8-byte rows; four rows per pass. */
+    __asm__ volatile (
+        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
+        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
+        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
+        "1:                                                             \n\t"
+        "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        PTR_ADDU   "%[addr5],   %[dst],         %[dst_stride]           \n\t"
+        "gsldlc1    %[ftmp4],   0x07(%[dst])                            \n\t"
+        "gsldrc1    %[ftmp4],   0x00(%[dst])                            \n\t"
+        "gsldlc1    %[ftmp5],   0x07(%[addr5])                          \n\t"
+        "gsldrc1    %[ftmp5],   0x00(%[addr5])                          \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
+        "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+        /* Next two rows: same load / pavgb / store sequence (unrolled). */
+        "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        PTR_ADDU   "%[addr5],   %[dst],         %[dst_stride]           \n\t"
+        "gsldlc1    %[ftmp4],   0x07(%[dst])                            \n\t"
+        "gsldrc1    %[ftmp4],   0x00(%[dst])                            \n\t"
+        "gsldlc1    %[ftmp5],   0x07(%[addr5])                          \n\t"
+        "gsldrc1    %[ftmp5],   0x00(%[addr5])                          \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
+        "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+        /* Loop exits only when h reaches 0: h must be a positive multiple of 4. */
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
+          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
+          [src2]"+&r"(src2),                [h]"+&r"(h)
+        : [dst_stride]"r"((mips_reg)dst_stride),
+          [src_stride1]"r"((mips_reg)src_stride1),
+          [src_stride2]"r"((mips_reg)src_stride2)
+        : "memory"                          /* dst[] is updated in place */
+    );
+}
+
+inline void ff_avg_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h)
+{
+    int col; /* byte offset of the 8-wide column: left (0), then right (8) */
+    for (col = 0; col <= 8; col += 8)
+        ff_avg_pixels8_l2_8_mmi(dst + col, src1 + col, src2 + col,
+                dst_stride, src_stride1, src_stride2, h);
+}
+
+void ff_put_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    const uint8_t *right = pixels + 1; /* second source: one byte further on */
+    ff_put_pixels4_l2_8_mmi(block, pixels, right, line_size, line_size, line_size, h);
+}
+
+void ff_put_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    const uint8_t *right = pixels + 1; /* second source: one byte further on */
+    ff_put_pixels8_l2_8_mmi(block, pixels, right, line_size, line_size, line_size, h);
+}
+
+void ff_put_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    const uint8_t *right = pixels + 1; /* second source: one byte further on */
+    ff_put_pixels16_l2_8_mmi(block, pixels, right, line_size, line_size, line_size, h);
+}
+
+void ff_avg_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    const uint8_t *right = pixels + 1; /* second source: one byte further on */
+    ff_avg_pixels4_l2_8_mmi(block, pixels, right, line_size, line_size, line_size, h);
+}
+
+void ff_avg_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    const uint8_t *right = pixels + 1; /* second source: one byte further on */
+    ff_avg_pixels8_l2_8_mmi(block, pixels, right, line_size, line_size, line_size, h);
+}
+
+void ff_avg_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    const uint8_t *right = pixels + 1; /* second source: one byte further on */
+    ff_avg_pixels16_l2_8_mmi(block, pixels, right, line_size, line_size, line_size, h);
+}
+
+/*
+ * Combine two 8-byte-wide sources into dst, row by row, using Loongson MMI
+ * inline assembly.
+ *
+ * For every row, 8 bytes are loaded from src1 and from src2 with the
+ * gsldlc1/gsldrc1 pair, both are XORed with an all-ones register, combined
+ * with pavgb, and the result is XORed with all-ones again before being
+ * stored.  Assuming pavgb rounds up (as its x86 namesake does -- TODO
+ * confirm against the Loongson manual), complementing inputs and output
+ * turns it into a round-down average, which matches the "no_rnd" name.
+ *
+ * dst         destination; written with sdc1 / gssdxc1
+ *             (NOTE(review): presumably requires 8-byte alignment -- confirm)
+ * src1, src2  the two sources
+ * dst_stride, src_stride1, src_stride2
+ *             byte distance between consecutive rows of each buffer
+ * h           row count.  The loop body handles 4 rows, subtracts 4 from h
+ *             and repeats while h != 0, so h has to be a positive multiple
+ *             of 4 for the loop to terminate.
+ */
+inline void ff_put_no_rnd_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h)
+{
+    double ftmp[5];
+    mips_reg addr[5];
+
+    __asm__ volatile (
+        /* ftmp4 = all ones; addr2/addr3/addr4 = twice the src1/src2/dst
+         * stride, used to step each pointer by two rows. */
+        "pcmpeqb    %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
+        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
+        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
+        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
+        "1:                                                             \n\t"
+        /* First pair of rows: ftmp0/ftmp1 <- src1 rows, ftmp2/ftmp3 <- src2
+         * rows; addr0/addr1 point at the second row of each source. */
+        "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        /* Complement, pavgb, complement back. */
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "xor        %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
+        "xor        %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "xor        %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "xor        %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
+        /* Store the two rows at dst and dst + dst_stride, then step src2
+         * and dst by two rows. */
+        "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
+        "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+
+        /* Second pair of rows: same sequence, unrolled. */
+        "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "xor        %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
+        "xor        %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "xor        %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "xor        %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
+        "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
+        "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+
+        /* Four rows done; loop until the counter is exactly zero. */
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        /* All scratch registers are early-clobber outputs; dst, src1, src2
+         * and h are modified in place by the loop. */
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),
+          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
+          [src2]"+&r"(src2),                [h]"+&r"(h)
+        : [dst_stride]"r"((mips_reg)dst_stride),
+          [src_stride1]"r"((mips_reg)src_stride1),
+          [src_stride2]"r"((mips_reg)src_stride2)
+        : "memory"
+    );
+}
+
+/*
+ * 8-wide "x2" no-rounding put: forwards to ff_put_no_rnd_pixels8_l2_8_mmi
+ * with pixels and pixels + 1 as the two sources; line_size is used for the
+ * destination stride and both source strides.
+ */
+void ff_put_no_rnd_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    const uint8_t *right = pixels + 1;
+
+    ff_put_no_rnd_pixels8_l2_8_mmi(block, pixels, right,
+                                   line_size, line_size, line_size, h);
+}
+
+/*
+ * 16-wide "x2" no-rounding put: applies ff_put_no_rnd_pixels8_x2_8_mmi to
+ * the left 8 columns and then to the right 8 columns.
+ */
+void ff_put_no_rnd_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    int off;
+
+    for (off = 0; off < 16; off += 8)
+        ff_put_no_rnd_pixels8_x2_8_mmi(block + off, pixels + off,
+                                       line_size, h);
+}
+
+/*
+ * 4-wide "y2" put: forwards to ff_put_pixels4_l2_8_mmi with pixels and the
+ * row below it (pixels + line_size) as the two sources and line_size for
+ * all three stride arguments.
+ */
+void ff_put_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    const uint8_t *below = pixels + line_size;
+
+    ff_put_pixels4_l2_8_mmi(block, pixels, below,
+                            line_size, line_size, line_size, h);
+}
+
+/*
+ * 8-wide "y2" put: forwards to ff_put_pixels8_l2_8_mmi with pixels and the
+ * row below it (pixels + line_size) as the two sources and line_size for
+ * all three stride arguments.
+ */
+void ff_put_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    const uint8_t *below = pixels + line_size;
+
+    ff_put_pixels8_l2_8_mmi(block, pixels, below,
+                            line_size, line_size, line_size, h);
+}
+
+/*
+ * 16-wide "y2" put: forwards to ff_put_pixels16_l2_8_mmi with pixels and
+ * the row below it (pixels + line_size) as the two sources and line_size
+ * for all three stride arguments.
+ */
+void ff_put_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    const uint8_t *below = pixels + line_size;
+
+    ff_put_pixels16_l2_8_mmi(block, pixels, below,
+                             line_size, line_size, line_size, h);
+}
+
+/*
+ * 4-wide "y2" avg: forwards to ff_avg_pixels4_l2_8_mmi with pixels and the
+ * row below it (pixels + line_size) as the two sources and line_size for
+ * all three stride arguments.
+ */
+void ff_avg_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    const uint8_t *below = pixels + line_size;
+
+    ff_avg_pixels4_l2_8_mmi(block, pixels, below,
+                            line_size, line_size, line_size, h);
+}
+
+/*
+ * 8-wide "y2" avg: forwards to ff_avg_pixels8_l2_8_mmi with pixels and the
+ * row below it (pixels + line_size) as the two sources and line_size for
+ * all three stride arguments.
+ */
+void ff_avg_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    const uint8_t *below = pixels + line_size;
+
+    ff_avg_pixels8_l2_8_mmi(block, pixels, below,
+                            line_size, line_size, line_size, h);
+}
+
+/*
+ * 16-wide "y2" avg: applies ff_avg_pixels8_y2_8_mmi to the left 8 columns
+ * and then to the right 8 columns.
+ */
+void ff_avg_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    int off;
+
+    for (off = 0; off < 16; off += 8)
+        ff_avg_pixels8_y2_8_mmi(block + off, pixels + off, line_size, h);
+}
+
+/*
+ * 8-wide "y2" no-rounding put: forwards to ff_put_no_rnd_pixels8_l2_8_mmi
+ * with pixels and the row below it (pixels + line_size) as the two sources;
+ * line_size is used for the destination stride and both source strides.
+ */
+void ff_put_no_rnd_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    const uint8_t *below = pixels + line_size;
+
+    ff_put_no_rnd_pixels8_l2_8_mmi(block, pixels, below,
+                                   line_size, line_size, line_size, h);
+}
+
+/*
+ * 16-wide "y2" no-rounding put: applies ff_put_no_rnd_pixels8_y2_8_mmi to
+ * the left 8 columns and then to the right 8 columns.
+ */
+void ff_put_no_rnd_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    int off;
+
+    for (off = 0; off < 16; off += 8)
+        ff_put_no_rnd_pixels8_y2_8_mmi(block + off, pixels + off,
+                                       line_size, h);
+}
+
+/*
+ * 4-pixel-wide "xy2" put in plain C.  Every output byte is
+ *     (p[x] + p[x + 1] + q[x] + q[x + 1] + 2) >> 2
+ * where p and q are two vertically adjacent source rows.
+ *
+ * Four pixels are processed at once in a 32-bit word.  Each byte is split
+ * into its low 2 bits and its high 6 bits (shifted right by 2), so the
+ * per-byte sums never carry into a neighbouring byte.  The horizontal sums
+ * of a row are kept and reused for the next output row.
+ *
+ * Two rows are produced per loop iteration, so an odd h still writes h + 1
+ * rows.  block is advanced by line_size as well and is written through a
+ * uint32_t pointer.
+ */
+void ff_put_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    /* FIXME HIGH BIT DEPTH */
+    uint32_t left, right;
+    uint32_t lo_a, hi_a;    /* row sums; lo_a carries the +2 rounding bias */
+    uint32_t lo_b, hi_b;    /* row sums without bias */
+    int rows_left;
+
+    /* Prime the pipeline with the horizontal sums of the first row. */
+    left    = AV_RN32(pixels);
+    right   = AV_RN32(pixels + 1);
+    lo_a    = (left & 0x03030303UL) + (right & 0x03030303UL) + 0x02020202UL;
+    hi_a    = ((left & 0xFCFCFCFCUL) >> 2) + ((right & 0xFCFCFCFCUL) >> 2);
+    pixels += line_size;
+
+    for (rows_left = h; rows_left > 0; rows_left -= 2) {
+        /* First output row of the pair: previous (biased) row + this row. */
+        left  = AV_RN32(pixels);
+        right = AV_RN32(pixels + 1);
+        lo_b  = (left & 0x03030303UL) + (right & 0x03030303UL);
+        hi_b  = ((left & 0xFCFCFCFCUL) >> 2) + ((right & 0xFCFCFCFCUL) >> 2);
+        *((uint32_t *) block) = hi_a + hi_b +
+                                (((lo_a + lo_b) >> 2) & 0x0F0F0F0FUL);
+        pixels += line_size;
+        block  += line_size;
+
+        /* Second output row: this row's sums (with bias) + the row above. */
+        left  = AV_RN32(pixels);
+        right = AV_RN32(pixels + 1);
+        lo_a  = (left & 0x03030303UL) + (right & 0x03030303UL) + 0x02020202UL;
+        hi_a  = ((left & 0xFCFCFCFCUL) >> 2) + ((right & 0xFCFCFCFCUL) >> 2);
+        *((uint32_t *) block) = hi_a + hi_b +
+                                (((lo_a + lo_b) >> 2) & 0x0F0F0F0FUL);
+        pixels += line_size;
+        block  += line_size;
+    }
+}
+
+/*
+ * 8-pixel-wide "xy2" put.  The active implementation (#if 1) is Loongson MMI
+ * inline assembly; the #else branch is a disabled plain-C version that
+ * handles the 8 columns as two 4-byte halves.
+ *
+ * Assembly outline: each source row is loaded twice, at byte offsets 0 and 1,
+ * widened from bytes to halfwords and added, giving per-pixel horizontal
+ * sums.  The sums of two consecutive rows plus a bias of 2 are added,
+ * shifted right by 2, packed back to bytes and stored.  The horizontal sums
+ * of the current row are kept in registers for the next output row.
+ *
+ * block      destination, indexed with the running row offset in addr0
+ * pixels     source; h + 1 rows are read, 9 bytes from each
+ * line_size  byte stride used for both block and pixels
+ * h          row count.  Two rows are written per iteration and the loop
+ *            repeats while h != 0 after subtracting 2, so h has to be a
+ *            positive even number for the loop to terminate.
+ */
+void ff_put_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+#if 1
+    double ftmp[10];
+    mips_reg addr[2];
+
+    __asm__ volatile (
+        /* ftmp7 = 0 (partner for the byte->halfword unpacks).
+         * ftmp6 = 0x0002 in every halfword: all ones, >> 15, << 1. */
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "dli        %[addr0],   0x0f                                    \n\t"
+        "pcmpeqw    %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "dmtc1      %[addr0],   %[ftmp8]                                \n\t"
+        "dli        %[addr0],   0x01                                    \n\t"
+        "psrlh      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "dmtc1      %[addr0],   %[ftmp8]                                \n\t"
+        "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+
+        /* ftmp9 = shift count 2.  Load row 0 at offsets 0 and 1 and form its
+         * horizontal sums in ftmp4 (low four pixels) / ftmp5 (high four). */
+        "dli        %[addr0],   0x02                                    \n\t"
+        "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
+        "dmtc1      %[addr0],   %[ftmp9]                                \n\t"
+        "gsldlc1    %[ftmp4],   0x08(%[pixels])                         \n\t"
+        "gsldrc1    %[ftmp4],   0x01(%[pixels])                         \n\t"
+        "mov.d      %[ftmp1],   %[ftmp0]                                \n\t"
+        "mov.d      %[ftmp5],   %[ftmp4]                                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
+        "punpckhbh  %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "punpckhbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "paddush    %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "paddush    %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+        /* addr0 becomes the running byte offset of the current output row;
+         * pixels is moved to row 1 so pixels + addr0 is the row below. */
+        "xor        %[addr0],   %[addr0],       %[addr0]                \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
+        ".p2align   3                                                   \n\t"
+        "1:                                                             \n\t"
+        /* First row of the pair: new row sums go to ftmp0/ftmp1, the result
+         * is accumulated in ftmp4/ftmp5 (previous row sums + 2 + new). */
+        PTR_ADDU   "%[addr1],   %[pixels],      %[addr0]                \n\t"
+        "gsldlc1    %[ftmp0],   0x07(%[addr1])                          \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x08(%[addr1])                          \n\t"
+        "gsldrc1    %[ftmp2],   0x01(%[addr1])                          \n\t"
+        "mov.d      %[ftmp1],   %[ftmp0]                                \n\t"
+        "mov.d      %[ftmp3],   %[ftmp2]                                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "punpckhbh  %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "punpckhbh  %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "paddush    %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "paddush    %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "paddush    %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "paddush    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "paddush    %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "paddush    %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+        "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
+        "psrlh      %[ftmp5],   %[ftmp5],       %[ftmp9]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "gssdxc1    %[ftmp4],   0x00(%[block],  %[addr0])               \n\t"
+        /* Second row of the pair: roles swapped -- new row sums go to
+         * ftmp4/ftmp5 and the result is accumulated in ftmp0/ftmp1. */
+        PTR_ADDU   "%[addr0],   %[addr0],       %[line_size]            \n\t"
+        PTR_ADDU   "%[addr1],   %[pixels],      %[addr0]                \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[addr1])                          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[addr1])                          \n\t"
+        "gsldlc1    %[ftmp4],   0x08(%[addr1])                          \n\t"
+        "gsldrc1    %[ftmp4],   0x01(%[addr1])                          \n\t"
+        "mov.d      %[ftmp3],   %[ftmp2]                                \n\t"
+        "mov.d      %[ftmp5],   %[ftmp4]                                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
+        "punpckhbh  %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "punpckhbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "paddush    %[ftmp4],   %[ftmp4],       %[ftmp2]                \n\t"
+        "paddush    %[ftmp5],   %[ftmp5],       %[ftmp3]                \n\t"
+        "paddush    %[ftmp0],   %[ftmp0],       %[ftmp6]                \n\t"
+        "paddush    %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
+        "paddush    %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "paddush    %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "psrlh      %[ftmp0],   %[ftmp0],       %[ftmp9]                \n\t"
+        "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp9]                \n\t"
+        "packushb   %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "gssdxc1    %[ftmp0],   0x00(%[block],  %[addr0])               \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[line_size]            \n\t"
+        /* NOTE(review): the row counter is decremented with PTR_ADDU and an
+         * immediate operand, whereas ff_put_no_rnd_pixels8_l2_8_mmi in this
+         * file uses PTR_ADDI for the same job -- confirm the assembler
+         * accepts this form on all targets. */
+        PTR_ADDU   "%[h],       %[h],           -0x02                   \n\t"
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [h]"+&r"(h),                      [pixels]"+&r"(pixels)
+        : [block]"r"(block),                [line_size]"r"((mips_reg)line_size)
+        : "memory"
+    );
+#else
+    /* Disabled C version: same 32-bit-word arithmetic as
+     * ff_put_pixels4_xy2_8_mmi, run once per 4-byte column half. */
+    /* FIXME HIGH BIT DEPTH */
+    int j;
+
+    for (j = 0; j < 2; j++) {
+        int i;
+        const uint32_t a = AV_RN32(pixels);
+        const uint32_t b = AV_RN32(pixels + 1);
+        uint32_t l0 = (a & 0x03030303UL) +
+                      (b & 0x03030303UL) +
+                           0x02020202UL;
+        uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
+                      ((b & 0xFCFCFCFCUL) >> 2);
+        uint32_t l1, h1;
+
+        pixels += line_size;
+        for (i = 0; i < h; i += 2) {
+            uint32_t a = AV_RN32(pixels);
+            uint32_t b = AV_RN32(pixels + 1);
+            l1 = (a & 0x03030303UL) +
+                 (b & 0x03030303UL);
+            h1 = ((a & 0xFCFCFCFCUL) >> 2) +
+                 ((b & 0xFCFCFCFCUL) >> 2);
+            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
+            pixels += line_size;
+            block  += line_size;
+            a  = AV_RN32(pixels);
+            b  = AV_RN32(pixels + 1);
+            l0 = (a & 0x03030303UL) +
+                 (b & 0x03030303UL) +
+                      0x02020202UL;
+            h0 = ((a & 0xFCFCFCFCUL) >> 2) +
+                 ((b & 0xFCFCFCFCUL) >> 2);
+            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
+            pixels += line_size;
+            block  += line_size;
+        }
+        pixels += 4 - line_size * (h + 1);
+        block  += 4 - line_size * h;
+    }
+#endif
+}
+
+/*
+ * 16-wide "xy2" put: applies ff_put_pixels8_xy2_8_mmi to the left 8 columns
+ * and then to the right 8 columns.
+ */
+void ff_put_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    int off;
+
+    for (off = 0; off < 16; off += 8)
+        ff_put_pixels8_xy2_8_mmi(block + off, pixels + off, line_size, h);
+}
+
+/*
+ * 4-pixel-wide "xy2" avg in plain C.  For every output byte the value
+ *     (p[x] + p[x + 1] + q[x] + q[x + 1] + 2) >> 2
+ * (p, q: two vertically adjacent source rows) is computed and then combined
+ * with the byte already in block through rnd_avg32() (defined elsewhere;
+ * presumably a per-byte rounded average -- confirm).
+ *
+ * Four pixels are processed at once in a 32-bit word; bytes are split into
+ * low 2 bits and high 6 bits so per-byte sums cannot carry into neighbours.
+ * Two rows are produced per loop iteration, so an odd h still touches h + 1
+ * rows.  block is read and written through a uint32_t pointer.
+ */
+void ff_avg_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    /* FIXME HIGH BIT DEPTH */
+    uint32_t left, right;
+    uint32_t lo_a, hi_a;    /* row sums; lo_a carries the +2 rounding bias */
+    uint32_t lo_b, hi_b;    /* row sums without bias */
+    int rows_left;
+
+    /* Prime the pipeline with the horizontal sums of the first row. */
+    left    = AV_RN32(pixels);
+    right   = AV_RN32(pixels + 1);
+    lo_a    = (left & 0x03030303UL) + (right & 0x03030303UL) + 0x02020202UL;
+    hi_a    = ((left & 0xFCFCFCFCUL) >> 2) + ((right & 0xFCFCFCFCUL) >> 2);
+    pixels += line_size;
+
+    for (rows_left = h; rows_left > 0; rows_left -= 2) {
+        /* First output row of the pair. */
+        left  = AV_RN32(pixels);
+        right = AV_RN32(pixels + 1);
+        lo_b  = (left & 0x03030303UL) + (right & 0x03030303UL);
+        hi_b  = ((left & 0xFCFCFCFCUL) >> 2) + ((right & 0xFCFCFCFCUL) >> 2);
+        *((uint32_t *) block) =
+            rnd_avg32(*((uint32_t *) block),
+                      hi_a + hi_b + (((lo_a + lo_b) >> 2) & 0x0F0F0F0FUL));
+        pixels += line_size;
+        block  += line_size;
+
+        /* Second output row of the pair. */
+        left  = AV_RN32(pixels);
+        right = AV_RN32(pixels + 1);
+        lo_a  = (left & 0x03030303UL) + (right & 0x03030303UL) + 0x02020202UL;
+        hi_a  = ((left & 0xFCFCFCFCUL) >> 2) + ((right & 0xFCFCFCFCUL) >> 2);
+        *((uint32_t *) block) =
+            rnd_avg32(*((uint32_t *) block),
+                      hi_a + hi_b + (((lo_a + lo_b) >> 2) & 0x0F0F0F0FUL));
+        pixels += line_size;
+        block  += line_size;
+    }
+}
+
+/*
+ * 8-pixel-wide "xy2" avg in plain C, done as two passes over 4-byte-wide
+ * column halves.  Within a pass, every output byte is
+ *     (p[x] + p[x + 1] + q[x] + q[x + 1] + 2) >> 2
+ * (p, q: two vertically adjacent source rows), combined with the byte
+ * already in block through rnd_avg32() (defined elsewhere; presumably a
+ * per-byte rounded average -- confirm).
+ *
+ * After each pass both pointers are moved 4 bytes right and rewound by
+ * h + 1 (pixels) and h (block) rows; that lands on the top of the next half
+ * only when h is even, because the inner loop always advances in steps of
+ * two rows.
+ */
+void ff_avg_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    /* FIXME HIGH BIT DEPTH */
+    int half;
+
+    for (half = 0; half < 2; half++) {
+        uint32_t left, right;
+        uint32_t lo_a, hi_a;    /* lo_a carries the +2 rounding bias */
+        uint32_t lo_b, hi_b;
+        int rows_left;
+
+        /* Horizontal sums of the first row of this half. */
+        left    = AV_RN32(pixels);
+        right   = AV_RN32(pixels + 1);
+        lo_a    = (left & 0x03030303UL) + (right & 0x03030303UL) +
+                  0x02020202UL;
+        hi_a    = ((left & 0xFCFCFCFCUL) >> 2) + ((right & 0xFCFCFCFCUL) >> 2);
+        pixels += line_size;
+
+        for (rows_left = h; rows_left > 0; rows_left -= 2) {
+            /* First output row of the pair. */
+            left  = AV_RN32(pixels);
+            right = AV_RN32(pixels + 1);
+            lo_b  = (left & 0x03030303UL) + (right & 0x03030303UL);
+            hi_b  = ((left & 0xFCFCFCFCUL) >> 2) +
+                    ((right & 0xFCFCFCFCUL) >> 2);
+            *((uint32_t *) block) =
+                rnd_avg32(*((uint32_t *) block),
+                          hi_a + hi_b + (((lo_a + lo_b) >> 2) & 0x0F0F0F0FUL));
+            pixels += line_size;
+            block  += line_size;
+
+            /* Second output row of the pair. */
+            left  = AV_RN32(pixels);
+            right = AV_RN32(pixels + 1);
+            lo_a  = (left & 0x03030303UL) + (right & 0x03030303UL) +
+                    0x02020202UL;
+            hi_a  = ((left & 0xFCFCFCFCUL) >> 2) +
+                    ((right & 0xFCFCFCFCUL) >> 2);
+            *((uint32_t *) block) =
+                rnd_avg32(*((uint32_t *) block),
+                          hi_a + hi_b + (((lo_a + lo_b) >> 2) & 0x0F0F0F0FUL));
+            pixels += line_size;
+            block  += line_size;
+        }
+
+        /* Step to the next 4-byte half and rewind to its first row. */
+        pixels += 4 - line_size * (h + 1);
+        block  += 4 - line_size * h;
+    }
+}
+
+/*
+ * 16-wide "xy2" avg: applies ff_avg_pixels8_xy2_8_mmi to the left 8 columns
+ * and then to the right 8 columns.
+ */
+void ff_avg_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    int off;
+
+    for (off = 0; off < 16; off += 8)
+        ff_avg_pixels8_xy2_8_mmi(block + off, pixels + off, line_size, h);
+}
+
+/*
+ * 8-pixel-wide "xy2" no-rounding put in plain C, done as two passes over
+ * 4-byte-wide column halves.  Every output byte is
+ *     (p[x] + p[x + 1] + q[x] + q[x + 1] + 1) >> 2
+ * where p and q are two vertically adjacent source rows; the bias is 1
+ * here, where the rounding xy2 functions in this file use 2.
+ *
+ * After each pass both pointers are moved 4 bytes right and rewound by
+ * h + 1 (pixels) and h (block) rows; that lands on the top of the next half
+ * only when h is even, because the inner loop always advances in steps of
+ * two rows.  block is written through a uint32_t pointer.
+ */
+void ff_put_no_rnd_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    /* FIXME HIGH BIT DEPTH */
+    int half;
+
+    for (half = 0; half < 2; half++) {
+        uint32_t left, right;
+        uint32_t lo_a, hi_a;    /* lo_a carries the +1 bias */
+        uint32_t lo_b, hi_b;
+        int rows_left;
+
+        /* Horizontal sums of the first row of this half. */
+        left    = AV_RN32(pixels);
+        right   = AV_RN32(pixels + 1);
+        lo_a    = (left & 0x03030303UL) + (right & 0x03030303UL) +
+                  0x01010101UL;
+        hi_a    = ((left & 0xFCFCFCFCUL) >> 2) + ((right & 0xFCFCFCFCUL) >> 2);
+        pixels += line_size;
+
+        for (rows_left = h; rows_left > 0; rows_left -= 2) {
+            /* First output row of the pair. */
+            left  = AV_RN32(pixels);
+            right = AV_RN32(pixels + 1);
+            lo_b  = (left & 0x03030303UL) + (right & 0x03030303UL);
+            hi_b  = ((left & 0xFCFCFCFCUL) >> 2) +
+                    ((right & 0xFCFCFCFCUL) >> 2);
+            *((uint32_t *) block) = hi_a + hi_b +
+                                    (((lo_a + lo_b) >> 2) & 0x0F0F0F0FUL);
+            pixels += line_size;
+            block  += line_size;
+
+            /* Second output row of the pair. */
+            left  = AV_RN32(pixels);
+            right = AV_RN32(pixels + 1);
+            lo_a  = (left & 0x03030303UL) + (right & 0x03030303UL) +
+                    0x01010101UL;
+            hi_a  = ((left & 0xFCFCFCFCUL) >> 2) +
+                    ((right & 0xFCFCFCFCUL) >> 2);
+            *((uint32_t *) block) = hi_a + hi_b +
+                                    (((lo_a + lo_b) >> 2) & 0x0F0F0F0FUL);
+            pixels += line_size;
+            block  += line_size;
+        }
+
+        /* Step to the next 4-byte half and rewind to its first row. */
+        pixels += 4 - line_size * (h + 1);
+        block  += 4 - line_size * h;
+    }
+}
+
+/*
+ * 16-wide "xy2" no-rounding put: applies ff_put_no_rnd_pixels8_xy2_8_mmi to
+ * the left 8 columns and then to the right 8 columns.
+ */
+void ff_put_no_rnd_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    int off;
+
+    for (off = 0; off < 16; off += 8)
+        ff_put_no_rnd_pixels8_xy2_8_mmi(block + off, pixels + off,
+                                        line_size, h);
+}
diff --git a/libavcodec/mips/hpeldsp_msa.c b/libavcodec/mips/hpeldsp_msa.c
new file mode 100644
index 0000000..40a0dca
--- /dev/null
+++ b/libavcodec/mips/hpeldsp_msa.c
@@ -0,0 +1,1498 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "libavcodec/mips/hpeldsp_mips.h"
+
+/*
+ * Pack in0 and in1 into a single byte vector with __msa_pckev_b, average
+ * that vector with dst using __msa_aver_u_b, and store the result at pdst
+ * through ST_UB.
+ *
+ * The expansion is a bare { } block that declares tmp_m, so arguments must
+ * not be named tmp_m.
+ */
+#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)                  \
+{                                                             \
+    v16u8 tmp_m;                                              \
+                                                              \
+    tmp_m = (v16u8) __msa_pckev_b((v16i8) in0, (v16i8) in1);  \
+    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst);               \
+    ST_UB(tmp_m, (pdst));                                     \
+}
+
+/*
+ * Pack the four input pairs (in0,in1) (in2,in3) (in4,in5) (in6,in7) into
+ * four byte vectors with PCKEV_B4_SB and store them with ST_SB4, starting
+ * at pdst and using stride between stores.
+ *
+ * pdst is evaluated once into the local pdst_m.  The expansion is a bare
+ * { } block, so arguments must not reuse the *_m local names.
+ */
+#define PCKEV_ST_SB4(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
+{                                                                           \
+    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
+    uint8_t *pdst_m = (uint8_t *) (pdst);                                   \
+                                                                            \
+    PCKEV_B4_SB(in0, in1, in2, in3, in4, in5, in6, in7,                     \
+                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                            \
+    ST_SB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst_m, stride);                 \
+}
+
+/*
+ * Combine four computed rows (in1..in4) with four destination rows
+ * (dst0..dst3) and store the result as an 8x4 block.
+ *
+ * in1/in2 and in3/in4 are packed pairwise with PCKEV_B2_UB, dst0/dst1 and
+ * dst2/dst3 with PCKEV_D2_UB; the packed inputs are averaged against the
+ * packed destinations with AVER_UB2_UB and written with ST8x4_UB at pdst
+ * using stride.
+ *
+ * pdst is evaluated once into the local pdst_m.  The expansion is a bare
+ * { } block, so arguments must not reuse the *_m local names.
+ */
+#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3,  \
+                           pdst, stride)                                \
+{                                                                       \
+    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
+    uint8_t *pdst_m = (uint8_t *) (pdst);                               \
+                                                                        \
+    PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m);                    \
+    PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                \
+    AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);        \
+    ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                           \
+}
+
+/*
+ * Horizontal bilinear filter, 4 pixels wide, two rows per iteration.
+ *
+ * Each row is loaded as a full vector, a copy slid by one byte is produced
+ * with SLDI_B2_0_UB, the two are averaged with AVER_UB2_UB, and word 0 of
+ * each result is stored to dst.  (The helper macros come from
+ * generic_macros_msa.h, not shown here; AVER is presumably the rounding
+ * average -- confirm.)
+ *
+ * height >> 1 iterations are run, so an odd trailing row is not processed.
+ */
+static void common_hz_bil_4w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint8_t height)
+{
+    v16u8 row0, row1, row0_x1, row1_x1, avg0, avg1;
+    uint32_t word0, word1;
+    uint8_t pairs_left = height >> 1;
+
+    while (pairs_left--) {
+        LD_UB2(src, src_stride, row0, row1);
+        src += 2 * src_stride;
+
+        /* rowN_x1: rowN slid by one byte */
+        SLDI_B2_0_UB(row0, row1, row0_x1, row1_x1, 1);
+        AVER_UB2_UB(row0_x1, row0, row1_x1, row1, avg0, avg1);
+
+        /* Only the first 4 bytes of each result are written. */
+        word0 = __msa_copy_u_w((v4i32) avg0, 0);
+        word1 = __msa_copy_u_w((v4i32) avg1, 0);
+        SW(word0, dst);
+        dst += dst_stride;
+        SW(word1, dst);
+        dst += dst_stride;
+    }
+}
+
+/*
+ * Horizontal bilinear filter, 8 pixels wide, four rows per iteration.
+ *
+ * Four rows are loaded, copies slid by one byte are produced with
+ * SLDI_B4_0_SB, and AVER_ST8x4_UB averages each row with its slid copy and
+ * stores the 8x4 result.  (Macros from generic_macros_msa.h, not shown
+ * here.)
+ *
+ * height >> 2 iterations are run, so up to three trailing rows are not
+ * processed.
+ */
+static void common_hz_bil_8w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint8_t height)
+{
+    v16i8 row0, row1, row2, row3;
+    v16i8 row0_x1, row1_x1, row2_x1, row3_x1;
+    uint8_t quads_left = height >> 2;
+
+    while (quads_left--) {
+        LD_SB4(src, src_stride, row0, row1, row2, row3);
+        src += 4 * src_stride;
+
+        SLDI_B4_0_SB(row0, row1, row2, row3,
+                     row0_x1, row1_x1, row2_x1, row3_x1, 1);
+        AVER_ST8x4_UB(row0, row0_x1, row1, row1_x1,
+                      row2, row2_x1, row3, row3_x1, dst, dst_stride);
+        dst += 4 * dst_stride;
+    }
+}
+
+/*
+ * Horizontal bilinear filter, 16 pixels wide, eight rows per iteration.
+ *
+ * Each row is loaded twice -- at src and at src + 1 -- and the two versions
+ * are handed pairwise to AVER_ST16x4_UB, which averages and stores four
+ * rows at a time.  (Macros from generic_macros_msa.h, not shown here.)
+ *
+ * height >> 3 iterations are run, so up to seven trailing rows are not
+ * processed.
+ */
+static void common_hz_bil_16w_msa(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint8_t height)
+{
+    v16u8 cur0, cur1, cur2, cur3, cur4, cur5, cur6, cur7;   /* from src     */
+    v16u8 nxt0, nxt1, nxt2, nxt3, nxt4, nxt5, nxt6, nxt7;   /* from src + 1 */
+    uint8_t octets_left = height >> 3;
+
+    while (octets_left--) {
+        LD_UB8(src, src_stride,
+               cur0, cur1, cur2, cur3, cur4, cur5, cur6, cur7);
+        LD_UB8((src + 1), src_stride,
+               nxt0, nxt1, nxt2, nxt3, nxt4, nxt5, nxt6, nxt7);
+        src += 8 * src_stride;
+
+        /* rows 0..3 */
+        AVER_ST16x4_UB(cur0, nxt0, cur1, nxt1, cur2, nxt2, cur3, nxt3,
+                       dst, dst_stride);
+        dst += 4 * dst_stride;
+
+        /* rows 4..7 */
+        AVER_ST16x4_UB(cur4, nxt4, cur5, nxt5, cur6, nxt6, cur7, nxt7,
+                       dst, dst_stride);
+        dst += 4 * dst_stride;
+    }
+}
+
+/*
+ * Horizontal bilinear filter without rounding for a fixed 8x8 block.
+ *
+ * Eight rows are loaded, copies slid by one byte are produced with
+ * SLDI_B4_0_SB, and AVE_ST8x4_UB combines each row with its slid copy and
+ * stores four rows per call.  (Macros from generic_macros_msa.h, not shown
+ * here; AVE is presumably the truncating average, as opposed to AVER --
+ * confirm.)
+ */
+static void common_hz_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
+                                         uint8_t *dst, int32_t dst_stride)
+{
+    v16i8 row0, row1, row2, row3, row4, row5, row6, row7;
+    v16i8 row0_x1, row1_x1, row2_x1, row3_x1;
+    v16i8 row4_x1, row5_x1, row6_x1, row7_x1;
+
+    LD_SB8(src, src_stride, row0, row1, row2, row3, row4, row5, row6, row7);
+    src += 8 * src_stride;
+
+    SLDI_B4_0_SB(row0, row1, row2, row3,
+                 row0_x1, row1_x1, row2_x1, row3_x1, 1);
+    SLDI_B4_0_SB(row4, row5, row6, row7,
+                 row4_x1, row5_x1, row6_x1, row7_x1, 1);
+
+    /* rows 0..3 */
+    AVE_ST8x4_UB(row0, row0_x1, row1, row1_x1,
+                 row2, row2_x1, row3, row3_x1, dst, dst_stride);
+    dst += 4 * dst_stride;
+    /* rows 4..7 */
+    AVE_ST8x4_UB(row4, row4_x1, row5, row5_x1,
+                 row6, row6_x1, row7, row7_x1, dst, dst_stride);
+}
+
+/*
+ * Horizontal bilinear filter without rounding for four rows of 8 pixels.
+ *
+ * Four rows are loaded, copies slid by one byte are produced with
+ * SLDI_B4_0_SB, and AVE_ST8x4_UB combines each row with its slid copy and
+ * stores the 8x4 result.  (Macros from generic_macros_msa.h, not shown
+ * here.)
+ */
+static void common_hz_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                         uint8_t *dst, int32_t dst_stride)
+{
+    v16i8 row0, row1, row2, row3;
+    v16i8 row0_x1, row1_x1, row2_x1, row3_x1;
+
+    LD_SB4(src, src_stride, row0, row1, row2, row3);
+    SLDI_B4_0_SB(row0, row1, row2, row3,
+                 row0_x1, row1_x1, row2_x1, row3_x1, 1);
+    AVE_ST8x4_UB(row0, row0_x1, row1, row1_x1,
+                 row2, row2_x1, row3, row3_x1, dst, dst_stride);
+}
+
+/*
+ * Horizontal bilinear filter without rounding for a fixed 16x16 block.
+ *
+ * Every row is loaded at src and at src + 1 and the two versions are handed
+ * pairwise to AVE_ST16x4_UB, four rows per call.  Loads for the following
+ * rows are issued between the stores, reusing the vector registers whose
+ * contents have already been written out.  (Macros from
+ * generic_macros_msa.h, not shown here.)
+ */
+static void common_hz_bil_no_rnd_16x16_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 cur0, cur1, cur2, cur3, cur4, cur5, cur6, cur7;   /* from src     */
+    v16u8 nxt0, nxt1, nxt2, nxt3, nxt4, nxt5, nxt6, nxt7;   /* from src + 1 */
+
+    /* Load rows 0..7. */
+    LD_UB8(src, src_stride, cur0, cur1, cur2, cur3, cur4, cur5, cur6, cur7);
+    LD_UB8((src + 1), src_stride,
+           nxt0, nxt1, nxt2, nxt3, nxt4, nxt5, nxt6, nxt7);
+    src += 8 * src_stride;
+
+    /* Store rows 0..3. */
+    AVE_ST16x4_UB(cur0, nxt0, cur1, nxt1, cur2, nxt2, cur3, nxt3,
+                  dst, dst_stride);
+    dst += 4 * dst_stride;
+
+    /* Load rows 8..11 into the registers just written out. */
+    LD_UB4(src, src_stride, cur0, cur1, cur2, cur3);
+    LD_UB4((src + 1), src_stride, nxt0, nxt1, nxt2, nxt3);
+    src += 4 * src_stride;
+
+    /* Store rows 4..7. */
+    AVE_ST16x4_UB(cur4, nxt4, cur5, nxt5, cur6, nxt6, cur7, nxt7,
+                  dst, dst_stride);
+    dst += 4 * dst_stride;
+
+    /* Load rows 12..15. */
+    LD_UB4(src, src_stride, cur4, cur5, cur6, cur7);
+    LD_UB4((src + 1), src_stride, nxt4, nxt5, nxt6, nxt7);
+    src += 4 * src_stride;
+
+    /* Store rows 8..11, then rows 12..15. */
+    AVE_ST16x4_UB(cur0, nxt0, cur1, nxt1, cur2, nxt2, cur3, nxt3,
+                  dst, dst_stride);
+    dst += 4 * dst_stride;
+    AVE_ST16x4_UB(cur4, nxt4, cur5, nxt5, cur6, nxt6, cur7, nxt7,
+                  dst, dst_stride);
+}
+
+/*
+ * Horizontal bilinear filter without rounding for eight rows of 16 pixels.
+ *
+ * Every row is loaded at src and at src + 1 and the two versions are handed
+ * pairwise to AVE_ST16x4_UB, four rows per call.  (Macros from
+ * generic_macros_msa.h, not shown here.)
+ */
+static void common_hz_bil_no_rnd_8x16_msa(const uint8_t *src,
+                                          int32_t src_stride,
+                                          uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 cur0, cur1, cur2, cur3, cur4, cur5, cur6, cur7;   /* from src     */
+    v16u8 nxt0, nxt1, nxt2, nxt3, nxt4, nxt5, nxt6, nxt7;   /* from src + 1 */
+
+    LD_UB8(src, src_stride, cur0, cur1, cur2, cur3, cur4, cur5, cur6, cur7);
+    LD_UB8((src + 1), src_stride,
+           nxt0, nxt1, nxt2, nxt3, nxt4, nxt5, nxt6, nxt7);
+
+    /* rows 0..3 */
+    AVE_ST16x4_UB(cur0, nxt0, cur1, nxt1, cur2, nxt2, cur3, nxt3,
+                  dst, dst_stride);
+    dst += 4 * dst_stride;
+    /* rows 4..7 */
+    AVE_ST16x4_UB(cur4, nxt4, cur5, nxt5, cur6, nxt6, cur7, nxt7,
+                  dst, dst_stride);
+}
+
+/*
+ * Horizontal bilinear filter, 4 pixels wide, averaged with the existing
+ * destination; two rows per iteration.
+ *
+ * Each row is averaged with a copy of itself slid by one byte; the current
+ * 4 destination bytes are read with LW, inserted into word 0 of a vector
+ * and averaged with the filter result; word 0 is then stored back with SW.
+ * (Macros from generic_macros_msa.h, not shown here.)
+ *
+ * height >> 1 iterations are run, so an odd trailing row is not processed.
+ */
+static void common_hz_bil_and_aver_dst_4w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint8_t height)
+{
+    v16u8 row0, row1, row0_x1, row1_x1, avg0, avg1;
+    /* Only word 0 of these is ever rewritten; the other lanes stay zero. */
+    v16u8 old0 = { 0 };
+    v16u8 old1 = { 0 };
+    uint32_t old_word0, old_word1, new_word0, new_word1;
+    uint8_t pairs_left = height >> 1;
+
+    while (pairs_left--) {
+        LD_UB2(src, src_stride, row0, row1);
+        src += 2 * src_stride;
+
+        SLDI_B2_0_UB(row0, row1, row0_x1, row1_x1, 1);
+
+        /* Fetch what is currently in the two destination rows. */
+        old_word0 = LW(dst);
+        old_word1 = LW(dst + dst_stride);
+        old0 = (v16u8) __msa_insert_w((v4i32) old0, 0, old_word0);
+        old1 = (v16u8) __msa_insert_w((v4i32) old1, 0, old_word1);
+
+        /* Filter, then blend with the old destination. */
+        AVER_UB2_UB(row0_x1, row0, row1_x1, row1, avg0, avg1);
+        AVER_UB2_UB(avg0, old0, avg1, old1, avg0, avg1);
+
+        new_word0 = __msa_copy_u_w((v4i32) avg0, 0);
+        new_word1 = __msa_copy_u_w((v4i32) avg1, 0);
+        SW(new_word0, dst);
+        dst += dst_stride;
+        SW(new_word1, dst);
+        dst += dst_stride;
+    }
+}
+
+/*
+ * Horizontal bilinear filter, 8 pixels wide, averaged with the existing
+ * destination; four rows per iteration.
+ *
+ * Four rows are loaded, copies slid by one byte are produced with
+ * SLDI_B4_0_SB, and AVER_DST_ST8x4_UB does the averaging against dst and
+ * the store.  (Macros from generic_macros_msa.h, not shown here.)
+ *
+ * height >> 2 iterations are run, so up to three trailing rows are not
+ * processed.
+ */
+static void common_hz_bil_and_aver_dst_8w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint8_t height)
+{
+    v16i8 row0, row1, row2, row3;
+    v16i8 row0_x1, row1_x1, row2_x1, row3_x1;
+    uint8_t quads_left = height >> 2;
+
+    while (quads_left--) {
+        LD_SB4(src, src_stride, row0, row1, row2, row3);
+        src += 4 * src_stride;
+
+        SLDI_B4_0_SB(row0, row1, row2, row3,
+                     row0_x1, row1_x1, row2_x1, row3_x1, 1);
+
+        AVER_DST_ST8x4_UB(row0, row0_x1, row1, row1_x1, row2, row2_x1,
+                          row3, row3_x1, dst, dst_stride);
+        dst += 4 * dst_stride;
+    }
+}
+
+/*
+ * Horizontal bilinear filter, 16 pixels wide, averaged with the existing
+ * destination; eight rows per iteration.
+ *
+ * Each row is loaded at src and at src + 1 and the two versions are handed
+ * pairwise to AVER_DST_ST16x4_UB, four rows per call.  (Macros from
+ * generic_macros_msa.h, not shown here.)
+ *
+ * height >> 3 iterations are run, so up to seven trailing rows are not
+ * processed.
+ */
+static void common_hz_bil_and_aver_dst_16w_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint8_t height)
+{
+    v16u8 cur0, cur1, cur2, cur3, cur4, cur5, cur6, cur7;   /* from src     */
+    v16u8 nxt0, nxt1, nxt2, nxt3, nxt4, nxt5, nxt6, nxt7;   /* from src + 1 */
+    uint8_t octets_left = height >> 3;
+
+    while (octets_left--) {
+        LD_UB8(src, src_stride,
+               cur0, cur1, cur2, cur3, cur4, cur5, cur6, cur7);
+        LD_UB8((src + 1), src_stride,
+               nxt0, nxt1, nxt2, nxt3, nxt4, nxt5, nxt6, nxt7);
+        src += 8 * src_stride;
+
+        /* rows 0..3 */
+        AVER_DST_ST16x4_UB(cur0, nxt0, cur1, nxt1, cur2, nxt2, cur3, nxt3,
+                           dst, dst_stride);
+        dst += 4 * dst_stride;
+        /* rows 4..7 */
+        AVER_DST_ST16x4_UB(cur4, nxt4, cur5, nxt5, cur6, nxt6, cur7, nxt7,
+                           dst, dst_stride);
+        dst += 4 * dst_stride;
+    }
+}
+
+static void common_vt_bil_4w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint8_t height)
+{
+    uint8_t pairs = height >> 1; /* two output rows per pass */
+    uint32_t word_a, word_b;
+    v16u8 above, mid, below, avg_a, avg_b;
+    /* Vertical blend, 4 bytes wide: each output row uses two adjacent rows. */
+    above = LD_UB(src);
+    src += src_stride;
+
+    while (pairs--) {
+        LD_UB2(src, src_stride, mid, below);
+        src += 2 * src_stride;
+        /* avg_a <- (above, mid), avg_b <- (mid, below) */
+        AVER_UB2_UB(above, mid, mid, below, avg_a, avg_b);
+        /* only word lane 0 (the first 4 bytes) of each result is stored */
+        word_a = __msa_copy_u_w((v4i32) avg_a, 0);
+        word_b = __msa_copy_u_w((v4i32) avg_b, 0);
+        SW(word_a, dst);
+        dst += dst_stride;
+        SW(word_b, dst);
+        dst += dst_stride;
+        /* the bottom row of this pair is the top row of the next one */
+        above = below;
+    }
+}
+
+static void common_vt_bil_8w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint8_t height)
+{
+    uint8_t quads = height >> 2; /* four output rows per pass */
+    v16u8 top, r1, r2, r3, r4;
+    /* Vertical blend of 8-byte rows: rows (n, n + 1) are paired for row n. */
+    top = LD_UB(src);
+    src += src_stride;
+
+    while (quads--) {
+        LD_UB4(src, src_stride, r1, r2, r3, r4);
+        src += 4 * src_stride;
+        /* AVER_ST8x4_UB is defined elsewhere; presumably average + store */
+        AVER_ST8x4_UB(top, r1, r1, r2, r2, r3, r3, r4,
+                      dst, dst_stride);
+        dst += 4 * dst_stride;
+        /* the last row loaded is the first row of the next pass */
+        top = r4;
+    }
+}
+
+static void common_vt_bil_16w_msa(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint8_t height)
+{
+    uint8_t octets = height >> 3; /* eight output rows per pass */
+    v16u8 top, r1, r2, r3, r4, r5, r6, r7, r8;
+    /* Vertical blend, 16 bytes wide: rows (n, n + 1) are paired for row n. */
+    top = LD_UB(src);
+    src += src_stride;
+
+    while (octets--) {
+        LD_UB8(src, src_stride, r1, r2, r3, r4, r5, r6, r7, r8);
+        src += 8 * src_stride;
+        /* AVER_ST16x4_UB is defined elsewhere; presumably average + store */
+        AVER_ST16x4_UB(top, r1, r1, r2, r2, r3, r3, r4,
+                       dst, dst_stride);
+        dst += 4 * dst_stride;
+        AVER_ST16x4_UB(r4, r5, r5, r6, r6, r7, r7, r8,
+                       dst, dst_stride);
+        dst += 4 * dst_stride;
+        /* the last row loaded is the first row of the next pass */
+        top = r8;
+    }
+}
+
+static void common_vt_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
+                                         uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 r0, r1, r2, r3, r4, r5, r6, r7, r8;
+    /* nine consecutive rows are read to form eight (n, n + 1) row pairs */
+    LD_UB8(src, src_stride, r0, r1, r2, r3, r4, r5, r6, r7);
+    src += 8 * src_stride;
+    r8 = LD_UB(src);
+    /* AVE_ST8x4_UB is defined elsewhere; presumably a non-rounding average */
+    AVE_ST8x4_UB(r0, r1, r1, r2, r2, r3, r3, r4,
+                 dst, dst_stride);
+    dst += 4 * dst_stride;
+    /* second batch of four rows */
+    AVE_ST8x4_UB(r4, r5, r5, r6, r6, r7, r7, r8,
+                 dst, dst_stride);
+}
+
+static void common_vt_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                         uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 r0, r1, r2, r3, r4;
+    /* five rows in, one 4-row batch out (the store macro is named 8x4) */
+    LD_UB5(src, src_stride, r0, r1, r2, r3, r4);
+    AVE_ST8x4_UB(r0, r1, r1, r2, r2, r3, r3, r4,
+                 dst, dst_stride);
+}
+
+static void common_vt_bil_no_rnd_16x16_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 r0, r1, r2, r3, r4, r5, r6, r7, r8;
+    v16u8 r9, r10, r11, r12, r13, r14, r15, r16;
+    /* seventeen consecutive rows give sixteen vertical (n, n + 1) pairs */
+    LD_UB8(src, src_stride, r0, r1, r2, r3, r4, r5, r6, r7);
+    src += 8 * src_stride;
+    LD_UB8(src, src_stride,
+           r8, r9, r10, r11, r12, r13, r14, r15);
+    src += 8 * src_stride;
+    r16 = LD_UB(src);
+    /* four batches of four rows; AVE_ST16x4_UB is defined outside this chunk */
+    AVE_ST16x4_UB(r0, r1, r1, r2, r2, r3, r3, r4,
+                  dst, dst_stride);
+    dst += 4 * dst_stride;
+    AVE_ST16x4_UB(r4, r5, r5, r6, r6, r7, r7, r8,
+                  dst, dst_stride);
+    dst += 4 * dst_stride;
+    AVE_ST16x4_UB(r8, r9, r9, r10, r10, r11, r11, r12,
+                  dst, dst_stride);
+    dst += 4 * dst_stride;
+    AVE_ST16x4_UB(r12, r13, r13, r14,
+                  r14, r15, r15, r16, dst, dst_stride);
+}
+
+static void common_vt_bil_no_rnd_8x16_msa(const uint8_t *src,
+                                          int32_t src_stride,
+                                          uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 r0, r1, r2, r3, r4, r5, r6, r7, r8;
+    /* nine rows in, eight vertical (n, n + 1) pairs out */
+    LD_UB8(src, src_stride, r0, r1, r2, r3, r4, r5, r6, r7);
+    src += 8 * src_stride;
+    r8 = LD_UB(src);
+    /* AVE_ST16x4_UB is defined elsewhere; presumably a 16-byte x 4-row store */
+    AVE_ST16x4_UB(r0, r1, r1, r2, r2, r3, r3, r4,
+                  dst, dst_stride);
+    dst += 4 * dst_stride;
+    AVE_ST16x4_UB(r4, r5, r5, r6, r6, r7, r7, r8,
+                  dst, dst_stride);
+}
+
+static void common_vt_bil_and_aver_dst_4w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint8_t height)
+{
+    uint8_t pairs = height >> 1; /* two rows per pass */
+    uint32_t new_a, new_b, old_a, old_b;
+    v16u8 above, mid, below;
+    v16u8 cur_a = { 0 };
+    v16u8 cur_b = { 0 };
+    v16u8 avg_a, avg_b;
+    /* Vertical blend, 4 bytes wide, then blended with what dst already holds. */
+    above = LD_UB(src);
+    src += src_stride;
+
+    while (pairs--) {
+        LD_UB2(src, src_stride, mid, below);
+        src += 2 * src_stride;
+        old_a = LW(dst); /* 4 existing dst bytes of each of the two rows */
+        old_b = LW(dst + dst_stride);
+        cur_a = (v16u8) __msa_insert_w((v4i32) cur_a, 0, old_a);
+        cur_b = (v16u8) __msa_insert_w((v4i32) cur_b, 0, old_b);
+        AVER_UB2_UB(above, mid, mid, below, avg_a, avg_b);
+        AVER_UB2_UB(avg_a, cur_a, avg_b, cur_b, avg_a, avg_b);
+        new_a = __msa_copy_u_w((v4i32) avg_a, 0); /* only lane 0 is stored */
+        new_b = __msa_copy_u_w((v4i32) avg_b, 0);
+        SW(new_a, dst);
+        dst += dst_stride;
+        SW(new_b, dst);
+        dst += dst_stride;
+        above = below; /* bottom row becomes the next pair's top row */
+    }
+}
+
+static void common_vt_bil_and_aver_dst_8w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint8_t height)
+{
+    uint8_t quads = height >> 2; /* four rows per pass */
+    v16u8 top, r1, r2, r3, r4;
+    /* Vertical pairing of 8-byte rows; dst is an input as well as the output. */
+    top = LD_UB(src);
+    src += src_stride;
+
+    while (quads--) {
+        LD_UB4(src, src_stride, r1, r2, r3, r4);
+        src += 4 * src_stride;
+        /* macro defined elsewhere; by name: average pairs, average with dst */
+        AVER_DST_ST8x4_UB(top, r1, r1, r2, r2, r3, r3, r4,
+                          dst, dst_stride);
+        dst += 4 * dst_stride;
+        top = r4; /* reuse the last row as the next pass's first */
+    }
+}
+
+static void common_vt_bil_and_aver_dst_16w_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint8_t height)
+{
+    uint8_t octets = height >> 3; /* eight rows per pass */
+    v16u8 top, r1, r2, r3, r4, r5, r6, r7, r8;
+    v16u8 avg0, avg1, avg2, avg3, avg4, avg5, avg6, avg7;
+    v16u8 old0, old1, old2, old3, old4, old5, old6, old7;
+    /* Vertical blend, 16 bytes wide, then blended with the current dst rows. */
+    top = LD_UB(src);
+    src += src_stride;
+
+    while (octets--) {
+        LD_UB8(src, src_stride, r1, r2, r3, r4, r5, r6, r7, r8);
+        src += 8 * src_stride;
+        AVER_UB4_UB(top, r1, r1, r2, r2, r3, r3, r4,
+                    avg0, avg1, avg2, avg3);
+        AVER_UB4_UB(r4, r5, r5, r6, r6, r7, r7, r8,
+                    avg4, avg5, avg6, avg7);
+        /* second stage: fold in what dst holds, then overwrite those rows */
+        LD_UB8(dst, dst_stride, old0, old1, old2, old3, old4, old5, old6, old7);
+        AVER_UB4_UB(old0, avg0, old1, avg1, old2, avg2, old3, avg3,
+                    avg0, avg1, avg2, avg3);
+        AVER_UB4_UB(old4, avg4, old5, avg5, old6, avg6, old7, avg7,
+                    avg4, avg5, avg6, avg7);
+        ST_UB8(avg0, avg1, avg2, avg3, avg4, avg5, avg6, avg7, dst, dst_stride);
+        dst += 8 * dst_stride;
+        /* the last row loaded is the first row of the next pass */
+        top = r8;
+    }
+}
+
+static void common_hv_bil_4w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint8_t height)
+{
+    uint8_t pairs = height >> 1; /* two output rows per pass */
+    uint32_t word_a, word_b;
+    v16i8 row0, row1, row2, nxt0, nxt1, nxt2;
+    v16u8 mix0, mix1, mix2, packed;
+    v8u16 hsum0, hsum1, hsum2, tot0, tot1;
+    /* Horizontal + vertical blend, 4 bytes wide (semantics rest on macros). */
+    row0 = LD_SB(src);
+    src += src_stride;
+
+    while (pairs--) {
+        LD_SB2(src, src_stride, row1, row2);
+        src += 2 * src_stride;
+        /* horizontal step: pair every row with its one-byte-slid copy */
+        SLDI_B3_0_SB(row0, row1, row2, nxt0, nxt1, nxt2, 1);
+        ILVR_B3_UB(nxt0, row0, nxt1, row1, nxt2, row2,
+                   mix0, mix1, mix2);
+        HADD_UB3_UH(mix0, mix1, mix2, hsum0, hsum1, hsum2);
+        ADD2(hsum0, hsum1, hsum1, hsum2, tot0, tot1); /* vertical step */
+        SRARI_H2_UH(tot0, tot1, 2); /* shift by 2, presumably rounding */
+        packed = (v16u8) __msa_pckev_b((v16i8) tot1, (v16i8) tot0);
+        word_a = __msa_copy_u_w((v4i32) packed, 0); /* first output row */
+        word_b = __msa_copy_u_w((v4i32) packed, 2); /* second output row */
+        SW(word_a, dst);
+        dst += dst_stride;
+        SW(word_b, dst);
+        dst += dst_stride;
+        /* the bottom row of this pair is the top row of the next one */
+        row0 = row2;
+    }
+}
+
+static void common_hv_bil_8w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 uint8_t height)
+{
+    uint8_t quads = height >> 2; /* four output rows per pass */
+    v16i8 row0, row1, row2, row3, row4;
+    v16i8 nxt0, nxt1, nxt2, nxt3, nxt4;
+    v16u8 mix0, mix1, mix2, mix3, mix4;
+    v8u16 hsum0, hsum1, hsum2, hsum3, hsum4;
+    v8u16 tot0, tot1, tot2, tot3;
+    /* Horizontal + vertical blend, 8 bytes wide (semantics rest on macros). */
+    row0 = LD_SB(src);
+    src += src_stride;
+
+    while (quads--) {
+        LD_SB4(src, src_stride, row1, row2, row3, row4);
+        src += 4 * src_stride;
+        /* horizontal step: pair every row with its one-byte-slid copy */
+        SLDI_B3_0_SB(row0, row1, row2, nxt0, nxt1, nxt2, 1);
+        SLDI_B2_0_SB(row3, row4, nxt3, nxt4, 1);
+        ILVR_B3_UB(nxt0, row0, nxt1, row1, nxt2, row2,
+                   mix0, mix1, mix2);
+        ILVR_B2_UB(nxt3, row3, nxt4, row4, mix3, mix4);
+        HADD_UB3_UH(mix0, mix1, mix2, hsum0, hsum1, hsum2);
+        HADD_UB2_UH(mix3, mix4, hsum3, hsum4);
+        ADD4(hsum0, hsum1, hsum1, hsum2, hsum2, hsum3, hsum3, hsum4,
+             tot0, tot1, tot2, tot3);
+        SRARI_H4_UH(tot0, tot1, tot2, tot3, 2);
+        PCKEV_B2_SB(tot1, tot0, tot3, tot2, row0, row1); /* rows as scratch */
+        ST8x4_UB(row0, row1, dst, dst_stride);
+        dst += 4 * dst_stride;
+        row0 = row4; /* last source row seeds the next pass */
+    }
+}
+
+static void common_hv_bil_16w_msa(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  uint8_t height)
+{
+    uint8_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
+    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
+    v8u16 src7_l, src8_l;
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
+    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
+    /* Blends src with src + 1 and row n with row n + 1; 8 rows per pass. */
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        LD_UB8((src + 1), src_stride,
+               src9, src10, src11, src12, src13, src14, src15, src16);
+        src += (8 * src_stride);
+        /* row 8 closes the last pair; it is read again as row 0 next pass */
+        src8 = LD_UB(src);
+        src17 = LD_UB(src + 1);
+        /* _r / _l: presumably the low / high interleaved halves of each row */
+        ILVRL_B2_UH(src9, src0, src0_r, src0_l);
+        ILVRL_B2_UH(src10, src1, src1_r, src1_l);
+        ILVRL_B2_UH(src11, src2, src2_r, src2_l);
+        ILVRL_B2_UH(src12, src3, src3_r, src3_l);
+        ILVRL_B2_UH(src13, src4, src4_r, src4_l);
+        ILVRL_B2_UH(src14, src5, src5_r, src5_l);
+        ILVRL_B2_UH(src15, src6, src6_r, src6_l);
+        ILVRL_B2_UH(src16, src7, src7_r, src7_l);
+        ILVRL_B2_UH(src17, src8, src8_r, src8_l);
+        HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r); /* in place */
+        HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
+        HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
+        HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
+        HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
+        HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
+        ADD4(src0_r, src1_r, src1_r, src2_r, src2_r, src3_r, src3_r, src4_r,
+             sum0_r, sum1_r, sum2_r, sum3_r); /* row n + row n + 1 */
+        ADD4(src4_r, src5_r, src5_r, src6_r, src6_r, src7_r, src7_r, src8_r,
+             sum4_r, sum5_r, sum6_r, sum7_r);
+        ADD4(src0_l, src1_l, src1_l, src2_l, src2_l, src3_l, src3_l, src4_l,
+             sum0_l, sum1_l, sum2_l, sum3_l);
+        ADD4(src4_l, src5_l, src5_l, src6_l, src6_l, src7_l, src7_l, src8_l,
+             sum4_l, sum5_l, sum6_l, sum7_l);
+        SRARI_H4_UH(sum0_r, sum1_r, sum2_r, sum3_r, 2); /* presumably rounding */
+        SRARI_H4_UH(sum4_r, sum5_r, sum6_r, sum7_r, 2);
+        SRARI_H4_UH(sum0_l, sum1_l, sum2_l, sum3_l, 2);
+        SRARI_H4_UH(sum4_l, sum5_l, sum6_l, sum7_l, 2);
+        PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r, sum2_l, sum2_r,
+                     sum3_l, sum3_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+        PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r, sum6_l, sum6_r,
+                     sum7_l, sum7_r, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hv_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
+                                         uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 src0_sld1, src1_sld1, src2_sld1, src3_sld1;
+    v16u8 src4_sld1, src5_sld1, src6_sld1, src7_sld1, src8_sld1;
+    v8u16 src0_r, src1_r, src2_r, src3_r;
+    v8u16 src4_r, src5_r, src6_r, src7_r, src8_r;
+    v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8;
+    v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
+    v16i8 out0, out1;
+    /* nine rows are loaded: each of the eight outputs uses rows n and n + 1 */
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    src8 = LD_UB(src);
+    /* *_sld1: rows slid by one byte; *_r: presumably row/slid-row interleave */
+    SLDI_B4_0_UB(src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1,
+                 src3_sld1, 1);
+    SLDI_B3_0_UB(src4, src5, src6, src4_sld1, src5_sld1, src6_sld1, 1);
+    SLDI_B2_0_UB(src7, src8, src7_sld1, src8_sld1, 1);
+    ILVR_B4_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src3_sld1,
+               src3, src0_r, src1_r, src2_r, src3_r);
+    ILVR_B3_UH(src4_sld1, src4, src5_sld1, src5, src6_sld1, src6, src4_r,
+               src5_r, src6_r);
+    ILVR_B2_UH(src7_sld1, src7, src8_sld1, src8, src7_r, src8_r);
+    HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
+    HADD_UB3_UH(src3_r, src4_r, src5_r, add3, add4, add5);
+    HADD_UB3_UH(src6_r, src7_r, src8_r, add6, add7, add8);
+    /* +1 bias ahead of the shift by 2; rounded variants use SRARI_H*_UH */
+    sum0 = add0 + add1 + 1;
+    sum1 = add1 + add2 + 1;
+    sum2 = add2 + add3 + 1;
+    sum3 = add3 + add4 + 1;
+    sum4 = add4 + add5 + 1;
+    sum5 = add5 + add6 + 1;
+    sum6 = add6 + add7 + 1;
+    sum7 = add7 + add8 + 1;
+    /* shift, pack to bytes and store as two batches of four rows */
+    SRA_4V(sum0, sum1, sum2, sum3, 2);
+    SRA_4V(sum4, sum5, sum6, sum7, 2);
+    PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    PCKEV_B2_SB(sum5, sum4, sum7, sum6, out0, out1);
+    ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
+}
+
+static void common_hv_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                         uint8_t *dst, int32_t dst_stride)
+{
+    v16i8 row0, row1, row2, row3, row4;
+    v16i8 nxt0, nxt1, nxt2, nxt3, nxt4;
+    v8u16 mix0, mix1, mix2, mix3, mix4;
+    v8u16 hsum0, hsum1, hsum2, hsum3, hsum4;
+    v8u16 tot0, tot1, tot2, tot3;
+    v16i8 pack_lo, pack_hi;
+    /* five source rows are read for the four (n, n + 1) row pairs */
+    LD_SB4(src, src_stride, row0, row1, row2, row3);
+    src += 4 * src_stride;
+    row4 = LD_SB(src);
+    /* horizontal step: pair every row with its one-byte-slid copy */
+    SLDI_B3_0_SB(row0, row1, row2, nxt0, nxt1, nxt2, 1);
+    SLDI_B2_0_SB(row3, row4, nxt3, nxt4, 1);
+    ILVR_B3_UH(nxt0, row0, nxt1, row1, nxt2, row2,
+               mix0, mix1, mix2);
+    ILVR_B2_UH(nxt3, row3, nxt4, row4, mix3, mix4);
+    HADD_UB3_UH(mix0, mix1, mix2, hsum0, hsum1, hsum2);
+    HADD_UB2_UH(mix3, mix4, hsum3, hsum4);
+    /* vertical step with a bias of 1 ahead of the shift by 2 */
+    tot0 = hsum0 + hsum1 + 1;
+    tot1 = hsum1 + hsum2 + 1;
+    tot2 = hsum2 + hsum3 + 1;
+    tot3 = hsum3 + hsum4 + 1;
+
+    SRA_4V(tot0, tot1, tot2, tot3, 2);
+    PCKEV_B2_SB(tot1, tot0, tot3, tot2, pack_lo, pack_hi);
+    ST8x4_UB(pack_lo, pack_hi, dst, dst_stride);
+}
+
+static void common_hv_bil_no_rnd_16x16_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
+    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
+    v8u16 src7_l, src8_l;
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
+    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
+    /* Upper half: rows 0-8 read from src and from src + 1. */
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    LD_UB8((src + 1), src_stride,
+           src9, src10, src11, src12, src13, src14, src15, src16);
+    src += (8 * src_stride);
+    src8 = LD_UB(src);
+    src17 = LD_UB(src + 1);
+    /* _r / _l: presumably the low / high interleaved halves of each row */
+    ILVRL_B2_UH(src9, src0, src0_r, src0_l);
+    ILVRL_B2_UH(src10, src1, src1_r, src1_l);
+    ILVRL_B2_UH(src11, src2, src2_r, src2_l);
+    ILVRL_B2_UH(src12, src3, src3_r, src3_l);
+    ILVRL_B2_UH(src13, src4, src4_r, src4_l);
+    ILVRL_B2_UH(src14, src5, src5_r, src5_l);
+    ILVRL_B2_UH(src15, src6, src6_r, src6_l);
+    ILVRL_B2_UH(src16, src7, src7_r, src7_l);
+    ILVRL_B2_UH(src17, src8, src8_r, src8_l);
+
+    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r); /* in place */
+    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
+    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
+    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
+    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
+    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
+    /* +1 bias ahead of the shift by 2; rounded variants use SRARI_H*_UH */
+    sum0_r = src0_r + src1_r + 1;
+    sum1_r = src1_r + src2_r + 1;
+    sum2_r = src2_r + src3_r + 1;
+    sum3_r = src3_r + src4_r + 1;
+    sum4_r = src4_r + src5_r + 1;
+    sum5_r = src5_r + src6_r + 1;
+    sum6_r = src6_r + src7_r + 1;
+    sum7_r = src7_r + src8_r + 1;
+    sum0_l = src0_l + src1_l + 1;
+    sum1_l = src1_l + src2_l + 1;
+    sum2_l = src2_l + src3_l + 1;
+    sum3_l = src3_l + src4_l + 1;
+    sum4_l = src4_l + src5_l + 1;
+    sum5_l = src5_l + src6_l + 1;
+    sum6_l = src6_l + src7_l + 1;
+    sum7_l = src7_l + src8_l + 1;
+
+    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
+    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
+    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
+    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
+    PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
+                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* Lower-half loads start at row 8 again; sum4..7 are stored just after. */
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    LD_UB8((src + 1), src_stride,
+           src9, src10, src11, src12, src13, src14, src15, src16);
+    src += (8 * src_stride);
+    src8 = LD_UB(src);
+    src17 = LD_UB(src + 1);
+
+    PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
+                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* Lower half: same sequence as above on the freshly loaded rows. */
+    ILVRL_B2_UH(src9, src0, src0_r, src0_l);
+    ILVRL_B2_UH(src10, src1, src1_r, src1_l);
+    ILVRL_B2_UH(src11, src2, src2_r, src2_l);
+    ILVRL_B2_UH(src12, src3, src3_r, src3_l);
+    ILVRL_B2_UH(src13, src4, src4_r, src4_l);
+    ILVRL_B2_UH(src14, src5, src5_r, src5_l);
+    ILVRL_B2_UH(src15, src6, src6_r, src6_l);
+    ILVRL_B2_UH(src16, src7, src7_r, src7_l);
+    ILVRL_B2_UH(src17, src8, src8_r, src8_l);
+
+    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
+    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
+    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
+    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
+    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
+    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
+
+    sum0_r = src0_r + src1_r + 1;
+    sum1_r = src1_r + src2_r + 1;
+    sum2_r = src2_r + src3_r + 1;
+    sum3_r = src3_r + src4_r + 1;
+    sum4_r = src4_r + src5_r + 1;
+    sum5_r = src5_r + src6_r + 1;
+    sum6_r = src6_r + src7_r + 1;
+    sum7_r = src7_r + src8_r + 1;
+    sum0_l = src0_l + src1_l + 1;
+    sum1_l = src1_l + src2_l + 1;
+    sum2_l = src2_l + src3_l + 1;
+    sum3_l = src3_l + src4_l + 1;
+    sum4_l = src4_l + src5_l + 1;
+    sum5_l = src5_l + src6_l + 1;
+    sum6_l = src6_l + src7_l + 1;
+    sum7_l = src7_l + src8_l + 1;
+
+    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
+    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
+    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
+    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
+    PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
+                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+    PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
+                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
+}
+
+static void common_hv_bil_no_rnd_8x16_msa(const uint8_t *src,
+                                          int32_t src_stride,
+                                          uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
+    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
+    v8u16 src7_l, src8_l;
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
+    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
+    /* nine rows from src and from src + 1 feed the two 4-row stores below */
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    LD_UB8((src + 1), src_stride,
+           src9, src10, src11, src12, src13, src14, src15, src16);
+    src += (8 * src_stride);
+    src8 = LD_UB(src);
+    src17 = LD_UB(src + 1);
+    /* _r / _l: presumably the low / high interleaved halves of each row */
+    ILVRL_B2_UH(src9, src0, src0_r, src0_l);
+    ILVRL_B2_UH(src10, src1, src1_r, src1_l);
+    ILVRL_B2_UH(src11, src2, src2_r, src2_l);
+    ILVRL_B2_UH(src12, src3, src3_r, src3_l);
+    ILVRL_B2_UH(src13, src4, src4_r, src4_l);
+    ILVRL_B2_UH(src14, src5, src5_r, src5_l);
+    ILVRL_B2_UH(src15, src6, src6_r, src6_l);
+    ILVRL_B2_UH(src16, src7, src7_r, src7_l);
+    ILVRL_B2_UH(src17, src8, src8_r, src8_l);
+
+    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r); /* in place */
+    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
+    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
+    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
+    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
+    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
+    /* +1 bias ahead of the shift by 2; rounded variants use SRARI_H*_UH */
+    sum0_r = src0_r + src1_r + 1;
+    sum1_r = src1_r + src2_r + 1;
+    sum2_r = src2_r + src3_r + 1;
+    sum3_r = src3_r + src4_r + 1;
+    sum4_r = src4_r + src5_r + 1;
+    sum5_r = src5_r + src6_r + 1;
+    sum6_r = src6_r + src7_r + 1;
+    sum7_r = src7_r + src8_r + 1;
+    sum0_l = src0_l + src1_l + 1;
+    sum1_l = src1_l + src2_l + 1;
+    sum2_l = src2_l + src3_l + 1;
+    sum3_l = src3_l + src4_l + 1;
+    sum4_l = src4_l + src5_l + 1;
+    sum5_l = src5_l + src6_l + 1;
+    sum6_l = src6_l + src7_l + 1;
+    sum7_l = src7_l + src8_l + 1;
+
+    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
+    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
+    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
+    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
+    PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
+                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
+    dst += (4 * dst_stride);
+    PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
+                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
+}
+
+static void common_hv_bil_and_aver_dst_4w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint8_t height)
+{
+    uint8_t loop_cnt;
+    uint32_t out0, out1, dst0, dst1;
+    v16i8 src0, src1, src2, src0_sld1, src1_sld1, src2_sld1;
+    v16u8 src0_r, src1_r, src2_r;
+    v8u16 add0, add1, add2, sum0, sum1;
+    v16u8 tmp0 = { 0 }, tmp1 = { 0 }, res0, res1;
+    /* Horizontal + vertical blend, 4 bytes wide, then averaged with dst. */
+    src0 = LD_SB(src);
+    src += src_stride;
+    for (loop_cnt = (height >> 1); loop_cnt--;) { /* two rows per pass */
+        LD_SB2(src, src_stride, src1, src2);
+        src += (2 * src_stride);
+        /* fetch just the 4 dst bytes per row that get blended and rewritten */
+        dst0 = LW(dst);
+        dst1 = LW(dst + dst_stride);
+        tmp0 = (v16u8) __msa_insert_w((v4i32) tmp0, 0, dst0);
+        tmp1 = (v16u8) __msa_insert_w((v4i32) tmp1, 0, dst1);
+        SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1);
+        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
+                   src1_r, src2_r);
+        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
+        ADD2(add0, add1, add1, add2, sum0, sum1);
+        SRARI_H2_UH(sum0, sum1, 2);
+        PCKEV_B2_UB(sum0, sum0, sum1, sum1, res0, res1);
+        AVER_UB2_UB(tmp0, res0, tmp1, res1, res0, res1);
+        out0 = __msa_copy_u_w((v4i32) res0, 0); /* only lane 0 is stored */
+        out1 = __msa_copy_u_w((v4i32) res1, 0);
+        SW(out0, dst);
+        dst += dst_stride;
+        SW(out1, dst);
+        dst += dst_stride;
+        src0 = src2; /* bottom row becomes the next pair's top row */
+    }
+}
+
+static void common_hv_bil_and_aver_dst_8w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              uint8_t height)
+{
+    uint8_t quads = height >> 2; /* four rows per pass */
+    v16i8 row0, row1, row2, row3, row4;
+    v16i8 nxt0, nxt1, nxt2, nxt3, nxt4;
+    v16u8 old0, old1, old2, old3;
+    v16u8 mix0, mix1, mix2, mix3, mix4;
+    v8u16 hsum0, hsum1, hsum2, hsum3, hsum4;
+    v8u16 tot0, tot1, tot2, tot3;
+    /* Horizontal + vertical blend of 8-byte rows; dst is an input as well. */
+    row0 = LD_SB(src);
+    src += src_stride;
+
+    while (quads--) {
+        LD_SB4(src, src_stride, row1, row2, row3, row4);
+        src += 4 * src_stride;
+        /* current dst rows, handed to the final pack/average/store macro */
+        LD_UB4(dst, dst_stride, old0, old1, old2, old3);
+        SLDI_B3_0_SB(row0, row1, row2, nxt0, nxt1, nxt2, 1);
+        SLDI_B2_0_SB(row3, row4, nxt3, nxt4, 1);
+        ILVR_B3_UB(nxt0, row0, nxt1, row1, nxt2, row2,
+                   mix0, mix1, mix2);
+        ILVR_B2_UB(nxt3, row3, nxt4, row4, mix3, mix4);
+        HADD_UB3_UH(mix0, mix1, mix2, hsum0, hsum1, hsum2);
+        HADD_UB2_UH(mix3, mix4, hsum3, hsum4);
+        ADD4(hsum0, hsum1, hsum1, hsum2, hsum2, hsum3, hsum3, hsum4,
+             tot0, tot1, tot2, tot3);
+        SRARI_H4_UH(tot0, tot1, tot2, tot3, 2);
+        PCKEV_AVG_ST8x4_UB(tot0, old0, tot1, old1,
+                           tot2, old2, tot3, old3, dst, dst_stride);
+        dst += 4 * dst_stride;
+        row0 = row4; /* last source row seeds the next pass */
+    }
+}
+
+static void common_hv_bil_and_aver_dst_16w_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride,
+                                               uint8_t height)
+{
+    uint8_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16u8 src11, src12, src13, src14, src15, src16, src17;
+    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+    v16u8 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
+    v16u8 src7_l, src8_l;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
+    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
+    v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8;
+    /* Blends src with src + 1 and row n with row n + 1, then with dst. */
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        LD_UB8((src + 1), src_stride,
+               src9, src10, src11, src12, src13, src14, src15, src16);
+        src += (8 * src_stride);
+        /* row 8 closes the last pair; it is read again as row 0 next pass */
+        src8 = LD_UB(src);
+        src17 = LD_UB(src + 1);
+        /* _r / _l: presumably the low / high interleaved halves of each row */
+        ILVRL_B2_UB(src9, src0, src0_r, src0_l);
+        ILVRL_B2_UB(src10, src1, src1_r, src1_l);
+        ILVRL_B2_UB(src11, src2, src2_r, src2_l);
+        ILVRL_B2_UB(src12, src3, src3_r, src3_l);
+        ILVRL_B2_UB(src13, src4, src4_r, src4_l);
+        ILVRL_B2_UB(src14, src5, src5_r, src5_l);
+        ILVRL_B2_UB(src15, src6, src6_r, src6_l);
+        ILVRL_B2_UB(src16, src7, src7_r, src7_l);
+        ILVRL_B2_UB(src17, src8, src8_r, src8_l);
+        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
+        HADD_UB3_UH(src3_r, src4_r, src5_r, add3, add4, add5);
+        HADD_UB3_UH(src6_r, src7_r, src8_r, add6, add7, add8);
+        ADD4(add0, add1, add1, add2, add2, add3, add3, add4, sum0_r, sum1_r,
+             sum2_r, sum3_r);
+        ADD4(add4, add5, add5, add6, add6, add7, add7, add8, sum4_r, sum5_r,
+             sum6_r, sum7_r);
+        HADD_UB3_UH(src0_l, src1_l, src2_l, add0, add1, add2); /* add* reused */
+        HADD_UB3_UH(src3_l, src4_l, src5_l, add3, add4, add5);
+        HADD_UB3_UH(src6_l, src7_l, src8_l, add6, add7, add8);
+        ADD4(add0, add1, add1, add2, add2, add3, add3, add4, sum0_l, sum1_l,
+             sum2_l, sum3_l);
+        ADD4(add4, add5, add5, add6, add6, add7, add7, add8, sum4_l, sum5_l,
+             sum6_l, sum7_l);
+        SRARI_H4_UH(sum0_r, sum1_r, sum2_r, sum3_r, 2); /* presumably rounding */
+        SRARI_H4_UH(sum4_r, sum5_r, sum6_r, sum7_r, 2);
+        SRARI_H4_UH(sum0_l, sum1_l, sum2_l, sum3_l, 2);
+        SRARI_H4_UH(sum4_l, sum5_l, sum6_l, sum7_l, 2);
+        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+        PCKEV_AVG_ST_UB(sum0_l, sum0_r, dst0, dst); /* one dst row per call */
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(sum1_l, sum1_r, dst1, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(sum2_l, sum2_r, dst2, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(sum3_l, sum3_r, dst3, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(sum4_l, sum4_r, dst4, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(sum5_l, sum5_r, dst5, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(sum6_l, sum6_r, dst6, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(sum7_l, sum7_r, dst7, dst);
+        dst += dst_stride;
+    }
+}
+
+/* Copy an 8-byte-wide block; rows move in batches of 12, 8, 4 or 2. */
+static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+    int32_t iter;
+    uint64_t d0, d1, d2, d3, d4, d5, d6, d7;
+    v16u8 r0, r1, r2, r3, r4, r5, r6, r7;
+
+    if (height % 12 == 0) {
+        for (iter = height / 12; iter--;) {
+            /* rows 0-7 of the batch: doubleword 0 of each loaded vector */
+            LD_UB8(src, src_stride, r0, r1, r2, r3, r4, r5, r6, r7);
+            src += 8 * src_stride;
+
+            d0 = __msa_copy_u_d((v2i64) r0, 0);
+            d1 = __msa_copy_u_d((v2i64) r1, 0);
+            d2 = __msa_copy_u_d((v2i64) r2, 0);
+            d3 = __msa_copy_u_d((v2i64) r3, 0);
+            d4 = __msa_copy_u_d((v2i64) r4, 0);
+            d5 = __msa_copy_u_d((v2i64) r5, 0);
+            d6 = __msa_copy_u_d((v2i64) r6, 0);
+            d7 = __msa_copy_u_d((v2i64) r7, 0);
+
+            SD4(d0, d1, d2, d3, dst, dst_stride);
+            dst += 4 * dst_stride;
+            SD4(d4, d5, d6, d7, dst, dst_stride);
+            dst += 4 * dst_stride;
+
+            /* rows 8-11 of the batch */
+            LD_UB4(src, src_stride, r0, r1, r2, r3);
+            src += 4 * src_stride;
+            d0 = __msa_copy_u_d((v2i64) r0, 0);
+            d1 = __msa_copy_u_d((v2i64) r1, 0);
+            d2 = __msa_copy_u_d((v2i64) r2, 0);
+            d3 = __msa_copy_u_d((v2i64) r3, 0);
+
+            SD4(d0, d1, d2, d3, dst, dst_stride);
+            dst += 4 * dst_stride;
+        }
+    } else if (height % 8 == 0) {
+        for (iter = height >> 3; iter--;) {
+            /* eight rows per pass */
+            LD_UB8(src, src_stride, r0, r1, r2, r3, r4, r5, r6, r7);
+            src += 8 * src_stride;
+
+            d0 = __msa_copy_u_d((v2i64) r0, 0);
+            d1 = __msa_copy_u_d((v2i64) r1, 0);
+            d2 = __msa_copy_u_d((v2i64) r2, 0);
+            d3 = __msa_copy_u_d((v2i64) r3, 0);
+            d4 = __msa_copy_u_d((v2i64) r4, 0);
+            d5 = __msa_copy_u_d((v2i64) r5, 0);
+            d6 = __msa_copy_u_d((v2i64) r6, 0);
+            d7 = __msa_copy_u_d((v2i64) r7, 0);
+
+            SD4(d0, d1, d2, d3, dst, dst_stride);
+            dst += 4 * dst_stride;
+            SD4(d4, d5, d6, d7, dst, dst_stride);
+            dst += 4 * dst_stride;
+        }
+    } else if (height % 4 == 0) {
+        for (iter = height / 4; iter--;) {
+            LD_UB4(src, src_stride, r0, r1, r2, r3);
+            src += 4 * src_stride;
+            d0 = __msa_copy_u_d((v2i64) r0, 0);
+            d1 = __msa_copy_u_d((v2i64) r1, 0);
+            d2 = __msa_copy_u_d((v2i64) r2, 0);
+            d3 = __msa_copy_u_d((v2i64) r3, 0);
+
+            SD4(d0, d1, d2, d3, dst, dst_stride);
+            dst += 4 * dst_stride;
+        }
+    } else if (height % 2 == 0) {
+        for (iter = height / 2; iter--;) {
+            LD_UB2(src, src_stride, r0, r1);
+            src += 2 * src_stride;
+            d0 = __msa_copy_u_d((v2i64) r0, 0);
+            d1 = __msa_copy_u_d((v2i64) r1, 0);
+
+            SD(d0, dst);
+            dst += dst_stride;
+            SD(d1, dst);
+            dst += dst_stride;
+        }
+    }
+}
+
+/* Copy a block as (width >> 4) columns of 16 bytes; every column is moved in
+ * (height >> 3) passes of eight rows, so leftover rows/columns are skipped. */
+static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  int32_t height, int32_t width)
+{
+    int32_t col, pass;
+    v16u8 r0, r1, r2, r3, r4, r5, r6, r7;
+
+    for (col = width >> 4; col--;) {
+        /* walk this column top to bottom with private cursors */
+        const uint8_t *src_col = src;
+        uint8_t *dst_col = dst;
+
+        for (pass = height >> 3; pass--;) {
+            LD_UB8(src_col, src_stride, r0, r1, r2, r3, r4, r5, r6, r7);
+            ST_UB8(r0, r1, r2, r3, r4, r5, r6, r7, dst_col, dst_stride);
+
+            src_col += 8 * src_stride;
+            dst_col += 8 * dst_stride;
+        }
+
+        /* next 16-byte column */
+        src += 16;
+        dst += 16;
+    }
+}
+
+/* Copy a 16-byte-wide block; heights not divisible by 4 copy nothing. */
+static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+    int32_t iter;
+    v16u8 r0, r1, r2, r3, r4, r5, r6, r7;
+
+    if (height % 12 == 0) {
+        for (iter = height / 12; iter--;) {
+            /* twelve rows per pass: eight, then the remaining four */
+            LD_UB8(src, src_stride, r0, r1, r2, r3, r4, r5, r6, r7);
+            ST_UB8(r0, r1, r2, r3, r4, r5, r6, r7, dst, dst_stride);
+            src += 8 * src_stride;
+            dst += 8 * dst_stride;
+
+            LD_UB4(src, src_stride, r0, r1, r2, r3);
+            ST_UB4(r0, r1, r2, r3, dst, dst_stride);
+            src += 4 * src_stride;
+            dst += 4 * dst_stride;
+        }
+    } else if (height % 8 == 0) {
+        /* one 16-byte column handed to the column/8-row copier */
+        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+    } else if (height % 4 == 0) {
+        for (iter = height >> 2; iter--;) {
+            LD_UB4(src, src_stride, r0, r1, r2, r3);
+            ST_UB4(r0, r1, r2, r3, dst, dst_stride);
+
+            src += 4 * src_stride;
+            dst += 4 * dst_stride;
+        }
+    }
+}
+
+/* Merge a 4-byte-wide source block into dst with AVER_UB*_UB. */
+static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+    int32_t iter;
+    uint32_t w0, w1, w2, w3;
+    v16u8 s0, s1, s2, s3;
+    v16u8 p0, p1, p2, p3;
+
+    if (height % 4 == 0) {
+        for (iter = height / 4; iter--;) {
+            LD_UB4(src, src_stride, s0, s1, s2, s3);
+            LD_UB4(dst, dst_stride, p0, p1, p2, p3);
+            src += 4 * src_stride;
+
+            /* results replace the destination-row vectors */
+            AVER_UB4_UB(s0, p0, s1, p1, s2, p2, s3, p3, p0, p1, p2, p3);
+
+            /* only word element 0 of each result is written back */
+            w0 = __msa_copy_u_w((v4i32) p0, 0);
+            w1 = __msa_copy_u_w((v4i32) p1, 0);
+            w2 = __msa_copy_u_w((v4i32) p2, 0);
+            w3 = __msa_copy_u_w((v4i32) p3, 0);
+            SW4(w0, w1, w2, w3, dst, dst_stride);
+            dst += 4 * dst_stride;
+        }
+    } else if (height % 2 == 0) {
+        for (iter = height / 2; iter--;) {
+            LD_UB2(src, src_stride, s0, s1);
+            LD_UB2(dst, dst_stride, p0, p1);
+            src += 2 * src_stride;
+
+            AVER_UB2_UB(s0, p0, s1, p1, p0, p1);
+
+            w0 = __msa_copy_u_w((v4i32) p0, 0);
+            w1 = __msa_copy_u_w((v4i32) p1, 0);
+
+            SW(w0, dst);
+            dst += dst_stride;
+            SW(w1, dst);
+            dst += dst_stride;
+        }
+    }
+}
+
+/* Merge an 8-byte-wide source block into dst, four rows per pass. */
+static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+    int32_t iter;
+    uint64_t d0, d1, d2, d3;
+    v16u8 s0, s1, s2, s3;
+    v16u8 p0, p1, p2, p3;
+
+    for (iter = height / 4; iter--;) {
+        LD_UB4(src, src_stride, s0, s1, s2, s3);
+        LD_UB4(dst, dst_stride, p0, p1, p2, p3);
+        src += 4 * src_stride;
+
+        /* AVER_UB4_UB output overwrites the destination-row vectors */
+        AVER_UB4_UB(s0, p0, s1, p1, s2, p2, s3, p3, p0, p1, p2, p3);
+
+        d0 = __msa_copy_u_d((v2i64) p0, 0);
+        d1 = __msa_copy_u_d((v2i64) p1, 0);
+        d2 = __msa_copy_u_d((v2i64) p2, 0);
+        d3 = __msa_copy_u_d((v2i64) p3, 0);
+        SD4(d0, d1, d2, d3, dst, dst_stride);
+        dst += 4 * dst_stride;
+    }
+}
+
+/* Merge a 16-byte-wide source block into dst, eight rows per pass. */
+static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+    int32_t iter;
+    v16u8 s0, s1, s2, s3, s4, s5, s6, s7;
+    v16u8 p0, p1, p2, p3, p4, p5, p6, p7;
+
+    for (iter = height / 8; iter--;) {
+        LD_UB8(src, src_stride, s0, s1, s2, s3, s4, s5, s6, s7);
+        LD_UB8(dst, dst_stride, p0, p1, p2, p3, p4, p5, p6, p7);
+        src += 8 * src_stride;
+
+        /* each AVER_UB4_UB result lands in the destination-row vectors */
+        AVER_UB4_UB(s0, p0, s1, p1, s2, p2, s3, p3, p0, p1, p2, p3);
+        AVER_UB4_UB(s4, p4, s5, p5, s6, p6, s7, p7, p4, p5, p6, p7);
+
+        ST_UB8(p0, p1, p2, p3, p4, p5, p6, p7, dst, dst_stride);
+        dst += 8 * dst_stride;
+    }
+}
+
+void ff_put_pixels16_msa(uint8_t *block, const uint8_t *pixels,
+                         ptrdiff_t line_size, int h)
+{   /* plain copy pixels -> block; line_size serves as both strides */
+    copy_width16_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h)
+{   /* x2: served by the hz_bil helper (presumably horizontal half-pel) */
+    common_hz_bil_16w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h)
+{   /* y2: served by the vt_bil helper (presumably vertical half-pel) */
+    common_vt_bil_16w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h)
+{   /* xy2: served by the hv_bil helper (presumably 2-D half-pel) */
+    common_hv_bil_16w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels8_msa(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int h)
+{   /* plain copy pixels -> block; line_size serves as both strides */
+    copy_width8_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h)
+{   /* x2: served by the hz_bil helper (presumably horizontal half-pel) */
+    common_hz_bil_8w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h)
+{   /* y2: served by the vt_bil helper (presumably vertical half-pel) */
+    common_vt_bil_8w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h)
+{   /* xy2: served by the hv_bil helper (presumably 2-D half-pel) */
+    common_hv_bil_8w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h)
+{   /* x2: served by the hz_bil helper (presumably horizontal half-pel) */
+    common_hz_bil_4w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h)
+{   /* y2: served by the vt_bil helper (presumably vertical half-pel) */
+    common_vt_bil_4w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_put_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h)
+{   /* xy2: served by the hv_bil helper (presumably 2-D half-pel) */
+    common_hv_bil_4w_msa(pixels, line_size, block, line_size, h);
+}
+
+/* Only h == 16 and h == 8 are served; any other height does nothing. */
+void ff_put_no_rnd_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
+                                   ptrdiff_t line_size, int h)
+{
+    if (h == 16)
+        common_hz_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
+    else if (h == 8)
+        common_hz_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
+}
+
+/* Only h == 16 and h == 8 are served; any other height does nothing. */
+void ff_put_no_rnd_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
+                                   ptrdiff_t line_size, int h)
+{
+    if (h == 16)
+        common_vt_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
+    else if (h == 8)
+        common_vt_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
+}
+
+/* Only h == 16 and h == 8 are served; any other height does nothing. */
+void ff_put_no_rnd_pixels16_xy2_msa(uint8_t *block,
+                                    const uint8_t *pixels,
+                                    ptrdiff_t line_size, int h)
+{
+    if (h == 16)
+        common_hv_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
+    else if (h == 8)
+        common_hv_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
+}
+
+/* Only h == 8 and h == 4 are served; any other height does nothing. */
+void ff_put_no_rnd_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
+                                  ptrdiff_t line_size, int h)
+{
+    if (h == 8)
+        common_hz_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
+    else if (h == 4)
+        common_hz_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
+}
+
+/* Only h == 8 and h == 4 are served; any other height does nothing. */
+void ff_put_no_rnd_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
+                                  ptrdiff_t line_size, int h)
+{
+    if (h == 8)
+        common_vt_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
+    else if (h == 4)
+        common_vt_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
+}
+
+/* Only h == 8 and h == 4 are served; any other height does nothing. */
+void ff_put_no_rnd_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                                   ptrdiff_t line_size, int h)
+{
+    if (h == 8)
+        common_hv_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
+    else if (h == 4)
+        common_hv_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
+}
+
+void ff_avg_pixels16_msa(uint8_t *block, const uint8_t *pixels,
+                         ptrdiff_t line_size, int h)
+{   /* merges pixels into block (see avg_width16_msa); one shared stride */
+    avg_width16_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h)
+{   /* x2: hz_bil_and_aver_dst helper (presumably interpolate, then avg) */
+    common_hz_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h)
+{   /* y2: vt_bil_and_aver_dst helper (presumably interpolate, then avg) */
+    common_vt_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h)
+{   /* xy2: hv_bil_and_aver_dst helper (presumably interpolate, then avg) */
+    common_hv_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels8_msa(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int h)
+{   /* merges pixels into block (see avg_width8_msa); one shared stride */
+    avg_width8_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h)
+{   /* x2: hz_bil_and_aver_dst helper (presumably interpolate, then avg) */
+    common_hz_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h)
+{   /* y2: vt_bil_and_aver_dst helper (presumably interpolate, then avg) */
+    common_vt_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h)
+{   /* xy2: hv_bil_and_aver_dst helper (presumably interpolate, then avg) */
+    common_hv_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels4_msa(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int h)
+{   /* merges pixels into block (see avg_width4_msa); one shared stride */
+    avg_width4_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h)
+{   /* x2: hz_bil_and_aver_dst helper (presumably interpolate, then avg) */
+    common_hz_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h)
+{   /* y2: vt_bil_and_aver_dst helper (presumably interpolate, then avg) */
+    common_vt_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);
+}
+
+void ff_avg_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h)
+{   /* xy2: hv_bil_and_aver_dst helper (presumably interpolate, then avg) */
+    common_hv_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);
+}
diff --git a/libavcodec/mips/idctdsp_init_mips.c b/libavcodec/mips/idctdsp_init_mips.c
new file mode 100644
index 0000000..8c26bca
--- /dev/null
+++ b/libavcodec/mips/idctdsp_init_mips.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *                    Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "idctdsp_mips.h"
+
+#if HAVE_MSA
+/* MSA setup: clamp helpers always; IDCT only in the case tested below. */
+static av_cold void idctdsp_init_msa(IDCTDSPContext *c, AVCodecContext *avctx,
+                                     unsigned high_bit_depth)
+{
+    if (avctx->idct_algo == FF_IDCT_AUTO && avctx->lowres != 1 &&
+        avctx->lowres != 2 && avctx->lowres != 3 &&
+        avctx->bits_per_raw_sample != 10 && avctx->bits_per_raw_sample != 12) {
+        c->idct_put  = ff_simple_idct_put_msa;
+        c->idct_add  = ff_simple_idct_add_msa;
+        c->idct      = ff_simple_idct_msa;
+        c->perm_type = FF_IDCT_PERM_NONE;
+    }
+
+    c->put_pixels_clamped        = ff_put_pixels_clamped_msa;
+    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_msa;
+    c->add_pixels_clamped        = ff_add_pixels_clamped_msa;
+}
+#endif  // #if HAVE_MSA
+
+#if HAVE_MMI
+/* MMI setup: clamp helpers always; IDCT only in the case tested below. */
+static av_cold void idctdsp_init_mmi(IDCTDSPContext *c, AVCodecContext *avctx,
+                                     unsigned high_bit_depth)
+{
+    if (avctx->idct_algo == FF_IDCT_AUTO && avctx->lowres != 1 &&
+        avctx->lowres != 2 && avctx->lowres != 3 &&
+        avctx->bits_per_raw_sample != 10 && avctx->bits_per_raw_sample != 12) {
+        /* NOTE(review): idct_put/idct_add stay as found - intended? */
+        c->idct      = ff_simple_idct_mmi;
+        c->perm_type = FF_IDCT_PERM_NONE;
+    }
+    c->put_pixels_clamped        = ff_put_pixels_clamped_mmi;
+    c->add_pixels_clamped        = ff_add_pixels_clamped_mmi;
+    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmi;
+}
+#endif /* HAVE_MMI */
+
+av_cold void ff_idctdsp_init_mips(IDCTDSPContext *c, AVCodecContext *avctx,
+                          unsigned high_bit_depth)
+{   /* MMI runs after MSA, so pointers set by both end up as the MMI ones */
+#if HAVE_MSA
+    idctdsp_init_msa(c, avctx, high_bit_depth);
+#endif /* HAVE_MSA */
+#if HAVE_MMI
+    idctdsp_init_mmi(c, avctx, high_bit_depth);
+#endif /* HAVE_MMI */
+}
diff --git a/libavcodec/mips/idctdsp_mips.h b/libavcodec/mips/idctdsp_mips.h
new file mode 100644
index 0000000..19267e6
--- /dev/null
+++ b/libavcodec/mips/idctdsp_mips.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *                    Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_IDCTDSP_MIPS_H
+#define AVCODEC_MIPS_IDCTDSP_MIPS_H
+
+#include "../mpegvideo.h"
+
+void ff_put_pixels_clamped_msa(const int16_t *block,
+                               uint8_t *av_restrict pixels,
+                               ptrdiff_t line_size);
+void ff_put_signed_pixels_clamped_msa(const int16_t *block,
+                                      uint8_t *av_restrict pixels,
+                                      ptrdiff_t line_size);
+void ff_add_pixels_clamped_msa(const int16_t *block,
+                               uint8_t *av_restrict pixels,
+                               ptrdiff_t line_size);
+void ff_j_rev_dct_msa(int16_t *data);
+void ff_jref_idct_put_msa(uint8_t *dest, int32_t stride, int16_t *block);
+void ff_jref_idct_add_msa(uint8_t *dest, int32_t stride, int16_t *block);
+void ff_simple_idct_msa(int16_t *block);
+void ff_simple_idct_put_msa(uint8_t *dest, int32_t stride_dst, int16_t *block);
+void ff_simple_idct_add_msa(uint8_t *dest, int32_t stride_dst, int16_t *block);
+/* MMI variants (init installs only ff_simple_idct_mmi of the IDCT trio). */
+void ff_put_pixels_clamped_mmi(const int16_t *block,
+        uint8_t *av_restrict pixels, ptrdiff_t line_size);
+void ff_put_signed_pixels_clamped_mmi(const int16_t *block,
+        uint8_t *av_restrict pixels, ptrdiff_t line_size);
+void ff_add_pixels_clamped_mmi(const int16_t *block,
+        uint8_t *av_restrict pixels, ptrdiff_t line_size);
+void ff_simple_idct_mmi(int16_t *block);
+void ff_simple_idct_put_mmi(uint8_t *dest, int32_t line_size, int16_t *block);
+void ff_simple_idct_add_mmi(uint8_t *dest, int32_t line_size, int16_t *block);
+
+#endif  // #ifndef AVCODEC_MIPS_IDCTDSP_MIPS_H
diff --git a/libavcodec/mips/idctdsp_mmi.c b/libavcodec/mips/idctdsp_mmi.c
new file mode 100644
index 0000000..24beb62
--- /dev/null
+++ b/libavcodec/mips/idctdsp_mmi.c
@@ -0,0 +1,208 @@
+/*
+ * Loongson SIMD optimized idctdsp
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "idctdsp_mips.h"
+#include "constants.h"
+#include "libavutil/mips/asmdefs.h"
+
+void ff_put_pixels_clamped_mmi(const int16_t *block,
+        uint8_t *av_restrict pixels, ptrdiff_t line_size)
+{   /* writes eight 8-byte rows from block, four rows per asm statement */
+    double ftmp[8];
+    mips_reg addr[1];
+    /* rows 0-3: 64 bytes of coefficients in, four 8-byte rows out */
+    __asm__ volatile (
+        "ldc1       %[ftmp0],   0x00(%[block])                          \n\t"
+        "ldc1       %[ftmp1],   0x08(%[block])                          \n\t"
+        "ldc1       %[ftmp2],   0x10(%[block])                          \n\t"
+        "ldc1       %[ftmp3],   0x18(%[block])                          \n\t"
+        "ldc1       %[ftmp4],   0x20(%[block])                          \n\t"
+        "ldc1       %[ftmp5],   0x28(%[block])                          \n\t"
+        "ldc1       %[ftmp6],   0x30(%[block])                          \n\t"
+        "ldc1       %[ftmp7],   0x38(%[block])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
+        "packushb   %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        "sdc1       %[ftmp0],   0x00(%[pixels])                         \n\t"
+        "sdc1       %[ftmp2],   0x00(%[addr0])                          \n\t"
+        "gssdxc1    %[ftmp4],   0x00(%[addr0],  %[line_size])           \n\t"
+        "gssdxc1    %[ftmp6],   0x00(%[pixels], %[line_sizex3])         \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [addr0]"=&r"(addr[0]),
+          [pixels]"+&r"(pixels)
+        : [line_size]"r"((mips_reg)line_size),
+          [line_sizex3]"r"((mips_reg)(line_size*3)),
+          [block]"r"(block)
+        : "memory"
+    );
+    /* step to rows 4-7: block is int16_t, so +32 elements is +64 bytes */
+    pixels += line_size*4;
+    block += 32;
+    /* rows 4-7: same instruction sequence as for rows 0-3 */
+    __asm__ volatile (
+        "ldc1       %[ftmp0],   0x00(%[block])                          \n\t"
+        "ldc1       %[ftmp1],   0x08(%[block])                          \n\t"
+        "ldc1       %[ftmp2],   0x10(%[block])                          \n\t"
+        "ldc1       %[ftmp3],   0x18(%[block])                          \n\t"
+        "ldc1       %[ftmp4],   0x20(%[block])                          \n\t"
+        "ldc1       %[ftmp5],   0x28(%[block])                          \n\t"
+        "ldc1       %[ftmp6],   0x30(%[block])                          \n\t"
+        "ldc1       %[ftmp7],   0x38(%[block])                          \n\t"
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
+        "packushb   %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        "sdc1       %[ftmp0],   0x00(%[pixels])                         \n\t"
+        "sdc1       %[ftmp2],   0x00(%[addr0])                          \n\t"
+        "gssdxc1    %[ftmp4],   0x00(%[addr0],  %[line_size])           \n\t"
+        "gssdxc1    %[ftmp6],   0x00(%[pixels], %[line_sizex3])         \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [addr0]"=&r"(addr[0]),
+          [pixels]"+&r"(pixels)
+        : [line_size]"r"((mips_reg)line_size),
+          [line_sizex3]"r"((mips_reg)(line_size*3)),
+          [block]"r"(block)
+        : "memory"
+    );
+}
+
+void ff_put_signed_pixels_clamped_mmi(const int16_t *block,
+    uint8_t *av_restrict pixels, ptrdiff_t line_size)
+{   /* packsshb each row pair, add ff_pb_80 per byte, store eight rows */
+    int64_t line_skip = line_size;
+    int64_t line_skip3 = 0;  /* the asm sets it to 2x, later 3x line_skip */
+    double ftmp[5];
+    mips_reg addr[1];
+    /* one asm statement for all rows; pixels advances 4 rows halfway through */
+    __asm__ volatile (
+        PTR_ADDU   "%[line_skip3],  %[line_skip],   %[line_skip]        \n\t"
+        "ldc1       %[ftmp1],       0x00(%[block])                      \n\t"
+        "ldc1       %[ftmp0],       0x08(%[block])                      \n\t"
+        "packsshb   %[ftmp1],       %[ftmp1],       %[ftmp0]            \n\t"
+        "ldc1       %[ftmp2],       0x10(%[block])                      \n\t"
+        "ldc1       %[ftmp0],       0x18(%[block])                      \n\t"
+        "packsshb   %[ftmp2],       %[ftmp2],       %[ftmp0]            \n\t"
+        "ldc1       %[ftmp3],       0x20(%[block])                      \n\t"
+        "ldc1       %[ftmp0],       0x28(%[block])                      \n\t"
+        "packsshb   %[ftmp3],       %[ftmp3],       %[ftmp0]            \n\t"
+        "ldc1       %[ftmp4],       48(%[block])                        \n\t"
+        "ldc1       %[ftmp0],       56(%[block])                        \n\t"
+        "packsshb   %[ftmp4],       %[ftmp4],       %[ftmp0]            \n\t"
+        "paddb      %[ftmp1],       %[ftmp1],       %[ff_pb_80]         \n\t"
+        "paddb      %[ftmp2],       %[ftmp2],       %[ff_pb_80]         \n\t"
+        "paddb      %[ftmp3],       %[ftmp3],       %[ff_pb_80]         \n\t"
+        "paddb      %[ftmp4],       %[ftmp4],       %[ff_pb_80]         \n\t"
+        "sdc1       %[ftmp1],       0x00(%[pixels])                     \n\t"
+        "gssdxc1    %[ftmp2],       0x00(%[pixels], %[line_skip])       \n\t"
+        "gssdxc1    %[ftmp3],       0x00(%[pixels], %[line_skip3])      \n\t"
+        PTR_ADDU   "%[line_skip3],  %[line_skip3],  %[line_skip]        \n\t"
+        "gssdxc1    %[ftmp4],       0x00(%[pixels], %[line_skip3])      \n\t"
+        PTR_ADDU   "%[addr0],       %[line_skip3],  %[line_skip]        \n\t"
+        PTR_ADDU   "%[pixels],      %[pixels],      %[addr0]            \n\t"
+        "ldc1       %[ftmp1],       0x40(%[block])                      \n\t"
+        "ldc1       %[ftmp0],       0x48(%[block])                      \n\t"
+        "packsshb   %[ftmp1],       %[ftmp1],       %[ftmp0]            \n\t"
+        "ldc1       %[ftmp2],       0x50(%[block])                      \n\t"
+        "ldc1       %[ftmp0],       0x58(%[block])                      \n\t"
+        "packsshb   %[ftmp2],       %[ftmp2],       %[ftmp0]            \n\t"
+        "ldc1       %[ftmp3],       0x60(%[block])                      \n\t"
+        "ldc1       %[ftmp0],       0x68(%[block])                      \n\t"
+        "packsshb   %[ftmp3],       %[ftmp3],       %[ftmp0]            \n\t"
+        "ldc1       %[ftmp4],       0x70(%[block])                      \n\t"
+        "ldc1       %[ftmp0],       0x78(%[block])                      \n\t"
+        "packsshb   %[ftmp4],       %[ftmp4],       %[ftmp0]            \n\t"
+        "paddb      %[ftmp1],       %[ftmp1],       %[ff_pb_80]         \n\t"
+        "paddb      %[ftmp2],       %[ftmp2],       %[ff_pb_80]         \n\t"
+        "paddb      %[ftmp3],       %[ftmp3],       %[ff_pb_80]         \n\t"
+        "paddb      %[ftmp4],       %[ftmp4],       %[ff_pb_80]         \n\t"
+        "sdc1       %[ftmp1],       0x00(%[pixels])                     \n\t"
+        "gssdxc1    %[ftmp2],       0x00(%[pixels], %[line_skip])       \n\t"
+        PTR_ADDU   "%[addr0],       %[line_skip],   %[line_skip]        \n\t"
+        "gssdxc1    %[ftmp3],       0x00(%[pixels], %[addr0])           \n\t"
+        "gssdxc1    %[ftmp4],       0x00(%[pixels], %[line_skip3])      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),
+          [addr0]"=&r"(addr[0]),
+          [pixels]"+&r"(pixels),            [line_skip3]"+&r"(line_skip3)
+        : [block]"r"(block),
+          [line_skip]"r"((mips_reg)line_skip),
+          [ff_pb_80]"f"(ff_pb_80)
+        : "memory"
+    );
+}
+
+void ff_add_pixels_clamped_mmi(const int16_t *block,
+        uint8_t *av_restrict pixels, ptrdiff_t line_size)
+{   /* adds block coefficients to the bytes at pixels and stores them back */
+    double ftmp[8];
+    uint64_t tmp[1];
+    /* tmp0 counts 4 passes; each pass covers two pixel rows (block += 0x20) */
+    __asm__ volatile (
+        "li         %[tmp0],    0x04                                    \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "1:                                                             \n\t"
+        "ldc1       %[ftmp1],   0x00(%[block])                          \n\t"
+        "ldc1       %[ftmp2],   0x08(%[block])                          \n\t"
+        "ldc1       %[ftmp3],   0x10(%[block])                          \n\t"
+        "ldc1       %[ftmp4],   0x18(%[block])                          \n\t"
+        "ldc1       %[ftmp5],   0x00(%[pixels])                         \n\t"
+        "gsldxc1    %[ftmp6],   0x00(%[pixels], %[line_size])           \n\t"
+        "mov.d      %[ftmp7],   %[ftmp5]                                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "mov.d      %[ftmp7],   %[ftmp6]                                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
+        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "sdc1       %[ftmp1],   0x00(%[pixels])                         \n\t"
+        "gssdxc1    %[ftmp3],   0x00(%[pixels], %[line_size])           \n\t"
+        "addi       %[tmp0],    %[tmp0],        -0x01                   \n\t"
+        PTR_ADDIU  "%[block],   %[block],       0x20                    \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
+        "bnez       %[tmp0],    1b"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [tmp0]"=&r"(tmp[0]),
+          [pixels]"+&r"(pixels),            [block]"+&r"(block)
+        : [line_size]"r"((mips_reg)line_size)
+        : "memory"
+    );
+}
diff --git a/libavcodec/mips/idctdsp_msa.c b/libavcodec/mips/idctdsp_msa.c
new file mode 100644
index 0000000..b29e420
--- /dev/null
+++ b/libavcodec/mips/idctdsp_msa.c
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "idctdsp_mips.h"
+
+static void put_pixels_clamped_msa(const int16_t *block, uint8_t *pixels,
+                                   int32_t stride)
+{
+    uint64_t in0_d, in1_d, in2_d, in3_d, in4_d, in5_d, in6_d, in7_d;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    /* Writes an 8-row block of int16 values to pixels as bytes; LD_SH8 presumably loads rows 8 elements apart. */
+    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    CLIP_SH4_0_255(in0, in1, in2, in3); /* presumably clamps each lane to [0, 255] - see generic_macros_msa.h */
+    CLIP_SH4_0_255(in4, in5, in6, in7);
+    PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3, in0, in1, in2, in3);
+    PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7, in4, in5, in6, in7);
+    /* Only doubleword element 0 of each packed vector is kept: 8 bytes per output row. */
+    in0_d = __msa_copy_u_d((v2i64) in0, 0);
+    in1_d = __msa_copy_u_d((v2i64) in1, 0);
+    in2_d = __msa_copy_u_d((v2i64) in2, 0);
+    in3_d = __msa_copy_u_d((v2i64) in3, 0);
+    in4_d = __msa_copy_u_d((v2i64) in4, 0);
+    in5_d = __msa_copy_u_d((v2i64) in5, 0);
+    in6_d = __msa_copy_u_d((v2i64) in6, 0);
+    in7_d = __msa_copy_u_d((v2i64) in7, 0);
+    SD4(in0_d, in1_d, in2_d, in3_d, pixels, stride); /* rows 0-3 */
+    pixels += 4 * stride;
+    SD4(in4_d, in5_d, in6_d, in7_d, pixels, stride); /* rows 4-7 */
+}
+
+static void put_signed_pixels_clamped_msa(const int16_t *block, uint8_t *pixels,
+                                          int32_t stride)
+{
+    uint64_t in0_d, in1_d, in2_d, in3_d, in4_d, in5_d, in6_d, in7_d;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    /* Same as the unsigned put, except every value is offset by +128 before clamping. */
+    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    /* Add 128 to every lane of each row vector. */
+    in0 += 128;
+    in1 += 128;
+    in2 += 128;
+    in3 += 128;
+    in4 += 128;
+    in5 += 128;
+    in6 += 128;
+    in7 += 128;
+    /* CLIP_SH4_0_255 presumably clamps each lane to [0, 255] - see generic_macros_msa.h. */
+    CLIP_SH4_0_255(in0, in1, in2, in3);
+    CLIP_SH4_0_255(in4, in5, in6, in7);
+    PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3, in0, in1, in2, in3);
+    PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7, in4, in5, in6, in7);
+    /* Only doubleword element 0 of each packed vector is kept: 8 bytes per output row. */
+    in0_d = __msa_copy_u_d((v2i64) in0, 0);
+    in1_d = __msa_copy_u_d((v2i64) in1, 0);
+    in2_d = __msa_copy_u_d((v2i64) in2, 0);
+    in3_d = __msa_copy_u_d((v2i64) in3, 0);
+    in4_d = __msa_copy_u_d((v2i64) in4, 0);
+    in5_d = __msa_copy_u_d((v2i64) in5, 0);
+    in6_d = __msa_copy_u_d((v2i64) in6, 0);
+    in7_d = __msa_copy_u_d((v2i64) in7, 0);
+    SD4(in0_d, in1_d, in2_d, in3_d, pixels, stride); /* rows 0-3 */
+    pixels += 4 * stride;
+    SD4(in4_d, in5_d, in6_d, in7_d, pixels, stride); /* rows 4-7 */
+}
+
+static void add_pixels_clamped_msa(const int16_t *block, uint8_t *pixels,
+                                   int32_t stride)
+{
+    uint64_t in0_d, in1_d, in2_d, in3_d, in4_d, in5_d, in6_d, in7_d;
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16u8 pix_in0, pix_in1, pix_in2, pix_in3;
+    v16u8 pix_in4, pix_in5, pix_in6, pix_in7;
+    v8u16 pix0, pix1, pix2, pix3, pix4, pix5, pix6, pix7;
+    v8i16 zero = { 0 };
+    /* Adds the int16 block to the existing pixels and writes the clamped sums back in place. */
+    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    LD_UB8(pixels, stride, pix_in0, pix_in1, pix_in2,
+           pix_in3, pix_in4, pix_in5, pix_in6, pix_in7);
+    /* NOTE(review): pix_in* are 16-byte vectors but only 8 bytes per row are stored; confirm LD_UB8's read width. */
+    ILVR_B4_UH(zero, pix_in0, zero, pix_in1, zero, pix_in2, zero, pix_in3,
+               pix0, pix1, pix2, pix3);
+    ILVR_B4_UH(zero, pix_in4, zero, pix_in5, zero, pix_in6, zero, pix_in7,
+               pix4, pix5, pix6, pix7);
+    /* Interleaving with zero presumably widens pixel bytes to 16 bits; add them lane-wise to the block rows. */
+    in0 += (v8i16) pix0;
+    in1 += (v8i16) pix1;
+    in2 += (v8i16) pix2;
+    in3 += (v8i16) pix3;
+    in4 += (v8i16) pix4;
+    in5 += (v8i16) pix5;
+    in6 += (v8i16) pix6;
+    in7 += (v8i16) pix7;
+    /* CLIP_SH4_0_255 presumably clamps each lane to [0, 255] - see generic_macros_msa.h. */
+    CLIP_SH4_0_255(in0, in1, in2, in3);
+    CLIP_SH4_0_255(in4, in5, in6, in7);
+    PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3, in0, in1, in2, in3);
+    PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7, in4, in5, in6, in7);
+    /* Only doubleword element 0 of each packed vector is kept: 8 bytes per output row. */
+    in0_d = __msa_copy_u_d((v2i64) in0, 0);
+    in1_d = __msa_copy_u_d((v2i64) in1, 0);
+    in2_d = __msa_copy_u_d((v2i64) in2, 0);
+    in3_d = __msa_copy_u_d((v2i64) in3, 0);
+    in4_d = __msa_copy_u_d((v2i64) in4, 0);
+    in5_d = __msa_copy_u_d((v2i64) in5, 0);
+    in6_d = __msa_copy_u_d((v2i64) in6, 0);
+    in7_d = __msa_copy_u_d((v2i64) in7, 0);
+    SD4(in0_d, in1_d, in2_d, in3_d, pixels, stride); /* rows 0-3 */
+    pixels += 4 * stride;
+    SD4(in4_d, in5_d, in6_d, in7_d, pixels, stride); /* rows 4-7 */
+}
+
+void ff_put_pixels_clamped_msa(const int16_t *block,
+                               uint8_t *av_restrict pixels,
+                               ptrdiff_t line_size)
+{ /* Exported wrapper around the static helper of the same name. */
+    put_pixels_clamped_msa(block, pixels, line_size); /* NOTE(review): ptrdiff_t narrows to the helper's int32_t stride */
+}
+
+void ff_put_signed_pixels_clamped_msa(const int16_t *block,
+                                      uint8_t *av_restrict pixels,
+                                      ptrdiff_t line_size)
+{ /* Exported wrapper around the static helper of the same name. */
+    put_signed_pixels_clamped_msa(block, pixels, line_size); /* NOTE(review): ptrdiff_t narrows to the helper's int32_t stride */
+}
+
+void ff_add_pixels_clamped_msa(const int16_t *block,
+                               uint8_t *av_restrict pixels,
+                               ptrdiff_t line_size)
+{ /* Exported wrapper around the static helper of the same name. */
+    add_pixels_clamped_msa(block, pixels, line_size); /* NOTE(review): ptrdiff_t narrows to the helper's int32_t stride */
+}
diff --git a/libavcodec/mips/iirfilter_mips.c b/libavcodec/mips/iirfilter_mips.c
new file mode 100644
index 0000000..87db9ff
--- /dev/null
+++ b/libavcodec/mips/iirfilter_mips.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Bojan Zivkovic (bojan@mips.com)
+ *
+ * IIR filter optimized for MIPS floating-point architecture
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+ /**
+ * @file
+ * Reference: libavcodec/iirfilter.c
+ */
+
+#include "libavcodec/iirfilter.h"
+
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+typedef struct FFIIRFilterCoeffs { /* NOTE(review): presumably must mirror the private layout in libavcodec/iirfilter.c - confirm */
+    int   order; /* filter order; selects the code path in ff_iir_filter_flt_mips */
+    float gain;  /* multiplied with every input sample */
+    int   *cx;   /* integer weights applied to state values when forming an output sample */
+    float *cy;   /* weights applied to state values when forming the new state value */
+} FFIIRFilterCoeffs;
+
+typedef struct FFIIRFilterState { /* NOTE(review): presumably must mirror the private layout in libavcodec/iirfilter.c - confirm */
+    float x[1]; /* filter history; the filter indexes x[0..order-1], so real storage extends past this declaration */
+} FFIIRFilterState;
+
+static void ff_iir_filter_flt_mips(const struct FFIIRFilterCoeffs *c,
+                                   struct FFIIRFilterState *s, int size,
+                                   const float *src, int sstep, float *dst, int dstep)
+{ /* Float IIR filter over size samples (src stride sstep, dst stride dstep); path chosen by c->order. */
+    if (c->order == 2) { /* second order: plain C, one sample per iteration */
+        int i;
+        const float *src0 = src;
+        float       *dst0 = dst;
+        for (i = 0; i < size; i++) {
+            float in = *src0 * c->gain  + s->x[0] * c->cy[0] + s->x[1] * c->cy[1];
+            *dst0 = s->x[0] + in + s->x[1] * c->cx[1];
+            s->x[0] = s->x[1];
+            s->x[1] = in;
+            src0 += sstep;
+            dst0 += dstep;
+        }
+    } else if (c->order == 4) { /* fourth order: FPU asm, four samples per iteration (i += 4) */
+        int i;
+        const float *src0 = src;
+        float       *dst0 = dst;
+        float four = 4.0;
+        float six  = 6.0;
+        for (i = 0; i < size; i += 4) {
+            float in1, in2, in3, in4;
+            float res1, res2, res3, res4;
+            float *x  = s->x;
+            float *cy = c->cy;
+            float gain = c->gain;
+            float src0_0 = src0[0      ];
+            float src0_1 = src0[sstep  ];
+            float src0_2 = src0[2*sstep];
+            float src0_3 = src0[3*sstep];
+            /* The asm loads x[0..3] into $f4-$f7 and stores the four new state values in1..in4 back to x[0..3]. */
+            __asm__ volatile (
+                "lwc1   $f0,        0(%[cy])                    \n\t"
+                "lwc1   $f4,        0(%[x])                     \n\t"
+                "lwc1   $f5,        4(%[x])                     \n\t"
+                "lwc1   $f6,        8(%[x])                     \n\t"
+                "lwc1   $f7,        12(%[x])                    \n\t"
+                "mul.s  %[in1],     %[src0_0],  %[gain]         \n\t"
+                "mul.s  %[in2],     %[src0_1],  %[gain]         \n\t"
+                "mul.s  %[in3],     %[src0_2],  %[gain]         \n\t"
+                "mul.s  %[in4],     %[src0_3],  %[gain]         \n\t"
+                "lwc1   $f1,        4(%[cy])                    \n\t"
+                "madd.s %[in1],     %[in1],     $f0,    $f4     \n\t"
+                "madd.s %[in2],     %[in2],     $f0,    $f5     \n\t"
+                "madd.s %[in3],     %[in3],     $f0,    $f6     \n\t"
+                "madd.s %[in4],     %[in4],     $f0,    $f7     \n\t"
+                "lwc1   $f2,        8(%[cy])                    \n\t"
+                "madd.s %[in1],     %[in1],     $f1,    $f5     \n\t"
+                "madd.s %[in2],     %[in2],     $f1,    $f6     \n\t"
+                "madd.s %[in3],     %[in3],     $f1,    $f7     \n\t"
+                "lwc1   $f3,        12(%[cy])                   \n\t"
+                "add.s  $f8,        $f5,        $f7             \n\t"
+                "madd.s %[in1],     %[in1],     $f2,    $f6     \n\t"
+                "madd.s %[in2],     %[in2],     $f2,    $f7     \n\t"
+                "mul.s  $f9,        $f6,        %[six]          \n\t"
+                "mul.s  $f10,       $f7,        %[six]          \n\t"
+                "madd.s %[in1],     %[in1],     $f3,    $f7     \n\t"
+                "madd.s %[in2],     %[in2],     $f3,    %[in1]  \n\t"
+                "madd.s %[in3],     %[in3],     $f2,    %[in1]  \n\t"
+                "madd.s %[in4],     %[in4],     $f1,    %[in1]  \n\t"
+                "add.s  %[res1],    $f4,        %[in1]          \n\t"
+                "swc1   %[in1],     0(%[x])                     \n\t"
+                "add.s  $f0,        $f6,        %[in1]          \n\t"
+                "madd.s %[in3],     %[in3],     $f3,    %[in2]  \n\t"
+                "madd.s %[in4],     %[in4],     $f2,    %[in2]  \n\t"
+                "add.s  %[res2],    $f5,        %[in2]          \n\t"
+                "madd.s %[res1],    %[res1],    $f8,    %[four] \n\t"
+                "add.s  $f8,        $f7,        %[in2]          \n\t"
+                "swc1   %[in2],     4(%[x])                     \n\t"
+                "madd.s %[in4],     %[in4],     $f3,    %[in3]  \n\t"
+                "add.s  %[res3],    $f6,        %[in3]          \n\t"
+                "add.s  %[res1],    %[res1],    $f9             \n\t"
+                "madd.s %[res2],    %[res2],    $f0,    %[four] \n\t"
+                "swc1   %[in3],     8(%[x])                     \n\t"
+                "add.s  %[res4],    $f7,        %[in4]          \n\t"
+                "madd.s %[res3],    %[res3],    $f8,    %[four] \n\t"
+                "swc1   %[in4],     12(%[x])                    \n\t"
+                "add.s  %[res2],    %[res2],    $f10            \n\t"
+                "add.s  $f8,        %[in1],     %[in3]          \n\t"
+                "madd.s %[res3],    %[res3],    %[in1], %[six]  \n\t"
+                "madd.s %[res4],    %[res4],    $f8,    %[four] \n\t"
+                "madd.s %[res4],    %[res4],    %[in2], %[six]  \n\t"
+
+                : [in1]"=&f"(in1), [in2]"=&f"(in2),
+                  [in3]"=&f"(in3), [in4]"=&f"(in4),
+                  [res1]"=&f"(res1), [res2]"=&f"(res2),
+                  [res3]"=&f"(res3), [res4]"=&f"(res4)
+                : [src0_0]"f"(src0_0), [src0_1]"f"(src0_1),
+                  [src0_2]"f"(src0_2), [src0_3]"f"(src0_3),
+                  [gain]"f"(gain), [x]"r"(x), [cy]"r"(cy),
+                  [four]"f"(four), [six]"f"(six)
+                : "$f0", "$f1", "$f2", "$f3",
+                  "$f4", "$f5", "$f6", "$f7",
+                  "$f8", "$f9", "$f10",
+                  "memory"
+            );
+            /* Outputs use the destination stride dstep, consistent with dst0 += 4*dstep below. */
+            dst0[0      ] = res1;
+            dst0[dstep  ] = res2;
+            dst0[2*dstep] = res3;
+            dst0[3*dstep] = res4;
+
+            src0 += 4*sstep;
+            dst0 += 4*dstep;
+        }
+    } else { /* any other order: generic C loops over c->order taps */
+        int i;
+        const float *src0 = src;
+        float       *dst0 = dst;
+        for (i = 0; i < size; i++) {
+            int j;
+            float in, res;
+            in = *src0 * c->gain;
+            for(j = 0; j < c->order; j++)
+                in += c->cy[j] * s->x[j];
+            res = s->x[0] + in + s->x[c->order >> 1] * c->cx[c->order >> 1];
+            for(j = 1; j < c->order >> 1; j++)
+                res += (s->x[j] + s->x[c->order - j]) * c->cx[j];
+            for(j = 0; j < c->order - 1; j++)
+                s->x[j] = s->x[j + 1];
+            *dst0 = res;
+            s->x[c->order - 1] = in;
+            src0 += sstep;
+            dst0 += dstep;
+        }
+    }
+}
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM */
+
+void ff_iir_filter_init_mips(FFIIRFilterContext *f) { /* installs the MIPS float filter when the build allows it */
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+    f->filter_flt = ff_iir_filter_flt_mips; /* only the filter_flt entry is overridden; otherwise f is left untouched */
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/libavcodec/mips/lsp_mips.h b/libavcodec/mips/lsp_mips.h
new file mode 100644
index 0000000..6219c5a
--- /dev/null
+++ b/libavcodec/mips/lsp_mips.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Nedeljko Babic (nbabic@mips.com)
+ *
+ * LSP routines for ACELP-based codecs optimized for MIPS
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/lsp.c
+ */
+#ifndef AVCODEC_MIPS_LSP_MIPS_H
+#define AVCODEC_MIPS_LSP_MIPS_H
+
+#if HAVE_MIPSFPU && HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+#include "libavutil/mips/asmdefs.h"
+
+static av_always_inline void ff_lsp2polyf_mips(const double *lsp, double *f, int lp_half_order)
+{
+    int i, j = 0;
+    double * p_fi = f;
+    double * p_f = 0;
+    /* Fills f[0..lp_half_order] from every second lsp entry; f[0] and f[1] are seeded directly. */
+    f[0] = 1.0;
+    f[1] = -2 * lsp[0];
+    lsp -= 2;
+    /* lsp was moved back by two, so lsp[2*i] below reads original entry 2*i - 2. */
+    for(i=2; i<=lp_half_order; i++)
+    {
+        double tmp, f_j_2, f_j_1, f_j;
+        double val = lsp[2*i];
+        /* asm: val = -2*val; f[i] = 2*f[i-2] + val*f[i-1]; then j = i-2 passes of f[k] += val*f[k-1] + f[k-2], k = i-1 down to 2. */
+        __asm__ volatile(
+            "move   %[p_f],     %[p_fi]                         \n\t"
+            "add.d  %[val],     %[val],     %[val]              \n\t"
+            PTR_ADDIU "%[p_fi], 8                               \n\t"
+            "ldc1   %[f_j_1],   0(%[p_f])                       \n\t"
+            "ldc1   %[f_j],     8(%[p_f])                       \n\t"
+            "neg.d  %[val],     %[val]                          \n\t"
+            "add.d  %[tmp],     %[f_j_1],   %[f_j_1]            \n\t"
+            "madd.d %[tmp],     %[tmp],     %[f_j], %[val]      \n\t"
+            "addiu  %[j],       %[i], -2                        \n\t"
+            "ldc1   %[f_j_2],   -8(%[p_f])                      \n\t" /* NOTE(review): reads f[-1] when i == 2; value unused */
+            "sdc1   %[tmp],     16(%[p_f])                      \n\t"
+            "beqz   %[j],       ff_lsp2polyf_lp_j_end%=         \n\t"
+            "ff_lsp2polyf_lp_j%=:                               \n\t"
+            "add.d  %[tmp],     %[f_j],     %[f_j_2]            \n\t"
+            "madd.d %[tmp],     %[tmp],     %[f_j_1], %[val]    \n\t"
+            "mov.d  %[f_j],     %[f_j_1]                        \n\t"
+            "addiu  %[j],       -1                              \n\t"
+            "mov.d  %[f_j_1],   %[f_j_2]                        \n\t"
+            "ldc1   %[f_j_2],   -16(%[p_f])                     \n\t" /* NOTE(review): look-ahead load; last pass reads f[-1], unused */
+            "sdc1   %[tmp],     8(%[p_f])                       \n\t"
+            PTR_ADDIU "%[p_f], -8                              \n\t"
+            "bgtz   %[j],       ff_lsp2polyf_lp_j%=             \n\t"
+            "ff_lsp2polyf_lp_j_end%=:                           \n\t"
+
+            : [f_j_2]"=&f"(f_j_2), [f_j_1]"=&f"(f_j_1), [val]"+f"(val),
+              [tmp]"=&f"(tmp), [f_j]"=&f"(f_j), [p_f]"+r"(p_f),
+              [j]"+r"(j), [p_fi]"+r"(p_fi)
+            : [i]"r"(i)
+            : "memory"
+        );
+        f[1] += val; /* val was negated and doubled inside the asm (it is a read-write operand) */
+    }
+}
+#define ff_lsp2polyf ff_lsp2polyf_mips
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_MIPSFPU && HAVE_INLINE_ASM */
+#endif /* AVCODEC_MIPS_LSP_MIPS_H */
diff --git a/libavcodec/mips/mathops.h b/libavcodec/mips/mathops.h
index 573d325..bb9dc83 100644
--- a/libavcodec/mips/mathops.h
+++ b/libavcodec/mips/mathops.h
@@ -1,20 +1,21 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,58 +28,39 @@
 
 #if HAVE_INLINE_ASM
 
-#if HAVE_LOONGSON
-#if ARCH_MIPS64
+#if HAVE_LOONGSON3
 
-static inline av_const int64_t MAC64(int64_t d, int a, int b)
+#define MULH MULH
+static inline av_const int MULH(int a, int b)
 {
-    int64_t m;
-    __asm__ ("dmult %2, %3     \n\t"
-             "mflo  %1         \n\t"
-             "daddu %0, %0, %1 \n\t"
-             : "+r"(d), "=&r"(m) : "r"(a), "r"(b)
+    int c;
+    __asm__ ("dmult %1, %2      \n\t"
+             "mflo %0           \n\t"
+             "dsrl %0, %0, 32   \n\t"
+             : "=r"(c)
+             : "r"(a),"r"(b)
              : "hi", "lo");
-    return d;
+    return c;
 }
-#define MAC64(d, a, b) ((d) = MAC64(d, a, b))
 
-static inline av_const int64_t MLS64(int64_t d, int a, int b)
+#define mid_pred mid_pred
+static inline av_const int mid_pred(int a, int b, int c)
 {
-    int64_t m;
-    __asm__ ("dmult %2, %3     \n\t"
-             "mflo  %1         \n\t"
-             "dsubu %0, %0, %1 \n\t"
-             : "+r"(d), "=&r"(m) : "r"(a), "r"(b)
-             : "hi", "lo");
-    return d;
-}
-#define MLS64(d, a, b) ((d) = MLS64(d, a, b))
-
-#else
-
-static inline av_const int64_t MAC64(int64_t d, int a, int b)
-{
-    int64_t m;
-    __asm__ ("dmult.g %1, %2, %3 \n\t"
-             "daddu   %0, %0, %1 \n\t"
-             : "+r"(d), "=&r"(m) : "r"(a), "r"(b));
-    return d;
+    int t = b;
+    __asm__ ("sgt $8, %1, %2    \n\t"
+             "movn %0, %1, $8   \n\t"
+             "movn %1, %2, $8   \n\t"
+             "sgt $8, %1, %3    \n\t"
+             "movz %1, %3, $8   \n\t"
+             "sgt $8, %0, %1    \n\t"
+             "movn %0, %1, $8   \n\t"
+             : "+&r"(t),"+&r"(a)
+             : "r"(b),"r"(c)
+             : "$8");
+    return t;
 }
-#define MAC64(d, a, b) ((d) = MAC64(d, a, b))
-
-static inline av_const int64_t MLS64(int64_t d, int a, int b)
-{
-    int64_t m;
-    __asm__ ("dmult.g %1, %2, %3 \n\t"
-             "dsubu   %0, %0, %1 \n\t"
-             : "+r"(d), "=&r"(m) : "r"(a), "r"(b));
-    return d;
-}
-#define MLS64(d, a, b) ((d) = MLS64(d, a, b))
-
-#endif
 
-#endif /* HAVE_LOONGSON */
+#endif /* HAVE_LOONGSON3 */
 
 #endif /* HAVE_INLINE_ASM */
 
diff --git a/libavcodec/mips/me_cmp_init_mips.c b/libavcodec/mips/me_cmp_init_mips.c
new file mode 100644
index 0000000..219a0dc
--- /dev/null
+++ b/libavcodec/mips/me_cmp_init_mips.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "me_cmp_mips.h"
+
+#if HAVE_MSA
+static av_cold void me_cmp_msa(MECmpContext *c, AVCodecContext *avctx)
+{
+#if BIT_DEPTH == 8
+    /* Plain SAD and SSE comparators; sad[] reuses the pix_abs16/pix_abs8 routines. */
+    c->sad[0] = ff_pix_abs16_msa;
+    c->sad[1] = ff_pix_abs8_msa;
+    c->sse[0] = ff_sse16_msa;
+    c->sse[1] = ff_sse8_msa;
+    c->sse[2] = ff_sse4_msa;
+    /* pix_abs table: row 0 takes the "16" routines, row 1 the "8" ones (plain, x2, y2, xy2). */
+    c->pix_abs[0][0] = ff_pix_abs16_msa;
+    c->pix_abs[1][0] = ff_pix_abs8_msa;
+    c->pix_abs[0][1] = ff_pix_abs16_x2_msa;
+    c->pix_abs[1][1] = ff_pix_abs8_x2_msa;
+    c->pix_abs[0][2] = ff_pix_abs16_y2_msa;
+    c->pix_abs[1][2] = ff_pix_abs8_y2_msa;
+    c->pix_abs[0][3] = ff_pix_abs16_xy2_msa;
+    c->pix_abs[1][3] = ff_pix_abs8_xy2_msa;
+    /* Hadamard comparators: slots 0/1 get the diff routines, slots 4/5 the intra ones. */
+    c->hadamard8_diff[0] = ff_hadamard8_diff16_msa;
+    c->hadamard8_diff[4] = ff_hadamard8_intra16_msa;
+    c->hadamard8_diff[1] = ff_hadamard8_diff8x8_msa;
+    c->hadamard8_diff[5] = ff_hadamard8_intra8x8_msa;
+#endif
+}
+#endif  // #if HAVE_MSA
+
+av_cold void ff_me_cmp_init_mips(MECmpContext *c, AVCodecContext *avctx)
+{ /* MIPS hook for the compare-function table; a no-op unless built with HAVE_MSA. */
+#if HAVE_MSA
+    me_cmp_msa(c, avctx); /* installs the MSA function pointers into c */
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/me_cmp_mips.h b/libavcodec/mips/me_cmp_mips.h
new file mode 100644
index 0000000..e0d0f51
--- /dev/null
+++ b/libavcodec/mips/me_cmp_mips.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_ME_CMP_MIPS_H
+#define AVCODEC_MIPS_ME_CMP_MIPS_H
+
+#include "../mpegvideo.h"
+#include "libavcodec/bit_depth_template.c"
+
+int ff_hadamard8_diff8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
+                             ptrdiff_t stride, int h);
+int ff_hadamard8_intra8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
+                              ptrdiff_t stride, int h);
+int ff_hadamard8_diff16_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
+                            ptrdiff_t stride, int h);
+int ff_hadamard8_intra16_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
+                             ptrdiff_t stride, int h);
+int ff_pix_abs16_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                     ptrdiff_t stride, int h);
+int ff_pix_abs16_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                        ptrdiff_t stride, int h);
+int ff_pix_abs16_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                        ptrdiff_t stride, int h);
+int ff_pix_abs16_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                         ptrdiff_t stride, int h);
+int ff_pix_abs8_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                    ptrdiff_t stride, int h);
+int ff_pix_abs8_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                       ptrdiff_t stride, int h);
+int ff_pix_abs8_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                       ptrdiff_t stride, int h);
+int ff_pix_abs8_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                        ptrdiff_t stride, int h);
+int ff_sse16_msa(MpegEncContext *v, uint8_t *pu8Src, uint8_t *pu8Ref,
+                 ptrdiff_t stride, int i32Height);
+int ff_sse8_msa(MpegEncContext *v, uint8_t *pu8Src, uint8_t *pu8Ref,
+                ptrdiff_t stride, int i32Height);
+int ff_sse4_msa(MpegEncContext *v, uint8_t *pu8Src, uint8_t *pu8Ref,
+                ptrdiff_t stride, int i32Height);
+void ff_add_pixels8_msa(uint8_t *av_restrict pixels, int16_t *block,
+                        ptrdiff_t stride);
+
+#endif  // #ifndef AVCODEC_MIPS_ME_CMP_MIPS_H
diff --git a/libavcodec/mips/me_cmp_msa.c b/libavcodec/mips/me_cmp_msa.c
new file mode 100644
index 0000000..0e3165c
--- /dev/null
+++ b/libavcodec/mips/me_cmp_msa.c
@@ -0,0 +1,686 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "me_cmp_mips.h"
+
+static uint32_t sad_8width_msa(uint8_t *src, int32_t src_stride,
+                               uint8_t *ref, int32_t ref_stride,
+                               int32_t height)
+{
+    /* 8-wide SAD, 4 rows per pass; rows past a multiple of 4 are not visited. */
+    v8u16 acc = { 0 };
+    v16u8 s0, s1, s2, s3, r0, r1, r2, r3;
+    int32_t groups = height >> 2;
+
+    while (groups--) {
+        LD_UB4(src, src_stride, s0, s1, s2, s3);
+        LD_UB4(ref, ref_stride, r0, r1, r2, r3);
+        src += 4 * src_stride;
+        ref += 4 * ref_stride;
+        /* Merge row pairs into single vectors, then accumulate the SAD. */
+        PCKEV_D4_UB(s1, s0, s3, s2, r1, r0, r3, r2, s0, s1, r0, r1);
+        acc += SAD_UB2_UH(s0, s1, r0, r1);
+    }
+
+    return HADD_UH_U32(acc);
+}
+
+static uint32_t sad_16width_msa(uint8_t *src, int32_t src_stride,
+                                uint8_t *ref, int32_t ref_stride,
+                                int32_t height)
+{
+    /* 16-wide SAD. Every outer pass covers 4 rows as two identical 2-row
+     * steps; rows past the last multiple of 4 are not visited. */
+    v8u16 acc = { 0 };
+    v16u8 s0, s1, r0, r1;
+    int32_t groups = height >> 2;
+    int32_t step;
+
+    while (groups--) {
+        for (step = 0; step < 2; step++) {
+            LD_UB2(src, src_stride, s0, s1);
+            LD_UB2(ref, ref_stride, r0, r1);
+            src += 2 * src_stride;
+            ref += 2 * ref_stride;
+            /* Add this step's absolute differences to the running total. */
+            acc += SAD_UB2_UH(s0, s1, r0, r1);
+        }
+    }
+
+    return HADD_UH_U32(acc);
+}
+
+static uint32_t sad_horiz_bilinear_filter_8width_msa(uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *ref,
+                                                     int32_t ref_stride,
+                                                     int32_t height) /* SAD of src vs. ref blended with its 1-byte-shifted copy */
+{
+    int32_t ht_cnt; /* 8-row passes still to run */
+    v16u8 src0, src1, src2, src3, comp0, comp1;
+    v16u8 ref0, ref1, ref2, ref3, ref4, ref5;
+    v8u16 sad = { 0 }; /* per-lane 16-bit partial sums */
+
+    for (ht_cnt = (height >> 3); ht_cnt--;) { /* 8 rows per pass; a height % 8 remainder is not visited */
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+        ref += (4 * ref_stride);
+
+        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1); /* two source rows per vector */
+        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5); /* ref4/ref5 keep the unshifted reference rows */
+        SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); /* presumably slides each row by one byte (macro not shown) */
+        SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); /* ref0/ref1 now hold the shifted rows, packed the same way */
+        AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1); /* blend unshifted with shifted rows */
+        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+
+        LD_UB4(src, src_stride, src0, src1, src2, src3); /* second 4-row half: same steps as above */
+        src += (4 * src_stride);
+        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+        ref += (4 * ref_stride);
+
+        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
+        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5);
+        SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+        SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+        AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1);
+        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+    }
+
+    return (HADD_UH_U32(sad)); /* reduce the lanes of sad to one scalar */
+}
+
+static uint32_t sad_horiz_bilinear_filter_16width_msa(uint8_t *src,
+                                                      int32_t src_stride,
+                                                      uint8_t *ref,
+                                                      int32_t ref_stride,
+                                                      int32_t height)
+{
+    /*
+     * 16-wide SAD of src against a blend (AVER_UB2_UB) of the rows read at
+     * ref and at ref + 1, i.e. each reference byte with its right neighbour.
+     * An outer pass covers 8 rows as two identical 4-row steps, so rows past
+     * the last multiple of 8 are not visited.
+     */
+    v8u16 acc = { 0 };
+    v16u8 s0, s1, s2, s3, avg0, avg1;
+    v16u8 cur0, cur1, cur2, cur3, nxt0, nxt1, nxt2, nxt3;
+    int32_t blocks = height >> 3;
+    int32_t step;
+
+    while (blocks--) {
+        for (step = 0; step < 2; step++) {
+            LD_UB4(src, src_stride, s0, s1, s2, s3);
+            LD_UB4(ref, ref_stride, cur0, cur1, cur2, cur3);
+            LD_UB4(ref + 1, ref_stride, nxt0, nxt1, nxt2, nxt3);
+            src += 4 * src_stride;
+            ref += 4 * ref_stride;
+
+            /* Rows 0 and 1 of this step. */
+            AVER_UB2_UB(nxt0, cur0, nxt1, cur1, avg0, avg1);
+            acc += SAD_UB2_UH(s0, s1, avg0, avg1);
+            /* Rows 2 and 3 of this step. */
+            AVER_UB2_UB(nxt2, cur2, nxt3, cur3, avg0, avg1);
+            acc += SAD_UB2_UH(s2, s3, avg0, avg1);
+        }
+    }
+
+    return HADD_UH_U32(acc);
+}
+
+static uint32_t sad_vert_bilinear_filter_8width_msa(uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *ref,
+                                                    int32_t ref_stride,
+                                                    int32_t height) /* SAD of src vs. ref rows blended with the row that follows */
+{
+    int32_t ht_cnt; /* 8-row passes still to run */
+    v16u8 src0, src1, src2, src3, comp0, comp1;
+    v16u8 ref0, ref1, ref2, ref3, ref4;
+    v8u16 sad = { 0 }; /* per-lane 16-bit partial sums */
+
+    for (ht_cnt = (height >> 3); ht_cnt--;) { /* 8 rows per pass; a height % 8 remainder is not visited */
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4); /* five reference rows for four source rows */
+        ref += (4 * ref_stride); /* only 4 rows forward: the fifth row is re-read by the next load */
+
+        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1); /* two source rows per vector */
+        PCKEV_D2_UB(ref1, ref0, ref2, ref1, ref0, ref1); /* outputs alias inputs; relies on the macro's evaluation order (not shown) */
+        PCKEV_D2_UB(ref3, ref2, ref4, ref3, ref2, ref3);
+        AVER_UB2_UB(ref1, ref0, ref3, ref2, comp0, comp1); /* blend the packed row pairs */
+        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+
+        LD_UB4(src, src_stride, src0, src1, src2, src3); /* second 4-row half: same steps as above */
+        src += (4 * src_stride);
+        LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4);
+        ref += (4 * ref_stride);
+
+        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
+        PCKEV_D2_UB(ref1, ref0, ref2, ref1, ref0, ref1);
+        PCKEV_D2_UB(ref3, ref2, ref4, ref3, ref2, ref3);
+        AVER_UB2_UB(ref1, ref0, ref3, ref2, comp0, comp1);
+        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+    }
+
+    return (HADD_UH_U32(sad)); /* reduce the lanes of sad to one scalar */
+}
+
+static uint32_t sad_vert_bilinear_filter_16width_msa(uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *ref,
+                                                     int32_t ref_stride,
+                                                     int32_t height) /* SAD of src vs. ref rows blended with the row that follows */
+{
+    int32_t ht_cnt; /* 8-row passes still to run */
+    v16u8 src0, src1, src2, src3, comp0, comp1;
+    v16u8 ref0, ref1, ref2, ref3, ref4;
+    v8u16 sad = { 0 }; /* per-lane 16-bit partial sums */
+
+    for (ht_cnt = (height >> 3); ht_cnt--;) { /* 8 rows per pass; a height % 8 remainder is not visited */
+        LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3); /* five rows; ref4 is the first output */
+        ref += (5 * ref_stride); /* all five rows are stepped over; the last one stays available in ref3 */
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        AVER_UB2_UB(ref0, ref4, ref1, ref0, comp0, comp1); /* blend each row with the one loaded just before it */
+        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+        AVER_UB2_UB(ref2, ref1, ref3, ref2, comp0, comp1);
+        sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+
+        ref4 = ref3; /* carry the last loaded row into the second half */
+
+        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+        ref += (3 * ref_stride); /* 5 + 3 = 8 rows per pass; the fourth row loaded here is re-read by the next pass */
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        AVER_UB2_UB(ref0, ref4, ref1, ref0, comp0, comp1);
+        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+        AVER_UB2_UB(ref2, ref1, ref3, ref2, comp0, comp1);
+        sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+    }
+
+    return (HADD_UH_U32(sad)); /* reduce the lanes of sad to one scalar */
+}
+
+static uint32_t sad_hv_bilinear_filter_8width_msa(uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *ref,
+                                                  int32_t ref_stride,
+                                                  int32_t height) /* SAD of src vs. a 2x2 (x, x + 1, two rows) blend of ref */
+{
+    int32_t ht_cnt; /* 4-row passes still to run */
+    v16u8 src0, src1, src2, src3, temp0, temp1, diff;
+    v16u8 ref0, ref1, ref2, ref3, ref4;
+    v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; /* byte indices pairing x with x + 1 */
+    v8u16 comp0, comp1, comp2, comp3; /* 16-bit pair sums; reused in rotation from row to row */
+    v8u16 sad = { 0 }; /* per-lane 16-bit partial sums */
+
+    for (ht_cnt = (height >> 2); ht_cnt--;) { /* 4 rows per pass; a height % 4 remainder is not visited */
+        LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3); /* five rows; ref4 is the first output */
+        ref += (4 * ref_stride); /* only 4 rows forward: the fifth row is re-read by the next pass */
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1); /* two source rows per vector */
+
+        VSHF_B2_UB(ref4, ref4, ref0, ref0, mask, mask, temp0, temp1); /* apply mask to the first two reference rows */
+        comp0 = __msa_hadd_u_h(temp0, temp0); /* add each (x, x + 1) byte pair into a 16-bit lane */
+        comp1 = __msa_hadd_u_h(temp1, temp1);
+        comp0 += comp1; /* two rows of pair sums: four samples per lane */
+        comp0 = (v8u16) __msa_srari_h((v8i16) comp0, 2); /* rounding shift right by 2 */
+        comp0 = (v8u16) __msa_pckev_b((v16i8) comp0, (v16i8) comp0); /* narrow the lanes back to bytes */
+
+        temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref1, (v16i8) ref1);
+        comp2 = __msa_hadd_u_h(temp0, temp0);
+        comp1 += comp2; /* comp1 still holds the unshifted pair sums of ref0 */
+        comp1 = (v8u16) __msa_srari_h((v8i16) comp1, 2);
+        comp1 = (v8u16) __msa_pckev_b((v16i8) comp1, (v16i8) comp1);
+        comp1 = (v8u16) __msa_pckev_d((v2i64) comp1, (v2i64) comp0); /* join two filtered rows in one vector */
+        diff = (v16u8) __msa_asub_u_b(src0, (v16u8) comp1); /* absolute byte difference vs. the two source rows in src0 */
+        sad += __msa_hadd_u_h(diff, diff);
+
+        temp1 = (v16u8) __msa_vshf_b(mask, (v16i8) ref2, (v16i8) ref2);
+        comp3 = __msa_hadd_u_h(temp1, temp1);
+        comp2 += comp3; /* comp2 still holds the unshifted pair sums of ref1 */
+        comp2 = (v8u16) __msa_srari_h((v8i16) comp2, 2);
+        comp2 = (v8u16) __msa_pckev_b((v16i8) comp2, (v16i8) comp2);
+
+        temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref3, (v16i8) ref3);
+        comp0 = __msa_hadd_u_h(temp0, temp0);
+        comp3 += comp0; /* comp3 still holds the unshifted pair sums of ref2 */
+        comp3 = (v8u16) __msa_srari_h((v8i16) comp3, 2);
+        comp3 = (v8u16) __msa_pckev_b((v16i8) comp3, (v16i8) comp3);
+        comp3 = (v8u16) __msa_pckev_d((v2i64) comp3, (v2i64) comp2);
+        diff = (v16u8) __msa_asub_u_b(src1, (v16u8) comp3); /* same comparison for the two source rows in src1 */
+        sad += __msa_hadd_u_h(diff, diff);
+    }
+
+    return (HADD_UH_U32(sad)); /* reduce the lanes of sad to one scalar */
+}
+
+static uint32_t sad_hv_bilinear_filter_16width_msa(uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *ref,
+                                                   int32_t ref_stride,
+                                                   int32_t height) /* SAD of src vs. a 2x2 (x, x + 1, two rows) blend of ref */
+{
+    int32_t ht_cnt; /* 8-row passes still to run */
+    v16u8 src0, src1, src2, src3, comp, diff;
+    v16u8 temp0, temp1, temp2, temp3;
+    v16u8 ref00, ref01, ref02, ref03, ref04, ref10, ref11, ref12, ref13, ref14; /* ref0x: rows read at ref, ref1x: rows read at ref + 1 */
+    v8u16 comp0, comp1, comp2, comp3; /* comp0/comp1 and comp2/comp3 alternate as "previous row" pair sums */
+    v8u16 sad = { 0 }; /* per-lane 16-bit partial sums */
+
+    for (ht_cnt = (height >> 3); ht_cnt--;) { /* 8 rows per pass; a height % 8 remainder is not visited */
+        LD_UB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        LD_UB5(ref, ref_stride, ref04, ref00, ref01, ref02, ref03); /* five rows; ref04 is the first output */
+        LD_UB5(ref + 1, ref_stride, ref14, ref10, ref11, ref12, ref13); /* same rows, one byte to the right */
+        ref += (5 * ref_stride);
+
+        ILVRL_B2_UB(ref14, ref04, temp0, temp1); /* pair up the bytes of the ref and ref + 1 copies of a row */
+        comp0 = __msa_hadd_u_h(temp0, temp0); /* add each byte pair into a 16-bit lane */
+        comp1 = __msa_hadd_u_h(temp1, temp1);
+        ILVRL_B2_UB(ref10, ref00, temp2, temp3);
+        comp2 = __msa_hadd_u_h(temp2, temp2);
+        comp3 = __msa_hadd_u_h(temp3, temp3);
+        comp0 += comp2; /* two rows of pair sums: four samples per lane */
+        comp1 += comp3;
+        SRARI_H2_UH(comp0, comp1, 2); /* presumably a rounding shift right by 2, in place (macro not shown) */
+        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0); /* narrow both halves back to one byte vector */
+        diff = __msa_asub_u_b(src0, comp); /* absolute byte difference */
+        sad += __msa_hadd_u_h(diff, diff);
+
+        ILVRL_B2_UB(ref11, ref01, temp0, temp1);
+        comp0 = __msa_hadd_u_h(temp0, temp0);
+        comp1 = __msa_hadd_u_h(temp1, temp1);
+        comp2 += comp0; /* comp2/comp3 still hold the unshifted sums of the previous row */
+        comp3 += comp1;
+        SRARI_H2_UH(comp2, comp3, 2);
+        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
+        diff = __msa_asub_u_b(src1, comp);
+        sad += __msa_hadd_u_h(diff, diff);
+
+        ILVRL_B2_UB(ref12, ref02, temp2, temp3);
+        comp2 = __msa_hadd_u_h(temp2, temp2);
+        comp3 = __msa_hadd_u_h(temp3, temp3);
+        comp0 += comp2; /* comp0/comp1 still hold the unshifted sums of the previous row */
+        comp1 += comp3;
+        SRARI_H2_UH(comp0, comp1, 2);
+        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
+        diff = __msa_asub_u_b(src2, comp);
+        sad += __msa_hadd_u_h(diff, diff);
+
+        ILVRL_B2_UB(ref13, ref03, temp0, temp1);
+        comp0 = __msa_hadd_u_h(temp0, temp0);
+        comp1 = __msa_hadd_u_h(temp1, temp1);
+        comp2 += comp0;
+        comp3 += comp1;
+        SRARI_H2_UH(comp2, comp3, 2);
+        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
+        diff = __msa_asub_u_b(src3, comp);
+        sad += __msa_hadd_u_h(diff, diff);
+
+        LD_UB4(src, src_stride, src0, src1, src2, src3); /* second 4-row half */
+        src += (4 * src_stride);
+        LD_UB4(ref, ref_stride, ref00, ref01, ref02, ref03); /* only four new rows: comp0/comp1 carry the previous row's sums */
+        LD_UB4(ref + 1, ref_stride, ref10, ref11, ref12, ref13);
+        ref += (3 * ref_stride); /* 5 + 3 = 8 rows per pass; the fourth row loaded here is re-read by the next pass */
+
+        ILVRL_B2_UB(ref10, ref00, temp2, temp3);
+        comp2 = __msa_hadd_u_h(temp2, temp2);
+        comp3 = __msa_hadd_u_h(temp3, temp3);
+        comp0 += comp2;
+        comp1 += comp3;
+        SRARI_H2_UH(comp0, comp1, 2);
+        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
+        diff = __msa_asub_u_b(src0, comp);
+        sad += __msa_hadd_u_h(diff, diff);
+
+        ILVRL_B2_UB(ref11, ref01, temp0, temp1);
+        comp0 = __msa_hadd_u_h(temp0, temp0);
+        comp1 = __msa_hadd_u_h(temp1, temp1);
+        comp2 += comp0;
+        comp3 += comp1;
+        SRARI_H2_UH(comp2, comp3, 2);
+        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
+        diff = __msa_asub_u_b(src1, comp);
+        sad += __msa_hadd_u_h(diff, diff);
+
+        ILVRL_B2_UB(ref12, ref02, temp2, temp3);
+        comp2 = __msa_hadd_u_h(temp2, temp2);
+        comp3 = __msa_hadd_u_h(temp3, temp3);
+        comp0 += comp2;
+        comp1 += comp3;
+        SRARI_H2_UH(comp0, comp1, 2);
+        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
+        diff = __msa_asub_u_b(src2, comp);
+        sad += __msa_hadd_u_h(diff, diff);
+
+        ILVRL_B2_UB(ref13, ref03, temp0, temp1);
+        comp0 = __msa_hadd_u_h(temp0, temp0);
+        comp1 = __msa_hadd_u_h(temp1, temp1);
+        comp2 += comp0;
+        comp3 += comp1;
+        SRARI_H2_UH(comp2, comp3, 2);
+        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
+        diff = __msa_asub_u_b(src3, comp);
+        sad += __msa_hadd_u_h(diff, diff);
+    }
+
+    return (HADD_UH_U32(sad)); /* reduce the lanes of sad to one scalar */
+}
+
+#define CALC_MSE_B(src, ref, var)                                    \
+{                                                                    \
+    v16u8 src_l0_m, src_l1_m;                                        \
+    v8i16 res_l0_m, res_l1_m;                                        \
+                                                                     \
+    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                       \
+    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);             \
+    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var);  \
+}
+
+static uint32_t sse_4width_msa(uint8_t *src_ptr, int32_t src_stride,
+                               uint8_t *ref_ptr, int32_t ref_stride,
+                               int32_t height)
+{
+    /* 4-wide squared-error sum, 4 rows per pass. Each row is fetched as one
+     * 32-bit word and the four words of a pass are placed in a single vector.
+     * Rows past the last multiple of 4 are not visited. */
+    v4i32 sq_acc = { 0 };
+    v16u8 src_vec = { 0 };
+    v16u8 ref_vec = { 0 };
+    uint32_t sw0, sw1, sw2, sw3;
+    uint32_t rw0, rw1, rw2, rw3;
+    int32_t groups = height >> 2;
+
+    while (groups--) {
+        LW4(src_ptr, src_stride, sw0, sw1, sw2, sw3);
+        LW4(ref_ptr, ref_stride, rw0, rw1, rw2, rw3);
+        src_ptr += 4 * src_stride;
+        ref_ptr += 4 * ref_stride;
+
+        INSERT_W4_UB(sw0, sw1, sw2, sw3, src_vec);
+        INSERT_W4_UB(rw0, rw1, rw2, rw3, ref_vec);
+        CALC_MSE_B(src_vec, ref_vec, sq_acc);
+    }
+
+    return HADD_SW_S32(sq_acc);
+}
+
+static uint32_t sse_8width_msa(uint8_t *src_ptr, int32_t src_stride,
+                               uint8_t *ref_ptr, int32_t ref_stride,
+                               int32_t height)
+{
+    /* 8-wide squared-error sum, 4 rows per pass. Row pairs are merged into
+     * full vectors before CALC_MSE_B folds them into the accumulator.
+     * Rows past the last multiple of 4 are not visited. */
+    v4i32 sq_acc = { 0 };
+    v16u8 s0, s1, s2, s3;
+    v16u8 r0, r1, r2, r3;
+    int32_t groups = height >> 2;
+
+    while (groups--) {
+        LD_UB4(src_ptr, src_stride, s0, s1, s2, s3);
+        LD_UB4(ref_ptr, ref_stride, r0, r1, r2, r3);
+        src_ptr += 4 * src_stride;
+        ref_ptr += 4 * ref_stride;
+
+        PCKEV_D4_UB(s1, s0, s3, s2, r1, r0, r3, r2,
+                    s0, s1, r0, r1);
+        CALC_MSE_B(s0, r0, sq_acc);
+        CALC_MSE_B(s1, r1, sq_acc);
+    }
+
+    return HADD_SW_S32(sq_acc);
+}
+
+static uint32_t sse_16width_msa(uint8_t *src_ptr, int32_t src_stride,
+                                uint8_t *ref_ptr, int32_t ref_stride,
+                                int32_t height)
+{
+    /* 16-wide squared-error sum. The loop body is unrolled by hand: one row
+     * per step, 4 steps per pass, so rows past the last multiple of 4 are
+     * not visited. */
+    v4i32 sq_acc = { 0 };
+    v16u8 src_row, ref_row;
+    int32_t groups = height >> 2;
+
+    while (groups--) {
+        src_row = LD_UB(src_ptr);
+        ref_row = LD_UB(ref_ptr);
+        src_ptr += src_stride;
+        ref_ptr += ref_stride;
+        CALC_MSE_B(src_row, ref_row, sq_acc);
+
+        src_row = LD_UB(src_ptr);
+        ref_row = LD_UB(ref_ptr);
+        src_ptr += src_stride;
+        ref_ptr += ref_stride;
+        CALC_MSE_B(src_row, ref_row, sq_acc);
+
+        src_row = LD_UB(src_ptr);
+        ref_row = LD_UB(ref_ptr);
+        src_ptr += src_stride;
+        ref_ptr += ref_stride;
+        CALC_MSE_B(src_row, ref_row, sq_acc);
+
+        src_row = LD_UB(src_ptr);
+        ref_row = LD_UB(ref_ptr);
+        src_ptr += src_stride;
+        ref_ptr += ref_stride;
+        CALC_MSE_B(src_row, ref_row, sq_acc);
+    }
+
+    return HADD_SW_S32(sq_acc);
+}
+
+static int32_t hadamard_diff_8x8_msa(uint8_t *src, int32_t src_stride,
+                                     uint8_t *ref, int32_t ref_stride) /* sum of absolute transformed src/ref differences over 8 rows */
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+    v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+    v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    v8i16 sum = { 0 }; /* overwritten by the first __msa_asub_s_h result below */
+    v8i16 zero = { 0 }; /* second operand for __msa_add_a_h, so only the absolute value of the first is added */
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+    ILVR_B8_UH(src0, ref0, src1, ref1, src2, ref2, src3, ref3,
+               src4, ref4, src5, ref5, src6, ref6, src7, ref7,
+               diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7); /* pair up src and ref bytes, row by row */
+    HSUB_UB4_UH(diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3); /* presumably turns each byte pair into a 16-bit difference (macro not shown) */
+    HSUB_UB4_UH(diff4, diff5, diff6, diff7, diff4, diff5, diff6, diff7);
+    TRANSPOSE8x8_UH_UH(diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7,
+                       diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
+    BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
+                temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1); /* first of three butterfly stages before the second transpose */
+    BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
+                diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
+    BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
+                temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
+    TRANSPOSE8x8_UH_UH(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7,
+                       temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
+    BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
+                diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1); /* only two explicit stages here; the last one is folded into the sums below */
+    BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
+                temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
+    ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7,
+         diff0, diff1, diff2, diff3); /* presumably diffN = tempN + temp(N + 4): the "sum" half of the last stage */
+    sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7); /* absolute difference: the other half of the last stage */
+    sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
+    sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
+    sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
+    sum += __msa_add_a_h((v8i16) diff0, zero); /* adds the absolute value of each diff0 lane */
+    sum += __msa_add_a_h((v8i16) diff1, zero);
+    sum += __msa_add_a_h((v8i16) diff2, zero);
+    sum += __msa_add_a_h((v8i16) diff3, zero);
+
+    return (HADD_UH_U32(sum)); /* reduce the lanes of sum to one scalar */
+}
+
+static int32_t hadamard_intra_8x8_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *ref, int32_t ref_stride) /* ref and ref_stride are not used in this function */
+{
+    int32_t sum_res = 0;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+    v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    v8i16 sum = { 0 }; /* overwritten by the first __msa_asub_s_h result below */
+    v16i8 zero = { 0 }; /* used both as interleave partner and as the __msa_add_a_h operand */
+
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    TRANSPOSE8x8_UB_UB(src0, src1, src2, src3, src4, src5, src6, src7,
+                       src0, src1, src2, src3, src4, src5, src6, src7);
+    ILVR_B8_UH(zero, src0, zero, src1, zero, src2, zero, src3,
+               zero, src4, zero, src5, zero, src6, zero, src7,
+               diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7); /* presumably widens the source bytes to 16 bits by pairing them with zero */
+    BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
+                temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1); /* first of three butterfly stages before the transpose */
+    BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
+                diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
+    BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
+                temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
+    TRANSPOSE8x8_UH_UH(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7,
+                       temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
+    BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
+                diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1); /* only two explicit stages here; the last one is folded into the sums below */
+    BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
+                temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
+    ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7,
+         diff0, diff1, diff2, diff3); /* presumably diffN = tempN + temp(N + 4): the "sum" half of the last stage */
+    sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7); /* absolute difference: the other half of the last stage */
+    sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
+    sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
+    sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
+    sum += __msa_add_a_h((v8i16) diff0, (v8i16) zero); /* adds the absolute value of each diff0 lane */
+    sum += __msa_add_a_h((v8i16) diff1, (v8i16) zero);
+    sum += __msa_add_a_h((v8i16) diff2, (v8i16) zero);
+    sum += __msa_add_a_h((v8i16) diff3, (v8i16) zero);
+    sum_res = (HADD_UH_U32(sum)); /* reduce the lanes of sum to one scalar */
+    sum_res -= abs(temp0[0] + temp4[0]); /* take lane 0 of temp0 + temp4 back out of the total */
+
+    return sum_res;
+}
+
+int ff_pix_abs16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
+                     ptrdiff_t stride, int height)
+{
+    return sad_16width_msa(src, stride, ref, stride, height); /* v is unused; one stride serves both buffers */
+}
+
+int ff_pix_abs8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
+                    ptrdiff_t stride, int height)
+{
+    return sad_8width_msa(src, stride, ref, stride, height); /* v is unused; one stride serves both buffers */
+}
+
+int ff_pix_abs16_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                        ptrdiff_t stride, int h)
+{
+    return sad_horiz_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h); /* v is unused; pix1 is the source, pix2 the filtered reference */
+}
+
+int ff_pix_abs16_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                        ptrdiff_t stride, int h)
+{
+    return sad_vert_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h); /* v is unused; pix1 is the source, pix2 the filtered reference */
+}
+
+int ff_pix_abs16_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                         ptrdiff_t stride, int h)
+{
+    return sad_hv_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h); /* v is unused; pix1 is the source, pix2 the filtered reference */
+}
+
+int ff_pix_abs8_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                       ptrdiff_t stride, int h)
+{
+    return sad_horiz_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h); /* v is unused; pix1 is the source, pix2 the filtered reference */
+}
+
+int ff_pix_abs8_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                       ptrdiff_t stride, int h)
+{
+    return sad_vert_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h); /* v is unused; pix1 is the source, pix2 the filtered reference */
+}
+
+int ff_pix_abs8_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                        ptrdiff_t stride, int h)
+{
+    return sad_hv_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h); /* v is unused; pix1 is the source, pix2 the filtered reference */
+}
+
+int ff_sse16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
+                 ptrdiff_t stride, int height)
+{
+    return sse_16width_msa(src, stride, ref, stride, height); /* v is unused; one stride serves both buffers */
+}
+
+int ff_sse8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
+                ptrdiff_t stride, int height)
+{
+    return sse_8width_msa(src, stride, ref, stride, height); /* v is unused; one stride serves both buffers */
+}
+
+int ff_sse4_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
+                ptrdiff_t stride, int height)
+{
+    return sse_4width_msa(src, stride, ref, stride, height); /* v is unused; one stride serves both buffers */
+}
+
+int ff_hadamard8_diff8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
+                             ptrdiff_t stride, int h)
+{
+    return hadamard_diff_8x8_msa(src, stride, dst, stride); /* s and h are unused; src is passed first, dst takes the ref slot */
+}
+
+int ff_hadamard8_intra8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
+                              ptrdiff_t stride, int h)
+{
+    return hadamard_intra_8x8_msa(src, stride, dst, stride); /* s and h are unused; the callee reads only its first buffer (src) */
+}
+
+/* Hadamard: name16 sums name8 over two (four if h == 16) 8x8 blocks; invoke without ';'. */
+#define WRAPPER8_16_SQ(name8, name16)                      \
+int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,  \
+           ptrdiff_t stride, int h)                        \
+{                                                          \
+    int score = 0; /* running total of 8x8 scores */       \
+    score += name8(s, dst, src, stride, 8);                \
+    score += name8(s, dst + 8, src + 8, stride, 8);        \
+    if (h == 16) { /* second row of 8x8 blocks */          \
+        dst += 8 * stride;                                 \
+        src += 8 * stride;                                 \
+        score += name8(s, dst, src, stride, 8);            \
+        score += name8(s, dst + 8, src + 8, stride, 8);    \
+    }                                                      \
+    return score;                                          \
+}
+
+WRAPPER8_16_SQ(ff_hadamard8_diff8x8_msa, ff_hadamard8_diff16_msa)
+WRAPPER8_16_SQ(ff_hadamard8_intra8x8_msa, ff_hadamard8_intra16_msa)
diff --git a/libavcodec/mips/mpegaudiodsp_mips_fixed.c b/libavcodec/mips/mpegaudiodsp_mips_fixed.c
new file mode 100644
index 0000000..ed8c890
--- /dev/null
+++ b/libavcodec/mips/mpegaudiodsp_mips_fixed.c
@@ -0,0 +1,918 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Bojan Zivkovic (bojan@mips.com)
+ *
+ * MPEG Audio decoder optimized for MIPS fixed-point architecture
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/mpegaudiodsp_template.c
+ */
+
+#include <string.h>
+
+#include "libavutil/mips/asmdefs.h"
+#include "libavcodec/mpegaudiodsp.h"
+
+#if HAVE_INLINE_ASM
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+
+/**
+ * Fixed-point synthesis windowing implemented with MIPS accumulator
+ * instructions ($ac0 / $ac1, madd / msub, extr.w).
+ *
+ * Writes 32 output samples:
+ *  - one sample at samples[0];
+ *  - 15 loop iterations, each storing one sample while walking forward
+ *    from samples + incr and one while walking backward from
+ *    samples + 31 * incr;
+ *  - one final sample at samples + 16 * incr.
+ *
+ * Each sample is the accumulator extracted with a right shift of 24 bits
+ * and clamped to [-32768, 32767]. The low 24 bits of the accumulator are
+ * kept in temp1 and seed the low word of the next accumulation.
+ *
+ * @param synth_buf    input values; the first 32 entries are copied to
+ *                     synth_buf + 512 before use, so that area is written
+ * @param window       window coefficients multiplied against synth_buf
+ * @param dither_state in: initial low word of the accumulator;
+ *                     out: low 24 bits left after the last sample
+ * @param samples      output buffer of 16-bit samples
+ * @param incr         distance, in int16_t elements, between two
+ *                     consecutive output samples
+ */
+static void ff_mpadsp_apply_window_mips_fixed(int32_t *synth_buf, int32_t *window,
+                               int *dither_state, int16_t *samples, int incr)
+{
+    register const int32_t *w, *w2, *p;
+    int j;
+    int16_t *samples2;
+    int w_asm, p_asm, w_asm1, p_asm1, w_asm2, p_asm2;
+    int w2_asm, w2_asm1, *p_temp1, *p_temp2;
+    int sum1 = 0;
+    int const min_asm = -32768, max_asm = 32767;
+    int temp1, temp2 = 0, temp3 = 0;
+    int64_t sum;
+
+    /* copy to avoid wrap */
+    memcpy(synth_buf + 512, synth_buf, 32 * sizeof(*synth_buf));
+    samples2 = samples + 31 * incr;
+    w = window;
+    w2 = window + 31;
+    sum = *dither_state;
+    p = synth_buf + 16;
+    /* p_temp1 walks forward and p_temp2 walks backward inside the loop below */
+    p_temp1 = synth_buf + 16;
+    p_temp2 = synth_buf + 48;
+    /* temp1 carries the low accumulator word from one asm block to the next */
+    temp1 = sum;
+
+    /**
+    * use of round_sample function from the original code is eliminated,
+    * changed with appropriate assembly instructions.
+    */
+    /* First sample: eight products are added (madd) and eight are subtracted
+     * (msub) in $ac0, which is seeded with hi = 0, lo = temp1. */
+    __asm__ volatile (
+         "mthi   $zero                                                    \n\t"
+         "mtlo   %[temp1]                                                 \n\t"
+         "lw     %[w_asm],  0(%[w])                                       \n\t"
+         "lw     %[p_asm],  0(%[p])                                       \n\t"
+         "lw     %[w_asm1], 64*4(%[w])                                    \n\t"
+         "lw     %[p_asm1], 64*4(%[p])                                    \n\t"
+         "lw     %[w_asm2], 128*4(%[w])                                   \n\t"
+         "lw     %[p_asm2], 128*4(%[p])                                   \n\t"
+         "madd   %[w_asm],  %[p_asm]                                      \n\t"
+         "madd   %[w_asm1], %[p_asm1]                                     \n\t"
+         "madd   %[w_asm2], %[p_asm2]                                     \n\t"
+         "lw     %[w_asm],  192*4(%[w])                                   \n\t"
+         "lw     %[p_asm],  192*4(%[p])                                   \n\t"
+         "lw     %[w_asm1], 256*4(%[w])                                   \n\t"
+         "lw     %[p_asm1], 256*4(%[p])                                   \n\t"
+         "lw     %[w_asm2], 320*4(%[w])                                   \n\t"
+         "lw     %[p_asm2], 320*4(%[p])                                   \n\t"
+         "madd   %[w_asm],  %[p_asm]                                      \n\t"
+         "madd   %[w_asm1], %[p_asm1]                                     \n\t"
+         "madd   %[w_asm2], %[p_asm2]                                     \n\t"
+         "lw     %[w_asm],  384*4(%[w])                                   \n\t"
+         "lw     %[p_asm],  384*4(%[p])                                   \n\t"
+         "lw     %[w_asm1], 448*4(%[w])                                   \n\t"
+         "lw     %[p_asm1], 448*4(%[p])                                   \n\t"
+         "lw     %[w_asm2], 32*4(%[w])                                    \n\t"
+         "lw     %[p_asm2], 32*4(%[p])                                    \n\t"
+         "madd   %[w_asm],  %[p_asm]                                      \n\t"
+         "madd   %[w_asm1], %[p_asm1]                                     \n\t"
+         "msub   %[w_asm2], %[p_asm2]                                     \n\t"
+         "lw     %[w_asm],  96*4(%[w])                                    \n\t"
+         "lw     %[p_asm],  96*4(%[p])                                    \n\t"
+         "lw     %[w_asm1], 160*4(%[w])                                   \n\t"
+         "lw     %[p_asm1], 160*4(%[p])                                   \n\t"
+         "lw     %[w_asm2], 224*4(%[w])                                   \n\t"
+         "lw     %[p_asm2], 224*4(%[p])                                   \n\t"
+         "msub   %[w_asm],  %[p_asm]                                      \n\t"
+         "msub   %[w_asm1], %[p_asm1]                                     \n\t"
+         "msub   %[w_asm2], %[p_asm2]                                     \n\t"
+         "lw     %[w_asm],  288*4(%[w])                                   \n\t"
+         "lw     %[p_asm],  288*4(%[p])                                   \n\t"
+         "lw     %[w_asm1], 352*4(%[w])                                   \n\t"
+         "lw     %[p_asm1], 352*4(%[p])                                   \n\t"
+         "msub   %[w_asm],  %[p_asm]                                      \n\t"
+         "lw     %[w_asm],  480*4(%[w])                                   \n\t"
+         "lw     %[p_asm],  480*4(%[p])                                   \n\t"
+         "lw     %[w_asm2], 416*4(%[w])                                   \n\t"
+         "lw     %[p_asm2], 416*4(%[p])                                   \n\t"
+         "msub   %[w_asm],  %[p_asm]                                      \n\t"
+         "msub   %[w_asm1], %[p_asm1]                                     \n\t"
+         "msub   %[w_asm2], %[p_asm2]                                     \n\t"
+
+         /*round_sample function from the original code is eliminated,
+          * changed with appropriate assembly instructions
+          * code example:
+
+         "extr.w  %[sum1],$ac0,24                                       \n\t"
+         "mflo %[temp3],  $ac0                                          \n\t"
+         "and  %[temp1],  %[temp3],  0x00ffffff                         \n\t"
+         "slt  %[temp2],  %[sum1],   %[min_asm]                         \n\t"
+         "movn %[sum1],   %[min_asm],%[temp2]                           \n\t"
+         "slt  %[temp2],  %[max_asm],%[sum1]                            \n\t"
+         "movn %[sum1],   %[max_asm],%[temp2]                           \n\t"
+         "sh   %[sum1],   0(%[samples])                                 \n\t"
+         */
+
+         "extr.w %[sum1],   $ac0,       24                                \n\t"
+         "mflo   %[temp3]                                                 \n\t"
+         PTR_ADDIU "%[w],   %[w],       4                                 \n\t"
+         "and    %[temp1],  %[temp3],   0x00ffffff                        \n\t"
+         "slt    %[temp2],  %[sum1],    %[min_asm]                        \n\t"
+         "movn   %[sum1],   %[min_asm], %[temp2]                          \n\t"
+         "slt    %[temp2],  %[max_asm], %[sum1]                           \n\t"
+         "movn   %[sum1],   %[max_asm], %[temp2]                          \n\t"
+         "sh     %[sum1],   0(%[samples])                                 \n\t"
+
+        : [w_asm] "=&r" (w_asm), [p_asm] "=&r" (p_asm), [w_asm1] "=&r" (w_asm1),
+          [p_asm1] "=&r" (p_asm1), [temp1] "+r" (temp1), [temp2] "+r" (temp2),
+          [w_asm2] "=&r" (w_asm2), [p_asm2] "=&r" (p_asm2),
+          [sum1] "+r" (sum1), [w] "+r" (w), [temp3] "+r" (temp3)
+        : [p] "r" (p), [samples] "r" (samples), [min_asm] "r" (min_asm),
+          [max_asm] "r" (max_asm)
+        : "memory", "hi","lo"
+     );
+
+     samples += incr;
+
+    /* we calculate two samples at the same time to avoid one memory
+       access per two samples */
+
+    /* $ac0 accumulates the sample stored through samples (using w), while
+     * $ac1 accumulates the one stored through samples2 (using w2); both
+     * share the same synth_buf loads. */
+    for(j = 1; j < 16; j++) {
+        __asm__ volatile (
+             "mthi   $0,         $ac1                                      \n\t"
+             "mtlo   $0,         $ac1                                      \n\t"
+             "mthi   $0                                                    \n\t"
+             "mtlo   %[temp1]                                              \n\t"
+             PTR_ADDIU "%[p_temp1], %[p_temp1],    4                       \n\t"
+             "lw     %[w_asm],   0(%[w])                                   \n\t"
+             "lw     %[p_asm],   0(%[p_temp1])                             \n\t"
+             "lw     %[w2_asm],  0(%[w2])                                  \n\t"
+             "lw     %[w_asm1],  64*4(%[w])                                \n\t"
+             "lw     %[p_asm1],  64*4(%[p_temp1])                          \n\t"
+             "lw     %[w2_asm1], 64*4(%[w2])                               \n\t"
+             "madd   %[w_asm],   %[p_asm]                                  \n\t"
+             "msub   $ac1,       %[w2_asm],        %[p_asm]                \n\t"
+             "madd   %[w_asm1],  %[p_asm1]                                 \n\t"
+             "msub   $ac1,       %[w2_asm1],       %[p_asm1]               \n\t"
+             "lw     %[w_asm],   128*4(%[w])                               \n\t"
+             "lw     %[p_asm],   128*4(%[p_temp1])                         \n\t"
+             "lw     %[w2_asm],  128*4(%[w2])                              \n\t"
+             "lw     %[w_asm1],  192*4(%[w])                               \n\t"
+             "lw     %[p_asm1],  192*4(%[p_temp1])                         \n\t"
+             "lw     %[w2_asm1], 192*4(%[w2])                              \n\t"
+             "madd   %[w_asm],   %[p_asm]                                  \n\t"
+             "msub   $ac1,       %[w2_asm],        %[p_asm]                \n\t"
+             "madd   %[w_asm1],  %[p_asm1]                                 \n\t"
+             "msub   $ac1,       %[w2_asm1],       %[p_asm1]               \n\t"
+             "lw     %[w_asm],   256*4(%[w])                               \n\t"
+             "lw     %[p_asm],   256*4(%[p_temp1])                         \n\t"
+             "lw     %[w2_asm],  256*4(%[w2])                              \n\t"
+             "lw     %[w_asm1],  320*4(%[w])                               \n\t"
+             "lw     %[p_asm1],  320*4(%[p_temp1])                         \n\t"
+             "lw     %[w2_asm1], 320*4(%[w2])                              \n\t"
+             "madd   %[w_asm],   %[p_asm]                                  \n\t"
+             "msub   $ac1,       %[w2_asm],        %[p_asm]                \n\t"
+             "madd   %[w_asm1],  %[p_asm1]                                 \n\t"
+             "msub   $ac1,       %[w2_asm1],       %[p_asm1]               \n\t"
+             "lw     %[w_asm],   384*4(%[w])                               \n\t"
+             "lw     %[p_asm],   384*4(%[p_temp1])                         \n\t"
+             "lw     %[w2_asm],  384*4(%[w2])                              \n\t"
+             "lw     %[w_asm1],  448*4(%[w])                               \n\t"
+             "lw     %[p_asm1],  448*4(%[p_temp1])                         \n\t"
+             "lw     %[w2_asm1], 448*4(%[w2])                              \n\t"
+             "madd   %[w_asm],   %[p_asm]                                  \n\t"
+             "msub   $ac1,       %[w2_asm],        %[p_asm]                \n\t"
+             "madd   %[w_asm1],  %[p_asm1]                                 \n\t"
+             "msub   $ac1,       %[w2_asm1],       %[p_asm1]               \n\t"
+             PTR_ADDIU "%[p_temp2], %[p_temp2],   -4                      \n\t"
+             "lw     %[w_asm],   32*4(%[w])                                \n\t"
+             "lw     %[p_asm],   0(%[p_temp2])                             \n\t"
+             "lw     %[w2_asm],  32*4(%[w2])                               \n\t"
+             "lw     %[w_asm1],  96*4(%[w])                                \n\t"
+             "lw     %[p_asm1],  64*4(%[p_temp2])                          \n\t"
+             "lw     %[w2_asm1], 96*4(%[w2])                               \n\t"
+             "msub   %[w_asm],   %[p_asm]                                  \n\t"
+             "msub   $ac1,       %[w2_asm],        %[p_asm]                \n\t"
+             "msub   %[w_asm1],  %[p_asm1]                                 \n\t"
+             "msub   $ac1,       %[w2_asm1],       %[p_asm1]               \n\t"
+             "lw     %[w_asm],   160*4(%[w])                               \n\t"
+             "lw     %[p_asm],   128*4(%[p_temp2])                         \n\t"
+             "lw     %[w2_asm],  160*4(%[w2])                              \n\t"
+             "lw     %[w_asm1],  224*4(%[w])                               \n\t"
+             "lw     %[p_asm1],  192*4(%[p_temp2])                         \n\t"
+             "lw     %[w2_asm1], 224*4(%[w2])                              \n\t"
+             "msub   %[w_asm],   %[p_asm]                                  \n\t"
+             "msub   $ac1,       %[w2_asm],        %[p_asm]                \n\t"
+             "msub   %[w_asm1],  %[p_asm1]                                 \n\t"
+             "msub   $ac1,       %[w2_asm1],       %[p_asm1]               \n\t"
+             "lw     %[w_asm],   288*4(%[w])                               \n\t"
+             "lw     %[p_asm],   256*4(%[p_temp2])                         \n\t"
+             "lw     %[w2_asm],  288*4(%[w2])                              \n\t"
+             "lw     %[w_asm1],  352*4(%[w])                               \n\t"
+             "lw     %[p_asm1],  320*4(%[p_temp2])                         \n\t"
+             "lw     %[w2_asm1], 352*4(%[w2])                              \n\t"
+             "msub   %[w_asm],   %[p_asm]                                  \n\t"
+             "msub   $ac1,       %[w2_asm],        %[p_asm]                \n\t"
+             "msub   %[w_asm1],  %[p_asm1]                                 \n\t"
+             "msub   $ac1,       %[w2_asm1],       %[p_asm1]               \n\t"
+             "lw     %[w_asm],   416*4(%[w])                               \n\t"
+             "lw     %[p_asm],   384*4(%[p_temp2])                         \n\t"
+             "lw     %[w2_asm],  416*4(%[w2])                              \n\t"
+             "lw     %[w_asm1],  480*4(%[w])                               \n\t"
+             "lw     %[p_asm1],  448*4(%[p_temp2])                         \n\t"
+             "lw     %[w2_asm1], 480*4(%[w2])                              \n\t"
+             "msub   %[w_asm],   %[p_asm]                                  \n\t"
+             "msub   %[w_asm1],  %[p_asm1]                                 \n\t"
+             "msub   $ac1,       %[w2_asm],        %[p_asm]                \n\t"
+             "msub   $ac1,       %[w2_asm1],       %[p_asm1]               \n\t"
+             PTR_ADDIU "%[w],    %[w],             4                       \n\t"
+             PTR_ADDIU "%[w2],   %[w2],            -4                      \n\t"
+             "mflo   %[temp2]                                              \n\t"
+             "extr.w %[sum1],    $ac0,             24                      \n\t"
+             "li     %[temp3],   1                                         \n\t"
+             "and    %[temp1],   %[temp2],         0x00ffffff              \n\t"
+             /* multiply by 1 to add the low 24 bits left in $ac0 to $ac1 */
+             "madd   $ac1,       %[temp1],         %[temp3]                \n\t"
+             "slt    %[temp2],   %[sum1],          %[min_asm]              \n\t"
+             "movn   %[sum1],    %[min_asm],       %[temp2]                \n\t"
+             "slt    %[temp2],   %[max_asm],       %[sum1]                 \n\t"
+             "movn   %[sum1],    %[max_asm],       %[temp2]                \n\t"
+             "sh     %[sum1],    0(%[samples])                             \n\t"
+             "mflo   %[temp3],   $ac1                                      \n\t"
+             "extr.w %[sum1],    $ac1,             24                      \n\t"
+             "and    %[temp1],   %[temp3],         0x00ffffff              \n\t"
+             "slt    %[temp2],   %[sum1],          %[min_asm]              \n\t"
+             "movn   %[sum1],    %[min_asm],       %[temp2]                \n\t"
+             "slt    %[temp2],   %[max_asm],       %[sum1]                 \n\t"
+             "movn   %[sum1],    %[max_asm],       %[temp2]                \n\t"
+             "sh     %[sum1],    0(%[samples2])                            \n\t"
+
+            : [w_asm] "=&r" (w_asm), [p_asm] "=&r" (p_asm), [w_asm1] "=&r" (w_asm1),
+              [p_asm1] "=&r" (p_asm1), [w2_asm1] "=&r" (w2_asm1),
+              [w2_asm] "=&r" (w2_asm), [temp1] "+r" (temp1), [temp2] "+r" (temp2),
+              [p_temp1] "+r" (p_temp1), [p_temp2] "+r" (p_temp2), [sum1] "+r" (sum1),
+              [w] "+r" (w), [w2] "+r" (w2), [samples] "+r" (samples),
+              [samples2] "+r" (samples2), [temp3] "+r" (temp3)
+            : [min_asm] "r" (min_asm), [max_asm] "r" (max_asm)
+            : "memory", "hi", "lo", "$ac1hi", "$ac1lo"
+        );
+
+        samples += incr;
+        samples2 -= incr;
+    }
+
+    /* Last sample: only subtracted products (w + 32 .. w + 480 against
+     * synth_buf + 32 onwards) contribute. */
+    p = synth_buf + 32;
+
+    __asm__ volatile (
+        "mthi   $0                                                        \n\t"
+        "mtlo   %[temp1]                                                  \n\t"
+        "lw     %[w_asm],  32*4(%[w])                                     \n\t"
+        "lw     %[p_asm],  0(%[p])                                        \n\t"
+        "lw     %[w_asm1], 96*4(%[w])                                     \n\t"
+        "lw     %[p_asm1], 64*4(%[p])                                     \n\t"
+        "lw     %[w_asm2], 160*4(%[w])                                    \n\t"
+        "lw     %[p_asm2], 128*4(%[p])                                    \n\t"
+        "msub   %[w_asm],  %[p_asm]                                       \n\t"
+        "msub   %[w_asm1], %[p_asm1]                                      \n\t"
+        "msub   %[w_asm2], %[p_asm2]                                      \n\t"
+        "lw     %[w_asm],  224*4(%[w])                                    \n\t"
+        "lw     %[p_asm],  192*4(%[p])                                    \n\t"
+        "lw     %[w_asm1], 288*4(%[w])                                    \n\t"
+        "lw     %[p_asm1], 256*4(%[p])                                    \n\t"
+        "lw     %[w_asm2], 352*4(%[w])                                    \n\t"
+        "lw     %[p_asm2], 320*4(%[p])                                    \n\t"
+        "msub   %[w_asm],  %[p_asm]                                       \n\t"
+        "msub   %[w_asm1], %[p_asm1]                                      \n\t"
+        "msub   %[w_asm2], %[p_asm2]                                      \n\t"
+        "lw     %[w_asm],  416*4(%[w])                                    \n\t"
+        "lw     %[p_asm],  384*4(%[p])                                    \n\t"
+        "lw     %[w_asm1], 480*4(%[w])                                    \n\t"
+        "lw     %[p_asm1], 448*4(%[p])                                    \n\t"
+        "msub   %[w_asm],  %[p_asm]                                       \n\t"
+        "msub   %[w_asm1], %[p_asm1]                                      \n\t"
+        "extr.w %[sum1],   $ac0,       24                                 \n\t"
+        "mflo   %[temp2]                                                  \n\t"
+        "and    %[temp1],  %[temp2],   0x00ffffff                         \n\t"
+        "slt    %[temp2],  %[sum1],    %[min_asm]                         \n\t"
+        "movn   %[sum1],   %[min_asm], %[temp2]                           \n\t"
+        "slt    %[temp2],  %[max_asm], %[sum1]                            \n\t"
+        "movn   %[sum1],   %[max_asm], %[temp2]                           \n\t"
+        "sh     %[sum1],   0(%[samples])                                  \n\t"
+
+        : [w_asm] "=&r" (w_asm), [p_asm] "=&r" (p_asm), [w_asm1] "=&r" (w_asm1),
+          [p_asm1] "=&r" (p_asm1), [temp1] "+r" (temp1), [temp2] "+r" (temp2),
+          [w_asm2] "=&r" (w_asm2), [p_asm2] "=&r" (p_asm2), [sum1] "+r" (sum1)
+        : [w] "r" (w), [p] "r" (p), [samples] "r" (samples), [min_asm] "r" (min_asm),
+          [max_asm] "r" (max_asm)
+        : "memory", "hi", "lo", "$ac1hi", "$ac1lo"
+     );
+
+    /* hand the remaining low 24 bits back to the caller */
+    *dither_state= temp1;
+}
+
+static void imdct36_mips_fixed(int *out, int *buf, int *in, int *win)
+{
+    int j;
+    int t0, t1, t2, t3, s0, s1, s2, s3;
+    int tmp[18], *tmp1, *in1;
+    /* temporary variables */
+    int temp_reg1, temp_reg2, temp_reg3, temp_reg4, temp_reg5, temp_reg6;
+    int t4, t5, t6, t8, t7;
+
+   /* values defined in macros and tables are
+    * eliminated - they are directly loaded in appropriate variables
+    */
+    int const C_1  =  4229717092; /* cos(pi*1/18)*2  */
+    int const C_2  =  4035949074; /* cos(pi*2/18)*2  */
+    int const C_3  =  575416510;  /* -cos(pi*3/18)*2 */
+    int const C_3A =  3719550786; /* cos(pi*3/18)*2  */
+    int const C_4  =  1004831466; /* -cos(pi*4/18)*2 */
+    int const C_5  =  1534215534; /* -cos(pi*5/18)*2 */
+    int const C_7  = -1468965330; /* -cos(pi*7/18)*2 */
+    int const C_8  = -745813244;  /* -cos(pi*8/18)*2 */
+
+   /*
+    * instructions of the first two loops are reorganized and loops are unrolled,
+    * in order to eliminate unnecessary readings and writings in array
+    */
+
+    __asm__ volatile (
+        "lw   %[t1], 17*4(%[in])                                         \n\t"
+        "lw   %[t2], 16*4(%[in])                                         \n\t"
+        "lw   %[t3], 15*4(%[in])                                         \n\t"
+        "lw   %[t4], 14*4(%[in])                                         \n\t"
+        "addu %[t1], %[t1],      %[t2]                                   \n\t"
+        "addu %[t2], %[t2],      %[t3]                                   \n\t"
+        "addu %[t3], %[t3],      %[t4]                                   \n\t"
+        "lw   %[t5], 13*4(%[in])                                         \n\t"
+        "addu %[t1], %[t1],      %[t3]                                   \n\t"
+        "sw   %[t2], 16*4(%[in])                                         \n\t"
+        "lw   %[t6], 12*4(%[in])                                         \n\t"
+        "sw   %[t1], 17*4(%[in])                                         \n\t"
+        "addu %[t4], %[t4],      %[t5]                                   \n\t"
+        "addu %[t5], %[t5],      %[t6]                                   \n\t"
+        "lw   %[t7], 11*4(%[in])                                         \n\t"
+        "addu %[t3], %[t3],      %[t5]                                   \n\t"
+        "sw   %[t4], 14*4(%[in])                                         \n\t"
+        "lw   %[t8], 10*4(%[in])                                         \n\t"
+        "sw   %[t3], 15*4(%[in])                                         \n\t"
+        "addu %[t6], %[t6],      %[t7]                                   \n\t"
+        "addu %[t7], %[t7],      %[t8]                                   \n\t"
+        "sw   %[t6], 12*4(%[in])                                         \n\t"
+        "addu %[t5], %[t5],      %[t7]                                   \n\t"
+        "lw   %[t1], 9*4(%[in])                                          \n\t"
+        "lw   %[t2], 8*4(%[in])                                          \n\t"
+        "sw   %[t5], 13*4(%[in])                                         \n\t"
+        "addu %[t8], %[t8],      %[t1]                                   \n\t"
+        "addu %[t1], %[t1],      %[t2]                                   \n\t"
+        "sw   %[t8], 10*4(%[in])                                         \n\t"
+        "addu %[t7], %[t7],      %[t1]                                   \n\t"
+        "lw   %[t3], 7*4(%[in])                                          \n\t"
+        "lw   %[t4], 6*4(%[in])                                          \n\t"
+        "sw   %[t7], 11*4(%[in])                                         \n\t"
+        "addu %[t2], %[t2],      %[t3]                                   \n\t"
+        "addu %[t3], %[t3],      %[t4]                                   \n\t"
+        "sw   %[t2], 8*4(%[in])                                          \n\t"
+        "addu %[t1], %[t1],      %[t3]                                   \n\t"
+        "lw   %[t5], 5*4(%[in])                                          \n\t"
+        "lw   %[t6], 4*4(%[in])                                          \n\t"
+        "sw   %[t1], 9*4(%[in])                                          \n\t"
+        "addu %[t4], %[t4],      %[t5]                                   \n\t"
+        "addu %[t5], %[t5],      %[t6]                                   \n\t"
+        "sw   %[t4], 6*4(%[in])                                          \n\t"
+        "addu %[t3], %[t3],      %[t5]                                   \n\t"
+        "lw   %[t7], 3*4(%[in])                                          \n\t"
+        "lw   %[t8], 2*4(%[in])                                          \n\t"
+        "sw   %[t3], 7*4(%[in])                                          \n\t"
+        "addu %[t6], %[t6],      %[t7]                                   \n\t"
+        "addu %[t7], %[t7],      %[t8]                                   \n\t"
+        "sw   %[t6], 4*4(%[in])                                          \n\t"
+        "addu %[t5], %[t5],      %[t7]                                   \n\t"
+        "lw   %[t1], 1*4(%[in])                                          \n\t"
+        "lw   %[t2], 0*4(%[in])                                          \n\t"
+        "sw   %[t5], 5*4(%[in])                                          \n\t"
+        "addu %[t8], %[t8],      %[t1]                                   \n\t"
+        "addu %[t1], %[t1],      %[t2]                                   \n\t"
+        "sw   %[t8], 2*4(%[in])                                          \n\t"
+        "addu %[t7], %[t7],      %[t1]                                   \n\t"
+        "sw   %[t7], 3*4(%[in])                                          \n\t"
+        "sw   %[t1], 1*4(%[in])                                          \n\t"
+
+        : [in] "+r" (in), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3),
+          [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6),
+          [t7] "=&r" (t7), [t8] "=&r" (t8)
+        :
+        : "memory"
+    );
+
+    for(j = 0; j < 2; j++) {
+
+        tmp1 = tmp + j;
+        in1 = in + j;
+
+         /**
+         *  Original constants are multiplied by two in advance
+         *  for assembly optimization (e.g. C_2 = 2 * C2).
+         *  That can lead to overflow in operations where they are used.
+         *
+         *  Example of the solution:
+         *
+         *  in original code:
+         *  t0 = ((int64_t)(in1[2*2] + in1[2*4]) * (int64_t)(2*C2))>>32
+         *
+         *  in assembly:
+         *  C_2 = 2 * C2;
+         *   .
+         *   .
+         *  "lw   %[t7],       4*4(%[in1])                               \n\t"
+         *  "lw   %[t8],       8*4(%[in1])                               \n\t"
+         *  "addu %[temp_reg2],%[t7],       %[t8]                        \n\t"
+         *  "multu %[C_2],     %[temp_reg2]                              \n\t"
+         *  "mfhi %[temp_reg1]                                           \n\t"
+         *  "sra  %[temp_reg2],%[temp_reg2],31                           \n\t"
+         *  "move %[t0],       $0                                        \n\t"
+         *  "movn %[t0],       %[C_2],      %[temp_reg2]                 \n\t"
+         *  "sub  %[t0],       %[temp_reg1],%[t0]                        \n\t"
+         */
+
+        __asm__ volatile (
+            "lw    %[t7],        4*4(%[in1])                               \n\t"
+            "lw    %[t8],        8*4(%[in1])                               \n\t"
+            "lw    %[t6],        16*4(%[in1])                              \n\t"
+            "lw    %[t4],        0*4(%[in1])                               \n\t"
+            "addu  %[temp_reg2], %[t7],        %[t8]                       \n\t"
+            "addu  %[t2],        %[t6],        %[t8]                       \n\t"
+            "multu %[C_2],       %[temp_reg2]                              \n\t"
+            "lw    %[t5],        12*4(%[in1])                              \n\t"
+            "sub   %[t2],        %[t2],        %[t7]                       \n\t"
+            "sub   %[t1],        %[t4],        %[t5]                       \n\t"
+            "sra   %[t3],        %[t5],        1                           \n\t"
+            "sra   %[temp_reg1], %[t2],        1                           \n\t"
+            "addu  %[t3],        %[t3],        %[t4]                       \n\t"
+            "sub   %[temp_reg1], %[t1],        %[temp_reg1]                \n\t"
+            "sra   %[temp_reg2], %[temp_reg2], 31                          \n\t"
+            "sw    %[temp_reg1], 6*4(%[tmp1])                              \n\t"
+            "move  %[t0],        $0                                        \n\t"
+            "movn  %[t0],        %[C_2],       %[temp_reg2]                \n\t"
+            "mfhi  %[temp_reg1]                                            \n\t"
+            "addu  %[t1],        %[t1],        %[t2]                       \n\t"
+            "sw    %[t1],        16*4(%[tmp1])                             \n\t"
+            "sub   %[temp_reg4], %[t8],        %[t6]                       \n\t"
+            "add   %[temp_reg2], %[t7],        %[t6]                       \n\t"
+            "mult  $ac1,         %[C_8],       %[temp_reg4]                \n\t"
+            "multu $ac2,         %[C_4],       %[temp_reg2]                \n\t"
+            "sub   %[t0],        %[temp_reg1], %[t0]                       \n\t"
+            "sra   %[temp_reg1], %[temp_reg2], 31                          \n\t"
+            "move  %[t2],        $0                                        \n\t"
+            "movn  %[t2],        %[C_4],       %[temp_reg1]                \n\t"
+            "mfhi  %[t1],        $ac1                                      \n\t"
+            "mfhi  %[temp_reg1], $ac2                                      \n\t"
+            "lw    %[t6],        10*4(%[in1])                              \n\t"
+            "lw    %[t8],        14*4(%[in1])                              \n\t"
+            "lw    %[t7],        2*4(%[in1])                               \n\t"
+            "lw    %[t4],        6*4(%[in1])                               \n\t"
+            "sub   %[temp_reg3], %[t3],        %[t0]                       \n\t"
+            "add   %[temp_reg4], %[t3],        %[t0]                       \n\t"
+            "sub   %[temp_reg1], %[temp_reg1], %[temp_reg2]                \n\t"
+            "add   %[temp_reg4], %[temp_reg4], %[t1]                       \n\t"
+            "sub   %[t2],        %[temp_reg1], %[t2]                       \n\t"
+            "sw    %[temp_reg4], 2*4(%[tmp1])                              \n\t"
+            "sub   %[temp_reg3], %[temp_reg3], %[t2]                       \n\t"
+            "add   %[temp_reg1], %[t3],        %[t2]                       \n\t"
+            "sw    %[temp_reg3], 10*4(%[tmp1])                             \n\t"
+            "sub   %[temp_reg1], %[temp_reg1], %[t1]                       \n\t"
+            "addu  %[temp_reg2], %[t6],        %[t8]                       \n\t"
+            "sw    %[temp_reg1], 14*4(%[tmp1])                             \n\t"
+            "sub   %[temp_reg2], %[temp_reg2], %[t7]                       \n\t"
+            "addu  %[temp_reg3], %[t7],        %[t6]                       \n\t"
+            "multu $ac3,         %[C_3],       %[temp_reg2]                \n\t"
+            "multu %[C_1],       %[temp_reg3]                              \n\t"
+            "sra   %[temp_reg1], %[temp_reg2], 31                          \n\t"
+            "move  %[t1],        $0                                        \n\t"
+            "sra   %[temp_reg3], %[temp_reg3], 31                          \n\t"
+            "movn  %[t1],        %[C_3],       %[temp_reg1]                \n\t"
+            "mfhi  %[temp_reg1], $ac3                                      \n\t"
+            "mfhi  %[temp_reg4]                                            \n\t"
+            "move  %[t2],        $0                                        \n\t"
+            "movn  %[t2],        %[C_1],       %[temp_reg3]                \n\t"
+            "sub   %[temp_reg3], %[t6],        %[t8]                       \n\t"
+            "sub   %[t2],        %[temp_reg4], %[t2]                       \n\t"
+            "multu $ac1,         %[C_7],       %[temp_reg3]                \n\t"
+            "sub   %[temp_reg1], %[temp_reg1], %[temp_reg2]                \n\t"
+            "sra   %[temp_reg4], %[temp_reg3], 31                          \n\t"
+            "sub   %[t1],        %[temp_reg1], %[t1]                       \n\t"
+            "move  %[t3],        $0                                        \n\t"
+            "sw    %[t1],        4*4(%[tmp1])                              \n\t"
+            "movn  %[t3],        %[C_7],       %[temp_reg4]                \n\t"
+            "multu $ac2,         %[C_3A],      %[t4]                       \n\t"
+            "add   %[temp_reg2], %[t7],        %[t8]                       \n\t"
+            "move  %[t1],        $0                                        \n\t"
+            "mfhi  %[temp_reg4], $ac1                                      \n\t"
+            "multu $ac3,%[C_5],  %[temp_reg2]                              \n\t"
+            "move  %[t0],        $0                                        \n\t"
+            "sra   %[temp_reg1], %[temp_reg2], 31                          \n\t"
+            "movn  %[t1],%[C_5], %[temp_reg1]                              \n\t"
+            "sub   %[temp_reg4], %[temp_reg4], %[temp_reg3]                \n\t"
+            "mfhi  %[temp_reg1], $ac3                                      \n\t"
+            "sra   %[temp_reg3], %[t4],        31                          \n\t"
+            "movn  %[t0],        %[C_3A],      %[temp_reg3]                \n\t"
+            "mfhi  %[temp_reg3], $ac2                                      \n\t"
+            "sub   %[t3],        %[temp_reg4], %[t3]                       \n\t"
+            "add   %[temp_reg4], %[t3],        %[t2]                       \n\t"
+            "sub   %[temp_reg1], %[temp_reg1], %[temp_reg2]                \n\t"
+            "sub   %[t1],        %[temp_reg1], %[t1]                       \n\t"
+            "sub   %[t0],        %[temp_reg3], %[t0]                       \n\t"
+            "add   %[temp_reg1], %[t2],        %[t1]                       \n\t"
+            "add   %[temp_reg4], %[temp_reg4], %[t0]                       \n\t"
+            "sub   %[temp_reg2], %[t3],        %[t1]                       \n\t"
+            "sw    %[temp_reg4], 0*4(%[tmp1])                              \n\t"
+            "sub   %[temp_reg1], %[temp_reg1], %[t0]                       \n\t"
+            "sub   %[temp_reg2], %[temp_reg2], %[t0]                       \n\t"
+            "sw    %[temp_reg1], 12*4(%[tmp1])                             \n\t"
+            "sw    %[temp_reg2], 8*4(%[tmp1])                              \n\t"
+
+            : [t7] "=&r" (t7), [temp_reg1] "=&r" (temp_reg1),
+              [temp_reg2] "=&r" (temp_reg2), [temp_reg4] "=&r" (temp_reg4),
+              [temp_reg3] "=&r" (temp_reg3), [t8] "=&r" (t8), [t0] "=&r" (t0),
+              [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r"(t6), [t2] "=&r" (t2),
+              [t3] "=&r" (t3), [t1] "=&r" (t1)
+            : [C_2] "r" (C_2), [in1] "r" (in1), [tmp1] "r" (tmp1), [C_8] "r" (C_8),
+              [C_4] "r" (C_4), [C_3] "r" (C_3), [C_1] "r" (C_1), [C_7] "r" (C_7),
+              [C_3A] "r" (C_3A), [C_5] "r" (C_5)
+            : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
+              "$ac3hi", "$ac3lo"
+         );
+    }
+
+    /**
+    * The loop is unrolled four times.
+    *
+    * The values defined in the tables icos36[] and icos36h[] are not loaded
+    * from those tables; they are loaded directly into registers as
+    * immediate constants.
+    */
+
+    __asm__ volatile (
+        "lw     %[t2],        1*4(%[tmp])                                  \n\t"
+        "lw     %[t3],        3*4(%[tmp])                                  \n\t"
+        "lw     %[t0],        0*4(%[tmp])                                  \n\t"
+        "lw     %[t1],        2*4(%[tmp])                                  \n\t"
+        "addu   %[temp_reg1], %[t3],        %[t2]                          \n\t"
+        "li     %[temp_reg2], 0x807D2B1E                                   \n\t"
+        "move   %[s1],        $0                                           \n\t"
+        "multu  %[temp_reg2], %[temp_reg1]                                 \n\t"
+        "sra    %[temp_reg1], %[temp_reg1], 31                             \n\t"
+        "movn   %[s1],        %[temp_reg2], %[temp_reg1]                   \n\t"
+        "sub    %[temp_reg3], %[t3],        %[t2]                          \n\t"
+        "li     %[temp_reg4], 0x2de5151                                    \n\t"
+        "mfhi   %[temp_reg2]                                               \n\t"
+        "addu   %[s0],        %[t1],        %[t0]                          \n\t"
+        "lw     %[temp_reg5], 9*4(%[win])                                  \n\t"
+        "mult   $ac1,         %[temp_reg4], %[temp_reg3]                   \n\t"
+        "lw     %[temp_reg6], 4*9*4(%[buf])                                \n\t"
+        "sub    %[s2],        %[t1],        %[t0]                          \n\t"
+        "lw     %[temp_reg3], 29*4(%[win])                                 \n\t"
+        "subu   %[s1],        %[temp_reg2], %[s1]                          \n\t"
+        "lw     %[temp_reg4], 28*4(%[win])                                 \n\t"
+        "add    %[t0],        %[s0],        %[s1]                          \n\t"
+        "extr.w %[s3],        $ac1,23                                      \n\t"
+        "mult   $ac2,         %[t0],        %[temp_reg3]                   \n\t"
+        "sub    %[t1],        %[s0],        %[s1]                          \n\t"
+        "lw     %[temp_reg1], 4*8*4(%[buf])                                \n\t"
+        "mult   %[t1],        %[temp_reg5]                                 \n\t"
+        "lw     %[temp_reg2], 8*4(%[win])                                  \n\t"
+        "mfhi   %[temp_reg3], $ac2                                         \n\t"
+        "mult   $ac3,         %[t0],        %[temp_reg4]                   \n\t"
+        "add    %[t0],        %[s2],        %[s3]                          \n\t"
+        "mfhi   %[temp_reg5]                                               \n\t"
+        "mult   $ac1,         %[t1],        %[temp_reg2]                   \n\t"
+        "sub    %[t1],        %[s2],        %[s3]                          \n\t"
+        "sw     %[temp_reg3], 4*9*4(%[buf])                                \n\t"
+        "mfhi   %[temp_reg4], $ac3                                         \n\t"
+        "lw     %[temp_reg3], 37*4(%[win])                                 \n\t"
+        "mfhi   %[temp_reg2], $ac1                                         \n\t"
+        "add    %[temp_reg5], %[temp_reg5], %[temp_reg6]                   \n\t"
+        "lw     %[temp_reg6], 17*4(%[win])                                 \n\t"
+        "sw     %[temp_reg5], 32*9*4(%[out])                               \n\t"
+        "sw     %[temp_reg4], 4*8*4(%[buf])                                \n\t"
+        "mult   %[t1],        %[temp_reg6]                                 \n\t"
+        "add    %[temp_reg1], %[temp_reg1], %[temp_reg2]                   \n\t"
+        "lw     %[temp_reg2], 0*4(%[win])                                  \n\t"
+        "lw     %[temp_reg5], 4*17*4(%[buf])                               \n\t"
+        "sw     %[temp_reg1], 8*32*4(%[out])                               \n\t"
+        "mfhi   %[temp_reg6]                                               \n\t"
+        "mult   $ac1,         %[t1],        %[temp_reg2]                   \n\t"
+        "lw     %[temp_reg4], 20*4(%[win])                                 \n\t"
+        "lw     %[temp_reg1], 0(%[buf])                                    \n\t"
+        "mult   $ac2,         %[t0],        %[temp_reg3]                   \n\t"
+        "mult   %[t0],        %[temp_reg4]                                 \n\t"
+        "mfhi   %[temp_reg2], $ac1                                         \n\t"
+        "lw     %[t0],        4*4(%[tmp])                                  \n\t"
+        "add    %[temp_reg5], %[temp_reg5], %[temp_reg6]                   \n\t"
+        "mfhi   %[temp_reg3], $ac2                                         \n\t"
+        "mfhi   %[temp_reg4]                                               \n\t"
+        "sw     %[temp_reg5], 17*32*4(%[out])                              \n\t"
+        "lw     %[t1],        6*4(%[tmp])                                  \n\t"
+        "add    %[temp_reg1], %[temp_reg1], %[temp_reg2]                   \n\t"
+        "lw     %[t2],        5*4(%[tmp])                                  \n\t"
+        "sw     %[temp_reg1], 0*32*4(%[out])                               \n\t"
+        "addu   %[s0],        %[t1],        %[t0]                          \n\t"
+        "sw     %[temp_reg3], 4*17*4(%[buf])                               \n\t"
+        "lw     %[t3],        7*4(%[tmp])                                  \n\t"
+        "sub    %[s2],        %[t1],        %[t0]                          \n\t"
+        "sw     %[temp_reg4], 0(%[buf])                                    \n\t"
+        "addu   %[temp_reg5], %[t3],        %[t2]                          \n\t"
+        "li     %[temp_reg6], 0x8483EE0C                                   \n\t"
+        "move   %[s1],        $0                                           \n\t"
+        "multu  %[temp_reg6], %[temp_reg5]                                 \n\t"
+        "sub    %[temp_reg1], %[t3],        %[t2]                          \n\t"
+        "li     %[temp_reg2], 0xf746ea                                     \n\t"
+        "sra    %[temp_reg5], %[temp_reg5], 31                             \n\t"
+        "mult   $ac1,         %[temp_reg2], %[temp_reg1]                   \n\t"
+        "movn   %[s1],        %[temp_reg6], %[temp_reg5]                   \n\t"
+        "mfhi   %[temp_reg5]                                               \n\t"
+        "lw     %[temp_reg3], 10*4(%[win])                                 \n\t"
+        "lw     %[temp_reg4], 4*10*4(%[buf])                               \n\t"
+        "extr.w %[s3],        $ac1,         23                             \n\t"
+        "lw     %[temp_reg1], 4*7*4(%[buf])                                \n\t"
+        "lw     %[temp_reg2], 7*4(%[win])                                  \n\t"
+        "lw     %[temp_reg6], 30*4(%[win])                                 \n\t"
+        "subu   %[s1],        %[temp_reg5], %[s1]                          \n\t"
+        "sub    %[t1],        %[s0],        %[s1]                          \n\t"
+        "add    %[t0],        %[s0],        %[s1]                          \n\t"
+        "mult   $ac2,         %[t1],        %[temp_reg3]                   \n\t"
+        "mult   $ac3,         %[t1],        %[temp_reg2]                   \n\t"
+        "mult   %[t0],        %[temp_reg6]                                 \n\t"
+        "lw     %[temp_reg5], 27*4(%[win])                                 \n\t"
+        "mult   $ac1,         %[t0],        %[temp_reg5]                   \n\t"
+        "mfhi   %[temp_reg3], $ac2                                         \n\t"
+        "mfhi   %[temp_reg2], $ac3                                         \n\t"
+        "mfhi   %[temp_reg6]                                               \n\t"
+        "add    %[t0],        %[s2],        %[s3]                          \n\t"
+        "sub    %[t1],        %[s2],        %[s3]                          \n\t"
+        "add    %[temp_reg3], %[temp_reg3], %[temp_reg4]                   \n\t"
+        "lw     %[temp_reg4], 16*4(%[win])                                 \n\t"
+        "mfhi   %[temp_reg5], $ac1                                         \n\t"
+        "sw     %[temp_reg3], 32*10*4(%[out])                              \n\t"
+        "add    %[temp_reg1], %[temp_reg1], %[temp_reg2]                   \n\t"
+        "lw     %[temp_reg3], 4*16*4(%[buf])                               \n\t"
+        "sw     %[temp_reg6], 4*10*4(%[buf])                               \n\t"
+        "sw     %[temp_reg1], 7*32*4(%[out])                               \n\t"
+        "mult   $ac2,         %[t1],        %[temp_reg4]                   \n\t"
+        "sw     %[temp_reg5], 4*7*4(%[buf])                                \n\t"
+        "lw     %[temp_reg6], 1*4(%[win])                                  \n\t"
+        "lw     %[temp_reg5], 4*1*4(%[buf])                                \n\t"
+        "lw     %[temp_reg1], 36*4(%[win])                                 \n\t"
+        "mult   $ac3,         %[t1],        %[temp_reg6]                   \n\t"
+        "lw     %[temp_reg2], 21*4(%[win])                                 \n\t"
+        "mfhi   %[temp_reg4], $ac2                                         \n\t"
+        "mult   %[t0],        %[temp_reg1]                                 \n\t"
+        "mult   $ac1,         %[t0],%[temp_reg2]                           \n\t"
+        "lw     %[t0],        8*4(%[tmp])                                  \n\t"
+        "mfhi   %[temp_reg6], $ac3                                         \n\t"
+        "lw     %[t1],        10*4(%[tmp])                                 \n\t"
+        "lw     %[t3],        11*4(%[tmp])                                 \n\t"
+        "mfhi   %[temp_reg1]                                               \n\t"
+        "add    %[temp_reg3], %[temp_reg3], %[temp_reg4]                   \n\t"
+        "lw     %[t2],        9*4(%[tmp])                                  \n\t"
+        "mfhi   %[temp_reg2], $ac1                                         \n\t"
+        "add    %[temp_reg5], %[temp_reg5], %[temp_reg6]                   \n\t"
+        "sw     %[temp_reg3], 16*32*4(%[out])                              \n\t"
+        "sw     %[temp_reg5], 1*32*4(%[out])                               \n\t"
+        "sw     %[temp_reg1], 4*16*4(%[buf])                               \n\t"
+        "addu   %[temp_reg3], %[t3],        %[t2]                          \n\t"
+        "li     %[temp_reg4], 0x8D3B7CD6                                   \n\t"
+        "sw     %[temp_reg2], 4*1*4(%[buf])                                \n\t"
+        "multu  %[temp_reg4],%[temp_reg3]                                  \n\t"
+        "sra    %[temp_reg3], %[temp_reg3], 31                             \n\t"
+        "move   %[s1],        $0                                           \n\t"
+        "movn   %[s1],        %[temp_reg4], %[temp_reg3]                   \n\t"
+        "addu   %[s0],        %[t1],        %[t0]                          \n\t"
+        "mfhi   %[temp_reg3]                                               \n\t"
+        "sub    %[s2],        %[t1],        %[t0]                          \n\t"
+        "sub    %[temp_reg5], %[t3],        %[t2]                          \n\t"
+        "li     %[temp_reg6], 0x976fd9                                     \n\t"
+        "lw     %[temp_reg2], 11*4(%[win])                                 \n\t"
+        "lw     %[temp_reg1], 4*11*4(%[buf])                               \n\t"
+        "mult   $ac1,         %[temp_reg6], %[temp_reg5]                   \n\t"
+        "subu   %[s1],        %[temp_reg3], %[s1]                          \n\t"
+        "lw     %[temp_reg5], 31*4(%[win])                                 \n\t"
+        "sub    %[t1],        %[s0],        %[s1]                          \n\t"
+        "add    %[t0],        %[s0],        %[s1]                          \n\t"
+        "mult   $ac2,         %[t1],        %[temp_reg2]                   \n\t"
+        "mult   %[t0],        %[temp_reg5]                                 \n\t"
+        "lw     %[temp_reg4], 6*4(%[win])                                  \n\t"
+        "extr.w %[s3],        $ac1,         23                             \n\t"
+        "lw     %[temp_reg3], 4*6*4(%[buf])                                \n\t"
+        "mfhi   %[temp_reg2], $ac2                                         \n\t"
+        "lw     %[temp_reg6], 26*4(%[win])                                 \n\t"
+        "mfhi   %[temp_reg5]                                               \n\t"
+        "mult   $ac3,         %[t1],        %[temp_reg4]                   \n\t"
+        "mult   $ac1,         %[t0],        %[temp_reg6]                   \n\t"
+        "add    %[t0],        %[s2],        %[s3]                          \n\t"
+        "sub    %[t1],        %[s2],        %[s3]                          \n\t"
+        "add    %[temp_reg2], %[temp_reg2], %[temp_reg1]                   \n\t"
+        "mfhi   %[temp_reg4], $ac3                                         \n\t"
+        "mfhi   %[temp_reg6], $ac1                                         \n\t"
+        "sw     %[temp_reg5], 4*11*4(%[buf])                               \n\t"
+        "sw     %[temp_reg2], 32*11*4(%[out])                              \n\t"
+        "lw     %[temp_reg1], 4*15*4(%[buf])                               \n\t"
+        "add    %[temp_reg3], %[temp_reg3], %[temp_reg4]                   \n\t"
+        "lw     %[temp_reg2], 15*4(%[win])                                 \n\t"
+        "sw     %[temp_reg3], 6*32*4(%[out])                               \n\t"
+        "sw     %[temp_reg6], 4*6*4(%[buf])                                \n\t"
+        "mult   %[t1],        %[temp_reg2]                                 \n\t"
+        "lw     %[temp_reg3], 2*4(%[win])                                  \n\t"
+        "lw     %[temp_reg4], 4*2*4(%[buf])                                \n\t"
+        "lw     %[temp_reg5], 35*4(%[win])                                 \n\t"
+        "mult   $ac1,         %[t1],        %[temp_reg3]                   \n\t"
+        "mfhi   %[temp_reg2]                                               \n\t"
+        "lw     %[temp_reg6], 22*4(%[win])                                 \n\t"
+        "mult   $ac2,         %[t0],        %[temp_reg5]                   \n\t"
+        "lw     %[t1],        14*4(%[tmp])                                 \n\t"
+        "mult   $ac3,         %[t0],        %[temp_reg6]                   \n\t"
+        "lw     %[t0],        12*4(%[tmp])                                 \n\t"
+        "mfhi   %[temp_reg3], $ac1                                         \n\t"
+        "add    %[temp_reg1], %[temp_reg1], %[temp_reg2]                   \n\t"
+        "mfhi   %[temp_reg5], $ac2                                         \n\t"
+        "sw     %[temp_reg1], 15*32*4(%[out])                              \n\t"
+        "mfhi   %[temp_reg6], $ac3                                         \n\t"
+        "lw     %[t2],        13*4(%[tmp])                                 \n\t"
+        "lw     %[t3],        15*4(%[tmp])                                 \n\t"
+        "add    %[temp_reg4], %[temp_reg4], %[temp_reg3]                   \n\t"
+        "sw     %[temp_reg5], 4*15*4(%[buf])                               \n\t"
+        "addu   %[temp_reg1], %[t3],        %[t2]                          \n\t"
+        "li     %[temp_reg2], 0x9C42577C                                   \n\t"
+        "move   %[s1],        $0                                           \n\t"
+        "multu  %[temp_reg2], %[temp_reg1]                                 \n\t"
+        "sw     %[temp_reg4], 2*32*4(%[out])                               \n\t"
+        "sra    %[temp_reg1], %[temp_reg1], 31                             \n\t"
+        "movn   %[s1],        %[temp_reg2], %[temp_reg1]                   \n\t"
+        "sub    %[temp_reg3], %[t3],        %[t2]                          \n\t"
+        "li     %[temp_reg4], 0x6f94a2                                     \n\t"
+        "mfhi   %[temp_reg1]                                               \n\t"
+        "addu   %[s0],        %[t1],        %[t0]                          \n\t"
+        "sw     %[temp_reg6], 4*2*4(%[buf])                                \n\t"
+        "mult   $ac1,         %[temp_reg4], %[temp_reg3]                   \n\t"
+        "sub    %[s2],        %[t1],        %[t0]                          \n\t"
+        "lw     %[temp_reg5], 12*4(%[win])                                 \n\t"
+        "lw     %[temp_reg6], 4*12*4(%[buf])                               \n\t"
+        "subu   %[s1],        %[temp_reg1], %[s1]                          \n\t"
+        "sub    %[t1],        %[s0],        %[s1]                          \n\t"
+        "lw     %[temp_reg3], 32*4(%[win])                                 \n\t"
+        "mult   $ac2,         %[t1],        %[temp_reg5]                   \n\t"
+        "add    %[t0],        %[s0],        %[s1]                          \n\t"
+        "extr.w %[s3],        $ac1,         23                             \n\t"
+        "lw     %[temp_reg2], 5*4(%[win])                                  \n\t"
+        "mult   %[t0],        %[temp_reg3]                                 \n\t"
+        "mfhi   %[temp_reg5], $ac2                                         \n\t"
+        "lw     %[temp_reg4], 25*4(%[win])                                 \n\t"
+        "lw     %[temp_reg1], 4*5*4(%[buf])                                \n\t"
+        "mult   $ac3,         %[t1],        %[temp_reg2]                   \n\t"
+        "mult   $ac1,         %[t0],        %[temp_reg4]                   \n\t"
+        "mfhi   %[temp_reg3]                                               \n\t"
+        "add    %[t0],        %[s2],        %[s3]                          \n\t"
+        "add    %[temp_reg5], %[temp_reg5], %[temp_reg6]                   \n\t"
+        "mfhi   %[temp_reg2], $ac3                                         \n\t"
+        "mfhi   %[temp_reg4], $ac1                                         \n\t"
+        "sub    %[t1],        %[s2],        %[s3]                          \n\t"
+        "sw     %[temp_reg5], 32*12*4(%[out])                              \n\t"
+        "sw     %[temp_reg3], 4*12*4(%[buf])                               \n\t"
+        "lw     %[temp_reg6], 14*4(%[win])                                 \n\t"
+        "lw     %[temp_reg5], 4*14*4(%[buf])                               \n\t"
+        "add    %[temp_reg1], %[temp_reg1], %[temp_reg2]                   \n\t"
+        "sw     %[temp_reg4], 4*5*4(%[buf])                                \n\t"
+        "sw     %[temp_reg1], 5*32*4(%[out])                               \n\t"
+        "mult   %[t1],        %[temp_reg6]                                 \n\t"
+        "lw     %[temp_reg4], 34*4(%[win])                                 \n\t"
+        "lw     %[temp_reg2], 3*4(%[win])                                  \n\t"
+        "lw     %[temp_reg1], 4*3*4(%[buf])                                \n\t"
+        "mult   $ac2,         %[t0],        %[temp_reg4]                   \n\t"
+        "mfhi   %[temp_reg6]                                               \n\t"
+        "mult   $ac1,         %[t1],        %[temp_reg2]                   \n\t"
+        "lw     %[temp_reg3], 23*4(%[win])                                 \n\t"
+        "lw     %[s0],        16*4(%[tmp])                                 \n\t"
+        "mfhi   %[temp_reg4], $ac2                                         \n\t"
+        "lw     %[t1],        17*4(%[tmp])                                 \n\t"
+        "mult   $ac3,         %[t0],        %[temp_reg3]                   \n\t"
+        "move   %[s1],        $0                                           \n\t"
+        "add    %[temp_reg5], %[temp_reg5], %[temp_reg6]                   \n\t"
+        "mfhi   %[temp_reg2], $ac1                                         \n\t"
+        "sw     %[temp_reg5], 14*32*4(%[out])                              \n\t"
+        "sw     %[temp_reg4], 4*14*4(%[buf])                               \n\t"
+        "mfhi   %[temp_reg3], $ac3                                         \n\t"
+        "li     %[temp_reg5], 0xB504F334                                   \n\t"
+        "add    %[temp_reg1], %[temp_reg1], %[temp_reg2]                   \n\t"
+        "multu  %[temp_reg5], %[t1]                                        \n\t"
+        "lw     %[temp_reg2], 4*13*4(%[buf])                               \n\t"
+        "sw     %[temp_reg1], 3*32*4(%[out])                               \n\t"
+        "sra    %[t1],        %[t1],        31                             \n\t"
+        "mfhi   %[temp_reg6]                                               \n\t"
+        "movn   %[s1],        %[temp_reg5], %[t1]                          \n\t"
+        "sw     %[temp_reg3], 4*3*4(%[buf])                                \n\t"
+        "lw     %[temp_reg1], 13*4(%[win])                                 \n\t"
+        "lw     %[temp_reg4], 4*4*4(%[buf])                                \n\t"
+        "lw     %[temp_reg3], 4*4(%[win])                                  \n\t"
+        "lw     %[temp_reg5], 33*4(%[win])                                 \n\t"
+        "subu   %[s1],        %[temp_reg6], %[s1]                          \n\t"
+        "lw     %[temp_reg6], 24*4(%[win])                                 \n\t"
+        "sub    %[t1],        %[s0],        %[s1]                          \n\t"
+        "add    %[t0],        %[s0],        %[s1]                          \n\t"
+        "mult   $ac1,         %[t1],        %[temp_reg1]                   \n\t"
+        "mult   $ac2,         %[t1],        %[temp_reg3]                   \n\t"
+        "mult   $ac3,         %[t0],        %[temp_reg5]                   \n\t"
+        "mult   %[t0],        %[temp_reg6]                                 \n\t"
+        "mfhi   %[temp_reg1], $ac1                                         \n\t"
+        "mfhi   %[temp_reg3], $ac2                                         \n\t"
+        "mfhi   %[temp_reg5], $ac3                                         \n\t"
+        "mfhi   %[temp_reg6]                                               \n\t"
+        "add    %[temp_reg2], %[temp_reg2], %[temp_reg1]                   \n\t"
+        "add    %[temp_reg4], %[temp_reg4], %[temp_reg3]                   \n\t"
+        "sw     %[temp_reg2], 13*32*4(%[out])                              \n\t"
+        "sw     %[temp_reg4], 4*32*4(%[out])                               \n\t"
+        "sw     %[temp_reg5], 4*13*4(%[buf])                               \n\t"
+        "sw     %[temp_reg6], 4*4*4(%[buf])                                \n\t"
+
+        : [t0] "=&r" (t0), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3),
+          [s0] "=&r" (s0), [s2] "=&r" (s2), [temp_reg1] "=&r" (temp_reg1),
+          [temp_reg2] "=&r" (temp_reg2), [s1] "=&r" (s1), [s3] "=&r" (s3),
+          [temp_reg3] "=&r" (temp_reg3), [temp_reg4] "=&r" (temp_reg4),
+          [temp_reg5] "=&r" (temp_reg5), [temp_reg6] "=&r" (temp_reg6),
+          [out] "+r" (out)
+        : [tmp] "r" (tmp), [win] "r" (win), [buf] "r" (buf)
+        : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
+          "$ac3hi", "$ac3lo"
+    );
+}
+
+/**
+ * Process a run of blocks with imdct36_mips_fixed().
+ *
+ * One call to imdct36_mips_fixed() is made per block; the window is chosen
+ * per block and the three pointers are advanced between calls.
+ *
+ * @param out          output pointer, advanced by one element per block
+ * @param buf          buffer pointer handed to imdct36_mips_fixed(); advanced
+ *                     by one element per block, or by 72-3 after every
+ *                     fourth block
+ * @param in           input pointer, advanced by 18 elements per block
+ * @param count        number of blocks to process
+ * @param switch_point when non-zero, the first two blocks use window type 0
+ * @param block_type   window type used for every other block
+ */
+static void ff_imdct36_blocks_mips_fixed(int *out, int *buf, int *in,
+                               int count, int switch_point, int block_type)
+{
+    int blk = 0;
+
+    while (blk < count) {
+        /* window type: forced to 0 for the first two blocks when
+         * switch_point is set */
+        int type = block_type;
+        int *window;
+
+        if (switch_point && blk < 2)
+            type = 0;
+
+        /* odd-numbered blocks take the window four entries further on */
+        window = ff_mdct_win_fixed[type + ((blk & 1) ? 4 : 0)];
+
+        /* apply window & overlap with previous buffer */
+        imdct36_mips_fixed(out, buf, in, window);
+
+        in += 18;
+        out++;
+
+        /* step by one inside a group of four blocks, then skip ahead */
+        if ((blk & 3) == 3)
+            buf += 72 - 3;
+        else
+            buf += 1;
+
+        blk++;
+    }
+}
+
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM */
+
+/**
+ * Install the MIPS DSP fixed-point routines into an MPADSPContext.
+ *
+ * The function pointers are only overridden when inline assembly is
+ * available and the target is neither MIPS32R6 nor MIPS64R6; in every
+ * other configuration the context is left untouched.
+ *
+ * @param s context whose apply_window_fixed and imdct36_blocks_fixed
+ *          pointers are set
+ */
+void ff_mpadsp_init_mipsdsp(MPADSPContext *s)
+{
+#if HAVE_INLINE_ASM && !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+    s->imdct36_blocks_fixed = ff_imdct36_blocks_mips_fixed;
+    s->apply_window_fixed   = ff_mpadsp_apply_window_mips_fixed;
+#endif /* HAVE_INLINE_ASM && !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+}
diff --git a/libavcodec/mips/mpegaudiodsp_mips_float.c b/libavcodec/mips/mpegaudiodsp_mips_float.c
new file mode 100644
index 0000000..270838e
--- /dev/null
+++ b/libavcodec/mips/mpegaudiodsp_mips_float.c
@@ -0,0 +1,1261 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Bojan Zivkovic (bojan@mips.com)
+ *
+ * MPEG Audio decoder optimized for MIPS floating-point architecture
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/mpegaudiodsp_template.c
+ *            libavcodec/dct32.c
+ */
+
+#include <string.h>
+
+#include "libavutil/mips/asmdefs.h"
+#include "libavcodec/mpegaudiodsp.h"
+
+#if HAVE_INLINE_ASM && HAVE_MIPSFPU
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+
+static void ff_mpadsp_apply_window_mips_float(float *synth_buf, float *window,
+                               int *dither_state, float *samples, int incr)
+{
+    register const float *w, *w2, *p; /* w and p step forward, w2 backward */
+    int j;                            /* loop counter, 4 -> 64 in steps of 4 */
+    float *samples2;                  /* second output cursor, steps backward */
+    float sum, sum2;                  /* accumulators for the two outputs */
+    /* temporary variables */
+    int incr1 = incr << 2;            /* output stride in bytes: incr * 4 */
+    int t_sample;                     /* 31 * incr1, used to set samples2 */
+    float in1, in2, in3, in4, in5, in6, in7, in8;
+    float *p2;                        /* second synth_buf cursor, steps backward */
+
+    /* copy the first 32 entries to index 512 onward to avoid wrap */
+    memcpy(synth_buf + 512, synth_buf, 32 * sizeof(*synth_buf));
+
+    /**
+    * Produces 32 output floats at samples[0], samples[incr], ...: each is a
+    * signed sum of window[] * synth_buf[] products. The first sum starts from
+    * the word at *dither_state (loaded with lwc1), which is then set to zero.
+    * Instructions are scheduled to minimize pipeline stalls; the round_sample
+    * function of the original code is replaced by assembly instructions. */
+    __asm__ volatile (
+        "lwc1    %[sum],      0(%[dither_state])                            \t\n"
+        "sll     %[t_sample], %[incr1],     5                               \t\n"
+        "sub     %[t_sample], %[t_sample],  %[incr1]                        \n\t"
+        "li      %[j],        4                                             \t\n"
+        "lwc1    %[in1],      0(%[window])                                  \t\n"
+        "lwc1    %[in2],      16*4(%[synth_buf])                            \t\n"
+        "sw      $zero,       0(%[dither_state])                            \t\n"
+        "lwc1    %[in3],      64*4(%[window])                               \t\n"
+        "lwc1    %[in4],      80*4(%[synth_buf])                            \t\n"
+        PTR_ADDU "%[samples2],%[samples],   %[t_sample]                     \t\n"
+        "madd.s  %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in5],      128*4(%[window])                              \t\n"
+        "lwc1    %[in6],      144*4(%[synth_buf])                           \t\n"
+        "lwc1    %[in7],      192*4(%[window])                              \t\n"
+        "madd.s  %[sum],      %[sum],       %[in3], %[in4]                  \t\n"
+        "lwc1    %[in8],      208*4(%[synth_buf])                           \t\n"
+        "lwc1    %[in1],      256*4(%[window])                              \t\n"
+        "lwc1    %[in2],      272*4(%[synth_buf])                           \t\n"
+        "madd.s  %[sum],      %[sum],       %[in5], %[in6]                  \t\n"
+        "lwc1    %[in3],      320*4(%[window])                              \t\n"
+        "lwc1    %[in4],      336*4(%[synth_buf])                           \t\n"
+        "lwc1    %[in5],      384*4(%[window])                              \t\n"
+        "madd.s  %[sum],      %[sum],       %[in7], %[in8]                  \t\n"
+        "lwc1    %[in6],      400*4(%[synth_buf])                           \t\n"
+        "lwc1    %[in7],      448*4(%[window])                              \t\n"
+        "lwc1    %[in8],      464*4(%[synth_buf])                           \t\n"
+        "madd.s  %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in1],      32*4(%[window])                               \t\n"
+        "lwc1    %[in2],      48*4(%[synth_buf])                            \t\n"
+        "madd.s  %[sum],      %[sum],       %[in3], %[in4]                  \t\n"
+        "lwc1    %[in3],      96*4(%[window])                               \t\n"
+        "lwc1    %[in4],      112*4(%[synth_buf])                           \t\n"
+        "madd.s  %[sum],      %[sum],       %[in5], %[in6]                  \t\n"
+        "lwc1    %[in5],      160*4(%[window])                              \t\n"
+        "lwc1    %[in6],      176*4(%[synth_buf])                           \t\n"
+        "madd.s  %[sum],      %[sum],       %[in7], %[in8]                  \t\n"
+        "lwc1    %[in7],      224*4(%[window])                              \t\n"
+        "lwc1    %[in8],      240*4(%[synth_buf])                           \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in1],      288*4(%[window])                              \t\n"
+        "lwc1    %[in2],      304*4(%[synth_buf])                           \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in3], %[in4]                  \t\n"
+        "lwc1    %[in3],      352*4(%[window])                              \t\n"
+        "lwc1    %[in4],      368*4(%[synth_buf])                           \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in5], %[in6]                  \t\n"
+        "lwc1    %[in5],      416*4(%[window])                              \t\n"
+        "lwc1    %[in6],      432*4(%[synth_buf])                           \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in7], %[in8]                  \t\n"
+        "lwc1    %[in7],      480*4(%[window])                              \t\n"
+        "lwc1    %[in8],      496*4(%[synth_buf])                           \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        PTR_ADDU "%[w],       %[window],    4                               \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in3], %[in4]                  \t\n"
+        PTR_ADDU "%[w2],      %[window],    124                             \t\n"
+        PTR_ADDIU "%[p],      %[synth_buf], 68                              \t\n"
+        PTR_ADDIU "%[p2],     %[synth_buf], 188                             \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in5], %[in6]                  \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in7], %[in8]                  \t\n"
+        "swc1    %[sum],      0(%[samples])                                 \t\n"
+        PTR_ADDU "%[samples], %[samples],   %[incr1]                        \t\n"
+
+        /* calculate two samples per pass so that each synth_buf value is
+           loaded once and used for both; 15 passes (j goes 4 -> 64 by 4):
+           samples walks up from output 1, samples2 down from output 31 */
+        "ff_mpadsp_apply_window_loop%=:                                     \t\n"
+        "lwc1    %[in1],      0(%[w])                                       \t\n"
+        "lwc1    %[in2],      0(%[p])                                       \t\n"
+        "lwc1    %[in3],      0(%[w2])                                      \t\n"
+        "lwc1    %[in4],      64*4(%[w])                                    \t\n"
+        "lwc1    %[in5],      64*4(%[p])                                    \t\n"
+        "lwc1    %[in6],      64*4(%[w2])                                   \t\n"
+        "mul.s   %[sum],      %[in1],       %[in2]                          \t\n"
+        "mul.s   %[sum2],     %[in2],       %[in3]                          \t\n"
+        "lwc1    %[in1],      128*4(%[w])                                   \t\n"
+        "lwc1    %[in2],      128*4(%[p])                                   \t\n"
+        "madd.s  %[sum],      %[sum],       %[in4], %[in5]                  \t\n"
+        "nmadd.s %[sum2],     %[sum2],      %[in5], %[in6]                  \t\n"
+        "lwc1    %[in3],      128*4(%[w2])                                  \t\n"
+        "lwc1    %[in4],      192*4(%[w])                                   \t\n"
+        "madd.s  %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in5],      192*4(%[p])                                   \t\n"
+        "lwc1    %[in6],      192*4(%[w2])                                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in2], %[in3]                  \t\n"
+        "lwc1    %[in1],      256*4(%[w])                                   \t\n"
+        "lwc1    %[in2],      256*4(%[p])                                   \t\n"
+        "madd.s  %[sum],      %[sum],       %[in4], %[in5]                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in5], %[in6]                  \t\n"
+        "lwc1    %[in3],      256*4(%[w2])                                  \t\n"
+        "lwc1    %[in4],      320*4(%[w])                                   \t\n"
+        "madd.s  %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in5],      320*4(%[p])                                   \t\n"
+        "lwc1    %[in6],      320*4(%[w2])                                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in2], %[in3]                  \t\n"
+        "lwc1    %[in1],      384*4(%[w])                                   \t\n"
+        "lwc1    %[in2],      384*4(%[p])                                   \t\n"
+        "madd.s  %[sum],      %[sum],       %[in4], %[in5]                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in5], %[in6]                  \t\n"
+        "lwc1    %[in3],      384*4(%[w2])                                  \t\n"
+        "lwc1    %[in4],      448*4(%[w])                                   \t\n"
+        "madd.s  %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in5],      448*4(%[p])                                   \t\n"
+        "lwc1    %[in6],      448*4(%[w2])                                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in2], %[in3]                  \t\n"
+        "madd.s  %[sum],      %[sum],       %[in4], %[in5]                  \t\n"
+        "lwc1    %[in1],      32*4(%[w])                                    \t\n"
+        "lwc1    %[in2],      0(%[p2])                                      \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in5], %[in6]                  \t\n"
+        "lwc1    %[in3],      32*4(%[w2])                                   \t\n"
+        "lwc1    %[in4],      96*4(%[w])                                    \t\n"
+        "lwc1    %[in5],      64*4(%[p2])                                   \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in6],      96*4(%[w2])                                   \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in2], %[in3]                  \t\n"
+        "lwc1    %[in1],      160*4(%[w])                                   \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in4], %[in5]                  \t\n"
+        "lwc1    %[in2],      128*4(%[p2])                                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in5], %[in6]                  \t\n"
+        "lwc1    %[in3],      160*4(%[w2])                                  \t\n"
+        "lwc1    %[in4],      224*4(%[w])                                   \t\n"
+        "lwc1    %[in5],      192*4(%[p2])                                  \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in6],      224*4(%[w2])                                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in2], %[in3]                  \t\n"
+        "lwc1    %[in1],      288*4(%[w])                                   \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in4], %[in5]                  \t\n"
+        "lwc1    %[in2],      256*4(%[p2])                                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in5], %[in6]                  \t\n"
+        "lwc1    %[in3],      288*4(%[w2])                                  \t\n"
+        "lwc1    %[in4],      352*4(%[w])                                   \t\n"
+        "lwc1    %[in5],      320*4(%[p2])                                  \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in6],      352*4(%[w2])                                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in2], %[in3]                  \t\n"
+        "lwc1    %[in1],      416*4(%[w])                                   \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in4], %[in5]                  \t\n"
+        "lwc1    %[in2],      384*4(%[p2])                                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in5], %[in6]                  \t\n"
+        "lwc1    %[in3],      416*4(%[w2])                                  \t\n"
+        "lwc1    %[in4],      480*4(%[w])                                   \t\n"
+        "lwc1    %[in5],      448*4(%[p2])                                  \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in6],      480*4(%[w2])                                  \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in2], %[in3]                  \t\n"
+        PTR_ADDIU "%[w],      %[w],         4                               \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in4], %[in5]                  \t\n"
+        PTR_ADDIU "%[w2],     %[w2],        -4                              \t\n"
+        "nmsub.s %[sum2],     %[sum2],      %[in5], %[in6]                  \t\n"
+        "addu    %[j],        %[j],         4                               \t\n"
+        PTR_ADDIU "%[p],      4                                             \t\n"
+        "swc1    %[sum],      0(%[samples])                                 \t\n"
+        PTR_ADDIU "%[p2],     -4                                            \t\n"
+        "swc1    %[sum2],     0(%[samples2])                                \t\n"
+        PTR_ADDU "%[samples], %[samples],   %[incr1]                        \t\n"
+        PTR_SUBU "%[samples2],%[samples2],  %[incr1]                        \t\n"
+        "bne     %[j],        64,           ff_mpadsp_apply_window_loop%=   \t\n"
+        /* last output: 8 products window[48 + 64k] * synth_buf[32 + 64k] */
+        "lwc1    %[in1],      48*4(%[window])                               \t\n"
+        "lwc1    %[in2],      32*4(%[synth_buf])                            \t\n"
+        "lwc1    %[in3],      112*4(%[window])                              \t\n"
+        "lwc1    %[in4],      96*4(%[synth_buf])                            \t\n"
+        "lwc1    %[in5],      176*4(%[window])                              \t\n"
+        "lwc1    %[in6],      160*4(%[synth_buf])                           \t\n"
+        "mul.s   %[sum],      %[in1],       %[in2]                          \t\n"
+        "lwc1    %[in7],      240*4(%[window])                              \t\n"
+        "lwc1    %[in8],      224*4(%[synth_buf])                           \t\n"
+        "lwc1    %[in1],      304*4(%[window])                              \t\n"
+        "nmadd.s %[sum],      %[sum],       %[in3], %[in4]                  \t\n"
+        "lwc1    %[in2],      288*4(%[synth_buf])                           \t\n"
+        "lwc1    %[in3],      368*4(%[window])                              \t\n"
+        "lwc1    %[in4],      352*4(%[synth_buf])                           \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in5], %[in6]                  \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in7], %[in8]                  \t\n"
+        "lwc1    %[in5],      432*4(%[window])                              \t\n"
+        "lwc1    %[in6],      416*4(%[synth_buf])                           \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in1], %[in2]                  \t\n"
+        "lwc1    %[in7],      496*4(%[window])                              \t\n"
+        "lwc1    %[in8],      480*4(%[synth_buf])                           \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in3], %[in4]                  \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in5], %[in6]                  \t\n"
+        "nmsub.s %[sum],      %[sum],       %[in7], %[in8]                  \t\n"
+        "swc1    %[sum],      0(%[samples])                                 \t\n"
+
+        : [sum] "=&f" (sum), [sum2] "=&f" (sum2),
+          [w2] "=&r" (w2),   [w] "=&r" (w),
+          [p] "=&r" (p), [p2] "=&r" (p2), [j] "=&r" (j),
+          [samples] "+r" (samples), [samples2] "=&r" (samples2),
+          [in1] "=&f" (in1), [in2] "=&f" (in2),
+          [in3] "=&f" (in3), [in4] "=&f" (in4),
+          [in5] "=&f" (in5), [in6] "=&f" (in6),
+          [in7] "=&f" (in7), [in8] "=&f" (in8),
+          [t_sample] "=&r" (t_sample)
+        : [synth_buf] "r" (synth_buf), [window] "r" (window),
+          [dither_state] "r" (dither_state), [incr1] "r" (incr1)
+        : "memory"
+    );
+}
+
+static void ff_dct32_mips_float(float *out, const float *tab)
+{
+    float val0 , val1 , val2 , val3 , val4 , val5 , val6 , val7,
+          val8 , val9 , val10, val11, val12, val13, val14, val15,
+          val16, val17, val18, val19, val20, val21, val22, val23,
+          val24, val25, val26, val27, val28, val29, val30, val31;
+    float fTmp1, fTmp2, fTmp3, fTmp4, fTmp5, fTmp6, fTmp7, fTmp8,
+          fTmp9, fTmp10, fTmp11;
+
+    /**
+    * instructions are scheduled to minimize pipeline stall.
+    */
+    __asm__ volatile (
+        "lwc1       %[fTmp1],       0*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp2],       31*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp3],       15*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp4],       16*4(%[tab])                            \n\t"
+        "li.s       %[fTmp7],       0.50241928618815570551                  \n\t"
+        "add.s      %[fTmp5],       %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp8],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[fTmp6],       %[fTmp3],       %[fTmp4]                \n\t"
+        "sub.s      %[fTmp9],       %[fTmp3],       %[fTmp4]                \n\t"
+        "li.s       %[fTmp10],      0.50060299823519630134                  \n\t"
+        "li.s       %[fTmp11],      10.19000812354805681150                 \n\t"
+        "mul.s      %[fTmp8],       %[fTmp8],       %[fTmp10]               \n\t"
+        "add.s      %[val0],        %[fTmp5],       %[fTmp6]                \n\t"
+        "sub.s      %[val15],       %[fTmp5],       %[fTmp6]                \n\t"
+        "lwc1       %[fTmp1],       7*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp2],       24*4(%[tab])                            \n\t"
+        "madd.s     %[val16],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "nmsub.s    %[val31],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "mul.s      %[val15],       %[val15],       %[fTmp7]                \n\t"
+        "lwc1       %[fTmp3],       8*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp4],       23*4(%[tab])                            \n\t"
+        "add.s      %[fTmp5],       %[fTmp1],       %[fTmp2]                \n\t"
+        "mul.s      %[val31],       %[val31],       %[fTmp7]                \n\t"
+        "sub.s      %[fTmp8],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[fTmp6],       %[fTmp3],       %[fTmp4]                \n\t"
+        "sub.s      %[fTmp9],       %[fTmp3],       %[fTmp4]                \n\t"
+        "li.s       %[fTmp7],       5.10114861868916385802                  \n\t"
+        "li.s       %[fTmp10],      0.67480834145500574602                  \n\t"
+        "li.s       %[fTmp11],      0.74453627100229844977                  \n\t"
+        "add.s      %[val7],        %[fTmp5],       %[fTmp6]                \n\t"
+        "sub.s      %[val8],        %[fTmp5],       %[fTmp6]                \n\t"
+        "mul.s      %[fTmp8],       %[fTmp8],       %[fTmp10]               \n\t"
+        "li.s       %[fTmp1],       0.50979557910415916894                  \n\t"
+        "sub.s      %[fTmp2],       %[val0],        %[val7]                 \n\t"
+        "mul.s      %[val8],        %[val8],        %[fTmp7]                \n\t"
+        "madd.s     %[val23],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "nmsub.s    %[val24],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "add.s      %[val0],        %[val0],        %[val7]                 \n\t"
+        "mul.s      %[val7],        %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp2],       %[val15],       %[val8]                 \n\t"
+        "add.s      %[val8],        %[val15],       %[val8]                 \n\t"
+        "mul.s      %[val24],       %[val24],       %[fTmp7]                \n\t"
+        "sub.s      %[fTmp3],       %[val16],       %[val23]                \n\t"
+        "add.s      %[val16],       %[val16],       %[val23]                \n\t"
+        "mul.s      %[val15],       %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp4],       %[val31],       %[val24]                \n\t"
+        "mul.s      %[val23],       %[fTmp1],       %[fTmp3]                \n\t"
+        "add.s      %[val24],       %[val31],       %[val24]                \n\t"
+        "mul.s      %[val31],       %[fTmp1],       %[fTmp4]                \n\t"
+
+        : [fTmp1]  "=&f" (fTmp1),  [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3),
+          [fTmp4]  "=&f" (fTmp4),  [fTmp5] "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6),
+          [fTmp7]  "=&f" (fTmp7),  [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9),
+          [fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11),
+          [val0]  "=f" (val0),  [val7]  "=f" (val7),
+          [val8]  "=f" (val8),  [val15] "=f" (val15),
+          [val16] "=f" (val16), [val23] "=f" (val23),
+          [val24] "=f" (val24), [val31] "=f" (val31)
+        : [tab] "r" (tab)
+        : "memory"
+    );
+
+    __asm__ volatile (
+        "lwc1       %[fTmp1],       3*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp2],       28*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp3],       12*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp4],       19*4(%[tab])                            \n\t"
+        "li.s       %[fTmp7],       0.64682178335999012954                  \n\t"
+        "add.s      %[fTmp5],       %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp8],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[fTmp6],       %[fTmp3],       %[fTmp4]                \n\t"
+        "sub.s      %[fTmp9],       %[fTmp3],       %[fTmp4]                \n\t"
+        "li.s       %[fTmp10],      0.53104259108978417447                  \n\t"
+        "li.s       %[fTmp11],      1.48416461631416627724                  \n\t"
+        "mul.s      %[fTmp8],       %[fTmp8],       %[fTmp10]               \n\t"
+        "add.s      %[val3],        %[fTmp5],       %[fTmp6]                \n\t"
+        "sub.s      %[val12],       %[fTmp5],       %[fTmp6]                \n\t"
+        "lwc1       %[fTmp1],       4*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp2],       27*4(%[tab])                            \n\t"
+        "madd.s     %[val19],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "nmsub.s    %[val28],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "mul.s      %[val12],       %[val12],       %[fTmp7]                \n\t"
+        "lwc1       %[fTmp3],       11*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp4],       20*4(%[tab])                            \n\t"
+        "add.s      %[fTmp5],       %[fTmp1],       %[fTmp2]                \n\t"
+        "mul.s      %[val28],       %[val28],       %[fTmp7]                \n\t"
+        "sub.s      %[fTmp8],       %[fTmp1],       %[fTmp2]                \n\t"
+        "li.s       %[fTmp7],       0.78815462345125022473                  \n\t"
+        "add.s      %[fTmp6],       %[fTmp3],       %[fTmp4]                \n\t"
+        "sub.s      %[fTmp9],       %[fTmp3],       %[fTmp4]                \n\t"
+        "li.s       %[fTmp10],      0.55310389603444452782                  \n\t"
+        "li.s       %[fTmp11],      1.16943993343288495515                  \n\t"
+        "mul.s      %[fTmp8],       %[fTmp8],       %[fTmp10]               \n\t"
+        "add.s      %[val4],        %[fTmp5],       %[fTmp6]                \n\t"
+        "sub.s      %[val11],       %[fTmp5],       %[fTmp6]                \n\t"
+        "li.s       %[fTmp1],       2.56291544774150617881                  \n\t"
+        "madd.s     %[val20],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "nmsub.s    %[val27],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "mul.s      %[val11],       %[val11],       %[fTmp7]                \n\t"
+        "sub.s      %[fTmp2],       %[val3],        %[val4]                 \n\t"
+        "add.s      %[val3],        %[val3],        %[val4]                 \n\t"
+        "sub.s      %[fTmp4],       %[val19],       %[val20]                \n\t"
+        "mul.s      %[val27],       %[val27],       %[fTmp7]                \n\t"
+        "sub.s      %[fTmp3],       %[val12],       %[val11]                \n\t"
+        "mul.s      %[val4],        %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val11],       %[val12],       %[val11]                \n\t"
+        "add.s      %[val19],       %[val19],       %[val20]                \n\t"
+        "mul.s      %[val20],       %[fTmp1],       %[fTmp4]                \n\t"
+        "mul.s      %[val12],       %[fTmp1],       %[fTmp3]                \n\t"
+        "sub.s      %[fTmp2],       %[val28],       %[val27]                \n\t"
+        "add.s      %[val27],       %[val28],       %[val27]                \n\t"
+        "mul.s      %[val28],       %[fTmp1],       %[fTmp2]                \n\t"
+
+        : [fTmp1]  "=&f" (fTmp1),  [fTmp2]  "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3),
+          [fTmp4]  "=&f" (fTmp4),  [fTmp5]  "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6),
+          [fTmp7]  "=&f" (fTmp7),  [fTmp8]  "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9),
+          [fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11),
+          [val3]  "=f" (val3),  [val4]  "=f" (val4),
+          [val11] "=f" (val11), [val12] "=f" (val12),
+          [val19] "=f" (val19), [val20] "=f" (val20),
+          [val27] "=f" (val27), [val28] "=f" (val28)
+        : [tab] "r" (tab)
+        : "memory"
+    );
+
+    __asm__ volatile (
+        "li.s       %[fTmp1],       0.54119610014619698439                  \n\t"
+        "sub.s      %[fTmp2],       %[val0],        %[val3]                 \n\t"
+        "add.s      %[val0],        %[val0],        %[val3]                 \n\t"
+        "sub.s      %[fTmp3],       %[val7],        %[val4]                 \n\t"
+        "add.s      %[val4],        %[val7],        %[val4]                 \n\t"
+        "sub.s      %[fTmp4],       %[val8],        %[val11]                \n\t"
+        "mul.s      %[val3],        %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val8],        %[val8],        %[val11]                \n\t"
+        "mul.s      %[val7],        %[fTmp1],       %[fTmp3]                \n\t"
+        "sub.s      %[fTmp2],       %[val15],       %[val12]                \n\t"
+        "mul.s      %[val11],       %[fTmp1],       %[fTmp4]                \n\t"
+        "add.s      %[val12],       %[val15],       %[val12]                \n\t"
+        "mul.s      %[val15],       %[fTmp1],       %[fTmp2]                \n\t"
+
+        : [val0]  "+f" (val0),   [val3] "+f" (val3),
+          [val4]  "+f" (val4),   [val7] "+f" (val7),
+          [val8]  "+f" (val8),   [val11] "+f" (val11),
+          [val12] "+f" (val12),  [val15] "+f" (val15),
+          [fTmp1] "=f"  (fTmp1), [fTmp2] "=&f" (fTmp2),
+          [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4)
+        :
+    );
+
+    __asm__ volatile (
+        "sub.s      %[fTmp2],       %[val16],       %[val19]                \n\t"
+        "add.s      %[val16],       %[val16],       %[val19]                \n\t"
+        "sub.s      %[fTmp3],       %[val23],       %[val20]                \n\t"
+        "add.s      %[val20],       %[val23],       %[val20]                \n\t"
+        "sub.s      %[fTmp4],       %[val24],       %[val27]                \n\t"
+        "mul.s      %[val19],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val24],       %[val24],       %[val27]                \n\t"
+        "mul.s      %[val23],       %[fTmp1],       %[fTmp3]                \n\t"
+        "sub.s      %[fTmp2],       %[val31],       %[val28]                \n\t"
+        "mul.s      %[val27],       %[fTmp1],       %[fTmp4]                \n\t"
+        "add.s      %[val28],       %[val31],       %[val28]                \n\t"
+        "mul.s      %[val31],       %[fTmp1],       %[fTmp2]                \n\t"
+
+        : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
+          [val16] "+f" (val16), [val19] "+f" (val19), [val20] "+f" (val20),
+          [val23] "+f" (val23), [val24] "+f" (val24), [val27] "+f" (val27),
+          [val28] "+f" (val28), [val31] "+f" (val31)
+        : [fTmp1] "f" (fTmp1)
+    );
+
+    __asm__ volatile (
+        "lwc1       %[fTmp1],       1*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp2],       30*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp3],       14*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp4],       17*4(%[tab])                            \n\t"
+        "li.s       %[fTmp7],       0.52249861493968888062                  \n\t"
+        "add.s      %[fTmp5],       %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp8],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[fTmp6],       %[fTmp3],       %[fTmp4]                \n\t"
+        "sub.s      %[fTmp9],       %[fTmp3],       %[fTmp4]                \n\t"
+        "li.s       %[fTmp10],      0.50547095989754365998                  \n\t"
+        "li.s       %[fTmp11],      3.40760841846871878570                  \n\t"
+        "mul.s      %[fTmp8],       %[fTmp8],       %[fTmp10]               \n\t"
+        "add.s      %[val1],        %[fTmp5],       %[fTmp6]                \n\t"
+        "sub.s      %[val14],       %[fTmp5],       %[fTmp6]                \n\t"
+        "lwc1       %[fTmp1],       6*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp2],       25*4(%[tab])                            \n\t"
+        "madd.s     %[val17],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "nmsub.s    %[val30],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "mul.s      %[val14],       %[val14],       %[fTmp7]                \n\t"
+        "lwc1       %[fTmp3],       9*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp4],       22*4(%[tab])                            \n\t"
+        "add.s      %[fTmp5],       %[fTmp1],       %[fTmp2]                \n\t"
+        "mul.s      %[val30],       %[val30],       %[fTmp7]                \n\t"
+        "sub.s      %[fTmp8],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[fTmp6],       %[fTmp3],       %[fTmp4]                \n\t"
+        "sub.s      %[fTmp9],       %[fTmp3],       %[fTmp4]                \n\t"
+        "li.s       %[fTmp7],       1.72244709823833392782                  \n\t"
+        "li.s       %[fTmp10],      0.62250412303566481615                  \n\t"
+        "li.s       %[fTmp11],      0.83934964541552703873                  \n\t"
+        "add.s      %[val6],        %[fTmp5],       %[fTmp6]                \n\t"
+        "sub.s      %[val9],        %[fTmp5],       %[fTmp6]                \n\t"
+        "mul.s      %[fTmp8],       %[fTmp8],       %[fTmp10]               \n\t"
+        "li.s       %[fTmp1],       0.60134488693504528054                  \n\t"
+        "sub.s      %[fTmp2],       %[val1],        %[val6]                 \n\t"
+        "add.s      %[val1],        %[val1],        %[val6]                 \n\t"
+        "mul.s      %[val9],        %[val9],        %[fTmp7]                \n\t"
+        "madd.s     %[val22],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "nmsub.s    %[val25],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "mul.s      %[val6],        %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp2],       %[val14],       %[val9]                 \n\t"
+        "add.s      %[val9],        %[val14],       %[val9]                 \n\t"
+        "mul.s      %[val25],       %[val25],       %[fTmp7]                \n\t"
+        "sub.s      %[fTmp3],       %[val17],       %[val22]                \n\t"
+        "add.s      %[val17],       %[val17],       %[val22]                \n\t"
+        "mul.s      %[val14],       %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp2],       %[val30],       %[val25]                \n\t"
+        "mul.s      %[val22],       %[fTmp1],       %[fTmp3]                \n\t"
+        "add.s      %[val25],       %[val30],       %[val25]                \n\t"
+        "mul.s      %[val30],       %[fTmp1],       %[fTmp2]                \n\t"
+
+        : [fTmp1]  "=&f" (fTmp1),  [fTmp2]  "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3),
+          [fTmp4]  "=&f" (fTmp4),  [fTmp5]  "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6),
+          [fTmp7]  "=&f" (fTmp7),  [fTmp8]  "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9),
+          [fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11),
+          [val1]  "=f" (val1),  [val6]  "=f" (val6),
+          [val9]  "=f" (val9),  [val14] "=f" (val14),
+          [val17] "=f" (val17), [val22] "=f" (val22),
+          [val25] "=f" (val25), [val30] "=f" (val30)
+        : [tab] "r" (tab)
+        : "memory"
+    );
+
+    __asm__ volatile (
+        "lwc1       %[fTmp1],       2*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp2],       29*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp3],       13*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp4],       18*4(%[tab])                            \n\t"
+        "li.s       %[fTmp7],       0.56694403481635770368                  \n\t"
+        "add.s      %[fTmp5],       %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp8],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[fTmp6],       %[fTmp3],       %[fTmp4]                \n\t"
+        "sub.s      %[fTmp9],       %[fTmp3],       %[fTmp4]                \n\t"
+        "li.s       %[fTmp10],      0.51544730992262454697                  \n\t"
+        "li.s       %[fTmp11],      2.05778100995341155085                  \n\t"
+        "mul.s      %[fTmp8],       %[fTmp8],       %[fTmp10]               \n\t"
+        "add.s      %[val2],        %[fTmp5],       %[fTmp6]                \n\t"
+        "sub.s      %[val13],       %[fTmp5],       %[fTmp6]                \n\t"
+        "lwc1       %[fTmp1],       5*4(%[tab])                             \n\t"
+        "lwc1       %[fTmp2],       26*4(%[tab])                            \n\t"
+        "madd.s     %[val18],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "nmsub.s    %[val29],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "mul.s      %[val13],       %[val13],       %[fTmp7]                \n\t"
+        "lwc1       %[fTmp3],       10*4(%[tab])                            \n\t"
+        "lwc1       %[fTmp4],       21*4(%[tab])                            \n\t"
+        "mul.s      %[val29],       %[val29],       %[fTmp7]                \n\t"
+        "add.s      %[fTmp5],       %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp8],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[fTmp6],       %[fTmp3],       %[fTmp4]                \n\t"
+        "sub.s      %[fTmp9],       %[fTmp3],       %[fTmp4]                \n\t"
+        "li.s       %[fTmp7],       1.06067768599034747134                  \n\t"
+        "li.s       %[fTmp10],      0.58293496820613387367                  \n\t"
+        "li.s       %[fTmp11],      0.97256823786196069369                  \n\t"
+        "add.s      %[val5],        %[fTmp5],       %[fTmp6]                \n\t"
+        "sub.s      %[val10],       %[fTmp5],       %[fTmp6]                \n\t"
+        "mul.s      %[fTmp8],       %[fTmp8],       %[fTmp10]               \n\t"
+        "li.s       %[fTmp1],       0.89997622313641570463                  \n\t"
+        "sub.s      %[fTmp2],       %[val2],        %[val5]                 \n\t"
+        "mul.s      %[val10],       %[val10],       %[fTmp7]                \n\t"
+        "madd.s     %[val21],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "nmsub.s    %[val26],       %[fTmp8],       %[fTmp9],   %[fTmp11]   \n\t"
+        "add.s      %[val2],        %[val2],        %[val5]                 \n\t"
+        "mul.s      %[val5],        %[fTmp1],       %[fTmp2]                \n\t"
+        "sub.s      %[fTmp3],       %[val13],       %[val10]                \n\t"
+        "add.s      %[val10],       %[val13],       %[val10]                \n\t"
+        "mul.s      %[val26],       %[val26],       %[fTmp7]                \n\t"
+        "sub.s      %[fTmp4],       %[val18],       %[val21]                \n\t"
+        "add.s      %[val18],       %[val18],       %[val21]                \n\t"
+        "mul.s      %[val13],       %[fTmp1],       %[fTmp3]                \n\t"
+        "sub.s      %[fTmp2],       %[val29],       %[val26]                \n\t"
+        "add.s      %[val26],       %[val29],       %[val26]                \n\t"
+        "mul.s      %[val21],       %[fTmp1],       %[fTmp4]                \n\t"
+        "mul.s      %[val29],       %[fTmp1],       %[fTmp2]                \n\t"
+
+        : [fTmp1]  "=&f" (fTmp1),  [fTmp2]  "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3),
+          [fTmp4]  "=&f" (fTmp4),  [fTmp5]  "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6),
+          [fTmp7]  "=&f" (fTmp7),  [fTmp8]  "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9),
+          [fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11),
+          [val2]  "=f" (val2),  [val5]  "=f" (val5),
+          [val10] "=f" (val10), [val13] "=f" (val13),
+          [val18] "=f" (val18), [val21] "=f" (val21),
+          [val26] "=f" (val26), [val29] "=f" (val29)
+        : [tab] "r" (tab)
+        : "memory"
+    );
+
+    __asm__ volatile (
+        "li.s       %[fTmp1],       1.30656296487637652785                  \n\t"
+        "sub.s      %[fTmp2],       %[val1],        %[val2]                 \n\t"
+        "add.s      %[val1],        %[val1],        %[val2]                 \n\t"
+        "sub.s      %[fTmp3],       %[val6],        %[val5]                 \n\t"
+        "add.s      %[val5],        %[val6],        %[val5]                 \n\t"
+        "sub.s      %[fTmp4],       %[val9],        %[val10]                \n\t"
+        "mul.s      %[val2],        %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val9],        %[val9],        %[val10]                \n\t"
+        "mul.s      %[val6],        %[fTmp1],       %[fTmp3]                \n\t"
+        "sub.s      %[fTmp2],       %[val14],       %[val13]                \n\t"
+        "mul.s      %[val10],       %[fTmp1],       %[fTmp4]                \n\t"
+        "add.s      %[val13],       %[val14],       %[val13]                \n\t"
+        "mul.s      %[val14],       %[fTmp1],       %[fTmp2]                \n\t"
+
+        : [fTmp1] "=f"  (fTmp1), [fTmp2] "=&f" (fTmp2),
+          [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
+          [val1]  "+f" (val1),  [val2]  "+f" (val2),
+          [val5]  "+f" (val5),  [val6]  "+f" (val6),
+          [val9]  "+f" (val9),  [val10] "+f" (val10),
+          [val13] "+f" (val13), [val14] "+f" (val14)
+        :
+    );
+
+    __asm__ volatile (
+        "sub.s      %[fTmp2],       %[val17],       %[val18]                \n\t"
+        "add.s      %[val17],       %[val17],       %[val18]                \n\t"
+        "sub.s      %[fTmp3],       %[val22],       %[val21]                \n\t"
+        "add.s      %[val21],       %[val22],       %[val21]                \n\t"
+        "sub.s      %[fTmp4],       %[val25],       %[val26]                \n\t"
+        "mul.s      %[val18],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val25],       %[val25],       %[val26]                \n\t"
+        "mul.s      %[val22],       %[fTmp1],       %[fTmp3]                \n\t"
+        "sub.s      %[fTmp2],       %[val30],       %[val29]                \n\t"
+        "mul.s      %[val26],       %[fTmp1],       %[fTmp4]                \n\t"
+        "add.s      %[val29],       %[val30],       %[val29]                \n\t"
+        "mul.s      %[val30],       %[fTmp1],       %[fTmp2]                \n\t"
+
+        : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
+          [val17] "+f" (val17), [val18] "+f" (val18), [val21] "+f" (val21),
+          [val22] "+f" (val22), [val25] "+f" (val25), [val26] "+f" (val26),
+          [val29] "+f" (val29), [val30] "+f" (val30)
+        : [fTmp1] "f" (fTmp1)
+    );
+
+    __asm__ volatile (
+        "li.s       %[fTmp1],       0.70710678118654752439                  \n\t"
+        "sub.s      %[fTmp2],       %[val0],        %[val1]                 \n\t"
+        "add.s      %[val0],        %[val0],        %[val1]                 \n\t"
+        "sub.s      %[fTmp3],       %[val3],        %[val2]                 \n\t"
+        "add.s      %[val2],        %[val3],        %[val2]                 \n\t"
+        "sub.s      %[fTmp4],       %[val4],        %[val5]                 \n\t"
+        "mul.s      %[val1],        %[fTmp1],       %[fTmp2]                \n\t"
+        "swc1       %[val0],        0(%[out])                               \n\t"
+        "mul.s      %[val3],        %[fTmp3],       %[fTmp1]                \n\t"
+        "add.s      %[val4],        %[val4],        %[val5]                 \n\t"
+        "mul.s      %[val5],        %[fTmp1],       %[fTmp4]                \n\t"
+        "swc1       %[val1],        16*4(%[out])                            \n\t"
+        "sub.s      %[fTmp2],       %[val7],        %[val6]                 \n\t"
+        "add.s      %[val2],        %[val2],        %[val3]                 \n\t"
+        "swc1       %[val3],        24*4(%[out])                            \n\t"
+        "add.s      %[val6],        %[val7],        %[val6]                 \n\t"
+        "mul.s      %[val7],        %[fTmp1],       %[fTmp2]                \n\t"
+        "swc1       %[val2],        8*4(%[out])                             \n\t"
+        "add.s      %[val6],        %[val6],        %[val7]                 \n\t"
+        "swc1       %[val7],        28*4(%[out])                            \n\t"
+        "add.s      %[val4],        %[val4],        %[val6]                 \n\t"
+        "add.s      %[val6],        %[val6],        %[val5]                 \n\t"
+        "add.s      %[val5],        %[val5],        %[val7]                 \n\t"
+        "swc1       %[val4],        4*4(%[out])                             \n\t"
+        "swc1       %[val5],        20*4(%[out])                            \n\t"
+        "swc1       %[val6],        12*4(%[out])                            \n\t"
+
+        : [fTmp1] "=f"  (fTmp1), [fTmp2] "=&f" (fTmp2),
+          [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
+          [val0] "+f" (val0), [val1] "+f" (val1),
+          [val2] "+f" (val2), [val3] "+f" (val3),
+          [val4] "+f" (val4), [val5] "+f" (val5),
+          [val6] "+f" (val6), [val7] "+f" (val7)
+        : [out] "r" (out)
+    );
+
+    __asm__ volatile (
+        "sub.s      %[fTmp2],       %[val8],        %[val9]                 \n\t"
+        "add.s      %[val8],        %[val8],        %[val9]                 \n\t"
+        "sub.s      %[fTmp3],       %[val11],       %[val10]                \n\t"
+        "add.s      %[val10],       %[val11],       %[val10]                \n\t"
+        "sub.s      %[fTmp4],       %[val12],       %[val13]                \n\t"
+        "mul.s      %[val9],        %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val12],       %[val12],       %[val13]                \n\t"
+        "mul.s      %[val11],       %[fTmp1],       %[fTmp3]                \n\t"
+        "sub.s      %[fTmp2],       %[val15],       %[val14]                \n\t"
+        "mul.s      %[val13],       %[fTmp1],       %[fTmp4]                \n\t"
+        "add.s      %[val14],       %[val15],       %[val14]                \n\t"
+        "add.s      %[val10],       %[val10],       %[val11]                \n\t"
+        "mul.s      %[val15],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val14],       %[val14],       %[val15]                \n\t"
+        "add.s      %[val12],       %[val12],       %[val14]                \n\t"
+        "add.s      %[val14],       %[val14],       %[val13]                \n\t"
+        "add.s      %[val13],       %[val13],       %[val15]                \n\t"
+        "add.s      %[val8],        %[val8],        %[val12]                \n\t"
+        "add.s      %[val12],       %[val12],       %[val10]                \n\t"
+        "add.s      %[val10],       %[val10],       %[val14]                \n\t"
+        "add.s      %[val14],       %[val14],       %[val9]                 \n\t"
+        "add.s      %[val9],        %[val9],        %[val13]                \n\t"
+        "add.s      %[val13],       %[val13],       %[val11]                \n\t"
+        "add.s      %[val11],       %[val11],       %[val15]                \n\t"
+        "swc1       %[val8],         2*4(%[out])                            \n\t"
+        "swc1       %[val9],        18*4(%[out])                            \n\t"
+        "swc1       %[val10],       10*4(%[out])                            \n\t"
+        "swc1       %[val11],       26*4(%[out])                            \n\t"
+        "swc1       %[val12],        6*4(%[out])                            \n\t"
+        "swc1       %[val13],       22*4(%[out])                            \n\t"
+        "swc1       %[val14],       14*4(%[out])                            \n\t"
+        "swc1       %[val15],       30*4(%[out])                            \n\t"
+
+        : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
+          [val8]  "+f" (val8),  [val9]  "+f" (val9),  [val10] "+f" (val10),
+          [val11] "+f" (val11), [val12] "+f" (val12), [val13] "+f" (val13),
+          [val14] "+f" (val14), [val15] "+f" (val15)
+        : [fTmp1] "f" (fTmp1), [out] "r" (out)
+    );
+
+    __asm__ volatile (
+        "sub.s      %[fTmp2],       %[val16],       %[val17]                \n\t"
+        "add.s      %[val16],       %[val16],       %[val17]                \n\t"
+        "sub.s      %[fTmp3],       %[val19],       %[val18]                \n\t"
+        "add.s      %[val18],       %[val19],       %[val18]                \n\t"
+        "sub.s      %[fTmp4],       %[val20],       %[val21]                \n\t"
+        "mul.s      %[val17],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val20],       %[val20],       %[val21]                \n\t"
+        "mul.s      %[val19],       %[fTmp1],       %[fTmp3]                \n\t"
+        "sub.s      %[fTmp2],       %[val23],       %[val22]                \n\t"
+        "mul.s      %[val21],       %[fTmp1],       %[fTmp4]                \n\t"
+        "add.s      %[val22],       %[val23],       %[val22]                \n\t"
+        "add.s      %[val18],       %[val18],       %[val19]                \n\t"
+        "mul.s      %[val23],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val22],       %[val22],       %[val23]                \n\t"
+        "add.s      %[val20],       %[val20],       %[val22]                \n\t"
+        "add.s      %[val22],       %[val22],       %[val21]                \n\t"
+        "add.s      %[val21],       %[val21],       %[val23]                \n\t"
+
+        : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
+          [val16] "+f" (val16), [val17] "+f" (val17), [val18] "+f" (val18),
+          [val19] "+f" (val19), [val20] "+f" (val20), [val21] "+f" (val21),
+          [val22] "+f" (val22), [val23] "+f" (val23)
+        : [fTmp1] "f" (fTmp1)
+    );
+
+    __asm__ volatile (
+        "sub.s      %[fTmp2],       %[val24],       %[val25]                \n\t"
+        "add.s      %[val24],       %[val24],       %[val25]                \n\t"
+        "sub.s      %[fTmp3],       %[val27],       %[val26]                \n\t"
+        "add.s      %[val26],       %[val27],       %[val26]                \n\t"
+        "sub.s      %[fTmp4],       %[val28],       %[val29]                \n\t"
+        "mul.s      %[val25],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val28],       %[val28],       %[val29]                \n\t"
+        "mul.s      %[val27],       %[fTmp1],       %[fTmp3]                \n\t"
+        "sub.s      %[fTmp2],       %[val31],       %[val30]                \n\t"
+        "mul.s      %[val29],       %[fTmp1],       %[fTmp4]                \n\t"
+        "add.s      %[val30],       %[val31],       %[val30]                \n\t"
+        "add.s      %[val26],       %[val26],       %[val27]                \n\t"
+        "mul.s      %[val31],       %[fTmp1],       %[fTmp2]                \n\t"
+        "add.s      %[val30],       %[val30],       %[val31]                \n\t"
+        "add.s      %[val28],       %[val28],       %[val30]                \n\t"
+        "add.s      %[val30],       %[val30],       %[val29]                \n\t"
+        "add.s      %[val29],       %[val29],       %[val31]                \n\t"
+        "add.s      %[val24],       %[val24],       %[val28]                \n\t"
+        "add.s      %[val28],       %[val28],       %[val26]                \n\t"
+        "add.s      %[val26],       %[val26],       %[val30]                \n\t"
+        "add.s      %[val30],       %[val30],       %[val25]                \n\t"
+        "add.s      %[val25],       %[val25],       %[val29]                \n\t"
+        "add.s      %[val29],       %[val29],       %[val27]                \n\t"
+        "add.s      %[val27],       %[val27],       %[val31]                \n\t"
+
+        : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
+          [val24] "+f" (val24), [val25] "+f" (val25), [val26] "+f" (val26),
+          [val27] "+f" (val27), [val28] "+f" (val28), [val29] "+f" (val29),
+          [val30] "+f" (val30), [val31] "+f" (val31)
+        : [fTmp1] "f" (fTmp1)
+    );
+
+    out[ 1] = val16 + val24;
+    out[17] = val17 + val25;
+    out[ 9] = val18 + val26;
+    out[25] = val19 + val27;
+    out[ 5] = val20 + val28;
+    out[21] = val21 + val29;
+    out[13] = val22 + val30;
+    out[29] = val23 + val31;
+    out[ 3] = val24 + val20;
+    out[19] = val25 + val21;
+    out[11] = val26 + val22;
+    out[27] = val27 + val23;
+    out[ 7] = val28 + val18;
+    out[23] = val29 + val19;
+    out[15] = val30 + val17;
+    out[31] = val31;
+}
+
+static void imdct36_mips_float(float *out, float *buf, float *in, float *win)
+{
+    float t0, t1, t2, t3, s0, s1, s2, s3;
+    float tmp[18];
+    /* temporary variables */
+    float in1, in2, in3, in4, in5, in6;
+    float out1, out2, out3, out4, out5;
+    float c1, c2, c3, c4, c5, c6, c7, c8, c9;
+
+    /**
+    * All loops are fully unrolled, and instructions are scheduled to
+    * minimize pipeline stalls. The instructions of the first two loops
+    * are reorganized in order to eliminate unnecessary array reads and
+    * writes. Values defined in macros and tables are eliminated - they
+    * are loaded directly into the appropriate variables.
+    */
+
+    /* loop 1 and 2 */
+    __asm__ volatile (
+        "lwc1   %[in1],  17*4(%[in])                                    \t\n"
+        "lwc1   %[in2],  16*4(%[in])                                    \t\n"
+        "lwc1   %[in3],  15*4(%[in])                                    \t\n"
+        "lwc1   %[in4],  14*4(%[in])                                    \t\n"
+        "lwc1   %[in5],  13*4(%[in])                                    \t\n"
+        "lwc1   %[in6],  12*4(%[in])                                    \t\n"
+        "add.s  %[out1], %[in1],  %[in2]                                \t\n"
+        "add.s  %[out2], %[in2],  %[in3]                                \t\n"
+        "add.s  %[out3], %[in3],  %[in4]                                \t\n"
+        "add.s  %[out4], %[in4],  %[in5]                                \t\n"
+        "add.s  %[out5], %[in5],  %[in6]                                \t\n"
+        "lwc1   %[in1],  11*4(%[in])                                    \t\n"
+        "swc1   %[out2], 16*4(%[in])                                    \t\n"
+        "add.s  %[out1], %[out1], %[out3]                               \t\n"
+        "swc1   %[out4], 14*4(%[in])                                    \t\n"
+        "add.s  %[out3], %[out3], %[out5]                               \t\n"
+        "lwc1   %[in2],  10*4(%[in])                                    \t\n"
+        "lwc1   %[in3],  9*4(%[in])                                     \t\n"
+        "swc1   %[out1], 17*4(%[in])                                    \t\n"
+        "lwc1   %[in4],  8*4(%[in])                                     \t\n"
+        "swc1   %[out3], 15*4(%[in])                                    \t\n"
+        "add.s  %[out1], %[in6],  %[in1]                                \t\n"
+        "add.s  %[out2], %[in1],  %[in2]                                \t\n"
+        "add.s  %[out3], %[in2],  %[in3]                                \t\n"
+        "add.s  %[out4], %[in3],  %[in4]                                \t\n"
+        "lwc1   %[in5],  7*4(%[in])                                     \t\n"
+        "swc1   %[out1], 12*4(%[in])                                    \t\n"
+        "add.s  %[out5], %[out5], %[out2]                               \t\n"
+        "swc1   %[out3], 10*4(%[in])                                    \t\n"
+        "add.s  %[out2], %[out2], %[out4]                               \t\n"
+        "lwc1   %[in6],  6*4(%[in])                                     \t\n"
+        "lwc1   %[in1],  5*4(%[in])                                     \t\n"
+        "swc1   %[out5], 13*4(%[in])                                    \t\n"
+        "lwc1   %[in2],  4*4(%[in])                                     \t\n"
+        "swc1   %[out2], 11*4(%[in])                                    \t\n"
+        "add.s  %[out5], %[in4],  %[in5]                                \t\n"
+        "add.s  %[out1], %[in5],  %[in6]                                \t\n"
+        "add.s  %[out2], %[in6],  %[in1]                                \t\n"
+        "add.s  %[out3], %[in1],  %[in2]                                \t\n"
+        "lwc1   %[in3],  3*4(%[in])                                     \t\n"
+        "swc1   %[out5], 8*4(%[in])                                     \t\n"
+        "add.s  %[out4], %[out4], %[out1]                               \t\n"
+        "swc1   %[out2], 6*4(%[in])                                     \t\n"
+        "add.s  %[out1], %[out1], %[out3]                               \t\n"
+        "lwc1   %[in4],  2*4(%[in])                                     \t\n"
+        "lwc1   %[in5],  1*4(%[in])                                     \t\n"
+        "swc1   %[out4], 9*4(%[in])                                     \t\n"
+        "lwc1   %[in6],  0(%[in])                                       \t\n"
+        "swc1   %[out1], 7*4(%[in])                                     \t\n"
+        "add.s  %[out4], %[in2],  %[in3]                                \t\n"
+        "add.s  %[out5], %[in3],  %[in4]                                \t\n"
+        "add.s  %[out1], %[in4],  %[in5]                                \t\n"
+        "add.s  %[out2], %[in5],  %[in6]                                \t\n"
+        "swc1   %[out4], 4*4(%[in])                                     \t\n"
+        "add.s  %[out3], %[out3], %[out5]                               \t\n"
+        "swc1   %[out1], 2*4(%[in])                                     \t\n"
+        "add.s  %[out5], %[out5], %[out2]                               \t\n"
+        "swc1   %[out2], 1*4(%[in])                                     \t\n"
+        "swc1   %[out3], 5*4(%[in])                                     \t\n"
+        "swc1   %[out5], 3*4(%[in])                                     \t\n"
+
+        : [in1] "=&f" (in1), [in2] "=&f" (in2),
+          [in3] "=&f" (in3), [in4] "=&f" (in4),
+          [in5] "=&f" (in5), [in6] "=&f" (in6),
+          [out1] "=&f" (out1), [out2] "=&f" (out2),
+          [out3] "=&f" (out3), [out4] "=&f" (out4),
+          [out5] "=&f" (out5)
+        : [in] "r" (in)
+        : "memory"
+    );
+
+    /* loop 3 */
+    __asm__ volatile (
+        "li.s    %[c1],   0.5                                           \t\n"
+        "lwc1    %[in1],  8*4(%[in])                                    \t\n"
+        "lwc1    %[in2],  16*4(%[in])                                   \t\n"
+        "lwc1    %[in3],  4*4(%[in])                                    \t\n"
+        "lwc1    %[in4],  0(%[in])                                      \t\n"
+        "lwc1    %[in5],  12*4(%[in])                                   \t\n"
+        "li.s    %[c2],   0.93969262078590838405                        \t\n"
+        "add.s   %[t2],   %[in1],  %[in2]                               \t\n"
+        "add.s   %[t0],   %[in1],  %[in3]                               \t\n"
+        "li.s    %[c3],   -0.76604444311897803520                       \t\n"
+        "madd.s  %[t3],   %[in4],  %[in5], %[c1]                        \t\n"
+        "sub.s   %[t1],   %[in4],  %[in5]                               \t\n"
+        "sub.s   %[t2],   %[t2],   %[in3]                               \t\n"
+        "mul.s   %[t0],   %[t0],   %[c2]                                \t\n"
+        "li.s    %[c4],   -0.17364817766693034885                       \t\n"
+        "li.s    %[c5],   -0.86602540378443864676                       \t\n"
+        "li.s    %[c6],   0.98480775301220805936                        \t\n"
+        "nmsub.s %[out1], %[t1],   %[t2],  %[c1]                        \t\n"
+        "add.s   %[out2], %[t1],   %[t2]                                \t\n"
+        "add.s   %[t2],   %[in2],  %[in3]                               \t\n"
+        "sub.s   %[t1],   %[in1],  %[in2]                               \t\n"
+        "sub.s   %[out3], %[t3],   %[t0]                                \t\n"
+        "swc1    %[out1], 6*4(%[tmp])                                   \t\n"
+        "swc1    %[out2], 16*4(%[tmp])                                  \t\n"
+        "mul.s   %[t2],   %[t2],   %[c3]                                \t\n"
+        "mul.s   %[t1],   %[t1],   %[c4]                                \t\n"
+        "add.s   %[out1], %[t3],   %[t0]                                \t\n"
+        "lwc1    %[in1],  10*4(%[in])                                   \t\n"
+        "lwc1    %[in2],  14*4(%[in])                                   \t\n"
+        "sub.s   %[out3], %[out3], %[t2]                                \t\n"
+        "add.s   %[out2], %[t3],   %[t2]                                \t\n"
+        "add.s   %[out1], %[out1], %[t1]                                \t\n"
+        "lwc1    %[in3],  2*4(%[in])                                    \t\n"
+        "lwc1    %[in4],  6*4(%[in])                                    \t\n"
+        "swc1    %[out3], 10*4(%[tmp])                                  \t\n"
+        "sub.s   %[out2], %[out2], %[t1]                                \t\n"
+        "swc1    %[out1], 2*4(%[tmp])                                   \t\n"
+        "add.s   %[out1], %[in1],  %[in2]                               \t\n"
+        "add.s   %[t2],   %[in1],  %[in3]                               \t\n"
+        "sub.s   %[t3],   %[in1],  %[in2]                               \t\n"
+        "swc1    %[out2], 14*4(%[tmp])                                  \t\n"
+        "li.s    %[c7],   -0.34202014332566873304                       \t\n"
+        "sub.s   %[out1], %[out1], %[in3]                               \t\n"
+        "mul.s   %[t2],   %[t2],   %[c6]                                \t\n"
+        "mul.s   %[t3],   %[t3],   %[c7]                                \t\n"
+        "li.s    %[c8],   0.86602540378443864676                        \t\n"
+        "mul.s   %[t0],   %[in4],  %[c8]                                \t\n"
+        "mul.s   %[out1], %[out1], %[c5]                                \t\n"
+        "add.s   %[t1],   %[in2],  %[in3]                               \t\n"
+        "li.s    %[c9],   -0.64278760968653932632                       \t\n"
+        "add.s   %[out2], %[t2],   %[t3]                                \t\n"
+        "lwc1    %[in1],  9*4(%[in])                                    \t\n"
+        "swc1    %[out1], 4*4(%[tmp])                                   \t\n"
+        "mul.s   %[t1],   %[t1],   %[c9]                                \t\n"
+        "lwc1    %[in2],  17*4(%[in])                                   \t\n"
+        "add.s   %[out2], %[out2], %[t0]                                \t\n"
+        "lwc1    %[in3],  5*4(%[in])                                    \t\n"
+        "lwc1    %[in4],  1*4(%[in])                                    \t\n"
+        "add.s   %[out3], %[t2],   %[t1]                                \t\n"
+        "sub.s   %[out1], %[t3],   %[t1]                                \t\n"
+        "swc1    %[out2], 0(%[tmp])                                     \t\n"
+        "lwc1    %[in5],  13*4(%[in])                                   \t\n"
+        "add.s   %[t2],   %[in1],  %[in2]                               \t\n"
+        "sub.s   %[out3], %[out3], %[t0]                                \t\n"
+        "sub.s   %[out1], %[out1], %[t0]                                \t\n"
+        "add.s   %[t0],   %[in1],  %[in3]                               \t\n"
+        "madd.s  %[t3],   %[in4],  %[in5], %[c1]                        \t\n"
+        "sub.s   %[t2],   %[t2],   %[in3]                               \t\n"
+        "swc1    %[out3], 12*4(%[tmp])                                  \t\n"
+        "swc1    %[out1], 8*4(%[tmp])                                   \t\n"
+        "sub.s   %[t1],   %[in4],  %[in5]                               \t\n"
+        "mul.s   %[t0],   %[t0],   %[c2]                                \t\n"
+        "nmsub.s %[out1], %[t1],   %[t2],  %[c1]                        \t\n"
+        "add.s   %[out2], %[t1],   %[t2]                                \t\n"
+        "add.s   %[t2],   %[in2],  %[in3]                               \t\n"
+        "sub.s   %[t1],   %[in1],  %[in2]                               \t\n"
+        "sub.s   %[out3], %[t3],   %[t0]                                \t\n"
+        "swc1    %[out1], 7*4(%[tmp])                                   \t\n"
+        "swc1    %[out2], 17*4(%[tmp])                                  \t\n"
+        "mul.s   %[t2],   %[t2],   %[c3]                                \t\n"
+        "mul.s   %[t1],   %[t1],   %[c4]                                \t\n"
+        "add.s   %[out1], %[t3],   %[t0]                                \t\n"
+        "lwc1    %[in1],  11*4(%[in])                                   \t\n"
+        "lwc1    %[in2],  15*4(%[in])                                   \t\n"
+        "sub.s   %[out3], %[out3], %[t2]                                \t\n"
+        "add.s   %[out2], %[t3],   %[t2]                                \t\n"
+        "add.s   %[out1], %[out1], %[t1]                                \t\n"
+        "lwc1    %[in3],  3*4(%[in])                                    \t\n"
+        "lwc1    %[in4],  7*4(%[in])                                    \t\n"
+        "swc1    %[out3], 11*4(%[tmp])                                  \t\n"
+        "sub.s   %[out2], %[out2], %[t1]                                \t\n"
+        "swc1    %[out1], 3*4(%[tmp])                                   \t\n"
+        "add.s   %[out3], %[in1],  %[in2]                               \t\n"
+        "add.s   %[t2],   %[in1],  %[in3]                               \t\n"
+        "sub.s   %[t3],   %[in1],  %[in2]                               \t\n"
+        "swc1    %[out2], 15*4(%[tmp])                                  \t\n"
+        "mul.s   %[t0],   %[in4],  %[c8]                                \t\n"
+        "sub.s   %[out3], %[out3], %[in3]                               \t\n"
+        "mul.s   %[t2],   %[t2],   %[c6]                                \t\n"
+        "mul.s   %[t3],   %[t3],   %[c7]                                \t\n"
+        "add.s   %[t1],   %[in2],  %[in3]                               \t\n"
+        "mul.s   %[out3], %[out3], %[c5]                                \t\n"
+        "add.s   %[out1], %[t2],   %[t3]                                \t\n"
+        "mul.s   %[t1],   %[t1],   %[c9]                                \t\n"
+        "swc1    %[out3], 5*4(%[tmp])                                   \t\n"
+        "add.s   %[out1], %[out1], %[t0]                                \t\n"
+        "add.s   %[out2], %[t2],   %[t1]                                \t\n"
+        "sub.s   %[out3], %[t3],   %[t1]                                \t\n"
+        "swc1    %[out1], 1*4(%[tmp])                                   \t\n"
+        "sub.s   %[out2], %[out2], %[t0]                                \t\n"
+        "sub.s   %[out3], %[out3], %[t0]                                \t\n"
+        "swc1    %[out2], 13*4(%[tmp])                                  \t\n"
+        "swc1    %[out3], 9*4(%[tmp])                                   \t\n"
+
+        : [t0] "=&f" (t0), [t1] "=&f" (t1),
+          [t2] "=&f" (t2), [t3] "=&f" (t3),
+          [in1] "=&f" (in1), [in2] "=&f" (in2),
+          [in3] "=&f" (in3), [in4] "=&f" (in4),
+          [in5] "=&f" (in5),
+          [out1] "=&f" (out1), [out2] "=&f" (out2),
+          [out3] "=&f" (out3),
+          [c1] "=&f" (c1), [c2] "=&f" (c2),
+          [c3] "=&f" (c3), [c4] "=&f" (c4),
+          [c5] "=&f" (c5), [c6] "=&f" (c6),
+          [c7] "=&f" (c7), [c8] "=&f" (c8),
+          [c9] "=&f" (c9)
+        : [in] "r" (in), [tmp] "r" (tmp)
+        : "memory"
+    );
+
+    /* loop 4 */
+    __asm__ volatile (
+        "lwc1   %[in1],  2*4(%[tmp])                                    \t\n"
+        "lwc1   %[in2],  0(%[tmp])                                      \t\n"
+        "lwc1   %[in3],  3*4(%[tmp])                                    \t\n"
+        "lwc1   %[in4],  1*4(%[tmp])                                    \t\n"
+        "li.s   %[c1],   0.50190991877167369479                         \t\n"
+        "li.s   %[c2],   5.73685662283492756461                         \t\n"
+        "add.s  %[s0],   %[in1], %[in2]                                 \t\n"
+        "sub.s  %[s2],   %[in1], %[in2]                                 \t\n"
+        "add.s  %[s1],   %[in3], %[in4]                                 \t\n"
+        "sub.s  %[s3],   %[in3], %[in4]                                 \t\n"
+        "lwc1   %[in1],  9*4(%[win])                                    \t\n"
+        "lwc1   %[in2],  4*9*4(%[buf])                                  \t\n"
+        "lwc1   %[in3],  8*4(%[win])                                    \t\n"
+        "mul.s  %[s1],   %[s1],  %[c1]                                  \t\n"
+        "mul.s  %[s3],   %[s3],  %[c2]                                  \t\n"
+        "lwc1   %[in4],  4*8*4(%[buf])                                  \t\n"
+        "lwc1   %[in5],  29*4(%[win])                                   \t\n"
+        "lwc1   %[in6],  28*4(%[win])                                   \t\n"
+        "add.s  %[t0],   %[s0],  %[s1]                                  \t\n"
+        "sub.s  %[t1],   %[s0],  %[s1]                                  \t\n"
+        "li.s   %[c1],   0.51763809020504152469                         \t\n"
+        "li.s   %[c2],   1.93185165257813657349                         \t\n"
+        "mul.s  %[out3], %[in5], %[t0]                                  \t\n"
+        "madd.s %[out1], %[in2], %[in1], %[t1]                          \t\n"
+        "madd.s %[out2], %[in4], %[in3], %[t1]                          \t\n"
+        "mul.s  %[out4], %[in6], %[t0]                                  \t\n"
+        "add.s  %[t0],   %[s2],  %[s3]                                  \t\n"
+        "swc1   %[out3], 4*9*4(%[buf])                                  \t\n"
+        "swc1   %[out1], 288*4(%[out])                                  \t\n"
+        "swc1   %[out2], 256*4(%[out])                                  \t\n"
+        "swc1   %[out4], 4*8*4(%[buf])                                  \t\n"
+        "sub.s  %[t1],   %[s2],  %[s3]                                  \t\n"
+        "lwc1   %[in1],  17*4(%[win])                                   \t\n"
+        "lwc1   %[in2],  4*17*4(%[buf])                                 \t\n"
+        "lwc1   %[in3],  0(%[win])                                      \t\n"
+        "lwc1   %[in4],  0(%[buf])                                      \t\n"
+        "lwc1   %[in5],  37*4(%[win])                                   \t\n"
+        "lwc1   %[in6],  20*4(%[win])                                   \t\n"
+        "madd.s %[out1], %[in2], %[in1], %[t1]                          \t\n"
+        "lwc1   %[in1],  6*4(%[tmp])                                    \t\n"
+        "madd.s %[out2], %[in4], %[in3], %[t1]                          \t\n"
+        "mul.s  %[out3], %[t0],  %[in5]                                 \t\n"
+        "mul.s  %[out4], %[t0],  %[in6]                                 \t\n"
+        "swc1   %[out1], 544*4(%[out])                                  \t\n"
+        "lwc1   %[in2],  4*4(%[tmp])                                    \t\n"
+        "swc1   %[out2], 0(%[out])                                      \t\n"
+        "swc1   %[out3], 4*17*4(%[buf])                                 \t\n"
+        "swc1   %[out4], 0(%[buf])                                      \t\n"
+        "lwc1   %[in3],  7*4(%[tmp])                                    \t\n"
+        "add.s  %[s0],   %[in1], %[in2]                                 \t\n"
+        "sub.s  %[s2],   %[in1], %[in2]                                 \t\n"
+        "lwc1   %[in4],  5*4(%[tmp])                                    \t\n"
+        "add.s  %[s1],   %[in3], %[in4]                                 \t\n"
+        "sub.s  %[s3],   %[in3], %[in4]                                 \t\n"
+        "lwc1   %[in1],  10*4(%[win])                                   \t\n"
+        "lwc1   %[in2],  4*10*4(%[buf])                                 \t\n"
+        "lwc1   %[in3],  7*4(%[win])                                    \t\n"
+        "mul.s  %[s1],   %[s1],  %[c1]                                  \t\n"
+        "mul.s  %[s3],   %[s3],  %[c2]                                  \t\n"
+        "add.s  %[t0],   %[s0],  %[s1]                                  \t\n"
+        "sub.s  %[t1],   %[s0],  %[s1]                                  \t\n"
+        "lwc1   %[in4],  4*7*4(%[buf])                                  \t\n"
+        "lwc1   %[in5],  30*4(%[win])                                   \t\n"
+        "lwc1   %[in6],  27*4(%[win])                                   \t\n"
+        "li.s   %[c1],   0.55168895948124587824                         \t\n"
+        "madd.s %[out1], %[in2], %[in1], %[t1]                          \t\n"
+        "madd.s %[out2], %[in4], %[in3], %[t1]                          \t\n"
+        "mul.s  %[out3], %[t0],  %[in5]                                 \t\n"
+        "mul.s  %[out4], %[t0],  %[in6]                                 \t\n"
+        "add.s  %[t0],   %[s2],  %[s3]                                  \t\n"
+        "swc1   %[out1], 320*4(%[out])                                  \t\n"
+        "swc1   %[out2], 224*4(%[out])                                  \t\n"
+        "swc1   %[out3], 4*10*4(%[buf])                                 \t\n"
+        "swc1   %[out4], 4*7*4(%[buf])                                  \t\n"
+        "sub.s  %[t1],   %[s2],  %[s3]                                  \t\n"
+        "lwc1   %[in1],  16*4(%[win])                                   \t\n"
+        "lwc1   %[in2],  4*16*4(%[buf])                                 \t\n"
+        "lwc1   %[in3],  1*4(%[win])                                    \t\n"
+        "lwc1   %[in4],  4*1*4(%[buf])                                  \t\n"
+        "lwc1   %[in5],  36*4(%[win])                                   \t\n"
+        "lwc1   %[in6],  21*4(%[win])                                   \t\n"
+        "madd.s %[out1], %[in2], %[in1], %[t1]                          \t\n"
+        "lwc1   %[in1],  10*4(%[tmp])                                   \t\n"
+        "madd.s %[out2], %[in4], %[in3], %[t1]                          \t\n"
+        "mul.s  %[out3], %[in5], %[t0]                                  \t\n"
+        "mul.s  %[out4], %[in6], %[t0]                                  \t\n"
+        "swc1   %[out1], 512*4(%[out])                                  \t\n"
+        "lwc1   %[in2],  8*4(%[tmp])                                    \t\n"
+        "swc1   %[out2], 32*4(%[out])                                   \t\n"
+        "swc1   %[out3], 4*16*4(%[buf])                                 \t\n"
+        "swc1   %[out4], 4*1*4(%[buf])                                  \t\n"
+        "li.s   %[c2],   1.18310079157624925896                         \t\n"
+        "add.s  %[s0],   %[in1], %[in2]                                 \t\n"
+        "sub.s  %[s2],   %[in1], %[in2]                                 \t\n"
+        "lwc1   %[in3],  11*4(%[tmp])                                   \t\n"
+        "lwc1   %[in4],  9*4(%[tmp])                                    \t\n"
+        "add.s  %[s1],   %[in3], %[in4]                                 \t\n"
+        "sub.s  %[s3],   %[in3], %[in4]                                 \t\n"
+        "lwc1   %[in1],  11*4(%[win])                                   \t\n"
+        "lwc1   %[in2],  4*11*4(%[buf])                                 \t\n"
+        "lwc1   %[in3],  6*4(%[win])                                    \t\n"
+        "mul.s  %[s1],   %[s1],  %[c1]                                  \t\n"
+        "mul.s  %[s3],   %[s3],  %[c2]                                  \t\n"
+        "lwc1   %[in4],  4*6*4(%[buf])                                  \t\n"
+        "lwc1   %[in5],  31*4(%[win])                                   \t\n"
+        "lwc1   %[in6],  26*4(%[win])                                   \t\n"
+        "add.s  %[t0],   %[s0],  %[s1]                                  \t\n"
+        "sub.s  %[t1],   %[s0],  %[s1]                                  \t\n"
+        "mul.s  %[out3], %[t0],  %[in5]                                 \t\n"
+        "mul.s  %[out4], %[t0],  %[in6]                                 \t\n"
+        "add.s  %[t0],   %[s2],  %[s3]                                  \t\n"
+        "madd.s %[out1], %[in2], %[in1], %[t1]                          \t\n"
+        "madd.s %[out2], %[in4], %[in3], %[t1]                          \t\n"
+        "swc1   %[out3], 4*11*4(%[buf])                                 \t\n"
+        "swc1   %[out4], 4*6*4(%[buf])                                  \t\n"
+        "sub.s  %[t1],   %[s2],  %[s3]                                  \t\n"
+        "swc1   %[out1], 352*4(%[out])                                  \t\n"
+        "swc1   %[out2], 192*4(%[out])                                  \t\n"
+        "lwc1   %[in1],  15*4(%[win])                                   \t\n"
+        "lwc1   %[in2],  4*15*4(%[buf])                                 \t\n"
+        "lwc1   %[in3],  2*4(%[win])                                    \t\n"
+        "lwc1   %[in4],  4*2*4(%[buf])                                  \t\n"
+        "lwc1   %[in5],  35*4(%[win])                                   \t\n"
+        "lwc1   %[in6],  22*4(%[win])                                   \t\n"
+        "madd.s %[out1], %[in2], %[in1], %[t1]                          \t\n"
+        "lwc1   %[in1],  14*4(%[tmp])                                   \t\n"
+        "madd.s %[out2], %[in4], %[in3], %[t1]                          \t\n"
+        "mul.s  %[out3], %[t0],  %[in5]                                 \t\n"
+        "mul.s  %[out4], %[t0],  %[in6]                                 \t\n"
+        "swc1   %[out1], 480*4(%[out])                                  \t\n"
+        "lwc1   %[in2],  12*4(%[tmp])                                   \t\n"
+        "swc1   %[out2], 64*4(%[out])                                   \t\n"
+        "swc1   %[out3], 4*15*4(%[buf])                                 \t\n"
+        "swc1   %[out4], 4*2*4(%[buf])                                  \t\n"
+        "lwc1   %[in3],  15*4(%[tmp])                                   \t\n"
+        "add.s  %[s0],   %[in1], %[in2]                                 \t\n"
+        "sub.s  %[s2],   %[in1], %[in2]                                 \t\n"
+        "lwc1   %[in4],  13*4(%[tmp])                                   \t\n"
+        "li.s   %[c1],   0.61038729438072803416                         \t\n"
+        "li.s   %[c2],   0.87172339781054900991                         \t\n"
+        "add.s  %[s1],   %[in3], %[in4]                                 \t\n"
+        "sub.s  %[s3],   %[in3], %[in4]                                 \t\n"
+        "lwc1   %[in1],  12*4(%[win])                                   \t\n"
+        "lwc1   %[in2],  4*12*4(%[buf])                                 \t\n"
+        "lwc1   %[in3],  5*4(%[win])                                    \t\n"
+        "mul.s  %[s1],   %[s1],  %[c1]                                  \t\n"
+        "mul.s  %[s3],   %[s3],  %[c2]                                  \t\n"
+        "lwc1   %[in4],  4*5*4(%[buf])                                  \t\n"
+        "lwc1   %[in5],  32*4(%[win])                                   \t\n"
+        "lwc1   %[in6],  25*4(%[win])                                   \t\n"
+        "add.s  %[t0],   %[s0],  %[s1]                                  \t\n"
+        "sub.s  %[t1],   %[s0],  %[s1]                                  \t\n"
+        "lwc1   %[s0],   16*4(%[tmp])                                   \t\n"
+        "lwc1   %[s1],   17*4(%[tmp])                                   \t\n"
+        "li.s   %[c1],   0.70710678118654752439                         \t\n"
+        "mul.s  %[out3], %[t0],  %[in5]                                 \t\n"
+        "madd.s %[out1], %[in2], %[in1], %[t1]                          \t\n"
+        "madd.s %[out2], %[in4], %[in3], %[t1]                          \t\n"
+        "mul.s  %[out4], %[t0],  %[in6]                                 \t\n"
+        "add.s  %[t0],   %[s2],  %[s3]                                  \t\n"
+        "swc1   %[out3], 4*12*4(%[buf])                                 \t\n"
+        "swc1   %[out1], 384*4(%[out])                                  \t\n"
+        "swc1   %[out2], 160*4(%[out])                                  \t\n"
+        "swc1   %[out4], 4*5*4(%[buf])                                  \t\n"
+        "sub.s  %[t1],   %[s2],  %[s3]                                  \t\n"
+        "lwc1   %[in1],  14*4(%[win])                                   \t\n"
+        "lwc1   %[in2],  4*14*4(%[buf])                                 \t\n"
+        "lwc1   %[in3],  3*4(%[win])                                    \t\n"
+        "lwc1   %[in4],  4*3*4(%[buf])                                  \t\n"
+        "lwc1   %[in5],  34*4(%[win])                                   \t\n"
+        "lwc1   %[in6],  23*4(%[win])                                   \t\n"
+        "madd.s %[out1], %[in2], %[in1], %[t1]                          \t\n"
+        "mul.s  %[s1],   %[s1],  %[c1]                                  \t\n"
+        "madd.s %[out2], %[in4], %[in3], %[t1]                          \t\n"
+        "mul.s  %[out3], %[in5], %[t0]                                  \t\n"
+        "mul.s  %[out4], %[in6], %[t0]                                  \t\n"
+        "swc1   %[out1], 448*4(%[out])                                  \t\n"
+        "add.s  %[t0],   %[s0],  %[s1]                                  \t\n"
+        "swc1   %[out2], 96*4(%[out])                                   \t\n"
+        "swc1   %[out3], 4*14*4(%[buf])                                 \t\n"
+        "swc1   %[out4], 4*3*4(%[buf])                                  \t\n"
+        "sub.s  %[t1],   %[s0],  %[s1]                                  \t\n"
+        "lwc1   %[in1],  13*4(%[win])                                   \t\n"
+        "lwc1   %[in2],  4*13*4(%[buf])                                 \t\n"
+        "lwc1   %[in3],  4*4(%[win])                                    \t\n"
+        "lwc1   %[in4],  4*4*4(%[buf])                                  \t\n"
+        "lwc1   %[in5],  33*4(%[win])                                   \t\n"
+        "lwc1   %[in6],  24*4(%[win])                                   \t\n"
+        "madd.s %[out1], %[in2], %[in1], %[t1]                          \t\n"
+        "madd.s %[out2], %[in4], %[in3], %[t1]                          \t\n"
+        "mul.s  %[out3], %[t0],  %[in5]                                 \t\n"
+        "mul.s  %[out4], %[t0],  %[in6]                                 \t\n"
+        "swc1   %[out1], 416*4(%[out])                                  \t\n"
+        "swc1   %[out2], 128*4(%[out])                                  \t\n"
+        "swc1   %[out3], 4*13*4(%[buf])                                 \t\n"
+        "swc1   %[out4], 4*4*4(%[buf])                                  \t\n"
+
+        : [c1] "=&f" (c1), [c2] "=&f" (c2),
+          [in1] "=&f" (in1), [in2] "=&f" (in2),
+          [in3] "=&f" (in3), [in4] "=&f" (in4),
+          [in5] "=&f" (in5), [in6] "=&f" (in6),
+          [out1] "=&f" (out1), [out2] "=&f" (out2),
+          [out3] "=&f" (out3), [out4] "=&f" (out4),
+          [t0] "=&f" (t0), [t1] "=&f" (t1),
+          [t2] "=&f" (t2), [t3] "=&f" (t3),
+          [s0] "=&f" (s0), [s1] "=&f" (s1),
+          [s2] "=&f" (s2), [s3] "=&f" (s3)
+        : [tmp] "r" (tmp), [win] "r" (win),
+          [buf] "r" (buf), [out] "r" (out)
+        : "memory"
+    );
+}
+
+static void ff_imdct36_blocks_mips_float(float *out, float *buf, float *in,
+                               int count, int switch_point, int block_type)
+{
+    int blk;
+    /* One imdct36_mips_float() call per block of 18 input values. */
+    for (blk = 0; blk < count; blk++, out++) {
+        /* Window row: 0 for the first two blocks when switch_point is set,
+         * else block_type; odd-numbered blocks use the row 4 further on. */
+        int row    = (switch_point && blk < 2) ? 0 : block_type;
+        float *win = ff_mdct_win_float[row + ((blk & 1) ? 4 : 0)];
+
+        imdct36_mips_float(out, buf, in, win);
+
+        /* buf advances by 1, except by 72 - 3 after every fourth block */
+        in  += 18;
+        buf += ((blk & 3) == 3) ? (72 - 3) : 1;
+    }
+}
+
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM && HAVE_MIPSFPU */
+
+void ff_mpadsp_init_mipsfpu(MPADSPContext *s)
+{
+    /* Hook up the MIPS FPU routines. Nothing is assigned unless inline asm
+     * and the FPU are available and neither MIPS32R6 nor MIPS64R6 is set. */
+#if HAVE_INLINE_ASM && HAVE_MIPSFPU && !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+    s->dct32_float          = ff_dct32_mips_float;
+    s->imdct36_blocks_float = ff_imdct36_blocks_mips_float;
+    s->apply_window_float   = ff_mpadsp_apply_window_mips_float;
+#endif
+}
diff --git a/libavcodec/mips/mpegvideo_init_mips.c b/libavcodec/mips/mpegvideo_init_mips.c
new file mode 100644
index 0000000..1918da5
--- /dev/null
+++ b/libavcodec/mips/mpegvideo_init_mips.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h263dsp_mips.h"
+#include "mpegvideo_mips.h"
+
+#if HAVE_MSA
+static av_cold void dct_unquantize_init_msa(MpegEncContext *s)
+{
+    s->dct_unquantize_h263_inter = ff_dct_unquantize_h263_inter_msa;
+    s->dct_unquantize_h263_intra = ff_dct_unquantize_h263_intra_msa;
+    if (s->q_scale_type == 0) /* MPEG-2 inter: MSA only for q_scale_type 0 */
+        s->dct_unquantize_mpeg2_inter = ff_dct_unquantize_mpeg2_inter_msa;
+}
+#endif  // #if HAVE_MSA
+
+#if HAVE_MMI
+static av_cold void dct_unquantize_init_mmi(MpegEncContext *s)
+{
+    /* Install the MMI dequantizers and the MMI DCT denoiser into s. */
+    s->denoise_dct = ff_denoise_dct_mmi;
+    s->dct_unquantize_mpeg1_inter = ff_dct_unquantize_mpeg1_inter_mmi;
+    s->dct_unquantize_mpeg1_intra = ff_dct_unquantize_mpeg1_intra_mmi;
+    s->dct_unquantize_h263_inter  = ff_dct_unquantize_h263_inter_mmi;
+    s->dct_unquantize_h263_intra  = ff_dct_unquantize_h263_intra_mmi;
+
+    /* MPEG-2 intra: skipped in bit-exact mode or when q_scale_type is set */
+    if (!(s->avctx->flags & AV_CODEC_FLAG_BITEXACT) && !s->q_scale_type)
+        s->dct_unquantize_mpeg2_intra = ff_dct_unquantize_mpeg2_intra_mmi;
+}
+#endif /* HAVE_MMI */
+
+av_cold void ff_mpv_common_init_mips(MpegEncContext *s)
+{
+#if HAVE_MSA
+    dct_unquantize_init_msa(s); /* runs first when MSA is enabled */
+#endif  /* HAVE_MSA */
+#if HAVE_MMI
+    dct_unquantize_init_mmi(s); /* runs last: its H.263 hooks replace MSA's */
+#endif /* HAVE_MMI */
+}
diff --git a/libavcodec/mips/mpegvideo_mips.h b/libavcodec/mips/mpegvideo_mips.h
new file mode 100644
index 0000000..760d7b3
--- /dev/null
+++ b/libavcodec/mips/mpegvideo_mips.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_MPEGVIDEO_MIPS_H
+#define AVCODEC_MIPS_MPEGVIDEO_MIPS_H
+
+#include "libavcodec/mpegvideo.h"
+
+void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale); /* hook: s->dct_unquantize_h263_intra */
+void ff_dct_unquantize_h263_inter_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale); /* hook: s->dct_unquantize_h263_inter */
+void ff_dct_unquantize_mpeg1_intra_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale); /* hook: s->dct_unquantize_mpeg1_intra */
+void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale); /* hook: s->dct_unquantize_mpeg1_inter */
+void ff_dct_unquantize_mpeg2_intra_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale); /* hook: s->dct_unquantize_mpeg2_intra */
+void ff_denoise_dct_mmi(MpegEncContext *s, int16_t *block); /* s->denoise_dct */
+
+#endif /* AVCODEC_MIPS_MPEGVIDEO_MIPS_H */
diff --git a/libavcodec/mips/mpegvideo_mmi.c b/libavcodec/mips/mpegvideo_mmi.c
new file mode 100644
index 0000000..450a18c
--- /dev/null
+++ b/libavcodec/mips/mpegvideo_mmi.c
@@ -0,0 +1,492 @@
+/*
+ * Loongson SIMD optimized mpegvideo
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *                    Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "mpegvideo_mips.h"
+#include "libavutil/mips/asmdefs.h"
+
+void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale)
+{ /* H.263 intra dequantizer (Loongson MMI): rescales block in place, 8 coefficients per loop pass; block[0] is computed in C and written back last. */
+    int64_t level, qmul, qadd, nCoeffs;
+    double ftmp[6];
+    mips_reg addr[1];
+
+    qmul = qscale << 1; /* multiplier applied to every coefficient: 2 * qscale */
+    av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
+
+    if (!s->h263_aic) { /* no AIC: DC is scaled by the luma (n < 4) or chroma DC scale */
+        if (n<4)
+            level = block[0] * s->y_dc_scale;
+        else
+            level = block[0] * s->c_dc_scale;
+        qadd = (qscale-1) | 1; /* offset added/subtracted per coefficient; the "| 1" forces it odd */
+    } else { /* AIC: no offset, DC value is kept as is */
+        qadd = 0;
+        level = block[0];
+    }
+
+    if(s->ac_pred)
+        nCoeffs = 63; /* AC prediction: process the whole 64-entry block */
+    else
+        nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]]; /* otherwise stop at the last coded coefficient (raster order) */
+
+    __asm__ volatile (
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "packsswh   %[qmul],    %[qmul],        %[qmul]                 \n\t" /* two packs replicate qmul into all four halfwords */
+        "packsswh   %[qmul],    %[qmul],        %[qmul]                 \n\t"
+        "packsswh   %[qadd],    %[qadd],        %[qadd]                 \n\t" /* same replication for qadd */
+        "packsswh   %[qadd],    %[qadd],        %[qadd]                 \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[qadd]                 \n\t" /* ftmp0 = 0 - qadd in every halfword */
+        "xor        %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t" /* ftmp5 = 0, comparison reference */
+        ".p2align   4                                                   \n\t"
+        "1:                                                             \n\t"
+        PTR_ADDU   "%[addr0],   %[block],       %[nCoeffs]              \n\t" /* biased base + negative byte offset = current position, starting at block[0] */
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t" /* left/right pair: presumably an unaligned 64-bit load -- confirm in the Loongson manual */
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x0f(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp2],   0x08(%[addr0])                          \n\t"
+        "mov.d      %[ftmp3],   %[ftmp1]                                \n\t" /* keep copies of the raw coefficients for the sign masks */
+        "mov.d      %[ftmp4],   %[ftmp2]                                \n\t"
+        "pmullh     %[ftmp1],   %[ftmp1],       %[qmul]                 \n\t" /* coeff * qmul (low 16 bits) */
+        "pmullh     %[ftmp2],   %[ftmp2],       %[qmul]                 \n\t"
+        "pcmpgth    %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t" /* per-halfword compare of coeff against 0 -> sign-dependent mask */
+        "pcmpgth    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "xor        %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t" /* xor / add(-qadd) / xor: applies qadd with a sign chosen by the mask */
+        "xor        %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "xor        %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "xor        %[ftmp4],   %[ftmp4],       %[ftmp2]                \n\t"
+        "pcmpeqh    %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t" /* equal to -qadd: presumably marks coefficients that were zero -- TODO confirm */
+        "pcmpeqh    %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "pandn      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t" /* mask those lanes out of the result */
+        "pandn      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        PTR_ADDIU  "%[nCoeffs], %[nCoeffs],     0x10                    \n\t" /* advance the byte offset by 8 coefficients; stores below still use the old addr0 */
+        "gssdlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        "gssdrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gssdlc1    %[ftmp2],   0x0f(%[addr0])                          \n\t"
+        "gssdrc1    %[ftmp2],   0x08(%[addr0])                          \n\t"
+        "blez       %[nCoeffs], 1b                                      \n\t" /* repeat while the offset is still <= 0 */
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [addr0]"=&r"(addr[0])
+        : [block]"r"((mips_reg)(block+nCoeffs)), /* base biased forward by nCoeffs elements */
+          [nCoeffs]"r"((mips_reg)(2*(-nCoeffs))), /* matching negative offset in bytes (2 bytes per int16_t) */
+          [qmul]"f"(qmul),                  [qadd]"f"(qadd) /* NOTE(review): qmul, qadd and nCoeffs are input-only operands but the asm writes them -- confirm this is safe under GCC extended-asm rules */
+        : "memory"
+    );
+
+    block[0] = level; /* the loop also rescaled element 0; overwrite it with the DC value computed above */
+}
+
+void ff_dct_unquantize_h263_inter_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale)
+{ /* H.263 inter dequantizer (Loongson MMI): rescales block in place, 8 coefficients per loop pass, with no special DC handling. */
+    int64_t qmul, qadd, nCoeffs;
+    double ftmp[6];
+    mips_reg addr[1];
+
+    qmul = qscale << 1; /* multiplier applied to every coefficient: 2 * qscale */
+    qadd = (qscale - 1) | 1; /* offset added/subtracted per coefficient; the "| 1" forces it odd */
+    av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
+    nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]]; /* last coded coefficient in raster order */
+
+    __asm__ volatile (
+        "packsswh   %[qmul],    %[qmul],        %[qmul]                 \n\t" /* two packs replicate qmul into all four halfwords */
+        "packsswh   %[qmul],    %[qmul],        %[qmul]                 \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "packsswh   %[qadd],    %[qadd],        %[qadd]                 \n\t" /* same replication for qadd */
+        "packsswh   %[qadd],    %[qadd],        %[qadd]                 \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[qadd]                 \n\t" /* ftmp0 = 0 - qadd in every halfword */
+        "xor        %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t" /* ftmp5 = 0, comparison reference */
+        ".p2align   4                                                   \n\t"
+        "1:                                                             \n\t"
+        PTR_ADDU   "%[addr0],   %[block],       %[nCoeffs]              \n\t" /* biased base + negative byte offset = current position, starting at block[0] */
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t" /* left/right pair: presumably an unaligned 64-bit load -- confirm in the Loongson manual */
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x0f(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp2],   0x08(%[addr0])                          \n\t"
+        "mov.d      %[ftmp3],   %[ftmp1]                                \n\t" /* keep copies of the raw coefficients for the sign masks */
+        "mov.d      %[ftmp4],   %[ftmp2]                                \n\t"
+        "pmullh     %[ftmp1],   %[ftmp1],       %[qmul]                 \n\t" /* coeff * qmul (low 16 bits) */
+        "pmullh     %[ftmp2],   %[ftmp2],       %[qmul]                 \n\t"
+        "pcmpgth    %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t" /* per-halfword compare of coeff against 0 -> sign-dependent mask */
+        "pcmpgth    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "xor        %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t" /* xor / add(-qadd) / xor: applies qadd with a sign chosen by the mask */
+        "xor        %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "xor        %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "xor        %[ftmp4],   %[ftmp4],       %[ftmp2]                \n\t"
+        "pcmpeqh    %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t" /* equal to -qadd: presumably marks coefficients that were zero -- TODO confirm */
+        "pcmpeqh    %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "pandn      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t" /* mask those lanes out of the result */
+        "pandn      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        PTR_ADDIU  "%[nCoeffs], %[nCoeffs],     0x10                    \n\t" /* advance the byte offset by 8 coefficients; stores below still use the old addr0 */
+        "gssdlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        "gssdrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gssdlc1    %[ftmp2],   0x0f(%[addr0])                          \n\t"
+        "gssdrc1    %[ftmp2],   0x08(%[addr0])                          \n\t"
+        "blez       %[nCoeffs], 1b                                      \n\t" /* repeat while the offset is still <= 0 */
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [addr0]"=&r"(addr[0])
+        : [block]"r"((mips_reg)(block+nCoeffs)), /* base biased forward by nCoeffs elements */
+          [nCoeffs]"r"((mips_reg)(2*(-nCoeffs))), /* matching negative offset in bytes (2 bytes per int16_t) */
+          [qmul]"f"(qmul),                  [qadd]"f"(qadd) /* NOTE(review): qmul, qadd and nCoeffs are input-only operands but the asm writes them -- confirm this is safe under GCC extended-asm rules */
+        : "memory"
+    );
+}
+
+void ff_dct_unquantize_mpeg1_intra_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale)
+{ /* MPEG-1 intra dequantizer (Loongson MMI): rescales block in place with s->intra_matrix, 8 coefficients per loop pass; block[0] is computed in C and written back last. */
+    int64_t nCoeffs;
+    const uint16_t *quant_matrix;
+    int block0;
+    double ftmp[10];
+    uint64_t tmp[1];
+    mips_reg addr[1];
+
+    av_assert2(s->block_last_index[n]>=0);
+    nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]] + 1; /* number of coefficients up to and including the last coded one (raster order) */
+
+    if (n<4) /* DC is scaled by the luma (n < 4) or chroma DC scale */
+        block0 = block[0] * s->y_dc_scale;
+    else
+        block0 = block[0] * s->c_dc_scale;
+
+    /* XXX: only mpeg1 */
+    quant_matrix = s->intra_matrix;
+
+    __asm__ volatile (
+        "dli        %[tmp0],    0x0f                                    \n\t"
+        "pcmpeqh    %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t" /* all ones ... */
+        "dmtc1      %[tmp0],    %[ftmp4]                                \n\t"
+        "dmtc1      %[qscale],  %[ftmp1]                                \n\t"
+        "psrlh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t" /* ... shifted right by 15: ftmp0 = 1 in every halfword */
+        "packsswh   %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t" /* two packs replicate qscale into all four halfwords */
+        "packsswh   %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
+        "or         %[addr0],   %[nCoeffs],     $0                      \n\t" /* addr0 = running negative byte offset */
+        ".p2align   4                                                   \n\t"
+        "1:                                                             \n\t"
+        "gsldxc1    %[ftmp2],   0x00(%[addr0],  %[block])               \n\t" /* 8 coefficients */
+        "gsldxc1    %[ftmp3],   0x08(%[addr0],  %[block])               \n\t"
+        "mov.d      %[ftmp4],   %[ftmp2]                                \n\t" /* raw copies, used below for the zero test */
+        "mov.d      %[ftmp5],   %[ftmp3]                                \n\t"
+        "gsldxc1    %[ftmp6],   0x00(%[addr0],  %[quant])               \n\t" /* matching 8 quant-matrix entries */
+        "gsldxc1    %[ftmp7],   0x08(%[addr0],  %[quant])               \n\t"
+        "pmullh     %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t" /* quant * qscale */
+        "pmullh     %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
+        "xor        %[ftmp9],   %[ftmp9],       %[ftmp9]                \n\t"
+        "pcmpgth    %[ftmp8],   %[ftmp8],       %[ftmp2]                \n\t" /* compare 0 against coeff -> sign mask */
+        "pcmpgth    %[ftmp9],   %[ftmp9],       %[ftmp3]                \n\t"
+        "xor        %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t" /* xor + subtract with the mask: absolute value */
+        "xor        %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
+        "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t" /* |coeff| * quant * qscale */
+        "pmullh     %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "pcmpeqh    %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t" /* mask of coefficients that were zero */
+        "dli        %[tmp0],    0x03                                    \n\t"
+        "pcmpeqh    %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "dmtc1      %[tmp0],    %[ftmp4]                                \n\t"
+        "psrah      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t" /* >> 3 */
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t" /* (x - 1) | 1 */
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "or         %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "or         %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "xor        %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t" /* re-apply the original sign */
+        "xor        %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
+        "pandn      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t" /* presumably keeps zero coefficients at zero -- TODO confirm pandn operand order */
+        "pandn      %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
+        "gssdxc1    %[ftmp6],   0x00(%[addr0],  %[block])               \n\t" /* store back to where the group was loaded from */
+        "gssdxc1    %[ftmp7],   0x08(%[addr0],  %[block])               \n\t"
+        PTR_ADDIU  "%[addr0],   %[addr0],       0x10                    \n\t" /* next group of 8 coefficients */
+        "bltz       %[addr0],   1b                                      \n\t" /* repeat while the offset is still negative */
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [tmp0]"=&r"(tmp[0]),
+          [addr0]"=&r"(addr[0])
+        : [block]"r"((mips_reg)(block+nCoeffs)), /* both bases are biased forward by nCoeffs elements ... */
+          [quant]"r"((mips_reg)(quant_matrix+nCoeffs)),
+          [nCoeffs]"r"((mips_reg)(2*(-nCoeffs))), /* ... and indexed with this negative byte offset, so the walk starts at element 0 */
+          [qscale]"r"(qscale)
+        : "memory"
+    );
+
+    block[0] = block0; /* the loop also rescaled element 0; overwrite it with the DC value computed above */
+}
+
+void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale)
+{ /* MPEG-1 inter dequantizer (Loongson MMI): rescales block in place with s->inter_matrix, 8 coefficients per loop pass, no special DC handling. */
+    int64_t nCoeffs;
+    const uint16_t *quant_matrix;
+    double ftmp[10];
+    uint64_t tmp[1];
+    mips_reg addr[1];
+
+    av_assert2(s->block_last_index[n] >= 0);
+    nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]] + 1; /* count up to the last coded coefficient; note this reads intra_scantable although the matrix below is the inter one */
+    quant_matrix = s->inter_matrix;
+
+    __asm__ volatile (
+        "dli        %[tmp0],    0x0f                                    \n\t"
+        "pcmpeqh    %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t" /* all ones ... */
+        "dmtc1      %[tmp0],    %[ftmp4]                                \n\t"
+        "dmtc1      %[qscale],  %[ftmp1]                                \n\t"
+        "psrlh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t" /* ... shifted right by 15: ftmp0 = 1 in every halfword */
+        "packsswh   %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t" /* two packs replicate qscale into all four halfwords */
+        "packsswh   %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
+        "or         %[addr0],   %[nCoeffs],     $0                      \n\t" /* addr0 = running negative byte offset */
+        ".p2align   4                                                   \n\t"
+        "1:                                                             \n\t"
+        "gsldxc1    %[ftmp2],   0x00(%[addr0],  %[block])               \n\t" /* 8 coefficients */
+        "gsldxc1    %[ftmp3],   0x08(%[addr0],  %[block])               \n\t"
+        "mov.d      %[ftmp4],   %[ftmp2]                                \n\t" /* raw copies, used below for the zero test */
+        "mov.d      %[ftmp5],   %[ftmp3]                                \n\t"
+        "gsldxc1    %[ftmp6],   0x00(%[addr0],  %[quant])               \n\t" /* matching 8 quant-matrix entries */
+        "gsldxc1    %[ftmp7],   0x08(%[addr0],  %[quant])               \n\t"
+        "pmullh     %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t" /* quant * qscale */
+        "pmullh     %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
+        "xor        %[ftmp9],   %[ftmp9],       %[ftmp9]                \n\t"
+        "pcmpgth    %[ftmp8],   %[ftmp8],       %[ftmp2]                \n\t" /* compare 0 against coeff -> sign mask */
+        "pcmpgth    %[ftmp9],   %[ftmp9],       %[ftmp3]                \n\t"
+        "xor        %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t" /* xor + subtract with the mask: absolute value */
+        "xor        %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp2]                \n\t" /* 2 * |coeff| ... */
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t" /* ... + 1 */
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t" /* (2*|coeff| + 1) * quant * qscale */
+        "pmullh     %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "pcmpeqh    %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t" /* mask of coefficients that were zero */
+        "dli        %[tmp0],    0x04                                    \n\t"
+        "pcmpeqh    %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "dmtc1      %[tmp0],    %[ftmp4]                                \n\t"
+        "psrah      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t" /* >> 4 */
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t" /* (x - 1) | 1 */
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "or         %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "or         %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "xor        %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t" /* re-apply the original sign */
+        "xor        %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
+        "pandn      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t" /* presumably keeps zero coefficients at zero -- TODO confirm pandn operand order */
+        "pandn      %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
+        "gssdxc1    %[ftmp6],   0x00(%[addr0],  %[block])               \n\t" /* store back to where the group was loaded from */
+        "gssdxc1    %[ftmp7],   0x08(%[addr0],  %[block])               \n\t"
+        PTR_ADDIU  "%[addr0],   %[addr0],       0x10                    \n\t" /* next group of 8 coefficients */
+        "bltz       %[addr0],   1b                                      \n\t" /* repeat while the offset is still negative */
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [tmp0]"=&r"(tmp[0]),
+          [addr0]"=&r"(addr[0])
+        : [block]"r"((mips_reg)(block+nCoeffs)), /* both bases are biased forward by nCoeffs elements ... */
+          [quant]"r"((mips_reg)(quant_matrix+nCoeffs)),
+          [nCoeffs]"r"((mips_reg)(2*(-nCoeffs))), /* ... and indexed with this negative byte offset, so the walk starts at element 0 */
+          [qscale]"r"(qscale)
+        : "memory"
+    );
+}
+
+void ff_dct_unquantize_mpeg2_intra_mmi(MpegEncContext *s, int16_t *block,
+        int n, int qscale)
+{ /* MPEG-2 intra dequantizer (Loongson MMI): rescales block in place with s->intra_matrix, 8 coefficients per loop pass; block[0] is computed in C and written back last. */
+    uint64_t nCoeffs;
+    const uint16_t *quant_matrix;
+    int block0;
+    double ftmp[10];
+    uint64_t tmp[1];
+    mips_reg addr[1];
+
+    assert(s->block_last_index[n]>=0);
+
+    if (s->alternate_scan)
+        nCoeffs = 63; /* alternate scan: process the whole 64-entry block */
+    else
+        nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]]; /* otherwise stop at the last coded coefficient (raster order) */
+
+    if (n < 4) /* DC is scaled by the luma (n < 4) or chroma DC scale */
+        block0 = block[0] * s->y_dc_scale;
+    else
+        block0 = block[0] * s->c_dc_scale;
+
+    quant_matrix = s->intra_matrix;
+
+    __asm__ volatile (
+        "dli        %[tmp0],    0x0f                                    \n\t"
+        "pcmpeqh    %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "mtc1       %[tmp0],    %[ftmp3]                                \n\t" /* NOTE(review): the mpeg1 siblings use dmtc1 for these moves -- confirm mtc1 is intended */
+        "mtc1       %[qscale],  %[ftmp9]                                \n\t"
+        "psrlh      %[ftmp0],   %[ftmp0],       %[ftmp3]                \n\t"
+        "packsswh   %[ftmp9],   %[ftmp9],       %[ftmp9]                \n\t" /* two packs replicate qscale into all four halfwords */
+        "packsswh   %[ftmp9],   %[ftmp9],       %[ftmp9]                \n\t"
+        "or         %[addr0],   %[nCoeffs],     $0                      \n\t" /* addr0 = running negative byte offset */
+        ".p2align   4                                                   \n\t"
+        "1:                                                             \n\t"
+        "gsldxc1    %[ftmp1],   0x00(%[addr0],  %[block])               \n\t" /* 8 coefficients */
+        "gsldxc1    %[ftmp2],   0x08(%[addr0],  %[block])               \n\t"
+        "mov.d      %[ftmp3],   %[ftmp1]                                \n\t" /* raw copies, used below for the zero test */
+        "mov.d      %[ftmp4],   %[ftmp2]                                \n\t"
+        "gsldxc1    %[ftmp5],   0x00(%[addr0],  %[quant])               \n\t" /* quant entries for coefficients 0..3 of this group */
+        "gsldxc1    %[ftmp6],   0x08(%[addr0],  %[quant])               \n\t" /* quant entries for coefficients 4..7 (offset 0x08, mirroring the block load) */
+        "pmullh     %[ftmp5],   %[ftmp5],       %[ftmp9]                \n\t" /* quant * qscale */
+        "pmullh     %[ftmp6],   %[ftmp6],       %[ftmp9]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
+        "pcmpgth    %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t" /* compare 0 against coeff -> sign mask */
+        "pcmpgth    %[ftmp8],   %[ftmp8],       %[ftmp2]                \n\t"
+        "xor        %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t" /* xor + subtract with the mask: absolute value */
+        "xor        %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "psubh      %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t" /* |coeff| * quant * qscale */
+        "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "xor        %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "pcmpeqh    %[ftmp5],   %[ftmp5],       %[ftmp3]                \n\t" /* mask of coefficients that were zero */
+        "dli        %[tmp0],    0x03                                    \n\t"
+        "pcmpeqh    %[ftmp6] ,  %[ftmp6],       %[ftmp4]                \n\t"
+        "mtc1       %[tmp0],    %[ftmp3]                                \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t" /* >> 3 */
+        "psrah      %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "xor        %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t" /* re-apply the original sign */
+        "xor        %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "psubh      %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "pandn      %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t" /* presumably keeps zero coefficients at zero -- TODO confirm pandn operand order */
+        "pandn      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
+        "gssdxc1    %[ftmp5],   0x00(%[addr0],  %[block])               \n\t" /* store to the address the group was loaded from, i.e. before addr0 advances */
+        "gssdxc1    %[ftmp6],   0x08(%[addr0],  %[block])               \n\t"
+        PTR_ADDIU  "%[addr0],   %[addr0],       0x10                    \n\t" /* next group of 8 coefficients */
+        "blez       %[addr0],   1b                                      \n\t" /* repeat while the offset is still <= 0 */
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [tmp0]"=&r"(tmp[0]),
+          [addr0]"=&r"(addr[0])
+        : [block]"r"((mips_reg)(block+nCoeffs)), /* both bases are biased forward by nCoeffs elements ... */
+          [quant]"r"((mips_reg)(quant_matrix+nCoeffs)),
+          [nCoeffs]"r"((mips_reg)(2*(-nCoeffs))), /* ... and indexed with this negative byte offset, so the walk starts at element 0 */
+          [qscale]"r"(qscale)
+        : "memory"
+    );
+
+    block[0]= block0; /* the loop also rescaled element 0; overwrite it with the DC value computed above */
+}
+
+void ff_denoise_dct_mmi(MpegEncContext *s, int16_t *block)
+{ /* DCT denoiser (Loongson MMI): for all 64 coefficients, accumulates |coeff| into s->dct_error_sum and shrinks |coeff| by s->dct_offset, rewriting block in place. */
+    const int intra = s->mb_intra; /* selects which row of the per-class tables is used */
+    int *sum = s->dct_error_sum[intra];
+    uint16_t *offset = s->dct_offset[intra];
+    double ftmp[8];
+    mips_reg addr[1];
+
+    s->dct_count[intra]++; /* count this block for the selected class */
+
+    __asm__ volatile(
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t" /* ftmp0 = 0, used to widen halfwords below */
+        "1:                                                             \n\t"
+        "ldc1       %[ftmp1],   0x00(%[block])                          \n\t" /* 8 coefficients */
+        "xor        %[ftmp2],   %[ftmp2],       %[ftmp2]                \n\t"
+        "ldc1       %[ftmp3],   0x08(%[block])                          \n\t"
+        "xor        %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
+        "pcmpgth    %[ftmp2],   %[ftmp2],       %[ftmp1]                \n\t" /* compare 0 against coeff -> sign mask */
+        "pcmpgth    %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
+        "xor        %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t" /* xor + subtract with the mask: absolute value */
+        "xor        %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "ldc1       %[ftmp6],   0x00(%[offset])                         \n\t"
+        "mov.d      %[ftmp5],   %[ftmp1]                                \n\t" /* keep |coeff| for the error sum */
+        "psubush    %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t" /* |coeff| - offset (unsigned-saturating, going by the mnemonic) */
+        "ldc1       %[ftmp6],   0x08(%[offset])                         \n\t"
+        "mov.d      %[ftmp7],   %[ftmp3]                                \n\t"
+        "psubush    %[ftmp3],   %[ftmp3],       %[ftmp6]                \n\t"
+        "xor        %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t" /* re-apply the original sign */
+        "xor        %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "sdc1       %[ftmp1],   0x00(%[block])                          \n\t" /* write the denoised coefficients back */
+        "sdc1       %[ftmp3],   0x08(%[block])                          \n\t"
+        "mov.d      %[ftmp1],   %[ftmp5]                                \n\t"
+        "mov.d      %[ftmp3],   %[ftmp7]                                \n\t"
+        "punpcklhw  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t" /* interleave |coeff| with zero: widen 16-bit values to 32 bit */
+        "punpckhhw  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "punpckhhw  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "ldc1       %[ftmp2],   0x00(%[sum])                            \n\t" /* add the widened values to the 8 matching sum[] entries */
+        "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]                \n\t"
+        "ldc1       %[ftmp2],   0x08(%[sum])                            \n\t"
+        "paddw      %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
+        "ldc1       %[ftmp2],   0x10(%[sum])                            \n\t"
+        "paddw      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        "ldc1       %[ftmp2],   0x18(%[sum])                            \n\t"
+        "paddw      %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "sdc1       %[ftmp5],   0x00(%[sum])                            \n\t"
+        "sdc1       %[ftmp1],   0x08(%[sum])                            \n\t"
+        "sdc1       %[ftmp7],   0x10(%[sum])                            \n\t"
+        "sdc1       %[ftmp3],   0x18(%[sum])                            \n\t"
+        PTR_ADDIU  "%[block],   %[block],       0x10                    \n\t" /* advance all three cursors by 8 entries */
+        PTR_ADDIU  "%[sum],     %[sum],         0x20                    \n\t"
+        PTR_SUBU   "%[addr0],   %[block1],      %[block]                \n\t" /* bytes left until block + 64 */
+        PTR_ADDIU  "%[offset],  %[offset],      0x10                    \n\t"
+        "bgtz       %[addr0],   1b                                      \n\t" /* repeat until the end of the block is reached */
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [addr0]"=&r"(addr[0]),
+          [block]"+&r"(block),              [sum]"+&r"(sum), /* read-write: the asm advances these pointers */
+          [offset]"+&r"(offset)
+        : [block1]"r"(block+64) /* end marker: one past the last of the 64 coefficients */
+        : "memory"
+    );
+}
diff --git a/libavcodec/mips/mpegvideo_msa.c b/libavcodec/mips/mpegvideo_msa.c
new file mode 100644
index 0000000..aa9ef77
--- /dev/null
+++ b/libavcodec/mips/mpegvideo_msa.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "h263dsp_mips.h"
+
+static void h263_dct_unquantize_msa(int16_t *block, int16_t qmul,
+                                    int16_t qadd, int8_t n_coeffs,
+                                    uint8_t loop_start)
+{
+    /* Rescale block[loop_start..n_coeffs] in place: non-zero levels become
+     * level * qmul + qadd (positive) or level * qmul - qadd (negative);
+     * zero levels are left untouched. */
+    const int32_t vec_iters = n_coeffs >> 3;
+    int16_t *vec_ptr = block + loop_start;
+    int32_t idx, coeff;
+    v8i16 coeffs, scaled, biased, is_neg, is_zero;
+    const v8i16 qmul_vec = __msa_fill_h(qmul);
+    const v8i16 qadd_vec = __msa_fill_h(qadd);
+
+    /* Vector part: eight coefficients per iteration. */
+    for (idx = 0; idx < vec_iters; idx++, vec_ptr += 8) {
+        coeffs  = LD_SH(vec_ptr);
+        is_neg  = __msa_clti_s_h(coeffs, 0);
+        is_zero = __msa_ceqi_h(coeffs, 0);
+        scaled  = coeffs * qmul_vec;
+        /* scaled + qadd, replaced by scaled - qadd in negative lanes */
+        biased  = (v8i16) __msa_bmnz_v((v16u8) (scaled + qadd_vec),
+                                       (v16u8) (scaled - qadd_vec),
+                                       (v16u8) is_neg);
+        /* lanes that were zero get their original value back */
+        coeffs  = (v8i16) __msa_bmnz_v((v16u8) biased, (v16u8) coeffs,
+                                       (v16u8) is_zero);
+        ST_SH(coeffs, vec_ptr);
+    }
+
+    /* Scalar tail: indices the vector loop did not reach, n_coeffs included. */
+    for (idx = vec_iters * 8 + loop_start; idx <= n_coeffs; idx++) {
+        coeff = block[idx];
+        if (!coeff)
+            continue;
+        coeff = coeff * qmul + (coeff < 0 ? -qadd : qadd);
+        block[idx] = coeff;
+    }
+}
+
+static int32_t mpeg2_dct_unquantize_inter_msa(int16_t *block,
+                                              int32_t qscale,
+                                              const int16_t *quant_matrix)
+{
+    int32_t cnt, sum_res = -1; /* running sum of the outputs, starts at -1 */
+    v8i16 block_vec, block_neg, qscale_vec, mask;
+    v8i16 block_org0, block_org1, block_org2, block_org3;
+    v8i16 quant_m0, quant_m1, quant_m2, quant_m3;
+    v8i16 sum, mul, zero_mask;
+    v4i32 mul_vec, qscale_l, qscale_r, quant_m_r, quant_m_l;
+    v4i32 block_l, block_r, sad;
+    /* Two passes of four 8-coefficient rows each: 64 values, rewritten in place. */
+    qscale_vec = __msa_fill_h(qscale); /* qscale replicated into every lane */
+    for (cnt = 0; cnt < 2; cnt++) {
+        LD_SH4(block, 8, block_org0, block_org1, block_org2, block_org3);
+        LD_SH4(quant_matrix, 8, quant_m0, quant_m1, quant_m2, quant_m3);
+        mask = __msa_clti_s_h(block_org0, 0); /* lanes with a negative input */
+        zero_mask = __msa_ceqi_h(block_org0, 0); /* lanes with a zero input */
+        block_neg = -block_org0;
+        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org0, (v16u8) block_neg,
+                                         (v16u8) mask); /* |x| */
+        block_vec <<= 1;
+        block_vec += 1; /* 2 * |x| + 1 */
+        UNPCK_SH_SW(block_vec, block_r, block_l);
+        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
+        UNPCK_SH_SW(quant_m0, quant_m_r, quant_m_l);
+        mul_vec = block_l * qscale_l;
+        mul_vec *= quant_m_l;
+        block_l = mul_vec >> 4; /* ((2 * |x| + 1) * qscale * quant) >> 4 */
+        mul_vec = block_r * qscale_r;
+        mul_vec *= quant_m_r;
+        block_r = mul_vec >> 4;
+        mul = (v8i16) __msa_pckev_h((v8i16) block_l, (v8i16) block_r); /* back to 16-bit lanes */
+        block_neg = - mul;
+        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
+                                   (v16u8) mask); /* negate where the input was negative */
+        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org0,
+                                   (v16u8) zero_mask); /* zero inputs keep their loaded value */
+        ST_SH(sum, block);
+        block += 8;
+        quant_matrix += 8;
+        sad = __msa_hadd_s_w(sum, sum); /* widen the row to 32-bit partial sums */
+        sum_res += HADD_SW_S32(sad); /* add this row's total */
+        mask = __msa_clti_s_h(block_org1, 0); /* second row: same steps */
+        zero_mask = __msa_ceqi_h(block_org1, 0);
+        block_neg = - block_org1;
+        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org1, (v16u8) block_neg,
+                                         (v16u8) mask);
+        block_vec <<= 1;
+        block_vec += 1;
+        UNPCK_SH_SW(block_vec, block_r, block_l);
+        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
+        UNPCK_SH_SW(quant_m1, quant_m_r, quant_m_l);
+        mul_vec = block_l * qscale_l;
+        mul_vec *= quant_m_l;
+        block_l = mul_vec >> 4;
+        mul_vec = block_r * qscale_r;
+        mul_vec *= quant_m_r;
+        block_r = mul_vec >> 4;
+        mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
+        block_neg = - mul;
+        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
+                                   (v16u8) mask);
+        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org1,
+                                   (v16u8) zero_mask);
+        ST_SH(sum, block);
+
+        block += 8;
+        quant_matrix += 8;
+        sad = __msa_hadd_s_w(sum, sum);
+        sum_res += HADD_SW_S32(sad);
+        mask = __msa_clti_s_h(block_org2, 0); /* third row: same steps */
+        zero_mask = __msa_ceqi_h(block_org2, 0);
+        block_neg = - block_org2;
+        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org2, (v16u8) block_neg,
+                                         (v16u8) mask);
+        block_vec <<= 1;
+        block_vec += 1;
+        UNPCK_SH_SW(block_vec, block_r, block_l);
+        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
+        UNPCK_SH_SW(quant_m2, quant_m_r, quant_m_l);
+        mul_vec = block_l * qscale_l;
+        mul_vec *= quant_m_l;
+        block_l = mul_vec >> 4;
+        mul_vec = block_r * qscale_r;
+        mul_vec *= quant_m_r;
+        block_r = mul_vec >> 4;
+        mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
+        block_neg = - mul;
+        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
+                                   (v16u8) mask);
+        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org2,
+                                   (v16u8) zero_mask);
+        ST_SH(sum, block);
+
+        block += 8;
+        quant_matrix += 8;
+        sad = __msa_hadd_s_w(sum, sum);
+        sum_res += HADD_SW_S32(sad);
+        mask = __msa_clti_s_h(block_org3, 0); /* fourth row: same steps */
+        zero_mask = __msa_ceqi_h(block_org3, 0);
+        block_neg = - block_org3;
+        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org3, (v16u8) block_neg,
+                                         (v16u8) mask);
+        block_vec <<= 1;
+        block_vec += 1;
+        UNPCK_SH_SW(block_vec, block_r, block_l);
+        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
+        UNPCK_SH_SW(quant_m3, quant_m_r, quant_m_l);
+        mul_vec = block_l * qscale_l;
+        mul_vec *= quant_m_l;
+        block_l = mul_vec >> 4;
+        mul_vec = block_r * qscale_r;
+        mul_vec *= quant_m_r;
+        block_r = mul_vec >> 4;
+        mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
+        block_neg = - mul;
+        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
+                                   (v16u8) mask);
+        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org3,
+                                   (v16u8) zero_mask);
+        ST_SH(sum, block);
+
+        block += 8;
+        quant_matrix += 8;
+        sad = __msa_hadd_s_w(sum, sum);
+        sum_res += HADD_SW_S32(sad);
+    }
+
+    return sum_res; /* total of all 64 stored values, minus one */
+}
+
+void ff_dct_unquantize_h263_intra_msa(MpegEncContext *s,
+                                      int16_t *block, int32_t index,
+                                      int32_t qscale)
+{
+    /* Intra variant: block[0] is handled here and the shared helper starts
+     * at coefficient 1. Without s->h263_aic, block[0] is multiplied by
+     * s->y_dc_scale (index < 4) or s->c_dc_scale and qadd is
+     * (qscale - 1) | 1; with s->h263_aic, block[0] is kept and qadd is 0. */
+    const int32_t qmul = qscale << 1;
+    int32_t qadd = 0;
+    int32_t last = 63;    /* used as-is when s->ac_pred is set */
+
+    av_assert2(s->block_last_index[index] >= 0 || s->h263_aic);
+
+    if (!s->h263_aic) {
+        block[0] *= index < 4 ? s->y_dc_scale : s->c_dc_scale;
+        qadd = (qscale - 1) | 1;
+    }
+
+    if (!s->ac_pred)
+        last = s->inter_scantable.raster_end[s->block_last_index[index]];
+
+    h263_dct_unquantize_msa(block, qmul, qadd, last, 1);
+}
+
+void ff_dct_unquantize_h263_inter_msa(MpegEncContext *s,
+                                      int16_t *block, int32_t index,
+                                      int32_t qscale)
+{
+    /* Inter variant: the shared helper starts at coefficient 0 and runs up
+     * to the raster end of the block's last index, with qmul = qscale << 1
+     * and qadd = (qscale - 1) | 1. */
+    const int32_t qmul = qscale << 1;
+    const int32_t qadd = (qscale - 1) | 1;
+    int32_t last;
+
+    av_assert2(s->block_last_index[index] >= 0);
+    last = s->inter_scantable.raster_end[s->block_last_index[index]];
+
+    h263_dct_unquantize_msa(block, qmul, qadd, last, 0);
+}
+
+void ff_dct_unquantize_mpeg2_inter_msa(MpegEncContext *s,
+                                       int16_t *block, int32_t index,
+                                       int32_t qscale)
+{
+    /* The helper reads the matrix through a signed 16-bit pointer, so the
+     * conversion from s->inter_matrix is spelled out with a cast here. */
+    const int16_t *quant_matrix = (const int16_t *) s->inter_matrix;
+    int32_t sum;
+
+    sum = mpeg2_dct_unquantize_inter_msa(block, qscale, quant_matrix);
+    /* Toggle the LSB of the last coefficient when the returned sum is odd. */
+    block[63] ^= sum & 1;
+}
diff --git a/libavcodec/mips/mpegvideoencdsp_init_mips.c b/libavcodec/mips/mpegvideoencdsp_init_mips.c
new file mode 100644
index 0000000..9bfe94e
--- /dev/null
+++ b/libavcodec/mips/mpegvideoencdsp_init_mips.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/bit_depth_template.c"
+#include "h263dsp_mips.h"
+
+#if HAVE_MSA
+static av_cold void mpegvideoencdsp_init_msa(MpegvideoEncDSPContext *c,
+                                             AVCodecContext *avctx)
+{
+#if BIT_DEPTH == 8
+    c->pix_sum = ff_pix_sum_msa;   /* the only hook set here; avctx is unused */
+#endif
+}
+#endif  // #if HAVE_MSA
+
+av_cold void ff_mpegvideoencdsp_init_mips(MpegvideoEncDSPContext *c,
+                                          AVCodecContext *avctx)
+{   /* leaves c untouched unless HAVE_MSA is enabled */
+#if HAVE_MSA
+    mpegvideoencdsp_init_msa(c, avctx);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/mpegvideoencdsp_msa.c b/libavcodec/mips/mpegvideoencdsp_msa.c
new file mode 100644
index 0000000..46473da
--- /dev/null
+++ b/libavcodec/mips/mpegvideoencdsp_msa.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h263dsp_mips.h"
+#include "libavutil/mips/generic_macros_msa.h"
+
+static int32_t sum_u8src_16width_msa(uint8_t *src, int32_t stride)
+{
+    /* Loads sixteen vectors, `stride` bytes apart starting at src, and
+     * returns the sum of their HADD_UB4_UB + HADD_UH_U32 reductions. */
+    uint32_t total;
+    v16u8 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+
+    LD_UB8(src, stride, r0, r1, r2, r3, r4, r5, r6, r7);
+    src += 8 * stride;
+    LD_UB8(src, stride, r8, r9, r10, r11, r12, r13, r14, r15);
+
+    HADD_UB4_UB(r0, r1, r2, r3, r0, r1, r2, r3);
+    HADD_UB4_UB(r4, r5, r6, r7, r4, r5, r6, r7);
+    HADD_UB4_UB(r8, r9, r10, r11, r8, r9, r10, r11);
+    HADD_UB4_UB(r12, r13, r14, r15, r12, r13, r14, r15);
+
+    total  = HADD_UH_U32(r0);
+    total += HADD_UH_U32(r1);
+    total += HADD_UH_U32(r2);
+    total += HADD_UH_U32(r3);
+    total += HADD_UH_U32(r4);
+    total += HADD_UH_U32(r5);
+    total += HADD_UH_U32(r6);
+    total += HADD_UH_U32(r7);
+    total += HADD_UH_U32(r8);
+    total += HADD_UH_U32(r9);
+    total += HADD_UH_U32(r10);
+    total += HADD_UH_U32(r11);
+    total += HADD_UH_U32(r12);
+    total += HADD_UH_U32(r13);
+    total += HADD_UH_U32(r14);
+    total += HADD_UH_U32(r15);
+    return total;
+}
+
+int ff_pix_sum_msa(uint8_t *pix, int line_size)
+{
+    return sum_u8src_16width_msa(pix, line_size); /* plain forwarder */
+}
diff --git a/libavcodec/mips/pixblockdsp_init_mips.c b/libavcodec/mips/pixblockdsp_init_mips.c
new file mode 100644
index 0000000..1b3741e
--- /dev/null
+++ b/libavcodec/mips/pixblockdsp_init_mips.c
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *                    Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "pixblockdsp_mips.h"
+
+#if HAVE_MSA
+static av_cold void pixblockdsp_init_msa(PixblockDSPContext *c,
+                                         AVCodecContext *avctx,
+                                         unsigned high_bit_depth)
+{
+    /*
+     * Install the MSA pixel-block routines.
+     *
+     * diff_pixels is always replaced. get_pixels depends on
+     * avctx->bits_per_raw_sample: 9/10/12/14 select ff_get_pixels_16_msa;
+     * otherwise ff_get_pixels_8_msa is used for depths <= 8 or for
+     * non-video codecs. In any other case get_pixels is not modified.
+     * high_bit_depth is not consulted here.
+     */
+    const int depth = avctx->bits_per_raw_sample;
+
+    c->diff_pixels = ff_diff_pixels_msa;
+    if (depth == 9 || depth == 10 || depth == 12 || depth == 14)
+        c->get_pixels = ff_get_pixels_16_msa;
+    else if (depth <= 8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO)
+        c->get_pixels = ff_get_pixels_8_msa;
+}
+#endif  // #if HAVE_MSA
+
+#if HAVE_MMI
+static av_cold void pixblockdsp_init_mmi(PixblockDSPContext *c,
+        AVCodecContext *avctx, unsigned high_bit_depth)
+{
+    /* diff_pixels is always overridden; get_pixels is overridden unless
+     * high_bit_depth is set and the codec type is video. */
+    c->diff_pixels = ff_diff_pixels_mmi;
+    if (!(high_bit_depth && avctx->codec_type == AVMEDIA_TYPE_VIDEO))
+        c->get_pixels = ff_get_pixels_8_mmi;
+}
+#endif /* HAVE_MMI */
+
+void ff_pixblockdsp_init_mips(PixblockDSPContext *c, AVCodecContext *avctx,
+                              unsigned high_bit_depth)
+{
+#if HAVE_MSA
+    pixblockdsp_init_msa(c, avctx, high_bit_depth);
+#endif  // #if HAVE_MSA
+#if HAVE_MMI   /* applied after MSA, so MMI hooks win when both are enabled */
+    pixblockdsp_init_mmi(c, avctx, high_bit_depth);
+#endif /* HAVE_MMI */
+}
diff --git a/libavcodec/mips/pixblockdsp_mips.h b/libavcodec/mips/pixblockdsp_mips.h
new file mode 100644
index 0000000..7f8cc96
--- /dev/null
+++ b/libavcodec/mips/pixblockdsp_mips.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *                    Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_PIXBLOCKDSP_MIPS_H
+#define AVCODEC_MIPS_PIXBLOCKDSP_MIPS_H
+
+#include "../mpegvideo.h"
+/* MSA implementations, defined in pixblockdsp_msa.c. */
+void ff_diff_pixels_msa(int16_t *av_restrict block, const uint8_t *src1,
+                        const uint8_t *src2, int stride);
+void ff_get_pixels_16_msa(int16_t *av_restrict dest, const uint8_t *src,
+                          ptrdiff_t stride);
+void ff_get_pixels_8_msa(int16_t *av_restrict dest, const uint8_t *src,
+                         ptrdiff_t stride);
+/* MMI implementations, defined in pixblockdsp_mmi.c. */
+void ff_get_pixels_8_mmi(int16_t *av_restrict block, const uint8_t *pixels,
+        ptrdiff_t line_size);
+void ff_diff_pixels_mmi(int16_t *av_restrict block, const uint8_t *src1,
+        const uint8_t *src2, int stride);
+
+#endif  // #ifndef AVCODEC_MIPS_PIXBLOCKDSP_MIPS_H
diff --git a/libavcodec/mips/pixblockdsp_mmi.c b/libavcodec/mips/pixblockdsp_mmi.c
new file mode 100644
index 0000000..3ff84c0
--- /dev/null
+++ b/libavcodec/mips/pixblockdsp_mmi.c
@@ -0,0 +1,98 @@
+/*
+ * Loongson SIMD optimized pixblockdsp
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "pixblockdsp_mips.h"
+#include "libavutil/mips/asmdefs.h"
+
+void ff_get_pixels_8_mmi(int16_t *av_restrict block, const uint8_t *pixels,
+        ptrdiff_t line_size)
+{
+    double ftmp[6];     /* FP scratch; ftmp[3] and ftmp[4] are bound but unused */
+    mips_reg tmp[2];    /* tmp[0]: byte offset into block, tmp[1]: row counter */
+    /* 8 rows: read 8 bytes at pixels, interleave with zero, store 16 bytes. */
+    __asm__ volatile (
+        "li         %[tmp1],    0x08                                    \n\t"
+        "move       %[tmp0],    $0                                      \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "1:                                                             \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[pixels])                         \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[pixels])                         \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]                \n\t"
+        "gssdxc1    %[ftmp2],   0x00(%[block],  %[tmp0])                \n\t"
+        "gssdxc1    %[ftmp5],   0x08(%[block],  %[tmp0])                \n\t"
+        PTR_ADDI   "%[tmp1],    %[tmp1],       -0x01                    \n\t"
+        PTR_ADDIU  "%[tmp0],    %[tmp0],        0x10                    \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
+        "bnez       %[tmp1],    1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          [pixels]"+&r"(pixels)
+        : [block]"r"((mips_reg)block),      [line_size]"r"((mips_reg)line_size)
+        : "memory"
+    );
+}
+
+void ff_diff_pixels_mmi(int16_t *av_restrict block, const uint8_t *src1,
+        const uint8_t *src2, int stride)
+{
+    double ftmp[5];     /* ftmp[4] is zeroed once and used by every unpack */
+    mips_reg tmp[1];    /* tmp[0]: row counter, starts at 8 */
+    /* Per row: 8 bytes of src1 minus 8 bytes of src2, stored as 16-bit values. */
+    __asm__ volatile (
+        "li         %[tmp0],    0x08                                    \n\t"
+        "xor        %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
+        "1:                                                             \n\t"
+        "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
+        "or         %[ftmp1],   %[ftmp0],       %[ftmp0]                \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
+        "or         %[ftmp3],   %[ftmp2],       %[ftmp2]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpckhbh  %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "punpckhbh  %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "gssdlc1    %[ftmp0],   0x07(%[block])                          \n\t"
+        "gssdrc1    %[ftmp0],   0x00(%[block])                          \n\t"
+        "gssdlc1    %[ftmp1],   0x0f(%[block])                          \n\t"
+        "gssdrc1    %[ftmp1],   0x08(%[block])                          \n\t"
+        PTR_ADDI   "%[tmp0],    %[tmp0], -0x01                          \n\t"
+        PTR_ADDIU  "%[block],   %[block], 0x10                          \n\t"
+        PTR_ADDU   "%[src1],    %[src1],        %[stride]               \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[stride]               \n\t"
+        "bgtz       %[tmp0],    1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),
+          [tmp0]"=&r"(tmp[0]),
+          [block]"+&r"(block),              [src1]"+&r"(src1),
+          [src2]"+&r"(src2)
+        : [stride]"r"((mips_reg)stride)
+        : "memory"
+    );
+}
diff --git a/libavcodec/mips/pixblockdsp_msa.c b/libavcodec/mips/pixblockdsp_msa.c
new file mode 100644
index 0000000..966e11a
--- /dev/null
+++ b/libavcodec/mips/pixblockdsp_msa.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "pixblockdsp_mips.h"
+
+static void diff_pixels_msa(int16_t *block, const uint8_t *src1,
+                            const uint8_t *src2, int32_t stride)
+{
+    /* pN / qN: vector N loaded from src1 / src2. dN: their ILVR_B4_SH
+     * interleave, then the HSUB_UB4_SH result stored to block (stride 8). */
+    v16u8 p0, p1, p2, p3, p4, p5, p6, p7;
+    v16u8 q0, q1, q2, q3, q4, q5, q6, q7;
+    v8i16 d0, d1, d2, d3, d4, d5, d6, d7;
+
+    LD_UB8(src1, stride, p0, p1, p2, p3, p4, p5, p6, p7);
+    LD_UB8(src2, stride, q0, q1, q2, q3, q4, q5, q6, q7);
+    ILVR_B4_SH(p0, q0, p1, q1, p2, q2, p3, q3, d0, d1, d2, d3);
+    ILVR_B4_SH(p4, q4, p5, q5, p6, q6, p7, q7, d4, d5, d6, d7);
+    HSUB_UB4_SH(d0, d1, d2, d3, d0, d1, d2, d3);
+    HSUB_UB4_SH(d4, d5, d6, d7, d4, d5, d6, d7);
+    ST_SH8(d0, d1, d2, d3, d4, d5, d6, d7, block, 8);
+}
+
+static void copy_8bit_to_16bit_width8_msa(const uint8_t *src, int32_t src_stride,
+                                          int16_t *dst, int32_t dst_stride,
+                                          int32_t height)
+{
+    /* Processes height >> 2 groups of four source rows. Each row is
+     * interleaved with a zero vector (ILVR_B4_UB) and stored as one
+     * vector; output rows are dst_stride int16_t elements apart. */
+    const int32_t out_step = dst_stride * 2;    /* row pitch in bytes */
+    uint8_t *out = (uint8_t *) dst;
+    int32_t groups = height >> 2;
+    v16u8 row0, row1, row2, row3;
+    v16i8 zero = { 0 };
+
+    while (groups--) {
+        LD_UB4(src, src_stride, row0, row1, row2, row3);
+        src += 4 * src_stride;
+        ILVR_B4_UB(zero, row0, zero, row1, zero, row2, zero, row3,
+                   row0, row1, row2, row3);
+        ST_UB4(row0, row1, row2, row3, out, out_step);
+        out += 4 * out_step;
+    }
+}
+
+static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  int32_t height, int32_t width)
+{
+    /* Walks width >> 4 strips, each 16 bytes wide. Inside a strip,
+     * height >> 3 batches of eight rows are loaded from src and stored
+     * unchanged to dst. */
+    int32_t strips, batches;
+    const uint8_t *in;
+    uint8_t *out;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+
+    for (strips = width >> 4; strips--; src += 16, dst += 16) {
+        in      = src;
+        out     = dst;
+        batches = height >> 3;
+
+        while (batches--) {
+            LD_UB8(in, src_stride,
+                   row0, row1, row2, row3, row4, row5, row6, row7);
+            ST_UB8(row0, row1, row2, row3, row4, row5, row6, row7,
+                   out, dst_stride);
+            in  += 8 * src_stride;
+            out += 8 * dst_stride;
+        }
+    }
+}
+
+static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    /* Copies 16-byte rows: 8 + 4 rows per pass when height % 12 == 0, else
+     * via copy_16multx8mult_msa() when height % 8 == 0, else 4 rows per
+     * pass when height % 4 == 0. Any other height copies nothing. */
+    int32_t batches;
+    v16u8 r0, r1, r2, r3, r4, r5, r6, r7;
+
+    if (height % 12 == 0) {
+        for (batches = height / 12; batches--;) {
+            LD_UB8(src, src_stride, r0, r1, r2, r3, r4, r5, r6, r7);
+            ST_UB8(r0, r1, r2, r3, r4, r5, r6, r7, dst, dst_stride);
+            src += 8 * src_stride;
+            dst += 8 * dst_stride;
+
+            LD_UB4(src, src_stride, r0, r1, r2, r3);
+            ST_UB4(r0, r1, r2, r3, dst, dst_stride);
+            src += 4 * src_stride;
+            dst += 4 * dst_stride;
+        }
+    } else if (height % 8 == 0) {
+        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+    } else if (height % 4 == 0) {
+        for (batches = height >> 2; batches--;) {
+            LD_UB4(src, src_stride, r0, r1, r2, r3);
+            ST_UB4(r0, r1, r2, r3, dst, dst_stride);
+            src += 4 * src_stride;
+            dst += 4 * dst_stride;
+        }
+    }
+}
+
+void ff_get_pixels_16_msa(int16_t *av_restrict dest, const uint8_t *src,
+                          ptrdiff_t stride)
+{   /* copies 8 rows of 16 bytes; rows land 16 bytes apart in dest */
+    copy_width16_msa(src, stride, (uint8_t *) dest, 16, 8);
+}
+
+void ff_get_pixels_8_msa(int16_t *av_restrict dest, const uint8_t *src,
+                         ptrdiff_t stride)
+{   /* 8 source rows; dest rows are 8 int16_t elements apart */
+    copy_8bit_to_16bit_width8_msa(src, stride, dest, 8, 8);
+}
+
+void ff_diff_pixels_msa(int16_t *av_restrict block, const uint8_t *src1,
+                        const uint8_t *src2, int stride)
+{   /* public entry point; forwards its arguments to the static helper */
+    diff_pixels_msa(block, src1, src2, stride);
+}
diff --git a/libavcodec/mips/qpeldsp_init_mips.c b/libavcodec/mips/qpeldsp_init_mips.c
new file mode 100644
index 0000000..140e8f8
--- /dev/null
+++ b/libavcodec/mips/qpeldsp_init_mips.c
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "qpeldsp_mips.h"
+
+#if HAVE_MSA
+static av_cold void qpeldsp_init_msa(QpelDSPContext *c)
+{   /* Row 0: 16-wide/16x16 routines; row 1: 8-wide/8x8; slots 0-15 each. */
+    c->put_qpel_pixels_tab[0][0] = ff_copy_16x16_msa;
+    c->put_qpel_pixels_tab[0][1] = ff_horiz_mc_qpel_aver_src0_16width_msa;
+    c->put_qpel_pixels_tab[0][2] = ff_horiz_mc_qpel_16width_msa;
+    c->put_qpel_pixels_tab[0][3] = ff_horiz_mc_qpel_aver_src1_16width_msa;
+    c->put_qpel_pixels_tab[0][4] = ff_vert_mc_qpel_aver_src0_16x16_msa;
+    c->put_qpel_pixels_tab[0][5] = ff_hv_mc_qpel_aver_hv_src00_16x16_msa;
+    c->put_qpel_pixels_tab[0][6] = ff_hv_mc_qpel_aver_v_src0_16x16_msa;
+    c->put_qpel_pixels_tab[0][7] = ff_hv_mc_qpel_aver_hv_src10_16x16_msa;
+    c->put_qpel_pixels_tab[0][8] = ff_vert_mc_qpel_16x16_msa;
+    c->put_qpel_pixels_tab[0][9] = ff_hv_mc_qpel_aver_h_src0_16x16_msa;
+    c->put_qpel_pixels_tab[0][10] = ff_hv_mc_qpel_16x16_msa;
+    c->put_qpel_pixels_tab[0][11] = ff_hv_mc_qpel_aver_h_src1_16x16_msa;
+    c->put_qpel_pixels_tab[0][12] = ff_vert_mc_qpel_aver_src1_16x16_msa;
+    c->put_qpel_pixels_tab[0][13] = ff_hv_mc_qpel_aver_hv_src01_16x16_msa;
+    c->put_qpel_pixels_tab[0][14] = ff_hv_mc_qpel_aver_v_src1_16x16_msa;
+    c->put_qpel_pixels_tab[0][15] = ff_hv_mc_qpel_aver_hv_src11_16x16_msa;
+    /* Row 1 (8-wide/8x8); slot index presumably x + 4*y -- confirm. */
+    c->put_qpel_pixels_tab[1][0] = ff_copy_8x8_msa;
+    c->put_qpel_pixels_tab[1][1] = ff_horiz_mc_qpel_aver_src0_8width_msa;
+    c->put_qpel_pixels_tab[1][2] = ff_horiz_mc_qpel_8width_msa;
+    c->put_qpel_pixels_tab[1][3] = ff_horiz_mc_qpel_aver_src1_8width_msa;
+    c->put_qpel_pixels_tab[1][4] = ff_vert_mc_qpel_aver_src0_8x8_msa;
+    c->put_qpel_pixels_tab[1][5] = ff_hv_mc_qpel_aver_hv_src00_8x8_msa;
+    c->put_qpel_pixels_tab[1][6] = ff_hv_mc_qpel_aver_v_src0_8x8_msa;
+    c->put_qpel_pixels_tab[1][7] = ff_hv_mc_qpel_aver_hv_src10_8x8_msa;
+    c->put_qpel_pixels_tab[1][8] = ff_vert_mc_qpel_8x8_msa;
+    c->put_qpel_pixels_tab[1][9] = ff_hv_mc_qpel_aver_h_src0_8x8_msa;
+    c->put_qpel_pixels_tab[1][10] = ff_hv_mc_qpel_8x8_msa;
+    c->put_qpel_pixels_tab[1][11] = ff_hv_mc_qpel_aver_h_src1_8x8_msa;
+    c->put_qpel_pixels_tab[1][12] = ff_vert_mc_qpel_aver_src1_8x8_msa;
+    c->put_qpel_pixels_tab[1][13] = ff_hv_mc_qpel_aver_hv_src01_8x8_msa;
+    c->put_qpel_pixels_tab[1][14] = ff_hv_mc_qpel_aver_v_src1_8x8_msa;
+    c->put_qpel_pixels_tab[1][15] = ff_hv_mc_qpel_aver_hv_src11_8x8_msa;
+    /* put_no_rnd table, row 0; slot 0 reuses the plain ff_copy_16x16_msa. */
+    c->put_no_rnd_qpel_pixels_tab[0][0] = ff_copy_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][1] =
+        ff_horiz_mc_qpel_no_rnd_aver_src0_16width_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][2] = ff_horiz_mc_qpel_no_rnd_16width_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][3] =
+        ff_horiz_mc_qpel_no_rnd_aver_src1_16width_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][4] =
+        ff_vert_mc_qpel_no_rnd_aver_src0_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][5] =
+        ff_hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][6] =
+        ff_hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][7] =
+        ff_hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][8] = ff_vert_mc_qpel_no_rnd_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][9] =
+        ff_hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][10] = ff_hv_mc_qpel_no_rnd_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][11] =
+        ff_hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][12] =
+        ff_vert_mc_qpel_no_rnd_aver_src1_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][13] =
+        ff_hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][14] =
+        ff_hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa;
+    c->put_no_rnd_qpel_pixels_tab[0][15] =
+        ff_hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa;
+    /* put_no_rnd table, row 1; slot 0 again reuses ff_copy_8x8_msa. */
+    c->put_no_rnd_qpel_pixels_tab[1][0] = ff_copy_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][1] =
+        ff_horiz_mc_qpel_no_rnd_aver_src0_8width_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][2] = ff_horiz_mc_qpel_no_rnd_8width_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][3] =
+        ff_horiz_mc_qpel_no_rnd_aver_src1_8width_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][4] =
+        ff_vert_mc_qpel_no_rnd_aver_src0_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][5] =
+        ff_hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][6] =
+        ff_hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][7] =
+        ff_hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][8] = ff_vert_mc_qpel_no_rnd_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][9] =
+        ff_hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][10] = ff_hv_mc_qpel_no_rnd_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][11] =
+        ff_hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][12] =
+        ff_vert_mc_qpel_no_rnd_aver_src1_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][13] =
+        ff_hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][14] =
+        ff_hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa;
+    c->put_no_rnd_qpel_pixels_tab[1][15] =
+        ff_hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa;
+    /* avg table, row 0; slot 0 is ff_avg_width16_msa, not a copy. */
+    c->avg_qpel_pixels_tab[0][0] = ff_avg_width16_msa;
+    c->avg_qpel_pixels_tab[0][1] =
+        ff_horiz_mc_qpel_avg_dst_aver_src0_16width_msa;
+    c->avg_qpel_pixels_tab[0][2] = ff_horiz_mc_qpel_avg_dst_16width_msa;
+    c->avg_qpel_pixels_tab[0][3] =
+        ff_horiz_mc_qpel_avg_dst_aver_src1_16width_msa;
+    c->avg_qpel_pixels_tab[0][4] = ff_vert_mc_qpel_avg_dst_aver_src0_16x16_msa;
+    c->avg_qpel_pixels_tab[0][5] =
+        ff_hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa;
+    c->avg_qpel_pixels_tab[0][6] = ff_hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa;
+    c->avg_qpel_pixels_tab[0][7] =
+        ff_hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa;
+    c->avg_qpel_pixels_tab[0][8] = ff_vert_mc_qpel_avg_dst_16x16_msa;
+    c->avg_qpel_pixels_tab[0][9] = ff_hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa;
+    c->avg_qpel_pixels_tab[0][10] = ff_hv_mc_qpel_avg_dst_16x16_msa;
+    c->avg_qpel_pixels_tab[0][11] = ff_hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa;
+    c->avg_qpel_pixels_tab[0][12] = ff_vert_mc_qpel_avg_dst_aver_src1_16x16_msa;
+    c->avg_qpel_pixels_tab[0][13] =
+        ff_hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa;
+    c->avg_qpel_pixels_tab[0][14] = ff_hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa;
+    c->avg_qpel_pixels_tab[0][15] =
+        ff_hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa;
+    /* avg table, row 1; slot 0 is ff_avg_width8_msa. */
+    c->avg_qpel_pixels_tab[1][0] = ff_avg_width8_msa;
+    c->avg_qpel_pixels_tab[1][1] =
+        ff_horiz_mc_qpel_avg_dst_aver_src0_8width_msa;
+    c->avg_qpel_pixels_tab[1][2] = ff_horiz_mc_qpel_avg_dst_8width_msa;
+    c->avg_qpel_pixels_tab[1][3] =
+        ff_horiz_mc_qpel_avg_dst_aver_src1_8width_msa;
+    c->avg_qpel_pixels_tab[1][4] = ff_vert_mc_qpel_avg_dst_aver_src0_8x8_msa;
+    c->avg_qpel_pixels_tab[1][5] = ff_hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa;
+    c->avg_qpel_pixels_tab[1][6] = ff_hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa;
+    c->avg_qpel_pixels_tab[1][7] = ff_hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa;
+    c->avg_qpel_pixels_tab[1][8] = ff_vert_mc_qpel_avg_dst_8x8_msa;
+    c->avg_qpel_pixels_tab[1][9] = ff_hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa;
+    c->avg_qpel_pixels_tab[1][10] = ff_hv_mc_qpel_avg_dst_8x8_msa;
+    c->avg_qpel_pixels_tab[1][11] = ff_hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa;
+    c->avg_qpel_pixels_tab[1][12] = ff_vert_mc_qpel_avg_dst_aver_src1_8x8_msa;
+    c->avg_qpel_pixels_tab[1][13] = ff_hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa;
+    c->avg_qpel_pixels_tab[1][14] = ff_hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa;
+    c->avg_qpel_pixels_tab[1][15] = ff_hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa;
+}
+#endif  // #if HAVE_MSA
+
+void ff_qpeldsp_init_mips(QpelDSPContext *c)
+{   /* Without HAVE_MSA this body is empty and c is left untouched. */
+#if HAVE_MSA
+    qpeldsp_init_msa(c);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/qpeldsp_mips.h b/libavcodec/mips/qpeldsp_mips.h
new file mode 100644
index 0000000..704d221
--- /dev/null
+++ b/libavcodec/mips/qpeldsp_mips.h
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_QPELDSP_MIPS_H
+#define AVCODEC_MIPS_QPELDSP_MIPS_H
+
+#include "../mpegvideo.h"
+/* Copy/avg entry points; the init file installs them in slot 0 of each row. */
+void ff_copy_8x8_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_copy_16x16_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_width8_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_width16_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+/* horiz_mc_qpel family: plain, no_rnd and avg_dst; 8- and 16-wide. */
+void ff_horiz_mc_qpel_aver_src0_8width_msa(uint8_t *dst, const uint8_t *src,
+                                           ptrdiff_t stride);
+void ff_horiz_mc_qpel_aver_src0_16width_msa(uint8_t *dst, const uint8_t *src,
+                                            ptrdiff_t stride);
+void ff_horiz_mc_qpel_8width_msa(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_horiz_mc_qpel_16width_msa(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride);
+void ff_horiz_mc_qpel_aver_src1_8width_msa(uint8_t *dst, const uint8_t *src,
+                                           ptrdiff_t stride);
+void ff_horiz_mc_qpel_aver_src1_16width_msa(uint8_t *dst, const uint8_t *src,
+                                            ptrdiff_t stride);
+void ff_horiz_mc_qpel_no_rnd_aver_src0_8width_msa(uint8_t *dst,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_horiz_mc_qpel_no_rnd_aver_src0_16width_msa(uint8_t *dst,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride);
+void ff_horiz_mc_qpel_no_rnd_8width_msa(uint8_t *dst, const uint8_t *src,
+                                        ptrdiff_t stride);
+void ff_horiz_mc_qpel_no_rnd_16width_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_horiz_mc_qpel_no_rnd_aver_src1_8width_msa(uint8_t *dst,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_horiz_mc_qpel_no_rnd_aver_src1_16width_msa(uint8_t *dst,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride);
+void ff_horiz_mc_qpel_avg_dst_aver_src0_8width_msa(uint8_t *dst,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride);
+void ff_horiz_mc_qpel_avg_dst_aver_src0_16width_msa(uint8_t *dst,
+                                                    const uint8_t *src,
+                                                    ptrdiff_t stride);
+void ff_horiz_mc_qpel_avg_dst_8width_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_horiz_mc_qpel_avg_dst_16width_msa(uint8_t *dst, const uint8_t *src,
+                                          ptrdiff_t stride);
+void ff_horiz_mc_qpel_avg_dst_aver_src1_8width_msa(uint8_t *dst,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride);
+void ff_horiz_mc_qpel_avg_dst_aver_src1_16width_msa(uint8_t *dst,
+                                                    const uint8_t *src,
+                                                    ptrdiff_t stride);
+/* vert_mc_qpel family: plain, no_rnd and avg_dst; 8x8 and 16x16. */
+void ff_vert_mc_qpel_aver_src0_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride);
+void ff_vert_mc_qpel_aver_src0_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_vert_mc_qpel_8x8_msa(uint8_t *dst, const uint8_t *src,
+                             ptrdiff_t stride);
+void ff_vert_mc_qpel_16x16_msa(uint8_t *dst, const uint8_t *src,
+                               ptrdiff_t stride);
+void ff_vert_mc_qpel_aver_src1_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride);
+void ff_vert_mc_qpel_aver_src1_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_vert_mc_qpel_no_rnd_aver_src0_8x8_msa(uint8_t *dst,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride);
+void ff_vert_mc_qpel_no_rnd_aver_src0_16x16_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+void ff_vert_mc_qpel_no_rnd_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                    ptrdiff_t stride);
+void ff_vert_mc_qpel_no_rnd_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                      ptrdiff_t stride);
+void ff_vert_mc_qpel_no_rnd_aver_src1_8x8_msa(uint8_t *dst,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride);
+void ff_vert_mc_qpel_no_rnd_aver_src1_16x16_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+void ff_vert_mc_qpel_avg_dst_aver_src0_8x8_msa(uint8_t *dst,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride);
+void ff_vert_mc_qpel_avg_dst_aver_src0_16x16_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+void ff_vert_mc_qpel_avg_dst_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                     ptrdiff_t stride);
+void ff_vert_mc_qpel_avg_dst_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride);
+void ff_vert_mc_qpel_avg_dst_aver_src1_8x8_msa(uint8_t *dst,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride);
+void ff_vert_mc_qpel_avg_dst_aver_src1_16x16_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+/* hv_mc_qpel family (presumably horizontal + vertical); 8x8 and 16x16. */
+void ff_hv_mc_qpel_aver_hv_src00_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                           ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_hv_src00_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_v_src0_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_v_src0_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_hv_src10_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                           ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_hv_src10_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_h_src0_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_h_src0_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride);
+void ff_hv_mc_qpel_16x16_msa(uint8_t *dst, const uint8_t *src,
+                             ptrdiff_t stride);
+void ff_hv_mc_qpel_8x8_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_h_src1_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_h_src1_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_hv_src01_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                           ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_hv_src01_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_v_src1_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_v_src1_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_hv_src11_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                           ptrdiff_t stride);
+void ff_hv_mc_qpel_aver_hv_src11_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(uint8_t *dst,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(uint8_t *dst,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(uint8_t *dst,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(uint8_t *dst,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                     ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                   ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(uint8_t *dst,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(uint8_t *dst,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(uint8_t *dst,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(uint8_t *dst,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride);
+void ff_hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(uint8_t *dst,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(uint8_t *dst,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(uint8_t *dst,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(uint8_t *dst,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(uint8_t *dst,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_16x16_msa(uint8_t *dst, const uint8_t *src,
+                                    ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_8x8_msa(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(uint8_t *dst,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(uint8_t *dst,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(uint8_t *dst,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(uint8_t *dst,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride);
+void ff_hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(uint8_t *dst,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride);
+
+#endif  // #ifndef AVCODEC_MIPS_QPELDSP_MIPS_H
diff --git a/libavcodec/mips/qpeldsp_msa.c b/libavcodec/mips/qpeldsp_msa.c
new file mode 100644
index 0000000..4710b3f
--- /dev/null
+++ b/libavcodec/mips/qpeldsp_msa.c
@@ -0,0 +1,6518 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "qpeldsp_mips.h"
+
+#define APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, coef0, coef1, coef2)  \
+( { /* Statement expression: evaluates to 16 filtered bytes (out). */   \
+    v16u8 out, tmp0, tmp1;                                              \
+    v16u8 data0, data1, data2, data3, data4, data5;                     \
+    v8i16 res_r, res_l;                                                 \
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                               \
+    v8u16 sum0_l, sum1_l, sum2_l, sum3_l;                               \
+    /* sum0 gathers the coef0/coef2 taps, sum3 the unit/coef1 taps. */  \
+    VSHF_B2_UB(inp0, inp0, inp1, inp1, mask, mask, tmp0, tmp1);         \
+    ILVRL_B2_UH(inp1, inp0, sum0_r, sum0_l);                            \
+    data0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 15);       \
+    data3 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 1);        \
+    HADD_UB2_UH(sum0_r, sum0_l, sum0_r, sum0_l);                        \
+    ILVRL_B2_UH(data3, data0, sum1_r, sum1_l);                          \
+    data1 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 14);       \
+    data4 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 2);        \
+    sum0_r *= (v8u16) (coef0);                                          \
+    sum0_l *= (v8u16) (coef0);                                          \
+    ILVRL_B2_UH(data4, data1, sum2_r, sum2_l);                          \
+    data2 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 13);       \
+    data5 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 3);        \
+    DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l);         \
+    ILVRL_B2_UH(data5, data2, sum3_r, sum3_l);                          \
+    HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l);                        \
+    DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l);         \
+    res_r = (v8i16) (sum0_r - sum3_r);                                  \
+    res_l = (v8i16) (sum0_l - sum3_l);                                  \
+    SRARI_H2_SH(res_r, res_l, 5);                                       \
+    CLIP_SH2_0_255(res_r, res_l);                                       \
+    out = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r);          \
+    /* out = clip_0_255(round_shift5(sum0 - sum3)), packed to bytes */  \
+    out;                                                                \
+} )
+
+#define APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,                       \
+                                      mask0, mask1, mask2, mask3,       \
+                                      coef0, coef1, coef2)              \
+( { /* Two 8-byte rows at once: inp0 -> low half, inp1 -> high half. */ \
+    v16u8 out;                                                          \
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                               \
+    v8u16 sum4_r, sum5_r, sum6_r, sum7_r;                               \
+    v8i16 res0_r, res1_r;                                               \
+    /* sum0/sum4: coef0,coef2 taps; sum3/sum7: unit,coef1 taps. */      \
+    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask0, mask0, sum0_r, sum4_r);   \
+    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask3, mask3, sum3_r, sum7_r);   \
+    HADD_UB2_UH(sum3_r, sum7_r, sum3_r, sum7_r);                        \
+    DOTP_UB2_UH(sum0_r, sum4_r, coef0, coef0, sum0_r, sum4_r);          \
+    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask2, mask2, sum2_r, sum6_r);   \
+    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask1, mask1, sum1_r, sum5_r);   \
+    DPADD_UB2_UH(sum2_r, sum6_r, coef2, coef2, sum0_r, sum4_r);         \
+    DPADD_UB2_UH(sum1_r, sum5_r, coef1, coef1, sum3_r, sum7_r);         \
+    res0_r = (v8i16) (sum0_r - sum3_r);                                 \
+    res1_r = (v8i16) (sum4_r - sum7_r);                                 \
+    SRARI_H2_SH(res0_r, res1_r, 5);                                     \
+    CLIP_SH2_0_255(res0_r, res1_r);                                     \
+    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);        \
+    /* Each half: clip_0_255(round_shift5(difference)), as bytes. */    \
+    out;                                                                \
+} )
+
+#define APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,                        \
+                                           mask0, mask1, mask2, mask3,  \
+                                           coef0, coef1, coef2)         \
+( { /* One 8-byte row; the result is duplicated in both halves. */      \
+    v16u8 out;                                                          \
+    v8i16 res0_r;                                                       \
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                               \
+    /* sum0 gathers the coef0/coef2 taps, sum3 the unit/coef1 taps. */  \
+    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask0, mask3, sum0_r, sum3_r);   \
+    sum3_r = __msa_hadd_u_h((v16u8) sum3_r, (v16u8) sum3_r);            \
+    sum0_r = __msa_dotp_u_h((v16u8) sum0_r, (v16u8) coef0);             \
+    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask2, mask1, sum2_r, sum1_r);   \
+    DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r);         \
+    res0_r = (v8i16) (sum0_r - sum3_r);                                 \
+    res0_r = __msa_srari_h(res0_r, 5);                                  \
+    res0_r = CLIP_SH_0_255(res0_r);                                     \
+    out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r);        \
+    /* out = clip_0_255(round_shift5(sum0 - sum3)), packed to bytes */  \
+    out;                                                                \
+} )
+
+#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,   \
+                                                    mask2, mask3, coef0,  \
+                                                    coef1, coef2)         \
+( { /* One 8-byte row; the result is duplicated in both halves. */        \
+    v16u8 out;                                                            \
+    v8i16 res0_r;                                                         \
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                                 \
+    /* sum0 gathers the coef0/coef2 taps, sum3 the unit/coef1 taps. */    \
+    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask0, mask3, sum0_r, sum3_r);     \
+    sum3_r = __msa_hadd_u_h((v16u8) sum3_r, (v16u8) sum3_r);              \
+    sum0_r = __msa_dotp_u_h((v16u8) sum0_r, (v16u8) coef0);               \
+    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask2, mask1, sum2_r, sum1_r);     \
+    DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r);           \
+    res0_r = (v8i16) (sum0_r - sum3_r);                                   \
+    res0_r += 15;                                                         \
+    res0_r >>= 5;                                                         \
+    res0_r = CLIP_SH_0_255(res0_r);                                       \
+    out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r);          \
+    /* out = clip_0_255((sum0 - sum3 + 15) >> 5), packed to bytes */      \
+    out;                                                                  \
+} )
+
+#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,              \
+                                         coef0, coef1, coef2)           \
+( { /* Statement expression: evaluates to 16 filtered bytes (out). */   \
+    v16u8 out, tmp0, tmp1;                                              \
+    v16u8 data0, data1, data2, data3, data4, data5;                     \
+    v8i16 res_r, res_l;                                                 \
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                               \
+    v8u16 sum0_l, sum1_l, sum2_l, sum3_l;                               \
+    /* sum0 gathers the coef0/coef2 taps, sum3 the unit/coef1 taps. */  \
+    VSHF_B2_UB(inp0, inp0, inp1, inp1, mask, mask, tmp0, tmp1);         \
+    ILVRL_B2_UH(inp1, inp0, sum0_r, sum0_l);                            \
+    data0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 15);       \
+    data3 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 1);        \
+    HADD_UB2_UH(sum0_r, sum0_l, sum0_r, sum0_l);                        \
+    ILVRL_B2_UH(data3, data0, sum1_r, sum1_l);                          \
+    data1 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 14);       \
+    data4 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 2);        \
+    sum0_r *= (v8u16) (coef0);                                          \
+    sum0_l *= (v8u16) (coef0);                                          \
+    ILVRL_B2_UH(data4, data1, sum2_r, sum2_l);                          \
+    data2 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 13);       \
+    data5 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 3);        \
+    DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l);         \
+    ILVRL_B2_UH(data5, data2, sum3_r, sum3_l);                          \
+    HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l);                        \
+    DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l);         \
+    res_r = (v8i16) (sum0_r - sum3_r);                                  \
+    res_l = (v8i16) (sum0_l - sum3_l);                                  \
+    res_r += 15;                                                        \
+    res_l += 15;                                                        \
+    res_r >>= 5;                                                        \
+    res_l >>= 5;                                                        \
+    CLIP_SH2_0_255(res_r, res_l);                                       \
+    out = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r);          \
+    /* out = clip_0_255((sum0 - sum3 + 15) >> 5), packed to bytes */    \
+    out;                                                                \
+} )
+
+#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1,                  \
+                                               mask0, mask1, mask2, mask3,  \
+                                               coef0, coef1, coef2)         \
+( { /* Two 8-byte rows at once: inp0 -> low half, inp1 -> high half. */     \
+    v16u8 out;                                                              \
+    v8i16 res0_r, res1_r;                                                   \
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                                   \
+    v8u16 sum4_r, sum5_r, sum6_r, sum7_r;                                   \
+    /* sum0/sum4: coef0,coef2 taps; sum3/sum7: unit,coef1 taps. */          \
+    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask0, mask0, sum0_r, sum4_r);       \
+    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask3, mask3, sum3_r, sum7_r);       \
+    HADD_UB2_UH(sum3_r, sum7_r, sum3_r, sum7_r);                            \
+    DOTP_UB2_UH(sum0_r, sum4_r, coef0, coef0, sum0_r, sum4_r);              \
+    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask2, mask2, sum2_r, sum6_r);       \
+    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask1, mask1, sum1_r, sum5_r);       \
+    DPADD_UB2_UH(sum2_r, sum6_r, coef2, coef2, sum0_r, sum4_r);             \
+    DPADD_UB2_UH(sum1_r, sum5_r, coef1, coef1, sum3_r, sum7_r);             \
+    res0_r = (v8i16) (sum0_r - sum3_r);                                     \
+    res1_r = (v8i16) (sum4_r - sum7_r);                                     \
+    res0_r += 15;                                                           \
+    res1_r += 15;                                                           \
+    res0_r >>= 5;                                                           \
+    res1_r >>= 5;                                                           \
+    CLIP_SH2_0_255(res0_r, res1_r);                                         \
+    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);            \
+    /* Each half: clip_0_255((difference + 15) >> 5), as bytes. */          \
+    out;                                                                    \
+} )
+
+#define APPLY_VERT_QPEL_FILTER(inp0, inp1, inp2, inp3,                  \
+                               inp4, inp5, inp6, inp7,                  \
+                               coef0, coef1, coef2)                     \
+( { /* Vertical filter over 16 byte lanes of vectors inp0..inp7. */     \
+    v16u8 res;                                                          \
+    v8i16 res_r, res_l;                                                 \
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                               \
+    v8u16 sum0_l, sum1_l, sum2_l, sum3_l;                               \
+    /* sum0: coef0*(0,4) + coef2*(2,6); sum3: (3,7) + coef1*(1,5). */   \
+    ILVRL_B2_UH(inp4, inp0, sum0_r, sum0_l);                            \
+    ILVRL_B2_UH(inp7, inp3, sum3_r, sum3_l);                            \
+    DOTP_UB2_UH(sum0_r, sum0_l, coef0, coef0, sum0_r, sum0_l);          \
+    HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l);                        \
+    ILVRL_B2_UH(inp6, inp2, sum2_r, sum2_l);                            \
+    ILVRL_B2_UH(inp5, inp1, sum1_r, sum1_l);                            \
+    DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l);         \
+    DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l);         \
+    res_r = (v8i16) (sum0_r - sum3_r);                                  \
+    res_l = (v8i16) (sum0_l - sum3_l);                                  \
+    SRARI_H2_SH(res_r, res_l, 5);                                       \
+    CLIP_SH2_0_255(res_r, res_l);                                       \
+    res = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r);          \
+    /* res = clip_0_255(round_shift5(sum0 - sum3)), packed to bytes */  \
+    res;                                                                \
+} )
+
+#define APPLY_VERT_QPEL_FILTER_8BYTE(inp00, inp01, inp02, inp03,        \
+                                     inp04, inp05, inp06, inp07,        \
+                                     inp10, inp11, inp12, inp13,        \
+                                     inp14, inp15, inp16, inp17,        \
+                                     coef0, coef1, coef2)               \
+( { /* Two 8-lane filters: inp0x -> low half, inp1x -> high half. */    \
+    v16u8 res;                                                          \
+    v8i16 val0, val1;                                                   \
+    v8u16 sum00, sum01, sum02, sum03;                                   \
+    v8u16 sum10, sum11, sum12, sum13;                                   \
+    /* sum00/sum03 serve the inp0x set, sum10/sum13 the inp1x set. */   \
+    ILVR_B4_UH(inp04, inp00, inp14, inp10, inp07, inp03, inp17, inp13,  \
+               sum00, sum10, sum03, sum13);                             \
+    DOTP_UB2_UH(sum00, sum10, coef0, coef0, sum00, sum10);              \
+    HADD_UB2_UH(sum03, sum13, sum03, sum13);                            \
+    ILVR_B4_UH(inp06, inp02, inp16, inp12, inp05, inp01, inp15, inp11,  \
+               sum02, sum12, sum01, sum11);                             \
+    DPADD_UB2_UH(sum02, sum12, coef2, coef2, sum00, sum10);             \
+    DPADD_UB2_UH(sum01, sum11, coef1, coef1, sum03, sum13);             \
+    val0 = (v8i16) (sum00 - sum03);                                     \
+    val1 = (v8i16) (sum10 - sum13);                                     \
+    SRARI_H2_SH(val0, val1, 5);                                         \
+    CLIP_SH2_0_255(val0, val1);                                         \
+    res = (v16u8) __msa_pckev_b((v16i8) val1, (v16i8) val0);            \
+    /* Each half: clip_0_255(round_shift5(difference)), as bytes. */    \
+    res;                                                                \
+} )
+
+#define APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp1, inp2, inp3,         \
+                                        inp4, inp5, inp6, inp7,         \
+                                        coef0, coef1, coef2)            \
+( { /* Vertical filter over 16 byte lanes of vectors inp0..inp7. */     \
+    v16u8 res;                                                          \
+    v8i16 res_r, res_l;                                                 \
+    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                               \
+    v8u16 sum0_l, sum1_l, sum2_l, sum3_l;                               \
+    /* sum0: coef0*(0,4) + coef2*(2,6); sum3: (3,7) + coef1*(1,5). */   \
+    ILVRL_B2_UH(inp4, inp0, sum0_r, sum0_l);                            \
+    ILVRL_B2_UH(inp7, inp3, sum3_r, sum3_l);                            \
+    DOTP_UB2_UH(sum0_r, sum0_l, coef0, coef0, sum0_r, sum0_l);          \
+    HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l);                        \
+    ILVRL_B2_UH(inp6, inp2, sum2_r, sum2_l);                            \
+    ILVRL_B2_UH(inp5, inp1, sum1_r, sum1_l);                            \
+    DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l);         \
+    DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l);         \
+    res_r = (v8i16) (sum0_r - sum3_r);                                  \
+    res_l = (v8i16) (sum0_l - sum3_l);                                  \
+    res_r += 15;                                                        \
+    res_l += 15;                                                        \
+    res_r >>= 5;                                                        \
+    res_l >>= 5;                                                        \
+    CLIP_SH2_0_255(res_r, res_l);                                       \
+    res = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r);          \
+    /* res = clip_0_255((sum0 - sum3 + 15) >> 5), packed to bytes */    \
+    res;                                                                \
+} )
+
+#define APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp00, inp01, inp02, inp03,  \
+                                              inp04, inp05, inp06, inp07,  \
+                                              inp10, inp11, inp12, inp13,  \
+                                              inp14, inp15, inp16, inp17,  \
+                                              coef0, coef1, coef2)         \
+( { /* Two 8-lane filters: inp0x -> low half, inp1x -> high half. */       \
+    v16u8 res;                                                             \
+    v8i16 val0, val1;                                                      \
+    v8u16 sum00, sum01, sum02, sum03;                                      \
+    v8u16 sum10, sum11, sum12, sum13;                                      \
+    /* sum00/sum03 serve the inp0x set, sum10/sum13 the inp1x set. */      \
+    ILVR_B4_UH(inp04, inp00, inp14, inp10, inp07, inp03, inp17, inp13,     \
+               sum00, sum10, sum03, sum13);                                \
+    DOTP_UB2_UH(sum00, sum10, coef0, coef0, sum00, sum10);                 \
+    HADD_UB2_UH(sum03, sum13, sum03, sum13);                               \
+    ILVR_B4_UH(inp06, inp02, inp16, inp12, inp05, inp01, inp15, inp11,     \
+               sum02, sum12, sum01, sum11);                                \
+    DPADD_UB2_UH(sum02, sum12, coef2, coef2, sum00, sum10);                \
+    DPADD_UB2_UH(sum01, sum11, coef1, coef1, sum03, sum13);                \
+    val0 = (v8i16) (sum00 - sum03);                                        \
+    val1 = (v8i16) (sum10 - sum13);                                        \
+    val0 += 15;                                                            \
+    val1 += 15;                                                            \
+    val0 >>= 5;                                                            \
+    val1 >>= 5;                                                            \
+    CLIP_SH2_0_255(val0, val1);                                            \
+    res = (v16u8) __msa_pckev_b((v16i8) val1, (v16i8) val0);               \
+    /* Each half: clip_0_255((difference + 15) >> 5), as bytes. */         \
+    res;                                                                   \
+} )
+
+static void horiz_mc_qpel_aver_src0_8width_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst,
+                                               int32_t dst_stride,
+                                               int32_t height)
+{   /* Filter 8-byte rows horizontally, then average with the source row. */
+    uint8_t loop_count;            /* 8-bit: height >> 2 must be <= 255 */
+    v16u8 inp0, inp1, inp2, inp3;  /* one source row per vector */
+    v16u8 res0, res1;              /* two filtered 8-byte rows per vector */
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);  /* added, mask0 pairs */
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);    /* subtracted, mask1 pairs */
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);    /* added, mask2 pairs */
+    /* Four rows per pass; a height % 4 remainder is left unprocessed. */
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);  /* rows 0|1 */
+        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);  /* rows 2|3 */
+        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);  /* src/filter average */
+        ST8x4_UB(res0, res1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void horiz_mc_qpel_aver_src0_16width_msa(const uint8_t *src,
+                                                int32_t src_stride,
+                                                uint8_t *dst,
+                                                int32_t dst_stride,
+                                                int32_t height)
+{   /* Filter 16-byte rows horizontally, then average with the source row. */
+    uint8_t loop_count;    /* 8-bit: height >> 2 must be <= 255 */
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);   /* halfword: scales u16 sums */
+    /* mask reverses byte order; 4 rows per pass, height % 4 rows are skipped */
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);       /* rows at src */
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); /* rows at src + 1 */
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(inp0, res);    /* rounding average with src row */
+        ST_UB(res, dst);
+        dst += dst_stride;
+        /* second row (inp2, inp3) */
+        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(inp2, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+        /* third row (inp4, inp5) */
+        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(inp4, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+        /* fourth row (inp6, inp7) */
+        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(inp6, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+}
+
+static void horiz_mc_qpel_8width_msa(const uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     int32_t height)
+{   /* Filter 8-byte rows horizontally and store the result unaveraged. */
+    uint8_t loop_count;            /* 8-bit: height >> 2 must be <= 255 */
+    v16u8 inp0, inp1, inp2, inp3;  /* one source row per vector */
+    v16u8 res0, res1;              /* two filtered 8-byte rows per vector */
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);  /* added, mask0 pairs */
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);    /* subtracted, mask1 pairs */
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);    /* added, mask2 pairs */
+    /* Four rows per pass; a height % 4 remainder is left unprocessed. */
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        ST8x4_UB(res0, res1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void horiz_mc_qpel_16width_msa(const uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      int32_t height)
+{   /* Filter 16-byte rows horizontally and store the result unaveraged. */
+    uint8_t loop_count;    /* 8-bit: height >> 2 must be <= 255 */
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);   /* halfword: scales u16 sums */
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    /* mask reverses byte order; 4 rows per pass, height % 4 rows are skipped */
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);       /* rows at src */
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); /* rows at src + 1 */
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
+                                      const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+        /* second row (inp2, inp3) */
+        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
+                                      const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+        /* third row (inp4, inp5) */
+        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
+                                      const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+        /* fourth row (inp6, inp7) */
+        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
+                                      const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+}
+
+static void horiz_mc_qpel_aver_src1_8width_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst,
+                                               int32_t dst_stride,
+                                               int32_t height)
+{   /* Filter 8-byte rows horizontally; average with the 1-byte-slid row. */
+    uint8_t loop_count;            /* 8-bit: height >> 2 must be <= 255 */
+    v16u8 inp0, inp1, inp2, inp3;  /* one source row per vector */
+    v16u8 res0, res1;              /* two filtered 8-byte rows per vector */
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);  /* added, mask0 pairs */
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);    /* subtracted, mask1 pairs */
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);    /* added, mask2 pairs */
+    /* Four rows per pass; a height % 4 remainder is left unprocessed. */
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);  /* slide 1 byte: */
+        SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);  /* src + 1? TODO confirm */
+        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);  /* rows 0|1 */
+        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);  /* rows 2|3 */
+        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);  /* src/filter average */
+        ST8x4_UB(res0, res1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void horiz_mc_qpel_aver_src1_16width_msa(const uint8_t *src,
+                                                int32_t src_stride,
+                                                uint8_t *dst,
+                                                int32_t dst_stride,
+                                                int32_t height)
+{   /* Filter 16-byte rows horizontally, then average with the src + 1 row. */
+    uint8_t loop_count;    /* 8-bit: height >> 2 must be <= 255 */
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);   /* halfword: scales u16 sums */
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    /* mask reverses byte order; 4 rows per pass, height % 4 rows are skipped */
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);       /* rows at src */
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); /* rows at src + 1 */
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(res, inp1);    /* rounding average with src + 1 */
+        ST_UB(res, dst);
+        dst += dst_stride;
+        /* second row (inp2, inp3) */
+        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(res, inp3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+        /* third row (inp4, inp5) */
+        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(res, inp5);
+        ST_UB(res, dst);
+        dst += dst_stride;
+        /* fourth row (inp6, inp7) */
+        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(res, inp7);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+}
+
+static void horiz_mc_qpel_no_rnd_aver_src0_8width_msa(const uint8_t *src,
+                                                      int32_t src_stride,
+                                                      uint8_t *dst,
+                                                      int32_t dst_stride,
+                                                      int32_t height)
+{   /* Filter 8-byte rows (no-round macro); truncating average with source. */
+    uint8_t loop_count;            /* 8-bit: height >> 2 must be <= 255 */
+    v16u8 inp0, inp1, inp2, inp3;  /* one source row per vector */
+    v16u8 res0, res1;              /* two filtered 8-byte rows per vector */
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);  /* added, mask0 pairs */
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);    /* subtracted, mask1 pairs */
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);    /* added, mask2 pairs */
+    /* Four rows per pass; a height % 4 remainder is left unprocessed. */
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                      mask2, mask3, const20,
+                                                      const6, const3);
+        res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                      mask2, mask3, const20,
+                                                      const6, const3);
+        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);  /* rows 0|1 */
+        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);  /* rows 2|3 */
+        res0 = __msa_ave_u_b(inp0, res0);   /* ave (not aver): no rounding bias */
+        res1 = __msa_ave_u_b(inp2, res1);
+        ST8x4_UB(res0, res1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+/*
+ * Horizontal qpel filter, "no round" flavour, for a block 16 pixels wide;
+ * every filtered row is additionally averaged with the source row loaded
+ * at src (no horizontal offset) before it is stored.
+ *
+ * src, src_stride: source block and its row pitch in bytes
+ * dst, dst_stride: destination block and its row pitch in bytes
+ * height:          rows to produce; rows are handled four at a time, so
+ *                  (height >> 2) * 4 rows are written
+ *
+ * Each row is loaded twice, at src and at src + 1, and both copies are
+ * passed to APPLY_HORIZ_QPEL_NO_ROUND_FILTER (defined outside this
+ * function) together with a lane-reversal table and the constants 20, 6
+ * and 3.
+ */
+static void horiz_mc_qpel_no_rnd_aver_src0_16width_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride,
+                                                       int32_t height)
+{
+    /* Same width as height, so height >> 2 is never truncated to 8 bits. */
+    int32_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    /* Byte indices 15..0: reverses the lane order of a vector. */
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    /* A non-positive row count performs no iterations. */
+    for (loop_count = (height >> 2); loop_count > 0; loop_count--) {
+        /* Even registers: rows at src.  Odd registers: same rows at src + 1. */
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
+                                               const20, const6, const3);
+        /* __msa_ave_u_b is the truncating average, (a + b) >> 1. */
+        res = __msa_ave_u_b(inp0, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(inp2, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(inp4, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(inp6, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+}
+
+/*
+ * Horizontal qpel filter, "no round" flavour, for a block 8 pixels wide.
+ * The filter output is stored as is, without any further averaging.
+ *
+ * src, src_stride: source block and its row pitch in bytes
+ * dst, dst_stride: destination block and its row pitch in bytes
+ * height:          rows to produce; rows are handled four at a time, so
+ *                  (height >> 2) * 4 rows are written
+ *
+ * The tap arithmetic lives in APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE, which
+ * is defined outside this function.  It receives two source rows, the four
+ * shuffle tables and the constants 20, 6 and 3, and is assumed to return
+ * both filtered rows packed into one vector (one row per 64-bit half).
+ */
+static void horiz_mc_qpel_no_rnd_8width_msa(const uint8_t *src,
+                                            int32_t src_stride,
+                                            uint8_t *dst,
+                                            int32_t dst_stride,
+                                            int32_t height)
+{
+    /* Same width as height, so height >> 2 is never truncated to 8 bits. */
+    int32_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    /* Byte-shuffle index tables handed to the filter macro (indices 0..8). */
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    /* A non-positive row count performs no iterations. */
+    for (loop_count = (height >> 2); loop_count > 0; loop_count--) {
+        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                      mask2, mask3, const20,
+                                                      const6, const3);
+        res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                      mask2, mask3, const20,
+                                                      const6, const3);
+        ST8x4_UB(res0, res1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+/*
+ * Horizontal qpel filter, "no round" flavour, for a block 16 pixels wide.
+ * The filter output is stored as is, without any further averaging.
+ *
+ * src, src_stride: source block and its row pitch in bytes
+ * dst, dst_stride: destination block and its row pitch in bytes
+ * height:          rows to produce; rows are handled four at a time, so
+ *                  (height >> 2) * 4 rows are written
+ *
+ * Each row is loaded twice, at src and at src + 1, and both copies are
+ * passed to APPLY_HORIZ_QPEL_NO_ROUND_FILTER (defined outside this
+ * function) together with a lane-reversal table and the constants 20, 6
+ * and 3.
+ */
+static void horiz_mc_qpel_no_rnd_16width_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             int32_t height)
+{
+    /* Same width as height, so height >> 2 is never truncated to 8 bits. */
+    int32_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    /* Byte indices 15..0: reverses the lane order of a vector. */
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    /* A non-positive row count performs no iterations. */
+    for (loop_count = (height >> 2); loop_count > 0; loop_count--) {
+        /* Even registers: rows at src.  Odd registers: same rows at src + 1. */
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
+                                               const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
+                                               const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
+                                               const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
+                                               const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+}
+
+/*
+ * Horizontal qpel filter, "no round" flavour, for a block 8 pixels wide;
+ * every filtered row is additionally averaged with the source row slid by
+ * one byte before it is stored.
+ *
+ * src, src_stride: source block and its row pitch in bytes
+ * dst, dst_stride: destination block and its row pitch in bytes
+ * height:          rows to produce; rows are handled four at a time, so
+ *                  (height >> 2) * 4 rows are written
+ *
+ * The tap arithmetic lives in APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE, which
+ * is defined outside this function.  It receives two source rows, the four
+ * shuffle tables and the constants 20, 6 and 3, and is assumed to return
+ * both filtered rows packed into one vector (one row per 64-bit half).
+ */
+static void horiz_mc_qpel_no_rnd_aver_src1_8width_msa(const uint8_t *src,
+                                                      int32_t src_stride,
+                                                      uint8_t *dst,
+                                                      int32_t dst_stride,
+                                                      int32_t height)
+{
+    /* Same width as height, so height >> 2 is never truncated to 8 bits. */
+    int32_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    /* Byte-shuffle index tables handed to the filter macro (indices 0..8). */
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    /* A non-positive row count performs no iterations. */
+    for (loop_count = (height >> 2); loop_count > 0; loop_count--) {
+        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                      mask2, mask3, const20,
+                                                      const6, const3);
+        res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                      mask2, mask3, const20,
+                                                      const6, const3);
+        /* Slide every row by one byte (SLDI_B2_UB is defined elsewhere);
+         * presumably lane x then holds src[x + 1] - TODO confirm. */
+        SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+        SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+        /* Put the low 64 bits of the second row into the high half of the
+         * first, so the source pixels are laid out like the filter output. */
+        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+        /* __msa_ave_u_b is the truncating average, (a + b) >> 1. */
+        res0 = __msa_ave_u_b(inp0, res0);
+        res1 = __msa_ave_u_b(inp2, res1);
+        ST8x4_UB(res0, res1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+/*
+ * Horizontal qpel filter, "no round" flavour, for a block 16 pixels wide;
+ * every filtered row is additionally averaged with the source row loaded
+ * at src + 1 before it is stored.
+ *
+ * src, src_stride: source block and its row pitch in bytes
+ * dst, dst_stride: destination block and its row pitch in bytes
+ * height:          rows to produce; rows are handled four at a time, so
+ *                  (height >> 2) * 4 rows are written
+ *
+ * Each row is loaded twice, at src and at src + 1, and both copies are
+ * passed to APPLY_HORIZ_QPEL_NO_ROUND_FILTER (defined outside this
+ * function) together with a lane-reversal table and the constants 20, 6
+ * and 3.
+ */
+static void horiz_mc_qpel_no_rnd_aver_src1_16width_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride,
+                                                       int32_t height)
+{
+    /* Same width as height, so height >> 2 is never truncated to 8 bits. */
+    int32_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    /* Byte indices 15..0: reverses the lane order of a vector. */
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    /* A non-positive row count performs no iterations. */
+    for (loop_count = (height >> 2); loop_count > 0; loop_count--) {
+        /* Even registers: rows at src.  Odd registers: same rows at src + 1. */
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
+                                               const20, const6, const3);
+        /* __msa_ave_u_b is the truncating average, (a + b) >> 1. */
+        res = __msa_ave_u_b(res, inp1);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(res, inp3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(res, inp5);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(res, inp7);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+}
+
+/*
+ * Horizontal qpel filter for a block 8 pixels wide.  Every filtered row is
+ * averaged first with the unshifted source pixels and then with the pixels
+ * already present in dst, and the result is written back to dst.
+ *
+ * src, src_stride: source block and its row pitch in bytes
+ * dst, dst_stride: destination block (read and written) and its row pitch
+ *                  in bytes
+ * height:          rows to produce; rows are handled four at a time, so
+ *                  (height >> 2) * 4 rows are written
+ *
+ * APPLY_HORIZ_QPEL_FILTER_8BYTE and AVER_UB2_UB are defined outside this
+ * function.  The former receives two source rows, the four shuffle tables
+ * and the constants 20, 6 and 3 and is assumed to return both filtered rows
+ * packed into one vector; the latter takes two input pairs and writes two
+ * outputs, presumably via __msa_aver_u_b (rounding average) - TODO confirm.
+ */
+static void horiz_mc_qpel_avg_dst_aver_src0_8width_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride,
+                                                       int32_t height)
+{
+    /* Same width as height, so height >> 2 is never truncated to 8 bits. */
+    int32_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16u8 res0, res1;
+    /* Byte-shuffle index tables handed to the filter macro (indices 0..8). */
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    /* A non-positive row count performs no iterations. */
+    for (loop_count = (height >> 2); loop_count > 0; loop_count--) {
+        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        /* Pack two rows per vector (second row into the high 64 bits) for
+         * both the source and the destination pixels. */
+        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+        dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
+        dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
+        /* First blend with the source, then with the old destination. */
+        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
+        AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
+        ST8x4_UB(res0, res1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+/*
+ * Horizontal qpel filter for a block 16 pixels wide.  Every filtered row is
+ * averaged first with the source row loaded at src (no horizontal offset)
+ * and then with the row already present in dst, and the result is written
+ * back to dst.
+ *
+ * src, src_stride: source block and its row pitch in bytes
+ * dst, dst_stride: destination block (read and written) and its row pitch
+ *                  in bytes
+ * height:          rows to produce; rows are handled four at a time, so
+ *                  (height >> 2) * 4 rows are written
+ *
+ * APPLY_HORIZ_QPEL_FILTER and AVER_UB2_UB are defined outside this
+ * function.  The former receives a row loaded at src, the same row loaded
+ * at src + 1, a lane-reversal table and the constants 20, 6 and 3; the
+ * latter takes two input pairs and writes two outputs, presumably via
+ * __msa_aver_u_b (rounding average) - TODO confirm.
+ */
+static void horiz_mc_qpel_avg_dst_aver_src0_16width_msa(const uint8_t *src,
+                                                        int32_t src_stride,
+                                                        uint8_t *dst,
+                                                        int32_t dst_stride,
+                                                        int32_t height)
+{
+    /* Same width as height, so height >> 2 is never truncated to 8 bits. */
+    int32_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res0, res1;
+    v16u8 dst0, dst1;
+    /* Byte indices 15..0: reverses the lane order of a vector. */
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    /* A non-positive row count performs no iterations. */
+    for (loop_count = (height >> 2); loop_count > 0; loop_count--) {
+        /* Even registers: rows at src.  Odd registers: same rows at src + 1. */
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        /* Rows 0 and 1 of this group. */
+        res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
+                                       const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
+                                       const20, const6, const3);
+        LD_UB2(dst, dst_stride, dst0, dst1);
+        /* First blend with the source, then with the old destination. */
+        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
+        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
+        ST_UB2(res0, res1, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        /* Rows 2 and 3 of this group. */
+        res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
+                                       const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
+                                       const20, const6, const3);
+        LD_UB2(dst, dst_stride, dst0, dst1);
+        AVER_UB2_UB(inp4, res0, inp6, res1, res0, res1);
+        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
+        ST_UB2(res0, res1, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+/*
+ * Horizontal qpel filter for a block 8 pixels wide.  Every filtered row is
+ * averaged with the pixels already present in dst and written back to dst.
+ *
+ * src, src_stride: source block and its row pitch in bytes
+ * dst, dst_stride: destination block (read and written) and its row pitch
+ *                  in bytes
+ * height:          rows to produce; rows are handled four at a time, so
+ *                  (height >> 2) * 4 rows are written
+ *
+ * APPLY_HORIZ_QPEL_FILTER_8BYTE and AVER_UB2_UB are defined outside this
+ * function.  The former receives two source rows, the four shuffle tables
+ * and the constants 20, 6 and 3 and is assumed to return both filtered rows
+ * packed into one vector; the latter takes two input pairs and writes two
+ * outputs, presumably via __msa_aver_u_b (rounding average) - TODO confirm.
+ */
+static void horiz_mc_qpel_avg_dst_8width_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             int32_t height)
+{
+    /* Same width as height, so height >> 2 is never truncated to 8 bits. */
+    int32_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16u8 res0, res1;
+    /* Byte-shuffle index tables handed to the filter macro (indices 0..8). */
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    /* A non-positive row count performs no iterations. */
+    for (loop_count = (height >> 2); loop_count > 0; loop_count--) {
+        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        /* Pack two destination rows per vector (second row into the high
+         * 64 bits) so they are laid out like the filter output. */
+        dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
+        dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
+        AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
+        ST8x4_UB(res0, res1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+/*
+ * Horizontal qpel filter for a block 16 pixels wide.  Every filtered row is
+ * averaged with the row already present in dst and written back to dst.
+ *
+ * src, src_stride: source block and its row pitch in bytes
+ * dst, dst_stride: destination block (read and written) and its row pitch
+ *                  in bytes
+ * height:          rows to produce; rows are handled four at a time, so
+ *                  (height >> 2) * 4 rows are written
+ *
+ * APPLY_HORIZ_QPEL_FILTER and AVER_UB2_UB are defined outside this
+ * function.  The former receives a row loaded at src, the same row loaded
+ * at src + 1, a lane-reversal table and the constants 20, 6 and 3; the
+ * latter takes two input pairs and writes two outputs, presumably via
+ * __msa_aver_u_b (rounding average) - TODO confirm.
+ */
+static void horiz_mc_qpel_avg_dst_16width_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int32_t height)
+{
+    /* Same width as height, so height >> 2 is never truncated to 8 bits. */
+    int32_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res0, res1;
+    v16u8 dst0, dst1;
+    /* Byte indices 15..0: reverses the lane order of a vector. */
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    /* A non-positive row count performs no iterations. */
+    for (loop_count = (height >> 2); loop_count > 0; loop_count--) {
+        /* Even registers: rows at src.  Odd registers: same rows at src + 1. */
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        /* Rows 0 and 1 of this group. */
+        res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
+                                       const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
+                                       const20, const6, const3);
+        LD_UB2(dst, dst_stride, dst0, dst1);
+        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
+        ST_UB2(res0, res1, dst, dst_stride);
+        dst += (2 * dst_stride);
+
+        /* Rows 2 and 3 of this group. */
+        res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
+                                       const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
+                                       const20, const6, const3);
+        LD_UB2(dst, dst_stride, dst0, dst1);
+        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
+        ST_UB2(res0, res1, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+/*
+ * Horizontal qpel filter for a block 8 pixels wide.  Every filtered row is
+ * averaged first with the source row slid by one byte and then with the
+ * pixels already present in dst, and the result is written back to dst.
+ *
+ * src, src_stride: source block and its row pitch in bytes
+ * dst, dst_stride: destination block (read and written) and its row pitch
+ *                  in bytes
+ * height:          rows to produce; rows are handled four at a time, so
+ *                  (height >> 2) * 4 rows are written
+ *
+ * APPLY_HORIZ_QPEL_FILTER_8BYTE and AVER_UB2_UB are defined outside this
+ * function.  The former receives two source rows, the four shuffle tables
+ * and the constants 20, 6 and 3 and is assumed to return both filtered rows
+ * packed into one vector; the latter takes two input pairs and writes two
+ * outputs, presumably via __msa_aver_u_b (rounding average) - TODO confirm.
+ */
+static void horiz_mc_qpel_avg_dst_aver_src1_8width_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride,
+                                                       int32_t height)
+{
+    /* Same width as height, so height >> 2 is never truncated to 8 bits. */
+    int32_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16u8 res0, res1;
+    /* Byte-shuffle index tables handed to the filter macro (indices 0..8). */
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    /* A non-positive row count performs no iterations. */
+    for (loop_count = (height >> 2); loop_count > 0; loop_count--) {
+        LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+        src += (4 * src_stride);
+        res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                             mask0, mask1, mask2, mask3,
+                                             const20, const6, const3);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        /* Slide every source row by one byte (SLDI_B2_UB is defined
+         * elsewhere); presumably lane x then holds src[x + 1] - TODO
+         * confirm. */
+        SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+        SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+        /* Pack two rows per vector (second row into the high 64 bits) for
+         * both the source and the destination pixels. */
+        inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+        inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+        dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
+        dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
+        /* First blend with the source, then with the old destination. */
+        AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
+        AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
+        ST8x4_UB(res0, res1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+/*
+ * Horizontal qpel filter for a block 16 pixels wide.  Every filtered row is
+ * averaged first with the source row loaded at src + 1 and then with the
+ * row already present in dst, and the result is written back to dst.
+ *
+ * src, src_stride: source block and its row pitch in bytes
+ * dst, dst_stride: destination block (read and written) and its row pitch
+ *                  in bytes
+ * height:          rows to produce; rows are handled four at a time, so
+ *                  (height >> 2) * 4 rows are written
+ *
+ * APPLY_HORIZ_QPEL_FILTER and AVER_UB2_UB are defined outside this
+ * function.  The former receives a row loaded at src, the same row loaded
+ * at src + 1, a lane-reversal table and the constants 20, 6 and 3; the
+ * latter takes two input pairs and writes two outputs, presumably via
+ * __msa_aver_u_b (rounding average) - TODO confirm.
+ */
+static void horiz_mc_qpel_avg_dst_aver_src1_16width_msa(const uint8_t *src,
+                                                        int32_t src_stride,
+                                                        uint8_t *dst,
+                                                        int32_t dst_stride,
+                                                        int32_t height)
+{
+    /* Same width as height, so height >> 2 is never truncated to 8 bits. */
+    int32_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res0, res1, dst0, dst1;
+    /* Byte indices 15..0: reverses the lane order of a vector. */
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+
+    /* A non-positive row count performs no iterations. */
+    for (loop_count = (height >> 2); loop_count > 0; loop_count--) {
+        /* Even registers: rows at src.  Odd registers: same rows at src + 1. */
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        /* Rows 0 and 1 of this group. */
+        res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
+                                       const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
+                                       const20, const6, const3);
+        LD_UB2(dst, dst_stride, dst0, dst1);
+        /* First blend with the src + 1 rows, then with the old destination. */
+        AVER_UB2_UB(res0, inp1, res1, inp3, res0, res1);
+        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
+        ST_UB2(res0, res1, dst, dst_stride);
+        dst += (2 * dst_stride);
+        /* Rows 2 and 3 of this group. */
+        res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
+                                       const20, const6, const3);
+        res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
+                                       const20, const6, const3);
+        LD_UB2(dst, dst_stride, dst0, dst1);
+        AVER_UB2_UB(res0, inp5, res1, inp7, res0, res1);
+        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
+        ST_UB2(res0, res1, dst, dst_stride);
+        dst += (2 * dst_stride);
+    }
+}
+
+
+/*
+ * Vertical qpel filter for an 8x8 block, with every filtered row averaged
+ * against the source row at the same vertical position.
+ *
+ * src, src_stride: source block and its row pitch in bytes; nine rows
+ *                  (inp0..inp8) are read
+ * dst, dst_stride: destination block and its row pitch in bytes; eight
+ *                  rows are written
+ *
+ * APPLY_VERT_QPEL_FILTER_8BYTE and AVER_UB2_UB are defined outside this
+ * function.  Each filter call is given two groups of eight rows (one group
+ * per output row) plus the constants 20, 6 and 3, and is assumed to return
+ * the two output rows packed into one vector.  Within a group the first
+ * four arguments are rows i, i-1, i-2, i-3 and the last four are rows
+ * i+1 .. i+4; where that window would leave inp0..inp8 an in-range row is
+ * passed instead.  AVER_UB2_UB takes two input pairs and writes two
+ * outputs, presumably via __msa_aver_u_b - TODO confirm.
+ */
+static void vert_mc_qpel_aver_src0_8x8_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst,
+                                           int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 tmp0, tmp1, res0, res1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    LD_UB2(src, src_stride, inp4, inp5);
+    src += (2 * src_stride);
+    /* Output rows 0 and 1. */
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
+                                        inp1, inp2, inp3, inp4,
+                                        inp1, inp0, inp0, inp1,
+                                        inp2, inp3, inp4, inp5,
+                                        const20, const6, const3);
+    LD_UB2(src, src_stride, inp6, inp7);
+    src += (2 * src_stride);
+    /* Output rows 2 and 3. */
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
+                                        inp3, inp4, inp5, inp6,
+                                        inp3, inp2, inp1, inp0,
+                                        inp4, inp5, inp6, inp7,
+                                        const20, const6, const3);
+    /* Pack source rows 0/1 and 2/3 two per vector for the averaging step. */
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    inp8 = LD_UB(src);
+    /* Output rows 4 and 5. */
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
+                                        inp5, inp6, inp7, inp8,
+                                        inp5, inp4, inp3, inp2,
+                                        inp6, inp7, inp8, inp8,
+                                        const20, const6, const3);
+    /* Output rows 6 and 7. */
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
+                                        inp7, inp8, inp8, inp7,
+                                        inp7, inp6, inp5, inp4,
+                                        inp8, inp8, inp7, inp6,
+                                        const20, const6, const3);
+    /* Pack source rows 4/5 and 6/7 two per vector for the averaging step. */
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
+    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
+    /* Last store: dst is not advanced because nothing reads it afterwards. */
+    ST8x4_UB(res0, res1, dst, dst_stride);
+}
+
+/*
+ * Vertical qpel filter for a 16x16 block, with every filtered row averaged
+ * (__msa_aver_u_b) against the source row at the same vertical position.
+ *
+ * src, src_stride: source block and its row pitch in bytes; 17 rows
+ *                  (inp0..inp16) are read
+ * dst, dst_stride: destination block and its row pitch in bytes; 16 rows
+ *                  are written
+ *
+ * APPLY_VERT_QPEL_FILTER is defined outside this function.  As called
+ * here, output row i is given rows i, i-1, i-2, i-3 followed by rows
+ * i+1 .. i+4, plus the constants 20, 6 and 3.  Where that window would
+ * leave the range inp0..inp16 an in-range row is passed instead (see the
+ * first three and the last three calls).
+ */
+static void vert_mc_qpel_aver_src0_16x16_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
+    v16u8 res0;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    /* Source rows are loaded just ahead of the first call that uses them. */
+    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
+    src += (5 * src_stride);
+    /* Output row 0: rows above the block are replaced by inp0, inp1, inp2. */
+    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
+                                  inp1, inp2, inp3, inp4,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp0);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp5 = LD_UB(src);
+    src += src_stride;
+    /* Output row 1. */
+    res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
+                                  inp2, inp3, inp4, inp5,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp1);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp6 = LD_UB(src);
+    src += src_stride;
+    /* Output row 2. */
+    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
+                                  inp3, inp4, inp5, inp6,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp2);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp7 = LD_UB(src);
+    src += src_stride;
+    /* Output row 3: first row whose whole window lies inside inp0..inp16. */
+    res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
+                                  inp4, inp5, inp6, inp7,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    LD_UB2(src, src_stride, inp8, inp9);
+    src += (2 * src_stride);
+    /* Output row 4. */
+    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
+                                  inp5, inp6, inp7, inp8,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp4);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    /* Output row 5. */
+    res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
+                                  inp6, inp7, inp8, inp9,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp5);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    LD_UB2(src, src_stride, inp10, inp11);
+    src += (2 * src_stride);
+    /* Output row 6. */
+    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
+                                  inp7, inp8, inp9, inp10,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp6);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    /* Output row 7. */
+    res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
+                                  inp8, inp9, inp10, inp11,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp7);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    LD_UB2(src, src_stride, inp12, inp13);
+    src += (2 * src_stride);
+    /* Output row 8. */
+    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
+                                  inp9, inp10, inp11, inp12,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp8);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    /* Output row 9. */
+    res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
+                                  inp10, inp11, inp12, inp13,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp9);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    LD_UB2(src, src_stride, inp14, inp15);
+    src += (2 * src_stride);
+    /* Output row 10. */
+    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
+                                  inp11, inp12, inp13, inp14,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp10);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    /* Output row 11. */
+    res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
+                                  inp12, inp13, inp14, inp15,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp11);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    /* Seventeenth and last source row; src is not advanced any further. */
+    inp16 = LD_UB(src);
+    /* Output row 12: last row whose whole window lies inside inp0..inp16. */
+    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
+                                  inp13, inp14, inp15, inp16,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp12);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    /* Output row 13: rows past inp16 are replaced by inp16, inp15, ... */
+    res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
+                                  inp14, inp15, inp16, inp16,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp13);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    /* Output row 14. */
+    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
+                                  inp15, inp16, inp16, inp15,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp14);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    /* Output row 15. */
+    res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
+                                  inp16, inp16, inp15, inp14,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp15);
+    ST_UB(res0, dst);
+}
+
+/*
+ * Vertical qpel filter for an 8x8 block.  The filter output is stored as
+ * is, without any further averaging.
+ *
+ * src, src_stride: source block and its row pitch in bytes; nine rows
+ *                  (inp0..inp8) are read
+ * dst, dst_stride: destination block and its row pitch in bytes; eight
+ *                  rows are written
+ *
+ * APPLY_VERT_QPEL_FILTER_8BYTE is defined outside this function.  Each call
+ * is given two groups of eight rows (one group per output row) plus the
+ * constants 20, 6 and 3, and is assumed to return the two output rows
+ * packed into one vector.  Within a group the first four arguments are
+ * rows i, i-1, i-2, i-3 and the last four are rows i+1 .. i+4; where that
+ * window would leave inp0..inp8 an in-range row is passed instead.
+ */
+static void vert_mc_qpel_8x8_msa(const uint8_t *src,
+                                 int32_t src_stride,
+                                 uint8_t *dst,
+                                 int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 res0, res1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    LD_UB2(src, src_stride, inp4, inp5);
+    src += (2 * src_stride);
+    /* Output rows 0 and 1. */
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
+                                        inp1, inp2, inp3, inp4,
+                                        inp1, inp0, inp0, inp1,
+                                        inp2, inp3, inp4, inp5,
+                                        const20, const6, const3);
+    LD_UB2(src, src_stride, inp6, inp7);
+    src += (2 * src_stride);
+    /* Output rows 2 and 3. */
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
+                                        inp3, inp4, inp5, inp6,
+                                        inp3, inp2, inp1, inp0,
+                                        inp4, inp5, inp6, inp7,
+                                        const20, const6, const3);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    inp8 = LD_UB(src);
+    /* Output rows 4 and 5. */
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
+                                        inp5, inp6, inp7, inp8,
+                                        inp5, inp4, inp3, inp2,
+                                        inp6, inp7, inp8, inp8,
+                                        const20, const6, const3);
+    /* Output rows 6 and 7. */
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
+                                        inp7, inp8, inp8, inp7,
+                                        inp7, inp6, inp5, inp4,
+                                        inp8, inp8, inp7, inp6,
+                                        const20, const6, const3);
+    /* Last store: dst is not advanced because nothing reads it afterwards. */
+    ST8x4_UB(res0, res1, dst, dst_stride);
+}
+
+static void vert_mc_qpel_16x16_msa(const uint8_t *src, /* Vertical qpel filter for a 16-row block: 17 source rows are loaded starting at src. */
+                                   int32_t src_stride, /* added to src after each row load */
+                                   uint8_t *dst, /* 16 filtered rows are stored here, one ST_UB per row */
+                                   int32_t dst_stride) /* added to dst after each row store */
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; /* inpN holds source row N */
+    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
+    v16u8 res0; /* filtered vector for the current output row */
+    v16u8 const20 = (v16u8) __msa_ldi_b(20); /* 20 in every byte lane */
+    v16u8 const6 = (v16u8) __msa_ldi_b(6); /* 6 in every byte lane */
+    v16u8 const3 = (v16u8) __msa_ldi_b(3); /* 3 in every byte lane; NOTE(review): presumably the qpel tap weights - the filter macro is defined outside this chunk, confirm there */
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); /* source rows 0-3; src is stepped past them on the next line */
+    src += (4 * src_stride);
+    inp4 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2, /* output row 0: nothing is loaded above inp0, so inp0, inp1, inp2 are reused in mirrored order */
+                                  inp1, inp2, inp3, inp4,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp5 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1, /* output row 1: still mirroring at the top edge */
+                                  inp2, inp3, inp4, inp5,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp6 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0, /* output row 2: last row that repeats inp0 */
+                                  inp3, inp4, inp5, inp6,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp7 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0, /* output row 3: first interior row - rows k..k-3, then rows k+1..k+4 */
+                                  inp4, inp5, inp6, inp7,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp8 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1, /* output row 4 */
+                                  inp5, inp6, inp7, inp8,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp9 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2, /* output row 5 */
+                                  inp6, inp7, inp8, inp9,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp10 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3, /* output row 6 */
+                                  inp7, inp8, inp9, inp10,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp11 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4, /* output row 7 */
+                                  inp8, inp9, inp10, inp11,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp12 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5, /* output row 8 */
+                                  inp9, inp10, inp11, inp12,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp13 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6, /* output row 9 */
+                                  inp10, inp11, inp12, inp13,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp14 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7, /* output row 10 */
+                                  inp11, inp12, inp13, inp14,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp15 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8, /* output row 11 */
+                                  inp12, inp13, inp14, inp15,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp16 = LD_UB(src); /* 17th and last source row; src is not advanced again */
+    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9, /* output row 12: last row with four distinct rows below */
+                                  inp13, inp14, inp15, inp16,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10, /* output row 13: bottom edge, inp16 is repeated instead of loading a row 17 */
+                                  inp14, inp15, inp16, inp16,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11, /* output row 14: inp16, inp16, inp15 mirror back from the bottom */
+                                  inp15, inp16, inp16, inp15,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12, /* output row 15: inp16, inp16, inp15, inp14 mirror back from the bottom */
+                                  inp16, inp16, inp15, inp14,
+                                  const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride; /* dst is not read after this final advance */
+}
+
+static void vert_mc_qpel_aver_src1_8x8_msa(const uint8_t *src, /* Vertical qpel filter for an 8-row block, each result averaged (rounding) with source rows 1..8; 9 source rows are loaded from src. */
+                                           int32_t src_stride, /* step applied to src between row loads */
+                                           uint8_t *dst, /* output written by two ST8x4_UB calls */
+                                           int32_t dst_stride) /* passed to ST8x4_UB and used to advance dst */
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; /* inpN holds source row N */
+    v16u8 tmp0, tmp1, res0, res1; /* tmpN: packed pair of source rows to average with resN */
+    v16u8 const20 = (v16u8) __msa_ldi_b(20); /* 20 in every byte lane */
+    v16u8 const6 = (v16u8) __msa_ldi_b(6); /* 6 in every byte lane */
+    v16u8 const3 = (v16u8) __msa_ldi_b(3); /* 3 in every byte lane; NOTE(review): presumably the qpel tap weights - filter macro is defined outside this chunk, confirm there */
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); /* source rows 0-3 */
+    src += (4 * src_stride);
+    LD_UB2(src, src_stride, inp4, inp5); /* source rows 4-5 */
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2, /* first two argument lines: rows for output row 0 (top edge mirrored) */
+                                        inp1, inp2, inp3, inp4,
+                                        inp1, inp0, inp0, inp1, /* next two argument lines: rows for output row 1 */
+                                        inp2, inp3, inp4, inp5,
+                                        const20, const6, const3);
+
+    LD_UB2(src, src_stride, inp6, inp7); /* source rows 6-7 */
+    src += (2 * src_stride);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0, /* output rows 2 and 3 */
+                                        inp3, inp4, inp5, inp6,
+                                        inp3, inp2, inp1, inp0,
+                                        inp4, inp5, inp6, inp7,
+                                        const20, const6, const3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2); /* doubleword 0 of inp1 and doubleword 0 of inp2 in one vector: the rows below output rows 0 and 1 */
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4); /* same packing for source rows 3 and 4 */
+    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); /* NOTE(review): presumably a rounding byte average of each (res, tmp) pair - macro defined outside this chunk, confirm */
+    ST8x4_UB(res0, res1, dst, dst_stride); /* NOTE(review): presumably stores four 8-byte rows taken from the two vectors - confirm at the macro */
+    dst += (4 * dst_stride);
+
+    inp8 = LD_UB(src); /* ninth and last source row */
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1, /* output rows 4 and 5; row 5 repeats inp8 at the bottom edge */
+                                        inp5, inp6, inp7, inp8,
+                                        inp5, inp4, inp3, inp2,
+                                        inp6, inp7, inp8, inp8,
+                                        const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3, /* output rows 6 and 7; rows past inp8 are replaced by inp8, inp7, inp6 in mirrored order */
+                                        inp7, inp8, inp8, inp7,
+                                        inp7, inp6, inp5, inp4,
+                                        inp8, inp8, inp7, inp6,
+                                        const20, const6, const3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6); /* source rows 5 and 6, paired with output rows 4 and 5 */
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8); /* source rows 7 and 8, paired with output rows 6 and 7 */
+    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+}
+
+static void vert_mc_qpel_aver_src1_16x16_msa(const uint8_t *src, /* Vertical qpel filter for a 16-row block; output row k is then rounding-averaged with source row k+1. 17 source rows are loaded from src. */
+                                             int32_t src_stride, /* added to src after each row load */
+                                             uint8_t *dst, /* 16 rows are stored here, one ST_UB per row */
+                                             int32_t dst_stride) /* added to dst after each row store except the last */
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; /* inpN holds source row N */
+    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
+    v16u8 res0; /* vector for the current output row */
+    v16u8 const20 = (v16u8) __msa_ldi_b(20); /* 20 in every byte lane */
+    v16u8 const6 = (v16u8) __msa_ldi_b(6); /* 6 in every byte lane */
+    v16u8 const3 = (v16u8) __msa_ldi_b(3); /* 3 in every byte lane; NOTE(review): presumably the qpel tap weights - filter macro is defined outside this chunk, confirm there */
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); /* source rows 0-3 */
+    src += (4 * src_stride);
+    inp4 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2, /* output row 0: top edge, inp0, inp1, inp2 reused in mirrored order */
+                                  inp1, inp2, inp3, inp4,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp1); /* per-byte unsigned average with rounding against the next source row */
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp5 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1, /* output row 1 */
+                                  inp2, inp3, inp4, inp5,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp2);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp6 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0, /* output row 2 */
+                                  inp3, inp4, inp5, inp6,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp7 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0, /* output row 3: first interior row - rows k..k-3, then rows k+1..k+4 */
+                                  inp4, inp5, inp6, inp7,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp4);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp8 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1, /* output row 4 */
+                                  inp5, inp6, inp7, inp8,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp5);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp9 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2, /* output row 5 */
+                                  inp6, inp7, inp8, inp9,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp6);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp10 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3, /* output row 6 */
+                                  inp7, inp8, inp9, inp10,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp7);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp11 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4, /* output row 7 */
+                                  inp8, inp9, inp10, inp11,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp8);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp12 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5, /* output row 8 */
+                                  inp9, inp10, inp11, inp12,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp9);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp13 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6, /* output row 9 */
+                                  inp10, inp11, inp12, inp13,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp10);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp14 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7, /* output row 10 */
+                                  inp11, inp12, inp13, inp14,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp11);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp15 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8, /* output row 11 */
+                                  inp12, inp13, inp14, inp15,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp12);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp16 = LD_UB(src); /* 17th and last source row; src is not advanced again */
+    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9, /* output row 12: last row with four distinct rows below */
+                                  inp13, inp14, inp15, inp16,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp13);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10, /* output row 13: bottom edge, inp16 is repeated */
+                                  inp14, inp15, inp16, inp16,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp14);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11, /* output row 14: inp16, inp16, inp15 mirror back from the bottom */
+                                  inp15, inp16, inp16, inp15,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp15);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12, /* output row 15: inp16, inp16, inp15, inp14 mirror back from the bottom */
+                                  inp16, inp16, inp15, inp14,
+                                  const20, const6, const3);
+    res0 = __msa_aver_u_b(res0, inp16); /* the last output row is averaged with the extra 17th source row */
+    ST_UB(res0, dst);
+}
+
+static void vert_mc_qpel_no_rnd_aver_src0_8x8_msa(const uint8_t *src, /* Vertical qpel filter (NO_ROUND macro variant) for an 8-row block, each result averaged without rounding with source rows 0..7; 9 source rows are loaded from src. */
+                                                  int32_t src_stride, /* step applied to src between row loads */
+                                                  uint8_t *dst, /* output written by two ST8x4_UB calls */
+                                                  int32_t dst_stride) /* passed to ST8x4_UB and used to advance dst */
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; /* inpN holds source row N */
+    v16u8 tmp0, tmp1, res0, res1; /* tmpN: packed pair of source rows to average with resN */
+    v16u8 const20 = (v16u8) __msa_ldi_b(20); /* 20 in every byte lane */
+    v16u8 const6 = (v16u8) __msa_ldi_b(6); /* 6 in every byte lane */
+    v16u8 const3 = (v16u8) __msa_ldi_b(3); /* 3 in every byte lane; NOTE(review): presumably the qpel tap weights - filter macro is defined outside this chunk, confirm there */
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); /* source rows 0-3 */
+    src += (4 * src_stride);
+    LD_UB2(src, src_stride, inp4, inp5); /* source rows 4-5 */
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2, /* first two argument lines: rows for output row 0 (top edge mirrored) */
+                                                 inp1, inp2, inp3, inp4,
+                                                 inp1, inp0, inp0, inp1, /* next two argument lines: rows for output row 1 */
+                                                 inp2, inp3, inp4, inp5,
+                                                 const20, const6, const3);
+    LD_UB2(src, src_stride, inp6, inp7); /* source rows 6-7 */
+    src += (2 * src_stride);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0, /* output rows 2 and 3 */
+                                                 inp3, inp4, inp5, inp6,
+                                                 inp3, inp2, inp1, inp0,
+                                                 inp4, inp5, inp6, inp7,
+                                                 const20, const6, const3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); /* doubleword 0 of inp0 and doubleword 0 of inp1 in one vector: the source rows at output rows 0 and 1 */
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); /* same packing for source rows 2 and 3 */
+    res0 = __msa_ave_u_b(res0, tmp0); /* per-byte unsigned average without the rounding term */
+    res1 = __msa_ave_u_b(res1, tmp1);
+    ST8x4_UB(res0, res1, dst, dst_stride); /* NOTE(review): presumably stores four 8-byte rows taken from the two vectors - confirm at the macro */
+    dst += (4 * dst_stride);
+
+    inp8 = LD_UB(src); /* ninth and last source row */
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1, /* output rows 4 and 5; row 5 repeats inp8 at the bottom edge */
+                                                 inp5, inp6, inp7, inp8,
+                                                 inp5, inp4, inp3, inp2,
+                                                 inp6, inp7, inp8, inp8,
+                                                 const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3, /* output rows 6 and 7; rows past inp8 are replaced by inp8, inp7, inp6 in mirrored order */
+                                                 inp7, inp8, inp8, inp7,
+                                                 inp7, inp6, inp5, inp4,
+                                                 inp8, inp8, inp7, inp6,
+                                                 const20, const6, const3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5); /* source rows 4 and 5 */
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7); /* source rows 6 and 7 */
+    res0 = __msa_ave_u_b(res0, tmp0);
+    res1 = __msa_ave_u_b(res1, tmp1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride); /* dst is not read after this final advance */
+}
+
+static void vert_mc_qpel_no_rnd_aver_src0_16x16_msa(const uint8_t *src, /* Vertical qpel filter (NO_ROUND macro variant) for a 16-row block; output row k is then averaged without rounding with source row k. 17 source rows are loaded from src. */
+                                                    int32_t src_stride, /* step applied to src between row loads */
+                                                    uint8_t *dst, /* 16 rows are stored here, one ST_UB per row */
+                                                    int32_t dst_stride) /* added to dst after each row store */
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; /* inpN holds source row N */
+    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
+    v16u8 res0; /* vector for the current output row */
+    v16u8 const20 = (v16u8) __msa_ldi_b(20); /* 20 in every byte lane */
+    v16u8 const6 = (v16u8) __msa_ldi_b(6); /* 6 in every byte lane */
+    v16u8 const3 = (v16u8) __msa_ldi_b(3); /* 3 in every byte lane; NOTE(review): presumably the qpel tap weights - filter macro is defined outside this chunk, confirm there */
+
+    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4); /* source rows 0-4 */
+    src += (5 * src_stride);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2, /* output row 0: top edge, inp0, inp1, inp2 reused in mirrored order */
+                                           inp1, inp2, inp3, inp4,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp0); /* per-byte unsigned average without the rounding term, against the same-index source row */
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp5 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1, /* output row 1 */
+                                           inp2, inp3, inp4, inp5,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp1);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp6 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0, /* output row 2 */
+                                           inp3, inp4, inp5, inp6,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp2);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp7 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0, /* output row 3: first interior row - rows k..k-3, then rows k+1..k+4 */
+                                           inp4, inp5, inp6, inp7,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp8 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1, /* output row 4 */
+                                           inp5, inp6, inp7, inp8,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp4);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp9 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2, /* output row 5 */
+                                           inp6, inp7, inp8, inp9,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp5);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp10 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3, /* output row 6 */
+                                           inp7, inp8, inp9, inp10,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp6);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp11 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4, /* output row 7 */
+                                           inp8, inp9, inp10, inp11,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp7);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp12 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5, /* output row 8 */
+                                           inp9, inp10, inp11, inp12,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp8);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp13 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6, /* output row 9 */
+                                           inp10, inp11, inp12, inp13,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp9);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp14 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7, /* output row 10 */
+                                           inp11, inp12, inp13, inp14,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp10);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp15 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8, /* output row 11 */
+                                           inp12, inp13, inp14, inp15,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp11);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp16 = LD_UB(src); /* 17th and last source row; src is not advanced again */
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9, /* output row 12: last row with four distinct rows below */
+                                           inp13, inp14, inp15, inp16,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp12);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10, /* output row 13: bottom edge, inp16 is repeated */
+                                           inp14, inp15, inp16, inp16,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp13);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11, /* output row 14: inp16, inp16, inp15 mirror back from the bottom */
+                                           inp15, inp16, inp16, inp15,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp14);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12, /* output row 15: inp16, inp16, inp15, inp14 mirror back from the bottom */
+                                           inp16, inp16, inp15, inp14,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp15);
+    ST_UB(res0, dst);
+    dst += dst_stride; /* dst is not read after this final advance */
+}
+
+static void vert_mc_qpel_no_rnd_8x8_msa(const uint8_t *src, /* Vertical qpel filter (NO_ROUND macro variant) for an 8-row block, stored without any averaging step; 9 source rows are loaded from src. */
+                                        int32_t src_stride, /* step applied to src between row loads */
+                                        uint8_t *dst, /* output written by two ST8x4_UB calls */
+                                        int32_t dst_stride) /* passed to ST8x4_UB and used to advance dst */
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; /* inpN holds source row N */
+    v16u8 res0, res1; /* each holds the filter result for two output rows */
+    v16u8 const20 = (v16u8) __msa_ldi_b(20); /* 20 in every byte lane */
+    v16u8 const6 = (v16u8) __msa_ldi_b(6); /* 6 in every byte lane */
+    v16u8 const3 = (v16u8) __msa_ldi_b(3); /* 3 in every byte lane; NOTE(review): presumably the qpel tap weights - filter macro is defined outside this chunk, confirm there */
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); /* source rows 0-3 */
+    src += (4 * src_stride);
+    LD_UB2(src, src_stride, inp4, inp5); /* source rows 4-5 */
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2, /* first two argument lines: rows for output row 0 (top edge mirrored) */
+                                                 inp1, inp2, inp3, inp4,
+                                                 inp1, inp0, inp0, inp1, /* next two argument lines: rows for output row 1 */
+                                                 inp2, inp3, inp4, inp5,
+                                                 const20, const6, const3);
+    LD_UB2(src, src_stride, inp6, inp7); /* source rows 6-7 */
+    src += (2 * src_stride);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0, /* output rows 2 and 3 */
+                                                 inp3, inp4, inp5, inp6,
+                                                 inp3, inp2, inp1, inp0,
+                                                 inp4, inp5, inp6, inp7,
+                                                 const20, const6, const3);
+    ST8x4_UB(res0, res1, dst, dst_stride); /* NOTE(review): presumably stores four 8-byte rows taken from the two vectors - confirm at the macro */
+    dst += (4 * dst_stride);
+
+    inp8 = LD_UB(src); /* ninth and last source row */
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1, /* output rows 4 and 5; row 5 repeats inp8 at the bottom edge */
+                                                 inp5, inp6, inp7, inp8,
+                                                 inp5, inp4, inp3, inp2,
+                                                 inp6, inp7, inp8, inp8,
+                                                 const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3, /* output rows 6 and 7; rows past inp8 are replaced by inp8, inp7, inp6 in mirrored order */
+                                                 inp7, inp8, inp8, inp7,
+                                                 inp7, inp6, inp5, inp4,
+                                                 inp8, inp8, inp7, inp6,
+                                                 const20, const6, const3);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride); /* dst is not read after this final advance */
+}
+
+static void vert_mc_qpel_no_rnd_16x16_msa(const uint8_t *src, /* Vertical qpel filter (NO_ROUND macro variant) for a 16-row block, stored without any averaging step; 17 source rows are loaded from src. */
+                                          int32_t src_stride, /* step applied to src between row loads */
+                                          uint8_t *dst, /* 16 rows are stored here, one ST_UB per row */
+                                          int32_t dst_stride) /* added to dst after each row store except the last */
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; /* inpN holds source row N */
+    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
+    v16u8 res0; /* filtered vector for the current output row */
+    v16u8 const20 = (v16u8) __msa_ldi_b(20); /* 20 in every byte lane */
+    v16u8 const6 = (v16u8) __msa_ldi_b(6); /* 6 in every byte lane */
+    v16u8 const3 = (v16u8) __msa_ldi_b(3); /* 3 in every byte lane; NOTE(review): presumably the qpel tap weights - filter macro is defined outside this chunk, confirm there */
+
+    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4); /* source rows 0-4 */
+    src += (5 * src_stride);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2, /* output row 0: top edge, inp0, inp1, inp2 reused in mirrored order */
+                                           inp1, inp2, inp3, inp4,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp5 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1, /* output row 1 */
+                                           inp2, inp3, inp4, inp5,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp6 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0, /* output row 2 */
+                                           inp3, inp4, inp5, inp6,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp7 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0, /* output row 3: first interior row - rows k..k-3, then rows k+1..k+4 */
+                                           inp4, inp5, inp6, inp7,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp8 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1, /* output row 4 */
+                                           inp5, inp6, inp7, inp8,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp9 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2, /* output row 5 */
+                                           inp6, inp7, inp8, inp9,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp10 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3, /* output row 6 */
+                                           inp7, inp8, inp9, inp10,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp11 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4, /* output row 7 */
+                                           inp8, inp9, inp10, inp11,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp12 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5, /* output row 8 */
+                                           inp9, inp10, inp11, inp12,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp13 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6, /* output row 9 */
+                                           inp10, inp11, inp12, inp13,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp14 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7, /* output row 10 */
+                                           inp11, inp12, inp13, inp14,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp15 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8, /* output row 11 */
+                                           inp12, inp13, inp14, inp15,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp16 = LD_UB(src); /* 17th and last source row; src is not advanced again */
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9, /* output row 12: last row with four distinct rows below */
+                                           inp13, inp14, inp15, inp16,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10, /* output row 13: bottom edge, inp16 is repeated */
+                                           inp14, inp15, inp16, inp16,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11, /* output row 14: inp16, inp16, inp15 mirror back from the bottom */
+                                           inp15, inp16, inp16, inp15,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12, /* output row 15: inp16, inp16, inp15, inp14 mirror back from the bottom */
+                                           inp16, inp16, inp15, inp14,
+                                           const20, const6, const3);
+    ST_UB(res0, dst);
+}
+
+static void vert_mc_qpel_no_rnd_aver_src1_8x8_msa(const uint8_t *src, /* Vertical qpel filter (NO_ROUND macro variant) for an 8-row block, each result averaged without rounding with source rows 1..8; 9 source rows are loaded from src. */
+                                                  int32_t src_stride, /* step applied to src between row loads */
+                                                  uint8_t *dst, /* output written by two ST8x4_UB calls */
+                                                  int32_t dst_stride) /* passed to ST8x4_UB and used to advance dst */
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8; /* inpN holds source row N */
+    v16u8 tmp0, tmp1, res0, res1; /* tmpN: packed pair of source rows to average with resN */
+    v16u8 const20 = (v16u8) __msa_ldi_b(20); /* 20 in every byte lane */
+    v16u8 const6 = (v16u8) __msa_ldi_b(6); /* 6 in every byte lane */
+    v16u8 const3 = (v16u8) __msa_ldi_b(3); /* 3 in every byte lane; NOTE(review): presumably the qpel tap weights - filter macro is defined outside this chunk, confirm there */
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); /* source rows 0-3 */
+    src += (4 * src_stride);
+    LD_UB2(src, src_stride, inp4, inp5); /* source rows 4-5 */
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2, /* first two argument lines: rows for output row 0 (top edge mirrored) */
+                                                 inp1, inp2, inp3, inp4,
+                                                 inp1, inp0, inp0, inp1, /* next two argument lines: rows for output row 1 */
+                                                 inp2, inp3, inp4, inp5,
+                                                 const20, const6, const3);
+    LD_UB2(src, src_stride, inp6, inp7); /* source rows 6-7 */
+    src += (2 * src_stride);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0, /* output rows 2 and 3 */
+                                                 inp3, inp4, inp5, inp6,
+                                                 inp3, inp2, inp1, inp0,
+                                                 inp4, inp5, inp6, inp7,
+                                                 const20, const6, const3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2); /* doubleword 0 of inp1 and doubleword 0 of inp2 in one vector: the rows below output rows 0 and 1 */
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4); /* same packing for source rows 3 and 4 */
+    res0 = __msa_ave_u_b(res0, tmp0); /* per-byte unsigned average without the rounding term */
+    res1 = __msa_ave_u_b(res1, tmp1);
+    ST8x4_UB(res0, res1, dst, dst_stride); /* NOTE(review): presumably stores four 8-byte rows taken from the two vectors - confirm at the macro */
+    dst += (4 * dst_stride);
+
+    inp8 = LD_UB(src); /* ninth and last source row */
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1, /* output rows 4 and 5; row 5 repeats inp8 at the bottom edge */
+                                                 inp5, inp6, inp7, inp8,
+                                                 inp5, inp4, inp3, inp2,
+                                                 inp6, inp7, inp8, inp8,
+                                                 const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3, /* output rows 6 and 7; rows past inp8 are replaced by inp8, inp7, inp6 in mirrored order */
+                                                 inp7, inp8, inp8, inp7,
+                                                 inp7, inp6, inp5, inp4,
+                                                 inp8, inp8, inp7, inp6,
+                                                 const20, const6, const3);
+    tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6); /* source rows 5 and 6, paired with output rows 4 and 5 */
+    tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8); /* source rows 7 and 8, paired with output rows 6 and 7 */
+    res0 = __msa_ave_u_b(res0, tmp0);
+    res1 = __msa_ave_u_b(res1, tmp1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+}
+
+static void vert_mc_qpel_no_rnd_aver_src1_16x16_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{   /*
+     * Fully unrolled vertical pass: reads 17 source rows (inp0..inp16, one
+     * v16u8 vector each, src_stride apart) and writes 16 rows to dst,
+     * dst_stride apart.
+     *
+     * For output row i the code
+     *   1. runs APPLY_VERT_QPEL_NO_ROUND_FILTER over the rows surrounding
+     *      row i with the coefficients 20, 6 and 3,
+     *   2. averages that result with source row i + 1 via __msa_ave_u_b,
+     *   3. stores the vector with ST_UB.
+     *
+     * Near the top and bottom the argument lists repeat inp0 / inp16 in
+     * mirrored order, so nothing outside the 17 loaded rows is read.
+     *
+     * NOTE(review): the arithmetic of the filter macro is defined outside
+     * this view; its exact rounding should be confirmed there.
+     */
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
+    v16u8 res0;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4); /* source rows 0..4 */
+    src += (5 * src_stride);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2,
+                                           inp1, inp2, inp3, inp4,
+                                           const20, const6, const3); /* top edge: inp0 repeated */
+    res0 = __msa_ave_u_b(res0, inp1); /* output row 0, averaged with source row 1 */
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp5 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1,
+                                           inp2, inp3, inp4, inp5,
+                                           const20, const6, const3); /* top edge: rows mirrored around inp0 */
+    res0 = __msa_ave_u_b(res0, inp2); /* output row 1 */
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp6 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0,
+                                           inp3, inp4, inp5, inp6,
+                                           const20, const6, const3); /* last row that repeats inp0 */
+    res0 = __msa_ave_u_b(res0, inp3); /* output row 2 */
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp7 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0,
+                                           inp4, inp5, inp6, inp7,
+                                           const20, const6, const3); /* from here on: 8 distinct rows */
+    res0 = __msa_ave_u_b(res0, inp4); /* output row 3 */
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp8 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1,
+                                           inp5, inp6, inp7, inp8,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp5); /* output row 4 */
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp9 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2,
+                                           inp6, inp7, inp8, inp9,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp6); /* output row 5 */
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp10 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3,
+                                           inp7, inp8, inp9, inp10,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp7); /* output row 6 */
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp11 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4,
+                                           inp8, inp9, inp10, inp11,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp8); /* output row 7 */
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp12 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5,
+                                           inp9, inp10, inp11, inp12,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp9); /* output row 8 */
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp13 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6,
+                                           inp10, inp11, inp12, inp13,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp10); /* output row 9 */
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp14 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7,
+                                           inp11, inp12, inp13, inp14,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp11); /* output row 10 */
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp15 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8,
+                                           inp12, inp13, inp14, inp15,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp12); /* output row 11 */
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    inp16 = LD_UB(src); /* 17th and last source row; src is not advanced again */
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9,
+                                           inp13, inp14, inp15, inp16,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp13); /* output row 12 */
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10,
+                                           inp14, inp15, inp16, inp16,
+                                           const20, const6, const3); /* bottom edge: inp16 repeated */
+    res0 = __msa_ave_u_b(res0, inp14); /* output row 13 */
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11,
+                                           inp15, inp16, inp16, inp15,
+                                           const20, const6, const3); /* bottom edge: mirrored around inp16 */
+    res0 = __msa_ave_u_b(res0, inp15); /* output row 14 */
+    ST_UB(res0, dst);
+    dst += dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12,
+                                           inp16, inp16, inp15, inp14,
+                                           const20, const6, const3);
+    res0 = __msa_ave_u_b(res0, inp16); /* output row 15, averaged with source row 16 */
+    ST_UB(res0, dst);
+}
+
+/**
+ * Vertical qpel pass for an 8-row block, blended with the co-located source
+ * rows and then with the pixels already present in dst.
+ *
+ * Nine source rows (src_stride apart) are read. Every call to
+ * APPLY_VERT_QPEL_FILTER_8BYTE receives the tap rows for two neighbouring
+ * output rows, so four calls cover the eight output rows. Each filtered
+ * pair is averaged (AVER_UB2_UB) with source rows i / i + 1 packed into one
+ * vector, averaged again with the matching pair of dst rows, and written
+ * back with ST8x4_UB.
+ *
+ * Edge handling is done purely through the argument lists: row0 and row8
+ * are repeated in mirrored order rather than reading beyond the nine rows.
+ *
+ * @param src         first source row
+ * @param src_stride  distance between source rows, in bytes
+ * @param dst         destination rows, read and overwritten
+ * @param dst_stride  distance between destination rows, in bytes
+ */
+static void vert_mc_qpel_avg_dst_aver_src0_8x8_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride)
+{
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    v16u8 prev0, prev1, prev2, prev3;
+    v16u8 filt_a, filt_b, ref_a, ref_b;
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+
+    /* Source rows 0..7 are all fetched before dst is touched. */
+    LD_UB4(src, src_stride, row0, row1, row2, row3);
+    src += (4 * src_stride);
+    LD_UB4(src, src_stride, row4, row5, row6, row7);
+    src += (4 * src_stride);
+
+    /* Output rows 0/1 and 2/3. */
+    filt_a = APPLY_VERT_QPEL_FILTER_8BYTE(row0, row0, row1, row2,
+                                          row1, row2, row3, row4,
+                                          row1, row0, row0, row1,
+                                          row2, row3, row4, row5,
+                                          const20, const6, const3);
+    filt_b = APPLY_VERT_QPEL_FILTER_8BYTE(row2, row1, row0, row0,
+                                          row3, row4, row5, row6,
+                                          row3, row2, row1, row0,
+                                          row4, row5, row6, row7,
+                                          const20, const6, const3);
+
+    /* Blend with the co-located source rows, two rows per vector. */
+    ref_a = (v16u8) __msa_insve_d((v2i64) row0, 1, (v2i64) row1);
+    ref_b = (v16u8) __msa_insve_d((v2i64) row2, 1, (v2i64) row3);
+    AVER_UB2_UB(filt_a, ref_a, filt_b, ref_b, filt_a, filt_b);
+
+    /* Blend with what dst currently holds and write back. */
+    LD_UB4(dst, dst_stride, prev0, prev1, prev2, prev3);
+    prev0 = (v16u8) __msa_insve_d((v2i64) prev0, 1, (v2i64) prev1);
+    prev2 = (v16u8) __msa_insve_d((v2i64) prev2, 1, (v2i64) prev3);
+    AVER_UB2_UB(prev0, filt_a, prev2, filt_b, filt_a, filt_b);
+    ST8x4_UB(filt_a, filt_b, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    /* Output rows 4/5 and 6/7; row8 is the last source row read. */
+    row8 = LD_UB(src);
+    filt_a = APPLY_VERT_QPEL_FILTER_8BYTE(row4, row3, row2, row1,
+                                          row5, row6, row7, row8,
+                                          row5, row4, row3, row2,
+                                          row6, row7, row8, row8,
+                                          const20, const6, const3);
+    filt_b = APPLY_VERT_QPEL_FILTER_8BYTE(row6, row5, row4, row3,
+                                          row7, row8, row8, row7,
+                                          row7, row6, row5, row4,
+                                          row8, row8, row7, row6,
+                                          const20, const6, const3);
+
+    ref_a = (v16u8) __msa_insve_d((v2i64) row4, 1, (v2i64) row5);
+    ref_b = (v16u8) __msa_insve_d((v2i64) row6, 1, (v2i64) row7);
+    AVER_UB2_UB(filt_a, ref_a, filt_b, ref_b, filt_a, filt_b);
+
+    LD_UB4(dst, dst_stride, prev0, prev1, prev2, prev3);
+    prev0 = (v16u8) __msa_insve_d((v2i64) prev0, 1, (v2i64) prev1);
+    prev2 = (v16u8) __msa_insve_d((v2i64) prev2, 1, (v2i64) prev3);
+    AVER_UB2_UB(prev0, filt_a, prev2, filt_b, filt_a, filt_b);
+    ST8x4_UB(filt_a, filt_b, dst, dst_stride);
+}
+
+static void vert_mc_qpel_avg_dst_aver_src0_16x16_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{   /*
+     * Fully unrolled vertical pass over 17 source rows (inp0..inp16) that
+     * updates 16 rows of dst in place, two rows per step.
+     *
+     * For each pair of output rows (2k, 2k + 1):
+     *   1. res0/res1 = APPLY_VERT_QPEL_FILTER over the surrounding rows
+     *      with the coefficients 20, 6 and 3,
+     *   2. AVER_UB2_UB averages them with source rows inp(2k) / inp(2k + 1),
+     *   3. AVER_UB2_UB averages that with the two rows currently in dst,
+     *   4. ST_UB2 writes both rows back over dst.
+     *
+     * At the top and bottom the argument lists repeat inp0 / inp16 in
+     * mirrored order, so nothing outside the 17 loaded rows is read.
+     *
+     * NOTE(review): the filter and AVER_UB2_UB macros are defined outside
+     * this view; their rounding behaviour should be confirmed there.
+     */
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
+    v16u8 res0, res1, dst0, dst1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4); /* source rows 0..4 */
+    src += (5 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
+                                  inp1, inp2, inp3, inp4,
+                                  const20, const6, const3); /* top edge: inp0 repeated */
+
+    inp5 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
+                                  inp2, inp3, inp4, inp5,
+                                  const20, const6, const3);
+
+    LD_UB2(dst, dst_stride, dst0, dst1); /* current contents of output rows 0 and 1 */
+    AVER_UB2_UB(res0, inp0, res1, inp1, res0, res1); /* blend with co-located source rows */
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); /* blend with existing dst rows */
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp6 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
+                                  inp3, inp4, inp5, inp6,
+                                  const20, const6, const3);
+
+    inp7 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
+                                  inp4, inp5, inp6, inp7,
+                                  const20, const6, const3);
+
+    LD_UB2(dst, dst_stride, dst0, dst1); /* output rows 2 and 3 */
+    AVER_UB2_UB(res0, inp2, res1, inp3, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp8, inp9);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
+                                  inp5, inp6, inp7, inp8,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
+                                  inp6, inp7, inp8, inp9,
+                                  const20, const6, const3);
+
+    LD_UB2(dst, dst_stride, dst0, dst1); /* output rows 4 and 5 */
+    AVER_UB2_UB(res0, inp4, res1, inp5, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp10, inp11);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
+                                  inp7, inp8, inp9, inp10,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
+                                  inp8, inp9, inp10, inp11,
+                                  const20, const6, const3);
+
+    LD_UB2(dst, dst_stride, dst0, dst1); /* output rows 6 and 7 */
+    AVER_UB2_UB(res0, inp6, res1, inp7, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp12, inp13);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
+                                  inp9, inp10, inp11, inp12,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
+                                  inp10, inp11, inp12, inp13,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1); /* output rows 8 and 9 */
+    AVER_UB2_UB(res0, inp8, res1, inp9, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp14, inp15);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
+                                  inp11, inp12, inp13, inp14,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
+                                  inp12, inp13, inp14, inp15,
+                                  const20, const6, const3);
+
+    LD_UB2(dst, dst_stride, dst0, dst1); /* output rows 10 and 11 */
+    AVER_UB2_UB(res0, inp10, res1, inp11, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp16 = LD_UB(src); /* 17th and last source row */
+    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
+                                  inp13, inp14, inp15, inp16,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
+                                  inp14, inp15, inp16, inp16,
+                                  const20, const6, const3); /* bottom edge: inp16 repeated */
+    LD_UB2(dst, dst_stride, dst0, dst1); /* output rows 12 and 13 */
+    AVER_UB2_UB(res0, inp12, res1, inp13, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
+                                  inp15, inp16, inp16, inp15,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
+                                  inp16, inp16, inp15, inp14,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1); /* output rows 14 and 15 */
+    AVER_UB2_UB(res0, inp14, res1, inp15, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+}
+
+/**
+ * Vertical qpel pass for an 8-row block, averaged with the pixels already
+ * present in dst.
+ *
+ * Nine source rows (src_stride apart) are read. Every call to
+ * APPLY_VERT_QPEL_FILTER_8BYTE receives the tap rows for two neighbouring
+ * output rows, so four calls cover the eight output rows. Each filtered
+ * pair is averaged (AVER_UB2_UB) with the matching pair of dst rows and
+ * written back with ST8x4_UB.
+ *
+ * Edge handling is done through the argument lists: inp0 and inp8 are
+ * repeated in mirrored order rather than reading beyond the nine rows.
+ *
+ * @param src         first source row
+ * @param src_stride  distance between source rows, in bytes
+ * @param dst         destination rows, read and overwritten
+ * @param dst_stride  distance between destination rows, in bytes
+ */
+static void vert_mc_qpel_avg_dst_8x8_msa(const uint8_t *src,
+                                         int32_t src_stride,
+                                         uint8_t *dst,
+                                         int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16u8 res0, res1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    /* Output rows 0/1 (res0) and 2/3 (res1). */
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    LD_UB2(src, src_stride, inp4, inp5);
+    src += (2 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
+                                        inp1, inp2, inp3, inp4,
+                                        inp1, inp0, inp0, inp1,
+                                        inp2, inp3, inp4, inp5,
+                                        const20, const6, const3);
+    LD_UB2(src, src_stride, inp6, inp7);
+    src += (2 * src_stride);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
+                                        inp3, inp4, inp5, inp6,
+                                        inp3, inp2, inp1, inp0,
+                                        inp4, inp5, inp6, inp7,
+                                        const20, const6, const3);
+    /* Pack two dst rows per vector so they line up with res0/res1. */
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
+    dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
+    AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
+    ST8x4_UB(res0, res1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    /* Output rows 4/5 and 6/7; inp8 is the last source row read. */
+    inp8 = LD_UB(src);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
+                                        inp5, inp6, inp7, inp8,
+                                        inp5, inp4, inp3, inp2,
+                                        inp6, inp7, inp8, inp8,
+                                        const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
+                                        inp7, inp8, inp8, inp7,
+                                        inp7, inp6, inp5, inp4,
+                                        inp8, inp8, inp7, inp6,
+                                        const20, const6, const3);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
+    dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
+    AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
+    /* Final store: dst is not advanced afterwards, matching the sibling
+     * 8x8 routines. */
+    ST8x4_UB(res0, res1, dst, dst_stride);
+}
+
+static void vert_mc_qpel_avg_dst_16x16_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst,
+                                           int32_t dst_stride)
+{   /*
+     * Fully unrolled vertical pass over 17 source rows (inp0..inp16) that
+     * updates 16 rows of dst in place, two rows per step.
+     *
+     * For each pair of output rows (2k, 2k + 1):
+     *   1. res0/res1 = APPLY_VERT_QPEL_FILTER over the surrounding rows
+     *      with the coefficients 20, 6 and 3,
+     *   2. AVER_UB2_UB averages them with the two rows currently in dst,
+     *   3. ST_UB2 writes both rows back over dst.
+     *
+     * At the top and bottom the argument lists repeat inp0 / inp16 in
+     * mirrored order, so nothing outside the 17 loaded rows is read.
+     *
+     * NOTE(review): the filter and AVER_UB2_UB macros are defined outside
+     * this view; their rounding behaviour should be confirmed there.
+     */
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
+    v16u8 res0, res1, dst0, dst1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4); /* source rows 0..4 */
+    src += (5 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
+                                  inp1, inp2, inp3, inp4,
+                                  const20, const6, const3); /* top edge: inp0 repeated */
+    inp5 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
+                                  inp2, inp3, inp4, inp5,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1); /* current contents of output rows 0 and 1 */
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); /* blend with existing dst rows */
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp6 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
+                                  inp3, inp4, inp5, inp6,
+                                  const20, const6, const3);
+    inp7 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
+                                  inp4, inp5, inp6, inp7,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1); /* output rows 2 and 3 */
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp8 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
+                                  inp5, inp6, inp7, inp8,
+                                  const20, const6, const3);
+    inp9 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
+                                  inp6, inp7, inp8, inp9,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1); /* output rows 4 and 5 */
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp10 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
+                                  inp7, inp8, inp9, inp10,
+                                  const20, const6, const3);
+    inp11 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
+                                  inp8, inp9, inp10, inp11,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1); /* output rows 6 and 7 */
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp12 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
+                                  inp9, inp10, inp11, inp12,
+                                  const20, const6, const3);
+    inp13 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
+                                  inp10, inp11, inp12, inp13,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1); /* output rows 8 and 9 */
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp14 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
+                                  inp11, inp12, inp13, inp14,
+                                  const20, const6, const3);
+    inp15 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
+                                  inp12, inp13, inp14, inp15,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1); /* output rows 10 and 11 */
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp16 = LD_UB(src); /* 17th and last source row */
+    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
+                                  inp13, inp14, inp15, inp16,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
+                                  inp14, inp15, inp16, inp16,
+                                  const20, const6, const3); /* bottom edge: inp16 repeated */
+    LD_UB2(dst, dst_stride, dst0, dst1); /* output rows 12 and 13 */
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
+                                  inp15, inp16, inp16, inp15,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
+                                  inp16, inp16, inp15, inp14,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1); /* output rows 14 and 15 */
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+}
+
+/**
+ * Vertical qpel pass for an 8-row block, blended with the source rows one
+ * position below and then with the pixels already present in dst.
+ *
+ * Nine source rows (src_stride apart) are read. Every call to
+ * APPLY_VERT_QPEL_FILTER_8BYTE receives the tap rows for two neighbouring
+ * output rows, so four calls cover the eight output rows. Each filtered
+ * pair for output rows i / i + 1 is averaged (AVER_UB2_UB) with source rows
+ * i + 1 / i + 2 packed into one vector, averaged again with the matching
+ * pair of dst rows, and written back with ST8x4_UB.
+ *
+ * Edge handling is done purely through the argument lists: row0 and row8
+ * are repeated in mirrored order rather than reading beyond the nine rows.
+ *
+ * @param src         first source row
+ * @param src_stride  distance between source rows, in bytes
+ * @param dst         destination rows, read and overwritten
+ * @param dst_stride  distance between destination rows, in bytes
+ */
+static void vert_mc_qpel_avg_dst_aver_src1_8x8_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride)
+{
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    v16u8 prev0, prev1, prev2, prev3;
+    v16u8 filt_a, filt_b, ref_a, ref_b;
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+
+    /* Source rows 0..7 are all fetched before dst is touched. */
+    LD_UB4(src, src_stride, row0, row1, row2, row3);
+    src += (4 * src_stride);
+    LD_UB4(src, src_stride, row4, row5, row6, row7);
+    src += (4 * src_stride);
+
+    /* Output rows 0/1 and 2/3. */
+    filt_a = APPLY_VERT_QPEL_FILTER_8BYTE(row0, row0, row1, row2,
+                                          row1, row2, row3, row4,
+                                          row1, row0, row0, row1,
+                                          row2, row3, row4, row5,
+                                          const20, const6, const3);
+    filt_b = APPLY_VERT_QPEL_FILTER_8BYTE(row2, row1, row0, row0,
+                                          row3, row4, row5, row6,
+                                          row3, row2, row1, row0,
+                                          row4, row5, row6, row7,
+                                          const20, const6, const3);
+
+    /* Blend with the source rows shifted down by one, two per vector. */
+    ref_a = (v16u8) __msa_insve_d((v2i64) row1, 1, (v2i64) row2);
+    ref_b = (v16u8) __msa_insve_d((v2i64) row3, 1, (v2i64) row4);
+    AVER_UB2_UB(filt_a, ref_a, filt_b, ref_b, filt_a, filt_b);
+
+    /* Blend with what dst currently holds and write back. */
+    LD_UB4(dst, dst_stride, prev0, prev1, prev2, prev3);
+    prev0 = (v16u8) __msa_insve_d((v2i64) prev0, 1, (v2i64) prev1);
+    prev2 = (v16u8) __msa_insve_d((v2i64) prev2, 1, (v2i64) prev3);
+    AVER_UB2_UB(prev0, filt_a, prev2, filt_b, filt_a, filt_b);
+    ST8x4_UB(filt_a, filt_b, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    /* Output rows 4/5 and 6/7; row8 is the last source row read. */
+    row8 = LD_UB(src);
+    filt_a = APPLY_VERT_QPEL_FILTER_8BYTE(row4, row3, row2, row1,
+                                          row5, row6, row7, row8,
+                                          row5, row4, row3, row2,
+                                          row6, row7, row8, row8,
+                                          const20, const6, const3);
+    filt_b = APPLY_VERT_QPEL_FILTER_8BYTE(row6, row5, row4, row3,
+                                          row7, row8, row8, row7,
+                                          row7, row6, row5, row4,
+                                          row8, row8, row7, row6,
+                                          const20, const6, const3);
+
+    ref_a = (v16u8) __msa_insve_d((v2i64) row5, 1, (v2i64) row6);
+    ref_b = (v16u8) __msa_insve_d((v2i64) row7, 1, (v2i64) row8);
+    AVER_UB2_UB(filt_a, ref_a, filt_b, ref_b, filt_a, filt_b);
+
+    LD_UB4(dst, dst_stride, prev0, prev1, prev2, prev3);
+    prev0 = (v16u8) __msa_insve_d((v2i64) prev0, 1, (v2i64) prev1);
+    prev2 = (v16u8) __msa_insve_d((v2i64) prev2, 1, (v2i64) prev3);
+    AVER_UB2_UB(prev0, filt_a, prev2, filt_b, filt_a, filt_b);
+    ST8x4_UB(filt_a, filt_b, dst, dst_stride);
+}
+
+static void vert_mc_qpel_avg_dst_aver_src1_16x16_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{   /*
+     * Fully unrolled vertical pass over 17 source rows (inp0..inp16) that
+     * updates 16 rows of dst in place, two rows per step.
+     *
+     * For each pair of output rows (2k, 2k + 1):
+     *   1. res0/res1 = APPLY_VERT_QPEL_FILTER over the surrounding rows
+     *      with the coefficients 20, 6 and 3,
+     *   2. AVER_UB2_UB averages them with source rows inp(2k + 1) /
+     *      inp(2k + 2), i.e. the rows one position below,
+     *   3. AVER_UB2_UB averages that with the two rows currently in dst,
+     *   4. ST_UB2 writes both rows back over dst.
+     *
+     * At the top and bottom the argument lists repeat inp0 / inp16 in
+     * mirrored order, so nothing outside the 17 loaded rows is read.
+     *
+     * NOTE(review): the filter and AVER_UB2_UB macros are defined outside
+     * this view; their rounding behaviour should be confirmed there.
+     */
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
+    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
+    v16u8 res0, res1, dst0, dst1;
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4); /* source rows 0..4 */
+    src += (5 * src_stride);
+    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
+                                  inp1, inp2, inp3, inp4,
+                                  const20, const6, const3); /* top edge: inp0 repeated */
+    inp5 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
+                                  inp2, inp3, inp4, inp5,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1); /* current contents of output rows 0 and 1 */
+    AVER_UB2_UB(res0, inp1, res1, inp2, res0, res1); /* blend with source rows 1 and 2 */
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); /* blend with existing dst rows */
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp6 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
+                                  inp3, inp4, inp5, inp6,
+                                  const20, const6, const3);
+    inp7 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
+                                  inp4, inp5, inp6, inp7,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1); /* output rows 2 and 3 */
+    AVER_UB2_UB(res0, inp3, res1, inp4, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp8 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
+                                  inp5, inp6, inp7, inp8,
+                                  const20, const6, const3);
+    inp9 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
+                                  inp6, inp7, inp8, inp9,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1); /* output rows 4 and 5 */
+    AVER_UB2_UB(res0, inp5, res1, inp6, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp10 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
+                                  inp7, inp8, inp9, inp10,
+                                  const20, const6, const3);
+    inp11 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
+                                  inp8, inp9, inp10, inp11,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1); /* output rows 6 and 7 */
+    AVER_UB2_UB(res0, inp7, res1, inp8, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp12 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
+                                  inp9, inp10, inp11, inp12,
+                                  const20, const6, const3);
+    inp13 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
+                                  inp10, inp11, inp12, inp13,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1); /* output rows 8 and 9 */
+    AVER_UB2_UB(res0, inp9, res1, inp10, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp14 = LD_UB(src);
+    src += src_stride;
+    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
+                                  inp11, inp12, inp13, inp14,
+                                  const20, const6, const3);
+    inp15 = LD_UB(src);
+    src += src_stride;
+    res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
+                                  inp12, inp13, inp14, inp15,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1); /* output rows 10 and 11 */
+    AVER_UB2_UB(res0, inp11, res1, inp12, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    inp16 = LD_UB(src); /* 17th and last source row */
+    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
+                                  inp13, inp14, inp15, inp16,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
+                                  inp14, inp15, inp16, inp16,
+                                  const20, const6, const3); /* bottom edge: inp16 repeated */
+    LD_UB2(dst, dst_stride, dst0, dst1); /* output rows 12 and 13 */
+    AVER_UB2_UB(res0, inp13, res1, inp14, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
+                                  inp15, inp16, inp16, inp15,
+                                  const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
+                                  inp16, inp16, inp15, inp14,
+                                  const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1); /* output rows 14 and 15 */
+    AVER_UB2_UB(res0, inp15, res1, inp16, res0, res1); /* last pair uses source rows 15 and 16 */
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST_UB2(res0, res1, dst, dst_stride);
+}
+
+/**
+ * Horizontal qpel pass over 16-byte wide rows.
+ *
+ * For every row, two vectors are loaded (from src and from src + 1) and
+ * handed to APPLY_HORIZ_QPEL_NO_ROUND_FILTER together with a byte-reversal
+ * mask and the coefficients 20, 6 and 3. The filter output is averaged with
+ * the vector loaded from src via __msa_ave_u_b and stored to dst.
+ *
+ * Rows are handled four at a time for (height >> 2) iterations, followed by
+ * one extra row, so 4 * (height >> 2) + 1 rows are written in total.
+ *
+ * @param src         first source row
+ * @param src_stride  distance between source rows, in bytes
+ * @param dst         first destination row
+ * @param dst_stride  distance between destination rows, in bytes
+ * @param height      row count driving the four-row loop; the iteration
+ *                    counter is an 8-bit value
+ */
+static void hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int32_t height)
+{
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v16u8 cur0, cur1, cur2, cur3;
+    v16u8 nxt0, nxt1, nxt2, nxt3;
+    v16u8 avg;
+    uint8_t groups_left = (height >> 2);
+
+    while (groups_left--) {
+        /* All eight loads of this group happen before the first store. */
+        LD_UB4(src, src_stride, cur0, cur1, cur2, cur3);
+        LD_UB4((src + 1), src_stride, nxt0, nxt1, nxt2, nxt3);
+        src += (4 * src_stride);
+
+        avg = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(cur0, nxt0, mask,
+                                               const20, const6, const3);
+        avg = __msa_ave_u_b(cur0, avg);
+        ST_UB(avg, dst);
+        dst += dst_stride;
+
+        avg = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(cur1, nxt1, mask,
+                                               const20, const6, const3);
+        avg = __msa_ave_u_b(cur1, avg);
+        ST_UB(avg, dst);
+        dst += dst_stride;
+
+        avg = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(cur2, nxt2, mask,
+                                               const20, const6, const3);
+        avg = __msa_ave_u_b(cur2, avg);
+        ST_UB(avg, dst);
+        dst += dst_stride;
+
+        avg = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(cur3, nxt3, mask,
+                                               const20, const6, const3);
+        avg = __msa_ave_u_b(cur3, avg);
+        ST_UB(avg, dst);
+        dst += dst_stride;
+    }
+
+    /* One trailing row beyond the four-row groups. */
+    cur0 = LD_UB(src);
+    nxt0 = LD_UB(src + 1);
+    avg = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(cur0, nxt0, mask,
+                                           const20, const6, const3);
+    avg = __msa_ave_u_b(cur0, avg);
+    ST_UB(avg, dst);
+}
+
+static void hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(const uint8_t *src,
+                                                      int32_t src_stride,
+                                                      uint8_t *dst,
+                                                      int32_t dst_stride)
+{ /* 16x16 two-pass variant: src-averaged horizontal pass, then the aver_src0 vertical pass. */
+    uint8_t buff[272]; /* 272 = 17 * 16: the horizontal pass stores 16 + 1 rows at stride 16 */
+    /* Pass 1 fills buff (stride 16, height 16); pass 2 reads buff and writes dst. */
+    hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_no_rnd_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{ /* 8x8 h+v no-round qpel: horizontal results averaged with src, vertical with horizN rows. */
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    /* Source rows are consumed in pairs; horiz0..horiz8 hold the nine horizontally filtered rows. */
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); /* low doublewords of both rows */
+    horiz0 = __msa_ave_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); /* replicate doubleword 1 (second row) */
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_ave_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_ave_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                                 horiz1, horiz2, horiz3, horiz4,
+                                                 horiz1, horiz0, horiz0, horiz1,
+                                                 horiz2, horiz3, horiz4, horiz5,
+                                                 const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
+    res0 = __msa_ave_u_b(avg0, res0); /* vertical result averaged with rows horiz0/horiz1 */
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_ave_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src); /* ninth source row, filtered on its own */
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
+                                                       mask2, mask3, const20,
+                                                       const6, const3);
+    horiz8 = __msa_ave_u_b(inp0, res0);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                                 horiz3, horiz4, horiz5, horiz6,
+                                                 horiz3, horiz2, horiz1, horiz0,
+                                                 horiz4, horiz5, horiz6, horiz7,
+                                                 const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
+    res1 = __msa_ave_u_b(avg1, res1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                                 horiz5, horiz6, horiz7, horiz8,
+                                                 horiz5, horiz4, horiz3, horiz2,
+                                                 horiz6, horiz7, horiz8, horiz8,
+                                                 const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
+    res0 = __msa_ave_u_b(avg0, res0);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                                 horiz7, horiz8, horiz8, horiz7,
+                                                 horiz7, horiz6, horiz5, horiz4,
+                                                 horiz8, horiz8, horiz7, horiz6,
+                                                 const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+    /* Last two output rows; repeated horiz8/horiz7 arguments presumably mirror the bottom edge. */
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
+    res1 = __msa_ave_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_horiz_16x16_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int32_t height)
+{ /* Horizontal no-round qpel filter pass; the filter output is stored without averaging. */
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; /* byte indices, reversed */
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+    /* Four rows per iteration plus one row after the loop: (height >> 2) * 4 + 1 stores to dst. */
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); /* same rows, read from src + 1 */
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
+                                               const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
+                                               const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
+                                               const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
+                                               const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+    /* Trailing extra row; stride argument 1 presumably yields the src and src + 1 vectors. */
+    LD_UB2(src, 1, inp0, inp1);
+    res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
+                                           const20, const6, const3);
+    ST_UB(res, dst); /* dst is not advanced after this final store */
+}
+
+static void hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{ /* 16x16 two-pass variant: plain horizontal pass, then the aver_src0 vertical pass. */
+    uint8_t buff[272]; /* 272 = 17 * 16: the horizontal pass stores 16 + 1 rows at stride 16 */
+    /* Pass 1 fills buff (stride 16, height 16); pass 2 reads buff and writes dst. */
+    hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_no_rnd_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride)
+{ /* 8x8 h+v no-round qpel: plain horizontal pass, vertical results averaged with horizN rows. */
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    /* Source rows are consumed in pairs; horiz0..horiz8 hold the nine horizontally filtered rows. */
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); /* replicate doubleword 1 (second row) */
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                                 horiz1, horiz2, horiz3, horiz4,
+                                                 horiz1, horiz0, horiz0, horiz1,
+                                                 horiz2, horiz3, horiz4, horiz5,
+                                                 const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
+    res0 = __msa_ave_u_b(avg0, res0); /* vertical result averaged with rows horiz0/horiz1 */
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src); /* ninth source row, filtered on its own */
+    horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
+                                                         mask2, mask3, const20,
+                                                         const6, const3);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                                 horiz3, horiz4, horiz5, horiz6,
+                                                 horiz3, horiz2, horiz1, horiz0,
+                                                 horiz4, horiz5, horiz6, horiz7,
+                                                 const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
+    res1 = __msa_ave_u_b(avg1, res1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); /* NOTE(review): redundant; */
+    res0 = __msa_ave_u_b(avg0, res0); /* res0 was already stored and is reassigned before reuse */
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                                 horiz5, horiz6, horiz7, horiz8,
+                                                 horiz5, horiz4, horiz3, horiz2,
+                                                 horiz6, horiz7, horiz8, horiz8,
+                                                 const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
+    res0 = __msa_ave_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+    /* Last two output rows; repeated horiz8/horiz7 arguments presumably mirror the bottom edge. */
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                                 horiz7, horiz8, horiz8, horiz7,
+                                                 horiz7, horiz6, horiz5, horiz4,
+                                                 horiz8, horiz8, horiz7, horiz6,
+                                                 const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
+    res1 = __msa_ave_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int32_t height)
+{ /* Horizontal no-round qpel filter pass; each filtered row is averaged with the row at src + 1. */
+    uint8_t loop_count;
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; /* byte indices, reversed */
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+    /* Four rows per iteration plus one row after the loop: (height >> 2) * 4 + 1 stores to dst. */
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7); /* same rows, read from src + 1 */
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(res, inp1); /* "src1": average with the vector loaded at src + 1 */
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(res, inp3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(res, inp5);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
+                                               const20, const6, const3);
+        res = __msa_ave_u_b(res, inp7);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+    /* Trailing extra row; stride argument 1 presumably yields the src and src + 1 vectors. */
+    LD_UB2(src, 1, inp0, inp1);
+    res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
+                                           const20, const6, const3);
+    res = __msa_ave_u_b(inp1, res);
+    ST_UB(res, dst); /* dst is not advanced after this final store */
+}
+
+static void hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(const uint8_t *src,
+                                                      int32_t src_stride,
+                                                      uint8_t *dst,
+                                                      int32_t dst_stride)
+{ /* 16x16 two-pass variant: (src + 1)-averaged horizontal pass, then aver_src0 vertical pass. */
+    uint8_t buff[272]; /* 272 = 17 * 16: the horizontal pass stores 16 + 1 rows at stride 16 */
+    /* Pass 1 fills buff (stride 16, height 16); pass 2 reads buff and writes dst. */
+    hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_no_rnd_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{ /* 8x8 h+v no-round qpel: horizontal results averaged with byte-shifted src, vertical with horizN. */
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    /* Source rows are consumed in pairs; horiz0..horiz8 hold the nine horizontally filtered rows. */
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+    /* Shift amount 1 presumably aligns each row to src + 1 before the average (TODO confirm). */
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); /* pack both rows' low halves */
+    horiz0 = __msa_ave_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); /* replicate doubleword 1 (second row) */
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz2 = __msa_ave_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz4 = __msa_ave_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                                 horiz1, horiz2, horiz3, horiz4,
+                                                 horiz1, horiz0, horiz0, horiz1,
+                                                 horiz2, horiz3, horiz4, horiz5,
+                                                 const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
+    res0 = __msa_ave_u_b(avg0, res0); /* vertical result averaged with rows horiz0/horiz1 */
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz6 = __msa_ave_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src); /* ninth source row, filtered on its own */
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
+                                                       mask2, mask3, const20,
+                                                       const6, const3);
+    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); /* same 1-byte shift, one row */
+    horiz8 = __msa_ave_u_b(inp0, res0);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                                 horiz3, horiz4, horiz5, horiz6,
+                                                 horiz3, horiz2, horiz1, horiz0,
+                                                 horiz4, horiz5, horiz6, horiz7,
+                                                 const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
+    res1 = __msa_ave_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                                 horiz5, horiz6, horiz7, horiz8,
+                                                 horiz5, horiz4, horiz3, horiz2,
+                                                 horiz6, horiz7, horiz8, horiz8,
+                                                 const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
+    res0 = __msa_ave_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+    /* Last two output rows; repeated horiz8/horiz7 arguments presumably mirror the bottom edge. */
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                                 horiz7, horiz8, horiz8, horiz7,
+                                                 horiz7, horiz6, horiz5, horiz4,
+                                                 horiz8, horiz8, horiz7, horiz6,
+                                                 const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
+    res1 = __msa_ave_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{ /* 16x16 two-pass variant: src-averaged horizontal pass, then the plain vertical pass. */
+    uint8_t buff[272]; /* 272 = 17 * 16: the horizontal pass stores 16 + 1 rows at stride 16 */
+    /* Pass 1 fills buff (stride 16, height 16); pass 2 reads buff and writes dst. */
+    hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_no_rnd_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride)
+{ /* 8x8 h+v no-round qpel: horizontal results averaged with src, vertical results stored as-is. */
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    /* Source rows are consumed in pairs; horiz0..horiz8 hold the nine horizontally filtered rows. */
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); /* low doublewords of both rows */
+    horiz0 = __msa_ave_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); /* replicate doubleword 1 (second row) */
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_ave_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_ave_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                                 horiz1, horiz2, horiz3, horiz4,
+                                                 horiz1, horiz0, horiz0, horiz1,
+                                                 horiz2, horiz3, horiz4, horiz5,
+                                                 const20, const6, const3);
+    /* The next source pair is loaded before the first two output rows are stored. */
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_ave_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src); /* ninth source row, filtered on its own */
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
+                                                       mask2, mask3, const20,
+                                                       const6, const3);
+    horiz8 = __msa_ave_u_b(inp0, res0);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                                 horiz3, horiz4, horiz5, horiz6,
+                                                 horiz3, horiz2, horiz1, horiz0,
+                                                 horiz4, horiz5, horiz6, horiz7,
+                                                 const20, const6, const3);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                                 horiz5, horiz6, horiz7, horiz8,
+                                                 horiz5, horiz4, horiz3, horiz2,
+                                                 horiz6, horiz7, horiz8, horiz8,
+                                                 const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+    /* Last two output rows; repeated horiz8/horiz7 arguments presumably mirror the bottom edge. */
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                                 horiz7, horiz8, horiz8, horiz7,
+                                                 horiz7, horiz6, horiz5, horiz4,
+                                                 horiz8, horiz8, horiz7, horiz6,
+                                                 const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_16x16_msa(const uint8_t *src,
+                                        int32_t src_stride,
+                                        uint8_t *dst,
+                                        int32_t dst_stride)
+{ /* 16x16 two-pass variant: plain horizontal pass, then the plain vertical pass. */
+    uint8_t buff[272]; /* 272 = 17 * 16: the horizontal pass stores 16 + 1 rows at stride 16 */
+    /* Pass 1 fills buff (stride 16, height 16); pass 2 reads buff and writes dst. */
+    hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_no_rnd_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_8x8_msa(const uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride)
+{ /* 8x8 h+v no-round qpel: horizontal then vertical filter, with no averaging at either stage. */
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    /* Source rows are consumed in pairs; horiz0..horiz8 hold the nine horizontally filtered rows. */
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); /* replicate doubleword 1 (second row) */
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                                 horiz1, horiz2, horiz3, horiz4,
+                                                 horiz1, horiz0, horiz0, horiz1,
+                                                 horiz2, horiz3, horiz4, horiz5,
+                                                 const20, const6, const3);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src); /* ninth source row, filtered on its own */
+    horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
+                                                         mask2, mask3, const20,
+                                                         const6, const3);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                                 horiz3, horiz4, horiz5, horiz6,
+                                                 horiz3, horiz2, horiz1, horiz0,
+                                                 horiz4, horiz5, horiz6, horiz7,
+                                                 const20, const6, const3);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                                 horiz5, horiz6, horiz7, horiz8,
+                                                 horiz5, horiz4, horiz3, horiz2,
+                                                 horiz6, horiz7, horiz8, horiz8,
+                                                 const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    /* Last two output rows; repeated horiz8/horiz7 arguments presumably mirror the bottom edge. */
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                                 horiz7, horiz8, horiz8, horiz7,
+                                                 horiz7, horiz6, horiz5, horiz4,
+                                                 horiz8, horiz8, horiz7, horiz6,
+                                                 const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{
+    /* 17 * 16 = 272-byte scratch shared by the two calls below (stride 16). */
+    uint8_t tmp_rows[17 * 16];
+    hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, tmp_rows, 16, 16);
+    vert_mc_qpel_no_rnd_16x16_msa(tmp_rows, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride)
+{   /* 8x8, two stages: horizontal macro + average with the rows slid by one byte, then the vertical macro. */
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;   /* stage-1 rows handed to the vertical macro */
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };   /* byte-index tables for the horizontal macros */
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);   /* 20, 6, 3 in every byte lane; passed last to each macro */
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    /* Nine source rows are consumed: four pairs via LD_UB2 plus one single LD_UB row. */
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);   /* presumably slides each row by 1 byte (src + 1) - confirm macro */
+    /* Doubleword 0 of inp1 goes into doubleword 1 of inp0, presumably matching the two-row layout of res0. */
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz0 = __msa_ave_u_b(inp0, res0);   /* ave_u_b: unsigned byte average without the rounding +1 */
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);   /* second row of the pair = upper doubleword */
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+    /* Rows 2-3: same packing and averaging as rows 0-1. */
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz2 = __msa_ave_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+    /* Rows 4-5. */
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz4 = __msa_ave_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                                 horiz1, horiz2, horiz3, horiz4,
+                                                 horiz1, horiz0, horiz0, horiz1,
+                                                 horiz2, horiz3, horiz4, horiz5,
+                                                 const20, const6, const3);   /* first vertical result; horiz0 is repeated at the top edge */
+    LD_UB2(src, src_stride, inp2, inp3);   /* rows 6-7 are fetched before the first store */
+    src += (2 * src_stride);
+    ST8x2_UB(res0, dst, dst_stride);   /* first store at dst; dst then advances two rows */
+    dst += 2 * dst_stride;
+    /* Rows 6-7 of stage 1. */
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz6 = __msa_ave_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src);   /* ninth and last source row */
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
+                                                       mask2, mask3, const20,
+                                                       const6, const3);
+    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);   /* single-row counterpart of the 1-byte slide */
+    horiz8 = __msa_ave_u_b(inp0, res0);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                                 horiz3, horiz4, horiz5, horiz6,
+                                                 horiz3, horiz2, horiz1, horiz0,
+                                                 horiz4, horiz5, horiz6, horiz7,
+                                                 const20, const6, const3);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                                 horiz5, horiz6, horiz7, horiz8,
+                                                 horiz5, horiz4, horiz3, horiz2,
+                                                 horiz6, horiz7, horiz8, horiz8,
+                                                 const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);   /* second store (two rows below the first) */
+    dst += 2 * dst_stride;
+    /* Last vertical call: horiz8 and horiz7 are repeated where the window would run past row 8. */
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                                 horiz7, horiz8, horiz8, horiz7,
+                                                 horiz7, horiz6, horiz5, horiz4,
+                                                 horiz8, horiz8, horiz7, horiz6,
+                                                 const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);   /* third store */
+    dst += 2 * dst_stride;
+    ST8x2_UB(res1, dst, dst_stride);   /* fourth and final store */
+}
+
+static void hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(const uint8_t *src,
+                                                      int32_t src_stride,
+                                                      uint8_t *dst,
+                                                      int32_t dst_stride)
+{
+    /* 17 * 16 = 272-byte scratch shared by the two calls below (stride 16). */
+    uint8_t tmp_rows[17 * 16];
+    hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, tmp_rows, 16, 16);
+    vert_mc_qpel_no_rnd_aver_src1_16x16_msa(tmp_rows, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{   /* 8x8, two stages: horizontal macro averaged with the unshifted source, vertical macro averaged with the following stage-1 row. */
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;   /* stage-1 rows handed to the vertical macro */
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };   /* byte-index tables for the horizontal macros */
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);   /* 20, 6, 3 in every byte lane; passed last to each macro */
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    /* Nine source rows are consumed: four pairs via LD_UB2 plus one single LD_UB row. */
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);   /* low doublewords of inp0 and inp1 in one vector */
+    horiz0 = __msa_ave_u_b(inp0, res0);   /* ave_u_b: unsigned byte average without the rounding +1 */
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);   /* second row of the pair = upper doubleword */
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_ave_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_ave_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                                 horiz1, horiz2, horiz3, horiz4,
+                                                 horiz1, horiz0, horiz0, horiz1,
+                                                 horiz2, horiz3, horiz4, horiz5,
+                                                 const20, const6, const3);   /* first vertical result; horiz0 is repeated at the top edge */
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);   /* stage-1 rows 1 and 2 */
+    res0 = __msa_ave_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);   /* first store at dst; dst then advances two rows */
+    dst += (2 * dst_stride);
+    /* Rows 6-7 of stage 1. */
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_ave_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src);   /* ninth and last source row */
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
+                                                       mask2, mask3, const20,
+                                                       const6, const3);
+    horiz8 = __msa_ave_u_b(inp0, res0);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                                 horiz3, horiz4, horiz5, horiz6,
+                                                 horiz3, horiz2, horiz1, horiz0,
+                                                 horiz4, horiz5, horiz6, horiz7,
+                                                 const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);   /* stage-1 rows 3 and 4 */
+    res1 = __msa_ave_u_b(avg1, res1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                                 horiz5, horiz6, horiz7, horiz8,
+                                                 horiz5, horiz4, horiz3, horiz2,
+                                                 horiz6, horiz7, horiz8, horiz8,
+                                                 const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);   /* second store */
+    dst += 2 * dst_stride;
+
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);   /* stage-1 rows 5 and 6 */
+    res0 = __msa_ave_u_b(avg0, res0);
+    /* Last vertical call: horiz8 and horiz7 are repeated where the window would run past row 8. */
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                                 horiz7, horiz8, horiz8, horiz7,
+                                                 horiz7, horiz6, horiz5, horiz4,
+                                                 horiz8, horiz8, horiz7, horiz6,
+                                                 const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);   /* third store */
+    dst += 2 * dst_stride;
+
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);   /* stage-1 rows 7 and 8 */
+    res1 = __msa_ave_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);   /* fourth and final store */
+}
+
+static void hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{
+    /* 17 * 16 = 272-byte scratch shared by the two calls below (stride 16). */
+    uint8_t tmp_rows[17 * 16];
+    hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, tmp_rows, 16, 16);
+    vert_mc_qpel_no_rnd_aver_src1_16x16_msa(tmp_rows, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride)
+{   /* 8x8, two stages: plain horizontal macro, then vertical macro averaged with the following stage-1 row. */
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;   /* stage-1 rows handed to the vertical macro */
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };   /* byte-index tables for the horizontal macros */
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);   /* 20, 6, 3 in every byte lane; passed last to each macro */
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    /* Nine source rows are consumed: four pairs via LD_UB2 plus one single LD_UB row. */
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);   /* no source averaging in stage 1 here */
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);   /* second row of the pair = upper doubleword */
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                                 horiz1, horiz2, horiz3, horiz4,
+                                                 horiz1, horiz0, horiz0, horiz1,
+                                                 horiz2, horiz3, horiz4, horiz5,
+                                                 const20, const6, const3);   /* first vertical result; horiz0 is repeated at the top edge */
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);   /* stage-1 rows 1 and 2 */
+    res0 = __msa_ave_u_b(avg0, res0);   /* ave_u_b: unsigned byte average without the rounding +1 */
+    LD_UB2(src, src_stride, inp2, inp3);   /* rows 6-7 are fetched before the first store */
+    src += (2 * src_stride);
+    ST8x2_UB(res0, dst, dst_stride);   /* first store at dst; dst then advances two rows */
+    dst += 2 * dst_stride;
+    /* Rows 6-7 of stage 1. */
+    horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                    mask2, mask3, const20,
+                                                    const6, const3);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                                 horiz3, horiz4, horiz5, horiz6,
+                                                 horiz3, horiz2, horiz1, horiz0,
+                                                 horiz4, horiz5, horiz6, horiz7,
+                                                 const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);   /* stage-1 rows 3 and 4 */
+    res1 = __msa_ave_u_b(avg1, res1);
+    inp0 = LD_UB(src);   /* ninth and last source row */
+    horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
+                                                         mask2, mask3, const20,
+                                                         const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);   /* second store */
+    dst += 2 * dst_stride;
+
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                                 horiz5, horiz6, horiz7, horiz8,
+                                                 horiz5, horiz4, horiz3, horiz2,
+                                                 horiz6, horiz7, horiz8, horiz8,
+                                                 const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);   /* stage-1 rows 5 and 6 */
+    res0 = __msa_ave_u_b(avg0, res0);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                                 horiz7, horiz8, horiz8, horiz7,
+                                                 horiz7, horiz6, horiz5, horiz4,
+                                                 horiz8, horiz8, horiz7, horiz6,
+                                                 const20, const6, const3);   /* horiz8 and horiz7 are repeated past the bottom edge */
+    ST8x2_UB(res0, dst, dst_stride);   /* third store */
+    dst += 2 * dst_stride;
+
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);   /* stage-1 rows 7 and 8 */
+    res1 = __msa_ave_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);   /* fourth and final store */
+}
+
+static void hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(const uint8_t *src,
+                                                      int32_t src_stride,
+                                                      uint8_t *dst,
+                                                      int32_t dst_stride)
+{
+    /* 17 * 16 = 272-byte scratch shared by the two calls below (stride 16). */
+    uint8_t tmp_rows[17 * 16];
+    hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, tmp_rows, 16, 16);
+    vert_mc_qpel_no_rnd_aver_src1_16x16_msa(tmp_rows, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    uint8_t *dst,
+                                                    int32_t dst_stride)
+{   /* 8x8, two stages: horizontal macro averaged with the rows slid by one byte, vertical macro averaged with the following stage-1 row. */
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;   /* stage-1 rows handed to the vertical macro */
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };   /* byte-index tables for the horizontal macros */
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);   /* 20, 6, 3 in every byte lane; passed last to each macro */
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    /* Nine source rows are consumed: four pairs via LD_UB2 plus one single LD_UB row. */
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);   /* presumably slides each row by 1 byte (src + 1) - confirm macro */
+    /* Doubleword 0 of inp1 goes into doubleword 1 of inp0, presumably matching the two-row layout of res0. */
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz0 = __msa_ave_u_b(inp0, res0);   /* ave_u_b: unsigned byte average without the rounding +1 */
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);   /* second row of the pair = upper doubleword */
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+    /* Rows 2-3: same packing and averaging as rows 0-1. */
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz2 = __msa_ave_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    /* Rows 4-5. */
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz4 = __msa_ave_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                                 horiz1, horiz2, horiz3, horiz4,
+                                                 horiz1, horiz0, horiz0, horiz1,
+                                                 horiz2, horiz3, horiz4, horiz5,
+                                                 const20, const6, const3);   /* first vertical result; horiz0 is repeated at the top edge */
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);   /* stage-1 rows 1 and 2 */
+    res0 = __msa_ave_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);   /* first store at dst; dst then advances two rows */
+    dst += (2 * dst_stride);
+    /* Rows 6-7 of stage 1. */
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
+                                                  mask2, mask3, const20,
+                                                  const6, const3);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz6 = __msa_ave_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                                 horiz3, horiz4, horiz5, horiz6,
+                                                 horiz3, horiz2, horiz1, horiz0,
+                                                 horiz4, horiz5, horiz6, horiz7,
+                                                 const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);   /* stage-1 rows 3 and 4 */
+    res1 = __msa_ave_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);   /* second store */
+    dst += (2 * dst_stride);
+    /* Ninth source row, then the two remaining vertical results are stored together. */
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
+                                                       mask2, mask3, const20,
+                                                       const6, const3);
+    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);   /* single-row counterpart of the 1-byte slide */
+    horiz8 = __msa_ave_u_b(inp0, res0);
+    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                                 horiz5, horiz6, horiz7, horiz8,
+                                                 horiz5, horiz4, horiz3, horiz2,
+                                                 horiz6, horiz7, horiz8, horiz8,
+                                                 const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                                 horiz7, horiz8, horiz8, horiz7,
+                                                 horiz7, horiz6, horiz5, horiz4,
+                                                 horiz8, horiz8, horiz7, horiz6,
+                                                 const20, const6, const3);   /* horiz8 and horiz7 are repeated past the bottom edge */
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);   /* stage-1 rows 5 and 6 */
+    res0 = __msa_ave_u_b(avg0, res0);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);   /* stage-1 rows 7 and 8 */
+    res1 = __msa_ave_u_b(avg1, res1);
+    ST8x4_UB(res0, res1, dst, dst_stride);   /* single final store of both vectors (macro defined elsewhere) */
+}
+
+static void hv_mc_qpel_aver_horiz_src0_16x16_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 int32_t height)
+{
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    uint8_t quads_left = height >> 2;
+    /* Each pass loads four rows at src (even inpN) and at src + 1 (odd inpN). */
+    while (quads_left--) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += 4 * src_stride;
+        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1,
+                                      mask, const20, const6, const3);
+        res = __msa_aver_u_b(inp0, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+        /* second row of the group, averaged with its own row loaded at src */
+        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3,
+                                      mask, const20, const6, const3);
+        res = __msa_aver_u_b(inp2, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+        /* third row of the group */
+        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5,
+                                      mask, const20, const6, const3);
+        res = __msa_aver_u_b(inp4, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+        /* fourth row of the group */
+        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7,
+                                      mask, const20, const6, const3);
+        res = __msa_aver_u_b(inp6, res);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+    /* One trailing row is produced after the loop (no dst advance afterwards). */
+    LD_UB2(src, 1, inp0, inp1);
+    res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3);
+    res = __msa_aver_u_b(inp0, res);
+    ST_UB(res, dst);
+}
+
+static void hv_mc_qpel_aver_hv_src00_16x16_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst,
+                                               int32_t dst_stride)
+{
+    /* 17 rows of 16 bytes: the horizontal helper stores 16 loop rows plus one. */
+    uint8_t tmp_rows[17 * 16];
+    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, tmp_rows, 16, 16);
+    vert_mc_qpel_aver_src0_16x16_msa(tmp_rows, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_hv_src00_8x8_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride)
+{   /* 8x8, two stages: horizontal macro averaged with the unshifted source, vertical macro averaged with the same-index stage-1 row. */
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;   /* stage-1 rows handed to the vertical macro */
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };   /* byte-index tables for the horizontal macros */
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);   /* 20, 6, 3 in every byte lane; passed last to each macro */
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    /* Nine source rows are consumed: four via LD_UB4, two pairs via LD_UB2, one single LD_UB row. */
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);   /* low doublewords of inp0 and inp1 in one vector */
+    horiz0 = __msa_aver_u_b(inp0, res0);   /* aver_u_b: unsigned byte average with rounding */
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);   /* second row of the pair = upper doubleword */
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);   /* first vertical result; horiz0 is repeated at the top edge */
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);   /* stage-1 rows 0 and 1 */
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);   /* first store at dst; dst then advances two rows */
+    dst += (2 * dst_stride);
+    /* Rows 6-7 of stage 1. */
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);   /* stage-1 rows 2 and 3 */
+    res1 = __msa_aver_u_b(avg1, res1);
+    /* Ninth and last source row. */
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    ST8x2_UB(res1, dst, dst_stride);   /* second store */
+    dst += 2 * dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);   /* stage-1 rows 4 and 5 */
+    res0 = __msa_aver_u_b(avg0, res0);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);   /* horiz8 and horiz7 are repeated past the bottom edge */
+    ST8x2_UB(res0, dst, dst_stride);   /* third store */
+    dst += 2 * dst_stride;
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);   /* stage-1 rows 6 and 7 */
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);   /* fourth and final store */
+}
+
+static void hv_mc_qpel_aver_horiz_16x16_msa(const uint8_t *src,
+                                            int32_t src_stride,
+                                            uint8_t *dst,
+                                            int32_t dst_stride,
+                                            int32_t height)
+{
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+    v8u16 const20 = (v8u16) __msa_ldi_h(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    uint8_t quads_left = height >> 2;
+    /* Each pass loads four rows at src (even inpN) and at src + 1 (odd inpN). */
+    while (quads_left--) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += 4 * src_stride;
+        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1,
+                                      mask, const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+        /* second row of the group; the macro result is stored as-is */
+        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3,
+                                      mask, const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+        /* third row of the group */
+        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5,
+                                      mask, const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+        /* fourth row of the group */
+        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7,
+                                      mask, const20, const6, const3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+    /* One trailing row is produced after the loop (no dst advance afterwards). */
+    LD_UB2(src, 1, inp0, inp1);
+    res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3);
+    ST_UB(res, dst);
+}
+
+static void hv_mc_qpel_aver_v_src0_16x16_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    /* 17 rows of 16 bytes: the horizontal helper stores 16 loop rows plus one. */
+    uint8_t tmp_rows[17 * 16];
+    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, tmp_rows, 16, 16);
+    vert_mc_qpel_aver_src0_16x16_msa(tmp_rows, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_v_src0_8x8_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst,
+                                           int32_t dst_stride)
+{   /* 8x8: horizontal filter on 9 rows, vertical filter, then average with horiz rows N, N + 1. */
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    /* Source rows are consumed two at a time; each pair yields one filtered vector. */
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); /* upper doubleword of horiz0 replicated */
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); /* low halves of horiz0, horiz1 */
+    res0 = __msa_aver_u_b(avg0, res0); /* unsigned byte average of vertical result and horiz rows */
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    inp0 = LD_UB(src); /* src has advanced 8 rows: this is the ninth source row */
+    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
+                                                mask0, mask1, mask2, mask3,
+                                                const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
+    res1 = __msa_aver_u_b(avg1, res1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
+    res0 = __msa_aver_u_b(avg0, res0);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride); /* final store; dst is not advanced afterwards */
+}
+
+static void hv_mc_qpel_aver_horiz_src1_16x16_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 int32_t height)
+{   /* Horizontal qpel pass on 16-byte rows, each result averaged with the src + 1 vector. */
+    uint8_t loop_count; /* 8-bit counter: (height >> 2) is truncated on assignment */
+    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
+    v16u8 res;
+    v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; /* descending byte indices */
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    v8u16 const20 = (v8u16) __msa_ldi_h(20); /* halfword lanes, unlike const6/const3 */
+    /* Even-numbered inp vectors are loaded from src, odd-numbered ones from src + 1. */
+    for (loop_count = (height >> 2); loop_count--;) {
+        LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
+        LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
+        src += (4 * src_stride);
+        res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(res, inp1); /* average filtered row with the src + 1 load */
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(res, inp3);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(res, inp5);
+        ST_UB(res, dst);
+        dst += dst_stride;
+
+        res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
+                                      const20, const6, const3);
+        res = __msa_aver_u_b(res, inp7);
+        ST_UB(res, dst);
+        dst += dst_stride;
+    }
+    /* One more row is filtered, averaged and stored after the loop; dst is not advanced. */
+    LD_UB2(src, 1, inp0, inp1); /* NOTE(review): stride 1, presumably inp0 = src, inp1 = src + 1 */
+    res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3);
+    res = __msa_aver_u_b(inp1, res); /* same operands as in the loop, order swapped */
+    ST_UB(res, dst);
+}
+
+/* 16x16 two-pass qpel: horizontal src1-averaging pass, then vertical src0-averaging pass. */
+static void hv_mc_qpel_aver_hv_src10_16x16_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride)
+{
+    enum { BLK_SIZE = 16 };                 /* block height and scratch row stride */
+    uint8_t tmp[(BLK_SIZE + 1) * BLK_SIZE]; /* 17 rows of 16 bytes = 272 */
+    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, tmp, BLK_SIZE, BLK_SIZE);
+    vert_mc_qpel_aver_src0_16x16_msa(tmp, BLK_SIZE, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_hv_src10_8x8_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride)
+{   /* 8x8: horizontal filter averaged with shifted source, then vertical filter averaged with horiz rows. */
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    /* Source rows 0-3: filter first, then shift the raw rows for the averaging step. */
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); /* NOTE(review): presumably a 1-byte slide (x + 1) - confirm */
+
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); /* low halves of both rows in one vector */
+    horiz0 = __msa_aver_u_b(inp0, res0); /* average filtered rows with the shifted source */
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); /* upper doubleword of horiz0 replicated */
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); /* source rows 4-7 */
+    src += (4 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); /* low halves of horiz0, horiz1 */
+    res0 = __msa_aver_u_b(avg0, res0); /* average vertical result with the horiz rows */
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    inp0 = LD_UB(src); /* src has advanced 8 rows: this is the ninth source row */
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
+    res1 = __msa_aver_u_b(avg1, res1);
+    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); /* same 1-byte slide for the single row */
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
+    res0 = __msa_aver_u_b(avg0, res0);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride); /* final store; dst is not advanced afterwards */
+}
+
+/* 16x16 two-pass qpel: horizontal src0-averaging pass into scratch, then plain vertical pass. */
+static void hv_mc_qpel_aver_h_src0_16x16_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst, int32_t dst_stride)
+{
+    enum { BLK_SIZE = 16 };                 /* block height and scratch row stride */
+    uint8_t tmp[(BLK_SIZE + 1) * BLK_SIZE]; /* 272 bytes; presumably 17 rows of 16 - confirm in helper */
+    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, tmp, BLK_SIZE, BLK_SIZE);
+    vert_mc_qpel_16x16_msa(tmp, BLK_SIZE, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_h_src0_8x8_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst,
+                                           int32_t dst_stride)
+{   /* 8x8: horizontal filter averaged with the unshifted source rows, then plain vertical filter. */
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    /* Source rows are consumed two at a time; each pair yields one filtered vector. */
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); /* low halves of both source rows */
+    horiz0 = __msa_aver_u_b(inp0, res0); /* average filtered rows with the source rows */
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); /* upper doubleword of horiz0 replicated */
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    inp0 = LD_UB(src); /* src has advanced 8 rows: this is the ninth source row */
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+    ST8x2_UB(res1, dst, dst_stride); /* final store; dst is not advanced afterwards */
+}
+
+/* 16x16 two-pass qpel: horizontal filter into scratch, then plain vertical pass to dst. */
+static void hv_mc_qpel_16x16_msa(const uint8_t *src,
+                                 int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride)
+{
+    enum { BLK_SIZE = 16 };                 /* block height and scratch row stride */
+    uint8_t tmp[(BLK_SIZE + 1) * BLK_SIZE]; /* 17 rows of 16 bytes = 272 */
+    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, tmp, BLK_SIZE, BLK_SIZE);
+    vert_mc_qpel_16x16_msa(tmp, BLK_SIZE, dst, dst_stride);
+}
+
+static void hv_mc_qpel_8x8_msa(const uint8_t *src, int32_t src_stride,
+                               uint8_t *dst, int32_t dst_stride)
+{   /* 8x8: horizontal filter on 9 source rows, then vertical filter; no averaging step. */
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    /* Source rows are consumed two at a time; each pair yields one filtered vector. */
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); /* upper doubleword of horiz0 replicated */
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    inp0 = LD_UB(src); /* src has advanced 8 rows: this is the ninth source row */
+    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
+                                                mask0, mask1, mask2, mask3,
+                                                const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+    ST8x2_UB(res1, dst, dst_stride); /* final store; dst is not advanced afterwards */
+}
+
+/* 16x16 two-pass qpel: horizontal src1-averaging pass into scratch, then plain vertical pass. */
+static void hv_mc_qpel_aver_h_src1_16x16_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst, int32_t dst_stride)
+{
+    enum { BLK_SIZE = 16 };                 /* block height and scratch row stride */
+    uint8_t tmp[(BLK_SIZE + 1) * BLK_SIZE]; /* 17 rows of 16 bytes = 272 */
+    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, tmp, BLK_SIZE, BLK_SIZE);
+    vert_mc_qpel_16x16_msa(tmp, BLK_SIZE, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_h_src1_8x8_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst,
+                                           int32_t dst_stride)
+{   /* 8x8: horizontal filter averaged with the shifted source rows, then plain vertical filter. */
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    /* All nine horizontal rows are produced first; the vertical passes follow. */
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); /* NOTE(review): presumably a 1-byte slide (x + 1) - confirm */
+
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); /* low halves of both rows in one vector */
+    horiz0 = __msa_aver_u_b(inp0, res0); /* average filtered rows with the shifted source */
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); /* upper doubleword of horiz0 replicated */
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); /* source rows 4-7 */
+    src += (4 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src); /* src has advanced 8 rows: this is the ninth source row */
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); /* same 1-byte slide for the single row */
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride); /* final store; dst is not advanced afterwards */
+}
+
+/* 16x16 two-pass qpel: horizontal src0-averaging pass, then vertical src1-averaging pass. */
+static void hv_mc_qpel_aver_hv_src01_16x16_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst, int32_t dst_stride)
+{
+    enum { BLK_SIZE = 16 };                 /* block height and scratch row stride */
+    uint8_t tmp[(BLK_SIZE + 1) * BLK_SIZE]; /* 272 bytes; presumably 17 rows of 16 - confirm in helper */
+    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, tmp, BLK_SIZE, BLK_SIZE);
+    vert_mc_qpel_aver_src1_16x16_msa(tmp, BLK_SIZE, dst, dst_stride);
+}
+
+/* 8x8 qpel: horizontal filter averaged with source rows, vertical filter averaged with horiz rows N + 1. */
+static void hv_mc_qpel_aver_hv_src01_8x8_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3); /* source rows 0-3 */
+    src += (4 * src_stride);
+
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); /* low halves of both source rows */
+    horiz0 = __msa_aver_u_b(inp0, res0); /* average filtered rows with the source rows */
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); /* upper doubleword of horiz0 replicated */
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1); /* source rows 4-5 */
+    src += (2 * src_stride);
+
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2); /* low halves of horiz1, horiz2 */
+    res0 = __msa_aver_u_b(avg0, res0); /* average vertical result with the next horiz rows */
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    LD_UB2(src, src_stride, inp2, inp3); /* source rows 6-7 */
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src); /* src has advanced 8 rows: this is the ninth source row */
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_insve_d((v2i64) horiz3, 1, (v2i64) horiz4);
+    res1 = __msa_aver_u_b(avg1, res1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6);
+    res0 = __msa_aver_u_b(avg0, res0);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+
+    avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride); /* final store: no pointer advance needed after it */
+}
+
+/* 16x16 two-pass qpel: horizontal filter into scratch, then vertical src1-averaging pass. */
+static void hv_mc_qpel_aver_v_src1_16x16_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst, int32_t dst_stride)
+{
+    enum { BLK_SIZE = 16 };                 /* block height and scratch row stride */
+    uint8_t tmp[(BLK_SIZE + 1) * BLK_SIZE]; /* 17 rows of 16 bytes = 272 */
+    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, tmp, BLK_SIZE, BLK_SIZE);
+    vert_mc_qpel_aver_src1_16x16_msa(tmp, BLK_SIZE, dst, dst_stride);
+}
+
+/*
+ * 8x8 horizontal-then-vertical qpel pass kept entirely in vector registers;
+ * writes eight 8-byte rows to dst.
+ *
+ * Horizontal stage: nine source rows are loaded (2 + 2 + 2 + 2 + 1), each as
+ * a full v16u8, and run through APPLY_HORIZ_QPEL_FILTER_8BYTE (two rows per
+ * call) or APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW (last row). The even-numbered
+ * horizN comes straight from the macro; the odd-numbered one is the upper
+ * doubleword of that result replicated with __msa_splati_d(..., 1).
+ *
+ * Vertical stage: each APPLY_VERT_QPEL_FILTER_8BYTE call yields two output
+ * rows. The first and last calls repeat horiz0 / horiz8 in their argument
+ * lists (presumably standing in for rows outside 0..8). Every result is then
+ * averaged with __msa_aver_u_b against the horizontal rows one below the
+ * output rows: (1,2), (3,4), (5,6), (7,8).
+ *
+ * src, src_stride: source pointer and row pitch.
+ * dst, dst_stride: destination pointer and row pitch.
+ */
+static void hv_mc_qpel_aver_v_src1_8x8_msa(const uint8_t *src,
+                                           int32_t src_stride,
+                                           uint8_t *dst,
+                                           int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    /* Byte-splat constants handed to both filter macros. */
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    /* Horizontal stage, rows 0-5. */
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+
+    /* Output rows 0-1, averaged with horizontal rows 1 and 2. */
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    /* Horizontal stage, rows 6-7, then output rows 2-3. */
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    /* Horizontal stage, row 8 (single-row macro). */
+    inp0 = LD_UB(src);
+    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
+                                                mask0, mask1, mask2, mask3,
+                                                const20, const6, const3);
+    avg1 = (v16u8) __msa_insve_d((v2i64) horiz3, 1, (v2i64) horiz4);
+    res1 = __msa_aver_u_b(avg1, res1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;
+    /* Output rows 4-5, averaged with horizontal rows 5 and 6. */
+    avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6);
+    res0 = __msa_aver_u_b(avg0, res0);
+
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;
+    /* Output rows 6-7, averaged with horizontal rows 7 and 8. */
+    avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+/*
+ * 16x16 two-pass wrapper: hv_mc_qpel_aver_horiz_src1_16x16_msa is run first
+ * with a stack scratch buffer as its target (stride 16), then
+ * vert_mc_qpel_aver_src1_16x16_msa consumes that buffer and writes dst.
+ */
+static void hv_mc_qpel_aver_hv_src11_16x16_msa(const uint8_t *src,
+                                               int32_t src_stride,
+                                               uint8_t *dst,
+                                               int32_t dst_stride)
+{
+    /* 272 == 17 * 16; presumably 17 intermediate rows of 16 bytes --
+     * TODO confirm against the horizontal helper. */
+    uint8_t horiz_rows[272];
+
+    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, horiz_rows, 16, 16);
+    vert_mc_qpel_aver_src1_16x16_msa(horiz_rows, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_aver_hv_src11_8x8_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst, int32_t dst_stride)
+{   /*
+     * 8x8 horizontal-then-vertical qpel pass kept entirely in vector
+     * registers; writes eight 8-byte rows to dst.
+     *
+     * Horizontal stage: nine source rows are loaded (4 + 2 + 2 + 1). Each
+     * row pair goes through APPLY_HORIZ_QPEL_FILTER_8BYTE, and the result
+     * is averaged (__msa_aver_u_b) with the same source rows shifted by one
+     * byte (SLDI_B2_UB(..., 1), presumably a pairwise __msa_sldi_b) and
+     * packed two rows per vector. The odd-numbered horizN is the upper
+     * doubleword of the even one, replicated by __msa_splati_d(..., 1).
+     *
+     * Vertical stage: each APPLY_VERT_QPEL_FILTER_8BYTE call yields two
+     * output rows, which are averaged with the horizontal rows one below
+     * them: (1,2), (3,4), (5,6), (7,8). The first and last calls repeat
+     * horiz0 / horiz8 (presumably standing in for rows outside 0..8).
+     */
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    /* Horizontal stage, rows 0-3 (all four loaded up front). */
+    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
+    src += (4 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                         mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+    /* Pack the two shifted rows into one vector before averaging. */
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz0 = __msa_aver_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);        /* rows 4-5 now in inp0/inp1 */
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
+    res0 = __msa_aver_u_b(avg0, res0);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);        /* rows 6-7 now in inp2/inp3 */
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;          /* output rows 0-1 written */
+
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    inp0 = LD_UB(src);              /* row 8, handled by the 1ROW macro */
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += 2 * dst_stride;          /* output rows 2-3 written */
+
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
+    res0 = __msa_aver_u_b(avg0, res0);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += 2 * dst_stride;          /* output rows 4-5 written */
+
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+/*
+ * 16x16 two-pass wrapper: hv_mc_qpel_aver_horiz_src0_16x16_msa is run first
+ * with a stack scratch buffer as its target (stride 16), then
+ * vert_mc_qpel_avg_dst_aver_src0_16x16_msa consumes that buffer and
+ * updates dst.
+ */
+static void hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride)
+{
+    /* 272 == 17 * 16; presumably 17 intermediate rows of 16 bytes --
+     * TODO confirm against the horizontal helper. */
+    uint8_t horiz_rows[272];
+
+    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, horiz_rows, 16, 16);
+    vert_mc_qpel_avg_dst_aver_src0_16x16_msa(horiz_rows, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{   /*
+     * 8x8 horizontal-then-vertical qpel pass kept in vector registers; the
+     * result is blended into the existing contents of dst (read, average,
+     * write back), two rows at a time.
+     *
+     * Horizontal stage: nine source rows are loaded (2 + 2 + 2 + 2 + 1).
+     * Each row pair goes through APPLY_HORIZ_QPEL_FILTER_8BYTE, and the
+     * result is averaged (__msa_aver_u_b) with the unshifted source rows,
+     * packed two per vector by __msa_ilvr_d. The odd-numbered horizN is the
+     * upper doubleword of the even one (__msa_splati_d(..., 1)).
+     *
+     * Vertical stage: each APPLY_VERT_QPEL_FILTER_8BYTE call yields two
+     * output rows. They are averaged first with the horizontal rows of the
+     * same index, (0,1), (2,3), (4,5), (6,7), and then with the two dst
+     * rows loaded just before. The first and last calls repeat horiz0 /
+     * horiz8 (presumably standing in for rows outside 0..8).
+     */
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 dst0, dst1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    /* Horizontal stage, rows 0-5; loads are issued ahead of their use. */
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz0 = __msa_aver_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    res0 = __msa_aver_u_b(avg0, res0);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+    /* Horizontal rows 6-7, then output rows 2-3. */
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+    /* Horizontal row 8 (single-row macro), then output rows 4-5. */
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    res0 = __msa_aver_u_b(avg0, res0);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+    /* Output rows 6-7. */
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+/*
+ * 16x16 two-pass wrapper: hv_mc_qpel_aver_horiz_16x16_msa is run first with
+ * a stack scratch buffer as its target (stride 16), then
+ * vert_mc_qpel_avg_dst_aver_src0_16x16_msa consumes that buffer and
+ * updates dst.
+ */
+static void hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{
+    /* 272 == 17 * 16; presumably 17 intermediate rows of 16 bytes --
+     * TODO confirm against the horizontal helper. */
+    uint8_t horiz_rows[272];
+
+    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, horiz_rows, 16, 16);
+    vert_mc_qpel_avg_dst_aver_src0_16x16_msa(horiz_rows, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride)
+{   /*
+     * 8x8 horizontal-then-vertical qpel pass kept in vector registers; the
+     * result is blended into the existing contents of dst (read, average,
+     * write back), two rows at a time.
+     *
+     * Horizontal stage: nine source rows are loaded (2 + 2 + 2 + 2 + 1) and
+     * run through APPLY_HORIZ_QPEL_FILTER_8BYTE / ..._1ROW with no further
+     * averaging. The even-numbered horizN is the macro result; the
+     * odd-numbered one is its upper doubleword (__msa_splati_d(..., 1)).
+     *
+     * Vertical stage: each APPLY_VERT_QPEL_FILTER_8BYTE call yields two
+     * output rows. They are averaged first with the horizontal rows of the
+     * same index, (0,1), (2,3), (4,5), (6,7), and then with the two dst
+     * rows loaded just before. The first and last calls repeat horiz0 /
+     * horiz8 (presumably standing in for rows outside 0..8).
+     */
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 dst0, dst1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    /* Horizontal stage, rows 0-5; loads are issued ahead of their use. */
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    res0 = __msa_aver_u_b(avg0, res0);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+    /* Horizontal rows 6-7, then output rows 2-3. */
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+    /* Horizontal row 8 (single-row macro), then output rows 4-5. */
+    inp0 = LD_UB(src);
+    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
+                                                mask0, mask1, mask2, mask3,
+                                                const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    res0 = __msa_aver_u_b(avg0, res0);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+    /* Output rows 6-7. */
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+/*
+ * 16x16 two-pass wrapper: hv_mc_qpel_aver_horiz_src1_16x16_msa is run first
+ * with a stack scratch buffer as its target (stride 16), then
+ * vert_mc_qpel_avg_dst_aver_src0_16x16_msa consumes that buffer and
+ * updates dst.
+ */
+static void hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride)
+{
+    /* 272 == 17 * 16; presumably 17 intermediate rows of 16 bytes --
+     * TODO confirm against the horizontal helper. */
+    uint8_t horiz_rows[272];
+
+    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, horiz_rows, 16, 16);
+    vert_mc_qpel_avg_dst_aver_src0_16x16_msa(horiz_rows, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{   /*
+     * 8x8 horizontal-then-vertical qpel pass kept in vector registers; the
+     * result is blended into the existing contents of dst (read, average,
+     * write back), two rows at a time.
+     *
+     * Horizontal stage: nine source rows are loaded (2 + 2 + 2 + 2 + 1).
+     * Each row pair goes through APPLY_HORIZ_QPEL_FILTER_8BYTE, and the
+     * result is averaged (__msa_aver_u_b) with the same source rows shifted
+     * by one byte (SLDI_B2_UB(..., 1), presumably a pairwise __msa_sldi_b)
+     * and packed two per vector by __msa_ilvr_d. The odd-numbered horizN is
+     * the upper doubleword of the even one (__msa_splati_d(..., 1)).
+     *
+     * Vertical stage: each APPLY_VERT_QPEL_FILTER_8BYTE call yields two
+     * output rows. They are averaged first with the horizontal rows of the
+     * same index, (0,1), (2,3), (4,5), (6,7), and then with the two dst
+     * rows loaded just before. The first and last calls repeat horiz0 /
+     * horiz8 (presumably standing in for rows outside 0..8).
+     */
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 dst0, dst1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    /* Horizontal stage, rows 0-5; loads are issued ahead of their use. */
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+    /* Pack the two shifted rows into one vector before averaging. */
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz0 = __msa_aver_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    res0 = __msa_aver_u_b(avg0, res0);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+    /* Horizontal rows 6-7, then output rows 2-3. */
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+    /* Horizontal row 8 (single-row macro), then output rows 4-5. */
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    res0 = __msa_aver_u_b(avg0, res0);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+    /* Output rows 6-7. */
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+/*
+ * 16x16 two-pass wrapper: hv_mc_qpel_aver_horiz_src0_16x16_msa is run first
+ * with a stack scratch buffer as its target (stride 16), then
+ * vert_mc_qpel_avg_dst_16x16_msa consumes that buffer and updates dst.
+ */
+static void hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{
+    /* 272 == 17 * 16; presumably 17 intermediate rows of 16 bytes --
+     * TODO confirm against the horizontal helper. */
+    uint8_t horiz_rows[272];
+
+    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, horiz_rows, 16, 16);
+    vert_mc_qpel_avg_dst_16x16_msa(horiz_rows, 16, dst, dst_stride);
+}
+
+/*
+ * 8x8 horizontal-then-vertical qpel pass kept in vector registers; the
+ * result is blended into the existing contents of dst (read, average, write
+ * back), two rows at a time.
+ *
+ * Horizontal stage: nine source rows are loaded (2 + 2 + 2 + 2 + 1), each as
+ * a full v16u8. Each row pair goes through APPLY_HORIZ_QPEL_FILTER_8BYTE and
+ * the result is averaged (__msa_aver_u_b) with the unshifted source rows,
+ * packed two per vector by __msa_ilvr_d. The odd-numbered horizN is the
+ * upper doubleword of the even one (__msa_splati_d(..., 1)).
+ *
+ * Vertical stage: each APPLY_VERT_QPEL_FILTER_8BYTE call yields two output
+ * rows, which are averaged only with the two dst rows loaded just before
+ * (no extra averaging against the horizontal rows). The first and last calls
+ * repeat horiz0 / horiz8 (presumably standing in for rows outside 0..8).
+ *
+ * src, src_stride: source pointer and row pitch.
+ * dst, dst_stride: destination pointer and row pitch (read and written).
+ */
+static void hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride)
+{
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
+    v16u8 dst0, dst1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    /* Byte-splat constants handed to both filter macros. */
+    v16u8 const20 = (v16u8) __msa_ldi_b(20);
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+
+    /* Horizontal stage, rows 0-5; loads are issued ahead of their use. */
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz0 = __msa_aver_u_b(inp0, res0);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+
+    /* Output rows 0-1. */
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    /* Horizontal rows 6-7, then output rows 2-3. */
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    /* Horizontal row 8 (single-row macro), then output rows 4-5. */
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+
+    /* Output rows 6-7; dst is not advanced after this final store. */
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_16x16_msa(const uint8_t *src, int32_t src_stride,
+                                         uint8_t *dst, int32_t dst_stride)
+{   /* 16x16 two-pass path: a horizontal helper fills buff, then a vertical avg_dst helper reads buff and updates dst */
+    uint8_t buff[272]; /* 272 = 17 * 16 bytes; presumably 17 rows at stride 16 -- confirm against the horizontal helper */
+    /* Pass 1 writes to buff with stride 16; pass 2 reads buff with the same stride */
+    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_avg_dst_16x16_msa(buff, 16, dst, dst_stride);
+
+}
+
+static void hv_mc_qpel_avg_dst_8x8_msa(const uint8_t *src, int32_t src_stride,
+                                       uint8_t *dst, int32_t dst_stride)
+{   /* 8x8 H+V qpel: filter 9 src rows horizontally, filter those vertically, then average the result into dst */
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; /* horizN: horizontally filtered row N (presumably in the low doubleword) */
+    v16u8 dst0, dst1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; /* mask0-3: byte-index tables handed to the horizontal filter macros */
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20); /* 20, 6 and 3 replicated to every byte lane; passed to the filter macros */
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    /* Horizontal stage: src rows are loaded two at a time and filtered as a pair */
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); /* replicate the upper doubleword of the pair result (presumably the second row) */
+    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    inp0 = LD_UB(src); /* ninth source row, filtered on its own */
+    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
+                                                mask0, mask1, mask2, mask3,
+                                                const20, const6, const3);
+    LD_UB2(dst, dst_stride, dst0, dst1); /* dst rows 0-1: existing destination pixels to average with */
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, /* horiz0/horiz1 repeat in the argument list at the top edge */
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); /* low doublewords of dst0 (low half) and dst1 (high half) */
+    res0 = __msa_aver_u_b(avg0, res0); /* unsigned byte average of filter output and existing dst */
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+    /* dst rows 2-3 */
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+    /* dst rows 4-5 */
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+    /* dst rows 6-7; horiz8/horiz7 repeat in the argument list at the bottom edge */
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{   /* 16x16 two-pass path: the horizontal src1 helper fills buff, then the vertical avg_dst helper reads buff and updates dst */
+    uint8_t buff[272]; /* 272 = 17 * 16 bytes; presumably 17 rows at stride 16 -- confirm against the horizontal helper */
+    /* Pass 1 writes to buff with stride 16; pass 2 reads buff with the same stride */
+    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_avg_dst_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride)
+{   /* 8x8 H+V qpel: horizontal output is averaged with the source shifted by one byte, then filtered vertically and averaged into dst */
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; /* horizN: horizontal-stage row N (presumably in the low doubleword) */
+    v16u8 dst0, dst1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; /* mask0-3: byte-index tables handed to the horizontal filter macros */
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20); /* 20, 6 and 3 replicated to every byte lane; passed to the filter macros */
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    /* Horizontal stage, src rows 0-1 */
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); /* presumably slides both rows by one byte, like the __msa_sldi_b call used for the ninth row */
+
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); /* low doublewords of both shifted rows in one vector */
+    horiz0 = __msa_aver_u_b(inp0, res0); /* unsigned byte average of shifted source and filter output */
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); /* replicate the upper doubleword (presumably the second row) */
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1); /* dst rows 0-1: existing destination pixels to average with */
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, /* horiz0/horiz1 repeat in the argument list at the top edge */
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); /* low doublewords of dst0 (low half) and dst1 (high half) */
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+    /* Horizontal stage for src rows 6-7, then dst rows 2-3 */
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+    /* Ninth source row (single-row macro), then dst rows 4-5 */
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); /* slide the row by one byte before averaging */
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+    /* dst rows 6-7; horiz8/horiz7 repeat in the argument list at the bottom edge */
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride)
+{   /* 16x16 two-pass path: the horizontal src0 helper fills buff, then the vertical avg_dst_aver_src1 helper reads buff and updates dst */
+    uint8_t buff[272]; /* 272 = 17 * 16 bytes; presumably 17 rows at stride 16 -- confirm against the horizontal helper */
+    /* Pass 1 writes to buff with stride 16; pass 2 reads buff with the same stride */
+    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_avg_dst_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{   /* 8x8 H+V qpel: horizontal output averaged with the unshifted source; vertical output averaged with the next horizontal row, then with dst */
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; /* horizN: horizontal-stage row N (presumably in the low doubleword) */
+    v16u8 dst0, dst1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; /* mask0-3: byte-index tables handed to the horizontal filter macros */
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20); /* 20, 6 and 3 replicated to every byte lane; passed to the filter macros */
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    /* Horizontal stage, src rows 0-1 */
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); /* low doublewords of both source rows in one vector */
+    horiz0 = __msa_aver_u_b(inp0, res0); /* unsigned byte average of source and filter output */
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); /* replicate the upper doubleword (presumably the second row) */
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1); /* dst rows 0-1, loaded ahead of their use below */
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, /* horiz0/horiz1 repeat in the argument list at the top edge */
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1); /* horizontal rows 1 and 2 paired for output rows 0 and 1 */
+    res0 = __msa_aver_u_b(avg0, res0); /* first average: vertical output with the next horizontal row */
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0); /* second average: with the existing dst pixels */
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+    /* dst rows 2-3, after the horizontal stage for src rows 6-7 */
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+    /* Ninth source row; both remaining vertical filters are computed before dst rows 4-5 are written */
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
+                                        horiz5, horiz6, horiz7, horiz8,
+                                        horiz5, horiz4, horiz3, horiz2,
+                                        horiz6, horiz7, horiz8, horiz8,
+                                        const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, /* horiz8/horiz7 repeat in the argument list at the bottom edge */
+                                        horiz7, horiz8, horiz8, horiz7,
+                                        horiz7, horiz6, horiz5, horiz4,
+                                        horiz8, horiz8, horiz7, horiz6,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
+    res0 = __msa_aver_u_b(avg0, res0);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+    /* dst rows 6-7 */
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
+    res1 = __msa_aver_u_b(avg1, res1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{   /* 16x16 two-pass path: the plain horizontal helper fills buff, then the vertical avg_dst_aver_src1 helper reads buff and updates dst */
+    uint8_t buff[272]; /* 272 = 17 * 16 bytes; presumably 17 rows at stride 16 -- confirm against the horizontal helper */
+    /* Pass 1 writes to buff with stride 16; pass 2 reads buff with the same stride */
+    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_avg_dst_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride)
+{   /* 8x8 H+V qpel: plain horizontal filter; vertical output averaged with the next horizontal row, then with dst */
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; /* horizN: horizontally filtered row N (presumably in the low doubleword) */
+    v16u8 dst0, dst1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; /* mask0-3: byte-index tables handed to the horizontal filter macros */
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20); /* 20, 6 and 3 replicated to every byte lane; passed to the filter macros */
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    /* Horizontal stage: src rows are loaded two at a time and filtered as a pair */
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); /* replicate the upper doubleword of the pair result (presumably the second row) */
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1); /* dst rows 0-1, loaded ahead of their use below */
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, /* horiz0/horiz1 repeat in the argument list at the top edge */
+                                        horiz1, horiz2, horiz3, horiz4,
+                                        horiz1, horiz0, horiz0, horiz1,
+                                        horiz2, horiz3, horiz4, horiz5,
+                                        const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1); /* horizontal rows 1 and 2 paired for output rows 0 and 1 */
+    res0 = __msa_aver_u_b(avg0, res0); /* first average: vertical output with the next horizontal row */
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0); /* second average: with the existing dst pixels */
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+    /* dst rows 2-3, after the horizontal stage for src rows 6-7 */
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
+                                           mask0, mask1, mask2, mask3,
+                                           const20, const6, const3);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
+                                        horiz3, horiz4, horiz5, horiz6,
+                                        horiz3, horiz2, horiz1, horiz0,
+                                        horiz4, horiz5, horiz6, horiz7,
+                                        const20, const6, const3);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+    /* Ninth source row; both remaining vertical filters are computed before dst rows 4-5 are written */
+    inp0 = LD_UB(src);
+    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
+                                                mask0, mask1, mask2, mask3,
+                                                const20, const6, const3);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, horiz5,
+                                        horiz6, horiz7, horiz8, horiz5, horiz4,
+                                        horiz3, horiz2, horiz6, horiz7, horiz8,
+                                        horiz8, const20, const6, const3);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, horiz7, /* horiz8/horiz7 repeat in the argument list at the bottom edge */
+                                        horiz8, horiz8, horiz7, horiz7, horiz6,
+                                        horiz5, horiz4, horiz8, horiz8, horiz7,
+                                        horiz6, const20, const6, const3);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
+    res0 = __msa_aver_u_b(avg0, res0);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+    /* dst rows 6-7 */
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
+    res1 = __msa_aver_u_b(avg1, res1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride)
+{   /* 16x16 two-pass path: the horizontal src1 helper fills buff, then the vertical avg_dst_aver_src1 helper reads buff and updates dst */
+    uint8_t buff[272]; /* 272 = 17 * 16 bytes; presumably 17 rows at stride 16 -- confirm against the horizontal helper */
+    /* Pass 1 writes to buff with stride 16; pass 2 reads buff with the same stride */
+    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
+    vert_mc_qpel_avg_dst_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
+}
+
+static void hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     uint8_t *dst,
+                                                     int32_t dst_stride)
+{   /* 8x8 H+V qpel: horizontal output averaged with the source shifted by one byte; vertical output averaged with the next horizontal row, then with dst */
+    v16u8 inp0, inp1, inp2, inp3;
+    v16u8 res0, res1, avg0, avg1;
+    v16u8 horiz0, horiz1, horiz2, horiz3;
+    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8; /* horizN: horizontal-stage row N (presumably in the low doubleword) */
+    v16u8 dst0, dst1;
+    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; /* mask0-3: byte-index tables handed to the horizontal filter macros */
+    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
+    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
+    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
+    v16u8 const20 = (v16u8) __msa_ldi_b(20); /* 20, 6 and 3 replicated to every byte lane; passed to the filter macros */
+    v16u8 const6 = (v16u8) __msa_ldi_b(6);
+    v16u8 const3 = (v16u8) __msa_ldi_b(3);
+    /* Horizontal stage, src rows 0-1 */
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); /* presumably slides both rows by one byte, like the __msa_sldi_b call used for the ninth row */
+
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); /* low doublewords of both shifted rows in one vector */
+    horiz0 = __msa_aver_u_b(inp0, res0); /* unsigned byte average of shifted source and filter output */
+    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); /* replicate the upper doubleword (presumably the second row) */
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    LD_UB2(src, src_stride, inp0, inp1);
+    src += (2 * src_stride);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz2 = __msa_aver_u_b(inp2, res1);
+    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+
+    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
+    horiz4 = __msa_aver_u_b(inp0, res0);
+    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1); /* dst rows 0-1: existing destination pixels to average with */
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1); /* horizontal rows 1 and 2 paired for output rows 0 and 1 */
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, horiz1, /* horiz0/horiz1 repeat in the argument list at the top edge */
+                                        horiz2, horiz3, horiz4, horiz1, horiz0,
+                                        horiz0, horiz1, horiz2, horiz3, horiz4,
+                                        horiz5, const20, const6, const3);
+    res0 = __msa_aver_u_b(avg0, res0); /* first average: vertical output with the next horizontal row */
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0); /* second average: with the existing dst pixels */
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+    /* Horizontal stage for src rows 6-7, then dst rows 2-3 */
+    LD_UB2(src, src_stride, inp2, inp3);
+    src += (2 * src_stride);
+    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
+                                         const20, const6, const3);
+    SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+
+    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
+    horiz6 = __msa_aver_u_b(inp2, res1);
+    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, horiz3,
+                                        horiz4, horiz5, horiz6, horiz3, horiz2,
+                                        horiz1, horiz0, horiz4, horiz5, horiz6,
+                                        horiz7, const20, const6, const3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+    dst += (2 * dst_stride);
+    /* Ninth source row (single-row macro), then dst rows 4-5 */
+    inp0 = LD_UB(src);
+    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
+                                              const20, const6, const3);
+    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1); /* slide the row by one byte before averaging */
+    horiz8 = __msa_aver_u_b(inp0, res0);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
+    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, horiz5,
+                                        horiz6, horiz7, horiz8, horiz5, horiz4,
+                                        horiz3, horiz2, horiz6, horiz7, horiz8,
+                                        horiz8, const20, const6, const3);
+    res0 = __msa_aver_u_b(avg0, res0);
+    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res0 = __msa_aver_u_b(avg0, res0);
+    ST8x2_UB(res0, dst, dst_stride);
+    dst += (2 * dst_stride);
+    /* dst rows 6-7; horiz8/horiz7 repeat in the argument list at the bottom edge */
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
+    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, horiz7,
+                                        horiz8, horiz8, horiz7, horiz7, horiz6,
+                                        horiz5, horiz4, horiz8, horiz8, horiz7,
+                                        horiz6, const20, const6, const3);
+    res1 = __msa_aver_u_b(avg1, res1);
+    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
+    res1 = __msa_aver_u_b(avg1, res1);
+    ST8x2_UB(res1, dst, dst_stride);
+}
+
+static void copy_8x8_msa(const uint8_t *src, int32_t src_stride,
+                         uint8_t *dst, int32_t dst_stride)
+{   /* Copy eight rows from src to dst, one uint64_t per row, two rows per loop iteration */
+    uint64_t src0, src1;
+    int32_t loop_cnt;
+
+    for (loop_cnt = 4; loop_cnt--;) { /* 4 iterations x 2 rows = 8 rows */
+        src0 = LD(src); /* LD/SD are defined elsewhere; presumably a 64-bit load and store */
+        src += src_stride;
+        src1 = LD(src);
+        src += src_stride;
+        /* both rows are read before either is written */
+        SD(src0, dst);
+        dst += dst_stride;
+        SD(src1, dst);
+        dst += dst_stride;
+    }
+}
+
+static void copy_16x16_msa(const uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride)
+{   /* Copy sixteen rows from src to dst, one v16u8 vector per row; all rows are loaded before any is stored */
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+    /* load rows 0-7, then rows 8-15 */
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    LD_UB8(src, src_stride,
+           src8, src9, src10, src11, src12, src13, src14, src15);
+    /* store rows 0-7, then rows 8-15 */
+    ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
+    dst += (8 * dst_stride);
+    ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15,
+           dst, dst_stride);
+}
+
+static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride,
+                           int32_t height)
+{   /* Average src into dst for 8-byte-wide rows, four rows per iteration; dst is both an input and the output */
+    int32_t cnt;
+    uint64_t out0, out1, out2, out3;
+    v16u8 src0, src1, src2, src3;
+    v16u8 dst0, dst1, dst2, dst3;
+    /* height / 4 iterations: any remainder rows (height % 4) are not processed */
+    for (cnt = (height / 4); cnt--;) {
+        LD_UB4(src, src_stride, src0, src1, src2, src3); /* loads go into full v16u8 vectors although only 8 bytes per row are written back */
+        src += (4 * src_stride);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        /* pairwise src/dst averaging; results overwrite dst0..dst3 */
+        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                    dst0, dst1, dst2, dst3);
+        /* extract doubleword element 0 of each averaged row and store it */
+        out0 = __msa_copy_u_d((v2i64) dst0, 0);
+        out1 = __msa_copy_u_d((v2i64) dst1, 0);
+        out2 = __msa_copy_u_d((v2i64) dst2, 0);
+        out3 = __msa_copy_u_d((v2i64) dst3, 0);
+        SD4(out0, out1, out2, out3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            int32_t height)
+{   /* Average src into dst, one v16u8 vector per row, eight rows per iteration; dst is both an input and the output */
+    int32_t cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    /* height / 8 iterations: any remainder rows (height % 8) are not processed */
+    for (cnt = (height / 8); cnt--;) {
+        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (8 * src_stride);
+        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+        /* pairwise src/dst averaging in two groups of four; results overwrite dst0..dst7 */
+        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                    dst0, dst1, dst2, dst3);
+        AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+                    dst4, dst5, dst6, dst7);
+        ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+void ff_copy_16x16_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
+{   /* Non-static (dest, src, stride) wrapper: one stride serves both buffers and is converted to the helper's int32_t parameters */
+    copy_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_copy_8x8_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
+{   /* Non-static (dest, src, stride) wrapper: one stride serves both buffers and is converted to the helper's int32_t parameters */
+    copy_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_horiz_mc_qpel_aver_src0_8width_msa(uint8_t *dest,
+                                           const uint8_t *src,
+                                           ptrdiff_t stride)
+{   /* Non-static wrapper: forwards to the helper with the same stride for src and dest; the trailing 8 is presumably the row count */
+    horiz_mc_qpel_aver_src0_8width_msa(src, stride, dest, stride, 8);
+}
+
+void ff_horiz_mc_qpel_aver_src0_16width_msa(uint8_t *dest,
+                                            const uint8_t *src,
+                                            ptrdiff_t stride)
+{   /* Non-static wrapper: forwards to the helper with the same stride for src and dest; the trailing 16 is presumably the row count */
+    horiz_mc_qpel_aver_src0_16width_msa(src, stride, dest, stride, 16);
+}
+
+void ff_horiz_mc_qpel_8width_msa(uint8_t *dest, const uint8_t *src,
+                                 ptrdiff_t stride)
+{   /* Non-static wrapper: forwards to the helper with the same stride for src and dest; the trailing 8 is presumably the row count */
+    horiz_mc_qpel_8width_msa(src, stride, dest, stride, 8);
+}
+
+void ff_horiz_mc_qpel_16width_msa(uint8_t *dest,
+                                  const uint8_t *src, ptrdiff_t stride)
+{ /* src-first helper; one stride for both buffers, last argument fixed at 16 */
+    horiz_mc_qpel_16width_msa(src, stride, dest, stride, 16);
+}
+
+void ff_horiz_mc_qpel_aver_src1_8width_msa(uint8_t *dest,
+                                           const uint8_t *src,
+                                           ptrdiff_t stride)
+{ /* src-first helper; one stride for both buffers, last argument fixed at 8 */
+    horiz_mc_qpel_aver_src1_8width_msa(src, stride, dest, stride, 8);
+}
+
+void ff_horiz_mc_qpel_aver_src1_16width_msa(uint8_t *dest,
+                                            const uint8_t *src,
+                                            ptrdiff_t stride)
+{ /* src-first helper; one stride for both buffers, last argument fixed at 16 */
+    horiz_mc_qpel_aver_src1_16width_msa(src, stride, dest, stride, 16);
+}
+
+void ff_horiz_mc_qpel_no_rnd_aver_src0_8width_msa(uint8_t *dest,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride)
+{ /* src-first helper; one stride for both buffers, last argument fixed at 8 */
+    horiz_mc_qpel_no_rnd_aver_src0_8width_msa(src, stride, dest, stride, 8);
+}
+
+void ff_horiz_mc_qpel_no_rnd_aver_src0_16width_msa(uint8_t *dest,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride)
+{ /* src-first helper; one stride for both buffers, last argument fixed at 16 */
+    horiz_mc_qpel_no_rnd_aver_src0_16width_msa(src, stride, dest, stride, 16);
+}
+
+void ff_horiz_mc_qpel_no_rnd_8width_msa(uint8_t *dest,
+                                        const uint8_t *src, ptrdiff_t stride)
+{ /* src-first helper; one stride for both buffers, last argument fixed at 8 */
+    horiz_mc_qpel_no_rnd_8width_msa(src, stride, dest, stride, 8);
+}
+
+void ff_horiz_mc_qpel_no_rnd_16width_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{ /* src-first helper; one stride for both buffers, last argument fixed at 16 */
+    horiz_mc_qpel_no_rnd_16width_msa(src, stride, dest, stride, 16);
+}
+
+void ff_horiz_mc_qpel_no_rnd_aver_src1_8width_msa(uint8_t *dest,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride)
+{ /* src-first helper; one stride for both buffers, last argument fixed at 8 */
+    horiz_mc_qpel_no_rnd_aver_src1_8width_msa(src, stride, dest, stride, 8);
+}
+
+void ff_horiz_mc_qpel_no_rnd_aver_src1_16width_msa(uint8_t *dest,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride)
+{ /* src-first helper; one stride for both buffers, last argument fixed at 16 */
+    horiz_mc_qpel_no_rnd_aver_src1_16width_msa(src, stride, dest, stride, 16);
+}
+
+void ff_avg_width8_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
+{ /* src-first helper; one stride for both buffers, last argument fixed at 8 */
+    avg_width8_msa(src, stride, dest, stride, 8);
+}
+
+void ff_avg_width16_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
+{ /* src-first helper; one stride for both buffers, height fixed at 16 */
+    avg_width16_msa(src, stride, dest, stride, 16);
+}
+
+void ff_horiz_mc_qpel_avg_dst_aver_src0_8width_msa(uint8_t *dest,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride)
+{ /* src-first helper; one stride for both buffers, last argument fixed at 8 */
+    horiz_mc_qpel_avg_dst_aver_src0_8width_msa(src, stride, dest, stride, 8);
+}
+
+void ff_horiz_mc_qpel_avg_dst_aver_src0_16width_msa(uint8_t *dest,
+                                                    const uint8_t *src,
+                                                    ptrdiff_t stride)
+{ /* src-first helper; one stride for both buffers, last argument fixed at 16 */
+    horiz_mc_qpel_avg_dst_aver_src0_16width_msa(src, stride, dest, stride, 16);
+}
+
+void ff_horiz_mc_qpel_avg_dst_8width_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{ /* src-first helper; one stride for both buffers, last argument fixed at 8 */
+    horiz_mc_qpel_avg_dst_8width_msa(src, stride, dest, stride, 8);
+}
+
+void ff_horiz_mc_qpel_avg_dst_16width_msa(uint8_t *dest,
+                                          const uint8_t *src, ptrdiff_t stride)
+{ /* src-first helper; one stride for both buffers, last argument fixed at 16 */
+    horiz_mc_qpel_avg_dst_16width_msa(src, stride, dest, stride, 16);
+}
+
+void ff_horiz_mc_qpel_avg_dst_aver_src1_8width_msa(uint8_t *dest,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride)
+{ /* src-first helper; one stride for both buffers, last argument fixed at 8 */
+    horiz_mc_qpel_avg_dst_aver_src1_8width_msa(src, stride, dest, stride, 8);
+}
+
+void ff_horiz_mc_qpel_avg_dst_aver_src1_16width_msa(uint8_t *dest,
+                                                    const uint8_t *src,
+                                                    ptrdiff_t stride)
+{ /* src-first helper; one stride for both buffers, last argument fixed at 16 */
+    horiz_mc_qpel_avg_dst_aver_src1_16width_msa(src, stride, dest, stride, 16);
+}
+
+
+void ff_vert_mc_qpel_aver_src0_8x8_msa(uint8_t *dest,
+                                       const uint8_t *src, ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    vert_mc_qpel_aver_src0_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_aver_src0_16x16_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    vert_mc_qpel_aver_src0_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_8x8_msa(uint8_t *dest, const uint8_t *src,
+                             ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    vert_mc_qpel_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_16x16_msa(uint8_t *dest, const uint8_t *src,
+                               ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    vert_mc_qpel_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_aver_src1_8x8_msa(uint8_t *dest,
+                                       const uint8_t *src, ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    vert_mc_qpel_aver_src1_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_aver_src1_16x16_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    vert_mc_qpel_aver_src1_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_no_rnd_aver_src0_8x8_msa(uint8_t *dest,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    vert_mc_qpel_no_rnd_aver_src0_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_no_rnd_aver_src0_16x16_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    vert_mc_qpel_no_rnd_aver_src0_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_no_rnd_8x8_msa(uint8_t *dest,
+                                    const uint8_t *src, ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    vert_mc_qpel_no_rnd_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_no_rnd_16x16_msa(uint8_t *dest,
+                                      const uint8_t *src, ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    vert_mc_qpel_no_rnd_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_no_rnd_aver_src1_8x8_msa(uint8_t *dest,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    vert_mc_qpel_no_rnd_aver_src1_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_no_rnd_aver_src1_16x16_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    vert_mc_qpel_no_rnd_aver_src1_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_avg_dst_aver_src0_8x8_msa(uint8_t *dest,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    vert_mc_qpel_avg_dst_aver_src0_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_avg_dst_aver_src0_16x16_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    vert_mc_qpel_avg_dst_aver_src0_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_avg_dst_8x8_msa(uint8_t *dest,
+                                     const uint8_t *src, ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    vert_mc_qpel_avg_dst_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_avg_dst_16x16_msa(uint8_t *dest,
+                                       const uint8_t *src, ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    vert_mc_qpel_avg_dst_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_avg_dst_aver_src1_8x8_msa(uint8_t *dest,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    vert_mc_qpel_avg_dst_aver_src1_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_vert_mc_qpel_avg_dst_aver_src1_16x16_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    vert_mc_qpel_avg_dst_aver_src1_16x16_msa(src, stride, dest, stride);
+}
+
+/* HV cases */
+void ff_hv_mc_qpel_aver_hv_src00_16x16_msa(uint8_t *dest,
+                                           const uint8_t *src,
+                                           ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_aver_hv_src00_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_hv_src00_8x8_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_aver_hv_src00_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_v_src0_16x16_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_aver_v_src0_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_v_src0_8x8_msa(uint8_t *dest,
+                                       const uint8_t *src, ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_aver_v_src0_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_hv_src10_16x16_msa(uint8_t *dest,
+                                           const uint8_t *src,
+                                           ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_aver_hv_src10_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_hv_src10_8x8_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_aver_hv_src10_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_h_src0_16x16_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_aver_h_src0_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_h_src0_8x8_msa(uint8_t *dest,
+                                       const uint8_t *src, ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_aver_h_src0_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_16x16_msa(uint8_t *dest, const uint8_t *src,
+                             ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_8x8_msa(uint8_t *dest, const uint8_t *src,
+                           ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_h_src1_16x16_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_aver_h_src1_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_h_src1_8x8_msa(uint8_t *dest,
+                                       const uint8_t *src, ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_aver_h_src1_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_hv_src01_16x16_msa(uint8_t *dest,
+                                           const uint8_t *src,
+                                           ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_aver_hv_src01_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_hv_src01_8x8_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_aver_hv_src01_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_v_src1_16x16_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_aver_v_src1_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_v_src1_8x8_msa(uint8_t *dest,
+                                       const uint8_t *src, ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_aver_v_src1_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_hv_src11_16x16_msa(uint8_t *dest,
+                                           const uint8_t *src,
+                                           ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_aver_hv_src11_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_aver_hv_src11_8x8_msa(uint8_t *dest,
+                                         const uint8_t *src, ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_aver_hv_src11_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(uint8_t *dest,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(uint8_t *dest,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(uint8_t *dest,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(uint8_t *dest,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_16x16_msa(uint8_t *dest,
+                                     const uint8_t *src, ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_avg_dst_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_8x8_msa(uint8_t *dest,
+                                   const uint8_t *src, ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_avg_dst_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(uint8_t *dest,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(uint8_t *dest,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(uint8_t *dest,
+                                               const uint8_t *src,
+                                               ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(uint8_t *dest,
+                                                   const uint8_t *src,
+                                                   ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(uint8_t *dest,
+                                                 const uint8_t *src,
+                                                 ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(uint8_t *dest,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(uint8_t *dest,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(uint8_t *dest,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(uint8_t *dest,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_16x16_msa(uint8_t *dest,
+                                    const uint8_t *src, ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_no_rnd_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_8x8_msa(uint8_t *dest,
+                                  const uint8_t *src, ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_no_rnd_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(uint8_t *dest,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(uint8_t *dest,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(uint8_t *dest,
+                                              const uint8_t *src,
+                                              ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(uint8_t *dest,
+                                                  const uint8_t *src,
+                                                  ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(src, stride, dest, stride);
+}
+
+void ff_hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(uint8_t *dest,
+                                                const uint8_t *src,
+                                                ptrdiff_t stride)
+{ /* dest-first API, src-first helper; one stride serves both buffers */
+    hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(src, stride, dest, stride);
+}
diff --git a/libavcodec/mips/sbrdsp_mips.c b/libavcodec/mips/sbrdsp_mips.c
new file mode 100644
index 0000000..1b0a106
--- /dev/null
+++ b/libavcodec/mips/sbrdsp_mips.c
@@ -0,0 +1,911 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors:  Darko Laus      (darko@mips.com)
+ *           Djordje Pesut   (djordje@mips.com)
+ *           Mirjana Vulin   (mvulin@mips.com)
+ *
+ * AAC Spectral Band Replication decoding functions optimized for MIPS
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/sbrdsp.c
+ */
+
+#include "config.h"
+#include "libavcodec/sbrdsp.h"
+#include "libavutil/mips/asmdefs.h"
+
+#if HAVE_INLINE_ASM
+static void sbr_qmf_pre_shuffle_mips(float *z)
+{
+    int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6;
+    float *z1 = &z[66]; /* store cursor, +10 floats per pass */
+    float *z2 = &z[59]; /* load cursor for the sign-toggled words, -5 floats per pass */
+    float *z3 = &z[2]; /* load cursor for the plain copies, +5 floats per pass */
+    float *z4 = z1 + 60; /* loop sentinel: &z[126] */
+    /* From z[66] on: even slots = words read down from z[63] with bit 31 toggled, */
+    /* odd slots = copies read up from z[2]; loop unrolled 5 times (6 passes). */
+    __asm__ volatile (
+        "lui    %[Temp6],   0x8000                  \n\t" /* mask = 0x8000 << 16 */
+    "1:                                             \n\t"
+        "lw     %[Temp1],   0(%[z2])                \n\t" /* z2[0..4] */
+        "lw     %[Temp2],   4(%[z2])                \n\t"
+        "lw     %[Temp3],   8(%[z2])                \n\t"
+        "lw     %[Temp4],   12(%[z2])               \n\t"
+        "lw     %[Temp5],   16(%[z2])               \n\t"
+        "xor    %[Temp1],   %[Temp1],   %[Temp6]    \n\t" /* toggle bit 31 */
+        "xor    %[Temp2],   %[Temp2],   %[Temp6]    \n\t"
+        "xor    %[Temp3],   %[Temp3],   %[Temp6]    \n\t"
+        "xor    %[Temp4],   %[Temp4],   %[Temp6]    \n\t"
+        "xor    %[Temp5],   %[Temp5],   %[Temp6]    \n\t"
+        PTR_ADDIU "%[z2],   %[z2],      -20         \n\t"
+        "sw     %[Temp1],   32(%[z1])               \n\t" /* even slots, reversed */
+        "sw     %[Temp2],   24(%[z1])               \n\t"
+        "sw     %[Temp3],   16(%[z1])               \n\t"
+        "sw     %[Temp4],   8(%[z1])                \n\t"
+        "sw     %[Temp5],   0(%[z1])                \n\t"
+        "lw     %[Temp1],   0(%[z3])                \n\t" /* z3[0..4] */
+        "lw     %[Temp2],   4(%[z3])                \n\t"
+        "lw     %[Temp3],   8(%[z3])                \n\t"
+        "lw     %[Temp4],   12(%[z3])               \n\t"
+        "lw     %[Temp5],   16(%[z3])               \n\t"
+        "sw     %[Temp1],   4(%[z1])                \n\t" /* odd slots, in order */
+        "sw     %[Temp2],   12(%[z1])               \n\t"
+        "sw     %[Temp3],   20(%[z1])               \n\t"
+        "sw     %[Temp4],   28(%[z1])               \n\t"
+        "sw     %[Temp5],   36(%[z1])               \n\t"
+        PTR_ADDIU "%[z3],   %[z3],      20          \n\t"
+        PTR_ADDIU "%[z1],   %[z1],      40          \n\t"
+        "bne    %[z1],      %[z4],      1b          \n\t" /* until z1 == z4 */
+        "lw     %[Temp1],   132(%[z])               \n\t" /* z[33] */
+        "lw     %[Temp2],   128(%[z])               \n\t" /* z[32] */
+        "lw     %[Temp3],   0(%[z])                 \n\t" /* z[0] */
+        "lw     %[Temp4],   4(%[z])                 \n\t" /* z[1] */
+        "xor    %[Temp1],   %[Temp1],   %[Temp6]    \n\t"
+        "sw     %[Temp1],   504(%[z])               \n\t" /* z[126] = z[33] ^ mask */
+        "sw     %[Temp2],   508(%[z])               \n\t" /* z[127] = z[32] */
+        "sw     %[Temp3],   256(%[z])               \n\t" /* z[64] = z[0] */
+        "sw     %[Temp4],   260(%[z])               \n\t" /* z[65] = z[1] */
+        /* Temp1..Temp6 are early-clobber scratch; z1..z3 are updated in place */
+        : [Temp1]"=&r"(Temp1), [Temp2]"=&r"(Temp2),
+          [Temp3]"=&r"(Temp3), [Temp4]"=&r"(Temp4),
+          [Temp5]"=&r"(Temp5), [Temp6]"=&r"(Temp6),
+          [z1]"+r"(z1), [z2]"+r"(z2), [z3]"+r"(z3)
+        : [z4]"r"(z4), [z]"r"(z)
+        : "memory"
+    );
+}
+
+static void sbr_qmf_post_shuffle_mips(float W[32][2], const float *z)
+{
+    int Temp1, Temp2, Temp3, Temp4, Temp5;
+    float *W_ptr = (float *)W; /* W walked as a flat float array */
+    float *z1    = (float *)z; /* ascending load cursor; z is only read */
+    float *z2    = (float *)&z[60]; /* descending load cursor, starts on z[60..63] */
+    float *z_end = z1 + 32; /* loop ends once z1 reaches &z[32] */
+    /* For k = 0..31: W[k][0] = z[63 - k] with bit 31 toggled, W[k][1] = z[k]. */
+    /* Loop unrolled 4 times: 8 passes, each filling 4 rows of W. */
+    __asm__ volatile (
+        "lui    %[Temp5],   0x8000                  \n\t" /* mask = 0x8000 << 16 */
+    "1:                                             \n\t"
+        "lw     %[Temp1],   0(%[z2])                \n\t" /* z2[0..3] */
+        "lw     %[Temp2],   4(%[z2])                \n\t"
+        "lw     %[Temp3],   8(%[z2])                \n\t"
+        "lw     %[Temp4],   12(%[z2])               \n\t"
+        "xor    %[Temp1],   %[Temp1],   %[Temp5]    \n\t" /* toggle bit 31 */
+        "xor    %[Temp2],   %[Temp2],   %[Temp5]    \n\t"
+        "xor    %[Temp3],   %[Temp3],   %[Temp5]    \n\t"
+        "xor    %[Temp4],   %[Temp4],   %[Temp5]    \n\t"
+        PTR_ADDIU "%[z2],   %[z2],      -16         \n\t"
+        "sw     %[Temp1],   24(%[W_ptr])            \n\t" /* even floats, reversed */
+        "sw     %[Temp2],   16(%[W_ptr])            \n\t"
+        "sw     %[Temp3],   8(%[W_ptr])             \n\t"
+        "sw     %[Temp4],   0(%[W_ptr])             \n\t"
+        "lw     %[Temp1],   0(%[z1])                \n\t" /* z1[0..3] */
+        "lw     %[Temp2],   4(%[z1])                \n\t"
+        "lw     %[Temp3],   8(%[z1])                \n\t"
+        "lw     %[Temp4],   12(%[z1])               \n\t"
+        "sw     %[Temp1],   4(%[W_ptr])             \n\t" /* odd floats, in order */
+        "sw     %[Temp2],   12(%[W_ptr])            \n\t"
+        "sw     %[Temp3],   20(%[W_ptr])            \n\t"
+        "sw     %[Temp4],   28(%[W_ptr])            \n\t"
+        PTR_ADDIU "%[z1],   %[z1],      16          \n\t"
+        PTR_ADDIU "%[W_ptr],%[W_ptr],   32          \n\t"
+        "bne    %[z1],      %[z_end],   1b          \n\t" /* until z1 == z_end */
+        /* Temp1..Temp5 are early-clobber scratch; z1, z2, W_ptr are updated */
+        : [Temp1]"=&r"(Temp1), [Temp2]"=&r"(Temp2),
+          [Temp3]"=&r"(Temp3), [Temp4]"=&r"(Temp4),
+          [Temp5]"=&r"(Temp5), [z1]"+r"(z1),
+          [z2]"+r"(z2), [W_ptr]"+r"(W_ptr)
+        : [z_end]"r"(z_end)
+        : "memory"
+    );
+}
+
+#if HAVE_MIPSFPU
+#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+static void sbr_sum64x5_mips(float *z)
+{   /* In place, k = 0..63: z[k] += z[k+64] + z[k+128] + z[k+192] + z[k+256] */
+    int k;
+    float *z1;
+    float f1, f2, f3, f4, f5, f6, f7, f8; /* running sums for z1[0]..z1[7] */
+    for (k = 0; k < 64; k += 8) {
+        /* one asm statement per group of eight consecutive outputs */
+        z1 = &z[k];
+        /* byte offsets 256, 512, 768, 1024 are 64, 128, 192, 256 floats on */
+        /* loop unrolled 8 times; the loads are interleaved with the adds */
+        __asm__ volatile (
+            "lwc1   $f0,    0(%[z1])        \n\t"
+            "lwc1   $f1,    256(%[z1])      \n\t"
+            "lwc1   $f2,    4(%[z1])        \n\t"
+            "lwc1   $f3,    260(%[z1])      \n\t"
+            "lwc1   $f4,    8(%[z1])        \n\t"
+            "add.s  %[f1],  $f0,    $f1     \n\t"
+            "lwc1   $f5,    264(%[z1])      \n\t"
+            "add.s  %[f2],  $f2,    $f3     \n\t"
+            "lwc1   $f6,    12(%[z1])       \n\t"
+            "lwc1   $f7,    268(%[z1])      \n\t"
+            "add.s  %[f3],  $f4,    $f5     \n\t"
+            "lwc1   $f8,    16(%[z1])       \n\t"
+            "lwc1   $f9,    272(%[z1])      \n\t"
+            "add.s  %[f4],  $f6,    $f7     \n\t"
+            "lwc1   $f10,   20(%[z1])       \n\t"
+            "lwc1   $f11,   276(%[z1])      \n\t"
+            "add.s  %[f5],  $f8,    $f9     \n\t"
+            "lwc1   $f12,   24(%[z1])       \n\t"
+            "lwc1   $f13,   280(%[z1])      \n\t"
+            "add.s  %[f6],  $f10,   $f11    \n\t"
+            "lwc1   $f14,   28(%[z1])       \n\t"
+            "lwc1   $f15,   284(%[z1])      \n\t"
+            "add.s  %[f7],  $f12,   $f13    \n\t"
+            "lwc1   $f0,    512(%[z1])      \n\t"
+            "lwc1   $f1,    516(%[z1])      \n\t"
+            "add.s  %[f8],  $f14,   $f15    \n\t"
+            "lwc1   $f2,    520(%[z1])      \n\t"
+            "add.s  %[f1],  %[f1],  $f0     \n\t"
+            "add.s  %[f2],  %[f2],  $f1     \n\t"
+            "lwc1   $f3,    524(%[z1])      \n\t"
+            "add.s  %[f3],  %[f3],  $f2     \n\t"
+            "lwc1   $f4,    528(%[z1])      \n\t"
+            "lwc1   $f5,    532(%[z1])      \n\t"
+            "add.s  %[f4],  %[f4],  $f3     \n\t"
+            "lwc1   $f6,    536(%[z1])      \n\t"
+            "add.s  %[f5],  %[f5],  $f4     \n\t"
+            "add.s  %[f6],  %[f6],  $f5     \n\t"
+            "lwc1   $f7,    540(%[z1])      \n\t"
+            "add.s  %[f7],  %[f7],  $f6     \n\t"
+            "lwc1   $f0,    768(%[z1])      \n\t"
+            "lwc1   $f1,    772(%[z1])      \n\t"
+            "add.s  %[f8],  %[f8],  $f7     \n\t"
+            "lwc1   $f2,    776(%[z1])      \n\t"
+            "add.s  %[f1],  %[f1],  $f0     \n\t"
+            "add.s  %[f2],  %[f2],  $f1     \n\t"
+            "lwc1   $f3,    780(%[z1])      \n\t"
+            "add.s  %[f3],  %[f3],  $f2     \n\t"
+            "lwc1   $f4,    784(%[z1])      \n\t"
+            "lwc1   $f5,    788(%[z1])      \n\t"
+            "add.s  %[f4],  %[f4],  $f3     \n\t"
+            "lwc1   $f6,    792(%[z1])      \n\t"
+            "add.s  %[f5],  %[f5],  $f4     \n\t"
+            "add.s  %[f6],  %[f6],  $f5     \n\t"
+            "lwc1   $f7,    796(%[z1])      \n\t"
+            "add.s  %[f7],  %[f7],  $f6     \n\t"
+            "lwc1   $f0,    1024(%[z1])     \n\t"
+            "lwc1   $f1,    1028(%[z1])     \n\t"
+            "add.s  %[f8],  %[f8],  $f7     \n\t"
+            "lwc1   $f2,    1032(%[z1])     \n\t"
+            "add.s  %[f1],  %[f1],  $f0     \n\t"
+            "add.s  %[f2],  %[f2],  $f1     \n\t"
+            "lwc1   $f3,    1036(%[z1])     \n\t"
+            "add.s  %[f3],  %[f3],  $f2     \n\t"
+            "lwc1   $f4,    1040(%[z1])     \n\t"
+            "lwc1   $f5,    1044(%[z1])     \n\t"
+            "add.s  %[f4],  %[f4],  $f3     \n\t"
+            "lwc1   $f6,    1048(%[z1])     \n\t"
+            "add.s  %[f5],  %[f5],  $f4     \n\t"
+            "add.s  %[f6],  %[f6],  $f5     \n\t"
+            "lwc1   $f7,    1052(%[z1])     \n\t"
+            "add.s  %[f7],  %[f7],  $f6     \n\t"
+            "swc1   %[f1],  0(%[z1])        \n\t"
+            "swc1   %[f2],  4(%[z1])        \n\t"
+            "add.s  %[f8],  %[f8],  $f7     \n\t"
+            "swc1   %[f3],  8(%[z1])        \n\t"
+            "swc1   %[f4],  12(%[z1])       \n\t"
+            "swc1   %[f5],  16(%[z1])       \n\t"
+            "swc1   %[f6],  20(%[z1])       \n\t"
+            "swc1   %[f7],  24(%[z1])       \n\t"
+            "swc1   %[f8],  28(%[z1])       \n\t"
+            /* $f0-$f15 are named directly in the template, hence the clobbers */
+            : [f1]"=&f"(f1), [f2]"=&f"(f2), [f3]"=&f"(f3),
+              [f4]"=&f"(f4), [f5]"=&f"(f5), [f6]"=&f"(f6),
+              [f7]"=&f"(f7), [f8]"=&f"(f8)
+            : [z1]"r"(z1)
+            : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5",
+              "$f6", "$f7", "$f8", "$f9", "$f10", "$f11",
+              "$f12", "$f13", "$f14", "$f15",
+              "memory"
+        );
+    }
+}
+
+static float sbr_sum_square_mips(float (*x)[2], int n)
+{   /* Returns the sum of squares over the first 2 * (n >> 1) entries of x. */
+    float sum0 = 0.0f, sum1 = 0.0f; /* squares of x[i][0] / of x[i][1] */
+    float *p_x, *loop_end;
+    float temp0, temp1, temp2, temp3;
+    if (n < 4) /* asm loop exits only on p_x == loop_end: needs >= 2 pairs */
+        return n < 2 ? 0.0f : (x[0][0] * x[0][0] + x[1][0] * x[1][0]) +
+                              (x[0][1] * x[0][1] + x[1][1] * x[1][1]);
+    p_x = &x[0][0];
+    loop_end = p_x + (n >> 1)*4 - 4; /* address of the last group of 4 floats */
+    __asm__ volatile (
+        ".set      push                                             \n\t"
+        ".set      noreorder                                        \n\t"
+        "lwc1      %[temp0],   0(%[p_x])                            \n\t"
+        "lwc1      %[temp1],   4(%[p_x])                            \n\t"
+        "lwc1      %[temp2],   8(%[p_x])                            \n\t"
+        "lwc1      %[temp3],   12(%[p_x])                           \n\t"
+    "1:                                                             \n\t"
+        PTR_ADDIU "%[p_x],     %[p_x],       16                     \n\t"
+        "madd.s    %[sum0],    %[sum0],      %[temp0],   %[temp0]   \n\t"
+        "lwc1      %[temp0],   0(%[p_x])                            \n\t"
+        "madd.s    %[sum1],    %[sum1],      %[temp1],   %[temp1]   \n\t"
+        "lwc1      %[temp1],   4(%[p_x])                            \n\t"
+        "madd.s    %[sum0],    %[sum0],      %[temp2],   %[temp2]   \n\t"
+        "lwc1      %[temp2],   8(%[p_x])                            \n\t"
+        "madd.s    %[sum1],    %[sum1],      %[temp3],   %[temp3]   \n\t"
+        "bne       %[p_x],     %[loop_end],  1b                     \n\t"
+        " lwc1     %[temp3],   12(%[p_x])                           \n\t" /* delay slot */
+        "madd.s    %[sum0],    %[sum0],      %[temp0],   %[temp0]   \n\t"
+        "madd.s    %[sum1],    %[sum1],      %[temp1],   %[temp1]   \n\t"
+        "madd.s    %[sum0],    %[sum0],      %[temp2],   %[temp2]   \n\t"
+        "madd.s    %[sum1],    %[sum1],      %[temp3],   %[temp3]   \n\t"
+        ".set      pop                                              \n\t"
+        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+          [temp3]"=&f"(temp3), [sum0]"+f"(sum0), [sum1]"+f"(sum1),
+          [p_x]"+r"(p_x)
+        : [loop_end]"r"(loop_end)
+        : "memory"
+    );
+    return sum0 + sum1;
+}
+
+static void sbr_qmf_deint_bfly_mips(float *v, const float *src0, const float *src1)
+{   /* i = 0..63: v[i] = src0[i] - src1[63-i]; v[127-i] = src0[i] + src1[63-i] */
+    int i;
+    float temp0, temp1, temp2, temp3, temp4, temp5;
+    float temp6, temp7, temp8, temp9, temp10, temp11;
+    float *v0 = v;                      /* walks upwards from v[0] */
+    float *v1 = &v[127];                /* walks downwards from v[127] */
+    float *psrc0 = (float*)src0;        /* walks upwards from src0[0] */
+    float *psrc1 = (float*)&src1[63];   /* walks downwards from src1[63] */
+
+    for (i = 0; i < 4; i++) {
+        /* four passes of 16 elements each cover all 64 input pairs */
+        /* loop unrolled 16 times, in four groups of four sum/difference pairs */
+        __asm__ volatile(
+            "lwc1       %[temp0],   0(%[src0])             \n\t"
+            "lwc1       %[temp1],   0(%[src1])             \n\t"
+            "lwc1       %[temp3],   4(%[src0])             \n\t"
+            "lwc1       %[temp4],   -4(%[src1])            \n\t"
+            "lwc1       %[temp6],   8(%[src0])             \n\t"
+            "lwc1       %[temp7],   -8(%[src1])            \n\t"
+            "lwc1       %[temp9],   12(%[src0])            \n\t"
+            "lwc1       %[temp10],  -12(%[src1])           \n\t"
+            "add.s      %[temp2],   %[temp0],   %[temp1]   \n\t"
+            "add.s      %[temp5],   %[temp3],   %[temp4]   \n\t"
+            "add.s      %[temp8],   %[temp6],   %[temp7]   \n\t"
+            "add.s      %[temp11],  %[temp9],   %[temp10]  \n\t"
+            "sub.s      %[temp0],   %[temp0],   %[temp1]   \n\t"
+            "sub.s      %[temp3],   %[temp3],   %[temp4]   \n\t"
+            "sub.s      %[temp6],   %[temp6],   %[temp7]   \n\t"
+            "sub.s      %[temp9],   %[temp9],   %[temp10]  \n\t"
+            "swc1       %[temp2],   0(%[v1])               \n\t"
+            "swc1       %[temp0],   0(%[v0])               \n\t"
+            "swc1       %[temp5],   -4(%[v1])              \n\t"
+            "swc1       %[temp3],   4(%[v0])               \n\t"
+            "swc1       %[temp8],   -8(%[v1])              \n\t"
+            "swc1       %[temp6],   8(%[v0])               \n\t"
+            "swc1       %[temp11],  -12(%[v1])             \n\t"
+            "swc1       %[temp9],   12(%[v0])              \n\t"
+            "lwc1       %[temp0],   16(%[src0])            \n\t"
+            "lwc1       %[temp1],   -16(%[src1])           \n\t"
+            "lwc1       %[temp3],   20(%[src0])            \n\t"
+            "lwc1       %[temp4],   -20(%[src1])           \n\t"
+            "lwc1       %[temp6],   24(%[src0])            \n\t"
+            "lwc1       %[temp7],   -24(%[src1])           \n\t"
+            "lwc1       %[temp9],   28(%[src0])            \n\t"
+            "lwc1       %[temp10],  -28(%[src1])           \n\t"
+            "add.s      %[temp2],   %[temp0],   %[temp1]   \n\t"
+            "add.s      %[temp5],   %[temp3],   %[temp4]   \n\t"
+            "add.s      %[temp8],   %[temp6],   %[temp7]   \n\t"
+            "add.s      %[temp11],  %[temp9],   %[temp10]  \n\t"
+            "sub.s      %[temp0],   %[temp0],   %[temp1]   \n\t"
+            "sub.s      %[temp3],   %[temp3],   %[temp4]   \n\t"
+            "sub.s      %[temp6],   %[temp6],   %[temp7]   \n\t"
+            "sub.s      %[temp9],   %[temp9],   %[temp10]  \n\t"
+            "swc1       %[temp2],   -16(%[v1])             \n\t"
+            "swc1       %[temp0],   16(%[v0])              \n\t"
+            "swc1       %[temp5],   -20(%[v1])             \n\t"
+            "swc1       %[temp3],   20(%[v0])              \n\t"
+            "swc1       %[temp8],   -24(%[v1])             \n\t"
+            "swc1       %[temp6],   24(%[v0])              \n\t"
+            "swc1       %[temp11],  -28(%[v1])             \n\t"
+            "swc1       %[temp9],   28(%[v0])              \n\t"
+            "lwc1       %[temp0],   32(%[src0])            \n\t"
+            "lwc1       %[temp1],   -32(%[src1])           \n\t"
+            "lwc1       %[temp3],   36(%[src0])            \n\t"
+            "lwc1       %[temp4],   -36(%[src1])           \n\t"
+            "lwc1       %[temp6],   40(%[src0])            \n\t"
+            "lwc1       %[temp7],   -40(%[src1])           \n\t"
+            "lwc1       %[temp9],   44(%[src0])            \n\t"
+            "lwc1       %[temp10],  -44(%[src1])           \n\t"
+            "add.s      %[temp2],   %[temp0],   %[temp1]   \n\t"
+            "add.s      %[temp5],   %[temp3],   %[temp4]   \n\t"
+            "add.s      %[temp8],   %[temp6],   %[temp7]   \n\t"
+            "add.s      %[temp11],  %[temp9],   %[temp10]  \n\t"
+            "sub.s      %[temp0],   %[temp0],   %[temp1]   \n\t"
+            "sub.s      %[temp3],   %[temp3],   %[temp4]   \n\t"
+            "sub.s      %[temp6],   %[temp6],   %[temp7]   \n\t"
+            "sub.s      %[temp9],   %[temp9],   %[temp10]  \n\t"
+            "swc1       %[temp2],   -32(%[v1])             \n\t"
+            "swc1       %[temp0],   32(%[v0])              \n\t"
+            "swc1       %[temp5],   -36(%[v1])             \n\t"
+            "swc1       %[temp3],   36(%[v0])              \n\t"
+            "swc1       %[temp8],   -40(%[v1])             \n\t"
+            "swc1       %[temp6],   40(%[v0])              \n\t"
+            "swc1       %[temp11],  -44(%[v1])             \n\t"
+            "swc1       %[temp9],   44(%[v0])              \n\t"
+            "lwc1       %[temp0],   48(%[src0])            \n\t"
+            "lwc1       %[temp1],   -48(%[src1])           \n\t"
+            "lwc1       %[temp3],   52(%[src0])            \n\t"
+            "lwc1       %[temp4],   -52(%[src1])           \n\t"
+            "lwc1       %[temp6],   56(%[src0])            \n\t"
+            "lwc1       %[temp7],   -56(%[src1])           \n\t"
+            "lwc1       %[temp9],   60(%[src0])            \n\t"
+            "lwc1       %[temp10],  -60(%[src1])           \n\t"
+            "add.s      %[temp2],   %[temp0],   %[temp1]   \n\t"
+            "add.s      %[temp5],   %[temp3],   %[temp4]   \n\t"
+            "add.s      %[temp8],   %[temp6],   %[temp7]   \n\t"
+            "add.s      %[temp11],  %[temp9],   %[temp10]  \n\t"
+            "sub.s      %[temp0],   %[temp0],   %[temp1]   \n\t"
+            "sub.s      %[temp3],   %[temp3],   %[temp4]   \n\t"
+            "sub.s      %[temp6],   %[temp6],   %[temp7]   \n\t"
+            "sub.s      %[temp9],   %[temp9],   %[temp10]  \n\t"
+            "swc1       %[temp2],   -48(%[v1])             \n\t"
+            "swc1       %[temp0],   48(%[v0])              \n\t"
+            "swc1       %[temp5],   -52(%[v1])             \n\t"
+            "swc1       %[temp3],   52(%[v0])              \n\t"
+            "swc1       %[temp8],   -56(%[v1])             \n\t"
+            "swc1       %[temp6],   56(%[v0])              \n\t"
+            "swc1       %[temp11],  -60(%[v1])             \n\t"
+            "swc1       %[temp9],   60(%[v0])              \n\t"
+            PTR_ADDIU " %[src0],    %[src0],    64         \n\t"
+            PTR_ADDIU " %[src1],    %[src1],    -64        \n\t"
+            PTR_ADDIU " %[v0],      %[v0],      64         \n\t"
+            PTR_ADDIU " %[v1],      %[v1],      -64        \n\t"
+            /* the four pointers are read-write so the next pass continues on */
+            : [v0]"+r"(v0), [v1]"+r"(v1), [src0]"+r"(psrc0), [src1]"+r"(psrc1),
+              [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+              [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
+              [temp9]"=&f"(temp9), [temp10]"=&f"(temp10), [temp11]"=&f"(temp11)
+            :
+            :"memory"
+        );
+    }
+}
+
+static void sbr_autocorrelate_mips(const float x[40][2], float phi[3][2][2])
+{   /* Correlates x[i] with itself, x[i+1] and x[i+2]; stores 8 floats in phi. */
+    int i;
+    float real_sum_0 = 0.0f; /* sum of x[i][0]^2 + x[i][1]^2 */
+    float real_sum_1 = 0.0f; /* sum of x[i][0]*x[i+1][0] + x[i][1]*x[i+1][1] */
+    float real_sum_2 = 0.0f; /* sum of x[i][0]*x[i+2][0] + x[i][1]*x[i+2][1] */
+    float imag_sum_1 = 0.0f; /* sum of x[i][0]*x[i+1][1] - x[i][1]*x[i+1][0] */
+    float imag_sum_2 = 0.0f; /* sum of x[i][0]*x[i+2][1] - x[i][1]*x[i+2][0] */
+    float *p_x, *p_phi;
+    float temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+    float temp7, temp_r, temp_r1, temp_r2, temp_r3, temp_r4;
+    p_x = (float*)&x[0][0];
+    p_phi = &phi[0][0][0];
+    /* first statement: term i = 1 only, then p_x moves on by one sample */
+    __asm__ volatile (
+        "lwc1    %[temp0],      8(%[p_x])                           \n\t"
+        "lwc1    %[temp1],      12(%[p_x])                          \n\t"
+        "lwc1    %[temp2],      16(%[p_x])                          \n\t"
+        "lwc1    %[temp3],      20(%[p_x])                          \n\t"
+        "lwc1    %[temp4],      24(%[p_x])                          \n\t"
+        "lwc1    %[temp5],      28(%[p_x])                          \n\t"
+        "mul.s   %[temp_r],     %[temp1],      %[temp1]             \n\t"
+        "mul.s   %[temp_r1],    %[temp1],      %[temp3]             \n\t"
+        "mul.s   %[temp_r2],    %[temp1],      %[temp2]             \n\t"
+        "mul.s   %[temp_r3],    %[temp1],      %[temp5]             \n\t"
+        "mul.s   %[temp_r4],    %[temp1],      %[temp4]             \n\t"
+        "madd.s  %[temp_r],     %[temp_r],     %[temp0],  %[temp0]  \n\t"
+        "madd.s  %[temp_r1],    %[temp_r1],    %[temp0],  %[temp2]  \n\t"
+        "msub.s  %[temp_r2],    %[temp_r2],    %[temp0],  %[temp3]  \n\t"
+        "madd.s  %[temp_r3],    %[temp_r3],    %[temp0],  %[temp4]  \n\t"
+        "msub.s  %[temp_r4],    %[temp_r4],    %[temp0],  %[temp5]  \n\t"
+        "add.s   %[real_sum_0], %[real_sum_0], %[temp_r]            \n\t"
+        "add.s   %[real_sum_1], %[real_sum_1], %[temp_r1]           \n\t"
+        "add.s   %[imag_sum_1], %[imag_sum_1], %[temp_r2]           \n\t"
+        "add.s   %[real_sum_2], %[real_sum_2], %[temp_r3]           \n\t"
+        "add.s   %[imag_sum_2], %[imag_sum_2], %[temp_r4]           \n\t"
+        PTR_ADDIU "%[p_x],      %[p_x],        8                    \n\t"
+        /* madd.s d,a,b,c yields b*c + a; msub.s d,a,b,c yields b*c - a */
+        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+          [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+          [real_sum_0]"+f"(real_sum_0), [real_sum_1]"+f"(real_sum_1),
+          [imag_sum_1]"+f"(imag_sum_1), [real_sum_2]"+f"(real_sum_2),
+          [temp_r]"=&f"(temp_r), [temp_r1]"=&f"(temp_r1), [temp_r2]"=&f"(temp_r2),
+          [temp_r3]"=&f"(temp_r3), [temp_r4]"=&f"(temp_r4),
+          [p_x]"+r"(p_x), [imag_sum_2]"+f"(imag_sum_2)
+        :
+        : "memory"
+    );
+    /* twelve rounds of three terms each: i = 2 .. 37 */
+    for (i = 0; i < 12; i++) {
+        __asm__ volatile (
+            "lwc1    %[temp0],      8(%[p_x])                           \n\t"
+            "lwc1    %[temp1],      12(%[p_x])                          \n\t"
+            "lwc1    %[temp2],      16(%[p_x])                          \n\t"
+            "lwc1    %[temp3],      20(%[p_x])                          \n\t"
+            "lwc1    %[temp4],      24(%[p_x])                          \n\t"
+            "lwc1    %[temp5],      28(%[p_x])                          \n\t"
+            "mul.s   %[temp_r],     %[temp1],      %[temp1]             \n\t"
+            "mul.s   %[temp_r1],    %[temp1],      %[temp3]             \n\t"
+            "mul.s   %[temp_r2],    %[temp1],      %[temp2]             \n\t"
+            "mul.s   %[temp_r3],    %[temp1],      %[temp5]             \n\t"
+            "mul.s   %[temp_r4],    %[temp1],      %[temp4]             \n\t"
+            "madd.s  %[temp_r],     %[temp_r],     %[temp0],  %[temp0]  \n\t"
+            "madd.s  %[temp_r1],    %[temp_r1],    %[temp0],  %[temp2]  \n\t"
+            "msub.s  %[temp_r2],    %[temp_r2],    %[temp0],  %[temp3]  \n\t"
+            "madd.s  %[temp_r3],    %[temp_r3],    %[temp0],  %[temp4]  \n\t"
+            "msub.s  %[temp_r4],    %[temp_r4],    %[temp0],  %[temp5]  \n\t"
+            "add.s   %[real_sum_0], %[real_sum_0], %[temp_r]            \n\t"
+            "add.s   %[real_sum_1], %[real_sum_1], %[temp_r1]           \n\t"
+            "add.s   %[imag_sum_1], %[imag_sum_1], %[temp_r2]           \n\t"
+            "add.s   %[real_sum_2], %[real_sum_2], %[temp_r3]           \n\t"
+            "add.s   %[imag_sum_2], %[imag_sum_2], %[temp_r4]           \n\t"
+            "lwc1    %[temp0],      32(%[p_x])                          \n\t" /* regs rotate: */
+            "lwc1    %[temp1],      36(%[p_x])                          \n\t" /* only the new sample is loaded */
+            "mul.s   %[temp_r],     %[temp3],      %[temp3]             \n\t"
+            "mul.s   %[temp_r1],    %[temp3],      %[temp5]             \n\t"
+            "mul.s   %[temp_r2],    %[temp3],      %[temp4]             \n\t"
+            "mul.s   %[temp_r3],    %[temp3],      %[temp1]             \n\t"
+            "mul.s   %[temp_r4],    %[temp3],      %[temp0]             \n\t"
+            "madd.s  %[temp_r],     %[temp_r],     %[temp2],  %[temp2]  \n\t"
+            "madd.s  %[temp_r1],    %[temp_r1],    %[temp2],  %[temp4]  \n\t"
+            "msub.s  %[temp_r2],    %[temp_r2],    %[temp2],  %[temp5]  \n\t"
+            "madd.s  %[temp_r3],    %[temp_r3],    %[temp2],  %[temp0]  \n\t"
+            "msub.s  %[temp_r4],    %[temp_r4],    %[temp2],  %[temp1]  \n\t"
+            "add.s   %[real_sum_0], %[real_sum_0], %[temp_r]            \n\t"
+            "add.s   %[real_sum_1], %[real_sum_1], %[temp_r1]           \n\t"
+            "add.s   %[imag_sum_1], %[imag_sum_1], %[temp_r2]           \n\t"
+            "add.s   %[real_sum_2], %[real_sum_2], %[temp_r3]           \n\t"
+            "add.s   %[imag_sum_2], %[imag_sum_2], %[temp_r4]           \n\t"
+            "lwc1    %[temp2],      40(%[p_x])                          \n\t"
+            "lwc1    %[temp3],      44(%[p_x])                          \n\t"
+            "mul.s   %[temp_r],     %[temp5],      %[temp5]             \n\t"
+            "mul.s   %[temp_r1],    %[temp5],      %[temp1]             \n\t"
+            "mul.s   %[temp_r2],    %[temp5],      %[temp0]             \n\t"
+            "mul.s   %[temp_r3],    %[temp5],      %[temp3]             \n\t"
+            "mul.s   %[temp_r4],    %[temp5],      %[temp2]             \n\t"
+            "madd.s  %[temp_r],     %[temp_r],     %[temp4],  %[temp4]  \n\t"
+            "madd.s  %[temp_r1],    %[temp_r1],    %[temp4],  %[temp0]  \n\t"
+            "msub.s  %[temp_r2],    %[temp_r2],    %[temp4],  %[temp1]  \n\t"
+            "madd.s  %[temp_r3],    %[temp_r3],    %[temp4],  %[temp2]  \n\t"
+            "msub.s  %[temp_r4],    %[temp_r4],    %[temp4],  %[temp3]  \n\t"
+            "add.s   %[real_sum_0], %[real_sum_0], %[temp_r]            \n\t"
+            "add.s   %[real_sum_1], %[real_sum_1], %[temp_r1]           \n\t"
+            "add.s   %[imag_sum_1], %[imag_sum_1], %[temp_r2]           \n\t"
+            "add.s   %[real_sum_2], %[real_sum_2], %[temp_r3]           \n\t"
+            "add.s   %[imag_sum_2], %[imag_sum_2], %[temp_r4]           \n\t"
+            PTR_ADDIU "%[p_x],      %[p_x],        24                   \n\t"
+
+            : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+              [real_sum_0]"+f"(real_sum_0), [real_sum_1]"+f"(real_sum_1),
+              [imag_sum_1]"+f"(imag_sum_1), [real_sum_2]"+f"(real_sum_2),
+              [temp_r]"=&f"(temp_r), [temp_r1]"=&f"(temp_r1),
+              [temp_r2]"=&f"(temp_r2), [temp_r3]"=&f"(temp_r3),
+              [temp_r4]"=&f"(temp_r4), [p_x]"+r"(p_x),
+              [imag_sum_2]"+f"(imag_sum_2)
+            :
+            : "memory"
+        );
+    }
+    __asm__ volatile ( /* p_x == &x[37][0] here; adds the edge terms, fills phi */
+        "lwc1    %[temp0],    -296(%[p_x])                        \n\t" /* x[0][0]  */
+        "lwc1    %[temp1],    -292(%[p_x])                        \n\t" /* x[0][1]  */
+        "lwc1    %[temp2],    8(%[p_x])                           \n\t" /* x[38][0] */
+        "lwc1    %[temp3],    12(%[p_x])                          \n\t" /* x[38][1] */
+        "lwc1    %[temp4],    -288(%[p_x])                        \n\t" /* x[1][0]  */
+        "lwc1    %[temp5],    -284(%[p_x])                        \n\t" /* x[1][1]  */
+        "lwc1    %[temp6],    -280(%[p_x])                        \n\t" /* x[2][0]  */
+        "lwc1    %[temp7],    -276(%[p_x])                        \n\t" /* x[2][1]  */
+        "madd.s  %[temp_r],   %[real_sum_0], %[temp0],  %[temp0]  \n\t"
+        "madd.s  %[temp_r1],  %[real_sum_0], %[temp2],  %[temp2]  \n\t"
+        "madd.s  %[temp_r2],  %[real_sum_1], %[temp0],  %[temp4]  \n\t"
+        "madd.s  %[temp_r3],  %[imag_sum_1], %[temp0],  %[temp5]  \n\t"
+        "madd.s  %[temp_r],   %[temp_r],     %[temp1],  %[temp1]  \n\t"
+        "madd.s  %[temp_r1],  %[temp_r1],    %[temp3],  %[temp3]  \n\t"
+        "madd.s  %[temp_r2],  %[temp_r2],    %[temp1],  %[temp5]  \n\t"
+        "nmsub.s  %[temp_r3], %[temp_r3],    %[temp1],  %[temp4]  \n\t"
+        "lwc1    %[temp4],    16(%[p_x])                          \n\t" /* x[39][0] */
+        "lwc1    %[temp5],    20(%[p_x])                          \n\t" /* x[39][1] */
+        "swc1    %[temp_r],   40(%[p_phi])                        \n\t" /* phi[2][1][0] */
+        "swc1    %[temp_r1],  16(%[p_phi])                        \n\t" /* phi[1][0][0] */
+        "swc1    %[temp_r2],  24(%[p_phi])                        \n\t" /* phi[1][1][0] */
+        "swc1    %[temp_r3],  28(%[p_phi])                        \n\t" /* phi[1][1][1] */
+        "madd.s  %[temp_r],   %[real_sum_1], %[temp2],  %[temp4]  \n\t"
+        "madd.s  %[temp_r1],  %[imag_sum_1], %[temp2],  %[temp5]  \n\t"
+        "madd.s  %[temp_r2],  %[real_sum_2], %[temp0],  %[temp6]  \n\t"
+        "madd.s  %[temp_r3],  %[imag_sum_2], %[temp0],  %[temp7]  \n\t"
+        "madd.s  %[temp_r],   %[temp_r],     %[temp3],  %[temp5]  \n\t"
+        "nmsub.s %[temp_r1],  %[temp_r1],    %[temp3],  %[temp4]  \n\t"
+        "madd.s  %[temp_r2],  %[temp_r2],    %[temp1],  %[temp7]  \n\t"
+        "nmsub.s %[temp_r3],  %[temp_r3],    %[temp1],  %[temp6]  \n\t"
+        "swc1    %[temp_r],   0(%[p_phi])                         \n\t" /* phi[0][0][0] */
+        "swc1    %[temp_r1],  4(%[p_phi])                         \n\t" /* phi[0][0][1] */
+        "swc1    %[temp_r2],  8(%[p_phi])                         \n\t" /* phi[0][1][0] */
+        "swc1    %[temp_r3],  12(%[p_phi])                        \n\t" /* phi[0][1][1] */
+        /* nmsub.s d,a,b,c yields a - b*c */
+        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+          [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+          [temp6]"=&f"(temp6), [temp7]"=&f"(temp7), [temp_r]"=&f"(temp_r),
+          [real_sum_0]"+f"(real_sum_0), [real_sum_1]"+f"(real_sum_1),
+          [real_sum_2]"+f"(real_sum_2), [imag_sum_1]"+f"(imag_sum_1),
+          [temp_r2]"=&f"(temp_r2), [temp_r3]"=&f"(temp_r3),
+          [temp_r1]"=&f"(temp_r1), [p_phi]"+r"(p_phi),
+          [imag_sum_2]"+f"(imag_sum_2)
+        : [p_x]"r"(p_x)
+        : "memory"
+    );
+}
+
+static void sbr_hf_gen_mips(float (*X_high)[2], const float (*X_low)[2],
+                         const float alpha0[2], const float alpha1[2],
+                         float bw, int start, int end)
+{   /* X_high[i] = X_low[i] + a0*X_low[i-1] + a1*X_low[i-2], start <= i < end */
+    float alpha[4]; /* a1 = alpha1 * bw^2 in [0..1], a0 = alpha0 * bw in [2..3] */
+    int i;
+    float *p_x_low = (float*)&X_low[0][0] + 2*start;
+    float *p_x_high = &X_high[0][0] + 2*start;
+    float temp0, temp1, temp2, temp3, temp5, temp6;
+    float temp7, temp8, temp9, temp10, temp11, temp12;
+    /* the products above are complex: [0] is the real part, [1] the imaginary */
+    alpha[0] = alpha1[0] * bw * bw;
+    alpha[1] = alpha1[1] * bw * bw;
+    alpha[2] = alpha0[0] * bw;
+    alpha[3] = alpha0[1] * bw;
+    /* each pass reads X_low[i - 2] .. X_low[i] and writes X_high[i] */
+    for (i = start; i < end; i++) {
+        __asm__ volatile (
+            "lwc1    %[temp0],    -16(%[p_x_low])                        \n\t"
+            "lwc1    %[temp1],    -12(%[p_x_low])                        \n\t"
+            "lwc1    %[temp2],    -8(%[p_x_low])                         \n\t"
+            "lwc1    %[temp3],    -4(%[p_x_low])                         \n\t"
+            "lwc1    %[temp5],    0(%[p_x_low])                          \n\t"
+            "lwc1    %[temp6],    4(%[p_x_low])                          \n\t"
+            "lwc1    %[temp7],    0(%[alpha])                            \n\t"
+            "lwc1    %[temp8],    4(%[alpha])                            \n\t"
+            "lwc1    %[temp9],    8(%[alpha])                            \n\t"
+            "lwc1    %[temp10],   12(%[alpha])                           \n\t"
+            PTR_ADDIU "%[p_x_high], %[p_x_high],   8                     \n\t"
+            PTR_ADDIU "%[p_x_low],  %[p_x_low],    8                     \n\t"
+            "mul.s   %[temp11],   %[temp1],        %[temp8]              \n\t"
+            "msub.s  %[temp11],   %[temp11],       %[temp0],  %[temp7]   \n\t"
+            "madd.s  %[temp11],   %[temp11],       %[temp2],  %[temp9]   \n\t"
+            "nmsub.s %[temp11],   %[temp11],       %[temp3],  %[temp10]  \n\t"
+            "add.s   %[temp11],   %[temp11],       %[temp5]              \n\t"
+            "swc1    %[temp11],   -8(%[p_x_high])                        \n\t"
+            "mul.s   %[temp12],   %[temp1],        %[temp7]              \n\t"
+            "madd.s  %[temp12],   %[temp12],       %[temp0],  %[temp8]   \n\t"
+            "madd.s  %[temp12],   %[temp12],       %[temp3],  %[temp9]   \n\t"
+            "madd.s  %[temp12],   %[temp12],       %[temp2],  %[temp10]  \n\t"
+            "add.s   %[temp12],   %[temp12],       %[temp6]              \n\t"
+            "swc1    %[temp12],   -4(%[p_x_high])                        \n\t"
+            /* pointers are bumped before the stores, hence offsets -8 and -4 */
+            : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&f"(temp3), [temp5]"=&f"(temp5), [temp6]"=&f"(temp6),
+              [temp7]"=&f"(temp7), [temp8]"=&f"(temp8), [temp9]"=&f"(temp9),
+              [temp10]"=&f"(temp10), [temp11]"=&f"(temp11),
+              [temp12]"=&f"(temp12), [p_x_high]"+r"(p_x_high),
+              [p_x_low]"+r"(p_x_low)
+            : [alpha]"r"(alpha)
+            : "memory"
+        );
+    }
+}
+
+static void sbr_hf_g_filt_mips(float (*Y)[2], const float (*X_high)[40][2],
+                            const float *g_filt, int m_max, intptr_t ixh)
+{   /* Y[m][c] = X_high[m][ixh][c] * g_filt[m] for m = 0..m_max-1, c = 0..1 */
+    const float *p_x, *p_g, *loop_end;
+    float *p_y;
+    float temp0, temp1, temp2;
+    if (m_max <= 0) /* the asm loop tests its exit only after a full pass */
+        return;
+    p_g = &g_filt[0];
+    p_y = &Y[0][0];
+    p_x = &X_high[0][ixh][0];
+    loop_end = p_g + m_max;
+    __asm__ volatile(
+        ".set    push                                \n\t"
+        ".set    noreorder                           \n\t"
+    "1:                                              \n\t"
+        "lwc1    %[temp0],   0(%[p_g])               \n\t"
+        "lwc1    %[temp1],   0(%[p_x])               \n\t"
+        "lwc1    %[temp2],   4(%[p_x])               \n\t"
+        "mul.s   %[temp1],   %[temp1],     %[temp0]  \n\t"
+        "mul.s   %[temp2],   %[temp2],     %[temp0]  \n\t"
+        PTR_ADDIU "%[p_g],   %[p_g],       4         \n\t"
+        PTR_ADDIU "%[p_x],   %[p_x],       320       \n\t"
+        "swc1    %[temp1],   0(%[p_y])               \n\t"
+        "swc1    %[temp2],   4(%[p_y])               \n\t"
+        "bne     %[p_g],     %[loop_end],  1b        \n\t"
+        PTR_ADDIU "%[p_y],   %[p_y],       8         \n\t"
+        ".set    pop                                 \n\t"
+        /* 320 bytes is one float[40][2] row; the p_y bump sits in the delay slot */
+        : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1),
+          [temp2]"=&f"(temp2), [p_x]"+r"(p_x),
+          [p_y]"+r"(p_y), [p_g]"+r"(p_g)
+        : [loop_end]"r"(loop_end)
+        : "memory"
+    );
+}
+
+static void sbr_hf_apply_noise_0_mips(float (*Y)[2], const float *s_m,
+                                 const float *q_filt, int noise,
+                                 int kx, int m_max)
+{   /* Y[m][0] += s_m[m]; an all-zero-bits s_m[m] also adds scaled noise to Y[m] */
+    int m;
+    /* noise becomes (noise + 1) & 0x1ff before every lookup; kx is not used */
+    for (m = 0; m < m_max; m++){
+        /* table entries are 8 bytes apart (noise << 3); two floats are read */
+        float *Y1=&Y[m][0];
+        float *ff_table;
+        float y0,y1, temp1, temp2, temp4, temp5;
+        int temp0, temp3;
+        const float *s_m1=&s_m[m];
+        const float *q_filt1= &q_filt[m];
+        /* noise is bound read-write because the template advances it in place */
+        __asm__ volatile(
+            "lwc1    %[y0],       0(%[Y1])                                    \n\t"
+            "lwc1    %[temp1],    0(%[s_m1])                                  \n\t"
+            "addiu   %[noise],    %[noise],              1                    \n\t"
+            "andi    %[noise],    %[noise],              0x1ff                \n\t"
+            "sll     %[temp0],    %[noise], 3                                 \n\t"
+            PTR_ADDU "%[ff_table],%[ff_sbr_noise_table], %[temp0]             \n\t"
+            "add.s   %[y0],       %[y0],                 %[temp1]             \n\t"
+            "mfc1    %[temp3],    %[temp1]                                    \n\t"
+            "bne     %[temp3],    $0,                    1f                   \n\t"
+            "lwc1    %[y1],       4(%[Y1])                                    \n\t"
+            "lwc1    %[temp2],    0(%[q_filt1])                               \n\t"
+            "lwc1    %[temp4],    0(%[ff_table])                              \n\t"
+            "lwc1    %[temp5],    4(%[ff_table])                              \n\t"
+            "madd.s  %[y0],       %[y0],                 %[temp2],  %[temp4]  \n\t"
+            "madd.s  %[y1],       %[y1],                 %[temp2],  %[temp5]  \n\t"
+            "swc1    %[y1],       4(%[Y1])                                    \n\t"
+        "1:                                                                   \n\t"
+            "swc1    %[y0],       0(%[Y1])                                    \n\t"
+            : [ff_table]"=&r"(ff_table), [y0]"=&f"(y0), [y1]"=&f"(y1),
+              [temp0]"=&r"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&r"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+              [noise]"+&r"(noise)
+            : [ff_sbr_noise_table]"r"(ff_sbr_noise_table),
+              [Y1]"r"(Y1), [s_m1]"r"(s_m1), [q_filt1]"r"(q_filt1)
+            : "memory"
+        );
+    }
+}
+
+static void sbr_hf_apply_noise_1_mips(float (*Y)[2], const float *s_m,
+                                 const float *q_filt, int noise,
+                                 int kx, int m_max)
+{   /* Y[m][1] += s_m[m] * phi_sign; all-zero-bits s_m[m] adds scaled noise too */
+    float y0,y1,temp1, temp2, temp4, temp5;
+    int temp0, temp3, m;
+    float phi_sign = 1 - 2 * (kx & 1); /* +1 for even kx, -1 for odd kx */
+    /* noise becomes (noise + 1) & 0x1ff before every lookup */
+    for (m = 0; m < m_max; m++) {
+        /* table entries are 8 bytes apart (noise << 3); two floats are read */
+        float *ff_table;
+        float *Y1=&Y[m][0];
+        const float *s_m1=&s_m[m];
+        const float *q_filt1= &q_filt[m];
+        /* noise is bound read-write because the template advances it in place */
+        __asm__ volatile(
+            "lwc1   %[y1],       4(%[Y1])                                     \n\t"
+            "lwc1   %[temp1],    0(%[s_m1])                                   \n\t"
+            "lw     %[temp3],    0(%[s_m1])                                   \n\t"
+            "addiu  %[noise],    %[noise],               1                    \n\t"
+            "andi   %[noise],    %[noise],               0x1ff                \n\t"
+            "sll    %[temp0],    %[noise],               3                    \n\t"
+            PTR_ADDU "%[ff_table],%[ff_sbr_noise_table],%[temp0]              \n\t"
+            "madd.s %[y1],       %[y1],                 %[temp1], %[phi_sign] \n\t"
+            "bne    %[temp3],    $0,                    1f                    \n\t"
+            "lwc1   %[y0],       0(%[Y1])                                     \n\t"
+            "lwc1   %[temp2],    0(%[q_filt1])                                \n\t"
+            "lwc1   %[temp4],    0(%[ff_table])                               \n\t"
+            "lwc1   %[temp5],    4(%[ff_table])                               \n\t"
+            "madd.s %[y0],       %[y0],                 %[temp2], %[temp4]    \n\t"
+            "madd.s %[y1],       %[y1],                 %[temp2], %[temp5]    \n\t"
+            "swc1   %[y0],       0(%[Y1])                                     \n\t"
+        "1:                                                                   \n\t"
+            "swc1   %[y1],       4(%[Y1])                                     \n\t"
+            : [ff_table] "=&r" (ff_table), [y0] "=&f" (y0), [y1] "=&f" (y1),
+              [temp0] "=&r" (temp0), [temp1] "=&f" (temp1), [temp2] "=&f" (temp2),
+              [temp3] "=&r" (temp3), [temp4] "=&f" (temp4), [temp5] "=&f" (temp5),
+              [noise] "+&r" (noise)
+            : [ff_sbr_noise_table] "r" (ff_sbr_noise_table),
+              [Y1] "r" (Y1), [s_m1] "r" (s_m1), [q_filt1] "r" (q_filt1),
+              [phi_sign] "f" (phi_sign)
+            : "memory"
+        );
+        phi_sign = -phi_sign; /* sign alternates from one m to the next */
+    }
+}
+
+/* SBR noise pass, variant 2: Y[m][0] -= s_m[m]; if the bits of s_m[m] are all
+ * zero, Y[m][0..1] += q_filt[m] * ff_sbr_noise_table[noise][0..1] is applied
+ * too. noise is advanced to (noise + 1) & 0x1ff before each use; kx is unused. */
+static void sbr_hf_apply_noise_2_mips(float (*Y)[2], const float *s_m,
+                                 const float *q_filt, int noise, int kx, int m_max)
+{
+    int m, temp0, temp3;
+    float *ff_table;
+    float y0,y1, temp1, temp2, temp4, temp5;
+    for (m = 0; m < m_max; m++) {
+        float *Y1=&Y[m][0];
+        const float *s_m1=&s_m[m];
+        const float *q_filt1= &q_filt[m];
+        __asm__ volatile(
+            "lwc1   %[y0],       0(%[Y1])                                  \n\t"
+            "lwc1   %[temp1],    0(%[s_m1])                                \n\t"
+            "addiu  %[noise],    %[noise],              1                  \n\t"
+            "andi   %[noise],    %[noise],              0x1ff              \n\t"
+            "sll    %[temp0],    %[noise],              3                  \n\t"
+            PTR_ADDU "%[ff_table],%[ff_sbr_noise_table],%[temp0]           \n\t"
+            "sub.s  %[y0],       %[y0],                 %[temp1]           \n\t"
+            "mfc1   %[temp3],    %[temp1]                                  \n\t"
+            "bne    %[temp3],    $0,                    1f                 \n\t"
+            "lwc1   %[y1],       4(%[Y1])                                  \n\t"
+            "lwc1   %[temp2],    0(%[q_filt1])                             \n\t"
+            "lwc1   %[temp4],    0(%[ff_table])                            \n\t"
+            "lwc1   %[temp5],    4(%[ff_table])                            \n\t"
+            "madd.s %[y0],       %[y0],                 %[temp2], %[temp4] \n\t"
+            "madd.s %[y1],       %[y1],                 %[temp2], %[temp5] \n\t"
+            "swc1   %[y1],       4(%[Y1])                                  \n\t"
+        "1:                                                                \n\t"
+            "swc1   %[y0],       0(%[Y1])                                  \n\t"
+            /* noise is modified in the asm ("+r"); temp0/temp3 live in GPRs, hence int. */
+            : [temp0]"=&r"(temp0), [ff_table]"=&r"(ff_table), [y0]"=&f"(y0),
+              [y1]"=&f"(y1), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&r"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+              [noise]"+r"(noise)
+            : [ff_sbr_noise_table]"r"(ff_sbr_noise_table), [Y1]"r"(Y1),
+              [s_m1]"r"(s_m1), [q_filt1]"r"(q_filt1)
+            : "memory"
+        );
+    }
+}
+
+/* SBR noise pass, variant 3: Y[m][1] -= s_m[m] * phi_sign, where phi_sign starts
+ * at 1 - 2 * (kx & 1) and flips each m; if the bits of s_m[m] are all zero,
+ * Y[m][0..1] += q_filt[m] * ff_sbr_noise_table[noise][0..1] is applied too. */
+static void sbr_hf_apply_noise_3_mips(float (*Y)[2], const float *s_m,
+                                 const float *q_filt, int noise, int kx, int m_max)
+{
+    float phi_sign = 1 - 2 * (kx & 1);
+    int m;
+    for (m = 0; m < m_max; m++) {
+        float *Y1=&Y[m][0];
+        float *ff_table;
+        float y0,y1, temp1, temp2, temp4, temp5;
+        int temp0, temp3;
+        const float *s_m1=&s_m[m];
+        const float *q_filt1= &q_filt[m];
+        /* The asm first advances noise to (noise + 1) & 0x1ff, then indexes 8-byte entries. */
+        __asm__ volatile(
+            "lwc1    %[y1],       4(%[Y1])                                     \n\t"
+            "lwc1    %[temp1],    0(%[s_m1])                                   \n\t"
+            "addiu   %[noise],    %[noise],              1                     \n\t"
+            "andi    %[noise],    %[noise],              0x1ff                 \n\t"
+            "sll     %[temp0],    %[noise],              3                     \n\t"
+            PTR_ADDU "%[ff_table],%[ff_sbr_noise_table], %[temp0]              \n\t"
+            "nmsub.s %[y1],       %[y1],                 %[temp1], %[phi_sign] \n\t"
+            "mfc1    %[temp3],    %[temp1]                                     \n\t"
+            "bne     %[temp3],    $0,                    1f                    \n\t"
+            "lwc1    %[y0],       0(%[Y1])                                     \n\t"
+            "lwc1    %[temp2],    0(%[q_filt1])                                \n\t"
+            "lwc1    %[temp4],    0(%[ff_table])                               \n\t"
+            "lwc1    %[temp5],    4(%[ff_table])                               \n\t"
+            "madd.s  %[y0],       %[y0],                 %[temp2], %[temp4]    \n\t"
+            "madd.s  %[y1],       %[y1],                 %[temp2], %[temp5]    \n\t"
+            "swc1    %[y0],       0(%[Y1])                                     \n\t"
+            "1:                                                                \n\t"
+            "swc1    %[y1],       4(%[Y1])                                     \n\t"
+            /* noise is rewritten by addiu/andi above, so it is "+r" and not an input. */
+            : [ff_table]"=&r"(ff_table), [y0]"=&f"(y0), [y1]"=&f"(y1),
+              [temp0]"=&r"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&r"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+              [noise]"+r"(noise)
+            : [ff_sbr_noise_table]"r"(ff_sbr_noise_table), [Y1]"r"(Y1),
+              [s_m1]"r"(s_m1), [q_filt1]"r"(q_filt1), [phi_sign]"f"(phi_sign)
+            : "memory"
+        );
+        phi_sign = -phi_sign;
+    }
+}
+#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_MIPSFPU */
+#endif /* HAVE_INLINE_ASM */
+
+/* Point the callbacks of *s at the *_mips routines that this build enables. */
+void ff_sbrdsp_init_mips(SBRDSPContext *s)
+{
+#if HAVE_INLINE_ASM
+    /* The QMF shuffles are installed whenever inline asm is available. */
+    s->qmf_post_shuffle  = sbr_qmf_post_shuffle_mips;
+    s->qmf_pre_shuffle   = sbr_qmf_pre_shuffle_mips;
+#if HAVE_MIPSFPU && !HAVE_MIPS32R6 && !HAVE_MIPS64R6
+    /* The rest is installed only with HAVE_MIPSFPU set and neither R6 macro set. */
+    s->hf_apply_noise[3] = sbr_hf_apply_noise_3_mips;
+    s->hf_apply_noise[2] = sbr_hf_apply_noise_2_mips;
+    s->hf_apply_noise[1] = sbr_hf_apply_noise_1_mips;
+    s->hf_apply_noise[0] = sbr_hf_apply_noise_0_mips;
+    s->hf_g_filt         = sbr_hf_g_filt_mips;
+    s->hf_gen            = sbr_hf_gen_mips;
+    s->autocorrelate     = sbr_autocorrelate_mips;
+    s->qmf_deint_bfly    = sbr_qmf_deint_bfly_mips;
+    s->sum_square        = sbr_sum_square_mips;
+    s->sum64x5           = sbr_sum64x5_mips;
+#endif /* HAVE_MIPSFPU && !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/libavcodec/mips/simple_idct_mmi.c b/libavcodec/mips/simple_idct_mmi.c
new file mode 100644
index 0000000..628e13f
--- /dev/null
+++ b/libavcodec/mips/simple_idct_mmi.c
@@ -0,0 +1,816 @@
+/*
+ * Loongson SIMD optimized simple idct
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *                    Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "idctdsp_mips.h"
+#include "constants.h"
+
+#define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, with i = 0
+#define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, with i = 1
+#define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, with i = 2
+#define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, with i = 3
+#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5, with i = 4 (minus, unlike the others)
+#define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, with i = 5
+#define C6 8867  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, with i = 6
+#define C7 4520  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, with i = 7
+// Shift counts; ff_simple_idct_mmi passes the same values (11, 20) to its macros as literals.
+#define ROW_SHIFT 11
+#define COL_SHIFT 20
+
+DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= { /* 4 x int16_t = 8 bytes per row; row byte offsets noted at right */
+    1<<(ROW_SHIFT-1),   0, 1<<(ROW_SHIFT-1),   0, /*   0: presumably the (%2) rounder operand in ff_simple_idct_mmi */
+    1<<(ROW_SHIFT-1),   1, 1<<(ROW_SHIFT-1),   0, /*   8: presumably the 8(%2) rounder; NOTE(review): lone 1 looks deliberate - confirm */
+                  C4,  C4,               C4,  C4, /*  16 */
+                  C4, -C4,               C4, -C4, /*  24 */
+                  C2,  C6,               C2,  C6, /*  32 */
+                  C6, -C2,               C6, -C2, /*  40 */
+                  C1,  C3,               C1,  C3, /*  48 */
+                  C5,  C7,               C5,  C7, /*  56 */
+                  C3, -C7,               C3, -C7, /*  64 */
+                 -C1, -C5,              -C1, -C5, /*  72 */
+                  C5, -C1,               C5, -C1, /*  80 */
+                  C7,  C3,               C7,  C3, /*  88 */
+                  C7, -C5,               C7, -C5, /*  96 */
+                  C3, -C1,               C3, -C1  /* 104 */
+};
+
+void ff_simple_idct_mmi(int16_t *block)
+{
+        DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
+        int16_t * const temp= (int16_t*)align_tmp;
+
+        __asm__ volatile (
+#undef  DC_COND_IDCT
+#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, rarg, shift)      \
+        "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
+        "ldc1 $f2, " #src4 "            \n\t" /* R6     R2      r6      r2 */\
+        "ldc1 $f4, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
+        "ldc1 $f6, " #src5 "            \n\t" /* R7     R5      r7      r5 */\
+        "ldc1 $f8, %3                   \n\t"                                \
+        "and  $f8, $f8, $f0             \n\t"                                \
+        "or $f8, $f8, $f2               \n\t"                                \
+        "or $f8, $f8, $f4               \n\t"                                \
+        "or $f8, $f8, $f6               \n\t"                                \
+        "packsswh $f8, $f8, $f8         \n\t"                                \
+        "li $11, " #shift "             \n\t"                                \
+        "mfc1 $10, $f8                  \n\t"                                \
+        "mtc1 $11, $f18                 \n\t"                                \
+        "beqz $10, 1f                   \n\t"                                \
+        "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
+        "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
+        "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "ldc1 $f10, 32(%2)              \n\t" /* C6     C2      C6      C2 */\
+        "pmaddhw $f10, $f10, $f2        \n\t" /* C6R6+C2R2      C6r6+C2r2  */\
+        "ldc1 $f12, 40(%2)              \n\t" /* -C2    C6      -C2     C6 */\
+        "pmaddhw $f2, $f2, $f12         \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
+        "ldc1 $f14, 48(%2)              \n\t" /* C3     C1      C3      C1 */\
+        "ldc1 $f16, " #rarg "           \n\t"                                \
+        "pmaddhw $f14, $f14, $f4        \n\t" /* C3R3+C1R1      C3r3+C1r1  */\
+        #rounder " $f8, $f8, $f16       \n\t"                                \
+        "mov.d $f12, $f8                \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "paddw $f8, $f8, $f10           \n\t" /* A0             a0         */\
+        "psubw $f12, $f12, $f10         \n\t" /* A3             a3         */\
+        "ldc1 $f10, 56(%2)              \n\t" /* C7     C5      C7      C5 */\
+        "ldc1 $f16, " #rarg "           \n\t"                                \
+        "pmaddhw $f10, $f10, $f6        \n\t" /* C7R7+C5R5      C7r7+C5r5  */\
+        #rounder " $f0, $f0, $f16       \n\t"                                \
+        "paddw $f2, $f2, $f0            \n\t" /* A1             a1         */\
+        "ldc1 $f16, 64(%2)              \n\t"                                \
+        "paddw $f0, $f0, $f0            \n\t"                                \
+        "psubw $f0, $f0, $f2            \n\t" /* A2             a2         */\
+        "pmaddhw $f4, $f4, $f16         \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
+        "paddw $f14, $f14, $f10         \n\t" /* B0             b0         */\
+        "ldc1 $f10, 72(%2)              \n\t" /* -C5    -C1     -C5    -C1 */\
+        "pmaddhw $f10, $f10, $f6        \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
+        "paddw $f14, $f14, $f8          \n\t" /* A0+B0          a0+b0      */\
+        "paddw $f8, $f8, $f8            \n\t" /* 2A0            2a0        */\
+        "psubw $f8, $f8, $f14           \n\t" /* A0-B0          a0-b0      */\
+        "paddw $f10, $f10, $f4          \n\t" /* B1             b1         */\
+        "psraw $f14, $f14, $f18         \n\t"                                \
+        "psraw $f8, $f8, $f18           \n\t"                                \
+        "mov.d $f4, $f2                 \n\t" /* A1             a1         */\
+        "paddw $f2, $f2, $f10           \n\t" /* A1+B1          a1+b1      */\
+        "psubw $f4, $f4, $f10           \n\t" /* A1-B1          a1-b1      */\
+        "psraw $f2, $f2, $f18           \n\t"                                \
+        "psraw $f4, $f4, $f18           \n\t"                                \
+        "packsswh $f14, $f14, $f2       \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0   */\
+        "packsswh $f4, $f4, $f8         \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1   */\
+        "sdc1 $f14, " #dst "            \n\t"                                \
+        "ldc1 $f2, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
+        "ldc1 $f8, 80(%2)               \n\t" /* -C1    C5      -C1     C5 */\
+        "sdc1 $f4, 24+" #dst "          \n\t"                                \
+        "pmaddhw $f8, $f8, $f2          \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
+        "ldc1 $f16, 96(%2)              \n\t"                                \
+        "ldc1 $f14, 88(%2)              \n\t" /* C3     C7      C3      C7 */\
+        "pmaddhw $f2, $f2, $f16         \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
+        "pmaddhw $f14, $f14, $f6        \n\t" /* C3R7+C7R5      C3r7+C7r5  */\
+        "ldc1 $f16, 104(%2)             \n\t"                                \
+        "mov.d $f4, $f0                 \n\t" /* A2             a2         */\
+        "pmaddhw $f6, $f6, $f16         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
+        "paddw $f8, $f8, $f14           \n\t" /* B2             b2         */\
+        "paddw $f4, $f4, $f8            \n\t" /* A2+B2          a2+b2      */\
+        "psubw $f0, $f0, $f8            \n\t" /* a2-B2          a2-b2      */\
+        "psraw $f4, $f4, $f18           \n\t"                                \
+        "psraw $f0, $f0, $f18           \n\t"                                \
+        "mov.d $f8, $f12                \n\t" /* A3             a3         */\
+        "paddw $f6, $f6, $f2            \n\t" /* B3             b3         */\
+        "paddw $f12, $f12, $f6          \n\t" /* A3+B3          a3+b3      */\
+        "psubw $f8, $f8, $f6            \n\t" /* a3-B3          a3-b3      */\
+        "psraw $f12, $f12, $f18         \n\t"                                \
+        "packsswh $f4, $f4, $f12        \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2   */\
+        "sdc1 $f4, 8+" #dst "           \n\t"                                \
+        "psraw $f8, $f8, $f18           \n\t"                                \
+        "packsswh $f8, $f8, $f0         \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3   */\
+        "sdc1 $f8, 16+" #dst "          \n\t"                                \
+        "b 2f                           \n\t"                                \
+        "1:                             \n\t"                                \
+        "li $10, 16                     \n\t"                                \
+        "mtc1 $10, $f16                 \n\t"                                \
+        "psllw $f0, $f0, $f16           \n\t"                                \
+        "ldc1 $f16, %4                  \n\t"                                \
+        "paddw $f0, $f0, $f16           \n\t"                                \
+        "li $10, 13                     \n\t"                                \
+        "mtc1 $10, $f16                 \n\t"                                \
+        "psraw $f0, $f0, $f16           \n\t"                                \
+        "packsswh $f0, $f0, $f0         \n\t"                                \
+        "sdc1 $f0, " #dst "             \n\t"                                \
+        "sdc1 $f0, 8+" #dst "           \n\t"                                \
+        "sdc1 $f0, 16+" #dst "          \n\t"                                \
+        "sdc1 $f0, 24+" #dst "          \n\t"                                \
+        "2:                             \n\t"
+
+#undef  Z_COND_IDCT
+#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, rarg, shift, bt)   \
+        "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
+        "ldc1 $f2, " #src4 "            \n\t" /* R6     R2      r6      r2 */\
+        "ldc1 $f4, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
+        "ldc1 $f6, " #src5 "            \n\t" /* R7     R5      r7      r5 */\
+        "mov.d $f8, $f0                 \n\t"                                \
+        "or $f8, $f8, $f2               \n\t"                                \
+        "or $f8, $f8, $f4               \n\t"                                \
+        "or $f8, $f8, $f6               \n\t"                                \
+        "packsswh $f8, $f8, $f8         \n\t"                                \
+        "mfc1 $10, $f8                  \n\t"                                \
+        "beqz $10, " #bt "              \n\t"                                \
+        "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
+        "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
+        "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "ldc1 $f10, 32(%2)              \n\t" /* C6     C2      C6      C2 */\
+        "pmaddhw $f10, $f10, $f2        \n\t" /* C6R6+C2R2      C6r6+C2r2  */\
+        "ldc1 $f12, 40(%2)              \n\t" /* -C2    C6      -C2     C6 */\
+        "pmaddhw $f2, $f2, $f12         \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
+        "ldc1 $f14, 48(%2)              \n\t" /* C3     C1      C3      C1 */\
+        "ldc1 $f16, " #rarg "           \n\t"                                \
+        "pmaddhw $f14, $f14, $f4        \n\t" /* C3R3+C1R1      C3r3+C1r1  */\
+        #rounder " $f8, $f8, $f16       \n\t"                                \
+        "mov.d $f12, $f8                \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "paddw $f8, $f8, $f10           \n\t" /* A0             a0         */\
+        "psubw $f12, $f12, $f10         \n\t" /* A3             a3         */\
+        "ldc1 $f10, 56(%2)              \n\t" /* C7     C5      C7      C5 */\
+        "ldc1 $f16, " #rarg "           \n\t"                                \
+        "pmaddhw $f10, $f10, $f6        \n\t" /* C7R7+C5R5      C7r7+C5r5  */\
+        #rounder " $f0, $f0, $f16       \n\t"                                \
+        "paddw $f2, $f2, $f0            \n\t" /* A1             a1         */\
+        "paddw $f0, $f0, $f0            \n\t"                                \
+        "ldc1 $f16, 64(%2)              \n\t"                                \
+        "psubw $f0, $f0, $f2            \n\t" /* A2             a2         */\
+        "pmaddhw $f4, $f4, $f16         \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
+        "paddw $f14, $f14, $f10         \n\t" /* B0             b0         */\
+        "ldc1 $f10, 72(%2)              \n\t" /* -C5    -C1     -C5    -C1 */\
+        "pmaddhw $f10, $f10, $f6        \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
+        "paddw $f14, $f14, $f8          \n\t" /* A0+B0          a0+b0      */\
+        "paddw $f8, $f8, $f8            \n\t" /* 2A0            2a0        */\
+        "li $10, " #shift "             \n\t"                                \
+        "psubw $f8, $f8, $f14           \n\t" /* A0-B0          a0-b0      */\
+        "mtc1 $10, $f18                 \n\t"                                \
+        "paddw $f10, $f10, $f4          \n\t" /* B1             b1         */\
+        "psraw $f14, $f14, $f18         \n\t"                                \
+        "psraw $f8, $f8, $f18           \n\t"                                \
+        "mov.d $f4, $f2                 \n\t" /* A1             a1         */\
+        "paddw $f2, $f2, $f10           \n\t" /* A1+B1          a1+b1      */\
+        "psubw $f4, $f4, $f10           \n\t" /* A1-B1          a1-b1      */\
+        "psraw $f2, $f2, $f18           \n\t"                                \
+        "psraw $f4, $f4, $f18           \n\t"                                \
+        "packsswh $f14, $f14, $f2       \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0   */\
+        "packsswh $f4, $f4, $f8         \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1   */\
+        "sdc1 $f14, " #dst "            \n\t"                                \
+        "ldc1 $f2, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
+        "ldc1 $f8, 80(%2)               \n\t" /* -C1    C5      -C1     C5 */\
+        "sdc1 $f4, 24+" #dst "          \n\t"                                \
+        "pmaddhw $f8, $f8, $f2          \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
+        "ldc1 $f16, 96(%2)              \n\t"                                \
+        "ldc1 $f14, 88(%2)              \n\t" /* C3     C7      C3      C7 */\
+        "pmaddhw $f2, $f2, $f16         \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
+        "pmaddhw $f14, $f14, $f6        \n\t" /* C3R7+C7R5      C3r7+C7r5  */\
+        "ldc1 $f16, 104(%2)             \n\t"                                \
+        "mov.d $f4, $f0                 \n\t" /* A2             a2         */\
+        "pmaddhw $f6, $f6, $f16         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
+        "paddw $f8, $f8, $f14           \n\t" /* B2             b2         */\
+        "paddw $f4, $f4, $f8            \n\t" /* A2+B2          a2+b2      */\
+        "psubw $f0, $f0, $f8            \n\t" /* a2-B2          a2-b2      */\
+        "psraw $f4, $f4, $f18           \n\t"                                \
+        "psraw $f0, $f0, $f18           \n\t"                                \
+        "mov.d $f8, $f12                \n\t" /* A3             a3         */\
+        "paddw $f6, $f6, $f2            \n\t" /* B3             b3         */\
+        "paddw $f12, $f12, $f6          \n\t" /* A3+B3          a3+b3      */\
+        "psubw $f8, $f8, $f6            \n\t" /* a3-B3          a3-b3      */\
+        "psraw $f12, $f12, $f18         \n\t"                                \
+        "packsswh $f4, $f4, $f12        \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2   */\
+        "sdc1 $f4, 8+" #dst "           \n\t"                                \
+        "psraw $f8, $f8, $f18           \n\t"                                \
+        "packsswh $f8, $f8, $f0         \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3   */\
+        "sdc1 $f8, 16+" #dst "          \n\t"                                \
+
+        //IDCT(       src0,   src4,   src1,   src5,    dst,     rounder, shift)
+        DC_COND_IDCT(0(%0),  8(%0), 16(%0), 24(%0),  0(%1), paddw,8(%2), 11)
+        Z_COND_IDCT(32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddw,(%2), 11, 4f)
+        Z_COND_IDCT(64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddw,(%2), 11, 2f)
+        Z_COND_IDCT(96(%0),104(%0),112(%0),120(%0), 96(%1), paddw,(%2), 11, 1f)
+
+#undef  IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift)                             \
+        "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
+        "ldc1 $f2, " #src4 "            \n\t" /* R6     R2      r6      r2 */\
+        "ldc1 $f4, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
+        "ldc1 $f6, " #src5 "            \n\t" /* R7     R5      r7      r5 */\
+        "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
+        "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
+        "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "ldc1 $f10, 32(%2)              \n\t" /* C6     C2      C6      C2 */\
+        "pmaddhw $f10, $f10, $f2        \n\t" /* C6R6+C2R2      C6r6+C2r2  */\
+        "ldc1 $f12, 40(%2)              \n\t" /* -C2    C6      -C2     C6 */\
+        "pmaddhw $f2, $f2, $f12         \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
+        "mov.d $f12, $f8                \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f14, 48(%2)              \n\t" /* C3     C1      C3      C1 */\
+        "pmaddhw $f14, $f14, $f4        \n\t" /* C3R3+C1R1      C3r3+C1r1  */\
+        "paddw $f8, $f8, $f10           \n\t" /* A0             a0         */\
+        "psubw $f12, $f12, $f10         \n\t" /* A3             a3         */\
+        "mov.d $f10, $f0                \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "paddw $f0, $f0, $f2            \n\t" /* A1             a1         */\
+        "psubw $f10, $f10, $f2          \n\t" /* A2             a2         */\
+        "ldc1 $f2, 56(%2)               \n\t" /* C7     C5      C7      C5 */\
+        "ldc1 $f16, 64(%2)              \n\t"                                \
+        "pmaddhw $f2, $f2, $f6          \n\t" /* C7R7+C5R5      C7r7+C5r5  */\
+        "pmaddhw $f4, $f4, $f16         \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
+        "li $10, " #shift "             \n\t"                                \
+        "paddw $f14, $f14, $f2          \n\t" /* B0             b0         */\
+        "ldc1 $f2, 72(%2)               \n\t" /* -C5    -C1     -C5    -C1 */\
+        "mtc1 $10, $f18                 \n\t"                                \
+        "pmaddhw $f2, $f2, $f6          \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
+        "paddw $f14, $f14, $f8          \n\t" /* A0+B0          a0+b0      */\
+        "paddw $f8, $f8, $f8            \n\t" /* 2A0            2a0        */\
+        "psubw $f8, $f8, $f14           \n\t" /* A0-B0          a0-b0      */\
+        "paddw $f2, $f2, $f4            \n\t" /* B1             b1         */\
+        "psraw $f14, $f14, $f18         \n\t"                                \
+        "psraw $f8, $f8, $f18           \n\t"                                \
+        "mov.d $f4, $f0                 \n\t" /* A1             a1         */\
+        "paddw $f0, $f0, $f2            \n\t" /* A1+B1          a1+b1      */\
+        "psubw $f4, $f4, $f2            \n\t" /* A1-B1          a1-b1      */\
+        "psraw $f0, $f0, $f18           \n\t"                                \
+        "psraw $f4, $f4, $f18           \n\t"                                \
+        "packsswh $f14, $f14, $f14      \n\t" /* A0+B0          a0+b0      */\
+        "swc1 $f14, " #dst "            \n\t"                                \
+        "packsswh $f0, $f0, $f0         \n\t" /* A1+B1          a1+b1      */\
+        "swc1 $f0, 16+" #dst "          \n\t"                                \
+        "packsswh $f4, $f4, $f4         \n\t" /* A1-B1          a1-b1      */\
+        "swc1 $f4, 96+" #dst "          \n\t"                                \
+        "packsswh $f8, $f8, $f8         \n\t" /* A0-B0          a0-b0      */\
+        "swc1 $f8, 112+" #dst "         \n\t"                                \
+        "ldc1 $f0, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
+        "ldc1 $f8, 80(%2)               \n\t" /* -C1    C5      -C1     C5 */\
+        "pmaddhw $f8, $f8, $f0          \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
+        "ldc1 $f16, 96(%2)              \n\t"                                \
+        "ldc1 $f14, 88(%2)              \n\t" /* C3     C7      C3      C7 */\
+        "pmaddhw $f0, $f0, $f16         \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
+        "pmaddhw $f14, $f14, $f6        \n\t" /* C3R7+C7R5      C3r7+C7r5  */\
+        "ldc1 $f16, 104(%2)             \n\t"                                \
+        "mov.d $f4, $f10                \n\t" /* A2             a2         */\
+        "pmaddhw $f6, $f6, $f16         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
+        "paddw $f8, $f8, $f14           \n\t" /* B2             b2         */\
+        "paddw $f4, $f4, $f8            \n\t" /* A2+B2          a2+b2      */\
+        "psubw $f10, $f10, $f8          \n\t" /* a2-B2          a2-b2      */\
+        "psraw $f4, $f4, $f18           \n\t"                                \
+        "psraw $f10, $f10, $f18         \n\t"                                \
+        "mov.d $f8, $f12                \n\t" /* A3             a3         */\
+        "paddw $f6, $f6, $f0            \n\t" /* B3             b3         */\
+        "paddw $f12, $f12, $f6          \n\t" /* A3+B3          a3+b3      */\
+        "psubw $f8, $f8, $f6            \n\t" /* a3-B3          a3-b3      */\
+        "psraw $f12, $f12, $f18         \n\t"                                \
+        "psraw $f8, $f8, $f18           \n\t"                                \
+        "packsswh $f4, $f4, $f4         \n\t" /* A2+B2          a2+b2      */\
+        "packsswh $f12, $f12, $f12      \n\t" /* A3+B3          a3+b3      */\
+        "swc1 $f4, 32+" #dst "          \n\t"                                \
+        "packsswh $f8, $f8, $f8         \n\t" /* A3-B3          a3-b3      */\
+        "packsswh $f10, $f10, $f10      \n\t" /* A2-B2          a2-b2      */\
+        "swc1 $f12, 48+" #dst "         \n\t"                                \
+        "swc1 $f8, 64+" #dst "          \n\t"                                \
+        "swc1 $f10, 80+" #dst "         \n\t"
+
+        //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
+        IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),    20)
+        IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),    20)
+        IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),    20)
+        IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),    20)
+        "b 9f                           \n\t"
+
+        "# .p2align 4                   \n\t"
+        "4:                             \n\t"
+        Z_COND_IDCT(64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddw,(%2), 11, 6f)
+        Z_COND_IDCT(96(%0),104(%0),112(%0),120(%0), 96(%1),paddw,(%2), 11, 5f)
+
+#undef  IDCT /* label 4 variant: loads src0, src4 and src5; src1 is unused */
+#define IDCT(src0, src4, src1, src5, dst, shift) /* %2 = coeffs operand    */\
+        "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
+        "ldc1 $f2, " #src4 "            \n\t" /* R6     R2      r6      r2 */\
+        "ldc1 $f6, " #src5 "            \n\t" /* R7     R5      r7      r5 */\
+        "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
+        "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
+        "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "ldc1 $f10, 32(%2)              \n\t" /* C6     C2      C6      C2 */\
+        "pmaddhw $f10, $f10, $f2        \n\t" /* C6R6+C2R2      C6r6+C2r2  */\
+        "ldc1 $f12, 40(%2)              \n\t" /* -C2    C6      -C2     C6 */\
+        "pmaddhw $f2, $f2, $f12         \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
+        "mov.d $f12, $f8                \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "paddw $f8, $f8, $f10           \n\t" /* A0             a0         */\
+        "psubw $f12, $f12, $f10         \n\t" /* A3             a3         */\
+        "mov.d $f10, $f0                \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "paddw $f0, $f0, $f2            \n\t" /* A1             a1         */\
+        "psubw $f10, $f10, $f2          \n\t" /* A2             a2         */\
+        "ldc1 $f2, 56(%2)               \n\t" /* C7     C5      C7      C5 */\
+        "li $10, " #shift "             \n\t" /* $10 = shift amount        */\
+        "pmaddhw $f2, $f2, $f6          \n\t" /* C7R7+C5R5      C7r7+C5r5  */\
+        "ldc1 $f14, 72(%2)              \n\t" /* -C5    -C1     -C5    -C1 */\
+        "mtc1 $10, $f18                 \n\t" /* $f18 = shift amount       */\
+        "pmaddhw $f14, $f14, $f6        \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
+        "paddw $f2, $f2, $f8            \n\t" /* A0+B0          a0+b0      */\
+        "paddw $f8, $f8, $f8            \n\t" /* 2A0            2a0        */\
+        "psubw $f8, $f8, $f2            \n\t" /* A0-B0          a0-b0      */\
+        "psraw $f2, $f2, $f18           \n\t" /* (A0+B0) >> shift          */\
+        "psraw $f8, $f8, $f18           \n\t" /* (A0-B0) >> shift          */\
+        "mov.d $f4, $f0                 \n\t" /* A1             a1         */\
+        "paddw $f0, $f0, $f14           \n\t" /* A1+B1          a1+b1      */\
+        "psubw $f4, $f4, $f14           \n\t" /* A1-B1          a1-b1      */\
+        "psraw $f0, $f0, $f18           \n\t" /* (A1+B1) >> shift          */\
+        "psraw $f4, $f4, $f18           \n\t" /* (A1-B1) >> shift          */\
+        "packsswh $f2, $f2, $f2         \n\t" /* A0+B0          a0+b0      */\
+        "swc1 $f2, " #dst "             \n\t"                                \
+        "packsswh $f0, $f0, $f0         \n\t" /* A1+B1          a1+b1      */\
+        "swc1 $f0, 16+" #dst "          \n\t"                                \
+        "packsswh $f4, $f4, $f4         \n\t" /* A1-B1          a1-b1      */\
+        "swc1 $f4, 96+" #dst "          \n\t"                                \
+        "packsswh $f8, $f8, $f8         \n\t" /* A0-B0          a0-b0      */\
+        "swc1 $f8, 112+" #dst "         \n\t"                                \
+        "ldc1 $f2, 88(%2)               \n\t" /* C3     C7      C3      C7 */\
+        "ldc1 $f16, 104(%2)             \n\t" /* -C1    C3      -C1     C3 */\
+        "pmaddhw $f2, $f2, $f6          \n\t" /* C3R7+C7R5      C3r7+C7r5  */\
+        "mov.d $f4, $f10                \n\t" /* A2             a2         */\
+        "pmaddhw $f6, $f6, $f16         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
+        "paddw $f4, $f4, $f2            \n\t" /* A2+B2          a2+b2      */\
+        "psubw $f10, $f10, $f2          \n\t" /* A2-B2          a2-b2      */\
+        "psraw $f4, $f4, $f18           \n\t" /* (A2+B2) >> shift          */\
+        "psraw $f10, $f10, $f18         \n\t" /* (A2-B2) >> shift          */\
+        "mov.d $f2, $f12                \n\t" /* A3             a3         */\
+        "paddw $f12, $f12, $f6          \n\t" /* A3+B3          a3+b3      */\
+        "psubw $f2, $f2, $f6            \n\t" /* A3-B3          a3-b3      */\
+        "psraw $f12, $f12, $f18         \n\t" /* (A3+B3) >> shift          */\
+        "psraw $f2, $f2, $f18           \n\t" /* (A3-B3) >> shift          */\
+        "packsswh $f4, $f4, $f4         \n\t" /* A2+B2          a2+b2      */\
+        "packsswh $f12, $f12, $f12      \n\t" /* A3+B3          a3+b3      */\
+        "swc1 $f4, 32+" #dst "          \n\t"                                \
+        "packsswh $f2, $f2, $f2         \n\t" /* A3-B3          a3-b3      */\
+        "packsswh $f10, $f10, $f10      \n\t" /* A2-B2          a2-b2      */\
+        "swc1 $f12, 48+" #dst "         \n\t"                                \
+        "swc1 $f2, 64+" #dst "          \n\t"                                \
+        "swc1 $f10, 80+" #dst "         \n\t"
+
+        //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
+        IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),    20)
+        IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),    20)
+        IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),    20)
+        IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),    20)
+        "b 9f                           \n\t"
+
+        "# .p2align 4                   \n\t"
+        "6:                             \n\t"
+        Z_COND_IDCT(96(%0),104(%0),112(%0),120(%0), 96(%1),paddw,(%2), 11, 7f)
+
+#undef  IDCT /* label 6 variant: loads only src0 and src5; src4, src1 unused */
+#define IDCT(src0, src4, src1, src5, dst, shift) /* %2 = coeffs operand    */\
+        "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
+        "ldc1 $f6, " #src5 "            \n\t" /* R7     R5      r7      r5 */\
+        "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
+        "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
+        "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "mov.d $f12, $f8                \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "mov.d $f10, $f0                \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "ldc1 $f2, 56(%2)               \n\t" /* C7     C5      C7      C5 */\
+        "pmaddhw $f2, $f2, $f6          \n\t" /* C7R7+C5R5      C7r7+C5r5  */\
+        "ldc1 $f14, 72(%2)              \n\t" /* -C5    -C1     -C5    -C1 */\
+        "li $10, " #shift "             \n\t" /* $10 = shift amount        */\
+        "pmaddhw $f14, $f14, $f6        \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
+        "paddw $f2, $f2, $f8            \n\t" /* A0+B0          a0+b0      */\
+        "mtc1 $10, $f18                 \n\t" /* $f18 = shift amount       */\
+        "paddw $f8, $f8, $f8            \n\t" /* 2A0            2a0        */\
+        "psubw $f8, $f8, $f2            \n\t" /* A0-B0          a0-b0      */\
+        "psraw $f2, $f2, $f18           \n\t" /* (A0+B0) >> shift          */\
+        "psraw $f8, $f8, $f18           \n\t" /* (A0-B0) >> shift          */\
+        "mov.d $f4, $f0                 \n\t" /* A1             a1         */\
+        "paddw $f0, $f0, $f14           \n\t" /* A1+B1          a1+b1      */\
+        "psubw $f4, $f4, $f14           \n\t" /* A1-B1          a1-b1      */\
+        "psraw $f0, $f0, $f18           \n\t" /* (A1+B1) >> shift          */\
+        "psraw $f4, $f4, $f18           \n\t" /* (A1-B1) >> shift          */\
+        "packsswh $f2, $f2, $f2         \n\t" /* A0+B0          a0+b0      */\
+        "swc1 $f2, " #dst "             \n\t"                                \
+        "packsswh $f0, $f0, $f0         \n\t" /* A1+B1          a1+b1      */\
+        "swc1 $f0, 16+" #dst "          \n\t"                                \
+        "packsswh $f4, $f4, $f4         \n\t" /* A1-B1          a1-b1      */\
+        "swc1 $f4, 96+" #dst "          \n\t"                                \
+        "packsswh $f8, $f8, $f8         \n\t" /* A0-B0          a0-b0      */\
+        "swc1 $f8, 112+" #dst "         \n\t"                                \
+        "ldc1 $f2, 88(%2)               \n\t" /* C3     C7      C3      C7 */\
+        "ldc1 $f16, 104(%2)             \n\t" /* -C1    C3      -C1     C3 */\
+        "pmaddhw $f2, $f2, $f6          \n\t" /* C3R7+C7R5      C3r7+C7r5  */\
+        "mov.d $f4, $f10                \n\t" /* A2             a2         */\
+        "pmaddhw $f6, $f6, $f16         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
+        "paddw $f4, $f4, $f2            \n\t" /* A2+B2          a2+b2      */\
+        "psubw $f10, $f10, $f2          \n\t" /* A2-B2          a2-b2      */\
+        "psraw $f4, $f4, $f18           \n\t" /* (A2+B2) >> shift          */\
+        "psraw $f10, $f10, $f18         \n\t" /* (A2-B2) >> shift          */\
+        "mov.d $f2, $f12                \n\t" /* A3             a3         */\
+        "paddw $f12, $f12, $f6          \n\t" /* A3+B3          a3+b3      */\
+        "psubw $f2, $f2, $f6            \n\t" /* A3-B3          a3-b3      */\
+        "psraw $f12, $f12, $f18         \n\t" /* (A3+B3) >> shift          */\
+        "psraw $f2, $f2, $f18           \n\t" /* (A3-B3) >> shift          */\
+        "packsswh $f4, $f4, $f4         \n\t" /* A2+B2          a2+b2      */\
+        "packsswh $f12, $f12, $f12      \n\t" /* A3+B3          a3+b3      */\
+        "swc1 $f4, 32+" #dst "          \n\t"                                \
+        "packsswh $f2, $f2, $f2         \n\t" /* A3-B3          a3-b3      */\
+        "packsswh $f10, $f10, $f10      \n\t" /* A2-B2          a2-b2      */\
+        "swc1 $f12, 48+" #dst "         \n\t"                                \
+        "swc1 $f2, 64+" #dst "          \n\t"                                \
+        "swc1 $f10, 80+" #dst "         \n\t"
+
+        //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
+        IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),    20)
+        IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),    20)
+        IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),    20)
+        IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),    20)
+        "b 9f                           \n\t"
+
+        "# .p2align 4                   \n\t"
+        "2:                             \n\t"
+        Z_COND_IDCT(96(%0),104(%0),112(%0),120(%0), 96(%1),paddw,(%2), 11, 3f)
+
+#undef  IDCT /* label 2 variant: loads src0, src1 and src5; src4 is unused */
+#define IDCT(src0, src4, src1, src5, dst, shift) /* %2 = coeffs operand    */\
+        "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
+        "ldc1 $f4, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
+        "ldc1 $f6, " #src5 "            \n\t" /* R7     R5      r7      r5 */\
+        "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
+        "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
+        "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "mov.d $f12, $f8                \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f14, 48(%2)              \n\t" /* C3     C1      C3      C1 */\
+        "pmaddhw $f14, $f14, $f4        \n\t" /* C3R3+C1R1      C3r3+C1r1  */\
+        "mov.d $f10, $f0                \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "ldc1 $f2, 56(%2)               \n\t" /* C7     C5      C7      C5 */\
+        "pmaddhw $f2, $f2, $f6          \n\t" /* C7R7+C5R5      C7r7+C5r5  */\
+        "ldc1 $f16, 64(%2)              \n\t" /* -C7    C3      -C7     C3 */\
+        "pmaddhw $f4, $f4, $f16         \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
+        "paddw $f14, $f14, $f2          \n\t" /* B0             b0         */\
+        "ldc1 $f2, 72(%2)               \n\t" /* -C5    -C1     -C5    -C1 */\
+        "li $10, " #shift "             \n\t" /* $10 = shift amount        */\
+        "pmaddhw $f2, $f2, $f6          \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
+        "paddw $f14, $f14, $f8          \n\t" /* A0+B0          a0+b0      */\
+        "mtc1 $10, $f18                 \n\t" /* $f18 = shift amount       */\
+        "paddw $f8, $f8, $f8            \n\t" /* 2A0            2a0        */\
+        "psubw $f8, $f8, $f14           \n\t" /* A0-B0          a0-b0      */\
+        "paddw $f2, $f2, $f4            \n\t" /* B1             b1         */\
+        "psraw $f14, $f14, $f18         \n\t" /* (A0+B0) >> shift          */\
+        "psraw $f8, $f8, $f18           \n\t" /* (A0-B0) >> shift          */\
+        "mov.d $f4, $f0                 \n\t" /* A1             a1         */\
+        "paddw $f0, $f0, $f2            \n\t" /* A1+B1          a1+b1      */\
+        "psubw $f4, $f4, $f2            \n\t" /* A1-B1          a1-b1      */\
+        "psraw $f0, $f0, $f18           \n\t" /* (A1+B1) >> shift          */\
+        "psraw $f4, $f4, $f18           \n\t" /* (A1-B1) >> shift          */\
+        "packsswh $f14, $f14, $f14      \n\t" /* A0+B0          a0+b0      */\
+        "swc1 $f14, " #dst "            \n\t"                                \
+        "packsswh $f0, $f0, $f0         \n\t" /* A1+B1          a1+b1      */\
+        "swc1 $f0, 16+" #dst "          \n\t"                                \
+        "packsswh $f4, $f4, $f4         \n\t" /* A1-B1          a1-b1      */\
+        "swc1 $f4, 96+" #dst "          \n\t"                                \
+        "packsswh $f8, $f8, $f8         \n\t" /* A0-B0          a0-b0      */\
+        "swc1 $f8, 112+" #dst "         \n\t"                                \
+        "ldc1 $f0, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
+        "ldc1 $f8, 80(%2)               \n\t" /* -C1    C5      -C1     C5 */\
+        "pmaddhw $f8, $f8, $f0          \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
+        "ldc1 $f14, 88(%2)              \n\t" /* C3     C7      C3      C7 */\
+        "ldc1 $f16, 96(%2)              \n\t" /* -C5    C7      -C5     C7 */\
+        "pmaddhw $f0, $f0, $f16         \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
+        "pmaddhw $f14, $f14, $f6        \n\t" /* C3R7+C7R5      C3r7+C7r5  */\
+        "mov.d $f4, $f10                \n\t" /* A2             a2         */\
+        "ldc1 $f16, 104(%2)             \n\t" /* -C1    C3      -C1     C3 */\
+        "pmaddhw $f6, $f6, $f16         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
+        "paddw $f8, $f8, $f14           \n\t" /* B2             b2         */\
+        "paddw $f4, $f4, $f8            \n\t" /* A2+B2          a2+b2      */\
+        "psubw $f10, $f10, $f8          \n\t" /* A2-B2          a2-b2      */\
+        "psraw $f4, $f4, $f18           \n\t" /* (A2+B2) >> shift          */\
+        "psraw $f10, $f10, $f18         \n\t" /* (A2-B2) >> shift          */\
+        "mov.d $f8, $f12                \n\t" /* A3             a3         */\
+        "paddw $f6, $f6, $f0            \n\t" /* B3             b3         */\
+        "paddw $f12, $f12, $f6          \n\t" /* A3+B3          a3+b3      */\
+        "psubw $f8, $f8, $f6            \n\t" /* A3-B3          a3-b3      */\
+        "psraw $f12, $f12, $f18         \n\t" /* (A3+B3) >> shift          */\
+        "psraw $f8, $f8, $f18           \n\t" /* (A3-B3) >> shift          */\
+        "packsswh $f4, $f4, $f4         \n\t" /* A2+B2          a2+b2      */\
+        "packsswh $f12, $f12, $f12      \n\t" /* A3+B3          a3+b3      */\
+        "swc1 $f4, 32+" #dst "          \n\t"                                \
+        "packsswh $f8, $f8, $f8         \n\t" /* A3-B3          a3-b3      */\
+        "packsswh $f10, $f10, $f10      \n\t" /* A2-B2          a2-b2      */\
+        "swc1 $f12, 48+" #dst "         \n\t"                                \
+        "swc1 $f8, 64+" #dst "          \n\t"                                \
+        "swc1 $f10, 80+" #dst "         \n\t"
+
+        //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
+        IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),    20)
+        IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),    20)
+        IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),    20)
+        IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),    20)
+        "b 9f                           \n\t"
+
+        "# .p2align 4                   \n\t"
+        "3:                             \n\t"
+
+#undef  IDCT /* label 3 variant: loads only src0 and src1; src4, src5 unused */
+#define IDCT(src0, src4, src1, src5, dst, shift) /* %2 = coeffs operand    */\
+        "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
+        "ldc1 $f4, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
+        "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
+        "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
+        "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "mov.d $f12, $f8                \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f14, 48(%2)              \n\t" /* C3     C1      C3      C1 */\
+        "pmaddhw $f14, $f14, $f4        \n\t" /* C3R3+C1R1      C3r3+C1r1  */\
+        "mov.d $f10, $f0                \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "ldc1 $f6, 64(%2)               \n\t" /* -C7    C3      -C7     C3 */\
+        "pmaddhw $f6, $f6, $f4          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
+        "li $10, " #shift "             \n\t" /* $10 = shift amount        */\
+        "paddw $f14, $f14, $f8          \n\t" /* A0+B0          a0+b0      */\
+        "mtc1 $10, $f18                 \n\t" /* $f18 = shift amount       */\
+        "paddw $f8, $f8, $f8            \n\t" /* 2A0            2a0        */\
+        "psubw $f8, $f8, $f14           \n\t" /* A0-B0          a0-b0      */\
+        "psraw $f14, $f14, $f18         \n\t" /* (A0+B0) >> shift          */\
+        "psraw $f8, $f8, $f18           \n\t" /* (A0-B0) >> shift          */\
+        "mov.d $f2, $f0                 \n\t" /* A1             a1         */\
+        "paddw $f0, $f0, $f6            \n\t" /* A1+B1          a1+b1      */\
+        "psubw $f2, $f2, $f6            \n\t" /* A1-B1          a1-b1      */\
+        "psraw $f0, $f0, $f18           \n\t" /* (A1+B1) >> shift          */\
+        "psraw $f2, $f2, $f18           \n\t" /* (A1-B1) >> shift          */\
+        "packsswh $f14, $f14, $f14      \n\t" /* A0+B0          a0+b0      */\
+        "swc1 $f14, " #dst "            \n\t"                                \
+        "packsswh $f0, $f0, $f0         \n\t" /* A1+B1          a1+b1      */\
+        "swc1 $f0, 16+" #dst "          \n\t"                                \
+        "packsswh $f2, $f2, $f2         \n\t" /* A1-B1          a1-b1      */\
+        "swc1 $f2, 96+" #dst "          \n\t"                                \
+        "packsswh $f8, $f8, $f8         \n\t" /* A0-B0          a0-b0      */\
+        "swc1 $f8, 112+" #dst "         \n\t"                                \
+        "ldc1 $f8, 80(%2)               \n\t" /* -C1    C5      -C1     C5 */\
+        "ldc1 $f16, 96(%2)              \n\t" /* -C5    C7      -C5     C7 */\
+        "pmaddhw $f8, $f8, $f4          \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
+        "pmaddhw $f4, $f4, $f16         \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
+        "mov.d $f2, $f10                \n\t" /* A2             a2         */\
+        "paddw $f2, $f2, $f8            \n\t" /* A2+B2          a2+b2      */\
+        "psubw $f10, $f10, $f8          \n\t" /* A2-B2          a2-b2      */\
+        "psraw $f2, $f2, $f18           \n\t" /* (A2+B2) >> shift          */\
+        "psraw $f10, $f10, $f18         \n\t" /* (A2-B2) >> shift          */\
+        "mov.d $f8, $f12                \n\t" /* A3             a3         */\
+        "paddw $f12, $f12, $f4          \n\t" /* A3+B3          a3+b3      */\
+        "psubw $f8, $f8, $f4            \n\t" /* A3-B3          a3-b3      */\
+        "psraw $f12, $f12, $f18         \n\t" /* (A3+B3) >> shift          */\
+        "psraw $f8, $f8, $f18           \n\t" /* (A3-B3) >> shift          */\
+        "packsswh $f2, $f2, $f2         \n\t" /* A2+B2          a2+b2      */\
+        "packsswh $f12, $f12, $f12      \n\t" /* A3+B3          a3+b3      */\
+        "swc1 $f2, 32+" #dst "          \n\t"                                \
+        "packsswh $f8, $f8, $f8         \n\t" /* A3-B3          a3-b3      */\
+        "packsswh $f10, $f10, $f10      \n\t" /* A2-B2          a2-b2      */\
+        "swc1 $f12, 48+" #dst "         \n\t"                                \
+        "swc1 $f8, 64+" #dst "          \n\t"                                \
+        "swc1 $f10, 80+" #dst "         \n\t"
+
+        //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
+        IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),    20)
+        IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),    20)
+        IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),    20)
+        IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),    20)
+        "b 9f                           \n\t"
+
+        "# .p2align 4                   \n\t"
+        "5:                             \n\t"
+
+#undef  IDCT /* label 5 variant: only src0, src4 and the 8 bytes after each */
+#define IDCT(src0, src4, src1, src5, dst, shift) /* %2 = coeffs operand    */\
+        "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
+        "ldc1 $f2, " #src4 "            \n\t" /* R6     R2      r6      r2 */\
+        "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
+        "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
+        "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "ldc1 $f10, 32(%2)              \n\t" /* C6     C2      C6      C2 */\
+        "pmaddhw $f10, $f10, $f2        \n\t" /* C6R6+C2R2      C6r6+C2r2  */\
+        "ldc1 $f12, 40(%2)              \n\t" /* -C2    C6      -C2     C6 */\
+        "pmaddhw $f2, $f2, $f12         \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
+        "mov.d $f12, $f8                \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "paddw $f8, $f8, $f10           \n\t" /* A0             a0         */\
+        "psubw $f12, $f12, $f10         \n\t" /* A3             a3         */\
+        "mov.d $f10, $f0                \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "paddw $f0, $f0, $f2            \n\t" /* A1             a1         */\
+        "psubw $f10, $f10, $f2          \n\t" /* A2             a2         */\
+        "ldc1 $f4, 8+" #src0 "          \n\t" /* R4     R0      r4      r0 */\
+        "ldc1 $f6, 8+" #src4 "          \n\t" /* R6     R2      r6      r2 */\
+        "ldc1 $f2, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
+        "pmaddhw $f2, $f2, $f4          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f14, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
+        "pmaddhw $f4, $f4, $f14         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "ldc1 $f14, 32(%2)              \n\t" /* C6     C2      C6      C2 */\
+        "ldc1 $f16, 40(%2)              \n\t" /* -C2    C6      -C2     C6 */\
+        "pmaddhw $f14, $f14, $f6        \n\t" /* C6R6+C2R2      C6r6+C2r2  */\
+        "pmaddhw $f6, $f6, $f16         \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
+        "paddw $f14, $f14, $f2          \n\t" /* A0             a0         */\
+        "paddw $f2, $f2, $f2            \n\t" /* 2C0            2c0        */\
+        "psubw $f2, $f2, $f14           \n\t" /* A3             a3         */\
+        "li $10, " #shift "             \n\t" /* $10 = shift amount        */\
+        "paddw $f6, $f6, $f4            \n\t" /* A1             a1         */\
+        "mtc1 $10, $f18                 \n\t" /* $f18 = shift amount       */\
+        "paddw $f4, $f4, $f4            \n\t" /* 2C1            2c1        */\
+        "psubw $f4, $f4, $f6            \n\t" /* A2             a2         */\
+        "psraw $f8, $f8, $f18           \n\t" /* A0 >> shift               */\
+        "psraw $f14, $f14, $f18         \n\t" /* A0 of 8+src >> shift      */\
+        "psraw $f6, $f6, $f18           \n\t" /* A1 of 8+src >> shift      */\
+        "packsswh $f8, $f8, $f14        \n\t" /* A0             a0         */\
+        "sdc1 $f8, " #dst "             \n\t"                                \
+        "psraw $f0, $f0, $f18           \n\t" /* A1 >> shift               */\
+        "packsswh $f0, $f0, $f6         \n\t" /* A1             a1         */\
+        "sdc1 $f0, 16+" #dst "          \n\t"                                \
+        "sdc1 $f0, 96+" #dst "          \n\t" /* same A1 as 16+dst         */\
+        "sdc1 $f8, 112+" #dst "         \n\t" /* same A0 as dst            */\
+        "psraw $f10, $f10, $f18         \n\t" /* A2 >> shift               */\
+        "psraw $f12, $f12, $f18         \n\t" /* A3 >> shift               */\
+        "psraw $f4, $f4, $f18           \n\t" /* A2 of 8+src >> shift      */\
+        "packsswh $f10, $f10, $f4       \n\t" /* A2             a2         */\
+        "sdc1 $f10, 32+" #dst "         \n\t"                                \
+        "psraw $f2, $f2, $f18           \n\t" /* A3 of 8+src >> shift      */\
+        "packsswh $f12, $f12, $f2       \n\t" /* A3             a3         */\
+        "sdc1 $f12, 48+" #dst "         \n\t"                                \
+        "sdc1 $f12, 64+" #dst "         \n\t" /* same A3 as 48+dst         */\
+        "sdc1 $f10, 80+" #dst "         \n\t"
+
+        //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
+        IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0),    20)
+        IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),    20)
+        "b 9f                           \n\t"
+
+        "# .p2align 4                   \n\t"
+        "1:                             \n\t"
+
+#undef  IDCT /* label 1 variant: loads src0, src4 and src1; src5 is unused */
+#define IDCT(src0, src4, src1, src5, dst, shift) /* %2 = coeffs operand    */\
+        "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
+        "ldc1 $f2, " #src4 "            \n\t" /* R6     R2      r6      r2 */\
+        "ldc1 $f4, " #src1 "            \n\t" /* R3     R1      r3      r1 */\
+        "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
+        "li $10, " #shift "             \n\t" /* $10 = shift amount        */\
+        "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
+        "mtc1 $10, $f18                 \n\t" /* $f18 = shift amount       */\
+        "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "ldc1 $f10, 32(%2)              \n\t" /* C6     C2      C6      C2 */\
+        "pmaddhw $f10, $f10, $f2        \n\t" /* C6R6+C2R2      C6r6+C2r2  */\
+        "ldc1 $f12, 40(%2)              \n\t" /* -C2    C6      -C2     C6 */\
+        "pmaddhw $f2, $f2, $f12         \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
+        "mov.d $f12, $f8                \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f14, 48(%2)              \n\t" /* C3     C1      C3      C1 */\
+        "pmaddhw $f14, $f14, $f4        \n\t" /* C3R3+C1R1      C3r3+C1r1  */\
+        "paddw $f8, $f8, $f10           \n\t" /* A0             a0         */\
+        "psubw $f12, $f12, $f10         \n\t" /* A3             a3         */\
+        "mov.d $f10, $f0                \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "paddw $f0, $f0, $f2            \n\t" /* A1             a1         */\
+        "psubw $f10, $f10, $f2          \n\t" /* A2             a2         */\
+        "ldc1 $f2, 64(%2)               \n\t" /* -C7    C3      -C7     C3 */\
+        "pmaddhw $f2, $f2, $f4          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
+        "paddw $f14, $f14, $f8          \n\t" /* A0+B0          a0+b0      */\
+        "paddw $f8, $f8, $f8            \n\t" /* 2A0            2a0        */\
+        "psubw $f8, $f8, $f14           \n\t" /* A0-B0          a0-b0      */\
+        "psraw $f14, $f14, $f18         \n\t" /* (A0+B0) >> shift          */\
+        "psraw $f8, $f8, $f18           \n\t" /* (A0-B0) >> shift          */\
+        "mov.d $f6, $f0                 \n\t" /* A1             a1         */\
+        "paddw $f0, $f0, $f2            \n\t" /* A1+B1          a1+b1      */\
+        "psubw $f6, $f6, $f2            \n\t" /* A1-B1          a1-b1      */\
+        "psraw $f0, $f0, $f18           \n\t" /* (A1+B1) >> shift          */\
+        "psraw $f6, $f6, $f18           \n\t" /* (A1-B1) >> shift          */\
+        "packsswh $f14, $f14, $f14      \n\t" /* A0+B0          a0+b0      */\
+        "swc1 $f14, " #dst "            \n\t"                                \
+        "packsswh $f0, $f0, $f0         \n\t" /* A1+B1          a1+b1      */\
+        "swc1 $f0, 16+" #dst "          \n\t"                                \
+        "packsswh $f6, $f6, $f6         \n\t" /* A1-B1          a1-b1      */\
+        "swc1 $f6, 96+" #dst "          \n\t"                                \
+        "packsswh $f8, $f8, $f8         \n\t" /* A0-B0          a0-b0      */\
+        "swc1 $f8, 112+" #dst "         \n\t"                                \
+        "ldc1 $f8, 80(%2)               \n\t" /* -C1    C5      -C1     C5 */\
+        "ldc1 $f16, 96(%2)              \n\t" /* -C5    C7      -C5     C7 */\
+        "pmaddhw $f8, $f8, $f4          \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
+        "pmaddhw $f4, $f4, $f16         \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
+        "mov.d $f6, $f10                \n\t" /* A2             a2         */\
+        "paddw $f6, $f6, $f8            \n\t" /* A2+B2          a2+b2      */\
+        "psubw $f10, $f10, $f8          \n\t" /* A2-B2          a2-b2      */\
+        "psraw $f6, $f6, $f18           \n\t" /* (A2+B2) >> shift          */\
+        "psraw $f10, $f10, $f18         \n\t" /* (A2-B2) >> shift          */\
+        "mov.d $f8, $f12                \n\t" /* A3             a3         */\
+        "paddw $f12, $f12, $f4          \n\t" /* A3+B3          a3+b3      */\
+        "psubw $f8, $f8, $f4            \n\t" /* A3-B3          a3-b3      */\
+        "psraw $f12, $f12, $f18         \n\t" /* (A3+B3) >> shift          */\
+        "packsswh $f6, $f6, $f6         \n\t" /* A2+B2          a2+b2      */\
+        "swc1 $f6, 32+" #dst "          \n\t"                                \
+        "psraw $f8, $f8, $f18           \n\t" /* (A3-B3) >> shift          */\
+        "packsswh $f12, $f12, $f12      \n\t" /* A3+B3          a3+b3      */\
+        "swc1 $f12, 48+" #dst "         \n\t"                                \
+        "packsswh $f8, $f8, $f8         \n\t" /* A3-B3          a3-b3      */\
+        "packsswh $f10, $f10, $f10      \n\t" /* A2-B2          a2-b2      */\
+        "swc1 $f8, 64+" #dst "          \n\t"                                \
+        "swc1 $f10, 80+" #dst "         \n\t"
+
+        //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
+        IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),    20)
+        IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),    20)
+        IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),    20)
+        IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),    20)
+        "b 9f                           \n\t"
+
+        "# .p2align 4                   \n\t"
+        "7:                             \n\t"
+
+#undef  IDCT /* label 7 variant: only src0 and the 8 bytes after it are read */
+#define IDCT(src0, src4, src1, src5, dst, shift) /* %2 = coeffs operand    */\
+        "ldc1 $f0, " #src0 "            \n\t" /* R4     R0      r4      r0 */\
+        "ldc1 $f8, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
+        "li $10, " #shift "             \n\t" /* $10 = shift amount        */\
+        "pmaddhw $f8, $f8, $f0          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "mtc1 $10, $f18                 \n\t" /* $f18 = shift amount       */\
+        "ldc1 $f10, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
+        "pmaddhw $f0, $f0, $f10         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "psraw $f8, $f8, $f18           \n\t" /* (C4R4+C4R0) >> shift      */\
+        "psraw $f0, $f0, $f18           \n\t" /* (-C4R4+C4R0) >> shift     */\
+        "ldc1 $f4, 8+" #src0 "          \n\t" /* R4     R0      r4      r0 */\
+        "ldc1 $f2, 16(%2)               \n\t" /* C4     C4      C4      C4 */\
+        "pmaddhw $f2, $f2, $f4          \n\t" /* C4R4+C4R0      C4r4+C4r0  */\
+        "ldc1 $f14, 24(%2)              \n\t" /* -C4    C4      -C4     C4 */\
+        "pmaddhw $f4, $f4, $f14         \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
+        "ldc1 $f14, 32(%2)              \n\t" /* C6 C2 - not read below    */\
+        "psraw $f2, $f2, $f18           \n\t" /* same shift, 8+src0 data   */\
+        "packsswh $f8, $f8, $f2         \n\t" /* A0             a0         */\
+        "sdc1 $f8, " #dst "             \n\t"                                \
+        "psraw $f4, $f4, $f18           \n\t" /* same shift, 8+src0 data   */\
+        "packsswh $f0, $f0, $f4         \n\t" /* A1             a1         */\
+        "sdc1 $f0, 16+" #dst "          \n\t"                                \
+        "sdc1 $f0, 96+" #dst "          \n\t" /* A1 again                  */\
+        "sdc1 $f8, 112+" #dst "         \n\t" /* A0 again                  */\
+        "sdc1 $f0, 32+" #dst "          \n\t" /* A1 again                  */\
+        "sdc1 $f8, 48+" #dst "          \n\t" /* A0 again                  */\
+        "sdc1 $f8, 64+" #dst "          \n\t" /* A0 again                  */\
+        "sdc1 $f0, 80+" #dst "          \n\t"
+
+        //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
+        IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0),    20)
+        IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),    20)
+
+        "9:                             \n\t"
+        ::"r"(block),"r"(temp),"r"(coeffs),"m"(ff_wm1010),"m"(ff_d40000)
+        : "$10","$11"
+    );
+}
diff --git a/libavcodec/mips/simple_idct_msa.c b/libavcodec/mips/simple_idct_msa.c
new file mode 100644
index 0000000..bd8b310
--- /dev/null
+++ b/libavcodec/mips/simple_idct_msa.c
@@ -0,0 +1,573 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "idctdsp_mips.h"
+
+static void simple_idct_msa(int16_t *block)
+{
+    int32_t const_val;
+    v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 }; /* lanes 1..7 are picked out below as w1..w7 */
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 w1, w3, w5, w7;
+    v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
+    v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
+    v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
+    v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
+    v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
+    v4i32 w2, w4, w6;
+    v8i16 select_vec, temp;
+    v8i16 zero = { 0 };
+    v4i32 const_val0 = __msa_ldi_w(1);
+    v4i32 const_val1 = __msa_ldi_w(1);
+    /* In-place 8x8 IDCT of block: transpose, 1-D pass (>> 11), transpose, 1-D pass (>> 20), store back to block. */
+    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    const_val0 <<= 10; /* every lane = 1 << 10: bias added before the first pass's >> 11 */
+    const_val = 16383 * ((1 << 19) / 16383); /* 16383 * 32 = 524256: bias added before the second pass's >> 20 */
+    const_val1 = __msa_insert_w(const_val0, 0, const_val);
+    const_val1 = __msa_splati_w(const_val1, 0); /* replicate lane 0 (const_val) into all four lanes */
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+    select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
+    select_vec = __msa_clti_u_h((v8u16) select_vec, 1); /* flags 16-bit lanes where in1..in7 are all zero (unsigned < 1) */
+    UNPCK_SH_SW(in0, a0_r, a0_l);
+    UNPCK_SH_SW(in2, temp3_r, temp3_l);
+    temp = in0 << 3; /* value substituted by the bmnz_v calls below in the lanes flagged by select_vec */
+    w2 = (v4i32) __msa_splati_h(weights, 2);
+    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2); /* interleave with zero so each 32-bit lane holds the 16-bit weight */
+    w4 = (v4i32) __msa_splati_h(weights, 4);
+    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
+    w6 = (v4i32) __msa_splati_h(weights, 6);
+    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
+    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
+    ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l); /* in0 * w4 + first-pass bias */
+    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
+         temp1_r, temp1_l, temp2_r, temp2_l);
+    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
+                temp2_l, temp2_r, temp1_l, temp1_r,
+                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
+    UNPCK_SH_SW(in4, temp0_r, temp0_l);
+    UNPCK_SH_SW(in6, temp3_r, temp3_l);
+    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
+    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
+         temp2_r, temp2_l, temp1_r, temp1_l);
+    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
+    SUB4(a1_r, temp0_r, a1_l, temp0_l, a2_r, temp0_r, a2_l, temp0_l,
+         a1_r, a1_l, a2_r, a2_l);
+    ADD4(a3_r, temp0_r, a3_l, temp0_l, a0_r, temp1_r, a0_l, temp1_l,
+         a3_r, a3_l, a0_r, a0_l);
+    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
+    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
+    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
+    ILVRL_H2_SW(in1, in3, b3_r, b3_l); /* pair in1 with in3 for the dot products below */
+    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
+    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l); /* pair in5 with in7 for the dot-product accumulations below */
+    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
+               const0, const1, const2, const3);
+    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
+    const5 = __msa_ilvod_h(-w1, -w5);
+    const7 = __msa_ilvod_h(w3, -w1);
+    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
+                b0_r, b1_r, b2_r, b3_r);
+    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
+                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
+    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
+                b0_l, b1_l, b2_l, b3_l);
+    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
+                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
+    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
+                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
+                 temp0_r, temp0_l, temp1_r, temp1_l,
+                 temp2_r, temp2_l, temp3_r, temp3_l,
+                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
+    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11); /* first-pass scaling */
+    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
+    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
+                temp2_l, temp2_r, temp3_l, temp3_r,
+                temp0_r, temp1_r, temp2_r, temp3_r);
+    in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
+                               (v16u8) select_vec); /* keep the computed value unless select_vec is set, then use temp */
+    in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
+    SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
+    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
+                a0_r, a1_r, a2_r, a3_r);
+    in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec); /* note the reversed order: in4..in7 come from a3..a0 */
+    in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
+    in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
+    in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+    /* Second 1-D pass: same butterfly structure, biased with const_val1 and shifted by 20; no select_vec substitution. */
+    UNPCK_SH_SW(in0, a0_r, a0_l);
+    UNPCK_SH_SW(in2, temp3_r, temp3_l);
+    w2 = (v4i32) __msa_splati_h(weights, 2);
+    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
+    w4 = (v4i32) __msa_splati_h(weights, 4);
+    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
+    w6 = (v4i32) __msa_splati_h(weights, 6);
+    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
+    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
+    ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l); /* in0 * w4 + second-pass bias */
+    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
+         temp1_r, temp1_l, temp2_r, temp2_l);
+    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
+                temp2_l, temp2_r, temp1_l, temp1_r,
+                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
+    UNPCK_SH_SW(in4, temp0_r, temp0_l);
+    UNPCK_SH_SW(in6, temp3_r, temp3_l);
+    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
+    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
+         temp2_r, temp2_l, temp1_r, temp1_l);
+    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
+    SUB4(a1_r, temp0_r, a1_l, temp0_l, a2_r, temp0_r, a2_l, temp0_l,
+         a1_r, a1_l, a2_r, a2_l);
+    ADD4(a3_r, temp0_r, a3_l, temp0_l, a0_r, temp1_r, a0_l, temp1_l,
+         a3_r, a3_l, a0_r, a0_l);
+    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
+    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
+    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
+    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
+    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
+    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
+               const0, const1, const2, const3);
+    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
+                b0_r, b1_r, b2_r, b3_r);
+    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
+                b0_l, b1_l, b2_l, b3_l);
+    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
+    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
+    const5 = __msa_ilvod_h(-w1, -w5);
+    const7 = __msa_ilvod_h(w3, -w1);
+    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
+                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
+    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
+                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
+    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
+                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
+                 temp0_r, temp0_l, temp1_r, temp1_l,
+                 temp2_r, temp2_l, temp3_r, temp3_l,
+                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
+    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20); /* second-pass scaling */
+    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
+    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
+                temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
+    SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
+    SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
+    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
+                a0_r, a1_r, a2_r, a3_r);
+    ST_SW8(temp0_r, temp1_r, temp2_r, temp3_r, a3_r, a2_r, a1_r, a0_r,
+           block, 8); /* write the eight result vectors back over the input, a3..a0 in reversed order */
+}
+
+static void simple_idct_put_msa(uint8_t *dst, int32_t dst_stride,
+                                int16_t *block)
+{
+    int32_t const_val;
+    uint64_t tmp0, tmp1, tmp2, tmp3;
+    v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 }; /* lanes 1..7 are picked out below as w1..w7 */
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 w1, w3, w5, w7;
+    v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
+    v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
+    v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
+    v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
+    v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
+    v4i32 w2, w4, w6;
+    v8i16 select_vec, temp;
+    v8i16 zero = { 0 };
+    v4i32 const_val0 = __msa_ldi_w(1);
+    v4i32 const_val1 = __msa_ldi_w(1);
+    /* 8x8 IDCT of block (two 1-D passes, >> 11 then >> 20); results are clipped to 0..255 and stored as 8 rows at dst. */
+    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    const_val0 <<= 10; /* every lane = 1 << 10: bias added before the first pass's >> 11 */
+    const_val = 16383 * ((1 << 19) / 16383); /* 16383 * 32 = 524256: bias added before the second pass's >> 20 */
+    const_val1 = __msa_insert_w(const_val0, 0, const_val);
+    const_val1 = __msa_splati_w(const_val1, 0); /* replicate lane 0 (const_val) into all four lanes */
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+    select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
+    select_vec = __msa_clti_u_h((v8u16) select_vec, 1); /* flags 16-bit lanes where in1..in7 are all zero (unsigned < 1) */
+    UNPCK_SH_SW(in0, a0_r, a0_l);
+    UNPCK_SH_SW(in2, temp3_r, temp3_l);
+    temp = in0 << 3; /* value substituted by the bmnz_v calls below in the lanes flagged by select_vec */
+    w2 = (v4i32) __msa_splati_h(weights, 2);
+    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2); /* interleave with zero so each 32-bit lane holds the 16-bit weight */
+    w4 = (v4i32) __msa_splati_h(weights, 4);
+    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
+    w6 = (v4i32) __msa_splati_h(weights, 6);
+    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
+    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
+    ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l); /* in0 * w4 + first-pass bias */
+    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
+    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
+    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
+                temp2_l, temp2_r, temp1_l, temp1_r,
+                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
+    UNPCK_SH_SW(in4, temp0_r, temp0_l);
+    UNPCK_SH_SW(in6, temp3_r, temp3_l);
+    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
+    MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
+    MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
+    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
+    SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
+    SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
+    ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
+    ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
+    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
+    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
+    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
+    ILVRL_H2_SW(in1, in3, b3_r, b3_l); /* pair in1 with in3 for the dot products below */
+    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
+    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l); /* pair in5 with in7 for the dot-product accumulations below */
+    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
+               const0, const1, const2, const3);
+    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
+    const5 = __msa_ilvod_h(-w1, -w5);
+    const7 = __msa_ilvod_h(w3, -w1);
+    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
+                b0_r, b1_r, b2_r, b3_r);
+    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
+                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
+    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
+                b0_l, b1_l, b2_l, b3_l);
+    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
+                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
+    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
+                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
+                 temp0_r, temp0_l, temp1_r, temp1_l,
+                 temp2_r, temp2_l, temp3_r, temp3_l,
+                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
+    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11); /* first-pass scaling */
+    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
+    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
+                temp2_l, temp2_r, temp3_l, temp3_r,
+                temp0_r, temp1_r, temp2_r, temp3_r);
+    in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
+                               (v16u8) select_vec); /* keep the computed value unless select_vec is set, then use temp */
+    in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
+    SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
+    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
+                a0_r, a1_r, a2_r, a3_r);
+    in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec); /* note the reversed order: in4..in7 come from a3..a0 */
+    in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
+    in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
+    in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+    UNPCK_SH_SW(in0, a0_r, a0_l); /* second 1-D pass starts here: biased with const_val1, shifted by 20 */
+    UNPCK_SH_SW(in2, temp3_r, temp3_l);
+    w2 = (v4i32) __msa_splati_h(weights, 2);
+    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
+    w4 = (v4i32) __msa_splati_h(weights, 4);
+    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
+    w6 = (v4i32) __msa_splati_h(weights, 6);
+    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
+    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
+    ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l); /* in0 * w4 + second-pass bias */
+    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
+    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
+    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
+                temp2_l, temp2_r, temp1_l, temp1_r,
+                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
+    UNPCK_SH_SW(in4, temp0_r, temp0_l);
+    UNPCK_SH_SW(in6, temp3_r, temp3_l);
+    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
+    MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
+    MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
+    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
+    SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
+    SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
+    ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
+    ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
+    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
+    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
+    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
+    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
+    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
+    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
+               const0, const1, const2, const3);
+    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
+                b0_r, b1_r, b2_r, b3_r);
+    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
+                b0_l, b1_l, b2_l, b3_l);
+    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
+    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
+    const5 = __msa_ilvod_h(-w1, -w5);
+    const7 = __msa_ilvod_h(w3, -w1);
+    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
+                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
+    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
+                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
+    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
+                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
+                 temp0_r, temp0_l, temp1_r, temp1_l,
+                 temp2_r, temp2_l, temp3_r, temp3_l,
+                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
+    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20); /* second-pass scaling */
+    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
+    SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
+    SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
+    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
+                temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
+    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
+                a0_r, a1_r, a2_r, a3_r);
+    temp0_r = (v4i32) CLIP_SH_0_255(temp0_r); /* clamp the 16-bit results to the 0..255 pixel range */
+    temp1_r = (v4i32) CLIP_SH_0_255(temp1_r);
+    temp2_r = (v4i32) CLIP_SH_0_255(temp2_r);
+    temp3_r = (v4i32) CLIP_SH_0_255(temp3_r);
+    PCKEV_B4_SW(temp0_r, temp0_r, temp1_r, temp1_r,
+                temp2_r, temp2_r, temp3_r, temp3_r,
+                temp0_r, temp1_r, temp2_r, temp3_r); /* same vector passed for both inputs of each pack */
+    tmp0 = __msa_copy_u_d((v2i64) temp0_r, 1); /* extract 64-bit element 1: eight packed bytes for one row */
+    tmp1 = __msa_copy_u_d((v2i64) temp1_r, 1);
+    tmp2 = __msa_copy_u_d((v2i64) temp2_r, 1);
+    tmp3 = __msa_copy_u_d((v2i64) temp3_r, 1);
+    SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); /* first four rows */
+    dst += 4 * dst_stride;
+    a0_r = (v4i32) CLIP_SH_0_255(a0_r);
+    a1_r = (v4i32) CLIP_SH_0_255(a1_r);
+    a2_r = (v4i32) CLIP_SH_0_255(a2_r);
+    a3_r = (v4i32) CLIP_SH_0_255(a3_r);
+    PCKEV_B4_SW(a0_r, a0_r, a1_r, a1_r,
+                a2_r, a2_r, a3_r, a3_r, a0_r, a1_r, a2_r, a3_r);
+    tmp3 = __msa_copy_u_d((v2i64) a0_r, 1); /* reversed assignment: rows are stored in the order a3, a2, a1, a0 */
+    tmp2 = __msa_copy_u_d((v2i64) a1_r, 1);
+    tmp1 = __msa_copy_u_d((v2i64) a2_r, 1);
+    tmp0 = __msa_copy_u_d((v2i64) a3_r, 1);
+    SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); /* last four rows */
+    dst += 4 * dst_stride; /* NOTE(review): dst is not used after this point, so this increment has no effect */
+}
+
+static void simple_idct_add_msa(uint8_t *dst, int32_t dst_stride,
+                                int16_t *block)
+{
+    int32_t const_val;
+    uint64_t tmp0, tmp1, tmp2, tmp3;
+    v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 }; /* lanes 1..7 are picked out below as w1..w7 */
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 w1, w3, w5, w7;
+    v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
+    v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
+    v4i32 temp4_r, temp5_r, temp6_r, temp7_r, temp8_r;
+    v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
+    v4i32 temp4_l, temp5_l, temp6_l, temp7_l, temp8_l;
+    v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
+    v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
+    v4i32 w2, w4, w6;
+    v8i16 select_vec, temp;
+    v8i16 zero = { 0 };
+    v4i32 const_val0 = __msa_ldi_w(1);
+    v4i32 const_val1 = __msa_ldi_w(1);
+    /* 8x8 IDCT of block (two 1-D passes, >> 11 then >> 20); results are added to the pixels at dst, clipped to 0..255 and stored back. */
+    const_val0 <<= 10; /* every lane = 1 << 10: bias added before the first pass's >> 11 */
+    const_val = 16383 * ((1 << 19) / 16383); /* 16383 * 32 = 524256: bias added before the second pass's >> 20 */
+    const_val1 = __msa_insert_w(const_val0, 0, const_val);
+    const_val1 = __msa_splati_w(const_val1, 0); /* replicate lane 0 (const_val) into all four lanes */
+    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+
+    select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
+    select_vec = __msa_clti_u_h((v8u16) select_vec, 1); /* flags 16-bit lanes where in1..in7 are all zero (unsigned < 1) */
+    UNPCK_SH_SW(in0, a0_r, a0_l);
+    UNPCK_SH_SW(in2, temp3_r, temp3_l);
+    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
+    UNPCK_SH_SW(in4, temp4_r, temp4_l);
+    UNPCK_SH_SW(in6, temp7_r, temp7_l);
+    ILVRL_H2_SW(in5, in7, temp8_r, temp8_l);
+    temp = in0 << 3; /* value substituted by the bmnz_v calls below in the lanes flagged by select_vec */
+    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
+    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
+               const0, const1, const2, const3); /* const0..const7 are built once here and reused by the second pass */
+    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
+    const5 = __msa_ilvod_h(-w1, -w5);
+    const7 = __msa_ilvod_h(w3, -w1);
+    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
+                b0_r, b1_r, b2_r, b3_r);
+    DPADD_SH4_SW(temp8_r, temp8_r, temp8_r, temp8_r,
+                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
+    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
+                b0_l, b1_l, b2_l, b3_l);
+    DPADD_SH4_SW(temp8_l, temp8_l, temp8_l, temp8_l,
+                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
+    w2 = (v4i32) __msa_splati_h(weights, 2);
+    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2); /* interleave with zero so each 32-bit lane holds the 16-bit weight */
+    w4 = (v4i32) __msa_splati_h(weights, 4);
+    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
+    w6 = (v4i32) __msa_splati_h(weights, 6);
+    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
+    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
+    ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l); /* in0 * w4 + first-pass bias */
+    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
+    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
+    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
+                temp2_l, temp2_r, temp1_l, temp1_r,
+                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
+    MUL2(temp4_r, w4, temp4_l, w4, temp4_r, temp4_l);
+    MUL2(temp7_r, w2, temp7_l, w2, temp6_r, temp6_l);
+    MUL2(temp7_r, w6, temp7_l, w6, temp5_r, temp5_l);
+    ADD2(a0_r, temp4_r, a0_l, temp4_l, a0_r, a0_l);
+    SUB2(a1_r, temp4_r, a1_l, temp4_l, a1_r, a1_l);
+    SUB2(a2_r, temp4_r, a2_l, temp4_l, a2_r, a2_l);
+    ADD2(a3_r, temp4_r, a3_l, temp4_l, a3_r, a3_l);
+    ADD2(a0_r, temp5_r, a0_l, temp5_l, a0_r, a0_l);
+    SUB2(a1_r, temp6_r, a1_l, temp6_l, a1_r, a1_l);
+    ADD2(a2_r, temp6_r, a2_l, temp6_l, a2_r, a2_l);
+    SUB2(a3_r, temp5_r, a3_l, temp5_l, a3_r, a3_l);
+    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
+                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
+                 temp0_r, temp0_l, temp1_r, temp1_l,
+                 temp2_r, temp2_l, temp3_r, temp3_l,
+                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
+    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11); /* first-pass scaling */
+    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
+    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
+                temp2_l, temp2_r, temp3_l, temp3_r,
+                temp0_r, temp1_r, temp2_r, temp3_r);
+    in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
+                               (v16u8) select_vec); /* keep the computed value unless select_vec is set, then use temp */
+    in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
+                               (v16u8) select_vec);
+    SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
+    SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
+    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
+                a0_r, a1_r, a2_r, a3_r);
+    in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec); /* note the reversed order: in4..in7 come from a3..a0 */
+    in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
+    in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
+    in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+    /* Second 1-D pass: reuses w2/w4/w6 and const0..const7 from above, biased with const_val1 and shifted by 20. */
+    UNPCK_SH_SW(in0, a0_r, a0_l);
+    UNPCK_SH_SW(in2, temp3_r, temp3_l);
+    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
+    ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l); /* in0 * w4 + second-pass bias */
+    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
+    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
+    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
+                temp2_l, temp2_r, temp1_l, temp1_r,
+                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
+    UNPCK_SH_SW(in4, temp0_r, temp0_l);
+    UNPCK_SH_SW(in6, temp3_r, temp3_l);
+    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
+    MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
+    MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
+    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
+    SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
+    SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
+    ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
+    ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
+    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
+    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
+    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
+    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
+    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
+    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
+                b0_r, b1_r, b2_r, b3_r);
+    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
+                b0_l, b1_l, b2_l, b3_l);
+    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
+                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
+    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
+                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
+    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
+                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
+                 temp0_r, temp0_l, temp1_r, temp1_l,
+                 temp2_r, temp2_l, temp3_r, temp3_l,
+                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
+    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20); /* second-pass scaling */
+    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
+    LD_SH4(dst, dst_stride, in0, in1, in2, in3); /* NOTE(review): presumably loads a full 16-byte vector per row although only 8 bytes are written back - confirm the over-read is acceptable */
+    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
+                temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
+    ILVR_B4_SW(zero, in0, zero, in1, zero, in2, zero, in3,
+               temp0_l, temp1_l, temp2_l, temp3_l); /* interleave dst bytes with zero before the 16-bit adds below */
+    temp0_r = (v4i32) ((v8i16) (temp0_r) + (v8i16) (temp0_l)); /* IDCT result + existing dst pixels, as 16-bit lanes */
+    temp1_r = (v4i32) ((v8i16) (temp1_r) + (v8i16) (temp1_l));
+    temp2_r = (v4i32) ((v8i16) (temp2_r) + (v8i16) (temp2_l));
+    temp3_r = (v4i32) ((v8i16) (temp3_r) + (v8i16) (temp3_l));
+    temp0_r = (v4i32) CLIP_SH_0_255(temp0_r); /* clamp the sums to the 0..255 pixel range */
+    temp1_r = (v4i32) CLIP_SH_0_255(temp1_r);
+    temp2_r = (v4i32) CLIP_SH_0_255(temp2_r);
+    temp3_r = (v4i32) CLIP_SH_0_255(temp3_r);
+    PCKEV_B4_SW(temp0_r, temp0_r, temp1_r, temp1_r,
+                temp2_r, temp2_r, temp3_r, temp3_r,
+                temp0_r, temp1_r, temp2_r, temp3_r);
+    tmp0 = __msa_copy_u_d((v2i64) temp0_r, 1); /* extract 64-bit element 1: eight packed bytes for one row */
+    tmp1 = __msa_copy_u_d((v2i64) temp1_r, 1);
+    tmp2 = __msa_copy_u_d((v2i64) temp2_r, 1);
+    tmp3 = __msa_copy_u_d((v2i64) temp3_r, 1);
+    SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); /* first four rows */
+
+    SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
+    SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
+    LD_SH4(dst + 4 * dst_stride, dst_stride, in4, in5, in6, in7);
+    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
+                a0_r, a1_r, a2_r, a3_r);
+    ILVR_B4_SW(zero, in4, zero, in5, zero, in6, zero, in7,
+               a3_l, a2_l, a1_l, a0_l); /* reversed outputs: dst rows 4..7 pair with a3..a0 */
+    a3_r = (v4i32) ((v8i16) (a3_r) + (v8i16) (a3_l));
+    a2_r = (v4i32) ((v8i16) (a2_r) + (v8i16) (a2_l));
+    a1_r = (v4i32) ((v8i16) (a1_r) + (v8i16) (a1_l));
+    a0_r = (v4i32) ((v8i16) (a0_r) + (v8i16) (a0_l));
+    a3_r = (v4i32) CLIP_SH_0_255(a3_r);
+    a2_r = (v4i32) CLIP_SH_0_255(a2_r);
+    a1_r = (v4i32) CLIP_SH_0_255(a1_r);
+    a0_r = (v4i32) CLIP_SH_0_255(a0_r);
+    PCKEV_B4_SW(a0_r, a0_r, a1_r, a1_r,
+                a2_r, a2_r, a3_r, a3_r, a0_r, a1_r, a2_r, a3_r);
+    tmp0 = __msa_copy_u_d((v2i64) a3_r, 1);
+    tmp1 = __msa_copy_u_d((v2i64) a2_r, 1);
+    tmp2 = __msa_copy_u_d((v2i64) a1_r, 1);
+    tmp3 = __msa_copy_u_d((v2i64) a0_r, 1);
+    SD4(tmp0, tmp1, tmp2, tmp3, dst + 4 * dst_stride, dst_stride); /* last four rows */
+}
+
+void ff_simple_idct_msa(int16_t *block)
+{
+    simple_idct_msa(block); /* non-static entry point: forwards block unchanged to the file-local implementation */
+}
+
+void ff_simple_idct_put_msa(uint8_t *dst, int32_t dst_stride, int16_t *block)
+{
+    simple_idct_put_msa(dst, dst_stride, block); /* non-static entry point: forwards all arguments unchanged to the file-local implementation */
+}
+
+void ff_simple_idct_add_msa(uint8_t *dst, int32_t dst_stride, int16_t *block)
+{
+    simple_idct_add_msa(dst, dst_stride, block); /* non-static entry point: forwards all arguments unchanged to the file-local implementation */
+}
diff --git a/libavcodec/mips/vp8_idct_msa.c b/libavcodec/mips/vp8_idct_msa.c
new file mode 100644
index 0000000..11ac9ff
--- /dev/null
+++ b/libavcodec/mips/vp8_idct_msa.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <string.h>
+#include "libavcodec/vp8dsp.h"
+#include "libavutil/mips/generic_macros_msa.h"
+#include "vp8dsp_mips.h"
+
+static const int cospi8sqrt2minus1 = 20091; /* ~ (sqrt(2) * cos(pi / 8) - 1) * 65536 */
+static const int sinpi8sqrt2 = 35468; /* ~ sqrt(2) * sin(pi / 8) * 65536 */
+
+#define VP8_IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3)    \
+{                                                                    \
+    v4i32 a1_m, b1_m, c1_m, d1_m;                                    \
+    v4i32 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m;                    \
+    v4i32 const_cospi8sqrt2minus1_m, sinpi8_sqrt2_m;                 \
+    /* splat Q16 multipliers, then build the butterfly inputs */     \
+    const_cospi8sqrt2minus1_m = __msa_fill_w(cospi8sqrt2minus1);     \
+    sinpi8_sqrt2_m = __msa_fill_w(sinpi8sqrt2);                      \
+    a1_m = in0 + in2;                                                \
+    b1_m = in0 - in2;                                                \
+    c_tmp1_m = ((in1) * sinpi8_sqrt2_m) >> 16;                       \
+    c_tmp2_m = in3 + (((in3) * const_cospi8sqrt2minus1_m) >> 16);    \
+    c1_m = c_tmp1_m - c_tmp2_m;                                      \
+    d_tmp1_m = (in1) + (((in1) * const_cospi8sqrt2minus1_m) >> 16);  \
+    d_tmp2_m = ((in3) * sinpi8_sqrt2_m) >> 16;                       \
+    d1_m = d_tmp1_m + d_tmp2_m;                                      \
+    BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);     \
+}
+
+void ff_vp8_idct_add_msa(uint8_t *dst, int16_t input[16], ptrdiff_t stride)
+{
+    v8i16 input0, input1;
+    v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
+    v4i32 res0, res1, res2, res3;
+    v16i8 zero = { 0 };
+    v16i8 pred0, pred1, pred2, pred3, dest0, dest1;
+    v16i8 mask = { 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+    /* load the 16 coefficients of the 4x4 block and widen them to 32 bit */
+    LD_SH2(input, 8, input0, input1);
+    UNPCK_SH_SW(input0, in0, in1);
+    UNPCK_SH_SW(input1, in2, in3);
+    VP8_IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
+    /* transpose so that the second 1-D pass runs along the other axis */
+    TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+    VP8_IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);
+    SRARI_W4_SW(vt0, vt1, vt2, vt3, 3); /* rounded: (x + 4) >> 3 */
+    /* transpose back so that vt0..vt3 line up with dst rows 0..3 */
+    TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
+    LD_SB4(dst, stride, pred0, pred1, pred2, pred3); /* reads 16 bytes per row, uses 4 */
+    ILVR_B4_SW(zero, pred0, zero, pred1, zero, pred2, zero, pred3,
+               res0, res1, res2, res3);
+    ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
+               res0, res1, res2, res3);
+    ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
+    res0 = CLIP_SW_0_255(res0);
+    res1 = CLIP_SW_0_255(res1);
+    res2 = CLIP_SW_0_255(res2);
+    res3 = CLIP_SW_0_255(res3);
+    VSHF_B2_SB(res0, res1, res2, res3, mask, mask, dest0, dest1); /* byte 0 of each word */
+    ST4x4_UB(dest0, dest1, 0, 1, 0, 1, dst, stride);
+
+    memset(input, 0, 4 * 4 * sizeof(*input)); /* clear the consumed coefficients */
+}
+
+void ff_vp8_idct_dc_add_msa(uint8_t *dst, int16_t in_dc[16], ptrdiff_t stride)
+{
+    v8i16 vec;
+    v8i16 res0, res1, res2, res3;
+    v16i8 zero = { 0 };
+    v16i8 pred0, pred1, pred2, pred3, dest0, dest1;
+    v16i8 mask = { 0, 2, 4, 6, 16, 18, 20, 22, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+    vec = __msa_fill_h(in_dc[0]); /* splat the DC coefficient */
+    vec = __msa_srari_h(vec, 3); /* rounded: (dc + 4) >> 3 */
+    LD_SB4(dst, stride, pred0, pred1, pred2, pred3); /* reads 16 bytes per row, uses 4 */
+    ILVR_B4_SH(zero, pred0, zero, pred1, zero, pred2, zero, pred3,
+               res0, res1, res2, res3);
+    ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
+    CLIP_SH4_0_255(res0, res1, res2, res3);
+    VSHF_B2_SB(res0, res1, res2, res3, mask, mask, dest0, dest1); /* byte 0 of halfwords 0..3 */
+    ST4x4_UB(dest0, dest1, 0, 1, 0, 1, dst, stride);
+
+    in_dc[0] = 0; /* only the DC coefficient is cleared */
+}
+
+void ff_vp8_luma_dc_wht_msa(int16_t block[4][4][16], int16_t input[16])
+{
+    int16_t *mb_dq_coeff = &block[0][0][0]; /* flat view: sub-block n starts at n * 16 */
+    v8i16 input0, input1;
+    v4i32 in0, in1, in2, in3, a1, b1, c1, d1;
+    v4i32 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
+
+    /* load the 16 input values and widen them to 32 bit */
+    LD_SH2(input, 8, input0, input1);
+    UNPCK_SH_SW(input0, in0, in1);
+    UNPCK_SH_SW(input1, in2, in3);
+    BUTTERFLY_4(in0, in1, in2, in3, a1, b1, c1, d1);
+    BUTTERFLY_4(a1, d1, c1, b1, hz0, hz1, hz3, hz2);
+    /* transpose the block, then repeat the two butterfly stages */
+    TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+    BUTTERFLY_4(hz0, hz1, hz2, hz3, a1, b1, c1, d1);
+    BUTTERFLY_4(a1, d1, c1, b1, vt0, vt1, vt3, vt2);
+    ADD4(vt0, 3, vt1, 3, vt2, 3, vt3, 3, vt0, vt1, vt2, vt3);
+    SRA_4V(vt0, vt1, vt2, vt3, 3); /* with the + 3 above: (x + 3) >> 3 */
+    mb_dq_coeff[0] = __msa_copy_s_h((v8i16) vt0, 0); /* halfword 2k = low half of word lane k */
+    mb_dq_coeff[16] = __msa_copy_s_h((v8i16) vt1, 0);
+    mb_dq_coeff[32] = __msa_copy_s_h((v8i16) vt2, 0);
+    mb_dq_coeff[48] = __msa_copy_s_h((v8i16) vt3, 0);
+    mb_dq_coeff[64] = __msa_copy_s_h((v8i16) vt0, 2);
+    mb_dq_coeff[80] = __msa_copy_s_h((v8i16) vt1, 2);
+    mb_dq_coeff[96] = __msa_copy_s_h((v8i16) vt2, 2);
+    mb_dq_coeff[112] = __msa_copy_s_h((v8i16) vt3, 2);
+    mb_dq_coeff[128] = __msa_copy_s_h((v8i16) vt0, 4);
+    mb_dq_coeff[144] = __msa_copy_s_h((v8i16) vt1, 4);
+    mb_dq_coeff[160] = __msa_copy_s_h((v8i16) vt2, 4);
+    mb_dq_coeff[176] = __msa_copy_s_h((v8i16) vt3, 4);
+    mb_dq_coeff[192] = __msa_copy_s_h((v8i16) vt0, 6);
+    mb_dq_coeff[208] = __msa_copy_s_h((v8i16) vt1, 6);
+    mb_dq_coeff[224] = __msa_copy_s_h((v8i16) vt2, 6);
+    mb_dq_coeff[240] = __msa_copy_s_h((v8i16) vt3, 6);
+
+    memset(input, 0, 4 * 4 * sizeof(int16_t)); /* clear the consumed input */
+}
+
+void ff_vp8_idct_dc_add4y_msa(uint8_t *dst, int16_t block[4][16],
+                              ptrdiff_t stride)
+{
+    ff_vp8_idct_dc_add_msa(dst, &block[0][0], stride); /* blocks 0..3 side by side, 4 bytes apart */
+    ff_vp8_idct_dc_add_msa(dst + 4, &block[1][0], stride);
+    ff_vp8_idct_dc_add_msa(dst + 8, &block[2][0], stride);
+    ff_vp8_idct_dc_add_msa(dst + 12, &block[3][0], stride);
+}
+
+void ff_vp8_idct_dc_add4uv_msa(uint8_t *dst, int16_t block[4][16],
+                               ptrdiff_t stride)
+{
+    ff_vp8_idct_dc_add_msa(dst, &block[0][0], stride); /* blocks 0, 1: top row of a 2x2 layout */
+    ff_vp8_idct_dc_add_msa(dst + 4, &block[1][0], stride);
+    ff_vp8_idct_dc_add_msa(dst + stride * 4, &block[2][0], stride); /* blocks 2, 3: 4 rows down */
+    ff_vp8_idct_dc_add_msa(dst + stride * 4 + 4, &block[3][0], stride);
+}
diff --git a/libavcodec/mips/vp8_lpf_msa.c b/libavcodec/mips/vp8_lpf_msa.c
new file mode 100644
index 0000000..3590961
--- /dev/null
+++ b/libavcodec/mips/vp8_lpf_msa.c
@@ -0,0 +1,690 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/vp8dsp.h"
+#include "libavutil/mips/generic_macros_msa.h"
+#include "vp8dsp_mips.h"
+
+#define VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask)           \
+{                                                                \
+    v16u8 p1_a_sub_q1, p0_a_sub_q0;                              \
+    /* mask = (2 * |p0 - q0| + |p1 - q1| / 2 <= b_limit) */      \
+    p0_a_sub_q0 = __msa_asub_u_b(p0, q0);                        \
+    p1_a_sub_q1 = __msa_asub_u_b(p1, q1);                        \
+    p1_a_sub_q1 = (v16u8) __msa_srli_b((v16i8) p1_a_sub_q1, 1);  \
+    p0_a_sub_q0 = __msa_adds_u_b(p0_a_sub_q0, p0_a_sub_q0);      \
+    mask = __msa_adds_u_b(p0_a_sub_q0, p1_a_sub_q1);             \
+    mask = ((v16u8) mask <= b_limit);                            \
+}
+
+#define VP8_LPF_FILTER4_4W(p1_in_out, p0_in_out, q0_in_out, q1_in_out,  \
+                           mask_in, hev_in)                             \
+{                                                                       \
+    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;                 \
+    v16i8 filt, filt1, filt2, cnst4b, cnst3b;                           \
+    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h;             \
+    /* flip the top bit: unsigned pixels -> signed bytes */             \
+    p1_m = (v16i8) __msa_xori_b(p1_in_out, 0x80);                       \
+    p0_m = (v16i8) __msa_xori_b(p0_in_out, 0x80);                       \
+    q0_m = (v16i8) __msa_xori_b(q0_in_out, 0x80);                       \
+    q1_m = (v16i8) __msa_xori_b(q1_in_out, 0x80);                       \
+                                                                        \
+    filt = __msa_subs_s_b(p1_m, q1_m);                                  \
+    /* keep the p1 - q1 term only where hev is set */                   \
+    filt = filt & (v16i8) hev_in;                                       \
+                                                                        \
+    q0_sub_p0 = q0_m - p0_m;                                            \
+    filt_sign = __msa_clti_s_b(filt, 0);                                \
+    /* 16-bit lanes: filt + 3 * (q0 - p0), saturated to 8 bits */       \
+    cnst3h = __msa_ldi_h(3);                                            \
+    q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0, q0_sub_p0);           \
+    q0_sub_p0_r = __msa_dotp_s_h((v16i8) q0_sub_p0_r, (v16i8) cnst3h);  \
+    filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt);                     \
+    filt_r += q0_sub_p0_r;                                              \
+    filt_r = __msa_sat_s_h(filt_r, 7);                                  \
+                                                                        \
+    q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0, q0_sub_p0);           \
+    q0_sub_p0_l = __msa_dotp_s_h((v16i8) q0_sub_p0_l, (v16i8) cnst3h);  \
+    filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt);                     \
+    filt_l += q0_sub_p0_l;                                              \
+    filt_l = __msa_sat_s_h(filt_l, 7);                                  \
+                                                                        \
+    filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r);               \
+    filt = filt & (v16i8) mask_in;                                      \
+    /* filt1 = (filt + 4) >> 3, filt2 = (filt + 3) >> 3 */              \
+    cnst4b = __msa_ldi_b(4);                                            \
+    filt1 = __msa_adds_s_b(filt, cnst4b);                               \
+    filt1 >>= 3;                                                        \
+                                                                        \
+    cnst3b = __msa_ldi_b(3);                                            \
+    filt2 = __msa_adds_s_b(filt, cnst3b);                               \
+    filt2 >>= 3;                                                        \
+    /* q0 -= filt1, p0 += filt2, then back to unsigned */               \
+    q0_m = __msa_subs_s_b(q0_m, filt1);                                 \
+    q0_in_out = __msa_xori_b((v16u8) q0_m, 0x80);                       \
+    p0_m = __msa_adds_s_b(p0_m, filt2);                                 \
+    p0_in_out = __msa_xori_b((v16u8) p0_m, 0x80);                       \
+    /* p1/q1 get rounded filt1 / 2 where !hev; inverts hev_in */        \
+    filt = __msa_srari_b(filt1, 1);                                     \
+    hev_in = __msa_xori_b((v16u8) hev_in, 0xff);                        \
+    filt = filt & (v16i8) hev_in;                                       \
+                                                                        \
+    q1_m = __msa_subs_s_b(q1_m, filt);                                  \
+    q1_in_out = __msa_xori_b((v16u8) q1_m, 0x80);                       \
+    p1_m = __msa_adds_s_b(p1_m, filt);                                  \
+    p1_in_out = __msa_xori_b((v16u8) p1_m, 0x80);                       \
+}
+
+#define VP8_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask)           \
+{                                                                   \
+    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, q0_sub_p0_sign;        \
+    v16i8 filt, filt1, filt2, cnst4b, cnst3b, filt_sign;            \
+    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h;         \
+    /* flip the top bit: unsigned pixels -> signed bytes */         \
+    p1_m = (v16i8) __msa_xori_b(p1_in, 0x80);                       \
+    p0_m = (v16i8) __msa_xori_b(p0_in, 0x80);                       \
+    q0_m = (v16i8) __msa_xori_b(q0_in, 0x80);                       \
+    q1_m = (v16i8) __msa_xori_b(q1_in, 0x80);                       \
+                                                                    \
+    filt = __msa_subs_s_b(p1_m, q1_m);                              \
+                                                                    \
+    q0_sub_p0 = q0_m - p0_m;                                        \
+    filt_sign = __msa_clti_s_b(filt, 0);                            \
+    /* 16-bit lanes: filt + 3 * (q0 - p0), saturated to 8 bits */   \
+    cnst3h = __msa_ldi_h(3);                                        \
+    q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0);                  \
+    q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0);  \
+    q0_sub_p0_r *= cnst3h;                                          \
+    filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt);                 \
+    filt_r += q0_sub_p0_r;                                          \
+    filt_r = __msa_sat_s_h(filt_r, 7);                              \
+                                                                    \
+    q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0);  \
+    q0_sub_p0_l *= cnst3h;                                          \
+    filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt);                 \
+    filt_l += q0_sub_p0_l;                                          \
+    filt_l = __msa_sat_s_h(filt_l, 7);                              \
+                                                                    \
+    filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r);           \
+    filt = filt & (v16i8) (mask);                                   \
+    /* filt1 = (filt + 4) >> 3, filt2 = (filt + 3) >> 3 */          \
+    cnst4b = __msa_ldi_b(4);                                        \
+    filt1 = __msa_adds_s_b(filt, cnst4b);                           \
+    filt1 >>= 3;                                                    \
+                                                                    \
+    cnst3b = __msa_ldi_b(3);                                        \
+    filt2 = __msa_adds_s_b(filt, cnst3b);                           \
+    filt2 >>= 3;                                                    \
+    /* only p0 and q0 are modified */                               \
+    q0_m = __msa_subs_s_b(q0_m, filt1);                             \
+    p0_m = __msa_adds_s_b(p0_m, filt2);                             \
+    q0_in = __msa_xori_b((v16u8) q0_m, 0x80);                       \
+    p0_in = __msa_xori_b((v16u8) p0_m, 0x80);                       \
+}
+
+#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev)             \
+{                                                                   \
+    v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m;                       \
+    v16i8 filt, q0_sub_p0, cnst4b, cnst3b;                          \
+    v16i8 u, filt1, filt2, filt_sign, q0_sub_p0_sign;               \
+    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_r, u_r, u_l, filt_l;       \
+    v8i16 cnst3h, cnst27h, cnst18h, cnst63h;                        \
+                                                                    \
+    cnst3h = __msa_ldi_h(3);                                        \
+    /* flip the top bit: unsigned pixels -> signed bytes */         \
+    p2_m = (v16i8) __msa_xori_b(p2, 0x80);                          \
+    p1_m = (v16i8) __msa_xori_b(p1, 0x80);                          \
+    p0_m = (v16i8) __msa_xori_b(p0, 0x80);                          \
+    q0_m = (v16i8) __msa_xori_b(q0, 0x80);                          \
+    q1_m = (v16i8) __msa_xori_b(q1, 0x80);                          \
+    q2_m = (v16i8) __msa_xori_b(q2, 0x80);                          \
+    /* filt = (p1 - q1) + 3 * (q0 - p0), saturated to 8 bits */     \
+    filt = __msa_subs_s_b(p1_m, q1_m);                              \
+    q0_sub_p0 = q0_m - p0_m;                                        \
+    q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0);                  \
+    filt_sign = __msa_clti_s_b(filt, 0);                            \
+                                                                    \
+    /* right part */                                                \
+    q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0);  \
+    q0_sub_p0_r *= cnst3h;                                          \
+    filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt);                 \
+    filt_r = filt_r + q0_sub_p0_r;                                  \
+    filt_r = __msa_sat_s_h(filt_r, 7);                              \
+                                                                    \
+    /* left part */                                                 \
+    q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0);  \
+    q0_sub_p0_l *= cnst3h;                                          \
+    filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt);                 \
+    filt_l = filt_l + q0_sub_p0_l;                                  \
+    filt_l = __msa_sat_s_h(filt_l, 7);                              \
+                                                                    \
+    /* combine left and right part */                               \
+    filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r);           \
+    filt = filt & (v16i8) mask;                                     \
+    filt2 = filt & (v16i8) hev;                                     \
+    /* hev lanes: q0 -= (filt2 + 4) >> 3, p0 += (filt2 + 3) >> 3 */ \
+    /* filt &= ~hev (hev itself is inverted in place) */            \
+    hev = __msa_xori_b(hev, 0xff);                                  \
+    filt = filt & (v16i8) hev;                                      \
+    cnst4b = __msa_ldi_b(4);                                        \
+    filt1 = __msa_adds_s_b(filt2, cnst4b);                          \
+    filt1 >>= 3;                                                    \
+    cnst3b = __msa_ldi_b(3);                                        \
+    filt2 = __msa_adds_s_b(filt2, cnst3b);                          \
+    filt2 >>= 3;                                                    \
+    q0_m = __msa_subs_s_b(q0_m, filt1);                             \
+    p0_m = __msa_adds_s_b(p0_m, filt2);                             \
+    /* non-hev lanes: widen filt to 16 bit for the 27/18/9 taps */  \
+    filt_sign = __msa_clti_s_b(filt, 0);                            \
+    ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l);                   \
+                                                                    \
+    cnst27h = __msa_ldi_h(27);                                      \
+    cnst63h = __msa_ldi_h(63);                                      \
+                                                                    \
+    /* right part: u = (27 * filt + 63) >> 7 */                     \
+    u_r = filt_r * cnst27h;                                         \
+    u_r += cnst63h;                                                 \
+    u_r >>= 7;                                                      \
+    u_r = __msa_sat_s_h(u_r, 7);                                    \
+    /* left part */                                                 \
+    u_l = filt_l * cnst27h;                                         \
+    u_l += cnst63h;                                                 \
+    u_l >>= 7;                                                      \
+    u_l = __msa_sat_s_h(u_l, 7);                                    \
+    /* combine left and right part, apply to p0/q0 */               \
+    u = __msa_pckev_b((v16i8) u_l, (v16i8) u_r);                    \
+    q0_m = __msa_subs_s_b(q0_m, u);                                 \
+    q0 = __msa_xori_b((v16u8) q0_m, 0x80);                          \
+    p0_m = __msa_adds_s_b(p0_m, u);                                 \
+    p0 = __msa_xori_b((v16u8) p0_m, 0x80);                          \
+    cnst18h = __msa_ldi_h(18);                                      \
+    u_r = filt_r * cnst18h;                                         \
+    u_r += cnst63h;                                                 \
+    u_r >>= 7;                                                      \
+    u_r = __msa_sat_s_h(u_r, 7);                                    \
+                                                                    \
+    /* left part: u = (18 * filt + 63) >> 7 */                      \
+    u_l = filt_l * cnst18h;                                         \
+    u_l += cnst63h;                                                 \
+    u_l >>= 7;                                                      \
+    u_l = __msa_sat_s_h(u_l, 7);                                    \
+    /* combine left and right part, apply to p1/q1 */               \
+    u = __msa_pckev_b((v16i8) u_l, (v16i8) u_r);                    \
+    q1_m = __msa_subs_s_b(q1_m, u);                                 \
+    q1 = __msa_xori_b((v16u8) q1_m, 0x80);                          \
+    p1_m = __msa_adds_s_b(p1_m, u);                                 \
+    p1 = __msa_xori_b((v16u8) p1_m, 0x80);                          \
+    u_r = filt_r << 3;                                              \
+    u_r += filt_r + cnst63h;                                        \
+    u_r >>= 7;                                                      \
+    u_r = __msa_sat_s_h(u_r, 7);                                    \
+                                                                    \
+    /* left part: u = (9 * filt + 63) >> 7 */                       \
+    u_l = filt_l << 3;                                              \
+    u_l += filt_l + cnst63h;                                        \
+    u_l >>= 7;                                                      \
+    u_l = __msa_sat_s_h(u_l, 7);                                    \
+    /* combine left and right part, apply to p2/q2 */               \
+    u = __msa_pckev_b((v16i8) u_l, (v16i8) u_r);                    \
+    q2_m = __msa_subs_s_b(q2_m, u);                                 \
+    q2 = __msa_xori_b((v16u8) q2_m, 0x80);                          \
+    p2_m = __msa_adds_s_b(p2_m, u);                                 \
+    p2 = __msa_xori_b((v16u8) p2_m, 0x80);                          \
+}
+
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,                   \
+                     q0_in, q1_in, q2_in, q3_in,                   \
+                     limit_in, b_limit_in, thresh_in,              \
+                     hev_out, mask_out, flat_out)                  \
+{                                                                  \
+    v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;  \
+    v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;  \
+                                                                   \
+    /* absolute subtraction of pixel values */                     \
+    p3_asub_p2_m = __msa_asub_u_b((p3_in), (p2_in));               \
+    p2_asub_p1_m = __msa_asub_u_b((p2_in), (p1_in));               \
+    p1_asub_p0_m = __msa_asub_u_b((p1_in), (p0_in));               \
+    q1_asub_q0_m = __msa_asub_u_b((q1_in), (q0_in));               \
+    q2_asub_q1_m = __msa_asub_u_b((q2_in), (q1_in));               \
+    q3_asub_q2_m = __msa_asub_u_b((q3_in), (q2_in));               \
+    p0_asub_q0_m = __msa_asub_u_b((p0_in), (q0_in));               \
+    p1_asub_q1_m = __msa_asub_u_b((p1_in), (q1_in));               \
+    /* hev: flat_out = max(|p1 - p0|, |q1 - q0|) > thresh */       \
+    flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);          \
+    hev_out = (thresh_in) < (v16u8) flat_out;                      \
+    /* mask: set where the running maximum is <= limit_in */       \
+    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);     \
+    p1_asub_q1_m >>= 1;                                            \
+    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);     \
+    mask_out = (b_limit_in) < p0_asub_q0_m;                        \
+    mask_out = __msa_max_u_b(flat_out, mask_out);                  \
+    p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);      \
+    mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);              \
+    q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);      \
+    mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);              \
+    mask_out = (limit_in) < (v16u8) mask_out;                      \
+    mask_out = __msa_xori_b(mask_out, 0xff);                       \
+}
+
+#define VP8_ST6x1_UB(in0, in0_idx, in1, in1_idx, pdst, stride)  \
+{                                                               \
+    uint16_t tmp0_h;                                            \
+    uint32_t tmp0_w;                                            \
+    /* 32 bits go to pdst, 16 bits to pdst + stride */          \
+    tmp0_w = __msa_copy_u_w((v4i32) in0, in0_idx);              \
+    tmp0_h = __msa_copy_u_h((v8i16) in1, in1_idx);              \
+    SW(tmp0_w, pdst);                                           \
+    SH(tmp0_h, pdst + stride);                                  \
+}
+
+void ff_vp8_v_loop_filter16_msa(uint8_t *src, ptrdiff_t pitch, int b_limit_in,
+                                int limit_in, int thresh_in)
+{
+    uint8_t *temp_src;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_in); /* splat the limits to all lanes */
+    limit = (v16u8) __msa_fill_b(limit_in);
+    thresh = (v16u8) __msa_fill_b(thresh_in);
+    /* load 8 rows of 16 bytes: 4 rows above src (p3..p0), 4 from src on */
+    temp_src = src - (pitch << 2);
+    LD_UB8(temp_src, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+    /* store the 6 rows p2..q2; p3 and q3 are not written back */
+    temp_src = src - 3 * pitch;
+    ST_UB4(p2, p1, p0, q0, temp_src, pitch);
+    temp_src += (4 * pitch);
+    ST_UB2(q1, q2, temp_src, pitch);
+}
+
+void ff_vp8_v_loop_filter8uv_msa(uint8_t *src_u, uint8_t *src_v,
+                                 ptrdiff_t pitch, int b_limit_in, int limit_in,
+                                 int thresh_in)
+{
+    uint8_t *temp_src;
+    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+    v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_in); /* splat the limits to all lanes */
+    limit = (v16u8) __msa_fill_b(limit_in);
+    thresh = (v16u8) __msa_fill_b(thresh_in);
+
+    temp_src = src_u - (pitch << 2); /* 4 rows above the edge, u plane */
+    LD_UB8(temp_src, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
+    temp_src = src_v - (pitch << 2); /* 4 rows above the edge, v plane */
+    LD_UB8(temp_src, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
+
+    /* doubleword 0 of each merged vector holds u pixels, doubleword 1 v pixels */
+    ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
+    ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+
+    p2_d = __msa_copy_u_d((v2i64) p2, 0); /* doubleword 0: back to the u plane */
+    p1_d = __msa_copy_u_d((v2i64) p1, 0);
+    p0_d = __msa_copy_u_d((v2i64) p0, 0);
+    q0_d = __msa_copy_u_d((v2i64) q0, 0);
+    q1_d = __msa_copy_u_d((v2i64) q1, 0);
+    q2_d = __msa_copy_u_d((v2i64) q2, 0);
+    src_u -= (pitch * 3); /* row of p2 */
+    SD4(p2_d, p1_d, p0_d, q0_d, src_u, pitch);
+    src_u += 4 * pitch;
+    SD(q1_d, src_u);
+    src_u += pitch;
+    SD(q2_d, src_u);
+
+    p2_d = __msa_copy_u_d((v2i64) p2, 1); /* doubleword 1: back to the v plane */
+    p1_d = __msa_copy_u_d((v2i64) p1, 1);
+    p0_d = __msa_copy_u_d((v2i64) p0, 1);
+    q0_d = __msa_copy_u_d((v2i64) q0, 1);
+    q1_d = __msa_copy_u_d((v2i64) q1, 1);
+    q2_d = __msa_copy_u_d((v2i64) q2, 1);
+    src_v -= (pitch * 3); /* row of p2 */
+    SD4(p2_d, p1_d, p0_d, q0_d, src_v, pitch);
+    src_v += 4 * pitch;
+    SD(q1_d, src_v);
+    src_v += pitch;
+    SD(q2_d, src_v);
+}
+
+void ff_vp8_h_loop_filter16_msa(uint8_t *src, ptrdiff_t pitch, int b_limit_in,
+                                int limit_in, int thresh_in)
+{
+    uint8_t *temp_src;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    v16u8 row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_in); /* splat the limits to all lanes */
+    limit = (v16u8) __msa_fill_b(limit_in);
+    thresh = (v16u8) __msa_fill_b(thresh_in);
+    temp_src = src - 4; /* rows are read starting 4 bytes left of src */
+    LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    temp_src += (8 * pitch); /* second group of 8 rows */
+    LD_UB8(temp_src, pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+    ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1); /* interleave p2..q0 for the per-row stores */
+    ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
+    ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
+    ILVRL_B2_SH(q2, q1, tmp2, tmp5); /* q1/q2 pairs for the 2-byte stores */
+
+    temp_src = src - 3; /* each row gets 4 bytes here and 2 bytes at + 4 */
+    VP8_ST6x1_UB(tmp3, 0, tmp2, 0, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp3, 1, tmp2, 1, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp3, 2, tmp2, 2, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp3, 3, tmp2, 3, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp4, 0, tmp2, 4, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp4, 1, tmp2, 5, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp4, 2, tmp2, 6, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp4, 3, tmp2, 7, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp6, 0, tmp5, 0, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp6, 1, tmp5, 1, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp6, 2, tmp5, 2, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp6, 3, tmp5, 3, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp7, 0, tmp5, 4, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp7, 1, tmp5, 5, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp7, 2, tmp5, 6, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp7, 3, tmp5, 7, temp_src, 4);
+}
+
+void ff_vp8_h_loop_filter8uv_msa(uint8_t *src_u, uint8_t *src_v,
+                                 ptrdiff_t pitch, int b_limit_in, int limit_in,
+                                 int thresh_in)
+{   /* VP8_MBFILTER across the columns at src_u and src_v, 8 rows per plane. */
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    v16u8 row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    /* Replicate each scalar threshold into every byte lane. */
+    b_limit = (v16u8) __msa_fill_b(b_limit_in);
+    limit = (v16u8) __msa_fill_b(limit_in);
+    thresh = (v16u8) __msa_fill_b(thresh_in);
+    /* rows 0..7 come from the U plane, rows 8..15 from the V plane */
+    LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    LD_UB8(src_v - 4, pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+    /* Derive mask/hev (flat is produced but not used here), then filter p2..q2. */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+    /* Re-interleave the filtered columns so each row can be stored on its own. */
+    ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
+    ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
+    ILVRL_B2_SH(q2, q1, tmp2, tmp5);
+    /* U rows: tmp3/tmp4 with tmp2; output starts 3 bytes left of src_u. */
+    src_u -= 3;
+    VP8_ST6x1_UB(tmp3, 0, tmp2, 0, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp3, 1, tmp2, 1, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp3, 2, tmp2, 2, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp3, 3, tmp2, 3, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp4, 0, tmp2, 4, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp4, 1, tmp2, 5, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp4, 2, tmp2, 6, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp4, 3, tmp2, 7, src_u, 4);
+    /* V rows: tmp6/tmp7 with tmp5; output starts 3 bytes left of src_v. */
+    src_v -= 3;
+    VP8_ST6x1_UB(tmp6, 0, tmp5, 0, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp6, 1, tmp5, 1, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp6, 2, tmp5, 2, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp6, 3, tmp5, 3, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp7, 0, tmp5, 4, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp7, 1, tmp5, 5, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp7, 2, tmp5, 6, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp7, 3, tmp5, 7, src_v, 4);
+}
+
+void ff_vp8_v_loop_filter_simple_msa(uint8_t *src, ptrdiff_t pitch,
+                                     int b_limit_ptr)
+{   /* Simple filter on the row boundary at src; only rows p0 and q0 are rewritten. */
+    v16u8 p1, p0, q1, q0;
+    v16u8 mask, b_limit;
+    /* despite its name, b_limit_ptr is a plain int that is broadcast to all lanes */
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    /* load rows p1, p0, q0, q1, starting two rows above src */
+    LD_UB4(src - (pitch << 1), pitch, p1, p0, q0, q1);
+    VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
+    VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
+    ST_UB2(p0, q0, (src - pitch), pitch); /* p0 -> src - pitch, q0 -> src */
+}
+
+void ff_vp8_h_loop_filter_simple_msa(uint8_t *src, ptrdiff_t pitch,
+                                     int b_limit_ptr)
+{   /* Simple filter across the column at src for 16 rows; rewrites p0/q0 only. */
+    uint8_t *temp_src;
+    v16u8 p1, p0, q1, q0;
+    v16u8 mask, b_limit;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    v16u8 row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1;
+    /* despite its name, b_limit_ptr is a plain int that is broadcast to all lanes */
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    temp_src = src - 2; /* rows are read starting 2 bytes left of src */
+    LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    temp_src += (8 * pitch);
+    LD_UB8(temp_src, pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p1, p0, q0, q1);
+    VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
+    VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
+    ILVRL_B2_SH(q0, p0, tmp1, tmp0); /* pair p0/q0 bytes per row for the stores */
+    /* Four ST2x4_UB calls, each advancing 4 rows; output starts at src - 1. */
+    src -= 1;
+    ST2x4_UB(tmp1, 0, src, pitch);
+    src += 4 * pitch;
+    ST2x4_UB(tmp1, 4, src, pitch);
+    src += 4 * pitch;
+    ST2x4_UB(tmp0, 0, src, pitch);
+    src += 4 * pitch;
+    ST2x4_UB(tmp0, 4, src, pitch);
+    src += 4 * pitch; /* NOTE(review): value is never read afterwards */
+}
+
+void ff_vp8_v_loop_filter8uv_inner_msa(uint8_t *src_u, uint8_t *src_v,
+                                       ptrdiff_t pitch, int b_limit_in,
+                                       int limit_in, int thresh_in)
+{   /* VP8_LPF_FILTER4_4W on the row boundary at src_u / src_v, 8 bytes per plane. */
+    uint64_t p1_d, p0_d, q0_d, q1_d;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+    v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+    /* Replicate each scalar threshold into every byte lane. */
+    thresh = (v16u8) __msa_fill_b(thresh_in);
+    limit = (v16u8) __msa_fill_b(limit_in);
+    b_limit = (v16u8) __msa_fill_b(b_limit_in);
+    /* Read 8 rows starting 4 rows above each pointer. */
+    src_u = src_u - (pitch << 2);
+    LD_UB8(src_u, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
+    src_u += (5 * pitch); /* now on the q1 row, where the stores below begin */
+    src_v = src_v - (pitch << 2);
+    LD_UB8(src_v, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
+    src_v += (5 * pitch); /* now on the q1 row, where the stores below begin */
+
+    /* doubleword 0 of each combined vector holds the U pixels,
+       doubleword 1 holds the V pixels */
+    ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
+    ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+    /* U plane: store q1, q0, p0, p1 walking upwards (stride is -pitch). */
+    p1_d = __msa_copy_u_d((v2i64) p1, 0);
+    p0_d = __msa_copy_u_d((v2i64) p0, 0);
+    q0_d = __msa_copy_u_d((v2i64) q0, 0);
+    q1_d = __msa_copy_u_d((v2i64) q1, 0);
+    SD4(q1_d, q0_d, p0_d, p1_d, src_u, (- pitch));
+    /* V plane: same, taken from doubleword 1. */
+    p1_d = __msa_copy_u_d((v2i64) p1, 1);
+    p0_d = __msa_copy_u_d((v2i64) p0, 1);
+    q0_d = __msa_copy_u_d((v2i64) q0, 1);
+    q1_d = __msa_copy_u_d((v2i64) q1, 1);
+    SD4(q1_d, q0_d, p0_d, p1_d, src_v, (- pitch));
+}
+
+void ff_vp8_h_loop_filter8uv_inner_msa(uint8_t *src_u, uint8_t *src_v,
+                                       ptrdiff_t pitch, int b_limit_in,
+                                       int limit_in, int thresh_in)
+{   /* VP8_LPF_FILTER4_4W across the columns at src_u and src_v, 8 rows per plane. */
+    uint8_t *temp_src_u, *temp_src_v;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    v16u8 row9, row10, row11, row12, row13, row14, row15;
+    v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    /* Replicate each scalar threshold into every byte lane. */
+    thresh = (v16u8) __msa_fill_b(thresh_in);
+    limit = (v16u8) __msa_fill_b(limit_in);
+    b_limit = (v16u8) __msa_fill_b(b_limit_in);
+    /* rows 0..7 come from the U plane, rows 8..15 from the V plane */
+    LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    LD_UB8(src_v - 4, pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+    /* Derive mask/hev (flat is produced but not used here), then filter p1..q1. */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+    ILVR_B2_SW(p0, p1, q1, q0, tmp0, tmp1); /* re-interleave into 4-byte rows */
+    ILVRL_H2_SW(tmp1, tmp0, tmp2, tmp3);
+    tmp0 = (v4i32) __msa_ilvl_b((v16i8) p0, (v16i8) p1);
+    tmp1 = (v4i32) __msa_ilvl_b((v16i8) q1, (v16i8) q0);
+    ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5);
+    /* U plane: two ST4x4_UB calls 4 rows apart, starting 2 bytes left of src_u. */
+    temp_src_u = src_u - 2;
+    ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, temp_src_u, pitch);
+    temp_src_u += 4 * pitch;
+    ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, temp_src_u, pitch);
+    /* V plane: same layout, taken from tmp4/tmp5. */
+    temp_src_v = src_v - 2;
+    ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, temp_src_v, pitch);
+    temp_src_v += 4 * pitch;
+    ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, temp_src_v, pitch);
+}
+
+void ff_vp8_v_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch,
+                                      int32_t e, int32_t i, int32_t h)
+{   /* VP8_LPF_FILTER4_4W on the row boundary at src; rewrites rows p1..q1. */
+    v16u8 mask, hev, flat;
+    v16u8 thresh, b_limit, limit;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+
+    /* load 8 rows p3..q3, starting 4 rows above src */
+    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+    thresh = (v16u8) __msa_fill_b(h);  /* h -> thresh */
+    b_limit = (v16u8) __msa_fill_b(e); /* e -> b_limit */
+    limit = (v16u8) __msa_fill_b(i);   /* i -> limit */
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+    /* write p1, p0, q0, q1 back, starting 2 rows above src */
+    ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
+}
+
+void ff_vp8_h_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch,
+                                      int32_t e, int32_t i, int32_t h)
+{   /* VP8_LPF_FILTER4_4W across the column at src for 16 rows; rewrites p1..q1. */
+    v16u8 mask, hev, flat;
+    v16u8 thresh, b_limit, limit;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    /* Read 16 rows starting 4 bytes left of src, then transpose to p3..q3. */
+    LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    LD_UB8(src - 4 + (8 * pitch), pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(h);  /* h -> thresh */
+    b_limit = (v16u8) __msa_fill_b(e); /* e -> b_limit */
+    limit = (v16u8) __msa_fill_b(i);   /* i -> limit */
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+    ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1); /* re-interleave into 4-byte rows */
+    ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
+    ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
+    /* Two ST4x8_UB calls 8 rows apart, starting 2 bytes left of src. */
+    src -= 2;
+    ST4x8_UB(tmp2, tmp3, src, pitch);
+    src += (8 * pitch);
+    ST4x8_UB(tmp4, tmp5, src, pitch);
+}
diff --git a/libavcodec/mips/vp8_mc_msa.c b/libavcodec/mips/vp8_mc_msa.c
new file mode 100644
index 0000000..2bf0abd
--- /dev/null
+++ b/libavcodec/mips/vp8_mc_msa.c
@@ -0,0 +1,2332 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/vp8dsp.h"
+#include "libavutil/mips/generic_macros_msa.h"
+#include "vp8dsp_mips.h"
+
+static const uint8_t mc_filt_mask_arr[16 * 3] = { /* three 16-byte byte-index masks */
+    /* offset 0: 8 width cases (adjacent byte pairs 0..8) */
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+    /* offset 16: 4 width cases (pairs 0..4 and 16..20) */
+    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+    /* offset 32: 4 width cases (pairs 8..12 and 24..28) */
+    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static const int8_t subpel_filters_msa[7][8] = { /* indexed with mx - 1 / my - 1 */
+    {-6, 123, 12, -1, 0, 0, 0, 0},      /* 4 taps, zero padded; taps sum to 128 */
+    {2, -11, 108, 36, -8, 1, 0, 0},     /* New 1/4 pel 6 tap filter */
+    {-9, 93, 50, -6, 0, 0, 0, 0},
+    {3, -16, 77, 77, -16, 3, 0, 0},     /* New 1/2 pel 6 tap filter */
+    {-6, 50, 93, -9, 0, 0, 0, 0},
+    {1, -8, 36, 108, -11, 2, 0, 0},     /* New 1/4 pel 6 tap filter */
+    {-1, 12, 123, -6, 0, 0, 0, 0},
+}; /* NOTE(review): if LD_SH reads 16 bytes, loading the last row overruns - confirm */
+
+static const int8_t bilinear_filters_msa[7][2] = { /* two weights per row, sum 128 */
+    {112, 16},
+    {96, 32},
+    {80, 48},
+    {64, 64},
+    {48, 80},
+    {32, 96},
+    {16, 112}
+};
+
+#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2,                 \
+                        filt_h0, filt_h1, filt_h2)                       \
+( {                                                                      \
+    v16i8 vec0_m, vec1_m, vec2_m;                                        \
+    v8i16 hz_out_m;                                                      \
+    /* shuffle src0/src1 with the three masks, then 3 dot-products */    \
+    VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2,  \
+               vec0_m, vec1_m, vec2_m);                                  \
+    hz_out_m = DPADD_SH3_SH(vec0_m, vec1_m, vec2_m,                      \
+                            filt_h0, filt_h1, filt_h2);                  \
+    /* rounding shift right by 7, then saturate (sat_s_h, operand 7) */  \
+    hz_out_m = __msa_srari_h(hz_out_m, 7);                               \
+    hz_out_m = __msa_sat_s_h(hz_out_m, 7);                               \
+    /* value of the statement expression */                              \
+    hz_out_m;                                                            \
+} )
+
+#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3,             \
+                                   mask0, mask1, mask2,                \
+                                   filt0, filt1, filt2,                \
+                                   out0, out1)                         \
+{                                                                      \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m;              \
+    /* out0 <- (src0, src1), out1 <- (src2, src3); no shift/saturate */ \
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);  \
+    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);             \
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);  \
+    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);            \
+    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);  \
+    DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1);            \
+}
+
+#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
+                                   mask0, mask1, mask2,                       \
+                                   filt0, filt1, filt2,                       \
+                                   out0, out1, out2, out3)                    \
+{                                                                             \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;     \
+    /* one output per source vector; each vector is shuffled with itself */   \
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
+    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
+                out0, out1, out2, out3);                                      \
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);         \
+    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec4_m, vec5_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec6_m, vec7_m);         \
+    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,  \
+                 out0, out1, out2, out3);                                     \
+    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt2, filt2, filt2, filt2,  \
+                 out0, out1, out2, out3);                                     \
+}
+
+#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)           \
+( {                                                             \
+    v8i16 tmp0;                                                 \
+    /* dot(vec0, filt0), then accumulate dot(vec1, filt1) */    \
+    tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0);         \
+    tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1);  \
+    /* value of the statement expression */                     \
+    tmp0;                                                       \
+} )
+
+#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)    \
+( {                                                                    \
+    v16i8 vec0_m, vec1_m;                                              \
+    v8i16 hz_out_m;                                                    \
+    /* shuffle src0/src1 with both masks, then 2 dot-products */       \
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0_m, vec1_m);  \
+    hz_out_m = FILT_4TAP_DPADD_S_H(vec0_m, vec1_m, filt_h0, filt_h1);  \
+    /* rounding shift right by 7, then saturate (sat_s_h, operand 7) */ \
+    hz_out_m = __msa_srari_h(hz_out_m, 7);                             \
+    hz_out_m = __msa_sat_s_h(hz_out_m, 7);                             \
+    /* value of the statement expression */                            \
+    hz_out_m;                                                          \
+} )
+
+#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,             \
+                                   mask0, mask1, filt0, filt1,         \
+                                   out0, out1)                         \
+{                                                                      \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                              \
+    /* out0 <- (src0, src1), out1 <- (src2, src3); no shift/saturate */ \
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);  \
+    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);             \
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);  \
+    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);            \
+}
+
+#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
+                                   mask0, mask1, filt0, filt1,                \
+                                   out0, out1, out2, out3)                    \
+{                                                                             \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                     \
+    /* one output per source vector; each vector is shuffled with itself */   \
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
+    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
+                out0, out1, out2, out3);                                      \
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);         \
+    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,  \
+                 out0, out1, out2, out3);                                     \
+}
+
+static void common_hz_6t_4x4_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{   /* Horizontal 6-tap filter of four source rows into a 4-row block at dst. */
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
+    v16u8 mask0, mask1, mask2, out;
+    v8i16 filt, out0, out1;
+    /* 4-wide mask set; reading starts 2 bytes left of src */
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= 2;
+
+    /* rearranging filter: halfwords 0..2 of the loaded taps are splatted */
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+    /* masks for the 2nd and 3rd tap pairs: every index shifted by 2 / 4 */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3); /* presumably unsigned -> signed bias */
+    HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out0, out1);
+    SRARI_H2_SH(out0, out1, 7); /* taps sum to 128, hence the shift by 7 */
+    SAT_SH2_SH(out0, out1, 7);
+    out = PCKEV_XORI128_UB(out0, out1); /* presumably packs to bytes and undoes the bias */
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_6t_4x8_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{   /* Horizontal 6-tap filter of eight source rows, handled as two groups of 4. */
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
+    v16u8 mask0, mask1, mask2, out;
+    v8i16 filt, out0, out1, out2, out3;
+    /* 4-wide mask set; reading starts 2 bytes left of src */
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= 2;
+
+    /* rearranging filter: halfwords 0..2 of the loaded taps are splatted */
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+    /* masks for the 2nd and 3rd tap pairs: every index shifted by 2 / 4 */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    /* rows 0..3 -> out0/out1 */
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3); /* rows 4..7 -> out2/out3 */
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 7); /* taps sum to 128, hence the shift by 7 */
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+void ff_put_vp8_epel4_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                             uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{   /* Dispatches to the 4x4 or 4x8 horizontal 6-tap helper; my is not used. */
+    const int8_t *filter = subpel_filters_msa[mx - 1]; /* table has 7 rows: mx in 1..7 */
+    /* NOTE(review): any height other than 4 or 8 falls through and writes nothing */
+    if (4 == height) {
+        common_hz_6t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+    } else if (8 == height) {
+        common_hz_6t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+    }
+}
+
+void ff_put_vp8_epel8_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                             uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{   /* Horizontal 6-tap filter, 8 pixels wide, 4 rows per step; my is not used. */
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[mx - 1]; /* table has 7 rows: mx in 1..7 */
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
+    v16u8 mask0, mask1, mask2, tmp0, tmp1;
+    v8i16 filt, out0, out1, out2, out3;
+    /* 8-wide mask set */
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    /* reading starts 2 bytes left of src */
+    src -= 2;
+
+    /* rearranging filter: halfwords 0..2 of the loaded taps are splatted */
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+    /* masks for the 2nd and 3rd tap pairs: every index shifted by 2 / 4 */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    /* The first group of 4 rows is processed unconditionally, before the loop. */
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out0, out1, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 7);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    tmp0 = PCKEV_XORI128_UB(out0, out1);
+    tmp1 = PCKEV_XORI128_UB(out2, out3);
+    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* NOTE(review): height < 4 makes the unsigned counter start at UINT32_MAX */
+    for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (4 * src_stride);
+        HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   filt0, filt1, filt2, out0, out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        tmp1 = PCKEV_XORI128_UB(out2, out3);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+void ff_put_vp8_epel16_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                              uint8_t *src, ptrdiff_t src_stride,
+                              int height, int mx, int my)
+{   /* Horizontal 6-tap filter, 16 pixels wide, 4 rows per iteration; my is not used. */
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[mx - 1]; /* table has 7 rows: mx in 1..7 */
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2;
+    v16u8 mask0, mask1, mask2, out;
+    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
+    /* 8-wide mask set; reading starts 2 bytes left of src */
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 2;
+
+    /* rearranging filter: halfwords 0..2 of the loaded taps are splatted */
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+    /* masks for the 2nd and 3rd tap pairs: every index shifted by 2 / 4 */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);     /* left halves of 4 rows */
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7); /* right halves, 8 bytes on */
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (4 * src_stride);
+
+        HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   filt0, filt1, filt2, out0, out1, out2, out3);
+        HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
+                                   filt0, filt1, filt2, out4, out5, out6, out7);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SRARI_H4_SH(out4, out5, out6, out7, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out4, out5, out6, out7, 7);
+        out = PCKEV_XORI128_UB(out0, out1); /* each output pair forms one 16-byte row */
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out4, out5);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out6, out7);
+        ST_UB(out, dst);
+        dst += dst_stride;
+    }
+}
+
+void ff_put_vp8_epel4_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                             uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{   /* Vertical 6-tap filter, 4 pixels wide, 4 rows per iteration; mx is not used. */
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[my - 1]; /* table has 7 rows: my in 1..7 */
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
+    v16u8 out;
+    v8i16 filt, out10, out32;
+    /* reading starts 2 rows above src */
+    src -= (2 * src_stride);
+    /* halfwords 0..2 of the loaded taps are splatted into filt0..filt2 */
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+    /* prime with 5 rows; each iteration below loads 4 more */
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+    /* interleave adjacent rows, then combine two row pairs per vector */
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    XORI_B2_128_SB(src2110, src4332);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
+                   src65_r, src76_r, src87_r);
+        ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
+        XORI_B2_128_SB(src6554, src8776);
+        out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
+        out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
+        SRARI_H2_SH(out10, out32, 7); /* taps sum to 128, hence the shift by 7 */
+        SAT_SH2_SH(out10, out32, 7);
+        out = PCKEV_XORI128_UB(out10, out32);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* carry the newest rows into the next iteration */
+        src2110 = src6554;
+        src4332 = src8776;
+        src4 = src8;
+    }
+}
+
+void ff_put_vp8_epel8_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                             uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{   /* Vertical 6-tap filter, 8 pixels wide, 4 rows per iteration; mx is not used. */
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[my - 1]; /* table has 7 rows: my in 1..7 */
+    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
+    v16i8 src109_r, filt0, filt1, filt2;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+    /* reading starts 2 rows above src */
+    src -= (2 * src_stride);
+    /* halfwords 0..2 of the loaded taps are splatted into filt0..filt2 */
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+    /* prime with 5 rows; each iteration below loads 4 more */
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+    /* interleave adjacent rows into pairs: 1|0, 3|2, 2|1, 4|3 */
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src3, src2, src2, src1, src4, src3,
+               src10_r, src32_r, src21_r, src43_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        src += (4 * src_stride);
+        /* here src7..src10 are the four rows that follow src4 */
+        ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
+        out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
+        out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
+        out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* carry the newest row pairs into the next iteration */
+        src10_r = src76_r;
+        src32_r = src98_r;
+        src21_r = src87_r;
+        src43_r = src109_r;
+        src4 = src10;
+    }
+}
+
+void ff_put_vp8_epel16_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                              uint8_t *src, ptrdiff_t src_stride,
+                              int height, int mx, int my)
+{   /* Vertical 6-tap filter, 16 pixels wide, 4 rows per iteration; mx is not used. */
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[my - 1]; /* table has 7 rows: my in 1..7 */
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
+    v16i8 src65_l, src87_l, filt0, filt1, filt2;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
+    /* reading starts 2 rows above src */
+    src -= (2 * src_stride);
+    /* halfwords 0..2 of the loaded taps are splatted into filt0..filt2 */
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+    /* prime with 5 rows; each iteration below loads 4 more */
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+    /* interleave adjacent rows; _r and _l hold the two halves of each row pair */
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_r,
+               src32_r, src43_r, src21_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_l,
+               src32_l, src43_l, src21_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
+                   src65_r, src76_r, src87_r);
+        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
+                   src65_l, src76_l, src87_l);
+        out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1,
+                              filt2);
+        out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1,
+                              filt2);
+        out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1,
+                              filt2);
+        out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1,
+                              filt2);
+        out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1,
+                              filt2);
+        out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1,
+                              filt2);
+        out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1,
+                              filt2);
+        out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1,
+                              filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, tmp0, tmp1, tmp2, tmp3);
+        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); /* presumably undoes the earlier 128 bias */
+        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* carry the newest row pairs into the next iteration */
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src10_l = src54_l;
+        src32_l = src76_l;
+        src21_l = src65_l;
+        src43_l = src87_l;
+        src4 = src8;
+    }
+}
+
+void ff_put_vp8_epel4_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{ /* 4-wide put (per the name): horizontal 6-tap pass, then vertical 6-tap. */
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; /* row mx - 1 */
+    const int8_t *filter_vert = subpel_filters_msa[my - 1];  /* row my - 1 */
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 filt_hz0, filt_hz1, filt_hz2;
+    v16u8 mask0, mask1, mask2, out;
+    v8i16 tmp0, tmp1;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, filt, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3;
+    /* NOTE(review): mx and my must be >= 1; 0 would index row -1 above. */
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= (2 + 2 * src_stride); /* 2 columns left of, 2 rows above the block */
+
+    /* rearranging filter: elements 0..2 of the loaded row go to filt_hz0..2 */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    /* Prologue: the first 5 source rows seed the vertical filter state. */
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4); /* XOR 128, per macro name */
+    hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2); /* two rows per call */
+    hz_out2 = HORIZ_6TAP_FILT(src2, src3, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8); /* 8-byte slide; presumably rows 1,2 */
+    hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+    /* height >> 2 iterations; each reads 4 new rows and writes 4 output rows. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* height % 4 rows are not processed */
+        LD_SB2(src, src_stride, src5, src6);
+        src += (2 * src_stride);
+
+        XORI_B2_128_SB(src5, src6);
+        hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
+
+        LD_SB2(src, src_stride, src7, src8);
+        src += (2 * src_stride);
+
+        XORI_B2_128_SB(src7, src8);
+        hz_out7 = HORIZ_6TAP_FILT(src7, src8, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
+        /* Vertical pass: three interleaved row groups feed each 6-tap sum. */
+        out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
+        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
+        tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
+        /* Shift by 7 and saturate (per macro names), then pack and store 4x4. */
+        SRARI_H2_SH(tmp0, tmp1, 7);
+        SAT_SH2_SH(tmp0, tmp1, 7);
+        out = PCKEV_XORI128_UB(tmp0, tmp1);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* Carry the newest filtered rows over as context for the next pass. */
+        hz_out3 = hz_out7;
+        out0 = out2;
+        out1 = out3;
+    }
+}
+
+void ff_put_vp8_epel8_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{ /* 8-wide put (per the name): horizontal 6-tap pass, then vertical 6-tap. */
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; /* row mx - 1 */
+    const int8_t *filter_vert = subpel_filters_msa[my - 1];  /* row my - 1 */
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 filt_hz0, filt_hz1, filt_hz2;
+    v16u8 mask0, mask1, mask2, vec0, vec1;
+    v8i16 filt, filt_vt0, filt_vt1, filt_vt2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+    /* NOTE(review): mx and my must be >= 1; 0 would index row -1 above. */
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= (2 + 2 * src_stride); /* 2 columns left of, 2 rows above the block */
+
+    /* rearranging filter: elements 0..2 of the loaded row go to filt_hz0..2 */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    /* Prologue: the first 5 source rows seed the vertical filter state. */
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4); /* XOR 128, per macro name */
+    hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2); /* one row per call here */
+    hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
+    /* Row pairs, assuming ILVEV_B2_SH pairs its inputs two at a time. */
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); /* (0,1) (2,3) */
+    ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4); /* (1,2) (3,4) */
+    /* height >> 2 iterations; each reads 4 new rows and writes 4 output rows. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* height % 4 rows are not processed */
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
+        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); /* output row 0 */
+
+        hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5);
+        tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); /* output row 1 */
+
+        hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
+        tmp2 = DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2); /* output row 2 */
+
+        hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
+        tmp3 = DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2); /* output row 3 */
+        /* Shift by 7 and saturate (per macro names), then pack and store 8x4. */
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST8x4_UB(vec0, vec1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* Slide the window down 4 rows: newest pairs become the oldest ones. */
+        hz_out4 = hz_out8;
+        out0 = out2;
+        out1 = out7;
+        out3 = out5;
+        out4 = out6;
+    }
+}
+
+
+void ff_put_vp8_epel16_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{
+    int32_t col;
+
+    /* The 16-wide block is handled as two adjacent 8-wide column strips:
+     * byte columns 0..7 first, then byte columns 8..15. */
+    for (col = 0; col < 16; col += 8) {
+        ff_put_vp8_epel8_h6v6_msa(dst + col, dst_stride,
+                                  src + col, src_stride,
+                                  height, mx, my);
+    }
+}
+
+static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{ /* Horizontal 4-tap filter over 4 rows, stored as a 4x4 block (per name). */
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+    v8i16 filt, out0, out1;
+    v16u8 out;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[16]);
+    src -= 1; /* start one column left of the block */
+
+    /* rearranging filter: elements 0 and 1 of the loaded taps go to filt0/1 */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3); /* XOR 128, per macro name */
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out0, out1);
+    SRARI_H2_SH(out0, out1, 7); /* shift by 7, then saturate (per macro names) */
+    SAT_SH2_SH(out0, out1, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{ /* Horizontal 4-tap filter over 8 rows, stored as two 4x4 blocks. */
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+    v16u8 out;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[16]);
+    src -= 1; /* start one column left of the block */
+
+    /* rearranging filter: elements 0 and 1 of the loaded taps go to filt0/1 */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+    /* Rows 0..3 -> out0/out1. */
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    XORI_B4_128_SB(src0, src1, src2, src3); /* XOR 128, per macro name */
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3); /* rows 4..7 reuse src0..3 */
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 7); /* shift by 7, then saturate */
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); /* rows 0..3 */
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); /* rows 4..7 */
+}
+
+static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  const int8_t *filter)
+{ /* Horizontal 4-tap filter over 16 rows, done as two batches of 8 rows. */
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 filt0, filt1, mask0, mask1;
+    v16u8 out;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[16]);
+    src -= 1; /* start one column left of the block */
+
+    /* rearranging filter: elements 0 and 1 of the loaded taps go to filt0/1 */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+    /* First batch: rows 0..7. */
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); /* XOR 128, per macro name */
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out0, out1);
+    HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
+                               filt0, filt1, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 7); /* shift by 7, then saturate */
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* Second batch: rows 8..15, same sequence as the first batch. */
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride); /* src is not read again after this advance */
+    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out0, out1);
+    HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
+                               filt0, filt1, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 7);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+void ff_put_vp8_epel4_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                             uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{
+    const int8_t *taps = subpel_filters_msa[mx - 1];
+
+    /* Dispatch on block height; heights other than 4, 8, 16 write nothing. */
+    switch (height) {
+    case 4:  common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, taps);  break;
+    case 8:  common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, taps);  break;
+    case 16: common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, taps); break;
+    default: break;
+    }
+}
+
+void ff_put_vp8_epel8_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                             uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{ /* 8-wide put (per the name) with a horizontal 4-tap filter; my is unused. */
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[mx - 1]; /* mx must be >= 1 */
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[0]);
+    src -= 1; /* start one column left of the block */
+
+    /* rearranging filter: elements 0 and 1 of the loaded taps go to filt0/1 */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+    /* height >> 2 iterations of 4 rows each; height % 4 rows are skipped. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3); /* XOR 128, per macro name */
+        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+                                   filt1, out0, out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7); /* shift by 7, then saturate */
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        tmp1 = PCKEV_XORI128_UB(out2, out3);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+void ff_put_vp8_epel16_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                              uint8_t *src, ptrdiff_t src_stride,
+                              int height, int mx, int my)
+{ /* 16-wide put (per the name) with a horizontal 4-tap filter; my is unused. */
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[mx - 1]; /* mx must be >= 1 */
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 filt0, filt1, mask0, mask1;
+    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
+    v16u8 out;
+
+    mask0 = LD_SB(&mc_filt_mask_arr[0]);
+    src -= 1; /* start one column left of the block */
+
+    /* rearranging filter: elements 0 and 1 of the loaded taps go to filt0/1 */
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+
+    mask1 = mask0 + 2;
+    /* height >> 2 iterations of 4 rows each; height % 4 rows are skipped. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);     /* even: from src */
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7); /* odd: from src + 8 */
+        src += (4 * src_stride);
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); /* XOR 128, per macro name */
+        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+                                   filt1, out0, out1, out2, out3);
+        HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0,
+                                   filt1, out4, out5, out6, out7);
+        SRARI_H4_SH(out0, out1, out2, out3, 7); /* shift by 7, then saturate */
+        SRARI_H4_SH(out4, out5, out6, out7, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out4, out5, out6, out7, 7);
+        out = PCKEV_XORI128_UB(out0, out1); /* each result pair forms one output row */
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out4, out5);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out6, out7);
+        ST_UB(out, dst);
+        dst += dst_stride;
+    }
+}
+
+void ff_put_vp8_epel4_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                             uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{ /* 4-wide put (per the name) with a vertical 4-tap filter; mx is unused. */
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[my - 1]; /* my must be >= 1 */
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
+    v16i8 src2110, src4332, filt0, filt1;
+    v8i16 filt, out10, out32;
+    v16u8 out;
+
+    src -= src_stride; /* start one row above the block */
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+    /* Prologue: the first 3 rows seed the vertical filter state. */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    /* Merge both interleaved pairs into one vector, then XOR every byte with 128. */
+    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
+    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+    /* height >> 2 iterations of 4 rows each; height % 4 rows are skipped. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB3(src, src_stride, src3, src4, src5);
+        src += (3 * src_stride);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
+        src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
+        out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);
+        /* 4th new row goes into src2, which the next iteration's first ILVR reads. */
+        src2 = LD_SB(src);
+        src += (src_stride);
+        ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
+        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r); /* reused as carried state */
+        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
+        out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1);
+        SRARI_H2_SH(out10, out32, 7); /* shift by 7, then saturate (per macro names) */
+        SAT_SH2_SH(out10, out32, 7);
+        out = PCKEV_XORI128_UB(out10, out32);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+void ff_put_vp8_epel8_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                             uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{ /* 8-wide put (per the name) with a vertical 4-tap filter; mx is unused. */
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[my - 1]; /* my must be >= 1 */
+    v16i8 src0, src1, src2, src7, src8, src9, src10;
+    v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+
+    src -= src_stride; /* start one row above the block */
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+    /* Prologue: the first 3 rows seed the vertical filter state. */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2); /* XOR 128, per macro name */
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
+    /* height >> 2 iterations of 4 rows each; height % 4 rows are skipped. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
+                   src72_r, src87_r, src98_r, src109_r); /* adjacent-row pairs */
+        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src72_r, filt0, filt1);
+        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1);
+        out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1);
+        out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7); /* shift by 7, then saturate */
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* Carry the two newest row pairs and the last row into the next pass. */
+        src10_r = src98_r;
+        src21_r = src109_r;
+        src2 = src10;
+    }
+}
+
+void ff_put_vp8_epel16_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                              uint8_t *src, ptrdiff_t src_stride,
+                              int height, int mx, int my)
+{ /* 16-wide put (per the name) with a vertical 4-tap filter; mx is unused. */
+    uint32_t loop_cnt;
+    const int8_t *filter = subpel_filters_msa[my - 1]; /* my must be >= 1 */
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
+    v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+    src -= src_stride; /* start one row above the block */
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
+    /* Prologue: the first 3 rows seed the vertical filter state. */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2); /* XOR 128, per macro name */
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); /* _r: ILVR results */
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); /* _l: ILVL results */
+    /* height >> 2 iterations of 4 rows each; height % 4 rows are skipped. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src3, src4, src5, src6);
+        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+                   src32_r, src43_r, src54_r, src65_r);
+        ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+                   src32_l, src43_l, src54_l, src65_l);
+        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
+        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
+        out2_r = FILT_4TAP_DPADD_S_H(src32_r, src54_r, filt0, filt1);
+        out3_r = FILT_4TAP_DPADD_S_H(src43_r, src65_r, filt0, filt1);
+        out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
+        out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
+        out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1);
+        out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7); /* shift by 7, then saturate */
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, tmp0, tmp1, tmp2, tmp3); /* join _l/_r halves per row */
+        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* Carry the two newest row pairs and the last row into the next pass. */
+        src10_r = src54_r;
+        src21_r = src65_r;
+        src10_l = src54_l;
+        src21_l = src65_l;
+        src2 = src6;
+    }
+}
+
+void ff_put_vp8_epel4_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{ /* 4-wide put (per the name): horizontal 4-tap pass, then vertical 4-tap. */
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; /* row mx - 1 */
+    const int8_t *filter_vert = subpel_filters_msa[my - 1];  /* row my - 1 */
+    v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
+    v16u8 mask0, mask1, out;
+    v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
+    /* NOTE(review): mx and my must be >= 1; 0 would index row -1 above. */
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= (1 + 1 * src_stride); /* 1 column left of, 1 row above the block */
+
+    /* rearranging filter: elements 0 and 1 of the loaded row go to filt_hz0/1 */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
+
+    mask1 = mask0 + 2;
+    /* Prologue: the first 3 rows seed the vertical filter state. */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2); /* XOR 128, per macro name */
+    hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1); /* two rows per call */
+    hz_out1 = HORIZ_4TAP_FILT(src1, src2, mask0, mask1, filt_hz0, filt_hz1);
+    vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
+    /* height >> 2 iterations of 4 rows each; height % 4 rows are skipped. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+        /* First two new rows. */
+        XORI_B2_128_SB(src3, src4);
+        hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
+        hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8); /* 8-byte slide */
+        vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
+        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
+        /* Last two new rows. */
+        XORI_B2_128_SB(src5, src6);
+        hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
+        hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
+        vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
+        tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
+        /* Shift by 7 and saturate (per macro names), then pack and store 4x4. */
+        SRARI_H2_SH(tmp0, tmp1, 7);
+        SAT_SH2_SH(tmp0, tmp1, 7);
+        out = PCKEV_XORI128_UB(tmp0, tmp1);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* Carry the newest filtered rows over as context for the next pass. */
+        hz_out1 = hz_out5;
+        vec0 = vec2;
+    }
+}
+
+void ff_put_vp8_epel8_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{ /* 8-wide put (per the name): horizontal 4-tap pass, then vertical 4-tap. */
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; /* row mx - 1 */
+    const int8_t *filter_vert = subpel_filters_msa[my - 1];  /* row my - 1 */
+    v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
+    v16u8 mask0, mask1, out0, out1;
+    v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 vec0, vec1, vec2, vec3, vec4;
+    /* NOTE(review): mx and my must be >= 1; 0 would index row -1 above. */
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= (1 + 1 * src_stride); /* 1 column left of, 1 row above the block */
+
+    /* rearranging filter: elements 0 and 1 of the loaded row go to filt_hz0/1 */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
+
+    mask1 = mask0 + 2;
+    /* Prologue: the first 3 rows seed the vertical filter state. */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2); /* XOR 128, per macro name */
+    hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1); /* one row per call */
+    hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2); /* presumably rows (0,1) and (1,2) */
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
+    /* height >> 2 iterations of 4 rows each; height % 4 rows are skipped. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src3, src4, src5, src6);
+        hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
+        vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
+        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); /* output row 0 */
+        /* hz_out0..2 are reused from here on for the newly loaded rows. */
+        hz_out0 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
+        vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3);
+        tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1); /* output row 1 */
+
+        hz_out1 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
+        vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt_vt0, filt_vt1); /* output row 2 */
+
+        hz_out2 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
+        ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec0, vec1); /* overwrites vec0, vec1 */
+        tmp3 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); /* output row 3 */
+        /* Shift by 7 and saturate (per macro names), then pack and store 8x4. */
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* Carry the two newest row pairs; hz_out2 already holds the last row. */
+        vec0 = vec4;
+        vec2 = vec1;
+    }
+}
+
+void ff_put_vp8_epel16_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                uint8_t *src, ptrdiff_t src_stride,
+                                int height, int mx, int my)
+{
+    int32_t col;
+
+    /* The 16-wide block is handled as two adjacent 8-wide column strips:
+     * byte columns 0..7 first, then byte columns 8..15. */
+    for (col = 0; col < 16; col += 8) {
+        ff_put_vp8_epel8_h4v4_msa(dst + col, dst_stride,
+                                  src + col, src_stride,
+                                  height, mx, my);
+    }
+}
+
+void ff_put_vp8_epel4_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{ /* 4-wide put (per the name): horizontal 6-tap pass, then vertical 4-tap. */
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; /* row mx - 1 */
+    const int8_t *filter_vert = subpel_filters_msa[my - 1];  /* row my - 1 */
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 filt_hz0, filt_hz1, filt_hz2;
+    v16u8 res0, res1, mask0, mask1, mask2;
+    v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
+    /* NOTE(review): mx and my must be >= 1; 0 would index row -1 above. */
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= (2 + 1 * src_stride); /* 2 columns left of, 1 row above the block */
+
+    /* rearranging filter: elements 0..2 of the loaded row go to filt_hz0..2 */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    /* Prologue: the first 3 rows seed the vertical filter state. */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2); /* XOR 128, per macro name */
+    hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2); /* two rows per call */
+    hz_out1 = HORIZ_6TAP_FILT(src1, src2, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
+    /* height >> 2 iterations of 4 rows each; height % 4 rows are skipped. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src3, src4, src5, src6);
+        hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8); /* 8-byte slide */
+        vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
+        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
+
+        hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
+        vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
+        tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
+        /* Results stay in two vectors here; two 32-bit words are stored from each. */
+        SRARI_H2_SH(tmp0, tmp1, 7); /* shift by 7, then saturate (per macro names) */
+        SAT_SH2_SH(tmp0, tmp1, 7);
+        PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
+        XORI_B2_128_UB(res0, res1);
+        ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); /* words 0,1 of res0, then of res1 */
+        dst += (4 * dst_stride);
+        /* Carry the newest filtered rows over as context for the next pass. */
+        hz_out1 = hz_out5;
+        vec0 = vec2;
+    }
+}
+
+void ff_put_vp8_epel8_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{ /* 8-wide put (per the name): horizontal 6-tap pass, then vertical 4-tap. */
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = subpel_filters_msa[mx - 1]; /* row mx - 1 */
+    const int8_t *filter_vert = subpel_filters_msa[my - 1];  /* row my - 1 */
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
+    v8i16 filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3;
+    v16u8 out0, out1;
+    /* NOTE(review): mx and my must be >= 1; 0 would index row -1 above. */
+    mask0 = LD_SB(&mc_filt_mask_arr[0]);
+    src -= (2 + src_stride); /* 2 columns left of, 1 row above the block */
+
+    /* rearranging filter: elements 0..2 of the loaded row go to filt_hz0..2 */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
+
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    /* Prologue: the first 3 rows seed the vertical filter state. */
+    LD_SB3(src, src_stride, src0, src1, src2);
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2); /* XOR 128, per macro name */
+    hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2); /* one row per call here */
+    hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2); /* presumably rows (0,1) and (1,2) */
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);
+    /* height >> 2 iterations of 4 rows each; height % 4 rows are skipped. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src3, src4, src5, src6);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src3, src4, src5, src6);
+        /* hz_out0..2 are overwritten below with the newly loaded rows. */
+        hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
+        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); /* output row 0 */
+
+        hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3);
+        tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1); /* output row 1 */
+
+        hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); /* kept for next pass */
+        tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec0, filt_vt0, filt_vt1); /* output row 2 */
+
+        hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec1, vec2); /* vec2 kept for next pass */
+        tmp3 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1); /* output row 3 */
+        /* Shift by 7 and saturate (per macro names), then pack and store 8x4. */
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+void ff_put_vp8_epel16_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                uint8_t *src, ptrdiff_t src_stride,
+                                int height, int mx, int my)
+{
+    /* A 16-wide block is produced as two adjacent 8-wide halves. */
+    ptrdiff_t col_off;
+
+    for (col_off = 0; col_off < 16; col_off += 8) {
+        uint8_t *dst_half = dst + col_off;
+        uint8_t *src_half = src + col_off;
+        ff_put_vp8_epel8_h6v4_msa(dst_half, dst_stride, src_half, src_stride,
+                                  height, mx, my);
+    }
+}
+
+void ff_put_vp8_epel4_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
+    const int8_t *filter_vert = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 filt_hz0, filt_hz1, mask0, mask1;
+    v16u8 out;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, tmp0, tmp1, out0, out1, out2, out3;
+    v8i16 filt, filt_vt0, filt_vt1, filt_vt2;
+    /* 4-wide put: rows go through HORIZ_4TAP_FILT, then DPADD_SH3_SH */
+    mask0 = LD_SB(&mc_filt_mask_arr[16]);
+
+    src -= (1 + 2 * src_stride); /* start 1 byte left of and 2 rows above src */
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
+
+    mask1 = mask0 + 2;
+    /* prologue: five rows loaded; HORIZ_4TAP_FILT is given two rows per call */
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out2 = HORIZ_4TAP_FILT(src2, src3, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
+    /* (height >> 2) passes; each reads 4 more rows and advances dst by 4 rows */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
+        hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
+        out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
+        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out7 = HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1);
+        hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
+        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
+        tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
+
+        SRARI_H2_SH(tmp0, tmp1, 7);
+        SAT_SH2_SH(tmp0, tmp1, 7);
+        out = PCKEV_XORI128_UB(tmp0, tmp1);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* state read again by the next pass */
+        hz_out3 = hz_out7;
+        out0 = out2;
+        out1 = out3;
+    }
+}
+
+void ff_put_vp8_epel8_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                               uint8_t *src, ptrdiff_t src_stride,
+                               int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
+    const int8_t *filter_vert = subpel_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 filt_hz0, filt_hz1, mask0, mask1;
+    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
+    v16u8 vec0, vec1;
+    /* 8-wide put: rows go through HORIZ_4TAP_FILT, then DPADD_SH3_SH */
+    mask0 = LD_SB(&mc_filt_mask_arr[0]);
+    src -= (1 + 2 * src_stride); /* start 1 byte left of and 2 rows above src */
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
+
+    mask1 = mask0 + 2;
+    /* prologue: five rows are loaded and filtered, one row per call */
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+    ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
+    /* (height >> 2) passes; each reads 4 more rows and advances dst by 4 rows */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+
+        hz_out5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
+        out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
+        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
+        out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5);
+        tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
+        out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
+        tmp2 = DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
+        out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
+        tmp3 = DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);
+
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST8x4_UB(vec0, vec1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* state read again by the next pass */
+        hz_out4 = hz_out8;
+        out0 = out2;
+        out1 = out6;
+        out3 = out5;
+        out4 = out7;
+    }
+}
+
+void ff_put_vp8_epel16_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                uint8_t *src, ptrdiff_t src_stride,
+                                int height, int mx, int my)
+{
+    /* A 16-wide block is produced as two adjacent 8-wide halves. */
+    ptrdiff_t col_off;
+
+    for (col_off = 0; col_off < 16; col_off += 8) {
+        uint8_t *dst_half = dst + col_off;
+        uint8_t *src_half = src + col_off;
+        ff_put_vp8_epel8_h4v6_msa(dst_half, dst_stride, src_half, src_stride,
+                                  height, mx, my);
+    }
+}
+
+static void common_hz_2t_4x4_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, mask;
+    v16u8 filt0, vec0, vec1, res0, res1;
+    v8u16 vec2, vec3, filt;
+    /* 4x4 block, horizontal pass only: 4 rows in, one ST4x4_UB store out */
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+    /* NOTE(review): LD_UH fills a whole v8u16 from filter; only halfword 0 is used */
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
+    SRARI_H2_UH(vec2, vec3, 7);
+    PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_2t_4x8_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16u8 vec0, vec1, vec2, vec3, filt0;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16i8 res0, res1, res2, res3;
+    v8u16 vec4, vec5, vec6, vec7, filt;
+    /* 4x8 block, horizontal pass only: 8 rows in, two ST4x4_UB stores out */
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+    /* all 8 rows are loaded and filtered before the first store */
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec4, vec5, vec6, vec7);
+    SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
+    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
+                res0, res1, res2, res3);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+void ff_put_vp8_bilinear4_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                uint8_t *src, ptrdiff_t src_stride,
+                                int height, int mx, int my)
+{
+    const int8_t *taps = bilinear_filters_msa[mx - 1];
+    /* Only heights 4 and 8 are handled; any other height writes nothing. */
+    switch (height) {
+    case 4: common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, taps); break;
+    case 8: common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, taps); break;
+    default: break;
+    }
+}
+
+static void common_hz_2t_8x4_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16u8 filt0;
+    v16i8 src0, src1, src2, src3, mask;
+    v8u16 vec0, vec1, vec2, vec3, filt;
+    /* 8x4 block, horizontal pass only: 4 rows in, one ST8x4_UB store out */
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+    /* vec0..vec3 are overwritten in place; src0/src1 are reused for the packed output */
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
+    ST8x4_UB(src0, src1, dst, dst_stride);
+}
+
+static void common_hz_2t_8x8mult_msa(uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height)
+{
+    v16u8 filt0;
+    v16i8 src0, src1, src2, src3, mask, out0, out1;
+    v8u16 vec0, vec1, vec2, vec3, filt;
+    /* 8-wide horizontal pass: always 8 rows, plus 8 more when height == 16 */
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    /* rows 0-3 */
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+    /* rows 4-7 are loaded before rows 0-3 are packed and stored */
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* rows 8-15: same two-step sequence, run only for 16-row blocks */
+    if (16 == height) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    vec0, vec1, vec2, vec3);
+        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        /* dst is not advanced here; the last store below uses dst + 4 rows */
+        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    vec0, vec1, vec2, vec3);
+        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+        PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+        ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
+    }
+}
+
+void ff_put_vp8_bilinear8_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                uint8_t *src, ptrdiff_t src_stride,
+                                int height, int mx, int my)
+{
+    const int8_t *taps = bilinear_filters_msa[mx - 1];
+
+    /* Every height other than 4 is passed on to the 8x8mult helper. */
+    if (height != 4) {
+        common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, taps, height);
+        return;
+    }
+    common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, taps);
+}
+
+void ff_put_vp8_bilinear16_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                 uint8_t *src, ptrdiff_t src_stride,
+                                 int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = bilinear_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+    /* 16-wide horizontal pass: one 4-row group up front, then loop_cnt more groups */
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+    /* NOTE(review): loop_cnt is unsigned, so height < 4 would wrap it - confirm callers */
+    loop_cnt = (height >> 2) - 1;
+
+    /* rearranging filter */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+    /* each row is fetched with two vector loads, at src and at src + 8 */
+    LD_SB4(src, src_stride, src0, src2, src4, src6);
+    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                out0, out1, out2, out3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                out4, out5, out6, out7);
+    SRARI_H4_UH(out0, out1, out2, out3, 7);
+    SRARI_H4_UH(out4, out5, out6, out7, 7);
+    PCKEV_ST_SB(out0, out1, dst);
+    dst += dst_stride;
+    PCKEV_ST_SB(out2, out3, dst);
+    dst += dst_stride;
+    PCKEV_ST_SB(out4, out5, dst);
+    dst += dst_stride;
+    PCKEV_ST_SB(out6, out7, dst);
+    dst += dst_stride;
+    /* remaining 4-row groups: same statement sequence as the group above */
+    for (; loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    out4, out5, out6, out7);
+        SRARI_H4_UH(out0, out1, out2, out3, 7);
+        SRARI_H4_UH(out4, out5, out6, out7, 7);
+        PCKEV_ST_SB(out0, out1, dst);
+        dst += dst_stride;
+        PCKEV_ST_SB(out2, out3, dst);
+        dst += dst_stride;
+        PCKEV_ST_SB(out4, out5, dst);
+        dst += dst_stride;
+        PCKEV_ST_SB(out6, out7, dst);
+        dst += dst_stride;
+    }
+}
+
+static void common_vt_2t_4x4_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
+    v16u8 filt0;
+    v8i16 filt;
+    v8u16 tmp0, tmp1;
+    /* 4x4 block, vertical pass only: 5 source rows in, one ST4x4_UB store out */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    /* the only load; src is not read afterwards, so it is left unadvanced */
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, 7);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+    ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_vt_2t_4x8_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v16u8 filt0;
+    v8i16 filt;
+    /* 4x8 block, vertical pass only: 9 source rows in, two ST4x4_UB stores out */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    /* ninth row; this is the last read from src, so no further advance */
+    src8 = LD_SB(src);
+
+    /* every ILVR_B4_SB argument pair is a row and the row loaded before it */
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+               src76_r, src87_r);
+    ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+               src87_r, src76_r, src2110, src4332, src6554, src8776);
+    DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
+    ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+    ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
+}
+
+void ff_put_vp8_bilinear4_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                uint8_t *src, ptrdiff_t src_stride,
+                                int height, int mx, int my)
+{
+    const int8_t *taps = bilinear_filters_msa[my - 1];
+    /* Only heights 4 and 8 are handled; any other height writes nothing. */
+    switch (height) {
+    case 4: common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, taps); break;
+    case 8: common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, taps); break;
+    default: break;
+    }
+}
+
+static void common_vt_2t_8x4_msa(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
+    v16i8 out0, out1;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+    /* 8x4 block, vertical pass only: 5 source rows in, one ST8x4_UB store out */
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+    /* every ILVR_B2_UB argument pair is a row and the row loaded before it */
+    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
+    ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+static void common_vt_2t_8x8mult_msa(uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+    v16i8 out0, out1;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+    /* 8-wide vertical pass, 8 rows per loop pass, (height >> 3) passes */
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+    /* the first row is loaded once; later passes get their first row from src8 */
+    src0 = LD_UB(src);
+    src += src_stride;
+    /* each pass: 8 more rows in, two ST8x4_UB stores out */
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
+        src += (8 * src_stride);
+
+        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+                   vec0, vec1, vec2, vec3);
+        ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   vec4, vec5, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* the last row of this pass is the first row of the next */
+        src0 = src8;
+    }
+}
+
+void ff_put_vp8_bilinear8_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                uint8_t *src, ptrdiff_t src_stride,
+                                int height, int mx, int my)
+{
+    const int8_t *taps = bilinear_filters_msa[my - 1];
+
+    /* Every height other than 4 is passed on to the 8x8mult helper. */
+    if (height != 4) {
+        common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, taps, height);
+        return;
+    }
+    common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, taps);
+}
+
+void ff_put_vp8_bilinear16_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                 uint8_t *src, ptrdiff_t src_stride,
+                                 int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = bilinear_filters_msa[my - 1];
+    v16u8 src0, src1, src2, src3, src4;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+    /* 16-wide vertical pass, 4 rows per loop pass, (height >> 2) passes */
+    /* rearranging filter_y */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+    /* the first row is loaded once; later passes get their first row from src4 */
+    src0 = LD_UB(src);
+    src += src_stride;
+    /* each pass: 4 more rows in, 4 rows out (one PCKEV_ST_SB per row) */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+        /* ILVR and ILVL results are filtered separately, then merged by PCKEV_ST_SB */
+        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst);
+        dst += dst_stride;
+
+        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst);
+        dst += dst_stride;
+
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst);
+        dst += dst_stride;
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst);
+        dst += dst_stride;
+        /* the last row of this pass is the first row of the next */
+        src0 = src4;
+    }
+}
+
+static void common_hv_2ht_2vt_4x4_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert)
+{
+    v16i8 src0, src1, src2, src3, src4, mask;
+    v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
+    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
+    /* 4x4 block: HORIZ_2TAP_FILT_UH with filt_hz on 5 rows, then DOTP with filt_vt */
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    filt = LD_UH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
+    /* rows go into the horizontal filter two per call; row 4 is passed twice */
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
+    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+    hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
+    hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
+    /* hz_out1 and hz_out3 are assembled from halves of their neighbours above */
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, 7);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_4x8_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+    v16i8 res0, res1, res2, res3;
+    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
+    /* 4x8 block: HORIZ_2TAP_FILT_UH with filt_hz on 9 rows, then DOTP with filt_vt */
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter */
+    filt = LD_UH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    filt = LD_UH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
+    /* 8 rows are loaded together, then the ninth row on its own */
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    src8 = LD_SB(src);
+    /* rows go into the horizontal filter two per call; row 8 is passed twice */
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
+    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
+    hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
+    hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
+    SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
+               hz_out3, hz_out5, 8);
+    hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
+    /* the odd-numbered hz_out values are assembled from their neighbours above */
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
+                vec4, vec5, vec6, vec7);
+    SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
+    SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
+    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
+                res0, res1, res2, res3);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+void ff_put_vp8_bilinear4_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                 uint8_t *src, ptrdiff_t src_stride,
+                                 int height, int mx, int my)
+{
+    const int8_t *hz = bilinear_filters_msa[mx - 1];
+    const int8_t *vt = bilinear_filters_msa[my - 1];
+    /* Only heights 4 and 8 are handled; any other height writes nothing. */
+    if (height == 4) {
+        common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, hz, vt);
+        return;
+    }
+    if (height == 8) {
+        common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, hz, vt);
+    }
+}
+
+static void common_hv_2ht_2vt_8x4_msa(uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert)
+{
+    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+    v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+    /* 8x4 block: HORIZ_2TAP_FILT_UH with filt_hz on 5 rows, then dotp with filt_vt */
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    /* hz_out0 and hz_out1 alternate as holder of the newest filtered row */
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+    vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+    vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+    tmp1 = __msa_dotp_u_h(vec1, filt_vt);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+    vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+    tmp2 = __msa_dotp_u_h(vec2, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+    vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+    tmp3 = __msa_dotp_u_h(vec3, filt_vt);
+    /* tmp0..tmp3 hold one output row each; round, saturate, pack and store */
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_8x8mult_msa(uint8_t *src, int32_t src_stride,
+                                          uint8_t *dst, int32_t dst_stride,
+                                          const int8_t *filter_horiz,
+                                          const int8_t *filter_vert,
+                                          int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+    v16u8 filt_hz, filt_vt, vec0;
+    v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+    v8i16 filt;
+    /* 2-tap horizontal then 2-tap vertical filtering; 8 rows per loop pass */
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter: splat halfword 0 of each filter to every lane */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+    /* the first source row only seeds the vertical pass; it yields no output */
+    src0 = LD_SB(src);
+    src += src_stride;
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+    /* height >> 3 passes: a height % 8 remainder is never written */
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+        /* hz_out0 and hz_out1 alternate as previous / current filtered row */
+        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+        /* NOTE(review): 7 is presumably the rounding shift / clamp width */
+        SRARI_H2_UH(tmp1, tmp2, 7);
+        SAT_UH2_UH(tmp1, tmp2, 7);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+        /* src4 is consumed before src1..src4 are reloaded with 4 new rows */
+        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+        LD_SB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp4 = __msa_dotp_u_h(vec0, filt_vt);
+
+        SRARI_H2_UH(tmp3, tmp4, 7);
+        SAT_UH2_UH(tmp3, tmp4, 7);
+        PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* second group of four output rows, same alternation as above */
+        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp5 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp6 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp7 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp8 = __msa_dotp_u_h(vec0, filt_vt);
+
+        SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7);
+        SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
+        PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+/* Bilinear H+V put for the 8-wide case: a height of 4 uses the 8x4 helper, any
+ * other height the 8-rows-per-pass helper.  mx - 1 / my - 1 pick the filters. */
+void ff_put_vp8_bilinear8_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                 uint8_t *src, ptrdiff_t src_stride,
+                                 int height, int mx, int my)
+{
+    const int8_t *hflt = bilinear_filters_msa[mx - 1];
+    const int8_t *vflt = bilinear_filters_msa[my - 1];
+    if (height != 4) {
+        common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
+                                      hflt, vflt, height);
+        return;
+    }
+    common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, hflt, vflt);
+}
+
+/* 16-wide bilinear H+V put.  Each row is loaded as two vectors, at byte
+ * offsets 0 ("a") and 8 ("b"), filtered horizontally and then combined with
+ * the filtered row before it.  Four rows are stored per pass (height >> 2
+ * passes); one extra source row is read up front to seed the vertical step. */
+void ff_put_vp8_bilinear16_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                                  uint8_t *src, ptrdiff_t src_stride,
+                                  int height, int mx, int my)
+{
+    uint32_t passes = height >> 2;
+    v16i8 r0a, r0b, r1a, r1b, r2a, r2b, r3a, r3b, mask;
+    v16u8 filt_hz, filt_vt, pair_a, pair_b;
+    v8u16 sum_a, sum_b, hz_a0, hz_a1, hz_b0, hz_b1;
+    v8i16 filt;
+    const int8_t *hflt = bilinear_filters_msa[mx - 1];
+    const int8_t *vflt = bilinear_filters_msa[my - 1];
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+    /* splat halfword 0 of each selected filter across a whole vector */
+    filt = LD_SH(hflt);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+    filt = LD_SH(vflt);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+    /* seed: the horizontally filtered first row is kept in slot 0 */
+    LD_SB2(src, 8, r0a, r0b);
+    src += src_stride;
+    hz_a0 = HORIZ_2TAP_FILT_UH(r0a, r0a, mask, filt_hz, 7);
+    hz_b0 = HORIZ_2TAP_FILT_UH(r0b, r0b, mask, filt_hz, 7);
+
+    while (passes--) {
+        LD_SB4(src, src_stride, r0a, r1a, r2a, r3a);
+        LD_SB4(src + 8, src_stride, r0b, r1b, r2b, r3b);
+        src += (4 * src_stride);
+        /* row 0 of the pass: slot 0 holds the older row, slot 1 the newer */
+        hz_a1 = HORIZ_2TAP_FILT_UH(r0a, r0a, mask, filt_hz, 7);
+        hz_b1 = HORIZ_2TAP_FILT_UH(r0b, r0b, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_a0, hz_a1, hz_b0, hz_b1, pair_a, pair_b);
+        DOTP_UB2_UH(pair_a, pair_b, filt_vt, filt_vt, sum_a, sum_b);
+        SRARI_H2_UH(sum_a, sum_b, 7);
+        SAT_UH2_UH(sum_a, sum_b, 7);
+        PCKEV_ST_SB(sum_a, sum_b, dst);
+        dst += dst_stride;
+        /* row 1: roles swap, slot 1 is now the older row */
+        hz_a0 = HORIZ_2TAP_FILT_UH(r1a, r1a, mask, filt_hz, 7);
+        hz_b0 = HORIZ_2TAP_FILT_UH(r1b, r1b, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_a1, hz_a0, hz_b1, hz_b0, pair_a, pair_b);
+        DOTP_UB2_UH(pair_a, pair_b, filt_vt, filt_vt, sum_a, sum_b);
+        SRARI_H2_UH(sum_a, sum_b, 7);
+        SAT_UH2_UH(sum_a, sum_b, 7);
+        PCKEV_ST_SB(sum_a, sum_b, dst);
+        dst += dst_stride;
+        /* row 2: same arrangement as row 0 */
+        hz_a1 = HORIZ_2TAP_FILT_UH(r2a, r2a, mask, filt_hz, 7);
+        hz_b1 = HORIZ_2TAP_FILT_UH(r2b, r2b, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_a0, hz_a1, hz_b0, hz_b1, pair_a, pair_b);
+        DOTP_UB2_UH(pair_a, pair_b, filt_vt, filt_vt, sum_a, sum_b);
+        SRARI_H2_UH(sum_a, sum_b, 7);
+        SAT_UH2_UH(sum_a, sum_b, 7);
+        PCKEV_ST_SB(sum_a, sum_b, dst);
+        dst += dst_stride;
+        /* row 3: same arrangement as row 1; slot 0 carries into the next pass */
+        hz_a0 = HORIZ_2TAP_FILT_UH(r3a, r3a, mask, filt_hz, 7);
+        hz_b0 = HORIZ_2TAP_FILT_UH(r3b, r3b, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_a1, hz_a0, hz_b1, hz_b0, pair_a, pair_b);
+        DOTP_UB2_UH(pair_a, pair_b, filt_vt, filt_vt, sum_a, sum_b);
+        SRARI_H2_UH(sum_a, sum_b, 7);
+        SAT_UH2_UH(sum_a, sum_b, 7);
+        PCKEV_ST_SB(sum_a, sum_b, dst);
+        dst += dst_stride;
+    }
+}
+
+/* Copy `height` rows of 8 bytes from src to dst (mx, my unused).  Heights that
+ * are not a multiple of 4 copy nothing; each row is loaded as a full vector. */
+void ff_put_vp8_pixels8_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                            uint8_t *src, ptrdiff_t src_stride,
+                            int height, int mx, int my)
+{
+    int32_t left;
+    uint64_t d0, d1, d2, d3, d4, d5, d6, d7;
+    v16u8 r0, r1, r2, r3, r4, r5, r6, r7;
+
+    if (height % 4)
+        return;
+    if (height % 8) {
+        for (left = (height / 4); left--;) {
+            LD_UB4(src, src_stride, r0, r1, r2, r3);
+            src += (4 * src_stride);
+            d0 = __msa_copy_u_d((v2i64) r0, 0);
+            d1 = __msa_copy_u_d((v2i64) r1, 0);
+            d2 = __msa_copy_u_d((v2i64) r2, 0);
+            d3 = __msa_copy_u_d((v2i64) r3, 0);
+            SD4(d0, d1, d2, d3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    } else {
+        for (left = height >> 3; left--;) {
+            LD_UB8(src, src_stride, r0, r1, r2, r3, r4, r5, r6, r7);
+            src += (8 * src_stride);
+            d0 = __msa_copy_u_d((v2i64) r0, 0);
+            d1 = __msa_copy_u_d((v2i64) r1, 0);
+            d2 = __msa_copy_u_d((v2i64) r2, 0);
+            d3 = __msa_copy_u_d((v2i64) r3, 0);
+            d4 = __msa_copy_u_d((v2i64) r4, 0);
+            d5 = __msa_copy_u_d((v2i64) r5, 0);
+            d6 = __msa_copy_u_d((v2i64) r6, 0);
+            d7 = __msa_copy_u_d((v2i64) r7, 0);
+            SD4(d0, d1, d2, d3, dst, dst_stride);
+            dst += (4 * dst_stride);
+            SD4(d4, d5, d6, d7, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+    }
+}
+
+/* Copy a region made of (width >> 4) 16-byte column strips, each handled as
+ * (height >> 3) groups of 8 rows.  Any width % 16 or height % 8 remainder is
+ * left uncopied. */
+static void copy_16multx8mult_msa(uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  int32_t height, int32_t width)
+{
+    int32_t strips, groups;
+    uint8_t *in, *out;
+    v16u8 r0, r1, r2, r3, r4, r5, r6, r7;
+
+    /* one strip per outer pass; src and dst step 16 bytes to the next strip */
+    for (strips = width >> 4; strips--; src += 16, dst += 16) {
+        in  = src;
+        out = dst;
+
+        groups = height >> 3;
+        while (groups--) {
+            LD_UB8(in, src_stride, r0, r1, r2, r3, r4, r5, r6, r7);
+            ST_UB8(r0, r1, r2, r3, r4, r5, r6, r7, out, dst_stride);
+
+            in  += 8 * src_stride;
+            out += 8 * dst_stride;
+        }
+    }
+}
+
+/* Copy `height` rows of 16 bytes (mx, my unused): multiples of 8 go through
+ * copy_16multx8mult_msa(), other multiples of 4 run here, the rest do nothing. */
+void ff_put_vp8_pixels16_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                             uint8_t *src, ptrdiff_t src_stride,
+                             int height, int mx, int my)
+{
+    int32_t quads;
+    v16u8 r0, r1, r2, r3;
+
+    if (height % 8 == 0) {
+        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+    } else if (height % 4 == 0) {
+        for (quads = height >> 2; quads--; dst += 4 * dst_stride) {
+            LD_UB4(src, src_stride, r0, r1, r2, r3);
+            src += 4 * src_stride;
+            ST_UB4(r0, r1, r2, r3, dst, dst_stride);
+        }
+    }
+}
diff --git a/libavcodec/mips/vp8dsp_init_mips.c b/libavcodec/mips/vp8dsp_init_mips.c
new file mode 100644
index 0000000..58d1b6c
--- /dev/null
+++ b/libavcodec/mips/vp8dsp_init_mips.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * VP8 compatible video decoder: MIPS (MSA) DSP function table setup
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/vp8dsp.h"
+#include "vp8dsp_mips.h"
+
+#define VP8_MC_MIPS_FUNC(IDX, SIZE) /* every [IDX] slot except [0][0] */ \
+    dsp->put_vp8_epel_pixels_tab[IDX][0][1] =  \
+        ff_put_vp8_epel##SIZE##_h4_msa;        \
+    dsp->put_vp8_epel_pixels_tab[IDX][0][2] =  \
+        ff_put_vp8_epel##SIZE##_h6_msa;        \
+    dsp->put_vp8_epel_pixels_tab[IDX][1][0] =  \
+        ff_put_vp8_epel##SIZE##_v4_msa;        \
+    dsp->put_vp8_epel_pixels_tab[IDX][1][1] =  \
+        ff_put_vp8_epel##SIZE##_h4v4_msa;      \
+    dsp->put_vp8_epel_pixels_tab[IDX][1][2] =  \
+        ff_put_vp8_epel##SIZE##_h6v4_msa;      \
+    dsp->put_vp8_epel_pixels_tab[IDX][2][0] =  \
+        ff_put_vp8_epel##SIZE##_v6_msa;        \
+    dsp->put_vp8_epel_pixels_tab[IDX][2][1] =  \
+        ff_put_vp8_epel##SIZE##_h4v6_msa;      \
+    dsp->put_vp8_epel_pixels_tab[IDX][2][2] =  \
+        ff_put_vp8_epel##SIZE##_h6v6_msa
+
+#define VP8_BILINEAR_MC_MIPS_FUNC(IDX, SIZE) /* idx 1 and 2 share one fn */ \
+    dsp->put_vp8_bilinear_pixels_tab[IDX][0][1] =  \
+        ff_put_vp8_bilinear##SIZE##_h_msa;         \
+    dsp->put_vp8_bilinear_pixels_tab[IDX][0][2] =  \
+        ff_put_vp8_bilinear##SIZE##_h_msa;         \
+    dsp->put_vp8_bilinear_pixels_tab[IDX][1][0] =  \
+        ff_put_vp8_bilinear##SIZE##_v_msa;         \
+    dsp->put_vp8_bilinear_pixels_tab[IDX][1][1] =  \
+        ff_put_vp8_bilinear##SIZE##_hv_msa;        \
+    dsp->put_vp8_bilinear_pixels_tab[IDX][1][2] =  \
+        ff_put_vp8_bilinear##SIZE##_hv_msa;        \
+    dsp->put_vp8_bilinear_pixels_tab[IDX][2][0] =  \
+        ff_put_vp8_bilinear##SIZE##_v_msa;         \
+    dsp->put_vp8_bilinear_pixels_tab[IDX][2][1] =  \
+        ff_put_vp8_bilinear##SIZE##_hv_msa;        \
+    dsp->put_vp8_bilinear_pixels_tab[IDX][2][2] =  \
+        ff_put_vp8_bilinear##SIZE##_hv_msa
+
+/* Install the plain copy routine in slot [IDX][0][0] of both MC tables.  The
+ * expansion has no trailing ';' (like the macros above): the caller adds it. */
+#define VP8_MC_MIPS_COPY(IDX, SIZE)                                          \
+    dsp->put_vp8_epel_pixels_tab[IDX][0][0] = ff_put_vp8_pixels##SIZE##_msa; \
+    dsp->put_vp8_bilinear_pixels_tab[IDX][0][0] = ff_put_vp8_pixels##SIZE##_msa
+
+#if HAVE_MSA
+/* Point the VP8 DSP function table at the MSA implementations. */
+static av_cold void vp8dsp_init_msa(VP8DSPContext *dsp)
+{
+    /* inverse transforms */
+    dsp->vp8_luma_dc_wht    = ff_vp8_luma_dc_wht_msa;
+    dsp->vp8_idct_add       = ff_vp8_idct_add_msa;
+    dsp->vp8_idct_dc_add    = ff_vp8_idct_dc_add_msa;
+    dsp->vp8_idct_dc_add4y  = ff_vp8_idct_dc_add4y_msa;
+    dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_msa;
+
+    /* MC tables, index 0/1/2 = 16/8/4 wide; slot [2][0][0] is not set here */
+    VP8_MC_MIPS_COPY(0, 16);
+    VP8_MC_MIPS_FUNC(0, 16);
+    VP8_BILINEAR_MC_MIPS_FUNC(0, 16);
+    VP8_MC_MIPS_COPY(1, 8);
+    VP8_MC_MIPS_FUNC(1, 8);
+    VP8_BILINEAR_MC_MIPS_FUNC(1, 8);
+    VP8_MC_MIPS_FUNC(2, 4);
+    VP8_BILINEAR_MC_MIPS_FUNC(2, 4);
+
+    /* loop filters: vertical entries first, then the horizontal ones */
+    dsp->vp8_v_loop_filter16y       = ff_vp8_v_loop_filter16_msa;
+    dsp->vp8_v_loop_filter8uv       = ff_vp8_v_loop_filter8uv_msa;
+    dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_msa;
+    dsp->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_msa;
+    dsp->vp8_v_loop_filter_simple   = ff_vp8_v_loop_filter_simple_msa;
+    dsp->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16_msa;
+    dsp->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_msa;
+    dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_msa;
+    dsp->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_msa;
+    dsp->vp8_h_loop_filter_simple   = ff_vp8_h_loop_filter_simple_msa;
+}
+#endif  // #if HAVE_MSA
+
+av_cold void ff_vp8dsp_init_mips(VP8DSPContext *dsp)
+{
+#if HAVE_MSA
+    vp8dsp_init_msa(dsp);  /* install the MSA routines set up above */
+#endif  // #if HAVE_MSA (otherwise dsp is left untouched)
+}
diff --git a/libavcodec/mips/vp8dsp_mips.h b/libavcodec/mips/vp8dsp_mips.h
new file mode 100644
index 0000000..8e715b5
--- /dev/null
+++ b/libavcodec/mips/vp8dsp_mips.h
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_VP8DSP_MIPS_H
+#define AVCODEC_MIPS_VP8DSP_MIPS_H
+
+void ff_put_vp8_pixels4_msa(uint8_t *dst, ptrdiff_t dststride,
+                            uint8_t *src, ptrdiff_t srcstride,
+                            int h, int x, int y);
+void ff_put_vp8_pixels8_msa(uint8_t *dst, ptrdiff_t dststride,
+                            uint8_t *src, ptrdiff_t srcstride,
+                            int h, int x, int y);
+void ff_put_vp8_pixels16_msa(uint8_t *dst, ptrdiff_t dststride,
+                             uint8_t *src, ptrdiff_t srcstride,
+                             int h, int x, int y);
+
+void ff_put_vp8_epel16_h4_msa(uint8_t *dst, ptrdiff_t dststride,
+                              uint8_t *src, ptrdiff_t srcstride,
+                              int h, int mx, int my);
+void ff_put_vp8_epel16_h6_msa(uint8_t *dst, ptrdiff_t dststride,
+                              uint8_t *src, ptrdiff_t srcstride,
+                              int h, int mx, int my);
+void ff_put_vp8_epel16_v4_msa(uint8_t *dst, ptrdiff_t dststride,
+                              uint8_t *src, ptrdiff_t srcstride,
+                              int h, int mx, int my);
+void ff_put_vp8_epel16_v6_msa(uint8_t *dst, ptrdiff_t dststride,
+                              uint8_t *src, ptrdiff_t srcstride,
+                              int h, int mx, int my);
+void ff_put_vp8_epel16_h4v4_msa(uint8_t *dst, ptrdiff_t dststride,
+                                uint8_t *src, ptrdiff_t srcstride,
+                                int h, int mx, int my);
+void ff_put_vp8_epel16_h6v4_msa(uint8_t *dst, ptrdiff_t dststride,
+                                uint8_t *src, ptrdiff_t srcstride,
+                                int h, int mx, int my);
+void ff_put_vp8_epel16_h4v6_msa(uint8_t *dst, ptrdiff_t dststride,
+                                uint8_t *src, ptrdiff_t srcstride,
+                                int h, int mx, int my);
+void ff_put_vp8_epel16_h6v6_msa(uint8_t *dst, ptrdiff_t dststride,
+                                uint8_t *src, ptrdiff_t srcstride,
+                                int h, int mx, int my);
+
+void ff_put_vp8_epel8_h4_msa(uint8_t *dst, ptrdiff_t dststride,
+                             uint8_t *src, ptrdiff_t srcstride,
+                             int h, int mx, int my);
+void ff_put_vp8_epel8_h6_msa(uint8_t *dst, ptrdiff_t dststride,
+                             uint8_t *src, ptrdiff_t srcstride,
+                             int h, int mx, int my);
+void ff_put_vp8_epel8_v4_msa(uint8_t *dst, ptrdiff_t dststride,
+                             uint8_t *src, ptrdiff_t srcstride,
+                             int h, int mx, int my);
+void ff_put_vp8_epel8_v6_msa(uint8_t *dst, ptrdiff_t dststride,
+                             uint8_t *src, ptrdiff_t srcstride,
+                             int h, int mx, int my);
+void ff_put_vp8_epel8_h4v4_msa(uint8_t *dst, ptrdiff_t dststride,
+                               uint8_t *src, ptrdiff_t srcstride,
+                               int h, int mx, int my);
+void ff_put_vp8_epel8_h6v4_msa(uint8_t *dst, ptrdiff_t dststride,
+                               uint8_t *src, ptrdiff_t srcstride,
+                               int h, int mx, int my);
+void ff_put_vp8_epel8_h4v6_msa(uint8_t *dst, ptrdiff_t dststride,
+                               uint8_t *src, ptrdiff_t srcstride,
+                               int h, int mx, int my);
+void ff_put_vp8_epel8_h6v6_msa(uint8_t *dst, ptrdiff_t dststride,
+                               uint8_t *src, ptrdiff_t srcstride,
+                               int h, int mx, int my);
+
+void ff_put_vp8_epel4_h4_msa(uint8_t *dst, ptrdiff_t dststride,
+                             uint8_t *src, ptrdiff_t srcstride,
+                             int h, int mx, int my);
+void ff_put_vp8_epel4_h6_msa(uint8_t *dst, ptrdiff_t dststride,
+                             uint8_t *src, ptrdiff_t srcstride,
+                             int h, int mx, int my);
+void ff_put_vp8_epel4_v4_msa(uint8_t *dst, ptrdiff_t dststride,
+                             uint8_t *src, ptrdiff_t srcstride,
+                             int h, int mx, int my);
+void ff_put_vp8_epel4_v6_msa(uint8_t *dst, ptrdiff_t dststride,
+                             uint8_t *src, ptrdiff_t srcstride,
+                             int h, int mx, int my);
+void ff_put_vp8_epel4_h4v4_msa(uint8_t *dst, ptrdiff_t dststride,
+                               uint8_t *src, ptrdiff_t srcstride,
+                               int h, int mx, int my);
+void ff_put_vp8_epel4_h6v4_msa(uint8_t *dst, ptrdiff_t dststride,
+                               uint8_t *src, ptrdiff_t srcstride,
+                               int h, int mx, int my);
+void ff_put_vp8_epel4_h4v6_msa(uint8_t *dst, ptrdiff_t dststride,
+                               uint8_t *src, ptrdiff_t srcstride,
+                               int h, int mx, int my);
+void ff_put_vp8_epel4_h6v6_msa(uint8_t *dst, ptrdiff_t dststride,
+                               uint8_t *src, ptrdiff_t srcstride,
+                               int h, int mx, int my);
+
+void ff_put_vp8_bilinear16_h_msa(uint8_t *dst, ptrdiff_t dststride,
+                                 uint8_t *src, ptrdiff_t srcstride,
+                                 int h, int mx, int my);
+void ff_put_vp8_bilinear16_v_msa(uint8_t *dst, ptrdiff_t dststride,
+                                 uint8_t *src, ptrdiff_t srcstride,
+                                 int h, int mx, int my);
+void ff_put_vp8_bilinear16_hv_msa(uint8_t *dst, ptrdiff_t dststride,
+                                  uint8_t *src, ptrdiff_t srcstride,
+                                  int h, int mx, int my);
+
+void ff_put_vp8_bilinear8_h_msa(uint8_t *dst, ptrdiff_t dststride,
+                                uint8_t *src, ptrdiff_t srcstride,
+                                int h, int mx, int my);
+void ff_put_vp8_bilinear8_v_msa(uint8_t *dst, ptrdiff_t dststride,
+                                uint8_t *src, ptrdiff_t srcstride,
+                                int h, int mx, int my);
+void ff_put_vp8_bilinear8_hv_msa(uint8_t *dst, ptrdiff_t dststride,
+                                 uint8_t *src, ptrdiff_t srcstride,
+                                 int h, int mx, int my);
+
+void ff_put_vp8_bilinear4_h_msa(uint8_t *dst, ptrdiff_t dststride,
+                                uint8_t *src, ptrdiff_t srcstride,
+                                int h, int mx, int my);
+void ff_put_vp8_bilinear4_v_msa(uint8_t *dst, ptrdiff_t dststride,
+                                uint8_t *src, ptrdiff_t srcstride,
+                                int h, int mx, int my);
+void ff_put_vp8_bilinear4_hv_msa(uint8_t *dst, ptrdiff_t dststride,
+                                 uint8_t *src, ptrdiff_t srcstride,
+                                 int h, int mx, int my);
+
+/* loop filters; e/i/h presumably mean flim_e/flim_i/hev_thresh (confirm) */
+void ff_vp8_h_loop_filter16_inner_msa(uint8_t *dst, ptrdiff_t stride,
+                                      int32_t e, int32_t i, int32_t h);
+void ff_vp8_v_loop_filter16_inner_msa(uint8_t *dst, ptrdiff_t stride,
+                                      int32_t e, int32_t i, int32_t h);
+void ff_vp8_h_loop_filter8uv_inner_msa(uint8_t *dst_u, uint8_t *dst_v,
+                                       ptrdiff_t stride,
+                                       int flim_e, int flim_i, int hev_thresh);
+void ff_vp8_v_loop_filter8uv_inner_msa(uint8_t *dst_u, uint8_t *dst_v,
+                                       ptrdiff_t stride,
+                                       int flim_e, int flim_i, int hev_thresh);
+void ff_vp8_h_loop_filter16_msa(uint8_t *dst, ptrdiff_t stride,
+                                int flim_e, int flim_i, int hev_thresh);
+void ff_vp8_v_loop_filter16_msa(uint8_t *dst, ptrdiff_t stride,
+                                int flim_e, int flim_i, int hev_thresh);
+void ff_vp8_h_loop_filter8uv_msa(uint8_t *dst_u, uint8_t *dst_v,
+                                 ptrdiff_t stride,
+                                 int flim_e, int flim_i, int hev_thresh);
+void ff_vp8_v_loop_filter8uv_msa(uint8_t *dst_u, uint8_t *dst_v,
+                                 ptrdiff_t stride,
+                                 int flim_e, int flim_i, int hev_thresh);
+void ff_vp8_h_loop_filter_simple_msa(uint8_t *dst, ptrdiff_t stride, int flim);
+void ff_vp8_v_loop_filter_simple_msa(uint8_t *dst, ptrdiff_t stride, int flim);
+
+/* IDCT and luma DC WHT functions */
+void ff_vp8_luma_dc_wht_msa(int16_t block[4][4][16], int16_t dc[16]);
+void ff_vp8_idct_add_msa(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
+void ff_vp8_idct_dc_add_msa(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
+void ff_vp8_idct_dc_add4uv_msa(uint8_t *dst, int16_t block[4][16],
+                               ptrdiff_t stride);
+void ff_vp8_idct_dc_add4y_msa(uint8_t *dst, int16_t block[4][16],
+                              ptrdiff_t stride);
+
+#endif  // #ifndef AVCODEC_MIPS_VP8DSP_MIPS_H
diff --git a/libavcodec/mips/vp9_idct_msa.c b/libavcodec/mips/vp9_idct_msa.c
new file mode 100644
index 0000000..25ea16c
--- /dev/null
+++ b/libavcodec/mips/vp9_idct_msa.c
@@ -0,0 +1,2138 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <string.h>
+#include "libavcodec/vp9dsp.h"
+#include "libavutil/mips/generic_macros_msa.h"
+#include "vp9dsp_mips.h"
+
+#define VP9_DCT_CONST_BITS   14  /* fractional bits of the fixed-point constants */
+#define ROUND_POWER_OF_TWO(value, n)  (((value) + (1 << ((n) - 1))) >> (n))  /* needs n >= 1 */
+
+static const int32_t cospi_1_64 = 16364;  /* cospi_k_64 = round(16384 * cos(k * pi / 64)) */
+static const int32_t cospi_2_64 = 16305;
+static const int32_t cospi_3_64 = 16207;
+static const int32_t cospi_4_64 = 16069;
+static const int32_t cospi_5_64 = 15893;
+static const int32_t cospi_6_64 = 15679;
+static const int32_t cospi_7_64 = 15426;
+static const int32_t cospi_8_64 = 15137;
+static const int32_t cospi_9_64 = 14811;
+static const int32_t cospi_10_64 = 14449;
+static const int32_t cospi_11_64 = 14053;
+static const int32_t cospi_12_64 = 13623;
+static const int32_t cospi_13_64 = 13160;
+static const int32_t cospi_14_64 = 12665;
+static const int32_t cospi_15_64 = 12140;
+static const int32_t cospi_16_64 = 11585;
+static const int32_t cospi_17_64 = 11003;
+static const int32_t cospi_18_64 = 10394;
+static const int32_t cospi_19_64 = 9760;
+static const int32_t cospi_20_64 = 9102;
+static const int32_t cospi_21_64 = 8423;
+static const int32_t cospi_22_64 = 7723;
+static const int32_t cospi_23_64 = 7005;
+static const int32_t cospi_24_64 = 6270;
+static const int32_t cospi_25_64 = 5520;
+static const int32_t cospi_26_64 = 4756;
+static const int32_t cospi_27_64 = 3981;
+static const int32_t cospi_28_64 = 3196;
+static const int32_t cospi_29_64 = 2404;
+static const int32_t cospi_30_64 = 1606;
+static const int32_t cospi_31_64 = 804;
+
+/* sinpi_k_9 = round(16384 * sqrt(2) * sin(k * pi / 9) * 2 / 3), k = 1..4 */
+static const int32_t sinpi_1_9 = 5283;
+static const int32_t sinpi_2_9 = 9929;
+static const int32_t sinpi_3_9 = 13377;
+static const int32_t sinpi_4_9 = 15212;
+
+#define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1)  \
+{                                                                  \
+    v8i16 k0_m = __msa_fill_h(cnst0);                              \
+    v4i32 s0_m, s1_m, s2_m, s3_m;                                  \
+    /* k0_m: cnst0 in even halfword lanes, cnst1 in odd lanes */   \
+    s0_m = (v4i32) __msa_fill_h(cnst1);                            \
+    k0_m = __msa_ilvev_h((v8i16) s0_m, k0_m);                      \
+    /* out0: reg0 * cnst0 - reg1 * cnst1 (assumed; confirm) */     \
+    ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m);                        \
+    ILVRL_H2_SW(reg0, reg1, s3_m, s2_m);                           \
+    DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m);               \
+    SRARI_W2_SW(s1_m, s0_m, VP9_DCT_CONST_BITS);                   \
+    out0 = __msa_pckev_h((v8i16) s0_m, (v8i16) s1_m);              \
+    /* out1: reg1 * cnst0 + reg0 * cnst1 (assumed; confirm) */     \
+    DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m);               \
+    SRARI_W2_SW(s1_m, s0_m, VP9_DCT_CONST_BITS);                   \
+    out1 = __msa_pckev_h((v8i16) s0_m, (v8i16) s1_m);              \
+}
+
+#define VP9_DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7,  \
+                                      dst0, dst1, dst2, dst3)              \
+{                                                                          \
+    v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m;                               \
+    v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m;                               \
+    /* Per the name: dot products, add/sub, rounding shift, pack. */       \
+    DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5,                    \
+                tp0_m, tp2_m, tp3_m, tp4_m);                               \
+    DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7,                    \
+                tp5_m, tp6_m, tp7_m, tp8_m);                               \
+    BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m);   \
+    BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m);   \
+    SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, VP9_DCT_CONST_BITS);           \
+    SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, VP9_DCT_CONST_BITS);           \
+    PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m,    \
+                dst0, dst1, dst2, dst3);                                   \
+}
+
+#define VP9_DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2)          \
+( {                                                       \
+    v8i16 dst_m;                                          \
+    v4i32 tp0_m, tp1_m;                                   \
+    /* dot in0 and in1 with in2, round by 14 bits */      \
+    DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m);        \
+    SRARI_W2_SW(tp1_m, tp0_m, VP9_DCT_CONST_BITS);        \
+    dst_m = __msa_pckev_h((v8i16) tp1_m, (v8i16) tp0_m);  \
+    /* statement expression: yields dst_m */              \
+    dst_m;                                                \
+} )
+
+#define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,                 \
+                  out0, out1, out2, out3, out4, out5, out6, out7)         \
+{                                                                         \
+    v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m;                    \
+    v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m;                     \
+    v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64,  \
+        cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 };             \
+    v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64,              \
+        -cospi_16_64, cospi_24_64, -cospi_24_64, 0, 0 };                  \
+    /* first stage on (in0, in7), (in4, in3); inputs are overwritten */   \
+    SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m);                       \
+    cnst2_m = -cnst0_m;                                                   \
+    ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m);    \
+    SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m);                       \
+    cnst4_m = -cnst2_m;                                                   \
+    ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m);    \
+                                                                          \
+    ILVRL_H2_SH(in0, in7, vec1_m, vec0_m);                                \
+    ILVRL_H2_SH(in4, in3, vec3_m, vec2_m);                                \
+    VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,    \
+                              cnst1_m, cnst2_m, cnst3_m, in7, in0,        \
+                              in4, in3);                                  \
+    /* same first stage on (in2, in5), (in6, in1) */                      \
+    SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m);                       \
+    cnst2_m = -cnst0_m;                                                   \
+    ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m);    \
+    SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m);                       \
+    cnst4_m = -cnst2_m;                                                   \
+    ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m);    \
+                                                                          \
+    ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                \
+    ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                \
+                                                                          \
+    VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,    \
+                              cnst1_m, cnst2_m, cnst3_m, in5, in2,        \
+                              in6, in1);                                  \
+    BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5);                \
+    out7 = -s0_m;                                                         \
+    out0 = s1_m;                                                          \
+    /* coeff1_m lanes 0, 4, 1, 5: +/-cospi_8_64, +/-cospi_24_64 */        \
+    SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5,                                    \
+                 cnst0_m, cnst1_m, cnst2_m, cnst3_m);                     \
+                                                                          \
+    ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m);    \
+    cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m);                            \
+    cnst1_m = cnst0_m;                                                    \
+                                                                          \
+    ILVRL_H2_SH(in4, in3, vec1_m, vec0_m);                                \
+    ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                \
+    VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,    \
+                              cnst2_m, cnst3_m, cnst1_m, out1, out6,      \
+                              s0_m, s1_m);                                \
+    /* coeff1_m lanes 2, 3: cospi_16_64 and -cospi_16_64 */               \
+    SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m);                       \
+    cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                            \
+                                                                          \
+    ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                \
+    ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m);                              \
+    out3 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);            \
+    out4 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m);            \
+    out2 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m);            \
+    out5 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m);            \
+    /* outputs 1, 3, 5 are negated; out7 was negated above */             \
+    out1 = -out1;                                                         \
+    out3 = -out3;                                                         \
+    out5 = -out5;                                                         \
+}
+
+#define VP9_MADD_SHORT(m0, m1, c0, c1, res0, res1)                        \
+{                                                                         \
+    v4i32 madd0_m, madd1_m, madd2_m, madd3_m;                             \
+    v8i16 madd_s0_m, madd_s1_m;                                           \
+    /* presumably res0 = dot with c0, res1 = dot with c1, rounded */      \
+    ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m);                            \
+    DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m,               \
+                c0, c0, c1, c1, madd0_m, madd1_m, madd2_m, madd3_m);      \
+    SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, VP9_DCT_CONST_BITS);  \
+    PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1);          \
+}
+
+#define VP9_MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3,       \
+                    out0, out1, out2, out3)                               \
+{                                                                         \
+    v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m;                     \
+    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m;                     \
+    /* out0/out1 use cst0, cst2; out2/out3 use cst1, cst3 */              \
+    ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m);                        \
+    ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m);                        \
+    DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m,               \
+                cst0, cst0, cst2, cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
+    BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m,                           \
+                m4_m, m5_m, tmp3_m, tmp2_m);                              \
+    SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS);          \
+    PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1);                  \
+    DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m,               \
+                cst1, cst1, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
+    BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m,                           \
+                m4_m, m5_m, tmp3_m, tmp2_m);                              \
+    SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS);          \
+    PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3);                  \
+}
+
+#define VP9_SET_COSPI_PAIR(c0_h, c1_h)   \
+( {                                      \
+    v8i16 out0_m, r0_m, r1_m;            \
+    /* even lanes c0_h, odd c1_h */      \
+    r0_m = __msa_fill_h(c0_h);           \
+    r1_m = __msa_fill_h(c1_h);           \
+    out0_m = __msa_ilvev_h(r1_m, r0_m);  \
+    /* yields the v8i16 pair */          \
+    out0_m;                              \
+} )
+
+#define VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3)  \
+{                                                                 \
+    uint8_t *dst_m = (uint8_t *) (dst);                           \
+    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                         \
+    v16i8 tmp0_m, tmp1_m;                                         \
+    v16i8 zero_m = { 0 };                                         \
+    v8i16 res0_m, res1_m, res2_m, res3_m;                         \
+    /* load 4 dst rows, add in0..in3, clip to 0..255, store */    \
+    LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m);    \
+    ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m,    \
+               zero_m, dst3_m, res0_m, res1_m, res2_m, res3_m);   \
+    ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3,      \
+         res0_m, res1_m, res2_m, res3_m);                         \
+    CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m);               \
+    PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m);  \
+    ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride);                  \
+}
+
+#define VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3)       \
+{                                                                     \
+    v8i16 c0_m, c1_m, c2_m, c3_m;                                     \
+    v8i16 step0_m, step1_m;                                           \
+    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                             \
+    /* in0 and in2 combined with +/-cospi_16_64 pairs */              \
+    c0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);              \
+    c1_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);             \
+    step0_m = __msa_ilvr_h(in2, in0);                                 \
+    DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m);        \
+    /* in1 and in3 combined with cospi_8_64 / cospi_24_64 pairs */    \
+    c2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);              \
+    c3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);               \
+    step1_m = __msa_ilvr_h(in3, in1);                                 \
+    DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m);        \
+    SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS);  \
+    /* pack to halfwords, then BUTTERFLY_4 forms the four outputs */  \
+    PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m);      \
+    SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8);                  \
+    BUTTERFLY_4((v8i16) tmp0_m, (v8i16) tmp1_m,                       \
+                (v8i16) tmp2_m, (v8i16) tmp3_m,                       \
+                out0, out1, out2, out3);                              \
+}
+
+#define VP9_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3)      \
+{                                                                     \
+    v8i16 res0_m, res1_m, c0_m, c1_m;                                 \
+    v8i16 k1_m, k2_m, k3_m, k4_m;                                     \
+    v8i16 zero_m = { 0 };                                             \
+    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                             \
+    v4i32 int0_m, int1_m, int2_m, int3_m;                             \
+    v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9,                 \
+        sinpi_4_9, -sinpi_1_9, -sinpi_2_9, -sinpi_3_9,                \
+        -sinpi_4_9 };                                                 \
+    /* mask_m lanes 0..3: sinpi_{1..4}_9; lanes 4..7: negated */      \
+    SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m);         \
+    ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m);                  \
+    ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m);                   \
+    DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m);          \
+    int0_m = tmp2_m + tmp1_m;                                         \
+                                                                      \
+    SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m);                           \
+    ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m);                  \
+    DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m);          \
+    int1_m = tmp0_m + tmp1_m;                                         \
+                                                                      \
+    c0_m = __msa_splati_h(mask_m, 6);                                 \
+    ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m);                 \
+    ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m);                   \
+    DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m);          \
+    int2_m = tmp0_m + tmp1_m;                                         \
+                                                                      \
+    c0_m = __msa_splati_h(mask_m, 6);                                 \
+    c0_m = __msa_ilvev_h(c0_m, k1_m);                                 \
+    /* int3_m reuses tmp2_m from the first dot product above */       \
+    res0_m = __msa_ilvr_h((in1), (in3));                              \
+    tmp0_m = __msa_dotp_s_w(res0_m, c0_m);                            \
+    int3_m = tmp2_m + tmp0_m;                                         \
+                                                                      \
+    res0_m = __msa_ilvr_h((in2), (in3));                              \
+    c1_m = __msa_ilvev_h(k4_m, k3_m);                                 \
+                                                                      \
+    tmp2_m = __msa_dotp_s_w(res0_m, c1_m);                            \
+    res1_m = __msa_ilvr_h((in0), (in2));                              \
+    c1_m = __msa_ilvev_h(k1_m, zero_m);                               \
+                                                                      \
+    tmp3_m = __msa_dotp_s_w(res1_m, c1_m);                            \
+    int3_m += tmp2_m;                                                 \
+    int3_m += tmp3_m;                                                 \
+    /* round by VP9_DCT_CONST_BITS and pack back to halfwords */      \
+    SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, VP9_DCT_CONST_BITS);  \
+    PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1);          \
+    PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3);          \
+}
+
+#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,          \
+                           out0, out1, out2, out3, out4, out5, out6, out7)  \
+{                                                                           \
+    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
+    v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n;                                   \
+    v8i16 zero_m = { 0 };                                                   \
+    /* interleave in0..in7 pairwise (ILVR: low halves only, presumably) */  \
+    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,                      \
+               tmp0_n, tmp1_n, tmp2_n, tmp3_n);                             \
+    ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m);                            \
+    ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m);                            \
+                                                                            \
+    out0 = (v8i16) __msa_ilvr_d((v2i64) tmp1_m, (v2i64) tmp0_m);            \
+    out1 = (v8i16) __msa_ilvl_d((v2i64) tmp1_m, (v2i64) tmp0_m);            \
+    out2 = (v8i16) __msa_ilvr_d((v2i64) tmp3_m, (v2i64) tmp2_m);            \
+    out3 = (v8i16) __msa_ilvl_d((v2i64) tmp3_m, (v2i64) tmp2_m);            \
+    /* out4..out7 are always set to zero */                                 \
+    out4 = zero_m;                                                          \
+    out5 = zero_m;                                                          \
+    out6 = zero_m;                                                          \
+    out7 = zero_m;                                                          \
+}
+
+static void vp9_idct4x4_1_add_msa(int16_t *input, uint8_t *dst,
+                                  int32_t dst_stride)
+{
+    /* only input[0] is read; each step is truncated back to int16_t */
+    int16_t dc = ROUND_POWER_OF_TWO(input[0] * cospi_16_64, VP9_DCT_CONST_BITS);
+    v8i16 dc_vec;
+
+    dc = ROUND_POWER_OF_TWO(dc * cospi_16_64, VP9_DCT_CONST_BITS);
+    dc = ROUND_POWER_OF_TWO(dc, 4);
+    /* replicate the scalar into every halfword lane */
+    dc_vec = __msa_fill_h(dc);
+    ADDBLK_ST4x4_UB(dc_vec, dc_vec, dc_vec, dc_vec, dst, dst_stride);
+}
+
+static void vp9_idct4x4_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                          int32_t dst_stride)
+{
+    v8i16 blk0, blk1, blk2, blk3;
+
+    /* fetch the 4x4 coefficient block into four vectors */
+    LD4x4_SH(input, blk0, blk1, blk2, blk3);
+    /* first pass: VP9_IDCT4x4 */
+    VP9_IDCT4x4(blk0, blk1, blk2, blk3, blk0, blk1, blk2, blk3);
+    /* transpose, then second pass: VP9_IDCT4x4 again */
+    TRANSPOSE4x4_SH_SH(blk0, blk1, blk2, blk3, blk0, blk1, blk2, blk3);
+    VP9_IDCT4x4(blk0, blk1, blk2, blk3, blk0, blk1, blk2, blk3);
+    /* round to nearest (add 8, shift right by 4) before ADDBLK_ST4x4_UB */
+    SRARI_H4_SH(blk0, blk1, blk2, blk3, 4);
+    ADDBLK_ST4x4_UB(blk0, blk1, blk2, blk3, dst, dst_stride);
+}
+
+static void vp9_iadst4x4_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                           int32_t dst_stride)
+{
+    v8i16 blk0, blk1, blk2, blk3;
+
+    /* fetch the 4x4 coefficient block into four vectors */
+    LD4x4_SH(input, blk0, blk1, blk2, blk3);
+    /* first pass: VP9_IADST4x4 */
+    VP9_IADST4x4(blk0, blk1, blk2, blk3, blk0, blk1, blk2, blk3);
+    /* transpose, then second pass: VP9_IADST4x4 again */
+    TRANSPOSE4x4_SH_SH(blk0, blk1, blk2, blk3, blk0, blk1, blk2, blk3);
+    VP9_IADST4x4(blk0, blk1, blk2, blk3, blk0, blk1, blk2, blk3);
+    /* round to nearest (add 8, shift right by 4) before ADDBLK_ST4x4_UB */
+    SRARI_H4_SH(blk0, blk1, blk2, blk3, 4);
+    ADDBLK_ST4x4_UB(blk0, blk1, blk2, blk3, dst, dst_stride);
+}
+
+static void vp9_iadst_idct_4x4_add_msa(int16_t *input, uint8_t *dst,
+                                       int32_t dst_stride, int32_t eob)
+{
+    v8i16 blk0, blk1, blk2, blk3;
+
+    /* fetch the 4x4 coefficient block; eob is not consulted here */
+    LD4x4_SH(input, blk0, blk1, blk2, blk3);
+    /* first pass: VP9_IADST4x4 */
+    VP9_IADST4x4(blk0, blk1, blk2, blk3, blk0, blk1, blk2, blk3);
+    /* transpose, then second pass: VP9_IDCT4x4 */
+    TRANSPOSE4x4_SH_SH(blk0, blk1, blk2, blk3, blk0, blk1, blk2, blk3);
+    VP9_IDCT4x4(blk0, blk1, blk2, blk3, blk0, blk1, blk2, blk3);
+    /* round to nearest (add 8, shift right by 4) before ADDBLK_ST4x4_UB */
+    SRARI_H4_SH(blk0, blk1, blk2, blk3, 4);
+    ADDBLK_ST4x4_UB(blk0, blk1, blk2, blk3, dst, dst_stride);
+}
+
+static void vp9_idct_iadst_4x4_add_msa(int16_t *input, uint8_t *dst,
+                                       int32_t dst_stride, int32_t eob)
+{
+    v8i16 blk0, blk1, blk2, blk3;
+
+    /* fetch the 4x4 coefficient block; eob is not consulted here */
+    LD4x4_SH(input, blk0, blk1, blk2, blk3);
+    /* first pass: VP9_IDCT4x4 */
+    VP9_IDCT4x4(blk0, blk1, blk2, blk3, blk0, blk1, blk2, blk3);
+    /* transpose, then second pass: VP9_IADST4x4 */
+    TRANSPOSE4x4_SH_SH(blk0, blk1, blk2, blk3, blk0, blk1, blk2, blk3);
+    VP9_IADST4x4(blk0, blk1, blk2, blk3, blk0, blk1, blk2, blk3);
+    /* round to nearest (add 8, shift right by 4) before ADDBLK_ST4x4_UB */
+    SRARI_H4_SH(blk0, blk1, blk2, blk3, 4);
+    ADDBLK_ST4x4_UB(blk0, blk1, blk2, blk3, dst, dst_stride);
+}
+
+#define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h)     \
+( {                                                    \
+    v8i16 c0_m, c1_m;                                  \
+    /* even lanes <- lane idx1_h, odd <- idx2_h */     \
+    SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m);  \
+    c0_m = __msa_ilvev_h(c1_m, c0_m);                  \
+    /* value of the statement expression */            \
+    c0_m;                                              \
+} )
+
+/* multiply and add: (inp0, inp1) use cst0/cst1, (inp2, inp3) use cst2/cst3 */
+#define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3,          \
+                 out0, out1, out2, out3)                                  \
+{                                                                         \
+    v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m;                     \
+    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                 \
+    /* out0, out1 from (inp0, inp1); out2, out3 from (inp2, inp3) */      \
+    ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m);                        \
+    ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m);                        \
+    DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m,               \
+                cst0, cst0, cst1, cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
+    SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS);      \
+    PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1);              \
+    DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m,               \
+                cst2, cst2, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
+    SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS);      \
+    PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3);              \
+}
+
+/* one-dimensional pass of the 8x8 IDCT; note that in0..in7 are overwritten */
+#define VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,                 \
+                       out0, out1, out2, out3, out4, out5, out6, out7)         \
+{                                                                              \
+    v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m;              \
+    v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m;              \
+    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                      \
+    v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64,        \
+       cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 };                 \
+    /* in1, in7, in3, in5 are processed first, then in0, in4, in2, in6 */      \
+    k0_m = VP9_SET_CONST_PAIR(mask_m, 0, 5);                                   \
+    k1_m = VP9_SET_CONST_PAIR(mask_m, 1, 0);                                   \
+    k2_m = VP9_SET_CONST_PAIR(mask_m, 6, 3);                                   \
+    k3_m = VP9_SET_CONST_PAIR(mask_m, 3, 2);                                   \
+    VP9_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5);  \
+    SUB2(in1, in3, in7, in5, res0_m, res1_m);                                  \
+    k0_m = VP9_SET_CONST_PAIR(mask_m, 4, 7);                                   \
+    k1_m = __msa_splati_h(mask_m, 4);                                          \
+                                                                               \
+    ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m);                               \
+    DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m,        \
+                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                               \
+    SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS);           \
+    tp4_m = in1 + in3;                                                         \
+    PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m);                 \
+    tp7_m = in7 + in5;                                                         \
+    k2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);                       \
+    k3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);                        \
+    VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m,                       \
+             in0, in4, in2, in6);                                              \
+    BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m);               \
+    BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m,        \
+                out0, out1, out2, out3, out4, out5, out6, out7);               \
+}
+
+#define VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,              \
+                        out0, out1, out2, out3, out4, out5, out6, out7)      \
+{                                                                            \
+    v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m;                    \
+    v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m;                                \
+    v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1;          \
+    v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64,                  \
+        cospi_10_64, cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 };  \
+    v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64,                \
+        cospi_6_64, -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 };    \
+    v8i16 mask3_m = { -cospi_24_64, cospi_8_64, cospi_16_64,                 \
+        -cospi_16_64, 0, 0, 0, 0 };                                          \
+                                                                             \
+    k0_m = VP9_SET_CONST_PAIR(mask1_m, 0, 1);                                \
+    k1_m = VP9_SET_CONST_PAIR(mask1_m, 1, 2);                                \
+    ILVRL_H2_SH(in1, in0, in_s1, in_s0);                                     \
+    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+                r0_m, r1_m, r2_m, r3_m);                                     \
+    k0_m = VP9_SET_CONST_PAIR(mask1_m, 6, 7);                                \
+    k1_m = VP9_SET_CONST_PAIR(mask2_m, 0, 1);                                \
+    ILVRL_H2_SH(in5, in4, in_s1, in_s0);                                     \
+    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+                r4_m, r5_m, r6_m, r7_m);                                     \
+    ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m,                     \
+         m0_m, m1_m, m2_m, m3_m);                                            \
+    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS);                 \
+    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m);                     \
+    SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m,                     \
+         m0_m, m1_m, m2_m, m3_m);                                            \
+    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS);                 \
+    PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m);                         \
+    k0_m = VP9_SET_CONST_PAIR(mask1_m, 3, 4);                                \
+    k1_m = VP9_SET_CONST_PAIR(mask1_m, 4, 5);                                \
+    ILVRL_H2_SH(in3, in2, in_s1, in_s0);                                     \
+    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+                r0_m, r1_m, r2_m, r3_m);                                     \
+    k0_m = VP9_SET_CONST_PAIR(mask2_m, 2, 3);                                \
+    k1_m = VP9_SET_CONST_PAIR(mask2_m, 3, 4);                                \
+    ILVRL_H2_SH(in7, in6, in_s1, in_s0);                                     \
+    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+                r4_m, r5_m, r6_m, r7_m);                                     \
+    ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m,                     \
+         m0_m, m1_m, m2_m, m3_m);                                            \
+    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS);                 \
+    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m);                     \
+    SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m,                     \
+         m0_m, m1_m, m2_m, m3_m);                                            \
+    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS);                 \
+    PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m);                         \
+    ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m);                                     \
+    BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3);        \
+    k0_m = VP9_SET_CONST_PAIR(mask2_m, 5, 6);                                \
+    k1_m = VP9_SET_CONST_PAIR(mask2_m, 6, 7);                                \
+    ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0);                                   \
+    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+                r0_m, r1_m, r2_m, r3_m);                                     \
+    k1_m = VP9_SET_CONST_PAIR(mask3_m, 0, 1);                                \
+    DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m,              \
+                r4_m, r5_m, r6_m, r7_m);                                     \
+    ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m,                     \
+         m0_m, m1_m, m2_m, m3_m);                                            \
+    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS);                 \
+    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6);                          \
+    SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m,                     \
+         m0_m, m1_m, m2_m, m3_m);                                            \
+    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS);                 \
+    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5);                           \
+    k0_m = VP9_SET_CONST_PAIR(mask3_m, 2, 2);                                \
+    k1_m = VP9_SET_CONST_PAIR(mask3_m, 2, 3);                                \
+    ILVRL_H2_SH(in4, in3, in_s1, in_s0);                                     \
+    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+                m0_m, m1_m, m2_m, m3_m);                                     \
+    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS);                 \
+    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4);                          \
+    ILVRL_H2_SW(in5, in2, m2_m, m3_m);                                       \
+    DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m,              \
+                m0_m, m1_m, m2_m, m3_m);                                     \
+    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, VP9_DCT_CONST_BITS);                 \
+    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5);                          \
+                                                                             \
+    out1 = -in1;                                                             \
+    out3 = -in3;                                                             \
+    out5 = -in5;                                                             \
+    out7 = -in7;                                                             \
+}
+
+/* DC-only 8x8 inverse transform: input[0] is scaled by cospi_16_64 twice,
+ * rounded by 5 bits, splatted and added to dst in two 8x4 add/store calls. */
+static void vp9_idct8x8_1_add_msa(int16_t *input, uint8_t *dst,
+                                  int32_t dst_stride)
+{
+    int16_t dc;
+    v8i16 dc_vec;
+
+    dc = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), VP9_DCT_CONST_BITS);
+    dc = ROUND_POWER_OF_TWO((dc * cospi_16_64), VP9_DCT_CONST_BITS);
+    dc_vec = __msa_fill_h(ROUND_POWER_OF_TWO(dc, 5));
+
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, dc_vec, dc_vec, dc_vec, dc_vec);
+    dst += 4 * dst_stride;
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, dc_vec, dc_vec, dc_vec, dc_vec);
+}
+
+static void vp9_idct8x8_12_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 s0, s1, s2, s3, s4, s5, s6, s7, k0, k1, k2, k3, m0, m1, m2, m3;
+    v4i32 tmp0, tmp1, tmp2, tmp3;
+    v8i16 zero = { 0 };
+    /* 8x8 inverse DCT added to dst; pass 1 runs on four merged vectors */
+    /* load eight vectors (stride 8) and combine them into in0..in3 */
+    LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
+    ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
+    /* NOTE(review): a TRANSPOSE8X4_SH_SH of in0..in3 was disabled here */
+
+    /* stage1: cospi 28/4 and 20/12 dot products, rounding, butterfly */
+    ILVL_H2_SH(in3, in0, in2, in1, s0, s1);
+    k0 = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
+    k1 = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
+    k2 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
+    k3 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
+    DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3);
+    SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, VP9_DCT_CONST_BITS);
+    PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1);
+    PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3);
+    BUTTERFLY_4(s0, s1, s3, s2, s4, s7, s6, s5);
+
+    /* stage2: cospi 16/16 and 24/8 dot products, rounding, butterfly */
+    ILVR_H2_SH(in3, in1, in2, in0, s1, s0);
+    k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
+    k1 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
+    k2 = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
+    k3 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
+    DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3);
+    SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, VP9_DCT_CONST_BITS);
+    PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1);
+    PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3);
+    BUTTERFLY_4(s0, s1, s2, s3, m0, m1, m2, m3);
+
+    /* stage3: s5/s6 times +/-cospi_16_64 (k0 is still the stage2 pair) */
+    s0 = __msa_ilvr_h(s6, s5);
+
+    k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
+    DOTP_SH2_SW(s0, s0, k1, k0, tmp0, tmp1);
+    SRARI_W2_SW(tmp0, tmp1, VP9_DCT_CONST_BITS);
+    PCKEV_H2_SH(zero, tmp0, zero, tmp1, s2, s3);
+
+    /* stage4: final butterfly, transpose, then the generic 1-D pass */
+    BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+    TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+    VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                   in0, in1, in2, in3, in4, in5, in6, in7);
+
+    /* final rounding (add 2^4, divide by 2^5) and shift */
+    SRARI_H4_SH(in0, in1, in2, in3, 5);
+    SRARI_H4_SH(in4, in5, in6, in7, 5);
+
+    /* add block and store 8x8 */
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
+    dst += (4 * dst_stride);
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
+}
+
+/* Full 8x8 inverse DCT: a 1-D pass, a transpose and a second 1-D pass; the
+ * result is rounded by 5 bits and added to the block at dst. */
+static void vp9_idct8x8_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                          int32_t dst_stride)
+{
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+    /* fetch eight coefficient vectors from input with stride 8 */
+    LD_SH8(input, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    /* first pass, transpose, second pass (all in place) */
+    VP9_IDCT8x8_1D(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
+                   vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    TRANSPOSE8x8_SH_SH(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
+                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    VP9_IDCT8x8_1D(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
+                   vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    /* rounding shift: add 2^4, then divide by 2^5 */
+    SRARI_H4_SH(vec0, vec1, vec2, vec3, 5);
+    SRARI_H4_SH(vec4, vec5, vec6, vec7, 5);
+    /* accumulate into dst with two 8x4 add/store calls */
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec0, vec1, vec2, vec3);
+    dst += 4 * dst_stride;
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec4, vec5, vec6, vec7);
+}
+
+static void vp9_iadst8x8_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                           int32_t dst_stride)
+{
+    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+    v8i16 cnst0, cnst1, cnst2, cnst3, cnst4;
+    v8i16 temp0, temp1, temp2, temp3, s0, s1;
+    v16i8 zero = { 0 };
+    /* 8x8 inverse ADST in both directions, added to dst with clipping */
+    /* load vector elements of 8x8 block */
+    LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+
+    /* first 1-D pass through the VP9_ADST8 macro */
+    VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+              in0, in1, in2, in3, in4, in5, in6, in7);
+
+    /* columns transform */
+    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+    /* second pass is written out; output rows are stored in pairs */
+    cnst0 = __msa_fill_h(cospi_2_64);
+    cnst1 = __msa_fill_h(cospi_30_64);
+    cnst2 = -cnst0;
+    ILVEV_H2_SH(cnst0, cnst1, cnst1, cnst2, cnst0, cnst1);
+    cnst2 = __msa_fill_h(cospi_18_64);
+    cnst3 = __msa_fill_h(cospi_14_64);
+    cnst4 = -cnst2;
+    ILVEV_H2_SH(cnst2, cnst3, cnst3, cnst4, cnst2, cnst3);
+
+    ILVRL_H2_SH(in0, in7, temp1, temp0);
+    ILVRL_H2_SH(in4, in3, temp3, temp2);
+    VP9_DOT_ADD_SUB_SRARI_PCK(temp0, temp1, temp2, temp3, cnst0, cnst1, cnst2,
+                              cnst3, in7, in0, in4, in3);
+
+    cnst0 = __msa_fill_h(cospi_10_64);
+    cnst1 = __msa_fill_h(cospi_22_64);
+    cnst2 = -cnst0;
+    ILVEV_H2_SH(cnst0, cnst1, cnst1, cnst2, cnst0, cnst1);
+    cnst2 = __msa_fill_h(cospi_26_64);
+    cnst3 = __msa_fill_h(cospi_6_64);
+    cnst4 = -cnst2;
+    ILVEV_H2_SH(cnst2, cnst3, cnst3, cnst4, cnst2, cnst3);
+
+    ILVRL_H2_SH(in2, in5, temp1, temp0);
+    ILVRL_H2_SH(in6, in1, temp3, temp2);
+    VP9_DOT_ADD_SUB_SRARI_PCK(temp0, temp1, temp2, temp3, cnst0, cnst1, cnst2,
+                              cnst3, in5, in2, in6, in1);
+    BUTTERFLY_4(in7, in0, in2, in5, s1, s0, in2, in5);
+    out7 = -s0;
+    out0 = s1;
+    SRARI_H2_SH(out0, out7, 5);
+    dst0 = LD_UB(dst + 0 * dst_stride);
+    dst7 = LD_UB(dst + 7 * dst_stride);
+    /* rows 0 and 7: interleave dst with zero, add, clip, repack, store */
+    res0 = (v8i16) __msa_ilvr_b(zero, (v16i8) dst0);
+    res0 += out0;
+    res0 = CLIP_SH_0_255(res0);
+    res0 = (v8i16) __msa_pckev_b((v16i8) res0, (v16i8) res0);
+    ST8x1_UB(res0, dst);
+
+    res7 = (v8i16) __msa_ilvr_b(zero, (v16i8) dst7);
+    res7 += out7;
+    res7 = CLIP_SH_0_255(res7);
+    res7 = (v8i16) __msa_pckev_b((v16i8) res7, (v16i8) res7);
+    ST8x1_UB(res7, dst + 7 * dst_stride);
+    /* rows 1 and 6: cospi 8/24 products, out1 negated before rounding */
+    cnst1 = __msa_fill_h(cospi_24_64);
+    cnst0 = __msa_fill_h(cospi_8_64);
+    cnst3 = -cnst1;
+    cnst2 = -cnst0;
+
+    ILVEV_H2_SH(cnst3, cnst0, cnst1, cnst2, cnst3, cnst2);
+    cnst0 = __msa_ilvev_h(cnst1, cnst0);
+    cnst1 = cnst0;
+
+    ILVRL_H2_SH(in4, in3, temp1, temp0);
+    ILVRL_H2_SH(in6, in1, temp3, temp2);
+    VP9_DOT_ADD_SUB_SRARI_PCK(temp0, temp1, temp2, temp3, cnst0, cnst2, cnst3,
+                              cnst1, out1, out6, s0, s1);
+    out1 = -out1;
+    SRARI_H2_SH(out1, out6, 5);
+    dst1 = LD_UB(dst + 1 * dst_stride);
+    dst6 = LD_UB(dst + 6 * dst_stride);
+    ILVR_B2_SH(zero, dst1, zero, dst6, res1, res6);
+    ADD2(res1, out1, res6, out6, res1, res6);
+    CLIP_SH2_0_255(res1, res6);
+    PCKEV_B2_SH(res1, res1, res6, res6, res1, res6);
+    ST8x1_UB(res1, dst + dst_stride);
+    ST8x1_UB(res6, dst + 6 * dst_stride);
+    /* rows 3/4 and 2/5 both use the cospi_16_64 pairs built here */
+    cnst0 = __msa_fill_h(cospi_16_64);
+    cnst1 = -cnst0;
+    cnst1 = __msa_ilvev_h(cnst1, cnst0);
+
+    ILVRL_H2_SH(in2, in5, temp1, temp0);
+    ILVRL_H2_SH(s0, s1, temp3, temp2);
+    out3 = VP9_DOT_SHIFT_RIGHT_PCK_H(temp0, temp1, cnst0);
+    out4 = VP9_DOT_SHIFT_RIGHT_PCK_H(temp0, temp1, cnst1);
+    out3 = -out3;
+    SRARI_H2_SH(out3, out4, 5);
+    dst3 = LD_UB(dst + 3 * dst_stride);
+    dst4 = LD_UB(dst + 4 * dst_stride);
+    ILVR_B2_SH(zero, dst3, zero, dst4, res3, res4);
+    ADD2(res3, out3, res4, out4, res3, res4);
+    CLIP_SH2_0_255(res3, res4);
+    PCKEV_B2_SH(res3, res3, res4, res4, res3, res4);
+    ST8x1_UB(res3, dst + 3 * dst_stride);
+    ST8x1_UB(res4, dst + 4 * dst_stride);
+    /* rows 2 and 5: same constants applied to temp2/temp3 (from s0/s1) */
+    out2 = VP9_DOT_SHIFT_RIGHT_PCK_H(temp2, temp3, cnst0);
+    out5 = VP9_DOT_SHIFT_RIGHT_PCK_H(temp2, temp3, cnst1);
+    out5 = -out5;
+    SRARI_H2_SH(out2, out5, 5);
+    dst2 = LD_UB(dst + 2 * dst_stride);
+    dst5 = LD_UB(dst + 5 * dst_stride);
+    ILVR_B2_SH(zero, dst2, zero, dst5, res2, res5);
+    ADD2(res2, out2, res5, out5, res2, res5);
+    CLIP_SH2_0_255(res2, res5);
+    PCKEV_B2_SH(res2, res2, res5, res5, res2, res5);
+    ST8x1_UB(res2, dst + 2 * dst_stride);
+    ST8x1_UB(res5, dst + 5 * dst_stride);
+}
+
+/* 8x8 hybrid inverse transform: VP9_IADST8x8_1D first, VP9_IDCT8x8_1D after
+ * the transpose; the rounded result is added to dst. eob is not used. */
+static void vp9_iadst_idct_8x8_add_msa(int16_t *input, uint8_t *dst,
+                                       int32_t dst_stride, int32_t eob)
+{
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+    /* loads land in permuted registers (1, 6, 3, 4, 5, 2, 7, 0) */
+    LD_SH8(input, 8, vec1, vec6, vec3, vec4, vec5, vec2, vec7, vec0);
+    VP9_IADST8x8_1D(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
+                    vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    /* transpose in place, then the 1-D inverse DCT pass */
+    TRANSPOSE8x8_SH_SH(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
+                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    VP9_IDCT8x8_1D(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
+                   vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    /* rounding shift: add 2^4, then divide by 2^5 */
+    SRARI_H4_SH(vec0, vec1, vec2, vec3, 5);
+    SRARI_H4_SH(vec4, vec5, vec6, vec7, 5);
+    /* accumulate into dst with two 8x4 add/store calls */
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec0, vec1, vec2, vec3);
+    dst += 4 * dst_stride;
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec4, vec5, vec6, vec7);
+}
+
+/* 8x8 hybrid inverse transform: VP9_IDCT8x8_1D first, VP9_IADST8x8_1D after
+ * the transpose; the rounded result is added to dst. eob is not used. */
+static void vp9_idct_iadst_8x8_add_msa(int16_t *input, uint8_t *dst,
+                                       int32_t dst_stride, int32_t eob)
+{
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+    /* fetch eight coefficient vectors from input with stride 8 */
+    LD_SH8(input, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    VP9_IDCT8x8_1D(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
+                   vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    /* the transpose writes into permuted registers (1, 6, 3, 4, 5, 2, 7, 0) */
+    TRANSPOSE8x8_SH_SH(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
+                       vec1, vec6, vec3, vec4, vec5, vec2, vec7, vec0);
+    VP9_IADST8x8_1D(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
+                    vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    /* rounding shift: add 2^4, then divide by 2^5 */
+    SRARI_H4_SH(vec0, vec1, vec2, vec3, 5);
+    SRARI_H4_SH(vec4, vec5, vec6, vec7, 5);
+    /* accumulate into dst with two 8x4 add/store calls */
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec0, vec1, vec2, vec3);
+    dst += 4 * dst_stride;
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, vec4, vec5, vec6, vec7);
+}
+
+#define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8,          \
+                         r9, r10, r11, r12, r13, r14, r15,            \
+                         out0, out1, out2, out3, out4, out5,          \
+                         out6, out7, out8, out9, out10, out11,        \
+                         out12, out13, out14, out15)                  \
+{                                                                     \
+    v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m;             \
+    v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m;       \
+    v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m;             \
+    v8i16 h8_m, h9_m, h10_m, h11_m;                                   \
+    v8i16 k0_m, k1_m, k2_m, k3_m;                                     \
+    /* outputs are not in natural order; caller permutes/negates */   \
+    /* stage 1: odd cospi pairs (1/31 ... 29/3) over all 16 inputs */ \
+    k0_m = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64);               \
+    k1_m = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64);              \
+    k2_m = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64);              \
+    k3_m = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64);             \
+    VP9_MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m,              \
+                g0_m, g1_m, g2_m, g3_m);                              \
+    k0_m = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64);               \
+    k1_m = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64);              \
+    k2_m = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64);              \
+    k3_m = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64);             \
+    VP9_MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m,             \
+                g4_m, g5_m, g6_m, g7_m);                              \
+    k0_m = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64);               \
+    k1_m = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64);              \
+    k2_m = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64);               \
+    k3_m = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64);              \
+    VP9_MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m,             \
+                g8_m, g9_m, g10_m, g11_m);                            \
+    k0_m = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64);              \
+    k1_m = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64);             \
+    k2_m = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64);               \
+    k3_m = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64);              \
+    VP9_MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m,              \
+                g12_m, g13_m, g14_m, g15_m);                          \
+                                                                      \
+    /* stage 2: rotate odd g terms, butterfly the even g terms */     \
+    k0_m = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);               \
+    k1_m = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);              \
+    k2_m = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64);              \
+    VP9_MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m,      \
+                h0_m, h1_m, h2_m, h3_m);                              \
+    k0_m = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);              \
+    k1_m = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);             \
+    k2_m = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64);             \
+    VP9_MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m,     \
+                h4_m, h5_m, h6_m, h7_m);                              \
+    BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10);    \
+    BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m,    \
+                h8_m, h9_m, h10_m, h11_m, h6_m, h4_m, h2_m, h0_m);    \
+                                                                      \
+    /* stage 3: cospi 8/24 rotations; out0 and out1 are final here */ \
+    BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m);  \
+    k0_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);               \
+    k1_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);              \
+    k2_m = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64);              \
+    VP9_MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m,       \
+                out4, out6, out5, out7);                              \
+    VP9_MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m,       \
+                out12, out14, out13, out15);                          \
+                                                                      \
+    /* stage 4: cospi_16_64 products finish the remaining outputs */  \
+    k0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);              \
+    k1_m = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64);            \
+    k2_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);             \
+    k3_m = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);             \
+    VP9_MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3);             \
+    VP9_MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7);               \
+    VP9_MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11);           \
+    VP9_MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15);           \
+}
+
+static void vp9_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    v8i16 loc0, loc1, loc2, loc3;
+    v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
+    v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15;
+    v8i16 tmp5, tmp6, tmp7;
+    /* 16-point 1-D inverse DCT; result is rounded by 6 and added to dst */
+    /* load the first eight vectors (input stride 16) */
+    LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+    input += 8 * 16;
+    /* load the remaining eight vectors */
+    LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
+    /* even-numbered registers (reg0, reg2, ..., reg14) */
+    VP9_DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
+    VP9_DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
+    BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
+    VP9_DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
+    VP9_DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
+    VP9_DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
+    BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);
+
+    reg0 = reg2 - loc1;
+    reg2 = reg2 + loc1;
+    reg12 = reg14 - loc0;
+    reg14 = reg14 + loc0;
+    reg4 = reg6 - loc3;
+    reg6 = reg6 + loc3;
+    reg8 = reg10 - loc2;
+    reg10 = reg10 + loc2;
+
+    /* stage 2: odd-numbered registers (reg1, reg3, ..., reg15) */
+    VP9_DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
+    VP9_DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
+
+    reg9 = reg1 - loc2;
+    reg1 = reg1 + loc2;
+    reg7 = reg15 - loc3;
+    reg15 = reg15 + loc3;
+
+    VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
+    VP9_DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
+    BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);
+
+    loc1 = reg15 + reg3;
+    reg3 = reg15 - reg3;
+    loc2 = reg2 + loc1;
+    reg15 = reg2 - loc1;
+
+    loc1 = reg1 + reg13;
+    reg13 = reg1 - reg13;
+    loc0 = reg0 + loc1;
+    loc1 = reg0 - loc1;
+    tmp6 = loc0;
+    tmp7 = loc1;
+    reg0 = loc2;
+
+    VP9_DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
+    VP9_DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5,
+                        reg11);
+
+    loc0 = reg9 + reg5;
+    reg5 = reg9 - reg5;
+    reg2 = reg6 + loc0;
+    reg1 = reg6 - loc0;
+
+    loc0 = reg7 + reg11;
+    reg11 = reg7 - reg11;
+    loc1 = reg4 + loc0;
+    loc2 = reg4 - loc0;
+    tmp5 = loc1;
+
+    VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
+    BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);
+
+    reg10 = loc0;
+    reg11 = loc1;
+
+    VP9_DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
+    BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);
+    reg13 = loc2;
+
+    /* restore the values parked in tmp5..tmp7 (no transpose is done here) */
+    reg12 = tmp5;
+    reg14 = tmp6;
+    reg3 = tmp7;
+    /* round by 6 bits and add to dst, four vectors per call */
+    SRARI_H4_SH(reg0, reg2, reg4, reg6, 6);
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg0, reg2, reg4, reg6);
+    dst += (4 * dst_stride);
+    SRARI_H4_SH(reg8, reg10, reg12, reg14, 6);
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14);
+    dst += (4 * dst_stride);
+    SRARI_H4_SH(reg3, reg13, reg11, reg5, 6);
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5);
+    dst += (4 * dst_stride);
+    SRARI_H4_SH(reg7, reg9, reg1, reg15, 6);
+    VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15);
+}
+
+static void vp9_idct16_1d_columns_msa(int16_t *input, int16_t *output)
+{
+    v8i16 loc0, loc1, loc2, loc3;
+    v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
+    v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15;
+    v8i16 tmp5, tmp6, tmp7;
+    /* 16-point 1-D inverse DCT; result is transposed and written to output */
+    /* load the first eight vectors (input stride 16) */
+    LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+    input += 8 * 16;
+    /* load the remaining eight vectors */
+    LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
+    /* even-numbered registers (reg0, reg2, ..., reg14) */
+    VP9_DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
+    VP9_DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
+    BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
+    VP9_DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
+    VP9_DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
+    VP9_DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
+    BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);
+
+    reg0 = reg2 - loc1;
+    reg2 = reg2 + loc1;
+    reg12 = reg14 - loc0;
+    reg14 = reg14 + loc0;
+    reg4 = reg6 - loc3;
+    reg6 = reg6 + loc3;
+    reg8 = reg10 - loc2;
+    reg10 = reg10 + loc2;
+
+    /* stage 2: odd-numbered registers (reg1, reg3, ..., reg15) */
+    VP9_DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
+    VP9_DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
+
+    reg9 = reg1 - loc2;
+    reg1 = reg1 + loc2;
+    reg7 = reg15 - loc3;
+    reg15 = reg15 + loc3;
+
+    VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
+    VP9_DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
+    BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);
+
+    loc1 = reg15 + reg3;
+    reg3 = reg15 - reg3;
+    loc2 = reg2 + loc1;
+    reg15 = reg2 - loc1;
+
+    loc1 = reg1 + reg13;
+    reg13 = reg1 - reg13;
+    loc0 = reg0 + loc1;
+    loc1 = reg0 - loc1;
+    tmp6 = loc0;
+    tmp7 = loc1;
+    reg0 = loc2;
+
+    VP9_DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
+    VP9_DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5,
+                        reg11);
+
+    loc0 = reg9 + reg5;
+    reg5 = reg9 - reg5;
+    reg2 = reg6 + loc0;
+    reg1 = reg6 - loc0;
+
+    loc0 = reg7 + reg11;
+    reg11 = reg7 - reg11;
+    loc1 = reg4 + loc0;
+    loc2 = reg4 - loc0;
+
+    tmp5 = loc1;
+
+    VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
+    BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);
+
+    reg10 = loc0;
+    reg11 = loc1;
+
+    VP9_DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
+    BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);
+    reg13 = loc2;
+
+    /* restore the values parked in tmp5..tmp7 before transposing */
+    reg12 = tmp5;
+    reg14 = tmp6;
+    reg3 = tmp7;
+
+    /* first eight results: transposed, stored at output with stride 16 */
+    TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14,
+                       reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14);
+    ST_SH4(reg0, reg2, reg4, reg6, output, 16);
+    ST_SH4(reg8, reg10, reg12, reg14, (output + 4 * 16), 16);
+
+    /* last eight results: transposed, stored 8 elements further along */
+    TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15,
+                       reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15);
+    ST_SH4(reg3, reg13, reg11, reg5, (output + 8), 16);
+    ST_SH4(reg7, reg9, reg1, reg15, (output + 8 + 4 * 16), 16);
+}
+
+/* DC-only 16x16 inverse transform: one constant derived from input[0] is
+ * added, with clipping to 0..255, to each 16-byte row loaded from dst. */
+static void vp9_idct16x16_1_add_msa(int16_t *input, uint8_t *dst,
+                                    int32_t dst_stride)
+{
+    uint8_t row_group;
+    int16_t dc;
+    v8i16 dc_vec, lo0, lo1, lo2, lo3, hi0, hi1, hi2, hi3;
+    v16u8 pix0, pix1, pix2, pix3, sum0, sum1, sum2, sum3;
+
+    dc = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), VP9_DCT_CONST_BITS);
+    dc = ROUND_POWER_OF_TWO((dc * cospi_16_64), VP9_DCT_CONST_BITS);
+    dc = ROUND_POWER_OF_TWO(dc, 6);
+    dc_vec = __msa_fill_h(dc);
+
+    for (row_group = 0; row_group < 4; row_group++) {
+        LD_UB4(dst, dst_stride, pix0, pix1, pix2, pix3);
+        UNPCK_UB_SH(pix0, lo0, hi0);
+        UNPCK_UB_SH(pix1, lo1, hi1);
+        UNPCK_UB_SH(pix2, lo2, hi2);
+        UNPCK_UB_SH(pix3, lo3, hi3);
+        ADD4(lo0, dc_vec, lo1, dc_vec, lo2, dc_vec, lo3, dc_vec,
+             lo0, lo1, lo2, lo3);
+        ADD4(hi0, dc_vec, hi1, dc_vec, hi2, dc_vec, hi3, dc_vec,
+             hi0, hi1, hi2, hi3);
+        CLIP_SH4_0_255(lo0, lo1, lo2, lo3);
+        CLIP_SH4_0_255(hi0, hi1, hi2, hi3);
+        PCKEV_B4_UB(hi0, lo0, hi1, lo1, hi2, lo2, hi3, lo3,
+                    sum0, sum1, sum2, sum3);
+        ST_UB4(sum0, sum1, sum2, sum3, dst, dst_stride);
+        dst += 4 * dst_stride;
+    }
+}
+
+/* 16x16 inverse DCT + add, short variant: a single row pass over the first
+ * 8 input columns, of which only the top 4 output rows are kept. */
+static void vp9_idct16x16_10_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                               int32_t dst_stride)
+{
+    int32_t i;
+    int16_t out_arr[16 * 16] ALLOC_ALIGNED(ALIGNMENT);
+    int16_t *out = out_arr;
+
+    /* transform rows */
+    vp9_idct16_1d_columns_msa(input, out);
+    /* zero rows 4..15 (eight 32-bit stores = one row of 16 int16_t) */
+    out += 4 * 16;
+    for (i = 12; i--;) {
+        /* "memory" clobber: the asm writes out_arr behind the compiler's
+         * back, so accesses to it must not be moved or cached across it */
+        __asm__ volatile (
+            "sw     $zero,   0(%[out])     \n\t"
+            "sw     $zero,   4(%[out])     \n\t"
+            "sw     $zero,   8(%[out])     \n\t"
+            "sw     $zero,  12(%[out])     \n\t"
+            "sw     $zero,  16(%[out])     \n\t"
+            "sw     $zero,  20(%[out])     \n\t"
+            "sw     $zero,  24(%[out])     \n\t"
+            "sw     $zero,  28(%[out])     \n\t"
+            :
+            : [out] "r" (out)
+            : "memory"
+        );
+        out += 16;
+    }
+
+    /* transform columns, one 8 * 16 half per call */
+    out = out_arr;
+    for (i = 0; i < 2; i++) {
+        vp9_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)),
+                                         dst_stride);
+    }
+}
+
+/* Full 16x16 inverse DCT + add: a row pass into an aligned scratch buffer
+ * followed by a column pass that adds into dst. Each pass is issued as two
+ * calls, one per 8-wide half of the block. */
+static void vp9_idct16x16_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                            int32_t dst_stride)
+{
+    int16_t scratch[16 * 16] ALLOC_ALIGNED(ALIGNMENT);
+    int32_t half;
+
+    /* transform rows */
+    for (half = 0; half < 2; half++) {
+        vp9_idct16_1d_columns_msa(input + 8 * half, scratch + 128 * half);
+    }
+
+    /* transform columns */
+    for (half = 0; half < 2; half++) {
+        vp9_idct16_1d_columns_addblk_msa(scratch + 8 * half, dst + 8 * half,
+                                         dst_stride);
+    }
+}
+
+static void vp9_iadst16_1d_columns_msa(int16_t *input, int16_t *output)
+{
+    v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+    v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
+    /* 16-point 1-D inverse ADST; result is transposed into output */
+    /* load 16 vectors from input with stride 16 */
+    LD_SH16(input, 16,
+            l0, l1, l2, l3, l4, l5, l6, l7,
+            l8, l9, l10, l11, l12, l13, l14, l15);
+
+    /* ADST in horizontal */
+    VP9_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7,
+                     l8, l9, l10, l11, l12, l13, l14, l15,
+                     r0, r1, r2, r3, r4, r5, r6, r7,
+                     r8, r9, r10, r11, r12, r13, r14, r15);
+    /* four macro outputs are negated; l1/l3/l13/l15 hold them */
+    l1 = -r8;
+    l3 = -r4;
+    l13 = -r13;
+    l15 = -r1;
+    /* macro outputs are reordered as they enter the two 8x8 transposes */
+    TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2,
+                       l0, l1, l2, l3, l4, l5, l6, l7);
+    ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16);
+    TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15,
+                       l8, l9, l10, l11, l12, l13, l14, l15);
+    ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16);
+}
+
+static void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+                                              int32_t dst_stride)
+{
+    v8i16 v0, v2, v4, v6, k0, k1, k2, k3;
+    v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+    v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+    v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
+    v8i16 g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15;
+    v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11;
+    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+    v8i16 res8, res9, res10, res11, res12, res13, res14, res15;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+    v16i8 zero = { 0 };
+    /* 1-D inverse ADST for 8 columns; every output pair is added into dst */
+    r0 = LD_SH(input + 0 * 16);  /* input row stride is 16 int16_t */
+    r3 = LD_SH(input + 3 * 16);
+    r4 = LD_SH(input + 4 * 16);
+    r7 = LD_SH(input + 7 * 16);
+    r8 = LD_SH(input + 8 * 16);
+    r11 = LD_SH(input + 11 * 16);
+    r12 = LD_SH(input + 12 * 16);
+    r15 = LD_SH(input + 15 * 16);
+
+    /* stage 1 */
+    k0 = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64);
+    k1 = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64);
+    k2 = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64);
+    k3 = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64);
+    VP9_MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
+    k0 = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64);
+    k1 = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64);
+    k2 = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64);
+    k3 = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64);
+    VP9_MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
+    BUTTERFLY_4(g0, g2, g10, g8, h8, h9, v2, v0);
+    k0 = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
+    k1 = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
+    k2 = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64);
+    VP9_MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
+    /* remaining input rows: 1, 2, 5, 6, 9, 10, 13, 14 */
+    r1 = LD_SH(input + 1 * 16);
+    r2 = LD_SH(input + 2 * 16);
+    r5 = LD_SH(input + 5 * 16);
+    r6 = LD_SH(input + 6 * 16);
+    r9 = LD_SH(input + 9 * 16);
+    r10 = LD_SH(input + 10 * 16);
+    r13 = LD_SH(input + 13 * 16);
+    r14 = LD_SH(input + 14 * 16);
+
+    k0 = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64);
+    k1 = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64);
+    k2 = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64);
+    k3 = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64);
+    VP9_MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, g4, g5, g6, g7);
+    k0 = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64);
+    k1 = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64);
+    k2 = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64);
+    k3 = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64);
+    VP9_MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g12, g13, g14, g15);
+    BUTTERFLY_4(g4, g6, g14, g12, h10, h11, v6, v4);
+    BUTTERFLY_4(h8, h9, h11, h10, out0, out1, h11, h10);
+    out1 = -out1;
+    SRARI_H2_SH(out0, out1, 6);  /* rounding arithmetic shift right by 6 */
+    dst0 = LD_UB(dst + 0 * dst_stride);
+    dst1 = LD_UB(dst + 15 * dst_stride);
+    ILVR_B2_SH(zero, dst0, zero, dst1, res0, res1);  /* widen 8 pixels */
+    ADD2(res0, out0, res1, out1, res0, res1);
+    CLIP_SH2_0_255(res0, res1);
+    PCKEV_B2_SH(res0, res0, res1, res1, res0, res1); /* back to bytes */
+    ST8x1_UB(res0, dst);  /* dst row 0 */
+    ST8x1_UB(res1, dst + 15 * dst_stride);  /* dst row 15 */
+    /* out8/out9 -> dst rows 1 and 14 */
+    k0 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
+    k1 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
+    k2 = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64);
+    VP9_MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
+    BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
+    out8 = -out8;
+
+    SRARI_H2_SH(out8, out9, 6);
+    dst8 = LD_UB(dst + 1 * dst_stride);
+    dst9 = LD_UB(dst + 14 * dst_stride);
+    ILVR_B2_SH(zero, dst8, zero, dst9, res8, res9);
+    ADD2(res8, out8, res9, out9, res8, res9);
+    CLIP_SH2_0_255(res8, res9);
+    PCKEV_B2_SH(res8, res8, res9, res9, res8, res9);
+    ST8x1_UB(res8, dst + dst_stride);
+    ST8x1_UB(res9, dst + 14 * dst_stride);
+    /* out4/out5 -> dst rows 3 and 12 */
+    k0 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
+    k1 = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
+    k2 = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64);
+    VP9_MADD_BF(v0, v2, v4, v6, k0, k1, k2, k0, out4, out6, out5, out7);
+    out4 = -out4;
+    SRARI_H2_SH(out4, out5, 6);
+    dst4 = LD_UB(dst + 3 * dst_stride);
+    dst5 = LD_UB(dst + 12 * dst_stride);
+    ILVR_B2_SH(zero, dst4, zero, dst5, res4, res5);
+    ADD2(res4, out4, res5, out5, res4, res5);
+    CLIP_SH2_0_255(res4, res5);
+    PCKEV_B2_SH(res4, res4, res5, res5, res4, res5);
+    ST8x1_UB(res4, dst + 3 * dst_stride);
+    ST8x1_UB(res5, dst + 12 * dst_stride);
+    /* out12/out13 -> dst rows 2 and 13; k0..k2 are reused from above */
+    VP9_MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
+    out13 = -out13;
+    SRARI_H2_SH(out12, out13, 6);
+    dst12 = LD_UB(dst + 2 * dst_stride);
+    dst13 = LD_UB(dst + 13 * dst_stride);
+    ILVR_B2_SH(zero, dst12, zero, dst13, res12, res13);
+    ADD2(res12, out12, res13, out13, res12, res13);
+    CLIP_SH2_0_255(res12, res13);
+    PCKEV_B2_SH(res12, res12, res13, res13, res12, res13);
+    ST8x1_UB(res12, dst + 2 * dst_stride);
+    ST8x1_UB(res13, dst + 13 * dst_stride);
+    /* out6/out7 -> dst rows 4 and 11 */
+    k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
+    k3 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
+    VP9_MADD_SHORT(out6, out7, k0, k3, out6, out7);
+    SRARI_H2_SH(out6, out7, 6);
+    dst6 = LD_UB(dst + 4 * dst_stride);
+    dst7 = LD_UB(dst + 11 * dst_stride);
+    ILVR_B2_SH(zero, dst6, zero, dst7, res6, res7);
+    ADD2(res6, out6, res7, out7, res6, res7);
+    CLIP_SH2_0_255(res6, res7);
+    PCKEV_B2_SH(res6, res6, res7, res7, res6, res7);
+    ST8x1_UB(res6, dst + 4 * dst_stride);
+    ST8x1_UB(res7, dst + 11 * dst_stride);
+    /* out10/out11 -> dst rows 6 and 9 (same k0, k3) */
+    VP9_MADD_SHORT(out10, out11, k0, k3, out10, out11);
+    SRARI_H2_SH(out10, out11, 6);
+    dst10 = LD_UB(dst + 6 * dst_stride);
+    dst11 = LD_UB(dst + 9 * dst_stride);
+    ILVR_B2_SH(zero, dst10, zero, dst11, res10, res11);
+    ADD2(res10, out10, res11, out11, res10, res11);
+    CLIP_SH2_0_255(res10, res11);
+    PCKEV_B2_SH(res10, res10, res11, res11, res10, res11);
+    ST8x1_UB(res10, dst + 6 * dst_stride);
+    ST8x1_UB(res11, dst + 9 * dst_stride);
+    /* out2/out3 -> dst rows 7 and 8 */
+    k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64);
+    k2 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
+    VP9_MADD_SHORT(h10, h11, k1, k2, out2, out3);
+    SRARI_H2_SH(out2, out3, 6);
+    dst2 = LD_UB(dst + 7 * dst_stride);
+    dst3 = LD_UB(dst + 8 * dst_stride);
+    ILVR_B2_SH(zero, dst2, zero, dst3, res2, res3);
+    ADD2(res2, out2, res3, out3, res2, res3);
+    CLIP_SH2_0_255(res2, res3);
+    PCKEV_B2_SH(res2, res2, res3, res3, res2, res3);
+    ST8x1_UB(res2, dst + 7 * dst_stride);
+    ST8x1_UB(res3, dst + 8 * dst_stride);
+    /* out14/out15 -> dst rows 5 and 10 (same k1, k2) */
+    VP9_MADD_SHORT(out14, out15, k1, k2, out14, out15);
+    SRARI_H2_SH(out14, out15, 6);
+    dst14 = LD_UB(dst + 5 * dst_stride);
+    dst15 = LD_UB(dst + 10 * dst_stride);
+    ILVR_B2_SH(zero, dst14, zero, dst15, res14, res15);
+    ADD2(res14, out14, res15, out15, res14, res15);
+    CLIP_SH2_0_255(res14, res15);
+    PCKEV_B2_SH(res14, res14, res15, res15, res14, res15);
+    ST8x1_UB(res14, dst + 5 * dst_stride);
+    ST8x1_UB(res15, dst + 10 * dst_stride);
+}
+
+static void vp9_iadst16x16_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                             int32_t dst_stride)
+{
+    /* Scratch for the first pass; aligned for MSA vector stores. */
+    int16_t scratch[16 * 16] ALLOC_ALIGNED(ALIGNMENT);
+    int32_t half;
+
+    /* First pass: each call reads 8 columns of input and writes 128
+     * transposed coefficients (8 rows of 16) into scratch. */
+    for (half = 0; half < 2; half++)
+        vp9_iadst16_1d_columns_msa(&input[8 * half], &scratch[128 * half]);
+
+    /* Second pass: each call transforms 8 columns of scratch and adds the
+     * result into the matching 8 pixel columns of dst. */
+    for (half = 0; half < 2; half++) {
+        uint8_t *dst_cols = dst + 8 * half;
+        vp9_iadst16_1d_columns_addblk_msa(&scratch[8 * half], dst_cols,
+                                          dst_stride);
+    }
+}
+
+static void vp9_iadst_idct_16x16_add_msa(int16_t *input, uint8_t *dst,
+                                         int32_t dst_stride, int32_t eob)
+{
+    int32_t i;
+    /* MSA vector loads/stores hit this buffer, so align it like the scratch
+     * array of vp9_iadst16x16_colcol_addblk_msa() */
+    int16_t out[16 * 16] ALLOC_ALIGNED(ALIGNMENT);
+    int16_t *out_ptr = &out[0];
+
+    /* first pass (inverse ADST helper), 8 input columns per call; eob unused */
+    for (i = 0; i < 2; i++) {
+        vp9_iadst16_1d_columns_msa((input + (i << 3)), (out_ptr + (i << 7)));
+    }
+
+    /* second pass (inverse DCT helper), 8 columns per call, added into dst */
+    for (i = 0; i < 2; i++) {
+        vp9_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)),
+                                         (dst + (i << 3)), dst_stride);
+    }
+}
+
+static void vp9_idct_iadst_16x16_add_msa(int16_t *input, uint8_t *dst,
+                                         int32_t dst_stride, int32_t eob)
+{
+    int32_t i;
+    /* MSA vector loads/stores hit this buffer, so align it like the scratch
+     * array of vp9_iadst16x16_colcol_addblk_msa() */
+    int16_t out[16 * 16] ALLOC_ALIGNED(ALIGNMENT);
+    int16_t *out_ptr = &out[0];
+
+    /* first pass (inverse DCT helper), 8 input columns per call; eob unused */
+    for (i = 0; i < 2; i++) {
+        vp9_idct16_1d_columns_msa((input + (i << 3)), (out_ptr + (i << 7)));
+    }
+
+    /* second pass (inverse ADST helper), 8 columns per call, added into dst */
+    for (i = 0; i < 2; i++) {
+        vp9_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)),
+                                          (dst + (i << 3)), dst_stride);
+    }
+}
+
+static void vp9_idct_butterfly_transpose_store(int16_t *tmp_buf,
+                                               int16_t *tmp_eve_buf,
+                                               int16_t *tmp_odd_buf,
+                                               int16_t *dst)
+{
+    v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+    v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
+    /* Combines the even and odd halves and writes them transposed into dst. */
+    /* FINAL BUTTERFLY : Dependency on Even & Odd */
+    vec0 = LD_SH(tmp_odd_buf);
+    vec1 = LD_SH(tmp_odd_buf + 9 * 8);
+    vec2 = LD_SH(tmp_odd_buf + 14 * 8);
+    vec3 = LD_SH(tmp_odd_buf + 6 * 8);
+    loc0 = LD_SH(tmp_eve_buf);
+    loc1 = LD_SH(tmp_eve_buf + 8 * 8);
+    loc2 = LD_SH(tmp_eve_buf + 4 * 8);
+    loc3 = LD_SH(tmp_eve_buf + 12 * 8);
+
+    ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6);
+    /* sums stay in registers; differences go to tmp_buf rows 16..31 */
+    ST_SH((loc0 - vec3), (tmp_buf + 31 * 8));
+    ST_SH((loc1 - vec2), (tmp_buf + 23 * 8));
+    ST_SH((loc2 - vec1), (tmp_buf + 27 * 8));
+    ST_SH((loc3 - vec0), (tmp_buf + 19 * 8));
+
+    /* Load 8 & Store 8 */
+    vec0 = LD_SH(tmp_odd_buf + 4 * 8);
+    vec1 = LD_SH(tmp_odd_buf + 13 * 8);
+    vec2 = LD_SH(tmp_odd_buf + 10 * 8);
+    vec3 = LD_SH(tmp_odd_buf + 3 * 8);
+    loc0 = LD_SH(tmp_eve_buf + 2 * 8);
+    loc1 = LD_SH(tmp_eve_buf + 10 * 8);
+    loc2 = LD_SH(tmp_eve_buf + 6 * 8);
+    loc3 = LD_SH(tmp_eve_buf + 14 * 8);
+
+    ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7);
+
+    ST_SH((loc0 - vec3), (tmp_buf + 29 * 8));
+    ST_SH((loc1 - vec2), (tmp_buf + 21 * 8));
+    ST_SH((loc2 - vec1), (tmp_buf + 25 * 8));
+    ST_SH((loc3 - vec0), (tmp_buf + 17 * 8));
+
+    /* Load 8 & Store 8 */
+    vec0 = LD_SH(tmp_odd_buf + 2 * 8);
+    vec1 = LD_SH(tmp_odd_buf + 11 * 8);
+    vec2 = LD_SH(tmp_odd_buf + 12 * 8);
+    vec3 = LD_SH(tmp_odd_buf + 7 * 8);
+    loc0 = LD_SH(tmp_eve_buf + 1 * 8);
+    loc1 = LD_SH(tmp_eve_buf + 9 * 8);
+    loc2 = LD_SH(tmp_eve_buf + 5 * 8);
+    loc3 = LD_SH(tmp_eve_buf + 13 * 8);
+
+    ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6);
+
+    ST_SH((loc0 - vec3), (tmp_buf + 30 * 8));
+    ST_SH((loc1 - vec2), (tmp_buf + 22 * 8));
+    ST_SH((loc2 - vec1), (tmp_buf + 26 * 8));
+    ST_SH((loc3 - vec0), (tmp_buf + 18 * 8));
+
+    /* Load 8 & Store 8 */
+    vec0 = LD_SH(tmp_odd_buf + 5 * 8);
+    vec1 = LD_SH(tmp_odd_buf + 15 * 8);
+    vec2 = LD_SH(tmp_odd_buf + 8 * 8);
+    vec3 = LD_SH(tmp_odd_buf + 1 * 8);
+    loc0 = LD_SH(tmp_eve_buf + 3 * 8);
+    loc1 = LD_SH(tmp_eve_buf + 11 * 8);
+    loc2 = LD_SH(tmp_eve_buf + 7 * 8);
+    loc3 = LD_SH(tmp_eve_buf + 15 * 8);
+
+    ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7);
+
+    ST_SH((loc0 - vec3), (tmp_buf + 28 * 8));
+    ST_SH((loc1 - vec2), (tmp_buf + 20 * 8));
+    ST_SH((loc2 - vec1), (tmp_buf + 24 * 8));
+    ST_SH((loc3 - vec0), (tmp_buf + 16 * 8));
+
+    /* Transpose : 16 vectors */
+    /* 1st & 2nd 8x8: the sums, to dst columns 0..15 (dst row stride 32) */
+    TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3,
+                       m0, n0, m1, n1, m2, n2, m3, n3);
+    ST_SH4(m0, n0, m1, n1, (dst + 0), 32);
+    ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32);
+
+    TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7,
+                       m4, n4, m5, n5, m6, n6, m7, n7);
+    ST_SH4(m4, n4, m5, n5, (dst + 8), 32);
+    ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32);
+
+    /* 3rd & 4th 8x8: differences reloaded from tmp_buf, dst columns 16..31 */
+    LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3);
+    LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7);
+    TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3,
+                       m0, n0, m1, n1, m2, n2, m3, n3);
+    ST_SH4(m0, n0, m1, n1, (dst + 16), 32);
+    ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32);
+
+    TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7,
+                       m4, n4, m5, n5, m6, n6, m7, n7);
+    ST_SH4(m4, n4, m5, n5, (dst + 24), 32);
+    ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32);
+}
+
+static void vp9_idct8x32_column_even_process_store(int16_t *tmp_buf,
+                                                   int16_t *tmp_eve_buf)
+{
+    v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+    v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+    v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
+    /* Processes the even input rows; leaves 16 vectors in tmp_eve_buf. */
+    /* Even stage 1: input rows 0, 4, 8, ..., 28 (row stride 32) */
+    LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+    tmp_buf += (2 * 32);  /* advance to input row 2 for stage 2 */
+
+    VP9_DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
+    VP9_DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
+    BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
+    VP9_DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+    loc1 = vec3;
+    loc0 = vec1;
+
+    VP9_DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
+    VP9_DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
+    BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
+    BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
+    BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
+
+    /* Even stage 2 */
+    /* Load 8: input rows 2, 6, 10, ..., 30 */
+    LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+
+    VP9_DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
+    VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
+    VP9_DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
+    VP9_DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
+
+    vec0 = reg0 + reg4;
+    reg0 = reg0 - reg4;
+    reg4 = reg6 + reg2;
+    reg6 = reg6 - reg2;
+    reg2 = reg1 + reg5;
+    reg1 = reg1 - reg5;
+    reg5 = reg7 + reg3;
+    reg7 = reg7 - reg3;
+    reg3 = vec0;
+
+    vec1 = reg2;
+    reg2 = reg3 + reg4;
+    reg3 = reg3 - reg4;
+    reg4 = reg5 - vec1;
+    reg5 = reg5 + vec1;
+
+    VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
+    VP9_DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
+
+    vec0 = reg0 - reg6;
+    reg0 = reg0 + reg6;
+    vec1 = reg7 - reg1;
+    reg7 = reg7 + reg1;
+
+    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
+    VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
+
+    /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
+    /* Store 8: tmp_eve_buf slots 0, 1, 14, 15, then 2, 3, 12, 13 */
+    BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
+    ST_SH2(loc1, loc3, tmp_eve_buf, 8);
+    ST_SH2(loc2, loc0, (tmp_eve_buf + 14 * 8), 8);
+
+    BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
+    ST_SH2(loc1, loc3, (tmp_eve_buf + 2 * 8), 8);
+    ST_SH2(loc2, loc0, (tmp_eve_buf + 12 * 8), 8);
+
+    /* Store 8: slots 4, 5, 10, 11, then 6, 7, 8, 9 */
+    BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
+    ST_SH2(loc1, loc3, (tmp_eve_buf + 4 * 8), 8);
+    ST_SH2(loc2, loc0, (tmp_eve_buf + 10 * 8), 8);
+
+    BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
+    ST_SH2(loc1, loc3, (tmp_eve_buf + 6 * 8), 8);
+    ST_SH2(loc2, loc0, (tmp_eve_buf + 8 * 8), 8);
+}
+
+static void vp9_idct8x32_column_odd_process_store(int16_t *tmp_buf,
+                                                  int16_t *tmp_odd_buf)
+{
+    v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+    v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+    /* Processes the odd input rows; leaves 16 vectors in tmp_odd_buf. */
+    /* Odd stage 1: input rows 1, 7, 9, 15, 17, 23, 25, 31 (row stride 32) */
+    reg0 = LD_SH(tmp_buf + 32);
+    reg1 = LD_SH(tmp_buf + 7 * 32);
+    reg2 = LD_SH(tmp_buf + 9 * 32);
+    reg3 = LD_SH(tmp_buf + 15 * 32);
+    reg4 = LD_SH(tmp_buf + 17 * 32);
+    reg5 = LD_SH(tmp_buf + 23 * 32);
+    reg6 = LD_SH(tmp_buf + 25 * 32);
+    reg7 = LD_SH(tmp_buf + 31 * 32);
+
+    VP9_DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
+    VP9_DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
+    VP9_DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
+    VP9_DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
+
+    vec0 = reg0 + reg3;
+    reg0 = reg0 - reg3;
+    reg3 = reg7 + reg4;
+    reg7 = reg7 - reg4;
+    reg4 = reg1 + reg2;
+    reg1 = reg1 - reg2;
+    reg2 = reg6 + reg5;
+    reg6 = reg6 - reg5;
+    reg5 = vec0;
+
+    /* 4 Stores: tmp_odd_buf slots 4, 5, then 0, 1 */
+    ADD2(reg5, reg4, reg3, reg2, vec0, vec1);
+    ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8);
+    SUB2(reg5, reg4, reg3, reg2, vec0, vec1);
+    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
+    ST_SH2(vec0, vec1, tmp_odd_buf, 8);
+
+    /* 4 Stores: slots 6, 7, then 2, 3 */
+    VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
+    VP9_DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
+    BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
+    ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8);
+    VP9_DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
+    ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8);
+
+    /* Odd stage 2 */
+    /* 8 loads: input rows 3, 5, 11, 13, 19, 21, 27, 29 */
+    reg0 = LD_SH(tmp_buf + 3 * 32);
+    reg1 = LD_SH(tmp_buf + 5 * 32);
+    reg2 = LD_SH(tmp_buf + 11 * 32);
+    reg3 = LD_SH(tmp_buf + 13 * 32);
+    reg4 = LD_SH(tmp_buf + 19 * 32);
+    reg5 = LD_SH(tmp_buf + 21 * 32);
+    reg6 = LD_SH(tmp_buf + 27 * 32);
+    reg7 = LD_SH(tmp_buf + 29 * 32);
+
+    VP9_DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
+    VP9_DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
+    VP9_DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
+    VP9_DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
+
+    /* 4 Stores: slots 12 and 15 (store stride is 3 * 8), then 10, 11 */
+    SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4,
+         vec0, vec1, vec2, vec3);
+    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
+    VP9_DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
+    BUTTERFLY_4(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2);
+    ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8);
+    VP9_DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
+    ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8);
+
+    /* 4 Stores: slots 13, 14, then 8, 9 */
+    ADD4(reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7,
+         vec0, vec1, vec2, vec3);
+    BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
+    ST_SH2(reg0, reg1, (tmp_odd_buf + 13 * 8), 8);
+    VP9_DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
+    ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8);
+
+    /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
+    /* Load 8 & Store 8: slots 0..3 combined with slots 8..11, in place */
+    LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3);
+    LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7);
+
+    ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
+         loc0, loc1, loc2, loc3);
+    ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8);
+
+    SUB2(reg0, reg4, reg1, reg5, vec0, vec1);
+    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+    SUB2(reg2, reg6, reg3, reg7, vec0, vec1);
+    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+    ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8);
+
+    /* Load 8 & Store 8: slots 4..7 (note the load order) with slots 12..15 */
+    LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3);
+    LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7);
+
+    ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
+         loc0, loc1, loc2, loc3);
+    ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);
+
+    SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
+    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+    SUB2(reg1, reg5, reg2, reg6, vec0, vec1);
+    VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+    ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8);
+}
+
+static void vp9_idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
+                                                 int16_t *tmp_odd_buf,
+                                                 uint8_t *dst,
+                                                 int32_t dst_stride)
+{
+    v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+    v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
+    /* Row lists below assume VP9_ADDBLK_ST8x4_UB steps by its stride arg. */
+    /* FINAL BUTTERFLY : Dependency on Even & Odd */
+    vec0 = LD_SH(tmp_odd_buf);
+    vec1 = LD_SH(tmp_odd_buf + 9 * 8);
+    vec2 = LD_SH(tmp_odd_buf + 14 * 8);
+    vec3 = LD_SH(tmp_odd_buf + 6 * 8);
+    loc0 = LD_SH(tmp_eve_buf);
+    loc1 = LD_SH(tmp_eve_buf + 8 * 8);
+    loc2 = LD_SH(tmp_eve_buf + 4 * 8);
+    loc3 = LD_SH(tmp_eve_buf + 12 * 8);
+    /* sums, rounded by 6 -> dst rows 0, 4, 8, 12 */
+    ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6);
+    SRARI_H4_SH(m0, m2, m4, m6, 6);
+    VP9_ADDBLK_ST8x4_UB(dst, (4 * dst_stride), m0, m2, m4, m6);
+    /* differences, rounded by 6 -> dst rows 19, 23, 27, 31 */
+    SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0);
+    SRARI_H4_SH(m0, m2, m4, m6, 6);
+    VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride),
+                        m0, m2, m4, m6);
+
+    /* Load 8 & Store 8 */
+    vec0 = LD_SH(tmp_odd_buf + 4 * 8);
+    vec1 = LD_SH(tmp_odd_buf + 13 * 8);
+    vec2 = LD_SH(tmp_odd_buf + 10 * 8);
+    vec3 = LD_SH(tmp_odd_buf + 3 * 8);
+    loc0 = LD_SH(tmp_eve_buf + 2 * 8);
+    loc1 = LD_SH(tmp_eve_buf + 10 * 8);
+    loc2 = LD_SH(tmp_eve_buf + 6 * 8);
+    loc3 = LD_SH(tmp_eve_buf + 14 * 8);
+    /* sums -> dst rows 2, 6, 10, 14 */
+    ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7);
+    SRARI_H4_SH(m1, m3, m5, m7, 6);
+    VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride),
+                        m1, m3, m5, m7);
+    /* differences -> dst rows 17, 21, 25, 29 */
+    SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1);
+    SRARI_H4_SH(m1, m3, m5, m7, 6);
+    VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride),
+                        m1, m3, m5, m7);
+
+    /* Load 8 & Store 8 */
+    vec0 = LD_SH(tmp_odd_buf + 2 * 8);
+    vec1 = LD_SH(tmp_odd_buf + 11 * 8);
+    vec2 = LD_SH(tmp_odd_buf + 12 * 8);
+    vec3 = LD_SH(tmp_odd_buf + 7 * 8);
+    loc0 = LD_SH(tmp_eve_buf + 1 * 8);
+    loc1 = LD_SH(tmp_eve_buf + 9 * 8);
+    loc2 = LD_SH(tmp_eve_buf + 5 * 8);
+    loc3 = LD_SH(tmp_eve_buf + 13 * 8);
+    /* sums -> dst rows 1, 5, 9, 13 */
+    ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6);
+    SRARI_H4_SH(n0, n2, n4, n6, 6);
+    VP9_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride),
+                        n0, n2, n4, n6);
+    /* differences -> dst rows 18, 22, 26, 30 */
+    SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0);
+    SRARI_H4_SH(n0, n2, n4, n6, 6);
+    VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride),
+                        n0, n2, n4, n6);
+
+    /* Load 8 & Store 8 */
+    vec0 = LD_SH(tmp_odd_buf + 5 * 8);
+    vec1 = LD_SH(tmp_odd_buf + 15 * 8);
+    vec2 = LD_SH(tmp_odd_buf + 8 * 8);
+    vec3 = LD_SH(tmp_odd_buf + 1 * 8);
+    loc0 = LD_SH(tmp_eve_buf + 3 * 8);
+    loc1 = LD_SH(tmp_eve_buf + 11 * 8);
+    loc2 = LD_SH(tmp_eve_buf + 7 * 8);
+    loc3 = LD_SH(tmp_eve_buf + 15 * 8);
+    /* sums -> dst rows 3, 7, 11, 15 */
+    ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7);
+    SRARI_H4_SH(n1, n3, n5, n7, 6);
+    VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride),
+                        n1, n3, n5, n7);
+    /* differences -> dst rows 16, 20, 24, 28 */
+    SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1);
+    SRARI_H4_SH(n1, n3, n5, n7, 6);
+    VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride),
+                        n1, n3, n5, n7);
+}
+
+static void vp9_idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+                                               int32_t dst_stride)
+{
+    /* intermediate results of the even and odd halves, 16 vectors each */
+    int16_t even_half[16 * 8] ALLOC_ALIGNED(ALIGNMENT);
+    int16_t odd_half[16 * 8] ALLOC_ALIGNED(ALIGNMENT);
+
+    vp9_idct8x32_column_even_process_store(input, even_half);
+    vp9_idct8x32_column_odd_process_store(input, odd_half);
+    vp9_idct8x32_column_butterfly_addblk(even_half, odd_half, dst, dst_stride);
+}
+
+static void vp9_idct8x32_1d_columns_msa(int16_t *input, int16_t *output,
+                                        int16_t *tmp_buf)
+{
+    /* intermediate results of the even and odd halves, 16 vectors each */
+    int16_t even_half[16 * 8] ALLOC_ALIGNED(ALIGNMENT);
+    int16_t odd_half[16 * 8] ALLOC_ALIGNED(ALIGNMENT);
+
+    vp9_idct8x32_column_even_process_store(input, even_half);
+    vp9_idct8x32_column_odd_process_store(input, odd_half);
+    vp9_idct_butterfly_transpose_store(tmp_buf, even_half, odd_half, output);
+}
+
+static void vp9_idct32x32_1_add_msa(int16_t *input, uint8_t *dst,
+                                    int32_t dst_stride)
+{
+    int32_t pair;
+    int16_t dc;
+    v16u8 top_lo, top_hi, bot_lo, bot_hi, pk0, pk1, pk2, pk3;
+    v8i16 w0, w1, w2, w3, w4, w5, w6, w7, dc_vec;
+
+    /* scale input[0] by cospi_16_64 twice, then apply the final round by 6 */
+    dc = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), VP9_DCT_CONST_BITS);
+    dc = ROUND_POWER_OF_TWO((dc * cospi_16_64), VP9_DCT_CONST_BITS);
+    dc = ROUND_POWER_OF_TWO(dc, 6);
+
+    dc_vec = __msa_fill_h(dc);
+
+    /* 16 iterations, each updating two 32-pixel rows of dst */
+    for (pair = 0; pair < 16; pair++) {
+        LD_UB2(dst, 16, top_lo, top_hi);
+        LD_UB2(dst + dst_stride, 16, bot_lo, bot_hi);
+
+        /* each 16-byte vector becomes two vectors of 16-bit values */
+        UNPCK_UB_SH(top_lo, w0, w4);
+        UNPCK_UB_SH(top_hi, w1, w5);
+        UNPCK_UB_SH(bot_lo, w2, w6);
+        UNPCK_UB_SH(bot_hi, w3, w7);
+        ADD4(w0, dc_vec, w1, dc_vec, w2, dc_vec, w3, dc_vec, w0, w1, w2, w3);
+        ADD4(w4, dc_vec, w5, dc_vec, w6, dc_vec, w7, dc_vec, w4, w5, w6, w7);
+        /* clamp to 0..255 and pack back to bytes */
+        CLIP_SH4_0_255(w0, w1, w2, w3);
+        CLIP_SH4_0_255(w4, w5, w6, w7);
+        PCKEV_B4_UB(w4, w0, w5, w1, w6, w2, w7, w3, pk0, pk1, pk2, pk3);
+
+        /* write both rows back, then step two rows down */
+        ST_UB2(pk0, pk1, dst, 16);
+        ST_UB2(pk2, pk3, (dst + dst_stride), 16);
+        dst += 2 * dst_stride;
+    }
+}
+
+static void vp9_idct32x32_34_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                               int32_t dst_stride)
+{
+    int32_t i;
+    int16_t out_arr[32 * 32] ALLOC_ALIGNED(ALIGNMENT);
+    int16_t *out_ptr = out_arr;
+    int16_t tmp_buf[8 * 32] ALLOC_ALIGNED(ALIGNMENT);
+    /* zero out_arr: 16 word stores (64 bytes = one 32-entry row) per pass */
+    for (i = 32; i--;) {
+        __asm__ volatile (
+            "sw     $zero,       (%[out_ptr])     \n\t"
+            "sw     $zero,      4(%[out_ptr])     \n\t"
+            "sw     $zero,      8(%[out_ptr])     \n\t"
+            "sw     $zero,     12(%[out_ptr])     \n\t"
+            "sw     $zero,     16(%[out_ptr])     \n\t"
+            "sw     $zero,     20(%[out_ptr])     \n\t"
+            "sw     $zero,     24(%[out_ptr])     \n\t"
+            "sw     $zero,     28(%[out_ptr])     \n\t"
+            "sw     $zero,     32(%[out_ptr])     \n\t"
+            "sw     $zero,     36(%[out_ptr])     \n\t"
+            "sw     $zero,     40(%[out_ptr])     \n\t"
+            "sw     $zero,     44(%[out_ptr])     \n\t"
+            "sw     $zero,     48(%[out_ptr])     \n\t"
+            "sw     $zero,     52(%[out_ptr])     \n\t"
+            "sw     $zero,     56(%[out_ptr])     \n\t"
+            "sw     $zero,     60(%[out_ptr])     \n\t"
+            : /* no output operands */
+            : [out_ptr] "r" (out_ptr)
+            : "memory" /* the stores write out_arr behind the compiler */
+        );
+
+        out_ptr += 32;
+    }
+
+    out_ptr = out_arr;
+
+    /* first pass on the first 8 input columns only; the rest stays zero */
+    vp9_idct8x32_1d_columns_msa(input, out_ptr, &tmp_buf[0]);
+
+    /* transform columns */
+    for (i = 0; i < 4; i++) {
+        /* process 8*32 block */
+        vp9_idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)),
+                                           (dst + (i << 3)), dst_stride);
+    }
+}
+
+static void vp9_idct32x32_colcol_addblk_msa(int16_t *input, uint8_t *dst,
+                                            int32_t dst_stride)
+{
+    int32_t i;
+    int16_t out_arr[32 * 32] ALLOC_ALIGNED(ALIGNMENT);
+    int16_t *out_ptr = out_arr;
+    int16_t tmp_buf[8 * 32] ALLOC_ALIGNED(ALIGNMENT);
+
+    /* transform rows */
+    for (i = 0; i < 4; i++) {
+        /* process 8*32 block */
+        vp9_idct8x32_1d_columns_msa((input + (i << 3)), (out_ptr + (i << 8)),
+                                    &tmp_buf[0]);
+    }
+
+    /* transform columns */
+    for (i = 0; i < 4; i++) {
+        /* process 8*32 block */
+        vp9_idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)),
+                                           (dst + (i << 3)), dst_stride);
+    }
+}
+
+void ff_idct_idct_4x4_add_msa(uint8_t *dst, ptrdiff_t stride,
+                              int16_t *block, int eob)
+{
+    if (eob <= 1) {
+        /* single-coefficient path: only block[0] is cleared afterwards */
+        vp9_idct4x4_1_add_msa(block, dst, stride);
+        block[0] = 0;
+        return;
+    }
+    vp9_idct4x4_colcol_addblk_msa(block, dst, stride);
+    memset(block, 0, sizeof(block[0]) * 4 * 4);
+}
+
+void ff_idct_idct_8x8_add_msa(uint8_t *dst, ptrdiff_t stride,
+                              int16_t *block, int eob)
+{
+    if (eob > 12) {
+        vp9_idct8x8_colcol_addblk_msa(block, dst, stride);
+        memset(block, 0, sizeof(block[0]) * 8 * 8);
+    } else if (eob != 1) {
+        /* reduced variant; only the first 32 coefficients are cleared */
+        vp9_idct8x8_12_colcol_addblk_msa(block, dst, stride);
+        memset(block, 0, sizeof(block[0]) * 4 * 8);
+    } else {
+        /* eob == 1: single-coefficient shortcut, only block[0] is cleared */
+        vp9_idct8x8_1_add_msa(block, dst, stride);
+        block[0] = 0;
+    }
+}
+
+void ff_idct_idct_16x16_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                int16_t *block, int eob)
+{
+    int row;
+
+    if (eob > 10) {
+        /* general case: the whole 16x16 coefficient block is cleared */
+        vp9_idct16x16_colcol_addblk_msa(block, dst, stride);
+        memset(block, 0, sizeof(block[0]) * 16 * 16);
+        return;
+    }
+    if (eob == 1) {
+        /* DC only DCT coefficient. */
+        vp9_idct16x16_1_add_msa(block, dst, stride);
+        block[0] = 0;
+        return;
+    }
+    /* reduced variant: clear the first 4 entries of the first 4 rows */
+    vp9_idct16x16_10_colcol_addblk_msa(block, dst, stride);
+    for (row = 0; row < 4; row++)
+        memset(&block[16 * row], 0, sizeof(block[0]) * 4);
+}
+
+void ff_idct_idct_32x32_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                int16_t *block, int eob)
+{
+    int row;
+
+    if (eob > 34) {
+        /* general case: the whole 32x32 coefficient block is cleared */
+        vp9_idct32x32_colcol_addblk_msa(block, dst, stride);
+        memset(block, 0, sizeof(block[0]) * 32 * 32);
+        return;
+    }
+    if (eob == 1) {
+        vp9_idct32x32_1_add_msa(block, dst, stride);
+        block[0] = 0;
+        return;
+    }
+    /* reduced variant: clear the first 8 entries of the first 8 rows */
+    vp9_idct32x32_34_colcol_addblk_msa(block, dst, stride);
+    for (row = 0; row < 8; row++)
+        memset(&block[32 * row], 0, sizeof(block[0]) * 8);
+}
+
+void ff_iadst_iadst_4x4_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                int16_t *block, int eob)
+{
+    vp9_iadst4x4_colcol_addblk_msa(block, dst, stride); /* eob is not used */
+    memset(block, 0, sizeof(block[0]) * 4 * 4);  /* clear 16 coefficients */
+}
+
+void ff_iadst_iadst_8x8_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                int16_t *block, int eob)
+{
+    vp9_iadst8x8_colcol_addblk_msa(block, dst, stride); /* eob is not used */
+    memset(block, 0, sizeof(block[0]) * 8 * 8);  /* clear 64 coefficients */
+}
+
+void ff_iadst_iadst_16x16_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                  int16_t *block, int eob)
+{
+    vp9_iadst16x16_colcol_addblk_msa(block, dst, stride); /* eob is not used */
+    memset(block, 0, sizeof(block[0]) * 16 * 16); /* clear 256 coefficients */
+}
+
+void ff_idct_iadst_4x4_add_msa(uint8_t *dst, ptrdiff_t stride,
+                               int16_t *block, int eob)
+{
+    vp9_idct_iadst_4x4_add_msa(block, dst, stride, eob); /* eob is forwarded */
+    memset(block, 0, sizeof(block[0]) * 4 * 4);  /* clear 16 coefficients */
+}
+
+void ff_idct_iadst_8x8_add_msa(uint8_t *dst, ptrdiff_t stride,
+                               int16_t *block, int eob)
+{
+    vp9_idct_iadst_8x8_add_msa(block, dst, stride, eob); /* eob is forwarded */
+    memset(block, 0, sizeof(block[0]) * 8 * 8);  /* clear 64 coefficients */
+}
+
+void ff_idct_iadst_16x16_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                 int16_t *block, int eob)
+{
+    vp9_idct_iadst_16x16_add_msa(block, dst, stride, eob); /* eob forwarded */
+    memset(block, 0, sizeof(block[0]) * 16 * 16); /* clear 256 coefficients */
+}
+
+void ff_iadst_idct_4x4_add_msa(uint8_t *dst, ptrdiff_t stride,
+                               int16_t *block, int eob)
+{
+    vp9_iadst_idct_4x4_add_msa(block, dst, stride, eob); /* eob is forwarded */
+    memset(block, 0, sizeof(block[0]) * 4 * 4);  /* clear 16 coefficients */
+}
+
+void ff_iadst_idct_8x8_add_msa(uint8_t *dst, ptrdiff_t stride,
+                               int16_t *block, int eob)
+{
+    vp9_iadst_idct_8x8_add_msa(block, dst, stride, eob); /* eob is forwarded */
+    memset(block, 0, sizeof(block[0]) * 8 * 8);  /* clear 64 coefficients */
+}
+
+void ff_iadst_idct_16x16_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                 int16_t *block, int eob)
+{
+    vp9_iadst_idct_16x16_add_msa(block, dst, stride, eob); /* eob forwarded */
+    memset(block, 0, sizeof(block[0]) * 16 * 16); /* clear 256 coefficients */
+}
diff --git a/libavcodec/mips/vp9_intra_msa.c b/libavcodec/mips/vp9_intra_msa.c
new file mode 100644
index 0000000..54cf0ae
--- /dev/null
+++ b/libavcodec/mips/vp9_intra_msa.c
@@ -0,0 +1,533 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/vp9dsp.h"
+#include "libavutil/mips/generic_macros_msa.h"
+#include "vp9dsp_mips.h"
+
+#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1)  \
+{   /* outN = outN - inN, unsigned saturating */ \
+    out0 = __msa_subs_u_h(out0, in0);            \
+    out1 = __msa_subs_u_h(out1, in1);            \
+}
+
+void ff_vert_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left,
+                       const uint8_t *src)
+{   /* Writes the single vector loaded from src to each of 16 rows of dst. */
+    uint32_t row;
+    v16u8 src0;
+    /* left is not referenced; src is read once, before the loop. */
+    src0 = LD_UB(src);
+
+    for (row = 16; row--;) {
+        ST_UB(src0, dst);
+        dst += dst_stride;
+    }
+}
+
+void ff_vert_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left,
+                       const uint8_t *src)
+{   /* Writes the vectors loaded from src and src + 16 to each of 32 rows. */
+    uint32_t row;
+    v16u8 src1, src2;
+    /* left is not referenced; src is read once, before the loop. */
+    src1 = LD_UB(src);
+    src2 = LD_UB(src + 16);
+
+    for (row = 32; row--;) {
+        ST_UB2(src1, src2, dst, 16);  /* presumably dst and dst + 16 -- confirm */
+        dst += dst_stride;
+    }
+}
+
+void ff_hor_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src,
+                      const uint8_t *top)
+{   /* Each output row is one byte of src replicated across a vector. */
+    uint32_t row, inp;
+    v16u8 src0, src1, src2, src3;
+    /* src is read one word at a time, from offset 12 down to 0; top unused. */
+    src += 12;
+    for (row = 4; row--;) {
+        inp = LW(src);
+        src -= 4;
+        /* Bytes of inp are consumed from most to least significant. */
+        src0 = (v16u8) __msa_fill_b(inp >> 24);
+        src1 = (v16u8) __msa_fill_b(inp >> 16);
+        src2 = (v16u8) __msa_fill_b(inp >> 8);
+        src3 = (v16u8) __msa_fill_b(inp);
+        /* ST_UB4: presumably 4 rows, dst_stride apart -- confirm. */
+        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+void ff_hor_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src,
+                      const uint8_t *top)
+{   /* Each output row is one byte of src replicated across two vectors. */
+    uint32_t row, inp;
+    v16u8 src0, src1, src2, src3;
+    /* src is read one word at a time, from offset 28 down to 0; top unused. */
+    src += 28;
+    for (row = 8; row--;) {
+        inp = LW(src);
+        src -= 4;
+        /* Bytes of inp are consumed from most to least significant. */
+        src0 = (v16u8) __msa_fill_b(inp >> 24);
+        src1 = (v16u8) __msa_fill_b(inp >> 16);
+        src2 = (v16u8) __msa_fill_b(inp >> 8);
+        src3 = (v16u8) __msa_fill_b(inp);
+        /* Four rows per iteration; the same vector is stored twice per row. */
+        ST_UB2(src0, src0, dst, 16);
+        dst += dst_stride;
+        ST_UB2(src1, src1, dst, 16);
+        dst += dst_stride;
+        ST_UB2(src2, src2, dst, 16);
+        dst += dst_stride;
+        ST_UB2(src3, src3, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+void ff_dc_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left,
+                   const uint8_t *src_top)
+{   /* DC prediction from 4 top + 4 left bytes; fills 4 rows with the mean. */
+    uint32_t val0, val1;
+    v16i8 store, src = { 0 };
+    v8u16 sum_h;
+    v4u32 sum_w;
+    v2u64 sum_d;
+    /* src starts zeroed, so lanes not written below add nothing to the sum. */
+    val0 = LW(src_top);
+    val1 = LW(src_left);
+    INSERT_W2_SB(val0, val1, src);  /* presumably word lanes 0 and 1 */
+    sum_h = __msa_hadd_u_h((v16u8) src, (v16u8) src);  /* byte pairs -> u16 */
+    sum_w = __msa_hadd_u_w(sum_h, sum_h);  /* u16 pairs -> u32 */
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);  /* u32 pairs -> u64 */
+    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 3);  /* rounding >> 3 (/8) */
+    store = __msa_splati_b((v16i8) sum_w, 0);  /* replicate byte 0 */
+    val0 = __msa_copy_u_w((v4i32) store, 0);
+    /* SW4: presumably one 32-bit store per row -- confirm. */
+    SW4(val0, val0, val0, val0, dst, dst_stride);
+}
+
+#define INTRA_DC_TL_4x4(dir)                                    \
+void ff_dc_##dir##_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride,  \
+                           const uint8_t *left,                 \
+                           const uint8_t *top)                  \
+{   /* DC of the 4 bytes at dir only, stored to all 4 rows. */  \
+    uint32_t val0;                                              \
+    v16i8 store, data = { 0 };                                  \
+    v8u16 sum_h;                                                \
+    v4u32 sum_w;                                                \
+    /* dir expands to the parameter named top or left. */       \
+    val0 = LW(dir);                                             \
+    data = (v16i8) __msa_insert_w((v4i32) data, 0, val0);       \
+    sum_h = __msa_hadd_u_h((v16u8) data, (v16u8) data);         \
+    sum_w = __msa_hadd_u_w(sum_h, sum_h);                       \
+    sum_w = (v4u32) __msa_srari_w((v4i32) sum_w, 2);            \
+    store = __msa_splati_b((v16i8) sum_w, 0);                   \
+    val0 = __msa_copy_u_w((v4i32) store, 0);                    \
+    /* SW4: presumably one 32-bit store per row -- confirm. */  \
+    SW4(val0, val0, val0, val0, dst, dst_stride);               \
+}
+INTRA_DC_TL_4x4(top);
+INTRA_DC_TL_4x4(left);
+
+void ff_dc_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left,
+                   const uint8_t *src_top)
+{   /* DC prediction from 8 top + 8 left bytes; fills 8 rows with the mean. */
+    uint64_t val0, val1;
+    v16i8 store;
+    v16u8 src = { 0 };
+    v8u16 sum_h;
+    v4u32 sum_w;
+    v2u64 sum_d;
+    /* Each 64-bit lane is reduced on its own, then the two sums are added. */
+    val0 = LD(src_top);
+    val1 = LD(src_left);
+    INSERT_D2_UB(val0, val1, src);  /* presumably doubleword lanes 0 and 1 */
+    sum_h = __msa_hadd_u_h(src, src);
+    sum_w = __msa_hadd_u_w(sum_h, sum_h);
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);  /* one partial sum per u64 lane */
+    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);  /* even words */
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);  /* add the two partial sums */
+    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 4);  /* rounding >> 4 (/16) */
+    store = __msa_splati_b((v16i8) sum_w, 0);
+    val0 = __msa_copy_u_d((v2i64) store, 0);
+    /* Two SD4 calls: presumably 4 rows of 8 bytes each -- confirm. */
+    SD4(val0, val0, val0, val0, dst, dst_stride);
+    dst += (4 * dst_stride);
+    SD4(val0, val0, val0, val0, dst, dst_stride);
+}
+
+#define INTRA_DC_TL_8x8(dir)                                    \
+void ff_dc_##dir##_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride,  \
+                           const uint8_t *left,                 \
+                           const uint8_t *top)                  \
+{   /* DC of the 8 bytes at dir only, stored to all 8 rows. */  \
+    uint64_t val0;                                              \
+    v16i8 store;                                                \
+    v16u8 data = { 0 };                                         \
+    v8u16 sum_h;                                                \
+    v4u32 sum_w;                                                \
+    v2u64 sum_d;                                                \
+    /* dir expands to the parameter named top or left. */       \
+    val0 = LD(dir);                                             \
+    data = (v16u8) __msa_insert_d((v2i64) data, 0, val0);       \
+    sum_h = __msa_hadd_u_h(data, data);                         \
+    sum_w = __msa_hadd_u_w(sum_h, sum_h);                       \
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);                       \
+    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 3);            \
+    store = __msa_splati_b((v16i8) sum_w, 0);                   \
+    val0 = __msa_copy_u_d((v2i64) store, 0);                    \
+    /* SD4: presumably one 64-bit store per row -- confirm. */  \
+    SD4(val0, val0, val0, val0, dst, dst_stride);               \
+    dst += (4 * dst_stride);                                    \
+    SD4(val0, val0, val0, val0, dst, dst_stride);               \
+}
+
+INTRA_DC_TL_8x8(top);
+INTRA_DC_TL_8x8(left);
+
+void ff_dc_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                     const uint8_t *src_left, const uint8_t *src_top)
+{   /* DC prediction from 16 top + 16 left bytes; fills 16 rows with the mean. */
+    v16u8 top, left, out;
+    v8u16 sum_h, sum_top, sum_left;
+    v4u32 sum_w;
+    v2u64 sum_d;
+    /* HADD_UB2_UH: presumably pairwise byte sums of each input -- confirm. */
+    top = LD_UB(src_top);
+    left = LD_UB(src_left);
+    HADD_UB2_UH(top, left, sum_top, sum_left);
+    sum_h = sum_top + sum_left;
+    sum_w = __msa_hadd_u_w(sum_h, sum_h);
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);  /* one partial sum per u64 lane */
+    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);  /* add the two partial sums */
+    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 5);  /* rounding >> 5 (/32) */
+    out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);
+    /* Two ST_UB8 calls: presumably 8 rows each -- confirm. */
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+    dst += (8 * dst_stride);
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+}
+
+#define INTRA_DC_TL_16x16(dir)                                        \
+void ff_dc_##dir##_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,      \
+                             const uint8_t *left,                     \
+                             const uint8_t *top)                      \
+{   /* DC of the 16 bytes at dir only, stored to all 16 rows. */      \
+    v16u8 data, out;                                                  \
+    v8u16 sum_h;                                                      \
+    v4u32 sum_w;                                                      \
+    v2u64 sum_d;                                                      \
+    /* dir expands to the parameter named top or left. */             \
+    data = LD_UB(dir);                                                \
+    sum_h = __msa_hadd_u_h(data, data);                               \
+    sum_w = __msa_hadd_u_w(sum_h, sum_h);                             \
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);                             \
+    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);      \
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);                             \
+    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 4);                  \
+    out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);                   \
+    /* Two ST_UB8 calls: presumably 8 rows each -- confirm. */        \
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);  \
+    dst += (8 * dst_stride);                                          \
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);  \
+}
+INTRA_DC_TL_16x16(top);
+INTRA_DC_TL_16x16(left);
+
+void ff_dc_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                     const uint8_t *src_left, const uint8_t *src_top)
+{   /* DC prediction from 32 top + 32 left bytes; fills 32 rows with the mean. */
+    uint32_t row;
+    v16u8 top0, top1, left0, left1, out;
+    v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
+    v4u32 sum_w;
+    v2u64 sum_d;
+    /* LD_UB2(p, 16, a, b): presumably loads from p and p + 16 -- confirm. */
+    LD_UB2(src_top, 16, top0, top1);
+    LD_UB2(src_left, 16, left0, left1);
+    HADD_UB2_UH(top0, top1, sum_top0, sum_top1);
+    HADD_UB2_UH(left0, left1, sum_left0, sum_left1);
+    sum_h = sum_top0 + sum_top1;
+    sum_h += sum_left0 + sum_left1;
+    sum_w = __msa_hadd_u_w(sum_h, sum_h);
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);  /* one partial sum per u64 lane */
+    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);  /* add the two partial sums */
+    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 6);  /* rounding >> 6 (/64) */
+    out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);
+    /* 16 iterations x 2 rows; each row is two vector stores. */
+    for (row = 16; row--;)
+    {
+        ST_UB2(out, out, dst, 16);
+        dst += dst_stride;
+        ST_UB2(out, out, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+#define INTRA_DC_TL_32x32(dir)                                    \
+void ff_dc_##dir##_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,  \
+                             const uint8_t *left,                 \
+                             const uint8_t *top)                  \
+{   /* DC of the 32 bytes at dir only, stored to all 32 rows. */  \
+    uint32_t row;                                                 \
+    v16u8 data0, data1, out;                                      \
+    v8u16 sum_h, sum_data0, sum_data1;                            \
+    v4u32 sum_w;                                                  \
+    v2u64 sum_d;                                                  \
+    /* dir expands to the parameter named top or left. */         \
+    LD_UB2(dir, 16, data0, data1);                                \
+    HADD_UB2_UH(data0, data1, sum_data0, sum_data1);              \
+    sum_h = sum_data0 + sum_data1;                                \
+    sum_w = __msa_hadd_u_w(sum_h, sum_h);                         \
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);                         \
+    sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);  \
+    sum_d = __msa_hadd_u_d(sum_w, sum_w);                         \
+    sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 5);              \
+    out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);               \
+    /* 16 iterations x 2 rows; each row is two vector stores. */  \
+    for (row = 16; row--;)                                        \
+    {                                                             \
+        ST_UB2(out, out, dst, 16);                                \
+        dst += dst_stride;                                        \
+        ST_UB2(out, out, dst, 16);                                \
+        dst += dst_stride;                                        \
+    }                                                             \
+}
+INTRA_DC_TL_32x32(top);
+INTRA_DC_TL_32x32(left);
+
+#define INTRA_PREDICT_VALDC_16X16_MSA(val)                             \
+void ff_dc_##val##_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,       \
+                             const uint8_t *left, const uint8_t *top)  \
+{   /* Fill the block with the constant val; left/top are not read. */ \
+    v16u8 out = (v16u8) __msa_ldi_b(val);                              \
+    /* Two ST_UB8 calls: presumably 8 rows each -- confirm. */         \
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);   \
+    dst += (8 * dst_stride);                                           \
+    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);   \
+}
+/* NOTE(review): 128/129 exceed int8; confirm ldi.b keeps the low 8 bits. */
+INTRA_PREDICT_VALDC_16X16_MSA(127);
+INTRA_PREDICT_VALDC_16X16_MSA(128);
+INTRA_PREDICT_VALDC_16X16_MSA(129);
+
+#define INTRA_PREDICT_VALDC_32X32_MSA(val)                             \
+void ff_dc_##val##_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,       \
+                             const uint8_t *left, const uint8_t *top)  \
+{   /* Fill the block with the constant val; left/top are not read. */ \
+    uint32_t row;                                                      \
+    v16u8 out = (v16u8) __msa_ldi_b(val);                              \
+    /* 16 iterations x 2 rows; each row is two vector stores. */       \
+    for (row = 16; row--;)                                             \
+    {                                                                  \
+        ST_UB2(out, out, dst, 16);                                     \
+        dst += dst_stride;                                             \
+        ST_UB2(out, out, dst, 16);                                     \
+        dst += dst_stride;                                             \
+    }                                                                  \
+}
+/* NOTE(review): 128/129 exceed int8; confirm ldi.b keeps the low 8 bits. */
+INTRA_PREDICT_VALDC_32X32_MSA(127);
+INTRA_PREDICT_VALDC_32X32_MSA(128);
+INTRA_PREDICT_VALDC_32X32_MSA(129);
+
+void ff_tm_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                   const uint8_t *src_left, const uint8_t *src_top_ptr)
+{   /* 4x4 TM: appears to form left + top - top_left per pixel, clamped. */
+    uint32_t left;
+    uint8_t top_left = src_top_ptr[-1];  /* byte just before the top row */
+    v16i8 src_top, src_left0, src_left1, src_left2, src_left3, tmp0, tmp1;
+    v16u8 src0, src1, src2, src3;
+    v8u16 src_top_left, vec0, vec1, vec2, vec3;
+    /* NOTE(review): LD_SB presumably reads a whole vector -- confirm bounds. */
+    src_top_left = (v8u16) __msa_fill_h(top_left);
+    src_top = LD_SB(src_top_ptr);
+    left = LW(src_left);
+    src_left0 = __msa_fill_b(left >> 24);  /* row 0 uses the top byte of left */
+    src_left1 = __msa_fill_b(left >> 16);
+    src_left2 = __msa_fill_b(left >> 8);
+    src_left3 = __msa_fill_b(left);
+    /* Pair each left byte with the top bytes; HADD presumably sums the pairs. */
+    ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
+               src_left3, src_top, src0, src1, src2, src3);
+    HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
+    SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);  /* presumably clamps to 255 */
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
+    ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
+}
+
+void ff_tm_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                   const uint8_t *src_left, const uint8_t *src_top_ptr)
+{   /* 8x8 TM: appears to form left + top - top_left per pixel, clamped. */
+    uint8_t top_left = src_top_ptr[-1];  /* byte just before the top row */
+    uint32_t loop_cnt, left;
+    v16i8 src_top, src_left0, src_left1, src_left2, src_left3, tmp0, tmp1;
+    v8u16 src_top_left, vec0, vec1, vec2, vec3;
+    v16u8 src0, src1, src2, src3;
+    /* NOTE(review): LD_SB presumably reads a whole vector -- confirm bounds. */
+    src_top = LD_SB(src_top_ptr);
+    src_top_left = (v8u16) __msa_fill_h(top_left);
+    /* src_left is read one word at a time: offset 4 first, then offset 0. */
+    src_left += 4;
+    for (loop_cnt = 2; loop_cnt--;) {
+        left = LW(src_left);
+        src_left0 = __msa_fill_b(left >> 24);
+        src_left1 = __msa_fill_b(left >> 16);
+        src_left2 = __msa_fill_b(left >> 8);
+        src_left3 = __msa_fill_b(left);
+        src_left -= 4;
+        /* Four rows per iteration, one per byte of left (top byte first). */
+        ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
+                   src_left3, src_top, src0, src1, src2, src3);
+        HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
+        SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);  /* presumably clamps to 255 */
+        PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+void ff_tm_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                     const uint8_t *src_left, const uint8_t *src_top_ptr)
+{   /* 16x16 TM: appears to form left + top - top_left per pixel, clamped. */
+    uint8_t top_left = src_top_ptr[-1];  /* byte just before the top row */
+    uint32_t loop_cnt, left;
+    v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
+    v8u16 src_top_left, res_r, res_l;
+    /* The top row and top_left are loaded once and reused for every row. */
+    src_top = LD_SB(src_top_ptr);
+    src_top_left = (v8u16) __msa_fill_h(top_left);
+    /* src_left is read one word at a time, from offset 12 down to 0. */
+    src_left += 12;
+    for (loop_cnt = 4; loop_cnt--;) {
+        left = LW(src_left);
+        src_left0 = __msa_fill_b(left >> 24);
+        src_left1 = __msa_fill_b(left >> 16);
+        src_left2 = __msa_fill_b(left >> 8);
+        src_left3 = __msa_fill_b(left);
+        src_left -= 4;
+        /* Row using bits 24-31 of left. */
+        ILVRL_B2_UH(src_left0, src_top, res_r, res_l);
+        HADD_UB2_UH(res_r, res_l, res_r, res_l);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+
+        SAT_UH2_UH(res_r, res_l, 7);  /* presumably clamps to 255 */
+        PCKEV_ST_SB(res_r, res_l, dst);
+        dst += dst_stride;
+        /* Row using bits 16-23 of left. */
+        ILVRL_B2_UH(src_left1, src_top, res_r, res_l);
+        HADD_UB2_UH(res_r, res_l, res_r, res_l);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+        SAT_UH2_UH(res_r, res_l, 7);
+        PCKEV_ST_SB(res_r, res_l, dst);
+        dst += dst_stride;
+        /* Row using bits 8-15 of left. */
+        ILVRL_B2_UH(src_left2, src_top, res_r, res_l);
+        HADD_UB2_UH(res_r, res_l, res_r, res_l);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+        SAT_UH2_UH(res_r, res_l, 7);
+        PCKEV_ST_SB(res_r, res_l, dst);
+        dst += dst_stride;
+        /* Row using bits 0-7 of left. */
+        ILVRL_B2_UH(src_left3, src_top, res_r, res_l);
+        HADD_UB2_UH(res_r, res_l, res_r, res_l);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+        SAT_UH2_UH(res_r, res_l, 7);
+        PCKEV_ST_SB(res_r, res_l, dst);
+        dst += dst_stride;
+    }
+}
+
+void ff_tm_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                     const uint8_t *src_left, const uint8_t *src_top_ptr)
+{   /* 32x32 TM: appears to form left + top - top_left per pixel, clamped. */
+    uint8_t top_left = src_top_ptr[-1];  /* byte just before the top row */
+    uint32_t loop_cnt, left;
+    v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;
+    v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1;
+    /* The top row (two vectors) and top_left are loaded once and reused. */
+    src_top0 = LD_SB(src_top_ptr);
+    src_top1 = LD_SB(src_top_ptr + 16);
+    src_top_left = (v8u16) __msa_fill_h(top_left);
+    /* src_left is read one word at a time, from offset 28 down to 0. */
+    src_left += 28;
+    for (loop_cnt = 8; loop_cnt--;) {
+        left = LW(src_left);
+        src_left0 = __msa_fill_b(left >> 24);
+        src_left1 = __msa_fill_b(left >> 16);
+        src_left2 = __msa_fill_b(left >> 8);
+        src_left3 = __msa_fill_b(left);
+        src_left -= 4;
+        /* Row using bits 24-31 of left; stored at dst and dst + 16. */
+        ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1);
+        ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1);
+        HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
+                    res_l1);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+        SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);  /* presumably <= 255 */
+        PCKEV_ST_SB(res_r0, res_l0, dst);
+        PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+        dst += dst_stride;
+        /* Row using bits 16-23 of left. */
+        ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1);
+        ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1);
+        HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
+                    res_l1);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+        SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+        PCKEV_ST_SB(res_r0, res_l0, dst);
+        PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+        dst += dst_stride;
+        /* Row using bits 8-15 of left. */
+        ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1);
+        ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1);
+        HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
+                    res_l1);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+        SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+        PCKEV_ST_SB(res_r0, res_l0, dst);
+        PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+        dst += dst_stride;
+        /* Row using bits 0-7 of left. */
+        ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1);
+        ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1);
+        HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
+                    res_l1);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+        IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+        SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+        PCKEV_ST_SB(res_r0, res_l0, dst);
+        PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+        dst += dst_stride;
+    }
+}
diff --git a/libavcodec/mips/vp9_lpf_msa.c b/libavcodec/mips/vp9_lpf_msa.c
new file mode 100644
index 0000000..eef8afc
--- /dev/null
+++ b/libavcodec/mips/vp9_lpf_msa.c
@@ -0,0 +1,2599 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/vp9dsp.h"
+#include "libavutil/mips/generic_macros_msa.h"
+#include "vp9dsp_mips.h"
+/* 4-tap loop filter, valid in the low 8 byte lanes only; inverts hev_in in place */
+#define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in,  \
+                           p1_out, p0_out, q0_out, q1_out)               \
+{                                                                        \
+    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;                  \
+    v16i8 filt, filt1, filt2, cnst4b, cnst3b;                            \
+    v8i16 q0_sub_p0_r, filt_r, cnst3h;                                   \
+    /* flip bit 7 so the unsigned pixels can be handled as signed */     \
+    p1_m = (v16i8) __msa_xori_b(p1_in, 0x80);                            \
+    p0_m = (v16i8) __msa_xori_b(p0_in, 0x80);                            \
+    q0_m = (v16i8) __msa_xori_b(q0_in, 0x80);                            \
+    q1_m = (v16i8) __msa_xori_b(q1_in, 0x80);                            \
+    /* filt = sat(p1 - q1), kept only where hev_in is set */             \
+    filt = __msa_subs_s_b(p1_m, q1_m);                                   \
+    filt = filt & (v16i8) hev_in;                                        \
+    q0_sub_p0 = q0_m - p0_m;                                             \
+    filt_sign = __msa_clti_s_b(filt, 0);                                 \
+    /* widen low 8 lanes to 16 bit: filt + 3 * (q0 - p0), clamp to s8 */ \
+    cnst3h = __msa_ldi_h(3);                                             \
+    q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0, q0_sub_p0);            \
+    q0_sub_p0_r = __msa_dotp_s_h((v16i8) q0_sub_p0_r, (v16i8) cnst3h);   \
+    filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt);                      \
+    filt_r += q0_sub_p0_r;                                               \
+    filt_r = __msa_sat_s_h(filt_r, 7);                                   \
+                                                                         \
+    /* narrow back to bytes; only the right (low) half was computed */   \
+    filt = __msa_pckev_b((v16i8) filt_r, (v16i8) filt_r);                \
+    /* filt1 = (filt + 4) >> 3, filt2 = (filt + 3) >> 3 (saturating) */  \
+    filt = filt & (v16i8) mask_in;                                       \
+    cnst4b = __msa_ldi_b(4);                                             \
+    filt1 = __msa_adds_s_b(filt, cnst4b);                                \
+    filt1 >>= 3;                                                         \
+                                                                         \
+    cnst3b = __msa_ldi_b(3);                                             \
+    filt2 = __msa_adds_s_b(filt, cnst3b);                                \
+    filt2 >>= 3;                                                         \
+    /* q0 -= filt1, p0 += filt2, then undo the bit-7 flip */             \
+    q0_m = __msa_subs_s_b(q0_m, filt1);                                  \
+    q0_out = __msa_xori_b((v16u8) q0_m, 0x80);                           \
+    p0_m = __msa_adds_s_b(p0_m, filt2);                                  \
+    p0_out = __msa_xori_b((v16u8) p0_m, 0x80);                           \
+    /* outer taps use (filt1 + 1) >> 1 where hev_in is clear */          \
+    filt = __msa_srari_b(filt1, 1);                                      \
+    hev_in = __msa_xori_b((v16u8) hev_in, 0xff);                         \
+    filt = filt & (v16i8) hev_in;                                        \
+                                                                         \
+    q1_m = __msa_subs_s_b(q1_m, filt);                                   \
+    q1_out = __msa_xori_b((v16u8) q1_m, 0x80);                           \
+    p1_m = __msa_adds_s_b(p1_m, filt);                                   \
+    p1_out = __msa_xori_b((v16u8) p1_m, 0x80);                           \
+}
+/* 4-tap loop filter on all 16 byte lanes; inverts hev_in in place as a side effect */
+#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in,  \
+                           p1_out, p0_out, q0_out, q1_out)               \
+{                                                                        \
+    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;                  \
+    v16i8 filt, filt1, filt2, cnst4b, cnst3b;                            \
+    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h;              \
+    /* flip bit 7 so the unsigned pixels can be handled as signed */     \
+    p1_m = (v16i8) __msa_xori_b(p1_in, 0x80);                            \
+    p0_m = (v16i8) __msa_xori_b(p0_in, 0x80);                            \
+    q0_m = (v16i8) __msa_xori_b(q0_in, 0x80);                            \
+    q1_m = (v16i8) __msa_xori_b(q1_in, 0x80);                            \
+    /* filt = sat(p1 - q1), kept only where hev_in is set */             \
+    filt = __msa_subs_s_b(p1_m, q1_m);                                   \
+                                                                         \
+    filt = filt & (v16i8) hev_in;                                        \
+                                                                         \
+    q0_sub_p0 = q0_m - p0_m;                                             \
+    filt_sign = __msa_clti_s_b(filt, 0);                                 \
+    /* right (low 8) lanes: filt + 3 * (q0 - p0), clamped to s8 */       \
+    cnst3h = __msa_ldi_h(3);                                             \
+    q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0, q0_sub_p0);            \
+    q0_sub_p0_r = __msa_dotp_s_h((v16i8) q0_sub_p0_r, (v16i8) cnst3h);   \
+    filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt);                      \
+    filt_r += q0_sub_p0_r;                                               \
+    filt_r = __msa_sat_s_h(filt_r, 7);                                   \
+    /* left (high 8) lanes: same computation */                          \
+    q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0, q0_sub_p0);            \
+    q0_sub_p0_l = __msa_dotp_s_h((v16i8) q0_sub_p0_l, (v16i8) cnst3h);   \
+    filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt);                      \
+    filt_l += q0_sub_p0_l;                                               \
+    filt_l = __msa_sat_s_h(filt_l, 7);                                   \
+    /* narrow both halves back to 16 bytes and apply mask */             \
+    filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r);                \
+    filt = filt & (v16i8) mask_in;                                       \
+    /* filt1 = (filt + 4) >> 3, filt2 = (filt + 3) >> 3 (saturating) */  \
+    cnst4b = __msa_ldi_b(4);                                             \
+    filt1 = __msa_adds_s_b(filt, cnst4b);                                \
+    filt1 >>= 3;                                                         \
+                                                                         \
+    cnst3b = __msa_ldi_b(3);                                             \
+    filt2 = __msa_adds_s_b(filt, cnst3b);                                \
+    filt2 >>= 3;                                                         \
+    /* q0 -= filt1, p0 += filt2, then undo the bit-7 flip */             \
+    q0_m = __msa_subs_s_b(q0_m, filt1);                                  \
+    q0_out = __msa_xori_b((v16u8) q0_m, 0x80);                           \
+    p0_m = __msa_adds_s_b(p0_m, filt2);                                  \
+    p0_out = __msa_xori_b((v16u8) p0_m, 0x80);                           \
+    /* outer taps use (filt1 + 1) >> 1 where hev_in is clear */          \
+    filt = __msa_srari_b(filt1, 1);                                      \
+    hev_in = __msa_xori_b((v16u8) hev_in, 0xff);                         \
+    filt = filt & (v16i8) hev_in;                                        \
+                                                                         \
+    q1_m = __msa_subs_s_b(q1_m, filt);                                   \
+    q1_out = __msa_xori_b((v16u8) q1_m, 0x80);                           \
+    p1_m = __msa_adds_s_b(p1_m, filt);                                   \
+    p1_out = __msa_xori_b((v16u8) p1_m, 0x80);                           \
+}
+/* flat_out is in/out; NOTE: also reads a variable named `mask` from the caller's scope */
+#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out)  \
+{                                                                      \
+    v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0;     \
+    v16u8 zero_in = { 0 };                                             \
+    /* tmp = all lanes 1 (the flatness threshold) */                   \
+    tmp = __msa_ori_b(zero_in, 1);                                     \
+    p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in);                        \
+    q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in);                        \
+    p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in);                        \
+    q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in);                        \
+    /* running max, seeded with the incoming flat_out value */         \
+    p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0);             \
+    flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out);                   \
+    p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0);             \
+    flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out);                   \
+    /* 0xff where max <= 1, then AND with `mask` (not a parameter) */  \
+    flat_out = (tmp < (v16u8) flat_out);                               \
+    flat_out = __msa_xori_b(flat_out, 0xff);                           \
+    flat_out = flat_out & (mask);                                      \
+}
+/* flat2_out = lanes where p4..p7 / q4..q7 stay within 1 of p0 / q0, ANDed with flat_in */
+#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in,  \
+                  q5_in, q6_in, q7_in, flat_in, flat2_out)          \
+{                                                                   \
+    v16u8 tmp, zero_in = { 0 };                                     \
+    v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0;       \
+    v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0;       \
+    /* tmp = all lanes 1 (the flatness threshold) */                \
+    tmp = __msa_ori_b(zero_in, 1);                                  \
+    p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in);                     \
+    q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in);                     \
+    p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in);                     \
+    q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in);                     \
+    p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in);                     \
+    q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in);                     \
+    p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in);                     \
+    q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in);                     \
+    /* unsigned max over the eight absolute differences */          \
+    p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0);          \
+    flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0);            \
+    flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out);              \
+    p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0);          \
+    flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out);              \
+    p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0);          \
+    flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out);              \
+    /* 0xff where max <= 1, restricted to lanes set in flat_in */   \
+    flat2_out = (tmp < (v16u8) flat2_out);                          \
+    flat2_out = __msa_xori_b(flat2_out, 0xff);                      \
+    flat2_out = flat2_out & flat_in;                                \
+}
+
+#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in,                \
+                    q0_in, q1_in, q2_in, q3_in,                \
+                    p2_filt8_out, p1_filt8_out, p0_filt8_out,  \
+                    q0_filt8_out, q1_filt8_out, q2_filt8_out)  \
+{                                                              \
+    v8u16 tmp0, tmp1, tmp2;                                    \
+    /* each output below is the listed sum, rounded >> 3 */    \
+    tmp2 = p2_in + p1_in + p0_in;                              \
+    tmp0 = p3_in << 1;                                         \
+    /* p2' <- 3*p3 + 2*p2 + p1 + p0 + q0 */                    \
+    tmp0 = tmp0 + tmp2 + q0_in;                                \
+    tmp1 = tmp0 + p3_in + p2_in;                               \
+    p2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
+    /* p1' <- 2*p3 + p2 + 2*p1 + p0 + q0 + q1 */               \
+    tmp1 = tmp0 + p1_in + q1_in;                               \
+    p1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
+    /* p0' <- p3 + p2 + p1 + 2*p0 + q0 + q1 + q2 */            \
+    tmp1 = q2_in + q1_in + q0_in;                              \
+    tmp2 = tmp2 + tmp1;                                        \
+    tmp0 = tmp2 + (p0_in);                                     \
+    tmp0 = tmp0 + (p3_in);                                     \
+    p0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp0, 3);     \
+    /* q2' <- p0 + q0 + q1 + 2*q2 + 3*q3 */                    \
+    tmp0 = q2_in + q3_in;                                      \
+    tmp0 = p0_in + tmp1 + tmp0;                                \
+    tmp1 = q3_in + q3_in;                                      \
+    tmp1 = tmp1 + tmp0;                                        \
+    q2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
+    /* q0' <- p2 + p1 + p0 + 2*q0 + q1 + q2 + q3 */            \
+    tmp0 = tmp2 + q3_in;                                       \
+    tmp1 = tmp0 + q0_in;                                       \
+    q0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
+    /* q1' <- p1 + p0 + q0 + 2*q1 + q2 + 2*q3 */               \
+    tmp1 = tmp0 - p2_in;                                       \
+    tmp0 = q1_in + q3_in;                                      \
+    tmp1 = tmp0 + tmp1;                                        \
+    q1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
+}
+/* Per-lane hev and filter masks; flat_out is left as max(|p1-p0|, |q1-q0|) */
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,                   \
+                     q0_in, q1_in, q2_in, q3_in,                   \
+                     limit_in, b_limit_in, thresh_in,              \
+                     hev_out, mask_out, flat_out)                  \
+{                                                                  \
+    v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;  \
+    v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;  \
+                                                                   \
+    /* absolute subtraction of pixel values */                     \
+    p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in);                   \
+    p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in);                   \
+    p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in);                   \
+    q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in);                   \
+    q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in);                   \
+    q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in);                   \
+    p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in);                   \
+    p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in);                   \
+                                                                   \
+    /* hev: 0xff where max(|p1-p0|, |q1-q0|) > thresh */           \
+    flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);          \
+    hev_out = thresh_in < (v16u8) flat_out;                        \
+                                                                   \
+    /* mask term 1: 2 * |p0-q0| + |p1-q1| / 2 (saturating adds) */ \
+    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);     \
+    p1_asub_q1_m >>= 1;                                            \
+    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);     \
+    /* 0xff if that exceeds b_limit; folded into a running max */  \
+    mask_out = b_limit_in < p0_asub_q0_m;                          \
+    mask_out = __msa_max_u_b(flat_out, mask_out);                  \
+    p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);      \
+    mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);              \
+    q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);      \
+    mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);              \
+    /* mask: 0xff where the running max does not exceed limit */   \
+    mask_out = limit_in < (v16u8) mask_out;                        \
+    mask_out = __msa_xori_b(mask_out, 0xff);                       \
+}
+/* 4-tap loop filter over 8 columns, across the row boundary just above src */
+void ff_loop_filter_v_4_8_msa(uint8_t *src, ptrdiff_t pitch,
+                              int32_t b_limit_ptr,
+                              int32_t limit_ptr,
+                              int32_t thresh_ptr)
+{
+    uint64_t p1_d, p0_d, q0_d, q1_d;
+    v16u8 mask, hev, flat, thresh, b_limit, limit;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
+
+    /* load p3..q3 starting 4 rows above src (LD_UB8: presumably one row per pitch) */
+    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+    /* the `_ptr` arguments are plain values here; replicate each one's low byte */
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+    /* keep only doubleword 0 (8 bytes) of each filtered row for the store */
+    p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
+    p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
+    q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
+    q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
+    SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
+}
+
+/* 16-column 4-tap filter; each arg packs two bytes: low 8 columns, then high 8 */
+void ff_loop_filter_v_44_16_msa(uint8_t *src, ptrdiff_t pitch,
+                                int32_t b_limit_ptr,
+                                int32_t limit_ptr,
+                                int32_t thresh_ptr)
+{
+    v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+
+    /* load p3..q3 starting 4 rows above src (LD_UB8: presumably one row per pitch) */
+    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+    /* low doubleword <- fill of arg's low byte, high doubleword <- fill of arg >> 8 */
+    thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
+    thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
+    thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);
+
+    b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
+    b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
+    b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);
+
+    limit0 = (v16u8) __msa_fill_b(limit_ptr);
+    limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
+    limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
+                 hev, mask, flat);
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+    /* p1..q1 were filtered in place; ST_UB4 presumably stores four full rows */
+    ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
+}
+/* 8 columns: 4-tap filter, replaced by the 8-tap filter in lanes where flat is set */
+void ff_loop_filter_v_8_8_msa(uint8_t *src, ptrdiff_t pitch,
+                              int32_t b_limit_ptr,
+                              int32_t limit_ptr,
+                              int32_t thresh_ptr)
+{
+    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
+    v16u8 mask, hev, flat, thresh, b_limit, limit;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    v8i16 p2_filter8, p1_filter8, p0_filter8;
+    v8i16 q0_filter8, q1_filter8, q2_filter8;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
+    v16i8 zero = { 0 };
+
+    /* load p3..q3 starting 4 rows above src (LD_UB8: presumably one row per pitch) */
+    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+    /* the `_ptr` arguments are plain values here; replicate each one's low byte */
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+    /* only the low 8 lanes are used: clear the high doubleword of flat */
+    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
+        p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
+        q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
+        q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
+        SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
+    } else {
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
+                   q2_r, q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
+                    p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
+
+        /* convert 16 bit output data into 8 bit */
+        PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
+                    zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
+                    q0_filter8);
+        PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
+
+        /* 8-tap result where flat is set, else 4-tap (p1..q1) or input (p2, q2) */
+        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
+        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
+        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
+        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
+        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
+        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);
+        /* extract doubleword 0 (8 bytes) of each of the six rows */
+        p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
+        p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
+        p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
+        q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
+        q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
+        q2_d = __msa_copy_u_d((v2i64) q2_out, 0);
+
+        src -= 3 * pitch;
+        /* rows p2..q0 via SD4, then q1 and q2 one row at a time */
+        SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
+        src += (4 * pitch);
+        SD(q1_d, src);
+        src += pitch;
+        SD(q2_d, src);
+    }
+}
+/* 16 columns, 4-tap or 8-tap chosen per lane; each arg packs one byte per 8-col half */
+void ff_loop_filter_v_88_16_msa(uint8_t *src, ptrdiff_t pitch,
+                                int32_t b_limit_ptr,
+                                int32_t limit_ptr,
+                                int32_t thresh_ptr)
+{
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
+    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
+    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
+    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
+    v16u8 zero = { 0 };
+
+    /* load p3..q3 starting 4 rows above src (LD_UB8: presumably one row per pitch) */
+    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+    /* low doubleword <- fill of arg's low byte, high doubleword <- fill of arg >> 8 */
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
+    thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
+    b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
+
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+    tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
+    limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
+
+    /* mask and hev */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
+    } else {
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
+                   q2_r, q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+        /* same 8-tap filter again for the left (high 8) lanes */
+        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
+                   p0_l);
+        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
+                   q3_l);
+        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+        /* convert 16 bit output data into 8 bit */
+        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                    p0_filt8_r, q0_filt8_r);
+        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r,
+                    q1_filt8_r, q2_filt8_r);
+
+        /* 8-tap result where flat is set, else 4-tap (p1..q1) or input (p2, q2) */
+        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
+        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
+        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
+        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
+        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
+        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
+
+        src -= 3 * pitch;
+
+        ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
+        src += (4 * pitch);
+        ST_UB2(q1_out, q2_out, src, pitch);
+        src += (2 * pitch); /* NOTE(review): src is not read again after this */
+    }
+}
+/* 16 columns: the low 8 may take the 8-tap filter, the high 8 only get the 4-tap */
+void ff_loop_filter_v_84_16_msa(uint8_t *src, ptrdiff_t pitch,
+                                int32_t b_limit_ptr,
+                                int32_t limit_ptr,
+                                int32_t thresh_ptr)
+{
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
+    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
+    v16u8 zero = { 0 };
+
+    /* load p3..q3 starting 4 rows above src (LD_UB8: presumably one row per pitch) */
+    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+    /* low doubleword <- fill of arg's low byte, high doubleword <- fill of arg >> 8 */
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
+    thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
+    b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
+
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+    tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
+    limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
+
+    /* mask and hev */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+    /* clear flat's high doubleword: the high 8 lanes never take the 8-tap output */
+    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
+    } else {
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
+                   q2_r, q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+        /* convert 16 bit output data into 8 bit (same source for both halves) */
+        PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
+                    p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
+                    p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
+        PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
+                    q1_filt8_r, q2_filt8_r);
+
+        /* 8-tap result where flat is set, else 4-tap (p1..q1) or input (p2, q2) */
+        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
+        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
+        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
+        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
+        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
+        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
+
+        src -= 3 * pitch;
+
+        ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
+        src += (4 * pitch);
+        ST_UB2(q1_out, q2_out, src, pitch);
+        src += (2 * pitch); /* NOTE(review): src is not read again after this */
+    }
+}
+/* 16 columns: the low 8 only get the 4-tap filter, the high 8 may take the 8-tap */
+void ff_loop_filter_v_48_16_msa(uint8_t *src, ptrdiff_t pitch,
+                                int32_t b_limit_ptr,
+                                int32_t limit_ptr,
+                                int32_t thresh_ptr)
+{
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
+    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
+    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
+    v16u8 zero = { 0 };
+
+    /* load p3..q3 starting 4 rows above src (LD_UB8: presumably one row per pitch) */
+    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+    /* low doubleword <- fill of arg's low byte, high doubleword <- fill of arg >> 8 */
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
+    thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
+    b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
+
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+    tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
+    limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
+
+    /* mask and hev */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+    /* zero doubleword 0 of flat: the low 8 lanes never take the 8-tap output */
+    flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
+    } else {
+        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
+                   p0_l);
+        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
+                   q3_l);
+        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+        /* convert 16 bit output data into 8 bit (same source for both halves) */
+        PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
+                    p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
+                    p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
+        PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
+                    q1_filt8_l, q2_filt8_l);
+
+        /* 8-tap result where flat is set, else 4-tap (p1..q1) or input (p2, q2) */
+        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
+        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
+        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
+        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
+        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
+        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);
+
+        src -= 3 * pitch;
+
+        ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
+        src += (4 * pitch);
+        ST_UB2(q1_out, q2_out, src, pitch);
+        src += (2 * pitch); /* NOTE(review): src is not read again after this */
+    }
+}
+
+static int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *src, ptrdiff_t pitch,
+                                        uint8_t *filter48, /* out: scratch, written only when 0 is returned */
+                                        int32_t b_limit_ptr, /* only the low byte is used */
+                                        int32_t limit_ptr, /* only the low byte is used */
+                                        int32_t thresh_ptr) /* only the low byte is used */
+{ /* First stage for a 16-pixel-wide edge between rows src - pitch and src; returns 1 if src was written, 0 if filter48 was filled instead. */
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    v16u8 flat, mask, hev, thresh, b_limit, limit;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
+    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
+    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
+    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
+    v16u8 zero = { 0 };
+
+    /* load the eight rows p3..q3 around the edge, starting four rows above src */
+    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+    /* broadcast the low byte of each scalar to all 16 lanes */
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+
+    /* mask and hev; flat is also handed to this macro and then to VP9_FLAT4 (both defined outside this chunk) */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
+        /* non-zero return: ff_loop_filter_v_16_16_msa skips the second stage */
+        return 1;
+    } else {
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
+                   q2_r, q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+        /* same filter on the left (_l) halves of the vectors */
+        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
+                   p0_l);
+        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
+                   q3_l);
+        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+        /* convert 16 bit output data into 8 bit; the packed _l/_r pairs land in the *_filt8_r variables */
+        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                    p0_filt8_r, q0_filt8_r);
+        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+                    q2_filt8_r);
+
+        /* blend: filter8 result where flat bits are set, otherwise the filter4 result (the input row for p2/q2) */
+        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
+        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
+        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
+        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
+        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
+        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
+        /* park p2..q2 at a 16-byte stride (offsets 0..80), then flat at offset 96, for vp9_hz_lpf_t16_16w */
+        ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
+        filter48 += (4 * 16);
+        ST_UB2(q1_out, q2_out, filter48, 16);
+        filter48 += (2 * 16);
+        ST_UB(flat, filter48);
+        /* nothing was stored to src on this path */
+        return 0;
+    }
+}
+
+static void vp9_hz_lpf_t16_16w(uint8_t *src, ptrdiff_t pitch, uint8_t *filter48)
+{ /* Second stage for a 16-pixel-wide edge between rows src - pitch and src; filter48 is the scratch filled by vp9_hz_lpf_t4_and_t8_16w. */
+    v16u8 flat, flat2, filter8; /* filter8 holds one saved first-stage row at a time */
+    v16i8 zero = { 0 };
+    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+    v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
+    v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
+    v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
+    v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
+    v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
+    v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
+    v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
+    v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
+    v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l; /* tmp1_* carry the running window sum, tmp0_* the per-row delta */
+    v8i16 l_out, r_out;
+
+    flat = LD_UB(filter48 + 96); /* flat mask saved by the first stage */
+    /* p7..p0 are the eight rows ending just above src, q0..q7 the eight rows starting at src */
+    LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
+    LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
+    VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+    /* if flat2 is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat2)) {
+        LD_UB4(filter48, 16, p2, p1, p0, q0);
+        LD_UB2(filter48 + 4 * 16, 16, q1, q2);
+        /* copy the saved first-stage rows p2..q2 to src - 3 * pitch .. src + 2 * pitch */
+        src -= 3 * pitch;
+        ST_UB4(p2, p1, p0, q0, src, pitch);
+        src += (4 * pitch);
+        ST_UB2(q1, q2, src, pitch);
+    } else {
+        src -= 7 * pitch;
+        /* src now addresses row p6; fourteen rows p6..q6 are written below, one per step */
+        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
+                   zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
+                   p3_r_in, p2_r_in, p1_r_in, p0_r_in);
+
+        q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
+        /* p6: window sum = 7 * p7 + 2 * p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0, then rounding shift right by 4 */
+        tmp0_r = p7_r_in << 3;
+        tmp0_r -= p7_r_in;
+        tmp0_r += p6_r_in;
+        tmp0_r += q0_r_in;
+        tmp1_r = p6_r_in + p5_r_in;
+        tmp1_r += p4_r_in;
+        tmp1_r += p3_r_in;
+        tmp1_r += p2_r_in;
+        tmp1_r += p1_r_in;
+        tmp1_r += p0_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        /* same computation on the left (_l) halves */
+        ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
+                   p5_l_in, p4_l_in);
+        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
+                   p1_l_in, p0_l_in);
+        q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0);
+
+        tmp0_l = p7_l_in << 3;
+        tmp0_l -= p7_l_in;
+        tmp0_l += p6_l_in;
+        tmp0_l += q0_l_in;
+        tmp1_l = p6_l_in + p5_l_in;
+        tmp1_l += p4_l_in;
+        tmp1_l += p3_l_in;
+        tmp1_l += p2_l_in;
+        tmp1_l += p1_l_in;
+        tmp1_l += p0_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        /* pack both halves back to bytes; take the new value where flat2 bits are set, else keep the input row */
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
+        ST_UB(p6, src);
+        src += pitch;
+
+        /* p5: window += p5 + q1 - p6 - p7 */
+        q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
+        tmp0_r = p5_r_in - p6_r_in;
+        tmp0_r += q1_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1);
+        tmp0_l = p5_l_in - p6_l_in;
+        tmp0_l += q1_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
+        ST_UB(p5, src);
+        src += pitch;
+
+        /* p4: window += p4 + q2 - p5 - p7 */
+        q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
+        tmp0_r = p4_r_in - p5_r_in;
+        tmp0_r += q2_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = (v8i16) __msa_srari_h((v8i16) tmp1_r, 4);
+
+        q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
+        tmp0_l = p4_l_in - p5_l_in;
+        tmp0_l += q2_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
+        ST_UB(p4, src);
+        src += pitch;
+
+        /* p3: window += p3 + q3 - p4 - p7 */
+        q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
+        tmp0_r = p3_r_in - p4_r_in;
+        tmp0_r += q3_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
+        tmp0_l = p3_l_in - p4_l_in;
+        tmp0_l += q3_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
+        ST_UB(p3, src);
+        src += pitch;
+
+        /* p2: window += p2 + q4 - p3 - p7; from here to q2 the fallback is the saved first-stage row, not the input */
+        q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
+        filter8 = LD_UB(filter48); /* saved p2 */
+        tmp0_r = p2_r_in - p3_r_in;
+        tmp0_r += q4_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
+        tmp0_l = p2_l_in - p3_l_in;
+        tmp0_l += q4_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += pitch;
+
+        /* p1: window += p1 + q5 - p2 - p7 */
+        q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
+        filter8 = LD_UB(filter48 + 16); /* saved p1 */
+        tmp0_r = p1_r_in - p2_r_in;
+        tmp0_r += q5_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
+        tmp0_l = p1_l_in - p2_l_in;
+        tmp0_l += q5_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += pitch;
+
+        /* p0: window += p0 + q6 - p1 - p7 */
+        q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
+        filter8 = LD_UB(filter48 + 32); /* saved p0 */
+        tmp0_r = p0_r_in - p1_r_in;
+        tmp0_r += q6_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
+        tmp0_l = p0_l_in - p1_l_in;
+        tmp0_l += q6_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += pitch;
+
+        /* q0: window += q7 + q0 - p0 - p7 */
+        q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
+        filter8 = LD_UB(filter48 + 48); /* saved q0 */
+        tmp0_r = q7_r_in - p0_r_in;
+        tmp0_r += q0_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
+        tmp0_l = q7_l_in - p0_l_in;
+        tmp0_l += q0_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += pitch;
+
+        /* q1: window += q7 + q1 - q0 - p6 */
+        filter8 = LD_UB(filter48 + 64); /* saved q1 */
+        tmp0_r = q7_r_in - q0_r_in;
+        tmp0_r += q1_r_in;
+        tmp0_r -= p6_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        tmp0_l = q7_l_in - q0_l_in;
+        tmp0_l += q1_l_in;
+        tmp0_l -= p6_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += pitch;
+
+        /* q2: window += q7 + q2 - q1 - p5 */
+        filter8 = LD_UB(filter48 + 80); /* saved q2 */
+        tmp0_r = q7_r_in - q1_r_in;
+        tmp0_r += q2_r_in;
+        tmp0_r -= p5_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        tmp0_l = q7_l_in - q1_l_in;
+        tmp0_l += q2_l_in;
+        tmp0_l -= p5_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += pitch;
+
+        /* q3: window += q7 + q3 - q2 - p4; from here on the fallback is the input row again */
+        tmp0_r = q7_r_in - q2_r_in;
+        tmp0_r += q3_r_in;
+        tmp0_r -= p4_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        tmp0_l = q7_l_in - q2_l_in;
+        tmp0_l += q3_l_in;
+        tmp0_l -= p4_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
+        ST_UB(q3, src);
+        src += pitch;
+
+        /* q4: window += q7 + q4 - q3 - p3 */
+        tmp0_r = q7_r_in - q3_r_in;
+        tmp0_r += q4_r_in;
+        tmp0_r -= p3_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        tmp0_l = q7_l_in - q3_l_in;
+        tmp0_l += q4_l_in;
+        tmp0_l -= p3_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
+        ST_UB(q4, src);
+        src += pitch;
+
+        /* q5: window += q7 + q5 - q4 - p2 */
+        tmp0_r = q7_r_in - q4_r_in;
+        tmp0_r += q5_r_in;
+        tmp0_r -= p2_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        tmp0_l = q7_l_in - q4_l_in;
+        tmp0_l += q5_l_in;
+        tmp0_l -= p2_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
+        ST_UB(q5, src);
+        src += pitch;
+
+        /* q6: window += q7 + q6 - q5 - p1 */
+        tmp0_r = q7_r_in - q5_r_in;
+        tmp0_r += q6_r_in;
+        tmp0_r -= p1_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+
+        tmp0_l = q7_l_in - q5_l_in;
+        tmp0_l += q6_l_in;
+        tmp0_l -= p1_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
+        ST_UB(q6, src);
+    }
+}
+
+/* Two-stage, 16-pixel-wide loop filter for the edge between rows src - pitch and src. */
+void ff_loop_filter_v_16_16_msa(uint8_t *src, ptrdiff_t pitch,
+                                int32_t b_limit_ptr, int32_t limit_ptr,
+                                int32_t thresh_ptr)
+{
+    /* stage-one scratch: rows p2..q2 at a 16-byte stride, then the flat mask */
+    uint8_t filter48[16 * 8] ALLOC_ALIGNED(ALIGNMENT);
+    uint8_t stage1_done = vp9_hz_lpf_t4_and_t8_16w(src, pitch, filter48,
+                                                   b_limit_ptr, limit_ptr,
+                                                   thresh_ptr);
+    /* non-zero: stage one already stored its result to src, nothing left to do */
+    if (stage1_done)
+        return;
+    vp9_hz_lpf_t16_16w(src, pitch, filter48);
+}
+
+void ff_loop_filter_v_16_8_msa(uint8_t *src, ptrdiff_t pitch,
+                               int32_t b_limit_ptr, /* only the low byte is used */
+                               int32_t limit_ptr, /* only the low byte is used */
+                               int32_t thresh_ptr) /* only the low byte is used */
+{ /* 8-pixel-wide filter for the edge between rows src - pitch and src; only the low 8 bytes of each row are stored. */
+    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
+    uint64_t dword0, dword1;
+    v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
+    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    v16u8 p0_filter16, p1_filter16; /* reused for every pair of output rows, not only p0/p1 */
+    v8i16 p2_filter8, p1_filter8, p0_filter8;
+    v8i16 q0_filter8, q1_filter8, q2_filter8;
+    v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
+    v16i8 zero = { 0 };
+    v8u16 tmp0, tmp1, tmp2; /* tmp1 carries the running window sum, tmp0/tmp2 the per-row deltas */
+
+    /* load the eight rows p3..q3 around the edge, starting four rows above src */
+    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+    /* keep flat for the low 8 lanes only; the high 8 lanes are forced to zero */
+    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
+        p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
+        q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
+        q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
+        SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
+    } else {
+        /* convert 8 bit input data into 16 bit */
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero,
+                   q1, zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r,
+                   q1_r, q2_r, q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r,
+                    p2_filter8, p1_filter8, p0_filter8, q0_filter8,
+                    q1_filter8, q2_filter8);
+
+        /* convert 16 bit output data into 8 bit */
+        PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
+                    zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
+                    q0_filter8);
+        PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8,
+                    q2_filter8);
+
+        /* blend: filter8 result where flat bits are set, otherwise the filter4 result (the input row for p2/q2) */
+        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
+        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
+        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
+        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
+        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
+        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);
+
+        /* load the eight outer rows p7..p4 and q4..q7 used by the wide filter */
+        LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
+        LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);
+
+        VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+        /* if flat2 is zero for all pixels, then no need to calculate other filter */
+        if (__msa_test_bz_v(flat2)) {
+            p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
+            p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
+            p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
+            q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
+            q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
+            q2_d = __msa_copy_u_d((v2i64) q2_out, 0);
+            /* six rows p2..q2: src - 3 * pitch up to src + 2 * pitch */
+            SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
+            SD(q1_d, src + pitch);
+            SD(q2_d, src + 2 * pitch);
+        } else {
+            /* LSB(right) 8 pixel operation: widen the outer rows (the inner rows were widened above) */
+            ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4,
+                       zero, q5, zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r,
+                       q4_r, q5_r, q6_r, q7_r);
+            /* tmp0 = 7 * p7 + p6 + q0 */
+            tmp0 = p7_r << 3;
+            tmp0 -= p7_r;
+            tmp0 += p6_r;
+            tmp0 += q0_r;
+            /* src now addresses row p6; fourteen rows p6..q6 are written below, two per step */
+            src -= 7 * pitch;
+
+            /* calculation of p6 and p5: each output is the window sum with a rounding shift right by 4 */
+            tmp1 = p6_r + p5_r + p4_r + p3_r;
+            tmp1 += (p2_r + p1_r + p0_r);
+            tmp1 += tmp0;
+            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            tmp0 = p5_r - p6_r + q1_r - p7_r;
+            tmp1 += tmp0;
+            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
+                        p0_filter16, p1_filter16);
+            p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
+            p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
+            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
+            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
+            SD(dword0, src);
+            src += pitch;
+            SD(dword1, src);
+            src += pitch;
+
+            /* calculation of p4 and p3 */
+            tmp0 = p4_r - p5_r + q2_r - p7_r;
+            tmp2 = p3_r - p4_r + q3_r - p7_r;
+            tmp1 += tmp0;
+            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            tmp1 += tmp2;
+            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
+                        p0_filter16, p1_filter16);
+            p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
+            p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
+            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
+            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
+            SD(dword0, src);
+            src += pitch;
+            SD(dword1, src);
+            src += pitch;
+
+            /* calculation of p2 and p1; from here to q2 the fallback is the filter8/filter4 blend, not the input row */
+            tmp0 = p2_r - p3_r + q4_r - p7_r;
+            tmp2 = p1_r - p2_r + q5_r - p7_r;
+            tmp1 += tmp0;
+            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            tmp1 += tmp2;
+            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
+                        p0_filter16, p1_filter16);
+            p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
+            p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
+            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
+            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
+            SD(dword0, src);
+            src += pitch;
+            SD(dword1, src);
+            src += pitch;
+
+            /* calculation of p0 and q0 */
+            tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
+            tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
+            tmp1 += tmp0;
+            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            tmp1 += tmp2;
+            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
+                        p0_filter16, p1_filter16);
+            p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
+            p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
+            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
+            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
+            SD(dword0, src);
+            src += pitch;
+            SD(dword1, src);
+            src += pitch;
+
+            /* calculation of q1 and q2 */
+            tmp0 = q7_r - q0_r + q1_r - p6_r;
+            tmp2 = q7_r - q1_r + q2_r - p5_r;
+            tmp1 += tmp0;
+            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            tmp1 += tmp2;
+            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
+                        p0_filter16, p1_filter16);
+            p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
+            p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
+            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
+            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
+            SD(dword0, src);
+            src += pitch;
+            SD(dword1, src);
+            src += pitch;
+
+            /* calculation of q3 and q4; the fallback is the input row again */
+            tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
+            tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
+            tmp1 += tmp0;
+            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            tmp1 += tmp2;
+            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
+                        p0_filter16, p1_filter16);
+            p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
+            p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
+            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
+            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
+            SD(dword0, src);
+            src += pitch;
+            SD(dword1, src);
+            src += pitch;
+
+            /* calculation of q5 and q6 */
+            tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
+            tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
+            tmp1 += tmp0;
+            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            tmp1 += tmp2;
+            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
+            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
+                        p0_filter16, p1_filter16);
+            p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
+            p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
+            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
+            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
+            SD(dword0, src);
+            src += pitch;
+            SD(dword1, src);
+        }
+    }
+}
+
+void ff_loop_filter_h_4_8_msa(uint8_t *src, ptrdiff_t pitch,
+                              int32_t b_limit_ptr, /* only the low byte is used */
+                              int32_t limit_ptr, /* only the low byte is used */
+                              int32_t thresh_ptr) /* only the low byte is used */
+{ /* filter4 across the edge between columns src - 1 and src, over eight rows starting at src */
+    v16u8 mask, hev, flat, limit, thresh, b_limit;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v8i16 vec0, vec1, vec2, vec3;
+    /* load eight rows beginning four pixels left of the edge; p3..q3 hold rows until the transpose below */
+    LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+    /* after the transpose each vector is presumably one pixel column (p3 leftmost) -- macro is defined outside this chunk */
+    TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
+                       p3, p2, p1, p0, q0, q1, q2, q3);
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+    ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+    /* re-interleaved so each 32-bit element is p1,p0,q0,q1 of one row; written two columns left of the edge */
+    src -= 2;
+    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
+    src += 4 * pitch;
+    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+}
+
+void ff_loop_filter_h_44_16_msa(uint8_t *src, ptrdiff_t pitch,
+                                int32_t b_limit_ptr, /* byte 0 and byte 1 are both used */
+                                int32_t limit_ptr, /* byte 0 and byte 1 are both used */
+                                int32_t thresh_ptr) /* byte 0 and byte 1 are both used */
+{ /* filter4 across the edge between columns src - 1 and src, over sixteen rows with two sets of thresholds */
+    v16u8 mask, hev, flat;
+    v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    /* load sixteen rows beginning four pixels left of the edge */
+    LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    LD_UB8(src - 4 + (8 * pitch), pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+    /* lanes 0-7 get byte 0 of each parameter, lanes 8-15 get byte 1 (presumably rows 0-7 and 8-15 -- confirm against the transpose macro) */
+    thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
+    thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
+    thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);
+
+    b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
+    b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
+    b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);
+
+    limit0 = (v16u8) __msa_fill_b(limit_ptr);
+    limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
+    limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
+                 hev, mask, flat);
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+    ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
+    ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
+    /* tmp2/tmp3 come from the right halves, tmp4/tmp5 from the left halves; each 32-bit element is p1,p0,q0,q1 */
+    src -= 2;
+    /* two stores of eight rows each, starting two columns left of the edge */
+    ST4x8_UB(tmp2, tmp3, src, pitch);
+    src += (8 * pitch);
+    ST4x8_UB(tmp4, tmp5, src, pitch);
+}
+
+void ff_loop_filter_h_8_8_msa(uint8_t *src, ptrdiff_t pitch,
+                              int32_t b_limit_ptr, /* only the low byte is used */
+                              int32_t limit_ptr, /* only the low byte is used */
+                              int32_t thresh_ptr) /* only the low byte is used */
+{ /* filter4/filter8 across the edge between columns src - 1 and src, over eight rows starting at src */
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p1_out, p0_out, q0_out, q1_out;
+    v16u8 flat, mask, hev, thresh, b_limit, limit;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
+    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
+    v16u8 zero = { 0 };
+    v8i16 vec0, vec1, vec2, vec3, vec4;
+
+    /* load eight rows beginning four pixels left of the edge */
+    LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+    /* after the transpose each vector is presumably one pixel column (p3 leftmost) -- macro is defined outside this chunk */
+    TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
+                       p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr);
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+
+    /* mask and hev */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    /* flat4 */
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    /* filter4 */
+    VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+    /* keep flat for the low 8 lanes only; the high 8 lanes are forced to zero */
+    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        /* Store 4 pixels p1..q1 per row */
+        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+        /* output starts two columns left of the edge */
+        src -= 2;
+        ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
+        src += 4 * pitch;
+        ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+    } else {
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+                   q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+        /* convert 16 bit output data into 8 bit */
+        PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
+                    p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                    p0_filt8_r, q0_filt8_r);
+        PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
+                    q2_filt8_r);
+
+        /* blend: filter8 result where flat bits are set, otherwise the filter4 result (the input column for p2/q2) */
+        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
+        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
+        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
+        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
+        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
+        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
+
+        /* Store 6 pixels p2..q2 per row: vec2/vec3 carry p2,p1,p0,q0 and vec4 carries q1,q2 */
+        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+        vec4 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
+        /* output starts three columns left of the edge; the q1,q2 pair goes four bytes further right */
+        src -= 3;
+        ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec4, 0, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec4, 4, src + 4, pitch);
+    }
+}
+
+void ff_loop_filter_h_88_16_msa(uint8_t *src, ptrdiff_t pitch,
+                                int32_t b_limit_ptr, /* byte 0: low 8 lanes, byte 1: high 8 lanes */
+                                int32_t limit_ptr,   /* packed like b_limit_ptr */
+                                int32_t thresh_ptr)  /* packed like b_limit_ptr */
+{   /* 16-row filter: filter4 output, replaced by filter8 in lanes where flat is set */
+    uint8_t *temp_src;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p1_out, p0_out, q0_out, q1_out;
+    v16u8 flat, mask, hev, thresh, b_limit, limit;
+    v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
+    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
+    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
+    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
+    v16u8 zero = { 0 };
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+    temp_src = src - 4; /* reads begin 4 bytes before src */
+
+    LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7); /* first 8 rows; p0..p3 only hold raw rows here */
+    temp_src += (8 * pitch);
+    LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15); /* next 8 rows; q3..q0 only hold raw rows here */
+
+    /* transpose 16x8 matrix into 8x16; p3..q3 receive their real contents here */
+    TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
+                        q3, q2, q1, q0, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr); /* byte 0 replicated to all lanes */
+    vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8); /* byte 1 replicated to all lanes */
+    thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh); /* low half: byte 0, high half: byte 1 */
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
+    b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit); /* same split as thresh */
+
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+    vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
+    limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit); /* same split as thresh */
+
+    /* mask and hev (flat is also produced here and refined by VP9_FLAT4) */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    /* flat4 */
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    /* filter4: produces p1_out, p0_out, q0_out, q1_out for all 16 lanes */
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); /* low 8 lanes: pair (p1,p0) and (q0,q1) bytes */
+        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+        ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); /* high 8 lanes, same pairing */
+        ILVRL_H2_SH(vec1, vec0, vec4, vec5);
+
+        src -= 2; /* only the 4 filter4 outputs are written, starting at p1 */
+        ST4x8_UB(vec2, vec3, src, pitch);
+        src += 8 * pitch;
+        ST4x8_UB(vec4, vec5, src, pitch);
+    } else { /* at least one lane needs filter8 */
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+                   q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); /* filter8 on the widened low 8 lanes */
+
+        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
+                   p0_l);
+        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
+                   q3_l);
+
+        /* filter8 on the widened high 8 lanes */
+        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+        /* convert 16 bit output data into 8 bit, merging the _r and _l halves */
+        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                    p0_filt8_r, q0_filt8_r);
+        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+                    q2_filt8_r);
+
+        /* select per lane: filter8 result where flat is set, else filter4 output (p2/q2: unfiltered input) */
+        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
+        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
+        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
+        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
+        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
+        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
+
+        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); /* low 8 lanes: (p2,p1) and (p0,q0) */
+        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
+        ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); /* high 8 lanes: (p2,p1) and (p0,q0) */
+        ILVRL_H2_SH(vec1, vec0, vec6, vec7);
+        ILVRL_B2_SH(q2, q1, vec2, vec5); /* (q1,q2) pairs: vec2 low lanes, vec5 high lanes */
+
+        src -= 3; /* six pixels p2..q2 are written per row: 4 bytes, then 2 bytes at +4 */
+        ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec2, 0, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec2, 4, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec5, 0, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec5, 4, src + 4, pitch);
+    }
+}
+
+void ff_loop_filter_h_84_16_msa(uint8_t *src, ptrdiff_t pitch,
+                                int32_t b_limit_ptr, /* byte 0: low 8 lanes, byte 1: high 8 lanes */
+                                int32_t limit_ptr,   /* packed like b_limit_ptr */
+                                int32_t thresh_ptr)  /* packed like b_limit_ptr */
+{   /* 16-row filter: filter8 may apply only in the low 8 lanes, the high 8 get filter4 only */
+    uint8_t *temp_src;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p1_out, p0_out, q0_out, q1_out;
+    v16u8 flat, mask, hev, thresh, b_limit, limit;
+    v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
+    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
+    v16u8 zero = { 0 };
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+    temp_src = src - 4; /* reads begin 4 bytes before src */
+
+    LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7); /* first 8 rows; p0..p3 only hold raw rows here */
+    temp_src += (8 * pitch);
+    LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15); /* next 8 rows; q3..q0 only hold raw rows here */
+
+    /* transpose 16x8 matrix into 8x16; p3..q3 receive their real contents here */
+    TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
+                        q3, q2, q1, q0, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr); /* byte 0 replicated to all lanes */
+    vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8); /* byte 1 replicated to all lanes */
+    thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh); /* low half: byte 0, high half: byte 1 */
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
+    b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit); /* same split as thresh */
+
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+    vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
+    limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit); /* same split as thresh */
+
+    /* mask and hev (flat is also produced here and refined by VP9_FLAT4) */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    /* flat4 */
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    /* filter4: produces p1_out, p0_out, q0_out, q1_out for all 16 lanes */
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat); /* keep flat in the low 8 lanes, clear the high 8 */
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); /* low 8 lanes: pair (p1,p0) and (q0,q1) bytes */
+        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+        ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); /* high 8 lanes, same pairing */
+        ILVRL_H2_SH(vec1, vec0, vec4, vec5);
+
+        src -= 2; /* only the 4 filter4 outputs are written, starting at p1 */
+        ST4x8_UB(vec2, vec3, src, pitch);
+        src += 8 * pitch;
+        ST4x8_UB(vec4, vec5, src, pitch);
+    } else { /* filter8 is computed for the low 8 lanes only */
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+                   q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+        /* convert 16 bit output data into 8 bit; both pack inputs are the _r result */
+        PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
+                    p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
+                    p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
+        PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
+                    q1_filt8_r, q2_filt8_r);
+
+        /* select per lane: filter8 where flat is set (low 8 lanes at most), else filter4 output or input */
+        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
+        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
+        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
+        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
+        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
+        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
+
+        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); /* low 8 lanes: (p2,p1) and (p0,q0) */
+        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
+        ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); /* high 8 lanes: (p2,p1) and (p0,q0) */
+        ILVRL_H2_SH(vec1, vec0, vec6, vec7);
+        ILVRL_B2_SH(q2, q1, vec2, vec5); /* (q1,q2) pairs: vec2 low lanes, vec5 high lanes */
+
+        src -= 3; /* six pixels p2..q2 are written per row: 4 bytes, then 2 bytes at +4 */
+        ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec2, 0, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec2, 4, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec5, 0, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec5, 4, src + 4, pitch);
+    }
+}
+
+void ff_loop_filter_h_48_16_msa(uint8_t *src, ptrdiff_t pitch,
+                                int32_t b_limit_ptr, /* byte 0: low 8 lanes, byte 1: high 8 lanes */
+                                int32_t limit_ptr,   /* packed like b_limit_ptr */
+                                int32_t thresh_ptr)  /* packed like b_limit_ptr */
+{   /* 16-row filter: the low 8 lanes get filter4 only, filter8 may apply only in the high 8 */
+    uint8_t *temp_src;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p1_out, p0_out, q0_out, q1_out;
+    v16u8 flat, mask, hev, thresh, b_limit, limit;
+    v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
+    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
+    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
+    v16u8 zero = { 0 };
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+    temp_src = src - 4; /* reads begin 4 bytes before src */
+
+    LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7); /* first 8 rows; p0..p3 only hold raw rows here */
+    temp_src += (8 * pitch);
+    LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15); /* next 8 rows; q3..q0 only hold raw rows here */
+
+    /* transpose 16x8 matrix into 8x16; p3..q3 receive their real contents here */
+    TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
+                        q3, q2, q1, q0, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr); /* byte 0 replicated to all lanes */
+    vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8); /* byte 1 replicated to all lanes */
+    thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh); /* low half: byte 0, high half: byte 1 */
+
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
+    b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit); /* same split as thresh */
+
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+    vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
+    limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit); /* same split as thresh */
+
+    /* mask and hev (flat is also produced here and refined by VP9_FLAT4) */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    /* flat4 */
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    /* filter4: produces p1_out, p0_out, q0_out, q1_out for all 16 lanes */
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero); /* clear flat in the low 8 lanes, keep the high 8 */
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); /* low 8 lanes: pair (p1,p0) and (q0,q1) bytes */
+        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+        ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); /* high 8 lanes, same pairing */
+        ILVRL_H2_SH(vec1, vec0, vec4, vec5);
+
+        src -= 2; /* only the 4 filter4 outputs are written, starting at p1 */
+        ST4x8_UB(vec2, vec3, src, pitch);
+        src += 8 * pitch;
+        ST4x8_UB(vec4, vec5, src, pitch);
+    } else { /* filter8 is computed for the high 8 lanes only */
+        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
+                   p0_l);
+        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
+                   q3_l);
+
+        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+        /* convert 16 bit output data into 8 bit; both pack inputs are the _l result */
+        PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
+                    p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
+                    p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
+        PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
+                    q1_filt8_l, q2_filt8_l);
+
+        /* select per lane: filter8 where flat is set (high 8 lanes at most), else filter4 output or input */
+        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
+        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
+        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
+        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
+        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
+        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);
+
+        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); /* low 8 lanes: (p2,p1) and (p0,q0) */
+        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
+        ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); /* high 8 lanes: (p2,p1) and (p0,q0) */
+        ILVRL_H2_SH(vec1, vec0, vec6, vec7);
+        ILVRL_B2_SH(q2, q1, vec2, vec5); /* (q1,q2) pairs: vec2 low lanes, vec5 high lanes */
+
+        src -= 3; /* six pixels p2..q2 are written per row: 4 bytes, then 2 bytes at +4 */
+        ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec2, 0, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec2, 4, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec5, 0, src + 4, pitch);
+        src += (4 * pitch);
+        ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
+        ST2x4_UB(vec5, 4, src + 4, pitch);
+    }
+}
+
+static void vp9_transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
+                                       uint8_t *output, int32_t out_pitch)
+{   /* Loads 8 vectors from input and stores their transpose as 16 vectors at output */
+    v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
+    v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+
+    LD_UB8(input, in_pitch,
+           p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org);
+    /* 8x8 transpose producing p7..p0 (presumably from the low 8 bytes of each row) */
+    TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
+                       p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
+    /* 8x8 transpose of the upper halves (ILVL interleaves) producing q0..q7 */
+    ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
+               tmp0, tmp1, tmp2, tmp3);
+    ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
+    ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
+    ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
+    ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
+    SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8); /* odd q rows derived from the even ones with an 8-byte slide */
+
+    ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch); /* output rows 0-7: p7..p0 */
+    output += (8 * out_pitch);
+    ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); /* output rows 8-15: q0..q7 */
+}
+
+static void vp9_transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
+                                       uint8_t *output, int32_t out_pitch)
+{   /* Loads 16 vectors from input and stores their transpose as 8 vectors at output */
+    v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
+    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+
+    LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0); /* input rows 0-7 */
+    LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7); /* input rows 8-15 */
+    TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
+                        q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
+    ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch); /* 8 full vectors written */
+}
+
+static void vp9_transpose_16x16(uint8_t *input, int32_t in_pitch,
+                                uint8_t *output, int32_t out_pitch)
+{   /* Loads 16 vectors from input and stores the 16 vectors of their transpose at output */
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
+    v4i32 tmp2, tmp3;
+    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+
+    LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    input += (8 * in_pitch);
+    LD_UB8(input, in_pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p7, p6, p5, p4, p3, p2, p1, p0); /* first 8 output rows */
+
+    /* transpose 16x8 matrix into 8x16: remaining 8 output rows, built by hand */
+    /* total 8 intermediate register and 32 instructions */
+    q7 = (v16u8) __msa_ilvod_d((v2i64) row8, (v2i64) row0); /* gather the upper 8 bytes of row0 and row8 */
+    q6 = (v16u8) __msa_ilvod_d((v2i64) row9, (v2i64) row1);
+    q5 = (v16u8) __msa_ilvod_d((v2i64) row10, (v2i64) row2);
+    q4 = (v16u8) __msa_ilvod_d((v2i64) row11, (v2i64) row3);
+    q3 = (v16u8) __msa_ilvod_d((v2i64) row12, (v2i64) row4);
+    q2 = (v16u8) __msa_ilvod_d((v2i64) row13, (v2i64) row5);
+    q1 = (v16u8) __msa_ilvod_d((v2i64) row14, (v2i64) row6);
+    q0 = (v16u8) __msa_ilvod_d((v2i64) row15, (v2i64) row7);
+
+    ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
+    tmp4 = (v8i16) __msa_ilvod_b((v16i8) q6, (v16i8) q7);
+    tmp5 = (v8i16) __msa_ilvod_b((v16i8) q4, (v16i8) q5);
+
+    ILVEV_B2_UB(q3, q2, q1, q0, q5, q7); /* q5 and q7 are no longer needed as inputs and are reused as scratch */
+    tmp6 = (v8i16) __msa_ilvod_b((v16i8) q2, (v16i8) q3);
+    tmp7 = (v8i16) __msa_ilvod_b((v16i8) q0, (v16i8) q1);
+
+    ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
+    q0 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
+    q4 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
+
+    tmp2 = (v4i32) __msa_ilvod_h(tmp1, tmp0);
+    tmp3 = (v4i32) __msa_ilvod_h((v8i16) q7, (v8i16) q5); /* last read of the scratch q5/q7 before they are overwritten below */
+    q2 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
+    q6 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
+
+    ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
+    q1 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
+    q5 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
+
+    tmp2 = (v4i32) __msa_ilvod_h(tmp5, tmp4);
+    tmp3 = (v4i32) __msa_ilvod_h(tmp7, tmp6);
+    q3 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
+    q7 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
+
+    ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch); /* output rows 0-7 */
+    output += (8 * out_pitch);
+    ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); /* output rows 8-15 */
+}
+
+static int32_t vp9_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, /* src: 16-byte-stride buffer, row q0 */
+                                       uint8_t *src_org, int32_t pitch_org, /* NOTE(review): sibling helpers take ptrdiff_t pitch */
+                                       int32_t b_limit_ptr,
+                                       int32_t limit_ptr,
+                                       int32_t thresh_ptr)
+{   /* filter4/filter8 stage; returns 1 if src_org was written directly, 0 if results went to filter48 */
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    v16u8 flat, mask, hev, thresh, b_limit, limit;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
+    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
+    v16i8 zero = { 0 };
+    v8i16 vec0, vec1, vec2, vec3;
+
+    /* load p3..q3 from the 4 rows before and the 4 rows at src (stride 16) */
+    LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr); /* only the low byte is used, for all lanes */
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+
+    /* mask and hev (flat is also produced here and refined by VP9_FLAT4) */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    /* flat4 */
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    /* filter4 */
+    VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat); /* keep flat in the low 8 lanes, clear the high 8 */
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); /* pair (p1,p0) and (q0,q1) bytes of the low 8 lanes */
+        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+        ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org); /* filter4 result goes straight to the picture */
+        return 1;
+    } else {
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+                   q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+        /* convert 16 bit output data into 8 bit; the p*_r and q*_r inputs are reused to hold the packed bytes */
+        p2_r = (v8u16) __msa_pckev_b((v16i8) p2_filt8_r, (v16i8) p2_filt8_r);
+        p1_r = (v8u16) __msa_pckev_b((v16i8) p1_filt8_r, (v16i8) p1_filt8_r);
+        p0_r = (v8u16) __msa_pckev_b((v16i8) p0_filt8_r, (v16i8) p0_filt8_r);
+        q0_r = (v8u16) __msa_pckev_b((v16i8) q0_filt8_r, (v16i8) q0_filt8_r);
+        q1_r = (v8u16) __msa_pckev_b((v16i8) q1_filt8_r, (v16i8) q1_filt8_r);
+        q2_r = (v8u16) __msa_pckev_b((v16i8) q2_filt8_r, (v16i8) q2_filt8_r);
+
+        /* select per lane: filter8 result where flat is set, else filter4 output (p2/q2: unfiltered input) */
+        p2_out = __msa_bmnz_v(p2, (v16u8) p2_r, flat);
+        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_r, flat);
+        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_r, flat);
+        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_r, flat);
+        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_r, flat);
+        q2_out = __msa_bmnz_v(q2, (v16u8) q2_r, flat);
+
+        ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); /* filter48 rows 0-3: p2, p1, p0, q0 */
+        filter48 += (4 * 16);
+        ST_UB2(q1_out, q2_out, filter48, 16); /* filter48 rows 4-5: q1, q2 */
+        filter48 += (2 * 16);
+        ST_UB(flat, filter48); /* filter48 row 6: the flat mask */
+
+        return 0;
+    }
+}
+
+static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch,
+                                 uint8_t *filter48) /* rows 0-5: p2,p1,p0,q0,q1,q2 from the earlier stage, row 6: flat */
+{   /* wide-filter stage; returns 1 if src_org was written directly, 0 if results went to the src buffer */
+    v16i8 zero = { 0 };
+    v16u8 filter8, flat, flat2;
+    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+    v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
+    v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
+    v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
+    v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
+    v8u16 tmp0_r, tmp1_r;
+    v8i16 r_out;
+
+    flat = LD_UB(filter48 + 6 * 16); /* flat mask saved in row 6 of filter48 */
+
+    LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); /* 8 rows before src, stride 16 */
+    LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7); /* 8 rows from src on, stride 16 */
+
+    VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+    /* if flat2 is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat2)) {
+        v8i16 vec0, vec1, vec2, vec3, vec4;
+
+        LD_UB4(filter48, 16, p2, p1, p0, q0); /* reload the six results kept in filter48 */
+        LD_UB2(filter48 + 4 * 16, 16, q1, q2);
+
+        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); /* low 8 lanes: (p2,p1) and (p0,q0) */
+        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
+        vec2 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1); /* low 8 lanes: (q1,q2) */
+
+        src_org -= 3; /* six pixels p2..q2 per row go straight to the picture: 4 bytes, then 2 bytes at +4 */
+        ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
+        ST2x4_UB(vec2, 0, (src_org + 4), pitch);
+        src_org += (4 * pitch);
+        ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
+        ST2x4_UB(vec2, 4, (src_org + 4), pitch);
+
+        return 1;
+    } else {
+        src -= 7 * 16; /* rewind to the p6 row; results are written back into the src buffer */
+
+        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
+                   zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
+                   p3_r_in, p2_r_in, p1_r_in, p0_r_in);
+        q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0); /* q rows are widened one at a time, as each is first needed */
+
+        tmp0_r = p7_r_in << 3; /* p6: tmp0_r = 7 * p7 + p6 + q0 */
+        tmp0_r -= p7_r_in;
+        tmp0_r += p6_r_in;
+        tmp0_r += q0_r_in;
+        tmp1_r = p6_r_in + p5_r_in; /* tmp1_r becomes the running sum 7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 */
+        tmp1_r += p4_r_in;
+        tmp1_r += p3_r_in;
+        tmp1_r += p2_r_in;
+        tmp1_r += p1_r_in;
+        tmp1_r += p0_r_in;
+        tmp1_r += tmp0_r;
+
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4); /* rounded divide by 16, the total of the weights */
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2); /* wide result only where flat2 is set */
+        ST8x1_UB(p6, src);
+        src += 16;
+
+        /* p5: slide the running sum by + p5 + q1 - p6 - p7 */
+        q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
+        tmp0_r = p5_r_in - p6_r_in;
+        tmp0_r += q1_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
+        ST8x1_UB(p5, src);
+        src += 16;
+
+        /* p4: slide by + p4 + q2 - p5 - p7 */
+        q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
+        tmp0_r = p4_r_in - p5_r_in;
+        tmp0_r += q2_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
+        ST8x1_UB(p4, src);
+        src += 16;
+
+        /* p3: slide by + p3 + q3 - p4 - p7 */
+        q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
+        tmp0_r = p3_r_in - p4_r_in;
+        tmp0_r += q3_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
+        ST8x1_UB(p3, src);
+        src += 16;
+
+        /* p2: slide by + p2 + q4 - p3 - p7; fallback value is filter48 row 0 */
+        q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
+        filter8 = LD_UB(filter48);
+        tmp0_r = p2_r_in - p3_r_in;
+        tmp0_r += q4_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST8x1_UB(filter8, src);
+        src += 16;
+
+        /* p1: slide by + p1 + q5 - p2 - p7; fallback value is filter48 row 1 */
+        q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
+        filter8 = LD_UB(filter48 + 16);
+        tmp0_r = p1_r_in - p2_r_in;
+        tmp0_r += q5_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST8x1_UB(filter8, src);
+        src += 16;
+
+        /* p0: slide by + p0 + q6 - p1 - p7; fallback value is filter48 row 2 */
+        q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
+        filter8 = LD_UB(filter48 + 32);
+        tmp0_r = p0_r_in - p1_r_in;
+        tmp0_r += q6_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST8x1_UB(filter8, src);
+        src += 16;
+
+        /* q0: slide by + q0 + q7 - p0 - p7; fallback value is filter48 row 3 */
+        q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
+        filter8 = LD_UB(filter48 + 48);
+        tmp0_r = q7_r_in - p0_r_in;
+        tmp0_r += q0_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST8x1_UB(filter8, src);
+        src += 16;
+
+        /* q1: slide by + q1 + q7 - q0 - p6; fallback value is filter48 row 4 */
+        filter8 = LD_UB(filter48 + 64);
+        tmp0_r = q7_r_in - q0_r_in;
+        tmp0_r += q1_r_in;
+        tmp0_r -= p6_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST8x1_UB(filter8, src);
+        src += 16;
+
+        /* q2: slide by + q2 + q7 - q1 - p5; fallback value is filter48 row 5 */
+        filter8 = LD_UB(filter48 + 80);
+        tmp0_r = q7_r_in - q1_r_in;
+        tmp0_r += q2_r_in;
+        tmp0_r -= p5_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST8x1_UB(filter8, src);
+        src += 16;
+
+        /* q3: slide by + q3 + q7 - q2 - p4 */
+        tmp0_r = q7_r_in - q2_r_in;
+        tmp0_r += q3_r_in;
+        tmp0_r -= p4_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
+        ST8x1_UB(q3, src);
+        src += 16;
+
+        /* q4: slide by + q4 + q7 - q3 - p3 */
+        tmp0_r = q7_r_in - q3_r_in;
+        tmp0_r += q4_r_in;
+        tmp0_r -= p3_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
+        ST8x1_UB(q4, src);
+        src += 16;
+
+        /* q5: slide by + q5 + q7 - q4 - p2 */
+        tmp0_r = q7_r_in - q4_r_in;
+        tmp0_r += q5_r_in;
+        tmp0_r -= p2_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
+        ST8x1_UB(q5, src);
+        src += 16;
+
+        /* q6: slide by + q6 + q7 - q5 - p1 */
+        tmp0_r = q7_r_in - q5_r_in;
+        tmp0_r += q6_r_in;
+        tmp0_r -= p1_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
+        q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
+        ST8x1_UB(q6, src);
+
+        return 0;
+    }
+}
+
+void ff_loop_filter_h_16_8_msa(uint8_t *src, ptrdiff_t pitch,
+                               int32_t b_limit_ptr,
+                               int32_t limit_ptr,
+                               int32_t thresh_ptr)
+{   /* Transposes 8 picture rows into a scratch buffer, filters there, and copies back only if needed */
+    uint8_t early_exit = 0;
+    uint8_t transposed_input[16 * 24] ALLOC_ALIGNED(ALIGNMENT); /* 16 rows of 16 bytes for pixels + 8 rows of scratch */
+    uint8_t *filter48 = &transposed_input[16 * 16]; /* scratch area behind the 16 pixel rows */
+
+    vp9_transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16); /* picture data starting 8 bytes before src */
+
+    early_exit = vp9_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), /* row 8 of the buffer, the q0 row */
+                                         &filter48[0], src, pitch,
+                                         b_limit_ptr, limit_ptr, thresh_ptr);
+
+    if (0 == early_exit) { /* a return of 1 means the helper already wrote its result to src */
+        early_exit = vp9_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
+                                       &filter48[0]);
+
+        if (0 == early_exit) { /* wide filter left its result in the buffer: transpose it back */
+            vp9_transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
+        }
+    }
+}
+
+static int32_t vp9_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, /* src: 16-byte-stride buffer, row q0 */
+                                        uint8_t *src_org, ptrdiff_t pitch, /* picture pointer and stride for the early-exit store */
+                                        int32_t b_limit_ptr,
+                                        int32_t limit_ptr,
+                                        int32_t thresh_ptr)
+{   /* filter4/filter8 stage on all 16 lanes; returns 1 if src_org was written, 0 if results went to filter48 */
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    v16u8 flat, mask, hev, thresh, b_limit, limit;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
+    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
+    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
+    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
+    v16i8 zero = { 0 };
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
+
+    /* load p3..q3 from the 4 rows before and the 4 rows at src (stride 16) */
+    LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8) __msa_fill_b(thresh_ptr); /* only the low byte is used, for all lanes */
+    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
+    limit = (v16u8) __msa_fill_b(limit_ptr);
+
+    /* mask and hev (flat is also produced here and refined by VP9_FLAT4) */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    /* flat4 */
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    /* filter4 */
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+
+    /* if flat is zero for all pixels, then no need to calculate other filter */
+    if (__msa_test_bz_v(flat)) {
+        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); /* low 8 lanes: pair (p1,p0) and (q0,q1) bytes */
+        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+        ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); /* high 8 lanes, same pairing */
+        ILVRL_H2_SH(vec1, vec0, vec4, vec5);
+
+        src_org -= 2; /* filter4 result goes straight to the picture, starting at p1 */
+        ST4x8_UB(vec2, vec3, src_org, pitch);
+        src_org += 8 * pitch;
+        ST4x8_UB(vec4, vec5, src_org, pitch);
+
+        return 1;
+    } else {
+        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+                   q3_r);
+        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); /* filter8 on the widened low 8 lanes */
+        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
+                   p0_l);
+        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
+                   q3_l);
+        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); /* filter8 on the widened high 8 lanes */
+
+        /* convert 16 bit output data into 8 bit, merging the _r and _l halves */
+        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                    p0_filt8_r, q0_filt8_r);
+        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+                    q2_filt8_r);
+
+        /* select per lane: filter8 result where flat is set, else filter4 output (p2/q2: unfiltered input) */
+        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
+        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
+        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
+        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
+        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
+        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
+
+        ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); /* filter48 rows 0-3: p2, p1, p0, q0 */
+        filter48 += (4 * 16);
+        ST_UB2(q1_out, q2_out, filter48, 16); /* filter48 rows 4-5: q1, q2 */
+        filter48 += (2 * 16);
+        ST_UB(flat, filter48); /* filter48 row 6: the flat mask */
+
+        return 0;
+    }
+}
+/* Wide-filter (p7..q7) stage used by ff_loop_filter_h_16_16_msa; returns 1 after storing directly to src_org (flat2 all zero), else 0. */
+static int32_t vp9_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch,
+                                  uint8_t *filter48)
+{
+    v16u8 flat, flat2, filter8;
+    v16i8 zero = { 0 };
+    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+    v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
+    v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
+    v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
+    v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
+    v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
+    v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
+    v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
+    v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
+    v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
+    v8i16 l_out, r_out;
+    /* flat mask left in slot 6 of filter48 by the filter4/filter8 stage */
+    flat = LD_UB(filter48 + 6 * 16);
+    /* rows are 16 bytes apart: p7..p0 end just before src, q0..q7 start at src */
+    LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
+    LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
+    /* flat2 is the select mask for the wide-filter output (see __msa_bmnz_v below) */
+    VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+    /* if flat2 is zero for all pixels, the rows saved in filter48 are final: store them */
+    if (__msa_test_bz_v(flat2)) {
+        v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+        /* reload the six rows (p2..q2) saved in filter48 */
+        LD_UB4(filter48, 16, p2, p1, p0, q0);
+        LD_UB2(filter48 + 4 * 16, 16, q1, q2);
+        /* interleave back to picture order: p2 p1 p0 q0 groups and q1 q2 pairs */
+        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
+        ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
+        ILVRL_H2_SH(vec1, vec0, vec6, vec7);
+        ILVRL_B2_SH(q2, q1, vec2, vec5);
+        /* 16 picture rows: 4 bytes at src_org - 3, then 2 bytes at src_org + 1 */
+        src_org -= 3;
+        ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
+        ST2x4_UB(vec2, 0, (src_org + 4), pitch);
+        src_org += (4 * pitch);
+        ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
+        ST2x4_UB(vec2, 4, (src_org + 4), pitch);
+        src_org += (4 * pitch);
+        ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
+        ST2x4_UB(vec5, 0, (src_org + 4), pitch);
+        src_org += (4 * pitch);
+        ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
+        ST2x4_UB(vec5, 4, (src_org + 4), pitch);
+
+        return 1;  /* output already written through src_org */
+    } else {
+        src -= 7 * 16;  /* first row rewritten is p6 */
+        /* p6, right-half bytes widened to 16 bit */
+        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
+                   zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
+                   p3_r_in, p2_r_in, p1_r_in, p0_r_in);
+        q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
+        /* tmp1_r = 7 * p7 + 2 * p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 */
+        tmp0_r = p7_r_in << 3;
+        tmp0_r -= p7_r_in;
+        tmp0_r += p6_r_in;
+        tmp0_r += q0_r_in;
+        tmp1_r = p6_r_in + p5_r_in;
+        tmp1_r += p4_r_in;
+        tmp1_r += p3_r_in;
+        tmp1_r += p2_r_in;
+        tmp1_r += p1_r_in;
+        tmp1_r += p0_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);  /* rounding shift: (sum + 8) >> 4 */
+        /* p6, left-half bytes: same sum */
+        ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
+                   p5_l_in, p4_l_in);
+        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
+                   p1_l_in, p0_l_in);
+        q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0);
+
+        tmp0_l = p7_l_in << 3;
+        tmp0_l -= p7_l_in;
+        tmp0_l += p6_l_in;
+        tmp0_l += q0_l_in;
+        tmp1_l = p6_l_in + p5_l_in;
+        tmp1_l += p4_l_in;
+        tmp1_l += p3_l_in;
+        tmp1_l += p2_l_in;
+        tmp1_l += p1_l_in;
+        tmp1_l += p0_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        /* pack halves to bytes; replace p6 only where flat2 bits are set */
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
+        ST_UB(p6, src);
+        src += 16;
+
+        /* p5: slide the window, sum += p5 - p6 + q1 - p7 */
+        q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
+        tmp0_r = p5_r_in - p6_r_in;
+        tmp0_r += q1_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1);
+        tmp0_l = p5_l_in - p6_l_in;
+        tmp0_l += q1_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
+        ST_UB(p5, src);
+        src += 16;
+
+        /* p4: sum += p4 - p5 + q2 - p7 */
+        q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
+        tmp0_r = p4_r_in - p5_r_in;
+        tmp0_r += q2_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
+        tmp0_l = p4_l_in - p5_l_in;
+        tmp0_l += q2_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
+        ST_UB(p4, src);
+        src += 16;
+
+        /* p3: sum += p3 - p4 + q3 - p7 */
+        q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
+        tmp0_r = p3_r_in - p4_r_in;
+        tmp0_r += q3_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
+        tmp0_l = p3_l_in - p4_l_in;
+        tmp0_l += q3_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
+        ST_UB(p3, src);
+        src += 16;
+
+        /* p2: sum += p2 - p3 + q4 - p7; unselected lanes keep filter48 slot 0 */
+        q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
+        filter8 = LD_UB(filter48);
+        tmp0_r = p2_r_in - p3_r_in;
+        tmp0_r += q4_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
+        tmp0_l = p2_l_in - p3_l_in;
+        tmp0_l += q4_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += 16;
+
+        /* p1: sum += p1 - p2 + q5 - p7; unselected lanes keep filter48 slot 1 */
+        q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
+        filter8 = LD_UB(filter48 + 16);
+        tmp0_r = p1_r_in - p2_r_in;
+        tmp0_r += q5_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
+        tmp0_l = p1_l_in - p2_l_in;
+        tmp0_l += q5_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) (tmp1_l), 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += 16;
+
+        /* p0: sum += p0 - p1 + q6 - p7; unselected lanes keep filter48 slot 2 */
+        q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
+        filter8 = LD_UB(filter48 + 32);
+        tmp0_r = p0_r_in - p1_r_in;
+        tmp0_r += q6_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
+        tmp0_l = p0_l_in - p1_l_in;
+        tmp0_l += q6_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += 16;
+
+        /* q0: sum += q7 - p0 + q0 - p7; unselected lanes keep filter48 slot 3 */
+        q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
+        filter8 = LD_UB(filter48 + 48);
+        tmp0_r = q7_r_in - p0_r_in;
+        tmp0_r += q0_r_in;
+        tmp0_r -= p7_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
+        tmp0_l = q7_l_in - p0_l_in;
+        tmp0_l += q0_l_in;
+        tmp0_l -= p7_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += 16;
+
+        /* q1: sum += q7 - q0 + q1 - p6; unselected lanes keep filter48 slot 4 */
+        filter8 = LD_UB(filter48 + 64);
+        tmp0_r = q7_r_in - q0_r_in;
+        tmp0_r += q1_r_in;
+        tmp0_r -= p6_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        tmp0_l = q7_l_in - q0_l_in;
+        tmp0_l += q1_l_in;
+        tmp0_l -= p6_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += 16;
+
+        /* q2: sum += q7 - q1 + q2 - p5; unselected lanes keep filter48 slot 5 */
+        filter8 = LD_UB(filter48 + 80);
+        tmp0_r = q7_r_in - q1_r_in;
+        tmp0_r += q2_r_in;
+        tmp0_r -= p5_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        tmp0_l = q7_l_in - q1_l_in;
+        tmp0_l += q2_l_in;
+        tmp0_l -= p5_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
+        ST_UB(filter8, src);
+        src += 16;
+
+        /* q3: sum += q7 - q2 + q3 - p4; unselected lanes keep the loaded q3 */
+        tmp0_r = q7_r_in - q2_r_in;
+        tmp0_r += q3_r_in;
+        tmp0_r -= p4_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        tmp0_l = q7_l_in - q2_l_in;
+        tmp0_l += q3_l_in;
+        tmp0_l -= p4_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
+        ST_UB(q3, src);
+        src += 16;
+
+        /* q4: sum += q7 - q3 + q4 - p3 */
+        tmp0_r = q7_r_in - q3_r_in;
+        tmp0_r += q4_r_in;
+        tmp0_r -= p3_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        tmp0_l = q7_l_in - q3_l_in;
+        tmp0_l += q4_l_in;
+        tmp0_l -= p3_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
+        ST_UB(q4, src);
+        src += 16;
+
+        /* q5: sum += q7 - q4 + q5 - p2 */
+        tmp0_r = q7_r_in - q4_r_in;
+        tmp0_r += q5_r_in;
+        tmp0_r -= p2_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        tmp0_l = q7_l_in - q4_l_in;
+        tmp0_l += q5_l_in;
+        tmp0_l -= p2_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
+        ST_UB(q5, src);
+        src += 16;
+
+        /* q6: sum += q7 - q5 + q6 - p1 (last row rewritten) */
+        tmp0_r = q7_r_in - q5_r_in;
+        tmp0_r += q6_r_in;
+        tmp0_r -= p1_r_in;
+        tmp1_r += tmp0_r;
+        r_out = __msa_srari_h((v8i16) tmp1_r, 4);
+        tmp0_l = q7_l_in - q5_l_in;
+        tmp0_l += q6_l_in;
+        tmp0_l -= p1_l_in;
+        tmp1_l += tmp0_l;
+        l_out = __msa_srari_h((v8i16) tmp1_l, 4);
+        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
+        q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
+        ST_UB(q6, src);
+
+        return 0;  /* rows p6..q6 were rewritten in the buffer at src, not in src_org */
+    }
+}
+/* Filters a 16-row vertical edge at src: transposes 16x16 pixels into a scratch buffer, runs the filter stages on its rows, stores the result. */
+void ff_loop_filter_h_16_16_msa(uint8_t *src, ptrdiff_t pitch,
+                                int32_t b_limit_ptr,
+                                int32_t limit_ptr,
+                                int32_t thresh_ptr)
+{
+    uint8_t transposed_input[16 * 24] ALLOC_ALIGNED(ALIGNMENT);
+    uint8_t *const edge_row = (transposed_input + 16 * 8);    /* q0 row */
+    uint8_t *const filter48 = (transposed_input + 16 * 16);   /* stage scratch */
+
+    vp9_transpose_16x16((src - 8), pitch, transposed_input, 16);
+
+    /* a non-zero return means the stage already stored its output at src */
+    if (vp9_vt_lpf_t4_and_t8_16w(edge_row, filter48, src, pitch,
+                                 b_limit_ptr, limit_ptr, thresh_ptr)) {
+        return;
+    }
+
+    if (vp9_vt_lpf_t16_16w(edge_row, src, pitch, filter48)) {
+        return;
+    }
+
+    /* all stages ran on the scratch rows: transpose them back into place */
+    vp9_transpose_16x16(transposed_input, 16, (src - 8), pitch);
+}
diff --git a/libavcodec/mips/vp9_mc_msa.c b/libavcodec/mips/vp9_mc_msa.c
new file mode 100644
index 0000000..1671d97
--- /dev/null
+++ b/libavcodec/mips/vp9_mc_msa.c
@@ -0,0 +1,4510 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/vp9dsp.h"
+#include "libavutil/mips/generic_macros_msa.h"
+#include "vp9dsp_mips.h"
+
+static const uint8_t mc_filt_mask_arr[16 * 3] = {  /* three 16-byte shuffle index patterns */
+    /* 8 width cases: adjacent byte pairs (0,1) .. (7,8) */
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+    /* 4 width cases: pairs (0,1) .. (3,4), then the same indices plus 16 */
+    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+    /* 4 width cases: pairs (8,9) .. (11,12), then the same indices plus 16 */
+    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static const int8_t vp9_bilinear_filters_msa[15][2] = {  /* row i = { 128 - 8 * (i + 1), 8 * (i + 1) } */
+    {120, 8},
+    {112, 16},
+    {104, 24},
+    {96, 32},
+    {88, 40},
+    {80, 48},
+    {72, 56},
+    {64, 64},  /* middle row: equal weights */
+    {56, 72},
+    {48, 80},
+    {40, 88},
+    {32, 96},
+    {24, 104},
+    {16, 112},
+    {8, 120}
+};
+/* Statement expression: signed dot products vec0*filt0 + vec1*filt1 and vec2*filt2 + vec3*filt3, combined with a saturating add. */
+#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,             \
+                            filt0, filt1, filt2, filt3)         \
+( {                                                             \
+    v8i16 tmp0, tmp1;                                           \
+                                                                \
+    tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0);         \
+    tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1);  \
+    tmp1 = __msa_dotp_s_h((v16i8) vec2, (v16i8) filt2);         \
+    tmp1 = __msa_dpadd_s_h(tmp1, (v16i8) vec3, (v16i8) filt3);  \
+    tmp0 = __msa_adds_s_h(tmp0, tmp1);                          \
+                                                                \
+    tmp0;                                                       \
+} )
+/* Statement expression: shuffles src0/src1 with the four masks, applies FILT_8TAP_DPADD_S_H, then rounds (>> 7) and saturates. */
+#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3,          \
+                        filt_h0, filt_h1, filt_h2, filt_h3)              \
+( {                                                                      \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                \
+    v8i16 hz_out_m;                                                      \
+                                                                         \
+    VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,                   \
+               vec0_m, vec1_m, vec2_m, vec3_m);                          \
+    hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m,       \
+                                   filt_h0, filt_h1, filt_h2, filt_h3);  \
+                                                                         \
+    hz_out_m = __msa_srari_h(hz_out_m, 7);                               \
+    hz_out_m = __msa_sat_s_h(hz_out_m, 7);                               \
+                                                                         \
+    hz_out_m;                                                            \
+} )
+/* Filters four input vectors, two per shuffle (src0/src1 and src2/src3); out0/out1 get the saturating-added sums, not yet rounded. */
+#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                  \
+                                   mask0, mask1, mask2, mask3,              \
+                                   filt0, filt1, filt2, filt3,              \
+                                   out0, out1)                              \
+{                                                                           \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m,  vec4_m, vec5_m, vec6_m, vec7_m;  \
+    v8i16 res0_m, res1_m, res2_m, res3_m;                                   \
+                                                                            \
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);       \
+    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m);              \
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);       \
+    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m);             \
+    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);       \
+    DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m);              \
+    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);       \
+    DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m);             \
+    ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1);                \
+}
+/* Same sums for four vectors, each shuffled with itself (srcN, srcN); out0..out3 are left for the caller to round and saturate. */
+#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
+                                   mask0, mask1, mask2, mask3,                \
+                                   filt0, filt1, filt2, filt3,                \
+                                   out0, out1, out2, out3)                    \
+{                                                                             \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;     \
+    v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m;     \
+                                                                              \
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
+    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
+                res0_m, res1_m, res2_m, res3_m);                              \
+    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);         \
+    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,   \
+                res4_m, res5_m, res6_m, res7_m);                              \
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);         \
+    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,  \
+                 res0_m, res1_m, res2_m, res3_m);                             \
+    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);         \
+    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,  \
+                 res4_m, res5_m, res6_m, res7_m);                             \
+    ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m,       \
+                res7_m, out0, out1, out2, out3);                              \
+}
+/* Packs in0/in1 with PCKEV_XORI128_UB, averages the bytes with dst (__msa_aver_u_b) and stores the vector at pdst. */
+#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst)  \
+{                                                     \
+    v16u8 tmp_m;                                      \
+                                                      \
+    tmp_m = PCKEV_XORI128_UB(in1, in0);               \
+    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst);       \
+    ST_UB(tmp_m, (pdst));                             \
+}
+/* Packs the even bytes of in0/in1 into one vector (__msa_pckev_b), averages it with dst and stores the result at pdst. */
+#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)                  \
+{                                                             \
+    v16u8 tmp_m;                                              \
+                                                              \
+    tmp_m = (v16u8) __msa_pckev_b((v16i8) in0, (v16i8) in1);  \
+    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst);               \
+    ST_UB(tmp_m, (pdst));                                     \
+}
+/* Packs in1..in4 to bytes (PCKEV_B2_UB), averages them with dst0..dst3 packed by PCKEV_D2_UB and stores via ST8x4_UB at pdst/stride. */
+#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3,  \
+                           pdst, stride)                                \
+{                                                                       \
+    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
+    uint8_t *pdst_m = (uint8_t *) (pdst);                               \
+                                                                        \
+    PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m);                    \
+    PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                \
+    AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);        \
+    ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                           \
+}
+/* Horizontal 8-tap filter for a 4x4 block: loads 4 rows starting at src - 3 and stores 4 rows of 4 bytes to dst. */
+static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v8i16 filt, out0, out1;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);  /* first "4 width" pattern */
+    src -= 3;  /* begin three bytes left of the output column */
+
+    /* rearranging filter: halfwords 0..3 of the loaded taps go to filt0..filt3 */
+    filt = LD_SH(filter);  /* NOTE(review): presumably a full-vector load - confirm filter storage size */
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* masks used with filt1..filt3: same pattern advanced by 2, 4 and 6 bytes */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1);
+    SRARI_H2_SH(out0, out1, 7);  /* rounding shift right by 7 */
+    SAT_SH2_SH(out0, out1, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+/* Horizontal 8-tap filter for a 4x8 block: two groups of four rows are filtered, then stored as 4-byte rows to dst. */
+static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 filt0, filt1, filt2, filt3;
+    v16i8 src0, src1, src2, src3;
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);  /* first "4 width" pattern */
+    src -= 3;  /* begin three bytes left of the output column */
+
+    /* rearranging filter: halfwords 0..3 of the loaded taps go to filt0..filt3 */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* masks used with filt1..filt3: same pattern advanced by 2, 4 and 6 bytes */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    /* rows 0..3 -> out0/out1 */
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);  /* rows 4..7 -> out2/out3 */
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 7);  /* rounding shift right by 7 */
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+/* Horizontal 8-tap filter for a 4x16 block: the 4x8 sequence (two groups of four rows) is carried out twice. */
+static void common_hz_8t_4x16_msa(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  const int8_t *filter)
+{
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);  /* first "4 width" pattern */
+    src -= 3;  /* begin three bytes left of the output column */
+
+    /* rearranging filter: halfwords 0..3 of the loaded taps go to filt0..filt3 */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* masks used with filt1..filt3: same pattern advanced by 2, 4 and 6 bytes */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    /* first half: rows 0..7 */
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 7);  /* rounding shift right by 7 */
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* second half: rows 8..15, out0..out3 are reused */
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out2, out3);
+
+    SRARI_H4_SH(out0, out1, out2, out3, 7);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+/* Dispatches the 4-wide horizontal 8-tap filter by block height (4, 8 or 16); any other height does nothing. */
+static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    switch (height) {
+    case 4:  common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+             break;
+    case 8:  common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+             break;
+    case 16: common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter);
+    }
+}
+/* Horizontal 8-tap filter for an 8x4 block: loads 4 rows starting at src - 3 and stores them with ST8x4_UB to dst. */
+static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);  /* "8 width" pattern */
+    src -= 3;  /* begin three bytes left of the output column */
+
+    /* rearranging filter: halfwords 0..3 of the loaded taps go to filt0..filt3 */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* masks used with filt1..filt3: same pattern advanced by 2, 4 and 6 bytes */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1,
+                               out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, 7);  /* rounding shift right by 7 */
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    tmp0 = PCKEV_XORI128_UB(out0, out1);
+    tmp1 = PCKEV_XORI128_UB(out2, out3);
+    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+}
+/* Horizontal 8-tap filter for 8-wide blocks: handles height >> 2 groups of four rows; rows beyond a multiple of four are not processed. */
+static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);  /* "8 width" pattern */
+    src -= 3;  /* begin three bytes left of the output column */
+
+    /* rearranging filter: halfwords 0..3 of the loaded taps go to filt0..filt3 */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* masks used with filt1..filt3: same pattern advanced by 2, 4 and 6 bytes */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {  /* four rows per iteration */
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (4 * src_stride);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);  /* rounding shift right by 7 */
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        tmp1 = PCKEV_XORI128_UB(out2, out3);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+/* Dispatches the 8-wide horizontal 8-tap filter: height 4 uses the single-pass 8x4 kernel, every other height the looped kernel. */
+static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    if (height != 4) {
+        common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
+                                 height);
+        return;
+    }
+    common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+}
+/* Horizontal 8-tap filter for 16-wide blocks: handles height >> 1 pairs of rows, storing one 16-byte vector per row to dst. */
+static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);  /* "8 width" pattern */
+    src -= 3;  /* begin three bytes left of the output column */
+
+    /* rearranging filter: halfwords 0..3 of the loaded taps go to filt0..filt3 */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* masks used with filt1..filt3: same pattern advanced by 2, 4 and 6 bytes */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {  /* two rows per iteration */
+        LD_SB2(src, src_stride, src0, src2);  /* src0 = first row, src2 = second row */
+        LD_SB2(src + 8, src_stride, src1, src3);  /* same two rows, 8 bytes further right */
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (2 * src_stride);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);  /* rounding shift right by 7 */
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        out = PCKEV_XORI128_UB(out0, out1);  /* first row */
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out2, out3);  /* second row */
+        ST_UB(out, dst);
+        dst += dst_stride;
+    }
+}
+/*
+ * Horizontal 8-tap filter over a 32-pixel-wide block, two rows per loop pass.
+ *
+ * src, src_stride: source pixels and the distance between source rows.
+ * dst, dst_stride: destination pixels and the distance between output rows.
+ * filter:          signed 8-bit taps, fetched with a single LD_SH().
+ * height:          rows to produce; only (height >> 1) * 2 rows are written.
+ *
+ * Each row is read with three vector loads (offsets 0, 16 and 24 from the
+ * adjusted src); the fourth input vector is derived from the first two with
+ * __msa_sldi_b().  The loads for the second row of a pass are issued before
+ * the first row's results are stored.
+ *
+ * NOTE(review): the helper macros used here are defined outside this view;
+ * remarks about what they do are inferred from names and arguments - confirm.
+ */
+static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3; /* begin reading 3 bytes to the left of the output column */
+
+    /* load the taps, then splat halfword lanes 0..3 into filt0..filt3 */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    /* the remaining masks are mask0 with 2, 4 and 6 added to every lane */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        src0 = LD_SB(src);
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 24);
+        src1 = __msa_sldi_b(src2, src0, 8); /* presumably the bytes at src + 8 */
+        src += src_stride;
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        /* fetch the second row before the first row's result is stored */
+        src0 = LD_SB(src);
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 24);
+        src1 = __msa_sldi_b(src2, src0, 8);
+        src += src_stride;
+        /* store the first row: bytes 0..15, then bytes 16..31 */
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst + 16);
+        dst += dst_stride;
+        /* filter and store the second row */
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst + 16);
+        dst += dst_stride;
+    }
+}
+/*
+ * Horizontal 8-tap filter over a 64-pixel-wide block, one row per loop pass.
+ *
+ * src, src_stride: source pixels and the distance between source rows.
+ * dst, dst_stride: destination pixels and the distance between output rows.
+ * filter:          signed 8-bit taps, fetched with a single LD_SH().
+ * height:          number of rows to produce (one loop pass each).
+ *
+ * A row is handled as two 32-byte halves: the first half reads at offsets
+ * 0/16/24 and writes dst + 0 and dst + 16, the second reads at offsets
+ * 32/48/56 and writes dst + 32 and dst + 48.
+ *
+ * NOTE(review): the helper macros used here are defined outside this view;
+ * remarks about what they do are inferred from names and arguments - confirm.
+ */
+static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3; /* begin reading 3 bytes to the left of the output column */
+
+    /* load the taps, then splat halfword lanes 0..3 into filt0..filt3 */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    /* the remaining masks are mask0 with 2, 4 and 6 added to every lane */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = height; loop_cnt--;) {
+        src0 = LD_SB(src);
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 24);
+        src1 = __msa_sldi_b(src2, src0, 8); /* presumably the bytes at src + 8 */
+        /* first half of the row: output bytes 0..31 */
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                                   mask2, mask3, filt0, filt1, filt2, filt3,
+                                   out0, out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst);
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst + 16);
+        /* second half of the row: same pattern, 32 bytes further right */
+        src0 = LD_SB(src + 32);
+        src2 = LD_SB(src + 48);
+        src3 = LD_SB(src + 56);
+        src1 = __msa_sldi_b(src2, src0, 8);
+        src += src_stride;
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                                   mask2, mask3, filt0, filt1, filt2, filt3,
+                                   out0, out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        out = PCKEV_XORI128_UB(out0, out1);
+        ST_UB(out, dst + 32);
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst + 48);
+        dst += dst_stride;
+    }
+}
+/*
+ * Vertical 8-tap filter over a 4-pixel-wide block.
+ *
+ * src, src_stride: source pixels and the distance between source rows.
+ * dst, dst_stride: destination pixels and the distance between output rows.
+ * filter:          signed 8-bit taps, fetched with a single LD_SH().
+ * height:          rows to produce; four rows are written per loop pass, so
+ *                  only (height >> 2) * 4 rows are covered.
+ *
+ * Seven rows, starting three rows above src, are loaded before the loop.
+ * Each pass loads four more rows and carries the newest packed row vectors
+ * (and the last loaded row) over to the next pass.
+ *
+ * NOTE(review): the helper macros used here are defined outside this view;
+ * remarks about what they do are inferred from names and arguments - confirm.
+ */
+static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
+    v16i8 src10998, filt0, filt1, filt2, filt3;
+    v16u8 out;
+    v8i16 filt, out10, out32;
+
+    src -= (3 * src_stride); /* start three rows above the first output row */
+
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* prime the filter window with the first seven rows */
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    /* interleave adjacent rows, then pack two row pairs into each vector */
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+               src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
+               src4332, src6554);
+    XORI_B3_128_SB(src2110, src4332, src6554);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
+        XORI_B2_128_SB(src8776, src10998);
+        out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
+                                    filt1, filt2, filt3);
+        out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
+                                    filt1, filt2, filt3);
+        SRARI_H2_SH(out10, out32, 7);
+        SAT_SH2_SH(out10, out32, 7);
+        out = PCKEV_XORI128_UB(out10, out32);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* slide the window down four rows for the next pass */
+        src2110 = src6554;
+        src4332 = src8776;
+        src6554 = src10998;
+        src6 = src10;
+    }
+}
+/*
+ * Vertical 8-tap filter over an 8-pixel-wide block.
+ *
+ * src, src_stride: source pixels and the distance between source rows.
+ * dst, dst_stride: destination pixels and the distance between output rows.
+ * filter:          signed 8-bit taps, fetched with a single LD_SH().
+ * height:          rows to produce; four rows are written per loop pass, so
+ *                  only (height >> 2) * 4 rows are covered.
+ *
+ * Seven rows, starting three rows above src, are loaded before the loop.
+ * Each pass loads four more rows and reuses the interleaved row pairs from
+ * the previous pass.
+ *
+ * NOTE(review): the helper macros used here are defined outside this view;
+ * remarks about what they do are inferred from names and arguments - confirm.
+ */
+static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+
+    src -= (3 * src_stride); /* start three rows above the first output row */
+
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* prime the filter window with the first seven rows */
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+               src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        src += (4 * src_stride);
+        /* interleave the new rows with their predecessors (src6 is carried) */
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                     filt1, filt2, filt3);
+        out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                     filt1, filt2, filt3);
+        out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                     filt1, filt2, filt3);
+        out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                     filt1, filt2, filt3);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* slide the window down four rows for the next pass */
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src6 = src10;
+    }
+}
+/*
+ * Vertical 8-tap filter over a 16-pixel-wide block.
+ *
+ * src, src_stride: source pixels and the distance between source rows.
+ * dst, dst_stride: destination pixels and the distance between output rows.
+ * filter:          signed 8-bit taps, fetched with a single LD_SH().
+ * height:          rows to produce; four rows are written per loop pass, so
+ *                  only (height >> 2) * 4 rows are covered.
+ *
+ * Row pairs are interleaved twice, with ILVR_* into the "_r" vectors and
+ * with ILVL_* into the "_l" vectors; both sets are filtered and then packed
+ * together into one 16-byte output row.
+ *
+ * NOTE(review): the helper macros used here are defined outside this view;
+ * remarks about what they do are inferred from names and arguments - confirm.
+ */
+static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 filt0, filt1, filt2, filt3;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+    src -= (3 * src_stride); /* start three rows above the first output row */
+
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+    /* prime the filter window with the first seven rows */
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+               src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
+               src54_l, src21_l);
+    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        src += (4 * src_stride);
+        /* interleave the new rows with their predecessors (src6 is carried) */
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+                   src87_l, src98_l, src109_l);
+        out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                     filt1, filt2, filt3);
+        out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                     filt1, filt2, filt3);
+        out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                     filt1, filt2, filt3);
+        out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                     filt1, filt2, filt3);
+        out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
+                                     filt1, filt2, filt3);
+        out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
+                                     filt1, filt2, filt3);
+        out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
+                                     filt1, filt2, filt3);
+        out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
+                                     filt1, filt2, filt3);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, tmp0, tmp1, tmp2, tmp3);
+        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* slide the window down four rows for the next pass */
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src10_l = src54_l;
+        src32_l = src76_l;
+        src54_l = src98_l;
+        src21_l = src65_l;
+        src43_l = src87_l;
+        src65_l = src109_l;
+        src6 = src10;
+    }
+}
+/*
+ * Vertical 8-tap filter over a block whose width is a multiple of 16.
+ *
+ * src, src_stride: source pixels and the distance between source rows.
+ * dst, dst_stride: destination pixels and the distance between output rows.
+ * filter:          signed 8-bit taps, fetched with a single LD_SH().
+ * height:          rows to produce; four rows are written per inner pass,
+ *                  so only (height >> 2) * 4 rows are covered.
+ * width:           block width; (width >> 4) 16-byte column strips are
+ *                  processed, any remainder below 16 is not touched.
+ *
+ * The outer loop walks the strips left to right (src and dst advance by 16);
+ * the inner loop filters one strip top to bottom through src_tmp/dst_tmp.
+ *
+ * NOTE(review): the helper macros used here are defined outside this view;
+ * remarks about what they do are inferred from names and arguments - confirm.
+ */
+static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter, int32_t height,
+                                      int32_t width)
+{
+    const uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    uint32_t loop_cnt, cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 filt0, filt1, filt2, filt3;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+    src -= (3 * src_stride); /* start three rows above the first output row */
+
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    for (cnt = (width >> 4); cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+        /* prime the filter window with the first seven rows of this strip */
+        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+        src_tmp += (7 * src_stride);
+        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
+                   src32_r, src54_r, src21_r);
+        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
+                   src32_l, src54_l, src21_l);
+        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+        for (loop_cnt = (height >> 2); loop_cnt--;) {
+            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
+            XORI_B4_128_SB(src7, src8, src9, src10);
+            src_tmp += (4 * src_stride);
+            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                       src87_r, src98_r, src109_r);
+            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+                       src87_l, src98_l, src109_l);
+            out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
+                                         filt0, filt1, filt2, filt3);
+            out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
+                                         filt0, filt1, filt2, filt3);
+            out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
+                                         filt0, filt1, filt2, filt3);
+            out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
+                                         filt0, filt1, filt2, filt3);
+            out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
+                                         filt0, filt1, filt2, filt3);
+            out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
+                                         filt0, filt1, filt2, filt3);
+            out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
+                                         filt0, filt1, filt2, filt3);
+            out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
+                                         filt0, filt1, filt2, filt3);
+            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+            SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                        out3_r, tmp0, tmp1, tmp2, tmp3);
+            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+            ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
+            dst_tmp += (4 * dst_stride);
+            /* slide the window down four rows for the next pass */
+            src10_r = src54_r;
+            src32_r = src76_r;
+            src54_r = src98_r;
+            src21_r = src65_r;
+            src43_r = src87_r;
+            src65_r = src109_r;
+            src10_l = src54_l;
+            src32_l = src76_l;
+            src54_l = src98_l;
+            src21_l = src65_l;
+            src43_l = src87_l;
+            src65_l = src109_l;
+            src6 = src10;
+        }
+        /* move on to the next 16-byte column strip */
+        src += 16;
+        dst += 16;
+    }
+}
+
+/*
+ * Vertical 8-tap filter for a 32-pixel-wide block: forwards to the
+ * 16-column strip routine with a total width of 32.
+ */
+static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    const int32_t block_width = 32;
+
+    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+                              block_width);
+}
+
+/*
+ * Vertical 8-tap filter for a 64-pixel-wide block: forwards to the
+ * 16-column strip routine with a total width of 64.
+ */
+static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    const int32_t block_width = 64;
+
+    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+                              block_width);
+}
+/*
+ * Combined horizontal + vertical 8-tap filter over a 4-pixel-wide block.
+ *
+ * src, src_stride: source pixels and the distance between source rows.
+ * dst, dst_stride: destination pixels and the distance between output rows.
+ * filter_horiz:    signed 8-bit taps for the horizontal pass.
+ * filter_vert:     signed 8-bit taps for the vertical pass.
+ * height:          rows to produce; four rows are written per loop pass, so
+ *                  only (height >> 2) * 4 rows are covered.
+ *
+ * Rows are filtered horizontally two at a time (each HORIZ_8TAP_FILT call
+ * takes two source rows); the horizontal results are then paired up with
+ * __msa_sldi_b()/__msa_ilvev_b() and fed to the vertical filter.
+ *
+ * NOTE(review): the helper macros used here are defined outside this view;
+ * remarks about what they do are inferred from names and arguments - confirm.
+ */
+static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+    v16u8 mask0, mask1, mask2, mask3, out;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
+    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= (3 + 3 * src_stride); /* back up 3 bytes and 3 rows */
+
+    /* load the horizontal taps and splat halfword lanes 0..3 */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+    /* the remaining masks are mask0 with 2, 4 and 6 added to every lane */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    /* prime the filter window with the first seven rows */
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    /* horizontal pass over the priming rows, two rows per call */
+    hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
+    /* load the vertical taps and splat halfword lanes 0..3 */
+    filt = LD_SH(filter_vert);
+    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+    out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        src += (4 * src_stride);
+        /* rows 7/8: horizontal pass, then vertical filter -> tmp0 */
+        hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
+        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
+        tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+        /* rows 9/10: horizontal pass, then vertical filter -> tmp1 */
+        hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        hz_out8 = (v8i16) __msa_sldi_b((v16i8) hz_out9, (v16i8) hz_out7, 8);
+        out4 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
+        tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+        SRARI_H2_SH(tmp0, tmp1, 7);
+        SAT_SH2_SH(tmp0, tmp1, 7);
+        out = PCKEV_XORI128_UB(tmp0, tmp1);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* slide the window down four rows for the next pass */
+        hz_out5 = hz_out9;
+        out0 = out2;
+        out1 = out3;
+        out2 = out4;
+    }
+}
+/*
+ * Combined horizontal + vertical 8-tap filter over an 8-pixel-wide block.
+ *
+ * src, src_stride: source pixels and the distance between source rows.
+ * dst, dst_stride: destination pixels and the distance between output rows.
+ * filter_horiz:    signed 8-bit taps for the horizontal pass.
+ * filter_vert:     signed 8-bit taps for the vertical pass.
+ * height:          rows to produce; four rows are written per loop pass, so
+ *                  only (height >> 2) * 4 rows are covered.
+ *
+ * Every source row is filtered horizontally on its own (HORIZ_8TAP_FILT is
+ * given the same row twice).  Consecutive horizontal results are paired with
+ * ILVEV; out0..out3/out8 hold the pairs feeding output rows 0 and 2, while
+ * out4..out7/out9 hold the pairs feeding output rows 1 and 3.
+ *
+ * NOTE(review): the helper macros used here are defined outside this view;
+ * remarks about what they do are inferred from names and arguments - confirm.
+ */
+static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+    v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
+    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
+    v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= (3 + 3 * src_stride); /* back up 3 bytes and 3 rows */
+
+    /* load the horizontal taps and splat halfword lanes 0..3 */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+    /* the remaining masks are mask0 with 2, 4 and 6 added to every lane */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+    /* prime the filter window with the first seven rows */
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride);
+    /* horizontal pass over the priming rows, one row per call */
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    /* load the vertical taps and splat halfword lanes 0..3 */
+    filt = LD_SH(filter_vert);
+    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+    /* pair rows (0,1) (2,3) (4,5) and, offset by one, (1,2) (3,4) (5,6) */
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+    ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
+    ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        /* output row 0 */
+        hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
+        tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+        /* output row 1 */
+        hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
+        tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+        /* output row 2 */
+        hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        out8 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
+        tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0,
+                                   filt_vt1, filt_vt2, filt_vt3);
+        /* output row 3 */
+        hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
+                                   filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        out9 = (v8i16) __msa_ilvev_b((v16i8) hz_out10, (v16i8) hz_out9);
+        tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST8x4_UB(vec0, vec1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* slide the window down four rows for the next pass */
+        hz_out6 = hz_out10;
+        out0 = out2;
+        out1 = out3;
+        out2 = out8;
+        out4 = out6;
+        out5 = out7;
+        out6 = out9;
+    }
+}
+
+/*
+ * Combined horizontal + vertical 8-tap filter for a 16-pixel-wide block,
+ * done as two 8-pixel-wide columns handled left to right.
+ */
+static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height)
+{
+    int32_t col;
+
+    for (col = 0; col < 16; col += 8) {
+        common_hv_8ht_8vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
+                                 filter_horiz, filter_vert, height);
+    }
+}
+
+/*
+ * Combined horizontal + vertical 8-tap filter for a 32-pixel-wide block,
+ * done as four 8-pixel-wide columns handled left to right.
+ */
+static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height)
+{
+    int32_t col;
+
+    for (col = 0; col < 32; col += 8) {
+        common_hv_8ht_8vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
+                                 filter_horiz, filter_vert, height);
+    }
+}
+
+/*
+ * Combined horizontal + vertical 8-tap filter for a 64-pixel-wide block,
+ * done as eight 8-pixel-wide columns handled left to right.
+ */
+static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height)
+{
+    int32_t col;
+
+    for (col = 0; col < 64; col += 8) {
+        common_hv_8ht_8vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
+                                 filter_horiz, filter_vert, height);
+    }
+}
+/*
+ * Horizontal 8-tap filter over a 4x4 block, combined with the pixels that
+ * are already in dst before being written back.
+ *
+ * src, src_stride: source pixels and the distance between source rows.
+ * dst, dst_stride: destination; four rows are read, then overwritten.
+ * filter:          signed 8-bit taps, fetched with a single LD_SH().
+ *
+ * NOTE(review): the helper macros used here are defined outside this view;
+ * remarks about what they do (e.g. AVER_UB2_UB averaging the filtered result
+ * with dst) are inferred from names and arguments - confirm.
+ */
+static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 dst0, dst1, dst2, dst3, res2, res3;
+    v16u8 mask0, mask1, mask2, mask3;
+    v8i16 filt, res0, res1;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= 3; /* begin reading 3 bytes to the left of the output column */
+
+    /* load the taps, then splat halfword lanes 0..3 into filt0..filt3 */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    /* the remaining masks are mask0 with 2, 4 and 6 added to every lane */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, res0, res1);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); /* current dst rows */
+    SRARI_H2_SH(res0, res1, 7);
+    SAT_SH2_SH(res0, res1, 7);
+    PCKEV_B2_UB(res0, res0, res1, res1, res2, res3);
+    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); /* pair up dst rows */
+    XORI_B2_128_UB(res2, res3);
+    AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3);
+    ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+/*
+ * Horizontal 8-tap filter over a 4x8 block, combined with the pixels that
+ * are already in dst before being written back.
+ *
+ * src, src_stride: source pixels and the distance between source rows.
+ * dst, dst_stride: destination; eight rows are read, then overwritten.
+ * filter:          signed 8-bit taps, fetched with a single LD_SH().
+ *
+ * The eight source rows are filtered as two groups of four.
+ *
+ * NOTE(review): the helper macros used here are defined outside this view;
+ * remarks about what they do (e.g. AVER_UB2_UB averaging the filtered result
+ * with dst) are inferred from names and arguments - confirm.
+ */
+static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8i16 filt, vec0, vec1, vec2, vec3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= 3; /* begin reading 3 bytes to the left of the output column */
+
+    /* load the taps, then splat halfword lanes 0..3 into filt0..filt3 */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    /* the remaining masks are mask0 with 2, 4 and 6 added to every lane */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3); /* source rows 0..3 */
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, vec0, vec1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3); /* source rows 4..7 */
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, vec2, vec3);
+    SRARI_H4_SH(vec0, vec1, vec2, vec3, 7);
+    SAT_SH4_SH(vec0, vec1, vec2, vec3, 7);
+    PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
+                res0, res1, res2, res3);
+    ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
+    XORI_B2_128_UB(res0, res2);
+    ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
+               dst0, dst2, dst4, dst6);
+    ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4);
+    AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2);
+    ST4x8_UB(res0, res2, dst, dst_stride);
+}
+
+/*
+ * Horizontal 8-tap filter with dst averaging for 4-pixel-wide blocks:
+ * selects the 4x4 or 4x8 routine from the height.  Heights other than
+ * 4 and 8 are not handled and leave dst untouched.
+ */
+static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst, int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height)
+{
+    switch (height) {
+    case 4:
+        common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
+                                          filter);
+        break;
+    case 8:
+        common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
+                                          filter);
+        break;
+    default:
+        break;
+    }
+}
+
+static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst, int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height)
+{   /* Horizontal 8-tap filter, 8 wide (per helper names), averaged into dst. */
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3; /* filter window begins 3 bytes before the output column */
+
+    /* load the filter as 16-bit lanes and splat lanes 0..3 into filt0..3 */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2; /* mask1..3 = mask0 with 2, 4, 6 added to each lane */
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* four rows per pass */
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3); /* presumably x ^ 128 bias */
+        src += (4 * src_stride);
+        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   mask3, filt0, filt1, filt2, filt3, out0,
+                                   out1, out2, out3);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); /* rows to blend */
+        SRARI_H4_SH(out0, out1, out2, out3, 7); /* shift by 7 (name: rounded) */
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
+                                dst, dst_stride);
+        dst += (4 * dst_stride);
+    }   /* rows beyond the last multiple of 4 are not processed */
+}
+
+static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height)
+{   /* Horizontal 8-tap filter on 16-byte-wide rows, averaged into dst. */
+    int32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3; /* filter window begins 3 bytes before the output column */
+
+    /* load the filter as 16-bit lanes and splat lanes 0..3 into filt0..3 */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2; /* mask1..3 = mask0 with 2, 4, 6 added to each lane */
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = height >> 1; loop_cnt--;) { /* two rows per pass */
+        LD_SB2(src, src_stride, src0, src2); /* both rows at byte offset 0 */
+        LD_SB2(src + 8, src_stride, src1, src3); /* both rows at offset 8 */
+        src += (2 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
+                   vec12);
+        VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
+                   vec13);
+        VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
+                   vec14);
+        VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
+                   vec15);
+        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
+                    vec1, vec2, vec3); /* partial sum A: mask0 data * filt0 */
+        DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
+                    vec9, vec10, vec11); /* partial sum B: mask2 data * filt2 */
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
+                     vec1, vec2, vec3); /* A += mask1 data * filt1 */
+        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
+                     vec8, vec9, vec10, vec11); /* B += mask3 data * filt3 */
+        ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
+                    out1, out2, out3); /* out = A + B */
+        LD_UB2(dst, dst_stride, dst0, dst1); /* current dst rows to blend */
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst); /* first row */
+        dst += dst_stride;
+        PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst); /* second row */
+        dst += dst_stride;
+    }   /* an odd trailing row is not processed */
+}
+
+static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height)
+{   /* Horizontal 8-tap filter on 32-byte-wide rows, averaged into dst. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3; /* filter window begins 3 bytes before the output column */
+
+    /* load the filter as 16-bit lanes and splat lanes 0..3 into filt0..3 */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2; /* mask1..3 = mask0 with 2, 4, 6 added to each lane */
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = height; loop_cnt--;) { /* one row per pass */
+        src0 = LD_SB(src);
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 24);
+        src1 = __msa_sldi_b(src2, src0, 8); /* spliced from src0/src2, no load */
+        src += src_stride;
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
+                   vec12);
+        VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
+                   vec13);
+        VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
+                   vec14);
+        VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
+                   vec15);
+        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
+                    vec1, vec2, vec3); /* partial sum A: mask0 data * filt0 */
+        DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
+                    vec9, vec10, vec11); /* partial sum B: mask2 data * filt2 */
+        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
+                     vec1, vec2, vec3); /* A += mask1 data * filt1 */
+        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
+                     vec8, vec9, vec10, vec11); /* B += mask3 data * filt3 */
+        ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
+                    out1, out2, out3); /* out = A + B */
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        LD_UB2(dst, 16, dst1, dst2); /* the two 16-byte halves of the dst row */
+        PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst);
+        PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16);
+        dst += dst_stride;
+    }
+}
+
+static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height)
+{   /* Horizontal 8-tap filter on 64-byte-wide rows, averaged into dst. */
+    uint32_t loop_cnt, cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+    v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
+    v8i16 filt, out0, out1, out2, out3;
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= 3; /* filter window begins 3 bytes before the output column */
+
+    /* load the filter as 16-bit lanes and splat lanes 0..3 into filt0..3 */
+    filt = LD_SH(filter);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    mask1 = mask0 + 2; /* mask1..3 = mask0 with 2, 4, 6 added to each lane */
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    for (loop_cnt = height; loop_cnt--;) { /* one row per pass */
+        for (cnt = 0; cnt < 2; ++cnt) { /* two 32-byte halves (cnt << 5) */
+            src0 = LD_SB(&src[cnt << 5]);
+            src2 = LD_SB(&src[16 + (cnt << 5)]);
+            src3 = LD_SB(&src[24 + (cnt << 5)]);
+            src1 = __msa_sldi_b(src2, src0, 8); /* spliced, not loaded */
+
+            XORI_B4_128_SB(src0, src1, src2, src3);
+            VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
+                       vec12);
+            VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
+                       vec13);
+            VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6,
+                       vec10, vec14);
+            VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7,
+                       vec11, vec15);
+            DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                        vec0, vec1, vec2, vec3); /* partial sum A */
+            DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2,
+                        vec8, vec9, vec10, vec11); /* partial sum B */
+            DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
+                         vec0, vec1, vec2, vec3); /* A accumulates filt1 */
+            DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
+                         vec8, vec9, vec10, vec11); /* B accumulates filt3 */
+            ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
+                        out1, out2, out3); /* out = A + B */
+            SRARI_H4_SH(out0, out1, out2, out3, 7);
+            SAT_SH4_SH(out0, out1, out2, out3, 7);
+            LD_UB2(&dst[cnt << 5], 16, dst1, dst2); /* dst bytes to blend */
+            PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]);
+            PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]);
+        }
+
+        src += src_stride; /* advance only after both halves are done */
+        dst += dst_stride;
+    }
+}
+
+static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst, int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height)
+{   /* Vertical 8-tap filter, 4 wide (per helper names), averaged into dst. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16u8 dst0, dst1, dst2, dst3, out;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
+    v16i8 src10998, filt0, filt1, filt2, filt3;
+    v8i16 filt, out10, out32;
+
+    src -= (3 * src_stride); /* first tap row lies 3 rows above the output */
+
+    filt = LD_SH(filter); /* 16-bit lanes 0..3 are splatted into filt0..3 */
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride); /* 7 rows preloaded to prime the tap window */
+
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+               src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
+               src4332, src6554);
+    XORI_B3_128_SB(src2110, src4332, src6554); /* presumably x ^ 128 bias */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* four rows per pass */
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); /* rows to blend */
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
+        XORI_B2_128_SB(src8776, src10998);
+        out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
+                                    filt1, filt2, filt3);
+        out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
+                                    filt1, filt2, filt3);
+        SRARI_H2_SH(out10, out32, 7);
+        SAT_SH2_SH(out10, out32, 7);
+        out = PCKEV_XORI128_UB(out10, out32);
+        ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+
+        dst0 = (v16u8) __msa_ilvr_d((v2i64) dst2, (v2i64) dst0);
+        out = __msa_aver_u_b(out, dst0); /* unsigned byte average with dst */
+
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src2110 = src6554; /* slide the interleaved row window for next pass */
+        src4332 = src8776;
+        src6554 = src10998;
+        src6 = src10; /* last loaded row pairs with the next pass's src7 */
+    }
+}
+
+static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst, int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height)
+{   /* Vertical 8-tap filter, 8 wide (per helper names), averaged into dst. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16u8 dst0, dst1, dst2, dst3;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
+    v8i16 filt, out0, out1, out2, out3;
+
+    src -= (3 * src_stride); /* first tap row lies 3 rows above the output */
+
+    filt = LD_SH(filter); /* 16-bit lanes 0..3 are splatted into filt0..3 */
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride); /* 7 rows preloaded to prime the tap window */
+
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+               src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* four rows per pass */
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); /* rows to blend */
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                   filt1, filt2, filt3);
+        out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                   filt1, filt2, filt3);
+        out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                   filt1, filt2, filt3);
+        out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                   filt1, filt2, filt3);
+        SRARI_H4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
+                                dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r; /* slide the interleaved row pairs down by 4 rows */
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src6 = src10; /* last loaded row pairs with the next pass's src7 */
+    }
+}
+
+static void common_vt_8t_and_aver_dst_16w_mult_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   const int8_t *filter,
+                                                   int32_t height,
+                                                   int32_t width)
+{   /* Vertical 8-tap filter + dst average over width/16 column strips. */
+    const uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    uint32_t loop_cnt, cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+    v16i8 filt0, filt1, filt2, filt3;
+    v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
+
+    src -= (3 * src_stride); /* first tap row lies 3 rows above the output */
+
+    filt = LD_SH(filter); /* 16-bit lanes 0..3 are splatted into filt0..3 */
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+    for (cnt = (width >> 4); cnt--;) { /* one 16-byte column strip per pass */
+        src_tmp = src; /* per-strip cursors; src/dst keep the strip origin */
+        dst_tmp = dst;
+
+        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+        src_tmp += (7 * src_stride); /* 7 rows preloaded for this strip */
+
+        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
+                   src32_r, src54_r, src21_r);
+        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
+                   src32_l, src54_l, src21_l);
+        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+        for (loop_cnt = (height >> 2); loop_cnt--;) { /* four rows per pass */
+            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
+            src_tmp += (4 * src_stride);
+
+            LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3);
+            XORI_B4_128_SB(src7, src8, src9, src10);
+            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                       src87_r, src98_r, src109_r);
+            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+                       src87_l, src98_l, src109_l);
+            out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
+                                         filt0, filt1, filt2, filt3);
+            out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
+                                         filt0, filt1, filt2, filt3);
+            out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
+                                         filt0, filt1, filt2, filt3);
+            out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
+                                         filt0, filt1, filt2, filt3);
+            out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
+                                         filt0, filt1, filt2, filt3);
+            out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
+                                         filt0, filt1, filt2, filt3);
+            out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
+                                         filt0, filt1, filt2, filt3);
+            out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
+                                         filt0, filt1, filt2, filt3);
+            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+            SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                        out3_r, tmp0, tmp1, tmp2, tmp3); /* merge _r/_l halves */
+            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+            AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
+                        dst0, dst1, dst2, dst3); /* blend with loaded dst rows */
+            ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
+            dst_tmp += (4 * dst_stride);
+
+            src10_r = src54_r; /* slide both interleave sets down by 4 rows */
+            src32_r = src76_r;
+            src54_r = src98_r;
+            src21_r = src65_r;
+            src43_r = src87_r;
+            src65_r = src109_r;
+            src10_l = src54_l;
+            src32_l = src76_l;
+            src54_l = src98_l;
+            src21_l = src65_l;
+            src43_l = src87_l;
+            src65_l = src109_l;
+            src6 = src10; /* last loaded row pairs with the next pass's src7 */
+        }
+
+        src += 16; /* move to the next 16-byte column strip */
+        dst += 16;
+    }
+}
+
+static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height)
+{   /* Fixed-width entry point: forwards to the 16w_mult kernel, width 16. */
+    common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
+                                           filter, height, 16);
+}
+
+static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height)
+{   /* Fixed-width entry point: forwards to the 16w_mult kernel, width 32. */
+    common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
+                                           filter, height, 32);
+}
+
+static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height)
+{   /* Fixed-width entry point: forwards to the 16w_mult kernel, width 64. */
+    common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
+                                           filter, height, 64);
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  const int8_t *filter_horiz,
+                                                  const int8_t *filter_vert,
+                                                  int32_t height)
+{   /* Horizontal then vertical 8-tap filter, 4 wide, averaged into dst. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1;
+    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
+    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[16]);
+    src -= (3 + 3 * src_stride); /* back up 3 bytes and 3 rows */
+
+    /* horizontal filter: 16-bit lanes 0..3 splatted into filt_hz0..3 */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+    mask1 = mask0 + 2; /* mask1..3 = mask0 with 2, 4, 6 added to each lane */
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride); /* 7 rows preloaded to prime the vertical taps */
+
+    hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
+
+    filt = LD_SH(filter_vert); /* vertical taps are splatted as 16-bit lanes */
+    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* four rows per pass */
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); /* rows to blend */
+        hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
+        vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
+        res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+
+        hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        hz_out8 = (v8i16) __msa_sldi_b((v16i8) hz_out9, (v16i8) hz_out7, 8);
+        vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
+        res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+        ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+
+        SRARI_H2_SH(res0, res1, 7);
+        SAT_SH2_SH(res0, res1, 7);
+        PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1);
+        XORI_B2_128_UB(tmp0, tmp1);
+        AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1); /* blend with dst */
+        ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out5 = hz_out9; /* carry the newest horizontal result forward */
+        vec0 = vec2; /* slide the vertical tap window for the next pass */
+        vec1 = vec3;
+        vec2 = vec4;
+    }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  const int8_t *filter_horiz,
+                                                  const int8_t *filter_vert,
+                                                  int32_t height)
+{   /* Horizontal then vertical 8-tap filter, 8 wide, averaged into dst. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+    v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
+    v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
+
+    mask0 = LD_UB(&mc_filt_mask_arr[0]);
+    src -= (3 + 3 * src_stride); /* back up 3 bytes and 3 rows */
+
+    /* horizontal filter: 16-bit lanes 0..3 splatted into filt_hz0..3 */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+    mask1 = mask0 + 2; /* mask1..3 = mask0 with 2, 4, 6 added to each lane */
+    mask2 = mask0 + 4;
+    mask3 = mask0 + 6;
+
+    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    src += (7 * src_stride); /* 7 rows preloaded to prime the vertical taps */
+
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+    hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                              filt_hz1, filt_hz2, filt_hz3);
+
+    filt = LD_SH(filter_vert); /* vertical taps are splatted as 16-bit lanes */
+    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+    ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
+    ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) { /* four rows per pass */
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        src += (4 * src_stride);
+
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); /* rows to blend */
+
+        hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
+        tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+
+        hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
+        tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+
+        hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
+                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        out8 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
+        tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+
+        hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
+                                   filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+        out9 = (v8i16) __msa_ilvev_b((v16i8) hz_out10, (v16i8) hz_out9);
+        tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
+                                   filt_vt2, filt_vt3);
+
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3,
+                                dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out6 = hz_out10; /* carry the newest horizontal result forward */
+        out0 = out2; /* slide the window feeding tmp0/tmp2 */
+        out1 = out3;
+        out2 = out8;
+        out4 = out6; /* slide the window feeding tmp1/tmp3 */
+        out5 = out7;
+        out6 = out9;
+    }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   const int8_t *filter_horiz,
+                                                   const int8_t *filter_vert,
+                                                   int32_t height)
+{
+    int32_t col;
+
+    /* The 16-byte width is covered as two 8-byte-wide column strips, each
+     * handed to the 8w kernel; src and dst use the same byte offset and
+     * the strips are visited left to right. */
+    for (col = 0; col < 16; col += 8) {
+        common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride,
+                                              dst + col, dst_stride,
+                                              filter_horiz, filter_vert,
+                                              height);
+    }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   const int8_t *filter_horiz,
+                                                   const int8_t *filter_vert,
+                                                   int32_t height)
+{
+    int32_t col;
+
+    /* The 32-byte width is covered as four 8-byte-wide column strips, each
+     * handed to the 8w kernel; src and dst use the same byte offset and
+     * the strips are visited left to right. */
+    for (col = 0; col < 32; col += 8) {
+        common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride,
+                                              dst + col, dst_stride,
+                                              filter_horiz, filter_vert,
+                                              height);
+    }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   const int8_t *filter_horiz,
+                                                   const int8_t *filter_vert,
+                                                   int32_t height)
+{
+    int32_t col;
+
+    /* The 64-byte width is covered as eight 8-byte-wide column strips, each
+     * handed to the 8w kernel; src and dst use the same byte offset and
+     * the strips are visited left to right. */
+    for (col = 0; col < 64; col += 8) {
+        common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride,
+                                              dst + col, dst_stride,
+                                              filter_horiz, filter_vert,
+                                              height);
+    }
+}
+
+/* Horizontal 2-tap filter kernel for one 4x4 block. */
+static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, mask;
+    v16u8 filt0, vec0, vec1, res0, res1;
+    v8u16 vec2, vec3, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+    /* Broadcast halfword 0 of the filter load (presumably taps 0 and 1). */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+    /* NOTE(review): macro names suggest load, shuffle, dot, round, pack. */
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
+    SRARI_H2_UH(vec2, vec3, 7);
+    PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+/* Horizontal 2-tap filter kernel for a 4x8 block, stored as two 4x4 halves. */
+static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16u8 vec0, vec1, vec2, vec3, filt0;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16i8 res0, res1, res2, res3;
+    v8u16 vec4, vec5, vec6, vec7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+    /* Broadcast halfword 0 of the filter load (presumably taps 0 and 1). */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec4, vec5, vec6, vec7);
+    SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
+    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
+                res0, res1, res2, res3);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+/* 4-wide horizontal bilinear put (heights 4 and 8 only); my is unused. */
+void ff_put_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                         const uint8_t *src, ptrdiff_t src_stride,
+                         int height, int mx, int my)
+{
+    const int8_t *taps = vp9_bilinear_filters_msa[mx - 1];
+
+    if (height == 4)
+        common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, taps);
+    else if (height == 8)
+        common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, taps);
+}
+
+/* Horizontal 2-tap filter kernel for one 8x4 block. */
+static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16u8 filt0;
+    v16i8 src0, src1, src2, src3, mask;
+    v8u16 vec0, vec1, vec2, vec3, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+    /* Broadcast halfword 0 of the filter load (presumably taps 0 and 1). */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+    /* src0 and src1 are reused further down as the packed output vectors. */
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
+    ST8x4_UB(src0, src1, dst, dst_stride);
+}
+
+/* Horizontal 2-tap filter, 8 wide: 8 rows, or 16 rows when height == 16. */
+static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height)
+{
+    v16u8 filt0;
+    v16i8 src0, src1, src2, src3, mask, out0, out1;
+    v8u16 vec0, vec1, vec2, vec3, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+    /* Broadcast halfword 0 of the filter load (presumably taps 0 and 1). */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    /* The next load group was issued above, before this group is packed. */
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* Everything below runs only for height == 16. */
+    if (16 == height) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    vec0, vec1, vec2, vec3);
+        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+
+        PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+
+        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    vec0, vec1, vec2, vec3);
+        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+        PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+        ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
+    }
+}
+
+/* 8-wide horizontal bilinear put; height 4 has its own kernel. my is unused. */
+void ff_put_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                         const uint8_t *src, ptrdiff_t src_stride,
+                         int height, int mx, int my)
+{
+    const int8_t *taps = vp9_bilinear_filters_msa[mx - 1];
+
+    if (height != 4)
+        common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, taps,
+                                 height);
+    else
+        common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, taps);
+}
+
+/* 16-wide horizontal bilinear put, four rows per step; my is not used. */
+void ff_put_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+    /* The first 4-row group runs before the loop, hence the -1. */
+    loop_cnt = (height >> 2) - 1;
+    /* Broadcast halfword 0 of the filter load (presumably taps 0 and 1). */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src2, src4, src6);
+    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                out0, out1, out2, out3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                out4, out5, out6, out7);
+    SRARI_H4_UH(out0, out1, out2, out3, 7);
+    SRARI_H4_UH(out4, out5, out6, out7, 7);
+    PCKEV_ST_SB(out0, out1, dst);
+    dst += dst_stride;
+    PCKEV_ST_SB(out2, out3, dst);
+    dst += dst_stride;
+    PCKEV_ST_SB(out4, out5, dst);
+    dst += dst_stride;
+    PCKEV_ST_SB(out6, out7, dst);
+    dst += dst_stride;
+    /* NOTE(review): height < 4 would wrap the unsigned loop_cnt. */
+    for (; loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    out4, out5, out6, out7);
+        SRARI_H4_UH(out0, out1, out2, out3, 7);
+        SRARI_H4_UH(out4, out5, out6, out7, 7);
+        PCKEV_ST_SB(out0, out1, dst);
+        dst += dst_stride;
+        PCKEV_ST_SB(out2, out3, dst);
+        dst += dst_stride;
+        PCKEV_ST_SB(out4, out5, dst);
+        dst += dst_stride;
+        PCKEV_ST_SB(out6, out7, dst);
+        dst += dst_stride;
+    }
+}
+
+/* 32-wide horizontal bilinear put; my is not used. */
+void ff_put_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+    /* Broadcast halfword 0 of the filter load (presumably taps 0 and 1). */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+    /* Two source rows in, two destination rows out per iteration. */
+    for (loop_cnt = height >> 1; loop_cnt--;) {
+        src0 = LD_SB(src);
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 24);
+        src1 = __msa_sldi_b(src2, src0, 8);
+        src += src_stride;
+        src4 = LD_SB(src);
+        src6 = LD_SB(src + 16);
+        src7 = LD_SB(src + 24);
+        src5 = __msa_sldi_b(src6, src4, 8);
+        src += src_stride;
+        /* src1/src5 come from sldi of the +0 and +16 loads, not from memory. */
+        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    out4, out5, out6, out7);
+        SRARI_H4_UH(out0, out1, out2, out3, 7);
+        SRARI_H4_UH(out4, out5, out6, out7, 7);
+        PCKEV_ST_SB(out0, out1, dst);
+        PCKEV_ST_SB(out2, out3, dst + 16);
+        dst += dst_stride;
+        PCKEV_ST_SB(out4, out5, dst);
+        PCKEV_ST_SB(out6, out7, dst + 16);
+        dst += dst_stride;
+    }
+}
+
+/* 64-wide horizontal bilinear put; my is not used. */
+void ff_put_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+    /* Broadcast halfword 0 of the filter load (presumably taps 0 and 1). */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+    /* One row per iteration, written as four 16-byte stores. */
+    for (loop_cnt = height; loop_cnt--;) {
+        src0 = LD_SB(src);
+        src2 = LD_SB(src + 16);
+        src4 = LD_SB(src + 32);
+        src6 = LD_SB(src + 48);
+        src7 = LD_SB(src + 56);
+        SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
+        src += src_stride;
+
+        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    out4, out5, out6, out7);
+        SRARI_H4_UH(out0, out1, out2, out3, 7);
+        SRARI_H4_UH(out4, out5, out6, out7, 7);
+        PCKEV_ST_SB(out0, out1, dst);
+        PCKEV_ST_SB(out2, out3, dst + 16);
+        PCKEV_ST_SB(out4, out5, dst + 32);
+        PCKEV_ST_SB(out6, out7, dst + 48);
+        dst += dst_stride;
+    }
+}
+
+/* Vertical 2-tap filter kernel for one 4x4 block (reads five source rows). */
+static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
+    v16u8 filt0;
+    v8i16 filt;
+    v8u16 tmp0, tmp1;
+
+    /* Broadcast halfword 0 of the filter load (presumably taps 0 and 1). */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, 7);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
+    ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+}
+
+/* Vertical 2-tap filter kernel for a 4x8 block, stored as two 4x4 halves. */
+static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v16u8 filt0;
+    v8i16 filt;
+
+    /* Broadcast halfword 0 of the filter load (presumably taps 0 and 1). */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    /* Nine vectors in total: src0..src7, then src8 at 8 * src_stride. */
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src8 = LD_SB(src + 8 * src_stride);
+
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+               src76_r, src87_r);
+    ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+               src87_r, src76_r, src2110, src4332, src6554, src8776);
+    DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
+    ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+    ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
+}
+
+/* 4-wide vertical bilinear put (heights 4 and 8 only); mx is unused. */
+void ff_put_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                         const uint8_t *src, ptrdiff_t src_stride,
+                         int height, int mx, int my)
+{
+    const int8_t *taps = vp9_bilinear_filters_msa[my - 1];
+
+    if (height == 4)
+        common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, taps);
+    else if (height == 8)
+        common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, taps);
+}
+
+/* Vertical 2-tap filter kernel for one 8x4 block (loads five vectors). */
+static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
+    v16i8 out0, out1;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    /* Broadcast halfword 0 of the filter load (presumably taps 0 and 1). */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
+    ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+/* Vertical 2-tap filter, 8 wide; each loop pass stores two 8x4 groups. */
+static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+    v16i8 out0, out1;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    /* Broadcast halfword 0 of the filter load (presumably taps 0 and 1). */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+    src0 = LD_UB(src);
+    src += src_stride;
+    /* height >> 3 iterations; src0 is the vector loaded just before src1. */
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
+        src += (8 * src_stride);
+
+        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+                   vec0, vec1, vec2, vec3);
+        ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   vec4, vec5, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* Carry the last loaded vector into the next iteration. */
+        src0 = src8;
+    }
+}
+
+/* 8-wide vertical bilinear put; height 4 has its own kernel. mx is unused. */
+void ff_put_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                         const uint8_t *src, ptrdiff_t src_stride,
+                         int height, int mx, int my)
+{
+    const int8_t *taps = vp9_bilinear_filters_msa[my - 1];
+
+    if (height != 4)
+        common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, taps,
+                                 height);
+    else
+        common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, taps);
+}
+
+/* 16-wide vertical bilinear put, four rows per iteration; mx is not used. */
+void ff_put_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
+    v16u8 src0, src1, src2, src3, src4;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    /* Broadcast halfword 0 of the filter load (presumably taps 0 and 1). */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+    src0 = LD_UB(src);
+    src += src_stride;
+    /* height >> 2 iterations; src0 is the row loaded just before src1. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst);
+        dst += dst_stride;
+
+        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst);
+        dst += dst_stride;
+
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst);
+        dst += dst_stride;
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst);
+        dst += dst_stride;
+        /* Carry the last loaded row into the next iteration. */
+        src0 = src4;
+    }
+}
+
+/* 32-wide vertical bilinear put, four rows per iteration; mx is not used. */
+void ff_put_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    /* Broadcast halfword 0 of the filter load (presumably taps 0 and 1). */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+    src0 = LD_UB(src);
+    src5 = LD_UB(src + 16);
+    src += src_stride;
+    /* Each pass stores at dst (src0..src4) and then at dst + 16 (src5..src9). */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+
+        LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
+        src += (4 * src_stride);
+
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst);
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
+
+        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);
+
+        ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
+        ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst + 16);
+
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);
+
+        ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
+        ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
+        dst += (4 * dst_stride);
+
+        src0 = src4;
+        src5 = src9;
+    }
+}
+
+/* 64-wide vertical bilinear put, two rows per iteration; mx is not used. */
+void ff_put_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+    v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v8i16 filt;
+
+    /* Stride 16 below: presumably four adjacent 16-byte pieces of one row. */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+    LD_UB4(src, 16, src0, src3, src6, src9);
+    src += src_stride;
+    /* Stores go to dst, dst + 16, dst + 32 and dst + 48 for each of two rows. */
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_UB2(src, src_stride, src1, src2);
+        LD_UB2(src + 16, src_stride, src4, src5);
+        LD_UB2(src + 32, src_stride, src7, src8);
+        LD_UB2(src + 48, src_stride, src10, src11);
+        src += (2 * src_stride);
+
+        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst);
+
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
+
+        ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
+        ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+        SRARI_H2_UH(tmp4, tmp5, 7);
+        SAT_UH2_UH(tmp4, tmp5, 7);
+        PCKEV_ST_SB(tmp4, tmp5, dst + 16);
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+        SRARI_H2_UH(tmp6, tmp7, 7);
+        SAT_UH2_UH(tmp6, tmp7, 7);
+        PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);
+
+        ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
+        ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_ST_SB(tmp0, tmp1, dst + 32);
+
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);
+
+        ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
+        ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+        SRARI_H2_UH(tmp4, tmp5, 7);
+        SAT_UH2_UH(tmp4, tmp5, 7);
+        PCKEV_ST_SB(tmp4, tmp5, dst + 48);
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+        SRARI_H2_UH(tmp6, tmp7, 7);
+        SAT_UH2_UH(tmp6, tmp7, 7);
+        PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
+        dst += (2 * dst_stride);
+
+        src0 = src2;
+        src3 = src5;
+        src6 = src8;
+        src9 = src11;
+    }
+}
+
+/* 2-tap horizontal then 2-tap vertical filter kernel for one 4x4 block. */
+static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter_horiz, const int8_t *filter_vert)
+{
+    v16i8 src0, src1, src2, src3, src4, mask;
+    v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
+    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+    /* Broadcast halfword 0 of each filter load; filt is reused for both. */
+    filt = LD_UH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    filt = LD_UH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
+    /* Inputs go to the horizontal macro in pairs; src4 is passed twice. */
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
+    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+    hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
+    hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
+
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, 7);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+/* 2-tap horizontal then vertical filter for a 4x8 block, stored as two 4x4. */
+static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter_horiz, const int8_t *filter_vert)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+    v16i8 res0, res1, res2, res3;
+    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+    /* Broadcast halfword 0 of each filter load; filt is reused for both. */
+    filt = LD_UH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    filt = LD_UH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    src8 = LD_SB(src);
+    /* Inputs go to the horizontal macro in pairs; src8 is passed twice. */
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
+    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
+    hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
+    hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
+    SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
+               hz_out3, hz_out5, 8);
+    hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
+
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
+                vec4, vec5, vec6, vec7);
+    SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
+    SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
+    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
+                res0, res1, res2, res3);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+/* 4-wide 2-D bilinear put; only heights 4 and 8 are handled. */
+void ff_put_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    const int8_t *taps_x = vp9_bilinear_filters_msa[mx - 1];
+    const int8_t *taps_y = vp9_bilinear_filters_msa[my - 1];
+
+    if (height == 4)
+        common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
+                                  taps_x, taps_y);
+    else if (height == 8)
+        common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
+                                  taps_x, taps_y);
+}
+
+/* 2-tap horizontal then 2-tap vertical filter kernel for one 8x4 block. */
+static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter_horiz, const int8_t *filter_vert)
+{
+    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+    v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+    /* Broadcast halfword 0 of each filter load; filt is reused for both. */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    /* hz_out0 and hz_out1 alternate as previous and current filtered input. */
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+    vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+    vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+    tmp1 = __msa_dotp_u_h(vec1, filt_vt);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+    vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+    tmp2 = __msa_dotp_u_h(vec2, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+    vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+    tmp3 = __msa_dotp_u_h(vec3, filt_vt);
+
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+                                   uint8_t *dst, int32_t dst_stride,
+                                   const int8_t *filter_horiz, const int8_t *filter_vert,
+                                   int32_t height)
+{   /* 8-wide 2-tap horizontal + 2-tap vertical "put"; each loop pass emits 8 rows. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+    v16u8 filt_hz, filt_vt, vec0;
+    v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+    v8i16 filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]); /* control vector given to every HORIZ_2TAP_FILT_UH below */
+
+    /* Broadcast halfword 0 of each filter (its first two int8 taps) to every lane. */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+
+    src0 = LD_SB(src); /* first row: only needed as the upper neighbour of output row 0 */
+    src += src_stride;
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+    /* height >> 3 passes: rows past the last multiple of 8 are never written. */
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_SB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+        /* hz_out0 / hz_out1 alternate as "previous" and "current" filtered row. */
+        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+
+        SRARI_H2_UH(tmp1, tmp2, 7); /* presumably a rounding right shift by 7 -- TODO confirm */
+        SAT_UH2_UH(tmp1, tmp2, 7);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+        LD_SB4(src, src_stride, src1, src2, src3, src4); /* src1..src4 are consumed: fetch rows 5..8 */
+        src += (4 * src_stride);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp4 = __msa_dotp_u_h(vec0, filt_vt);
+
+        SRARI_H2_UH(tmp3, tmp4, 7);
+        SAT_UH2_UH(tmp3, tmp4, 7);
+        PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride); /* first four output rows of this pass */
+        dst += (4 * dst_stride);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp5 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp6 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp7 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7); /* kept in hz_out0 for the next pass */
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp8 = __msa_dotp_u_h(vec0, filt_vt);
+
+        SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7);
+        SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
+        PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride); /* last four output rows of this pass */
+        dst += (4 * dst_stride);
+    }
+}
+
+void ff_put_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    /* 8-wide put, horizontal (mx) + vertical (my) taps; height 4 has its own kernel. */
+    const int8_t *taps_x = vp9_bilinear_filters_msa[mx - 1];
+    const int8_t *taps_y = vp9_bilinear_filters_msa[my - 1];
+
+    if (height != 4) {
+        common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
+                                      taps_x, taps_y, height);
+        return;
+    }
+    common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, taps_x, taps_y);
+}
+
+void ff_put_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                           const uint8_t *src, ptrdiff_t src_stride,
+                           int height, int mx, int my)
+{   /* 16-wide 2-tap horizontal (mx) + 2-tap vertical (my) "put"; 4 rows per loop pass. */
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
+    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt_hz, filt_vt, vec0, vec1;
+    v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]); /* control vector given to every HORIZ_2TAP_FILT_UH below */
+
+    /* Broadcast halfword 0 of each filter (its first two int8 taps) to every lane. */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB2(src, 8, src0, src1); /* first row as two vectors loaded 8 bytes apart */
+    src += src_stride;
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7); /* previous row, vector at src */
+    hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7); /* previous row, vector at src + 8 */
+
+    /* height >> 2 passes; hz_out0/2 and hz_out1/3 alternate as previous/current row. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);     /* even names: vectors at src */
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7); /* odd names: vectors at src + 8 */
+        src += (4 * src_stride);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+        hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); /* older row is passed first */
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+        SRARI_H2_UH(tmp1, tmp2, 7); /* presumably a rounding right shift by 7 -- TODO confirm */
+        SAT_UH2_UH(tmp1, tmp2, 7);
+        PCKEV_ST_SB(tmp1, tmp2, dst); /* presumably packs both halves and stores one 16-byte row */
+        dst += dst_stride;
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+        hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+        SRARI_H2_UH(tmp1, tmp2, 7);
+        SAT_UH2_UH(tmp1, tmp2, 7);
+        PCKEV_ST_SB(tmp1, tmp2, dst);
+        dst += dst_stride;
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+        hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+        SRARI_H2_UH(tmp1, tmp2, 7);
+        SAT_UH2_UH(tmp1, tmp2, 7);
+        PCKEV_ST_SB(tmp1, tmp2, dst);
+        dst += dst_stride;
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7); /* carried into the next pass */
+        hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7); /* carried into the next pass */
+        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+        SRARI_H2_UH(tmp1, tmp2, 7);
+        SAT_UH2_UH(tmp1, tmp2, 7);
+        PCKEV_ST_SB(tmp1, tmp2, dst);
+        dst += dst_stride;
+    }
+}
+
+void ff_put_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                           const uint8_t *src, ptrdiff_t src_stride,
+                           int height, int mx, int my)
+{
+    /* A 32-wide block is processed as two independent 16-column strips. */
+    int32_t col;
+
+    /* Strips sit 16 bytes apart in both buffers and are done left to right. */
+    for (col = 0; col < 32; col += 16) {
+        ff_put_bilin_16hv_msa(dst + col, dst_stride, src + col, src_stride,
+                              height, mx, my);
+    }
+}
+
+void ff_put_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                           const uint8_t *src, ptrdiff_t src_stride,
+                           int height, int mx, int my)
+{
+    /* A 64-wide block is processed as four independent 16-column strips. */
+    int32_t col;
+
+    /* Strips sit 16 bytes apart in both buffers and are done left to right. */
+    for (col = 0; col < 64; col += 16) {
+        ff_put_bilin_16hv_msa(dst + col, dst_stride, src + col, src_stride,
+                              height, mx, my);
+    }
+}
+
+static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter)
+{   /* 4x4 block: 2-tap horizontal filter, result combined with the rows already in dst. */
+    v16i8 src0, src1, src2, src3, mask;
+    v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1;
+    v8u16 vec2, vec3, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]); /* the 4-wide kernels here use entry 16, wider ones entry 0 */
+
+    /* Broadcast halfword 0 of the filter (its first two int8 taps) to every lane. */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); /* current destination rows, read before the store */
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); /* two source rows feed each vector */
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
+    SRARI_H2_UH(vec2, vec3, 7); /* presumably a rounding right shift by 7 -- TODO confirm */
+    PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
+    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); /* dst rows grouped as (0,1) and (2,3) */
+    AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); /* presumably a per-byte average with dst */
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter)
+{   /* 4x8 block: 2-tap horizontal filter, result combined with the rows already in dst. */
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8u16 vec4, vec5, vec6, vec7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[16]); /* the 4-wide kernels here use entry 16, wider ones entry 0 */
+
+    /* Broadcast halfword 0 of the filter (its first two int8 taps) to every lane. */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); /* all 8 dst rows read up front */
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); /* two source rows feed each vector */
+    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
+                vec6, vec7);
+    SRARI_H4_UH(vec4, vec5, vec6, vec7, 7); /* presumably a rounding right shift by 7 -- TODO confirm */
+    PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
+                res2, res3);
+    ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2,
+               dst4, dst6); /* dst rows grouped pairwise into dst0, dst2, dst4, dst6 */
+    AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1,
+                res2, res3); /* presumably a per-byte average with dst */
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); /* rows 0..3 */
+    dst += (4 * dst_stride);
+    ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); /* rows 4..7 */
+}
+
+void ff_avg_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                         const uint8_t *src, ptrdiff_t src_stride,
+                         int height, int mx, int my)
+{
+    /* 4-wide horizontal avg; taps chosen by mx, my unused; no-op unless height is 4 or 8. */
+    const int8_t *taps = vp9_bilinear_filters_msa[mx - 1];
+    if (height == 4) {
+        common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, taps);
+        return;
+    }
+    if (height == 8) {
+        common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, taps);
+    }
+}
+
+static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter)
+{   /* 8x4 block: 2-tap horizontal filter, result combined with the rows already in dst. */
+    v16i8 src0, src1, src2, src3, mask;
+    v16u8 filt0, dst0, dst1, dst2, dst3;
+    v8u16 vec0, vec1, vec2, vec3, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]); /* control vector for the VSHF shuffles below */
+
+    /* Broadcast halfword 0 of the filter (its first two int8 taps) to every lane. */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); /* one source row per vector */
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3); /* vec0..vec3 are overwritten with the filter sums */
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); /* presumably a rounding right shift by 7 -- TODO confirm */
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); /* current destination rows */
+    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
+                       dst, dst_stride); /* presumably packs, averages with dst and stores 4 rows */
+}
+
+static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  const int8_t *filter,
+                                                  int32_t height)
+{   /* 8-wide horizontal 2-tap + combine with dst: always 8 rows, 16 rows only if height == 16. */
+    v16i8 src0, src1, src2, src3, mask;
+    v16u8 filt0, dst0, dst1, dst2, dst3;
+    v8u16 vec0, vec1, vec2, vec3, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]); /* control vector for the VSHF shuffles below */
+
+    /* Broadcast halfword 0 of the filter (its first two int8 taps) to every lane. */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3); /* rows 0..3 */
+    src += (4 * src_stride);
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+                vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); /* presumably a rounding right shift by 7 -- TODO confirm */
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    LD_SB4(src, src_stride, src0, src1, src2, src3); /* rows 4..7, loaded before rows 0..3 are stored */
+    src += (4 * src_stride);
+    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
+                       dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+                vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
+                       dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* NOTE(review): any height other than 16 stops after the 8 rows above. */
+    if (16 == height) {
+        LD_SB4(src, src_stride, src0, src1, src2, src3); /* rows 8..11 */
+        src += (4 * src_stride);
+
+        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
+                    vec1, vec2, vec3);
+        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        LD_SB4(src, src_stride, src0, src1, src2, src3); /* rows 12..15; src is not advanced again */
+        PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
+                           dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
+                    vec1, vec2, vec3);
+        SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
+                           dst, dst_stride);
+    }
+}
+
+void ff_avg_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                         const uint8_t *src, ptrdiff_t src_stride,
+                         int height, int mx, int my)
+{
+    /* 8-wide horizontal avg; taps chosen by mx, my unused; height 4 has its own kernel. */
+    const int8_t *taps = vp9_bilinear_filters_msa[mx - 1];
+
+    if (height != 4) {
+        common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
+                                              taps, height);
+        return;
+    }
+    common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, taps);
+}
+
+void ff_avg_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{   /* 16-wide horizontal 2-tap filter (taps chosen by mx; my unused), combined with dst. */
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt0, dst0, dst1, dst2, dst3;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]); /* control vector for the VSHF shuffles below */
+
+    /* Broadcast halfword 0 of the filter (its first two int8 taps) to every lane. */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+    /* The first four rows are handled here, outside the loop. */
+    LD_SB4(src, src_stride, src0, src2, src4, src6);     /* even names: vectors at src */
+    LD_SB4(src + 8, src_stride, src1, src3, src5, src7); /* odd names: vectors at src + 8 */
+    src += (4 * src_stride);
+
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
+                res2, res3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
+                res6, res7);
+    SRARI_H4_UH(res0, res1, res2, res3, 7); /* presumably a rounding right shift by 7 -- TODO confirm */
+    SRARI_H4_UH(res4, res5, res6, res7, 7);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); /* current destination rows */
+    PCKEV_AVG_ST_UB(res1, res0, dst0, dst); /* presumably packs, averages with dst0 and stores a row */
+    dst += dst_stride;
+    PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
+    dst += dst_stride;
+    PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
+    dst += dst_stride;
+    PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
+    dst += dst_stride;
+    /* NOTE(review): loop_cnt is unsigned, so height < 4 would wrap this count; assumes height >= 4. */
+    for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
+                    res1, res2, res3);
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4,
+                    res5, res6, res7);
+        SRARI_H4_UH(res0, res1, res2, res3, 7);
+        SRARI_H4_UH(res4, res5, res6, res7, 7);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
+        dst += dst_stride;
+        PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
+        dst += dst_stride;
+    }
+}
+
+void ff_avg_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{   /* 32-wide horizontal 2-tap filter (taps chosen by mx; my unused), combined with dst. */
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt0, dst0, dst1, dst2, dst3;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]); /* control vector for the VSHF shuffles below */
+
+    /* Broadcast halfword 0 of the filter (its first two int8 taps) to every lane. */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+    /* Two rows per pass, so height >> 1 passes. */
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        src0 = LD_SB(src);
+        src2 = LD_SB(src + 16);
+        src3 = LD_SB(src + 24); /* NOTE(review): presumably a 16-byte load, i.e. reads up to src + 39 */
+        src1 = __msa_sldi_b(src2, src0, 8); /* derived from src0/src2 rather than loaded from src + 8 */
+        src += src_stride;
+        src4 = LD_SB(src);
+        src6 = LD_SB(src + 16);
+        src7 = LD_SB(src + 24);
+        src5 = __msa_sldi_b(src6, src4, 8);
+        src += src_stride;
+
+        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    res0, res1, res2, res3);
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    res4, res5, res6, res7);
+        SRARI_H4_UH(res0, res1, res2, res3, 7); /* presumably a rounding right shift by 7 -- TODO confirm */
+        SRARI_H4_UH(res4, res5, res6, res7, 7);
+        LD_UB2(dst, 16, dst0, dst1); /* first row of the pair: dst columns 0..15 and 16..31 */
+        PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
+        PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
+        dst += dst_stride;
+        LD_UB2(dst, 16, dst2, dst3); /* second row of the pair */
+        PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
+        PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16));
+        dst += dst_stride;
+    }
+}
+
+void ff_avg_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{   /* 64-wide horizontal 2-tap filter (taps chosen by mx; my unused), combined with dst. */
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt0, dst0, dst1, dst2, dst3;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+    mask = LD_SB(&mc_filt_mask_arr[0]); /* control vector for the VSHF shuffles below */
+
+    /* Broadcast halfword 0 of the filter (its first two int8 taps) to every lane. */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+    /* One row per pass. */
+    for (loop_cnt = height; loop_cnt--;) {
+        LD_SB4(src, 16, src0, src2, src4, src6); /* four vectors 16 bytes apart along the row */
+        src7 = LD_SB(src + 56); /* NOTE(review): presumably a 16-byte load, i.e. reads up to src + 71 */
+        SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); /* src1/3/5 derived, not loaded */
+        src += src_stride;
+
+        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    out4, out5, out6, out7);
+        SRARI_H4_UH(out0, out1, out2, out3, 7); /* presumably a rounding right shift by 7 -- TODO confirm */
+        SRARI_H4_UH(out4, out5, out6, out7, 7);
+        LD_UB4(dst, 16, dst0, dst1, dst2, dst3); /* the four 16-byte quarters of the dst row */
+        PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
+        PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
+        PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
+        PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48);
+        dst += dst_stride;
+    }
+}
+
+static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter)
+{   /* 4x4 block: 2-tap vertical filter, result averaged with the rows already in dst. */
+    v16i8 src0, src1, src2, src3, src4;
+    v16u8 dst0, dst1, dst2, dst3, out, filt0, src2110, src4332;
+    v16i8 src10_r, src32_r, src21_r, src43_r;
+    v8i16 filt;
+    v8u16 tmp0, tmp1;
+    /* Broadcast halfword 0 of the filter (its first two int8 taps) to every lane. */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    src4 = LD_SB(src); /* fifth row: lower neighbour of output row 3 */
+    src += src_stride; /* src is not read again after this point */
+
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); /* current destination rows */
+    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+    dst0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); /* all four dst rows now sit in dst0 */
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r); /* srcXY_r: bytes of rows X and Y interleaved */
+    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, 7); /* presumably a rounding right shift by 7 -- TODO confirm */
+    SAT_UH2_UH(tmp0, tmp1, 7);
+
+    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); /* back to one byte per pixel */
+    out = __msa_aver_u_b(out, dst0); /* unsigned byte average with the old dst pixels */
+
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst, int32_t dst_stride,
+                                              const int8_t *filter)
+{   /* 4x8 block: 2-tap vertical filter, result combined with the rows already in dst. */
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16u8 src2110, src4332, src6554, src8776, filt0;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+    /* Broadcast halfword 0 of the filter (its first two int8 taps) to every lane. */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    src8 = LD_SB(src); /* ninth row: lower neighbour of output row 7 */
+
+    LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); /* current destination rows */
+    ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1,
+               dst2, dst3);
+    ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1); /* dst rows 0..3 in dst0, rows 4..7 in dst1 */
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r); /* srcXY_r: bytes of rows X and Y interleaved */
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+               src76_r, src87_r);
+    ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+               src87_r, src76_r, src2110, src4332, src6554, src8776);
+    DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); /* presumably a rounding right shift by 7 -- TODO confirm */
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); /* src2110 / src4332 reused as result vectors */
+    AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332); /* presumably a per-byte average with dst */
+    ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); /* rows 0..3 */
+    dst += (4 * dst_stride);
+    ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst, dst_stride); /* rows 4..7 */
+}
+
+void ff_avg_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                         const uint8_t *src, ptrdiff_t src_stride,
+                         int height, int mx, int my)
+{
+    /* 4-wide vertical avg; taps chosen by my, mx unused; no-op unless height is 4 or 8. */
+    const int8_t *taps = vp9_bilinear_filters_msa[my - 1];
+    if (height == 4) {
+        common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, taps);
+        return;
+    }
+    if (height == 8) {
+        common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, taps);
+    }
+}
+
+static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter)
+{   /* 8x4 block: 2-tap vertical filter, result combined with the rows already in dst. */
+    v16u8 src0, src1, src2, src3, src4;
+    v16u8 dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    /* Broadcast halfword 0 of the filter (its first two int8 taps) to every lane. */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_UB5(src, src_stride, src0, src1, src2, src3, src4); /* 5 rows in: each output row uses rows i, i + 1 */
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);       /* current destination rows */
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); /* vertically adjacent rows interleaved byte-wise */
+    ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); /* presumably a rounding right shift by 7 -- TODO confirm */
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
+                       dst, dst_stride); /* presumably packs, averages with dst and stores 4 rows */
+}
+
+static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  const int8_t *filter,
+                                                  int32_t height)
+{   /* 8-wide 2-tap vertical filter combined with dst; each loop pass emits 8 rows. */
+    uint32_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    /* Broadcast halfword 0 of the filter (its first two int8 taps) to every lane. */
+    filt = LD_SH(filter);
+    filt0 = (v16u8) __msa_splati_h(filt, 0);
+
+    src0 = LD_UB(src); /* row above the first output row */
+    src += src_stride;
+    /* height >> 3 passes: rows past the last multiple of 8 are never written. */
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
+        src += (8 * src_stride);
+        LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8); /* all 8 dst rows read up front */
+
+        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+                   vec0, vec1, vec2, vec3); /* vertically adjacent rows interleaved byte-wise */
+        ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   vec4, vec5, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); /* presumably a rounding right shift by 7 -- TODO confirm */
+        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3,
+                           dst4, dst, dst_stride); /* first four output rows of this pass */
+        dst += (4 * dst_stride);
+
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+        PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3,
+                           dst8, dst, dst_stride); /* last four output rows of this pass */
+        dst += (4 * dst_stride);
+
+        src0 = src8; /* last loaded row becomes the "row above" for the next pass */
+    }
+}
+
+void ff_avg_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                         const uint8_t *src, ptrdiff_t src_stride,
+                         int height, int mx, int my)
+{
+    /* 8-wide vertical avg; taps chosen by my, mx unused; height 4 has its own kernel. */
+    const int8_t *taps = vp9_bilinear_filters_msa[my - 1];
+
+    if (height != 4) {
+        common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
+                                              taps, height);
+        return;
+    }
+    common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, taps);
+}
+
+void ff_avg_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{   /* 16-wide vertical 2-tap filter (taps chosen by my; mx unused), combined with dst. */
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
+    v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 tmp0, tmp1, tmp2, tmp3, filt;
+
+    /* Broadcast halfword 0 of the filter (its first two int8 taps) to every lane. */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    src0 = LD_UB(src); /* row above the first output row */
+    src += src_stride;
+    /* Four output rows per pass, so height >> 2 passes. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); /* current destination rows */
+        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); /* presumably the low 8 columns of each row pair */
+        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); /* presumably the high 8 columns of each row pair */
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7); /* presumably a rounding right shift by 7 -- TODO confirm */
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); /* presumably packs, averages with dst0 and stores a row */
+        dst += dst_stride;
+
+        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); /* set up rows 2 and 3 of this pass */
+        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); /* vec2/vec3 were built at the top of the pass */
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst);
+        dst += dst_stride;
+
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
+        dst += dst_stride;
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst);
+        dst += dst_stride;
+
+        src0 = src4; /* last loaded row becomes the "row above" for the next pass */
+    }
+}
+/*
+ * Vertical 2-tap bilinear filter for a 32-pixel-wide block, processed as two
+ * 16-byte column halves. Each filtered row is handed to PCKEV_AVG_ST_UB
+ * together with the matching row loaded from dst (judging by the helper's
+ * name: pack, average with dst, store).
+ *
+ * dst, dst_stride: destination block (read and written) and its row pitch.
+ * src, src_stride: source block and its row pitch; height + 1 rows are read.
+ * height:          output rows; processed four per iteration (height >> 2).
+ * mx:              not used by this function.
+ * my:              selects the filter, vp9_bilinear_filters_msa[my - 1].
+ */
+void ff_avg_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+    v8u16 tmp0, tmp1, tmp2, tmp3, filt;
+
+    /* rearranging filter_y: replicate halfword 0 of the loaded filter (its
+     * first two 8-bit taps) into every halfword lane */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_UB2(src, 16, src0, src5); /* left and right halves of the first row */
+    src += src_stride;
+    /* four output rows (both halves) per iteration */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+        /* right half: source rows into src6..src9, dst rows into dst4..dst7 */
+        LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
+        LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7);
+        src += (4 * src_stride);
+        /* left half, output row 0 */
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7); /* presumably a rounding shift right by 7 */
+        SAT_UH2_UH(tmp0, tmp1, 7); /* presumably saturates to the 8-bit range */
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+        /* left half, output row 1 */
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
+        /* left half, output rows 2 and 3 */
+        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride);
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride);
+        /* right half (stored at dst + 16), output rows 0 and 1 */
+        ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
+        ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16);
+
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride);
+        /* right half, output rows 2 and 3 */
+        ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
+        ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride);
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride);
+        dst += (4 * dst_stride);
+        /* the last loaded row of each half tops the next group */
+        src0 = src4;
+        src5 = src9;
+    }
+}
+/*
+ * Vertical 2-tap bilinear filter for a 64-pixel-wide block, processed as four
+ * 16-byte column strips. Each filtered row is handed to PCKEV_AVG_ST_UB
+ * together with the matching row loaded from dst (judging by the helper's
+ * name: pack, average with dst, store).
+ *
+ * dst, dst_stride: destination block (read and written) and its row pitch.
+ * src, src_stride: source block and its row pitch; height + 1 rows are read.
+ * height:          output rows; processed two per iteration (height >> 1).
+ * mx:              not used by this function.
+ * my:              selects the filter, vp9_bilinear_filters_msa[my - 1].
+ */
+void ff_avg_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
+    v16u8 src0, src1, src2, src3, src4, src5;
+    v16u8 src6, src7, src8, src9, src10, src11, filt0;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    v8u16 filt;
+
+    /* rearranging filter_y: replicate halfword 0 of the loaded filter (its
+     * first two 8-bit taps) into every halfword lane */
+    filt = LD_UH(filter);
+    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_UB4(src, 16, src0, src3, src6, src9); /* the four strips of row 0 */
+    src += src_stride;
+    /* two output rows (all four strips) per iteration */
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        LD_UB2(src, src_stride, src1, src2);
+        LD_UB2(dst, dst_stride, dst0, dst1);
+        LD_UB2(src + 16, src_stride, src4, src5);
+        LD_UB2(dst + 16, dst_stride, dst2, dst3);
+        LD_UB2(src + 32, src_stride, src7, src8);
+        LD_UB2(dst + 32, dst_stride, dst4, dst5);
+        LD_UB2(src + 48, src_stride, src10, src11);
+        LD_UB2(dst + 48, dst_stride, dst6, dst7);
+        src += (2 * src_stride);
+        /* strip 0 (columns 0..15) */
+        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7); /* presumably a rounding shift right by 7 */
+        SAT_UH2_UH(tmp0, tmp1, 7); /* presumably saturates to the 8-bit range */
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
+        /* strip 1 (columns 16..31) */
+        ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
+        ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+        SRARI_H2_UH(tmp4, tmp5, 7);
+        SAT_UH2_UH(tmp4, tmp5, 7);
+        PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16);
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+        SRARI_H2_UH(tmp6, tmp7, 7);
+        SAT_UH2_UH(tmp6, tmp7, 7);
+        PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride);
+        /* strip 2 (columns 32..47) */
+        ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
+        ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32);
+
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride);
+        /* strip 3 (columns 48..63) */
+        ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
+        ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+        SRARI_H2_UH(tmp4, tmp5, 7);
+        SAT_UH2_UH(tmp4, tmp5, 7);
+        PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48));
+
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+        SRARI_H2_UH(tmp6, tmp7, 7);
+        SAT_UH2_UH(tmp6, tmp7, 7);
+        PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride);
+        dst += (2 * dst_stride);
+        /* the second loaded row of each strip tops the next pair */
+        src0 = src2;
+        src3 = src5;
+        src6 = src8;
+        src9 = src11;
+    }
+}
+/*
+ * 2-tap horizontal followed by 2-tap vertical filtering of a 4x4 block; the
+ * result is averaged with the current dst contents (AVER_UB2_UB) and stored.
+ * Five source rows are read because the vertical pass needs one extra row.
+ *
+ * filter_horiz / filter_vert: filter coefficient arrays; only halfword 0
+ * (the first two 8-bit taps) of each is used.
+ */
+static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   const int8_t *filter_horiz,
+                                                   const int8_t *filter_vert)
+{
+    v16i8 src0, src1, src2, src3, src4, mask;
+    v16u8 filt_hz, filt_vt, vec0, vec1;
+    v16u8 dst0, dst1, dst2, dst3, res0, res1;
+    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;
+    /* shuffle mask at offset 16 of mc_filt_mask_arr (table defined elsewhere) */
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter: replicate halfword 0 of each filter to all lanes */
+    filt = LD_UH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    filt = LD_UH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    /* horizontal pass, two rows per call; presumably the first row lands in
+     * the low half and the second in the high half of each result */
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
+    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+    hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8); /* high half of hz_out0 + low half of hz_out2 */
+    hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2); /* high halves of hz_out2 and hz_out4 */
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    /* vertical pass, then average with the existing dst rows */
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, 7); /* presumably a rounding shift right by 7 */
+    SAT_UH2_UH(tmp0, tmp1, 7); /* presumably saturates to the 8-bit range */
+    PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+/*
+ * 2-tap horizontal followed by 2-tap vertical filtering of a 4x8 block; the
+ * result is averaged with the current dst contents (AVER_UB4_UB) and stored
+ * as two 4x4 groups. Nine source rows are read because the vertical pass
+ * needs one extra row.
+ *
+ * filter_horiz / filter_vert: filter coefficient arrays; only halfword 0
+ * (the first two 8-bit taps) of each is used.
+ */
+static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   const int8_t *filter_horiz,
+                                                   const int8_t *filter_vert)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+    /* shuffle mask at offset 16 of mc_filt_mask_arr (table defined elsewhere) */
+    mask = LD_SB(&mc_filt_mask_arr[16]);
+
+    /* rearranging filter: replicate halfword 0 of each filter to all lanes */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    src8 = LD_SB(src); /* ninth row, needed by the last output row */
+    /* horizontal pass, two rows per call; presumably the first row lands in
+     * the low half and the second in the high half of each result */
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
+    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
+    hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
+    hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
+    SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
+               hz_out3, hz_out5, 8);
+    hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6); /* high halves of hz_out6 and hz_out8 */
+    /* vertical pass, then average with the existing dst rows */
+    LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+    ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2,
+               dst4, dst6);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); /* presumably a rounding shift right by 7 */
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); /* presumably saturates to the 8-bit range */
+    PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1,
+                res2, res3);
+    AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1,
+                res2, res3);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+/*
+ * 4-pixel-wide horizontal + vertical bilinear prediction, averaged into dst.
+ *
+ * mx and my pick the horizontal and vertical filters from
+ * vp9_bilinear_filters_msa (index mx - 1 / my - 1). Only heights 4 and 8 are
+ * dispatched; any other height leaves dst untouched.
+ */
+void ff_avg_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    const int8_t *h_taps = vp9_bilinear_filters_msa[mx - 1];
+    const int8_t *v_taps = vp9_bilinear_filters_msa[my - 1];
+
+    switch (height) {
+    case 4:
+        common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
+                                               h_taps, v_taps);
+        break;
+    case 8:
+        common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
+                                               h_taps, v_taps);
+        break;
+    default:
+        /* no kernel for other heights */
+        break;
+    }
+}
+
+/*
+ * 2-tap horizontal followed by 2-tap vertical filtering of an 8x4 block; the
+ * four filtered rows are passed to PCKEV_AVG_ST8x4_UB together with the rows
+ * previously loaded from dst (judging by the helper's name: pack, average
+ * with dst, store). Five source rows are read because the vertical pass
+ * needs one extra row.
+ *
+ * filter_horiz / filter_vert: filter coefficient arrays; only halfword 0
+ * (the first two 8-bit taps) of each is used.
+ */
+static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   const int8_t *filter_horiz,
+                                                   const int8_t *filter_vert)
+{
+    v16i8 src0, src1, src2, src3, src4, mask;
+    v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
+    v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    /* shuffle mask at offset 0 of mc_filt_mask_arr (table defined elsewhere) */
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter: replicate halfword 0 of each filter to all lanes */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+
+    /* all five rows are loaded up front; src is not needed afterwards, so
+     * the pointer is not advanced */
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+    /* hz_out0 and hz_out1 alternate as "previous" and "current" row of the
+     * horizontal pass; each adjacent pair yields one output row */
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+    vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+    vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+    tmp1 = __msa_dotp_u_h(vec1, filt_vt);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+    vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+    tmp2 = __msa_dotp_u_h(vec2, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+    vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+    tmp3 = __msa_dotp_u_h(vec3, filt_vt);
+
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); /* presumably a rounding shift right by 7 */
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); /* presumably saturates to the 8-bit range */
+    PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
+                       dst, dst_stride);
+}
+/*
+ * 2-tap horizontal followed by 2-tap vertical filtering of an 8-pixel-wide
+ * block; filtered rows are passed to PCKEV_AVG_ST8x4_UB together with the
+ * rows loaded from dst (judging by the helper's name: pack, average with
+ * dst, store).
+ *
+ * height: output rows; processed four per iteration (height >> 2), and
+ *         height + 1 source rows are read.
+ * filter_horiz / filter_vert: filter coefficient arrays; only halfword 0
+ * (the first two 8-bit taps) of each is used.
+ */
+static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride,
+                                                       const int8_t *filter_horiz,
+                                                       const int8_t *filter_vert,
+                                                       int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, mask;
+    v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3;
+    v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+    /* shuffle mask at offset 0 of mc_filt_mask_arr (table defined elsewhere) */
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter: replicate halfword 0 of each filter to all lanes */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+
+    src0 = LD_SB(src);
+    src += src_stride;
+    /* horizontal result of the first row; carried across loop iterations */
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+        /* hz_out0 and hz_out1 alternate as previous/current filtered row */
+        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+
+        SRARI_H2_UH(tmp0, tmp1, 7); /* presumably a rounding shift right by 7 */
+        SAT_UH2_UH(tmp0, tmp1, 7); /* presumably saturates to the 8-bit range */
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
+        tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+        /* after this step hz_out0 holds the row that starts the next group */
+        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+        vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
+        tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+
+        SRARI_H2_UH(tmp2, tmp3, 7);
+        SAT_UH2_UH(tmp2, tmp3, 7);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3,
+                           dst3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+/*
+ * 8-pixel-wide horizontal + vertical bilinear prediction, averaged into dst.
+ *
+ * mx and my pick the horizontal and vertical filters from
+ * vp9_bilinear_filters_msa (index mx - 1 / my - 1). Height 4 has a dedicated
+ * kernel; every other height goes to the looping 8x8mult kernel.
+ */
+void ff_avg_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          int height, int mx, int my)
+{
+    const int8_t *h_taps = vp9_bilinear_filters_msa[mx - 1];
+    const int8_t *v_taps = vp9_bilinear_filters_msa[my - 1];
+
+    if (height != 4) {
+        common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(src, src_stride,
+                                                   dst, dst_stride,
+                                                   h_taps, v_taps,
+                                                   height);
+        return;
+    }
+
+    common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
+                                           h_taps, v_taps);
+}
+/*
+ * 2-tap horizontal followed by 2-tap vertical bilinear filtering of a
+ * 16-pixel-wide block. Each filtered row is handed to PCKEV_AVG_ST_UB
+ * together with the row loaded from dst (judging by the helper's name: pack,
+ * average with dst, store).
+ *
+ * dst, dst_stride: destination block (read and written) and its row pitch.
+ * src, src_stride: source block and its row pitch; height + 1 rows are read.
+ * height:          output rows; processed four per iteration (height >> 2).
+ * mx, my:          select the horizontal / vertical filter,
+ *                  vp9_bilinear_filters_msa[mx - 1] and [my - 1].
+ */
+void ff_avg_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                           const uint8_t *src, ptrdiff_t src_stride,
+                           int height, int mx, int my)
+{
+    uint32_t loop_cnt;
+    const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
+    const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
+    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
+    v8i16 filt;
+    /* shuffle mask at offset 0 of mc_filt_mask_arr (table defined elsewhere) */
+    mask = LD_SB(&mc_filt_mask_arr[0]);
+
+    /* rearranging filter: replicate halfword 0 of each filter to all lanes */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8) __msa_splati_h(filt, 0);
+
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8) __msa_splati_h(filt, 0);
+
+    LD_SB2(src, 8, src0, src1); /* two loads 8 bytes apart, one per row half */
+    src += src_stride;
+    /* horizontal results of the first row: hz_out0 from src, hz_out2 from src + 8 */
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+        /* hz_out0/2 and hz_out1/3 alternate as previous and current row; the
+         * older row is always passed first to ILVEV_B2_UB */
+        hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
+        hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7); /* presumably a rounding shift right by 7 */
+        SAT_UH2_UH(tmp0, tmp1, 7); /* presumably saturates to the 8-bit range */
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+        dst += dst_stride;
+        /* output row 1 */
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
+        hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
+        dst += dst_stride;
+        /* output row 2 */
+        hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
+        hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
+        dst += dst_stride;
+        /* output row 3; hz_out0/hz_out2 are left holding the row that starts
+         * the next group */
+        hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7);
+        hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7);
+        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, 7);
+        SAT_UH2_UH(tmp0, tmp1, 7);
+        PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
+        dst += dst_stride;
+    }
+}
+
+/*
+ * 32-pixel-wide horizontal + vertical bilinear prediction, averaged into dst.
+ * Implemented as two side-by-side runs of the 16-wide kernel, left column
+ * first.
+ */
+void ff_avg_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                           const uint8_t *src, ptrdiff_t src_stride,
+                           int height, int mx, int my)
+{
+    int32_t col;
+
+    for (col = 0; col < 32; col += 16) {
+        ff_avg_bilin_16hv_msa(dst + col, dst_stride, src + col, src_stride,
+                              height, mx, my);
+    }
+}
+
+/*
+ * 64-pixel-wide horizontal + vertical bilinear prediction, averaged into dst.
+ * Implemented as four side-by-side runs of the 16-wide kernel, leftmost
+ * column first.
+ */
+void ff_avg_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
+                           const uint8_t *src, ptrdiff_t src_stride,
+                           int height, int mx, int my)
+{
+    int32_t col;
+
+    for (col = 0; col < 64; col += 16) {
+        ff_avg_bilin_16hv_msa(dst + col, dst_stride, src + col, src_stride,
+                              height, mx, my);
+    }
+}
+
+/*
+ * Copy an 8-byte-wide block from src to dst, row by row.
+ *
+ * The row count picks the unrolling: multiples of 12 are copied 8 + 4 rows at
+ * a time, then multiples of 8, 4 and 2 in that order of preference. Other
+ * heights copy nothing. Each row is loaded as a full vector and only its low
+ * doubleword (element 0) is stored.
+ */
+static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            int32_t height)
+{
+    int32_t groups;
+    uint64_t d0, d1, d2, d3, d4, d5, d6, d7;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+
+    if (height % 12 == 0) {
+        groups = (height / 12);
+        while (groups--) {
+            /* first eight rows of the group */
+            LD_UB8(src, src_stride,
+                   row0, row1, row2, row3, row4, row5, row6, row7);
+            src += (8 * src_stride);
+
+            d0 = __msa_copy_u_d((v2i64) row0, 0);
+            d1 = __msa_copy_u_d((v2i64) row1, 0);
+            d2 = __msa_copy_u_d((v2i64) row2, 0);
+            d3 = __msa_copy_u_d((v2i64) row3, 0);
+            d4 = __msa_copy_u_d((v2i64) row4, 0);
+            d5 = __msa_copy_u_d((v2i64) row5, 0);
+            d6 = __msa_copy_u_d((v2i64) row6, 0);
+            d7 = __msa_copy_u_d((v2i64) row7, 0);
+
+            SD4(d0, d1, d2, d3, dst, dst_stride);
+            dst += (4 * dst_stride);
+            SD4(d4, d5, d6, d7, dst, dst_stride);
+            dst += (4 * dst_stride);
+
+            /* remaining four rows of the group */
+            LD_UB4(src, src_stride, row0, row1, row2, row3);
+            src += (4 * src_stride);
+
+            d0 = __msa_copy_u_d((v2i64) row0, 0);
+            d1 = __msa_copy_u_d((v2i64) row1, 0);
+            d2 = __msa_copy_u_d((v2i64) row2, 0);
+            d3 = __msa_copy_u_d((v2i64) row3, 0);
+
+            SD4(d0, d1, d2, d3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+        return;
+    }
+
+    if (height % 8 == 0) {
+        groups = height >> 3;
+        while (groups--) {
+            LD_UB8(src, src_stride,
+                   row0, row1, row2, row3, row4, row5, row6, row7);
+            src += (8 * src_stride);
+
+            d0 = __msa_copy_u_d((v2i64) row0, 0);
+            d1 = __msa_copy_u_d((v2i64) row1, 0);
+            d2 = __msa_copy_u_d((v2i64) row2, 0);
+            d3 = __msa_copy_u_d((v2i64) row3, 0);
+            d4 = __msa_copy_u_d((v2i64) row4, 0);
+            d5 = __msa_copy_u_d((v2i64) row5, 0);
+            d6 = __msa_copy_u_d((v2i64) row6, 0);
+            d7 = __msa_copy_u_d((v2i64) row7, 0);
+
+            SD4(d0, d1, d2, d3, dst, dst_stride);
+            dst += (4 * dst_stride);
+            SD4(d4, d5, d6, d7, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+        return;
+    }
+
+    if (height % 4 == 0) {
+        groups = (height / 4);
+        while (groups--) {
+            LD_UB4(src, src_stride, row0, row1, row2, row3);
+            src += (4 * src_stride);
+            d0 = __msa_copy_u_d((v2i64) row0, 0);
+            d1 = __msa_copy_u_d((v2i64) row1, 0);
+            d2 = __msa_copy_u_d((v2i64) row2, 0);
+            d3 = __msa_copy_u_d((v2i64) row3, 0);
+
+            SD4(d0, d1, d2, d3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+        return;
+    }
+
+    if (height % 2 != 0) {
+        return;
+    }
+
+    groups = (height / 2);
+    while (groups--) {
+        LD_UB2(src, src_stride, row0, row1);
+        src += (2 * src_stride);
+        d0 = __msa_copy_u_d((v2i64) row0, 0);
+        d1 = __msa_copy_u_d((v2i64) row1, 0);
+
+        SD(d0, dst);
+        dst += dst_stride;
+        SD(d1, dst);
+        dst += dst_stride;
+    }
+}
+
+/*
+ * Copy a block whose width is a multiple of 16 and whose height is a
+ * multiple of 8. The block is walked one 16-byte column strip at a time
+ * (width >> 4 strips), and each strip is copied eight rows per step
+ * (height >> 3 steps).
+ */
+static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  int32_t height, int32_t width)
+{
+    int32_t strips_left, steps_left;
+    const uint8_t *src_col;
+    uint8_t *dst_col;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+
+    strips_left = (width >> 4);
+    while (strips_left--) {
+        src_col = src;
+        dst_col = dst;
+
+        steps_left = (height >> 3);
+        while (steps_left--) {
+            LD_UB8(src_col, src_stride,
+                   row0, row1, row2, row3, row4, row5, row6, row7);
+            src_col += (8 * src_stride);
+
+            ST_UB8(row0, row1, row2, row3, row4, row5, row6, row7,
+                   dst_col, dst_stride);
+            dst_col += (8 * dst_stride);
+        }
+
+        /* move on to the next 16-byte strip */
+        src += 16;
+        dst += 16;
+    }
+}
+
+/*
+ * Copy a 16-byte-wide block from src to dst.
+ *
+ * Heights that are multiples of 12 are copied 8 + 4 rows at a time,
+ * multiples of 8 are delegated to copy_16multx8mult_msa, and multiples of 4
+ * are copied four rows at a time. Other heights copy nothing.
+ */
+static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    int32_t groups;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+
+    if (height % 12 == 0) {
+        groups = (height / 12);
+        while (groups--) {
+            LD_UB8(src, src_stride,
+                   row0, row1, row2, row3, row4, row5, row6, row7);
+            src += (8 * src_stride);
+            ST_UB8(row0, row1, row2, row3, row4, row5, row6, row7,
+                   dst, dst_stride);
+            dst += (8 * dst_stride);
+
+            LD_UB4(src, src_stride, row0, row1, row2, row3);
+            src += (4 * src_stride);
+            ST_UB4(row0, row1, row2, row3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+        return;
+    }
+
+    if (height % 8 == 0) {
+        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+        return;
+    }
+
+    if (height % 4 != 0) {
+        return;
+    }
+
+    groups = (height >> 2);
+    while (groups--) {
+        LD_UB4(src, src_stride, row0, row1, row2, row3);
+        src += (4 * src_stride);
+
+        ST_UB4(row0, row1, row2, row3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+/*
+ * Copy a 32-byte-wide block from src to dst as two 16-byte halves per row.
+ *
+ * Heights that are multiples of 12 are copied in three 4-row steps per
+ * group, multiples of 8 are delegated to copy_16multx8mult_msa, and
+ * multiples of 4 are copied four rows at a time. Other heights copy nothing.
+ */
+static void copy_width32_msa(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    int32_t groups;
+    v16u8 lft0, lft1, lft2, lft3, rgt0, rgt1, rgt2, rgt3;
+
+    if (height % 12 == 0) {
+        groups = (height / 12);
+        while (groups--) {
+            /* rows 0..3 of the group */
+            LD_UB4(src, src_stride, lft0, lft1, lft2, lft3);
+            LD_UB4(src + 16, src_stride, rgt0, rgt1, rgt2, rgt3);
+            src += (4 * src_stride);
+            ST_UB4(lft0, lft1, lft2, lft3, dst, dst_stride);
+            ST_UB4(rgt0, rgt1, rgt2, rgt3, dst + 16, dst_stride);
+            dst += (4 * dst_stride);
+
+            /* rows 4..7 of the group */
+            LD_UB4(src, src_stride, lft0, lft1, lft2, lft3);
+            LD_UB4(src + 16, src_stride, rgt0, rgt1, rgt2, rgt3);
+            src += (4 * src_stride);
+            ST_UB4(lft0, lft1, lft2, lft3, dst, dst_stride);
+            ST_UB4(rgt0, rgt1, rgt2, rgt3, dst + 16, dst_stride);
+            dst += (4 * dst_stride);
+
+            /* rows 8..11 of the group */
+            LD_UB4(src, src_stride, lft0, lft1, lft2, lft3);
+            LD_UB4(src + 16, src_stride, rgt0, rgt1, rgt2, rgt3);
+            src += (4 * src_stride);
+            ST_UB4(lft0, lft1, lft2, lft3, dst, dst_stride);
+            ST_UB4(rgt0, rgt1, rgt2, rgt3, dst + 16, dst_stride);
+            dst += (4 * dst_stride);
+        }
+        return;
+    }
+
+    if (height % 8 == 0) {
+        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
+        return;
+    }
+
+    if (height % 4 != 0) {
+        return;
+    }
+
+    groups = (height >> 2);
+    while (groups--) {
+        LD_UB4(src, src_stride, lft0, lft1, lft2, lft3);
+        LD_UB4(src + 16, src_stride, rgt0, rgt1, rgt2, rgt3);
+        src += (4 * src_stride);
+        ST_UB4(lft0, lft1, lft2, lft3, dst, dst_stride);
+        ST_UB4(rgt0, rgt1, rgt2, rgt3, dst + 16, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+/*
+ * Copy a 64-byte-wide block from src to dst by delegating to
+ * copy_16multx8mult_msa with width 64. That helper copies height >> 3 groups
+ * of eight rows, so rows beyond a multiple of 8 are not copied.
+ */
+static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
+}
+
+/*
+ * Average a 4-byte-wide source block into dst: every row of dst is replaced
+ * by AVER_UB*(src row, dst row), and only word 0 of each result is stored.
+ *
+ * Heights that are multiples of 4 are handled four rows at a time, other
+ * even heights two rows at a time. Odd heights do nothing.
+ */
+static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride,
+                           int32_t height)
+{
+    int32_t groups;
+    uint32_t w0, w1, w2, w3;
+    v16u8 in0, in1, in2, in3;
+    v16u8 cur0, cur1, cur2, cur3;
+
+    if ((height % 4) == 0) {
+        groups = (height / 4);
+        while (groups--) {
+            LD_UB4(src, src_stride, in0, in1, in2, in3);
+            src += (4 * src_stride);
+
+            LD_UB4(dst, dst_stride, cur0, cur1, cur2, cur3);
+
+            AVER_UB4_UB(in0, cur0, in1, cur1, in2, cur2, in3, cur3,
+                        cur0, cur1, cur2, cur3);
+
+            w0 = __msa_copy_u_w((v4i32) cur0, 0);
+            w1 = __msa_copy_u_w((v4i32) cur1, 0);
+            w2 = __msa_copy_u_w((v4i32) cur2, 0);
+            w3 = __msa_copy_u_w((v4i32) cur3, 0);
+            SW4(w0, w1, w2, w3, dst, dst_stride);
+            dst += (4 * dst_stride);
+        }
+        return;
+    }
+
+    if ((height % 2) != 0) {
+        return;
+    }
+
+    groups = (height / 2);
+    while (groups--) {
+        LD_UB2(src, src_stride, in0, in1);
+        src += (2 * src_stride);
+
+        LD_UB2(dst, dst_stride, cur0, cur1);
+
+        AVER_UB2_UB(in0, cur0, in1, cur1, cur0, cur1);
+
+        w0 = __msa_copy_u_w((v4i32) cur0, 0);
+        w1 = __msa_copy_u_w((v4i32) cur1, 0);
+        SW(w0, dst);
+        dst += dst_stride;
+        SW(w1, dst);
+        dst += dst_stride;
+    }
+}
+
+/*
+ * Average an 8-byte-wide source block into dst, four rows per step
+ * (height / 4 steps). Each dst row is replaced by AVER_UB4_UB(src row,
+ * dst row); only doubleword 0 of each result is stored.
+ */
+static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride,
+                           int32_t height)
+{
+    int32_t groups = (height / 4);
+    uint64_t d0, d1, d2, d3;
+    v16u8 in0, in1, in2, in3;
+    v16u8 cur0, cur1, cur2, cur3;
+
+    while (groups--) {
+        LD_UB4(src, src_stride, in0, in1, in2, in3);
+        src += (4 * src_stride);
+        LD_UB4(dst, dst_stride, cur0, cur1, cur2, cur3);
+
+        AVER_UB4_UB(in0, cur0, in1, cur1, in2, cur2, in3, cur3,
+                    cur0, cur1, cur2, cur3);
+
+        d0 = __msa_copy_u_d((v2i64) cur0, 0);
+        d1 = __msa_copy_u_d((v2i64) cur1, 0);
+        d2 = __msa_copy_u_d((v2i64) cur2, 0);
+        d3 = __msa_copy_u_d((v2i64) cur3, 0);
+        SD4(d0, d1, d2, d3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+/*
+ * Average a 16-byte-wide source block into dst, eight rows per step
+ * (height / 8 steps). Each dst row is replaced by AVER_UB4_UB(src row,
+ * dst row).
+ */
+static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            int32_t height)
+{
+    int32_t groups = (height / 8);
+    v16u8 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16u8 cur0, cur1, cur2, cur3, cur4, cur5, cur6, cur7;
+
+    while (groups--) {
+        LD_UB8(src, src_stride, in0, in1, in2, in3, in4, in5, in6, in7);
+        src += (8 * src_stride);
+        LD_UB8(dst, dst_stride,
+               cur0, cur1, cur2, cur3, cur4, cur5, cur6, cur7);
+
+        /* rows 0..3, then rows 4..7 */
+        AVER_UB4_UB(in0, cur0, in1, cur1, in2, cur2, in3, cur3,
+                    cur0, cur1, cur2, cur3);
+        AVER_UB4_UB(in4, cur4, in5, cur5, in6, cur6, in7, cur7,
+                    cur4, cur5, cur6, cur7);
+        ST_UB8(cur0, cur1, cur2, cur3, cur4, cur5, cur6, cur7,
+               dst, dst_stride);
+        dst += (8 * dst_stride);
+    }
+}
+
+/*
+ * Average a 32-byte-wide source block into dst, eight rows per step
+ * (height / 8 steps), each row handled as a left and a right 16-byte half.
+ *
+ * A separate read cursor walks dst while loading, so all eight rows are
+ * fetched before any of them is overwritten through the write cursor.
+ * Even-numbered vectors hold left halves, odd-numbered ones right halves.
+ */
+static void avg_width32_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            int32_t height)
+{
+    int32_t groups = (height / 8);
+    uint8_t *dst_rd = dst;
+    v16u8 in0, in1, in2, in3, in4, in5, in6, in7;
+    v16u8 in8, in9, in10, in11, in12, in13, in14, in15;
+    v16u8 cur0, cur1, cur2, cur3, cur4, cur5, cur6, cur7;
+    v16u8 cur8, cur9, cur10, cur11, cur12, cur13, cur14, cur15;
+
+    while (groups--) {
+        /* rows 0..3: source and current destination */
+        LD_UB4(src, src_stride, in0, in2, in4, in6);
+        LD_UB4(src + 16, src_stride, in1, in3, in5, in7);
+        src += (4 * src_stride);
+        LD_UB4(dst_rd, dst_stride, cur0, cur2, cur4, cur6);
+        LD_UB4(dst_rd + 16, dst_stride, cur1, cur3, cur5, cur7);
+        dst_rd += (4 * dst_stride);
+
+        /* rows 4..7: source and current destination */
+        LD_UB4(src, src_stride, in8, in10, in12, in14);
+        LD_UB4(src + 16, src_stride, in9, in11, in13, in15);
+        src += (4 * src_stride);
+        LD_UB4(dst_rd, dst_stride, cur8, cur10, cur12, cur14);
+        LD_UB4(dst_rd + 16, dst_stride, cur9, cur11, cur13, cur15);
+        dst_rd += (4 * dst_stride);
+
+        AVER_UB4_UB(in0, cur0, in1, cur1, in2, cur2, in3, cur3,
+                    cur0, cur1, cur2, cur3);
+        AVER_UB4_UB(in4, cur4, in5, cur5, in6, cur6, in7, cur7,
+                    cur4, cur5, cur6, cur7);
+        AVER_UB4_UB(in8, cur8, in9, cur9, in10, cur10, in11, cur11,
+                    cur8, cur9, cur10, cur11);
+        AVER_UB4_UB(in12, cur12, in13, cur13, in14, cur14, in15, cur15,
+                    cur12, cur13, cur14, cur15);
+
+        /* write back rows 0..3, then rows 4..7 */
+        ST_UB4(cur0, cur2, cur4, cur6, dst, dst_stride);
+        ST_UB4(cur1, cur3, cur5, cur7, dst + 16, dst_stride);
+        dst += (4 * dst_stride);
+        ST_UB4(cur8, cur10, cur12, cur14, dst, dst_stride);
+        ST_UB4(cur9, cur11, cur13, cur15, dst + 16, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void avg_width64_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            int32_t height)
+{
+    /* Four rows per pass (a height % 4 remainder is skipped).  Vector names
+     * are <s|d><row><column>; columns a..d are the four 16-byte chunks. */
+    int32_t rows4 = height / 4;
+    uint8_t *dst_rd = dst;
+    v16u8 s0a, s0b, s0c, s0d, s1a, s1b, s1c, s1d;
+    v16u8 s2a, s2b, s2c, s2d, s3a, s3b, s3c, s3d;
+    v16u8 d0a, d0b, d0c, d0d, d1a, d1b, d1c, d1d;
+    v16u8 d2a, d2b, d2c, d2d, d3a, d3b, d3c, d3d;
+
+    while (rows4--) {
+        LD_UB4(src, 16, s0a, s0b, s0c, s0d);
+        src += src_stride;
+        LD_UB4(src, 16, s1a, s1b, s1c, s1d);
+        src += src_stride;
+        LD_UB4(src, 16, s2a, s2b, s2c, s2d);
+        src += src_stride;
+        LD_UB4(src, 16, s3a, s3b, s3c, s3d);
+        src += src_stride;
+
+        LD_UB4(dst_rd, 16, d0a, d0b, d0c, d0d);
+        dst_rd += dst_stride;
+        LD_UB4(dst_rd, 16, d1a, d1b, d1c, d1d);
+        dst_rd += dst_stride;
+        LD_UB4(dst_rd, 16, d2a, d2b, d2c, d2d);
+        dst_rd += dst_stride;
+        LD_UB4(dst_rd, 16, d3a, d3b, d3c, d3d);
+        dst_rd += dst_stride;
+
+        /* results replace the dst vectors in place */
+        AVER_UB4_UB(s0a, d0a, s0b, d0b, s0c, d0c, s0d, d0d, d0a, d0b, d0c, d0d);
+        AVER_UB4_UB(s1a, d1a, s1b, d1b, s1c, d1c, s1d, d1d, d1a, d1b, d1c, d1d);
+        AVER_UB4_UB(s2a, d2a, s2b, d2b, s2c, d2c, s2d, d2d, d2a, d2b, d2c, d2d);
+        AVER_UB4_UB(s3a, d3a, s3b, d3b, s3c, d3c, s3d, d3d, d3a, d3b, d3c, d3d);
+
+        /* dst trails dst_rd: these four rows were all read further up */
+        ST_UB4(d0a, d0b, d0c, d0d, dst, 16);
+        dst += dst_stride;
+        ST_UB4(d1a, d1b, d1c, d1d, dst, 16);
+        dst += dst_stride;
+        ST_UB4(d2a, d2b, d2c, d2d, dst, 16);
+        dst += dst_stride;
+        ST_UB4(d3a, d3b, d3c, d3d, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+static const int8_t vp9_subpel_filters_msa[3][15][8] = {
+    [FILTER_8TAP_REGULAR] = {  /* 15 rows x 8 taps, used as [mx-1] / [my-1] */
+         {0, 1, -5, 126, 8, -3, 1, 0},
+         {-1, 3, -10, 122, 18, -6, 2, 0},
+         {-1, 4, -13, 118, 27, -9, 3, -1},
+         {-1, 4, -16, 112, 37, -11, 4, -1},
+         {-1, 5, -18, 105, 48, -14, 4, -1},
+         {-1, 5, -19, 97, 58, -16, 5, -1},
+         {-1, 6, -19, 88, 68, -18, 5, -1},
+         {-1, 6, -19, 78, 78, -19, 6, -1},
+         {-1, 5, -18, 68, 88, -19, 6, -1},
+         {-1, 5, -16, 58, 97, -19, 5, -1},
+         {-1, 4, -14, 48, 105, -18, 5, -1},
+         {-1, 4, -11, 37, 112, -16, 4, -1},
+         {-1, 3, -9, 27, 118, -13, 4, -1},
+         {0, 2, -6, 18, 122, -10, 3, -1},
+         {0, 1, -3, 8, 126, -5, 1, 0},
+    }, [FILTER_8TAP_SHARP] = {
+        {-1, 3, -7, 127, 8, -3, 1, 0},
+        {-2, 5, -13, 125, 17, -6, 3, -1},
+        {-3, 7, -17, 121, 27, -10, 5, -2},
+        {-4, 9, -20, 115, 37, -13, 6, -2},
+        {-4, 10, -23, 108, 48, -16, 8, -3},
+        {-4, 10, -24, 100, 59, -19, 9, -3},
+        {-4, 11, -24, 90, 70, -21, 10, -4},
+        {-4, 11, -23, 80, 80, -23, 11, -4},
+        {-4, 10, -21, 70, 90, -24, 11, -4},
+        {-3, 9, -19, 59, 100, -24, 10, -4},
+        {-3, 8, -16, 48, 108, -23, 10, -4},
+        {-2, 6, -13, 37, 115, -20, 9, -4},
+        {-2, 5, -10, 27, 121, -17, 7, -3},
+        {-1, 3, -6, 17, 125, -13, 5, -2},
+        {0, 1, -3, 8, 127, -7, 3, -1},
+    }, [FILTER_8TAP_SMOOTH] = {
+        {-3, -1, 32, 64, 38, 1, -3, 0},
+        {-2, -2, 29, 63, 41, 2, -3, 0},
+        {-2, -2, 26, 63, 43, 4, -4, 0},
+        {-2, -3, 24, 62, 46, 5, -4, 0},
+        {-2, -3, 21, 60, 49, 7, -4, 0},
+        {-1, -4, 18, 59, 51, 9, -4, 0},
+        {-1, -4, 16, 57, 53, 12, -4, -1},
+        {-1, -4, 14, 55, 55, 14, -4, -1},
+        {-1, -4, 12, 53, 57, 16, -4, -1},
+        {0, -4, 9, 51, 59, 18, -4, -1},
+        {0, -4, 7, 49, 60, 21, -3, -2},
+        {0, -4, 5, 46, 62, 24, -3, -2},
+        {0, -4, 4, 43, 63, 26, -2, -2},
+        {0, -3, 2, 41, 63, 29, -2, -2},
+        {0, -3, 1, 38, 64, 32, -1, -3},
+    }
+};
+
+#define VP9_8TAP_MIPS_MSA_FUNC(SIZE, type, type_idx)                           \
+void ff_put_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride,     \
+                                        const uint8_t *src,                    \
+                                        ptrdiff_t srcstride,                   \
+                                        int h, int mx, int my)                 \
+{                                                                              \
+    const int8_t *filter = vp9_subpel_filters_msa[type_idx][mx-1];             \
+    /* mx is expected in 1..15; row mx - 1 holds its eight taps */             \
+    common_hz_8t_##SIZE##w_msa(src, srcstride, dst, dststride, filter, h);     \
+}                                                                              \
+                                                                               \
+void ff_put_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride,     \
+                                        const uint8_t *src,                    \
+                                        ptrdiff_t srcstride,                   \
+                                        int h, int mx, int my)                 \
+{                                                                              \
+    const int8_t *filter = vp9_subpel_filters_msa[type_idx][my-1];             \
+    /* my is expected in 1..15; row my - 1 holds its eight taps */             \
+    common_vt_8t_##SIZE##w_msa(src, srcstride, dst, dststride, filter, h);     \
+}                                                                              \
+                                                                               \
+void ff_put_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride,    \
+                                         const uint8_t *src,                   \
+                                         ptrdiff_t srcstride,                  \
+                                         int h, int mx, int my)                \
+{                                                                              \
+    const int8_t *hfilter = vp9_subpel_filters_msa[type_idx][mx-1];            \
+    const int8_t *vfilter = vp9_subpel_filters_msa[type_idx][my-1];            \
+    /* int8_t, not uint8_t: the taps are signed table entries */               \
+    common_hv_8ht_8vt_##SIZE##w_msa(src, srcstride, dst, dststride, hfilter,   \
+                                    vfilter, h);                               \
+}                                                                              \
+                                                                               \
+void ff_avg_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride,     \
+                                        const uint8_t *src,                    \
+                                        ptrdiff_t srcstride,                   \
+                                        int h, int mx, int my)                 \
+{                                                                              \
+    const int8_t *filter = vp9_subpel_filters_msa[type_idx][mx-1];             \
+    /* same tap lookup as the put variant */                                   \
+    common_hz_8t_and_aver_dst_##SIZE##w_msa(src, srcstride, dst,               \
+                                            dststride, filter, h);             \
+}                                                                              \
+                                                                               \
+void ff_avg_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride,     \
+                                        const uint8_t *src,                    \
+                                        ptrdiff_t srcstride,                   \
+                                        int h, int mx, int my)                 \
+{                                                                              \
+    const int8_t *filter = vp9_subpel_filters_msa[type_idx][my-1];             \
+    /* same tap lookup as the put variant */                                   \
+    common_vt_8t_and_aver_dst_##SIZE##w_msa(src, srcstride, dst, dststride,    \
+                                            filter, h);                        \
+}                                                                              \
+                                                                               \
+void ff_avg_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride,    \
+                                         const uint8_t *src,                   \
+                                         ptrdiff_t srcstride,                  \
+                                         int h, int mx, int my)                \
+{                                                                              \
+    const int8_t *hfilter = vp9_subpel_filters_msa[type_idx][mx-1];            \
+    const int8_t *vfilter = vp9_subpel_filters_msa[type_idx][my-1];            \
+    /* int8_t, not uint8_t: the taps are signed table entries */               \
+    common_hv_8ht_8vt_and_aver_dst_##SIZE##w_msa(src, srcstride, dst,          \
+                                                 dststride, hfilter,           \
+                                                 vfilter, h);                  \
+}
+
+#define VP9_COPY_AVG_MIPS_MSA_FUNC(SIZE)                           \
+void ff_copy##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,        \
+                         const uint8_t *src, ptrdiff_t srcstride,  \
+                         int h, int mx, int my)                    \
+{                                                                  \
+    /* mx and my are not used */                                   \
+    copy_width##SIZE##_msa(src, srcstride, dst, dststride, h);     \
+}                                                                  \
+                                                                   \
+void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,         \
+                        const uint8_t *src, ptrdiff_t srcstride,   \
+                        int h, int mx, int my)                     \
+{                                                                  \
+    /* mx and my are not used */                                   \
+    avg_width##SIZE##_msa(src, srcstride, dst, dststride, h);      \
+}
+
+#define VP9_AVG_MIPS_MSA_FUNC(SIZE)                               \
+void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,        \
+                        const uint8_t *src, ptrdiff_t srcstride,  \
+                        int h, int mx, int my)                    \
+{                                                                 \
+    /* mx and my are not used */                                  \
+    avg_width##SIZE##_msa(src, srcstride, dst, dststride, h);     \
+}
+/* Instantiate the wrappers; each macro expands to whole functions: no ';'. */
+VP9_8TAP_MIPS_MSA_FUNC(64, regular, FILTER_8TAP_REGULAR)
+VP9_8TAP_MIPS_MSA_FUNC(32, regular, FILTER_8TAP_REGULAR)
+VP9_8TAP_MIPS_MSA_FUNC(16, regular, FILTER_8TAP_REGULAR)
+VP9_8TAP_MIPS_MSA_FUNC(8, regular, FILTER_8TAP_REGULAR)
+VP9_8TAP_MIPS_MSA_FUNC(4, regular, FILTER_8TAP_REGULAR)
+
+VP9_8TAP_MIPS_MSA_FUNC(64, sharp, FILTER_8TAP_SHARP)
+VP9_8TAP_MIPS_MSA_FUNC(32, sharp, FILTER_8TAP_SHARP)
+VP9_8TAP_MIPS_MSA_FUNC(16, sharp, FILTER_8TAP_SHARP)
+VP9_8TAP_MIPS_MSA_FUNC(8, sharp, FILTER_8TAP_SHARP)
+VP9_8TAP_MIPS_MSA_FUNC(4, sharp, FILTER_8TAP_SHARP)
+
+VP9_8TAP_MIPS_MSA_FUNC(64, smooth, FILTER_8TAP_SMOOTH)
+VP9_8TAP_MIPS_MSA_FUNC(32, smooth, FILTER_8TAP_SMOOTH)
+VP9_8TAP_MIPS_MSA_FUNC(16, smooth, FILTER_8TAP_SMOOTH)
+VP9_8TAP_MIPS_MSA_FUNC(8, smooth, FILTER_8TAP_SMOOTH)
+VP9_8TAP_MIPS_MSA_FUNC(4, smooth, FILTER_8TAP_SMOOTH)
+
+VP9_COPY_AVG_MIPS_MSA_FUNC(64)
+VP9_COPY_AVG_MIPS_MSA_FUNC(32)
+VP9_COPY_AVG_MIPS_MSA_FUNC(16)
+VP9_COPY_AVG_MIPS_MSA_FUNC(8)
+VP9_AVG_MIPS_MSA_FUNC(4)  /* size 4: only the avg wrapper is generated */
+
+#undef VP9_8TAP_MIPS_MSA_FUNC
+#undef VP9_COPY_AVG_MIPS_MSA_FUNC
+#undef VP9_AVG_MIPS_MSA_FUNC
diff --git a/libavcodec/mips/vp9dsp_init_mips.c b/libavcodec/mips/vp9dsp_init_mips.c
new file mode 100644
index 0000000..c8a4890
--- /dev/null
+++ b/libavcodec/mips/vp9dsp_init_mips.c
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/common.h"
+#include "libavcodec/vp9dsp.h"
+#include "vp9dsp_mips.h"
+
+#if HAVE_MSA
+static av_cold void vp9dsp_intrapred_init_msa(VP9DSPContext *dsp, int bpp)
+{
+    if (bpp == 8) {  /* nothing is installed for any other bpp */
+#define init_intra_pred_msa(tx, sz)                             \
+    dsp->intra_pred[tx][VERT_PRED]    = ff_vert_##sz##_msa;     \
+    dsp->intra_pred[tx][HOR_PRED]     = ff_hor_##sz##_msa;      \
+    dsp->intra_pred[tx][DC_PRED]      = ff_dc_##sz##_msa;       \
+    dsp->intra_pred[tx][LEFT_DC_PRED] = ff_dc_left_##sz##_msa;  \
+    dsp->intra_pred[tx][TOP_DC_PRED]  = ff_dc_top_##sz##_msa;   \
+    dsp->intra_pred[tx][DC_128_PRED]  = ff_dc_128_##sz##_msa;   \
+    dsp->intra_pred[tx][DC_127_PRED]  = ff_dc_127_##sz##_msa;   \
+    dsp->intra_pred[tx][DC_129_PRED]  = ff_dc_129_##sz##_msa;   \
+    dsp->intra_pred[tx][TM_VP8_PRED]  = ff_tm_##sz##_msa
+
+    init_intra_pred_msa(TX_16X16, 16x16);
+    init_intra_pred_msa(TX_32X32, 32x32);
+#undef init_intra_pred_msa
+    /* 4x4 and 8x8: only the DC, LEFT_DC, TOP_DC and TM slots are replaced */
+#define init_intra_pred_msa(tx, sz)                             \
+    dsp->intra_pred[tx][DC_PRED]      = ff_dc_##sz##_msa;       \
+    dsp->intra_pred[tx][LEFT_DC_PRED] = ff_dc_left_##sz##_msa;  \
+    dsp->intra_pred[tx][TOP_DC_PRED]  = ff_dc_top_##sz##_msa;   \
+    dsp->intra_pred[tx][TM_VP8_PRED]  = ff_tm_##sz##_msa
+
+    init_intra_pred_msa(TX_4X4, 4x4);
+    init_intra_pred_msa(TX_8X8, 8x8);
+#undef init_intra_pred_msa
+    }
+}
+
+static av_cold void vp9dsp_itxfm_init_msa(VP9DSPContext *dsp, int bpp)
+{
+    if (bpp == 8) {
+        /* 4x4, 8x8 and 16x16: one routine per DCT/ADST combination */
+        dsp->itxfm_add[TX_4X4][DCT_DCT]       = ff_idct_idct_4x4_add_msa;
+        dsp->itxfm_add[TX_4X4][DCT_ADST]      = ff_iadst_idct_4x4_add_msa;
+        dsp->itxfm_add[TX_4X4][ADST_DCT]      = ff_idct_iadst_4x4_add_msa;
+        dsp->itxfm_add[TX_4X4][ADST_ADST]     = ff_iadst_iadst_4x4_add_msa;
+        dsp->itxfm_add[TX_8X8][DCT_DCT]       = ff_idct_idct_8x8_add_msa;
+        dsp->itxfm_add[TX_8X8][DCT_ADST]      = ff_iadst_idct_8x8_add_msa;
+        dsp->itxfm_add[TX_8X8][ADST_DCT]      = ff_idct_iadst_8x8_add_msa;
+        dsp->itxfm_add[TX_8X8][ADST_ADST]     = ff_iadst_iadst_8x8_add_msa;
+        dsp->itxfm_add[TX_16X16][DCT_DCT]     = ff_idct_idct_16x16_add_msa;
+        dsp->itxfm_add[TX_16X16][DCT_ADST]    = ff_iadst_idct_16x16_add_msa;
+        dsp->itxfm_add[TX_16X16][ADST_DCT]    = ff_idct_iadst_16x16_add_msa;
+        dsp->itxfm_add[TX_16X16][ADST_ADST]   = ff_iadst_iadst_16x16_add_msa;
+        /* 32x32: all four slots share the idct_idct routine */
+        dsp->itxfm_add[TX_32X32][DCT_DCT]     = ff_idct_idct_32x32_add_msa;
+        dsp->itxfm_add[TX_32X32][ADST_DCT]    = ff_idct_idct_32x32_add_msa;
+        dsp->itxfm_add[TX_32X32][DCT_ADST]    = ff_idct_idct_32x32_add_msa;
+        dsp->itxfm_add[TX_32X32][ADST_ADST]   = ff_idct_idct_32x32_add_msa;
+    }
+}
+
+static av_cold void vp9dsp_mc_init_msa(VP9DSPContext *dsp, int bpp)
+{
+    if (bpp == 8) {
+#define init_fpel(idx1, idx2, sz, type)                                    \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = ff_##type##sz##_msa;  \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = ff_##type##sz##_msa;  \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = ff_##type##sz##_msa;  \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_##type##sz##_msa
+
+#define init_copy_avg(idx, sz)    \
+    init_fpel(idx, 0, sz, copy);  \
+    init_fpel(idx, 1, sz, avg)
+
+#define init_avg(idx, sz)  \
+    init_fpel(idx, 1, sz, avg)
+    /* 1st index: 0..4 = size 64/32/16/8/4; 3rd index: 0 = copy/put, 1 = avg */
+    init_copy_avg(0, 64);
+    init_copy_avg(1, 32);
+    init_copy_avg(2, 16);
+    init_copy_avg(3,  8);
+    init_avg(4,  4);  /* only avg: the size-4 copy slot is not overridden */
+
+#undef init_copy_avg
+#undef init_avg
+#undef init_fpel
+    /* [idxh][idxv]: [1][0] = h only, [0][1] = v only, [1][1] = hv */
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type)  \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][idxh][idxv] =   \
+        ff_##type##_bilin_##sz##dir##_msa;                   \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] =   \
+        ff_##type##_8tap_smooth_##sz##dir##_msa;             \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] =   \
+        ff_##type##_8tap_regular_##sz##dir##_msa;            \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] =   \
+        ff_##type##_8tap_sharp_##sz##dir##_msa;
+
+#define init_subpel2(idx, idxh, idxv, dir, type)      \
+    init_subpel1(0, idx, idxh, idxv, 64, dir, type);  \
+    init_subpel1(1, idx, idxh, idxv, 32, dir, type);  \
+    init_subpel1(2, idx, idxh, idxv, 16, dir, type);  \
+    init_subpel1(3, idx, idxh, idxv,  8, dir, type);  \
+    init_subpel1(4, idx, idxh, idxv,  4, dir, type)
+
+#define init_subpel3(idx, type)         \
+    init_subpel2(idx, 1, 1, hv, type);  \
+    init_subpel2(idx, 0, 1, v, type);   \
+    init_subpel2(idx, 1, 0, h, type)
+
+    init_subpel3(0, put);
+    init_subpel3(1, avg);
+
+#undef init_subpel1
+#undef init_subpel2
+#undef init_subpel3
+    }
+}
+
+static av_cold void vp9dsp_loopfilter_init_msa(VP9DSPContext *dsp, int bpp)
+{
+    if (bpp == 8) {
+        /* last index 0: the ff_loop_filter_h_* routines */
+        dsp->loop_filter_8[0][0]       = ff_loop_filter_h_4_8_msa;
+        dsp->loop_filter_8[1][0]       = ff_loop_filter_h_8_8_msa;
+        dsp->loop_filter_8[2][0]       = ff_loop_filter_h_16_8_msa;
+        dsp->loop_filter_16[0]         = ff_loop_filter_h_16_16_msa;
+        dsp->loop_filter_mix2[0][0][0] = ff_loop_filter_h_44_16_msa;
+        dsp->loop_filter_mix2[0][1][0] = ff_loop_filter_h_48_16_msa;
+        dsp->loop_filter_mix2[1][0][0] = ff_loop_filter_h_84_16_msa;
+        dsp->loop_filter_mix2[1][1][0] = ff_loop_filter_h_88_16_msa;
+        /* last index 1: the ff_loop_filter_v_* routines */
+        dsp->loop_filter_8[0][1]       = ff_loop_filter_v_4_8_msa;
+        dsp->loop_filter_8[1][1]       = ff_loop_filter_v_8_8_msa;
+        dsp->loop_filter_8[2][1]       = ff_loop_filter_v_16_8_msa;
+        dsp->loop_filter_16[1]         = ff_loop_filter_v_16_16_msa;
+        dsp->loop_filter_mix2[0][0][1] = ff_loop_filter_v_44_16_msa;
+        dsp->loop_filter_mix2[0][1][1] = ff_loop_filter_v_48_16_msa;
+        dsp->loop_filter_mix2[1][0][1] = ff_loop_filter_v_84_16_msa;
+        dsp->loop_filter_mix2[1][1][1] = ff_loop_filter_v_88_16_msa;
+    }
+}
+
+static av_cold void vp9dsp_init_msa(VP9DSPContext *dsp, int bpp)
+{
+    vp9dsp_mc_init_msa(dsp, bpp);          /* fills dsp->mc            */
+    vp9dsp_itxfm_init_msa(dsp, bpp);       /* fills dsp->itxfm_add     */
+    vp9dsp_intrapred_init_msa(dsp, bpp);   /* fills dsp->intra_pred    */
+    vp9dsp_loopfilter_init_msa(dsp, bpp);  /* fills dsp->loop_filter_* */
+}
+#endif  // #if HAVE_MSA
+
+av_cold void ff_vp9dsp_init_mips(VP9DSPContext *dsp, int bpp)
+{
+#if HAVE_MSA  /* compiled out otherwise, which leaves dsp untouched */
+    vp9dsp_init_msa(dsp, bpp);
+#endif  // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/vp9dsp_mips.h b/libavcodec/mips/vp9dsp_mips.h
new file mode 100644
index 0000000..4d73038
--- /dev/null
+++ b/libavcodec/mips/vp9dsp_mips.h
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_VP9DSP_MIPS_H
+#define AVCODEC_MIPS_VP9DSP_MIPS_H
+
+#define VP9_8TAP_MIPS_MSA_FUNC(SIZE, type, type_idx)                         \
+void ff_put_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride,   \
+                                        const uint8_t *src,                  \
+                                        ptrdiff_t srcstride,                 \
+                                        int h, int mx, int my);              \
+/* type_idx is not referenced by these prototypes */                         \
+void ff_put_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride,   \
+                                        const uint8_t *src,                  \
+                                        ptrdiff_t srcstride,                 \
+                                        int h, int mx, int my);              \
+                                                                             \
+void ff_put_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride,  \
+                                         const uint8_t *src,                 \
+                                         ptrdiff_t srcstride,                \
+                                         int h, int mx, int my);             \
+                                                                             \
+void ff_avg_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride,   \
+                                        const uint8_t *src,                  \
+                                        ptrdiff_t srcstride,                 \
+                                        int h, int mx, int my);              \
+                                                                             \
+void ff_avg_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride,   \
+                                        const uint8_t *src,                  \
+                                        ptrdiff_t srcstride,                 \
+                                        int h, int mx, int my);              \
+/* no ';' after the last prototype: the invocation supplies it */            \
+void ff_avg_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride,  \
+                                         const uint8_t *src,                 \
+                                         ptrdiff_t srcstride,                \
+                                         int h, int mx, int my)
+
+#define VP9_BILINEAR_MIPS_MSA_FUNC(SIZE)                                   \
+void ff_put_bilin_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride,         \
+                                const uint8_t *src, ptrdiff_t srcstride,   \
+                                int h, int mx, int my);                    \
+                                                                           \
+void ff_put_bilin_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride,         \
+                                const uint8_t *src, ptrdiff_t srcstride,   \
+                                int h, int mx, int my);                    \
+                                                                           \
+void ff_put_bilin_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride,        \
+                                 const uint8_t *src, ptrdiff_t srcstride,  \
+                                 int h, int mx, int my);                   \
+                                                                           \
+void ff_avg_bilin_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride,         \
+                                const uint8_t *src, ptrdiff_t srcstride,   \
+                                int h, int mx, int my);                    \
+                                                                           \
+void ff_avg_bilin_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride,         \
+                                const uint8_t *src, ptrdiff_t srcstride,   \
+                                int h, int mx, int my);                    \
+/* no ';' after the last prototype: the invocation supplies it */          \
+void ff_avg_bilin_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride,        \
+                                 const uint8_t *src, ptrdiff_t srcstride,  \
+                                 int h, int mx, int my)
+
+#define VP9_COPY_AVG_MIPS_MSA_FUNC(SIZE)                           \
+void ff_copy##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,        \
+                         const uint8_t *src, ptrdiff_t srcstride,  \
+                         int h, int mx, int my);                   \
+/* no ';' after the last prototype: the invocation supplies it */  \
+void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,         \
+                        const uint8_t *src, ptrdiff_t srcstride,   \
+                        int h, int mx, int my)
+
+VP9_8TAP_MIPS_MSA_FUNC(64, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_MIPS_MSA_FUNC(32, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_MIPS_MSA_FUNC(16, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_MIPS_MSA_FUNC(8, regular, FILTER_8TAP_REGULAR);
+VP9_8TAP_MIPS_MSA_FUNC(4, regular, FILTER_8TAP_REGULAR);
+
+VP9_8TAP_MIPS_MSA_FUNC(64, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_MIPS_MSA_FUNC(32, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_MIPS_MSA_FUNC(16, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_MIPS_MSA_FUNC(8, sharp, FILTER_8TAP_SHARP);
+VP9_8TAP_MIPS_MSA_FUNC(4, sharp, FILTER_8TAP_SHARP);
+
+VP9_8TAP_MIPS_MSA_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_MIPS_MSA_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_MIPS_MSA_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_MIPS_MSA_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_MIPS_MSA_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
+
+VP9_BILINEAR_MIPS_MSA_FUNC(64);
+VP9_BILINEAR_MIPS_MSA_FUNC(32);
+VP9_BILINEAR_MIPS_MSA_FUNC(16);
+VP9_BILINEAR_MIPS_MSA_FUNC(8);
+VP9_BILINEAR_MIPS_MSA_FUNC(4);
+
+VP9_COPY_AVG_MIPS_MSA_FUNC(64);
+VP9_COPY_AVG_MIPS_MSA_FUNC(32);
+VP9_COPY_AVG_MIPS_MSA_FUNC(16);
+VP9_COPY_AVG_MIPS_MSA_FUNC(8);
+VP9_COPY_AVG_MIPS_MSA_FUNC(4);
+/* NOTE(review): the (4) line also declares ff_copy4_msa; confirm it exists. */
+#undef VP9_8TAP_MIPS_MSA_FUNC
+#undef VP9_BILINEAR_MIPS_MSA_FUNC
+#undef VP9_COPY_AVG_MIPS_MSA_FUNC
+/* Loop filters. NOTE(review): e/i/h are presumably the E/I/H limits; confirm. */
+void ff_loop_filter_h_4_8_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                              int32_t i, int32_t h);
+void ff_loop_filter_h_8_8_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                              int32_t i, int32_t h);
+void ff_loop_filter_h_16_8_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                               int32_t i, int32_t h);
+void ff_loop_filter_v_4_8_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                              int32_t i, int32_t h);
+void ff_loop_filter_v_8_8_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                              int32_t i, int32_t h);
+void ff_loop_filter_v_16_8_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                               int32_t i, int32_t h);
+void ff_loop_filter_h_44_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_h_88_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_h_16_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_v_44_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_v_88_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_v_16_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_h_48_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_h_84_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_v_48_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_loop_filter_v_84_16_msa(uint8_t *dst, ptrdiff_t stride, int32_t e,
+                                int32_t i, int32_t h);
+void ff_idct_idct_4x4_add_msa(uint8_t *dst, ptrdiff_t stride,
+                              int16_t *block, int eob);
+void ff_idct_idct_8x8_add_msa(uint8_t *dst, ptrdiff_t stride,
+                              int16_t *block, int eob);
+void ff_idct_idct_16x16_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                int16_t *block, int eob);
+void ff_idct_idct_32x32_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                int16_t *block, int eob);
+void ff_iadst_iadst_4x4_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                int16_t *block, int eob);
+void ff_iadst_iadst_8x8_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                int16_t *block, int eob);
+void ff_iadst_iadst_16x16_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                  int16_t *block, int eob);
+void ff_iadst_idct_4x4_add_msa(uint8_t *dst, ptrdiff_t stride,
+                               int16_t *block, int eob);
+void ff_iadst_idct_8x8_add_msa(uint8_t *dst, ptrdiff_t stride,
+                               int16_t *block, int eob);
+void ff_iadst_idct_16x16_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                 int16_t *block, int eob);
+void ff_idct_iadst_4x4_add_msa(uint8_t *dst, ptrdiff_t stride,
+                               int16_t *block, int eob);
+void ff_idct_iadst_8x8_add_msa(uint8_t *dst, ptrdiff_t stride,
+                               int16_t *block, int eob);
+void ff_idct_iadst_16x16_add_msa(uint8_t *dst, ptrdiff_t stride,
+                                 int16_t *block, int eob);
+void ff_iwht_iwht_4x4_add_msa(uint8_t *dst, ptrdiff_t stride,
+                              int16_t *block, int eob); /* NOTE(review): unused? */
+/* Intra predictors; 4x4 and 8x8 are declared for dc, dc_left, dc_top, tm only. */
+void ff_vert_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                       const uint8_t *top);
+void ff_vert_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                       const uint8_t *top);
+void ff_hor_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                      const uint8_t *top);
+void ff_hor_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                      const uint8_t *top);
+void ff_dc_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                   const uint8_t *top);
+void ff_dc_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                   const uint8_t *top);
+void ff_dc_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                     const uint8_t *top);
+void ff_dc_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                     const uint8_t *top);
+void ff_dc_left_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                        const uint8_t *top);
+void ff_dc_left_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                        const uint8_t *top);
+void ff_dc_left_16x16_msa(uint8_t *dst, ptrdiff_t stride,
+                          const uint8_t *left, const uint8_t *top);
+void ff_dc_left_32x32_msa(uint8_t *dst, ptrdiff_t stride,
+                          const uint8_t *left, const uint8_t *top);
+void ff_dc_top_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                       const uint8_t *top);
+void ff_dc_top_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                       const uint8_t *top);
+void ff_dc_top_16x16_msa(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_dc_top_32x32_msa(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_dc_128_16x16_msa(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_dc_128_32x32_msa(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_dc_127_16x16_msa(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_dc_127_32x32_msa(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_dc_129_16x16_msa(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_dc_129_32x32_msa(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top);
+void ff_tm_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                   const uint8_t *top);
+void ff_tm_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                   const uint8_t *top);
+void ff_tm_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                     const uint8_t *top);
+void ff_tm_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+                     const uint8_t *top);
+
+#endif  // #ifndef AVCODEC_MIPS_VP9DSP_MIPS_H
diff --git a/libavcodec/mips/xvid_idct_mmi.c b/libavcodec/mips/xvid_idct_mmi.c
new file mode 100644
index 0000000..d3f9acb
--- /dev/null
+++ b/libavcodec/mips/xvid_idct_mmi.c
@@ -0,0 +1,253 @@
+/*
+ * Loongson SIMD optimized xvid idct
+ *
+ * Copyright (c) 2015 Loongson Technology Corporation Limited
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "idctdsp_mips.h"
+#include "xvididct_mips.h"
+
+#define BITS_INV_ACC    5                           // 4 or 5 for IEEE
+#define SHIFT_INV_ROW   (16 - BITS_INV_ACC)         // 11; DCT_8_INV_ROW_MMI hard-codes the same shift count
+#define SHIFT_INV_COL   (1 + BITS_INV_ACC)          // 6; DCT_8_INV_COL hard-codes the same shift count
+#define RND_INV_ROW     (1024 * (6 - BITS_INV_ACC))
+#define RND_INV_COL     (16 * (BITS_INV_ACC - 3))
+#define RND_INV_CORR    (RND_INV_COL - 1)
+// FRW_* (presumably forward-DCT) constants; like the macros above, no code in this file references them.
+#define BITS_FRW_ACC    3                           // 2 or 3 for accuracy
+#define SHIFT_FRW_COL   BITS_FRW_ACC
+#define SHIFT_FRW_ROW   (BITS_FRW_ACC + 17)
+#define RND_FRW_ROW     (262144*(BITS_FRW_ACC - 1))
+
+DECLARE_ALIGNED(8, static const int16_t, tg_1_16)[4*4] = { // 4 lanes per row; DCT_8_INV_COL reads rows at 0, 8, 16, 24(%3)
+     13036, 13036, 13036, 13036,    //  tg(1*pi/16) * (1<<16), rounded to nearest
+     27146, 27146, 27146, 27146,    //  tg(2*pi/16) * (1<<16), rounded to nearest
+    -21746,-21746,-21746,-21746,    // (tg(3*pi/16) - 1) * (1<<16), rounded to nearest
+     23170, 23170, 23170, 23170     //  cos(pi/4) * (1<<15), rounded to nearest
+};
+
+DECLARE_ALIGNED(8, static const int32_t, rounder_0)[2*8] = { // one 2-lane bias per row, added to the even sums before the >>11
+    65536,65536,    // row 0
+     3597, 3597,    // row 1
+     2260, 2260,    // row 2
+     1203, 1203,    // row 3
+        0,    0,    // row 4
+      120,  120,    // row 5
+      512,  512,    // row 6
+      512,  512     // row 7
+};
+
+DECLARE_ALIGNED(8, static const int16_t, tab_i_04_mmi)[32*4] = { // 4 tables of 32 coeffs; first (offset 64*0): rows 0 and 4
+     16384, 21407, 16384,  8867,    // w05 w04 w01 w00
+     16384,  8867,-16384,-21407,    // w07 w06 w03 w02
+     16384, -8867, 16384,-21407,    // w13 w12 w09 w08
+    -16384, 21407, 16384, -8867,    // w15 w14 w11 w10
+     22725, 19266, 19266, -4520,    // w21 w20 w17 w16
+     12873,  4520,-22725,-12873,    // w23 w22 w19 w18
+     12873,-22725,  4520,-12873,    // w29 w28 w25 w24
+      4520, 19266, 19266,-22725,    // w31 w30 w27 w26
+    // second table (byte offset 64*1): used for rows 1 and 7
+     22725, 29692, 22725, 12299,    // w05 w04 w01 w00
+     22725, 12299,-22725,-29692,    // w07 w06 w03 w02
+     22725,-12299, 22725,-29692,    // w13 w12 w09 w08
+    -22725, 29692, 22725,-12299,    // w15 w14 w11 w10
+     31521, 26722, 26722, -6270,    // w21 w20 w17 w16
+     17855,  6270,-31521,-17855,    // w23 w22 w19 w18
+     17855,-31521,  6270,-17855,    // w29 w28 w25 w24
+      6270, 26722, 26722,-31521,    // w31 w30 w27 w26
+    // third table (byte offset 64*2): used for rows 2 and 6
+     21407, 27969, 21407, 11585,    // w05 w04 w01 w00
+     21407, 11585,-21407,-27969,    // w07 w06 w03 w02
+     21407,-11585, 21407,-27969,    // w13 w12 w09 w08
+    -21407, 27969, 21407,-11585,    // w15 w14 w11 w10
+     29692, 25172, 25172, -5906,    // w21 w20 w17 w16
+     16819,  5906,-29692,-16819,    // w23 w22 w19 w18
+     16819,-29692,  5906,-16819,    // w29 w28 w25 w24
+      5906, 25172, 25172,-29692,    // w31 w30 w27 w26
+    // fourth table (byte offset 64*3): used for rows 3 and 5
+     19266, 25172, 19266, 10426,    // w05 w04 w01 w00
+     19266, 10426,-19266,-25172,    // w07 w06 w03 w02
+     19266,-10426, 19266,-25172,    // w13 w12 w09 w08
+    -19266, 25172, 19266,-10426,    // w15 w14 w11 w10
+     26722, 22654, 22654, -5315,    // w21 w20 w17 w16
+     15137,  5315,-26722,-15137,    // w23 w22 w19 w18
+     15137,-26722,  5315,-15137,    // w29 w28 w25 w24
+      5315, 22654, 22654,-26722,    // w31 w30 w27 w26
+};
+
+#define DCT_8_INV_ROW_MMI(A1,A2,A3,A4) /* A1 src row, A2 dst row, A3 coeff table, A4 rounder */ \
+    "dli $10, 0x88              \n\t" /* pshufh selector: lanes 2 0 2 0    */\
+    "ldc1 $f4, "#A1"            \n\t" /* 0; x3 x2 x1 x0                   */\
+    "dmtc1 $10, $f16            \n\t"                                       \
+    "ldc1 $f10, 8+"#A1"         \n\t" /* 1; x7 x6 x5 x4                   */\
+    "ldc1 $f6, "#A3"            \n\t" /* 3; w05 w04 w01 w00               */\
+    "pshufh $f0, $f4, $f16      \n\t" /* x2 x0 x2 x0                      */\
+    "ldc1 $f8, 8+"#A3"          \n\t" /* 4; w07 w06 w03 w02               */\
+    "ldc1 $f12, 32+"#A3"        \n\t" /* 6; w21 w20 w17 w16               */\
+    "pmaddhw $f6, $f6, $f0      \n\t" /* x2*w05+x0*w04 x2*w01+x0*w00      */\
+    "dli $10, 0xdd              \n\t" /* pshufh selector: lanes 3 1 3 1    */\
+    "pshufh $f2, $f10, $f16     \n\t" /* x6 x4 x6 x4                      */\
+    "dmtc1 $10, $f16            \n\t"                                       \
+    "pmaddhw $f8, $f8, $f2      \n\t" /* x6*w07+x4*w06 x6*w03+x4*w02      */\
+    "ldc1 $f14, 40+"#A3"        \n\t" /* 7; w23 w22 w19 w18               */\
+    "pshufh $f4, $f4, $f16      \n\t" /* x3 x1 x3 x1                      */\
+    "pmaddhw $f12, $f12, $f4    \n\t" /* x3*w21+x1*w20 x3*w17+x1*w16      */\
+    "pshufh $f10, $f10, $f16    \n\t" /* x7 x5 x7 x5                      */\
+    "ldc1 $f18, "#A4"           \n\t" /* rounder for this row (A4)        */\
+    "pmaddhw $f14, $f14, $f10   \n\t" /* x7*w23+x5*w22 x7*w19+x5*w18      */\
+    "paddw $f6, $f6, $f18       \n\t" /* + rounder (A4)                   */\
+    "ldc1 $f16, 16+"#A3"        \n\t" /* w13 w12 w09 w08                  */\
+    "pmaddhw $f0, $f0, $f16     \n\t" /* x2*w13+x0*w12 x2*w09+x0*w08      */\
+    "ldc1 $f16, 24+"#A3"        \n\t" /* w15 w14 w11 w10                  */\
+    "paddw $f6, $f6, $f8        \n\t" /* 4; a1=sum(even1) a0=sum(even0)   */\
+    "pmaddhw $f2, $f2, $f16     \n\t" /* x6*w15+x4*w14 x6*w11+x4*w10      */\
+    "ldc1 $f16, 48+"#A3"        \n\t" /* w29 w28 w25 w24                  */\
+    "pmaddhw $f4, $f4, $f16     \n\t" /* x3*w29+x1*w28 x3*w25+x1*w24      */\
+    "ldc1 $f16, 56+"#A3"        \n\t" /* w31 w30 w27 w26                  */\
+    "paddw $f12, $f12, $f14     \n\t" /* 7; b1=sum(odd1) b0=sum(odd0)     */\
+    "dli $10, 11                \n\t" /* shift count 11 == SHIFT_INV_ROW  */\
+    "pmaddhw $f10, $f10, $f16   \n\t" /* x7*w31+x5*w30 x7*w27+x5*w26      */\
+    "dmtc1 $10, $f16            \n\t"                                       \
+    "psubw $f8, $f6, $f12       \n\t" /* 6; a1-b1 a0-b0                   */\
+    "paddw $f6, $f6, $f12       \n\t" /* a1+b1 a0+b0                      */\
+    "paddw $f0, $f0, $f18       \n\t" /* + rounder (A4)                   */\
+    "psraw $f6, $f6, $f16       \n\t" /* y1=a1+b1 y0=a0+b0                */\
+    "paddw $f0, $f0, $f2        \n\t" /* 1; a3=sum(even3) a2=sum(even2)   */\
+    "paddw $f4, $f4, $f10       \n\t" /* 5; b3=sum(odd3) b2=sum(odd2)     */\
+    "psraw $f8, $f8, $f16       \n\t" /* y6=a1-b1 y7=a0-b0                */\
+    "psubw $f14, $f0, $f4       \n\t" /* 2; a3-b3 a2-b2                   */\
+    "paddw $f0, $f0, $f4        \n\t" /* a3+b3 a2+b2                      */\
+    "psraw $f0, $f0, $f16       \n\t" /* y3=a3+b3 y2=a2+b2                */\
+    "psraw $f14, $f14, $f16     \n\t" /* y4=a3-b3 y5=a2-b2                */\
+    "dli $10, 0xb1              \n\t" /* pshufh selector: lanes 2 3 0 1    */\
+    "packsswh $f6, $f6, $f0     \n\t" /* 0; y3 y2 y1 y0                   */\
+    "dmtc1 $10, $f16            \n\t"                                       \
+    "packsswh $f14, $f14, $f8   \n\t" /* 4; y6 y7 y4 y5                   */\
+    "sdc1 $f6, "#A2"            \n\t" /* 3; save y3 y2 y1 y0              */\
+    "pshufh $f14, $f14, $f16    \n\t" /* y7 y6 y5 y4                      */\
+    "sdc1 $f14, 8+"#A2"         \n\t" /* 7; save y7 y6 y5 y4              */\
+
+
+#define DCT_8_INV_COL(A1,A2) /* A1 src, A2 dst (4 columns); asm operand %3 must be tg_1_16 */ \
+    "ldc1 $f2, 2*8(%3)          \n\t" /* tg_3_16-1                        */\
+    "ldc1 $f6, 16*3+"#A1"       \n\t" /* x3                               */\
+    "ldc1 $f10, 16*5+"#A1"      \n\t" /* x5                               */\
+    "pmulhh $f0, $f2, $f6       \n\t" /* x3*(tg_3_16-1)                   */\
+    "ldc1 $f4, 0(%3)            \n\t" /* tg_1_16                          */\
+    "pmulhh $f2, $f2, $f10      \n\t" /* x5*(tg_3_16-1)                   */\
+    "ldc1 $f14, 16*7+"#A1"      \n\t" /* x7                               */\
+    "ldc1 $f12, 16*1+"#A1"      \n\t" /* x1                               */\
+    "pmulhh $f8, $f4, $f14      \n\t" /* x7*tg_1_16                       */\
+    "paddsh $f0, $f0, $f6       \n\t" /* x3*tg_3_16                       */\
+    "pmulhh $f4, $f4, $f12      \n\t" /* x1*tg_1_16                       */\
+    "paddsh $f2, $f2, $f6       \n\t" /* x3+x5*(tg_3_16-1)                */\
+    "psubsh $f0, $f0, $f10      \n\t" /* x3*tg_3_16-x5 = tm35             */\
+    "ldc1 $f6, 3*8(%3)          \n\t" /* ocos_4_16                        */\
+    "paddsh $f2, $f2, $f10      \n\t" /* x3+x5*tg_3_16 = tp35             */\
+    "paddsh $f8, $f8, $f12      \n\t" /* x1+tg_1_16*x7 = tp17             */\
+    "psubsh $f4, $f4, $f14      \n\t" /* x1*tg_1_16-x7 = tm17             */\
+    "paddsh $f10, $f8, $f2      \n\t" /* tp17+tp35 = b0                   */\
+    "psubsh $f12, $f4, $f0      \n\t" /* tm17-tm35 = b3                   */\
+    "psubsh $f8, $f8, $f2       \n\t" /* tp17-tp35 = t1                   */\
+    "paddsh $f4, $f4, $f0       \n\t" /* tm17+tm35 = t2                   */\
+    "ldc1 $f14, 1*8(%3)         \n\t" /* tg_2_16                          */\
+    "sdc1 $f10, 3*16+"#A2"      \n\t" /* save b0                          */\
+    "paddsh $f2, $f8, $f4       \n\t" /* t1+t2                            */\
+    "sdc1 $f12, 5*16+"#A2"      \n\t" /* save b3                          */\
+    "psubsh $f8, $f8, $f4       \n\t" /* t1-t2                            */\
+    "ldc1 $f10, 2*16+"#A1"      \n\t" /* x2                               */\
+    "ldc1 $f12, 6*16+"#A1"      \n\t" /* x6                               */\
+    "pmulhh $f0, $f14, $f10     \n\t" /* x2*tg_2_16                       */\
+    "pmulhh $f14, $f14, $f12    \n\t" /* x6*tg_2_16                       */\
+    "pmulhh $f2, $f2, $f6       \n\t" /* ocos_4_16*(t1+t2) = b1/2         */\
+    "ldc1 $f4, 0*16+"#A1"       \n\t" /* x0                               */\
+    "pmulhh $f8, $f8, $f6       \n\t" /* ocos_4_16*(t1-t2) = b2/2         */\
+    "psubsh $f0, $f0, $f12      \n\t" /* x2*tg_2_16-x6 = tm26             */\
+    "ldc1 $f12, 4*16+"#A1"      \n\t" /* x4                               */\
+    "paddsh $f14, $f14, $f10    \n\t" /* x2+x6*tg_2_16 = tp26             */\
+    "psubsh $f6, $f4, $f12      \n\t" /* x0-x4 = tm04                     */\
+    "paddsh $f4, $f4, $f12      \n\t" /* x0+x4 = tp04                     */\
+    "paddsh $f10, $f4, $f14     \n\t" /* tp04+tp26 = a0                   */\
+    "psubsh $f12, $f6, $f0      \n\t" /* tm04-tm26 = a2                   */\
+    "psubsh $f4, $f4, $f14      \n\t" /* tp04-tp26 = a3                   */\
+    "paddsh $f6, $f6, $f0       \n\t" /* tm04+tm26 = a1                   */\
+    "paddsh $f2, $f2, $f2       \n\t" /* b1                               */\
+    "paddsh $f8, $f8, $f8       \n\t" /* b2                               */\
+    "psubsh $f14, $f6, $f2      \n\t" /* a1-b1                            */\
+    "dli $10, 6                 \n\t" /* shift count 6 == SHIFT_INV_COL   */\
+    "paddsh $f6, $f6, $f2       \n\t" /* a1+b1                            */\
+    "dmtc1 $10, $f16            \n\t"                                       \
+    "psubsh $f0, $f12, $f8      \n\t" /* a2-b2                            */\
+    "paddsh $f12, $f12, $f8     \n\t" /* a2+b2                            */\
+    "psrah $f6, $f6, $f16       \n\t" /* dst1                             */\
+    "psrah $f12, $f12, $f16     \n\t" /* dst2                             */\
+    "ldc1 $f2, 3*16+"#A2"       \n\t" /* load b0                          */\
+    "psrah $f14, $f14, $f16     \n\t" /* dst6                             */\
+    "psrah $f0, $f0, $f16       \n\t" /* dst5                             */\
+    "sdc1 $f6, 1*16+"#A2"       \n\t" /* store dst1                       */\
+    "psubsh $f8, $f10, $f2      \n\t" /* a0-b0                            */\
+    "paddsh $f10, $f10, $f2     \n\t" /* a0+b0                            */\
+    "sdc1 $f12, 2*16+"#A2"      \n\t" /* store dst2                       */\
+    "ldc1 $f6, 5*16+"#A2"       \n\t" /* load b3                          */\
+    "psrah $f10, $f10, $f16     \n\t" /* dst0                             */\
+    "psrah $f8, $f8, $f16       \n\t" /* dst7                             */\
+    "sdc1 $f0, 5*16+"#A2"       \n\t" /* store dst5                       */\
+    "psubsh $f12, $f4, $f6      \n\t" /* a3-b3                            */\
+    "paddsh $f4, $f4, $f6       \n\t" /* a3+b3                            */\
+    "sdc1 $f14, 6*16+"#A2"      \n\t" /* store dst6                       */\
+    "sdc1 $f10, 0*16+"#A2"      \n\t" /* store dst0                       */\
+    "psrah $f4, $f4, $f16       \n\t" /* dst3                             */\
+    "sdc1 $f8, 7*16+"#A2"       \n\t" /* store dst7                       */\
+    "psrah $f12, $f12, $f16     \n\t" /* dst4                             */\
+    "sdc1 $f4, 3*16+"#A2"       \n\t" /* store dst3                       */\
+    "sdc1 $f12, 4*16+"#A2"      \n\t" /* store dst4                       */\
+
+
+/* In-place 8x8 xvid IDCT on block (8 row passes, then 2 passes of 4 columns); the asm uses $10, $f0-$f18 and writes block. */
+void ff_xvid_idct_mmi(int16_t *block)
+{
+    __asm__ volatile (
+        //# Process each row (operands: %0 block, %1 rounder_0, %2 tab_i_04_mmi)
+        DCT_8_INV_ROW_MMI(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1))
+        DCT_8_INV_ROW_MMI(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1))
+        DCT_8_INV_ROW_MMI(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1))
+        DCT_8_INV_ROW_MMI(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1))
+        DCT_8_INV_ROW_MMI(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1))
+        DCT_8_INV_ROW_MMI(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1))
+        DCT_8_INV_ROW_MMI(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1))
+        DCT_8_INV_ROW_MMI(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1))
+        //# Process the columns (4 at a time; the macro reads tg_1_16 through %3)
+        DCT_8_INV_COL(0(%0), 0(%0))
+        DCT_8_INV_COL(8(%0), 8(%0))
+        ::"r"(block),"r"(rounder_0),"r"(tab_i_04_mmi),"r"(tg_1_16)
+        : "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "memory");
+}
+
+void ff_xvid_idct_put_mmi(uint8_t *dest, int32_t line_size, int16_t *block)
+{
+    ff_xvid_idct_mmi(block);                           /* inverse-transform block in place first */
+    ff_put_pixels_clamped_mmi(block, dest, line_size); /* defined outside this file; presumably clamps the result and stores it to dest */
+}
+
+void ff_xvid_idct_add_mmi(uint8_t *dest, int32_t line_size, int16_t *block)
+{
+    ff_xvid_idct_mmi(block);                           /* inverse-transform block in place first */
+    ff_add_pixels_clamped_mmi(block, dest, line_size); /* defined outside this file; presumably adds the result to dest with clamping */
+}
diff --git a/libavcodec/mips/xvididct_init_mips.c b/libavcodec/mips/xvididct_init_mips.c
new file mode 100644
index 0000000..c1d82cc
--- /dev/null
+++ b/libavcodec/mips/xvididct_init_mips.c
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "xvididct_mips.h"
+
+#if HAVE_MMI
+static av_cold void xvid_idct_init_mmi(IDCTDSPContext *c, AVCodecContext *avctx,
+        unsigned high_bit_depth)
+{
+    /* Install the MMI xvid IDCT only for 8-bit content and only when the
+     * caller asked for the automatic or the xvid algorithm. */
+    if (high_bit_depth)
+        return;
+    if (avctx->idct_algo != FF_IDCT_AUTO && avctx->idct_algo != FF_IDCT_XVID)
+        return;
+    c->idct      = ff_xvid_idct_mmi;
+    c->idct_put  = ff_xvid_idct_put_mmi;
+    c->idct_add  = ff_xvid_idct_add_mmi;
+    c->perm_type = FF_IDCT_PERM_NONE;
+}
+#endif /* HAVE_MMI */
+
+av_cold void ff_xvid_idct_init_mips(IDCTDSPContext *c, AVCodecContext *avctx,
+        unsigned high_bit_depth)
+{
+#if HAVE_MMI
+    xvid_idct_init_mmi(c, avctx, high_bit_depth); /* only back end wired up here; when HAVE_MMI is 0 this function does nothing */
+#endif /* HAVE_MMI */
+}
diff --git a/libavcodec/mips/xvididct_mips.h b/libavcodec/mips/xvididct_mips.h
new file mode 100644
index 0000000..0768aaa
--- /dev/null
+++ b/libavcodec/mips/xvididct_mips.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_XVIDIDCT_MIPS_H
+#define AVCODEC_MIPS_XVIDIDCT_MIPS_H
+
+#include "libavcodec/xvididct.h"
+
+void ff_xvid_idct_mmi(int16_t *block);                                       /* IDCT of block, in place */
+void ff_xvid_idct_put_mmi(uint8_t *dest, int32_t line_size, int16_t *block); /* IDCT, then ff_put_pixels_clamped_mmi() into dest */
+void ff_xvid_idct_add_mmi(uint8_t *dest, int32_t line_size, int16_t *block); /* IDCT, then ff_add_pixels_clamped_mmi() into dest */
+
+#endif /* AVCODEC_MIPS_XVIDIDCT_MIPS_H */
diff --git a/libavcodec/mjpeg.h b/libavcodec/mjpeg.h
index 1ebe283..cd5d0af 100644
--- a/libavcodec/mjpeg.h
+++ b/libavcodec/mjpeg.h
@@ -8,20 +8,20 @@
  * aspecting, new decode_frame mechanism and apple mjpeg-b support
  *                                  by Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -117,6 +117,7 @@ enum JpegMarker {
 
 #define PREDICT(ret, topleft, top, left, predictor)\
     switch(predictor){\
+        case 0: ret= 0; break;\
         case 1: ret= left; break;\
         case 2: ret= top; break;\
         case 3: ret= topleft; break;\
diff --git a/libavcodec/mjpeg2jpeg_bsf.c b/libavcodec/mjpeg2jpeg_bsf.c
index eec3469..6f02bc0 100644
--- a/libavcodec/mjpeg2jpeg_bsf.c
+++ b/libavcodec/mjpeg2jpeg_bsf.c
@@ -2,20 +2,20 @@
  * MJPEG/AVI1 to JPEG/JFIF bitstream format filter
  * Copyright (c) 2010 Adrian Daerr and Nicolas George
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,10 +28,12 @@
 
 #include "libavutil/error.h"
 #include "libavutil/mem.h"
+#include "libavutil/intreadwrite.h"
 
 #include "avcodec.h"
 #include "bsf.h"
 #include "jpegtables.h"
+#include "mjpeg.h"
 
 static const uint8_t jpeg_header[] = {
     0xff, 0xd8,                     // SOI
@@ -84,19 +86,24 @@ static int mjpeg2jpeg_filter(AVBSFContext *ctx, AVPacket *out)
     uint8_t *output;
 
     ret = ff_bsf_get_packet(ctx, &in);
+    if (ret < 0)
+        return ret;
 
     if (in->size < 12) {
         av_log(ctx, AV_LOG_ERROR, "input is truncated\n");
         ret = AVERROR_INVALIDDATA;
         goto fail;
     }
-    if (memcmp("AVI1", in->data + 6, 4)) {
-        av_log(ctx, AV_LOG_ERROR, "input is not MJPEG/AVI1\n");
+    if (AV_RB16(in->data) != 0xffd8) {
+        av_log(ctx, AV_LOG_ERROR, "input is not MJPEG\n");
         ret = AVERROR_INVALIDDATA;
         goto fail;
     }
-
-    input_skip = (in->data[4] << 8) + in->data[5] + 4;
+    if (in->data[2] == 0xff && in->data[3] == APP0) {
+        input_skip = (in->data[4] << 8) + in->data[5] + 4;
+    } else {
+        input_skip = 2;
+    }
     if (in->size < input_skip) {
         av_log(ctx, AV_LOG_ERROR, "input is truncated\n");
         ret = AVERROR_INVALIDDATA;
@@ -125,7 +132,12 @@ fail:
     return ret;
 }
 
+static const enum AVCodecID codec_ids[] = { /* referenced by ff_mjpeg2jpeg_bsf.codec_ids */
+    AV_CODEC_ID_MJPEG, AV_CODEC_ID_NONE,    /* MJPEG only; AV_CODEC_ID_NONE presumably terminates the list */
+};
+
 const AVBitStreamFilter ff_mjpeg2jpeg_bsf = {
     .name           = "mjpeg2jpeg",
     .filter         = mjpeg2jpeg_filter,
+    .codec_ids      = codec_ids,
 };
diff --git a/libavcodec/mjpeg_parser.c b/libavcodec/mjpeg_parser.c
index ab65461..e548b00 100644
--- a/libavcodec/mjpeg_parser.c
+++ b/libavcodec/mjpeg_parser.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2003 Alex Beregszaszi
  * Copyright (c) 2003-2004 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,27 +28,44 @@
 
 #include "parser.h"
 
+typedef struct MJPEGParserContext{
+    ParseContext pc;    /* generic parser state; jpeg_parse() passes &pc to ff_combine_frame() */
+    int size;           /* bytes find_frame_end() still has to skip; set from the 16 bits after a marker, kept across calls */
+}MJPEGParserContext;
 
 /**
  * Find the end of the current frame in the bitstream.
  * @return the position of the first byte of the next frame, or -1
  */
-static int find_frame_end(ParseContext *pc, const uint8_t *buf, int buf_size){
+static int find_frame_end(MJPEGParserContext *m, const uint8_t *buf, int buf_size){
+    ParseContext *pc= &m->pc;
     int vop_found, i;
-    uint16_t state;
+    uint32_t state;
 
     vop_found= pc->frame_start_found;
     state= pc->state;
 
     i=0;
     if(!vop_found){
-        for(i=0; i<buf_size; i++){
+        for(i=0; i<buf_size;){
             state= (state<<8) | buf[i];
-            if(state == 0xFFD8){
-                i++;
-                vop_found=1;
-                break;
+            if(state>=0xFFC00000 && state<=0xFFFEFFFF){
+                if(state>=0xFFD80000 && state<=0xFFD8FFFF){
+                    i++;
+                    vop_found=1;
+                    break;
+                }else if(state<0xFFD00000 || state>0xFFD9FFFF){
+                    m->size= (state&0xFFFF)-1;
+                }
             }
+            if(m->size>0){
+                int size= FFMIN(buf_size-i, m->size);
+                i+=size;
+                m->size-=size;
+                state=0;
+                continue;
+            }else
+                i++;
         }
     }
 
@@ -56,13 +73,25 @@ static int find_frame_end(ParseContext *pc, const uint8_t *buf, int buf_size){
         /* EOF considered as end of frame */
         if (buf_size == 0)
             return 0;
-        for(; i<buf_size; i++){
+        for(; i<buf_size;){
             state= (state<<8) | buf[i];
-            if(state == 0xFFD8){
-                pc->frame_start_found=0;
-                pc->state=0;
-                return i-1;
+            if(state>=0xFFC00000 && state<=0xFFFEFFFF){
+                if(state>=0xFFD80000 && state<=0xFFD8FFFF){
+                    pc->frame_start_found=0;
+                    pc->state=0;
+                    return i-3;
+                } else if(state<0xFFD00000 || state>0xFFD9FFFF){
+                    m->size= (state&0xFFFF)-1;
+                }
             }
+            if(m->size>0){
+                int size= FFMIN(buf_size-i, m->size);
+                i+=size;
+                m->size-=size;
+                state=0;
+                continue;
+            }else
+                i++;
         }
     }
     pc->frame_start_found= vop_found;
@@ -75,13 +104,14 @@ static int jpeg_parse(AVCodecParserContext *s,
                       const uint8_t **poutbuf, int *poutbuf_size,
                       const uint8_t *buf, int buf_size)
 {
-    ParseContext *pc = s->priv_data;
+    MJPEGParserContext *m = s->priv_data;
+    ParseContext *pc = &m->pc;
     int next;
 
     if(s->flags & PARSER_FLAG_COMPLETE_FRAMES){
         next= buf_size;
     }else{
-        next= find_frame_end(pc, buf, buf_size);
+        next= find_frame_end(m, buf, buf_size);
 
         if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
             *poutbuf = NULL;
@@ -98,7 +128,7 @@ static int jpeg_parse(AVCodecParserContext *s,
 
 AVCodecParser ff_mjpeg_parser = {
     .codec_ids      = { AV_CODEC_ID_MJPEG },
-    .priv_data_size = sizeof(ParseContext),
+    .priv_data_size = sizeof(MJPEGParserContext),
     .parser_parse   = jpeg_parse,
     .parser_close   = ff_parse_close,
 };
diff --git a/libavcodec/mjpega_dump_header_bsf.c b/libavcodec/mjpega_dump_header_bsf.c
index b3ce26a..ca5fb3a 100644
--- a/libavcodec/mjpega_dump_header_bsf.c
+++ b/libavcodec/mjpega_dump_header_bsf.c
@@ -2,20 +2,20 @@
  * MJPEG A dump header bitstream filter
  * Copyright (c) 2006 Baptiste Coudurier
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mjpegbdec.c b/libavcodec/mjpegbdec.c
index 3775aa3..a858707 100644
--- a/libavcodec/mjpegbdec.c
+++ b/libavcodec/mjpegbdec.c
@@ -2,20 +2,20 @@
  * Apple MJPEG-B decoder
  * Copyright (c) 2002 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -55,6 +55,7 @@ static int mjpegb_decode_frame(AVCodecContext *avctx,
 
     buf_ptr = buf;
     buf_end = buf + buf_size;
+    s->got_picture = 0;
 
 read_header:
     /* reset on every SOI */
@@ -122,7 +123,7 @@ read_header:
                       8 * FFMIN(field_size, buf_end - buf_ptr - sos_offs));
         s->mjpb_skiptosod = (sod_offs - sos_offs - show_bits(&s->gb, 16));
         s->start_code = SOS;
-        if (ff_mjpeg_decode_sos(s, NULL, NULL) < 0 &&
+        if (ff_mjpeg_decode_sos(s, NULL, 0, NULL) < 0 &&
             (avctx->err_recognition & AV_EF_EXPLODE))
           return AVERROR_INVALIDDATA;
     }
@@ -133,13 +134,17 @@ read_header:
         if (s->bottom_field != s->interlace_polarity && second_field_offs)
         {
             buf_ptr = buf + second_field_offs;
-            second_field_offs = 0;
             goto read_header;
             }
     }
 
     //XXX FIXME factorize, this looks very similar to the EOI code
 
+    if(!s->got_picture) {
+        av_log(avctx, AV_LOG_WARNING, "no picture\n");
+        return buf_size;
+    }
+
     if ((ret = av_frame_ref(data, s->picture_ptr)) < 0)
         return ret;
     *got_frame = 1;
@@ -162,5 +167,6 @@ AVCodec ff_mjpegb_decoder = {
     .close          = ff_mjpeg_decode_end,
     .decode         = mjpegb_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
 };
diff --git a/libavcodec/mjpegdec.c b/libavcodec/mjpegdec.c
index 5625929..d773329 100644
--- a/libavcodec/mjpegdec.c
+++ b/libavcodec/mjpegdec.c
@@ -8,20 +8,20 @@
  * aspecting, new decode_frame mechanism and apple mjpeg-b support
  *                                  by Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,12 +30,12 @@
  * MJPEG decoder.
  */
 
-#include <assert.h>
-
 #include "libavutil/imgutils.h"
+#include "libavutil/avassert.h"
 #include "libavutil/opt.h"
 #include "avcodec.h"
 #include "blockdsp.h"
+#include "copy_block.h"
 #include "idctdsp.h"
 #include "internal.h"
 #include "jpegtables.h"
@@ -43,6 +43,9 @@
 #include "mjpegdec.h"
 #include "jpeglsdec.h"
 #include "put_bits.h"
+#include "tiff.h"
+#include "exif.h"
+#include "bytestream.h"
 
 
 static int build_vlc(VLC *vlc, const uint8_t *bits_table,
@@ -54,7 +57,7 @@ static int build_vlc(VLC *vlc, const uint8_t *bits_table,
     uint16_t huff_sym[256];
     int i;
 
-    assert(nb_codes <= 256);
+    av_assert0(nb_codes <= 256);
 
     ff_mjpeg_build_huffman_codes(huff_size, huff_code, bits_table, val_table);
 
@@ -84,6 +87,26 @@ static void build_basic_mjpeg_vlc(MJpegDecodeContext *s)
               avpriv_mjpeg_val_ac_chrominance, 251, 0, 0);
 }
 
+static void parse_avid(MJpegDecodeContext *s, uint8_t *buf, int len)
+{
+    s->buggy_avid = 1;
+    if (len > 14 && buf[12] == 1) /* 1 - NTSC */
+        s->interlace_polarity = 1;
+    if (len > 14 && buf[12] == 2) /* 2 - PAL */
+        s->interlace_polarity = 0;
+    if (s->avctx->debug & FF_DEBUG_PICT_INFO)
+        av_log(s->avctx, AV_LOG_INFO, "AVID: len:%d %d\n", len, len > 14 ? buf[12] : -1);
+}
+
+static void init_idct(AVCodecContext *avctx)
+{
+    MJpegDecodeContext *s = avctx->priv_data;
+
+    ff_idctdsp_init(&s->idsp, avctx);
+    ff_init_scantable(s->idsp.idct_permutation, &s->scantable,
+                      ff_zigzag_direct);
+}
+
 av_cold int ff_mjpeg_decode_init(AVCodecContext *avctx)
 {
     MJpegDecodeContext *s = avctx->priv_data;
@@ -98,13 +121,12 @@ av_cold int ff_mjpeg_decode_init(AVCodecContext *avctx)
     s->avctx = avctx;
     ff_blockdsp_init(&s->bdsp, avctx);
     ff_hpeldsp_init(&s->hdsp, avctx->flags);
-    ff_idctdsp_init(&s->idsp, avctx);
-    ff_init_scantable(s->idsp.idct_permutation, &s->scantable,
-                      ff_zigzag_direct);
+    init_idct(avctx);
     s->buffer_size   = 0;
     s->buffer        = NULL;
     s->start_code    = -1;
     s->first_picture = 1;
+    s->got_picture   = 0;
     s->org_height    = avctx->coded_height;
     avctx->chroma_sample_location = AVCHROMA_LOC_CENTER;
     avctx->colorspace = AVCOL_SPC_BT470BG;
@@ -112,19 +134,28 @@ av_cold int ff_mjpeg_decode_init(AVCodecContext *avctx)
     build_basic_mjpeg_vlc(s);
 
     if (s->extern_huff) {
-        int ret;
-        av_log(avctx, AV_LOG_INFO, "mjpeg: using external huffman table\n");
+        av_log(avctx, AV_LOG_INFO, "using external huffman table\n");
         init_get_bits(&s->gb, avctx->extradata, avctx->extradata_size * 8);
-        if ((ret = ff_mjpeg_decode_dht(s))) {
+        if (ff_mjpeg_decode_dht(s)) {
             av_log(avctx, AV_LOG_ERROR,
-                   "mjpeg: error using external huffman table\n");
-            return ret;
+                   "error using external huffman table, switching back to internal\n");
+            build_basic_mjpeg_vlc(s);
         }
     }
     if (avctx->field_order == AV_FIELD_BB) { /* quicktime icefloe 019 */
         s->interlace_polarity = 1;           /* bottom field first */
-        av_log(avctx, AV_LOG_DEBUG, "mjpeg bottom field first\n");
+        av_log(avctx, AV_LOG_DEBUG, "bottom field first\n");
+    } else if (avctx->field_order == AV_FIELD_UNKNOWN) {
+        if (avctx->codec_tag == AV_RL32("MJPG"))
+            s->interlace_polarity = 1;
     }
+
+    if (   avctx->extradata_size > 8
+        && AV_RL32(avctx->extradata) == 0x2C
+        && AV_RL32(avctx->extradata+4) == 0x18) {
+        parse_avid(s, avctx->extradata, avctx->extradata_size);
+    }
+
     if (avctx->codec->id == AV_CODEC_ID_AMV)
         s->flipped = 1;
 
@@ -135,15 +166,20 @@ av_cold int ff_mjpeg_decode_init(AVCodecContext *avctx)
 /* quantize tables */
 int ff_mjpeg_decode_dqt(MJpegDecodeContext *s)
 {
-    int len, index, i, j;
+    int len, index, i;
 
     len = get_bits(&s->gb, 16) - 2;
 
+    if (8*len > get_bits_left(&s->gb)) {
+        av_log(s->avctx, AV_LOG_ERROR, "dqt: len %d is too large\n", len);
+        return AVERROR_INVALIDDATA;
+    }
+
     while (len >= 65) {
-        /* only 8-bit precision handled */
-        if (get_bits(&s->gb, 4) != 0) {
-            av_log(s->avctx, AV_LOG_ERROR, "dqt: 16-bit precision\n");
-            return -1;
+        int pr = get_bits(&s->gb, 4);
+        if (pr > 1) {
+            av_log(s->avctx, AV_LOG_ERROR, "dqt: invalid precision\n");
+            return AVERROR_INVALIDDATA;
         }
         index = get_bits(&s->gb, 4);
         if (index >= 4)
@@ -151,16 +187,15 @@ int ff_mjpeg_decode_dqt(MJpegDecodeContext *s)
         av_log(s->avctx, AV_LOG_DEBUG, "index=%d\n", index);
         /* read quant table */
         for (i = 0; i < 64; i++) {
-            j = s->scantable.permutated[i];
-            s->quant_matrixes[index][j] = get_bits(&s->gb, 8);
+            s->quant_matrixes[index][i] = get_bits(&s->gb, pr ? 16 : 8);
         }
 
         // XXX FIXME fine-tune, and perhaps add dc too
-        s->qscale[index] = FFMAX(s->quant_matrixes[index][s->scantable.permutated[1]],
-                                 s->quant_matrixes[index][s->scantable.permutated[8]]) >> 1;
+        s->qscale[index] = FFMAX(s->quant_matrixes[index][1],
+                                 s->quant_matrixes[index][8]) >> 1;
         av_log(s->avctx, AV_LOG_DEBUG, "qscale[%d]: %d\n",
                index, s->qscale[index]);
-        len -= 65;
+        len -= 1 + 64 * (1+pr);
     }
     return 0;
 }
@@ -175,6 +210,11 @@ int ff_mjpeg_decode_dht(MJpegDecodeContext *s)
 
     len = get_bits(&s->gb, 16) - 2;
 
+    if (8*len > get_bits_left(&s->gb)) {
+        av_log(s->avctx, AV_LOG_ERROR, "dht: len %d is too large\n", len);
+        return AVERROR_INVALIDDATA;
+    }
+
     while (len > 0) {
         if (len < 17)
             return AVERROR_INVALIDDATA;
@@ -222,21 +262,36 @@ int ff_mjpeg_decode_dht(MJpegDecodeContext *s)
 
 int ff_mjpeg_decode_sof(MJpegDecodeContext *s)
 {
+    int len, nb_components, i, width, height, bits, ret;
+    unsigned pix_fmt_id;
     int h_count[MAX_COMPONENTS] = { 0 };
     int v_count[MAX_COMPONENTS] = { 0 };
-    int len, nb_components, i, width, height, bits, pix_fmt_id, ret;
+
+    s->cur_scan = 0;
+    memset(s->upscale_h, 0, sizeof(s->upscale_h));
+    memset(s->upscale_v, 0, sizeof(s->upscale_v));
 
     /* XXX: verify len field validity */
     len     = get_bits(&s->gb, 16);
     bits    = get_bits(&s->gb, 8);
 
+    if (bits > 16 || bits < 1) {
+        av_log(s->avctx, AV_LOG_ERROR, "bits %d is invalid\n", bits);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (s->avctx->bits_per_raw_sample != bits) {
+        av_log(s->avctx, s->avctx->bits_per_raw_sample > 0 ? AV_LOG_INFO : AV_LOG_DEBUG, "Changing bps from %d to %d\n", s->avctx->bits_per_raw_sample, bits);
+        s->avctx->bits_per_raw_sample = bits;
+        init_idct(s->avctx);
+    }
     if (s->pegasus_rct)
         bits = 9;
     if (bits == 9 && !s->pegasus_rct)
         s->rct  = 1;    // FIXME ugly
 
-    if (bits != 8 && !s->lossless) {
-        av_log(s->avctx, AV_LOG_ERROR, "only 8 bits/component accepted\n");
+    if(s->lossless && s->avctx->lowres){
+        av_log(s->avctx, AV_LOG_ERROR, "lowres is not possible with lossless jpeg\n");
         return -1;
     }
 
@@ -282,8 +337,10 @@ int ff_mjpeg_decode_sof(MJpegDecodeContext *s)
         if (v_count[i] > s->v_max)
             s->v_max = v_count[i];
         s->quant_index[i] = get_bits(&s->gb, 8);
-        if (s->quant_index[i] >= 4)
+        if (s->quant_index[i] >= 4) {
+            av_log(s->avctx, AV_LOG_ERROR, "quant_index is invalid\n");
             return AVERROR_INVALIDDATA;
+        }
         if (!h_count[i] || !v_count[i]) {
             av_log(s->avctx, AV_LOG_ERROR,
                    "Invalid sampling factor in component %d %d:%d\n",
@@ -295,28 +352,35 @@ int ff_mjpeg_decode_sof(MJpegDecodeContext *s)
                i, h_count[i], v_count[i],
                s->component_id[i], s->quant_index[i]);
     }
+    if (   nb_components == 4
+        && s->component_id[0] == 'C' - 1
+        && s->component_id[1] == 'M' - 1
+        && s->component_id[2] == 'Y' - 1
+        && s->component_id[3] == 'K' - 1)
+        s->adobe_transform = 0;
 
     if (s->ls && (s->h_max > 1 || s->v_max > 1)) {
         avpriv_report_missing_feature(s->avctx, "Subsampling in JPEG-LS");
         return AVERROR_PATCHWELCOME;
     }
 
-    if (s->v_max == 1 && s->h_max == 1 && s->lossless == 1)
-        s->rgb = 1;
 
     /* if different size, realloc/alloc picture */
     if (width != s->width || height != s->height || bits != s->bits ||
         memcmp(s->h_count, h_count, sizeof(h_count))                ||
         memcmp(s->v_count, v_count, sizeof(v_count))) {
+
         s->width      = width;
         s->height     = height;
         s->bits       = bits;
         memcpy(s->h_count, h_count, sizeof(h_count));
         memcpy(s->v_count, v_count, sizeof(v_count));
         s->interlaced = 0;
+        s->got_picture = 0;
 
         /* test interlaced mode */
         if (s->first_picture   &&
+            (s->multiscope != 2 || s->avctx->time_base.den >= 25 * s->avctx->time_base.num) &&
             s->org_height != 0 &&
             s->height < ((s->org_height * 3) / 4)) {
             s->interlaced                    = 1;
@@ -333,9 +397,18 @@ int ff_mjpeg_decode_sof(MJpegDecodeContext *s)
         s->first_picture = 0;
     }
 
-    if (!(s->interlaced && (s->bottom_field == !s->interlace_polarity))) {
+    if (s->got_picture && s->interlaced && (s->bottom_field == !s->interlace_polarity)) {
+        if (s->progressive) {
+            avpriv_request_sample(s->avctx, "progressively coded interlaced picture");
+            return AVERROR_INVALIDDATA;
+        }
+    } else{
+        if (s->v_max == 1 && s->h_max == 1 && s->lossless==1 && (nb_components==3 || nb_components==4))
+            s->rgb = 1;
+        else if (!s->lossless)
+            s->rgb = 0;
     /* XXX: not complete test ! */
-    pix_fmt_id = (s->h_count[0] << 28) | (s->v_count[0] << 24) |
+    pix_fmt_id = ((unsigned)s->h_count[0] << 28) | (s->v_count[0] << 24) |
                  (s->h_count[1] << 20) | (s->v_count[1] << 16) |
                  (s->h_count[2] << 12) | (s->v_count[2] <<  8) |
                  (s->h_count[3] <<  4) |  s->v_count[3];
@@ -347,38 +420,187 @@ int ff_mjpeg_decode_sof(MJpegDecodeContext *s)
     if (!(pix_fmt_id & 0x0D0D0D0D))
         pix_fmt_id -= (pix_fmt_id & 0x0F0F0F0F) >> 1;
 
+    for (i = 0; i < 8; i++) {
+        int j = 6 + (i&1) - (i&6);
+        int is = (pix_fmt_id >> (4*i)) & 0xF;
+        int js = (pix_fmt_id >> (4*j)) & 0xF;
+
+        if (is == 1 && js != 2 && (i < 2 || i > 5))
+            js = (pix_fmt_id >> ( 8 + 4*(i&1))) & 0xF;
+        if (is == 1 && js != 2 && (i < 2 || i > 5))
+            js = (pix_fmt_id >> (16 + 4*(i&1))) & 0xF;
+
+        if (is == 1 && js == 2) {
+            if (i & 1) s->upscale_h[j/2] = 1;
+            else       s->upscale_v[j/2] = 1;
+        }
+    }
+
     switch (pix_fmt_id) {
     case 0x11111100:
         if (s->rgb)
-            s->avctx->pix_fmt = AV_PIX_FMT_BGRA;
+            s->avctx->pix_fmt = s->bits <= 9 ? AV_PIX_FMT_BGR24 : AV_PIX_FMT_BGR48;
+        else {
+            if (s->component_id[0] == 'Q' && s->component_id[1] == 'F' && s->component_id[2] == 'A') {
+                s->avctx->pix_fmt = s->bits <= 8 ? AV_PIX_FMT_GBRP : AV_PIX_FMT_GBRP16;
+            } else {
+                if (s->bits <= 8) s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV444P : AV_PIX_FMT_YUVJ444P;
+                else              s->avctx->pix_fmt = AV_PIX_FMT_YUV444P16;
+            s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
+            }
+        }
+        av_assert0(s->nb_components == 3);
+        break;
+    case 0x11111111:
+        if (s->rgb)
+            s->avctx->pix_fmt = s->bits <= 9 ? AV_PIX_FMT_ABGR : AV_PIX_FMT_RGBA64;
         else {
-            s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV444P : AV_PIX_FMT_YUVJ444P;
+            if (s->adobe_transform == 0 && s->bits <= 8) {
+                s->avctx->pix_fmt = AV_PIX_FMT_GBRAP;
+            } else {
+                s->avctx->pix_fmt = s->bits <= 8 ? AV_PIX_FMT_YUVA444P : AV_PIX_FMT_YUVA444P16;
+                s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
+            }
+        }
+        av_assert0(s->nb_components == 4);
+        break;
+    case 0x22111122:
+    case 0x22111111:
+        if (s->adobe_transform == 0 && s->bits <= 8) {
+            s->avctx->pix_fmt = AV_PIX_FMT_GBRAP;
+            s->upscale_v[1] = s->upscale_v[2] = 1;
+            s->upscale_h[1] = s->upscale_h[2] = 1;
+        } else if (s->adobe_transform == 2 && s->bits <= 8) {
+            s->avctx->pix_fmt = AV_PIX_FMT_YUVA444P;
+            s->upscale_v[1] = s->upscale_v[2] = 1;
+            s->upscale_h[1] = s->upscale_h[2] = 1;
+            s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
+        } else {
+            if (s->bits <= 8) s->avctx->pix_fmt = AV_PIX_FMT_YUVA420P;
+            else              s->avctx->pix_fmt = AV_PIX_FMT_YUVA420P16;
             s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
         }
-        assert(s->nb_components == 3);
+        av_assert0(s->nb_components == 4);
+        break;
+    case 0x12121100:
+    case 0x22122100:
+    case 0x21211100:
+    case 0x22211200:
+        if (s->bits <= 8) s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV444P : AV_PIX_FMT_YUVJ444P;
+        else
+            goto unk_pixfmt;
+        s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
+        break;
+    case 0x22221100:
+    case 0x22112200:
+    case 0x11222200:
+        if (s->bits <= 8) s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV444P : AV_PIX_FMT_YUVJ444P;
+        else
+            goto unk_pixfmt;
+        s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
         break;
     case 0x11000000:
-        s->avctx->pix_fmt = AV_PIX_FMT_GRAY8;
+    case 0x13000000:
+    case 0x14000000:
+    case 0x31000000:
+    case 0x33000000:
+    case 0x34000000:
+    case 0x41000000:
+    case 0x43000000:
+    case 0x44000000:
+        if(s->bits <= 8)
+            s->avctx->pix_fmt = AV_PIX_FMT_GRAY8;
+        else
+            s->avctx->pix_fmt = AV_PIX_FMT_GRAY16;
         break;
     case 0x12111100:
-        s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV440P : AV_PIX_FMT_YUVJ440P;
-        s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
+    case 0x14121200:
+    case 0x14111100:
+    case 0x22211100:
+    case 0x22112100:
+        if (s->component_id[0] == 'Q' && s->component_id[1] == 'F' && s->component_id[2] == 'A') {
+            if (s->bits <= 8) s->avctx->pix_fmt = AV_PIX_FMT_GBRP;
+            else
+                goto unk_pixfmt;
+            s->upscale_v[0] = s->upscale_v[1] = 1;
+        } else {
+            if (pix_fmt_id == 0x14111100)
+                s->upscale_v[1] = s->upscale_v[2] = 1;
+            if (s->bits <= 8) s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV440P : AV_PIX_FMT_YUVJ440P;
+            else
+                goto unk_pixfmt;
+            s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
+        }
         break;
     case 0x21111100:
-        s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV422P : AV_PIX_FMT_YUVJ422P;
+        if (s->component_id[0] == 'Q' && s->component_id[1] == 'F' && s->component_id[2] == 'A') {
+            if (s->bits <= 8) s->avctx->pix_fmt = AV_PIX_FMT_GBRP;
+            else
+                goto unk_pixfmt;
+            s->upscale_h[0] = s->upscale_h[1] = 1;
+        } else {
+            if (s->bits <= 8) s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV422P : AV_PIX_FMT_YUVJ422P;
+            else              s->avctx->pix_fmt = AV_PIX_FMT_YUV422P16;
+            s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
+        }
+        break;
+    case 0x31111100:
+        if (s->bits > 8)
+            goto unk_pixfmt;
+        s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV444P : AV_PIX_FMT_YUVJ444P;
+        s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
+        s->upscale_h[1] = s->upscale_h[2] = 2;
+        break;
+    case 0x22121100:
+    case 0x22111200:
+        if (s->bits <= 8) s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV422P : AV_PIX_FMT_YUVJ422P;
+        else
+            goto unk_pixfmt;
         s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
         break;
     case 0x22111100:
-        s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV420P : AV_PIX_FMT_YUVJ420P;
+    case 0x42111100:
+    case 0x24111100:
+        if (s->bits <= 8) s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV420P : AV_PIX_FMT_YUVJ420P;
+        else              s->avctx->pix_fmt = AV_PIX_FMT_YUV420P16;
+        s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
+        if (pix_fmt_id == 0x42111100) {
+            if (s->bits > 8)
+                goto unk_pixfmt;
+            s->upscale_h[1] = s->upscale_h[2] = 1;
+        } else if (pix_fmt_id == 0x24111100) {
+            if (s->bits > 8)
+                goto unk_pixfmt;
+            s->upscale_v[1] = s->upscale_v[2] = 1;
+        }
+        break;
+    case 0x41111100:
+        if (s->bits <= 8) s->avctx->pix_fmt = s->cs_itu601 ? AV_PIX_FMT_YUV411P : AV_PIX_FMT_YUVJ411P;
+        else
+            goto unk_pixfmt;
         s->avctx->color_range = s->cs_itu601 ? AVCOL_RANGE_MPEG : AVCOL_RANGE_JPEG;
         break;
     default:
-        av_log(s->avctx, AV_LOG_ERROR, "Unhandled pixel format 0x%x\n", pix_fmt_id);
+unk_pixfmt:
+        av_log(s->avctx, AV_LOG_ERROR, "Unhandled pixel format 0x%x bits:%d\n", pix_fmt_id, s->bits);
+        memset(s->upscale_h, 0, sizeof(s->upscale_h));
+        memset(s->upscale_v, 0, sizeof(s->upscale_v));
+        return AVERROR_PATCHWELCOME;
+    }
+    if ((AV_RB32(s->upscale_h) || AV_RB32(s->upscale_v)) && s->avctx->lowres) {
+        av_log(s->avctx, AV_LOG_ERROR, "lowres not supported for weird subsampling\n");
         return AVERROR_PATCHWELCOME;
     }
     if (s->ls) {
-        if (s->nb_components > 1)
+        memset(s->upscale_h, 0, sizeof(s->upscale_h));
+        memset(s->upscale_v, 0, sizeof(s->upscale_v));
+        if (s->nb_components == 3) {
             s->avctx->pix_fmt = AV_PIX_FMT_RGB24;
+        } else if (s->nb_components != 1) {
+            av_log(s->avctx, AV_LOG_ERROR, "Unsupported number of components %d\n", s->nb_components);
+            return AVERROR_PATCHWELCOME;
+        } else if (s->palette_index && s->bits <= 8)
+            s->avctx->pix_fmt = AV_PIX_FMT_PAL8;
         else if (s->bits <= 8)
             s->avctx->pix_fmt = AV_PIX_FMT_GRAY8;
         else
@@ -391,16 +613,21 @@ int ff_mjpeg_decode_sof(MJpegDecodeContext *s)
         return AVERROR_BUG;
     }
 
+    if (s->avctx->skip_frame == AVDISCARD_ALL) {
+        s->picture_ptr->pict_type = AV_PICTURE_TYPE_I;
+        s->picture_ptr->key_frame = 1;
+        s->got_picture            = 1;
+        return 0;
+    }
+
     av_frame_unref(s->picture_ptr);
-    if (ff_get_buffer(s->avctx, s->picture_ptr, AV_GET_BUFFER_FLAG_REF) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if (ff_get_buffer(s->avctx, s->picture_ptr, AV_GET_BUFFER_FLAG_REF) < 0)
         return -1;
-    }
     s->picture_ptr->pict_type = AV_PICTURE_TYPE_I;
     s->picture_ptr->key_frame = 1;
     s->got_picture            = 1;
 
-    for (i = 0; i < 3; i++)
+    for (i = 0; i < 4; i++)
         s->linesize[i] = s->picture_ptr->linesize[i] << s->interlaced;
 
     ff_dlog(s->avctx, "%d %d %d %d %d %d\n",
@@ -411,6 +638,12 @@ int ff_mjpeg_decode_sof(MJpegDecodeContext *s)
         av_log(s->avctx, AV_LOG_DEBUG, "decode_sof0: error, len(%d) mismatch\n", len);
     }
 
+    if ((s->rgb && !s->lossless && !s->ls) ||
+        (!s->rgb && s->ls && s->nb_components > 1)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Unsupported coding and pixel format combination\n");
+        return AVERROR_PATCHWELCOME;
+    }
+
     /* totally blank picture as progressive JPEG will only add details to it */
     if (s->progressive) {
         int bw = (width  + s->h_max * 8 - 1) / (s->h_max * 8);
@@ -419,8 +652,10 @@ int ff_mjpeg_decode_sof(MJpegDecodeContext *s)
             int size = bw * bh * s->h_count[i] * s->v_count[i];
             av_freep(&s->blocks[i]);
             av_freep(&s->last_nnz[i]);
-            s->blocks[i]       = av_malloc(size * sizeof(**s->blocks));
-            s->last_nnz[i]     = av_mallocz(size * sizeof(**s->last_nnz));
+            s->blocks[i]       = av_mallocz_array(size, sizeof(**s->blocks));
+            s->last_nnz[i]     = av_mallocz_array(size, sizeof(**s->last_nnz));
+            if (!s->blocks[i] || !s->last_nnz[i])
+                return AVERROR(ENOMEM);
             s->block_stride[i] = bw * s->h_count[i];
         }
         memset(s->coefs_finished, 0, sizeof(s->coefs_finished));
@@ -432,11 +667,11 @@ static inline int mjpeg_decode_dc(MJpegDecodeContext *s, int dc_index)
 {
     int code;
     code = get_vlc2(&s->gb, s->vlcs[0][dc_index].table, 9, 2);
-    if (code < 0) {
+    if (code < 0 || code > 16) {
         av_log(s->avctx, AV_LOG_WARNING,
                "mjpeg_decode_dc: bad vlc: %d:%d (%p)\n",
                0, dc_index, &s->vlcs[0][dc_index]);
-        return 0xffff;
+        return 0xfffff;
     }
 
     if (code)
@@ -453,11 +688,12 @@ static int decode_block(MJpegDecodeContext *s, int16_t *block, int component,
 
     /* DC coef */
     val = mjpeg_decode_dc(s, dc_index);
-    if (val == 0xffff) {
+    if (val == 0xfffff) {
         av_log(s->avctx, AV_LOG_ERROR, "error dc\n");
         return AVERROR_INVALIDDATA;
     }
     val = val * quant_matrix[0] + s->last_dc[component];
+    val = FFMIN(val, 32767);
     s->last_dc[component] = val;
     block[0] = val;
     /* AC coefs */
@@ -486,7 +722,7 @@ static int decode_block(MJpegDecodeContext *s, int16_t *block, int component,
                 return AVERROR_INVALIDDATA;
             }
             j        = s->scantable.permutated[i];
-            block[j] = level * quant_matrix[j];
+            block[j] = level * quant_matrix[i];
         }
     } while (i < 63);
     CLOSE_READER(re, &s->gb);}
@@ -501,11 +737,11 @@ static int decode_dc_progressive(MJpegDecodeContext *s, int16_t *block,
     int val;
     s->bdsp.clear_block(block);
     val = mjpeg_decode_dc(s, dc_index);
-    if (val == 0xffff) {
+    if (val == 0xfffff) {
         av_log(s->avctx, AV_LOG_ERROR, "error dc\n");
         return AVERROR_INVALIDDATA;
     }
-    val = (val * quant_matrix[0] << Al) + s->last_dc[component];
+    val = (val * (quant_matrix[0] << Al)) + s->last_dc[component];
     s->last_dc[component] = val;
     block[0] = val;
     return 0;
@@ -548,14 +784,14 @@ static int decode_block_progressive(MJpegDecodeContext *s, int16_t *block,
                 if (i >= se) {
                     if (i == se) {
                         j = s->scantable.permutated[se];
-                        block[j] = level * quant_matrix[j] << Al;
+                        block[j] = level * (quant_matrix[se] << Al);
                         break;
                     }
                     av_log(s->avctx, AV_LOG_ERROR, "error count: %d\n", i);
                     return AVERROR_INVALIDDATA;
                 }
                 j = s->scantable.permutated[i];
-                block[j] = level * quant_matrix[j] << Al;
+                block[j] = level * (quant_matrix[i] << Al);
             } else {
                 if (run == 0xF) {// ZRL - skip 15 coefficients
                     i += 15;
@@ -588,7 +824,7 @@ static int decode_block_progressive(MJpegDecodeContext *s, int16_t *block,
     UPDATE_CACHE(re, &s->gb);                                       \
     sign = block[j] >> 15;                                          \
     block[j] += SHOW_UBITS(re, &s->gb, 1) *                         \
-                ((quant_matrix[j] ^ sign) - sign) << Al;            \
+                ((quant_matrix[i] ^ sign) - sign) << Al;            \
     LAST_SKIP_BITS(re, &s->gb, 1);                                  \
 }
 
@@ -634,7 +870,7 @@ static int decode_block_refinement(MJpegDecodeContext *s, int16_t *block,
                 ZERO_RUN;
                 j = s->scantable.permutated[i];
                 val--;
-                block[j] = ((quant_matrix[j]^val) - val) << Al;
+                block[j] = ((quant_matrix[i] << Al) ^ val) - val;
                 if (i == se) {
                     if (i > *last_nnz)
                         *last_nnz = i;
@@ -675,46 +911,101 @@ static int decode_block_refinement(MJpegDecodeContext *s, int16_t *block,
 #undef REFINE_BIT
 #undef ZERO_RUN
 
-static int ljpeg_decode_rgb_scan(MJpegDecodeContext *s, int predictor,
-                                 int point_transform)
+static int handle_rstn(MJpegDecodeContext *s, int nb_components)
+{
+    int i;
+    int reset = 0;
+
+    if (s->restart_interval) {
+        s->restart_count--;
+        if(s->restart_count == 0 && s->avctx->codec_id == AV_CODEC_ID_THP){
+            align_get_bits(&s->gb);
+            for (i = 0; i < nb_components; i++) /* reset dc */
+                s->last_dc[i] = (4 << s->bits);
+        }
+
+        i = 8 + ((-get_bits_count(&s->gb)) & 7);
+        /* skip RSTn */
+        if (s->restart_count == 0) {
+            if(   show_bits(&s->gb, i) == (1 << i) - 1
+               || show_bits(&s->gb, i) == 0xFF) {
+                int pos = get_bits_count(&s->gb);
+                align_get_bits(&s->gb);
+                while (get_bits_left(&s->gb) >= 8 && show_bits(&s->gb, 8) == 0xFF)
+                    skip_bits(&s->gb, 8);
+                if (get_bits_left(&s->gb) >= 8 && (get_bits(&s->gb, 8) & 0xF8) == 0xD0) {
+                    for (i = 0; i < nb_components; i++) /* reset dc */
+                        s->last_dc[i] = (4 << s->bits);
+                    reset = 1;
+                } else
+                    skip_bits_long(&s->gb, pos - get_bits_count(&s->gb));
+            }
+        }
+    }
+    return reset;
+}
+
+static int ljpeg_decode_rgb_scan(MJpegDecodeContext *s, int nb_components, int predictor, int point_transform)
 {
     int i, mb_x, mb_y;
     uint16_t (*buffer)[4];
-    int left[3], top[3], topleft[3];
+    int left[4], top[4], topleft[4];
     const int linesize = s->linesize[0];
-    const int mask     = (1 << s->bits) - 1;
+    const int mask     = ((1 << s->bits) - 1) << point_transform;
+    int resync_mb_y = 0;
+    int resync_mb_x = 0;
+
+    if (s->nb_components != 3 && s->nb_components != 4)
+        return AVERROR_INVALIDDATA;
+    if (s->v_max != 1 || s->h_max != 1 || !s->lossless)
+        return AVERROR_INVALIDDATA;
+
+
+    s->restart_count = s->restart_interval;
 
     av_fast_malloc(&s->ljpeg_buffer, &s->ljpeg_buffer_size,
                    (unsigned)s->mb_width * 4 * sizeof(s->ljpeg_buffer[0][0]));
     buffer = s->ljpeg_buffer;
 
-    for (i = 0; i < 3; i++)
-        buffer[0][i] = 1 << (s->bits + point_transform - 1);
+    for (i = 0; i < 4; i++)
+        buffer[0][i] = 1 << (s->bits - 1);
 
     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
-        const int modified_predictor = mb_y ? predictor : 1;
         uint8_t *ptr = s->picture_ptr->data[0] + (linesize * mb_y);
 
         if (s->interlaced && s->bottom_field)
             ptr += linesize >> 1;
 
-        for (i = 0; i < 3; i++)
+        for (i = 0; i < 4; i++)
             top[i] = left[i] = topleft[i] = buffer[0][i];
 
         for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
-            if (s->restart_interval && !s->restart_count)
+            int modified_predictor = predictor;
+
+            if (s->restart_interval && !s->restart_count){
                 s->restart_count = s->restart_interval;
+                resync_mb_x = mb_x;
+                resync_mb_y = mb_y;
+                for(i=0; i<4; i++)
+                    top[i] = left[i]= topleft[i]= 1 << (s->bits - 1);
+            }
+            if (mb_y == resync_mb_y || mb_y == resync_mb_y+1 && mb_x < resync_mb_x || !mb_x)
+                modified_predictor = 1;
 
-            for (i = 0; i < 3; i++) {
-                int pred;
+            for (i=0;i<nb_components;i++) {
+                int pred, dc;
 
                 topleft[i] = top[i];
                 top[i]     = buffer[mb_x][i];
 
                 PREDICT(pred, topleft[i], top[i], left[i], modified_predictor);
 
+                dc = mjpeg_decode_dc(s, s->dc_index[i]);
+                if(dc == 0xFFFFF)
+                    return -1;
+
                 left[i] = buffer[mb_x][i] =
-                    mask & (pred + (mjpeg_decode_dc(s, s->dc_index[i]) << point_transform));
+                    mask & (pred + (dc * (1 << point_transform)));
             }
 
             if (s->restart_interval && !--s->restart_count) {
@@ -722,24 +1013,54 @@ static int ljpeg_decode_rgb_scan(MJpegDecodeContext *s, int predictor,
                 skip_bits(&s->gb, 16); /* skip RSTn */
             }
         }
-
-        if (s->rct) {
+        if (s->rct && s->nb_components == 4) {
             for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
-                ptr[4 * mb_x + 1] = buffer[mb_x][0] - ((buffer[mb_x][1] + buffer[mb_x][2] - 0x200) >> 2);
-                ptr[4 * mb_x + 0] = buffer[mb_x][1] + ptr[4 * mb_x + 1];
-                ptr[4 * mb_x + 2] = buffer[mb_x][2] + ptr[4 * mb_x + 1];
+                ptr[4*mb_x + 2] = buffer[mb_x][0] - ((buffer[mb_x][1] + buffer[mb_x][2] - 0x200) >> 2);
+                ptr[4*mb_x + 1] = buffer[mb_x][1] + ptr[4*mb_x + 2];
+                ptr[4*mb_x + 3] = buffer[mb_x][2] + ptr[4*mb_x + 2];
+                ptr[4*mb_x + 0] = buffer[mb_x][3];
+            }
+        } else if (s->nb_components == 4) {
+            for(i=0; i<nb_components; i++) {
+                int c= s->comp_index[i];
+                if (s->bits <= 8) {
+                    for(mb_x = 0; mb_x < s->mb_width; mb_x++) {
+                        ptr[4*mb_x+3-c] = buffer[mb_x][i];
+                    }
+                } else if(s->bits == 9) {
+                    return AVERROR_PATCHWELCOME;
+                } else {
+                    for(mb_x = 0; mb_x < s->mb_width; mb_x++) {
+                        ((uint16_t*)ptr)[4*mb_x+c] = buffer[mb_x][i];
+                    }
+                }
+            }
+        } else if (s->rct) {
+            for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
+                ptr[3*mb_x + 1] = buffer[mb_x][0] - ((buffer[mb_x][1] + buffer[mb_x][2] - 0x200) >> 2);
+                ptr[3*mb_x + 0] = buffer[mb_x][1] + ptr[3*mb_x + 1];
+                ptr[3*mb_x + 2] = buffer[mb_x][2] + ptr[3*mb_x + 1];
             }
         } else if (s->pegasus_rct) {
             for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
-                ptr[4 * mb_x + 1] = buffer[mb_x][0] - ((buffer[mb_x][1] + buffer[mb_x][2]) >> 2);
-                ptr[4 * mb_x + 0] = buffer[mb_x][1] + ptr[4 * mb_x + 1];
-                ptr[4 * mb_x + 2] = buffer[mb_x][2] + ptr[4 * mb_x + 1];
+                ptr[3*mb_x + 1] = buffer[mb_x][0] - ((buffer[mb_x][1] + buffer[mb_x][2]) >> 2);
+                ptr[3*mb_x + 0] = buffer[mb_x][1] + ptr[3*mb_x + 1];
+                ptr[3*mb_x + 2] = buffer[mb_x][2] + ptr[3*mb_x + 1];
             }
         } else {
-            for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
-                ptr[4 * mb_x + 0] = buffer[mb_x][2];
-                ptr[4 * mb_x + 1] = buffer[mb_x][1];
-                ptr[4 * mb_x + 2] = buffer[mb_x][0];
+            for(i=0; i<nb_components; i++) {
+                int c= s->comp_index[i];
+                if (s->bits <= 8) {
+                    for(mb_x = 0; mb_x < s->mb_width; mb_x++) {
+                        ptr[3*mb_x+2-c] = buffer[mb_x][i];
+                    }
+                } else if(s->bits == 9) {
+                    return AVERROR_PATCHWELCOME;
+                } else {
+                    for(mb_x = 0; mb_x < s->mb_width; mb_x++) {
+                        ((uint16_t*)ptr)[3*mb_x+2-c] = buffer[mb_x][i];
+                    }
+                }
             }
         }
     }
@@ -749,48 +1070,91 @@ static int ljpeg_decode_rgb_scan(MJpegDecodeContext *s, int predictor,
 static int ljpeg_decode_yuv_scan(MJpegDecodeContext *s, int predictor,
                                  int point_transform, int nb_components)
 {
-    int i, mb_x, mb_y;
+    int i, mb_x, mb_y, mask;
+    int bits= (s->bits+7)&~7;
+    int resync_mb_y = 0;
+    int resync_mb_x = 0;
+
+    point_transform += bits - s->bits;
+    mask = ((1 << s->bits) - 1) << point_transform;
+
+    av_assert0(nb_components>=1 && nb_components<=4);
 
     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
         for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
-            if (s->restart_interval && !s->restart_count)
+            if (s->restart_interval && !s->restart_count){
                 s->restart_count = s->restart_interval;
+                resync_mb_x = mb_x;
+                resync_mb_y = mb_y;
+            }
 
-            if (mb_x == 0 || mb_y == 0 || s->interlaced) {
+            if(!mb_x || mb_y == resync_mb_y || mb_y == resync_mb_y+1 && mb_x < resync_mb_x || s->interlaced){
+                int toprow  = mb_y == resync_mb_y || mb_y == resync_mb_y+1 && mb_x < resync_mb_x;
+                int leftcol = !mb_x || mb_y == resync_mb_y && mb_x == resync_mb_x;
                 for (i = 0; i < nb_components; i++) {
                     uint8_t *ptr;
+                    uint16_t *ptr16;
                     int n, h, v, x, y, c, j, linesize;
-                    n        = s->nb_blocks[i];
-                    c        = s->comp_index[i];
-                    h        = s->h_scount[i];
-                    v        = s->v_scount[i];
-                    x        = 0;
-                    y        = 0;
-                    linesize = s->linesize[c];
-
-                    for (j = 0; j < n; j++) {
-                        int pred;
-                        // FIXME optimize this crap
-                        ptr = s->picture_ptr->data[c] +
-                              (linesize * (v * mb_y + y)) +
-                              (h * mb_x + x);
-                        if (y == 0 && mb_y == 0) {
-                            if (x == 0 && mb_x == 0)
-                                pred = 128 << point_transform;
-                            else
-                                pred = ptr[-1];
-                        } else {
-                            if (x == 0 && mb_x == 0)
-                                pred = ptr[-linesize];
-                            else
-                                PREDICT(pred, ptr[-linesize - 1],
-                                        ptr[-linesize], ptr[-1], predictor);
-                       }
+                    n = s->nb_blocks[i];
+                    c = s->comp_index[i];
+                    h = s->h_scount[i];
+                    v = s->v_scount[i];
+                    x = 0;
+                    y = 0;
+                    linesize= s->linesize[c];
+
+                    if(bits>8) linesize /= 2;
+
+                    for(j=0; j<n; j++) {
+                        int pred, dc;
+
+                        dc = mjpeg_decode_dc(s, s->dc_index[i]);
+                        if(dc == 0xFFFFF)
+                            return -1;
+                        if (   h * mb_x + x >= s->width
+                            || v * mb_y + y >= s->height) {
+                            // Nothing to do
+                        } else if (bits<=8) {
+                        ptr = s->picture_ptr->data[c] + (linesize * (v * mb_y + y)) + (h * mb_x + x); //FIXME optimize this crap
+                        if(y==0 && toprow){
+                            if(x==0 && leftcol){
+                                pred= 1 << (bits - 1);
+                            }else{
+                                pred= ptr[-1];
+                            }
+                        }else{
+                            if(x==0 && leftcol){
+                                pred= ptr[-linesize];
+                            }else{
+                                PREDICT(pred, ptr[-linesize-1], ptr[-linesize], ptr[-1], predictor);
+                            }
+                        }
 
                         if (s->interlaced && s->bottom_field)
                             ptr += linesize >> 1;
-                        *ptr = pred + (mjpeg_decode_dc(s, s->dc_index[i]) << point_transform);
+                        pred &= mask;
+                        *ptr= pred + (dc << point_transform);
+                        }else{
+                            ptr16 = (uint16_t*)(s->picture_ptr->data[c] + 2*(linesize * (v * mb_y + y)) + 2*(h * mb_x + x)); //FIXME optimize this crap
+                            if(y==0 && toprow){
+                                if(x==0 && leftcol){
+                                    pred= 1 << (bits - 1);
+                                }else{
+                                    pred= ptr16[-1];
+                                }
+                            }else{
+                                if(x==0 && leftcol){
+                                    pred= ptr16[-linesize];
+                                }else{
+                                    PREDICT(pred, ptr16[-linesize-1], ptr16[-linesize], ptr16[-1], predictor);
+                                }
+                            }
 
+                            if (s->interlaced && s->bottom_field)
+                                ptr16 += linesize >> 1;
+                            pred &= mask;
+                            *ptr16= pred + (dc << point_transform);
+                        }
                         if (++x == h) {
                             x = 0;
                             y++;
@@ -800,7 +1164,8 @@ static int ljpeg_decode_yuv_scan(MJpegDecodeContext *s, int predictor,
             } else {
                 for (i = 0; i < nb_components; i++) {
                     uint8_t *ptr;
-                    int n, h, v, x, y, c, j, linesize;
+                    uint16_t *ptr16;
+                    int n, h, v, x, y, c, j, linesize, dc;
                     n        = s->nb_blocks[i];
                     c        = s->comp_index[i];
                     h        = s->h_scount[i];
@@ -809,16 +1174,33 @@ static int ljpeg_decode_yuv_scan(MJpegDecodeContext *s, int predictor,
                     y        = 0;
                     linesize = s->linesize[c];
 
+                    if(bits>8) linesize /= 2;
+
                     for (j = 0; j < n; j++) {
                         int pred;
 
-                        // FIXME optimize this crap
-                        ptr = s->picture_ptr->data[c] +
+                        dc = mjpeg_decode_dc(s, s->dc_index[i]);
+                        if(dc == 0xFFFFF)
+                            return -1;
+                        if (   h * mb_x + x >= s->width
+                            || v * mb_y + y >= s->height) {
+                            // Nothing to do
+                        } else if (bits<=8) {
+                            ptr = s->picture_ptr->data[c] +
                               (linesize * (v * mb_y + y)) +
-                              (h * mb_x + x);
-                        PREDICT(pred, ptr[-linesize - 1],
-                                ptr[-linesize], ptr[-1], predictor);
-                        *ptr = pred + (mjpeg_decode_dc(s, s->dc_index[i]) << point_transform);
+                              (h * mb_x + x); //FIXME optimize this crap
+                            PREDICT(pred, ptr[-linesize-1], ptr[-linesize], ptr[-1], predictor);
+
+                            pred &= mask;
+                            *ptr = pred + (dc << point_transform);
+                        }else{
+                            ptr16 = (uint16_t*)(s->picture_ptr->data[c] + 2*(linesize * (v * mb_y + y)) + 2*(h * mb_x + x)); //FIXME optimize this crap
+                            PREDICT(pred, ptr16[-linesize-1], ptr16[-linesize], ptr16[-1], predictor);
+
+                            pred &= mask;
+                            *ptr16= pred + (dc << point_transform);
+                        }
+
                         if (++x == h) {
                             x = 0;
                             y++;
@@ -835,18 +1217,63 @@ static int ljpeg_decode_yuv_scan(MJpegDecodeContext *s, int predictor,
     return 0;
 }
 
+static av_always_inline void mjpeg_copy_block(MJpegDecodeContext *s, /* copy one block from src to dst; the routine used is picked by lowres */
+                                              uint8_t *dst, const uint8_t *src,
+                                              int linesize, int lowres) /* linesize is used as the stride of both dst and src */
+{
+    switch (lowres) { /* no default: a lowres outside 0..3 copies nothing */
+    case 0: s->hdsp.put_pixels_tab[1][0](dst, src, linesize, 8); /* 8 rows; presumably the 8-byte-wide copy, confirm against hpeldsp */
+        break;
+    case 1: copy_block4(dst, src, linesize, linesize, 4); /* 4 rows; same stride passed for dst and src */
+        break;
+    case 2: copy_block2(dst, src, linesize, linesize, 2); /* 2 rows; same stride passed for dst and src */
+        break;
+    case 3: *dst = *src; /* a single byte */
+        break;
+    }
+}
+
+static void shift_output(MJpegDecodeContext *s, uint8_t *ptr, int linesize) /* left-shift every sample of one output block in place */
+{
+    int block_x, block_y;
+    int size = 8 >> s->avctx->lowres; /* block edge length in samples: 8, 4, 2 or 1 for lowres 0..3 */
+    if (s->bits > 8) { /* samples are accessed as uint16_t */
+        for (block_y=0; block_y<size; block_y++)
+            for (block_x=0; block_x<size; block_x++)
+                *(uint16_t*)(ptr + 2*block_x + block_y*linesize) <<= 16 - s->bits; /* linesize is a byte stride; only x is scaled by 2 */
+    } else { /* samples are single bytes */
+        for (block_y=0; block_y<size; block_y++)
+            for (block_x=0; block_x<size; block_x++)
+                *(ptr + block_x + block_y*linesize) <<= 8 - s->bits; /* shift is 0 when s->bits == 8 */
+    }
+}
+
 static int mjpeg_decode_scan(MJpegDecodeContext *s, int nb_components, int Ah,
                              int Al, const uint8_t *mb_bitmask,
+                             int mb_bitmask_size,
                              const AVFrame *reference)
 {
-    int i, mb_x, mb_y;
+    int i, mb_x, mb_y, chroma_h_shift, chroma_v_shift, chroma_width, chroma_height;
     uint8_t *data[MAX_COMPONENTS];
     const uint8_t *reference_data[MAX_COMPONENTS];
     int linesize[MAX_COMPONENTS];
-    GetBitContext mb_bitmask_gb;
+    GetBitContext mb_bitmask_gb = {0}; // initialize to silence gcc warning
+    int bytes_per_pixel = 1 + (s->bits > 8);
 
-    if (mb_bitmask)
+    if (mb_bitmask) {
+        if (mb_bitmask_size != (s->mb_width * s->mb_height + 7)>>3) {
+            av_log(s->avctx, AV_LOG_ERROR, "mb_bitmask_size mismatches\n");
+            return AVERROR_INVALIDDATA;
+        }
         init_get_bits(&mb_bitmask_gb, mb_bitmask, s->mb_width * s->mb_height);
+    }
+
+    s->restart_count = 0;
+
+    av_pix_fmt_get_chroma_sub_sample(s->avctx->pix_fmt, &chroma_h_shift,
+                                     &chroma_v_shift);
+    chroma_width  = AV_CEIL_RSHIFT(s->width,  chroma_h_shift);
+    chroma_height = AV_CEIL_RSHIFT(s->height, chroma_v_shift);
 
     for (i = 0; i < nb_components; i++) {
         int c   = s->comp_index[i];
@@ -879,27 +1306,36 @@ static int mjpeg_decode_scan(MJpegDecodeContext *s, int nb_components, int Ah,
                 x = 0;
                 y = 0;
                 for (j = 0; j < n; j++) {
-                    block_offset = ((linesize[c] * (v * mb_y + y) * 8) +
-                                    (h * mb_x + x) * 8);
+                    block_offset = (((linesize[c] * (v * mb_y + y) * 8) +
+                                     (h * mb_x + x) * 8 * bytes_per_pixel) >> s->avctx->lowres);
 
                     if (s->interlaced && s->bottom_field)
                         block_offset += linesize[c] >> 1;
-                    ptr = data[c] + block_offset;
+                    if (   8*(h * mb_x + x) < ((c == 1) || (c == 2) ? chroma_width  : s->width)
+                        && 8*(v * mb_y + y) < ((c == 1) || (c == 2) ? chroma_height : s->height)) {
+                        ptr = data[c] + block_offset;
+                    } else
+                        ptr = NULL;
                     if (!s->progressive) {
-                        if (copy_mb)
-                            s->hdsp.put_pixels_tab[1][0](ptr,
-                                reference_data[c] + block_offset,
-                                linesize[c], 8);
-                        else {
+                        if (copy_mb) {
+                            if (ptr)
+                                mjpeg_copy_block(s, ptr, reference_data[c] + block_offset,
+                                                linesize[c], s->avctx->lowres);
+
+                        } else {
                             s->bdsp.clear_block(s->block);
                             if (decode_block(s, s->block, i,
                                              s->dc_index[i], s->ac_index[i],
-                                             s->quant_matrixes[s->quant_index[c]]) < 0) {
+                                             s->quant_matrixes[s->quant_sindex[i]]) < 0) {
                                 av_log(s->avctx, AV_LOG_ERROR,
                                        "error y=%d x=%d\n", mb_y, mb_x);
                                 return AVERROR_INVALIDDATA;
                             }
-                            s->idsp.idct_put(ptr, linesize[c], s->block);
+                            if (ptr) {
+                                s->idsp.idct_put(ptr, linesize[c], s->block);
+                                if (s->bits & 7)
+                                    shift_output(s, ptr, linesize[c]);
+                            }
                         }
                     } else {
                         int block_idx  = s->block_stride[c] * (v * mb_y + y) +
@@ -907,9 +1343,9 @@ static int mjpeg_decode_scan(MJpegDecodeContext *s, int nb_components, int Ah,
                         int16_t *block = s->blocks[c][block_idx];
                         if (Ah)
                             block[0] += get_bits1(&s->gb) *
-                                        s->quant_matrixes[s->quant_index[c]][0] << Al;
+                                        s->quant_matrixes[s->quant_sindex[i]][0] << Al;
                         else if (decode_dc_progressive(s, block, i, s->dc_index[i],
-                                                       s->quant_matrixes[s->quant_index[c]],
+                                                       s->quant_matrixes[s->quant_sindex[i]],
                                                        Al) < 0) {
                             av_log(s->avctx, AV_LOG_ERROR,
                                    "error y=%d x=%d\n", mb_y, mb_x);
@@ -927,74 +1363,41 @@ static int mjpeg_decode_scan(MJpegDecodeContext *s, int nb_components, int Ah,
                 }
             }
 
-            if (s->restart_interval) {
-                s->restart_count--;
-                i = 8 + ((-get_bits_count(&s->gb)) & 7);
-                /* skip RSTn */
-                if (show_bits(&s->gb, i) == (1 << i) - 1) {
-                    int pos = get_bits_count(&s->gb);
-                    align_get_bits(&s->gb);
-                    while (get_bits_left(&s->gb) >= 8 && show_bits(&s->gb, 8) == 0xFF)
-                        skip_bits(&s->gb, 8);
-                    if ((get_bits(&s->gb, 8) & 0xF8) == 0xD0) {
-                        for (i = 0; i < nb_components; i++) /* reset dc */
-                            s->last_dc[i] = 1024;
-                    } else
-                        skip_bits_long(&s->gb, pos - get_bits_count(&s->gb));
-                }
-            }
+            handle_rstn(s, nb_components);
         }
     }
     return 0;
 }
 
 static int mjpeg_decode_scan_progressive_ac(MJpegDecodeContext *s, int ss,
-                                            int se, int Ah, int Al,
-                                            const uint8_t *mb_bitmask,
-                                            const AVFrame *reference)
+                                            int se, int Ah, int Al)
 {
     int mb_x, mb_y;
     int EOBRUN = 0;
     int c = s->comp_index[0];
-    uint8_t *data = s->picture_ptr->data[c];
-    const uint8_t *reference_data = reference ? reference->data[c] : NULL;
-    int linesize  = s->linesize[c];
-    int last_scan = 0;
-    int16_t *quant_matrix = s->quant_matrixes[s->quant_index[c]];
-    GetBitContext mb_bitmask_gb;
-
-    if (ss < 0  || ss >= 64 ||
-        se < ss || se >= 64 ||
-        Ah < 0  || Al < 0)
-        return AVERROR_INVALIDDATA;
-
-    if (mb_bitmask)
-        init_get_bits(&mb_bitmask_gb, mb_bitmask, s->mb_width * s->mb_height);
+    int16_t *quant_matrix = s->quant_matrixes[s->quant_sindex[0]];
 
-    if (!Al) {
-        // s->coefs_finished is a bitmask for coefficients coded
-        // ss and se are parameters telling start and end coefficients
-        s->coefs_finished[c] |= (~0ULL >> (63 - (se - ss))) << ss;
-        last_scan = !~s->coefs_finished[c];
+    av_assert0(ss>=0 && Ah>=0 && Al>=0);
+    if (se < ss || se > 63) {
+        av_log(s->avctx, AV_LOG_ERROR, "SS/SE %d/%d is invalid\n", ss, se);
+        return AVERROR_INVALIDDATA;
     }
 
-    if (s->interlaced && s->bottom_field) {
-        int offset      = linesize >> 1;
-        data           += offset;
-        reference_data += offset;
-    }
+    // s->coefs_finished is a bitmask for coefficients coded
+    // ss and se are parameters telling start and end coefficients
+    s->coefs_finished[c] |= (2ULL << se) - (1ULL << ss);
+
+    s->restart_count = 0;
 
     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
-        int block_offset = mb_y * linesize * 8;
-        uint8_t *ptr     = data + block_offset;
         int block_idx    = mb_y * s->block_stride[c];
         int16_t (*block)[64] = &s->blocks[c][block_idx];
         uint8_t *last_nnz    = &s->last_nnz[c][block_idx];
         for (mb_x = 0; mb_x < s->mb_width; mb_x++, block++, last_nnz++) {
-            const int copy_mb = mb_bitmask && !get_bits1(&mb_bitmask_gb);
-
-            if (!copy_mb) {
                 int ret;
+                if (s->restart_interval && !s->restart_count)
+                    s->restart_count = s->restart_interval;
+
                 if (Ah)
                     ret = decode_block_refinement(s, *block, last_nnz, s->ac_index[0],
                                                   quant_matrix, ss, se, Al, &EOBRUN);
@@ -1006,31 +1409,64 @@ static int mjpeg_decode_scan_progressive_ac(MJpegDecodeContext *s, int ss,
                            "error y=%d x=%d\n", mb_y, mb_x);
                     return AVERROR_INVALIDDATA;
                 }
-            }
 
-            if (last_scan) {
-                if (copy_mb) {
-                    s->hdsp.put_pixels_tab[1][0](ptr,
-                                                 reference_data + block_offset,
-                                                 linesize, 8);
-                } else {
-                    s->idsp.idct_put(ptr, linesize, *block);
-                    ptr += 8;
-                }
-            }
+            if (handle_rstn(s, 0))
+                EOBRUN = 0;
         }
     }
     return 0;
 }
 
+static void mjpeg_idct_scan_progressive_ac(MJpegDecodeContext *s) /* run idct_put on the stored blocks of every component, writing into s->picture_ptr */
+{
+    int mb_x, mb_y;
+    int c;
+    const int bytes_per_pixel = 1 + (s->bits > 8); /* 2 when s->bits > 8, otherwise 1 */
+    const int block_size = s->lossless ? 1 : 8;
+
+    for (c = 0; c < s->nb_components; c++) {
+        uint8_t *data = s->picture_ptr->data[c];
+        int linesize  = s->linesize[c];
+        int h = s->h_max / s->h_count[c]; /* ratio of the maximum horizontal count to this component's */
+        int v = s->v_max / s->v_count[c]; /* ratio of the maximum vertical count to this component's */
+        int mb_width     = (s->width  + h * block_size - 1) / (h * block_size); /* width / (h * block_size), rounded up */
+        int mb_height    = (s->height + v * block_size - 1) / (v * block_size); /* height / (v * block_size), rounded up */
+
+        if (~s->coefs_finished[c]) /* some bit of the 'coefficients coded' mask is still clear */
+            av_log(s->avctx, AV_LOG_WARNING, "component %d is incomplete\n", c); /* warning only; the component is still output */
+
+        if (s->interlaced && s->bottom_field)
+            data += linesize >> 1; /* bottom field starts half a linesize into the plane */
+
+        for (mb_y = 0; mb_y < mb_height; mb_y++) {
+            uint8_t *ptr     = data + (mb_y * linesize * 8 >> s->avctx->lowres); /* byte offset of this block row, scaled down by lowres */
+            int block_idx    = mb_y * s->block_stride[c]; /* index of the first stored block of this row */
+            int16_t (*block)[64] = &s->blocks[c][block_idx];
+            for (mb_x = 0; mb_x < mb_width; mb_x++, block++) {
+                s->idsp.idct_put(ptr, linesize, *block);
+                if (s->bits & 7) /* only when s->bits is not a multiple of 8 */
+                    shift_output(s, ptr, linesize);
+                ptr += bytes_per_pixel*8 >> s->avctx->lowres; /* advance to the next block in the row, in bytes */
+            }
+        }
+    }
+}
+
 int ff_mjpeg_decode_sos(MJpegDecodeContext *s, const uint8_t *mb_bitmask,
-                        const AVFrame *reference)
+                        int mb_bitmask_size, const AVFrame *reference)
 {
     int len, nb_components, i, h, v, predictor, point_transform;
     int index, id, ret;
     const int block_size = s->lossless ? 1 : 8;
     int ilv, prev_shift;
 
+    if (!s->got_picture) {
+        av_log(s->avctx, AV_LOG_WARNING,
+                "Can not process SOS before SOF, skipping\n");
+        return -1;
+    }
+
+    av_assert0(s->picture_ptr->data[0]);
     /* XXX: verify len field validity */
     len = get_bits(&s->gb, 16);
     nb_components = get_bits(&s->gb, 8);
@@ -1060,27 +1496,35 @@ int ff_mjpeg_decode_sos(MJpegDecodeContext *s, const uint8_t *mb_bitmask,
             && nb_components == 3 && s->nb_components == 3 && i)
             index = 3 - i;
 
-        s->comp_index[i] = index;
-
+        s->quant_sindex[i] = s->quant_index[index];
         s->nb_blocks[i] = s->h_count[index] * s->v_count[index];
         s->h_scount[i]  = s->h_count[index];
         s->v_scount[i]  = s->v_count[index];
 
+        if(nb_components == 3 && s->nb_components == 3 && s->avctx->pix_fmt == AV_PIX_FMT_GBR24P)
+            index = (i+2)%3;
+        if(nb_components == 1 && s->nb_components == 3 && s->avctx->pix_fmt == AV_PIX_FMT_GBR24P)
+            index = (index+2)%3;
+
+        s->comp_index[i] = index;
+
         s->dc_index[i] = get_bits(&s->gb, 4);
         s->ac_index[i] = get_bits(&s->gb, 4);
 
         if (s->dc_index[i] <  0 || s->ac_index[i] < 0 ||
             s->dc_index[i] >= 4 || s->ac_index[i] >= 4)
             goto out_of_range;
-        if (!s->vlcs[0][s->dc_index[i]].table ||
-            !s->vlcs[1][s->ac_index[i]].table)
+        if (!s->vlcs[0][s->dc_index[i]].table || !(s->progressive ? s->vlcs[2][s->ac_index[0]].table : s->vlcs[1][s->ac_index[i]].table))
             goto out_of_range;
     }
 
     predictor = get_bits(&s->gb, 8);       /* JPEG Ss / lossless JPEG predictor /JPEG-LS NEAR */
     ilv = get_bits(&s->gb, 8);             /* JPEG Se / JPEG-LS ILV */
-    prev_shift      = get_bits(&s->gb, 4); /* Ah */
-    point_transform = get_bits(&s->gb, 4); /* Al */
+    if(s->avctx->codec_tag != AV_RL32("CJPG")){
+        prev_shift      = get_bits(&s->gb, 4); /* Ah */
+        point_transform = get_bits(&s->gb, 4); /* Al */
+    }else
+        prev_shift = point_transform = 0;
 
     if (nb_components > 1) {
         /* interleaved stream */
@@ -1097,10 +1541,10 @@ int ff_mjpeg_decode_sos(MJpegDecodeContext *s, const uint8_t *mb_bitmask,
     }
 
     if (s->avctx->debug & FF_DEBUG_PICT_INFO)
-        av_log(s->avctx, AV_LOG_DEBUG, "%s %s p:%d >>:%d ilv:%d bits:%d %s\n",
+        av_log(s->avctx, AV_LOG_DEBUG, "%s %s p:%d >>:%d ilv:%d bits:%d skip:%d %s comp:%d\n",
                s->lossless ? "lossless" : "sequential DCT", s->rgb ? "RGB" : "",
-               predictor, point_transform, ilv, s->bits,
-               s->pegasus_rct ? "PRCT" : (s->rct ? "RCT" : ""));
+               predictor, point_transform, ilv, s->bits, s->mjpb_skiptosod,
+               s->pegasus_rct ? "PRCT" : (s->rct ? "RCT" : ""), nb_components);
 
 
     /* mjpeg-b can have padding bytes between sos and image data, skip them */
@@ -1109,9 +1553,10 @@ int ff_mjpeg_decode_sos(MJpegDecodeContext *s, const uint8_t *mb_bitmask,
 
 next_field:
     for (i = 0; i < nb_components; i++)
-        s->last_dc[i] = 1024;
+        s->last_dc[i] = (4 << s->bits);
 
     if (s->lossless) {
+        av_assert0(s->picture_ptr == s->picture);
         if (CONFIG_JPEGLS_DECODER && s->ls) {
 //            for () {
 //            reset_ls_coding_parameters(s, 0);
@@ -1121,8 +1566,7 @@ next_field:
                 return ret;
         } else {
             if (s->rgb) {
-                if ((ret = ljpeg_decode_rgb_scan(s, predictor,
-                                                 point_transform)) < 0)
+                if ((ret = ljpeg_decode_rgb_scan(s, nb_components, predictor, point_transform)) < 0)
                     return ret;
             } else {
                 if ((ret = ljpeg_decode_yuv_scan(s, predictor,
@@ -1133,16 +1577,15 @@ next_field:
         }
     } else {
         if (s->progressive && predictor) {
+            av_assert0(s->picture_ptr == s->picture);
             if ((ret = mjpeg_decode_scan_progressive_ac(s, predictor,
                                                         ilv, prev_shift,
-                                                        point_transform,
-                                                        mb_bitmask,
-                                                        reference)) < 0)
+                                                        point_transform)) < 0)
                 return ret;
         } else {
             if ((ret = mjpeg_decode_scan(s, nb_components,
                                          prev_shift, point_transform,
-                                         mb_bitmask, reference)) < 0)
+                                         mb_bitmask, mb_bitmask_size, reference)) < 0)
                 return ret;
         }
     }
@@ -1153,7 +1596,7 @@ next_field:
         GetBitContext bak = s->gb;
         align_get_bits(&bak);
         if (show_bits(&bak, 16) == 0xFFD1) {
-            ff_dlog(s->avctx, "AVRn interlaced picture marker found\n");
+            av_log(s->avctx, AV_LOG_DEBUG, "AVRn interlaced picture marker found\n");
             s->gb = bak;
             skip_bits(&s->gb, 16);
             s->bottom_field ^= 1;
@@ -1186,22 +1629,24 @@ static int mjpeg_decode_app(MJpegDecodeContext *s)
     int len, id, i;
 
     len = get_bits(&s->gb, 16);
-    if (len < 5)
+    if (len < 6)
         return AVERROR_INVALIDDATA;
     if (8 * len > get_bits_left(&s->gb))
         return AVERROR_INVALIDDATA;
 
     id   = get_bits_long(&s->gb, 32);
-    id   = av_be2ne32(id);
     len -= 6;
 
-    if (s->avctx->debug & FF_DEBUG_STARTCODE)
-        av_log(s->avctx, AV_LOG_DEBUG, "APPx %8X\n", id);
+    if (s->avctx->debug & FF_DEBUG_STARTCODE) {
+        char id_str[32];
+        av_get_codec_tag_string(id_str, sizeof(id_str), av_bswap32(id));
+        av_log(s->avctx, AV_LOG_DEBUG, "APPx (%s / %8X) len=%d\n", id_str, id, len);
+    }
 
     /* Buggy AVID, it puts EOI only at every 10th frame. */
     /* Also, this fourcc is used by non-avid files too, it holds some
        information, but it's always present in AVID-created files. */
-    if (id == AV_RL32("AVI1")) {
+    if (id == AV_RB32("AVI1")) {
         /* structure:
             4bytes      AVI1
             1bytes      polarity
@@ -1209,12 +1654,9 @@ static int mjpeg_decode_app(MJpegDecodeContext *s)
             4bytes      field_size
             4bytes      field_size_less_padding
         */
-        s->buggy_avid = 1;
-        i = get_bits(&s->gb, 8);
-        if (i == 2)
-            s->bottom_field = 1;
-        else if (i == 1)
-            s->bottom_field = 0;
+            s->buggy_avid = 1;
+        i = get_bits(&s->gb, 8); len--;
+        av_log(s->avctx, AV_LOG_DEBUG, "polarity %d\n", i);
 #if 0
         skip_bits(&s->gb, 8);
         skip_bits(&s->gb, 32);
@@ -1226,7 +1668,7 @@ static int mjpeg_decode_app(MJpegDecodeContext *s)
 
 //    len -= 2;
 
-    if (id == AV_RL32("JFIF")) {
+    if (id == AV_RB32("JFIF")) {
         int t_w, t_h, v1, v2;
         skip_bits(&s->gb, 8); /* the trailing zero-byte */
         v1 = get_bits(&s->gb, 8);
@@ -1235,7 +1677,11 @@ static int mjpeg_decode_app(MJpegDecodeContext *s)
 
         s->avctx->sample_aspect_ratio.num = get_bits(&s->gb, 16);
         s->avctx->sample_aspect_ratio.den = get_bits(&s->gb, 16);
-        ff_set_sar(s->avctx, s->avctx->sample_aspect_ratio);
+        if (   s->avctx->sample_aspect_ratio.num <= 0
+            || s->avctx->sample_aspect_ratio.den <= 0) {
+            s->avctx->sample_aspect_ratio.num = 0;
+            s->avctx->sample_aspect_ratio.den = 1;
+        }
 
         if (s->avctx->debug & FF_DEBUG_PICT_INFO)
             av_log(s->avctx, AV_LOG_INFO,
@@ -1255,18 +1701,24 @@ static int mjpeg_decode_app(MJpegDecodeContext *s)
         goto out;
     }
 
-    if (id == AV_RL32("Adob") && (get_bits(&s->gb, 8) == 'e')) {
-        if (s->avctx->debug & FF_DEBUG_PICT_INFO)
-            av_log(s->avctx, AV_LOG_INFO, "mjpeg: Adobe header found\n");
+    if (   id == AV_RB32("Adob")
+        && len >= 7
+        && show_bits(&s->gb, 8) == 'e'
+        && show_bits_long(&s->gb, 32) != AV_RB32("e_CM")) {
+        skip_bits(&s->gb,  8); /* 'e' */
         skip_bits(&s->gb, 16); /* version */
         skip_bits(&s->gb, 16); /* flags0 */
         skip_bits(&s->gb, 16); /* flags1 */
-        skip_bits(&s->gb,  8); /* transform */
+        s->adobe_transform = get_bits(&s->gb,  8);
+        if (s->avctx->debug & FF_DEBUG_PICT_INFO)
+            av_log(s->avctx, AV_LOG_INFO, "mjpeg: Adobe header found, transform=%d\n", s->adobe_transform);
         len -= 7;
         goto out;
     }
 
-    if (id == AV_RL32("LJIF")) {
+    if (id == AV_RB32("LJIF")) {
+        int rgb = s->rgb;
+        int pegasus_rct = s->pegasus_rct;
         if (s->avctx->debug & FF_DEBUG_PICT_INFO)
             av_log(s->avctx, AV_LOG_INFO,
                    "Pegasus lossless jpeg header found\n");
@@ -1274,29 +1726,126 @@ static int mjpeg_decode_app(MJpegDecodeContext *s)
         skip_bits(&s->gb, 16); /* unknown always 0? */
         skip_bits(&s->gb, 16); /* unknown always 0? */
         skip_bits(&s->gb, 16); /* unknown always 0? */
-        switch (get_bits(&s->gb, 8)) {
+        switch (i=get_bits(&s->gb, 8)) {
         case 1:
-            s->rgb         = 1;
-            s->pegasus_rct = 0;
+            rgb         = 1;
+            pegasus_rct = 0;
             break;
         case 2:
-            s->rgb         = 1;
-            s->pegasus_rct = 1;
+            rgb         = 1;
+            pegasus_rct = 1;
             break;
         default:
-            av_log(s->avctx, AV_LOG_ERROR, "unknown colorspace\n");
+            av_log(s->avctx, AV_LOG_ERROR, "unknown colorspace %d\n", i);
         }
+
         len -= 9;
+        if (s->got_picture)
+            if (rgb != s->rgb || pegasus_rct != s->pegasus_rct) {
+                av_log(s->avctx, AV_LOG_WARNING, "Mismatching LJIF tag\n");
+                goto out;
+            }
+
+        s->rgb = rgb;
+        s->pegasus_rct = pegasus_rct;
+
+        goto out;
+    }
+    if (id == AV_RL32("colr") && len > 0) {
+        s->colr = get_bits(&s->gb, 8);
+        if (s->avctx->debug & FF_DEBUG_PICT_INFO)
+            av_log(s->avctx, AV_LOG_INFO, "COLR %d\n", s->colr);
+        len --;
+        goto out;
+    }
+    if (id == AV_RL32("xfrm") && len > 0) {
+        s->xfrm = get_bits(&s->gb, 8);
+        if (s->avctx->debug & FF_DEBUG_PICT_INFO)
+            av_log(s->avctx, AV_LOG_INFO, "XFRM %d\n", s->xfrm);
+        len --;
+        goto out;
+    }
+
+    /* JPS extension by VRex */
+    if (s->start_code == APP3 && id == AV_RB32("_JPS") && len >= 10) {
+        int flags, layout, type;
+        if (s->avctx->debug & FF_DEBUG_PICT_INFO)
+            av_log(s->avctx, AV_LOG_INFO, "_JPSJPS_\n");
+
+        skip_bits(&s->gb, 32); len -= 4;  /* JPS_ */
+        skip_bits(&s->gb, 16); len -= 2;  /* block length */
+        skip_bits(&s->gb, 8);             /* reserved */
+        flags  = get_bits(&s->gb, 8);
+        layout = get_bits(&s->gb, 8);
+        type   = get_bits(&s->gb, 8);
+        len -= 4;
+
+        s->stereo3d = av_stereo3d_alloc();
+        if (!s->stereo3d) {
+            goto out;
+        }
+        if (type == 0) {
+            s->stereo3d->type = AV_STEREO3D_2D;
+        } else if (type == 1) {
+            switch (layout) {
+            case 0x01:
+                s->stereo3d->type = AV_STEREO3D_LINES;
+                break;
+            case 0x02:
+                s->stereo3d->type = AV_STEREO3D_SIDEBYSIDE;
+                break;
+            case 0x03:
+                s->stereo3d->type = AV_STEREO3D_TOPBOTTOM;
+                break;
+            }
+            if (!(flags & 0x04)) {
+                s->stereo3d->flags = AV_STEREO3D_FLAG_INVERT;
+            }
+        }
+        goto out;
+    }
+
+    /* EXIF metadata */
+    if (s->start_code == APP1 && id == AV_RB32("Exif") && len >= 2) {
+        GetByteContext gbytes;
+        int ret, le, ifd_offset, bytes_read;
+        const uint8_t *aligned;
+
+        skip_bits(&s->gb, 16); // skip padding
+        len -= 2;
+
+        // init byte wise reading
+        aligned = align_get_bits(&s->gb);
+        bytestream2_init(&gbytes, aligned, len);
+
+        // read TIFF header
+        ret = ff_tdecode_header(&gbytes, &le, &ifd_offset);
+        if (ret) {
+            av_log(s->avctx, AV_LOG_ERROR, "mjpeg: invalid TIFF header in EXIF data\n");
+        } else {
+            bytestream2_seek(&gbytes, ifd_offset, SEEK_SET);
+
+            // read 0th IFD and store the metadata
+            // (return values > 0 indicate the presence of subimage metadata)
+            ret = avpriv_exif_decode_ifd(s->avctx, &gbytes, le, 0, &s->exif_metadata);
+            if (ret < 0) {
+                av_log(s->avctx, AV_LOG_ERROR, "mjpeg: error decoding EXIF data\n");
+            }
+        }
+
+        bytes_read = bytestream2_tell(&gbytes);
+        skip_bits(&s->gb, bytes_read << 3);
+        len -= bytes_read;
+
         goto out;
     }
 
     /* Apple MJPEG-A */
     if ((s->start_code == APP1) && (len > (0x28 - 8))) {
         id   = get_bits_long(&s->gb, 32);
-        id   = av_be2ne32(id);
         len -= 4;
         /* Apple MJPEG-A */
-        if (id == AV_RL32("mjpg")) {
+        if (id == AV_RB32("mjpg")) {
 #if 0
             skip_bits(&s->gb, 32); /* field size */
             skip_bits(&s->gb, 32); /* pad field size */
@@ -1338,16 +1887,20 @@ static int mjpeg_decode_com(MJpegDecodeContext *s)
                 cbuf[i] = 0;
 
             if (s->avctx->debug & FF_DEBUG_PICT_INFO)
-                av_log(s->avctx, AV_LOG_INFO, "mjpeg comment: '%s'\n", cbuf);
+                av_log(s->avctx, AV_LOG_INFO, "comment: '%s'\n", cbuf);
 
             /* buggy avid, it puts EOI only at every 10th frame */
-            if (!strcmp(cbuf, "AVID")) {
-                s->buggy_avid = 1;
+            if (!strncmp(cbuf, "AVID", 4)) {
+                parse_avid(s, cbuf, len);
             } else if (!strcmp(cbuf, "CS=ITU601"))
                 s->cs_itu601 = 1;
-            else if ((len > 20 && !strncmp(cbuf, "Intel(R) JPEG Library", 21)) ||
-                     (len > 19 && !strncmp(cbuf, "Metasoft MJPEG Codec", 20)))
+            else if ((!strncmp(cbuf, "Intel(R) JPEG Library, version 1", 32) && s->avctx->codec_tag) ||
+                     (!strncmp(cbuf, "Metasoft MJPEG Codec", 20)))
                 s->flipped = 1;
+            else if (!strcmp(cbuf, "MULTISCOPE II")) {
+                s->avctx->sample_aspect_ratio = (AVRational) { 1, 2 };
+                s->multiscope = 2;
+            }
 
             av_free(cbuf);
         }
@@ -1363,22 +1916,19 @@ static int find_marker(const uint8_t **pbuf_ptr, const uint8_t *buf_end)
     const uint8_t *buf_ptr;
     unsigned int v, v2;
     int val;
-#ifdef DEBUG
     int skipped = 0;
-#endif
 
     buf_ptr = *pbuf_ptr;
-    while (buf_ptr < buf_end) {
+    while (buf_end - buf_ptr > 1) {
         v  = *buf_ptr++;
         v2 = *buf_ptr;
         if ((v == 0xff) && (v2 >= 0xc0) && (v2 <= 0xfe) && buf_ptr < buf_end) {
             val = *buf_ptr++;
             goto found;
         }
-#ifdef DEBUG
         skipped++;
-#endif
     }
+    buf_ptr = buf_end;
     val = -1;
 found:
     ff_dlog(NULL, "find_marker skipped %d bytes\n", skipped);
@@ -1401,30 +1951,60 @@ int ff_mjpeg_find_marker(MJpegDecodeContext *s,
     /* unescape buffer of SOS, use special treatment for JPEG-LS */
     if (start_code == SOS && !s->ls) {
         const uint8_t *src = *buf_ptr;
+        const uint8_t *ptr = src;
         uint8_t *dst = s->buffer;
 
-        while (src < buf_end) {
-            uint8_t x = *(src++);
+        #define copy_data_segment(skip) do {       \
+            ptrdiff_t length = (ptr - src) - (skip);  \
+            if (length > 0) {                         \
+                memcpy(dst, src, length);             \
+                dst += length;                        \
+                src = ptr;                            \
+            }                                         \
+        } while (0)
+
+        if (s->avctx->codec_id == AV_CODEC_ID_THP) {
+            ptr = buf_end;
+            copy_data_segment(0);
+        } else {
+            while (ptr < buf_end) {
+                uint8_t x = *(ptr++);
 
-            *(dst++) = x;
-            if (s->avctx->codec_id != AV_CODEC_ID_THP) {
                 if (x == 0xff) {
-                    while (src < buf_end && x == 0xff)
-                        x = *(src++);
+                    ptrdiff_t skip = 0;
+                    while (ptr < buf_end && x == 0xff) {
+                        x = *(ptr++);
+                        skip++;
+                    }
 
-                    if (x >= 0xd0 && x <= 0xd7)
-                        *(dst++) = x;
-                    else if (x)
-                        break;
+                    /* 0xFF, 0xFF, ... */
+                    if (skip > 1) {
+                        copy_data_segment(skip);
+
+                        /* decrement src as it is equal to ptr after the
+                         * copy_data_segment macro and we might want to
+                         * copy the current value of x later on */
+                        src--;
+                    }
+
+                    if (x < 0xd0 || x > 0xd7) {
+                        copy_data_segment(1);
+                        if (x)
+                            break;
+                    }
                 }
             }
+            if (src < ptr)
+                copy_data_segment(0);
         }
+        #undef copy_data_segment
+
         *unescaped_buf_ptr  = s->buffer;
         *unescaped_buf_size = dst - s->buffer;
         memset(s->buffer + *unescaped_buf_size, 0,
                AV_INPUT_BUFFER_PADDING_SIZE);
 
-        av_log(s->avctx, AV_LOG_DEBUG, "escaping removed %td bytes\n",
+        av_log(s->avctx, AV_LOG_DEBUG, "escaping removed %"PTRDIFF_SPECIFIER" bytes\n",
                (buf_end - *buf_ptr) - (dst - s->buffer));
     } else if (start_code == SOS && s->ls) {
         const uint8_t *src = *buf_ptr;
@@ -1433,8 +2013,6 @@ int ff_mjpeg_find_marker(MJpegDecodeContext *s,
         int t = 0, b = 0;
         PutBitContext pb;
 
-        s->cur_scan++;
-
         /* find marker */
         while (src + t < buf_end) {
             uint8_t x = src[t++];
@@ -1442,7 +2020,7 @@ int ff_mjpeg_find_marker(MJpegDecodeContext *s,
                 while ((src + t < buf_end) && x == 0xff)
                     x = src[t++];
                 if (x & 0x80) {
-                    t -= 2;
+                    t -= FFMIN(2, t);
                     break;
                 }
             }
@@ -1454,8 +2032,12 @@ int ff_mjpeg_find_marker(MJpegDecodeContext *s,
         while (b < t) {
             uint8_t x = src[b++];
             put_bits(&pb, 8, x);
-            if (x == 0xFF) {
+            if (x == 0xFF && b < t) {
                 x = src[b++];
+                if (x & 0x80) {
+                    av_log(s->avctx, AV_LOG_WARNING, "Invalid escape sequence\n");
+                    x &= 0x7f;
+                }
                 put_bits(&pb, 7, x);
                 bit_count--;
             }
@@ -1483,11 +2065,17 @@ int ff_mjpeg_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     MJpegDecodeContext *s = avctx->priv_data;
     const uint8_t *buf_end, *buf_ptr;
     const uint8_t *unescaped_buf_ptr;
+    int hshift, vshift;
     int unescaped_buf_size;
     int start_code;
+    int i, index;
     int ret = 0;
+    int is16bit;
+
+    av_dict_free(&s->exif_metadata);
+    av_freep(&s->stereo3d);
+    s->adobe_transform = -1;
 
-    s->got_picture = 0; // picture from previous image can not be reused
     buf_ptr = buf;
     buf_end = buf + buf_size;
     while (buf_ptr < buf_end) {
@@ -1497,21 +2085,22 @@ int ff_mjpeg_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                                           &unescaped_buf_size);
         /* EOF */
         if (start_code < 0) {
-            goto the_end;
+            break;
         } else if (unescaped_buf_size > INT_MAX / 8) {
             av_log(avctx, AV_LOG_ERROR,
                    "MJPEG packet 0x%x too big (%d/%d), corrupt data?\n",
                    start_code, unescaped_buf_size, buf_size);
             return AVERROR_INVALIDDATA;
         }
-
-        av_log(avctx, AV_LOG_DEBUG, "marker=%x avail_size_in_buf=%td\n",
+        av_log(avctx, AV_LOG_DEBUG, "marker=%x avail_size_in_buf=%"PTRDIFF_SPECIFIER"\n",
                start_code, buf_end - buf_ptr);
 
-        ret = init_get_bits(&s->gb, unescaped_buf_ptr,
-                            unescaped_buf_size * 8);
-        if (ret < 0)
-            return ret;
+        ret = init_get_bits8(&s->gb, unescaped_buf_ptr, unescaped_buf_size);
+
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_ERROR, "invalid buffer\n");
+            goto fail;
+        }
 
         s->start_code = start_code;
         if (s->avctx->debug & FF_DEBUG_STARTCODE)
@@ -1528,12 +2117,30 @@ int ff_mjpeg_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         else if (start_code == COM)
             mjpeg_decode_com(s);
 
+        ret = -1;
+
         if (!CONFIG_JPEGLS_DECODER &&
             (start_code == SOF48 || start_code == LSE)) {
             av_log(avctx, AV_LOG_ERROR, "JPEG-LS support not enabled.\n");
             return AVERROR(ENOSYS);
         }
 
+        if (avctx->skip_frame == AVDISCARD_ALL) {
+            switch(start_code) {
+            case SOF0:
+            case SOF1:
+            case SOF2:
+            case SOF3:
+            case SOF48:
+            case SOI:
+            case SOS:
+            case EOI:
+                break;
+            default:
+                goto skip;
+            }
+        }
+
         switch (start_code) {
         case SOI:
             s->restart_interval = 0;
@@ -1546,7 +2153,7 @@ int ff_mjpeg_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         case DHT:
             if ((ret = ff_mjpeg_decode_dht(s)) < 0) {
                 av_log(avctx, AV_LOG_ERROR, "huffman table decode error\n");
-                return ret;
+                goto fail;
             }
             break;
         case SOF0:
@@ -1555,39 +2162,41 @@ int ff_mjpeg_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             s->ls          = 0;
             s->progressive = 0;
             if ((ret = ff_mjpeg_decode_sof(s)) < 0)
-                return ret;
+                goto fail;
             break;
         case SOF2:
             s->lossless    = 0;
             s->ls          = 0;
             s->progressive = 1;
             if ((ret = ff_mjpeg_decode_sof(s)) < 0)
-                return ret;
+                goto fail;
             break;
         case SOF3:
+            s->avctx->properties |= FF_CODEC_PROPERTY_LOSSLESS;
             s->lossless    = 1;
             s->ls          = 0;
             s->progressive = 0;
             if ((ret = ff_mjpeg_decode_sof(s)) < 0)
-                return ret;
+                goto fail;
             break;
         case SOF48:
+            s->avctx->properties |= FF_CODEC_PROPERTY_LOSSLESS;
             s->lossless    = 1;
             s->ls          = 1;
             s->progressive = 0;
             if ((ret = ff_mjpeg_decode_sof(s)) < 0)
-                return ret;
+                goto fail;
             break;
         case LSE:
             if (!CONFIG_JPEGLS_DECODER ||
                 (ret = ff_jpegls_decode_lse(s)) < 0)
-                return ret;
+                goto fail;
             break;
         case EOI:
-            s->cur_scan = 0;
-            if ((s->buggy_avid && !s->interlaced) || s->restart_interval)
-                break;
 eoi_parser:
+            if (avctx->skip_frame != AVDISCARD_ALL && s->progressive && s->cur_scan && s->got_picture)
+                mjpeg_idct_scan_progressive_ac(s);
+            s->cur_scan = 0;
             if (!s->got_picture) {
                 av_log(avctx, AV_LOG_WARNING,
                        "Found EOI before any SOF, ignoring\n");
@@ -1597,43 +2206,41 @@ eoi_parser:
                 s->bottom_field ^= 1;
                 /* if not bottom field, do not output image yet */
                 if (s->bottom_field == !s->interlace_polarity)
-                    goto not_the_end;
+                    break;
+            }
+            if (avctx->skip_frame == AVDISCARD_ALL) {
+                s->got_picture = 0;
+                goto the_end_no_picture;
             }
             if ((ret = av_frame_ref(frame, s->picture_ptr)) < 0)
                 return ret;
-            if (s->flipped) {
-                int i;
-                for (i = 0; frame->data[i]; i++) {
-                    int h = frame->height >> ((i == 1 || i == 2) ?
-                                              s->pix_desc->log2_chroma_h : 0);
-                    frame->data[i] += frame->linesize[i] * (h - 1);
-                    frame->linesize[i] *= -1;
-                }
-            }
             *got_frame = 1;
+            s->got_picture = 0;
+
+            if (!s->lossless) {
+                int qp = FFMAX3(s->qscale[0],
+                                s->qscale[1],
+                                s->qscale[2]);
+                int qpw = (s->width + 15) / 16;
+                AVBufferRef *qp_table_buf = av_buffer_alloc(qpw);
+                if (qp_table_buf) {
+                    memset(qp_table_buf->data, qp, qpw);
+                    av_frame_set_qp_table(data, qp_table_buf, 0, FF_QSCALE_TYPE_MPEG1);
+                }
 
-            if (!s->lossless &&
-                avctx->debug & FF_DEBUG_QP) {
-                av_log(avctx, AV_LOG_DEBUG,
-                       "QP: %d\n", FFMAX3(s->qscale[0],
-                                          s->qscale[1],
-                                          s->qscale[2]));
+                if(avctx->debug & FF_DEBUG_QP)
+                    av_log(avctx, AV_LOG_DEBUG, "QP: %d\n", qp);
             }
 
             goto the_end;
         case SOS:
-            if (!s->got_picture) {
-                av_log(avctx, AV_LOG_WARNING,
-                       "Can not process SOS before SOF, skipping\n");
+            s->cur_scan++;
+            if (avctx->skip_frame == AVDISCARD_ALL)
                 break;
-                }
-            if ((ret = ff_mjpeg_decode_sos(s, NULL, NULL)) < 0 &&
+
+            if ((ret = ff_mjpeg_decode_sos(s, NULL, 0, NULL)) < 0 &&
                 (avctx->err_recognition & AV_EF_EXPLODE))
-                return ret;
-            /* buggy avid puts EOI every 10-20th frame */
-            /* if restart period is over process EOI */
-            if ((s->buggy_avid && !s->interlaced) || s->restart_interval)
-                goto eoi_parser;
+                goto fail;
             break;
         case DRI:
             mjpeg_decode_dri(s);
@@ -1653,21 +2260,210 @@ eoi_parser:
             break;
         }
 
-not_the_end:
+skip:
         /* eof process start code */
         buf_ptr += (get_bits_count(&s->gb) + 7) / 8;
         av_log(avctx, AV_LOG_DEBUG,
                "marker parser used %d bytes (%d bits)\n",
                (get_bits_count(&s->gb) + 7) / 8, get_bits_count(&s->gb));
     }
-    if (s->got_picture) {
+    if (s->got_picture && s->cur_scan) {
         av_log(avctx, AV_LOG_WARNING, "EOI missing, emulating\n");
         goto eoi_parser;
     }
     av_log(avctx, AV_LOG_FATAL, "No JPEG data found in image\n");
     return AVERROR_INVALIDDATA;
+fail:
+    s->got_picture = 0;
+    return ret;
 the_end:
-    av_log(avctx, AV_LOG_DEBUG, "mjpeg decode frame unused %td bytes\n",
+
+    is16bit = av_pix_fmt_desc_get(s->avctx->pix_fmt)->comp[0].step > 1;
+
+    if (AV_RB32(s->upscale_h)) {
+        int p;
+        av_assert0(avctx->pix_fmt == AV_PIX_FMT_YUVJ444P ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUV444P  ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVJ440P ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUV440P  ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVA444P ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVJ420P ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUV420P  ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUV420P16||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVA420P  ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVA420P16||
+                   avctx->pix_fmt == AV_PIX_FMT_GBRP     ||
+                   avctx->pix_fmt == AV_PIX_FMT_GBRAP
+                  );
+        avcodec_get_chroma_sub_sample(s->avctx->pix_fmt, &hshift, &vshift);
+        av_assert0(s->nb_components == av_pix_fmt_count_planes(s->picture_ptr->format));
+        for (p = 0; p<s->nb_components; p++) {
+            uint8_t *line = s->picture_ptr->data[p];
+            int w = s->width;
+            int h = s->height;
+            if (!s->upscale_h[p])
+                continue;
+            if (p==1 || p==2) {
+                w = AV_CEIL_RSHIFT(w, hshift);
+                h = AV_CEIL_RSHIFT(h, vshift);
+            }
+            if (s->upscale_v[p])
+                h = (h+1)>>1;
+            av_assert0(w > 0);
+            for (i = 0; i < h; i++) {
+                if (s->upscale_h[p] == 1) {
+                    if (is16bit) ((uint16_t*)line)[w - 1] = ((uint16_t*)line)[(w - 1) / 2];
+                    else                      line[w - 1] = line[(w - 1) / 2];
+                    for (index = w - 2; index > 0; index--) {
+                        if (is16bit)
+                            ((uint16_t*)line)[index] = (((uint16_t*)line)[index / 2] + ((uint16_t*)line)[(index + 1) / 2]) >> 1;
+                        else
+                            line[index] = (line[index / 2] + line[(index + 1) / 2]) >> 1;
+                    }
+                } else if (s->upscale_h[p] == 2) {
+                    if (is16bit) {
+                        ((uint16_t*)line)[w - 1] = ((uint16_t*)line)[(w - 1) / 3];
+                        if (w > 1)
+                            ((uint16_t*)line)[w - 2] = ((uint16_t*)line)[w - 1];
+                    } else {
+                        line[w - 1] = line[(w - 1) / 3];
+                        if (w > 1)
+                            line[w - 2] = line[w - 1];
+                    }
+                    for (index = w - 3; index > 0; index--) {
+                        line[index] = (line[index / 3] + line[(index + 1) / 3] + line[(index + 2) / 3] + 1) / 3;
+                    }
+                }
+                line += s->linesize[p];
+            }
+        }
+    }
+    if (AV_RB32(s->upscale_v)) {
+        int p;
+        av_assert0(avctx->pix_fmt == AV_PIX_FMT_YUVJ444P ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUV444P  ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVJ422P ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUV422P  ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVJ420P ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUV420P  ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUV440P  ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVJ440P ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVA444P ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVA420P  ||
+                   avctx->pix_fmt == AV_PIX_FMT_YUVA420P16||
+                   avctx->pix_fmt == AV_PIX_FMT_GBRP     ||
+                   avctx->pix_fmt == AV_PIX_FMT_GBRAP
+                   );
+        avcodec_get_chroma_sub_sample(s->avctx->pix_fmt, &hshift, &vshift);
+        av_assert0(s->nb_components == av_pix_fmt_count_planes(s->picture_ptr->format));
+        for (p = 0; p < s->nb_components; p++) {
+            uint8_t *dst;
+            int w = s->width;
+            int h = s->height;
+            if (!s->upscale_v[p])
+                continue;
+            if (p==1 || p==2) {
+                w = AV_CEIL_RSHIFT(w, hshift);
+                h = AV_CEIL_RSHIFT(h, vshift);
+            }
+            dst = &((uint8_t *)s->picture_ptr->data[p])[(h - 1) * s->linesize[p]];
+            for (i = h - 1; i; i--) {
+                uint8_t *src1 = &((uint8_t *)s->picture_ptr->data[p])[i / 2 * s->linesize[p]];
+                uint8_t *src2 = &((uint8_t *)s->picture_ptr->data[p])[(i + 1) / 2 * s->linesize[p]];
+                if (src1 == src2 || i == h - 1) {
+                    memcpy(dst, src1, w);
+                } else {
+                    for (index = 0; index < w; index++)
+                        dst[index] = (src1[index] + src2[index]) >> 1;
+                }
+                dst -= s->linesize[p];
+            }
+        }
+    }
+    if (s->flipped) {
+        int j;
+        avcodec_get_chroma_sub_sample(s->avctx->pix_fmt, &hshift, &vshift);
+        av_assert0(s->nb_components == av_pix_fmt_count_planes(s->picture_ptr->format));
+        for (index=0; index<s->nb_components; index++) {
+            uint8_t *dst = s->picture_ptr->data[index];
+            int w = s->picture_ptr->width;
+            int h = s->picture_ptr->height;
+            if(index && index<3){
+                w = AV_CEIL_RSHIFT(w, hshift);
+                h = AV_CEIL_RSHIFT(h, vshift);
+            }
+            if(dst){
+                uint8_t *dst2 = dst + s->picture_ptr->linesize[index]*(h-1);
+                for (i=0; i<h/2; i++) {
+                    for (j=0; j<w; j++)
+                        FFSWAP(int, dst[j], dst2[j]);
+                    dst  += s->picture_ptr->linesize[index];
+                    dst2 -= s->picture_ptr->linesize[index];
+                }
+            }
+        }
+    }
+    if (s->adobe_transform == 0 && s->avctx->pix_fmt == AV_PIX_FMT_GBRAP) {
+        int w = s->picture_ptr->width;
+        int h = s->picture_ptr->height;
+        av_assert0(s->nb_components == 4);
+        for (i=0; i<h; i++) {
+            int j;
+            uint8_t *dst[4];
+            for (index=0; index<4; index++) {
+                dst[index] =   s->picture_ptr->data[index]
+                             + s->picture_ptr->linesize[index]*i;
+            }
+            for (j=0; j<w; j++) {
+                int k = dst[3][j];
+                int r = dst[0][j] * k;
+                int g = dst[1][j] * k;
+                int b = dst[2][j] * k;
+                dst[0][j] = g*257 >> 16;
+                dst[1][j] = b*257 >> 16;
+                dst[2][j] = r*257 >> 16;
+                dst[3][j] = 255;
+            }
+        }
+    }
+    if (s->adobe_transform == 2 && s->avctx->pix_fmt == AV_PIX_FMT_YUVA444P) {
+        int w = s->picture_ptr->width;
+        int h = s->picture_ptr->height;
+        av_assert0(s->nb_components == 4);
+        for (i=0; i<h; i++) {
+            int j;
+            uint8_t *dst[4];
+            for (index=0; index<4; index++) {
+                dst[index] =   s->picture_ptr->data[index]
+                             + s->picture_ptr->linesize[index]*i;
+            }
+            for (j=0; j<w; j++) {
+                int k = dst[3][j];
+                int r = (255 - dst[0][j]) * k;
+                int g = (128 - dst[1][j]) * k;
+                int b = (128 - dst[2][j]) * k;
+                dst[0][j] = r*257 >> 16;
+                dst[1][j] = (g*257 >> 16) + 128;
+                dst[2][j] = (b*257 >> 16) + 128;
+                dst[3][j] = 255;
+            }
+        }
+    }
+
+    if (s->stereo3d) {
+        AVStereo3D *stereo = av_stereo3d_create_side_data(data);
+        if (stereo) {
+            stereo->type  = s->stereo3d->type;
+            stereo->flags = s->stereo3d->flags;
+        }
+        av_freep(&s->stereo3d);
+    }
+
+    av_dict_copy(avpriv_frame_get_metadatap(data), s->exif_metadata, 0);
+    av_dict_free(&s->exif_metadata);
+
+the_end_no_picture:
+    av_log(avctx, AV_LOG_DEBUG, "decode frame unused %"PTRDIFF_SPECIFIER" bytes\n",
            buf_end - buf_ptr);
 //  return buf_end - buf_ptr;
     return buf_ptr - buf;
@@ -1678,13 +2474,18 @@ av_cold int ff_mjpeg_decode_end(AVCodecContext *avctx)
     MJpegDecodeContext *s = avctx->priv_data;
     int i, j;
 
+    if (s->interlaced && s->bottom_field == !s->interlace_polarity && s->got_picture && !avctx->frame_number) {
+        av_log(avctx, AV_LOG_INFO, "Single field\n");
+    }
+
     if (s->picture) {
         av_frame_free(&s->picture);
         s->picture_ptr = NULL;
     } else if (s->picture_ptr)
         av_frame_unref(s->picture_ptr);
 
-    av_free(s->buffer);
+    av_freep(&s->buffer);
+    av_freep(&s->stereo3d);
     av_freep(&s->ljpeg_buffer);
     s->ljpeg_buffer_size = 0;
 
@@ -1696,14 +2497,22 @@ av_cold int ff_mjpeg_decode_end(AVCodecContext *avctx)
         av_freep(&s->blocks[i]);
         av_freep(&s->last_nnz[i]);
     }
+    av_dict_free(&s->exif_metadata);
     return 0;
 }
 
+static void decode_flush(AVCodecContext *avctx)
+{
+    MJpegDecodeContext *s = avctx->priv_data;
+    s->got_picture = 0;
+}
+
+#if CONFIG_MJPEG_DECODER
 #define OFFSET(x) offsetof(MJpegDecodeContext, x)
 #define VD AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM
 static const AVOption options[] = {
     { "extern_huff", "Use external huffman table.",
-      OFFSET(extern_huff), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VD },
+      OFFSET(extern_huff), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VD },
     { NULL },
 };
 
@@ -1723,11 +2532,15 @@ AVCodec ff_mjpeg_decoder = {
     .init           = ff_mjpeg_decode_init,
     .close          = ff_mjpeg_decode_end,
     .decode         = ff_mjpeg_decode_frame,
+    .flush          = decode_flush,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
     .priv_class     = &mjpegdec_class,
-    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE |
+                      FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM,
 };
-
+#endif
+#if CONFIG_THP_DECODER
 AVCodec ff_thp_decoder = {
     .name           = "thp",
     .long_name      = NULL_IF_CONFIG_SMALL("Nintendo Gamecube THP video"),
@@ -1737,6 +2550,9 @@ AVCodec ff_thp_decoder = {
     .init           = ff_mjpeg_decode_init,
     .close          = ff_mjpeg_decode_end,
     .decode         = ff_mjpeg_decode_frame,
+    .flush          = decode_flush,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
 };
+#endif
diff --git a/libavcodec/mjpegdec.h b/libavcodec/mjpegdec.h
index b80a47b..fb81129 100644
--- a/libavcodec/mjpegdec.h
+++ b/libavcodec/mjpegdec.h
@@ -4,20 +4,20 @@
  * Copyright (c) 2003 Alex Beregszaszi
  * Copyright (c) 2003-2004 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,6 +31,7 @@
 
 #include "libavutil/log.h"
 #include "libavutil/pixdesc.h"
+#include "libavutil/stereo3d.h"
 
 #include "avcodec.h"
 #include "blockdsp.h"
@@ -61,9 +62,14 @@ typedef struct MJpegDecodeContext {
     int ls;
     int progressive;
     int rgb;
+    uint8_t upscale_h[4];
+    uint8_t upscale_v[4];
     int rct;            /* standard rct */
     int pegasus_rct;    /* pegasus reversible colorspace transform */
     int bits;           /* bits per component */
+    int colr;
+    int xfrm;
+    int adobe_transform;
 
     int maxval;
     int near;         ///< near lossless bound (si 0 for lossless)
@@ -83,6 +89,7 @@ typedef struct MJpegDecodeContext {
     int nb_blocks[MAX_COMPONENTS];
     int h_scount[MAX_COMPONENTS];
     int v_scount[MAX_COMPONENTS];
+    int quant_sindex[MAX_COMPONENTS];
     int h_max, v_max; /* maximum h and v counts */
     int quant_index[4];   /* quant table index for each component */
     int last_dc[MAX_COMPONENTS]; /* last DEQUANTIZED dc (XXX: am I right to do that ?) */
@@ -95,6 +102,7 @@ typedef struct MJpegDecodeContext {
     int16_t (*blocks[MAX_COMPONENTS])[64]; ///< intermediate sums (progressive mode)
     uint8_t *last_nnz[MAX_COMPONENTS];
     uint64_t coefs_finished[MAX_COMPONENTS]; ///< bitmask of which coefs have been completely decoded (progressive mode)
+    int palette_index;
     ScanTable scantable;
     BlockDSPContext bdsp;
     HpelDSPContext hdsp;
@@ -106,6 +114,7 @@ typedef struct MJpegDecodeContext {
     int buggy_avid;
     int cs_itu601;
     int interlace_polarity;
+    int multiscope;
 
     int mjpb_skiptosod;
 
@@ -116,6 +125,9 @@ typedef struct MJpegDecodeContext {
     unsigned int ljpeg_buffer_size;
 
     int extern_huff;
+    AVDictionary *exif_metadata;
+
+    AVStereo3D *stereo3d; ///< stereoscopic information (cached, since it is read before frame allocation)
 
     const AVPixFmtDescriptor *pix_desc;
 } MJpegDecodeContext;
@@ -129,7 +141,8 @@ int ff_mjpeg_decode_dqt(MJpegDecodeContext *s);
 int ff_mjpeg_decode_dht(MJpegDecodeContext *s);
 int ff_mjpeg_decode_sof(MJpegDecodeContext *s);
 int ff_mjpeg_decode_sos(MJpegDecodeContext *s,
-                        const uint8_t *mb_bitmask, const AVFrame *reference);
+                        const uint8_t *mb_bitmask,int mb_bitmask_size,
+                        const AVFrame *reference);
 int ff_mjpeg_find_marker(MJpegDecodeContext *s,
                          const uint8_t **buf_ptr, const uint8_t *buf_end,
                          const uint8_t **unescaped_buf_ptr, int *unescaped_buf_size);
diff --git a/libavcodec/mjpegenc.c b/libavcodec/mjpegenc.c
index 8291113..3d11377 100644
--- a/libavcodec/mjpegenc.c
+++ b/libavcodec/mjpegenc.c
@@ -8,20 +8,20 @@
  * aspecting, new decode_frame mechanism and apple mjpeg-b support
  *                                  by Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,10 +39,44 @@
 #include "mjpeg.h"
 #include "mjpegenc.h"
 
+static uint8_t uni_ac_vlc_len[64 * 64 * 2];
+static uint8_t uni_chroma_ac_vlc_len[64 * 64 * 2];
+
+static av_cold void init_uni_ac_vlc(const uint8_t huff_size_ac[256], uint8_t *uni_ac_vlc_len)
+{
+    int i;
+
+    for (i = 0; i < 128; i++) {
+        int level = i - 64;
+        int run;
+        if (!level)
+            continue;
+        for (run = 0; run < 64; run++) {
+            int len, code, nbits;
+            int alevel = FFABS(level);
+
+            len = (run >> 4) * huff_size_ac[0xf0];
+
+            nbits= av_log2_16bit(alevel) + 1;
+            code = ((15&run) << 4) | nbits;
+
+            len += huff_size_ac[code] + nbits;
+
+            uni_ac_vlc_len[UNI_AC_ENC_INDEX(run, i)] = len;
+            // We ignore EOB as it's just a constant which does not change generally
+        }
+    }
+}
+
 av_cold int ff_mjpeg_encode_init(MpegEncContext *s)
 {
     MJpegContext *m;
 
+    if (s->width > 65500 || s->height > 65500) {
+        av_log(s, AV_LOG_ERROR, "JPEG does not support resolutions above 65500x65500\n");
+        return AVERROR(EINVAL);
+    }
+
     m = av_malloc(sizeof(MJpegContext));
     if (!m)
         return AVERROR(ENOMEM);
@@ -68,13 +102,20 @@ av_cold int ff_mjpeg_encode_init(MpegEncContext *s)
                                  avpriv_mjpeg_bits_ac_chrominance,
                                  avpriv_mjpeg_val_ac_chrominance);
 
+    init_uni_ac_vlc(m->huff_size_ac_luminance,   uni_ac_vlc_len);
+    init_uni_ac_vlc(m->huff_size_ac_chrominance, uni_chroma_ac_vlc_len);
+    s->intra_ac_vlc_length      =
+    s->intra_ac_vlc_last_length = uni_ac_vlc_len;
+    s->intra_chroma_ac_vlc_length      =
+    s->intra_chroma_ac_vlc_last_length = uni_chroma_ac_vlc_len;
+
     s->mjpeg_ctx = m;
     return 0;
 }
 
-void ff_mjpeg_encode_close(MpegEncContext *s)
+av_cold void ff_mjpeg_encode_close(MpegEncContext *s)
 {
-    av_free(s->mjpeg_ctx);
+    av_freep(&s->mjpeg_ctx);
 }
 
 static void encode_block(MpegEncContext *s, int16_t *block, int n)
@@ -120,7 +161,7 @@ static void encode_block(MpegEncContext *s, int16_t *block, int n)
                 mant--;
             }
 
-            nbits= av_log2(val) + 1;
+            nbits= av_log2_16bit(val) + 1;
             code = (run << 4) | nbits;
 
             put_bits(&s->pb, huff_size_ac[code], huff_code_ac[code]);
@@ -135,36 +176,99 @@ static void encode_block(MpegEncContext *s, int16_t *block, int n)
         put_bits(&s->pb, huff_size_ac[0], huff_code_ac[0]);
 }
 
-void ff_mjpeg_encode_mb(MpegEncContext *s, int16_t block[8][64])
+void ff_mjpeg_encode_mb(MpegEncContext *s, int16_t block[12][64])
 {
     int i;
-    for(i=0;i<5;i++) {
-        encode_block(s, block[i], i);
-    }
-    if (s->chroma_format == CHROMA_420) {
+    if (s->chroma_format == CHROMA_444) {
+        encode_block(s, block[0], 0);
+        encode_block(s, block[2], 2);
+        encode_block(s, block[4], 4);
+        encode_block(s, block[8], 8);
         encode_block(s, block[5], 5);
+        encode_block(s, block[9], 9);
+
+        if (16*s->mb_x+8 < s->width) {
+            encode_block(s, block[1], 1);
+            encode_block(s, block[3], 3);
+            encode_block(s, block[6], 6);
+            encode_block(s, block[10], 10);
+            encode_block(s, block[7], 7);
+            encode_block(s, block[11], 11);
+        }
     } else {
-        encode_block(s, block[6], 6);
-        encode_block(s, block[5], 5);
-        encode_block(s, block[7], 7);
+        for(i=0;i<5;i++) {
+            encode_block(s, block[i], i);
+        }
+        if (s->chroma_format == CHROMA_420) {
+            encode_block(s, block[5], 5);
+        } else {
+            encode_block(s, block[6], 6);
+            encode_block(s, block[5], 5);
+            encode_block(s, block[7], 7);
+        }
     }
 
     s->i_tex_bits += get_bits_diff(s);
 }
 
+// maximum over s->mjpeg_vsample[i]
+#define V_MAX 2
+static int amv_encode_picture(AVCodecContext *avctx, AVPacket *pkt,
+                              const AVFrame *pic_arg, int *got_packet)
+
+{
+    MpegEncContext *s = avctx->priv_data;
+    AVFrame *pic;
+    int i, ret;
+    int chroma_h_shift, chroma_v_shift;
+    /* Encode pic_arg via ff_mpv_encode_picture() after reversing the row order of its first three planes. */
+    av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &chroma_h_shift, &chroma_v_shift);
+
+#if FF_API_EMU_EDGE
+    // CODEC_FLAG_EMU_EDGE has to be cleared
+    if(s->avctx->flags & CODEC_FLAG_EMU_EDGE)
+        return AVERROR(EINVAL);
+#endif
+    /* Heights that are not a multiple of 16 are rejected unless strict_std_compliance <= FF_COMPLIANCE_UNOFFICIAL. */
+    if ((avctx->height & 15) && avctx->strict_std_compliance > FF_COMPLIANCE_UNOFFICIAL) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Heights which are not a multiple of 16 might fail with some decoders, "
+               "use vstrict=-1 / -strict -1 to use %d anyway.\n", avctx->height);
+        av_log(avctx, AV_LOG_WARNING, "If you have a device that plays AMV videos, please test if videos "
+               "with such heights work with it and report your findings to ffmpeg-devel@ffmpeg.org\n");
+        return AVERROR_EXPERIMENTAL;
+    }
+
+    pic = av_frame_clone(pic_arg); /* the pointer/stride changes below are made on pic, never on the const pic_arg */
+    if (!pic)
+        return AVERROR(ENOMEM);
+    // picture should be flipped upside-down: move each plane pointer down and negate its stride
+    for(i=0; i < 3; i++) {
+        int vsample = i ? 2 >> chroma_v_shift : 2; /* plane 0 uses 2, planes 1 and 2 use 2 >> chroma_v_shift */
+        pic->data[i] += pic->linesize[i] * (vsample * s->height / V_MAX - 1); /* row index (vsample * height / V_MAX) - 1 */
+        pic->linesize[i] *= -1; /* negative stride: stepping to the "next" row now moves backwards in memory */
+    }
+    ret = ff_mpv_encode_picture(avctx, pkt, pic, got_packet);
+    av_frame_free(&pic); /* freed on both success and failure of the encode call */
+    return ret;
+}
+
 #define OFFSET(x) offsetof(MpegEncContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
+FF_MPV_COMMON_OPTS
 { "pred", "Prediction method", OFFSET(pred), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, 3, VE, "pred" },
     { "left",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 1 }, INT_MIN, INT_MAX, VE, "pred" },
     { "plane",  NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 2 }, INT_MIN, INT_MAX, VE, "pred" },
     { "median", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 3 }, INT_MIN, INT_MAX, VE, "pred" },
 
-    { NULL},
+{ NULL},
 };
 
+#if CONFIG_MJPEG_ENCODER
+
 static const AVClass mjpeg_class = {
-    .class_name = "mjpeg",
+    .class_name = "mjpeg encoder",
     .item_name  = av_default_item_name,
     .option     = options,
     .version    = LIBAVUTIL_VERSION_INT,
@@ -176,11 +280,36 @@ AVCodec ff_mjpeg_encoder = {
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_MJPEG,
     .priv_data_size = sizeof(MpegEncContext),
-    .priv_class     = &mjpeg_class,
     .init           = ff_mpv_encode_init,
     .encode2        = ff_mpv_encode_picture,
     .close          = ff_mpv_encode_end,
+    .capabilities   = AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
+    .pix_fmts       = (const enum AVPixelFormat[]){
+        AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ422P, AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_NONE
+    },
+    .priv_class     = &mjpeg_class,
+};
+#endif
+#if CONFIG_AMV_ENCODER
+static const AVClass amv_class = {
+    .class_name = "amv encoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_amv_encoder = {
+    .name           = "amv",
+    .long_name      = NULL_IF_CONFIG_SMALL("AMV Video"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_AMV,
+    .priv_data_size = sizeof(MpegEncContext),
+    .init           = ff_mpv_encode_init,
+    .encode2        = amv_encode_picture,
+    .close          = ff_mpv_encode_end,
     .pix_fmts       = (const enum AVPixelFormat[]){
-        AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ422P, AV_PIX_FMT_NONE
+        AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_NONE
     },
+    .priv_class     = &amv_class,
 };
+#endif
diff --git a/libavcodec/mjpegenc.h b/libavcodec/mjpegenc.h
index bbb0f0e..60cd566 100644
--- a/libavcodec/mjpegenc.h
+++ b/libavcodec/mjpegenc.h
@@ -8,20 +8,20 @@
  * aspecting, new decode_frame mechanism and apple mjpeg-b support
  *                                  by Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -59,6 +59,6 @@ static inline void put_marker(PutBitContext *p, enum JpegMarker code)
 
 int  ff_mjpeg_encode_init(MpegEncContext *s);
 void ff_mjpeg_encode_close(MpegEncContext *s);
-void ff_mjpeg_encode_mb(MpegEncContext *s, int16_t block[8][64]);
+void ff_mjpeg_encode_mb(MpegEncContext *s, int16_t block[12][64]);
 
 #endif /* AVCODEC_MJPEGENC_H */
diff --git a/libavcodec/mjpegenc_common.c b/libavcodec/mjpegenc_common.c
index 2262de6..7a6fe746 100644
--- a/libavcodec/mjpegenc_common.c
+++ b/libavcodec/mjpegenc_common.c
@@ -1,20 +1,22 @@
 /*
  * lossless JPEG shared bits
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2003 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -54,20 +56,46 @@ static int put_huffman_table(PutBitContext *p, int table_class, int table_id,
     return n + 17;
 }
 
-static void jpeg_table_header(PutBitContext *p, ScanTable *intra_scantable,
-                              uint16_t intra_matrix[64])
+static void jpeg_table_header(AVCodecContext *avctx, PutBitContext *p,
+                              ScanTable *intra_scantable,
+                              uint16_t luma_intra_matrix[64],
+                              uint16_t chroma_intra_matrix[64],
+                              int hsample[3])
 {
     int i, j, size;
     uint8_t *ptr;
-
+    MpegEncContext *s = avctx->priv_data;
+
+    if (avctx->codec_id != AV_CODEC_ID_LJPEG) {
+        int matrix_count = 1 + !!memcmp(luma_intra_matrix,
+                                        chroma_intra_matrix,
+                                        sizeof(luma_intra_matrix[0]) * 64);
+    if (s->force_duplicated_matrix)
+        matrix_count = 2;
     /* quant matrixes */
     put_marker(p, DQT);
-    put_bits(p, 16, 2 + 1 * (1 + 64));
+    put_bits(p, 16, 2 + matrix_count * (1 + 64));
     put_bits(p, 4, 0); /* 8 bit precision */
     put_bits(p, 4, 0); /* table 0 */
     for(i=0;i<64;i++) {
         j = intra_scantable->permutated[i];
-        put_bits(p, 8, intra_matrix[j]);
+        put_bits(p, 8, luma_intra_matrix[j]);
+    }
+
+        if (matrix_count > 1) {
+            put_bits(p, 4, 0); /* 8 bit precision */
+            put_bits(p, 4, 1); /* table 1 */
+            for(i=0;i<64;i++) {
+                j = intra_scantable->permutated[i];
+                put_bits(p, 8, chroma_intra_matrix[j]);
+            }
+        }
+    }
+
+    if(avctx->active_thread_type & FF_THREAD_SLICE){
+        put_marker(p, DRI);
+        put_bits(p, 16, 4);
+        put_bits(p, 16, (avctx->width-1)/(8*hsample[0]) + 1);
     }
 
     /* huffman table */
@@ -94,6 +122,16 @@ static void jpeg_put_comments(AVCodecContext *avctx, PutBitContext *p)
     uint8_t *ptr;
 
     if (avctx->sample_aspect_ratio.num > 0 && avctx->sample_aspect_ratio.den > 0) {
+        AVRational sar = avctx->sample_aspect_ratio;
+
+        if (sar.num > 65535 || sar.den > 65535) {
+            if (!av_reduce(&sar.num, &sar.den, avctx->sample_aspect_ratio.num, avctx->sample_aspect_ratio.den, 65535))
+                av_log(avctx, AV_LOG_WARNING,
+                    "Cannot store exact aspect ratio %d:%d\n",
+                    avctx->sample_aspect_ratio.num,
+                    avctx->sample_aspect_ratio.den);
+        }
+
         /* JFIF header */
         put_marker(p, APP0);
         put_bits(p, 16, 16);
@@ -103,8 +141,8 @@ static void jpeg_put_comments(AVCodecContext *avctx, PutBitContext *p)
          * released revision. */
         put_bits(p, 16, 0x0102);
         put_bits(p,  8, 0);              /* units type: 0 - aspect ratio */
-        put_bits(p, 16, avctx->sample_aspect_ratio.num);
-        put_bits(p, 16, avctx->sample_aspect_ratio.den);
+        put_bits(p, 16, sar.num);
+        put_bits(p, 16, sar.den);
         put_bits(p, 8, 0); /* thumbnail width */
         put_bits(p, 8, 0); /* thumbnail height */
     }
@@ -120,9 +158,10 @@ static void jpeg_put_comments(AVCodecContext *avctx, PutBitContext *p)
         AV_WB16(ptr, size);
     }
 
-    if (avctx->pix_fmt == AV_PIX_FMT_YUV420P ||
-        avctx->pix_fmt == AV_PIX_FMT_YUV422P ||
-        avctx->pix_fmt == AV_PIX_FMT_YUV444P) {
+    if (((avctx->pix_fmt == AV_PIX_FMT_YUV420P ||
+          avctx->pix_fmt == AV_PIX_FMT_YUV422P ||
+          avctx->pix_fmt == AV_PIX_FMT_YUV444P) && avctx->color_range != AVCOL_RANGE_JPEG)
+        || avctx->color_range == AVCOL_RANGE_MPEG) {
         put_marker(p, COM);
         flush_put_bits(p);
         ptr = put_bits_ptr(p);
@@ -133,22 +172,23 @@ static void jpeg_put_comments(AVCodecContext *avctx, PutBitContext *p)
     }
 }
 
-void ff_mjpeg_encode_picture_header(AVCodecContext *avctx, PutBitContext *pb,
-                                    ScanTable *intra_scantable, int pred,
-                                    uint16_t intra_matrix[64])
+void ff_mjpeg_init_hvsample(AVCodecContext *avctx, int hsample[4], int vsample[4])
 {
     int chroma_h_shift, chroma_v_shift;
-    const int lossless = avctx->codec_id != AV_CODEC_ID_MJPEG;
-    int hsample[3], vsample[3];
 
     av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &chroma_h_shift,
                                      &chroma_v_shift);
-
     if (avctx->codec->id == AV_CODEC_ID_LJPEG &&
-        avctx->pix_fmt   == AV_PIX_FMT_BGR24) {
+        (   avctx->pix_fmt == AV_PIX_FMT_BGR0
+         || avctx->pix_fmt == AV_PIX_FMT_BGRA
+         || avctx->pix_fmt == AV_PIX_FMT_BGR24)) {
         vsample[0] = hsample[0] =
         vsample[1] = hsample[1] =
-        vsample[2] = hsample[2] = 1;
+        vsample[2] = hsample[2] =
+        vsample[3] = hsample[3] = 1;
+    } else if (avctx->pix_fmt == AV_PIX_FMT_YUV444P || avctx->pix_fmt == AV_PIX_FMT_YUVJ444P) {
+        vsample[0] = vsample[1] = vsample[2] = 2;
+        hsample[0] = hsample[1] = hsample[2] = 1;
     } else {
         vsample[0] = 2;
         vsample[1] = 2 >> chroma_v_shift;
@@ -157,27 +197,48 @@ void ff_mjpeg_encode_picture_header(AVCodecContext *avctx, PutBitContext *pb,
         hsample[1] = 2 >> chroma_h_shift;
         hsample[2] = 2 >> chroma_h_shift;
     }
+}
+
+void ff_mjpeg_encode_picture_header(AVCodecContext *avctx, PutBitContext *pb,
+                                    ScanTable *intra_scantable, int pred,
+                                    uint16_t luma_intra_matrix[64],
+                                    uint16_t chroma_intra_matrix[64])
+{
+    const int lossless = avctx->codec_id != AV_CODEC_ID_MJPEG && avctx->codec_id != AV_CODEC_ID_AMV;
+    int hsample[4], vsample[4];
+    int i;
+    int components = 3 + (avctx->pix_fmt == AV_PIX_FMT_BGRA);
+    int chroma_matrix = !!memcmp(luma_intra_matrix,
+                                 chroma_intra_matrix,
+                                 sizeof(luma_intra_matrix[0])*64);
+
+    ff_mjpeg_init_hvsample(avctx, hsample, vsample);
 
     put_marker(pb, SOI);
 
+    // hack for AMV mjpeg format
+    if(avctx->codec_id == AV_CODEC_ID_AMV) goto end;
+
     jpeg_put_comments(avctx, pb);
 
-    jpeg_table_header(pb, intra_scantable, intra_matrix);
+    jpeg_table_header(avctx, pb, intra_scantable, luma_intra_matrix, chroma_intra_matrix, hsample);
 
     switch (avctx->codec_id) {
     case AV_CODEC_ID_MJPEG:  put_marker(pb, SOF0 ); break;
     case AV_CODEC_ID_LJPEG:  put_marker(pb, SOF3 ); break;
-    default: assert(0);
+    default: av_assert0(0);
     }
 
     put_bits(pb, 16, 17);
-    if (lossless && avctx->pix_fmt == AV_PIX_FMT_BGR24)
+    if (lossless && (  avctx->pix_fmt == AV_PIX_FMT_BGR0
+                    || avctx->pix_fmt == AV_PIX_FMT_BGRA
+                    || avctx->pix_fmt == AV_PIX_FMT_BGR24))
         put_bits(pb, 8, 9); /* 9 bits/component RCT */
     else
         put_bits(pb, 8, 8); /* 8 bits/component */
     put_bits(pb, 16, avctx->height);
     put_bits(pb, 16, avctx->width);
-    put_bits(pb, 8, 3); /* 3 components */
+    put_bits(pb, 8, components); /* 3 or 4 components */
 
     /* Y component */
     put_bits(pb, 8, 1); /* component number */
@@ -189,18 +250,25 @@ void ff_mjpeg_encode_picture_header(AVCodecContext *avctx, PutBitContext *pb,
     put_bits(pb, 8, 2); /* component number */
     put_bits(pb, 4, hsample[1]); /* H factor */
     put_bits(pb, 4, vsample[1]); /* V factor */
-    put_bits(pb, 8, 0); /* select matrix */
+    put_bits(pb, 8, lossless ? 0 : chroma_matrix); /* select matrix */
 
     /* Cr component */
     put_bits(pb, 8, 3); /* component number */
     put_bits(pb, 4, hsample[2]); /* H factor */
     put_bits(pb, 4, vsample[2]); /* V factor */
-    put_bits(pb, 8, 0); /* select matrix */
+    put_bits(pb, 8, lossless ? 0 : chroma_matrix); /* select matrix */
+
+    if (components == 4) {
+        put_bits(pb, 8, 4); /* component number */
+        put_bits(pb, 4, hsample[3]); /* H factor */
+        put_bits(pb, 4, vsample[3]); /* V factor */
+        put_bits(pb, 8, 0); /* select matrix */
+    }
 
     /* scan header */
     put_marker(pb, SOS);
-    put_bits(pb, 16, 12); /* length */
-    put_bits(pb, 8, 3); /* 3 components */
+    put_bits(pb, 16, 6 + 2*components); /* length */
+    put_bits(pb, 8, components); /* 3 or 4 components */
 
     /* Y component */
     put_bits(pb, 8, 1); /* index */
@@ -217,25 +285,49 @@ void ff_mjpeg_encode_picture_header(AVCodecContext *avctx, PutBitContext *pb,
     put_bits(pb, 4, 1); /* DC huffman table index */
     put_bits(pb, 4, lossless ? 0 : 1); /* AC huffman table index */
 
+    if (components == 4) {
+        /* Alpha component */
+        put_bits(pb, 8, 4); /* index */
+        put_bits(pb, 4, 0); /* DC huffman table index */
+        put_bits(pb, 4, 0); /* AC huffman table index */
+    }
+
     put_bits(pb, 8, lossless ? pred : 0); /* Ss (not used) */
 
     switch (avctx->codec_id) {
     case AV_CODEC_ID_MJPEG:  put_bits(pb, 8, 63); break; /* Se (not used) */
     case AV_CODEC_ID_LJPEG:  put_bits(pb, 8,  0); break; /* not used */
-    default: assert(0);
+    default: av_assert0(0);
     }
 
     put_bits(pb, 8, 0); /* Ah/Al (not used) */
+
+end:
+    if (!lossless) {
+        MpegEncContext *s = avctx->priv_data;
+        av_assert0(avctx->codec->priv_data_size == sizeof(MpegEncContext));
+
+        s->esc_pos = put_bits_count(pb) >> 3;
+        for(i=1; i<s->slice_context_count; i++)
+            s->thread_context[i]->esc_pos = 0;
+    }
 }
 
-static void escape_FF(PutBitContext *pb, int start)
+void ff_mjpeg_escape_FF(PutBitContext *pb, int start)
 {
-    int size = put_bits_count(pb) - start * 8;
+    int size;
     int i, ff_count;
     uint8_t *buf = pb->buf + start;
     int align= (-(size_t)(buf))&3;
+    int pad = (-put_bits_count(pb))&7;
 
-    assert((size&7) == 0);
+    if (pad)
+        put_bits(pb, pad, (1<<pad)-1);
+
+    flush_put_bits(pb);
+    size = put_bits_count(pb) - start * 8;
+
+    av_assert1((size&7) == 0);
     size >>= 3;
 
     ff_count=0;
@@ -280,21 +372,35 @@ static void escape_FF(PutBitContext *pb, int start)
     }
 }
 
-void ff_mjpeg_encode_stuffing(PutBitContext * pbc)
+int ff_mjpeg_encode_stuffing(MpegEncContext *s)
 {
-    int length;
-    length= (-put_bits_count(pbc))&7;
-    if(length) put_bits(pbc, length, (1<<length)-1);
+    int i;
+    PutBitContext *pbc = &s->pb;
+    int mb_y = s->mb_y - !s->mb_x;
+
+    int ret = ff_mpv_reallocate_putbitbuffer(s, put_bits_count(&s->pb) / 8 + 100,
+                                                put_bits_count(&s->pb) / 4 + 1000);
+    if (ret < 0) {
+        av_log(s->avctx, AV_LOG_ERROR, "Buffer reallocation failed\n");
+        goto fail;
+    }
+
+    ff_mjpeg_escape_FF(pbc, s->esc_pos);
+
+    if((s->avctx->active_thread_type & FF_THREAD_SLICE) && mb_y < s->mb_height)
+        put_marker(pbc, RST0 + (mb_y&7));
+    s->esc_pos = put_bits_count(pbc) >> 3;
+fail:
+
+    for(i=0; i<3; i++)
+        s->last_dc[i] = 128 << s->intra_dc_precision;
+
+    return ret;
 }
 
 void ff_mjpeg_encode_picture_trailer(PutBitContext *pb, int header_bits)
 {
-    ff_mjpeg_encode_stuffing(pb);
-    flush_put_bits(pb);
-
-    assert((header_bits & 7) == 0);
-
-    escape_FF(pb, header_bits >> 3);
+    av_assert1((header_bits & 7) == 0);
 
     put_marker(pb, EOI);
 }
diff --git a/libavcodec/mjpegenc_common.h b/libavcodec/mjpegenc_common.h
index 9b5933e..6e51ca0 100644
--- a/libavcodec/mjpegenc_common.h
+++ b/libavcodec/mjpegenc_common.h
@@ -1,20 +1,20 @@
 /*
  * lossless JPEG shared bits
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,13 +25,18 @@
 
 #include "avcodec.h"
 #include "idctdsp.h"
+#include "mpegvideo.h"
 #include "put_bits.h"
 
 void ff_mjpeg_encode_picture_header(AVCodecContext *avctx, PutBitContext *pb,
                                     ScanTable *intra_scantable, int pred,
-                                    uint16_t intra_matrix[64]);
+                                    uint16_t luma_intra_matrix[64],
+                                    uint16_t chroma_intra_matrix[64]);
 void ff_mjpeg_encode_picture_trailer(PutBitContext *pb, int header_bits);
-void ff_mjpeg_encode_stuffing(PutBitContext *pbc);
+void ff_mjpeg_escape_FF(PutBitContext *pb, int start);
+int ff_mjpeg_encode_stuffing(MpegEncContext *s);
+void ff_mjpeg_init_hvsample(AVCodecContext *avctx, int hsample[4], int vsample[4]);
+
 void ff_mjpeg_encode_dc(PutBitContext *pb, int val,
                         uint8_t *huff_size, uint16_t *huff_code);
 
diff --git a/libavcodec/mlp.c b/libavcodec/mlp.c
index 9615b66..87f7c77 100644
--- a/libavcodec/mlp.c
+++ b/libavcodec/mlp.c
@@ -2,20 +2,20 @@
  * MLP codec common code
  * Copyright (c) 2007-2008 Ian Caulfield
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mlp.h b/libavcodec/mlp.h
index 8a1584e..05d8dba 100644
--- a/libavcodec/mlp.h
+++ b/libavcodec/mlp.h
@@ -2,20 +2,20 @@
  * MLP codec common header file
  * Copyright (c) 2007-2008 Ian Caulfield
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mlp_parser.c b/libavcodec/mlp_parser.c
index e8fb4f5..23601c8 100644
--- a/libavcodec/mlp_parser.c
+++ b/libavcodec/mlp_parser.c
@@ -2,20 +2,20 @@
  * MLP parser
  * Copyright (c) 2007 Ian Caulfield
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -44,28 +44,28 @@ static const uint8_t mlp_channels[32] = {
     5, 6, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 };
 
-static const uint64_t mlp_layout[32] = {
+const uint64_t ff_mlp_layout[32] = {
     AV_CH_LAYOUT_MONO,
     AV_CH_LAYOUT_STEREO,
     AV_CH_LAYOUT_2_1,
-    AV_CH_LAYOUT_2_2,
+    AV_CH_LAYOUT_QUAD,
     AV_CH_LAYOUT_STEREO|AV_CH_LOW_FREQUENCY,
     AV_CH_LAYOUT_2_1|AV_CH_LOW_FREQUENCY,
-    AV_CH_LAYOUT_2_2|AV_CH_LOW_FREQUENCY,
+    AV_CH_LAYOUT_QUAD|AV_CH_LOW_FREQUENCY,
     AV_CH_LAYOUT_SURROUND,
     AV_CH_LAYOUT_4POINT0,
-    AV_CH_LAYOUT_5POINT0,
+    AV_CH_LAYOUT_5POINT0_BACK,
     AV_CH_LAYOUT_SURROUND|AV_CH_LOW_FREQUENCY,
     AV_CH_LAYOUT_4POINT0|AV_CH_LOW_FREQUENCY,
-    AV_CH_LAYOUT_5POINT1,
+    AV_CH_LAYOUT_5POINT1_BACK,
     AV_CH_LAYOUT_4POINT0,
-    AV_CH_LAYOUT_5POINT0,
+    AV_CH_LAYOUT_5POINT0_BACK,
     AV_CH_LAYOUT_SURROUND|AV_CH_LOW_FREQUENCY,
     AV_CH_LAYOUT_4POINT0|AV_CH_LOW_FREQUENCY,
-    AV_CH_LAYOUT_5POINT1,
-    AV_CH_LAYOUT_2_2|AV_CH_LOW_FREQUENCY,
-    AV_CH_LAYOUT_5POINT0,
-    AV_CH_LAYOUT_5POINT1,
+    AV_CH_LAYOUT_5POINT1_BACK,
+    AV_CH_LAYOUT_QUAD|AV_CH_LOW_FREQUENCY,
+    AV_CH_LAYOUT_5POINT0_BACK,
+    AV_CH_LAYOUT_5POINT1_BACK,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 
@@ -108,7 +108,7 @@ static int truehd_channels(int chanmap)
     return channels;
 }
 
-static uint64_t truehd_layout(int chanmap)
+uint64_t ff_truehd_layout(int chanmap)
 {
     int i;
     uint64_t layout = 0;
@@ -147,7 +147,7 @@ int ff_mlp_read_major_sync(void *log, MLPHeaderInfo *mh, GetBitContext *gb)
     int ratebits, channel_arrangement, header_size;
     uint16_t checksum;
 
-    assert(get_bits_count(gb) == 0);
+    av_assert1(get_bits_count(gb) == 0);
 
     header_size = ff_mlp_get_major_sync_size(gb->buffer, gb->size_in_bits >> 3);
     if (header_size < 0 || gb->size_in_bits < header_size << 3) {
@@ -177,9 +177,10 @@ int ff_mlp_read_major_sync(void *log, MLPHeaderInfo *mh, GetBitContext *gb)
 
         skip_bits(gb, 11);
 
+        mh->channel_arrangement=
         channel_arrangement    = get_bits(gb, 5);
         mh->channels_mlp       = mlp_channels[channel_arrangement];
-        mh->channel_layout_mlp = mlp_layout[channel_arrangement];
+        mh->channel_layout_mlp = ff_mlp_layout[channel_arrangement];
     } else if (mh->stream_type == 0xba) {
         mh->group1_bits = 24; // TODO: Is this information actually conveyed anywhere?
         mh->group2_bits = 0;
@@ -193,15 +194,16 @@ int ff_mlp_read_major_sync(void *log, MLPHeaderInfo *mh, GetBitContext *gb)
         mh->channel_modifier_thd_stream0 = get_bits(gb, 2);
         mh->channel_modifier_thd_stream1 = get_bits(gb, 2);
 
+        mh->channel_arrangement=
         channel_arrangement            = get_bits(gb, 5);
         mh->channels_thd_stream1       = truehd_channels(channel_arrangement);
-        mh->channel_layout_thd_stream1 = truehd_layout(channel_arrangement);
+        mh->channel_layout_thd_stream1 = ff_truehd_layout(channel_arrangement);
 
         mh->channel_modifier_thd_stream2 = get_bits(gb, 2);
 
         channel_arrangement            = get_bits(gb, 13);
         mh->channels_thd_stream2       = truehd_channels(channel_arrangement);
-        mh->channel_layout_thd_stream2 = truehd_layout(channel_arrangement);
+        mh->channel_layout_thd_stream2 = ff_truehd_layout(channel_arrangement);
     } else
         return AVERROR_INVALIDDATA;
 
@@ -247,6 +249,7 @@ static int mlp_parse(AVCodecParserContext *s,
     int sync_present;
     uint8_t parity_bits;
     int next;
+    int ret;
     int i, p = 0;
 
     *poutbuf_size = 0;
@@ -268,11 +271,15 @@ static int mlp_parse(AVCodecParserContext *s,
         }
 
         if (!mp->in_sync) {
-            ff_combine_frame(&mp->pc, END_NOT_FOUND, &buf, &buf_size);
+            if (ff_combine_frame(&mp->pc, END_NOT_FOUND, &buf, &buf_size) != -1)
+                av_log(avctx, AV_LOG_WARNING, "ff_combine_frame failed\n");
             return buf_size;
         }
 
-        ff_combine_frame(&mp->pc, i - 7, &buf, &buf_size);
+        if ((ret = ff_combine_frame(&mp->pc, i - 7, &buf, &buf_size)) < 0) {
+            av_log(avctx, AV_LOG_WARNING, "ff_combine_frame failed\n");
+            return ret;
+        }
 
         return i - 7;
     }
@@ -286,13 +293,17 @@ static int mlp_parse(AVCodecParserContext *s,
         }
 
         if (mp->pc.index + buf_size < 2) {
-            ff_combine_frame(&mp->pc, END_NOT_FOUND, &buf, &buf_size);
+            if (ff_combine_frame(&mp->pc, END_NOT_FOUND, &buf, &buf_size) != -1)
+                av_log(avctx, AV_LOG_WARNING, "ff_combine_frame failed\n");
             return buf_size;
         }
 
         mp->bytes_left = ((mp->pc.index > 0 ? mp->pc.buffer[0] : buf[0]) << 8)
                        |  (mp->pc.index > 1 ? mp->pc.buffer[1] : buf[1-mp->pc.index]);
         mp->bytes_left = (mp->bytes_left & 0xfff) * 2;
+        if (mp->bytes_left <= 0) { // prevent infinite loop
+            goto lost_sync;
+        }
         mp->bytes_left -= mp->pc.index;
     }
 
@@ -343,6 +354,7 @@ static int mlp_parse(AVCodecParserContext *s,
         avctx->sample_rate = mh.group1_samplerate;
         s->duration = mh.access_unit_size;
 
+        if(!avctx->channels || !avctx->channel_layout) {
         if (mh.stream_type == 0xbb) {
             /* MLP stream */
             if (avctx->request_channel_layout &&
@@ -357,7 +369,7 @@ static int mlp_parse(AVCodecParserContext *s,
             }
         } else { /* mh.stream_type == 0xba */
             /* TrueHD stream */
-                if (avctx->request_channel_layout &&
+            if (avctx->request_channel_layout &&
                     (avctx->request_channel_layout & AV_CH_LAYOUT_STEREO) ==
                     avctx->request_channel_layout &&
                     mh.num_substreams > 1) {
@@ -374,6 +386,7 @@ static int mlp_parse(AVCodecParserContext *s,
                 avctx->channel_layout = mh.channel_layout_thd_stream2;
             }
         }
+        }
 
         if (!mh.is_vbr) /* Stream is CBR */
             avctx->bit_rate = mh.peak_bitrate;
diff --git a/libavcodec/mlp_parser.h b/libavcodec/mlp_parser.h
index 8a2ae13..c5a2883 100644
--- a/libavcodec/mlp_parser.h
+++ b/libavcodec/mlp_parser.h
@@ -2,20 +2,20 @@
  * MLP parser prototypes
  * Copyright (c) 2007 Ian Caulfield
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,6 +40,8 @@ typedef struct MLPHeaderInfo
     int group1_samplerate;                  ///< Sample rate of first substream
     int group2_samplerate;                  ///< Sample rate of second substream (MLP only)
 
+    int channel_arrangement;
+
     int channel_modifier_thd_stream0;       ///< Channel modifier for substream 0 of TrueHD streams ("2-channel presentation")
     int channel_modifier_thd_stream1;       ///< Channel modifier for substream 1 of TrueHD streams ("6-channel presentation")
     int channel_modifier_thd_stream2;       ///< Channel modifier for substream 2 of TrueHD streams ("8-channel presentation")
@@ -62,5 +64,8 @@ typedef struct MLPHeaderInfo
 
 
 int ff_mlp_read_major_sync(void *log, MLPHeaderInfo *mh, GetBitContext *gb);
+uint64_t ff_truehd_layout(int chanmap);
+
+extern const uint64_t ff_mlp_layout[32];
 
 #endif /* AVCODEC_MLP_PARSER_H */
diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c
index 8cfeea6..c93b058 100644
--- a/libavcodec/mlpdec.c
+++ b/libavcodec/mlpdec.c
@@ -2,20 +2,20 @@
  * MLP decoder
  * Copyright (c) 2007-2008 Ian Caulfield
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -105,7 +105,7 @@ typedef struct SubStream {
     /// Whether the LSBs of the matrix output are encoded in the bitstream.
     uint8_t     lsb_bypass[MAX_MATRICES];
     /// Matrix coefficients, stored as 2.14 fixed point.
-    int32_t     matrix_coeff[MAX_MATRICES][MAX_CHANNELS];
+    DECLARE_ALIGNED(32, int32_t, matrix_coeff)[MAX_MATRICES][MAX_CHANNELS];
     /// Left shift to apply to noise values in 0x31eb substreams.
     uint8_t     matrix_noise_shift[MAX_MATRICES];
     //@}
@@ -144,6 +144,9 @@ typedef struct MLPDecodeContext {
     /// Index of the last substream to decode - further substreams are skipped.
     uint8_t     max_decoded_substream;
 
+    /// Stream needs channel reordering to comply with FFmpeg's channel order
+    uint8_t     needs_reordering;
+
     /// number of PCM samples contained in each frame
     int         access_unit_size;
     /// next power of two above the number of samples in each frame
@@ -156,7 +159,7 @@ typedef struct MLPDecodeContext {
 
     int8_t      noise_buffer[MAX_BLOCKSIZE_POW2];
     int8_t      bypassed_lsbs[MAX_BLOCKSIZE][MAX_CHANNELS];
-    int32_t     sample_buffer[MAX_BLOCKSIZE][MAX_CHANNELS];
+    DECLARE_ALIGNED(32, int32_t, sample_buffer)[MAX_BLOCKSIZE][MAX_CHANNELS];
 
     MLPDSPContext dsp;
 } MLPDecodeContext;
@@ -380,10 +383,22 @@ static int read_major_sync(MLPDecodeContext *m, GetBitContext *gb)
      * substream is Stereo. Subsequent substreams' layouts are indicated in the
      * major sync. */
     if (m->avctx->codec_id == AV_CODEC_ID_MLP) {
+        if (mh.stream_type != 0xbb) {
+            avpriv_request_sample(m->avctx,
+                        "unexpected stream_type %X in MLP",
+                        mh.stream_type);
+            return AVERROR_PATCHWELCOME;
+        }
         if ((substr = (mh.num_substreams > 1)))
             m->substream[0].ch_layout = AV_CH_LAYOUT_STEREO;
         m->substream[substr].ch_layout = mh.channel_layout_mlp;
     } else {
+        if (mh.stream_type != 0xba) {
+            avpriv_request_sample(m->avctx,
+                        "unexpected stream_type %X in !MLP",
+                        mh.stream_type);
+            return AVERROR_PATCHWELCOME;
+        }
         if ((substr = (mh.num_substreams > 1)))
             m->substream[0].ch_layout = AV_CH_LAYOUT_STEREO;
         if (mh.num_substreams > 2)
@@ -392,8 +407,17 @@ static int read_major_sync(MLPDecodeContext *m, GetBitContext *gb)
             else
                 m->substream[2].ch_layout = mh.channel_layout_thd_stream1;
         m->substream[substr].ch_layout = mh.channel_layout_thd_stream1;
+
+        if (m->avctx->channels<=2 && m->substream[substr].ch_layout == AV_CH_LAYOUT_MONO && m->max_decoded_substream == 1) {
+            av_log(m->avctx, AV_LOG_DEBUG, "Mono stream with 2 substreams, ignoring 2nd\n");
+            m->max_decoded_substream = 0;
+            if (m->avctx->channels==2)
+                m->avctx->channel_layout = AV_CH_LAYOUT_STEREO;
+        }
     }
 
+    m->needs_reordering = mh.channel_arrangement >= 18 && mh.channel_arrangement <= 20;
+
     /* Parse the TrueHD decoder channel modifiers and set each substream's
      * AVMatrixEncoding accordingly.
      *
@@ -479,7 +503,7 @@ static int read_restart_header(MLPDecodeContext *m, GetBitContext *gbp,
     if (max_matrix_channel > std_max_matrix_channel) {
         av_log(m->avctx, AV_LOG_ERROR,
                "Max matrix channel cannot be greater than %d.\n",
-               max_matrix_channel);
+               std_max_matrix_channel);
         return AVERROR_INVALIDDATA;
     }
 
@@ -491,11 +515,11 @@ static int read_restart_header(MLPDecodeContext *m, GetBitContext *gbp,
 
     /* This should happen for TrueHD streams with >6 channels and MLP's noise
      * type. It is not yet known if this is allowed. */
-    if (s->max_channel > MAX_MATRIX_CHANNEL_MLP && !s->noise_type) {
+    if (max_channel > MAX_MATRIX_CHANNEL_MLP && !s->noise_type) {
         avpriv_request_sample(m->avctx,
                               "%d channels (more than the "
                               "maximum supported by the decoder)",
-                              s->max_channel + 2);
+                              max_channel + 2);
         return AVERROR_PATCHWELCOME;
     }
 
@@ -590,6 +614,20 @@ static int read_restart_header(MLPDecodeContext *m, GetBitContext *gbp,
                                                                s->output_shift,
                                                                s->max_matrix_channel,
                                                                m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);
+
+        if (m->avctx->codec_id == AV_CODEC_ID_MLP && m->needs_reordering) {
+            if (m->avctx->channel_layout == (AV_CH_LAYOUT_QUAD|AV_CH_LOW_FREQUENCY) ||
+                m->avctx->channel_layout == AV_CH_LAYOUT_5POINT0_BACK) {
+                int i = s->ch_assign[4];
+                s->ch_assign[4] = s->ch_assign[3];
+                s->ch_assign[3] = s->ch_assign[2];
+                s->ch_assign[2] = i;
+            } else if (m->avctx->channel_layout == AV_CH_LAYOUT_5POINT1_BACK) {
+                FFSWAP(int, s->ch_assign[2], s->ch_assign[4]);
+                FFSWAP(int, s->ch_assign[3], s->ch_assign[5]);
+            }
+        }
+
     }
 
     return 0;
@@ -608,7 +646,7 @@ static int read_filter_params(MLPDecodeContext *m, GetBitContext *gbp,
     int i, order;
 
     // Filter is 0 for FIR, 1 for IIR.
-    assert(filter < 2);
+    av_assert0(filter < 2);
 
     if (m->filter_changed[channel][filter]++ > 1) {
         av_log(m->avctx, AV_LOG_ERROR, "Filters may change only once per access unit.\n");
@@ -663,7 +701,7 @@ static int read_filter_params(MLPDecodeContext *m, GetBitContext *gbp,
             /* TODO: Check validity of state data. */
 
             for (i = 0; i < order; i++)
-                fp->state[i] = get_sbits(gbp, state_bits) << state_shift;
+                fp->state[i] = state_bits ? get_sbits(gbp, state_bits) << state_shift : 0;
         }
     }
 
@@ -782,6 +820,7 @@ static int read_channel_params(MLPDecodeContext *m, unsigned int substr,
 
     if (cp->huff_lsbs > 24) {
         av_log(m->avctx, AV_LOG_ERROR, "Invalid huff_lsbs.\n");
+        cp->huff_lsbs = 0;
         return AVERROR_INVALIDDATA;
     }
 
@@ -808,7 +847,7 @@ static int read_decoding_params(MLPDecodeContext *m, GetBitContext *gbp,
         if (get_bits1(gbp)) {
             s->blocksize = get_bits(gbp, 9);
             if (s->blocksize < 8 || s->blocksize > m->access_unit_size) {
-                av_log(m->avctx, AV_LOG_ERROR, "Invalid blocksize.");
+                av_log(m->avctx, AV_LOG_ERROR, "Invalid blocksize.\n");
                 s->blocksize = 0;
                 return AVERROR_INVALIDDATA;
             }
@@ -848,7 +887,7 @@ static int read_decoding_params(MLPDecodeContext *m, GetBitContext *gbp,
     return 0;
 }
 
-#define MSB_MASK(bits)  (-1u << bits)
+#define MSB_MASK(bits)  (-1u << (bits))
 
 /** Generate PCM samples using the prediction filters and residual values
  *  read from the data stream, and update the filter state. */
@@ -986,15 +1025,27 @@ static void fill_noise_buffer(MLPDecodeContext *m, unsigned int substr)
     s->noisegen_seed = seed;
 }
 
+/** Write the audio data into the output buffer. */
 
-/** Apply the channel matrices in turn to reconstruct the original audio
- *  samples. */
-
-static void rematrix_channels(MLPDecodeContext *m, unsigned int substr)
+static int output_data(MLPDecodeContext *m, unsigned int substr,
+                       AVFrame *frame, int *got_frame_ptr)
 {
+    AVCodecContext *avctx = m->avctx;
     SubStream *s = &m->substream[substr];
     unsigned int mat;
     unsigned int maxchan;
+    int ret;
+    int is32 = (m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);
+
+    if (m->avctx->channels != s->max_matrix_channel + 1) {
+        av_log(m->avctx, AV_LOG_ERROR, "channel count mismatch\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (!s->blockpos) {
+        av_log(avctx, AV_LOG_ERROR, "No samples to output.\n");
+        return AVERROR_INVALIDDATA;
+    }
 
     maxchan = s->max_matrix_channel;
     if (!s->noise_type) {
@@ -1004,6 +1055,8 @@ static void rematrix_channels(MLPDecodeContext *m, unsigned int substr)
         fill_noise_buffer(m, substr);
     }
 
+    /* Apply the channel matrices in turn to reconstruct the original audio
+     * samples. */
     for (mat = 0; mat < s->num_primitive_matrices; mat++) {
         unsigned int dest_ch = s->matrix_out_ch[mat];
         m->dsp.mlp_rematrix_channel(&m->sample_buffer[0][0],
@@ -1018,34 +1071,11 @@ static void rematrix_channels(MLPDecodeContext *m, unsigned int substr)
                                     m->access_unit_size_pow2,
                                     MSB_MASK(s->quant_step_size[dest_ch]));
     }
-}
-
-/** Write the audio data into the output buffer. */
-
-static int output_data(MLPDecodeContext *m, unsigned int substr,
-                       AVFrame *frame, int *got_frame_ptr)
-{
-    AVCodecContext *avctx = m->avctx;
-    SubStream *s = &m->substream[substr];
-    int ret;
-    int is32 = (m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);
-
-    if (m->avctx->channels != s->max_matrix_channel + 1) {
-        av_log(m->avctx, AV_LOG_ERROR, "channel count mismatch\n");
-        return AVERROR_INVALIDDATA;
-    }
-
-    if (!s->blockpos) {
-        av_log(avctx, AV_LOG_ERROR, "No samples to output.\n");
-        return AVERROR_INVALIDDATA;
-    }
 
     /* get output buffer */
     frame->nb_samples = s->blockpos;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     s->lossless_check_data = m->dsp.mlp_pack_output(s->lossless_check_data,
                                                     s->blockpos,
                                                     m->sample_buffer,
@@ -1085,7 +1115,7 @@ static int read_access_unit(AVCodecContext *avctx, void* data,
     int ret;
 
     if (buf_size < 4)
-        return 0;
+        return AVERROR_INVALIDDATA;
 
     length = (AV_RB16(buf) & 0xfff) * 2;
 
@@ -1248,8 +1278,6 @@ next_substr:
         buf += substream_data_len[substr];
     }
 
-    rematrix_channels(m, m->max_decoded_substream);
-
     if ((ret = output_data(m, m->max_decoded_substream, data, got_frame_ptr)) < 0)
         return ret;
 
@@ -1264,6 +1292,7 @@ error:
     return AVERROR_INVALIDDATA;
 }
 
+#if CONFIG_MLP_DECODER
 AVCodec ff_mlp_decoder = {
     .name           = "mlp",
     .long_name      = NULL_IF_CONFIG_SMALL("MLP (Meridian Lossless Packing)"),
@@ -1274,7 +1303,7 @@ AVCodec ff_mlp_decoder = {
     .decode         = read_access_unit,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
-
+#endif
 #if CONFIG_TRUEHD_DECODER
 AVCodec ff_truehd_decoder = {
     .name           = "truehd",
diff --git a/libavcodec/mlpdsp.c b/libavcodec/mlpdsp.c
index aded554..3ae8c37 100644
--- a/libavcodec/mlpdsp.c
+++ b/libavcodec/mlpdsp.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2007-2008 Ian Caulfield
  *               2009 Ramiro Polla
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mlpdsp.h b/libavcodec/mlpdsp.h
index acd48fc..a0edeb7 100644
--- a/libavcodec/mlpdsp.h
+++ b/libavcodec/mlpdsp.h
@@ -2,20 +2,20 @@
  * MLP codec common header file
  * Copyright (c) 2007-2008 Ian Caulfield
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c
index 193df7e..099a8c5 100644
--- a/libavcodec/mmaldec.c
+++ b/libavcodec/mmaldec.c
@@ -2,20 +2,20 @@
  * MMAL Video Decoder
  * Copyright (c) 2015 Rodger Combs
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -360,6 +360,9 @@ static av_cold int ffmmal_init_decoder(AVCodecContext *avctx)
     case AV_CODEC_ID_MPEG2VIDEO:
         format_in->encoding = MMAL_ENCODING_MP2V;
         break;
+    case AV_CODEC_ID_MPEG4:
+        format_in->encoding = MMAL_ENCODING_MP4V;
+        break;
     case AV_CODEC_ID_VC1:
         format_in->encoding = MMAL_ENCODING_WVC1;
         break;
@@ -795,6 +798,13 @@ AVHWAccel ff_mpeg2_mmal_hwaccel = {
     .pix_fmt    = AV_PIX_FMT_MMAL,
 };
 
+AVHWAccel ff_mpeg4_mmal_hwaccel = {
+    .name       = "mpeg4_mmal",
+    .type       = AVMEDIA_TYPE_VIDEO,
+    .id         = AV_CODEC_ID_MPEG4,
+    .pix_fmt    = AV_PIX_FMT_MMAL,
+};
+
 AVHWAccel ff_vc1_mmal_hwaccel = {
     .name       = "vc1_mmal",
     .type       = AVMEDIA_TYPE_VIDEO,
@@ -837,4 +847,5 @@ static const AVOption options[]={
 
 FFMMAL_DEC(h264, AV_CODEC_ID_H264)
 FFMMAL_DEC(mpeg2, AV_CODEC_ID_MPEG2VIDEO)
+FFMMAL_DEC(mpeg4, AV_CODEC_ID_MPEG4)
 FFMMAL_DEC(vc1, AV_CODEC_ID_VC1)
diff --git a/libavcodec/mmvideo.c b/libavcodec/mmvideo.c
index 0736630..04de6bb 100644
--- a/libavcodec/mmvideo.c
+++ b/libavcodec/mmvideo.c
@@ -2,20 +2,20 @@
  * American Laser Games MM Video Decoder
  * Copyright (c) 2006,2008 Peter Ross
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,7 +49,7 @@
 typedef struct MmContext {
     AVCodecContext *avctx;
     AVFrame *frame;
-    int palette[AVPALETTE_COUNT];
+    unsigned int palette[AVPALETTE_COUNT];
     GetByteContext gb;
 } MmContext;
 
@@ -75,17 +75,15 @@ static av_cold int mm_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
-static int mm_decode_pal(MmContext *s)
+static void mm_decode_pal(MmContext *s)
 {
     int i;
 
     bytestream2_skip(&s->gb, 4);
     for (i = 0; i < 128; i++) {
-        s->palette[i] = bytestream2_get_be24(&s->gb);
+        s->palette[i] = 0xFFU << 24 | bytestream2_get_be24(&s->gb);
         s->palette[i+128] = s->palette[i]<<2;
     }
-
-    return 0;
 }
 
 /**
@@ -99,8 +97,7 @@ static int mm_decode_intra(MmContext * s, int half_horiz, int half_vert)
     while (bytestream2_get_bytes_left(&s->gb) > 0) {
         int run_length, color;
 
-        // writes one more line when half_vert is true
-        if (y >= s->avctx->height + !!half_vert)
+        if (y >= s->avctx->height)
             return 0;
 
         color = bytestream2_get_byte(&s->gb);
@@ -114,12 +111,12 @@ static int mm_decode_intra(MmContext * s, int half_horiz, int half_vert)
         if (half_horiz)
             run_length *=2;
 
-        if (s->avctx->width - x < run_length)
+        if (run_length > s->avctx->width - x)
             return AVERROR_INVALIDDATA;
 
         if (color) {
             memset(s->frame->data[0] + y*s->frame->linesize[0] + x, color, run_length);
-            if (half_vert)
+            if (half_vert && y + half_vert < s->avctx->height)
                 memset(s->frame->data[0] + (y+1)*s->frame->linesize[0] + x, color, run_length);
         }
         x+= run_length;
@@ -133,7 +130,7 @@ static int mm_decode_intra(MmContext * s, int half_horiz, int half_vert)
     return 0;
 }
 
-/*
+/**
  * @param half_horiz Half horizontal resolution (0 or 1)
  * @param half_vert Half vertical resolution (0 or 1)
  */
@@ -204,13 +201,11 @@ static int mm_decode_frame(AVCodecContext *avctx,
     buf_size -= MM_PREAMBLE_SIZE;
     bytestream2_init(&s->gb, buf, buf_size);
 
-    if ((res = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((res = ff_reget_buffer(avctx, s->frame)) < 0)
         return res;
-    }
 
     switch(type) {
-    case MM_TYPE_PALETTE   : res = mm_decode_pal(s); return buf_size;
+    case MM_TYPE_PALETTE   : mm_decode_pal(s); return avpkt->size;
     case MM_TYPE_INTRA     : res = mm_decode_intra(s, 0, 0); break;
     case MM_TYPE_INTRA_HH  : res = mm_decode_intra(s, 1, 0); break;
     case MM_TYPE_INTRA_HHV : res = mm_decode_intra(s, 1, 1); break;
@@ -231,7 +226,7 @@ static int mm_decode_frame(AVCodecContext *avctx,
 
     *got_frame      = 1;
 
-    return buf_size;
+    return avpkt->size;
 }
 
 static av_cold int mm_decode_end(AVCodecContext *avctx)
diff --git a/libavcodec/motion-test.c b/libavcodec/motion-test.c
new file mode 100644
index 0000000..ebcf4aa
--- /dev/null
+++ b/libavcodec/motion-test.c
@@ -0,0 +1,152 @@
+/*
+ * (c) 2001 Fabrice Bellard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * motion test.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "config.h"
+#include "me_cmp.h"
+#include "libavutil/internal.h"
+#include "libavutil/lfg.h"
+#include "libavutil/mem.h"
+#include "libavutil/time.h"
+
+#undef printf
+
+#define WIDTH 64
+#define HEIGHT 64
+
+static uint8_t img1[WIDTH * HEIGHT];
+static uint8_t img2[WIDTH * HEIGHT];
+
+/* Fill tab[0..size) with av_lfg_get() % 256 from an AVLFG freshly seeded with 1. */
+static void fill_random(uint8_t *tab, int size)
+{
+    AVLFG prng;
+    uint8_t *dst = tab;
+    int left;
+    av_lfg_init(&prng, 1);
+    for (left = size; left > 0; left--)
+        *dst++ = av_lfg_get(&prng) % 256;
+}
+
+/* Print the one-line usage string and a short description to stdout. */
+static void help(void)
+{
+    printf("%s%s", "motion-test [-h]\n", "test motion implementations\n");
+}
+
+#define NB_ITS 500
+
+int dummy;
+
+/**
+ * Check test_func against ref_func on random data, then time test_func.
+ *
+ * Mismatches and the measured throughput are printed to stdout.
+ * @param name      label printed before the run
+ * @param test_func implementation under test
+ * @param ref_func  reference implementation test_func is compared with
+ */
+static void test_motion(const char *name,
+                 me_cmp_func test_func, me_cmp_func ref_func)
+{
+    int x, y, d1, d2, it;
+    uint8_t *ptr;
+    int64_t ti;
+    printf("testing '%s'\n", name);
+
+    /* test correctness */
+    for (it = 0; it < 20; it++) {
+        fill_random(img1, WIDTH * HEIGHT);
+        fill_random(img2, WIDTH * HEIGHT);
+        for (y = 0; y < HEIGHT - 17; y++) {
+            for (x = 0; x < WIDTH - 17; x++) {
+                ptr = img2 + y * WIDTH + x;
+                d1 = test_func(NULL, img1, ptr, WIDTH, 8);
+                d2 = ref_func(NULL, img1, ptr, WIDTH, 8);
+                if (d1 != d2)
+                    printf("error: mmx=%d c=%d\n", d1, d2);
+            }
+        }
+    }
+    emms_c();
+
+    /* speed test */
+    ti = av_gettime_relative();
+    d1 = 0;
+    for (it = 0; it < NB_ITS; it++)
+        for (y = 0; y < HEIGHT - 17; y++)
+            for (x = 0; x < WIDTH - 17; x++)
+                d1 += test_func(NULL, img1, img2 + y * WIDTH + x, WIDTH, 8);
+    emms_c();
+    dummy = d1; /* avoid optimization */
+    ti = av_gettime_relative() - ti;
+    /* the timed loops make (WIDTH - 17) * (HEIGHT - 17) calls per iteration */
+    printf("  %0.0f kop/s\n", (double)NB_ITS * (WIDTH - 17) * (HEIGHT - 17) /
+           (double)(ti / 1000.0));
+}
+
+
+/* Compare the pix_abs functions chosen with MMX/MMXEXT CPU flags forced
+ * against those chosen with CPU flags forced to 0; test_motion() prints
+ * mismatches. Any command-line argument prints the usage text and returns 1. */
+int main(int argc, char **argv)
+{
+    AVCodecContext *ctx;
+    int c, x;
+    MECmpContext cctx, mmxctx;
+    int flags[2] = { AV_CPU_FLAG_MMX, AV_CPU_FLAG_MMXEXT };
+    int flags_size = HAVE_MMXEXT ? 2 : 1;
+
+    if (argc > 1) {
+        help();
+        return 1;
+    }
+    printf("ffmpeg motion test\n");
+    ctx = avcodec_alloc_context3(NULL);
+    if (!ctx) /* allocation failed, nothing can be tested */
+        return 1;
+    ctx->flags |= AV_CODEC_FLAG_BITEXACT;
+    av_force_cpu_flags(0);
+    memset(&cctx, 0, sizeof(cctx));
+    ff_me_cmp_init(&cctx, ctx);
+    for (c = 0; c < flags_size; c++) {
+        av_force_cpu_flags(flags[c]);
+        memset(&mmxctx, 0, sizeof(mmxctx));
+        ff_me_cmp_init(&mmxctx, ctx);
+        for (x = 0; x < 2; x++) {
+            printf("%s for %dx%d pixels\n", c ? "mmx2" : "mmx",
+                   x ? 8 : 16, x ? 8 : 16);
+            test_motion("mmx",     mmxctx.pix_abs[x][0], cctx.pix_abs[x][0]);
+            test_motion("mmx_x2",  mmxctx.pix_abs[x][1], cctx.pix_abs[x][1]);
+            test_motion("mmx_y2",  mmxctx.pix_abs[x][2], cctx.pix_abs[x][2]);
+            test_motion("mmx_xy2", mmxctx.pix_abs[x][3], cctx.pix_abs[x][3]);
+        }
+    }
+    avcodec_free_context(&ctx);
+    return 0;
+}
diff --git a/libavcodec/motion_est.c b/libavcodec/motion_est.c
index 1c0116c..257d00b 100644
--- a/libavcodec/motion_est.c
+++ b/libavcodec/motion_est.c
@@ -5,20 +5,20 @@
  *
  * new motion estimation (X1/EPZS) by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,9 +38,6 @@
 #include "mpegutils.h"
 #include "mpegvideo.h"
 
-#undef NDEBUG
-#include <assert.h>
-
 #define P_LEFT P[1]
 #define P_TOP P[2]
 #define P_TOPRIGHT P[3]
@@ -118,7 +115,7 @@ static av_always_inline int cmp_direct_inline(MpegEncContext *s, const int x, co
     uint8_t * const * const src= c->src[src_index];
     int d;
     //FIXME check chroma 4mv, (no crashes ...)
-        assert(x >= c->xmin && hx <= c->xmax<<(qpel+1) && y >= c->ymin && hy <= c->ymax<<(qpel+1));
+        av_assert2(x >= c->xmin && hx <= c->xmax<<(qpel+1) && y >= c->ymin && hy <= c->ymax<<(qpel+1));
         if(x >= c->xmin && hx <= c->xmax<<(qpel+1) && y >= c->ymin && hy <= c->ymax<<(qpel+1)){
             const int time_pp= s->pp_time;
             const int time_pb= s->pb_time;
@@ -160,14 +157,14 @@ static av_always_inline int cmp_direct_inline(MpegEncContext *s, const int x, co
                     c->qpel_avg[1][bxy](c->temp     + 8*stride, ref[8] + (bx>>2) + (by>>2)*stride     + 8*stride, stride);
                     c->qpel_avg[1][bxy](c->temp + 8 + 8*stride, ref[8] + (bx>>2) + (by>>2)*stride + 8 + 8*stride, stride);
                 }else{
-                    assert((fx>>1) + 16*s->mb_x >= -16);
-                    assert((fy>>1) + 16*s->mb_y >= -16);
-                    assert((fx>>1) + 16*s->mb_x <= s->width);
-                    assert((fy>>1) + 16*s->mb_y <= s->height);
-                    assert((bx>>1) + 16*s->mb_x >= -16);
-                    assert((by>>1) + 16*s->mb_y >= -16);
-                    assert((bx>>1) + 16*s->mb_x <= s->width);
-                    assert((by>>1) + 16*s->mb_y <= s->height);
+                    av_assert2((fx>>1) + 16*s->mb_x >= -16);
+                    av_assert2((fy>>1) + 16*s->mb_y >= -16);
+                    av_assert2((fx>>1) + 16*s->mb_x <= s->width);
+                    av_assert2((fy>>1) + 16*s->mb_y <= s->height);
+                    av_assert2((bx>>1) + 16*s->mb_x >= -16);
+                    av_assert2((by>>1) + 16*s->mb_y >= -16);
+                    av_assert2((bx>>1) + 16*s->mb_x <= s->width);
+                    av_assert2((by>>1) + 16*s->mb_y <= s->height);
 
                     c->hpel_put[0][fxy](c->temp, ref[0] + (fx>>1) + (fy>>1)*stride, stride, 16);
                     c->hpel_avg[0][bxy](c->temp, ref[8] + (bx>>1) + (by>>1)*stride, stride, 16);
@@ -186,8 +183,8 @@ static av_always_inline int cmp_inline(MpegEncContext *s, const int x, const int
     const int stride= c->stride;
     const int uvstride= c->uvstride;
     const int dxy= subx + (suby<<(1+qpel)); //FIXME log2_subpel?
-    const int hx= subx + (x<<(1+qpel));
-    const int hy= suby + (y<<(1+qpel));
+    const int hx= subx + x*(1<<(1+qpel));
+    const int hy= suby + y*(1<<(1+qpel));
     uint8_t * const * const ref= c->ref[ref_index];
     uint8_t * const * const src= c->src[src_index];
     int d;
@@ -195,7 +192,13 @@ static av_always_inline int cmp_inline(MpegEncContext *s, const int x, const int
         int uvdxy;              /* no, it might not be used uninitialized */
         if(dxy){
             if(qpel){
-                c->qpel_put[size][dxy](c->temp, ref[0] + x + y*stride, stride); //FIXME prototype (add h)
+                if (h << size == 16) {
+                    c->qpel_put[size][dxy](c->temp, ref[0] + x + y*stride, stride); //FIXME prototype (add h)
+                } else if (size == 0 && h == 8) {
+                    c->qpel_put[1][dxy](c->temp    , ref[0] + x + y*stride    , stride);
+                    c->qpel_put[1][dxy](c->temp + 8, ref[0] + x + y*stride + 8, stride);
+                } else
+                    av_assert2(0);
                 if(chroma){
                     int cx= hx/2;
                     int cy= hy/2;
@@ -305,12 +308,13 @@ int ff_init_me(MpegEncContext *s){
     int cache_size= FFMIN(ME_MAP_SIZE>>ME_MAP_SHIFT, 1<<ME_MAP_SHIFT);
     int dia_size= FFMAX(FFABS(s->avctx->dia_size)&255, FFABS(s->avctx->pre_dia_size)&255);
 
-    if(FFMIN(s->avctx->dia_size, s->avctx->pre_dia_size) < -ME_MAP_SIZE){
+    if(FFMIN(s->avctx->dia_size, s->avctx->pre_dia_size) < -FFMIN(ME_MAP_SIZE, MAX_SAB_SIZE)){
         av_log(s->avctx, AV_LOG_ERROR, "ME_MAP size is too small for SAB diamond\n");
         return -1;
     }
 
 #if FF_API_MOTION_EST
+    //special case of snow is needed because snow uses its own iterative ME code
 FF_DISABLE_DEPRECATION_WARNINGS
     if (s->motion_est == FF_ME_EPZS) {
         if (s->me_method == ME_ZERO)
@@ -319,7 +323,7 @@ FF_DISABLE_DEPRECATION_WARNINGS
             s->motion_est = FF_ME_EPZS;
         else if (s->me_method == ME_X1)
             s->motion_est = FF_ME_XONE;
-        else {
+        else if (s->avctx->codec_id != AV_CODEC_ID_SNOW) {
             av_log(s->avctx, AV_LOG_ERROR,
                    "me_method is only allowed to be set to zero and epzs; "
                    "for hex,umh,full and others see dia_size\n");
@@ -331,6 +335,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
     c->avctx= s->avctx;
 
+    if(s->codec_id == AV_CODEC_ID_H261)
+        c->avctx->me_sub_cmp = c->avctx->me_cmp;
+
     if(cache_size < 2*dia_size && !c->stride){
         av_log(s->avctx, AV_LOG_INFO, "ME_MAP size may be a little small for the selected diamond size\n");
     }
@@ -379,12 +386,14 @@ FF_ENABLE_DEPRECATION_WARNINGS
     /* 8x8 fullpel search would need a 4x4 chroma compare, which we do
      * not have yet, and even if we had, the motion estimation code
      * does not expect it. */
-    if ((c->avctx->me_cmp & FF_CMP_CHROMA) /* && !s->mecc.me_cmp[2] */)
-        s->mecc.me_cmp[2] = zero_cmp;
-    if ((c->avctx->me_sub_cmp & FF_CMP_CHROMA) && !s->mecc.me_sub_cmp[2])
-        s->mecc.me_sub_cmp[2] = zero_cmp;
-    c->hpel_put[2][0]= c->hpel_put[2][1]=
-    c->hpel_put[2][2]= c->hpel_put[2][3]= zero_hpel;
+    if (s->codec_id != AV_CODEC_ID_SNOW) {
+        if ((c->avctx->me_cmp & FF_CMP_CHROMA) /* && !s->mecc.me_cmp[2] */)
+            s->mecc.me_cmp[2] = zero_cmp;
+        if ((c->avctx->me_sub_cmp & FF_CMP_CHROMA) && !s->mecc.me_sub_cmp[2])
+            s->mecc.me_sub_cmp[2] = zero_cmp;
+        c->hpel_put[2][0]= c->hpel_put[2][1]=
+        c->hpel_put[2][2]= c->hpel_put[2][3]= zero_hpel;
+    }
 
     if(s->codec_id == AV_CODEC_ID_H261){
         c->sub_motion_search= no_sub_motion_search;
@@ -410,10 +419,9 @@ static int sad_hpel_motion_search(MpegEncContext * s,
     int mx, my, dminh;
     uint8_t *pix, *ptr;
     int stride= c->stride;
-    const int flags= c->sub_flags;
     LOAD_COMMON
 
-    assert(flags == 0);
+    av_assert2(c->sub_flags == 0);
 
     if(c->skip){
         *mx_ptr = 0;
@@ -433,13 +441,13 @@ static int sad_hpel_motion_search(MpegEncContext * s,
         my > ymin && my < ymax) {
         int dx=0, dy=0;
         int d, pen_x, pen_y;
-        const int index= (my<<ME_MAP_SHIFT) + mx;
+        const int index= my*(1<<ME_MAP_SHIFT) + mx;
         const int t= score_map[(index-(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)];
         const int l= score_map[(index- 1               )&(ME_MAP_SIZE-1)];
         const int r= score_map[(index+ 1               )&(ME_MAP_SIZE-1)];
         const int b= score_map[(index+(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)];
-        mx<<=1;
-        my<<=1;
+        mx += mx;
+        my += my;
 
 
         pen_x= pred_x + mx;
@@ -497,8 +505,8 @@ static int sad_hpel_motion_search(MpegEncContext * s,
         my+=dy;
 
     }else{
-        mx<<=1;
-        my<<=1;
+        mx += mx;
+        my += my;
     }
 
     *mx_ptr = mx;
@@ -537,6 +545,7 @@ static inline void get_limits(MpegEncContext *s, int x, int y)
 {
     MotionEstContext * const c= &s->me;
     int range= c->avctx->me_range >> (1 + !!(c->flags&FLAG_QPEL));
+    int max_range = MAX_MV >> (1 + !!(c->flags&FLAG_QPEL));
 /*
     if(c->avctx->me_range) c->range= c->avctx->me_range >> 1;
     else                   c->range= 16;
@@ -544,8 +553,8 @@ static inline void get_limits(MpegEncContext *s, int x, int y)
     if (s->unrestricted_mv) {
         c->xmin = - x - 16;
         c->ymin = - y - 16;
-        c->xmax = - x + s->mb_width *16;
-        c->ymax = - y + s->mb_height*16;
+        c->xmax = - x + s->width;
+        c->ymax = - y + s->height;
     } else if (s->out_format == FMT_H261){
         // Search range of H.261 is different from other codec standards
         c->xmin = (x > 15) ? - 15 : 0;
@@ -558,6 +567,8 @@ static inline void get_limits(MpegEncContext *s, int x, int y)
         c->xmax = - x + s->mb_width *16 - 16;
         c->ymax = - y + s->mb_height*16 - 16;
     }
+    if(!range || range > max_range)
+        range = max_range;
     if(range){
         c->xmin = FFMAX(c->xmin,-range);
         c->xmax = FFMIN(c->xmax, range);
@@ -584,10 +595,11 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift)
     const int h=8;
     int block;
     int P[10][2];
-    int dmin_sum=0, mx4_sum=0, my4_sum=0;
+    int dmin_sum=0, mx4_sum=0, my4_sum=0, i;
     int same=1;
     const int stride= c->stride;
     uint8_t *mv_penalty= c->current_mv_penalty;
+    int safety_clipping= s->unrestricted_mv && (s->width&15) && (s->height&15);
 
     init_mv4_ref(c);
 
@@ -599,6 +611,11 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift)
         const int mot_stride = s->b8_stride;
         const int mot_xy = s->block_index[block];
 
+        if(safety_clipping){
+            c->xmax = - 16*s->mb_x + s->width  - 8*(block &1);
+            c->ymax = - 16*s->mb_y + s->height - 8*(block>>1);
+        }
+
         P_LEFT[0] = s->current_picture.motion_val[0][mot_xy - 1][0];
         P_LEFT[1] = s->current_picture.motion_val[0][mot_xy - 1][1];
 
@@ -626,6 +643,15 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift)
         }
         P_MV1[0]= mx;
         P_MV1[1]= my;
+        if(safety_clipping)
+            for(i=1; i<10; i++){
+                if (s->first_slice_line && block<2 && i>1 && i<9)
+                    continue;
+                if (i>4 && i<9)
+                    continue;
+                if(P[i][0] > (c->xmax<<shift)) P[i][0]= (c->xmax<<shift);
+                if(P[i][1] > (c->ymax<<shift)) P[i][1]= (c->ymax<<shift);
+            }
 
         dmin4 = epzs_motion_search4(s, &mx4, &my4, P, block, block, s->p_mv_table, (1<<16)>>shift);
 
@@ -760,8 +786,8 @@ static int interlaced_search(MpegEncContext *s, int ref_index,
             int16_t (*mv_table)[2]= mv_tables[block][field_select];
 
             if(user_field_select){
-                assert(field_select==0 || field_select==1);
-                assert(field_select_tables[block][xy]==0 || field_select_tables[block][xy]==1);
+                av_assert1(field_select==0 || field_select==1);
+                av_assert1(field_select_tables[block][xy]==0 || field_select_tables[block][xy]==1);
                 if(field_select_tables[block][xy] != field_select)
                     continue;
             }
@@ -858,6 +884,10 @@ static inline int get_penalty_factor(int lambda, int lambda2, int type){
         return lambda>>FF_LAMBDA_SHIFT;
     case FF_CMP_DCT:
         return (3*lambda)>>(FF_LAMBDA_SHIFT+1);
+    case FF_CMP_W53:
+        return (4*lambda)>>(FF_LAMBDA_SHIFT);
+    case FF_CMP_W97:
+        return (2*lambda)>>(FF_LAMBDA_SHIFT);
     case FF_CMP_SATD:
     case FF_CMP_DCT264:
         return (2*lambda)>>FF_LAMBDA_SHIFT;
@@ -886,14 +916,14 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
 
     init_ref(c, s->new_picture.f->data, s->last_picture.f->data, NULL, 16*mb_x, 16*mb_y, 0);
 
-    assert(s->quarter_sample==0 || s->quarter_sample==1);
-    assert(s->linesize == c->stride);
-    assert(s->uvlinesize == c->uvstride);
+    av_assert0(s->quarter_sample==0 || s->quarter_sample==1);
+    av_assert0(s->linesize == c->stride);
+    av_assert0(s->uvlinesize == c->uvstride);
 
     c->penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_cmp);
     c->sub_penalty_factor= get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_sub_cmp);
     c->mb_penalty_factor = get_penalty_factor(s->lambda, s->lambda2, c->avctx->mb_cmp);
-    c->current_mv_penalty= c->mv_penalty[s->f_code] + MAX_MV;
+    c->current_mv_penalty= c->mv_penalty[s->f_code] + MAX_DMV;
 
     get_limits(s, 16*mb_x, 16*mb_y);
     c->skip=0;
@@ -1057,10 +1087,10 @@ int ff_pre_estimate_p_frame_motion(MpegEncContext * s,
     const int xy= mb_x + mb_y*s->mb_stride;
     init_ref(c, s->new_picture.f->data, s->last_picture.f->data, NULL, 16*mb_x, 16*mb_y, 0);
 
-    assert(s->quarter_sample==0 || s->quarter_sample==1);
+    av_assert0(s->quarter_sample==0 || s->quarter_sample==1);
 
     c->pre_penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_pre_cmp);
-    c->current_mv_penalty= c->mv_penalty[s->f_code] + MAX_MV;
+    c->current_mv_penalty= c->mv_penalty[s->f_code] + MAX_DMV;
 
     get_limits(s, 16*mb_x, 16*mb_y);
     c->skip=0;
@@ -1109,7 +1139,7 @@ static int estimate_motion_b(MpegEncContext *s, int mb_x, int mb_y,
     const int shift= 1+s->quarter_sample;
     const int mot_stride = s->mb_stride;
     const int mot_xy = mb_y*mot_stride + mb_x;
-    uint8_t * const mv_penalty= c->mv_penalty[f_code] + MAX_MV;
+    uint8_t * const mv_penalty= c->mv_penalty[f_code] + MAX_DMV;
     int mv_scale;
 
     c->penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_cmp);
@@ -1173,8 +1203,8 @@ static inline int check_bidir_mv(MpegEncContext * s,
     //FIXME better f_code prediction (max mv & distance)
     //FIXME pointers
     MotionEstContext * const c= &s->me;
-    uint8_t * const mv_penalty_f= c->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
-    uint8_t * const mv_penalty_b= c->mv_penalty[s->b_code] + MAX_MV; // f_code of the prev frame
+    uint8_t * const mv_penalty_f= c->mv_penalty[s->f_code] + MAX_DMV; // f_code of the prev frame
+    uint8_t * const mv_penalty_b= c->mv_penalty[s->b_code] + MAX_DMV; // f_code of the prev frame
     int stride= c->stride;
     uint8_t *dest_y = c->scratchpad;
     uint8_t *ptr;
@@ -1387,7 +1417,7 @@ static inline int direct_search(MpegEncContext * s, int mb_x, int mb_y)
     int mx, my, xmin, xmax, ymin, ymax;
     int16_t (*mv_table)[2]= s->b_direct_mv_table;
 
-    c->current_mv_penalty= c->mv_penalty[1] + MAX_MV;
+    c->current_mv_penalty= c->mv_penalty[1] + MAX_DMV;
     ymin= xmin=(-32)>>shift;
     ymax= xmax=   31>>shift;
 
@@ -1425,7 +1455,7 @@ static inline int direct_search(MpegEncContext * s, int mb_x, int mb_y)
         if(s->mv_type == MV_TYPE_16X16) break;
     }
 
-    assert(xmax <= 15 && ymax <= 15 && xmin >= -16 && ymin >= -16);
+    av_assert2(xmax <= 15 && ymax <= 15 && xmin >= -16 && ymin >= -16);
 
     if(xmax < 0 || xmin >0 || ymax < 0 || ymin > 0){
         s->b_direct_mv_table[mot_xy][0]= 0;
@@ -1523,11 +1553,11 @@ void ff_estimate_b_frame_motion(MpegEncContext * s,
     if (s->avctx->flags & AV_CODEC_FLAG_INTERLACED_ME) {
 //FIXME mb type penalty
         c->skip=0;
-        c->current_mv_penalty= c->mv_penalty[s->f_code] + MAX_MV;
+        c->current_mv_penalty= c->mv_penalty[s->f_code] + MAX_DMV;
         fimin= interlaced_search(s, 0,
                                  s->b_field_mv_table[0], s->b_field_select_table[0],
                                  s->b_forw_mv_table[xy][0], s->b_forw_mv_table[xy][1], 0);
-        c->current_mv_penalty= c->mv_penalty[s->b_code] + MAX_MV;
+        c->current_mv_penalty= c->mv_penalty[s->b_code] + MAX_DMV;
         bimin= interlaced_search(s, 2,
                                  s->b_field_mv_table[1], s->b_field_select_table[1],
                                  s->b_back_mv_table[xy][0], s->b_back_mv_table[xy][1], 0);
@@ -1642,12 +1672,12 @@ void ff_fix_long_p_mvs(MpegEncContext * s)
     MotionEstContext * const c= &s->me;
     const int f_code= s->f_code;
     int y, range;
-    assert(s->pict_type==AV_PICTURE_TYPE_P);
+    av_assert0(s->pict_type==AV_PICTURE_TYPE_P);
 
     range = (((s->out_format == FMT_MPEG1 || s->msmpeg4_version) ? 8 : 16) << f_code);
 
-    assert(range <= 16 || !s->msmpeg4_version);
-    assert(range <=256 || !(s->codec_id == AV_CODEC_ID_MPEG2VIDEO && s->avctx->strict_std_compliance >= FF_COMPLIANCE_NORMAL));
+    av_assert0(range <= 16 || !s->msmpeg4_version);
+    av_assert0(range <=256 || !(s->codec_id == AV_CODEC_ID_MPEG2VIDEO && s->avctx->strict_std_compliance >= FF_COMPLIANCE_NORMAL));
 
     if(c->avctx->me_range && range > c->avctx->me_range) range= c->avctx->me_range;
 
diff --git a/libavcodec/motion_est.h b/libavcodec/motion_est.h
index 3b63972..3b3a8d7 100644
--- a/libavcodec/motion_est.h
+++ b/libavcodec/motion_est.h
@@ -1,25 +1,25 @@
 /*
  * Motion estimation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVCODEC_MOTIONEST_H
-#define AVCODEC_MOTIONEST_H
+#ifndef AVCODEC_MOTION_EST_H
+#define AVCODEC_MOTION_EST_H
 
 #include <stdint.h>
 
@@ -29,7 +29,12 @@
 
 struct MpegEncContext;
 
-#define MAX_MV 2048
+#if ARCH_IA64 // Limit static arrays to avoid gcc failing "short data segment overflowed"
+#define MAX_MV 1024
+#else
+#define MAX_MV 4096
+#endif
+#define MAX_DMV (2*MAX_MV)
 #define ME_MAP_SIZE 64
 
 #define FF_ME_ZERO 0
@@ -77,15 +82,15 @@ typedef struct MotionEstContext {
     int stride;
     int uvstride;
     /* temp variables for picture complexity calculation */
-    int mc_mb_var_sum_temp;
-    int mb_var_sum_temp;
+    int64_t mc_mb_var_sum_temp;
+    int64_t mb_var_sum_temp;
     int scene_change_score;
 
     op_pixels_func(*hpel_put)[4];
     op_pixels_func(*hpel_avg)[4];
     qpel_mc_func(*qpel_put)[16];
     qpel_mc_func(*qpel_avg)[16];
-    uint8_t (*mv_penalty)[MAX_MV * 2 + 1]; ///< bit amount needed to encode a MV
+    uint8_t (*mv_penalty)[MAX_DMV * 2 + 1]; ///< bit amount needed to encode a MV
     uint8_t *current_mv_penalty;
     int (*sub_motion_search)(struct MpegEncContext *s,
                              int *mx_ptr, int *my_ptr, int dmin,
@@ -127,4 +132,4 @@ void ff_fix_long_mvs(struct MpegEncContext *s, uint8_t *field_select_table,
                      int field_select, int16_t (*mv_table)[2], int f_code,
                      int type, int truncate);
 
-#endif /* AVCODEC_MOTIONEST_H */
+#endif /* AVCODEC_MOTION_EST_H */
diff --git a/libavcodec/motion_est_template.c b/libavcodec/motion_est_template.c
index c655e19..0c21bbf 100644
--- a/libavcodec/motion_est_template.c
+++ b/libavcodec/motion_est_template.c
@@ -2,20 +2,20 @@
  * Motion estimation
  * Copyright (c) 2002-2004 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@
  * Motion estimation template.
  */
 
+#include "libavutil/qsort.h"
 #include "mpegvideo.h"
 
 //Let us hope gcc will remove the unused vars ...(gcc 3.2.2 seems to do it ...)
@@ -91,19 +92,18 @@ static int hpel_motion_search(MpegEncContext * s,
         const int b= score_map[(index+(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)]
                      + (mv_penalty[bx   - pred_x] + mv_penalty[by+2 - pred_y])*c->penalty_factor;
 
+#if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 1
         unsigned key;
         unsigned map_generation= c->map_generation;
-#ifndef NDEBUG
-        uint32_t *map= c->map;
-#endif
         key= ((my-1)<<ME_MAP_MV_BITS) + (mx) + map_generation;
-        assert(map[(index-(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)] == key);
+        av_assert2(c->map[(index-(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)] == key);
         key= ((my+1)<<ME_MAP_MV_BITS) + (mx) + map_generation;
-        assert(map[(index+(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)] == key);
+        av_assert2(c->map[(index+(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)] == key);
         key= ((my)<<ME_MAP_MV_BITS) + (mx+1) + map_generation;
-        assert(map[(index+1)&(ME_MAP_SIZE-1)] == key);
+        av_assert2(c->map[(index+1)&(ME_MAP_SIZE-1)] == key);
         key= ((my)<<ME_MAP_MV_BITS) + (mx-1) + map_generation;
-        assert(map[(index-1)&(ME_MAP_SIZE-1)] == key);
+        av_assert2(c->map[(index-1)&(ME_MAP_SIZE-1)] == key);
+#endif
         if(t<=b){
             CHECK_HALF_MV(0, 1, mx  ,my-1)
             if(l<=r){
@@ -143,7 +143,7 @@ static int hpel_motion_search(MpegEncContext * s,
             }
             CHECK_HALF_MV(0, 1, mx  , my)
         }
-        assert(bx >= xmin*2 && bx <= xmax*2 && by >= ymin*2 && by <= ymax*2);
+        av_assert2(bx >= xmin*2 && bx <= xmax*2 && by >= ymin*2 && by <= ymax*2);
     }
 
     *mx_ptr = bx;
@@ -181,9 +181,6 @@ static inline int get_mb_score(MpegEncContext *s, int mx, int my,
     cmp_sub        = s->mecc.mb_cmp[size];
     chroma_cmp_sub = s->mecc.mb_cmp[size + 1];
 
-//    assert(!c->skip);
-//    assert(c->avctx->me_sub_cmp != c->avctx->mb_cmp);
-
     d= cmp(s, mx>>(qpel+1), my>>(qpel+1), mx&mask, my&mask, size, h, ref_index, src_index, cmp_sub, chroma_cmp_sub, flags);
     //FIXME check cbp before adding penalty for (0,0) vector
     if(add_rate && (mx || my || size>0))
@@ -302,7 +299,7 @@ static int qpel_motion_search(MpegEncContext * s,
             const int cy2= b + t - 2*c;
             int cxy;
 
-            if(map[(index-(1<<ME_MAP_SHIFT)-1)&(ME_MAP_SIZE-1)] == (my<<ME_MAP_MV_BITS) + mx + map_generation && 0){ //FIXME
+            if(map[(index-(1<<ME_MAP_SHIFT)-1)&(ME_MAP_SIZE-1)] == ((my-1)<<ME_MAP_MV_BITS) + (mx-1) + map_generation){
                 tl= score_map[(index-(1<<ME_MAP_SHIFT)-1)&(ME_MAP_SIZE-1)];
             }else{
                 tl= cmp(s, mx-1, my-1, 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);//FIXME wrong if chroma me is different
@@ -310,11 +307,11 @@ static int qpel_motion_search(MpegEncContext * s,
 
             cxy= 2*tl + (cx + cy)/4 - (cx2 + cy2) - 2*c;
 
-            assert(16*cx2 + 4*cx + 32*c == 32*r);
-            assert(16*cx2 - 4*cx + 32*c == 32*l);
-            assert(16*cy2 + 4*cy + 32*c == 32*b);
-            assert(16*cy2 - 4*cy + 32*c == 32*t);
-            assert(16*cxy + 16*cy2 + 16*cx2 - 4*cy - 4*cx + 32*c == 32*tl);
+            av_assert2(16*cx2 + 4*cx + 32*c == 32*r);
+            av_assert2(16*cx2 - 4*cx + 32*c == 32*l);
+            av_assert2(16*cy2 + 4*cy + 32*c == 32*b);
+            av_assert2(16*cy2 - 4*cy + 32*c == 32*t);
+            av_assert2(16*cxy + 16*cy2 + 16*cx2 - 4*cy - 4*cx + 32*c == 32*tl);
 
             for(ny= -3; ny <= 3; ny++){
                 for(nx= -3; nx <= 3; nx++){
@@ -347,7 +344,7 @@ static int qpel_motion_search(MpegEncContext * s,
             CHECK_QUARTER_MV(nx&3, ny&3, nx>>2, ny>>2)
         }
 
-        assert(bx >= xmin*4 && bx <= xmax*4 && by >= ymin*4 && by <= ymax*4);
+        av_assert2(bx >= xmin*4 && bx <= xmax*4 && by >= ymin*4 && by <= ymax*4);
 
         *mx_ptr = bx;
         *my_ptr = by;
@@ -362,17 +359,17 @@ static int qpel_motion_search(MpegEncContext * s,
 
 #define CHECK_MV(x,y)\
 {\
-    const unsigned key = ((y)<<ME_MAP_MV_BITS) + (x) + map_generation;\
-    const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\
-    assert((x) >= xmin);\
-    assert((x) <= xmax);\
-    assert((y) >= ymin);\
-    assert((y) <= ymax);\
+    const unsigned key = ((unsigned)(y)<<ME_MAP_MV_BITS) + (x) + map_generation;\
+    const int index= (((unsigned)(y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\
+    av_assert2((x) >= xmin);\
+    av_assert2((x) <= xmax);\
+    av_assert2((y) >= ymin);\
+    av_assert2((y) <= ymax);\
     if(map[index]!=key){\
         d= cmp(s, x, y, 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);\
         map[index]= key;\
         score_map[index]= d;\
-        d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*penalty_factor;\
+        d += (mv_penalty[((x)*(1<<shift))-pred_x] + mv_penalty[((y)*(1<<shift))-pred_y])*penalty_factor;\
         COPY3_IF_LT(dmin, d, best[0], x, best[1], y)\
     }\
 }
@@ -388,13 +385,13 @@ static int qpel_motion_search(MpegEncContext * s,
 
 #define CHECK_MV_DIR(x,y,new_dir)\
 {\
-    const unsigned key = ((y)<<ME_MAP_MV_BITS) + (x) + map_generation;\
-    const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\
+    const unsigned key = ((unsigned)(y)<<ME_MAP_MV_BITS) + (x) + map_generation;\
+    const int index= (((unsigned)(y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\
     if(map[index]!=key){\
         d= cmp(s, x, y, 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);\
         map[index]= key;\
         score_map[index]= d;\
-        d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*penalty_factor;\
+        d += (mv_penalty[(int)((unsigned)(x)<<shift)-pred_x] + mv_penalty[(int)((unsigned)(y)<<shift)-pred_y])*penalty_factor;\
         if(d<dmin){\
             best[0]=x;\
             best[1]=y;\
@@ -405,10 +402,10 @@ static int qpel_motion_search(MpegEncContext * s,
 }
 
 #define check(x,y,S,v)\
-if( (x)<(xmin<<(S)) ) printf("%d %d %d %d %d xmin" #v, xmin, (x), (y), s->mb_x, s->mb_y);\
-if( (x)>(xmax<<(S)) ) printf("%d %d %d %d %d xmax" #v, xmax, (x), (y), s->mb_x, s->mb_y);\
-if( (y)<(ymin<<(S)) ) printf("%d %d %d %d %d ymin" #v, ymin, (x), (y), s->mb_x, s->mb_y);\
-if( (y)>(ymax<<(S)) ) printf("%d %d %d %d %d ymax" #v, ymax, (x), (y), s->mb_x, s->mb_y);\
+if( (x)<(xmin<<(S)) ) av_log(NULL, AV_LOG_ERROR, "%d %d %d %d %d xmin" #v, xmin, (x), (y), s->mb_x, s->mb_y);\
+if( (x)>(xmax<<(S)) ) av_log(NULL, AV_LOG_ERROR, "%d %d %d %d %d xmax" #v, xmax, (x), (y), s->mb_x, s->mb_y);\
+if( (y)<(ymin<<(S)) ) av_log(NULL, AV_LOG_ERROR, "%d %d %d %d %d ymin" #v, ymin, (x), (y), s->mb_x, s->mb_y);\
+if( (y)>(ymax<<(S)) ) av_log(NULL, AV_LOG_ERROR, "%d %d %d %d %d ymax" #v, ymax, (x), (y), s->mb_x, s->mb_y);\
 
 #define LOAD_COMMON2\
     uint32_t *map= c->map;\
@@ -430,8 +427,8 @@ static av_always_inline int small_diamond_search(MpegEncContext * s, int *best,
     chroma_cmpf = s->mecc.me_cmp[size + 1];
 
     { /* ensure that the best point is in the MAP as h/qpel refinement needs it */
-        const unsigned key = (best[1]<<ME_MAP_MV_BITS) + best[0] + map_generation;
-        const int index= ((best[1]<<ME_MAP_SHIFT) + best[0])&(ME_MAP_SIZE-1);
+        const unsigned key = ((unsigned)best[1]<<ME_MAP_MV_BITS) + best[0] + map_generation;
+        const int index= (((unsigned)best[1]<<ME_MAP_SHIFT) + best[0])&(ME_MAP_SIZE-1);
         if (map[index] != key) { // this will be executed only very rarely
             score_map[index]= cmp(s, best[0], best[1], 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);
             map[index]= key;
@@ -693,6 +690,8 @@ static int sab_diamond_search(MpegEncContext * s, int *best, int dmin,
     LOAD_COMMON2
     unsigned map_generation = c->map_generation;
 
+    av_assert1(minima_count <= MAX_SAB_SIZE);
+
     cmpf        = s->mecc.me_cmp[size];
     chroma_cmpf = s->mecc.me_cmp[size + 1];
 
@@ -725,7 +724,7 @@ static int sab_diamond_search(MpegEncContext * s, int *best, int dmin,
         j++;
     }
 
-    qsort(minima, j, sizeof(Minima), minima_cmp);
+    AV_QSORT(minima, j, Minima, minima_cmp);
 
     for(; j<minima_count; j++){
         minima[j].height=256*256*256*64;
@@ -890,7 +889,7 @@ static av_always_inline int epzs_motion_search_internal(MpegEncContext * s, int
 
     map_generation= update_map_generation(c);
 
-    assert(cmpf);
+    av_assert2(cmpf);
     dmin= cmp(s, 0, 0, 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);
     map[0]= map_generation;
     score_map[0]= dmin;
diff --git a/libavcodec/motionpixels.c b/libavcodec/motionpixels.c
index a18541b..a88b837 100644
--- a/libavcodec/motionpixels.c
+++ b/libavcodec/motionpixels.c
@@ -2,20 +2,20 @@
  * Motion Pixels Video Decoder
  * Copyright (c) 2008 Gregory Montoir (cyx@users.sourceforge.net)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -69,13 +69,24 @@ static av_cold int mp_decode_init(AVCodecContext *avctx)
     int w4 = (avctx->width  + 3) & ~3;
     int h4 = (avctx->height + 3) & ~3;
 
+    if(avctx->extradata_size < 2){
+        av_log(avctx, AV_LOG_ERROR, "extradata too small\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     motionpixels_tableinit();
     mp->avctx = avctx;
     ff_bswapdsp_init(&mp->bdsp);
-    mp->changes_map = av_mallocz(avctx->width * h4);
+    mp->changes_map = av_mallocz_array(avctx->width, h4);
     mp->offset_bits_len = av_log2(avctx->width * avctx->height) + 1;
-    mp->vpt = av_mallocz(avctx->height * sizeof(YuvPixel));
-    mp->hpt = av_mallocz(h4 * w4 / 16 * sizeof(YuvPixel));
+    mp->vpt = av_mallocz_array(avctx->height, sizeof(YuvPixel));
+    mp->hpt = av_mallocz_array(h4 / 4, w4 / 4 * sizeof(YuvPixel));
+    if (!mp->changes_map || !mp->vpt || !mp->hpt) {
+        av_freep(&mp->changes_map);
+        av_freep(&mp->vpt);
+        av_freep(&mp->hpt);
+        return AVERROR(ENOMEM);
+    }
     avctx->pix_fmt = AV_PIX_FMT_RGB555;
 
     mp->frame = av_frame_alloc();
@@ -116,38 +127,48 @@ static void mp_read_changes_map(MotionPixelsContext *mp, GetBitContext *gb, int
     }
 }
 
-static void mp_get_code(MotionPixelsContext *mp, GetBitContext *gb, int size, int code)
+static int mp_get_code(MotionPixelsContext *mp, GetBitContext *gb, int size, int code)
 {
     while (get_bits1(gb)) {
         ++size;
         if (size > mp->max_codes_bits) {
             av_log(mp->avctx, AV_LOG_ERROR, "invalid code size %d/%d\n", size, mp->max_codes_bits);
-            return;
+            return AVERROR_INVALIDDATA;
         }
         code <<= 1;
-        mp_get_code(mp, gb, size, code + 1);
+        if (mp_get_code(mp, gb, size, code + 1) < 0)
+            return AVERROR_INVALIDDATA;
     }
     if (mp->current_codes_count >= MAX_HUFF_CODES) {
         av_log(mp->avctx, AV_LOG_ERROR, "too many codes\n");
-        return;
+        return AVERROR_INVALIDDATA;
     }
+
     mp->codes[mp->current_codes_count  ].code = code;
     mp->codes[mp->current_codes_count++].size = size;
+    return 0;
 }
 
-static void mp_read_codes_table(MotionPixelsContext *mp, GetBitContext *gb)
+static int mp_read_codes_table(MotionPixelsContext *mp, GetBitContext *gb)
 {
     if (mp->codes_count == 1) {
         mp->codes[0].delta = get_bits(gb, 4);
     } else {
         int i;
+        int ret;
 
         mp->max_codes_bits = get_bits(gb, 4);
         for (i = 0; i < mp->codes_count; ++i)
             mp->codes[i].delta = get_bits(gb, 4);
         mp->current_codes_count = 0;
-        mp_get_code(mp, gb, 0, 0);
+        if ((ret = mp_get_code(mp, gb, 0, 0)) < 0)
+            return ret;
+        if (mp->current_codes_count < mp->codes_count) {
+            av_log(mp->avctx, AV_LOG_ERROR, "too few codes\n");
+            return AVERROR_INVALIDDATA;
+        }
    }
+   return 0;
 }
 
 static int mp_gradient(MotionPixelsContext *mp, int component, int v)
@@ -180,7 +201,6 @@ static int mp_get_vlc(MotionPixelsContext *mp, GetBitContext *gb)
     int i;
 
     i = (mp->codes_count == 1) ? 0 : get_vlc2(gb, mp->vlc.table, mp->max_codes_bits, 1);
-    i = FFMIN(i, FF_ARRAY_ELEMS(mp->codes) - 1);
     return mp->codes[i].delta;
 }
 
@@ -236,6 +256,8 @@ static void mp_decode_frame_helper(MotionPixelsContext *mp, GetBitContext *gb)
     YuvPixel p;
     int y, y0;
 
+    av_assert1(mp->changes_map[0]);
+
     for (y = 0; y < mp->avctx->height; ++y) {
         if (mp->changes_map[y * mp->avctx->width] != 0) {
             memset(mp->gradient_scale, 1, sizeof(mp->gradient_scale));
@@ -268,20 +290,17 @@ static int mp_decode_frame(AVCodecContext *avctx,
     GetBitContext gb;
     int i, count1, count2, sz, ret;
 
-    if ((ret = ff_reget_buffer(avctx, mp->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, mp->frame)) < 0)
         return ret;
-    }
 
     /* le32 bitstream msb first */
-    av_fast_malloc(&mp->bswapbuf, &mp->bswapbuf_size, buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
+    av_fast_padded_malloc(&mp->bswapbuf, &mp->bswapbuf_size, buf_size);
     if (!mp->bswapbuf)
         return AVERROR(ENOMEM);
     mp->bdsp.bswap_buf((uint32_t *) mp->bswapbuf, (const uint32_t *) buf,
                        buf_size / 4);
     if (buf_size & 3)
         memcpy(mp->bswapbuf + (buf_size & ~3), buf + (buf_size & ~3), buf_size & 3);
-    memset(mp->bswapbuf + buf_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
     init_get_bits(&gb, mp->bswapbuf, buf_size * 8);
 
     memset(mp->changes_map, 0, avctx->width * avctx->height);
@@ -300,7 +319,8 @@ static int mp_decode_frame(AVCodecContext *avctx,
         *(uint16_t *)mp->frame->data[0] = get_bits(&gb, 15);
         mp->changes_map[0] = 1;
     }
-    mp_read_codes_table(mp, &gb);
+    if (mp_read_codes_table(mp, &gb) < 0)
+        goto end;
 
     sz = get_bits(&gb, 18);
     if (avctx->extradata[0] != 5)
diff --git a/libavcodec/motionpixels_tablegen.c b/libavcodec/motionpixels_tablegen.c
index 2f0df3c..1bebaf1 100644
--- a/libavcodec/motionpixels_tablegen.c
+++ b/libavcodec/motionpixels_tablegen.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/motionpixels_tablegen.h b/libavcodec/motionpixels_tablegen.h
index 2d0c0ff..9239b6a 100644
--- a/libavcodec/motionpixels_tablegen.h
+++ b/libavcodec/motionpixels_tablegen.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,13 +24,14 @@
 #define AVCODEC_MOTIONPIXELS_TABLEGEN_H
 
 #include <stdint.h>
+#include "libavutil/attributes.h"
 
 typedef struct YuvPixel {
     int8_t y, v, u;
 } YuvPixel;
 
 static int mp_yuv_to_rgb(int y, int v, int u, int clip_rgb) {
-    static const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
+    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
     int r, g, b;
 
     r = (1000 * y + 701 * v) / 1000;
@@ -49,7 +50,7 @@ static int mp_yuv_to_rgb(int y, int v, int u, int clip_rgb) {
 #else
 static YuvPixel mp_rgb_yuv_table[1 << 15];
 
-static void mp_set_zero_yuv(YuvPixel *p)
+static av_cold void mp_set_zero_yuv(YuvPixel *p)
 {
     int i, j;
 
@@ -63,7 +64,7 @@ static void mp_set_zero_yuv(YuvPixel *p)
     }
 }
 
-static void mp_build_rgb_yuv_table(YuvPixel *p)
+static av_cold void mp_build_rgb_yuv_table(YuvPixel *p)
 {
     int y, v, u, i;
 
@@ -81,7 +82,7 @@ static void mp_build_rgb_yuv_table(YuvPixel *p)
         mp_set_zero_yuv(p + i * 32);
 }
 
-static void motionpixels_tableinit(void)
+static av_cold void motionpixels_tableinit(void)
 {
     if (!mp_rgb_yuv_table[0].u)
         mp_build_rgb_yuv_table(mp_rgb_yuv_table);
diff --git a/libavcodec/movsub_bsf.c b/libavcodec/movsub_bsf.c
index fc6b236..3cb1183 100644
--- a/libavcodec/movsub_bsf.c
+++ b/libavcodec/movsub_bsf.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 Reimar Döffinger
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/movtextdec.c b/libavcodec/movtextdec.c
new file mode 100644
index 0000000..abf8711
--- /dev/null
+++ b/libavcodec/movtextdec.c
@@ -0,0 +1,535 @@
+/*
+ * 3GPP TS 26.245 Timed Text decoder
+ * Copyright (c) 2012  Philip Langdale <philipl@overt.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "ass.h"
+#include "libavutil/avstring.h"
+#include "libavutil/common.h"
+#include "libavutil/bprint.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem.h"
+
+/* Bits of the face-style-flags byte of a style record (StyleBox.style_flag). */
+#define STYLE_FLAG_BOLD         (1<<0)
+#define STYLE_FLAG_ITALIC       (1<<1)
+#define STYLE_FLAG_UNDERLINE    (1<<2)
+
+/* Number of tx3g extradata bytes up to and including the font table entry
+ * count; mov_text_tx3g() refuses shorter extradata. */
+#define BOX_SIZE_INITIAL    40
+
+/* Bits of MovTextContext.box_flags, set by the matching decode_*() function
+ * when a box of that type was found in the current packet. */
+#define STYL_BOX   (1<<0)
+#define HLIT_BOX   (1<<1)
+#define HCLR_BOX   (1<<2)
+#define TWRP_BOX   (1<<3)
+
+/* Values stored in MovTextDefault.alignment and handed to
+ * ff_ass_subtitle_header(); presumably ASS numpad-style alignment codes
+ * -- TODO confirm against ass.c. */
+#define BOTTOM_LEFT     1
+#define BOTTOM_CENTER   2
+#define BOTTOM_RIGHT    3
+#define MIDDLE_LEFT     4
+#define MIDDLE_CENTER   5
+#define MIDDLE_RIGHT    6
+#define TOP_LEFT        7
+#define TOP_CENTER      8
+#define TOP_RIGHT       9
+
+/* Default text style read from the tx3g sample entry by mov_text_tx3g(). */
+typedef struct {
+    char *font;         /* points at a FontRecord name owned by the font table */
+    int fontsize;
+    int color;          /* first three bytes of text-color-rgba */
+    int back_color;     /* first three bytes of background-color-rgba */
+    int bold;           /* non-zero when the corresponding style flag is set */
+    int italic;
+    int underline;
+    int alignment;      /* one of the BOTTOM_..., MIDDLE_..., TOP_... values */
+} MovTextDefault;
+
+/* One font table entry: font ID plus heap-allocated, NUL-terminated name. */
+typedef struct {
+    uint16_t fontID;
+    char *font;
+} FontRecord;
+
+/* One style record of a 'styl' box as kept by decode_styl(). */
+typedef struct {
+    uint16_t style_start;   /* text position where the style begins */
+    uint16_t style_end;     /* text position where the style ends */
+    uint8_t style_flag;     /* combination of STYLE_FLAG_* bits */
+    uint8_t fontsize;
+    uint16_t style_fontID;  /* matched against FontRecord.fontID */
+} StyleBox;
+
+/* Payload of an 'hlit' box: highlighted text range. */
+typedef struct {
+    uint16_t hlit_start;
+    uint16_t hlit_end;
+} HighlightBox;
+
+/* Payload of an 'hclr' box: the four colour bytes copied verbatim. */
+typedef struct {
+   uint8_t hlit_color[4];
+} HilightcolorBox;
+
+/* Payload of a 'twrp' box; text_to_ass() emits \q1 when the flag is 1 and
+ * \q2 otherwise. */
+typedef struct {
+    uint8_t wrap_flag;
+} TextWrapBox;
+
+/* Decoder private context. */
+typedef struct {
+    StyleBox **s;               /* style records of the current packet */
+    StyleBox *s_temp;           /* record being filled in decode_styl() */
+    HighlightBox h;
+    HilightcolorBox c;
+    FontRecord **ftab;          /* font table parsed from extradata */
+    FontRecord *ftab_temp;      /* record being filled in mov_text_tx3g() */
+    TextWrapBox w;
+    MovTextDefault d;
+    uint8_t box_flags;          /* *_BOX bits seen in the current packet */
+    uint16_t style_entries, ftab_entries;   /* entry counts as read from the bitstream */
+    uint64_t tracksize;         /* offset of the box being parsed within the packet */
+    int size_var;               /* header size of the current box: 8 or 16 */
+    int count_s, count_f;       /* number of elements stored in s and ftab */
+    int readorder;              /* running counter passed to ff_ass_add_rect() */
+} MovTextContext;
+
+/* Entry of the box dispatch table used by mov_text_decode_frame(). */
+typedef struct {
+    uint32_t type;      /* big-endian four character code of the box */
+    size_t base_size;   /* payload bytes that must be present before decode is called */
+    int (*decode)(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt);
+} Box;
+
+/**
+ * Free the style records collected for the current packet.
+ *
+ * Nothing is done unless a 'styl' box was seen (STYL_BOX set in box_flags).
+ * The element counters are reset together with the array so that neither a
+ * second cleanup nor text_to_ass() can index the freed array.
+ */
+static void mov_text_cleanup(MovTextContext *m)
+{
+    int i;
+
+    if (!(m->box_flags & STYL_BOX))
+        return;
+
+    if (m->s) {
+        for (i = 0; i < m->count_s; i++)
+            av_freep(&m->s[i]);
+    }
+    av_freep(&m->s);
+    m->count_s       = 0;
+    m->style_entries = 0;
+}
+
+/**
+ * Free the font table, including a record that is still being built.
+ *
+ * Both counters are cleared along with the array: text_to_ass() loops over
+ * ftab_entries, so a stale count after a failed init would make it index a
+ * NULL table.
+ */
+static void mov_text_cleanup_ftab(MovTextContext *m)
+{
+    int i;
+
+    /* Record under construction, not yet stored in the table. */
+    if (m->ftab_temp)
+        av_freep(&m->ftab_temp->font);
+    av_freep(&m->ftab_temp);
+
+    if (m->ftab) {
+        for (i = 0; i < m->count_f; i++) {
+            av_freep(&m->ftab[i]->font);
+            av_freep(&m->ftab[i]);
+        }
+    }
+    av_freep(&m->ftab);
+    m->count_f      = 0;
+    m->ftab_entries = 0;
+}
+
+/**
+ * Parse the tx3g sample entry stored in avctx->extradata.
+ *
+ * Fills m->d with the default style and builds the font table
+ * (m->ftab, m->count_f, m->ftab_entries).
+ *
+ * @return 0 on success, -1 when the extradata is too short,
+ *         AVERROR(ENOMEM) on allocation failure. On any failure the font
+ *         table is released and ftab_entries is reset to 0.
+ */
+static int mov_text_tx3g(AVCodecContext *avctx, MovTextContext *m)
+{
+    uint8_t *tx3g_ptr = avctx->extradata;
+    int i, box_size, font_length;
+    int8_t v_align, h_align;
+    int style_fontID;
+    uint8_t style_flags;
+
+    m->count_f = 0;
+    m->ftab_entries = 0;
+    box_size = BOX_SIZE_INITIAL; /* Size till ftab_entries */
+    if (avctx->extradata_size < box_size)
+        return -1;
+
+    // Display Flags
+    tx3g_ptr += 4;
+    // Alignment
+    h_align = *tx3g_ptr++;
+    v_align = *tx3g_ptr++;
+    /* Combinations other than the nine below leave m->d.alignment untouched. */
+    if (h_align == 0) {
+        if (v_align == 0)
+            m->d.alignment = TOP_LEFT;
+        if (v_align == 1)
+            m->d.alignment = MIDDLE_LEFT;
+        if (v_align == -1)
+            m->d.alignment = BOTTOM_LEFT;
+    }
+    if (h_align == 1) {
+        if (v_align == 0)
+            m->d.alignment = TOP_CENTER;
+        if (v_align == 1)
+            m->d.alignment = MIDDLE_CENTER;
+        if (v_align == -1)
+            m->d.alignment = BOTTOM_CENTER;
+    }
+    if (h_align == -1) {
+        if (v_align == 0)
+            m->d.alignment = TOP_RIGHT;
+        if (v_align == 1)
+            m->d.alignment = MIDDLE_RIGHT;
+        if (v_align == -1)
+            m->d.alignment = BOTTOM_RIGHT;
+    }
+    // Background Color
+    m->d.back_color = AV_RB24(tx3g_ptr);
+    tx3g_ptr += 4;
+    // BoxRecord
+    tx3g_ptr += 8;
+    // StyleRecord
+    tx3g_ptr += 4;
+    // fontID
+    style_fontID = AV_RB16(tx3g_ptr);
+    tx3g_ptr += 2;
+    // face-style-flags
+    style_flags = *tx3g_ptr++;
+    m->d.bold = style_flags & STYLE_FLAG_BOLD;
+    m->d.italic = style_flags & STYLE_FLAG_ITALIC;
+    m->d.underline = style_flags & STYLE_FLAG_UNDERLINE;
+    // fontsize
+    m->d.fontsize = *tx3g_ptr++;
+    // Primary color
+    m->d.color = AV_RB24(tx3g_ptr);
+    tx3g_ptr += 4;
+    // FontRecord
+    // FontRecord Size
+    tx3g_ptr += 4;
+    // ftab
+    tx3g_ptr += 4;
+
+    m->ftab_entries = AV_RB16(tx3g_ptr);
+    tx3g_ptr += 2;
+
+    for (i = 0; i < m->ftab_entries; i++) {
+
+        /* font-ID (2 bytes) + font-name-length (1 byte) */
+        box_size += 3;
+        if (avctx->extradata_size < box_size) {
+            mov_text_cleanup_ftab(m);
+            m->ftab_entries = 0;
+            return -1;
+        }
+        m->ftab_temp = av_mallocz(sizeof(*m->ftab_temp));
+        if (!m->ftab_temp)
+            goto nomem;
+        m->ftab_temp->fontID = AV_RB16(tx3g_ptr);
+        tx3g_ptr += 2;
+        font_length = *tx3g_ptr++;
+
+        box_size = box_size + font_length;
+        if (avctx->extradata_size < box_size) {
+            mov_text_cleanup_ftab(m);
+            m->ftab_entries = 0;
+            return -1;
+        }
+        m->ftab_temp->font = av_malloc(font_length + 1);
+        if (!m->ftab_temp->font)
+            goto nomem;
+        memcpy(m->ftab_temp->font, tx3g_ptr, font_length);
+        m->ftab_temp->font[font_length] = '\0';
+        /* The _nofree variant keeps the table intact on failure, so the
+         * records stored so far can still be released by the cleanup. */
+        if (av_dynarray_add_nofree(&m->ftab, &m->count_f, m->ftab_temp) < 0)
+            goto nomem;
+        m->ftab_temp = NULL;
+        tx3g_ptr = tx3g_ptr + font_length;
+    }
+    /* Resolve the default font name; with duplicate IDs the last match wins. */
+    for (i = 0; i < m->ftab_entries; i++) {
+        if (style_fontID == m->ftab[i]->fontID)
+            m->d.font = m->ftab[i]->font;
+    }
+    return 0;
+
+nomem:
+    mov_text_cleanup_ftab(m);
+    /* Keep the declared count in step with the (now empty) table. */
+    m->ftab_entries = 0;
+    return AVERROR(ENOMEM);
+}
+
+/* 'twrp' box: a single byte holding the text wrap flag. */
+static int decode_twrp(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt)
+{
+    m->w.wrap_flag = tsmb[0];
+    m->box_flags  |= TWRP_BOX;
+    return 0;
+}
+
+/* 'hlit' box: two big-endian 16-bit values, highlight start then end. */
+static int decode_hlit(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt)
+{
+    m->h.hlit_start = AV_RB16(tsmb);
+    m->h.hlit_end   = AV_RB16(tsmb + 2);
+    m->box_flags   |= HLIT_BOX;
+    return 0;
+}
+
+/* 'hclr' box: four colour bytes, kept in bitstream order. */
+static int decode_hclr(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt)
+{
+    memcpy(m->c.hlit_color, tsmb, sizeof(m->c.hlit_color));
+    m->box_flags |= HCLR_BOX;
+    return 0;
+}
+
+/**
+ * Decode a 'styl' box: a 16-bit entry count followed by 12-byte style records.
+ *
+ * Each record is appended to m->s. The declared entry count is only stored in
+ * the context once it has been checked against the packet size, so a rejected
+ * box cannot make text_to_ass() read past the records that actually exist.
+ *
+ * @return 0 on success, -1 if the records do not fit into the packet,
+ *         AVERROR(ENOMEM) on allocation failure (all records are dropped).
+ */
+static int decode_styl(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt)
+{
+    int i;
+    const int style_entries = AV_RB16(tsmb);
+
+    tsmb += 2;
+    // A single style record is of length 12 bytes.
+    if (m->tracksize + m->size_var + 2 + style_entries * 12 > avpkt->size)
+        return -1;
+
+    m->style_entries = style_entries;
+    m->box_flags |= STYL_BOX;
+    for (i = 0; i < style_entries; i++) {
+        m->s_temp = av_malloc(sizeof(*m->s_temp));
+        if (!m->s_temp)
+            goto nomem;
+        m->s_temp->style_start = AV_RB16(tsmb);
+        tsmb += 2;
+        m->s_temp->style_end = AV_RB16(tsmb);
+        tsmb += 2;
+        m->s_temp->style_fontID = AV_RB16(tsmb);
+        tsmb += 2;
+        m->s_temp->style_flag = AV_RB8(tsmb);
+        tsmb++;
+        m->s_temp->fontsize = AV_RB8(tsmb);
+        tsmb++;
+        // text-color-rgba
+        tsmb += 4;
+        /* The _nofree variant leaves the array untouched on failure, so the
+         * records stored so far are still reachable for the cleanup. */
+        if (av_dynarray_add_nofree(&m->s, &m->count_s, m->s_temp) < 0)
+            goto nomem;
+        m->s_temp = NULL;
+    }
+    return 0;
+
+nomem:
+    av_freep(&m->s_temp);
+    mov_text_cleanup(m);
+    /* The array is gone; make sure nobody iterates over it any more. */
+    m->count_s       = 0;
+    m->style_entries = 0;
+    return AVERROR(ENOMEM);
+}
+
+/* Boxes understood by the decoder: tag, minimum payload size, handler. */
+static const Box box_types[] = {
+    { MKBETAG('s','t','y','l'), 2, decode_styl },
+    { MKBETAG('h','l','i','t'), 4, decode_hlit },
+    { MKBETAG('h','c','l','r'), 4, decode_hclr },
+    { MKBETAG('t','w','r','p'), 1, decode_twrp },
+};
+
+static const size_t box_count = FF_ARRAY_ELEMS(box_types);
+
+/**
+ * Convert the text of one sample to ASS markup.
+ *
+ * Walks [text, text_end) byte by byte and, at the positions named by the
+ * boxes decoded for this packet, inserts override tags for wrapping, styles
+ * and highlighting. '\r' is dropped and '\n' becomes "\N".
+ *
+ * @return always 0
+ */
+static int text_to_ass(AVBPrint *buf, const char *text, const char *text_end,
+                        MovTextContext *m)
+{
+    int i = 0;
+    int j = 0;
+    int text_pos = 0;
+    /* Only walk records that really exist: the declared counts come from the
+     * bitstream and may be larger than what was allocated, or the arrays may
+     * have been released after an allocation failure. */
+    const int nb_styles = m->s    ? FFMIN(m->style_entries, m->count_s) : 0;
+    const int nb_fonts  = m->ftab ? FFMIN(m->ftab_entries,  m->count_f) : 0;
+
+    if (text < text_end && m->box_flags & TWRP_BOX) {
+        if (m->w.wrap_flag == 1) {
+            av_bprintf(buf, "{\\q1}"); /* End of line wrap */
+        } else {
+            av_bprintf(buf, "{\\q2}"); /* No wrap */
+        }
+    }
+
+    while (text < text_end) {
+        if (m->box_flags & STYL_BOX) {
+            /* Close styles ending here before opening the ones that start. */
+            for (i = 0; i < nb_styles; i++) {
+                if (m->s[i]->style_flag && text_pos == m->s[i]->style_end) {
+                    av_bprintf(buf, "{\\r}");
+                }
+            }
+            for (i = 0; i < nb_styles; i++) {
+                if (m->s[i]->style_flag && text_pos == m->s[i]->style_start) {
+                    if (m->s[i]->style_flag & STYLE_FLAG_BOLD)
+                        av_bprintf(buf, "{\\b1}");
+                    if (m->s[i]->style_flag & STYLE_FLAG_ITALIC)
+                        av_bprintf(buf, "{\\i1}");
+                    if (m->s[i]->style_flag & STYLE_FLAG_UNDERLINE)
+                        av_bprintf(buf, "{\\u1}");
+                    av_bprintf(buf, "{\\fs%d}", m->s[i]->fontsize);
+                    for (j = 0; j < nb_fonts; j++) {
+                        if (m->s[i]->style_fontID == m->ftab[j]->fontID)
+                            av_bprintf(buf, "{\\fn%s}", m->ftab[j]->font);
+                    }
+                }
+            }
+        }
+        if (m->box_flags & HLIT_BOX) {
+            if (text_pos == m->h.hlit_start) {
+                /* If hclr box is present, set the secondary color to the color
+                 * specified. Otherwise, set primary color to white and secondary
+                 * color to black. These colors will come from TextSampleModifier
+                 * boxes in future and inverse video technique for highlight will
+                 * be implemented.
+                 */
+                if (m->box_flags & HCLR_BOX) {
+                    av_bprintf(buf, "{\\2c&H%02x%02x%02x&}", m->c.hlit_color[2],
+                                m->c.hlit_color[1], m->c.hlit_color[0]);
+                } else {
+                    av_bprintf(buf, "{\\1c&H000000&}{\\2c&HFFFFFF&}");
+                }
+            }
+            if (text_pos == m->h.hlit_end) {
+                if (m->box_flags & HCLR_BOX) {
+                    av_bprintf(buf, "{\\2c&H000000&}");
+                } else {
+                    av_bprintf(buf, "{\\1c&HFFFFFF&}{\\2c&H000000&}");
+                }
+            }
+        }
+
+        switch (*text) {
+        case '\r':
+            break;
+        case '\n':
+            av_bprintf(buf, "\\N");
+            break;
+        default:
+            av_bprint_chars(buf, *text, 1);
+            break;
+        }
+        text++;
+        text_pos++;
+    }
+
+    return 0;
+}
+
+/*
+ * Set up the ASS header. The default style from the tx3g sample entry is used
+ * when it can be parsed; otherwise the generic default header is written.
+ * NB: Most players ignore styles completely, with the result that
+ * it's very common to find files where the default style is broken.
+ */
+static int mov_text_init(AVCodecContext *avctx) {
+    MovTextContext *m = avctx->priv_data;
+
+    if (mov_text_tx3g(avctx, m) != 0)
+        return ff_ass_subtitle_header_default(avctx);
+
+    return ff_ass_subtitle_header(avctx, m->d.font, m->d.fontsize, m->d.color,
+                                  m->d.back_color, m->d.bold, m->d.italic,
+                                  m->d.underline, ASS_DEFAULT_BORDERSTYLE,
+                                  m->d.alignment);
+}
+
+/**
+ * Decode one timed text sample into an ASS rectangle.
+ *
+ * The packet starts with a 16-bit text length and the text itself; any
+ * remaining bytes are a sequence of modifier boxes, which are dispatched
+ * through box_types[]. Parsing of the boxes stops at the first malformed one,
+ * the text is still output in that case.
+ *
+ * @return the packet size on success, 0 for an empty end-marker packet,
+ *         a negative AVERROR code otherwise
+ */
+static int mov_text_decode_frame(AVCodecContext *avctx,
+                            void *data, int *got_sub_ptr, AVPacket *avpkt)
+{
+    AVSubtitle *sub = data;
+    MovTextContext *m = avctx->priv_data;
+    int ret;
+    AVBPrint buf;
+    const char *ptr = (const char *)avpkt->data;
+    const char *end;
+    int text_length, ret_tsmb;
+    uint32_t tsmb_type;
+    uint64_t tsmb_size;
+    const uint8_t *tsmb;
+
+    if (!ptr || avpkt->size < 2)
+        return AVERROR_INVALIDDATA;
+
+    /*
+     * A packet of size two with value zero is an empty subtitle
+     * used to mark the end of the previous non-empty subtitle.
+     * We can just drop them here as we have duration information
+     * already. If the value is non-zero, then it's technically a
+     * bad packet.
+     */
+    if (avpkt->size == 2)
+        return AV_RB16(ptr) == 0 ? 0 : AVERROR_INVALIDDATA;
+
+    /*
+     * The first two bytes of the packet are the length of the text string
+     * In complex cases, there are style descriptors appended to the string
+     * so we can't just assume the packet size is the string size.
+     */
+    text_length = AV_RB16(ptr);
+    end = ptr + FFMIN(2 + text_length, avpkt->size);
+    ptr += 2;
+
+    tsmb_size = 0;
+    m->tracksize = 2 + text_length;
+    m->style_entries = 0;
+    m->box_flags = 0;
+    m->count_s = 0;
+    // Note that the spec recommends lines be no longer than 2048 characters.
+    av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
+    if (text_length + 2 != avpkt->size) {
+        while (m->tracksize + 8 <= avpkt->size) {
+            // A box is a minimum of 8 bytes.
+            tsmb = (const uint8_t *)ptr + m->tracksize - 2;
+            tsmb_size = AV_RB32(tsmb);
+            tsmb += 4;
+            tsmb_type = AV_RB32(tsmb);
+            tsmb += 4;
+
+            if (tsmb_size == 1) {
+                if (m->tracksize + 16 > avpkt->size)
+                    break;
+                tsmb_size = AV_RB64(tsmb);
+                tsmb += 8;
+                m->size_var = 16;
+            } else
+                m->size_var = 8;
+            //size_var is equal to 8 or 16 depending on the size of box
+
+            /* A box can not be smaller than its own header. In particular a
+             * size of 0 would never advance tracksize and loop forever. */
+            if (tsmb_size < m->size_var)
+                break;
+
+            /* Written as a subtraction so that a huge 64-bit size can not
+             * wrap around; tracksize <= avpkt->size holds inside the loop. */
+            if (tsmb_size > avpkt->size - m->tracksize)
+                break;
+
+            for (size_t i = 0; i < box_count; i++) {
+                if (tsmb_type == box_types[i].type) {
+                    if (m->tracksize + m->size_var + box_types[i].base_size > avpkt->size)
+                        break;
+                    ret_tsmb = box_types[i].decode(tsmb, m, avpkt);
+                    if (ret_tsmb == -1)
+                        break;
+                }
+            }
+            m->tracksize = m->tracksize + tsmb_size;
+        }
+        text_to_ass(&buf, ptr, end, m);
+        mov_text_cleanup(m);
+    } else
+        text_to_ass(&buf, ptr, end, m);
+
+    ret = ff_ass_add_rect(sub, buf.str, m->readorder++, 0, NULL, NULL);
+    av_bprint_finalize(&buf, NULL);
+    if (ret < 0)
+        return ret;
+    *got_sub_ptr = sub->num_rects > 0;
+    return avpkt->size;
+}
+
+/* Release the font table built from the extradata at init time. */
+static int mov_text_decode_close(AVCodecContext *avctx)
+{
+    mov_text_cleanup_ftab(avctx->priv_data);
+    return 0;
+}
+
+/* Restart the ReadOrder counter unless the caller asked flush to keep it. */
+static void mov_text_flush(AVCodecContext *avctx)
+{
+    MovTextContext *m = avctx->priv_data;
+
+    if (avctx->flags2 & AV_CODEC_FLAG2_RO_FLUSH_NOOP)
+        return;
+    m->readorder = 0;
+}
+
+/* Codec registration entry for the 3GPP timed text decoder. */
+AVCodec ff_movtext_decoder = {
+    .name           = "mov_text",
+    .long_name      = NULL_IF_CONFIG_SMALL("3GPP Timed Text subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_MOV_TEXT,
+    .priv_data_size = sizeof(MovTextContext),
+    .init           = mov_text_init,
+    .decode         = mov_text_decode_frame,
+    .close          = mov_text_decode_close,
+    .flush          = mov_text_flush,
+};
diff --git a/libavcodec/movtextenc.c b/libavcodec/movtextenc.c
new file mode 100644
index 0000000..20e01e2
--- /dev/null
+++ b/libavcodec/movtextenc.c
@@ -0,0 +1,409 @@
+/*
+ * 3GPP TS 26.245 Timed Text encoder
+ * Copyright (c) 2012  Philip Langdale <philipl@overt.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdarg.h>
+#include "avcodec.h"
+#include "libavutil/avassert.h"
+#include "libavutil/avstring.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem.h"
+#include "libavutil/common.h"
+#include "ass_split.h"
+#include "ass.h"
+
+/* Bits of the face-style-flags byte written into each style record. */
+#define STYLE_FLAG_BOLD         (1<<0)
+#define STYLE_FLAG_ITALIC       (1<<1)
+#define STYLE_FLAG_UNDERLINE    (1<<2)
+/* Size in bytes of one style record inside a 'styl' box. */
+#define STYLE_RECORD_SIZE       12
+/* Bytes of a 'styl' box besides its records: size, type and entry count. */
+#define SIZE_ADD                10
+
+/* Bits of MovTextContext.box_flags: which boxes must be written. */
+#define STYL_BOX   (1<<0)
+#define HLIT_BOX   (1<<1)
+#define HCLR_BOX   (1<<2)
+
+/* Append size raw bytes starting at data to an AVBPrint. */
+#define av_bprint_append_any(buf, data, size)   av_bprint_append_data(buf, ((const char*)data), size)
+
+/* One style record; start and end are stored already byte-swapped to
+ * big-endian by mov_text_style_cb() so they can be appended as raw bytes. */
+typedef struct {
+    uint16_t style_start;
+    uint16_t style_end;
+    uint8_t style_flag;     /* combination of STYLE_FLAG_* bits */
+} StyleBox;
+
+/* Highlighted text range, stored the same way as the style positions. */
+typedef struct {
+    uint16_t start;
+    uint16_t end;
+} HighlightBox;
+
+/* Highlight colour as set by mov_text_color_cb(). */
+typedef struct {
+   uint32_t color;
+} HilightcolorBox;
+
+/* Encoder private context. */
+typedef struct {
+    ASSSplitContext *ass_ctx;
+    AVBPrint buffer;                    /* text and boxes of the sample being built */
+    StyleBox **style_attributes;        /* finished style records */
+    StyleBox *style_attributes_temp;    /* style record currently open */
+    HighlightBox hlit;
+    HilightcolorBox hclr;
+    int count;                          /* number of elements in style_attributes */
+    uint8_t box_flags;                  /* *_BOX bits for the current sample */
+    uint16_t style_entries;
+    uint16_t style_fontID;
+    uint8_t style_fontsize;
+    uint32_t style_color;
+    uint16_t text_pos;                  /* bytes of text appended so far */
+} MovTextContext;
+
+/* Entry of the box table walked by mov_text_encode_frame(). */
+typedef struct {
+    uint32_t type;      /* box tag, passed to encode as tsmb_type */
+    void (*encode)(MovTextContext *s, uint32_t tsmb_type);
+} Box;
+
+/**
+ * Free the finished style records of the current sample.
+ *
+ * The element counter is reset together with the array; otherwise the next
+ * av_dynarray_add() would continue at the old index of a freed array.
+ * The record that is still open (style_attributes_temp) is not touched.
+ */
+static void mov_text_cleanup(MovTextContext *s)
+{
+    int j;
+
+    if (!(s->box_flags & STYL_BOX))
+        return;
+
+    if (s->style_attributes) {
+        for (j = 0; j < s->count; j++)
+            av_freep(&s->style_attributes[j]);
+    }
+    av_freep(&s->style_attributes);
+    s->count = 0;
+}
+
+/**
+ * Append a 'styl' box with all finished style records to the sample buffer
+ * and release the records afterwards. Does nothing without STYL_BOX.
+ *
+ * All multi-byte fields are brought into bitstream byte order explicitly, so
+ * the output does not depend on the endianness of the host. At most
+ * UINT16_MAX records are written, as the entry count field is 16 bits wide.
+ *
+ * @param tsmb_type box tag built with MKTAG()
+ */
+static void encode_styl(MovTextContext *s, uint32_t tsmb_type)
+{
+    int j, count;
+    uint16_t tmp16;
+    uint32_t tsmb_size;
+    uint8_t tag[4];
+
+    if (!(s->box_flags & STYL_BOX))
+        return;
+
+    count = FFMIN(s->count, UINT16_MAX);
+
+    tsmb_size = count * STYLE_RECORD_SIZE + SIZE_ADD;
+    tsmb_size = AV_RB32(&tsmb_size);
+    /* Swap through 16-bit temporaries: reading the first two bytes of the
+     * int counter directly only works on little-endian hosts. */
+    tmp16 = count;
+    s->style_entries = AV_RB16(&tmp16);
+    tmp16 = 1;
+    s->style_fontID = AV_RB16(&tmp16);
+    s->style_fontsize = 0x12;
+    s->style_color = MKTAG(0xFF, 0xFF, 0xFF, 0xFF);
+    /*The above three attributes are hard coded for now
+    but will come from ASS style in the future*/
+    /* MKTAG() packs the first character into the low byte. */
+    AV_WL32(tag, tsmb_type);
+
+    av_bprint_append_any(&s->buffer, &tsmb_size, 4);
+    av_bprint_append_any(&s->buffer, tag, 4);
+    av_bprint_append_any(&s->buffer, &s->style_entries, 2);
+    for (j = 0; j < count; j++) {
+        av_bprint_append_any(&s->buffer, &s->style_attributes[j]->style_start, 2);
+        av_bprint_append_any(&s->buffer, &s->style_attributes[j]->style_end, 2);
+        av_bprint_append_any(&s->buffer, &s->style_fontID, 2);
+        av_bprint_append_any(&s->buffer, &s->style_attributes[j]->style_flag, 1);
+        av_bprint_append_any(&s->buffer, &s->style_fontsize, 1);
+        av_bprint_append_any(&s->buffer, &s->style_color, 4);
+    }
+    mov_text_cleanup(s);
+}
+
+/**
+ * Append an 'hlit' box (highlight range) to the sample buffer.
+ * Does nothing without HLIT_BOX.
+ *
+ * @param tsmb_type box tag built with MKTAG(); written byte by byte so the
+ *                  output does not depend on the endianness of the host
+ */
+static void encode_hlit(MovTextContext *s, uint32_t tsmb_type)
+{
+    uint32_t tsmb_size;
+    uint8_t tag[4];
+
+    if (!(s->box_flags & HLIT_BOX))
+        return;
+
+    tsmb_size = 12;
+    tsmb_size = AV_RB32(&tsmb_size);
+    /* MKTAG() packs the first character into the low byte. */
+    AV_WL32(tag, tsmb_type);
+    av_bprint_append_any(&s->buffer, &tsmb_size, 4);
+    av_bprint_append_any(&s->buffer, tag, 4);
+    /* start and end were already swapped to big-endian by the colour callback */
+    av_bprint_append_any(&s->buffer, &s->hlit.start, 2);
+    av_bprint_append_any(&s->buffer, &s->hlit.end, 2);
+}
+
+/**
+ * Append an 'hclr' box (highlight colour) to the sample buffer.
+ * Does nothing without HCLR_BOX.
+ *
+ * The tag and the colour are written least significant byte first, which is
+ * what appending the native values produced on little-endian hosts; doing it
+ * explicitly gives the same bytes on big-endian hosts.
+ *
+ * @param tsmb_type box tag built with MKTAG()
+ */
+static void encode_hclr(MovTextContext *s, uint32_t tsmb_type)
+{
+    uint32_t tsmb_size;
+    uint8_t tag[4], color[4];
+
+    if (!(s->box_flags & HCLR_BOX))
+        return;
+
+    tsmb_size = 12;
+    tsmb_size = AV_RB32(&tsmb_size);
+    AV_WL32(tag, tsmb_type);
+    AV_WL32(color, s->hclr.color);
+    av_bprint_append_any(&s->buffer, &tsmb_size, 4);
+    av_bprint_append_any(&s->buffer, tag, 4);
+    av_bprint_append_any(&s->buffer, color, 4);
+}
+
+/* Boxes the encoder can emit, in the order they are written. */
+static const Box box_types[] = {
+    { MKTAG('s','t','y','l'), encode_styl },
+    { MKTAG('h','l','i','t'), encode_hlit },
+    { MKTAG('h','c','l','r'), encode_hclr },
+};
+
+static const size_t box_count = FF_ARRAY_ELEMS(box_types);
+
+/*
+ * Initialise the encoder: install a fixed tx3g sample entry as extradata and
+ * parse the ASS header of the input stream.
+ * Returns 0 on success, AVERROR(ENOMEM) if the extradata can not be
+ * allocated and AVERROR_INVALIDDATA if the ASS header can not be split.
+ */
+static av_cold int mov_text_encode_init(AVCodecContext *avctx)
+{
+    /*
+     * For now, we'll use a fixed default style. When we add styling
+     * support, this will be generated from the ASS style.
+     */
+    static const uint8_t text_sample_entry[] = {
+        0x00, 0x00, 0x00, 0x00, // uint32_t displayFlags
+        0x01,                   // int8_t horizontal-justification
+        0xFF,                   // int8_t vertical-justification
+        0x00, 0x00, 0x00, 0x00, // uint8_t background-color-rgba[4]
+        // BoxRecord {
+        0x00, 0x00,             // int16_t top
+        0x00, 0x00,             // int16_t left
+        0x00, 0x00,             // int16_t bottom
+        0x00, 0x00,             // int16_t right
+        // };
+        // StyleRecord {
+        0x00, 0x00,             // uint16_t startChar
+        0x00, 0x00,             // uint16_t endChar
+        0x00, 0x01,             // uint16_t font-ID
+        0x00,                   // uint8_t face-style-flags
+        0x12,                   // uint8_t font-size
+        0xFF, 0xFF, 0xFF, 0xFF, // uint8_t text-color-rgba[4]
+        // };
+        // FontTableBox {
+        0x00, 0x00, 0x00, 0x12, // uint32_t size
+        'f', 't', 'a', 'b',     // uint8_t name[4]
+        0x00, 0x01,             // uint16_t entry-count
+        // FontRecord {
+        0x00, 0x01,             // uint16_t font-ID
+        0x05,                   // uint8_t font-name-length
+        'S', 'e', 'r', 'i', 'f',// uint8_t font[font-name-length]
+        // };
+        // };
+    };
+    MovTextContext *s = avctx->priv_data;
+
+    avctx->extradata_size = sizeof(text_sample_entry);
+    avctx->extradata      = av_mallocz(avctx->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE);
+    if (!avctx->extradata)
+        return AVERROR(ENOMEM);
+    memcpy(avctx->extradata, text_sample_entry, avctx->extradata_size);
+
+    av_bprint_init(&s->buffer, 0, AV_BPRINT_SIZE_UNLIMITED);
+
+    s->ass_ctx = ff_ass_split(avctx->subtitle_header);
+    if (!s->ass_ctx)
+        return AVERROR_INVALIDDATA;
+    return 0;
+}
+
+/**
+ * Move the open style record into the list of finished records and open a
+ * new one that inherits its flags.
+ *
+ * On allocation failure everything collected so far is dropped, the sample
+ * buffer is cleared and STYL_BOX is removed from box_flags.
+ *
+ * @return 0 on success, AVERROR(ENOMEM) on failure
+ */
+static int mov_text_style_commit(MovTextContext *s)
+{
+    if (av_dynarray_add_nofree(&s->style_attributes, &s->count,
+                               s->style_attributes_temp) < 0) {
+        /* The record never reached the list, free it here. */
+        av_freep(&s->style_attributes_temp);
+    } else {
+        const uint8_t flags = s->style_attributes_temp->style_flag;
+
+        /* The list owns the old record from now on. */
+        s->style_attributes_temp = av_malloc(sizeof(*s->style_attributes_temp));
+        if (s->style_attributes_temp) {
+            s->style_attributes_temp->style_flag = flags;
+            return 0;
+        }
+    }
+
+    mov_text_cleanup(s);
+    s->count = 0;
+    av_bprint_clear(&s->buffer);
+    s->box_flags &= ~STYL_BOX;
+    return AVERROR(ENOMEM);
+}
+
+/**
+ * ASS override code callback for bold, italic and underline tags.
+ *
+ * @param priv  the MovTextContext
+ * @param style tag character: 'b', 'i' or 'u', anything else changes no flag
+ * @param close 0 when the style is switched on, non-zero when switched off
+ */
+static void mov_text_style_cb(void *priv, const char style, int close)
+{
+    MovTextContext *s = priv;
+    if (!close) {
+        if (!(s->box_flags & STYL_BOX)) {   //first style entry
+
+            /* A record left open by the previous sample is of no use now. */
+            av_freep(&s->style_attributes_temp);
+            s->style_attributes_temp = av_malloc(sizeof(*s->style_attributes_temp));
+
+            if (!s->style_attributes_temp) {
+                av_bprint_clear(&s->buffer);
+                s->box_flags &= ~STYL_BOX;
+                return;
+            }
+
+            s->style_attributes_temp->style_flag = 0;
+            s->style_attributes_temp->style_start = AV_RB16(&s->text_pos);
+        } else {
+            if (s->style_attributes_temp->style_flag) { //break the style record here and start a new one
+                s->style_attributes_temp->style_end = AV_RB16(&s->text_pos);
+                if (mov_text_style_commit(s) < 0)
+                    return;
+                s->style_attributes_temp->style_start = AV_RB16(&s->text_pos);
+            } else {
+                s->style_attributes_temp->style_flag = 0;
+                s->style_attributes_temp->style_start = AV_RB16(&s->text_pos);
+            }
+        }
+        switch (style){
+        case 'b':
+            s->style_attributes_temp->style_flag |= STYLE_FLAG_BOLD;
+            break;
+        case 'i':
+            s->style_attributes_temp->style_flag |= STYLE_FLAG_ITALIC;
+            break;
+        case 'u':
+            s->style_attributes_temp->style_flag |= STYLE_FLAG_UNDERLINE;
+            break;
+        }
+    } else {
+        /* A closing tag without any style opened in this sample has nothing
+         * to end; the pending pointer may be NULL or left over from an
+         * earlier sample. */
+        if (!(s->box_flags & STYL_BOX) || !s->style_attributes_temp)
+            return;
+
+        s->style_attributes_temp->style_end = AV_RB16(&s->text_pos);
+        if (mov_text_style_commit(s) < 0)
+            return;
+
+        switch (style){
+        case 'b':
+            s->style_attributes_temp->style_flag &= ~STYLE_FLAG_BOLD;
+            break;
+        case 'i':
+            s->style_attributes_temp->style_flag &= ~STYLE_FLAG_ITALIC;
+            break;
+        case 'u':
+            s->style_attributes_temp->style_flag &= ~STYLE_FLAG_UNDERLINE;
+            break;
+        }
+        /* Start of the new record. Set even when no flag is left so that the
+         * field is never written out uninitialized. */
+        s->style_attributes_temp->style_start = AV_RB16(&s->text_pos);
+    }
+    s->box_flags |= STYL_BOX;
+}
+
+/**
+ * ASS override code callback for colour changes.
+ *
+ * Only the secondary colour (color_id 2) is used: the first change opens the
+ * highlight and sets its colour, every later change moves the end of the
+ * highlight.
+ */
+static void mov_text_color_cb(void *priv, unsigned int color, unsigned int color_id)
+{
+    MovTextContext *s = priv;
+    if (color_id == 2) {    //secondary color changes
+        if (s->box_flags & HLIT_BOX) {  //close tag
+            s->hlit.end = AV_RB16(&s->text_pos);
+        } else {
+            s->box_flags |= HCLR_BOX;
+            s->box_flags |= HLIT_BOX;
+            s->hlit.start = AV_RB16(&s->text_pos);
+            /* Unsigned constant: shifting a signed 0xFF by 24 overflows int. */
+            s->hclr.color = color | (0xFFU << 24);  //set alpha value to FF
+        }
+    }
+    /* If there are more than one secondary color changes in ASS, take start of
+       first section and end of last section. Movtext allows only one
+       highlight box per sample.
+     */
+}
+
+/* Text callback: copy len bytes of plain text and advance the text position. */
+static void mov_text_text_cb(void *priv, const char *text, int len)
+{
+    MovTextContext *ctx = priv;
+
+    av_bprint_append_data(&ctx->buffer, text, len);
+    ctx->text_pos += len;
+}
+
+/* Line break callback: every break is written as a single '\n' byte. */
+static void mov_text_new_line_cb(void *priv, int forced)
+{
+    MovTextContext *ctx = priv;
+
+    av_bprint_append_data(&ctx->buffer, "\n", 1);
+    ctx->text_pos += 1;
+}
+
+/* Handlers passed to ff_ass_split_override_codes(). */
+static const ASSCodesCallbacks mov_text_callbacks = {
+    .text = mov_text_text_cb,
+    .new_line = mov_text_new_line_cb,
+    .style = mov_text_style_cb,
+    .color = mov_text_color_cb,
+};
+
+/**
+ * Encode one subtitle into a timed text sample.
+ *
+ * The sample is a 16-bit text length followed by the contents of the sample
+ * buffer (text plus modifier boxes).
+ *
+ * Every exit goes through the common cleanup so that a failed subtitle never
+ * leaves data in the shared buffer, and nothing is written to buf before the
+ * output size has been checked.
+ *
+ * @return number of bytes written, 0 if there is nothing to output,
+ *         a negative AVERROR code on failure
+ */
+static int mov_text_encode_frame(AVCodecContext *avctx, unsigned char *buf,
+                                 int bufsize, const AVSubtitle *sub)
+{
+    MovTextContext *s = avctx->priv_data;
+    ASSDialog *dialog;
+    int i, length;
+    size_t j;
+
+    s->text_pos = 0;
+    s->count = 0;
+    s->box_flags = 0;
+    s->style_entries = 0;
+    for (i = 0; i < sub->num_rects; i++) {
+        const char *ass = sub->rects[i]->ass;
+
+        if (sub->rects[i]->type != SUBTITLE_ASS) {
+            av_log(avctx, AV_LOG_ERROR, "Only SUBTITLE_ASS type supported.\n");
+            length = AVERROR(ENOSYS);
+            goto exit;
+        }
+
+#if FF_API_ASS_TIMING
+        if (!strncmp(ass, "Dialogue: ", 10)) {
+            int num;
+            dialog = ff_ass_split_dialog(s->ass_ctx, ass, 0, &num);
+            for (; dialog && num--; dialog++) {
+                ff_ass_split_override_codes(&mov_text_callbacks, s, dialog->text);
+            }
+        } else {
+#endif
+            dialog = ff_ass_split_dialog2(s->ass_ctx, ass);
+            if (!dialog) {
+                length = AVERROR(ENOMEM);
+                goto exit;
+            }
+            ff_ass_split_override_codes(&mov_text_callbacks, s, dialog->text);
+            ff_ass_free_dialog(&dialog);
+#if FF_API_ASS_TIMING
+        }
+#endif
+
+        for (j = 0; j < box_count; j++) {
+            box_types[j].encode(s, box_types[j].type);
+        }
+    }
+
+    if (!av_bprint_is_complete(&s->buffer)) {
+        length = AVERROR(ENOMEM);
+        goto exit;
+    }
+
+    if (!s->buffer.len) {
+        length = 0;
+        goto exit;
+    }
+
+    /* bufsize is checked on its own first: in the comparison below a negative
+     * right hand side would be converted to a huge unsigned value. */
+    if (bufsize < 3 || s->buffer.len > bufsize - 3) {
+        av_log(avctx, AV_LOG_ERROR, "Buffer too small for ASS event.\n");
+        length = AVERROR(EINVAL);
+        goto exit;
+    }
+
+    AV_WB16(buf, s->text_pos);
+    memcpy(buf + 2, s->buffer.str, s->buffer.len);
+    length = s->buffer.len + 2;
+
+exit:
+    av_bprint_clear(&s->buffer);
+    return length;
+}
+
+/**
+ * Free everything owned by the encoder context.
+ *
+ * This includes the style record that is still open: it is allocated by the
+ * style callback and is released nowhere else once encoding has stopped.
+ */
+static int mov_text_encode_close(AVCodecContext *avctx)
+{
+    MovTextContext *s = avctx->priv_data;
+    ff_ass_split_free(s->ass_ctx);
+    av_freep(&s->style_attributes_temp);
+    av_bprint_finalize(&s->buffer, NULL);
+    return 0;
+}
+
+/* Codec registration entry for the 3GPP timed text encoder. */
+AVCodec ff_movtext_encoder = {
+    .name = "mov_text",
+    .long_name = NULL_IF_CONFIG_SMALL("3GPP Timed Text subtitle"),
+    .type = AVMEDIA_TYPE_SUBTITLE,
+    .id = AV_CODEC_ID_MOV_TEXT,
+    .priv_data_size = sizeof(MovTextContext),
+    .init = mov_text_encode_init,
+    .encode_sub = mov_text_encode_frame,
+    .close = mov_text_encode_close,
+};
diff --git a/libavcodec/mp3_header_decompress_bsf.c b/libavcodec/mp3_header_decompress_bsf.c
new file mode 100644
index 0000000..22c1ef0
--- /dev/null
+++ b/libavcodec/mp3_header_decompress_bsf.c
@@ -0,0 +1,124 @@
+/*
+ * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
+#include "avcodec.h"
+#include "bsf.h"
+#include "mpegaudiodecheader.h"
+#include "mpegaudiodata.h"
+
+/* Rebuild the stripped MPEG audio header from extradata and the packet size; packets with a valid header pass through. */
+static int mp3_header_decompress(AVBSFContext *ctx, AVPacket *out)
+{
+    AVPacket *in;
+    uint32_t header;
+    int sample_rate= ctx->par_in->sample_rate;
+    int sample_rate_index=0;
+    int lsf, mpeg25, bitrate_index, frame_size, ret;
+    uint8_t *buf;
+    int buf_size;
+
+    ret = ff_bsf_get_packet(ctx, &in);
+    if (ret < 0)
+        return ret;
+
+    buf      = in->data;
+    buf_size = in->size;
+    /* A packet that already starts with a valid header is forwarded untouched. */
+    header = AV_RB32(buf);
+    if(ff_mpa_check_header(header) >= 0){
+        av_packet_move_ref(out, in);
+        av_packet_free(&in);
+
+        return 0;
+    }
+    /* Extradata must be 15 bytes: the "FFCMP3 0.0" tag followed by a header template at offset 11. */
+    if(ctx->par_in->extradata_size != 15 || strcmp(ctx->par_in->extradata, "FFCMP3 0.0")){
+        av_log(ctx, AV_LOG_ERROR, "Extradata invalid %d\n", ctx->par_in->extradata_size);
+        ret = AVERROR(EINVAL);
+        goto fail;
+    }
+
+    header= AV_RB32(ctx->par_in->extradata+11) & MP3_MASK;
+
+    lsf     = sample_rate < (24000+32000)/2;
+    mpeg25  = sample_rate < (12000+16000)/2;
+    sample_rate_index= (header>>10)&3; /* NOTE(review): can be 3; confirm avpriv_mpa_freq_tab covers that index */
+    sample_rate= avpriv_mpa_freq_tab[sample_rate_index] >> (lsf + mpeg25); //in case sample rate is a little off
+    /* Search the bitrate index (bit 0 = padding) whose frame size equals the payload plus a 4- or 6-byte (CRC) header. */
+    for(bitrate_index=2; bitrate_index<30; bitrate_index++){
+        frame_size = avpriv_mpa_bitrate_tab[lsf][2][bitrate_index>>1];
+        frame_size = (frame_size * 144000) / (sample_rate << lsf) + (bitrate_index&1);
+        if(frame_size == buf_size + 4)
+            break;
+        if(frame_size == buf_size + 6)
+            break;
+    }
+    if(bitrate_index == 30){
+        av_log(ctx, AV_LOG_ERROR, "Could not find bitrate_index.\n");
+        ret = AVERROR(EINVAL);
+        goto fail;
+    }
+    /* Merge the recovered padding, bitrate and protection bits into the header template. */
+    header |= (bitrate_index&1)<<9;
+    header |= (bitrate_index>>1)<<12;
+    header |= (frame_size == buf_size + 4)<<16; //FIXME actually set a correct crc instead of 0
+
+    ret = av_new_packet(out, frame_size);
+    if (ret < 0)
+        goto fail;
+    ret = av_packet_copy_props(out, in);
+    if (ret < 0) {
+        av_packet_unref(out); /* out is caller-owned: drop the new buffer, never free the struct itself */
+        goto fail;
+    }
+    memcpy(out->data + frame_size - buf_size, buf, buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
+    /* For stereo input, header bits 4-5 were parked in the payload: move them back and clear them there. */
+    if(ctx->par_in->channels==2){
+        uint8_t *p= out->data + frame_size - buf_size;
+        if(lsf){
+            FFSWAP(int, p[1], p[2]);
+            header |= (p[1] & 0xC0)>>2;
+            p[1] &= 0x3F;
+        }else{
+            header |= p[1] & 0x30;
+            p[1] &= 0xCF;
+        }
+    }
+
+    AV_WB32(out->data, header);
+
+    ret = 0;
+    /* Success and error paths both release the input packet; ret is 0 or a negative AVERROR code. */
+fail:
+    av_packet_free(&in);
+    return ret;
+}
+
+static const enum AVCodecID codec_ids[] = { /* codec list referenced by ff_mp3_header_decompress_bsf */
+    AV_CODEC_ID_MP3, AV_CODEC_ID_NONE, /* AV_CODEC_ID_NONE is the last entry of the list */
+};
+
+const AVBitStreamFilter ff_mp3_header_decompress_bsf = { /* bitstream filter entry named "mp3decomp" */
+    .name      = "mp3decomp",
+    .filter    = mp3_header_decompress, /* filter callback defined above */
+    .codec_ids = codec_ids,
+};
diff --git a/libavcodec/mpc.c b/libavcodec/mpc.c
index 763ea2c..7af30bd 100644
--- a/libavcodec/mpc.c
+++ b/libavcodec/mpc.c
@@ -2,20 +2,20 @@
  * Musepack decoder core
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -74,13 +74,13 @@ void ff_mpc_dequantize_and_synth(MPCContext * c, int maxband, int16_t **out,
         for(ch = 0; ch < 2; ch++){
             if(bands[i].res[ch]){
                 j = 0;
-                mul = mpc_CC[bands[i].res[ch] + 1] * mpc_SCF[bands[i].scf_idx[ch][0]+6];
+                mul = (mpc_CC+1)[bands[i].res[ch]] * mpc_SCF[bands[i].scf_idx[ch][0] & 0xFF];
                 for(; j < 12; j++)
                     c->sb_samples[ch][j][i] = mul * c->Q[ch][j + off];
-                mul = mpc_CC[bands[i].res[ch] + 1] * mpc_SCF[bands[i].scf_idx[ch][1]+6];
+                mul = (mpc_CC+1)[bands[i].res[ch]] * mpc_SCF[bands[i].scf_idx[ch][1] & 0xFF];
                 for(; j < 24; j++)
                     c->sb_samples[ch][j][i] = mul * c->Q[ch][j + off];
-                mul = mpc_CC[bands[i].res[ch] + 1] * mpc_SCF[bands[i].scf_idx[ch][2]+6];
+                mul = (mpc_CC+1)[bands[i].res[ch]] * mpc_SCF[bands[i].scf_idx[ch][2] & 0xFF];
                 for(; j < 36; j++)
                     c->sb_samples[ch][j][i] = mul * c->Q[ch][j + off];
             }
diff --git a/libavcodec/mpc.h b/libavcodec/mpc.h
index cdf49c1..4cb85748 100644
--- a/libavcodec/mpc.h
+++ b/libavcodec/mpc.h
@@ -2,20 +2,20 @@
  * Musepack decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpc7.c b/libavcodec/mpc7.c
index 28b5192..d38b22a 100644
--- a/libavcodec/mpc7.c
+++ b/libavcodec/mpc7.c
@@ -2,20 +2,20 @@
  * Musepack SV7 decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,10 +36,6 @@
 #include "mpc.h"
 #include "mpc7data.h"
 
-#define BANDS            32
-#define SAMPLES_PER_BAND 36
-#define MPC_FRAME_SIZE   (BANDS * SAMPLES_PER_BAND)
-
 static VLC scfi_vlc, dscf_vlc, hdr_vlc, quant_vlc[MPC7_QUANT_VLC_TABLES][2];
 
 static const uint16_t quant_offsets[MPC7_QUANT_VLC_TABLES*2 + 1] =
@@ -190,7 +186,7 @@ static int get_scale_idx(GetBitContext *gb, int ref)
     int t = get_vlc2(gb, dscf_vlc.table, MPC7_DSCF_BITS, 1) - 7;
     if (t == 8)
         return get_bits(gb, 6);
-    return av_clip_uintp2(ref + t, 7);
+    return ref + t;
 }
 
 static int mpc7_decode_frame(AVCodecContext * avctx, void *data,
@@ -226,11 +222,9 @@ static int mpc7_decode_frame(AVCodecContext * avctx, void *data,
     buf_size  -= 4;
 
     /* get output buffer */
-    frame->nb_samples = last_frame ? c->lastframelen : MPC_FRAME_SIZE;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    frame->nb_samples = MPC_FRAME_SIZE;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     av_fast_padded_malloc(&c->bits, &c->buf_size, buf_size);
     if (!c->bits)
@@ -246,7 +240,11 @@ static int mpc7_decode_frame(AVCodecContext * avctx, void *data,
             int t = 4;
             if(i) t = get_vlc2(&gb, hdr_vlc.table, MPC7_HDR_BITS, 1) - 5;
             if(t == 4) bands[i].res[ch] = get_bits(&gb, 4);
-            else bands[i].res[ch] = av_clip(bands[i-1].res[ch] + t, 0, 17);
+            else bands[i].res[ch] = bands[i-1].res[ch] + t;
+            if (bands[i].res[ch] < -1 || bands[i].res[ch] > 17) {
+                av_log(avctx, AV_LOG_ERROR, "subband index invalid\n");
+                return AVERROR_INVALIDDATA;
+            }
         }
 
         if(bands[i].res[0] || bands[i].res[1]){
@@ -293,6 +291,8 @@ static int mpc7_decode_frame(AVCodecContext * avctx, void *data,
             idx_to_quant(c, &gb, bands[i].res[ch], c->Q[ch] + off);
 
     ff_mpc_dequantize_and_synth(c, mb, (int16_t **)frame->extended_data, 2);
+    if(last_frame)
+        frame->nb_samples = c->lastframelen;
 
     bits_used = get_bits_count(&gb);
     bits_avail = buf_size * 8;
diff --git a/libavcodec/mpc7data.h b/libavcodec/mpc7data.h
index f205ffe..5609e8f 100644
--- a/libavcodec/mpc7data.h
+++ b/libavcodec/mpc7data.h
@@ -2,20 +2,20 @@
  * Musepack decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpc8.c b/libavcodec/mpc8.c
index 84dbb61..8894457 100644
--- a/libavcodec/mpc8.c
+++ b/libavcodec/mpc8.c
@@ -2,20 +2,20 @@
  * Musepack SV8 decoder
  * Copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -126,6 +126,10 @@ static av_cold int mpc8_decode_init(AVCodecContext * avctx)
 
     skip_bits(&gb, 3);//sample rate
     c->maxbands = get_bits(&gb, 5) + 1;
+    if (c->maxbands >= BANDS) {
+        av_log(avctx,AV_LOG_ERROR, "maxbands %d too high\n", c->maxbands);
+        return AVERROR_INVALIDDATA;
+    }
     channels = get_bits(&gb, 4) + 1;
     if (channels > 2) {
         avpriv_request_sample(avctx, "Multichannel MPC SV8");
@@ -135,7 +139,8 @@ static av_cold int mpc8_decode_init(AVCodecContext * avctx)
     c->frames = 1 << (get_bits(&gb, 3) * 2);
 
     avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
-    avctx->channel_layout = (avctx->channels==2) ? AV_CH_LAYOUT_STEREO : AV_CH_LAYOUT_MONO;
+    avctx->channel_layout = (channels==2) ? AV_CH_LAYOUT_STEREO : AV_CH_LAYOUT_MONO;
+    avctx->channels = channels;
 
     if(vlc_initialized) return 0;
     av_log(avctx, AV_LOG_DEBUG, "Initing VLC\n");
@@ -247,10 +252,8 @@ static int mpc8_decode_frame(AVCodecContext * avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = MPC_FRAME_SIZE;
-    if ((res = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((res = ff_get_buffer(avctx, frame, 0)) < 0)
         return res;
-    }
 
     keyframe = c->cur_frame == 0;
 
@@ -267,8 +270,11 @@ static int mpc8_decode_frame(AVCodecContext * avctx, void *data,
         maxband = c->last_max_band + get_vlc2(gb, band_vlc.table, MPC8_BANDS_BITS, 2);
         if(maxband > 32) maxband -= 33;
     }
-    if(maxband > c->maxbands + 1)
+
+    if(maxband > c->maxbands + 1) {
+        av_log(avctx, AV_LOG_ERROR, "maxband %d too large\n",maxband);
         return AVERROR_INVALIDDATA;
+    }
     c->last_max_band = maxband;
 
     /* read subband indexes */
@@ -409,10 +415,14 @@ static int mpc8_decode_frame(AVCodecContext * avctx, void *data,
     c->cur_frame++;
 
     c->last_bits_used = get_bits_count(gb);
-    if(get_bits_left(gb) < 8) // we have only padding left
-        c->last_bits_used = buf_size << 3;
     if(c->cur_frame >= c->frames)
         c->cur_frame = 0;
+    if(c->cur_frame == 0 && get_bits_left(gb) < 8) {// we have only padding left
+        c->last_bits_used = buf_size << 3;
+    } else if (get_bits_left(gb) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Overread %d\n", -get_bits_left(gb));
+        c->last_bits_used = buf_size << 3;
+    }
 
     *got_frame_ptr = 1;
 
diff --git a/libavcodec/mpc8data.h b/libavcodec/mpc8data.h
index 2940b30..22c2be4 100644
--- a/libavcodec/mpc8data.h
+++ b/libavcodec/mpc8data.h
@@ -2,20 +2,20 @@
  * Musepack SV8 decoder
  * Copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpc8huff.h b/libavcodec/mpc8huff.h
index 6005e21..8491037 100644
--- a/libavcodec/mpc8huff.h
+++ b/libavcodec/mpc8huff.h
@@ -2,20 +2,20 @@
  * Musepack SV8 decoder
  * Copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpcdata.h b/libavcodec/mpcdata.h
index 15724f3..64fb4ab 100644
--- a/libavcodec/mpcdata.h
+++ b/libavcodec/mpcdata.h
@@ -2,20 +2,20 @@
  * Musepack decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,9 +30,7 @@ static const float mpc_CC[18+1] = {
     4.0002, 2.0001, 1.0000
 };
 
-static const float mpc_SCF[128+6] = {
-    920.016296386718750000, 766.355773925781250000, 638.359558105468750000,
-    531.741149902343750000, 442.930114746093750000, 368.952209472656250000,
+static const float mpc_SCF[256] = {
     307.330047607421875000, 255.999984741210937500, 213.243041992187500000, 177.627334594726562500,
     147.960128784179687500, 123.247924804687500000, 102.663139343261718750, 85.516410827636718750,
     71.233520507812500000, 59.336143493652343750, 49.425861358642578125, 41.170787811279296875,
@@ -64,7 +62,39 @@ static const float mpc_SCF[128+6] = {
     0.000000396931966407, 0.000000330636652279, 0.000000275413924555, 0.000000229414467867,
     0.000000191097811353, 0.000000159180785886, 0.000000132594522029, 0.000000110448674207,
     0.000000092001613439, 0.000000076635565449, 0.000000063835940978, 0.000000053174105119,
-    0.000000044293003043, 0.000000036895215771, 0.000000030733001921, 0.000000025599996789
+    0.000000044293003043, 0.000000036895215771, 0.000000030733001921, 0.000000025599996789,
+    0.000000021324305018, 3689522167600.270019531250000000, 3073300627835.926757812500000000, 2560000000000.002929687500000000,
+    2132430501800.519042968750000000, 1776273376956.721923828125000000, 1479601378343.250244140625000000, 1232479339720.794189453125000000,
+    1026631459710.774291992187500000, 855164155779.391845703125000000, 712335206965.024780273437500000, 593361454233.829101562500000000,
+    494258618594.112609863281250000, 411707872682.763122558593750000, 342944697476.612365722656250000, 285666302081.983886718750000000,
+    237954506209.446411132812500000, 198211502766.368713378906250000, 165106349338.563323974609375000, 137530396629.095306396484375000,
+    114560161209.611633300781250000, 95426399240.062576293945312500, 79488345475.196502685546875000, 66212254855.064872741699218750,
+    55153528064.816276550292968750, 45941822471.611343383789062500, 38268649822.956413269042968750, 31877045369.216873168945312500,
+    26552962442.420688629150390625, 22118104306.789615631103515625, 18423953228.829509735107421875, 15346796808.164905548095703125,
+    12783585007.291271209716796875, 10648479137.463939666748046875, 8869977230.669750213623046875, 7388519530.061036109924316406,
+    6154493909.785535812377929688, 5126574428.270387649536132812, 4270337375.232155323028564453, 3557108465.595236301422119141,
+    2963002574.315670013427734375, 2468123854.056322574615478516, 2055899448.676229715347290039, 1712524489.450022459030151367,
+    1426499787.649837732315063477, 1188246741.404872417449951172, 989786560.561257958412170410, 824473067.192597866058349609,
+    686770123.591610312461853027, 572066234.090648531913757324, 476520111.962911486625671387, 396932039.637152194976806641,
+    330636714.243810534477233887, 275413990.026798009872436523, 229414528.498330980539321899, 191097866.455478429794311523,
+    159180827.835415601730346680, 132594551.788319095969200134, 110448697.892960876226425171, 92001629.793398514389991760,
+    76635578.744844585657119751, 63835955.327594503760337830, 53174116.504741288721561432, 44293010.914454914629459381,
+    36895221.676002673804759979, 30733006.278359245508909225, 25600000.000000011175870895, 21324305.018005173653364182,
+    17762733.769567202776670456, 14796013.783432489261031151, 12324793.397207930684089661, 10266314.597107734531164169,
+    8551641.557793911546468735, 7123352.069650243036448956, 5933614.542338287457823753, 4942586.185941123403608799,
+    4117078.726827629376202822, 3429446.974766122177243233, 2856663.020819837693125010, 2379545.062094463035464287,
+    1982115.027663686312735081, 1651063.493385632522404194, 1375303.966290952404960990, 1145601.612096115713939071,
+    954263.992400625254958868, 794883.454751964658498764, 662122.548550648498348892, 551535.280648162588477135,
+    459418.224716113239992410, 382686.498229563992936164, 318770.453692168579436839, 265529.624424206791445613,
+    221181.043067896069260314, 184239.532288295013131574, 153467.968081648985389620, 127835.850072912653558888,
+    106484.791374639346031472, 88699.772306697457679547, 73885.195300610314006917, 61544.939097855312866159,
+    51265.744282703839417081, 42703.373752321524079889, 35571.084655952341563534, 29630.025743156678800005,
+    24681.238540563208516687, 20558.994486762283486314, 17125.244894500214286381, 14264.997876498367986642,
+    11882.467414048716818797, 9897.865605612574654515, 8244.730671925974093028, 6867.701235916098994494,
+    5720.662340906482313585, 4765.201119629112326948, 3969.320396371519564127, 3306.367142438103201130,
+    2754.139900267978191550, 2294.145284983308101801, 1910.978664554782881169, 1591.808278354154936096,
+    1325.945517883190177599, 1104.486978929608085309, 920.016297933984674273, 766.355787448445425980,
+    638.359553275944676898, 531.741165047412550848, 442.930109144548907807, 368.952216760026544762,
 };
 
 #endif /* AVCODEC_MPCDATA_H */
diff --git a/libavcodec/mpeg12.c b/libavcodec/mpeg12.c
index c0c680d..7c14052 100644
--- a/libavcodec/mpeg12.c
+++ b/libavcodec/mpeg12.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000, 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,7 +25,12 @@
  * MPEG-1/2 decoder
  */
 
+#define UNCHECKED_BITSTREAM_READER 1
+
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
+#include "libavutil/timecode.h"
+
 #include "internal.h"
 #include "avcodec.h"
 #include "mpegvideo.h"
@@ -34,6 +39,7 @@
 #include "mpeg12data.h"
 #include "mpegvideodata.h"
 #include "bytestream.h"
+#include "vdpau_internal.h"
 #include "thread.h"
 
 uint8_t ff_mpeg12_static_rl_table_store[2][2][2*MAX_RUN + MAX_LEVEL + 3];
@@ -65,21 +71,21 @@ static const uint8_t table_mb_btype[11][2] = {
 #define INIT_2D_VLC_RL(rl, static_size)\
 {\
     static RL_VLC_ELEM rl_vlc_table[static_size];\
-    INIT_VLC_STATIC(&rl.vlc, TEX_VLC_BITS, rl.n + 2,\
-                    &rl.table_vlc[0][1], 4, 2,\
-                    &rl.table_vlc[0][0], 4, 2, static_size);\
-\
     rl.rl_vlc[0] = rl_vlc_table;\
-    init_2d_vlc_rl(&rl);\
+    init_2d_vlc_rl(&rl, static_size);\
 }
 
-static av_cold void init_2d_vlc_rl(RLTable *rl)
+static av_cold void init_2d_vlc_rl(RLTable *rl, unsigned static_size)
 {
     int i;
-
-    for (i = 0; i < rl->vlc.table_size; i++) {
-        int code = rl->vlc.table[i][0];
-        int len  = rl->vlc.table[i][1];
+    VLC_TYPE table[680][2] = {{0}};
+    VLC vlc = { .table = table, .table_allocated = static_size };
+    av_assert0(static_size <= FF_ARRAY_ELEMS(table));
+    init_vlc(&vlc, TEX_VLC_BITS, rl->n + 2, &rl->table_vlc[0][1], 4, 2, &rl->table_vlc[0][0], 4, 2, INIT_VLC_USE_NEW_STATIC);
+
+    for (i = 0; i < vlc.table_size; i++) {
+        int code = vlc.table[i][0];
+        int len  = vlc.table[i][1];
         int level, run;
 
         if (len == 0) { // illegal code
@@ -195,7 +201,7 @@ int ff_mpeg1_find_frame_end(ParseContext *pc, const uint8_t *buf, int buf_size,
 */
 
     for (i = 0; i < buf_size; i++) {
-        assert(pc->frame_start_found >= 0 && pc->frame_start_found <= 4);
+        av_assert1(pc->frame_start_found >= 0 && pc->frame_start_found <= 4);
         if (pc->frame_start_found & 1) {
             if (state == EXT_START_CODE && (buf[i] & 0xF0) != 0x80)
                 pc->frame_start_found--;
@@ -229,7 +235,7 @@ int ff_mpeg1_find_frame_end(ParseContext *pc, const uint8_t *buf, int buf_size,
                 }
             }
             if (pc->frame_start_found == 0 && s && state == PICTURE_START_CODE) {
-                ff_fetch_timestamp(s, i - 3, 1);
+                ff_fetch_timestamp(s, i - 3, 1, i > 3);
             }
         }
     }
@@ -262,16 +268,18 @@ int ff_mpeg1_decode_block_intra(GetBitContext *gb,
 
     {
         OPEN_READER(re, gb);
+        UPDATE_CACHE(re, gb);
+        if (((int32_t)GET_CACHE(re, gb)) <= (int32_t)0xBFFFFFFF)
+            goto end;
+
         /* now quantify & encode AC coefficients */
         while (1) {
             int level, run, j;
 
-            UPDATE_CACHE(re, gb);
-            GET_RL_VLC(level, run, re, gb, rl->rl_vlc[0], TEX_VLC_BITS, 2, 0);
+            GET_RL_VLC(level, run, re, gb, rl->rl_vlc[0],
+                       TEX_VLC_BITS, 2, 0);
 
-            if (level == 127) {
-                break;
-            } else if (level != 0) {
+            if (level != 0) {
                 i += run;
                 if (i > MAX_INDEX)
                     break;
@@ -281,7 +289,7 @@ int ff_mpeg1_decode_block_intra(GetBitContext *gb,
                 level = (level - 1) | 1;
                 level = (level ^ SHOW_SBITS(re, gb, 1)) -
                         SHOW_SBITS(re, gb, 1);
-                LAST_SKIP_BITS(re, gb, 1);
+                SKIP_BITS(re, gb, 1);
             } else {
                 /* escape */
                 run = SHOW_UBITS(re, gb, 6) + 1;
@@ -292,10 +300,10 @@ int ff_mpeg1_decode_block_intra(GetBitContext *gb,
 
                 if (level == -128) {
                     level = SHOW_UBITS(re, gb, 8) - 256;
-                    LAST_SKIP_BITS(re, gb, 8);
+                    SKIP_BITS(re, gb, 8);
                 } else if (level == 0) {
                     level = SHOW_UBITS(re, gb, 8);
-                    LAST_SKIP_BITS(re, gb, 8);
+                    SKIP_BITS(re, gb, 8);
                 }
 
                 i += run;
@@ -315,7 +323,13 @@ int ff_mpeg1_decode_block_intra(GetBitContext *gb,
             }
 
             block[j] = level;
+            if (((int32_t)GET_CACHE(re, gb)) <= (int32_t)0xBFFFFFFF)
+               break;
+
+            UPDATE_CACHE(re, gb);
         }
+end:
+        LAST_SKIP_BITS(re, gb, 2);
         CLOSE_READER(re, gb);
     }
 
diff --git a/libavcodec/mpeg12.h b/libavcodec/mpeg12.h
index 17f0b78..ad9b00c 100644
--- a/libavcodec/mpeg12.h
+++ b/libavcodec/mpeg12.h
@@ -2,20 +2,20 @@
  * MPEG-1/2 common code
  * Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpeg12data.c b/libavcodec/mpeg12data.c
index 46e797f..416bbb8 100644
--- a/libavcodec/mpeg12data.c
+++ b/libavcodec/mpeg12data.c
@@ -3,20 +3,20 @@
  * copyright (c) 2000,2001 Fabrice Bellard
  * copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 
 #include "mpeg12data.h"
 
-const uint16_t ff_mpeg1_default_intra_matrix[64] = {
+const uint16_t ff_mpeg1_default_intra_matrix[256] = {
         8, 16, 19, 22, 26, 27, 29, 34,
         16, 16, 22, 24, 27, 29, 34, 37,
         19, 22, 26, 27, 29, 34, 34, 38,
@@ -325,6 +325,72 @@ const AVRational ff_mpeg12_frame_rate_tab[16] = {
     {    0,    0},
 };
 
+const AVRational ff_mpeg2_frame_rate_tab[] = { /* {num, den} pairs; the last entry is {0, 0} (presumably an end marker -- confirm against users) */
+    {      1,     1},
+    {      2,     1},
+    {      3,     1},
+    {      4,     1},
+    {      5,     1},
+    {      6,     1},
+    {      8,     1},
+    {      9,     1},
+    {     10,     1},
+    {     12,     1},
+    {     15,     1},
+    {     16,     1},
+    {     18,     1},
+    {     20,     1},
+    {     24,     1},
+    {     25,     1},
+    {     30,     1},
+    {     32,     1},
+    {     36,     1},
+    {     40,     1},
+    {     45,     1},
+    {     48,     1},
+    {     50,     1},
+    {     60,     1},
+    {     72,     1},
+    {     75,     1},
+    {     80,     1},
+    {     90,     1},
+    {     96,     1},
+    {    100,     1},
+    {    120,     1},
+    {    150,     1},
+    {    180,     1},
+    {    200,     1},
+    {    240,     1},
+    {    750,  1001}, /* denominator is 1001 from here up to the final {0, 0} */
+    {    800,  1001},
+    {    960,  1001},
+    {   1000,  1001},
+    {   1200,  1001},
+    {   1250,  1001},
+    {   1500,  1001},
+    {   1600,  1001},
+    {   1875,  1001},
+    {   2000,  1001},
+    {   2400,  1001},
+    {   2500,  1001},
+    {   3000,  1001},
+    {   3750,  1001},
+    {   4000,  1001},
+    {   4800,  1001},
+    {   5000,  1001},
+    {   6000,  1001},
+    {   7500,  1001},
+    {   8000,  1001},
+    {  10000,  1001},
+    {  12000,  1001},
+    {  15000,  1001},
+    {  20000,  1001},
+    {  24000,  1001},
+    {  30000,  1001},
+    {  60000,  1001},
+    {      0,     0},
+};
+
 const float ff_mpeg1_aspect[16]={
     0.0000,
     1.0000,
diff --git a/libavcodec/mpeg12data.h b/libavcodec/mpeg12data.h
index c6750b8..f51faf4 100644
--- a/libavcodec/mpeg12data.h
+++ b/libavcodec/mpeg12data.h
@@ -3,20 +3,20 @@
  * copyright (c) 2000,2001 Fabrice Bellard
  * copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,7 +32,7 @@
 #include "libavutil/rational.h"
 #include "rl.h"
 
-extern const uint16_t ff_mpeg1_default_intra_matrix[64];
+extern const uint16_t ff_mpeg1_default_intra_matrix[];
 extern const uint16_t ff_mpeg1_default_non_intra_matrix[64];
 
 extern const uint16_t ff_mpeg12_vlc_dc_lum_code[12];
@@ -49,6 +49,7 @@ extern const uint8_t ff_mpeg12_mbPatTable[64][2];
 extern const uint8_t ff_mpeg12_mbMotionVectorTable[17][2];
 
 extern const AVRational ff_mpeg12_frame_rate_tab[];
+extern const AVRational ff_mpeg2_frame_rate_tab[];
 
 extern const float ff_mpeg1_aspect[16];
 extern const AVRational ff_mpeg2_aspect[16];
diff --git a/libavcodec/mpeg12dec.c b/libavcodec/mpeg12dec.c
index 2d9c99d..54a70e3 100644
--- a/libavcodec/mpeg12dec.c
+++ b/libavcodec/mpeg12dec.c
@@ -1,22 +1,22 @@
 /*
  * MPEG-1/2 decoder
  * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2002-2013 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,9 +25,11 @@
  * MPEG-1/2 decoder
  */
 
+#define UNCHECKED_BITSTREAM_READER 1
 #include <inttypes.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/imgutils.h"
 #include "libavutil/internal.h"
 #include "libavutil/stereo3d.h"
 
@@ -45,6 +47,7 @@
 #include "profiles.h"
 #include "thread.h"
 #include "version.h"
+#include "vdpau_compat.h"
 #include "xvmc_internal.h"
 
 typedef struct Mpeg1Context {
@@ -59,11 +62,11 @@ typedef struct Mpeg1Context {
     uint8_t afd;
     int has_afd;
     int slice_count;
-    int save_aspect_info;
+    AVRational save_aspect;
     int save_width, save_height, save_progressive_seq;
     AVRational frame_rate_ext;  /* MPEG-2 specific framerate modificator */
     int sync;                   /* Did we reach a sync point like a GOP/SEQ/KEYFrame? */
-    int closed_gop;             /* GOP is closed */
+    int tmpgexs;
     int first_slice;
     int extradata_decoded;
 } Mpeg1Context;
@@ -94,13 +97,6 @@ static const uint32_t btype2mb_type[11] = {
     MB_TYPE_QUANT | MB_TYPE_L0L1 | MB_TYPE_CBP,
 };
 
-static const uint8_t non_linear_qscale[32] = {
-     0,  1,  2,  3,  4,  5,   6,   7,
-     8, 10, 12, 14, 16, 18,  20,  22,
-    24, 28, 32, 36, 40, 44,  48,  52,
-    56, 64, 72, 80, 88, 96, 104, 112,
-};
-
 /* as H.263, but only 17 codes */
 static int mpeg_decode_motion(MpegEncContext *s, int fcode, int pred)
 {
@@ -223,6 +219,11 @@ end:
     return 0;
 }
 
+/**
+ * Note: this function can read out of range and crash for corrupt streams.
+ * Changing this would eat up any speed benefits it has.
+ * Do not use "fast" flag if you need the code to be robust.
+ */
 static inline int mpeg1_fast_decode_block_inter(MpegEncContext *s,
                                                 int16_t *block, int n)
 {
@@ -394,6 +395,11 @@ end:
     return 0;
 }
 
+/**
+ * Note: this function can read out of range and crash for corrupt streams.
+ * Changing this would eat up any speed benefits it has.
+ * Do not use "fast" flag if you need the code to be robust.
+ */
 static inline int mpeg2_fast_decode_block_non_intra(MpegEncContext *s,
                                                     int16_t *block, int n)
 {
@@ -451,8 +457,9 @@ static inline int mpeg2_fast_decode_block_non_intra(MpegEncContext *s,
         }
 
         block[j] = level;
-        if (((int32_t) GET_CACHE(re, &s->gb)) <= (int32_t) 0xBFFFFFFF)
+        if (((int32_t) GET_CACHE(re, &s->gb)) <= (int32_t) 0xBFFFFFFF || i > 63)
             break;
+
         UPDATE_CACHE(re, &s->gb);
     }
 end:
@@ -491,7 +498,7 @@ static inline int mpeg2_decode_block_intra(MpegEncContext *s,
     dc += diff;
     s->last_dc[component] = dc;
     block[0] = dc << (3 - s->intra_dc_precision);
-    ff_dlog(s->avctx, "dc=%d\n", block[0]);
+    ff_tlog(s->avctx, "dc=%d\n", block[0]);
     mismatch = block[0] ^ 1;
     i = 0;
     if (s->intra_vlc_format)
@@ -550,6 +557,11 @@ static inline int mpeg2_decode_block_intra(MpegEncContext *s,
     return 0;
 }
 
+/**
+ * Note: this function can read out of range and crash for corrupt streams.
+ * Changing this would eat up any speed benefits it has.
+ * Do not use "fast" flag if you need the code to be robust.
+ */
 static inline int mpeg2_fast_decode_block_intra(MpegEncContext *s,
                                                 int16_t *block, int n)
 {
@@ -589,12 +601,10 @@ static inline int mpeg2_fast_decode_block_intra(MpegEncContext *s,
             GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0],
                        TEX_VLC_BITS, 2, 0);
 
-            if (level == 127) {
+            if (level >= 64 || i > 63) {
                 break;
             } else if (level != 0) {
                 i += run;
-                if (i > MAX_INDEX)
-                    break;
                 j = scantable[i];
                 level = (level * qscale * quant_matrix[j]) >> 4;
                 level = (level ^ SHOW_SBITS(re, &s->gb, 1)) -
@@ -608,8 +618,6 @@ static inline int mpeg2_fast_decode_block_intra(MpegEncContext *s,
                 level = SHOW_SBITS(re, &s->gb, 12);
                 SKIP_BITS(re, &s->gb, 12);
                 i += run;
-                if (i > MAX_INDEX)
-                    break;
                 j = scantable[i];
                 if (level < 0) {
                     level = (-level * qscale * quant_matrix[j]) >> 4;
@@ -645,11 +653,12 @@ static inline int get_qscale(MpegEncContext *s)
 {
     int qscale = get_bits(&s->gb, 5);
     if (s->q_scale_type)
-        return non_linear_qscale[qscale];
+        return ff_mpeg2_non_linear_qscale[qscale];
     else
         return qscale << 1;
 }
 
+
 /* motion type (for MPEG-2) */
 #define MT_FIELD 1
 #define MT_FRAME 2
@@ -662,9 +671,9 @@ static int mpeg_decode_mb(MpegEncContext *s, int16_t block[12][64])
     const int mb_block_count = 4 + (1 << s->chroma_format);
     int ret;
 
-    ff_dlog(s->avctx, "decode_mb: x=%d y=%d\n", s->mb_x, s->mb_y);
+    ff_tlog(s->avctx, "decode_mb: x=%d y=%d\n", s->mb_x, s->mb_y);
 
-    assert(s->mb_skipped == 0);
+    av_assert2(s->mb_skipped == 0);
 
     if (s->mb_skip_run-- != 0) {
         if (s->pict_type == AV_PICTURE_TYPE_P) {
@@ -679,11 +688,12 @@ static int mpeg_decode_mb(MpegEncContext *s, int16_t block[12][64])
             else
                 // FIXME not sure if this is allowed in MPEG at all
                 mb_type = s->current_picture.mb_type[s->mb_width + (s->mb_y - 1) * s->mb_stride - 1];
-            if (IS_INTRA(mb_type))
+            if (IS_INTRA(mb_type)) {
+                av_log(s->avctx, AV_LOG_ERROR, "skip with previntra\n");
                 return AVERROR_INVALIDDATA;
+            }
             s->current_picture.mb_type[s->mb_x + s->mb_y * s->mb_stride] =
                 mb_type | MB_TYPE_SKIP;
-//            assert(s->current_picture.mb_type[s->mb_x + s->mb_y * s->mb_stride - 1] & (MB_TYPE_16x16 | MB_TYPE_16x8));
 
             if ((s->mv[0][0][0] | s->mv[0][0][1] | s->mv[1][0][0] | s->mv[1][0][1]) == 0)
                 s->mb_skipped = 1;
@@ -726,7 +736,7 @@ static int mpeg_decode_mb(MpegEncContext *s, int16_t block[12][64])
         mb_type = btype2mb_type[mb_type];
         break;
     }
-    ff_dlog(s->avctx, "mb_type=%x\n", mb_type);
+    ff_tlog(s->avctx, "mb_type=%x\n", mb_type);
 //    motion_type = 0; /* avoid warning */
     if (IS_INTRA(mb_type)) {
         s->bdsp.clear_blocks(s->block[0]);
@@ -757,19 +767,15 @@ static int mpeg_decode_mb(MpegEncContext *s, int16_t block[12][64])
             s->last_mv[0][1][1] = mpeg_decode_motion(s, s->mpeg_f_code[0][1],
                                                      s->last_mv[0][0][1]);
 
-            skip_bits1(&s->gb); /* marker */
+            check_marker(&s->gb, "after concealment_motion_vectors");
         } else {
             /* reset mv prediction */
             memset(s->last_mv, 0, sizeof(s->last_mv));
         }
         s->mb_intra = 1;
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
         // if 1, we memcpy blocks in xvmcvideo
-        if (CONFIG_MPEG_XVMC_DECODER && s->avctx->xvmc_acceleration > 1)
+        if ((CONFIG_MPEG1_XVMC_HWACCEL || CONFIG_MPEG2_XVMC_HWACCEL) && s->pack_pblocks)
             ff_xvmc_pack_pblocks(s, -1); // inter are always full blocks
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
 
         if (s->codec_id == AV_CODEC_ID_MPEG2VIDEO) {
             if (s->avctx->flags2 & AV_CODEC_FLAG2_FAST) {
@@ -798,11 +804,12 @@ FF_ENABLE_DEPRECATION_WARNINGS
         }
     } else {
         if (mb_type & MB_TYPE_ZERO_MV) {
-            assert(mb_type & MB_TYPE_CBP);
+            av_assert2(mb_type & MB_TYPE_CBP);
 
             s->mv_dir = MV_DIR_FORWARD;
             if (s->picture_structure == PICT_FRAME) {
-                if (!s->frame_pred_frame_dct)
+                if (s->picture_structure == PICT_FRAME
+                    && !s->frame_pred_frame_dct)
                     s->interlaced_dct = get_bits1(&s->gb);
                 s->mv_type = MV_TYPE_16X16;
             } else {
@@ -821,10 +828,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
             s->mv[0][0][0]      = 0;
             s->mv[0][0][1]      = 0;
         } else {
-            assert(mb_type & MB_TYPE_L0L1);
+            av_assert2(mb_type & MB_TYPE_L0L1);
             // FIXME decide if MBs in field pictures are MB_TYPE_INTERLACED
             /* get additional motion vector type */
-            if (s->frame_pred_frame_dct) {
+            if (s->picture_structure == PICT_FRAME && s->frame_pred_frame_dct) {
                 motion_type = MT_FRAME;
             } else {
                 motion_type = get_bits(&s->gb, 2);
@@ -837,7 +844,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
             /* motion vectors */
             s->mv_dir = (mb_type >> 13) & 3;
-            ff_dlog(s->avctx, "motion_type=%d\n", motion_type);
+            ff_tlog(s->avctx, "motion_type=%d\n", motion_type);
             switch (motion_type) {
             case MT_FRAME: /* or MT_16X8 */
                 if (s->picture_structure == PICT_FRAME) {
@@ -894,16 +901,17 @@ FF_ENABLE_DEPRECATION_WARNINGS
                                                          s->last_mv[i][j][0]);
                                 s->last_mv[i][j][0] = val;
                                 s->mv[i][j][0]      = val;
-                                ff_dlog(s->avctx, "fmx=%d\n", val);
+                                ff_tlog(s->avctx, "fmx=%d\n", val);
                                 val = mpeg_decode_motion(s, s->mpeg_f_code[i][1],
                                                          s->last_mv[i][j][1] >> 1);
-                                s->last_mv[i][j][1] = val << 1;
+                                s->last_mv[i][j][1] = 2 * val;
                                 s->mv[i][j][1]      = val;
-                                ff_dlog(s->avctx, "fmy=%d\n", val);
+                                ff_tlog(s->avctx, "fmy=%d\n", val);
                             }
                         }
                     }
                 } else {
+                    av_assert0(!s->progressive_sequence);
                     mb_type |= MB_TYPE_16x16 | MB_TYPE_INTERLACED;
                     for (i = 0; i < 2; i++) {
                         if (USES_LIST(mb_type, i)) {
@@ -920,6 +928,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
                 }
                 break;
             case MT_DMV:
+                if (s->progressive_sequence){
+                    av_log(s->avctx, AV_LOG_ERROR, "MT_DMV in progressive_sequence\n");
+                    return AVERROR_INVALIDDATA;
+                }
                 s->mv_type = MV_TYPE_DMV;
                 for (i = 0; i < 2; i++) {
                     if (USES_LIST(mb_type, i)) {
@@ -988,17 +1000,13 @@ FF_ENABLE_DEPRECATION_WARNINGS
             }
             if (cbp <= 0) {
                 av_log(s->avctx, AV_LOG_ERROR,
-                       "invalid cbp at %d %d\n", s->mb_x, s->mb_y);
+                       "invalid cbp %d at %d %d\n", cbp, s->mb_x, s->mb_y);
                 return AVERROR_INVALIDDATA;
             }
 
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
             // if 1, we memcpy blocks in xvmcvideo
-            if (CONFIG_MPEG_XVMC_DECODER && s->avctx->xvmc_acceleration > 1)
+            if ((CONFIG_MPEG1_XVMC_HWACCEL || CONFIG_MPEG2_XVMC_HWACCEL) && s->pack_pblocks)
                 ff_xvmc_pack_pblocks(s, cbp);
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
 
             if (s->codec_id == AV_CODEC_ID_MPEG2VIDEO) {
                 if (s->avctx->flags2 & AV_CODEC_FLAG2_FAST) {
@@ -1061,6 +1069,11 @@ static av_cold int mpeg_decode_init(AVCodecContext *avctx)
 
     ff_mpv_decode_defaults(s2);
 
+    if (   avctx->codec_tag != AV_RL32("VCR2")
+        && avctx->codec_tag != AV_RL32("BW10"))
+        avctx->coded_width = avctx->coded_height = 0; // do not trust dimensions from input
+    ff_mpv_decode_init(s2, avctx);
+
     s->mpeg_enc_ctx.avctx  = avctx;
 
     /* we need some permutation to store matrices,
@@ -1069,18 +1082,16 @@ static av_cold int mpeg_decode_init(AVCodecContext *avctx)
     ff_mpeg12_common_init(&s->mpeg_enc_ctx);
     ff_mpeg12_init_vlcs();
 
+    s2->chroma_format              = 1;
     s->mpeg_enc_ctx_allocated      = 0;
     s->mpeg_enc_ctx.picture_number = 0;
     s->repeat_field                = 0;
     s->mpeg_enc_ctx.codec_id       = avctx->codec->id;
     avctx->color_range             = AVCOL_RANGE_MPEG;
-    if (avctx->codec->id == AV_CODEC_ID_MPEG1VIDEO)
-        avctx->chroma_sample_location = AVCHROMA_LOC_CENTER;
-    else
-        avctx->chroma_sample_location = AVCHROMA_LOC_LEFT;
     return 0;
 }
 
+#if HAVE_THREADS
 static int mpeg_decode_update_thread_context(AVCodecContext *avctx,
                                              const AVCodecContext *avctx_from)
 {
@@ -1097,17 +1108,15 @@ static int mpeg_decode_update_thread_context(AVCodecContext *avctx,
     if (err)
         return err;
 
-    if (!ctx->mpeg_enc_ctx_allocated) {
-        // copy the whole context after the initial MpegEncContext structure
-        memcpy(ctx, ctx_from, sizeof(*ctx));
-        memset(&ctx->mpeg_enc_ctx, 0, sizeof(ctx->mpeg_enc_ctx));
-    }
+    if (!ctx->mpeg_enc_ctx_allocated)
+        memcpy(s + 1, s1 + 1, sizeof(Mpeg1Context) - sizeof(MpegEncContext));
 
     if (!(s->pict_type == AV_PICTURE_TYPE_B || s->low_delay))
         s->picture_number++;
 
     return 0;
 }
+#endif
 
 static void quant_matrix_rebuild(uint16_t *matrix, const uint8_t *old_perm,
                                  const uint8_t *new_perm)
@@ -1121,15 +1130,30 @@ static void quant_matrix_rebuild(uint16_t *matrix, const uint8_t *old_perm,
         matrix[new_perm[i]] = temp_matrix[old_perm[i]];
 }
 
-#if FF_API_XVMC
-static const enum AVPixelFormat pixfmt_xvmc_mpg2_420[] = {
-    AV_PIX_FMT_XVMC_MPEG2_IDCT,
-    AV_PIX_FMT_XVMC_MPEG2_MC,
+static const enum AVPixelFormat mpeg1_hwaccel_pixfmt_list_420[] = {
+#if CONFIG_MPEG1_XVMC_HWACCEL
+    AV_PIX_FMT_XVMC,
+#endif
+#if CONFIG_MPEG1_VDPAU_DECODER && FF_API_VDPAU
+    AV_PIX_FMT_VDPAU_MPEG1,
+#endif
+#if CONFIG_MPEG1_VDPAU_HWACCEL
+    AV_PIX_FMT_VDPAU,
+#endif
+    AV_PIX_FMT_YUV420P,
     AV_PIX_FMT_NONE
 };
-#endif /* FF_API_XVMC */
 
-static const enum AVPixelFormat mpeg12_hwaccel_pixfmt_list_420[] = {
+static const enum AVPixelFormat mpeg2_hwaccel_pixfmt_list_420[] = {
+#if CONFIG_MPEG2_XVMC_HWACCEL
+    AV_PIX_FMT_XVMC,
+#endif
+#if CONFIG_MPEG_VDPAU_DECODER && FF_API_VDPAU
+    AV_PIX_FMT_VDPAU_MPEG2,
+#endif
+#if CONFIG_MPEG2_VDPAU_HWACCEL
+    AV_PIX_FMT_VDPAU,
+#endif
 #if CONFIG_MPEG2_DXVA2_HWACCEL
     AV_PIX_FMT_DXVA2_VLD,
 #endif
@@ -1139,8 +1163,8 @@ static const enum AVPixelFormat mpeg12_hwaccel_pixfmt_list_420[] = {
 #if CONFIG_MPEG2_VAAPI_HWACCEL
     AV_PIX_FMT_VAAPI,
 #endif
-#if CONFIG_MPEG1_VDPAU_HWACCEL | CONFIG_MPEG2_VDPAU_HWACCEL
-    AV_PIX_FMT_VDPAU,
+#if CONFIG_MPEG2_VIDEOTOOLBOX_HWACCEL
+    AV_PIX_FMT_VIDEOTOOLBOX,
 #endif
     AV_PIX_FMT_YUV420P,
     AV_PIX_FMT_NONE
@@ -1156,27 +1180,55 @@ static const enum AVPixelFormat mpeg12_pixfmt_list_444[] = {
     AV_PIX_FMT_NONE
 };
 
+#if FF_API_VDPAU
+static inline int uses_vdpau(AVCodecContext *avctx) { /* returns nonzero iff avctx->pix_fmt is AV_PIX_FMT_VDPAU_MPEG1 or AV_PIX_FMT_VDPAU_MPEG2 */
+    return avctx->pix_fmt == AV_PIX_FMT_VDPAU_MPEG1 || avctx->pix_fmt == AV_PIX_FMT_VDPAU_MPEG2;
+}
+#endif
+
 static enum AVPixelFormat mpeg_get_pixelformat(AVCodecContext *avctx)
 {
     Mpeg1Context *s1  = avctx->priv_data;
     MpegEncContext *s = &s1->mpeg_enc_ctx;
     const enum AVPixelFormat *pix_fmts;
 
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
-    if (avctx->xvmc_acceleration)
-        return ff_get_format(avctx, pixfmt_xvmc_mpg2_420);
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
+    if (CONFIG_GRAY && (avctx->flags & AV_CODEC_FLAG_GRAY))
+        return AV_PIX_FMT_GRAY8;
 
     if (s->chroma_format < 2)
-        pix_fmts = mpeg12_hwaccel_pixfmt_list_420;
+        pix_fmts = avctx->codec_id == AV_CODEC_ID_MPEG1VIDEO ?
+                                mpeg1_hwaccel_pixfmt_list_420 :
+                                mpeg2_hwaccel_pixfmt_list_420;
     else if (s->chroma_format == 2)
         pix_fmts = mpeg12_pixfmt_list_422;
     else
         pix_fmts = mpeg12_pixfmt_list_444;
 
-    return ff_get_format(avctx, pix_fmts);
+    return ff_thread_get_format(avctx, pix_fmts);
+}
+
+static void setup_hwaccel_for_pixfmt(AVCodecContext *avctx) /* adjust avctx/decoder state to the hwaccel and pix_fmt currently set in avctx */
+{
+    // until then pix_fmt may be changed right after codec init
+    if (avctx->hwaccel
+#if FF_API_VDPAU
+        || uses_vdpau(avctx)
+#endif
+        )
+        if (avctx->idct_algo == FF_IDCT_AUTO) // an explicitly chosen IDCT is left untouched
+            avctx->idct_algo = FF_IDCT_SIMPLE;
+
+    if (avctx->hwaccel && avctx->pix_fmt == AV_PIX_FMT_XVMC) {
+        Mpeg1Context *s1 = avctx->priv_data;
+        MpegEncContext *s = &s1->mpeg_enc_ctx;
+
+        s->pack_pblocks = 1; // mpeg_decode_mb() tests this flag before calling ff_xvmc_pack_pblocks()
+#if FF_API_XVMC
+FF_DISABLE_DEPRECATION_WARNINGS
+        avctx->xvmc_acceleration = 2; // NOTE(review): presumably kept for users of the deprecated field -- confirm
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif /* FF_API_XVMC */
+    }
 }
 
 /* Call this function when we know all parameters.
@@ -1188,27 +1240,83 @@ static int mpeg_decode_postinit(AVCodecContext *avctx)
     uint8_t old_permutation[64];
     int ret;
 
+    if (avctx->codec_id == AV_CODEC_ID_MPEG1VIDEO) {
+        // MPEG-1 aspect
+        avctx->sample_aspect_ratio = av_d2q(1.0 / ff_mpeg1_aspect[s->aspect_ratio_info], 255);
+    } else { // MPEG-2
+        // MPEG-2 aspect
+        if (s->aspect_ratio_info > 1) {
+            AVRational dar =
+                av_mul_q(av_div_q(ff_mpeg2_aspect[s->aspect_ratio_info],
+                                  (AVRational) { s1->pan_scan.width,
+                                                 s1->pan_scan.height }),
+                         (AVRational) { s->width, s->height });
+
+            /* We ignore the spec here and guess a bit as reality does not
+             * match the spec, see for example res_change_ffmpeg_aspect.ts
+             * and sequence-display-aspect.mpg.
+             * issue1613, 621, 562 */
+            if ((s1->pan_scan.width == 0) || (s1->pan_scan.height == 0) ||
+                (av_cmp_q(dar, (AVRational) { 4, 3 }) &&
+                 av_cmp_q(dar, (AVRational) { 16, 9 }))) {
+                s->avctx->sample_aspect_ratio =
+                    av_div_q(ff_mpeg2_aspect[s->aspect_ratio_info],
+                             (AVRational) { s->width, s->height });
+            } else {
+                s->avctx->sample_aspect_ratio =
+                    av_div_q(ff_mpeg2_aspect[s->aspect_ratio_info],
+                             (AVRational) { s1->pan_scan.width, s1->pan_scan.height });
+// issue1613 4/3 16/9 -> 16/9
+// res_change_ffmpeg_aspect.ts 4/3 225/44 ->4/3
+// widescreen-issue562.mpg 4/3 16/9 -> 16/9
+//                s->avctx->sample_aspect_ratio = av_mul_q(s->avctx->sample_aspect_ratio, (AVRational) {s->width, s->height});
+                ff_dlog(avctx, "aspect A %d/%d\n",
+                        ff_mpeg2_aspect[s->aspect_ratio_info].num,
+                        ff_mpeg2_aspect[s->aspect_ratio_info].den);
+                ff_dlog(avctx, "aspect B %d/%d\n", s->avctx->sample_aspect_ratio.num,
+                        s->avctx->sample_aspect_ratio.den);
+            }
+        } else {
+            s->avctx->sample_aspect_ratio =
+                ff_mpeg2_aspect[s->aspect_ratio_info];
+        }
+    } // MPEG-2
+
+    if (av_image_check_sar(s->width, s->height,
+                           avctx->sample_aspect_ratio) < 0) {
+        av_log(avctx, AV_LOG_WARNING, "ignoring invalid SAR: %u/%u\n",
+                avctx->sample_aspect_ratio.num,
+                avctx->sample_aspect_ratio.den);
+        avctx->sample_aspect_ratio = (AVRational){ 0, 1 };
+    }
+
     if ((s1->mpeg_enc_ctx_allocated == 0)                   ||
         avctx->coded_width       != s->width                ||
         avctx->coded_height      != s->height               ||
         s1->save_width           != s->width                ||
         s1->save_height          != s->height               ||
-        s1->save_aspect_info     != s->aspect_ratio_info    ||
-        s1->save_progressive_seq != s->progressive_sequence ||
+        av_cmp_q(s1->save_aspect, s->avctx->sample_aspect_ratio) ||
+        (s1->save_progressive_seq != s->progressive_sequence && FFALIGN(s->height, 16) != FFALIGN(s->height, 32)) ||
         0) {
         if (s1->mpeg_enc_ctx_allocated) {
             ParseContext pc = s->parse_context;
             s->parse_context.buffer = 0;
             ff_mpv_common_end(s);
             s->parse_context = pc;
+            s1->mpeg_enc_ctx_allocated = 0;
         }
 
         ret = ff_set_dimensions(avctx, s->width, s->height);
         if (ret < 0)
             return ret;
 
-        avctx->bit_rate          = s->bit_rate;
-        s1->save_aspect_info     = s->aspect_ratio_info;
+        if (avctx->codec_id == AV_CODEC_ID_MPEG2VIDEO && s->bit_rate) {
+            avctx->rc_max_rate = s->bit_rate;
+        } else if (avctx->codec_id == AV_CODEC_ID_MPEG1VIDEO && s->bit_rate &&
+                   (s->bit_rate != 0x3FFFF*400 || s->vbv_delay != 0xFFFF)) {
+            avctx->bit_rate = s->bit_rate;
+        }
+        s1->save_aspect          = s->avctx->sample_aspect_ratio;
         s1->save_width           = s->width;
         s1->save_height          = s->height;
         s1->save_progressive_seq = s->progressive_sequence;
@@ -1220,66 +1328,28 @@ static int mpeg_decode_postinit(AVCodecContext *avctx)
         if (avctx->codec_id == AV_CODEC_ID_MPEG1VIDEO) {
             // MPEG-1 fps
             avctx->framerate = ff_mpeg12_frame_rate_tab[s->frame_rate_index];
-            // MPEG-1 aspect
-            avctx->sample_aspect_ratio = av_d2q(1.0 / ff_mpeg1_aspect[s->aspect_ratio_info], 255);
             avctx->ticks_per_frame     = 1;
+
+            avctx->chroma_sample_location = AVCHROMA_LOC_CENTER;
         } else { // MPEG-2
             // MPEG-2 fps
             av_reduce(&s->avctx->framerate.num,
                       &s->avctx->framerate.den,
-                      ff_mpeg12_frame_rate_tab[s->frame_rate_index].num * s1->frame_rate_ext.num * 2,
+                      ff_mpeg12_frame_rate_tab[s->frame_rate_index].num * s1->frame_rate_ext.num,
                       ff_mpeg12_frame_rate_tab[s->frame_rate_index].den * s1->frame_rate_ext.den,
                       1 << 30);
             avctx->ticks_per_frame = 2;
-            // MPEG-2 aspect
-            if (s->aspect_ratio_info > 1) {
-                AVRational dar =
-                    av_mul_q(av_div_q(ff_mpeg2_aspect[s->aspect_ratio_info],
-                                      (AVRational) { s1->pan_scan.width,
-                                                     s1->pan_scan.height }),
-                             (AVRational) { s->width, s->height });
 
-                /* We ignore the spec here and guess a bit as reality does not
-                 * match the spec, see for example res_change_ffmpeg_aspect.ts
-                 * and sequence-display-aspect.mpg.
-                 * issue1613, 621, 562 */
-                if ((s1->pan_scan.width == 0) || (s1->pan_scan.height == 0) ||
-                    (av_cmp_q(dar, (AVRational) { 4, 3 }) &&
-                     av_cmp_q(dar, (AVRational) { 16, 9 }))) {
-                    s->avctx->sample_aspect_ratio =
-                        av_div_q(ff_mpeg2_aspect[s->aspect_ratio_info],
-                                 (AVRational) { s->width, s->height });
-                } else {
-                    s->avctx->sample_aspect_ratio =
-                        av_div_q(ff_mpeg2_aspect[s->aspect_ratio_info],
-                                 (AVRational) { s1->pan_scan.width, s1->pan_scan.height });
-// issue1613 4/3 16/9 -> 16/9
-// res_change_ffmpeg_aspect.ts 4/3 225/44 ->4/3
-// widescreen-issue562.mpg 4/3 16/9 -> 16/9
-//                    s->avctx->sample_aspect_ratio = av_mul_q(s->avctx->sample_aspect_ratio, (AVRational) {s->width, s->height});
-                    ff_dlog(avctx, "A %d/%d\n",
-                            ff_mpeg2_aspect[s->aspect_ratio_info].num,
-                            ff_mpeg2_aspect[s->aspect_ratio_info].den);
-                    ff_dlog(avctx, "B %d/%d\n", s->avctx->sample_aspect_ratio.num,
-                            s->avctx->sample_aspect_ratio.den);
-                }
-            } else {
-                s->avctx->sample_aspect_ratio =
-                    ff_mpeg2_aspect[s->aspect_ratio_info];
+            switch (s->chroma_format) {
+            case 1: avctx->chroma_sample_location = AVCHROMA_LOC_LEFT; break;
+            case 2:
+            case 3: avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT; break;
+            default: av_assert0(0);
             }
         } // MPEG-2
 
-        ff_set_sar(s->avctx, s->avctx->sample_aspect_ratio);
-
         avctx->pix_fmt = mpeg_get_pixelformat(avctx);
-        // until then pix_fmt may be changed right after codec init
-#if FF_API_XVMC
-        if ((avctx->pix_fmt == AV_PIX_FMT_XVMC_MPEG2_IDCT ||
-             avctx->hwaccel) && avctx->idct_algo == FF_IDCT_AUTO)
-#else
-        if (avctx->hwaccel && avctx->idct_algo == FF_IDCT_AUTO)
-#endif /* FF_API_XVMC */
-            avctx->idct_algo = FF_IDCT_SIMPLE;
+        setup_hwaccel_for_pixfmt(avctx);
 
         /* Quantization matrices may need reordering
          * if DCT permutation is changed. */
@@ -1314,20 +1384,23 @@ static int mpeg1_decode_picture(AVCodecContext *avctx, const uint8_t *buf,
         return AVERROR_INVALIDDATA;
 
     vbv_delay = get_bits(&s->gb, 16);
+    s->vbv_delay = vbv_delay;
     if (s->pict_type == AV_PICTURE_TYPE_P ||
         s->pict_type == AV_PICTURE_TYPE_B) {
         s->full_pel[0] = get_bits1(&s->gb);
         f_code = get_bits(&s->gb, 3);
-        if (f_code == 0 && (avctx->err_recognition & AV_EF_BITSTREAM))
+        if (f_code == 0 && (avctx->err_recognition & (AV_EF_BITSTREAM|AV_EF_COMPLIANT)))
             return AVERROR_INVALIDDATA;
+        f_code += !f_code;
         s->mpeg_f_code[0][0] = f_code;
         s->mpeg_f_code[0][1] = f_code;
     }
     if (s->pict_type == AV_PICTURE_TYPE_B) {
         s->full_pel[1] = get_bits1(&s->gb);
         f_code = get_bits(&s->gb, 3);
-        if (f_code == 0 && (avctx->err_recognition & AV_EF_BITSTREAM))
+        if (f_code == 0 && (avctx->err_recognition & (AV_EF_BITSTREAM|AV_EF_COMPLIANT)))
             return AVERROR_INVALIDDATA;
+        f_code += !f_code;
         s->mpeg_f_code[1][0] = f_code;
         s->mpeg_f_code[1][1] = f_code;
     }
@@ -1354,13 +1427,19 @@ static void mpeg_decode_sequence_extension(Mpeg1Context *s1)
     s->avctx->level         = get_bits(&s->gb, 4);
     s->progressive_sequence = get_bits1(&s->gb);   /* progressive_sequence */
     s->chroma_format        = get_bits(&s->gb, 2); /* chroma_format 1=420, 2=422, 3=444 */
+
+    if (!s->chroma_format) {
+        s->chroma_format = 1;
+        av_log(s->avctx, AV_LOG_WARNING, "Chroma format invalid\n");
+    }
+
     horiz_size_ext          = get_bits(&s->gb, 2);
     vert_size_ext           = get_bits(&s->gb, 2);
     s->width  |= (horiz_size_ext << 12);
     s->height |= (vert_size_ext  << 12);
     bit_rate_ext = get_bits(&s->gb, 12);  /* XXX: handle it */
-    s->bit_rate += (bit_rate_ext << 18) * 400;
-    skip_bits1(&s->gb); /* marker */
+    s->bit_rate += (bit_rate_ext << 18) * 400LL;
+    check_marker(&s->gb, "after bit rate extension");
     s->avctx->rc_buffer_size += get_bits(&s->gb, 8) * 1024 * 16 << 10;
 
     s->low_delay = get_bits1(&s->gb);
@@ -1375,8 +1454,8 @@ static void mpeg_decode_sequence_extension(Mpeg1Context *s1)
 
     if (s->avctx->debug & FF_DEBUG_PICT_INFO)
         av_log(s->avctx, AV_LOG_DEBUG,
-               "profile: %d, level: %d vbv buffer: %d, bitrate:%d\n",
-               s->avctx->profile, s->avctx->level,
+               "profile: %d, level: %d ps: %d cf:%d vbv buffer: %d, bitrate:%"PRId64"\n",
+               s->avctx->profile, s->avctx->level, s->progressive_sequence, s->chroma_format,
                s->avctx->rc_buffer_size, s->bit_rate);
 }
 
@@ -1451,7 +1530,7 @@ static int load_matrix(MpegEncContext *s, uint16_t matrix0[64],
             return AVERROR_INVALIDDATA;
         }
         if (intra && i == 0 && v != 8) {
-            av_log(s->avctx, AV_LOG_ERROR, "intra matrix invalid, ignoring\n");
+            av_log(s->avctx, AV_LOG_DEBUG, "intra matrix specifies invalid DC quantizer %d, ignoring\n", v);
             v = 8; // needed by pink.mpg / issue1046
         }
         matrix0[j] = v;
@@ -1497,6 +1576,11 @@ static void mpeg_decode_picture_coding_extension(Mpeg1Context *s1)
         s->current_picture.f->pict_type = s->pict_type;
         s->current_picture.f->key_frame = s->pict_type == AV_PICTURE_TYPE_I;
     }
+    s->mpeg_f_code[0][0] += !s->mpeg_f_code[0][0];
+    s->mpeg_f_code[0][1] += !s->mpeg_f_code[0][1];
+    s->mpeg_f_code[1][0] += !s->mpeg_f_code[1][0];
+    s->mpeg_f_code[1][1] += !s->mpeg_f_code[1][1];
+
     s->intra_dc_precision         = get_bits(&s->gb, 2);
     s->picture_structure          = get_bits(&s->gb, 2);
     s->top_field_first            = get_bits1(&s->gb);
@@ -1509,32 +1593,6 @@ static void mpeg_decode_picture_coding_extension(Mpeg1Context *s1)
     s->chroma_420_type            = get_bits1(&s->gb);
     s->progressive_frame          = get_bits1(&s->gb);
 
-    if (s->progressive_sequence && !s->progressive_frame) {
-        s->progressive_frame = 1;
-        av_log(s->avctx, AV_LOG_ERROR,
-               "interlaced frame in progressive sequence, ignoring\n");
-    }
-
-    if (s->picture_structure == 0 ||
-        (s->progressive_frame && s->picture_structure != PICT_FRAME)) {
-        av_log(s->avctx, AV_LOG_ERROR,
-               "picture_structure %d invalid, ignoring\n",
-               s->picture_structure);
-        s->picture_structure = PICT_FRAME;
-    }
-
-    if (s->progressive_sequence && !s->frame_pred_frame_dct)
-        av_log(s->avctx, AV_LOG_WARNING, "invalid frame_pred_frame_dct\n");
-
-    if (s->picture_structure == PICT_FRAME) {
-        s->first_field = 0;
-        s->v_edge_pos  = 16 * s->mb_height;
-    } else {
-        s->first_field ^= 1;
-        s->v_edge_pos   = 8 * s->mb_height;
-        memset(s->mbskip_table, 0, s->mb_stride * s->mb_height);
-    }
-
     if (s->alternate_scan) {
         ff_init_scantable(s->idsp.idct_permutation, &s->inter_scantable, ff_alternate_vertical_scan);
         ff_init_scantable(s->idsp.idct_permutation, &s->intra_scantable, ff_alternate_vertical_scan);
@@ -1597,6 +1655,7 @@ static int mpeg_field_start(MpegEncContext *s, const uint8_t *buf, int buf_size)
             if (sd)
                 memcpy(sd->data, s1->a53_caption, s1->a53_caption_size);
             av_freep(&s1->a53_caption);
+            avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS;
         }
 
         if (s1->has_stereo3d) {
@@ -1631,9 +1690,11 @@ static int mpeg_field_start(MpegEncContext *s, const uint8_t *buf, int buf_size)
 
         if (s->avctx->hwaccel &&
             (s->avctx->slice_flags & SLICE_FLAG_ALLOW_FIELD)) {
-            if (s->avctx->hwaccel->end_frame(s->avctx) < 0)
+            if ((ret = s->avctx->hwaccel->end_frame(s->avctx)) < 0) {
                 av_log(avctx, AV_LOG_ERROR,
                        "hardware accelerator failed to decode first field\n");
+                return ret;
+            }
         }
 
         for (i = 0; i < 4; i++) {
@@ -1649,16 +1710,6 @@ static int mpeg_field_start(MpegEncContext *s, const uint8_t *buf, int buf_size)
             return ret;
     }
 
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
-// ff_mpv_frame_start will call this function too,
-// but we need to call it on every field
-    if (CONFIG_MPEG_XVMC_DECODER && s->avctx->xvmc_acceleration)
-        if (ff_xvmc_field_start(s, avctx) < 0)
-            return -1;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
-
     return 0;
 }
 
@@ -1675,15 +1726,18 @@ static int mpeg_decode_slice(MpegEncContext *s, int mb_y,
                              const uint8_t **buf, int buf_size)
 {
     AVCodecContext *avctx = s->avctx;
+    const int lowres      = s->avctx->lowres;
     const int field_pic   = s->picture_structure != PICT_FRAME;
     int ret;
 
     s->resync_mb_x =
     s->resync_mb_y = -1;
 
-    assert(mb_y < s->mb_height);
+    av_assert0(mb_y < s->mb_height);
 
     init_get_bits(&s->gb, *buf, buf_size * 8);
+    if (s->codec_id != AV_CODEC_ID_MPEG1VIDEO && s->mb_height > 2800/16)
+        skip_bits(&s->gb, 3);
 
     ff_mpeg1_clean_buffers(s);
     s->interlaced_dct = 0;
@@ -1696,8 +1750,8 @@ static int mpeg_decode_slice(MpegEncContext *s, int mb_y,
     }
 
     /* extra slice info */
-    while (get_bits1(&s->gb) != 0)
-        skip_bits(&s->gb, 8);
+    if (skip_1stop_8data_bits(&s->gb) < 0)
+        return AVERROR_INVALIDDATA;
 
     s->mb_x = 0;
 
@@ -1727,7 +1781,7 @@ static int mpeg_decode_slice(MpegEncContext *s, int mb_y,
         return AVERROR_INVALIDDATA;
     }
 
-    if (avctx->hwaccel) {
+    if (avctx->hwaccel && avctx->hwaccel->decode_slice) {
         const uint8_t *buf_end, *buf_start = *buf - 4; /* include start_code */
         int start_code = -1;
         buf_end = avpriv_find_start_code(buf_start + 2, *buf + buf_size, &start_code);
@@ -1767,13 +1821,9 @@ static int mpeg_decode_slice(MpegEncContext *s, int mb_y,
     }
 
     for (;;) {
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
         // If 1, we memcpy blocks in xvmcvideo.
-        if (CONFIG_MPEG_XVMC_DECODER && s->avctx->xvmc_acceleration > 1)
+        if ((CONFIG_MPEG1_XVMC_HWACCEL || CONFIG_MPEG2_XVMC_HWACCEL) && s->pack_pblocks)
             ff_xvmc_init_block(s); // set s->block
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
 
         if ((ret = mpeg_decode_mb(s, s->block)) < 0)
             return ret;
@@ -1805,22 +1855,23 @@ FF_ENABLE_DEPRECATION_WARNINGS
                     s->current_picture.motion_val[dir][xy + 1][1] = motion_y;
                     s->current_picture.ref_index [dir][b8_xy]     =
                     s->current_picture.ref_index [dir][b8_xy + 1] = s->field_select[dir][i];
-                    assert(s->field_select[dir][i] == 0 ||
-                           s->field_select[dir][i] == 1);
+                    av_assert2(s->field_select[dir][i] == 0 ||
+                               s->field_select[dir][i] == 1);
                 }
                 xy    += wrap;
                 b8_xy += 2;
             }
         }
 
-        s->dest[0] += 16;
-        s->dest[1] += 16 >> s->chroma_x_shift;
-        s->dest[2] += 16 >> s->chroma_x_shift;
+        s->dest[0] += 16 >> lowres;
+        s->dest[1] +=(16 >> lowres) >> s->chroma_x_shift;
+        s->dest[2] +=(16 >> lowres) >> s->chroma_x_shift;
 
         ff_mpv_decode_mb(s, s->block);
 
         if (++s->mb_x >= s->mb_width) {
-            const int mb_size = 16;
+            const int mb_size = 16 >> s->avctx->lowres;
+            int left;
 
             ff_mpeg_draw_horiz_band(s, mb_size * (s->mb_y >> field_pic), mb_size);
             ff_mpv_report_decode_progress(s);
@@ -1838,15 +1889,40 @@ FF_ENABLE_DEPRECATION_WARNINGS
                              s->progressive_frame == 0
                              /* vbv_delay == 0xBBB || 0xE10 */;
 
+                if (left >= 32 && !is_d10) {
+                    GetBitContext gb = s->gb;
+                    align_get_bits(&gb);
+                    if (show_bits(&gb, 24) == 0x060E2B) {
+                        av_log(avctx, AV_LOG_DEBUG, "Invalid MXF data found in video stream\n");
+                        is_d10 = 1;
+                    }
+                    if (left > 32 && show_bits_long(&gb, 32) == 0x201) {
+                        av_log(avctx, AV_LOG_DEBUG, "skipping m704 alpha (unsupported)\n");
+                        goto eos;
+                    }
+                }
+
                 if (left < 0 ||
                     (left && show_bits(&s->gb, FFMIN(left, 23)) && !is_d10) ||
-                    ((avctx->err_recognition & AV_EF_BUFFER) && left > 8)) {
-                    av_log(avctx, AV_LOG_ERROR, "end mismatch left=%d %0X\n",
-                           left, show_bits(&s->gb, FFMIN(left, 23)));
+                    ((avctx->err_recognition & (AV_EF_BITSTREAM | AV_EF_AGGRESSIVE)) && left > 8)) {
+                    av_log(avctx, AV_LOG_ERROR, "end mismatch left=%d %0X at %d %d\n",
+                           left, left>0 ? show_bits(&s->gb, FFMIN(left, 23)) : 0, s->mb_x, s->mb_y);
                     return AVERROR_INVALIDDATA;
                 } else
                     goto eos;
             }
+            // Some files out there are missing the last slice in cases
+            // where that slice is completely outside the visible area.
+            // We detect this here instead of running into the end and
+            // expecting more data.
+            left = get_bits_left(&s->gb);
+            if (s->mb_y >= ((s->height + 15) >> 4) &&
+                !s->progressive_sequence &&
+                left <= 25 &&
+                left >= 0 &&
+                s->mb_skip_run == -1 &&
+                (!left || show_bits(&s->gb, left) == 0))
+                goto eos;
 
             ff_init_block_index(s);
         }
@@ -1912,8 +1988,12 @@ FF_ENABLE_DEPRECATION_WARNINGS
         }
     }
 eos: // end of slice
+    if (get_bits_left(&s->gb) < 0) {
+        av_log(s, AV_LOG_ERROR, "overread %d\n", -get_bits_left(&s->gb));
+        return AVERROR_INVALIDDATA;
+    }
     *buf += (get_bits_count(&s->gb) - 1) / 8;
-    ff_dlog(s, "y %d %d %d %d\n", s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y);
+    ff_dlog(s, "Slice start:%d %d  end:%d %d\n", s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y);
     return 0;
 }
 
@@ -1953,7 +2033,10 @@ static int slice_decode_thread(AVCodecContext *c, void *arg)
 
         start_code = -1;
         buf        = avpriv_find_start_code(buf, s->gb.buffer_end, &start_code);
-        mb_y       = (start_code - SLICE_MIN_START_CODE) << field_pic;
+        mb_y       = start_code - SLICE_MIN_START_CODE;
+        if (s->codec_id != AV_CODEC_ID_MPEG1VIDEO && s->mb_height > 2800/16)
+            mb_y += (*buf&0xE0)<<2;
+        mb_y <<= field_pic;
         if (s->picture_structure == PICT_BOTTOM_FIELD)
             mb_y++;
         if (mb_y < 0 || mb_y >= s->end_mb_y)
@@ -1974,20 +2057,16 @@ static int slice_end(AVCodecContext *avctx, AVFrame *pict)
         return 0;
 
     if (s->avctx->hwaccel) {
-        if (s->avctx->hwaccel->end_frame(s->avctx) < 0)
+        int ret = s->avctx->hwaccel->end_frame(s->avctx);
+        if (ret < 0) {
             av_log(avctx, AV_LOG_ERROR,
                    "hardware accelerator failed to decode picture\n");
+            return ret;
+        }
     }
 
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
-    if (CONFIG_MPEG_XVMC_DECODER && s->avctx->xvmc_acceleration)
-        ff_xvmc_field_end(s);
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
-
     /* end of slice reached */
-    if (/* s->mb_y << field_pic == s->mb_height && */ !s->first_field) {
+    if (/* s->mb_y << field_pic == s->mb_height && */ !s->first_field && !s1->first_slice) {
         /* end of image */
 
         ff_er_frame_end(&s->er);
@@ -1998,7 +2077,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
             int ret = av_frame_ref(pict, s->current_picture_ptr->f);
             if (ret < 0)
                 return ret;
-            ff_print_debug_info(s, s->current_picture_ptr);
+            ff_print_debug_info(s, s->current_picture_ptr, pict);
+            ff_mpv_export_qp_table(s, pict, s->current_picture_ptr, FF_QSCALE_TYPE_MPEG2);
         } else {
             if (avctx->active_thread_type & FF_THREAD_FRAME)
                 s->picture_number++;
@@ -2008,7 +2088,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
                 int ret = av_frame_ref(pict, s->last_picture_ptr->f);
                 if (ret < 0)
                     return ret;
-                ff_print_debug_info(s, s->last_picture_ptr);
+                ff_print_debug_info(s, s->last_picture_ptr, pict);
+                ff_mpv_export_qp_table(s, pict, s->last_picture_ptr, FF_QSCALE_TYPE_MPEG2);
             }
         }
 
@@ -2033,28 +2114,25 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
     if (width == 0 || height == 0) {
         av_log(avctx, AV_LOG_WARNING,
                "Invalid horizontal or vertical size value.\n");
-        if (avctx->err_recognition & AV_EF_BITSTREAM)
+        if (avctx->err_recognition & (AV_EF_BITSTREAM | AV_EF_COMPLIANT))
             return AVERROR_INVALIDDATA;
     }
     s->aspect_ratio_info = get_bits(&s->gb, 4);
     if (s->aspect_ratio_info == 0) {
         av_log(avctx, AV_LOG_ERROR, "aspect ratio has forbidden 0 value\n");
-        if (avctx->err_recognition & AV_EF_BITSTREAM)
+        if (avctx->err_recognition & (AV_EF_BITSTREAM | AV_EF_COMPLIANT))
             return AVERROR_INVALIDDATA;
     }
     s->frame_rate_index = get_bits(&s->gb, 4);
     if (s->frame_rate_index == 0 || s->frame_rate_index > 13) {
         av_log(avctx, AV_LOG_WARNING,
                "frame_rate_index %d is invalid\n", s->frame_rate_index);
-        return AVERROR_INVALIDDATA;
+        s->frame_rate_index = 1;
     }
-    s->bit_rate = get_bits(&s->gb, 18) * 400;
-    if (get_bits1(&s->gb) == 0) { /* marker */
-        av_log(avctx, AV_LOG_ERROR, "Marker in sequence header missing\n");
+    s->bit_rate = get_bits(&s->gb, 18) * 400LL;
+    if (check_marker(&s->gb, "in sequence header") == 0) {
         return AVERROR_INVALIDDATA;
     }
-    s->width  = width;
-    s->height = height;
 
     s->avctx->rc_buffer_size = get_bits(&s->gb, 10) * 1024 * 16;
     skip_bits(&s->gb, 1);
@@ -2086,21 +2164,26 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
         return AVERROR_INVALIDDATA;
     }
 
+    s->width  = width;
+    s->height = height;
+
     /* We set MPEG-2 parameters so that it emulates MPEG-1. */
     s->progressive_sequence = 1;
     s->progressive_frame    = 1;
     s->picture_structure    = PICT_FRAME;
+    s->first_field          = 0;
     s->frame_pred_frame_dct = 1;
     s->chroma_format        = 1;
     s->codec_id             =
     s->avctx->codec_id      = AV_CODEC_ID_MPEG1VIDEO;
     s->out_format           = FMT_MPEG1;
+    s->swap_uv              = 0; // AFAIK VCR2 does not have SEQ_HEADER
     if (s->avctx->flags & AV_CODEC_FLAG_LOW_DELAY)
         s->low_delay = 1;
 
     if (s->avctx->debug & FF_DEBUG_PICT_INFO)
-        av_log(s->avctx, AV_LOG_DEBUG, "vbv buffer: %d, bitrate:%d\n",
-               s->avctx->rc_buffer_size, s->bit_rate);
+        av_log(s->avctx, AV_LOG_DEBUG, "vbv buffer: %d, bitrate:%"PRId64", aspect_ratio_info: %d \n",
+               s->avctx->rc_buffer_size, s->bit_rate, s->aspect_ratio_info);
 
     return 0;
 }
@@ -2115,6 +2198,7 @@ static int vcr2_init_sequence(AVCodecContext *avctx)
     s->out_format = FMT_MPEG1;
     if (s1->mpeg_enc_ctx_allocated) {
         ff_mpv_common_end(s);
+        s1->mpeg_enc_ctx_allocated = 0;
     }
     s->width            = avctx->coded_width;
     s->height           = avctx->coded_height;
@@ -2122,14 +2206,7 @@ static int vcr2_init_sequence(AVCodecContext *avctx)
     s->low_delay        = 1;
 
     avctx->pix_fmt = mpeg_get_pixelformat(avctx);
-
-#if FF_API_XVMC
-    if ((avctx->pix_fmt == AV_PIX_FMT_XVMC_MPEG2_IDCT || avctx->hwaccel) &&
-        avctx->idct_algo == FF_IDCT_AUTO)
-#else
-    if (avctx->hwaccel && avctx->idct_algo == FF_IDCT_AUTO)
-#endif /* FF_API_XVMC */
-        avctx->idct_algo = FF_IDCT_SIMPLE;
+    setup_hwaccel_for_pixfmt(avctx);
 
     ff_mpv_idct_init(s);
     if ((ret = ff_mpv_common_init(s)) < 0)
@@ -2150,9 +2227,15 @@ static int vcr2_init_sequence(AVCodecContext *avctx)
     s->progressive_sequence  = 1;
     s->progressive_frame     = 1;
     s->picture_structure     = PICT_FRAME;
+    s->first_field           = 0;
     s->frame_pred_frame_dct  = 1;
     s->chroma_format         = 1;
-    s->codec_id              = s->avctx->codec_id = AV_CODEC_ID_MPEG2VIDEO;
+    if (s->codec_tag == AV_RL32("BW10")) {
+        s->codec_id              = s->avctx->codec_id = AV_CODEC_ID_MPEG1VIDEO;
+    } else {
+        s->swap_uv = 1; // in case of xvmc we need to swap uv for each MB
+        s->codec_id              = s->avctx->codec_id = AV_CODEC_ID_MPEG2VIDEO;
+    }
     s1->save_width           = s->width;
     s1->save_height          = s->height;
     s1->save_progressive_seq = s->progressive_sequence;
@@ -2215,9 +2298,25 @@ static int mpeg_decode_a53_cc(AVCodecContext *avctx,
 static void mpeg_decode_user_data(AVCodecContext *avctx,
                                   const uint8_t *p, int buf_size)
 {
+    Mpeg1Context *s = avctx->priv_data;
     const uint8_t *buf_end = p + buf_size;
     Mpeg1Context *s1 = avctx->priv_data;
 
+#if 0
+    int i;
+    for(i=0; !(!p[i-2] && !p[i-1] && p[i]==1) && i<buf_size; i++){
+        av_log(avctx, AV_LOG_ERROR, "%c", p[i]);
+    }
+    av_log(avctx, AV_LOG_ERROR, "\n");
+#endif
+
+    if (buf_size > 29){
+        int i;
+        for(i=0; i<20; i++)
+            if (!memcmp(p+i, "\0TMPGEXS\0", 9)){
+                s->tmpgexs= 1;
+            }
+    }
     /* we parse the DTG active format information */
     if (buf_end - p >= 5 &&
         p[0] == 'D' && p[1] == 'T' && p[2] == 'G' && p[3] == '1') {
@@ -2276,32 +2375,32 @@ static void mpeg_decode_gop(AVCodecContext *avctx,
 {
     Mpeg1Context *s1  = avctx->priv_data;
     MpegEncContext *s = &s1->mpeg_enc_ctx;
-
-    int time_code_hours, time_code_minutes;
-    int time_code_seconds, time_code_pictures;
     int broken_link;
+    int64_t tc;
 
     init_get_bits(&s->gb, buf, buf_size * 8);
 
-    skip_bits1(&s->gb); /* drop_frame_flag */
+    tc = s-> timecode_frame_start = get_bits(&s->gb, 25);
 
-    time_code_hours   = get_bits(&s->gb, 5);
-    time_code_minutes = get_bits(&s->gb, 6);
-    skip_bits1(&s->gb); // marker bit
-    time_code_seconds  = get_bits(&s->gb, 6);
-    time_code_pictures = get_bits(&s->gb, 6);
+#if FF_API_PRIVATE_OPT
+FF_DISABLE_DEPRECATION_WARNINGS
+    avctx->timecode_frame_start = tc;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
 
-    s1->closed_gop = get_bits1(&s->gb);
+    s->closed_gop = get_bits1(&s->gb);
     /* broken_link indicate that after editing the
      * reference frames of the first B-Frames after GOP I-Frame
      * are missing (open gop) */
     broken_link = get_bits1(&s->gb);
 
-    if (s->avctx->debug & FF_DEBUG_PICT_INFO)
+    if (s->avctx->debug & FF_DEBUG_PICT_INFO) {
+        char tcbuf[AV_TIMECODE_STR_SIZE];
+        av_timecode_make_mpeg_tc_string(tcbuf, tc);
         av_log(s->avctx, AV_LOG_DEBUG,
-               "GOP (%2d:%02d:%02d.[%02d]) closed_gop=%d broken_link=%d\n",
-               time_code_hours, time_code_minutes, time_code_seconds,
-               time_code_pictures, s1->closed_gop, broken_link);
+               "GOP (%s) closed_gop=%d broken_link=%d\n",
+               tcbuf, s->closed_gop, broken_link);
+    }
 }
 
 static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
@@ -2313,6 +2412,7 @@ static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
     const uint8_t *buf_end = buf + buf_size;
     int ret, input_size;
     int last_code = 0, skip_frame = 0;
+    int picture_start_code_seen = 0;
 
     for (;;) {
         /* find next start code */
@@ -2324,6 +2424,7 @@ static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
                     (avctx->active_thread_type & FF_THREAD_SLICE) &&
                     !avctx->hwaccel) {
                     int i;
+                    av_assert0(avctx->thread_count > 1);
 
                     avctx->execute(avctx, slice_decode_thread,
                                    &s2->thread_context[0], NULL,
@@ -2332,6 +2433,12 @@ static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
                         s2->er.error_count += s2->thread_context[i]->er.error_count;
                 }
 
+#if FF_API_VDPAU
+                if ((CONFIG_MPEG_VDPAU_DECODER || CONFIG_MPEG1_VDPAU_DECODER)
+                    && uses_vdpau(avctx))
+                    ff_vdpau_mpeg_picture_complete(s2, buf, buf_size, s->slice_count);
+#endif
+
                 ret = slice_end(avctx, picture);
                 if (ret < 0)
                     return ret;
@@ -2342,13 +2449,17 @@ static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
                 }
             }
             s2->pict_type = 0;
+
+            if (avctx->err_recognition & AV_EF_EXPLODE && s2->er.error_count)
+                return AVERROR_INVALIDDATA;
+
             return FFMAX(0, buf_ptr - buf - s2->parse_context.last_index);
         }
 
         input_size = buf_end - buf_ptr;
 
         if (avctx->debug & FF_DEBUG_STARTCODE)
-            av_log(avctx, AV_LOG_DEBUG, "%3"PRIX32" at %td left %d\n",
+            av_log(avctx, AV_LOG_DEBUG, "%3"PRIX32" at %"PTRDIFF_SPECIFIER" left %d\n",
                    start_code, buf_ptr - buf, input_size);
 
         /* prepare data for next start code */
@@ -2356,7 +2467,8 @@ static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
         case SEQ_START_CODE:
             if (last_code == 0) {
                 mpeg1_decode_sequence(avctx, buf_ptr, input_size);
-                s->sync = 1;
+                if (buf != avctx->extradata)
+                    s->sync = 1;
             } else {
                 av_log(avctx, AV_LOG_ERROR,
                        "ignoring SEQ_START_CODE after %X\n", last_code);
@@ -2366,12 +2478,24 @@ static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
             break;
 
         case PICTURE_START_CODE:
+            if (picture_start_code_seen && s2->picture_structure == PICT_FRAME) {
+               /* If it's a frame picture, there can't be more than one picture header.
+                  Yet, it does happen and we need to handle it. */
+               av_log(avctx, AV_LOG_WARNING, "ignoring extra picture following a frame-picture\n");
+               break;
+            }
+            picture_start_code_seen = 1;
+
             if (s2->width <= 0 || s2->height <= 0) {
                 av_log(avctx, AV_LOG_ERROR, "Invalid frame dimensions %dx%d.\n",
                        s2->width, s2->height);
                 return AVERROR_INVALIDDATA;
             }
 
+            if (s->tmpgexs){
+                s2->intra_dc_precision= 3;
+                s2->intra_matrix[0]= 1;
+            }
             if (HAVE_THREADS && (avctx->active_thread_type & FF_THREAD_SLICE) &&
                 !avctx->hwaccel && s->slice_count) {
                 int i;
@@ -2455,14 +2579,50 @@ static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
             break;
         default:
             if (start_code >= SLICE_MIN_START_CODE &&
+                start_code <= SLICE_MAX_START_CODE && last_code == PICTURE_START_CODE) {
+                if (s2->progressive_sequence && !s2->progressive_frame) {
+                    s2->progressive_frame = 1;
+                    av_log(s2->avctx, AV_LOG_ERROR,
+                           "interlaced frame in progressive sequence, ignoring\n");
+                }
+
+                if (s2->picture_structure == 0 ||
+                    (s2->progressive_frame && s2->picture_structure != PICT_FRAME)) {
+                    av_log(s2->avctx, AV_LOG_ERROR,
+                           "picture_structure %d invalid, ignoring\n",
+                           s2->picture_structure);
+                    s2->picture_structure = PICT_FRAME;
+                }
+
+                if (s2->progressive_sequence && !s2->frame_pred_frame_dct)
+                    av_log(s2->avctx, AV_LOG_WARNING, "invalid frame_pred_frame_dct\n");
+
+                if (s2->picture_structure == PICT_FRAME) {
+                    s2->first_field = 0;
+                    s2->v_edge_pos  = 16 * s2->mb_height;
+                } else {
+                    s2->first_field ^= 1;
+                    s2->v_edge_pos   = 8 * s2->mb_height;
+                    memset(s2->mbskip_table, 0, s2->mb_stride * s2->mb_height);
+                }
+            }
+            if (start_code >= SLICE_MIN_START_CODE &&
                 start_code <= SLICE_MAX_START_CODE && last_code != 0) {
                 const int field_pic = s2->picture_structure != PICT_FRAME;
-                int mb_y = (start_code - SLICE_MIN_START_CODE) << field_pic;
+                int mb_y = start_code - SLICE_MIN_START_CODE;
                 last_code = SLICE_MIN_START_CODE;
+                if (s2->codec_id != AV_CODEC_ID_MPEG1VIDEO && s2->mb_height > 2800/16)
+                    mb_y += (*buf_ptr&0xE0)<<2;
 
+                mb_y <<= field_pic;
                 if (s2->picture_structure == PICT_BOTTOM_FIELD)
                     mb_y++;
 
+                if (buf_end - buf_ptr < 2) {
+                    av_log(s2->avctx, AV_LOG_ERROR, "slice too small\n");
+                    return AVERROR_INVALIDDATA;
+                }
+
                 if (mb_y >= s2->mb_height) {
                     av_log(s2->avctx, AV_LOG_ERROR,
                            "slice below image (%d >= %d)\n", mb_y, s2->mb_height);
@@ -2473,13 +2633,13 @@ static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
                     /* Skip B-frames if we do not have reference frames and
                      * GOP is not closed. */
                     if (s2->pict_type == AV_PICTURE_TYPE_B) {
-                        if (!s->closed_gop) {
+                        if (!s2->closed_gop) {
                             skip_frame = 1;
                             break;
                         }
                     }
                 }
-                if (s2->pict_type == AV_PICTURE_TYPE_I)
+                if (s2->pict_type == AV_PICTURE_TYPE_I || (s2->avctx->flags2 & AV_CODEC_FLAG2_SHOW_ALL))
                     s->sync = 1;
                 if (!s2->next_picture_ptr) {
                     /* Skip P-frames if we do not have a reference frame or
@@ -2526,12 +2686,20 @@ static int decode_chunks(AVCodecContext *avctx, AVFrame *picture,
                     return AVERROR_INVALIDDATA;
                 }
 
+#if FF_API_VDPAU
+                if (uses_vdpau(avctx)) {
+                    s->slice_count++;
+                    break;
+                }
+#endif
+
                 if (HAVE_THREADS &&
                     (avctx->active_thread_type & FF_THREAD_SLICE) &&
                     !avctx->hwaccel) {
                     int threshold = (s2->mb_height * s->slice_count +
                                      s2->slice_context_count / 2) /
                                     s2->slice_context_count;
+                    av_assert0(avctx->thread_count > 1);
                     if (threshold <= mb_y) {
                         MpegEncContext *thread_context = s2->thread_context[s->slice_count];
 
@@ -2574,11 +2742,11 @@ static int mpeg_decode_frame(AVCodecContext *avctx, void *data,
                              int *got_output, AVPacket *avpkt)
 {
     const uint8_t *buf = avpkt->data;
+    int ret;
     int buf_size = avpkt->size;
     Mpeg1Context *s = avctx->priv_data;
     AVFrame *picture = data;
     MpegEncContext *s2 = &s->mpeg_enc_ctx;
-    ff_dlog(avctx, "fill_buffer\n");
 
     if (buf_size == 0 || (buf_size == 4 && AV_RB32(buf) == SEQ_END_CODE)) {
         /* special case for last picture */
@@ -2603,20 +2771,45 @@ static int mpeg_decode_frame(AVCodecContext *avctx, void *data,
             return buf_size;
     }
 
-    if (s->mpeg_enc_ctx_allocated == 0 && avctx->codec_tag == AV_RL32("VCR2"))
+    s2->codec_tag = avpriv_toupper4(avctx->codec_tag);
+    if (s->mpeg_enc_ctx_allocated == 0 && (   s2->codec_tag == AV_RL32("VCR2")
+                                           || s2->codec_tag == AV_RL32("BW10")
+                                          ))
         vcr2_init_sequence(avctx);
 
     s->slice_count = 0;
 
     if (avctx->extradata && !s->extradata_decoded) {
-        int ret = decode_chunks(avctx, picture, got_output,
-                                avctx->extradata, avctx->extradata_size);
+        ret = decode_chunks(avctx, picture, got_output,
+                            avctx->extradata, avctx->extradata_size);
+        if (*got_output) {
+            av_log(avctx, AV_LOG_ERROR, "picture in extradata\n");
+            *got_output = 0;
+        }
         s->extradata_decoded = 1;
-        if (ret < 0 && (avctx->err_recognition & AV_EF_EXPLODE))
+        if (ret < 0 && (avctx->err_recognition & AV_EF_EXPLODE)) {
+            s2->current_picture_ptr = NULL;
             return ret;
+        }
+    }
+
+    ret = decode_chunks(avctx, picture, got_output, buf, buf_size);
+    if (ret<0 || *got_output) {
+        s2->current_picture_ptr = NULL;
+
+        if (s2->timecode_frame_start != -1 && *got_output) {
+            AVFrameSideData *tcside = av_frame_new_side_data(picture,
+                                                             AV_FRAME_DATA_GOP_TIMECODE,
+                                                             sizeof(int64_t));
+            if (!tcside)
+                return AVERROR(ENOMEM);
+            memcpy(tcside->data, &s2->timecode_frame_start, sizeof(int64_t));
+
+            s2->timecode_frame_start = -1;
+        }
     }
 
-    return decode_chunks(avctx, picture, got_output, buf, buf_size);
+    return ret;
 }
 
 static void flush(AVCodecContext *avctx)
@@ -2624,7 +2817,6 @@ static void flush(AVCodecContext *avctx)
     Mpeg1Context *s = avctx->priv_data;
 
     s->sync       = 0;
-    s->closed_gop = 0;
 
     ff_mpeg_flush(avctx);
 }
@@ -2652,6 +2844,7 @@ AVCodec ff_mpeg1video_decoder = {
                              AV_CODEC_CAP_TRUNCATED | AV_CODEC_CAP_DELAY |
                              AV_CODEC_CAP_SLICE_THREADS,
     .flush                 = flush,
+    .max_lowres            = 3,
     .update_thread_context = ONLY_IF_THREADS_ENABLED(mpeg_decode_update_thread_context)
 };
 
@@ -2668,11 +2861,28 @@ AVCodec ff_mpeg2video_decoder = {
                       AV_CODEC_CAP_TRUNCATED | AV_CODEC_CAP_DELAY |
                       AV_CODEC_CAP_SLICE_THREADS,
     .flush          = flush,
+    .max_lowres     = 3,
     .profiles       = NULL_IF_CONFIG_SMALL(ff_mpeg2_video_profiles),
 };
 
+//legacy decoder
+AVCodec ff_mpegvideo_decoder = {
+    .name           = "mpegvideo",
+    .long_name      = NULL_IF_CONFIG_SMALL("MPEG-1 video"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MPEG2VIDEO,
+    .priv_data_size = sizeof(Mpeg1Context),
+    .init           = mpeg_decode_init,
+    .close          = mpeg_decode_end,
+    .decode         = mpeg_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1 | AV_CODEC_CAP_TRUNCATED | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_SLICE_THREADS,
+    .flush          = flush,
+    .max_lowres     = 3,
+};
+
 #if FF_API_XVMC
 #if CONFIG_MPEG_XVMC_DECODER
+FF_DISABLE_DEPRECATION_WARNINGS
 static av_cold int mpeg_mc_decode_init(AVCodecContext *avctx)
 {
     if (avctx->active_thread_type & FF_THREAD_SLICE)
@@ -2704,6 +2914,38 @@ AVCodec ff_mpeg_xvmc_decoder = {
                       AV_CODEC_CAP_DELAY,
     .flush          = flush,
 };
-
+FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 #endif /* FF_API_XVMC */
+
+#if CONFIG_MPEG_VDPAU_DECODER && FF_API_VDPAU
+AVCodec ff_mpeg_vdpau_decoder = {
+    .name           = "mpegvideo_vdpau",
+    .long_name      = NULL_IF_CONFIG_SMALL("MPEG-1/2 video (VDPAU acceleration)"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MPEG2VIDEO,
+    .priv_data_size = sizeof(Mpeg1Context),
+    .init           = mpeg_decode_init,
+    .close          = mpeg_decode_end,
+    .decode         = mpeg_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_TRUNCATED |
+                      AV_CODEC_CAP_HWACCEL_VDPAU | AV_CODEC_CAP_DELAY,
+    .flush          = flush,
+};
+#endif
+
+#if CONFIG_MPEG1_VDPAU_DECODER && FF_API_VDPAU
+AVCodec ff_mpeg1_vdpau_decoder = {
+    .name           = "mpeg1video_vdpau",
+    .long_name      = NULL_IF_CONFIG_SMALL("MPEG-1 video (VDPAU acceleration)"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MPEG1VIDEO,
+    .priv_data_size = sizeof(Mpeg1Context),
+    .init           = mpeg_decode_init,
+    .close          = mpeg_decode_end,
+    .decode         = mpeg_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_TRUNCATED |
+                      AV_CODEC_CAP_HWACCEL_VDPAU | AV_CODEC_CAP_DELAY,
+    .flush          = flush,
+};
+#endif
diff --git a/libavcodec/mpeg12enc.c b/libavcodec/mpeg12enc.c
index 103f3aa..f45598a 100644
--- a/libavcodec/mpeg12enc.c
+++ b/libavcodec/mpeg12enc.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000,2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,8 +28,10 @@
 #include <stdint.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/log.h"
 #include "libavutil/opt.h"
+#include "libavutil/timecode.h"
 #include "libavutil/stereo3d.h"
 
 #include "avcodec.h"
@@ -40,17 +42,12 @@
 #include "mpegutils.h"
 #include "mpegvideo.h"
 
-
-static const uint8_t inv_non_linear_qscale[] = {
-    0, 2, 4, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-};
-
 static const uint8_t svcd_scan_offset_placeholder[] = {
     0x10, 0x0E, 0x00, 0x80, 0x81, 0x00, 0x80,
     0x81, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 };
 
-static uint8_t mv_penalty[MAX_FCODE + 1][MAX_MV * 2 + 1];
+static uint8_t mv_penalty[MAX_FCODE + 1][MAX_DMV * 2 + 1];
 static uint8_t fcode_tab[MAX_MV * 2 + 1];
 
 static uint8_t uni_mpeg1_ac_vlc_len[64 * 64 * 2];
@@ -86,7 +83,7 @@ static av_cold void init_uni_ac_vlc(RLTable *rl, uint8_t *uni_ac_vlc_len)
                 /* length of VLC and sign */
                 len = rl->table_vlc[code][1] + 1;
             } else {
-                len = rl->table_vlc[111][1] + 6;    /* rl->n */
+                len = rl->table_vlc[111 /* rl->n */][1] + 6;
 
                 if (alevel < 128)
                     len += 8;
@@ -102,26 +99,37 @@ static av_cold void init_uni_ac_vlc(RLTable *rl, uint8_t *uni_ac_vlc_len)
 static int find_frame_rate_index(MpegEncContext *s)
 {
     int i;
-    int64_t dmin = INT64_MAX;
-    int64_t d;
+    AVRational bestq = (AVRational) {0, 0};
+    AVRational ext;
+    AVRational target = av_inv_q(s->avctx->time_base);
 
     for (i = 1; i < 14; i++) {
-        int64_t n0 = 1001LL / ff_mpeg12_frame_rate_tab[i].den *
-                     ff_mpeg12_frame_rate_tab[i].num * s->avctx->time_base.num;
-        int64_t n1 = 1001LL * s->avctx->time_base.den;
-
         if (s->avctx->strict_std_compliance > FF_COMPLIANCE_UNOFFICIAL &&
             i >= 9)
             break;
 
-        d = FFABS(n0 - n1);
-        if (d < dmin) {
-            dmin                = d;
-            s->frame_rate_index = i;
+        for (ext.num=1; ext.num <= 4; ext.num++) {
+            for (ext.den=1; ext.den <= 32; ext.den++) {
+                AVRational q = av_mul_q(ext, ff_mpeg12_frame_rate_tab[i]);
+
+                if (s->codec_id != AV_CODEC_ID_MPEG2VIDEO && (ext.den!=1 || ext.num!=1))
+                    continue;
+                if (av_gcd(ext.den, ext.num) != 1)
+                    continue;
+
+                if (    bestq.num==0
+                    || av_nearer_q(target, bestq, q) < 0
+                    || ext.num==1 && ext.den==1 && av_nearer_q(target, bestq, q) == 0) {
+                    bestq               = q;
+                    s->frame_rate_index = i;
+                    s->mpeg2_frame_rate_ext.num = ext.num;
+                    s->mpeg2_frame_rate_ext.den = ext.den;
+                }
+            }
         }
     }
 
-    if (dmin)
+    if (av_cmp_q(target, bestq))
         return -1;
     else
         return 0;
@@ -176,6 +184,22 @@ static av_cold int encode_init(AVCodecContext *avctx)
         }
     }
 
+    if ((avctx->width & 0xFFF) == 0 && (avctx->height & 0xFFF) == 1) {
+        av_log(avctx, AV_LOG_ERROR, "Width / Height is invalid for MPEG2\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (s->strict_std_compliance > FF_COMPLIANCE_UNOFFICIAL) {
+        if ((avctx->width & 0xFFF) == 0 || (avctx->height & 0xFFF) == 0) {
+            av_log(avctx, AV_LOG_ERROR, "Width or Height are not allowed to be multiples of 4096\n"
+                                        "add '-strict %d' if you want to use them anyway.\n", FF_COMPLIANCE_UNOFFICIAL);
+            return AVERROR(EINVAL);
+        }
+    }
+
+    s->drop_frame_timecode = s->drop_frame_timecode || !!(avctx->flags2 & AV_CODEC_FLAG2_DROP_FRAME_TIMECODE);
+    if (s->drop_frame_timecode)
+        s->tc.flags |= AV_TIMECODE_FLAG_DROPFRAME;
     if (s->drop_frame_timecode && s->frame_rate_index != 4) {
         av_log(avctx, AV_LOG_ERROR,
                "Drop frame time code only allowed with 1001/30000 fps\n");
@@ -189,6 +213,17 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
+    if (s->tc_opt_str) {
+        AVRational rate = ff_mpeg12_frame_rate_tab[s->frame_rate_index];
+        int ret = av_timecode_init_from_string(&s->tc, rate, s->tc_opt_str, s);
+        if (ret < 0)
+            return ret;
+        s->drop_frame_timecode = !!(s->tc.flags & AV_TIMECODE_FLAG_DROPFRAME);
+        s->timecode_frame_start = s->tc.start;
+    } else {
+        s->timecode_frame_start = 0; // default is -1
+    }
+
     return 0;
 }
 
@@ -205,11 +240,11 @@ static void mpeg1_encode_sequence_header(MpegEncContext *s)
     unsigned int vbv_buffer_size, fps, v;
     int i, constraint_parameter_flag;
     uint64_t time_code;
-    float best_aspect_error = 1E10;
-    float aspect_ratio      = av_q2d(s->avctx->sample_aspect_ratio);
+    int64_t best_aspect_error = INT64_MAX;
+    AVRational aspect_ratio = s->avctx->sample_aspect_ratio;
 
-    if (aspect_ratio == 0.0)
-        aspect_ratio = 1.0;             // pixel aspect 1.1 (VGA)
+    if (aspect_ratio.num == 0 || aspect_ratio.den == 0)
+        aspect_ratio = (AVRational){1,1};             // pixel aspect 1.1 (VGA)
 
     if (s->current_picture.f->key_frame) {
         AVRational framerate = ff_mpeg12_frame_rate_tab[s->frame_rate_index];
@@ -217,19 +252,19 @@ static void mpeg1_encode_sequence_header(MpegEncContext *s)
         /* MPEG-1 header repeated every GOP */
         put_header(s, SEQ_START_CODE);
 
-        put_sbits(&s->pb, 12, s->width);
-        put_sbits(&s->pb, 12, s->height);
+        put_sbits(&s->pb, 12, s->width  & 0xFFF);
+        put_sbits(&s->pb, 12, s->height & 0xFFF);
 
         for (i = 1; i < 15; i++) {
-            float error = aspect_ratio;
+            int64_t error = aspect_ratio.num * (1LL<<32) / aspect_ratio.den;
             if (s->codec_id == AV_CODEC_ID_MPEG1VIDEO || i <= 1)
-                error -= 1.0 / ff_mpeg1_aspect[i];
+                error -= (1LL<<32) / ff_mpeg1_aspect[i];
             else
-                error -= av_q2d(ff_mpeg2_aspect[i]) * s->height / s->width;
+                error -= (1LL<<32)*ff_mpeg2_aspect[i].num * s->height / s->width / ff_mpeg2_aspect[i].den;
 
             error = FFABS(error);
 
-            if (error < best_aspect_error) {
+            if (error - 2 <= best_aspect_error) {
                 best_aspect_error    = error;
                 s->aspect_ratio_info = i;
             }
@@ -276,6 +311,11 @@ static void mpeg1_encode_sequence_header(MpegEncContext *s)
         ff_write_quant_matrix(&s->pb, s->avctx->inter_matrix);
 
         if (s->codec_id == AV_CODEC_ID_MPEG2VIDEO) {
+            AVFrameSideData *side_data;
+            int width = s->width;
+            int height = s->height;
+            int use_seq_disp_ext;
+
             put_header(s, EXT_START_CODE);
             put_bits(&s->pb, 4, 1);                 // seq ext
 
@@ -292,20 +332,37 @@ static void mpeg1_encode_sequence_header(MpegEncContext *s)
             put_bits(&s->pb, 1, 1);                 // marker
             put_bits(&s->pb, 8, vbv_buffer_size >> 10); // vbv buffer ext
             put_bits(&s->pb, 1, s->low_delay);
-            put_bits(&s->pb, 2, 0);                 // frame_rate_ext_n
-            put_bits(&s->pb, 5, 0);                 // frame_rate_ext_d
+            put_bits(&s->pb, 2, s->mpeg2_frame_rate_ext.num-1); // frame_rate_ext_n
+            put_bits(&s->pb, 5, s->mpeg2_frame_rate_ext.den-1); // frame_rate_ext_d
+
+            side_data = av_frame_get_side_data(s->current_picture_ptr->f, AV_FRAME_DATA_PANSCAN);
+            if (side_data) {
+                AVPanScan *pan_scan = (AVPanScan *)side_data->data;
+                if (pan_scan->width && pan_scan->height) {
+                    width = pan_scan->width >> 4;
+                    height = pan_scan->height >> 4;
+                }
+            }
 
-            put_header(s, EXT_START_CODE);
-            put_bits(&s->pb, 4, 2);                         // sequence display extension
-            put_bits(&s->pb, 3, 0);                         // video_format: 0 is components
-            put_bits(&s->pb, 1, 1);                         // colour_description
-            put_bits(&s->pb, 8, s->avctx->color_primaries); // colour_primaries
-            put_bits(&s->pb, 8, s->avctx->color_trc);       // transfer_characteristics
-            put_bits(&s->pb, 8, s->avctx->colorspace);      // matrix_coefficients
-            put_bits(&s->pb, 14, s->width);                 // display_horizontal_size
-            put_bits(&s->pb, 1, 1);                         // marker_bit
-            put_bits(&s->pb, 14, s->height);                // display_vertical_size
-            put_bits(&s->pb, 3, 0);                         // remaining 3 bits are zero padding
+            use_seq_disp_ext = (width != s->width ||
+                                height != s->height ||
+                                s->avctx->color_primaries != AVCOL_PRI_UNSPECIFIED ||
+                                s->avctx->color_trc != AVCOL_TRC_UNSPECIFIED ||
+                                s->avctx->colorspace != AVCOL_SPC_UNSPECIFIED);
+
+            if (s->seq_disp_ext == 1 || (s->seq_disp_ext == -1 && use_seq_disp_ext)) {
+                put_header(s, EXT_START_CODE);
+                put_bits(&s->pb, 4, 2);                         // sequence display extension
+                put_bits(&s->pb, 3, 0);                         // video_format: 0 is components
+                put_bits(&s->pb, 1, 1);                         // colour_description
+                put_bits(&s->pb, 8, s->avctx->color_primaries); // colour_primaries
+                put_bits(&s->pb, 8, s->avctx->color_trc);       // transfer_characteristics
+                put_bits(&s->pb, 8, s->avctx->colorspace);      // matrix_coefficients
+                put_bits(&s->pb, 14, width);                    // display_horizontal_size
+                put_bits(&s->pb, 1, 1);                         // marker_bit
+                put_bits(&s->pb, 14, height);                   // display_vertical_size
+                put_bits(&s->pb, 3, 0);                         // remaining 3 bits are zero padding
+            }
         }
 
         put_header(s, GOP_START_CODE);
@@ -317,21 +374,17 @@ static void mpeg1_encode_sequence_header(MpegEncContext *s)
                     s->timecode_frame_start;
 
         s->gop_picture_number = s->current_picture_ptr->f->coded_picture_number;
-        if (s->drop_frame_timecode) {
-            /* only works for NTSC 29.97 */
-            int d = time_code / 17982;
-            int m = time_code % 17982;
-            /* not needed since -2,-1 / 1798 in C returns 0 */
-            // if (m < 2)
-            //     m += 2;
-            time_code += 18 * d + 2 * ((m - 2) / 1798);
-        }
+
+        av_assert0(s->drop_frame_timecode == !!(s->tc.flags & AV_TIMECODE_FLAG_DROPFRAME));
+        if (s->drop_frame_timecode)
+            time_code = av_timecode_adjust_ntsc_framenum2(time_code, fps);
+
         put_bits(&s->pb, 5, (uint32_t)((time_code / (fps * 3600)) % 24));
         put_bits(&s->pb, 6, (uint32_t)((time_code / (fps *   60)) % 60));
         put_bits(&s->pb, 1, 1);
         put_bits(&s->pb, 6, (uint32_t)((time_code / fps) % 60));
         put_bits(&s->pb, 6, (uint32_t)((time_code % fps)));
-        put_bits(&s->pb, 1, !!(s->avctx->flags & AV_CODEC_FLAG_CLOSED_GOP) || s->intra_only);
+        put_bits(&s->pb, 1, !!(s->avctx->flags & AV_CODEC_FLAG_CLOSED_GOP) || s->intra_only || !s->gop_picture_number);
         put_bits(&s->pb, 1, 0);                     // broken link
     }
 }
@@ -348,17 +401,12 @@ static inline void encode_mb_skip_run(MpegEncContext *s, int run)
 
 static av_always_inline void put_qscale(MpegEncContext *s)
 {
-    if (s->q_scale_type) {
-        assert(s->qscale >= 1 && s->qscale <= 12);
-        put_bits(&s->pb, 5, inv_non_linear_qscale[s->qscale]);
-    } else {
-        put_bits(&s->pb, 5, s->qscale);
-    }
+    put_bits(&s->pb, 5, s->qscale);
 }
 
 void ff_mpeg1_encode_slice_header(MpegEncContext *s)
 {
-    if (s->height > 2800) {
+    if (s->codec_id == AV_CODEC_ID_MPEG2VIDEO && s->height > 2800) {
         put_header(s, SLICE_MIN_START_CODE + (s->mb_y & 127));
         /* slice_vertical_position_extension */
         put_bits(&s->pb, 3, s->mb_y >> 7);
@@ -427,7 +475,7 @@ void ff_mpeg1_encode_picture_header(MpegEncContext *s, int picture_number)
         }
         put_bits(&s->pb, 2, s->intra_dc_precision);
 
-        assert(s->picture_structure == PICT_FRAME);
+        av_assert0(s->picture_structure == PICT_FRAME);
         put_bits(&s->pb, 2, s->picture_structure);
         if (s->progressive_sequence)
             put_bits(&s->pb, 1, 0);             /* no repeat */
@@ -539,7 +587,7 @@ static void mpeg1_encode_motion(MpegEncContext *s, int val, int f_or_b_code)
             sign = 1;
         }
 
-        assert(code > 0 && code <= 16);
+        av_assert2(code > 0 && code <= 16);
 
         put_bits(&s->pb,
                  ff_mpeg12_mbMotionVectorTable[code][1],
@@ -567,12 +615,12 @@ static inline void encode_dc(MpegEncContext *s, int diff, int component)
             put_bits(&s->pb,
                      ff_mpeg12_vlc_dc_lum_bits[index] + index,
                      (ff_mpeg12_vlc_dc_lum_code[index] << index) +
-                     (diff & ((1 << index) - 1)));
+                     av_mod_uintp2(diff, index));
         else
             put_bits(&s->pb,
                      ff_mpeg12_vlc_dc_chroma_bits[index] + index,
                      (ff_mpeg12_vlc_dc_chroma_code[index] << index) +
-                     (diff & ((1 << index) - 1)));
+                     av_mod_uintp2(diff, index));
     } else {
         if (component == 0)
             put_bits(&s->pb,
@@ -682,7 +730,7 @@ static av_always_inline void mpeg1_encode_mb_internal(MpegEncContext *s,
 
     if (cbp == 0 && !first_mb && s->mv_type == MV_TYPE_16X16 &&
         (mb_x != s->mb_width - 1 ||
-         (mb_y != s->mb_height - 1 && s->codec_id == AV_CODEC_ID_MPEG1VIDEO)) &&
+         (mb_y != s->end_mb_y - 1 && s->codec_id == AV_CODEC_ID_MPEG1VIDEO)) &&
         ((s->pict_type == AV_PICTURE_TYPE_P && (motion_x | motion_y) == 0) ||
          (s->pict_type == AV_PICTURE_TYPE_B && s->mv_dir == s->last_mv_dir &&
           (((s->mv_dir & MV_DIR_FORWARD)
@@ -704,7 +752,7 @@ static av_always_inline void mpeg1_encode_mb_internal(MpegEncContext *s,
         }
     } else {
         if (first_mb) {
-            assert(s->mb_skip_run == 0);
+            av_assert0(s->mb_skip_run == 0);
             encode_mb_skip_run(s, s->mb_x);
         } else {
             encode_mb_skip_run(s, s->mb_skip_run);
@@ -783,7 +831,7 @@ static av_always_inline void mpeg1_encode_mb_internal(MpegEncContext *s,
                 s->last_mv[0][1][0] = s->last_mv[0][0][0] = motion_x;
                 s->last_mv[0][1][1] = s->last_mv[0][0][1] = motion_y;
             } else {
-                assert(!s->frame_pred_frame_dct && s->mv_type == MV_TYPE_FIELD);
+                av_assert2(!s->frame_pred_frame_dct && s->mv_type == MV_TYPE_FIELD);
 
                 if (cbp) {
                     if (s->dquant) {
@@ -870,8 +918,8 @@ static av_always_inline void mpeg1_encode_mb_internal(MpegEncContext *s,
                     s->b_count++;
                 }
             } else {
-                assert(s->mv_type == MV_TYPE_FIELD);
-                assert(!s->frame_pred_frame_dct);
+                av_assert2(s->mv_type == MV_TYPE_FIELD);
+                av_assert2(!s->frame_pred_frame_dct);
                 if (cbp) {                      // With coded bloc pattern
                     if (s->dquant) {
                         if (s->mv_dir == MV_DIR_FORWARD)
@@ -988,17 +1036,17 @@ av_cold void ff_mpeg1_encode_init(MpegEncContext *s)
 
             bits = ff_mpeg12_vlc_dc_lum_bits[index] + index;
             code = (ff_mpeg12_vlc_dc_lum_code[index] << index) +
-                   (diff & ((1 << index) - 1));
+                    av_mod_uintp2(diff, index);
             mpeg1_lum_dc_uni[i + 255] = bits + (code << 8);
 
             bits = ff_mpeg12_vlc_dc_chroma_bits[index] + index;
             code = (ff_mpeg12_vlc_dc_chroma_code[index] << index) +
-                   (diff & ((1 << index) - 1));
+                    av_mod_uintp2(diff, index);
             mpeg1_chr_dc_uni[i + 255] = bits + (code << 8);
         }
 
         for (f_code = 1; f_code <= MAX_FCODE; f_code++)
-            for (mv = -MAX_MV; mv <= MAX_MV; mv++) {
+            for (mv = -MAX_DMV; mv <= MAX_DMV; mv++) {
                 int len;
 
                 if (mv == 0) {
@@ -1021,7 +1069,7 @@ av_cold void ff_mpeg1_encode_init(MpegEncContext *s)
                               2 + bit_size;
                 }
 
-                mv_penalty[f_code][mv + MAX_MV] = len;
+                mv_penalty[f_code][mv + MAX_DMV] = len;
             }
 
 
@@ -1052,14 +1100,16 @@ av_cold void ff_mpeg1_encode_init(MpegEncContext *s)
 #define OFFSET(x) offsetof(MpegEncContext, x)
 #define VE AV_OPT_FLAG_ENCODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM
 #define COMMON_OPTS                                                           \
+    { "gop_timecode",        "MPEG GOP Timecode in hh:mm:ss[:;.]ff format. Overrides timecode_frame_start.",   \
+      OFFSET(tc_opt_str), AV_OPT_TYPE_STRING, {.str=NULL}, CHAR_MIN, CHAR_MAX, VE },\
     { "intra_vlc",           "Use MPEG-2 intra VLC table.",                   \
-      OFFSET(intra_vlc_format),    AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE }, \
+      OFFSET(intra_vlc_format),    AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE }, \
     { "drop_frame_timecode", "Timecode is in drop frame format.",             \
-      OFFSET(drop_frame_timecode), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE }, \
+      OFFSET(drop_frame_timecode), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE }, \
     { "scan_offset",         "Reserve space for SVCD scan offset user data.", \
-      OFFSET(scan_offset),         AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE }, \
+      OFFSET(scan_offset),         AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE }, \
     { "timecode_frame_start", "GOP timecode frame start number, in non-drop-frame format", \
-      OFFSET(timecode_frame_start), AV_OPT_TYPE_INT64, {.i64 = 0 }, 0, INT64_MAX, VE}, \
+      OFFSET(timecode_frame_start), AV_OPT_TYPE_INT64, {.i64 = -1 }, -1, INT64_MAX, VE}, \
 
 static const AVOption mpeg1_options[] = {
     COMMON_OPTS
@@ -1069,8 +1119,12 @@ static const AVOption mpeg1_options[] = {
 
 static const AVOption mpeg2_options[] = {
     COMMON_OPTS
-    { "non_linear_quant", "Use nonlinear quantizer.",    OFFSET(q_scale_type),   AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
-    { "alternate_scan",   "Enable alternate scantable.", OFFSET(alternate_scan), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
+    { "non_linear_quant", "Use nonlinear quantizer.",    OFFSET(q_scale_type),   AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
+    { "alternate_scan",   "Enable alternate scantable.", OFFSET(alternate_scan), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
+    { "seq_disp_ext",     "Write sequence_display_extension blocks.", OFFSET(seq_disp_ext), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 1, VE, "seq_disp_ext" },
+    {     "auto",   NULL, 0, AV_OPT_TYPE_CONST,  {.i64 = -1},  0, 0, VE, "seq_disp_ext" },
+    {     "never",  NULL, 0, AV_OPT_TYPE_CONST,  {.i64 = 0 },  0, 0, VE, "seq_disp_ext" },
+    {     "always", NULL, 0, AV_OPT_TYPE_CONST,  {.i64 = 1 },  0, 0, VE, "seq_disp_ext" },
     FF_MPV_COMMON_OPTS
     { NULL },
 };
@@ -1111,7 +1165,7 @@ AVCodec ff_mpeg2video_encoder = {
     .init                 = encode_init,
     .encode2              = ff_mpv_encode_picture,
     .close                = ff_mpv_encode_end,
-    .supported_framerates = ff_mpeg12_frame_rate_tab + 1,
+    .supported_framerates = ff_mpeg2_frame_rate_tab,
     .pix_fmts             = (const enum AVPixelFormat[]) { AV_PIX_FMT_YUV420P,
                                                            AV_PIX_FMT_YUV422P,
                                                            AV_PIX_FMT_NONE },
diff --git a/libavcodec/mpeg12vlc.h b/libavcodec/mpeg12vlc.h
index bb854b0..ca06e56 100644
--- a/libavcodec/mpeg12vlc.h
+++ b/libavcodec/mpeg12vlc.h
@@ -3,20 +3,20 @@
  * copyright (c) 2000,2001 Fabrice Bellard
  * copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpeg4_unpack_bframes_bsf.c b/libavcodec/mpeg4_unpack_bframes_bsf.c
new file mode 100644
index 0000000..0615621
--- /dev/null
+++ b/libavcodec/mpeg4_unpack_bframes_bsf.c
@@ -0,0 +1,250 @@
+/*
+ * Bitstream filter for unpacking DivX-style packed B-frames in MPEG-4 (divx_packed)
+ * Copyright (c) 2015 Andreas Cadhalpun <Andreas.Cadhalpun@googlemail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "bsf.h"
+#include "mpeg4video.h"
+
+/* State kept between calls: a frame waiting to be output. */
+typedef struct UnpackBFramesBSFContext {
+    uint8_t *b_frame_buf;      /* pending frame data, NULL when none is stored */
+    int      b_frame_buf_size; /* size of b_frame_buf in bytes, without padding */
+} UnpackBFramesBSFContext;
+
+/**
+ * Search for the next start code (bytes 00 00 01 xx), beginning at *pos.
+ *
+ * @param buf      buffer to scan
+ * @param buf_size number of bytes in buf
+ * @param pos      in: offset to start at; out: offset just past the last byte
+ *                 consumed, i.e. just past the start code when one is found
+ * @return the 32-bit start code, or 0 if the end of the buffer was reached
+ */
+static unsigned int find_startcode(const uint8_t *buf, int buf_size, int *pos)
+{
+    unsigned int startcode = 0xFF;
+
+    for (; *pos < buf_size;) {
+        startcode = ((startcode << 8) | buf[*pos]) & 0xFFFFFFFF;
+        *pos +=1;
+        if ((startcode & 0xFFFFFF00) != 0x100)
+            continue;  /* no startcode */
+        return startcode;
+    }
+
+    return 0;
+}
+
+/**
+ * Scan a buffer for the DivX "packed" marker and for VOP start codes.
+ *
+ * pos_p and nb_vop may be NULL to skip that information; pos_vop2 may be NULL
+ * as well and is only consulted when nb_vop is given. The pointed-to values
+ * are only written when the corresponding item is found, so callers have to
+ * initialize them.
+ *
+ * @param buf      buffer to scan
+ * @param buf_size number of bytes in buf
+ * @param pos_p    set to the offset of a 'p' directly followed by a NUL byte
+ *                 within the first 255 bytes after a user data start code
+ * @param nb_vop   incremented once per VOP start code
+ * @param pos_vop2 set to the offset of the second VOP start code
+ */
+static void scan_buffer(const uint8_t *buf, int buf_size,
+                        int *pos_p, int *nb_vop, int *pos_vop2) {
+    unsigned int startcode;
+    int pos, i;
+
+    for (pos = 0; pos < buf_size;) {
+        startcode = find_startcode(buf, buf_size, &pos);
+
+        if (startcode == USER_DATA_STARTCODE && pos_p) {
+            /* check if the (DivX) userdata string ends with 'p' (packed) */
+            for (i = 0; i < 255 && pos + i + 1 < buf_size; i++) {
+                if (buf[pos + i] == 'p' && buf[pos + i + 1] == '\0') {
+                    *pos_p = pos + i;
+                    break;
+                }
+            }
+        } else if (startcode == VOP_STARTCODE && nb_vop) {
+            *nb_vop += 1;
+            if (*nb_vop == 2 && pos_vop2) {
+                *pos_vop2 = pos - 4; /* subtract 4 bytes startcode */
+            }
+        }
+    }
+}
+
+/**
+ * Allocate a new buffer and copy size bytes from src into it.
+ *
+ * AV_INPUT_BUFFER_PADDING_SIZE zeroed bytes are appended after the copy.
+ *
+ * @return the new buffer, or NULL if the allocation failed
+ */
+static uint8_t *create_new_buffer(const uint8_t *src, int size) {
+    uint8_t *dst = av_malloc(size + AV_INPUT_BUFFER_PADDING_SIZE);
+
+    if (dst) {
+        memcpy(dst, src, size);
+        memset(dst + size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
+    }
+
+    return dst;
+}
+
+/**
+ * Filter one packet, splitting off a packed B-frame into a packet of its own.
+ *
+ * A packet with two or more VOPs is cut before its second VOP and the rest is
+ * stored in the context. The stored frame is output when the next single-VOP
+ * packet arrives; that packet is dropped if it is an N-VOP and stored in turn
+ * otherwise.
+ *
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+static int mpeg4_unpack_bframes_filter(AVBSFContext *ctx, AVPacket *out)
+{
+    UnpackBFramesBSFContext *s = ctx->priv_data;
+    int pos_p = -1, nb_vop = 0, pos_vop2 = -1, ret = 0;
+    AVPacket *in;
+
+    ret = ff_bsf_get_packet(ctx, &in);
+    if (ret < 0)
+        return ret;
+
+    scan_buffer(in->data, in->size, &pos_p, &nb_vop, &pos_vop2);
+    av_log(ctx, AV_LOG_DEBUG, "Found %d VOP startcode(s) in this packet.\n", nb_vop);
+
+    if (pos_vop2 >= 0) {
+        if (s->b_frame_buf) {
+            av_log(ctx, AV_LOG_WARNING,
+                   "Missing one N-VOP packet, discarding one B-frame.\n");
+            av_freep(&s->b_frame_buf);
+            s->b_frame_buf_size = 0;
+        }
+        /* store the packed B-frame in the BSFContext */
+        s->b_frame_buf_size = in->size - pos_vop2;
+        s->b_frame_buf      = create_new_buffer(in->data + pos_vop2, s->b_frame_buf_size);
+        if (!s->b_frame_buf) {
+            s->b_frame_buf_size = 0;
+            av_packet_free(&in);
+            return AVERROR(ENOMEM);
+        }
+    }
+
+    if (nb_vop > 2) {
+        av_log(ctx, AV_LOG_WARNING,
+               "Found %d VOP headers in one packet, only unpacking one.\n", nb_vop);
+    }
+
+    if (nb_vop == 1 && s->b_frame_buf) {
+        /* use frame from BSFContext; it takes over the timestamps, flags and
+         * side data of the packet it is output for */
+        ret = av_packet_copy_props(out, in);
+        if (ret < 0) {
+            av_packet_free(&in);
+            return ret;
+        }
+
+        /* on failure the buffer stays owned by the BSFContext */
+        ret = av_packet_from_data(out, s->b_frame_buf, s->b_frame_buf_size);
+        if (ret < 0) {
+            av_packet_unref(out);
+            av_packet_free(&in);
+            return ret;
+        }
+
+        if (in->size <= MAX_NVOP_SIZE) {
+            /* N-VOP */
+            av_log(ctx, AV_LOG_DEBUG, "Skipping N-VOP.\n");
+            s->b_frame_buf      = NULL;
+            s->b_frame_buf_size = 0;
+        } else {
+            /* copy packet into BSFContext */
+            s->b_frame_buf_size = in->size;
+            s->b_frame_buf      = create_new_buffer(in->data, in->size);
+            if (!s->b_frame_buf) {
+                s->b_frame_buf_size = 0;
+                av_packet_unref(out);
+                av_packet_free(&in);
+                return AVERROR(ENOMEM);
+            }
+        }
+    } else if (nb_vop >= 2) {
+        /* use first frame of the packet */
+        av_packet_move_ref(out, in);
+        out->size = pos_vop2;
+    } else if (pos_p >= 0) {
+        av_log(ctx, AV_LOG_DEBUG, "Updating DivX userdata (remove trailing 'p').\n");
+        av_packet_move_ref(out, in);
+        /* remove 'p' (packed) from the end of the (DivX) userdata string */
+        out->data[pos_p] = '\0';
+    } else {
+        /* copy packet */
+        av_packet_move_ref(out, in);
+    }
+
+    av_packet_free(&in);
+
+    return 0;
+}
+
+/**
+ * Clear the "packed" marker in the output extradata, if there is one.
+ *
+ * NOTE(review): writes par_out->extradata at an offset found in
+ * par_in->extradata, so it assumes both hold the same data - confirm.
+ */
+static int mpeg4_unpack_bframes_init(AVBSFContext *ctx)
+{
+    if (ctx->par_in->extradata) {
+        int pos_p_ext = -1;
+        scan_buffer(ctx->par_in->extradata, ctx->par_in->extradata_size, &pos_p_ext, NULL, NULL);
+        if (pos_p_ext >= 0) {
+            av_log(ctx, AV_LOG_DEBUG,
+                   "Updating DivX userdata (remove trailing 'p') in extradata.\n");
+            ctx->par_out->extradata[pos_p_ext] = '\0';
+        }
+    }
+
+    return 0;
+}
+
+/* Free the frame still stored in the context, if any. */
+static void mpeg4_unpack_bframes_close(AVBSFContext *bsfc)
+{
+    UnpackBFramesBSFContext *ctx = bsfc->priv_data;
+    av_freep(&ctx->b_frame_buf);
+}
+
+static const enum AVCodecID codec_ids[] = {
+    AV_CODEC_ID_MPEG4, AV_CODEC_ID_NONE,
+};
+
+const AVBitStreamFilter ff_mpeg4_unpack_bframes_bsf = {
+    .name           = "mpeg4_unpack_bframes",
+    .priv_data_size = sizeof(UnpackBFramesBSFContext),
+    .init           = mpeg4_unpack_bframes_init,
+    .filter         = mpeg4_unpack_bframes_filter,
+    .close          = mpeg4_unpack_bframes_close,
+    .codec_ids      = codec_ids,
+};
diff --git a/libavcodec/mpeg4audio.c b/libavcodec/mpeg4audio.c
index 2363cb6..188d843 100644
--- a/libavcodec/mpeg4audio.c
+++ b/libavcodec/mpeg4audio.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2008 Baptiste Coudurier <baptiste.coudurier@free.fr>
  * Copyright (c) 2009 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -52,6 +52,8 @@ static int parse_config_ALS(GetBitContext *gb, MPEG4AudioConfig *c)
     return 0;
 }
 
+/* XXX: make sure to update the copies in the different encoders if you change
+ * this table */
 const int avpriv_mpeg4audio_sample_rates[16] = {
     96000, 88200, 64000, 48000, 44100, 32000,
     24000, 22050, 16000, 12000, 11025, 8000, 7350
@@ -82,9 +84,13 @@ int avpriv_mpeg4audio_get_config(MPEG4AudioConfig *c, const uint8_t *buf,
     GetBitContext gb;
     int specific_config_bitindex, ret;
 
+    if (bit_size <= 0)
+        return AVERROR_INVALIDDATA;
+
     ret = init_get_bits(&gb, buf, bit_size);
     if (ret < 0)
         return ret;
+
     c->object_type = get_object_type(&gb);
     c->sample_rate = get_sample_rate(&gb, &c->sampling_index);
     c->chan_config = get_bits(&gb, 4);
@@ -125,8 +131,11 @@ int avpriv_mpeg4audio_get_config(MPEG4AudioConfig *c, const uint8_t *buf,
             if (show_bits(&gb, 11) == 0x2b7) { // sync extension
                 get_bits(&gb, 11);
                 c->ext_object_type = get_object_type(&gb);
-                if (c->ext_object_type == AOT_SBR && (c->sbr = get_bits1(&gb)) == 1)
+                if (c->ext_object_type == AOT_SBR && (c->sbr = get_bits1(&gb)) == 1) {
                     c->ext_sample_rate = get_sample_rate(&gb, &c->ext_sampling_index);
+                    if (c->ext_sample_rate == c->sample_rate)
+                        c->sbr = -1;
+                }
                 if (get_bits_left(&gb) > 11 && get_bits(&gb, 11) == 0x548)
                     c->ps = get_bits1(&gb);
                 break;
diff --git a/libavcodec/mpeg4audio.h b/libavcodec/mpeg4audio.h
index 2eef220..8239081 100644
--- a/libavcodec/mpeg4audio.h
+++ b/libavcodec/mpeg4audio.h
@@ -2,20 +2,20 @@
  * MPEG-4 Audio common header
  * Copyright (c) 2008 Baptiste Coudurier <baptiste.coudurier@free.fr>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -102,7 +102,7 @@ enum AudioObjectType {
     AOT_USAC,                  ///< N                       Unified Speech and Audio Coding
 };
 
-#define MAX_PCE_SIZE 304 ///<Maximum size of a PCE including the 3-bit ID_PCE
+#define MAX_PCE_SIZE 320 ///<Maximum size of a PCE including the 3-bit ID_PCE
                          ///<marker and the comment
 
 int avpriv_copy_pce_data(PutBitContext *pb, GetBitContext *gb);
diff --git a/libavcodec/mpeg4data.h b/libavcodec/mpeg4data.h
index b428a5e..b7c3fab 100644
--- a/libavcodec/mpeg4data.h
+++ b/libavcodec/mpeg4data.h
@@ -3,20 +3,20 @@
  * H.263+ support
  * copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpeg4video.c b/libavcodec/mpeg4video.c
index b60cd4f..2aaa9f7 100644
--- a/libavcodec/mpeg4video.c
+++ b/libavcodec/mpeg4video.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000,2001 Fabrice Bellard
  * Copyright (c) 2002-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpeg4video.h b/libavcodec/mpeg4video.h
index 4a4995e..515b008 100644
--- a/libavcodec/mpeg4video.h
+++ b/libavcodec/mpeg4video.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2000,2001 Fabrice Bellard
  * Copyright (c) 2002-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -59,6 +59,9 @@
 #define VISUAL_OBJ_STARTCODE 0x1B5
 #define VOP_STARTCODE        0x1B6
 
+/* smaller packets likely don't contain a real frame */
+#define MAX_NVOP_SIZE 19
+
 typedef struct Mpeg4DecContext {
     MpegEncContext m;
 
@@ -84,6 +87,7 @@ typedef struct Mpeg4DecContext {
     int enhancement_type;
     int scalability;
     int use_intra_dc_vlc;
+
     /// QP above which the ac VLC should be used for intra dc
     int intra_dc_threshold;
 
@@ -92,6 +96,7 @@ typedef struct Mpeg4DecContext {
     int divx_build;
     int xvid_build;
     int lavc_build;
+
     /// flag for having shown the warning about invalid Divx B-frames
     int showed_packed_warning;
     /** does the stream contain the low_delay flag,
@@ -135,7 +140,7 @@ void ff_mpeg4_encode_mb(MpegEncContext *s,
 void ff_mpeg4_pred_ac(MpegEncContext *s, int16_t *block, int n,
                       int dir);
 void ff_set_mpeg4_time(MpegEncContext *s);
-void ff_mpeg4_encode_picture_header(MpegEncContext *s, int picture_number);
+int ff_mpeg4_encode_picture_header(MpegEncContext *s, int picture_number);
 
 int ff_mpeg4_decode_picture_header(Mpeg4DecContext *ctx, GetBitContext *gb);
 void ff_mpeg4_encode_video_packet_header(MpegEncContext *s);
@@ -148,6 +153,8 @@ int ff_mpeg4_decode_partitions(Mpeg4DecContext *ctx);
 int ff_mpeg4_get_video_packet_prefix_length(MpegEncContext *s);
 int ff_mpeg4_decode_video_packet_header(Mpeg4DecContext *ctx);
 void ff_mpeg4_init_direct_mv(MpegEncContext *s);
+void ff_mpeg4videodec_static_init(void);
+int ff_mpeg4_workaround_bugs(AVCodecContext *avctx);
 int ff_mpeg4_frame_end(AVCodecContext *avctx, const uint8_t *buf, int buf_size);
 
 /**
@@ -222,21 +229,21 @@ static inline int ff_mpeg4_pred_dc(MpegEncContext *s, int n, int level,
     } else {
         level += pred;
         ret    = level;
-        if (s->avctx->err_recognition & AV_EF_BITSTREAM) {
+    }
+    level *= scale;
+    if (level & (~2047)) {
+        if (!s->encoding && (s->avctx->err_recognition & (AV_EF_BITSTREAM | AV_EF_AGGRESSIVE))) {
             if (level < 0) {
                 av_log(s->avctx, AV_LOG_ERROR,
                        "dc<0 at %dx%d\n", s->mb_x, s->mb_y);
                 return -1;
             }
-            if (level * scale > 2048 + scale) {
+            if (level > 2048 + scale) {
                 av_log(s->avctx, AV_LOG_ERROR,
                        "dc overflow at %dx%d\n", s->mb_x, s->mb_y);
                 return -1;
             }
         }
-    }
-    level *= scale;
-    if (level & (~2047)) {
         if (level < 0)
             level = 0;
         else if (!(s->workaround_bugs & FF_BUG_DC_CLIP))
diff --git a/libavcodec/mpeg4video_parser.c b/libavcodec/mpeg4video_parser.c
index e2203f9..b7d6da1 100644
--- a/libavcodec/mpeg4video_parser.c
+++ b/libavcodec/mpeg4video_parser.c
@@ -3,23 +3,25 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#define UNCHECKED_BITSTREAM_READER 1
+
 #include "internal.h"
 #include "parser.h"
 #include "mpegvideo.h"
@@ -86,6 +88,8 @@ static int mpeg4_decode_header(AVCodecParserContext *s1, AVCodecContext *avctx,
     if (avctx->extradata_size && pc->first_picture) {
         init_get_bits(gb, avctx->extradata, avctx->extradata_size * 8);
         ret = ff_mpeg4_decode_picture_header(dec_ctx, gb);
+        if (ret < -1)
+            av_log(avctx, AV_LOG_WARNING, "Failed to parse extradata\n");
     }
 
     init_get_bits(gb, buf, 8 * buf_size);
@@ -96,6 +100,13 @@ static int mpeg4_decode_header(AVCodecParserContext *s1, AVCodecContext *avctx,
         if (ret < 0)
             return ret;
     }
+    if((s1->flags & PARSER_FLAG_USE_CODEC_TS) && s->avctx->time_base.den>0 && ret>=0){
+        av_assert1(s1->pts == AV_NOPTS_VALUE);
+        av_assert1(s1->dts == AV_NOPTS_VALUE);
+
+        s1->pts = av_rescale_q(s->time, (AVRational){1, s->avctx->time_base.den}, (AVRational){1, 1200000});
+    }
+
     s1->pict_type     = s->pict_type;
     pc->first_picture = 0;
     return ret;
@@ -105,8 +116,12 @@ static av_cold int mpeg4video_parse_init(AVCodecParserContext *s)
 {
     struct Mp4vParseContext *pc = s->priv_data;
 
+    ff_mpeg4videodec_static_init();
+
     pc->first_picture           = 1;
+    pc->dec_ctx.m.quant_precision     = 5;
     pc->dec_ctx.m.slice_context_count = 1;
+    pc->dec_ctx.showed_packed_warning = 1;
     return 0;
 }
 
diff --git a/libavcodec/mpeg4video_parser.h b/libavcodec/mpeg4video_parser.h
index 030a276..8008e69 100644
--- a/libavcodec/mpeg4video_parser.h
+++ b/libavcodec/mpeg4video_parser.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c
index 40d281b..4105ed6 100644
--- a/libavcodec/mpeg4videodec.c
+++ b/libavcodec/mpeg4videodec.c
@@ -3,23 +3,27 @@
  * Copyright (c) 2000,2001 Fabrice Bellard
  * Copyright (c) 2002-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#define UNCHECKED_BITSTREAM_READER 1
+
+#include "libavutil/internal.h"
+#include "libavutil/opt.h"
 #include "error_resilience.h"
 #include "idctdsp.h"
 #include "internal.h"
@@ -63,7 +67,7 @@ void ff_mpeg4_pred_ac(MpegEncContext *s, int16_t *block, int n, int dir)
     int8_t *const qscale_table = s->current_picture.qscale_table;
 
     /* find prediction */
-    ac_val  = s->ac_val[0][0] + s->block_index[n] * 16;
+    ac_val  = &s->ac_val[0][0][0] + s->block_index[n] * 16;
     ac_val1 = ac_val;
     if (s->ac_pred) {
         if (dir == 0) {
@@ -111,12 +115,13 @@ void ff_mpeg4_pred_ac(MpegEncContext *s, int16_t *block, int n, int dir)
  * check if the next stuff is a resync marker or the end.
  * @return 0 if not
  */
-static inline int mpeg4_is_resync(MpegEncContext *s)
+static inline int mpeg4_is_resync(Mpeg4DecContext *ctx)
 {
+    MpegEncContext *s = &ctx->m;
     int bits_count = get_bits_count(&s->gb);
     int v          = show_bits(&s->gb, 16);
 
-    if (s->workaround_bugs & FF_BUG_NO_PADDING)
+    if (s->workaround_bugs & FF_BUG_NO_PADDING && !ctx->resync_marker)
         return 0;
 
     while (v <= 0xFF) {
@@ -133,10 +138,11 @@ static inline int mpeg4_is_resync(MpegEncContext *s)
         v  |= 0x7F >> (7 - (bits_count & 7));
 
         if (v == 0x7F)
-            return 1;
+            return s->mb_num;
     } else {
         if (v == ff_mpeg4_resync_prefix[bits_count & 7]) {
-            int len;
+            int len, mb_num;
+            int mb_num_bits = av_log2(s->mb_num - 1) + 1;
             GetBitContext gb = s->gb;
 
             skip_bits(&s->gb, 1);
@@ -146,10 +152,14 @@ static inline int mpeg4_is_resync(MpegEncContext *s)
                 if (get_bits1(&s->gb))
                     break;
 
+            mb_num = get_bits(&s->gb, mb_num_bits);
+            if (!mb_num || mb_num > s->mb_num || get_bits_count(&s->gb)+6 > s->gb.size_in_bits)
+                mb_num= -1;
+
             s->gb = gb;
 
             if (len >= ff_mpeg4_get_video_packet_prefix_length(s))
-                return 1;
+                return mb_num;
         }
     }
     return 0;
@@ -182,17 +192,17 @@ static int mpeg4_decode_sprite_trajectory(Mpeg4DecContext *ctx, GetBitContext *g
         int x = 0, y = 0;
 
         length = get_vlc2(gb, sprite_trajectory.table, SPRITE_TRAJ_VLC_BITS, 3);
-        if (length)
+        if (length > 0)
             x = get_xbits(gb, length);
 
         if (!(ctx->divx_version == 500 && ctx->divx_build == 413))
-            skip_bits1(gb);     /* marker bit */
+            check_marker(gb, "before sprite_trajectory");
 
         length = get_vlc2(gb, sprite_trajectory.table, SPRITE_TRAJ_VLC_BITS, 3);
-        if (length)
+        if (length > 0)
             y = get_xbits(gb, length);
 
-        skip_bits1(gb);         /* marker bit */
+        check_marker(gb, "after sprite_trajectory");
         ctx->sprite_traj[i][0] = d[i][0] = x;
         ctx->sprite_traj[i][1] = d[i][1] = y;
     }
@@ -370,6 +380,17 @@ static int mpeg4_decode_sprite_trajectory(Mpeg4DecContext *ctx, GetBitContext *g
     return 0;
 }
 
+/* Read past the new_pred fields in the bitstream; the values are discarded. Always returns 0. */
+static int decode_new_pred(Mpeg4DecContext *ctx, GetBitContext *gb) {
+    const int field_bits = FFMIN(ctx->time_increment_bits + 3, 15); /* field width, capped at 15 bits */
+
+    get_bits(gb, field_bits);           /* first field, value unused */
+    if (get_bits1(gb))                  /* flag bit: a second field of the same width follows */
+        get_bits(gb, field_bits);
+    check_marker(gb, "after new_pred");
+    return 0;
+}
+
 /**
  * Decode the next video packet.
  * @return <0 if something went wrong
@@ -405,19 +426,6 @@ int ff_mpeg4_decode_video_packet_header(Mpeg4DecContext *ctx)
                "illegal mb_num in video packet (%d %d) \n", mb_num, s->mb_num);
         return -1;
     }
-    if (s->pict_type == AV_PICTURE_TYPE_B) {
-        int mb_x = 0, mb_y = 0;
-
-        while (s->next_picture.mbskip_table[s->mb_index2xy[mb_num]]) {
-            if (!mb_x)
-                ff_thread_await_progress(&s->next_picture_ptr->tf, mb_y++, 0);
-            mb_num++;
-            if (++mb_x == s->mb_width)
-                mb_x = 0;
-        }
-        if (mb_num >= s->mb_num)
-            return -1;  // slice contains just skipped MBs (already decoded)
-    }
 
     s->mb_x = mb_num % s->mb_width;
     s->mb_y = mb_num / s->mb_width;
@@ -470,7 +478,8 @@ int ff_mpeg4_decode_video_packet_header(Mpeg4DecContext *ctx)
             }
         }
     }
-    // FIXME new-pred stuff
+    if (ctx->new_pred)
+        decode_new_pred(ctx, &s->gb);
 
     return 0;
 }
@@ -565,7 +574,7 @@ static inline int mpeg4_decode_dc(MpegEncContext *s, int n, int *dir_ptr)
 
         if (code > 8) {
             if (get_bits1(&s->gb) == 0) { /* marker */
-                if (s->avctx->err_recognition & AV_EF_BITSTREAM) {
+                if (s->avctx->err_recognition & (AV_EF_BITSTREAM|AV_EF_COMPLIANT)) {
                     av_log(s->avctx, AV_LOG_ERROR, "dc marker bit missing\n");
                     return -1;
                 }
@@ -610,7 +619,7 @@ static int mpeg4_decode_partition_a(Mpeg4DecContext *ctx)
                     cbpc = get_vlc2(&s->gb, ff_h263_intra_MCBPC_vlc.table, INTRA_MCBPC_VLC_BITS, 2);
                     if (cbpc < 0) {
                         av_log(s->avctx, AV_LOG_ERROR,
-                               "cbpc corrupted at %d %d\n", s->mb_x, s->mb_y);
+                               "mcbpc corrupted at %d %d\n", s->mb_x, s->mb_y);
                         return -1;
                     }
                 } while (cbpc == 8);
@@ -682,7 +691,7 @@ try_again:
                 cbpc = get_vlc2(&s->gb, ff_h263_inter_MCBPC_vlc.table, INTER_MCBPC_VLC_BITS, 2);
                 if (cbpc < 0) {
                     av_log(s->avctx, AV_LOG_ERROR,
-                           "cbpc corrupted at %d %d\n", s->mb_x, s->mb_y);
+                           "mcbpc corrupted at %d %d\n", s->mb_x, s->mb_y);
                     return -1;
                 }
                 if (cbpc == 20)
@@ -875,7 +884,7 @@ int ff_mpeg4_decode_partitions(Mpeg4DecContext *ctx)
     const int part_a_end   = s->pict_type == AV_PICTURE_TYPE_I ? (ER_DC_END   | ER_MV_END)   : ER_MV_END;
 
     mb_num = mpeg4_decode_partition_a(ctx);
-    if (mb_num < 0) {
+    if (mb_num <= 0) {
         ff_er_add_slice(&s->er, s->resync_mb_x, s->resync_mb_y,
                         s->mb_x, s->mb_y, part_a_error);
         return -1;
@@ -934,7 +943,8 @@ static inline int mpeg4_decode_block(Mpeg4DecContext *ctx, int16_t *block,
                                      int n, int coded, int intra, int rvlc)
 {
     MpegEncContext *s = &ctx->m;
-    int level, i, last, run, qmul, qadd, dc_pred_dir;
+    int level, i, last, run, qmul, qadd;
+    int av_uninit(dc_pred_dir);
     RLTable *rl;
     RL_VLC_ELEM *rl_vlc;
     const uint8_t *scan_table;
@@ -1079,7 +1089,8 @@ static inline int mpeg4_decode_block(Mpeg4DecContext *ctx, int16_t *block,
                                 if (SHOW_UBITS(re, &s->gb, 1) == 0) {
                                     av_log(s->avctx, AV_LOG_ERROR,
                                            "1. marker bit missing in 3. esc\n");
-                                    return -1;
+                                    if (!(s->avctx->err_recognition & AV_EF_IGNORE_ERR))
+                                        return -1;
                                 }
                                 SKIP_CACHE(re, &s->gb, 1);
 
@@ -1089,19 +1100,42 @@ static inline int mpeg4_decode_block(Mpeg4DecContext *ctx, int16_t *block,
                                 if (SHOW_UBITS(re, &s->gb, 1) == 0) {
                                     av_log(s->avctx, AV_LOG_ERROR,
                                            "2. marker bit missing in 3. esc\n");
-                                    return -1;
+                                    if (!(s->avctx->err_recognition & AV_EF_IGNORE_ERR))
+                                        return -1;
                                 }
 
                                 SKIP_COUNTER(re, &s->gb, 1 + 12 + 1);
                             }
 
+#if 0
+                            if (s->error_recognition >= FF_ER_COMPLIANT) {
+                                const int abs_level= FFABS(level);
+                                if (abs_level<=MAX_LEVEL && run<=MAX_RUN) {
+                                    const int run1= run - rl->max_run[last][abs_level] - 1;
+                                    if (abs_level <= rl->max_level[last][run]) {
+                                        av_log(s->avctx, AV_LOG_ERROR, "illegal 3. esc, vlc encoding possible\n");
+                                        return -1;
+                                    }
+                                    if (s->error_recognition > FF_ER_COMPLIANT) {
+                                        if (abs_level <= rl->max_level[last][run]*2) {
+                                            av_log(s->avctx, AV_LOG_ERROR, "illegal 3. esc, esc 1 encoding possible\n");
+                                            return -1;
+                                        }
+                                        if (run1 >= 0 && abs_level <= rl->max_level[last][run1]) {
+                                            av_log(s->avctx, AV_LOG_ERROR, "illegal 3. esc, esc 2 encoding possible\n");
+                                            return -1;
+                                        }
+                                    }
+                                }
+                            }
+#endif
                             if (level > 0)
                                 level = level * qmul + qadd;
                             else
                                 level = level * qmul - qadd;
 
                             if ((unsigned)(level + 2048) > 4095) {
-                                if (s->avctx->err_recognition & AV_EF_BITSTREAM) {
+                                if (s->avctx->err_recognition & (AV_EF_BITSTREAM|AV_EF_AGGRESSIVE)) {
                                     if (level > 2560 || level < -2560) {
                                         av_log(s->avctx, AV_LOG_ERROR,
                                                "|level| overflow in 3. esc, qp=%d\n",
@@ -1138,6 +1172,7 @@ static inline int mpeg4_decode_block(Mpeg4DecContext *ctx, int16_t *block,
                 level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1);
                 LAST_SKIP_BITS(re, &s->gb, 1);
             }
+            ff_tlog(s->avctx, "dct[%d][%d] = %- 4d end?:%d\n", scan_table[i&63]&7, scan_table[i&63] >> 3, level, i>62);
             if (i > 62) {
                 i -= 192;
                 if (i & (~63)) {
@@ -1246,12 +1281,12 @@ static int mpeg4_decode_partitioned_mb(MpegEncContext *s, int16_t block[6][64])
 
     /* per-MB end of slice check */
     if (--s->mb_num_left <= 0) {
-        if (mpeg4_is_resync(s))
+        if (mpeg4_is_resync(ctx))
             return SLICE_END;
         else
             return SLICE_NOEND;
     } else {
-        if (mpeg4_is_resync(s)) {
+        if (mpeg4_is_resync(ctx)) {
             const int delta = s->mb_x + 1 == s->mb_width ? 2 : 1;
             if (s->cbp_table[xy + delta])
                 return SLICE_END;
@@ -1265,10 +1300,10 @@ static int mpeg4_decode_mb(MpegEncContext *s, int16_t block[6][64])
     Mpeg4DecContext *ctx = (Mpeg4DecContext *)s;
     int cbpc, cbpy, i, cbp, pred_x, pred_y, mx, my, dquant;
     int16_t *mot_val;
-    static int8_t quant_tab[4] = { -1, -2, 1, 2 };
+    static const int8_t quant_tab[4] = { -1, -2, 1, 2 };
     const int xy = s->mb_x + s->mb_y * s->mb_stride;
 
-    assert(s->h263_pred);
+    av_assert2(s->h263_pred);
 
     if (s->pict_type == AV_PICTURE_TYPE_P ||
         s->pict_type == AV_PICTURE_TYPE_S) {
@@ -1304,7 +1339,7 @@ static int mpeg4_decode_mb(MpegEncContext *s, int16_t block[6][64])
             cbpc = get_vlc2(&s->gb, ff_h263_inter_MCBPC_vlc.table, INTER_MCBPC_VLC_BITS, 2);
             if (cbpc < 0) {
                 av_log(s->avctx, AV_LOG_ERROR,
-                       "cbpc damaged at %d %d\n", s->mb_x, s->mb_y);
+                       "mcbpc damaged at %d %d\n", s->mb_x, s->mb_y);
                 return -1;
             }
         } while (cbpc == 20);
@@ -1321,6 +1356,11 @@ static int mpeg4_decode_mb(MpegEncContext *s, int16_t block[6][64])
         else
             s->mcsel = 0;
         cbpy = get_vlc2(&s->gb, ff_h263_cbpy_vlc.table, CBPY_VLC_BITS, 1) ^ 0x0F;
+        if (cbpy < 0) {
+            av_log(s->avctx, AV_LOG_ERROR,
+                   "P cbpy damaged at %d %d\n", s->mb_x, s->mb_y);
+            return AVERROR_INVALIDDATA;
+        }
 
         cbp = (cbpc & 3) | (cbpy << 2);
         if (dquant)
@@ -1610,20 +1650,23 @@ intra:
 end:
     /* per-MB end of slice check */
     if (s->codec_id == AV_CODEC_ID_MPEG4) {
-        if (mpeg4_is_resync(s)) {
-            const int delta = s->mb_x + 1 == s->mb_width ? 2 : 1;
+        int next = mpeg4_is_resync(ctx);
+        if (next) {
+            if        (s->mb_x + s->mb_y*s->mb_width + 1 >  next && (s->avctx->err_recognition & AV_EF_AGGRESSIVE)) {
+                return -1;
+            } else if (s->mb_x + s->mb_y*s->mb_width + 1 >= next)
+                return SLICE_END;
 
-            if (s->pict_type == AV_PICTURE_TYPE_B &&
-                s->next_picture.mbskip_table[xy + delta]) {
+            if (s->pict_type == AV_PICTURE_TYPE_B) {
+                const int delta= s->mb_x + 1 == s->mb_width ? 2 : 1;
                 ff_thread_await_progress(&s->next_picture_ptr->tf,
                                          (s->mb_x + delta >= s->mb_width)
                                          ? FFMIN(s->mb_y + 1, s->mb_height - 1)
                                          : s->mb_y, 0);
+                if (s->next_picture.mbskip_table[xy + delta])
+                    return SLICE_OK;
             }
 
-            if (s->pict_type == AV_PICTURE_TYPE_B &&
-                s->next_picture.mbskip_table[xy + delta])
-                return SLICE_OK;
             return SLICE_END;
         }
     }
@@ -1634,29 +1677,30 @@ end:
 static int mpeg4_decode_gop_header(MpegEncContext *s, GetBitContext *gb)
 {
     int hours, minutes, seconds;
-    unsigned time_code = show_bits(gb, 18);
-
-    if (time_code & 0x40) {     /* marker_bit */
-        hours   = time_code >> 13;
-        minutes = time_code >> 7 & 0x3f;
-        seconds = time_code & 0x3f;
-        s->time_base = seconds + 60 * (minutes + 60 * hours);
-        skip_bits(gb, 20);      /* time_code, closed_gov, broken_link */
-    } else {
-        av_log(s->avctx, AV_LOG_WARNING, "GOP header missing marker_bit\n");
+
+    if (!show_bits(gb, 23)) { /* reject a header whose next 23 bits are all zero */
+        av_log(s->avctx, AV_LOG_WARNING, "GOP header invalid\n");
+        return -1;
     }
 
+    hours   = get_bits(gb, 5);
+    minutes = get_bits(gb, 6);
+    check_marker(gb, "in gop_header");
+    seconds = get_bits(gb, 6);
+
+    s->time_base = seconds + 60*(minutes + 60*hours); /* time code converted to seconds */
+
+    skip_bits1(gb); /* closed_gov */
+    skip_bits1(gb); /* broken_link */
+
     return 0;
 }
 
 static int mpeg4_decode_profile_level(MpegEncContext *s, GetBitContext *gb)
 {
-    int profile_and_level_indication;
-
-    profile_and_level_indication = get_bits(gb, 8);
 
-    s->avctx->profile = (profile_and_level_indication & 0xf0) >> 4;
-    s->avctx->level   = (profile_and_level_indication & 0x0f);
+    s->avctx->profile = get_bits(gb, 4);
+    s->avctx->level   = get_bits(gb, 4);
 
     // for Simple profile, level 0
     if (s->avctx->profile == 0 && s->avctx->level == 8) {
@@ -1696,22 +1740,30 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
         s->low_delay = get_bits1(gb);
         if (get_bits1(gb)) {    /* vbv parameters */
             get_bits(gb, 15);   /* first_half_bitrate */
-            skip_bits1(gb);     /* marker */
+            check_marker(gb, "after first_half_bitrate");
             get_bits(gb, 15);   /* latter_half_bitrate */
-            skip_bits1(gb);     /* marker */
+            check_marker(gb, "after latter_half_bitrate");
             get_bits(gb, 15);   /* first_half_vbv_buffer_size */
-            skip_bits1(gb);     /* marker */
+            check_marker(gb, "after first_half_vbv_buffer_size");
             get_bits(gb, 3);    /* latter_half_vbv_buffer_size */
             get_bits(gb, 11);   /* first_half_vbv_occupancy */
-            skip_bits1(gb);     /* marker */
+            check_marker(gb, "after first_half_vbv_occupancy");
             get_bits(gb, 15);   /* latter_half_vbv_occupancy */
-            skip_bits1(gb);     /* marker */
+            check_marker(gb, "after latter_half_vbv_occupancy");
         }
     } else {
         /* is setting low delay flag only once the smartest thing to do?
          * low delay detection will not be overridden. */
-        if (s->picture_number == 0)
-            s->low_delay = 0;
+        if (s->picture_number == 0) {
+            switch(s->vo_type) {
+            case SIMPLE_VO_TYPE:
+            case ADV_SIMPLE_VO_TYPE:
+                s->low_delay = 1;
+                break;
+            default:
+                s->low_delay = 0;
+            }
+        }
     }
 
     ctx->shape = get_bits(gb, 2); /* vol shape */
@@ -1727,7 +1779,7 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
     s->avctx->framerate.num = get_bits(gb, 16);
     if (!s->avctx->framerate.num) {
         av_log(s->avctx, AV_LOG_ERROR, "framerate==0\n");
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
 
     ctx->time_increment_bits = av_log2(s->avctx->framerate.num - 1) + 1;
@@ -1741,15 +1793,17 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
     else
         s->avctx->framerate.den = 1;
 
+    s->avctx->time_base = av_inv_q(av_mul_q(s->avctx->framerate, (AVRational){s->avctx->ticks_per_frame, 1}));
+
     ctx->t_frame = 0;
 
     if (ctx->shape != BIN_ONLY_SHAPE) {
         if (ctx->shape == RECT_SHAPE) {
-            skip_bits1(gb);   /* marker */
+            check_marker(gb, "before width");
             width = get_bits(gb, 13);
-            skip_bits1(gb);   /* marker */
+            check_marker(gb, "before height");
             height = get_bits(gb, 13);
-            skip_bits1(gb);   /* marker */
+            check_marker(gb, "after height");
             if (width && height &&  /* they should be non zero but who knows */
                 !(s->width && s->codec_tag == AV_RL32("MP4S"))) {
                 if (s->width && s->height &&
@@ -1777,13 +1831,13 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
             ctx->vol_sprite_usage == GMC_SPRITE) {
             if (ctx->vol_sprite_usage == STATIC_SPRITE) {
                 skip_bits(gb, 13); // sprite_width
-                skip_bits1(gb); /* marker */
+                check_marker(gb, "after sprite_width");
                 skip_bits(gb, 13); // sprite_height
-                skip_bits1(gb); /* marker */
+                check_marker(gb, "after sprite_height");
                 skip_bits(gb, 13); // sprite_left
-                skip_bits1(gb); /* marker */
+                check_marker(gb, "after sprite_left");
                 skip_bits(gb, 13); // sprite_top
-                skip_bits1(gb); /* marker */
+                check_marker(gb, "after sprite_top");
             }
             ctx->num_sprite_warping_points = get_bits(gb, 6);
             if (ctx->num_sprite_warping_points > 3) {
@@ -1791,7 +1845,7 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
                        "%d sprite_warping_points\n",
                        ctx->num_sprite_warping_points);
                 ctx->num_sprite_warping_points = 0;
-                return -1;
+                return AVERROR_INVALIDDATA;
             }
             s->sprite_warping_accuracy  = get_bits(gb, 2);
             ctx->sprite_brightness_change = get_bits1(gb);
@@ -1807,6 +1861,9 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
             if (s->quant_precision != 5)
                 av_log(s->avctx, AV_LOG_ERROR,
                        "quant precision %d\n", s->quant_precision);
+            if (s->quant_precision<3 || s->quant_precision>9) {
+                s->quant_precision = 5;
+            }
         } else {
             s->quant_precision = 5;
         }
@@ -1833,6 +1890,10 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
                 int last = 0;
                 for (i = 0; i < 64; i++) {
                     int j;
+                    if (get_bits_left(gb) < 8) {
+                        av_log(s->avctx, AV_LOG_ERROR, "insufficient data for custom matrix\n");
+                        return AVERROR_INVALIDDATA;
+                    }
                     v = get_bits(gb, 8);
                     if (v == 0)
                         break;
@@ -1856,6 +1917,10 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
                 int last = 0;
                 for (i = 0; i < 64; i++) {
                     int j;
+                    if (get_bits_left(gb) < 8) {
+                        av_log(s->avctx, AV_LOG_ERROR, "insufficient data for custom matrix\n");
+                        return AVERROR_INVALIDDATA;
+                    }
                     v = get_bits(gb, 8);
                     if (v == 0)
                         break;
@@ -1882,6 +1947,11 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
         else
             s->quarter_sample = 0;
 
+        if (get_bits_left(gb) < 4) {
+            av_log(s->avctx, AV_LOG_ERROR, "VOL Header truncated\n");
+            return AVERROR_INVALIDDATA;
+        }
+
         if (!get_bits1(gb)) {
             int pos               = get_bits_count(gb);
             int estimation_method = get_bits(gb, 2);
@@ -1989,6 +2059,18 @@ no_cplx_est:
         }
     }
 
+    if (s->avctx->debug&FF_DEBUG_PICT_INFO) {
+        av_log(s->avctx, AV_LOG_DEBUG, "tb %d/%d, tincrbits:%d, qp_prec:%d, ps:%d, low_delay:%d  %s%s%s%s\n",
+               s->avctx->framerate.den, s->avctx->framerate.num,
+               ctx->time_increment_bits,
+               s->quant_precision,
+               s->progressive_sequence,
+               s->low_delay,
+               ctx->scalability ? "scalability " :"" , s->quarter_sample ? "qpel " : "",
+               s->data_partitioning ? "partition " : "", ctx->rvlc ? "rvlc " : ""
+        );
+    }
+
     return 0;
 }
 
@@ -2020,11 +2102,6 @@ static int decode_user_data(Mpeg4DecContext *ctx, GetBitContext *gb)
         ctx->divx_version = ver;
         ctx->divx_build   = build;
         s->divx_packed  = e == 3 && last == 'p';
-        if (s->divx_packed && !ctx->showed_packed_warning) {
-            av_log(s->avctx, AV_LOG_WARNING,
-                   "Invalid and inefficient vfw-avi packed B-frames detected\n");
-            ctx->showed_packed_warning = 1;
-        }
     }
 
     /* libavcodec detection */
@@ -2048,6 +2125,14 @@ static int decode_user_data(Mpeg4DecContext *ctx, GetBitContext *gb)
     if (e == 1)
         ctx->xvid_build = build;
 
+    return 0;
+}
+
+int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)
+{
+    Mpeg4DecContext *ctx = avctx->priv_data;
+    MpegEncContext *s = &ctx->m;
+
     if (ctx->xvid_build == -1 && ctx->divx_version == -1 && ctx->lavc_build == -1) {
         if (s->codec_tag        == AV_RL32("XVID") ||
             s->codec_tag        == AV_RL32("XVIX") ||
@@ -2067,8 +2152,89 @@ static int decode_user_data(Mpeg4DecContext *ctx, GetBitContext *gb)
         ctx->divx_build   = -1;
     }
 
-    if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0)
-        ff_xvid_idct_init(&s->idsp, s->avctx);
+    if (s->workaround_bugs & FF_BUG_AUTODETECT) {
+        if (s->codec_tag == AV_RL32("XVIX"))
+            s->workaround_bugs |= FF_BUG_XVID_ILACE;
+
+        if (s->codec_tag == AV_RL32("UMP4"))
+            s->workaround_bugs |= FF_BUG_UMP4;
+
+        if (ctx->divx_version >= 500 && ctx->divx_build < 1814)
+            s->workaround_bugs |= FF_BUG_QPEL_CHROMA;
+
+        if (ctx->divx_version > 502 && ctx->divx_build < 1814)
+            s->workaround_bugs |= FF_BUG_QPEL_CHROMA2;
+
+        if (ctx->xvid_build <= 3U)
+            s->padding_bug_score = 256 * 256 * 256 * 64;
+
+        if (ctx->xvid_build <= 1U)
+            s->workaround_bugs |= FF_BUG_QPEL_CHROMA;
+
+        if (ctx->xvid_build <= 12U)
+            s->workaround_bugs |= FF_BUG_EDGE;
+
+        if (ctx->xvid_build <= 32U)
+            s->workaround_bugs |= FF_BUG_DC_CLIP;
+
+#define SET_QPEL_FUNC(postfix1, postfix2)                           \
+    s->qdsp.put_        ## postfix1 = ff_put_        ## postfix2;   \
+    s->qdsp.put_no_rnd_ ## postfix1 = ff_put_no_rnd_ ## postfix2;   \
+    s->qdsp.avg_        ## postfix1 = ff_avg_        ## postfix2;
+
+        if (ctx->lavc_build < 4653U)
+            s->workaround_bugs |= FF_BUG_STD_QPEL;
+
+        if (ctx->lavc_build < 4655U)
+            s->workaround_bugs |= FF_BUG_DIRECT_BLOCKSIZE;
+
+        if (ctx->lavc_build < 4670U)
+            s->workaround_bugs |= FF_BUG_EDGE;
+
+        if (ctx->lavc_build <= 4712U)
+            s->workaround_bugs |= FF_BUG_DC_CLIP;
+
+        if (ctx->divx_version >= 0)
+            s->workaround_bugs |= FF_BUG_DIRECT_BLOCKSIZE;
+        if (ctx->divx_version == 501 && ctx->divx_build == 20020416)
+            s->padding_bug_score = 256 * 256 * 256 * 64;
+
+        if (ctx->divx_version < 500U)
+            s->workaround_bugs |= FF_BUG_EDGE;
+
+        if (ctx->divx_version >= 0)
+            s->workaround_bugs |= FF_BUG_HPEL_CHROMA;
+    }
+
+    if (s->workaround_bugs & FF_BUG_STD_QPEL) {
+        SET_QPEL_FUNC(qpel_pixels_tab[0][5], qpel16_mc11_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[0][7], qpel16_mc31_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[0][9], qpel16_mc12_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_old_c)
+
+        SET_QPEL_FUNC(qpel_pixels_tab[1][5], qpel8_mc11_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[1][7], qpel8_mc31_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[1][9], qpel8_mc12_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_old_c)
+        SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_old_c)
+    }
+
+    if (avctx->debug & FF_DEBUG_BUGS)
+        av_log(s->avctx, AV_LOG_DEBUG,
+               "bugs: %X lavc_build:%d xvid_build:%d divx_version:%d divx_build:%d %s\n",
+               s->workaround_bugs, ctx->lavc_build, ctx->xvid_build,
+               ctx->divx_version, ctx->divx_build, s->divx_packed ? "p" : "");
+
+    if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 &&
+        s->codec_id == AV_CODEC_ID_MPEG4 &&
+        avctx->idct_algo == FF_IDCT_AUTO) {
+        avctx->idct_algo = FF_IDCT_XVID;
+        ff_mpv_idct_init(s);
+        return 1;
+    }
 
     return 0;
 }
@@ -2077,6 +2243,7 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
 {
     MpegEncContext *s = &ctx->m;
     int time_incr, time_increment;
+    int64_t pts;
 
     s->pict_type = get_bits(gb, 2) + AV_PICTURE_TYPE_I;        /* pict type: I = 0 , P = 1 */
     if (s->pict_type == AV_PICTURE_TYPE_B && s->low_delay &&
@@ -2099,7 +2266,9 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
 
     if (ctx->time_increment_bits == 0 ||
         !(show_bits(gb, ctx->time_increment_bits + 1) & 1)) {
-        /* Headers seem incomplete; try to guess time_increment_bits. */
+        av_log(s->avctx, AV_LOG_WARNING,
+               "time_increment_bits %d is invalid in relation to the current bitstream, this is likely caused by a missing VOL header\n", ctx->time_increment_bits);
+
         for (ctx->time_increment_bits = 1;
              ctx->time_increment_bits < 16;
              ctx->time_increment_bits++) {
@@ -2111,6 +2280,13 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
             } else if ((show_bits(gb, ctx->time_increment_bits + 5) & 0x1F) == 0x18)
                 break;
         }
+
+        av_log(s->avctx, AV_LOG_WARNING,
+               "time_increment_bits set to %d bits, based on bitstream analysis\n", ctx->time_increment_bits);
+        if (s->avctx->framerate.num && 4*s->avctx->framerate.num < 1<<ctx->time_increment_bits) {
+            s->avctx->framerate.num = 1<<ctx->time_increment_bits;
+            s->avctx->time_base = av_inv_q(av_mul_q(s->avctx->framerate, (AVRational){s->avctx->ticks_per_frame, 1}));
+        }
     }
 
     if (IS_3IV1)
@@ -2151,12 +2327,20 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
                             ROUNDED_DIV(s->last_non_b_time - s->pp_time, ctx->t_frame)) * 2;
         s->pb_field_time = (ROUNDED_DIV(s->time, ctx->t_frame) -
                             ROUNDED_DIV(s->last_non_b_time - s->pp_time, ctx->t_frame)) * 2;
-        if (!s->progressive_sequence) {
-            if (s->pp_field_time <= s->pb_field_time || s->pb_field_time <= 1)
+        if (s->pp_field_time <= s->pb_field_time || s->pb_field_time <= 1) {
+            s->pb_field_time = 2;
+            s->pp_field_time = 4;
+            if (!s->progressive_sequence)
                 return FRAME_SKIPPED;
         }
     }
 
+    if (s->avctx->framerate.den)
+        pts = ROUNDED_DIV(s->time, s->avctx->framerate.den);
+    else
+        pts = AV_NOPTS_VALUE;
+    ff_dlog(s->avctx, "MPEG4 PTS: %"PRId64"\n", pts);
+
     check_marker(gb, "before vop_coded");
 
     /* vop coded */
@@ -2165,6 +2349,9 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
             av_log(s->avctx, AV_LOG_ERROR, "vop not coded\n");
         return FRAME_SKIPPED;
     }
+    if (ctx->new_pred)
+        decode_new_pred(ctx, gb);
+
     if (ctx->shape != BIN_ONLY_SHAPE &&
                     (s->pict_type == AV_PICTURE_TYPE_P ||
                      (s->pict_type == AV_PICTURE_TYPE_S &&
@@ -2179,11 +2366,11 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
     if (ctx->shape != RECT_SHAPE) {
         if (ctx->vol_sprite_usage != 1 || s->pict_type != AV_PICTURE_TYPE_I) {
             skip_bits(gb, 13);  /* width */
-            skip_bits1(gb);     /* marker */
+            check_marker(gb, "after width");
             skip_bits(gb, 13);  /* height */
-            skip_bits1(gb);     /* marker */
+            check_marker(gb, "after height");
             skip_bits(gb, 13);  /* hor_spat_ref */
-            skip_bits1(gb);     /* marker */
+            check_marker(gb, "after hor_spat_ref");
             skip_bits(gb, 13);  /* ver_spat_ref */
         }
         skip_bits1(gb);         /* change_CR_disable */
@@ -2201,6 +2388,10 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
         if (s->pict_type == AV_PICTURE_TYPE_B)
             skip_bits_long(gb, ctx->cplx_estimation_trash_b);
 
+        if (get_bits_left(gb) < 3) {
+            av_log(s->avctx, AV_LOG_ERROR, "Header truncated\n");
+            return AVERROR_INVALIDDATA;
+        }
         ctx->intra_dc_threshold = ff_mpeg4_dc_threshold[get_bits(gb, 3)];
         if (!s->progressive_sequence) {
             s->top_field_first = get_bits1(gb);
@@ -2238,7 +2429,7 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
         if (s->qscale == 0) {
             av_log(s->avctx, AV_LOG_ERROR,
                    "Error, header damaged or not MPEG-4 header (qscale=0)\n");
-            return -1;  // makes no sense to continue, as there is nothing left from the image then
+            return AVERROR_INVALIDDATA;  // makes no sense to continue, as there is nothing left from the image then
         }
 
         if (s->pict_type != AV_PICTURE_TYPE_I) {
@@ -2246,29 +2437,39 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
             if (s->f_code == 0) {
                 av_log(s->avctx, AV_LOG_ERROR,
                        "Error, header damaged or not MPEG-4 header (f_code=0)\n");
-                return -1;  // makes no sense to continue, as there is nothing left from the image then
+                s->f_code = 1;
+                return AVERROR_INVALIDDATA;  // makes no sense to continue, as there is nothing left from the image then
             }
         } else
             s->f_code = 1;
 
         if (s->pict_type == AV_PICTURE_TYPE_B) {
             s->b_code = get_bits(gb, 3);
+            if (s->b_code == 0) {
+                av_log(s->avctx, AV_LOG_ERROR,
+                       "Error, header damaged or not MPEG4 header (b_code=0)\n");
+                s->b_code=1;
+                return AVERROR_INVALIDDATA; // makes no sense to continue, as the MV decoding will break very quickly
+            }
         } else
             s->b_code = 1;
 
         if (s->avctx->debug & FF_DEBUG_PICT_INFO) {
             av_log(s->avctx, AV_LOG_DEBUG,
-                   "qp:%d fc:%d,%d %s size:%d pro:%d alt:%d top:%d %spel part:%d resync:%d w:%d a:%d rnd:%d vot:%d%s dc:%d ce:%d/%d/%d\n",
+                   "qp:%d fc:%d,%d %s size:%d pro:%d alt:%d top:%d %spel part:%d resync:%d w:%d a:%d rnd:%d vot:%d%s dc:%d ce:%d/%d/%d time:%"PRId64" tincr:%d\n",
                    s->qscale, s->f_code, s->b_code,
                    s->pict_type == AV_PICTURE_TYPE_I ? "I" : (s->pict_type == AV_PICTURE_TYPE_P ? "P" : (s->pict_type == AV_PICTURE_TYPE_B ? "B" : "S")),
-                   gb->size_in_bits, s->progressive_sequence, s->alternate_scan,
+                   gb->size_in_bits,s->progressive_sequence, s->alternate_scan,
                    s->top_field_first, s->quarter_sample ? "q" : "h",
                    s->data_partitioning, ctx->resync_marker,
                    ctx->num_sprite_warping_points, s->sprite_warping_accuracy,
                    1 - s->no_rounding, s->vo_type,
                    ctx->vol_control_parameters ? " VOLC" : " ", ctx->intra_dc_threshold,
                    ctx->cplx_estimation_trash_i, ctx->cplx_estimation_trash_p,
-                   ctx->cplx_estimation_trash_b);
+                   ctx->cplx_estimation_trash_b,
+                   s->time,
+                   time_increment
+                  );
         }
 
         if (!ctx->scalability) {
@@ -2317,6 +2518,7 @@ int ff_mpeg4_decode_picture_header(Mpeg4DecContext *ctx, GetBitContext *gb)
 {
     MpegEncContext *s = &ctx->m;
     unsigned startcode, v;
+    int ret;
 
     /* search next start code */
     align_get_bits(gb);
@@ -2331,8 +2533,8 @@ int ff_mpeg4_decode_picture_header(Mpeg4DecContext *ctx, GetBitContext *gb)
     for (;;) {
         if (get_bits_count(gb) >= gb->size_in_bits) {
             if (gb->size_in_bits == 8 &&
-                (ctx->divx_version >= 0 || ctx->xvid_build >= 0)) {
-                av_log(s->avctx, AV_LOG_WARNING, "frame skip %d\n", gb->size_in_bits);
+                (ctx->divx_version >= 0 || ctx->xvid_build >= 0) || s->codec_tag == AV_RL32("QMP4")) {
+                av_log(s->avctx, AV_LOG_VERBOSE, "frame skip %d\n", gb->size_in_bits);
                 return FRAME_SKIPPED;  // divx bug
             } else
                 return -1;  // end of stream
@@ -2405,8 +2607,8 @@ int ff_mpeg4_decode_picture_header(Mpeg4DecContext *ctx, GetBitContext *gb)
         }
 
         if (startcode >= 0x120 && startcode <= 0x12F) {
-            if (decode_vol_header(ctx, gb) < 0)
-                return -1;
+            if ((ret = decode_vol_header(ctx, gb)) < 0)
+                return ret;
         } else if (startcode == USER_DATA_STARTCODE) {
             decode_user_data(ctx, gb);
         } else if (startcode == GOP_STARTCODE) {
@@ -2426,64 +2628,33 @@ end:
         s->low_delay = 1;
     s->avctx->has_b_frames = !s->low_delay;
 
-    if (s->workaround_bugs & FF_BUG_AUTODETECT) {
-        if (s->codec_tag == AV_RL32("XVIX"))
-            s->workaround_bugs |= FF_BUG_XVID_ILACE;
-
-        if (s->codec_tag == AV_RL32("UMP4"))
-            s->workaround_bugs |= FF_BUG_UMP4;
-
-        if (ctx->divx_version >= 500 && ctx->divx_build < 1814)
-            s->workaround_bugs |= FF_BUG_QPEL_CHROMA;
-
-        if (ctx->divx_version > 502 && ctx->divx_build < 1814)
-            s->workaround_bugs |= FF_BUG_QPEL_CHROMA2;
-
-        if (ctx->xvid_build <= 3U)
-            s->padding_bug_score = 256 * 256 * 256 * 64;
-
-        if (ctx->xvid_build <= 1U)
-            s->workaround_bugs |= FF_BUG_QPEL_CHROMA;
-
-        if (ctx->xvid_build <= 12U)
-            s->workaround_bugs |= FF_BUG_EDGE;
-
-        if (ctx->xvid_build <= 32U)
-            s->workaround_bugs |= FF_BUG_DC_CLIP;
-
-        if (ctx->lavc_build < 4653U)
-            s->workaround_bugs |= FF_BUG_STD_QPEL;
-
-        if (ctx->lavc_build < 4655U)
-            s->workaround_bugs |= FF_BUG_DIRECT_BLOCKSIZE;
-
-        if (ctx->lavc_build < 4670U)
-            s->workaround_bugs |= FF_BUG_EDGE;
-
-        if (ctx->lavc_build <= 4712U)
-            s->workaround_bugs |= FF_BUG_DC_CLIP;
-
-        if (ctx->divx_version >= 0)
-            s->workaround_bugs |= FF_BUG_DIRECT_BLOCKSIZE;
-
-        if (ctx->divx_version == 501 && ctx->divx_build == 20020416)
-            s->padding_bug_score = 256 * 256 * 256 * 64;
+    return decode_vop_header(ctx, gb);
+}
 
-        if (ctx->divx_version < 500U)
-            s->workaround_bugs |= FF_BUG_EDGE;
+av_cold void ff_mpeg4videodec_static_init(void) {
+    static int done = 0;    /* nonzero once the tables below have been built; plain int, no locking visible here */
 
-        if (ctx->divx_version >= 0)
-            s->workaround_bugs |= FF_BUG_HPEL_CHROMA;
+    if (!done) {            /* only the first call does the initialization */
+        ff_rl_init(&ff_mpeg4_rl_intra, ff_mpeg4_static_rl_table_store[0]);
+        ff_rl_init(&ff_rvlc_rl_inter, ff_mpeg4_static_rl_table_store[1]);
+        ff_rl_init(&ff_rvlc_rl_intra, ff_mpeg4_static_rl_table_store[2]);
+        INIT_VLC_RL(ff_mpeg4_rl_intra, 554);
+        INIT_VLC_RL(ff_rvlc_rl_inter, 1072);
+        INIT_VLC_RL(ff_rvlc_rl_intra, 1072);
+        INIT_VLC_STATIC(&dc_lum, DC_VLC_BITS, 10 /* 13 */,
+                        &ff_mpeg4_DCtab_lum[0][1], 2, 1,
+                        &ff_mpeg4_DCtab_lum[0][0], 2, 1, 512);
+        INIT_VLC_STATIC(&dc_chrom, DC_VLC_BITS, 10 /* 13 */,
+                        &ff_mpeg4_DCtab_chrom[0][1], 2, 1,
+                        &ff_mpeg4_DCtab_chrom[0][0], 2, 1, 512);
+        INIT_VLC_STATIC(&sprite_trajectory, SPRITE_TRAJ_VLC_BITS, 15,
+                        &ff_sprite_trajectory_tab[0][1], 4, 2,
+                        &ff_sprite_trajectory_tab[0][0], 4, 2, 128);
+        INIT_VLC_STATIC(&mb_type_b_vlc, MB_TYPE_B_VLC_BITS, 4,
+                        &ff_mb_type_b_tab[0][1], 2, 1,
+                        &ff_mb_type_b_tab[0][0], 2, 1, 16);
+        done = 1;           /* later calls return without touching the tables */
     }
-
-
-    if (s->avctx->debug & FF_DEBUG_BUGS)
-        av_log(s->avctx, AV_LOG_DEBUG,
-               "bugs: %X lavc_build:%d xvid_build:%d divx_version:%d divx_build:%d %s\n",
-               s->workaround_bugs, ctx->lavc_build, ctx->xvid_build,
-               ctx->divx_version, ctx->divx_build, s->divx_packed ? "p" : "");
-
-    return decode_vop_header(ctx, gb);
 }
 
 int ff_mpeg4_frame_end(AVCodecContext *avctx, const uint8_t *buf, int buf_size)
@@ -2492,34 +2663,40 @@ int ff_mpeg4_frame_end(AVCodecContext *avctx, const uint8_t *buf, int buf_size)
     MpegEncContext    *s = &ctx->m;
 
     /* divx 5.01+ bitstream reorder stuff */
+    /* Since this clobbers the input buffer and hwaccel codecs still need the
+     * data during hwaccel->end_frame we should not do this any earlier */
     if (s->divx_packed) {
-        int current_pos     = get_bits_count(&s->gb) >> 3;
+        int current_pos     = s->gb.buffer == s->bitstream_buffer ? 0 : (get_bits_count(&s->gb) >> 3);
         int startcode_found = 0;
 
-        if (buf_size - current_pos > 5) {
+        if (buf_size - current_pos > 7) {
+
             int i;
-            for (i = current_pos; i < buf_size - 3; i++)
+            for (i = current_pos; i < buf_size - 4; i++)
+
                 if (buf[i]     == 0 &&
                     buf[i + 1] == 0 &&
                     buf[i + 2] == 1 &&
                     buf[i + 3] == 0xB6) {
-                    startcode_found = 1;
+                    startcode_found = !(buf[i + 4] & 0x40);
                     break;
                 }
         }
-        if (s->gb.buffer == s->bitstream_buffer && buf_size > 7 &&
-            ctx->xvid_build >= 0) {       // xvid style
-            startcode_found = 1;
-            current_pos     = 0;
-        }
 
         if (startcode_found) {
-            av_fast_malloc(&s->bitstream_buffer,
+            if (!ctx->showed_packed_warning) {
+                av_log(s->avctx, AV_LOG_INFO, "Video uses a non-standard and "
+                       "wasteful way to store B-frames ('packed B-frames'). "
+                       "Consider using the mpeg4_unpack_bframes bitstream filter without encoding but stream copy to fix it.\n");
+                ctx->showed_packed_warning = 1;
+            }
+            av_fast_padded_malloc(&s->bitstream_buffer,
                            &s->allocated_bitstream_buffer_size,
-                           buf_size - current_pos +
-                           AV_INPUT_BUFFER_PADDING_SIZE);
-            if (!s->bitstream_buffer)
+                           buf_size - current_pos);
+            if (!s->bitstream_buffer) {
+                s->bitstream_buffer_size = 0;
                 return AVERROR(ENOMEM);
+            }
             memcpy(s->bitstream_buffer, buf + current_pos,
                    buf_size - current_pos);
             s->bitstream_buffer_size = buf_size - current_pos;
@@ -2529,6 +2706,7 @@ int ff_mpeg4_frame_end(AVCodecContext *avctx, const uint8_t *buf, int buf_size)
     return 0;
 }
 
+#if HAVE_THREADS
 static int mpeg4_update_thread_context(AVCodecContext *dst,
                                        const AVCodecContext *src)
 {
@@ -2541,22 +2719,20 @@ static int mpeg4_update_thread_context(AVCodecContext *dst,
     if (ret < 0)
         return ret;
 
+    memcpy(((uint8_t*)s) + sizeof(MpegEncContext), ((uint8_t*)s1) + sizeof(MpegEncContext), sizeof(Mpeg4DecContext) - sizeof(MpegEncContext));
+
     if (CONFIG_MPEG4_DECODER && !init && s1->xvid_build >= 0)
         ff_xvid_idct_init(&s->m.idsp, dst);
 
-    s->shape               = s1->shape;
-    s->time_increment_bits = s1->time_increment_bits;
-    s->xvid_build          = s1->xvid_build;
-
     return 0;
 }
+#endif
 
 static av_cold int decode_init(AVCodecContext *avctx)
 {
     Mpeg4DecContext *ctx = avctx->priv_data;
     MpegEncContext *s = &ctx->m;
     int ret;
-    static int done = 0;
 
     ctx->divx_version =
     ctx->divx_build   =
@@ -2566,28 +2742,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
     if ((ret = ff_h263_decode_init(avctx)) < 0)
         return ret;
 
-    if (!done) {
-        done = 1;
-
-        ff_rl_init(&ff_mpeg4_rl_intra, ff_mpeg4_static_rl_table_store[0]);
-        ff_rl_init(&ff_rvlc_rl_inter, ff_mpeg4_static_rl_table_store[1]);
-        ff_rl_init(&ff_rvlc_rl_intra, ff_mpeg4_static_rl_table_store[2]);
-        INIT_VLC_RL(ff_mpeg4_rl_intra, 554);
-        INIT_VLC_RL(ff_rvlc_rl_inter, 1072);
-        INIT_VLC_RL(ff_rvlc_rl_intra, 1072);
-        INIT_VLC_STATIC(&dc_lum, DC_VLC_BITS, 10 /* 13 */,
-                        &ff_mpeg4_DCtab_lum[0][1], 2, 1,
-                        &ff_mpeg4_DCtab_lum[0][0], 2, 1, 512);
-        INIT_VLC_STATIC(&dc_chrom, DC_VLC_BITS, 10 /* 13 */,
-                        &ff_mpeg4_DCtab_chrom[0][1], 2, 1,
-                        &ff_mpeg4_DCtab_chrom[0][0], 2, 1, 512);
-        INIT_VLC_STATIC(&sprite_trajectory, SPRITE_TRAJ_VLC_BITS, 15,
-                        &ff_sprite_trajectory_tab[0][1], 4, 2,
-                        &ff_sprite_trajectory_tab[0][0], 4, 2, 128);
-        INIT_VLC_STATIC(&mb_type_b_vlc, MB_TYPE_B_VLC_BITS, 4,
-                        &ff_mb_type_b_tab[0][1], 2, 1,
-                        &ff_mb_type_b_tab[0][0], 2, 1, 16);
-    }
+    ff_mpeg4videodec_static_init();
 
     s->h263_pred = 1;
     s->low_delay = 0; /* default, might be overridden in the vol header during header parsing */
@@ -2600,6 +2755,19 @@ static av_cold int decode_init(AVCodecContext *avctx)
     return 0;
 }
 
+static const AVOption mpeg4_options[] = { /* boolean (0..1, default 0) options; offsets are taken within MpegEncContext */
+    {"quarter_sample", "1/4 subpel MC", offsetof(MpegEncContext, quarter_sample), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, 0},
+    {"divx_packed", "divx style packed b frames", offsetof(MpegEncContext, divx_packed), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, 0},
+    {NULL}
+};
+
+static const AVClass mpeg4_class = {
+    .class_name = "MPEG4 Video Decoder",
+    .item_name  = av_default_item_name,
+    .option     = mpeg4_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_mpeg4_decoder = {
     .name                  = "mpeg4",
     .long_name             = NULL_IF_CONFIG_SMALL("MPEG-4 part 2"),
@@ -2613,7 +2781,35 @@ AVCodec ff_mpeg4_decoder = {
                              AV_CODEC_CAP_TRUNCATED | AV_CODEC_CAP_DELAY |
                              AV_CODEC_CAP_FRAME_THREADS,
     .flush                 = ff_mpeg_flush,
+    .max_lowres            = 3,
     .pix_fmts              = ff_h263_hwaccel_pixfmt_list_420,
     .profiles              = NULL_IF_CONFIG_SMALL(ff_mpeg4_video_profiles),
     .update_thread_context = ONLY_IF_THREADS_ENABLED(mpeg4_update_thread_context),
+    .priv_class = &mpeg4_class,
+};
+
+
+#if CONFIG_MPEG4_VDPAU_DECODER && FF_API_VDPAU
+static const AVClass mpeg4_vdpau_class = {
+    .class_name = "MPEG4 Video VDPAU Decoder",
+    .item_name  = av_default_item_name,
+    .option     = mpeg4_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_mpeg4_vdpau_decoder = {
+    .name           = "mpeg4_vdpau",
+    .long_name      = NULL_IF_CONFIG_SMALL("MPEG-4 part 2 (VDPAU)"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MPEG4,
+    .priv_class     = &mpeg4_vdpau_class,
+    .priv_data_size = sizeof(Mpeg4DecContext),
+    .init           = decode_init,
+    .decode         = ff_h263_decode_frame,
+    .close          = ff_h263_decode_end,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_TRUNCATED |
+                      AV_CODEC_CAP_DELAY | AV_CODEC_CAP_HWACCEL_VDPAU,
+    .pix_fmts       = (const enum AVPixelFormat[]) {
+        AV_PIX_FMT_VDPAU_MPEG4, AV_PIX_FMT_NONE },
+};
+#endif
diff --git a/libavcodec/mpeg4videoenc.c b/libavcodec/mpeg4videoenc.c
index b694935..3149310 100644
--- a/libavcodec/mpeg4videoenc.c
+++ b/libavcodec/mpeg4videoenc.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000,2001 Fabrice Bellard
  * Copyright (c) 2002-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -278,19 +278,19 @@ static inline void mpeg4_encode_dc(PutBitContext *s, int level, int n)
 
     if (n < 4) {
         /* luminance */
-        put_bits(&s->pb, ff_mpeg4_DCtab_lum[size][1], ff_mpeg4_DCtab_lum[size][0]);
+        put_bits(s, ff_mpeg4_DCtab_lum[size][1], ff_mpeg4_DCtab_lum[size][0]);
     } else {
         /* chrominance */
-        put_bits(&s->pb, ff_mpeg4_DCtab_chrom[size][1], ff_mpeg4_DCtab_chrom[size][0]);
+        put_bits(s, ff_mpeg4_DCtab_chrom[size][1], ff_mpeg4_DCtab_chrom[size][0]);
     }
 
     /* encode remaining bits */
     if (size > 0) {
         if (level < 0)
             level = (-level) ^ ((1 << size) - 1);
-        put_bits(&s->pb, size, level);
+        put_bits(s, size, level);
         if (size > 8)
-            put_bits(&s->pb, 1, 1);
+            put_bits(s, 1, 1);
     }
 #endif
 }
@@ -525,9 +525,9 @@ void ff_mpeg4_encode_mb(MpegEncContext *s, int16_t block[6][64],
                     s->last_mv[i][1][1] = 0;
             }
 
-            assert(s->dquant >= -2 && s->dquant <= 2);
-            assert((s->dquant & 1) == 0);
-            assert(mb_type >= 0);
+            av_assert2(s->dquant >= -2 && s->dquant <= 2);
+            av_assert2((s->dquant & 1) == 0);
+            av_assert2(mb_type >= 0);
 
             /* nothing to do if this MB was skipped in the next P-frame */
             if (s->next_picture.mbskip_table[s->mb_y * s->mb_stride + s->mb_x]) {  // FIXME avoid DCT & ...
@@ -547,7 +547,7 @@ void ff_mpeg4_encode_mb(MpegEncContext *s, int16_t block[6][64],
 
             if ((cbp | motion_x | motion_y | mb_type) == 0) {
                 /* direct MB with MV={0,0} */
-                assert(s->dquant == 0);
+                av_assert2(s->dquant == 0);
 
                 put_bits(&s->pb, 1, 1); /* mb not coded modb1=1 */
 
@@ -584,12 +584,12 @@ void ff_mpeg4_encode_mb(MpegEncContext *s, int16_t block[6][64],
                 s->misc_bits += get_bits_diff(s);
 
             if (!mb_type) {
-                assert(s->mv_dir & MV_DIRECT);
+                av_assert2(s->mv_dir & MV_DIRECT);
                 ff_h263_encode_motion_vector(s, motion_x, motion_y, 1);
                 s->b_count++;
                 s->f_count++;
             } else {
-                assert(mb_type > 0 && mb_type < 4);
+                av_assert2(mb_type > 0 && mb_type < 4);
                 if (s->mv_type != MV_TYPE_FIELD) {
                     if (s->mv_dir & MV_DIR_FORWARD) {
                         ff_h263_encode_motion_vector(s,
@@ -669,10 +669,6 @@ void ff_mpeg4_encode_mb(MpegEncContext *s, int16_t block[6][64],
 
                     x = s->mb_x * 16;
                     y = s->mb_y * 16;
-                    if (x + 16 > s->width)
-                        x = s->width - 16;
-                    if (y + 16 > s->height)
-                        y = s->height - 16;
 
                     offset = x + y * s->linesize;
                     p_pic  = s->new_picture.f->data[0] + offset;
@@ -689,7 +685,21 @@ void ff_mpeg4_encode_mb(MpegEncContext *s, int16_t block[6][64],
                         b_pic = pic->f->data[0] + offset;
                         if (!pic->shared)
                             b_pic += INPLACE_OFFSET;
-                        diff = s->mecc.sad[0](NULL, p_pic, b_pic, s->linesize, 16);
+
+                        if (x + 16 > s->width || y + 16 > s->height) {
+                            int x1, y1;
+                            int xe = FFMIN(16, s->width - x);
+                            int ye = FFMIN(16, s->height - y);
+                            diff = 0;
+                            for (y1 = 0; y1 < ye; y1++) {
+                                for (x1 = 0; x1 < xe; x1++) {
+                                    diff += FFABS(p_pic[x1 + y1 * s->linesize] - b_pic[x1 + y1 * s->linesize]);
+                                }
+                            }
+                            diff = diff * 256 / (xe * ye);
+                        } else {
+                            diff = s->mecc.sad[0](NULL, p_pic, b_pic, s->linesize, 16);
+                        }
                         if (diff > s->qscale * 70) {  // FIXME check that 70 is optimal
                             s->mb_skipped = 0;
                             break;
@@ -754,7 +764,7 @@ void ff_mpeg4_encode_mb(MpegEncContext *s, int16_t block[6][64],
                 if (s->dquant)
                     put_bits(pb2, 2, dquant_code[s->dquant + 2]);
 
-                assert(!s->progressive_sequence);
+                av_assert2(!s->progressive_sequence);
                 if (cbp)
                     put_bits(pb2, 1, s->interlaced_dct);
                 put_bits(pb2, 1, 1);
@@ -778,7 +788,7 @@ void ff_mpeg4_encode_mb(MpegEncContext *s, int16_t block[6][64],
                                              s->mv[0][1][1] - pred_y,
                                              s->f_code);
             } else {
-                assert(s->mv_type == MV_TYPE_8X8);
+                av_assert2(s->mv_type == MV_TYPE_8X8);
                 put_bits(&s->pb,
                          ff_h263_inter_MCBPC_bits[cbpc + 16],
                          ff_h263_inter_MCBPC_code[cbpc + 16]);
@@ -894,7 +904,7 @@ void ff_set_mpeg4_time(MpegEncContext *s)
         ff_mpeg4_init_direct_mv(s);
     } else {
         s->last_time_base = s->time_base;
-        s->time_base      = s->time / s->avctx->time_base.den;
+        s->time_base      = FFUDIV(s->time, s->avctx->time_base.den);
     }
 }
 
@@ -910,13 +920,12 @@ static void mpeg4_encode_gop_header(MpegEncContext *s)
     if (s->reordered_input_picture[1])
         time = FFMIN(time, s->reordered_input_picture[1]->f->pts);
     time = time * s->avctx->time_base.num;
+    s->last_time_base = FFUDIV(time, s->avctx->time_base.den);
 
-    seconds  = time / s->avctx->time_base.den;
-    minutes  = seconds / 60;
-    seconds %= 60;
-    hours    = minutes / 60;
-    minutes %= 60;
-    hours   %= 24;
+    seconds = FFUDIV(time, s->avctx->time_base.den);
+    minutes = FFUDIV(seconds, 60); seconds = FFUMOD(seconds, 60);
+    hours   = FFUDIV(minutes, 60); minutes = FFUMOD(minutes, 60);
+    hours   = FFUMOD(hours  , 24);
 
     put_bits(&s->pb, 5, hours);
     put_bits(&s->pb, 6, minutes);
@@ -926,8 +935,6 @@ static void mpeg4_encode_gop_header(MpegEncContext *s)
     put_bits(&s->pb, 1, !!(s->avctx->flags & AV_CODEC_FLAG_CLOSED_GOP));
     put_bits(&s->pb, 1, 0);  // broken link == NO
 
-    s->last_time_base = time / s->avctx->time_base.den;
-
     ff_mpeg4_stuffing(&s->pb);
 }
 
@@ -1011,6 +1018,8 @@ static void mpeg4_encode_vol_header(MpegEncContext *s,
 
     put_bits(&s->pb, 4, s->aspect_ratio_info); /* aspect ratio info */
     if (s->aspect_ratio_info == FF_ASPECT_EXTENDED) {
+        av_reduce(&s->avctx->sample_aspect_ratio.num, &s->avctx->sample_aspect_ratio.den,
+                   s->avctx->sample_aspect_ratio.num,  s->avctx->sample_aspect_ratio.den, 255);
         put_bits(&s->pb, 8, s->avctx->sample_aspect_ratio.num);
         put_bits(&s->pb, 8, s->avctx->sample_aspect_ratio.den);
     }
@@ -1077,10 +1086,10 @@ static void mpeg4_encode_vol_header(MpegEncContext *s,
 }
 
 /* write MPEG-4 VOP header */
-void ff_mpeg4_encode_picture_header(MpegEncContext *s, int picture_number)
+int ff_mpeg4_encode_picture_header(MpegEncContext *s, int picture_number)
 {
-    int time_incr;
-    int time_div, time_mod;
+    uint64_t time_incr;
+    int64_t time_div, time_mod;
 
     if (s->pict_type == AV_PICTURE_TYPE_I) {
         if (!(s->avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER)) {
@@ -1099,11 +1108,15 @@ void ff_mpeg4_encode_picture_header(MpegEncContext *s, int picture_number)
     put_bits(&s->pb, 16, VOP_STARTCODE);    /* vop header */
     put_bits(&s->pb, 2, s->pict_type - 1);  /* pict type: I = 0 , P = 1 */
 
-    assert(s->time >= 0);
-    time_div  = s->time / s->avctx->time_base.den;
-    time_mod  = s->time % s->avctx->time_base.den;
+    time_div  = FFUDIV(s->time, s->avctx->time_base.den);
+    time_mod  = FFUMOD(s->time, s->avctx->time_base.den);
     time_incr = time_div - s->last_time_base;
-    assert(time_incr >= 0);
+
+    // This limits the frame duration to max 1 hour
+    if (time_incr > 3600) {
+        av_log(s->avctx, AV_LOG_ERROR, "time_incr %"PRIu64" too large\n", time_incr);
+        return AVERROR(EINVAL);
+    }
     while (time_incr--)
         put_bits(&s->pb, 1, 1);
 
@@ -1129,6 +1142,8 @@ void ff_mpeg4_encode_picture_header(MpegEncContext *s, int picture_number)
         put_bits(&s->pb, 3, s->f_code);  /* fcode_for */
     if (s->pict_type == AV_PICTURE_TYPE_B)
         put_bits(&s->pb, 3, s->b_code);  /* fcode_back */
+
+    return 0;
 }
 
 static av_cold void init_uni_dc_tab(void)
@@ -1191,8 +1206,8 @@ static av_cold void init_uni_mpeg4_rl_tab(RLTable *rl, uint32_t *bits_tab,
 {
     int slevel, run, last;
 
-    assert(MAX_LEVEL >= 64);
-    assert(MAX_RUN >= 63);
+    av_assert0(MAX_LEVEL >= 64);
+    av_assert0(MAX_RUN >= 63);
 
     for (slevel = -64; slevel < 64; slevel++) {
         if (slevel == 0)
@@ -1287,6 +1302,11 @@ static av_cold int encode_init(AVCodecContext *avctx)
     int ret;
     static int done = 0;
 
+    if (avctx->width >= (1<<13) || avctx->height >= (1<<13)) {
+        av_log(avctx, AV_LOG_ERROR, "dimensions too large for MPEG-4\n");
+        return AVERROR(EINVAL);
+    }
+
     if ((ret = ff_mpv_encode_init(avctx)) < 0)
         return ret;
 
@@ -1381,8 +1401,8 @@ void ff_mpeg4_encode_video_packet_header(MpegEncContext *s)
 #define OFFSET(x) offsetof(MpegEncContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
-    { "data_partitioning", "Use data partitioning.",      OFFSET(data_partitioning), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
-    { "alternate_scan",    "Enable alternate scantable.", OFFSET(alternate_scan),    AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
+    { "data_partitioning", "Use data partitioning.",      OFFSET(data_partitioning), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
+    { "alternate_scan",    "Enable alternate scantable.", OFFSET(alternate_scan),    AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
     FF_MPV_COMMON_OPTS
     { NULL },
 };
diff --git a/libavcodec/mpeg_er.c b/libavcodec/mpeg_er.c
index 9410b27..dd87ae9 100644
--- a/libavcodec/mpeg_er.c
+++ b/libavcodec/mpeg_er.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@ static void set_erpic(ERPicture *dst, Picture *src)
 {
     int i;
 
+    memset(dst, 0, sizeof(*dst));
     if (!src) {
         dst->f  = NULL;
         dst->tf = NULL;
diff --git a/libavcodec/mpeg_er.h b/libavcodec/mpeg_er.h
index ca1ea90..bb627a4 100644
--- a/libavcodec/mpeg_er.h
+++ b/libavcodec/mpeg_er.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpegaudio.c b/libavcodec/mpegaudio.c
index 1a83635..cba5299 100644
--- a/libavcodec/mpegaudio.c
+++ b/libavcodec/mpegaudio.c
@@ -2,20 +2,20 @@
  * MPEG Audio common code
  * Copyright (c) 2001, 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpegaudio.h b/libavcodec/mpegaudio.h
index b556801..1591a17 100644
--- a/libavcodec/mpegaudio.h
+++ b/libavcodec/mpegaudio.h
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,8 +26,8 @@
 #ifndef AVCODEC_MPEGAUDIO_H
 #define AVCODEC_MPEGAUDIO_H
 
-#ifndef CONFIG_FLOAT
-#   define CONFIG_FLOAT 0
+#ifndef USE_FLOATS
+#   define USE_FLOATS 0
 #endif
 
 #include <stdint.h>
@@ -52,11 +52,13 @@
 #define WFRAC_BITS  16   /* fractional bits for window */
 #endif
 
+#define IMDCT_SCALAR 1.759
+
 #define FRAC_ONE    (1 << FRAC_BITS)
 
 #define FIX(a)   ((int)((a) * FRAC_ONE))
 
-#if CONFIG_FLOAT
+#if USE_FLOATS
 #   define INTFLOAT float
 typedef float MPA_INT;
 typedef float OUT_INT;
diff --git a/libavcodec/mpegaudio_parser.c b/libavcodec/mpegaudio_parser.c
index c44c024..873f941 100644
--- a/libavcodec/mpegaudio_parser.c
+++ b/libavcodec/mpegaudio_parser.c
@@ -3,27 +3,27 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "parser.h"
 #include "mpegaudiodecheader.h"
 #include "libavutil/common.h"
-
+#include "libavformat/id3v1.h" // for ID3v1_TAG_SIZE
 
 typedef struct MpegAudioParseContext {
     ParseContext pc;
@@ -35,7 +35,7 @@ typedef struct MpegAudioParseContext {
 
 #define MPA_HEADER_SIZE 4
 
-/* header + layer + bitrate + freq + lsf/mpeg25 */
+/* header + layer + freq + lsf/mpeg25 */
 #define SAME_HEADER_MASK \
    (0xffe00000 | (3 << 17) | (3 << 10) | (3 << 19))
 
@@ -49,12 +49,14 @@ static int mpegaudio_parse(AVCodecParserContext *s1,
     uint32_t state= pc->state;
     int i;
     int next= END_NOT_FOUND;
+    int flush = !buf_size;
 
     for(i=0; i<buf_size; ){
         if(s->frame_size){
             int inc= FFMIN(buf_size - i, s->frame_size);
             i += inc;
             s->frame_size -= inc;
+            state = 0;
 
             if(!s->frame_size){
                 next= i;
@@ -67,24 +69,26 @@ static int mpegaudio_parse(AVCodecParserContext *s1,
 
                 state= (state<<8) + buf[i++];
 
-                ret = ff_mpa_decode_header(avctx, state, &sr, &channels, &frame_size, &bit_rate);
+                ret = ff_mpa_decode_header(state, &sr, &channels, &frame_size, &bit_rate, &codec_id);
                 if (ret < 4) {
                     if (i > 4)
                         s->header_count = -2;
                 } else {
+                    int header_threshold = avctx->codec_id != AV_CODEC_ID_NONE && avctx->codec_id != codec_id;
                     if((state&SAME_HEADER_MASK) != (s->header&SAME_HEADER_MASK) && s->header)
                         s->header_count= -3;
                     s->header= state;
                     s->header_count++;
                     s->frame_size = ret-4;
 
-                    if (s->header_count > 0) {
+                    if (s->header_count > header_threshold) {
                         avctx->sample_rate= sr;
                         avctx->channels   = channels;
                         s1->duration      = frame_size;
+                        avctx->codec_id   = codec_id;
                         if (s->no_bitrate || !avctx->bit_rate) {
                             s->no_bitrate = 1;
-                            avctx->bit_rate += (bit_rate - avctx->bit_rate) / s->header_count;
+                            avctx->bit_rate += (bit_rate - avctx->bit_rate) / (s->header_count - header_threshold);
                         }
                     }
 
@@ -110,6 +114,12 @@ static int mpegaudio_parse(AVCodecParserContext *s1,
         return buf_size;
     }
 
+    if (flush && buf_size >= ID3v1_TAG_SIZE && memcmp(buf, "TAG", 3) == 0) {
+        *poutbuf = NULL;
+        *poutbuf_size = 0;
+        return next;
+    }
+
     *poutbuf = buf;
     *poutbuf_size = buf_size;
     return next;
diff --git a/libavcodec/mpegaudio_tablegen.c b/libavcodec/mpegaudio_tablegen.c
index b4c240b..ede7c8e 100644
--- a/libavcodec/mpegaudio_tablegen.c
+++ b/libavcodec/mpegaudio_tablegen.c
@@ -3,25 +3,26 @@
  *
  * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <stdlib.h>
 #define CONFIG_HARDCODED_TABLES 0
+#include "libavutil/tablegen.h"
 #include "mpegaudio_tablegen.h"
 #include "tableprint.h"
 
diff --git a/libavcodec/mpegaudio_tablegen.h b/libavcodec/mpegaudio_tablegen.h
index 8a3e51a..0b0ea40 100644
--- a/libavcodec/mpegaudio_tablegen.h
+++ b/libavcodec/mpegaudio_tablegen.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,7 @@
 
 #include <stdint.h>
 #include <math.h>
+#include "libavutil/attributes.h"
 
 #define TABLE_4_3_SIZE (8191 + 16)*4
 #if CONFIG_HARDCODED_TABLES
@@ -39,18 +40,33 @@ static float exp_table_float[512];
 static float expval_table_float[512][16];
 
 #define FRAC_BITS 23
+#define IMDCT_SCALAR 1.759
 
-static void mpegaudio_tableinit(void)
+static av_cold void mpegaudio_tableinit(void)
 {
     int i, value, exponent;
+    static const double exp2_lut[4] = {
+        1.00000000000000000000, /* 2 ^ (0 * 0.25) */
+        1.18920711500272106672, /* 2 ^ (1 * 0.25) */
+        M_SQRT2               , /* 2 ^ (2 * 0.25) */
+        1.68179283050742908606, /* 2 ^ (3 * 0.25) */
+    };
+    static double pow43_lut[16];
+    double exp2_base = 2.11758236813575084767080625169910490512847900390625e-22; // 2^(-72)
+    double exp2_val;
+    double pow43_val = 0;
+    for (i = 0; i < 16; ++i)
+        pow43_lut[i] = i * cbrt(i);
+
     for (i = 1; i < TABLE_4_3_SIZE; i++) {
-        double value = i / 4;
         double f, fm;
         int e, m;
-        /* cbrtf() isn't available on all systems, so we use powf(). */
-        f  = value * powf(value, 1.0 / 3.0) * pow(2, (i & 3) * 0.25);
+        double value = i / 4;
+        if ((i & 3) == 0)
+            pow43_val = value / IMDCT_SCALAR * cbrt(value);
+        f  = pow43_val * exp2_lut[i & 3];
         fm = frexp(f, &e);
-        m  = (uint32_t)(fm * (1LL << 31) + 0.5);
+        m  = llrint(fm * (1LL << 31));
         e += FRAC_BITS - 31 + 5 - 100;
 
         /* normalized to FRAC_BITS */
@@ -58,11 +74,12 @@ static void mpegaudio_tableinit(void)
         table_4_3_exp[i]   = -e;
     }
     for (exponent = 0; exponent < 512; exponent++) {
+        if (exponent && (exponent & 3) == 0)
+            exp2_base *= 2;
+        exp2_val = exp2_base * exp2_lut[exponent & 3] / IMDCT_SCALAR;
         for (value = 0; value < 16; value++) {
-            /* cbrtf() isn't available on all systems, so we use powf(). */
-            double f = (double)value * powf(value, 1.0 / 3.0) * pow(2, (exponent - 400) * 0.25 + FRAC_BITS + 5);
-            /* llrint() isn't always available, so round and cast manually. */
-            expval_table_fixed[exponent][value] = (long long int) (f >= 0 ? floor(f + 0.5) : ceil(f - 0.5));
+            double f = pow43_lut[value] * exp2_val;
+            expval_table_fixed[exponent][value] = (f < 0xFFFFFFFF ? llrint(f) : 0xFFFFFFFF);
             expval_table_float[exponent][value] = f;
         }
         exp_table_fixed[exponent] = expval_table_fixed[exponent][1];
diff --git a/libavcodec/mpegaudiodata.c b/libavcodec/mpegaudiodata.c
index 009a02a..0569281 100644
--- a/libavcodec/mpegaudiodata.c
+++ b/libavcodec/mpegaudiodata.c
@@ -2,20 +2,20 @@
  * MPEG Audio common tables
  * copyright (c) 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,11 +29,11 @@
 
 const uint16_t avpriv_mpa_bitrate_tab[2][3][15] = {
     { {0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448 },
-      {0, 32, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320, 384 },
-      {0, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320 } },
-    { {0, 32, 48, 56, 64, 80, 96, 112, 128, 144, 160, 176, 192, 224, 256},
-      {0, 8, 16, 24, 32, 40, 48, 56, 64, 80, 96, 112, 128, 144, 160},
-      {0, 8, 16, 24, 32, 40, 48, 56, 64, 80, 96, 112, 128, 144, 160}
+      {0, 32, 48, 56,  64,  80,  96, 112, 128, 160, 192, 224, 256, 320, 384 },
+      {0, 32, 40, 48,  56,  64,  80,  96, 112, 128, 160, 192, 224, 256, 320 } },
+    { {0, 32, 48, 56,  64,  80,  96, 112, 128, 144, 160, 176, 192, 224, 256},
+      {0,  8, 16, 24,  32,  40,  48,  56,  64,  80,  96, 112, 128, 144, 160},
+      {0,  8, 16, 24,  32,  40,  48,  56,  64,  80,  96, 112, 128, 144, 160}
     }
 };
 
diff --git a/libavcodec/mpegaudiodata.h b/libavcodec/mpegaudiodata.h
index 2b8ff65..29a2658 100644
--- a/libavcodec/mpegaudiodata.h
+++ b/libavcodec/mpegaudiodata.h
@@ -2,20 +2,20 @@
  * MPEG Audio common tables
  * copyright (c) 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpegaudiodec_fixed.c b/libavcodec/mpegaudiodec_fixed.c
index 2db1e18..9421ffb 100644
--- a/libavcodec/mpegaudiodec_fixed.c
+++ b/libavcodec/mpegaudiodec_fixed.c
@@ -1,27 +1,27 @@
 /*
  * Fixed-point MPEG audio decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "config.h"
 #include "libavutil/samplefmt.h"
 
-#define CONFIG_FLOAT 0
+#define USE_FLOATS 0
 
 #include "mpegaudio.h"
 
diff --git a/libavcodec/mpegaudiodec_float.c b/libavcodec/mpegaudiodec_float.c
index 7bdfd90..ddfa5e0 100644
--- a/libavcodec/mpegaudiodec_float.c
+++ b/libavcodec/mpegaudiodec_float.c
@@ -2,27 +2,27 @@
  * Float MPEG Audio decoder
  * Copyright (c) 2010 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "config.h"
 #include "libavutil/samplefmt.h"
 
-#define CONFIG_FLOAT 1
+#define USE_FLOATS 1
 
 #include "mpegaudio.h"
 
@@ -46,6 +46,7 @@ AVCodec ff_mp1float_decoder = {
     .id             = AV_CODEC_ID_MP1,
     .priv_data_size = sizeof(MPADecodeContext),
     .init           = decode_init,
+    .close          = decode_close,
     .decode         = decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
     .flush          = flush,
@@ -63,6 +64,7 @@ AVCodec ff_mp2float_decoder = {
     .priv_data_size = sizeof(MPADecodeContext),
     .init           = decode_init,
     .decode         = decode_frame,
+    .close          = decode_close,
     .capabilities   = AV_CODEC_CAP_DR1,
     .flush          = flush,
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
@@ -78,6 +80,7 @@ AVCodec ff_mp3float_decoder = {
     .id             = AV_CODEC_ID_MP3,
     .priv_data_size = sizeof(MPADecodeContext),
     .init           = decode_init,
+    .close          = decode_close,
     .decode         = decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
     .flush          = flush,
@@ -94,6 +97,7 @@ AVCodec ff_mp3adufloat_decoder = {
     .id             = AV_CODEC_ID_MP3ADU,
     .priv_data_size = sizeof(MPADecodeContext),
     .init           = decode_init,
+    .close          = decode_close,
     .decode         = decode_frame_adu,
     .capabilities   = AV_CODEC_CAP_DR1,
     .flush          = flush,
diff --git a/libavcodec/mpegaudiodec_template.c b/libavcodec/mpegaudiodec_template.c
index 2518387..91c4051 100644
--- a/libavcodec/mpegaudiodec_template.c
+++ b/libavcodec/mpegaudiodec_template.c
@@ -2,20 +2,20 @@
  * MPEG Audio decoder
  * Copyright (c) 2001, 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,7 @@
 #include "libavutil/avassert.h"
 #include "libavutil/channel_layout.h"
 #include "libavutil/float_dsp.h"
+#include "libavutil/libm.h"
 #include "avcodec.h"
 #include "get_bits.h"
 #include "internal.h"
@@ -84,7 +85,7 @@ typedef struct MPADecodeContext {
     int err_recognition;
     AVCodecContext* avctx;
     MPADSPContext mpadsp;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     AVFrame *frame;
 } MPADecodeContext;
 
@@ -184,6 +185,8 @@ static void compute_band_indexes(MPADecodeContext *s, GranuleDef *g)
 {
     if (g->block_type == 2) {
         if (g->switch_point) {
+            if(s->sample_rate_index == 8)
+                avpriv_request_sample(s->avctx, "switch point in 8khz");
             /* if switched mode, we handle the 36 first samples as
                 long blocks.  For 8000Hz, we handle the 72 first
                 exponents as long blocks */
@@ -213,7 +216,7 @@ static inline int l1_unscale(int n, int mant, int scale_factor)
     shift   = scale_factor_modshift[scale_factor];
     mod     = shift & 3;
     shift >>= 2;
-    val     = MUL64(mant + (-1 << n) + 1, scale_factor_mult[n-1][mod]);
+    val     = MUL64((int)(mant + (-1U << n) + 1), scale_factor_mult[n-1][mod]);
     shift  += n;
     /* NOTE: at this point, 1 <= shift >= 21 + 15 */
     return (int)((val + (1LL << (shift - 1))) >> shift);
@@ -243,7 +246,10 @@ static inline int l3_unscale(int value, int exponent)
     e  = table_4_3_exp  [4 * value + (exponent & 3)];
     m  = table_4_3_value[4 * value + (exponent & 3)];
     e -= exponent >> 2;
-    assert(e >= 1);
+#ifdef DEBUG
+    if(e < 1)
+        av_log(NULL, AV_LOG_WARNING, "l3_unscale: e is %d\n", e);
+#endif
     if (e > 31)
         return 0;
     m = (m + (1 << (e - 1))) >> e;
@@ -307,7 +313,7 @@ static av_cold void decode_init_static(void)
                  INIT_VLC_USE_NEW_STATIC);
         offset += huff_vlc_tables_sizes[i];
     }
-    assert(offset == FF_ARRAY_ELEMS(huff_vlc_tables));
+    av_assert0(offset == FF_ARRAY_ELEMS(huff_vlc_tables));
 
     offset = 0;
     for (i = 0; i < 2; i++) {
@@ -318,7 +324,7 @@ static av_cold void decode_init_static(void)
                  INIT_VLC_USE_NEW_STATIC);
         offset += huff_quad_vlc_tables_sizes[i];
     }
-    assert(offset == FF_ARRAY_ELEMS(huff_quad_vlc_tables));
+    av_assert0(offset == FF_ARRAY_ELEMS(huff_quad_vlc_tables));
 
     for (i = 0; i < 9; i++) {
         k = 0;
@@ -371,7 +377,7 @@ static av_cold void decode_init_static(void)
 
         for (j = 0; j < 2; j++) {
             e = -(j + 1) * ((i + 1) >> 1);
-            f = pow(2.0, e / 4.0);
+            f = exp2(e / 4.0);
             k = i & 1;
             is_table_lsf[j][k ^ 1][i] = FIXR(f);
             is_table_lsf[j][k    ][i] = FIXR(1.0);
@@ -382,11 +388,11 @@ static av_cold void decode_init_static(void)
     }
 
     for (i = 0; i < 8; i++) {
-        float ci, cs, ca;
+        double ci, cs, ca;
         ci = ci_table[i];
         cs = 1.0 / sqrt(1.0 + ci * ci);
         ca = cs * ci;
-#if !CONFIG_FLOAT
+#if !USE_FLOATS
         csa_table[i][0] = FIXHR(cs/4);
         csa_table[i][1] = FIXHR(ca/4);
         csa_table[i][2] = FIXHR(ca/4) + FIXHR(cs/4);
@@ -400,6 +406,16 @@ static av_cold void decode_init_static(void)
     }
 }
 
+#if USE_FLOATS /* s->fdsp is only allocated by decode_init() when USE_FLOATS is set */
+static av_cold int decode_close(AVCodecContext * avctx)
+{
+    MPADecodeContext *s = avctx->priv_data;
+    av_freep(&s->fdsp); /* release the context obtained from avpriv_float_dsp_alloc() in decode_init() */
+
+    return 0;
+}
+#endif /* USE_FLOATS */
+
 static av_cold int decode_init(AVCodecContext * avctx)
 {
     static int initialized_tables = 0;
@@ -412,7 +428,12 @@ static av_cold int decode_init(AVCodecContext * avctx)
 
     s->avctx = avctx;
 
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+#if USE_FLOATS
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->fdsp)
+        return AVERROR(ENOMEM);
+#endif
+
     ff_mpadsp_init(&s->mpadsp);
 
     if (avctx->request_sample_fmt == OUT_FMT &&
@@ -801,7 +822,7 @@ static void switch_buffer(MPADecodeContext *s, int *pos, int *end_pos,
     if (s->in_gb.buffer && *pos >= s->gb.size_in_bits) {
         s->gb           = s->in_gb;
         s->in_gb.buffer = NULL;
-        assert((get_bits_count(&s->gb) & 7) == 0);
+        av_assert2((get_bits_count(&s->gb) & 7) == 0);
         skip_bits_long(&s->gb, *pos - *end_pos);
         *end_pos2 =
         *end_pos  = *end_pos2 + get_bits_count(&s->gb) - *pos;
@@ -809,13 +830,13 @@ static void switch_buffer(MPADecodeContext *s, int *pos, int *end_pos,
     }
 }
 
-/* Following is a optimized code for
+/* The following is optimized code for
             INTFLOAT v = *src
             if(get_bits1(&s->gb))
                 v = -v;
             *dst = v;
 */
-#if CONFIG_FLOAT
+#if USE_FLOATS
 #define READ_FLIP_SIGN(dst,src)                     \
     v = AV_RN32A(src) ^ (get_bits1(&s->gb) << 31);  \
     AV_WN32A(dst, v);
@@ -930,7 +951,7 @@ static int huffman_decode(MPADecodeContext *s, GranuleDef *g,
                 s_index -= 4;
                 skip_bits_long(&s->gb, last_pos - pos);
                 av_log(s->avctx, AV_LOG_INFO, "overread, skip %d enddists: %d %d\n", last_pos - pos, end_pos-pos, end_pos2-pos);
-                if(s->err_recognition & AV_EF_BITSTREAM)
+                if(s->err_recognition & (AV_EF_BITSTREAM|AV_EF_COMPLIANT))
                     s_index=0;
                 break;
             }
@@ -957,10 +978,10 @@ static int huffman_decode(MPADecodeContext *s, GranuleDef *g,
     }
     /* skip extension bits */
     bits_left = end_pos2 - get_bits_count(&s->gb);
-    if (bits_left < 0 && (s->err_recognition & AV_EF_BUFFER)) {
+    if (bits_left < 0 && (s->err_recognition & (AV_EF_BUFFER|AV_EF_COMPLIANT))) {
         av_log(s->avctx, AV_LOG_ERROR, "bits_left=%d\n", bits_left);
         s_index=0;
-    } else if (bits_left > 0 && (s->err_recognition & AV_EF_BUFFER)) {
+    } else if (bits_left > 0 && (s->err_recognition & (AV_EF_BUFFER|AV_EF_AGGRESSIVE))) {
         av_log(s->avctx, AV_LOG_ERROR, "bits_left=%d\n", bits_left);
         s_index = 0;
     }
@@ -1124,8 +1145,8 @@ found2:
         /* ms stereo ONLY */
         /* NOTE: the 1/sqrt(2) normalization factor is included in the
            global gain */
-#if CONFIG_FLOAT
-       s->fdsp.butterflies_float(g0->sb_hybrid, g1->sb_hybrid, 576);
+#if USE_FLOATS
+       s->fdsp->butterflies_float(g0->sb_hybrid, g1->sb_hybrid, 576);
 #else
         tab0 = g0->sb_hybrid;
         tab1 = g1->sb_hybrid;
@@ -1139,7 +1160,18 @@ found2:
     }
 }
 
-#if CONFIG_FLOAT
+#if USE_FLOATS
+#if HAVE_MIPSFPU
+#   include "mips/compute_antialias_float.h"
+#endif /* HAVE_MIPSFPU */
+#else
+#if HAVE_MIPSDSP
+#   include "mips/compute_antialias_fixed.h"
+#endif /* HAVE_MIPSDSP */
+#endif /* USE_FLOATS */
+
+#ifndef compute_antialias
+#if USE_FLOATS
 #define AA(j) do {                                                      \
         float tmp0 = ptr[-1-j];                                         \
         float tmp1 = ptr[   j];                                         \
@@ -1185,6 +1217,7 @@ static void compute_antialias(MPADecodeContext *s, GranuleDef *g)
         ptr += 18;
     }
 }
+#endif /* compute_antialias */
 
 static void compute_imdct(MPADecodeContext *s, GranuleDef *g,
                           INTFLOAT *sb_samples, INTFLOAT *mdct_buf)
@@ -1354,9 +1387,8 @@ static int mp_decode_layer3(MPADecodeContext *s)
     if (!s->adu_mode) {
         int skip;
         const uint8_t *ptr = s->gb.buffer + (get_bits_count(&s->gb)>>3);
-        int extrasize = av_clip(get_bits_left(&s->gb) >> 3, 0,
-                                FFMAX(0, LAST_BUF_SIZE - s->last_buf_size));
-        assert((get_bits_count(&s->gb) & 7) == 0);
+        int extrasize = av_clip(get_bits_left(&s->gb) >> 3, 0, EXTRABYTES);
+        av_assert1((get_bits_count(&s->gb) & 7) == 0);
         /* now we get bits from the main_data_begin offset */
         ff_dlog(s->avctx, "seekback:%d, lastbuf:%d\n",
                 main_data_begin, s->last_buf_size);
@@ -1365,7 +1397,7 @@ static int mp_decode_layer3(MPADecodeContext *s)
         s->in_gb = s->gb;
         init_get_bits(&s->gb, s->last_buf, s->last_buf_size*8);
 #if !UNCHECKED_BITSTREAM_READER
-        s->gb.size_in_bits_plus8 += extrasize * 8;
+        s->gb.size_in_bits_plus8 += FFMAX(extrasize, LAST_BUF_SIZE - s->last_buf_size) * 8;
 #endif
         s->last_buf_size <<= 3;
         for (gr = 0; gr < nb_granules && (s->last_buf_size >> 3) < main_data_begin; gr++) {
@@ -1547,9 +1579,6 @@ static int mp_decode_frame(MPADecodeContext *s, OUT_INT **samples,
     default:
         nb_frames = mp_decode_layer3(s);
 
-        if (nb_frames < 0)
-            return nb_frames;
-
         s->last_buf_size=0;
         if (s->in_gb.buffer) {
             align_get_bits(&s->gb);
@@ -1564,7 +1593,7 @@ static int mp_decode_frame(MPADecodeContext *s, OUT_INT **samples,
         }
 
         align_get_bits(&s->gb);
-        assert((get_bits_count(&s->gb) & 7) == 0);
+        av_assert1((get_bits_count(&s->gb) & 7) == 0);
         i = get_bits_left(&s->gb) >> 3;
 
         if (i < 0 || i > BACKSTEP_SIZE || nb_frames < 0) {
@@ -1572,19 +1601,20 @@ static int mp_decode_frame(MPADecodeContext *s, OUT_INT **samples,
                 av_log(s->avctx, AV_LOG_ERROR, "invalid new backstep %d\n", i);
             i = FFMIN(BACKSTEP_SIZE, buf_size - HEADER_SIZE);
         }
-        assert(i <= buf_size - HEADER_SIZE && i >= 0);
+        av_assert1(i <= buf_size - HEADER_SIZE && i >= 0);
         memcpy(s->last_buf + s->last_buf_size, s->gb.buffer + buf_size - HEADER_SIZE - i, i);
         s->last_buf_size += i;
     }
 
+    if(nb_frames < 0)
+        return nb_frames;
+
     /* get output buffer */
     if (!samples) {
-        av_assert0(s->frame != NULL);
+        av_assert0(s->frame);
         s->frame->nb_samples = s->avctx->frame_size;
-        if ((ret = ff_get_buffer(s->avctx, s->frame, 0)) < 0) {
-            av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        if ((ret = ff_get_buffer(s->avctx, s->frame, 0)) < 0)
             return ret;
-        }
         samples = (OUT_INT **)s->frame->extended_data;
     }
 
@@ -1620,11 +1650,21 @@ static int decode_frame(AVCodecContext * avctx, void *data, int *got_frame_ptr,
     uint32_t header;
     int ret;
 
+    int skipped = 0;
+    while(buf_size && !*buf){
+        buf++;
+        buf_size--;
+        skipped++;
+    }
+
     if (buf_size < HEADER_SIZE)
         return AVERROR_INVALIDDATA;
 
     header = AV_RB32(buf);
-
+    if (header>>8 == AV_RB32("TAG")>>8) { /* packet holds an ID3 tag, not an audio frame */
+        av_log(avctx, AV_LOG_DEBUG, "discarding ID3 tag\n");
+        return buf_size + skipped; /* consume the tag plus the leading zero bytes skipped above */
+    }
     ret = avpriv_mpegaudio_decode_header((MPADecodeHeader *)s, header);
     if (ret < 0) {
         av_log(avctx, AV_LOG_ERROR, "Header missing\n");
@@ -1640,6 +1680,14 @@ static int decode_frame(AVCodecContext * avctx, void *data, int *got_frame_ptr,
     if (!avctx->bit_rate)
         avctx->bit_rate = s->bit_rate;
 
+    if (s->frame_size <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "incomplete frame\n");
+        return AVERROR_INVALIDDATA;
+    } else if (s->frame_size < buf_size) {
+        av_log(avctx, AV_LOG_DEBUG, "incorrect frame size - multiple frames in buffer?\n");
+        buf_size= s->frame_size;
+    }
+
     s->frame = data;
 
     ret = mp_decode_frame(s, NULL, buf, buf_size);
@@ -1660,13 +1708,15 @@ static int decode_frame(AVCodecContext * avctx, void *data, int *got_frame_ptr,
             return ret;
     }
     s->frame_size = 0;
-    return buf_size;
+    return buf_size + skipped;
 }
 
 static void mp_flush(MPADecodeContext *ctx)
 {
     memset(ctx->synth_buf, 0, sizeof(ctx->synth_buf));
+    memset(ctx->mdct_buf, 0, sizeof(ctx->mdct_buf));
     ctx->last_buf_size = 0;
+    ctx->dither_state = 0;
 }
 
 static void flush(AVCodecContext *avctx)
@@ -1683,6 +1733,7 @@ static int decode_frame_adu(AVCodecContext *avctx, void *data,
     MPADecodeContext *s = avctx->priv_data;
     uint32_t header;
     int len, ret;
+    int av_unused out_size;
 
     len = buf_size;
 
@@ -1776,7 +1827,7 @@ static av_cold int decode_close_mp3on4(AVCodecContext * avctx)
     int i;
 
     for (i = 0; i < s->frames; i++)
-        av_free(s->mp3decctx[i]);
+        av_freep(&s->mp3decctx[i]);
 
     return 0;
 }
@@ -1835,6 +1886,7 @@ static av_cold int decode_init_mp3on4(AVCodecContext * avctx)
         s->mp3decctx[i]->adu_mode = 1;
         s->mp3decctx[i]->avctx = avctx;
         s->mp3decctx[i]->mpadsp = s->mp3decctx[0]->mpadsp;
+        s->mp3decctx[i]->fdsp = s->mp3decctx[0]->fdsp;
     }
 
     return 0;
@@ -1870,10 +1922,8 @@ static int decode_frame_mp3on4(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = MPA_FRAME_SIZE;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     out_samples = (OUT_INT **)frame->extended_data;
 
     // Discard too short frames
@@ -1887,7 +1937,7 @@ static int decode_frame_mp3on4(AVCodecContext *avctx, void *data,
         fsize = AV_RB16(buf) >> 4;
         fsize = FFMIN3(fsize, len, MPA_MAX_CODED_FRAME_SIZE);
         m     = s->mp3decctx[fr];
-        assert(m != NULL);
+        av_assert1(m);
 
         if (fsize < HEADER_SIZE) {
             av_log(avctx, AV_LOG_ERROR, "Frame size smaller than header size\n");
@@ -1896,8 +1946,10 @@ static int decode_frame_mp3on4(AVCodecContext *avctx, void *data,
         header = (AV_RB32(buf) & 0x000fffff) | s->syncword; // patch header
 
         ret = avpriv_mpegaudio_decode_header((MPADecodeHeader *)m, header);
-        if (ret < 0) // Bad header, discard block
-            break;
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Bad header, discard block\n");
+            return AVERROR_INVALIDDATA;
+        }
 
         if (ch + m->nb_channels > avctx->channels ||
             s->coff[fr] + m->nb_channels > avctx->channels) {
@@ -1911,8 +1963,13 @@ static int decode_frame_mp3on4(AVCodecContext *avctx, void *data,
         if (m->nb_channels > 1)
             outptr[1] = out_samples[s->coff[fr] + 1];
 
-        if ((ret = mp_decode_frame(m, outptr, buf, fsize)) < 0)
-            return ret;
+        if ((ret = mp_decode_frame(m, outptr, buf, fsize)) < 0) {
+            av_log(avctx, AV_LOG_ERROR, "failed to decode channel %d\n", ch);
+            memset(outptr[0], 0, MPA_FRAME_SIZE*sizeof(OUT_INT));
+            if (m->nb_channels > 1)
+                memset(outptr[1], 0, MPA_FRAME_SIZE*sizeof(OUT_INT));
+            ret = m->nb_channels * MPA_FRAME_SIZE*sizeof(OUT_INT);
+        }
 
         out_size += ret;
         buf      += fsize;
@@ -1920,6 +1977,10 @@ static int decode_frame_mp3on4(AVCodecContext *avctx, void *data,
 
         avctx->bit_rate += m->bit_rate;
     }
+    if (ch != avctx->channels) {
+        av_log(avctx, AV_LOG_ERROR, "failed to decode all channels\n");
+        return AVERROR_INVALIDDATA;
+    }
 
     /* update codec info */
     avctx->sample_rate = s->mp3decctx[0]->sample_rate;
diff --git a/libavcodec/mpegaudiodecheader.c b/libavcodec/mpegaudiodecheader.c
index a315965..ae86b08 100644
--- a/libavcodec/mpegaudiodecheader.c
+++ b/libavcodec/mpegaudiodecheader.c
@@ -2,20 +2,20 @@
  * MPEG Audio header decoder
  * Copyright (c) 2001, 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -119,8 +119,7 @@ int avpriv_mpegaudio_decode_header(MPADecodeHeader *s, uint32_t header)
     return 0;
 }
 
-int ff_mpa_decode_header(AVCodecContext *avctx, uint32_t head, int *sample_rate,
-                         int *channels, int *frame_size, int *bit_rate)
+int ff_mpa_decode_header(uint32_t head, int *sample_rate, int *channels, int *frame_size, int *bit_rate, enum AVCodecID *codec_id)
 {
     MPADecodeHeader s1, *s = &s1;
 
@@ -130,17 +129,17 @@ int ff_mpa_decode_header(AVCodecContext *avctx, uint32_t head, int *sample_rate,
 
     switch(s->layer) {
     case 1:
-        avctx->codec_id = AV_CODEC_ID_MP1;
+        *codec_id = AV_CODEC_ID_MP1;
         *frame_size = 384;
         break;
     case 2:
-        avctx->codec_id = AV_CODEC_ID_MP2;
+        *codec_id = AV_CODEC_ID_MP2;
         *frame_size = 1152;
         break;
     default:
     case 3:
-        if (avctx->codec_id != AV_CODEC_ID_MP3ADU)
-            avctx->codec_id = AV_CODEC_ID_MP3;
+        if (*codec_id != AV_CODEC_ID_MP3ADU)
+            *codec_id = AV_CODEC_ID_MP3;
         if (s->lsf)
             *frame_size = 576;
         else
@@ -154,9 +153,14 @@ int ff_mpa_decode_header(AVCodecContext *avctx, uint32_t head, int *sample_rate,
     return s->frame_size;
 }
 
-#if LIBAVCODEC_VERSION_MAJOR < 57
+#if LIBAVCODEC_VERSION_MAJOR < 58
+int avpriv_mpa_decode_header2(uint32_t head, int *sample_rate, int *channels, int *frame_size, int *bit_rate, enum AVCodecID *codec_id)
+{
+    return ff_mpa_decode_header(head, sample_rate, channels, frame_size, bit_rate, codec_id);
+}
+
 int avpriv_mpa_decode_header(AVCodecContext *avctx, uint32_t head, int *sample_rate, int *channels, int *frame_size, int *bit_rate)
 {
-    return ff_mpa_decode_header(avctx, head, sample_rate, channels, frame_size, bit_rate);
+    return ff_mpa_decode_header(head, sample_rate, channels, frame_size, bit_rate, &avctx->codec_id);
 }
 #endif
diff --git a/libavcodec/mpegaudiodecheader.h b/libavcodec/mpegaudiodecheader.h
index 089a508..952ba17 100644
--- a/libavcodec/mpegaudiodecheader.h
+++ b/libavcodec/mpegaudiodecheader.h
@@ -2,20 +2,20 @@
  * MPEG Audio header decoder
  * Copyright (c) 2001, 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -54,11 +54,12 @@ int avpriv_mpegaudio_decode_header(MPADecodeHeader *s, uint32_t header);
 
 /* useful helper to get MPEG audio stream info. Return -1 if error in
    header, otherwise the coded frame size in bytes */
-int ff_mpa_decode_header(AVCodecContext *avctx, uint32_t head, int *sample_rate,
-                         int *channels, int *frame_size, int *bitrate);
+int ff_mpa_decode_header(uint32_t head, int *sample_rate,
+                         int *channels, int *frame_size, int *bitrate, enum AVCodecID *codec_id);
 
-#if LIBAVCODEC_VERSION_MAJOR < 57
+#if LIBAVCODEC_VERSION_MAJOR < 58
 int avpriv_mpa_decode_header(AVCodecContext *avctx, uint32_t head, int *sample_rate, int *channels, int *frame_size, int *bitrate);
+int avpriv_mpa_decode_header2(uint32_t head, int *sample_rate, int *channels, int *frame_size, int *bitrate, enum AVCodecID *codec_id);
 #endif
 
 /* fast header check for resync */
diff --git a/libavcodec/mpegaudiodectab.h b/libavcodec/mpegaudiodectab.h
index 1221657..accd12b 100644
--- a/libavcodec/mpegaudiodectab.h
+++ b/libavcodec/mpegaudiodectab.h
@@ -2,20 +2,20 @@
  * MPEG Audio decoder
  * copyright (c) 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpegaudiodsp.c b/libavcodec/mpegaudiodsp.c
index 58ea1d1..a5d20df 100644
--- a/libavcodec/mpegaudiodsp.c
+++ b/libavcodec/mpegaudiodsp.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2011 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -45,4 +45,6 @@ av_cold void ff_mpadsp_init(MPADSPContext *s)
     if (ARCH_ARM)     ff_mpadsp_init_arm(s);
     if (ARCH_PPC)     ff_mpadsp_init_ppc(s);
     if (ARCH_X86)     ff_mpadsp_init_x86(s);
+    if (HAVE_MIPSFPU)   ff_mpadsp_init_mipsfpu(s);
+    if (HAVE_MIPSDSP) ff_mpadsp_init_mipsdsp(s);
 }
diff --git a/libavcodec/mpegaudiodsp.h b/libavcodec/mpegaudiodsp.h
index 909c652..b827163 100644
--- a/libavcodec/mpegaudiodsp.h
+++ b/libavcodec/mpegaudiodsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,6 +29,7 @@ typedef struct MPADSPContext {
                                int *dither_state, int16_t *samples, int incr);
     void (*dct32_float)(float *dst, const float *src);
     void (*dct32_fixed)(int *dst, const int *src);
+
     void (*imdct36_blocks_float)(float *out, float *buf, float *in,
                                  int count, int switch_point, int block_type);
     void (*imdct36_blocks_fixed)(int *out, int *buf, int *in,
@@ -58,6 +59,8 @@ void ff_mpadsp_init_aarch64(MPADSPContext *s);
 void ff_mpadsp_init_arm(MPADSPContext *s);
 void ff_mpadsp_init_ppc(MPADSPContext *s);
 void ff_mpadsp_init_x86(MPADSPContext *s);
+void ff_mpadsp_init_mipsfpu(MPADSPContext *s);
+void ff_mpadsp_init_mipsdsp(MPADSPContext *s);
 
 void ff_mpa_synth_init_float(float *window);
 void ff_mpa_synth_init_fixed(int32_t *window);
diff --git a/libavcodec/mpegaudiodsp_data.c b/libavcodec/mpegaudiodsp_data.c
index 5cf86b8..4550de9 100644
--- a/libavcodec/mpegaudiodsp_data.c
+++ b/libavcodec/mpegaudiodsp_data.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpegaudiodsp_fixed.c b/libavcodec/mpegaudiodsp_fixed.c
index 3c49a56..83c9d66 100644
--- a/libavcodec/mpegaudiodsp_fixed.c
+++ b/libavcodec/mpegaudiodsp_fixed.c
@@ -1,20 +1,20 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#define CONFIG_FLOAT 0
+#define USE_FLOATS 0
 #include "mpegaudiodsp_template.c"
diff --git a/libavcodec/mpegaudiodsp_float.c b/libavcodec/mpegaudiodsp_float.c
index 2d8d53e..c45b136 100644
--- a/libavcodec/mpegaudiodsp_float.c
+++ b/libavcodec/mpegaudiodsp_float.c
@@ -1,20 +1,20 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#define CONFIG_FLOAT 1
+#define USE_FLOATS 1
 #include "mpegaudiodsp_template.c"
diff --git a/libavcodec/mpegaudiodsp_template.c b/libavcodec/mpegaudiodsp_template.c
index 621bbd4..62454ca 100644
--- a/libavcodec/mpegaudiodsp_template.c
+++ b/libavcodec/mpegaudiodsp_template.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2001, 2002 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 #include "mpegaudiodsp.h"
 #include "mpegaudio.h"
 
-#if CONFIG_FLOAT
+#if USE_FLOATS
 #define RENAME(n) n##_float
 
 static inline float round_sample(float *sum)
@@ -125,7 +125,7 @@ void RENAME(ff_mpadsp_apply_window)(MPA_INT *synth_buf, MPA_INT *window,
     register const MPA_INT *w, *w2, *p;
     int j;
     OUT_INT *samples2;
-#if CONFIG_FLOAT
+#if USE_FLOATS
     float sum, sum2;
 #else
     int64_t sum, sum2;
@@ -200,7 +200,7 @@ av_cold void RENAME(ff_mpa_synth_init)(MPA_INT *window)
     for(i=0;i<257;i++) {
         INTFLOAT v;
         v = ff_mpa_enwindow[i];
-#if CONFIG_FLOAT
+#if USE_FLOATS
         v *= 1.0 / (1LL<<(16 + FRAC_BITS));
 #endif
         window[i] = v;
@@ -243,7 +243,7 @@ av_cold void RENAME(ff_init_mpadsp_tabs)(void)
                 else if (i <  18) d = 1;
             }
             //merge last stage of imdct into the window coefficients
-            d *= 0.5 / cos(M_PI * (2 * i + 19) / 72);
+            d *= 0.5 * IMDCT_SCALAR / cos(M_PI * (2 * i + 19) / 72);
 
             if (j == 2)
                 RENAME(ff_mdct_win)[j][i/3] = FIXHR((d / (1<<5)));
@@ -398,3 +398,4 @@ void RENAME(ff_imdct36_blocks)(INTFLOAT *out, INTFLOAT *buf, INTFLOAT *in,
         out++;
     }
 }
+
diff --git a/libavcodec/mpegaudioenc_fixed.c b/libavcodec/mpegaudioenc_fixed.c
new file mode 100644
index 0000000..022b6fe
--- /dev/null
+++ b/libavcodec/mpegaudioenc_fixed.c
@@ -0,0 +1,41 @@
+/*
+ * The simplest mpeg audio layer 2 encoder
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "mpegaudioenc_template.c"
+
+AVCodec ff_mp2fixed_encoder = { /* MP2 encoder entry; USE_FLOATS is not defined in this file, so the template's integer (#else) branches are presumably selected */
+    .name                  = "mp2fixed",
+    .long_name             = NULL_IF_CONFIG_SMALL("MP2 fixed point (MPEG audio layer 2)"),
+    .type                  = AVMEDIA_TYPE_AUDIO,
+    .id                    = AV_CODEC_ID_MP2, /* same codec id as the "mp2" entry in mpegaudioenc_float.c */
+    .priv_data_size        = sizeof(MpegAudioContext),
+    .init                  = MPA_encode_init, /* defined in mpegaudioenc_template.c */
+    .encode2               = MPA_encode_frame, /* defined in mpegaudioenc_template.c */
+    .sample_fmts           = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
+                                                            AV_SAMPLE_FMT_NONE }, /* list terminator */
+    .supported_samplerates = (const int[]){
+        44100, 48000,  32000, 22050, 24000, 16000, 0 /* trailing 0 terminates the list */
+    },
+    .channel_layouts       = (const uint64_t[]){ AV_CH_LAYOUT_MONO,
+                                                 AV_CH_LAYOUT_STEREO,
+                                                 0 }, /* trailing 0 terminates the list */
+    .defaults              = mp2_defaults, /* from the template: "b" defaults to "0"; MPA_encode_init then picks bitrate table index 14 */
+};
diff --git a/libavcodec/mpegaudioenc_float.c b/libavcodec/mpegaudioenc_float.c
new file mode 100644
index 0000000..4d4ab2d
--- /dev/null
+++ b/libavcodec/mpegaudioenc_float.c
@@ -0,0 +1,42 @@
+/*
+ * The simplest mpeg audio layer 2 encoder
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define USE_FLOATS 1
+#include "mpegaudioenc_template.c"
+
+AVCodec ff_mp2_encoder = { /* MP2 encoder entry; USE_FLOATS is defined to 1 before the template is included, selecting its float (scale_factor_inv_table) branches */
+    .name                  = "mp2",
+    .long_name             = NULL_IF_CONFIG_SMALL("MP2 (MPEG audio layer 2)"),
+    .type                  = AVMEDIA_TYPE_AUDIO,
+    .id                    = AV_CODEC_ID_MP2, /* same codec id as the "mp2fixed" entry in mpegaudioenc_fixed.c */
+    .priv_data_size        = sizeof(MpegAudioContext),
+    .init                  = MPA_encode_init, /* defined in mpegaudioenc_template.c */
+    .encode2               = MPA_encode_frame, /* defined in mpegaudioenc_template.c */
+    .sample_fmts           = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
+                                                            AV_SAMPLE_FMT_NONE }, /* list terminator */
+    .supported_samplerates = (const int[]){
+        44100, 48000,  32000, 22050, 24000, 16000, 0 /* trailing 0 terminates the list */
+    },
+    .channel_layouts       = (const uint64_t[]){ AV_CH_LAYOUT_MONO,
+                                                 AV_CH_LAYOUT_STEREO,
+                                                 0 }, /* trailing 0 terminates the list */
+    .defaults              = mp2_defaults, /* from the template: "b" defaults to "0"; MPA_encode_init then picks bitrate table index 14 */
+};
diff --git a/libavcodec/mpegaudioenc.c b/libavcodec/mpegaudioenc_template.c
index 2be8b7f..93363fe 100644
--- a/libavcodec/mpegaudioenc.c
+++ b/libavcodec/mpegaudioenc_template.c
@@ -2,20 +2,20 @@
  * The simplest mpeg audio layer 2 encoder
  * Copyright (c) 2000, 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -64,7 +64,12 @@ typedef struct MpegAudioContext {
     int16_t filter_bank[512];
     int scale_factor_table[64];
     unsigned char scale_diff_table[128];
+#if USE_FLOATS
     float scale_factor_inv_table[64];
+#else
+    int8_t scale_factor_shift[64];
+    unsigned short scale_factor_mult[64];
+#endif
     unsigned short total_quant_bits[17]; /* total number of bits per allocation group */
 } MpegAudioContext;
 
@@ -103,10 +108,15 @@ static av_cold int MPA_encode_init(AVCodecContext *avctx)
     s->freq_index = i;
 
     /* encoding bitrate & frequency */
-    for(i=0;i<15;i++) {
+    for(i=1;i<15;i++) {
         if (avpriv_mpa_bitrate_tab[s->lsf][1][i] == bitrate)
             break;
     }
+    if (i == 15 && !avctx->bit_rate) {
+        i = 14;
+        bitrate = avpriv_mpa_bitrate_tab[s->lsf][1][i];
+        avctx->bit_rate = bitrate * 1000;
+    }
     if (i == 15){
         av_log(avctx, AV_LOG_ERROR, "bitrate %d is not allowed in mp2\n", bitrate);
         return AVERROR(EINVAL);
@@ -149,11 +159,17 @@ static av_cold int MPA_encode_init(AVCodecContext *avctx)
     }
 
     for(i=0;i<64;i++) {
-        v = (int)(pow(2.0, (3 - i) / 3.0) * (1 << 20));
+        v = (int)(exp2((3 - i) / 3.0) * (1 << 20));
         if (v <= 0)
             v = 1;
         s->scale_factor_table[i] = v;
-        s->scale_factor_inv_table[i] = pow(2.0, -(3 - i) / 3.0) / (float)(1 << 20);
+#if USE_FLOATS
+        s->scale_factor_inv_table[i] = exp2(-(3 - i) / 3.0) / (float)(1 << 20);
+#else
+#define P 15
+        s->scale_factor_shift[i] = 21 - P - (i / 3);
+        s->scale_factor_mult[i] = (1 << P) * exp2((i % 3) / 3.0);
+#endif
     }
     for(i=0;i<128;i++) {
         v = i - 64;
@@ -228,11 +244,11 @@ static void idct32(int *out, int *tab)
     do {
         int x1, x2, x3, x4;
 
-        x3 = MUL(t[16], FIX(SQRT2*0.5));
+        x3 = MUL(t[16], FIX(M_SQRT2*0.5));
         x4 = t[0] - x3;
         x3 = t[0] + x3;
 
-        x2 = MUL(-(t[24] + t[8]), FIX(SQRT2*0.5));
+        x2 = MUL(-(t[24] + t[8]), FIX(M_SQRT2*0.5));
         x1 = MUL((t[8] - x2), xp[0]);
         x2 = MUL((t[8] + x2), xp[1]);
 
@@ -397,7 +413,7 @@ static void compute_scale_factors(MpegAudioContext *s,
             ff_dlog(NULL, "%2d:%d in=%x %x %d\n",
                     j, i, vmax, s->scale_factor_table[index], index);
             /* store the scale factor */
-            assert(index >=0 && index <= 63);
+            av_assert2(index >=0 && index <= 63);
             sf[i] = index;
         }
 
@@ -459,7 +475,7 @@ static void compute_scale_factors(MpegAudioContext *s,
             sf[1] = sf[2] = sf[0];
             break;
         default:
-            assert(0); //cannot happen
+            av_assert2(0); //cannot happen
             code = 0;           /* kill warning */
         }
 
@@ -579,7 +595,7 @@ static void compute_bit_allocation(MpegAudioContext *s,
         }
     }
     *padding = max_frame_size - current_frame_size;
-    assert(*padding >= 0);
+    av_assert0(*padding >= 0);
 }
 
 /*
@@ -668,14 +684,36 @@ static void encode_frame(MpegAudioContext *s,
                         qindex = s->alloc_table[j+b];
                         steps = ff_mpa_quant_steps[qindex];
                         for(m=0;m<3;m++) {
-                            float a;
                             sample = s->sb_samples[ch][k][l + m][i];
                             /* divide by scale factor */
-                            a = (float)sample * s->scale_factor_inv_table[s->scale_factors[ch][i][k]];
-                            q[m] = (int)((a + 1.0) * steps * 0.5);
+#if USE_FLOATS
+                            {
+                                float a;
+                                a = (float)sample * s->scale_factor_inv_table[s->scale_factors[ch][i][k]];
+                                q[m] = (int)((a + 1.0) * steps * 0.5);
+                            }
+#else
+                            {
+                                int q1, e, shift, mult;
+                                e = s->scale_factors[ch][i][k];
+                                shift = s->scale_factor_shift[e];
+                                mult = s->scale_factor_mult[e];
+
+                                /* normalize to P bits */
+                                if (shift < 0)
+                                    q1 = sample << (-shift);
+                                else
+                                    q1 = sample >> shift;
+                                q1 = (q1 * mult) >> P;
+                                q1 += 1 << P;
+                                if (q1 < 0)
+                                    q1 = 0;
+                                q[m] = (q1 * (unsigned)steps) >> (P + 1);
+                            }
+#endif
                             if (q[m] >= steps)
                                 q[m] = steps - 1;
-                            assert(q[m] >= 0 && q[m] < steps);
+                            av_assert2(q[m] >= 0 && q[m] < steps);
                         }
                         bits = ff_mpa_quant_bits[qindex];
                         if (bits < 0) {
@@ -725,10 +763,8 @@ static int MPA_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     }
     compute_bit_allocation(s, smr, bit_alloc, &padding);
 
-    if ((ret = ff_alloc_packet(avpkt, MPA_MAX_CODED_FRAME_SIZE))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, MPA_MAX_CODED_FRAME_SIZE, 0)) < 0)
         return ret;
-    }
 
     init_put_bits(&s->pb, avpkt->data, avpkt->size);
 
@@ -743,25 +779,7 @@ static int MPA_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
 }
 
 static const AVCodecDefault mp2_defaults[] = {
-    { "b", "384000" },
+    { "b", "0" },
     { NULL },
 };
 
-AVCodec ff_mp2_encoder = {
-    .name                  = "mp2",
-    .long_name             = NULL_IF_CONFIG_SMALL("MP2 (MPEG audio layer 2)"),
-    .type                  = AVMEDIA_TYPE_AUDIO,
-    .id                    = AV_CODEC_ID_MP2,
-    .priv_data_size        = sizeof(MpegAudioContext),
-    .init                  = MPA_encode_init,
-    .encode2               = MPA_encode_frame,
-    .sample_fmts           = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
-                                                            AV_SAMPLE_FMT_NONE },
-    .supported_samplerates = (const int[]){
-        44100, 48000,  32000, 22050, 24000, 16000, 0
-    },
-    .channel_layouts       = (const uint64_t[]){ AV_CH_LAYOUT_MONO,
-                                                 AV_CH_LAYOUT_STEREO,
-                                                 0 },
-    .defaults              = mp2_defaults,
-};
diff --git a/libavcodec/mpegaudiotab.h b/libavcodec/mpegaudiotab.h
index d30ef1b..bb2e5de 100644
--- a/libavcodec/mpegaudiotab.h
+++ b/libavcodec/mpegaudiotab.h
@@ -4,20 +4,20 @@
  *
  * Copyright (c) 2000, 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,8 +33,6 @@
 #include <stdint.h>
 #include "mpegaudio.h"
 
-#define SQRT2 1.41421356237309514547
-
 static const int costab32[30] = {
     FIX(0.54119610014619701222),
     FIX(1.3065629648763763537),
diff --git a/libavcodec/mpegpicture.c b/libavcodec/mpegpicture.c
index 1d9544b..6748fc2 100644
--- a/libavcodec/mpegpicture.c
+++ b/libavcodec/mpegpicture.c
@@ -1,20 +1,20 @@
 /*
  * Mpeg video formats-related picture management functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -56,17 +56,30 @@ do {\
 int ff_mpeg_framesize_alloc(AVCodecContext *avctx, MotionEstContext *me,
                             ScratchpadContext *sc, int linesize)
 {
-    int alloc_size = FFALIGN(FFABS(linesize) + 32, 32);
+    int alloc_size = FFALIGN(FFABS(linesize) + 64, 32);
+
+    if (avctx->hwaccel
+#if FF_API_CAP_VDPAU
+        || avctx->codec->capabilities & AV_CODEC_CAP_HWACCEL_VDPAU
+#endif
+        )
+        return 0;
+
+    if (linesize < 24) {
+        av_log(avctx, AV_LOG_ERROR, "Image too small, temporary buffers cannot function\n");
+        return AVERROR_PATCHWELCOME;
+    }
 
     // edge emu needs blocksize + filter length - 1
     // (= 17x17 for  halfpel / 21x21 for H.264)
     // VC-1 computes luma and chroma simultaneously and needs 19X19 + 9x9
     // at uvlinesize. It supports only YUV420 so 24x24 is enough
     // linesize * interlaced * MBsize
-    FF_ALLOCZ_OR_GOTO(avctx, sc->edge_emu_buffer, alloc_size * 2 * 24,
+    // we also use this buffer for encoding in encode_mb_internal() needing an additional 32 lines
+    FF_ALLOCZ_ARRAY_OR_GOTO(avctx, sc->edge_emu_buffer, alloc_size, 4 * 68,
                       fail);
 
-    FF_ALLOCZ_OR_GOTO(avctx, me->scratchpad, alloc_size * 2 * 16 * 3,
+    FF_ALLOCZ_ARRAY_OR_GOTO(avctx, me->scratchpad, alloc_size, 4 * 16 * 2,
                       fail)
     me->temp            = me->scratchpad;
     sc->rd_scratchpad   = me->scratchpad;
@@ -165,8 +178,8 @@ static int alloc_frame_buffer(AVCodecContext *avctx,  Picture *pic,
     return 0;
 }
 
-static int alloc_picture_tables(Picture *pic, int encoding, int out_format,
-                                int mb_stride, int mb_height, int b8_stride)
+static int alloc_picture_tables(AVCodecContext *avctx, Picture *pic, int encoding, int out_format,
+                                int mb_stride, int mb_width, int mb_height, int b8_stride)
 {
     const int big_mb_num    = mb_stride * (mb_height + 1) + 1;
     const int mb_array_size = mb_stride * mb_height;
@@ -189,7 +202,8 @@ static int alloc_picture_tables(Picture *pic, int encoding, int out_format,
             return AVERROR(ENOMEM);
     }
 
-    if (out_format == FMT_H263 || encoding) {
+    if (out_format == FMT_H263 || encoding || avctx->debug_mv ||
+        (avctx->flags2 & AV_CODEC_FLAG2_EXPORT_MVS)) {
         int mv_size        = 2 * (b8_array_size + 4) * sizeof(int16_t);
         int ref_index_size = 4 * mb_array_size;
 
@@ -201,6 +215,9 @@ static int alloc_picture_tables(Picture *pic, int encoding, int out_format,
         }
     }
 
+    pic->alloc_mb_width  = mb_width;
+    pic->alloc_mb_height = mb_height;
+
     return 0;
 }
 
@@ -211,16 +228,21 @@ static int alloc_picture_tables(Picture *pic, int encoding, int out_format,
 int ff_alloc_picture(AVCodecContext *avctx, Picture *pic, MotionEstContext *me,
                      ScratchpadContext *sc, int shared, int encoding,
                      int chroma_x_shift, int chroma_y_shift, int out_format,
-                     int mb_stride, int mb_height, int b8_stride,
+                     int mb_stride, int mb_width, int mb_height, int b8_stride,
                      ptrdiff_t *linesize, ptrdiff_t *uvlinesize)
 {
     int i, ret;
 
+    if (pic->qscale_table_buf)
+        if (   pic->alloc_mb_width  != mb_width
+            || pic->alloc_mb_height != mb_height)
+            ff_free_picture_tables(pic);
+
     if (shared) {
-        assert(pic->f->data[0]);
+        av_assert0(pic->f->data[0]);
         pic->shared = 1;
     } else {
-        assert(!pic->f->buf[0]);
+        av_assert0(!pic->f->buf[0]);
         if (alloc_frame_buffer(avctx, pic, me, sc,
                                chroma_x_shift, chroma_y_shift,
                                *linesize, *uvlinesize) < 0)
@@ -231,8 +253,8 @@ int ff_alloc_picture(AVCodecContext *avctx, Picture *pic, MotionEstContext *me,
     }
 
     if (!pic->qscale_table_buf)
-        ret = alloc_picture_tables(pic, encoding, out_format,
-                                   mb_stride, mb_height, b8_stride);
+        ret = alloc_picture_tables(avctx, pic, encoding, out_format,
+                                   mb_stride, mb_width, mb_height, b8_stride);
     else
         ret = make_tables_writable(pic);
     if (ret < 0)
@@ -268,6 +290,8 @@ fail:
  */
 void ff_mpeg_unref_picture(AVCodecContext *avctx, Picture *pic)
 {
+    int off = offsetof(Picture, mb_mean) + sizeof(pic->mb_mean);
+
     pic->tf.f = pic->f;
     /* WM Image / Screen codecs allocate internal buffers with different
      * dimensions / colorspaces; ignore user-defined callbacks for these. */
@@ -282,6 +306,8 @@ void ff_mpeg_unref_picture(AVCodecContext *avctx, Picture *pic)
 
     if (pic->needs_realloc)
         ff_free_picture_tables(pic);
+
+    memset((uint8_t*)pic + off, 0, sizeof(*pic) - off);
 }
 
 int ff_update_picture_tables(Picture *dst, Picture *src)
@@ -323,6 +349,9 @@ do {                                                                          \
         dst->ref_index[i]  = src->ref_index[i];
     }
 
+    dst->alloc_mb_width  = src->alloc_mb_width;
+    dst->alloc_mb_height = src->alloc_mb_height;
+
     return 0;
 }
 
@@ -376,7 +405,7 @@ static inline int pic_is_unused(Picture *pic)
     return 0;
 }
 
-static int find_unused_picture(Picture *picture, int shared)
+static int find_unused_picture(AVCodecContext *avctx, Picture *picture, int shared)
 {
     int i;
 
@@ -392,12 +421,26 @@ static int find_unused_picture(Picture *picture, int shared)
         }
     }
 
-    return AVERROR_INVALIDDATA;
+    av_log(avctx, AV_LOG_FATAL,
+           "Internal error, picture buffer overflow\n");
+    /* We could return -1, but the codec would crash trying to draw into a
+     * non-existing frame anyway. This is safer than waiting for a random crash.
+     * Also the return of this is never useful, an encoder must only allocate
+     * as much as allowed in the specification. This has no relationship to how
+     * much libavcodec could allocate (and MAX_PICTURE_COUNT is always large
+     * enough for such valid streams).
+     * Plus, a decoder has to check stream validity and remove frames if too
+     * many reference frames are around. Waiting for "OOM" is not correct at
+     * all. Similarly, missing reference frames have to be replaced by
+     * interpolated/MC frames, anything else is a bug in the codec ...
+     */
+    abort();
+    return -1;
 }
 
 int ff_find_unused_picture(AVCodecContext *avctx, Picture *picture, int shared)
 {
-    int ret = find_unused_picture(picture, shared);
+    int ret = find_unused_picture(avctx, picture, shared);
 
     if (ret >= 0 && ret < MAX_PICTURE_COUNT) {
         if (picture[ret].needs_realloc) {
@@ -413,6 +456,9 @@ void ff_free_picture_tables(Picture *pic)
 {
     int i;
 
+    pic->alloc_mb_width  =
+    pic->alloc_mb_height = 0;
+
     av_buffer_unref(&pic->mb_var_buf);
     av_buffer_unref(&pic->mc_mb_var_buf);
     av_buffer_unref(&pic->mb_mean_buf);
diff --git a/libavcodec/mpegpicture.h b/libavcodec/mpegpicture.h
index 115c288..2db3d67 100644
--- a/libavcodec/mpegpicture.h
+++ b/libavcodec/mpegpicture.h
@@ -1,20 +1,20 @@
 /*
  * Mpeg video formats-related defines and utility functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,7 +29,7 @@
 #include "motion_est.h"
 #include "thread.h"
 
-#define MAX_PICTURE_COUNT 32
+#define MAX_PICTURE_COUNT 36
 #define EDGE_WIDTH 16
 
 typedef struct ScratchpadContext {
@@ -67,6 +67,9 @@ typedef struct Picture {
     AVBufferRef *mc_mb_var_buf;
     uint16_t *mc_mb_var;        ///< Table for motion compensated MB variances
 
+    int alloc_mb_width;         ///< mb_width used to allocate tables
+    int alloc_mb_height;        ///< mb_height used to allocate tables
+
     AVBufferRef *mb_mean_buf;
     uint8_t *mb_mean;           ///< Table for MB luminance
 
@@ -75,16 +78,16 @@ typedef struct Picture {
 
     int field_picture;          ///< whether or not the picture was encoded in separate fields
 
-    int mb_var_sum;             ///< sum of MB variance for current frame
-    int mc_mb_var_sum;          ///< motion compensated MB variance for current frame
+    int64_t mb_var_sum;         ///< sum of MB variance for current frame
+    int64_t mc_mb_var_sum;      ///< motion compensated MB variance for current frame
 
-    int b_frame_score;          /* */
+    int b_frame_score;
     int needs_realloc;          ///< Picture needs to be reallocated (eg due to a frame size change)
 
     int reference;
     int shared;
 
-    uint64_t encoding_error[4];
+    uint64_t encoding_error[AV_NUM_DATA_POINTERS];
 } Picture;
 
 /**
@@ -94,7 +97,7 @@ typedef struct Picture {
 int ff_alloc_picture(AVCodecContext *avctx, Picture *pic, MotionEstContext *me,
                      ScratchpadContext *sc, int shared, int encoding,
                      int chroma_x_shift, int chroma_y_shift, int out_format,
-                     int mb_stride, int mb_height, int b8_stride,
+                     int mb_stride, int mb_width, int mb_height, int b8_stride,
                      ptrdiff_t *linesize, ptrdiff_t *uvlinesize);
 
 int ff_mpeg_framesize_alloc(AVCodecContext *avctx, MotionEstContext *me,
diff --git a/libavcodec/mpegutils.c b/libavcodec/mpegutils.c
index bc430f0..62cc36a 100644
--- a/libavcodec/mpegutils.c
+++ b/libavcodec/mpegutils.c
@@ -1,20 +1,20 @@
 /*
  * Mpeg video formats-related defines and utility functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpegutils.h b/libavcodec/mpegutils.h
index 60f9712..9cfadfc 100644
--- a/libavcodec/mpegutils.h
+++ b/libavcodec/mpegutils.h
@@ -1,20 +1,20 @@
 /*
  * Mpeg video formats-related defines and utility functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c
index 230154f..eb14b8c 100644
--- a/libavcodec/mpegvideo.c
+++ b/libavcodec/mpegvideo.c
@@ -5,20 +5,20 @@
  *
  * 4MV & hq & B-frame encoding stuff by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,9 +31,11 @@
 #include "libavutil/avassert.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/internal.h"
+#include "libavutil/motion_vector.h"
 #include "libavutil/timer.h"
 #include "avcodec.h"
 #include "blockdsp.h"
+#include "h264chroma.h"
 #include "idctdsp.h"
 #include "internal.h"
 #include "mathops.h"
@@ -44,7 +46,6 @@
 #include "mjpegenc.h"
 #include "msmpeg4.h"
 #include "qpeldsp.h"
-#include "xvmc_internal.h"
 #include "thread.h"
 #include "wmv2.h"
 #include <limits.h>
@@ -57,10 +58,7 @@ static void dct_unquantize_mpeg1_intra_c(MpegEncContext *s,
 
     nCoeffs= s->block_last_index[n];
 
-    if (n < 4)
-        block[0] = block[0] * s->y_dc_scale;
-    else
-        block[0] = block[0] * s->c_dc_scale;
+    block[0] *= n < 4 ? s->y_dc_scale : s->c_dc_scale;
     /* XXX: only MPEG-1 */
     quant_matrix = s->intra_matrix;
     for(i=1;i<=nCoeffs;i++) {
@@ -116,13 +114,13 @@ static void dct_unquantize_mpeg2_intra_c(MpegEncContext *s,
     int i, level, nCoeffs;
     const uint16_t *quant_matrix;
 
+    if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
+    else                 qscale <<= 1;
+
     if(s->alternate_scan) nCoeffs= 63;
     else nCoeffs= s->block_last_index[n];
 
-    if (n < 4)
-        block[0] = block[0] * s->y_dc_scale;
-    else
-        block[0] = block[0] * s->c_dc_scale;
+    block[0] *= n < 4 ? s->y_dc_scale : s->c_dc_scale;
     quant_matrix = s->intra_matrix;
     for(i=1;i<=nCoeffs;i++) {
         int j= s->intra_scantable.permutated[i];
@@ -130,10 +128,10 @@ static void dct_unquantize_mpeg2_intra_c(MpegEncContext *s,
         if (level) {
             if (level < 0) {
                 level = -level;
-                level = (int)(level * qscale * quant_matrix[j]) >> 3;
+                level = (int)(level * qscale * quant_matrix[j]) >> 4;
                 level = -level;
             } else {
-                level = (int)(level * qscale * quant_matrix[j]) >> 3;
+                level = (int)(level * qscale * quant_matrix[j]) >> 4;
             }
             block[j] = level;
         }
@@ -147,13 +145,14 @@ static void dct_unquantize_mpeg2_intra_bitexact(MpegEncContext *s,
     const uint16_t *quant_matrix;
     int sum=-1;
 
+    if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
+    else                 qscale <<= 1;
+
     if(s->alternate_scan) nCoeffs= 63;
     else nCoeffs= s->block_last_index[n];
 
-    if (n < 4)
-        block[0] = block[0] * s->y_dc_scale;
-    else
-        block[0] = block[0] * s->c_dc_scale;
+    block[0] *= n < 4 ? s->y_dc_scale : s->c_dc_scale;
+    sum += block[0];
     quant_matrix = s->intra_matrix;
     for(i=1;i<=nCoeffs;i++) {
         int j= s->intra_scantable.permutated[i];
@@ -161,10 +160,10 @@ static void dct_unquantize_mpeg2_intra_bitexact(MpegEncContext *s,
         if (level) {
             if (level < 0) {
                 level = -level;
-                level = (int)(level * qscale * quant_matrix[j]) >> 3;
+                level = (int)(level * qscale * quant_matrix[j]) >> 4;
                 level = -level;
             } else {
-                level = (int)(level * qscale * quant_matrix[j]) >> 3;
+                level = (int)(level * qscale * quant_matrix[j]) >> 4;
             }
             block[j] = level;
             sum+=level;
@@ -180,6 +179,9 @@ static void dct_unquantize_mpeg2_inter_c(MpegEncContext *s,
     const uint16_t *quant_matrix;
     int sum=-1;
 
+    if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
+    else                 qscale <<= 1;
+
     if(s->alternate_scan) nCoeffs= 63;
     else nCoeffs= s->block_last_index[n];
 
@@ -191,11 +193,11 @@ static void dct_unquantize_mpeg2_inter_c(MpegEncContext *s,
             if (level < 0) {
                 level = -level;
                 level = (((level << 1) + 1) * qscale *
-                         ((int) (quant_matrix[j]))) >> 4;
+                         ((int) (quant_matrix[j]))) >> 5;
                 level = -level;
             } else {
                 level = (((level << 1) + 1) * qscale *
-                         ((int) (quant_matrix[j]))) >> 4;
+                         ((int) (quant_matrix[j]))) >> 5;
             }
             block[j] = level;
             sum+=level;
@@ -210,15 +212,12 @@ static void dct_unquantize_h263_intra_c(MpegEncContext *s,
     int i, level, qmul, qadd;
     int nCoeffs;
 
-    assert(s->block_last_index[n]>=0);
+    av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
 
     qmul = qscale << 1;
 
     if (!s->h263_aic) {
-        if (n < 4)
-            block[0] = block[0] * s->y_dc_scale;
-        else
-            block[0] = block[0] * s->c_dc_scale;
+        block[0] *= n < 4 ? s->y_dc_scale : s->c_dc_scale;
         qadd = (qscale - 1) | 1;
     }else{
         qadd = 0;
@@ -247,7 +246,7 @@ static void dct_unquantize_h263_inter_c(MpegEncContext *s,
     int i, level, qmul, qadd;
     int nCoeffs;
 
-    assert(s->block_last_index[n]>=0);
+    av_assert2(s->block_last_index[n]>=0);
 
     qadd = (qscale - 1) | 1;
     qmul = qscale << 1;
@@ -267,14 +266,41 @@ static void dct_unquantize_h263_inter_c(MpegEncContext *s,
     }
 }
 
+/* FF_DEBUG_NOMC stand-in that dct_init() installs in hdsp.*_pixels_tab[0][]: sets 16 bytes per row to 128. */
+static void gray16(uint8_t *dst, const uint8_t *src, ptrdiff_t linesize, int h)
+{
+    while(h--) /* h is post-decremented, so rows h-1 down to 0 are written; src is never read */
+        memset(dst + h*linesize, 128, 16);
+}
+/* FF_DEBUG_NOMC stand-in that dct_init() installs in hdsp.*_pixels_tab[1][]: sets 8 bytes per row to 128. */
+static void gray8(uint8_t *dst, const uint8_t *src, ptrdiff_t linesize, int h)
+{
+    while(h--) /* h is post-decremented, so rows h-1 down to 0 are written; src is never read */
+        memset(dst + h*linesize, 128, 8);
+}
+
 /* init common dct for both encoder and decoder */
 static av_cold int dct_init(MpegEncContext *s)
 {
     ff_blockdsp_init(&s->bdsp, s->avctx);
+    ff_h264chroma_init(&s->h264chroma, 8); //for lowres
     ff_hpeldsp_init(&s->hdsp, s->avctx->flags);
     ff_mpegvideodsp_init(&s->mdsp);
     ff_videodsp_init(&s->vdsp, s->avctx->bits_per_raw_sample);
 
+    if (s->avctx->debug & FF_DEBUG_NOMC) {
+        int i;
+        for (i=0; i<4; i++) {
+            s->hdsp.avg_pixels_tab[0][i] = gray16;
+            s->hdsp.put_pixels_tab[0][i] = gray16;
+            s->hdsp.put_no_rnd_pixels_tab[0][i] = gray16;
+
+            s->hdsp.avg_pixels_tab[1][i] = gray8;
+            s->hdsp.put_pixels_tab[1][i] = gray8;
+            s->hdsp.put_no_rnd_pixels_tab[1][i] = gray8;
+        }
+    }
+
     s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_c;
     s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_c;
     s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_c;
@@ -287,12 +313,16 @@ static av_cold int dct_init(MpegEncContext *s)
     if (HAVE_INTRINSICS_NEON)
         ff_mpv_common_init_neon(s);
 
+    if (ARCH_ALPHA)
+        ff_mpv_common_init_axp(s);
     if (ARCH_ARM)
         ff_mpv_common_init_arm(s);
     if (ARCH_PPC)
         ff_mpv_common_init_ppc(s);
     if (ARCH_X86)
         ff_mpv_common_init_x86(s);
+    if (ARCH_MIPS)
+        ff_mpv_common_init_mips(s);
 
     return 0;
 }
@@ -319,7 +349,7 @@ static int alloc_picture(MpegEncContext *s, Picture *pic, int shared)
 {
     return ff_alloc_picture(s->avctx, pic, &s->me, &s->sc, shared, 0,
                             s->chroma_x_shift, s->chroma_y_shift, s->out_format,
-                            s->mb_stride, s->mb_height, s->b8_stride,
+                            s->mb_stride, s->mb_width, s->mb_height, s->b8_stride,
                             &s->linesize, &s->uvlinesize);
 }
 
@@ -330,6 +360,9 @@ static int init_duplicate_context(MpegEncContext *s)
     int yc_size = y_size + 2 * c_size;
     int i;
 
+    if (s->mb_height & 1)
+        yc_size += 2*s->b8_stride + 2*s->mb_stride;
+
     s->sc.edge_emu_buffer =
     s->me.scratchpad   =
     s->me.temp         =
@@ -355,10 +388,7 @@ static int init_duplicate_context(MpegEncContext *s)
     }
     if (s->avctx->codec_tag == AV_RL32("VCR2")) {
         // exchange uv
-        int16_t (*tmp)[64];
-        tmp           = s->pblocks[4];
-        s->pblocks[4] = s->pblocks[5];
-        s->pblocks[5] = tmp;
+        FFSWAP(void *, s->pblocks[4], s->pblocks[5]);
     }
 
     if (s->out_format == FMT_H263) {
@@ -436,10 +466,7 @@ int ff_update_duplicate_context(MpegEncContext *dst, MpegEncContext *src)
     }
     if (dst->avctx->codec_tag == AV_RL32("VCR2")) {
         // exchange uv
-        int16_t (*tmp)[64];
-        tmp             = dst->pblocks[4];
-        dst->pblocks[4] = dst->pblocks[5];
-        dst->pblocks[5] = tmp;
+        FFSWAP(void *, dst->pblocks[4], dst->pblocks[5]);
     }
     if (!dst->sc.edge_emu_buffer &&
         (ret = ff_mpeg_framesize_alloc(dst->avctx, &dst->me,
@@ -459,9 +486,11 @@ int ff_mpeg_update_thread_context(AVCodecContext *dst,
     int i, ret;
     MpegEncContext *s = dst->priv_data, *s1 = src->priv_data;
 
-    if (dst == src || !s1->context_initialized)
+    if (dst == src)
         return 0;
 
+    av_assert0(s != s1);
+
     // FIXME can parameters change on I-frames?
     // in that case dst may need a reinit
     if (!s->context_initialized) {
@@ -472,18 +501,24 @@ int ff_mpeg_update_thread_context(AVCodecContext *dst,
         s->bitstream_buffer      = NULL;
         s->bitstream_buffer_size = s->allocated_bitstream_buffer_size = 0;
 
-        ff_mpv_idct_init(s);
-        if ((err = ff_mpv_common_init(s)) < 0)
-            return err;
+        if (s1->context_initialized){
+//             s->picture_range_start  += MAX_PICTURE_COUNT;
+//             s->picture_range_end    += MAX_PICTURE_COUNT;
+            ff_mpv_idct_init(s);
+            if((err = ff_mpv_common_init(s)) < 0){
+                memset(s, 0, sizeof(MpegEncContext));
+                s->avctx = dst;
+                return err;
+            }
+        }
     }
 
     if (s->height != s1->height || s->width != s1->width || s->context_reinit) {
-        int err;
         s->context_reinit = 0;
         s->height = s1->height;
         s->width  = s1->width;
-        if ((err = ff_mpv_common_frame_size_change(s)) < 0)
-            return err;
+        if ((ret = ff_mpv_common_frame_size_change(s)) < 0)
+            return ret;
     }
 
     s->avctx->coded_height  = s1->avctx->coded_height;
@@ -494,9 +529,11 @@ int ff_mpeg_update_thread_context(AVCodecContext *dst,
     s->coded_picture_number = s1->coded_picture_number;
     s->picture_number       = s1->picture_number;
 
+    av_assert0(!s->picture || s->picture != s1->picture);
+    if(s->picture)
     for (i = 0; i < MAX_PICTURE_COUNT; i++) {
         ff_mpeg_unref_picture(s->avctx, &s->picture[i]);
-        if (s1->picture[i].f->buf[0] &&
+        if (s1->picture && s1->picture[i].f->buf[0] &&
             (ret = ff_mpeg_ref_picture(s->avctx, &s->picture[i], &s1->picture[i])) < 0)
             return ret;
     }
@@ -504,7 +541,7 @@ int ff_mpeg_update_thread_context(AVCodecContext *dst,
 #define UPDATE_PICTURE(pic)\
 do {\
     ff_mpeg_unref_picture(s->avctx, &s->pic);\
-    if (s1->pic.f->buf[0])\
+    if (s1->pic.f && s1->pic.f->buf[0])\
         ret = ff_mpeg_ref_picture(s->avctx, &s->pic, &s1->pic);\
     else\
         ret = ff_update_picture_tables(&s->pic, &s1->pic);\
@@ -528,6 +565,7 @@ do {\
     // Error/bug resilience
     s->next_p_frame_damaged = s1->next_p_frame_damaged;
     s->workaround_bugs      = s1->workaround_bugs;
+    s->padding_bug_score    = s1->padding_bug_score;
 
     // MPEG-4 timing info
     memcpy(&s->last_time_base, &s1->last_time_base,
@@ -544,11 +582,16 @@ do {\
 
     if (s1->bitstream_buffer) {
         if (s1->bitstream_buffer_size +
-            AV_INPUT_BUFFER_PADDING_SIZE > s->allocated_bitstream_buffer_size)
+            AV_INPUT_BUFFER_PADDING_SIZE > s->allocated_bitstream_buffer_size) {
             av_fast_malloc(&s->bitstream_buffer,
                            &s->allocated_bitstream_buffer_size,
                            s1->allocated_bitstream_buffer_size);
-            s->bitstream_buffer_size = s1->bitstream_buffer_size;
+            if (!s->bitstream_buffer) {
+                s->bitstream_buffer_size = 0;
+                return AVERROR(ENOMEM);
+            }
+        }
+        s->bitstream_buffer_size = s1->bitstream_buffer_size;
         memcpy(s->bitstream_buffer, s1->bitstream_buffer,
                s1->bitstream_buffer_size);
         memset(s->bitstream_buffer + s->bitstream_buffer_size, 0,
@@ -567,7 +610,6 @@ do {\
         } else {
             av_log(s->avctx, AV_LOG_ERROR, "Context scratch buffers could not "
                    "be allocated due to unknown size.\n");
-            return AVERROR_BUG;
         }
 
     // MPEG-2/interlacing info
@@ -617,6 +659,18 @@ void ff_mpv_decode_defaults(MpegEncContext *s)
     ff_mpv_common_defaults(s);
 }
 
+void ff_mpv_decode_init(MpegEncContext *s, AVCodecContext *avctx)
+{ /* seed s from avctx: back-pointer, dimensions, codec id, bug workarounds and fourcc */
+    s->avctx           = avctx; /* the pointer is stored, avctx itself is not copied */
+    s->width           = avctx->coded_width;  /* taken from coded_width, not avctx->width */
+    s->height          = avctx->coded_height; /* taken from coded_height, not avctx->height */
+    s->codec_id        = avctx->codec->id; /* dereferences avctx->codec without a NULL check */
+    s->workaround_bugs = avctx->workaround_bugs;
+
+    /* convert fourcc to upper case */
+    s->codec_tag          = avpriv_toupper4(avctx->codec_tag);
+}
+
 /**
  * Initialize and allocates MpegEncContext fields dependent on the resolution.
  */
@@ -648,44 +702,36 @@ static int init_context_frame(MpegEncContext *s)
     c_size  = s->mb_stride * (s->mb_height + 1);
     yc_size = y_size + 2   * c_size;
 
+    if (s->mb_height & 1)
+        yc_size += 2*s->b8_stride + 2*s->mb_stride;
+
     FF_ALLOCZ_OR_GOTO(s->avctx, s->mb_index2xy, (s->mb_num + 1) * sizeof(int),
                       fail); // error resilience code looks cleaner with this
     for (y = 0; y < s->mb_height; y++)
         for (x = 0; x < s->mb_width; x++)
             s->mb_index2xy[x + y * s->mb_width] = x + y * s->mb_stride;
 
-    s->mb_index2xy[s->mb_height * s->mb_width] =
-        (s->mb_height - 1) * s->mb_stride + s->mb_width; // FIXME really needed?
+    s->mb_index2xy[s->mb_height * s->mb_width] = (s->mb_height - 1) * s->mb_stride + s->mb_width; // FIXME really needed?
 
     if (s->encoding) {
         /* Allocate MV tables */
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->p_mv_table_base,
-                          mv_table_size * 2 * sizeof(int16_t), fail);
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_forw_mv_table_base,
-                          mv_table_size * 2 * sizeof(int16_t), fail);
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_back_mv_table_base,
-                          mv_table_size * 2 * sizeof(int16_t), fail);
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_bidir_forw_mv_table_base,
-                          mv_table_size * 2 * sizeof(int16_t), fail);
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_bidir_back_mv_table_base,
-                          mv_table_size * 2 * sizeof(int16_t), fail);
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_direct_mv_table_base,
-                          mv_table_size * 2 * sizeof(int16_t), fail);
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->p_mv_table_base,                 mv_table_size * 2 * sizeof(int16_t), fail)
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_forw_mv_table_base,            mv_table_size * 2 * sizeof(int16_t), fail)
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_back_mv_table_base,            mv_table_size * 2 * sizeof(int16_t), fail)
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_bidir_forw_mv_table_base,      mv_table_size * 2 * sizeof(int16_t), fail)
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_bidir_back_mv_table_base,      mv_table_size * 2 * sizeof(int16_t), fail)
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->b_direct_mv_table_base,          mv_table_size * 2 * sizeof(int16_t), fail)
         s->p_mv_table            = s->p_mv_table_base + s->mb_stride + 1;
         s->b_forw_mv_table       = s->b_forw_mv_table_base + s->mb_stride + 1;
         s->b_back_mv_table       = s->b_back_mv_table_base + s->mb_stride + 1;
-        s->b_bidir_forw_mv_table = s->b_bidir_forw_mv_table_base +
-                                   s->mb_stride + 1;
-        s->b_bidir_back_mv_table = s->b_bidir_back_mv_table_base +
-                                   s->mb_stride + 1;
+        s->b_bidir_forw_mv_table = s->b_bidir_forw_mv_table_base + s->mb_stride + 1;
+        s->b_bidir_back_mv_table = s->b_bidir_back_mv_table_base + s->mb_stride + 1;
         s->b_direct_mv_table     = s->b_direct_mv_table_base + s->mb_stride + 1;
 
         /* Allocate MB type table */
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->mb_type, mb_array_size *
-                          sizeof(uint16_t), fail); // needed for encoding
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->mb_type, mb_array_size * sizeof(uint16_t), fail) // needed for encoding
 
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->lambda_table, mb_array_size *
-                          sizeof(int), fail);
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->lambda_table, mb_array_size * sizeof(int), fail)
 
         FF_ALLOC_OR_GOTO(s->avctx, s->cplx_tab,
                          mb_array_size * sizeof(float), fail);
@@ -708,34 +754,27 @@ static int init_context_frame(MpegEncContext *s)
                     s->b_field_mv_table[i][j][k] = s->b_field_mv_table_base[i][j][k] +
                                                    s->mb_stride + 1;
                 }
-                FF_ALLOCZ_OR_GOTO(s->avctx, s->b_field_select_table [i][j],
-                                  mb_array_size * 2 * sizeof(uint8_t), fail);
-                FF_ALLOCZ_OR_GOTO(s->avctx, s->p_field_mv_table_base[i][j],
-                                  mv_table_size * 2 * sizeof(int16_t), fail);
-                s->p_field_mv_table[i][j] = s->p_field_mv_table_base[i][j]
-                                            + s->mb_stride + 1;
+                FF_ALLOCZ_OR_GOTO(s->avctx, s->b_field_select_table [i][j], mb_array_size * 2 * sizeof(uint8_t), fail)
+                FF_ALLOCZ_OR_GOTO(s->avctx, s->p_field_mv_table_base[i][j], mv_table_size * 2 * sizeof(int16_t), fail)
+                s->p_field_mv_table[i][j] = s->p_field_mv_table_base[i][j] + s->mb_stride + 1;
             }
-            FF_ALLOCZ_OR_GOTO(s->avctx, s->p_field_select_table[i],
-                              mb_array_size * 2 * sizeof(uint8_t), fail);
+            FF_ALLOCZ_OR_GOTO(s->avctx, s->p_field_select_table[i], mb_array_size * 2 * sizeof(uint8_t), fail)
         }
     }
     if (s->out_format == FMT_H263) {
         /* cbp values */
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->coded_block_base, y_size, fail);
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->coded_block_base, y_size + (s->mb_height&1)*2*s->b8_stride, fail);
         s->coded_block = s->coded_block_base + s->b8_stride + 1;
 
         /* cbp, ac_pred, pred_dir */
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->cbp_table,
-                          mb_array_size * sizeof(uint8_t), fail);
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->pred_dir_table,
-                          mb_array_size * sizeof(uint8_t), fail);
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->cbp_table     , mb_array_size * sizeof(uint8_t), fail);
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->pred_dir_table, mb_array_size * sizeof(uint8_t), fail);
     }
 
     if (s->h263_pred || s->h263_plus || !s->encoding) {
         /* dc values */
-        // MN: we need these for  error resilience of intra-frames
-        FF_ALLOCZ_OR_GOTO(s->avctx, s->dc_val_base,
-                          yc_size * sizeof(int16_t), fail);
+        // MN: we need these for error resilience of intra-frames
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->dc_val_base, yc_size * sizeof(int16_t), fail);
         s->dc_val[0] = s->dc_val_base + s->b8_stride + 1;
         s->dc_val[1] = s->dc_val_base + y_size + s->mb_stride + 1;
         s->dc_val[2] = s->dc_val[1] + c_size;
@@ -743,19 +782,94 @@ static int init_context_frame(MpegEncContext *s)
             s->dc_val_base[i] = 1024;
     }
 
-    /* which mb is a intra block */
+    /* which mb is an intra block */
     FF_ALLOCZ_OR_GOTO(s->avctx, s->mbintra_table, mb_array_size, fail);
     memset(s->mbintra_table, 1, mb_array_size);
 
     /* init macroblock skip table */
     FF_ALLOCZ_OR_GOTO(s->avctx, s->mbskip_table, mb_array_size + 2, fail);
-    // Note the + 1 is for  a quicker MPEG-4 slice_end detection
+    // Note the + 1 is for a quicker MPEG-4 slice_end detection
 
     return ff_mpeg_er_init(s);
 fail:
     return AVERROR(ENOMEM);
 }
 
+static void clear_context(MpegEncContext *s)
+{ /* zero/NULL the fields below without freeing anything; ff_mpv_common_init() calls this before allocating */
+    int i, j, k;
+
+    memset(&s->next_picture, 0, sizeof(s->next_picture));
+    memset(&s->last_picture, 0, sizeof(s->last_picture));
+    memset(&s->current_picture, 0, sizeof(s->current_picture));
+    memset(&s->new_picture, 0, sizeof(s->new_picture));
+
+    memset(s->thread_context, 0, sizeof(s->thread_context)); /* ff_mpv_common_init() later sets slot 0 to s */
+    /* from here on pointers are only overwritten with NULL (pblocks via memset); no free() is issued */
+    s->me.map = NULL;
+    s->me.score_map = NULL;
+    s->dct_error_sum = NULL;
+    s->block = NULL;
+    s->blocks = NULL;
+    memset(s->pblocks, 0, sizeof(s->pblocks));
+    s->ac_val_base = NULL;
+    s->ac_val[0] =
+    s->ac_val[1] =
+    s->ac_val[2] =NULL;
+    s->sc.edge_emu_buffer = NULL;
+    s->me.scratchpad = NULL;
+    s->me.temp =
+    s->sc.rd_scratchpad =
+    s->sc.b_scratchpad =
+    s->sc.obmc_scratchpad = NULL;
+
+    /* NOTE(review): presumably keeps the fail path of ff_mpv_common_init() from freeing stale pointers - confirm */
+    s->bitstream_buffer = NULL;
+    s->allocated_bitstream_buffer_size = 0; /* the only non-pointer scalar reset in this function */
+    s->picture          = NULL;
+    s->mb_type          = NULL;
+    s->p_mv_table_base  = NULL;
+    s->b_forw_mv_table_base = NULL;
+    s->b_back_mv_table_base = NULL;
+    s->b_bidir_forw_mv_table_base = NULL;
+    s->b_bidir_back_mv_table_base = NULL;
+    s->b_direct_mv_table_base = NULL;
+    s->p_mv_table            = NULL;
+    s->b_forw_mv_table       = NULL;
+    s->b_back_mv_table       = NULL;
+    s->b_bidir_forw_mv_table = NULL;
+    s->b_bidir_back_mv_table = NULL;
+    s->b_direct_mv_table     = NULL;
+    for (i = 0; i < 2; i++) { /* the field tables are 2, 2x2 and 2x2x2 arrays of pointers */
+        for (j = 0; j < 2; j++) {
+            for (k = 0; k < 2; k++) {
+                s->b_field_mv_table_base[i][j][k] = NULL;
+                s->b_field_mv_table[i][j][k] = NULL;
+            }
+            s->b_field_select_table[i][j] = NULL;
+            s->p_field_mv_table_base[i][j] = NULL;
+            s->p_field_mv_table[i][j] = NULL;
+        }
+        s->p_field_select_table[i] = NULL;
+    }
+
+    s->dc_val_base = NULL;
+    s->coded_block_base = NULL;
+    s->mbintra_table = NULL;
+    s->cbp_table = NULL;
+    s->pred_dir_table = NULL;
+
+    s->mbskip_table = NULL;
+
+    s->er.error_status_table = NULL;
+    s->er.er_temp_buffer = NULL;
+    s->mb_index2xy = NULL;
+    s->lambda_table = NULL;
+
+    s->cplx_tab = NULL;
+    s->bits_tab = NULL;
+}
+
 /**
  * init common structure for both encoder and decoder.
  * this assumes that some variables like width/height are already set
@@ -767,6 +881,8 @@ av_cold int ff_mpv_common_init(MpegEncContext *s)
                      s->avctx->active_thread_type & FF_THREAD_SLICE) ?
                     s->avctx->thread_count : 1;
 
+    clear_context(s);
+
     if (s->encoding && s->avctx->slices)
         nb_slices = s->avctx->slices;
 
@@ -799,12 +915,10 @@ av_cold int ff_mpv_common_init(MpegEncContext *s)
     dct_init(s);
 
     /* set chroma shifts */
-    av_pix_fmt_get_chroma_sub_sample(s->avctx->pix_fmt,
-                                     &s->chroma_x_shift,
-                                     &s->chroma_y_shift);
+    avcodec_get_chroma_sub_sample(s->avctx->pix_fmt,
+                                  &s->chroma_x_shift,
+                                  &s->chroma_y_shift);
 
-    /* convert fourcc to upper case */
-    s->codec_tag          = avpriv_toupper4(s->avctx->codec_tag);
 
     FF_ALLOCZ_OR_GOTO(s->avctx, s->picture,
                       MAX_PICTURE_COUNT * sizeof(Picture), fail);
@@ -813,10 +927,6 @@ av_cold int ff_mpv_common_init(MpegEncContext *s)
         if (!s->picture[i].f)
             goto fail;
     }
-    memset(&s->next_picture, 0, sizeof(s->next_picture));
-    memset(&s->last_picture, 0, sizeof(s->last_picture));
-    memset(&s->current_picture, 0, sizeof(s->current_picture));
-    memset(&s->new_picture, 0, sizeof(s->new_picture));
     s->next_picture.f = av_frame_alloc();
     if (!s->next_picture.f)
         goto fail;
@@ -830,24 +940,23 @@ av_cold int ff_mpv_common_init(MpegEncContext *s)
     if (!s->new_picture.f)
         goto fail;
 
-    if (s->width && s->height) {
         if (init_context_frame(s))
             goto fail;
 
         s->parse_context.state = -1;
-    }
 
-    s->context_initialized = 1;
-    s->thread_context[0]   = s;
+        s->context_initialized = 1;
+        memset(s->thread_context, 0, sizeof(s->thread_context));
+        s->thread_context[0]   = s;
 
-    if (s->width && s->height) {
+//     if (s->width && s->height) {
         if (nb_slices > 1) {
-            for (i = 1; i < nb_slices; i++) {
-                s->thread_context[i] = av_malloc(sizeof(MpegEncContext));
-                memcpy(s->thread_context[i], s, sizeof(MpegEncContext));
-            }
-
             for (i = 0; i < nb_slices; i++) {
+                if (i) {
+                    s->thread_context[i] = av_memdup(s, sizeof(MpegEncContext));
+                    if (!s->thread_context[i])
+                        goto fail;
+                }
                 if (init_duplicate_context(s->thread_context[i]) < 0)
                     goto fail;
                     s->thread_context[i]->start_mb_y =
@@ -862,7 +971,7 @@ av_cold int ff_mpv_common_init(MpegEncContext *s)
             s->end_mb_y   = s->mb_height;
         }
         s->slice_context_count = nb_slices;
-    }
+//     }
 
     return 0;
  fail:
@@ -917,6 +1026,7 @@ static void free_context_frame(MpegEncContext *s)
     av_freep(&s->er.er_temp_buffer);
     av_freep(&s->mb_index2xy);
     av_freep(&s->lambda_table);
+
     av_freep(&s->cplx_tab);
     av_freep(&s->bits_tab);
 
@@ -927,6 +1037,9 @@ int ff_mpv_common_frame_size_change(MpegEncContext *s)
 {
     int i, err = 0;
 
+    if (!s->context_initialized)
+        return AVERROR(EINVAL);
+
     if (s->slice_context_count > 1) {
         for (i = 0; i < s->slice_context_count; i++) {
             free_duplicate_context(s->thread_context[i]);
@@ -961,17 +1074,20 @@ int ff_mpv_common_frame_size_change(MpegEncContext *s)
     if ((err = init_context_frame(s)))
         goto fail;
 
+    memset(s->thread_context, 0, sizeof(s->thread_context));
     s->thread_context[0]   = s;
 
     if (s->width && s->height) {
         int nb_slices = s->slice_context_count;
         if (nb_slices > 1) {
-            for (i = 1; i < nb_slices; i++) {
-                s->thread_context[i] = av_malloc(sizeof(MpegEncContext));
-                memcpy(s->thread_context[i], s, sizeof(MpegEncContext));
-            }
-
             for (i = 0; i < nb_slices; i++) {
+                if (i) {
+                    s->thread_context[i] = av_memdup(s, sizeof(MpegEncContext));
+                    if (!s->thread_context[i]) {
+                        err = AVERROR(ENOMEM);
+                        goto fail;
+                    }
+                }
                 if ((err = init_duplicate_context(s->thread_context[i])) < 0)
                     goto fail;
                     s->thread_context[i]->start_mb_y =
@@ -980,7 +1096,8 @@ int ff_mpv_common_frame_size_change(MpegEncContext *s)
                         (s->mb_height * (i + 1) + nb_slices / 2) / nb_slices;
             }
         } else {
-            if (init_duplicate_context(s) < 0)
+            err = init_duplicate_context(s);
+            if (err < 0)
                 goto fail;
             s->start_mb_y = 0;
             s->end_mb_y   = s->mb_height;
@@ -999,6 +1116,9 @@ void ff_mpv_common_end(MpegEncContext *s)
 {
     int i;
 
+    if (!s)
+        return ;
+
     if (s->slice_context_count > 1) {
         for (i = 0; i < s->slice_context_count; i++) {
             free_duplicate_context(s->thread_context[i]);
@@ -1045,6 +1165,23 @@ void ff_mpv_common_end(MpegEncContext *s)
     s->linesize = s->uvlinesize = 0;
 }
 
+/* Fill planes 0, 1 and 2 of frame with 0x80; ff_mpv_frame_start() calls this when FF_DEBUG_NOMC is set. */
+static void gray_frame(AVFrame *frame)
+{
+    int i, h_chroma_shift, v_chroma_shift;
+
+    av_pix_fmt_get_chroma_sub_sample(frame->format, &h_chroma_shift, &v_chroma_shift);
+    /* NOTE(review): data[1]/data[2] are written without the data[2] check the dummy-picture fill uses - confirm */
+    for(i=0; i<frame->height; i++) /* plane 0: frame->width bytes in each of frame->height rows */
+        memset(frame->data[0] + frame->linesize[0]*i, 0x80, frame->width);
+    for(i=0; i<AV_CEIL_RSHIFT(frame->height, v_chroma_shift); i++) { /* planes 1 and 2: shifted row count */
+        memset(frame->data[1] + frame->linesize[1]*i,
+               0x80, AV_CEIL_RSHIFT(frame->width, h_chroma_shift));
+        memset(frame->data[2] + frame->linesize[2]*i,
+               0x80, AV_CEIL_RSHIFT(frame->width, h_chroma_shift));
+    }
+}
+
 /**
  * generic function called after decoding
  * the header and before a frame is decoded.
@@ -1055,6 +1192,11 @@ int ff_mpv_frame_start(MpegEncContext *s, AVCodecContext *avctx)
     Picture *pic;
     s->mb_skipped = 0;
 
+    if (!ff_thread_can_start_frame(avctx)) {
+        av_log(avctx, AV_LOG_ERROR, "Attempt to start a frame outside SETUP state\n");
+        return -1;
+    }
+
     /* mark & release old frames */
     if (s->pict_type != AV_PICTURE_TYPE_B && s->last_picture_ptr &&
         s->last_picture_ptr != s->next_picture_ptr &&
@@ -1073,6 +1215,8 @@ int ff_mpv_frame_start(MpegEncContext *s, AVCodecContext *avctx)
     }
 
     ff_mpeg_unref_picture(s->avctx, &s->current_picture);
+    ff_mpeg_unref_picture(s->avctx, &s->last_picture);
+    ff_mpeg_unref_picture(s->avctx, &s->next_picture);
 
     /* release non reference frames */
     for (i = 0; i < MAX_PICTURE_COUNT; i++) {
@@ -1081,7 +1225,7 @@ int ff_mpv_frame_start(MpegEncContext *s, AVCodecContext *avctx)
     }
 
     if (s->current_picture_ptr && !s->current_picture_ptr->f->buf[0]) {
-        // we already have a unused image
+        // we already have an unused image
         // (maybe it was set before reading the header)
         pic = s->current_picture_ptr;
     } else {
@@ -1144,11 +1288,14 @@ int ff_mpv_frame_start(MpegEncContext *s, AVCodecContext *avctx)
         int h_chroma_shift, v_chroma_shift;
         av_pix_fmt_get_chroma_sub_sample(s->avctx->pix_fmt,
                                          &h_chroma_shift, &v_chroma_shift);
-        if (s->pict_type != AV_PICTURE_TYPE_I)
+        if (s->pict_type == AV_PICTURE_TYPE_B && s->next_picture_ptr && s->next_picture_ptr->f->buf[0])
+            av_log(avctx, AV_LOG_DEBUG,
+                   "allocating dummy last picture for B frame\n");
+        else if (s->pict_type != AV_PICTURE_TYPE_I)
             av_log(avctx, AV_LOG_ERROR,
                    "warning: first frame is no keyframe\n");
         else if (s->picture_structure != PICT_FRAME)
-            av_log(avctx, AV_LOG_INFO,
+            av_log(avctx, AV_LOG_DEBUG,
                    "allocate dummy last picture for field based first keyframe\n");
 
         /* Allocate a dummy frame */
@@ -1160,21 +1307,36 @@ int ff_mpv_frame_start(MpegEncContext *s, AVCodecContext *avctx)
         s->last_picture_ptr = &s->picture[i];
 
         s->last_picture_ptr->reference   = 3;
-        s->last_picture_ptr->f->pict_type = AV_PICTURE_TYPE_I;
+        s->last_picture_ptr->f->key_frame = 0;
+        s->last_picture_ptr->f->pict_type = AV_PICTURE_TYPE_P;
 
         if (alloc_picture(s, s->last_picture_ptr, 0) < 0) {
             s->last_picture_ptr = NULL;
             return -1;
         }
 
-        memset(s->last_picture_ptr->f->data[0], 0,
-               avctx->height * s->last_picture_ptr->f->linesize[0]);
-        memset(s->last_picture_ptr->f->data[1], 0x80,
-               (avctx->height >> v_chroma_shift) *
-               s->last_picture_ptr->f->linesize[1]);
-        memset(s->last_picture_ptr->f->data[2], 0x80,
-               (avctx->height >> v_chroma_shift) *
-               s->last_picture_ptr->f->linesize[2]);
+        if (!avctx->hwaccel
+#if FF_API_CAP_VDPAU
+            && !(avctx->codec->capabilities&AV_CODEC_CAP_HWACCEL_VDPAU)
+#endif
+            ) {
+            for(i=0; i<avctx->height; i++)
+                memset(s->last_picture_ptr->f->data[0] + s->last_picture_ptr->f->linesize[0]*i,
+                       0x80, avctx->width);
+            if (s->last_picture_ptr->f->data[2]) {
+                for(i=0; i<AV_CEIL_RSHIFT(avctx->height, v_chroma_shift); i++) {
+                    memset(s->last_picture_ptr->f->data[1] + s->last_picture_ptr->f->linesize[1]*i,
+                        0x80, AV_CEIL_RSHIFT(avctx->width, h_chroma_shift));
+                    memset(s->last_picture_ptr->f->data[2] + s->last_picture_ptr->f->linesize[2]*i,
+                        0x80, AV_CEIL_RSHIFT(avctx->width, h_chroma_shift));
+                }
+            }
+
+            if(s->codec_id == AV_CODEC_ID_FLV1 || s->codec_id == AV_CODEC_ID_H263){
+                for(i=0; i<avctx->height; i++)
+                memset(s->last_picture_ptr->f->data[0] + s->last_picture_ptr->f->linesize[0]*i, 16, avctx->width);
+            }
+        }
 
         ff_thread_report_progress(&s->last_picture_ptr->tf, INT_MAX, 0);
         ff_thread_report_progress(&s->last_picture_ptr->tf, INT_MAX, 1);
@@ -1190,7 +1352,8 @@ int ff_mpv_frame_start(MpegEncContext *s, AVCodecContext *avctx)
         s->next_picture_ptr = &s->picture[i];
 
         s->next_picture_ptr->reference   = 3;
-        s->next_picture_ptr->f->pict_type = AV_PICTURE_TYPE_I;
+        s->next_picture_ptr->f->key_frame = 0;
+        s->next_picture_ptr->f->pict_type = AV_PICTURE_TYPE_P;
 
         if (alloc_picture(s, s->next_picture_ptr, 0) < 0) {
             s->next_picture_ptr = NULL;
@@ -1200,27 +1363,25 @@ int ff_mpv_frame_start(MpegEncContext *s, AVCodecContext *avctx)
         ff_thread_report_progress(&s->next_picture_ptr->tf, INT_MAX, 1);
     }
 
+#if 0 // BUFREF-FIXME
+    memset(s->last_picture.f->data, 0, sizeof(s->last_picture.f->data));
+    memset(s->next_picture.f->data, 0, sizeof(s->next_picture.f->data));
+#endif
     if (s->last_picture_ptr) {
-        ff_mpeg_unref_picture(s->avctx, &s->last_picture);
         if (s->last_picture_ptr->f->buf[0] &&
             (ret = ff_mpeg_ref_picture(s->avctx, &s->last_picture,
                                        s->last_picture_ptr)) < 0)
             return ret;
     }
     if (s->next_picture_ptr) {
-        ff_mpeg_unref_picture(s->avctx, &s->next_picture);
         if (s->next_picture_ptr->f->buf[0] &&
             (ret = ff_mpeg_ref_picture(s->avctx, &s->next_picture,
                                        s->next_picture_ptr)) < 0)
             return ret;
     }
 
-    if (s->pict_type != AV_PICTURE_TYPE_I &&
-        !(s->last_picture_ptr && s->last_picture_ptr->f->buf[0])) {
-        av_log(s, AV_LOG_ERROR,
-               "Non-reference picture received and no reference available\n");
-        return AVERROR_INVALIDDATA;
-    }
+    av_assert0(s->pict_type == AV_PICTURE_TYPE_I || (s->last_picture_ptr &&
+                                                 s->last_picture_ptr->f->buf[0]));
 
     if (s->picture_structure!= PICT_FRAME) {
         int i;
@@ -1249,12 +1410,9 @@ int ff_mpv_frame_start(MpegEncContext *s, AVCodecContext *avctx)
         s->dct_unquantize_inter = s->dct_unquantize_mpeg1_inter;
     }
 
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
-    if (CONFIG_MPEG_XVMC_DECODER && s->avctx->xvmc_acceleration)
-        return ff_xvmc_field_start(s, avctx);
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
+    if (s->avctx->debug & FF_DEBUG_NOMC) {
+        gray_frame(s->current_picture_ptr->f);
+    }
 
     return 0;
 }
@@ -1262,119 +1420,943 @@ FF_ENABLE_DEPRECATION_WARNINGS
 /* called after a frame has been decoded. */
 void ff_mpv_frame_end(MpegEncContext *s)
 {
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
-    /* redraw edges for the frame if decoding didn't complete */
-    // just to make sure that all data is rendered.
-    if (CONFIG_MPEG_XVMC_DECODER && s->avctx->xvmc_acceleration) {
-        ff_xvmc_field_end(s);
-    } else
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
-
     emms_c();
 
     if (s->current_picture.reference)
         ff_thread_report_progress(&s->current_picture_ptr->tf, INT_MAX, 0);
 }
 
+
+#if FF_API_VISMV
+static int clip_line(int *sx, int *sy, int *ex, int *ey, int maxx)
+{
+    if(*sx > *ex)
+        return clip_line(ex, ey, sx, sy, maxx);
+    /* From here *sx <= *ex. Returns 1 when the segment misses [0, maxx] entirely. */
+    if (*sx < 0) {
+        if (*ex < 0)
+            return 1;
+        *sy = *ey + (*sy - *ey) * (int64_t)*ex / (*ex - *sx);
+        *sx = 0;
+    }
+    /* Likewise at the upper bound: the end point is interpolated back onto maxx. */
+    if (*ex > maxx) {
+        if (*sx > maxx)
+            return 1;
+        *ey = *sy + (*ey - *sy) * (int64_t)(maxx - *sx) / (*ex - *sx);
+        *ex = maxx;
+    }
+    return 0;
+}
+
+
+/**
+ * Draw a line from (ex, ey) -> (sx, sy).
+ * @param w width of the image
+ * @param h height of the image
+ * @param stride stride/linesize of the image
+ * @param color color of the line (added to the pixels it covers)
+ */
+static void draw_line(uint8_t *buf, int sx, int sy, int ex, int ey,
+                      int w, int h, int stride, int color)
+{
+    int x, y, fr, f;
+
+    if (clip_line(&sx, &sy, &ex, &ey, w - 1))
+        return;
+    if (clip_line(&sy, &sx, &ey, &ex, h - 1))
+        return;
+    /* clamp too, so that all four coordinates are inside the image below */
+    sx = av_clip(sx, 0, w - 1);
+    sy = av_clip(sy, 0, h - 1);
+    ex = av_clip(ex, 0, w - 1);
+    ey = av_clip(ey, 0, h - 1);
+
+    buf[sy * stride + sx] += color;
+    /* step along the longer axis; f is the other axis' slope in 16.16 fixed point */
+    if (FFABS(ex - sx) > FFABS(ey - sy)) {
+        if (sx > ex) {
+            FFSWAP(int, sx, ex);
+            FFSWAP(int, sy, ey);
+        }
+        buf += sx + sy * stride;
+        ex  -= sx;
+        f    = ((ey - sy) * (1 << 16)) / ex; // not <<: ey - sy may be negative
+        for (x = 0; x <= ex; x++) {
+            y  = (x * f) >> 16;
+            fr = (x * f) & 0xFFFF;
+            buf[y * stride + x]       += (color * (0x10000 - fr)) >> 16;
+            if(fr) buf[(y + 1) * stride + x] += (color *            fr ) >> 16;
+        }
+    } else {
+        if (sy > ey) {
+            FFSWAP(int, sx, ex);
+            FFSWAP(int, sy, ey);
+        }
+        buf += sx + sy * stride;
+        ey  -= sy;
+        if (ey)
+            f = ((ex - sx) * (1 << 16)) / ey; // not <<: ex - sx may be negative
+        else
+            f = 0;
+        for(y= 0; y <= ey; y++){
+            x  = (y*f) >> 16;
+            fr = (y*f) & 0xFFFF;
+            buf[y * stride + x]     += (color * (0x10000 - fr)) >> 16;
+            if(fr) buf[y * stride + x + 1] += (color *            fr ) >> 16;
+        }
+    }
+}
+
+/**
+ * Draw an arrow from (ex, ey) -> (sx, sy).
+ * @param w width of the image
+ * @param h height of the image
+ * @param stride stride/linesize of the image
+ * @param color color of the arrow
+ */
+static void draw_arrow(uint8_t *buf, int sx, int sy, int ex,
+                       int ey, int w, int h, int stride, int color, int tail, int direction)
+{
+    int dx,dy;
+    /* direction != 0 swaps the two endpoints before anything is drawn */
+    if (direction) {
+        FFSWAP(int, sx, ex);
+        FFSWAP(int, sy, ey);
+    }
+    /* keep both endpoints within 100 pixels of the image area */
+    sx = av_clip(sx, -100, w + 100);
+    sy = av_clip(sy, -100, h + 100);
+    ex = av_clip(ex, -100, w + 100);
+    ey = av_clip(ey, -100, h + 100);
+
+    dx = ex - sx;
+    dy = ey - sy;
+    /* the two head strokes at (sx, sy) are only added for vectors longer than 3 pixels */
+    if (dx * dx + dy * dy > 3 * 3) {
+        int rx =  dx + dy;
+        int ry = -dx + dy;
+        int length = ff_sqrt((rx * rx + ry * ry) << 8);
+
+        // FIXME subpixel accuracy
+        rx = ROUNDED_DIV(rx * 3 << 4, length);
+        ry = ROUNDED_DIV(ry * 3 << 4, length);
+        /* tail != 0 points the two strokes the opposite way */
+        if (tail) {
+            rx = -rx;
+            ry = -ry;
+        }
+
+        draw_line(buf, sx, sy, sx + rx, sy + ry, w, h, stride, color);
+        draw_line(buf, sx, sy, sx - ry, sy + rx, w, h, stride, color);
+    }
+    draw_line(buf, sx, sy, ex, ey, w, h, stride, color);
+}
+#endif
+/* Fill *mb with one motion vector entry (block size, vector, src/dst position); always returns 1. */
+static int add_mb(AVMotionVector *mb, uint32_t mb_type,
+                  int dst_x, int dst_y,
+                  int motion_x, int motion_y, int motion_scale,
+                  int direction)
+{
+    mb->w = IS_8X8(mb_type) || IS_8X16(mb_type) ? 8 : 16;
+    mb->h = IS_8X8(mb_type) || IS_16X8(mb_type) ? 8 : 16;
+    mb->motion_x = motion_x;
+    mb->motion_y = motion_y;
+    mb->motion_scale = motion_scale;
+    mb->dst_x = dst_x;
+    mb->dst_y = dst_y;
+    mb->src_x = dst_x + motion_x / motion_scale;
+    mb->src_y = dst_y + motion_y / motion_scale;
+    mb->source = direction ? 1 : -1;
+    mb->flags = 0; // XXX: does mb_type contain extra information that could be exported here?
+    return 1;
+}
+
 /**
  * Print debugging info for the given picture.
  */
-void ff_print_debug_info(MpegEncContext *s, Picture *p)
+void ff_print_debug_info2(AVCodecContext *avctx, AVFrame *pict, uint8_t *mbskip_table,
+                         uint32_t *mbtype_table, int8_t *qscale_table, int16_t (*motion_val[2])[2],
+                         int *low_delay,
+                         int mb_width, int mb_height, int mb_stride, int quarter_sample)
 {
-    AVFrame *pict;
-    if (s->avctx->hwaccel || !p || !p->mb_type)
+    if ((avctx->flags2 & AV_CODEC_FLAG2_EXPORT_MVS) && mbtype_table && motion_val[0]) {
+        const int shift = 1 + quarter_sample;
+        const int scale = 1 << shift;
+        const int mv_sample_log2 = avctx->codec_id == AV_CODEC_ID_H264 || avctx->codec_id == AV_CODEC_ID_SVQ3 ? 2 : 1;
+        const int mv_stride      = (mb_width << mv_sample_log2) +
+                                   (avctx->codec->id == AV_CODEC_ID_H264 ? 0 : 1);
+        int mb_x, mb_y, mbcount = 0;
+
+        /* size is width * height * 2 * 4 where 2 is for directions and 4 is
+         * for the maximum number of MB (4 MB in case of IS_8x8) */
+        AVMotionVector *mvs = av_malloc_array(mb_width * mb_height, 2 * 4 * sizeof(AVMotionVector));
+        if (!mvs)
+            return;
+
+        for (mb_y = 0; mb_y < mb_height; mb_y++) {
+            for (mb_x = 0; mb_x < mb_width; mb_x++) {
+                int i, direction, mb_type = mbtype_table[mb_x + mb_y * mb_stride];
+                for (direction = 0; direction < 2; direction++) {
+                    if (!USES_LIST(mb_type, direction))
+                        continue;
+                    if (IS_8X8(mb_type)) {
+                        for (i = 0; i < 4; i++) {
+                            int sx = mb_x * 16 + 4 + 8 * (i & 1);
+                            int sy = mb_y * 16 + 4 + 8 * (i >> 1);
+                            int xy = (mb_x * 2 + (i & 1) +
+                                      (mb_y * 2 + (i >> 1)) * mv_stride) << (mv_sample_log2 - 1);
+                            int mx = motion_val[direction][xy][0];
+                            int my = motion_val[direction][xy][1];
+                            mbcount += add_mb(mvs + mbcount, mb_type, sx, sy, mx, my, scale, direction);
+                        }
+                    } else if (IS_16X8(mb_type)) {
+                        for (i = 0; i < 2; i++) {
+                            int sx = mb_x * 16 + 8;
+                            int sy = mb_y * 16 + 4 + 8 * i;
+                            int xy = (mb_x * 2 + (mb_y * 2 + i) * mv_stride) << (mv_sample_log2 - 1);
+                            int mx = motion_val[direction][xy][0];
+                            int my = motion_val[direction][xy][1];
+
+                            if (IS_INTERLACED(mb_type))
+                                my *= 2;
+
+                            mbcount += add_mb(mvs + mbcount, mb_type, sx, sy, mx, my, scale, direction);
+                        }
+                    } else if (IS_8X16(mb_type)) {
+                        for (i = 0; i < 2; i++) {
+                            int sx = mb_x * 16 + 4 + 8 * i;
+                            int sy = mb_y * 16 + 8;
+                            int xy = (mb_x * 2 + i + mb_y * 2 * mv_stride) << (mv_sample_log2 - 1);
+                            int mx = motion_val[direction][xy][0];
+                            int my = motion_val[direction][xy][1];
+
+                            if (IS_INTERLACED(mb_type))
+                                my *= 2;
+
+                            mbcount += add_mb(mvs + mbcount, mb_type, sx, sy, mx, my, scale, direction);
+                        }
+                    } else {
+                          int sx = mb_x * 16 + 8;
+                          int sy = mb_y * 16 + 8;
+                          int xy = (mb_x + mb_y * mv_stride) << mv_sample_log2;
+                          int mx = motion_val[direction][xy][0];
+                          int my = motion_val[direction][xy][1];
+                          mbcount += add_mb(mvs + mbcount, mb_type, sx, sy, mx, my, scale, direction);
+                    }
+                }
+            }
+        }
+
+        if (mbcount) {
+            AVFrameSideData *sd;
+
+            av_log(avctx, AV_LOG_DEBUG, "Adding %d MVs info to frame %d\n", mbcount, avctx->frame_number);
+            sd = av_frame_new_side_data(pict, AV_FRAME_DATA_MOTION_VECTORS, mbcount * sizeof(AVMotionVector));
+            if (!sd) {
+                av_freep(&mvs);
+                return;
+            }
+            memcpy(sd->data, mvs, mbcount * sizeof(AVMotionVector));
+        }
+
+        av_freep(&mvs);
+    }
+
+    /* TODO: export all the following to make them accessible for users (and filters) */
+    if (avctx->hwaccel || !mbtype_table
+#if FF_API_CAP_VDPAU
+        || (avctx->codec->capabilities&AV_CODEC_CAP_HWACCEL_VDPAU)
+#endif
+        )
         return;
-    pict = p->f;
 
-    if (s->avctx->debug & (FF_DEBUG_SKIP | FF_DEBUG_QP | FF_DEBUG_MB_TYPE)) {
+
+    if (avctx->debug & (FF_DEBUG_SKIP | FF_DEBUG_QP | FF_DEBUG_MB_TYPE)) {
         int x,y;
 
-        av_log(s->avctx,AV_LOG_DEBUG,"New frame, type: ");
-        switch (pict->pict_type) {
-        case AV_PICTURE_TYPE_I:
-            av_log(s->avctx,AV_LOG_DEBUG,"I\n");
-            break;
-        case AV_PICTURE_TYPE_P:
-            av_log(s->avctx,AV_LOG_DEBUG,"P\n");
-            break;
-        case AV_PICTURE_TYPE_B:
-            av_log(s->avctx,AV_LOG_DEBUG,"B\n");
-            break;
-        case AV_PICTURE_TYPE_S:
-            av_log(s->avctx,AV_LOG_DEBUG,"S\n");
-            break;
-        case AV_PICTURE_TYPE_SI:
-            av_log(s->avctx,AV_LOG_DEBUG,"SI\n");
-            break;
-        case AV_PICTURE_TYPE_SP:
-            av_log(s->avctx,AV_LOG_DEBUG,"SP\n");
-            break;
-        }
-        for (y = 0; y < s->mb_height; y++) {
-            for (x = 0; x < s->mb_width; x++) {
-                if (s->avctx->debug & FF_DEBUG_SKIP) {
-                    int count = s->mbskip_table[x + y * s->mb_stride];
+        av_log(avctx, AV_LOG_DEBUG, "New frame, type: %c\n",
+               av_get_picture_type_char(pict->pict_type));
+        for (y = 0; y < mb_height; y++) {
+            for (x = 0; x < mb_width; x++) {
+                if (avctx->debug & FF_DEBUG_SKIP) {
+                    int count = mbskip_table ? mbskip_table[x + y * mb_stride] : 0;
                     if (count > 9)
                         count = 9;
-                    av_log(s->avctx, AV_LOG_DEBUG, "%1d", count);
+                    av_log(avctx, AV_LOG_DEBUG, "%1d", count);
                 }
-                if (s->avctx->debug & FF_DEBUG_QP) {
-                    av_log(s->avctx, AV_LOG_DEBUG, "%2d",
-                           p->qscale_table[x + y * s->mb_stride]);
+                if (avctx->debug & FF_DEBUG_QP) {
+                    av_log(avctx, AV_LOG_DEBUG, "%2d",
+                           qscale_table[x + y * mb_stride]);
                 }
-                if (s->avctx->debug & FF_DEBUG_MB_TYPE) {
-                    int mb_type = p->mb_type[x + y * s->mb_stride];
+                if (avctx->debug & FF_DEBUG_MB_TYPE) {
+                    int mb_type = mbtype_table[x + y * mb_stride];
                     // Type & MV direction
                     if (IS_PCM(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "P");
+                        av_log(avctx, AV_LOG_DEBUG, "P");
                     else if (IS_INTRA(mb_type) && IS_ACPRED(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "A");
+                        av_log(avctx, AV_LOG_DEBUG, "A");
                     else if (IS_INTRA4x4(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "i");
+                        av_log(avctx, AV_LOG_DEBUG, "i");
                     else if (IS_INTRA16x16(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "I");
+                        av_log(avctx, AV_LOG_DEBUG, "I");
                     else if (IS_DIRECT(mb_type) && IS_SKIP(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "d");
+                        av_log(avctx, AV_LOG_DEBUG, "d");
                     else if (IS_DIRECT(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "D");
+                        av_log(avctx, AV_LOG_DEBUG, "D");
                     else if (IS_GMC(mb_type) && IS_SKIP(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "g");
+                        av_log(avctx, AV_LOG_DEBUG, "g");
                     else if (IS_GMC(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "G");
+                        av_log(avctx, AV_LOG_DEBUG, "G");
                     else if (IS_SKIP(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "S");
+                        av_log(avctx, AV_LOG_DEBUG, "S");
                     else if (!USES_LIST(mb_type, 1))
-                        av_log(s->avctx, AV_LOG_DEBUG, ">");
+                        av_log(avctx, AV_LOG_DEBUG, ">");
                     else if (!USES_LIST(mb_type, 0))
-                        av_log(s->avctx, AV_LOG_DEBUG, "<");
+                        av_log(avctx, AV_LOG_DEBUG, "<");
                     else {
-                        assert(USES_LIST(mb_type, 0) && USES_LIST(mb_type, 1));
-                        av_log(s->avctx, AV_LOG_DEBUG, "X");
+                        av_assert2(USES_LIST(mb_type, 0) && USES_LIST(mb_type, 1));
+                        av_log(avctx, AV_LOG_DEBUG, "X");
                     }
 
                     // segmentation
                     if (IS_8X8(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "+");
+                        av_log(avctx, AV_LOG_DEBUG, "+");
                     else if (IS_16X8(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "-");
+                        av_log(avctx, AV_LOG_DEBUG, "-");
                     else if (IS_8X16(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "|");
+                        av_log(avctx, AV_LOG_DEBUG, "|");
                     else if (IS_INTRA(mb_type) || IS_16X16(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, " ");
+                        av_log(avctx, AV_LOG_DEBUG, " ");
                     else
-                        av_log(s->avctx, AV_LOG_DEBUG, "?");
+                        av_log(avctx, AV_LOG_DEBUG, "?");
 
 
                     if (IS_INTERLACED(mb_type))
-                        av_log(s->avctx, AV_LOG_DEBUG, "=");
+                        av_log(avctx, AV_LOG_DEBUG, "=");
                     else
-                        av_log(s->avctx, AV_LOG_DEBUG, " ");
+                        av_log(avctx, AV_LOG_DEBUG, " ");
+                }
+            }
+            av_log(avctx, AV_LOG_DEBUG, "\n");
+        }
+    }
+
+    if ((avctx->debug & (FF_DEBUG_VIS_QP | FF_DEBUG_VIS_MB_TYPE)) ||
+        (avctx->debug_mv)) {
+        int mb_y;
+        int i;
+        int h_chroma_shift, v_chroma_shift, block_height;
+#if FF_API_VISMV
+        const int shift = 1 + quarter_sample;
+        uint8_t *ptr;
+        const int width          = avctx->width;
+        const int height         = avctx->height;
+#endif
+        const int mv_sample_log2 = avctx->codec_id == AV_CODEC_ID_H264 || avctx->codec_id == AV_CODEC_ID_SVQ3 ? 2 : 1;
+        const int mv_stride      = (mb_width << mv_sample_log2) +
+                                   (avctx->codec->id == AV_CODEC_ID_H264 ? 0 : 1);
+
+        if (low_delay)
+            *low_delay = 0; // needed to see the vectors without trashing the buffers
+
+        avcodec_get_chroma_sub_sample(avctx->pix_fmt, &h_chroma_shift, &v_chroma_shift);
+
+        av_frame_make_writable(pict);
+
+        pict->opaque = NULL;
+#if FF_API_VISMV
+        ptr          = pict->data[0];
+#endif
+        block_height = 16 >> v_chroma_shift;
+
+        for (mb_y = 0; mb_y < mb_height; mb_y++) {
+            int mb_x;
+            for (mb_x = 0; mb_x < mb_width; mb_x++) {
+                const int mb_index = mb_x + mb_y * mb_stride;
+#if FF_API_VISMV
+                if ((avctx->debug_mv) && motion_val[0]) {
+                    int type;
+                    for (type = 0; type < 3; type++) {
+                        int direction = 0;
+                        switch (type) {
+                        case 0:
+                            if ((!(avctx->debug_mv & FF_DEBUG_VIS_MV_P_FOR)) ||
+                                (pict->pict_type!= AV_PICTURE_TYPE_P))
+                                continue;
+                            direction = 0;
+                            break;
+                        case 1:
+                            if ((!(avctx->debug_mv & FF_DEBUG_VIS_MV_B_FOR)) ||
+                                (pict->pict_type!= AV_PICTURE_TYPE_B))
+                                continue;
+                            direction = 0;
+                            break;
+                        case 2:
+                            if ((!(avctx->debug_mv & FF_DEBUG_VIS_MV_B_BACK)) ||
+                                (pict->pict_type!= AV_PICTURE_TYPE_B))
+                                continue;
+                            direction = 1;
+                            break;
+                        }
+                        if (!USES_LIST(mbtype_table[mb_index], direction))
+                            continue;
+
+                        if (IS_8X8(mbtype_table[mb_index])) {
+                            int i;
+                            for (i = 0; i < 4; i++) {
+                                int sx = mb_x * 16 + 4 + 8 * (i & 1);
+                                int sy = mb_y * 16 + 4 + 8 * (i >> 1);
+                                int xy = (mb_x * 2 + (i & 1) +
+                                          (mb_y * 2 + (i >> 1)) * mv_stride) << (mv_sample_log2 - 1);
+                                int mx = (motion_val[direction][xy][0] >> shift) + sx;
+                                int my = (motion_val[direction][xy][1] >> shift) + sy;
+                                draw_arrow(ptr, sx, sy, mx, my, width,
+                                           height, pict->linesize[0], 100, 0, direction);
+                            }
+                        } else if (IS_16X8(mbtype_table[mb_index])) {
+                            int i;
+                            for (i = 0; i < 2; i++) {
+                                int sx = mb_x * 16 + 8;
+                                int sy = mb_y * 16 + 4 + 8 * i;
+                                int xy = (mb_x * 2 + (mb_y * 2 + i) * mv_stride) << (mv_sample_log2 - 1);
+                                int mx = (motion_val[direction][xy][0] >> shift);
+                                int my = (motion_val[direction][xy][1] >> shift);
+
+                                if (IS_INTERLACED(mbtype_table[mb_index]))
+                                    my *= 2;
+
+                                draw_arrow(ptr, sx, sy, mx + sx, my + sy, width,
+                                           height, pict->linesize[0], 100, 0, direction);
+                            }
+                        } else if (IS_8X16(mbtype_table[mb_index])) {
+                            int i;
+                            for (i = 0; i < 2; i++) {
+                                int sx = mb_x * 16 + 4 + 8 * i;
+                                int sy = mb_y * 16 + 8;
+                                int xy = (mb_x * 2 + i + mb_y * 2 * mv_stride) << (mv_sample_log2 - 1);
+                                int mx = motion_val[direction][xy][0] >> shift;
+                                int my = motion_val[direction][xy][1] >> shift;
+
+                                if (IS_INTERLACED(mbtype_table[mb_index]))
+                                    my *= 2;
+
+                                draw_arrow(ptr, sx, sy, mx + sx, my + sy, width,
+                                           height, pict->linesize[0], 100, 0, direction);
+                            }
+                        } else {
+                              int sx= mb_x * 16 + 8;
+                              int sy= mb_y * 16 + 8;
+                              int xy= (mb_x + mb_y * mv_stride) << mv_sample_log2;
+                              int mx= (motion_val[direction][xy][0]>>shift) + sx;
+                              int my= (motion_val[direction][xy][1]>>shift) + sy;
+                              draw_arrow(ptr, sx, sy, mx, my, width, height, pict->linesize[0], 100, 0, direction);
+                        }
+                    }
+                }
+#endif
+                if ((avctx->debug & FF_DEBUG_VIS_QP)) {
+                    uint64_t c = (qscale_table[mb_index] * 128 / 31) *
+                                 0x0101010101010101ULL;
+                    int y;
+                    for (y = 0; y < block_height; y++) {
+                        *(uint64_t *)(pict->data[1] + 8 * mb_x +
+                                      (block_height * mb_y + y) *
+                                      pict->linesize[1]) = c;
+                        *(uint64_t *)(pict->data[2] + 8 * mb_x +
+                                      (block_height * mb_y + y) *
+                                      pict->linesize[2]) = c;
+                    }
+                }
+                if ((avctx->debug & FF_DEBUG_VIS_MB_TYPE) &&
+                    motion_val[0]) {
+                    int mb_type = mbtype_table[mb_index];
+                    uint64_t u,v;
+                    int y;
+#define COLOR(theta, r) \
+    u = (int)(128 + r * cos(theta * M_PI / 180)); \
+    v = (int)(128 + r * sin(theta * M_PI / 180));
+
+
+                    u = v = 128;
+                    if (IS_PCM(mb_type)) {
+                        COLOR(120, 48)
+                    } else if ((IS_INTRA(mb_type) && IS_ACPRED(mb_type)) ||
+                               IS_INTRA16x16(mb_type)) {
+                        COLOR(30, 48)
+                    } else if (IS_INTRA4x4(mb_type)) {
+                        COLOR(90, 48)
+                    } else if (IS_DIRECT(mb_type) && IS_SKIP(mb_type)) {
+                        // COLOR(120, 48)
+                    } else if (IS_DIRECT(mb_type)) {
+                        COLOR(150, 48)
+                    } else if (IS_GMC(mb_type) && IS_SKIP(mb_type)) {
+                        COLOR(170, 48)
+                    } else if (IS_GMC(mb_type)) {
+                        COLOR(190, 48)
+                    } else if (IS_SKIP(mb_type)) {
+                        // COLOR(180, 48)
+                    } else if (!USES_LIST(mb_type, 1)) {
+                        COLOR(240, 48)
+                    } else if (!USES_LIST(mb_type, 0)) {
+                        COLOR(0, 48)
+                    } else {
+                        av_assert2(USES_LIST(mb_type, 0) && USES_LIST(mb_type, 1));
+                        COLOR(300,48)
+                    }
+
+                    u *= 0x0101010101010101ULL;
+                    v *= 0x0101010101010101ULL;
+                    for (y = 0; y < block_height; y++) {
+                        *(uint64_t *)(pict->data[1] + 8 * mb_x +
+                                      (block_height * mb_y + y) * pict->linesize[1]) = u;
+                        *(uint64_t *)(pict->data[2] + 8 * mb_x +
+                                      (block_height * mb_y + y) * pict->linesize[2]) = v;
+                    }
+
+                    // segmentation
+                    if (IS_8X8(mb_type) || IS_16X8(mb_type)) {
+                        *(uint64_t *)(pict->data[0] + 16 * mb_x + 0 +
+                                      (16 * mb_y + 8) * pict->linesize[0]) ^= 0x8080808080808080ULL;
+                        *(uint64_t *)(pict->data[0] + 16 * mb_x + 8 +
+                                      (16 * mb_y + 8) * pict->linesize[0]) ^= 0x8080808080808080ULL;
+                    }
+                    if (IS_8X8(mb_type) || IS_8X16(mb_type)) {
+                        for (y = 0; y < 16; y++)
+                            pict->data[0][16 * mb_x + 8 + (16 * mb_y + y) *
+                                          pict->linesize[0]] ^= 0x80;
+                    }
+                    if (IS_8X8(mb_type) && mv_sample_log2 >= 2) {
+                        int dm = 1 << (mv_sample_log2 - 2);
+                        for (i = 0; i < 4; i++) {
+                            int sx = mb_x * 16 + 8 * (i & 1);
+                            int sy = mb_y * 16 + 8 * (i >> 1);
+                            int xy = (mb_x * 2 + (i & 1) +
+                                     (mb_y * 2 + (i >> 1)) * mv_stride) << (mv_sample_log2 - 1);
+                            // FIXME bidir
+                            int32_t *mv = (int32_t *) &motion_val[0][xy];
+                            if (mv[0] != mv[dm] ||
+                                mv[dm * mv_stride] != mv[dm * (mv_stride + 1)])
+                                for (y = 0; y < 8; y++)
+                                    pict->data[0][sx + 4 + (sy + y) * pict->linesize[0]] ^= 0x80;
+                            if (mv[0] != mv[dm * mv_stride] || mv[dm] != mv[dm * (mv_stride + 1)])
+                                *(uint64_t *)(pict->data[0] + sx + (sy + 4) *
+                                              pict->linesize[0]) ^= 0x8080808080808080ULL;
+                        }
+                    }
+
+                    if (IS_INTERLACED(mb_type) &&
+                        avctx->codec->id == AV_CODEC_ID_H264) {
+                        // hmm
+                    }
+                }
+                if (mbskip_table)
+                    mbskip_table[mb_index] = 0;
+            }
+        }
+    }
+}
+/* Convenience wrapper: calls ff_print_debug_info2() with the tables held by s and p. */
+void ff_print_debug_info(MpegEncContext *s, Picture *p, AVFrame *pict)
+{
+    ff_print_debug_info2(s->avctx, pict, s->mbskip_table, p->mb_type,
+                         p->qscale_table, p->motion_val, &s->low_delay,
+                         s->mb_width, s->mb_height, s->mb_stride, s->quarter_sample);
+}
+/* Hand p's qscale table to f via a new buffer reference, skipping its first 2 * mb_stride + 1 bytes. */
+int ff_mpv_export_qp_table(MpegEncContext *s, AVFrame *f, Picture *p, int qp_type)
+{
+    AVBufferRef *ref = av_buffer_ref(p->qscale_table_buf);
+    int offset = 2*s->mb_stride + 1;
+    if(!ref)
+        return AVERROR(ENOMEM);
+    av_assert0(ref->size >= offset + s->mb_stride * ((f->height+15)/16));
+    ref->size -= offset;
+    ref->data += offset;
+    return av_frame_set_qp_table(f, ref, s->mb_stride, qp_type);
+}
+/* Apply one motion vector through pix_op[] for lowres decoding; returns 1 if edge emulation was used. */
+static inline int hpel_motion_lowres(MpegEncContext *s,
+                                     uint8_t *dest, uint8_t *src,
+                                     int field_based, int field_select,
+                                     int src_x, int src_y,
+                                     int width, int height, ptrdiff_t stride,
+                                     int h_edge_pos, int v_edge_pos,
+                                     int w, int h, h264_chroma_mc_func *pix_op,
+                                     int motion_x, int motion_y)
+{
+    const int lowres   = s->avctx->lowres;
+    const int op_index = FFMIN(lowres, 3);
+    const int s_mask   = (2 << lowres) - 1;
+    int emu = 0;
+    int sx, sy;
+
+    if (s->quarter_sample) {
+        motion_x /= 2;
+        motion_y /= 2;
+    }
+    /* low (lowres + 1) bits: sub-sample fraction; remaining bits: whole-sample offset */
+    sx = motion_x & s_mask;
+    sy = motion_y & s_mask;
+    src_x += motion_x >> (lowres + 1);
+    src_y += motion_y >> (lowres + 1);
+
+    src   += src_y * stride + src_x;
+    /* source position fails the edge check: read from an edge-emulated copy instead */
+    if ((unsigned)src_x > FFMAX( h_edge_pos - (!!sx) - w,                 0) ||
+        (unsigned)src_y > FFMAX((v_edge_pos >> field_based) - (!!sy) - h, 0)) {
+        s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, src,
+                                 s->linesize, s->linesize,
+                                 w + 1, (h + 1) << field_based,
+                                 src_x, src_y   << field_based,
+                                 h_edge_pos, v_edge_pos);
+        src = s->sc.edge_emu_buffer;
+        emu = 1;
+    }
+
+    sx = (sx << 2) >> lowres;
+    sy = (sy << 2) >> lowres;
+    if (field_select)
+        src += s->linesize;
+    pix_op[op_index](dest, src, stride, h, sx, sy);
+    return emu;
+}
+
+/* apply one mpeg motion vector to the three components (Y, Cb, Cr) in lowres mode; h is the luma height handed to pix_op */
+static av_always_inline void mpeg_motion_lowres(MpegEncContext *s,
+                                                uint8_t *dest_y,
+                                                uint8_t *dest_cb,
+                                                uint8_t *dest_cr,
+                                                int field_based,   // nonzero: line sizes are doubled (shifted left by this value)
+                                                int bottom_field,  // nonzero: destination pointers start one line lower
+                                                int field_select,  // nonzero: reference pointers start one line lower
+                                                uint8_t **ref_picture,
+                                                h264_chroma_mc_func *pix_op,
+                                                int motion_x, int motion_y,
+                                                int h, int mb_y)
+{
+    uint8_t *ptr_y, *ptr_cb, *ptr_cr;
+    int mx, my, src_x, src_y, uvsrc_x, uvsrc_y, sx, sy, uvsx, uvsy;
+    ptrdiff_t uvlinesize, linesize;
+    const int lowres     = s->avctx->lowres;
+    const int op_index   = FFMIN(lowres-1+s->chroma_x_shift, 3); // pix_op index used for chroma, capped at 3
+    const int block_s    = 8>>lowres;                            // edge of an 8x8 block after lowres scaling
+    const int s_mask     = (2 << lowres) - 1;                    // low (lowres + 1) bits of a vector: sub-sample part
+    const int h_edge_pos = s->h_edge_pos >> lowres;
+    const int v_edge_pos = s->v_edge_pos >> lowres;
+    linesize   = s->current_picture.f->linesize[0] << field_based;
+    uvlinesize = s->current_picture.f->linesize[1] << field_based;
+
+    // FIXME obviously not perfect but qpel will not work in lowres anyway
+    if (s->quarter_sample) {
+        motion_x /= 2;
+        motion_y /= 2;
+    }
+
+    if(field_based){
+        motion_y += (bottom_field - field_select)*((1 << lowres)-1); // nonzero only when bottom_field != field_select
+    }
+
+    sx = motion_x & s_mask;
+    sy = motion_y & s_mask;
+    src_x = s->mb_x * 2 * block_s + (motion_x >> lowres + 1); // '+' binds tighter than '>>': shift count is (lowres + 1)
+    src_y = (mb_y * 2 * block_s >> field_based) + (motion_y >> lowres + 1); // likewise; '*' also binds tighter than '>>'
+
+    if (s->out_format == FMT_H263) {
+        uvsx    = ((motion_x >> 1) & s_mask) | (sx & 1);
+        uvsy    = ((motion_y >> 1) & s_mask) | (sy & 1);
+        uvsrc_x = src_x >> 1;
+        uvsrc_y = src_y >> 1;
+    } else if (s->out_format == FMT_H261) {
+        // even chroma mv's are full pel in H261
+        mx      = motion_x / 4;
+        my      = motion_y / 4;
+        uvsx    = (2 * mx) & s_mask;
+        uvsy    = (2 * my) & s_mask;
+        uvsrc_x = s->mb_x * block_s + (mx >> lowres);
+        uvsrc_y =    mb_y * block_s + (my >> lowres);
+    } else {
+        if(s->chroma_y_shift){
+            mx      = motion_x / 2;
+            my      = motion_y / 2;
+            uvsx    = mx & s_mask;
+            uvsy    = my & s_mask;
+            uvsrc_x = s->mb_x * block_s                 + (mx >> lowres + 1);
+            uvsrc_y =   (mb_y * block_s >> field_based) + (my >> lowres + 1);
+        } else {
+            if(s->chroma_x_shift){
+            //Chroma422
+                mx = motion_x / 2;
+                uvsx = mx & s_mask;
+                uvsy = motion_y & s_mask;
+                uvsrc_y = src_y;
+                uvsrc_x = s->mb_x*block_s               + (mx >> (lowres+1));
+            } else {
+            //Chroma444
+                uvsx = motion_x & s_mask;
+                uvsy = motion_y & s_mask;
+                uvsrc_x = src_x;
+                uvsrc_y = src_y;
+            }
+        }
+    }
+
+    ptr_y  = ref_picture[0] + src_y   * linesize   + src_x;
+    ptr_cb = ref_picture[1] + uvsrc_y * uvlinesize + uvsrc_x;
+    ptr_cr = ref_picture[2] + uvsrc_y * uvlinesize + uvsrc_x;
+
+    if ((unsigned) src_x > FFMAX( h_edge_pos - (!!sx) - 2 * block_s,       0) || uvsrc_y<0 ||
+        (unsigned) src_y > FFMAX((v_edge_pos >> field_based) - (!!sy) - h, 0)) { // source position beyond the edge limits
+        s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, ptr_y,
+                                 linesize >> field_based, linesize >> field_based,
+                                 17, 17 + field_based,
+                                src_x, src_y << field_based, h_edge_pos,
+                                v_edge_pos);
+        ptr_y = s->sc.edge_emu_buffer; // luma is now read from the scratch buffer
+        if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
+            uint8_t *ubuf = s->sc.edge_emu_buffer + 18 * s->linesize; // Cb scratch: 18 luma lines into the same buffer
+            uint8_t *vbuf =ubuf + 9 * s->uvlinesize;                  // Cr scratch: 9 chroma lines after the Cb scratch
+            s->vdsp.emulated_edge_mc(ubuf,  ptr_cb,
+                                     uvlinesize >> field_based, uvlinesize >> field_based,
+                                     9, 9 + field_based,
+                                    uvsrc_x, uvsrc_y << field_based,
+                                    h_edge_pos >> 1, v_edge_pos >> 1);
+            s->vdsp.emulated_edge_mc(vbuf,  ptr_cr,
+                                     uvlinesize >> field_based,uvlinesize >> field_based,
+                                     9, 9 + field_based,
+                                    uvsrc_x, uvsrc_y << field_based,
+                                    h_edge_pos >> 1, v_edge_pos >> 1);
+            ptr_cb = ubuf;
+            ptr_cr = vbuf;
+        }
+    }
+
+    // FIXME use this for field pix too instead of the obnoxious hack which changes picture.f->data
+    if (bottom_field) {
+        dest_y  += s->linesize;
+        dest_cb += s->uvlinesize;
+        dest_cr += s->uvlinesize;
+    }
+
+    if (field_select) {
+        ptr_y   += s->linesize;
+        ptr_cb  += s->uvlinesize;
+        ptr_cr  += s->uvlinesize;
+    }
+
+    sx = (sx << 2) >> lowres; // rescale the fraction from 1/(2 << lowres) steps to 1/8 steps
+    sy = (sy << 2) >> lowres;
+    pix_op[lowres - 1](dest_y, ptr_y, linesize, h, sx, sy); // index lowres - 1: only valid for lowres >= 1
+
+    if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
+        int hc = s->chroma_y_shift ? (h+1-bottom_field)>>1 : h; // chroma rows: roughly h / 2 when chroma_y_shift is set
+        uvsx = (uvsx << 2) >> lowres;
+        uvsy = (uvsy << 2) >> lowres;
+        if (hc) { // hc can be 0, in which case no chroma is written
+            pix_op[op_index](dest_cb, ptr_cb, uvlinesize, hc, uvsx, uvsy);
+            pix_op[op_index](dest_cr, ptr_cr, uvlinesize, hc, uvsx, uvsy);
+        }
+    }
+    // FIXME h261 lowres loop filter
+}
+
+static inline void chroma_4mv_motion_lowres(MpegEncContext *s, // lowres chroma prediction from one combined vector
+                                            uint8_t *dest_cb, uint8_t *dest_cr,
+                                            uint8_t **ref_picture,
+                                            h264_chroma_mc_func * pix_op,
+                                            int mx, int my) // MPV_motion_lowres passes the sum of the four luma vectors
+{
+    const int lowres     = s->avctx->lowres;
+    const int op_index   = FFMIN(lowres, 3);       // pix_op index, capped at 3
+    const int block_s    = 8 >> lowres;            // edge of an 8x8 block after lowres scaling
+    const int s_mask     = (2 << lowres) - 1;      // low (lowres + 1) bits of a vector: sub-sample part
+    const int h_edge_pos = s->h_edge_pos >> lowres + 1; // '+' binds tighter than '>>': shift count is (lowres + 1)
+    const int v_edge_pos = s->v_edge_pos >> lowres + 1;
+    int emu = 0, src_x, src_y, sx, sy;
+    ptrdiff_t offset;
+    uint8_t *ptr;
+
+    if (s->quarter_sample) {
+        mx /= 2;
+        my /= 2;
+    }
+
+    /* In case of 8X8, we construct a single chroma motion vector
+       with a special rounding */
+    mx = ff_h263_round_chroma(mx);
+    my = ff_h263_round_chroma(my);
+
+    sx = mx & s_mask;
+    sy = my & s_mask;
+    src_x = s->mb_x * block_s + (mx >> lowres + 1); // shift count is (lowres + 1)
+    src_y = s->mb_y * block_s + (my >> lowres + 1);
+
+    offset = src_y * s->uvlinesize + src_x; // the same offset is reused for the Cr plane below
+    ptr = ref_picture[1] + offset;
+    if ((unsigned) src_x > FFMAX(h_edge_pos - (!!sx) - block_s, 0) ||
+        (unsigned) src_y > FFMAX(v_edge_pos - (!!sy) - block_s, 0)) { // source position beyond the edge limits
+        s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, ptr,
+                                 s->uvlinesize, s->uvlinesize,
+                                 9, 9,
+                                 src_x, src_y, h_edge_pos, v_edge_pos);
+        ptr = s->sc.edge_emu_buffer;
+        emu = 1; // Cr must go through the scratch buffer as well
+    }
+    sx = (sx << 2) >> lowres; // rescale the fraction from 1/(2 << lowres) steps to 1/8 steps
+    sy = (sy << 2) >> lowres;
+    pix_op[op_index](dest_cb, ptr, s->uvlinesize, block_s, sx, sy);
+
+    ptr = ref_picture[2] + offset;
+    if (emu) { // scratch buffer is reused: Cb was already written out above
+        s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, ptr,
+                                 s->uvlinesize, s->uvlinesize,
+                                 9, 9,
+                                 src_x, src_y, h_edge_pos, v_edge_pos);
+        ptr = s->sc.edge_emu_buffer;
+    }
+    pix_op[op_index](dest_cr, ptr, s->uvlinesize, block_s, sx, sy);
+}
+
+/**
+ * motion compensation of a single macroblock in lowres mode
+ * @param s context
+ * @param dest_y luma destination pointer
+ * @param dest_cb chroma cb/u destination pointer
+ * @param dest_cr chroma cr/v destination pointer
+ * @param dir direction (0->forward, 1->backward)
+ * @param ref_picture array[3] of pointers to the 3 planes of the reference picture
+ * @param pix_op table of h264_chroma_mc_func entries (put or avg variant)
+ * the motion vectors are taken from s->mv and the MV type from s->mv_type
+ */
+static inline void MPV_motion_lowres(MpegEncContext *s,
+                                     uint8_t *dest_y, uint8_t *dest_cb,
+                                     uint8_t *dest_cr,
+                                     int dir, uint8_t **ref_picture,
+                                     h264_chroma_mc_func *pix_op)
+{
+    int mx, my;
+    int mb_x, mb_y, i;
+    const int lowres  = s->avctx->lowres;
+    const int block_s = 8 >>lowres; // edge of an 8x8 block after lowres scaling
+
+    mb_x = s->mb_x;
+    mb_y = s->mb_y;
+
+    switch (s->mv_type) {
+    case MV_TYPE_16X16:
+        mpeg_motion_lowres(s, dest_y, dest_cb, dest_cr,
+                           0, 0, 0,
+                           ref_picture, pix_op,
+                           s->mv[dir][0][0], s->mv[dir][0][1],
+                           2 * block_s, mb_y); // h: two blocks
+        break;
+    case MV_TYPE_8X8:
+        mx = 0; // mx/my accumulate the four luma vectors for the chroma call below
+        my = 0;
+        for (i = 0; i < 4; i++) {
+            hpel_motion_lowres(s, dest_y + ((i & 1) + (i >> 1) *
+                               s->linesize) * block_s,
+                               ref_picture[0], 0, 0,
+                               (2 * mb_x + (i & 1)) * block_s,
+                               (2 * mb_y + (i >> 1)) * block_s,
+                               s->width, s->height, s->linesize,
+                               s->h_edge_pos >> lowres, s->v_edge_pos >> lowres,
+                               block_s, block_s, pix_op,
+                               s->mv[dir][i][0], s->mv[dir][i][1]);
+
+            mx += s->mv[dir][i][0];
+            my += s->mv[dir][i][1];
+        }
+
+        if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY))
+            chroma_4mv_motion_lowres(s, dest_cb, dest_cr, ref_picture,
+                                     pix_op, mx, my);
+        break;
+    case MV_TYPE_FIELD:
+        if (s->picture_structure == PICT_FRAME) {
+            /* top field */
+            mpeg_motion_lowres(s, dest_y, dest_cb, dest_cr,
+                               1, 0, s->field_select[dir][0],
+                               ref_picture, pix_op,
+                               s->mv[dir][0][0], s->mv[dir][0][1],
+                               block_s, mb_y);
+            /* bottom field */
+            mpeg_motion_lowres(s, dest_y, dest_cb, dest_cr,
+                               1, 1, s->field_select[dir][1],
+                               ref_picture, pix_op,
+                               s->mv[dir][1][0], s->mv[dir][1][1],
+                               block_s, mb_y);
+        } else {
+            if (s->picture_structure != s->field_select[dir][0] + 1 &&
+                s->pict_type != AV_PICTURE_TYPE_B && !s->first_field) {
+                ref_picture = s->current_picture_ptr->f->data; // predict from the picture currently being decoded
+
+            }
+            mpeg_motion_lowres(s, dest_y, dest_cb, dest_cr,
+                               0, 0, s->field_select[dir][0],
+                               ref_picture, pix_op,
+                               s->mv[dir][0][0],
+                               s->mv[dir][0][1], 2 * block_s, mb_y >> 1);
+            }
+        break;
+    case MV_TYPE_16X8:
+        for (i = 0; i < 2; i++) {
+            uint8_t **ref2picture; // per-half reference; ref_picture itself stays untouched
+
+            if (s->picture_structure == s->field_select[dir][i] + 1 ||
+                s->pict_type == AV_PICTURE_TYPE_B || s->first_field) {
+                ref2picture = ref_picture;
+            } else {
+                ref2picture = s->current_picture_ptr->f->data;
+            }
+
+            mpeg_motion_lowres(s, dest_y, dest_cb, dest_cr,
+                               0, 0, s->field_select[dir][i],
+                               ref2picture, pix_op,
+                               s->mv[dir][i][0], s->mv[dir][i][1] +
+                               2 * block_s * i, block_s, mb_y >> 1);
+
+            dest_y  +=  2 * block_s *  s->linesize; // advance the destinations for the second pass
+            dest_cb += (2 * block_s >> s->chroma_y_shift) * s->uvlinesize;
+            dest_cr += (2 * block_s >> s->chroma_y_shift) * s->uvlinesize;
+        }
+        break;
+    case MV_TYPE_DMV:
+        if (s->picture_structure == PICT_FRAME) {
+            for (i = 0; i < 2; i++) {
+                int j;
+                for (j = 0; j < 2; j++) {
+                    mpeg_motion_lowres(s, dest_y, dest_cb, dest_cr,
+                                       1, j, j ^ i,
+                                       ref_picture, pix_op,
+                                       s->mv[dir][2 * i + j][0],
+                                       s->mv[dir][2 * i + j][1],
+                                       block_s, mb_y);
+                }
+                pix_op = s->h264chroma.avg_h264_chroma_pixels_tab; // the i == 1 pass uses the avg table
+            }
+        } else {
+            for (i = 0; i < 2; i++) {
+                mpeg_motion_lowres(s, dest_y, dest_cb, dest_cr,
+                                   0, 0, s->picture_structure != i + 1,
+                                   ref_picture, pix_op,
+                                   s->mv[dir][2 * i][0],s->mv[dir][2 * i][1],
+                                   2 * block_s, mb_y >> 1);
+
+                // after put we make avg of the same block
+                pix_op = s->h264chroma.avg_h264_chroma_pixels_tab;
+
+                // opposite parity is always in the same
+                // frame if this is second field
+                if (!s->first_field) {
+                    ref_picture = s->current_picture_ptr->f->data;
                 }
             }
-            av_log(s->avctx, AV_LOG_DEBUG, "\n");
         }
+        break;
+    default:
+        av_assert2(0); // no other mv_type is handled by this switch
     }
 }
 
@@ -1404,14 +2386,14 @@ static int lowest_referenced_row(MpegEncContext *s, int dir)
     }
 
     for (i = 0; i < mvs; i++) {
-        my = s->mv[dir][i][1]<<qpel_shift;
+        my = s->mv[dir][i][1];
         my_max = FFMAX(my_max, my);
         my_min = FFMIN(my_min, my);
     }
 
-    off = (FFMAX(-my_min, my_max) + 63) >> 6;
+    off = ((FFMAX(-my_min, my_max)<<qpel_shift) + 63) >> 6;
 
-    return FFMIN(FFMAX(s->mb_y + off, 0), s->mb_height-1);
+    return av_clip(s->mb_y + off, 0, s->mb_height - 1);
 unhandled:
     return s->mb_height-1;
 }
@@ -1488,18 +2470,15 @@ void ff_clean_intra_table_entries(MpegEncContext *s)
  */
 static av_always_inline
 void mpv_decode_mb_internal(MpegEncContext *s, int16_t block[12][64],
-                            int is_mpeg12)
+                            int lowres_flag, int is_mpeg12)
 {
     const int mb_xy = s->mb_y * s->mb_stride + s->mb_x;
 
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
-    if(CONFIG_MPEG_XVMC_DECODER && s->avctx->xvmc_acceleration){
-        ff_xvmc_decode_mb(s);//xvmc uses pblocks
+    if (CONFIG_XVMC &&
+        s->avctx->hwaccel && s->avctx->hwaccel->decode_mb) {
+        s->avctx->hwaccel->decode_mb(s);//xvmc uses pblocks
         return;
     }
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
 
     if(s->avctx->debug&FF_DEBUG_DCT_COEFF) {
        /* print DCT coefficients */
@@ -1530,7 +2509,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
     else if (!is_mpeg12 && (s->h263_pred || s->h263_aic))
         s->mbintra_table[mb_xy]=1;
 
-    if ((s->avctx->flags & AV_CODEC_FLAG_PSNR) ||
+    if ((s->avctx->flags & AV_CODEC_FLAG_PSNR) || s->frame_skip_threshold || s->frame_skip_factor ||
         !(s->encoding && (s->intra_only || s->pict_type == AV_PICTURE_TYPE_B) &&
           s->avctx->mb_decision != FF_MB_DECISION_RD)) { // FIXME precalc
         uint8_t *dest_y, *dest_cb, *dest_cr;
@@ -1539,8 +2518,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
         qpel_mc_func (*op_qpix)[16];
         const int linesize   = s->current_picture.f->linesize[0]; //not s->linesize as this would be wrong for field pics
         const int uvlinesize = s->current_picture.f->linesize[1];
-        const int readable= s->pict_type != AV_PICTURE_TYPE_B || s->encoding || s->avctx->draw_horiz_band;
-        const int block_size = 8;
+        const int readable= s->pict_type != AV_PICTURE_TYPE_B || s->encoding || s->avctx->draw_horiz_band || lowres_flag;
+        const int block_size= lowres_flag ? 8>>s->avctx->lowres : 8;
 
         /* avoid copy if macroblock skipped in last frame too */
         /* skip only during decoding as we might trash the buffers during encoding a bit */
@@ -1549,7 +2528,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
             if (s->mb_skipped) {
                 s->mb_skipped= 0;
-                assert(s->pict_type!=AV_PICTURE_TYPE_I);
+                av_assert2(s->pict_type!=AV_PICTURE_TYPE_I);
                 *mbskip_ptr = 1;
             } else if(!s->current_picture.reference) {
                 *mbskip_ptr = 1;
@@ -1589,19 +2568,31 @@ FF_ENABLE_DEPRECATION_WARNINGS
                     }
                 }
 
-                op_qpix= s->me.qpel_put;
-                if ((!s->no_rounding) || s->pict_type==AV_PICTURE_TYPE_B){
-                    op_pix = s->hdsp.put_pixels_tab;
+                if(lowres_flag){
+                    h264_chroma_mc_func *op_pix = s->h264chroma.put_h264_chroma_pixels_tab;
+
+                    if (s->mv_dir & MV_DIR_FORWARD) {
+                        MPV_motion_lowres(s, dest_y, dest_cb, dest_cr, 0, s->last_picture.f->data, op_pix);
+                        op_pix = s->h264chroma.avg_h264_chroma_pixels_tab;
+                    }
+                    if (s->mv_dir & MV_DIR_BACKWARD) {
+                        MPV_motion_lowres(s, dest_y, dest_cb, dest_cr, 1, s->next_picture.f->data, op_pix);
+                    }
                 }else{
-                    op_pix = s->hdsp.put_no_rnd_pixels_tab;
-                }
-                if (s->mv_dir & MV_DIR_FORWARD) {
-                    ff_mpv_motion(s, dest_y, dest_cb, dest_cr, 0, s->last_picture.f->data, op_pix, op_qpix);
-                    op_pix = s->hdsp.avg_pixels_tab;
-                    op_qpix= s->me.qpel_avg;
-                }
-                if (s->mv_dir & MV_DIR_BACKWARD) {
-                    ff_mpv_motion(s, dest_y, dest_cb, dest_cr, 1, s->next_picture.f->data, op_pix, op_qpix);
+                    op_qpix = s->me.qpel_put;
+                    if ((!s->no_rounding) || s->pict_type==AV_PICTURE_TYPE_B){
+                        op_pix = s->hdsp.put_pixels_tab;
+                    }else{
+                        op_pix = s->hdsp.put_no_rnd_pixels_tab;
+                    }
+                    if (s->mv_dir & MV_DIR_FORWARD) {
+                        ff_mpv_motion(s, dest_y, dest_cb, dest_cr, 0, s->last_picture.f->data, op_pix, op_qpix);
+                        op_pix = s->hdsp.avg_pixels_tab;
+                        op_qpix= s->me.qpel_avg;
+                    }
+                    if (s->mv_dir & MV_DIR_BACKWARD) {
+                        ff_mpv_motion(s, dest_y, dest_cb, dest_cr, 1, s->next_picture.f->data, op_pix, op_qpix);
+                    }
                 }
             }
 
@@ -1647,17 +2638,17 @@ FF_ENABLE_DEPRECATION_WARNINGS
                     }else{
                         //chroma422
                         dct_linesize = uvlinesize << s->interlaced_dct;
-                        dct_offset   = s->interlaced_dct ? uvlinesize : uvlinesize * 8;
+                        dct_offset   = s->interlaced_dct ? uvlinesize : uvlinesize*block_size;
 
                         add_dct(s, block[4], 4, dest_cb, dct_linesize);
                         add_dct(s, block[5], 5, dest_cr, dct_linesize);
                         add_dct(s, block[6], 6, dest_cb+dct_offset, dct_linesize);
                         add_dct(s, block[7], 7, dest_cr+dct_offset, dct_linesize);
                         if(!s->chroma_x_shift){//Chroma444
-                            add_dct(s, block[8], 8, dest_cb+8, dct_linesize);
-                            add_dct(s, block[9], 9, dest_cr+8, dct_linesize);
-                            add_dct(s, block[10], 10, dest_cb+8+dct_offset, dct_linesize);
-                            add_dct(s, block[11], 11, dest_cr+8+dct_offset, dct_linesize);
+                            add_dct(s, block[8], 8, dest_cb+block_size, dct_linesize);
+                            add_dct(s, block[9], 9, dest_cr+block_size, dct_linesize);
+                            add_dct(s, block[10], 10, dest_cb+block_size+dct_offset, dct_linesize);
+                            add_dct(s, block[11], 11, dest_cr+block_size+dct_offset, dct_linesize);
                         }
                     }
                 }//fi gray
@@ -1699,17 +2690,17 @@ FF_ENABLE_DEPRECATION_WARNINGS
                     }else{
 
                         dct_linesize = uvlinesize << s->interlaced_dct;
-                        dct_offset   = s->interlaced_dct ? uvlinesize : uvlinesize * 8;
+                        dct_offset   = s->interlaced_dct ? uvlinesize : uvlinesize*block_size;
 
                         s->idsp.idct_put(dest_cb,              dct_linesize, block[4]);
                         s->idsp.idct_put(dest_cr,              dct_linesize, block[5]);
                         s->idsp.idct_put(dest_cb + dct_offset, dct_linesize, block[6]);
                         s->idsp.idct_put(dest_cr + dct_offset, dct_linesize, block[7]);
                         if(!s->chroma_x_shift){//Chroma444
-                            s->idsp.idct_put(dest_cb + 8,              dct_linesize, block[8]);
-                            s->idsp.idct_put(dest_cr + 8,              dct_linesize, block[9]);
-                            s->idsp.idct_put(dest_cb + 8 + dct_offset, dct_linesize, block[10]);
-                            s->idsp.idct_put(dest_cr + 8 + dct_offset, dct_linesize, block[11]);
+                            s->idsp.idct_put(dest_cb + block_size,              dct_linesize, block[8]);
+                            s->idsp.idct_put(dest_cr + block_size,              dct_linesize, block[9]);
+                            s->idsp.idct_put(dest_cb + block_size + dct_offset, dct_linesize, block[10]);
+                            s->idsp.idct_put(dest_cr + block_size + dct_offset, dct_linesize, block[11]);
                         }
                     }
                 }//gray
@@ -1718,8 +2709,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
 skip_idct:
         if(!readable){
             s->hdsp.put_pixels_tab[0][0](s->dest[0], dest_y ,   linesize,16);
-            s->hdsp.put_pixels_tab[s->chroma_x_shift][0](s->dest[1], dest_cb, uvlinesize,16 >> s->chroma_y_shift);
-            s->hdsp.put_pixels_tab[s->chroma_x_shift][0](s->dest[2], dest_cr, uvlinesize,16 >> s->chroma_y_shift);
+            if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
+                s->hdsp.put_pixels_tab[s->chroma_x_shift][0](s->dest[1], dest_cb, uvlinesize,16 >> s->chroma_y_shift);
+                s->hdsp.put_pixels_tab[s->chroma_x_shift][0](s->dest[2], dest_cr, uvlinesize,16 >> s->chroma_y_shift);
+            }
         }
     }
 }
@@ -1728,23 +2721,25 @@ void ff_mpv_decode_mb(MpegEncContext *s, int16_t block[12][64])
 {
 #if !CONFIG_SMALL
     if(s->out_format == FMT_MPEG1) {
-        mpv_decode_mb_internal(s, block, 1);
+        if(s->avctx->lowres) mpv_decode_mb_internal(s, block, 1, 1);
+        else                 mpv_decode_mb_internal(s, block, 0, 1);
     } else
 #endif
-        mpv_decode_mb_internal(s, block, 0);
+    if(s->avctx->lowres) mpv_decode_mb_internal(s, block, 1, 0);
+    else                  mpv_decode_mb_internal(s, block, 0, 0);
 }
 
 void ff_mpeg_draw_horiz_band(MpegEncContext *s, int y, int h)
 {
-    ff_draw_horiz_band(s->avctx, s->current_picture.f,
-                       s->last_picture.f, y, h, s->picture_structure,
+    ff_draw_horiz_band(s->avctx, s->current_picture_ptr->f, // current_picture_ptr is dereferenced unchecked; a NULL last_picture_ptr is passed on as NULL
+                       s->last_picture_ptr ? s->last_picture_ptr->f : NULL, y, h, s->picture_structure,
                        s->first_field, s->low_delay);
 }
 
 void ff_init_block_index(MpegEncContext *s){ //FIXME maybe rename
     const int linesize   = s->current_picture.f->linesize[0]; //not s->linesize as this would be wrong for field pics
     const int uvlinesize = s->current_picture.f->linesize[1];
-    const int mb_size= 4;
+    const int mb_size= 4 - s->avctx->lowres;
 
     s->block_index[0]= s->b8_stride*(s->mb_y*2    ) - 2 + s->mb_x*2;
     s->block_index[1]= s->b8_stride*(s->mb_y*2    ) - 1 + s->mb_x*2;
@@ -1754,9 +2749,9 @@ void ff_init_block_index(MpegEncContext *s){ //FIXME maybe rename
     s->block_index[5]= s->mb_stride*(s->mb_y + s->mb_height + 2) + s->b8_stride*s->mb_height*2 + s->mb_x - 1;
     //block_index is not used by mpeg2, so it is not affected by chroma_format
 
-    s->dest[0] = s->current_picture.f->data[0] + (s->mb_x - 1) * (1 << mb_size);
-    s->dest[1] = s->current_picture.f->data[1] + (s->mb_x - 1) * (1 << (mb_size - s->chroma_x_shift));
-    s->dest[2] = s->current_picture.f->data[2] + (s->mb_x - 1) * (1 << (mb_size - s->chroma_x_shift));
+    s->dest[0] = s->current_picture.f->data[0] + (int)((s->mb_x - 1U) <<  mb_size);
+    s->dest[1] = s->current_picture.f->data[1] + (int)((s->mb_x - 1U) << (mb_size - s->chroma_x_shift));
+    s->dest[2] = s->current_picture.f->data[2] + (int)((s->mb_x - 1U) << (mb_size - s->chroma_x_shift));
 
     if(!(s->pict_type==AV_PICTURE_TYPE_B && s->avctx->draw_horiz_band && s->picture_structure==PICT_FRAME))
     {
@@ -1768,7 +2763,7 @@ void ff_init_block_index(MpegEncContext *s){ //FIXME maybe rename
             s->dest[0] += (s->mb_y>>1) *   linesize << mb_size;
             s->dest[1] += (s->mb_y>>1) * uvlinesize << (mb_size - s->chroma_y_shift);
             s->dest[2] += (s->mb_y>>1) * uvlinesize << (mb_size - s->chroma_y_shift);
-            assert((s->mb_y&1) == (s->picture_structure == PICT_BOTTOM_FIELD));
+            av_assert1((s->mb_y&1) == (s->picture_structure == PICT_BOTTOM_FIELD));
         }
     }
 }
@@ -1789,6 +2784,7 @@ void ff_mpeg_flush(AVCodecContext *avctx){
     ff_mpeg_unref_picture(s->avctx, &s->next_picture);
 
     s->mb_x= s->mb_y= 0;
+    s->closed_gop= 0;
 
     s->parse_context.state= -1;
     s->parse_context.frame_start_found= 0;
diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h
index 932a6f2..a1f3d4b 100644
--- a/libavcodec/mpegvideo.h
+++ b/libavcodec/mpegvideo.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,6 +35,7 @@
 #include "error_resilience.h"
 #include "fdctdsp.h"
 #include "get_bits.h"
+#include "h264chroma.h"
 #include "h263dsp.h"
 #include "hpeldsp.h"
 #include "idctdsp.h"
@@ -55,8 +56,9 @@
 #include "videodsp.h"
 
 #include "libavutil/opt.h"
+#include "libavutil/timecode.h"
 
-#define MAX_THREADS 16
+#define MAX_THREADS 32
 
 #define MAX_B_FRAMES 16
 
@@ -95,7 +97,7 @@ typedef struct MpegEncContext {
     int width, height;///< picture size. must be a multiple of 16
     int gop_size;
     int intra_only;   ///< if true, only intra pictures are generated
-    int bit_rate;     ///< wanted bit rate
+    int64_t bit_rate; ///< wanted bit rate
     enum OutputFormat out_format; ///< output format
     int h263_pred;    ///< use MPEG-4/H.263 ac/dc predictions
     int pb_frame;     ///< PB-frame mode (0 = none, 1 = base, 2 = improved)
@@ -186,7 +188,7 @@ typedef struct MpegEncContext {
     uint8_t *coded_block_base;
     uint8_t *coded_block;          ///< used for coded block pattern prediction (msmpeg4v3, wmv1)
     int16_t (*ac_val_base)[16];
-    int16_t (*ac_val[3])[16];      ///< used for for MPEG-4 AC prediction, all 3 arrays must be continuous
+    int16_t (*ac_val[3])[16];      ///< used for MPEG-4 AC prediction, all 3 arrays must be continuous
     int mb_skipped;                ///< MUST BE SET only during DECODING
     uint8_t *mbskip_table;        /**< used to avoid copy if macroblock skipped (for black regions for example)
                                    and used for B-frame encoding & decoding (contains skip table of next P-frame) */
@@ -203,11 +205,14 @@ typedef struct MpegEncContext {
     int *lambda_table;
     int adaptive_quant;         ///< use adaptive quantization
     int dquant;                 ///< qscale difference to prev qscale
+    int closed_gop;             ///< MPEG1/2 GOP is closed
     int pict_type;              ///< AV_PICTURE_TYPE_I, AV_PICTURE_TYPE_P, AV_PICTURE_TYPE_B, ...
+    int vbv_delay;
     int last_pict_type; //FIXME removes
     int last_non_b_pict_type;   ///< used for MPEG-4 gmc B-frames & ratecontrol
     int droppable;
     int frame_rate_index;
+    AVRational mpeg2_frame_rate_ext;
     int last_lambda_for[5];     ///< last lambda for a specific pict type
     int skipdct;                ///< skip dct and code zero residual
 
@@ -217,6 +222,7 @@ typedef struct MpegEncContext {
 
     BlockDSPContext bdsp;
     FDCTDSPContext fdsp;
+    H264ChromaContext h264chroma;
     HpelDSPContext hdsp;
     IDCTDSPContext idsp;
     MECmpContext mecc;
@@ -295,6 +301,7 @@ typedef struct MpegEncContext {
     uint16_t chroma_intra_matrix[64];
     uint16_t inter_matrix[64];
     uint16_t chroma_inter_matrix[64];
+    int force_duplicated_matrix; ///< Force duplication of mjpeg matrices, useful for rtp streaming
 
     int intra_quant_bias;    ///< bias for the quantizer
     int inter_quant_bias;    ///< bias for the quantizer
@@ -303,18 +310,22 @@ typedef struct MpegEncContext {
     int ac_esc_length;       ///< num of bits needed to encode the longest esc
     uint8_t *intra_ac_vlc_length;
     uint8_t *intra_ac_vlc_last_length;
+    uint8_t *intra_chroma_ac_vlc_length;
+    uint8_t *intra_chroma_ac_vlc_last_length;
     uint8_t *inter_ac_vlc_length;
     uint8_t *inter_ac_vlc_last_length;
     uint8_t *luma_dc_vlc_length;
 #define UNI_AC_ENC_INDEX(run,level) ((run)*128 + (level))
 
-    int coded_score[8];
+    int coded_score[12];
 
     /** precomputed matrix (combine qscale and DCT renorm) */
     int (*q_intra_matrix)[64];
+    int (*q_chroma_intra_matrix)[64];
     int (*q_inter_matrix)[64];
     /** identical to the above but for MMX & these are not permutated, second 64 entries are bias*/
     uint16_t (*q_intra_matrix16)[2][64];
+    uint16_t (*q_chroma_intra_matrix16)[2][64];
     uint16_t (*q_inter_matrix16)[2][64];
 
     /* noise reduction */
@@ -325,6 +336,7 @@ typedef struct MpegEncContext {
     /* bit rate control */
     int64_t total_bits;
     int frame_bits;                ///< bits used for the current frame
+    int stuffing_bits;             ///< bits used for stuffing
     int next_lambda;               ///< next lambda used for retrying to encode a frame
     RateControlContext rc_context; ///< contains stuff only accessed in ratecontrol.c
 
@@ -356,6 +368,7 @@ typedef struct MpegEncContext {
     int prev_mb_info, last_mb_info;
     uint8_t *mb_info_ptr;
     int mb_info_size;
+    int ehc_mode;
     int rc_strategy;
 
     /* H.263+ specific */
@@ -407,6 +420,7 @@ typedef struct MpegEncContext {
 
     /* MJPEG specific */
     struct MJpegContext *mjpeg_ctx;
+    int esc_pos;
     int pred;
 
     /* MSMPEG4 specific */
@@ -451,11 +465,13 @@ typedef struct MpegEncContext {
     int brd_scale;
     int intra_vlc_format;
     int alternate_scan;
+    int seq_disp_ext;
     int repeat_first_field;
     int chroma_420_type;
     int chroma_format;
 #define CHROMA_420 1
 #define CHROMA_422 2
+#define CHROMA_444 3
     int chroma_x_shift;//depend on pix_format, that depend on chroma_format
     int chroma_y_shift;
 
@@ -470,7 +486,12 @@ typedef struct MpegEncContext {
     int rtp_mode;
     int rtp_payload_size;
 
+    char *tc_opt_str;        ///< timecode option string
+    AVTimecode tc;           ///< timecode context
+
     uint8_t *ptr_lastgob;
+    int swap_uv;             //vcr2 codec is an MPEG-2 variant with U and V swapped
+    int pack_pblocks;        //xvmc needs to keep blocks without gaps.
     int16_t (*pblocks[12])[64];
 
     int16_t (*block)[64]; ///< points to one of the following blocks
@@ -515,6 +536,7 @@ typedef struct MpegEncContext {
     float rc_buffer_aggressivity;
     float border_masking;
     int lmin, lmax;
+    int vbv_ignore_qmax;
 
     char *rc_eq;
 
@@ -552,6 +574,12 @@ typedef struct MpegEncContext {
 #define FF_MPV_FLAG_NAQ          0x0010
 #define FF_MPV_FLAG_MV0          0x0020
 
+enum rc_strategy {
+    MPV_RC_STRATEGY_FFMPEG,   ///< "ffmpeg" choice of the rc_strategy option, also its default
+    MPV_RC_STRATEGY_XVID,     ///< "xvid" choice of the rc_strategy option
+    NB_MPV_RC_STRATEGY        ///< number of strategies; the option maximum is this minus one
+};
+
 #define FF_MPV_OPT_CMP_FUNC \
 { "sad",    "Sum of absolute differences, fast", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_SAD }, INT_MIN, INT_MAX, FF_MPV_OPT_FLAGS, "cmp_func" }, \
 { "sse",    "Sum of squared errors", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_SSE }, INT_MIN, INT_MAX, FF_MPV_OPT_FLAGS, "cmp_func" }, \
@@ -568,7 +596,9 @@ typedef struct MpegEncContext {
 { "dctmax", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_DCTMAX }, INT_MIN, INT_MAX, FF_MPV_OPT_FLAGS, "cmp_func" }, \
 { "chroma", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_CHROMA }, INT_MIN, INT_MAX, FF_MPV_OPT_FLAGS, "cmp_func" }
 
+#ifndef FF_MPV_OFFSET /* a definition made before this point is kept as is */
 #define FF_MPV_OFFSET(x) offsetof(MpegEncContext, x)
+#endif
 #define FF_MPV_OPT_FLAGS (AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM)
 #define FF_MPV_COMMON_OPTS \
 FF_MPV_OPT_CMP_FUNC, \
@@ -602,11 +632,14 @@ FF_MPV_OPT_CMP_FUNC, \
 {"lmax", "maximum Lagrange factor (VBR)",                           FF_MPV_OFFSET(lmax), AV_OPT_TYPE_INT, {.i64 = 31*FF_QP2LAMBDA }, 0, INT_MAX, FF_MPV_OPT_FLAGS },            \
 {"ibias", "intra quant bias",                                       FF_MPV_OFFSET(intra_quant_bias), AV_OPT_TYPE_INT, {.i64 = FF_DEFAULT_QUANT_BIAS }, INT_MIN, INT_MAX, FF_MPV_OPT_FLAGS },   \
 {"pbias", "inter quant bias",                                       FF_MPV_OFFSET(inter_quant_bias), AV_OPT_TYPE_INT, {.i64 = FF_DEFAULT_QUANT_BIAS }, INT_MIN, INT_MAX, FF_MPV_OPT_FLAGS },   \
-{"rc_strategy", "ratecontrol method",                               FF_MPV_OFFSET(rc_strategy), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, 1, FF_MPV_OPT_FLAGS },   \
+{"rc_strategy", "ratecontrol method",                               FF_MPV_OFFSET(rc_strategy), AV_OPT_TYPE_INT, {.i64 = MPV_RC_STRATEGY_FFMPEG }, 0, NB_MPV_RC_STRATEGY-1, FF_MPV_OPT_FLAGS, "rc_strategy" },   \
+    { "ffmpeg", "default native rate control", 0, AV_OPT_TYPE_CONST, { .i64 = MPV_RC_STRATEGY_FFMPEG }, 0, 0, FF_MPV_OPT_FLAGS, "rc_strategy" }, \
+    { "xvid",   "libxvid (2 pass only)",       0, AV_OPT_TYPE_CONST, { .i64 = MPV_RC_STRATEGY_XVID },   0, 0, FF_MPV_OPT_FLAGS, "rc_strategy" }, \
 {"motion_est", "motion estimation algorithm",                       FF_MPV_OFFSET(motion_est), AV_OPT_TYPE_INT, {.i64 = FF_ME_EPZS }, FF_ME_ZERO, FF_ME_XONE, FF_MPV_OPT_FLAGS, "motion_est" },   \
 { "zero", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = FF_ME_ZERO }, 0, 0, FF_MPV_OPT_FLAGS, "motion_est" }, \
 { "epzs", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = FF_ME_EPZS }, 0, 0, FF_MPV_OPT_FLAGS, "motion_est" }, \
 { "xone", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = FF_ME_XONE }, 0, 0, FF_MPV_OPT_FLAGS, "motion_est" }, \
+{ "force_duplicated_matrix", "Always write luma and chroma matrix for mjpeg, useful for rtp streaming.", FF_MPV_OFFSET(force_duplicated_matrix), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, FF_MPV_OPT_FLAGS },   \
 {"b_strategy", "Strategy to choose between I/P/B-frames",           FF_MPV_OFFSET(b_frame_strategy), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, 2, FF_MPV_OPT_FLAGS }, \
 {"b_sensitivity", "Adjust sensitivity of b_frame_strategy 1",       FF_MPV_OFFSET(b_sensitivity), AV_OPT_TYPE_INT, {.i64 = 40 }, 1, INT_MAX, FF_MPV_OPT_FLAGS }, \
 {"brd_scale", "Downscale frames for dynamic B-frame decision",      FF_MPV_OFFSET(brd_scale), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, 3, FF_MPV_OPT_FLAGS }, \
@@ -630,16 +663,21 @@ extern const AVOption ff_mpv_generic_options[];
  */
 void ff_mpv_common_defaults(MpegEncContext *s);
 
+void ff_dct_encode_init_x86(MpegEncContext *s);
+
 int ff_mpv_common_init(MpegEncContext *s);
 void ff_mpv_common_init_arm(MpegEncContext *s);
+void ff_mpv_common_init_axp(MpegEncContext *s);
 void ff_mpv_common_init_neon(MpegEncContext *s);
 void ff_mpv_common_init_ppc(MpegEncContext *s);
 void ff_mpv_common_init_x86(MpegEncContext *s);
+void ff_mpv_common_init_mips(MpegEncContext *s);
 
 int ff_mpv_common_frame_size_change(MpegEncContext *s);
 void ff_mpv_common_end(MpegEncContext *s);
 
 void ff_mpv_decode_defaults(MpegEncContext *s);
+void ff_mpv_decode_init(MpegEncContext *s, AVCodecContext *avctx);
 void ff_mpv_decode_mb(MpegEncContext *s, int16_t block[12][64]);
 void ff_mpv_report_decode_progress(MpegEncContext *s);
 
@@ -652,11 +690,20 @@ void ff_mpv_encode_init_x86(MpegEncContext *s);
 int ff_mpv_encode_end(AVCodecContext *avctx);
 int ff_mpv_encode_picture(AVCodecContext *avctx, AVPacket *pkt,
                           const AVFrame *frame, int *got_packet);
+int ff_mpv_reallocate_putbitbuffer(MpegEncContext *s, size_t threshold, size_t size_increase);
 
 void ff_clean_intra_table_entries(MpegEncContext *s);
 void ff_mpeg_draw_horiz_band(MpegEncContext *s, int y, int h);
 void ff_mpeg_flush(AVCodecContext *avctx);
-void ff_print_debug_info(MpegEncContext *s, Picture *p);
+
+void ff_print_debug_info(MpegEncContext *s, Picture *p, AVFrame *pict);
+void ff_print_debug_info2(AVCodecContext *avctx, AVFrame *pict, uint8_t *mbskip_table,
+                         uint32_t *mbtype_table, int8_t *qscale_table, int16_t (*motion_val[2])[2],
+                         int *low_delay,
+                         int mb_width, int mb_height, int mb_stride, int quarter_sample);
+
+int ff_mpv_export_qp_table(MpegEncContext *s, AVFrame *f, Picture *p, int qp_type);
+
 void ff_write_quant_matrix(PutBitContext *pb, uint16_t *matrix);
 
 int ff_update_duplicate_context(MpegEncContext *dst, MpegEncContext *src);
@@ -664,10 +711,12 @@ int ff_mpeg_update_thread_context(AVCodecContext *dst, const AVCodecContext *src
 void ff_set_qscale(MpegEncContext * s, int qscale);
 
 void ff_mpv_idct_init(MpegEncContext *s);
+int ff_dct_encode_init(MpegEncContext *s);
 void ff_convert_matrix(MpegEncContext *s, int (*qmat)[64], uint16_t (*qmat16)[2][64],
                        const uint16_t *quant_matrix, int bias, int qmin, int qmax, int intra);
 int ff_dct_quantize_c(MpegEncContext *s, int16_t *block, int n, int qscale, int *overflow);
-
+void ff_block_permute(int16_t *block, uint8_t *permutation,
+                      const uint8_t *scantable, int last);
 void ff_init_block_index(MpegEncContext *s);
 
 void ff_mpv_motion(MpegEncContext *s,
@@ -678,7 +727,7 @@ void ff_mpv_motion(MpegEncContext *s,
                    qpel_mc_func (*qpix_op)[16]);
 
 static inline void ff_update_block_index(MpegEncContext *s){
-    const int block_size = 8;
+    const int block_size= 8 >> s->avctx->lowres;
 
     s->block_index[0]+=2;
     s->block_index[1]+=2;
diff --git a/libavcodec/mpegvideo_enc.c b/libavcodec/mpegvideo_enc.c
index 17cdb87..2e12a3d 100644
--- a/libavcodec/mpegvideo_enc.c
+++ b/libavcodec/mpegvideo_enc.c
@@ -5,23 +5,27 @@
  *
  * 4MV & hq & B-frame encoding stuff by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+/*
+ * Non-linear quantizers with large QPs, and fixes for VBV with a restrictive qmin, sponsored by NOA GmbH
+ */
+
 /**
  * @file
  * The simplest mpeg encoder (well, it was the simplest!).
@@ -61,11 +65,12 @@
 #include "wmv2.h"
 #include "rv10.h"
 #include <limits.h>
+#include "sp5x.h"
 
 #define QUANT_BIAS_SHIFT 8
 
 #define QMAT_SHIFT_MMX 16
-#define QMAT_SHIFT 22
+#define QMAT_SHIFT 21
 
 static int encode_picture(MpegEncContext *s, int picture_number);
 static int dct_quantize_refine(MpegEncContext *s, int16_t *block, int16_t *weight, int16_t *orig, int n, int qscale);
@@ -73,7 +78,7 @@ static int sse_mb(MpegEncContext *s);
 static void denoise_dct_c(MpegEncContext *s, int16_t *block);
 static int dct_quantize_trellis_c(MpegEncContext *s, int16_t *block, int n, int qscale, int *overflow);
 
-static uint8_t default_mv_penalty[MAX_FCODE + 1][MAX_MV * 2 + 1];
+static uint8_t default_mv_penalty[MAX_FCODE + 1][MAX_DMV * 2 + 1];
 static uint8_t default_fcode_tab[MAX_MV * 2 + 1];
 
 const AVOption ff_mpv_generic_options[] = {
@@ -92,6 +97,11 @@ void ff_convert_matrix(MpegEncContext *s, int (*qmat)[64],
 
     for (qscale = qmin; qscale <= qmax; qscale++) {
         int i;
+        int qscale2;
+
+        if (s->q_scale_type) qscale2 = ff_mpeg2_non_linear_qscale[qscale];
+        else                 qscale2 = qscale << 1;
+
         if (fdsp->fdct == ff_jpeg_fdct_islow_8  ||
 #if CONFIG_FAANDCT
             fdsp->fdct == ff_faandct            ||
@@ -99,46 +109,46 @@ void ff_convert_matrix(MpegEncContext *s, int (*qmat)[64],
             fdsp->fdct == ff_jpeg_fdct_islow_10) {
             for (i = 0; i < 64; i++) {
                 const int j = s->idsp.idct_permutation[i];
-                int64_t den = (int64_t) qscale * quant_matrix[j];
+                int64_t den = (int64_t) qscale2 * quant_matrix[j];
                 /* 16 <= qscale * quant_matrix[i] <= 7905
                  * Assume x = ff_aanscales[i] * qscale * quant_matrix[i]
                  *             19952 <=              x  <= 249205026
                  * (1 << 36) / 19952 >= (1 << 36) / (x) >= (1 << 36) / 249205026
                  *           3444240 >= (1 << 36) / (x) >= 275 */
 
-                qmat[qscale][i] = (int)((UINT64_C(1) << QMAT_SHIFT) / den);
+                qmat[qscale][i] = (int)((UINT64_C(2) << QMAT_SHIFT) / den);
             }
         } else if (fdsp->fdct == ff_fdct_ifast) {
             for (i = 0; i < 64; i++) {
                 const int j = s->idsp.idct_permutation[i];
-                int64_t den = ff_aanscales[i] * (int64_t) qscale * quant_matrix[j];
+                int64_t den = ff_aanscales[i] * (int64_t) qscale2 * quant_matrix[j];
                 /* 16 <= qscale * quant_matrix[i] <= 7905
                  * Assume x = ff_aanscales[i] * qscale * quant_matrix[i]
                  *             19952 <=              x  <= 249205026
                  * (1 << 36) / 19952 >= (1 << 36) / (x) >= (1 << 36) / 249205026
                  *           3444240 >= (1 << 36) / (x) >= 275 */
 
-                qmat[qscale][i] = (int)((UINT64_C(1) << (QMAT_SHIFT + 14)) / den);
+                qmat[qscale][i] = (int)((UINT64_C(2) << (QMAT_SHIFT + 14)) / den);
             }
         } else {
             for (i = 0; i < 64; i++) {
                 const int j = s->idsp.idct_permutation[i];
-                int64_t den = (int64_t) qscale * quant_matrix[j];
+                int64_t den = (int64_t) qscale2 * quant_matrix[j];
                 /* We can safely suppose that 16 <= quant_matrix[i] <= 255
                  * Assume x = qscale * quant_matrix[i]
                  * So             16 <=              x  <= 7905
                  * so (1 << 19) / 16 >= (1 << 19) / (x) >= (1 << 19) / 7905
                  * so          32768 >= (1 << 19) / (x) >= 67 */
-                qmat[qscale][i] = (int)((UINT64_C(1) << QMAT_SHIFT) / den);
+                qmat[qscale][i] = (int)((UINT64_C(2) << QMAT_SHIFT) / den);
                 //qmat  [qscale][i] = (1 << QMAT_SHIFT_MMX) /
                 //                    (qscale * quant_matrix[i]);
-                qmat16[qscale][0][i] = (1 << QMAT_SHIFT_MMX) / den;
+                qmat16[qscale][0][i] = (2 << QMAT_SHIFT_MMX) / den;
 
                 if (qmat16[qscale][0][i] == 0 ||
                     qmat16[qscale][0][i] == 128 * 256)
                     qmat16[qscale][0][i] = 128 * 256 - 1;
                 qmat16[qscale][1][i] =
-                    ROUNDED_DIV(bias << (16 - QUANT_BIAS_SHIFT),
+                    ROUNDED_DIV(bias * (1<<(16 - QUANT_BIAS_SHIFT)),
                                 qmat16[qscale][0][i]);
             }
         }
@@ -162,9 +172,27 @@ void ff_convert_matrix(MpegEncContext *s, int (*qmat)[64],
 
 static inline void update_qscale(MpegEncContext *s)
 {
-    s->qscale = (s->lambda * 139 + FF_LAMBDA_SCALE * 64) >>
-                (FF_LAMBDA_SHIFT + 7);
-    s->qscale = av_clip(s->qscale, s->avctx->qmin, s->avctx->qmax);
+    if (s->q_scale_type == 1 && 0) {
+        int i;
+        int bestdiff=INT_MAX;
+        int best = 1;
+
+        for (i = 0 ; i<FF_ARRAY_ELEMS(ff_mpeg2_non_linear_qscale); i++) {
+            int diff = FFABS((ff_mpeg2_non_linear_qscale[i]<<(FF_LAMBDA_SHIFT + 6)) - (int)s->lambda * 139);
+            if (ff_mpeg2_non_linear_qscale[i] < s->avctx->qmin ||
+                (ff_mpeg2_non_linear_qscale[i] > s->avctx->qmax && !s->vbv_ignore_qmax))
+                continue;
+            if (diff < bestdiff) {
+                bestdiff = diff;
+                best = i;
+            }
+        }
+        s->qscale = best;
+    } else {
+        s->qscale = (s->lambda * 139 + FF_LAMBDA_SCALE * 64) >>
+                    (FF_LAMBDA_SHIFT + 7);
+        s->qscale = av_clip(s->qscale, s->avctx->qmin, s->vbv_ignore_qmax ? 31 : s->avctx->qmax);
+    }
 
     s->lambda2 = (s->lambda * s->lambda + FF_LAMBDA_SCALE / 2) >>
                  FF_LAMBDA_SHIFT;
@@ -237,6 +265,23 @@ static void mpv_encode_defaults(MpegEncContext *s)
     s->picture_in_gop_number = 0;
 }
 
+av_cold int ff_dct_encode_init(MpegEncContext *s) { /* select the quantize/denoise hooks of s; always returns 0 */
+    if (ARCH_X86)
+        ff_dct_encode_init_x86(s);
+    /* NOTE(review): presumably the x86 init may pre-set the hooks tested below - confirm */
+    if (CONFIG_H263_ENCODER)
+        ff_h263dsp_init(&s->h263dsp);
+    if (!s->dct_quantize)
+        s->dct_quantize = ff_dct_quantize_c;   /* C quantizer only when none is set yet */
+    if (!s->denoise_dct)
+        s->denoise_dct  = denoise_dct_c;       /* C denoiser only when none is set yet */
+    s->fast_dct_quantize = s->dct_quantize;    /* captured before the trellis override below */
+    if (s->avctx->trellis)
+        s->dct_quantize  = dct_quantize_trellis_c;
+    /* no failure path exists in this function */
+    return 0;
+}
+
 /* init video encoder */
 av_cold int ff_mpv_encode_init(AVCodecContext *avctx)
 {
@@ -256,18 +301,22 @@ av_cold int ff_mpv_encode_init(AVCodecContext *avctx)
         }
         break;
     case AV_CODEC_ID_MJPEG:
+    case AV_CODEC_ID_AMV:
         format_supported = 0;
         /* JPEG color space */
         if (avctx->pix_fmt == AV_PIX_FMT_YUVJ420P ||
             avctx->pix_fmt == AV_PIX_FMT_YUVJ422P ||
+            avctx->pix_fmt == AV_PIX_FMT_YUVJ444P ||
             (avctx->color_range == AVCOL_RANGE_JPEG &&
              (avctx->pix_fmt == AV_PIX_FMT_YUV420P ||
-              avctx->pix_fmt == AV_PIX_FMT_YUV422P)))
+              avctx->pix_fmt == AV_PIX_FMT_YUV422P ||
+              avctx->pix_fmt == AV_PIX_FMT_YUV444P)))
             format_supported = 1;
         /* MPEG color space */
         else if (avctx->strict_std_compliance <= FF_COMPLIANCE_UNOFFICIAL &&
                  (avctx->pix_fmt == AV_PIX_FMT_YUV420P ||
-                  avctx->pix_fmt == AV_PIX_FMT_YUV422P))
+                  avctx->pix_fmt == AV_PIX_FMT_YUV422P ||
+                  avctx->pix_fmt == AV_PIX_FMT_YUV444P))
             format_supported = 1;
 
         if (!format_supported) {
@@ -283,6 +332,10 @@ av_cold int ff_mpv_encode_init(AVCodecContext *avctx)
     }
 
     switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_YUVJ444P:
+    case AV_PIX_FMT_YUV444P:
+        s->chroma_format = CHROMA_444;
+        break;
     case AV_PIX_FMT_YUVJ422P:
     case AV_PIX_FMT_YUV422P:
         s->chroma_format = CHROMA_422;
@@ -294,6 +347,8 @@ av_cold int ff_mpv_encode_init(AVCodecContext *avctx)
         break;
     }
 
+    avctx->bits_per_raw_sample = av_clip(avctx->bits_per_raw_sample, 0, 8);
+
 #if FF_API_PRIVATE_OPT
 FF_DISABLE_DEPRECATION_WARNINGS
     if (avctx->rtp_payload_size)
@@ -310,8 +365,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
     s->height   = avctx->height;
     if (avctx->gop_size > 600 &&
         avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Warning keyframe interval too large! reducing it ...\n");
+        av_log(avctx, AV_LOG_WARNING,
+               "keyframe interval too large!, reducing it from %d to %d\n",
+               avctx->gop_size, 600);
         avctx->gop_size = 600;
     }
     s->gop_size     = avctx->gop_size;
@@ -319,6 +375,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
     if (avctx->max_b_frames > MAX_B_FRAMES) {
         av_log(avctx, AV_LOG_ERROR, "Too many B-frames requested, maximum "
                "is %d.\n", MAX_B_FRAMES);
+        avctx->max_b_frames = MAX_B_FRAMES;
     }
     s->max_b_frames = avctx->max_b_frames;
     s->codec_id     = avctx->codec->id;
@@ -326,6 +383,24 @@ FF_ENABLE_DEPRECATION_WARNINGS
     s->quarter_sample     = (avctx->flags & AV_CODEC_FLAG_QPEL) != 0;
     s->rtp_mode           = !!s->rtp_payload_size;
     s->intra_dc_precision = avctx->intra_dc_precision;
+
+    // work around differences in how applications specify the DC precision
+    if (s->intra_dc_precision < 0) {
+        s->intra_dc_precision += 8;
+    } else if (s->intra_dc_precision >= 8)
+        s->intra_dc_precision -= 8;
+
+    if (s->intra_dc_precision < 0) {
+        av_log(avctx, AV_LOG_ERROR,
+                "intra dc precision must be positive, note some applications use"
+                " 0 and some 8 as base meaning 8bit, the value must not be smaller than that\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (s->intra_dc_precision > (avctx->codec_id == AV_CODEC_ID_MPEG2VIDEO ? 3 : 0)) {
+        av_log(avctx, AV_LOG_ERROR, "intra dc precision too large\n");
+        return AVERROR(EINVAL);
+    }
     s->user_specified_pts = AV_NOPTS_VALUE;
 
     if (s->gop_size <= 1) {
@@ -363,9 +438,33 @@ FF_ENABLE_DEPRECATION_WARNINGS
     s->loop_filter = !!(s->avctx->flags & AV_CODEC_FLAG_LOOP_FILTER);
 
     if (avctx->rc_max_rate && !avctx->rc_buffer_size) {
-        av_log(avctx, AV_LOG_ERROR,
-               "a vbv buffer size is needed, "
-               "for encoding with a maximum bitrate\n");
+        switch(avctx->codec_id) {
+        case AV_CODEC_ID_MPEG1VIDEO:
+        case AV_CODEC_ID_MPEG2VIDEO:
+            avctx->rc_buffer_size = FFMAX(avctx->rc_max_rate, 15000000) * 112LL / 15000000 * 16384;
+            break;
+        case AV_CODEC_ID_MPEG4:
+        case AV_CODEC_ID_MSMPEG4V1:
+        case AV_CODEC_ID_MSMPEG4V2:
+        case AV_CODEC_ID_MSMPEG4V3:
+            if       (avctx->rc_max_rate >= 15000000) {
+                avctx->rc_buffer_size = 320 + (avctx->rc_max_rate - 15000000LL) * (760-320) / (38400000 - 15000000);
+            } else if(avctx->rc_max_rate >=  2000000) {
+                avctx->rc_buffer_size =  80 + (avctx->rc_max_rate -  2000000LL) * (320- 80) / (15000000 -  2000000);
+            } else if(avctx->rc_max_rate >=   384000) {
+                avctx->rc_buffer_size =  40 + (avctx->rc_max_rate -   384000LL) * ( 80- 40) / ( 2000000 -   384000);
+            } else
+                avctx->rc_buffer_size = 40;
+            avctx->rc_buffer_size *= 16384;
+            break;
+        }
+        if (avctx->rc_buffer_size) {
+            av_log(avctx, AV_LOG_INFO, "Automatically choosing VBV buffer size of %d kbyte\n", avctx->rc_buffer_size/8192);
+        }
+    }
+
+    if ((!avctx->rc_max_rate) != (!avctx->rc_buffer_size)) {
+        av_log(avctx, AV_LOG_ERROR, "Either both buffer size and max rate or neither must be specified\n");
         return -1;
     }
 
@@ -380,7 +479,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
     }
 
     if (avctx->rc_max_rate && avctx->rc_max_rate < avctx->bit_rate) {
-        av_log(avctx, AV_LOG_INFO, "bitrate above max bitrate\n");
+        av_log(avctx, AV_LOG_ERROR, "bitrate above max bitrate\n");
         return -1;
     }
 
@@ -401,9 +500,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
     if (!s->fixed_qscale &&
         avctx->bit_rate * av_q2d(avctx->time_base) >
             avctx->bit_rate_tolerance) {
-        av_log(avctx, AV_LOG_ERROR,
-               "bitrate tolerance too small for bitrate\n");
-        return -1;
+        av_log(avctx, AV_LOG_WARNING,
+               "bitrate tolerance %d too small for bitrate %"PRId64", overriding\n", avctx->bit_rate_tolerance, (int64_t)avctx->bit_rate);
+        avctx->bit_rate_tolerance = 5 * avctx->bit_rate * av_q2d(avctx->time_base);
     }
 
     if (s->avctx->rc_max_rate &&
@@ -442,18 +541,74 @@ FF_ENABLE_DEPRECATION_WARNINGS
         av_log(avctx, AV_LOG_ERROR, "B-frames not supported by codec\n");
         return -1;
     }
+    if (s->max_b_frames < 0) {
+        av_log(avctx, AV_LOG_ERROR,
+               "max b frames must be 0 or positive for mpegvideo based encoders\n");
+        return -1;
+    }
 
     if ((s->codec_id == AV_CODEC_ID_MPEG4 ||
          s->codec_id == AV_CODEC_ID_H263  ||
          s->codec_id == AV_CODEC_ID_H263P) &&
         (avctx->sample_aspect_ratio.num > 255 ||
          avctx->sample_aspect_ratio.den > 255)) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Invalid pixel aspect ratio %i/%i, limit is 255/255\n",
+        av_log(avctx, AV_LOG_WARNING,
+               "Invalid pixel aspect ratio %i/%i, limit is 255/255 reducing\n",
                avctx->sample_aspect_ratio.num, avctx->sample_aspect_ratio.den);
+        av_reduce(&avctx->sample_aspect_ratio.num, &avctx->sample_aspect_ratio.den,
+                   avctx->sample_aspect_ratio.num,  avctx->sample_aspect_ratio.den, 255);
+    }
+
+    if ((s->codec_id == AV_CODEC_ID_H263  ||
+         s->codec_id == AV_CODEC_ID_H263P) &&
+        (avctx->width  > 2048 ||
+         avctx->height > 1152 )) {
+        av_log(avctx, AV_LOG_ERROR, "H.263 does not support resolutions above 2048x1152\n");
+        return -1;
+    }
+    if ((s->codec_id == AV_CODEC_ID_H263  ||
+         s->codec_id == AV_CODEC_ID_H263P) &&
+        ((avctx->width &3) ||
+         (avctx->height&3) )) {
+        av_log(avctx, AV_LOG_ERROR, "w/h must be a multiple of 4\n");
+        return -1;
+    }
+
+    if (s->codec_id == AV_CODEC_ID_MPEG1VIDEO &&
+        (avctx->width  > 4095 ||
+         avctx->height > 4095 )) {
+        av_log(avctx, AV_LOG_ERROR, "MPEG-1 does not support resolutions above 4095x4095\n");
         return -1;
     }
 
+    if (s->codec_id == AV_CODEC_ID_MPEG2VIDEO &&
+        (avctx->width  > 16383 ||
+         avctx->height > 16383 )) {
+        av_log(avctx, AV_LOG_ERROR, "MPEG-2 does not support resolutions above 16383x16383\n");
+        return -1;
+    }
+
+    if (s->codec_id == AV_CODEC_ID_RV10 &&
+        (avctx->width &15 ||
+         avctx->height&15 )) {
+        av_log(avctx, AV_LOG_ERROR, "width and height must be a multiple of 16\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (s->codec_id == AV_CODEC_ID_RV20 &&
+        (avctx->width &3 ||
+         avctx->height&3 )) {
+        av_log(avctx, AV_LOG_ERROR, "width and height must be a multiple of 4\n");
+        return AVERROR(EINVAL);
+    }
+
+    if ((s->codec_id == AV_CODEC_ID_WMV1 ||
+         s->codec_id == AV_CODEC_ID_WMV2) &&
+         avctx->width & 1) {
+         av_log(avctx, AV_LOG_ERROR, "width must be multiple of 2\n");
+         return -1;
+    }
+
     if ((s->avctx->flags & (AV_CODEC_FLAG_INTERLACED_DCT | AV_CODEC_FLAG_INTERLACED_ME)) &&
         s->codec_id != AV_CODEC_ID_MPEG4 && s->codec_id != AV_CODEC_ID_MPEG2VIDEO) {
         av_log(avctx, AV_LOG_ERROR, "interlacing not supported by codec\n");
@@ -468,7 +623,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
     // FIXME mpeg2 uses that too
-    if (s->mpeg_quant && s->codec_id != AV_CODEC_ID_MPEG4) {
+    if (s->mpeg_quant && (   s->codec_id != AV_CODEC_ID_MPEG4
+                          && s->codec_id != AV_CODEC_ID_MPEG2VIDEO)) {
         av_log(avctx, AV_LOG_ERROR,
                "mpeg2 style quantization not supported by codec\n");
         return -1;
@@ -514,9 +670,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
     }
 
     if (s->q_scale_type == 1) {
-        if (avctx->qmax > 12) {
+        if (avctx->qmax > 28) {
             av_log(avctx, AV_LOG_ERROR,
-                   "non linear quant only supports qmax <= 12 currently\n");
+                   "non linear quant only supports qmax <= 28 currently\n");
             return -1;
         }
     }
@@ -531,6 +687,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
         s->codec_id != AV_CODEC_ID_MPEG4      &&
         s->codec_id != AV_CODEC_ID_MPEG1VIDEO &&
         s->codec_id != AV_CODEC_ID_MPEG2VIDEO &&
+        s->codec_id != AV_CODEC_ID_MJPEG      &&
         (s->codec_id != AV_CODEC_ID_H263P)) {
         av_log(avctx, AV_LOG_ERROR,
                "multi threaded encoding not supported by codec\n");
@@ -539,7 +696,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
     if (s->avctx->thread_count < 1) {
         av_log(avctx, AV_LOG_ERROR,
-               "automatic thread number detection not supported by codec,"
+               "automatic thread number detection not supported by codec, "
                "patch welcome\n");
         return -1;
     }
@@ -572,8 +729,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
         //return -1;
     }
 
-    if (s->mpeg_quant || s->codec_id == AV_CODEC_ID_MPEG1VIDEO ||
-        s->codec_id == AV_CODEC_ID_MPEG2VIDEO || s->codec_id == AV_CODEC_ID_MJPEG) {
+    if (s->mpeg_quant || s->codec_id == AV_CODEC_ID_MPEG1VIDEO || s->codec_id == AV_CODEC_ID_MPEG2VIDEO || s->codec_id == AV_CODEC_ID_MJPEG || s->codec_id==AV_CODEC_ID_AMV) {
         // (a + x * 3 / 8) / x
         s->intra_quant_bias = 3 << (QUANT_BIAS_SHIFT - 3);
         s->inter_quant_bias = 0;
@@ -583,6 +739,11 @@ FF_ENABLE_DEPRECATION_WARNINGS
         s->inter_quant_bias = -(1 << (QUANT_BIAS_SHIFT - 2));
     }
 
+    if (avctx->qmin > avctx->qmax || avctx->qmin <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "qmin and or qmax are invalid, they must be 0 < min <= max\n");
+        return AVERROR(EINVAL);
+    }
+
 #if FF_API_QUANT_BIAS
 FF_DISABLE_DEPRECATION_WARNINGS
     if (avctx->intra_quant_bias != FF_DEFAULT_QUANT_BIAS)
@@ -592,6 +753,8 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
+    av_log(avctx, AV_LOG_DEBUG, "intra_quant_bias = %d inter_quant_bias = %d\n",s->intra_quant_bias,s->inter_quant_bias);
+
     if (avctx->codec_id == AV_CODEC_ID_MPEG4 &&
         s->avctx->time_base.den > (1 << 16) - 1) {
         av_log(avctx, AV_LOG_ERROR,
@@ -616,6 +779,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
         s->rtp_mode   = 1;
         break;
     case AV_CODEC_ID_MJPEG:
+    case AV_CODEC_ID_AMV:
         s->out_format = FMT_MJPEG;
         s->intra_only = 1; /* force intra only for jpeg */
         if (!CONFIG_MJPEG_ENCODER ||
@@ -641,13 +805,13 @@ FF_ENABLE_DEPRECATION_WARNINGS
         break;
     case AV_CODEC_ID_H263:
         if (!CONFIG_H263_ENCODER)
-        return -1;
+            return -1;
         if (ff_match_2uint16(ff_h263_format, FF_ARRAY_ELEMS(ff_h263_format),
                              s->width, s->height) == 8) {
-            av_log(avctx, AV_LOG_INFO,
+            av_log(avctx, AV_LOG_ERROR,
                    "The specified picture size of %dx%d is not valid for "
                    "the H.263 codec.\nValid sizes are 128x96, 176x144, "
-                   "352x288, 704x576, and 1408x1152."
+                   "352x288, 704x576, and 1408x1152. "
                    "Try H.263+.\n", s->width, s->height);
             return -1;
         }
@@ -759,9 +923,6 @@ FF_ENABLE_DEPRECATION_WARNINGS
     if (ff_mpv_common_init(s) < 0)
         return -1;
 
-    if (ARCH_X86)
-        ff_mpv_encode_init_x86(s);
-
     ff_fdctdsp_init(&s->fdsp, avctx);
     ff_me_cmp_init(&s->mecc, avctx);
     ff_mpegvideoencdsp_init(&s->mpvencdsp, avctx);
@@ -776,8 +937,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
     FF_ALLOCZ_OR_GOTO(s->avctx, s->avctx->stats_out, 256, fail);
 
     FF_ALLOCZ_OR_GOTO(s->avctx, s->q_intra_matrix,   64 * 32 * sizeof(int), fail);
+    FF_ALLOCZ_OR_GOTO(s->avctx, s->q_chroma_intra_matrix, 64 * 32 * sizeof(int), fail);
     FF_ALLOCZ_OR_GOTO(s->avctx, s->q_inter_matrix,   64 * 32 * sizeof(int), fail);
     FF_ALLOCZ_OR_GOTO(s->avctx, s->q_intra_matrix16, 64 * 32 * 2 * sizeof(uint16_t), fail);
+    FF_ALLOCZ_OR_GOTO(s->avctx, s->q_chroma_intra_matrix16, 64 * 32 * 2 * sizeof(uint16_t), fail);
     FF_ALLOCZ_OR_GOTO(s->avctx, s->q_inter_matrix16, 64 * 32 * 2 * sizeof(uint16_t), fail);
     FF_ALLOCZ_OR_GOTO(s->avctx, s->input_picture,
                       MAX_PICTURE_COUNT * sizeof(Picture *), fail);
@@ -790,15 +953,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
                           2 * 64 * sizeof(uint16_t), fail);
     }
 
-    if (CONFIG_H263_ENCODER)
-        ff_h263dsp_init(&s->h263dsp);
-    if (!s->dct_quantize)
-        s->dct_quantize = ff_dct_quantize_c;
-    if (!s->denoise_dct)
-        s->denoise_dct  = denoise_dct_c;
-    s->fast_dct_quantize = s->dct_quantize;
-    if (avctx->trellis)
-        s->dct_quantize  = dct_quantize_trellis_c;
+    ff_dct_encode_init(s);
 
     if ((CONFIG_H263P_ENCODER || CONFIG_RV20_ENCODER) && s->modified_quant)
         s->chroma_qscale_table = ff_h263_chroma_qscale_table;
@@ -806,7 +961,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
     if (s->slice_context_count > 1) {
         s->rtp_mode = 1;
 
-        if (avctx->codec_id == AV_CODEC_ID_H263 || avctx->codec_id == AV_CODEC_ID_H263P)
+        if (avctx->codec_id == AV_CODEC_ID_H263P)
             s->h263_slice_structured = 1;
     }
 
@@ -851,6 +1006,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
             s->inter_matrix[j] = ff_mpeg1_default_non_intra_matrix[i];
         } else {
             /* MPEG-1/2 */
+            s->chroma_intra_matrix[j] =
             s->intra_matrix[j] = ff_mpeg1_default_intra_matrix[i];
             s->inter_matrix[j] = ff_mpeg1_default_non_intra_matrix[i];
         }
@@ -984,6 +1140,10 @@ av_cold int ff_mpv_encode_end(AVCodecContext *avctx)
     av_freep(&s->avctx->stats_out);
     av_freep(&s->ac_stats);
 
+    if(s->q_chroma_intra_matrix   != s->q_intra_matrix  ) av_freep(&s->q_chroma_intra_matrix);
+    if(s->q_chroma_intra_matrix16 != s->q_intra_matrix16) av_freep(&s->q_chroma_intra_matrix16);
+    s->q_chroma_intra_matrix=   NULL;
+    s->q_chroma_intra_matrix16= NULL;
     av_freep(&s->q_intra_matrix);
     av_freep(&s->q_inter_matrix);
     av_freep(&s->q_intra_matrix16);
@@ -1036,7 +1196,7 @@ static int alloc_picture(MpegEncContext *s, Picture *pic, int shared)
 {
     return ff_alloc_picture(s->avctx, pic, &s->me, &s->sc, shared, 1,
                             s->chroma_x_shift, s->chroma_y_shift, s->out_format,
-                            s->mb_stride, s->mb_height, s->b8_stride,
+                            s->mb_stride, s->mb_width, s->mb_height, s->b8_stride,
                             &s->linesize, &s->uvlinesize);
 }
 
@@ -1056,18 +1216,17 @@ static int load_input_picture(MpegEncContext *s, const AVFrame *pic_arg)
 
         if (pts != AV_NOPTS_VALUE) {
             if (s->user_specified_pts != AV_NOPTS_VALUE) {
-                int64_t time = pts;
                 int64_t last = s->user_specified_pts;
 
-                if (time <= last) {
+                if (pts <= last) {
                     av_log(s->avctx, AV_LOG_ERROR,
-                           "Error, Invalid timestamp=%"PRId64", "
-                           "last=%"PRId64"\n", pts, s->user_specified_pts);
-                    return -1;
+                           "Invalid pts (%"PRId64") <= last (%"PRId64")\n",
+                           pts, last);
+                    return AVERROR(EINVAL);
                 }
 
                 if (!s->low_delay && display_picture_number == 1)
-                    s->dts_delta = time - last;
+                    s->dts_delta = pts - last;
             }
             s->user_specified_pts = pts;
         } else {
@@ -1089,8 +1248,12 @@ static int load_input_picture(MpegEncContext *s, const AVFrame *pic_arg)
             direct = 0;
         if ((s->width & 15) || (s->height & 15))
             direct = 0;
+        if (((intptr_t)(pic_arg->data[0])) & (STRIDE_ALIGN-1))
+            direct = 0;
+        if (s->linesize & (STRIDE_ALIGN-1))
+            direct = 0;
 
-        ff_dlog(s->avctx, "%d %d %td %td\n", pic_arg->linesize[0],
+        ff_dlog(s->avctx, "%d %d %"PTRDIFF_SPECIFIER" %"PTRDIFF_SPECIFIER"\n", pic_arg->linesize[0],
                 pic_arg->linesize[1], s->linesize, s->uvlinesize);
 
         i = ff_find_unused_picture(s->avctx, s->picture, direct);
@@ -1128,6 +1291,12 @@ static int load_input_picture(MpegEncContext *s, const AVFrame *pic_arg)
                     int h = s->height >> v_shift;
                     uint8_t *src = pic_arg->data[i];
                     uint8_t *dst = pic->f->data[i];
+                    int vpad = 16;
+
+                    if (   s->codec_id == AV_CODEC_ID_MPEG2VIDEO
+                        && !s->progressive_sequence
+                        && FFALIGN(s->height, 32) - s->height > 16)
+                        vpad = 32;
 
                     if (!s->avctx->rc_buffer_size)
                         dst += INPLACE_OFFSET;
@@ -1143,11 +1312,11 @@ static int load_input_picture(MpegEncContext *s, const AVFrame *pic_arg)
                             src += src_stride;
                         }
                     }
-                    if ((s->width & 15) || (s->height & 15)) {
+                    if ((s->width & 15) || (s->height & (vpad-1))) {
                         s->mpvencdsp.draw_edges(dst, dst_stride,
                                                 w, h,
                                                 16 >> h_shift,
-                                                16 >> v_shift,
+                                                vpad >> v_shift,
                                                 EDGE_BOTTOM);
                     }
                 }
@@ -1197,19 +1366,23 @@ static int skip_check(MpegEncContext *s, Picture *p, Picture *ref)
                 uint8_t *rptr = ref->f->data[plane] + 8 * (x + y * stride);
                 int v = s->mecc.frame_skip_cmp[1](s, dptr, rptr, stride, 8);
 
-                switch (s->frame_skip_exp) {
+                switch (FFABS(s->frame_skip_exp)) {
                 case 0: score    =  FFMAX(score, v);          break;
                 case 1: score   += FFABS(v);                  break;
-                case 2: score   += v * v;                     break;
-                case 3: score64 += FFABS(v * v * (int64_t)v); break;
-                case 4: score64 += v * v * (int64_t)(v * v);  break;
+                case 2: score64 += v * (int64_t)v;                       break;
+                case 3: score64 += FFABS(v * (int64_t)v * v);            break;
+                case 4: score64 += (v * (int64_t)v) * (v * (int64_t)v);  break;
                 }
             }
         }
     }
+    emms_c();
 
     if (score)
         score64 = score;
+    if (s->frame_skip_exp < 0)
+        score64 = pow(score64 / (double)(s->mb_width * s->mb_height),
+                      -1.0/s->frame_skip_exp);
 
     if (score64 < s->frame_skip_threshold)
         return 1;
@@ -1244,7 +1417,7 @@ static int estimate_best_b_count(MpegEncContext *s)
 
     if (!c)
         return AVERROR(ENOMEM);
-    assert(scale >= 0 && scale <= 3);
+    av_assert0(scale >= 0 && scale <= 3);
 
     //emms_c();
     //s->next_picture_ptr->quality;
@@ -1274,29 +1447,31 @@ static int estimate_best_b_count(MpegEncContext *s)
     for (i = 0; i < s->max_b_frames + 2; i++) {
         Picture pre_input, *pre_input_ptr = i ? s->input_picture[i - 1] :
                                                 s->next_picture_ptr;
+        uint8_t *data[4];
 
         if (pre_input_ptr && (!i || s->input_picture[i - 1])) {
             pre_input = *pre_input_ptr;
+            memcpy(data, pre_input_ptr->f->data, sizeof(data));
 
             if (!pre_input.shared && i) {
-                pre_input.f->data[0] += INPLACE_OFFSET;
-                pre_input.f->data[1] += INPLACE_OFFSET;
-                pre_input.f->data[2] += INPLACE_OFFSET;
+                data[0] += INPLACE_OFFSET;
+                data[1] += INPLACE_OFFSET;
+                data[2] += INPLACE_OFFSET;
             }
 
             s->mpvencdsp.shrink[scale](s->tmp_frames[i]->data[0],
                                        s->tmp_frames[i]->linesize[0],
-                                       pre_input.f->data[0],
+                                       data[0],
                                        pre_input.f->linesize[0],
                                        c->width, c->height);
             s->mpvencdsp.shrink[scale](s->tmp_frames[i]->data[1],
                                        s->tmp_frames[i]->linesize[1],
-                                       pre_input.f->data[1],
+                                       data[1],
                                        pre_input.f->linesize[1],
                                        c->width >> 1, c->height >> 1);
             s->mpvencdsp.shrink[scale](s->tmp_frames[i]->data[2],
                                        s->tmp_frames[i]->linesize[2],
-                                       pre_input.f->data[2],
+                                       data[2],
                                        pre_input.f->linesize[2],
                                        c->width >> 1, c->height >> 1);
         }
@@ -1358,6 +1533,19 @@ static int select_input_picture(MpegEncContext *s)
 
     /* set next picture type & ordering */
     if (!s->reordered_input_picture[0] && s->input_picture[0]) {
+        if (s->frame_skip_threshold || s->frame_skip_factor) {
+            if (s->picture_in_gop_number < s->gop_size &&
+                s->next_picture_ptr &&
+                skip_check(s, s->input_picture[0], s->next_picture_ptr)) {
+                // FIXME check that the gop check above is +-1 correct
+                av_frame_unref(s->input_picture[0]->f);
+
+                ff_vbv_update(s, 0);
+
+                goto no_output_pic;
+            }
+        }
+
         if (/*s->picture_in_gop_number >= s->gop_size ||*/
             !s->next_picture_ptr || s->intra_only) {
             s->reordered_input_picture[0] = s->input_picture[0];
@@ -1367,19 +1555,6 @@ static int select_input_picture(MpegEncContext *s)
         } else {
             int b_frames = 0;
 
-            if (s->frame_skip_threshold || s->frame_skip_factor) {
-                if (s->picture_in_gop_number < s->gop_size &&
-                    skip_check(s, s->input_picture[0], s->next_picture_ptr)) {
-                    // FIXME check that the gop check above is +-1 correct
-                    av_frame_unref(s->input_picture[0]->f);
-
-                    emms_c();
-                    ff_vbv_update(s, 0);
-
-                    goto no_output_pic;
-                }
-            }
-
             if (s->avctx->flags & AV_CODEC_FLAG_PASS2) {
                 for (i = 0; i < s->max_b_frames + 1; i++) {
                     int pict_num = s->input_picture[0]->f->display_picture_number + i;
@@ -1524,25 +1699,26 @@ no_output_pic:
 
 static void frame_end(MpegEncContext *s)
 {
-    int i;
-
     if (s->unrestricted_mv &&
         s->current_picture.reference &&
         !s->intra_only) {
         const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(s->avctx->pix_fmt);
         int hshift = desc->log2_chroma_w;
         int vshift = desc->log2_chroma_h;
-        s->mpvencdsp.draw_edges(s->current_picture.f->data[0], s->linesize,
+        s->mpvencdsp.draw_edges(s->current_picture.f->data[0],
+                                s->current_picture.f->linesize[0],
                                 s->h_edge_pos, s->v_edge_pos,
                                 EDGE_WIDTH, EDGE_WIDTH,
                                 EDGE_TOP | EDGE_BOTTOM);
-        s->mpvencdsp.draw_edges(s->current_picture.f->data[1], s->uvlinesize,
+        s->mpvencdsp.draw_edges(s->current_picture.f->data[1],
+                                s->current_picture.f->linesize[1],
                                 s->h_edge_pos >> hshift,
                                 s->v_edge_pos >> vshift,
                                 EDGE_WIDTH >> hshift,
                                 EDGE_WIDTH >> vshift,
                                 EDGE_TOP | EDGE_BOTTOM);
-        s->mpvencdsp.draw_edges(s->current_picture.f->data[2], s->uvlinesize,
+        s->mpvencdsp.draw_edges(s->current_picture.f->data[2],
+                                s->current_picture.f->linesize[2],
                                 s->h_edge_pos >> hshift,
                                 s->v_edge_pos >> vshift,
                                 EDGE_WIDTH >> hshift,
@@ -1557,14 +1733,6 @@ static void frame_end(MpegEncContext *s)
     if (s->pict_type!= AV_PICTURE_TYPE_B)
         s->last_non_b_pict_type = s->pict_type;
 
-    if (s->encoding) {
-        /* release non-reference frames */
-        for (i = 0; i < MAX_PICTURE_COUNT; i++) {
-            if (!s->picture[i].reference)
-                ff_mpeg_unref_picture(s->avctx, &s->picture[i]);
-        }
-    }
-
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
     av_frame_copy_props(s->avctx->coded_frame, s->current_picture.f);
@@ -1664,7 +1832,7 @@ static int frame_start(MpegEncContext *s)
     }
 
     if (s->dct_error_sum) {
-        assert(s->noise_reduction && s->encoding);
+        av_assert2(s->noise_reduction && s->encoding);
         update_noise_reduction(s);
     }
 
@@ -1678,6 +1846,8 @@ int ff_mpv_encode_picture(AVCodecContext *avctx, AVPacket *pkt,
     int i, stuffing_count, ret;
     int context_count = s->slice_context_count;
 
+    s->vbv_ignore_qmax = 0;
+
     s->picture_in_gop_number++;
 
     if (load_input_picture(s, pic_arg) < 0)
@@ -1689,9 +1859,11 @@ int ff_mpv_encode_picture(AVCodecContext *avctx, AVPacket *pkt,
 
     /* output? */
     if (s->new_picture.f->data[0]) {
-        uint8_t *sd;
-        if (!pkt->data &&
-            (ret = ff_alloc_packet(pkt, s->mb_width*s->mb_height*MAX_MB_BYTES)) < 0)
+        int growing_buffer = context_count == 1 && !pkt->data && !s->data_partitioning;
+        int pkt_size = growing_buffer ? FFMAX(s->mb_width*s->mb_height*64+10000, avctx->internal->byte_buffer_size) - AV_INPUT_BUFFER_PADDING_SIZE
+                                              :
+                                              s->mb_width*s->mb_height*(MAX_MB_BYTES+100)+10000;
+        if ((ret = ff_alloc_packet2(avctx, pkt, pkt_size, 0)) < 0)
             return ret;
         if (s->mb_info) {
             s->mb_info_ptr = av_packet_new_side_data(pkt,
@@ -1716,7 +1888,13 @@ int ff_mpv_encode_picture(AVCodecContext *avctx, AVPacket *pkt,
         if (ret < 0)
             return ret;
 vbv_retry:
-        if (encode_picture(s, s->picture_number) < 0)
+        ret = encode_picture(s, s->picture_number);
+        if (growing_buffer) {
+            av_assert0(s->pb.buf == avctx->internal->byte_buffer);
+            pkt->data = s->pb.buf;
+            pkt->size = avctx->internal->byte_buffer_size;
+        }
+        if (ret < 0)
             return -1;
 
 #if FF_API_STAT_BITS
@@ -1735,28 +1913,24 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
         frame_end(s);
 
-        sd = av_packet_new_side_data(pkt, AV_PKT_DATA_QUALITY_FACTOR,
-                                     sizeof(int));
-        if (!sd)
-            return AVERROR(ENOMEM);
-        *(int *)sd = s->current_picture.f->quality;
-
         if (CONFIG_MJPEG_ENCODER && s->out_format == FMT_MJPEG)
             ff_mjpeg_encode_picture_trailer(&s->pb, s->header_bits);
 
         if (avctx->rc_buffer_size) {
             RateControlContext *rcc = &s->rc_context;
-            int max_size = rcc->buffer_index * avctx->rc_max_available_vbv_use;
+            int max_size = FFMAX(rcc->buffer_index * avctx->rc_max_available_vbv_use, rcc->buffer_index - 500);
+            int hq = (s->avctx->mb_decision == FF_MB_DECISION_RD || s->avctx->trellis);
+            int min_step = hq ? 1 : (1<<(FF_LAMBDA_SHIFT + 7))/139;
 
             if (put_bits_count(&s->pb) > max_size &&
                 s->lambda < s->lmax) {
-                s->next_lambda = FFMAX(s->lambda + 1, s->lambda *
+                s->next_lambda = FFMAX(s->lambda + min_step, s->lambda *
                                        (s->qscale + 1) / s->qscale);
                 if (s->adaptive_quant) {
                     int i;
                     for (i = 0; i < s->mb_height * s->mb_stride; i++)
                         s->lambda_table[i] =
-                            FFMAX(s->lambda_table[i] + 1,
+                            FFMAX(s->lambda_table[i] + min_step,
                                   s->lambda_table[i] * (s->qscale + 1) /
                                   s->qscale);
                 }
@@ -1776,10 +1950,12 @@ FF_ENABLE_DEPRECATION_WARNINGS
                     PutBitContext *pb = &s->thread_context[i]->pb;
                     init_put_bits(pb, pb->buf, pb->buf_end - pb->buf);
                 }
+                s->vbv_ignore_qmax = 1;
+                av_log(s->avctx, AV_LOG_VERBOSE, "reencoding frame due to VBV\n");
                 goto vbv_retry;
             }
 
-            assert(s->avctx->rc_max_rate);
+            av_assert0(s->avctx->rc_max_rate);
         }
 
         if (s->avctx->flags & AV_CODEC_FLAG_PASS1)
@@ -1789,6 +1965,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
             s->current_picture_ptr->encoding_error[i] = s->current_picture.encoding_error[i];
             avctx->error[i] += s->current_picture_ptr->encoding_error[i];
         }
+        ff_side_data_set_encoder_stats(pkt, s->current_picture.f->quality,
+                                       s->current_picture_ptr->encoding_error,
+                                       (s->avctx->flags&AV_CODEC_FLAG_PSNR) ? 4 : 0,
+                                       s->pict_type);
 
         if (s->avctx->flags & AV_CODEC_FLAG_PASS1)
             assert(put_bits_count(&s->pb) == s->header_bits + s->mv_bits +
@@ -1798,6 +1978,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
         s->frame_bits  = put_bits_count(&s->pb);
 
         stuffing_count = ff_vbv_update(s, s->frame_bits);
+        s->stuffing_bits = 8*stuffing_count;
         if (stuffing_count) {
             if (s->pb.buf_end - s->pb.buf - (put_bits_count(&s->pb) >> 3) <
                     stuffing_count + 50) {
@@ -1855,7 +2036,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
             vbv_delay = FFMAX(vbv_delay, min_delay);
 
-            assert(vbv_delay < 0xFFFF);
+            av_assert0(vbv_delay < 0xFFFF);
 
             s->vbv_delay_ptr[0] &= 0xF8;
             s->vbv_delay_ptr[0] |= vbv_delay >> 13;
@@ -1905,7 +2086,14 @@ FF_ENABLE_DEPRECATION_WARNINGS
     } else {
         s->frame_bits = 0;
     }
-    assert((s->frame_bits & 7) == 0);
+
+    /* release non-reference frames */
+    for (i = 0; i < MAX_PICTURE_COUNT; i++) {
+        if (!s->picture[i].reference)
+            ff_mpeg_unref_picture(s->avctx, &s->picture[i]);
+    }
+
+    av_assert1((s->frame_bits & 7) == 0);
 
     pkt->size = s->frame_bits / 8;
     *got_packet = !!pkt->size;
@@ -2029,15 +2217,17 @@ static void get_visual_weight(int16_t *weight, uint8_t *ptr, int stride)
 static av_always_inline void encode_mb_internal(MpegEncContext *s,
                                                 int motion_x, int motion_y,
                                                 int mb_block_height,
+                                                int mb_block_width,
                                                 int mb_block_count)
 {
-    int16_t weight[8][64];
-    int16_t orig[8][64];
+    int16_t weight[12][64];
+    int16_t orig[12][64];
     const int mb_x = s->mb_x;
     const int mb_y = s->mb_y;
     int i;
-    int skip_dct[8];
+    int skip_dct[12];
     int dct_offset = s->linesize * 8; // default for progressive frames
+    int uv_dct_offset = s->uvlinesize * 8;
     uint8_t *ptr_y, *ptr_cb, *ptr_cr;
     ptrdiff_t wrap_y, wrap_c;
 
@@ -2079,27 +2269,31 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
     ptr_y  = s->new_picture.f->data[0] +
              (mb_y * 16 * wrap_y)              + mb_x * 16;
     ptr_cb = s->new_picture.f->data[1] +
-             (mb_y * mb_block_height * wrap_c) + mb_x * 8;
+             (mb_y * mb_block_height * wrap_c) + mb_x * mb_block_width;
     ptr_cr = s->new_picture.f->data[2] +
-             (mb_y * mb_block_height * wrap_c) + mb_x * 8;
+             (mb_y * mb_block_height * wrap_c) + mb_x * mb_block_width;
 
-    if (mb_x * 16 + 16 > s->width || mb_y * 16 + 16 > s->height) {
-        uint8_t *ebuf = s->sc.edge_emu_buffer + 32;
+    if((mb_x * 16 + 16 > s->width || mb_y * 16 + 16 > s->height) && s->codec_id != AV_CODEC_ID_AMV){
+        uint8_t *ebuf = s->sc.edge_emu_buffer + 36 * wrap_y;
+        int cw = (s->width  + s->chroma_x_shift) >> s->chroma_x_shift;
+        int ch = (s->height + s->chroma_y_shift) >> s->chroma_y_shift;
         s->vdsp.emulated_edge_mc(ebuf, ptr_y,
                                  wrap_y, wrap_y,
                                  16, 16, mb_x * 16, mb_y * 16,
                                  s->width, s->height);
         ptr_y = ebuf;
-        s->vdsp.emulated_edge_mc(ebuf + 18 * wrap_y, ptr_cb,
+        s->vdsp.emulated_edge_mc(ebuf + 16 * wrap_y, ptr_cb,
                                  wrap_c, wrap_c,
-                                 8, mb_block_height, mb_x * 8, mb_y * 8,
-                                 s->width >> 1, s->height >> 1);
-        ptr_cb = ebuf + 18 * wrap_y;
-        s->vdsp.emulated_edge_mc(ebuf + 18 * wrap_y + 8, ptr_cr,
+                                 mb_block_width, mb_block_height,
+                                 mb_x * mb_block_width, mb_y * mb_block_height,
+                                 cw, ch);
+        ptr_cb = ebuf + 16 * wrap_y;
+        s->vdsp.emulated_edge_mc(ebuf + 16 * wrap_y + 16, ptr_cr,
                                  wrap_c, wrap_c,
-                                 8, mb_block_height, mb_x * 8, mb_y * 8,
-                                 s->width >> 1, s->height >> 1);
-        ptr_cr = ebuf + 18 * wrap_y + 8;
+                                 mb_block_width, mb_block_height,
+                                 mb_x * mb_block_width, mb_y * mb_block_height,
+                                 cw, ch);
+        ptr_cr = ebuf + 16 * wrap_y + 16;
     }
 
     if (s->mb_intra) {
@@ -2120,8 +2314,10 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
                     s->interlaced_dct = 1;
 
                     dct_offset = wrap_y;
+                    uv_dct_offset = wrap_c;
                     wrap_y <<= 1;
-                    if (s->chroma_format == CHROMA_422)
+                    if (s->chroma_format == CHROMA_422 ||
+                        s->chroma_format == CHROMA_444)
                         wrap_c <<= 1;
                 }
             }
@@ -2138,11 +2334,16 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
         } else {
             s->pdsp.get_pixels(s->block[4], ptr_cb, wrap_c);
             s->pdsp.get_pixels(s->block[5], ptr_cr, wrap_c);
-            if (!s->chroma_y_shift) { /* 422 */
-                s->pdsp.get_pixels(s->block[6],
-                                   ptr_cb + (dct_offset >> 1), wrap_c);
-                s->pdsp.get_pixels(s->block[7],
-                                   ptr_cr + (dct_offset >> 1), wrap_c);
+            if (!s->chroma_y_shift && s->chroma_x_shift) { /* 422 */
+                s->pdsp.get_pixels(s->block[6], ptr_cb + uv_dct_offset, wrap_c);
+                s->pdsp.get_pixels(s->block[7], ptr_cr + uv_dct_offset, wrap_c);
+            } else if (!s->chroma_y_shift && !s->chroma_x_shift) { /* 444 */
+                s->pdsp.get_pixels(s->block[ 6], ptr_cb + 8, wrap_c);
+                s->pdsp.get_pixels(s->block[ 7], ptr_cr + 8, wrap_c);
+                s->pdsp.get_pixels(s->block[ 8], ptr_cb + uv_dct_offset, wrap_c);
+                s->pdsp.get_pixels(s->block[ 9], ptr_cr + uv_dct_offset, wrap_c);
+                s->pdsp.get_pixels(s->block[10], ptr_cb + uv_dct_offset + 8, wrap_c);
+                s->pdsp.get_pixels(s->block[11], ptr_cr + uv_dct_offset + 8, wrap_c);
             }
         }
     } else {
@@ -2198,6 +2399,7 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
                     s->interlaced_dct = 1;
 
                     dct_offset = wrap_y;
+                    uv_dct_offset = wrap_c;
                     wrap_y <<= 1;
                     if (s->chroma_format == CHROMA_422)
                         wrap_c <<= 1;
@@ -2219,10 +2421,10 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
             s->pdsp.diff_pixels(s->block[4], ptr_cb, dest_cb, wrap_c);
             s->pdsp.diff_pixels(s->block[5], ptr_cr, dest_cr, wrap_c);
             if (!s->chroma_y_shift) { /* 422 */
-                s->pdsp.diff_pixels(s->block[6], ptr_cb + (dct_offset >> 1),
-                                    dest_cb + (dct_offset >> 1), wrap_c);
-                s->pdsp.diff_pixels(s->block[7], ptr_cr + (dct_offset >> 1),
-                                    dest_cr + (dct_offset >> 1), wrap_c);
+                s->pdsp.diff_pixels(s->block[6], ptr_cb + uv_dct_offset,
+                                    dest_cb + uv_dct_offset, wrap_c);
+                s->pdsp.diff_pixels(s->block[7], ptr_cr + uv_dct_offset,
+                                    dest_cr + uv_dct_offset, wrap_c);
             }
         }
         /* pre quantization */
@@ -2244,12 +2446,12 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
             if (s->mecc.sad[1](NULL, ptr_cr, dest_cr, wrap_c, 8) < 20 * s->qscale)
                 skip_dct[5] = 1;
             if (!s->chroma_y_shift) { /* 422 */
-                if (s->mecc.sad[1](NULL, ptr_cb + (dct_offset >> 1),
-                                   dest_cb + (dct_offset >> 1),
+                if (s->mecc.sad[1](NULL, ptr_cb + uv_dct_offset,
+                                   dest_cb + uv_dct_offset,
                                    wrap_c, 8) < 20 * s->qscale)
                     skip_dct[6] = 1;
-                if (s->mecc.sad[1](NULL, ptr_cr + (dct_offset >> 1),
-                                   dest_cr + (dct_offset >> 1),
+                if (s->mecc.sad[1](NULL, ptr_cr + uv_dct_offset,
+                                   dest_cr + uv_dct_offset,
                                    wrap_c, 8) < 20 * s->qscale)
                     skip_dct[7] = 1;
             }
@@ -2271,17 +2473,17 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
             get_visual_weight(weight[5], ptr_cr                , wrap_c);
         if (!s->chroma_y_shift) { /* 422 */
             if (!skip_dct[6])
-                get_visual_weight(weight[6], ptr_cb + (dct_offset >> 1),
+                get_visual_weight(weight[6], ptr_cb + uv_dct_offset,
                                   wrap_c);
             if (!skip_dct[7])
-                get_visual_weight(weight[7], ptr_cr + (dct_offset >> 1),
+                get_visual_weight(weight[7], ptr_cr + uv_dct_offset,
                                   wrap_c);
         }
         memcpy(orig[0], s->block[0], sizeof(int16_t) * 64 * mb_block_count);
     }
 
     /* DCT & quantize */
-    assert(s->out_format != FMT_MJPEG || s->qscale == 8);
+    av_assert2(s->out_format != FMT_MJPEG || s->qscale == 8);
     {
         for (i = 0; i < mb_block_count; i++) {
             if (!skip_dct[i]) {
@@ -2327,6 +2529,12 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
         s->block_last_index[5] = 0;
         s->block[4][0] =
         s->block[5][0] = (1024 + s->c_dc_scale / 2) / s->c_dc_scale;
+        if (!s->chroma_y_shift) { /* 422 / 444 */
+            for (i=6; i<12; i++) {
+                s->block_last_index[i] = 0;
+                s->block[i][0] = s->block[4][0];
+            }
+        }
     }
 
     // non c quantize code returns incorrect block_last_index FIXME
@@ -2377,18 +2585,20 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s,
             ff_h263_encode_mb(s, s->block, motion_x, motion_y);
         break;
     case AV_CODEC_ID_MJPEG:
+    case AV_CODEC_ID_AMV:
         if (CONFIG_MJPEG_ENCODER)
             ff_mjpeg_encode_mb(s, s->block);
         break;
     default:
-        assert(0);
+        av_assert1(0);
     }
 }
 
 static av_always_inline void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
 {
-    if (s->chroma_format == CHROMA_420) encode_mb_internal(s, motion_x, motion_y,  8, 6);
-    else                                encode_mb_internal(s, motion_x, motion_y, 16, 8);
+    if (s->chroma_format == CHROMA_420) encode_mb_internal(s, motion_x, motion_y,  8, 8, 6); /* trailing args: mb_block_height, mb_block_width, mb_block_count */
+    else if (s->chroma_format == CHROMA_422) encode_mb_internal(s, motion_x, motion_y, 16, 8, 8); /* 4:2:2: chroma block 16 rows by 8 columns, 8 blocks per macroblock */
+    else encode_mb_internal(s, motion_x, motion_y, 16, 16, 12); /* any other chroma format (presumably 4:4:4 - confirm): 16x16 chroma, 12 blocks */
 }
 
 static inline void copy_context_before_encode(MpegEncContext *d, MpegEncContext *s, int type){
@@ -2479,7 +2689,7 @@ static inline void encode_mb_hq(MpegEncContext *s, MpegEncContext *backup, MpegE
         s->dest[0] = s->sc.rd_scratchpad;
         s->dest[1] = s->sc.rd_scratchpad + 16*s->linesize;
         s->dest[2] = s->sc.rd_scratchpad + 16*s->linesize + 8;
-        assert(s->linesize >= 32); //FIXME
+        av_assert0(s->linesize >= 32); //FIXME
     }
 
     encode_mb(s, motion_x, motion_y);
@@ -2525,7 +2735,7 @@ static int sse(MpegEncContext *s, uint8_t *src1, uint8_t *src2, int w, int h, in
         }
     }
 
-    assert(acc>=0);
+    av_assert2(acc>=0);
 
     return acc;
 }
@@ -2575,6 +2785,8 @@ static int pre_estimate_motion_thread(AVCodecContext *c, void *arg){
 static int estimate_motion_thread(AVCodecContext *c, void *arg){
     MpegEncContext *s= *(void**)arg;
 
+    ff_check_alignment();
+
     s->me.dia_size= s->avctx->dia_size;
     s->first_slice_line=1;
     for(s->mb_y= s->start_mb_y; s->mb_y < s->end_mb_y; s->mb_y++) {
@@ -2601,6 +2813,8 @@ static int mb_var_thread(AVCodecContext *c, void *arg){
     MpegEncContext *s= *(void**)arg;
     int mb_x, mb_y;
 
+    ff_check_alignment();
+
     for(mb_y=s->start_mb_y; mb_y < s->end_mb_y; mb_y++) {
         for(mb_x=0; mb_x < s->mb_width; mb_x++) {
             int xx = mb_x * 16;
@@ -2628,7 +2842,7 @@ static void write_slice_end(MpegEncContext *s){
 
         ff_mpeg4_stuffing(&s->pb);
     }else if(CONFIG_MJPEG_ENCODER && s->out_format == FMT_MJPEG){
-        ff_mjpeg_encode_stuffing(&s->pb);
+        ff_mjpeg_encode_stuffing(s);
     }
 
     avpriv_align_put_bits(&s->pb);
@@ -2681,6 +2895,40 @@ static void update_mb_info(MpegEncContext *s, int startcode)
     write_mb_info(s);
 }
 
+int ff_mpv_reallocate_putbitbuffer(MpegEncContext *s, size_t threshold, size_t size_increase) /* Ensure at least threshold free bytes in s->pb, growing the buffer by size_increase when allowed; returns 0 or a negative AVERROR. */
+{
+    if (   s->pb.buf_end - s->pb.buf - (put_bits_count(&s->pb)>>3) < threshold /* buffer capacity minus bytes already written (bit count >> 3) */
+        && s->slice_context_count == 1 /* growth is only attempted with a single slice context */
+        && s->pb.buf == s->avctx->internal->byte_buffer) { /* ...and only when pb writes directly into the codec-internal byte_buffer */
+        int lastgob_pos = s->ptr_lastgob - s->pb.buf; /* save as offsets: the pointers into the old buffer are re-derived after the move */
+        int vbv_pos     = s->vbv_delay_ptr - s->pb.buf; /* NOTE(review): confirm vbv_delay_ptr always points into pb.buf here, otherwise this offset is meaningless */
+
+        uint8_t *new_buffer = NULL;
+        int new_buffer_size = 0;
+
+        if ((s->avctx->internal->byte_buffer_size + size_increase) >= INT_MAX/8) { /* presumably keeps the size expressed in bits within int range - confirm */
+            av_log(s->avctx, AV_LOG_ERROR, "Cannot reallocate putbit buffer\n");
+            return AVERROR(ENOMEM);
+        }
+
+        av_fast_padded_malloc(&new_buffer, &new_buffer_size, /* fresh allocation; the old buffer stays valid until freed below */
+                              s->avctx->internal->byte_buffer_size + size_increase);
+        if (!new_buffer) /* allocation failed: the old buffer and s->pb are left untouched */
+            return AVERROR(ENOMEM);
+
+        memcpy(new_buffer, s->avctx->internal->byte_buffer, s->avctx->internal->byte_buffer_size); /* copy the whole old buffer, not just the bytes written so far */
+        av_free(s->avctx->internal->byte_buffer);
+        s->avctx->internal->byte_buffer      = new_buffer;
+        s->avctx->internal->byte_buffer_size = new_buffer_size; /* records the size reported by the allocator, not the requested size */
+        rebase_put_bits(&s->pb, new_buffer, new_buffer_size); /* helper not visible here; expected to point pb at the new buffer - confirm */
+        s->ptr_lastgob   = s->pb.buf + lastgob_pos; /* restore the saved offsets relative to the new base */
+        s->vbv_delay_ptr = s->pb.buf + vbv_pos;
+    }
+    if (s->pb.buf_end - s->pb.buf - (put_bits_count(&s->pb)>>3) < threshold) /* re-check: also reached when the conditions for growing did not hold */
+        return AVERROR(EINVAL);
+    return 0;
+}
+
 static int encode_thread(AVCodecContext *c, void *arg){
     MpegEncContext *s= *(void**)arg;
     int mb_x, mb_y, pdif = 0;
@@ -2692,6 +2940,8 @@ static int encode_thread(AVCodecContext *c, void *arg){
     uint8_t bit_buf_tex[2][MAX_MB_BYTES];
     PutBitContext pb[2], pb2[2], tex_pb[2];
 
+    ff_check_alignment();
+
     for(i=0; i<2; i++){
         init_put_bits(&pb    [i], bit_buf    [i], MAX_MB_BYTES);
         init_put_bits(&pb2   [i], bit_buf2   [i], MAX_MB_BYTES);
@@ -2715,6 +2965,11 @@ static int encode_thread(AVCodecContext *c, void *arg){
 
         s->current_picture.encoding_error[i] = 0;
     }
+    if(s->codec_id==AV_CODEC_ID_AMV){
+        s->last_dc[0] = 128*8/13;
+        s->last_dc[1] = 128*8/14;
+        s->last_dc[2] = 128*8/14;
+    }
     s->mb_skip_run = 0;
     memset(s->last_mv, 0, sizeof(s->last_mv));
 
@@ -2750,7 +3005,10 @@ static int encode_thread(AVCodecContext *c, void *arg){
 //            int d;
             int dmin= INT_MAX;
             int dir;
+            int size_increase =  s->avctx->internal->byte_buffer_size/4
+                               + s->mb_width*MAX_MB_BYTES;
 
+            ff_mpv_reallocate_putbitbuffer(s, MAX_MB_BYTES, size_increase);
             if(s->pb.buf_end - s->pb.buf - (put_bits_count(&s->pb)>>3) < MAX_MB_BYTES){
                 av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
                 return -1;
@@ -2758,7 +3016,7 @@ static int encode_thread(AVCodecContext *c, void *arg){
             if(s->data_partitioning){
                 if(   s->pb2   .buf_end - s->pb2   .buf - (put_bits_count(&s->    pb2)>>3) < MAX_MB_BYTES
                    || s->tex_pb.buf_end - s->tex_pb.buf - (put_bits_count(&s->tex_pb )>>3) < MAX_MB_BYTES){
-                    av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
+                    av_log(s->avctx, AV_LOG_ERROR, "encoded partitioned frame too large\n");
                     return -1;
                 }
             }
@@ -2796,6 +3054,9 @@ static int encode_thread(AVCodecContext *c, void *arg){
                 case AV_CODEC_ID_MPEG1VIDEO:
                     if(s->mb_skip_run) is_gob_start=0;
                     break;
+                case AV_CODEC_ID_MJPEG:
+                    if(s->mb_x==0 && s->mb_y!=0) is_gob_start=1;
+                    break;
                 }
 
                 if(is_gob_start){
@@ -2807,7 +3068,7 @@ static int encode_thread(AVCodecContext *c, void *arg){
                         }
                     }
 
-                    assert((put_bits_count(&s->pb)&7) == 0);
+                    av_assert2((put_bits_count(&s->pb)&7) == 0);
                     current_packet_size= put_bits_ptr(&s->pb) - s->ptr_lastgob;
 
                     if (s->error_rate && s->resync_mb_x + s->resync_mb_y > 0) {
@@ -3018,8 +3279,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
                         int16_t ac[6][16];
                         const int mvdir= (best_s.mv_dir&MV_DIR_BACKWARD) ? 1 : 0;
                         static const int dquant_tab[4]={-1,1,-2,2};
+                        int storecoefs = s->mb_intra && s->dc_val[0]; /* snapshot taken at block entry; the dc/ac save and restore branches below test this instead of re-reading s->mb_intra */
 
-                        assert(backup_s.dquant == 0);
+                        av_assert2(backup_s.dquant == 0);
 
                         //FIXME intra
                         s->mv_dir= best_s.mv_dir;
@@ -3037,7 +3299,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
                             if(qp < s->avctx->qmin || qp > s->avctx->qmax)
                                 continue;
                             backup_s.dquant= dquant;
-                            if(s->mb_intra && s->dc_val[0]){
+                            if(storecoefs){
                                 for(i=0; i<6; i++){
                                     dc[i]= s->dc_val[0][ s->block_index[i] ];
                                     memcpy(ac[i], s->ac_val[0][s->block_index[i]], sizeof(int16_t)*16);
@@ -3047,7 +3309,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
                             encode_mb_hq(s, &backup_s, &best_s, CANDIDATE_MB_TYPE_INTER /* wrong but unused */, pb, pb2, tex_pb,
                                          &dmin, &next_block, s->mv[mvdir][0][0], s->mv[mvdir][0][1]);
                             if(best_s.qscale != qp){
-                                if(s->mb_intra && s->dc_val[0]){
+                                if(storecoefs){
                                     for(i=0; i<6; i++){
                                         s->dc_val[0][ s->block_index[i] ]= dc[i];
                                         memcpy(s->ac_val[0][s->block_index[i]], ac[i], sizeof(int16_t)*16);
@@ -3395,7 +3657,7 @@ static int estimate_qp(MpegEncContext *s, int dry_run){
 
 /* must be called before writing the header */
 static void set_frame_distances(MpegEncContext * s){
-    assert(s->current_picture_ptr->f->pts != AV_NOPTS_VALUE);
+    av_assert1(s->current_picture_ptr->f->pts != AV_NOPTS_VALUE);
     s->time = s->current_picture_ptr->f->pts * s->avctx->time_base.num;
 
     if(s->pict_type==AV_PICTURE_TYPE_B){
@@ -3451,6 +3713,13 @@ static int encode_picture(MpegEncContext *s, int picture_number)
         update_qscale(s);
     }
 
+    if(s->codec_id != AV_CODEC_ID_AMV && s->codec_id != AV_CODEC_ID_MJPEG){ /* every codec except AMV and MJPEG reuses the intra quant tables for chroma */
+        if(s->q_chroma_intra_matrix   != s->q_intra_matrix  ) av_freep(&s->q_chroma_intra_matrix); /* free only when chroma is not already aliasing q_intra_matrix */
+        if(s->q_chroma_intra_matrix16 != s->q_intra_matrix16) av_freep(&s->q_chroma_intra_matrix16); /* same rule for the 16-bit table */
+        s->q_chroma_intra_matrix   = s->q_intra_matrix; /* alias: chroma now points at the same table */
+        s->q_chroma_intra_matrix16 = s->q_intra_matrix16;
+    }
+
     s->mb_intra=0; //for the rate distortion & bit compare functions
     for(i=1; i<context_count; i++){
         ret = ff_update_duplicate_context(s->thread_context[i], s);
@@ -3495,7 +3764,9 @@ static int encode_picture(MpegEncContext *s, int picture_number)
         s->pict_type= AV_PICTURE_TYPE_I;
         for(i=0; i<s->mb_stride*s->mb_height; i++)
             s->mb_type[i]= CANDIDATE_MB_TYPE_INTRA;
-        ff_dlog(s, "Scene change detected, encoding as I Frame %d %d\n",
+        if(s->msmpeg4_version >= 3)
+            s->no_rounding=1;
+        ff_dlog(s, "Scene change detected, encoding as I Frame %"PRId64" %"PRId64"\n",
                 s->current_picture.mb_var_sum, s->current_picture.mc_mb_var_sum);
     }
 
@@ -3562,17 +3833,50 @@ static int encode_picture(MpegEncContext *s, int picture_number)
         s->qscale= 3; //reduce clipping problems
 
     if (s->out_format == FMT_MJPEG) {
+        const uint16_t *  luma_matrix = ff_mpeg1_default_intra_matrix; /* default when avctx supplies no matrix */
+        const uint16_t *chroma_matrix = ff_mpeg1_default_intra_matrix; /* same default for chroma */
+
+        if (s->avctx->intra_matrix) { /* a matrix set on avctx replaces the default for both planes */
+            chroma_matrix =
+            luma_matrix = s->avctx->intra_matrix;
+        }
+        if (s->avctx->chroma_intra_matrix) /* a dedicated chroma matrix, if set, overrides chroma only */
+            chroma_matrix = s->avctx->chroma_intra_matrix;
+
         /* for mjpeg, we do include qscale in the matrix */
         for(i=1;i<64;i++){
             int j = s->idsp.idct_permutation[i];
 
-            s->intra_matrix[j] = av_clip_uint8((ff_mpeg1_default_intra_matrix[i] * s->qscale) >> 3);
+            s->chroma_intra_matrix[j] = av_clip_uint8((chroma_matrix[i] * s->qscale) >> 3); /* scale by qscale/8, then clip via av_clip_uint8 */
+            s->       intra_matrix[j] = av_clip_uint8((  luma_matrix[i] * s->qscale) >> 3); /* same scaling for the luma matrix */
         }
         s->y_dc_scale_table=
         s->c_dc_scale_table= ff_mpeg2_dc_scale_table[s->intra_dc_precision];
+        s->chroma_intra_matrix[0] =
         s->intra_matrix[0] = ff_mpeg2_dc_scale_table[s->intra_dc_precision][8];
         ff_convert_matrix(s, s->q_intra_matrix, s->q_intra_matrix16,
                        s->intra_matrix, s->intra_quant_bias, 8, 8, 1);
+        ff_convert_matrix(s, s->q_chroma_intra_matrix, s->q_chroma_intra_matrix16,
+                       s->chroma_intra_matrix, s->intra_quant_bias, 8, 8, 1);
+        s->qscale= 8;
+    }
+    if(s->codec_id == AV_CODEC_ID_AMV){
+        static const uint8_t y[32]={13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13};
+        static const uint8_t c[32]={14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14};
+        for(i=1;i<64;i++){ /* entries 1..63 only; entry 0 is assigned after the loop */
+            int j= s->idsp.idct_permutation[ff_zigzag_direct[i]]; /* i is mapped through ff_zigzag_direct, then the IDCT permutation */
+
+            s->intra_matrix[j] = sp5x_quant_table[5*2+0][i]; /* row 10 of sp5x_quant_table */
+            s->chroma_intra_matrix[j] = sp5x_quant_table[5*2+1][i]; /* row 11 of sp5x_quant_table */
+        }
+        s->y_dc_scale_table= y; /* y[] is 13 in every slot */
+        s->c_dc_scale_table= c; /* c[] is 14 in every slot */
+        s->intra_matrix[0] = 13; /* same value as the y[] table entries */
+        s->chroma_intra_matrix[0] = 14; /* same value as the c[] table entries */
+        ff_convert_matrix(s, s->q_intra_matrix, s->q_intra_matrix16,
+                       s->intra_matrix, s->intra_quant_bias, 8, 8, 1);
+        ff_convert_matrix(s, s->q_chroma_intra_matrix, s->q_chroma_intra_matrix16,
+                       s->chroma_intra_matrix, s->intra_quant_bias, 8, 8, 1);
         s->qscale= 8;
     }
 
@@ -3585,12 +3889,13 @@ static int encode_picture(MpegEncContext *s, int picture_number)
     if (s->current_picture.f->key_frame)
         s->picture_in_gop_number=0;
 
+    s->mb_x = s->mb_y = 0; /* reset the macroblock position before the picture header switch below */
     s->last_bits= put_bits_count(&s->pb);
     switch(s->out_format) {
     case FMT_MJPEG:
         if (CONFIG_MJPEG_ENCODER)
             ff_mjpeg_encode_picture_header(s->avctx, &s->pb, &s->intra_scantable,
-                                           s->pred, s->intra_matrix);
+                                           s->pred, s->intra_matrix, s->chroma_intra_matrix);
         break;
     case FMT_H261:
         if (CONFIG_H261_ENCODER)
@@ -3601,9 +3906,11 @@ static int encode_picture(MpegEncContext *s, int picture_number)
             ff_wmv2_encode_picture_header(s, picture_number);
         else if (CONFIG_MSMPEG4_ENCODER && s->msmpeg4_version)
             ff_msmpeg4_encode_picture_header(s, picture_number);
-        else if (CONFIG_MPEG4_ENCODER && s->h263_pred)
-            ff_mpeg4_encode_picture_header(s, picture_number);
-        else if (CONFIG_RV10_ENCODER && s->codec_id == AV_CODEC_ID_RV10) {
+        else if (CONFIG_MPEG4_ENCODER && s->h263_pred) {
+            ret = ff_mpeg4_encode_picture_header(s, picture_number); /* return value is now checked */
+            if (ret < 0)
+                return ret; /* propagate the failure, same handling as the RV10 branch */
+        } else if (CONFIG_RV10_ENCODER && s->codec_id == AV_CODEC_ID_RV10) {
             ret = ff_rv10_encode_picture_header(s, picture_number);
             if (ret < 0)
                 return ret;
@@ -3620,7 +3927,7 @@ static int encode_picture(MpegEncContext *s, int picture_number)
             ff_mpeg1_encode_picture_header(s, picture_number);
         break;
     default:
-        assert(0);
+        av_assert0(0);
     }
     bits= put_bits_count(&s->pb);
     s->header_bits= bits - s->last_bits;
@@ -3630,6 +3937,8 @@ static int encode_picture(MpegEncContext *s, int picture_number)
     }
     s->avctx->execute(s->avctx, encode_thread, &s->thread_context[0], NULL, context_count, sizeof(void*));
     for(i=1; i<context_count; i++){
+        if (s->pb.buf_end == s->thread_context[i]->pb.buf) /* this context's buffer begins exactly where s->pb ends */
+            set_put_bits_buffer_size(&s->pb, FFMIN(s->thread_context[i]->pb.buf_end - s->pb.buf, INT_MAX/8-32)); /* extend s->pb up to that context's end, capped at INT_MAX/8-32 */
         merge_context_after_encode(s, s->thread_context[i]);
     }
     emms_c();
@@ -3664,6 +3973,7 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
                                   int16_t *block, int n,
                                   int qscale, int *overflow){
     const int *qmat;
+    const uint16_t *matrix;
     const uint8_t *scantable= s->intra_scantable.scantable;
     const uint8_t *perm_scantable= s->intra_scantable.permutated;
     int max=0;
@@ -3685,6 +3995,7 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
     uint8_t * length;
     uint8_t * last_length;
     const int lambda= s->lambda2 >> (FF_LAMBDA_SHIFT - 6);
+    int mpeg2_qscale;
 
     s->fdsp.fdct(block);
 
@@ -3693,6 +4004,9 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
     qmul= qscale*16;
     qadd= ((qscale-1)|1)*8;
 
+    if (s->q_scale_type) mpeg2_qscale = ff_mpeg2_non_linear_qscale[qscale]; /* q_scale_type set: look the value up in the table */
+    else                 mpeg2_qscale = qscale << 1; /* otherwise: twice qscale */
+
     if (s->mb_intra) {
         int q;
         if (!s->h263_aic) {
@@ -3711,15 +4025,23 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
         block[0] = (block[0] + (q >> 1)) / q;
         start_i = 1;
         last_non_zero = 0;
-        qmat = s->q_intra_matrix[qscale];
-        if(s->mpeg_quant || s->out_format == FMT_MPEG1)
+        qmat = n < 4 ? s->q_intra_matrix[qscale] : s->q_chroma_intra_matrix[qscale]; /* block index below 4 uses the intra table, 4 and up the chroma table */
+        matrix = n < 4 ? s->intra_matrix : s->chroma_intra_matrix; /* same split for the matrix used when unquantising below */
+        if(s->mpeg_quant || s->out_format == FMT_MPEG1 || s->out_format == FMT_MJPEG) /* MJPEG now takes this bias too */
             bias= 1<<(QMAT_SHIFT-1);
-        length     = s->intra_ac_vlc_length;
-        last_length= s->intra_ac_vlc_last_length;
+
+        if (n > 3 && s->intra_chroma_ac_vlc_length) { /* chroma-specific length tables are used only when the pointer is set */
+            length     = s->intra_chroma_ac_vlc_length;
+            last_length= s->intra_chroma_ac_vlc_last_length;
+        } else { /* n <= 3, or no chroma table: use the common intra tables */
+            length     = s->intra_ac_vlc_length;
+            last_length= s->intra_ac_vlc_last_length;
+        }
     } else {
         start_i = 0;
         last_non_zero = -1;
         qmat = s->q_inter_matrix[qscale];
+        matrix = s->inter_matrix;
         length     = s->inter_ac_vlc_length;
         last_length= s->inter_ac_vlc_last_length;
     }
@@ -3757,7 +4079,7 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
 //                coeff[2][k]= -level+2;
             }
             coeff_count[i]= FFMIN(level, 2);
-            assert(coeff_count[i]);
+            av_assert2(coeff_count[i]);
             max |=level;
         }else{
             coeff[0][i]= (level>>31)|1;
@@ -3791,17 +4113,20 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
             const int alevel= FFABS(level);
             int unquant_coeff;
 
-            assert(level);
+            av_assert2(level);
 
-            if(s->out_format == FMT_H263){
+            if(s->out_format == FMT_H263 || s->out_format == FMT_H261){
                 unquant_coeff= alevel*qmul + qadd;
-            } else { // MPEG-1
+            } else if(s->out_format == FMT_MJPEG) {
+                j = s->idsp.idct_permutation[scantable[i]];
+                unquant_coeff = alevel * matrix[j] * 8; /* no qscale factor here; for MJPEG encode_picture builds the matrix with qscale already applied */
+            }else{ // MPEG-1 (also reached by any other remaining out_format)
                 j = s->idsp.idct_permutation[scantable[i]]; // FIXME: optimize
                 if(s->mb_intra){
-                        unquant_coeff = (int)(  alevel  * qscale * s->intra_matrix[j]) >> 3;
+                        unquant_coeff = (int)(  alevel  * mpeg2_qscale * matrix[j]) >> 4;
                         unquant_coeff =   (unquant_coeff - 1) | 1;
                 }else{
-                        unquant_coeff = (((  alevel  << 1) + 1) * qscale * ((int) s->inter_matrix[j])) >> 4;
+                        unquant_coeff = (((  alevel  << 1) + 1) * mpeg2_qscale * ((int) matrix[j])) >> 5;
                         unquant_coeff =   (unquant_coeff - 1) | 1;
                 }
                 unquant_coeff<<= 3;
@@ -3822,7 +4147,7 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
                     }
                 }
 
-                if(s->out_format == FMT_H263){
+                if(s->out_format == FMT_H263 || s->out_format == FMT_H261){
                     for(j=survivor_count-1; j>=0; j--){
                         int run= i - survivor[j];
                         int score= distortion + last_length[UNI_AC_ENC_INDEX(run, level)]*lambda;
@@ -3848,7 +4173,7 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
                     }
                 }
 
-                if(s->out_format == FMT_H263){
+                if(s->out_format == FMT_H263 || s->out_format == FMT_H261){
                   for(j=survivor_count-1; j>=0; j--){
                         int run= i - survivor[j];
                         int score= distortion + score_tab[i-run];
@@ -3881,7 +4206,7 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
         survivor[ survivor_count++ ]= i+1;
     }
 
-    if(s->out_format != FMT_H263){
+    if(s->out_format != FMT_H263 && s->out_format != FMT_H261){
         last_score= 256*256*256*120;
         for(i= survivor[0]; i<=last_non_zero + 1; i++){
             int score= score_tab[i];
@@ -3915,10 +4240,10 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
             int alevel= FFABS(level);
             int unquant_coeff, score, distortion;
 
-            if(s->out_format == FMT_H263){
+            if(s->out_format == FMT_H263 || s->out_format == FMT_H261){
                     unquant_coeff= (alevel*qmul + qadd)>>3;
-            } else { // MPEG-1
-                    unquant_coeff = (((  alevel  << 1) + 1) * qscale * ((int) s->inter_matrix[0])) >> 4;
+            } else{ // MPEG-1
+                    unquant_coeff = (((  alevel  << 1) + 1) * mpeg2_qscale * ((int) matrix[0])) >> 5;
                     unquant_coeff =   (unquant_coeff - 1) | 1;
             }
             unquant_coeff = (unquant_coeff + 4) >> 3;
@@ -3941,7 +4266,7 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
     }
 
     i= last_i;
-    assert(last_level);
+    av_assert2(last_level);
 
     block[ perm_scantable[last_non_zero] ]= last_level;
     i -= last_run + 1;
@@ -4025,8 +4350,13 @@ static int messed_sign=0;
         start_i = 1;
 //        if(s->mpeg_quant || s->out_format == FMT_MPEG1)
 //            bias= 1<<(QMAT_SHIFT-1);
-        length     = s->intra_ac_vlc_length;
-        last_length= s->intra_ac_vlc_last_length;
+        if (n > 3 && s->intra_chroma_ac_vlc_length) { /* chroma-specific length tables are used only when the pointer is set */
+            length     = s->intra_chroma_ac_vlc_length;
+            last_length= s->intra_chroma_ac_vlc_last_length;
+        } else { /* n <= 3, or no chroma table: use the common intra tables */
+            length     = s->intra_ac_vlc_length;
+            last_length= s->intra_ac_vlc_last_length;
+        }
     } else {
         dc= 0;
         start_i = 0;
@@ -4057,8 +4387,8 @@ STOP_TIMER("memset rem[]")}
         weight[i] = w;
 //        w=weight[i] = (63*qns + (w/2)) / w;
 
-        assert(w>0);
-        assert(w<(1<<6));
+        av_assert2(w>0);
+        av_assert2(w<(1<<6));
         sum += w*w;
     }
     lambda= sum*(uint64_t)s->lambda2 >> (FF_LAMBDA_SHIFT - 6 + 6 + 6 + 6);
@@ -4124,7 +4454,7 @@ STOP_TIMER("dct")}
             const int level= block[0];
             int change, old_coeff;
 
-            assert(s->mb_intra);
+            av_assert2(s->mb_intra);
 
             old_coeff= q*level;
 
@@ -4168,7 +4498,7 @@ STOP_TIMER("dct")}
             }else{
                 old_coeff=0;
                 run2--;
-                assert(run2>=0 || i >= last_non_zero );
+                av_assert2(run2>=0 || i >= last_non_zero );
             }
 
             for(change=-1; change<=1; change+=2){
@@ -4196,7 +4526,7 @@ STOP_TIMER("dct")}
                                          - last_length[UNI_AC_ENC_INDEX(run, level+64)];
                         }
                     }else{
-                        assert(FFABS(new_level)==1);
+                        av_assert2(FFABS(new_level)==1);
 
                         if(analyze_gradient){
                             int g= d1[ scantable[i] ];
@@ -4229,7 +4559,7 @@ STOP_TIMER("dct")}
                     }
                 }else{
                     new_coeff=0;
-                    assert(FFABS(level)==1);
+                    av_assert2(FFABS(level)==1);
 
                     if(i < last_non_zero){
                         int next_i= i + run2 + 1;
@@ -4258,7 +4588,7 @@ STOP_TIMER("dct")}
                 score *= lambda;
 
                 unquant_change= new_coeff - old_coeff;
-                assert((score < 100*lambda && score > -100*lambda) || lambda==0);
+                av_assert2((score < 100*lambda && score > -100*lambda) || lambda==0);
 
                 score += s->mpvencdsp.try_8x8basis(rem, weight, basis[j],
                                                    unquant_change);
@@ -4290,7 +4620,7 @@ STOP_TIMER("iterative step")}
 
             if(best_coeff > last_non_zero){
                 last_non_zero= best_coeff;
-                assert(block[j]);
+                av_assert2(block[j]);
 #ifdef REFINE_STATS
 after_last++;
 #endif
@@ -4318,7 +4648,7 @@ if(block[j]){
 #ifdef REFINE_STATS
 count++;
 if(256*256*256*64 % count == 0){
-    printf("after_last:%d to_zero:%d from_zero:%d raise:%d lower:%d sign:%d xyp:%d/%d/%d\n", after_last, to_zero, from_zero, raise, lower, messed_sign, s->mb_x, s->mb_y, s->picture_number);
+    av_log(s->avctx, AV_LOG_DEBUG, "after_last:%d to_zero:%d from_zero:%d raise:%d lower:%d sign:%d xyp:%d/%d/%d\n", after_last, to_zero, from_zero, raise, lower, messed_sign, s->mb_x, s->mb_y, s->picture_number);
 }
 #endif
             run=0;
@@ -4361,8 +4691,8 @@ STOP_TIMER("iterative search")
  *                  permutation up, the block is not (inverse) permutated
  *                  to scantable order!
  */
-static void block_permute(int16_t *block, uint8_t *permutation,
-                          const uint8_t *scantable, int last)
+void ff_block_permute(int16_t *block, uint8_t *permutation,
+                      const uint8_t *scantable, int last)
 {
     int i;
     int16_t temp[64];
@@ -4417,13 +4747,13 @@ int ff_dct_quantize_c(MpegEncContext *s,
         block[0] = (block[0] + (q >> 1)) / q;
         start_i = 1;
         last_non_zero = 0;
-        qmat = s->q_intra_matrix[qscale];
-        bias= s->intra_quant_bias<<(QMAT_SHIFT - QUANT_BIAS_SHIFT);
+        qmat = n < 4 ? s->q_intra_matrix[qscale] : s->q_chroma_intra_matrix[qscale];
+        bias= s->intra_quant_bias*(1<<(QMAT_SHIFT - QUANT_BIAS_SHIFT));
     } else {
         start_i = 0;
         last_non_zero = -1;
         qmat = s->q_inter_matrix[qscale];
-        bias= s->inter_quant_bias<<(QMAT_SHIFT - QUANT_BIAS_SHIFT);
+        bias= s->inter_quant_bias*(1<<(QMAT_SHIFT - QUANT_BIAS_SHIFT));
     }
     threshold1= (1<<QMAT_SHIFT) - bias - 1;
     threshold2= (threshold1<<1);
@@ -4461,7 +4791,7 @@ int ff_dct_quantize_c(MpegEncContext *s,
 
     /* we need this permutation so that we correct the IDCT, we only permute the !=0 elements */
     if (s->idsp.perm_type != FF_IDCT_PERM_NONE)
-        block_permute(block, s->idsp.idct_permutation,
+        ff_block_permute(block, s->idsp.idct_permutation,
                       scantable, last_non_zero);
 
     return last_non_zero;
@@ -4470,8 +4800,7 @@ int ff_dct_quantize_c(MpegEncContext *s,
 #define OFFSET(x) offsetof(MpegEncContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption h263_options[] = {
-    { "obmc",         "use overlapped block motion compensation.", OFFSET(obmc), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
-    { "structured_slices","Write slice start position at every GOB header instead of just GOB number.", OFFSET(h263_slice_structured), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE},
+    { "obmc",         "use overlapped block motion compensation.", OFFSET(obmc), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
     { "mb_info",      "emit macroblock info for RFC 2190 packetization, the parameter value is the maximum payload size", OFFSET(mb_info), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },
     FF_MPV_COMMON_OPTS
     { NULL },
@@ -4498,10 +4827,10 @@ AVCodec ff_h263_encoder = {
 };
 
 static const AVOption h263p_options[] = {
-    { "umv",        "Use unlimited motion vectors.",    OFFSET(umvplus), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
-    { "aiv",        "Use alternative inter VLC.",       OFFSET(alt_inter_vlc), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
-    { "obmc",       "use overlapped block motion compensation.", OFFSET(obmc), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
-    { "structured_slices", "Write slice start position at every GOB header instead of just GOB number.", OFFSET(h263_slice_structured), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE},
+    { "umv",        "Use unlimited motion vectors.",    OFFSET(umvplus),       AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
+    { "aiv",        "Use alternative inter VLC.",       OFFSET(alt_inter_vlc), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
+    { "obmc",       "use overlapped block motion compensation.", OFFSET(obmc), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
+    { "structured_slices", "Write slice start position at every GOB header instead of just GOB number.", OFFSET(h263_slice_structured), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE},
     FF_MPV_COMMON_OPTS
     { NULL },
 };
diff --git a/libavcodec/mpegvideo_motion.c b/libavcodec/mpegvideo_motion.c
index 9589714..c29810f 100644
--- a/libavcodec/mpegvideo_motion.c
+++ b/libavcodec/mpegvideo_motion.c
@@ -4,25 +4,26 @@
  *
  * 4MV & hq & B-frame encoding stuff by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <string.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/internal.h"
 #include "avcodec.h"
 #include "h261.h"
@@ -178,7 +179,7 @@ static void gmc_motion(MpegEncContext *s,
                 s->sprite_delta[0][0], s->sprite_delta[0][1],
                 s->sprite_delta[1][0], s->sprite_delta[1][1],
                 a + 1, (1 << (2 * a + 1)) - s->no_rounding,
-                s->h_edge_pos >> 1, s->v_edge_pos >> 1);
+                (s->h_edge_pos + 1) >> 1, (s->v_edge_pos + 1) >> 1);
 
     ptr = ref_picture[2];
     s->mdsp.gmc(dest_cr, ptr, uvlinesize, 8,
@@ -186,7 +187,7 @@ static void gmc_motion(MpegEncContext *s,
                 s->sprite_delta[0][0], s->sprite_delta[0][1],
                 s->sprite_delta[1][0], s->sprite_delta[1][1],
                 a + 1, (1 << (2 * a + 1)) - s->no_rounding,
-                s->h_edge_pos >> 1, s->v_edge_pos >> 1);
+                (s->h_edge_pos + 1) >> 1, (s->v_edge_pos + 1) >> 1);
 }
 
 static inline int hpel_motion(MpegEncContext *s,
@@ -210,55 +211,21 @@ static inline int hpel_motion(MpegEncContext *s,
         dxy |= (motion_y & 1) << 1;
     src += src_y * s->linesize + src_x;
 
-    if (s->unrestricted_mv) {
-        if ((unsigned)src_x > FFMAX(s->h_edge_pos - (motion_x & 1) - 8, 0) ||
-            (unsigned)src_y > FFMAX(s->v_edge_pos - (motion_y & 1) - 8, 0)) {
+        if ((unsigned)src_x >= FFMAX(s->h_edge_pos - (motion_x & 1) - 7, 0) || /* unsigned compare: a negative src_x wraps to a large value and also triggers emulation */
+            (unsigned)src_y >= FFMAX(s->v_edge_pos - (motion_y & 1) - 7, 0)) { /* when FFMAX clamps the bound to 0 the test is always true */
             s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, src,
                                      s->linesize, s->linesize,
                                      9, 9,
-                                     src_x, src_y, s->h_edge_pos,
-                                     s->v_edge_pos);
+                                     src_x, src_y,
+                                     s->h_edge_pos, s->v_edge_pos);
             src = s->sc.edge_emu_buffer;
             emu = 1;
         }
-    }
     pix_op[dxy](dest, src, s->linesize, 8);
     return emu;
 }
 
 static av_always_inline
-void emulated_edge_mc(MpegEncContext *s,
-                      int src_x, int src_y,
-                      int uvsrc_x, int uvsrc_y,
-                      int field_based,
-                      uint8_t **ptr_y,
-                      uint8_t **ptr_cb,
-                      uint8_t **ptr_cr)
-{
-    s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, *ptr_y,
-                             s->linesize, s->linesize,
-                             17, 17 + field_based,
-                             src_x, src_y * (1 << field_based),
-                             s->h_edge_pos, s->v_edge_pos);
-    *ptr_y = s->sc.edge_emu_buffer;
-    if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
-        uint8_t *uvbuf = s->sc.edge_emu_buffer + 18 * s->linesize;
-        s->vdsp.emulated_edge_mc(uvbuf, *ptr_cb,
-                                 s->uvlinesize, s->uvlinesize,
-                                 9, 9 + field_based,
-                                 uvsrc_x, uvsrc_y * (1 << field_based),
-                                 s->h_edge_pos >> 1, s->v_edge_pos >> 1);
-        s->vdsp.emulated_edge_mc(uvbuf + 16, *ptr_cr,
-                                 s->uvlinesize, s->uvlinesize,
-                                 9, 9 + field_based,
-                                 uvsrc_x, uvsrc_y * (1 << field_based),
-                                 s->h_edge_pos >> 1, s->v_edge_pos >> 1);
-        *ptr_cb = uvbuf;
-        *ptr_cr = uvbuf + 16;
-    }
-}
-
-static av_always_inline
 void mpeg_motion_internal(MpegEncContext *s,
                           uint8_t *dest_y,
                           uint8_t *dest_cb,
@@ -340,8 +307,8 @@ void mpeg_motion_internal(MpegEncContext *s,
     ptr_cb = ref_picture[1] + uvsrc_y * uvlinesize + uvsrc_x;
     ptr_cr = ref_picture[2] + uvsrc_y * uvlinesize + uvsrc_x;
 
-    if ((unsigned)src_x > FFMAX(s->h_edge_pos - (motion_x & 1) - 16, 0) ||
-        (unsigned)src_y > FFMAX(v_edge_pos - (motion_y & 1) - h, 0)) {
+    if ((unsigned)src_x >= FFMAX(s->h_edge_pos - (motion_x & 1) - 15   , 0) ||
+        (unsigned)src_y >= FFMAX(   v_edge_pos - (motion_y & 1) - h + 1, 0)) {
         if (is_mpeg12 ||
             s->codec_id == AV_CODEC_ID_MPEG2VIDEO ||
             s->codec_id == AV_CODEC_ID_MPEG1VIDEO) {
@@ -350,8 +317,30 @@ void mpeg_motion_internal(MpegEncContext *s,
                    src_y);
             return;
         }
-        emulated_edge_mc(s, src_x, src_y, uvsrc_x, uvsrc_y, field_based,
-                         &ptr_y, &ptr_cb, &ptr_cr);
+        src_y = (unsigned)src_y << field_based; /* cast first so a negative src_y is not left-shifted as a signed int */
+        s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, ptr_y,
+                                 s->linesize, s->linesize,
+                                 17, 17 + field_based,
+                                 src_x, src_y,
+                                 s->h_edge_pos, s->v_edge_pos);
+        ptr_y = s->sc.edge_emu_buffer; /* luma is read from the scratch copy from here on */
+        if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) { /* chroma is skipped only when gray decoding is compiled in and requested */
+            uint8_t *ubuf = s->sc.edge_emu_buffer + 18 * s->linesize; /* Cb scratch starts 18 luma lines into the buffer */
+            uint8_t *vbuf = ubuf + 9 * s->uvlinesize; /* Cr scratch starts 9 chroma lines after Cb. NOTE(review): if the 6th argument below is the copy height, a 9 + field_based row Cb copy overlaps this by one row when field_based is set -- confirm */
+            uvsrc_y = (unsigned)uvsrc_y << field_based; /* same unsigned shift as for src_y */
+            s->vdsp.emulated_edge_mc(ubuf, ptr_cb,
+                                     s->uvlinesize, s->uvlinesize,
+                                     9, 9 + field_based,
+                                     uvsrc_x, uvsrc_y,
+                                     s->h_edge_pos >> 1, s->v_edge_pos >> 1);
+            s->vdsp.emulated_edge_mc(vbuf, ptr_cr,
+                                     s->uvlinesize, s->uvlinesize,
+                                     9, 9 + field_based,
+                                     uvsrc_x, uvsrc_y,
+                                     s->h_edge_pos >> 1, s->v_edge_pos >> 1);
+            ptr_cb = ubuf; /* chroma is read from the scratch copies from here on */
+            ptr_cr = vbuf;
+        }
     }
 
     /* FIXME use this for field pix too instead of the obnoxious hack which
@@ -408,7 +397,7 @@ static void mpeg_motion_field(MpegEncContext *s, uint8_t *dest_y,
                               int motion_x, int motion_y, int h, int mb_y)
 {
 #if !CONFIG_SMALL
-    if(s->out_format == FMT_MPEG1)
+    if (s->out_format == FMT_MPEG1)
         mpeg_motion_internal(s, dest_y, dest_cb, dest_cr, 1,
                              bottom_field, field_select, ref_picture, pix_op,
                              motion_x, motion_y, h, 1, mb_y);
@@ -483,7 +472,7 @@ static inline void obmc_motion(MpegEncContext *s,
     int i;
     uint8_t *ptr[5];
 
-    assert(s->quarter_sample == 0);
+    av_assert2(s->quarter_sample == 0);
 
     for (i = 0; i < 5; i++) {
         if (i && mv[i][0] == mv[MID][0] && mv[i][1] == mv[MID][1]) {
@@ -550,10 +539,30 @@ static inline void qpel_motion(MpegEncContext *s,
     ptr_cb = ref_picture[1] + uvsrc_y * uvlinesize + uvsrc_x;
     ptr_cr = ref_picture[2] + uvsrc_y * uvlinesize + uvsrc_x;
 
-    if ((unsigned)src_x > FFMAX(s->h_edge_pos - (motion_x & 3) - 16, 0) ||
-        (unsigned)src_y > FFMAX(v_edge_pos - (motion_y & 3) - h, 0)) {
-        emulated_edge_mc(s, src_x, src_y, uvsrc_x, uvsrc_y, field_based,
-                         &ptr_y, &ptr_cb, &ptr_cr);
+    if ((unsigned)src_x >= FFMAX(s->h_edge_pos - (motion_x & 3) - 15   , 0) ||
+        (unsigned)src_y >= FFMAX(   v_edge_pos - (motion_y & 3) - h + 1, 0)) {
+        s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, ptr_y,
+                                 s->linesize, s->linesize,
+                                 17, 17 + field_based,
+                                 src_x, src_y * (1 << field_based),
+                                 s->h_edge_pos, s->v_edge_pos);
+        ptr_y = s->sc.edge_emu_buffer; /* luma is read from the scratch copy from here on */
+        if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) { /* chroma is skipped only when gray decoding is compiled in and requested */
+            uint8_t *ubuf = s->sc.edge_emu_buffer + 18 * s->linesize; /* Cb scratch starts 18 luma lines into the buffer */
+            uint8_t *vbuf = ubuf + 9 * s->uvlinesize; /* Cr scratch starts 9 chroma lines after Cb. NOTE(review): if the 6th argument below is the copy height, a 9 + field_based row Cb copy overlaps this by one row when field_based is set -- confirm */
+            s->vdsp.emulated_edge_mc(ubuf, ptr_cb,
+                                     s->uvlinesize, s->uvlinesize,
+                                     9, 9 + field_based,
+                                     uvsrc_x, uvsrc_y * (1 << field_based),
+                                     s->h_edge_pos >> 1, s->v_edge_pos >> 1);
+            s->vdsp.emulated_edge_mc(vbuf, ptr_cr,
+                                     s->uvlinesize, s->uvlinesize,
+                                     9, 9 + field_based,
+                                     uvsrc_x, uvsrc_y * (1 << field_based),
+                                     s->h_edge_pos >> 1, s->v_edge_pos >> 1);
+            ptr_cb = ubuf; /* chroma is read from the scratch copies from here on */
+            ptr_cr = vbuf;
+        }
     }
 
     if (!field_based)
@@ -614,8 +623,8 @@ static void chroma_4mv_motion(MpegEncContext *s,
 
     offset = src_y * s->uvlinesize + src_x;
     ptr    = ref_picture[1] + offset;
-    if ((unsigned)src_x > FFMAX((s->h_edge_pos >> 1) - (dxy & 1) - 8, 0) ||
-        (unsigned)src_y > FFMAX((s->v_edge_pos >> 1) - (dxy >> 1) - 8, 0)) {
+    if ((unsigned)src_x >= FFMAX((s->h_edge_pos >> 1) - (dxy  & 1) - 7, 0) ||
+        (unsigned)src_y >= FFMAX((s->v_edge_pos >> 1) - (dxy >> 1) - 7, 0)) {
         s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, ptr,
                                  s->uvlinesize, s->uvlinesize,
                                  9, 9, src_x, src_y,
@@ -666,7 +675,7 @@ static inline void apply_obmc(MpegEncContext *s,
     const int mot_xy     = mb_x * 2 + mb_y * 2 * mot_stride;
     int mx, my, i;
 
-    assert(!s->mb_skipped);
+    av_assert2(!s->mb_skipped);
 
     AV_COPY32(mv_cache[1][1], cur_frame->motion_val[0][mot_xy]);
     AV_COPY32(mv_cache[1][2], cur_frame->motion_val[0][mot_xy + 1]);
@@ -772,8 +781,8 @@ static inline void apply_8x8(MpegEncContext *s,
                 dxy &= ~12;
 
             ptr = ref_picture[0] + (src_y * s->linesize) + (src_x);
-            if ((unsigned)src_x > FFMAX(s->h_edge_pos - (motion_x & 3) - 8, 0) ||
-                (unsigned)src_y > FFMAX(s->v_edge_pos - (motion_y & 3) - 8, 0)) {
+            if ((unsigned)src_x >= FFMAX(s->h_edge_pos - (motion_x & 3) - 7, 0) ||
+                (unsigned)src_y >= FFMAX(s->v_edge_pos - (motion_y & 3) - 7, 0)) {
                 s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, ptr,
                                          s->linesize, s->linesize,
                                          9, 9,
@@ -893,8 +902,8 @@ static av_always_inline void mpv_motion_internal(MpegEncContext *s,
                                   s->mv[dir][1][0], s->mv[dir][1][1], 8, mb_y);
             }
         } else {
-            if (s->picture_structure != s->field_select[dir][0] + 1 &&
-                s->pict_type != AV_PICTURE_TYPE_B && !s->first_field) {
+            if (   (s->picture_structure != s->field_select[dir][0] + 1 && s->pict_type != AV_PICTURE_TYPE_B && !s->first_field)
+                || !ref_picture[0]) { /* also taken when ref_picture[0] is NULL */
                 ref_picture = s->current_picture_ptr->f->data;
             }
 
@@ -908,8 +917,8 @@ static av_always_inline void mpv_motion_internal(MpegEncContext *s,
         for (i = 0; i < 2; i++) {
             uint8_t **ref2picture;
 
-            if (s->picture_structure == s->field_select[dir][i] + 1
-                || s->pict_type == AV_PICTURE_TYPE_B || s->first_field) {
+            if ((s->picture_structure == s->field_select[dir][i] + 1
+                || s->pict_type == AV_PICTURE_TYPE_B || s->first_field) && ref_picture[0]) {
                 ref2picture = ref_picture;
             } else {
                 ref2picture = s->current_picture_ptr->f->data;
@@ -938,6 +947,9 @@ static av_always_inline void mpv_motion_internal(MpegEncContext *s,
                 pix_op = s->hdsp.avg_pixels_tab;
             }
         } else {
+            if (!ref_picture[0]) {
+                ref_picture = s->current_picture_ptr->f->data;
+            }
             for (i = 0; i < 2; i++) {
                 mpeg_motion(s, dest_y, dest_cb, dest_cr,
                             s->picture_structure != i + 1,
@@ -956,7 +968,7 @@ static av_always_inline void mpv_motion_internal(MpegEncContext *s,
             }
         }
         break;
-    default: assert(0);
+    default: av_assert2(0);
     }
 }
 
diff --git a/libavcodec/mpegvideo_parser.c b/libavcodec/mpegvideo_parser.c
index 3630375..206f268 100644
--- a/libavcodec/mpegvideo_parser.c
+++ b/libavcodec/mpegvideo_parser.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000,2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -44,6 +44,9 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s,
     int top_field_first, repeat_first_field, progressive_frame;
     int horiz_size_ext, vert_size_ext, bit_rate_ext;
     int did_set_size=0;
+    int set_dim_ret = 0;
+    int bit_rate = 0;
+    int vbv_delay = 0;
     int chroma_format;
     enum AVPixelFormat pix_fmt = AV_PIX_FMT_NONE;
 //FIXME replace the crap with get_bits()
@@ -57,6 +60,8 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s,
         case PICTURE_START_CODE:
             if (bytes_left >= 2) {
                 s->pict_type = (buf[1] >> 3) & 7;
+                if (bytes_left >= 4) /* the 16-bit value below spans buf[1]..buf[3] */
+                    vbv_delay = ((buf[1] & 0x07) << 13) | (buf[2] << 5) | (buf[3] >> 3);
             }
             break;
         case SEQ_START_CODE:
@@ -64,14 +69,15 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s,
                 pc->width  = (buf[0] << 4) | (buf[1] >> 4);
                 pc->height = ((buf[1] & 0x0f) << 8) | buf[2];
                 if(!avctx->width || !avctx->height || !avctx->coded_width || !avctx->coded_height){
-                    ff_set_dimensions(avctx, pc->width, pc->height);
+                    set_dim_ret = ff_set_dimensions(avctx, pc->width, pc->height);
                     did_set_size=1;
                 }
                 pix_fmt = AV_PIX_FMT_YUV420P;
                 frame_rate_index = buf[3] & 0xf;
                 pc->frame_rate = avctx->framerate = ff_mpeg12_frame_rate_tab[frame_rate_index];
-                avctx->bit_rate = ((buf[4]<<10) | (buf[5]<<2) | (buf[6]>>6))*400;
+                bit_rate = (buf[4]<<10) | (buf[5]<<2) | (buf[6]>>6);
                 avctx->codec_id = AV_CODEC_ID_MPEG1VIDEO;
+                avctx->ticks_per_frame = 1;
             }
             break;
         case EXT_START_CODE:
@@ -95,14 +101,15 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s,
                         case 3: pix_fmt = AV_PIX_FMT_YUV444P; break;
                         }
 
-                        pc->width  |=(horiz_size_ext << 12);
-                        pc->height |=( vert_size_ext << 12);
-                        avctx->bit_rate += (bit_rate_ext << 18) * 400;
+                        pc->width  = (pc->width & 0xFFF) | (horiz_size_ext << 12);
+                        pc->height = (pc->height& 0xFFF) | ( vert_size_ext << 12);
+                        bit_rate = (bit_rate&0x3FFFF) | (bit_rate_ext << 18);
                         if(did_set_size)
-                            ff_set_dimensions(avctx, pc->width, pc->height);
-                        avctx->framerate.num = pc->frame_rate.num * (frame_rate_ext_n + 1) * 2;
+                            set_dim_ret = ff_set_dimensions(avctx, pc->width, pc->height);
+                        avctx->framerate.num = pc->frame_rate.num * (frame_rate_ext_n + 1);
                         avctx->framerate.den = pc->frame_rate.den * (frame_rate_ext_d + 1);
                         avctx->codec_id = AV_CODEC_ID_MPEG2VIDEO;
+                        avctx->ticks_per_frame = 2;
                     }
                     break;
                 case 0x8: /* picture coding extension */
@@ -148,6 +155,16 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s,
         }
     }
  the_end: ;
+    if (set_dim_ret < 0)
+        av_log(avctx, AV_LOG_ERROR, "Failed to set dimensions\n");
+
+    if (avctx->codec_id == AV_CODEC_ID_MPEG2VIDEO && bit_rate) {
+        avctx->rc_max_rate = 400LL*bit_rate;
+    }
+    if (bit_rate &&
+        ((avctx->codec_id == AV_CODEC_ID_MPEG1VIDEO && bit_rate != 0x3FFFF) || vbv_delay != 0xFFFF)) {
+        avctx->bit_rate = 400LL*bit_rate;
+    }
 
     if (pix_fmt != AV_PIX_FMT_NONE) {
         s->format = pix_fmt;
@@ -157,7 +174,7 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s,
 
 #if FF_API_AVCTX_TIMEBASE
     if (avctx->framerate.num)
-        avctx->time_base = av_inv_q(avctx->framerate);
+        avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
 #endif
 }
 
@@ -187,7 +204,7 @@ static int mpegvideo_parse(AVCodecParserContext *s,
        function should be negligible for uncorrupted streams */
     mpegvideo_extract_headers(s, avctx, buf, buf_size);
     ff_dlog(NULL, "pict_type=%d frame_rate=%0.3f repeat_pict=%d\n",
-            s->pict_type, (double)avctx->time_base.den / avctx->time_base.num, s->repeat_pict);
+            s->pict_type, av_q2d(avctx->framerate), s->repeat_pict);
 
     *poutbuf = buf;
     *poutbuf_size = buf_size;
@@ -199,18 +216,28 @@ static int mpegvideo_split(AVCodecContext *avctx,
 {
     int i;
     uint32_t state= -1;
+    int found=0;
 
     for(i=0; i<buf_size; i++){
         state= (state<<8) | buf[i];
-        if(state != 0x1B3 && state != 0x1B5 && state < 0x200 && state >= 0x100)
+        if(state == 0x1B3){
+            found=1;
+        }else if(found && state != 0x1B5 && state < 0x200 && state >= 0x100)
             return i-3;
     }
     return 0;
 }
 
+static int mpegvideo_parse_init(AVCodecParserContext *s)
+{
+    s->pict_type = AV_PICTURE_TYPE_NONE; // start with no picture type: the first frame might be partial
+    return 0;
+}
+
 AVCodecParser ff_mpegvideo_parser = {
     .codec_ids      = { AV_CODEC_ID_MPEG1VIDEO, AV_CODEC_ID_MPEG2VIDEO },
     .priv_data_size = sizeof(struct MpvParseContext),
+    .parser_init    = mpegvideo_parse_init,
     .parser_parse   = mpegvideo_parse,
     .parser_close   = ff_parse_close,
     .split          = mpegvideo_split,
diff --git a/libavcodec/mpegvideo_xvmc.c b/libavcodec/mpegvideo_xvmc.c
index b7de79c..b469c4e 100644
--- a/libavcodec/mpegvideo_xvmc.c
+++ b/libavcodec/mpegvideo_xvmc.c
@@ -2,20 +2,20 @@
  * XVideo Motion Compensation
  * Copyright (c) 2003 Ivan Kalvachev
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,8 +33,6 @@
 #include "xvmc_internal.h"
 #include "version.h"
 
-#if FF_API_XVMC
-
 /**
  * Initialize the block field of the MpegEncContext pointer passed as
  * parameter after making sure that the data is not corrupted.
@@ -50,6 +48,15 @@ void ff_xvmc_init_block(MpegEncContext *s)
     s->block = (int16_t (*)[64])(render->data_blocks + render->next_free_data_block_num * 64);
 }
 
+static void exchange_uv(MpegEncContext *s)
+{
+    /* swap the block pointers stored at pblocks[4] and pblocks[5] */
+    int16_t (*const saved)[64] = s->pblocks[4];
+
+    s->pblocks[4] = s->pblocks[5];
+    s->pblocks[5] = saved;
+}
+
 /**
  * Fill individual block pointers, so there are no gaps in the data_block array
  * in case not all blocks in the macroblock are coded.
@@ -67,6 +74,9 @@ void ff_xvmc_pack_pblocks(MpegEncContext *s, int cbp)
             s->pblocks[i] = NULL;
         cbp += cbp;
     }
+    if (s->swap_uv) {
+        exchange_uv(s);
+    }
 }
 
 /**
@@ -74,8 +84,9 @@ void ff_xvmc_pack_pblocks(MpegEncContext *s, int cbp)
  * This function should be called for every new field and/or frame.
  * It should be safe to call the function a few times for the same field.
  */
-int ff_xvmc_field_start(MpegEncContext *s, AVCodecContext *avctx)
+static int ff_xvmc_field_start(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size)
 {
+    struct MpegEncContext *s = avctx->priv_data;
     struct xvmc_pix_fmt *last, *next, *render = (struct xvmc_pix_fmt*)s->current_picture.f->data[2];
     const int mb_block_count = 4 + (1 << s->chroma_format);
 
@@ -142,20 +153,22 @@ return -1;
  * some leftover blocks, for example from error_resilience(), may remain.
  * It should be safe to call the function a few times for the same field.
  */
-void ff_xvmc_field_end(MpegEncContext *s)
+static int ff_xvmc_field_end(AVCodecContext *avctx)
 {
+    struct MpegEncContext *s = avctx->priv_data;
     struct xvmc_pix_fmt *render = (struct xvmc_pix_fmt*)s->current_picture.f->data[2];
     assert(render);
 
     if (render->filled_mv_blocks_num > 0)
         ff_mpeg_draw_horiz_band(s, 0, 0);
+    return 0;
 }
 
 /**
  * Synthesize the data needed by XvMC to render one macroblock of data.
  * Fill all relevant fields, if necessary do IDCT.
  */
-void ff_xvmc_decode_mb(MpegEncContext *s)
+static void ff_xvmc_decode_mb(struct MpegEncContext *s)
 {
     XvMCMacroBlock *mv_block;
     struct xvmc_pix_fmt *render;
@@ -314,7 +327,7 @@ void ff_xvmc_decode_mb(MpegEncContext *s)
                  * slowdown. */
             }
             // copy blocks only if the codec doesn't support pblocks reordering
-            if (s->avctx->xvmc_acceleration == 1) {
+            if (!s->pack_pblocks) {
                 memcpy(&render->data_blocks[render->next_free_data_block_num*64],
                        s->pblocks[i], sizeof(*s->pblocks[i]));
             }
@@ -334,4 +347,30 @@ void ff_xvmc_decode_mb(MpegEncContext *s)
         ff_mpeg_draw_horiz_band(s, 0, 0);
 }
 
-#endif /* FF_API_XVMC */
+#if CONFIG_MPEG1_XVMC_HWACCEL
+AVHWAccel ff_mpeg1_xvmc_hwaccel = {         /* same callbacks as ff_mpeg2_xvmc_hwaccel, MPEG-1 codec id */
+    .name           = "mpeg1_xvmc",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MPEG1VIDEO,
+    .pix_fmt        = AV_PIX_FMT_XVMC,
+    .start_frame    = ff_xvmc_field_start,
+    .end_frame      = ff_xvmc_field_end,
+    .decode_slice   = NULL,                 /* no slice callback; decode_mb is set instead */
+    .decode_mb      = ff_xvmc_decode_mb,
+    .priv_data_size = 0,
+};
+#endif
+
+#if CONFIG_MPEG2_XVMC_HWACCEL
+AVHWAccel ff_mpeg2_xvmc_hwaccel = {         /* same callbacks as ff_mpeg1_xvmc_hwaccel, MPEG-2 codec id */
+    .name           = "mpeg2_xvmc",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MPEG2VIDEO,
+    .pix_fmt        = AV_PIX_FMT_XVMC,
+    .start_frame    = ff_xvmc_field_start,
+    .end_frame      = ff_xvmc_field_end,
+    .decode_slice   = NULL,                 /* no slice callback; decode_mb is set instead */
+    .decode_mb      = ff_xvmc_decode_mb,
+    .priv_data_size = 0,
+};
+#endif
diff --git a/libavcodec/mpegvideodata.c b/libavcodec/mpegvideodata.c
index f27dd90..5f1d8f7 100644
--- a/libavcodec/mpegvideodata.c
+++ b/libavcodec/mpegvideodata.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,13 @@ const uint8_t ff_default_chroma_qscale_table[32] = {
     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
 };
 
+const uint8_t ff_mpeg2_non_linear_qscale[32] = {
+     0,  1,  2,  3,  4,  5,   6,   7,   /* entries  0..7:  step 1 */
+     8, 10, 12, 14, 16, 18,  20,  22,   /* entries  8..15: step 2 */
+    24, 28, 32, 36, 40, 44,  48,  52,   /* entries 16..23: step 4 */
+    56, 64, 72, 80, 88, 96, 104, 112,   /* entries 24..31: step 8 */
+};
+
 const uint8_t ff_mpeg1_dc_scale_table[128] = {
 //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
diff --git a/libavcodec/mpegvideodata.h b/libavcodec/mpegvideodata.h
index d3ace23..14f4806 100644
--- a/libavcodec/mpegvideodata.h
+++ b/libavcodec/mpegvideodata.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,8 @@ extern const uint8_t ff_alternate_vertical_scan[64];
 extern const uint8_t ff_mpeg1_dc_scale_table[128];
 extern const uint8_t * const ff_mpeg2_dc_scale_table[4];
 
+extern const uint8_t ff_mpeg2_non_linear_qscale[32];
+
 extern const uint8_t ff_default_chroma_qscale_table[32];
 
 #endif /* AVCODEC_MPEGVIDEODATA_H */
diff --git a/libavcodec/mpegvideodsp.c b/libavcodec/mpegvideodsp.c
index 915a844..a58e45a 100644
--- a/libavcodec/mpegvideodsp.c
+++ b/libavcodec/mpegvideodsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpegvideodsp.h b/libavcodec/mpegvideodsp.h
index b0f45db..293e254 100644
--- a/libavcodec/mpegvideodsp.h
+++ b/libavcodec/mpegvideodsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mpegvideoencdsp.c b/libavcodec/mpegvideoencdsp.c
index 2011107..a7c6102 100644
--- a/libavcodec/mpegvideoencdsp.c
+++ b/libavcodec/mpegvideoencdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -21,6 +21,7 @@
 #include <string.h>
 
 #include "config.h"
+#include "libavutil/avassert.h"
 #include "libavutil/attributes.h"
 #include "libavutil/imgutils.h"
 #include "avcodec.h"
@@ -39,7 +40,7 @@ static int try_8x8basis_c(int16_t rem[64], int16_t weight[64],
                           (BASIS_SHIFT - RECON_SHIFT));
         int w = weight[i];
         b >>= RECON_SHIFT;
-        assert(-512 < b && b < 512);
+        av_assert2(-512 < b && b < 512);
 
         sum += (w * b) * (w * b) >> 4;
     }
@@ -261,4 +262,6 @@ av_cold void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c,
         ff_mpegvideoencdsp_init_ppc(c, avctx);
     if (ARCH_X86)
         ff_mpegvideoencdsp_init_x86(c, avctx);
+    if (ARCH_MIPS)
+        ff_mpegvideoencdsp_init_mips(c, avctx);
 }
diff --git a/libavcodec/mpegvideoencdsp.h b/libavcodec/mpegvideoencdsp.h
index 91a292a..33f0282 100644
--- a/libavcodec/mpegvideoencdsp.h
+++ b/libavcodec/mpegvideoencdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -52,5 +52,7 @@ void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c,
                                  AVCodecContext *avctx);
 void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
                                  AVCodecContext *avctx);
+void ff_mpegvideoencdsp_init_mips(MpegvideoEncDSPContext *c,
+                                  AVCodecContext *avctx);
 
 #endif /* AVCODEC_MPEGVIDEOENCDSP_H */
diff --git a/libavcodec/mpl2dec.c b/libavcodec/mpl2dec.c
new file mode 100644
index 0000000..409e4b3
--- /dev/null
+++ b/libavcodec/mpl2dec.c
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * MPL2 subtitles decoder
+ *
+ * @see http://web.archive.org/web/20090328040233/http://napisy.ussbrowarek.org/mpl2-eng.html
+ */
+
+#include "avcodec.h"
+#include "ass.h"
+#include "libavutil/bprint.h"
+
+/* Append the ASS form of the MPL2 event text p to buf; always returns 0. */
+static int mpl2_event_to_ass(AVBPrint *buf, const char *p)
+{
+    p += (*p == ' ');   /* skip a single leading space, if present */
+
+    while (*p) {
+        int styled = 0;
+
+        /* a run of '/', '\' and '_' markers emits {\i1}, {\b1} and {\u1} */
+        for (;; p++, styled = 1) {
+            if      (*p == '/')  av_bprintf(buf, "{\\i1}");
+            else if (*p == '\\') av_bprintf(buf, "{\\b1}");
+            else if (*p == '_')  av_bprintf(buf, "{\\u1}");
+            else                 break;
+        }
+
+        /* copy text up to the next '|' or the end, dropping CR and LF */
+        for (; *p && *p != '|'; p++)
+            if (*p != '\r' && *p != '\n')
+                av_bprint_chars(buf, *p, 1);
+
+        /* '|' ends a line: emit {\r} if a style was opened, then \N */
+        if (*p != '|')
+            break;      /* only the terminating NUL can stop the copy otherwise */
+        if (styled)
+            av_bprintf(buf, "{\\r}");
+        av_bprintf(buf, "\\N");
+        p++;
+    }
+
+    return 0;
+}
+
+static int mpl2_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_sub_ptr, AVPacket *avpkt)
+{
+    FFASSDecoderContext *ctx = avctx->priv_data;
+    const char *text = avpkt->data;
+    AVSubtitle *subtitle = data;
+    AVBPrint ass;
+    int err = 0;
+
+    av_bprint_init(&ass, 0, AV_BPRINT_SIZE_UNLIMITED);
+    if (text && avpkt->size > 0 && *text && !mpl2_event_to_ass(&ass, text))
+        err = ff_ass_add_rect(subtitle, ass.str, ctx->readorder++, 0, NULL, NULL);
+    av_bprint_finalize(&ass, NULL);     /* NULL: no string is handed back to the caller */
+    if (err < 0)
+        return err;                     /* ff_ass_add_rect() failed */
+    *got_sub_ptr = subtitle->num_rects > 0;
+    return avpkt->size;                 /* report the whole packet as consumed */
+}
+
+AVCodec ff_mpl2_decoder = {
+    .name           = "mpl2",
+    .long_name      = NULL_IF_CONFIG_SMALL("MPL2 subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_MPL2,
+    .decode         = mpl2_decode_frame,
+    .init           = ff_ass_subtitle_header_default,
+    .flush          = ff_ass_decoder_flush,
+    .priv_data_size = sizeof(FFASSDecoderContext),   /* holds the readorder counter used by mpl2_decode_frame */
+};
diff --git a/libavcodec/mqc.c b/libavcodec/mqc.c
index 0144581..f2d1e3b 100644
--- a/libavcodec/mqc.c
+++ b/libavcodec/mqc.c
@@ -2,20 +2,20 @@
  * MQ-coder encoder and decoder common functions
  * Copyright (c) 2007 Kamil Nowosad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mqc.h b/libavcodec/mqc.h
index 6326192..8bf7223 100644
--- a/libavcodec/mqc.h
+++ b/libavcodec/mqc.h
@@ -2,20 +2,20 @@
  * MQ-coder: structures, common and decoder functions
  * Copyright (c) 2007 Kamil Nowosad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,16 +43,34 @@ typedef struct MqcState {
     unsigned int c;
     unsigned int ct;
     uint8_t cx_states[19];
+    int raw;
 } MqcState;
 
+/* encoder */
+
+/** initialize the encoder */
+void ff_mqc_initenc(MqcState *mqc, uint8_t *bp);
+
+/** code bit d with context cx */
+void ff_mqc_encode(MqcState *mqc, uint8_t *cxstate, int d);
+
+/** number of encoded bytes */
+int ff_mqc_length(MqcState *mqc);
+
+/** flush the encoder [returns number of bytes encoded] */
+int ff_mqc_flush(MqcState *mqc);
+int ff_mqc_flush_to(MqcState *mqc, uint8_t *dst, int *dst_len);
+
 /* decoder */
 
 /**
  * Initialize MQ-decoder.
  * @param mqc   MQ decoder state
  * @param bp    byte pointer
+ * @param raw   raw mode
+ * @param reset reset states
  */
-void ff_mqc_initdec(MqcState *mqc, uint8_t *bp);
+void ff_mqc_initdec(MqcState *mqc, uint8_t *bp, int raw, int reset);
 
 /**
  * MQ decoder.
diff --git a/libavcodec/mqcdec.c b/libavcodec/mqcdec.c
index 889763a..34aa519 100644
--- a/libavcodec/mqcdec.c
+++ b/libavcodec/mqcdec.c
@@ -2,20 +2,20 @@
  * MQ-coder decoder
  * Copyright (c) 2007 Kamil Nowosad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -68,9 +68,11 @@ static int exchange(MqcState *mqc, uint8_t *cxstate, int lps)
     return d;
 }
 
-void ff_mqc_initdec(MqcState *mqc, uint8_t *bp)
+void ff_mqc_initdec(MqcState *mqc, uint8_t *bp, int raw, int reset)
 {
-    ff_mqc_init_contexts(mqc);
+    mqc->raw = raw;
+    if (reset)
+        ff_mqc_init_contexts(mqc);
     mqc->bp = bp;
     mqc->c  = (*mqc->bp ^ 0xff) << 16;
     bytein(mqc);
@@ -78,8 +80,20 @@ void ff_mqc_initdec(MqcState *mqc, uint8_t *bp)
     mqc->a = 0x8000;
 }
 
+static int mqc_decode_bypass(MqcState *mqc) {
+    const int top = mqc->c & 0x40000000; /* the returned bit is the inverse of bit 30 of C */
+    if ((mqc->c & 0xff) == 0) {          /* low byte of C is zero: adjust C and call bytein() */
+        mqc->c -= 0x100;
+        bytein(mqc);
+    }
+    mqc->c *= 2;
+    return !top;
+}
+
 int ff_mqc_decode(MqcState *mqc, uint8_t *cxstate)
 {
+    if (mqc->raw)
+        return mqc_decode_bypass(mqc);
     mqc->a -= ff_mqc_qe[*cxstate];
     if ((mqc->c >> 16) < mqc->a) {
         if (mqc->a & 0x8000)
diff --git a/libavcodec/mqcenc.c b/libavcodec/mqcenc.c
new file mode 100644
index 0000000..7c9e1a0
--- /dev/null
+++ b/libavcodec/mqcenc.c
@@ -0,0 +1,139 @@
+/*
+ * MQ-coder encoder
+ * Copyright (c) 2007 Kamil Nowosad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * MQ-coder encoder
+ * @file
+ * @author Kamil Nowosad
+ */
+
+#include "libavutil/avassert.h"
+#include "mqc.h"
+
+/*
+ * Write the next output byte from the C register. A carry in bit 27 of C is
+ * first added to the last written byte (unless that byte is 0xff); if the last
+ * byte is then 0xff the new byte is taken with shift 20 and ct becomes 7,
+ * otherwise with shift 19 and ct becomes 8.
+ */
+static void byteout(MqcState *mqc)
+{
+    int after_ff;
+
+    if (*mqc->bp != 0xff && (mqc->c & 0x8000000)) {
+        (*mqc->bp)++;
+        mqc->c &= 0x7ffffff;
+    }
+    after_ff = *mqc->bp == 0xff;
+    mqc->bp++;
+    *mqc->bp = mqc->c >> (19 + after_ff);
+    mqc->c  &= after_ff ? 0xfffff : 0x7ffff;
+    mqc->ct  = 8 - after_ff;
+}
+
+/* Double A and C until bit 15 of A is set; call byteout() each time ct reaches 0. */
+static void renorme(MqcState *mqc)
+{
+    do {
+        mqc->a *= 2;
+        mqc->c *= 2;
+        if (--mqc->ct == 0) byteout(mqc);
+    } while ((mqc->a & 0x8000) == 0);
+}
+
+/* Set the low 16 bits of C; if C then reaches the previous C + A, subtract 0x8000. */
+static void setbits(MqcState *mqc)
+{
+    const int upper = mqc->c + mqc->a;
+    mqc->c |= 0xffff;
+    mqc->c -= mqc->c >= upper ? 0x8000 : 0;
+}
+
+void ff_mqc_initenc(MqcState *mqc, uint8_t *bp)
+{
+    ff_mqc_init_contexts(mqc);
+    mqc->bpstart = bp;
+    mqc->bp      = bp - 1; /* starts one byte before the output; bp[-1] is read below */
+    mqc->ct      = *mqc->bp == 0xff ? 13 : 12;
+    mqc->c       = 0;
+    mqc->a       = 0x8000;
+}
+
+/*
+ * Encode decision d with the context state *cxstate.
+ * A is reduced by the table value qe for that state; depending on whether d
+ * equals the low bit of the state, either only C is advanced or the interval
+ * is updated, the state moves on via ff_mqc_nmps/ff_mqc_nlps and renorme() runs.
+ */
+void ff_mqc_encode(MqcState *mqc, uint8_t *cxstate, int d)
+{
+    const int qe      = ff_mqc_qe[*cxstate];
+    const int matches = (*cxstate & 1) == d;
+
+    mqc->a -= qe;
+    if (matches && (mqc->a & 0x8000)) {
+        /* bit 15 of A still set: no state change, no renormalization */
+        mqc->c += qe;
+        return;
+    }
+    /* A becomes qe when (A < qe) agrees with the match flag, otherwise C += qe */
+    if ((mqc->a < qe) == matches)
+        mqc->a = qe;
+    else
+        mqc->c += qe;
+    *cxstate = matches ? ff_mqc_nmps[*cxstate] : ff_mqc_nlps[*cxstate];
+    renorme(mqc);
+}
+
+int ff_mqc_length(MqcState *mqc)
+{
+    return mqc->bp - mqc->bpstart; /* offset of the current byte pointer from bpstart */
+}
+
+int ff_mqc_flush(MqcState *mqc)
+{
+    int pass;
+    setbits(mqc);
+    for (pass = 0; pass < 2; pass++) { /* two shift + byteout() rounds */
+        mqc->c <<= mqc->ct;
+        byteout(mqc);
+    }
+    mqc->bp += *mqc->bp != 0xff;   /* step past the last byte unless it is 0xff */
+    return mqc->bp - mqc->bpstart; /* resulting offset relative to bpstart */
+}
+
+/* Flush a copy of *mqc into dst; *dst_len gets the byte count stored in dst (see return). */
+int ff_mqc_flush_to(MqcState *mqc, uint8_t *dst, int *dst_len)
+{
+    MqcState copy = *mqc;
+
+    copy.bp = copy.bpstart = dst;
+    dst[0]  = *mqc->bp;
+    ff_mqc_flush(&copy);
+    *dst_len = copy.bp - dst;
+    if (mqc->bp >= mqc->bpstart)
+        return mqc->bp - mqc->bpstart + *dst_len;
+    /* bp is still before bpstart: dst[0] stands for that leading byte, shift it out */
+    av_assert1(mqc->bpstart - mqc->bp == 1);
+    av_assert1(*dst_len > 0);
+    av_assert1(mqc->bp[0] == 0 && dst[0] == 0);
+    memmove(dst, dst + 1, --*dst_len);
+    return mqc->bp - mqc->bpstart + 1 + *dst_len;
+}
diff --git a/libavcodec/msgsmdec.c b/libavcodec/msgsmdec.c
index be5062a..4c4ddb4 100644
--- a/libavcodec/msgsmdec.c
+++ b/libavcodec/msgsmdec.c
@@ -2,20 +2,20 @@
  * gsm 06.10 decoder, Microsoft variant
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/msgsmdec.h b/libavcodec/msgsmdec.h
index adbda9a..b2a1a62 100644
--- a/libavcodec/msgsmdec.h
+++ b/libavcodec/msgsmdec.h
@@ -2,20 +2,20 @@
  * gsm 06.10 decoder, Microsoft variant
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/msmpeg4.c b/libavcodec/msmpeg4.c
index 3ab3bd5..920f50f 100644
--- a/libavcodec/msmpeg4.c
+++ b/libavcodec/msmpeg4.c
@@ -5,20 +5,20 @@
  *
  * msmpeg4v1 & v2 stuff by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,6 +37,7 @@
 #include "msmpeg4data.h"
 #include "mpegvideodata.h"
 #include "vc1data.h"
+#include "libavutil/imgutils.h"
 
 /*
  * You can also call this codec: MPEG-4 with a twist!
@@ -52,6 +53,9 @@ static av_cold void init_h263_dc_for_msmpeg4(void)
 {
         int level, uni_code, uni_len;
 
+        if(ff_v2_dc_chroma_table[255 + 256][1])
+            return;
+
         for(level=-256; level<256; level++){
             int size, v, l;
             /* find number of bits */
@@ -104,8 +108,6 @@ static av_cold void init_h263_dc_for_msmpeg4(void)
 
 av_cold void ff_msmpeg4_common_init(MpegEncContext *s)
 {
-    static int initialized=0;
-
     switch(s->msmpeg4_version){
     case 1:
     case 2:
@@ -144,11 +146,7 @@ av_cold void ff_msmpeg4_common_init(MpegEncContext *s)
     }
     //Note the default tables are set in common_init in mpegvideo.c
 
-    if(!initialized){
-        initialized=1;
-
-        init_h263_dc_for_msmpeg4();
-    }
+    init_h263_dc_for_msmpeg4();
 }
 
 /* predict coded block */
@@ -178,13 +176,13 @@ int ff_msmpeg4_coded_block_pred(MpegEncContext * s, int n, uint8_t **coded_block
     return pred;
 }
 
-static int get_dc(uint8_t *src, int stride, int scale)
+static int get_dc(uint8_t *src, int stride, int scale, int block_size)
 {
     int y;
     int sum=0;
-    for(y=0; y<8; y++){
+    for(y=0; y<block_size; y++){
         int x;
-        for(x=0; x<8; x++){
+        for(x=0; x<block_size; x++){
             sum+=src[x + y*stride];
         }
     }
@@ -230,13 +228,13 @@ int ff_msmpeg4_pred_dc(MpegEncContext *s, int n,
         "addl %%eax, %2         \n\t"
         "addl %%eax, %1         \n\t"
         "addl %0, %%eax         \n\t"
-        "mull %4                \n\t"
+        "imull %4               \n\t"
         "movl %%edx, %0         \n\t"
         "movl %1, %%eax         \n\t"
-        "mull %4                \n\t"
+        "imull %4               \n\t"
         "movl %%edx, %1         \n\t"
         "movl %2, %%eax         \n\t"
-        "mull %4                \n\t"
+        "imull %4               \n\t"
         "movl %%edx, %2         \n\t"
         : "+b" (a), "+c" (b), "+D" (c)
         : "g" (scale), "S" (ff_inverse[scale])
@@ -276,17 +274,18 @@ int ff_msmpeg4_pred_dc(MpegEncContext *s, int n,
                     *dir_ptr = 0;
                 }
             }else{
+                int bs = 8 >> s->avctx->lowres;
                 if(n<4){
                     wrap= s->linesize;
-                    dest= s->current_picture.f->data[0] + (((n >> 1) + 2*s->mb_y) * 8*  wrap ) + ((n & 1) + 2*s->mb_x) * 8;
+                    dest= s->current_picture.f->data[0] + (((n >> 1) + 2*s->mb_y) * bs*  wrap ) + ((n & 1) + 2*s->mb_x) * bs;
                 }else{
                     wrap= s->uvlinesize;
-                    dest= s->current_picture.f->data[n - 3] + (s->mb_y * 8 * wrap) + s->mb_x * 8;
+                    dest= s->current_picture.f->data[n - 3] + (s->mb_y * bs * wrap) + s->mb_x * bs;
                 }
                 if(s->mb_x==0) a= (1024 + (scale>>1))/scale;
-                else           a= get_dc(dest-8, wrap, scale*8);
+                else           a= get_dc(dest-bs, wrap, scale*8>>(2*s->avctx->lowres), bs);
                 if(s->mb_y==0) c= (1024 + (scale>>1))/scale;
-                else           c= get_dc(dest-8*wrap, wrap, scale*8);
+                else           c= get_dc(dest-bs*wrap, wrap, scale*8>>(2*s->avctx->lowres), bs);
 
                 if (s->h263_aic_dir==0) {
                     pred= a;
diff --git a/libavcodec/msmpeg4.h b/libavcodec/msmpeg4.h
index e57ae66..bcdb967 100644
--- a/libavcodec/msmpeg4.h
+++ b/libavcodec/msmpeg4.h
@@ -2,20 +2,20 @@
  * MSMPEG4 backend for encoder and decoder
  * copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -69,10 +69,12 @@ int ff_msmpeg4_pred_dc(MpegEncContext *s, int n,
 #define CONFIG_MSMPEG4_DECODER (CONFIG_MSMPEG4V1_DECODER || \
                                 CONFIG_MSMPEG4V2_DECODER || \
                                 CONFIG_MSMPEG4V3_DECODER || \
+                                CONFIG_WMV1_DECODER      || \
                                 CONFIG_WMV2_DECODER      || \
                                 CONFIG_VC1_DECODER)
 #define CONFIG_MSMPEG4_ENCODER (CONFIG_MSMPEG4V2_ENCODER || \
                                 CONFIG_MSMPEG4V3_ENCODER || \
+                                CONFIG_WMV1_ENCODER      || \
                                 CONFIG_WMV2_ENCODER)
 
 #endif /* AVCODEC_MSMPEG4_H */
diff --git a/libavcodec/msmpeg4data.c b/libavcodec/msmpeg4data.c
index 6bc0520..b9c1d8e 100644
--- a/libavcodec/msmpeg4data.c
+++ b/libavcodec/msmpeg4data.c
@@ -5,20 +5,20 @@
  *
  * msmpeg4v1 & v2 stuff by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/msmpeg4data.h b/libavcodec/msmpeg4data.h
index 9d57d41..52fecec 100644
--- a/libavcodec/msmpeg4data.h
+++ b/libavcodec/msmpeg4data.h
@@ -5,20 +5,20 @@
  *
  * msmpeg4v1 & v2 stuff by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/msmpeg4dec.c b/libavcodec/msmpeg4dec.c
index 1c55558..35aa15c 100644
--- a/libavcodec/msmpeg4dec.c
+++ b/libavcodec/msmpeg4dec.c
@@ -1,24 +1,24 @@
 /*
  * MSMPEG4 backend for encoder and decoder
  * Copyright (c) 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2002-2013 Michael Niedermayer <michaelni@gmx.at>
  *
  * msmpeg4v1 & v2 stuff by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
 #include "mpegutils.h"
 #include "mpegvideo.h"
 #include "msmpeg4.h"
+#include "libavutil/imgutils.h"
 #include "libavutil/x86/asm.h"
 #include "h263.h"
 #include "mpeg4video.h"
@@ -104,6 +105,7 @@ static int msmpeg4v2_decode_motion(MpegEncContext * s, int pred, int f_code)
 static int msmpeg4v12_decode_mb(MpegEncContext *s, int16_t block[6][64])
 {
     int cbp, code, i;
+    uint32_t * const mb_type_ptr = &s->current_picture.mb_type[s->mb_x + s->mb_y*s->mb_stride];
 
     if (s->pict_type == AV_PICTURE_TYPE_P) {
         if (s->use_skip_mb_code) {
@@ -117,6 +119,7 @@ static int msmpeg4v12_decode_mb(MpegEncContext *s, int16_t block[6][64])
                 s->mv[0][0][0] = 0;
                 s->mv[0][0][1] = 0;
                 s->mb_skipped = 1;
+                *mb_type_ptr = MB_TYPE_SKIP | MB_TYPE_L0 | MB_TYPE_16x16;
                 return 0;
             }
         }
@@ -165,6 +168,7 @@ static int msmpeg4v12_decode_mb(MpegEncContext *s, int16_t block[6][64])
         s->mv_type = MV_TYPE_16X16;
         s->mv[0][0][0] = mx;
         s->mv[0][0][1] = my;
+        *mb_type_ptr = MB_TYPE_L0 | MB_TYPE_16x16;
     } else {
         if(s->msmpeg4_version==2){
             s->ac_pred = get_bits1(&s->gb);
@@ -174,6 +178,7 @@ static int msmpeg4v12_decode_mb(MpegEncContext *s, int16_t block[6][64])
             cbp|= get_vlc2(&s->gb, ff_h263_cbpy_vlc.table, CBPY_VLC_BITS, 1)<<2; //FIXME check errors
             if(s->pict_type==AV_PICTURE_TYPE_P) cbp^=0x3C;
         }
+        *mb_type_ptr = MB_TYPE_INTRA;
     }
 
     s->bdsp.clear_blocks(s->block[0]);
@@ -283,18 +288,19 @@ static int msmpeg4v34_decode_mb(MpegEncContext *s, int16_t block[6][64])
 av_cold int ff_msmpeg4_decode_init(AVCodecContext *avctx)
 {
     MpegEncContext *s = avctx->priv_data;
-    static int done = 0;
-    int i;
+    static volatile int done = 0;
+    int i, ret;
     MVTable *mv;
 
+    if ((ret = av_image_check_size(avctx->width, avctx->height, 0, avctx)) < 0)
+        return ret;
+
     if (ff_h263_decode_init(avctx) < 0)
         return -1;
 
     ff_msmpeg4_common_init(s);
 
     if (!done) {
-        done = 1;
-
         for(i=0;i<NB_RL_TABLES;i++) {
             ff_rl_init(&ff_rl_table[i], ff_static_rl_table_store[i]);
         }
@@ -364,6 +370,7 @@ av_cold int ff_msmpeg4_decode_init(AVCodecContext *avctx)
         INIT_VLC_STATIC(&ff_inter_intra_vlc, INTER_INTRA_VLC_BITS, 4,
                  &ff_table_inter_intra[0][1], 2, 1,
                  &ff_table_inter_intra[0][0], 2, 1, 8);
+        done = 1;
     }
 
     switch(s->msmpeg4_version){
@@ -534,7 +541,7 @@ int ff_msmpeg4_decode_picture_header(MpegEncContext * s)
             s->no_rounding = 0;
         }
     }
-    ff_dlog(s->avctx, "%d %d %d %d %d\n", s->pict_type, s->bit_rate,
+    ff_dlog(s->avctx, "%d %"PRId64" %d %d %d\n", s->pict_type, s->bit_rate,
             s->inter_intra_pred, s->width, s->height);
 
     s->esc3_level_length= 0;
@@ -581,8 +588,11 @@ static int msmpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr)
         } else {
             level = get_vlc2(&s->gb, v2_dc_chroma_vlc.table, DC_VLC_BITS, 3);
         }
-        if (level < 0)
+        if (level < 0) {
+            av_log(s->avctx, AV_LOG_ERROR, "illegal dc vlc\n");
+            *dir_ptr = 0;
             return -1;
+        }
         level-=256;
     }else{  //FIXME optimize use unified tables & index
         if (n < 4) {
@@ -592,6 +602,7 @@ static int msmpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr)
         }
         if (level < 0){
             av_log(s->avctx, AV_LOG_ERROR, "illegal dc vlc\n");
+            *dir_ptr = 0;
             return -1;
         }
 
@@ -648,7 +659,6 @@ int ff_msmpeg4_decode_block(MpegEncContext * s, int16_t * block,
         if (level < 0){
             av_log(s->avctx, AV_LOG_ERROR, "dc overflow- block: %d qscale: %d//\n", n, s->qscale);
             if(s->inter_intra_pred) level=0;
-            else                    return -1;
         }
         if (n < 4) {
             rl = &ff_rl_table[s->rl_table_index];
@@ -835,9 +845,10 @@ int ff_msmpeg4_decode_block(MpegEncContext * s, int16_t * block,
             if(i&(~63)){
                 const int left= get_bits_left(&s->gb);
                 if (((i + 192 == 64 && level / qmul == -1) ||
-                     !(s->avctx->err_recognition & AV_EF_BITSTREAM)) &&
+                     !(s->avctx->err_recognition & (AV_EF_BITSTREAM|AV_EF_COMPLIANT))) &&
                     left >= 0) {
                     av_log(s->avctx, AV_LOG_ERROR, "ignoring overflow at %d %d\n", s->mb_x, s->mb_y);
+                    i = 63;
                     break;
                 }else{
                     av_log(s->avctx, AV_LOG_ERROR, "ac-tex damaged at %d %d\n", s->mb_x, s->mb_y);
@@ -914,6 +925,7 @@ AVCodec ff_msmpeg4v1_decoder = {
     .close          = ff_h263_decode_end,
     .decode         = ff_h263_decode_frame,
     .capabilities   = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_YUV420P,
         AV_PIX_FMT_NONE
@@ -930,6 +942,7 @@ AVCodec ff_msmpeg4v2_decoder = {
     .close          = ff_h263_decode_end,
     .decode         = ff_h263_decode_frame,
     .capabilities   = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_YUV420P,
         AV_PIX_FMT_NONE
@@ -946,6 +959,7 @@ AVCodec ff_msmpeg4v3_decoder = {
     .close          = ff_h263_decode_end,
     .decode         = ff_h263_decode_frame,
     .capabilities   = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_YUV420P,
         AV_PIX_FMT_NONE
@@ -962,6 +976,7 @@ AVCodec ff_wmv1_decoder = {
     .close          = ff_h263_decode_end,
     .decode         = ff_h263_decode_frame,
     .capabilities   = AV_CODEC_CAP_DRAW_HORIZ_BAND | AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_YUV420P,
         AV_PIX_FMT_NONE
diff --git a/libavcodec/msmpeg4enc.c b/libavcodec/msmpeg4enc.c
index 21e38f9..07241e8 100644
--- a/libavcodec/msmpeg4enc.c
+++ b/libavcodec/msmpeg4enc.c
@@ -5,20 +5,20 @@
  *
  * msmpeg4v1 & v2 stuff by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,7 +34,6 @@
 #include "libavutil/avutil.h"
 #include "libavutil/mem.h"
 #include "mpegvideo.h"
-#include "msmpeg4.h"
 #include "h263.h"
 #include "internal.h"
 #include "mpeg4video.h"
@@ -160,8 +159,8 @@ av_cold int ff_msmpeg4_encode_init(MpegEncContext *s)
 static void find_best_tables(MpegEncContext * s)
 {
     int i;
-    int best       =-1, best_size       =9999999;
-    int chroma_best=-1, best_chroma_size=9999999;
+    int best        = 0, best_size        = INT_MAX;
+    int chroma_best = 0, best_chroma_size = INT_MAX;
 
     for(i=0; i<3; i++){
         int level;
@@ -241,7 +240,7 @@ void ff_msmpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
     s->per_mb_rl_table = 0;
     if(s->msmpeg4_version==4)
         s->inter_intra_pred= (s->width*s->height < 320*240 && s->bit_rate<=II_BITRATE && s->pict_type==AV_PICTURE_TYPE_P);
-    ff_dlog(s, "%d %d %d %d %d\n", s->pict_type, s->bit_rate,
+    ff_dlog(s, "%d %"PRId64" %d %d %d\n", s->pict_type, s->bit_rate,
             s->inter_intra_pred, s->width, s->height);
 
     if (s->pict_type == AV_PICTURE_TYPE_I) {
@@ -284,14 +283,15 @@ void ff_msmpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
 
 void ff_msmpeg4_encode_ext_header(MpegEncContext * s)
 {
-        put_bits(&s->pb, 5, s->avctx->time_base.den / s->avctx->time_base.num); //yes 29.97 -> 29
+        unsigned fps = s->avctx->time_base.den / s->avctx->time_base.num / FFMAX(s->avctx->ticks_per_frame, 1);
+        put_bits(&s->pb, 5, FFMIN(fps, 31)); // integer fps (yes 29.97 -> 29), clamped to 31 to fit the 5-bit field
 
         put_bits(&s->pb, 11, FFMIN(s->bit_rate/1024, 2047));
 
         if(s->msmpeg4_version>=3)
             put_bits(&s->pb, 1, s->flipflop_rounding);
         else
-            assert(s->flipflop_rounding==0);
+            av_assert0(s->flipflop_rounding==0); // no rounding bit is written below version 3; the flag must be unset
 }
 
 void ff_msmpeg4_encode_motion(MpegEncContext * s,
@@ -504,7 +504,7 @@ void ff_msmpeg4_encode_mb(MpegEncContext * s,
 static void msmpeg4_encode_dc(MpegEncContext * s, int level, int n, int *dir_ptr)
 {
     int sign, code;
-    int pred, extquant;
+    int pred, av_uninit(extquant);
     int extrabits = 0;
 
     int16_t *dc_val;
diff --git a/libavcodec/msrle.c b/libavcodec/msrle.c
index a7838ab..c2f6242 100644
--- a/libavcodec/msrle.c
+++ b/libavcodec/msrle.c
@@ -2,20 +2,20 @@
  * Microsoft RLE video decoder
  * Copyright (C) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,6 +35,7 @@
 #include "avcodec.h"
 #include "internal.h"
 #include "msrledec.h"
+#include "libavutil/imgutils.h"
 
 typedef struct MsrleContext {
     AVCodecContext *avctx;
@@ -50,10 +51,14 @@ typedef struct MsrleContext {
 static av_cold int msrle_decode_init(AVCodecContext *avctx)
 {
     MsrleContext *s = avctx->priv_data;
+    int i;
 
     s->avctx = avctx;
 
     switch (avctx->bits_per_coded_sample) {
+    case 1:
+        avctx->pix_fmt = AV_PIX_FMT_MONOWHITE;
+        break;
     case 4:
     case 8:
         avctx->pix_fmt = AV_PIX_FMT_PAL8;
@@ -70,6 +75,10 @@ static av_cold int msrle_decode_init(AVCodecContext *avctx)
     if (!s->frame)
         return AVERROR(ENOMEM);
 
+    if (avctx->extradata_size >= 4)
+        for (i = 0; i < FFMIN(avctx->extradata_size, AVPALETTE_SIZE)/4; i++)
+            s->pal[i] = 0xFFU<<24 | AV_RL32(avctx->extradata+4*i);
+
     return 0;
 }
 
@@ -86,30 +95,30 @@ static int msrle_decode_frame(AVCodecContext *avctx,
     s->buf = buf;
     s->size = buf_size;
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
-    if (avctx->bits_per_coded_sample <= 8) {
+    if (avctx->bits_per_coded_sample > 1 && avctx->bits_per_coded_sample <= 8) {
         const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, NULL);
 
         if (pal) {
             s->frame->palette_has_changed = 1;
             memcpy(s->pal, pal, AVPALETTE_SIZE);
         }
-
         /* make the palette available */
         memcpy(s->frame->data[1], s->pal, AVPALETTE_SIZE);
     }
 
     /* FIXME how to correctly detect RLE ??? */
     if (avctx->height * istride == avpkt->size) { /* assume uncompressed */
-        int linesize = avctx->width * avctx->bits_per_coded_sample / 8;
+        int linesize = av_image_get_linesize(avctx->pix_fmt, avctx->width, 0);
         uint8_t *ptr = s->frame->data[0];
         uint8_t *buf = avpkt->data + (avctx->height-1)*istride;
         int i, j;
 
+        if (linesize < 0)
+            return linesize;
+
         for (i = 0; i < avctx->height; i++) {
             if (avctx->bits_per_coded_sample == 4) {
                 for (j = 0; j < avctx->width - 1; j += 2) {
diff --git a/libavcodec/msrledec.c b/libavcodec/msrledec.c
index f45179f..805802a 100644
--- a/libavcodec/msrledec.c
+++ b/libavcodec/msrledec.c
@@ -2,20 +2,20 @@
  * Microsoft RLE decoder
  * Copyright (C) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,17 +36,15 @@ static int msrle_decode_pal4(AVCodecContext *avctx, AVFrame *pic,
     unsigned char rle_code;
     unsigned char extra_byte, odd_pixel;
     unsigned char stream_byte;
-    unsigned int pixel_ptr = 0;
-    int row_dec = pic->linesize[0];
-    int row_ptr = (avctx->height - 1) * row_dec;
-    int frame_size = FFABS(row_dec) * avctx->height;
+    int pixel_ptr = 0;
+    int line = avctx->height - 1;
     int i;
 
-    while (row_ptr >= 0) {
+    while (line >= 0 && pixel_ptr <= avctx->width) {
         if (bytestream2_get_bytes_left(gb) <= 0) {
             av_log(avctx, AV_LOG_ERROR,
-                   "MS RLE: bytestream overrun, %d rows left\n",
-                   row_ptr);
+                   "MS RLE: bytestream overrun, %dx%d left\n",
+                   avctx->width - pixel_ptr, line);
             return AVERROR_INVALIDDATA;
         }
         rle_code = stream_byte = bytestream2_get_byteu(gb);
@@ -55,7 +53,7 @@ static int msrle_decode_pal4(AVCodecContext *avctx, AVFrame *pic,
             stream_byte = bytestream2_get_byte(gb);
             if (stream_byte == 0) {
                 /* line is done, goto the next one */
-                row_ptr -= row_dec;
+                line--;
                 pixel_ptr = 0;
             } else if (stream_byte == 1) {
                 /* decode is done */
@@ -65,13 +63,13 @@ static int msrle_decode_pal4(AVCodecContext *avctx, AVFrame *pic,
                 stream_byte = bytestream2_get_byte(gb);
                 pixel_ptr += stream_byte;
                 stream_byte = bytestream2_get_byte(gb);
-                row_ptr -= stream_byte * row_dec;
+                avpriv_request_sample(avctx, "Unused stream byte %X", stream_byte);
             } else {
                 // copy pixels from encoded stream
                 odd_pixel =  stream_byte & 1;
                 rle_code = (stream_byte + 1) / 2;
                 extra_byte = rle_code & 0x01;
-                if (row_ptr + pixel_ptr + stream_byte > frame_size ||
+                if (pixel_ptr + 2*rle_code - odd_pixel > avctx->width ||
                     bytestream2_get_bytes_left(gb) < rle_code) {
                     av_log(avctx, AV_LOG_ERROR,
                            "MS RLE: frame/stream ptr just went out of bounds (copy)\n");
@@ -82,13 +80,13 @@ static int msrle_decode_pal4(AVCodecContext *avctx, AVFrame *pic,
                     if (pixel_ptr >= avctx->width)
                         break;
                     stream_byte = bytestream2_get_byteu(gb);
-                    pic->data[0][row_ptr + pixel_ptr] = stream_byte >> 4;
+                    pic->data[0][line * pic->linesize[0] + pixel_ptr] = stream_byte >> 4;
                     pixel_ptr++;
                     if (i + 1 == rle_code && odd_pixel)
                         break;
                     if (pixel_ptr >= avctx->width)
                         break;
-                    pic->data[0][row_ptr + pixel_ptr] = stream_byte & 0x0F;
+                    pic->data[0][line * pic->linesize[0] + pixel_ptr] = stream_byte & 0x0F;
                     pixel_ptr++;
                 }
 
@@ -98,9 +96,9 @@ static int msrle_decode_pal4(AVCodecContext *avctx, AVFrame *pic,
             }
         } else {
             // decode a run of data
-            if (row_ptr + pixel_ptr + stream_byte > frame_size) {
+            if (pixel_ptr + rle_code > avctx->width + 1) {
                 av_log(avctx, AV_LOG_ERROR,
-                       "MS RLE: frame ptr just went out of bounds (run)\n");
+                       "MS RLE: frame ptr just went out of bounds (run) %d %d %d\n", pixel_ptr, rle_code, avctx->width);
                 return AVERROR_INVALIDDATA;
             }
             stream_byte = bytestream2_get_byte(gb);
@@ -108,9 +106,9 @@ static int msrle_decode_pal4(AVCodecContext *avctx, AVFrame *pic,
                 if (pixel_ptr >= avctx->width)
                     break;
                 if ((i & 1) == 0)
-                    pic->data[0][row_ptr + pixel_ptr] = stream_byte >> 4;
+                    pic->data[0][line * pic->linesize[0] + pixel_ptr] = stream_byte >> 4;
                 else
-                    pic->data[0][row_ptr + pixel_ptr] = stream_byte & 0x0F;
+                    pic->data[0][line * pic->linesize[0] + pixel_ptr] = stream_byte & 0x0F;
                 pixel_ptr++;
             }
         }
@@ -138,7 +136,8 @@ static int msrle_decode_8_16_24_32(AVCodecContext *avctx, AVFrame *pic,
     unsigned int width= FFABS(pic->linesize[0]) / (depth >> 3);
 
     output     = pic->data[0] + (avctx->height - 1) * pic->linesize[0];
-    output_end = pic->data[0] +  avctx->height      * pic->linesize[0];
+    output_end = output + FFABS(pic->linesize[0]);
+
     while (bytestream2_get_bytes_left(gb) > 0) {
         p1 = bytestream2_get_byteu(gb);
         if(p1 == 0) { //Escape code
@@ -155,6 +154,7 @@ static int msrle_decode_8_16_24_32(AVCodecContext *avctx, AVFrame *pic,
                     }
                 }
                 output = pic->data[0] + line * pic->linesize[0];
+                output_end = output + FFABS(pic->linesize[0]);
                 pos = 0;
                 continue;
             } else if(p2 == 1) { //End-of-picture
@@ -169,11 +169,11 @@ static int msrle_decode_8_16_24_32(AVCodecContext *avctx, AVFrame *pic,
                     return -1;
                 }
                 output = pic->data[0] + line * pic->linesize[0] + pos * (depth >> 3);
+                output_end = pic->data[0] + line * pic->linesize[0] + FFABS(pic->linesize[0]);
                 continue;
             }
             // Copy data
-            if ((pic->linesize[0] > 0 && output + p2 * (depth >> 3) > output_end) ||
-                (pic->linesize[0] < 0 && output + p2 * (depth >> 3) < output_end)) {
+            if (output + p2 * (depth >> 3) > output_end) {
                 bytestream2_skip(gb, 2 * (depth >> 3));
                 continue;
             } else if (bytestream2_get_bytes_left(gb) < p2 * (depth >> 3)) {
@@ -182,9 +182,9 @@ static int msrle_decode_8_16_24_32(AVCodecContext *avctx, AVFrame *pic,
             }
 
             if ((depth == 8) || (depth == 24)) {
-                for(i = 0; i < p2 * (depth >> 3); i++) {
-                    *output++ = bytestream2_get_byteu(gb);
-                }
+                bytestream2_get_bufferu(gb, output, p2 * (depth >> 3));
+                output += p2 * (depth >> 3);
+
                 // RLE8 copy is actually padded - and runs are not!
                 if(depth == 8 && (p2 & 1)) {
                     bytestream2_skip(gb, 1);
@@ -203,36 +203,39 @@ static int msrle_decode_8_16_24_32(AVCodecContext *avctx, AVFrame *pic,
             pos += p2;
         } else { //run of pixels
             uint8_t pix[3]; //original pixel
-            switch(depth){
-            case  8: pix[0] = bytestream2_get_byte(gb);
-                     break;
-            case 16: pix16  = bytestream2_get_le16(gb);
-                     break;
-            case 24: pix[0] = bytestream2_get_byte(gb);
-                     pix[1] = bytestream2_get_byte(gb);
-                     pix[2] = bytestream2_get_byte(gb);
-                     break;
-            case 32: pix32  = bytestream2_get_le32(gb);
-                     break;
-            }
-            if ((pic->linesize[0] > 0 && output + p1 * (depth >> 3) > output_end) ||
-                (pic->linesize[0] < 0 && output + p1 * (depth >> 3) < output_end))
+            if (output + p1 * (depth >> 3) > output_end)
                 continue;
-            for(i = 0; i < p1; i++) {
-                switch(depth){
-                case  8: *output++ = pix[0];
-                         break;
-                case 16: *(uint16_t*)output = pix16;
-                         output += 2;
-                         break;
-                case 24: *output++ = pix[0];
-                         *output++ = pix[1];
-                         *output++ = pix[2];
-                         break;
-                case 32: *(uint32_t*)output = pix32;
-                         output += 4;
-                         break;
+
+            switch(depth){
+            case  8:
+                pix[0] = bytestream2_get_byte(gb);
+                memset(output, pix[0], p1);
+                output += p1;
+                break;
+            case 16:
+                pix16  = bytestream2_get_le16(gb);
+                for(i = 0; i < p1; i++) {
+                        *(uint16_t*)output = pix16;
+                        output += 2;
+                }
+                break;
+            case 24:
+                pix[0] = bytestream2_get_byte(gb);
+                pix[1] = bytestream2_get_byte(gb);
+                pix[2] = bytestream2_get_byte(gb);
+                for(i = 0; i < p1; i++) {
+                        *output++ = pix[0];
+                        *output++ = pix[1];
+                        *output++ = pix[2];
+                }
+                break;
+            case 32:
+                pix32  = bytestream2_get_le32(gb);
+                for(i = 0; i < p1; i++) {
+                        *(uint32_t*)output = pix32;
+                        output += 4;
                 }
+                break;
             }
             pos += p1;
         }
diff --git a/libavcodec/msrledec.h b/libavcodec/msrledec.h
index 0c5b8b1..7f7bbcf 100644
--- a/libavcodec/msrledec.h
+++ b/libavcodec/msrledec.h
@@ -2,20 +2,20 @@
  * Microsoft RLE decoder
  * Copyright (C) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mss1.c b/libavcodec/mss1.c
index a31af06..a579d9d 100644
--- a/libavcodec/mss1.c
+++ b/libavcodec/mss1.c
@@ -2,20 +2,20 @@
  * Microsoft Screen 1 (aka Windows Media Video V7 Screen) decoder
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -60,7 +60,7 @@ static void arith_normalise(ArithCoder *c)
     }
 }
 
-ARITH_GET_BIT()
+ARITH_GET_BIT(arith)
 
 static int arith_get_bits(ArithCoder *c, int bits)
 {
@@ -105,7 +105,7 @@ static int arith_get_prob(ArithCoder *c, int16_t *probs)
     return sym;
 }
 
-ARITH_GET_MODEL_SYM()
+ARITH_GET_MODEL_SYM(arith)
 
 static void arith_init(ArithCoder *c, GetBitContext *gb)
 {
@@ -130,7 +130,7 @@ static int decode_pal(MSS12Context *ctx, ArithCoder *acoder)
         r = arith_get_bits(acoder, 8);
         g = arith_get_bits(acoder, 8);
         b = arith_get_bits(acoder, 8);
-        *pal++ = (r << 16) | (g << 8) | b;
+        *pal++ = (0xFFU << 24) | (r << 16) | (g << 8) | b;
     }
 
     return !!ncol;
@@ -139,8 +139,6 @@ static int decode_pal(MSS12Context *ctx, ArithCoder *acoder)
 static int mss1_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                              AVPacket *avpkt)
 {
-    const uint8_t *buf = avpkt->data;
-    int buf_size = avpkt->size;
     MSS1Context *ctx = avctx->priv_data;
     MSS12Context *c = &ctx->ctx;
     GetBitContext gb;
@@ -148,13 +146,13 @@ static int mss1_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     int pal_changed = 0;
     int ret;
 
-    init_get_bits(&gb, buf, buf_size * 8);
+    if ((ret = init_get_bits8(&gb, avpkt->data, avpkt->size)) < 0)
+        return ret;
+
     arith_init(&acoder, &gb);
 
-    if ((ret = ff_reget_buffer(avctx, ctx->pic)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, ctx->pic)) < 0)
         return ret;
-    }
 
     c->pal_pic    =  ctx->pic->data[0] + ctx->pic->linesize[0] * (avctx->height - 1);
     c->pal_stride = -ctx->pic->linesize[0];
@@ -184,7 +182,7 @@ static int mss1_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     *got_frame      = 1;
 
     /* always report that the buffer was completely consumed */
-    return buf_size;
+    return avpkt->size;
 }
 
 static av_cold int mss1_decode_init(AVCodecContext *avctx)
@@ -199,6 +197,8 @@ static av_cold int mss1_decode_init(AVCodecContext *avctx)
         return AVERROR(ENOMEM);
 
     ret = ff_mss12_decode_init(&c->ctx, 0, &c->sc, NULL);
+    if (ret < 0)
+        av_frame_free(&c->pic);
 
     avctx->pix_fmt = AV_PIX_FMT_PAL8;
 
diff --git a/libavcodec/mss12.c b/libavcodec/mss12.c
index d4b621f..d42093b 100644
--- a/libavcodec/mss12.c
+++ b/libavcodec/mss12.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -450,7 +450,7 @@ static int decode_pivot(SliceContext *sc, ArithCoder *acoder, int base)
         val = acoder->get_number(acoder, (base + 1) / 2 - 2) + 3;
     }
 
-    if (val >= base)
+    if ((unsigned)val >= base)
         return -1;
 
     return inv ? base - val : val;
@@ -581,13 +581,18 @@ av_cold int ff_mss12_decode_init(MSS12Context *c, int version,
         return AVERROR_INVALIDDATA;
     }
 
-    avctx->coded_width  = AV_RB32(avctx->extradata + 20);
-    avctx->coded_height = AV_RB32(avctx->extradata + 24);
+    avctx->coded_width  = FFMAX(AV_RB32(avctx->extradata + 20), avctx->width);
+    avctx->coded_height = FFMAX(AV_RB32(avctx->extradata + 24), avctx->height);
     if (avctx->coded_width > 4096 || avctx->coded_height > 4096) {
         av_log(avctx, AV_LOG_ERROR, "Frame dimensions %dx%d too large",
                avctx->coded_width, avctx->coded_height);
         return AVERROR_INVALIDDATA;
     }
+    if (avctx->coded_width < 1 || avctx->coded_height < 1) {
+        av_log(avctx, AV_LOG_ERROR, "Frame dimensions %dx%d too small",
+               avctx->coded_width, avctx->coded_height);
+        return AVERROR_INVALIDDATA;
+    }
 
     av_log(avctx, AV_LOG_DEBUG, "Encoder version %"PRIu32".%"PRIu32"\n",
            AV_RB32(avctx->extradata + 4), AV_RB32(avctx->extradata + 8));
@@ -647,11 +652,11 @@ av_cold int ff_mss12_decode_init(MSS12Context *c, int version,
     }
 
     for (i = 0; i < 256; i++)
-        c->pal[i] = AV_RB24(avctx->extradata + 52 +
+        c->pal[i] = 0xFFU << 24 | AV_RB24(avctx->extradata + 52 +
                             (version ? 8 : 0) + i * 3);
 
     c->mask_stride = FFALIGN(avctx->width, 16);
-    c->mask        = av_malloc(c->mask_stride * avctx->height);
+    c->mask        = av_malloc_array(c->mask_stride, avctx->height);
     if (!c->mask) {
         av_log(avctx, AV_LOG_ERROR, "Cannot allocate mask plane\n");
         return AVERROR(ENOMEM);
diff --git a/libavcodec/mss12.h b/libavcodec/mss12.h
index 5b1fee8..f953167 100644
--- a/libavcodec/mss12.h
+++ b/libavcodec/mss12.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -99,8 +99,8 @@ int ff_mss12_decode_init(MSS12Context *c, int version,
                          SliceContext *sc1, SliceContext *sc2);
 int ff_mss12_decode_end(MSS12Context *ctx);
 
-#define ARITH_GET_BIT(VERSION)                                          \
-static int arith ## VERSION ## _get_bit(ArithCoder *c)                  \
+#define ARITH_GET_BIT(prefix)                                           \
+static int prefix ## _get_bit(ArithCoder *c)                            \
 {                                                                       \
     int range = c->high - c->low + 1;                                   \
     int bit   = 2 * c->value - c->low >= c->high;                       \
@@ -110,22 +110,22 @@ static int arith ## VERSION ## _get_bit(ArithCoder *c)                  \
     else                                                                \
         c->high = c->low + (range >> 1) - 1;                            \
                                                                         \
-    arith ## VERSION ## _normalise(c);                                  \
+    prefix ## _normalise(c);                                            \
                                                                         \
     return bit;                                                         \
 }
 
-#define ARITH_GET_MODEL_SYM(VERSION)                                    \
-static int arith ## VERSION ## _get_model_sym(ArithCoder *c, Model *m)  \
+#define ARITH_GET_MODEL_SYM(prefix)                                     \
+static int prefix ## _get_model_sym(ArithCoder *c, Model *m)            \
 {                                                                       \
     int idx, val;                                                       \
                                                                         \
-    idx = arith ## VERSION ## _get_prob(c, m->cum_prob);                \
+    idx = prefix ## _get_prob(c, m->cum_prob);                          \
                                                                         \
     val = m->idx2sym[idx];                                              \
     ff_mss12_model_update(m, idx);                                      \
                                                                         \
-    arith ## VERSION ## _normalise(c);                                  \
+    prefix ## _normalise(c);                                            \
                                                                         \
     return val;                                                         \
 }
diff --git a/libavcodec/mss2.c b/libavcodec/mss2.c
index 2c993f6..d255dd4 100644
--- a/libavcodec/mss2.c
+++ b/libavcodec/mss2.c
@@ -1,20 +1,20 @@
 /*
  * Microsoft Screen 2 (aka Windows Media Video V9 Screen) decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -52,13 +52,13 @@ static void arith2_normalise(ArithCoder *c)
             c->value ^= 0x8000;
             c->low   ^= 0x8000;
         }
-        c->high  = c->high  << 8 & 0xFFFFFF | 0xFF;
-        c->value = c->value << 8 & 0xFFFFFF | bytestream2_get_byte(c->gbc.gB);
-        c->low   = c->low   << 8 & 0xFFFFFF;
+        c->high  = (uint16_t)c->high  << 8  | 0xFF;
+        c->value = (uint16_t)c->value << 8  | bytestream2_get_byte(c->gbc.gB);
+        c->low   = (uint16_t)c->low   << 8;
     }
 }
 
-ARITH_GET_BIT(2)
+ARITH_GET_BIT(arith2)
 
 /* L. Stuiver and A. Moffat: "Piecewise Integer Mapping for Arithmetic Coding."
  * In Proc. 8th Data Compression Conference (DCC '98), pp. 3-12, Mar. 1998 */
@@ -131,7 +131,7 @@ static int arith2_get_prob(ArithCoder *c, int16_t *probs)
     return i;
 }
 
-ARITH_GET_MODEL_SYM(2)
+ARITH_GET_MODEL_SYM(arith2)
 
 static int arith2_get_consumed_bytes(ArithCoder *c)
 {
@@ -210,8 +210,13 @@ static int decode_555(GetByteContext *gB, uint16_t *dst, int stride,
                     last_symbol = b << 8 | bytestream2_get_byte(gB);
                 else if (b > 129) {
                     repeat = 0;
-                    while (b-- > 130)
+                    while (b-- > 130) {
+                        if (repeat >= (INT_MAX >> 8) - 1) {
+                            av_log(NULL, AV_LOG_ERROR, "repeat overflow\n");
+                            return AVERROR_INVALIDDATA;
+                        }
                         repeat = (repeat << 8) + bytestream2_get_byte(gB) + 1;
+                    }
                     if (last_symbol == -2) {
                         int skip = FFMIN((unsigned)repeat, dst + w - p);
                         repeat -= skip;
@@ -318,7 +323,7 @@ static int decode_rle(GetBitContext *gb, uint8_t *pal_dst, int pal_stride,
     if (next_code != 1 << current_length)
         return AVERROR_INVALIDDATA;
 
-    if (i = init_vlc(&vlc, 9, alphabet_size, bits, 1, 1, codes, 4, 4, 0))
+    if ((i = init_vlc(&vlc, 9, alphabet_size, bits, 1, 1, codes, 4, 4, 0)) < 0)
         return i;
 
     /* frame decode */
@@ -381,7 +386,8 @@ static int decode_wmv9(AVCodecContext *avctx, const uint8_t *buf, int buf_size,
 
     ff_mpeg_flush(avctx);
 
-    init_get_bits(&s->gb, buf, buf_size * 8);
+    if ((ret = init_get_bits8(&s->gb, buf, buf_size)) < 0)
+        return ret;
 
     s->loop_filter = avctx->skip_loop_filter < AVDISCARD_ALL;
 
@@ -424,8 +430,8 @@ static int decode_wmv9(AVCodecContext *avctx, const uint8_t *buf, int buf_size,
 
     if (v->respic == 3) {
         ctx->dsp.upsample_plane(f->data[0], f->linesize[0], w,      h);
-        ctx->dsp.upsample_plane(f->data[1], f->linesize[1], w >> 1, h >> 1);
-        ctx->dsp.upsample_plane(f->data[2], f->linesize[2], w >> 1, h >> 1);
+        ctx->dsp.upsample_plane(f->data[1], f->linesize[1], w+1 >> 1, h+1 >> 1);
+        ctx->dsp.upsample_plane(f->data[2], f->linesize[2], w+1 >> 1, h+1 >> 1);
     } else if (v->respic)
         avpriv_request_sample(v->s.avctx,
                               "Asymmetric WMV9 rectangle subsampling");
@@ -479,7 +485,8 @@ static int mss2_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     av_assert0(AV_INPUT_BUFFER_PADDING_SIZE >=
                ARITH2_PADDING + (MIN_CACHE_BITS + 7) / 8);
 
-    init_get_bits(&gb, buf, buf_size * 8);
+    if ((ret = init_get_bits8(&gb, buf, buf_size)) < 0)
+        return ret;
 
     if (keyframe = get_bits1(&gb))
         skip_bits(&gb, 7);
@@ -595,10 +602,8 @@ static int mss2_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     if (c->mvX < 0 || c->mvY < 0) {
         FFSWAP(uint8_t *, c->pal_pic, c->last_pal_pic);
 
-        if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
             return ret;
-        }
 
         if (ctx->last_pic->data[0]) {
             av_assert0(frame->linesize[0] == ctx->last_pic->linesize[0]);
@@ -609,10 +614,8 @@ static int mss2_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             return AVERROR_INVALIDDATA;
         }
     } else {
-        if ((ret = ff_reget_buffer(avctx, ctx->last_pic)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+        if ((ret = ff_reget_buffer(avctx, ctx->last_pic)) < 0)
             return ret;
-        }
         if ((ret = av_frame_ref(frame, ctx->last_pic)) < 0)
             return ret;
 
@@ -641,7 +644,8 @@ static int mss2_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                 ff_mss12_slicecontext_reset(&ctx->sc[1]);
         }
         if (is_rle) {
-            init_get_bits(&gb, buf, buf_size * 8);
+            if ((ret = init_get_bits8(&gb, buf, buf_size)) < 0)
+                return ret;
             if (ret = decode_rle(&gb, c->pal_pic, c->pal_stride,
                                  c->rgb_pic, c->rgb_stride, c->pal, keyframe,
                                  ctx->split_position, 0,
@@ -819,10 +823,11 @@ static av_cold int mss2_decode_init(AVCodecContext *avctx)
     c->avctx = avctx;
     if (ret = ff_mss12_decode_init(c, 1, &ctx->sc[0], &ctx->sc[1]))
         return ret;
+    ctx->last_pic   = av_frame_alloc();
     c->pal_stride   = c->mask_stride;
     c->pal_pic      = av_mallocz(c->pal_stride * avctx->height);
     c->last_pal_pic = av_mallocz(c->pal_stride * avctx->height);
-    if (!c->pal_pic || !c->last_pal_pic) {
+    if (!c->pal_pic || !c->last_pal_pic || !ctx->last_pic) {
         mss2_decode_end(avctx);
         return AVERROR(ENOMEM);
     }
@@ -836,11 +841,6 @@ static av_cold int mss2_decode_init(AVCodecContext *avctx)
     avctx->pix_fmt = c->free_colours == 127 ? AV_PIX_FMT_RGB555
                                             : AV_PIX_FMT_RGB24;
 
-    ctx->last_pic = av_frame_alloc();
-    if (!ctx->last_pic) {
-        mss2_decode_end(avctx);
-        return AVERROR(ENOMEM);
-    }
 
     return 0;
 }
diff --git a/libavcodec/mss2dsp.c b/libavcodec/mss2dsp.c
index aa13577..c5fc1f8 100644
--- a/libavcodec/mss2dsp.c
+++ b/libavcodec/mss2dsp.c
@@ -1,20 +1,20 @@
 /*
  * Microsoft Screen 2 (aka Windows Media Video V9 Screen) decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -106,6 +106,9 @@ static void upsample_plane_c(uint8_t *plane, int plane_stride, int w, int h)
     uint8_t *src1, *src2, *dst1, *dst2, *p, a, b;
     int i, j;
 
+    if(!w || !h)
+        return;
+
     w += (w & 1);
     h += (h & 1);
 
diff --git a/libavcodec/mss2dsp.h b/libavcodec/mss2dsp.h
index 61c3a04..7368abb 100644
--- a/libavcodec/mss2dsp.h
+++ b/libavcodec/mss2dsp.h
@@ -1,20 +1,20 @@
 /*
  * Microsoft Screen 2 (aka Windows Media Video V9 Screen) decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mss3.c b/libavcodec/mss3.c
index c124834..0194196 100644
--- a/libavcodec/mss3.c
+++ b/libavcodec/mss3.c
@@ -2,20 +2,20 @@
  * Microsoft Screen 3 (aka Microsoft ATC Screen) decoder
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -296,7 +296,7 @@ static void rac_normalise(RangeCoder *c)
             c->low |= *c->src++;
         } else if (!c->low) {
             c->got_error = 1;
-            return;
+            c->low = 1;
         }
         if (c->range >= RAC_BOTTOM)
             return;
@@ -731,10 +731,8 @@ static int mss3_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return buf_size;
     c->got_error = 0;
 
-    if ((ret = ff_reget_buffer(avctx, c->pic)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, c->pic)) < 0)
         return ret;
-    }
     c->pic->key_frame = keyframe;
     c->pic->pict_type = keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
     if (!bytestream2_get_bytes_left(&gb)) {
@@ -840,6 +838,7 @@ static av_cold int mss3_decode_init(AVCodecContext *avctx)
                                             b_width * b_height);
         if (!c->dct_coder[i].prev_dc) {
             av_log(avctx, AV_LOG_ERROR, "Cannot allocate buffer\n");
+            av_frame_free(&c->pic);
             while (i >= 0) {
                 av_freep(&c->dct_coder[i].prev_dc);
                 i--;
diff --git a/libavcodec/mss34dsp.c b/libavcodec/mss34dsp.c
index 11abb2d..0397add 100644
--- a/libavcodec/mss34dsp.c
+++ b/libavcodec/mss34dsp.c
@@ -2,20 +2,20 @@
  * Common stuff for some Microsoft Screen codecs
  * Copyright (C) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -84,8 +84,8 @@ void ff_mss34_gen_quant_mat(uint16_t *qmat, int quality, int luma)
     blk[6 * step] = (-(t3 + t7) + t8 + tA) >> shift;                \
     blk[7 * step] = (-(t1 + t6) + t9 + tB) >> shift;                \
 
-#define SOP_ROW(a) ((a) << 16) + 0x2000
-#define SOP_COL(a) ((a + 32) << 16)
+#define SOP_ROW(a) (((a) << 16) + 0x2000)
+#define SOP_COL(a) (((a) + 32) << 16)
 
 void ff_mss34_dct_put(uint8_t *dst, int stride, int *block)
 {
diff --git a/libavcodec/mss34dsp.h b/libavcodec/mss34dsp.h
index b2cc550..2f9827d 100644
--- a/libavcodec/mss34dsp.h
+++ b/libavcodec/mss34dsp.h
@@ -2,20 +2,20 @@
  * Common stuff for some Microsoft Screen codecs
  * Copyright (C) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/mss4.c b/libavcodec/mss4.c
index a953a57..9639fc8 100644
--- a/libavcodec/mss4.c
+++ b/libavcodec/mss4.c
@@ -2,20 +2,20 @@
  * Microsoft Screen 4 (aka Microsoft Expression Encoder Screen) decoder
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -125,7 +125,7 @@ static const uint8_t mss4_vec_entry_vlc_syms[2][9] = {
 #define MAX_ENTRIES  162
 
 typedef struct MSS4Context {
-    AVFrame   *pic;
+    AVFrame    *pic;
 
     VLC        dc_vlc[2], ac_vlc[2];
     VLC        vec_entry_vlc[2];
@@ -363,7 +363,7 @@ static int get_value_cached(GetBitContext *gb, int vec_pos, uint8_t *vec,
     return prev[component];
 }
 
-#define MKVAL(vals)  (vals[0] | (vals[1] << 3) | (vals[2] << 6))
+#define MKVAL(vals)  ((vals)[0] | ((vals)[1] << 3) | ((vals)[2] << 6))
 
 /* Image mode - the hardest to comprehend MSS4 coding mode.
  *
@@ -553,10 +553,8 @@ static int mss4_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return AVERROR_INVALIDDATA;
     }
 
-    if ((ret = ff_reget_buffer(avctx, c->pic)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, c->pic)) < 0)
         return ret;
-    }
     c->pic->key_frame = (frame_type == INTRA_FRAME);
     c->pic->pict_type = (frame_type == INTRA_FRAME) ? AV_PICTURE_TYPE_I
                                                    : AV_PICTURE_TYPE_P;
@@ -574,7 +572,8 @@ static int mss4_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             ff_mss34_gen_quant_mat(c->quant_mat[i], quality, !i);
     }
 
-    init_get_bits(&gb, buf + HEADER_SIZE, (buf_size - HEADER_SIZE) * 8);
+    if ((ret = init_get_bits8(&gb, buf + HEADER_SIZE, buf_size - HEADER_SIZE)) < 0)
+        return ret;
 
     mb_width  = FFALIGN(width,  16) >> 4;
     mb_height = FFALIGN(height, 16) >> 4;
@@ -652,7 +651,7 @@ static av_cold int mss4_decode_init(AVCodecContext *avctx)
     }
     for (i = 0; i < 3; i++) {
         c->dc_stride[i] = FFALIGN(avctx->width, 16) >> (2 + !!i);
-        c->prev_dc[i]   = av_malloc(sizeof(**c->prev_dc) * c->dc_stride[i]);
+        c->prev_dc[i]   = av_malloc_array(c->dc_stride[i], sizeof(**c->prev_dc));
         if (!c->prev_dc[i]) {
             av_log(avctx, AV_LOG_ERROR, "Cannot allocate buffer\n");
             mss4_free_vlcs(c);
diff --git a/libavcodec/msvideo1.c b/libavcodec/msvideo1.c
index 37ea32d..1d14172 100644
--- a/libavcodec/msvideo1.c
+++ b/libavcodec/msvideo1.c
@@ -2,20 +2,20 @@
  * Microsoft Video-1 Decoder
  * Copyright (C) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -66,6 +66,8 @@ static av_cold int msvideo1_decode_init(AVCodecContext *avctx)
     if (s->avctx->bits_per_coded_sample == 8) {
         s->mode_8bit = 1;
         avctx->pix_fmt = AV_PIX_FMT_PAL8;
+        if (avctx->extradata_size >= AVPALETTE_SIZE)
+            memcpy(s->pal, avctx->extradata, AVPALETTE_SIZE);
     } else {
         s->mode_8bit = 0;
         avctx->pix_fmt = AV_PIX_FMT_RGB555;
@@ -299,10 +301,8 @@ static int msvideo1_decode_frame(AVCodecContext *avctx,
     s->buf = buf;
     s->size = buf_size;
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
     if (s->mode_8bit) {
         const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, NULL);
diff --git a/libavcodec/msvideo1enc.c b/libavcodec/msvideo1enc.c
new file mode 100644
index 0000000..b6ae92b
--- /dev/null
+++ b/libavcodec/msvideo1enc.c
@@ -0,0 +1,305 @@
+/*
+ * Microsoft Video-1 Encoder
+ * Copyright (c) 2009 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Microsoft Video-1 encoder
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+#include "bytestream.h"
+#include "libavutil/lfg.h"
+#include "elbg.h"
+#include "libavutil/imgutils.h"
+/**
+ * Encoder context: per-block scratch buffers plus state kept between frames
+ */
+typedef struct Msvideo1EncContext {
+    AVCodecContext *avctx; ///< owning codec context, stored by encode_init()
+    AVLFG rnd;             ///< random state handed to the ELBG calls, seeded in encode_init()
+    uint8_t *prev;         ///< previously coded frame, 3 bytes (5-bit R, G, B) per pixel
+
+    int block[16*3];       ///< current 4x4 block in row order, 3 components per pixel
+    int block2[16*3];      ///< same pixels permuted by remap[]: one 2x2 subblock per 4 entries
+    int codebook[8*3];     ///< colours for 2-colour mode (encode_frame() uses the first 2*3 entries)
+    int codebook2[8*3];    ///< colours for 8-colour mode: 2 colours for each of 4 subblocks
+    int output[16*3];      ///< per-pixel codebook index produced by ELBG for the whole block
+    int output2[16*3];     ///< per-pixel codebook index for the subblocks, in block2 order
+    int avg[3];            ///< single fill colour found for the whole block
+    int bestpos;           ///< not referenced by any function in this file
+    int keyint;            ///< frames coded since the last keyframe
+} Msvideo1EncContext;
+
+enum MSV1Mode{ // coding mode chosen per 4x4 block in encode_frame()
+    MODE_SKIP = 0, ///< nothing is coded for the block; it only extends the current skip run
+    MODE_FILL,     ///< whole block painted with one colour
+    MODE_2COL,     ///< 16 flag bits select between two colours
+    MODE_8COL,     ///< 16 flag bits select between two colours per 2x2 subblock
+};
+
+#define SKIP_PREFIX 0x8400 /* ORed with the run length to form a 16-bit skip code */
+#define SKIPS_MAX 0x03FF   /* a skip run is flushed once it reaches this length */
+#define MKRGB555(in, off) (((in)[off] << 10) | ((in)[(off) + 1] << 5) | ((in)[(off) + 2]))
+/* remap[]: position of each row-order pixel of a 4x4 block when stored 2x2 subblock by subblock */
+static const int remap[16] = { 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15 };
+
+/**
+ * Encode one RGB555 frame bottom-up in 4x4 blocks, choosing per block the lowest-scoring
+ * of skip, 1-colour fill, 2-colour and 8-colour (2 colours per 2x2 subblock) coding.
+ * @return 0 on success (*got_packet is set to 1), a negative AVERROR code on failure
+ */
+static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                               const AVFrame *pict, int *got_packet)
+{
+    Msvideo1EncContext * const c = avctx->priv_data;
+    const AVFrame *p = pict;
+    uint16_t *src;
+    uint8_t *prevptr, *dst, *buf;
+    int keyframe = 0;
+    int no_skips = 1;
+    int i, j, k, x, y, ret;
+    int skips = 0;
+    int quality = 24;
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, avctx->width*avctx->height*9 + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
+        return ret;
+    dst= buf= pkt->data;
+
+    if(!c->prev)
+        c->prev = av_malloc(avctx->width * 3 * (avctx->height + 3));
+    if(!c->prev) // allocation failed: bail out instead of dereferencing NULL below
+        return AVERROR(ENOMEM);
+    prevptr = c->prev + avctx->width * 3 * (FFALIGN(avctx->height, 4) - 1);
+    src = (uint16_t*)(p->data[0] + p->linesize[0]*(FFALIGN(avctx->height, 4) - 1));
+    keyframe = c->keyint >= avctx->keyint_min;
+
+    for(y = 0; y < avctx->height; y += 4){
+        for(x = 0; x < avctx->width; x += 4){
+            int bestmode = MODE_SKIP;
+            int bestscore = INT_MAX;
+            int flags = 0;
+            int score;
+
+            for(j = 0; j < 4; j++){
+                for(i = 0; i < 4; i++){
+                    uint16_t val = src[x + i - j*p->linesize[0]/2];
+                    for(k = 0; k < 3; k++){
+                        c->block[(i + j*4)*3 + k] =
+                        c->block2[remap[i + j*4]*3 + k] = (val >> (10-k*5)) & 0x1F;
+                    }
+                }
+            }
+            if(!keyframe){
+                bestscore = 0;
+                for(j = 0; j < 4; j++){
+                    for(i = 0; i < 4*3; i++){
+                        int t = prevptr[x*3 + i - j*3*avctx->width] - c->block[i + j*4*3];
+                        bestscore += t*t;
+                    }
+                }
+                bestscore /= quality;
+            }
+            // try to find optimal value to fill whole 4x4 block
+            score = 0;
+            avpriv_init_elbg(c->block, 3, 16, c->avg, 1, 1, c->output, &c->rnd);
+            avpriv_do_elbg  (c->block, 3, 16, c->avg, 1, 1, c->output, &c->rnd);
+            if(c->avg[0] == 1) // red component = 1 will be written as skip code
+                c->avg[0] = 0;
+            for(j = 0; j < 4; j++){
+                for(i = 0; i < 4; i++){
+                    for(k = 0; k < 3; k++){
+                        int t = c->avg[k] - c->block[(i+j*4)*3+k];
+                        score += t*t;
+                    }
+                }
+            }
+            score /= quality;
+            score += 2; // bytes written by MODE_FILL
+            if(score < bestscore){
+                bestscore = score;
+                bestmode = MODE_FILL;
+            }
+            // search for optimal filling of 2-color block
+            score = 0;
+            avpriv_init_elbg(c->block, 3, 16, c->codebook, 2, 1, c->output, &c->rnd);
+            avpriv_do_elbg  (c->block, 3, 16, c->codebook, 2, 1, c->output, &c->rnd);
+            // last output value should be always 1, swap codebooks if needed
+            if(!c->output[15]){
+                for(i = 0; i < 3; i++)
+                    FFSWAP(int, c->codebook[i], c->codebook[i+3]);
+                for(i = 0; i < 16; i++)
+                    c->output[i] ^= 1;
+            }
+            for(j = 0; j < 4; j++){
+                for(i = 0; i < 4; i++){
+                    for(k = 0; k < 3; k++){
+                        int t = c->codebook[c->output[i+j*4]*3 + k] - c->block[i*3+k+j*4*3];
+                        score += t*t;
+                    }
+                }
+            }
+            score /= quality;
+            score += 6; // bytes written by MODE_2COL
+            if(score < bestscore){
+                bestscore = score;
+                bestmode = MODE_2COL;
+            }
+            // search for optimal filling of 2-color 2x2 subblocks
+            score = 0;
+            for(i = 0; i < 4; i++){
+                avpriv_init_elbg(c->block2 + i*4*3, 3, 4, c->codebook2 + i*2*3, 2, 1, c->output2 + i*4, &c->rnd);
+                avpriv_do_elbg  (c->block2 + i*4*3, 3, 4, c->codebook2 + i*2*3, 2, 1, c->output2 + i*4, &c->rnd);
+            }
+            // last value should be always 1, swap codebooks if needed
+            if(!c->output2[15]){
+                for(i = 0; i < 3; i++)
+                    FFSWAP(int, c->codebook2[i+18], c->codebook2[i+21]);
+                for(i = 12; i < 16; i++)
+                    c->output2[i] ^= 1;
+            }
+            for(j = 0; j < 4; j++){
+                for(i = 0; i < 4; i++){
+                    for(k = 0; k < 3; k++){
+                        int t = c->codebook2[(c->output2[remap[i+j*4]] + (i&2) + (j&2)*2)*3+k] - c->block[i*3+k + j*4*3];
+                        score += t*t;
+                    }
+                }
+            }
+            score /= quality;
+            score += 18; // bytes written by MODE_8COL
+            if(score < bestscore){
+                bestscore = score;
+                bestmode = MODE_8COL;
+            }
+
+            if(bestmode == MODE_SKIP){
+                skips++;
+                no_skips = 0;
+            }
+            if((bestmode != MODE_SKIP && skips) || skips == SKIPS_MAX){
+                bytestream_put_le16(&dst, skips | SKIP_PREFIX);
+                skips = 0;
+            }
+
+            switch(bestmode){
+            case MODE_FILL:
+                bytestream_put_le16(&dst, MKRGB555(c->avg,0) | 0x8000);
+                for(j = 0; j < 4; j++)
+                    for(i = 0; i < 4; i++)
+                        for(k = 0; k < 3; k++)
+                            prevptr[x*3 + i*3 + k - j*3*avctx->width] = c->avg[k];
+                break;
+            case MODE_2COL:
+                for(j = 0; j < 4; j++){
+                    for(i = 0; i < 4; i++){
+                        flags |= (c->output[i + j*4]^1) << (i + j*4);
+                        for(k = 0; k < 3; k++)
+                            prevptr[x*3 + i*3 + k - j*3*avctx->width] = c->codebook[c->output[i + j*4]*3 + k];
+                    }
+                }
+                bytestream_put_le16(&dst, flags);
+                bytestream_put_le16(&dst, MKRGB555(c->codebook, 0));
+                bytestream_put_le16(&dst, MKRGB555(c->codebook, 3));
+                break;
+            case MODE_8COL:
+                for(j = 0; j < 4; j++){
+                    for(i = 0; i < 4; i++){
+                        flags |= (c->output2[remap[i + j*4]]^1) << (i + j*4);
+                        for(k = 0; k < 3; k++)
+                            prevptr[x*3 + i*3 + k - j*3*avctx->width] = c->codebook2[(c->output2[remap[i+j*4]] + (i&2) + (j&2)*2)*3 + k];
+                    }
+                }
+                bytestream_put_le16(&dst, flags);
+                bytestream_put_le16(&dst, MKRGB555(c->codebook2, 0) | 0x8000);
+                for(i = 3; i < 24; i += 3)
+                    bytestream_put_le16(&dst, MKRGB555(c->codebook2, i));
+                break;
+            }
+        }
+        src     -= p->linesize[0] << 1;
+        prevptr -= avctx->width * 3 * 4;
+    }
+    if(skips)
+        bytestream_put_le16(&dst, skips | SKIP_PREFIX);
+    //EOF
+    bytestream_put_byte(&dst, 0);
+    bytestream_put_byte(&dst, 0);
+
+    keyframe |= no_skips; // no block was skipped, so every block of this frame was coded
+    c->keyint = keyframe ? 0 : c->keyint + 1;
+    if (keyframe) pkt->flags |= AV_PKT_FLAG_KEY;
+    pkt->size = dst - buf;
+    *got_packet = 1;
+
+    return 0;
+}
+
+
+/**
+ * Initialise the encoder: validate the frame size (frames are coded in 4x4
+ * blocks), set 16 bits per coded sample and seed the random state used by ELBG.
+ * @return 0 on success, a negative AVERROR code if the dimensions are rejected
+ */
+static av_cold int encode_init(AVCodecContext *avctx)
+{
+    Msvideo1EncContext * const c = avctx->priv_data;
+    int ret;
+
+    c->avctx = avctx;
+    if ((ret = av_image_check_size(avctx->width, avctx->height, 0, avctx)) < 0)
+        return ret;
+    if((avctx->width&3) || (avctx->height&3)){
+        av_log(avctx, AV_LOG_ERROR, "width and height must be multiples of 4\n");
+        return AVERROR(EINVAL);
+    }
+
+    avctx->bits_per_coded_sample = 16;
+    c->keyint = avctx->keyint_min; // makes encode_frame() code the first frame as a keyframe
+    av_lfg_init(&c->rnd, 1);
+    return 0;
+}
+
+
+
+/**
+ * Close the encoder and release the previous-frame buffer kept in the context.
+ */
+static av_cold int encode_end(AVCodecContext *avctx)
+{
+    Msvideo1EncContext *ctx = avctx->priv_data;
+
+    /* prev is only allocated on the first encode_frame() call; freeing NULL is fine */
+    av_freep(&ctx->prev);
+    return 0;
+}
+
+AVCodec ff_msvideo1_encoder = { /* encoder registration entry */
+    .name           = "msvideo1",
+    .long_name      = NULL_IF_CONFIG_SMALL("Microsoft Video-1"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MSVIDEO1,
+    .priv_data_size = sizeof(Msvideo1EncContext),
+    .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_RGB555, AV_PIX_FMT_NONE },
+    .init           = encode_init,
+    .encode2        = encode_frame,
+    .close          = encode_end,
+};
diff --git a/libavcodec/mvcdec.c b/libavcodec/mvcdec.c
index 1546bcc..e507674 100644
--- a/libavcodec/mvcdec.c
+++ b/libavcodec/mvcdec.c
@@ -2,20 +2,20 @@
  * Silicon Graphics Motion Video Compressor 1 & 2 decoder
  * Copyright (c) 2012 Peter Ross
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,7 +31,6 @@
 #include "internal.h"
 
 typedef struct MvcContext {
-    AVFrame *frame;
     int vflip;
 } MvcContext;
 
@@ -53,10 +52,6 @@ static av_cold int mvc_decode_init(AVCodecContext *avctx)
 
     avctx->pix_fmt = (avctx->codec_id == AV_CODEC_ID_MVC1) ? AV_PIX_FMT_RGB555
                                                            : AV_PIX_FMT_RGB32;
-    s->frame       = av_frame_alloc();
-    if (!s->frame)
-        return AVERROR(ENOMEM);
-
     s->vflip = avctx->extradata_size >= 9 &&
                !memcmp(avctx->extradata + avctx->extradata_size - 9, "BottomUp", 9);
     return 0;
@@ -231,39 +226,32 @@ static int mvc_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                             AVPacket *avpkt)
 {
     MvcContext *s = avctx->priv_data;
+    AVFrame *frame = data;
     GetByteContext gb;
     int ret;
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
 
     bytestream2_init(&gb, avpkt->data, avpkt->size);
     if (avctx->codec_id == AV_CODEC_ID_MVC1)
-        ret = decode_mvc1(avctx, &gb, s->frame->data[0],
-                          avctx->width, avctx->height, s->frame->linesize[0]);
+        ret = decode_mvc1(avctx, &gb, frame->data[0],
+                          avctx->width, avctx->height, frame->linesize[0]);
     else
-        ret = decode_mvc2(avctx, &gb, s->frame->data[0],
-                          avctx->width, avctx->height, s->frame->linesize[0],
+        ret = decode_mvc2(avctx, &gb, frame->data[0],
+                          avctx->width, avctx->height, frame->linesize[0],
                           s->vflip);
     if (ret < 0)
         return ret;
 
+    frame->pict_type = AV_PICTURE_TYPE_I;
+    frame->key_frame = 1;
+
     *got_frame = 1;
-    if ((ret = av_frame_ref(data, s->frame)) < 0)
-        return ret;
 
     return avpkt->size;
 }
 
-static av_cold int mvc_decode_end(AVCodecContext *avctx)
-{
-    MvcContext *s = avctx->priv_data;
-
-    av_frame_free(&s->frame);
-
-    return 0;
-}
-
 #if CONFIG_MVC1_DECODER
 AVCodec ff_mvc1_decoder = {
     .name           = "mvc1",
@@ -272,7 +260,6 @@ AVCodec ff_mvc1_decoder = {
     .id             = AV_CODEC_ID_MVC1,
     .priv_data_size = sizeof(MvcContext),
     .init           = mvc_decode_init,
-    .close          = mvc_decode_end,
     .decode         = mvc_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
@@ -286,7 +273,6 @@ AVCodec ff_mvc2_decoder = {
     .id             = AV_CODEC_ID_MVC2,
     .priv_data_size = sizeof(MvcContext),
     .init           = mvc_decode_init,
-    .close          = mvc_decode_end,
     .decode         = mvc_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
diff --git a/libavcodec/mxpegdec.c b/libavcodec/mxpegdec.c
index a8ef6d0..2e3ebe6 100644
--- a/libavcodec/mxpegdec.c
+++ b/libavcodec/mxpegdec.c
@@ -2,20 +2,20 @@
  * MxPEG decoder
  * Copyright (c) 2011 Anatoly Nenashev
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -54,6 +54,7 @@ static av_cold int mxpeg_decode_end(AVCodecContext *avctx)
     for (i = 0; i < 2; ++i)
         av_frame_free(&s->picture[i]);
 
+    s->bitmask_size = 0;
     av_freep(&s->mxm_bitmask);
     av_freep(&s->completion_bitmask);
 
@@ -105,6 +106,7 @@ static int mxpeg_decode_mxm(MXpegDecodeContext *s,
     }
 
     if (s->bitmask_size != bitmask_size) {
+        s->bitmask_size = 0;
         av_freep(&s->mxm_bitmask);
         s->mxm_bitmask = av_malloc(bitmask_size);
         if (!s->mxm_bitmask) {
@@ -272,11 +274,9 @@ static int mxpeg_decode_frame(AVCodecContext *avctx,
                     }
                     /* use stored SOF data to allocate current picture */
                     av_frame_unref(jpg->picture_ptr);
-                    if (ff_get_buffer(avctx, jpg->picture_ptr,
-                                      AV_GET_BUFFER_FLAG_REF) < 0) {
-                        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-                        return AVERROR(ENOMEM);
-                    }
+                    if ((ret = ff_get_buffer(avctx, jpg->picture_ptr,
+                                             AV_GET_BUFFER_FLAG_REF)) < 0)
+                        return ret;
                     jpg->picture_ptr->pict_type = AV_PICTURE_TYPE_P;
                     jpg->picture_ptr->key_frame = 0;
                     jpg->got_picture = 1;
@@ -292,17 +292,15 @@ static int mxpeg_decode_frame(AVCodecContext *avctx,
 
                     /* allocate dummy reference picture if needed */
                     if (!reference_ptr->data[0] &&
-                        ff_get_buffer(avctx, reference_ptr,
-                                      AV_GET_BUFFER_FLAG_REF) < 0) {
-                        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-                        return AVERROR(ENOMEM);
-                    }
+                        (ret = ff_get_buffer(avctx, reference_ptr,
+                                             AV_GET_BUFFER_FLAG_REF)) < 0)
+                        return ret;
 
-                    ret = ff_mjpeg_decode_sos(jpg, s->mxm_bitmask, reference_ptr);
+                    ret = ff_mjpeg_decode_sos(jpg, s->mxm_bitmask, s->bitmask_size, reference_ptr);
                     if (ret < 0 && (avctx->err_recognition & AV_EF_EXPLODE))
                         return ret;
                 } else {
-                    ret = ff_mjpeg_decode_sos(jpg, NULL, NULL);
+                    ret = ff_mjpeg_decode_sos(jpg, NULL, 0, NULL);
                     if (ret < 0 && (avctx->err_recognition & AV_EF_EXPLODE))
                         return ret;
                 }
@@ -346,5 +344,6 @@ AVCodec ff_mxpeg_decoder = {
     .close          = mxpeg_decode_end,
     .decode         = mxpeg_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
 };
diff --git a/libavcodec/nellymoserdec.c b/libavcodec/nellymoserdec.c
index 5033282..e6625cb 100644
--- a/libavcodec/nellymoserdec.c
+++ b/libavcodec/nellymoserdec.c
@@ -50,7 +50,7 @@ typedef struct NellyMoserDecodeContext {
     AVLFG           random_state;
     GetBitContext   gb;
     float           scale_bias;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     FFTContext      imdct_ctx;
     DECLARE_ALIGNED(32, float, imdct_buf)[2][NELLY_BUF_LEN];
     float          *imdct_out;
@@ -75,7 +75,7 @@ static void nelly_decode_block(NellyMoserDecodeContext *s,
     for (i=0 ; i<NELLY_BANDS ; i++) {
         if (i > 0)
             val += ff_nelly_delta_table[get_bits(&s->gb, 5)];
-        pval = -pow(2, val/2048) * s->scale_bias;
+        pval = -exp2(val/2048) * s->scale_bias;
         for (j = 0; j < ff_nelly_band_sizes_table[i]; j++) {
             *bptr++ = val;
             *pptr++ = pval;
@@ -105,7 +105,7 @@ static void nelly_decode_block(NellyMoserDecodeContext *s,
                (NELLY_BUF_LEN - NELLY_FILL_LEN) * sizeof(float));
 
         s->imdct_ctx.imdct_half(&s->imdct_ctx, s->imdct_out, aptr);
-        s->fdsp.vector_fmul_window(aptr, s->imdct_prev + NELLY_BUF_LEN / 2,
+        s->fdsp->vector_fmul_window(aptr, s->imdct_prev + NELLY_BUF_LEN / 2,
                                    s->imdct_out, ff_sine_128,
                                    NELLY_BUF_LEN / 2);
         FFSWAP(float *, s->imdct_out, s->imdct_prev);
@@ -121,7 +121,9 @@ static av_cold int decode_init(AVCodecContext * avctx) {
     av_lfg_init(&s->random_state, 0);
     ff_mdct_init(&s->imdct_ctx, 8, 1, 1.0);
 
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->fdsp)
+        return AVERROR(ENOMEM);
 
     s->scale_bias = 1.0/(32768*8);
     avctx->sample_fmt = AV_SAMPLE_FMT_FLT;
@@ -141,16 +143,19 @@ static int decode_tag(AVCodecContext *avctx, void *data,
 {
     AVFrame *frame     = data;
     const uint8_t *buf = avpkt->data;
+    const uint8_t *side=av_packet_get_side_data(avpkt, 'F', NULL);
     int buf_size = avpkt->size;
     NellyMoserDecodeContext *s = avctx->priv_data;
     int blocks, i, ret;
     float   *samples_flt;
 
     blocks     = buf_size / NELLY_BLOCK_LEN;
+
     if (blocks <= 0) {
         av_log(avctx, AV_LOG_ERROR, "Packet is too small\n");
         return AVERROR_INVALIDDATA;
     }
+
     if (buf_size % NELLY_BLOCK_LEN) {
         av_log(avctx, AV_LOG_WARNING, "Leftover bytes: %d.\n",
                buf_size % NELLY_BLOCK_LEN);
@@ -162,13 +167,13 @@ static int decode_tag(AVCodecContext *avctx, void *data,
      * 22050 Hz - 4
      * 44100 Hz - 8
      */
+    if(side && blocks>1 && avctx->sample_rate%11025==0 && (1<<((side[0]>>2)&3)) == blocks)
+        avctx->sample_rate= 11025*(blocks/2);
 
     /* get output buffer */
     frame->nb_samples = NELLY_SAMPLES * blocks;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples_flt = (float *)frame->data[0];
 
     for (i=0 ; i<blocks ; i++) {
@@ -186,6 +191,7 @@ static av_cold int decode_end(AVCodecContext * avctx) {
     NellyMoserDecodeContext *s = avctx->priv_data;
 
     ff_mdct_end(&s->imdct_ctx);
+    av_freep(&s->fdsp);
 
     return 0;
 }
diff --git a/libavcodec/nellymoserenc.c b/libavcodec/nellymoserenc.c
index 9d12081..9d22ac8 100644
--- a/libavcodec/nellymoserenc.c
+++ b/libavcodec/nellymoserenc.c
@@ -4,20 +4,20 @@
  *
  * Copyright (c) 2008 Bartlomiej Wolowiec
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,7 +28,7 @@
  *
  * Generic codec information: libavcodec/nellymoserdec.c
  *
- * Some information also from: http://samples.libav.org/A-codecs/Nelly_Moser/ASAO/ASAO.zip
+ * Some information also from: http://samples.mplayerhq.hu/A-codecs/Nelly_Moser/ASAO/ASAO.zip
  *                             (Copyright Joseph Artsimovich and UAB "DKD")
  *
  * for more information about nellymoser format, visit:
@@ -56,7 +56,7 @@
 typedef struct NellyMoserEncodeContext {
     AVCodecContext  *avctx;
     int             last_frame;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     FFTContext      mdct_ctx;
     AudioFrameQueue afq;
     DECLARE_ALIGNED(32, float, mdct_out)[NELLY_SAMPLES];
@@ -66,7 +66,7 @@ typedef struct NellyMoserEncodeContext {
     uint8_t         (*path)[OPT_SIZE];
 } NellyMoserEncodeContext;
 
-static float pow_table[POW_TABLE_SIZE];     ///< -pow(2, -i / 2048.0 - 3.0);
+static float pow_table[POW_TABLE_SIZE];     ///< pow(2, -i / 2048.0 - 3.0 + POW_TABLE_OFFSET);
 
 static const uint8_t sf_lut[96] = {
      0,  1,  1,  1,  1,  1,  1,  2,  2,  2,  2,  3,  3,  3,  4,  4,
@@ -122,12 +122,12 @@ static void apply_mdct(NellyMoserEncodeContext *s)
     float *in1 = s->buf + NELLY_BUF_LEN;
     float *in2 = s->buf + 2 * NELLY_BUF_LEN;
 
-    s->fdsp.vector_fmul        (s->in_buff,                 in0, ff_sine_128, NELLY_BUF_LEN);
-    s->fdsp.vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, in1, ff_sine_128, NELLY_BUF_LEN);
+    s->fdsp->vector_fmul        (s->in_buff,                 in0, ff_sine_128, NELLY_BUF_LEN);
+    s->fdsp->vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, in1, ff_sine_128, NELLY_BUF_LEN);
     s->mdct_ctx.mdct_calc(&s->mdct_ctx, s->mdct_out, s->in_buff);
 
-    s->fdsp.vector_fmul        (s->in_buff,                 in1, ff_sine_128, NELLY_BUF_LEN);
-    s->fdsp.vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, in2, ff_sine_128, NELLY_BUF_LEN);
+    s->fdsp->vector_fmul        (s->in_buff,                 in1, ff_sine_128, NELLY_BUF_LEN);
+    s->fdsp->vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, in2, ff_sine_128, NELLY_BUF_LEN);
     s->mdct_ctx.mdct_calc(&s->mdct_ctx, s->mdct_out + NELLY_BUF_LEN, s->in_buff);
 }
 
@@ -138,10 +138,11 @@ static av_cold int encode_end(AVCodecContext *avctx)
     ff_mdct_end(&s->mdct_ctx);
 
     if (s->avctx->trellis) {
-        av_free(s->opt);
-        av_free(s->path);
+        av_freep(&s->opt);
+        av_freep(&s->path);
     }
     ff_af_queue_close(&s->afq);
+    av_freep(&s->fdsp);
 
     return 0;
 }
@@ -170,12 +171,26 @@ static av_cold int encode_init(AVCodecContext *avctx)
     s->avctx = avctx;
     if ((ret = ff_mdct_init(&s->mdct_ctx, 8, 0, 32768.0)) < 0)
         goto error;
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->fdsp) {
+        ret = AVERROR(ENOMEM);
+        goto error;
+    }
 
     /* Generate overlap window */
-    ff_sine_window_init(ff_sine_128, 128);
+    ff_init_ff_sine_windows(7);
+    /* faster way of doing
     for (i = 0; i < POW_TABLE_SIZE; i++)
-        pow_table[i] = -pow(2, -i / 2048.0 - 3.0 + POW_TABLE_OFFSET);
+       pow_table[i] = pow(2, -i / 2048.0 - 3.0 + POW_TABLE_OFFSET); */
+    pow_table[0] = 1;
+    pow_table[1024] = M_SQRT1_2;
+    for (i = 1; i < 513; i++) {
+        double tmp = exp2(-i / 2048.0);
+        pow_table[i] = tmp;
+        pow_table[1024-i] = M_SQRT1_2 / tmp;
+        pow_table[1024+i] = tmp * M_SQRT1_2;
+        pow_table[2048-i] = 0.5 / tmp;
+    }
 
     if (s->avctx->trellis) {
         s->opt  = av_malloc(NELLY_BANDS * OPT_SIZE * sizeof(float  ));
@@ -231,7 +246,7 @@ static void get_exponent_dynamic(NellyMoserEncodeContext *s, float *cand, int *i
     float  (*opt )[OPT_SIZE] = s->opt ;
     uint8_t(*path)[OPT_SIZE] = s->path;
 
-    for (i = 0; i < OPT_SIZE; i++) {
+    for (i = 0; i < NELLY_BANDS * OPT_SIZE; i++) {
         opt[0][i] = INFINITY;
     }
 
@@ -266,7 +281,7 @@ static void get_exponent_dynamic(NellyMoserEncodeContext *s, float *cand, int *i
                 }
             }
         }
-        assert(c); //FIXME
+        av_assert1(c); //FIXME
     }
 
     best_val = INFINITY;
@@ -303,7 +318,7 @@ static void encode_block(NellyMoserEncodeContext *s, unsigned char *output, int
 
     apply_mdct(s);
 
-    init_put_bits(&pb, output, output_size * 8);
+    init_put_bits(&pb, output, output_size);
 
     i = 0;
     for (band = 0; band < NELLY_BANDS; band++) {
@@ -313,7 +328,7 @@ static void encode_block(NellyMoserEncodeContext *s, unsigned char *output, int
                        + s->mdct_out[i + NELLY_BUF_LEN] * s->mdct_out[i + NELLY_BUF_LEN];
         }
         cand[band] =
-            log(FFMAX(1.0, coeff_sum / (ff_nelly_band_sizes_table[band] << 7))) * 1024.0 / M_LN2;
+            log2(FFMAX(1.0, coeff_sum / (ff_nelly_band_sizes_table[band] << 7))) * 1024.0;
     }
 
     if (s->avctx->trellis) {
@@ -392,10 +407,8 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         s->last_frame = 1;
     }
 
-    if ((ret = ff_alloc_packet(avpkt, NELLY_BLOCK_LEN))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, NELLY_BLOCK_LEN, 0)) < 0)
         return ret;
-    }
     encode_block(s, avpkt->data, avpkt->size);
 
     /* Get the next frame pts/duration */
diff --git a/libavcodec/neon/mpegvideo.c b/libavcodec/neon/mpegvideo.c
index fe952ae..a96ae51 100644
--- a/libavcodec/neon/mpegvideo.c
+++ b/libavcodec/neon/mpegvideo.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2010 Mans Rullgard
  * Copyright (c) 2014 James Yu <james.yu@linaro.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/noise_bsf.c b/libavcodec/noise_bsf.c
index 3b41dbf..0aebee1 100644
--- a/libavcodec/noise_bsf.c
+++ b/libavcodec/noise_bsf.c
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -41,6 +41,9 @@ static int noise(AVBSFContext *ctx, AVPacket *out)
     int amount = s->amount > 0 ? s->amount : (s->state % 10001 + 1);
     int i, ret = 0;
 
+    if (amount <= 0)
+        return AVERROR(EINVAL);
+
     ret = ff_bsf_get_packet(ctx, &in);
     if (ret < 0)
         return ret;
@@ -82,7 +85,7 @@ static const AVClass noise_class = {
 
 const AVBitStreamFilter ff_noise_bsf = {
     .name           = "noise",
-    .priv_data_size = sizeof(int),
+    .priv_data_size = sizeof(NoiseContext),
     .priv_class     = &noise_class,
     .filter         = noise,
 };
diff --git a/libavcodec/nuv.c b/libavcodec/nuv.c
index 92c1fda..ad6c029 100644
--- a/libavcodec/nuv.c
+++ b/libavcodec/nuv.c
@@ -2,25 +2,26 @@
  * NuppelVideo decoder
  * Copyright (c) 2006 Reimar Doeffinger
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <limits.h>
 
 #include "libavutil/bswap.h"
 #include "libavutil/common.h"
@@ -78,7 +79,7 @@ static void copy_frame(AVFrame *f, const uint8_t *src, int width, int height)
     int src_linesize[4];
     av_image_fill_arrays(src_data, src_linesize, src,
                          f->format, width, height, 1);
-    av_image_copy(f->data, f->linesize, src_data, src_linesize,
+    av_image_copy(f->data, f->linesize, (const uint8_t **)src_data, src_linesize,
                   f->format, width, height);
 }
 
@@ -124,23 +125,26 @@ static int codec_reinit(AVCodecContext *avctx, int width, int height,
     if (quality >= 0)
         get_quant_quality(c, quality);
     if (width != c->width || height != c->height) {
-        void *ptr;
+        // also reserve space for a possible additional header
+        int buf_size = height * width * 3 / 2
+                     + FFMAX(AV_LZO_OUTPUT_PADDING, AV_INPUT_BUFFER_PADDING_SIZE)
+                     + RTJPEG_HEADER_SIZE;
+        if (buf_size > INT_MAX/8)
+            return -1;
         if ((ret = av_image_check_size(height, width, 0, avctx)) < 0)
             return ret;
         avctx->width  = c->width  = width;
         avctx->height = c->height = height;
-        ptr = av_fast_realloc(c->decomp_buf, &c->decomp_size,
-                              c->height * c->width * 3 / 2 +
-                              AV_INPUT_BUFFER_PADDING_SIZE +
-                              RTJPEG_HEADER_SIZE);
-        if (!ptr) {
+        av_fast_malloc(&c->decomp_buf, &c->decomp_size,
+                       buf_size);
+        if (!c->decomp_buf) {
             av_log(avctx, AV_LOG_ERROR,
                    "Can't allocate decompression buffer.\n");
             return AVERROR(ENOMEM);
-        } else
-            c->decomp_buf = ptr;
+        }
         ff_rtjpeg_decode_init(&c->rtj, c->width, c->height, c->lq, c->cq);
         av_frame_unref(c->pic);
+        return 1;
     } else if (quality != c->quality)
         ff_rtjpeg_decode_init(&c->rtj, c->width, c->height, c->lq, c->cq);
 
@@ -156,6 +160,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     AVFrame *picture   = data;
     int orig_size      = buf_size;
     int keyframe, ret;
+    int size_change = 0;
     int result, init_frame = !avctx->frame_number;
     enum {
         NUV_UNCOMPRESSED  = '0',
@@ -184,7 +189,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return orig_size;
     }
 
-    if (buf[0] != 'V' || buf_size < 12) {
+    if (buf_size < 12 || buf[0] != 'V') {
         av_log(avctx, AV_LOG_ERROR, "not a nuv video frame\n");
         return AVERROR_INVALIDDATA;
     }
@@ -201,24 +206,32 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         keyframe = 1;
         break;
     }
+retry:
     // Skip the rest of the frame header.
     buf       = &buf[12];
     buf_size -= 12;
     if (comptype == NUV_RTJPEG_IN_LZO || comptype == NUV_LZO) {
-        int outlen = c->decomp_size - AV_INPUT_BUFFER_PADDING_SIZE;
+        int outlen = c->decomp_size - FFMAX(AV_INPUT_BUFFER_PADDING_SIZE, AV_LZO_OUTPUT_PADDING);
         int inlen  = buf_size;
         if (av_lzo1x_decode(c->decomp_buf, &outlen, buf, &inlen)) {
             av_log(avctx, AV_LOG_ERROR, "error during lzo decompression\n");
             return AVERROR_INVALIDDATA;
         }
         buf      = c->decomp_buf;
-        buf_size = outlen;
+        buf_size = c->decomp_size - FFMAX(AV_INPUT_BUFFER_PADDING_SIZE, AV_LZO_OUTPUT_PADDING) - outlen;
+        memset(c->decomp_buf + buf_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
     }
     if (c->codec_frameheader) {
         int w, h, q;
-        if (buf_size < RTJPEG_HEADER_SIZE || buf[4] != RTJPEG_HEADER_SIZE ||
-            buf[5] != RTJPEG_FILE_VERSION) {
-            av_log(avctx, AV_LOG_ERROR, "invalid nuv video frame\n");
+        if (buf_size < RTJPEG_HEADER_SIZE) {
+            av_log(avctx, AV_LOG_ERROR, "Too small NUV video frame\n");
+            return AVERROR_INVALIDDATA;
+        }
+        // There seem to exist two variants of this header: one starts with 'V'
+        // and 5 bytes unknown, the other matches current MythTV and is 4 bytes size,
+        // 1 byte header size (== 12), 1 byte version (== 0)
+        if (buf[0] != 'V' && AV_RL16(&buf[4]) != 0x000c) {
+            av_log(avctx, AV_LOG_ERROR, "Unknown secondary frame header (wrong codec_tag?)\n");
             return AVERROR_INVALIDDATA;
         }
         w = AV_RL16(&buf[6]);
@@ -226,22 +239,23 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         q = buf[10];
         if ((result = codec_reinit(avctx, w, h, q)) < 0)
             return result;
-        if (comptype == NUV_RTJPEG_IN_LZO || comptype == NUV_LZO)
-            buf = c->decomp_buf;
+        if (result) {
+            buf = avpkt->data;
+            buf_size = avpkt->size;
+            size_change = 1;
+            goto retry;
+        }
         buf       = &buf[RTJPEG_HEADER_SIZE];
         buf_size -= RTJPEG_HEADER_SIZE;
     }
 
-    if (keyframe) {
+    if (size_change || keyframe) {
         av_frame_unref(c->pic);
         init_frame = 1;
     }
 
-    result = ff_reget_buffer(avctx, c->pic);
-    if (result < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((result = ff_reget_buffer(avctx, c->pic)) < 0)
         return result;
-    }
     if (init_frame) {
         memset(c->pic->data[0], 0,    avctx->height * c->pic->linesize[0]);
         memset(c->pic->data[1], 0x80, avctx->height * c->pic->linesize[1] / 2);
@@ -259,7 +273,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             av_log(avctx, AV_LOG_ERROR, "uncompressed frame too short\n");
             height = buf_size / c->width / 3 * 2;
         }
-        copy_frame(c->pic, buf, c->width, height);
+        if(height > 0)
+            copy_frame(c->pic, buf, c->width, height);
         break;
     }
     case NUV_RTJPEG_IN_LZO:
diff --git a/libavcodec/nvenc.c b/libavcodec/nvenc.c
index 54f030b..984dd3b 100644
--- a/libavcodec/nvenc.c
+++ b/libavcodec/nvenc.c
@@ -1,62 +1,57 @@
 /*
- * NVIDIA NVENC Support
- * Copyright (C) 2015 Luca Barbato
+ * H.264 / HEVC hardware encoding using NVIDIA NVENC
+ * Copyright (c) 2014 Timo Rothenpieler <timo@rothenpieler.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "config.h"
 
-#include <cuda.h>
-#include <nvEncodeAPI.h>
-#include <string.h>
-
-#define CUDA_LIBNAME "libcuda.so"
-
-#if HAVE_DLFCN_H
-#include <dlfcn.h>
-
-#define NVENC_LIBNAME "libnvidia-encode.so"
-
-#elif HAVE_WINDOWS_H
+#if defined(_WIN32)
 #include <windows.h>
 
+#define CUDA_LIBNAME TEXT("nvcuda.dll")
 #if ARCH_X86_64
-#define NVENC_LIBNAME "nvEncodeAPI64.dll"
+#define NVENC_LIBNAME TEXT("nvEncodeAPI64.dll")
 #else
-#define NVENC_LIBNAME "nvEncodeAPI.dll"
+#define NVENC_LIBNAME TEXT("nvEncodeAPI.dll")
 #endif
 
 #define dlopen(filename, flags) LoadLibrary((filename))
 #define dlsym(handle, symbol)   GetProcAddress(handle, symbol)
 #define dlclose(handle)         FreeLibrary(handle)
+#else
+#include <dlfcn.h>
+
+#define CUDA_LIBNAME "libcuda.so"
+#define NVENC_LIBNAME "libnvidia-encode.so"
 #endif
 
-#include "libavutil/common.h"
 #include "libavutil/hwcontext.h"
-#include "libavutil/hwcontext_cuda.h"
 #include "libavutil/imgutils.h"
+#include "libavutil/avassert.h"
 #include "libavutil/mem.h"
-#include "avcodec.h"
 #include "internal.h"
 #include "nvenc.h"
 
 #define NVENC_CAP 0x30
-#define BITSTREAM_BUFFER_SIZE 1024 * 1024
+#define IS_CBR(rc) (rc == NV_ENC_PARAMS_RC_CBR ||               \
+                    rc == NV_ENC_PARAMS_RC_2_PASS_QUALITY ||    \
+                    rc == NV_ENC_PARAMS_RC_2_PASS_FRAMESIZE_CAP)
 
 #define LOAD_LIBRARY(l, path)                   \
     do {                                        \
@@ -79,10 +74,12 @@
     } while (0)
 
 const enum AVPixelFormat ff_nvenc_pix_fmts[] = {
-    AV_PIX_FMT_NV12,
     AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_NV12,
     AV_PIX_FMT_YUV444P,
+#if CONFIG_CUDA
     AV_PIX_FMT_CUDA,
+#endif
     AV_PIX_FMT_NONE
 };
 
@@ -135,7 +132,7 @@ static int nvenc_map_error(NVENCSTATUS err, const char **desc)
 }
 
 static int nvenc_print_error(void *log_ctx, NVENCSTATUS err,
-                             const char *error_string)
+                                     const char *error_string)
 {
     const char *desc;
     int ret;
@@ -146,64 +143,66 @@ static int nvenc_print_error(void *log_ctx, NVENCSTATUS err,
 
 static av_cold int nvenc_load_libraries(AVCodecContext *avctx)
 {
-    NVENCContext *ctx         = avctx->priv_data;
-    NVENCLibraryContext *nvel = &ctx->nvel;
+    NvencContext *ctx = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
     PNVENCODEAPICREATEINSTANCE nvenc_create_instance;
     NVENCSTATUS err;
 
 #if CONFIG_CUDA
-    nvel->cu_init                      = cuInit;
-    nvel->cu_device_get_count          = cuDeviceGetCount;
-    nvel->cu_device_get                = cuDeviceGet;
-    nvel->cu_device_get_name           = cuDeviceGetName;
-    nvel->cu_device_compute_capability = cuDeviceComputeCapability;
-    nvel->cu_ctx_create                = cuCtxCreate_v2;
-    nvel->cu_ctx_pop_current           = cuCtxPopCurrent_v2;
-    nvel->cu_ctx_destroy               = cuCtxDestroy_v2;
+    dl_fn->cu_init                      = cuInit;
+    dl_fn->cu_device_get_count          = cuDeviceGetCount;
+    dl_fn->cu_device_get                = cuDeviceGet;
+    dl_fn->cu_device_get_name           = cuDeviceGetName;
+    dl_fn->cu_device_compute_capability = cuDeviceComputeCapability;
+    dl_fn->cu_ctx_create                = cuCtxCreate_v2;
+    dl_fn->cu_ctx_pop_current           = cuCtxPopCurrent_v2;
+    dl_fn->cu_ctx_destroy               = cuCtxDestroy_v2;
 #else
-    LOAD_LIBRARY(nvel->cuda, CUDA_LIBNAME);
+    LOAD_LIBRARY(dl_fn->cuda, CUDA_LIBNAME);
 
-    LOAD_SYMBOL(nvel->cu_init, nvel->cuda, "cuInit");
-    LOAD_SYMBOL(nvel->cu_device_get_count, nvel->cuda, "cuDeviceGetCount");
-    LOAD_SYMBOL(nvel->cu_device_get, nvel->cuda, "cuDeviceGet");
-    LOAD_SYMBOL(nvel->cu_device_get_name, nvel->cuda, "cuDeviceGetName");
-    LOAD_SYMBOL(nvel->cu_device_compute_capability, nvel->cuda,
+    LOAD_SYMBOL(dl_fn->cu_init, dl_fn->cuda, "cuInit");
+    LOAD_SYMBOL(dl_fn->cu_device_get_count, dl_fn->cuda, "cuDeviceGetCount");
+    LOAD_SYMBOL(dl_fn->cu_device_get, dl_fn->cuda, "cuDeviceGet");
+    LOAD_SYMBOL(dl_fn->cu_device_get_name, dl_fn->cuda, "cuDeviceGetName");
+    LOAD_SYMBOL(dl_fn->cu_device_compute_capability, dl_fn->cuda,
                 "cuDeviceComputeCapability");
-    LOAD_SYMBOL(nvel->cu_ctx_create, nvel->cuda, "cuCtxCreate_v2");
-    LOAD_SYMBOL(nvel->cu_ctx_pop_current, nvel->cuda, "cuCtxPopCurrent_v2");
-    LOAD_SYMBOL(nvel->cu_ctx_destroy, nvel->cuda, "cuCtxDestroy_v2");
+    LOAD_SYMBOL(dl_fn->cu_ctx_create, dl_fn->cuda, "cuCtxCreate_v2");
+    LOAD_SYMBOL(dl_fn->cu_ctx_pop_current, dl_fn->cuda, "cuCtxPopCurrent_v2");
+    LOAD_SYMBOL(dl_fn->cu_ctx_destroy, dl_fn->cuda, "cuCtxDestroy_v2");
 #endif
 
-    LOAD_LIBRARY(nvel->nvenc, NVENC_LIBNAME);
+    LOAD_LIBRARY(dl_fn->nvenc, NVENC_LIBNAME);
 
-    LOAD_SYMBOL(nvenc_create_instance, nvel->nvenc,
+    LOAD_SYMBOL(nvenc_create_instance, dl_fn->nvenc,
                 "NvEncodeAPICreateInstance");
 
-    nvel->nvenc_funcs.version = NV_ENCODE_API_FUNCTION_LIST_VER;
+    dl_fn->nvenc_funcs.version = NV_ENCODE_API_FUNCTION_LIST_VER;
 
-    err = nvenc_create_instance(&nvel->nvenc_funcs);
+    err = nvenc_create_instance(&dl_fn->nvenc_funcs);
     if (err != NV_ENC_SUCCESS)
-        return nvenc_print_error(avctx, err, "Cannot create the NVENC instance");
+        return nvenc_print_error(avctx, err, "Failed to create nvenc instance");
+
+    av_log(avctx, AV_LOG_VERBOSE, "Nvenc initialized successfully\n");
 
     return 0;
 }
 
-static int nvenc_open_session(AVCodecContext *avctx)
+static av_cold int nvenc_open_session(AVCodecContext *avctx)
 {
     NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS params = { 0 };
-    NVENCContext *ctx                           = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv             = &ctx->nvel.nvenc_funcs;
-    int ret;
+    NvencContext *ctx = avctx->priv_data;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &ctx->nvenc_dload_funcs.nvenc_funcs;
+    NVENCSTATUS ret;
 
     params.version    = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER;
     params.apiVersion = NVENCAPI_VERSION;
     params.device     = ctx->cu_context;
     params.deviceType = NV_ENC_DEVICE_TYPE_CUDA;
 
-    ret = nv->nvEncOpenEncodeSessionEx(&params, &ctx->nvenc_ctx);
+    ret = p_nvenc->nvEncOpenEncodeSessionEx(&params, &ctx->nvencoder);
     if (ret != NV_ENC_SUCCESS) {
-        ctx->nvenc_ctx = NULL;
-        return nvenc_print_error(avctx, ret, "Cannot open the NVENC Session");
+        ctx->nvencoder = NULL;
+        return nvenc_print_error(avctx, ret, "OpenEncodeSessionEx failed");
     }
 
     return 0;
@@ -211,12 +210,12 @@ static int nvenc_open_session(AVCodecContext *avctx)
 
 static int nvenc_check_codec_support(AVCodecContext *avctx)
 {
-    NVENCContext *ctx               = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
+    NvencContext *ctx = avctx->priv_data;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &ctx->nvenc_dload_funcs.nvenc_funcs;
     int i, ret, count = 0;
     GUID *guids = NULL;
 
-    ret = nv->nvEncGetEncodeGUIDCount(ctx->nvenc_ctx, &count);
+    ret = p_nvenc->nvEncGetEncodeGUIDCount(ctx->nvencoder, &count);
 
     if (ret != NV_ENC_SUCCESS || !count)
         return AVERROR(ENOSYS);
@@ -225,7 +224,7 @@ static int nvenc_check_codec_support(AVCodecContext *avctx)
     if (!guids)
         return AVERROR(ENOMEM);
 
-    ret = nv->nvEncGetEncodeGUIDs(ctx->nvenc_ctx, guids, count, &count);
+    ret = p_nvenc->nvEncGetEncodeGUIDs(ctx->nvencoder, guids, count, &count);
     if (ret != NV_ENC_SUCCESS) {
         ret = AVERROR(ENOSYS);
         goto fail;
@@ -233,7 +232,7 @@ static int nvenc_check_codec_support(AVCodecContext *avctx)
 
     ret = AVERROR(ENOSYS);
     for (i = 0; i < count; i++) {
-        if (!memcmp(&guids[i], &ctx->params.encodeGUID, sizeof(*guids))) {
+        if (!memcmp(&guids[i], &ctx->init_encode_params.encodeGUID, sizeof(*guids))) {
             ret = 0;
             break;
         }
@@ -247,15 +246,15 @@ fail:
 
 static int nvenc_check_cap(AVCodecContext *avctx, NV_ENC_CAPS cap)
 {
-    NVENCContext *ctx               = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
+    NvencContext *ctx = avctx->priv_data;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &ctx->nvenc_dload_funcs.nvenc_funcs;
     NV_ENC_CAPS_PARAM params        = { 0 };
     int ret, val = 0;
 
     params.version     = NV_ENC_CAPS_PARAM_VER;
     params.capsToQuery = cap;
 
-    ret = nv->nvEncGetEncodeCaps(ctx->nvenc_ctx, ctx->params.encodeGUID, &params, &val);
+    ret = p_nvenc->nvEncGetEncodeCaps(ctx->nvencoder, ctx->init_encode_params.encodeGUID, &params, &val);
 
     if (ret == NV_ENC_SUCCESS)
         return val;
@@ -264,7 +263,7 @@ static int nvenc_check_cap(AVCodecContext *avctx, NV_ENC_CAPS cap)
 
 static int nvenc_check_capabilities(AVCodecContext *avctx)
 {
-    NVENCContext *ctx = avctx->priv_data;
+    NvencContext *ctx = avctx->priv_data;
     int ret;
 
     ret = nvenc_check_codec_support(avctx);
@@ -279,6 +278,12 @@ static int nvenc_check_capabilities(AVCodecContext *avctx)
         return AVERROR(ENOSYS);
     }
 
+    ret = nvenc_check_cap(avctx, NV_ENC_CAPS_SUPPORT_LOSSLESS_ENCODE);
+    if (ctx->preset >= PRESET_LOSSLESS_DEFAULT && ret <= 0) {
+        av_log(avctx, AV_LOG_VERBOSE, "Lossless encoding not supported\n");
+        return AVERROR(ENOSYS);
+    }
+
     ret = nvenc_check_cap(avctx, NV_ENC_CAPS_WIDTH_MAX);
     if (ret < avctx->width) {
         av_log(avctx, AV_LOG_VERBOSE, "Width %d exceeds %d\n",
@@ -301,15 +306,25 @@ static int nvenc_check_capabilities(AVCodecContext *avctx)
         return AVERROR(ENOSYS);
     }
 
+    ret = nvenc_check_cap(avctx, NV_ENC_CAPS_SUPPORT_FIELD_ENCODING);
+    if (ret < 1 && avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT) {
+        av_log(avctx, AV_LOG_VERBOSE,
+               "Interlaced encoding is not supported. Supported level: %d\n",
+               ret);
+        return AVERROR(ENOSYS);
+    }
+
     return 0;
 }
 
-static int nvenc_check_device(AVCodecContext *avctx, int idx)
+static av_cold int nvenc_check_device(AVCodecContext *avctx, int idx)
 {
-    NVENCContext *ctx               = avctx->priv_data;
-    NVENCLibraryContext *nvel       = &ctx->nvel;
-    char name[128]                  = { 0 };
+    NvencContext *ctx = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
+    char name[128] = { 0};
     int major, minor, ret;
+    CUresult cu_res;
     CUdevice cu_device;
     CUcontext dummy;
     int loglevel = AV_LOG_VERBOSE;
@@ -317,36 +332,41 @@ static int nvenc_check_device(AVCodecContext *avctx, int idx)
     if (ctx->device == LIST_DEVICES)
         loglevel = AV_LOG_INFO;
 
-    ret = nvel->cu_device_get(&cu_device, idx);
-    if (ret != CUDA_SUCCESS) {
+    cu_res = dl_fn->cu_device_get(&cu_device, idx);
+    if (cu_res != CUDA_SUCCESS) {
         av_log(avctx, AV_LOG_ERROR,
                "Cannot access the CUDA device %d\n",
                idx);
         return -1;
     }
 
-    ret = nvel->cu_device_get_name(name, sizeof(name), cu_device);
-    if (ret != CUDA_SUCCESS)
+    cu_res = dl_fn->cu_device_get_name(name, sizeof(name), cu_device);
+    if (cu_res != CUDA_SUCCESS)
         return -1;
 
-    ret = nvel->cu_device_compute_capability(&major, &minor, cu_device);
-    if (ret != CUDA_SUCCESS)
+    cu_res = dl_fn->cu_device_compute_capability(&major, &minor, cu_device);
+    if (cu_res != CUDA_SUCCESS)
         return -1;
 
-    av_log(avctx, loglevel, "Device %d [%s] ", cu_device, name);
-
-    if (((major << 4) | minor) < NVENC_CAP)
+    av_log(avctx, loglevel, "[ GPU #%d - < %s > has Compute SM %d.%d ]\n", idx, name, major, minor);
+    if (((major << 4) | minor) < NVENC_CAP) {
+        av_log(avctx, loglevel, "does not support NVENC\n");
         goto fail;
+    }
 
-    ret = nvel->cu_ctx_create(&ctx->cu_context_internal, 0, cu_device);
-    if (ret != CUDA_SUCCESS)
+    cu_res = dl_fn->cu_ctx_create(&ctx->cu_context_internal, 0, cu_device);
+    if (cu_res != CUDA_SUCCESS) {
+        av_log(avctx, AV_LOG_FATAL, "Failed creating CUDA context for NVENC: 0x%x\n", (int)cu_res);
         goto fail;
+    }
 
     ctx->cu_context = ctx->cu_context_internal;
 
-    ret = nvel->cu_ctx_pop_current(&dummy);
-    if (ret != CUDA_SUCCESS)
+    cu_res = dl_fn->cu_ctx_pop_current(&dummy);
+    if (cu_res != CUDA_SUCCESS) {
+        av_log(avctx, AV_LOG_FATAL, "Failed popping CUDA context: 0x%x\n", (int)cu_res);
         goto fail2;
+    }
 
     if ((ret = nvenc_open_session(avctx)) < 0)
         goto fail2;
@@ -356,42 +376,41 @@ static int nvenc_check_device(AVCodecContext *avctx, int idx)
 
     av_log(avctx, loglevel, "supports NVENC\n");
 
-    if (ctx->device == cu_device || ctx->device == ANY_DEVICE)
+    dl_fn->nvenc_device_count++;
+
+    if (ctx->device == dl_fn->nvenc_device_count - 1 || ctx->device == ANY_DEVICE)
         return 0;
 
 fail3:
-    nvel->nvenc_funcs.nvEncDestroyEncoder(ctx->nvenc_ctx);
-    ctx->nvenc_ctx = NULL;
+    p_nvenc->nvEncDestroyEncoder(ctx->nvencoder);
+    ctx->nvencoder = NULL;
 
 fail2:
-    nvel->cu_ctx_destroy(ctx->cu_context_internal);
+    dl_fn->cu_ctx_destroy(ctx->cu_context_internal);
     ctx->cu_context_internal = NULL;
 
 fail:
-    if (ret != 0)
-        av_log(avctx, loglevel, "does not support NVENC (major %d minor %d)\n",
-               major, minor);
-
     return AVERROR(ENOSYS);
 }
 
-static int nvenc_setup_device(AVCodecContext *avctx)
+static av_cold int nvenc_setup_device(AVCodecContext *avctx)
 {
-    NVENCContext *ctx         = avctx->priv_data;
-    NVENCLibraryContext *nvel = &ctx->nvel;
+    NvencContext *ctx = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
 
     switch (avctx->codec->id) {
     case AV_CODEC_ID_H264:
-        ctx->params.encodeGUID = NV_ENC_CODEC_H264_GUID;
+        ctx->init_encode_params.encodeGUID = NV_ENC_CODEC_H264_GUID;
         break;
     case AV_CODEC_ID_HEVC:
-        ctx->params.encodeGUID = NV_ENC_CODEC_HEVC_GUID;
+        ctx->init_encode_params.encodeGUID = NV_ENC_CODEC_HEVC_GUID;
         break;
     default:
         return AVERROR_BUG;
     }
 
     if (avctx->pix_fmt == AV_PIX_FMT_CUDA) {
+#if CONFIG_CUDA
         AVHWFramesContext   *frames_ctx;
         AVCUDADeviceContext *device_hwctx;
         int ret;
@@ -409,24 +428,36 @@ static int nvenc_setup_device(AVCodecContext *avctx)
             return ret;
 
         ret = nvenc_check_capabilities(avctx);
-        if (ret < 0)
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_FATAL, "Provided device doesn't support required NVENC features\n");
             return ret;
+        }
+#else
+        return AVERROR_BUG;
+#endif
     } else {
         int i, nb_devices = 0;
 
-        if ((nvel->cu_init(0)) != CUDA_SUCCESS) {
+        if ((dl_fn->cu_init(0)) != CUDA_SUCCESS) {
             av_log(avctx, AV_LOG_ERROR,
                    "Cannot init CUDA\n");
             return AVERROR_UNKNOWN;
         }
 
-        if ((nvel->cu_device_get_count(&nb_devices)) != CUDA_SUCCESS) {
+        if ((dl_fn->cu_device_get_count(&nb_devices)) != CUDA_SUCCESS) {
             av_log(avctx, AV_LOG_ERROR,
                    "Cannot enumerate the CUDA devices\n");
             return AVERROR_UNKNOWN;
         }
 
+        if (!nb_devices) {
+            av_log(avctx, AV_LOG_FATAL, "No CUDA capable devices found\n");
+                return AVERROR_EXTERNAL;
+        }
+
+        av_log(avctx, AV_LOG_VERBOSE, "%d CUDA capable devices found\n", nb_devices);
 
+        dl_fn->nvenc_device_count = 0;
         for (i = 0; i < nb_devices; ++i) {
             if ((nvenc_check_device(avctx, i)) >= 0 && ctx->device != LIST_DEVICES)
                 return 0;
@@ -435,7 +466,13 @@ static int nvenc_setup_device(AVCodecContext *avctx)
         if (ctx->device == LIST_DEVICES)
             return AVERROR_EXIT;
 
-        return AVERROR(ENOSYS);
+        if (!dl_fn->nvenc_device_count) {
+            av_log(avctx, AV_LOG_FATAL, "No NVENC capable devices found\n");
+            return AVERROR_EXTERNAL;
+        }
+
+        av_log(avctx, AV_LOG_FATAL, "Requested GPU %d, but only %d GPUs are available!\n", ctx->device, dl_fn->nvenc_device_count);
+        return AVERROR(EINVAL);
     }
 
     return 0;
@@ -446,68 +483,116 @@ typedef struct GUIDTuple {
     int flags;
 } GUIDTuple;
 
-static int nvec_map_preset(NVENCContext *ctx)
+static void nvenc_map_preset(NvencContext *ctx)
 {
     GUIDTuple presets[] = {
         { NV_ENC_PRESET_DEFAULT_GUID },
+        { NV_ENC_PRESET_HQ_GUID,                  NVENC_TWO_PASSES }, /* slow */
+        { NV_ENC_PRESET_HQ_GUID,                  NVENC_ONE_PASS }, /* medium */
+        { NV_ENC_PRESET_HP_GUID,                  NVENC_ONE_PASS }, /* fast */
         { NV_ENC_PRESET_HP_GUID },
         { NV_ENC_PRESET_HQ_GUID },
         { NV_ENC_PRESET_BD_GUID },
         { NV_ENC_PRESET_LOW_LATENCY_DEFAULT_GUID, NVENC_LOWLATENCY },
-        { NV_ENC_PRESET_LOW_LATENCY_HP_GUID,      NVENC_LOWLATENCY },
         { NV_ENC_PRESET_LOW_LATENCY_HQ_GUID,      NVENC_LOWLATENCY },
+        { NV_ENC_PRESET_LOW_LATENCY_HP_GUID,      NVENC_LOWLATENCY },
         { NV_ENC_PRESET_LOSSLESS_DEFAULT_GUID,    NVENC_LOSSLESS },
         { NV_ENC_PRESET_LOSSLESS_HP_GUID,         NVENC_LOSSLESS },
-        { { 0 } }
     };
 
     GUIDTuple *t = &presets[ctx->preset];
 
-    ctx->params.presetGUID = t->guid;
-    ctx->flags             = t->flags;
-
-    return AVERROR(EINVAL);
+    ctx->init_encode_params.presetGUID = t->guid;
+    ctx->flags = t->flags;
 }
 
-static void set_constqp(AVCodecContext *avctx, NV_ENC_RC_PARAMS *rc)
+static av_cold void set_constqp(AVCodecContext *avctx)
 {
+    NvencContext *ctx = avctx->priv_data;
+    NV_ENC_RC_PARAMS *rc = &ctx->encode_config.rcParams;
+
     rc->rateControlMode = NV_ENC_PARAMS_RC_CONSTQP;
     rc->constQP.qpInterB = avctx->global_quality;
     rc->constQP.qpInterP = avctx->global_quality;
-    rc->constQP.qpIntra  = avctx->global_quality;
+    rc->constQP.qpIntra = avctx->global_quality;
+
+    avctx->qmin = -1;
+    avctx->qmax = -1;
 }
 
-static void set_vbr(AVCodecContext *avctx, NV_ENC_RC_PARAMS *rc)
+static av_cold void set_vbr(AVCodecContext *avctx)
 {
-    if (avctx->qmin >= 0) {
-        rc->enableMinQP    = 1;
+    NvencContext *ctx = avctx->priv_data;
+    NV_ENC_RC_PARAMS *rc = &ctx->encode_config.rcParams;
+    int qp_inter_p;
+
+    if (avctx->qmin >= 0 && avctx->qmax >= 0) {
+        rc->enableMinQP = 1;
+        rc->enableMaxQP = 1;
+
         rc->minQP.qpInterB = avctx->qmin;
         rc->minQP.qpInterP = avctx->qmin;
-        rc->minQP.qpIntra  = avctx->qmin;
-    }
+        rc->minQP.qpIntra = avctx->qmin;
 
-    if (avctx->qmax >= 0) {
-        rc->enableMaxQP = 1;
         rc->maxQP.qpInterB = avctx->qmax;
         rc->maxQP.qpInterP = avctx->qmax;
-        rc->maxQP.qpIntra  = avctx->qmax;
+        rc->maxQP.qpIntra = avctx->qmax;
+
+        qp_inter_p = (avctx->qmax + 3 * avctx->qmin) / 4; // biased towards Qmin
+    } else if (avctx->qmin >= 0) {
+        rc->enableMinQP = 1;
+
+        rc->minQP.qpInterB = avctx->qmin;
+        rc->minQP.qpInterP = avctx->qmin;
+        rc->minQP.qpIntra = avctx->qmin;
+
+        qp_inter_p = avctx->qmin;
+    } else {
+        qp_inter_p = 26; // default to 26
+    }
+
+    rc->enableInitialRCQP = 1;
+    rc->initialRCQP.qpInterP  = qp_inter_p;
+
+    if (avctx->i_quant_factor != 0.0 && avctx->b_quant_factor != 0.0) {
+        rc->initialRCQP.qpIntra = av_clip(
+            qp_inter_p * fabs(avctx->i_quant_factor) + avctx->i_quant_offset, 0, 51);
+        rc->initialRCQP.qpInterB = av_clip(
+            qp_inter_p * fabs(avctx->b_quant_factor) + avctx->b_quant_offset, 0, 51);
+    } else {
+        rc->initialRCQP.qpIntra = qp_inter_p;
+        rc->initialRCQP.qpInterB = qp_inter_p;
     }
 }
 
-static void nvenc_override_rate_control(AVCodecContext *avctx,
-                                        NV_ENC_RC_PARAMS *rc)
+static av_cold void set_lossless(AVCodecContext *avctx)
+{
+    NvencContext *ctx = avctx->priv_data;
+    NV_ENC_RC_PARAMS *rc = &ctx->encode_config.rcParams;
+
+    rc->rateControlMode = NV_ENC_PARAMS_RC_CONSTQP;
+    rc->constQP.qpInterB = 0;
+    rc->constQP.qpInterP = 0;
+    rc->constQP.qpIntra = 0;
+
+    avctx->qmin = -1;
+    avctx->qmax = -1;
+}
+
+static void nvenc_override_rate_control(AVCodecContext *avctx)
 {
-    NVENCContext *ctx    = avctx->priv_data;
+    NvencContext *ctx    = avctx->priv_data;
+    NV_ENC_RC_PARAMS *rc = &ctx->encode_config.rcParams;
 
     switch (ctx->rc) {
     case NV_ENC_PARAMS_RC_CONSTQP:
-        if (avctx->global_quality < 0) {
+        if (avctx->global_quality <= 0) {
             av_log(avctx, AV_LOG_WARNING,
                    "The constant quality rate-control requires "
                    "the 'global_quality' option set.\n");
             return;
         }
-        set_constqp(avctx, rc);
+        set_constqp(avctx);
         return;
     case NV_ENC_PARAMS_RC_2_PASS_VBR:
     case NV_ENC_PARAMS_RC_VBR:
@@ -515,6 +600,7 @@ static void nvenc_override_rate_control(AVCodecContext *avctx,
             av_log(avctx, AV_LOG_WARNING,
                    "The variable bitrate rate-control requires "
                    "the 'qmin' and/or 'qmax' option set.\n");
+            set_vbr(avctx);
             return;
         }
     case NV_ENC_PARAMS_RC_VBR_MINQP:
@@ -522,415 +608,555 @@ static void nvenc_override_rate_control(AVCodecContext *avctx,
             av_log(avctx, AV_LOG_WARNING,
                    "The variable bitrate rate-control requires "
                    "the 'qmin' option set.\n");
+            set_vbr(avctx);
             return;
         }
-        set_vbr(avctx, rc);
+        set_vbr(avctx);
         break;
     case NV_ENC_PARAMS_RC_CBR:
-        break;
     case NV_ENC_PARAMS_RC_2_PASS_QUALITY:
     case NV_ENC_PARAMS_RC_2_PASS_FRAMESIZE_CAP:
-        if (!(ctx->flags & NVENC_LOWLATENCY)) {
-            av_log(avctx, AV_LOG_WARNING,
-                   "The multipass rate-control requires "
-                   "a low-latency preset.\n");
-            return;
-        }
+        break;
     }
 
     rc->rateControlMode = ctx->rc;
 }
 
-static void nvenc_setup_rate_control(AVCodecContext *avctx)
+static av_cold void nvenc_setup_rate_control(AVCodecContext *avctx)
 {
-    NVENCContext *ctx    = avctx->priv_data;
-    NV_ENC_RC_PARAMS *rc = &ctx->config.rcParams;
+    NvencContext *ctx = avctx->priv_data;
 
-    if (avctx->bit_rate > 0)
-        rc->averageBitRate = avctx->bit_rate;
+    if (avctx->bit_rate > 0) {
+        ctx->encode_config.rcParams.averageBitRate = avctx->bit_rate;
+    } else if (ctx->encode_config.rcParams.averageBitRate > 0) {
+        ctx->encode_config.rcParams.maxBitRate = ctx->encode_config.rcParams.averageBitRate;
+    }
 
     if (avctx->rc_max_rate > 0)
-        rc->maxBitRate = avctx->rc_max_rate;
-
-    if (ctx->rc > 0) {
-        nvenc_override_rate_control(avctx, rc);
-    } else if (avctx->global_quality > 0) {
-        set_constqp(avctx, rc);
-    } else if (avctx->qmin >= 0 && avctx->qmax >= 0) {
-        rc->rateControlMode = NV_ENC_PARAMS_RC_VBR;
-        set_vbr(avctx, rc);
+        ctx->encode_config.rcParams.maxBitRate = avctx->rc_max_rate;
+
+    if (ctx->rc < 0) {
+        if (ctx->flags & NVENC_ONE_PASS)
+            ctx->twopass = 0;
+        if (ctx->flags & NVENC_TWO_PASSES)
+            ctx->twopass = 1;
+
+        if (ctx->twopass < 0)
+            ctx->twopass = (ctx->flags & NVENC_LOWLATENCY) != 0;
+
+        if (ctx->cbr) {
+            if (ctx->twopass) {
+                ctx->rc = NV_ENC_PARAMS_RC_2_PASS_QUALITY;
+            } else {
+                ctx->rc = NV_ENC_PARAMS_RC_CBR;
+            }
+        } else if (avctx->global_quality > 0) {
+            ctx->rc = NV_ENC_PARAMS_RC_CONSTQP;
+        } else if (ctx->twopass) {
+            ctx->rc = NV_ENC_PARAMS_RC_2_PASS_VBR;
+        } else if (avctx->qmin >= 0 && avctx->qmax >= 0) {
+            ctx->rc = NV_ENC_PARAMS_RC_VBR_MINQP;
+        }
     }
 
-    if (avctx->rc_buffer_size > 0)
-        rc->vbvBufferSize = avctx->rc_buffer_size;
+    if (ctx->flags & NVENC_LOSSLESS) {
+        set_lossless(avctx);
+    } else if (ctx->rc >= 0) {
+        nvenc_override_rate_control(avctx);
+    } else {
+        ctx->encode_config.rcParams.rateControlMode = NV_ENC_PARAMS_RC_VBR;
+        set_vbr(avctx);
+    }
 
-    if (rc->averageBitRate > 0)
-        avctx->bit_rate = rc->averageBitRate;
+    if (avctx->rc_buffer_size > 0) {
+        ctx->encode_config.rcParams.vbvBufferSize = avctx->rc_buffer_size;
+    } else if (ctx->encode_config.rcParams.averageBitRate > 0) {
+        ctx->encode_config.rcParams.vbvBufferSize = 2 * ctx->encode_config.rcParams.averageBitRate;
+    }
 }
 
-static int nvenc_setup_h264_config(AVCodecContext *avctx)
+static av_cold int nvenc_setup_h264_config(AVCodecContext *avctx)
 {
-    NVENCContext *ctx                      = avctx->priv_data;
-    NV_ENC_CONFIG *cc                      = &ctx->config;
+    NvencContext *ctx                      = avctx->priv_data;
+    NV_ENC_CONFIG *cc                      = &ctx->encode_config;
     NV_ENC_CONFIG_H264 *h264               = &cc->encodeCodecConfig.h264Config;
     NV_ENC_CONFIG_H264_VUI_PARAMETERS *vui = &h264->h264VUIParameters;
 
-    vui->colourDescriptionPresentFlag = 1;
-    vui->videoSignalTypePresentFlag   = 1;
-
-    vui->colourMatrix            = avctx->colorspace;
-    vui->colourPrimaries         = avctx->color_primaries;
+    vui->colourMatrix = avctx->colorspace;
+    vui->colourPrimaries = avctx->color_primaries;
     vui->transferCharacteristics = avctx->color_trc;
+    vui->videoFullRangeFlag = (avctx->color_range == AVCOL_RANGE_JPEG
+        || ctx->data_pix_fmt == AV_PIX_FMT_YUVJ420P || ctx->data_pix_fmt == AV_PIX_FMT_YUVJ422P || ctx->data_pix_fmt == AV_PIX_FMT_YUVJ444P);
+
+    vui->colourDescriptionPresentFlag =
+        (avctx->colorspace != 2 || avctx->color_primaries != 2 || avctx->color_trc != 2);
 
-    vui->videoFullRangeFlag = avctx->color_range == AVCOL_RANGE_JPEG;
+    vui->videoSignalTypePresentFlag =
+        (vui->colourDescriptionPresentFlag
+        || vui->videoFormat != 5
+        || vui->videoFullRangeFlag != 0);
+
+    h264->sliceMode = 3;
+    h264->sliceModeData = 1;
 
     h264->disableSPSPPS = (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) ? 1 : 0;
     h264->repeatSPSPPS  = (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) ? 0 : 1;
+    h264->outputAUD     = 1;
 
-    h264->maxNumRefFrames = avctx->refs;
-    h264->idrPeriod       = cc->gopLength;
+    if (avctx->refs >= 0) {
+        /* 0 means "let the hardware decide" */
+        h264->maxNumRefFrames = avctx->refs;
+    }
+    if (avctx->gop_size >= 0) {
+        h264->idrPeriod = cc->gopLength;
+    }
 
-    if (ctx->profile)
-        avctx->profile = ctx->profile;
+    if (IS_CBR(cc->rcParams.rateControlMode)) {
+        h264->outputBufferingPeriodSEI = 1;
+        h264->outputPictureTimingSEI   = 1;
+    }
 
-    if (ctx->data_pix_fmt == AV_PIX_FMT_YUV444P)
-        h264->chromaFormatIDC = 3;
-    else
-        h264->chromaFormatIDC = 1;
+    if (cc->rcParams.rateControlMode == NV_ENC_PARAMS_RC_2_PASS_QUALITY ||
+        cc->rcParams.rateControlMode == NV_ENC_PARAMS_RC_2_PASS_FRAMESIZE_CAP ||
+        cc->rcParams.rateControlMode == NV_ENC_PARAMS_RC_2_PASS_VBR) {
+        h264->adaptiveTransformMode = NV_ENC_H264_ADAPTIVE_TRANSFORM_ENABLE;
+        h264->fmoMode = NV_ENC_H264_FMO_DISABLE;
+    }
 
-    switch (ctx->profile) {
-    case NV_ENC_H264_PROFILE_BASELINE:
-        cc->profileGUID = NV_ENC_H264_PROFILE_BASELINE_GUID;
-        break;
-    case NV_ENC_H264_PROFILE_MAIN:
-        cc->profileGUID = NV_ENC_H264_PROFILE_MAIN_GUID;
-        break;
-    case NV_ENC_H264_PROFILE_HIGH:
-        cc->profileGUID = NV_ENC_H264_PROFILE_HIGH_GUID;
-        break;
-    case NV_ENC_H264_PROFILE_HIGH_444:
+    if (ctx->flags & NVENC_LOSSLESS) {
+        h264->qpPrimeYZeroTransformBypassFlag = 1;
+    } else {
+        switch(ctx->profile) {
+        case NV_ENC_H264_PROFILE_BASELINE:
+            cc->profileGUID = NV_ENC_H264_PROFILE_BASELINE_GUID;
+            avctx->profile = FF_PROFILE_H264_BASELINE;
+            break;
+        case NV_ENC_H264_PROFILE_MAIN:
+            cc->profileGUID = NV_ENC_H264_PROFILE_MAIN_GUID;
+            avctx->profile = FF_PROFILE_H264_MAIN;
+            break;
+        case NV_ENC_H264_PROFILE_HIGH:
+            cc->profileGUID = NV_ENC_H264_PROFILE_HIGH_GUID;
+            avctx->profile = FF_PROFILE_H264_HIGH;
+            break;
+        case NV_ENC_H264_PROFILE_HIGH_444P:
+            cc->profileGUID = NV_ENC_H264_PROFILE_HIGH_444_GUID;
+            avctx->profile = FF_PROFILE_H264_HIGH_444_PREDICTIVE;
+            break;
+        }
+    }
+
+    // force setting profile as high444p if input is AV_PIX_FMT_YUV444P
+    if (ctx->data_pix_fmt == AV_PIX_FMT_YUV444P) {
         cc->profileGUID = NV_ENC_H264_PROFILE_HIGH_444_GUID;
-        break;
-    case NV_ENC_H264_PROFILE_CONSTRAINED_HIGH:
-        cc->profileGUID = NV_ENC_H264_PROFILE_CONSTRAINED_HIGH_GUID;
-        break;
+        avctx->profile = FF_PROFILE_H264_HIGH_444_PREDICTIVE;
     }
 
+    h264->chromaFormatIDC = avctx->profile == FF_PROFILE_H264_HIGH_444_PREDICTIVE ? 3 : 1;
+
     h264->level = ctx->level;
 
     return 0;
 }
 
-static int nvenc_setup_hevc_config(AVCodecContext *avctx)
+static av_cold int nvenc_setup_hevc_config(AVCodecContext *avctx)
 {
-    NVENCContext *ctx                      = avctx->priv_data;
-    NV_ENC_CONFIG *cc                      = &ctx->config;
+    NvencContext *ctx                      = avctx->priv_data;
+    NV_ENC_CONFIG *cc                      = &ctx->encode_config;
     NV_ENC_CONFIG_HEVC *hevc               = &cc->encodeCodecConfig.hevcConfig;
+    NV_ENC_CONFIG_HEVC_VUI_PARAMETERS *vui = &hevc->hevcVUIParameters;
+
+    vui->colourMatrix = avctx->colorspace;
+    vui->colourPrimaries = avctx->color_primaries;
+    vui->transferCharacteristics = avctx->color_trc;
+    vui->videoFullRangeFlag = (avctx->color_range == AVCOL_RANGE_JPEG
+        || ctx->data_pix_fmt == AV_PIX_FMT_YUVJ420P || ctx->data_pix_fmt == AV_PIX_FMT_YUVJ422P || ctx->data_pix_fmt == AV_PIX_FMT_YUVJ444P);
+
+    vui->colourDescriptionPresentFlag =
+        (avctx->colorspace != 2 || avctx->color_primaries != 2 || avctx->color_trc != 2);
+
+    vui->videoSignalTypePresentFlag =
+        (vui->colourDescriptionPresentFlag
+        || vui->videoFormat != 5
+        || vui->videoFullRangeFlag != 0);
+
+    hevc->sliceMode = 3;
+    hevc->sliceModeData = 1;
 
     hevc->disableSPSPPS = (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) ? 1 : 0;
     hevc->repeatSPSPPS  = (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) ? 0 : 1;
+    hevc->outputAUD     = 1;
 
-    hevc->maxNumRefFramesInDPB = avctx->refs;
-    hevc->idrPeriod            = cc->gopLength;
+    if (avctx->refs >= 0) {
+        /* 0 means "let the hardware decide" */
+        hevc->maxNumRefFramesInDPB = avctx->refs;
+    }
+    if (avctx->gop_size >= 0) {
+        hevc->idrPeriod = cc->gopLength;
+    }
+
+    if (IS_CBR(cc->rcParams.rateControlMode)) {
+        hevc->outputBufferingPeriodSEI = 1;
+        hevc->outputPictureTimingSEI   = 1;
+    }
 
     /* No other profile is supported in the current SDK version 5 */
     cc->profileGUID = NV_ENC_HEVC_PROFILE_MAIN_GUID;
-    avctx->profile  = FF_PROFILE_HEVC_MAIN;
+    avctx->profile = FF_PROFILE_HEVC_MAIN;
 
-    if (ctx->level) {
-        hevc->level = ctx->level;
-    } else {
-        hevc->level = NV_ENC_LEVEL_AUTOSELECT;
-    }
+    hevc->level = ctx->level;
 
-    if (ctx->tier) {
-        hevc->tier = ctx->tier;
-    }
+    hevc->tier = ctx->tier;
 
     return 0;
 }
-static int nvenc_setup_codec_config(AVCodecContext *avctx)
+
+static av_cold int nvenc_setup_codec_config(AVCodecContext *avctx)
 {
     switch (avctx->codec->id) {
     case AV_CODEC_ID_H264:
         return nvenc_setup_h264_config(avctx);
     case AV_CODEC_ID_HEVC:
         return nvenc_setup_hevc_config(avctx);
+    /* Earlier switch/case will return if unknown codec is passed. */
     }
+
     return 0;
 }
 
-static int nvenc_setup_encoder(AVCodecContext *avctx)
+static av_cold int nvenc_setup_encoder(AVCodecContext *avctx)
 {
-    NVENCContext *ctx               = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
-    NV_ENC_PRESET_CONFIG preset_cfg = { 0 };
+    NvencContext *ctx = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
+
+    NV_ENC_PRESET_CONFIG preset_config = { 0 };
+    NVENCSTATUS nv_status = NV_ENC_SUCCESS;
     AVCPBProperties *cpb_props;
-    int ret;
+    int res = 0;
+    int dw, dh;
 
-    ctx->params.version = NV_ENC_INITIALIZE_PARAMS_VER;
+    ctx->encode_config.version = NV_ENC_CONFIG_VER;
+    ctx->init_encode_params.version = NV_ENC_INITIALIZE_PARAMS_VER;
 
-    ctx->params.encodeHeight = avctx->height;
-    ctx->params.encodeWidth  = avctx->width;
+    ctx->init_encode_params.encodeHeight = avctx->height;
+    ctx->init_encode_params.encodeWidth = avctx->width;
 
-    if (avctx->sample_aspect_ratio.num &&
-        avctx->sample_aspect_ratio.den &&
-        (avctx->sample_aspect_ratio.num != 1 ||
-         avctx->sample_aspect_ratio.den != 1)) {
-        av_reduce(&ctx->params.darWidth,
-                  &ctx->params.darHeight,
-                  avctx->width * avctx->sample_aspect_ratio.num,
-                  avctx->height * avctx->sample_aspect_ratio.den,
-                  INT_MAX / 8);
-    } else {
-        ctx->params.darHeight = avctx->height;
-        ctx->params.darWidth  = avctx->width;
-    }
+    ctx->init_encode_params.encodeConfig = &ctx->encode_config;
+
+    nvenc_map_preset(ctx);
 
-    ctx->params.frameRateNum = avctx->time_base.den;
-    ctx->params.frameRateDen = avctx->time_base.num * avctx->ticks_per_frame;
+    preset_config.version = NV_ENC_PRESET_CONFIG_VER;
+    preset_config.presetCfg.version = NV_ENC_CONFIG_VER;
 
-    ctx->params.enableEncodeAsync = 0;
-    ctx->params.enablePTD         = 1;
+    nv_status = p_nvenc->nvEncGetEncodePresetConfig(ctx->nvencoder,
+                                                    ctx->init_encode_params.encodeGUID,
+                                                    ctx->init_encode_params.presetGUID,
+                                                    &preset_config);
+    if (nv_status != NV_ENC_SUCCESS)
+        return nvenc_print_error(avctx, nv_status, "Cannot get the preset configuration");
 
-    ctx->params.encodeConfig = &ctx->config;
+    memcpy(&ctx->encode_config, &preset_config.presetCfg, sizeof(ctx->encode_config));
 
-    nvec_map_preset(ctx);
+    ctx->encode_config.version = NV_ENC_CONFIG_VER;
 
-    preset_cfg.version           = NV_ENC_PRESET_CONFIG_VER;
-    preset_cfg.presetCfg.version = NV_ENC_CONFIG_VER;
+    if (avctx->sample_aspect_ratio.num && avctx->sample_aspect_ratio.den &&
+        (avctx->sample_aspect_ratio.num != 1 || avctx->sample_aspect_ratio.den != 1)) {
+        av_reduce(&dw, &dh,
+                  avctx->width * avctx->sample_aspect_ratio.num,
+                  avctx->height * avctx->sample_aspect_ratio.den,
+                  1024 * 1024);
+        ctx->init_encode_params.darHeight = dh;
+        ctx->init_encode_params.darWidth = dw;
+    } else {
+        ctx->init_encode_params.darHeight = avctx->height;
+        ctx->init_encode_params.darWidth = avctx->width;
+    }
 
-    ret = nv->nvEncGetEncodePresetConfig(ctx->nvenc_ctx,
-                                         ctx->params.encodeGUID,
-                                         ctx->params.presetGUID,
-                                         &preset_cfg);
-    if (ret != NV_ENC_SUCCESS)
-        return nvenc_print_error(avctx, ret, "Cannot get the preset configuration");
+    // De-compensate for hardware, dubiously, trying to compensate for
+    // playback at 704 pixel width.
+    if (avctx->width == 720 &&
+        (avctx->height == 480 || avctx->height == 576)) {
+        av_reduce(&dw, &dh,
+                  ctx->init_encode_params.darWidth * 44,
+                  ctx->init_encode_params.darHeight * 45,
+                  1024 * 1024);
+        ctx->init_encode_params.darHeight = dh;
+        ctx->init_encode_params.darWidth = dw;
+    }
 
-    memcpy(&ctx->config, &preset_cfg.presetCfg, sizeof(ctx->config));
+    ctx->init_encode_params.frameRateNum = avctx->time_base.den;
+    ctx->init_encode_params.frameRateDen = avctx->time_base.num * avctx->ticks_per_frame;
 
-    ctx->config.version = NV_ENC_CONFIG_VER;
+    ctx->init_encode_params.enableEncodeAsync = 0;
+    ctx->init_encode_params.enablePTD = 1;
 
     if (avctx->gop_size > 0) {
-        if (avctx->max_b_frames > 0) {
-            /* 0 is intra-only,
-             * 1 is I/P only,
-             * 2 is one B-Frame,
-             * 3 two B-frames, and so on. */
-            ctx->config.frameIntervalP = avctx->max_b_frames + 1;
-        } else if (avctx->max_b_frames == 0) {
-            ctx->config.frameIntervalP = 1;
+        if (avctx->max_b_frames >= 0) {
+            /* 0 is intra-only, 1 is I/P only, 2 is one B-Frame, 3 two B-frames, and so on. */
+            ctx->encode_config.frameIntervalP = avctx->max_b_frames + 1;
         }
-        ctx->config.gopLength = avctx->gop_size;
+
+        ctx->encode_config.gopLength = avctx->gop_size;
     } else if (avctx->gop_size == 0) {
-        ctx->config.frameIntervalP = 0;
-        ctx->config.gopLength      = 1;
+        ctx->encode_config.frameIntervalP = 0;
+        ctx->encode_config.gopLength = 1;
     }
 
-    if (ctx->config.frameIntervalP > 1)
-        avctx->max_b_frames = ctx->config.frameIntervalP - 1;
-
     ctx->initial_pts[0] = AV_NOPTS_VALUE;
     ctx->initial_pts[1] = AV_NOPTS_VALUE;
 
     nvenc_setup_rate_control(avctx);
 
     if (avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT) {
-        ctx->config.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FIELD;
+        ctx->encode_config.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FIELD;
     } else {
-        ctx->config.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FRAME;
+        ctx->encode_config.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FRAME;
     }
 
-    if ((ret = nvenc_setup_codec_config(avctx)) < 0)
-        return ret;
+    res = nvenc_setup_codec_config(avctx);
+    if (res)
+        return res;
 
-    ret = nv->nvEncInitializeEncoder(ctx->nvenc_ctx, &ctx->params);
-    if (ret != NV_ENC_SUCCESS)
-        return nvenc_print_error(avctx, ret, "Cannot initialize the decoder");
+    nv_status = p_nvenc->nvEncInitializeEncoder(ctx->nvencoder, &ctx->init_encode_params);
+    if (nv_status != NV_ENC_SUCCESS) {
+        return nvenc_print_error(avctx, nv_status, "InitializeEncoder failed");
+    }
+
+    if (ctx->encode_config.frameIntervalP > 1)
+        avctx->has_b_frames = 2;
+
+    if (ctx->encode_config.rcParams.averageBitRate > 0)
+        avctx->bit_rate = ctx->encode_config.rcParams.averageBitRate;
 
     cpb_props = ff_add_cpb_side_data(avctx);
     if (!cpb_props)
         return AVERROR(ENOMEM);
-    cpb_props->max_bitrate = avctx->rc_max_rate;
-    cpb_props->min_bitrate = avctx->rc_min_rate;
+    cpb_props->max_bitrate = ctx->encode_config.rcParams.maxBitRate;
     cpb_props->avg_bitrate = avctx->bit_rate;
-    cpb_props->buffer_size = avctx->rc_buffer_size;
+    cpb_props->buffer_size = ctx->encode_config.rcParams.vbvBufferSize;
 
     return 0;
 }
 
-static int nvenc_alloc_surface(AVCodecContext *avctx, int idx)
+static av_cold int nvenc_alloc_surface(AVCodecContext *avctx, int idx)
 {
-    NVENCContext *ctx               = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
-    int ret;
-    NV_ENC_CREATE_BITSTREAM_BUFFER out_buffer = { 0 };
+    NvencContext *ctx = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
+
+    NVENCSTATUS nv_status;
+    NV_ENC_CREATE_BITSTREAM_BUFFER allocOut = { 0 };
+    allocOut.version = NV_ENC_CREATE_BITSTREAM_BUFFER_VER;
 
     switch (ctx->data_pix_fmt) {
     case AV_PIX_FMT_YUV420P:
-        ctx->frames[idx].format = NV_ENC_BUFFER_FORMAT_YV12_PL;
+        ctx->surfaces[idx].format = NV_ENC_BUFFER_FORMAT_YV12_PL;
         break;
+
     case AV_PIX_FMT_NV12:
-        ctx->frames[idx].format = NV_ENC_BUFFER_FORMAT_NV12_PL;
+        ctx->surfaces[idx].format = NV_ENC_BUFFER_FORMAT_NV12_PL;
         break;
+
     case AV_PIX_FMT_YUV444P:
-        ctx->frames[idx].format = NV_ENC_BUFFER_FORMAT_YUV444_PL;
+        ctx->surfaces[idx].format = NV_ENC_BUFFER_FORMAT_YUV444_PL;
         break;
+
     default:
-        return AVERROR_BUG;
+        av_log(avctx, AV_LOG_FATAL, "Invalid input pixel format\n");
+        return AVERROR(EINVAL);
     }
 
     if (avctx->pix_fmt == AV_PIX_FMT_CUDA) {
-        ctx->frames[idx].in_ref = av_frame_alloc();
-        if (!ctx->frames[idx].in_ref)
+        ctx->surfaces[idx].in_ref = av_frame_alloc();
+        if (!ctx->surfaces[idx].in_ref)
             return AVERROR(ENOMEM);
     } else {
-        NV_ENC_CREATE_INPUT_BUFFER in_buffer      = { 0 };
-
-        in_buffer.version  = NV_ENC_CREATE_INPUT_BUFFER_VER;
-
-        in_buffer.width  = avctx->width;
-        in_buffer.height = avctx->height;
-
-        in_buffer.bufferFmt  = ctx->frames[idx].format;
-        in_buffer.memoryHeap = NV_ENC_MEMORY_HEAP_SYSMEM_UNCACHED;
-
-        ret = nv->nvEncCreateInputBuffer(ctx->nvenc_ctx, &in_buffer);
-        if (ret != NV_ENC_SUCCESS)
-            return nvenc_print_error(avctx, ret, "CreateInputBuffer failed");
+        NV_ENC_CREATE_INPUT_BUFFER allocSurf = { 0 };
+        allocSurf.version = NV_ENC_CREATE_INPUT_BUFFER_VER;
+        allocSurf.width = (avctx->width + 31) & ~31;
+        allocSurf.height = (avctx->height + 31) & ~31;
+        allocSurf.memoryHeap = NV_ENC_MEMORY_HEAP_SYSMEM_CACHED;
+        allocSurf.bufferFmt = ctx->surfaces[idx].format;
+
+        nv_status = p_nvenc->nvEncCreateInputBuffer(ctx->nvencoder, &allocSurf);
+        if (nv_status != NV_ENC_SUCCESS) {
+            return nvenc_print_error(avctx, nv_status, "CreateInputBuffer failed");
+        }
 
-        ctx->frames[idx].in     = in_buffer.inputBuffer;
+        ctx->surfaces[idx].input_surface = allocSurf.inputBuffer;
+        ctx->surfaces[idx].width = allocSurf.width;
+        ctx->surfaces[idx].height = allocSurf.height;
     }
 
-    out_buffer.version = NV_ENC_CREATE_BITSTREAM_BUFFER_VER;
-    /* 1MB is large enough to hold most output frames.
-     * NVENC increases this automatically if it is not enough. */
-    out_buffer.size = BITSTREAM_BUFFER_SIZE;
-
-    out_buffer.memoryHeap = NV_ENC_MEMORY_HEAP_SYSMEM_UNCACHED;
+    ctx->surfaces[idx].lockCount = 0;
 
-    ret = nv->nvEncCreateBitstreamBuffer(ctx->nvenc_ctx, &out_buffer);
-    if (ret != NV_ENC_SUCCESS)
-        return nvenc_print_error(avctx, ret, "CreateBitstreamBuffer failed");
+    /* 1MB is large enough to hold most output frames.
+     * NVENC increases this automatically if it is not enough. */
+    allocOut.size = 1024 * 1024;
+
+    allocOut.memoryHeap = NV_ENC_MEMORY_HEAP_SYSMEM_CACHED;
+
+    nv_status = p_nvenc->nvEncCreateBitstreamBuffer(ctx->nvencoder, &allocOut);
+    if (nv_status != NV_ENC_SUCCESS) {
+        int err = nvenc_print_error(avctx, nv_status, "CreateBitstreamBuffer failed");
+        if (avctx->pix_fmt != AV_PIX_FMT_CUDA)
+            p_nvenc->nvEncDestroyInputBuffer(ctx->nvencoder, ctx->surfaces[idx].input_surface);
+        av_frame_free(&ctx->surfaces[idx].in_ref);
+        return err;
+    }
 
-    ctx->frames[idx].out  = out_buffer.bitstreamBuffer;
+    ctx->surfaces[idx].output_surface = allocOut.bitstreamBuffer;
+    ctx->surfaces[idx].size = allocOut.size;
 
     return 0;
 }
 
-static int nvenc_setup_surfaces(AVCodecContext *avctx)
+static av_cold int nvenc_setup_surfaces(AVCodecContext *avctx)
 {
-    NVENCContext *ctx = avctx->priv_data;
-    int i, ret;
-
-    ctx->nb_surfaces = FFMAX(4 + avctx->max_b_frames,
+    NvencContext *ctx = avctx->priv_data;
+    int i, res;
+    int num_mbs = ((avctx->width + 15) >> 4) * ((avctx->height + 15) >> 4);
+    ctx->nb_surfaces = FFMAX((num_mbs >= 8160) ? 32 : 48,
                              ctx->nb_surfaces);
+    ctx->async_depth = FFMIN(ctx->async_depth, ctx->nb_surfaces - 1);
 
-    ctx->frames = av_mallocz_array(ctx->nb_surfaces, sizeof(*ctx->frames));
-    if (!ctx->frames)
+
+    ctx->surfaces = av_mallocz_array(ctx->nb_surfaces, sizeof(*ctx->surfaces));
+    if (!ctx->surfaces)
         return AVERROR(ENOMEM);
 
-    ctx->timestamps = av_fifo_alloc(ctx->nb_surfaces * sizeof(int64_t));
-    if (!ctx->timestamps)
+    ctx->timestamp_list = av_fifo_alloc(ctx->nb_surfaces * sizeof(int64_t));
+    if (!ctx->timestamp_list)
         return AVERROR(ENOMEM);
-    ctx->pending = av_fifo_alloc(ctx->nb_surfaces * sizeof(*ctx->frames));
-    if (!ctx->pending)
+    ctx->output_surface_queue = av_fifo_alloc(ctx->nb_surfaces * sizeof(NvencSurface*));
+    if (!ctx->output_surface_queue)
         return AVERROR(ENOMEM);
-    ctx->ready = av_fifo_alloc(ctx->nb_surfaces * sizeof(*ctx->frames));
-    if (!ctx->ready)
+    ctx->output_surface_ready_queue = av_fifo_alloc(ctx->nb_surfaces * sizeof(NvencSurface*));
+    if (!ctx->output_surface_ready_queue)
         return AVERROR(ENOMEM);
 
     for (i = 0; i < ctx->nb_surfaces; i++) {
-        if ((ret = nvenc_alloc_surface(avctx, i)) < 0)
-            return ret;
+        if ((res = nvenc_alloc_surface(avctx, i)) < 0)
+            return res;
     }
 
     return 0;
 }
 
-#define EXTRADATA_SIZE 512
-
-static int nvenc_setup_extradata(AVCodecContext *avctx)
+static av_cold int nvenc_setup_extradata(AVCodecContext *avctx)
 {
-    NVENCContext *ctx                     = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv       = &ctx->nvel.nvenc_funcs;
+    NvencContext *ctx = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
+
+    NVENCSTATUS nv_status;
+    uint32_t outSize = 0;
+    char tmpHeader[256];
     NV_ENC_SEQUENCE_PARAM_PAYLOAD payload = { 0 };
-    int ret;
+    payload.version = NV_ENC_SEQUENCE_PARAM_PAYLOAD_VER;
 
-    avctx->extradata = av_mallocz(EXTRADATA_SIZE + AV_INPUT_BUFFER_PADDING_SIZE);
-    if (!avctx->extradata)
-        return AVERROR(ENOMEM);
+    payload.spsppsBuffer = tmpHeader;
+    payload.inBufferSize = sizeof(tmpHeader);
+    payload.outSPSPPSPayloadSize = &outSize;
 
-    payload.version              = NV_ENC_SEQUENCE_PARAM_PAYLOAD_VER;
-    payload.spsppsBuffer         = avctx->extradata;
-    payload.inBufferSize         = EXTRADATA_SIZE;
-    payload.outSPSPPSPayloadSize = &avctx->extradata_size;
+    nv_status = p_nvenc->nvEncGetSequenceParams(ctx->nvencoder, &payload);
+    if (nv_status != NV_ENC_SUCCESS) {
+        return nvenc_print_error(avctx, nv_status, "GetSequenceParams failed");
+    }
+
+    avctx->extradata_size = outSize;
+    avctx->extradata = av_mallocz(outSize + AV_INPUT_BUFFER_PADDING_SIZE);
+
+    if (!avctx->extradata) {
+        return AVERROR(ENOMEM);
+    }
 
-    ret = nv->nvEncGetSequenceParams(ctx->nvenc_ctx, &payload);
-    if (ret != NV_ENC_SUCCESS)
-        return nvenc_print_error(avctx, ret, "Cannot get the extradata");
+    memcpy(avctx->extradata, tmpHeader, outSize);
 
     return 0;
 }
 
 av_cold int ff_nvenc_encode_close(AVCodecContext *avctx)
 {
-    NVENCContext *ctx               = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
+    NvencContext *ctx               = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
     int i;
 
     /* the encoder has to be flushed before it can be closed */
-    if (ctx->nvenc_ctx) {
+    if (ctx->nvencoder) {
         NV_ENC_PIC_PARAMS params        = { .version        = NV_ENC_PIC_PARAMS_VER,
                                             .encodePicFlags = NV_ENC_PIC_FLAG_EOS };
 
-        nv->nvEncEncodePicture(ctx->nvenc_ctx, &params);
+        p_nvenc->nvEncEncodePicture(ctx->nvencoder, &params);
     }
 
-    av_fifo_free(ctx->timestamps);
-    av_fifo_free(ctx->pending);
-    av_fifo_free(ctx->ready);
+    av_fifo_freep(&ctx->timestamp_list);
+    av_fifo_freep(&ctx->output_surface_ready_queue);
+    av_fifo_freep(&ctx->output_surface_queue);
 
-    if (ctx->frames) {
+    if (ctx->surfaces && avctx->pix_fmt == AV_PIX_FMT_CUDA) {
         for (i = 0; i < ctx->nb_surfaces; ++i) {
-            if (avctx->pix_fmt != AV_PIX_FMT_CUDA) {
-                nv->nvEncDestroyInputBuffer(ctx->nvenc_ctx, ctx->frames[i].in);
-            } else if (ctx->frames[i].in) {
-                nv->nvEncUnmapInputResource(ctx->nvenc_ctx, ctx->frames[i].in_map.mappedResource);
+            if (ctx->surfaces[i].input_surface) {
+                 p_nvenc->nvEncUnmapInputResource(ctx->nvencoder, ctx->surfaces[i].in_map.mappedResource);
             }
-
-            av_frame_free(&ctx->frames[i].in_ref);
-            nv->nvEncDestroyBitstreamBuffer(ctx->nvenc_ctx, ctx->frames[i].out);
         }
+        for (i = 0; i < ctx->nb_registered_frames; i++) {
+            if (ctx->registered_frames[i].regptr)
+                p_nvenc->nvEncUnregisterResource(ctx->nvencoder, ctx->registered_frames[i].regptr);
+        }
+        ctx->nb_registered_frames = 0;
     }
-    for (i = 0; i < ctx->nb_registered_frames; i++) {
-        if (ctx->registered_frames[i].regptr)
-            nv->nvEncUnregisterResource(ctx->nvenc_ctx, ctx->registered_frames[i].regptr);
-    }
-    ctx->nb_registered_frames = 0;
 
-    av_freep(&ctx->frames);
+    if (ctx->surfaces) {
+        for (i = 0; i < ctx->nb_surfaces; ++i) {
+            if (avctx->pix_fmt != AV_PIX_FMT_CUDA)
+                p_nvenc->nvEncDestroyInputBuffer(ctx->nvencoder, ctx->surfaces[i].input_surface);
+            av_frame_free(&ctx->surfaces[i].in_ref);
+            p_nvenc->nvEncDestroyBitstreamBuffer(ctx->nvencoder, ctx->surfaces[i].output_surface);
+        }
+    }
+    av_freep(&ctx->surfaces);
+    ctx->nb_surfaces = 0;
 
-    if (ctx->nvenc_ctx)
-        nv->nvEncDestroyEncoder(ctx->nvenc_ctx);
+    if (ctx->nvencoder)
+        p_nvenc->nvEncDestroyEncoder(ctx->nvencoder);
+    ctx->nvencoder = NULL;
 
     if (ctx->cu_context_internal)
-        ctx->nvel.cu_ctx_destroy(ctx->cu_context_internal);
+        dl_fn->cu_ctx_destroy(ctx->cu_context_internal);
+    ctx->cu_context = ctx->cu_context_internal = NULL;
+
+    if (dl_fn->nvenc)
+        dlclose(dl_fn->nvenc);
+    dl_fn->nvenc = NULL;
 
-    if (ctx->nvel.nvenc)
-        dlclose(ctx->nvel.nvenc);
+    dl_fn->nvenc_device_count = 0;
 
 #if !CONFIG_CUDA
-    if (ctx->nvel.cuda)
-        dlclose(ctx->nvel.cuda);
+    if (dl_fn->cuda)
+        dlclose(dl_fn->cuda);
+    dl_fn->cuda = NULL;
 #endif
 
+    dl_fn->cu_init = NULL;
+    dl_fn->cu_device_get_count = NULL;
+    dl_fn->cu_device_get = NULL;
+    dl_fn->cu_device_get_name = NULL;
+    dl_fn->cu_device_compute_capability = NULL;
+    dl_fn->cu_ctx_create = NULL;
+    dl_fn->cu_ctx_pop_current = NULL;
+    dl_fn->cu_ctx_destroy = NULL;
+
+    av_log(avctx, AV_LOG_VERBOSE, "Nvenc unloaded\n");
+
     return 0;
 }
 
 av_cold int ff_nvenc_encode_init(AVCodecContext *avctx)
 {
-    NVENCContext *ctx = avctx->priv_data;
+    NvencContext *ctx = avctx->priv_data;
     int ret;
 
     if (avctx->pix_fmt == AV_PIX_FMT_CUDA) {
@@ -966,69 +1192,71 @@ av_cold int ff_nvenc_encode_init(AVCodecContext *avctx)
     return 0;
 }
 
-static NVENCFrame *get_free_frame(NVENCContext *ctx)
+static NvencSurface *get_free_frame(NvencContext *ctx)
 {
     int i;
 
-    for (i = 0; i < ctx->nb_surfaces; i++) {
-        if (!ctx->frames[i].locked) {
-            ctx->frames[i].locked = 1;
-            return &ctx->frames[i];
+    for (i = 0; i < ctx->nb_surfaces; ++i) {
+        if (!ctx->surfaces[i].lockCount) {
+            ctx->surfaces[i].lockCount = 1;
+            return &ctx->surfaces[i];
         }
     }
 
     return NULL;
 }
 
-static int nvenc_copy_frame(NV_ENC_LOCK_INPUT_BUFFER *in, const AVFrame *frame)
+static int nvenc_copy_frame(AVCodecContext *avctx, NvencSurface *inSurf,
+            NV_ENC_LOCK_INPUT_BUFFER *lockBufferParams, const AVFrame *frame)
 {
-    uint8_t *buf = in->bufferDataPtr;
-    int off      = frame->height * in->pitch;
+    uint8_t *buf = lockBufferParams->bufferDataPtr;
+    int off = inSurf->height * lockBufferParams->pitch;
+
+    if (frame->format == AV_PIX_FMT_YUV420P) {
+        av_image_copy_plane(buf, lockBufferParams->pitch,
+            frame->data[0], frame->linesize[0],
+            avctx->width, avctx->height);
 
-    switch (frame->format) {
-    case AV_PIX_FMT_YUV420P:
-        av_image_copy_plane(buf, in->pitch,
-                            frame->data[0], frame->linesize[0],
-                            frame->width, frame->height);
         buf += off;
 
-        av_image_copy_plane(buf, in->pitch >> 1,
-                            frame->data[2], frame->linesize[2],
-                            frame->width >> 1, frame->height >> 1);
+        av_image_copy_plane(buf, lockBufferParams->pitch >> 1,
+            frame->data[2], frame->linesize[2],
+            avctx->width >> 1, avctx->height >> 1);
 
         buf += off >> 2;
 
-        av_image_copy_plane(buf, in->pitch >> 1,
-                            frame->data[1], frame->linesize[1],
-                            frame->width >> 1, frame->height >> 1);
-        break;
-    case AV_PIX_FMT_NV12:
-        av_image_copy_plane(buf, in->pitch,
-                            frame->data[0], frame->linesize[0],
-                            frame->width, frame->height);
+        av_image_copy_plane(buf, lockBufferParams->pitch >> 1,
+            frame->data[1], frame->linesize[1],
+            avctx->width >> 1, avctx->height >> 1);
+    } else if (frame->format == AV_PIX_FMT_NV12) {
+        av_image_copy_plane(buf, lockBufferParams->pitch,
+            frame->data[0], frame->linesize[0],
+            avctx->width, avctx->height);
+
         buf += off;
 
-        av_image_copy_plane(buf, in->pitch,
-                            frame->data[1], frame->linesize[1],
-                            frame->width, frame->height >> 1);
-        break;
-    case AV_PIX_FMT_YUV444P:
-        av_image_copy_plane(buf, in->pitch,
-                            frame->data[0], frame->linesize[0],
-                            frame->width, frame->height);
+        av_image_copy_plane(buf, lockBufferParams->pitch,
+            frame->data[1], frame->linesize[1],
+            avctx->width, avctx->height >> 1);
+    } else if (frame->format == AV_PIX_FMT_YUV444P) {
+        av_image_copy_plane(buf, lockBufferParams->pitch,
+            frame->data[0], frame->linesize[0],
+            avctx->width, avctx->height);
+
         buf += off;
 
-        av_image_copy_plane(buf, in->pitch,
-                            frame->data[1], frame->linesize[1],
-                            frame->width, frame->height);
+        av_image_copy_plane(buf, lockBufferParams->pitch,
+            frame->data[1], frame->linesize[1],
+            avctx->width, avctx->height);
+
         buf += off;
 
-        av_image_copy_plane(buf, in->pitch,
-                            frame->data[2], frame->linesize[2],
-                            frame->width, frame->height);
-        break;
-    default:
-        return AVERROR_BUG;
+        av_image_copy_plane(buf, lockBufferParams->pitch,
+            frame->data[2], frame->linesize[2],
+            avctx->width, avctx->height);
+    } else {
+        av_log(avctx, AV_LOG_FATAL, "Invalid pixel format!\n");
+        return AVERROR(EINVAL);
     }
 
     return 0;
@@ -1036,15 +1264,17 @@ static int nvenc_copy_frame(NV_ENC_LOCK_INPUT_BUFFER *in, const AVFrame *frame)
 
 static int nvenc_find_free_reg_resource(AVCodecContext *avctx)
 {
-    NVENCContext               *ctx = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
+    NvencContext *ctx = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
+
     int i;
 
     if (ctx->nb_registered_frames == FF_ARRAY_ELEMS(ctx->registered_frames)) {
         for (i = 0; i < ctx->nb_registered_frames; i++) {
             if (!ctx->registered_frames[i].mapped) {
                 if (ctx->registered_frames[i].regptr) {
-                    nv->nvEncUnregisterResource(ctx->nvenc_ctx,
+                    p_nvenc->nvEncUnregisterResource(ctx->nvencoder,
                                                 ctx->registered_frames[i].regptr);
                     ctx->registered_frames[i].regptr = NULL;
                 }
@@ -1061,9 +1291,11 @@ static int nvenc_find_free_reg_resource(AVCodecContext *avctx)
 
 static int nvenc_register_frame(AVCodecContext *avctx, const AVFrame *frame)
 {
-    NVENCContext               *ctx = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
-    AVHWFramesContext   *frames_ctx = (AVHWFramesContext*)avctx->hw_frames_ctx->data;
+    NvencContext *ctx = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
+
+    AVHWFramesContext *frames_ctx = (AVHWFramesContext*)avctx->hw_frames_ctx->data;
     NV_ENC_REGISTER_RESOURCE reg;
     int i, idx, ret;
 
@@ -1080,11 +1312,11 @@ static int nvenc_register_frame(AVCodecContext *avctx, const AVFrame *frame)
     reg.resourceType       = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR;
     reg.width              = frames_ctx->width;
     reg.height             = frames_ctx->height;
-    reg.bufferFormat       = ctx->frames[0].format;
+    reg.bufferFormat       = ctx->surfaces[0].format;
     reg.pitch              = frame->linesize[0];
     reg.resourceToRegister = frame->data[0];
 
-    ret = nv->nvEncRegisterResource(ctx->nvenc_ctx, &reg);
+    ret = p_nvenc->nvEncRegisterResource(ctx->nvencoder, &reg);
     if (ret != NV_ENC_SUCCESS) {
         nvenc_print_error(avctx, ret, "Error registering an input resource");
         return AVERROR_UNKNOWN;
@@ -1096,101 +1328,102 @@ static int nvenc_register_frame(AVCodecContext *avctx, const AVFrame *frame)
 }
 
 static int nvenc_upload_frame(AVCodecContext *avctx, const AVFrame *frame,
-                              NVENCFrame *nvenc_frame)
+                                      NvencSurface *nvenc_frame)
 {
-    NVENCContext *ctx               = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
-    int ret;
+    NvencContext *ctx = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
 
-    if (avctx->pix_fmt == AV_PIX_FMT_CUDA) {
-        int reg_idx;
+    int res;
+    NVENCSTATUS nv_status;
 
-        ret = nvenc_register_frame(avctx, frame);
-        if (ret < 0) {
+    if (avctx->pix_fmt == AV_PIX_FMT_CUDA) {
+        int reg_idx = nvenc_register_frame(avctx, frame);
+        if (reg_idx < 0) {
             av_log(avctx, AV_LOG_ERROR, "Could not register an input CUDA frame\n");
-            return ret;
+            return reg_idx;
         }
-        reg_idx = ret;
 
-        ret = av_frame_ref(nvenc_frame->in_ref, frame);
-        if (ret < 0)
-            return ret;
+        res = av_frame_ref(nvenc_frame->in_ref, frame);
+        if (res < 0)
+            return res;
 
-        nvenc_frame->in_map.version            = NV_ENC_MAP_INPUT_RESOURCE_VER;
+        nvenc_frame->in_map.version = NV_ENC_MAP_INPUT_RESOURCE_VER;
         nvenc_frame->in_map.registeredResource = ctx->registered_frames[reg_idx].regptr;
-
-        ret = nv->nvEncMapInputResource(ctx->nvenc_ctx, &nvenc_frame->in_map);
-        if (ret != NV_ENC_SUCCESS) {
+        nv_status = p_nvenc->nvEncMapInputResource(ctx->nvencoder, &nvenc_frame->in_map);
+        if (nv_status != NV_ENC_SUCCESS) {
             av_frame_unref(nvenc_frame->in_ref);
-            return nvenc_print_error(avctx, ret, "Error mapping an input resource");
+            return nvenc_print_error(avctx, nv_status, "Error mapping an input resource");
         }
 
         ctx->registered_frames[reg_idx].mapped = 1;
         nvenc_frame->reg_idx                   = reg_idx;
-        nvenc_frame->in                        = nvenc_frame->in_map.mappedResource;
+        nvenc_frame->input_surface             = nvenc_frame->in_map.mappedResource;
+        return 0;
     } else {
-        NV_ENC_LOCK_INPUT_BUFFER params = { 0 };
+        NV_ENC_LOCK_INPUT_BUFFER lockBufferParams = { 0 };
 
-        params.version     = NV_ENC_LOCK_INPUT_BUFFER_VER;
-        params.inputBuffer = nvenc_frame->in;
+        lockBufferParams.version = NV_ENC_LOCK_INPUT_BUFFER_VER;
+        lockBufferParams.inputBuffer = nvenc_frame->input_surface;
 
-        ret = nv->nvEncLockInputBuffer(ctx->nvenc_ctx, &params);
-        if (ret != NV_ENC_SUCCESS)
-            return nvenc_print_error(avctx, ret, "Cannot lock the buffer");
+        nv_status = p_nvenc->nvEncLockInputBuffer(ctx->nvencoder, &lockBufferParams);
+        if (nv_status != NV_ENC_SUCCESS) {
+            return nvenc_print_error(avctx, nv_status, "Failed locking nvenc input buffer");
+        }
 
-        ret = nvenc_copy_frame(&params, frame);
-        if (ret < 0) {
-            nv->nvEncUnlockInputBuffer(ctx->nvenc_ctx, nvenc_frame->in);
-            return ret;
+        res = nvenc_copy_frame(avctx, nvenc_frame, &lockBufferParams, frame);
+
+        nv_status = p_nvenc->nvEncUnlockInputBuffer(ctx->nvencoder, nvenc_frame->input_surface);
+        if (nv_status != NV_ENC_SUCCESS) {
+            return nvenc_print_error(avctx, nv_status, "Failed unlocking input buffer!");
         }
 
-        ret = nv->nvEncUnlockInputBuffer(ctx->nvenc_ctx, nvenc_frame->in);
-        if (ret != NV_ENC_SUCCESS)
-            return nvenc_print_error(avctx, ret, "Cannot unlock the buffer");
+        return res;
     }
-
-    return 0;
 }
 
 static void nvenc_codec_specific_pic_params(AVCodecContext *avctx,
                                             NV_ENC_PIC_PARAMS *params)
 {
-    NVENCContext *ctx = avctx->priv_data;
+    NvencContext *ctx = avctx->priv_data;
 
     switch (avctx->codec->id) {
     case AV_CODEC_ID_H264:
         params->codecPicParams.h264PicParams.sliceMode =
-            ctx->config.encodeCodecConfig.h264Config.sliceMode;
+            ctx->encode_config.encodeCodecConfig.h264Config.sliceMode;
         params->codecPicParams.h264PicParams.sliceModeData =
-            ctx->config.encodeCodecConfig.h264Config.sliceModeData;
-        break;
+            ctx->encode_config.encodeCodecConfig.h264Config.sliceModeData;
+      break;
     case AV_CODEC_ID_HEVC:
         params->codecPicParams.hevcPicParams.sliceMode =
-            ctx->config.encodeCodecConfig.hevcConfig.sliceMode;
+            ctx->encode_config.encodeCodecConfig.hevcConfig.sliceMode;
         params->codecPicParams.hevcPicParams.sliceModeData =
-            ctx->config.encodeCodecConfig.hevcConfig.sliceModeData;
+            ctx->encode_config.encodeCodecConfig.hevcConfig.sliceModeData;
         break;
     }
 }
 
-static inline int nvenc_enqueue_timestamp(AVFifoBuffer *f, int64_t pts)
+static inline void timestamp_queue_enqueue(AVFifoBuffer* queue, int64_t timestamp)
 {
-    return av_fifo_generic_write(f, &pts, sizeof(pts), NULL);
+    av_fifo_generic_write(queue, &timestamp, sizeof(timestamp), NULL);
 }
 
-static inline int nvenc_dequeue_timestamp(AVFifoBuffer *f, int64_t *pts)
+static inline int64_t timestamp_queue_dequeue(AVFifoBuffer* queue)
 {
-    return av_fifo_generic_read(f, pts, sizeof(*pts), NULL);
+    int64_t timestamp = AV_NOPTS_VALUE;
+    if (av_fifo_size(queue) > 0)
+        av_fifo_generic_read(queue, &timestamp, sizeof(timestamp), NULL);
+
+    return timestamp;
 }
 
 static int nvenc_set_timestamp(AVCodecContext *avctx,
                                NV_ENC_LOCK_BITSTREAM *params,
                                AVPacket *pkt)
 {
-    NVENCContext *ctx = avctx->priv_data;
+    NvencContext *ctx = avctx->priv_data;
 
-    pkt->pts      = params->outputTimeStamp;
-    pkt->duration = params->outputDuration;
+    pkt->pts = params->outputTimeStamp;
 
     /* generate the first dts by linearly extrapolating the
      * first two pts values to the past */
@@ -1212,164 +1445,225 @@ static int nvenc_set_timestamp(AVCodecContext *avctx,
         ctx->first_packet_output = 1;
         return 0;
     }
-    return nvenc_dequeue_timestamp(ctx->timestamps, &pkt->dts);
+
+    pkt->dts = timestamp_queue_dequeue(ctx->timestamp_list);
+
+    return 0;
 }
 
-static int nvenc_get_output(AVCodecContext *avctx, AVPacket *pkt)
+static int process_output_surface(AVCodecContext *avctx, AVPacket *pkt, NvencSurface *tmpoutsurf)
 {
-    NVENCContext *ctx               = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
-    NV_ENC_LOCK_BITSTREAM params    = { 0 };
-    NVENCFrame *frame;
-    int ret;
+    NvencContext *ctx = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
 
-    ret = av_fifo_generic_read(ctx->ready, &frame, sizeof(frame), NULL);
-    if (ret)
-        return ret;
+    uint32_t slice_mode_data;
+    uint32_t *slice_offsets = NULL; /* the error labels av_free() this, possibly before it is allocated */
+    NV_ENC_LOCK_BITSTREAM lock_params = { 0 };
+    NVENCSTATUS nv_status;
+    int res = AVERROR(ENOMEM); /* reported only if the slice_offsets allocation fails; every other exit assigns res or returns 0 */
 
-    params.version         = NV_ENC_LOCK_BITSTREAM_VER;
-    params.outputBitstream = frame->out;
+    enum AVPictureType pict_type;
 
-    ret = nv->nvEncLockBitstream(ctx->nvenc_ctx, &params);
-    if (ret < 0)
-        return nvenc_print_error(avctx, ret, "Cannot lock the bitstream");
+    switch (avctx->codec->id) {
+    case AV_CODEC_ID_H264:
+      slice_mode_data = ctx->encode_config.encodeCodecConfig.h264Config.sliceModeData;
+      break;
+    case AV_CODEC_ID_H265:
+      slice_mode_data = ctx->encode_config.encodeCodecConfig.hevcConfig.sliceModeData;
+      break;
+    default:
+      av_log(avctx, AV_LOG_ERROR, "Unknown codec name\n");
+      res = AVERROR(EINVAL);
+      goto error;
+    }
+    slice_offsets = av_mallocz(slice_mode_data * sizeof(*slice_offsets));
 
-    ret = ff_alloc_packet(pkt, params.bitstreamSizeInBytes);
-    if (ret < 0)
-        return ret;
+    if (!slice_offsets)
+        goto error;
 
-    memcpy(pkt->data, params.bitstreamBufferPtr, pkt->size);
+    lock_params.version = NV_ENC_LOCK_BITSTREAM_VER;
 
-    ret = nv->nvEncUnlockBitstream(ctx->nvenc_ctx, frame->out);
-    if (ret < 0)
-        return nvenc_print_error(avctx, ret, "Cannot unlock the bitstream");
+    lock_params.doNotWait = 0;
+    lock_params.outputBitstream = tmpoutsurf->output_surface;
+    lock_params.sliceOffsets = slice_offsets;
 
-    if (avctx->pix_fmt == AV_PIX_FMT_CUDA) {
-        nv->nvEncUnmapInputResource(ctx->nvenc_ctx, frame->in_map.mappedResource);
-        av_frame_unref(frame->in_ref);
-        ctx->registered_frames[frame->reg_idx].mapped = 0;
+    nv_status = p_nvenc->nvEncLockBitstream(ctx->nvencoder, &lock_params);
+    if (nv_status != NV_ENC_SUCCESS) {
+        res = nvenc_print_error(avctx, nv_status, "Failed locking bitstream buffer");
+        goto error;
+    }
 
-        frame->in = NULL;
+    if ((res = ff_alloc_packet2(avctx, pkt, lock_params.bitstreamSizeInBytes, 0)) != 0) {
+        p_nvenc->nvEncUnlockBitstream(ctx->nvencoder, tmpoutsurf->output_surface);
+        goto error;
     }
 
-    frame->locked = 0;
+    memcpy(pkt->data, lock_params.bitstreamBufferPtr, lock_params.bitstreamSizeInBytes);
 
-    ret = nvenc_set_timestamp(avctx, &params, pkt);
-    if (ret < 0)
-        return ret;
+    nv_status = p_nvenc->nvEncUnlockBitstream(ctx->nvencoder, tmpoutsurf->output_surface);
+    if (nv_status != NV_ENC_SUCCESS)
+        nvenc_print_error(avctx, nv_status, "Failed unlocking bitstream buffer, expect the gates of mordor to open");
+
+
+    if (avctx->pix_fmt == AV_PIX_FMT_CUDA) {
+        p_nvenc->nvEncUnmapInputResource(ctx->nvencoder, tmpoutsurf->in_map.mappedResource);
+        av_frame_unref(tmpoutsurf->in_ref);
+        ctx->registered_frames[tmpoutsurf->reg_idx].mapped = 0;
 
-    switch (params.pictureType) {
+        tmpoutsurf->input_surface = NULL;
+    }
+
+    switch (lock_params.pictureType) {
     case NV_ENC_PIC_TYPE_IDR:
         pkt->flags |= AV_PKT_FLAG_KEY;
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-    case NV_ENC_PIC_TYPE_INTRA_REFRESH:
     case NV_ENC_PIC_TYPE_I:
-        avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
+        pict_type = AV_PICTURE_TYPE_I;
         break;
     case NV_ENC_PIC_TYPE_P:
-        avctx->coded_frame->pict_type = AV_PICTURE_TYPE_P;
+        pict_type = AV_PICTURE_TYPE_P;
         break;
     case NV_ENC_PIC_TYPE_B:
-        avctx->coded_frame->pict_type = AV_PICTURE_TYPE_B;
+        pict_type = AV_PICTURE_TYPE_B;
         break;
     case NV_ENC_PIC_TYPE_BI:
-        avctx->coded_frame->pict_type = AV_PICTURE_TYPE_BI;
+        pict_type = AV_PICTURE_TYPE_BI;
         break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "Unknown picture type encountered, expect the output to be broken.\n");
+        av_log(avctx, AV_LOG_ERROR, "Please report this error and include as much information on how to reproduce it as possible.\n");
+        res = AVERROR_EXTERNAL;
+        goto error;
+    }
+
+#if FF_API_CODED_FRAME
+FF_DISABLE_DEPRECATION_WARNINGS
+    avctx->coded_frame->pict_type = pict_type;
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
-    }
+
+    ff_side_data_set_encoder_stats(pkt,
+        (lock_params.frameAvgQP - 1) * FF_QP2LAMBDA, NULL, 0, pict_type);
+
+    res = nvenc_set_timestamp(avctx, &lock_params, pkt);
+    if (res < 0)
+        goto error2;
+
+    av_free(slice_offsets);
 
     return 0;
+
+error:
+    timestamp_queue_dequeue(ctx->timestamp_list);
+
+error2:
+    av_free(slice_offsets);
+
+    return res;
 }
 
 static int output_ready(AVCodecContext *avctx, int flush)
 {
-    NVENCContext *ctx = avctx->priv_data;
+    NvencContext *ctx = avctx->priv_data;
+    int nb_ready, nb_pending;
 
     /* when B-frames are enabled, we wait for two initial timestamps to
      * calculate the first dts */
     if (!flush && avctx->max_b_frames > 0 &&
         (ctx->initial_pts[0] == AV_NOPTS_VALUE || ctx->initial_pts[1] == AV_NOPTS_VALUE))
         return 0;
-    return av_fifo_size(ctx->ready) > 0;
+
+    nb_ready   = av_fifo_size(ctx->output_surface_ready_queue)   / sizeof(NvencSurface*);
+    nb_pending = av_fifo_size(ctx->output_surface_queue)         / sizeof(NvencSurface*);
+    if (flush)
+        return nb_ready > 0;
+    return (nb_ready > 0) && (nb_ready + nb_pending >= ctx->async_depth);
 }
 
 int ff_nvenc_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                           const AVFrame *frame, int *got_packet)
 {
-    NVENCContext *ctx               = avctx->priv_data;
-    NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
-    NV_ENC_PIC_PARAMS params        = { 0 };
-    NVENCFrame         *nvenc_frame = NULL;
-    int enc_ret, ret;
+    NVENCSTATUS nv_status;
+    NvencSurface *tmpoutsurf, *inSurf;
+    int res;
+
+    NvencContext *ctx = avctx->priv_data;
+    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+    NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
 
-    params.version = NV_ENC_PIC_PARAMS_VER;
+    NV_ENC_PIC_PARAMS pic_params = { 0 };
+    pic_params.version = NV_ENC_PIC_PARAMS_VER;
 
     if (frame) {
-        nvenc_frame = get_free_frame(ctx);
-        if (!nvenc_frame) {
+        inSurf = get_free_frame(ctx);
+        if (!inSurf) {
             av_log(avctx, AV_LOG_ERROR, "No free surfaces\n");
             return AVERROR_BUG;
         }
 
-        ret = nvenc_upload_frame(avctx, frame, nvenc_frame);
-        if (ret < 0)
-            return ret;
+        res = nvenc_upload_frame(avctx, frame, inSurf);
+        if (res) {
+            inSurf->lockCount = 0;
+            return res;
+        }
 
-        params.inputBuffer     = nvenc_frame->in;
-        params.bufferFmt       = nvenc_frame->format;
-        params.inputWidth      = frame->width;
-        params.inputHeight     = frame->height;
-        params.outputBitstream = nvenc_frame->out;
-        params.inputTimeStamp  = frame->pts;
+        pic_params.inputBuffer = inSurf->input_surface;
+        pic_params.bufferFmt = inSurf->format;
+        pic_params.inputWidth = avctx->width;
+        pic_params.inputHeight = avctx->height;
+        pic_params.outputBitstream = inSurf->output_surface;
 
         if (avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT) {
             if (frame->top_field_first)
-                params.pictureStruct = NV_ENC_PIC_STRUCT_FIELD_TOP_BOTTOM;
+                pic_params.pictureStruct = NV_ENC_PIC_STRUCT_FIELD_TOP_BOTTOM;
             else
-                params.pictureStruct = NV_ENC_PIC_STRUCT_FIELD_BOTTOM_TOP;
+                pic_params.pictureStruct = NV_ENC_PIC_STRUCT_FIELD_BOTTOM_TOP;
         } else {
-            params.pictureStruct = NV_ENC_PIC_STRUCT_FRAME;
+            pic_params.pictureStruct = NV_ENC_PIC_STRUCT_FRAME;
         }
 
-        nvenc_codec_specific_pic_params(avctx, &params);
+        pic_params.encodePicFlags = 0;
+        pic_params.inputTimeStamp = frame->pts;
 
-        ret = nvenc_enqueue_timestamp(ctx->timestamps, frame->pts);
-        if (ret < 0)
-            return ret;
+        nvenc_codec_specific_pic_params(avctx, &pic_params);
+    } else {
+        pic_params.encodePicFlags = NV_ENC_PIC_FLAG_EOS;
+    }
+
+    nv_status = p_nvenc->nvEncEncodePicture(ctx->nvencoder, &pic_params);
+    if (nv_status != NV_ENC_SUCCESS &&
+        nv_status != NV_ENC_ERR_NEED_MORE_INPUT)
+        return nvenc_print_error(avctx, nv_status, "EncodePicture failed!");
+
+    if (frame) {
+        av_fifo_generic_write(ctx->output_surface_queue, &inSurf, sizeof(inSurf), NULL);
+        timestamp_queue_enqueue(ctx->timestamp_list, frame->pts);
 
         if (ctx->initial_pts[0] == AV_NOPTS_VALUE)
             ctx->initial_pts[0] = frame->pts;
         else if (ctx->initial_pts[1] == AV_NOPTS_VALUE)
             ctx->initial_pts[1] = frame->pts;
-    } else {
-        params.encodePicFlags = NV_ENC_PIC_FLAG_EOS;
-    }
-
-    enc_ret = nv->nvEncEncodePicture(ctx->nvenc_ctx, &params);
-    if (enc_ret != NV_ENC_SUCCESS &&
-        enc_ret != NV_ENC_ERR_NEED_MORE_INPUT)
-        return nvenc_print_error(avctx, enc_ret, "Error encoding the frame");
-
-    if (nvenc_frame) {
-        ret = av_fifo_generic_write(ctx->pending, &nvenc_frame, sizeof(nvenc_frame), NULL);
-        if (ret < 0)
-            return ret;
     }
 
     /* all the pending buffers are now ready for output */
-    if (enc_ret == NV_ENC_SUCCESS) {
-        while (av_fifo_size(ctx->pending) > 0) {
-            av_fifo_generic_read(ctx->pending, &nvenc_frame, sizeof(nvenc_frame), NULL);
-            av_fifo_generic_write(ctx->ready,  &nvenc_frame, sizeof(nvenc_frame), NULL);
+    if (nv_status == NV_ENC_SUCCESS) {
+        while (av_fifo_size(ctx->output_surface_queue) > 0) {
+            av_fifo_generic_read(ctx->output_surface_queue, &tmpoutsurf, sizeof(tmpoutsurf), NULL);
+            av_fifo_generic_write(ctx->output_surface_ready_queue, &tmpoutsurf, sizeof(tmpoutsurf), NULL);
         }
     }
 
     if (output_ready(avctx, !frame)) {
-        ret = nvenc_get_output(avctx, pkt);
-        if (ret < 0)
-            return ret;
+        av_fifo_generic_read(ctx->output_surface_ready_queue, &tmpoutsurf, sizeof(tmpoutsurf), NULL);
+
+        res = process_output_surface(avctx, pkt, tmpoutsurf);
+
+        if (res)
+            return res;
+
+        av_assert0(tmpoutsurf->lockCount);
+        tmpoutsurf->lockCount--;
+
         *got_packet = 1;
     } else {
         *got_packet = 0;
diff --git a/libavcodec/nvenc.h b/libavcodec/nvenc.h
index 8660997..961cbc7 100644
--- a/libavcodec/nvenc.h
+++ b/libavcodec/nvenc.h
@@ -1,25 +1,24 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_NVENC_H
 #define AVCODEC_NVENC_H
 
-#include <cuda.h>
 #include <nvEncodeAPI.h>
 
 #include "config.h"
@@ -29,18 +28,40 @@
 
 #include "avcodec.h"
 
+#if CONFIG_CUDA
+#include "libavutil/hwcontext_cuda.h"
+#else
+
+#if defined(_WIN32)
+#define CUDAAPI __stdcall
+#else
+#define CUDAAPI
+#endif
+
+typedef enum cudaError_enum {
+    CUDA_SUCCESS = 0
+} CUresult;
+typedef int CUdevice;
+typedef void* CUcontext;
+typedef void* CUdeviceptr;
+#endif
+
 #define MAX_REGISTERED_FRAMES 64
 
-typedef struct NVENCFrame {
-    NV_ENC_INPUT_PTR  in;
-    AVFrame          *in_ref;
+typedef struct NvencSurface
+{
+    NV_ENC_INPUT_PTR input_surface;
+    AVFrame *in_ref;
     NV_ENC_MAP_INPUT_RESOURCE in_map;
     int reg_idx;
+    int width;
+    int height;
 
-    NV_ENC_OUTPUT_PTR out;
+    NV_ENC_OUTPUT_PTR output_surface;
     NV_ENC_BUFFER_FORMAT format;
-    int locked;
-} NVENCFrame;
+    int size;
+    int lockCount;
+} NvencSurface;
 
 typedef CUresult(CUDAAPI *PCUINIT)(unsigned int Flags);
 typedef CUresult(CUDAAPI *PCUDEVICEGETCOUNT)(int *count);
@@ -53,7 +74,7 @@ typedef CUresult(CUDAAPI *PCUCTXDESTROY)(CUcontext ctx);
 
 typedef NVENCSTATUS (NVENCAPI *PNVENCODEAPICREATEINSTANCE)(NV_ENCODE_API_FUNCTION_LIST *functionList);
 
-typedef struct NVENCLibraryContext
+typedef struct NvencDynLoadFunctions
 {
 #if !CONFIG_CUDA
     void *cuda;
@@ -70,17 +91,21 @@ typedef struct NVENCLibraryContext
     PCUCTXDESTROY cu_ctx_destroy;
 
     NV_ENCODE_API_FUNCTION_LIST nvenc_funcs;
-} NVENCLibraryContext;
+    int nvenc_device_count;
+} NvencDynLoadFunctions;
 
 enum {
-    PRESET_DEFAULT,
+    PRESET_DEFAULT = 0,
+    PRESET_SLOW,
+    PRESET_MEDIUM,
+    PRESET_FAST,
     PRESET_HP,
     PRESET_HQ,
     PRESET_BD ,
     PRESET_LOW_LATENCY_DEFAULT ,
     PRESET_LOW_LATENCY_HQ ,
     PRESET_LOW_LATENCY_HP,
-    PRESET_LOSSLESS_DEFAULT,
+    PRESET_LOSSLESS_DEFAULT, // lossless presets must be the last ones
     PRESET_LOSSLESS_HP,
 };
 
@@ -88,13 +113,14 @@ enum {
     NV_ENC_H264_PROFILE_BASELINE,
     NV_ENC_H264_PROFILE_MAIN,
     NV_ENC_H264_PROFILE_HIGH,
-    NV_ENC_H264_PROFILE_HIGH_444,
-    NV_ENC_H264_PROFILE_CONSTRAINED_HIGH,
+    NV_ENC_H264_PROFILE_HIGH_444P,
 };
 
 enum {
     NVENC_LOWLATENCY = 1,
-    NVENC_LOSSLESS,
+    NVENC_LOSSLESS   = 2,
+    NVENC_ONE_PASS   = 4,
+    NVENC_TWO_PASSES = 8,
 };
 
 enum {
@@ -102,20 +128,23 @@ enum {
     ANY_DEVICE,
 };
 
-typedef struct NVENCContext {
-    AVClass *class;
-    NVENCLibraryContext nvel;
+typedef struct NvencContext
+{
+    AVClass *avclass;
 
-    NV_ENC_INITIALIZE_PARAMS params;
-    NV_ENC_CONFIG config;
+    NvencDynLoadFunctions nvenc_dload_funcs;
 
+    NV_ENC_INITIALIZE_PARAMS init_encode_params;
+    NV_ENC_CONFIG encode_config;
     CUcontext cu_context;
     CUcontext cu_context_internal;
 
     int nb_surfaces;
-    NVENCFrame *frames;
-    AVFifoBuffer *timestamps;
-    AVFifoBuffer *pending, *ready;
+    NvencSurface *surfaces;
+
+    AVFifoBuffer *output_surface_queue;
+    AVFifoBuffer *output_surface_ready_queue;
+    AVFifoBuffer *timestamp_list;
 
     struct {
         CUdeviceptr ptr;
@@ -133,16 +162,19 @@ typedef struct NVENCContext {
     int64_t initial_pts[2];
     int first_packet_output;
 
-    void *nvenc_ctx;
+    void *nvencoder;
 
     int preset;
     int profile;
     int level;
     int tier;
     int rc;
+    int cbr;
+    int twopass;
     int device;
     int flags;
-} NVENCContext;
+    int async_depth;
+} NvencContext;
 
 int ff_nvenc_encode_init(AVCodecContext *avctx);
 
diff --git a/libavcodec/nvenc_h264.c b/libavcodec/nvenc_h264.c
index 1f886d1..19103f9 100644
--- a/libavcodec/nvenc_h264.c
+++ b/libavcodec/nvenc_h264.c
@@ -1,64 +1,74 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/internal.h"
-#include "libavutil/opt.h"
 
 #include "avcodec.h"
 #include "internal.h"
 
 #include "nvenc.h"
 
-#define OFFSET(x) offsetof(NVENCContext, x)
+#define OFFSET(x) offsetof(NvencContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
-    { "preset",   "Set the encoding preset",              OFFSET(preset),      AV_OPT_TYPE_INT,    { .i64 = PRESET_HQ }, PRESET_DEFAULT, PRESET_LOSSLESS_HP, VE, "preset" },
+    { "preset",   "Set the encoding preset",              OFFSET(preset),      AV_OPT_TYPE_INT,    { .i64 = PRESET_MEDIUM }, PRESET_DEFAULT, PRESET_LOSSLESS_HP, VE, "preset" },
     { "default",    "",                                   0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_DEFAULT }, 0, 0, VE, "preset" },
+    { "slow",       "hq 2 passes",                        0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_SLOW }, 0, 0, VE, "preset" },
+    { "medium",     "hq 1 pass",                          0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_MEDIUM }, 0, 0, VE, "preset" },
+    { "fast",       "hp 1 pass",                          0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_FAST }, 0, 0, VE, "preset" },
     { "hp",         "",                                   0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_HP }, 0, 0, VE, "preset" },
     { "hq",         "",                                   0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_HQ }, 0, 0, VE, "preset" },
     { "bd",         "",                                   0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_BD }, 0, 0, VE, "preset" },
     { "ll",         "low latency",                        0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_LOW_LATENCY_DEFAULT }, 0, 0, VE, "preset" },
     { "llhq",       "low latency hq",                     0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_LOW_LATENCY_HQ }, 0, 0, VE, "preset" },
     { "llhp",       "low latency hp",                     0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_LOW_LATENCY_HP }, 0, 0, VE, "preset" },
-    { "profile",  "Set the encoding profile",             OFFSET(profile),     AV_OPT_TYPE_INT,    { .i64 = NV_ENC_H264_PROFILE_HIGH }, NV_ENC_H264_PROFILE_BASELINE, NV_ENC_H264_PROFILE_CONSTRAINED_HIGH, VE, "profile" },
+    { "lossless",   NULL,                                 0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_LOSSLESS_DEFAULT }, 0, 0, VE, "preset" },
+    { "losslesshp", NULL,                                 0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_LOSSLESS_HP }, 0, 0, VE, "preset" },
+    { "profile",  "Set the encoding profile",             OFFSET(profile),     AV_OPT_TYPE_INT,    { .i64 = NV_ENC_H264_PROFILE_MAIN }, NV_ENC_H264_PROFILE_BASELINE, NV_ENC_H264_PROFILE_HIGH_444P, VE, "profile" },
     { "baseline", "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_H264_PROFILE_BASELINE },            0, 0, VE, "profile" },
     { "main",     "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_H264_PROFILE_MAIN },                0, 0, VE, "profile" },
     { "high",     "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_H264_PROFILE_HIGH },                0, 0, VE, "profile" },
-    { "high_444", "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_H264_PROFILE_HIGH_444 },            0, 0, VE, "profile" },
-    { "constrained_high", "",                             0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_H264_PROFILE_CONSTRAINED_HIGH },    0, 0, VE, "profile" },
-    { "level",    "Set the encoding level restriction",   OFFSET(level),       AV_OPT_TYPE_INT,    { .i64 = NV_ENC_LEVEL_AUTOSELECT }, NV_ENC_LEVEL_AUTOSELECT, NV_ENC_LEVEL_H264_51, VE, "level" },
+    { "high444p", "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_H264_PROFILE_HIGH_444P },            0, 0, VE, "profile" },
+    { "level", "Set the encoding level restriction", OFFSET(level), AV_OPT_TYPE_INT, { .i64 = NV_ENC_LEVEL_AUTOSELECT }, NV_ENC_LEVEL_AUTOSELECT, NV_ENC_LEVEL_H264_51, VE, "level" },
+    { "auto",     "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_AUTOSELECT },  0, 0, VE,  "level" },
+    { "1",        "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_1 },  0, 0, VE,  "level" },
     { "1.0",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_1 },  0, 0, VE,  "level" },
-    { "1.b",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_1b }, 0, 0, VE,  "level" },
+    { "1b",       "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_1b }, 0, 0, VE,  "level" },
+    { "1.0b",     "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_1b }, 0, 0, VE,  "level" },
     { "1.1",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_11 }, 0, 0, VE,  "level" },
     { "1.2",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_12 }, 0, 0, VE,  "level" },
     { "1.3",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_13 }, 0, 0, VE,  "level" },
+    { "2",        "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_2 },  0, 0, VE,  "level" },
     { "2.0",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_2 },  0, 0, VE,  "level" },
     { "2.1",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_21 }, 0, 0, VE,  "level" },
     { "2.2",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_22 }, 0, 0, VE,  "level" },
+    { "3",        "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_3 },  0, 0, VE,  "level" },
     { "3.0",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_3 },  0, 0, VE,  "level" },
     { "3.1",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_31 }, 0, 0, VE,  "level" },
     { "3.2",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_32 }, 0, 0, VE,  "level" },
+    { "4",        "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_4 },  0, 0, VE,  "level" },
     { "4.0",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_4 },  0, 0, VE,  "level" },
     { "4.1",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_41 }, 0, 0, VE,  "level" },
     { "4.2",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_42 }, 0, 0, VE,  "level" },
+    { "5",        "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_5 },  0, 0, VE,  "level" },
     { "5.0",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_5 },  0, 0, VE,  "level" },
     { "5.1",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_51 }, 0, 0, VE,  "level" },
-    { "rc",       "Override the preset rate-control",     OFFSET(rc),          AV_OPT_TYPE_INT,    { .i64 = -1 },                   -1, 0, VE },
+    { "rc",       "Override the preset rate-control",     OFFSET(rc),          AV_OPT_TYPE_INT,    { .i64 = -1 },                   -1, INT_MAX, VE, "rc" },
     { "constqp",          "Constant QP mode",                                                            0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_CONSTQP },              0, 0, VE, "rc" },
     { "vbr",              "Variable bitrate mode",                                                       0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_VBR },                  0, 0, VE, "rc" },
     { "cbr",              "Constant bitrate mode",                                                       0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_CBR },                  0, 0, VE, "rc" },
@@ -67,54 +77,55 @@ static const AVOption options[] = {
     { "ll_2pass_size",    "Multi-pass optimized for constant frame size (only for low-latency presets)", 0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_2_PASS_FRAMESIZE_CAP }, 0, 0, VE, "rc" },
     { "vbr_2pass",        "Multi-pass variable bitrate mode",                                            0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_2_PASS_VBR },           0, 0, VE, "rc" },
     { "surfaces", "Number of concurrent surfaces",        OFFSET(nb_surfaces), AV_OPT_TYPE_INT,    { .i64 = 32 },                   0, INT_MAX, VE },
-    { "device",   "Select a specific NVENC device",       OFFSET(device),      AV_OPT_TYPE_INT,    { .i64 = -1 },                   -2, INT_MAX, VE, "device" },
-    { "any",      "Pick the first device available",      0,                   AV_OPT_TYPE_CONST,  { .i64 = ANY_DEVICE },           0, 0, VE, "device" },
-    { "list",     "List the available devices",           0,                   AV_OPT_TYPE_CONST,  { .i64 = LIST_DEVICES },         0, 0, VE, "device" },
+    { "cbr", "Use cbr encoding mode", OFFSET(cbr), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
+    { "2pass", "Use 2pass encoding mode", OFFSET(twopass), AV_OPT_TYPE_BOOL, { .i64 = -1 }, -1, 1, VE },
+    { "gpu", "Selects which NVENC capable GPU to use. First GPU is 0, second is 1, and so on.", OFFSET(device), AV_OPT_TYPE_INT, { .i64 = ANY_DEVICE }, -2, INT_MAX, VE, "gpu" },
+    { "any",      "Pick the first device available",      0,                   AV_OPT_TYPE_CONST,  { .i64 = ANY_DEVICE },           0, 0, VE, "gpu" },
+    { "list",     "List the available devices",           0,                   AV_OPT_TYPE_CONST,  { .i64 = LIST_DEVICES },         0, 0, VE, "gpu" },
+    { "delay",    "Delay frame output by the given amount of frames", OFFSET(async_depth), AV_OPT_TYPE_INT, { .i64 = INT_MAX }, 0, INT_MAX, VE },
     { NULL }
 };
 
-static const AVClass nvenc_h264_class = {
-    .class_name = "nvenc_h264",
-    .item_name = av_default_item_name,
-    .option = options,
-    .version = LIBAVUTIL_VERSION_INT,
-};
-
 static const AVCodecDefault defaults[] = {
-    { "b", "0" },
+    { "b", "2M" },
     { "qmin", "-1" },
     { "qmax", "-1" },
     { "qdiff", "-1" },
     { "qblur", "-1" },
     { "qcomp", "-1" },
+    { "g", "250" },
+    { "bf", "0" },
     { NULL },
 };
 
-AVCodec ff_h264_nvenc_encoder = {
-    .name           = "h264_nvenc",
+#if CONFIG_NVENC_ENCODER
+static const AVClass nvenc_class = {
+    .class_name = "nvenc",
+    .item_name = av_default_item_name,
+    .option = options,
+    .version = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_nvenc_encoder = {
+    .name           = "nvenc",
     .long_name      = NULL_IF_CONFIG_SMALL("NVIDIA NVENC H.264 encoder"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_H264,
     .init           = ff_nvenc_encode_init,
     .encode2        = ff_nvenc_encode_frame,
     .close          = ff_nvenc_encode_close,
-    .priv_data_size = sizeof(NVENCContext),
-    .priv_class     = &nvenc_h264_class,
+    .priv_data_size = sizeof(NvencContext),
+    .priv_class     = &nvenc_class,
     .defaults       = defaults,
     .capabilities   = AV_CODEC_CAP_DELAY,
     .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
     .pix_fmts       = ff_nvenc_pix_fmts,
 };
+#endif
 
-#if FF_API_NVENC_OLD_NAME
-
-static int nvenc_old_init(AVCodecContext *avctx)
-{
-    av_log(avctx, AV_LOG_WARNING, "This encoder is deprecated, use 'h264_nvenc' instead\n");
-    return ff_nvenc_encode_init(avctx);
-}
-
-static const AVClass nvenc_h264_old_class = {
+/* Add an alias for nvenc_h264 */
+#if CONFIG_NVENC_H264_ENCODER
+static const AVClass nvenc_h264_class = {
     .class_name = "nvenc_h264",
     .item_name = av_default_item_name,
     .option = options,
@@ -126,11 +137,11 @@ AVCodec ff_nvenc_h264_encoder = {
     .long_name      = NULL_IF_CONFIG_SMALL("NVIDIA NVENC H.264 encoder"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_H264,
-    .init           = nvenc_old_init,
+    .init           = ff_nvenc_encode_init,
     .encode2        = ff_nvenc_encode_frame,
     .close          = ff_nvenc_encode_close,
-    .priv_data_size = sizeof(NVENCContext),
-    .priv_class     = &nvenc_h264_old_class,
+    .priv_data_size = sizeof(NvencContext),
+    .priv_class     = &nvenc_h264_class,
     .defaults       = defaults,
     .capabilities   = AV_CODEC_CAP_DELAY,
     .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
diff --git a/libavcodec/nvenc_hevc.c b/libavcodec/nvenc_hevc.c
index f0c6d41..cef19f7 100644
--- a/libavcodec/nvenc_hevc.c
+++ b/libavcodec/nvenc_hevc.c
@@ -1,34 +1,36 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/internal.h"
-#include "libavutil/opt.h"
 
 #include "avcodec.h"
 #include "internal.h"
 
 #include "nvenc.h"
 
-#define OFFSET(x) offsetof(NVENCContext, x)
+#define OFFSET(x) offsetof(NvencContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
-    { "preset",   "Set the encoding preset",              OFFSET(preset),      AV_OPT_TYPE_INT,    { .i64 = PRESET_HQ }, PRESET_DEFAULT, PRESET_LOSSLESS_HP, VE, "preset" },
+    { "preset",   "Set the encoding preset",              OFFSET(preset),      AV_OPT_TYPE_INT,    { .i64 = PRESET_MEDIUM }, PRESET_DEFAULT, PRESET_LOSSLESS_HP, VE, "preset" },
     { "default",    "",                                   0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_DEFAULT }, 0, 0, VE, "preset" },
+    { "slow",       "hq 2 passes",                        0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_SLOW }, 0, 0, VE, "preset" },
+    { "medium",     "hq 1 pass",                          0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_MEDIUM }, 0, 0, VE, "preset" },
+    { "fast",       "hp 1 pass",                          0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_FAST }, 0, 0, VE, "preset" },
     { "hp",         "",                                   0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_HP }, 0, 0, VE, "preset" },
     { "hq",         "",                                   0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_HQ }, 0, 0, VE, "preset" },
     { "bd",         "",                                   0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_BD }, 0, 0, VE, "preset" },
@@ -38,25 +40,32 @@ static const AVOption options[] = {
     { "lossless",   "lossless",                           0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_LOSSLESS_DEFAULT }, 0, 0, VE, "preset" },
     { "losslesshp", "lossless hp",                        0,                   AV_OPT_TYPE_CONST,  { .i64 = PRESET_LOSSLESS_HP }, 0, 0, VE, "preset" },
     { "profile", "Set the encoding profile",             OFFSET(profile),      AV_OPT_TYPE_INT,    { .i64 = FF_PROFILE_HEVC_MAIN }, FF_PROFILE_HEVC_MAIN, FF_PROFILE_HEVC_MAIN, VE, "profile" },
-    { "high",    "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = FF_PROFILE_HEVC_MAIN }, 0, 0, VE, "profile" },
+    { "main",    "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = FF_PROFILE_HEVC_MAIN }, 0, 0, VE, "profile" },
     { "level",   "Set the encoding level restriction",   OFFSET(level),        AV_OPT_TYPE_INT,    { .i64 = NV_ENC_LEVEL_AUTOSELECT }, NV_ENC_LEVEL_AUTOSELECT, NV_ENC_LEVEL_HEVC_62, VE, "level" },
+    { "auto",    "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_AUTOSELECT },  0, 0, VE,  "level" },
+    { "1",       "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_1 },  0, 0, VE,  "level" },
     { "1.0",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_1 },  0, 0, VE,  "level" },
+    { "2",       "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_2 },  0, 0, VE,  "level" },
     { "2.0",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_2 },  0, 0, VE,  "level" },
     { "2.1",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_21 }, 0, 0, VE,  "level" },
+    { "3",       "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_3 },  0, 0, VE,  "level" },
     { "3.0",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_3 },  0, 0, VE,  "level" },
     { "3.1",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_31 }, 0, 0, VE,  "level" },
+    { "4",       "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_4 },  0, 0, VE,  "level" },
     { "4.0",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_4 },  0, 0, VE,  "level" },
     { "4.1",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_41 }, 0, 0, VE,  "level" },
+    { "5",       "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_5 },  0, 0, VE,  "level" },
     { "5.0",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_5 },  0, 0, VE,  "level" },
     { "5.1",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_51 }, 0, 0, VE,  "level" },
     { "5.2",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_52 }, 0, 0, VE,  "level" },
+    { "6",       "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_6 },  0, 0, VE,  "level" },
     { "6.0",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_6 },  0, 0, VE,  "level" },
     { "6.1",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_61 }, 0, 0, VE,  "level" },
     { "6.2",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_62 }, 0, 0, VE,  "level" },
     { "tier",    "Set the encoding tier",                OFFSET(tier),         AV_OPT_TYPE_INT,    { .i64 = NV_ENC_TIER_HEVC_MAIN }, NV_ENC_TIER_HEVC_MAIN, NV_ENC_TIER_HEVC_HIGH, VE, "tier"},
     { "main",    "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_TIER_HEVC_MAIN }, 0, 0, VE, "tier" },
     { "high",    "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_TIER_HEVC_HIGH }, 0, 0, VE, "tier" },
-    { "rc",      "Override the preset rate-control",     OFFSET(rc),           AV_OPT_TYPE_INT,    { .i64 = -1 },                   -1, 0, VE, "rc" },
+    { "rc",      "Override the preset rate-control",     OFFSET(rc),           AV_OPT_TYPE_INT,    { .i64 = -1 },                   -1, INT_MAX, VE, "rc" },
     { "constqp",          "Constant QP mode",                                                            0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_CONSTQP },              0, 0, VE, "rc" },
     { "vbr",              "Variable bitrate mode",                                                       0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_VBR },                  0, 0, VE, "rc" },
     { "cbr",              "Constant bitrate mode",                                                       0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_CBR },                  0, 0, VE, "rc" },
@@ -65,9 +74,12 @@ static const AVOption options[] = {
     { "ll_2pass_size",    "Multi-pass optimized for constant frame size (only for low-latency presets)", 0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_2_PASS_FRAMESIZE_CAP }, 0, 0, VE, "rc" },
     { "vbr_2pass",        "Multi-pass variable bitrate mode",                                            0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_2_PASS_VBR },           0, 0, VE, "rc" },
     { "surfaces", "Number of concurrent surfaces",        OFFSET(nb_surfaces), AV_OPT_TYPE_INT,    { .i64 = 32 },                   0, INT_MAX, VE },
-    { "device",   "Select a specific NVENC device",       OFFSET(device),      AV_OPT_TYPE_INT,    { .i64 = -1 },                   -2, INT_MAX, VE, "device" },
+    { "cbr", "Use cbr encoding mode", OFFSET(cbr), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
+    { "2pass", "Use 2pass encoding mode", OFFSET(twopass), AV_OPT_TYPE_BOOL, { .i64 = -1 }, -1, 1, VE },
+    { "gpu", "Selects which NVENC capable GPU to use. First GPU is 0, second is 1, and so on.", OFFSET(device), AV_OPT_TYPE_INT, { .i64 = ANY_DEVICE }, -2, INT_MAX, VE, "device" },
     { "any",      "Pick the first device available",      0,                   AV_OPT_TYPE_CONST,  { .i64 = ANY_DEVICE },           0, 0, VE, "device" },
     { "list",     "List the available devices",           0,                   AV_OPT_TYPE_CONST,  { .i64 = LIST_DEVICES },         0, 0, VE, "device" },
+    { "delay",    "Delay frame output by the given amount of frames", OFFSET(async_depth), AV_OPT_TYPE_INT, { .i64 = INT_MAX }, 0, INT_MAX, VE },
     { NULL }
 };
 
@@ -79,59 +91,29 @@ static const AVClass nvenc_hevc_class = {
 };
 
 static const AVCodecDefault defaults[] = {
-    { "b", "0" },
+    { "b", "2M" },
     { "qmin", "-1" },
     { "qmax", "-1" },
     { "qdiff", "-1" },
     { "qblur", "-1" },
     { "qcomp", "-1" },
+    { "g", "250" },
+    { "bf", "0" },
     { NULL },
 };
 
-AVCodec ff_hevc_nvenc_encoder = {
-    .name           = "hevc_nvenc",
-    .long_name      = NULL_IF_CONFIG_SMALL("NVIDIA NVENC HEVC encoder"),
-    .type           = AVMEDIA_TYPE_VIDEO,
-    .id             = AV_CODEC_ID_HEVC,
-    .init           = ff_nvenc_encode_init,
-    .encode2        = ff_nvenc_encode_frame,
-    .close          = ff_nvenc_encode_close,
-    .priv_data_size = sizeof(NVENCContext),
-    .priv_class     = &nvenc_hevc_class,
-    .defaults       = defaults,
-    .pix_fmts       = ff_nvenc_pix_fmts,
-    .capabilities   = AV_CODEC_CAP_DELAY,
-    .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
-};
-
-#if FF_API_NVENC_OLD_NAME
-
-static int nvenc_old_init(AVCodecContext *avctx)
-{
-    av_log(avctx, AV_LOG_WARNING, "This encoder is deprecated, use 'hevc_nvenc' instead\n");
-    return ff_nvenc_encode_init(avctx);
-}
-
-static const AVClass nvenc_hevc_old_class = {
-    .class_name = "nvenc_hevc",
-    .item_name = av_default_item_name,
-    .option = options,
-    .version = LIBAVUTIL_VERSION_INT,
-};
-
 AVCodec ff_nvenc_hevc_encoder = {
     .name           = "nvenc_hevc",
-    .long_name      = NULL_IF_CONFIG_SMALL("NVIDIA NVENC HEVC encoder"),
+    .long_name      = NULL_IF_CONFIG_SMALL("NVIDIA NVENC hevc encoder"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_HEVC,
-    .init           = nvenc_old_init,
+    .init           = ff_nvenc_encode_init,
     .encode2        = ff_nvenc_encode_frame,
     .close          = ff_nvenc_encode_close,
-    .priv_data_size = sizeof(NVENCContext),
-    .priv_class     = &nvenc_hevc_old_class,
+    .priv_data_size = sizeof(NvencContext),
+    .priv_class     = &nvenc_hevc_class,
     .defaults       = defaults,
     .pix_fmts       = ff_nvenc_pix_fmts,
     .capabilities   = AV_CODEC_CAP_DELAY,
     .caps_internal  = FF_CODEC_CAP_INIT_CLEANUP,
 };
-#endif
diff --git a/libavcodec/omx.c b/libavcodec/omx.c
index 961ff86..674a0c3 100644
--- a/libavcodec/omx.c
+++ b/libavcodec/omx.c
@@ -2,20 +2,20 @@
  * OMX Video encoder
  * Copyright (C) 2011 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/on2avc.c b/libavcodec/on2avc.c
index 3918365..3064c6f 100644
--- a/libavcodec/on2avc.c
+++ b/libavcodec/on2avc.c
@@ -3,24 +3,25 @@
  *
  * Copyright (c) 2013 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/channel_layout.h"
+#include "libavutil/ffmath.h"
 #include "libavutil/float_dsp.h"
 #include "avcodec.h"
 #include "bytestream.h"
@@ -47,7 +48,7 @@ enum WindowTypes {
 
 typedef struct On2AVCContext {
     AVCodecContext *avctx;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     FFTContext mdct, mdct_half, mdct_small;
     FFTContext fft128, fft256, fft512, fft1024;
     void (*wtf)(struct On2AVCContext *ctx, float *out, float *in, int size);
@@ -119,12 +120,12 @@ static int on2avc_decode_band_types(On2AVCContext *c, GetBitContext *gb)
         run_len   = 1;
         do {
             run = get_bits(gb, bits_per_sect);
+            if (run > num_bands - band - run_len) {
+                av_log(c->avctx, AV_LOG_ERROR, "Invalid band type run\n");
+                return AVERROR_INVALIDDATA;
+            }
             run_len += run;
         } while (run == esc_val);
-        if (band + run_len > num_bands) {
-            av_log(c->avctx, AV_LOG_ERROR, "Invalid band type run\n");
-            return AVERROR_INVALIDDATA;
-        }
         for (i = band; i < band + run_len; i++) {
             c->band_type[i]    = band_type;
             c->band_run_end[i] = band + run_len;
@@ -320,7 +321,7 @@ static void zero_head_and_tail(float *src, int len, int order0, int order1)
 }
 
 static void pretwiddle(float *src, float *dst, int dst_len, int tab_step,
-                       int step, int order0, int order1, const double **tabs)
+                       int step, int order0, int order1, const double * const *tabs)
 {
     float *src2, *out;
     const double *tab;
@@ -348,7 +349,7 @@ static void pretwiddle(float *src, float *dst, int dst_len, int tab_step,
 
 static void twiddle(float *src1, float *src2, int src2_len,
                     const double *tab, int tab_len, int step,
-                    int order0, int order1, const double **tabs)
+                    int order0, int order1, const double * const *tabs)
 {
     int steps;
     int mask;
@@ -721,7 +722,7 @@ static int on2avc_reconstruct_stereo(On2AVCContext *c, AVFrame *dst, int offset)
         }
 
         memcpy(out, saved, 448 * sizeof(float));
-        c->fdsp.vector_fmul_window(wout, saved + 448, buf, c->short_win, 64);
+        c->fdsp->vector_fmul_window(wout, saved + 448, buf, c->short_win, 64);
         memcpy(wout + 128,  buf + 64,         448 * sizeof(float));
         memcpy(saved,       buf + 512,        448 * sizeof(float));
         memcpy(saved + 448, buf + 7*128 + 64,  64 * sizeof(float));
@@ -757,20 +758,20 @@ static int on2avc_reconstruct_channel(On2AVCContext *c, int channel,
          c->prev_window_type == WINDOW_TYPE_LONG_STOP) &&
         (c->window_type == WINDOW_TYPE_LONG ||
          c->window_type == WINDOW_TYPE_LONG_START)) {
-        c->fdsp.vector_fmul_window(out, saved, buf, c->long_win, 512);
+        c->fdsp->vector_fmul_window(out, saved, buf, c->long_win, 512);
     } else {
         float *wout = out + 448;
         memcpy(out, saved, 448 * sizeof(float));
 
         if (c->window_type == WINDOW_TYPE_8SHORT) {
-            c->fdsp.vector_fmul_window(wout + 0*128, saved + 448,      buf + 0*128, c->short_win, 64);
-            c->fdsp.vector_fmul_window(wout + 1*128, buf + 0*128 + 64, buf + 1*128, c->short_win, 64);
-            c->fdsp.vector_fmul_window(wout + 2*128, buf + 1*128 + 64, buf + 2*128, c->short_win, 64);
-            c->fdsp.vector_fmul_window(wout + 3*128, buf + 2*128 + 64, buf + 3*128, c->short_win, 64);
-            c->fdsp.vector_fmul_window(temp,         buf + 3*128 + 64, buf + 4*128, c->short_win, 64);
+            c->fdsp->vector_fmul_window(wout + 0*128, saved + 448,      buf + 0*128, c->short_win, 64);
+            c->fdsp->vector_fmul_window(wout + 1*128, buf + 0*128 + 64, buf + 1*128, c->short_win, 64);
+            c->fdsp->vector_fmul_window(wout + 2*128, buf + 1*128 + 64, buf + 2*128, c->short_win, 64);
+            c->fdsp->vector_fmul_window(wout + 3*128, buf + 2*128 + 64, buf + 3*128, c->short_win, 64);
+            c->fdsp->vector_fmul_window(temp,         buf + 3*128 + 64, buf + 4*128, c->short_win, 64);
             memcpy(wout + 4*128, temp, 64 * sizeof(float));
         } else {
-            c->fdsp.vector_fmul_window(wout, saved + 448, buf, c->short_win, 64);
+            c->fdsp->vector_fmul_window(wout, saved + 448, buf, c->short_win, 64);
             memcpy(wout + 128, buf + 64, 448 * sizeof(float));
         }
     }
@@ -779,9 +780,9 @@ static int on2avc_reconstruct_channel(On2AVCContext *c, int channel,
     switch (c->window_type) {
     case WINDOW_TYPE_8SHORT:
         memcpy(saved,       temp + 64,         64 * sizeof(float));
-        c->fdsp.vector_fmul_window(saved + 64,  buf + 4*128 + 64, buf + 5*128, c->short_win, 64);
-        c->fdsp.vector_fmul_window(saved + 192, buf + 5*128 + 64, buf + 6*128, c->short_win, 64);
-        c->fdsp.vector_fmul_window(saved + 320, buf + 6*128 + 64, buf + 7*128, c->short_win, 64);
+        c->fdsp->vector_fmul_window(saved + 64,  buf + 4*128 + 64, buf + 5*128, c->short_win, 64);
+        c->fdsp->vector_fmul_window(saved + 192, buf + 5*128 + 64, buf + 6*128, c->short_win, 64);
+        c->fdsp->vector_fmul_window(saved + 320, buf + 6*128 + 64, buf + 7*128, c->short_win, 64);
         memcpy(saved + 448, buf + 7*128 + 64,  64 * sizeof(float));
         break;
     case WINDOW_TYPE_LONG_START:
@@ -802,7 +803,9 @@ static int on2avc_decode_subframe(On2AVCContext *c, const uint8_t *buf,
     GetBitContext gb;
     int i, ret;
 
-    init_get_bits(&gb, buf, buf_size * 8);
+    if ((ret = init_get_bits8(&gb, buf, buf_size)) < 0)
+        return ret;
+
     if (get_bits1(&gb)) {
         av_log(c->avctx, AV_LOG_ERROR, "enh bit set\n");
         return AVERROR_INVALIDDATA;
@@ -853,10 +856,8 @@ static int on2avc_decode_frame(AVCodecContext * avctx, void *data,
     if (c->is_av500) {
         /* get output buffer */
         frame->nb_samples = ON2AVC_SUBFRAME_SIZE;
-        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
             return ret;
-        }
 
         if ((ret = on2avc_decode_subframe(c, buf, buf_size, frame, 0)) < 0)
             return ret;
@@ -879,10 +880,8 @@ static int on2avc_decode_frame(AVCodecContext * avctx, void *data,
 
         /* get output buffer */
         frame->nb_samples = ON2AVC_SUBFRAME_SIZE * num_frames;
-        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
             return ret;
-        }
 
         audio_off = 0;
         bytestream2_init(&gb, buf, buf_size);
@@ -915,6 +914,11 @@ static av_cold int on2avc_decode_init(AVCodecContext *avctx)
     On2AVCContext *c = avctx->priv_data;
     int i;
 
+    if (avctx->channels > 2U) {
+        avpriv_request_sample(avctx, "Decoding more than 2 channels");
+        return AVERROR_PATCHWELCOME;
+    }
+
     c->avctx = avctx;
     avctx->sample_fmt     = AV_SAMPLE_FMT_FLTP;
     avctx->channel_layout = (avctx->channels == 2) ? AV_CH_LAYOUT_STEREO
@@ -925,18 +929,18 @@ static av_cold int on2avc_decode_init(AVCodecContext *avctx)
         av_log(avctx, AV_LOG_ERROR, "0x500 version should be mono\n");
         return AVERROR_INVALIDDATA;
     }
-    if (avctx->channels > 2) {
-        av_log(avctx, AV_LOG_ERROR, "Only 1 or 2 channels are supported.\n");
-        return AVERROR(EINVAL);
-    }
+
     if (avctx->channels == 2)
         av_log(avctx, AV_LOG_WARNING,
                "Stereo mode support is not good, patch is welcome\n");
 
+    // We add -0.01 before ceil() so that no value falls exactly at the
+    // midpoint between different ceil values. The results are identical to
+    // using pow(10, i / 10.0) without such a bias.
     for (i = 0; i < 20; i++)
-        c->scale_tab[i] = ceil(pow(10.0, i * 0.1) * 16) / 32;
+        c->scale_tab[i] = ceil(ff_exp10(i * 0.1) * 16 - 0.01) / 32;
     for (; i < 128; i++)
-        c->scale_tab[i] = ceil(pow(10.0, i * 0.1) * 0.5);
+        c->scale_tab[i] = ceil(ff_exp10(i * 0.1) * 0.5 - 0.01);
 
     if (avctx->sample_rate < 32000 || avctx->channels == 1)
         memcpy(c->long_win, ff_on2avc_window_long_24000,
@@ -958,13 +962,14 @@ static av_cold int on2avc_decode_init(AVCodecContext *avctx)
     ff_fft_init(&c->fft256,  7, 0);
     ff_fft_init(&c->fft512,  8, 1);
     ff_fft_init(&c->fft1024, 9, 1);
-    avpriv_float_dsp_init(&c->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    c->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!c->fdsp)
+        return AVERROR(ENOMEM);
 
     if (init_vlc(&c->scale_diff, 9, ON2AVC_SCALE_DIFFS,
                  ff_on2avc_scale_diff_bits,  1, 1,
                  ff_on2avc_scale_diff_codes, 4, 4, 0)) {
-        av_log(avctx, AV_LOG_ERROR, "Cannot init VLC\n");
-        return AVERROR(ENOMEM);
+        goto vlc_fail;
     }
     for (i = 1; i < 9; i++) {
         int idx = i - 1;
@@ -972,9 +977,7 @@ static av_cold int on2avc_decode_init(AVCodecContext *avctx)
                                ff_on2avc_quad_cb_bits[idx],  1, 1,
                                ff_on2avc_quad_cb_codes[idx], 4, 4,
                                ff_on2avc_quad_cb_syms[idx],  2, 2, 0)) {
-            av_log(avctx, AV_LOG_ERROR, "Cannot init VLC\n");
-            on2avc_free_vlcs(c);
-            return AVERROR(ENOMEM);
+            goto vlc_fail;
         }
     }
     for (i = 9; i < 16; i++) {
@@ -983,13 +986,16 @@ static av_cold int on2avc_decode_init(AVCodecContext *avctx)
                                ff_on2avc_pair_cb_bits[idx],  1, 1,
                                ff_on2avc_pair_cb_codes[idx], 2, 2,
                                ff_on2avc_pair_cb_syms[idx],  2, 2, 0)) {
-            av_log(avctx, AV_LOG_ERROR, "Cannot init VLC\n");
-            on2avc_free_vlcs(c);
-            return AVERROR(ENOMEM);
+            goto vlc_fail;
         }
     }
 
     return 0;
+vlc_fail:
+    av_log(avctx, AV_LOG_ERROR, "Cannot init VLC\n");
+    on2avc_free_vlcs(c);
+    av_freep(&c->fdsp);
+    return AVERROR(ENOMEM);
 }
 
 static av_cold int on2avc_decode_close(AVCodecContext *avctx)
@@ -1004,6 +1010,8 @@ static av_cold int on2avc_decode_close(AVCodecContext *avctx)
     ff_fft_end(&c->fft512);
     ff_fft_end(&c->fft1024);
 
+    av_freep(&c->fdsp);
+
     on2avc_free_vlcs(c);
 
     return 0;
diff --git a/libavcodec/on2avcdata.c b/libavcodec/on2avcdata.c
index d039f23..abe5983 100644
--- a/libavcodec/on2avcdata.c
+++ b/libavcodec/on2avcdata.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2013 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -7641,11 +7641,11 @@ static const double tabs_4_10[4 * 2][10] = {
      -0.099339873,     -0.041293536,    0.31028851,     0.17727433,    -0.92756648 }
 };
 
-const double *ff_on2avc_tabs_4_10_1[4] = {
+const double * const ff_on2avc_tabs_4_10_1[4] = {
     tabs_4_10[0], tabs_4_10[1], tabs_4_10[2], tabs_4_10[3]
 };
 
-const double *ff_on2avc_tabs_4_10_2[4] = {
+const double * const ff_on2avc_tabs_4_10_2[4] = {
     tabs_4_10[4], tabs_4_10[5], tabs_4_10[6], tabs_4_10[7]
 };
 
@@ -7724,12 +7724,12 @@ static const double tabs_9_20[9 * 2][20] = {
       0.22783, 0.058894795, -0.61350902, 0.69559873, -0.27013783, }
 };
 
-const double* ff_on2avc_tabs_9_20_1[9] = {
+const double* const ff_on2avc_tabs_9_20_1[9] = {
     tabs_9_20[0], tabs_9_20[1], tabs_9_20[2], tabs_9_20[3], tabs_9_20[4],
     tabs_9_20[5], tabs_9_20[6], tabs_9_20[7], tabs_9_20[8]
 };
 
-const double* ff_on2avc_tabs_9_20_2[9] = {
+const double* const ff_on2avc_tabs_9_20_2[9] = {
     tabs_9_20[ 9], tabs_9_20[10], tabs_9_20[11], tabs_9_20[12], tabs_9_20[13],
     tabs_9_20[14], tabs_9_20[15], tabs_9_20[16], tabs_9_20[17]
 };
@@ -7927,7 +7927,7 @@ static const double tabs_19_40[19 * 2][40] = {
       0.019871848, -0.11989559, 0.036659135, 0.26632201, -0.3057397, -0.23220335, 0.68741352, -0.54024027, }
 };
 
-const double* ff_on2avc_tabs_19_40_1[19] = {
+const double* const ff_on2avc_tabs_19_40_1[19] = {
     tabs_19_40[ 0], tabs_19_40[ 1], tabs_19_40[ 2], tabs_19_40[ 3],
     tabs_19_40[ 4], tabs_19_40[ 5], tabs_19_40[ 6], tabs_19_40[ 7],
     tabs_19_40[ 8], tabs_19_40[ 9], tabs_19_40[10], tabs_19_40[11],
@@ -7935,7 +7935,7 @@ const double* ff_on2avc_tabs_19_40_1[19] = {
     tabs_19_40[16], tabs_19_40[17], tabs_19_40[18],
 };
 
-const double* ff_on2avc_tabs_19_40_2[19] = {
+const double* const ff_on2avc_tabs_19_40_2[19] = {
     tabs_19_40[19], tabs_19_40[20], tabs_19_40[21], tabs_19_40[22],
     tabs_19_40[23], tabs_19_40[24], tabs_19_40[25], tabs_19_40[26],
     tabs_19_40[27], tabs_19_40[28], tabs_19_40[29], tabs_19_40[30],
@@ -8826,7 +8826,7 @@ static const double tabs_20_84[20 * 4][84] = {
       0.51434408, -0.41486443, 0.27672635, -0.10432054, },
 };
 
-const double* ff_on2avc_tabs_20_84_1[20] = {
+const double* const ff_on2avc_tabs_20_84_1[20] = {
     tabs_20_84[ 0], tabs_20_84[ 1], tabs_20_84[ 2], tabs_20_84[ 3],
     tabs_20_84[ 4], tabs_20_84[ 5], tabs_20_84[ 6], tabs_20_84[ 7],
     tabs_20_84[ 8], tabs_20_84[ 9], tabs_20_84[10], tabs_20_84[11],
@@ -8834,7 +8834,7 @@ const double* ff_on2avc_tabs_20_84_1[20] = {
     tabs_20_84[16], tabs_20_84[17], tabs_20_84[18], tabs_20_84[19]
 };
 
-const double* ff_on2avc_tabs_20_84_2[20] = {
+const double* const ff_on2avc_tabs_20_84_2[20] = {
     tabs_20_84[20], tabs_20_84[21], tabs_20_84[22], tabs_20_84[23],
     tabs_20_84[24], tabs_20_84[25], tabs_20_84[26], tabs_20_84[27],
     tabs_20_84[28], tabs_20_84[29], tabs_20_84[30], tabs_20_84[31],
@@ -8842,7 +8842,7 @@ const double* ff_on2avc_tabs_20_84_2[20] = {
     tabs_20_84[36], tabs_20_84[37], tabs_20_84[38], tabs_20_84[39]
 };
 
-const double* ff_on2avc_tabs_20_84_3[20] = {
+const double* const ff_on2avc_tabs_20_84_3[20] = {
     tabs_20_84[40], tabs_20_84[41], tabs_20_84[42], tabs_20_84[43],
     tabs_20_84[44], tabs_20_84[45], tabs_20_84[46], tabs_20_84[47],
     tabs_20_84[48], tabs_20_84[49], tabs_20_84[50], tabs_20_84[51],
@@ -8850,7 +8850,7 @@ const double* ff_on2avc_tabs_20_84_3[20] = {
     tabs_20_84[56], tabs_20_84[57], tabs_20_84[58], tabs_20_84[59]
 };
 
-const double* ff_on2avc_tabs_20_84_4[20] = {
+const double* const ff_on2avc_tabs_20_84_4[20] = {
     tabs_20_84[60], tabs_20_84[61], tabs_20_84[62], tabs_20_84[63],
     tabs_20_84[64], tabs_20_84[65], tabs_20_84[66], tabs_20_84[67],
     tabs_20_84[68], tabs_20_84[69], tabs_20_84[70], tabs_20_84[71],
diff --git a/libavcodec/on2avcdata.h b/libavcodec/on2avcdata.h
index 39d2911..95d88e0 100644
--- a/libavcodec/on2avcdata.h
+++ b/libavcodec/on2avcdata.h
@@ -3,25 +3,25 @@
  *
  * Copyright (c) 2013 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVCODEC_ON2AVC_DATA_H
-#define AVCODEC_ON2AVC_DATA_H
+#ifndef AVCODEC_ON2AVCDATA_H
+#define AVCODEC_ON2AVCDATA_H
 
 #include <stdint.h>
 
@@ -64,19 +64,19 @@ extern const double ff_on2avc_tab_84_1[];
 extern const double ff_on2avc_tab_84_2[];
 extern const double ff_on2avc_tab_84_3[];
 extern const double ff_on2avc_tab_84_4[];
-extern const double* ff_on2avc_tabs_4_10_1[4];
-extern const double* ff_on2avc_tabs_4_10_2[4];
-extern const double* ff_on2avc_tabs_9_20_1[9];
-extern const double* ff_on2avc_tabs_9_20_2[9];
-extern const double* ff_on2avc_tabs_19_40_1[19];
-extern const double* ff_on2avc_tabs_19_40_2[19];
-extern const double* ff_on2avc_tabs_20_84_1[20];
-extern const double* ff_on2avc_tabs_20_84_2[20];
-extern const double* ff_on2avc_tabs_20_84_3[20];
-extern const double* ff_on2avc_tabs_20_84_4[20];
+extern const double* const ff_on2avc_tabs_4_10_1[4];
+extern const double* const ff_on2avc_tabs_4_10_2[4];
+extern const double* const ff_on2avc_tabs_9_20_1[9];
+extern const double* const ff_on2avc_tabs_9_20_2[9];
+extern const double* const ff_on2avc_tabs_19_40_1[19];
+extern const double* const ff_on2avc_tabs_19_40_2[19];
+extern const double* const ff_on2avc_tabs_20_84_1[20];
+extern const double* const ff_on2avc_tabs_20_84_2[20];
+extern const double* const ff_on2avc_tabs_20_84_3[20];
+extern const double* const ff_on2avc_tabs_20_84_4[20];
 extern const float ff_on2avc_ctab_1[2048];
 extern const float ff_on2avc_ctab_2[2048];
 extern const float ff_on2avc_ctab_3[2048];
 extern const float ff_on2avc_ctab_4[2048];
 
-#endif /* AVCODEC_ON2AVC_DATA_H */
+#endif /* AVCODEC_ON2AVCDATA_H */
diff --git a/libavcodec/options-test.c b/libavcodec/options-test.c
new file mode 100644
index 0000000..0b064e6
--- /dev/null
+++ b/libavcodec/options-test.c
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "options.c"
+
+static int dummy_init(AVCodecContext *ctx)
+{
+    //TODO: this code should set every possible pointer that could be set by codec and is not an option;
+    ctx->extradata      = av_malloc(8);           /* test payload; freed again in dummy_close() */
+    ctx->extradata_size = ctx->extradata ? 8 : 0; /* advertise a size only for a live buffer */
+    return ctx->extradata ? 0 : AVERROR(ENOMEM);  /* report allocation failure instead of succeeding with NULL */
+}
+
+static int dummy_close(AVCodecContext *ctx)
+{
+    ctx->extradata_size = 0;   /* no extradata is advertised once the buffer is gone */
+    av_freep(&ctx->extradata); /* releases the buffer obtained in dummy_init() */
+    return 0;
+}
+
+static int dummy_encode(AVCodecContext *ctx, AVPacket *pkt, const AVFrame *frame, int *got_packet)
+{
+    return AVERROR(ENOSYS); /* encode stub: ignores every argument and always reports ENOSYS */
+}
+
+typedef struct Dummy12Context { /* private context of dummy_v1_encoder and dummy_v2_encoder */
+    AVClass  *av_class; /* first member; presumably pointed at the codec's priv_class by libavcodec -- TODO confirm */
+    int      num;       /* storage behind the "num" entry of dummy_options */
+    char*    str;       /* storage behind the "str" entry of dummy_options */
+} Dummy12Context;
+
+typedef struct Dummy3Context { /* private context of dummy_v3_encoder, which sets no priv_class */
+    void     *fake_av_class; /* plain pointer in the slot where Dummy12Context has av_class */
+    int      num;            /* same position as Dummy12Context.num; no option table refers to it */
+    char*    str;            /* same position as Dummy12Context.str; no option table refers to it */
+} Dummy3Context;
+
+#define OFFSET(x) offsetof(Dummy12Context, x) /* byte offset of an option's storage inside Dummy12Context */
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM /* unparenthesized: only safe as a whole initializer value */
+static const AVOption dummy_options[] = { /* option table shared by dummy_v1_class and dummy_v2_class */
+    { "str", "set str", OFFSET(str), AV_OPT_TYPE_STRING, { .str = "i'm src default value" }, 0, 0, VE},
+    { "num", "set num", OFFSET(num), AV_OPT_TYPE_INT,    { .i64 = 1500100900 },    0, INT_MAX, VE},
+    { NULL }, /* last entry has a NULL name */
+};
+
+static const AVClass dummy_v1_class = { /* priv_class of dummy_v1_encoder */
+    .class_name = "dummy_v1_class",
+    .item_name  = av_default_item_name,
+    .option     = dummy_options, /* same option table as dummy_v2_class */
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+static const AVClass dummy_v2_class = { /* priv_class of dummy_v2_encoder; differs from dummy_v1_class only in class_name */
+    .class_name = "dummy_v2_class",
+    .item_name  = av_default_item_name,
+    .option     = dummy_options, /* same option table as dummy_v1_class */
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+/* codec with options: priv_data is sized as Dummy12Context and described by dummy_v1_class */
+static AVCodec dummy_v1_encoder = {
+    .name             = "dummy_v1_codec",
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_NONE - 1, /* synthetic id; presumably chosen not to clash with real codec ids -- TODO confirm */
+    .encode2          = dummy_encode,
+    .init             = dummy_init,
+    .close            = dummy_close,
+    .priv_class       = &dummy_v1_class,
+    .priv_data_size   = sizeof(Dummy12Context),
+};
+
+/* codec with options, different class: same Dummy12Context layout and option table, but dummy_v2_class */
+static AVCodec dummy_v2_encoder = {
+    .name             = "dummy_v2_codec",
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_NONE - 2, /* synthetic id, distinct from the other dummy codecs */
+    .encode2          = dummy_encode,
+    .init             = dummy_init,
+    .close            = dummy_close,
+    .priv_class       = &dummy_v2_class,
+    .priv_data_size   = sizeof(Dummy12Context),
+};
+
+/* codec with priv data, but no class: priv_data_size is set while priv_class is left unset */
+static AVCodec dummy_v3_encoder = {
+    .name             = "dummy_v3_codec",
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_NONE - 3, /* synthetic id, distinct from the other dummy codecs */
+    .encode2          = dummy_encode,
+    .init             = dummy_init,
+    .close            = dummy_close,
+    .priv_data_size   = sizeof(Dummy3Context),
+};
+
+/* codec without priv data: neither priv_class nor priv_data_size is set */
+static AVCodec dummy_v4_encoder = {
+    .name             = "dummy_v4_codec",
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_NONE - 4, /* synthetic id, distinct from the other dummy codecs */
+    .encode2          = dummy_encode,
+    .init             = dummy_init,
+    .close            = dummy_close,
+};
+
+static void test_copy_print_codec(const AVCodecContext *ctx)
+{
+    /* One line per context: codec name, dimensions, priv_data presence and, for codecs with a priv_class, "num"/"str". */
+    printf("%-14s: %dx%d prv: %s", ctx->codec ? ctx->codec->name : "NULL",
+           ctx->width, ctx->height, ctx->priv_data ? "set" : "null");
+    if (ctx->codec && ctx->codec->priv_class && ctx->codec->priv_data_size) {
+        int64_t i64  = 0;    /* defined value to print even if the lookup below fails */
+        uint8_t *str = NULL; /* filled in by av_opt_get(); released with av_free() below */
+        av_opt_get_int(ctx->priv_data, "num", 0, &i64);
+        av_opt_get(ctx->priv_data, "str", 0, &str);
+        /* never hand a NULL pointer to %s */
+        printf(" opts: %"PRId64" %s", i64, str ? (const char *)str : "(null)");
+        av_free(str);
+    }
+    printf("\n");
+}
+
+static void test_copy(const AVCodec *c1, const AVCodec *c2)
+{
+    AVCodecContext *ctx1 = avcodec_alloc_context3(c1), *ctx2 = avcodec_alloc_context3(c2);
+    printf("%s -> %s\nclosed:\n", c1 ? c1->name : "NULL", c2 ? c2->name : "NULL"); /* header for this src -> dst pair */
+    if (!ctx1 || !ctx2) { /* both contexts are dereferenced below, so bail out on allocation failure */
+        fprintf(stderr, "avcodec_alloc_context3 failed\n");
+        exit(1);
+    }
+    ctx1->width = ctx1->height = 128; /* recognisable source values to look for in the copy */
+    if (ctx2->codec && ctx2->codec->priv_class && ctx2->codec->priv_data_size) {
+        av_opt_set(ctx2->priv_data, "num", "667", 0); /* pre-set dest options so the copy's effect on them is visible */
+        av_opt_set(ctx2->priv_data, "str", "i'm dest value before copy", 0);
+    }
+    avcodec_copy_context(ctx2, ctx1);
+    test_copy_print_codec(ctx1);
+    test_copy_print_codec(ctx2);
+    if (ctx1->codec) { /* second pass: repeat the copy with the source context opened */
+        printf("opened:\n");
+        if (avcodec_open2(ctx1, ctx1->codec, NULL) < 0) {
+            fprintf(stderr, "avcodec_open2 failed\n");
+            exit(1);
+        }
+        if (ctx2->codec && ctx2->codec->priv_class && ctx2->codec->priv_data_size) {
+            av_opt_set(ctx2->priv_data, "num", "667", 0);
+            av_opt_set(ctx2->priv_data, "str", "i'm dest value before copy", 0);
+        }
+        avcodec_copy_context(ctx2, ctx1);
+        test_copy_print_codec(ctx1);
+        test_copy_print_codec(ctx2);
+        avcodec_close(ctx1);
+    }
+    avcodec_free_context(&ctx1);
+    avcodec_free_context(&ctx2);
+}
+
+int main(void)
+{
+    /* The trailing NULL ends the registration loop and also takes part in the copy matrix below. */
+    AVCodec *dummy_codec[] = {
+        &dummy_v1_encoder, &dummy_v2_encoder, &dummy_v3_encoder, &dummy_v4_encoder,
+        NULL,
+    };
+    const int nb_entries = FF_ARRAY_ELEMS(dummy_codec);
+    int src, dst;
+
+    for (src = 0; dummy_codec[src]; src++)
+        avcodec_register(dummy_codec[src]);
+
+    printf("testing avcodec_copy_context()\n");
+    for (src = 0; src < nb_entries; src++) {
+        for (dst = 0; dst < nb_entries; dst++)
+            test_copy(dummy_codec[src], dummy_codec[dst]);
+    }
+    return 0;
+}
diff --git a/libavcodec/options.c b/libavcodec/options.c
index 49c8aea..ea2563b 100644
--- a/libavcodec/options.c
+++ b/libavcodec/options.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -70,6 +70,13 @@ static const AVClass *codec_child_class_next(const AVClass *prev)
     return NULL;
 }
 
+static AVClassCategory get_category(void *ptr)
+{
+    /* Decoder when the attached codec has a decode callback; encoder otherwise (including no codec). */
+    const AVCodecContext *avctx = ptr;
+    return avctx->codec && avctx->codec->decode ? AV_CLASS_CATEGORY_DECODER : AV_CLASS_CATEGORY_ENCODER;
+}
+
 static const AVClass av_codec_context_class = {
     .class_name              = "AVCodecContext",
     .item_name               = context_to_name,
@@ -78,20 +85,34 @@ static const AVClass av_codec_context_class = {
     .log_level_offset_offset = offsetof(AVCodecContext, log_level_offset),
     .child_next              = codec_child_next,
     .child_class_next        = codec_child_class_next,
+    .category                = AV_CLASS_CATEGORY_ENCODER,
+    .get_category            = get_category,
 };
 
 int avcodec_get_context_defaults3(AVCodecContext *s, const AVCodec *codec)
 {
+    int flags=0;
     memset(s, 0, sizeof(AVCodecContext));
 
     s->av_class = &av_codec_context_class;
 
     s->codec_type = codec ? codec->type : AVMEDIA_TYPE_UNKNOWN;
-    s->codec      = codec;
-    av_opt_set_defaults(s);
+    if (codec) {
+        s->codec = codec;
+        s->codec_id = codec->id;
+    }
+
+    if(s->codec_type == AVMEDIA_TYPE_AUDIO)
+        flags= AV_OPT_FLAG_AUDIO_PARAM;
+    else if(s->codec_type == AVMEDIA_TYPE_VIDEO)
+        flags= AV_OPT_FLAG_VIDEO_PARAM;
+    else if(s->codec_type == AVMEDIA_TYPE_SUBTITLE)
+        flags= AV_OPT_FLAG_SUBTITLE_PARAM;
+    av_opt_set_defaults2(s, flags, flags);
 
     s->time_base           = (AVRational){0,1};
     s->framerate           = (AVRational){ 0, 1 };
+    s->pkt_timebase        = (AVRational){ 0, 1 };
     s->get_buffer2         = avcodec_default_get_buffer2;
     s->get_format          = avcodec_default_get_format;
     s->execute             = avcodec_default_execute;
@@ -151,6 +172,9 @@ void avcodec_free_context(AVCodecContext **pavctx)
 
     av_freep(&avctx->extradata);
     av_freep(&avctx->subtitle_header);
+    av_freep(&avctx->intra_matrix);
+    av_freep(&avctx->inter_matrix);
+    av_freep(&avctx->rc_override);
 
     av_freep(pavctx);
 }
@@ -166,15 +190,34 @@ int avcodec_copy_context(AVCodecContext *dest, const AVCodecContext *src)
                src, dest);
         return AVERROR(EINVAL);
     }
+
+    av_opt_free(dest);
+    av_freep(&dest->rc_override);
+    av_freep(&dest->intra_matrix);
+    av_freep(&dest->inter_matrix);
+    av_freep(&dest->extradata);
+    av_freep(&dest->subtitle_header);
+
     memcpy(dest, src, sizeof(*dest));
+    av_opt_copy(dest, src);
 
     dest->priv_data       = orig_priv_data;
     dest->codec           = orig_codec;
 
+    if (orig_priv_data && src->codec && src->codec->priv_class &&
+        dest->codec && dest->codec->priv_class)
+        av_opt_copy(orig_priv_data, src->priv_data);
+
+
     /* set values specific to opened codecs back to their default state */
     dest->slice_offset    = NULL;
     dest->hwaccel         = NULL;
     dest->internal        = NULL;
+#if FF_API_CODED_FRAME
+FF_DISABLE_DEPRECATION_WARNINGS
+    dest->coded_frame     = NULL;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
 
     /* reallocate values that should be allocated separately */
     dest->extradata       = NULL;
@@ -182,16 +225,6 @@ int avcodec_copy_context(AVCodecContext *dest, const AVCodecContext *src)
     dest->inter_matrix    = NULL;
     dest->rc_override     = NULL;
     dest->subtitle_header = NULL;
-#if FF_API_MPV_OPT
-    FF_DISABLE_DEPRECATION_WARNINGS
-    dest->rc_eq           = NULL;
-    if (src->rc_eq) {
-        dest->rc_eq = av_strdup(src->rc_eq);
-        if (!dest->rc_eq)
-            return AVERROR(ENOMEM);
-    }
-    FF_ENABLE_DEPRECATION_WARNINGS
-#endif
 
 #define alloc_and_copy_or_fail(obj, size, pad) \
     if (src->obj && size > 0) { \
@@ -204,11 +237,12 @@ int avcodec_copy_context(AVCodecContext *dest, const AVCodecContext *src)
     }
     alloc_and_copy_or_fail(extradata,    src->extradata_size,
                            AV_INPUT_BUFFER_PADDING_SIZE);
+    dest->extradata_size  = src->extradata_size;
     alloc_and_copy_or_fail(intra_matrix, 64 * sizeof(int16_t), 0);
     alloc_and_copy_or_fail(inter_matrix, 64 * sizeof(int16_t), 0);
     alloc_and_copy_or_fail(rc_override,  src->rc_override_count * sizeof(*src->rc_override), 0);
-    alloc_and_copy_or_fail(subtitle_header, src->subtitle_header_size, 0);
-    dest->subtitle_header_size = src->subtitle_header_size;
+    alloc_and_copy_or_fail(subtitle_header, src->subtitle_header_size, 1);
+    av_assert0(dest->subtitle_header_size == src->subtitle_header_size);
 #undef alloc_and_copy_or_fail
 
     return 0;
@@ -218,11 +252,10 @@ fail:
     av_freep(&dest->intra_matrix);
     av_freep(&dest->inter_matrix);
     av_freep(&dest->extradata);
-#if FF_API_MPV_OPT
-    FF_DISABLE_DEPRECATION_WARNINGS
-    av_freep(&dest->rc_eq);
-    FF_ENABLE_DEPRECATION_WARNINGS
-#endif
+    av_freep(&dest->subtitle_header);
+    dest->subtitle_header_size = 0;
+    dest->extradata_size = 0;
+    av_opt_free(dest);
     return AVERROR(ENOMEM);
 }
 
@@ -230,3 +263,224 @@ const AVClass *avcodec_get_class(void)
 {
     return &av_codec_context_class;
 }
+
+#define FOFFSET(x) offsetof(AVFrame,x)
+
+static const AVOption frame_options[]={
+{"best_effort_timestamp", "", FOFFSET(best_effort_timestamp), AV_OPT_TYPE_INT64, {.i64 = AV_NOPTS_VALUE }, INT64_MIN, INT64_MAX, 0},
+{"pkt_pos", "", FOFFSET(pkt_pos), AV_OPT_TYPE_INT64, {.i64 = -1 }, INT64_MIN, INT64_MAX, 0},
+{"pkt_size", "", FOFFSET(pkt_size), AV_OPT_TYPE_INT64, {.i64 = -1 }, INT64_MIN, INT64_MAX, 0},
+{"sample_aspect_ratio", "", FOFFSET(sample_aspect_ratio), AV_OPT_TYPE_RATIONAL, {.dbl = 0 }, 0, INT_MAX, 0},
+{"width", "", FOFFSET(width), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0},
+{"height", "", FOFFSET(height), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0},
+{"format", "", FOFFSET(format), AV_OPT_TYPE_INT, {.i64 = -1 }, 0, INT_MAX, 0},
+{"channel_layout", "", FOFFSET(channel_layout), AV_OPT_TYPE_INT64, {.i64 = 0 }, 0, INT64_MAX, 0},
+{"sample_rate", "", FOFFSET(sample_rate), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0},
+{NULL},
+};
+
+static const AVClass av_frame_class = {
+    .class_name              = "AVFrame",
+    .item_name               = NULL,
+    .option                  = frame_options,
+    .version                 = LIBAVUTIL_VERSION_INT,
+};
+
+const AVClass *avcodec_get_frame_class(void)
+{
+    return &av_frame_class;
+}
+
+#define SROFFSET(x) offsetof(AVSubtitleRect,x)
+
+static const AVOption subtitle_rect_options[]={
+{"x", "", SROFFSET(x), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0},
+{"y", "", SROFFSET(y), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0},
+{"w", "", SROFFSET(w), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0},
+{"h", "", SROFFSET(h), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0},
+{"type", "", SROFFSET(type), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0},
+{"flags", "", SROFFSET(flags), AV_OPT_TYPE_FLAGS, {.i64 = 0}, 0, 1, 0, "flags"},
+{"forced", "", SROFFSET(flags), AV_OPT_TYPE_FLAGS, {.i64 = 0}, 0, 1, 0},
+{NULL},
+};
+
+static const AVClass av_subtitle_rect_class = {
+    .class_name             = "AVSubtitleRect",
+    .item_name              = NULL,
+    .option                 = subtitle_rect_options,
+    .version                = LIBAVUTIL_VERSION_INT,
+};
+
+const AVClass *avcodec_get_subtitle_rect_class(void)
+{
+    return &av_subtitle_rect_class;
+}
+
+#ifdef TEST
+static int dummy_init(AVCodecContext *ctx)
+{
+    //TODO: this code should set every possible pointer that could be set by codec and is not an option;
+    ctx->extradata_size = 8;
+    ctx->extradata = av_malloc(ctx->extradata_size);
+    return 0;
+}
+
+static int dummy_close(AVCodecContext *ctx)
+{
+    av_freep(&ctx->extradata);
+    ctx->extradata_size = 0;
+    return 0;
+}
+
+static int dummy_encode(AVCodecContext *ctx, AVPacket *pkt, const AVFrame *frame, int *got_packet)
+{
+    return AVERROR(ENOSYS);
+}
+
+typedef struct Dummy12Context {
+    AVClass  *av_class;
+    int      num;
+    char*    str;
+} Dummy12Context;
+
+typedef struct Dummy3Context {
+    void     *fake_av_class;
+    int      num;
+    char*    str;
+} Dummy3Context;
+
+#define OFFSET(x) offsetof(Dummy12Context, x)
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption dummy_options[] = {
+    { "str", "set str", OFFSET(str), AV_OPT_TYPE_STRING, { .str = "i'm src default value" }, 0, 0, VE},
+    { "num", "set num", OFFSET(num), AV_OPT_TYPE_INT,    { .i64 = 1500100900 },    0, INT_MAX, VE},
+    { NULL },
+};
+
+static const AVClass dummy_v1_class = {
+    .class_name = "dummy_v1_class",
+    .item_name  = av_default_item_name,
+    .option     = dummy_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+static const AVClass dummy_v2_class = {
+    .class_name = "dummy_v2_class",
+    .item_name  = av_default_item_name,
+    .option     = dummy_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+/* codec with options */
+static AVCodec dummy_v1_encoder = {
+    .name             = "dummy_v1_codec",
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_NONE - 1,
+    .encode2          = dummy_encode,
+    .init             = dummy_init,
+    .close            = dummy_close,
+    .priv_class       = &dummy_v1_class,
+    .priv_data_size   = sizeof(Dummy12Context),
+};
+
+/* codec with options, different class */
+static AVCodec dummy_v2_encoder = {
+    .name             = "dummy_v2_codec",
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_NONE - 2,
+    .encode2          = dummy_encode,
+    .init             = dummy_init,
+    .close            = dummy_close,
+    .priv_class       = &dummy_v2_class,
+    .priv_data_size   = sizeof(Dummy12Context),
+};
+
+/* codec with priv data, but no class */
+static AVCodec dummy_v3_encoder = {
+    .name             = "dummy_v3_codec",
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_NONE - 3,
+    .encode2          = dummy_encode,
+    .init             = dummy_init,
+    .close            = dummy_close,
+    .priv_data_size   = sizeof(Dummy3Context),
+};
+
+/* codec without priv data */
+static AVCodec dummy_v4_encoder = {
+    .name             = "dummy_v4_codec",
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_NONE - 4,
+    .encode2          = dummy_encode,
+    .init             = dummy_init,
+    .close            = dummy_close,
+};
+
+static void test_copy_print_codec(const AVCodecContext *ctx)
+{
+    printf("%-14s: %dx%d prv: %s",
+           ctx->codec ? ctx->codec->name : "NULL",
+           ctx->width, ctx->height,
+           ctx->priv_data ? "set" : "null");
+    if (ctx->codec && ctx->codec->priv_class && ctx->codec->priv_data_size) {
+        int64_t i64;
+        char *str = NULL;
+        av_opt_get_int(ctx->priv_data, "num", 0, &i64);
+        av_opt_get(ctx->priv_data, "str", 0, (uint8_t**)&str);
+        printf(" opts: %"PRId64" %s", i64, str);
+        av_free(str);
+    }
+    printf("\n");
+}
+
+static void test_copy(const AVCodec *c1, const AVCodec *c2)
+{
+    AVCodecContext *ctx1, *ctx2;
+    printf("%s -> %s\nclosed:\n", c1 ? c1->name : "NULL", c2 ? c2->name : "NULL");
+    ctx1 = avcodec_alloc_context3(c1);
+    ctx2 = avcodec_alloc_context3(c2);
+    ctx1->width = ctx1->height = 128;
+    if (ctx2->codec && ctx2->codec->priv_class && ctx2->codec->priv_data_size) {
+        av_opt_set(ctx2->priv_data, "num", "667", 0);
+        av_opt_set(ctx2->priv_data, "str", "i'm dest value before copy", 0);
+    }
+    avcodec_copy_context(ctx2, ctx1);
+    test_copy_print_codec(ctx1);
+    test_copy_print_codec(ctx2);
+    if (ctx1->codec) {
+        printf("opened:\n");
+        avcodec_open2(ctx1, ctx1->codec, NULL);
+        if (ctx2->codec && ctx2->codec->priv_class && ctx2->codec->priv_data_size) {
+            av_opt_set(ctx2->priv_data, "num", "667", 0);
+            av_opt_set(ctx2->priv_data, "str", "i'm dest value before copy", 0);
+        }
+        avcodec_copy_context(ctx2, ctx1);
+        test_copy_print_codec(ctx1);
+        test_copy_print_codec(ctx2);
+        avcodec_close(ctx1);
+    }
+    avcodec_free_context(&ctx1);
+    avcodec_free_context(&ctx2);
+}
+
+int main(void)
+{
+    AVCodec *dummy_codec[] = {
+        &dummy_v1_encoder,
+        &dummy_v2_encoder,
+        &dummy_v3_encoder,
+        &dummy_v4_encoder,
+        NULL,
+    };
+    int i, j;
+
+    for (i = 0; dummy_codec[i]; i++)
+        avcodec_register(dummy_codec[i]);
+
+    printf("testing avcodec_copy_context()\n");
+    for (i = 0; i < FF_ARRAY_ELEMS(dummy_codec); i++)
+        for (j = 0; j < FF_ARRAY_ELEMS(dummy_codec); j++)
+            test_copy(dummy_codec[i], dummy_codec[j]);
+    return 0;
+}
+#endif
diff --git a/libavcodec/options_table.h b/libavcodec/options_table.h
index d2f6269..adfbe72 100644
--- a/libavcodec/options_table.h
+++ b/libavcodec/options_table.h
@@ -1,18 +1,21 @@
 /*
- * This file is part of Libav.
+ * Copyright (c) 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,7 +29,6 @@
 #include "libavutil/opt.h"
 #include "avcodec.h"
 #include "version.h"
-#include "config.h"
 
 #define OFFSET(x) offsetof(AVCodecContext,x)
 #define DEFAULT 0 //should be NAN but it does not work as it is not a constant in glibc as required by ANSI/ISO C
@@ -40,12 +42,13 @@
 #define AV_CODEC_DEFAULT_BITRATE 200*1000
 
 static const AVOption avcodec_options[] = {
-{"b", "set bitrate (in bits/s)", OFFSET(bit_rate), AV_OPT_TYPE_INT, {.i64 = AV_CODEC_DEFAULT_BITRATE }, INT_MIN, INT_MAX, V|A|E},
+{"b", "set bitrate (in bits/s)", OFFSET(bit_rate), AV_OPT_TYPE_INT64, {.i64 = AV_CODEC_DEFAULT_BITRATE }, 0, INT64_MAX, A|V|E},
+{"ab", "set bitrate (in bits/s)", OFFSET(bit_rate), AV_OPT_TYPE_INT64, {.i64 = 128*1000 }, 0, INT_MAX, A|E},
 {"bt", "Set video bitrate tolerance (in bits/s). In 1-pass mode, bitrate tolerance specifies how far "
        "ratecontrol is willing to deviate from the target average bitrate value. This is not related "
        "to minimum/maximum bitrate. Lowering tolerance too much has an adverse effect on quality.",
        OFFSET(bit_rate_tolerance), AV_OPT_TYPE_INT, {.i64 = AV_CODEC_DEFAULT_BITRATE*20 }, 1, INT_MAX, V|E},
-{"flags", NULL, OFFSET(flags), AV_OPT_TYPE_FLAGS, {.i64 = DEFAULT }, 0, UINT_MAX, V|A|E|D, "flags"},
+{"flags", NULL, OFFSET(flags), AV_OPT_TYPE_FLAGS, {.i64 = DEFAULT }, 0, UINT_MAX, V|A|S|E|D, "flags"},
 {"unaligned", "allow decoders to produce unaligned output", 0, AV_OPT_TYPE_CONST, { .i64 = AV_CODEC_FLAG_UNALIGNED }, INT_MIN, INT_MAX, V | D, "flags" },
 {"mv4", "use four motion vectors per macroblock (MPEG-4)", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_4MV }, INT_MIN, INT_MAX, V|E, "flags"},
 {"qpel", "use 1/4-pel motion compensation", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_QPEL }, INT_MIN, INT_MAX, V|E, "flags"},
@@ -67,7 +70,7 @@ static const AVOption avcodec_options[] = {
 {"emu_edge", "do not draw edges", 0, AV_OPT_TYPE_CONST, {.i64 = CODEC_FLAG_EMU_EDGE }, INT_MIN, INT_MAX, 0, "flags"},
 #endif
 {"psnr", "error[?] variables will be set during encoding", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_PSNR }, INT_MIN, INT_MAX, V|E, "flags"},
-{"truncated", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_TRUNCATED }, INT_MIN, INT_MAX, 0, "flags"},
+{"truncated", "Input bitstream might be randomly truncated", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_TRUNCATED }, INT_MIN, INT_MAX, V|D, "flags"},
 #if FF_API_NORMALIZE_AQP
 {"naq", "normalize adaptive quantization", 0, AV_OPT_TYPE_CONST, {.i64 = CODEC_FLAG_NORMALIZE_AQP }, INT_MIN, INT_MAX, V|E, "flags"},
 #endif
@@ -81,13 +84,18 @@ static const AVOption avcodec_options[] = {
 {"output_corrupt", "Output even potentially corrupted frames", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_OUTPUT_CORRUPT }, INT_MIN, INT_MAX, V|D, "flags"},
 {"fast", "allow non-spec-compliant speedup tricks", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_FAST }, INT_MIN, INT_MAX, V|E, "flags2"},
 {"noout", "skip bitstream encoding", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_NO_OUTPUT }, INT_MIN, INT_MAX, V|E, "flags2"},
-{"ignorecrop", "ignore cropping information from sps", 1, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_IGNORE_CROP }, INT_MIN, INT_MAX, V|D, "flags2"},
+{"ignorecrop", "ignore cropping information from sps", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_IGNORE_CROP }, INT_MIN, INT_MAX, V|D, "flags2"},
 {"local_header", "place global headers at every keyframe instead of in extradata", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_LOCAL_HEADER }, INT_MIN, INT_MAX, V|E, "flags2"},
+{"chunks", "Frame data might be split into multiple chunks", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_CHUNKS }, INT_MIN, INT_MAX, V|D, "flags2"},
+{"showall", "Show all frames before the first keyframe", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_SHOW_ALL }, INT_MIN, INT_MAX, V|D, "flags2"},
+{"export_mvs", "export motion vectors through frame side data", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_EXPORT_MVS}, INT_MIN, INT_MAX, V|D, "flags2"},
+{"skip_manual", "do not skip samples and export skip information as frame side data", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_SKIP_MANUAL}, INT_MIN, INT_MAX, V|D, "flags2"},
+{"ass_ro_flush_noop", "do not reset ASS ReadOrder field on flush", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_RO_FLUSH_NOOP}, INT_MIN, INT_MAX, S|D, "flags2"},
 #if FF_API_MOTION_EST
 {"me_method", "set motion estimation method", OFFSET(me_method), AV_OPT_TYPE_INT, {.i64 = ME_EPZS }, INT_MIN, INT_MAX, V|E, "me_method"},
 {"zero", "zero motion estimation (fastest)", 0, AV_OPT_TYPE_CONST, {.i64 = ME_ZERO }, INT_MIN, INT_MAX, V|E, "me_method" },
 {"full", "full motion estimation (slowest)", 0, AV_OPT_TYPE_CONST, {.i64 = ME_FULL }, INT_MIN, INT_MAX, V|E, "me_method" },
-{"epzs", "EPZS motion estimation (default)", 0, AV_OPT_TYPE_CONST, {.i64 = ME_EPZS }, INT_MIN, INT_MAX, V|E, "me_method" },
+{"epzs", "EPZS motion estimation", 0, AV_OPT_TYPE_CONST, {.i64 = ME_EPZS }, INT_MIN, INT_MAX, V|E, "me_method" },
 {"esa", "esa motion estimation (alias for full)", 0, AV_OPT_TYPE_CONST, {.i64 = ME_FULL }, INT_MIN, INT_MAX, V|E, "me_method" },
 {"tesa", "tesa motion estimation", 0, AV_OPT_TYPE_CONST, {.i64 = ME_TESA }, INT_MIN, INT_MAX, V|E, "me_method" },
 {"dia", "diamond motion estimation (alias for EPZS)", 0, AV_OPT_TYPE_CONST, {.i64 = ME_EPZS }, INT_MIN, INT_MAX, V|E, "me_method" },
@@ -96,12 +104,12 @@ static const AVOption avcodec_options[] = {
 {"x1", "X1 motion estimation", 0, AV_OPT_TYPE_CONST, {.i64 = ME_X1 }, INT_MIN, INT_MAX, V|E, "me_method" },
 {"hex", "hex motion estimation", 0, AV_OPT_TYPE_CONST, {.i64 = ME_HEX }, INT_MIN, INT_MAX, V|E, "me_method" },
 {"umh", "umh motion estimation", 0, AV_OPT_TYPE_CONST, {.i64 = ME_UMH }, INT_MIN, INT_MAX, V|E, "me_method" },
+{"iter", "iter motion estimation", 0, AV_OPT_TYPE_CONST, {.i64 = ME_ITER }, INT_MIN, INT_MAX, V|E, "me_method" },
 #endif
-{"extradata_size", NULL, OFFSET(extradata_size), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
 {"time_base", NULL, OFFSET(time_base), AV_OPT_TYPE_RATIONAL, {.dbl = 0}, INT_MIN, INT_MAX},
 {"g", "set the group of picture (GOP) size", OFFSET(gop_size), AV_OPT_TYPE_INT, {.i64 = 12 }, INT_MIN, INT_MAX, V|E},
-{"ar", "set audio sampling rate (in Hz)", OFFSET(sample_rate), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, A|D|E},
-{"ac", "set number of audio channels", OFFSET(channels), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, A|D|E},
+{"ar", "set audio sampling rate (in Hz)", OFFSET(sample_rate), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX, A|D|E},
+{"ac", "set number of audio channels", OFFSET(channels), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX, A|D|E},
 {"cutoff", "set cutoff bandwidth", OFFSET(cutoff), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, A|E},
 {"frame_size", NULL, OFFSET(frame_size), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, A|E},
 {"frame_number", NULL, OFFSET(frame_number), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
@@ -111,9 +119,9 @@ static const AVOption avcodec_options[] = {
           OFFSET(qcompress), AV_OPT_TYPE_FLOAT, {.dbl = 0.5 }, -FLT_MAX, FLT_MAX, V|E},
 {"qblur", "video quantizer scale blur (VBR)", OFFSET(qblur), AV_OPT_TYPE_FLOAT, {.dbl = 0.5 }, -1, FLT_MAX, V|E},
 {"qmin", "minimum video quantizer scale (VBR)", OFFSET(qmin), AV_OPT_TYPE_INT, {.i64 = 2 }, -1, 69, V|E},
-{"qmax", "maximum video quantizer scale (VBR)", OFFSET(qmax), AV_OPT_TYPE_INT, {.i64 = 31 }, -1, 69, V|E},
+{"qmax", "maximum video quantizer scale (VBR)", OFFSET(qmax), AV_OPT_TYPE_INT, {.i64 = 31 }, -1, 1024, V|E},
 {"qdiff", "maximum difference between the quantizer scales (VBR)", OFFSET(max_qdiff), AV_OPT_TYPE_INT, {.i64 = 3 }, INT_MIN, INT_MAX, V|E},
-{"bf", "use 'frames' B-frames", OFFSET(max_b_frames), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, -1, INT_MAX, V|E},
+{"bf", "set maximum number of B-frames between non-B-frames", OFFSET(max_b_frames), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, -1, INT_MAX, V|E},
 {"b_qfactor", "QP factor between P- and B-frames", OFFSET(b_quant_factor), AV_OPT_TYPE_FLOAT, {.dbl = 1.25 }, -FLT_MAX, FLT_MAX, V|E},
 #if FF_API_RC_STRATEGY
 {"rc_strategy", "ratecontrol method", OFFSET(rc_strategy), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
@@ -156,17 +164,21 @@ static const AVOption avcodec_options[] = {
 {"ms", "work around various bugs in Microsoft's broken decoders", 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_MS }, INT_MIN, INT_MAX, V|D, "bug"},
 {"trunc", "truncated frames", 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_TRUNCATED}, INT_MIN, INT_MAX, V|D, "bug"},
 {"strict", "how strictly to follow the standards", OFFSET(strict_std_compliance), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, A|V|D|E, "strict"},
-{"very", "strictly conform to a older more strict version of the spec or reference software", 0, AV_OPT_TYPE_CONST, {.i64 = FF_COMPLIANCE_VERY_STRICT }, INT_MIN, INT_MAX, V|D|E, "strict"},
-{"strict", "strictly conform to all the things in the spec no matter what the consequences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_COMPLIANCE_STRICT }, INT_MIN, INT_MAX, V|D|E, "strict"},
-{"normal", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_COMPLIANCE_NORMAL }, INT_MIN, INT_MAX, V|D|E, "strict"},
-{"unofficial", "allow unofficial extensions", 0, AV_OPT_TYPE_CONST, {.i64 = FF_COMPLIANCE_UNOFFICIAL }, INT_MIN, INT_MAX, V|D|E, "strict"},
-{"experimental", "allow non-standardized experimental things", 0, AV_OPT_TYPE_CONST, {.i64 = FF_COMPLIANCE_EXPERIMENTAL }, INT_MIN, INT_MAX, V|D|E, "strict"},
+{"very", "strictly conform to a older more strict version of the spec or reference software", 0, AV_OPT_TYPE_CONST, {.i64 = FF_COMPLIANCE_VERY_STRICT }, INT_MIN, INT_MAX, A|V|D|E, "strict"},
+{"strict", "strictly conform to all the things in the spec no matter what the consequences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_COMPLIANCE_STRICT }, INT_MIN, INT_MAX, A|V|D|E, "strict"},
+{"normal", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_COMPLIANCE_NORMAL }, INT_MIN, INT_MAX, A|V|D|E, "strict"},
+{"unofficial", "allow unofficial extensions", 0, AV_OPT_TYPE_CONST, {.i64 = FF_COMPLIANCE_UNOFFICIAL }, INT_MIN, INT_MAX, A|V|D|E, "strict"},
+{"experimental", "allow non-standardized experimental things", 0, AV_OPT_TYPE_CONST, {.i64 = FF_COMPLIANCE_EXPERIMENTAL }, INT_MIN, INT_MAX, A|V|D|E, "strict"},
 {"b_qoffset", "QP offset between P- and B-frames", OFFSET(b_quant_offset), AV_OPT_TYPE_FLOAT, {.dbl = 1.25 }, -FLT_MAX, FLT_MAX, V|E},
-{"err_detect", "set error detection flags", OFFSET(err_recognition), AV_OPT_TYPE_FLAGS, {.i64 = 0}, INT_MIN, INT_MAX, A|V|D, "err_detect"},
-{"crccheck", "verify embedded CRCs", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_CRCCHECK }, INT_MIN, INT_MAX, V|D, "err_detect"},
-{"bitstream", "detect bitstream specification deviations", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_BITSTREAM }, INT_MIN, INT_MAX, V|D, "err_detect"},
-{"buffer", "detect improper bitstream length", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_BUFFER }, INT_MIN, INT_MAX, V|D, "err_detect"},
-{"explode", "abort decoding on minor error detection", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_EXPLODE }, INT_MIN, INT_MAX, V|D, "err_detect"},
+{"err_detect", "set error detection flags", OFFSET(err_recognition), AV_OPT_TYPE_FLAGS, {.i64 = 0 }, INT_MIN, INT_MAX, A|V|D, "err_detect"},
+{"crccheck", "verify embedded CRCs", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_CRCCHECK }, INT_MIN, INT_MAX, A|V|D, "err_detect"},
+{"bitstream", "detect bitstream specification deviations", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_BITSTREAM }, INT_MIN, INT_MAX, A|V|D, "err_detect"},
+{"buffer", "detect improper bitstream length", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_BUFFER }, INT_MIN, INT_MAX, A|V|D, "err_detect"},
+{"explode", "abort decoding on minor error detection", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_EXPLODE }, INT_MIN, INT_MAX, A|V|D, "err_detect"},
+{"ignore_err", "ignore errors", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_IGNORE_ERR }, INT_MIN, INT_MAX, A|V|D, "err_detect"},
+{"careful",    "consider things that violate the spec, are fast to check and have not been seen in the wild as errors", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_CAREFUL }, INT_MIN, INT_MAX, A|V|D, "err_detect"},
+{"compliant",  "consider all spec non compliancies as errors", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_COMPLIANT }, INT_MIN, INT_MAX, A|V|D, "err_detect"},
+{"aggressive", "consider things that a sane encoder should not do as an error", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_AGGRESSIVE }, INT_MIN, INT_MAX, A|V|D, "err_detect"},
 {"has_b_frames", NULL, OFFSET(has_b_frames), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
 {"block_align", NULL, OFFSET(block_align), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
 #if FF_API_PRIVATE_OPT
@@ -181,9 +193,9 @@ static const AVOption avcodec_options[] = {
 #if FF_API_MPV_OPT
 {"rc_eq", "deprecated, use encoder private options instead", OFFSET(rc_eq), AV_OPT_TYPE_STRING, {.str = NULL}, CHAR_MIN, CHAR_MAX, V|E},
 #endif
-{"maxrate", "Set maximum bitrate tolerance (in bits/s). Requires bufsize to be set.", OFFSET(rc_max_rate), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|A|E},
-{"minrate", "Set minimum bitrate tolerance (in bits/s). Most useful in setting up a CBR encode. It is of little use otherwise.",
-            OFFSET(rc_min_rate), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|A|E},
+{"maxrate", "maximum bitrate (in bits/s). Used for VBV together with bufsize.", OFFSET(rc_max_rate), AV_OPT_TYPE_INT64, {.i64 = DEFAULT }, 0, INT_MAX, V|A|E},
+{"minrate", "minimum bitrate (in bits/s). Most useful in setting up a CBR encode. It is of little use otherwise.",
+            OFFSET(rc_min_rate), AV_OPT_TYPE_INT64, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|A|E},
 {"bufsize", "set ratecontrol buffer size (in bits)", OFFSET(rc_buffer_size), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, A|V|E},
 #if FF_API_MPV_OPT
 {"rc_buf_aggressivity", "deprecated, use encoder private options instead", OFFSET(rc_buffer_aggressivity), AV_OPT_TYPE_FLOAT, {.dbl = 1.0 }, -FLT_MAX, FLT_MAX, V|E},
@@ -194,7 +206,7 @@ static const AVOption avcodec_options[] = {
 {"rc_init_cplx", "deprecated, use encoder private options instead", OFFSET(rc_initial_cplx), AV_OPT_TYPE_FLOAT, {.dbl = DEFAULT }, -FLT_MAX, FLT_MAX, V|E},
 #endif
 {"dct", "DCT algorithm", OFFSET(dct_algo), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX, V|E, "dct"},
-{"auto", "autoselect a good one (default)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_AUTO }, INT_MIN, INT_MAX, V|E, "dct"},
+{"auto", "autoselect a good one", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_AUTO }, INT_MIN, INT_MAX, V|E, "dct"},
 {"fastint", "fast integer", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_FASTINT }, INT_MIN, INT_MAX, V|E, "dct"},
 {"int", "accurate integer", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_INT }, INT_MIN, INT_MAX, V|E, "dct"},
 {"mmx", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_MMX }, INT_MIN, INT_MAX, V|E, "dct"},
@@ -226,14 +238,14 @@ static const AVOption avcodec_options[] = {
 {"ipp", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_IPP }, INT_MIN, INT_MAX, V|E|D, "idct"},
 #endif /* FF_API_UNUSED_MEMBERS */
 {"xvid", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_XVID }, INT_MIN, INT_MAX, V|E|D, "idct"},
-#if FF_API_IDCT_XVIDMMX
-{"xvidmmx", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_XVID }, INT_MIN, INT_MAX, V|E|D, "idct"},
-#endif /* FF_API_IDCT_XVIDMMX */
+{"xvidmmx", "deprecated, for compatibility only", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_XVID }, INT_MIN, INT_MAX, V|E|D, "idct"},
 {"faani", "floating point AAN IDCT", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_FAAN }, INT_MIN, INT_MAX, V|D|E, "idct"},
+{"simpleauto", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEAUTO }, INT_MIN, INT_MAX, V|E|D, "idct"},
 {"slice_count", NULL, OFFSET(slice_count), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
 {"ec", "set error concealment strategy", OFFSET(error_concealment), AV_OPT_TYPE_FLAGS, {.i64 = 3 }, INT_MIN, INT_MAX, V|D, "ec"},
 {"guess_mvs", "iterative motion vector (MV) search (slow)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_EC_GUESS_MVS }, INT_MIN, INT_MAX, V|D, "ec"},
 {"deblock", "use strong deblock filter for damaged MBs", 0, AV_OPT_TYPE_CONST, {.i64 = FF_EC_DEBLOCK }, INT_MIN, INT_MAX, V|D, "ec"},
+{"favor_inter", "favor predicting from the previous frame", 0, AV_OPT_TYPE_CONST, {.i64 = FF_EC_FAVOR_INTER }, INT_MIN, INT_MAX, V|D, "ec"},
 {"bits_per_coded_sample", NULL, OFFSET(bits_per_coded_sample), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
 #if FF_API_PRIVATE_OPT
 {"pred", "prediction method", OFFSET(prediction_method), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E, "pred"},
@@ -242,6 +254,7 @@ static const AVOption avcodec_options[] = {
 {"median", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PRED_MEDIAN }, INT_MIN, INT_MAX, V|E, "pred"},
 #endif
 {"aspect", "sample aspect ratio", OFFSET(sample_aspect_ratio), AV_OPT_TYPE_RATIONAL, {.dbl = 0}, 0, 10, V|E},
+{"sar",    "sample aspect ratio", OFFSET(sample_aspect_ratio), AV_OPT_TYPE_RATIONAL, {.dbl = 0}, 0, 10, V|E},
 {"debug", "print specific debug info", OFFSET(debug), AV_OPT_TYPE_FLAGS, {.i64 = DEFAULT }, 0, INT_MAX, V|A|S|E|D, "debug"},
 {"pict", "picture info", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_PICT_INFO }, INT_MIN, INT_MAX, V|D, "debug"},
 {"rc", "rate control", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_RC }, INT_MIN, INT_MAX, V|E, "debug"},
@@ -252,6 +265,7 @@ static const AVOption avcodec_options[] = {
 {"mv", "motion vector", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_MV }, INT_MIN, INT_MAX, V|D, "debug"},
 #endif
 {"dct_coeff", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_DCT_COEFF }, INT_MIN, INT_MAX, V|D, "debug"},
+{"green_metadata", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_GREEN_MD }, INT_MIN, INT_MAX, V|D, "debug"},
 {"skip", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_SKIP }, INT_MIN, INT_MAX, V|D, "debug"},
 {"startcode", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_STARTCODE }, INT_MIN, INT_MAX, V|D, "debug"},
 #if FF_API_UNUSED_MEMBERS
@@ -260,14 +274,13 @@ static const AVOption avcodec_options[] = {
 {"er", "error recognition", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_ER }, INT_MIN, INT_MAX, V|D, "debug"},
 {"mmco", "memory management control operations (H.264)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_MMCO }, INT_MIN, INT_MAX, V|D, "debug"},
 {"bugs", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_BUGS }, INT_MIN, INT_MAX, V|D, "debug"},
-#if FF_API_DEBUG_MV
 {"vis_qp", "visualize quantization parameter (QP), lower QP are tinted greener", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_VIS_QP }, INT_MIN, INT_MAX, V|D, "debug"},
 {"vis_mb_type", "visualize block types", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_VIS_MB_TYPE }, INT_MIN, INT_MAX, V|D, "debug"},
-#endif
 {"buffers", "picture buffer allocations", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_BUFFERS }, INT_MIN, INT_MAX, V|D, "debug"},
-{"thread_ops", "threading operations", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_THREADS }, INT_MIN, INT_MAX, V|D, "debug"},
-#if FF_API_DEBUG_MV
-{"vismv", "visualize motion vectors (MVs)", OFFSET(debug_mv), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX, V|D, "debug_mv"},
+{"thread_ops", "threading operations", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_THREADS }, INT_MIN, INT_MAX, V|A|D, "debug"},
+{"nomc", "skip motion compensation", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_NOMC }, INT_MIN, INT_MAX, V|A|D, "debug"},
+#if FF_API_VISMV
+{"vismv", "visualize motion vectors (MVs) (deprecated)", OFFSET(debug_mv), AV_OPT_TYPE_FLAGS, {.i64 = DEFAULT }, 0, INT_MAX, V|D, "debug_mv"},
 {"pf", "forward predicted MVs of P-frames", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_VIS_MV_P_FOR }, INT_MIN, INT_MAX, V|D, "debug_mv"},
 {"bf", "forward predicted MVs of B-frames", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_VIS_MV_B_FOR }, INT_MIN, INT_MAX, V|D, "debug_mv"},
 {"bb", "backward predicted MVs of B-frames", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_VIS_MV_B_BACK }, INT_MIN, INT_MAX, V|D, "debug_mv"},
@@ -282,7 +295,7 @@ static const AVOption avcodec_options[] = {
 {"preme", "pre motion estimation", OFFSET(pre_me), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
 #endif
 {"precmp", "pre motion estimation compare function", OFFSET(me_pre_cmp), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E, "cmp_func"},
-{"sad", "sum of absolute differences, fast (default)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_SAD }, INT_MIN, INT_MAX, V|E, "cmp_func"},
+{"sad", "sum of absolute differences, fast", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_SAD }, INT_MIN, INT_MAX, V|E, "cmp_func"},
 {"sse", "sum of squared errors", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_SSE }, INT_MIN, INT_MAX, V|E, "cmp_func"},
 {"satd", "sum of absolute Hadamard transformed differences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_SATD }, INT_MIN, INT_MAX, V|E, "cmp_func"},
 {"dct", "sum of absolute DCT transformed differences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_DCT }, INT_MIN, INT_MAX, V|E, "cmp_func"},
@@ -293,6 +306,10 @@ static const AVOption avcodec_options[] = {
 {"vsad", "sum of absolute vertical differences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_VSAD }, INT_MIN, INT_MAX, V|E, "cmp_func"},
 {"vsse", "sum of squared vertical differences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_VSSE }, INT_MIN, INT_MAX, V|E, "cmp_func"},
 {"nsse", "noise preserving sum of squared differences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_NSSE }, INT_MIN, INT_MAX, V|E, "cmp_func"},
+#if CONFIG_SNOW_ENCODER
+{"w53", "5/3 wavelet, only used in snow", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_W53 }, INT_MIN, INT_MAX, V|E, "cmp_func"},
+{"w97", "9/7 wavelet, only used in snow", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_W97 }, INT_MIN, INT_MAX, V|E, "cmp_func"},
+#endif
 {"dctmax", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_DCTMAX }, INT_MIN, INT_MAX, V|E, "cmp_func"},
 {"chroma", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_CHROMA }, INT_MIN, INT_MAX, V|E, "cmp_func"},
 {"pre_dia_size", "diamond type & size for motion estimation pre-pass", OFFSET(pre_dia_size), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
@@ -323,8 +340,8 @@ static const AVOption avcodec_options[] = {
 #if FF_API_XVMC
 {"xvmc_acceleration", NULL, OFFSET(xvmc_acceleration), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
 #endif /* FF_API_XVMC */
-{"mbd", "macroblock decision algorithm (high quality mode)", OFFSET(mb_decision), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E, "mbd"},
-{"simple", "use mbcmp (default)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_MB_DECISION_SIMPLE }, INT_MIN, INT_MAX, V|E, "mbd"},
+{"mbd", "macroblock decision algorithm (high quality mode)", OFFSET(mb_decision), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, 2, V|E, "mbd"},
+{"simple", "use mbcmp", 0, AV_OPT_TYPE_CONST, {.i64 = FF_MB_DECISION_SIMPLE }, INT_MIN, INT_MAX, V|E, "mbd"},
 {"bits", "use fewest bits", 0, AV_OPT_TYPE_CONST, {.i64 = FF_MB_DECISION_BITS }, INT_MIN, INT_MAX, V|E, "mbd"},
 {"rd", "use best rate distortion", 0, AV_OPT_TYPE_CONST, {.i64 = FF_MB_DECISION_RD }, INT_MIN, INT_MAX, V|E, "mbd"},
 #if FF_API_STREAM_CODEC_TAG
@@ -345,13 +362,13 @@ static const AVOption avcodec_options[] = {
 #if FF_API_ERROR_RATE
 {"error", NULL, OFFSET(error_rate), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
 #endif
-{"threads", NULL, OFFSET(thread_count), AV_OPT_TYPE_INT, {.i64 = 1 }, 0, INT_MAX, V|E|D, "threads"},
+{"threads", "set the number of threads", OFFSET(thread_count), AV_OPT_TYPE_INT, {.i64 = 1 }, 0, INT_MAX, V|A|E|D, "threads"},
 {"auto", "autodetect a suitable number of threads to use", 0, AV_OPT_TYPE_CONST, {.i64 = 0 }, INT_MIN, INT_MAX, V|E|D, "threads"},
 #if FF_API_MPV_OPT
 {"me_threshold", "motion estimation threshold", OFFSET(me_threshold), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
 {"mb_threshold", "macroblock threshold", OFFSET(mb_threshold), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
 #endif
-{"dc", "intra_dc_precision", OFFSET(intra_dc_precision), AV_OPT_TYPE_INT, {.i64 = 0 }, INT_MIN, INT_MAX, V|E},
+{"dc", "intra_dc_precision", OFFSET(intra_dc_precision), AV_OPT_TYPE_INT, {.i64 = 0 }, -8, 16, V|E},
 {"nssew", "nsse weight", OFFSET(nsse_weight), AV_OPT_TYPE_INT, {.i64 = 8 }, INT_MIN, INT_MAX, V|E},
 {"skip_top", "number of macroblock rows at the top which are skipped", OFFSET(skip_top), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|D},
 {"skip_bottom", "number of macroblock rows at the bottom which are skipped", OFFSET(skip_bottom), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|D},
@@ -372,8 +389,13 @@ static const AVOption avcodec_options[] = {
 {"dts_96_24", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_DTS_96_24 }, INT_MIN, INT_MAX, A|E, "profile"},
 {"dts_hd_hra", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_DTS_HD_HRA }, INT_MIN, INT_MAX, A|E, "profile"},
 {"dts_hd_ma", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_DTS_HD_MA }, INT_MIN, INT_MAX, A|E, "profile"},
+{"mpeg4_sp",   NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_MPEG4_SIMPLE }, INT_MIN, INT_MAX, V|E, "profile"},
+{"mpeg4_core", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_MPEG4_CORE }, INT_MIN, INT_MAX, V|E, "profile"},
+{"mpeg4_main", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_MPEG4_MAIN }, INT_MIN, INT_MAX, V|E, "profile"},
+{"mpeg4_asp",  NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_MPEG4_ADVANCED_SIMPLE }, INT_MIN, INT_MAX, V|E, "profile"},
 {"level", NULL, OFFSET(level), AV_OPT_TYPE_INT, {.i64 = FF_LEVEL_UNKNOWN }, INT_MIN, INT_MAX, V|A|E, "level"},
 {"unknown", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_LEVEL_UNKNOWN }, INT_MIN, INT_MAX, V|A|E, "level"},
+{"lowres", "decode at 1= 1/2, 2=1/4, 3=1/8 resolutions", OFFSET(lowres), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, V|A|D},
 #if FF_API_PRIVATE_OPT
 {"skip_threshold", "frame skip threshold", OFFSET(frame_skip_threshold), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
 {"skip_factor", "frame skip factor", OFFSET(frame_skip_factor), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
@@ -388,20 +410,21 @@ static const AVOption avcodec_options[] = {
 #if FF_API_PRIVATE_OPT
 {"mepc", "motion estimation bitrate penalty compensation (1.0 = 256)", OFFSET(me_penalty_compensation), AV_OPT_TYPE_INT, {.i64 = 256 }, INT_MIN, INT_MAX, V|E},
 #endif
-{"skip_loop_filter", NULL, OFFSET(skip_loop_filter), AV_OPT_TYPE_INT, {.i64 = AVDISCARD_DEFAULT }, INT_MIN, INT_MAX, V|D, "avdiscard"},
-{"skip_idct"       , NULL, OFFSET(skip_idct)       , AV_OPT_TYPE_INT, {.i64 = AVDISCARD_DEFAULT }, INT_MIN, INT_MAX, V|D, "avdiscard"},
-{"skip_frame"      , NULL, OFFSET(skip_frame)      , AV_OPT_TYPE_INT, {.i64 = AVDISCARD_DEFAULT }, INT_MIN, INT_MAX, V|D, "avdiscard"},
-{"none"            , NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_NONE    }, INT_MIN, INT_MAX, V|D, "avdiscard"},
-{"default"         , NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_DEFAULT }, INT_MIN, INT_MAX, V|D, "avdiscard"},
-{"noref"           , NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_NONREF  }, INT_MIN, INT_MAX, V|D, "avdiscard"},
-{"bidir"           , NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_BIDIR   }, INT_MIN, INT_MAX, V|D, "avdiscard"},
-{"nokey"           , NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_NONKEY  }, INT_MIN, INT_MAX, V|D, "avdiscard"},
-{"all"             , NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_ALL     }, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"skip_loop_filter", "skip loop filtering process for the selected frames", OFFSET(skip_loop_filter), AV_OPT_TYPE_INT, {.i64 = AVDISCARD_DEFAULT }, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"skip_idct"       , "skip IDCT/dequantization for the selected frames",    OFFSET(skip_idct),        AV_OPT_TYPE_INT, {.i64 = AVDISCARD_DEFAULT }, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"skip_frame"      , "skip decoding for the selected frames",               OFFSET(skip_frame),       AV_OPT_TYPE_INT, {.i64 = AVDISCARD_DEFAULT }, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"none"            , "discard no frame",                    0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_NONE    }, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"default"         , "discard useless frames",              0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_DEFAULT }, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"noref"           , "discard all non-reference frames",    0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_NONREF  }, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"bidir"           , "discard all bidirectional frames",    0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_BIDIR   }, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"nokey"           , "discard all frames except keyframes", 0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_NONKEY  }, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"nointra"         , "discard all frames except I frames",  0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_NONINTRA}, INT_MIN, INT_MAX, V|D, "avdiscard"},
+{"all"             , "discard all frames",                  0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_ALL     }, INT_MIN, INT_MAX, V|D, "avdiscard"},
 {"bidir_refine", "refine the two motion vectors used in bidirectional macroblocks", OFFSET(bidir_refine), AV_OPT_TYPE_INT, {.i64 = 1 }, 0, 4, V|E},
 #if FF_API_PRIVATE_OPT
 {"brd_scale", "downscale frames for dynamic B-frame decision", OFFSET(brd_scale), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, 10, V|E},
 #endif
-{"keyint_min", "minimum interval between IDR-frames (x264)", OFFSET(keyint_min), AV_OPT_TYPE_INT, {.i64 = 25 }, INT_MIN, INT_MAX, V|E},
+{"keyint_min", "minimum interval between IDR-frames", OFFSET(keyint_min), AV_OPT_TYPE_INT, {.i64 = 25 }, INT_MIN, INT_MAX, V|E},
 {"refs", "reference frames to consider for motion compensation", OFFSET(refs), AV_OPT_TYPE_INT, {.i64 = 1 }, INT_MIN, INT_MAX, V|E},
 #if FF_API_PRIVATE_OPT
 {"chromaoffset", "chroma QP offset from luma", OFFSET(chromaoffset), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
@@ -418,24 +441,24 @@ static const AVOption avcodec_options[] = {
 #if FF_API_PRIVATE_OPT
 {"min_prediction_order", NULL, OFFSET(min_prediction_order), AV_OPT_TYPE_INT, {.i64 = -1 }, INT_MIN, INT_MAX, A|E},
 {"max_prediction_order", NULL, OFFSET(max_prediction_order), AV_OPT_TYPE_INT, {.i64 = -1 }, INT_MIN, INT_MAX, A|E},
-{"timecode_frame_start", "GOP timecode frame start number, in non-drop-frame format", OFFSET(timecode_frame_start), AV_OPT_TYPE_INT64, {.i64 = 0 }, 0, INT64_MAX, V|E},
+{"timecode_frame_start", "GOP timecode frame start number, in non-drop-frame format", OFFSET(timecode_frame_start), AV_OPT_TYPE_INT64, {.i64 = -1 }, -1, INT64_MAX, V|E},
 #endif
 {"bits_per_raw_sample", NULL, OFFSET(bits_per_raw_sample), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
 {"channel_layout", NULL, OFFSET(channel_layout), AV_OPT_TYPE_INT64, {.i64 = DEFAULT }, 0, INT64_MAX, A|E|D, "channel_layout"},
 {"request_channel_layout", NULL, OFFSET(request_channel_layout), AV_OPT_TYPE_INT64, {.i64 = DEFAULT }, 0, INT64_MAX, A|D, "request_channel_layout"},
-{"rc_max_vbv_use", NULL, OFFSET(rc_max_available_vbv_use), AV_OPT_TYPE_FLOAT, {.dbl = 1.0/3 }, 0.0, FLT_MAX, V|E},
+{"rc_max_vbv_use", NULL, OFFSET(rc_max_available_vbv_use), AV_OPT_TYPE_FLOAT, {.dbl = 0 }, 0.0, FLT_MAX, V|E},
 {"rc_min_vbv_use", NULL, OFFSET(rc_min_vbv_overflow_use),  AV_OPT_TYPE_FLOAT, {.dbl = 3 },     0.0, FLT_MAX, V|E},
 {"ticks_per_frame", NULL, OFFSET(ticks_per_frame), AV_OPT_TYPE_INT, {.i64 = 1 }, 1, INT_MAX, A|V|E|D},
 {"color_primaries", "color primaries", OFFSET(color_primaries), AV_OPT_TYPE_INT, {.i64 = AVCOL_PRI_UNSPECIFIED }, 1, AVCOL_PRI_NB-1, V|E|D, "color_primaries_type"},
-{"bt709",       "BT.709",      0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT709 },       INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"unspecified", "Unspecified", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_UNSPECIFIED }, INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"bt470m",      "BT.470 M",    0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT470M },      INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"bt470bg",     "BT.470 BG",   0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT470BG },     INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"smpte170m",   "SMPTE 170 M", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTE170M },   INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"smpte240m",   "SMPTE 240 M", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTE240M },   INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"film",        "Film",        0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_FILM },        INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"bt2020",      "BT.2020",     0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT2020 },      INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
-{"smptest428_1", "SMPTE ST 428-1", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTEST428_1 }, INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"bt709",       "BT.709",         0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT709 },        INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"unspecified", "Unspecified",    0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_UNSPECIFIED },  INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"bt470m",      "BT.470 M",       0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT470M },       INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"bt470bg",     "BT.470 BG",      0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT470BG },      INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"smpte170m",   "SMPTE 170 M",    0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTE170M },    INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"smpte240m",   "SMPTE 240 M",    0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTE240M },    INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"film",        "Film",           0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_FILM },         INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"bt2020",      "BT.2020",        0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT2020 },       INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
+{"smpte428_1",  "SMPTE ST 428-1", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTEST428_1 }, INT_MIN, INT_MAX, V|E|D, "color_primaries_type"},
 {"color_trc", "color transfer characteristics", OFFSET(color_trc), AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_UNSPECIFIED }, 1, AVCOL_TRC_NB-1, V|E|D, "color_trc_type"},
 {"bt709",        "BT.709",           0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT709 },        INT_MIN, INT_MAX, V|E|D, "color_trc_type"},
 {"unspecified",  "Unspecified",      0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_UNSPECIFIED },  INT_MIN, INT_MAX, V|E|D, "color_trc_type"},
@@ -451,8 +474,8 @@ static const AVOption avcodec_options[] = {
 {"iec61966_2_1", "IEC 61966-2-1",    0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_IEC61966_2_1 }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"},
 {"bt2020_10bit", "BT.2020 - 10 bit", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_10 },    INT_MIN, INT_MAX, V|E|D, "color_trc_type"},
 {"bt2020_12bit", "BT.2020 - 12 bit", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_12 },    INT_MIN, INT_MAX, V|E|D, "color_trc_type"},
-{"smptest2084",  "SMPTE ST 2084",    0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTEST2084 },  INT_MIN, INT_MAX, V|E|D, "color_trc_type"},
-{"smptest428_1", "SMPTE ST 428-1",   0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTEST428_1 }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"},
+{"smpte2084",    "SMPTE ST 2084",    0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTEST2084 },  INT_MIN, INT_MAX, V|E|D, "color_trc_type"},
+{"smpte428_1",   "SMPTE ST 428-1",   0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTEST428_1 }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"},
 {"colorspace", "color space", OFFSET(colorspace), AV_OPT_TYPE_INT, {.i64 = AVCOL_SPC_UNSPECIFIED }, 0, AVCOL_SPC_NB-1, V|E|D, "colorspace_type"},
 {"rgb",         "RGB",         0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_RGB },         INT_MIN, INT_MAX, V|E|D, "colorspace_type"},
 {"bt709",       "BT.709",      0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT709 },       INT_MIN, INT_MAX, V|E|D, "colorspace_type"},
@@ -477,8 +500,8 @@ static const AVOption avcodec_options[] = {
 {"bottomleft",  "Bottom-left", 0, AV_OPT_TYPE_CONST, {.i64 = AVCHROMA_LOC_BOTTOMLEFT },  INT_MIN, INT_MAX, V|E|D, "chroma_sample_location_type"},
 {"bottom",      "Bottom",      0, AV_OPT_TYPE_CONST, {.i64 = AVCHROMA_LOC_BOTTOM },      INT_MIN, INT_MAX, V|E|D, "chroma_sample_location_type"},
 {"log_level_offset", "set the log level offset", OFFSET(log_level_offset), AV_OPT_TYPE_INT, {.i64 = 0 }, INT_MIN, INT_MAX },
-{"slices", "number of slices, used in parallelized encoding", OFFSET(slices), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, V|E},
-{"thread_type", "select multithreading type", OFFSET(thread_type), AV_OPT_TYPE_FLAGS, {.i64 = FF_THREAD_SLICE|FF_THREAD_FRAME }, 0, INT_MAX, V|E|D, "thread_type"},
+{"slices", "set the number of slices, used in parallelized encoding", OFFSET(slices), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, V|E},
+{"thread_type", "select multithreading type", OFFSET(thread_type), AV_OPT_TYPE_FLAGS, {.i64 = FF_THREAD_SLICE|FF_THREAD_FRAME }, 0, INT_MAX, V|A|E|D, "thread_type"},
 {"slice", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_THREAD_SLICE }, INT_MIN, INT_MAX, V|E|D, "thread_type"},
 {"frame", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_THREAD_FRAME }, INT_MIN, INT_MAX, V|E|D, "thread_type"},
 {"audio_service_type", "audio service type", OFFSET(audio_service_type), AV_OPT_TYPE_INT, {.i64 = AV_AUDIO_SERVICE_TYPE_MAIN }, 0, AV_AUDIO_SERVICE_TYPE_NB-1, A|E, "audio_service_type"},
@@ -491,21 +514,37 @@ static const AVOption avcodec_options[] = {
 {"em", "Emergency",          0, AV_OPT_TYPE_CONST, {.i64 = AV_AUDIO_SERVICE_TYPE_EMERGENCY },         INT_MIN, INT_MAX, A|E, "audio_service_type"},
 {"vo", "Voice Over",         0, AV_OPT_TYPE_CONST, {.i64 = AV_AUDIO_SERVICE_TYPE_VOICE_OVER },        INT_MIN, INT_MAX, A|E, "audio_service_type"},
 {"ka", "Karaoke",            0, AV_OPT_TYPE_CONST, {.i64 = AV_AUDIO_SERVICE_TYPE_KARAOKE },           INT_MIN, INT_MAX, A|E, "audio_service_type"},
-{"request_sample_fmt", NULL, OFFSET(request_sample_fmt), AV_OPT_TYPE_INT, {.i64 = AV_SAMPLE_FMT_NONE }, AV_SAMPLE_FMT_NONE, AV_SAMPLE_FMT_NB-1, A|D, "request_sample_fmt"},
-{"u8" , "8-bit unsigned integer", 0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_U8  }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"s16", "16-bit signed integer",  0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_S16 }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"s32", "32-bit signed integer",  0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_S32 }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"flt", "32-bit float",           0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_FLT }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"dbl", "64-bit double",          0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_DBL }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"u8p" , "8-bit unsigned integer planar", 0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_U8P  }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"s16p", "16-bit signed integer planar",  0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_S16P }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"s32p", "32-bit signed integer planar",  0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_S32P }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"fltp", "32-bit float planar",           0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_FLTP }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"dblp", "64-bit double planar",          0, AV_OPT_TYPE_CONST, {.i64 = AV_SAMPLE_FMT_DBLP }, INT_MIN, INT_MAX, A|D, "request_sample_fmt"},
-{"refcounted_frames", NULL, OFFSET(refcounted_frames), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 1, A|V|D },
+{"request_sample_fmt", "sample format audio decoders should prefer", OFFSET(request_sample_fmt), AV_OPT_TYPE_SAMPLE_FMT, {.i64=AV_SAMPLE_FMT_NONE}, -1, INT_MAX, A|D, "request_sample_fmt"},
+{"pkt_timebase", NULL, OFFSET(pkt_timebase), AV_OPT_TYPE_RATIONAL, {.dbl = 0 }, 0, INT_MAX, 0},
+{"sub_charenc", "set input text subtitles character encoding", OFFSET(sub_charenc), AV_OPT_TYPE_STRING, {.str = NULL}, CHAR_MIN, CHAR_MAX, S|D},
+{"sub_charenc_mode", "set input text subtitles character encoding mode", OFFSET(sub_charenc_mode), AV_OPT_TYPE_FLAGS, {.i64 = FF_SUB_CHARENC_MODE_AUTOMATIC}, -1, INT_MAX, S|D, "sub_charenc_mode"},
+{"do_nothing",  NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_SUB_CHARENC_MODE_DO_NOTHING},  INT_MIN, INT_MAX, S|D, "sub_charenc_mode"},
+{"auto",        NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_SUB_CHARENC_MODE_AUTOMATIC},   INT_MIN, INT_MAX, S|D, "sub_charenc_mode"},
+{"pre_decoder", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_SUB_CHARENC_MODE_PRE_DECODER}, INT_MIN, INT_MAX, S|D, "sub_charenc_mode"},
+#if FF_API_ASS_TIMING
+{"sub_text_format", "set decoded text subtitle format", OFFSET(sub_text_format), AV_OPT_TYPE_INT, {.i64 = FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS}, 0, 1, S|D, "sub_text_format"},
+#else
+{"sub_text_format", "set decoded text subtitle format", OFFSET(sub_text_format), AV_OPT_TYPE_INT, {.i64 = FF_SUB_TEXT_FMT_ASS}, 0, 1, S|D, "sub_text_format"},
+#endif
+{"ass",              NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_SUB_TEXT_FMT_ASS},              INT_MIN, INT_MAX, S|D, "sub_text_format"},
+#if FF_API_ASS_TIMING
+{"ass_with_timings", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS}, INT_MIN, INT_MAX, S|D, "sub_text_format"},
+#endif
+{"refcounted_frames", NULL, OFFSET(refcounted_frames), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, A|V|D },
 #if FF_API_SIDEDATA_ONLY_PKT
-{"side_data_only_packets", NULL, OFFSET(side_data_only_packets), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, A|V|E },
-#endif
+{"side_data_only_packets", NULL, OFFSET(side_data_only_packets), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, A|V|E },
+#endif
+{"skip_alpha", "Skip processing alpha", OFFSET(skip_alpha), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, V|D },
+{"field_order", "Field order", OFFSET(field_order), AV_OPT_TYPE_INT, {.i64 = AV_FIELD_UNKNOWN }, 0, 5, V|D|E, "field_order" },
+{"progressive", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AV_FIELD_PROGRESSIVE }, 0, 0, V|D|E, "field_order" },
+{"tt", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AV_FIELD_TT }, 0, 0, V|D|E, "field_order" },
+{"bb", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AV_FIELD_BB }, 0, 0, V|D|E, "field_order" },
+{"tb", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AV_FIELD_TB }, 0, 0, V|D|E, "field_order" },
+{"bt", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AV_FIELD_BT }, 0, 0, V|D|E, "field_order" },
+{"dump_separator", "set information dump field separator", OFFSET(dump_separator), AV_OPT_TYPE_STRING, {.str = NULL}, CHAR_MIN, CHAR_MAX, A|V|S|D|E},
+{"codec_whitelist", "List of decoders that are allowed to be used", OFFSET(codec_whitelist), AV_OPT_TYPE_STRING, { .str = NULL },  CHAR_MIN, CHAR_MAX, A|V|S|D },
+{"pixel_format", "set pixel format", OFFSET(pix_fmt), AV_OPT_TYPE_PIXEL_FMT, {.i64=AV_PIX_FMT_NONE}, -1, INT_MAX, 0 },
+{"video_size", "set video size", OFFSET(width), AV_OPT_TYPE_IMAGE_SIZE, {.str=NULL}, 0, INT_MAX, 0 },
 {NULL},
 };
 
diff --git a/libavcodec/opus.c b/libavcodec/opus.c
index 8e896dd..703d2e8 100644
--- a/libavcodec/opus.c
+++ b/libavcodec/opus.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2012 Andrew D'Addesio
  * Copyright (c) 2013-2014 Mozilla Corporation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
 #include <stdint.h>
 
 #include "libavutil/error.h"
+#include "libavutil/ffmath.h"
 
 #include "opus.h"
 #include "vorbis.h"
@@ -333,7 +334,7 @@ av_cold int ff_opus_parse_extradata(AVCodecContext *avctx,
 
     s->gain_i = AV_RL16(extradata + 16);
     if (s->gain_i)
-        s->gain = pow(10, s->gain_i / (20.0 * 256));
+        s->gain = ff_exp10(s->gain_i / (20.0 * 256));
 
     map_type = extradata[18];
     if (!map_type) {
diff --git a/libavcodec/opus.h b/libavcodec/opus.h
index 55c91fa..3a7ea9f 100644
--- a/libavcodec/opus.h
+++ b/libavcodec/opus.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2012 Andrew D'Addesio
  * Copyright (c) 2013-2014 Mozilla Corporation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,7 +29,7 @@
 #include "libavutil/float_dsp.h"
 #include "libavutil/frame.h"
 
-#include "libavresample/avresample.h"
+#include "libswresample/swresample.h"
 
 #include "avcodec.h"
 #include "get_bits.h"
@@ -57,7 +57,7 @@
 #define SILK_HISTORY                 322
 #define SILK_MAX_LPC                 16
 
-#define ROUND_MULL(a,b,s) (((MUL64(a, b) >> (s - 1)) + 1) >> 1)
+#define ROUND_MULL(a,b,s) (((MUL64(a, b) >> ((s) - 1)) + 1) >> 1)
 #define ROUND_MUL16(a,b)  ((MUL16(a, b) + 16384) >> 15)
 #define opus_ilog(i) (av_log2(i) + !!(i))
 
@@ -104,19 +104,19 @@ typedef struct SilkContext SilkContext;
 typedef struct CeltContext CeltContext;
 
 typedef struct OpusPacket {
-    int packet_size;                /** packet size */
-    int data_size;                  /** size of the useful data -- packet size - padding */
-    int code;                       /** packet code: specifies the frame layout */
-    int stereo;                     /** whether this packet is mono or stereo */
-    int vbr;                        /** vbr flag */
-    int config;                     /** configuration: tells the audio mode,
+    int packet_size;                /**< packet size */
+    int data_size;                  /**< size of the useful data -- packet size - padding */
+    int code;                       /**< packet code: specifies the frame layout */
+    int stereo;                     /**< whether this packet is mono or stereo */
+    int vbr;                        /**< vbr flag */
+    int config;                     /**< configuration: tells the audio mode,
                                      **                bandwidth, and frame duration */
-    int frame_count;                /** frame count */
-    int frame_offset[MAX_FRAMES];   /** frame offsets */
-    int frame_size[MAX_FRAMES];     /** frame sizes */
-    int frame_duration;             /** frame duration, in samples @ 48kHz */
-    enum OpusMode mode;             /** mode */
-    enum OpusBandwidth bandwidth;   /** bandwidth */
+    int frame_count;                /**< frame count */
+    int frame_offset[MAX_FRAMES];   /**< frame offsets */
+    int frame_size[MAX_FRAMES];     /**< frame sizes */
+    int frame_duration;             /**< frame duration, in samples @ 48kHz */
+    enum OpusMode mode;             /**< mode */
+    enum OpusBandwidth bandwidth;   /**< bandwidth */
 } OpusPacket;
 
 typedef struct OpusStreamContext {
@@ -144,7 +144,7 @@ typedef struct OpusStreamContext {
     float *out_dummy;
     int    out_dummy_allocated_size;
 
-    AVAudioResampleContext *avr;
+    SwrContext *swr;
     AVAudioFifo *celt_delay;
     int silk_samplerate;
     /* number of samples we still want to get from the resampler */
@@ -186,7 +186,7 @@ typedef struct OpusContext {
     int             nb_streams;
     int      nb_stereo_streams;
 
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     int16_t gain_i;
     float   gain;
 
@@ -289,7 +289,7 @@ static av_always_inline unsigned int opus_getrawbits(OpusRangeCoder *rc, unsigne
         rc->rb.bytes--;
     }
 
-    value = rc->rb.cacheval & ((1<<count)-1);
+    value = av_mod_uintp2(rc->rb.cacheval, count);
     rc->rb.cacheval    >>= count;
     rc->rb.cachelen     -= count;
     rc->total_read_bits += count;
diff --git a/libavcodec/opus_celt.c b/libavcodec/opus_celt.c
index 07a4f77..61a9dc6 100644
--- a/libavcodec/opus_celt.c
+++ b/libavcodec/opus_celt.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2012 Andrew D'Addesio
  * Copyright (c) 2013-2014 Mozilla Corporation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
 #include <stdint.h>
 
 #include "libavutil/float_dsp.h"
+#include "libavutil/libm.h"
 
 #include "imdct15.h"
 #include "opus.h"
@@ -62,7 +63,7 @@ struct CeltContext {
     // constant values that do not change during context lifetime
     AVCodecContext    *avctx;
     IMDCT15Context    *imdct[4];
-    AVFloatDSPContext  dsp;
+    AVFloatDSPContext  *dsp;
     int output_channels;
 
     // values that have inter-frame effect and must be reset on flush
@@ -991,7 +992,7 @@ static inline int celt_pulses2bits(const uint8_t *cache, int pulses)
    return (pulses == 0) ? 0 : cache[pulses] + 1;
 }
 
-static inline void celt_normalize_residual(const int * restrict iy, float * restrict X,
+static inline void celt_normalize_residual(const int * av_restrict iy, float * av_restrict X,
                                            int N, float g)
 {
     int i;
@@ -1295,7 +1296,7 @@ static inline float celt_decode_pulses(OpusRangeCoder *rc, int *y, unsigned int
 {
     unsigned int idx;
 #define CELT_PVQ_U(n, k) (celt_pvq_u_row[FFMIN(n, k)][FFMAX(n, k)])
-#define CELT_PVQ_V(n, k) (CELT_PVQ_U(n, k) + CELT_PVQ_U(n, k + 1))
+#define CELT_PVQ_V(n, k) (CELT_PVQ_U(n, k) + CELT_PVQ_U(n, (k) + 1))
     idx = opus_rc_unimodel(rc, CELT_PVQ_V(N, K));
     return celt_cwrsi(N, K, idx, y);
 }
@@ -1454,7 +1455,7 @@ static unsigned int celt_decode_band(CeltContext *s, OpusRangeCoder *rc,
         if (itheta == 0) {
             imid = 32767;
             iside = 0;
-            fill &= (1 << blocks) - 1;
+            fill = av_mod_uintp2(fill, blocks);
             delta = -16384;
         } else if (itheta == 16384) {
             imid = 0;
@@ -1666,7 +1667,7 @@ static unsigned int celt_decode_band(CeltContext *s, OpusRangeCoder *rc,
             for (j = 0; j < N0; j++)
                 lowband_out[j] = n * X[j];
         }
-        cm &= (1 << blocks) - 1;
+        cm = av_mod_uintp2(cm, blocks);
     }
     return cm;
 }
@@ -1677,7 +1678,7 @@ static void celt_denormalize(CeltContext *s, CeltFrame *frame, float *data)
 
     for (i = s->startband; i < s->endband; i++) {
         float *dst = data + (celt_freq_bands[i] << s->duration);
-        float norm = pow(2, frame->energy[i] + celt_mean_energy[i]);
+        float norm = exp2(frame->energy[i] + celt_mean_energy[i]);
 
         for (j = 0; j < celt_freq_range[i] << s->duration; j++)
             dst[j] *= norm;
@@ -1839,7 +1840,7 @@ static void process_anticollapse(CeltContext *s, CeltFrame *frame, float *X)
 
         /* depth in 1/8 bits */
         depth = (1 + s->pulses[i]) / (celt_freq_range[i] << s->duration);
-        thresh = pow(2, -1.0 - 0.125f * depth);
+        thresh = exp2f(-1.0 - 0.125f * depth);
         sqrt_1 = 1.0f / sqrtf(celt_freq_range[i] << s->duration);
 
         xptr = X + (celt_freq_bands[i] << s->duration);
@@ -1857,7 +1858,7 @@ static void process_anticollapse(CeltContext *s, CeltFrame *frame, float *X)
 
         /* r needs to be multiplied by 2 or 2*sqrt(2) depending on LM because
         short blocks don't have the same energy as long */
-        r = pow(2, 1 - Ediff);
+        r = exp2(1 - Ediff);
         if (s->duration == 3)
             r *= M_SQRT2;
         r = FFMIN(thresh, r) * sqrt_1;
@@ -2072,7 +2073,7 @@ int ff_celt_decode_frame(CeltContext *s, OpusRangeCoder *rc,
 
     /* stereo -> mono downmix */
     if (s->output_channels < s->coded_channels) {
-        s->dsp.vector_fmac_scalar(s->coeffs[0], s->coeffs[1], 1.0, FFALIGN(frame_size, 16));
+        s->dsp->vector_fmac_scalar(s->coeffs[0], s->coeffs[1], 1.0, FFALIGN(frame_size, 16));
         imdct_scale = 0.5;
     } else if (s->output_channels > s->coded_channels)
         memcpy(s->coeffs[1], s->coeffs[0], frame_size * sizeof(float));
@@ -2098,7 +2099,7 @@ int ff_celt_decode_frame(CeltContext *s, OpusRangeCoder *rc,
 
             imdct->imdct_half(imdct, dst + CELT_OVERLAP / 2, s->coeffs[i] + j,
                               s->blocks, imdct_scale);
-            s->dsp.vector_fmul_window(dst, dst, dst + CELT_OVERLAP / 2,
+            s->dsp->vector_fmul_window(dst, dst, dst + CELT_OVERLAP / 2,
                                       celt_window, CELT_OVERLAP / 2);
         }
 
@@ -2181,6 +2182,7 @@ void ff_celt_free(CeltContext **ps)
     for (i = 0; i < FF_ARRAY_ELEMS(s->imdct); i++)
         ff_imdct15_uninit(&s->imdct[i]);
 
+    av_freep(&s->dsp);
     av_freep(ps);
 }
 
@@ -2208,7 +2210,11 @@ int ff_celt_init(AVCodecContext *avctx, CeltContext **ps, int output_channels)
             goto fail;
     }
 
-    avpriv_float_dsp_init(&s->dsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    s->dsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->dsp) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
 
     ff_celt_flush(s);
 
diff --git a/libavcodec/opus_parser.c b/libavcodec/opus_parser.c
index d256fbb..c30fd7b 100644
--- a/libavcodec/opus_parser.c
+++ b/libavcodec/opus_parser.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2013-2014 Mozilla Corporation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/opus_silk.c b/libavcodec/opus_silk.c
index 583801d..73526f9 100644
--- a/libavcodec/opus_silk.c
+++ b/libavcodec/opus_silk.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2012 Andrew D'Addesio
  * Copyright (c) 2013-2014 Mozilla Corporation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/opusdec.c b/libavcodec/opusdec.c
index 92e651c..95a2435 100644
--- a/libavcodec/opusdec.c
+++ b/libavcodec/opusdec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2012 Andrew D'Addesio
  * Copyright (c) 2013-2014 Mozilla Corporation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,11 +40,9 @@
 #include "libavutil/channel_layout.h"
 #include "libavutil/opt.h"
 
-#include "libavresample/avresample.h"
+#include "libswresample/swresample.h"
 
 #include "avcodec.h"
-#include "celp_filters.h"
-#include "fft.h"
 #include "get_bits.h"
 #include "internal.h"
 #include "mathops.h"
@@ -114,9 +112,9 @@ static int opus_flush_resample(OpusStreamContext *s, int nb_samples)
 {
     int celt_size = av_audio_fifo_size(s->celt_delay);
     int ret, i;
-
-    ret = avresample_convert(s->avr, (uint8_t**)s->out, s->out_size, nb_samples,
-                             NULL, 0, 0);
+    ret = swr_convert(s->swr,
+                      (uint8_t**)s->out, nb_samples,
+                      NULL, 0);
     if (ret < 0)
         return ret;
     else if (ret != nb_samples) {
@@ -155,19 +153,20 @@ static int opus_flush_resample(OpusStreamContext *s, int nb_samples)
 
 static int opus_init_resample(OpusStreamContext *s)
 {
-    float delay[16] = { 0.0 };
-    uint8_t *delayptr[2] = { (uint8_t*)delay, (uint8_t*)delay };
+    static const float delay[16] = { 0.0 };
+    const uint8_t *delayptr[2] = { (uint8_t*)delay, (uint8_t*)delay };
     int ret;
 
-    av_opt_set_int(s->avr, "in_sample_rate", s->silk_samplerate, 0);
-    ret = avresample_open(s->avr);
+    av_opt_set_int(s->swr, "in_sample_rate", s->silk_samplerate, 0);
+    ret = swr_init(s->swr);
     if (ret < 0) {
         av_log(s->avctx, AV_LOG_ERROR, "Error opening the resampler.\n");
         return ret;
     }
 
-    ret = avresample_convert(s->avr, NULL, 0, 0, delayptr, sizeof(delay),
-                             silk_resample_delay[s->packet.bandwidth]);
+    ret = swr_convert(s->swr,
+                      NULL, 0,
+                      delayptr, silk_resample_delay[s->packet.bandwidth]);
     if (ret < 0) {
         av_log(s->avctx, AV_LOG_ERROR,
                "Error feeding initial silence to the resampler.\n");
@@ -218,7 +217,7 @@ static int opus_decode_frame(OpusStreamContext *s, const uint8_t *data, int size
 
     /* decode the silk frame */
     if (s->packet.mode == OPUS_MODE_SILK || s->packet.mode == OPUS_MODE_HYBRID) {
-        if (!avresample_is_open(s->avr)) {
+        if (!swr_is_initialized(s->swr)) {
             ret = opus_init_resample(s);
             if (ret < 0)
                 return ret;
@@ -232,16 +231,14 @@ static int opus_decode_frame(OpusStreamContext *s, const uint8_t *data, int size
             av_log(s->avctx, AV_LOG_ERROR, "Error decoding a SILK frame.\n");
             return samples;
         }
-
-        samples = avresample_convert(s->avr, (uint8_t**)s->out, s->out_size,
-                                     s->packet.frame_duration,
-                                     (uint8_t**)s->silk_output,
-                                     sizeof(s->silk_buf[0]),
-                                     samples);
+        samples = swr_convert(s->swr,
+                              (uint8_t**)s->out, s->packet.frame_duration,
+                              (const uint8_t**)s->silk_output, samples);
         if (samples < 0) {
             av_log(s->avctx, AV_LOG_ERROR, "Error resampling SILK data.\n");
             return samples;
         }
+        av_assert2((samples & 7) == 0);
         s->delayed_samples += s->packet.frame_duration - samples;
     } else
         ff_silk_flush(s->silk);
@@ -379,10 +376,10 @@ static int opus_decode_subpacket(OpusStreamContext *s,
     s->out_size = out_size;
 
     /* check if we need to flush the resampler */
-    if (avresample_is_open(s->avr)) {
+    if (swr_is_initialized(s->swr)) {
         if (buf) {
             int64_t cur_samplerate;
-            av_opt_get_int(s->avr, "in_sample_rate", 0, &cur_samplerate);
+            av_opt_get_int(s->swr, "in_sample_rate", 0, &cur_samplerate);
             flush_needed = (s->packet.mode == OPUS_MODE_CELT) || (cur_samplerate != s->silk_samplerate);
         } else {
             flush_needed = !!s->delayed_samples;
@@ -411,7 +408,7 @@ static int opus_decode_subpacket(OpusStreamContext *s,
             av_log(s->avctx, AV_LOG_ERROR, "Error flushing the resampler.\n");
             return ret;
         }
-        avresample_close(s->avr);
+        swr_close(s->swr);
         output_samples += s->delayed_samples;
         s->delayed_samples = 0;
 
@@ -461,8 +458,11 @@ static int opus_decode_packet(AVCodecContext *avctx, void *data,
 
     /* calculate the number of delayed samples */
     for (i = 0; i < c->nb_streams; i++) {
+        OpusStreamContext *s = &c->streams[i];
+        s->out[0] =
+        s->out[1] = NULL;
         delayed_samples = FFMAX(delayed_samples,
-                                c->streams[i].delayed_samples + av_audio_fifo_size(c->sync_buffers[i]));
+                                s->delayed_samples + av_audio_fifo_size(c->sync_buffers[i]));
     }
 
     /* decode the header of the first sub-packet to find out the sample count */
@@ -487,10 +487,8 @@ static int opus_decode_packet(AVCodecContext *avctx, void *data,
 
     /* setup the data buffers */
     ret = ff_get_buffer(avctx, frame, 0);
-    if (ret < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if (ret < 0)
         return ret;
-    }
     frame->nb_samples = 0;
 
     memset(c->out, 0, c->nb_streams * 2 * sizeof(*c->out));
@@ -588,7 +586,7 @@ static int opus_decode_packet(AVCodecContext *avctx, void *data,
         }
 
         if (c->gain_i && decoded_samples > 0) {
-            c->fdsp.vector_fmul_scalar((float*)frame->extended_data[i],
+            c->fdsp->vector_fmul_scalar((float*)frame->extended_data[i],
                                        (float*)frame->extended_data[i],
                                        c->gain, FFALIGN(decoded_samples, 8));
         }
@@ -613,7 +611,7 @@ static av_cold void opus_decode_flush(AVCodecContext *ctx)
 
         if (s->celt_delay)
             av_audio_fifo_drain(s->celt_delay, av_audio_fifo_size(s->celt_delay));
-        avresample_close(s->avr);
+        swr_close(s->swr);
 
         av_audio_fifo_drain(c->sync_buffers[i], av_audio_fifo_size(c->sync_buffers[i]));
 
@@ -637,7 +635,7 @@ static av_cold int opus_decode_close(AVCodecContext *avctx)
         s->out_dummy_allocated_size = 0;
 
         av_audio_fifo_free(s->celt_delay);
-        avresample_free(&s->avr);
+        swr_free(&s->swr);
     }
 
     av_freep(&c->streams);
@@ -654,6 +652,7 @@ static av_cold int opus_decode_close(AVCodecContext *avctx)
     c->nb_streams = 0;
 
     av_freep(&c->channel_maps);
+    av_freep(&c->fdsp);
 
     return 0;
 }
@@ -666,12 +665,17 @@ static av_cold int opus_decode_init(AVCodecContext *avctx)
     avctx->sample_fmt  = AV_SAMPLE_FMT_FLTP;
     avctx->sample_rate = 48000;
 
-    avpriv_float_dsp_init(&c->fdsp, 0);
+    c->fdsp = avpriv_float_dsp_alloc(0);
+    if (!c->fdsp)
+        return AVERROR(ENOMEM);
 
     /* find out the channel configuration */
     ret = ff_opus_parse_extradata(avctx, c);
-    if (ret < 0)
+    if (ret < 0) {
+        av_freep(&c->channel_maps);
+        av_freep(&c->fdsp);
         return ret;
+    }
 
     /* allocate and init each independent decoder */
     c->streams = av_mallocz_array(c->nb_streams, sizeof(*c->streams));
@@ -699,18 +703,19 @@ static av_cold int opus_decode_init(AVCodecContext *avctx)
             s->redundancy_output[j] = s->redundancy_buf[j];
         }
 
-        s->fdsp = &c->fdsp;
+        s->fdsp = c->fdsp;
 
-        s->avr = avresample_alloc_context();
-        if (!s->avr)
+        s->swr =swr_alloc();
+        if (!s->swr)
             goto fail;
 
         layout = (s->output_channels == 1) ? AV_CH_LAYOUT_MONO : AV_CH_LAYOUT_STEREO;
-        av_opt_set_int(s->avr, "in_sample_fmt",      avctx->sample_fmt,  0);
-        av_opt_set_int(s->avr, "out_sample_fmt",     avctx->sample_fmt,  0);
-        av_opt_set_int(s->avr, "in_channel_layout",  layout,             0);
-        av_opt_set_int(s->avr, "out_channel_layout", layout,             0);
-        av_opt_set_int(s->avr, "out_sample_rate",    avctx->sample_rate, 0);
+        av_opt_set_int(s->swr, "in_sample_fmt",      avctx->sample_fmt,  0);
+        av_opt_set_int(s->swr, "out_sample_fmt",     avctx->sample_fmt,  0);
+        av_opt_set_int(s->swr, "in_channel_layout",  layout,             0);
+        av_opt_set_int(s->swr, "out_channel_layout", layout,             0);
+        av_opt_set_int(s->swr, "out_sample_rate",    avctx->sample_rate, 0);
+        av_opt_set_int(s->swr, "filter_size",        16,                 0);
 
         ret = ff_silk_init(avctx, &s->silk, s->output_channels);
         if (ret < 0)
diff --git a/libavcodec/paf.h b/libavcodec/paf.h
new file mode 100644
index 0000000..ce8245f
--- /dev/null
+++ b/libavcodec/paf.h
@@ -0,0 +1,28 @@
+/*
+ * Packed Animation File decoder/demuxer common code
+ * Copyright (c) 2012 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_PAF_H
+#define AVCODEC_PAF_H
+
+#define PAF_SOUND_SAMPLES     2205 /* NOTE(review): presumably samples per audio frame - confirm */
+#define PAF_SOUND_FRAME_SIZE  ((256 + PAF_SOUND_SAMPLES) * 2) /* evaluates to 4922 */
+
+#endif /* AVCODEC_PAF_H */
diff --git a/libavcodec/pafaudio.c b/libavcodec/pafaudio.c
index c83e7f5..12f473a 100644
--- a/libavcodec/pafaudio.c
+++ b/libavcodec/pafaudio.c
@@ -2,20 +2,20 @@
  * Packed Animation File audio decoder
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,9 +24,7 @@
 #include "avcodec.h"
 #include "internal.h"
 #include "mathops.h"
-
-#define PAF_SOUND_SAMPLES     2205
-#define PAF_SOUND_FRAME_SIZE  ((256 + PAF_SOUND_SAMPLES) * 2)
+#include "paf.h"
 
 static av_cold int paf_audio_init(AVCodecContext *avctx)
 {
diff --git a/libavcodec/pafvideo.c b/libavcodec/pafvideo.c
index b77f47e..cab3129 100644
--- a/libavcodec/pafvideo.c
+++ b/libavcodec/pafvideo.c
@@ -2,20 +2,20 @@
  * Packed Animation File video decoder
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,15 +26,24 @@
 #include "copy_block.h"
 #include "internal.h"
 
+
 static const uint8_t block_sequences[16][8] = {
-    { 0, 0, 0, 0, 0, 0, 0, 0 }, { 2, 0, 0, 0, 0, 0, 0, 0 },
-    { 5, 7, 0, 0, 0, 0, 0, 0 }, { 5, 0, 0, 0, 0, 0, 0, 0 },
-    { 6, 0, 0, 0, 0, 0, 0, 0 }, { 5, 7, 5, 7, 0, 0, 0, 0 },
-    { 5, 7, 5, 0, 0, 0, 0, 0 }, { 5, 7, 6, 0, 0, 0, 0, 0 },
-    { 5, 5, 0, 0, 0, 0, 0, 0 }, { 3, 0, 0, 0, 0, 0, 0, 0 },
-    { 6, 6, 0, 0, 0, 0, 0, 0 }, { 2, 4, 0, 0, 0, 0, 0, 0 },
-    { 2, 4, 5, 7, 0, 0, 0, 0 }, { 2, 4, 5, 0, 0, 0, 0, 0 },
-    { 2, 4, 6, 0, 0, 0, 0, 0 }, { 2, 4, 5, 7, 5, 7, 0, 0 },
+    { 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 2, 0, 0, 0, 0, 0, 0, 0 },
+    { 5, 7, 0, 0, 0, 0, 0, 0 },
+    { 5, 0, 0, 0, 0, 0, 0, 0 },
+    { 6, 0, 0, 0, 0, 0, 0, 0 },
+    { 5, 7, 5, 7, 0, 0, 0, 0 },
+    { 5, 7, 5, 0, 0, 0, 0, 0 },
+    { 5, 7, 6, 0, 0, 0, 0, 0 },
+    { 5, 5, 0, 0, 0, 0, 0, 0 },
+    { 3, 0, 0, 0, 0, 0, 0, 0 },
+    { 6, 6, 0, 0, 0, 0, 0, 0 },
+    { 2, 4, 0, 0, 0, 0, 0, 0 },
+    { 2, 4, 5, 7, 0, 0, 0, 0 },
+    { 2, 4, 5, 0, 0, 0, 0, 0 },
+    { 2, 4, 6, 0, 0, 0, 0, 0 },
+    { 2, 4, 5, 7, 5, 7, 0, 0 },
 };
 
 typedef struct PAFVideoDecContext {
@@ -156,9 +165,11 @@ static int decode_0(PAFVideoDecContext *c, uint8_t *pkt, uint8_t code)
     i = bytestream2_get_byte(&c->gb);
     if (i) {
         if (code & 0x10) {
-            int pos = bytestream2_tell(&c->gb) & 3;
-            if (pos)
-                bytestream2_skip(&c->gb, 4 - pos);
+            int align;
+
+            align = bytestream2_tell(&c->gb) & 3;
+            if (align)
+                bytestream2_skip(&c->gb, 4 - align);
         }
         do {
             int page, val, x, y;
diff --git a/libavcodec/pamenc.c b/libavcodec/pamenc.c
index 2b63af9..50c9fcb 100644
--- a/libavcodec/pamenc.c
+++ b/libavcodec/pamenc.c
@@ -2,54 +2,39 @@
  * PAM image format
  * Copyright (c) 2002, 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "libavutil/imgutils.h"
-
 #include "avcodec.h"
-#include "bytestream.h"
 #include "internal.h"
 
 static int pam_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
-                            const AVFrame *pict, int *got_packet)
+                            const AVFrame *p, int *got_packet)
 {
     uint8_t *bytestream_start, *bytestream, *bytestream_end;
-    const AVFrame * const p = pict;
     int i, h, w, n, linesize, depth, maxval, ret;
     const char *tuple_type;
     uint8_t *ptr;
-    int size = av_image_get_buffer_size(avctx->pix_fmt,
-                                        avctx->width, avctx->height, 1);
-
-    if ((ret = ff_alloc_packet(pkt, size + 200)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "encoded frame too large\n");
-        return ret;
-    }
-
-    bytestream_start =
-    bytestream       = pkt->data;
-    bytestream_end   = pkt->data + pkt->size;
 
     h = avctx->height;
     w = avctx->width;
     switch (avctx->pix_fmt) {
-    case AV_PIX_FMT_MONOWHITE:
-        n          = (w + 7) >> 3;
+    case AV_PIX_FMT_MONOBLACK:
+        n          = w;
         depth      = 1;
         maxval     = 1;
         tuple_type = "BLACKANDWHITE";
@@ -60,21 +45,59 @@ static int pam_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         maxval     = 255;
         tuple_type = "GRAYSCALE";
         break;
+    case AV_PIX_FMT_GRAY16BE:
+        n          = w * 2;
+        depth      = 1;
+        maxval     = 0xFFFF;
+        tuple_type = "GRAYSCALE";
+        break;
+    case AV_PIX_FMT_GRAY8A:
+        n          = w * 2;
+        depth      = 2;
+        maxval     = 255;
+        tuple_type = "GRAYSCALE_ALPHA";
+        break;
+    case AV_PIX_FMT_YA16BE:
+        n          = w * 4;
+        depth      = 2;
+        maxval     = 0xFFFF;
+        tuple_type = "GRAYSCALE_ALPHA";
+        break;
     case AV_PIX_FMT_RGB24:
         n          = w * 3;
         depth      = 3;
         maxval     = 255;
         tuple_type = "RGB";
         break;
-    case AV_PIX_FMT_RGB32:
+    case AV_PIX_FMT_RGBA:
         n          = w * 4;
         depth      = 4;
         maxval     = 255;
         tuple_type = "RGB_ALPHA";
         break;
+    case AV_PIX_FMT_RGB48BE:
+        n          = w * 6;
+        depth      = 3;
+        maxval     = 0xFFFF;
+        tuple_type = "RGB";
+        break;
+    case AV_PIX_FMT_RGBA64BE:
+        n          = w * 8;
+        depth      = 4;
+        maxval     = 0xFFFF;
+        tuple_type = "RGB_ALPHA";
+        break;
     default:
         return -1;
     }
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, n*h + 200, 0)) < 0)
+        return ret;
+
+    bytestream_start =
+    bytestream       = pkt->data;
+    bytestream_end   = pkt->data + pkt->size;
+
     snprintf(bytestream, bytestream_end - bytestream,
              "P7\nWIDTH %d\nHEIGHT %d\nDEPTH %d\nMAXVAL %d\nTUPLTYPE %s\nENDHDR\n",
              w, h, depth, maxval, tuple_type);
@@ -83,16 +106,11 @@ static int pam_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     ptr      = p->data[0];
     linesize = p->linesize[0];
 
-    if (avctx->pix_fmt == AV_PIX_FMT_RGB32) {
+    if (avctx->pix_fmt == AV_PIX_FMT_MONOBLACK){
         int j;
-        unsigned int v;
-
         for (i = 0; i < h; i++) {
-            for (j = 0; j < w; j++) {
-                v = ((uint32_t *)ptr)[j];
-                bytestream_put_be24(&bytestream, v);
-                *bytestream++ = v >> 24;
-            }
+            for (j = 0; j < w; j++) /* emit one output byte (0 or 1) per pixel of this row */
+                *bytestream++ = ptr[j >> 3] >> (7 - j & 7) & 1; /* parses as (ptr[j >> 3] >> ((7 - j) & 7)) & 1: pixel j is bit 7 - (j % 8) of byte j / 8 */
             ptr += linesize;
         }
     } else {
@@ -129,7 +147,10 @@ AVCodec ff_pam_encoder = {
     .init           = pam_encode_init,
     .encode2        = pam_encode_frame,
     .pix_fmts       = (const enum AVPixelFormat[]){
-        AV_PIX_FMT_RGB24, AV_PIX_FMT_RGB32, AV_PIX_FMT_GRAY8, AV_PIX_FMT_MONOWHITE,
-        AV_PIX_FMT_NONE
+        AV_PIX_FMT_RGB24, AV_PIX_FMT_RGBA,
+        AV_PIX_FMT_RGB48BE, AV_PIX_FMT_RGBA64BE,
+        AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAY8A,
+        AV_PIX_FMT_GRAY16BE, AV_PIX_FMT_YA16BE,
+        AV_PIX_FMT_MONOBLACK, AV_PIX_FMT_NONE
     },
 };
diff --git a/libavcodec/parser.c b/libavcodec/parser.c
index 355187a..2c8fc69 100644
--- a/libavcodec/parser.c
+++ b/libavcodec/parser.c
@@ -3,26 +3,28 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <stdint.h>
 #include <string.h>
 
+#include "libavutil/avassert.h"
+#include "libavutil/atomic.h"
 #include "libavutil/internal.h"
 #include "libavutil/mem.h"
 
@@ -41,20 +43,21 @@ AVCodecParser *av_parser_next(const AVCodecParser *p)
 
 void av_register_codec_parser(AVCodecParser *parser)
 {
-    parser->next = av_first_parser;
-    av_first_parser = parser;
+    do { /* prepend parser to the global list; loop until the head swap succeeds */
+        parser->next = av_first_parser; /* link to the head as currently observed */
+    } while (parser->next != avpriv_atomic_ptr_cas((void * volatile *)&av_first_parser, parser->next, parser)); /* NOTE(review): relies on the CAS returning the previous head - confirm in libavutil/atomic.h */
 }
 
 AVCodecParserContext *av_parser_init(int codec_id)
 {
-    AVCodecParserContext *s;
+    AVCodecParserContext *s = NULL;
     AVCodecParser *parser;
     int ret;
 
     if (codec_id == AV_CODEC_ID_NONE)
         return NULL;
 
-    for (parser = av_first_parser; parser != NULL; parser = parser->next) {
+    for (parser = av_first_parser; parser; parser = parser->next) {
         if (parser->codec_ids[0] == codec_id ||
             parser->codec_ids[1] == codec_id ||
             parser->codec_ids[2] == codec_id ||
@@ -67,25 +70,18 @@ AVCodecParserContext *av_parser_init(int codec_id)
 found:
     s = av_mallocz(sizeof(AVCodecParserContext));
     if (!s)
-        return NULL;
+        goto err_out;
     s->parser = parser;
-    if (parser->priv_data_size) {
-        s->priv_data = av_mallocz(parser->priv_data_size);
-        if (!s->priv_data) {
-            av_free(s);
-            return NULL;
-        }
-    }
+    s->priv_data = av_mallocz(parser->priv_data_size);
+    if (!s->priv_data)
+        goto err_out;
+    s->fetch_timestamp=1;
+    s->pict_type = AV_PICTURE_TYPE_I;
     if (parser->parser_init) {
         ret = parser->parser_init(s);
-        if (ret != 0) {
-            av_free(s->priv_data);
-            av_free(s);
-            return NULL;
-        }
+        if (ret != 0)
+            goto err_out;
     }
-    s->fetch_timestamp      = 1;
-    s->pict_type            = AV_PICTURE_TYPE_I;
     s->key_frame            = -1;
 #if FF_API_CONVERGENCE_DURATION
 FF_DISABLE_DEPRECATION_WARNINGS
@@ -98,25 +94,37 @@ FF_ENABLE_DEPRECATION_WARNINGS
     s->format               = -1;
 
     return s;
+
+err_out:
+    if (s)
+        av_freep(&s->priv_data);
+    av_free(s);
+    return NULL;
 }
 
-void ff_fetch_timestamp(AVCodecParserContext *s, int off, int remove)
+void ff_fetch_timestamp(AVCodecParserContext *s, int off, int remove, int fuzzy)
 {
     int i;
 
-    s->dts    =
-    s->pts    = AV_NOPTS_VALUE;
-    s->pos    = -1;
-    s->offset = 0;
+    if (!fuzzy) {
+        s->dts    =
+        s->pts    = AV_NOPTS_VALUE;
+        s->pos    = -1;
+        s->offset = 0;
+    }
     for (i = 0; i < AV_PARSER_PTS_NB; i++) {
         if (s->cur_offset + off >= s->cur_frame_offset[i] &&
             (s->frame_offset < s->cur_frame_offset[i] ||
-             (!s->frame_offset && !s->next_frame_offset)) &&
-            s->cur_frame_end[i]) {
-            s->dts    = s->cur_frame_dts[i];
-            s->pts    = s->cur_frame_pts[i];
-            s->pos    = s->cur_frame_pos[i];
-            s->offset = s->next_frame_offset - s->cur_frame_offset[i];
+             (!s->frame_offset && !s->next_frame_offset)) && // first field/frame
+            // check disabled since MPEG-TS does not send complete PES packets
+            /*s->next_frame_offset + off <*/  s->cur_frame_end[i]){
+
+            if (!fuzzy || s->cur_frame_dts[i] != AV_NOPTS_VALUE) {
+                s->dts    = s->cur_frame_dts[i];
+                s->pts    = s->cur_frame_pts[i];
+                s->pos    = s->cur_frame_pos[i];
+                s->offset = s->next_frame_offset - s->cur_frame_offset[i];
+            }
             if (remove)
                 s->cur_frame_offset[i] = INT64_MAX;
             if (s->cur_offset + off < s->cur_frame_end[i])
@@ -133,6 +141,15 @@ int av_parser_parse2(AVCodecParserContext *s, AVCodecContext *avctx,
     int index, i;
     uint8_t dummy_buf[AV_INPUT_BUFFER_PADDING_SIZE];
 
+    av_assert1(avctx->codec_id != AV_CODEC_ID_NONE);
+
+    /* Parsers only work for the specified codec ids. */
+    av_assert1(avctx->codec_id == s->parser->codec_ids[0] ||
+               avctx->codec_id == s->parser->codec_ids[1] ||
+               avctx->codec_id == s->parser->codec_ids[2] ||
+               avctx->codec_id == s->parser->codec_ids[3] ||
+               avctx->codec_id == s->parser->codec_ids[4]);
+
     if (!(s->flags & PARSER_FLAG_FETCHED_OFFSET)) {
         s->next_frame_offset =
         s->cur_offset        = pos;
@@ -159,11 +176,12 @@ int av_parser_parse2(AVCodecParserContext *s, AVCodecContext *avctx,
         s->last_pts        = s->pts;
         s->last_dts        = s->dts;
         s->last_pos        = s->pos;
-        ff_fetch_timestamp(s, 0, 0);
+        ff_fetch_timestamp(s, 0, 0, 0);
     }
     /* WARNING: the returned index can be negative */
     index = s->parser->parser_parse(s, avctx, (const uint8_t **) poutbuf,
                                     poutbuf_size, buf, buf_size);
+    av_assert0(index > -0x20000000); // The API does not allow returning AVERROR codes
     /* update the file pointer */
     if (*poutbuf_size) {
         /* fill the data for the current frame */
@@ -219,7 +237,7 @@ void av_parser_close(AVCodecParserContext *s)
     if (s) {
         if (s->parser->parser_close)
             s->parser->parser_close(s);
-        av_free(s->priv_data);
+        av_freep(&s->priv_data);
         av_free(s);
     }
 }
@@ -250,8 +268,11 @@ int ff_combine_frame(ParseContext *pc, int next,
                                            *buf_size + pc->index +
                                            AV_INPUT_BUFFER_PADDING_SIZE);
 
-        if (!new_buffer)
+        if (!new_buffer) {
+            av_log(NULL, AV_LOG_ERROR, "Failed to reallocate parser buffer to %d\n", *buf_size + pc->index + AV_INPUT_BUFFER_PADDING_SIZE);
+            pc->index = 0;
             return AVERROR(ENOMEM);
+        }
         pc->buffer = new_buffer;
         memcpy(&pc->buffer[pc->index], *buf, *buf_size);
         pc->index += *buf_size;
@@ -266,9 +287,12 @@ int ff_combine_frame(ParseContext *pc, int next,
         void *new_buffer = av_fast_realloc(pc->buffer, &pc->buffer_size,
                                            next + pc->index +
                                            AV_INPUT_BUFFER_PADDING_SIZE);
-
-        if (!new_buffer)
+        if (!new_buffer) {
+            av_log(NULL, AV_LOG_ERROR, "Failed to reallocate parser buffer to %d\n", next + pc->index + AV_INPUT_BUFFER_PADDING_SIZE);
+            pc->overread_index =
+            pc->index = 0;
             return AVERROR(ENOMEM);
+        }
         pc->buffer = new_buffer;
         if (next > -AV_INPUT_BUFFER_PADDING_SIZE)
             memcpy(&pc->buffer[pc->index], *buf,
@@ -303,13 +327,14 @@ void ff_parse_close(AVCodecParserContext *s)
 
 int ff_mpeg4video_split(AVCodecContext *avctx, const uint8_t *buf, int buf_size)
 {
-    int i;
     uint32_t state = -1;
+    const uint8_t *ptr = buf, *end = buf + buf_size; /* scan cursor and one-past-the-end bound */
 
-    for (i = 0; i < buf_size; i++) {
-        state = state << 8 | buf[i];
+    while (ptr < end) { /* keep scanning until the cursor reaches the end of the buffer */
+        ptr = avpriv_find_start_code(ptr, end, &state); /* returns the new scan position; state is passed by address and tested below */
         if (state == 0x1B3 || state == 0x1B6)
-            return i - 3;
+            return ptr - 4 - buf; /* NOTE(review): presumably the offset of the start code's first byte, as the old "i - 3" was - confirm */
     }
+
     return 0;
 }
diff --git a/libavcodec/parser.h b/libavcodec/parser.h
index ea1cae2..ef35547 100644
--- a/libavcodec/parser.h
+++ b/libavcodec/parser.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2003 Fabrice Bellard
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -53,7 +53,8 @@ void ff_parse_close(AVCodecParserContext *s);
  * Fetch timestamps for a specific byte within the current access unit.
  * @param off byte position within the access unit
  * @param remove Found timestamps will be removed if set to 1, kept if set to 0.
+ * @param fuzzy Only use found value if it is more informative than what we already have
  */
-void ff_fetch_timestamp(AVCodecParserContext *s, int off, int remove);
+void ff_fetch_timestamp(AVCodecParserContext *s, int off, int remove, int fuzzy);
 
 #endif /* AVCODEC_PARSER_H */
diff --git a/libavcodec/pcm-bluray.c b/libavcodec/pcm-bluray.c
index 51fcd2d..22c1c08 100644
--- a/libavcodec/pcm-bluray.c
+++ b/libavcodec/pcm-bluray.c
@@ -2,20 +2,20 @@
  * LPCM codecs for PCM format found in Blu-ray PCM streams
  * Copyright (c) 2009, 2013 Christian Schmidt
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -71,13 +71,14 @@ static int pcm_bluray_parse_header(AVCodecContext *avctx,
 
     /* get the sample depth and derive the sample format from it */
     avctx->bits_per_coded_sample = bits_per_samples[header[3] >> 6];
-    if (!avctx->bits_per_coded_sample) {
-        av_log(avctx, AV_LOG_ERROR, "reserved sample depth (0)\n");
+    if (!(avctx->bits_per_coded_sample == 16 || avctx->bits_per_coded_sample == 24)) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported sample depth (%d)\n", avctx->bits_per_coded_sample);
         return AVERROR_INVALIDDATA;
     }
     avctx->sample_fmt = avctx->bits_per_coded_sample == 16 ? AV_SAMPLE_FMT_S16
                                                            : AV_SAMPLE_FMT_S32;
-    avctx->bits_per_raw_sample = avctx->bits_per_coded_sample;
+    if (avctx->sample_fmt == AV_SAMPLE_FMT_S32)
+        avctx->bits_per_raw_sample = avctx->bits_per_coded_sample;
 
     /* get the sample rate. Not all values are used. */
     switch (header[2] & 0x0f) {
@@ -116,9 +117,9 @@ static int pcm_bluray_parse_header(AVCodecContext *avctx,
 
     if (avctx->debug & FF_DEBUG_PICT_INFO)
         ff_dlog(avctx,
-                "pcm_bluray_parse_header: %d channels, %d bits per sample, %d Hz, %d bit/s\n",
+                "pcm_bluray_parse_header: %d channels, %d bits per sample, %d Hz, %"PRId64" bit/s\n",
                 avctx->channels, avctx->bits_per_coded_sample,
-                avctx->sample_rate, avctx->bit_rate);
+                avctx->sample_rate, (int64_t)avctx->bit_rate);
     return 0;
 }
 
@@ -154,10 +155,8 @@ static int pcm_bluray_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = samples;
-    if ((retval = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((retval = ff_get_buffer(avctx, frame, 0)) < 0)
         return retval;
-    }
     dst16 = (int16_t *)frame->data[0];
     dst32 = (int32_t *)frame->data[0];
 
diff --git a/libavcodec/pcm-dvd.c b/libavcodec/pcm-dvd.c
index 62aacf8..04c321e 100644
--- a/libavcodec/pcm-dvd.c
+++ b/libavcodec/pcm-dvd.c
@@ -2,20 +2,20 @@
  * LPCM codecs for PCM formats found in Video DVD streams
  * Copyright (c) 2013 Christian Schmidt
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,6 +31,7 @@
 typedef struct PCMDVDContext {
     uint32_t last_header;    // Cached header to see if parsing is needed
     int block_size;          // Size of a block of samples in bytes
+    int last_block_size;     // Size of the last block of samples in bytes
     int samples_per_block;   // Number of samples per channel per block
     int groups_per_block;    // Number of 20/24-bit sample groups per block
     uint8_t *extra_samples;  // Pointer to leftover samples from a frame
@@ -69,9 +70,10 @@ static int pcm_dvd_parse_header(AVCodecContext *avctx, const uint8_t *header)
     /* early exit if the header didn't change apart from the frame number */
     if (s->last_header == header_int)
         return 0;
+    s->last_header = -1;
 
     if (avctx->debug & FF_DEBUG_PICT_INFO)
-        ff_dlog(avctx, "pcm_dvd_parse_header: header = %02x%02x%02x\n",
+        av_log(avctx, AV_LOG_DEBUG, "pcm_dvd_parse_header: header = %02x%02x%02x\n",
                 header[0], header[1], header[2]);
     /*
      * header[0] emphasis (1), muse(1), reserved(1), frame number(5)
@@ -85,7 +87,9 @@ static int pcm_dvd_parse_header(AVCodecContext *avctx, const uint8_t *header)
     /* get the sample depth and derive the sample format from it */
     avctx->bits_per_coded_sample = 16 + (header[1] >> 6 & 3) * 4;
     if (avctx->bits_per_coded_sample == 28) {
-        av_log(avctx, AV_LOG_ERROR, "PCM DVD unsupported sample depth\n");
+        av_log(avctx, AV_LOG_ERROR,
+               "PCM DVD unsupported sample depth %i\n",
+               avctx->bits_per_coded_sample);
         return AVERROR_INVALIDDATA;
     }
     avctx->sample_fmt = avctx->bits_per_coded_sample == 16 ? AV_SAMPLE_FMT_S16
@@ -136,9 +140,9 @@ static int pcm_dvd_parse_header(AVCodecContext *avctx, const uint8_t *header)
 
     if (avctx->debug & FF_DEBUG_PICT_INFO)
         ff_dlog(avctx,
-                "pcm_dvd_parse_header: %d channels, %d bits per sample, %d Hz, %d bit/s\n",
+                "pcm_dvd_parse_header: %d channels, %d bits per sample, %d Hz, %"PRId64" bit/s\n",
                 avctx->channels, avctx->bits_per_coded_sample,
-                avctx->sample_rate, avctx->bit_rate);
+                avctx->sample_rate, (int64_t)avctx->bit_rate);
 
     s->last_header = header_int;
 
@@ -170,6 +174,17 @@ static void *pcm_dvd_decode_samples(AVCodecContext *avctx, const uint8_t *src,
         return dst16;
     }
     case 20:
+        if (avctx->channels == 1) {
+            do {
+                for (i = 2; i; i--) {
+                    dst32[0] = bytestream2_get_be16u(&gb) << 16;
+                    dst32[1] = bytestream2_get_be16u(&gb) << 16;
+                    t = bytestream2_get_byteu(&gb);
+                    *dst32++ += (t & 0xf0) << 8;
+                    *dst32++ += (t & 0x0f) << 12;
+                }
+            } while (--blocks);
+        } else {
         do {
             for (i = s->groups_per_block; i; i--) {
                 dst32[0] = bytestream2_get_be16u(&gb) << 16;
@@ -184,8 +199,19 @@ static void *pcm_dvd_decode_samples(AVCodecContext *avctx, const uint8_t *src,
                 *dst32++ += (t & 0x0f) << 12;
             }
         } while (--blocks);
+        }
         return dst32;
     case 24:
+        if (avctx->channels == 1) {
+            do {
+                for (i = 2; i; i--) {
+                    dst32[0] = bytestream2_get_be16u(&gb) << 16;
+                    dst32[1] = bytestream2_get_be16u(&gb) << 16;
+                    *dst32++ += bytestream2_get_byteu(&gb) << 8;
+                    *dst32++ += bytestream2_get_byteu(&gb) << 8;
+                }
+            } while (--blocks);
+        } else {
         do {
             for (i = s->groups_per_block; i; i--) {
                 dst32[0] = bytestream2_get_be16u(&gb) << 16;
@@ -198,6 +224,7 @@ static void *pcm_dvd_decode_samples(AVCodecContext *avctx, const uint8_t *src,
                 *dst32++ += bytestream2_get_byteu(&gb) << 8;
             }
         } while (--blocks);
+        }
         return dst32;
     default:
         return NULL;
@@ -222,6 +249,11 @@ static int pcm_dvd_decode_frame(AVCodecContext *avctx, void *data,
 
     if ((retval = pcm_dvd_parse_header(avctx, src)))
         return retval;
+    if (s->last_block_size && s->last_block_size != s->block_size) {
+        av_log(avctx, AV_LOG_WARNING, "block_size has changed %d != %d\n", s->last_block_size, s->block_size);
+        s->extra_sample_count = 0;
+    }
+    s->last_block_size = s->block_size;
     src      += 3;
     buf_size -= 3;
 
@@ -229,10 +261,8 @@ static int pcm_dvd_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = blocks * s->samples_per_block;
-    if ((retval = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((retval = ff_get_buffer(avctx, frame, 0)) < 0)
         return retval;
-    }
     dst = frame->data[0];
 
     /* consume leftover samples from last packet */
diff --git a/libavcodec/pcm.c b/libavcodec/pcm.c
index 959c50b..9a13602 100644
--- a/libavcodec/pcm.c
+++ b/libavcodec/pcm.c
@@ -2,20 +2,20 @@
  * PCM codecs
  * Copyright (c) 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -47,7 +47,7 @@ static av_cold int pcm_encode_init(AVCodecContext *avctx)
 
     avctx->bits_per_coded_sample = av_get_bits_per_sample(avctx->codec->id);
     avctx->block_align           = avctx->channels * avctx->bits_per_coded_sample / 8;
-    avctx->bit_rate              = avctx->block_align * avctx->sample_rate * 8;
+    avctx->bit_rate              = avctx->block_align * 8LL * avctx->sample_rate;
 
     return 0;
 }
@@ -69,13 +69,24 @@ static av_cold int pcm_encode_init(AVCodecContext *avctx)
         bytestream_put_ ## endian(&dst, v);                             \
     }
 
+#define ENCODE_PLANAR(type, endian, dst, n, shift, offset)              \
+    n /= avctx->channels;                                               \
+    for (c = 0; c < avctx->channels; c++) {                             \
+        int i;                                                          \
+        samples_ ## type = (const type *) frame->extended_data[c];      \
+        for (i = n; i > 0; i--) {                                       \
+            register type v = (*samples_ ## type++ >> shift) + offset;  \
+            bytestream_put_ ## endian(&dst, v);                         \
+        }                                                               \
+    }
+
 static int pcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
                             const AVFrame *frame, int *got_packet_ptr)
 {
-    int n, sample_size, v, ret;
+    int n, c, sample_size, v, ret;
     const short *samples;
     unsigned char *dst;
-    const uint8_t *srcu8;
+    const uint8_t *samples_uint8_t;
     const int16_t *samples_int16_t;
     const int32_t *samples_int32_t;
     const int64_t *samples_int64_t;
@@ -86,10 +97,8 @@ static int pcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     n           = frame->nb_samples * avctx->channels;
     samples     = (const short *)frame->data[0];
 
-    if ((ret = ff_alloc_packet(avpkt, n * sample_size))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, n * sample_size, n * sample_size)) < 0)
         return ret;
-    }
     dst = avpkt->data;
 
     switch (avctx->codec->id) {
@@ -102,6 +111,9 @@ static int pcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     case AV_CODEC_ID_PCM_S24LE:
         ENCODE(int32_t, le24, samples, dst, n, 8, 0)
         break;
+    case AV_CODEC_ID_PCM_S24LE_PLANAR:
+        ENCODE_PLANAR(int32_t, le24, dst, n, 8, 0)
+        break;
     case AV_CODEC_ID_PCM_S24BE:
         ENCODE(int32_t, be24, samples, dst, n, 8, 0)
         break;
@@ -127,11 +139,10 @@ static int pcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         ENCODE(uint16_t, be16, samples, dst, n, 0, 0x8000)
         break;
     case AV_CODEC_ID_PCM_S8:
-        srcu8 = frame->data[0];
-        for (; n > 0; n--) {
-            v      = *srcu8++;
-            *dst++ = v - 128;
-        }
+        ENCODE(uint8_t, byte, samples, dst, n, 0, -128)
+        break;
+    case AV_CODEC_ID_PCM_S8_PLANAR:
+        ENCODE_PLANAR(uint8_t, byte, dst, n, 0, -128)
         break;
 #if HAVE_BIGENDIAN
     case AV_CODEC_ID_PCM_F64LE:
@@ -141,9 +152,15 @@ static int pcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     case AV_CODEC_ID_PCM_F32LE:
         ENCODE(int32_t, le32, samples, dst, n, 0, 0)
         break;
+    case AV_CODEC_ID_PCM_S32LE_PLANAR:
+        ENCODE_PLANAR(int32_t, le32, dst, n, 0, 0)
+        break;
     case AV_CODEC_ID_PCM_S16LE:
         ENCODE(int16_t, le16, samples, dst, n, 0, 0)
         break;
+    case AV_CODEC_ID_PCM_S16LE_PLANAR:
+        ENCODE_PLANAR(int16_t, le16, dst, n, 0, 0)
+        break;
     case AV_CODEC_ID_PCM_F64BE:
     case AV_CODEC_ID_PCM_F32BE:
     case AV_CODEC_ID_PCM_S32BE:
@@ -159,6 +176,9 @@ static int pcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     case AV_CODEC_ID_PCM_S16BE:
         ENCODE(int16_t, be16, samples, dst, n, 0, 0)
         break;
+    case AV_CODEC_ID_PCM_S16BE_PLANAR:
+        ENCODE_PLANAR(int16_t, be16, dst, n, 0, 0)
+        break;
     case AV_CODEC_ID_PCM_F64LE:
     case AV_CODEC_ID_PCM_F32LE:
     case AV_CODEC_ID_PCM_S32LE:
@@ -166,7 +186,18 @@ static int pcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
 #endif /* HAVE_BIGENDIAN */
     case AV_CODEC_ID_PCM_U8:
         memcpy(dst, samples, n * sample_size);
-        dst += n * sample_size;
+        break;
+#if HAVE_BIGENDIAN
+    case AV_CODEC_ID_PCM_S16BE_PLANAR:
+#else
+    case AV_CODEC_ID_PCM_S16LE_PLANAR:
+    case AV_CODEC_ID_PCM_S32LE_PLANAR:
+#endif /* HAVE_BIGENDIAN */
+        n /= avctx->channels;
+        for (c = 0; c < avctx->channels; c++) {
+            const uint8_t *src = frame->extended_data[c];
+            bytestream_put_buffer(&dst, src, n * sample_size);
+        }
         break;
     case AV_CODEC_ID_PCM_ALAW:
         for (; n > 0; n--) {
@@ -202,7 +233,7 @@ static av_cold int pcm_decode_init(AVCodecContext *avctx)
         return AVERROR(EINVAL);
     }
 
-    switch (avctx->codec->id) {
+    switch (avctx->codec_id) {
     case AV_CODEC_ID_PCM_ALAW:
         for (i = 0; i < 256; i++)
             s->table[i] = alaw2linear(i);
@@ -218,7 +249,7 @@ static av_cold int pcm_decode_init(AVCodecContext *avctx)
     avctx->sample_fmt = avctx->codec->sample_fmts[0];
 
     if (avctx->sample_fmt == AV_SAMPLE_FMT_S32)
-        avctx->bits_per_raw_sample = av_get_bits_per_sample(avctx->codec->id);
+        avctx->bits_per_raw_sample = av_get_bits_per_sample(avctx->codec_id);
 
     return 0;
 }
@@ -240,28 +271,17 @@ static av_cold int pcm_decode_init(AVCodecContext *avctx)
         dst += size / 8;                                                \
     }
 
-#if HAVE_BIGENDIAN
-#define DECODE_PLANAR(size, endian, src, dst, n, shift, offset)         \
-    {                                                                   \
-        int n2;                                                         \
-        n /= avctx->channels;                                           \
-        for (c = 0; c < avctx->channels; c++) {                         \
-            samples = frame->extended_data[c];                          \
-            n2 = n;                                                     \
-            DECODE(size, endian, src, samples, n2, 0, 0)                \
-        }                                                               \
-    }
-#else
 #define DECODE_PLANAR(size, endian, src, dst, n, shift, offset)         \
-    {                                                                   \
-        n /= avctx->channels;                                           \
-        for (c = 0; c < avctx->channels; c++) {                         \
-            samples = frame->extended_data[c];                          \
-            memcpy(samples, src, n * size / 8);                         \
-            src += n * size / 8;                                        \
+    n /= avctx->channels;                                               \
+    for (c = 0; c < avctx->channels; c++) {                             \
+        int i;                                                          \
+        dst = frame->extended_data[c];                                \
+        for (i = n; i > 0; i--) {                                       \
+            uint ## size ## _t v = bytestream_get_ ## endian(&src);     \
+            AV_WN ## size ## A(dst, (v - offset) << shift);             \
+            dst += size / 8;                                            \
         }                                                               \
     }
-#endif /* HAVE_BIGENDIAN */
 
 static int pcm_decode_frame(AVCodecContext *avctx, void *data,
                             int *got_frame_ptr, AVPacket *avpkt)
@@ -289,12 +309,24 @@ static int pcm_decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR(EINVAL);
     }
 
+    if (avctx->channels == 0) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid number of channels\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (avctx->codec_id != avctx->codec->id) {
+        av_log(avctx, AV_LOG_ERROR, "codec ids mismatch\n");
+        return AVERROR(EINVAL);
+    }
+
     n = avctx->channels * sample_size;
 
     if (n && buf_size % n) {
         if (buf_size < n) {
-            av_log(avctx, AV_LOG_ERROR, "invalid PCM packet\n");
-            return -1;
+            av_log(avctx, AV_LOG_ERROR,
+                   "Invalid PCM packet, data has size %d but at least a size of %d was expected\n",
+                   buf_size, n);
+            return AVERROR_INVALIDDATA;
         } else
             buf_size -= buf_size % n;
     }
@@ -303,13 +335,11 @@ static int pcm_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = n * samples_per_block / avctx->channels;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples = frame->data[0];
 
-    switch (avctx->codec->id) {
+    switch (avctx->codec_id) {
     case AV_CODEC_ID_PCM_U32LE:
         DECODE(32, le32, src, samples, n, 0, 0x80000000)
         break;
@@ -319,6 +349,9 @@ static int pcm_decode_frame(AVCodecContext *avctx, void *data,
     case AV_CODEC_ID_PCM_S24LE:
         DECODE(32, le24, src, samples, n, 8, 0)
         break;
+    case AV_CODEC_ID_PCM_S24LE_PLANAR:
+        DECODE_PLANAR(32, le24, src, samples, n, 8, 0);
+        break;
     case AV_CODEC_ID_PCM_S24BE:
         DECODE(32, be24, src, samples, n, 8, 0)
         break;
@@ -337,18 +370,6 @@ static int pcm_decode_frame(AVCodecContext *avctx, void *data,
             samples += 2;
         }
         break;
-    case AV_CODEC_ID_PCM_S16BE_PLANAR:
-        DECODE_PLANAR(16, be16, src, samples, n, 0, 0);
-        break;
-    case AV_CODEC_ID_PCM_S16LE_PLANAR:
-        DECODE_PLANAR(16, le16, src, samples, n, 0, 0);
-        break;
-    case AV_CODEC_ID_PCM_S24LE_PLANAR:
-        DECODE_PLANAR(32, le24, src, samples, n, 8, 0);
-        break;
-    case AV_CODEC_ID_PCM_S32LE_PLANAR:
-        DECODE_PLANAR(32, le32, src, samples, n, 0, 0);
-        break;
     case AV_CODEC_ID_PCM_U16LE:
         DECODE(16, le16, src, samples, n, 0, 0x8000)
         break;
@@ -359,6 +380,15 @@ static int pcm_decode_frame(AVCodecContext *avctx, void *data,
         for (; n > 0; n--)
             *samples++ = *src++ + 128;
         break;
+    case AV_CODEC_ID_PCM_S8_PLANAR:
+        n /= avctx->channels;
+        for (c = 0; c < avctx->channels; c++) {
+            int i;
+            samples = frame->extended_data[c];
+            for (i = n; i > 0; i--)
+                *samples++ = *src++ + 128;
+        }
+        break;
 #if HAVE_BIGENDIAN
     case AV_CODEC_ID_PCM_F64LE:
         DECODE(64, le64, src, samples, n, 0, 0)
@@ -367,9 +397,15 @@ static int pcm_decode_frame(AVCodecContext *avctx, void *data,
     case AV_CODEC_ID_PCM_F32LE:
         DECODE(32, le32, src, samples, n, 0, 0)
         break;
+    case AV_CODEC_ID_PCM_S32LE_PLANAR:
+        DECODE_PLANAR(32, le32, src, samples, n, 0, 0);
+        break;
     case AV_CODEC_ID_PCM_S16LE:
         DECODE(16, le16, src, samples, n, 0, 0)
         break;
+    case AV_CODEC_ID_PCM_S16LE_PLANAR:
+        DECODE_PLANAR(16, le16, src, samples, n, 0, 0);
+        break;
     case AV_CODEC_ID_PCM_F64BE:
     case AV_CODEC_ID_PCM_F32BE:
     case AV_CODEC_ID_PCM_S32BE:
@@ -385,6 +421,9 @@ static int pcm_decode_frame(AVCodecContext *avctx, void *data,
     case AV_CODEC_ID_PCM_S16BE:
         DECODE(16, be16, src, samples, n, 0, 0)
         break;
+    case AV_CODEC_ID_PCM_S16BE_PLANAR:
+        DECODE_PLANAR(16, be16, src, samples, n, 0, 0);
+        break;
     case AV_CODEC_ID_PCM_F64LE:
     case AV_CODEC_ID_PCM_F32LE:
     case AV_CODEC_ID_PCM_S32LE:
@@ -393,6 +432,18 @@ static int pcm_decode_frame(AVCodecContext *avctx, void *data,
     case AV_CODEC_ID_PCM_U8:
         memcpy(samples, src, n * sample_size);
         break;
+#if HAVE_BIGENDIAN
+    case AV_CODEC_ID_PCM_S16BE_PLANAR:
+#else
+    case AV_CODEC_ID_PCM_S16LE_PLANAR:
+    case AV_CODEC_ID_PCM_S32LE_PLANAR:
+#endif /* HAVE_BIGENDIAN */
+        n /= avctx->channels;
+        for (c = 0; c < avctx->channels; c++) {
+            samples = frame->extended_data[c];
+            bytestream_get_buffer(&src, samples, n * sample_size);
+        }
+        break;
     case AV_CODEC_ID_PCM_ZORK:
         for (; n > 0; n--) {
             int v = *src++;
@@ -489,25 +540,26 @@ AVCodec ff_ ## name_ ## _decoder = {                                        \
     PCM_DECODER(id, sample_fmt_, name, long_name_)
 
 /* Note: Do not forget to add new entries to the Makefile as well. */
-PCM_CODEC  (PCM_ALAW,         AV_SAMPLE_FMT_S16, pcm_alaw,         "PCM A-law");
+PCM_CODEC  (PCM_ALAW,         AV_SAMPLE_FMT_S16, pcm_alaw,         "PCM A-law / G.711 A-law");
 PCM_CODEC  (PCM_F32BE,        AV_SAMPLE_FMT_FLT, pcm_f32be,        "PCM 32-bit floating point big-endian");
 PCM_CODEC  (PCM_F32LE,        AV_SAMPLE_FMT_FLT, pcm_f32le,        "PCM 32-bit floating point little-endian");
 PCM_CODEC  (PCM_F64BE,        AV_SAMPLE_FMT_DBL, pcm_f64be,        "PCM 64-bit floating point big-endian");
 PCM_CODEC  (PCM_F64LE,        AV_SAMPLE_FMT_DBL, pcm_f64le,        "PCM 64-bit floating point little-endian");
-PCM_DECODER(PCM_LXF,          AV_SAMPLE_FMT_S32P, pcm_lxf,          "PCM signed 20-bit little-endian planar");
-PCM_CODEC  (PCM_MULAW,        AV_SAMPLE_FMT_S16, pcm_mulaw,        "PCM mu-law");
+PCM_DECODER(PCM_LXF,          AV_SAMPLE_FMT_S32P,pcm_lxf,          "PCM signed 20-bit little-endian planar");
+PCM_CODEC  (PCM_MULAW,        AV_SAMPLE_FMT_S16, pcm_mulaw,        "PCM mu-law / G.711 mu-law");
 PCM_CODEC  (PCM_S8,           AV_SAMPLE_FMT_U8,  pcm_s8,           "PCM signed 8-bit");
+PCM_CODEC  (PCM_S8_PLANAR,    AV_SAMPLE_FMT_U8P, pcm_s8_planar,    "PCM signed 8-bit planar");
 PCM_CODEC  (PCM_S16BE,        AV_SAMPLE_FMT_S16, pcm_s16be,        "PCM signed 16-bit big-endian");
-PCM_DECODER(PCM_S16BE_PLANAR, AV_SAMPLE_FMT_S16P,pcm_s16be_planar, "PCM signed 16-bit big-endian planar");
+PCM_CODEC  (PCM_S16BE_PLANAR, AV_SAMPLE_FMT_S16P,pcm_s16be_planar, "PCM signed 16-bit big-endian planar");
 PCM_CODEC  (PCM_S16LE,        AV_SAMPLE_FMT_S16, pcm_s16le,        "PCM signed 16-bit little-endian");
-PCM_DECODER(PCM_S16LE_PLANAR, AV_SAMPLE_FMT_S16P, pcm_s16le_planar, "PCM 16-bit little-endian planar");
+PCM_CODEC  (PCM_S16LE_PLANAR, AV_SAMPLE_FMT_S16P,pcm_s16le_planar, "PCM signed 16-bit little-endian planar");
 PCM_CODEC  (PCM_S24BE,        AV_SAMPLE_FMT_S32, pcm_s24be,        "PCM signed 24-bit big-endian");
 PCM_CODEC  (PCM_S24DAUD,      AV_SAMPLE_FMT_S16, pcm_s24daud,      "PCM D-Cinema audio signed 24-bit");
 PCM_CODEC  (PCM_S24LE,        AV_SAMPLE_FMT_S32, pcm_s24le,        "PCM signed 24-bit little-endian");
-PCM_DECODER(PCM_S24LE_PLANAR, AV_SAMPLE_FMT_S32P,pcm_s24le_planar, "PCM signed 24-bit little-endian planar");
+PCM_CODEC  (PCM_S24LE_PLANAR, AV_SAMPLE_FMT_S32P,pcm_s24le_planar, "PCM signed 24-bit little-endian planar");
 PCM_CODEC  (PCM_S32BE,        AV_SAMPLE_FMT_S32, pcm_s32be,        "PCM signed 32-bit big-endian");
 PCM_CODEC  (PCM_S32LE,        AV_SAMPLE_FMT_S32, pcm_s32le,        "PCM signed 32-bit little-endian");
-PCM_DECODER(PCM_S32LE_PLANAR, AV_SAMPLE_FMT_S32P,pcm_s32le_planar, "PCM signed 32-bit little-endian planar");
+PCM_CODEC  (PCM_S32LE_PLANAR, AV_SAMPLE_FMT_S32P,pcm_s32le_planar, "PCM signed 32-bit little-endian planar");
 PCM_CODEC  (PCM_U8,           AV_SAMPLE_FMT_U8,  pcm_u8,           "PCM unsigned 8-bit");
 PCM_CODEC  (PCM_U16BE,        AV_SAMPLE_FMT_S16, pcm_u16be,        "PCM unsigned 16-bit big-endian");
 PCM_CODEC  (PCM_U16LE,        AV_SAMPLE_FMT_S16, pcm_u16le,        "PCM unsigned 16-bit little-endian");
@@ -516,3 +568,4 @@ PCM_CODEC  (PCM_U24LE,        AV_SAMPLE_FMT_S32, pcm_u24le,        "PCM unsigned
 PCM_CODEC  (PCM_U32BE,        AV_SAMPLE_FMT_S32, pcm_u32be,        "PCM unsigned 32-bit big-endian");
 PCM_CODEC  (PCM_U32LE,        AV_SAMPLE_FMT_S32, pcm_u32le,        "PCM unsigned 32-bit little-endian");
 PCM_DECODER(PCM_ZORK,         AV_SAMPLE_FMT_U8,  pcm_zork,         "PCM Zork");
+
diff --git a/libavcodec/pcm_tablegen.c b/libavcodec/pcm_tablegen.c
index 7b4bc8c..bf8e7fb 100644
--- a/libavcodec/pcm_tablegen.c
+++ b/libavcodec/pcm_tablegen.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/pcm_tablegen.h b/libavcodec/pcm_tablegen.h
index 438c2b9..7ce147f 100644
--- a/libavcodec/pcm_tablegen.h
+++ b/libavcodec/pcm_tablegen.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -84,21 +84,21 @@ static av_cold void build_xlaw_table(uint8_t *linear_to_xlaw,
 {
     int i, j, v, v1, v2;
 
-    j = 0;
-    for(i=0;i<128;i++) {
-        if (i != 127) {
-            v1 = xlaw2linear(i ^ mask);
-            v2 = xlaw2linear((i + 1) ^ mask);
-            v = (v1 + v2 + 4) >> 3;
-        } else {
-            v = 8192;
-        }
-        for(;j<v;j++) {
+    j = 1;
+    linear_to_xlaw[8192] = mask;
+    for(i=0;i<127;i++) {
+        v1 = xlaw2linear(i ^ mask);
+        v2 = xlaw2linear((i + 1) ^ mask);
+        v = (v1 + v2 + 4) >> 3;
+        for(;j<v;j+=1) {
+            linear_to_xlaw[8192 - j] = (i ^ (mask ^ 0x80));
             linear_to_xlaw[8192 + j] = (i ^ mask);
-            if (j > 0)
-                linear_to_xlaw[8192 - j] = (i ^ (mask ^ 0x80));
         }
     }
+    for(;j<8192;j++) {
+        linear_to_xlaw[8192 - j] = (127 ^ (mask ^ 0x80));
+        linear_to_xlaw[8192 + j] = (127 ^ mask);
+    }
     linear_to_xlaw[0] = linear_to_xlaw[1];
 }
 
diff --git a/libavcodec/pcx.c b/libavcodec/pcx.c
index aa69d51..1d3ee8d 100644
--- a/libavcodec/pcx.c
+++ b/libavcodec/pcx.c
@@ -5,20 +5,20 @@
  * This decoder does not support CGA palettes. I am unable to find samples
  * and Netpbm cannot generate them.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,44 +28,37 @@
 #include "get_bits.h"
 #include "internal.h"
 
-/**
- * @return advanced src pointer
- */
-static const uint8_t *pcx_rle_decode(const uint8_t *src,
-                                     const uint8_t *end,
-                                     uint8_t *dst,
-                                     unsigned int bytes_per_scanline,
-                                     int compressed)
+static void pcx_rle_decode(GetByteContext *gb,
+                           uint8_t *dst,
+                           unsigned int bytes_per_scanline,
+                           int compressed)
 {
     unsigned int i = 0;
     unsigned char run, value;
 
     if (compressed) {
-        while (i < bytes_per_scanline && src < end) {
+        while (i < bytes_per_scanline && bytestream2_get_bytes_left(gb)>0) {
             run   = 1;
-            value = *src++;
-            if (value >= 0xc0 && src < end) {
+            value = bytestream2_get_byte(gb);
+            if (value >= 0xc0 && bytestream2_get_bytes_left(gb)>0) {
                 run   = value & 0x3f;
-                value = *src++;
+                value = bytestream2_get_byte(gb);
             }
             while (i < bytes_per_scanline && run--)
                 dst[i++] = value;
         }
     } else {
-        memcpy(dst, src, bytes_per_scanline);
-        src += bytes_per_scanline;
+        bytestream2_get_buffer(gb, dst, bytes_per_scanline);
     }
-
-    return src;
 }
 
-static void pcx_palette(const uint8_t **src, uint32_t *dst,
-                        unsigned int pallen)
+static void pcx_palette(GetByteContext *gb, uint32_t *dst, int pallen)
 {
-    unsigned int i;
+    int i;
 
+    pallen = FFMIN(pallen, bytestream2_get_bytes_left(gb) / 3);
     for (i = 0; i < pallen; i++)
-        *dst++ = bytestream_get_be24(src);
+        *dst++ = 0xFF000000 | bytestream2_get_be24u(gb);
     if (pallen < 256)
         memset(dst, 0, (256 - pallen) * sizeof(*dst));
 }
@@ -73,28 +66,32 @@ static void pcx_palette(const uint8_t **src, uint32_t *dst,
 static int pcx_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                             AVPacket *avpkt)
 {
-    const uint8_t *buf = avpkt->data;
-    int buf_size       = avpkt->size;
-    AVFrame *const p   = data;
+    GetByteContext gb;
+    AVFrame * const p  = data;
     int compressed, xmin, ymin, xmax, ymax;
+    int ret;
     unsigned int w, h, bits_per_pixel, bytes_per_line, nplanes, stride, y, x,
                  bytes_per_scanline;
-    uint8_t *ptr;
-    const uint8_t *buf_end = buf + buf_size;
-    const uint8_t *bufstart = buf;
-    uint8_t *scanline;
-    int ret = -1;
+    uint8_t *ptr, *scanline;
+
+    if (avpkt->size < 128)
+        return AVERROR_INVALIDDATA;
+
+    bytestream2_init(&gb, avpkt->data, avpkt->size);
 
-    if (buf[0] != 0x0a || buf[1] > 5) {
+    if (bytestream2_get_byteu(&gb) != 0x0a || bytestream2_get_byteu(&gb) > 5) {
         av_log(avctx, AV_LOG_ERROR, "this is not PCX encoded data\n");
         return AVERROR_INVALIDDATA;
     }
 
-    compressed = buf[2];
-    xmin       = AV_RL16(buf + 4);
-    ymin       = AV_RL16(buf + 6);
-    xmax       = AV_RL16(buf + 8);
-    ymax       = AV_RL16(buf + 10);
+    compressed                     = bytestream2_get_byteu(&gb);
+    bits_per_pixel                 = bytestream2_get_byteu(&gb);
+    xmin                           = bytestream2_get_le16u(&gb);
+    ymin                           = bytestream2_get_le16u(&gb);
+    xmax                           = bytestream2_get_le16u(&gb);
+    ymax                           = bytestream2_get_le16u(&gb);
+    avctx->sample_aspect_ratio.num = bytestream2_get_le16u(&gb);
+    avctx->sample_aspect_ratio.den = bytestream2_get_le16u(&gb);
 
     if (xmax < xmin || ymax < ymin) {
         av_log(avctx, AV_LOG_ERROR, "invalid image dimensions\n");
@@ -104,13 +101,13 @@ static int pcx_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     w = xmax - xmin + 1;
     h = ymax - ymin + 1;
 
-    bits_per_pixel     = buf[3];
-    bytes_per_line     = AV_RL16(buf + 66);
-    nplanes            = buf[65];
+    bytestream2_skipu(&gb, 49);
+    nplanes            = bytestream2_get_byteu(&gb);
+    bytes_per_line     = bytestream2_get_le16u(&gb);
     bytes_per_scanline = nplanes * bytes_per_line;
 
     if (bytes_per_scanline < (w * bits_per_pixel * nplanes + 7) / 8 ||
-        (!compressed && bytes_per_scanline > buf_size / h)) {
+        (!compressed && bytes_per_scanline > bytestream2_get_bytes_left(&gb) / h)) {
         av_log(avctx, AV_LOG_ERROR, "PCX data is corrupted\n");
         return AVERROR_INVALIDDATA;
     }
@@ -133,29 +130,26 @@ static int pcx_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return AVERROR_INVALIDDATA;
     }
 
-    buf += 128;
+    bytestream2_skipu(&gb, 60);
 
     if ((ret = ff_set_dimensions(avctx, w, h)) < 0)
         return ret;
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
 
     p->pict_type = AV_PICTURE_TYPE_I;
 
     ptr    = p->data[0];
     stride = p->linesize[0];
 
-    scanline = av_malloc(bytes_per_scanline);
+    scanline = av_malloc(bytes_per_scanline + AV_INPUT_BUFFER_PADDING_SIZE);
     if (!scanline)
         return AVERROR(ENOMEM);
 
     if (nplanes == 3 && bits_per_pixel == 8) {
         for (y = 0; y < h; y++) {
-            buf = pcx_rle_decode(buf, buf_end,
-                                 scanline, bytes_per_scanline, compressed);
+            pcx_rle_decode(&gb, scanline, bytes_per_scanline, compressed);
 
             for (x = 0; x < w; x++) {
                 ptr[3 * x]     = scanline[x];
@@ -166,39 +160,37 @@ static int pcx_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             ptr += stride;
         }
     } else if (nplanes == 1 && bits_per_pixel == 8) {
-        const uint8_t *palstart = bufstart + buf_size - 769;
+        int palstart = avpkt->size - 769;
 
-        if (buf_size < 769) {
+        if (avpkt->size < 769) {
             av_log(avctx, AV_LOG_ERROR, "File is too short\n");
             ret = avctx->err_recognition & AV_EF_EXPLODE ?
-                  AVERROR_INVALIDDATA : buf_size;
+                  AVERROR_INVALIDDATA : avpkt->size;
             goto end;
         }
 
         for (y = 0; y < h; y++, ptr += stride) {
-            buf = pcx_rle_decode(buf, buf_end,
-                                 scanline, bytes_per_scanline, compressed);
+            pcx_rle_decode(&gb, scanline, bytes_per_scanline, compressed);
             memcpy(ptr, scanline, w);
         }
 
-        if (buf != palstart) {
+        if (bytestream2_tell(&gb) != palstart) {
             av_log(avctx, AV_LOG_WARNING, "image data possibly corrupted\n");
-            buf = palstart;
+            bytestream2_seek(&gb, palstart, SEEK_SET);
         }
-        if (*buf++ != 12) {
+        if (bytestream2_get_byte(&gb) != 12) {
             av_log(avctx, AV_LOG_ERROR, "expected palette after image data\n");
             ret = avctx->err_recognition & AV_EF_EXPLODE ?
-                  AVERROR_INVALIDDATA : buf_size;
+                  AVERROR_INVALIDDATA : avpkt->size;
             goto end;
         }
     } else if (nplanes == 1) {   /* all packed formats, max. 16 colors */
         GetBitContext s;
 
         for (y = 0; y < h; y++) {
-            init_get_bits(&s, scanline, bytes_per_scanline << 3);
+            init_get_bits8(&s, scanline, bytes_per_scanline);
 
-            buf = pcx_rle_decode(buf, buf_end,
-                                 scanline, bytes_per_scanline, compressed);
+            pcx_rle_decode(&gb, scanline, bytes_per_scanline, compressed);
 
             for (x = 0; x < w; x++)
                 ptr[x] = get_bits(&s, bits_per_pixel);
@@ -208,8 +200,7 @@ static int pcx_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         int i;
 
         for (y = 0; y < h; y++) {
-            buf = pcx_rle_decode(buf, buf_end,
-                                 scanline, bytes_per_scanline, compressed);
+            pcx_rle_decode(&gb, scanline, bytes_per_scanline, compressed);
 
             for (x = 0; x < w; x++) {
                 int m = 0x80 >> (x & 7), v = 0;
@@ -223,16 +214,20 @@ static int pcx_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         }
     }
 
+    ret = bytestream2_tell(&gb);
     if (nplanes == 1 && bits_per_pixel == 8) {
-        pcx_palette(&buf, (uint32_t *)p->data[1], 256);
+        pcx_palette(&gb, (uint32_t *)p->data[1], 256);
+        ret += 256 * 3;
+    } else if (bits_per_pixel * nplanes == 1) {
+        AV_WN32A(p->data[1]  , 0xFF000000);
+        AV_WN32A(p->data[1]+4, 0xFFFFFFFF);
     } else if (bits_per_pixel < 8) {
-        const uint8_t *palette = bufstart + 16;
-        pcx_palette(&palette, (uint32_t *)p->data[1], 16);
+        bytestream2_seek(&gb, 16, SEEK_SET);
+        pcx_palette(&gb, (uint32_t *)p->data[1], 16);
     }
 
     *got_frame = 1;
 
-    ret = buf - bufstart;
 end:
     av_free(scanline);
     return ret;
diff --git a/libavcodec/pcxenc.c b/libavcodec/pcxenc.c
index 7fc0d9c..6135944 100644
--- a/libavcodec/pcxenc.c
+++ b/libavcodec/pcxenc.c
@@ -2,20 +2,20 @@
  * PC Paintbrush PCX (.pcx) image encoder
  * Copyright (c) 2009 Daniel Verkamp <daniel at drv.nu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,11 +23,12 @@
  * @file
  * PCX image encoder
  * @author Daniel Verkamp
- * @see http://www.qzx.com/pc-gpe/pcx.txt
+ * @see http://bespin.org/~qz/pc-gpe/pcx.txt
  */
 
 #include "avcodec.h"
 #include "bytestream.h"
+#include "libavutil/imgutils.h"
 #include "internal.h"
 
 static const uint32_t monoblack_pal[16] = { 0x000000, 0xFFFFFF };
@@ -100,8 +101,9 @@ static int pcx_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     const uint8_t *buf_end;
     uint8_t *buf;
 
-    int bpp, nplanes, i, y, line_bytes, written, ret, max_pkt_size;
+    int bpp, nplanes, i, y, line_bytes, written, ret, max_pkt_size, sw, sh;
     const uint32_t *pal = NULL;
+    uint32_t palette256[256];
     const uint8_t *src;
 
     if (avctx->width > 65535 || avctx->height > 65535) {
@@ -119,6 +121,11 @@ static int pcx_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     case AV_PIX_FMT_RGB4_BYTE:
     case AV_PIX_FMT_BGR4_BYTE:
     case AV_PIX_FMT_GRAY8:
+        bpp = 8;
+        nplanes = 1;
+        avpriv_set_systematic_pal2(palette256, avctx->pix_fmt);
+        pal = palette256;
+        break;
     case AV_PIX_FMT_PAL8:
         bpp = 8;
         nplanes = 1;
@@ -138,13 +145,16 @@ static int pcx_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     line_bytes = (line_bytes + 1) & ~1;
 
     max_pkt_size = 128 + avctx->height * 2 * line_bytes * nplanes + (pal ? 256*3 + 1 : 0);
-    if ((ret = ff_alloc_packet(pkt, max_pkt_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n", max_pkt_size);
+    if ((ret = ff_alloc_packet2(avctx, pkt, max_pkt_size, 0)) < 0)
         return ret;
-    }
     buf     = pkt->data;
     buf_end = pkt->data + pkt->size;
 
+    sw = avctx->sample_aspect_ratio.num;
+    sh = avctx->sample_aspect_ratio.den;
+    if (sw > 0xFFFFu || sh > 0xFFFFu)
+        av_reduce(&sw, &sh, sw, sh, 0xFFFFu);
+
     bytestream_put_byte(&buf, 10);                  // manufacturer
     bytestream_put_byte(&buf, 5);                   // version
     bytestream_put_byte(&buf, 1);                   // encoding
@@ -153,8 +163,8 @@ static int pcx_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     bytestream_put_le16(&buf, 0);                   // y min
     bytestream_put_le16(&buf, avctx->width - 1);    // x max
     bytestream_put_le16(&buf, avctx->height - 1);   // y max
-    bytestream_put_le16(&buf, 0);                   // horizontal DPI
-    bytestream_put_le16(&buf, 0);                   // vertical DPI
+    bytestream_put_le16(&buf, sw);                  // horizontal DPI
+    bytestream_put_le16(&buf, sh);                  // vertical DPI
     for (i = 0; i < 16; i++)
         bytestream_put_be24(&buf, pal ? pal[i] : 0);// palette (<= 16 color only)
     bytestream_put_byte(&buf, 0);                   // reserved
diff --git a/libavcodec/pel_template.c b/libavcodec/pel_template.c
index b832ae7..6da7a56 100644
--- a/libavcodec/pel_template.c
+++ b/libavcodec/pel_template.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/pgssubdec.c b/libavcodec/pgssubdec.c
index 7bcb1ed..133d08b 100644
--- a/libavcodec/pgssubdec.c
+++ b/libavcodec/pgssubdec.c
@@ -2,20 +2,20 @@
  * PGS subtitle decoder
  * Copyright (c) 2009 Stephen Backway
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,8 +31,9 @@
 
 #include "libavutil/colorspace.h"
 #include "libavutil/imgutils.h"
+#include "libavutil/opt.h"
 
-#define RGBA(r,g,b,a) (((a) << 24) | ((r) << 16) | ((g) << 8) | (b))
+#define RGBA(r,g,b,a) (((unsigned)(a) << 24) | ((r) << 16) | ((g) << 8) | (b))
 #define MAX_EPOCH_PALETTES 8   // Max 8 allowed per PGS epoch
 #define MAX_EPOCH_OBJECTS  64  // Max 64 allowed per PGS epoch
 #define MAX_OBJECT_REFS    2   // Max objects per display set
@@ -90,9 +91,11 @@ typedef struct PGSSubPalettes {
 } PGSSubPalettes;
 
 typedef struct PGSSubContext {
+    AVClass *class;
     PGSSubPresentation presentation;
     PGSSubPalettes     palettes;
     PGSSubObjects      objects;
+    int forced_subs_only;
 } PGSSubContext;
 
 static void flush_cache(AVCodecContext *avctx)
@@ -133,7 +136,7 @@ static PGSSubPalette * find_palette(int id, PGSSubPalettes *palettes)
 
 static av_cold int init_decoder(AVCodecContext *avctx)
 {
-    avctx->pix_fmt = AV_PIX_FMT_PAL8;
+    avctx->pix_fmt     = AV_PIX_FMT_PAL8;
 
     return 0;
 }
@@ -148,7 +151,7 @@ static av_cold int close_decoder(AVCodecContext *avctx)
 /**
  * Decode the RLE data.
  *
- * The subtitle is stored as an Run Length Encoded image.
+ * The subtitle is stored as a Run Length Encoded image.
  *
  * @param avctx contains the current codec context
  * @param sub pointer to the processed subtitle data
@@ -163,7 +166,7 @@ static int decode_rle(AVCodecContext *avctx, AVSubtitleRect *rect,
 
     rle_bitmap_end = buf + buf_size;
 
-    rect->data[0] = av_malloc(rect->w * rect->h);
+    rect->data[0] = av_malloc_array(rect->w, rect->h);
 
     if (!rect->data[0])
         return AVERROR(ENOMEM);
@@ -287,15 +290,15 @@ static int parse_object_segment(AVCodecContext *avctx,
     height = bytestream_get_be16(&buf);
 
     /* Make sure the bitmap is not too large */
-    if (avctx->width < width || avctx->height < height) {
-        av_log(avctx, AV_LOG_ERROR, "Bitmap dimensions larger than video.\n");
+    if (avctx->width < width || avctx->height < height || !width || !height) {
+        av_log(avctx, AV_LOG_ERROR, "Bitmap dimensions (%dx%d) invalid.\n", width, height);
         return AVERROR_INVALIDDATA;
     }
 
     object->w = width;
     object->h = height;
 
-    av_fast_malloc(&object->rle, &object->rle_buffer_size, rle_bitmap_len);
+    av_fast_padded_malloc(&object->rle, &object->rle_buffer_size, rle_bitmap_len);
 
     if (!object->rle)
         return AVERROR(ENOMEM);
@@ -351,8 +354,14 @@ static int parse_palette_segment(AVCodecContext *avctx,
         cb        = bytestream_get_byte(&buf);
         alpha     = bytestream_get_byte(&buf);
 
-        YUV_TO_RGB1(cb, cr);
-        YUV_TO_RGB2(r, g, b, y);
+        /* Default to BT.709 colorimetry. In case of <= 576 height use BT.601 */
+        if (avctx->height <= 0 || avctx->height > 576) {
+            YUV_TO_RGB1_CCIR_BT709(cb, cr);
+        } else {
+            YUV_TO_RGB1_CCIR(cb, cr);
+        }
+
+        YUV_TO_RGB2_CCIR(r, g, b, y);
 
         ff_dlog(avctx, "Color %d := (%d,%d,%d,%d)\n", color_id, r, g, b, alpha);
 
@@ -378,8 +387,8 @@ static int parse_presentation_segment(AVCodecContext *avctx,
                                       int64_t pts)
 {
     PGSSubContext *ctx = avctx->priv_data;
-
     int i, state, ret;
+    const uint8_t *buf_end = buf + buf_size;
 
     // Video descriptor
     int w = bytestream_get_be16(&buf);
@@ -428,8 +437,16 @@ static int parse_presentation_segment(AVCodecContext *avctx,
         }
     }
 
+
     for (i = 0; i < ctx->presentation.object_count; i++)
     {
+
+        if (buf_end - buf < 8) {
+            av_log(avctx, AV_LOG_ERROR, "Insufficent space for object\n");
+            ctx->presentation.object_count = i;
+            return AVERROR_INVALIDDATA;
+        }
+
         ctx->presentation.objects[i].id = bytestream_get_be16(&buf);
         ctx->presentation.objects[i].window_id = bytestream_get_byte(&buf);
         ctx->presentation.objects[i].composition_flag = bytestream_get_byte(&buf);
@@ -480,11 +497,14 @@ static int display_end_segment(AVCodecContext *avctx, void *data,
 {
     AVSubtitle    *sub = data;
     PGSSubContext *ctx = avctx->priv_data;
+    int64_t pts;
     PGSSubPalette *palette;
     int i, ret;
 
+    pts = ctx->presentation.pts != AV_NOPTS_VALUE ? ctx->presentation.pts : sub->pts;
     memset(sub, 0, sizeof(*sub));
-    sub->pts = ctx->presentation.pts;
+    sub->pts = pts;
+    ctx->presentation.pts = AV_NOPTS_VALUE;
     sub->start_display_time = 0;
     // There is no explicit end time for PGS subtitles.  The end time
     // is defined by the start of the next sub which may contain no
@@ -495,7 +515,7 @@ static int display_end_segment(AVCodecContext *avctx, void *data,
     // Blank if last object_count was 0.
     if (!ctx->presentation.object_count)
         return 1;
-    sub->rects = av_mallocz(sizeof(*sub->rects) * ctx->presentation.object_count);
+    sub->rects = av_mallocz_array(ctx->presentation.object_count, sizeof(*sub->rects));
     if (!sub->rects) {
         return AVERROR(ENOMEM);
     }
@@ -572,6 +592,9 @@ static int display_end_segment(AVCodecContext *avctx, void *data,
             return AVERROR(ENOMEM);
         }
 
+        if (!ctx->forced_subs_only || ctx->presentation.objects[i].composition_flag & 0x40)
+        memcpy(sub->rects[i]->data[1], palette->clut, sub->rects[i]->nb_colors * sizeof(uint32_t));
+
 #if FF_API_AVPICTURE
 FF_DISABLE_DEPRECATION_WARNINGS
         rect = sub->rects[i];
@@ -581,9 +604,6 @@ FF_DISABLE_DEPRECATION_WARNINGS
         }
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
-
-        memcpy(sub->rects[i]->data[1], palette->clut, sub->rects[i]->nb_colors * sizeof(uint32_t));
-
     }
     return 1;
 }
@@ -637,7 +657,7 @@ static int decode(AVCodecContext *avctx, void *data, int *data_size,
             ret = parse_object_segment(avctx, buf, segment_length);
             break;
         case PRESENTATION_SEGMENT:
-            ret = parse_presentation_segment(avctx, buf, segment_length, avpkt->pts);
+            ret = parse_presentation_segment(avctx, buf, segment_length, ((AVSubtitle*)(data))->pts);
             break;
         case WINDOW_SEGMENT:
             /*
@@ -669,6 +689,20 @@ static int decode(AVCodecContext *avctx, void *data, int *data_size,
     return buf_size;
 }
 
+#define OFFSET(x) offsetof(PGSSubContext, x)
+#define SD AV_OPT_FLAG_SUBTITLE_PARAM | AV_OPT_FLAG_DECODING_PARAM
+static const AVOption options[] = {
+    {"forced_subs_only", "Only show forced subtitles", OFFSET(forced_subs_only), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, SD},
+    { NULL },
+};
+
+static const AVClass pgsdec_class = {
+    .class_name = "PGS subtitle decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_pgssub_decoder = {
     .name           = "pgssub",
     .long_name      = NULL_IF_CONFIG_SMALL("HDMV Presentation Graphic Stream subtitles"),
@@ -678,4 +712,5 @@ AVCodec ff_pgssub_decoder = {
     .init           = init_decoder,
     .close          = close_decoder,
     .decode         = decode,
+    .priv_class     = &pgsdec_class,
 };
diff --git a/libavcodec/pictordec.c b/libavcodec/pictordec.c
index 9477bc4..ff6eb7f 100644
--- a/libavcodec/pictordec.c
+++ b/libavcodec/pictordec.c
@@ -2,20 +2,20 @@
  * Pictor/PC Paint decoder
  * Copyright (c) 2010 Peter Ross <pross@xvid.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -105,7 +105,7 @@ static int decode_frame(AVCodecContext *avctx,
     AVFrame *frame = data;
     uint32_t *palette;
     int bits_per_plane, bpp, etype, esize, npal, pos_after_pal;
-    int i, x, y, plane, tmp, ret;
+    int i, x, y, plane, tmp, ret, val;
 
     bytestream2_init(&s->g, avpkt->data, avpkt->size);
 
@@ -127,7 +127,7 @@ static int decode_frame(AVCodecContext *avctx,
         return AVERROR_PATCHWELCOME;
     }
 
-    if (bytestream2_peek_byte(&s->g) == 0xFF) {
+    if (bytestream2_peek_byte(&s->g) == 0xFF || bpp == 1 || bpp == 4 || bpp == 8) {
         bytestream2_skip(&s->g, 2);
         etype = bytestream2_get_le16(&s->g);
         esize = bytestream2_get_le16(&s->g);
@@ -140,16 +140,16 @@ static int decode_frame(AVCodecContext *avctx,
 
     avctx->pix_fmt = AV_PIX_FMT_PAL8;
 
+    if (av_image_check_size(s->width, s->height, 0, avctx) < 0)
+        return -1;
     if (s->width != avctx->width && s->height != avctx->height) {
         ret = ff_set_dimensions(avctx, s->width, s->height);
         if (ret < 0)
             return ret;
     }
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     memset(frame->data[0], 0, s->height * frame->linesize[0]);
     frame->pict_type           = AV_PICTURE_TYPE_I;
     frame->palette_has_changed = 1;
@@ -165,7 +165,7 @@ static int decode_frame(AVCodecContext *avctx,
         npal = FFMIN(esize, 16);
         for (i = 0; i < npal; i++) {
             int pal_idx = bytestream2_get_byte(&s->g);
-            palette[i]  = ff_cga_palette[FFMIN(pal_idx, 16)];
+            palette[i]  = ff_cga_palette[FFMIN(pal_idx, 15)];
         }
     } else if (etype == 3) {
         npal = FFMIN(esize, 16);
@@ -175,13 +175,15 @@ static int decode_frame(AVCodecContext *avctx,
         }
     } else if (etype == 4 || etype == 5) {
         npal = FFMIN(esize / 3, 256);
-        for (i = 0; i < npal; i++)
+        for (i = 0; i < npal; i++) {
             palette[i] = bytestream2_get_be24(&s->g) << 2;
+            palette[i] |= 0xFFU << 24 | palette[i] >> 6 & 0x30303;
+        }
     } else {
         if (bpp == 1) {
             npal = 2;
-            palette[0] = 0x000000;
-            palette[1] = 0xFFFFFF;
+            palette[0] = 0xFF000000;
+            palette[1] = 0xFFFFFFFF;
         } else if (bpp == 2) {
             npal = 4;
             for (i = 0; i < npal; i++)
@@ -196,10 +198,11 @@ static int decode_frame(AVCodecContext *avctx,
     // skip remaining palette bytes
     bytestream2_seek(&s->g, pos_after_pal, SEEK_SET);
 
-    x = 0;
+    val = 0;
     y = s->height - 1;
-    plane = 0;
     if (bytestream2_get_le16(&s->g)) {
+        x = 0;
+        plane = 0;
         while (bytestream2_get_bytes_left(&s->g) >= 6) {
             int stop_size, marker, t1, t2;
 
@@ -213,7 +216,7 @@ static int decode_frame(AVCodecContext *avctx,
             while (plane < s->nb_planes &&
                    bytestream2_get_bytes_left(&s->g) > stop_size) {
                 int run = 1;
-                int val = bytestream2_get_byte(&s->g);
+                val = bytestream2_get_byte(&s->g);
                 if (val == marker) {
                     run = bytestream2_get_byte(&s->g);
                     if (run == 0)
@@ -232,9 +235,20 @@ static int decode_frame(AVCodecContext *avctx,
                 }
             }
         }
+
+        if (x < avctx->width) {
+            int run = (y + 1) * avctx->width - x;
+            if (bits_per_plane == 8)
+                picmemset_8bpp(s, frame, val, run, &x, &y);
+            else
+                picmemset(s, frame, val, run / (8 / bits_per_plane), &x, &y, &plane, bits_per_plane);
+        }
     } else {
-        avpriv_request_sample(avctx, "Uncompressed image");
-        return avpkt->size;
+        while (y >= 0 && bytestream2_get_bytes_left(&s->g) > 0) {
+            memcpy(frame->data[0] + y * frame->linesize[0], s->g.buffer, FFMIN(avctx->width, bytestream2_get_bytes_left(&s->g)));
+            bytestream2_skip(&s->g, avctx->width);
+            y--;
+        }
     }
 finish:
 
diff --git a/libavcodec/pixblockdsp.c b/libavcodec/pixblockdsp.c
index 71423f9..f0883d3 100644
--- a/libavcodec/pixblockdsp.c
+++ b/libavcodec/pixblockdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -20,17 +20,44 @@
 
 #include "config.h"
 #include "libavutil/attributes.h"
+#include "libavutil/intreadwrite.h"
 #include "avcodec.h"
 #include "pixblockdsp.h"
 
-#define BIT_DEPTH 16
-#include "pixblockdsp_template.c"
-#undef BIT_DEPTH
+static void get_pixels_16_c(int16_t *av_restrict block, const uint8_t *pixels,
+                            ptrdiff_t line_size)
+{
+    AV_COPY128U(block + 0 * 8, pixels + 0 * line_size);
+    AV_COPY128U(block + 1 * 8, pixels + 1 * line_size);
+    AV_COPY128U(block + 2 * 8, pixels + 2 * line_size);
+    AV_COPY128U(block + 3 * 8, pixels + 3 * line_size);
+    AV_COPY128U(block + 4 * 8, pixels + 4 * line_size);
+    AV_COPY128U(block + 5 * 8, pixels + 5 * line_size);
+    AV_COPY128U(block + 6 * 8, pixels + 6 * line_size);
+    AV_COPY128U(block + 7 * 8, pixels + 7 * line_size);
+}
+
+static void get_pixels_8_c(int16_t *av_restrict block, const uint8_t *pixels,
+                           ptrdiff_t line_size)
+{
+    int i;
 
-#define BIT_DEPTH 8
-#include "pixblockdsp_template.c"
+    /* read the pixels */
+    for (i = 0; i < 8; i++) {
+        block[0] = pixels[0];
+        block[1] = pixels[1];
+        block[2] = pixels[2];
+        block[3] = pixels[3];
+        block[4] = pixels[4];
+        block[5] = pixels[5];
+        block[6] = pixels[6];
+        block[7] = pixels[7];
+        pixels  += line_size;
+        block   += 8;
+    }
+}
 
-static void diff_pixels_c(int16_t *restrict block, const uint8_t *s1,
+static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
                           const uint8_t *s2, int stride)
 {
     int i;
@@ -60,17 +87,25 @@ av_cold void ff_pixblockdsp_init(PixblockDSPContext *c, AVCodecContext *avctx)
     switch (avctx->bits_per_raw_sample) {
     case 9:
     case 10:
+    case 12:
+    case 14:
         c->get_pixels = get_pixels_16_c;
         break;
     default:
-        c->get_pixels = get_pixels_8_c;
+        if (avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
+            c->get_pixels = get_pixels_8_c;
+        }
         break;
     }
 
+    if (ARCH_ALPHA)
+        ff_pixblockdsp_init_alpha(c, avctx, high_bit_depth);
     if (ARCH_ARM)
         ff_pixblockdsp_init_arm(c, avctx, high_bit_depth);
     if (ARCH_PPC)
         ff_pixblockdsp_init_ppc(c, avctx, high_bit_depth);
     if (ARCH_X86)
         ff_pixblockdsp_init_x86(c, avctx, high_bit_depth);
+    if (ARCH_MIPS)
+        ff_pixblockdsp_init_mips(c, avctx, high_bit_depth);
 }
diff --git a/libavcodec/pixblockdsp.h b/libavcodec/pixblockdsp.h
index 8094d14..79ed86c 100644
--- a/libavcodec/pixblockdsp.h
+++ b/libavcodec/pixblockdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,7 +26,7 @@
 typedef struct PixblockDSPContext {
     void (*get_pixels)(int16_t *block /* align 16 */,
                        const uint8_t *pixels /* align 8 */,
-                       int line_size);
+                       ptrdiff_t line_size);
     void (*diff_pixels)(int16_t *block /* align 16 */,
                         const uint8_t *s1 /* align 8 */,
                         const uint8_t *s2 /* align 8 */,
@@ -34,11 +34,15 @@ typedef struct PixblockDSPContext {
 } PixblockDSPContext;
 
 void ff_pixblockdsp_init(PixblockDSPContext *c, AVCodecContext *avctx);
+void ff_pixblockdsp_init_alpha(PixblockDSPContext *c, AVCodecContext *avctx,
+                               unsigned high_bit_depth);
 void ff_pixblockdsp_init_arm(PixblockDSPContext *c, AVCodecContext *avctx,
                              unsigned high_bit_depth);
 void ff_pixblockdsp_init_ppc(PixblockDSPContext *c, AVCodecContext *avctx,
                              unsigned high_bit_depth);
 void ff_pixblockdsp_init_x86(PixblockDSPContext *c, AVCodecContext *avctx,
                              unsigned high_bit_depth);
+void ff_pixblockdsp_init_mips(PixblockDSPContext *c, AVCodecContext *avctx,
+                              unsigned high_bit_depth);
 
 #endif /* AVCODEC_PIXBLOCKDSP_H */
diff --git a/libavcodec/pixblockdsp_template.c b/libavcodec/pixblockdsp_template.c
deleted file mode 100644
index 71d3cf1..0000000
--- a/libavcodec/pixblockdsp_template.c
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "bit_depth_template.c"
-
-static void FUNCC(get_pixels)(int16_t *restrict block, const uint8_t *_pixels,
-                              int line_size)
-{
-    const pixel *pixels = (const pixel *) _pixels;
-    int i;
-
-    /* read the pixels */
-    for (i = 0; i < 8; i++) {
-        block[0] = pixels[0];
-        block[1] = pixels[1];
-        block[2] = pixels[2];
-        block[3] = pixels[3];
-        block[4] = pixels[4];
-        block[5] = pixels[5];
-        block[6] = pixels[6];
-        block[7] = pixels[7];
-        pixels  += line_size / sizeof(pixel);
-        block   += 8;
-    }
-}
diff --git a/libavcodec/pixels.h b/libavcodec/pixels.h
index d9d2fde..98eacd4 100644
--- a/libavcodec/pixels.h
+++ b/libavcodec/pixels.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/png.c b/libavcodec/png.c
index cd75dc1..ef52b51 100644
--- a/libavcodec/png.c
+++ b/libavcodec/png.c
@@ -2,29 +2,25 @@
  * PNG image format
  * Copyright (c) 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include "avcodec.h"
-#include "bytestream.h"
 #include "png.h"
 
-const uint8_t ff_pngsig[8] = { 137, 80, 78, 71, 13, 10, 26, 10 };
-const uint8_t ff_mngsig[8] = { 138, 77, 78, 71, 13, 10, 26, 10 };
-
 /* Mask to determine which y pixels are valid in a pass */
 const uint8_t ff_png_pass_ymask[NB_PASSES] = {
     0x80, 0x80, 0x08, 0x88, 0x22, 0xaa, 0x55,
@@ -40,11 +36,6 @@ static const uint8_t ff_png_pass_xshift[NB_PASSES] = {
     3, 3, 2, 2, 1, 1, 0
 };
 
-/* Mask to determine which pixels are valid in a pass */
-const uint8_t ff_png_pass_mask[NB_PASSES] = {
-    0x80, 0x08, 0x88, 0x22, 0xaa, 0x55, 0xff
-};
-
 void *ff_png_zalloc(void *opaque, unsigned int items, unsigned int size)
 {
     return av_mallocz_array(items, size);
diff --git a/libavcodec/png.h b/libavcodec/png.h
index b8c72ee..948c2f7 100644
--- a/libavcodec/png.h
+++ b/libavcodec/png.h
@@ -2,20 +2,20 @@
  * PNG image format
  * Copyright (c) 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,15 +49,12 @@
 
 #define NB_PASSES 7
 
-extern const uint8_t ff_pngsig[8];
-extern const uint8_t ff_mngsig[8];
+#define PNGSIG 0x89504e470d0a1a0a
+#define MNGSIG 0x8a4d4e470d0a1a0a
 
 /* Mask to determine which y pixels are valid in a pass */
 extern const uint8_t ff_png_pass_ymask[NB_PASSES];
 
-/* Mask to determine which pixels are valid in a pass */
-extern const uint8_t ff_png_pass_mask[NB_PASSES];
-
 void *ff_png_zalloc(void *opaque, unsigned int items, unsigned int size);
 
 void ff_png_zfree(void *opaque, void *ptr);
diff --git a/libavcodec/png_parser.c b/libavcodec/png_parser.c
index c66caf3..74f2964 100644
--- a/libavcodec/png_parser.c
+++ b/libavcodec/png_parser.c
@@ -2,20 +2,20 @@
  * PNG parser
  * Copyright (c) 2009 Peter Holik
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,20 +24,14 @@
  * PNG parser
  */
 
-#include "libavutil/intreadwrite.h"
-#include "libavutil/common.h"
-
 #include "parser.h"
-
-#define PNG_SIGNATURE UINT64_C(0x89504e470d0a1a0a)
-#define MNG_SIGNATURE UINT64_C(0x8a4d4e470d0a1a0a)
+#include "png.h"
 
 typedef struct PNGParseContext {
     ParseContext pc;
-
-    int chunk_pos;          ///< position inside current chunk
-    uint32_t chunk_length;  ///< length of the current chunk
-    int remaining_size;     ///< remaining size of the current chunk
+    uint32_t chunk_pos;           ///< position inside current chunk
+    uint32_t chunk_length;        ///< length of the current chunk
+    uint32_t remaining_size;      ///< remaining size of the current chunk
 } PNGParseContext;
 
 static int png_parse(AVCodecParserContext *s, AVCodecContext *avctx,
@@ -48,16 +42,15 @@ static int png_parse(AVCodecParserContext *s, AVCodecContext *avctx,
     int next = END_NOT_FOUND;
     int i = 0;
 
+    s->pict_type = AV_PICTURE_TYPE_NONE;
+
     *poutbuf_size = 0;
-    if (buf_size == 0)
-        return 0;
 
     if (!ppc->pc.frame_start_found) {
         uint64_t state64 = ppc->pc.state64;
         for (; i < buf_size; i++) {
             state64 = (state64 << 8) | buf[i];
-            if (state64 == PNG_SIGNATURE ||
-                state64 == MNG_SIGNATURE) {
+            if (state64 == PNGSIG || state64 == MNGSIG) {
                 i++;
                 ppc->pc.frame_start_found = 1;
                 break;
diff --git a/libavcodec/pngdec.c b/libavcodec/pngdec.c
index a6ab665..36275ae 100644
--- a/libavcodec/pngdec.c
+++ b/libavcodec/pngdec.c
@@ -2,47 +2,57 @@
  * PNG image format
  * Copyright (c) 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "libavutil/avstring.h"
+//#define DEBUG
+
+#include "libavutil/avassert.h"
+#include "libavutil/bprint.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/stereo3d.h"
 
 #include "avcodec.h"
 #include "bytestream.h"
 #include "internal.h"
+#include "apng.h"
 #include "png.h"
 #include "pngdsp.h"
-
-/* TODO:
- * - add 2, 4 and 16 bit depth support
- */
+#include "thread.h"
 
 #include <zlib.h>
 
 typedef struct PNGDecContext {
     PNGDSPContext dsp;
+    AVCodecContext *avctx;
 
     GetByteContext gb;
-    AVFrame *prev;
+    ThreadFrame previous_picture;
+    ThreadFrame last_picture;
+    ThreadFrame picture;
 
     int state;
     int width, height;
+    int cur_w, cur_h;
+    int last_w, last_h;
+    int x_offset, y_offset;
+    int last_x_offset, last_y_offset;
+    uint8_t dispose_op, blend_op;
+    uint8_t last_dispose_op;
     int bit_depth;
     int color_type;
     int compression_type;
@@ -51,13 +61,19 @@ typedef struct PNGDecContext {
     int channels;
     int bits_per_pixel;
     int bpp;
+    int has_trns;
+    uint8_t transparent_color_be[6];
 
     uint8_t *image_buf;
     int image_linesize;
     uint32_t palette[256];
     uint8_t *crow_buf;
     uint8_t *last_row;
+    unsigned int last_row_size;
     uint8_t *tmp_row;
+    unsigned int tmp_row_size;
+    uint8_t *buffer;
+    int buffer_size;
     int pass;
     int crow_size; /* compressed row size (include filter type) */
     int row_size; /* decompressed row size */
@@ -66,9 +82,14 @@ typedef struct PNGDecContext {
     z_stream zstream;
 } PNGDecContext;
 
+/* Per-pass mask of x positions (MSB = x & 7 == 0) after which the source advances */
+static const uint8_t png_pass_mask[NB_PASSES] = {
+    0x01, 0x01, 0x11, 0x11, 0x55, 0x55, 0xff,
+};
+
 /* Mask to determine which y pixels can be written in a pass */
 static const uint8_t png_pass_dsp_ymask[NB_PASSES] = {
-    0xff, 0xff, 0x0f, 0xcc, 0x33, 0xff, 0x55,
+    0xff, 0xff, 0x0f, 0xff, 0x33, 0xff, 0x55,
 };
 
 /* Mask to determine which pixels to overwrite while displaying */
@@ -87,40 +108,55 @@ static void png_put_interlaced_row(uint8_t *dst, int width,
     uint8_t *d;
     const uint8_t *s;
 
-    mask     = ff_png_pass_mask[pass];
+    mask     = png_pass_mask[pass];
     dsp_mask = png_pass_dsp_mask[pass];
 
     switch (bits_per_pixel) {
     case 1:
-        /* we must initialize the line to zero before writing to it */
-        if (pass == 0)
-            memset(dst, 0, (width + 7) >> 3);
         src_x = 0;
         for (x = 0; x < width; x++) {
             j = (x & 7);
             if ((dsp_mask << j) & 0x80) {
                 b = (src[src_x >> 3] >> (7 - (src_x & 7))) & 1;
+                dst[x >> 3] &= 0xFF7F>>j;
                 dst[x >> 3] |= b << (7 - j);
             }
             if ((mask << j) & 0x80)
                 src_x++;
         }
         break;
+    case 2:
+        src_x = 0;
+        for (x = 0; x < width; x++) {
+            int j2 = 2 * (x & 3);
+            j = (x & 7);
+            if ((dsp_mask << j) & 0x80) {
+                b = (src[src_x >> 2] >> (6 - 2*(src_x & 3))) & 3;
+                dst[x >> 2] &= 0xFF3F>>j2;
+                dst[x >> 2] |= b << (6 - j2);
+            }
+            if ((mask << j) & 0x80)
+                src_x++;
+        }
+        break;
+    case 4:
+        src_x = 0;
+        for (x = 0; x < width; x++) {
+            int j2 = 4*(x&1);
+            j = (x & 7);
+            if ((dsp_mask << j) & 0x80) {
+                b = (src[src_x >> 1] >> (4 - 4*(src_x & 1))) & 15;
+                dst[x >> 1] &= 0xFF0F>>j2;
+                dst[x >> 1] |= b << (4 - j2);
+            }
+            if ((mask << j) & 0x80)
+                src_x++;
+        }
+        break;
     default:
         bpp = bits_per_pixel >> 3;
         d   = dst;
         s   = src;
-        if (color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
-            for (x = 0; x < width; x++) {
-                j = x & 7;
-                if ((dsp_mask << j) & 0x80) {
-                    *(uint32_t *)d = (s[3] << 24) | (s[0] << 16) | (s[1] << 8) | s[2];
-                }
-                d += bpp;
-                if ((mask << j) & 0x80)
-                    s += bpp;
-            }
-        } else {
             for (x = 0; x < width; x++) {
                 j = x & 7;
                 if ((dsp_mask << j) & 0x80) {
@@ -130,7 +166,6 @@ static void png_put_interlaced_row(uint8_t *dst, int width,
                 if ((mask << j) & 0x80)
                     s += bpp;
             }
-        }
         break;
     }
 }
@@ -172,7 +207,7 @@ void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top,
             b = dst[2];                                                       \
         if (bpp >= 4)                                                         \
             a = dst[3];                                                       \
-        for (; i < size; i += bpp) {                                          \
+        for (; i <= size - bpp; i += bpp) {                                   \
             dst[i + 0] = r = op(r, src[i + 0], last[i + 0]);                  \
             if (bpp == 1)                                                     \
                 continue;                                                     \
@@ -195,12 +230,9 @@ void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top,
         UNROLL1(3, op)                                                        \
     } else if (bpp == 4) {                                                    \
         UNROLL1(4, op)                                                        \
-    } else {                                                                  \
-        for (; i < size; i += bpp) {                                          \
-            int j;                                                            \
-            for (j = 0; j < bpp; j++)                                         \
-                dst[i + j] = op(dst[i + j - bpp], src[i + j], last[i + j]);   \
-        }                                                                     \
+    }                                                                         \
+    for (; i < size; i++) {                                                   \
+        dst[i] = op(dst[i - bpp], src[i], last[i]);                           \
     }
 
 /* NOTE: 'dst' can be equal to 'last' */
@@ -219,12 +251,12 @@ static void png_filter_row(PNGDSPContext *dsp, uint8_t *dst, int filter_type,
         if (bpp == 4) {
             p = *(int *)dst;
             for (; i < size; i += bpp) {
-                int s = *(int *)(src + i);
+                unsigned s = *(int *)(src + i);
                 p = ((s & 0x7f7f7f7f) + (p & 0x7f7f7f7f)) ^ ((s ^ p) & 0x80808080);
                 *(int *)(dst + i) = p;
             }
         } else {
-#define OP_SUB(x, s, l) x + s
+#define OP_SUB(x, s, l) ((x) + (s))
             UNROLL_FILTER(OP_SUB);
         }
         break;
@@ -236,7 +268,7 @@ static void png_filter_row(PNGDSPContext *dsp, uint8_t *dst, int filter_type,
             p      = (last[i] >> 1);
             dst[i] = p + src[i];
         }
-#define OP_AVG(x, s, l) (((x + l) >> 1) + s) & 0xff
+#define OP_AVG(x, s, l) (((((x) + (l)) >> 1) + (s)) & 0xff)
         UNROLL_FILTER(OP_AVG);
         break;
     case PNG_FILTER_VALUE_PAETH:
@@ -247,55 +279,33 @@ static void png_filter_row(PNGDSPContext *dsp, uint8_t *dst, int filter_type,
         if (bpp > 2 && size > 4) {
             /* would write off the end of the array if we let it process
              * the last pixel with bpp=3 */
-            int w = bpp == 4 ? size : size - 3;
-            dsp->add_paeth_prediction(dst + i, src + i, last + i, w - i, bpp);
-            i = w;
+            int w = (bpp & 3) ? size - 3 : size;
+
+            if (w > i) {
+                dsp->add_paeth_prediction(dst + i, src + i, last + i, size - i, bpp);
+                i = w;
+            }
         }
         ff_add_png_paeth_prediction(dst + i, src + i, last + i, size - i, bpp);
         break;
     }
 }
 
-static av_always_inline void convert_to_rgb32_loco(uint8_t *dst,
-                                                   const uint8_t *src,
-                                                   int width, int loco)
-{
-    int j;
-    unsigned int r, g, b, a;
-
-    for (j = 0; j < width; j++) {
-        r = src[0];
-        g = src[1];
-        b = src[2];
-        a = src[3];
-        if (loco) {
-            r = (r + g) & 0xff;
-            b = (b + g) & 0xff;
-        }
-        *(uint32_t *) dst = (a << 24) | (r << 16) | (g << 8) | b;
-        dst += 4;
-        src += 4;
-    }
+/* Inverse reversible colour transform, historically called "deloco" in FFmpeg:
+ * in place, adds sample 1 (g) to samples 0 and 2 of every (3 + alpha) samples */
+#define YUV2RGB(NAME, TYPE) \
+static void deloco_ ## NAME(TYPE *dst, int size, int alpha) \
+{ \
+    int i; \
+    for (i = 0; i < size; i += 3 + alpha) { \
+        int g = dst [i + 1]; \
+        dst[i + 0] += g; \
+        dst[i + 2] += g; \
+    } \
 }
 
-static void convert_to_rgb32(uint8_t *dst, const uint8_t *src,
-                             int width, int loco)
-{
-    if (loco)
-        convert_to_rgb32_loco(dst, src, width, 1);
-    else
-        convert_to_rgb32_loco(dst, src, width, 0);
-}
-
-static void deloco_rgb24(uint8_t *dst, int size)
-{
-    int i;
-    for (i = 0; i < size; i += 3) {
-        int g = dst[i + 1];
-        dst[i + 0] += g;
-        dst[i + 2] += g;
-    }
-}
+YUV2RGB(rgb8, uint8_t)
+YUV2RGB(rgb16, uint16_t)
 
 /* process exactly one decompressed row */
 static void png_handle_row(PNGDecContext *s)
@@ -304,39 +314,41 @@ static void png_handle_row(PNGDecContext *s)
     int got_line;
 
     if (!s->interlace_type) {
-        ptr = s->image_buf + s->image_linesize * s->y;
-        /* need to swap bytes correctly for RGB_ALPHA */
-        if (s->color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
-            png_filter_row(&s->dsp, s->tmp_row, s->crow_buf[0], s->crow_buf + 1,
-                           s->last_row, s->row_size, s->bpp);
-            convert_to_rgb32(ptr, s->tmp_row, s->width,
-                             s->filter_type == PNG_FILTER_TYPE_LOCO);
-            FFSWAP(uint8_t *, s->last_row, s->tmp_row);
-        } else {
-            /* in normal case, we avoid one copy */
-            if (s->y == 0)
-                last_row = s->last_row;
-            else
-                last_row = ptr - s->image_linesize;
+        ptr = s->image_buf + s->image_linesize * (s->y + s->y_offset) + s->x_offset * s->bpp;
+        if (s->y == 0)
+            last_row = s->last_row;
+        else
+            last_row = ptr - s->image_linesize;
 
-            png_filter_row(&s->dsp, ptr, s->crow_buf[0], s->crow_buf + 1,
-                           last_row, s->row_size, s->bpp);
-        }
+        png_filter_row(&s->dsp, ptr, s->crow_buf[0], s->crow_buf + 1,
+                       last_row, s->row_size, s->bpp);
         /* loco lags by 1 row so that it doesn't interfere with top prediction */
-        if (s->filter_type == PNG_FILTER_TYPE_LOCO &&
-            s->color_type == PNG_COLOR_TYPE_RGB && s->y > 0)
-            deloco_rgb24(ptr - s->image_linesize, s->row_size);
+        if (s->filter_type == PNG_FILTER_TYPE_LOCO && s->y > 0) {
+            if (s->bit_depth == 16) {
+                deloco_rgb16((uint16_t *)(ptr - s->image_linesize), s->row_size / 2,
+                             s->color_type == PNG_COLOR_TYPE_RGB_ALPHA);
+            } else {
+                deloco_rgb8(ptr - s->image_linesize, s->row_size,
+                            s->color_type == PNG_COLOR_TYPE_RGB_ALPHA);
+            }
+        }
         s->y++;
-        if (s->y == s->height) {
+        if (s->y == s->cur_h) {
             s->state |= PNG_ALLIMAGE;
-            if (s->filter_type == PNG_FILTER_TYPE_LOCO &&
-                s->color_type == PNG_COLOR_TYPE_RGB)
-                deloco_rgb24(ptr, s->row_size);
+            if (s->filter_type == PNG_FILTER_TYPE_LOCO) {
+                if (s->bit_depth == 16) {
+                    deloco_rgb16((uint16_t *)ptr, s->row_size / 2,
+                                 s->color_type == PNG_COLOR_TYPE_RGB_ALPHA);
+                } else {
+                    deloco_rgb8(ptr, s->row_size,
+                                s->color_type == PNG_COLOR_TYPE_RGB_ALPHA);
+                }
+            }
         }
     } else {
         got_line = 0;
         for (;;) {
-            ptr = s->image_buf + s->image_linesize * s->y;
+            ptr = s->image_buf + s->image_linesize * (s->y + s->y_offset) + s->x_offset * s->bpp;
             if ((ff_png_pass_ymask[s->pass] << (s->y & 7)) & 0x80) {
                 /* if we already read one row, it is time to stop to
                  * wait for the next one */
@@ -345,15 +357,16 @@ static void png_handle_row(PNGDecContext *s)
                 png_filter_row(&s->dsp, s->tmp_row, s->crow_buf[0], s->crow_buf + 1,
                                s->last_row, s->pass_row_size, s->bpp);
                 FFSWAP(uint8_t *, s->last_row, s->tmp_row);
+                FFSWAP(unsigned int, s->last_row_size, s->tmp_row_size);
                 got_line = 1;
             }
             if ((png_pass_dsp_ymask[s->pass] << (s->y & 7)) & 0x80) {
-                /* NOTE: RGB32 is handled directly in png_put_interlaced_row */
-                png_put_interlaced_row(ptr, s->width, s->bits_per_pixel, s->pass,
+                png_put_interlaced_row(ptr, s->cur_w, s->bits_per_pixel, s->pass,
                                        s->color_type, s->last_row);
             }
             s->y++;
-            if (s->y == s->height) {
+            if (s->y == s->cur_h) {
+                memset(s->last_row, 0, s->row_size);
                 for (;;) {
                     if (s->pass == NB_PASSES - 1) {
                         s->state |= PNG_ALLIMAGE;
@@ -363,7 +376,7 @@ static void png_handle_row(PNGDecContext *s)
                         s->y = 0;
                         s->pass_row_size = ff_png_pass_row_size(s->pass,
                                                                 s->bits_per_pixel,
-                                                                s->width);
+                                                                s->cur_w);
                         s->crow_size = s->pass_row_size + 1;
                         if (s->pass_row_size != 0)
                             break;
@@ -380,14 +393,15 @@ static int png_decode_idat(PNGDecContext *s, int length)
 {
     int ret;
     s->zstream.avail_in = FFMIN(length, bytestream2_get_bytes_left(&s->gb));
-    s->zstream.next_in  = s->gb.buffer;
+    s->zstream.next_in  = (unsigned char *)s->gb.buffer;
     bytestream2_skip(&s->gb, length);
 
     /* decode one line if possible */
     while (s->zstream.avail_in > 0) {
         ret = inflate(&s->zstream, Z_PARTIAL_FLUSH);
         if (ret != Z_OK && ret != Z_STREAM_END) {
-            return -1;
+            av_log(s->avctx, AV_LOG_ERROR, "inflate returned error %d\n", ret);
+            return AVERROR_EXTERNAL;
         }
         if (s->zstream.avail_out == 0) {
             if (!(s->state & PNG_ALLIMAGE)) {
@@ -405,216 +419,785 @@ static int png_decode_idat(PNGDecContext *s, int length)
     return 0;
 }
 
-static int decode_frame(AVCodecContext *avctx,
-                        void *data, int *got_frame,
-                        AVPacket *avpkt)
+static int decode_zbuf(AVBPrint *bp, const uint8_t *data,
+                       const uint8_t *data_end) /* inflate the zlib stream in [data, data_end) into bp; returns 0 or a negative AVERROR */
 {
-    PNGDecContext *const s = avctx->priv_data;
-    const uint8_t *buf     = avpkt->data;
-    int buf_size           = avpkt->size;
-    AVFrame *p             = data;
-    uint8_t *crow_buf_base = NULL;
-    uint32_t tag, length;
+    z_stream zstream;
+    unsigned char *buf;
+    unsigned buf_size;
     int ret;
 
-    /* check signature */
-    if (buf_size < 8) {
-        av_log(avctx, AV_LOG_ERROR, "Not enough data %d\n",
-               buf_size);
+    zstream.zalloc = ff_png_zalloc;
+    zstream.zfree  = ff_png_zfree;
+    zstream.opaque = NULL;
+    if (inflateInit(&zstream) != Z_OK)
+        return AVERROR_EXTERNAL; /* note: bp has not been initialized on this path */
+    zstream.next_in  = (unsigned char *)data; /* cast drops const for the zlib field */
+    zstream.avail_in = data_end - data;
+    av_bprint_init(bp, 0, -1);
+
+    while (zstream.avail_in > 0) {
+        av_bprint_get_buffer(bp, 1, &buf, &buf_size); /* obtain writable space from bp (1 byte requested as the minimum) */
+        if (!buf_size) {
+            ret = AVERROR(ENOMEM); /* no room could be obtained */
+            goto fail;
+        }
+        zstream.next_out  = buf;
+        zstream.avail_out = buf_size;
+        ret = inflate(&zstream, Z_PARTIAL_FLUSH);
+        if (ret != Z_OK && ret != Z_STREAM_END) {
+            ret = AVERROR_EXTERNAL;
+            goto fail;
+        }
+        bp->len += zstream.next_out - buf; /* account for the bytes inflate() just wrote */
+        if (ret == Z_STREAM_END)
+            break;
+    }
+    inflateEnd(&zstream);
+    bp->str[bp->len] = 0; /* NUL-terminate the inflated data */
+    return 0;
+
+fail:
+    inflateEnd(&zstream);
+    av_bprint_finalize(bp, NULL); /* drop whatever was accumulated; nothing is handed to the caller */
+    return ret;
+}
+
+static uint8_t *iso88591_to_utf8(const uint8_t *in, size_t size_in)
+{ /* ISO-8859-1 -> newly allocated, NUL-terminated UTF-8 string; NULL on size overflow or allocation failure */
+    size_t nb_wide = 0, pos, w = 0; /* nb_wide: input bytes that need two output bytes */
+    uint8_t *utf8;
+
+    for (pos = 0; pos < size_in; pos++)
+        nb_wide += in[pos] >> 7; /* top bit set <=> byte >= 0x80 */
+    if (size_in == SIZE_MAX || nb_wide > SIZE_MAX - size_in - 1)
+        return NULL; /* size_in + nb_wide + 1 would not fit in size_t */
+    utf8 = av_malloc(size_in + nb_wide + 1);
+    if (!utf8)
+        return NULL;
+    for (pos = 0; pos < size_in; pos++) {
+        if (in[pos] < 0x80) {
+            utf8[w++] = in[pos]; /* 7-bit values are copied unchanged */
+        } else {
+            utf8[w++] = 0xC0 | (in[pos] >> 6);   /* lead byte carries the top two bits */
+            utf8[w++] = 0x80 | (in[pos] & 0x3F); /* continuation byte carries the low six bits */
+        }
+    }
+    utf8[w] = 0;
+    return utf8;
+}
+
+static int decode_text_chunk(PNGDecContext *s, uint32_t length, int compressed,
+                             AVDictionary **dict) /* parse a keyword/text chunk body at s->gb.buffer and store it in *dict; 0 or negative AVERROR */
+{
+    int ret, method;
+    const uint8_t *data        = s->gb.buffer; /* chunk payload; the reader position itself is not advanced here */
+    const uint8_t *data_end    = data + length;
+    const uint8_t *keyword     = data;
+    const uint8_t *keyword_end = memchr(keyword, 0, data_end - keyword); /* keyword is terminated by a NUL byte */
+    uint8_t *kw_utf8 = NULL, *text, *txt_utf8 = NULL;
+    unsigned text_len;
+    AVBPrint bp;
+
+    if (!keyword_end)
         return AVERROR_INVALIDDATA;
+    data = keyword_end + 1; /* step past the NUL separator */
+
+    if (compressed) {
+        if (data == data_end)
+            return AVERROR_INVALIDDATA; /* no room for the compression-method byte */
+        method = *(data++);
+        if (method)
+            return AVERROR_INVALIDDATA; /* only method 0 is accepted */
+        if ((ret = decode_zbuf(&bp, data, data_end)) < 0)
+            return ret;
+        text_len = bp.len;
+        av_bprint_finalize(&bp, (char **)&text); /* take the inflated text out of bp as a heap string */
+        if (!text)
+            return AVERROR(ENOMEM);
+    } else {
+        text = (uint8_t *)data; /* uncompressed: text aliases the chunk payload */
+        text_len = data_end - text;
     }
-    if (memcmp(buf, ff_pngsig, 8) != 0 &&
-        memcmp(buf, ff_mngsig, 8) != 0) {
-        char signature[5 * 8 + 1] = { 0 };
-        int i;
-        for (i = 0; i < 8; i++) {
-            av_strlcatf(signature + i * 5, sizeof(signature) - i * 5,
-                        " 0x%02x", buf[i]);
+
+    kw_utf8  = iso88591_to_utf8(keyword, keyword_end - keyword);
+    txt_utf8 = iso88591_to_utf8(text, text_len);
+    if (text != data) /* true only when text was heap-allocated by the compressed branch */
+        av_free(text);
+    if (!(kw_utf8 && txt_utf8)) {
+        av_free(kw_utf8);
+        av_free(txt_utf8);
+        return AVERROR(ENOMEM);
+    }
+
+    av_dict_set(dict, kw_utf8, txt_utf8, /* both strings are handed over rather than copied (DONT_STRDUP flags) */
+                AV_DICT_DONT_STRDUP_KEY | AV_DICT_DONT_STRDUP_VAL);
+    return 0;
+}
+
+static int decode_ihdr_chunk(AVCodecContext *avctx, PNGDecContext *s,
+                             uint32_t length)
+{ /* Parse the IHDR chunk into s (dimensions, depth, color/compression/filter/interlace types); 0 or AVERROR_INVALIDDATA. */
+    if (length != 13) /* 4 + 4 + 5 * 1 bytes are read below */
+        return AVERROR_INVALIDDATA;
+
+    if (s->state & PNG_IDAT) {
+        av_log(avctx, AV_LOG_ERROR, "IHDR after IDAT\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (s->state & PNG_IHDR) {
+        av_log(avctx, AV_LOG_ERROR, "Multiple IHDR\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->width  = s->cur_w = bytestream2_get_be32(&s->gb); /* cur_w/cur_h start out equal to the full image size */
+    s->height = s->cur_h = bytestream2_get_be32(&s->gb);
+    if (av_image_check_size(s->width, s->height, 0, avctx)) {
+        s->cur_w = s->cur_h = s->width = s->height = 0; /* do not leave rejected dimensions in the context */
+        av_log(avctx, AV_LOG_ERROR, "Invalid image size\n");
+        return AVERROR_INVALIDDATA;
+    }
+    s->bit_depth        = bytestream2_get_byte(&s->gb);
+    s->color_type       = bytestream2_get_byte(&s->gb);
+    s->compression_type = bytestream2_get_byte(&s->gb);
+    s->filter_type      = bytestream2_get_byte(&s->gb);
+    s->interlace_type   = bytestream2_get_byte(&s->gb);
+    bytestream2_skip(&s->gb, 4); /* crc (skipped, not checked here) */
+    s->state |= PNG_IHDR;
+    if (avctx->debug & FF_DEBUG_PICT_INFO)
+        av_log(avctx, AV_LOG_DEBUG, "width=%d height=%d depth=%d color_type=%d "
+                "compression_type=%d filter_type=%d interlace_type=%d\n",
+                s->width, s->height, s->bit_depth, s->color_type,
+                s->compression_type, s->filter_type, s->interlace_type);
+
+    return 0;
+}
+
+static int decode_phys_chunk(AVCodecContext *avctx, PNGDecContext *s)
+{ /* Parse a pHYs chunk into avctx->sample_aspect_ratio; 0 or AVERROR_INVALIDDATA when it follows IDAT. */
+    if (s->state & PNG_IDAT) {
+        av_log(avctx, AV_LOG_ERROR, "pHYs after IDAT\n");
+        return AVERROR_INVALIDDATA;
+    }
+    avctx->sample_aspect_ratio.num = bytestream2_get_be32(&s->gb); /* first 32-bit field becomes the numerator */
+    avctx->sample_aspect_ratio.den = bytestream2_get_be32(&s->gb); /* second 32-bit field becomes the denominator */
+    if (avctx->sample_aspect_ratio.num < 0 || avctx->sample_aspect_ratio.den < 0)
+        avctx->sample_aspect_ratio = (AVRational){ 0, 1 }; /* values that turn negative as int are replaced by 0/1 */
+    bytestream2_skip(&s->gb, 1); /* unit specifier */
+    bytestream2_skip(&s->gb, 4); /* crc */
+
+    return 0;
+}
+
+static int decode_idat_chunk(AVCodecContext *avctx, PNGDecContext *s,
+                             uint32_t length, AVFrame *p)
+{ /* Handle one IDAT-style data chunk: set up format and buffers on the first one, then inflate `length` bytes; 0 or negative AVERROR. */
+    int ret;
+    size_t byte_depth = s->bit_depth > 8 ? 2 : 1; /* bytes per sample, used to adjust bpp for tRNS alpha */
+
+    if (!(s->state & PNG_IHDR)) {
+        av_log(avctx, AV_LOG_ERROR, "IDAT without IHDR\n");
+        return AVERROR_INVALIDDATA;
+    }
+    if (!(s->state & PNG_IDAT)) { /* first data chunk of this image: one-time setup */
+        /* init image info */
+        avctx->width  = s->width;
+        avctx->height = s->height;
+
+        s->channels       = ff_png_get_nb_channels(s->color_type);
+        s->bits_per_pixel = s->bit_depth * s->channels;
+        s->bpp            = (s->bits_per_pixel + 7) >> 3; /* bytes per pixel, rounded up */
+        s->row_size       = (s->cur_w * s->bits_per_pixel + 7) >> 3; /* bytes per row of the current (sub)frame, rounded up */
+
+        if ((s->bit_depth == 2 || s->bit_depth == 4 || s->bit_depth == 8) &&
+                s->color_type == PNG_COLOR_TYPE_RGB) {
+            avctx->pix_fmt = AV_PIX_FMT_RGB24;
+        } else if ((s->bit_depth == 2 || s->bit_depth == 4 || s->bit_depth == 8) &&
+                s->color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
+            avctx->pix_fmt = AV_PIX_FMT_RGBA;
+        } else if ((s->bit_depth == 2 || s->bit_depth == 4 || s->bit_depth == 8) &&
+                s->color_type == PNG_COLOR_TYPE_GRAY) {
+            avctx->pix_fmt = AV_PIX_FMT_GRAY8;
+        } else if (s->bit_depth == 16 &&
+                s->color_type == PNG_COLOR_TYPE_GRAY) {
+            avctx->pix_fmt = AV_PIX_FMT_GRAY16BE;
+        } else if (s->bit_depth == 16 &&
+                s->color_type == PNG_COLOR_TYPE_RGB) {
+            avctx->pix_fmt = AV_PIX_FMT_RGB48BE;
+        } else if (s->bit_depth == 16 &&
+                s->color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
+            avctx->pix_fmt = AV_PIX_FMT_RGBA64BE;
+        } else if ((s->bits_per_pixel == 1 || s->bits_per_pixel == 2 || s->bits_per_pixel == 4 || s->bits_per_pixel == 8) &&
+                s->color_type == PNG_COLOR_TYPE_PALETTE) {
+            avctx->pix_fmt = AV_PIX_FMT_PAL8;
+        } else if (s->bit_depth == 1 && s->bits_per_pixel == 1 && avctx->codec_id != AV_CODEC_ID_APNG) {
+            avctx->pix_fmt = AV_PIX_FMT_MONOBLACK;
+        } else if (s->bit_depth == 8 &&
+                s->color_type == PNG_COLOR_TYPE_GRAY_ALPHA) {
+            avctx->pix_fmt = AV_PIX_FMT_YA8;
+        } else if (s->bit_depth == 16 &&
+                s->color_type == PNG_COLOR_TYPE_GRAY_ALPHA) {
+            avctx->pix_fmt = AV_PIX_FMT_YA16BE;
+        } else {
+            av_log(avctx, AV_LOG_ERROR, "unsupported bit depth %d "
+                    "and color type %d\n",
+                    s->bit_depth, s->color_type);
+            return AVERROR_INVALIDDATA;
         }
-        av_log(avctx, AV_LOG_ERROR, "Invalid PNG signature %s\n",
-               signature);
+
+        if (s->has_trns && s->color_type != PNG_COLOR_TYPE_PALETTE) { /* tRNS on a non-palette image: switch to the format with alpha */
+            switch (avctx->pix_fmt) {
+            case AV_PIX_FMT_RGB24:
+                avctx->pix_fmt = AV_PIX_FMT_RGBA;
+                break;
+
+            case AV_PIX_FMT_RGB48BE:
+                avctx->pix_fmt = AV_PIX_FMT_RGBA64BE;
+                break;
+
+            case AV_PIX_FMT_GRAY8:
+                avctx->pix_fmt = AV_PIX_FMT_YA8;
+                break;
+
+            case AV_PIX_FMT_GRAY16BE:
+                avctx->pix_fmt = AV_PIX_FMT_YA16BE;
+                break;
+
+            default:
+                avpriv_request_sample(avctx, "bit depth %d "
+                        "and color type %d with TRNS",
+                        s->bit_depth, s->color_type);
+                return AVERROR_INVALIDDATA;
+            }
+
+            s->bpp += byte_depth; /* output pixels carry one extra (alpha) sample */
+        }
+
+        if ((ret = ff_thread_get_buffer(avctx, &s->picture, AV_GET_BUFFER_FLAG_REF)) < 0)
+            return ret;
+        if (avctx->codec_id == AV_CODEC_ID_APNG && s->last_dispose_op != APNG_DISPOSE_OP_PREVIOUS) {
+            ff_thread_release_buffer(avctx, &s->previous_picture);
+            if ((ret = ff_thread_get_buffer(avctx, &s->previous_picture, AV_GET_BUFFER_FLAG_REF)) < 0)
+                return ret;
+        }
+        ff_thread_finish_setup(avctx);
+
+        p->pict_type        = AV_PICTURE_TYPE_I;
+        p->key_frame        = 1;
+        p->interlaced_frame = !!s->interlace_type;
+
+        /* compute the compressed row size */
+        if (!s->interlace_type) {
+            s->crow_size = s->row_size + 1; /* + 1 for the byte that precedes each row */
+        } else {
+            s->pass          = 0;
+            s->pass_row_size = ff_png_pass_row_size(s->pass,
+                    s->bits_per_pixel,
+                    s->cur_w);
+            s->crow_size = s->pass_row_size + 1;
+        }
+        ff_dlog(avctx, "row_size=%d crow_size =%d\n",
+                s->row_size, s->crow_size);
+        s->image_buf      = p->data[0];
+        s->image_linesize = p->linesize[0];
+        /* copy the palette if needed */
+        if (avctx->pix_fmt == AV_PIX_FMT_PAL8)
+            memcpy(p->data[1], s->palette, 256 * sizeof(uint32_t));
+        /* empty row is used if differencing to the first row */
+        av_fast_padded_mallocz(&s->last_row, &s->last_row_size, s->row_size);
+        if (!s->last_row)
+            return AVERROR(ENOMEM); /* allocation failure, reported like the s->buffer case below */
+        if (s->interlace_type ||
+                s->color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
+            av_fast_padded_malloc(&s->tmp_row, &s->tmp_row_size, s->row_size);
+            if (!s->tmp_row)
+                return AVERROR(ENOMEM); /* allocation failure, not a bitstream error */
+        }
+        /* compressed row */
+        av_fast_padded_malloc(&s->buffer, &s->buffer_size, s->row_size + 16);
+        if (!s->buffer)
+            return AVERROR(ENOMEM);
+
+        /* we want crow_buf+1 to be 16-byte aligned */
+        s->crow_buf          = s->buffer + 15;
+        s->zstream.avail_out = s->crow_size;
+        s->zstream.next_out  = s->crow_buf;
+    }
+
+    s->state |= PNG_IDAT;
+
+    /* set image to non-transparent bpp while decompressing */
+    if (s->has_trns && s->color_type != PNG_COLOR_TYPE_PALETTE)
+        s->bpp -= byte_depth;
+
+    ret = png_decode_idat(s, length);
+
+    if (s->has_trns && s->color_type != PNG_COLOR_TYPE_PALETTE)
+        s->bpp += byte_depth; /* restore the alpha-inclusive bpp even when decoding failed */
+
+    if (ret < 0)
+        return ret;
+
+    bytestream2_skip(&s->gb, 4); /* crc */
+
+    return 0;
+}
+
+static int decode_plte_chunk(AVCodecContext *avctx, PNGDecContext *s,
+                             uint32_t length)
+{ /* Read a PLTE chunk of `length` bytes (RGB triplets) into s->palette; 0 or AVERROR_INVALIDDATA. */
+    int n, i, r, g, b;
+
+    if ((length % 3) != 0 || length > 256 * 3) /* whole triplets only, at most 256 entries */
         return AVERROR_INVALIDDATA;
+    /* read the palette */
+    n = length / 3;
+    for (i = 0; i < n; i++) {
+        r = bytestream2_get_byte(&s->gb);
+        g = bytestream2_get_byte(&s->gb);
+        b = bytestream2_get_byte(&s->gb);
+        s->palette[i] = (0xFFU << 24) | (r << 16) | (g << 8) | b; /* packed as 0xAARRGGBB with alpha forced to 0xFF */
     }
+    for (; i < 256; i++)
+        s->palette[i] = (0xFFU << 24); /* entries not present in the chunk: alpha 0xFF, color 0 */
+    s->state |= PNG_PLTE;
+    bytestream2_skip(&s->gb, 4);     /* crc */
 
-    bytestream2_init(&s->gb, buf + 8, buf_size - 8);
-    s->y = s->state = 0;
+    return 0;
+}
+
+static int decode_trns_chunk(AVCodecContext *avctx, PNGDecContext *s,
+                             uint32_t length)
+{ /* Parse a tRNS chunk: per-entry palette alpha, or the transparent color for gray/RGB; 0 or AVERROR_INVALIDDATA. */
+    int v, i;
+
+    if (s->color_type == PNG_COLOR_TYPE_PALETTE) {
+        if (length > 256 || !(s->state & PNG_PLTE)) /* needs a previously read palette to patch */
+            return AVERROR_INVALIDDATA;
+
+        for (i = 0; i < length; i++) {
+            v = bytestream2_get_byte(&s->gb);
+            s->palette[i] = (s->palette[i] & 0x00ffffff) | ((unsigned)v << 24); /* unsigned shift: v << 24 on int overflows for v >= 128 */
+        }
+    } else if (s->color_type == PNG_COLOR_TYPE_GRAY || s->color_type == PNG_COLOR_TYPE_RGB) {
+        if ((s->color_type == PNG_COLOR_TYPE_GRAY && length != 2) ||
+            (s->color_type == PNG_COLOR_TYPE_RGB && length != 6)) /* one 16-bit value per channel */
+            return AVERROR_INVALIDDATA;
+
+        for (i = 0; i < length / 2; i++) {
+            /* only use the least significant bits */
+            v = av_mod_uintp2(bytestream2_get_be16(&s->gb), s->bit_depth);
+
+            if (s->bit_depth > 8)
+                AV_WB16(&s->transparent_color_be[2 * i], v); /* two bytes per channel, big-endian */
+            else
+                s->transparent_color_be[i] = v; /* one byte per channel */
+        }
+    } else {
+        return AVERROR_INVALIDDATA; /* other color types are rejected */
+    }
+
+    bytestream2_skip(&s->gb, 4); /* crc */
+    s->has_trns = 1;
+
+    return 0;
+}
+
+static void handle_small_bpp(PNGDecContext *s, AVFrame *p)
+{ /* Expand packed 1-, 2- or 4-bit pixels of p->data[0] in place to one byte per pixel, row by row. */
+    if (s->bits_per_pixel == 1 && s->color_type == PNG_COLOR_TYPE_PALETTE) { /* 1 bit per pixel, palette only: each bit becomes a byte 0/1 */
+        int i, j, k;
+        uint8_t *pd = p->data[0];
+        for (j = 0; j < s->height; j++) {
+            i = s->width / 8; /* number of completely filled packed bytes */
+            for (k = 7; k >= 1; k--) /* partial trailing byte first, highest pixel first */
+                if ((s->width&7) >= k)
+                    pd[8*i + k - 1] = (pd[i]>>8-k) & 1; /* parses as pd[i] >> (8 - k) */
+            for (i--; i >= 0; i--) { /* backwards, so each packed byte is read before its slot is overwritten */
+                pd[8*i + 7]=  pd[i]     & 1;
+                pd[8*i + 6]= (pd[i]>>1) & 1;
+                pd[8*i + 5]= (pd[i]>>2) & 1;
+                pd[8*i + 4]= (pd[i]>>3) & 1;
+                pd[8*i + 3]= (pd[i]>>4) & 1;
+                pd[8*i + 2]= (pd[i]>>5) & 1;
+                pd[8*i + 1]= (pd[i]>>6) & 1;
+                pd[8*i + 0]=  pd[i]>>7; /* written last: for i == 0 this overwrites pd[i] itself */
+            }
+            pd += s->image_linesize;
+        }
+    } else if (s->bits_per_pixel == 2) { /* 2 bits per pixel: four pixels per packed byte */
+        int i, j;
+        uint8_t *pd = p->data[0];
+        for (j = 0; j < s->height; j++) {
+            i = s->width / 4;
+            if (s->color_type == PNG_COLOR_TYPE_PALETTE) { /* palette indices are kept as 0..3 */
+                if ((s->width&3) >= 3) pd[4*i + 2]= (pd[i] >> 2) & 3;
+                if ((s->width&3) >= 2) pd[4*i + 1]= (pd[i] >> 4) & 3;
+                if ((s->width&3) >= 1) pd[4*i + 0]=  pd[i] >> 6;
+                for (i--; i >= 0; i--) {
+                    pd[4*i + 3]=  pd[i]     & 3;
+                    pd[4*i + 2]= (pd[i]>>2) & 3;
+                    pd[4*i + 1]= (pd[i]>>4) & 3;
+                    pd[4*i + 0]=  pd[i]>>6;
+                }
+            } else { /* non-palette samples are scaled: 0..3 times 0x55 gives 0..255 */
+                if ((s->width&3) >= 3) pd[4*i + 2]= ((pd[i]>>2) & 3)*0x55;
+                if ((s->width&3) >= 2) pd[4*i + 1]= ((pd[i]>>4) & 3)*0x55;
+                if ((s->width&3) >= 1) pd[4*i + 0]= ( pd[i]>>6     )*0x55;
+                for (i--; i >= 0; i--) {
+                    pd[4*i + 3]= ( pd[i]     & 3)*0x55;
+                    pd[4*i + 2]= ((pd[i]>>2) & 3)*0x55;
+                    pd[4*i + 1]= ((pd[i]>>4) & 3)*0x55;
+                    pd[4*i + 0]= ( pd[i]>>6     )*0x55;
+                }
+            }
+            pd += s->image_linesize;
+        }
+    } else if (s->bits_per_pixel == 4) { /* 4 bits per pixel: two pixels per packed byte */
+        int i, j;
+        uint8_t *pd = p->data[0];
+        for (j = 0; j < s->height; j++) {
+            i = s->width/2;
+            if (s->color_type == PNG_COLOR_TYPE_PALETTE) { /* palette indices are kept as 0..15 */
+                if (s->width&1) pd[2*i+0]= pd[i]>>4;
+                for (i--; i >= 0; i--) {
+                    pd[2*i + 1] = pd[i] & 15;
+                    pd[2*i + 0] = pd[i] >> 4;
+                }
+            } else { /* non-palette samples are scaled: 0..15 times 0x11 gives 0..255 */
+                if (s->width & 1) pd[2*i + 0]= (pd[i] >> 4) * 0x11;
+                for (i--; i >= 0; i--) {
+                    pd[2*i + 1] = (pd[i] & 15) * 0x11;
+                    pd[2*i + 0] = (pd[i] >> 4) * 0x11;
+                }
+            }
+            pd += s->image_linesize;
+        }
+    }
+}
+
+static int decode_fctl_chunk(AVCodecContext *avctx, PNGDecContext *s,
+                             uint32_t length)
+{ /* Parse an APNG fcTL chunk: save the current frame geometry as "last", then load and validate the new one; 0 or AVERROR_INVALIDDATA. */
+    uint32_t sequence_number;
+    int cur_w, cur_h, x_offset, y_offset, dispose_op, blend_op;
+
+    if (length != 26) /* 5 * 4 + 4 + 2 * 1 bytes are consumed below */
+        return AVERROR_INVALIDDATA;
+
+    if (!(s->state & PNG_IHDR)) {
+        av_log(avctx, AV_LOG_ERROR, "fctl before IHDR\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->last_w = s->cur_w; /* remember the geometry/disposal of the frame being replaced */
+    s->last_h = s->cur_h;
+    s->last_x_offset = s->x_offset;
+    s->last_y_offset = s->y_offset;
+    s->last_dispose_op = s->dispose_op;
+
+    sequence_number = bytestream2_get_be32(&s->gb);
+    cur_w           = bytestream2_get_be32(&s->gb); /* stored in int: values above INT_MAX turn negative and fail the checks below */
+    cur_h           = bytestream2_get_be32(&s->gb);
+    x_offset        = bytestream2_get_be32(&s->gb);
+    y_offset        = bytestream2_get_be32(&s->gb);
+    bytestream2_skip(&s->gb, 4); /* delay_num (2), delay_den (2) */
+    dispose_op      = bytestream2_get_byte(&s->gb);
+    blend_op        = bytestream2_get_byte(&s->gb);
+    bytestream2_skip(&s->gb, 4); /* crc */
+
+    if (sequence_number == 0 && /* && binds tighter than ||: the first frame must cover the whole image ... */
+        (cur_w != s->width ||
+         cur_h != s->height ||
+         x_offset != 0 ||
+         y_offset != 0) ||
+        cur_w <= 0 || cur_h <= 0 || /* ... and every frame must be non-empty and lie inside the image */
+        x_offset < 0 || y_offset < 0 ||
+        cur_w > s->width - x_offset|| cur_h > s->height - y_offset)
+            return AVERROR_INVALIDDATA;
+
+    if (blend_op != APNG_BLEND_OP_OVER && blend_op != APNG_BLEND_OP_SOURCE) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid blend_op %d\n", blend_op);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (sequence_number == 0 && dispose_op == APNG_DISPOSE_OP_PREVIOUS) {
+        // No previous frame to revert to for the first frame
+        // Spec says to just treat it as a APNG_DISPOSE_OP_BACKGROUND
+        dispose_op = APNG_DISPOSE_OP_BACKGROUND;
+    }
+
+    if (blend_op == APNG_BLEND_OP_OVER && !s->has_trns && (
+            avctx->pix_fmt == AV_PIX_FMT_RGB24 ||
+            avctx->pix_fmt == AV_PIX_FMT_RGB48BE ||
+            avctx->pix_fmt == AV_PIX_FMT_PAL8 ||
+            avctx->pix_fmt == AV_PIX_FMT_GRAY8 ||
+            avctx->pix_fmt == AV_PIX_FMT_GRAY16BE ||
+            avctx->pix_fmt == AV_PIX_FMT_MONOBLACK
+        )) {
+        // APNG_BLEND_OP_OVER is the same as APNG_BLEND_OP_SOURCE when there is no alpha channel
+        blend_op = APNG_BLEND_OP_SOURCE;
+    }
+
+    s->cur_w      = cur_w; /* commit only after all validation has passed */
+    s->cur_h      = cur_h;
+    s->x_offset   = x_offset;
+    s->y_offset   = y_offset;
+    s->dispose_op = dispose_op;
+    s->blend_op   = blend_op;
+
+    return 0;
+}
+
+static void handle_p_frame_png(PNGDecContext *s, AVFrame *p)
+{ /* Add the previous picture's bytes onto the current frame, row by row (the frame is treated as a delta against s->last_picture). */
+    int i, j;
+    uint8_t *pd      = p->data[0];
+    uint8_t *pd_last = s->last_picture.f->data[0];
+    int ls = FFMIN(av_image_get_linesize(p->format, s->width, 0), s->width * s->bpp); /* bytes to process per row: the smaller of the two estimates */
+
+    ff_thread_await_progress(&s->last_picture, INT_MAX, 0); /* wait until the previous picture is reported complete */
+    for (j = 0; j < s->height; j++) {
+        for (i = 0; i < ls; i++)
+            pd[i] += pd_last[i]; /* uint8_t addition, wraps modulo 256 */
+        pd      += s->image_linesize;
+        pd_last += s->image_linesize; /* both pictures are stepped with the same line size */
+    }
+}
+
+// Divide by 255 and round to nearest, using a multiply and a shift instead of a division.
+// apply a fast variant: (X+127)/255 = ((X+127)*257+257)>>16 = ((X+128)*257)>>16
+#define FAST_DIV255(x) ((((x) + 128) * 257) >> 16) // NOTE(review): presumably meant for x in 0..255*255, as in the callers below -- confirm
+
+static int handle_p_frame_apng(AVCodecContext *avctx, PNGDecContext *s,
+                               AVFrame *p)
+{ /* Build the displayed APNG frame in p: apply the previous frame's disposal, then blend the decoded region; 0 or negative AVERROR. */
+    size_t x, y;
+    uint8_t *buffer; /* scratch canvas of image_linesize * height bytes */
+
+    if (s->blend_op == APNG_BLEND_OP_OVER && /* OVER blending is only implemented for these three formats */
+        avctx->pix_fmt != AV_PIX_FMT_RGBA &&
+        avctx->pix_fmt != AV_PIX_FMT_GRAY8A &&
+        avctx->pix_fmt != AV_PIX_FMT_PAL8) {
+        avpriv_request_sample(avctx, "Blending with pixel format %s",
+                              av_get_pix_fmt_name(avctx->pix_fmt));
+        return AVERROR_PATCHWELCOME;
+    }
+
+    buffer = av_malloc_array(s->image_linesize, s->height);
+    if (!buffer)
+        return AVERROR(ENOMEM);
+
+
+    // Do the disposal operation specified by the last frame on the frame
+    if (s->last_dispose_op != APNG_DISPOSE_OP_PREVIOUS) {
+        ff_thread_await_progress(&s->last_picture, INT_MAX, 0);
+        memcpy(buffer, s->last_picture.f->data[0], s->image_linesize * s->height); /* start from the last output picture */
+
+        if (s->last_dispose_op == APNG_DISPOSE_OP_BACKGROUND)
+            for (y = s->last_y_offset; y < s->last_y_offset + s->last_h; ++y)
+                memset(buffer + s->image_linesize * y + s->bpp * s->last_x_offset, 0, s->bpp * s->last_w); /* zero the last frame's rectangle */
+
+        memcpy(s->previous_picture.f->data[0], buffer, s->image_linesize * s->height); /* keep the disposed canvas as the new "previous" picture */
+        ff_thread_report_progress(&s->previous_picture, INT_MAX, 0);
+    } else {
+        ff_thread_await_progress(&s->previous_picture, INT_MAX, 0);
+        memcpy(buffer, s->previous_picture.f->data[0], s->image_linesize * s->height); /* DISPOSE_OP_PREVIOUS: start from the saved canvas */
+    }
+
+    // Perform blending
+    if (s->blend_op == APNG_BLEND_OP_SOURCE) {
+        for (y = s->y_offset; y < s->y_offset + s->cur_h; ++y) {
+            size_t row_start = s->image_linesize * y + s->bpp * s->x_offset;
+            memcpy(buffer + row_start, p->data[0] + row_start, s->bpp * s->cur_w); /* the new region replaces the canvas pixels */
+        }
+    } else { // APNG_BLEND_OP_OVER
+        for (y = s->y_offset; y < s->y_offset + s->cur_h; ++y) {
+            uint8_t *foreground = p->data[0] + s->image_linesize * y + s->bpp * s->x_offset;
+            uint8_t *background = buffer + s->image_linesize * y + s->bpp * s->x_offset;
+            for (x = s->x_offset; x < s->x_offset + s->cur_w; ++x, foreground += s->bpp, background += s->bpp) {
+                size_t b;
+                uint8_t foreground_alpha, background_alpha, output_alpha;
+                uint8_t output[10];
+
+                // Since we might be blending alpha onto alpha, we use the following equations:
+                // output_alpha = foreground_alpha + (1 - foreground_alpha) * background_alpha
+                // output = (foreground_alpha * foreground + (1 - foreground_alpha) * background_alpha * background) / output_alpha
+
+                switch (avctx->pix_fmt) { /* no default: other formats were rejected at the top for OVER blending */
+                case AV_PIX_FMT_RGBA:
+                    foreground_alpha = foreground[3];
+                    background_alpha = background[3];
+                    break;
+
+                case AV_PIX_FMT_GRAY8A:
+                    foreground_alpha = foreground[1];
+                    background_alpha = background[1];
+                    break;
+
+                case AV_PIX_FMT_PAL8:
+                    foreground_alpha = s->palette[foreground[0]] >> 24; /* alpha is the top byte of the palette entry */
+                    background_alpha = s->palette[background[0]] >> 24;
+                    break;
+                }
+
+                if (foreground_alpha == 0) /* fully transparent: the canvas pixel is kept */
+                    continue;
+
+                if (foreground_alpha == 255) { /* fully opaque: plain copy */
+                    memcpy(background, foreground, s->bpp);
+                    continue;
+                }
+
+                if (avctx->pix_fmt == AV_PIX_FMT_PAL8) {
+                    // TODO: Alpha blending with PAL8 will likely need the entire image converted over to RGBA first
+                    avpriv_request_sample(avctx, "Alpha blending palette samples");
+                    background[0] = foreground[0];
+                    continue;
+                }
+
+                output_alpha = foreground_alpha + FAST_DIV255((255 - foreground_alpha) * background_alpha);
+
+                av_assert0(s->bpp <= 10); /* output[] holds at most 10 bytes */
+
+                for (b = 0; b < s->bpp - 1; ++b) { /* all components except the last (alpha) one */
+                    if (output_alpha == 0) {
+                        output[b] = 0;
+                    } else if (background_alpha == 255) {
+                        output[b] = FAST_DIV255(foreground_alpha * foreground[b] + (255 - foreground_alpha) * background[b]);
+                    } else {
+                        output[b] = (255 * foreground_alpha * foreground[b] + (255 - foreground_alpha) * background_alpha * background[b]) / (255 * output_alpha);
+                    }
+                }
+                output[b] = output_alpha; /* b == s->bpp - 1 here: the alpha slot */
+                memcpy(background, output, s->bpp);
+            }
+        }
+    }
+
+    // Copy blended buffer into the frame and free
+    memcpy(p->data[0], buffer, s->image_linesize * s->height);
+    av_free(buffer);
+
+    return 0;
+}
+
+static int decode_frame_common(AVCodecContext *avctx, PNGDecContext *s,
+                               AVFrame *p, AVPacket *avpkt)
+{
+    AVDictionary *metadata  = NULL;
+    uint32_t tag, length;
+    int decode_next_dat = 0;
+    int ret;
 
-    /* init the zlib */
-    s->zstream.zalloc = ff_png_zalloc;
-    s->zstream.zfree  = ff_png_zfree;
-    s->zstream.opaque = NULL;
-    ret = inflateInit(&s->zstream);
-    if (ret != Z_OK)
-        return -1;
     for (;;) {
-        if (bytestream2_get_bytes_left(&s->gb) <= 0)
+        length = bytestream2_get_bytes_left(&s->gb);
+        if (length <= 0) {
+
+            if (avctx->codec_id == AV_CODEC_ID_PNG &&
+                avctx->skip_frame == AVDISCARD_ALL) {
+                av_frame_set_metadata(p, metadata);
+                return 0;
+            }
+
+            if (CONFIG_APNG_DECODER && avctx->codec_id == AV_CODEC_ID_APNG && length == 0) {
+                if (!(s->state & PNG_IDAT))
+                    return 0;
+                else
+                    goto exit_loop;
+            }
+            av_log(avctx, AV_LOG_ERROR, "%d bytes left\n", length);
+            if (   s->state & PNG_ALLIMAGE
+                && avctx->strict_std_compliance <= FF_COMPLIANCE_NORMAL)
+                goto exit_loop;
+            ret = AVERROR_INVALIDDATA;
             goto fail;
+        }
+
         length = bytestream2_get_be32(&s->gb);
-        if (length > 0x7fffffff)
+        if (length > 0x7fffffff || length > bytestream2_get_bytes_left(&s->gb)) {
+            av_log(avctx, AV_LOG_ERROR, "chunk too big\n");
+            ret = AVERROR_INVALIDDATA;
             goto fail;
+        }
         tag = bytestream2_get_le32(&s->gb);
-        ff_dlog(avctx, "png: tag=%c%c%c%c length=%u\n",
+        if (avctx->debug & FF_DEBUG_STARTCODE)
+            av_log(avctx, AV_LOG_DEBUG, "png: tag=%c%c%c%c length=%u\n",
                 (tag & 0xff),
                 ((tag >> 8) & 0xff),
                 ((tag >> 16) & 0xff),
                 ((tag >> 24) & 0xff), length);
+
+        if (avctx->codec_id == AV_CODEC_ID_PNG &&
+            avctx->skip_frame == AVDISCARD_ALL) {
+            switch(tag) {
+            case MKTAG('I', 'H', 'D', 'R'):
+            case MKTAG('p', 'H', 'Y', 's'):
+            case MKTAG('t', 'E', 'X', 't'):
+            case MKTAG('I', 'D', 'A', 'T'):
+            case MKTAG('t', 'R', 'N', 'S'):
+                break;
+            default:
+                goto skip_tag;
+            }
+        }
+
         switch (tag) {
         case MKTAG('I', 'H', 'D', 'R'):
-            if (length != 13)
+            if ((ret = decode_ihdr_chunk(avctx, s, length)) < 0)
                 goto fail;
-            s->width  = bytestream2_get_be32(&s->gb);
-            s->height = bytestream2_get_be32(&s->gb);
-            if (av_image_check_size(s->width, s->height, 0, avctx)) {
-                s->width = s->height = 0;
+            break;
+        case MKTAG('p', 'H', 'Y', 's'):
+            if ((ret = decode_phys_chunk(avctx, s)) < 0)
                 goto fail;
-            }
-            s->bit_depth        = bytestream2_get_byte(&s->gb);
-            s->color_type       = bytestream2_get_byte(&s->gb);
-            s->compression_type = bytestream2_get_byte(&s->gb);
-            s->filter_type      = bytestream2_get_byte(&s->gb);
-            s->interlace_type   = bytestream2_get_byte(&s->gb);
-            bytestream2_skip(&s->gb, 4); /* crc */
-            s->state |= PNG_IHDR;
-            ff_dlog(avctx, "width=%d height=%d depth=%d color_type=%d "
-                           "compression_type=%d filter_type=%d interlace_type=%d\n",
-                    s->width, s->height, s->bit_depth, s->color_type,
-                    s->compression_type, s->filter_type, s->interlace_type);
             break;
-        case MKTAG('I', 'D', 'A', 'T'):
-            if (!(s->state & PNG_IHDR))
+        case MKTAG('f', 'c', 'T', 'L'):
+            if (!CONFIG_APNG_DECODER || avctx->codec_id != AV_CODEC_ID_APNG)
+                goto skip_tag;
+            if ((ret = decode_fctl_chunk(avctx, s, length)) < 0)
+                goto fail;
+            decode_next_dat = 1;
+            break;
+        case MKTAG('f', 'd', 'A', 'T'):
+            if (!CONFIG_APNG_DECODER || avctx->codec_id != AV_CODEC_ID_APNG)
+                goto skip_tag;
+            if (!decode_next_dat) {
+                ret = AVERROR_INVALIDDATA;
                 goto fail;
-            if (!(s->state & PNG_IDAT)) {
-                /* init image info */
-                avctx->width  = s->width;
-                avctx->height = s->height;
-
-                s->channels       = ff_png_get_nb_channels(s->color_type);
-                s->bits_per_pixel = s->bit_depth * s->channels;
-                s->bpp            = (s->bits_per_pixel + 7) >> 3;
-                s->row_size       = (avctx->width * s->bits_per_pixel + 7) >> 3;
-
-                if (s->bit_depth == 8 &&
-                    s->color_type == PNG_COLOR_TYPE_RGB) {
-                    avctx->pix_fmt = AV_PIX_FMT_RGB24;
-                } else if (s->bit_depth == 8 &&
-                           s->color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
-                    avctx->pix_fmt = AV_PIX_FMT_RGB32;
-                } else if (s->bit_depth == 8 &&
-                           s->color_type == PNG_COLOR_TYPE_GRAY) {
-                    avctx->pix_fmt = AV_PIX_FMT_GRAY8;
-                } else if (s->bit_depth == 16 &&
-                           s->color_type == PNG_COLOR_TYPE_GRAY) {
-                    avctx->pix_fmt = AV_PIX_FMT_GRAY16BE;
-                } else if (s->bit_depth == 16 &&
-                           s->color_type == PNG_COLOR_TYPE_RGB) {
-                    avctx->pix_fmt = AV_PIX_FMT_RGB48BE;
-                } else if (s->bit_depth == 1 &&
-                           s->color_type == PNG_COLOR_TYPE_GRAY) {
-                    avctx->pix_fmt = AV_PIX_FMT_MONOBLACK;
-                } else if (s->bit_depth == 8 &&
-                           s->color_type == PNG_COLOR_TYPE_PALETTE) {
-                    avctx->pix_fmt = AV_PIX_FMT_PAL8;
-                } else if (s->bit_depth == 8 &&
-                           s->color_type == PNG_COLOR_TYPE_GRAY_ALPHA) {
-                    avctx->pix_fmt = AV_PIX_FMT_YA8;
-                } else if (s->bit_depth == 16 &&
-                           s->color_type == PNG_COLOR_TYPE_GRAY_ALPHA) {
-                    avctx->pix_fmt = AV_PIX_FMT_YA16BE;
-                } else {
-                    goto fail;
-                }
-
-                if (ff_get_buffer(avctx, p, AV_GET_BUFFER_FLAG_REF) < 0) {
-                    av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-                    goto fail;
-                }
-                p->pict_type        = AV_PICTURE_TYPE_I;
-                p->key_frame        = 1;
-                p->interlaced_frame = !!s->interlace_type;
-
-                /* compute the compressed row size */
-                if (!s->interlace_type) {
-                    s->crow_size = s->row_size + 1;
-                } else {
-                    s->pass          = 0;
-                    s->pass_row_size = ff_png_pass_row_size(s->pass,
-                                                            s->bits_per_pixel,
-                                                            s->width);
-                    s->crow_size = s->pass_row_size + 1;
-                }
-                ff_dlog(avctx, "row_size=%d crow_size =%d\n",
-                        s->row_size, s->crow_size);
-                s->image_buf      = p->data[0];
-                s->image_linesize = p->linesize[0];
-                /* copy the palette if needed */
-                if (s->color_type == PNG_COLOR_TYPE_PALETTE)
-                    memcpy(p->data[1], s->palette, 256 * sizeof(uint32_t));
-                /* empty row is used if differencing to the first row */
-                s->last_row = av_mallocz(s->row_size);
-                if (!s->last_row)
-                    goto fail;
-                if (s->interlace_type ||
-                    s->color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
-                    s->tmp_row = av_malloc(s->row_size);
-                    if (!s->tmp_row)
-                        goto fail;
-                }
-                /* compressed row */
-                crow_buf_base = av_malloc(s->row_size + 16);
-                if (!crow_buf_base)
-                    goto fail;
-
-                /* we want crow_buf+1 to be 16-byte aligned */
-                s->crow_buf          = crow_buf_base + 15;
-                s->zstream.avail_out = s->crow_size;
-                s->zstream.next_out  = s->crow_buf;
             }
-            s->state |= PNG_IDAT;
-            if (png_decode_idat(s, length) < 0)
+            bytestream2_get_be32(&s->gb);
+            length -= 4;
+            /* fallthrough */
+        case MKTAG('I', 'D', 'A', 'T'):
+            if (CONFIG_APNG_DECODER && avctx->codec_id == AV_CODEC_ID_APNG && !decode_next_dat)
+                goto skip_tag;
+            if ((ret = decode_idat_chunk(avctx, s, length, p)) < 0)
                 goto fail;
-            bytestream2_skip(&s->gb, 4); /* crc */
             break;
         case MKTAG('P', 'L', 'T', 'E'):
-        {
-            int n, i, r, g, b;
-
-            if ((length % 3) != 0 || length > 256 * 3)
+            if (decode_plte_chunk(avctx, s, length) < 0)
                 goto skip_tag;
-            /* read the palette */
-            n = length / 3;
-            for (i = 0; i < n; i++) {
-                r = bytestream2_get_byte(&s->gb);
-                g = bytestream2_get_byte(&s->gb);
-                b = bytestream2_get_byte(&s->gb);
-                s->palette[i] = (0xff << 24) | (r << 16) | (g << 8) | b;
-            }
-            for (; i < 256; i++)
-                s->palette[i] = (0xff << 24);
-            s->state |= PNG_PLTE;
-            bytestream2_skip(&s->gb, 4);     /* crc */
-        }
-        break;
+            break;
         case MKTAG('t', 'R', 'N', 'S'):
-        {
-            int v, i;
-
-            /* read the transparency. XXX: Only palette mode supported */
-            if (s->color_type != PNG_COLOR_TYPE_PALETTE ||
-                length > 256 ||
-                !(s->state & PNG_PLTE))
+            if (decode_trns_chunk(avctx, s, length) < 0)
                 goto skip_tag;
-            for (i = 0; i < length; i++) {
-                v = bytestream2_get_byte(&s->gb);
-                s->palette[i] = (s->palette[i] & 0x00ffffff) | (v << 24);
-            }
-            bytestream2_skip(&s->gb, 4);     /* crc */
-        }
-        break;
+            break;
+        case MKTAG('t', 'E', 'X', 't'):
+            if (decode_text_chunk(s, length, 0, &metadata) < 0)
+                av_log(avctx, AV_LOG_WARNING, "Broken tEXt chunk\n");
+            bytestream2_skip(&s->gb, length + 4);
+            break;
+        case MKTAG('z', 'T', 'X', 't'):
+            if (decode_text_chunk(s, length, 1, &metadata) < 0)
+                av_log(avctx, AV_LOG_WARNING, "Broken zTXt chunk\n");
+            bytestream2_skip(&s->gb, length + 4);
+            break;
         case MKTAG('s', 'T', 'E', 'R'): {
             int mode = bytestream2_get_byte(&s->gb);
             AVStereo3D *stereo3d = av_stereo3d_create_side_data(p);
             if (!stereo3d)
-                goto the_end;
+                goto fail;
 
             if (mode == 0 || mode == 1) {
                 stereo3d->type  = AV_STEREO3D_SIDEBYSIDE;
@@ -628,7 +1211,11 @@ static int decode_frame(AVCodecContext *avctx,
         }
         case MKTAG('I', 'E', 'N', 'D'):
             if (!(s->state & PNG_ALLIMAGE))
+                av_log(avctx, AV_LOG_ERROR, "IEND without all image\n");
+            if (!(s->state & (PNG_ALLIMAGE|PNG_IDAT))) {
+                ret = AVERROR_INVALIDDATA;
                 goto fail;
+            }
             bytestream2_skip(&s->gb, 4); /* crc */
             goto exit_loop;
         default:
@@ -639,40 +1226,232 @@ skip_tag:
         }
     }
 exit_loop:
-    /* handle P-frames only if a predecessor frame is available */
-    if (s->prev->data[0]) {
-        if (!(avpkt->flags & AV_PKT_FLAG_KEY)) {
-            int i, j;
-            uint8_t *pd      = p->data[0];
-            uint8_t *pd_last = s->prev->data[0];
-
-            for (j = 0; j < s->height; j++) {
-                for (i = 0; i < s->width * s->bpp; i++)
-                    pd[i] += pd_last[i];
-                pd      += s->image_linesize;
-                pd_last += s->image_linesize;
+    if (avctx->codec_id == AV_CODEC_ID_PNG &&
+        avctx->skip_frame == AVDISCARD_ALL) {
+        av_frame_set_metadata(p, metadata);
+        return 0;
+    }
+
+    if (s->bits_per_pixel <= 4)
+        handle_small_bpp(s, p);
+
+    /* apply transparency if needed */
+    if (s->has_trns && s->color_type != PNG_COLOR_TYPE_PALETTE) {
+        size_t byte_depth = s->bit_depth > 8 ? 2 : 1;
+        size_t raw_bpp = s->bpp - byte_depth;
+        unsigned x, y;
+
+        for (y = 0; y < s->height; ++y) {
+            uint8_t *row = &s->image_buf[s->image_linesize * y];
+
+            /* since we're updating in-place, we have to go from right to left */
+            for (x = s->width; x > 0; --x) {
+                uint8_t *pixel = &row[s->bpp * (x - 1)];
+                memmove(pixel, &row[raw_bpp * (x - 1)], raw_bpp);
+
+                if (!memcmp(pixel, s->transparent_color_be, raw_bpp)) {
+                    memset(&pixel[raw_bpp], 0, byte_depth);
+                } else {
+                    memset(&pixel[raw_bpp], 0xff, byte_depth);
+                }
             }
         }
     }
 
-    av_frame_unref(s->prev);
-    if ((ret = av_frame_ref(s->prev, p)) < 0)
-        goto fail;
+    /* handle P-frames only if a predecessor frame is available */
+    if (s->last_picture.f->data[0]) {
+        if (   !(avpkt->flags & AV_PKT_FLAG_KEY) && avctx->codec_tag != AV_RL32("MPNG")
+            && s->last_picture.f->width == p->width
+            && s->last_picture.f->height== p->height
+            && s->last_picture.f->format== p->format
+         ) {
+            if (CONFIG_PNG_DECODER && avctx->codec_id != AV_CODEC_ID_APNG)
+                handle_p_frame_png(s, p);
+            else if (CONFIG_APNG_DECODER &&
+                     avctx->codec_id == AV_CODEC_ID_APNG &&
+                     (ret = handle_p_frame_apng(avctx, s, p)) < 0)
+                goto fail;
+        }
+    }
+    ff_thread_report_progress(&s->picture, INT_MAX, 0);
+    ff_thread_report_progress(&s->previous_picture, INT_MAX, 0);
+
+    av_frame_set_metadata(p, metadata);
+    metadata   = NULL;
+    return 0;
+
+fail:
+    av_dict_free(&metadata);
+    ff_thread_report_progress(&s->picture, INT_MAX, 0);
+    ff_thread_report_progress(&s->previous_picture, INT_MAX, 0);
+    return ret;
+}
+
+#if CONFIG_PNG_DECODER /* decode_frame_png(): decode one packet that starts with the PNG or MNG signature */
+static int decode_frame_png(AVCodecContext *avctx,
+                        void *data, int *got_frame,
+                        AVPacket *avpkt)
+{
+    PNGDecContext *const s = avctx->priv_data;
+    const uint8_t *buf     = avpkt->data;
+    int buf_size           = avpkt->size;
+    AVFrame *p;
+    int64_t sig;
+    int ret; /* success: value of bytestream2_tell() after decoding; failure: negative error code */
+
+    ff_thread_release_buffer(avctx, &s->last_picture);
+    FFSWAP(ThreadFrame, s->picture, s->last_picture); /* the previous output is now last_picture */
+    p = s->picture.f;
+
+    bytestream2_init(&s->gb, buf, buf_size);
+
+    /* check signature */
+    sig = bytestream2_get_be64(&s->gb);
+    if (sig != PNGSIG &&
+        sig != MNGSIG) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid PNG signature 0x%08"PRIX64".\n", sig);
+        return AVERROR_INVALIDDATA; /* zlib not initialised yet, so a plain return is fine here */
+    }
+
+    s->y = s->state = s->has_trns = 0;
+
+    /* init the zlib */
+    s->zstream.zalloc = ff_png_zalloc;
+    s->zstream.zfree  = ff_png_zfree;
+    s->zstream.opaque = NULL;
+    ret = inflateInit(&s->zstream);
+    if (ret != Z_OK) {
+        av_log(avctx, AV_LOG_ERROR, "inflateInit returned error %d\n", ret);
+        return AVERROR_EXTERNAL;
+    }
+
+    if ((ret = decode_frame_common(avctx, s, p, avpkt)) < 0) /* from here on every exit must pass through the_end */
+        goto the_end;
+
+    if (avctx->skip_frame == AVDISCARD_ALL) {
+        *got_frame = 0;
+        ret = bytestream2_tell(&s->gb);
+        goto the_end;
+    }
+
+    if ((ret = av_frame_ref(data, s->picture.f)) < 0)
+        goto the_end; /* was "return ret": that skipped inflateEnd() and leaked the zlib state */
 
     *got_frame = 1;
 
     ret = bytestream2_tell(&s->gb);
 the_end:
     inflateEnd(&s->zstream);
-    av_free(crow_buf_base);
     s->crow_buf = NULL;
-    av_freep(&s->last_row);
-    av_freep(&s->tmp_row);
     return ret;
-fail:
-    ret = -1;
-    goto the_end;
 }
+#endif
+
+#if CONFIG_APNG_DECODER /* decode_frame_apng(): parse extradata until IHDR has been seen, then decode the packet */
+static int decode_frame_apng(AVCodecContext *avctx,
+                        void *data, int *got_frame,
+                        AVPacket *avpkt)
+{
+    PNGDecContext *const s = avctx->priv_data;
+    int ret; /* success: value of bytestream2_tell() after decoding; failure: negative error code */
+    AVFrame *p;
+
+    ff_thread_release_buffer(avctx, &s->last_picture);
+    FFSWAP(ThreadFrame, s->picture, s->last_picture); /* the previous output is now last_picture */
+    p = s->picture.f;
+
+    if (!(s->state & PNG_IHDR)) { /* header not parsed yet: it has to come from the extradata */
+        if (!avctx->extradata_size)
+            return AVERROR_INVALIDDATA;
+
+        /* only init fields, there is no zlib use in extradata */
+        s->zstream.zalloc = ff_png_zalloc;
+        s->zstream.zfree  = ff_png_zfree;
+
+        bytestream2_init(&s->gb, avctx->extradata, avctx->extradata_size);
+        if ((ret = decode_frame_common(avctx, s, p, avpkt)) < 0)
+            goto end;
+    }
+
+    /* reset state for a new frame */
+    if ((ret = inflateInit(&s->zstream)) != Z_OK) {
+        av_log(avctx, AV_LOG_ERROR, "inflateInit returned error %d\n", ret);
+        ret = AVERROR_EXTERNAL; /* replace the zlib code with an AVERROR before returning */
+        goto end;
+    }
+    s->y = 0;
+    s->state &= ~(PNG_IDAT | PNG_ALLIMAGE); /* clear only the per-frame image bits; other state bits are kept */
+    bytestream2_init(&s->gb, avpkt->data, avpkt->size);
+    if ((ret = decode_frame_common(avctx, s, p, avpkt)) < 0)
+        goto end;
+
+    if (!(s->state & PNG_ALLIMAGE)) /* incomplete image is only a warning ... */
+        av_log(avctx, AV_LOG_WARNING, "Frame did not contain a complete image\n");
+    if (!(s->state & (PNG_ALLIMAGE|PNG_IDAT))) { /* ... but neither bit being set is an error */
+        ret = AVERROR_INVALIDDATA;
+        goto end;
+    }
+    if ((ret = av_frame_ref(data, s->picture.f)) < 0)
+        goto end;
+
+    *got_frame = 1;
+    ret = bytestream2_tell(&s->gb);
+
+end:
+    inflateEnd(&s->zstream); /* NOTE(review): also reached when inflateInit() was not run or failed; presumably inflateEnd() tolerates that - confirm */
+    return ret;
+}
+#endif
+
+#if HAVE_THREADS /* update_thread_context(): copy decoder state from src's private context into dst's */
+static int update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
+{
+    PNGDecContext *psrc = src->priv_data;
+    PNGDecContext *pdst = dst->priv_data;
+    int ret; /* returns 0 on success or the negative code from ff_thread_ref_frame() */
+
+    if (dst == src) /* same context: nothing to copy */
+        return 0;
+
+    ff_thread_release_buffer(dst, &pdst->picture); /* drop dst's current frame before referencing src's */
+    if (psrc->picture.f->data[0] &&
+        (ret = ff_thread_ref_frame(&pdst->picture, &psrc->picture)) < 0)
+        return ret;
+    if (CONFIG_APNG_DECODER && dst->codec_id == AV_CODEC_ID_APNG) { /* APNG only: header fields, palette and both reference frames */
+        pdst->width             = psrc->width;
+        pdst->height            = psrc->height;
+        pdst->bit_depth         = psrc->bit_depth;
+        pdst->color_type        = psrc->color_type;
+        pdst->compression_type  = psrc->compression_type;
+        pdst->interlace_type    = psrc->interlace_type;
+        pdst->filter_type       = psrc->filter_type;
+        pdst->cur_w = psrc->cur_w;
+        pdst->cur_h = psrc->cur_h;
+        pdst->x_offset = psrc->x_offset;
+        pdst->y_offset = psrc->y_offset;
+        pdst->has_trns = psrc->has_trns;
+        memcpy(pdst->transparent_color_be, psrc->transparent_color_be, sizeof(pdst->transparent_color_be));
+
+        pdst->dispose_op = psrc->dispose_op;
+
+        memcpy(pdst->palette, psrc->palette, sizeof(pdst->palette));
+
+        pdst->state |= psrc->state & (PNG_IHDR | PNG_PLTE); /* inherit only the IHDR and PLTE bits */
+
+        ff_thread_release_buffer(dst, &pdst->last_picture);
+        if (psrc->last_picture.f->data[0] &&
+            (ret = ff_thread_ref_frame(&pdst->last_picture, &psrc->last_picture)) < 0)
+            return ret;
+
+        ff_thread_release_buffer(dst, &pdst->previous_picture);
+        if (psrc->previous_picture.f->data[0] &&
+            (ret = ff_thread_ref_frame(&pdst->previous_picture, &psrc->previous_picture)) < 0)
+            return ret;
+    }
+
+    return 0;
+}
+#endif
 
 static av_cold int png_dec_init(AVCodecContext *avctx)
 {
@@ -680,11 +1459,21 @@ static av_cold int png_dec_init(AVCodecContext *avctx)
 
     avctx->color_range = AVCOL_RANGE_JPEG;
 
-    s->prev = av_frame_alloc();
-    if (!s->prev)
+    s->avctx = avctx;
+    s->previous_picture.f = av_frame_alloc();
+    s->last_picture.f = av_frame_alloc();
+    s->picture.f = av_frame_alloc();
+    if (!s->previous_picture.f || !s->last_picture.f || !s->picture.f) {
+        av_frame_free(&s->previous_picture.f);
+        av_frame_free(&s->last_picture.f);
+        av_frame_free(&s->picture.f);
         return AVERROR(ENOMEM);
+    }
 
-    ff_pngdsp_init(&s->dsp);
+    if (!avctx->internal->is_copy) {
+        avctx->internal->allocate_progress = 1;
+        ff_pngdsp_init(&s->dsp);
+    }
 
     return 0;
 }
@@ -693,11 +1482,39 @@ static av_cold int png_dec_end(AVCodecContext *avctx)
 {
     PNGDecContext *s = avctx->priv_data;
 
-    av_frame_free(&s->prev);
+    ff_thread_release_buffer(avctx, &s->previous_picture);
+    av_frame_free(&s->previous_picture.f);
+    ff_thread_release_buffer(avctx, &s->last_picture);
+    av_frame_free(&s->last_picture.f);
+    ff_thread_release_buffer(avctx, &s->picture);
+    av_frame_free(&s->picture.f);
+    av_freep(&s->buffer);
+    s->buffer_size = 0;
+    av_freep(&s->last_row);
+    s->last_row_size = 0;
+    av_freep(&s->tmp_row);
+    s->tmp_row_size = 0;
 
     return 0;
 }
 
+#if CONFIG_APNG_DECODER /* codec table entry for the APNG decoder */
+AVCodec ff_apng_decoder = {
+    .name           = "apng",
+    .long_name      = NULL_IF_CONFIG_SMALL("APNG (Animated Portable Network Graphics) image"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_APNG,
+    .priv_data_size = sizeof(PNGDecContext),
+    .init           = png_dec_init,
+    .close          = png_dec_end,
+    .decode         = decode_frame_apng,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(png_dec_init), /* same function as .init */
+    .update_thread_context = ONLY_IF_THREADS_ENABLED(update_thread_context),
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS /*| AV_CODEC_CAP_DRAW_HORIZ_BAND*/,
+};
+#endif
+
+#if CONFIG_PNG_DECODER
 AVCodec ff_png_decoder = {
     .name           = "png",
     .long_name      = NULL_IF_CONFIG_SMALL("PNG (Portable Network Graphics) image"),
@@ -706,6 +1523,10 @@ AVCodec ff_png_decoder = {
     .priv_data_size = sizeof(PNGDecContext),
     .init           = png_dec_init,
     .close          = png_dec_end,
-    .decode         = decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1 /*| AV_CODEC_CAP_DRAW_HORIZ_BAND*/,
+    .decode         = decode_frame_png,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(png_dec_init),
+    .update_thread_context = ONLY_IF_THREADS_ENABLED(update_thread_context),
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS /*| AV_CODEC_CAP_DRAW_HORIZ_BAND*/,
+    .caps_internal  = FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM,
 };
+#endif
diff --git a/libavcodec/pngdsp.c b/libavcodec/pngdsp.c
index c0e9402..d275316 100644
--- a/libavcodec/pngdsp.c
+++ b/libavcodec/pngdsp.c
@@ -2,20 +2,20 @@
  * PNG image format
  * Copyright (c) 2008 Loren Merrit <lorenm@u.washington.edu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/pngdsp.h b/libavcodec/pngdsp.h
index 607fe64..5475d0d 100644
--- a/libavcodec/pngdsp.h
+++ b/libavcodec/pngdsp.h
@@ -2,20 +2,20 @@
  * PNG image format
  * Copyright (c) 2008 Loren Merrit <lorenm@u.washington.edu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,9 +25,9 @@
 #include <stdint.h>
 
 typedef struct PNGDSPContext {
-    void (*add_bytes_l2)(uint8_t *dst  /* align 16 */,
+    void (*add_bytes_l2)(uint8_t *dst,
                          uint8_t *src1 /* align 16 */,
-                         uint8_t *src2 /* align 16 */, int w);
+                         uint8_t *src2, int w);
 
     /* this might write to dst[w] */
     void (*add_paeth_prediction)(uint8_t *dst, uint8_t *src,
diff --git a/libavcodec/pngenc.c b/libavcodec/pngenc.c
index f91c54c..00c830e 100644
--- a/libavcodec/pngenc.c
+++ b/libavcodec/pngenc.c
@@ -2,39 +2,49 @@
  * PNG image format
  * Copyright (c) 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "libavutil/opt.h"
-#include "libavutil/stereo3d.h"
-
 #include "avcodec.h"
+#include "internal.h"
 #include "bytestream.h"
 #include "huffyuvencdsp.h"
 #include "png.h"
+#include "apng.h"
 
-/* TODO:
- * - add 2, 4 and 16 bit depth support
- */
+#include "libavutil/avassert.h"
+#include "libavutil/crc.h"
+#include "libavutil/libm.h"
+#include "libavutil/opt.h"
+#include "libavutil/color_utils.h"
+#include "libavutil/stereo3d.h"
 
 #include <zlib.h>
 
 #define IOBUF_SIZE 4096
 
+typedef struct APNGFctlChunk { /* NOTE(review): presumably mirrors the fields of an APNG fcTL chunk - confirm against the APNG spec */
+    uint32_t sequence_number;
+    uint32_t width, height;
+    uint32_t x_offset, y_offset;
+    uint16_t delay_num, delay_den; /* presumably frame delay as the fraction delay_num / delay_den - confirm units */
+    uint8_t dispose_op, blend_op;
+} APNGFctlChunk;
+
 typedef struct PNGEncContext {
     AVClass *class;
     HuffYUVEncDSPContext hdsp;
@@ -47,6 +57,23 @@ typedef struct PNGEncContext {
 
     z_stream zstream;
     uint8_t buf[IOBUF_SIZE];
+    int dpi;                     ///< Physical pixel density, in dots per inch, if set
+    int dpm;                     ///< Physical pixel density, in dots per meter, if set
+
+    int is_progressive;
+    int bit_depth;
+    int color_type;
+    int bits_per_pixel;
+
+    // APNG
+    uint32_t palette_checksum;   // Used to ensure a single unique palette
+    uint32_t sequence_number;
+
+    AVFrame *prev_frame;
+    AVFrame *last_frame;
+    APNGFctlChunk last_frame_fctl;
+    uint8_t *last_frame_packet;
+    size_t last_frame_packet_size;
 } PNGEncContext;
 
 static void png_get_interlaced_row(uint8_t *dst, int row_size,
@@ -56,8 +83,9 @@ static void png_get_interlaced_row(uint8_t *dst, int row_size,
     int x, mask, dst_x, j, b, bpp;
     uint8_t *d;
     const uint8_t *s;
+    static const int masks[] = {0x80, 0x08, 0x88, 0x22, 0xaa, 0x55, 0xff};
 
-    mask = ff_png_pass_mask[pass];
+    mask = masks[pass];
     switch (bits_per_pixel) {
     case 1:
         memset(dst, 0, row_size);
@@ -115,6 +143,22 @@ static void sub_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top,
     }
 }
 
+static void sub_left_prediction(PNGEncContext *c, uint8_t *dst, const uint8_t *src, int bpp, int size)
+{ /* writes to dst each byte of src minus the byte bpp positions before it; used for PNG_FILTER_VALUE_SUB */
+    const uint8_t *src1 = src + bpp; /* current byte */
+    const uint8_t *src2 = src;       /* byte bpp positions to the left */
+    int x, unaligned_w;
+
+    memcpy(dst, src, bpp); /* the first bpp bytes have no left neighbour and are copied unchanged */
+    dst += bpp;
+    size -= bpp;
+    unaligned_w = FFMIN(32 - bpp, size); /* scalar part ends 32 bytes into the row, or at the row end if shorter */
+    for (x = 0; x < unaligned_w; x++)
+        *dst++ = *src1++ - *src2++;
+    size -= unaligned_w;
+    c->hdsp.diff_bytes(dst, src1, src2, size); /* remainder via the DSP routine; NOTE(review): presumably the 32-byte lead-in serves an alignment need of diff_bytes - confirm */
+}
+
 static void png_filter_row(PNGEncContext *c, uint8_t *dst, int filter_type,
                            uint8_t *src, uint8_t *top, int size, int bpp)
 {
@@ -125,8 +169,7 @@ static void png_filter_row(PNGEncContext *c, uint8_t *dst, int filter_type,
         memcpy(dst, src, size);
         break;
     case PNG_FILTER_VALUE_SUB:
-        c->hdsp.diff_bytes(dst, src, src - bpp, size);
-        memcpy(dst, src, bpp);
+        sub_left_prediction(c, dst, src, bpp, size);
         break;
     case PNG_FILTER_VALUE_UP:
         c->hdsp.diff_bytes(dst, src, top, size);
@@ -149,7 +192,7 @@ static uint8_t *png_choose_filter(PNGEncContext *s, uint8_t *dst,
                                   uint8_t *src, uint8_t *top, int size, int bpp)
 {
     int pred = s->filter_type;
-    assert(bpp || !pred);
+    av_assert0(bpp || !pred);
     if (!top && pred)
         pred = PNG_FILTER_VALUE_SUB;
     if (pred == PNG_FILTER_VALUE_MIXED) {
@@ -175,45 +218,56 @@ static uint8_t *png_choose_filter(PNGEncContext *s, uint8_t *dst,
     }
 }
 
-static void convert_from_rgb32(uint8_t *dst, const uint8_t *src, int width)
-{
-    uint8_t *d;
-    int j;
-    unsigned int v;
-
-    d = dst;
-    for (j = 0; j < width; j++) {
-        v    = ((const uint32_t *) src)[j];
-        d[0] = v >> 16;
-        d[1] = v >> 8;
-        d[2] = v;
-        d[3] = v >> 24;
-        d   += 4;
-    }
-}
-
 static void png_write_chunk(uint8_t **f, uint32_t tag,
                             const uint8_t *buf, int length)
 {
-    uint32_t crc;
+    const AVCRC *crc_table = av_crc_get_table(AV_CRC_32_IEEE_LE);
+    uint32_t crc = ~0U; /* start from all-ones; the value is inverted again when written below */
     uint8_t tagbuf[4];
 
     bytestream_put_be32(f, length);
-    crc = crc32(0, Z_NULL, 0);
     AV_WL32(tagbuf, tag);
-    crc = crc32(crc, tagbuf, 4);
+    crc = av_crc(crc_table, crc, tagbuf, 4); /* the CRC covers the 4 tag bytes ... */
     bytestream_put_be32(f, av_bswap32(tag));
     if (length > 0) {
-        crc = crc32(crc, buf, length);
+        crc = av_crc(crc_table, crc, buf, length); /* ... and the payload, but not the length field */
         memcpy(*f, buf, length);
         *f += length;
     }
-    bytestream_put_be32(f, crc);
+    bytestream_put_be32(f, ~crc); /* final inversion replaces what zlib's crc32() previously returned directly */
+}
+
+static void png_write_image_data(AVCodecContext *avctx, /* writes one chunk of compressed image data at s->bytestream */
+                                 const uint8_t *buf, int length)
+{
+    PNGEncContext *s = avctx->priv_data;
+    const AVCRC *crc_table = av_crc_get_table(AV_CRC_32_IEEE_LE);
+    uint32_t crc = ~0U;
+
+    if (avctx->codec_id == AV_CODEC_ID_PNG || avctx->frame_number == 0) { /* PNG codec, or frame_number 0: ordinary IDAT chunk */
+        png_write_chunk(&s->bytestream, MKTAG('I', 'D', 'A', 'T'), buf, length);
+        return;
+    }
+
+    bytestream_put_be32(&s->bytestream, length + 4); /* chunk length includes the 4-byte sequence number */
+
+    bytestream_put_be32(&s->bytestream, MKBETAG('f', 'd', 'A', 'T'));
+    bytestream_put_be32(&s->bytestream, s->sequence_number);
+    crc = av_crc(crc_table, crc, s->bytestream - 8, 8); /* CRC over the 8 bytes just written: tag + sequence number */
+
+    crc = av_crc(crc_table, crc, buf, length);
+    memcpy(s->bytestream, buf, length);
+    s->bytestream += length;
+
+    bytestream_put_be32(&s->bytestream, ~crc);
+
+    ++s->sequence_number; /* each fdAT chunk written consumes one sequence number */
 }
 
 /* XXX: do filtering */
-static int png_write_row(PNGEncContext *s, const uint8_t *data, int size)
+static int png_write_row(AVCodecContext *avctx, const uint8_t *data, int size)
 {
+    PNGEncContext *s = avctx->priv_data;
     int ret;
 
     s->zstream.avail_in = size;
@@ -224,8 +278,7 @@ static int png_write_row(PNGEncContext *s, const uint8_t *data, int size)
             return -1;
         if (s->zstream.avail_out == 0) {
             if (s->bytestream_end - s->bytestream > IOBUF_SIZE + 100)
-                png_write_chunk(&s->bytestream,
-                                MKTAG('I', 'D', 'A', 'T'), s->buf, IOBUF_SIZE);
+                png_write_image_data(avctx, s->buf, IOBUF_SIZE);
             s->zstream.avail_out = IOBUF_SIZE;
             s->zstream.next_out  = s->buf;
         }
@@ -233,137 +286,130 @@ static int png_write_row(PNGEncContext *s, const uint8_t *data, int size)
     return 0;
 }
 
-static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
-                        const AVFrame *pict, int *got_packet)
+#define AV_WB32_PNG(buf, n) AV_WB32(buf, lrint((n) * 100000)) /* store n scaled by 100000 and rounded to an integer */
+static int png_get_chrm(enum AVColorPrimaries prim,  uint8_t *buf)
 {
-    PNGEncContext *s       = avctx->priv_data;
-    AVFrameSideData *side_data;
-    const AVFrame *const p = pict;
-    int bit_depth, color_type, y, len, row_size, ret, is_progressive;
-    int bits_per_pixel, pass_row_size, enc_row_size, max_packet_size;
-    int compression_level;
-    uint8_t *ptr, *top, *crow_buf, *crow;
-    uint8_t *crow_base       = NULL;
-    uint8_t *progressive_buf = NULL;
-    uint8_t *rgba_buf        = NULL;
-    uint8_t *top_buf         = NULL;
-
-    is_progressive = !!(avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT);
-    switch (avctx->pix_fmt) {
-    case AV_PIX_FMT_RGBA64BE:
-        bit_depth = 16;
-        color_type = PNG_COLOR_TYPE_RGB_ALPHA;
-        break;
-    case AV_PIX_FMT_RGB48BE:
-        bit_depth = 16;
-        color_type = PNG_COLOR_TYPE_RGB;
-        break;
-    case AV_PIX_FMT_RGB32:
-        bit_depth  = 8;
-        color_type = PNG_COLOR_TYPE_RGB_ALPHA;
-        break;
-    case AV_PIX_FMT_RGB24:
-        bit_depth  = 8;
-        color_type = PNG_COLOR_TYPE_RGB;
-        break;
-    case AV_PIX_FMT_GRAY16BE:
-        bit_depth  = 16;
-        color_type = PNG_COLOR_TYPE_GRAY;
-        break;
-    case AV_PIX_FMT_GRAY8:
-        bit_depth  = 8;
-        color_type = PNG_COLOR_TYPE_GRAY;
-        break;
-    case AV_PIX_FMT_MONOBLACK:
-        bit_depth  = 1;
-        color_type = PNG_COLOR_TYPE_GRAY;
-        break;
-    case AV_PIX_FMT_PAL8:
-        bit_depth  = 8;
-        color_type = PNG_COLOR_TYPE_PALETTE;
-        break;
-    default:
-        return -1;
+    double rx, ry, gx, gy, bx, by, wx = 0.3127, wy = 0.3290; /* default white point; only the BT470M case overrides it */
+    switch (prim) {
+        case AVCOL_PRI_BT709:
+            rx = 0.640; ry = 0.330;
+            gx = 0.300; gy = 0.600;
+            bx = 0.150; by = 0.060;
+            break;
+        case AVCOL_PRI_BT470M:
+            rx = 0.670; ry = 0.330;
+            gx = 0.210; gy = 0.710;
+            bx = 0.140; by = 0.080;
+            wx = 0.310; wy = 0.316;
+            break;
+        case AVCOL_PRI_BT470BG:
+            rx = 0.640; ry = 0.330;
+            gx = 0.290; gy = 0.600;
+            bx = 0.150; by = 0.060;
+            break;
+        case AVCOL_PRI_SMPTE170M:
+        case AVCOL_PRI_SMPTE240M:
+            rx = 0.630; ry = 0.340;
+            gx = 0.310; gy = 0.595;
+            bx = 0.155; by = 0.070;
+            break;
+        case AVCOL_PRI_BT2020:
+            rx = 0.708; ry = 0.292;
+            gx = 0.170; gy = 0.797;
+            bx = 0.131; by = 0.046;
+            break;
+        default:
+            return 0; /* primaries not listed above: buf is left untouched */
     }
-    bits_per_pixel = ff_png_get_nb_channels(color_type) * bit_depth;
-    row_size       = (avctx->width * bits_per_pixel + 7) >> 3;
 
-    s->zstream.zalloc = ff_png_zalloc;
-    s->zstream.zfree  = ff_png_zfree;
-    s->zstream.opaque = NULL;
-    compression_level = avctx->compression_level == FF_COMPRESSION_DEFAULT
-                      ? Z_DEFAULT_COMPRESSION
-                      : av_clip(avctx->compression_level, 0, 9);
-    ret = deflateInit2(&s->zstream, compression_level,
-                       Z_DEFLATED, 15, 8, Z_DEFAULT_STRATEGY);
-    if (ret != Z_OK)
-        return -1;
+    AV_WB32_PNG(buf     , wx); AV_WB32_PNG(buf + 4 , wy); /* order written: white, red, green, blue - x then y */
+    AV_WB32_PNG(buf + 8 , rx); AV_WB32_PNG(buf + 12, ry);
+    AV_WB32_PNG(buf + 16, gx); AV_WB32_PNG(buf + 20, gy);
+    AV_WB32_PNG(buf + 24, bx); AV_WB32_PNG(buf + 28, by);
+    return 1; /* 32 bytes of buf were filled */
+}
 
-    enc_row_size    = deflateBound(&s->zstream, row_size);
-    max_packet_size = avctx->height * (enc_row_size +
-                                       ((enc_row_size + IOBUF_SIZE - 1) / IOBUF_SIZE) * 12)
-                      + AV_INPUT_BUFFER_MIN_SIZE;
-    if (!pkt->data &&
-        (ret = av_new_packet(pkt, max_packet_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Could not allocate output packet of size %d.\n",
-               max_packet_size);
-        return ret;
-    }
+static int png_get_gama(enum AVColorTransferCharacteristic trc, uint8_t *buf)
+{ /* fills 4 bytes of buf and returns 1, or returns 0 without writing when no usable gamma is available */
+    double gamma = avpriv_get_gamma_from_trc(trc);
+    if (gamma <= 1e-6) /* (near-)zero or negative result: nothing is written */
+        return 0;
 
-    s->bytestream_start =
-    s->bytestream       = pkt->data;
-    s->bytestream_end   = pkt->data + pkt->size;
+    AV_WB32_PNG(buf, 1.0 / gamma); /* stores the reciprocal of the looked-up gamma, scaled by the macro */
+    return 1;
+}
 
-    crow_base = av_malloc((row_size + 32) << (s->filter_type == PNG_FILTER_VALUE_MIXED));
-    if (!crow_base)
-        goto fail;
-    // pixel data should be aligned, but there's a control byte before it
-    crow_buf = crow_base + 15;
-    if (is_progressive) {
-        progressive_buf = av_malloc(row_size + 1);
-        if (!progressive_buf)
-            goto fail;
-    }
-    if (color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
-        rgba_buf = av_malloc(row_size + 1);
-        if (!rgba_buf)
-            goto fail;
-    }
-    if (is_progressive || color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
-        top_buf = av_malloc(row_size + 1);
-        if (!top_buf)
-            goto fail;
-    }
+static int encode_headers(AVCodecContext *avctx, const AVFrame *pict)
+{
+    AVFrameSideData *side_data;
+    PNGEncContext *s = avctx->priv_data;
 
     /* write png header */
-    memcpy(s->bytestream, ff_pngsig, 8);
-    s->bytestream += 8;
-
     AV_WB32(s->buf, avctx->width);
     AV_WB32(s->buf + 4, avctx->height);
-    s->buf[8]  = bit_depth;
-    s->buf[9]  = color_type;
+    s->buf[8]  = s->bit_depth;
+    s->buf[9]  = s->color_type;
     s->buf[10] = 0; /* compression type */
     s->buf[11] = 0; /* filter type */
-    s->buf[12] = is_progressive; /* interlace type */
-
+    s->buf[12] = s->is_progressive; /* interlace type */
     png_write_chunk(&s->bytestream, MKTAG('I', 'H', 'D', 'R'), s->buf, 13);
 
+    /* write physical information */
+    if (s->dpm) {
+      AV_WB32(s->buf, s->dpm);
+      AV_WB32(s->buf + 4, s->dpm);
+      s->buf[8] = 1; /* unit specifier is meter */
+    } else {
+      AV_WB32(s->buf, avctx->sample_aspect_ratio.num);
+      AV_WB32(s->buf + 4, avctx->sample_aspect_ratio.den);
+      s->buf[8] = 0; /* unit specifier is unknown */
+    }
+    png_write_chunk(&s->bytestream, MKTAG('p', 'H', 'Y', 's'), s->buf, 9);
+
+    /* write stereoscopic information */
+    side_data = av_frame_get_side_data(pict, AV_FRAME_DATA_STEREO3D);
+    if (side_data) {
+        AVStereo3D *stereo3d = (AVStereo3D *)side_data->data;
+        switch (stereo3d->type) {
+            case AV_STEREO3D_SIDEBYSIDE:
+                s->buf[0] = ((stereo3d->flags & AV_STEREO3D_FLAG_INVERT) == 0) ? 1 : 0;
+                png_write_chunk(&s->bytestream, MKTAG('s', 'T', 'E', 'R'), s->buf, 1);
+                break;
+            case AV_STEREO3D_2D:
+                break;
+            default:
+                av_log(avctx, AV_LOG_WARNING, "Only side-by-side stereo3d flag can be defined within sTER chunk\n");
+                break;
+        }
+    }
+
+    /* write colorspace information */
+    if (pict->color_primaries == AVCOL_PRI_BT709 &&
+        pict->color_trc == AVCOL_TRC_IEC61966_2_1) {
+        s->buf[0] = 1; /* rendering intent, relative colorimetric by default */
+        png_write_chunk(&s->bytestream, MKTAG('s', 'R', 'G', 'B'), s->buf, 1);
+    }
+
+    if (png_get_chrm(pict->color_primaries, s->buf))
+        png_write_chunk(&s->bytestream, MKTAG('c', 'H', 'R', 'M'), s->buf, 32);
+    if (png_get_gama(pict->color_trc, s->buf))
+        png_write_chunk(&s->bytestream, MKTAG('g', 'A', 'M', 'A'), s->buf, 4);
+
     /* put the palette if needed */
-    if (color_type == PNG_COLOR_TYPE_PALETTE) {
+    if (s->color_type == PNG_COLOR_TYPE_PALETTE) {
         int has_alpha, alpha, i;
         unsigned int v;
         uint32_t *palette;
-        uint8_t *alpha_ptr;
+        uint8_t *ptr, *alpha_ptr;
 
-        palette   = (uint32_t *)p->data[1];
+        palette   = (uint32_t *)pict->data[1];
         ptr       = s->buf;
         alpha_ptr = s->buf + 256 * 3;
         has_alpha = 0;
         for (i = 0; i < 256; i++) {
             v     = palette[i];
             alpha = v >> 24;
-            if (alpha && alpha != 0xff)
+            if (alpha != 0xff)
                 has_alpha = 1;
             *alpha_ptr++ = alpha;
             bytestream_put_be24(&ptr, v);
@@ -376,67 +422,71 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         }
     }
 
-    /* write stereoscopic information */
-    side_data = av_frame_get_side_data(pict, AV_FRAME_DATA_STEREO3D);
-    if (side_data) {
-        AVStereo3D *stereo3d = (AVStereo3D *)side_data->data;
-        uint8_t sm;
-        switch (stereo3d->type) {
-        case AV_STEREO3D_SIDEBYSIDE:
-            sm = !(stereo3d->flags & AV_STEREO3D_FLAG_INVERT);
-            png_write_chunk(&s->bytestream, MKTAG('s', 'T', 'E', 'R'), &sm, 1);
-            break;
-        case AV_STEREO3D_2D:
-            break;
-        default:
-            av_log(avctx, AV_LOG_WARNING,
-                   "Only side-by-side stereo3d flag can be defined within sTER chunk\n");
-            break;
+    return 0;
+}
+
+static int encode_frame(AVCodecContext *avctx, const AVFrame *pict)
+{
+    PNGEncContext *s       = avctx->priv_data;
+    const AVFrame *const p = pict;
+    int y, len, ret;
+    int row_size, pass_row_size;
+    uint8_t *ptr, *top, *crow_buf, *crow;
+    uint8_t *crow_base       = NULL;
+    uint8_t *progressive_buf = NULL;
+    uint8_t *top_buf         = NULL;
+
+    row_size = (pict->width * s->bits_per_pixel + 7) >> 3;
+
+    crow_base = av_malloc((row_size + 32) << (s->filter_type == PNG_FILTER_VALUE_MIXED));
+    if (!crow_base) {
+        ret = AVERROR(ENOMEM);
+        goto the_end;
+    }
+    // pixel data should be aligned, but there's a control byte before it
+    crow_buf = crow_base + 15;
+    if (s->is_progressive) {
+        progressive_buf = av_malloc(row_size + 1);
+        top_buf = av_malloc(row_size + 1);
+        if (!progressive_buf || !top_buf) {
+            ret = AVERROR(ENOMEM);
+            goto the_end;
         }
     }
 
-    /* now put each row */
+    /* put each row */
     s->zstream.avail_out = IOBUF_SIZE;
     s->zstream.next_out  = s->buf;
-    if (is_progressive) {
+    if (s->is_progressive) {
         int pass;
 
         for (pass = 0; pass < NB_PASSES; pass++) {
             /* NOTE: a pass is completely omitted if no pixels would be
              * output */
-            pass_row_size = ff_png_pass_row_size(pass, bits_per_pixel, avctx->width);
+            pass_row_size = ff_png_pass_row_size(pass, s->bits_per_pixel, pict->width);
             if (pass_row_size > 0) {
                 top = NULL;
-                for (y = 0; y < avctx->height; y++)
+                for (y = 0; y < pict->height; y++)
                     if ((ff_png_pass_ymask[pass] << (y & 7)) & 0x80) {
                         ptr = p->data[0] + y * p->linesize[0];
                         FFSWAP(uint8_t *, progressive_buf, top_buf);
-                        if (color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
-                            convert_from_rgb32(rgba_buf, ptr, avctx->width);
-                            ptr = rgba_buf;
-                        }
                         png_get_interlaced_row(progressive_buf, pass_row_size,
-                                               bits_per_pixel, pass,
-                                               ptr, avctx->width);
+                                               s->bits_per_pixel, pass,
+                                               ptr, pict->width);
                         crow = png_choose_filter(s, crow_buf, progressive_buf,
-                                                 top, pass_row_size, bits_per_pixel >> 3);
-                        png_write_row(s, crow, pass_row_size + 1);
+                                                 top, pass_row_size, s->bits_per_pixel >> 3);
+                        png_write_row(avctx, crow, pass_row_size + 1);
                         top = progressive_buf;
                     }
             }
         }
     } else {
         top = NULL;
-        for (y = 0; y < avctx->height; y++) {
+        for (y = 0; y < pict->height; y++) {
             ptr = p->data[0] + y * p->linesize[0];
-            if (color_type == PNG_COLOR_TYPE_RGB_ALPHA) {
-                FFSWAP(uint8_t *, rgba_buf, top_buf);
-                convert_from_rgb32(rgba_buf, ptr, avctx->width);
-                ptr = rgba_buf;
-            }
             crow = png_choose_filter(s, crow_buf, ptr, top,
-                                     row_size, bits_per_pixel >> 3);
-            png_write_row(s, crow, row_size + 1);
+                                     row_size, s->bits_per_pixel >> 3);
+            png_write_row(avctx, crow, row_size + 1);
             top = ptr;
         }
     }
@@ -446,38 +496,507 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         if (ret == Z_OK || ret == Z_STREAM_END) {
             len = IOBUF_SIZE - s->zstream.avail_out;
             if (len > 0 && s->bytestream_end - s->bytestream > len + 100) {
-                png_write_chunk(&s->bytestream, MKTAG('I', 'D', 'A', 'T'), s->buf, len);
+                png_write_image_data(avctx, s->buf, len);
             }
             s->zstream.avail_out = IOBUF_SIZE;
             s->zstream.next_out  = s->buf;
             if (ret == Z_STREAM_END)
                 break;
         } else {
-            goto fail;
+            ret = -1;
+            goto the_end;
         }
     }
+
+    ret = 0;
+
+the_end:
+    av_freep(&crow_base);
+    av_freep(&progressive_buf);
+    av_freep(&top_buf);
+    deflateReset(&s->zstream);
+    return ret;
+}
+
+static int encode_png(AVCodecContext *avctx, AVPacket *pkt,
+                      const AVFrame *pict, int *got_packet)
+{   // Encode pict into pkt as one standalone PNG; returns 0 and sets *got_packet, or a negative error code.
+    PNGEncContext *s = avctx->priv_data;
+    int ret;
+    int enc_row_size;
+    uint64_t max_packet_size; // 64-bit: with a 32-bit size_t the sum below was truncated before the INT_MAX check
+
+    enc_row_size    = deflateBound(&s->zstream, (avctx->width * s->bits_per_pixel + 7) >> 3);
+    max_packet_size =
+        AV_INPUT_BUFFER_MIN_SIZE + // headers
+        avctx->height * (
+            enc_row_size +
+            12 * (((int64_t)enc_row_size + IOBUF_SIZE - 1) / IOBUF_SIZE) // IDAT * ceil(enc_row_size / IOBUF_SIZE)
+        );
+    if (max_packet_size > INT_MAX) // worst-case packet would not fit the int-sized packet API
+        return AVERROR(ENOMEM);
+    ret = ff_alloc_packet2(avctx, pkt, max_packet_size, 0);
+    if (ret < 0)
+        return ret;
+
+    s->bytestream_start =
+    s->bytestream       = pkt->data; // all chunk writers below append at s->bytestream
+    s->bytestream_end   = pkt->data + pkt->size;
+
+    AV_WB64(s->bytestream, PNGSIG); // 8-byte file signature
+    s->bytestream += 8;
+
+    ret = encode_headers(avctx, pict);
+    if (ret < 0)
+        return ret;
+
+    ret = encode_frame(avctx, pict);
+    if (ret < 0)
+        return ret;
+
     png_write_chunk(&s->bytestream, MKTAG('I', 'E', 'N', 'D'), NULL, 0);
 
-    pkt->size   = s->bytestream - s->bytestream_start;
+    pkt->size = s->bytestream - s->bytestream_start; // shrink to the bytes actually written
     pkt->flags |= AV_PKT_FLAG_KEY;
     *got_packet = 1;
-    ret         = 0;
 
-the_end:
-    av_free(crow_base);
-    av_free(progressive_buf);
-    av_free(rgba_buf);
-    av_free(top_buf);
-    deflateEnd(&s->zstream);
-    return ret;
+    return 0;
+}
+
+static int apng_do_inverse_blend(AVFrame *output, const AVFrame *input,
+                                  APNGFctlChunk *fctl_chunk, uint8_t bpp)
+{
+    // On entry output holds the background and input the foreground; pixels are compared as bpp-byte units.
+    // On success (0) output holds the cropped image which, blended onto the background with fctl_chunk->blend_op,
+    // yields the foreground; output/fctl_chunk size and offset describe the crop. Returns -1 if that is not possible.
+    unsigned int x, y;
+    unsigned int leftmost_x = input->width;   // bounding box starts out inverted, i.e. empty
+    unsigned int rightmost_x = 0;             // exclusive bound
+    unsigned int topmost_y = input->height;
+    unsigned int bottommost_y = 0;            // exclusive bound
+    const uint8_t *input_data = input->data[0];
+    uint8_t *output_data = output->data[0];
+    ptrdiff_t input_linesize = input->linesize[0];
+    ptrdiff_t output_linesize = output->linesize[0];
+
+    // Find bounding box of the pixels that differ between foreground and background
+    for (y = 0; y < input->height; ++y) {
+        for (x = 0; x < input->width; ++x) {
+            if (!memcmp(input_data + bpp * x, output_data + bpp * x, bpp))
+                continue;
+
+            if (x < leftmost_x)
+                leftmost_x = x;
+            if (x >= rightmost_x)
+                rightmost_x = x + 1;
+            if (y < topmost_y)
+                topmost_y = y;
+            if (y >= bottommost_y)
+                bottommost_y = y + 1;
+        }
+
+        input_data += input_linesize;
+        output_data += output_linesize;
+    }
+
+    if (leftmost_x == input->width && rightmost_x == 0) {
+        // Empty frame (no pixel differed)
+        // APNG does not support empty frames, so we make it a 1x1 frame
+        leftmost_x = topmost_y = 0;
+        rightmost_x = bottommost_y = 1;
+    }
+
+    // Do actual inverse blending
+    if (fctl_chunk->blend_op == APNG_BLEND_OP_SOURCE) {
+        output_data = output->data[0]; // cropped rows are packed starting at output's top-left corner
+        for (y = topmost_y; y < bottommost_y; ++y) {
+            memcpy(output_data,
+                   input->data[0] + input_linesize * y + bpp * leftmost_x,
+                   bpp * (rightmost_x - leftmost_x));
+            output_data += output_linesize;
+        }
+    } else { // APNG_BLEND_OP_OVER
+        size_t transparent_palette_index; // PAL8 only: first palette entry with alpha 0, or 256 if there is none
+        uint32_t *palette;                // PAL8 only: set from input->data[1]
+
+        switch (input->format) {
+        case AV_PIX_FMT_RGBA64BE:
+        case AV_PIX_FMT_YA16BE:
+        case AV_PIX_FMT_RGBA:
+        case AV_PIX_FMT_GRAY8A:
+            break;
+
+        case AV_PIX_FMT_PAL8:
+            palette = (uint32_t*)input->data[1];
+            for (transparent_palette_index = 0; transparent_palette_index < 256; ++transparent_palette_index)
+                if (palette[transparent_palette_index] >> 24 == 0)
+                    break;
+            break;
+
+        default:
+            // No alpha, so blending not possible
+            return -1;
+        }
+
+        for (y = topmost_y; y < bottommost_y; ++y) {
+            uint8_t *foreground = input->data[0] + input_linesize * y + bpp * leftmost_x;
+            uint8_t *background = output->data[0] + output_linesize * y + bpp * leftmost_x;
+            output_data = output->data[0] + output_linesize * (y - topmost_y); // written in place, at or before the background read position
+            for (x = leftmost_x; x < rightmost_x; ++x, foreground += bpp, background += bpp, output_data += bpp) {
+                if (!memcmp(foreground, background, bpp)) { // unchanged pixel: emit a fully transparent one
+                    if (input->format == AV_PIX_FMT_PAL8) {
+                        if (transparent_palette_index == 256) {
+                            // Need fully transparent colour, but none exists
+                            return -1;
+                        }
+
+                        *output_data = transparent_palette_index;
+                    } else {
+                        memset(output_data, 0, bpp);
+                    }
+                    continue;
+                }
+
+                // Check for special alpha values, since full inverse
+                // alpha-on-alpha blending is rarely possible, and when
+                // possible, doesn't compress much better than
+                // APNG_BLEND_OP_SOURCE blending
+                switch (input->format) { // each case accepts only opaque foreground or fully transparent background
+                case AV_PIX_FMT_RGBA64BE:
+                    if (((uint16_t*)foreground)[3] == 0xffff ||
+                        ((uint16_t*)background)[3] == 0)
+                        break;
+                    return -1;
+
+                case AV_PIX_FMT_YA16BE:
+                    if (((uint16_t*)foreground)[1] == 0xffff ||
+                        ((uint16_t*)background)[1] == 0)
+                        break;
+                    return -1;
+
+                case AV_PIX_FMT_RGBA:
+                    if (foreground[3] == 0xff || background[3] == 0)
+                        break;
+                    return -1;
+
+                case AV_PIX_FMT_GRAY8A:
+                    if (foreground[1] == 0xff || background[1] == 0)
+                        break;
+                    return -1;
+
+                case AV_PIX_FMT_PAL8:
+                    if (palette[*foreground] >> 24 == 0xff ||
+                        palette[*background] >> 24 == 0)
+                        break;
+                    return -1;
+                }
+
+                memmove(output_data, foreground, bpp); // changed pixel: the foreground value is stored as is
+            }
+        }
+    }
+
+    output->width = rightmost_x - leftmost_x; // publish the crop rectangle to the frame and to the fcTL data
+    output->height = bottommost_y - topmost_y;
+    fctl_chunk->width = output->width;
+    fctl_chunk->height = output->height;
+    fctl_chunk->x_offset = leftmost_x;
+    fctl_chunk->y_offset = topmost_y;
+
+    return 0;
+}
+
+static int apng_encode_frame(AVCodecContext *avctx, const AVFrame *pict,
+                             APNGFctlChunk *best_fctl_chunk, APNGFctlChunk *best_last_fctl_chunk)
+{   // Try every dispose_op (previous frame) x blend_op (this frame) pair and keep the smallest encoding at s->bytestream.
+    PNGEncContext *s = avctx->priv_data;
+    int ret;
+    unsigned int y;
+    AVFrame* diffFrame;
+    uint8_t bpp = (s->bits_per_pixel + 7) >> 3;
+    uint8_t *original_bytestream, *original_bytestream_end;
+    uint8_t *temp_bytestream = NULL, *temp_bytestream_end;
+    uint32_t best_sequence_number;
+    uint8_t *best_bytestream;
+    size_t best_bytestream_size = SIZE_MAX;
+    APNGFctlChunk last_fctl_chunk = *best_last_fctl_chunk;
+    APNGFctlChunk fctl_chunk = *best_fctl_chunk;
+
+    if (avctx->frame_number == 0) { // first frame: nothing to diff against, encode it whole
+        best_fctl_chunk->width = pict->width;
+        best_fctl_chunk->height = pict->height;
+        best_fctl_chunk->x_offset = 0;
+        best_fctl_chunk->y_offset = 0;
+        best_fctl_chunk->blend_op = APNG_BLEND_OP_SOURCE;
+        return encode_frame(avctx, pict);
+    }
+
+    diffFrame = av_frame_alloc();
+    if (!diffFrame)
+        return AVERROR(ENOMEM);
+
+    diffFrame->format = pict->format;
+    diffFrame->width = pict->width;
+    diffFrame->height = pict->height;
+    if ((ret = av_frame_get_buffer(diffFrame, 32)) < 0)
+        goto fail;
+
+    original_bytestream = s->bytestream;
+    original_bytestream_end = s->bytestream_end;
+
+    temp_bytestream = av_malloc(original_bytestream_end - original_bytestream);
+    if (!temp_bytestream) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+    temp_bytestream_end = temp_bytestream + (original_bytestream_end - original_bytestream); // only computed once the allocation is known to be non-NULL
+
+    for (last_fctl_chunk.dispose_op = 0; last_fctl_chunk.dispose_op < 3; ++last_fctl_chunk.dispose_op) {
+        // 0: APNG_DISPOSE_OP_NONE
+        // 1: APNG_DISPOSE_OP_BACKGROUND
+        // 2: APNG_DISPOSE_OP_PREVIOUS
+
+        for (fctl_chunk.blend_op = 0; fctl_chunk.blend_op < 2; ++fctl_chunk.blend_op) {
+            // 0: APNG_BLEND_OP_SOURCE
+            // 1: APNG_BLEND_OP_OVER
+
+            uint32_t original_sequence_number = s->sequence_number, sequence_number;
+            uint8_t *bytestream_start = s->bytestream;
+            size_t bytestream_size;
+
+            // Do disposal
+            if (last_fctl_chunk.dispose_op != APNG_DISPOSE_OP_PREVIOUS) {
+                diffFrame->width = pict->width; // apng_do_inverse_blend() shrinks these, so reset per attempt
+                diffFrame->height = pict->height;
+                ret = av_frame_copy(diffFrame, s->last_frame);
+                if (ret < 0)
+                    goto fail;
+
+                if (last_fctl_chunk.dispose_op == APNG_DISPOSE_OP_BACKGROUND) {
+                    for (y = last_fctl_chunk.y_offset; y < last_fctl_chunk.y_offset + last_fctl_chunk.height; ++y) {
+                        size_t row_start = diffFrame->linesize[0] * y + bpp * last_fctl_chunk.x_offset;
+                        memset(diffFrame->data[0] + row_start, 0, bpp * last_fctl_chunk.width);
+                    }
+                }
+            } else {
+                if (!s->prev_frame) // no earlier canvas to revert to, so this dispose_op cannot be used
+                    continue;
+
+                diffFrame->width = pict->width;
+                diffFrame->height = pict->height;
+                ret = av_frame_copy(diffFrame, s->prev_frame);
+                if (ret < 0)
+                    goto fail;
+            }
+
+            // Do inverse blending
+            if (apng_do_inverse_blend(diffFrame, pict, &fctl_chunk, bpp) < 0)
+                continue;
+
+            // Do encoding
+            ret = encode_frame(avctx, diffFrame);
+            sequence_number = s->sequence_number;
+            s->sequence_number = original_sequence_number; // rewind so the next attempt starts from the same state
+            bytestream_size = s->bytestream - bytestream_start;
+            s->bytestream = bytestream_start;
+            if (ret < 0)
+                goto fail;
+
+            if (bytestream_size < best_bytestream_size) {
+                *best_fctl_chunk = fctl_chunk;
+                *best_last_fctl_chunk = last_fctl_chunk;
+
+                best_sequence_number = sequence_number;
+                best_bytestream = s->bytestream;
+                best_bytestream_size = bytestream_size;
+
+                if (best_bytestream == original_bytestream) { // keep the best result intact: write later attempts into the other buffer
+                    s->bytestream = temp_bytestream;
+                    s->bytestream_end = temp_bytestream_end;
+                } else {
+                    s->bytestream = original_bytestream;
+                    s->bytestream_end = original_bytestream_end;
+                }
+            }
+        }
+    }
+
+    s->sequence_number = best_sequence_number;
+    s->bytestream = original_bytestream + best_bytestream_size;
+    s->bytestream_end = original_bytestream_end;
+    if (best_bytestream != original_bytestream) // best result lives in the scratch buffer, move it into place
+        memcpy(original_bytestream, best_bytestream, best_bytestream_size);
+
+    ret = 0;
+
 fail:
-    ret = -1;
-    goto the_end;
+    av_freep(&temp_bytestream);
+    av_frame_free(&diffFrame);
+    return ret;
+}
+
+static int encode_apng(AVCodecContext *avctx, AVPacket *pkt,
+                       const AVFrame *pict, int *got_packet)
+{   // Runs one frame behind: the packet for the previous frame is emitted when the next frame (or a NULL flush) arrives.
+    PNGEncContext *s = avctx->priv_data;
+    int ret;
+    int enc_row_size;
+    uint64_t max_packet_size; // 64-bit: with a 32-bit size_t the sum below was truncated before the INT_MAX check
+    APNGFctlChunk fctl_chunk = {0};
+
+    if (pict && avctx->codec_id == AV_CODEC_ID_APNG && s->color_type == PNG_COLOR_TYPE_PALETTE) {
+        uint32_t checksum = ~av_crc(av_crc_get_table(AV_CRC_32_IEEE_LE), ~0U, pict->data[1], 256 * sizeof(uint32_t));
+
+        if (avctx->frame_number == 0) {
+            s->palette_checksum = checksum; // remember the first frame's palette; later frames must match it
+        } else if (checksum != s->palette_checksum) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Input contains more than one unique palette. APNG does not support multiple palettes.\n");
+            return -1;
+        }
+    }
+
+    enc_row_size    = deflateBound(&s->zstream, (avctx->width * s->bits_per_pixel + 7) >> 3);
+    max_packet_size =
+        AV_INPUT_BUFFER_MIN_SIZE + // headers
+        avctx->height * (
+            enc_row_size +
+            (4 + 12) * (((int64_t)enc_row_size + IOBUF_SIZE - 1) / IOBUF_SIZE) // fdAT * ceil(enc_row_size / IOBUF_SIZE)
+        );
+    if (max_packet_size > INT_MAX)
+        return AVERROR(ENOMEM);
+
+    if (avctx->frame_number == 0) {
+        if (!pict)
+            return AVERROR(EINVAL);
+
+        s->bytestream = avctx->extradata = av_malloc(AV_INPUT_BUFFER_MIN_SIZE); // header chunks are written to extradata, not to a packet
+        if (!avctx->extradata)
+            return AVERROR(ENOMEM);
+
+        ret = encode_headers(avctx, pict);
+        if (ret < 0)
+            return ret;
+
+        avctx->extradata_size = s->bytestream - avctx->extradata;
+
+        s->last_frame_packet = av_malloc(max_packet_size);
+        if (!s->last_frame_packet)
+            return AVERROR(ENOMEM);
+    } else if (s->last_frame) {
+        ret = ff_alloc_packet2(avctx, pkt, max_packet_size, 0);
+        if (ret < 0)
+            return ret;
+
+        memcpy(pkt->data, s->last_frame_packet, s->last_frame_packet_size); // output the frame encoded on the previous call
+        pkt->size = s->last_frame_packet_size;
+        pkt->pts = pkt->dts = s->last_frame->pts;
+    }
+
+    if (pict) {
+        s->bytestream_start =
+        s->bytestream       = s->last_frame_packet;
+        s->bytestream_end   = s->bytestream + max_packet_size;
+
+        // We're encoding the frame first, so we have to do a bit of shuffling around
+        // to have the image data write to the correct place in the buffer
+        fctl_chunk.sequence_number = s->sequence_number;
+        ++s->sequence_number;
+        s->bytestream += 26 + 12; // leave room for the fcTL chunk (26-byte payload; the 12 is presumably chunk framing)
+
+        ret = apng_encode_frame(avctx, pict, &fctl_chunk, &s->last_frame_fctl); // may also pick the previous frame's dispose_op
+        if (ret < 0)
+            return ret;
+
+        fctl_chunk.delay_num = 0; // delay filled in during muxing
+        fctl_chunk.delay_den = 0;
+    } else {
+        s->last_frame_fctl.dispose_op = APNG_DISPOSE_OP_NONE; // flushing: no next frame to choose a disposal for
+    }
+
+    if (s->last_frame) {
+        uint8_t* last_fctl_chunk_start = pkt->data; // fcTL goes into the space reserved at the packet start
+        uint8_t buf[26];
+
+        AV_WB32(buf + 0, s->last_frame_fctl.sequence_number);
+        AV_WB32(buf + 4, s->last_frame_fctl.width);
+        AV_WB32(buf + 8, s->last_frame_fctl.height);
+        AV_WB32(buf + 12, s->last_frame_fctl.x_offset);
+        AV_WB32(buf + 16, s->last_frame_fctl.y_offset);
+        AV_WB16(buf + 20, s->last_frame_fctl.delay_num);
+        AV_WB16(buf + 22, s->last_frame_fctl.delay_den);
+        buf[24] = s->last_frame_fctl.dispose_op;
+        buf[25] = s->last_frame_fctl.blend_op;
+        png_write_chunk(&last_fctl_chunk_start, MKTAG('f', 'c', 'T', 'L'), buf, 26);
+
+        *got_packet = 1;
+    }
+
+    if (pict) {
+        if (!s->last_frame) {
+            s->last_frame = av_frame_alloc();
+            if (!s->last_frame)
+                return AVERROR(ENOMEM);
+        } else if (s->last_frame_fctl.dispose_op != APNG_DISPOSE_OP_PREVIOUS) {
+            if (!s->prev_frame) {
+                s->prev_frame = av_frame_alloc();
+                if (!s->prev_frame)
+                    return AVERROR(ENOMEM);
+
+                s->prev_frame->format = pict->format;
+                s->prev_frame->width = pict->width;
+                s->prev_frame->height = pict->height;
+                if ((ret = av_frame_get_buffer(s->prev_frame, 32)) < 0)
+                    return ret;
+            }
+
+            // Do disposal, but not blending
+            av_frame_copy(s->prev_frame, s->last_frame); // NOTE(review): return value is not checked
+            if (s->last_frame_fctl.dispose_op == APNG_DISPOSE_OP_BACKGROUND) {
+                uint32_t y;
+                uint8_t bpp = (s->bits_per_pixel + 7) >> 3;
+                for (y = s->last_frame_fctl.y_offset; y < s->last_frame_fctl.y_offset + s->last_frame_fctl.height; ++y) {
+                    size_t row_start = s->prev_frame->linesize[0] * y + bpp * s->last_frame_fctl.x_offset;
+                    memset(s->prev_frame->data[0] + row_start, 0, bpp * s->last_frame_fctl.width);
+                }
+            }
+        }
+
+        av_frame_unref(s->last_frame);
+        ret = av_frame_ref(s->last_frame, (AVFrame*)pict);
+        if (ret < 0)
+            return ret;
+
+        s->last_frame_fctl = fctl_chunk; // becomes the "previous frame" state for the next call
+        s->last_frame_packet_size = s->bytestream - s->bytestream_start;
+    } else {
+        av_frame_free(&s->last_frame); // flush done: with last_frame gone, later calls emit no packet
+    }
+
+    return 0;
 }
 
 static av_cold int png_enc_init(AVCodecContext *avctx)
 {
     PNGEncContext *s = avctx->priv_data;
+    int compression_level;
+
+    switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_RGBA:
+        avctx->bits_per_coded_sample = 32;
+        break;
+    case AV_PIX_FMT_RGB24:
+        avctx->bits_per_coded_sample = 24;
+        break;
+    case AV_PIX_FMT_GRAY8:
+        avctx->bits_per_coded_sample = 0x28;
+        break;
+    case AV_PIX_FMT_MONOBLACK:
+        avctx->bits_per_coded_sample = 1;
+        break;
+    case AV_PIX_FMT_PAL8:
+        avctx->bits_per_coded_sample = 8;
+    }
 
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
@@ -500,41 +1019,150 @@ FF_ENABLE_DEPRECATION_WARNINGS
     if (avctx->pix_fmt == AV_PIX_FMT_MONOBLACK)
         s->filter_type = PNG_FILTER_VALUE_NONE;
 
+    if (s->dpi && s->dpm) {
+      av_log(avctx, AV_LOG_ERROR, "Only one of 'dpi' or 'dpm' options should be set\n");
+      return AVERROR(EINVAL);
+    } else if (s->dpi) {
+      s->dpm = s->dpi * 10000 / 254;
+    }
+
+    s->is_progressive = !!(avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT);
+    switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_RGBA64BE:
+        s->bit_depth = 16;
+        s->color_type = PNG_COLOR_TYPE_RGB_ALPHA;
+        break;
+    case AV_PIX_FMT_RGB48BE:
+        s->bit_depth = 16;
+        s->color_type = PNG_COLOR_TYPE_RGB;
+        break;
+    case AV_PIX_FMT_RGBA:
+        s->bit_depth  = 8;
+        s->color_type = PNG_COLOR_TYPE_RGB_ALPHA;
+        break;
+    case AV_PIX_FMT_RGB24:
+        s->bit_depth  = 8;
+        s->color_type = PNG_COLOR_TYPE_RGB;
+        break;
+    case AV_PIX_FMT_GRAY16BE:
+        s->bit_depth  = 16;
+        s->color_type = PNG_COLOR_TYPE_GRAY;
+        break;
+    case AV_PIX_FMT_GRAY8:
+        s->bit_depth  = 8;
+        s->color_type = PNG_COLOR_TYPE_GRAY;
+        break;
+    case AV_PIX_FMT_GRAY8A:
+        s->bit_depth = 8;
+        s->color_type = PNG_COLOR_TYPE_GRAY_ALPHA;
+        break;
+    case AV_PIX_FMT_YA16BE:
+        s->bit_depth = 16;
+        s->color_type = PNG_COLOR_TYPE_GRAY_ALPHA;
+        break;
+    case AV_PIX_FMT_MONOBLACK:
+        s->bit_depth  = 1;
+        s->color_type = PNG_COLOR_TYPE_GRAY;
+        break;
+    case AV_PIX_FMT_PAL8:
+        s->bit_depth  = 8;
+        s->color_type = PNG_COLOR_TYPE_PALETTE;
+        break;
+    default:
+        return -1;
+    }
+    s->bits_per_pixel = ff_png_get_nb_channels(s->color_type) * s->bit_depth;
+
+    s->zstream.zalloc = ff_png_zalloc;
+    s->zstream.zfree  = ff_png_zfree;
+    s->zstream.opaque = NULL;
+    compression_level = avctx->compression_level == FF_COMPRESSION_DEFAULT
+                      ? Z_DEFAULT_COMPRESSION
+                      : av_clip(avctx->compression_level, 0, 9);
+    if (deflateInit2(&s->zstream, compression_level, Z_DEFLATED, 15, 8, Z_DEFAULT_STRATEGY) != Z_OK)
+        return -1;
+
+    return 0;
+}
+
+static av_cold int png_enc_close(AVCodecContext *avctx)
+{   // Encoder close callback: tears down the zlib stream and frees the APNG state kept in PNGEncContext.
+    PNGEncContext *s = avctx->priv_data;
+
+    deflateEnd(&s->zstream);          // pairs with deflateInit2() in png_enc_init()
+    av_frame_free(&s->last_frame);
+    av_frame_free(&s->prev_frame);
+    av_freep(&s->last_frame_packet);
     return 0;
 }
 
 #define OFFSET(x) offsetof(PNGEncContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
-{ "pred", "Prediction method", OFFSET(filter_type), AV_OPT_TYPE_INT, { .i64 = PNG_FILTER_VALUE_NONE }, PNG_FILTER_VALUE_NONE, PNG_FILTER_VALUE_MIXED, VE, "pred" },
-    { "none",  NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PNG_FILTER_VALUE_NONE },  INT_MIN, INT_MAX, VE, "pred" },
-    { "sub",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PNG_FILTER_VALUE_SUB },   INT_MIN, INT_MAX, VE, "pred" },
-    { "up",    NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PNG_FILTER_VALUE_UP },    INT_MIN, INT_MAX, VE, "pred" },
-    { "avg",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PNG_FILTER_VALUE_AVG },   INT_MIN, INT_MAX, VE, "pred" },
-    { "paeth", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PNG_FILTER_VALUE_PAETH }, INT_MIN, INT_MAX, VE, "pred" },
-    { "mixed", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PNG_FILTER_VALUE_MIXED }, INT_MIN, INT_MAX, VE, "pred" },
-
+    {"dpi", "Set image resolution (in dots per inch)",  OFFSET(dpi), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 0x10000, VE}, // png_enc_init() rejects dpi together with dpm and converts dpi to dpm
+    {"dpm", "Set image resolution (in dots per meter)", OFFSET(dpm), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 0x10000, VE},
+    { "pred", "Prediction method", OFFSET(filter_type), AV_OPT_TYPE_INT, { .i64 = PNG_FILTER_VALUE_NONE }, PNG_FILTER_VALUE_NONE, PNG_FILTER_VALUE_MIXED, VE, "pred" }, // named values for the "pred" unit follow
+        { "none",  NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PNG_FILTER_VALUE_NONE },  INT_MIN, INT_MAX, VE, "pred" },
+        { "sub",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PNG_FILTER_VALUE_SUB },   INT_MIN, INT_MAX, VE, "pred" },
+        { "up",    NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PNG_FILTER_VALUE_UP },    INT_MIN, INT_MAX, VE, "pred" },
+        { "avg",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PNG_FILTER_VALUE_AVG },   INT_MIN, INT_MAX, VE, "pred" },
+        { "paeth", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PNG_FILTER_VALUE_PAETH }, INT_MIN, INT_MAX, VE, "pred" },
+        { "mixed", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PNG_FILTER_VALUE_MIXED }, INT_MIN, INT_MAX, VE, "pred" },
     { NULL},
 };
 
-static const AVClass png_class = {
-    .class_name = "png",
+static const AVClass pngenc_class = { // private option class referenced by ff_png_encoder.priv_class
+    .class_name = "PNG encoder",
     .item_name  = av_default_item_name,
     .option     = options,
     .version    = LIBAVUTIL_VERSION_INT,
 };
+
+static const AVClass apngenc_class = { // private option class referenced by ff_apng_encoder.priv_class
+    .class_name = "APNG encoder",
+    .item_name  = av_default_item_name,
+    .option     = options, // same option table as the PNG encoder class
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_png_encoder = {
     .name           = "png",
     .long_name      = NULL_IF_CONFIG_SMALL("PNG (Portable Network Graphics) image"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_PNG,
     .priv_data_size = sizeof(PNGEncContext),
-    .priv_class     = &png_class,
     .init           = png_enc_init,
-    .encode2        = encode_frame,
+    .close          = png_enc_close, // the zlib stream now lives for the whole encoder session and is ended here
+    .encode2        = encode_png,    // one complete PNG file per input frame
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
+    .pix_fmts       = (const enum AVPixelFormat[]) { // must stay in sync with the pix_fmt switch in png_enc_init()
+        AV_PIX_FMT_RGB24, AV_PIX_FMT_RGBA,
+        AV_PIX_FMT_RGB48BE, AV_PIX_FMT_RGBA64BE,
+        AV_PIX_FMT_PAL8,
+        AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAY8A,
+        AV_PIX_FMT_GRAY16BE, AV_PIX_FMT_YA16BE,
+        AV_PIX_FMT_MONOBLACK, AV_PIX_FMT_NONE
+    },
+    .priv_class     = &pngenc_class,
+};
+
+AVCodec ff_apng_encoder = {
+    .name           = "apng",
+    .long_name      = NULL_IF_CONFIG_SMALL("APNG (Animated Portable Network Graphics) image"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_APNG,
+    .priv_data_size = sizeof(PNGEncContext),
+    .init           = png_enc_init,
+    .close          = png_enc_close,
+    .encode2        = encode_apng,
+    .capabilities   = AV_CODEC_CAP_DELAY, // encode_apng() emits each frame one call late and needs a NULL-frame flush
     .pix_fmts       = (const enum AVPixelFormat[]) {
-        AV_PIX_FMT_RGB24, AV_PIX_FMT_RGB32, AV_PIX_FMT_PAL8, AV_PIX_FMT_GRAY8,
-        AV_PIX_FMT_RGBA64BE, AV_PIX_FMT_RGB48BE, AV_PIX_FMT_GRAY16BE,
+        AV_PIX_FMT_RGB24, AV_PIX_FMT_RGBA,
+        AV_PIX_FMT_RGB48BE, AV_PIX_FMT_RGBA64BE,
+        AV_PIX_FMT_PAL8,
+        AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAY8A,
+        AV_PIX_FMT_GRAY16BE, AV_PIX_FMT_YA16BE,
         AV_PIX_FMT_MONOBLACK, AV_PIX_FMT_NONE
     },
+    .priv_class     = &apngenc_class,
 };
diff --git a/libavcodec/pnm.c b/libavcodec/pnm.c
index 1c380b0..1675959 100644
--- a/libavcodec/pnm.c
+++ b/libavcodec/pnm.c
@@ -2,20 +2,20 @@
  * PNM image format
  * Copyright (c) 2002, 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,12 +37,12 @@ static void pnm_get(PNMContext *sc, char *str, int buf_size)
     int c;
 
     /* skip spaces and comments */
-    for (;;) {
+    while (sc->bytestream < sc->bytestream_end) {
         c = *sc->bytestream++;
         if (c == '#')  {
-            do {
+            while (c != '\n' && sc->bytestream < sc->bytestream_end) {
                 c = *sc->bytestream++;
-            } while (c != '\n' && sc->bytestream < sc->bytestream_end);
+            }
         } else if (!pnm_space(c)) {
             break;
         }
@@ -63,9 +63,9 @@ int ff_pnm_decode_header(AVCodecContext *avctx, PNMContext * const s)
     int h, w, depth, maxval;
 
     pnm_get(s, buf1, sizeof(buf1));
-    s->type= buf1[1]-'0';
     if(buf1[0] != 'P')
         return AVERROR_INVALIDDATA;
+    s->type= buf1[1]-'0';
 
     if (s->type==1 || s->type==4) {
         avctx->pix_fmt = AV_PIX_FMT_MONOWHITE;
@@ -107,26 +107,38 @@ int ff_pnm_decode_header(AVCodecContext *avctx, PNMContext * const s)
             }
         }
         /* check that all tags are present */
-        if (w <= 0 || h <= 0 || maxval <= 0 || depth <= 0 || tuple_type[0] == '\0' || av_image_check_size(w, h, 0, avctx))
+        if (w <= 0 || h <= 0 || maxval <= 0 || depth <= 0 || tuple_type[0] == '\0' || av_image_check_size(w, h, 0, avctx) || s->bytestream >= s->bytestream_end)
             return AVERROR_INVALIDDATA;
 
         avctx->width  = w;
         avctx->height = h;
+        s->maxval     = maxval;
         if (depth == 1) {
-            if (maxval == 1)
-                avctx->pix_fmt = AV_PIX_FMT_MONOWHITE;
-            else
+            if (maxval == 1) {
+                avctx->pix_fmt = AV_PIX_FMT_MONOBLACK;
+            } else if (maxval < 256) {
                 avctx->pix_fmt = AV_PIX_FMT_GRAY8;
+            } else {
+                avctx->pix_fmt = AV_PIX_FMT_GRAY16;
+            }
+        } else if (depth == 2) {
+            if (maxval < 256) {
+                avctx->pix_fmt = AV_PIX_FMT_GRAY8A;
+            } else {
+                avctx->pix_fmt = AV_PIX_FMT_YA16;
+            }
         } else if (depth == 3) {
             if (maxval < 256) {
-            avctx->pix_fmt = AV_PIX_FMT_RGB24;
+                avctx->pix_fmt = AV_PIX_FMT_RGB24;
             } else {
-                av_log(avctx, AV_LOG_ERROR, "16-bit components are only supported for grayscale\n");
-                avctx->pix_fmt = AV_PIX_FMT_NONE;
-                return AVERROR_INVALIDDATA;
+                avctx->pix_fmt = AV_PIX_FMT_RGB48;
             }
         } else if (depth == 4) {
-            avctx->pix_fmt = AV_PIX_FMT_RGB32;
+            if (maxval < 256) {
+                avctx->pix_fmt = AV_PIX_FMT_RGBA;
+            } else {
+                avctx->pix_fmt = AV_PIX_FMT_RGBA64;
+            }
         } else {
             return AVERROR_INVALIDDATA;
         }
@@ -135,14 +147,16 @@ int ff_pnm_decode_header(AVCodecContext *avctx, PNMContext * const s)
         return AVERROR_INVALIDDATA;
     }
     pnm_get(s, buf1, sizeof(buf1));
-    avctx->width = atoi(buf1);
-    if (avctx->width <= 0)
-        return AVERROR_INVALIDDATA;
+    w = atoi(buf1);
     pnm_get(s, buf1, sizeof(buf1));
-    avctx->height = atoi(buf1);
-    if(av_image_check_size(avctx->width, avctx->height, 0, avctx))
+    h = atoi(buf1);
+    if(w <= 0 || h <= 0 || av_image_check_size(w, h, 0, avctx) || s->bytestream >= s->bytestream_end)
         return AVERROR_INVALIDDATA;
-    if (avctx->pix_fmt != AV_PIX_FMT_MONOWHITE) {
+
+    avctx->width  = w;
+    avctx->height = h;
+
+    if (avctx->pix_fmt != AV_PIX_FMT_MONOWHITE && avctx->pix_fmt != AV_PIX_FMT_MONOBLACK) {
         pnm_get(s, buf1, sizeof(buf1));
         s->maxval = atoi(buf1);
         if (s->maxval <= 0) {
@@ -151,17 +165,14 @@ int ff_pnm_decode_header(AVCodecContext *avctx, PNMContext * const s)
         }
         if (s->maxval >= 256) {
             if (avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
-                avctx->pix_fmt = AV_PIX_FMT_GRAY16BE;
-                if (s->maxval != 65535)
-                    avctx->pix_fmt = AV_PIX_FMT_GRAY16;
+                avctx->pix_fmt = AV_PIX_FMT_GRAY16;
             } else if (avctx->pix_fmt == AV_PIX_FMT_RGB24) {
-                if (s->maxval > 255)
-                    avctx->pix_fmt = AV_PIX_FMT_RGB48BE;
+                avctx->pix_fmt = AV_PIX_FMT_RGB48;
             } else if (avctx->pix_fmt == AV_PIX_FMT_YUV420P && s->maxval < 65536) {
                 if (s->maxval < 512)
-                    avctx->pix_fmt = AV_PIX_FMT_YUV420P9BE;
+                    avctx->pix_fmt = AV_PIX_FMT_YUV420P9;
                 else if (s->maxval < 1024)
-                    avctx->pix_fmt = AV_PIX_FMT_YUV420P10BE;
+                    avctx->pix_fmt = AV_PIX_FMT_YUV420P10;
                 else
                     avctx->pix_fmt = AV_PIX_FMT_YUV420P16;
             } else {
diff --git a/libavcodec/pnm.h b/libavcodec/pnm.h
index 5fc6513..5bc0aad 100644
--- a/libavcodec/pnm.h
+++ b/libavcodec/pnm.h
@@ -2,20 +2,20 @@
  * PNM image format
  * Copyright (c) 2002, 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/pnm_parser.c b/libavcodec/pnm_parser.c
index 1b81c2a..a7d70b9 100644
--- a/libavcodec/pnm_parser.c
+++ b/libavcodec/pnm_parser.c
@@ -2,20 +2,20 @@
  * PNM image parser
  * Copyright (c) 2002, 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/pnmdec.c b/libavcodec/pnmdec.c
index d23e2c0..d4261a4 100644
--- a/libavcodec/pnmdec.c
+++ b/libavcodec/pnmdec.c
@@ -2,29 +2,39 @@
  * PNM image format
  * Copyright (c) 2002, 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "avcodec.h"
-#include "bytestream.h"
 #include "internal.h"
 #include "put_bits.h"
 #include "pnm.h"
 
+static void samplecpy(uint8_t *dst, const uint8_t *src, int n, int maxval) /* copy n bytes of sample data from src to dst; maxval selects how */
+{
+    if (maxval <= 255) { /* maxval up to 255: plain byte-for-byte copy */
+        memcpy(dst, src, n);
+    } else { /* larger maxval: data is processed as 16-bit words */
+        int i;
+        for (i=0; i<n/2; i++) { /* n is a byte count, so n/2 words; an odd trailing byte is not copied */
+            ((uint16_t *)dst)[i] = AV_RB16(src+2*i); /* AV_RB16 presumably reads a big-endian 16-bit value (TODO confirm); stored through a uint16_t pointer */
+        }
+    }
+}
 
 static int pnm_decode_frame(AVCodecContext *avctx, void *data,
                             int *got_frame, AVPacket *avpkt)
@@ -33,36 +43,51 @@ static int pnm_decode_frame(AVCodecContext *avctx, void *data,
     int buf_size         = avpkt->size;
     PNMContext * const s = avctx->priv_data;
     AVFrame * const p    = data;
-    int i, j, n, linesize, h, upgrade = 0;
+    int i, j, n, linesize, h, upgrade = 0, is_mono = 0;
     unsigned char *ptr;
     int components, sample_len, ret;
 
     s->bytestream_start =
-    s->bytestream       = buf;
-    s->bytestream_end   = buf + buf_size;
+    s->bytestream       = (uint8_t *)buf;
+    s->bytestream_end   = (uint8_t *)buf + buf_size;
 
     if ((ret = ff_pnm_decode_header(avctx, s)) < 0)
         return ret;
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
     p->pict_type = AV_PICTURE_TYPE_I;
     p->key_frame = 1;
+    avctx->bits_per_raw_sample = av_log2(s->maxval) + 1;
 
     switch (avctx->pix_fmt) {
     default:
         return AVERROR(EINVAL);
-    case AV_PIX_FMT_RGB48BE:
+    case AV_PIX_FMT_RGBA64:
+        n = avctx->width * 8;
+        components=4;
+        sample_len=16;
+        if (s->maxval < 65535)
+            upgrade = 2;
+        goto do_read;
+    case AV_PIX_FMT_RGB48:
         n = avctx->width * 6;
         components=3;
         sample_len=16;
+        if (s->maxval < 65535)
+            upgrade = 2;
+        goto do_read;
+    case AV_PIX_FMT_RGBA:
+        n = avctx->width * 4;
+        components=4;
+        sample_len=8;
         goto do_read;
     case AV_PIX_FMT_RGB24:
         n = avctx->width * 3;
         components=3;
         sample_len=8;
+        if (s->maxval < 255)
+            upgrade = 1;
         goto do_read;
     case AV_PIX_FMT_GRAY8:
         n = avctx->width;
@@ -71,48 +96,71 @@ static int pnm_decode_frame(AVCodecContext *avctx, void *data,
         if (s->maxval < 255)
             upgrade = 1;
         goto do_read;
-    case AV_PIX_FMT_GRAY16BE:
-    case AV_PIX_FMT_GRAY16LE:
+    case AV_PIX_FMT_GRAY8A:
+        n = avctx->width * 2;
+        components=2;
+        sample_len=8;
+        goto do_read;
+    case AV_PIX_FMT_GRAY16:
         n = avctx->width * 2;
         components=1;
         sample_len=16;
         if (s->maxval < 65535)
             upgrade = 2;
         goto do_read;
+    case AV_PIX_FMT_YA16:
+        n =  avctx->width * 4;
+        components=2;
+        sample_len=16;
+        if (s->maxval < 65535)
+            upgrade = 2;
+        goto do_read;
     case AV_PIX_FMT_MONOWHITE:
     case AV_PIX_FMT_MONOBLACK:
         n = (avctx->width + 7) >> 3;
         components=1;
         sample_len=1;
+        is_mono = 1;
     do_read:
         ptr      = p->data[0];
         linesize = p->linesize[0];
         if (s->bytestream + n * avctx->height > s->bytestream_end)
             return AVERROR_INVALIDDATA;
-        if(s->type < 4){
+        if(s->type < 4 || (is_mono && s->type==7)){
             for (i=0; i<avctx->height; i++) {
                 PutBitContext pb;
                 init_put_bits(&pb, ptr, linesize);
                 for(j=0; j<avctx->width * components; j++){
                     unsigned int c=0;
                     int v=0;
+                    if(s->type < 4)
                     while(s->bytestream < s->bytestream_end && (*s->bytestream < '0' || *s->bytestream > '9' ))
                         s->bytestream++;
                     if(s->bytestream >= s->bytestream_end)
                         return AVERROR_INVALIDDATA;
-                    do{
-                        v= 10*v + c;
-                        c= (*s->bytestream++) - '0';
-                    }while(c <= 9);
-                    put_bits(&pb, sample_len, (((1<<sample_len)-1)*v + (s->maxval>>1))/s->maxval);
+                    if (is_mono) {
+                        /* read a single digit */
+                        v = (*s->bytestream++)&1;
+                    } else {
+                        /* read a sequence of digits */
+                        do {
+                            v = 10*v + c;
+                            c = (*s->bytestream++) - '0';
+                        } while (c <= 9);
+                    }
+                    if (sample_len == 16) {
+                        ((uint16_t*)ptr)[j] = (((1<<sample_len)-1)*v + (s->maxval>>1))/s->maxval;
+                    } else
+                        put_bits(&pb, sample_len, (((1<<sample_len)-1)*v + (s->maxval>>1))/s->maxval);
                 }
-                flush_put_bits(&pb);
+                if (sample_len != 16)
+                    flush_put_bits(&pb);
                 ptr+= linesize;
             }
         }else{
         for (i = 0; i < avctx->height; i++) {
             if (!upgrade)
-                memcpy(ptr, s->bytestream, n);
+                samplecpy(ptr, s->bytestream, n, s->maxval);
             else if (upgrade == 1) {
                 unsigned int j, f = (255 * 128 + s->maxval / 2) / s->maxval;
                 for (j = 0; j < n; j++)
@@ -130,8 +178,8 @@ static int pnm_decode_frame(AVCodecContext *avctx, void *data,
         }
         break;
     case AV_PIX_FMT_YUV420P:
-    case AV_PIX_FMT_YUV420P9BE:
-    case AV_PIX_FMT_YUV420P10BE:
+    case AV_PIX_FMT_YUV420P9:
+    case AV_PIX_FMT_YUV420P10:
         {
             unsigned char *ptr1, *ptr2;
 
@@ -143,7 +191,7 @@ static int pnm_decode_frame(AVCodecContext *avctx, void *data,
             if (s->bytestream + n * avctx->height * 3 / 2 > s->bytestream_end)
                 return AVERROR_INVALIDDATA;
             for (i = 0; i < avctx->height; i++) {
-                memcpy(ptr, s->bytestream, n);
+                samplecpy(ptr, s->bytestream, n, s->maxval);
                 s->bytestream += n;
                 ptr           += linesize;
             }
@@ -152,9 +200,9 @@ static int pnm_decode_frame(AVCodecContext *avctx, void *data,
             n >>= 1;
             h = avctx->height >> 1;
             for (i = 0; i < h; i++) {
-                memcpy(ptr1, s->bytestream, n);
+                samplecpy(ptr1, s->bytestream, n, s->maxval);
                 s->bytestream += n;
-                memcpy(ptr2, s->bytestream, n);
+                samplecpy(ptr2, s->bytestream, n, s->maxval);
                 s->bytestream += n;
                 ptr1 += p->linesize[1];
                 ptr2 += p->linesize[2];
@@ -202,24 +250,6 @@ static int pnm_decode_frame(AVCodecContext *avctx, void *data,
             }
         }
         break;
-    case AV_PIX_FMT_RGB32:
-        ptr      = p->data[0];
-        linesize = p->linesize[0];
-        if (s->bytestream + avctx->width * avctx->height * 4 > s->bytestream_end)
-            return AVERROR_INVALIDDATA;
-        for (i = 0; i < avctx->height; i++) {
-            int j, r, g, b, a;
-
-            for (j = 0; j < avctx->width; j++) {
-                r = *s->bytestream++;
-                g = *s->bytestream++;
-                b = *s->bytestream++;
-                a = *s->bytestream++;
-                ((uint32_t *)ptr)[j] = (a << 24) | (r << 16) | (g << 8) | b;
-            }
-            ptr += linesize;
-        }
-        break;
     }
     *got_frame = 1;
 
diff --git a/libavcodec/pnmenc.c b/libavcodec/pnmenc.c
index f8c600f..ba9478d 100644
--- a/libavcodec/pnmenc.c
+++ b/libavcodec/pnmenc.c
@@ -2,43 +2,39 @@
  * PNM image format
  * Copyright (c) 2002, 2003 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/imgutils.h"
 #include "libavutil/pixdesc.h"
 #include "avcodec.h"
-#include "bytestream.h"
 #include "internal.h"
 
 static int pnm_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
-                            const AVFrame *pict, int *got_packet)
+                            const AVFrame *p, int *got_packet)
 {
     uint8_t *bytestream, *bytestream_start, *bytestream_end;
-    const AVFrame * const p = pict;
     int i, h, h1, c, n, linesize, ret;
     uint8_t *ptr, *ptr1, *ptr2;
     int size = av_image_get_buffer_size(avctx->pix_fmt,
                                         avctx->width, avctx->height, 1);
 
-    if ((ret = ff_alloc_packet(pkt, size + 200)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "encoded frame too large\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, size + 200, 0)) < 0)
         return ret;
-    }
 
     bytestream_start =
     bytestream       = pkt->data;
@@ -68,6 +64,10 @@ static int pnm_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         n  = avctx->width * 6;
         break;
     case AV_PIX_FMT_YUV420P:
+        if (avctx->width & 1 || avctx->height & 1) {
+            av_log(avctx, AV_LOG_ERROR, "pgmyuv needs even width and height\n");
+            return AVERROR(EINVAL);
+        }
         c  = '5';
         n  = avctx->width;
         h1 = (h * 3) / 2;
diff --git a/libavcodec/ppc/Makefile b/libavcodec/ppc/Makefile
index 9234e77..56a1398 100644
--- a/libavcodec/ppc/Makefile
+++ b/libavcodec/ppc/Makefile
@@ -2,16 +2,16 @@
 OBJS-$(CONFIG_AUDIODSP)                += ppc/audiodsp.o
 OBJS-$(CONFIG_BLOCKDSP)                += ppc/blockdsp.o
 OBJS-$(CONFIG_FFT)                     += ppc/fft_init.o                \
-                                          ppc/fft_altivec.o
+                                          ppc/fft_altivec.o             \
+                                          ppc/fft_vsx.o
 OBJS-$(CONFIG_FDCTDSP)                 += ppc/fdctdsp.o
 OBJS-$(CONFIG_FMTCONVERT)              += ppc/fmtconvert_altivec.o
 OBJS-$(CONFIG_H264CHROMA)              += ppc/h264chroma_init.o
-OBJS-$(CONFIG_H264DSP)                 += ppc/h264dsp.o
+OBJS-$(CONFIG_H264DSP)                 += ppc/h264dsp.o ppc/hpeldsp_altivec.o
 OBJS-$(CONFIG_H264QPEL)                += ppc/h264qpel.o
 OBJS-$(CONFIG_HPELDSP)                 += ppc/hpeldsp_altivec.o
 OBJS-$(CONFIG_HUFFYUVDSP)              += ppc/huffyuvdsp_altivec.o
 OBJS-$(CONFIG_IDCTDSP)                 += ppc/idctdsp.o
-OBJS-$(CONFIG_MDCT)                    += ppc/mdct_init.o
 OBJS-$(CONFIG_ME_CMP)                  += ppc/me_cmp.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += ppc/mpegaudiodsp_altivec.o
 OBJS-$(CONFIG_MPEGVIDEO)               += ppc/mpegvideo_altivec.o      \
@@ -24,7 +24,7 @@ OBJS-$(CONFIG_VP3DSP)                  += ppc/vp3dsp_altivec.o
 OBJS-$(CONFIG_VP8DSP)                  += ppc/vp8dsp_altivec.o
 
 # decoders/encoders
-OBJS-$(CONFIG_APE_DECODER)             += ppc/apedsp_altivec.o
+OBJS-$(CONFIG_LLAUDDSP)                += ppc/lossless_audiodsp_altivec.o
 OBJS-$(CONFIG_SVQ1_ENCODER)            += ppc/svq1enc_altivec.o
 OBJS-$(CONFIG_VORBIS_DECODER)          += ppc/vorbisdsp_altivec.o
 OBJS-$(CONFIG_VP7_DECODER)             += ppc/vp8dsp_altivec.o
diff --git a/libavcodec/ppc/asm.S b/libavcodec/ppc/asm.S
index 141dee9..a3edeed 100644
--- a/libavcodec/ppc/asm.S
+++ b/libavcodec/ppc/asm.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Loren Merritt
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ppc/audiodsp.c b/libavcodec/ppc/audiodsp.c
index 2a0a6d8..c88c3d9 100644
--- a/libavcodec/ppc/audiodsp.c
+++ b/libavcodec/ppc/audiodsp.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,7 +35,7 @@
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/audiodsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
                                            int order)
@@ -63,7 +63,7 @@ static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
 
 av_cold void ff_audiodsp_init_ppc(AudioDSPContext *c)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/blockdsp.c b/libavcodec/ppc/blockdsp.c
index 679bc04..45c492a 100644
--- a/libavcodec/ppc/blockdsp.c
+++ b/libavcodec/ppc/blockdsp.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2002 Dieter Shirley
  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -143,27 +143,24 @@ static void clear_block_altivec(int16_t *block)
 }
 #endif /* HAVE_ALTIVEC */
 
-av_cold void ff_blockdsp_init_ppc(BlockDSPContext *c, unsigned high_bit_depth)
+av_cold void ff_blockdsp_init_ppc(BlockDSPContext *c)
 {
     // common optimizations whether AltiVec is available or not
-    if (!high_bit_depth) {
-        switch (check_dcbzl_effect()) {
-        case 32:
-            c->clear_blocks = clear_blocks_dcbz32_ppc;
-            break;
-        case 128:
-            c->clear_blocks = clear_blocks_dcbz128_ppc;
-            break;
-        default:
-            break;
-        }
+    switch (check_dcbzl_effect()) {
+    case 32:
+        c->clear_blocks = clear_blocks_dcbz32_ppc;
+        break;
+    case 128:
+        c->clear_blocks = clear_blocks_dcbz128_ppc;
+        break;
+    default:
+        break;
     }
 
 #if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
-    if (!high_bit_depth)
-        c->clear_block = clear_block_altivec;
+    c->clear_block = clear_block_altivec;
 #endif /* HAVE_ALTIVEC */
 }
diff --git a/libavcodec/ppc/dct-test.c b/libavcodec/ppc/dct-test.c
index 2acbe2a..2328516 100644
--- a/libavcodec/ppc/dct-test.c
+++ b/libavcodec/ppc/dct-test.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -21,7 +21,7 @@
 #include "fdct.h"
 
 static const struct algo fdct_tab_arch[] = {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     { "altivecfdct", ff_fdct_altivec, FF_IDCT_PERM_NONE, AV_CPU_FLAG_ALTIVEC },
 #endif
     { 0 }
diff --git a/libavcodec/ppc/fdct.h b/libavcodec/ppc/fdct.h
index 7471035..437f815 100644
--- a/libavcodec/ppc/fdct.h
+++ b/libavcodec/ppc/fdct.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ppc/fdctdsp.c b/libavcodec/ppc/fdctdsp.c
index 89ceb47..6659046 100644
--- a/libavcodec/ppc/fdctdsp.c
+++ b/libavcodec/ppc/fdctdsp.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2003  James Klicman <james@klicman.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,7 +29,7 @@
 #include "libavcodec/fdctdsp.h"
 #include "fdct.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 #define vs16(v)   ((vector signed short) (v))
 #define vs32(v)     ((vector signed int) (v))
@@ -37,29 +37,28 @@
 #define vu16(v) ((vector unsigned short) (v))
 #define vu32(v)   ((vector unsigned int) (v))
 
-#define C1     0.98078525066375732421875000 /* cos(1 * PI / 16) */
-#define C2     0.92387950420379638671875000 /* cos(2 * PI / 16) */
-#define C3     0.83146959543228149414062500 /* cos(3 * PI / 16) */
-#define C4     0.70710676908493041992187500 /* cos(4 * PI / 16) */
-#define C5     0.55557024478912353515625000 /* cos(5 * PI / 16) */
-#define C6     0.38268342614173889160156250 /* cos(6 * PI / 16) */
-#define C7     0.19509032368659973144531250 /* cos(7 * PI / 16) */
-#define SQRT_2 1.41421353816986083984375000 /* sqrt(2)          */
+#define C1     0.98078528040323044912618224 /* cos(1 * PI / 16) */
+#define C2     0.92387953251128675612818319 /* cos(2 * PI / 16) */
+#define C3     0.83146961230254523707878838 /* cos(3 * PI / 16) */
+#define C4     0.70710678118654752440084436 /* cos(4 * PI / 16) */
+#define C5     0.55557023301960222474283081 /* cos(5 * PI / 16) */
+#define C6     0.38268343236508977172845998 /* cos(6 * PI / 16) */
+#define C7     0.19509032201612826784828487 /* cos(7 * PI / 16) */
 
 #define W0 -(2 * C2)
 #define W1  (2 * C6)
-#define W2 (SQRT_2 * C6)
-#define W3 (SQRT_2 * C3)
-#define W4 (SQRT_2 * (-C1 + C3 + C5 - C7))
-#define W5 (SQRT_2 *  (C1 + C3 - C5 + C7))
-#define W6 (SQRT_2 *  (C1 + C3 + C5 - C7))
-#define W7 (SQRT_2 *  (C1 + C3 - C5 - C7))
-#define W8 (SQRT_2 *  (C7 - C3))
-#define W9 (SQRT_2 * (-C1 - C3))
-#define WA (SQRT_2 * (-C3 - C5))
-#define WB (SQRT_2 *  (C5 - C3))
-
-static vector float fdctconsts[3] = {
+#define W2 (M_SQRT2 * C6)
+#define W3 (M_SQRT2 * C3)
+#define W4 (M_SQRT2 * (-C1 + C3 + C5 - C7))
+#define W5 (M_SQRT2 *  (C1 + C3 - C5 + C7))
+#define W6 (M_SQRT2 *  (C1 + C3 + C5 - C7))
+#define W7 (M_SQRT2 *  (C1 + C3 - C5 - C7))
+#define W8 (M_SQRT2 *  (C7 - C3))
+#define W9 (M_SQRT2 * (-C1 - C3))
+#define WA (M_SQRT2 * (-C3 - C5))
+#define WB (M_SQRT2 *  (C5 - C3))
+
+static const vector float fdctconsts[3] = {
     { W0, W1, W2, W3 },
     { W4, W5, W6, W7 },
     { W8, W9, WA, WB }
@@ -196,7 +195,7 @@ static vector float fdctconsts[3] = {
 void ff_fdct_altivec(int16_t *block)
 {
     vector signed short *bp;
-    vector float *cp = fdctconsts;
+    const vector float *cp = fdctconsts;
     vector float b00, b10, b20, b30, b40, b50, b60, b70;
     vector float b01, b11, b21, b31, b41, b51, b61, b71;
     vector float mzero, cnst, cnsts0, cnsts1, cnsts2;
@@ -465,7 +464,7 @@ void ff_fdct_altivec(int16_t *block)
 av_cold void ff_fdctdsp_init_ppc(FDCTDSPContext *c, AVCodecContext *avctx,
                                  unsigned high_bit_depth)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/fft_altivec.S b/libavcodec/ppc/fft_altivec.S
index cb7c871..bcc242d 100644
--- a/libavcodec/ppc/fft_altivec.S
+++ b/libavcodec/ppc/fft_altivec.S
@@ -5,20 +5,20 @@
  * This algorithm (though not any of the implementation details) is
  * based on libdjbfft by D. J. Bernstein.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ppc/fft_init.c b/libavcodec/ppc/fft_init.c
index 56eafb9..cbeaf98 100644
--- a/libavcodec/ppc/fft_init.c
+++ b/libavcodec/ppc/fft_init.c
@@ -1,36 +1,168 @@
 /*
- * This file is part of Libav.
+ * FFT/IFFT transforms
+ * AltiVec-enabled
+ * Copyright (c) 2009 Loren Merritt
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "config.h"
-
 #include "libavutil/cpu.h"
 #include "libavutil/ppc/cpu.h"
-
+#include "libavutil/ppc/types_altivec.h"
+#include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/fft.h"
 
+/**
+ * Do a complex FFT with the parameters defined in ff_fft_init().
+ * The input data must be permuted beforehand with the s->revtab table.
+ * No 1.0 / sqrt(n) normalization is done.
+ * AltiVec-enabled:
+ * This code assumes that the 'z' pointer is 16 bytes-aligned.
+ * It also assumes all FFTComplex are 8 bytes-aligned pairs of floats.
+ */
+
+#if HAVE_VSX
+#include "fft_vsx.h"
+#else
+void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_interleave_altivec(FFTContext *s, FFTComplex *z);
+#endif
+
+#if HAVE_GNU_AS && HAVE_ALTIVEC
+static void imdct_half_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
+{   /* Three stages: pre-rotate input into output (scattered through s->revtab), FFT output in place, post-rotate output. */
+    int j, k;
+    int n = 1 << s->mdct_bits;  /* n = 2^mdct_bits */
+    int n4 = n >> 2;            /* n/4 */
+    int n8 = n >> 3;            /* n/8 */
+    int n32 = n >> 5;           /* n/32: iteration count of both do/while loops below */
+    const uint16_t *revtabj = s->revtab;      /* moves forward by 4 entries per pre-rotation iteration */
+    const uint16_t *revtabk = s->revtab+n4;   /* moves backward by 4 entries per pre-rotation iteration */
+    const vec_f *tcos = (const vec_f*)(s->tcos+n8);  /* indexed with negative and non-negative offsets */
+    const vec_f *tsin = (const vec_f*)(s->tsin+n8);  /* indexed with negative and non-negative offsets */
+    const vec_f *pin = (const vec_f*)(input+n4);     /* read at pin[k*2+p] and pin[-k*2-p-1] */
+    vec_f *pout = (vec_f*)(output+n4);               /* only used by the post-rotation loop */
+
+    /* pre rotation */
+    k = n32-1;
+    do {    /* body runs before the test, so n32 must be >= 1; ff_fft_init_ppc only installs this for mdct_bits >= 5 */
+        vec_f cos,sin,cos0,sin0,cos1,sin1,re,im,r0,i0,r1,i1,a,b,c,d;
+#define CMULA(p,o0,o1,o2,o3)\
+        a = pin[ k*2+p];                       /* { z[k].re,    z[k].im,    z[k+1].re,  z[k+1].im  } */\
+        b = pin[-k*2-p-1];                     /* { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } */\
+        re = vec_perm(a, b, vcprm(0,2,s0,s2)); /* { z[k].re,    z[k+1].re,  z[-k-2].re, z[-k-1].re } */\
+        im = vec_perm(a, b, vcprm(s3,s1,3,1)); /* { z[-k-1].im, z[-k-2].im, z[k+1].im,  z[k].im    } */\
+        cos = vec_perm(cos0, cos1, vcprm(o0,o1,s##o2,s##o3)); /* { cos[k], cos[k+1], cos[-k-2], cos[-k-1] } */\
+        sin = vec_perm(sin0, sin1, vcprm(o0,o1,s##o2,s##o3));\
+        r##p = im*cos - re*sin;\
+        i##p = re*cos + im*sin;
+#define STORE2(v,dst)\
+        j = dst;\
+        vec_ste(v, 0, output+j*2);\
+        vec_ste(v, 4, output+j*2);
+#define STORE8(p)\
+        a = vec_perm(r##p, i##p, vcprm(0,s0,0,s0));\
+        b = vec_perm(r##p, i##p, vcprm(1,s1,1,s1));\
+        c = vec_perm(r##p, i##p, vcprm(2,s2,2,s2));\
+        d = vec_perm(r##p, i##p, vcprm(3,s3,3,s3));\
+        STORE2(a, revtabk[ p*2-4]);\
+        STORE2(b, revtabk[ p*2-3]);\
+        STORE2(c, revtabj[-p*2+2]);\
+        STORE2(d, revtabj[-p*2+3]);
+
+        cos0 = tcos[k];
+        sin0 = tsin[k];
+        cos1 = tcos[-k-1];
+        sin1 = tsin[-k-1];
+        CMULA(0, 0,1,2,3);  /* fills r0 and i0 */
+        CMULA(1, 2,3,0,1);  /* fills r1 and i1 */
+        STORE8(0);          /* four (r0,i0) lane pairs go to output slots revtabk[-4], revtabk[-3], revtabj[2], revtabj[3] */
+        STORE8(1);          /* four (r1,i1) lane pairs go to output slots revtabk[-2], revtabk[-1], revtabj[0], revtabj[1] */
+        revtabj += 4;
+        revtabk -= 4;
+        k--;
+    } while(k >= 0);
+
+#if HAVE_VSX
+    ff_fft_calc_vsx(s, (FFTComplex*)output);        /* in-place FFT over output */
+#else
+    ff_fft_calc_altivec(s, (FFTComplex*)output);    /* in-place FFT over output */
+#endif
+
+    /* post rotation + reordering */
+    j = -n32;       /* pout[2*j] starts n/4 samples below pout, i.e. at output[0] (4 samples per vec_f) */
+    k = n32-1;      /* pout[2*k+1] is the last vector touched, n/4 samples above pout */
+    do {
+        vec_f cos,sin,re,im,a,b,c,d;
+#define CMULB(d0,d1,o)\
+        re = pout[o*2];\
+        im = pout[o*2+1];\
+        cos = tcos[o];\
+        sin = tsin[o];\
+        d0 = im*sin - re*cos;\
+        d1 = re*sin + im*cos;
+
+        CMULB(a,b,j);   /* a, b from the vector pair at index j */
+        CMULB(c,d,k);   /* c, d from the vector pair at index k */
+        pout[2*j]   = vec_perm(a, d, vcprm(0,s3,1,s2));   /* slot j mixes a with d (lanes of d taken in reverse order) */
+        pout[2*j+1] = vec_perm(a, d, vcprm(2,s1,3,s0));
+        pout[2*k]   = vec_perm(c, b, vcprm(0,s3,1,s2));   /* slot k mixes c with b (lanes of b taken in reverse order) */
+        pout[2*k+1] = vec_perm(c, b, vcprm(2,s1,3,s0));
+        j++;            /* j climbs from -n32 to -1 ... */
+        k--;            /* ... while k descends from n32-1 to 0 */
+    } while(k >= 0);
+}
+
+static void imdct_calc_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
+{   /* Runs imdct_half_altivec() into output[n4 .. 3*n4) and mirrors that result into the first and last quarter of output. */
+    int k;
+    int n = 1 << s->mdct_bits;  /* n = 2^mdct_bits */
+    int n4 = n >> 2;            /* n/4 */
+    int n16 = n >> 4;           /* n/16: number of 4-lane vectors per quarter */
+    vec_u32 sign = {1U<<31,1U<<31,1U<<31,1U<<31};   /* top bit of every 32-bit lane */
+    vec_u32 *p0 = (vec_u32*)(output+n4);            /* boundary between 1st and 2nd quarter */
+    vec_u32 *p1 = (vec_u32*)(output+n4*3);          /* boundary between 3rd and 4th quarter */
+
+    imdct_half_altivec(s, output + n4, input);
+
+    for (k = 0; k < n16; k++) {
+        vec_u32 a = p0[k] ^ sign;   /* 2nd-quarter vector with every lane's top bit flipped (a negation, presumably, since the samples are handled as vec_f elsewhere) */
+        vec_u32 b = p1[-k-1];       /* 3rd-quarter vector, walking down from the boundary */
+        p0[-k-1] = vec_perm(a, a, vcprm(3,2,1,0));  /* lane-reversed into the 1st quarter */
+        p1[k]    = vec_perm(b, b, vcprm(3,2,1,0));  /* lane-reversed into the 4th quarter */
+    }
+}
+#endif /* HAVE_GNU_AS && HAVE_ALTIVEC */
 
 av_cold void ff_fft_init_ppc(FFTContext *s)
 {
-#if HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_GNU_AS && HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
+#if HAVE_VSX
+    s->fft_calc = ff_fft_calc_interleave_vsx;
+#else
     s->fft_calc   = ff_fft_calc_interleave_altivec;
+#endif
+    if (s->mdct_bits >= 5) {
+        s->imdct_calc = imdct_calc_altivec;
+        s->imdct_half = imdct_half_altivec;
+    }
 #endif /* HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN */
 }
diff --git a/libavcodec/ppc/fft_vsx.c b/libavcodec/ppc/fft_vsx.c
new file mode 100644
index 0000000..e92975f
--- /dev/null
+++ b/libavcodec/ppc/fft_vsx.c
@@ -0,0 +1,227 @@
+/*
+ * FFT transform, optimized with VSX built-in functions
+ * Copyright (c) 2014 Rong Yan
+ *
+ * This algorithm (though not any of the implementation details) is
+ * based on libdjbfft by D. J. Bernstein.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "config.h"
+#include "libavutil/cpu.h"
+#include "libavutil/ppc/types_altivec.h"
+#include "libavutil/ppc/util_altivec.h"
+#include "libavcodec/fft.h"
+#include "libavcodec/fft-internal.h"
+#include "fft_vsx.h"
+
+#if HAVE_VSX
+
+static void fft32_vsx_interleave(FFTComplex *z)
+{   /* 32 = 16 + 8 + 8, then the combining pass (n = 32/8) */
+    fft16_vsx_interleave(z);            /* z[0..15]  */
+    fft8_vsx_interleave(z+16);          /* z[16..23] */
+    fft8_vsx_interleave(z+24);          /* z[24..31] */
+    pass_vsx_interleave(z,ff_cos_32,4); /* n = 4 is the smallest value pass_vsx_interleave's do/while loop can take */
+}
+
+static void fft64_vsx_interleave(FFTComplex *z)
+{   /* 64 = 32 + 16 + 16, then the combining pass (n = 64/8) */
+    fft32_vsx_interleave(z);
+    fft16_vsx_interleave(z+32);
+    fft16_vsx_interleave(z+48);
+    pass_vsx_interleave(z,ff_cos_64, 8);
+}
+static void fft128_vsx_interleave(FFTComplex *z)
+{   /* 128 = 64 + 32 + 32, then the combining pass (n = 128/8) */
+    fft64_vsx_interleave(z);
+    fft32_vsx_interleave(z+64);
+    fft32_vsx_interleave(z+96);
+    pass_vsx_interleave(z,ff_cos_128,16);
+}
+static void fft256_vsx_interleave(FFTComplex *z)
+{   /* 256 = 128 + 64 + 64, then the combining pass (n = 256/8) */
+    fft128_vsx_interleave(z);
+    fft64_vsx_interleave(z+128);
+    fft64_vsx_interleave(z+192);
+    pass_vsx_interleave(z,ff_cos_256,32);
+}
+static void fft512_vsx_interleave(FFTComplex *z)
+{   /* 512 = 256 + 128 + 128, then the combining pass (n = 512/8) */
+    fft256_vsx_interleave(z);
+    fft128_vsx_interleave(z+256);
+    fft128_vsx_interleave(z+384);
+    pass_vsx_interleave(z,ff_cos_512,64);
+}
+static void fft1024_vsx_interleave(FFTComplex *z)
+{   /* 1024 = 512 + 256 + 256, then the combining pass (n = 1024/8) */
+    fft512_vsx_interleave(z);
+    fft256_vsx_interleave(z+512);
+    fft256_vsx_interleave(z+768);
+    pass_vsx_interleave(z,ff_cos_1024,128);
+
+}
+static void fft2048_vsx_interleave(FFTComplex *z)
+{   /* 2048 = 1024 + 512 + 512, then the combining pass (n = 2048/8) */
+    fft1024_vsx_interleave(z);
+    fft512_vsx_interleave(z+1024);
+    fft512_vsx_interleave(z+1536);
+    pass_vsx_interleave(z,ff_cos_2048,256);
+}
+static void fft4096_vsx_interleave(FFTComplex *z)
+{   /* 4096 = 2048 + 1024 + 1024, then the combining pass (n = 4096/8) */
+    fft2048_vsx_interleave(z);
+    fft1024_vsx_interleave(z+2048);
+    fft1024_vsx_interleave(z+3072);
+    pass_vsx_interleave(z,ff_cos_4096, 512);
+}
+static void fft8192_vsx_interleave(FFTComplex *z)
+{   /* 8192 = 4096 + 2048 + 2048, then the combining pass (n = 8192/8) */
+    fft4096_vsx_interleave(z);
+    fft2048_vsx_interleave(z+4096);
+    fft2048_vsx_interleave(z+6144);
+    pass_vsx_interleave(z,ff_cos_8192,1024);
+}
+static void fft16384_vsx_interleave(FFTComplex *z)
+{   /* 16384 = 8192 + 4096 + 4096, then the combining pass (n = 16384/8) */
+    fft8192_vsx_interleave(z);
+    fft4096_vsx_interleave(z+8192);
+    fft4096_vsx_interleave(z+12288);
+    pass_vsx_interleave(z,ff_cos_16384,2048);
+}
+static void fft32768_vsx_interleave(FFTComplex *z)
+{   /* 32768 = 16384 + 8192 + 8192, then the combining pass (n = 32768/8) */
+    fft16384_vsx_interleave(z);
+    fft8192_vsx_interleave(z+16384);
+    fft8192_vsx_interleave(z+24576);
+    pass_vsx_interleave(z,ff_cos_32768,4096);
+}
+static void fft65536_vsx_interleave(FFTComplex *z)
+{   /* 65536 = 32768 + 16384 + 16384, then the combining pass (n = 65536/8) */
+    fft32768_vsx_interleave(z);
+    fft16384_vsx_interleave(z+32768);
+    fft16384_vsx_interleave(z+49152);
+    pass_vsx_interleave(z,ff_cos_65536,8192);
+}
+
+static void fft32_vsx(FFTComplex *z)
+{   /* 32 = 16 + 8 + 8, then pass_vsx (n = 32/8); same call structure as fft32_vsx_interleave */
+    fft16_vsx(z);
+    fft8_vsx(z+16);
+    fft8_vsx(z+24);
+    pass_vsx(z,ff_cos_32,4);
+}
+
+static void fft64_vsx(FFTComplex *z)
+{   /* 64 = 32 + 16 + 16, then pass_vsx (n = 64/8) */
+    fft32_vsx(z);
+    fft16_vsx(z+32);
+    fft16_vsx(z+48);
+    pass_vsx(z,ff_cos_64, 8);
+}
+static void fft128_vsx(FFTComplex *z)
+{   /* 128 = 64 + 32 + 32, then pass_vsx (n = 128/8) */
+    fft64_vsx(z);
+    fft32_vsx(z+64);
+    fft32_vsx(z+96);
+    pass_vsx(z,ff_cos_128,16);
+}
+static void fft256_vsx(FFTComplex *z)
+{   /* 256 = 128 + 64 + 64, then pass_vsx (n = 256/8) */
+    fft128_vsx(z);
+    fft64_vsx(z+128);
+    fft64_vsx(z+192);
+    pass_vsx(z,ff_cos_256,32);
+}
+static void fft512_vsx(FFTComplex *z)
+{   /* 512 = 256 + 128 + 128, then pass_vsx (n = 512/8) */
+    fft256_vsx(z);
+    fft128_vsx(z+256);
+    fft128_vsx(z+384);
+    pass_vsx(z,ff_cos_512,64);
+}
+static void fft1024_vsx(FFTComplex *z)
+{   /* 1024 = 512 + 256 + 256, then pass_vsx (n = 1024/8) */
+    fft512_vsx(z);
+    fft256_vsx(z+512);
+    fft256_vsx(z+768);
+    pass_vsx(z,ff_cos_1024,128);
+
+}
+static void fft2048_vsx(FFTComplex *z)
+{   /* 2048 = 1024 + 512 + 512, then pass_vsx (n = 2048/8) */
+    fft1024_vsx(z);
+    fft512_vsx(z+1024);
+    fft512_vsx(z+1536);
+    pass_vsx(z,ff_cos_2048,256);
+}
+static void fft4096_vsx(FFTComplex *z)
+{   /* 4096 = 2048 + 1024 + 1024, then pass_vsx (n = 4096/8) */
+    fft2048_vsx(z);
+    fft1024_vsx(z+2048);
+    fft1024_vsx(z+3072);
+    pass_vsx(z,ff_cos_4096, 512);
+}
+static void fft8192_vsx(FFTComplex *z)
+{   /* 8192 = 4096 + 2048 + 2048, then pass_vsx (n = 8192/8) */
+    fft4096_vsx(z);
+    fft2048_vsx(z+4096);
+    fft2048_vsx(z+6144);
+    pass_vsx(z,ff_cos_8192,1024);
+}
+static void fft16384_vsx(FFTComplex *z)
+{   /* 16384 = 8192 + 4096 + 4096, then pass_vsx (n = 16384/8) */
+    fft8192_vsx(z);
+    fft4096_vsx(z+8192);
+    fft4096_vsx(z+12288);
+    pass_vsx(z,ff_cos_16384,2048);
+}
+static void fft32768_vsx(FFTComplex *z)
+{   /* 32768 = 16384 + 8192 + 8192, then pass_vsx (n = 32768/8) */
+    fft16384_vsx(z);
+    fft8192_vsx(z+16384);
+    fft8192_vsx(z+24576);
+    pass_vsx(z,ff_cos_32768,4096);
+}
+static void fft65536_vsx(FFTComplex *z)
+{   /* 65536 = 32768 + 16384 + 16384, then pass_vsx (n = 65536/8) */
+    fft32768_vsx(z);
+    fft16384_vsx(z+32768);
+    fft16384_vsx(z+49152);
+    pass_vsx(z,ff_cos_65536,8192);
+}
+
+static void (* const fft_dispatch_vsx[])(FFTComplex*) = { /* 15 entries; entry i is the 2^(i+2)-point routine (fft4 ... fft65536) */
+    fft4_vsx, fft8_vsx, fft16_vsx, fft32_vsx, fft64_vsx, fft128_vsx, fft256_vsx, fft512_vsx, fft1024_vsx,
+    fft2048_vsx, fft4096_vsx, fft8192_vsx, fft16384_vsx, fft32768_vsx, fft65536_vsx,
+};
+static void (* const fft_dispatch_vsx_interleave[])(FFTComplex*) = { /* 15 entries; entry i is the 2^(i+2)-point routine (fft4 ... fft65536) */
+    fft4_vsx_interleave, fft8_vsx_interleave, fft16_vsx_interleave, fft32_vsx_interleave, fft64_vsx_interleave,
+    fft128_vsx_interleave, fft256_vsx_interleave, fft512_vsx_interleave, fft1024_vsx_interleave,
+    fft2048_vsx_interleave, fft4096_vsx_interleave, fft8192_vsx_interleave, fft16384_vsx_interleave, fft32768_vsx_interleave, fft65536_vsx_interleave,
+};
+void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z)
+{   /* Dispatches on s->nbits; z is transformed in place by the selected routine. */
+     fft_dispatch_vsx_interleave[s->nbits-2](z); /* nbits 2..16 map to table entries 0..14; the index is not range-checked here */
+}
+void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z)
+{   /* Dispatches on s->nbits; z is transformed in place by the selected routine. */
+     fft_dispatch_vsx[s->nbits-2](z); /* nbits 2..16 map to table entries 0..14; the index is not range-checked here */
+}
+#endif /* HAVE_VSX */
diff --git a/libavcodec/ppc/fft_vsx.h b/libavcodec/ppc/fft_vsx.h
new file mode 100644
index 0000000..a85475d
--- /dev/null
+++ b/libavcodec/ppc/fft_vsx.h
@@ -0,0 +1,830 @@
+#ifndef AVCODEC_PPC_FFT_VSX_H
+#define AVCODEC_PPC_FFT_VSX_H
+/*
+ * FFT transform, optimized with VSX built-in functions
+ * Copyright (c) 2014 Rong Yan  Copyright (c) 2009 Loren Merritt
+ *
+ * This algorithm (though not any of the implementation details) is
+ * based on libdjbfft by D. J. Bernstein, and fft_altivec.S.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "config.h"
+#include "libavutil/cpu.h"
+#include "libavutil/ppc/types_altivec.h"
+#include "libavutil/ppc/util_altivec.h"
+#include "libavcodec/fft.h"
+#include "libavcodec/fft-internal.h"
+
+#if HAVE_VSX
+
+void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z);
+
+
+#define byte_2complex (2*sizeof(FFTComplex))   /* byte offset of z[2]; these are passed as the offset argument of vec_ld/vec_st */
+#define byte_4complex (4*sizeof(FFTComplex))   /* byte offset of z[4] */
+#define byte_6complex (6*sizeof(FFTComplex))   /* byte offset of z[6] */
+#define byte_8complex (8*sizeof(FFTComplex))   /* byte offset of z[8] */
+#define byte_10complex (10*sizeof(FFTComplex)) /* byte offset of z[10] */
+#define byte_12complex (12*sizeof(FFTComplex)) /* byte offset of z[12] */
+#define byte_14complex (14*sizeof(FFTComplex)) /* byte offset of z[14] */
+
+inline static void pass_vsx_interleave(FFTComplex *z, const FFTSample *wre, unsigned int n)
+{   /* Combining pass over four regions of z starting at z[0], z[2n], z[4n], z[6n]; factors come from wre[] (read forwards) and wre[2n - ...] (read backwards). */
+    int o1 = n<<1;      /* 2n: start of the 2nd region, in FFTComplex */
+    int o2 = n<<2;      /* 4n: start of the 3rd region */
+    int o3 = o1+o2;     /* 6n: start of the 4th region */
+    int i1, i2, i3;     /* o1, o2, o3 converted to byte offsets for vec_ld/vec_st */
+    FFTSample* out = (FFTSample*)z;
+    const FFTSample *wim = wre+o1;  /* only ever indexed at 0 and below, and moved down by 4 per loop iteration */
+    vec_f vz0, vzo1, vzo2, vzo3;
+    vec_f x0, x1, x2, x3;
+    vec_f x4, x5, x6, x7;
+    vec_f x8, x9, x10, x11;
+    vec_f x12, x13, x14, x15;
+    vec_f x16, x17, x18, x19;
+    vec_f x20, x21, x22, x23;
+    vec_f vz0plus1, vzo1plus1, vzo2plus1, vzo3plus1;
+    vec_f y0, y1, y2, y3;
+    vec_f y4, y5, y8, y9;
+    vec_f y10, y13, y14, y15;
+    vec_f y16, y17, y18, y19;
+    vec_f y20, y21, y22, y23;
+    vec_f wr1, wi1, wr0, wi0;
+    vec_f wr2, wi2, wr3, wi3;
+    vec_f xmulwi0, xmulwi1, ymulwi2, ymulwi3;
+
+    n = n-2;    /* accounts for the unrolled block below, which handles two vectors per region; the loop then consumes 2 per iteration */
+    i1 = o1*sizeof(FFTComplex);
+    i2 = o2*sizeof(FFTComplex);
+    i3 = o3*sizeof(FFTComplex);
+    vzo2 = vec_ld(i2, &(out[0]));  // zo2.r  zo2.i  z(o2+1).r  z(o2+1).i
+    vzo2plus1 = vec_ld(i2+16, &(out[0]));
+    vzo3 = vec_ld(i3, &(out[0]));  // zo3.r  zo3.i  z(o3+1).r  z(o3+1).i
+    vzo3plus1 = vec_ld(i3+16, &(out[0]));
+    vz0 = vec_ld(0, &(out[0]));    // z0.r  z0.i  z1.r  z1.i
+    vz0plus1 = vec_ld(16, &(out[0]));
+    vzo1 = vec_ld(i1, &(out[0]));  // zo1.r  zo1.i  z(o1+1).r  z(o1+1).i
+    vzo1plus1 = vec_ld(i1+16, &(out[0]));
+
+    x0 = vec_add(vzo2, vzo3);
+    x1 = vec_sub(vzo2, vzo3);
+    y0 = vec_add(vzo2plus1, vzo3plus1);
+    y1 = vec_sub(vzo2plus1, vzo3plus1);
+
+    wr1 = vec_splats(wre[1]);   /* wre[0] and wim[0] are not loaded in this unrolled block (compare with the loop body) */
+    wi1 = vec_splats(wim[-1]);
+    wi2 = vec_splats(wim[-2]);
+    wi3 = vec_splats(wim[-3]);
+    wr2 = vec_splats(wre[2]);
+    wr3 = vec_splats(wre[3]);
+
+    x2 = vec_perm(x0, x1, vcprm(2,s2,3,s3));
+    x3 = vec_perm(x0, x1, vcprm(s3,3,s2,2));
+
+    y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
+    y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));
+    y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
+    y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));
+
+    ymulwi2 = vec_mul(y4, wi2);
+    ymulwi3 = vec_mul(y5, wi3);
+    x4 = vec_mul(x2, wr1);
+    x5 = vec_mul(x3, wi1);
+    y8 = vec_madd(y2, wr2, ymulwi2);
+    y9 = vec_msub(y2, wr2, ymulwi2);
+    x6 = vec_add(x4, x5);
+    x7 = vec_sub(x4, x5);
+    y13 = vec_madd(y3, wr3, ymulwi3);
+    y14 = vec_msub(y3, wr3, ymulwi3);
+
+    x8 = vec_perm(x6, x7, vcprm(0,1,s2,s3));
+    y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
+    y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));
+
+    x9 = vec_perm(x0, x8, vcprm(0,1,s0,s2));    /* lanes 0,1 come straight from x0: the first element gets no factor multiply */
+    x10 = vec_perm(x1, x8, vcprm(1,0,s3,s1));   /* lanes 0,1 are x1's first two lanes swapped, again without a multiply */
+
+    y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
+    y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));
+
+    x11 = vec_add(vz0, x9);
+    x12 = vec_sub(vz0, x9);
+    x13 = vec_add(vzo1, x10);
+    x14 = vec_sub(vzo1, x10);
+
+    y18 = vec_add(vz0plus1, y16);
+    y19 = vec_sub(vz0plus1, y16);
+    y20 = vec_add(vzo1plus1, y17);
+    y21 = vec_sub(vzo1plus1, y17);
+
+    x15 = vec_perm(x13, x14, vcprm(0,s1,2,s3));
+    x16 = vec_perm(x13, x14, vcprm(s0,1,s2,3));
+    y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
+    y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));
+
+
+    vec_st(x11, 0, &(out[0]));      /* results go back to the same eight slots that were loaded above */
+    vec_st(y18, 16, &(out[0]));
+    vec_st(x15, i1, &(out[0]));
+    vec_st(y22, i1+16, &(out[0]));
+    vec_st(x12, i2, &(out[0]));
+    vec_st(y19, i2+16, &(out[0]));
+    vec_st(x16, i3, &(out[0]));
+    vec_st(y23, i3+16, &(out[0]));
+
+    do {    /* body runs before the test and n-=2 must land exactly on 0, so the caller has to pass an even n >= 4 */
+        out += 8;   /* next two vectors (8 FFTSample) in every region */
+        wre += 4;
+        wim -= 4;
+        wr0 = vec_splats(wre[0]);
+        wr1 = vec_splats(wre[1]);
+        wi0 = vec_splats(wim[0]);
+        wi1 = vec_splats(wim[-1]);
+
+        wr2 = vec_splats(wre[2]);
+        wr3 = vec_splats(wre[3]);
+        wi2 = vec_splats(wim[-2]);
+        wi3 = vec_splats(wim[-3]);
+
+        vzo2 = vec_ld(i2, &(out[0]));  // zo2.r  zo2.i  z(o2+1).r  z(o2+1).i
+        vzo2plus1 = vec_ld(i2+16, &(out[0]));
+        vzo3 = vec_ld(i3, &(out[0]));  // zo3.r  zo3.i  z(o3+1).r  z(o3+1).i
+        vzo3plus1 = vec_ld(i3+16, &(out[0]));
+        vz0 = vec_ld(0, &(out[0]));    // z0.r  z0.i  z1.r  z1.i
+        vz0plus1 = vec_ld(16, &(out[0]));
+        vzo1 = vec_ld(i1, &(out[0])); // zo1.r  zo1.i  z(o1+1).r  z(o1+1).i
+        vzo1plus1 = vec_ld(i1+16, &(out[0]));
+
+        x0 = vec_add(vzo2, vzo3);
+        x1 = vec_sub(vzo2, vzo3);
+
+        y0 = vec_add(vzo2plus1, vzo3plus1);
+        y1 = vec_sub(vzo2plus1, vzo3plus1);
+
+        x4 = vec_perm(x0, x1, vcprm(s1,1,s0,0));    /* here the x half goes through the same perm/multiply sequence as the y half */
+        x5 = vec_perm(x0, x1, vcprm(s3,3,s2,2));
+        x2 = vec_perm(x0, x1, vcprm(0,s0,1,s1));
+        x3 = vec_perm(x0, x1, vcprm(2,s2,3,s3));
+
+        y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
+        y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));
+        xmulwi0 = vec_mul(x4, wi0);
+        xmulwi1 = vec_mul(x5, wi1);
+
+        y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
+        y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));
+
+        x8 = vec_madd(x2, wr0, xmulwi0);
+        x9 = vec_msub(x2, wr0, xmulwi0);
+        ymulwi2 = vec_mul(y4, wi2);
+        ymulwi3 = vec_mul(y5, wi3);
+
+        x13 = vec_madd(x3, wr1, xmulwi1);
+        x14 = vec_msub(x3, wr1, xmulwi1);
+
+        y8 = vec_madd(y2, wr2, ymulwi2);
+        y9 = vec_msub(y2, wr2, ymulwi2);
+        y13 = vec_madd(y3, wr3, ymulwi3);
+        y14 = vec_msub(y3, wr3, ymulwi3);
+
+        x10 = vec_perm(x8, x9, vcprm(0,1,s2,s3));
+        x15 = vec_perm(x13, x14, vcprm(0,1,s2,s3));
+
+        y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
+        y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));
+
+        x16 = vec_perm(x10, x15, vcprm(0,2,s0,s2));
+        x17 = vec_perm(x10, x15, vcprm(3,1,s3,s1));
+
+        y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
+        y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));
+
+        x18 = vec_add(vz0, x16);
+        x19 = vec_sub(vz0, x16);
+        x20 = vec_add(vzo1, x17);
+        x21 = vec_sub(vzo1, x17);
+
+        y18 = vec_add(vz0plus1, y16);
+        y19 = vec_sub(vz0plus1, y16);
+        y20 = vec_add(vzo1plus1, y17);
+        y21 = vec_sub(vzo1plus1, y17);
+
+        x22 = vec_perm(x20, x21, vcprm(0,s1,2,s3));
+        x23 = vec_perm(x20, x21, vcprm(s0,1,s2,3));
+
+        y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
+        y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));
+
+        vec_st(x18, 0, &(out[0]));
+        vec_st(y18, 16, &(out[0]));
+        vec_st(x22, i1, &(out[0]));
+        vec_st(y22, i1+16, &(out[0]));
+        vec_st(x19, i2, &(out[0]));
+        vec_st(y19, i2+16, &(out[0]));
+        vec_st(x23, i3, &(out[0]));
+        vec_st(y23, i3+16, &(out[0]));
+    } while (n-=2);
+}
+
+inline static void fft2_vsx_interleave(FFTComplex *z)
+{   /* Scalar 2-point step: z[0] becomes z[0]+z[1] and z[1] becomes z[0]-z[1] (old values). Not listed in the fft_vsx.c dispatch tables, which start at fft4. */
+    FFTSample r1, i1;
+
+    r1 = z[0].re - z[1].re;     /* difference is taken before z[0].re is overwritten */
+    z[0].re += z[1].re;
+    z[1].re = r1;
+
+    i1 = z[0].im - z[1].im;     /* same for the imaginary parts */
+    z[0].im += z[1].im;
+    z[1].im = i1;
+ }
+
+inline static void fft4_vsx_interleave(FFTComplex *z)
+{   /* 4-point step on z[0..3]: two vector loads, two perm/add/sub rounds, two vector stores back in place. */
+    vec_f a, b, c, d;
+    float* out=  (float*)z;
+    a = vec_ld(0, &(out[0]));               /* z[0], z[1]; NOTE(review): vec_ld/vec_st presumably need z 16-byte aligned - confirm */
+    b = vec_ld(byte_2complex, &(out[0]));   /* z[2], z[3] */
+
+    c = vec_perm(a, b, vcprm(0,1,s2,s1));   /* first round */
+    d = vec_perm(a, b, vcprm(2,3,s0,s3));
+    a = vec_add(c, d);
+    b = vec_sub(c, d);
+
+    c = vec_perm(a, b, vcprm(0,1,s0,s1));   /* second round */
+    d = vec_perm(a, b, vcprm(2,3,s3,s2));
+
+    a = vec_add(c, d);
+    b = vec_sub(c, d);
+    vec_st(a, 0, &(out[0]));                /* sum and difference are stored directly, with no final permute (unlike fft4_vsx) */
+    vec_st(b, byte_2complex, &(out[0]));
+}
+
+inline static void fft8_vsx_interleave(FFTComplex *z)
+{   /* 8-point step on z[0..7]: four vector loads up front, four vector stores to the same slots at the end. */
+    vec_f vz0, vz1, vz2, vz3;
+    vec_f x0, x1, x2, x3;
+    vec_f x4, x5, x6, x7;
+    vec_f x8, x9, x10, x11;
+    vec_f x12, x13, x14, x15;
+    vec_f x16, x17, x18, x19;
+    vec_f x20, x21, x22, x23;
+    vec_f x24, x25, x26, x27;
+    vec_f x28, x29, x30, x31;
+    vec_f x32, x33, x34;
+
+    float* out=  (float*)z;
+    vec_f vc1 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};   /* only constant used; applied once, to form x26 */
+
+    vz0 = vec_ld(0, &(out[0]));             /* z[0], z[1] */
+    vz1 = vec_ld(byte_2complex, &(out[0])); /* z[2], z[3] */
+    vz2 = vec_ld(byte_4complex, &(out[0])); /* z[4], z[5] */
+    vz3 = vec_ld(byte_6complex, &(out[0])); /* z[6], z[7] */
+
+    x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
+    x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
+    x2 = vec_perm(vz2, vz3, vcprm(2,1,s0,s1));
+    x3 = vec_perm(vz2, vz3, vcprm(0,3,s2,s3));
+
+    x4 = vec_add(x0, x1);
+    x5 = vec_sub(x0, x1);
+    x6 = vec_add(x2, x3);
+    x7 = vec_sub(x2, x3);
+
+    x8 = vec_perm(x4, x5, vcprm(0,1,s0,s1));
+    x9 = vec_perm(x4, x5, vcprm(2,3,s3,s2));
+    x10 = vec_perm(x6, x7, vcprm(2,1,s2,s1));
+    x11 = vec_perm(x6, x7, vcprm(0,3,s0,s3));
+
+    x12 = vec_add(x8, x9);
+    x13 = vec_sub(x8, x9);
+    x14 = vec_add(x10, x11);
+    x15 = vec_sub(x10, x11);
+    x16 = vec_perm(x12, x13, vcprm(0,s0,1,s1));
+    x17 = vec_perm(x14, x15, vcprm(0,s0,1,s1));
+    x18 = vec_perm(x16, x17, vcprm(s0,s3,s2,s1));   /* every selector is s*, so only x17 contributes */
+    x19 = vec_add(x16, x18); // z0.r  z2.r  z0.i  z2.i
+    x20 = vec_sub(x16, x18); // z4.r  z6.r  z4.i  z6.i
+
+    x21 = vec_perm(x12, x13, vcprm(2,s2,3,s3));
+    x22 = vec_perm(x14, x15, vcprm(2,3,s2,s3));
+    x23 = vec_perm(x14, x15, vcprm(3,2,s3,s2));
+    x24 = vec_add(x22, x23);
+    x25 = vec_sub(x22, x23);
+    x26 = vec_mul( vec_perm(x24, x25, vcprm(2,s2,0,s0)), vc1);
+
+    x27 = vec_add(x21, x26); // z1.r  z7.r z1.i z3.i
+    x28 = vec_sub(x21, x26); //z5.r  z3.r z5.i z7.i
+
+    x29 = vec_perm(x19, x27, vcprm(0,2,s0,s2)); // z0.r  z0.i  z1.r  z1.i
+    x30 = vec_perm(x19, x27, vcprm(1,3,s1,s3)); // z2.r  z2.i  z7.r  z3.i
+    x31 = vec_perm(x20, x28, vcprm(0,2,s0,s2)); // z4.r  z4.i  z5.r  z5.i
+    x32 = vec_perm(x20, x28, vcprm(1,3,s1,s3)); // z6.r  z6.i  z3.r  z7.i
+    x33 = vec_perm(x30, x32, vcprm(0,1,s2,3));  // z2.r  z2.i  z3.r  z3.i
+    x34 = vec_perm(x30, x32, vcprm(s0,s1,2,s3)); // z6.r  z6.i  z7.r  z7.i
+
+    vec_st(x29, 0, &(out[0]));
+    vec_st(x33, byte_2complex, &(out[0]));
+    vec_st(x31, byte_4complex, &(out[0]));
+    vec_st(x34, byte_6complex, &(out[0]));
+}
+
+inline static void fft16_vsx_interleave(FFTComplex *z)
+{   /* 16-point step on z[0..15]: all eight vectors are loaded up front and the eight result vectors are stored to the same slots at the end. */
+    float* out=  (float*)z;
+    vec_f vc0 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};                       /* scales x42/x43 */
+    vec_f vc1 = {ff_cos_16[1], ff_cos_16[1], ff_cos_16[1], ff_cos_16[1]};       /* scales x50/x51 (x52, x54) */
+    vec_f vc2 = {ff_cos_16[3], ff_cos_16[3], ff_cos_16[3], ff_cos_16[3]};       /* scales x50/x51 (x53, x55) */
+    vec_f vz0, vz1, vz2, vz3;
+    vec_f vz4, vz5, vz6, vz7;
+    vec_f x0, x1, x2, x3;
+    vec_f x4, x5, x6, x7;
+    vec_f x8, x9, x10, x11;
+    vec_f x12, x13, x14, x15;
+    vec_f x16, x17, x18, x19;
+    vec_f x20, x21, x22, x23;
+    vec_f x24, x25, x26, x27;
+    vec_f x28, x29, x30, x31;
+    vec_f x32, x33, x34, x35;
+    vec_f x36, x37, x38, x39;
+    vec_f x40, x41, x42, x43;
+    vec_f x44, x45, x46, x47;
+    vec_f x48, x49, x50, x51;
+    vec_f x52, x53, x54, x55;
+    vec_f x56, x57, x58, x59;
+    vec_f x60, x61, x62, x63;
+    vec_f x64, x65, x66, x67;
+    vec_f x68, x69, x70, x71;
+    vec_f x72, x73, x74, x75;
+    vec_f x76, x77, x78, x79;
+    vec_f x80, x81, x82, x83;
+    vec_f x84, x85, x86;
+
+    vz0 = vec_ld(0, &(out[0]));                 /* z[0], z[1] */
+    vz1 = vec_ld(byte_2complex, &(out[0]));     /* z[2], z[3] */
+    vz2 = vec_ld(byte_4complex, &(out[0]));     /* z[4], z[5] */
+    vz3 = vec_ld(byte_6complex, &(out[0]));     /* z[6], z[7] */
+    vz4 = vec_ld(byte_8complex, &(out[0]));     /* z[8], z[9] */
+    vz5 = vec_ld(byte_10complex, &(out[0]));    /* z[10], z[11] */
+    vz6 = vec_ld(byte_12complex, &(out[0]));    /* z[12], z[13] */
+    vz7 = vec_ld(byte_14complex, &(out[0]));    /* z[14], z[15] */
+
+    x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
+    x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
+    x2 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
+    x3 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));
+
+    x4 = vec_perm(vz4, vz5, vcprm(0,1,s2,s1));
+    x5 = vec_perm(vz4, vz5, vcprm(2,3,s0,s3));
+    x6 = vec_perm(vz6, vz7, vcprm(0,1,s2,s1));
+    x7 = vec_perm(vz6, vz7, vcprm(2,3,s0,s3));
+
+    x8 = vec_add(x0, x1);
+    x9 = vec_sub(x0, x1);
+    x10 = vec_add(x2, x3);
+    x11 = vec_sub(x2, x3);
+
+    x12 = vec_add(x4, x5);
+    x13 = vec_sub(x4, x5);
+    x14 = vec_add(x6, x7);
+    x15 = vec_sub(x6, x7);
+
+    x16 = vec_perm(x8, x9, vcprm(0,1,s0,s1));
+    x17 = vec_perm(x8, x9, vcprm(2,3,s3,s2));
+    x18 = vec_perm(x10, x11, vcprm(2,1,s1,s2));
+    x19 = vec_perm(x10, x11, vcprm(0,3,s0,s3));
+    x20 = vec_perm(x12, x14, vcprm(0,1,s0, s1));
+    x21 = vec_perm(x12, x14, vcprm(2,3,s2,s3));
+    x22 = vec_perm(x13, x15, vcprm(0,1,s0,s1));
+    x23 = vec_perm(x13, x15, vcprm(3,2,s3,s2));
+
+    x24 = vec_add(x16, x17);
+    x25 = vec_sub(x16, x17);
+    x26 = vec_add(x18, x19);
+    x27 = vec_sub(x18, x19);
+    x28 = vec_add(x20, x21);
+    x29 = vec_sub(x20, x21);
+    x30 = vec_add(x22, x23);
+    x31 = vec_sub(x22, x23);
+
+    x32 = vec_add(x24, x26);
+    x33 = vec_sub(x24, x26);
+    x34 = vec_perm(x32, x33, vcprm(0,1,s0,s1));
+
+    x35 = vec_perm(x28, x29, vcprm(2,1,s1,s2));
+    x36 = vec_perm(x28, x29, vcprm(0,3,s0,s3));
+    x37 = vec_add(x35, x36);
+    x38 = vec_sub(x35, x36);
+    x39 = vec_perm(x37, x38, vcprm(0,1,s1,s0));
+
+    x40 = vec_perm(x27, x38, vcprm(3,2,s2,s3));
+    x41 = vec_perm(x26,  x37, vcprm(2,3,s3,s2));
+    x42 = vec_add(x40, x41);
+    x43 = vec_sub(x40, x41);
+    x44 = vec_mul(x42, vc0);
+    x45 = vec_mul(x43, vc0);
+
+    x46 = vec_add(x34, x39);  // z0.r  z0.i  z4.r  z4.i
+    x47 = vec_sub(x34, x39);  // z8.r  z8.i  z12.r  z12.i
+
+    x48 = vec_perm(x30, x31, vcprm(2,1,s1,s2));
+    x49 = vec_perm(x30, x31, vcprm(0,3,s3,s0));
+    x50 = vec_add(x48, x49);
+    x51 = vec_sub(x48, x49);
+    x52 = vec_mul(x50, vc1);
+    x53 = vec_mul(x50, vc2);
+    x54 = vec_mul(x51, vc1);
+    x55 = vec_mul(x51, vc2);
+
+    x56 = vec_perm(x24, x25, vcprm(2,3,s2,s3));
+    x57 = vec_perm(x44, x45, vcprm(0,1,s1,s0));
+    x58 = vec_add(x56, x57);
+    x59 = vec_sub(x56, x57);
+
+    x60 = vec_perm(x54, x55, vcprm(1,0,3,2));       /* selectors carry no s prefix: only x54 contributes */
+    x61 = vec_perm(x54, x55, vcprm(s1,s0,s3,s2));   /* every selector is s*: only x55 contributes */
+    x62 = vec_add(x52, x61);
+    x63 = vec_sub(x52, x61);
+    x64 = vec_add(x60, x53);
+    x65 = vec_sub(x60, x53);
+    x66 = vec_perm(x62, x64, vcprm(0,1,s3,s2));
+    x67 = vec_perm(x63, x65, vcprm(s0,s1,3,2));
+
+    x68 = vec_add(x58, x66); // z1.r    z1.i  z3.r    z3.i
+    x69 = vec_sub(x58, x66); // z9.r    z9.i  z11.r  z11.i
+    x70 = vec_add(x59, x67); // z5.r    z5.i  z15.r  z15.i
+    x71 = vec_sub(x59, x67); // z13.r  z13.i z7.r   z7.i
+
+    x72 = vec_perm(x25, x27, vcprm(s1,s0,s2,s3));   /* every selector is s*: only x27 contributes */
+    x73 = vec_add(x25, x72);
+    x74 = vec_sub(x25, x72);
+    x75 = vec_perm(x73, x74, vcprm(0,1,s0,s1));
+    x76 = vec_perm(x44, x45, vcprm(3,2,s2,s3));
+    x77 = vec_add(x75, x76); // z2.r   z2.i    z6.r    z6.i
+    x78 = vec_sub(x75, x76); // z10.r  z10.i  z14.r  z14.i
+
+    x79 = vec_perm(x46, x68, vcprm(0,1,s0,s1)); // z0.r  z0.i  z1.r  z1.i
+    x80 = vec_perm(x77, x68, vcprm(0,1,s2,s3)); // z2.r  z2.i  z3.r  z3.i
+    x81 = vec_perm(x46, x70, vcprm(2,3,s0,s1)); // z4.r  z4.i  z5.r  z5.i
+    x82 = vec_perm(x71, x77, vcprm(s2,s3,2,3)); // z6.r  z6.i  z7.r  z7.i
+    vec_st(x79, 0, &(out[0]));
+    vec_st(x80, byte_2complex, &(out[0]));
+    vec_st(x81, byte_4complex, &(out[0]));
+    vec_st(x82, byte_6complex, &(out[0]));
+    x83 = vec_perm(x47, x69, vcprm(0,1,s0,s1)); // z8.r  z8.i  z9.r  z9.i
+    x84 = vec_perm(x78, x69, vcprm(0,1,s2,s3)); // z10.r  z10.i  z11.r  z11.i
+    x85 = vec_perm(x47, x71, vcprm(2,3,s0,s1)); // z12.r  z12.i  z13.r  z13.i
+    x86 = vec_perm(x70, x78, vcprm(s2,s3,2,3)); // z14.r  z14.i  z15.r  z15.i
+    vec_st(x83, byte_8complex, &(out[0]));
+    vec_st(x84, byte_10complex, &(out[0]));
+    vec_st(x85, byte_12complex, &(out[0]));
+    vec_st(x86, byte_14complex, &(out[0]));
+}
+
+inline static void fft4_vsx(FFTComplex *z)
+{   /* 4-point step on z[0..3]; same first round as fft4_vsx_interleave, but with different second-round selectors and one extra permute before the stores. */
+    vec_f a, b, c, d;
+    float* out=  (float*)z;
+    a = vec_ld(0, &(out[0]));               /* z[0], z[1] */
+    b = vec_ld(byte_2complex, &(out[0]));   /* z[2], z[3] */
+
+    c = vec_perm(a, b, vcprm(0,1,s2,s1));   /* first round */
+    d = vec_perm(a, b, vcprm(2,3,s0,s3));
+    a = vec_add(c, d);
+    b = vec_sub(c, d);
+
+    c = vec_perm(a,b, vcprm(0,s0,1,s1));    /* second round */
+    d = vec_perm(a, b, vcprm(2,s3,3,s2));
+
+    a = vec_add(c, d);
+    b = vec_sub(c, d);
+
+    c = vec_perm(a, b, vcprm(0,1,s0,s1));   /* low halves of a and b */
+    d = vec_perm(a, b, vcprm(2,3,s2,s3));   /* high halves of a and b */
+
+    vec_st(c, 0, &(out[0]));
+    vec_st(d, byte_2complex, &(out[0]));
+    return;
+}
+
+inline static void fft8_vsx(FFTComplex *z)
+{ /* Transforms, in place, the four vec_f vectors at z (byte offsets 0, byte_2complex, byte_4complex, byte_6complex). */
+    vec_f vz0, vz1, vz2, vz3;
+    vec_f vz4, vz5, vz6, vz7, vz8;
+
+    float* out=  (float*)z;
+    vec_f vc0 = {0.0, 0.0, 0.0, 0.0}; // zero addend: makes the first vec_madd below a plain multiply
+    vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
+    vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
+
+    vz0 = vec_ld(0, &(out[0])); // NOTE(review): plain vec_ld/vec_st are used, so z is presumably 16-byte aligned -- confirm with callers
+    vz1 = vec_ld(byte_2complex, &(out[0]));
+    vz2 = vec_ld(byte_4complex, &(out[0]));
+    vz3 = vec_ld(byte_6complex, &(out[0]));
+    /* Regroup both vector pairs, then the first add/sub stage. */
+    vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
+    vz7 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
+    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
+    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
+
+    vz2 = vec_add(vz6, vz7);
+    vz3 = vec_sub(vz6, vz7);
+    vz8 = vec_perm(vz3, vz3, vcprm(2,3,0,1)); // presumably vz3 with its two halves swapped -- confirm vcprm semantics
+
+    vz0 = vec_add(vz4, vz5);
+    vz1 = vec_sub(vz4, vz5);
+    /* vz3 = vz3*vc1 + vz8*vc2: a +-sqrthalf weighted combination of vz3 and its permuted copy. */
+    vz3 = vec_madd(vz3, vc1, vc0);
+    vz3 = vec_madd(vz8, vc2, vz3);
+
+    vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
+    vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
+    vz6 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
+    vz7 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));
+
+    vz0 = vec_add(vz4, vz5);
+    vz1 = vec_sub(vz4, vz5);
+    vz2 = vec_add(vz6, vz7);
+    vz3 = vec_sub(vz6, vz7);
+
+    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
+    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
+    vz6 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
+    vz7 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));
+
+    /* Last add/sub stage, then store the four result vectors over the input. */
+    vz2 = vec_sub(vz4, vz6);
+    vz3 = vec_sub(vz5, vz7);
+
+    vz0 = vec_add(vz4, vz6);
+    vz1 = vec_add(vz5, vz7);
+
+    vec_st(vz0, 0, &(out[0]));
+    vec_st(vz1, byte_2complex, &(out[0]));
+    vec_st(vz2, byte_4complex, &(out[0]));
+    vec_st(vz3, byte_6complex, &(out[0]));
+    return;
+}
+
+inline static void fft16_vsx(FFTComplex *z)
+{ /* Transforms, in place, the eight vec_f vectors at z (byte offsets 0 .. byte_14complex). */
+    float* out=  (float*)z;
+    vec_f vc0 = {0.0, 0.0, 0.0, 0.0}; // zero addend: makes vec_madd a plain multiply
+    vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
+    vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
+    vec_f vc3 = {1.0, 0.92387953, sqrthalf, 0.38268343}; // cos(k*pi/8), k = 0..3 (assuming sqrthalf is sqrt(1/2))
+    vec_f vc4 = {0.0, 0.38268343, sqrthalf, 0.92387953}; // sin(k*pi/8), k = 0..3
+    vec_f vc5 = {-0.0, -0.38268343, -sqrthalf, -0.92387953}; // -sin(k*pi/8), k = 0..3
+
+    vec_f vz0, vz1, vz2, vz3;
+    vec_f vz4, vz5, vz6, vz7;
+    vec_f vz8, vz9, vz10, vz11;
+    vec_f vz12, vz13;
+    /* Upper half (byte_8complex .. byte_14complex): same permute/add/sub sequence as fft4_vsx, applied to two vector pairs. */
+    vz0 = vec_ld(byte_8complex, &(out[0]));
+    vz1 = vec_ld(byte_10complex, &(out[0]));
+    vz2 = vec_ld(byte_12complex, &(out[0]));
+    vz3 = vec_ld(byte_14complex, &(out[0]));
+
+    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
+    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
+    vz6 = vec_perm(vz2, vz3, vcprm(0,1,s2,s1));
+    vz7 = vec_perm(vz2, vz3, vcprm(2,3,s0,s3));
+
+    vz0 = vec_add(vz4, vz5);
+    vz1= vec_sub(vz4, vz5);
+    vz2 = vec_add(vz6, vz7);
+    vz3 = vec_sub(vz6, vz7);
+
+    vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
+    vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
+    vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
+    vz7 = vec_perm(vz2, vz3, vcprm(2,s3,3,s2));
+
+    vz0 = vec_add(vz4, vz5);
+    vz1 = vec_sub(vz4, vz5);
+    vz2 = vec_add(vz6, vz7);
+    vz3 = vec_sub(vz6, vz7);
+
+    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
+    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
+
+    vz6 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
+    vz7 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));
+    /* Lower half (0 .. byte_6complex): same sequence as fft8_vsx; its results end up in vz0..vz3 (upper-half results stay in vz4..vz7). */
+    vz0 = vec_ld(0, &(out[0]));
+    vz1 = vec_ld(byte_2complex, &(out[0]));
+    vz2 = vec_ld(byte_4complex, &(out[0]));
+    vz3 = vec_ld(byte_6complex, &(out[0]));
+    vz10 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
+    vz11 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
+    vz8 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
+    vz9 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
+
+    vz2 = vec_add(vz10, vz11);
+    vz3 = vec_sub(vz10, vz11);
+    vz12 = vec_perm(vz3, vz3, vcprm(2,3,0,1));
+    vz0 = vec_add(vz8, vz9);
+    vz1 = vec_sub(vz8, vz9);
+
+    vz3 = vec_madd(vz3, vc1, vc0);
+    vz3 = vec_madd(vz12, vc2, vz3);
+    vz8 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
+    vz9 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
+    vz10 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
+    vz11 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));
+
+    vz0 = vec_add(vz8, vz9);
+    vz1 = vec_sub(vz8, vz9);
+    vz2 = vec_add(vz10, vz11);
+    vz3 = vec_sub(vz10, vz11);
+
+    vz8 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
+    vz9 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
+    vz10 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
+    vz11 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));
+
+    vz2 = vec_sub(vz8, vz10);
+    vz3 = vec_sub(vz9, vz11);
+    vz0 = vec_add(vz8, vz10);
+    vz1 = vec_add(vz9, vz11);
+    /* Weight the upper-half results: vz8..vz11 = vzA*vc3 + vzB*(vc4 or vc5), i.e. products with the cos/sin constants. */
+    vz8 = vec_madd(vz4, vc3, vc0);
+    vz9 = vec_madd(vz5, vc3, vc0);
+    vz10 = vec_madd(vz6, vc3, vc0);
+    vz11 = vec_madd(vz7, vc3, vc0);
+
+    vz8 = vec_madd(vz5, vc4, vz8);
+    vz9 = vec_madd(vz4, vc5, vz9);
+    vz10 = vec_madd(vz7, vc5, vz10);
+    vz11 = vec_madd(vz6, vc4, vz11);
+    /* Combine the weighted upper half with the lower half through add/sub pairs, then store all eight vectors over the input. */
+    vz12 = vec_sub(vz10, vz8);
+    vz10 = vec_add(vz10, vz8);
+
+    vz13 = vec_sub(vz9, vz11);
+    vz11 = vec_add(vz9, vz11);
+
+    vz4 = vec_sub(vz0, vz10);
+    vz0 = vec_add(vz0, vz10);
+
+    vz7= vec_sub(vz3, vz12);
+    vz3= vec_add(vz3, vz12);
+
+    vz5 = vec_sub(vz1, vz11);
+    vz1 = vec_add(vz1, vz11);
+
+    vz6 = vec_sub(vz2, vz13);
+    vz2 = vec_add(vz2, vz13);
+
+    vec_st(vz0, 0, &(out[0]));
+    vec_st(vz1, byte_2complex, &(out[0]));
+    vec_st(vz2, byte_4complex, &(out[0]));
+    vec_st(vz3, byte_6complex, &(out[0]));
+    vec_st(vz4, byte_8complex, &(out[0]));
+    vec_st(vz5, byte_10complex, &(out[0]));
+    vec_st(vz6, byte_12complex, &(out[0]));
+    vec_st(vz7, byte_14complex, &(out[0]));
+    return;
+
+}
+inline static void pass_vsx(FFTComplex * z, const FFTSample * wre, unsigned int n)
+{ /* Combines, in place, the four regions of z at complex offsets 0, 2n, 4n and 6n, using factors read from wre and wre+2n. */
+    int o1 = n<<1; // o1/o2/o3: offsets of the 2nd, 3rd and 4th region in FFTComplex elements (2n, 4n, 6n)
+    int o2 = n<<2;
+    int o3 = o1+o2;
+    int i1, i2, i3;
+    FFTSample* out = (FFTSample*)z;
+    const FFTSample *wim = wre+o1; // second factor pointer; it is walked backwards while wre is walked forwards
+    vec_f v0, v1, v2, v3;
+    vec_f v4, v5, v6, v7;
+    vec_f v8, v9, v10, v11;
+    vec_f v12, v13;
+
+    n = n-2; // first step is peeled off below. NOTE(review): assumes n is even and >= 4 on entry, otherwise the unsigned counter wraps at `n-=2` -- confirm with callers
+    i1 = o1*sizeof(FFTComplex); // i1/i2/i3: the same three offsets in bytes, as used by vec_ld/vec_st
+    i2 = o2*sizeof(FFTComplex);
+    i3 = o3*sizeof(FFTComplex);
+    /* Peeled first step: same statements as the loop body further down, without the pointer advance. */
+    v8 = vec_ld(0, &(wre[0]));
+    v10 = vec_ld(0, &(wim[0]));
+    v9 = vec_ld(0, &(wim[-4]));
+    v9 = vec_perm(v9, v10, vcprm(s0,3,2,1)); // presumably {wim[0], wim[-1], wim[-2], wim[-3]}, i.e. wim read in reverse -- confirm vcprm semantics
+
+    v4 = vec_ld(i2, &(out[0]));
+    v5 = vec_ld(i2+16, &(out[0]));
+    v6 = vec_ld(i3, &(out[0]));
+    v7 = vec_ld(i3+16, &(out[0]));
+    v10 = vec_mul(v4, v8); // r2*wre
+    v11 = vec_mul(v5, v8); // i2*wre
+    v12 = vec_mul(v6, v8); // r3*wre
+    v13 = vec_mul(v7, v8); // i3*wre
+
+    v0 = vec_ld(0, &(out[0])); // r0
+    v3 = vec_ld(i1+16, &(out[0])); // i1
+    v10 = vec_madd(v5, v9, v10); // r2*wim
+    v11 = vec_nmsub(v4, v9, v11); // i2*wim
+    v12 = vec_nmsub(v7, v9, v12); // r3*wim
+    v13 = vec_madd(v6, v9, v13); // i3*wim
+
+    v1 = vec_ld(16, &(out[0])); // i0
+    v2 = vec_ld(i1, &(out[0])); // r1
+    v8 = vec_sub(v12, v10);
+    v12 = vec_add(v12, v10);
+    v9 = vec_sub(v11, v13);
+    v13 = vec_add(v11, v13);
+    v4 = vec_sub(v0, v12);
+    v0 = vec_add(v0, v12);
+    v7 = vec_sub(v3, v8);
+    v3 = vec_add(v3, v8);
+
+    vec_st(v0, 0, &(out[0])); // r0
+    vec_st(v3, i1+16, &(out[0])); // i1
+    vec_st(v4, i2, &(out[0])); // r2
+    vec_st(v7, i3+16, &(out[0]));// i3
+
+    v5 = vec_sub(v1, v13);
+    v1 = vec_add(v1, v13);
+    v6 = vec_sub(v2, v9);
+    v2 = vec_add(v2, v9);
+
+    vec_st(v1, 16, &(out[0])); // i0
+    vec_st(v2, i1, &(out[0])); // r1
+    vec_st(v5, i2+16, &(out[0])); // i2
+    vec_st(v6, i3, &(out[0])); // r3
+    /* Remaining steps: each iteration advances out by 8 floats, wre by 4 and wim by -4, and consumes 2 from n. */
+    do {
+        out += 8;
+        wre += 4;
+        wim -= 4;
+
+        v8 = vec_ld(0, &(wre[0]));
+        v10 = vec_ld(0, &(wim[0]));
+        v9 = vec_ld(0, &(wim[-4]));
+        v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));
+
+        v4 = vec_ld(i2, &(out[0])); // r2
+        v5 = vec_ld(i2+16, &(out[0])); // i2
+        v6 = vec_ld(i3, &(out[0])); // r3
+        v7 = vec_ld(i3+16, &(out[0]));// i3
+        v10 = vec_mul(v4, v8); // r2*wre
+        v11 = vec_mul(v5, v8); // i2*wre
+        v12 = vec_mul(v6, v8); // r3*wre
+        v13 = vec_mul(v7, v8); // i3*wre
+
+        v0 = vec_ld(0, &(out[0])); // r0
+        v3 = vec_ld(i1+16, &(out[0])); // i1
+        v10 = vec_madd(v5, v9, v10); // r2*wim
+        v11 = vec_nmsub(v4, v9, v11); // i2*wim
+        v12 = vec_nmsub(v7, v9, v12); // r3*wim
+        v13 = vec_madd(v6, v9, v13); // i3*wim
+
+        v1 = vec_ld(16, &(out[0])); // i0
+        v2 = vec_ld(i1, &(out[0])); // r1
+        v8 = vec_sub(v12, v10);
+        v12 = vec_add(v12, v10);
+        v9 = vec_sub(v11, v13);
+        v13 = vec_add(v11, v13);
+        v4 = vec_sub(v0, v12);
+        v0 = vec_add(v0, v12);
+        v7 = vec_sub(v3, v8);
+        v3 = vec_add(v3, v8);
+
+        vec_st(v0, 0, &(out[0])); // r0
+        vec_st(v3, i1+16, &(out[0])); // i1
+        vec_st(v4, i2, &(out[0])); // r2
+        vec_st(v7, i3+16, &(out[0])); // i3
+
+        v5 = vec_sub(v1, v13);
+        v1 = vec_add(v1, v13);
+        v6 = vec_sub(v2, v9);
+        v2 = vec_add(v2, v9);
+
+        vec_st(v1, 16, &(out[0])); // i0
+        vec_st(v2, i1, &(out[0])); // r1
+        vec_st(v5, i2+16, &(out[0])); // i2
+        vec_st(v6, i3, &(out[0])); // r3
+    } while (n-=2);
+}
+
+#endif
+
+#endif /* AVCODEC_PPC_FFT_VSX_H */
diff --git a/libavcodec/ppc/fmtconvert_altivec.c b/libavcodec/ppc/fmtconvert_altivec.c
index 796c431..7323eff 100644
--- a/libavcodec/ppc/fmtconvert_altivec.c
+++ b/libavcodec/ppc/fmtconvert_altivec.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,7 +26,7 @@
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/fmtconvert.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 static void int32_to_float_fmul_scalar_altivec(float *dst, const int32_t *src,
                                                float mul, int len)
@@ -57,7 +57,7 @@ static void int32_to_float_fmul_scalar_altivec(float *dst, const int32_t *src,
 av_cold void ff_fmt_convert_init_ppc(FmtConvertContext *c,
                                      AVCodecContext *avctx)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/h264chroma_init.c b/libavcodec/ppc/h264chroma_init.c
index 178f239..876efec 100644
--- a/libavcodec/ppc/h264chroma_init.c
+++ b/libavcodec/ppc/h264chroma_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/h264chroma.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 #define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
 #define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s)
 
@@ -50,7 +50,7 @@
 
 av_cold void ff_h264chroma_init_ppc(H264ChromaContext *c, int bit_depth)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     const int high_bit_depth = bit_depth > 8;
 
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
diff --git a/libavcodec/ppc/h264chroma_template.c b/libavcodec/ppc/h264chroma_template.c
index 293fef5..cb1e095 100644
--- a/libavcodec/ppc/h264chroma_template.c
+++ b/libavcodec/ppc/h264chroma_template.c
@@ -1,30 +1,32 @@
 /*
  * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/mem.h"
+#include "libavutil/ppc/types_altivec.h"
+#include "libavutil/ppc/util_altivec.h"
 
 /* this code assume that stride % 16 == 0 */
 
 #define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
-        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
-        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
+        vsrc2ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc2uc);\
+        vsrc3ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc3uc);\
 \
         psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
         psum = vec_mladd(vB, vsrc1ssH, psum);\
@@ -49,8 +51,8 @@
 
 #define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
 \
-        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
-        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
+        vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc0uc);\
+        vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc1uc);\
 \
         psum = vec_mladd(vA, vsrc0ssH, v32ss);\
         psum = vec_mladd(vE, vsrc1ssH, psum);\
@@ -70,6 +72,43 @@
 #define noop(a) a
 #define add28(a) vec_add(v28ss, a)
 
+#if HAVE_BIGENDIAN /* big endian: aligned vec_ld + vec_perm; these variants read loadSecond / reallyBadAlign from the expansion site */
+#define GET_VSRC1(vs0, off, b, perm0, s){    \
+    vec_u8 vsrcCuc, vsrcDuc;                 \
+    vsrcCuc = vec_ld(off, s);                \
+    if (loadSecond){                         \
+        vsrcDuc = vec_ld(off + b, s);        \
+    } else                                   \
+        vsrcDuc = vsrcCuc;                   \
+                                             \
+    vs0 = vec_perm(vsrcCuc, vsrcDuc, perm0); \
+}
+#define GET_VSRC(vs0, vs1, off, b, perm0, perm1, s){ \
+    vec_u8 vsrcCuc, vsrcDuc;                         \
+    vsrcCuc = vec_ld(off, s);                        \
+    if (loadSecond){                                 \
+        vsrcDuc = vec_ld(off + b, s);                \
+    } else                                           \
+        vsrcDuc = vsrcCuc;                           \
+                                                     \
+    vs0 = vec_perm(vsrcCuc, vsrcDuc, perm0);         \
+    if (reallyBadAlign){                             \
+        vs1 = vsrcDuc;                               \
+    } else                                           \
+        vs1 = vec_perm(vsrcCuc, vsrcDuc, perm1);     \
+ }
+
+#else /* little endian: vec_vsx_ld straight from off (and off + 1 for vs1); the b and perm arguments are not used */
+
+#define GET_VSRC1(vs0, off, b, perm0, s){            \
+    vs0 = vec_vsx_ld(off, s);                        \
+ }
+#define GET_VSRC(vs0, vs1, off, b, perm0, perm1, s){ \
+    vs0 = vec_vsx_ld(off, s);                        \
+    vs1 = vec_vsx_ld(off + 1, s);                    \
+ }
+#endif /* HAVE_BIGENDIAN */
+
 #ifdef PREFIX_h264_chroma_mc8_altivec
 static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                     int stride, int h, int x, int y) {
@@ -80,23 +119,27 @@ static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                          ((    x) * (    y))};
     register int i;
     vec_u8 fperm;
-    const vec_s32 vABCD = vec_ld(0, ABCD);
-    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
-    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
-    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
-    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
     LOAD_ZERO;
+    const vec_s32 vABCD = vec_ld(0, ABCD);
+    const vec_s16 vA = VEC_SPLAT16(vABCD, 1);
+    const vec_s16 vB = VEC_SPLAT16(vABCD, 3);
+    const vec_s16 vC = VEC_SPLAT16(vABCD, 5);
+    const vec_s16 vD = VEC_SPLAT16(vABCD, 7);
     const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
     const vec_u16 v6us = vec_splat_u16(6);
-    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
-    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
 
-    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
+    vec_u8 vsrcperm0, vsrcperm1;
     vec_u8 vsrc0uc, vsrc1uc;
     vec_s16 vsrc0ssH, vsrc1ssH;
-    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
+    vec_u8 vsrc2uc, vsrc3uc;
     vec_s16 vsrc2ssH, vsrc3ssH, psum;
     vec_u8 vdst, ppsum, vfdst, fsum;
+#if HAVE_BIGENDIAN
+    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
+    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
+    vsrcperm0 = vec_lvsl(0, src);
+    vsrcperm1 = vec_lvsl(1, src);
+#endif
 
     if (((unsigned long)dst) % 16 == 0) {
         fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
@@ -110,89 +153,28 @@ static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                          0x1C, 0x1D, 0x1E, 0x1F};
     }
 
-    vsrcAuc = vec_ld(0, src);
-
-    if (loadSecond)
-        vsrcBuc = vec_ld(16, src);
-    vsrcperm0 = vec_lvsl(0, src);
-    vsrcperm1 = vec_lvsl(1, src);
-
-    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
-    if (reallyBadAlign)
-        vsrc1uc = vsrcBuc;
-    else
-        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
+    GET_VSRC(vsrc0uc, vsrc1uc, 0, 16, vsrcperm0, vsrcperm1, src);
 
-    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
-    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);
+    vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc0uc);
+    vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc1uc);
 
     if (ABCD[3]) {
-        if (!loadSecond) {// -> !reallyBadAlign
-            for (i = 0 ; i < h ; i++) {
-                vsrcCuc = vec_ld(stride + 0, src);
-                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
-                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
-
-                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
-            }
-        } else {
-            vec_u8 vsrcDuc;
-            for (i = 0 ; i < h ; i++) {
-                vsrcCuc = vec_ld(stride + 0, src);
-                vsrcDuc = vec_ld(stride + 16, src);
-                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
-                if (reallyBadAlign)
-                    vsrc3uc = vsrcDuc;
-                else
-                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
-
-                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
-            }
+        for (i = 0 ; i < h ; i++) {
+            GET_VSRC(vsrc2uc, vsrc3uc, stride, 16, vsrcperm0, vsrcperm1, src);
+            CHROMA_MC8_ALTIVEC_CORE(v32ss, noop);
         }
     } else {
         const vec_s16 vE = vec_add(vB, vC);
         if (ABCD[2]) { // x == 0 B == 0
-            if (!loadSecond) {// -> !reallyBadAlign
-                for (i = 0 ; i < h ; i++) {
-                    vsrcCuc = vec_ld(stride + 0, src);
-                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
-                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
-
-                    vsrc0uc = vsrc1uc;
-                }
-            } else {
-                vec_u8 vsrcDuc;
-                for (i = 0 ; i < h ; i++) {
-                    vsrcCuc = vec_ld(stride + 0, src);
-                    vsrcDuc = vec_ld(stride + 15, src);
-                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
-                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
-
-                    vsrc0uc = vsrc1uc;
-                }
+            for (i = 0 ; i < h ; i++) {
+                GET_VSRC1(vsrc1uc, stride, 15, vsrcperm0, src);
+                CHROMA_MC8_ALTIVEC_CORE_SIMPLE;
+                vsrc0uc = vsrc1uc;
             }
         } else { // y == 0 C == 0
-            if (!loadSecond) {// -> !reallyBadAlign
-                for (i = 0 ; i < h ; i++) {
-                    vsrcCuc = vec_ld(0, src);
-                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
-                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
-
-                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
-                }
-            } else {
-                vec_u8 vsrcDuc;
-                for (i = 0 ; i < h ; i++) {
-                    vsrcCuc = vec_ld(0, src);
-                    vsrcDuc = vec_ld(15, src);
-                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
-                    if (reallyBadAlign)
-                        vsrc1uc = vsrcDuc;
-                    else
-                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
-
-                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
-                }
+            for (i = 0 ; i < h ; i++) {
+               GET_VSRC(vsrc0uc, vsrc1uc, 0, 15, vsrcperm0, vsrcperm1, src);
+               CHROMA_MC8_ALTIVEC_CORE_SIMPLE;
             }
         }
     }
@@ -209,23 +191,27 @@ static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, i
                          ((    x) * (    y))};
     register int i;
     vec_u8 fperm;
-    const vec_s32 vABCD = vec_ld(0, ABCD);
-    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
-    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
-    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
-    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
     LOAD_ZERO;
+    const vec_s32 vABCD = vec_ld(0, ABCD);
+    const vec_s16 vA = VEC_SPLAT16(vABCD, 1);
+    const vec_s16 vB = VEC_SPLAT16(vABCD, 3);
+    const vec_s16 vC = VEC_SPLAT16(vABCD, 5);
+    const vec_s16 vD = VEC_SPLAT16(vABCD, 7);
     const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
     const vec_u16 v6us  = vec_splat_u16(6);
-    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
-    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
 
-    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
+    vec_u8 vsrcperm0, vsrcperm1;
     vec_u8 vsrc0uc, vsrc1uc;
     vec_s16 vsrc0ssH, vsrc1ssH;
-    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
+    vec_u8 vsrc2uc, vsrc3uc;
     vec_s16 vsrc2ssH, vsrc3ssH, psum;
     vec_u8 vdst, ppsum, vfdst, fsum;
+#if HAVE_BIGENDIAN
+    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
+    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
+    vsrcperm0 = vec_lvsl(0, src);
+    vsrcperm1 = vec_lvsl(1, src);
+#endif
 
     if (((unsigned long)dst) % 16 == 0) {
         fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
@@ -239,47 +225,14 @@ static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, i
                          0x1C, 0x1D, 0x1E, 0x1F};
     }
 
-    vsrcAuc = vec_ld(0, src);
-
-    if (loadSecond)
-        vsrcBuc = vec_ld(16, src);
-    vsrcperm0 = vec_lvsl(0, src);
-    vsrcperm1 = vec_lvsl(1, src);
-
-    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
-    if (reallyBadAlign)
-        vsrc1uc = vsrcBuc;
-    else
-        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
-
-    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
-    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);
-
-    if (!loadSecond) {// -> !reallyBadAlign
-        for (i = 0 ; i < h ; i++) {
-
-
-            vsrcCuc = vec_ld(stride + 0, src);
+    GET_VSRC(vsrc0uc, vsrc1uc, 0, 16, vsrcperm0, vsrcperm1, src);
 
-            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
-            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
+    vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc0uc);
+    vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc1uc);
 
-            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
-        }
-    } else {
-        vec_u8 vsrcDuc;
-        for (i = 0 ; i < h ; i++) {
-            vsrcCuc = vec_ld(stride + 0, src);
-            vsrcDuc = vec_ld(stride + 16, src);
-
-            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
-            if (reallyBadAlign)
-                vsrc3uc = vsrcDuc;
-            else
-                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
-
-            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
-        }
+    for (i = 0 ; i < h ; i++) {
+        GET_VSRC(vsrc2uc, vsrc3uc, stride, 16, vsrcperm0, vsrcperm1, src);
+        CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28);
     }
 }
 #endif
diff --git a/libavcodec/ppc/h264dsp.c b/libavcodec/ppc/h264dsp.c
index ce514e3..1bc8058 100644
--- a/libavcodec/ppc/h264dsp.c
+++ b/libavcodec/ppc/h264dsp.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,7 +34,7 @@
 #include "libavcodec/h264.h"
 #include "libavcodec/h264dsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 /****************************************************************************
  * IDCT transform:
@@ -68,10 +68,17 @@
     b2 = vec_mergeh( a1, a3 ); \
     b3 = vec_mergel( a1, a3 )
 
+#if HAVE_BIGENDIAN /* NOTE(review): both variants ignore parameter d and use dst, vdst (plus vdst_orig, vdst_mask on big endian) from the expansion site */
+#define vdst_load(d)              \
+    vdst_orig = vec_ld(0, dst);   \
+    vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask);
+#else
+#define vdst_load(d) vdst = vec_vsx_ld(0, dst)
+#endif
+
 #define VEC_LOAD_U8_ADD_S16_STORE_U8(va)                      \
-    vdst_orig = vec_ld(0, dst);                               \
-    vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask);          \
-    vdst_ss = (vec_s16) vec_mergeh(zero_u8v, vdst);         \
+    vdst_load();                                              \
+    vdst_ss = (vec_s16) VEC_MERGEH(zero_u8v, vdst);           \
     va = vec_add(va, vdst_ss);                                \
     va_u8 = vec_packsu(va, zero_s16v);                        \
     va_u32 = vec_splat((vec_u32)va_u8, 0);                  \
@@ -171,26 +178,43 @@ static void h264_idct_add_altivec(uint8_t *dst, int16_t *block, int stride)
     d7 = vec_sub(b0v, b7v); \
 }
 
+#if HAVE_BIGENDIAN /* big endian: lvsl/lvsr permute vectors plus aligned vec_ld/vec_st; dstv_load declares hv, lv and dstv in the expanding scope */
+#define GET_2PERM(ldv, stv, d)  \
+    ldv = vec_lvsl(0, d);       \
+    stv = vec_lvsr(8, d);
+#define dstv_load(d)            \
+    vec_u8 hv = vec_ld( 0, d ); \
+    vec_u8 lv = vec_ld( 7, d);  \
+    vec_u8 dstv   = vec_perm( hv, lv, (vec_u8)perm_ldv );
+#define dest_unligned_store(d)                                 \
+    vec_u8 edgehv;                                             \
+    vec_u8 bodyv  = vec_perm( idstsum8, idstsum8, perm_stv );  \
+    vec_u8 edgelv = vec_perm( sel, zero_u8v, perm_stv );       \
+    lv    = vec_sel( lv, bodyv, edgelv );                      \
+    vec_st( lv, 7, d );                                        \
+    hv    = vec_ld( 0, d );                                    \
+    edgehv = vec_perm( zero_u8v, sel, perm_stv );              \
+    hv    = vec_sel( hv, bodyv, edgehv );                      \
+    vec_st( hv, 0, d );
+#else
+/* little endian: vec_vsx_ld / vec_vsx_st on d directly; GET_2PERM is an empty block and only dstv is declared by dstv_load */
+#define GET_2PERM(ldv, stv, d) {}
+#define dstv_load(d) vec_u8 dstv = vec_vsx_ld(0, d)
+#define dest_unligned_store(d)\
+    vec_u8 dst8 = vec_perm((vec_u8)idstsum8, dstv, vcprm(2,3,s2,s3));\
+    vec_vsx_st(dst8, 0, d)
+#endif /* HAVE_BIGENDIAN */
+
 #define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \
     /* unaligned load */                                       \
-    vec_u8 hv = vec_ld( 0, dest );                           \
-    vec_u8 lv = vec_ld( 7, dest );                           \
-    vec_u8 dstv   = vec_perm( hv, lv, (vec_u8)perm_ldv );  \
+    dstv_load(dest);                                           \
     vec_s16 idct_sh6 = vec_sra(idctv, sixv);                 \
-    vec_u16 dst16 = (vec_u16)vec_mergeh(zero_u8v, dstv);   \
+    vec_u16 dst16 = (vec_u16)VEC_MERGEH(zero_u8v, dstv);   \
     vec_s16 idstsum = vec_adds(idct_sh6, (vec_s16)dst16);  \
     vec_u8 idstsum8 = vec_packsu(zero_s16v, idstsum);        \
-    vec_u8 edgehv;                                           \
     /* unaligned store */                                      \
-    vec_u8 bodyv  = vec_perm( idstsum8, idstsum8, perm_stv );\
-    vec_u8 edgelv = vec_perm( sel, zero_u8v, perm_stv );     \
-    lv    = vec_sel( lv, bodyv, edgelv );                      \
-    vec_st( lv, 7, dest );                                     \
-    hv    = vec_ld( 0, dest );                                 \
-    edgehv = vec_perm( zero_u8v, sel, perm_stv );              \
-    hv    = vec_sel( hv, bodyv, edgehv );                      \
-    vec_st( hv, 0, dest );                                     \
- }
+    dest_unligned_store(dest);\
+}
 
 static void h264_idct8_add_altivec(uint8_t *dst, int16_t *dct, int stride)
 {
@@ -198,8 +222,8 @@ static void h264_idct8_add_altivec(uint8_t *dst, int16_t *dct, int stride)
     vec_s16 d0, d1, d2, d3, d4, d5, d6, d7;
     vec_s16 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;
 
-    vec_u8 perm_ldv = vec_lvsl(0, dst);
-    vec_u8 perm_stv = vec_lvsr(8, dst);
+    vec_u8 perm_ldv, perm_stv;
+    GET_2PERM(perm_ldv, perm_stv, dst);
 
     const vec_u16 onev = vec_splat_u16(1);
     const vec_u16 twov = vec_splat_u16(2);
@@ -238,32 +262,41 @@ static void h264_idct8_add_altivec(uint8_t *dst, int16_t *dct, int stride)
     ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
 }
 
+#if HAVE_BIGENDIAN /* DST_LD: vector load used for the dst rows in h264_idct_dc_add_internal() */
+#define DST_LD vec_ld /* big-endian build: AltiVec vec_ld */
+#else
+#define DST_LD vec_vsx_ld /* little-endian build: VSX vec_vsx_ld */
+#endif
 static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, int16_t *block, int stride, int size)
 {
     vec_s16 dc16;
     vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
+    vec_s32 v_dc32;
     LOAD_ZERO;
     DECLARE_ALIGNED(16, int, dc);
     int i;
 
     dc = (block[0] + 32) >> 6;
     block[0] = 0;
-    dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1);
+    v_dc32 = vec_lde(0, &dc);
+    dc16 = VEC_SPLAT16((vec_s16)v_dc32, 1);
 
     if (size == 4)
-        dc16 = vec_sld(dc16, zero_s16v, 8);
+        dc16 = VEC_SLD16(dc16, zero_s16v, 8);
     dcplus = vec_packsu(dc16, zero_s16v);
     dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v);
 
+#if HAVE_BIGENDIAN
     aligner = vec_lvsr(0, dst);
     dcplus = vec_perm(dcplus, dcplus, aligner);
     dcminus = vec_perm(dcminus, dcminus, aligner);
+#endif
 
     for (i = 0; i < size; i += 4) {
-        v0 = vec_ld(0, dst+0*stride);
-        v1 = vec_ld(0, dst+1*stride);
-        v2 = vec_ld(0, dst+2*stride);
-        v3 = vec_ld(0, dst+3*stride);
+        v0 = DST_LD(0, dst+0*stride);
+        v1 = DST_LD(0, dst+1*stride);
+        v2 = DST_LD(0, dst+2*stride);
+        v3 = DST_LD(0, dst+3*stride);
 
         v0 = vec_adds(v0, dcplus);
         v1 = vec_adds(v1, dcplus);
@@ -275,10 +308,10 @@ static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, int16_t *bl
         v2 = vec_subs(v2, dcminus);
         v3 = vec_subs(v3, dcminus);
 
-        vec_st(v0, 0, dst+0*stride);
-        vec_st(v1, 0, dst+1*stride);
-        vec_st(v2, 0, dst+2*stride);
-        vec_st(v3, 0, dst+3*stride);
+        VEC_ST(v0, 0, dst+0*stride);
+        VEC_ST(v1, 0, dst+1*stride);
+        VEC_ST(v2, 0, dst+2*stride);
+        VEC_ST(v3, 0, dst+3*stride);
 
         dst += 4*stride;
     }
@@ -497,7 +530,7 @@ static inline vec_u8 h264_deblock_q1(register vec_u8 p0,
 
     register vec_u8 average = vec_avg(p0, q0);
     register vec_u8 temp;
-    register vec_u8 uncliped;
+    register vec_u8 unclipped;
     register vec_u8 ones;
     register vec_u8 max;
     register vec_u8 min;
@@ -507,10 +540,10 @@ static inline vec_u8 h264_deblock_q1(register vec_u8 p0,
     average = vec_avg(average, p2);     /*avg(p2, avg(p0, q0)) */
     ones = vec_splat_u8(1);
     temp = vec_and(temp, ones);         /*(p2^avg(p0, q0)) & 1 */
-    uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */
+    unclipped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */
     max = vec_adds(p1, tc0);
     min = vec_subs(p1, tc0);
-    newp1 = vec_max(min, uncliped);
+    newp1 = vec_max(min, unclipped);
     newp1 = vec_min(max, newp1);
     return newp1;
 }
@@ -639,6 +672,9 @@ void weight_h264_W_altivec(uint8_t *block, int stride, int height,
     temp[2] = offset;
 
     vtemp = (vec_s16)vec_ld(0, temp);
+#if !HAVE_BIGENDIAN
+    vtemp =(vec_s16)vec_perm(vtemp, vtemp, vcswapi2s(0,1,2,3));
+#endif
     vlog2_denom = (vec_u16)vec_splat(vtemp, 1);
     vweight = vec_splat(vtemp, 3);
     voffset = vec_splat(vtemp, 5);
@@ -647,8 +683,8 @@ void weight_h264_W_altivec(uint8_t *block, int stride, int height,
     for (y = 0; y < height; y++) {
         vblock = vec_ld(0, block);
 
-        v0 = (vec_s16)vec_mergeh(zero_u8v, vblock);
-        v1 = (vec_s16)vec_mergel(zero_u8v, vblock);
+        v0 = (vec_s16)VEC_MERGEH(zero_u8v, vblock);
+        v1 = (vec_s16)VEC_MERGEL(zero_u8v, vblock);
 
         if (w == 16 || aligned) {
             v0 = vec_mladd(v0, vweight, zero_s16v);
@@ -685,6 +721,9 @@ void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height,
     temp[3] = offset;
 
     vtemp = (vec_s16)vec_ld(0, temp);
+#if !HAVE_BIGENDIAN
+    vtemp =(vec_s16)vec_perm(vtemp, vtemp, vcswapi2s(0,1,2,3));
+#endif
     vlog2_denom = (vec_u16)vec_splat(vtemp, 1);
     vweights = vec_splat(vtemp, 3);
     vweightd = vec_splat(vtemp, 5);
@@ -696,10 +735,10 @@ void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height,
         vdst = vec_ld(0, dst);
         vsrc = vec_ld(0, src);
 
-        v0 = (vec_s16)vec_mergeh(zero_u8v, vdst);
-        v1 = (vec_s16)vec_mergel(zero_u8v, vdst);
-        v2 = (vec_s16)vec_mergeh(zero_u8v, vsrc);
-        v3 = (vec_s16)vec_mergel(zero_u8v, vsrc);
+        v0 = (vec_s16)VEC_MERGEH(zero_u8v, vdst);
+        v1 = (vec_s16)VEC_MERGEL(zero_u8v, vdst);
+        v2 = (vec_s16)VEC_MERGEH(zero_u8v, vsrc);
+        v3 = (vec_s16)VEC_MERGEL(zero_u8v, vsrc);
 
         if (w == 8) {
             if (src_aligned)
@@ -751,7 +790,7 @@ H264_WEIGHT( 8)
 av_cold void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth,
                                  const int chroma_format_idc)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/h264qpel.c b/libavcodec/ppc/h264qpel.c
index 92c86f3..575f504 100644
--- a/libavcodec/ppc/h264qpel.c
+++ b/libavcodec/ppc/h264qpel.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,7 +28,7 @@
 #include "libavcodec/h264qpel.h"
 #include "hpeldsp_altivec.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 #define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
 #define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s)
@@ -191,86 +191,79 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, cons
     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
 }\
 
+#if HAVE_BIGENDIAN /* put_unligned_store(s, dest): store the 16 bytes of s at dest */
+#define put_unligned_store(s, dest) { /* uses the caller's tmp1, tmp2, mask, edges, align */ \
+    tmp1 = vec_ld(0, dest);              /* aligned vector holding dest[0] */ \
+    mask = vec_lvsl(0, dest);            \
+    tmp2 = vec_ld(15, dest);             /* aligned vector holding dest[15] */ \
+    edges = vec_perm(tmp2, tmp1, mask);  /* existing bytes around the 16-byte window, to be kept */ \
+    align = vec_lvsr(0, dest);           \
+    tmp2 = vec_perm(s, edges, align);    /* new upper vector: tail of s + kept bytes */ \
+    tmp1 = vec_perm(edges, s, align);    /* new lower vector: kept bytes + head of s */ \
+    vec_st(tmp2, 15, dest);              \
+    vec_st(tmp1, 0 , dest);              \
+ }
+#else /* little-endian build: a single vec_vsx_st, no scratch vectors needed */
+#define put_unligned_store(s, dest) vec_vsx_st(s, 0, dest); /* NOTE: expansion already ends in ';' */
+#endif /* HAVE_BIGENDIAN */
+
 static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
                                     const uint8_t * src2, int dst_stride,
                                     int src_stride1, int h)
 {
     int i;
-    vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align;
-
+    vec_u8 a, b, d, mask_; /* mask_ is only assigned in the big-endian build */
+#if HAVE_BIGENDIAN
+    vec_u8 tmp1, tmp2, mask, edges, align; /* scratch vectors used by put_unligned_store() */
     mask_ = vec_lvsl(0, src2);
+#endif
 
     for (i = 0; i < h; i++) {
-
-        tmp1 = vec_ld(i * src_stride1, src1);
-        mask = vec_lvsl(i * src_stride1, src1);
-        tmp2 = vec_ld(i * src_stride1 + 15, src1);
-
-        a = vec_perm(tmp1, tmp2, mask);
-
-        tmp1 = vec_ld(i * 16, src2);
-        tmp2 = vec_ld(i * 16 + 15, src2);
-
-        b = vec_perm(tmp1, tmp2, mask_);
-
-        tmp1 = vec_ld(0, dst);
-        mask = vec_lvsl(0, dst);
-        tmp2 = vec_ld(15, dst);
-
+        a = unaligned_load(i * src_stride1, src1); /* row i of src1 (rows src_stride1 bytes apart) */
+        b = load_with_perm_vec(i * 16, src2, mask_); /* row i of src2 (rows 16 bytes apart) */
         d = vec_avg(a, b);
-
-        edges = vec_perm(tmp2, tmp1, mask);
-
-        align = vec_lvsr(0, dst);
-
-        tmp2 = vec_perm(d, edges, align);
-        tmp1 = vec_perm(edges, d, align);
-
-        vec_st(tmp2, 15, dst);
-        vec_st(tmp1, 0 , dst);
-
+        put_unligned_store(d, dst); /* write the 16 averaged bytes to the current dst row */
         dst += dst_stride;
     }
 }
 
+#if HAVE_BIGENDIAN /* avg_unligned_store(s, dest): dest[0..15] = vec_avg(dest[0..15], s) */
+#define avg_unligned_store(s, dest){            /* uses caller's tmp1, tmp2, mask, edges, align; overwrites a */ \
+    tmp1 = vec_ld(0, dest);                     \
+    mask = vec_lvsl(0, dest);                   \
+    tmp2 = vec_ld(15, dest);                    \
+    a = vec_avg(vec_perm(tmp1, tmp2, mask), s); /* average s with the bytes currently at dest */ \
+    edges = vec_perm(tmp2, tmp1, mask);         /* existing bytes around the window, to be kept */ \
+    align = vec_lvsr(0, dest);                  \
+    tmp2 = vec_perm(a, edges, align);           \
+    tmp1 = vec_perm(edges, a, align);           \
+    vec_st(tmp2, 15, dest);                     \
+    vec_st(tmp1, 0 , dest);                     \
+ }
+#else /* little-endian build: VSX unaligned load/store */
+#define avg_unligned_store(s, dest){            /* overwrites the caller's a */ \
+    a = vec_avg(vec_vsx_ld(0, dest), s);        /* use the dest parameter, not the caller's dst */ \
+    vec_vsx_st(a, 0, dest);                     \
+ }
+#endif /* HAVE_BIGENDIAN */
+
 static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
                                     const uint8_t * src2, int dst_stride,
                                     int src_stride1, int h)
 {
     int i;
-    vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align;
+    vec_u8 a, b, d, mask_; /* mask_ is only assigned in the big-endian build */
 
+#if HAVE_BIGENDIAN
+    vec_u8 tmp1, tmp2, mask, edges, align; /* scratch vectors used by avg_unligned_store() */
     mask_ = vec_lvsl(0, src2);
+#endif
 
     for (i = 0; i < h; i++) {
-
-        tmp1 = vec_ld(i * src_stride1, src1);
-        mask = vec_lvsl(i * src_stride1, src1);
-        tmp2 = vec_ld(i * src_stride1 + 15, src1);
-
-        a = vec_perm(tmp1, tmp2, mask);
-
-        tmp1 = vec_ld(i * 16, src2);
-        tmp2 = vec_ld(i * 16 + 15, src2);
-
-        b = vec_perm(tmp1, tmp2, mask_);
-
-        tmp1 = vec_ld(0, dst);
-        mask = vec_lvsl(0, dst);
-        tmp2 = vec_ld(15, dst);
-
-        d = vec_avg(vec_perm(tmp1, tmp2, mask), vec_avg(a, b));
-
-        edges = vec_perm(tmp2, tmp1, mask);
-
-        align = vec_lvsr(0, dst);
-
-        tmp2 = vec_perm(d, edges, align);
-        tmp1 = vec_perm(edges, d, align);
-
-        vec_st(tmp2, 15, dst);
-        vec_st(tmp1, 0 , dst);
-
+        a = unaligned_load(i * src_stride1, src1); /* row i of src1 (rows src_stride1 bytes apart) */
+        b = load_with_perm_vec(i * 16, src2, mask_); /* row i of src2 (rows 16 bytes apart) */
+        d = vec_avg(a, b); /* average of the two sources */
+        avg_unligned_store(d, dst); /* averages d with the current dst row and stores it; the macro overwrites a */
         dst += dst_stride;
     }
 }
@@ -286,7 +279,7 @@ H264_MC(avg_, 16, altivec)
 
 av_cold void ff_h264qpel_init_ppc(H264QpelContext *c, int bit_depth)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     const int high_bit_depth = bit_depth > 8;
 
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
diff --git a/libavcodec/ppc/h264qpel_template.c b/libavcodec/ppc/h264qpel_template.c
index fe83146..5ff72e3 100644
--- a/libavcodec/ppc/h264qpel_template.c
+++ b/libavcodec/ppc/h264qpel_template.c
@@ -1,30 +1,104 @@
 /*
  * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "libavutil/mem.h"
+#include "config.h"
+#if HAVE_UNISTD_H
+#include <unistd.h>
+#endif
 
-#ifdef DEBUG
-#define ASSERT_ALIGNED(ptr) assert(((unsigned long)ptr&0x0000000F));
+#include "libavutil/avassert.h"
+#include "libavutil/mem.h"
+#include "libavutil/ppc/types_altivec.h"
+#include "libavutil/ppc/util_altivec.h"
+
+#define ASSERT_ALIGNED(ptr) av_assert2(!((uintptr_t)(ptr)&0x0000000F)); /* asserts ptr is 16-byte aligned; (ptr) is parenthesized so expression arguments are cast as a whole */
+
+#if HAVE_BIGENDIAN /* load_alignment: fill the caller's srcM2, srcM1, srcP0..srcP3 with the 16-byte vectors at s-2 .. s+3 */
+#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){ /* pm2..pp3 are the permute vectors for offsets -2..+3 */\
+    vec_u8 srcR1 = vec_ld(-2, s); /* aligned vector holding s[-2] */\
+    vec_u8 srcR2 = vec_ld(14, s); /* aligned vector holding s[14] */\
+    switch (ali) { /* NOTE(review): ali is presumably the 16-byte misalignment of s-2 -- confirm at the call sites */\
+    default: { /* all six vectors are permuted out of srcR1:srcR2 */\
+        srcM2 = vec_perm(srcR1, srcR2, pm2);\
+        srcM1 = vec_perm(srcR1, srcR2, pm1);\
+        srcP0 = vec_perm(srcR1, srcR2, pp0);\
+        srcP1 = vec_perm(srcR1, srcR2, pp1);\
+        srcP2 = vec_perm(srcR1, srcR2, pp2);\
+        srcP3 = vec_perm(srcR1, srcR2, pp3);\
+    } break;\
+    case 11: { /* srcP3 is srcR2 itself (presumably s+3 is 16-byte aligned here -- confirm) */\
+        srcM2 = vec_perm(srcR1, srcR2, pm2);\
+        srcM1 = vec_perm(srcR1, srcR2, pm1);\
+        srcP0 = vec_perm(srcR1, srcR2, pp0);\
+        srcP1 = vec_perm(srcR1, srcR2, pp1);\
+        srcP2 = vec_perm(srcR1, srcR2, pp2);\
+        srcP3 = srcR2;\
+    } break;\
+    case 12: { /* srcP2 is srcR2 itself; srcP3 needs a third vector */\
+        vec_u8 srcR3 = vec_ld(30, s); /* aligned vector holding s[30] */\
+        srcM2 = vec_perm(srcR1, srcR2, pm2);\
+        srcM1 = vec_perm(srcR1, srcR2, pm1);\
+        srcP0 = vec_perm(srcR1, srcR2, pp0);\
+        srcP1 = vec_perm(srcR1, srcR2, pp1);\
+        srcP2 = srcR2;\
+        srcP3 = vec_perm(srcR2, srcR3, pp3);\
+    } break;\
+    case 13: { /* srcP1 is srcR2 itself; srcP2 and srcP3 come from srcR2:srcR3 */\
+        vec_u8 srcR3 = vec_ld(30, s);\
+        srcM2 = vec_perm(srcR1, srcR2, pm2);\
+        srcM1 = vec_perm(srcR1, srcR2, pm1);\
+        srcP0 = vec_perm(srcR1, srcR2, pp0);\
+        srcP1 = srcR2;\
+        srcP2 = vec_perm(srcR2, srcR3, pp2);\
+        srcP3 = vec_perm(srcR2, srcR3, pp3);\
+    } break;\
+    case 14: { /* srcP0 is srcR2 itself; srcP1..srcP3 come from srcR2:srcR3 */\
+        vec_u8 srcR3 = vec_ld(30, s);\
+        srcM2 = vec_perm(srcR1, srcR2, pm2);\
+        srcM1 = vec_perm(srcR1, srcR2, pm1);\
+        srcP0 = srcR2;\
+        srcP1 = vec_perm(srcR2, srcR3, pp1);\
+        srcP2 = vec_perm(srcR2, srcR3, pp2);\
+        srcP3 = vec_perm(srcR2, srcR3, pp3);\
+    } break;\
+    case 15: { /* srcM1 is srcR2 itself; srcP0..srcP3 come from srcR2:srcR3 */\
+        vec_u8 srcR3 = vec_ld(30, s);\
+        srcM2 = vec_perm(srcR1, srcR2, pm2);\
+        srcM1 = srcR2;\
+        srcP0 = vec_perm(srcR2, srcR3, pp0);\
+        srcP1 = vec_perm(srcR2, srcR3, pp1);\
+        srcP2 = vec_perm(srcR2, srcR3, pp2);\
+        srcP3 = vec_perm(srcR2, srcR3, pp3);\
+    } break;\
+    }\
+ }
 #else
-#define ASSERT_ALIGNED(ptr) ;
-#endif
+#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){ /* little-endian build: ali and the permute arguments are unused */\
+    srcM2 =  vec_vsx_ld(-2, s); /* six VSX loads at byte offsets -2..+3 from s */\
+    srcM1 = vec_vsx_ld(-1, s);\
+    srcP0 = vec_vsx_ld(0, s);\
+    srcP1 = vec_vsx_ld(1, s);\
+    srcP2 = vec_vsx_ld(2, s);\
+    srcP3 = vec_vsx_ld(3, s);\
+ }
+#endif /* HAVE_BIGENDIAN */
 
 /* this code assume stride % 16 == 0 */
 #ifdef PREFIX_h264_qpel16_h_lowpass_altivec
@@ -35,12 +109,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst,
     register int i;
 
     LOAD_ZERO;
-    const vec_u8 permM2 = vec_lvsl(-2, src);
-    const vec_u8 permM1 = vec_lvsl(-1, src);
-    const vec_u8 permP0 = vec_lvsl(+0, src);
-    const vec_u8 permP1 = vec_lvsl(+1, src);
-    const vec_u8 permP2 = vec_lvsl(+2, src);
-    const vec_u8 permP3 = vec_lvsl(+3, src);
+    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
     const vec_s16 v5ss = vec_splat_s16(5);
     const vec_u16 v5us = vec_splat_u16(5);
     const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
@@ -59,79 +128,32 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst,
 
     vec_u8 sum, fsum;
 
+#if HAVE_BIGENDIAN
+    permM2 = vec_lvsl(-2, src);
+    permM1 = vec_lvsl(-1, src);
+    permP0 = vec_lvsl(+0, src);
+    permP1 = vec_lvsl(+1, src);
+    permP2 = vec_lvsl(+2, src);
+    permP3 = vec_lvsl(+3, src);
+#endif /* HAVE_BIGENDIAN */
+
     for (i = 0 ; i < 16 ; i ++) {
-        vec_u8 srcR1 = vec_ld(-2, src);
-        vec_u8 srcR2 = vec_ld(14, src);
-
-        switch (align) {
-        default: {
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = vec_perm(srcR1, srcR2, permP1);
-            srcP2 = vec_perm(srcR1, srcR2, permP2);
-            srcP3 = vec_perm(srcR1, srcR2, permP3);
-        } break;
-        case 11: {
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = vec_perm(srcR1, srcR2, permP1);
-            srcP2 = vec_perm(srcR1, srcR2, permP2);
-            srcP3 = srcR2;
-        } break;
-        case 12: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = vec_perm(srcR1, srcR2, permP1);
-            srcP2 = srcR2;
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        case 13: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = srcR2;
-            srcP2 = vec_perm(srcR2, srcR3, permP2);
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        case 14: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = srcR2;
-            srcP1 = vec_perm(srcR2, srcR3, permP1);
-            srcP2 = vec_perm(srcR2, srcR3, permP2);
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        case 15: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = srcR2;
-            srcP0 = vec_perm(srcR2, srcR3, permP0);
-            srcP1 = vec_perm(srcR2, srcR3, permP1);
-            srcP2 = vec_perm(srcR2, srcR3, permP2);
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        }
-
-        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
-        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
-        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
-        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
-
-        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
-        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
-        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
-        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
-
-        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
-        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
-        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
-        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
+        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);
+
+        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
+        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
+        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
+        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);
+
+        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
+        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
+        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
+        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);
+
+        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
+        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
+        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
+        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);
 
         sum1A = vec_adds(srcP0A, srcP1A);
         sum1B = vec_adds(srcP0B, srcP1B);
@@ -178,7 +200,10 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst,
     register int i;
 
     LOAD_ZERO;
-    const vec_u8 perm = vec_lvsl(0, src);
+    vec_u8 perm;
+#if HAVE_BIGENDIAN
+    perm = vec_lvsl(0, src);
+#endif
     const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
     const vec_u16 v5us = vec_splat_u16(5);
     const vec_s16 v5ss = vec_splat_s16(5);
@@ -186,52 +211,41 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst,
 
     const uint8_t *srcbis = src - (srcStride * 2);
 
-    const vec_u8 srcM2a = vec_ld(0, srcbis);
-    const vec_u8 srcM2b = vec_ld(16, srcbis);
-    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
-    //srcbis += srcStride;
-    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
-    const vec_u8 srcM1b = vec_ld(16, srcbis);
-    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
-    //srcbis += srcStride;
-    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
-    const vec_u8 srcP0b = vec_ld(16, srcbis);
-    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
-    //srcbis += srcStride;
-    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
-    const vec_u8 srcP1b = vec_ld(16, srcbis);
-    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
-    //srcbis += srcStride;
-    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
-    const vec_u8 srcP2b = vec_ld(16, srcbis);
-    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
-    //srcbis += srcStride;
-
-    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
-    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
-    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
-    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
-    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
-    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
-    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
-    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
-    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
-    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);
+    const vec_u8 srcM2 = load_with_perm_vec(0, srcbis, perm);
+    srcbis += srcStride;
+    const vec_u8 srcM1 = load_with_perm_vec(0, srcbis, perm);
+    srcbis += srcStride;
+    const vec_u8 srcP0 = load_with_perm_vec(0, srcbis, perm);
+    srcbis += srcStride;
+    const vec_u8 srcP1 = load_with_perm_vec(0, srcbis, perm);
+    srcbis += srcStride;
+    const vec_u8 srcP2 = load_with_perm_vec(0, srcbis, perm);
+    srcbis += srcStride;
+
+    vec_s16 srcM2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
+    vec_s16 srcM2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);
+    vec_s16 srcM1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
+    vec_s16 srcM1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
+    vec_s16 srcP0ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
+    vec_s16 srcP0ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
+    vec_s16 srcP1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
+    vec_s16 srcP1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);
+    vec_s16 srcP2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
+    vec_s16 srcP2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
 
     vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
               psumA, psumB, sumA, sumB,
               srcP3ssA, srcP3ssB,
               sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
 
-    vec_u8 sum, fsum, srcP3a, srcP3b, srcP3;
+    vec_u8 sum, fsum, srcP3;
 
     for (i = 0 ; i < 16 ; i++) {
-        srcP3a = vec_ld(0, srcbis += srcStride);
-        srcP3b = vec_ld(16, srcbis);
-        srcP3 = vec_perm(srcP3a, srcP3b, perm);
-        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
-        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
-        //srcbis += srcStride;
+        srcP3 = load_with_perm_vec(0, srcbis, perm);
+        srcbis += srcStride;
+
+        srcP3ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
+        srcP3ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);
 
         sum1A = vec_adds(srcP0ssA, srcP1ssA);
         sum1B = vec_adds(srcP0ssB, srcP1ssB);
@@ -288,12 +302,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
 {
     register int i;
     LOAD_ZERO;
-    const vec_u8 permM2 = vec_lvsl(-2, src);
-    const vec_u8 permM1 = vec_lvsl(-1, src);
-    const vec_u8 permP0 = vec_lvsl(+0, src);
-    const vec_u8 permP1 = vec_lvsl(+1, src);
-    const vec_u8 permP2 = vec_lvsl(+2, src);
-    const vec_u8 permP3 = vec_lvsl(+3, src);
+    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
     const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
     const vec_u32 v10ui = vec_splat_u32(10);
     const vec_s16 v5ss = vec_splat_s16(5);
@@ -325,81 +334,35 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
     vec_u8 fsum, sumv, sum;
     vec_s16 ssume, ssumo;
 
+#if HAVE_BIGENDIAN
+    permM2 = vec_lvsl(-2, src);
+    permM1 = vec_lvsl(-1, src);
+    permP0 = vec_lvsl(+0, src);
+    permP1 = vec_lvsl(+1, src);
+    permP2 = vec_lvsl(+2, src);
+    permP3 = vec_lvsl(+3, src);
+#endif /* HAVE_BIGENDIAN */
+
     src -= (2 * srcStride);
     for (i = 0 ; i < 21 ; i ++) {
         vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
-        vec_u8 srcR1 = vec_ld(-2, src);
-        vec_u8 srcR2 = vec_ld(14, src);
-
-        switch (align) {
-        default: {
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = vec_perm(srcR1, srcR2, permP1);
-            srcP2 = vec_perm(srcR1, srcR2, permP2);
-            srcP3 = vec_perm(srcR1, srcR2, permP3);
-        } break;
-        case 11: {
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = vec_perm(srcR1, srcR2, permP1);
-            srcP2 = vec_perm(srcR1, srcR2, permP2);
-            srcP3 = srcR2;
-        } break;
-        case 12: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = vec_perm(srcR1, srcR2, permP1);
-            srcP2 = srcR2;
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        case 13: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = vec_perm(srcR1, srcR2, permP0);
-            srcP1 = srcR2;
-            srcP2 = vec_perm(srcR2, srcR3, permP2);
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        case 14: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = vec_perm(srcR1, srcR2, permM1);
-            srcP0 = srcR2;
-            srcP1 = vec_perm(srcR2, srcR3, permP1);
-            srcP2 = vec_perm(srcR2, srcR3, permP2);
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        case 15: {
-            vec_u8 srcR3 = vec_ld(30, src);
-            srcM2 = vec_perm(srcR1, srcR2, permM2);
-            srcM1 = srcR2;
-            srcP0 = vec_perm(srcR2, srcR3, permP0);
-            srcP1 = vec_perm(srcR2, srcR3, permP1);
-            srcP2 = vec_perm(srcR2, srcR3, permP2);
-            srcP3 = vec_perm(srcR2, srcR3, permP3);
-        } break;
-        }
-
-        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
-        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
-        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
-        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
-
-        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
-        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
-        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
-        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
-
-        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
-        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
-        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
-        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
+
+        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);
+
+        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
+        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
+        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
+        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);
+
+        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
+        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
+        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
+        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);
+
+        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
+        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
+        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
+        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);
 
         sum1A = vec_adds(srcP0A, srcP1A);
         sum1B = vec_adds(srcP0B, srcP1B);
@@ -448,8 +411,8 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
         const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
         const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
         const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
-        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
-        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
+        vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
+        vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
 
         tmpbis += tmpStride;
 
@@ -474,10 +437,14 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
         pp2Be = vec_mule(sum2B, v5ss);
         pp2Bo = vec_mulo(sum2B, v5ss);
 
-        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
         pp3Ao = vec_mulo(sum3A, v1ss);
-        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
         pp3Bo = vec_mulo(sum3B, v1ss);
+#if !HAVE_BIGENDIAN
+        sum3A = (vec_s16)vec_perm(sum3A, sum3A,vcswapi2s(0,1,2,3));
+        sum3B = (vec_s16)vec_perm(sum3B, sum3B,vcswapi2s(0,1,2,3));
+#endif
+        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
+        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
 
         pp1cAe = vec_add(pp1Ae, v512si);
         pp1cAo = vec_add(pp1Ao, v512si);
diff --git a/libavcodec/ppc/hpeldsp_altivec.c b/libavcodec/ppc/hpeldsp_altivec.c
index 780240b..87a1f05 100644
--- a/libavcodec/ppc/hpeldsp_altivec.c
+++ b/libavcodec/ppc/hpeldsp_altivec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2002 Dieter Shirley
  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,16 +34,15 @@
 #include "libavcodec/hpeldsp.h"
 #include "hpeldsp_altivec.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 /* next one assumes that ((line_size % 16) == 0) */
 void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 {
-    register vector unsigned char pixelsv1, pixelsv2;
-    register vector unsigned char pixelsv1B, pixelsv2B;
-    register vector unsigned char pixelsv1C, pixelsv2C;
-    register vector unsigned char pixelsv1D, pixelsv2D;
+    register vector unsigned char pixelsv1;
+    register vector unsigned char pixelsv1B;
+    register vector unsigned char pixelsv1C;
+    register vector unsigned char pixelsv1D;
 
-    register vector unsigned char perm = vec_lvsl(0, pixels);
     int i;
     register ptrdiff_t line_size_2 = line_size << 1;
     register ptrdiff_t line_size_3 = line_size + line_size_2;
@@ -55,22 +54,14 @@ void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t li
 // -funroll-loops w/ this is bad - 74 cycles again.
 // all this is on a 7450, tuning for the 7450
     for (i = 0; i < h; i += 4) {
-        pixelsv1  = vec_ld( 0, pixels);
-        pixelsv2  = vec_ld(15, pixels);
-        pixelsv1B = vec_ld(line_size, pixels);
-        pixelsv2B = vec_ld(15 + line_size, pixels);
-        pixelsv1C = vec_ld(line_size_2, pixels);
-        pixelsv2C = vec_ld(15 + line_size_2, pixels);
-        pixelsv1D = vec_ld(line_size_3, pixels);
-        pixelsv2D = vec_ld(15 + line_size_3, pixels);
-        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
-               0, (unsigned char*)block);
-        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
-               line_size, (unsigned char*)block);
-        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
-               line_size_2, (unsigned char*)block);
-        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
-               line_size_3, (unsigned char*)block);
+        pixelsv1  = unaligned_load( 0, pixels);
+        pixelsv1B = unaligned_load(line_size, pixels);
+        pixelsv1C = unaligned_load(line_size_2, pixels);
+        pixelsv1D = unaligned_load(line_size_3, pixels);
+        VEC_ST(pixelsv1, 0, (unsigned char*)block);
+        VEC_ST(pixelsv1B, line_size, (unsigned char*)block);
+        VEC_ST(pixelsv1C, line_size_2, (unsigned char*)block);
+        VEC_ST(pixelsv1D, line_size_3, (unsigned char*)block);
         pixels+=line_size_4;
         block +=line_size_4;
     }
@@ -80,15 +71,12 @@ void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t li
 #define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
 void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 {
-    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
-    register vector unsigned char perm = vec_lvsl(0, pixels);
-    int i;
+    register vector unsigned char pixelsv, blockv;
 
+    int i;
     for (i = 0; i < h; i++) {
-        pixelsv1 = vec_ld( 0, pixels);
-        pixelsv2 = vec_ld(16,pixels);
         blockv = vec_ld(0, block);
-        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
+        pixelsv = VEC_LD( 0, pixels);
         blockv = vec_avg(blockv,pixelsv);
         vec_st(blockv, 0, (unsigned char*)block);
         pixels+=line_size;
@@ -108,9 +96,7 @@ static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff
        int rightside = ((unsigned long)block & 0x0000000F);
 
        blockv = vec_ld(0, block);
-       pixelsv1 = vec_ld( 0, pixels);
-       pixelsv2 = vec_ld(16, pixels);
-       pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
+       pixelsv = VEC_LD( 0, pixels);
 
        if (rightside) {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
@@ -132,21 +118,16 @@ static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
 {
     register int i;
     register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
-    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned char blockv;
     register vector unsigned short pixelssum1, pixelssum2, temp3;
     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
 
-    temp1 = vec_ld(0, pixels);
-    temp2 = vec_ld(16, pixels);
-    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
-        pixelsv2 = temp2;
-    } else {
-        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
-    }
-    pixelsv1 = vec_mergeh(vczero, pixelsv1);
-    pixelsv2 = vec_mergeh(vczero, pixelsv2);
+    pixelsv1 = VEC_LD(0, pixels);
+    pixelsv2 = VEC_LD(1, pixels);
+    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
+
     pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                          (vector unsigned short)pixelsv2);
     pixelssum1 = vec_add(pixelssum1, vctwo);
@@ -155,17 +136,10 @@ static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
         int rightside = ((unsigned long)block & 0x0000000F);
         blockv = vec_ld(0, block);
 
-        temp1 = vec_ld(line_size, pixels);
-        temp2 = vec_ld(line_size + 16, pixels);
-        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
-            pixelsv2 = temp2;
-        } else {
-            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
-        }
-
-        pixelsv1 = vec_mergeh(vczero, pixelsv1);
-        pixelsv2 = vec_mergeh(vczero, pixelsv2);
+        pixelsv1 = unaligned_load(line_size, pixels);
+        pixelsv2 = unaligned_load(line_size+1, pixels);
+        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                              (vector unsigned short)pixelsv2);
         temp3 = vec_add(pixelssum1, pixelssum2);
@@ -191,22 +165,16 @@ static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels
 {
     register int i;
     register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
-    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned char blockv;
     register vector unsigned short pixelssum1, pixelssum2, temp3;
     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
     register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
 
-    temp1 = vec_ld(0, pixels);
-    temp2 = vec_ld(16, pixels);
-    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
-        pixelsv2 = temp2;
-    } else {
-        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
-    }
-    pixelsv1 = vec_mergeh(vczero, pixelsv1);
-    pixelsv2 = vec_mergeh(vczero, pixelsv2);
+    pixelsv1 = VEC_LD(0, pixels);
+    pixelsv2 = VEC_LD(1, pixels);
+    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
     pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                          (vector unsigned short)pixelsv2);
     pixelssum1 = vec_add(pixelssum1, vcone);
@@ -215,17 +183,10 @@ static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels
         int rightside = ((unsigned long)block & 0x0000000F);
         blockv = vec_ld(0, block);
 
-        temp1 = vec_ld(line_size, pixels);
-        temp2 = vec_ld(line_size + 16, pixels);
-        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
-            pixelsv2 = temp2;
-        } else {
-            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
-        }
-
-        pixelsv1 = vec_mergeh(vczero, pixelsv1);
-        pixelsv2 = vec_mergeh(vczero, pixelsv2);
+        pixelsv1 = unaligned_load(line_size, pixels);
+        pixelsv2 = unaligned_load(line_size+1, pixels);
+        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                              (vector unsigned short)pixelsv2);
         temp3 = vec_add(pixelssum1, pixelssum2);
@@ -251,24 +212,18 @@ static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, pt
 {
     register int i;
     register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
-    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned char blockv;
     register vector unsigned short temp3, temp4,
         pixelssum1, pixelssum2, pixelssum3, pixelssum4;
     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
 
-    temp1 = vec_ld(0, pixels);
-    temp2 = vec_ld(16, pixels);
-    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
-        pixelsv2 = temp2;
-    } else {
-        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
-    }
-    pixelsv3 = vec_mergel(vczero, pixelsv1);
-    pixelsv4 = vec_mergel(vczero, pixelsv2);
-    pixelsv1 = vec_mergeh(vczero, pixelsv1);
-    pixelsv2 = vec_mergeh(vczero, pixelsv2);
+    pixelsv1 = VEC_LD(0, pixels);
+    pixelsv2 = VEC_LD(1, pixels);
+    pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
+    pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
+    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
     pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                          (vector unsigned short)pixelsv4);
     pixelssum3 = vec_add(pixelssum3, vctwo);
@@ -279,20 +234,13 @@ static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, pt
     for (i = 0; i < h ; i++) {
         blockv = vec_ld(0, block);
 
-        temp1 = vec_ld(line_size, pixels);
-        temp2 = vec_ld(line_size + 16, pixels);
-        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
-            pixelsv2 = temp2;
-        } else {
-            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
-        }
-
-        pixelsv3 = vec_mergel(vczero, pixelsv1);
-        pixelsv4 = vec_mergel(vczero, pixelsv2);
-        pixelsv1 = vec_mergeh(vczero, pixelsv1);
-        pixelsv2 = vec_mergeh(vczero, pixelsv2);
+        pixelsv1 = unaligned_load(line_size, pixels);
+        pixelsv2 = unaligned_load(line_size+1, pixels);
 
+        pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
+        pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
+        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
         pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                              (vector unsigned short)pixelsv4);
         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
@@ -319,25 +267,19 @@ static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pix
 {
     register int i;
     register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
-    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned char blockv;
     register vector unsigned short temp3, temp4,
         pixelssum1, pixelssum2, pixelssum3, pixelssum4;
     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
     register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
 
-    temp1 = vec_ld(0, pixels);
-    temp2 = vec_ld(16, pixels);
-    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
-        pixelsv2 = temp2;
-    } else {
-        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
-    }
-    pixelsv3 = vec_mergel(vczero, pixelsv1);
-    pixelsv4 = vec_mergel(vczero, pixelsv2);
-    pixelsv1 = vec_mergeh(vczero, pixelsv1);
-    pixelsv2 = vec_mergeh(vczero, pixelsv2);
+    pixelsv1 = VEC_LD(0, pixels);
+    pixelsv2 = VEC_LD(1, pixels);
+    pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
+    pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
+    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
     pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                          (vector unsigned short)pixelsv4);
     pixelssum3 = vec_add(pixelssum3, vcone);
@@ -346,22 +288,13 @@ static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pix
     pixelssum1 = vec_add(pixelssum1, vcone);
 
     for (i = 0; i < h ; i++) {
-        blockv = vec_ld(0, block);
-
-        temp1 = vec_ld(line_size, pixels);
-        temp2 = vec_ld(line_size + 16, pixels);
-        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
-            pixelsv2 = temp2;
-        } else {
-            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
-        }
-
-        pixelsv3 = vec_mergel(vczero, pixelsv1);
-        pixelsv4 = vec_mergel(vczero, pixelsv2);
-        pixelsv1 = vec_mergeh(vczero, pixelsv1);
-        pixelsv2 = vec_mergeh(vczero, pixelsv2);
+        pixelsv1 = unaligned_load(line_size, pixels);
+        pixelsv2 = unaligned_load(line_size+1, pixels);
 
+        pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
+        pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
+        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
         pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                              (vector unsigned short)pixelsv4);
         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
@@ -376,7 +309,7 @@ static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pix
 
         blockv = vec_packsu(temp3, temp4);
 
-        vec_st(blockv, 0, block);
+        VEC_ST(blockv, 0, block);
 
         block += line_size;
         pixels += line_size;
@@ -388,7 +321,7 @@ static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
 {
     register int i;
     register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
-    register vector unsigned char blockv, temp1, temp2, blocktemp;
+    register vector unsigned char blockv, blocktemp;
     register vector unsigned short pixelssum1, pixelssum2, temp3;
 
     register const vector unsigned char vczero = (const vector unsigned char)
@@ -396,16 +329,10 @@ static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
     register const vector unsigned short vctwo = (const vector unsigned short)
                                         vec_splat_u16(2);
 
-    temp1 = vec_ld(0, pixels);
-    temp2 = vec_ld(16, pixels);
-    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
-    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
-        pixelsv2 = temp2;
-    } else {
-        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
-    }
-    pixelsv1 = vec_mergeh(vczero, pixelsv1);
-    pixelsv2 = vec_mergeh(vczero, pixelsv2);
+    pixelsv1 = VEC_LD(0, pixels);
+    pixelsv2 = VEC_LD(1, pixels);
+    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
     pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                          (vector unsigned short)pixelsv2);
     pixelssum1 = vec_add(pixelssum1, vctwo);
@@ -414,17 +341,11 @@ static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
         int rightside = ((unsigned long)block & 0x0000000F);
         blockv = vec_ld(0, block);
 
-        temp1 = vec_ld(line_size, pixels);
-        temp2 = vec_ld(line_size + 16, pixels);
-        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
-        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
-            pixelsv2 = temp2;
-        } else {
-            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
-        }
+        pixelsv1 = unaligned_load(line_size, pixels);
+        pixelsv2 = unaligned_load(line_size+1, pixels);
 
-        pixelsv1 = vec_mergeh(vczero, pixelsv1);
-        pixelsv2 = vec_mergeh(vczero, pixelsv2);
+        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
+        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                              (vector unsigned short)pixelsv2);
         temp3 = vec_add(pixelssum1, pixelssum2);
@@ -449,7 +370,7 @@ static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
 
 av_cold void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/hpeldsp_altivec.h b/libavcodec/ppc/hpeldsp_altivec.h
index 98dd80e..590809f 100644
--- a/libavcodec/ppc/hpeldsp_altivec.h
+++ b/libavcodec/ppc/hpeldsp_altivec.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2002 Dieter Shirley
  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ppc/huffyuvdsp_altivec.c b/libavcodec/ppc/huffyuvdsp_altivec.c
index 4c16283..6701524 100644
--- a/libavcodec/ppc/huffyuvdsp_altivec.c
+++ b/libavcodec/ppc/huffyuvdsp_altivec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2002 Dieter Shirley
  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,8 +32,8 @@
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/huffyuvdsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
-static void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w)
+#if HAVE_ALTIVEC
+static void add_bytes_altivec(uint8_t *dst, uint8_t *src, intptr_t w)
 {
     register int i;
     register vector unsigned char vdst, vsrc;
@@ -53,7 +53,7 @@ static void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w)
 
 av_cold void ff_huffyuvdsp_init_ppc(HuffYUVDSPContext *c)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/idctdsp.c b/libavcodec/ppc/idctdsp.c
index c85b58e..80e71fd 100644
--- a/libavcodec/ppc/idctdsp.c
+++ b/libavcodec/ppc/idctdsp.c
@@ -1,28 +1,28 @@
 /*
  * Copyright (c) 2001 Michel Lespinasse
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /* NOTE: This code is based on GPL code from the libmpeg2 project.  The
  * author, Michel Lespinasses, has given explicit permission to release
- * under LGPL as part of Libav.
+ * under LGPL as part of FFmpeg.
  *
- * Libav integration by Dieter Shirley
+ * FFmpeg integration by Dieter Shirley
  *
  * This file is a direct copy of the AltiVec IDCT module from the libmpeg2
  * project.  I've deleted all of the libmpeg2-specific code, renamed the
@@ -43,7 +43,7 @@
 #include "libavutil/ppc/types_altivec.h"
 #include "libavcodec/idctdsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 #define IDCT_HALF                                       \
     /* 1st stage */                                     \
@@ -153,6 +153,22 @@ static const vec_s16 constants[5] = {
     { 19266, 26722, 25172, 22654,  19266,  22654, 25172, 26722 }
 };
 
+static void idct_altivec(int16_t *blk) /* IDCT variant that stores its eight result vectors back into blk */
+{
+    vec_s16 *block = (vec_s16 *) blk; /* view blk as vec_s16 rows; block[0..7] are written below */
+
+    IDCT; /* macro defined outside this hunk; presumably reads block[] and leaves results in vx0..vx7 -- confirm */
+
+    block[0] = vx0; /* overwrite the coefficient block with the transformed vectors */
+    block[1] = vx1;
+    block[2] = vx2;
+    block[3] = vx3;
+    block[4] = vx4;
+    block[5] = vx5;
+    block[6] = vx6;
+    block[7] = vx7;
+}
+
 static void idct_put_altivec(uint8_t *dest, int stride, int16_t *blk)
 {
     vec_s16 *block = (vec_s16 *) blk;
@@ -193,16 +209,26 @@ static void idct_add_altivec(uint8_t *dest, int stride, int16_t *blk)
 
     IDCT;
 
+#if HAVE_BIGENDIAN
     p0    = vec_lvsl(0, dest);
     p1    = vec_lvsl(stride, dest);
     p     = vec_splat_u8(-1);
     perm0 = vec_mergeh(p, p0);
     perm1 = vec_mergeh(p, p1);
+#endif
 
-#define ADD(dest, src, perm)                                \
-    /* *(uint64_t *) &tmp = *(uint64_t *) dest; */          \
+#if HAVE_BIGENDIAN
+#define GET_TMP2(dest, prm)                                 \
     tmp  = vec_ld(0, dest);                                 \
-    tmp2 = (vec_s16) vec_perm(tmp, (vec_u8) zero, perm);    \
+    tmp2 = (vec_s16) vec_perm(tmp, (vec_u8) zero, prm);
+#else
+#define GET_TMP2(dest, prm)                                 \
+    tmp  = vec_vsx_ld(0, dest);                             \
+    tmp2 = (vec_s16) vec_mergeh(tmp, (vec_u8) zero)
+#endif
+
+#define ADD(dest, src, perm)                                \
+    GET_TMP2(dest, perm);                                   \
     tmp3 = vec_adds(tmp2, src);                             \
     tmp  = vec_packsu(tmp3, tmp3);                          \
     vec_ste((vec_u32) tmp, 0, (unsigned int *) dest);       \
@@ -230,13 +256,14 @@ static void idct_add_altivec(uint8_t *dest, int stride, int16_t *blk)
 av_cold void ff_idctdsp_init_ppc(IDCTDSPContext *c, AVCodecContext *avctx,
                                  unsigned high_bit_depth)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
-    if (!high_bit_depth) {
-        if ((avctx->idct_algo == FF_IDCT_AUTO) ||
+    if (!high_bit_depth && avctx->lowres == 0) {
+        if ((avctx->idct_algo == FF_IDCT_AUTO && !(avctx->flags & AV_CODEC_FLAG_BITEXACT)) ||
             (avctx->idct_algo == FF_IDCT_ALTIVEC)) {
+            c->idct      = idct_altivec;
             c->idct_add  = idct_add_altivec;
             c->idct_put  = idct_put_altivec;
             c->perm_type = FF_IDCT_PERM_TRANSPOSE;
diff --git a/libavcodec/ppc/apedsp_altivec.c b/libavcodec/ppc/lossless_audiodsp_altivec.c
index 3b9d045..bdec252 100644
--- a/libavcodec/ppc/apedsp_altivec.c
+++ b/libavcodec/ppc/lossless_audiodsp_altivec.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,9 +27,23 @@
 #include "libavutil/cpu.h"
 #include "libavutil/ppc/cpu.h"
 #include "libavutil/ppc/types_altivec.h"
-#include "libavcodec/apedsp.h"
+#include "libavcodec/lossless_audiodsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_BIGENDIAN /* GET_T: fetch two vectors from src into tt0/tt1; uses `align` from the caller's scope, and b must hold the previous vec_ld result on entry */
+#define GET_T(tt0,tt1,src,a,b){       \
+        a = vec_ld(16, src);          \
+        tt0 = vec_perm(b, a, align);  \
+        b = vec_ld(32, src);          \
+        tt1 = vec_perm(a, b, align);  \
+ }
+#else /* !HAVE_BIGENDIAN: a and b are unused; two vec_vsx_ld loads at offsets 0 and 16, no vec_perm step */
+#define GET_T(tt0,tt1,src,a,b){       \
+        tt0 = vec_vsx_ld(0, src);     \
+        tt1 = vec_vsx_ld(16, src);    \
+ }
+#endif /* HAVE_BIGENDIAN */
+
+#if HAVE_ALTIVEC
 static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
                                                     const int16_t *v2,
                                                     const int16_t *v3,
@@ -38,26 +52,23 @@ static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
     LOAD_ZERO;
     vec_s16 *pv1 = (vec_s16 *) v1;
     register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
-    register vec_s16 t0, t1, i0, i1, i4;
-    register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3);
+    register vec_s16 t0, t1, i0, i1, i4, i2, i3;
     register vec_s32 res = zero_s32v;
+#if HAVE_BIGENDIAN
     register vec_u8 align = vec_lvsl(0, v2);
+    i2 = vec_ld(0, v2);
+    i3 = vec_ld(0, v3);
+#endif
     int32_t ires;
 
     order >>= 4;
     do {
-        i1     = vec_ld(16, v2);
-        t0     = vec_perm(i2, i1, align);
-        i2     = vec_ld(32, v2);
-        t1     = vec_perm(i1, i2, align);
+        GET_T(t0,t1,v2,i1,i2);
         i0     = pv1[0];
         i1     = pv1[1];
         res    = vec_msum(t0, i0, res);
         res    = vec_msum(t1, i1, res);
-        i4     = vec_ld(16, v3);
-        t0     = vec_perm(i3, i4, align);
-        i3     = vec_ld(32, v3);
-        t1     = vec_perm(i4, i3, align);
+        GET_T(t0,t1,v3,i4,i3);
         pv1[0] = vec_mladd(t0, muls, i0);
         pv1[1] = vec_mladd(t1, muls, i1);
         pv1   += 2;
@@ -71,9 +82,9 @@ static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
 }
 #endif /* HAVE_ALTIVEC */
 
-av_cold void ff_apedsp_init_ppc(APEDSPContext *c)
+av_cold void ff_llauddsp_init_ppc(LLAudDSPContext *c)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/mathops.h b/libavcodec/ppc/mathops.h
index 34ddb11..dbd714f 100644
--- a/libavcodec/ppc/mathops.h
+++ b/libavcodec/ppc/mathops.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2001, 2002 Fabrice Bellard
  * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ppc/mdct_init.c b/libavcodec/ppc/mdct_init.c
deleted file mode 100644
index d3582bc..0000000
--- a/libavcodec/ppc/mdct_init.c
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * FFT/IFFT transforms
- * AltiVec-enabled
- * Copyright (c) 2009 Loren Merritt
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/cpu.h"
-#include "libavutil/ppc/cpu.h"
-#include "libavutil/ppc/types_altivec.h"
-#include "libavutil/ppc/util_altivec.h"
-#include "libavcodec/fft.h"
-
-/**
- * Do a complex FFT with the parameters defined in ff_fft_init().
- * The input data must be permuted before with s->revtab table.
- * No 1.0 / sqrt(n) normalization is done.
- * AltiVec-enabled:
- * This code assumes that the 'z' pointer is 16 bytes-aligned.
- * It also assumes all FFTComplex are 8 bytes-aligned pairs of floats.
- */
-
-void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
-
-#if HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN
-static void imdct_half_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
-{
-    int j, k;
-    int n = 1 << s->mdct_bits;
-    int n4 = n >> 2;
-    int n8 = n >> 3;
-    int n32 = n >> 5;
-    const uint16_t *revtabj = s->revtab;
-    const uint16_t *revtabk = s->revtab+n4;
-    const vec_f *tcos = (const vec_f*)(s->tcos+n8);
-    const vec_f *tsin = (const vec_f*)(s->tsin+n8);
-    const vec_f *pin = (const vec_f*)(input+n4);
-    vec_f *pout = (vec_f*)(output+n4);
-
-    /* pre rotation */
-    k = n32-1;
-    do {
-        vec_f cos,sin,cos0,sin0,cos1,sin1,re,im,r0,i0,r1,i1,a,b,c,d;
-#define CMULA(p,o0,o1,o2,o3)\
-        a = pin[ k*2+p];                       /* { z[k].re,    z[k].im,    z[k+1].re,  z[k+1].im  } */\
-        b = pin[-k*2-p-1];                     /* { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } */\
-        re = vec_perm(a, b, vcprm(0,2,s0,s2)); /* { z[k].re,    z[k+1].re,  z[-k-2].re, z[-k-1].re } */\
-        im = vec_perm(a, b, vcprm(s3,s1,3,1)); /* { z[-k-1].im, z[-k-2].im, z[k+1].im,  z[k].im    } */\
-        cos = vec_perm(cos0, cos1, vcprm(o0,o1,s##o2,s##o3)); /* { cos[k], cos[k+1], cos[-k-2], cos[-k-1] } */\
-        sin = vec_perm(sin0, sin1, vcprm(o0,o1,s##o2,s##o3));\
-        r##p = im*cos - re*sin;\
-        i##p = re*cos + im*sin;
-#define STORE2(v,dst)\
-        j = dst;\
-        vec_ste(v, 0, output+j*2);\
-        vec_ste(v, 4, output+j*2);
-#define STORE8(p)\
-        a = vec_perm(r##p, i##p, vcprm(0,s0,0,s0));\
-        b = vec_perm(r##p, i##p, vcprm(1,s1,1,s1));\
-        c = vec_perm(r##p, i##p, vcprm(2,s2,2,s2));\
-        d = vec_perm(r##p, i##p, vcprm(3,s3,3,s3));\
-        STORE2(a, revtabk[ p*2-4]);\
-        STORE2(b, revtabk[ p*2-3]);\
-        STORE2(c, revtabj[-p*2+2]);\
-        STORE2(d, revtabj[-p*2+3]);
-
-        cos0 = tcos[k];
-        sin0 = tsin[k];
-        cos1 = tcos[-k-1];
-        sin1 = tsin[-k-1];
-        CMULA(0, 0,1,2,3);
-        CMULA(1, 2,3,0,1);
-        STORE8(0);
-        STORE8(1);
-        revtabj += 4;
-        revtabk -= 4;
-        k--;
-    } while(k >= 0);
-
-    ff_fft_calc_altivec(s, (FFTComplex*)output);
-
-    /* post rotation + reordering */
-    j = -n32;
-    k = n32-1;
-    do {
-        vec_f cos,sin,re,im,a,b,c,d;
-#define CMULB(d0,d1,o)\
-        re = pout[o*2];\
-        im = pout[o*2+1];\
-        cos = tcos[o];\
-        sin = tsin[o];\
-        d0 = im*sin - re*cos;\
-        d1 = re*sin + im*cos;
-
-        CMULB(a,b,j);
-        CMULB(c,d,k);
-        pout[2*j]   = vec_perm(a, d, vcprm(0,s3,1,s2));
-        pout[2*j+1] = vec_perm(a, d, vcprm(2,s1,3,s0));
-        pout[2*k]   = vec_perm(c, b, vcprm(0,s3,1,s2));
-        pout[2*k+1] = vec_perm(c, b, vcprm(2,s1,3,s0));
-        j++;
-        k--;
-    } while(k >= 0);
-}
-
-static void imdct_calc_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
-{
-    int k;
-    int n = 1 << s->mdct_bits;
-    int n4 = n >> 2;
-    int n16 = n >> 4;
-    vec_u32 sign = {1U<<31,1U<<31,1U<<31,1U<<31};
-    vec_u32 *p0 = (vec_u32*)(output+n4);
-    vec_u32 *p1 = (vec_u32*)(output+n4*3);
-
-    imdct_half_altivec(s, output + n4, input);
-
-    for (k = 0; k < n16; k++) {
-        vec_u32 a = p0[k] ^ sign;
-        vec_u32 b = p1[-k-1];
-        p0[-k-1] = vec_perm(a, a, vcprm(3,2,1,0));
-        p1[k]    = vec_perm(b, b, vcprm(3,2,1,0));
-    }
-}
-#endif /* HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN */
-
-av_cold void ff_mdct_init_ppc(FFTContext *s)
-{
-#if HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN
-    if (!PPC_ALTIVEC(av_get_cpu_flags()))
-        return;
-
-    if (s->mdct_bits >= 5) {
-        s->imdct_calc = imdct_calc_altivec;
-        s->imdct_half = imdct_half_altivec;
-    }
-#endif /* HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN */
-}
diff --git a/libavcodec/ppc/me_cmp.c b/libavcodec/ppc/me_cmp.c
index b074d28..9f75ed2 100644
--- a/libavcodec/ppc/me_cmp.c
+++ b/libavcodec/ppc/me_cmp.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2002 Dieter Shirley
  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,27 +34,44 @@
 #include "libavcodec/mpegvideo.h"
 #include "libavcodec/me_cmp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
+
+#if HAVE_BIGENDIAN /* AltiVec only: aligned loads + permute realignment */
+#define GET_PERM(per1, per2, pix) do {\
+    per1 = vec_lvsl(0, pix);               /* selects pix[0]..pix[15] */\
+    per2 = vec_add(per1, vec_splat_u8(1)); /* selects pix[1]..pix[16] */\
+} while (0)
+#define LOAD_PIX(v, iv, pix, per1, per2) do {\
+    vector unsigned char pix2l  = vec_ld(0,  pix);\
+    vector unsigned char pix2r  = vec_ld(16, pix);\
+    v  = vec_perm(pix2l, pix2r, per1);\
+    iv = vec_perm(pix2l, pix2r, per2);\
+} while (0)
+#else /* little-endian: per1/per2 are accepted but never computed or read */
+#define GET_PERM(per1, per2, pix) do {} while (0)
+#define LOAD_PIX(v, iv, pix, per1, per2) do {\
+    v  = vec_vsx_ld(0,  pix);              /* pix[0]..pix[15] */\
+    iv = vec_vsx_ld(1,  pix);              /* pix[1]..pix[16] */\
+} while (0)
+#endif
 static int sad16_x2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                             ptrdiff_t stride, int h)
 {
-    int i, s = 0;
+    int i;
+    int __attribute__((aligned(16))) s = 0;
     const vector unsigned char zero =
         (const vector unsigned char) vec_splat_u8(0);
-    vector unsigned char perm1 = vec_lvsl(0, pix2);
-    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
     vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumdiffs;
+    vector unsigned char perm1, perm2, pix2v, pix2iv;
 
+    GET_PERM(perm1, perm2, pix2);
     for (i = 0; i < h; i++) {
         /* Read unaligned pixels into our vectors. The vectors are as follows:
          * pix1v: pix1[0] - pix1[15]
          * pix2v: pix2[0] - pix2[15]      pix2iv: pix2[1] - pix2[16] */
         vector unsigned char pix1v  = vec_ld(0,  pix1);
-        vector unsigned char pix2l  = vec_ld(0,  pix2);
-        vector unsigned char pix2r  = vec_ld(16, pix2);
-        vector unsigned char pix2v  = vec_perm(pix2l, pix2r, perm1);
-        vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2);
+        LOAD_PIX(pix2v, pix2iv, pix2, perm1, perm2);
 
         /* Calculate the average vector. */
         vector unsigned char avgv = vec_avg(pix2v, pix2iv);
@@ -80,13 +97,14 @@ static int sad16_x2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 static int sad16_y2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                             ptrdiff_t stride, int h)
 {
-    int i, s = 0;
+    int i;
+    int  __attribute__((aligned(16))) s = 0;
     const vector unsigned char zero =
         (const vector unsigned char) vec_splat_u8(0);
-    vector unsigned char perm = vec_lvsl(0, pix2);
     vector unsigned char pix1v, pix3v, avgv, t5;
     vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumdiffs;
+
     uint8_t *pix3 = pix2 + stride;
 
     /* Due to the fact that pix3 = pix2 + stride, the pix3 of one
@@ -96,19 +114,14 @@ static int sad16_y2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
      * Read unaligned pixels into our vectors. The vectors are as follows:
      * pix2v: pix2[0] - pix2[15]
      * Split the pixel vectors into shorts. */
-    vector unsigned char pix2l = vec_ld(0,  pix2);
-    vector unsigned char pix2r = vec_ld(15, pix2);
-    vector unsigned char pix2v = vec_perm(pix2l, pix2r, perm);
+    vector unsigned char pix2v = VEC_LD(0, pix2);
 
     for (i = 0; i < h; i++) {
         /* Read unaligned pixels into our vectors. The vectors are as follows:
          * pix1v: pix1[0] - pix1[15]
          * pix3v: pix3[0] - pix3[15] */
         pix1v = vec_ld(0,  pix1);
-
-        pix2l = vec_ld(0,  pix3);
-        pix2r = vec_ld(15, pix3);
-        pix3v = vec_perm(pix2l, pix2r, perm);
+        pix3v = VEC_LD(0,  pix3);
 
         /* Calculate the average vector. */
         avgv = vec_avg(pix2v, pix3v);
@@ -134,20 +147,21 @@ static int sad16_y2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                              ptrdiff_t stride, int h)
 {
-    int i, s = 0;
+    int i;
+    int  __attribute__((aligned(16))) s = 0;
     uint8_t *pix3 = pix2 + stride;
     const vector unsigned char zero =
         (const vector unsigned char) vec_splat_u8(0);
     const vector unsigned short two =
         (const vector unsigned short) vec_splat_u16(2);
     vector unsigned char avgv, t5;
-    vector unsigned char perm1 = vec_lvsl(0, pix2);
-    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
     vector unsigned char pix1v, pix3v, pix3iv;
     vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
     vector unsigned short avghv, avglv;
     vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumdiffs;
+    vector unsigned char perm1, perm2, pix2v, pix2iv;
+    GET_PERM(perm1, perm2, pix2);
 
     /* Due to the fact that pix3 = pix2 + stride, the pix3 of one
      * iteration becomes pix2 in the next iteration. We can use this
@@ -156,19 +170,16 @@ static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
      * Read unaligned pixels into our vectors. The vectors are as follows:
      * pix2v: pix2[0] - pix2[15]  pix2iv: pix2[1] - pix2[16]
      * Split the pixel vectors into shorts. */
-    vector unsigned char pix2l  = vec_ld(0,  pix2);
-    vector unsigned char pix2r  = vec_ld(16, pix2);
-    vector unsigned char pix2v  = vec_perm(pix2l, pix2r, perm1);
-    vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2);
-
+    LOAD_PIX(pix2v, pix2iv, pix2, perm1, perm2);
     vector unsigned short pix2hv  =
-        (vector unsigned short) vec_mergeh(zero, pix2v);
+        (vector unsigned short) VEC_MERGEH(zero, pix2v);
     vector unsigned short pix2lv  =
-        (vector unsigned short) vec_mergel(zero, pix2v);
+        (vector unsigned short) VEC_MERGEL(zero, pix2v);
     vector unsigned short pix2ihv =
-        (vector unsigned short) vec_mergeh(zero, pix2iv);
+        (vector unsigned short) VEC_MERGEH(zero, pix2iv);
     vector unsigned short pix2ilv =
-        (vector unsigned short) vec_mergel(zero, pix2iv);
+        (vector unsigned short) VEC_MERGEL(zero, pix2iv);
+
     vector unsigned short t1 = vec_add(pix2hv, pix2ihv);
     vector unsigned short t2 = vec_add(pix2lv, pix2ilv);
     vector unsigned short t3, t4;
@@ -178,11 +189,7 @@ static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
          * pix1v: pix1[0] - pix1[15]
          * pix3v: pix3[0] - pix3[15]      pix3iv: pix3[1] - pix3[16] */
         pix1v  = vec_ld(0, pix1);
-
-        pix2l  = vec_ld(0, pix3);
-        pix2r  = vec_ld(16, pix3);
-        pix3v  = vec_perm(pix2l, pix2r, perm1);
-        pix3iv = vec_perm(pix2l, pix2r, perm2);
+        LOAD_PIX(pix3v, pix3iv, pix3, perm1, perm2);
 
         /* Note that AltiVec does have vec_avg, but this works on vector pairs
          * and rounds up. We could do avg(avg(a, b), avg(c, d)), but the
@@ -191,10 +198,10 @@ static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
          * vectors of shorts and do the averaging by hand. */
 
         /* Split the pixel vectors into shorts. */
-        pix3hv  = (vector unsigned short) vec_mergeh(zero, pix3v);
-        pix3lv  = (vector unsigned short) vec_mergel(zero, pix3v);
-        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
-        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);
+        pix3hv  = (vector unsigned short) VEC_MERGEH(zero, pix3v);
+        pix3lv  = (vector unsigned short) VEC_MERGEL(zero, pix3v);
+        pix3ihv = (vector unsigned short) VEC_MERGEH(zero, pix3iv);
+        pix3ilv = (vector unsigned short) VEC_MERGEL(zero, pix3iv);
 
         /* Do the averaging on them. */
         t3 = vec_add(pix3hv, pix3ihv);
@@ -229,19 +236,17 @@ static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 static int sad16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                          ptrdiff_t stride, int h)
 {
-    int i, s;
+    int i;
+    int  __attribute__((aligned(16))) s;
     const vector unsigned int zero =
         (const vector unsigned int) vec_splat_u32(0);
-    vector unsigned char perm = vec_lvsl(0, pix2);
     vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumdiffs;
 
     for (i = 0; i < h; i++) {
         /* Read potentially unaligned pixels into t1 and t2. */
-        vector unsigned char pix2l = vec_ld(0,  pix2);
-        vector unsigned char pix2r = vec_ld(15, pix2);
-        vector unsigned char t1 = vec_ld(0, pix1);
-        vector unsigned char t2 = vec_perm(pix2l, pix2r, perm);
+        vector unsigned char t1 =vec_ld(0, pix1);
+        vector unsigned char t2 = VEC_LD(0, pix2);
 
         /* Calculate a sum of abs differences vector. */
         vector unsigned char t3 = vec_max(t1, t2);
@@ -266,14 +271,13 @@ static int sad16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
 {
-    int i, s;
+    int i;
+    int  __attribute__((aligned(16))) s;
     const vector unsigned int zero =
         (const vector unsigned int) vec_splat_u32(0);
     const vector unsigned char permclear =
         (vector unsigned char)
         { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
-    vector unsigned char perm1 = vec_lvsl(0, pix1);
-    vector unsigned char perm2 = vec_lvsl(0, pix2);
     vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumdiffs;
 
@@ -281,14 +285,10 @@ static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
         /* Read potentially unaligned pixels into t1 and t2.
          * Since we're reading 16 pixels, and actually only want 8,
          * mask out the last 8 pixels. The 0s don't change the sum. */
-        vector unsigned char pix1l = vec_ld(0, pix1);
-        vector unsigned char pix1r = vec_ld(7, pix1);
-        vector unsigned char pix2l = vec_ld(0, pix2);
-        vector unsigned char pix2r = vec_ld(7, pix2);
-        vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1),
-                                          permclear);
-        vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2),
-                                          permclear);
+        vector unsigned char pix1l = VEC_LD(0, pix1);
+        vector unsigned char pix2l = VEC_LD(0, pix2);
+        vector unsigned char t1 = vec_and(pix1l, permclear);
+        vector unsigned char t2 = vec_and(pix2l, permclear);
 
         /* Calculate a sum of abs differences vector. */
         vector unsigned char t3 = vec_max(t1, t2);
@@ -315,14 +315,13 @@ static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
 {
-    int i, s;
+    int i;
+    int  __attribute__((aligned(16))) s;
     const vector unsigned int zero =
         (const vector unsigned int) vec_splat_u32(0);
     const vector unsigned char permclear =
         (vector unsigned char)
         { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
-    vector unsigned char perm1 = vec_lvsl(0, pix1);
-    vector unsigned char perm2 = vec_lvsl(0, pix2);
     vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumsqr;
 
@@ -330,14 +329,8 @@ static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
         /* Read potentially unaligned pixels into t1 and t2.
          * Since we're reading 16 pixels, and actually only want 8,
          * mask out the last 8 pixels. The 0s don't change the sum. */
-        vector unsigned char pix1l = vec_ld(0, pix1);
-        vector unsigned char pix1r = vec_ld(7, pix1);
-        vector unsigned char pix2l = vec_ld(0, pix2);
-        vector unsigned char pix2r = vec_ld(7, pix2);
-        vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1),
-                                          permclear);
-        vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2),
-                                          permclear);
+        vector unsigned char t1 = vec_and(VEC_LD(0, pix1), permclear);
+        vector unsigned char t2 = vec_and(VEC_LD(0, pix2), permclear);
 
         /* Since we want to use unsigned chars, we can take advantage
          * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */
@@ -367,19 +360,17 @@ static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                          ptrdiff_t stride, int h)
 {
-    int i, s;
+    int i;
+    int  __attribute__((aligned(16))) s;
     const vector unsigned int zero =
         (const vector unsigned int) vec_splat_u32(0);
-    vector unsigned char perm = vec_lvsl(0, pix2);
     vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumsqr;
 
     for (i = 0; i < h; i++) {
         /* Read potentially unaligned pixels into t1 and t2. */
-        vector unsigned char pix2l = vec_ld(0,  pix2);
-        vector unsigned char pix2r = vec_ld(15, pix2);
         vector unsigned char t1 = vec_ld(0, pix1);
-        vector unsigned char t2 = vec_perm(pix2l, pix2r, perm);
+        vector unsigned char t2 = VEC_LD(0, pix2);
 
         /* Since we want to use unsigned chars, we can take advantage
          * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */
@@ -399,15 +390,15 @@ static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
     /* Sum up the four partial sums, and put the result into s. */
     sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
     sumsqr = vec_splat(sumsqr, 3);
-    vec_ste(sumsqr, 0, &s);
 
+    vec_ste(sumsqr, 0, &s);
     return s;
 }
 
 static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
                                      uint8_t *src, ptrdiff_t stride, int h)
 {
-    int sum;
+    int __attribute__((aligned(16))) sum;
     register const vector unsigned char vzero =
         (const vector unsigned char) vec_splat_u8(0);
     register vector signed short temp0, temp1, temp2, temp3, temp4,
@@ -432,24 +423,19 @@ static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
             { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
               0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
 
+
 #define ONEITERBUTTERFLY(i, res)                                            \
     {                                                                       \
-        register vector unsigned char src1 = vec_ld(stride * i, src);       \
-        register vector unsigned char src2 = vec_ld(stride * i + 15, src);  \
-        register vector unsigned char srcO =                                \
-            vec_perm(src1, src2, vec_lvsl(stride * i, src));                \
-        register vector unsigned char dst1 = vec_ld(stride * i, dst);       \
-        register vector unsigned char dst2 = vec_ld(stride * i + 15, dst);  \
-        register vector unsigned char dstO =                                \
-            vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));                \
+        register vector unsigned char srcO =  unaligned_load(stride * i, src);  \
+        register vector unsigned char dstO = unaligned_load(stride * i, dst);\
                                                                             \
         /* Promote the unsigned chars to signed shorts. */                  \
         /* We're in the 8x8 function, we only care for the first 8. */      \
         register vector signed short srcV =                                 \
-            (vector signed short) vec_mergeh((vector signed char) vzero,    \
+            (vector signed short) VEC_MERGEH((vector signed char) vzero,    \
                                              (vector signed char) srcO);    \
         register vector signed short dstV =                                 \
-            (vector signed short) vec_mergeh((vector signed char) vzero,    \
+            (vector signed short) VEC_MERGEH((vector signed char) vzero,    \
                                              (vector signed char) dstO);    \
                                                                             \
         /* subtractions inside the first butterfly */                       \
@@ -461,6 +447,7 @@ static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
         register vector signed short op3  = vec_perm(but2, but2, perm3);    \
         res  = vec_mladd(but2, vprod3, op3);                                \
     }
+
         ONEITERBUTTERFLY(0, temp0);
         ONEITERBUTTERFLY(1, temp1);
         ONEITERBUTTERFLY(2, temp2);
@@ -510,13 +497,14 @@ static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
         vsum = vec_sum4s(vec_abs(line7C), vsum);
         vsum = vec_sums(vsum, (vector signed int) vzero);
         vsum = vec_splat(vsum, 3);
+
         vec_ste(vsum, 0, &sum);
     }
     return sum;
 }
 
 /*
- * 16x8 works with 16 elements; it allows to avoid replicating loads, and
+ * 16x8 works with 16 elements; it can avoid replicating loads, and
  * gives the compiler more room for scheduling. It's only used from
  * inside hadamard8_diff16_altivec.
  *
@@ -536,7 +524,7 @@ static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
 static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst,
                                       uint8_t *src, ptrdiff_t stride, int h)
 {
-    int sum;
+    int __attribute__((aligned(16))) sum;
     register vector signed short
         temp0 __asm__ ("v0"),
         temp1 __asm__ ("v1"),
@@ -584,31 +572,23 @@ static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst,
 
 #define ONEITERBUTTERFLY(i, res1, res2)                                     \
     {                                                                       \
-        register vector unsigned char src1 __asm__ ("v22") =                \
-            vec_ld(stride * i, src);                                        \
-        register vector unsigned char src2 __asm__ ("v23") =                \
-            vec_ld(stride * i + 16, src);                                   \
         register vector unsigned char srcO __asm__ ("v22") =                \
-            vec_perm(src1, src2, vec_lvsl(stride * i, src));                \
-        register vector unsigned char dst1 __asm__ ("v24") =                \
-            vec_ld(stride * i, dst);                                        \
-        register vector unsigned char dst2 __asm__ ("v25") =                \
-            vec_ld(stride * i + 16, dst);                                   \
+            unaligned_load(stride * i, src);                                    \
         register vector unsigned char dstO __asm__ ("v23") =                \
-            vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));                \
+            unaligned_load(stride * i, dst);\
                                                                             \
         /* Promote the unsigned chars to signed shorts. */                  \
         register vector signed short srcV __asm__ ("v24") =                 \
-            (vector signed short) vec_mergeh((vector signed char) vzero,    \
+            (vector signed short) VEC_MERGEH((vector signed char) vzero,    \
                                              (vector signed char) srcO);    \
         register vector signed short dstV __asm__ ("v25") =                 \
-            (vector signed short) vec_mergeh((vector signed char) vzero,    \
+            (vector signed short) VEC_MERGEH((vector signed char) vzero,    \
                                              (vector signed char) dstO);    \
         register vector signed short srcW __asm__ ("v26") =                 \
-            (vector signed short) vec_mergel((vector signed char) vzero,    \
+            (vector signed short) VEC_MERGEL((vector signed char) vzero,    \
                                              (vector signed char) srcO);    \
         register vector signed short dstW __asm__ ("v27") =                 \
-            (vector signed short) vec_mergel((vector signed char) vzero,    \
+            (vector signed short) VEC_MERGEL((vector signed char) vzero,    \
                                              (vector signed char) dstO);    \
                                                                             \
         /* subtractions inside the first butterfly */                       \
@@ -639,6 +619,7 @@ static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst,
         res1 = vec_mladd(but2, vprod3, op3);                                \
         res2 = vec_mladd(but2S, vprod3, op3S);                              \
     }
+
         ONEITERBUTTERFLY(0, temp0, temp0S);
         ONEITERBUTTERFLY(1, temp1, temp1S);
         ONEITERBUTTERFLY(2, temp2, temp2S);
@@ -725,6 +706,7 @@ static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst,
         vsum = vec_sum4s(vec_abs(line7CS), vsum);
         vsum = vec_sums(vsum, (vector signed int) vzero);
         vsum = vec_splat(vsum, 3);
+
         vec_ste(vsum, 0, &sum);
     }
     return sum;
@@ -746,7 +728,7 @@ static int hadamard8_diff16_altivec(MpegEncContext *s, uint8_t *dst,
 
 av_cold void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/mpegaudiodsp_altivec.c b/libavcodec/ppc/mpegaudiodsp_altivec.c
index 21cbcf3..ddfe5dc 100644
--- a/libavcodec/ppc/mpegaudiodsp_altivec.c
+++ b/libavcodec/ppc/mpegaudiodsp_altivec.c
@@ -2,20 +2,20 @@
  * Altivec optimized MP3 decoding functions
  * Copyright (c) 2010 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/mpegaudiodsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 #define MACS(rt, ra, rb) rt+=(ra)*(rb)
 #define MLSS(rt, ra, rb) rt-=(ra)*(rb)
@@ -132,7 +132,7 @@ static void apply_window_mp3(float *in, float *win, int *unused, float *out,
 
 av_cold void ff_mpadsp_init_ppc(MPADSPContext *s)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/mpegvideo_altivec.c b/libavcodec/ppc/mpegvideo_altivec.c
index 550a03a..1b6bda6 100644
--- a/libavcodec/ppc/mpegvideo_altivec.c
+++ b/libavcodec/ppc/mpegvideo_altivec.c
@@ -4,20 +4,20 @@
  * dct_unquantize_h263_altivec:
  * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,7 +32,7 @@
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/mpegvideo.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 /* AltiVec version of dct_unquantize_h263
    this code assumes `block' is 16 bytes-aligned */
@@ -42,8 +42,6 @@ static void dct_unquantize_h263_altivec(MpegEncContext *s,
     int i, level, qmul, qadd;
     int nCoeffs;
 
-    assert(s->block_last_index[n]>=0);
-
     qadd = (qscale - 1) | 1;
     qmul = qscale << 1;
 
@@ -59,6 +57,7 @@ static void dct_unquantize_h263_altivec(MpegEncContext *s,
         nCoeffs= 63; //does not always use zigzag table
     } else {
         i = 0;
+        av_assert2(s->block_last_index[n]>=0);
         nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
     }
 
@@ -117,7 +116,7 @@ static void dct_unquantize_h263_altivec(MpegEncContext *s,
 
 av_cold void ff_mpv_common_init_ppc(MpegEncContext *s)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/mpegvideodsp.c b/libavcodec/ppc/mpegvideodsp.c
index 0b426e5..7696954 100644
--- a/libavcodec/ppc/mpegvideodsp.c
+++ b/libavcodec/ppc/mpegvideodsp.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,7 +25,7 @@
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/mpegvideodsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 /* AltiVec-enhanced gmc1. ATM this code assumes stride is a multiple of 8
  * to preserve proper dst alignment. */
 static void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */,
@@ -66,7 +66,7 @@ static void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */,
                                                    vec_lvsl(0, src));
 
     if (src_really_odd != 0x0000000F)
-        /* If src & 0xF == 0xF, then (src + 1) is properly aligned
+        /* If (src & 0xF) == 0xF, then (src + 1) is properly aligned
          * on the second vector. */
         srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src));
     else
@@ -88,7 +88,7 @@ static void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */,
         srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src));
 
         if (src_really_odd != 0x0000000F)
-            /* If src & 0xF == 0xF, then (src + 1) is properly aligned
+            /* If (src & 0xF) == 0xF, then (src + 1) is properly aligned
              * on the second vector. */
             srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src));
         else
@@ -127,7 +127,7 @@ static void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */,
 
 av_cold void ff_mpegvideodsp_init_ppc(MpegVideoDSPContext *c)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     c->gmc1 = gmc1_altivec;
 #endif /* HAVE_ALTIVEC */
 }
diff --git a/libavcodec/ppc/mpegvideoencdsp.c b/libavcodec/ppc/mpegvideoencdsp.c
index 7354816..3e6765c 100644
--- a/libavcodec/ppc/mpegvideoencdsp.c
+++ b/libavcodec/ppc/mpegvideoencdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,8 +29,36 @@
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/mpegvideoencdsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
+#if HAVE_VSX
+static int pix_norm1_altivec(uint8_t *pix, int line_size)
+{
+    int i, s = 0;
+    const vector unsigned int zero =
+        (const vector unsigned int) vec_splat_u32(0);
+    vector unsigned int sv = (vector unsigned int) vec_splat_u32(0);
+    vector signed int sum;
+    /* Returns the sum of squared pixel values over 16 rows of 16 pixels. */
+    for (i = 0; i < 16; i++) {
+        /* Read 16 pixels of the current row; pix may be unaligned.
+         * A single vec_vsx_ld is used here, so the two aligned
+         * vec_ld loads plus vec_perm realignment are not needed
+         * (presumably because VSX loads accept any address). */
+        vector unsigned char pixv = vec_vsx_ld(0,  pix);
+
+        /* Multiply each pixel by itself and accumulate into four partial sums. */
+        sv = vec_msum(pixv, pixv, sv);
+
+        pix += line_size;
+    }
+    /* Fold the four partial sums (vec_sums), splat element 3 and store one word into s. */
+    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
+    sum = vec_splat(sum, 3);
+    vec_ste(sum, 0, &s);
+    return s;
+}
+#else
 static int pix_norm1_altivec(uint8_t *pix, int line_size)
 {
     int i, s = 0;
@@ -58,7 +86,37 @@ static int pix_norm1_altivec(uint8_t *pix, int line_size)
 
     return s;
 }
+#endif /* HAVE_VSX */
+
+#if HAVE_VSX
+static int pix_sum_altivec(uint8_t *pix, int line_size)
+{
+    int i, s;
+    const vector unsigned int zero =
+        (const vector unsigned int) vec_splat_u32(0);
+    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
+    vector signed int sumdiffs;
+    /* Returns the sum of the pixel values over 16 rows of 16 pixels. */
+    for (i = 0; i < 16; i++) {
+        /* Read 16 pixels of the current row into t1; pix may be
+         * unaligned. A single vec_vsx_ld is used here, so the two
+         * aligned vec_ld loads plus vec_perm realignment are not
+         * needed (presumably VSX loads accept any address). */
+        vector unsigned char t1   = vec_vsx_ld(0,  pix);
 
+        /* Add each 4 pixel group together and accumulate the 4 results in sad (a plain sum). */
+        sad = vec_sum4s(t1, sad);
+
+        pix += line_size;
+    }
+
+    /* Sum up the four partial sums; s has no initializer, vec_ste below is its only write. */
+    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
+    sumdiffs = vec_splat(sumdiffs, 3);
+    vec_ste(sumdiffs, 0, &s);
+    return s;
+}
+#else
 static int pix_sum_altivec(uint8_t *pix, int line_size)
 {
     int i, s;
@@ -88,12 +146,14 @@ static int pix_sum_altivec(uint8_t *pix, int line_size)
     return s;
 }
 
+#endif /* HAVE_VSX */
+
 #endif /* HAVE_ALTIVEC */
 
 av_cold void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c,
                                          AVCodecContext *avctx)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/pixblockdsp.c b/libavcodec/ppc/pixblockdsp.c
index 9cac70e..84aa562 100644
--- a/libavcodec/ppc/pixblockdsp.c
+++ b/libavcodec/ppc/pixblockdsp.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2002 Dieter Shirley
  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,10 +33,38 @@
 #include "libavcodec/avcodec.h"
 #include "libavcodec/pixblockdsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
+#if HAVE_VSX
 static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
-                               int line_size)
+                               ptrdiff_t line_size)
+{
+    int i;
+    vector unsigned char perm =
+        (vector unsigned char) {0x00,0x10, 0x01,0x11,0x02,0x12,0x03,0x13,\
+            0x04,0x14,0x05,0x15,0x06,0x16,0x07,0x17};
+    const vector unsigned char zero =
+        (const vector unsigned char) vec_splat_u8(0);
+    /* Copies 8 rows of 8 pixels (8-bit) into block as 16-bit values, one row per pass. */
+    for (i = 0; i < 8; i++) {
+        /* Read potentially unaligned pixels with a single VSX load.
+         * We're reading 16 pixels, and actually only want 8,
+         * but we simply ignore the extras. */
+        vector unsigned char bytes = vec_vsx_ld(0, pixels);
+
+        // Widen to shorts: perm pairs byte k of bytes (0x0k) with byte k of zero (0x1k).
+        // NOTE(review): this (pixel, 0) byte order looks little-endian specific -- confirm on BE VSX.
+        vector signed short shorts = (vector signed short) vec_perm(bytes, zero, perm);
+
+        // Store the 8 shorts as row i of block (i * 16 is a byte offset into block).
+        vec_vsx_st(shorts, i * 16, (vector signed short *) block);
+
+        pixels += line_size;
+    }
+}
+#else
+static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
+                               ptrdiff_t line_size)
 {
     int i;
     vec_u8 perm = vec_lvsl(0, pixels);
@@ -60,6 +88,71 @@ static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
     }
 }
 
+#endif /* HAVE_VSX */
+
+#if HAVE_VSX
+static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
+                                const uint8_t *s2, int stride)
+{
+  int i;
+  const vector unsigned char zero =
+    (const vector unsigned char) vec_splat_u8(0);
+  vector signed short shorts1, shorts2;
+  /* Stores s1 - s2 for 8 rows of 8 pixels into block as 16-bit values (2 rows per pass). */
+  for (i = 0; i < 4; i++) {
+    /* Read potentially unaligned pixels.
+     * We're reading 16 pixels, and actually only want 8,
+     * but we simply ignore the extras. */
+    vector unsigned char bytes = vec_vsx_ld(0,  s1);
+
+    // Convert the bytes into shorts. NOTE(review): (pixel, 0) pairing -- confirm on big-endian VSX.
+    shorts1 = (vector signed short) vec_mergeh(bytes, zero);
+
+    // Do the same for the second block of pixels.
+    bytes =vec_vsx_ld(0,  s2);
+
+    // Convert the bytes into shorts.
+    shorts2 = (vector signed short) vec_mergeh(bytes, zero);
+
+    // Do the subtraction.
+    shorts1 = vec_sub(shorts1, shorts2);
+
+    // Save the 8 differences as the current row of block.
+    vec_vsx_st(shorts1, 0, (vector signed short *) block);
+
+    s1    += stride;
+    s2    += stride;
+    block += 8;
+
+    /* The code below is a copy of the code above...
+     * This is a manual unroll. */
+
+    /* Read potentially unaligned pixels.
+     * We're reading 16 pixels, and actually only want 8,
+     * but we simply ignore the extras. */
+    bytes = vec_vsx_ld(0,  s1);
+
+    // Convert the bytes into shorts.
+    shorts1 = (vector signed short) vec_mergeh(bytes, zero);
+
+    // Do the same for the second block of pixels.
+    bytes = vec_vsx_ld(0,  s2);
+
+    // Convert the bytes into shorts.
+    shorts2 = (vector signed short) vec_mergeh(bytes, zero);
+
+    // Do the subtraction.
+    shorts1 = vec_sub(shorts1, shorts2);
+
+    // Save the 8 differences as the current row of block.
+    vec_vsx_st(shorts1, 0, (vector signed short *) block);
+
+    s1    += stride;
+    s2    += stride;
+    block += 8;
+  }
+}
+#else
 static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
                                 const uint8_t *s2, int stride)
 {
@@ -131,11 +224,13 @@ static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
     }
 }
 
+#endif /* HAVE_VSX */
+
 #endif /* HAVE_ALTIVEC */
 
 #if HAVE_VSX
 static void get_pixels_vsx(int16_t *restrict block, const uint8_t *pixels,
-                           int line_size)
+                           ptrdiff_t line_size)
 {
     int i;
     for (i = 0; i < 8; i++) {
@@ -171,7 +266,7 @@ av_cold void ff_pixblockdsp_init_ppc(PixblockDSPContext *c,
                                      AVCodecContext *avctx,
                                      unsigned high_bit_depth)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/svq1enc_altivec.c b/libavcodec/ppc/svq1enc_altivec.c
index 222f7c1..4e25e25 100644
--- a/libavcodec/ppc/svq1enc_altivec.c
+++ b/libavcodec/ppc/svq1enc_altivec.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,7 +32,7 @@
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/svq1enc.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
                                      int size)
 {
@@ -76,7 +76,7 @@ static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
 
 av_cold void ff_svq1enc_init_ppc(SVQ1EncContext *c)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/vc1dsp_altivec.c b/libavcodec/ppc/vc1dsp_altivec.c
index caf8721..35bb280 100644
--- a/libavcodec/ppc/vc1dsp_altivec.c
+++ b/libavcodec/ppc/vc1dsp_altivec.c
@@ -2,20 +2,20 @@
  * VC-1 and WMV3 decoder - DSP functions AltiVec-optimized
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/vc1dsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 // main steps of 8x8 transform
 #define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \
@@ -304,16 +304,23 @@ static void vc1_inv_trans_8x4_altivec(uint8_t *dest, int stride, int16_t *block)
     src2 = vec_pack(s2, sA);
     src3 = vec_pack(s3, sB);
 
+#if HAVE_BIGENDIAN
     p0 = vec_lvsl (0, dest);
     p1 = vec_lvsl (stride, dest);
     p = vec_splat_u8 (-1);
     perm0 = vec_mergeh (p, p0);
     perm1 = vec_mergeh (p, p1);
+#define GET_TMP2(dst, p)        /* tmp = vector at dst; tmp2 = tmp permuted with zero bytes via p */ \
+    tmp = vec_ld (0, dst);      \
+    tmp2 = (vector signed short)vec_perm (tmp, vec_splat_u8(0), p);
+#else /* little-endian: VSX load from dst; p is unused, vec_mergeh pairs each byte with a zero */
+#define GET_TMP2(dst,p)         \
+    tmp = vec_vsx_ld (0, dst);  \
+    tmp2 = (vector signed short)vec_mergeh (tmp, vec_splat_u8(0));
+#endif
 
 #define ADD(dest,src,perm)                                              \
-    /* *(uint64_t *)&tmp = *(uint64_t *)dest; */                        \
-    tmp = vec_ld (0, dest);                                             \
-    tmp2 = (vector signed short)vec_perm (tmp, vec_splat_u8(0), perm);  \
+    GET_TMP2(dest, perm);                                               \
     tmp3 = vec_adds (tmp2, src);                                        \
     tmp = vec_packsu (tmp3, tmp3);                                      \
     vec_ste ((vector unsigned int)tmp, 0, (unsigned int *)dest);        \
@@ -344,7 +351,7 @@ static void vc1_inv_trans_8x4_altivec(uint8_t *dest, int stride, int16_t *block)
 
 av_cold void ff_vc1dsp_init_ppc(VC1DSPContext *dsp)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/videodsp_ppc.c b/libavcodec/ppc/videodsp_ppc.c
index b9e003b..9157022 100644
--- a/libavcodec/ppc/videodsp_ppc.c
+++ b/libavcodec/ppc/videodsp_ppc.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2003-2004 Romain Dolbeau
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ppc/vorbisdsp_altivec.c b/libavcodec/ppc/vorbisdsp_altivec.c
index a7aad86..d7557c8 100644
--- a/libavcodec/ppc/vorbisdsp_altivec.c
+++ b/libavcodec/ppc/vorbisdsp_altivec.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,7 @@
 #include "libavutil/ppc/cpu.h"
 #include "libavcodec/vorbisdsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
                                             intptr_t blocksize)
 {
@@ -54,7 +54,7 @@ static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
 
 av_cold void ff_vorbisdsp_init_ppc(VorbisDSPContext *c)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/vp3dsp_altivec.c b/libavcodec/ppc/vp3dsp_altivec.c
index 68e7102..4a367b6 100644
--- a/libavcodec/ppc/vp3dsp_altivec.c
+++ b/libavcodec/ppc/vp3dsp_altivec.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2009 David Conrad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,12 +28,17 @@
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/vp3dsp.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 
 static const vec_s16 constants =
     {0, 64277, 60547, 54491, 46341, 36410, 25080, 12785};
+#if HAVE_BIGENDIAN /* byte indices 0-1 of every 4-byte group, alternating between 0-15 and 16-31 */
 static const vec_u8 interleave_high =
     {0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29};
+#else /* little-endian: byte indices 2-3 of every 4-byte group instead */
+static const vec_u8 interleave_high =
+    {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
+#endif /* HAVE_BIGENDIAN */
 
 #define IDCT_START \
     vec_s16 A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H;\
@@ -156,9 +161,18 @@ static void vp3_idct_add_altivec(uint8_t *dst, int stride, int16_t block[64])
     TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7);
     IDCT_1D(ADD8, SHIFT4)
 
-#define ADD(a)\
+#if HAVE_BIGENDIAN /* GET_VDST16: load vdst from dst, then build vdst_16 (vec_s16) from it and zero_u8v */
+#define GET_VDST16\
     vdst = vec_ld(0, dst);\
-    vdst_16 = (vec_s16)vec_perm(vdst, zero_u8v, vdst_mask);\
+    vdst_16 = (vec_s16)vec_perm(vdst, zero_u8v, vdst_mask);
+#else /* little-endian: vec_vsx_ld, then vec_mergeh pairs each byte with a zero byte (no vdst_mask) */
+#define GET_VDST16\
+    vdst = vec_vsx_ld(0,dst);\
+    vdst_16 = (vec_s16)vec_mergeh(vdst, zero_u8v);
+#endif /* HAVE_BIGENDIAN */
+
+#define ADD(a)\
+    GET_VDST16;\
     vdst_16 = vec_adds(a, vdst_16);\
     t = vec_packsu(vdst_16, vdst_16);\
     vec_ste((vec_u32)t, 0, (unsigned int *)dst);\
@@ -179,7 +193,7 @@ static void vp3_idct_add_altivec(uint8_t *dst, int stride, int16_t block[64])
 
 av_cold void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/ppc/vp8dsp_altivec.c b/libavcodec/ppc/vp8dsp_altivec.c
index 869fe67..23e4ace 100644
--- a/libavcodec/ppc/vp8dsp_altivec.c
+++ b/libavcodec/ppc/vp8dsp_altivec.c
@@ -3,20 +3,20 @@
  *
  * Copyright (C) 2010 David Conrad
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,7 +29,7 @@
 #include "libavcodec/vp8dsp.h"
 #include "hpeldsp_altivec.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
 #define REPT4(...) { __VA_ARGS__, __VA_ARGS__, __VA_ARGS__, __VA_ARGS__ }
 
 // h subpel filter uses msum to multiply+add 4 pixel taps at once
@@ -59,17 +59,30 @@ static const vec_s8 h_subpel_filters_outer[3] =
     vec_s8 filter_outerh = h_subpel_filters_outer[(i)>>1]; \
     vec_s8 filter_outerl = vec_sld(filter_outerh, filter_outerh, 2)
 
+#if HAVE_BIGENDIAN /* GET_PIXHL sets a, b, pixh, pixl; GET_OUTER sets outer; both are used by FILTER_H */
+#define GET_PIXHL(offset)                   \
+    a = vec_ld((offset)-is6tap-1, src);     \
+    b = vec_ld((offset)-is6tap-1+15, src);  \
+    pixh  = vec_perm(a, b, permh##offset);  \
+    pixl  = vec_perm(a, b, perml##offset)
+
+#define GET_OUTER(offset) outer = vec_perm(a, b, perm_6tap##offset)
+#else /* little-endian: one vec_vsx_ld into a; perm_inner/perm_outer are applied to a directly, b is unused */
+#define GET_PIXHL(offset)                   \
+    a = vec_vsx_ld((offset)-is6tap-1, src); \
+    pixh  = vec_perm(a, a, perm_inner);     \
+    pixl  = vec_perm(a, a, vec_add(perm_inner, vec_splat_u8(4)))
+
+#define GET_OUTER(offset) outer = vec_perm(a, a, perm_outer)
+#endif /* HAVE_BIGENDIAN */
+
 #define FILTER_H(dstv, off) \
-    a = vec_ld((off)-is6tap-1,    src); \
-    b = vec_ld((off)-is6tap-1+15, src); \
-\
-    pixh  = vec_perm(a, b, permh##off); \
-    pixl  = vec_perm(a, b, perml##off); \
+    GET_PIXHL(off);                            \
     filth = vec_msum(filter_inner, pixh, c64); \
     filtl = vec_msum(filter_inner, pixl, c64); \
 \
     if (is6tap) { \
-        outer = vec_perm(a, b, perm_6tap##off); \
+        GET_OUTER(off);                                \
         filth = vec_msum(filter_outerh, outer, filth); \
         filtl = vec_msum(filter_outerl, outer, filtl); \
     } \
@@ -84,9 +97,12 @@ void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
                                  int h, int mx, int w, int is6tap)
 {
     LOAD_H_SUBPEL_FILTER(mx-1);
-    vec_u8 align_vec0, align_vec8, permh0, permh8, filt;
+#if HAVE_BIGENDIAN
+    vec_u8 align_vec0, align_vec8, permh0, permh8;
     vec_u8 perm_6tap0, perm_6tap8, perml0, perml8;
-    vec_u8 a, b, pixh, pixl, outer;
+    vec_u8 b;
+#endif
+    vec_u8 filt, a, pixh, pixl, outer;
     vec_s16 f16h, f16l;
     vec_s32 filth, filtl;
 
@@ -97,6 +113,7 @@ void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
     vec_s32 c64 = vec_sl(vec_splat_s32(1), vec_splat_u32(6));
     vec_u16 c7  = vec_splat_u16(7);
 
+#if HAVE_BIGENDIAN
     align_vec0 = vec_lvsl( -is6tap-1, src);
     align_vec8 = vec_lvsl(8-is6tap-1, src);
 
@@ -107,6 +124,7 @@ void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
     perml8     = vec_perm(align_vec8, align_vec8, perm_inner);
     perm_6tap0 = vec_perm(align_vec0, align_vec0, perm_outer);
     perm_6tap8 = vec_perm(align_vec8, align_vec8, perm_outer);
+#endif
 
     while (h --> 0) {
         FILTER_H(f16h, 0);
@@ -164,6 +182,12 @@ static const vec_u8 v_subpel_filters[7] =
     dstv = vec_adds(dstv, c64); \
     dstv = vec_sra(dstv, c7)
 
+#if HAVE_BIGENDIAN /* LOAD_HL: load a row at byte offset off from s for the vertical filter */
+#define LOAD_HL(off, s, perm) load_with_perm_vec(off, s, perm)
+#else /* little-endian: perm is unused; vec_mergeh interleaves bytes 0-7 with bytes 8-15 of the row */
+#define LOAD_HL(off, s, perm) vec_mergeh(vec_vsx_ld(off,s), vec_vsx_ld((off)+8,s))
+#endif /* HAVE_BIGENDIAN */
+
 static av_always_inline
 void put_vp8_epel_v_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
                                  uint8_t *src, ptrdiff_t src_stride,
@@ -175,6 +199,7 @@ void put_vp8_epel_v_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
     vec_s16 c64 = vec_sl(vec_splat_s16(1), vec_splat_u16(6));
     vec_u16 c7  = vec_splat_u16(7);
 
+#if HAVE_BIGENDIAN
     // we want pixels 0-7 to be in the even positions and 8-15 in the odd,
     // so combine this permute with the alignment permute vector
     align_vech = vec_lvsl(0, src);
@@ -183,22 +208,23 @@ void put_vp8_epel_v_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
         perm_vec = vec_mergeh(align_vech, align_vecl);
     else
         perm_vec = vec_mergeh(align_vech, align_vech);
+#endif
 
     if (is6tap)
-        s0 = load_with_perm_vec(-2*src_stride, src, perm_vec);
-    s1 = load_with_perm_vec(-1*src_stride, src, perm_vec);
-    s2 = load_with_perm_vec( 0*src_stride, src, perm_vec);
-    s3 = load_with_perm_vec( 1*src_stride, src, perm_vec);
+        s0 = LOAD_HL(-2*src_stride, src, perm_vec);
+    s1 = LOAD_HL(-1*src_stride, src, perm_vec);
+    s2 = LOAD_HL( 0*src_stride, src, perm_vec);
+    s3 = LOAD_HL( 1*src_stride, src, perm_vec);
     if (is6tap)
-        s4 = load_with_perm_vec( 2*src_stride, src, perm_vec);
+        s4 = LOAD_HL( 2*src_stride, src, perm_vec);
 
     src += (2+is6tap)*src_stride;
 
     while (h --> 0) {
         if (is6tap)
-            s5 = load_with_perm_vec(0, src, perm_vec);
+            s5 = LOAD_HL(0, src, perm_vec);
         else
-            s4 = load_with_perm_vec(0, src, perm_vec);
+            s4 = LOAD_HL(0, src, perm_vec);
 
         FILTER_V(f16h, vec_mule);
 
@@ -272,39 +298,25 @@ EPEL_HV(4,  4,4)
 
 static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my)
 {
-    register vector unsigned char pixelsv1, pixelsv2;
-    register vector unsigned char pixelsv1B, pixelsv2B;
-    register vector unsigned char pixelsv1C, pixelsv2C;
-    register vector unsigned char pixelsv1D, pixelsv2D;
-
-    register vector unsigned char perm = vec_lvsl(0, src);
+    register vector unsigned char perm;
     int i;
     register ptrdiff_t dstride2 = dstride << 1, sstride2 = sstride << 1;
     register ptrdiff_t dstride3 = dstride2 + dstride, sstride3 = sstride + sstride2;
     register ptrdiff_t dstride4 = dstride << 2, sstride4 = sstride << 2;
 
+#if HAVE_BIGENDIAN
+    perm = vec_lvsl(0, src);
+#endif
 // hand-unrolling the loop by 4 gains about 15%
 // mininum execution time goes from 74 to 60 cycles
 // it's faster than -funroll-loops, but using
 // -funroll-loops w/ this is bad - 74 cycles again.
 // all this is on a 7450, tuning for the 7450
     for (i = 0; i < h; i += 4) {
-        pixelsv1  = vec_ld( 0, src);
-        pixelsv2  = vec_ld(15, src);
-        pixelsv1B = vec_ld(sstride, src);
-        pixelsv2B = vec_ld(15 + sstride, src);
-        pixelsv1C = vec_ld(sstride2, src);
-        pixelsv2C = vec_ld(15 + sstride2, src);
-        pixelsv1D = vec_ld(sstride3, src);
-        pixelsv2D = vec_ld(15 + sstride3, src);
-        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
-               0, (unsigned char*)dst);
-        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
-               dstride, (unsigned char*)dst);
-        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
-               dstride2, (unsigned char*)dst);
-        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
-               dstride3, (unsigned char*)dst);
+        vec_st(load_with_perm_vec(0, src, perm), 0, dst);
+        vec_st(load_with_perm_vec(sstride, src, perm), dstride, dst);
+        vec_st(load_with_perm_vec(sstride2, src, perm), dstride2, dst);
+        vec_st(load_with_perm_vec(sstride3, src, perm), dstride3, dst);
         src += sstride4;
         dst += dstride4;
     }
@@ -315,7 +327,7 @@ static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *s
 
 av_cold void ff_vp78dsp_init_ppc(VP8DSPContext *c)
 {
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
diff --git a/libavcodec/profiles.c b/libavcodec/profiles.c
index c03106c..da745e1 100644
--- a/libavcodec/profiles.c
+++ b/libavcodec/profiles.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,11 +37,12 @@ const AVProfile ff_aac_profiles[] = {
 };
 
 const AVProfile ff_dca_profiles[] = {
-    { FF_PROFILE_DTS,        "DTS"        },
-    { FF_PROFILE_DTS_ES,     "DTS-ES"     },
-    { FF_PROFILE_DTS_96_24,  "DTS 96/24"  },
-    { FF_PROFILE_DTS_HD_HRA, "DTS-HD HRA" },
-    { FF_PROFILE_DTS_HD_MA,  "DTS-HD MA"  },
+    { FF_PROFILE_DTS,         "DTS"         },
+    { FF_PROFILE_DTS_ES,      "DTS-ES"      },
+    { FF_PROFILE_DTS_96_24,   "DTS 96/24"   },
+    { FF_PROFILE_DTS_HD_HRA,  "DTS-HD HRA"  },
+    { FF_PROFILE_DTS_HD_MA,   "DTS-HD MA"   },
+    { FF_PROFILE_DTS_EXPRESS, "DTS Express" },
     { FF_PROFILE_UNKNOWN },
 };
 
@@ -66,6 +67,7 @@ const AVProfile ff_hevc_profiles[] = {
     { FF_PROFILE_HEVC_MAIN,                 "Main"                },
     { FF_PROFILE_HEVC_MAIN_10,              "Main 10"             },
     { FF_PROFILE_HEVC_MAIN_STILL_PICTURE,   "Main Still Picture"  },
+    { FF_PROFILE_HEVC_REXT,                 "Rext"                },
     { FF_PROFILE_UNKNOWN },
 };
 
@@ -118,4 +120,12 @@ const AVProfile ff_vc1_profiles[] = {
     { FF_PROFILE_UNKNOWN },
 };
 
+const AVProfile ff_vp9_profiles[] = {
+    { FF_PROFILE_VP9_0, "Profile 0" },
+    { FF_PROFILE_VP9_1, "Profile 1" },
+    { FF_PROFILE_VP9_2, "Profile 2" },
+    { FF_PROFILE_VP9_3, "Profile 3" },
+    { FF_PROFILE_UNKNOWN }, /* closing entry, same as in the other profile tables */
+};
+
 #endif /* !CONFIG_SMALL */
diff --git a/libavcodec/profiles.h b/libavcodec/profiles.h
index 0276b17..c86c683 100644
--- a/libavcodec/profiles.h
+++ b/libavcodec/profiles.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,5 +29,6 @@ extern const AVProfile ff_jpeg2000_profiles[];
 extern const AVProfile ff_mpeg2_video_profiles[];
 extern const AVProfile ff_mpeg4_video_profiles[];
 extern const AVProfile ff_vc1_profiles[];
+extern const AVProfile ff_vp9_profiles[];
 
 #endif /* AVCODEC_PROFILES_H */
diff --git a/libavcodec/proresdata.c b/libavcodec/proresdata.c
index fcaf32a..9849b5c 100644
--- a/libavcodec/proresdata.c
+++ b/libavcodec/proresdata.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/proresdata.h b/libavcodec/proresdata.h
index 1e5d05e..ee8278d 100644
--- a/libavcodec/proresdata.h
+++ b/libavcodec/proresdata.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/proresdec.h b/libavcodec/proresdec.h
new file mode 100644
index 0000000..14ede5d
--- /dev/null
+++ b/libavcodec/proresdec.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2010-2011 Maxim Poliakovski
+ * Copyright (c) 2010-2011 Elvis Presley
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_PRORESDEC_H
+#define AVCODEC_PRORESDEC_H
+
+#include "blockdsp.h"
+#include "proresdsp.h"
+
+typedef struct {
+    const uint8_t *data;         ///< start of this slice inside the picture payload
+    unsigned mb_x;               ///< horizontal position of the slice, in macroblocks
+    unsigned mb_y;               ///< vertical position of the slice, in macroblocks
+    unsigned mb_count;           ///< slice width in macroblocks
+    unsigned data_size;          ///< slice size in bytes, taken from the slice index table
+    int ret;                     ///< result of decode_slice_thread: -1 until it succeeds, then 0
+} SliceContext;
+
+typedef struct {
+    BlockDSPContext bdsp;        ///< used for clear_block()
+    ProresDSPContext prodsp;     ///< used for idct_put() and the IDCT permutation
+    AVFrame *frame;              ///< frame currently being decoded into
+    int frame_type;              ///< 0 = progressive, 1 = tff, 2 = bff
+    uint8_t qmat_luma[64];       ///< luma quant matrix (from the frame header, else all 4)
+    uint8_t qmat_chroma[64];     ///< chroma quant matrix (from the frame header, else all 4)
+    SliceContext *slices;        ///< array of slice_count entries
+    int slice_count;             ///< number of slices in the current picture
+    unsigned mb_width;           ///< width of the current picture in mb
+    unsigned mb_height;          ///< height of the current picture in mb
+    uint8_t progressive_scan[64]; ///< progressive scan order, IDCT-permuted at init
+    uint8_t interlaced_scan[64]; ///< interlaced scan order, IDCT-permuted at init
+    const uint8_t *scan;         ///< points at one of the two scan tables above
+    int first_field;             ///< 1 while the first picture of a packet is decoded
+    int alpha_info;              ///< 0 = no alpha, 2 = unpacked as 16 bit, otherwise as 8 bit
+} ProresContext;
+
+#endif /* AVCODEC_PRORESDEC_H */
diff --git a/libavcodec/proresdec2.c b/libavcodec/proresdec2.c
new file mode 100644
index 0000000..a3a1ebd
--- /dev/null
+++ b/libavcodec/proresdec2.c
@@ -0,0 +1,703 @@
+/*
+ * Copyright (c) 2010-2011 Maxim Poliakovski
+ * Copyright (c) 2010-2011 Elvis Presley
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Known FOURCCs: 'apch' (HQ), 'apcn' (SD), 'apcs' (LT), 'apco' (Proxy), 'ap4h' (4444)
+ */
+
+//#define DEBUG
+
+#define LONG_BITSTREAM_READER
+
+#include "libavutil/internal.h"
+#include "avcodec.h"
+#include "get_bits.h"
+#include "idctdsp.h"
+#include "internal.h"
+#include "simple_idct.h"
+#include "proresdec.h"
+#include "proresdata.h"
+
+static void permute(uint8_t *dst, const uint8_t *src, const uint8_t permutation[64])
+{ /* Map each of the 64 entries of src through the permutation table into dst. */
+    int i;
+    for (i = 0; i < 64; i++)
+        dst[i] = permutation[src[i]]; /* src[i] indexes permutation[], so it must be < 64 */
+}
+
+static av_cold int decode_init(AVCodecContext *avctx)
+{ /* Codec init callback: set up the DSP contexts and pre-permute both scan tables. */
+    ProresContext *ctx = avctx->priv_data;
+    uint8_t idct_permutation[64];
+
+    avctx->bits_per_raw_sample = 10;
+
+    ff_blockdsp_init(&ctx->bdsp, avctx);
+    ff_proresdsp_init(&ctx->prodsp, avctx);
+
+    /* fill idct_permutation[] according to the permutation type reported by the ProRes DSP */
+    ff_init_scantable_permutation(idct_permutation,
+                                  ctx->prodsp.idct_permutation_type);
+
+    permute(ctx->progressive_scan, ff_prores_progressive_scan, idct_permutation);
+    permute(ctx->interlaced_scan, ff_prores_interlaced_scan, idct_permutation);
+
+    return 0; /* always succeeds */
+}
+
+static int decode_frame_header(ProresContext *ctx, const uint8_t *buf,
+                               const int data_size, AVCodecContext *avctx)
+{ /* Parse the frame header at buf; returns the header size or a negative AVERROR. */
+    int hdr_size, width, height, flags;
+    int version;
+    const uint8_t *ptr;
+
+    hdr_size = AV_RB16(buf); /* fixed offsets up to buf[19] are read below; decode_frame guarantees >= 20 bytes */
+    ff_dlog(avctx, "header size %d\n", hdr_size);
+    if (hdr_size > data_size) {
+        av_log(avctx, AV_LOG_ERROR, "error, wrong header size\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    version = AV_RB16(buf + 2);
+    ff_dlog(avctx, "%.4s version %d\n", buf+4, version);
+    if (version > 1) { /* only versions 0 and 1 are accepted */
+        av_log(avctx, AV_LOG_ERROR, "unsupported version: %d\n", version);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    width  = AV_RB16(buf + 8);
+    height = AV_RB16(buf + 10);
+    if (width != avctx->width || height != avctx->height) { /* coded size must equal the context size */
+        av_log(avctx, AV_LOG_ERROR, "picture resolution change: %dx%d -> %dx%d\n",
+               avctx->width, avctx->height, width, height);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    ctx->frame_type = (buf[12] >> 2) & 3; /* 0 = progressive, non-zero = interlaced */
+    ctx->alpha_info = buf[17] & 0xf;
+
+    if (ctx->alpha_info > 2) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid alpha mode %d\n", ctx->alpha_info);
+        return AVERROR_INVALIDDATA;
+    }
+    if (avctx->skip_alpha) ctx->alpha_info = 0; /* skip_alpha set: treat the stream as having no alpha */
+
+    ff_dlog(avctx, "frame type %d\n", ctx->frame_type);
+
+    if (ctx->frame_type == 0) {
+        ctx->scan = ctx->progressive_scan; // permuted
+    } else {
+        ctx->scan = ctx->interlaced_scan; // permuted
+        ctx->frame->interlaced_frame = 1;
+        ctx->frame->top_field_first = ctx->frame_type == 1;
+    }
+
+    /* both top bits of buf[12] set selects 4:4:4, anything else 4:2:2; alpha adds an A plane */
+    if (ctx->alpha_info) {
+        avctx->pix_fmt = (buf[12] & 0xC0) == 0xC0 ? AV_PIX_FMT_YUVA444P10 : AV_PIX_FMT_YUVA422P10;
+    } else {
+        avctx->pix_fmt = (buf[12] & 0xC0) == 0xC0 ? AV_PIX_FMT_YUV444P10 : AV_PIX_FMT_YUV422P10;
+    }
+
+    ptr   = buf + 20; /* optional quant matrices start here */
+    flags = buf[19];
+    ff_dlog(avctx, "flags %x\n", flags);
+
+    if (flags & 2) { /* bit 1: a 64-byte luma matrix follows */
+        if(buf + data_size - ptr < 64) {
+            av_log(avctx, AV_LOG_ERROR, "Header truncated\n");
+            return AVERROR_INVALIDDATA;
+        }
+        permute(ctx->qmat_luma, ctx->prodsp.idct_permutation, ptr); /* qmat[i] = ptr[idct_permutation[i]] */
+        ptr += 64;
+    } else {
+        memset(ctx->qmat_luma, 4, 64); /* default: flat matrix of 4 */
+    }
+
+    if (flags & 1) { /* bit 0: a 64-byte chroma matrix follows */
+        if(buf + data_size - ptr < 64) {
+            av_log(avctx, AV_LOG_ERROR, "Header truncated\n");
+            return AVERROR_INVALIDDATA;
+        }
+        permute(ctx->qmat_chroma, ctx->prodsp.idct_permutation, ptr);
+    } else {
+        memset(ctx->qmat_chroma, 4, 64); /* default: flat matrix of 4 */
+    }
+
+    return hdr_size;
+}
+
+static int decode_picture_header(AVCodecContext *avctx, const uint8_t *buf, const int buf_size)
+{ /* Parse a picture header and fill ctx->slices; returns the picture data size or a negative AVERROR. */
+    ProresContext *ctx = avctx->priv_data;
+    int i, hdr_size, slice_count;
+    unsigned pic_data_size;
+    int log2_slice_mb_width, log2_slice_mb_height;
+    int slice_mb_count, mb_x, mb_y;
+    const uint8_t *data_ptr, *index_ptr;
+
+    hdr_size = buf[0] >> 3; /* header size is stored in the top 5 bits of the first byte */
+    if (hdr_size < 8 || hdr_size > buf_size) {
+        av_log(avctx, AV_LOG_ERROR, "error, wrong picture header size\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    pic_data_size = AV_RB32(buf + 1);
+    if (pic_data_size > buf_size) {
+        av_log(avctx, AV_LOG_ERROR, "error, wrong picture data size\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    log2_slice_mb_width  = buf[7] >> 4;
+    log2_slice_mb_height = buf[7] & 0xF;
+    if (log2_slice_mb_width > 3 || log2_slice_mb_height) { /* slices: at most 8 mb wide, exactly 1 mb high */
+        av_log(avctx, AV_LOG_ERROR, "unsupported slice resolution: %dx%d\n",
+               1 << log2_slice_mb_width, 1 << log2_slice_mb_height);
+        return AVERROR_INVALIDDATA;
+    }
+
+    ctx->mb_width  = (avctx->width  + 15) >> 4;
+    if (ctx->frame_type) /* interlaced: each picture is one field, so half the rows */
+        ctx->mb_height = (avctx->height + 31) >> 5;
+    else
+        ctx->mb_height = (avctx->height + 15) >> 4;
+
+    // QT ignores the written value
+    // slice_count = AV_RB16(buf + 5);
+    slice_count = ctx->mb_height * ((ctx->mb_width >> log2_slice_mb_width) + /* full-width slices per row ... */
+                                    av_popcount(ctx->mb_width & (1 << log2_slice_mb_width) - 1)); /* ... plus one per set bit of the remainder */
+
+    if (ctx->slice_count != slice_count || !ctx->slices) { /* (re)allocate only when the count changes */
+        av_freep(&ctx->slices);
+        ctx->slice_count = 0;
+        ctx->slices = av_mallocz_array(slice_count, sizeof(*ctx->slices));
+        if (!ctx->slices)
+            return AVERROR(ENOMEM);
+        ctx->slice_count = slice_count;
+    }
+
+    if (!slice_count)
+        return AVERROR(EINVAL);
+
+    if (hdr_size + slice_count*2 > buf_size) { /* the index table holds one 16-bit size per slice */
+        av_log(avctx, AV_LOG_ERROR, "error, wrong slice count\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // parse slice information
+    index_ptr = buf + hdr_size;
+    data_ptr  = index_ptr + slice_count*2;
+
+    slice_mb_count = 1 << log2_slice_mb_width;
+    mb_x = 0;
+    mb_y = 0;
+
+    for (i = 0; i < slice_count; i++) {
+        SliceContext *slice = &ctx->slices[i];
+
+        slice->data = data_ptr;
+        data_ptr += AV_RB16(index_ptr + i*2); /* advance by this slice's size from the index table */
+
+        while (ctx->mb_width - mb_x < slice_mb_count) /* halve the slice width until it fits the rest of the row */
+            slice_mb_count >>= 1;
+
+        slice->mb_x = mb_x;
+        slice->mb_y = mb_y;
+        slice->mb_count = slice_mb_count;
+        slice->data_size = data_ptr - slice->data;
+
+        if (slice->data_size < 6) { /* decode_slice_thread reads buf[0..5] of every slice */
+            av_log(avctx, AV_LOG_ERROR, "error, wrong slice data size\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        mb_x += slice_mb_count;
+        if (mb_x == ctx->mb_width) { /* row finished: restart with full-width slices on the next row */
+            slice_mb_count = 1 << log2_slice_mb_width;
+            mb_x = 0;
+            mb_y++;
+        }
+        if (data_ptr > buf + buf_size) {
+            av_log(avctx, AV_LOG_ERROR, "error, slice out of bounds\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    if (mb_x || mb_y != ctx->mb_height) { /* the slices must tile the picture exactly */
+        av_log(avctx, AV_LOG_ERROR, "error wrong mb count y %d h %d\n",
+               mb_y, ctx->mb_height);
+        return AVERROR_INVALIDDATA;
+    }
+
+    return pic_data_size;
+}
+
+#define DECODE_CODEWORD(val, codebook)                                  \
+    do {                                                                \
+        unsigned int rice_order, exp_order, switch_bits;                \
+        unsigned int q, buf, bits;                                      \
+        /* top up the cache and peek at the upcoming bits */            \
+        UPDATE_CACHE(re, gb);                                           \
+        buf = GET_CACHE(re, gb);                                        \
+                                                                        \
+        /* number of bits to switch between rice and exp golomb */      \
+        switch_bits =  codebook & 3;                                    \
+        rice_order  =  codebook >> 5;                                   \
+        exp_order   = (codebook >> 2) & 7;                              \
+        /* codebook byte: rice_order(3) exp_order(3) switch_bits(2) */  \
+        q = 31 - av_log2(buf);                                          \
+                                                                        \
+        if (q > switch_bits) { /* exp golomb */                         \
+            bits = exp_order - switch_bits + (q<<1);                    \
+            val = SHOW_UBITS(re, gb, bits) - (1 << exp_order) +         \
+                ((switch_bits + 1) << rice_order);                      \
+            SKIP_BITS(re, gb, bits);                                    \
+        } else if (rice_order) {                                        \
+            SKIP_BITS(re, gb, q+1);                                     \
+            val = (q << rice_order) + SHOW_UBITS(re, gb, rice_order);   \
+            SKIP_BITS(re, gb, rice_order);                              \
+        } else {                                                        \
+            val = q;                                                    \
+            SKIP_BITS(re, gb, q+1);                                     \
+        }                                                               \
+    } while (0)
+
+#define TOSIGNED(x) (((x) >> 1) ^ (-((x) & 1))) /* 0, 1, 2, 3, 4 ... -> 0, -1, 1, -2, 2 ... */
+
+#define FIRST_DC_CB 0xB8 /* codebook byte used for the first DC coefficient of a slice */
+
+static const uint8_t dc_codebook[7] = { 0x04, 0x28, 0x28, 0x4D, 0x4D, 0x70, 0x70}; /* indexed by the previous code, clamped to 6 */
+
+static av_always_inline void decode_dc_coeffs(GetBitContext *gb, int16_t *out,
+                                              int blocks_per_slice)
+{ /* Decode one DC value per block and store it at out[64 * block]. */
+    int16_t prev_dc;
+    int code, i, sign;
+
+    OPEN_READER(re, gb);
+
+    DECODE_CODEWORD(code, FIRST_DC_CB); /* the first DC is coded directly, not as a difference */
+    prev_dc = TOSIGNED(code);
+    out[0] = prev_dc;
+
+    out += 64; // dc coeff for the next block
+
+    code = 5; /* initial codebook selector for the second block */
+    sign = 0;
+    for (i = 1; i < blocks_per_slice; i++, out += 64) {
+        DECODE_CODEWORD(code, dc_codebook[FFMIN(code, 6U)]); /* codebook chosen by the previous code */
+        if(code) sign ^= -(code & 1); /* an odd code flips the running sign */
+        else     sign  = 0;           /* a zero code resets it */
+        prev_dc += (((code + 1) >> 1) ^ sign) - sign; /* add or subtract (code + 1) / 2 */
+        out[0] = prev_dc;
+    }
+    CLOSE_READER(re, gb);
+}
+
+// adaptive codebook switching lut according to previous run/level values
+static const uint8_t run_to_cb[16] = { 0x06, 0x06, 0x05, 0x05, 0x04, 0x29, 0x29, 0x29, 0x29, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x4C };
+static const uint8_t lev_to_cb[10] = { 0x04, 0x0A, 0x05, 0x06, 0x04, 0x28, 0x28, 0x28, 0x28, 0x4C };
+
+static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContext *gb,
+                                             int16_t *out, int blocks_per_slice)
+{ /* Decode run/level coded AC coefficients of a slice into out; 0 or AVERROR_INVALIDDATA. */
+    ProresContext *ctx = avctx->priv_data;
+    int block_mask, sign;
+    unsigned pos, run, level;
+    int max_coeffs, i, bits_left;
+    int log2_block_count = av_log2(blocks_per_slice);
+
+    OPEN_READER(re, gb);
+    UPDATE_CACHE(re, gb);
+    run   = 4; /* initial codebook selectors for the first run ... */
+    level = 2; /* ... and the first level */
+
+    max_coeffs = 64 << log2_block_count;
+    block_mask = blocks_per_slice - 1;
+    /* pos interleaves the blocks: low bits select the block, pos >> log2_block_count the scan index */
+    for (pos = block_mask;;) {
+        bits_left = gb->size_in_bits - re_index;
+        if (!bits_left || (bits_left < 32 && !SHOW_UBITS(re, gb, bits_left))) /* nothing left but zero padding */
+            break;
+
+        DECODE_CODEWORD(run, run_to_cb[FFMIN(run,  15)]);
+        pos += run + 1;
+        if (pos >= max_coeffs) {
+            av_log(avctx, AV_LOG_ERROR, "ac tex damaged %d, %d\n", pos, max_coeffs);
+            return AVERROR_INVALIDDATA;
+        }
+
+        DECODE_CODEWORD(level, lev_to_cb[FFMIN(level, 9)]);
+        level += 1;
+
+        i = pos >> log2_block_count;
+
+        sign = SHOW_SBITS(re, gb, 1); /* sign is 0 or -1, so (level ^ sign) - sign negates when set */
+        SKIP_BITS(re, gb, 1);
+        out[((pos & block_mask) << 6) + ctx->scan[i]] = ((level ^ sign) - sign);
+    }
+
+    CLOSE_READER(re, gb);
+    return 0;
+}
+
+static int decode_slice_luma(AVCodecContext *avctx, SliceContext *slice,
+                             uint16_t *dst, int dst_stride,
+                             const uint8_t *buf, unsigned buf_size,
+                             const int16_t *qmat)
+{ /* Decode the luma plane of one slice (4 blocks per macroblock) into dst. */
+    ProresContext *ctx = avctx->priv_data;
+    LOCAL_ALIGNED_16(int16_t, blocks, [8*4*64]); /* room for 8 macroblocks of 4 blocks each */
+    int16_t *block;
+    GetBitContext gb;
+    int i, blocks_per_slice = slice->mb_count<<2;
+    int ret;
+
+    for (i = 0; i < blocks_per_slice; i++)
+        ctx->bdsp.clear_block(blocks+(i<<6));
+
+    init_get_bits(&gb, buf, buf_size << 3);
+
+    decode_dc_coeffs(&gb, blocks, blocks_per_slice);
+    if ((ret = decode_ac_coeffs(avctx, &gb, blocks, blocks_per_slice)) < 0)
+        return ret;
+
+    block = blocks;
+    for (i = 0; i < slice->mb_count; i++) {
+        ctx->prodsp.idct_put(dst, dst_stride, block+(0<<6), qmat);
+        ctx->prodsp.idct_put(dst             +8, dst_stride, block+(1<<6), qmat);
+        ctx->prodsp.idct_put(dst+4*dst_stride  , dst_stride, block+(2<<6), qmat); /* dst is uint16_t*: 4*dst_stride elements = 8*dst_stride bytes */
+        ctx->prodsp.idct_put(dst+4*dst_stride+8, dst_stride, block+(3<<6), qmat);
+        block += 4*64;
+        dst += 16; /* next macroblock: 16 samples to the right */
+    }
+    return 0;
+}
+
+static int decode_slice_chroma(AVCodecContext *avctx, SliceContext *slice,
+                               uint16_t *dst, int dst_stride,
+                               const uint8_t *buf, unsigned buf_size,
+                               const int16_t *qmat, int log2_blocks_per_mb)
+{ /* Decode one chroma plane of a slice into dst; 1 << log2_blocks_per_mb blocks per macroblock. */
+    ProresContext *ctx = avctx->priv_data;
+    LOCAL_ALIGNED_16(int16_t, blocks, [8*4*64]);
+    int16_t *block;
+    GetBitContext gb;
+    int i, j, blocks_per_slice = slice->mb_count << log2_blocks_per_mb;
+    int ret;
+
+    for (i = 0; i < blocks_per_slice; i++)
+        ctx->bdsp.clear_block(blocks+(i<<6));
+
+    init_get_bits(&gb, buf, buf_size << 3);
+
+    decode_dc_coeffs(&gb, blocks, blocks_per_slice);
+    if ((ret = decode_ac_coeffs(avctx, &gb, blocks, blocks_per_slice)) < 0)
+        return ret;
+
+    block = blocks;
+    for (i = 0; i < slice->mb_count; i++) {
+        for (j = 0; j < log2_blocks_per_mb; j++) { /* two blocks per pass; matches 1 << log2 only for the values 1 and 2 the caller passes */
+            ctx->prodsp.idct_put(dst,              dst_stride, block+(0<<6), qmat);
+            ctx->prodsp.idct_put(dst+4*dst_stride, dst_stride, block+(1<<6), qmat);
+            block += 2*64;
+            dst += 8; /* next 8-sample wide column of blocks */
+        }
+    }
+    return 0;
+}
+
+static void unpack_alpha(GetBitContext *gb, uint16_t *dst, int num_coeffs,
+                         const int num_bits)
+{ /* Unpack num_coeffs alpha samples of num_bits (16 or 8) each into dst, stored as 10-bit values. */
+    const int mask = (1 << num_bits) - 1;
+    int i, idx, val, alpha_val;
+
+    idx       = 0;
+    alpha_val = mask; /* the running alpha value starts at the maximum */
+    do {
+        do { /* one explicitly coded sample per pass */
+            if (get_bits1(gb)) {
+                val = get_bits(gb, num_bits); /* flag set: full-width value */
+            } else {
+                int sign;
+                val  = get_bits(gb, num_bits == 16 ? 7 : 4); /* flag clear: short value, lowest bit is the sign */
+                sign = val & 1;
+                val  = (val + 2) >> 1;
+                if (sign)
+                    val = -val;
+            }
+            alpha_val = (alpha_val + val) & mask; /* val is added to the running value, modulo 1 << num_bits */
+            if (num_bits == 16) {
+                dst[idx++] = alpha_val >> 6; /* 16 -> 10 bits */
+            } else {
+                dst[idx++] = (alpha_val << 2) | (alpha_val >> 6); /* 8 -> 10 bits */
+            }
+            if (idx >= num_coeffs)
+                break;
+        } while (get_bits_left(gb)>0 && get_bits1(gb));
+        val = get_bits(gb, 4); /* run length: 4 bits, or 11 more bits when those are zero */
+        if (!val)
+            val = get_bits(gb, 11);
+        if (idx + val > num_coeffs) /* never write past the output */
+            val = num_coeffs - idx;
+        if (num_bits == 16) {
+            for (i = 0; i < val; i++)
+                dst[idx++] = alpha_val >> 6;
+        } else {
+            for (i = 0; i < val; i++)
+                dst[idx++] = (alpha_val << 2) | (alpha_val >> 6);
+
+        }
+    } while (idx < num_coeffs);
+}
+
+/**
+ * Decode alpha slice plane.
+ */
+static void decode_slice_alpha(ProresContext *ctx,
+                               uint16_t *dst, int dst_stride,
+                               const uint8_t *buf, int buf_size,
+                               int blocks_per_slice)
+{ /* NOTE: the only caller in this file passes slice->mb_count as blocks_per_slice. */
+    GetBitContext gb;
+    int i;
+    LOCAL_ALIGNED_16(int16_t, blocks, [8*4*64]);
+    int16_t *block;
+
+    for (i = 0; i < blocks_per_slice<<2; i++)
+        ctx->bdsp.clear_block(blocks+(i<<6));
+
+    init_get_bits(&gb, buf, buf_size << 3);
+
+    if (ctx->alpha_info == 2) { /* alpha_info 2: 16-bit coded alpha, otherwise 8-bit */
+        unpack_alpha(&gb, blocks, blocks_per_slice * 4 * 64, 16); /* NOTE(review): int16_t* passed as uint16_t* */
+    } else {
+        unpack_alpha(&gb, blocks, blocks_per_slice * 4 * 64, 8);
+    }
+
+    block = blocks;
+    for (i = 0; i < 16; i++) { /* copy 16 rows of 16 * blocks_per_slice samples each */
+        memcpy(dst, block, 16 * blocks_per_slice * sizeof(*dst));
+        dst   += dst_stride >> 1; /* dst_stride is in bytes, dst in 16-bit samples */
+        block += 16 * blocks_per_slice;
+    }
+}
+
+static int decode_slice_thread(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
+{ /* execute2 worker: decode slice number jobnr into ctx->frame and record the result in slice->ret. */
+    ProresContext *ctx = avctx->priv_data;
+    SliceContext *slice = &ctx->slices[jobnr];
+    const uint8_t *buf = slice->data;
+    AVFrame *pic = ctx->frame;
+    int i, hdr_size, qscale, log2_chroma_blocks_per_mb;
+    int luma_stride, chroma_stride;
+    int y_data_size, u_data_size, v_data_size, a_data_size;
+    uint8_t *dest_y, *dest_u, *dest_v, *dest_a;
+    int16_t qmat_luma_scaled[64];
+    int16_t qmat_chroma_scaled[64];
+    int mb_x_shift;
+    int ret;
+
+    slice->ret = -1; /* stays negative on every early return below */
+    //av_log(avctx, AV_LOG_INFO, "slice %d mb width %d mb x %d y %d\n",
+    //       jobnr, slice->mb_count, slice->mb_x, slice->mb_y);
+
+    // slice header
+    hdr_size = buf[0] >> 3;
+    qscale = av_clip(buf[1], 1, 224);
+    qscale = qscale > 128 ? qscale - 96 << 2: qscale; /* above 128 the scale becomes (qscale - 96) * 4 */
+    y_data_size = AV_RB16(buf + 2);
+    u_data_size = AV_RB16(buf + 4);
+    v_data_size = slice->data_size - y_data_size - u_data_size - hdr_size; /* default: V takes the rest of the slice */
+    if (hdr_size > 7) v_data_size = AV_RB16(buf + 6); /* longer header: V size is stored explicitly */
+    a_data_size = slice->data_size - y_data_size - u_data_size -
+                  v_data_size - hdr_size; /* whatever remains after Y, U and V is alpha */
+
+    if (y_data_size < 0 || u_data_size < 0 || v_data_size < 0
+        || hdr_size+y_data_size+u_data_size+v_data_size > slice->data_size){
+        av_log(avctx, AV_LOG_ERROR, "invalid plane data size\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    buf += hdr_size;
+
+    for (i = 0; i < 64; i++) { /* scale both quant matrices by the slice qscale */
+        qmat_luma_scaled  [i] = ctx->qmat_luma  [i] * qscale;
+        qmat_chroma_scaled[i] = ctx->qmat_chroma[i] * qscale;
+    }
+
+    if (ctx->frame_type == 0) {
+        luma_stride   = pic->linesize[0];
+        chroma_stride = pic->linesize[1];
+    } else { /* interlaced: double the stride so consecutive rows stay within one field */
+        luma_stride   = pic->linesize[0] << 1;
+        chroma_stride = pic->linesize[1] << 1;
+    }
+
+    if (avctx->pix_fmt == AV_PIX_FMT_YUV444P10 || avctx->pix_fmt == AV_PIX_FMT_YUVA444P10) {
+        mb_x_shift = 5; /* 32 bytes of chroma per macroblock per row */
+        log2_chroma_blocks_per_mb = 2;
+    } else {
+        mb_x_shift = 4; /* 16 bytes of chroma per macroblock per row */
+        log2_chroma_blocks_per_mb = 1;
+    }
+
+    dest_y = pic->data[0] + (slice->mb_y << 4) * luma_stride + (slice->mb_x << 5); /* 16 rows and 32 bytes per macroblock */
+    dest_u = pic->data[1] + (slice->mb_y << 4) * chroma_stride + (slice->mb_x << mb_x_shift);
+    dest_v = pic->data[2] + (slice->mb_y << 4) * chroma_stride + (slice->mb_x << mb_x_shift);
+    dest_a = pic->data[3] + (slice->mb_y << 4) * luma_stride + (slice->mb_x << 5);
+
+    if (ctx->frame_type && ctx->first_field ^ ctx->frame->top_field_first) { /* this field starts on the second frame line */
+        dest_y += pic->linesize[0];
+        dest_u += pic->linesize[1];
+        dest_v += pic->linesize[2];
+        dest_a += pic->linesize[3];
+    }
+
+    ret = decode_slice_luma(avctx, slice, (uint16_t*)dest_y, luma_stride,
+                            buf, y_data_size, qmat_luma_scaled);
+    if (ret < 0)
+        return ret;
+
+    if (!(avctx->flags & AV_CODEC_FLAG_GRAY) && (u_data_size + v_data_size) > 0) {
+        ret = decode_slice_chroma(avctx, slice, (uint16_t*)dest_u, chroma_stride,
+                                  buf + y_data_size, u_data_size,
+                                  qmat_chroma_scaled, log2_chroma_blocks_per_mb);
+        if (ret < 0)
+            return ret;
+
+        ret = decode_slice_chroma(avctx, slice, (uint16_t*)dest_v, chroma_stride,
+                                  buf + y_data_size + u_data_size, v_data_size,
+                                  qmat_chroma_scaled, log2_chroma_blocks_per_mb);
+        if (ret < 0)
+            return ret;
+    }
+    else { /* gray flag set or no chroma data: fill both chroma planes with the constant 511 */
+        size_t mb_max_x = slice->mb_count << (mb_x_shift - 1); /* chroma samples per row of this slice */
+        for (size_t i = 0; i < 16; ++i)
+            for (size_t j = 0; j < mb_max_x; ++j) {
+                *(uint16_t*)(dest_u + (i * chroma_stride) + (j << 1)) = 511;
+                *(uint16_t*)(dest_v + (i * chroma_stride) + (j << 1)) = 511;
+            }
+    }
+
+    /* decode alpha plane if available */
+    if (ctx->alpha_info && pic->data[3] && a_data_size)
+        decode_slice_alpha(ctx, (uint16_t*)dest_a, luma_stride,
+                           buf + y_data_size + u_data_size + v_data_size,
+                           a_data_size, slice->mb_count);
+
+    slice->ret = 0;
+    return 0;
+}
+
+static int decode_picture(AVCodecContext *avctx)
+{ /* Run decode_slice_thread once per slice and return the first negative slice result, else 0. */
+    ProresContext *ctx = avctx->priv_data;
+    int i;
+
+    avctx->execute2(avctx, decode_slice_thread, NULL, NULL, ctx->slice_count);
+
+    for (i = 0; i < ctx->slice_count; i++) /* per-slice results are collected from slices[i].ret */
+        if (ctx->slices[i].ret < 0)
+            return ctx->slices[i].ret;
+
+    return 0;
+}
+
+static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
+                        AVPacket *avpkt)
+{ /* Decode callback: one packet holds a frame header followed by one or two pictures. */
+    ProresContext *ctx = avctx->priv_data;
+    AVFrame *frame = data;
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
+    int frame_hdr_size, pic_size, ret;
+
+    if (buf_size < 28 || AV_RL32(buf + 4) != AV_RL32("icpf")) { /* bytes 4..7 must be the "icpf" tag */
+        av_log(avctx, AV_LOG_ERROR, "invalid frame header\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    ctx->frame = frame;
+    ctx->frame->pict_type = AV_PICTURE_TYPE_I;
+    ctx->frame->key_frame = 1;
+    ctx->first_field = 1;
+
+    buf += 8; /* skip the 4 bytes before the tag and the tag itself */
+    buf_size -= 8;
+
+    frame_hdr_size = decode_frame_header(ctx, buf, buf_size, avctx);
+    if (frame_hdr_size < 0)
+        return frame_hdr_size;
+
+    buf += frame_hdr_size;
+    buf_size -= frame_hdr_size;
+
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
+ decode_picture:
+    pic_size = decode_picture_header(avctx, buf, buf_size);
+    if (pic_size < 0) {
+        av_log(avctx, AV_LOG_ERROR, "error decoding picture header\n");
+        return pic_size;
+    }
+
+    if ((ret = decode_picture(avctx)) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "error decoding picture\n");
+        return ret;
+    }
+
+    buf += pic_size;
+    buf_size -= pic_size;
+
+    if (ctx->frame_type && buf_size > 0 && ctx->first_field) { /* interlaced with data left: decode the second picture */
+        ctx->first_field = 0;
+        goto decode_picture;
+    }
+
+    *got_frame      = 1;
+
+    return avpkt->size; /* the whole packet is reported as consumed */
+}
+
+static av_cold int decode_close(AVCodecContext *avctx)
+{ /* Codec close callback: release the slice table allocated by decode_picture_header. */
+    ProresContext *ctx = avctx->priv_data;
+
+    av_freep(&ctx->slices);
+
+    return 0;
+}
+
+AVCodec ff_prores_decoder = {
+    .name           = "prores",
+    .long_name      = NULL_IF_CONFIG_SMALL("ProRes"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_PRORES,
+    .priv_data_size = sizeof(ProresContext),
+    .init           = decode_init,
+    .close          = decode_close,
+    .decode         = decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS, /* slices are dispatched through execute2 */
+};
diff --git a/libavcodec/proresdec.c b/libavcodec/proresdec_lgpl.c
index 8a53719..467a423 100644
--- a/libavcodec/proresdec.c
+++ b/libavcodec/proresdec_lgpl.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -86,7 +86,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
     ctx->slice_data       = NULL;
 
     avctx->bits_per_raw_sample = PRORES_BITS_PER_SAMPLE;
-    ff_proresdsp_init(&ctx->dsp);
+    ff_proresdsp_init(&ctx->dsp, avctx);
 
     ctx->scantable_type = -1;   // set scantable type to uninitialized
     memset(ctx->qmat_luma, 4, 64);
@@ -140,6 +140,7 @@ static int decode_frame_header(ProresContext *ctx, const uint8_t *buf,
         av_log(avctx, AV_LOG_ERROR, "Invalid alpha mode %d\n", ctx->alpha_info);
         return AVERROR_INVALIDDATA;
     }
+    if (avctx->skip_alpha) ctx->alpha_info = 0;
 
     switch (ctx->chroma_factor) {
     case 2:
@@ -250,7 +251,7 @@ static int decode_picture_header(ProresContext *ctx, const uint8_t *buf,
                       (1 << (4 + ctx->frame->interlaced_frame)) - 1) >>
                      (4 + ctx->frame->interlaced_frame);
 
-    remainder    = ctx->num_x_mbs & ((1 << slice_width_factor) - 1);
+    remainder    = av_mod_uintp2(ctx->num_x_mbs, slice_width_factor);
     num_x_slices = (ctx->num_x_mbs >> slice_width_factor) + (remainder & 1) +
                    ((remainder >> 1) & 1) + ((remainder >> 2) & 1);
 
@@ -262,7 +263,7 @@ static int decode_picture_header(ProresContext *ctx, const uint8_t *buf,
 
     if (ctx->total_slices != num_slices) {
         av_freep(&ctx->slice_data);
-        ctx->slice_data = av_malloc((num_slices + 1) * sizeof(ctx->slice_data[0]));
+        ctx->slice_data = av_malloc_array(num_slices + 1, sizeof(ctx->slice_data[0]));
         if (!ctx->slice_data)
             return AVERROR(ENOMEM);
         ctx->total_slices = num_slices;
@@ -506,8 +507,9 @@ static void unpack_alpha(GetBitContext *gb, uint16_t *dst, int num_coeffs,
                 dst[idx++] = alpha_val >> 6;
             else
                 dst[idx++] = (alpha_val << 2) | (alpha_val >> 6);
-            if (idx >= num_coeffs - 1)
+            if (idx >= num_coeffs) {
                 break;
+            }
         } while (get_bits1(gb));
         val = get_bits(gb, 4);
         if (!val)
@@ -619,7 +621,7 @@ static int decode_slice(AVCodecContext *avctx, void *tdata)
     coff[2]     = coff[1] + u_data_size;
     v_data_size = hdr_size > 7 ? AV_RB16(buf + 6) : slice_data_size - coff[2];
     coff[3]     = coff[2] + v_data_size;
-    a_data_size = slice_data_size - coff[3];
+    a_data_size = ctx->alpha_info ? slice_data_size - coff[3] : 0;
 
     /* if V or alpha component size is negative that means that previous
        component sizes are too large */
@@ -769,8 +771,8 @@ static av_cold int decode_close(AVCodecContext *avctx)
 }
 
 
-AVCodec ff_prores_decoder = {
-    .name           = "prores",
+AVCodec ff_prores_lgpl_decoder = {
+    .name           = "prores_lgpl",
     .long_name      = NULL_IF_CONFIG_SMALL("Apple ProRes (iCodec Pro)"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_PRORES,
diff --git a/libavcodec/proresdsp.c b/libavcodec/proresdsp.c
index 3af2f0b..82d6009 100644
--- a/libavcodec/proresdsp.c
+++ b/libavcodec/proresdsp.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,7 +31,7 @@
 #define CLIP_MIN (1 << (PRORES_BITS_PER_SAMPLE - 8))           ///< minimum value for clipping resulting pixels
 #define CLIP_MAX (1 << PRORES_BITS_PER_SAMPLE) - CLIP_MIN - 1  ///< maximum value for clipping resulting pixels
 
-#define CLIP_AND_BIAS(x) (av_clip((x) + BIAS, CLIP_MIN, CLIP_MAX))
+#define CLIP(x) (av_clip((x), CLIP_MIN, CLIP_MAX))
 
 /**
  * Add bias value, clamp and output pixels of a slice
@@ -44,7 +44,7 @@ static void put_pixels(uint16_t *dst, int stride, const int16_t *in)
         for (x = 0; x < 8; x++) {
             src_offset = (y << 3) + x;
 
-            dst[dst_offset + x] = CLIP_AND_BIAS(in[src_offset]);
+            dst[dst_offset + x] = CLIP(in[src_offset]);
         }
     }
 }
@@ -55,13 +55,13 @@ static void prores_idct_put_c(uint16_t *out, int linesize, int16_t *block, const
     put_pixels(out, linesize >> 1, block);
 }
 
-av_cold void ff_proresdsp_init(ProresDSPContext *dsp)
+av_cold void ff_proresdsp_init(ProresDSPContext *dsp, AVCodecContext *avctx)
 {
     dsp->idct_put = prores_idct_put_c;
     dsp->idct_permutation_type = FF_IDCT_PERM_NONE;
 
     if (ARCH_X86)
-        ff_proresdsp_init_x86(dsp);
+        ff_proresdsp_init_x86(dsp, avctx);
 
     ff_init_scantable_permutation(dsp->idct_permutation,
                                   dsp->idct_permutation_type);
diff --git a/libavcodec/proresdsp.h b/libavcodec/proresdsp.h
index e8a3ea9..159862e 100644
--- a/libavcodec/proresdsp.h
+++ b/libavcodec/proresdsp.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@
 #define AVCODEC_PRORESDSP_H
 
 #include <stdint.h>
+#include "avcodec.h"
 
 #define PRORES_BITS_PER_SAMPLE 10 ///< output precision of prores decoder
 
@@ -33,8 +34,8 @@ typedef struct ProresDSPContext {
     void (* idct_put) (uint16_t *out, int linesize, int16_t *block, const int16_t *qmat);
 } ProresDSPContext;
 
-void ff_proresdsp_init(ProresDSPContext *dsp);
+void ff_proresdsp_init(ProresDSPContext *dsp, AVCodecContext *avctx);
 
-void ff_proresdsp_init_x86(ProresDSPContext *dsp);
+void ff_proresdsp_init_x86(ProresDSPContext *dsp, AVCodecContext *avctx);
 
 #endif /* AVCODEC_PRORESDSP_H */
diff --git a/libavcodec/proresenc_anatoliy.c b/libavcodec/proresenc_anatoliy.c
new file mode 100644
index 0000000..0516066
--- /dev/null
+++ b/libavcodec/proresenc_anatoliy.c
@@ -0,0 +1,630 @@
+/*
+ * Apple ProRes encoder
+ *
+ * Copyright (c) 2011 Anatoliy Wasserman
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Apple ProRes encoder (Anatoliy Wasserman version)
+ * Known FOURCCs: 'apch' (HQ), 'apcn' (Standard), 'apcs' (LT), 'apco' (Proxy)
+ */
+
+#include "avcodec.h"
+#include "dct.h"
+#include "internal.h"
+#include "put_bits.h"
+#include "bytestream.h"
+#include "fdctdsp.h"
+/* Widest slice in macroblocks; prores_encode_picture() halves it near the right edge until the slice fits. */
+#define DEFAULT_SLICE_MB_WIDTH 8
+/* Profile ids; avctx->profile indexes profiles[], the qp/bitrate tables and QMAT_LUMA/QMAT_CHROMA with them. */
+#define FF_PROFILE_PRORES_PROXY     0
+#define FF_PROFILE_PRORES_LT        1
+#define FF_PROFILE_PRORES_STANDARD  2
+#define FF_PROFILE_PRORES_HQ        3
+/* Profile id -> FOURCC string; prores_encode_init() reads the string as a little-endian tag for codec_tag. */
+static const AVProfile profiles[] = {
+    { FF_PROFILE_PRORES_PROXY,    "apco"},
+    { FF_PROFILE_PRORES_LT,       "apcs"},
+    { FF_PROFILE_PRORES_STANDARD, "apcn"},
+    { FF_PROFILE_PRORES_HQ,       "apch"},
+    { FF_PROFILE_UNKNOWN } /* list terminator */
+};
+/* Per-profile rate control parameters read by encode_slice() / prores_encode_picture(), indexed by profile id. */
+static const int qp_start_table[4] = { 4, 1, 1, 1 }; /* lowest qp the slice loop steps down to; also the initial qp */
+static const int qp_end_table[4]   = { 8, 9, 6, 6 }; /* highest qp the slice loop steps up to */
+static const int bitrate_table[5]  = { 1000, 2100, 3500, 5400 }; /* NOTE(review): sized 5 but only 4 initialisers; entry 4 is implicitly 0 */
+/* Scan order used by encode_ac_coeffs(): entry i is the index inside a 64-entry block of the i-th coded coefficient. */
+static const uint8_t progressive_scan[64] = {
+     0,  1,  8,  9,  2,  3, 10, 11,
+    16, 17, 24, 25, 18, 19, 26, 27,
+     4,  5, 12, 20, 13,  6,  7, 14,
+    21, 28, 29, 22, 15, 23, 30, 31,
+    32, 33, 40, 48, 41, 34, 35, 42,
+    49, 56, 57, 50, 43, 36, 37, 44,
+    51, 58, 59, 52, 45, 38, 39, 46,
+    53, 60, 61, 54, 47, 55, 62, 63
+};
+/* Base luma quantization matrices, one per profile id; copied into the frame header and pre-scaled by qp in prores_encode_init(). */
+static const uint8_t QMAT_LUMA[4][64] = {
+    {
+         4,  7,  9, 11, 13, 14, 15, 63,
+         7,  7, 11, 12, 14, 15, 63, 63,
+         9, 11, 13, 14, 15, 63, 63, 63,
+        11, 11, 13, 14, 63, 63, 63, 63,
+        11, 13, 14, 63, 63, 63, 63, 63,
+        13, 14, 63, 63, 63, 63, 63, 63,
+        13, 63, 63, 63, 63, 63, 63, 63,
+        63, 63, 63, 63, 63, 63, 63, 63
+    }, {
+         4,  5,  6,  7,  9, 11, 13, 15,
+         5,  5,  7,  8, 11, 13, 15, 17,
+         6,  7,  9, 11, 13, 15, 15, 17,
+         7,  7,  9, 11, 13, 15, 17, 19,
+         7,  9, 11, 13, 14, 16, 19, 23,
+         9, 11, 13, 14, 16, 19, 23, 29,
+         9, 11, 13, 15, 17, 21, 28, 35,
+        11, 13, 16, 17, 21, 28, 35, 41
+    }, {
+         4,  4,  5,  5,  6,  7,  7,  9,
+         4,  4,  5,  6,  7,  7,  9,  9,
+         5,  5,  6,  7,  7,  9,  9, 10,
+         5,  5,  6,  7,  7,  9,  9, 10,
+         5,  6,  7,  7,  8,  9, 10, 12,
+         6,  7,  7,  8,  9, 10, 12, 15,
+         6,  7,  7,  9, 10, 11, 14, 17,
+         7,  7,  9, 10, 11, 14, 17, 21
+    }, {
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  5,
+         4,  4,  4,  4,  4,  4,  5,  5,
+         4,  4,  4,  4,  4,  5,  5,  6,
+         4,  4,  4,  4,  5,  5,  6,  7,
+         4,  4,  4,  4,  5,  6,  7,  7
+    }
+};
+/* Base chroma quantization matrices, one per profile id; copied into the frame header and pre-scaled by qp in prores_encode_init(). */
+static const uint8_t QMAT_CHROMA[4][64] = {
+    {
+         4,  7,  9, 11, 13, 14, 63, 63,
+         7,  7, 11, 12, 14, 63, 63, 63,
+         9, 11, 13, 14, 63, 63, 63, 63,
+        11, 11, 13, 14, 63, 63, 63, 63,
+        11, 13, 14, 63, 63, 63, 63, 63,
+        13, 14, 63, 63, 63, 63, 63, 63,
+        13, 63, 63, 63, 63, 63, 63, 63,
+        63, 63, 63, 63, 63, 63, 63, 63
+    }, {
+         4,  5,  6,  7,  9, 11, 13, 15,
+         5,  5,  7,  8, 11, 13, 15, 17,
+         6,  7,  9, 11, 13, 15, 15, 17,
+         7,  7,  9, 11, 13, 15, 17, 19,
+         7,  9, 11, 13, 14, 16, 19, 23,
+         9, 11, 13, 14, 16, 19, 23, 29,
+         9, 11, 13, 15, 17, 21, 28, 35,
+        11, 13, 16, 17, 21, 28, 35, 41
+    }, {
+         4,  4,  5,  5,  6,  7,  7,  9,
+         4,  4,  5,  6,  7,  7,  9,  9,
+         5,  5,  6,  7,  7,  9,  9, 10,
+         5,  5,  6,  7,  7,  9,  9, 10,
+         5,  6,  7,  7,  8,  9, 10, 12,
+         6,  7,  7,  8,  9, 10, 12, 15,
+         6,  7,  7,  9, 10, 11, 14, 17,
+         7,  7,  9, 10, 11, 14, 17, 21
+    }, {
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  4,
+         4,  4,  4,  4,  4,  4,  4,  5,
+         4,  4,  4,  4,  4,  4,  5,  5,
+         4,  4,  4,  4,  4,  5,  5,  6,
+         4,  4,  4,  4,  5,  5,  6,  7,
+         4,  4,  4,  4,  5,  6,  7,  7
+    }
+};
+
+/* Private encoder state, reached through avctx->priv_data. */
+typedef struct {
+    FDCTDSPContext fdsp;
+    uint8_t* fill_y; /* padded copy of an edge slice's luma; allocated in init only if width or height is not a multiple of 16 */
+    uint8_t* fill_u; /* padded chroma copies; both pointers are set to offsets inside the fill_y allocation */
+    uint8_t* fill_v;
+    /* qmat_*[qp - 1] holds the profile's base matrix multiplied by qp, filled for qp = 1..16 in init. */
+    int qmat_luma[16][64];
+    int qmat_chroma[16][64];
+} ProresContext;
+/** Write val to pb with the combined Rice / exp-Golomb code selected by the codebook byte. */
+static void encode_codeword(PutBitContext *pb, int val, int codebook)
+{
+    unsigned int rice_order, exp_order, switch_bits, first_exp, exp, zeros;
+
+    /* number of bits to switch between rice and exp golomb */
+    switch_bits = codebook & 3;          /* codebook bits 0-1 */
+    rice_order  = codebook >> 5;         /* codebook bits 5 and up */
+    exp_order   = (codebook >> 2) & 7;   /* codebook bits 2-4 */
+    /* Smallest value that takes the exp-Golomb branch below. */
+    first_exp = ((switch_bits + 1) << rice_order);
+
+    if (val >= first_exp) { /* exp golomb */
+        val -= first_exp;
+        val += (1 << exp_order);
+        exp = av_log2(val);
+        zeros = exp - exp_order + switch_bits + 1;
+        put_bits(pb, zeros, 0);          /* prefix of zero bits */
+        put_bits(pb, exp + 1, val);      /* the value itself, in exp + 1 bits */
+    } else if (rice_order) {
+        put_bits(pb, (val >> rice_order), 0); /* quotient as a run of zero bits */
+        put_bits(pb, 1, 1);                   /* terminating 1 bit */
+        put_sbits(pb, rice_order, val);       /* rice_order bits taken from val */
+    } else {
+        put_bits(pb, val, 0);            /* rice_order == 0: val zero bits ... */
+        put_bits(pb, 1, 1);              /* ... followed by a single 1 bit */
+    }
+}
+/* Quantization and sign helpers; the ">> 31" forms assume a 32-bit int with arithmetic right shift. */
+#define QSCALE(qmat,ind,val) ((val) / ((qmat)[ind])) /* quantize: integer division by the matrix entry */
+#define TO_GOLOMB(val) (((val) << 1) ^ ((val) >> 31)) /* signed -> unsigned: 0, -1, 1, -2 map to 0, 1, 2, 3 */
+#define DIFF_SIGN(val, sign) (((val) >> 31) ^ (sign)) /* sign mask of val XORed with sign */
+#define IS_NEGATIVE(val) ((((val) >> 31) ^ -1) + 1) /* 1 if val is negative, else 0 */
+#define TO_GOLOMB2(val,sign) ((val)==0 ? 0 : ((val) << 1) + (sign)) /* 0 stays 0, otherwise 2 * val + sign */
+/** Return the magnitude of val without branching (uses the sign mask val >> 31, so it assumes 32-bit int and arithmetic shift). */
+static av_always_inline int get_level(int val)
+{
+    int sign = (val >> 31);     /* 0 for non-negative val, -1 for negative val */
+    return (val ^ sign) - sign; /* negative: (~val) + 1; otherwise val unchanged */
+}
+/* Codebook byte passed to encode_codeword() for the first DC coefficient of a slice plane. */
+#define FIRST_DC_CB 0xB8
+/* Codebook bytes for the following DC deltas, indexed by the previously emitted code clamped to 6. */
+static const uint8_t dc_codebook[7] = { 0x04, 0x28, 0x28, 0x4D, 0x4D, 0x70, 0x70};
+/** Code the DC coefficient of every block in the slice: the first one directly, the others as deltas from the previous block. */
+static void encode_dc_coeffs(PutBitContext *pb, int16_t *in,
+        int blocks_per_slice, int *qmat)
+{
+    int prev_dc, code;
+    int i, sign, idx;
+    int new_dc, delta, diff_sign, new_code;
+    /* First DC: subtract 16384, quantize with qmat[0], map to unsigned and code with FIRST_DC_CB. */
+    prev_dc = QSCALE(qmat, 0, in[0] - 16384);
+    code = TO_GOLOMB(prev_dc);
+    encode_codeword(pb, code, FIRST_DC_CB);
+    /* Remaining DCs: blocks are 64 coefficients apart; the codebook is chosen from the previous code. */
+    code = 5; sign = 0; idx = 64;
+    for (i = 1; i < blocks_per_slice; i++, idx += 64) {
+        new_dc    = QSCALE(qmat, 0, in[idx] - 16384);
+        delta     = new_dc - prev_dc;
+        diff_sign = DIFF_SIGN(delta, sign); /* 0 if delta's sign mask equals the previous one, else -1 */
+        new_code  = TO_GOLOMB2(get_level(delta), diff_sign);
+
+        encode_codeword(pb, new_code, dc_codebook[FFMIN(code, 6)]);
+
+        code      = new_code;
+        sign      = delta >> 31; /* sign mask (0 or -1) kept for the next delta */
+        prev_dc   = new_dc;
+    }
+}
+/* Codebook bytes for AC coding: run_to_cb is indexed by the previous run (clamped to 15), lev_to_cb by the previous level (clamped to 9). */
+static const uint8_t run_to_cb[16] = { 0x06, 0x06, 0x05, 0x05, 0x04, 0x29,
+        0x29, 0x29, 0x29, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x4C };
+static const uint8_t lev_to_cb[10] = { 0x04, 0x0A, 0x05, 0x06, 0x04, 0x28,
+        0x28, 0x28, 0x28, 0x4C };
+/** Code the AC coefficients of a slice as (zero run, level - 1, sign bit) groups, visiting every block at each scan position. */
+static void encode_ac_coeffs(AVCodecContext *avctx, PutBitContext *pb,
+        int16_t *in, int blocks_per_slice, int *qmat)
+{
+    int prev_run = 4;
+    int prev_level = 2;
+    /* run counts quantized zeros since the last coded coefficient; it is never reset between blocks or positions. */
+    int run = 0, level, code, i, j;
+    for (i = 1; i < 64; i++) { /* position 0 (DC) is handled by encode_dc_coeffs() */
+        int indp = progressive_scan[i];
+        for (j = 0; j < blocks_per_slice; j++) {
+            int val = QSCALE(qmat, indp, in[(j << 6) + indp]);
+            if (val) {
+                encode_codeword(pb, run, run_to_cb[FFMIN(prev_run, 15)]);
+
+                prev_run   = run;
+                run        = 0;
+                level      = get_level(val);
+                code       = level - 1; /* level is at least 1 here, so code >= 0 */
+
+                encode_codeword(pb, code, lev_to_cb[FFMIN(prev_level, 9)]);
+
+                prev_level = level;
+
+                put_bits(pb, 1, IS_NEGATIVE(val)); /* sign bit: 1 for negative */
+            } else {
+                ++run;
+            }
+        }
+    }
+}
+/** Copy 8 rows of 16 bytes (8 int16_t values each), rows stride bytes apart, into a contiguous 64-entry block. */
+static void get(uint8_t *pixels, int stride, int16_t* block)
+{
+    int i;
+
+    for (i = 0; i < 8; i++) {
+        AV_WN64(block, AV_RN64(pixels));     /* first 8 bytes of the row -> block[0..3] */
+        AV_WN64(block+4, AV_RN64(pixels+8)); /* next 8 bytes of the row -> block[4..7] */
+        pixels += stride;
+        block += 8;
+    }
+}
+/** Load an 8x8 tile into block with get(), then apply fdsp->fdct to the block in place. */
+static void fdct_get(FDCTDSPContext *fdsp, uint8_t *pixels, int stride, int16_t* block)
+{
+    get(pixels, stride, block);
+    fdsp->fdct(block);
+}
+/** Transform and entropy-code one plane of a slice into buf; returns the number of bytes written. */
+static int encode_slice_plane(AVCodecContext *avctx, int mb_count,
+        uint8_t *src, int src_stride, uint8_t *buf, unsigned buf_size,
+        int *qmat, int chroma)
+{
+    ProresContext* ctx = avctx->priv_data;
+    FDCTDSPContext *fdsp = &ctx->fdsp;
+    LOCAL_ALIGNED(16, int16_t, blocks, [DEFAULT_SLICE_MB_WIDTH << 8]);
+    int16_t *block;
+    int i, blocks_per_slice;
+    PutBitContext pb;
+    /* Per macroblock: 4 blocks when chroma == 0, 2 vertically stacked blocks when chroma == 1. */
+    block = blocks;
+    for (i = 0; i < mb_count; i++) {
+        fdct_get(fdsp, src,                  src_stride, block + (0 << 6));
+        fdct_get(fdsp, src + 8 * src_stride, src_stride, block + ((2 - chroma) << 6)); /* 8 rows down: slot 2 (luma) or 1 (chroma) */
+        if (!chroma) {
+            fdct_get(fdsp, src + 16,                  src_stride, block + (1 << 6)); /* 16 bytes to the right */
+            fdct_get(fdsp, src + 16 + 8 * src_stride, src_stride, block + (3 << 6));
+        }
+
+        block += (256 >> chroma);
+        src   += (32  >> chroma);
+    }
+
+    blocks_per_slice = mb_count << (2 - chroma);
+    init_put_bits(&pb, buf, buf_size);
+    /* All DC coefficients are written first, then all AC coefficients. */
+    encode_dc_coeffs(&pb, blocks, blocks_per_slice, qmat);
+    encode_ac_coeffs(avctx, &pb, blocks, blocks_per_slice, qmat);
+
+    flush_put_bits(&pb);
+    return put_bits_ptr(&pb) - pb.buf;
+}
+/** Encode Y, then (unless AV_CODEC_FLAG_GRAY is set) U and V of a slice back to back in buf; returns the summed plane sizes. */
+static av_always_inline unsigned encode_slice_data(AVCodecContext *avctx,
+        uint8_t *dest_y, uint8_t *dest_u, uint8_t *dest_v, int luma_stride,
+        int chroma_stride, unsigned mb_count, uint8_t *buf, unsigned data_size,
+        unsigned* y_data_size, unsigned* u_data_size, unsigned* v_data_size,
+        int qp)
+{
+    ProresContext* ctx = avctx->priv_data;
+    /* qp is 1-based: the matrix for a given qp is stored at index qp - 1. */
+    *y_data_size = encode_slice_plane(avctx, mb_count, dest_y, luma_stride,
+            buf, data_size, ctx->qmat_luma[qp - 1], 0);
+
+    if (!(avctx->flags & AV_CODEC_FLAG_GRAY)) {
+        *u_data_size = encode_slice_plane(avctx, mb_count, dest_u,
+                chroma_stride, buf + *y_data_size, data_size - *y_data_size,
+                ctx->qmat_chroma[qp - 1], 1);
+
+        *v_data_size = encode_slice_plane(avctx, mb_count, dest_v,
+                chroma_stride, buf + *y_data_size + *u_data_size,
+                data_size - *y_data_size - *u_data_size,
+                ctx->qmat_chroma[qp - 1], 1);
+    }
+    /* In gray mode the chroma sizes are not written here and keep the caller's values. */
+    return *y_data_size + *u_data_size + *v_data_size;
+}
+/** Copy the region of a 16-bit plane starting at (x, y) into a dst_width x dst_height tile, padding right and bottom by replication. */
+static void subimage_with_fill(uint16_t *src, unsigned x, unsigned y,
+        unsigned stride, unsigned width, unsigned height, uint16_t *dst,
+        unsigned dst_width, unsigned dst_height)
+{
+    /* box_* is the part of the tile covered by the plane; stride is halved to step in uint16_t units. */
+    int box_width = FFMIN(width - x, dst_width);
+    int box_height = FFMIN(height - y, dst_height);
+    int i, j, src_stride = stride >> 1;
+    uint16_t last_pix, *last_line;
+
+    src += y * src_stride + x;
+    for (i = 0; i < box_height; ++i) {
+        for (j = 0; j < box_width; ++j) {
+            dst[j] = src[j];
+        }
+        last_pix = dst[j - 1]; /* rightmost copied sample; reads dst[-1] if box_width were 0 */
+        for (; j < dst_width; j++)
+            dst[j] = last_pix;
+        src += src_stride;
+        dst += dst_width;
+    }
+    last_line = dst - dst_width; /* last row written by the loop above */
+    for (; i < dst_height; i++) {
+        for (j = 0; j < dst_width; ++j) {
+            dst[j] = last_line[j];
+        }
+        dst += dst_width;
+    }
+}
+/** Encode one slice (6-byte header plus plane data) into buf, adjusting *qp towards the size target for non-edge slices; returns the slice size. */
+static int encode_slice(AVCodecContext *avctx, const AVFrame *pic, int mb_x,
+        int mb_y, unsigned mb_count, uint8_t *buf, unsigned data_size,
+        int unsafe, int *qp)
+{
+    int luma_stride, chroma_stride;
+    int hdr_size = 6, slice_size;
+    uint8_t *dest_y, *dest_u, *dest_v;
+    unsigned y_data_size = 0, u_data_size = 0, v_data_size = 0;
+    ProresContext* ctx = avctx->priv_data;
+    int tgt_bits   = (mb_count * bitrate_table[avctx->profile]) >> 2;
+    int low_bytes  = (tgt_bits - (tgt_bits >> 3)) >> 3; // 12% bitrate fluctuation
+    int high_bytes = (tgt_bits + (tgt_bits >> 3)) >> 3;
+
+    luma_stride   = pic->linesize[0];
+    chroma_stride = pic->linesize[1];
+    /* Byte offsets of the slice: 16 rows per macroblock row, 32 luma / 16 chroma bytes per macroblock column. */
+    dest_y = pic->data[0] + (mb_y << 4) * luma_stride   + (mb_x << 5);
+    dest_u = pic->data[1] + (mb_y << 4) * chroma_stride + (mb_x << 4);
+    dest_v = pic->data[2] + (mb_y << 4) * chroma_stride + (mb_x << 4);
+
+    if (unsafe) {
+        /* Edge slice: encode from padded copies in the fill buffers, once, at the current *qp. */
+        subimage_with_fill((uint16_t *) pic->data[0], mb_x << 4, mb_y << 4,
+                luma_stride, avctx->width, avctx->height,
+                (uint16_t *) ctx->fill_y, mb_count << 4, 16);
+        subimage_with_fill((uint16_t *) pic->data[1], mb_x << 3, mb_y << 4,
+                chroma_stride, avctx->width >> 1, avctx->height,
+                (uint16_t *) ctx->fill_u, mb_count << 3, 16);
+        subimage_with_fill((uint16_t *) pic->data[2], mb_x << 3, mb_y << 4,
+                chroma_stride, avctx->width >> 1, avctx->height,
+                (uint16_t *) ctx->fill_v, mb_count << 3, 16);
+
+        encode_slice_data(avctx, ctx->fill_y, ctx->fill_u, ctx->fill_v,
+                mb_count << 5, mb_count << 4, mb_count, buf + hdr_size,
+                data_size - hdr_size, &y_data_size, &u_data_size, &v_data_size,
+                *qp);
+    } else {
+        slice_size = encode_slice_data(avctx, dest_y, dest_u, dest_v,
+                luma_stride, chroma_stride, mb_count, buf + hdr_size,
+                data_size - hdr_size, &y_data_size, &u_data_size, &v_data_size,
+                *qp);
+        /* Re-encode with qp stepped up while too large, or down while too small, within the profile's qp range. */
+        if (slice_size > high_bytes && *qp < qp_end_table[avctx->profile]) {
+            do {
+                *qp += 1;
+                slice_size = encode_slice_data(avctx, dest_y, dest_u, dest_v,
+                        luma_stride, chroma_stride, mb_count, buf + hdr_size,
+                        data_size - hdr_size, &y_data_size, &u_data_size,
+                        &v_data_size, *qp);
+            } while (slice_size > high_bytes && *qp < qp_end_table[avctx->profile]);
+        } else if (slice_size < low_bytes && *qp
+                > qp_start_table[avctx->profile]) {
+            do {
+                *qp -= 1;
+                slice_size = encode_slice_data(avctx, dest_y, dest_u, dest_v,
+                        luma_stride, chroma_stride, mb_count, buf + hdr_size,
+                        data_size - hdr_size, &y_data_size, &u_data_size,
+                        &v_data_size, *qp);
+            } while (slice_size < low_bytes && *qp > qp_start_table[avctx->profile]);
+        }
+    }
+    /* Slice header: hdr_size << 3, qp, then big-endian 16-bit Y and U sizes (the V size is not stored). */
+    buf[0] = hdr_size << 3;
+    buf[1] = *qp;
+    AV_WB16(buf + 2, y_data_size);
+    AV_WB16(buf + 4, u_data_size);
+
+    return hdr_size + y_data_size + u_data_size + v_data_size;
+}
+/** Encode every slice of the picture into buf (8-byte picture header, slice size table, slice data); returns the bytes used. */
+static int prores_encode_picture(AVCodecContext *avctx, const AVFrame *pic,
+        uint8_t *buf, const int buf_size)
+{
+    int mb_width = (avctx->width + 15) >> 4;
+    int mb_height = (avctx->height + 15) >> 4;
+    int hdr_size, sl_size, i;
+    int mb_y, sl_data_size, qp;
+    int unsafe_bot, unsafe_right;
+    uint8_t *sl_data, *sl_data_sizes;
+    int slice_per_line = 0, rem = mb_width;
+    /* Slices per macroblock row: split mb_width into power-of-two widths, largest (DEFAULT_SLICE_MB_WIDTH) first. */
+    for (i = av_log2(DEFAULT_SLICE_MB_WIDTH); i >= 0; --i) {
+        slice_per_line += rem >> i;
+        rem &= (1 << i) - 1;
+    }
+    /* qp starts at the profile's minimum and carries over from slice to slice; the size table holds 2 bytes per slice. */
+    qp = qp_start_table[avctx->profile];
+    hdr_size = 8; sl_data_size = buf_size - hdr_size;
+    sl_data_sizes = buf + hdr_size;
+    sl_data = sl_data_sizes + (slice_per_line * mb_height * 2);
+    for (mb_y = 0; mb_y < mb_height; mb_y++) {
+        int mb_x = 0;
+        int slice_mb_count = DEFAULT_SLICE_MB_WIDTH;
+        while (mb_x < mb_width) {
+            while (mb_width - mb_x < slice_mb_count)
+                slice_mb_count >>= 1; /* shrink until the slice fits in the remaining macroblocks */
+
+            unsafe_bot = (avctx->height & 0xf) && (mb_y == mb_height - 1);
+            unsafe_right = (avctx->width & 0xf) && (mb_x + slice_mb_count == mb_width);
+
+            sl_size = encode_slice(avctx, pic, mb_x, mb_y, slice_mb_count,
+                    sl_data, sl_data_size, unsafe_bot || unsafe_right, &qp);
+
+            bytestream_put_be16(&sl_data_sizes, sl_size);
+            sl_data           += sl_size;
+            sl_data_size      -= sl_size;
+            mb_x              += slice_mb_count;
+        }
+    }
+    /* Picture header: hdr_size << 3, 32-bit picture size, 16-bit slice count, log2 of the slice width in the high nibble. */
+    buf[0] = hdr_size << 3;
+    AV_WB32(buf + 1, sl_data - buf);
+    AV_WB16(buf + 5, slice_per_line * mb_height);
+    buf[7] = av_log2(DEFAULT_SLICE_MB_WIDTH) << 4;
+
+    return sl_data - buf;
+}
+/** Encode one frame into pkt: 8 bytes of size + "icpf", a 148-byte frame header, then one picture; returns 0 or an allocation error. */
+static int prores_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                               const AVFrame *pict, int *got_packet)
+{
+    int header_size = 148;
+    uint8_t *buf;
+    int pic_size, ret;
+    int frame_size = FFALIGN(avctx->width, 16) * FFALIGN(avctx->height, 16)*16 + 500 + AV_INPUT_BUFFER_MIN_SIZE; //FIXME choose tighter limit
+
+    /* NOTE(review): frame_size already includes AV_INPUT_BUFFER_MIN_SIZE and it is added a second time below. */
+    if ((ret = ff_alloc_packet2(avctx, pkt, frame_size + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
+        return ret;
+    /* The picture is encoded first, past the header area; the headers are filled in afterwards. */
+    buf = pkt->data;
+    pic_size = prores_encode_picture(avctx, pict, buf + header_size + 8,
+            pkt->size - header_size - 8);
+
+    bytestream_put_be32(&buf, pic_size + 8 + header_size);
+    bytestream_put_buffer(&buf, "icpf", 4);
+
+    bytestream_put_be16(&buf, header_size);
+    bytestream_put_be16(&buf, 0);
+    bytestream_put_buffer(&buf, "fmpg", 4);
+    bytestream_put_be16(&buf, avctx->width);
+    bytestream_put_be16(&buf, avctx->height);
+    *buf++ = 0x83; // {10}(422){00}{00}(frame){11}
+    *buf++ = 0;
+    *buf++ = 2;
+    *buf++ = 2;
+    *buf++ = 6;
+    *buf++ = 32;
+    *buf++ = 0;
+    *buf++ = 3;
+    /* The unscaled 64-byte luma and chroma matrices of the profile complete the 148-byte header. */
+    bytestream_put_buffer(&buf, QMAT_LUMA[avctx->profile],   64);
+    bytestream_put_buffer(&buf, QMAT_CHROMA[avctx->profile], 64);
+
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    pkt->size = pic_size + 8 + header_size;
+    *got_packet = 1;
+
+    return 0;
+}
+/** Fill dst[0..63] with src[i] * scale. */
+static void scale_mat(const uint8_t* src, int* dst, int scale)
+{
+    int i;
+    for (i = 0; i < 64; i++)
+        dst[i] = src[i] * scale;
+}
+/** Encoder init: validate pixel format, size and profile, allocate the edge fill buffers, set codec_tag and pre-scale the matrices. */
+static av_cold int prores_encode_init(AVCodecContext *avctx)
+{
+    int i;
+    ProresContext* ctx = avctx->priv_data;
+
+    if (avctx->pix_fmt != AV_PIX_FMT_YUV422P10) {
+        av_log(avctx, AV_LOG_ERROR, "need YUV422P10\n");
+        return AVERROR_PATCHWELCOME;
+    }
+    avctx->bits_per_raw_sample = 10;
+
+    if (avctx->width & 0x1) {
+        av_log(avctx, AV_LOG_ERROR,
+                "frame width needs to be multiple of 2\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (avctx->width > 65534 || avctx->height > 65535) {
+        av_log(avctx, AV_LOG_ERROR,
+                "The maximum dimensions are 65534x65535\n");
+        return AVERROR(EINVAL);
+    }
+    /* Validate the profile before allocating, so an invalid profile returns without leaving fill_y allocated. */
+    if (avctx->profile == FF_PROFILE_UNKNOWN) {
+        avctx->profile = FF_PROFILE_PRORES_STANDARD;
+        av_log(avctx, AV_LOG_INFO,
+                "encoding with ProRes standard (apcn) profile\n");
+
+    } else if (avctx->profile < FF_PROFILE_PRORES_PROXY
+            || avctx->profile > FF_PROFILE_PRORES_HQ) {
+        av_log(
+                avctx,
+                AV_LOG_ERROR,
+                "unknown profile %d, use [0 - apco, 1 - apcs, 2 - apcn (default), 3 - apch]\n",
+                avctx->profile);
+        return AVERROR(EINVAL);
+    }
+    /* Scratch space for slices overhanging the right/bottom edge: one allocation split into Y, U and V parts. */
+    if ((avctx->height & 0xf) || (avctx->width & 0xf)) {
+        ctx->fill_y = av_malloc(4 * (DEFAULT_SLICE_MB_WIDTH << 8));
+        if (!ctx->fill_y)
+            return AVERROR(ENOMEM);
+        ctx->fill_u = ctx->fill_y + (DEFAULT_SLICE_MB_WIDTH << 9);
+        ctx->fill_v = ctx->fill_u + (DEFAULT_SLICE_MB_WIDTH << 8);
+    }
+
+    ff_fdctdsp_init(&ctx->fdsp, avctx);
+    /* codec_tag is the profile's FOURCC string read as a little-endian 32-bit value. */
+    avctx->codec_tag = AV_RL32((const uint8_t*)profiles[avctx->profile].name);
+    /* qmat_*[qp - 1] = base matrix * qp, for qp = 1..16. */
+    for (i = 1; i <= 16; i++) {
+        scale_mat(QMAT_LUMA[avctx->profile]  , ctx->qmat_luma[i - 1]  , i);
+        scale_mat(QMAT_CHROMA[avctx->profile], ctx->qmat_chroma[i - 1], i);
+    }
+
+    return 0;
+}
+/** Encoder close callback: frees fill_y; fill_u and fill_v point into the same allocation and are not freed separately. */
+static av_cold int prores_encode_close(AVCodecContext *avctx)
+{
+    ProresContext* ctx = avctx->priv_data;
+    av_freep(&ctx->fill_y);
+
+    return 0;
+}
+/** Codec registration for this encoder under the name "prores_aw"; the only accepted pixel format is YUV422P10. */
+AVCodec ff_prores_aw_encoder = {
+    .name           = "prores_aw",
+    .long_name      = NULL_IF_CONFIG_SMALL("Apple ProRes"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_PRORES,
+    .priv_data_size = sizeof(ProresContext),
+    .init           = prores_encode_init,
+    .close          = prores_encode_close,
+    .encode2        = prores_encode_frame,
+    .pix_fmts       = (const enum AVPixelFormat[]){AV_PIX_FMT_YUV422P10, AV_PIX_FMT_NONE},
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
+    .profiles       = profiles
+};
+/** Second registration of the same callbacks, differing from ff_prores_aw_encoder only in the name "prores". */
+AVCodec ff_prores_encoder = {
+    .name           = "prores",
+    .long_name      = NULL_IF_CONFIG_SMALL("Apple ProRes"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_PRORES,
+    .priv_data_size = sizeof(ProresContext),
+    .init           = prores_encode_init,
+    .close          = prores_encode_close,
+    .encode2        = prores_encode_frame,
+    .pix_fmts       = (const enum AVPixelFormat[]){AV_PIX_FMT_YUV422P10, AV_PIX_FMT_NONE},
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
+    .profiles       = profiles
+};
diff --git a/libavcodec/proresenc.c b/libavcodec/proresenc_kostya.c
index 0564b12..3bc1d5d 100644
--- a/libavcodec/proresenc.c
+++ b/libavcodec/proresenc_kostya.c
@@ -3,20 +3,23 @@
  *
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This encoder appears to be based on Anatoliy Wasserman's, considering
+ * the similarities in the bugs.
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,6 +40,7 @@
 #define MAX_PLANES 4
 
 enum {
+    PRORES_PROFILE_AUTO  = -1,
     PRORES_PROFILE_PROXY = 0,
     PRORES_PROFILE_LT,
     PRORES_PROFILE_STANDARD,
@@ -437,12 +441,11 @@ static int encode_slice_plane(ProresContext *ctx, PutBitContext *pb,
 
 static void put_alpha_diff(PutBitContext *pb, int cur, int prev, int abits)
 {
-    const int mask  = (1 << abits) - 1;
     const int dbits = (abits == 8) ? 4 : 7;
     const int dsize = 1 << dbits - 1;
     int diff = cur - prev;
 
-    diff &= mask;
+    diff = av_mod_uintp2(diff, abits);
     if (diff >= (1 << abits) - dsize)
         diff -= 1 << abits;
     if (diff < -dsize || diff > dsize || !diff) {
@@ -686,12 +689,11 @@ static int estimate_slice_plane(ProresContext *ctx, int *error, int plane,
 
 static int est_alpha_diff(int cur, int prev, int abits)
 {
-    const int mask  = (1 << abits) - 1;
     const int dbits = (abits == 8) ? 4 : 7;
     const int dsize = 1 << dbits - 1;
     int diff = cur - prev;
 
-    diff &= mask;
+    diff = av_mod_uintp2(diff, abits);
     if (diff >= (1 << abits) - dsize)
         diff -= 1 << abits;
     if (diff < -dsize || diff > dsize || !diff)
@@ -935,23 +937,15 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     int sizes[4] = { 0 };
     int slice_hdr_size = 2 + 2 * (ctx->num_planes - 1);
     int frame_size, picture_size, slice_size;
-    int pkt_size, ret, max_slice_size = 0;
+    int pkt_size, ret;
+    int max_slice_size = (ctx->frame_size_upper_bound - 200) / (ctx->pictures_per_frame * ctx->slices_per_picture + 1);
     uint8_t frame_flags;
 
     ctx->pic = pic;
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-    avctx->coded_frame->key_frame = 1;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
-
     pkt_size = ctx->frame_size_upper_bound;
 
-    if ((ret = ff_alloc_packet(pkt, pkt_size + AV_INPUT_BUFFER_MIN_SIZE)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, pkt_size + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
         return ret;
-    }
 
     orig_buf = pkt->data;
 
@@ -1008,7 +1002,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
         // slices
         if (!ctx->force_quant) {
-            ret = avctx->execute2(avctx, find_quant_thread, NULL, NULL,
+            ret = avctx->execute2(avctx, find_quant_thread, (void*)pic, NULL,
                                   ctx->mb_height);
             if (ret)
                 return ret;
@@ -1030,9 +1024,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
                     uint8_t *start = pkt->data;
                     // Recompute new size according to max_slice_size
                     // and deduce delta
-                    int delta = 200 + ctx->pictures_per_frame *
-                                ctx->slices_per_picture * max_slice_size -
-                                pkt_size;
+                    int delta = 200 + (ctx->pictures_per_frame *
+                                ctx->slices_per_picture + 1) *
+                                max_slice_size - pkt_size;
 
                     delta = FFMAX(delta, 2 * max_slice_size);
                     ctx->frame_size_upper_bound += delta;
@@ -1059,7 +1053,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
                     slice_hdr        = pkt->data + (slice_hdr        - start);
                     tmp              = pkt->data + (tmp              - start);
                 }
-                init_put_bits(&pb, buf, (pkt_size - (buf - orig_buf)) * 8);
+                init_put_bits(&pb, buf, (pkt_size - (buf - orig_buf)));
                 ret = encode_slice(avctx, pic, &pb, sizes, x, y, q,
                                    mbs_per_slice);
                 if (ret < 0)
@@ -1078,10 +1072,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
             }
         }
 
-        if (ctx->pictures_per_frame == 1)
-            picture_size = buf - picture_size_pos - 6;
-        else
-            picture_size = buf - picture_size_pos + 1;
+        picture_size = buf - (picture_size_pos - 1);
         bytestream_put_be32(&picture_size_pos, picture_size);
     }
 
@@ -1103,7 +1094,7 @@ static av_cold int encode_close(AVCodecContext *avctx)
 
     if (ctx->tdata) {
         for (i = 0; i < avctx->thread_count; i++)
-            av_free(ctx->tdata[i].nodes);
+            av_freep(&ctx->tdata[i].nodes);
     }
     av_freep(&ctx->tdata);
     av_freep(&ctx->slice_q);
@@ -1134,6 +1125,12 @@ static av_cold int encode_init(AVCodecContext *avctx)
     int interlaced = !!(avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT);
 
     avctx->bits_per_raw_sample = 10;
+#if FF_API_CODED_FRAME
+FF_DISABLE_DEPRECATION_WARNINGS
+    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
+    avctx->coded_frame->key_frame = 1;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
 
     ctx->fdct      = prores_fdct;
     ctx->scantable = interlaced ? ff_prores_interlaced_scan
@@ -1146,7 +1143,23 @@ static av_cold int encode_init(AVCodecContext *avctx)
                "there should be an integer power of two MBs per slice\n");
         return AVERROR(EINVAL);
     }
+    if (ctx->profile == PRORES_PROFILE_AUTO) {
+        const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt);
+        ctx->profile = (desc->flags & AV_PIX_FMT_FLAG_ALPHA ||
+                        !(desc->log2_chroma_w + desc->log2_chroma_h))
+                     ? PRORES_PROFILE_4444 : PRORES_PROFILE_HQ;
+        av_log(avctx, AV_LOG_INFO, "Autoselected %s. It can be overridden "
+               "through -profile option.\n", ctx->profile == PRORES_PROFILE_4444
+               ? "4:4:4:4 profile because of the used input colorspace"
+               : "HQ profile to keep best quality");
+    }
     if (av_pix_fmt_desc_get(avctx->pix_fmt)->flags & AV_PIX_FMT_FLAG_ALPHA) {
+        if (ctx->profile != PRORES_PROFILE_4444) {
+            // force alpha off and warn
+            av_log(avctx, AV_LOG_WARNING, "Profile selected will not "
+                   "encode alpha. Override with -profile if needed.\n");
+            ctx->alpha_bits = 0;
+        }
         if (ctx->alpha_bits & 7) {
             av_log(avctx, AV_LOG_ERROR, "alpha bits should be 0, 8 or 16\n");
             return AVERROR(EINVAL);
@@ -1248,16 +1261,16 @@ static av_cold int encode_init(AVCodecContext *avctx)
             ctx->bits_per_mb += ls * 4;
     }
 
-    ctx->frame_size_upper_bound = ctx->pictures_per_frame *
-                                  ctx->slices_per_picture *
+    ctx->frame_size_upper_bound = (ctx->pictures_per_frame *
+                                   ctx->slices_per_picture + 1) *
                                   (2 + 2 * ctx->num_planes +
                                    (mps * ctx->bits_per_mb) / 8)
                                   + 200;
 
     if (ctx->alpha_bits) {
          // The alpha plane is run-coded and might exceed the bit budget.
-         ctx->frame_size_upper_bound += ctx->pictures_per_frame *
-                                        ctx->slices_per_picture *
+         ctx->frame_size_upper_bound += (ctx->pictures_per_frame *
+                                         ctx->slices_per_picture + 1) *
          /* num pixels per slice */     (ctx->mbs_per_slice * 256 *
          /* bits per pixel */            (1 + ctx->alpha_bits + 1) + 7 >> 3);
     }
@@ -1281,8 +1294,10 @@ static const AVOption options[] = {
     { "mbs_per_slice", "macroblocks per slice", OFFSET(mbs_per_slice),
         AV_OPT_TYPE_INT, { .i64 = 8 }, 1, MAX_MBS_PER_SLICE, VE },
     { "profile",       NULL, OFFSET(profile), AV_OPT_TYPE_INT,
-        { .i64 = PRORES_PROFILE_STANDARD },
-        PRORES_PROFILE_PROXY, PRORES_PROFILE_4444, VE, "profile" },
+        { .i64 = PRORES_PROFILE_AUTO },
+        PRORES_PROFILE_AUTO, PRORES_PROFILE_4444, VE, "profile" },
+    { "auto",         NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_AUTO },
+        0, 0, VE, "profile" },
     { "proxy",         NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_PROXY },
         0, 0, VE, "profile" },
     { "lt",            NULL, 0, AV_OPT_TYPE_CONST, { .i64 = PRORES_PROFILE_LT },
@@ -1323,8 +1338,8 @@ static const AVClass proresenc_class = {
     .version    = LIBAVUTIL_VERSION_INT,
 };
 
-AVCodec ff_prores_encoder = {
-    .name           = "prores",
+AVCodec ff_prores_ks_encoder = {
+    .name           = "prores_ks",
     .long_name      = NULL_IF_CONFIG_SMALL("Apple ProRes (iCodec Pro)"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_PRORES,
diff --git a/libavcodec/psymodel.c b/libavcodec/psymodel.c
index 5179ede..2b5f111 100644
--- a/libavcodec/psymodel.c
+++ b/libavcodec/psymodel.c
@@ -2,20 +2,20 @@
  * audio encoder psychoacoustic model
  * Copyright (C) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,10 +35,11 @@ av_cold int ff_psy_init(FFPsyContext *ctx, AVCodecContext *avctx, int num_lens,
     int i, j, k = 0;
 
     ctx->avctx = avctx;
-    ctx->ch        = av_mallocz(sizeof(ctx->ch[0]) * avctx->channels * 2);
-    ctx->group     = av_mallocz(sizeof(ctx->group[0]) * num_groups);
-    ctx->bands     = av_malloc (sizeof(ctx->bands[0])     * num_lens);
-    ctx->num_bands = av_malloc (sizeof(ctx->num_bands[0]) * num_lens);
+    ctx->ch        = av_mallocz_array(sizeof(ctx->ch[0]), avctx->channels * 2);
+    ctx->group     = av_mallocz_array(sizeof(ctx->group[0]), num_groups);
+    ctx->bands     = av_malloc_array (sizeof(ctx->bands[0]),      num_lens);
+    ctx->num_bands = av_malloc_array (sizeof(ctx->num_bands[0]),  num_lens);
+    ctx->cutoff    = avctx->cutoff;
 
     if (!ctx->ch || !ctx->group || !ctx->bands || !ctx->num_bands) {
         ff_psy_end(ctx);
@@ -81,7 +82,7 @@ FFPsyChannelGroup *ff_psy_find_group(FFPsyContext *ctx, int channel)
 
 av_cold void ff_psy_end(FFPsyContext *ctx)
 {
-    if (ctx->model->end)
+    if (ctx->model && ctx->model->end)
         ctx->model->end(ctx);
     av_freep(&ctx->bands);
     av_freep(&ctx->num_bands);
@@ -94,6 +95,7 @@ typedef struct FFPsyPreprocessContext{
     float stereo_att;
     struct FFIIRFilterCoeffs *fcoeffs;
     struct FFIIRFilterState **fstate;
+    struct FFIIRFilterContext fiir;
 }FFPsyPreprocessContext;
 
 #define FILT_ORDER 4
@@ -108,22 +110,29 @@ av_cold struct FFPsyPreprocessContext* ff_psy_preprocess_init(AVCodecContext *av
         return NULL;
     ctx->avctx = avctx;
 
-    if (avctx->cutoff > 0)
-        cutoff_coeff = 2.0 * avctx->cutoff / avctx->sample_rate;
-
-    if (cutoff_coeff)
-    ctx->fcoeffs = ff_iir_filter_init_coeffs(avctx, FF_FILTER_TYPE_BUTTERWORTH,
-                                             FF_FILTER_MODE_LOWPASS, FILT_ORDER,
-                                             cutoff_coeff, 0.0, 0.0);
-    if (ctx->fcoeffs) {
-        ctx->fstate = av_mallocz(sizeof(ctx->fstate[0]) * avctx->channels);
-        if (!ctx->fstate) {
-            av_free(ctx);
-            return NULL;
+    /* AAC has its own LP method */
+    if (avctx->codec_id != AV_CODEC_ID_AAC) {
+        if (avctx->cutoff > 0)
+            cutoff_coeff = 2.0 * avctx->cutoff / avctx->sample_rate;
+
+        if (cutoff_coeff && cutoff_coeff < 0.98)
+        ctx->fcoeffs = ff_iir_filter_init_coeffs(avctx, FF_FILTER_TYPE_BUTTERWORTH,
+                                                 FF_FILTER_MODE_LOWPASS, FILT_ORDER,
+                                                 cutoff_coeff, 0.0, 0.0);
+        if (ctx->fcoeffs) {
+            ctx->fstate = av_mallocz_array(sizeof(ctx->fstate[0]), avctx->channels);
+            if (!ctx->fstate) {
+                av_free(ctx->fcoeffs);
+                av_free(ctx);
+                return NULL;
+            }
+            for (i = 0; i < avctx->channels; i++)
+                ctx->fstate[i] = ff_iir_filter_init_state(FILT_ORDER);
         }
-        for (i = 0; i < avctx->channels; i++)
-            ctx->fstate[i] = ff_iir_filter_init_state(FILT_ORDER);
     }
+
+    ff_iir_filter_init(&ctx->fiir);
+
     return ctx;
 }
 
@@ -131,21 +140,22 @@ void ff_psy_preprocess(struct FFPsyPreprocessContext *ctx, float **audio, int ch
 {
     int ch;
     int frame_size = ctx->avctx->frame_size;
+    FFIIRFilterContext *iir = &ctx->fiir;
 
     if (ctx->fstate) {
         for (ch = 0; ch < channels; ch++)
-            ff_iir_filter_flt(ctx->fcoeffs, ctx->fstate[ch], frame_size,
-                              &audio[ch][frame_size], 1, &audio[ch][frame_size], 1);
+            iir->filter_flt(ctx->fcoeffs, ctx->fstate[ch], frame_size,
+                            &audio[ch][frame_size], 1, &audio[ch][frame_size], 1);
     }
 }
 
 av_cold void ff_psy_preprocess_end(struct FFPsyPreprocessContext *ctx)
 {
     int i;
-    ff_iir_filter_free_coeffs(ctx->fcoeffs);
+    ff_iir_filter_free_coeffsp(&ctx->fcoeffs);
     if (ctx->fstate)
         for (i = 0; i < ctx->avctx->channels; i++)
-            ff_iir_filter_free_state(ctx->fstate[i]);
+            ff_iir_filter_free_statep(&ctx->fstate[i]);
     av_freep(&ctx->fstate);
     av_free(ctx);
 }
diff --git a/libavcodec/psymodel.h b/libavcodec/psymodel.h
index 1cc3066..35d184c 100644
--- a/libavcodec/psymodel.h
+++ b/libavcodec/psymodel.h
@@ -2,20 +2,20 @@
  * audio encoder psychoacoustic model
  * Copyright (C) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,6 +29,21 @@
 /** maximum number of channels */
 #define PSY_MAX_CHANS 20
 
+/* cutoff for VBR is purposely increased, since LP filtering actually
+ * hinders VBR performance rather than the opposite
+ */
+#define AAC_CUTOFF_FROM_BITRATE(bit_rate,channels,sample_rate) (bit_rate ? FFMIN3(FFMIN3( \
+    FFMAX(bit_rate/channels/5, bit_rate/channels*15/32 - 5500), \
+    3000 + bit_rate/channels/4, \
+    12000 + bit_rate/channels/16), \
+    22000, \
+    sample_rate / 2): (sample_rate / 2))
+#define AAC_CUTOFF(s) ( \
+    (s->flags & CODEC_FLAG_QSCALE) \
+    ? s->sample_rate / 2 \
+    : AAC_CUTOFF_FROM_BITRATE(s->bit_rate, s->channels, s->sample_rate) \
+)
+
 /**
  * single band psychoacoustic information
  */
@@ -36,8 +51,7 @@ typedef struct FFPsyBand {
     int   bits;
     float energy;
     float threshold;
-    float distortion;
-    float perceptual_weight;
+    float spread;    /* Energy spread over the band */
 } FFPsyBand;
 
 /**
@@ -65,6 +79,7 @@ typedef struct FFPsyWindowInfo {
     int window_shape;                 ///< window shape (sine/KBD/whatever)
     int num_windows;                  ///< number of windows in a frame
     int grouping[8];                  ///< window grouping (for e.g. AAC)
+    float clipping[8];                ///< maximum absolute normalized intensity in the given window for clip avoidance
     int *window_sizes;                ///< sequence of window sizes inside one frame (for eg. WMA)
 } FFPsyWindowInfo;
 
@@ -78,6 +93,7 @@ typedef struct FFPsyContext {
     FFPsyChannel      *ch;            ///< single channel information
     FFPsyChannelGroup *group;         ///< channel group information
     int num_groups;                   ///< number of channel groups
+    int cutoff;                       ///< lowpass frequency cutoff for analysis
 
     uint8_t **bands;                  ///< scalefactor band sizes for possible frame sizes
     int     *num_bands;               ///< number of scalefactor bands for possible frame sizes
@@ -86,6 +102,7 @@ typedef struct FFPsyContext {
     struct {
         int size;                     ///< size of the bitresevoir in bits
         int bits;                     ///< number of bits used in the bitresevoir
+        int alloc;                    ///< number of bits allocated by the psy, or -1 if no allocation was done
     } bitres;
 
     void* model_priv_data;            ///< psychoacoustic model implementation private data
diff --git a/libavcodec/pthread.c b/libavcodec/pthread.c
index 3b3f3ad..5724715 100644
--- a/libavcodec/pthread.c
+++ b/libavcodec/pthread.c
@@ -6,20 +6,20 @@
  * to Michael Niedermayer <michaelni@gmx.at> for writing initial
  * implementation.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c
index c72da53..7ef5e9f 100644
--- a/libavcodec/pthread_frame.c
+++ b/libavcodec/pthread_frame.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,12 +26,6 @@
 
 #include <stdint.h>
 
-#if HAVE_PTHREADS
-#include <pthread.h>
-#elif HAVE_W32THREADS
-#include "compat/w32pthreads.h"
-#endif
-
 #include "avcodec.h"
 #include "internal.h"
 #include "pthread_internal.h"
@@ -46,6 +40,8 @@
 #include "libavutil/internal.h"
 #include "libavutil/log.h"
 #include "libavutil/mem.h"
+#include "libavutil/opt.h"
+#include "libavutil/thread.h"
 
 /**
  * Context used by codec threads and stored in their AVCodecInternal thread_ctx.
@@ -77,6 +73,10 @@ typedef struct PerThreadContext {
                                      * Set when the codec calls get_buffer().
                                      * State is returned to STATE_SETTING_UP afterwards.
                                      */
+        STATE_GET_FORMAT,           /**<
+                                     * Set when the codec calls get_format().
+                                     * State is returned to STATE_SETTING_UP afterwards.
+                                     */
         STATE_SETUP_FINISHED        ///< Set after the codec has called ff_thread_finish_setup().
     } state;
 
@@ -90,6 +90,11 @@ typedef struct PerThreadContext {
 
     AVFrame *requested_frame;       ///< AVFrame the codec passed to get_buffer()
     int      requested_flags;       ///< flags passed to get_buffer() for requested_frame
+
+    const enum AVPixelFormat *available_formats; ///< Format array for get_format()
+    enum AVPixelFormat result_format;            ///< get_format() result
+
+    int die;                        ///< Set when the thread should exit.
 } PerThreadContext;
 
 /**
@@ -108,10 +113,11 @@ typedef struct FrameThreadContext {
                                     * Set for the first N packets, where N is the number of threads.
                                     * While it is set, ff_thread_en/decode_frame won't return any results.
                                     */
-
-    int die;                       ///< Set when threads should exit.
 } FrameThreadContext;
 
+#define THREAD_SAFE_CALLBACKS(avctx) \
+((avctx)->thread_safe_callbacks || (avctx)->get_buffer2 == avcodec_default_get_buffer2)
+
 /**
  * Codec worker thread.
  *
@@ -122,24 +128,19 @@ typedef struct FrameThreadContext {
 static attribute_align_arg void *frame_worker_thread(void *arg)
 {
     PerThreadContext *p = arg;
-    FrameThreadContext *fctx = p->parent;
     AVCodecContext *avctx = p->avctx;
     const AVCodec *codec = avctx->codec;
 
+    pthread_mutex_lock(&p->mutex);
     while (1) {
-        if (p->state == STATE_INPUT_READY && !fctx->die) {
-            pthread_mutex_lock(&p->mutex);
-            while (p->state == STATE_INPUT_READY && !fctx->die)
+            while (p->state == STATE_INPUT_READY && !p->die)
                 pthread_cond_wait(&p->input_cond, &p->mutex);
-            pthread_mutex_unlock(&p->mutex);
-        }
 
-        if (fctx->die) break;
+        if (p->die) break;
 
-        if (!codec->update_thread_context && avctx->thread_safe_callbacks)
+        if (!codec->update_thread_context && THREAD_SAFE_CALLBACKS(avctx))
             ff_thread_finish_setup(avctx);
 
-        pthread_mutex_lock(&p->mutex);
         av_frame_unref(p->frame);
         p->got_frame = 0;
         p->result = codec->decode(avctx, p->frame, &p->got_frame, &p->avpkt);
@@ -153,14 +154,21 @@ static attribute_align_arg void *frame_worker_thread(void *arg)
 
         if (p->state == STATE_SETTING_UP) ff_thread_finish_setup(avctx);
 
+        pthread_mutex_lock(&p->progress_mutex);
+#if 0 //BUFREF-FIXME
+        for (i = 0; i < MAX_BUFFERS; i++)
+            if (p->progress_used[i] && (p->got_frame || p->result<0 || avctx->codec_id != AV_CODEC_ID_H264)) {
+                p->progress[i][0] = INT_MAX;
+                p->progress[i][1] = INT_MAX;
+            }
+#endif
         p->state = STATE_INPUT_READY;
 
-        pthread_mutex_lock(&p->progress_mutex);
+        pthread_cond_broadcast(&p->progress_cond);
         pthread_cond_signal(&p->output_cond);
         pthread_mutex_unlock(&p->progress_mutex);
-
-        pthread_mutex_unlock(&p->mutex);
     }
+    pthread_mutex_unlock(&p->mutex);
 
     return NULL;
 }
@@ -171,6 +179,7 @@ static attribute_align_arg void *frame_worker_thread(void *arg)
  * @param dst The destination context.
  * @param src The source context.
  * @param for_user 0 if the destination is a codec thread, 1 if the destination is the user's thread
+ * @return 0 on success, negative error code on failure
  */
 static int update_context_from_thread(AVCodecContext *dst, AVCodecContext *src, int for_user)
 {
@@ -211,10 +220,16 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
         dst->hwaccel = src->hwaccel;
         dst->hwaccel_context = src->hwaccel_context;
+
+        dst->channels       = src->channels;
+        dst->sample_rate    = src->sample_rate;
+        dst->sample_fmt     = src->sample_fmt;
+        dst->channel_layout = src->channel_layout;
         dst->internal->hwaccel_priv_data = src->internal->hwaccel_priv_data;
     }
 
     if (for_user) {
+        dst->delay       = src->thread_count - 1;
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
         dst->coded_frame = src->coded_frame;
@@ -245,6 +260,7 @@ static int update_context_from_user(AVCodecContext *dst, AVCodecContext *src)
 
     dst->opaque   = src->opaque;
     dst->debug    = src->debug;
+    dst->debug_mv = src->debug_mv;
 
     dst->slice_flags = src->slice_flags;
     dst->flags2      = src->flags2;
@@ -253,16 +269,14 @@ static int update_context_from_user(AVCodecContext *dst, AVCodecContext *src)
 
     dst->frame_number     = src->frame_number;
     dst->reordered_opaque = src->reordered_opaque;
+    dst->thread_safe_callbacks = src->thread_safe_callbacks;
 
     if (src->slice_count && src->slice_offset) {
         if (dst->slice_count < src->slice_count) {
-            int *tmp = av_realloc(dst->slice_offset, src->slice_count *
-                                  sizeof(*dst->slice_offset));
-            if (!tmp) {
-                av_free(dst->slice_offset);
-                return AVERROR(ENOMEM);
-            }
-            dst->slice_offset = tmp;
+            int err = av_reallocp_array(&dst->slice_offset, src->slice_count,
+                                        sizeof(*dst->slice_offset));
+            if (err < 0)
+                return err;
         }
         memcpy(dst->slice_offset, src->slice_offset,
                src->slice_count * sizeof(*dst->slice_offset));
@@ -283,7 +297,8 @@ static void release_delayed_buffers(PerThreadContext *p)
         pthread_mutex_lock(&fctx->buffer_mutex);
 
         // fix extended data in case the caller screwed it up
-        av_assert0(p->avctx->codec_type == AVMEDIA_TYPE_VIDEO);
+        av_assert0(p->avctx->codec_type == AVMEDIA_TYPE_VIDEO ||
+                   p->avctx->codec_type == AVMEDIA_TYPE_AUDIO);
         f = &p->released_buffers[--p->num_released_buffers];
         f->extended_data = f->data;
         av_frame_unref(f);
@@ -334,15 +349,27 @@ static int submit_packet(PerThreadContext *p, AVPacket *avpkt)
      * and it calls back to the client here.
      */
 
-    if (!p->avctx->thread_safe_callbacks &&
-        p->avctx->get_buffer2 != avcodec_default_get_buffer2) {
+    if (!p->avctx->thread_safe_callbacks && (
+         p->avctx->get_format != avcodec_default_get_format ||
+         p->avctx->get_buffer2 != avcodec_default_get_buffer2)) {
         while (p->state != STATE_SETUP_FINISHED && p->state != STATE_INPUT_READY) {
+            int call_done = 1;
             pthread_mutex_lock(&p->progress_mutex);
             while (p->state == STATE_SETTING_UP)
                 pthread_cond_wait(&p->progress_cond, &p->progress_mutex);
 
-            if (p->state == STATE_GET_BUFFER) {
+            switch (p->state) {
+            case STATE_GET_BUFFER:
                 p->result = ff_get_buffer(p->avctx, p->requested_frame, p->requested_flags);
+                break;
+            case STATE_GET_FORMAT:
+                p->result_format = ff_get_format(p->avctx, p->available_formats);
+                break;
+            default:
+                call_done = 0;
+                break;
+            }
+            if (call_done) {
                 p->state  = STATE_SETTING_UP;
                 pthread_cond_signal(&p->progress_cond);
             }
@@ -379,9 +406,10 @@ int ff_thread_decode_frame(AVCodecContext *avctx,
      * If we're still receiving the initial packets, don't return a frame.
      */
 
-    if (fctx->delaying) {
-        if (fctx->next_decoding >= (avctx->thread_count-1)) fctx->delaying = 0;
+    if (fctx->next_decoding > (avctx->thread_count-1-(avctx->codec_id == AV_CODEC_ID_FFV1)))
+        fctx->delaying = 0;
 
+    if (fctx->delaying) {
         *got_picture_ptr=0;
         if (avpkt->size)
             return avpkt->size;
@@ -408,6 +436,9 @@ int ff_thread_decode_frame(AVCodecContext *avctx,
         *got_picture_ptr = p->got_frame;
         picture->pkt_dts = p->avpkt.dts;
 
+        if (p->result < 0)
+            err = p->result;
+
         /*
          * A later call with avkpt->size == 0 may loop over all threads,
          * including this one, searching for a frame to return before being
@@ -425,6 +456,14 @@ int ff_thread_decode_frame(AVCodecContext *avctx,
 
     fctx->next_finished = finished;
 
+    /*
+     * When no frame was found while flushing, but an error occurred in
+     * any thread, return it instead of 0.
+     * Otherwise the error can get lost.
+     */
+    if (!avpkt->size && !*got_picture_ptr)
+        return err;
+
     /* return the size of the consumed packet if no error occurred */
     return (p->result >= 0) ? avpkt->size : p->result;
 }
@@ -432,7 +471,7 @@ int ff_thread_decode_frame(AVCodecContext *avctx,
 void ff_thread_report_progress(ThreadFrame *f, int n, int field)
 {
     PerThreadContext *p;
-    int *progress = f->progress ? (int*)f->progress->data : NULL;
+    volatile int *progress = f->progress ? (int*)f->progress->data : NULL;
 
     if (!progress || progress[field] >= n) return;
 
@@ -450,7 +489,7 @@ void ff_thread_report_progress(ThreadFrame *f, int n, int field)
 void ff_thread_await_progress(ThreadFrame *f, int n, int field)
 {
     PerThreadContext *p;
-    int *progress = f->progress ? (int*)f->progress->data : NULL;
+    volatile int *progress = f->progress ? (int*)f->progress->data : NULL;
 
     if (!progress || progress[field] >= n) return;
 
@@ -470,6 +509,10 @@ void ff_thread_finish_setup(AVCodecContext *avctx) {
 
     if (!(avctx->active_thread_type&FF_THREAD_FRAME)) return;
 
+    if(p->state == STATE_SETUP_FINISHED){
+        av_log(avctx, AV_LOG_WARNING, "Multiple ff_thread_finish_setup() calls\n");
+    }
+
     pthread_mutex_lock(&p->progress_mutex);
     p->state = STATE_SETUP_FINISHED;
     pthread_cond_broadcast(&p->progress_cond);
@@ -490,6 +533,7 @@ static void park_frame_worker_threads(FrameThreadContext *fctx, int thread_count
                 pthread_cond_wait(&p->output_cond, &p->progress_mutex);
             pthread_mutex_unlock(&p->progress_mutex);
         }
+        p->got_frame = 0;
     }
 }
 
@@ -502,25 +546,27 @@ void ff_frame_thread_free(AVCodecContext *avctx, int thread_count)
     park_frame_worker_threads(fctx, thread_count);
 
     if (fctx->prev_thread && fctx->prev_thread != fctx->threads)
-        update_context_from_thread(fctx->threads->avctx, fctx->prev_thread->avctx, 0);
-
-    fctx->die = 1;
+        if (update_context_from_thread(fctx->threads->avctx, fctx->prev_thread->avctx, 0) < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Final thread update failed\n");
+            fctx->prev_thread->avctx->internal->is_copy = fctx->threads->avctx->internal->is_copy;
+            fctx->threads->avctx->internal->is_copy = 1;
+        }
 
     for (i = 0; i < thread_count; i++) {
         PerThreadContext *p = &fctx->threads[i];
 
         pthread_mutex_lock(&p->mutex);
+        p->die = 1;
         pthread_cond_signal(&p->input_cond);
         pthread_mutex_unlock(&p->mutex);
 
         if (p->thread_init)
             pthread_join(p->thread, NULL);
+        p->thread_init=0;
 
-        if (codec->close)
+        if (codec->close && p->avctx)
             codec->close(p->avctx);
 
-        avctx->codec = NULL;
-
         release_delayed_buffers(p);
         av_frame_free(&p->frame);
     }
@@ -536,18 +582,23 @@ void ff_frame_thread_free(AVCodecContext *avctx, int thread_count)
         av_packet_unref(&p->avpkt);
         av_freep(&p->released_buffers);
 
-        if (i) {
+        if (i && p->avctx) {
             av_freep(&p->avctx->priv_data);
             av_freep(&p->avctx->slice_offset);
         }
 
-        av_freep(&p->avctx->internal);
+        if (p->avctx)
+            av_freep(&p->avctx->internal);
         av_freep(&p->avctx);
     }
 
     av_freep(&fctx->threads);
     pthread_mutex_destroy(&fctx->buffer_mutex);
     av_freep(&avctx->internal->thread_ctx);
+
+    if (avctx->priv_data && avctx->codec && avctx->codec->priv_class)
+        av_opt_free(avctx->priv_data);
+    avctx->codec = NULL;
 }
 
 int ff_frame_thread_init(AVCodecContext *avctx)
@@ -564,7 +615,8 @@ int ff_frame_thread_init(AVCodecContext *avctx)
 
     if (!thread_count) {
         int nb_cpus = av_cpu_count();
-        av_log(avctx, AV_LOG_DEBUG, "detected %d logical cores\n", nb_cpus);
+        if ((avctx->debug & (FF_DEBUG_VIS_QP | FF_DEBUG_VIS_MB_TYPE)) || avctx->debug_mv)
+            nb_cpus = 1;
         // use number of cores + 1 as thread count if there is more than one
         if (nb_cpus > 1)
             thread_count = avctx->thread_count = FFMIN(nb_cpus + 1, MAX_AUTO_THREADS);
@@ -581,7 +633,7 @@ int ff_frame_thread_init(AVCodecContext *avctx)
     if (!fctx)
         return AVERROR(ENOMEM);
 
-    fctx->threads = av_mallocz(sizeof(PerThreadContext) * thread_count);
+    fctx->threads = av_mallocz_array(thread_count, sizeof(PerThreadContext));
     if (!fctx->threads) {
         av_freep(&avctx->internal->thread_ctx);
         return AVERROR(ENOMEM);
@@ -619,6 +671,7 @@ int ff_frame_thread_init(AVCodecContext *avctx)
 
         copy->internal = av_malloc(sizeof(AVCodecInternal));
         if (!copy->internal) {
+            copy->priv_data = NULL;
             err = AVERROR(ENOMEM);
             goto error;
         }
@@ -648,8 +701,10 @@ int ff_frame_thread_init(AVCodecContext *avctx)
 
         if (err) goto error;
 
-        if (!pthread_create(&p->thread, NULL, frame_worker_thread, p))
-            p->thread_init = 1;
+        err = AVERROR(pthread_create(&p->thread, NULL, frame_worker_thread, p));
+        p->thread_init= !err;
+        if(!p->thread_init)
+            goto error;
     }
 
     return 0;
@@ -689,18 +744,30 @@ void ff_thread_flush(AVCodecContext *avctx)
     }
 }
 
-int ff_thread_get_buffer(AVCodecContext *avctx, ThreadFrame *f, int flags)
+int ff_thread_can_start_frame(AVCodecContext *avctx)
+{
+    /* Returns 1 when frame threading is inactive or this context is still in STATE_SETTING_UP. */
+    PerThreadContext *worker = avctx->internal->thread_ctx;
+    if (!(avctx->active_thread_type & FF_THREAD_FRAME) || worker->state == STATE_SETTING_UP)
+        return 1;
+    /* Past setup: 1 only without update_thread_context and with THREAD_SAFE_CALLBACKS(). */
+    return !avctx->codec->update_thread_context && THREAD_SAFE_CALLBACKS(avctx);
+}
+
+static int thread_get_buffer_internal(AVCodecContext *avctx, ThreadFrame *f, int flags)
 {
     PerThreadContext *p = avctx->internal->thread_ctx;
     int err;
 
     f->owner = avctx;
 
+    ff_init_buffer_info(avctx, f->f);
+
     if (!(avctx->active_thread_type & FF_THREAD_FRAME))
         return ff_get_buffer(avctx, f->f, flags);
 
     if (p->state != STATE_SETTING_UP &&
-        (avctx->codec->update_thread_context || !avctx->thread_safe_callbacks)) {
+        (avctx->codec->update_thread_context || !THREAD_SAFE_CALLBACKS(avctx))) {
         av_log(avctx, AV_LOG_ERROR, "get_buffer() cannot be called after ff_thread_finish_setup()\n");
         return -1;
     }
@@ -721,11 +788,11 @@ int ff_thread_get_buffer(AVCodecContext *avctx, ThreadFrame *f, int flags)
         avctx->get_buffer2 == avcodec_default_get_buffer2) {
         err = ff_get_buffer(avctx, f->f, flags);
     } else {
+        pthread_mutex_lock(&p->progress_mutex);
         p->requested_frame = f->f;
         p->requested_flags = flags;
         p->state = STATE_GET_BUFFER;
-        pthread_mutex_lock(&p->progress_mutex);
-        pthread_cond_signal(&p->progress_cond);
+        pthread_cond_broadcast(&p->progress_cond);
 
         while (p->state != STATE_SETTING_UP)
             pthread_cond_wait(&p->progress_cond, &p->progress_mutex);
@@ -735,9 +802,8 @@ int ff_thread_get_buffer(AVCodecContext *avctx, ThreadFrame *f, int flags)
         pthread_mutex_unlock(&p->progress_mutex);
 
     }
-    if (!avctx->thread_safe_callbacks && !avctx->codec->update_thread_context)
+    if (!THREAD_SAFE_CALLBACKS(avctx) && !avctx->codec->update_thread_context)
         ff_thread_finish_setup(avctx);
-
     if (err)
         av_buffer_unref(&f->progress);
 
@@ -746,6 +812,40 @@ int ff_thread_get_buffer(AVCodecContext *avctx, ThreadFrame *f, int flags)
     return err;
 }
 
+enum AVPixelFormat ff_thread_get_format(AVCodecContext *avctx, const enum AVPixelFormat *fmt)
+{
+    PerThreadContext *worker = avctx->internal->thread_ctx;
+    enum AVPixelFormat chosen;
+    /* No frame threading, thread-safe callbacks, or default get_format: call ff_get_format() directly. */
+    if (!(avctx->active_thread_type & FF_THREAD_FRAME) || avctx->thread_safe_callbacks ||
+        avctx->get_format == avcodec_default_get_format)
+        return ff_get_format(avctx, fmt);
+    if (worker->state != STATE_SETTING_UP) {
+        av_log(avctx, AV_LOG_ERROR, "get_format() cannot be called after ff_thread_finish_setup()\n");
+        return -1;
+    }
+
+    /* Publish the request, then sleep until the state is STATE_SETTING_UP again. */
+    pthread_mutex_lock(&worker->progress_mutex);
+    worker->available_formats = fmt;
+    worker->state = STATE_GET_FORMAT;
+    pthread_cond_broadcast(&worker->progress_cond);
+    while (worker->state != STATE_SETTING_UP)
+        pthread_cond_wait(&worker->progress_cond, &worker->progress_mutex);
+    chosen = worker->result_format; /* read while still holding the lock */
+    pthread_mutex_unlock(&worker->progress_mutex);
+
+    return chosen;
+}
+
+int ff_thread_get_buffer(AVCodecContext *avctx, ThreadFrame *f, int flags)
+{
+    const int status = thread_get_buffer_internal(avctx, f, flags); /* forwarded unchanged */
+    if (status < 0) /* log here so every negative result from the helper is reported */
+        av_log(avctx, AV_LOG_ERROR, "thread_get_buffer() failed\n");
+    return status;
+}
+
 void ff_thread_release_buffer(AVCodecContext *avctx, ThreadFrame *f)
 {
     PerThreadContext *p = avctx->internal->thread_ctx;
diff --git a/libavcodec/pthread_internal.h b/libavcodec/pthread_internal.h
index fca9b10..d2115cb 100644
--- a/libavcodec/pthread_internal.h
+++ b/libavcodec/pthread_internal.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/pthread_slice.c b/libavcodec/pthread_slice.c
index d7c73f0..96a7643 100644
--- a/libavcodec/pthread_slice.c
+++ b/libavcodec/pthread_slice.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,20 +24,16 @@
 
 #include "config.h"
 
-#if HAVE_PTHREADS
-#include <pthread.h>
-#elif HAVE_W32THREADS
-#include "compat/w32pthreads.h"
-#endif
-
 #include "avcodec.h"
 #include "internal.h"
 #include "pthread_internal.h"
 #include "thread.h"
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/cpu.h"
 #include "libavutil/mem.h"
+#include "libavutil/thread.h"
 
 typedef int (action_func)(AVCodecContext *c, void *arg);
 typedef int (action_func2)(AVCodecContext *c, void *arg, int jobnr, int threadnr);
@@ -48,7 +44,6 @@ typedef struct SliceThreadContext {
     action_func2 *func2;
     void *args;
     int *rets;
-    int rets_count;
     int job_count;
     int job_size;
 
@@ -58,6 +53,12 @@ typedef struct SliceThreadContext {
     unsigned current_execute;
     int current_job;
     int done;
+
+    int *entries;
+    int entries_count;
+    int thread_count;
+    pthread_cond_t *progress_cond;
+    pthread_mutex_t *progress_mutex;
 } SliceThreadContext;
 
 static void* attribute_align_arg worker(void *v)
@@ -72,6 +73,7 @@ static void* attribute_align_arg worker(void *v)
     pthread_mutex_lock(&c->current_job_lock);
     self_id = c->current_job++;
     for (;;){
+        int ret;
         while (our_job >= c->job_count) {
             if (c->current_job == thread_count + c->job_count)
                 pthread_cond_signal(&c->last_job_cond);
@@ -88,8 +90,10 @@ static void* attribute_align_arg worker(void *v)
         }
         pthread_mutex_unlock(&c->current_job_lock);
 
-        c->rets[our_job%c->rets_count] = c->func ? c->func(avctx, (char*)c->args + our_job*c->job_size):
-                                                   c->func2(avctx, c->args, our_job, self_id);
+        ret = c->func ? c->func(avctx, (char*)c->args + our_job*c->job_size):
+                                c->func2(avctx, c->args, our_job, self_id);
+        if (c->rets)
+            c->rets[our_job%c->job_count] = ret;
 
         pthread_mutex_lock(&c->current_job_lock);
         our_job = c->current_job++;
@@ -104,15 +108,27 @@ void ff_slice_thread_free(AVCodecContext *avctx)
     pthread_mutex_lock(&c->current_job_lock);
     c->done = 1;
     pthread_cond_broadcast(&c->current_job_cond);
+    for (i = 0; i < c->thread_count; i++)
+        pthread_cond_broadcast(&c->progress_cond[i]);
     pthread_mutex_unlock(&c->current_job_lock);
 
     for (i=0; i<avctx->thread_count; i++)
          pthread_join(c->workers[i], NULL);
 
+    for (i = 0; i < c->thread_count; i++) {
+        pthread_mutex_destroy(&c->progress_mutex[i]);
+        pthread_cond_destroy(&c->progress_cond[i]);
+    }
+
     pthread_mutex_destroy(&c->current_job_lock);
     pthread_cond_destroy(&c->current_job_cond);
     pthread_cond_destroy(&c->last_job_cond);
-    av_free(c->workers);
+
+    av_freep(&c->entries);
+    av_freep(&c->progress_mutex);
+    av_freep(&c->progress_cond);
+
+    av_freep(&c->workers);
     av_freep(&avctx->internal->thread_ctx);
 }
 
@@ -126,7 +142,6 @@ static av_always_inline void thread_park_workers(SliceThreadContext *c, int thre
 static int thread_execute(AVCodecContext *avctx, action_func* func, void *arg, int *ret, int job_count, int job_size)
 {
     SliceThreadContext *c = avctx->internal->thread_ctx;
-    int dummy_ret;
 
     if (!(avctx->active_thread_type&FF_THREAD_SLICE) || avctx->thread_count <= 1)
         return avcodec_default_execute(avctx, func, arg, ret, job_count, job_size);
@@ -143,10 +158,8 @@ static int thread_execute(AVCodecContext *avctx, action_func* func, void *arg, i
     c->func = func;
     if (ret) {
         c->rets = ret;
-        c->rets_count = job_count;
     } else {
-        c->rets = &dummy_ret;
-        c->rets_count = 1;
+        c->rets = NULL;
     }
     c->current_execute++;
     pthread_cond_broadcast(&c->current_job_cond);
@@ -173,9 +186,16 @@ int ff_slice_thread_init(AVCodecContext *avctx)
     w32thread_init();
 #endif
 
+    // We cannot do this in the encoder init as the threads are created before it runs
+    if (av_codec_is_encoder(avctx->codec) &&
+        avctx->codec_id == AV_CODEC_ID_MPEG1VIDEO &&
+        avctx->height > 2800)
+        thread_count = avctx->thread_count = 1;
+
     if (!thread_count) {
         int nb_cpus = av_cpu_count();
-        av_log(avctx, AV_LOG_DEBUG, "detected %d logical cores\n", nb_cpus);
+        if  (avctx->height)
+            nb_cpus = FFMIN(nb_cpus, (avctx->height+15)/16);
         // use number of cores + 1 as thread count if there is more than one
         if (nb_cpus > 1)
             thread_count = avctx->thread_count = FFMIN(nb_cpus + 1, MAX_AUTO_THREADS);
@@ -192,7 +212,7 @@ int ff_slice_thread_init(AVCodecContext *avctx)
     if (!c)
         return -1;
 
-    c->workers = av_mallocz(sizeof(pthread_t)*thread_count);
+    c->workers = av_mallocz_array(thread_count, sizeof(pthread_t));
     if (!c->workers) {
         av_free(c);
         return -1;
@@ -222,3 +242,73 @@ int ff_slice_thread_init(AVCodecContext *avctx)
     avctx->execute2 = thread_execute2;
     return 0;
 }
+
+void ff_thread_report_progress2(AVCodecContext *avctx, int field, int thread, int n)
+{
+    SliceThreadContext *ctx = avctx->internal->thread_ctx;
+    int *counter            = &ctx->entries[field];
+    /* Add n to the entry for `field` under this thread's mutex, then signal its condition. */
+    pthread_mutex_lock(&ctx->progress_mutex[thread]);
+    *counter += n;
+    pthread_cond_signal(&ctx->progress_cond[thread]);
+    pthread_mutex_unlock(&ctx->progress_mutex[thread]);
+}
+
+void ff_thread_await_progress2(AVCodecContext *avctx, int field, int thread, int shift)
+{
+    SliceThreadContext *ctx = avctx->internal->thread_ctx;
+    int *counters           = ctx->entries;
+    int prev;
+
+    if (counters == NULL || field == 0)
+        return;
+    /* Wait on the preceding thread's mutex/cond; thread 0 wraps around to the last thread. */
+    prev = (thread == 0) ? ctx->thread_count - 1 : thread - 1;
+    pthread_mutex_lock(&ctx->progress_mutex[prev]);
+    while (counters[field - 1] - counters[field] < shift) /* previous entry not far enough ahead */
+        pthread_cond_wait(&ctx->progress_cond[prev], &ctx->progress_mutex[prev]);
+    pthread_mutex_unlock(&ctx->progress_mutex[prev]);
+}
+
+int ff_alloc_entries(AVCodecContext *avctx, int count)
+{
+    int i;
+    /* Allocate `count` zeroed progress entries (slice threading only); 0 or AVERROR(ENOMEM). */
+    if (avctx->active_thread_type & FF_THREAD_SLICE)  {
+        SliceThreadContext *p = avctx->internal->thread_ctx;
+        const int fresh_sync  = !p->progress_mutex; /* mutex/cond arrays not created yet */
+        if (p->entries) {
+            av_assert0(p->thread_count == avctx->thread_count);
+            av_freep(&p->entries);
+        }
+        p->thread_count  = avctx->thread_count;
+        p->entries       = av_mallocz_array(count, sizeof(int));
+        if (fresh_sync) {
+            p->progress_mutex = av_malloc_array(p->thread_count, sizeof(pthread_mutex_t));
+            p->progress_cond  = av_malloc_array(p->thread_count, sizeof(pthread_cond_t));
+        }
+        if (!p->entries || !p->progress_mutex || !p->progress_cond) {
+            av_freep(&p->entries);
+            if (fresh_sync) { /* never initialized: release them and hide them from the free path */
+                av_freep(&p->progress_mutex);
+                av_freep(&p->progress_cond);
+                p->thread_count = 0; /* ff_slice_thread_free() indexes the arrays up to this */
+            }
+            return AVERROR(ENOMEM);
+        }
+        p->entries_count  = count;
+        /* Initialize only newly allocated objects; re-initializing a live mutex/cond is undefined. */
+        for (i = 0; fresh_sync && i < p->thread_count; i++) {
+            pthread_mutex_init(&p->progress_mutex[i], NULL);
+            pthread_cond_init(&p->progress_cond[i], NULL);
+        }
+    }
+
+    return 0;
+}
+
+void ff_reset_entries(AVCodecContext *avctx)
+{
+    SliceThreadContext *ctx = avctx->internal->thread_ctx;
+    memset(ctx->entries, 0, sizeof(*ctx->entries) * ctx->entries_count); /* zero all entries */
+}
diff --git a/libavcodec/ptx.c b/libavcodec/ptx.c
index 312850c..42147f4 100644
--- a/libavcodec/ptx.c
+++ b/libavcodec/ptx.c
@@ -2,20 +2,20 @@
  * V.Flash PTX (.ptx) image decoder
  * Copyright (c) 2007 Ivo van Poorten
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -46,7 +46,7 @@ static int ptx_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return AVERROR_PATCHWELCOME;
     }
 
-    avctx->pix_fmt = AV_PIX_FMT_RGB555;
+    avctx->pix_fmt = AV_PIX_FMT_BGR555LE;
 
     if (buf_end - buf < offset)
         return AVERROR_INVALIDDATA;
@@ -58,10 +58,8 @@ static int ptx_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     if ((ret = ff_set_dimensions(avctx, w, h)) < 0)
         return ret;
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
 
     p->pict_type = AV_PICTURE_TYPE_I;
 
@@ -69,13 +67,7 @@ static int ptx_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     stride = p->linesize[0];
 
     for (y = 0; y < h && buf_end - buf >= w * bytes_per_pixel; y++) {
-#if HAVE_BIGENDIAN
-        unsigned int x;
-        for (x=0; x<w*bytes_per_pixel; x+=bytes_per_pixel)
-            AV_WN16(ptr+x, AV_RL16(buf+x));
-#else
         memcpy(ptr, buf, w*bytes_per_pixel);
-#endif
         ptr += stride;
         buf += w*bytes_per_pixel;
     }
diff --git a/libavcodec/put_bits.h b/libavcodec/put_bits.h
index 17666fa..68ed391 100644
--- a/libavcodec/put_bits.h
+++ b/libavcodec/put_bits.h
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,9 +28,9 @@
 
 #include <stdint.h>
 #include <stddef.h>
-#include <assert.h>
 
 #include "libavutil/intreadwrite.h"
+#include "libavutil/avassert.h"
 
 typedef struct PutBitContext {
     uint32_t bit_buf;
@@ -62,6 +62,24 @@ static inline void init_put_bits(PutBitContext *s, uint8_t *buffer,
 }
 
 /**
+ * Rebase the bit writer onto a reallocated buffer.
+ *
+ * @param buffer the buffer where to put bits
+ * @param buffer_size the size in bytes of buffer,
+ *                    must be larger than the previous size
+ */
+static inline void rebase_put_bits(PutBitContext *s, uint8_t *buffer,
+                                   int buffer_size)
+{
+    const ptrdiff_t written = s->buf_ptr - s->buf; /* write offset to carry over */
+    av_assert0(s->size_in_bits < 8 * buffer_size);
+    s->buf          = buffer;
+    s->buf_ptr      = buffer + written;
+    s->buf_end      = buffer + buffer_size;
+    s->size_in_bits = 8 * buffer_size;
+}
+
+/**
  * @return the total number of bits written to the bitstream.
  */
 static inline int put_bits_count(PutBitContext *s)
@@ -87,7 +105,7 @@ static inline void flush_put_bits(PutBitContext *s)
         s->bit_buf <<= s->bit_left;
 #endif
     while (s->bit_left < 32) {
-        /* XXX: should test end of buffer */
+        av_assert0(s->buf_ptr < s->buf_end);
 #ifdef BITSTREAM_WRITER_LE
         *s->buf_ptr++ = s->bit_buf;
         s->bit_buf  >>= 8;
@@ -136,7 +154,7 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value)
     unsigned int bit_buf;
     int bit_left;
 
-    assert(n <= 31 && value < (1U << n));
+    av_assert2(n <= 31 && value < (1U << n));
 
     bit_buf  = s->bit_buf;
     bit_left = s->bit_left;
@@ -145,9 +163,14 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value)
 #ifdef BITSTREAM_WRITER_LE
     bit_buf |= value << (32 - bit_left);
     if (n >= bit_left) {
-        AV_WL32(s->buf_ptr, bit_buf);
-        s->buf_ptr += 4;
-        bit_buf     = (bit_left == 32) ? 0 : value >> bit_left;
+        if (3 < s->buf_end - s->buf_ptr) {
+            AV_WL32(s->buf_ptr, bit_buf);
+            s->buf_ptr += 4;
+        } else {
+            av_log(NULL, AV_LOG_ERROR, "Internal error, put_bits buffer too small\n");
+            av_assert2(0);
+        }
+        bit_buf     = value >> bit_left;
         bit_left   += 32;
     }
     bit_left -= n;
@@ -158,8 +181,13 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value)
     } else {
         bit_buf   <<= bit_left;
         bit_buf    |= value >> (n - bit_left);
-        AV_WB32(s->buf_ptr, bit_buf);
-        s->buf_ptr += 4;
+        if (3 < s->buf_end - s->buf_ptr) {
+            AV_WB32(s->buf_ptr, bit_buf);
+            s->buf_ptr += 4;
+        } else {
+            av_log(NULL, AV_LOG_ERROR, "Internal error, put_bits buffer too small\n");
+            av_assert2(0);
+        }
         bit_left   += 32 - n;
         bit_buf     = value;
     }
@@ -171,9 +199,9 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value)
 
 static inline void put_sbits(PutBitContext *pb, int n, int32_t value)
 {
-    assert(n >= 0 && n <= 31);
+    av_assert2(n >= 0 && n <= 31);
 
-    put_bits(pb, n, value & ((1 << n) - 1));
+    put_bits(pb, n, av_mod_uintp2(value, n));
 }
 
 /**
@@ -207,8 +235,9 @@ static inline uint8_t *put_bits_ptr(PutBitContext *s)
  */
 static inline void skip_put_bytes(PutBitContext *s, int n)
 {
-    assert((put_bits_count(s) & 7) == 0);
-    assert(s->bit_left == 32);
+    av_assert2((put_bits_count(s) & 7) == 0);
+    av_assert2(s->bit_left == 32);
+    av_assert0(n <= s->buf_end - s->buf_ptr);
     s->buf_ptr += n;
 }
 
@@ -231,7 +260,9 @@ static inline void skip_put_bits(PutBitContext *s, int n)
  */
 static inline void set_put_bits_buffer_size(PutBitContext *s, int size)
 {
+    av_assert0(size <= INT_MAX/8 - 32);
     s->buf_end = s->buf + size;
+    s->size_in_bits = 8*size;
 }
 
 #endif /* AVCODEC_PUT_BITS_H */
diff --git a/libavcodec/qcelpdata.h b/libavcodec/qcelpdata.h
index 319833e..931c990 100644
--- a/libavcodec/qcelpdata.h
+++ b/libavcodec/qcelpdata.h
@@ -2,20 +2,20 @@
  * QCELP decoder
  * Copyright (c) 2007 Reynaldo H. Verdejo Pinochet
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,7 +26,7 @@
  * @file
  * Data tables for the QCELP decoder
  * @author Reynaldo H. Verdejo Pinochet
- * @remark Libav merging spearheaded by Kenan Gillet
+ * @remark FFmpeg merging spearheaded by Kenan Gillet
  * @remark Development mentored by Benjamin Larson
  */
 
@@ -66,7 +66,7 @@ typedef struct QCELPFrame {
 } QCELPFrame;
 
 /**
- * pre-calculated table for hammsinc function
+ * Pre-calculated table for hammsinc function.
  * Only half of the table is needed because of symmetry.
  *
  * TIA/EIA/IS-733 2.4.5.2-2/3
@@ -82,7 +82,7 @@ typedef struct QCELPBitmap {
 #define QCELP_OF(variable, bit, len) {offsetof(QCELPFrame, variable), bit, len}
 
 /**
- * bitmap unpacking tables for RATE_FULL
+ * Bitmap unpacking tables for RATE_FULL
  *
  * TIA/EIA/IS-733 Table 2.4.7.1-1
  */
@@ -169,7 +169,7 @@ static const QCELPBitmap qcelp_rate_full_bitmap[] = {
 };
 
 /**
- * bitmap unpacking tables for RATE_HALF
+ * Bitmap unpacking tables for RATE_HALF
  *
  * TIA/EIA/IS-733 Table 2.4.7.2-1
  */
@@ -211,7 +211,7 @@ static const QCELPBitmap qcelp_rate_half_bitmap[] = {
 };
 
 /**
- * bitmap unpacking tables for RATE_QUARTER
+ * Bitmap unpacking tables for RATE_QUARTER
  *
  * TIA/EIA/IS-733 Table 2.4.7.3-1
  */
@@ -232,7 +232,7 @@ static const QCELPBitmap qcelp_rate_quarter_bitmap[] = {
 };
 
 /**
- * bitmap unpacking tables for RATE_OCTAVE
+ * Bitmap unpacking tables for RATE_OCTAVE
  *
  * trick: CBSEED is written into QCELPContext.cbsign[15],
  * which is not used for RATE_OCTAVE.
@@ -257,12 +257,12 @@ static const QCELPBitmap qcelp_rate_octave_bitmap[] = {
     QCELP_OF(lspv   [8], 0, 1), //  8
     QCELP_OF(cbsign[15], 0, 1), //  7
     QCELP_OF(lspv   [9], 0, 1), //  6
-    QCELP_OF(cbgain [0], 0, 2), //  7
+    QCELP_OF(cbgain [0], 0, 2), //  5
     QCELP_OF(reserved,   0, 4)  //  3
 };
 
 /**
- * position of the bitmapping data for each packet type in
+ * Bitmapping data position for each packet type in
  * the QCELPContext
  */
 static const QCELPBitmap * const qcelp_unpacking_bitmaps_per_rate[5] = {
@@ -420,12 +420,12 @@ static const qcelp_vector * const qcelp_lspvq[5] = {
 };
 
 /**
- * the final gain scalefactor before clipping into a usable output float
+ * The final gain scalefactor before clipping into a usable output float
  */
 #define QCELP_SCALE 8192.
 
 /**
- * table for computing Ga (decoded linear codebook gain magnitude)
+ * Table for computing Ga (decoded linear codebook gain magnitude)
  *
  * @note The table could fit in int16_t in x*8 form, but it seems
  *       to be slower on x86
@@ -452,7 +452,7 @@ static const float qcelp_g12ga[61] = {
  1000.000/QCELP_SCALE};
 
 /**
- * circular codebook for rate 1 frames in x*100 form
+ * Circular codebook for rate 1 frames in x*100 form
  *
  * TIA/EIA/IS-733 2.4.6.1-2
  */
@@ -477,7 +477,7 @@ static const int16_t qcelp_rate_full_codebook[128] = {
 #define QCELP_RATE_FULL_CODEBOOK_RATIO .01
 
 /**
- * circular codebook for rate 1/2 frames in x*2 form
+ * Circular codebook for rate 1/2 frames in x*2 form
  *
  * TIA/EIA/IS-733 2.4.6.1-1
  */
@@ -511,7 +511,7 @@ static const int8_t qcelp_rate_half_codebook[128] = {
 #define QCELP_SQRT1887 1.373681186
 
 /**
- * table for impulse response of BPF used to filter
+ * Table for impulse response of BPF used to filter
  * the white excitation for bitrate 1/4 synthesis
  *
  * Only half the tables are needed because of symmetry.
@@ -526,14 +526,14 @@ static const double qcelp_rnd_fir_coefs[11] = {
 
 /**
  * This spread factor is used, for bitrate 1/8 and I_F_Q,
- * to force the LSP frequencies to be at least 80 Hz apart.
+ * to force LSP frequencies to be at least 80 Hz apart.
  *
  * TIA/EIA/IS-733 2.4.3.3.2
  */
 #define QCELP_LSP_SPREAD_FACTOR 0.02
 
 /**
- * predictor coefficient for the conversion of LSP codes
+ * Predictor coefficient for the conversion of LSP codes
  * to LSP frequencies for 1/8 and I_F_Q
  *
  * TIA/EIA/IS-733 2.4.3.2.7-2
@@ -541,7 +541,7 @@ static const double qcelp_rnd_fir_coefs[11] = {
 #define QCELP_LSP_OCTAVE_PREDICTOR 29.0/32
 
 /**
- * initial coefficient to perform bandwidth expansion on LPC
+ * Initial coefficient to perform bandwidth expansion on LPC
  *
  * @note: 0.9883 looks like an approximation of 253/256.
  *
diff --git a/libavcodec/qcelpdec.c b/libavcodec/qcelpdec.c
index e9e7347..b4afda2 100644
--- a/libavcodec/qcelpdec.c
+++ b/libavcodec/qcelpdec.c
@@ -2,20 +2,20 @@
  * QCELP decoder
  * Copyright (c) 2007 Reynaldo H. Verdejo Pinochet
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,12 +23,13 @@
  * @file
  * QCELP decoder
  * @author Reynaldo H. Verdejo Pinochet
- * @remark Libav merging spearheaded by Kenan Gillet
+ * @remark FFmpeg merging spearheaded by Kenan Gillet
  * @remark Development mentored by Benjamin Larson
  */
 
 #include <stddef.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/channel_layout.h"
 #include "libavutil/float_dsp.h"
 #include "avcodec.h"
@@ -40,9 +41,6 @@
 #include "acelp_vectors.h"
 #include "lsp.h"
 
-#undef NDEBUG
-#include <assert.h>
-
 typedef enum {
     I_F_Q = -1,    /**< insufficient frame quality */
     SILENCE,
@@ -135,7 +133,7 @@ static int decode_lspf(QCELPContext *q, float *lspf)
         } else {
             erasure_coeff = QCELP_LSP_OCTAVE_PREDICTOR;
 
-            assert(q->bitrate == I_F_Q);
+            av_assert2(q->bitrate == I_F_Q);
 
             if (q->erasure_count > 1)
                 erasure_coeff *= q->erasure_count < 4 ? 0.9 : 0.7;
@@ -239,7 +237,7 @@ static void decode_gain_and_index(QCELPContext *q, float *gain)
                     av_clip((q->prev_g1[0] + q->prev_g1[1]) / 2 - 5, 0, 54);
             subframes_count = 8;
         } else {
-            assert(q->bitrate == I_F_Q);
+            av_assert2(q->bitrate == I_F_Q);
 
             g1[0] = q->prev_g1[1];
             switch (q->erasure_count) {
@@ -321,7 +319,8 @@ static void compute_svector(QCELPContext *q, const float *gain,
             tmp_gain = gain[i] * QCELP_RATE_FULL_CODEBOOK_RATIO;
             cindex   = -q->frame.cindex[i];
             for (j = 0; j < 10; j++)
-                *cdn_vector++ = tmp_gain * qcelp_rate_full_codebook[cindex++ & 127];
+                *cdn_vector++ = tmp_gain *
+                                qcelp_rate_full_codebook[cindex++ & 127];
         }
         break;
     case RATE_HALF:
@@ -329,7 +328,8 @@ static void compute_svector(QCELPContext *q, const float *gain,
             tmp_gain = gain[i] * QCELP_RATE_HALF_CODEBOOK_RATIO;
             cindex   = -q->frame.cindex[i];
             for (j = 0; j < 40; j++)
-                *cdn_vector++ = tmp_gain * qcelp_rate_half_codebook[cindex++ & 127];
+                *cdn_vector++ = tmp_gain *
+                                qcelp_rate_half_codebook[cindex++ & 127];
         }
         break;
     case RATE_QUARTER:
@@ -374,7 +374,8 @@ static void compute_svector(QCELPContext *q, const float *gain,
         for (i = 0; i < 4; i++) {
             tmp_gain = gain[i] * QCELP_RATE_FULL_CODEBOOK_RATIO;
             for (j = 0; j < 40; j++)
-                *cdn_vector++ = tmp_gain * qcelp_rate_full_codebook[cbseed++ & 127];
+                *cdn_vector++ = tmp_gain *
+                                qcelp_rate_full_codebook[cbseed++ & 127];
         }
         break;
     case SILENCE:
@@ -435,7 +436,8 @@ static const float *do_pitchfilter(float memory[303], const float v_in[160],
             for (v_len = v_in + 40; v_in < v_len; v_in++) {
                 if (pfrac[i]) { // If it is a fractional lag...
                     for (j = 0, *v_out = 0.0; j < 4; j++)
-                        *v_out += qcelp_hammsinc_table[j] * (v_lag[j - 4] + v_lag[3 - j]);
+                        *v_out += qcelp_hammsinc_table[j] *
+                                  (v_lag[j - 4] + v_lag[3 - j]);
                 } else
                     *v_out = *v_lag;
 
@@ -486,7 +488,7 @@ static void apply_pitch_filters(QCELPContext *q, float *cdn_vector)
                   else
                       max_pitch_gain = 0.0;
             } else {
-                assert(q->bitrate == SILENCE);
+                av_assert2(q->bitrate == SILENCE);
                 max_pitch_gain = 1.0;
             }
             for (i = 0; i < 4; i++)
@@ -511,7 +513,8 @@ static void apply_pitch_filters(QCELPContext *q, float *cdn_vector)
 
         apply_gain_ctrl(cdn_vector, v_synthesis_filtered, v_pre_filtered);
     } else {
-        memcpy(q->pitch_synthesis_filter_mem, cdn_vector + 17, 143 * sizeof(float));
+        memcpy(q->pitch_synthesis_filter_mem,
+               cdn_vector + 17, 143 * sizeof(float));
         memcpy(q->pitch_pre_filter_mem, cdn_vector + 17, 143 * sizeof(float));
         memset(q->pitch_gain, 0, sizeof(q->pitch_gain));
         memset(q->pitch_lag,  0, sizeof(q->pitch_lag));
@@ -630,7 +633,7 @@ static qcelp_packet_rate determine_bitrate(AVCodecContext *avctx,
         (*buf)++;
     } else if ((bitrate = buf_size2bitrate(buf_size + 1)) >= 0) {
         av_log(avctx, AV_LOG_WARNING,
-               "Bitrate byte is missing, guessing the bitrate from packet size.\n");
+               "Bitrate byte missing, guessing bitrate from packet size.\n");
     } else
         return I_F_Q;
 
@@ -695,14 +698,12 @@ static int qcelp_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = 160;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     outbuffer = (float *)frame->data[0];
 
     if ((q->bitrate = determine_bitrate(avctx, buf_size, &buf)) == I_F_Q) {
-        warn_insufficient_frame_quality(avctx, "bitrate cannot be determined.");
+        warn_insufficient_frame_quality(avctx, "Bitrate cannot be determined.");
         goto erasure;
     }
 
@@ -718,7 +719,8 @@ static int qcelp_decode_frame(AVCodecContext *avctx, void *data,
                                          qcelp_unpacking_bitmaps_lengths[q->bitrate];
         uint8_t *unpacked_data         = (uint8_t *)&q->frame;
 
-        init_get_bits(&q->gb, buf, 8 * buf_size);
+        if ((ret = init_get_bits8(&q->gb, buf, buf_size)) < 0)
+            return ret;
 
         memset(&q->frame, 0, sizeof(QCELPFrame));
 
@@ -770,7 +772,8 @@ erasure:
     formant_mem = q->formant_mem + 10;
     for (i = 0; i < 4; i++) {
         interpolate_lpc(q, quantized_lspf, lpc, i);
-        ff_celp_lp_synthesis_filterf(formant_mem, lpc, outbuffer + i * 40, 40, 10);
+        ff_celp_lp_synthesis_filterf(formant_mem, lpc,
+                                     outbuffer + i * 40, 40, 10);
         formant_mem += 40;
     }
 
diff --git a/libavcodec/qdm2.c b/libavcodec/qdm2.c
index b33e7c6..074aafd 100644
--- a/libavcodec/qdm2.c
+++ b/libavcodec/qdm2.c
@@ -5,20 +5,20 @@
  * Copyright (c) 2005 Alex Beregszaszi
  * Copyright (c) 2005 Roberto Togni
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -44,10 +44,8 @@
 #include "mpegaudiodsp.h"
 #include "mpegaudio.h"
 
-#include "qdm2data.h"
 #include "qdm2_tablegen.h"
 
-
 #define QDM2_LIST_ADD(list, size, packet) \
 do { \
       if (size > 0) { \
@@ -165,7 +163,7 @@ typedef struct QDM2Context {
     /// I/O data
     const uint8_t *compressed_data;
     int compressed_size;
-    float output_buffer[QDM2_MAX_FRAME_SIZE * 2];
+    float output_buffer[QDM2_MAX_FRAME_SIZE * MPA_MAX_CHANNELS * 2];
 
     /// Synthesis filter
     MPADSPContext mpadsp;
@@ -194,173 +192,11 @@ typedef struct QDM2Context {
     int noise_idx; ///< index for dithering noise table
 } QDM2Context;
 
-
-static VLC vlc_tab_level;
-static VLC vlc_tab_diff;
-static VLC vlc_tab_run;
-static VLC fft_level_exp_alt_vlc;
-static VLC fft_level_exp_vlc;
-static VLC fft_stereo_exp_vlc;
-static VLC fft_stereo_phase_vlc;
-static VLC vlc_tab_tone_level_idx_hi1;
-static VLC vlc_tab_tone_level_idx_mid;
-static VLC vlc_tab_tone_level_idx_hi2;
-static VLC vlc_tab_type30;
-static VLC vlc_tab_type34;
-static VLC vlc_tab_fft_tone_offset[5];
-
-static const uint16_t qdm2_vlc_offs[] = {
-    0,260,566,598,894,1166,1230,1294,1678,1950,2214,2278,2310,2570,2834,3124,3448,3838,
-};
-
 static const int switchtable[23] = {
     0, 5, 1, 5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 4
 };
 
-static av_cold void qdm2_init_vlc(void)
-{
-    static VLC_TYPE qdm2_table[3838][2];
-
-    vlc_tab_level.table           = &qdm2_table[qdm2_vlc_offs[0]];
-    vlc_tab_level.table_allocated = qdm2_vlc_offs[1] - qdm2_vlc_offs[0];
-    init_vlc(&vlc_tab_level, 8, 24,
-             vlc_tab_level_huffbits, 1, 1,
-             vlc_tab_level_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_diff.table           = &qdm2_table[qdm2_vlc_offs[1]];
-    vlc_tab_diff.table_allocated = qdm2_vlc_offs[2] - qdm2_vlc_offs[1];
-    init_vlc(&vlc_tab_diff, 8, 37,
-             vlc_tab_diff_huffbits, 1, 1,
-             vlc_tab_diff_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_run.table           = &qdm2_table[qdm2_vlc_offs[2]];
-    vlc_tab_run.table_allocated = qdm2_vlc_offs[3] - qdm2_vlc_offs[2];
-    init_vlc(&vlc_tab_run, 5, 6,
-             vlc_tab_run_huffbits, 1, 1,
-             vlc_tab_run_huffcodes, 1, 1,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    fft_level_exp_alt_vlc.table           = &qdm2_table[qdm2_vlc_offs[3]];
-    fft_level_exp_alt_vlc.table_allocated = qdm2_vlc_offs[4] -
-                                            qdm2_vlc_offs[3];
-    init_vlc(&fft_level_exp_alt_vlc, 8, 28,
-             fft_level_exp_alt_huffbits, 1, 1,
-             fft_level_exp_alt_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    fft_level_exp_vlc.table           = &qdm2_table[qdm2_vlc_offs[4]];
-    fft_level_exp_vlc.table_allocated = qdm2_vlc_offs[5] - qdm2_vlc_offs[4];
-    init_vlc(&fft_level_exp_vlc, 8, 20,
-             fft_level_exp_huffbits, 1, 1,
-             fft_level_exp_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    fft_stereo_exp_vlc.table           = &qdm2_table[qdm2_vlc_offs[5]];
-    fft_stereo_exp_vlc.table_allocated = qdm2_vlc_offs[6] -
-                                         qdm2_vlc_offs[5];
-    init_vlc(&fft_stereo_exp_vlc, 6, 7,
-             fft_stereo_exp_huffbits, 1, 1,
-             fft_stereo_exp_huffcodes, 1, 1,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    fft_stereo_phase_vlc.table           = &qdm2_table[qdm2_vlc_offs[6]];
-    fft_stereo_phase_vlc.table_allocated = qdm2_vlc_offs[7] -
-                                           qdm2_vlc_offs[6];
-    init_vlc(&fft_stereo_phase_vlc, 6, 9,
-             fft_stereo_phase_huffbits, 1, 1,
-             fft_stereo_phase_huffcodes, 1, 1,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_tone_level_idx_hi1.table =
-        &qdm2_table[qdm2_vlc_offs[7]];
-    vlc_tab_tone_level_idx_hi1.table_allocated = qdm2_vlc_offs[8] -
-                                                 qdm2_vlc_offs[7];
-    init_vlc(&vlc_tab_tone_level_idx_hi1, 8, 20,
-             vlc_tab_tone_level_idx_hi1_huffbits, 1, 1,
-             vlc_tab_tone_level_idx_hi1_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_tone_level_idx_mid.table =
-        &qdm2_table[qdm2_vlc_offs[8]];
-    vlc_tab_tone_level_idx_mid.table_allocated = qdm2_vlc_offs[9] -
-                                                 qdm2_vlc_offs[8];
-    init_vlc(&vlc_tab_tone_level_idx_mid, 8, 24,
-             vlc_tab_tone_level_idx_mid_huffbits, 1, 1,
-             vlc_tab_tone_level_idx_mid_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_tone_level_idx_hi2.table =
-        &qdm2_table[qdm2_vlc_offs[9]];
-    vlc_tab_tone_level_idx_hi2.table_allocated = qdm2_vlc_offs[10] -
-                                                 qdm2_vlc_offs[9];
-    init_vlc(&vlc_tab_tone_level_idx_hi2, 8, 24,
-             vlc_tab_tone_level_idx_hi2_huffbits, 1, 1,
-             vlc_tab_tone_level_idx_hi2_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_type30.table           = &qdm2_table[qdm2_vlc_offs[10]];
-    vlc_tab_type30.table_allocated = qdm2_vlc_offs[11] - qdm2_vlc_offs[10];
-    init_vlc(&vlc_tab_type30, 6, 9,
-             vlc_tab_type30_huffbits, 1, 1,
-             vlc_tab_type30_huffcodes, 1, 1,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_type34.table           = &qdm2_table[qdm2_vlc_offs[11]];
-    vlc_tab_type34.table_allocated = qdm2_vlc_offs[12] - qdm2_vlc_offs[11];
-    init_vlc(&vlc_tab_type34, 5, 10,
-             vlc_tab_type34_huffbits, 1, 1,
-             vlc_tab_type34_huffcodes, 1, 1,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_fft_tone_offset[0].table =
-        &qdm2_table[qdm2_vlc_offs[12]];
-    vlc_tab_fft_tone_offset[0].table_allocated = qdm2_vlc_offs[13] -
-                                                 qdm2_vlc_offs[12];
-    init_vlc(&vlc_tab_fft_tone_offset[0], 8, 23,
-             vlc_tab_fft_tone_offset_0_huffbits, 1, 1,
-             vlc_tab_fft_tone_offset_0_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_fft_tone_offset[1].table =
-        &qdm2_table[qdm2_vlc_offs[13]];
-    vlc_tab_fft_tone_offset[1].table_allocated = qdm2_vlc_offs[14] -
-                                                 qdm2_vlc_offs[13];
-    init_vlc(&vlc_tab_fft_tone_offset[1], 8, 28,
-             vlc_tab_fft_tone_offset_1_huffbits, 1, 1,
-             vlc_tab_fft_tone_offset_1_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_fft_tone_offset[2].table =
-        &qdm2_table[qdm2_vlc_offs[14]];
-    vlc_tab_fft_tone_offset[2].table_allocated = qdm2_vlc_offs[15] -
-                                                 qdm2_vlc_offs[14];
-    init_vlc(&vlc_tab_fft_tone_offset[2], 8, 32,
-             vlc_tab_fft_tone_offset_2_huffbits, 1, 1,
-             vlc_tab_fft_tone_offset_2_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_fft_tone_offset[3].table =
-        &qdm2_table[qdm2_vlc_offs[15]];
-    vlc_tab_fft_tone_offset[3].table_allocated = qdm2_vlc_offs[16] -
-                                                 qdm2_vlc_offs[15];
-    init_vlc(&vlc_tab_fft_tone_offset[3], 8, 35,
-             vlc_tab_fft_tone_offset_3_huffbits, 1, 1,
-             vlc_tab_fft_tone_offset_3_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-
-    vlc_tab_fft_tone_offset[4].table =
-        &qdm2_table[qdm2_vlc_offs[16]];
-    vlc_tab_fft_tone_offset[4].table_allocated = qdm2_vlc_offs[17] -
-                                                 qdm2_vlc_offs[16];
-    init_vlc(&vlc_tab_fft_tone_offset[4], 8, 38,
-             vlc_tab_fft_tone_offset_4_huffbits, 1, 1,
-             vlc_tab_fft_tone_offset_4_huffcodes, 2, 2,
-             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
-}
-
-static int qdm2_get_vlc(GetBitContext *gb, VLC *vlc, int flag, int depth)
+static int qdm2_get_vlc(GetBitContext *gb, const VLC *vlc, int flag, int depth)
 {
     int value;
 
@@ -372,7 +208,14 @@ static int qdm2_get_vlc(GetBitContext *gb, VLC *vlc, int flag, int depth)
 
     /* stage-3, optional */
     if (flag) {
-        int tmp = vlc_stage3_values[value];
+        int tmp;
+
+        if (value >= 60) {
+            av_log(NULL, AV_LOG_ERROR, "value %d in qdm2_get_vlc too large\n", value);
+            return 0;
+        }
+
+        tmp= vlc_stage3_values[value];
 
         if ((value & ~3) > 0)
             tmp += get_bits(gb, (value >> 2));
@@ -382,7 +225,7 @@ static int qdm2_get_vlc(GetBitContext *gb, VLC *vlc, int flag, int depth)
     return value;
 }
 
-static int qdm2_get_se_vlc(VLC *vlc, GetBitContext *gb, int depth)
+static int qdm2_get_se_vlc(const VLC *vlc, GetBitContext *gb, int depth)
 {
     int value = qdm2_get_vlc(gb, vlc, 0, depth);
 
@@ -691,7 +534,8 @@ static void fill_coding_method_array(sb_int8_array tone_level_idx,
 
     if (!superblocktype_2_3) {
         /* This case is untested, no samples available */
-        SAMPLES_NEEDED
+        avpriv_request_sample(NULL, "!superblocktype_2_3");
+        return;
         for (ch = 0; ch < nb_channels; ch++)
             for (sb = 0; sb < 30; sb++) {
                 for (j = 1; j < 63; j++) {  // The loop only iterates to 63 so the code doesn't overflow the buffer
@@ -802,7 +646,7 @@ static void fill_coding_method_array(sb_int8_array tone_level_idx,
  * @param sb_min    lower subband processed (sb_min included)
  * @param sb_max    higher subband processed (sb_max excluded)
  */
-static void synthfilt_build_sb_samples(QDM2Context *q, GetBitContext *gb,
+static int synthfilt_build_sb_samples(QDM2Context *q, GetBitContext *gb,
                                        int length, int sb_min, int sb_max)
 {
     int sb, j, k, n, ch, run, channels;
@@ -810,14 +654,15 @@ static void synthfilt_build_sb_samples(QDM2Context *q, GetBitContext *gb,
     int type34_first;
     float type34_div = 0;
     float type34_predictor;
-    float samples[10], sign_bits[16];
+    float samples[10];
+    int sign_bits[16] = {0};
 
     if (length == 0) {
         // If no data use noise
         for (sb=sb_min; sb < sb_max; sb++)
             build_sb_samples_from_noise(q, sb);
 
-        return;
+        return 0;
     }
 
     for (sb = sb_min; sb < sb_max; sb++) {
@@ -841,6 +686,7 @@ static void synthfilt_build_sb_samples(QDM2Context *q, GetBitContext *gb,
 
             if (fix_coding_method_array(sb, q->nb_channels,
                                             q->coding_method)) {
+                av_log(NULL, AV_LOG_ERROR, "coding method invalid\n");
                 build_sb_samples_from_noise(q, sb);
                 continue;
             }
@@ -865,6 +711,11 @@ static void synthfilt_build_sb_samples(QDM2Context *q, GetBitContext *gb,
                                 }
                             } else {
                                 n = get_bits(gb, 8);
+                                if (n >= 243) {
+                                    av_log(NULL, AV_LOG_ERROR, "Invalid 8bit codeword\n");
+                                    return AVERROR_INVALIDDATA;
+                                }
+
                                 for (k = 0; k < 5; k++)
                                     samples[2 * k] = dequant_1bit[joined_stereo][random_dequant_index[n][k]];
                             }
@@ -901,6 +752,11 @@ static void synthfilt_build_sb_samples(QDM2Context *q, GetBitContext *gb,
                                 }
                             } else {
                                 n = get_bits (gb, 8);
+                                if (n >= 243) {
+                                    av_log(NULL, AV_LOG_ERROR, "Invalid 8bit codeword\n");
+                                    return AVERROR_INVALIDDATA;
+                                }
+
                                 for (k = 0; k < 5; k++)
                                     samples[k] = dequant_1bit[joined_stereo][random_dequant_index[n][k]];
                             }
@@ -914,6 +770,11 @@ static void synthfilt_build_sb_samples(QDM2Context *q, GetBitContext *gb,
                     case 24:
                         if (get_bits_left(gb) >= 7) {
                             n = get_bits(gb, 7);
+                            if (n >= 125) {
+                                av_log(NULL, AV_LOG_ERROR, "Invalid 7bit codeword\n");
+                                return AVERROR_INVALIDDATA;
+                            }
+
                             for (k = 0; k < 3; k++)
                                 samples[k] = (random_dequant_type24[n][k] - 2.0) * 0.5;
                         } else {
@@ -926,10 +787,11 @@ static void synthfilt_build_sb_samples(QDM2Context *q, GetBitContext *gb,
                     case 30:
                         if (get_bits_left(gb) >= 4) {
                             unsigned index = qdm2_get_vlc(gb, &vlc_tab_type30, 0, 1);
-                            if (index < FF_ARRAY_ELEMS(type30_dequant)) {
-                                samples[0] = type30_dequant[index];
-                            } else
-                                samples[0] = SB_DITHERING_NOISE(sb,q->noise_idx);
+                            if (index >= FF_ARRAY_ELEMS(type30_dequant)) {
+                                av_log(NULL, AV_LOG_ERROR, "index %u out of type30_dequant array\n", index);
+                                return AVERROR_INVALIDDATA;
+                            }
+                            samples[0] = type30_dequant[index];
                         } else
                             samples[0] = SB_DITHERING_NOISE(sb,q->noise_idx);
 
@@ -945,11 +807,12 @@ static void synthfilt_build_sb_samples(QDM2Context *q, GetBitContext *gb,
                                 type34_first = 0;
                             } else {
                                 unsigned index = qdm2_get_vlc(gb, &vlc_tab_type34, 0, 1);
-                                if (index < FF_ARRAY_ELEMS(type34_delta)) {
-                                    samples[0] = type34_delta[index] / type34_div + type34_predictor;
-                                    type34_predictor = samples[0];
-                                } else
-                                    samples[0] = SB_DITHERING_NOISE(sb,q->noise_idx);
+                                if (index >= FF_ARRAY_ELEMS(type34_delta)) {
+                                    av_log(NULL, AV_LOG_ERROR, "index %u out of type34_delta array\n", index);
+                                    return AVERROR_INVALIDDATA;
+                                }
+                                samples[0] = type34_delta[index] / type34_div + type34_predictor;
+                                type34_predictor = samples[0];
                             }
                         } else {
                             samples[0] = SB_DITHERING_NOISE(sb,q->noise_idx);
@@ -986,6 +849,7 @@ static void synthfilt_build_sb_samples(QDM2Context *q, GetBitContext *gb,
             } // j loop
         } // channel loop
     } // subband loop
+    return 0;
 }
 
 /**
@@ -998,24 +862,27 @@ static void synthfilt_build_sb_samples(QDM2Context *q, GetBitContext *gb,
  * @param quantized_coeffs    pointer to quantized_coeffs[ch][0]
  * @param gb        bitreader context
  */
-static void init_quantized_coeffs_elem0(int8_t *quantized_coeffs,
+static int init_quantized_coeffs_elem0(int8_t *quantized_coeffs,
                                         GetBitContext *gb)
 {
     int i, k, run, level, diff;
 
     if (get_bits_left(gb) < 16)
-        return;
+        return -1;
     level = qdm2_get_vlc(gb, &vlc_tab_level, 0, 2);
 
     quantized_coeffs[0] = level;
 
     for (i = 0; i < 7; ) {
         if (get_bits_left(gb) < 16)
-            break;
+            return -1;
         run = qdm2_get_vlc(gb, &vlc_tab_run, 0, 1) + 1;
 
+        if (i + run >= 8)
+            return -1;
+
         if (get_bits_left(gb) < 16)
-            break;
+            return -1;
         diff = qdm2_get_se_vlc(&vlc_tab_diff, gb, 2);
 
         for (k = 1; k <= run; k++)
@@ -1024,6 +891,7 @@ static void init_quantized_coeffs_elem0(int8_t *quantized_coeffs,
         level += diff;
         i += run;
     }
+    return 0;
 }
 
 /**
@@ -1098,7 +966,7 @@ static void init_tone_level_dequantization(QDM2Context *q, GetBitContext *gb)
  * @param q       context
  * @param node    pointer to node with packet
  */
-static void process_subpacket_9(QDM2Context *q, QDM2SubPNode *node)
+static int process_subpacket_9(QDM2Context *q, QDM2SubPNode *node)
 {
     GetBitContext gb;
     int i, j, k, n, ch, run, level, diff;
@@ -1116,6 +984,9 @@ static void process_subpacket_9(QDM2Context *q, QDM2SubPNode *node)
                 run  = qdm2_get_vlc(&gb, &vlc_tab_run, 0, 1) + 1;
                 diff = qdm2_get_se_vlc(&vlc_tab_diff, &gb, 2);
 
+                if (j + run >= 8)
+                    return -1;
+
                 for (k = 1; k <= run; k++)
                     q->quantized_coeffs[ch][i][j + k] = (level + ((k * diff) / run));
 
@@ -1127,6 +998,8 @@ static void process_subpacket_9(QDM2Context *q, QDM2SubPNode *node)
     for (ch = 0; ch < q->nb_channels; ch++)
         for (i = 0; i < 8; i++)
             q->quantized_coeffs[ch][0][i] = 0;
+
+    return 0;
 }
 
 /**
@@ -1196,7 +1069,7 @@ static void process_subpacket_12(QDM2Context *q, QDM2SubPNode *node)
     synthfilt_build_sb_samples(q, &gb, length, 8, QDM2_SB_USED(q->sub_sampling));
 }
 
-/*
+/**
  * Process new subpackets for synthesis filter
  *
  * @param q       context
@@ -1229,7 +1102,7 @@ static void process_synthesis_subpackets(QDM2Context *q, QDM2SubPNode *list)
         process_subpacket_12(q, NULL);
 }
 
-/*
+/**
  * Decode superblock, fill packet lists.
  *
  * @param q    context
@@ -1389,9 +1262,14 @@ static void qdm2_fft_decode_tones(QDM2Context *q, int duration,
     local_int_10 = 1 << (q->group_order - duration - 1);
     offset       = 1;
 
-    while (1) {
+    while (get_bits_left(gb)>0) {
         if (q->superblocktype_2_3) {
             while ((n = qdm2_get_vlc(gb, &vlc_tab_fft_tone_offset[local_int_8], 1, 2)) < 2) {
+                if (get_bits_left(gb)<0) {
+                    if(local_int_4 < q->group_size)
+                        av_log(NULL, AV_LOG_ERROR, "overread in qdm2_fft_decode_tones()\n");
+                    return;
+                }
                 offset = 1;
                 if (n == 0) {
                     local_int_4  += local_int_10;
@@ -1704,12 +1582,19 @@ static void qdm2_synthesis_filter(QDM2Context *q, int index)
  *
  * @param q    context
  */
-static av_cold void qdm2_init_static_data(AVCodec *codec) {
+static av_cold void qdm2_init_static_data(void) {
+    static int done;
+
+    if(done)
+        return;
+
     qdm2_init_vlc();
     ff_mpa_synth_init_float(ff_mpa_synth_window_float);
     softclip_table_init();
     rnd_table_init();
     init_noise_samples();
+
+    done = 1;
 }
 
 /**
@@ -1722,6 +1607,8 @@ static av_cold int qdm2_decode_init(AVCodecContext *avctx)
     int extradata_size;
     int tmp_val, tmp, size;
 
+    qdm2_init_static_data();
+
     /* extradata parsing
 
     Structure:
@@ -1810,8 +1697,10 @@ static av_cold int qdm2_decode_init(AVCodecContext *avctx)
 
     avctx->channels = s->nb_channels = s->channels = AV_RB32(extradata);
     extradata += 4;
-    if (s->channels <= 0 || s->channels > MPA_MAX_CHANNELS)
+    if (s->channels <= 0 || s->channels > MPA_MAX_CHANNELS) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid number of channels\n");
         return AVERROR_INVALIDDATA;
+    }
     avctx->channel_layout = avctx->channels == 2 ? AV_CH_LAYOUT_STEREO :
                                                    AV_CH_LAYOUT_MONO;
 
@@ -1838,6 +1727,7 @@ static av_cold int qdm2_decode_init(AVCodecContext *avctx)
     // something like max decodable tones
     s->group_order = av_log2(s->group_size) + 1;
     s->frame_size = s->group_size / 16; // 16 iterations per super block
+
     if (s->frame_size > QDM2_MAX_FRAME_SIZE)
         return AVERROR_INVALIDDATA;
 
@@ -1860,18 +1750,9 @@ static av_cold int qdm2_decode_init(AVCodecContext *avctx)
     if ((tmp * 2240) < avctx->bit_rate)  tmp_val = 4;
     s->cm_table_select = tmp_val;
 
-    if (s->sub_sampling == 0)
-        tmp = 7999;
-    else
-        tmp = ((-(s->sub_sampling -1)) & 8000) + 20000;
-    /*
-    0: 7999 -> 0
-    1: 20000 -> 2
-    2: 28000 -> 2
-    */
-    if (tmp < 8000)
+    if (avctx->bit_rate <= 8000)
         s->coeff_per_sb_select = 0;
-    else if (tmp <= 16000)
+    else if (avctx->bit_rate < 16000)
         s->coeff_per_sb_select = 1;
     else
         s->coeff_per_sb_select = 2;
@@ -1908,6 +1789,9 @@ static int qdm2_decode(QDM2Context *q, const uint8_t *in, int16_t *out)
     int ch, i;
     const int frame_size = (q->frame_size * q->channels);
 
+    if((unsigned)frame_size > FF_ARRAY_ELEMS(q->output_buffer)/2)
+        return -1;
+
     /* select input buffer */
     q->compressed_data = in;
     q->compressed_size = q->checksum_size;
@@ -1979,10 +1863,8 @@ static int qdm2_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = 16 * s->frame_size;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     out = (int16_t *)frame->data[0];
 
     for (i = 0; i < 16; i++) {
@@ -2003,7 +1885,6 @@ AVCodec ff_qdm2_decoder = {
     .id               = AV_CODEC_ID_QDM2,
     .priv_data_size   = sizeof(QDM2Context),
     .init             = qdm2_decode_init,
-    .init_static_data = qdm2_init_static_data,
     .close            = qdm2_decode_close,
     .decode           = qdm2_decode_frame,
     .capabilities     = AV_CODEC_CAP_DR1,
diff --git a/libavcodec/qdm2_tablegen.c b/libavcodec/qdm2_tablegen.c
index 59d82df..e19b49b 100644
--- a/libavcodec/qdm2_tablegen.c
+++ b/libavcodec/qdm2_tablegen.c
@@ -3,27 +3,27 @@
  *
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <stdlib.h>
+#include "tableprint_vlc.h"
 #define CONFIG_HARDCODED_TABLES 0
 #include "qdm2_tablegen.h"
-#include "tableprint.h"
 
 int main(void)
 {
@@ -40,5 +40,22 @@ int main(void)
     WRITE_2D_ARRAY("static const", uint8_t, random_dequant_index);
     WRITE_2D_ARRAY("static const", uint8_t, random_dequant_type24);
 
+    qdm2_init_vlc();
+
+    WRITE_2D_ARRAY("static const", VLC_TYPE, qdm2_table);
+    WRITE_VLC_TYPE("static const", vlc_tab_level, qdm2_table);
+    WRITE_VLC_TYPE("static const", vlc_tab_diff, qdm2_table);
+    WRITE_VLC_TYPE("static const", vlc_tab_run, qdm2_table);
+    WRITE_VLC_TYPE("static const", fft_level_exp_alt_vlc, qdm2_table);
+    WRITE_VLC_TYPE("static const", fft_level_exp_vlc, qdm2_table);
+    WRITE_VLC_TYPE("static const", fft_stereo_exp_vlc, qdm2_table);
+    WRITE_VLC_TYPE("static const", fft_stereo_phase_vlc, qdm2_table);
+    WRITE_VLC_TYPE("static const", vlc_tab_tone_level_idx_hi1, qdm2_table);
+    WRITE_VLC_TYPE("static const", vlc_tab_tone_level_idx_mid, qdm2_table);
+    WRITE_VLC_TYPE("static const", vlc_tab_tone_level_idx_hi2, qdm2_table);
+    WRITE_VLC_TYPE("static const", vlc_tab_type30, qdm2_table);
+    WRITE_VLC_TYPE("static const", vlc_tab_type34, qdm2_table);
+    WRITE_VLC_ARRAY("static const", vlc_tab_fft_tone_offset, qdm2_table);
+
     return 0;
 }
diff --git a/libavcodec/qdm2_tablegen.h b/libavcodec/qdm2_tablegen.h
index bb73d92..2331ebf 100644
--- a/libavcodec/qdm2_tablegen.h
+++ b/libavcodec/qdm2_tablegen.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
 #include <stdint.h>
 #include <math.h>
 #include "libavutil/attributes.h"
+#include "qdm2data.h"
 
 #define SOFTCLIP_THRESHOLD 27600
 #define HARDCLIP_THRESHOLD 35716
@@ -34,10 +35,11 @@
 #define softclip_table_init()
 #define rnd_table_init()
 #define init_noise_samples()
+#define qdm2_init_vlc()
 #include "libavcodec/qdm2_tables.h"
 #else
 static uint16_t softclip_table[HARDCLIP_THRESHOLD - SOFTCLIP_THRESHOLD + 1];
-static float noise_table[4096];
+static float noise_table[4096 + 20];
 static uint8_t random_dequant_index[256][5];
 static uint8_t random_dequant_type24[128][3];
 static float noise_samples[128];
@@ -54,8 +56,7 @@ static av_cold void softclip_table_init(void) {
 // random generated table
 static av_cold void rnd_table_init(void) {
     int i,j;
-    uint32_t ldw,hdw;
-    uint64_t tmp64_1;
+    uint32_t ldw;
     uint64_t random_seed = 0;
     float delta = 1.0 / 16384.0;
     for(i = 0; i < 4096 ;i++) {
@@ -67,22 +68,18 @@ static av_cold void rnd_table_init(void) {
         random_seed = 81;
         ldw = i;
         for (j = 0; j < 5 ;j++) {
-            random_dequant_index[i][j] = (uint8_t)((ldw / random_seed) & 0xFF);
-            ldw = (uint32_t)ldw % (uint32_t)random_seed;
-            tmp64_1 = (random_seed * 0x55555556);
-            hdw = (uint32_t)(tmp64_1 >> 32);
-            random_seed = (uint64_t)(hdw + (ldw >> 31));
+            random_dequant_index[i][j] = ldw / random_seed;
+            ldw %= random_seed;
+            random_seed /= 3;
         }
     }
     for (i = 0; i < 128 ;i++) {
         random_seed = 25;
         ldw = i;
         for (j = 0; j < 3 ;j++) {
-            random_dequant_type24[i][j] = (uint8_t)((ldw / random_seed) & 0xFF);
-            ldw = (uint32_t)ldw % (uint32_t)random_seed;
-            tmp64_1 = (random_seed * 0x66666667);
-            hdw = (uint32_t)(tmp64_1 >> 33);
-            random_seed = hdw + (ldw >> 31);
+            random_dequant_type24[i][j] = ldw / random_seed;
+            ldw %= random_seed;
+            random_seed /= 5;
         }
     }
 }
@@ -97,6 +94,168 @@ static av_cold void init_noise_samples(void) {
         noise_samples[i] = (delta * (float)((random_seed >> 16) & 0x00007fff) - 1.0);
     }
 }
+
+static VLC vlc_tab_level; /* the VLC objects below are all set up by qdm2_init_vlc() */
+static VLC vlc_tab_diff;
+static VLC vlc_tab_run;
+static VLC fft_level_exp_alt_vlc;
+static VLC fft_level_exp_vlc;
+static VLC fft_stereo_exp_vlc;
+static VLC fft_stereo_phase_vlc;
+static VLC vlc_tab_tone_level_idx_hi1;
+static VLC vlc_tab_tone_level_idx_mid;
+static VLC vlc_tab_tone_level_idx_hi2;
+static VLC vlc_tab_type30;
+static VLC vlc_tab_type34;
+static VLC vlc_tab_fft_tone_offset[5];
+
+static const uint16_t qdm2_vlc_offs[] = { /* start index of each VLC's slice in qdm2_table; 17 slices, so 18 boundaries */
+    0,260,566,598,894,1166,1230,1294,1678,1950,2214,2278,2310,2570,2834,3124,3448,3838,
+}; /* the last boundary (3838) equals the first dimension of qdm2_table */
+
+static VLC_TYPE qdm2_table[3838][2]; /* single backing store shared by all the VLCs above */
+
+static av_cold void qdm2_init_vlc(void) /* build every QDM2 VLC inside its own slice of the static qdm2_table */
+{
+    vlc_tab_level.table           = &qdm2_table[qdm2_vlc_offs[0]]; /* slice k starts at qdm2_vlc_offs[k] ... */
+    vlc_tab_level.table_allocated = qdm2_vlc_offs[1] - qdm2_vlc_offs[0]; /* ... and holds offs[k+1] - offs[k] entries */
+    init_vlc(&vlc_tab_level, 8, 24,
+             vlc_tab_level_huffbits, 1, 1,
+             vlc_tab_level_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_diff.table           = &qdm2_table[qdm2_vlc_offs[1]];
+    vlc_tab_diff.table_allocated = qdm2_vlc_offs[2] - qdm2_vlc_offs[1];
+    init_vlc(&vlc_tab_diff, 8, 37,
+             vlc_tab_diff_huffbits, 1, 1,
+             vlc_tab_diff_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_run.table           = &qdm2_table[qdm2_vlc_offs[2]];
+    vlc_tab_run.table_allocated = qdm2_vlc_offs[3] - qdm2_vlc_offs[2];
+    init_vlc(&vlc_tab_run, 5, 6,
+             vlc_tab_run_huffbits, 1, 1,
+             vlc_tab_run_huffcodes, 1, 1, /* this code table uses 1,1 where the larger ones use 2,2 — presumably the element size in bytes; confirm against init_vlc() */
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    fft_level_exp_alt_vlc.table           = &qdm2_table[qdm2_vlc_offs[3]];
+    fft_level_exp_alt_vlc.table_allocated = qdm2_vlc_offs[4] -
+                                            qdm2_vlc_offs[3];
+    init_vlc(&fft_level_exp_alt_vlc, 8, 28,
+             fft_level_exp_alt_huffbits, 1, 1,
+             fft_level_exp_alt_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    fft_level_exp_vlc.table           = &qdm2_table[qdm2_vlc_offs[4]];
+    fft_level_exp_vlc.table_allocated = qdm2_vlc_offs[5] - qdm2_vlc_offs[4];
+    init_vlc(&fft_level_exp_vlc, 8, 20,
+             fft_level_exp_huffbits, 1, 1,
+             fft_level_exp_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    fft_stereo_exp_vlc.table           = &qdm2_table[qdm2_vlc_offs[5]];
+    fft_stereo_exp_vlc.table_allocated = qdm2_vlc_offs[6] -
+                                         qdm2_vlc_offs[5];
+    init_vlc(&fft_stereo_exp_vlc, 6, 7,
+             fft_stereo_exp_huffbits, 1, 1,
+             fft_stereo_exp_huffcodes, 1, 1,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    fft_stereo_phase_vlc.table           = &qdm2_table[qdm2_vlc_offs[6]];
+    fft_stereo_phase_vlc.table_allocated = qdm2_vlc_offs[7] -
+                                           qdm2_vlc_offs[6];
+    init_vlc(&fft_stereo_phase_vlc, 6, 9,
+             fft_stereo_phase_huffbits, 1, 1,
+             fft_stereo_phase_huffcodes, 1, 1,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_tone_level_idx_hi1.table =
+        &qdm2_table[qdm2_vlc_offs[7]];
+    vlc_tab_tone_level_idx_hi1.table_allocated = qdm2_vlc_offs[8] -
+                                                 qdm2_vlc_offs[7];
+    init_vlc(&vlc_tab_tone_level_idx_hi1, 8, 20,
+             vlc_tab_tone_level_idx_hi1_huffbits, 1, 1,
+             vlc_tab_tone_level_idx_hi1_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_tone_level_idx_mid.table =
+        &qdm2_table[qdm2_vlc_offs[8]];
+    vlc_tab_tone_level_idx_mid.table_allocated = qdm2_vlc_offs[9] -
+                                                 qdm2_vlc_offs[8];
+    init_vlc(&vlc_tab_tone_level_idx_mid, 8, 24,
+             vlc_tab_tone_level_idx_mid_huffbits, 1, 1,
+             vlc_tab_tone_level_idx_mid_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_tone_level_idx_hi2.table =
+        &qdm2_table[qdm2_vlc_offs[9]];
+    vlc_tab_tone_level_idx_hi2.table_allocated = qdm2_vlc_offs[10] -
+                                                 qdm2_vlc_offs[9];
+    init_vlc(&vlc_tab_tone_level_idx_hi2, 8, 24,
+             vlc_tab_tone_level_idx_hi2_huffbits, 1, 1,
+             vlc_tab_tone_level_idx_hi2_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_type30.table           = &qdm2_table[qdm2_vlc_offs[10]];
+    vlc_tab_type30.table_allocated = qdm2_vlc_offs[11] - qdm2_vlc_offs[10];
+    init_vlc(&vlc_tab_type30, 6, 9,
+             vlc_tab_type30_huffbits, 1, 1,
+             vlc_tab_type30_huffcodes, 1, 1,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_type34.table           = &qdm2_table[qdm2_vlc_offs[11]];
+    vlc_tab_type34.table_allocated = qdm2_vlc_offs[12] - qdm2_vlc_offs[11];
+    init_vlc(&vlc_tab_type34, 5, 10,
+             vlc_tab_type34_huffbits, 1, 1,
+             vlc_tab_type34_huffcodes, 1, 1,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_fft_tone_offset[0].table = /* the five tone-offset VLCs take slices 12..16 */
+        &qdm2_table[qdm2_vlc_offs[12]];
+    vlc_tab_fft_tone_offset[0].table_allocated = qdm2_vlc_offs[13] -
+                                                 qdm2_vlc_offs[12];
+    init_vlc(&vlc_tab_fft_tone_offset[0], 8, 23,
+             vlc_tab_fft_tone_offset_0_huffbits, 1, 1,
+             vlc_tab_fft_tone_offset_0_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_fft_tone_offset[1].table =
+        &qdm2_table[qdm2_vlc_offs[13]];
+    vlc_tab_fft_tone_offset[1].table_allocated = qdm2_vlc_offs[14] -
+                                                 qdm2_vlc_offs[13];
+    init_vlc(&vlc_tab_fft_tone_offset[1], 8, 28,
+             vlc_tab_fft_tone_offset_1_huffbits, 1, 1,
+             vlc_tab_fft_tone_offset_1_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_fft_tone_offset[2].table =
+        &qdm2_table[qdm2_vlc_offs[14]];
+    vlc_tab_fft_tone_offset[2].table_allocated = qdm2_vlc_offs[15] -
+                                                 qdm2_vlc_offs[14];
+    init_vlc(&vlc_tab_fft_tone_offset[2], 8, 32,
+             vlc_tab_fft_tone_offset_2_huffbits, 1, 1,
+             vlc_tab_fft_tone_offset_2_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_fft_tone_offset[3].table =
+        &qdm2_table[qdm2_vlc_offs[15]];
+    vlc_tab_fft_tone_offset[3].table_allocated = qdm2_vlc_offs[16] -
+                                                 qdm2_vlc_offs[15];
+    init_vlc(&vlc_tab_fft_tone_offset[3], 8, 35,
+             vlc_tab_fft_tone_offset_3_huffbits, 1, 1,
+             vlc_tab_fft_tone_offset_3_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+
+    vlc_tab_fft_tone_offset[4].table =
+        &qdm2_table[qdm2_vlc_offs[16]];
+    vlc_tab_fft_tone_offset[4].table_allocated = qdm2_vlc_offs[17] - /* offs[17] is the end of the table */
+                                                 qdm2_vlc_offs[16];
+    init_vlc(&vlc_tab_fft_tone_offset[4], 8, 38,
+             vlc_tab_fft_tone_offset_4_huffbits, 1, 1,
+             vlc_tab_fft_tone_offset_4_huffcodes, 2, 2,
+             INIT_VLC_USE_NEW_STATIC | INIT_VLC_LE);
+}
+
 #endif /* CONFIG_HARDCODED_TABLES */
 
 #endif /* AVCODEC_QDM2_TABLEGEN_H */
diff --git a/libavcodec/qdm2data.h b/libavcodec/qdm2data.h
index ad6ea88..355d613 100644
--- a/libavcodec/qdm2data.h
+++ b/libavcodec/qdm2data.h
@@ -5,20 +5,20 @@
  * Copyright (c) 2005 Alex Beregszaszi
  * Copyright (c) 2005 Roberto Togni
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/qdrw.c b/libavcodec/qdrw.c
index b7493e4..828cfea 100644
--- a/libavcodec/qdrw.c
+++ b/libavcodec/qdrw.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2004 Konstantin Shishkov
  * Copyright (c) 2015 Vittorio Giovara
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -95,6 +95,8 @@ static int decode_rle(AVCodecContext *avctx, AVFrame *p, GetByteContext *gbc,
                         pos -= offset;
                         pos++;
                     }
+                    if (pos >= offset)
+                        return AVERROR_INVALIDDATA;
                 }
                 left  -= 2;
             } else { /* copy */
@@ -105,6 +107,8 @@ static int decode_rle(AVCodecContext *avctx, AVFrame *p, GetByteContext *gbc,
                         pos -= offset;
                         pos++;
                     }
+                    if (pos >= offset)
+                        return AVERROR_INVALIDDATA;
                 }
                 left  -= 2 + code;
             }
@@ -114,6 +118,29 @@ static int decode_rle(AVCodecContext *avctx, AVFrame *p, GetByteContext *gbc,
     return 0;
 }
 
+static int check_header(const char *buf, int buf_size) /* sniff a PICT header at buf: returns 0 if not recognized, else the version (1 or 2) */
+{
+    unsigned w, h, v0, v1;
+
+    if (buf_size < 40) /* too short to inspect; treated as "not a header" */
+        return 0;
+
+    w = AV_RB16(buf+6); /* two 16-bit fields at offsets 6 and 8 — presumably frame dimensions; only tested for non-zero below */
+    h = AV_RB16(buf+8);
+    v0 = AV_RB16(buf+10); /* version marker words at offsets 10 and 12 */
+    v1 = AV_RB16(buf+12);
+
+    if (!w || !h) /* reject headers where either field is zero */
+        return 0;
+
+    if (v0 == 0x1101) /* version 1 marker */
+        return 1;
+    if (v0 == 0x0011 && v1 == 0x02FF) /* the two words together read 0x001102FF: version 2 */
+        return 2;
+    return 0; /* unknown marker */
+}
+
+
 static int decode_frame(AVCodecContext *avctx,
                         void *data, int *got_frame,
                         AVPacket *avpkt)
@@ -122,13 +149,16 @@ static int decode_frame(AVCodecContext *avctx,
     GetByteContext gbc;
     int colors;
     int w, h, ret;
+    int ver;
 
     bytestream2_init(&gbc, avpkt->data, avpkt->size);
-
-    /* PICT images start with a 512 bytes empty header */
-    if (bytestream2_peek_be32(&gbc) == 0)
+    if (   bytestream2_get_bytes_left(&gbc) >= 552
+           &&  check_header(gbc.buffer + 512, bytestream2_get_bytes_left(&gbc) - 512)
+       )
         bytestream2_skip(&gbc, 512);
 
+    ver = check_header(gbc.buffer, bytestream2_get_bytes_left(&gbc));
+
     /* smallest PICT header */
     if (bytestream2_get_bytes_left(&gbc) < 40) {
         av_log(avctx, AV_LOG_ERROR, "Frame is too small %d\n",
@@ -146,12 +176,15 @@ static int decode_frame(AVCodecContext *avctx,
 
     /* version 1 is identified by 0x1101
      * it uses byte-aligned opcodes rather than word-aligned */
-    if (bytestream2_get_be32(&gbc) != 0x001102FF) {
+    if (ver == 1) {
         avpriv_request_sample(avctx, "QuickDraw version 1");
         return AVERROR_PATCHWELCOME;
+    } else if (ver != 2) {
+        avpriv_request_sample(avctx, "QuickDraw version unknown (%X)", bytestream2_get_be32(&gbc));
+        return AVERROR_PATCHWELCOME;
     }
 
-    bytestream2_skip(&gbc, 26);
+    bytestream2_skip(&gbc, 4+26);
 
     while (bytestream2_get_bytes_left(&gbc) >= 4) {
         int bppcnt, bpp;
@@ -191,10 +224,8 @@ static int decode_frame(AVCodecContext *avctx,
                        bytestream2_get_bytes_left(&gbc));
                 return AVERROR_INVALIDDATA;
             }
-            if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-                av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+            if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
                 return ret;
-            }
 
             parse_palette(avctx, &gbc, (uint32_t *)p->data[1], colors);
             p->palette_has_changed = 1;
@@ -250,10 +281,8 @@ static int decode_frame(AVCodecContext *avctx,
                 avpriv_request_sample(avctx, "Pack type %d", pack_type);
                 return AVERROR_PATCHWELCOME;
             }
-            if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-                av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+            if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
                 return ret;
-            }
 
             /* jump to data */
             bytestream2_skip(&gbc, 30);
diff --git a/libavcodec/qpeg.c b/libavcodec/qpeg.c
index f549cd5..9eaf9b8 100644
--- a/libavcodec/qpeg.c
+++ b/libavcodec/qpeg.c
@@ -2,20 +2,20 @@
  * QPEG codec
  * Copyright (c) 2004 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,8 +30,7 @@
 
 typedef struct QpegContext{
     AVCodecContext *avctx;
-    AVFrame *pic;
-    uint8_t *refdata;
+    AVFrame *pic, *ref;
     uint32_t pal[256];
     GetByteContext buffer;
 } QpegContext;
@@ -111,7 +110,7 @@ static const int qpeg_table_w[16] =
  { 0x00, 0x20, 0x18, 0x08, 0x18, 0x10, 0x20, 0x10, 0x08, 0x10, 0x20, 0x20, 0x08, 0x10, 0x18, 0x04};
 
 /* Decodes delta frames */
-static void qpeg_decode_inter(QpegContext *qctx, uint8_t *dst,
+static void av_noinline qpeg_decode_inter(QpegContext *qctx, uint8_t *dst,
                               int stride, int width, int height,
                               int delta, const uint8_t *ctable,
                               uint8_t *refdata)
@@ -121,9 +120,13 @@ static void qpeg_decode_inter(QpegContext *qctx, uint8_t *dst,
     int filled = 0;
     int orig_height;
 
-    /* copy prev frame */
-    for(i = 0; i < height; i++)
-        memcpy(refdata + (i * width), dst + (i * stride), width);
+    if (refdata) {
+        /* copy prev frame */
+        for (i = 0; i < height; i++)
+            memcpy(dst + (i * stride), refdata + (i * stride), width);
+    } else {
+        refdata = dst;
+    }
 
     orig_height = height;
     height--;
@@ -134,7 +137,7 @@ static void qpeg_decode_inter(QpegContext *qctx, uint8_t *dst,
 
         if(delta) {
             /* motion compensation */
-            while((code & 0xF0) == 0xF0) {
+            while(bytestream2_get_bytes_left(&qctx->buffer) > 0 && (code & 0xF0) == 0xF0) {
                 if(delta == 1) {
                     int me_idx;
                     int me_w, me_h, me_x, me_y;
@@ -161,16 +164,16 @@ static void qpeg_decode_inter(QpegContext *qctx, uint8_t *dst,
 
                     /* check motion vector */
                     if ((me_x + filled < 0) || (me_x + me_w + filled > width) ||
-                       (height - me_y - me_h < 0) || (height - me_y > orig_height) ||
+                       (height - me_y - me_h < 0) || (height - me_y >= orig_height) ||
                        (filled + me_w > width) || (height - me_h < 0))
                         av_log(NULL, AV_LOG_ERROR, "Bogus motion vector (%i,%i), block size %ix%i at %i,%i\n",
                                me_x, me_y, me_w, me_h, filled, height);
                     else {
                         /* do motion compensation */
-                        me_plane = refdata + (filled + me_x) + (height - me_y) * width;
+                        me_plane = refdata + (filled + me_x) + (height - me_y) * stride;
                         for(j = 0; j < me_h; j++) {
                             for(i = 0; i < me_w; i++)
-                                dst[filled + i - (j * stride)] = me_plane[i - (j * width)];
+                                dst[filled + i - (j * stride)] = me_plane[i - (j * stride)];
                         }
                     }
                 }
@@ -198,6 +201,9 @@ static void qpeg_decode_inter(QpegContext *qctx, uint8_t *dst,
         } else if(code >= 0xC0) { /* copy code: 0xC0..0xDF */
             code &= 0x1F;
 
+            if(code + 1 > bytestream2_get_bytes_left(&qctx->buffer))
+                break;
+
             for(i = 0; i <= code; i++) {
                 dst[filled++] = bytestream2_get_byte(&qctx->buffer);
                 if(filled >= width) {
@@ -251,6 +257,7 @@ static int decode_frame(AVCodecContext *avctx,
     uint8_t ctable[128];
     QpegContext * const a = avctx->priv_data;
     AVFrame * const p = a->pic;
+    AVFrame * const ref = a->ref;
     uint8_t* outdata;
     int delta, ret;
     const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE, NULL);
@@ -261,10 +268,12 @@ static int decode_frame(AVCodecContext *avctx,
     }
 
     bytestream2_init(&a->buffer, avpkt->data, avpkt->size);
-    if ((ret = ff_reget_buffer(avctx, p)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+
+    av_frame_unref(ref);
+    av_frame_move_ref(ref, p);
+
+    if ((ret = ff_get_buffer(avctx, p, AV_GET_BUFFER_FLAG_REF)) < 0)
         return ret;
-    }
     outdata = p->data[0];
     bytestream2_skip(&a->buffer, 4);
     bytestream2_get_buffer(&a->buffer, ctable, 128);
@@ -274,7 +283,7 @@ static int decode_frame(AVCodecContext *avctx,
     if(delta == 0x10) {
         qpeg_decode_intra(a, outdata, p->linesize[0], avctx->width, avctx->height);
     } else {
-        qpeg_decode_inter(a, outdata, p->linesize[0], avctx->width, avctx->height, delta, ctable, a->refdata);
+        qpeg_decode_inter(a, outdata, p->linesize[0], avctx->width, avctx->height, delta, ctable, ref->data[0]);
     }
 
     /* make the palette available on the way out */
@@ -292,13 +301,25 @@ static int decode_frame(AVCodecContext *avctx,
     return avpkt->size;
 }
 
+static void decode_flush(AVCodecContext *avctx){ /* reload a->pal from the tail of avctx->extradata */
+    QpegContext * const a = avctx->priv_data;
+    int i, pal_size;
+    const uint8_t *pal_src;
+
+    pal_size = FFMIN(1024U, avctx->extradata_size); /* at most 1024 bytes, i.e. 256 four-byte entries (a->pal has 256 slots) */
+    pal_src = avctx->extradata + avctx->extradata_size - pal_size; /* the last pal_size bytes of extradata */
+
+    for (i=0; i<pal_size/4; i++) /* trailing bytes that do not fill a whole 4-byte entry are ignored */
+        a->pal[i] = 0xFFU<<24 | AV_RL32(pal_src+4*i); /* force the top byte of every entry to 0xFF */
+}
+
 static av_cold int decode_end(AVCodecContext *avctx)
 {
     QpegContext * const a = avctx->priv_data;
 
     av_frame_free(&a->pic);
+    av_frame_free(&a->ref);
 
-    av_free(a->refdata);
     return 0;
 }
 
@@ -307,10 +328,12 @@ static av_cold int decode_init(AVCodecContext *avctx){
 
     a->avctx = avctx;
     avctx->pix_fmt= AV_PIX_FMT_PAL8;
-    a->refdata = av_malloc(avctx->width * avctx->height);
+
+    decode_flush(avctx);
 
     a->pic = av_frame_alloc();
-    if (!a->pic) {
+    a->ref = av_frame_alloc();
+    if (!a->pic || !a->ref) {
         decode_end(avctx);
         return AVERROR(ENOMEM);
     }
@@ -327,5 +350,6 @@ AVCodec ff_qpeg_decoder = {
     .init           = decode_init,
     .close          = decode_end,
     .decode         = decode_frame,
+    .flush          = decode_flush,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
diff --git a/libavcodec/qpel_template.c b/libavcodec/qpel_template.c
index 2106160..e52a78c 100644
--- a/libavcodec/qpel_template.c
+++ b/libavcodec/qpel_template.c
@@ -1,20 +1,22 @@
 /*
  * quarterpel DSP function templates
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/qpeldsp.c b/libavcodec/qpeldsp.c
index 1d0422a..6e52b33 100644
--- a/libavcodec/qpeldsp.c
+++ b/libavcodec/qpeldsp.c
@@ -1,20 +1,22 @@
 /*
  * quarterpel DSP functions
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,6 +32,7 @@
 #include "libavutil/attributes.h"
 #include "copy_block.h"
 #include "qpeldsp.h"
+#include "diracdsp.h"
 
 #define BIT_DEPTH 8
 #include "hpel_template.c"
@@ -732,6 +735,51 @@ void ff_put_pixels8_l2_8(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
 
 }
 
+#if CONFIG_DIRAC_DECODER /* Dirac wrappers: take src[5] and forward src[0], src[0..1] (_l2) or src[0..3] (_l4) to the 8-bit pixel helpers; 32-wide variants are two 16-wide calls at +0 and +16 */
+#define DIRAC_MC(OPNAME)\
+void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+     OPNAME ## _pixels8_8_c(dst, src[0], stride, h);\
+}\
+void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    OPNAME ## _pixels16_8_c(dst, src[0], stride, h);\
+}\
+void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    OPNAME ## _pixels16_8_c(dst   , src[0]   , stride, h);\
+    OPNAME ## _pixels16_8_c(dst+16, src[0]+16, stride, h);\
+}\
+void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    OPNAME ## _pixels8_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
+}\
+void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    OPNAME ## _pixels16_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
+}\
+void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    OPNAME ## _pixels16_l2_8(dst   , src[0]   , src[1]   , stride, stride, stride, h);\
+    OPNAME ## _pixels16_l2_8(dst+16, src[0]+16, src[1]+16, stride, stride, stride, h);\
+}\
+void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
+}\
+void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
+}\
+void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    OPNAME ## _pixels16_l4_8(dst   , src[0]   , src[1]   , src[2]   , src[3]   , stride, stride, stride, stride, stride, h);\
+    OPNAME ## _pixels16_l4_8(dst+16, src[0]+16, src[1]+16, src[2]+16, src[3]+16, stride, stride, stride, stride, stride, h);\
+}
+DIRAC_MC(put) /* emits the ff_put_dirac_pixels* family */
+DIRAC_MC(avg) /* emits the ff_avg_dirac_pixels* family */
+#endif /* CONFIG_DIRAC_DECODER */
+
 av_cold void ff_qpeldsp_init(QpelDSPContext *c)
 {
 #define dspfunc(PFX, IDX, NUM)                              \
@@ -763,4 +811,6 @@ av_cold void ff_qpeldsp_init(QpelDSPContext *c)
 
     if (ARCH_X86)
         ff_qpeldsp_init_x86(c);
+    if (ARCH_MIPS)
+        ff_qpeldsp_init_mips(c);
 }
diff --git a/libavcodec/qpeldsp.h b/libavcodec/qpeldsp.h
index 4ad141d..91019ed 100644
--- a/libavcodec/qpeldsp.h
+++ b/libavcodec/qpeldsp.h
@@ -1,20 +1,20 @@
 /*
  * quarterpel DSP functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -78,5 +78,6 @@ typedef struct QpelDSPContext {
 void ff_qpeldsp_init(QpelDSPContext *c);
 
 void ff_qpeldsp_init_x86(QpelDSPContext *c);
+void ff_qpeldsp_init_mips(QpelDSPContext *c);
 
 #endif /* AVCODEC_QPELDSP_H */
diff --git a/libavcodec/qsv.c b/libavcodec/qsv.c
index e08518b..11d453d 100644
--- a/libavcodec/qsv.c
+++ b/libavcodec/qsv.c
@@ -1,20 +1,20 @@
 /*
  * Intel MediaSDK QSV encoder/decoder shared code
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -85,8 +85,90 @@ int ff_qsv_error(int mfx_err)
         return AVERROR_UNKNOWN;
     }
 }
+static int ff_qsv_set_display_handle(AVCodecContext *avctx, QSVSession *qs) /* attach a VA display to qs->session when AVCODEC_QSV_LINUX_SESSION_HANDLE is defined; otherwise just returns 0 */
+{
+    // this code is only required for Linux.  It searches for a valid
+    // display handle.  First in /dev/dri/renderD then in /dev/dri/card
+#ifdef AVCODEC_QSV_LINUX_SESSION_HANDLE
+    // VAAPI display handle
+    int ret = 0;
+    VADisplay va_dpy = NULL;
+    VAStatus va_res = VA_STATUS_SUCCESS;
+    int major_version = 0, minor_version = 0;
+    int fd = -1;
+    char adapterpath[256];
+    int adapter_num;
+
+    qs->fd_display = -1; /* start from "nothing open" so the close path can tell */
+    qs->va_display = NULL;
+
+    //search for valid graphics device
+    for (adapter_num = 0;adapter_num < 6;adapter_num++) { /* 0..2 probe renderD128..130, 3..5 probe card0..2 */
+
+        if (adapter_num<3) {
+            snprintf(adapterpath,sizeof(adapterpath),
+                "/dev/dri/renderD%d", adapter_num+128);
+        } else {
+            snprintf(adapterpath,sizeof(adapterpath),
+                "/dev/dri/card%d", adapter_num-3);
+        }
+
+        fd = open(adapterpath, O_RDWR);
+        if (fd < 0) { /* logged at ERROR level even though the next candidate is still tried */
+            av_log(avctx, AV_LOG_ERROR,
+                "mfx init: %s fd open failed\n", adapterpath);
+            continue;
+        }
 
-int ff_qsv_init_internal_session(AVCodecContext *avctx, mfxSession *session,
+        va_dpy = vaGetDisplayDRM(fd);
+        if (!va_dpy) {
+            av_log(avctx, AV_LOG_ERROR,
+                "mfx init: %s vaGetDisplayDRM failed\n", adapterpath);
+            close(fd);
+            continue;
+        }
+
+        va_res = vaInitialize(va_dpy, &major_version, &minor_version);
+        if (VA_STATUS_SUCCESS != va_res) {
+            av_log(avctx, AV_LOG_ERROR,
+                "mfx init: %s vaInitialize failed\n", adapterpath);
+            close(fd);
+            fd = -1;
+            continue;
+        } else {
+            av_log(avctx, AV_LOG_VERBOSE,
+            "mfx initialization: %s vaInitialize successful\n",adapterpath);
+            qs->fd_display = fd; /* recorded before SetHandle, so both stay in qs even if SetHandle fails */
+            qs->va_display = va_dpy;
+            ret = MFXVideoCORE_SetHandle(qs->session,
+                  (mfxHandleType)MFX_HANDLE_VA_DISPLAY, (mfxHDL)va_dpy);
+            if (ret < 0) {
+                av_log(avctx, AV_LOG_ERROR,
+                "Error %d during set display handle\n", ret);
+                return ff_qsv_error(ret); /* nothing is released here — presumably left to ff_qsv_close_internal_session(); confirm callers invoke it */
+            }
+            break; /* the first adapter that initializes wins */
+        }
+    }
+#endif //AVCODEC_QSV_LINUX_SESSION_HANDLE
+    return 0; /* also reached when no adapter could be initialized: that case is not reported as an error */
+}
+/**
+ * @brief Initialize a MSDK session
+ *
+ * Media SDK is based on sessions, so this is the prerequisite
+ * initialization for HW acceleration.  For Windows the session is
+ * complete and ready to use, for Linux a display handle is
+ * required.  For releases of Media Server Studio >= 2015 R4 the
+ * render nodes interface is preferred (/dev/dri/renderD).
+ * Using Media Server Studio 2015 R4 or newer is recommended
+ * but the older /dev/dri/card interface is also searched
+ * for broader compatibility.
+ *
+ * @param avctx    ffmpeg metadata for this codec context
+ * @param qs       QSV session wrapper; qs->session receives the MSDK session
+ */
+int ff_qsv_init_internal_session(AVCodecContext *avctx, QSVSession *qs,
                                  const char *load_plugins)
 {
     mfxIMPL impl   = MFX_IMPL_AUTO_ANY;
@@ -95,12 +177,16 @@ int ff_qsv_init_internal_session(AVCodecContext *avctx, mfxSession *session,
     const char *desc;
     int ret;
 
-    ret = MFXInit(impl, &ver, session);
+    ret = MFXInit(impl, &ver, &qs->session);
     if (ret < 0) {
         av_log(avctx, AV_LOG_ERROR, "Error initializing an internal MFX session\n");
         return ff_qsv_error(ret);
     }
 
+    ret = ff_qsv_set_display_handle(avctx, qs);
+    if (ret < 0)
+        return ret;
+
     if (load_plugins && *load_plugins) {
         while (*load_plugins) {
             mfxPluginUID uid;
@@ -125,7 +211,7 @@ int ff_qsv_init_internal_session(AVCodecContext *avctx, mfxSession *session,
 
             }
 
-            ret = MFXVideoUSER_Load(*session, &uid, 1);
+            ret = MFXVideoUSER_Load(qs->session, &uid, 1);
             if (ret < 0) {
                 av_log(avctx, AV_LOG_ERROR, "Could not load the requested plugin: %s\n",
                        plugin);
@@ -142,7 +228,7 @@ load_plugin_fail:
         }
     }
 
-    MFXQueryIMPL(*session, &impl);
+    MFXQueryIMPL(qs->session, &impl);
 
     switch (MFX_IMPL_BASETYPE(impl)) {
     case MFX_IMPL_SOFTWARE:
@@ -164,3 +250,22 @@ load_plugin_fail:
 
     return 0;
 }
+
+int ff_qsv_close_internal_session(QSVSession *qs)
+{
+    if (qs->session) {
+        MFXClose(qs->session);
+        qs->session = NULL;
+    }
+#ifdef AVCODEC_QSV_LINUX_SESSION_HANDLE
+    if (qs->va_display) {
+        vaTerminate(qs->va_display);
+        qs->va_display = NULL;
+    }
+    if (qs->fd_display > 0) {
+        close(qs->fd_display);
+        qs->fd_display = -1;
+    }
+#endif
+    return 0;
+}
diff --git a/libavcodec/qsv.h b/libavcodec/qsv.h
index 1d1f8b4..b77158e 100644
--- a/libavcodec/qsv.h
+++ b/libavcodec/qsv.h
@@ -1,20 +1,20 @@
 /*
  * Intel MediaSDK QSV public API
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/qsv_api.c b/libavcodec/qsv_api.c
index 234b596..327ff7d 100644
--- a/libavcodec/qsv_api.c
+++ b/libavcodec/qsv_api.c
@@ -1,20 +1,20 @@
 /*
  * Intel MediaSDK QSV public API functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/qsv_internal.h b/libavcodec/qsv_internal.h
index 1b7a2e7..f289a2b 100644
--- a/libavcodec/qsv_internal.h
+++ b/libavcodec/qsv_internal.h
@@ -1,26 +1,41 @@
 /*
  * Intel MediaSDK QSV encoder/decoder shared code
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_QSV_INTERNAL_H
 #define AVCODEC_QSV_INTERNAL_H
 
+#if CONFIG_VAAPI
+#define AVCODEC_QSV_LINUX_SESSION_HANDLE
+#endif //CONFIG_VAAPI
+
+#ifdef AVCODEC_QSV_LINUX_SESSION_HANDLE
+#include <stdio.h>
+#include <string.h>
+#if HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <fcntl.h>
+#include <va/va.h>
+#include <va/va_drm.h>
+#endif
+
 #include <mfx/mfxvideo.h>
 
 #include "libavutil/frame.h"
@@ -28,10 +43,12 @@
 #include "avcodec.h"
 
 #define QSV_VERSION_MAJOR 1
-#define QSV_VERSION_MINOR 1
+#define QSV_VERSION_MINOR 9
 
 #define ASYNC_DEPTH_DEFAULT 4       // internal parallelism
 
+#define QSV_MAX_ENC_PAYLOAD 2       // # of mfxEncodeCtrl payloads supported
+
 #define QSV_VERSION_ATLEAST(MAJOR, MINOR)   \
     (MFX_VERSION_MAJOR > (MAJOR) ||         \
      MFX_VERSION_MAJOR == (MAJOR) && MFX_VERSION_MINOR >= (MINOR))
@@ -39,6 +56,7 @@
 typedef struct QSVFrame {
     AVFrame *frame;
     mfxFrameSurface1 *surface;
+    mfxEncodeCtrl enc_ctrl;
 
     mfxFrameSurface1 surface_internal;
 
@@ -47,14 +65,23 @@ typedef struct QSVFrame {
     struct QSVFrame *next;
 } QSVFrame;
 
+typedef struct QSVSession {
+    mfxSession session;
+#ifdef AVCODEC_QSV_LINUX_SESSION_HANDLE
+    int        fd_display;
+    VADisplay  va_display;
+#endif
+} QSVSession;
+
 /**
- * Convert a libmfx error code into a libav error code.
+ * Convert a libmfx error code into an FFmpeg error code.
  */
 int ff_qsv_error(int mfx_err);
 
 int ff_qsv_codec_id_to_mfx(enum AVCodecID codec_id);
 
-int ff_qsv_init_internal_session(AVCodecContext *avctx, mfxSession *session,
+int ff_qsv_init_internal_session(AVCodecContext *avctx, QSVSession *qs,
                                  const char *load_plugins);
+int ff_qsv_close_internal_session(QSVSession *qs);
 
 #endif /* AVCODEC_QSV_INTERNAL_H */
diff --git a/libavcodec/qsvdec.c b/libavcodec/qsvdec.c
index 1d59e72..c17606d 100644
--- a/libavcodec/qsvdec.c
+++ b/libavcodec/qsvdec.c
@@ -4,20 +4,20 @@
  * copyright (c) 2013 Luca Barbato
  * copyright (c) 2015 Anton Khirnov <anton@khirnov.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,73 +49,117 @@ int ff_qsv_map_pixfmt(enum AVPixelFormat format)
     }
 }
 
-static int qsv_init_session(AVCodecContext *avctx, QSVContext *q, mfxSession session)
+static int qsv_decode_init(AVCodecContext *avctx, QSVContext *q, AVPacket *avpkt)
 {
-    if (!session) {
-        if (!q->internal_session) {
-            int ret = ff_qsv_init_internal_session(avctx, &q->internal_session,
-                                                   q->load_plugins);
-            if (ret < 0)
-                return ret;
-        }
+    mfxVideoParam param = { { 0 } };
+    mfxBitstream bs   = { { { 0 } } };
+    int ret;
+    enum AVPixelFormat pix_fmts[3] = { AV_PIX_FMT_QSV,
+                                       AV_PIX_FMT_NV12,
+                                       AV_PIX_FMT_NONE };
 
-        q->session = q->internal_session;
-    } else {
-        q->session = session;
-    }
+    ret = ff_get_format(avctx, pix_fmts);
+    if (ret < 0)
+        return ret;
 
-    /* make sure the decoder is uninitialized */
-    MFXVideoDECODE_Close(q->session);
+    avctx->pix_fmt      = ret;
 
-    return 0;
-}
+    q->iopattern  = MFX_IOPATTERN_OUT_SYSTEM_MEMORY;
+    if (avctx->hwaccel_context) {
+        AVQSVContext *qsv = avctx->hwaccel_context;
 
-static int qsv_decode_init(AVCodecContext *avctx, QSVContext *q, mfxSession session)
-{
-    mfxVideoParam param = { { 0 } };
-    int ret;
-
-    if (!q->async_fifo) {
-        q->async_fifo = av_fifo_alloc((1 + q->async_depth) *
-                                      (sizeof(mfxSyncPoint*) + sizeof(QSVFrame*)));
-        if (!q->async_fifo)
-            return AVERROR(ENOMEM);
+        q->session        = qsv->session;
+        q->iopattern      = qsv->iopattern;
+        q->ext_buffers    = qsv->ext_buffers;
+        q->nb_ext_buffers = qsv->nb_ext_buffers;
     }
+    if (!q->session) {
+        if (!q->internal_qs.session) {
+            ret = ff_qsv_init_internal_session(avctx, &q->internal_qs,
+                                               q->load_plugins);
+            if (ret < 0)
+                return ret;
+        }
 
-    ret = qsv_init_session(avctx, q, session);
-    if (ret < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error initializing an MFX session\n");
-        return ret;
+        q->session = q->internal_qs.session;
     }
 
+    if (avpkt->size) {
+        bs.Data       = avpkt->data;
+        bs.DataLength = avpkt->size;
+        bs.MaxLength  = bs.DataLength;
+        bs.TimeStamp  = avpkt->pts;
+    } else
+        return AVERROR_INVALIDDATA;
 
     ret = ff_qsv_codec_id_to_mfx(avctx->codec_id);
-    if (ret < 0)
+    if (ret < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Unsupported codec_id %08x\n", avctx->codec_id);
         return ret;
+    }
 
-    param.mfx.CodecId      = ret;
-    param.mfx.CodecProfile = avctx->profile;
-    param.mfx.CodecLevel   = avctx->level;
-
-    param.mfx.FrameInfo.BitDepthLuma   = 8;
-    param.mfx.FrameInfo.BitDepthChroma = 8;
-    param.mfx.FrameInfo.Shift          = 0;
-    param.mfx.FrameInfo.FourCC         = MFX_FOURCC_NV12;
-    param.mfx.FrameInfo.Width          = avctx->coded_width;
-    param.mfx.FrameInfo.Height         = avctx->coded_height;
-    param.mfx.FrameInfo.ChromaFormat   = MFX_CHROMAFORMAT_YUV420;
+    param.mfx.CodecId = ret;
 
+    ret = MFXVideoDECODE_DecodeHeader(q->session, &bs, &param);
+    if (MFX_ERR_MORE_DATA==ret) {
+        /* this code means that the header was not found, so we return the
+           packet size to skip the current packet
+         */
+        return avpkt->size;
+    } else if (ret < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Decode header error %d\n", ret);
+        return ff_qsv_error(ret);
+    }
     param.IOPattern   = q->iopattern;
     param.AsyncDepth  = q->async_depth;
     param.ExtParam    = q->ext_buffers;
     param.NumExtParam = q->nb_ext_buffers;
+    param.mfx.FrameInfo.BitDepthLuma   = 8;
+    param.mfx.FrameInfo.BitDepthChroma = 8;
 
     ret = MFXVideoDECODE_Init(q->session, &param);
     if (ret < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error initializing the MFX video decoder\n");
+        if (MFX_ERR_INVALID_VIDEO_PARAM==ret) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Error initializing the MFX video decoder, unsupported video\n");
+        } else {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Error initializing the MFX video decoder %d\n", ret);
+        }
         return ff_qsv_error(ret);
     }
 
+    avctx->profile      = param.mfx.CodecProfile;
+    avctx->level        = param.mfx.CodecLevel;
+    avctx->coded_width  = param.mfx.FrameInfo.Width;
+    avctx->coded_height = param.mfx.FrameInfo.Height;
+    avctx->width        = param.mfx.FrameInfo.CropW - param.mfx.FrameInfo.CropX;
+    avctx->height       = param.mfx.FrameInfo.CropH - param.mfx.FrameInfo.CropY;
+
+    /* maximum decoder latency should not exceed the max DPB size for H.264 and
+       HEVC, which is 16 in both cases.
+       So we are pre-allocating a fifo big enough for 17 elements:
+     */
+    if (!q->async_fifo) {
+        q->async_fifo = av_fifo_alloc((1 + 16) *
+                                      (sizeof(mfxSyncPoint*) + sizeof(QSVFrame*)));
+        if (!q->async_fifo)
+            return AVERROR(ENOMEM);
+    }
+
+    if (!q->input_fifo) {
+        q->input_fifo = av_fifo_alloc(1024*16);
+        if (!q->input_fifo)
+            return AVERROR(ENOMEM);
+    }
+
+    if (!q->pkt_fifo) {
+        q->pkt_fifo = av_fifo_alloc( sizeof(AVPacket) * (1 + 16) );
+        if (!q->pkt_fifo)
+            return AVERROR(ENOMEM);
+    }
+    q->engine_ready = 1;
+
     return 0;
 }
 
@@ -211,9 +255,73 @@ static QSVFrame *find_frame(QSVContext *q, mfxFrameSurface1 *surf)
     return NULL;
 }
 
-static int qsv_decode(AVCodecContext *avctx, QSVContext *q,
-                      AVFrame *frame, int *got_frame,
-                      AVPacket *avpkt)
+/*  This function is used for 'smart' release of consumed data
+    from the input bitstream fifo.
+    Since the input fifo is mapped to an mfxBitstream, which does not understand
+    data wrapping around the fifo end, we must also relocate any remaining
+    data to the fifo start. If no data remains, we just reset the fifo's
+    pointers to their initial positions.
+    NOTE: the case where the fifo still contains unconsumed data is rare, and
+    the typical amount of such data is 1..4 bytes.
+*/
+static void qsv_fifo_relocate(AVFifoBuffer *f, int bytes_to_free)
+{
+    int data_size;
+    int data_rest = 0;
+
+    av_fifo_drain(f, bytes_to_free);
+
+    data_size = av_fifo_size(f);
+    if (data_size > 0) {
+        if (f->buffer!=f->rptr) {
+            if ( (f->end - f->rptr) < data_size) {
+                data_rest = data_size - (f->end - f->rptr);
+                data_size-=data_rest;
+                memmove(f->buffer+data_size, f->buffer, data_rest);
+            }
+            memmove(f->buffer, f->rptr, data_size);
+            data_size+= data_rest;
+        }
+    }
+    f->rptr = f->buffer;
+    f->wptr = f->buffer + data_size;
+    f->wndx = data_size;
+    f->rndx = 0;
+}
+
+
+static void close_decoder(QSVContext *q)
+{
+    QSVFrame *cur;
+
+    if (q->session)
+        MFXVideoDECODE_Close(q->session);
+
+    while (q->async_fifo && av_fifo_size(q->async_fifo)) {
+        QSVFrame *out_frame;
+        mfxSyncPoint *sync;
+
+        av_fifo_generic_read(q->async_fifo, &out_frame, sizeof(out_frame), NULL);
+        av_fifo_generic_read(q->async_fifo, &sync,      sizeof(sync),      NULL);
+
+        av_freep(&sync);
+    }
+
+    cur = q->work_frames;
+    while (cur) {
+        q->work_frames = cur->next;
+        av_frame_free(&cur->frame);
+        av_freep(&cur);
+        cur = q->work_frames;
+    }
+
+    q->engine_ready   = 0;
+    q->reinit_pending = 0;
+}
+
+static int do_qsv_decode(AVCodecContext *avctx, QSVContext *q,
+                  AVFrame *frame, int *got_frame,
+                  AVPacket *avpkt)
 {
     QSVFrame *out_frame;
     mfxFrameSurface1 *insurf;
@@ -221,10 +329,32 @@ static int qsv_decode(AVCodecContext *avctx, QSVContext *q,
     mfxSyncPoint *sync;
     mfxBitstream bs = { { { 0 } } };
     int ret;
+    int n_out_frames;
+    int buffered = 0;
+    int flush    = !avpkt->size || q->reinit_pending;
 
-    if (avpkt->size) {
-        bs.Data       = avpkt->data;
-        bs.DataLength = avpkt->size;
+    if (!q->engine_ready) {
+        ret = qsv_decode_init(avctx, q, avpkt);
+        if (ret)
+            return ret;
+    }
+
+    if (!flush) {
+        if (av_fifo_size(q->input_fifo)) {
+            /* the rest of the previous packet is still in the buffer */
+            if (av_fifo_space(q->input_fifo) < avpkt->size) {
+                ret = av_fifo_grow(q->input_fifo, avpkt->size);
+                if (ret < 0)
+                    return ret;
+            }
+            av_fifo_generic_write(q->input_fifo, avpkt->data, avpkt->size, NULL);
+            bs.Data       = q->input_fifo->rptr;
+            bs.DataLength = av_fifo_size(q->input_fifo);
+            buffered = 1;
+        } else {
+            bs.Data       = avpkt->data;
+            bs.DataLength = avpkt->size;
+        }
         bs.MaxLength  = bs.DataLength;
         bs.TimeStamp  = avpkt->pts;
     }
@@ -235,53 +365,71 @@ static int qsv_decode(AVCodecContext *avctx, QSVContext *q,
         return AVERROR(ENOMEM);
     }
 
-    do {
+    while (1) {
         ret = get_surface(avctx, q, &insurf);
         if (ret < 0)
             return ret;
+        do {
+            ret = MFXVideoDECODE_DecodeFrameAsync(q->session, flush ? NULL : &bs,
+                                                  insurf, &outsurf, sync);
+            if (ret != MFX_WRN_DEVICE_BUSY)
+                break;
+            av_usleep(500);
+        } while (1);
+
+        if (MFX_WRN_VIDEO_PARAM_CHANGED==ret) {
+            /* TODO: handle minor sequence header changes here */
+        } else if (MFX_ERR_INCOMPATIBLE_VIDEO_PARAM==ret) {
+            av_fifo_reset(q->input_fifo);
+            flush = q->reinit_pending = 1;
+            continue;
+        }
 
-        ret = MFXVideoDECODE_DecodeFrameAsync(q->session, avpkt->size ? &bs : NULL,
-                                              insurf, &outsurf, sync);
-        if (ret == MFX_WRN_DEVICE_BUSY)
-            av_usleep(1);
+        if (*sync) {
+            QSVFrame *out_frame = find_frame(q, outsurf);
 
-    } while (ret == MFX_WRN_DEVICE_BUSY || ret == MFX_ERR_MORE_SURFACE);
+            if (!out_frame) {
+                av_freep(&sync);
+                av_log(avctx, AV_LOG_ERROR,
+                       "The returned surface does not correspond to any frame\n");
+                return AVERROR_BUG;
+            }
 
-    if (ret != MFX_ERR_NONE &&
-        ret != MFX_ERR_MORE_DATA &&
-        ret != MFX_WRN_VIDEO_PARAM_CHANGED &&
-        ret != MFX_ERR_MORE_SURFACE) {
-        av_log(avctx, AV_LOG_ERROR, "Error during QSV decoding.\n");
-        av_freep(&sync);
-        return ff_qsv_error(ret);
+            out_frame->queued = 1;
+            av_fifo_generic_write(q->async_fifo, &out_frame, sizeof(out_frame), NULL);
+            av_fifo_generic_write(q->async_fifo, &sync,      sizeof(sync),      NULL);
+
+            continue;
+        } else {
+            av_freep(&sync);
+        }
+        if (MFX_ERR_MORE_SURFACE != ret && ret < 0)
+            break;
     }
 
     /* make sure we do not enter an infinite loop if the SDK
      * did not consume any data and did not return anything */
-    if (!*sync && !bs.DataOffset) {
+    if (!*sync && !bs.DataOffset && !flush) {
         av_log(avctx, AV_LOG_WARNING, "A decode call did not consume any data\n");
         bs.DataOffset = avpkt->size;
     }
 
-    if (*sync) {
-        QSVFrame *out_frame = find_frame(q, outsurf);
-
-        if (!out_frame) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "The returned surface does not correspond to any frame\n");
-            av_freep(&sync);
-            return AVERROR_BUG;
-        }
+    if (buffered) {
+        qsv_fifo_relocate(q->input_fifo, bs.DataOffset);
+    } else if (bs.DataOffset!=avpkt->size) {
+        /* some packet data was not consumed; store it in the local buffer */
+        av_fifo_generic_write(q->input_fifo, avpkt->data+bs.DataOffset,
+                              avpkt->size - bs.DataOffset, NULL);
+    }
 
-        out_frame->queued = 1;
-        av_fifo_generic_write(q->async_fifo, &out_frame, sizeof(out_frame), NULL);
-        av_fifo_generic_write(q->async_fifo, &sync,      sizeof(sync),      NULL);
-    } else {
+    if (MFX_ERR_MORE_DATA!=ret && ret < 0) {
         av_freep(&sync);
+        av_log(avctx, AV_LOG_ERROR, "Error %d during QSV decoding.\n", ret);
+        return ff_qsv_error(ret);
     }
+    n_out_frames = av_fifo_size(q->async_fifo) / (sizeof(out_frame)+sizeof(sync));
 
-    if (!av_fifo_space(q->async_fifo) ||
-        (!avpkt->size && av_fifo_size(q->async_fifo))) {
+    if (n_out_frames > q->async_depth || (flush && n_out_frames) ) {
         AVFrame *src_frame;
 
         av_fifo_generic_read(q->async_fifo, &out_frame, sizeof(out_frame), NULL);
@@ -316,140 +464,147 @@ static int qsv_decode(AVCodecContext *avctx, QSVContext *q,
         *got_frame = 1;
     }
 
-    return bs.DataOffset;
+    return avpkt->size;
 }
-
-int ff_qsv_decode_close(QSVContext *q)
+/*
+ This function inserts a packet at fifo front.
+*/
+static void qsv_packet_push_front(QSVContext *q, AVPacket *avpkt)
 {
-    QSVFrame *cur = q->work_frames;
-
-    if (q->session)
-        MFXVideoDECODE_Close(q->session);
-
-    while (q->async_fifo && av_fifo_size(q->async_fifo)) {
-        QSVFrame *out_frame;
-        mfxSyncPoint *sync;
+    int fifo_size = av_fifo_size(q->pkt_fifo);
+    if (!fifo_size) {
+    /* easy case fifo is empty */
+        av_fifo_generic_write(q->pkt_fifo, avpkt, sizeof(*avpkt), NULL);
+    } else {
+    /* realloc necessary */
+        AVPacket pkt;
+        AVFifoBuffer *fifo = av_fifo_alloc(fifo_size+av_fifo_space(q->pkt_fifo));
 
-        av_fifo_generic_read(q->async_fifo, &out_frame, sizeof(out_frame), NULL);
-        av_fifo_generic_read(q->async_fifo, &sync,      sizeof(sync),      NULL);
+        av_fifo_generic_write(fifo, avpkt, sizeof(*avpkt), NULL);
 
-        av_freep(&sync);
+        while (av_fifo_size(q->pkt_fifo)) {
+            av_fifo_generic_read(q->pkt_fifo, &pkt, sizeof(pkt), NULL);
+            av_fifo_generic_write(fifo,       &pkt, sizeof(pkt), NULL);
+        }
+        av_fifo_free(q->pkt_fifo);
+        q->pkt_fifo = fifo;
     }
+}
+int ff_qsv_decode(AVCodecContext *avctx, QSVContext *q,
+                  AVFrame *frame, int *got_frame,
+                  AVPacket *avpkt)
+{
+    AVPacket pkt_ref = { 0 };
+    int ret = 0;
 
-    while (cur) {
-        q->work_frames = cur->next;
-        av_frame_free(&cur->frame);
-        av_freep(&cur);
-        cur = q->work_frames;
+    if (q->pkt_fifo && av_fifo_size(q->pkt_fifo) >= sizeof(AVPacket)) {
+        /* we already have some buffered packets, so add the new one to the tail */
+        ret = av_packet_ref(&pkt_ref, avpkt);
+        if (ret < 0)
+            return ret;
+        av_fifo_generic_write(q->pkt_fifo, &pkt_ref, sizeof(pkt_ref), NULL);
     }
+    if (q->reinit_pending) {
+        ret = do_qsv_decode(avctx, q, frame, got_frame, avpkt);
 
-    av_fifo_free(q->async_fifo);
-    q->async_fifo = NULL;
-
-    av_parser_close(q->parser);
-    avcodec_free_context(&q->avctx_internal);
-
-    if (q->internal_session)
-        MFXClose(q->internal_session);
+        if (!*got_frame) {
+            /* Flushing complete, no more frames  */
+            close_decoder(q);
+            //return ff_qsv_decode(avctx, q, frame, got_frame, avpkt);
+        }
+    }
+    if (!q->reinit_pending) {
+        if (q->pkt_fifo && av_fifo_size(q->pkt_fifo) >= sizeof(AVPacket)) {
+            /* process buffered packets */
+            while (!*got_frame && av_fifo_size(q->pkt_fifo) >= sizeof(AVPacket)) {
+                av_fifo_generic_read(q->pkt_fifo, &pkt_ref, sizeof(pkt_ref), NULL);
+                ret = do_qsv_decode(avctx, q, frame, got_frame, &pkt_ref);
+                if (q->reinit_pending) {
+                    /*
+                       A rare case: a new reinit became pending while buffered packets exist.
+                       We should return pkt_ref to the same place in the fifo
+                    */
+                    qsv_packet_push_front(q, &pkt_ref);
+                } else {
+                    av_packet_unref(&pkt_ref);
+                }
+           }
+        } else {
+            /* general decoding */
+            ret = do_qsv_decode(avctx, q, frame, got_frame, avpkt);
+            if (q->reinit_pending) {
+                ret = av_packet_ref(&pkt_ref, avpkt);
+                if (ret < 0)
+                    return ret;
+                av_fifo_generic_write(q->pkt_fifo, &pkt_ref, sizeof(pkt_ref), NULL);
+            }
+        }
+    }
 
-    return 0;
+    return ret;
 }
-
-int ff_qsv_process_data(AVCodecContext *avctx, QSVContext *q,
-                        AVFrame *frame, int *got_frame, AVPacket *pkt)
+/*
+ This function resets the decoder and the corresponding buffers before a seek operation
+*/
+void ff_qsv_decode_reset(AVCodecContext *avctx, QSVContext *q)
 {
-    uint8_t *dummy_data;
-    int dummy_size;
-    int ret;
+    QSVFrame *cur;
+    AVPacket pkt;
+    int ret = 0;
+    mfxVideoParam param = { { 0 } };
 
-    if (!q->avctx_internal) {
-        q->avctx_internal = avcodec_alloc_context3(NULL);
-        if (!q->avctx_internal)
-            return AVERROR(ENOMEM);
+    if (q->reinit_pending) {
+        close_decoder(q);
+    } else if (q->engine_ready) {
+        ret = MFXVideoDECODE_GetVideoParam(q->session, &param);
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_ERROR, "MFX decode get param error %d\n", ret);
+        }
 
-        if (avctx->extradata) {
-            q->avctx_internal->extradata = av_mallocz(avctx->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE);
-            if (!q->avctx_internal->extradata)
-                return AVERROR(ENOMEM);
+        ret = MFXVideoDECODE_Reset(q->session, &param);
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_ERROR, "MFX decode reset error %d\n", ret);
+        }
 
-            memcpy(q->avctx_internal->extradata, avctx->extradata,
-                   avctx->extradata_size);
-            q->avctx_internal->extradata_size = avctx->extradata_size;
+        /* Free all frames*/
+        cur = q->work_frames;
+        while (cur) {
+            q->work_frames = cur->next;
+            av_frame_free(&cur->frame);
+            av_freep(&cur);
+            cur = q->work_frames;
         }
+    }
 
-        q->parser = av_parser_init(avctx->codec_id);
-        if (!q->parser)
-            return AVERROR(ENOMEM);
+    /* Reset output surfaces */
+    av_fifo_reset(q->async_fifo);
 
-        q->parser->flags |= PARSER_FLAG_COMPLETE_FRAMES;
-        q->orig_pix_fmt   = AV_PIX_FMT_NONE;
+    /* Reset input packets fifo */
+    while (av_fifo_size(q->pkt_fifo)) {
+        av_fifo_generic_read(q->pkt_fifo, &pkt, sizeof(pkt), NULL);
+        av_packet_unref(&pkt);
     }
 
-    if (!pkt->size)
-        return qsv_decode(avctx, q, frame, got_frame, pkt);
-
-    /* we assume the packets are already split properly and want
-     * just the codec parameters here */
-    av_parser_parse2(q->parser, q->avctx_internal,
-                     &dummy_data, &dummy_size,
-                     pkt->data, pkt->size, pkt->pts, pkt->dts,
-                     pkt->pos);
-
-    /* TODO: flush delayed frames on reinit */
-    if (q->parser->format       != q->orig_pix_fmt    ||
-        q->parser->coded_width  != avctx->coded_width ||
-        q->parser->coded_height != avctx->coded_height) {
-        mfxSession session = NULL;
-
-        enum AVPixelFormat pix_fmts[3] = { AV_PIX_FMT_QSV,
-                                           AV_PIX_FMT_NONE,
-                                           AV_PIX_FMT_NONE };
-        enum AVPixelFormat qsv_format;
-
-        qsv_format = ff_qsv_map_pixfmt(q->parser->format);
-        if (qsv_format < 0) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Only 8-bit YUV420 streams are supported.\n");
-            ret = AVERROR(ENOSYS);
-            goto reinit_fail;
-        }
-
-        q->orig_pix_fmt     = q->parser->format;
-        avctx->pix_fmt      = pix_fmts[1] = qsv_format;
-        avctx->width        = q->parser->width;
-        avctx->height       = q->parser->height;
-        avctx->coded_width  = q->parser->coded_width;
-        avctx->coded_height = q->parser->coded_height;
-        avctx->level        = q->avctx_internal->level;
-        avctx->profile      = q->avctx_internal->profile;
+    /* Reset input bitstream fifo */
+    av_fifo_reset(q->input_fifo);
+}
 
-        ret = ff_get_format(avctx, pix_fmts);
-        if (ret < 0)
-            goto reinit_fail;
+int ff_qsv_decode_close(QSVContext *q)
+{
+    close_decoder(q);
 
-        avctx->pix_fmt = ret;
+    q->session = NULL;
 
-        if (avctx->hwaccel_context) {
-            AVQSVContext *user_ctx = avctx->hwaccel_context;
-            session           = user_ctx->session;
-            q->iopattern      = user_ctx->iopattern;
-            q->ext_buffers    = user_ctx->ext_buffers;
-            q->nb_ext_buffers = user_ctx->nb_ext_buffers;
-        }
+    ff_qsv_close_internal_session(&q->internal_qs);
 
-        ret = qsv_decode_init(avctx, q, session);
-        if (ret < 0)
-            goto reinit_fail;
-    }
+    av_fifo_free(q->async_fifo);
+    q->async_fifo = NULL;
 
-    return qsv_decode(avctx, q, frame, got_frame, pkt);
+    av_fifo_free(q->input_fifo);
+    q->input_fifo = NULL;
 
-reinit_fail:
-    q->orig_pix_fmt = q->parser->format = avctx->pix_fmt = AV_PIX_FMT_NONE;
-    return ret;
-}
+    av_fifo_free(q->pkt_fifo);
+    q->pkt_fifo = NULL;
 
-void ff_qsv_decode_flush(AVCodecContext *avctx, QSVContext *q)
-{
-    q->orig_pix_fmt = AV_PIX_FMT_NONE;
+    return 0;
 }
diff --git a/libavcodec/qsvdec.h b/libavcodec/qsvdec.h
index 698d8c8..97a3315 100644
--- a/libavcodec/qsvdec.h
+++ b/libavcodec/qsvdec.h
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2013 Luca Barbato
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -41,7 +41,7 @@ typedef struct QSVContext {
 
     // the session we allocated internally, in case the caller did not provide
     // one
-    mfxSession internal_session;
+    QSVSession internal_qs;
 
     /**
      * a linked list of frames currently being used by QSV
@@ -49,11 +49,22 @@ typedef struct QSVContext {
     QSVFrame *work_frames;
 
     AVFifoBuffer *async_fifo;
+    AVFifoBuffer *input_fifo;
 
-    // the internal parser and codec context for parsing the data
-    AVCodecParserContext *parser;
-    AVCodecContext *avctx_internal;
-    enum AVPixelFormat orig_pix_fmt;
+    // in some cases input packets have to be buffered,
+    // otherwise dynamic stream changes cannot be handled correctly;
+    // this fifo is used for that input packet buffering
+    AVFifoBuffer *pkt_fifo;
+
+    // this flag indicates that the header has been parsed and the
+    // decoder instance has been created and is ready for general decoding
+    int engine_ready;
+
+    // we cannot simply re-init the decoder when a different sequence header arrives:
+    // all buffered frames must be delivered first, and new packets cannot be decoded
+    // in the meantime. So while reinit_pending is non-zero we flush the decoder and
+    // accumulate newly arrived packets in pkt_fifo
+    int reinit_pending;
 
     // options set by the caller
     int async_depth;
@@ -67,10 +78,11 @@ typedef struct QSVContext {
 
 int ff_qsv_map_pixfmt(enum AVPixelFormat format);
 
-int ff_qsv_process_data(AVCodecContext *avctx, QSVContext *q,
-                        AVFrame *frame, int *got_frame, AVPacket *pkt);
+int ff_qsv_decode(AVCodecContext *s, QSVContext *q,
+                  AVFrame *frame, int *got_frame,
+                  AVPacket *avpkt);
 
-void ff_qsv_decode_flush(AVCodecContext *avctx, QSVContext *q);
+void ff_qsv_decode_reset(AVCodecContext *avctx, QSVContext *q);
 
 int ff_qsv_decode_close(QSVContext *q);
 
diff --git a/libavcodec/qsvdec_h2645.c b/libavcodec/qsvdec_h2645.c
index a65be99..fda827c 100644
--- a/libavcodec/qsvdec_h2645.c
+++ b/libavcodec/qsvdec_h2645.c
@@ -4,20 +4,20 @@
  * copyright (c) 2013 Luca Barbato
  * copyright (c) 2015 Anton Khirnov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,9 +33,7 @@
 
 #include "avcodec.h"
 #include "internal.h"
-#include "qsv_internal.h"
 #include "qsvdec.h"
-#include "qsv.h"
 
 enum LoadPlugin {
     LOAD_PLUGIN_NONE,
@@ -49,35 +47,17 @@ typedef struct QSVH2645Context {
     int load_plugin;
 
     // the filter for converting to Annex B
-    AVBSFContext *bsf;
+    AVBitStreamFilterContext *bsf;
 
-    AVFifoBuffer *packet_fifo;
-
-    AVPacket pkt_filtered;
 } QSVH2645Context;
 
-static void qsv_clear_buffers(QSVH2645Context *s)
-{
-    AVPacket pkt;
-    while (av_fifo_size(s->packet_fifo) >= sizeof(pkt)) {
-        av_fifo_generic_read(s->packet_fifo, &pkt, sizeof(pkt), NULL);
-        av_packet_unref(&pkt);
-    }
-
-    av_bsf_free(&s->bsf);
-
-    av_packet_unref(&s->pkt_filtered);
-}
-
 static av_cold int qsv_decode_close(AVCodecContext *avctx)
 {
     QSVH2645Context *s = avctx->priv_data;
 
     ff_qsv_decode_close(&s->qsv);
 
-    qsv_clear_buffers(s);
-
-    av_fifo_free(s->packet_fifo);
+    av_bitstream_filter_close(s->bsf);
 
     return 0;
 }
@@ -102,123 +82,65 @@ static av_cold int qsv_decode_init(AVCodecContext *avctx)
         }
     }
 
-    s->packet_fifo = av_fifo_alloc(sizeof(AVPacket));
-    if (!s->packet_fifo) {
+    if (avctx->codec_id == AV_CODEC_ID_H264) {
+        s->bsf = av_bitstream_filter_init("h264_mp4toannexb");
+        //regarding ticks_per_frame description, should be 2 for h.264:
+        avctx->ticks_per_frame = 2;
+    } else
+        s->bsf = av_bitstream_filter_init("hevc_mp4toannexb");
+    if (!s->bsf) {
         ret = AVERROR(ENOMEM);
         goto fail;
     }
 
-    s->qsv.iopattern = MFX_IOPATTERN_OUT_SYSTEM_MEMORY;
-
     return 0;
 fail:
     qsv_decode_close(avctx);
     return ret;
 }
 
-static int qsv_init_bsf(AVCodecContext *avctx, QSVH2645Context *s)
-{
-    const char *filter_name = avctx->codec_id == AV_CODEC_ID_HEVC ?
-                              "hevc_mp4toannexb" : "h264_mp4toannexb";
-    const AVBitStreamFilter *filter;
-    int ret;
-
-    if (s->bsf)
-        return 0;
-
-    filter = av_bsf_get_by_name(filter_name);
-    if (!filter)
-        return AVERROR_BUG;
-
-    ret = av_bsf_alloc(filter, &s->bsf);
-    if (ret < 0)
-        return ret;
-
-    ret = avcodec_parameters_from_context(s->bsf->par_in, avctx);
-    if (ret < 0)
-        return ret;
-
-    s->bsf->time_base_in = avctx->time_base;
-
-    ret = av_bsf_init(s->bsf);
-    if (ret < 0)
-        return ret;
-
-    return ret;
-}
-
 static int qsv_decode_frame(AVCodecContext *avctx, void *data,
                             int *got_frame, AVPacket *avpkt)
 {
     QSVH2645Context *s = avctx->priv_data;
     AVFrame *frame    = data;
     int ret;
+    uint8_t *p_filtered = NULL;
+    int      n_filtered = 0; /* a byte count filled in by the filter, not a pointer */
+    AVPacket pkt_filtered = { 0 };
 
-    /* make sure the bitstream filter is initialized */
-    ret = qsv_init_bsf(avctx, s);
-    if (ret < 0)
-        return ret;
-
-    /* buffer the input packet */
     if (avpkt->size) {
-        AVPacket input_ref = { 0 };
-
-        if (av_fifo_space(s->packet_fifo) < sizeof(input_ref)) {
-            ret = av_fifo_realloc2(s->packet_fifo,
-                                   av_fifo_size(s->packet_fifo) + sizeof(input_ref));
-            if (ret < 0)
-                return ret;
-        }
-
-        ret = av_packet_ref(&input_ref, avpkt);
-        if (ret < 0)
-            return ret;
-        av_fifo_generic_write(s->packet_fifo, &input_ref, sizeof(input_ref), NULL);
-    }
+        if (avpkt->size > 3 && !avpkt->data[0] &&
+            !avpkt->data[1] && !avpkt->data[2] && 1==avpkt->data[3]) {
+            /* packet starts with the 00 00 00 01 start code: decode it unfiltered */
+            return ff_qsv_decode(avctx, &s->qsv, frame, got_frame, avpkt);
 
-    /* process buffered data */
-    while (!*got_frame) {
-        /* prepare the input data -- convert to Annex B if needed */
-        if (s->pkt_filtered.size <= 0) {
-            AVPacket input_ref;
-
-            /* no more data */
-            if (av_fifo_size(s->packet_fifo) < sizeof(AVPacket))
-                return avpkt->size ? avpkt->size : ff_qsv_process_data(avctx, &s->qsv, frame, got_frame, avpkt);
-
-            av_packet_unref(&s->pkt_filtered);
-
-            av_fifo_generic_read(s->packet_fifo, &input_ref, sizeof(input_ref), NULL);
-            ret = av_bsf_send_packet(s->bsf, &input_ref);
-            if (ret < 0) {
-                av_packet_unref(&input_ref);
-                return ret;
+        } else {
+            /* no start code: run the filter; if it fails, fall through to the raw packet */
+            ret = av_bitstream_filter_filter(s->bsf, avctx, "private_spspps_buf",
+                                         &p_filtered, &n_filtered,
+                                         avpkt->data, avpkt->size, 0);
+            if (ret>=0) {
+                pkt_filtered.pts  = avpkt->pts;
+                pkt_filtered.data = p_filtered;
+                pkt_filtered.size = n_filtered;
+
+                ret = ff_qsv_decode(avctx, &s->qsv, frame, got_frame, &pkt_filtered);
+
+                if (p_filtered != avpkt->data) /* free only a buffer that is not the input */
+                    av_free(p_filtered);
+                return ret > 0 ? avpkt->size : ret; /* report the caller's packet size */
             }
-
-            ret = av_bsf_receive_packet(s->bsf, &s->pkt_filtered);
-            if (ret < 0)
-                av_packet_move_ref(&s->pkt_filtered, &input_ref);
-            else
-                av_packet_unref(&input_ref);
         }
-
-        ret = ff_qsv_process_data(avctx, &s->qsv, frame, got_frame, &s->pkt_filtered);
-        if (ret < 0)
-            return ret;
-
-        s->pkt_filtered.size -= ret;
-        s->pkt_filtered.data += ret;
     }
 
-    return avpkt->size;
+    return ff_qsv_decode(avctx, &s->qsv, frame, got_frame, avpkt);
 }
 
 static void qsv_decode_flush(AVCodecContext *avctx)
 {
     QSVH2645Context *s = avctx->priv_data;
-
-    qsv_clear_buffers(s);
-    ff_qsv_decode_flush(avctx, &s->qsv);
+    ff_qsv_decode_reset(avctx, &s->qsv);
 }
 
 #define OFFSET(x) offsetof(QSVH2645Context, x)
diff --git a/libavcodec/qsvdec_mpeg2.c b/libavcodec/qsvdec_mpeg2.c
index c319ac0..70ccbc5 100644
--- a/libavcodec/qsvdec_mpeg2.c
+++ b/libavcodec/qsvdec_mpeg2.c
@@ -1,91 +1,49 @@
 /*
- * Intel MediaSDK QSV based MPEG-2 decoder
+ * Intel MediaSDK QSV based MPEG-2 video decoder
  *
- * copyright (c) 2015 Anton Khirnov
+ * This file is part of FFmpeg.
  *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-
 #include <stdint.h>
 #include <string.h>
 
-#include <mfx/mfxvideo.h>
-
 #include "libavutil/common.h"
-#include "libavutil/fifo.h"
 #include "libavutil/opt.h"
 
 #include "avcodec.h"
-#include "internal.h"
-#include "qsv_internal.h"
 #include "qsvdec.h"
-#include "qsv.h"
 
 typedef struct QSVMPEG2Context {
     AVClass *class;
     QSVContext qsv;
-
-    AVFifoBuffer *packet_fifo;
-
-    AVPacket input_ref;
 } QSVMPEG2Context;
 
-static void qsv_clear_buffers(QSVMPEG2Context *s)
-{
-    AVPacket pkt;
-    while (av_fifo_size(s->packet_fifo) >= sizeof(pkt)) {
-        av_fifo_generic_read(s->packet_fifo, &pkt, sizeof(pkt), NULL);
-        av_packet_unref(&pkt);
-    }
-
-    av_packet_unref(&s->input_ref);
-}
-
 static av_cold int qsv_decode_close(AVCodecContext *avctx)
 {
     QSVMPEG2Context *s = avctx->priv_data;
 
     ff_qsv_decode_close(&s->qsv);
 
-    qsv_clear_buffers(s);
-
-    av_fifo_free(s->packet_fifo);
-
     return 0;
 }
 
 static av_cold int qsv_decode_init(AVCodecContext *avctx)
 {
-    QSVMPEG2Context *s = avctx->priv_data;
-    int ret;
-
-    s->packet_fifo = av_fifo_alloc(sizeof(AVPacket));
-    if (!s->packet_fifo) {
-        ret = AVERROR(ENOMEM);
-        goto fail;
-    }
-
-    s->qsv.iopattern = MFX_IOPATTERN_OUT_SYSTEM_MEMORY;
-
     return 0;
-fail:
-    qsv_decode_close(avctx);
-    return ret;
 }
 
 static int qsv_decode_frame(AVCodecContext *avctx, void *data,
@@ -93,53 +51,14 @@ static int qsv_decode_frame(AVCodecContext *avctx, void *data,
 {
     QSVMPEG2Context *s = avctx->priv_data;
     AVFrame *frame    = data;
-    int ret;
-
-    /* buffer the input packet */
-    if (avpkt->size) {
-        AVPacket input_ref = { 0 };
-
-        if (av_fifo_space(s->packet_fifo) < sizeof(input_ref)) {
-            ret = av_fifo_realloc2(s->packet_fifo,
-                                   av_fifo_size(s->packet_fifo) + sizeof(input_ref));
-            if (ret < 0)
-                return ret;
-        }
-
-        ret = av_packet_ref(&input_ref, avpkt);
-        if (ret < 0)
-            return ret;
-        av_fifo_generic_write(s->packet_fifo, &input_ref, sizeof(input_ref), NULL);
-    }
 
-    /* process buffered data */
-    while (!*got_frame) {
-        if (s->input_ref.size <= 0) {
-            /* no more data */
-            if (av_fifo_size(s->packet_fifo) < sizeof(AVPacket))
-                return avpkt->size ? avpkt->size : ff_qsv_process_data(avctx, &s->qsv, frame, got_frame, avpkt);
-
-            av_packet_unref(&s->input_ref);
-            av_fifo_generic_read(s->packet_fifo, &s->input_ref, sizeof(s->input_ref), NULL);
-        }
-
-        ret = ff_qsv_process_data(avctx, &s->qsv, frame, got_frame, &s->input_ref);
-        if (ret < 0)
-            return ret;
-
-        s->input_ref.size -= ret;
-        s->input_ref.data += ret;
-    }
-
-    return avpkt->size;
+    return ff_qsv_decode(avctx, &s->qsv, frame, got_frame, avpkt);
 }
 
 static void qsv_decode_flush(AVCodecContext *avctx)
 {
     QSVMPEG2Context *s = avctx->priv_data;
-
-    qsv_clear_buffers(s);
-    ff_qsv_decode_flush(avctx, &s->qsv);
+    ff_qsv_decode_reset(avctx, &s->qsv);
 }
 
 AVHWAccel ff_mpeg2_qsv_hwaccel = {
diff --git a/libavcodec/qsvdec_vc1.c b/libavcodec/qsvdec_vc1.c
new file mode 100644
index 0000000..fcf101f
--- /dev/null
+++ b/libavcodec/qsvdec_vc1.c
@@ -0,0 +1,97 @@
+/*
+ * Intel MediaSDK QSV based VC-1 video decoder
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+#include <string.h>
+
+#include "libavutil/common.h"
+#include "libavutil/fifo.h"
+#include "libavutil/opt.h"
+
+#include "avcodec.h"
+#include "qsvdec.h"
+
+typedef struct QSVVC1Context {
+    AVClass *class;
+    QSVContext qsv;
+} QSVVC1Context;
+
+
+/* close callback: forwards to ff_qsv_decode_close() and always returns 0 */
+static av_cold int qsv_decode_close(AVCodecContext *avctx)
+{
+    QSVVC1Context *vc1 = avctx->priv_data;
+
+    ff_qsv_decode_close(&vc1->qsv);
+    return 0;
+}
+
+/* decode callback: passes the packet to ff_qsv_decode(), with data as the output AVFrame */
+static int qsv_decode_frame(AVCodecContext *avctx, void *data,
+                            int *got_frame, AVPacket *avpkt)
+{
+    QSVVC1Context *vc1 = avctx->priv_data;
+
+    return ff_qsv_decode(avctx, &vc1->qsv, (AVFrame *)data, got_frame, avpkt);
+}
+
+/* flush callback: forwards to ff_qsv_decode_reset() for this codec's QSVContext */
+static void qsv_decode_flush(AVCodecContext *avctx)
+{
+    ff_qsv_decode_reset(avctx, &((QSVVC1Context *)avctx->priv_data)->qsv);
+}
+
+AVHWAccel ff_vc1_qsv_hwaccel = {
+    .name           = "vc1_qsv",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_VC1,
+    .pix_fmt        = AV_PIX_FMT_QSV,
+};
+
+#define OFFSET(x) offsetof(QSVVC1Context, x)
+#define VD AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM
+static const AVOption options[] = {
+    { "async_depth", "Internal parallelization depth, the higher the value the higher the latency.", OFFSET(qsv.async_depth), AV_OPT_TYPE_INT, { .i64 = ASYNC_DEPTH_DEFAULT }, 0, INT_MAX, VD },
+    { NULL },
+};
+
+static const AVClass class = {
+    .class_name = "vc1_qsv",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_vc1_qsv_decoder = {
+    .name           = "vc1_qsv",
+    .long_name      = NULL_IF_CONFIG_SMALL("VC-1 video (Intel Quick Sync Video acceleration)"),
+    .priv_data_size = sizeof(QSVVC1Context),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_VC1,
+    .init           = NULL,
+    .decode         = qsv_decode_frame,
+    .flush          = qsv_decode_flush,
+    .close          = qsv_decode_close,
+    .capabilities   = AV_CODEC_CAP_DELAY,
+    .priv_class     = &class,
+    .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_NV12,
+                                                    AV_PIX_FMT_QSV,
+                                                    AV_PIX_FMT_NONE },
+};
diff --git a/libavcodec/qsvenc.c b/libavcodec/qsvenc.c
index 473a180..f56cb61 100644
--- a/libavcodec/qsvenc.c
+++ b/libavcodec/qsvenc.c
@@ -4,20 +4,20 @@
  * copyright (c) 2013 Yukinori Yamazoe
  * copyright (c) 2015 Anton Khirnov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,6 +30,7 @@
 #include "libavutil/log.h"
 #include "libavutil/time.h"
 #include "libavutil/imgutils.h"
+#include "libavcodec/bytestream.h"
 
 #include "avcodec.h"
 #include "internal.h"
@@ -271,7 +272,7 @@ static int select_rc_mode(AVCodecContext *avctx, QSVEncContext *q)
     const char *rc_desc;
     mfxU16      rc_mode;
 
-    int want_la     = q->la_depth >= 0;
+    int want_la     = q->look_ahead;
     int want_qscale = !!(avctx->flags & AV_CODEC_FLAG_QSCALE);
     int want_vcm    = q->vcm;
 
@@ -379,18 +380,30 @@ static int init_video_param(AVCodecContext *avctx, QSVEncContext *q)
     q->param.mfx.BufferSizeInKB     = 0;
 
     q->param.mfx.FrameInfo.FourCC         = MFX_FOURCC_NV12;
-    q->param.mfx.FrameInfo.Width          = FFALIGN(avctx->width, q->width_align);
-    q->param.mfx.FrameInfo.Height         = FFALIGN(avctx->height, 32);
     q->param.mfx.FrameInfo.CropX          = 0;
     q->param.mfx.FrameInfo.CropY          = 0;
     q->param.mfx.FrameInfo.CropW          = avctx->width;
     q->param.mfx.FrameInfo.CropH          = avctx->height;
     q->param.mfx.FrameInfo.AspectRatioW   = avctx->sample_aspect_ratio.num;
     q->param.mfx.FrameInfo.AspectRatioH   = avctx->sample_aspect_ratio.den;
-    q->param.mfx.FrameInfo.PicStruct      = MFX_PICSTRUCT_PROGRESSIVE;
     q->param.mfx.FrameInfo.ChromaFormat   = MFX_CHROMAFORMAT_YUV420;
     q->param.mfx.FrameInfo.BitDepthLuma   = 8;
     q->param.mfx.FrameInfo.BitDepthChroma = 8;
+    q->param.mfx.FrameInfo.Width          = FFALIGN(avctx->width, q->width_align);
+
+    if (avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT) {
+       /* The exact field layout (TFF or BFF) is not important here,
+          it will be specified later during frame encoding. But it is important
+          to specify whether the frame is progressive or not, because the allowed
+          height alignment depends on this.
+        */
+        q->param.mfx.FrameInfo.PicStruct = MFX_PICSTRUCT_FIELD_TFF;
+        q->height_align = 32;
+    } else {
+        q->param.mfx.FrameInfo.PicStruct = MFX_PICSTRUCT_PROGRESSIVE;
+        q->height_align = 16;
+    }
+   q->param.mfx.FrameInfo.Height    = FFALIGN(avctx->height, q->height_align);
 
     if (avctx->framerate.den > 0 && avctx->framerate.num > 0) {
         q->param.mfx.FrameInfo.FrameRateExtN = avctx->framerate.num;
@@ -430,11 +443,11 @@ static int init_video_param(AVCodecContext *avctx, QSVEncContext *q)
 #if QSV_HAVE_LA
     case MFX_RATECONTROL_LA:
         q->param.mfx.TargetKbps  = avctx->bit_rate / 1000;
-        q->extco2.LookAheadDepth = q->la_depth;
+        q->extco2.LookAheadDepth = q->look_ahead_depth;
         break;
 #if QSV_HAVE_ICQ
     case MFX_RATECONTROL_LA_ICQ:
-        q->extco2.LookAheadDepth = q->la_depth;
+        q->extco2.LookAheadDepth = q->look_ahead_depth;
     case MFX_RATECONTROL_ICQ:
         q->param.mfx.ICQQuality  = avctx->global_quality;
         break;
@@ -456,6 +469,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
         q->extco.CAVLC = q->cavlc ? MFX_CODINGOPTION_ON
                                   : MFX_CODINGOPTION_UNKNOWN;
 
+        q->extco.PicTimingSEI         = q->pic_timing_sei ?
+                                        MFX_CODINGOPTION_ON : MFX_CODINGOPTION_UNKNOWN;
+
         if (q->rdo >= 0)
             q->extco.RateDistortionOpt = q->rdo > 0 ? MFX_CODINGOPTION_ON : MFX_CODINGOPTION_OFF;
 
@@ -519,6 +535,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
             q->extparam_internal[q->nb_extparam_internal++] = (mfxExtBuffer *)&q->extco2;
+
+#if QSV_VERSION_ATLEAST(1,8)
+            q->extco2.LookAheadDS           = q->look_ahead_downsampling;
+#endif
         }
 #endif
     }
@@ -676,18 +696,26 @@ int ff_qsv_enc_init(AVCodecContext *avctx, QSVEncContext *q)
     }
 
     if (!q->session) {
-        ret = ff_qsv_init_internal_session(avctx, &q->internal_session,
+        ret = ff_qsv_init_internal_session(avctx, &q->internal_qs,
                                            q->load_plugins);
         if (ret < 0)
             return ret;
 
-        q->session = q->internal_session;
+        q->session = q->internal_qs.session;
     }
 
     ret = init_video_param(avctx, q);
     if (ret < 0)
         return ret;
 
+    ret = MFXVideoENCODE_Query(q->session, &q->param,&q->param);
+    if (MFX_WRN_PARTIAL_ACCELERATION==ret) {
+        av_log(avctx, AV_LOG_WARNING, "Encoder will work with partial HW acceleration\n");
+    } else if (ret < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Error %d querying encoder params\n", ret);
+        return ff_qsv_error(ret);
+    }
+
     ret = MFXVideoENCODE_QueryIOSurf(q->session, &q->param, &q->req);
     if (ret < 0) {
         av_log(avctx, AV_LOG_ERROR, "Error querying the encoding parameters\n");
@@ -730,7 +758,9 @@ int ff_qsv_enc_init(AVCodecContext *avctx, QSVEncContext *q)
     }
 
     ret = MFXVideoENCODE_Init(q->session, &q->param);
-    if (ret < 0) {
+    if (MFX_WRN_PARTIAL_ACCELERATION==ret) {
+        av_log(avctx, AV_LOG_WARNING, "Encoder will work with partial HW acceleration\n");
+    } else if (ret < 0) {
         av_log(avctx, AV_LOG_ERROR, "Error initializing the encoder\n");
         return ff_qsv_error(ret);
     }
@@ -746,12 +776,24 @@ int ff_qsv_enc_init(AVCodecContext *avctx, QSVEncContext *q)
     return 0;
 }
 
+/* frees each attached payload (at most QSV_MAX_ENC_PAYLOAD) and zeroes NumPayload */
+static void free_encoder_ctrl_payloads(mfxEncodeCtrl* enc_ctrl)
+{
+    int idx = 0;
+    if (!enc_ctrl)
+        return;
+    while (idx < enc_ctrl->NumPayload && idx < QSV_MAX_ENC_PAYLOAD)
+        av_free(enc_ctrl->Payload[idx++]);
+    enc_ctrl->NumPayload = 0;
+}
+
 static void clear_unused_frames(QSVEncContext *q)
 {
     QSVFrame *cur = q->work_frames;
     while (cur) {
         if (cur->surface && !cur->surface->Data.Locked) {
             cur->surface = NULL;
+            free_encoder_ctrl_payloads(&cur->enc_ctrl);
             av_frame_unref(cur->frame);
         }
         cur = cur->next;
@@ -784,6 +826,12 @@ static int get_free_frame(QSVEncContext *q, QSVFrame **f)
         av_freep(&frame);
         return AVERROR(ENOMEM);
     }
+    frame->enc_ctrl.Payload = av_mallocz(sizeof(mfxPayload*) * QSV_MAX_ENC_PAYLOAD);
+    if (!frame->enc_ctrl.Payload) {
+        av_frame_free(&frame->frame); /* release the AVFrame this entry holds, if any */
+        av_freep(&frame);
+        return AVERROR(ENOMEM);
+    }
     *last = frame;
 
     *f = frame;
@@ -792,7 +839,7 @@ static int get_free_frame(QSVEncContext *q, QSVFrame **f)
 }
 
 static int submit_frame(QSVEncContext *q, const AVFrame *frame,
-                        mfxFrameSurface1 **surface)
+                        QSVFrame **new_frame)
 {
     QSVFrame *qf;
     int ret;
@@ -809,8 +856,9 @@ static int submit_frame(QSVEncContext *q, const AVFrame *frame,
         qf->surface = (mfxFrameSurface1*)qf->frame->data[3];
     } else {
         /* make a copy if the input is not padded as libmfx requires */
-        if (frame->height & 31 || frame->linesize[0] & (q->width_align - 1)) {
-            qf->frame->height = FFALIGN(frame->height, 32);
+        if (     frame->height & (q->height_align - 1) ||
+            frame->linesize[0] & (q->width_align - 1)) {
+            qf->frame->height = FFALIGN(frame->height, q->height_align);
             qf->frame->width  = FFALIGN(frame->width, q->width_align);
 
             ret = ff_get_buffer(q->avctx, qf->frame, AV_GET_BUFFER_FLAG_REF);
@@ -852,7 +900,7 @@ static int submit_frame(QSVEncContext *q, const AVFrame *frame,
 
     qf->surface->Data.TimeStamp = av_rescale_q(frame->pts, q->avctx->time_base, (AVRational){1, 90000});
 
-    *surface = qf->surface;
+    *new_frame = qf;
 
     return 0;
 }
@@ -876,16 +924,22 @@ static int encode_frame(AVCodecContext *avctx, QSVEncContext *q,
     mfxBitstream *bs;
 
     mfxFrameSurface1 *surf = NULL;
-    mfxSyncPoint *sync     = NULL;
+    mfxSyncPoint *sync      = NULL;
+    QSVFrame *qsv_frame = NULL;
+    mfxEncodeCtrl* enc_ctrl = NULL;
     int ret;
 
     if (frame) {
-        ret = submit_frame(q, frame, &surf);
+        ret = submit_frame(q, frame, &qsv_frame);
         if (ret < 0) {
             av_log(avctx, AV_LOG_ERROR, "Error submitting the frame for encoding.\n");
             return ret;
         }
     }
+    if (qsv_frame) {
+        surf = qsv_frame->surface;
+        enc_ctrl = &qsv_frame->enc_ctrl;
+    }
 
     ret = av_new_packet(&new_pkt, q->packet_size);
     if (ret < 0) {
@@ -901,6 +955,10 @@ static int encode_frame(AVCodecContext *avctx, QSVEncContext *q,
     bs->Data      = new_pkt.data;
     bs->MaxLength = new_pkt.size;
 
+    /* enc_ctrl stays NULL when no frame was submitted; then there is nothing to fill */
+    if (q->set_encode_ctrl_cb && enc_ctrl)
+        q->set_encode_ctrl_cb(avctx, frame, enc_ctrl);
+
     sync = av_mallocz(sizeof(*sync));
     if (!sync) {
         av_freep(&bs);
@@ -909,22 +967,31 @@ static int encode_frame(AVCodecContext *avctx, QSVEncContext *q,
     }
 
     do {
-        ret = MFXVideoENCODE_EncodeFrameAsync(q->session, NULL, surf, bs, sync);
-        if (ret == MFX_WRN_DEVICE_BUSY)
-            av_usleep(1);
-    } while (ret > 0);
+        ret = MFXVideoENCODE_EncodeFrameAsync(q->session, enc_ctrl, surf, bs, sync);
+        if (ret == MFX_WRN_DEVICE_BUSY) {
+            av_usleep(500);
+            continue;
+        }
+        break;
+    } while ( 1 );
 
     if (ret < 0) {
         av_packet_unref(&new_pkt);
         av_freep(&bs);
-        av_freep(&sync);
-        return (ret == MFX_ERR_MORE_DATA) ? 0 : ff_qsv_error(ret);
+        av_freep(&sync); /* never queued on this path, so it must be released here */
+        if (ret != MFX_ERR_MORE_DATA)
+            av_log(avctx, AV_LOG_ERROR, "EncodeFrameAsync returned %d\n", ret);
+        return (ret == MFX_ERR_MORE_DATA) ? 0 : ff_qsv_error(ret);
     }
 
-    if (ret == MFX_WRN_INCOMPATIBLE_VIDEO_PARAM && frame->interlaced_frame)
-        print_interlace_msg(avctx, q);
-
-    if (*sync) {
+    if (ret == MFX_WRN_INCOMPATIBLE_VIDEO_PARAM) {
+        if (frame->interlaced_frame)
+            print_interlace_msg(avctx, q);
+        else
+            av_log(avctx, AV_LOG_WARNING,
+                   "EncodeFrameAsync returned 'incompatible param' code\n");
+    }
+    if (*sync) { /* sync itself is always non-NULL here; test the sync point it holds */
         av_fifo_generic_write(q->async_fifo, &new_pkt, sizeof(new_pkt), NULL);
         av_fifo_generic_write(q->async_fifo, &sync,    sizeof(sync),    NULL);
         av_fifo_generic_write(q->async_fifo, &bs,      sizeof(bs),    NULL);
@@ -1012,15 +1079,15 @@ int ff_qsv_enc_close(AVCodecContext *avctx, QSVEncContext *q)
 
     if (q->session)
         MFXVideoENCODE_Close(q->session);
-    if (q->internal_session)
-        MFXClose(q->internal_session);
-    q->session          = NULL;
-    q->internal_session = NULL;
+    q->session = NULL;
+
+    ff_qsv_close_internal_session(&q->internal_qs);
 
     cur = q->work_frames;
     while (cur) {
         q->work_frames = cur->next;
         av_frame_free(&cur->frame);
+        av_free(cur->enc_ctrl.Payload);
         av_freep(&cur);
         cur = q->work_frames;
     }
diff --git a/libavcodec/qsvenc.h b/libavcodec/qsvenc.h
index 719e4ec..2d7bd32 100644
--- a/libavcodec/qsvenc.h
+++ b/libavcodec/qsvenc.h
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2013 Yukinori Yamazoe
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -48,36 +48,42 @@
 #define QSV_HAVE_QVBR   QSV_VERSION_ATLEAST(1, 11)
 
 #define QSV_COMMON_OPTS \
-{ "async_depth", "Maximum processing parallelism", OFFSET(qsv.async_depth), AV_OPT_TYPE_INT, { .i64 = ASYNC_DEPTH_DEFAULT }, 0, INT_MAX, VE },  \
-{ "avbr_accuracy",    "Accuracy of the AVBR ratecontrol",    OFFSET(qsv.avbr_accuracy),    AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },     \
-{ "avbr_convergence", "Convergence of the AVBR ratecontrol", OFFSET(qsv.avbr_convergence), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },     \
-{ "preset", NULL, OFFSET(qsv.preset), AV_OPT_TYPE_INT, { .i64 = MFX_TARGETUSAGE_BALANCED }, 0, 7,   VE, "preset" },                             \
-{ "fast",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_BEST_SPEED  },   INT_MIN, INT_MAX, VE, "preset" },                             \
-{ "medium", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_BALANCED  },     INT_MIN, INT_MAX, VE, "preset" },                             \
-{ "slow",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_BEST_QUALITY  }, INT_MIN, INT_MAX, VE, "preset" },                             \
-{ "la_depth", "Number of frames to analyze before encoding.", OFFSET(qsv.la_depth), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, INT16_MAX, VE },        \
-{ "vcm",      "Use the video conferencing mode ratecontrol",  OFFSET(qsv.vcm),      AV_OPT_TYPE_INT, { .i64 = 0  },  0, 1,         VE },        \
-{ "rdo",            "Enable rate distortion optimization",    OFFSET(qsv.rdo),            AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE }, \
-{ "max_frame_size", "Maximum encoded frame size in bytes",    OFFSET(qsv.max_frame_size), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, UINT16_MAX, VE }, \
-{ "max_slice_size", "Maximum encoded slice size in bytes",    OFFSET(qsv.max_slice_size), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, UINT16_MAX, VE }, \
-{ "bitrate_limit",  "Toggle bitrate limitations",             OFFSET(qsv.bitrate_limit),  AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE }, \
-{ "mbbrc",          "MB level bitrate control",               OFFSET(qsv.mbbrc),          AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE }, \
-{ "extbrc",         "Extended bitrate control",               OFFSET(qsv.extbrc),         AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE }, \
-{ "adaptive_i",     "Adaptive I-frame placement",             OFFSET(qsv.adaptive_i),     AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE }, \
-{ "adaptive_b",     "Adaptive B-frame placement",             OFFSET(qsv.adaptive_b),     AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE }, \
-{ "b_strategy",     "Strategy to choose between I/P/B-frames", OFFSET(qsv.b_strategy),    AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE }, \
-{ "cavlc",          "Enable CAVLC",                           OFFSET(qsv.cavlc),          AV_OPT_TYPE_INT, { .i64 = 0 },   0,          1, VE }, \
-
+{ "async_depth", "Maximum processing parallelism", OFFSET(qsv.async_depth), AV_OPT_TYPE_INT, { .i64 = ASYNC_DEPTH_DEFAULT }, 0, INT_MAX, VE },                          \
+{ "avbr_accuracy",    "Accuracy of the AVBR ratecontrol",    OFFSET(qsv.avbr_accuracy),    AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },                             \
+{ "avbr_convergence", "Convergence of the AVBR ratecontrol", OFFSET(qsv.avbr_convergence), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },                             \
+{ "preset", NULL, OFFSET(qsv.preset), AV_OPT_TYPE_INT, { .i64 = MFX_TARGETUSAGE_BALANCED }, MFX_TARGETUSAGE_BEST_QUALITY, MFX_TARGETUSAGE_BEST_SPEED,   VE, "preset" }, \
+{ "veryfast",    NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_BEST_SPEED  },   INT_MIN, INT_MAX, VE, "preset" },                                                \
+{ "faster",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_6  },            INT_MIN, INT_MAX, VE, "preset" },                                                \
+{ "fast",        NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_5  },            INT_MIN, INT_MAX, VE, "preset" },                                                \
+{ "medium",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_BALANCED  },     INT_MIN, INT_MAX, VE, "preset" },                                                \
+{ "slow",        NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_3  },            INT_MIN, INT_MAX, VE, "preset" },                                                \
+{ "slower",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_2  },            INT_MIN, INT_MAX, VE, "preset" },                                                \
+{ "veryslow",    NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_TARGETUSAGE_BEST_QUALITY  }, INT_MIN, INT_MAX, VE, "preset" },                                                \
+{ "vcm",      "Use the video conferencing mode ratecontrol",  OFFSET(qsv.vcm),      AV_OPT_TYPE_INT, { .i64 = 0  },  0, 1,         VE },                                \
+{ "rdo",            "Enable rate distortion optimization",    OFFSET(qsv.rdo),            AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE },                         \
+{ "max_frame_size", "Maximum encoded frame size in bytes",    OFFSET(qsv.max_frame_size), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, UINT16_MAX, VE },                         \
+{ "max_slice_size", "Maximum encoded slice size in bytes",    OFFSET(qsv.max_slice_size), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, UINT16_MAX, VE },                         \
+{ "bitrate_limit",  "Toggle bitrate limitations",             OFFSET(qsv.bitrate_limit),  AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE },                         \
+{ "mbbrc",          "MB level bitrate control",               OFFSET(qsv.mbbrc),          AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE },                         \
+{ "extbrc",         "Extended bitrate control",               OFFSET(qsv.extbrc),         AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE },                         \
+{ "adaptive_i",     "Adaptive I-frame placement",             OFFSET(qsv.adaptive_i),     AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE },                         \
+{ "adaptive_b",     "Adaptive B-frame placement",             OFFSET(qsv.adaptive_b),     AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE },                         \
+{ "b_strategy",     "Strategy to choose between I/P/B-frames", OFFSET(qsv.b_strategy),    AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE },                         \
+{ "cavlc",          "Enable CAVLC",                           OFFSET(qsv.cavlc),          AV_OPT_TYPE_INT, { .i64 = 0 },   0,          1, VE },                         \
+
+typedef int SetEncodeCtrlCB (AVCodecContext *avctx,
+                             const AVFrame *frame, mfxEncodeCtrl* enc_ctrl);
 typedef struct QSVEncContext {
     AVCodecContext *avctx;
 
     QSVFrame *work_frames;
 
     mfxSession session;
-    mfxSession internal_session;
+    QSVSession internal_qs;
 
     int packet_size;
     int width_align;
+    int height_align;
 
     mfxVideoParam param;
     mfxFrameAllocRequest req;
@@ -105,7 +111,10 @@ typedef struct QSVEncContext {
     int preset;
     int avbr_accuracy;
     int avbr_convergence;
-    int la_depth;
+    int pic_timing_sei;
+    int look_ahead;
+    int look_ahead_depth;
+    int look_ahead_downsampling;
     int vcm;
     int rdo;
     int max_frame_size;
@@ -128,7 +137,9 @@ typedef struct QSVEncContext {
     int int_ref_qp_delta;
     int recovery_point_sei;
 
+    int a53_cc;
     char *load_plugins;
+    SetEncodeCtrlCB *set_encode_ctrl_cb;
 } QSVEncContext;
 
 int ff_qsv_enc_init(AVCodecContext *avctx, QSVEncContext *q);
diff --git a/libavcodec/qsvenc_h264.c b/libavcodec/qsvenc_h264.c
index 7f4fb81..3fb5ca3 100644
--- a/libavcodec/qsvenc_h264.c
+++ b/libavcodec/qsvenc_h264.c
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2013 Yukinori Yamazoe
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,10 +40,45 @@ typedef struct QSVH264EncContext {
     QSVEncContext qsv;
 } QSVH264EncContext;
 
+/* Encode-control callback: when a53_cc is set, attach the frame's A53 captions as an SEI payload. */
+static int qsv_h264_set_encode_ctrl(AVCodecContext *avctx,
+                                    const AVFrame *frame, mfxEncodeCtrl* enc_ctrl)
+{
+    QSVH264EncContext *qh264 = avctx->priv_data;
+    QSVEncContext *q = &qh264->qsv;
+
+    if (q->a53_cc && frame) {
+        mfxPayload* payload;
+        mfxU8* sei_data;
+        size_t sei_size;
+        int res;
+
+        res = ff_alloc_a53_sei(frame, sizeof(mfxPayload) + 2, (void**)&payload, &sei_size);
+        /* a successful call may leave payload NULL (presumably: no caption data); nothing to attach */
+        if (res < 0 || !payload)
+            return res;
+
+        /* allocation layout: mfxPayload struct, 2-byte SEI header, then the SEI data */
+        sei_data = (mfxU8*)(payload + 1);
+        sei_data[0] = 4;               // SEI payload type
+        sei_data[1] = (mfxU8)sei_size; // size of SEI data; NOTE(review): assumes it fits in one byte
+        payload->BufSize = sei_size + 2;
+        payload->NumBit = payload->BufSize * 8;
+        payload->Type = 4;
+        payload->Data = sei_data;
+
+        enc_ctrl->NumExtParam = 0;
+        enc_ctrl->NumPayload = 1;
+        enc_ctrl->Payload[0] = payload;
+    }
+    return 0;
+}
+
 static av_cold int qsv_enc_init(AVCodecContext *avctx)
 {
     QSVH264EncContext *q = avctx->priv_data;
 
+    q->qsv.set_encode_ctrl_cb = qsv_h264_set_encode_ctrl; /* register the H.264 encode-control callback before common init */
     return ff_qsv_enc_init(avctx, &q->qsv);
 }
 
@@ -68,9 +103,22 @@ static const AVOption options[] = {
     QSV_COMMON_OPTS
 
     { "idr_interval", "Distance (in I-frames) between IDR frames", OFFSET(qsv.idr_interval), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },
+    { "pic_timing_sei",    "Insert picture timing SEI with pic_struct_syntax element", OFFSET(qsv.pic_timing_sei), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, VE },
     { "single_sei_nal_unit",    "Put all the SEI messages into one NALU",        OFFSET(qsv.single_sei_nal_unit),     AV_OPT_TYPE_INT, { .i64 = -1 }, -1,          1, VE },
     { "max_dec_frame_buffering", "Maximum number of frames buffered in the DPB", OFFSET(qsv.max_dec_frame_buffering), AV_OPT_TYPE_INT, { .i64 = 0 },   0, UINT16_MAX, VE },
 
+#if QSV_HAVE_LA
+    { "look_ahead",       "Use VBR algorithm with look ahead",    OFFSET(qsv.look_ahead),       AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, VE },
+    { "look_ahead_depth", "Depth of look ahead in number frames", OFFSET(qsv.look_ahead_depth), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 100, VE },
+#endif
+
+#if QSV_VERSION_ATLEAST(1,8)
+    { "look_ahead_downsampling", NULL, OFFSET(qsv.look_ahead_downsampling), AV_OPT_TYPE_INT, { .i64 = MFX_LOOKAHEAD_DS_UNKNOWN }, MFX_LOOKAHEAD_DS_UNKNOWN, MFX_LOOKAHEAD_DS_2x, VE, "look_ahead_downsampling" },
+    { "unknown"                , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_LOOKAHEAD_DS_UNKNOWN }, INT_MIN, INT_MAX,     VE, "look_ahead_downsampling" },
+    { "off"                    , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_LOOKAHEAD_DS_OFF     }, INT_MIN, INT_MAX,     VE, "look_ahead_downsampling" },
+    { "2x"                     , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_LOOKAHEAD_DS_2x      }, INT_MIN, INT_MAX,     VE, "look_ahead_downsampling" },
+#endif
+
     { "int_ref_type", "Intra refresh type",                                      OFFSET(qsv.int_ref_type),            AV_OPT_TYPE_INT, { .i64 = -1 }, -1, UINT16_MAX, VE, "int_ref_type" },
         { "none",     NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 0 }, .flags = VE, "int_ref_type" },
         { "vertical", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 1 }, .flags = VE, "int_ref_type" },
@@ -90,6 +138,7 @@ static const AVOption options[] = {
     { "main"    , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_PROFILE_AVC_MAIN     }, INT_MIN, INT_MAX,     VE, "profile" },
     { "high"    , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MFX_PROFILE_AVC_HIGH     }, INT_MIN, INT_MAX,     VE, "profile" },
 
+    { "a53cc" , "Use A53 Closed Captions (if available)", OFFSET(qsv.a53_cc), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 1, VE},
     { NULL },
 };
 
diff --git a/libavcodec/qsvenc_hevc.c b/libavcodec/qsvenc_hevc.c
index 706191d..1d1e801 100644
--- a/libavcodec/qsvenc_hevc.c
+++ b/libavcodec/qsvenc_hevc.c
@@ -1,20 +1,20 @@
 /*
  * Intel MediaSDK QSV based HEVC encoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/qsvenc_mpeg2.c b/libavcodec/qsvenc_mpeg2.c
index f29f445..5b583fb 100644
--- a/libavcodec/qsvenc_mpeg2.c
+++ b/libavcodec/qsvenc_mpeg2.c
@@ -1,20 +1,20 @@
 /*
  * Intel MediaSDK QSV based MPEG-2 encoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/qtrle.c b/libavcodec/qtrle.c
index 70440d3..d9d27f0 100644
--- a/libavcodec/qtrle.c
+++ b/libavcodec/qtrle.c
@@ -2,20 +2,20 @@
  * Quicktime Animation (RLE) Video Decoder
  * Copyright (C) 2004 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -59,27 +59,38 @@ static void qtrle_decode_1bpp(QtrleContext *s, int row_ptr, int lines_to_change)
     int rle_code;
     int pixel_ptr;
     int row_inc = s->frame->linesize[0];
-    unsigned char pi0, pi1;  /* 2 8-pixel values */
-    unsigned char *rgb = s->frame->data[0];
+    uint8_t pi0, pi1;  /* 2 8-pixel values */
+    uint8_t *rgb = s->frame->data[0];
     int pixel_limit = s->frame->linesize[0] * s->avctx->height;
     int skip;
+    /* skip & 0x80 appears to mean 'start a new line', which can be interpreted
+     * as 'go to next line' during the decoding of a frame but is 'go to first
+     * line' at the beginning. Since we always interpret it as 'go to next line'
+     * in the decoding loop (which makes code simpler/faster), the first line
+     * would not be counted, so we count one more.
+     * See: https://trac.ffmpeg.org/ticket/226
+     * In the following decoding loop, row_ptr will be the position of the
+     * current row. */
 
     row_ptr  -= row_inc;
     pixel_ptr = row_ptr;
     lines_to_change++;
     while (lines_to_change) {
         skip     =              bytestream2_get_byte(&s->g);
-        rle_code = (signed char)bytestream2_get_byte(&s->g);
+        rle_code = (int8_t)bytestream2_get_byte(&s->g);
         if (rle_code == 0)
             break;
         if(skip & 0x80) {
             lines_to_change--;
             row_ptr += row_inc;
-            pixel_ptr = row_ptr + 2 * (skip & 0x7f);
+            pixel_ptr = row_ptr + 2 * 8 * (skip & 0x7f);
         } else
-            pixel_ptr += 2 * skip;
+            pixel_ptr += 2 * 8 * skip;
         CHECK_PIXEL_PTR(0);  /* make sure pixel_ptr is positive */
 
+        if(rle_code == -1)
+            continue;
+
         if (rle_code < 0) {
             /* decode the run length code */
             rle_code = -rle_code;
@@ -88,19 +99,42 @@ static void qtrle_decode_1bpp(QtrleContext *s, int row_ptr, int lines_to_change)
 
             pi0 = bytestream2_get_byte(&s->g);
             pi1 = bytestream2_get_byte(&s->g);
-            CHECK_PIXEL_PTR(rle_code * 2);
+            CHECK_PIXEL_PTR(rle_code * 2 * 8);
 
             while (rle_code--) {
-                rgb[pixel_ptr++] = pi0;
-                rgb[pixel_ptr++] = pi1;
+                rgb[pixel_ptr++] = (pi0 >> 7) & 0x01;
+                rgb[pixel_ptr++] = (pi0 >> 6) & 0x01;
+                rgb[pixel_ptr++] = (pi0 >> 5) & 0x01;
+                rgb[pixel_ptr++] = (pi0 >> 4) & 0x01;
+                rgb[pixel_ptr++] = (pi0 >> 3) & 0x01;
+                rgb[pixel_ptr++] = (pi0 >> 2) & 0x01;
+                rgb[pixel_ptr++] = (pi0 >> 1) & 0x01;
+                rgb[pixel_ptr++] =  pi0       & 0x01;
+                rgb[pixel_ptr++] = (pi1 >> 7) & 0x01;
+                rgb[pixel_ptr++] = (pi1 >> 6) & 0x01;
+                rgb[pixel_ptr++] = (pi1 >> 5) & 0x01;
+                rgb[pixel_ptr++] = (pi1 >> 4) & 0x01;
+                rgb[pixel_ptr++] = (pi1 >> 3) & 0x01;
+                rgb[pixel_ptr++] = (pi1 >> 2) & 0x01;
+                rgb[pixel_ptr++] = (pi1 >> 1) & 0x01;
+                rgb[pixel_ptr++] =  pi1       & 0x01;
             }
         } else {
             /* copy the same pixel directly to output 2 times */
             rle_code *= 2;
-            CHECK_PIXEL_PTR(rle_code);
+            CHECK_PIXEL_PTR(rle_code * 8);
 
-            while (rle_code--)
-                rgb[pixel_ptr++] = bytestream2_get_byte(&s->g);
+            while (rle_code--) {
+                int x = bytestream2_get_byte(&s->g);
+                rgb[pixel_ptr++] = (x >> 7) & 0x01;
+                rgb[pixel_ptr++] = (x >> 6) & 0x01;
+                rgb[pixel_ptr++] = (x >> 5) & 0x01;
+                rgb[pixel_ptr++] = (x >> 4) & 0x01;
+                rgb[pixel_ptr++] = (x >> 3) & 0x01;
+                rgb[pixel_ptr++] = (x >> 2) & 0x01;
+                rgb[pixel_ptr++] = (x >> 1) & 0x01;
+                rgb[pixel_ptr++] =  x       & 0x01;
+            }
         }
     }
 }
@@ -111,8 +145,8 @@ static inline void qtrle_decode_2n4bpp(QtrleContext *s, int row_ptr,
     int rle_code, i;
     int pixel_ptr;
     int row_inc = s->frame->linesize[0];
-    unsigned char pi[16];  /* 16 palette indices */
-    unsigned char *rgb = s->frame->data[0];
+    uint8_t pi[16];  /* 16 palette indices */
+    uint8_t *rgb = s->frame->data[0];
     int pixel_limit = s->frame->linesize[0] * s->avctx->height;
     int num_pixels = (bpp == 4) ? 8 : 16;
 
@@ -120,7 +154,7 @@ static inline void qtrle_decode_2n4bpp(QtrleContext *s, int row_ptr,
         pixel_ptr = row_ptr + (num_pixels * (bytestream2_get_byte(&s->g) - 1));
         CHECK_PIXEL_PTR(0);
 
-        while ((rle_code = (signed char)bytestream2_get_byte(&s->g)) != -1) {
+        while ((rle_code = (int8_t)bytestream2_get_byte(&s->g)) != -1) {
             if (rle_code == 0) {
                 /* there's another skip code in the stream */
                 pixel_ptr += (num_pixels * (bytestream2_get_byte(&s->g) - 1));
@@ -136,8 +170,8 @@ static inline void qtrle_decode_2n4bpp(QtrleContext *s, int row_ptr,
                 }
                 CHECK_PIXEL_PTR(rle_code * num_pixels);
                 while (rle_code--) {
-                    for (i = 0; i < num_pixels; i++)
-                        rgb[pixel_ptr++] = pi[i];
+                    memcpy(&rgb[pixel_ptr], &pi, num_pixels);
+                    pixel_ptr += num_pixels;
                 }
             } else {
                 /* copy the same pixel directly to output 4 times */
@@ -167,15 +201,15 @@ static void qtrle_decode_8bpp(QtrleContext *s, int row_ptr, int lines_to_change)
     int rle_code;
     int pixel_ptr;
     int row_inc = s->frame->linesize[0];
-    unsigned char pi1, pi2, pi3, pi4;  /* 4 palette indexes */
-    unsigned char *rgb = s->frame->data[0];
+    uint8_t pi1, pi2, pi3, pi4;  /* 4 palette indexes */
+    uint8_t *rgb = s->frame->data[0];
     int pixel_limit = s->frame->linesize[0] * s->avctx->height;
 
     while (lines_to_change--) {
         pixel_ptr = row_ptr + (4 * (bytestream2_get_byte(&s->g) - 1));
         CHECK_PIXEL_PTR(0);
 
-        while ((rle_code = (signed char)bytestream2_get_byte(&s->g)) != -1) {
+        while ((rle_code = (int8_t)bytestream2_get_byte(&s->g)) != -1) {
             if (rle_code == 0) {
                 /* there's another skip code in the stream */
                 pixel_ptr += (4 * (bytestream2_get_byte(&s->g) - 1));
@@ -203,9 +237,8 @@ static void qtrle_decode_8bpp(QtrleContext *s, int row_ptr, int lines_to_change)
                 rle_code *= 4;
                 CHECK_PIXEL_PTR(rle_code);
 
-                while (rle_code--) {
-                    rgb[pixel_ptr++] = bytestream2_get_byte(&s->g);
-                }
+                bytestream2_get_buffer(&s->g, &rgb[pixel_ptr], rle_code);
+                pixel_ptr += rle_code;
             }
         }
         row_ptr += row_inc;
@@ -217,15 +250,15 @@ static void qtrle_decode_16bpp(QtrleContext *s, int row_ptr, int lines_to_change
     int rle_code;
     int pixel_ptr;
     int row_inc = s->frame->linesize[0];
-    unsigned short rgb16;
-    unsigned char *rgb = s->frame->data[0];
+    uint16_t rgb16;
+    uint8_t *rgb = s->frame->data[0];
     int pixel_limit = s->frame->linesize[0] * s->avctx->height;
 
     while (lines_to_change--) {
         pixel_ptr = row_ptr + (bytestream2_get_byte(&s->g) - 1) * 2;
         CHECK_PIXEL_PTR(0);
 
-        while ((rle_code = (signed char)bytestream2_get_byte(&s->g)) != -1) {
+        while ((rle_code = (int8_t)bytestream2_get_byte(&s->g)) != -1) {
             if (rle_code == 0) {
                 /* there's another skip code in the stream */
                 pixel_ptr += (bytestream2_get_byte(&s->g) - 1) * 2;
@@ -238,7 +271,7 @@ static void qtrle_decode_16bpp(QtrleContext *s, int row_ptr, int lines_to_change
                 CHECK_PIXEL_PTR(rle_code * 2);
 
                 while (rle_code--) {
-                    *(unsigned short *)(&rgb[pixel_ptr]) = rgb16;
+                    *(uint16_t *)(&rgb[pixel_ptr]) = rgb16;
                     pixel_ptr += 2;
                 }
             } else {
@@ -247,7 +280,7 @@ static void qtrle_decode_16bpp(QtrleContext *s, int row_ptr, int lines_to_change
                 /* copy pixels directly to output */
                 while (rle_code--) {
                     rgb16 = bytestream2_get_be16(&s->g);
-                    *(unsigned short *)(&rgb[pixel_ptr]) = rgb16;
+                    *(uint16_t *)(&rgb[pixel_ptr]) = rgb16;
                     pixel_ptr += 2;
                 }
             }
@@ -261,15 +294,15 @@ static void qtrle_decode_24bpp(QtrleContext *s, int row_ptr, int lines_to_change
     int rle_code;
     int pixel_ptr;
     int row_inc = s->frame->linesize[0];
-    unsigned char r, g, b;
-    unsigned char *rgb = s->frame->data[0];
+    uint8_t r, g, b;
+    uint8_t *rgb = s->frame->data[0];
     int pixel_limit = s->frame->linesize[0] * s->avctx->height;
 
     while (lines_to_change--) {
         pixel_ptr = row_ptr + (bytestream2_get_byte(&s->g) - 1) * 3;
         CHECK_PIXEL_PTR(0);
 
-        while ((rle_code = (signed char)bytestream2_get_byte(&s->g)) != -1) {
+        while ((rle_code = (int8_t)bytestream2_get_byte(&s->g)) != -1) {
             if (rle_code == 0) {
                 /* there's another skip code in the stream */
                 pixel_ptr += (bytestream2_get_byte(&s->g) - 1) * 3;
@@ -309,14 +342,14 @@ static void qtrle_decode_32bpp(QtrleContext *s, int row_ptr, int lines_to_change
     int pixel_ptr;
     int row_inc = s->frame->linesize[0];
     unsigned int argb;
-    unsigned char *rgb = s->frame->data[0];
+    uint8_t *rgb = s->frame->data[0];
     int pixel_limit = s->frame->linesize[0] * s->avctx->height;
 
     while (lines_to_change--) {
         pixel_ptr = row_ptr + (bytestream2_get_byte(&s->g) - 1) * 4;
         CHECK_PIXEL_PTR(0);
 
-        while ((rle_code = (signed char)bytestream2_get_byte(&s->g)) != -1) {
+        while ((rle_code = (int8_t)bytestream2_get_byte(&s->g)) != -1) {
             if (rle_code == 0) {
                 /* there's another skip code in the stream */
                 pixel_ptr += (bytestream2_get_byte(&s->g) - 1) * 4;
@@ -354,13 +387,10 @@ static av_cold int qtrle_decode_init(AVCodecContext *avctx)
     s->avctx = avctx;
     switch (avctx->bits_per_coded_sample) {
     case 1:
-    case 33:
-        avctx->pix_fmt = AV_PIX_FMT_MONOWHITE;
-        break;
-
     case 2:
     case 4:
     case 8:
+    case 33:
     case 34:
     case 36:
     case 40:
@@ -403,10 +433,8 @@ static int qtrle_decode_frame(AVCodecContext *avctx,
     int ret;
 
     bytestream2_init(&s->g, avpkt->data, avpkt->size);
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log (s->avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
     /* check if this frame is even supposed to change */
     if (avpkt->size < 8)
@@ -426,6 +454,8 @@ static int qtrle_decode_frame(AVCodecContext *avctx,
         bytestream2_skip(&s->g, 2);
         height     = bytestream2_get_be16(&s->g);
         bytestream2_skip(&s->g, 2);
+        if (height > s->avctx->height - start_line)
+            goto done;
     } else {
         start_line = 0;
         height     = s->avctx->height;
@@ -436,6 +466,7 @@ static int qtrle_decode_frame(AVCodecContext *avctx,
     case 1:
     case 33:
         qtrle_decode_1bpp(s, row_ptr, height);
+        has_palette = 1;
         break;
 
     case 2:
diff --git a/libavcodec/qtrleenc.c b/libavcodec/qtrleenc.c
index 67c88e1..5aa7420 100644
--- a/libavcodec/qtrleenc.c
+++ b/libavcodec/qtrleenc.c
@@ -5,20 +5,20 @@
  *
  * This file is based on flashsvenc.c.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,6 +39,7 @@ typedef struct QtrleEncContext {
     int pixel_size;
     AVFrame *previous_frame;
     unsigned int max_buf_size;
+    int logical_width;
     /**
      * This array will contain at ith position the value of the best RLE code
      * if the line started at pixel i
@@ -78,11 +79,20 @@ static av_cold int qtrle_encode_init(AVCodecContext *avctx)
     QtrleEncContext *s = avctx->priv_data;
 
     if (av_image_check_size(avctx->width, avctx->height, 0, avctx) < 0) {
-        return -1;
+        return AVERROR(EINVAL);
     }
     s->avctx=avctx;
+    s->logical_width=avctx->width;
 
     switch (avctx->pix_fmt) {
+    case AV_PIX_FMT_GRAY8:
+        if (avctx->width % 4) {
+            av_log(avctx, AV_LOG_ERROR, "Width not being a multiple of 4 is not supported\n");
+            return AVERROR(EINVAL);
+        }
+        s->logical_width = avctx->width / 4;
+        s->pixel_size = 4;
+        break;
     case AV_PIX_FMT_RGB555BE:
         s->pixel_size = 2;
         break;
@@ -96,25 +106,25 @@ static av_cold int qtrle_encode_init(AVCodecContext *avctx)
         av_log(avctx, AV_LOG_ERROR, "Unsupported colorspace.\n");
         break;
     }
-    avctx->bits_per_coded_sample = s->pixel_size*8;
+    avctx->bits_per_coded_sample = avctx->pix_fmt == AV_PIX_FMT_GRAY8 ? 40 : s->pixel_size*8;
 
-    s->rlecode_table = av_mallocz(s->avctx->width);
-    s->skip_table    = av_mallocz(s->avctx->width);
-    s->length_table  = av_mallocz((s->avctx->width + 1)*sizeof(int));
+    s->rlecode_table = av_mallocz(s->logical_width);
+    s->skip_table    = av_mallocz(s->logical_width);
+    s->length_table  = av_mallocz_array(s->logical_width + 1, sizeof(int));
     if (!s->skip_table || !s->length_table || !s->rlecode_table) {
         av_log(avctx, AV_LOG_ERROR, "Error allocating memory.\n");
-        return -1;
+        return AVERROR(ENOMEM);
     }
     s->previous_frame = av_frame_alloc();
     if (!s->previous_frame) {
         av_log(avctx, AV_LOG_ERROR, "Error allocating picture\n");
-        return -1;
+        return AVERROR(ENOMEM);
     }
 
-    s->max_buf_size = s->avctx->width*s->avctx->height*s->pixel_size*2 /* image base material */
-                      + 15                                           /* header + footer */
-                      + s->avctx->height*2                           /* skip code+rle end */
-                      + s->avctx->width/MAX_RLE_BULK + 1             /* rle codes */;
+    s->max_buf_size = s->logical_width*s->avctx->height*s->pixel_size*2 /* image base material */
+                      + 15                                            /* header + footer */
+                      + s->avctx->height*2                            /* skip code+rle end */
+                      + s->logical_width/MAX_RLE_BULK + 1             /* rle codes */;
 
     return 0;
 }
@@ -124,26 +134,26 @@ static av_cold int qtrle_encode_init(AVCodecContext *avctx)
  */
 static void qtrle_encode_line(QtrleEncContext *s, const AVFrame *p, int line, uint8_t **buf)
 {
-    int width=s->avctx->width;
+    int width=s->logical_width;
     int i;
     signed char rlecode;
 
-    /* We will use it to compute the best bulk copy sequence */
-    unsigned int bulkcount;
     /* This will be the number of pixels equal to the previous frame one's
      * starting from the ith pixel */
     unsigned int skipcount;
     /* This will be the number of consecutive equal pixels in the current
      * frame, starting from the ith one also */
-    unsigned int repeatcount;
+    unsigned int av_uninit(repeatcount);
 
     /* The cost of the three different possibilities */
-    int total_bulk_cost;
     int total_skip_cost;
     int total_repeat_cost;
 
-    int temp_cost;
-    int j;
+    int base_bulk_cost;
+    int lowest_bulk_cost;
+    int lowest_bulk_cost_index;
+    int sec_lowest_bulk_cost;
+    int sec_lowest_bulk_cost_index;
 
     uint8_t *this_line = p->               data[0] + line*p->               linesize[0] +
         (width - 1)*s->pixel_size;
@@ -153,8 +163,57 @@ static void qtrle_encode_line(QtrleEncContext *s, const AVFrame *p, int line, ui
     s->length_table[width] = 0;
     skipcount = 0;
 
+    /* Initial values */
+    lowest_bulk_cost = INT_MAX / 2;
+    lowest_bulk_cost_index = width;
+    sec_lowest_bulk_cost = INT_MAX / 2;
+    sec_lowest_bulk_cost_index = width;
+
+    base_bulk_cost = 1 + s->pixel_size;
+
     for (i = width - 1; i >= 0; i--) {
 
+        int prev_bulk_cost;
+
+        /* If our lowest bulk cost index is too far away, replace it
+         * with the next lowest bulk cost */
+        if (FFMIN(width, i + MAX_RLE_BULK) < lowest_bulk_cost_index) {
+            lowest_bulk_cost = sec_lowest_bulk_cost;
+            lowest_bulk_cost_index = sec_lowest_bulk_cost_index;
+
+            sec_lowest_bulk_cost = INT_MAX / 2;
+            sec_lowest_bulk_cost_index = width;
+        }
+
+        /* Deal with the first pixel's bulk cost */
+        if (!i) {
+            base_bulk_cost++;
+            lowest_bulk_cost++;
+            sec_lowest_bulk_cost++;
+        }
+
+        /* Look at the bulk cost of the previous loop and see if it is
+         * a new lower bulk cost */
+        prev_bulk_cost = s->length_table[i + 1] + base_bulk_cost;
+        if (prev_bulk_cost <= sec_lowest_bulk_cost) {
+            /* If it's lower than the 2nd lowest, then it may be lower
+             * than the lowest */
+            if (prev_bulk_cost <= lowest_bulk_cost) {
+
+                /* If we have found a new lowest bulk cost,
+                 * then the 2nd lowest bulk cost is now farther than the
+                 * lowest bulk cost, and will never be used */
+                sec_lowest_bulk_cost = INT_MAX / 2;
+
+                lowest_bulk_cost = prev_bulk_cost;
+                lowest_bulk_cost_index = i + 1;
+            } else {
+                /* Then it must be the 2nd lowest bulk cost */
+                sec_lowest_bulk_cost = prev_bulk_cost;
+                sec_lowest_bulk_cost_index = i + 1;
+            }
+        }
+
         if (!s->key_frame && !memcmp(this_line, prev_line, s->pixel_size))
             skipcount = FFMIN(skipcount + 1, MAX_RLE_SKIP);
         else
@@ -190,26 +249,17 @@ static void qtrle_encode_line(QtrleEncContext *s, const AVFrame *p, int line, ui
         }
         else {
             /* We cannot do neither skip nor repeat
-             * thus we search for the best bulk copy to do */
-
-            int limit = FFMIN(width - i, MAX_RLE_BULK);
-
-            temp_cost = 1 + s->pixel_size + !i;
-            total_bulk_cost = INT_MAX;
+             * thus we use the best bulk copy  */
 
-            for (j = 1; j <= limit; j++) {
-                if (s->length_table[i + j] + temp_cost < total_bulk_cost) {
-                    /* We have found a better bulk copy ... */
-                    total_bulk_cost = s->length_table[i + j] + temp_cost;
-                    bulkcount = j;
-                }
-                temp_cost += s->pixel_size;
-            }
+            s->length_table[i]  = lowest_bulk_cost;
+            s->rlecode_table[i] = lowest_bulk_cost_index - i;
 
-            s->length_table[i]  = total_bulk_cost;
-            s->rlecode_table[i] = bulkcount;
         }
 
+        /* These bulk costs increase every iteration */
+        lowest_bulk_cost += s->pixel_size;
+        sec_lowest_bulk_cost += s->pixel_size;
+
         this_line -= s->pixel_size;
         prev_line -= s->pixel_size;
     }
@@ -239,12 +289,28 @@ static void qtrle_encode_line(QtrleEncContext *s, const AVFrame *p, int line, ui
         }
         else if (rlecode > 0) {
             /* bulk copy */
-            bytestream_put_buffer(buf, this_line + i*s->pixel_size, rlecode*s->pixel_size);
+            if (s->avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
+                int j;
+                // QT grayscale colorspace has 0=white and 255=black, we will
+                // ignore the palette that is included in the AVFrame because
+                // AV_PIX_FMT_GRAY8 has defined color mapping
+                for (j = 0; j < rlecode*s->pixel_size; ++j)
+                    bytestream_put_byte(buf, *(this_line + i*s->pixel_size + j) ^ 0xff);
+            } else {
+                bytestream_put_buffer(buf, this_line + i*s->pixel_size, rlecode*s->pixel_size);
+            }
             i += rlecode;
         }
         else {
             /* repeat the bits */
-            bytestream_put_buffer(buf, this_line + i*s->pixel_size, s->pixel_size);
+            if (s->avctx->pix_fmt == AV_PIX_FMT_GRAY8) {
+                int j;
+                // QT grayscale colorspace has 0=white and 255=black, ...
+                for (j = 0; j < s->pixel_size; ++j)
+                    bytestream_put_byte(buf, *(this_line + i*s->pixel_size + j) ^ 0xff);
+            } else {
+                bytestream_put_buffer(buf, this_line + i*s->pixel_size, s->pixel_size);
+            }
             i -= rlecode;
         }
     }
@@ -260,7 +326,7 @@ static int encode_frame(QtrleEncContext *s, const AVFrame *p, uint8_t *buf)
     uint8_t *orig_buf = buf;
 
     if (!s->key_frame) {
-        unsigned line_size = s->avctx->width * s->pixel_size;
+        unsigned line_size = s->logical_width * s->pixel_size;
         for (start_line = 0; start_line < s->avctx->height; start_line++)
             if (memcmp(p->data[0] + start_line*p->linesize[0],
                        s->previous_frame->data[0] + start_line * s->previous_frame->linesize[0],
@@ -300,11 +366,8 @@ static int qtrle_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     enum AVPictureType pict_type;
     int ret;
 
-    if ((ret = ff_alloc_packet(pkt, s->max_buf_size)) < 0) {
-        /* Upper bound check for compressed data */
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n", s->max_buf_size);
+    if ((ret = ff_alloc_packet2(avctx, pkt, s->max_buf_size, 0)) < 0)
         return ret;
-    }
 
     if (avctx->gop_size == 0 || (s->avctx->frame_number % avctx->gop_size) == 0) {
         /* I-Frame */
@@ -350,6 +413,6 @@ AVCodec ff_qtrle_encoder = {
     .encode2        = qtrle_encode_frame,
     .close          = qtrle_encode_end,
     .pix_fmts       = (const enum AVPixelFormat[]){
-        AV_PIX_FMT_RGB24, AV_PIX_FMT_RGB555BE, AV_PIX_FMT_ARGB, AV_PIX_FMT_NONE
+        AV_PIX_FMT_RGB24, AV_PIX_FMT_RGB555BE, AV_PIX_FMT_ARGB, AV_PIX_FMT_GRAY8, AV_PIX_FMT_NONE
     },
 };
diff --git a/libavcodec/r210dec.c b/libavcodec/r210dec.c
index f168fd3..9c868cd 100644
--- a/libavcodec/r210dec.c
+++ b/libavcodec/r210dec.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009 Reimar Doeffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,7 +27,11 @@
 
 static av_cold int decode_init(AVCodecContext *avctx)
 {
-    avctx->pix_fmt             = AV_PIX_FMT_RGB48;
+    if ((avctx->codec_tag & 0xFFFFFF) == MKTAG('r', '1', '0', 0)) {
+        avctx->pix_fmt = AV_PIX_FMT_BGR48;
+    } else {
+        avctx->pix_fmt = AV_PIX_FMT_RGB48;
+    }
     avctx->bits_per_raw_sample = 10;
 
     return 0;
@@ -39,8 +43,13 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     int h, w, ret;
     AVFrame *pic = data;
     const uint32_t *src = (const uint32_t *)avpkt->data;
-    int aligned_width = FFALIGN(avctx->width, 64);
+    int aligned_width = FFALIGN(avctx->width,
+                                avctx->codec_id == AV_CODEC_ID_R10K ? 1 : 64);
     uint8_t *dst_line;
+    int r10 = (avctx->codec_tag & 0xFFFFFF) == MKTAG('r', '1', '0', 0);
+    int le = avctx->codec_tag == MKTAG('R', '1', '0', 'k') &&
+             avctx->extradata_size >= 12 && !memcmp(&avctx->extradata[4], "DpxE", 4) &&
+             !avctx->extradata[11];
 
     if (avpkt->size < 4 * aligned_width * avctx->height) {
         av_log(avctx, AV_LOG_ERROR, "packet too small\n");
@@ -57,14 +66,19 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     for (h = 0; h < avctx->height; h++) {
         uint16_t *dst = (uint16_t *)dst_line;
         for (w = 0; w < avctx->width; w++) {
-            uint32_t pixel = av_be2ne32(*src++);
+            uint32_t pixel;
             uint16_t r, g, b;
-            if (avctx->codec_id==AV_CODEC_ID_R210) {
+            if (avctx->codec_id == AV_CODEC_ID_AVRP || r10 || le) {
+                pixel = av_le2ne32(*src++);
+            } else {
+                pixel = av_be2ne32(*src++);
+            }
+            if (avctx->codec_id == AV_CODEC_ID_R210 || r10) {
                 b =  pixel <<  6;
                 g = (pixel >>  4) & 0xffc0;
                 r = (pixel >> 14) & 0xffc0;
             } else {
-                b =  pixel <<  4;
+                b = (pixel <<  4) & 0xffc0;
                 g = (pixel >>  6) & 0xffc0;
                 r = (pixel >> 16) & 0xffc0;
             }
@@ -103,3 +117,14 @@ AVCodec ff_r10k_decoder = {
     .capabilities   = AV_CODEC_CAP_DR1,
 };
 #endif
+#if CONFIG_AVRP_DECODER
+AVCodec ff_avrp_decoder = { /* AVRP decoder; reuses this file's decode_init/decode_frame */
+    .name           = "avrp",
+    .long_name      = NULL_IF_CONFIG_SMALL("Avid 1:1 10-bit RGB Packer"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_AVRP,
+    .init           = decode_init,
+    .decode         = decode_frame, /* reads words little-endian for AV_CODEC_ID_AVRP */
+    .capabilities   = AV_CODEC_CAP_DR1,
+};
+#endif /* CONFIG_AVRP_DECODER */
diff --git a/libavcodec/r210enc.c b/libavcodec/r210enc.c
new file mode 100644
index 0000000..65b3c06
--- /dev/null
+++ b/libavcodec/r210enc.c
@@ -0,0 +1,102 @@
+/*
+ * R210 encoder
+ *
+ * Copyright (c) 2012 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+#include "bytestream.h"
+
+/* Pack one RGB48 frame into 32-bit words carrying 10 bits per component.
+ * Returns 0 on success or the error code from ff_alloc_packet2(). */
+static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                        const AVFrame *pic, int *got_packet)
+{
+    int i, j, ret;
+    int aligned_width = FFALIGN(avctx->width,
+                                avctx->codec_id == AV_CODEC_ID_R10K ? 1 : 64); /* only R10K rows are unpadded */
+    int pad = (aligned_width - avctx->width) * 4; /* zero bytes closing each output row */
+    const uint8_t *src_line;
+    uint8_t *dst;
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, 4 * aligned_width * avctx->height, 0)) < 0)
+        return ret;
+
+    src_line = pic->data[0];
+    dst = pkt->data;
+
+    for (i = 0; i < avctx->height; i++) {
+        const uint16_t *src = (const uint16_t *)src_line;
+        for (j = 0; j < avctx->width; j++) {
+            uint32_t pixel;
+            unsigned r = *src++ >> 6; /* unsigned: r << 22 must not overflow a promoted int */
+            unsigned g = *src++ >> 6; /* >> 6 keeps the top 10 of the 16 input bits */
+            unsigned b = *src++ >> 6;
+            if (avctx->codec_id == AV_CODEC_ID_R210)
+                pixel = (r << 20) | (g << 10) | b;        /* components in bits 29..0 */
+            else
+                pixel = (r << 22) | (g << 12) | (b << 2); /* components in bits 31..2 */
+            if (avctx->codec_id == AV_CODEC_ID_AVRP)
+                bytestream_put_le32(&dst, pixel);
+            else
+                bytestream_put_be32(&dst, pixel);
+        }
+        memset(dst, 0, pad);
+        dst += pad;
+        src_line += pic->linesize[0];
+    }
+    pkt->flags |= AV_PKT_FLAG_KEY; /* every packet is marked as a keyframe */
+    *got_packet = 1;
+    return 0;
+}
+
+#if CONFIG_R210_ENCODER
+AVCodec ff_r210_encoder = { /* R210: big-endian words, r << 20 | g << 10 | b */
+    .name           = "r210",
+    .long_name      = NULL_IF_CONFIG_SMALL("Uncompressed RGB 10-bit"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_R210,
+    .encode2        = encode_frame, /* shared packer; picks the layout from codec_id */
+    .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_RGB48, AV_PIX_FMT_NONE },
+    .capabilities   = AV_CODEC_CAP_INTRA_ONLY,
+};
+#endif /* CONFIG_R210_ENCODER */
+#if CONFIG_R10K_ENCODER
+AVCodec ff_r10k_encoder = { /* R10K: big-endian words, r << 22 | g << 12 | b << 2, rows unpadded */
+    .name           = "r10k",
+    .long_name      = NULL_IF_CONFIG_SMALL("AJA Kona 10-bit RGB Codec"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_R10K,
+    .encode2        = encode_frame, /* shared packer; picks the layout from codec_id */
+    .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_RGB48, AV_PIX_FMT_NONE },
+    .capabilities   = AV_CODEC_CAP_INTRA_ONLY,
+};
+#endif /* CONFIG_R10K_ENCODER */
+#if CONFIG_AVRP_ENCODER
+AVCodec ff_avrp_encoder = { /* AVRP: little-endian words, r << 22 | g << 12 | b << 2 */
+    .name           = "avrp",
+    .long_name      = NULL_IF_CONFIG_SMALL("Avid 1:1 10-bit RGB Packer"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_AVRP,
+    .encode2        = encode_frame, /* shared packer; picks the layout from codec_id */
+    .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_RGB48, AV_PIX_FMT_NONE },
+    .capabilities   = AV_CODEC_CAP_INTRA_ONLY,
+};
+#endif /* CONFIG_AVRP_ENCODER */
diff --git a/libavcodec/ra144.c b/libavcodec/ra144.c
index ccaa149..ceec32d 100644
--- a/libavcodec/ra144.c
+++ b/libavcodec/ra144.c
@@ -2,20 +2,20 @@
  * Real Audio 1.0 (14.4K)
  * Copyright (c) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -1566,8 +1566,15 @@ int ff_eval_refl(int *refl, const int16_t *coefs, AVCodecContext *avctx)
         if (!b)
             b = -2;
 
-        for (j=0; j <= i; j++)
-            bp1[j] = ((bp2[j] - ((refl[i+1] * bp2[i-j]) >> 12)) * (0x1000000 / b)) >> 12;
+        b = 0x1000000 / b;
+        for (j=0; j <= i; j++) {
+#if CONFIG_FTRAPV
+            int a = bp2[j] - ((refl[i+1] * bp2[i-j]) >> 12);
+            if((int)(a*(unsigned)b) != a*(int64_t)b)
+                return 1;
+#endif
+            bp1[j] = ((bp2[j] - ((refl[i+1] * bp2[i-j]) >> 12)) * b) >> 12;
+        }
 
         if ((unsigned) bp1[i] + 0x1000 > 0x1fff)
             return 1;
@@ -1674,12 +1681,9 @@ unsigned int ff_rescale_rms(unsigned int rms, unsigned int energy)
 }
 
 /** inverse root mean square */
-int ff_irms(const int16_t *data)
+int ff_irms(AudioDSPContext *adsp, const int16_t *data)
 {
-    unsigned int i, sum = 0;
-
-    for (i=0; i < BLOCKSIZE; i++)
-        sum += data[i] * data[i];
+    unsigned int sum = adsp->scalarproduct_int16(data, data, BLOCKSIZE);
 
     if (sum == 0)
         return 0; /* OOPS - division by zero */
@@ -1687,18 +1691,17 @@ int ff_irms(const int16_t *data)
     return 0x20000000 / (ff_t_sqrt(sum) >> 8);
 }
 
-void ff_subblock_synthesis(RA144Context *ractx, const uint16_t *lpc_coefs,
+void ff_subblock_synthesis(RA144Context *ractx, const int16_t *lpc_coefs,
                            int cba_idx, int cb1_idx, int cb2_idx,
                            int gval, int gain)
 {
-    uint16_t buffer_a[BLOCKSIZE];
-    uint16_t *block;
+    int16_t *block;
     int m[3];
 
     if (cba_idx) {
         cba_idx += BLOCKSIZE/2 - 1;
-        ff_copy_and_dup(buffer_a, ractx->adapt_cb, cba_idx);
-        m[0] = (ff_irms(buffer_a) * gval) >> 12;
+        ff_copy_and_dup(ractx->buffer_a, ractx->adapt_cb, cba_idx);
+        m[0] = (ff_irms(&ractx->adsp, ractx->buffer_a) * gval) >> 12;
     } else {
         m[0] = 0;
     }
@@ -1709,7 +1712,7 @@ void ff_subblock_synthesis(RA144Context *ractx, const uint16_t *lpc_coefs,
 
     block = ractx->adapt_cb + BUFFERSIZE - BLOCKSIZE;
 
-    add_wav(block, gain, cba_idx, m, cba_idx? buffer_a: NULL,
+    add_wav(block, gain, cba_idx, m, cba_idx? ractx->buffer_a: NULL,
             ff_cb1_vects[cb1_idx], ff_cb2_vects[cb2_idx]);
 
     memcpy(ractx->curr_sblock, ractx->curr_sblock + BLOCKSIZE,
diff --git a/libavcodec/ra144.h b/libavcodec/ra144.h
index 89d4fb5..19a4ce0 100644
--- a/libavcodec/ra144.h
+++ b/libavcodec/ra144.h
@@ -2,20 +2,20 @@
  * Real Audio 1.0 (14.4K)
  * Copyright (c) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,16 +25,18 @@
 #include <stdint.h>
 #include "lpc.h"
 #include "audio_frame_queue.h"
+#include "audiodsp.h"
 
 #define NBLOCKS         4       ///< number of subblocks within a block
 #define BLOCKSIZE       40      ///< subblock size in 16-bit words
 #define BUFFERSIZE      146     ///< the size of the adaptive codebook
 #define FIXED_CB_SIZE   128     ///< size of fixed codebooks
-#define FRAMESIZE       20      ///< size of encoded frame
+#define FRAME_SIZE      20      ///< size of encoded frame
 #define LPC_ORDER       10      ///< order of LPC filter
 
 typedef struct RA144Context {
     AVCodecContext *avctx;
+    AudioDSPContext adsp;
     LPCContext lpc_ctx;
     AudioFrameQueue afq;
     int last_frame;
@@ -56,7 +58,9 @@ typedef struct RA144Context {
 
     /** Adaptive codebook, its size is two units bigger to avoid a
      *  buffer overflow. */
-    uint16_t adapt_cb[146+2];
+    int16_t adapt_cb[146+2];
+
+    DECLARE_ALIGNED(16, int16_t, buffer_a)[FFALIGN(BLOCKSIZE,16)];
 } RA144Context;
 
 void ff_copy_and_dup(int16_t *target, const int16_t *source, int offset);
@@ -68,8 +72,8 @@ unsigned int ff_rms(const int *data);
 int ff_interp(RA144Context *ractx, int16_t *out, int a, int copyold,
               int energy);
 unsigned int ff_rescale_rms(unsigned int rms, unsigned int energy);
-int ff_irms(const int16_t *data);
-void ff_subblock_synthesis(RA144Context *ractx, const uint16_t *lpc_coefs,
+int ff_irms(AudioDSPContext *adsp, const int16_t *data/*align 16*/);
+void ff_subblock_synthesis(RA144Context *ractx, const int16_t *lpc_coefs,
                            int cba_idx, int cb1_idx, int cb2_idx,
                            int gval, int gain);
 
diff --git a/libavcodec/ra144dec.c b/libavcodec/ra144dec.c
index 2895357..3eed17c 100644
--- a/libavcodec/ra144dec.c
+++ b/libavcodec/ra144dec.c
@@ -5,20 +5,20 @@
  * Copyright (c) 2003 Nick Kurshev
  *     Based on public domain decoder at http://www.honeypot.net/audio
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,6 +34,7 @@ static av_cold int ra144_decode_init(AVCodecContext * avctx)
     RA144Context *ractx = avctx->priv_data;
 
     ractx->avctx = avctx;
+    ff_audiodsp_init(&ractx->adsp);
 
     ractx->lpc_coef[0] = ractx->lpc_tables[0];
     ractx->lpc_coef[1] = ractx->lpc_tables[1];
@@ -45,7 +46,7 @@ static av_cold int ra144_decode_init(AVCodecContext * avctx)
     return 0;
 }
 
-static void do_output_subblock(RA144Context *ractx, const uint16_t  *lpc_coefs,
+static void do_output_subblock(RA144Context *ractx, const int16_t  *lpc_coefs,
                                int gval, GetBitContext *gb)
 {
     int cba_idx = get_bits(gb, 7); // index of the adaptive CB, 0 if none
@@ -66,7 +67,7 @@ static int ra144_decode_frame(AVCodecContext * avctx, void *data,
     int buf_size = avpkt->size;
     static const uint8_t sizes[LPC_ORDER] = {6, 5, 5, 4, 4, 3, 3, 3, 3, 2};
     unsigned int refl_rms[NBLOCKS];           // RMS of the reflection coefficients
-    uint16_t block_coefs[NBLOCKS][LPC_ORDER]; // LPC coefficients of each sub-block
+    int16_t block_coefs[NBLOCKS][LPC_ORDER];  // LPC coefficients of each sub-block
     unsigned int lpc_refl[LPC_ORDER];         // LPC reflection coefficients of the frame
     int i, j;
     int ret;
@@ -76,7 +77,7 @@ static int ra144_decode_frame(AVCodecContext * avctx, void *data,
     RA144Context *ractx = avctx->priv_data;
     GetBitContext gb;
 
-    if (buf_size < FRAMESIZE) {
+    if (buf_size < FRAME_SIZE) {
         av_log(avctx, AV_LOG_ERROR,
                "Frame too small (%d bytes). Truncated file?\n", buf_size);
         *got_frame_ptr = 0;
@@ -85,13 +86,11 @@ static int ra144_decode_frame(AVCodecContext * avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = NBLOCKS * BLOCKSIZE;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples = (int16_t *)frame->data[0];
 
-    init_get_bits(&gb, buf, FRAMESIZE * 8);
+    init_get_bits8(&gb, buf, FRAME_SIZE);
 
     for (i = 0; i < LPC_ORDER; i++)
         lpc_refl[i] = ff_lpc_refl_cb[i][get_bits(&gb, sizes[i])];
@@ -124,7 +123,7 @@ static int ra144_decode_frame(AVCodecContext * avctx, void *data,
 
     *got_frame_ptr = 1;
 
-    return FRAMESIZE;
+    return FRAME_SIZE;
 }
 
 AVCodec ff_ra_144_decoder = {
diff --git a/libavcodec/ra144enc.c b/libavcodec/ra144enc.c
index 678e668..d3a7fff 100644
--- a/libavcodec/ra144enc.c
+++ b/libavcodec/ra144enc.c
@@ -2,20 +2,20 @@
  * Real Audio 1.0 (14.4K) encoder
  * Copyright (c) 2010 Francesco Lavra <francescolavra@interfree.it>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,7 +35,6 @@
 #include "put_bits.h"
 #include "ra144.h"
 
-
 static av_cold int ra144_encode_close(AVCodecContext *avctx)
 {
     RA144Context *ractx = avctx->priv_data;
@@ -62,6 +61,7 @@ static av_cold int ra144_encode_init(AVCodecContext * avctx)
     ractx->lpc_coef[0] = ractx->lpc_tables[0];
     ractx->lpc_coef[1] = ractx->lpc_tables[1];
     ractx->avctx = avctx;
+    ff_audiodsp_init(&ractx->adsp);
     ret = ff_lpc_init(&ractx->lpc_ctx, avctx->frame_size, LPC_ORDER,
                       FF_LPC_TYPE_LEVINSON);
     if (ret < 0)
@@ -198,8 +198,8 @@ static void create_adapt_vect(float *vect, const int16_t *cb, int lag)
 static int adaptive_cb_search(const int16_t *adapt_cb, float *work,
                               const float *coefs, float *data)
 {
-    int i, best_vect;
-    float score, gain, best_score, best_gain;
+    int i, av_uninit(best_vect);
+    float score, gain, best_score, av_uninit(best_gain);
     float exc[BLOCKSIZE];
 
     gain = best_score = 0;
@@ -335,9 +335,9 @@ static void ra144_encode_subblock(RA144Context *ractx,
     float data[BLOCKSIZE] = { 0 }, work[LPC_ORDER + BLOCKSIZE];
     float coefs[LPC_ORDER];
     float zero[BLOCKSIZE], cba[BLOCKSIZE], cb1[BLOCKSIZE], cb2[BLOCKSIZE];
-    int16_t cba_vect[BLOCKSIZE];
     int cba_idx, cb1_idx, cb2_idx, gain;
-    int i, n, m[3];
+    int i, n;
+    unsigned m[3];
     float g[3];
     float error, best_error;
 
@@ -373,8 +373,8 @@ static void ra144_encode_subblock(RA144Context *ractx,
          */
         memcpy(cba, work + LPC_ORDER, sizeof(cba));
 
-        ff_copy_and_dup(cba_vect, ractx->adapt_cb, cba_idx + BLOCKSIZE / 2 - 1);
-        m[0] = (ff_irms(cba_vect) * rms) >> 12;
+        ff_copy_and_dup(ractx->buffer_a, ractx->adapt_cb, cba_idx + BLOCKSIZE / 2 - 1);
+        m[0] = (ff_irms(&ractx->adsp, ractx->buffer_a) * rms) >> 12;
     }
     fixed_cb_search(work + LPC_ORDER, coefs, data, cba_idx, &cb1_idx, &cb2_idx);
     for (i = 0; i < BLOCKSIZE; i++) {
@@ -447,10 +447,8 @@ static int ra144_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     if (ractx->last_frame)
         return 0;
 
-    if ((ret = ff_alloc_packet(avpkt, FRAMESIZE))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, FRAME_SIZE, 0)) < 0)
         return ret;
-    }
 
     /**
      * Since the LPC coefficients are calculated on a frame centered over the
@@ -538,7 +536,7 @@ static int ra144_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     ff_af_queue_remove(&ractx->afq, avctx->frame_size, &avpkt->pts,
                        &avpkt->duration);
 
-    avpkt->size = FRAMESIZE;
+    avpkt->size = FRAME_SIZE;
     *got_packet_ptr = 1;
     return 0;
 }
@@ -556,4 +554,6 @@ AVCodec ff_ra_144_encoder = {
     .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_SMALL_LAST_FRAME,
     .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
                                                      AV_SAMPLE_FMT_NONE },
+    .supported_samplerates = (const int[]){ 8000, 0 },
+    .channel_layouts = (const uint64_t[]) { AV_CH_LAYOUT_MONO, 0 },
 };
diff --git a/libavcodec/ra288.c b/libavcodec/ra288.c
index a9f1045..e141bb3 100644
--- a/libavcodec/ra288.c
+++ b/libavcodec/ra288.c
@@ -2,20 +2,20 @@
  * RealAudio 2.0 (28.8K)
  * Copyright (c) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,7 +38,7 @@
 #define RA288_BLOCKS_PER_FRAME 32
 
 typedef struct RA288Context {
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     DECLARE_ALIGNED(32, float,   sp_lpc)[FFALIGN(36, 16)];   ///< LPC coefficients for speech data (spec: A)
     DECLARE_ALIGNED(32, float, gain_lpc)[FFALIGN(10, 16)];   ///< LPC coefficients for gain        (spec: GB)
 
@@ -59,6 +59,15 @@ typedef struct RA288Context {
     float gain_rec[11];
 } RA288Context;
 
+/* Free the float DSP context that ra288_decode_init() allocates; returns 0. */
+static av_cold int ra288_decode_close(AVCodecContext *avctx)
+{
+    RA288Context *const priv = avctx->priv_data;
+
+    av_freep(&priv->fdsp);
+    return 0;
+}
+
 static av_cold int ra288_decode_init(AVCodecContext *avctx)
 {
     RA288Context *ractx = avctx->priv_data;
@@ -67,7 +76,14 @@ static av_cold int ra288_decode_init(AVCodecContext *avctx)
     avctx->channel_layout = AV_CH_LAYOUT_MONO;
     avctx->sample_fmt     = AV_SAMPLE_FMT_FLT;
 
-    avpriv_float_dsp_init(&ractx->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (avctx->block_align <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported block align\n");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    ractx->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!ractx->fdsp)
+        return AVERROR(ENOMEM);
 
     return 0;
 }
@@ -104,14 +120,14 @@ static void decode(RA288Context *ractx, float gain, int cb_coef)
     for (i=0; i < 5; i++)
         buffer[i] = codetable[cb_coef][i] * sumsum;
 
-    sum = avpriv_scalarproduct_float_c(buffer, buffer, 5) * ((1 << 24) / 5.0);
+    sum = avpriv_scalarproduct_float_c(buffer, buffer, 5);
 
-    sum = FFMAX(sum, 1);
+    sum = FFMAX(sum, 5.0 / (1<<24));
 
     /* shift and store */
     memmove(gain_block, gain_block + 1, 9 * sizeof(*gain_block));
 
-    gain_block[9] = 10 * log10(sum) - 32;
+    gain_block[9] = 10 * log10(sum) + (10*log10(((1<<24)/5.)) - 32);
 
     ff_celp_lp_synthesis_filterf(block, ractx->sp_lpc, buffer, 5, 36);
 }
@@ -139,7 +155,9 @@ static void do_hybrid_window(RA288Context *ractx,
                                             MAX_BACKWARD_FILTER_LEN   +
                                             MAX_BACKWARD_FILTER_NONREC, 16)]);
 
-    ractx->fdsp.vector_fmul(work, window, hist, FFALIGN(order + n + non_rec, 16));
+    av_assert2(order>=0);
+
+    ractx->fdsp->vector_fmul(work, window, hist, FFALIGN(order + n + non_rec, 16));
 
     convolve(buffer1, work + order    , n      , order);
     convolve(buffer2, work + order + n, non_rec, order);
@@ -166,7 +184,7 @@ static void backward_filter(RA288Context *ractx,
     do_hybrid_window(ractx, order, n, non_rec, temp, hist, rec, window);
 
     if (!compute_lpc_coefs(temp, order, lpc, 0, 1, 1))
-        ractx->fdsp.vector_fmul(lpc, lpc, tab, FFALIGN(order, 16));
+        ractx->fdsp->vector_fmul(lpc, lpc, tab, FFALIGN(order, 16));
 
     memmove(hist, hist + n, move_size*sizeof(*hist));
 }
@@ -189,16 +207,16 @@ static int ra288_decode_frame(AVCodecContext * avctx, void *data,
         return AVERROR_INVALIDDATA;
     }
 
+    ret = init_get_bits8(&gb, buf, avctx->block_align);
+    if (ret < 0)
+        return ret;
+
     /* get output buffer */
     frame->nb_samples = RA288_BLOCK_SIZE * RA288_BLOCKS_PER_FRAME;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     out = (float *)frame->data[0];
 
-    init_get_bits(&gb, buf, avctx->block_align * 8);
-
     for (i=0; i < RA288_BLOCKS_PER_FRAME; i++) {
         float gain = amptable[get_bits(&gb, 3)];
         int cb_coef = get_bits(&gb, 6 + (i&1));
@@ -230,5 +248,6 @@ AVCodec ff_ra_288_decoder = {
     .priv_data_size = sizeof(RA288Context),
     .init           = ra288_decode_init,
     .decode         = ra288_decode_frame,
+    .close          = ra288_decode_close,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
diff --git a/libavcodec/ra288.h b/libavcodec/ra288.h
index 3d6ed8d..fa0b528 100644
--- a/libavcodec/ra288.h
+++ b/libavcodec/ra288.h
@@ -2,20 +2,20 @@
  * RealAudio 2.0 (28.8K)
  * Copyright (c) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ralf.c b/libavcodec/ralf.c
index bed5adf..3f7953c 100644
--- a/libavcodec/ralf.c
+++ b/libavcodec/ralf.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -461,10 +461,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame_ptr,
     }
 
     frame->nb_samples = ctx->max_frame_size;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Me fail get_buffer()? That's unpossible!\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples0 = (int16_t *)frame->data[0];
     samples1 = (int16_t *)frame->data[1];
 
@@ -481,7 +479,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame_ptr,
     init_get_bits(&gb, src + 2, table_size);
     ctx->num_blocks = 0;
     while (get_bits_left(&gb) > 0) {
-        ctx->block_size[ctx->num_blocks] = get_bits(&gb, 15);
+        ctx->block_size[ctx->num_blocks] = get_bits(&gb, 13 + avctx->channels);
         if (get_bits1(&gb)) {
             ctx->block_pts[ctx->num_blocks] = get_bits(&gb, 9);
         } else {
diff --git a/libavcodec/ralfdata.h b/libavcodec/ralfdata.h
index 83eb970..9a84e45 100644
--- a/libavcodec/ralfdata.h
+++ b/libavcodec/ralfdata.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rangecoder-test.c b/libavcodec/rangecoder-test.c
index f4c76c0..2892949 100644
--- a/libavcodec/rangecoder-test.c
+++ b/libavcodec/rangecoder-test.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,7 +38,7 @@ int main(void)
     av_lfg_init(&prng, 1);
 
     ff_init_range_encoder(&c, b, SIZE);
-    ff_build_rac_states(&c, 0.05 * (1LL << 32), 128 + 64 + 32 + 16);
+    ff_build_rac_states(&c, (1LL << 32) / 20, 128 + 64 + 32 + 16);
 
     memset(state, 128, sizeof(state));
 
diff --git a/libavcodec/rangecoder.c b/libavcodec/rangecoder.c
index 4c4731d..9c6ef75 100644
--- a/libavcodec/rangecoder.c
+++ b/libavcodec/rangecoder.c
@@ -2,20 +2,20 @@
  * Range coder
  * Copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,6 +33,7 @@
 #include <string.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/intreadwrite.h"
 
 #include "avcodec.h"
@@ -108,8 +109,8 @@ int ff_rac_terminate(RangeCoder *c)
     c->range = 0xFF;
     renorm_encoder(c);
 
-    assert(c->low == 0);
-    assert(c->range >= 0x100);
+    av_assert1(c->low   == 0);
+    av_assert1(c->range >= 0x100);
 
     return c->bytestream - c->bytestream_start;
 }
diff --git a/libavcodec/rangecoder.h b/libavcodec/rangecoder.h
index 4c88169..d36fbd7 100644
--- a/libavcodec/rangecoder.h
+++ b/libavcodec/rangecoder.h
@@ -2,20 +2,20 @@
  * Range coder
  * Copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,9 +28,9 @@
 #define AVCODEC_RANGECODER_H
 
 #include <stdint.h>
-#include <assert.h>
 
 #include "libavutil/common.h"
+#include "libavutil/avassert.h"
 
 typedef struct RangeCoder {
     int low;
@@ -86,9 +86,9 @@ static inline void put_rac(RangeCoder *c, uint8_t *const state, int bit)
 {
     int range1 = (c->range * (*state)) >> 8;
 
-    assert(*state);
-    assert(range1 < c->range);
-    assert(range1 > 0);
+    av_assert2(*state);
+    av_assert2(range1 < c->range);
+    av_assert2(range1 > 0);
     if (!bit) {
         c->range -= range1;
         *state    = c->zero_state[*state];
diff --git a/libavcodec/ratecontrol.c b/libavcodec/ratecontrol.c
index 47a1490..3a8ac22 100644
--- a/libavcodec/ratecontrol.c
+++ b/libavcodec/ratecontrol.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,13 +35,6 @@
 #include "mpegvideo.h"
 #include "libavutil/eval.h"
 
-#undef NDEBUG // Always check asserts, the speed effect is far too small to disable them.
-#include <assert.h>
-
-#ifndef M_E
-#define M_E 2.718281828
-#endif
-
 static int init_pass2(MpegEncContext *s);
 static double get_qscale(MpegEncContext *s, RateControlEntry *rce,
                          double rate_factor, int frame_num);
@@ -50,7 +43,7 @@ void ff_write_pass1_stats(MpegEncContext *s)
 {
     snprintf(s->avctx->stats_out, 256,
              "in:%d out:%d type:%d q:%d itex:%d ptex:%d mv:%d misc:%d "
-             "fcode:%d bcode:%d mc-var:%d var:%d icount:%d skipcount:%d hbits:%d;\n",
+             "fcode:%d bcode:%d mc-var:%"PRId64" var:%"PRId64" icount:%d skipcount:%d hbits:%d;\n",
              s->current_picture_ptr->f->display_picture_number,
              s->current_picture_ptr->f->coded_picture_number,
              s->pict_type,
@@ -67,6 +60,11 @@ void ff_write_pass1_stats(MpegEncContext *s)
              s->header_bits);
 }
 
+static double get_fps(AVCodecContext *avctx)
+{
+    return 1.0 / av_q2d(avctx->time_base) / FFMAX(avctx->ticks_per_frame, 1);
+}
+
 static inline double qp2bits(RateControlEntry *rce, double qp)
 {
     if (qp <= 0.0) {
@@ -128,6 +126,13 @@ av_cold int ff_rate_control_init(MpegEncContext *s)
     };
     emms_c();
 
+    if (!s->avctx->rc_max_available_vbv_use && s->avctx->rc_buffer_size) {
+        if (s->avctx->rc_max_rate) {
+            s->avctx->rc_max_available_vbv_use = av_clipf(s->avctx->rc_max_rate/(s->avctx->rc_buffer_size*get_fps(s->avctx)), 1.0/3, 1.0);
+        } else
+            s->avctx->rc_max_available_vbv_use = 1.0;
+    }
+
     res = av_expr_parse(&rcc->rc_eq_eval,
                         s->rc_eq ? s->rc_eq : "tex^qComp",
                         const_names, func1_names, func1,
@@ -158,6 +163,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
         rcc->last_qscale_for[i] = FF_QP2LAMBDA * 5;
     }
     rcc->buffer_index = s->avctx->rc_initial_buffer_occupancy;
+    if (!rcc->buffer_index)
+        rcc->buffer_index = s->avctx->rc_buffer_size * 3 / 4;
 
     if (s->avctx->flags & AV_CODEC_FLAG_PASS2) {
         int i;
@@ -171,9 +178,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
         if (i <= 0 || i >= INT_MAX / sizeof(RateControlEntry))
             return -1;
         rcc->entry       = av_mallocz(i * sizeof(RateControlEntry));
-        rcc->num_entries = i;
         if (!rcc->entry)
             return AVERROR(ENOMEM);
+        rcc->num_entries = i;
 
         /* init all to skipped P-frames
          * (with B-frames we might have a not encoded frame at the end FIXME) */
@@ -201,11 +208,11 @@ FF_ENABLE_DEPRECATION_WARNINGS
             }
             e = sscanf(p, " in:%d ", &picture_number);
 
-            assert(picture_number >= 0);
-            assert(picture_number < rcc->num_entries);
+            av_assert0(picture_number >= 0);
+            av_assert0(picture_number < rcc->num_entries);
             rce = &rcc->entry[picture_number];
 
-            e += sscanf(p, " in:%*d out:%*d type:%d q:%f itex:%d ptex:%d mv:%d misc:%d fcode:%d bcode:%d mc-var:%d var:%d icount:%d skipcount:%d hbits:%d",
+            e += sscanf(p, " in:%*d out:%*d type:%d q:%f itex:%d ptex:%d mv:%d misc:%d fcode:%d bcode:%d mc-var:%"SCNd64" var:%"SCNd64" icount:%d skipcount:%d hbits:%d",
                         &rce->pict_type, &rce->qscale, &rce->i_tex_bits, &rce->p_tex_bits,
                         &rce->mv_bits, &rce->misc_bits,
                         &rce->f_code, &rce->b_code,
@@ -226,8 +233,12 @@ FF_ENABLE_DEPRECATION_WARNINGS
             return -1;
         }
 
+#if FF_API_RC_STRATEGY
+        av_assert0(MPV_RC_STRATEGY_XVID == FF_RC_STRATEGY_XVID);
+#endif
+
         // FIXME maybe move to end
-        if ((s->avctx->flags & AV_CODEC_FLAG_PASS2) && s->rc_strategy == 1) {
+        if ((s->avctx->flags & AV_CODEC_FLAG_PASS2) && s->rc_strategy == MPV_RC_STRATEGY_XVID) {
 #if CONFIG_LIBXVID
             return ff_xvid_rate_control_init(s);
 #else
@@ -290,7 +301,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
                 get_qscale(s, &rce, rcc->pass1_wanted_bits / rcc->pass1_rc_eq_output_sum, i);
 
                 // FIXME misbehaves a little for variable fps
-                rcc->pass1_wanted_bits += s->bit_rate / (1 / av_q2d(s->avctx->time_base));
+                rcc->pass1_wanted_bits += s->bit_rate / get_fps(s->avctx);
             }
         }
     }
@@ -307,7 +318,7 @@ av_cold void ff_rate_control_uninit(MpegEncContext *s)
     av_freep(&rcc->entry);
 
 #if CONFIG_LIBXVID
-    if ((s->avctx->flags & AV_CODEC_FLAG_PASS2) && s->rc_strategy == 1)
+    if ((s->avctx->flags & AV_CODEC_FLAG_PASS2) && s->rc_strategy == MPV_RC_STRATEGY_XVID)
         ff_xvid_rate_control_uninit(s);
 #endif
 }
@@ -315,7 +326,7 @@ av_cold void ff_rate_control_uninit(MpegEncContext *s)
 int ff_vbv_update(MpegEncContext *s, int frame_size)
 {
     RateControlContext *rcc = &s->rc_context;
-    const double fps        = 1 / av_q2d(s->avctx->time_base);
+    const double fps        = get_fps(s->avctx);
     const int buffer_size   = s->avctx->rc_buffer_size;
     const double min_rate   = s->avctx->rc_min_rate / fps;
     const double max_rate   = s->avctx->rc_max_rate / fps;
@@ -329,6 +340,9 @@ int ff_vbv_update(MpegEncContext *s, int frame_size)
         rcc->buffer_index -= frame_size;
         if (rcc->buffer_index < 0) {
             av_log(s->avctx, AV_LOG_ERROR, "rc buffer underflow\n");
+            if (frame_size > max_rate && s->qscale == s->avctx->qmax) {
+                av_log(s->avctx, AV_LOG_ERROR, "max bitrate possibly too small or try trellis with large lmax or increase qmax\n");
+            }
             rcc->buffer_index = 0;
         }
 
@@ -477,7 +491,7 @@ static void get_qminmax(int *qmin_ret, int *qmax_ret, MpegEncContext *s, int pic
     int qmin = s->lmin;
     int qmax = s->lmax;
 
-    assert(qmin <= qmax);
+    av_assert0(qmin <= qmax);
 
     switch (pict_type) {
     case AV_PICTURE_TYPE_B:
@@ -505,7 +519,7 @@ static double modify_qscale(MpegEncContext *s, RateControlEntry *rce,
 {
     RateControlContext *rcc  = &s->rc_context;
     const double buffer_size = s->avctx->rc_buffer_size;
-    const double fps         = 1 / av_q2d(s->avctx->time_base);
+    const double fps         = get_fps(s->avctx);
     const double min_rate    = s->avctx->rc_min_rate / fps;
     const double max_rate    = s->avctx->rc_max_rate / fps;
     const int pict_type      = rce->new_pict_type;
@@ -751,31 +765,37 @@ float ff_rate_estimate_qscale(MpegEncContext *s, int dry_run)
     RateControlEntry local_rce, *rce;
     double bits;
     double rate_factor;
-    int var;
+    int64_t var;
     const int pict_type = s->pict_type;
     Picture * const pic = &s->current_picture;
     emms_c();
 
 #if CONFIG_LIBXVID
-    if ((s->avctx->flags & AV_CODEC_FLAG_PASS2) && s->rc_strategy == 1)
+    if ((s->avctx->flags & AV_CODEC_FLAG_PASS2) && s->rc_strategy == MPV_RC_STRATEGY_XVID)
         return ff_xvid_rate_estimate_qscale(s, dry_run);
 #endif
 
     get_qminmax(&qmin, &qmax, s, pict_type);
 
-    fps = 1 / av_q2d(s->avctx->time_base);
+    fps = get_fps(s->avctx);
     /* update predictors */
     if (picture_number > 2 && !dry_run) {
-        const int last_var = s->last_pict_type == AV_PICTURE_TYPE_I ? rcc->last_mb_var_sum
-                                                                    : rcc->last_mc_mb_var_sum;
+        const int64_t last_var =
+            s->last_pict_type == AV_PICTURE_TYPE_I ? rcc->last_mb_var_sum
+                                                   : rcc->last_mc_mb_var_sum;
+        av_assert1(s->frame_bits >= s->stuffing_bits);
         update_predictor(&rcc->pred[s->last_pict_type],
                          rcc->last_qscale,
-                         sqrt(last_var), s->frame_bits);
+                         sqrt(last_var),
+                         s->frame_bits - s->stuffing_bits);
     }
 
     if (s->avctx->flags & AV_CODEC_FLAG_PASS2) {
-        assert(picture_number >= 0);
-        assert(picture_number < rcc->num_entries);
+        av_assert0(picture_number >= 0);
+        if (picture_number >= rcc->num_entries) {
+            av_log(s, AV_LOG_ERROR, "Input is longer than 2-pass log file\n");
+            return -1;
+        }
         rce         = &rcc->entry[picture_number];
         wanted_bits = rce->expected_bits;
     } else {
@@ -806,10 +826,10 @@ float ff_rate_estimate_qscale(MpegEncContext *s, int dry_run)
     short_term_q = 0; /* avoid warning */
     if (s->avctx->flags & AV_CODEC_FLAG_PASS2) {
         if (pict_type != AV_PICTURE_TYPE_I)
-            assert(pict_type == rce->new_pict_type);
+            av_assert0(pict_type == rce->new_pict_type);
 
         q = rce->new_qscale / br_compensation;
-        ff_dlog(s, "%f %f %f last:%d var:%d type:%d//\n", q, rce->new_qscale,
+        ff_dlog(s, "%f %f %f last:%d var:%"PRId64" type:%d//\n", q, rce->new_qscale,
                 br_compensation, s->frame_bits, var, pict_type);
     } else {
         rce->pict_type     =
@@ -838,7 +858,6 @@ float ff_rate_estimate_qscale(MpegEncContext *s, int dry_run)
         rcc->mv_bits_sum[pict_type] += rce->mv_bits;
         rcc->frame_count[pict_type]++;
 
-        bits        = rce->i_tex_bits + rce->p_tex_bits;
         rate_factor = rcc->pass1_wanted_bits /
                       rcc->pass1_rc_eq_output_sum * br_compensation;
 
@@ -846,9 +865,9 @@ float ff_rate_estimate_qscale(MpegEncContext *s, int dry_run)
         if (q < 0)
             return -1;
 
-        assert(q > 0.0);
+        av_assert0(q > 0.0);
         q = get_diff_limited_q(s, rce, q);
-        assert(q > 0.0);
+        av_assert0(q > 0.0);
 
         // FIXME type dependent blur like in 2-pass
         if (pict_type == AV_PICTURE_TYPE_P || s->intra_only) {
@@ -859,19 +878,19 @@ float ff_rate_estimate_qscale(MpegEncContext *s, int dry_run)
             rcc->short_term_qcount++;
             q = short_term_q = rcc->short_term_qsum / rcc->short_term_qcount;
         }
-        assert(q > 0.0);
+        av_assert0(q > 0.0);
 
         q = modify_qscale(s, rce, q, picture_number);
 
         rcc->pass1_wanted_bits += s->bit_rate / fps;
 
-        assert(q > 0.0);
+        av_assert0(q > 0.0);
     }
 
     if (s->avctx->debug & FF_DEBUG_RC) {
         av_log(s->avctx, AV_LOG_DEBUG,
                "%c qp:%d<%2.1f<%d %d want:%d total:%d comp:%f st_q:%2.2f "
-               "size:%d var:%d/%d br:%d fps:%d\n",
+               "size:%d var:%"PRId64"/%"PRId64" br:%"PRId64" fps:%d\n",
                av_get_picture_type_char(pict_type),
                qmin, q, qmax, picture_number,
                (int)wanted_bits / 1000, (int)s->total_bits / 1000,
@@ -906,7 +925,7 @@ static int init_pass2(MpegEncContext *s)
     RateControlContext *rcc = &s->rc_context;
     AVCodecContext *a       = s->avctx;
     int i, toobig;
-    double fps             = 1 / av_q2d(s->avctx->time_base);
+    double fps             = get_fps(s->avctx);
     double complexity[5]   = { 0 }; // approximate bits at quant=1
     uint64_t const_bits[5] = { 0 }; // quantizer independent bits
     uint64_t all_const_bits;
@@ -915,7 +934,7 @@ static int init_pass2(MpegEncContext *s)
     double rate_factor          = 0;
     double step;
     const int filter_size = (int)(a->qblur * 4) | 1;
-    double expected_bits;
+    double expected_bits = 0; // init to silence gcc warning
     double *qscale, *blurred_qscale, qscale_sum;
 
     /* find complexity & const_bits & decide the pict_types */
@@ -942,8 +961,8 @@ static int init_pass2(MpegEncContext *s)
         return -1;
     }
 
-    qscale         = av_malloc(sizeof(double) * rcc->num_entries);
-    blurred_qscale = av_malloc(sizeof(double) * rcc->num_entries);
+    qscale         = av_malloc_array(rcc->num_entries, sizeof(double));
+    blurred_qscale = av_malloc_array(rcc->num_entries, sizeof(double));
     if (!qscale || !blurred_qscale) {
         av_free(qscale);
         av_free(blurred_qscale);
@@ -964,9 +983,15 @@ static int init_pass2(MpegEncContext *s)
             qscale[i] = get_qscale(s, &rcc->entry[i], rate_factor, i);
             rcc->last_qscale_for[rce->pict_type] = qscale[i];
         }
-        assert(filter_size % 2 == 1);
+        av_assert0(filter_size % 2 == 1);
 
         /* fixed I/B QP relative to P mode */
+        for (i = FFMAX(0, rcc->num_entries - 300); i < rcc->num_entries; i++) {
+            RateControlEntry *rce = &rcc->entry[i];
+
+            qscale[i] = get_diff_limited_q(s, rce, qscale[i]);
+        }
+
         for (i = rcc->num_entries - 1; i >= 0; i--) {
             RateControlEntry *rce = &rcc->entry[i];
 
@@ -1030,11 +1055,11 @@ static int init_pass2(MpegEncContext *s)
         qscale_sum += av_clip(rcc->entry[i].new_qscale / FF_QP2LAMBDA,
                               s->avctx->qmin, s->avctx->qmax);
     }
-    assert(toobig <= 40);
+    av_assert0(toobig <= 40);
     av_log(s->avctx, AV_LOG_DEBUG,
-           "[lavc rc] requested bitrate: %d bps  expected bitrate: %d bps\n",
+           "[lavc rc] requested bitrate: %"PRId64" bps  expected bitrate: %"PRId64" bps\n",
            s->bit_rate,
-           (int)(expected_bits / ((double)all_available_bits / s->bit_rate)));
+           (int64_t)(expected_bits / ((double)all_available_bits / s->bit_rate)));
     av_log(s->avctx, AV_LOG_DEBUG,
            "[lavc rc] estimated target average qp: %.3f\n",
            (float)qscale_sum / rcc->num_entries);
diff --git a/libavcodec/ratecontrol.h b/libavcodec/ratecontrol.h
index 3bcf38e..c15f9e2 100644
--- a/libavcodec/ratecontrol.h
+++ b/libavcodec/ratecontrol.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,8 +49,8 @@ typedef struct RateControlEntry{
     uint64_t expected_bits;
     int new_pict_type;
     float new_qscale;
-    int mc_mb_var_sum;
-    int mb_var_sum;
+    int64_t mc_mb_var_sum;
+    int64_t mb_var_sum;
     int i_count;
     int skip_count;
     int f_code;
@@ -71,8 +71,8 @@ typedef struct RateControlContext{
     double pass1_wanted_bits;     ///< bits which should have been output by the pass1 code (including complexity init)
     double last_qscale;
     double last_qscale_for[5];    ///< last qscale for a specific pict type, used for max_diff & ipb factor stuff
-    int last_mc_mb_var_sum;
-    int last_mb_var_sum;
+    int64_t last_mc_mb_var_sum;
+    int64_t last_mb_var_sum;
     uint64_t i_cplx_sum[5];
     uint64_t p_cplx_sum[5];
     uint64_t mv_bits_sum[5];
diff --git a/libavcodec/raw.c b/libavcodec/raw.c
index 67dff9b..bfa2537 100644
--- a/libavcodec/raw.c
+++ b/libavcodec/raw.c
@@ -2,20 +2,20 @@
  * Raw Video Codec
  * Copyright (c) 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -66,6 +66,7 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
     { AV_PIX_FMT_UYVY422, MKTAG('A', 'V', 'u', 'p') },
     { AV_PIX_FMT_UYVY422, MKTAG('V', 'D', 'T', 'Z') }, /* SoftLab-NSK VideoTizer */
     { AV_PIX_FMT_UYVY422, MKTAG('a', 'u', 'v', '2') },
+    { AV_PIX_FMT_UYVY422, MKTAG('c', 'y', 'u', 'v') }, /* CYUV is also Creative YUV */
     { AV_PIX_FMT_UYYVYY411, MKTAG('Y', '4', '1', '1') },
     { AV_PIX_FMT_GRAY8,   MKTAG('G', 'R', 'E', 'Y') },
     { AV_PIX_FMT_NV12,    MKTAG('N', 'V', '1', '2') },
@@ -84,14 +85,18 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
     { AV_PIX_FMT_BGR444LE, MKTAG('B', 'G', 'R', 12) },
     { AV_PIX_FMT_RGB444BE, MKTAG(12 , 'B', 'G', 'R') },
     { AV_PIX_FMT_BGR444BE, MKTAG(12 , 'R', 'G', 'B') },
-    { AV_PIX_FMT_RGBA,     MKTAG('R', 'G', 'B', 'A') },
-    { AV_PIX_FMT_BGRA,     MKTAG('B', 'G', 'R', 'A') },
     { AV_PIX_FMT_RGBA64LE, MKTAG('R', 'B', 'A', 64 ) },
     { AV_PIX_FMT_BGRA64LE, MKTAG('B', 'R', 'A', 64 ) },
     { AV_PIX_FMT_RGBA64BE, MKTAG(64 , 'R', 'B', 'A') },
     { AV_PIX_FMT_BGRA64BE, MKTAG(64 , 'B', 'R', 'A') },
+    { AV_PIX_FMT_RGBA,     MKTAG('R', 'G', 'B', 'A') },
+    { AV_PIX_FMT_RGB0,     MKTAG('R', 'G', 'B',  0 ) },
+    { AV_PIX_FMT_BGRA,     MKTAG('B', 'G', 'R', 'A') },
+    { AV_PIX_FMT_BGR0,     MKTAG('B', 'G', 'R',  0 ) },
     { AV_PIX_FMT_ABGR,     MKTAG('A', 'B', 'G', 'R') },
+    { AV_PIX_FMT_0BGR,     MKTAG( 0 , 'B', 'G', 'R') },
     { AV_PIX_FMT_ARGB,     MKTAG('A', 'R', 'G', 'B') },
+    { AV_PIX_FMT_0RGB,     MKTAG( 0 , 'R', 'G', 'B') },
     { AV_PIX_FMT_RGB24,    MKTAG('R', 'G', 'B', 24 ) },
     { AV_PIX_FMT_BGR24,    MKTAG('B', 'G', 'R', 24 ) },
     { AV_PIX_FMT_YUV411P,  MKTAG('4', '1', '1', 'P') },
@@ -115,12 +120,30 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
     { AV_PIX_FMT_BGR48BE,  MKTAG( 48, 'B', 'G', 'R') },
     { AV_PIX_FMT_GRAY16LE,    MKTAG('Y', '1',  0 , 16 ) },
     { AV_PIX_FMT_GRAY16BE,    MKTAG(16 ,  0 , '1', 'Y') },
+    { AV_PIX_FMT_YUV420P9LE,  MKTAG('Y', '3', 11 ,  9 ) },
+    { AV_PIX_FMT_YUV420P9BE,  MKTAG( 9 , 11 , '3', 'Y') },
+    { AV_PIX_FMT_YUV422P9LE,  MKTAG('Y', '3', 10 ,  9 ) },
+    { AV_PIX_FMT_YUV422P9BE,  MKTAG( 9 , 10 , '3', 'Y') },
+    { AV_PIX_FMT_YUV444P9LE,  MKTAG('Y', '3',  0 ,  9 ) },
+    { AV_PIX_FMT_YUV444P9BE,  MKTAG( 9 ,  0 , '3', 'Y') },
     { AV_PIX_FMT_YUV420P10LE, MKTAG('Y', '3', 11 , 10 ) },
     { AV_PIX_FMT_YUV420P10BE, MKTAG(10 , 11 , '3', 'Y') },
     { AV_PIX_FMT_YUV422P10LE, MKTAG('Y', '3', 10 , 10 ) },
     { AV_PIX_FMT_YUV422P10BE, MKTAG(10 , 10 , '3', 'Y') },
     { AV_PIX_FMT_YUV444P10LE, MKTAG('Y', '3',  0 , 10 ) },
     { AV_PIX_FMT_YUV444P10BE, MKTAG(10 ,  0 , '3', 'Y') },
+    { AV_PIX_FMT_YUV420P12LE, MKTAG('Y', '3', 11 , 12 ) },
+    { AV_PIX_FMT_YUV420P12BE, MKTAG(12 , 11 , '3', 'Y') },
+    { AV_PIX_FMT_YUV422P12LE, MKTAG('Y', '3', 10 , 12 ) },
+    { AV_PIX_FMT_YUV422P12BE, MKTAG(12 , 10 , '3', 'Y') },
+    { AV_PIX_FMT_YUV444P12LE, MKTAG('Y', '3',  0 , 12 ) },
+    { AV_PIX_FMT_YUV444P12BE, MKTAG(12 ,  0 , '3', 'Y') },
+    { AV_PIX_FMT_YUV420P14LE, MKTAG('Y', '3', 11 , 14 ) },
+    { AV_PIX_FMT_YUV420P14BE, MKTAG(14 , 11 , '3', 'Y') },
+    { AV_PIX_FMT_YUV422P14LE, MKTAG('Y', '3', 10 , 14 ) },
+    { AV_PIX_FMT_YUV422P14BE, MKTAG(14 , 10 , '3', 'Y') },
+    { AV_PIX_FMT_YUV444P14LE, MKTAG('Y', '3',  0 , 14 ) },
+    { AV_PIX_FMT_YUV444P14BE, MKTAG(14 ,  0 , '3', 'Y') },
     { AV_PIX_FMT_YUV420P16LE, MKTAG('Y', '3', 11 , 16 ) },
     { AV_PIX_FMT_YUV420P16BE, MKTAG(16 , 11 , '3', 'Y') },
     { AV_PIX_FMT_YUV422P16LE, MKTAG('Y', '3', 10 , 16 ) },
@@ -128,7 +151,10 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
     { AV_PIX_FMT_YUV444P16LE, MKTAG('Y', '3',  0 , 16 ) },
     { AV_PIX_FMT_YUV444P16BE, MKTAG(16 ,  0 , '3', 'Y') },
     { AV_PIX_FMT_YUVA420P,    MKTAG('Y', '4', 11 ,  8 ) },
+    { AV_PIX_FMT_YUVA422P,    MKTAG('Y', '4', 10 ,  8 ) },
+    { AV_PIX_FMT_YUVA444P,    MKTAG('Y', '4',  0 ,  8 ) },
     { AV_PIX_FMT_YA8,         MKTAG('Y', '2',  0 ,  8 ) },
+    { AV_PIX_FMT_PAL8,        MKTAG('P', 'A', 'L',  8 ) },
 
     { AV_PIX_FMT_YUVA420P9LE,  MKTAG('Y', '4', 11 ,  9 ) },
     { AV_PIX_FMT_YUVA420P9BE,  MKTAG( 9 , 11 , '4', 'Y') },
@@ -149,10 +175,41 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
     { AV_PIX_FMT_YUVA444P16LE, MKTAG('Y', '4',  0 , 16 ) },
     { AV_PIX_FMT_YUVA444P16BE, MKTAG(16 ,  0 , '4', 'Y') },
 
+    { AV_PIX_FMT_GBRP,         MKTAG('G', '3', 00 ,  8 ) },
+    { AV_PIX_FMT_GBRP9LE,      MKTAG('G', '3', 00 ,  9 ) },
+    { AV_PIX_FMT_GBRP9BE,      MKTAG( 9 , 00 , '3', 'G') },
+    { AV_PIX_FMT_GBRP10LE,     MKTAG('G', '3', 00 , 10 ) },
+    { AV_PIX_FMT_GBRP10BE,     MKTAG(10 , 00 , '3', 'G') },
+    { AV_PIX_FMT_GBRP12LE,     MKTAG('G', '3', 00 , 12 ) },
+    { AV_PIX_FMT_GBRP12BE,     MKTAG(12 , 00 , '3', 'G') },
+    { AV_PIX_FMT_GBRP14LE,     MKTAG('G', '3', 00 , 14 ) },
+    { AV_PIX_FMT_GBRP14BE,     MKTAG(14 , 00 , '3', 'G') },
+    { AV_PIX_FMT_GBRP16LE,     MKTAG('G', '3', 00 , 16 ) },
+    { AV_PIX_FMT_GBRP16BE,     MKTAG(16 , 00 , '3', 'G') },
+
+    { AV_PIX_FMT_XYZ12LE,      MKTAG('X', 'Y', 'Z' , 36 ) },
+    { AV_PIX_FMT_XYZ12BE,      MKTAG(36 , 'Z' , 'Y', 'X') },
+
+    { AV_PIX_FMT_BAYER_BGGR8,    MKTAG(0xBA, 'B', 'G', 8   ) },
+    { AV_PIX_FMT_BAYER_BGGR16LE, MKTAG(0xBA, 'B', 'G', 16  ) },
+    { AV_PIX_FMT_BAYER_BGGR16BE, MKTAG(16  , 'G', 'B', 0xBA) },
+    { AV_PIX_FMT_BAYER_RGGB8,    MKTAG(0xBA, 'R', 'G', 8   ) },
+    { AV_PIX_FMT_BAYER_RGGB16LE, MKTAG(0xBA, 'R', 'G', 16  ) },
+    { AV_PIX_FMT_BAYER_RGGB16BE, MKTAG(16  , 'G', 'R', 0xBA) },
+    { AV_PIX_FMT_BAYER_GBRG8,    MKTAG(0xBA, 'G', 'B', 8   ) },
+    { AV_PIX_FMT_BAYER_GBRG16LE, MKTAG(0xBA, 'G', 'B', 16  ) },
+    { AV_PIX_FMT_BAYER_GBRG16BE, MKTAG(16,   'B', 'G', 0xBA) },
+    { AV_PIX_FMT_BAYER_GRBG8,    MKTAG(0xBA, 'G', 'R', 8   ) },
+    { AV_PIX_FMT_BAYER_GRBG16LE, MKTAG(0xBA, 'G', 'R', 16  ) },
+    { AV_PIX_FMT_BAYER_GRBG16BE, MKTAG(16,   'R', 'G', 0xBA) },
+
     /* quicktime */
+    { AV_PIX_FMT_YUV420P, MKTAG('R', '4', '2', '0') }, /* Radius DV YUV PAL */
+    { AV_PIX_FMT_YUV411P, MKTAG('R', '4', '1', '1') }, /* Radius DV YUV NTSC */
     { AV_PIX_FMT_UYVY422, MKTAG('2', 'v', 'u', 'y') },
     { AV_PIX_FMT_UYVY422, MKTAG('2', 'V', 'u', 'y') },
     { AV_PIX_FMT_UYVY422, MKTAG('A', 'V', 'U', 'I') }, /* FIXME merge both fields */
+    { AV_PIX_FMT_UYVY422, MKTAG('b', 'x', 'y', 'v') },
     { AV_PIX_FMT_YUYV422, MKTAG('y', 'u', 'v', '2') },
     { AV_PIX_FMT_YUYV422, MKTAG('y', 'u', 'v', 's') },
     { AV_PIX_FMT_YUYV422, MKTAG('D', 'V', 'O', 'O') }, /* Digital Voodoo SD 8 Bit */
@@ -160,12 +217,48 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
     { AV_PIX_FMT_RGB565LE,MKTAG('L', '5', '6', '5') },
     { AV_PIX_FMT_RGB565BE,MKTAG('B', '5', '6', '5') },
     { AV_PIX_FMT_BGR24,   MKTAG('2', '4', 'B', 'G') },
+    { AV_PIX_FMT_BGR24,   MKTAG('b', 'x', 'b', 'g') },
     { AV_PIX_FMT_BGRA,    MKTAG('B', 'G', 'R', 'A') },
     { AV_PIX_FMT_RGBA,    MKTAG('R', 'G', 'B', 'A') },
+    { AV_PIX_FMT_RGB24,   MKTAG('b', 'x', 'r', 'g') },
     { AV_PIX_FMT_ABGR,    MKTAG('A', 'B', 'G', 'R') },
     { AV_PIX_FMT_GRAY16BE,MKTAG('b', '1', '6', 'g') },
     { AV_PIX_FMT_RGB48BE, MKTAG('b', '4', '8', 'r') },
 
+    /* vlc */
+    { AV_PIX_FMT_YUV410P,     MKTAG('I', '4', '1', '0') },
+    { AV_PIX_FMT_YUV411P,     MKTAG('I', '4', '1', '1') },
+    { AV_PIX_FMT_YUV422P,     MKTAG('I', '4', '2', '2') },
+    { AV_PIX_FMT_YUV440P,     MKTAG('I', '4', '4', '0') },
+    { AV_PIX_FMT_YUV444P,     MKTAG('I', '4', '4', '4') },
+    { AV_PIX_FMT_YUVJ420P,    MKTAG('J', '4', '2', '0') },
+    { AV_PIX_FMT_YUVJ422P,    MKTAG('J', '4', '2', '2') },
+    { AV_PIX_FMT_YUVJ440P,    MKTAG('J', '4', '4', '0') },
+    { AV_PIX_FMT_YUVJ444P,    MKTAG('J', '4', '4', '4') },
+    { AV_PIX_FMT_YUVA444P,    MKTAG('Y', 'U', 'V', 'A') },
+    { AV_PIX_FMT_YUVA420P,    MKTAG('I', '4', '0', 'A') },
+    { AV_PIX_FMT_YUVA422P,    MKTAG('I', '4', '2', 'A') },
+    { AV_PIX_FMT_RGB8,        MKTAG('R', 'G', 'B', '2') },
+    { AV_PIX_FMT_RGB555LE,    MKTAG('R', 'V', '1', '5') },
+    { AV_PIX_FMT_RGB565LE,    MKTAG('R', 'V', '1', '6') },
+    { AV_PIX_FMT_BGR24,       MKTAG('R', 'V', '2', '4') },
+    { AV_PIX_FMT_BGR0,        MKTAG('R', 'V', '3', '2') },
+    { AV_PIX_FMT_RGBA,        MKTAG('A', 'V', '3', '2') },
+    { AV_PIX_FMT_YUV420P9LE,  MKTAG('I', '0', '9', 'L') },
+    { AV_PIX_FMT_YUV420P9BE,  MKTAG('I', '0', '9', 'B') },
+    { AV_PIX_FMT_YUV422P9LE,  MKTAG('I', '2', '9', 'L') },
+    { AV_PIX_FMT_YUV422P9BE,  MKTAG('I', '2', '9', 'B') },
+    { AV_PIX_FMT_YUV444P9LE,  MKTAG('I', '4', '9', 'L') },
+    { AV_PIX_FMT_YUV444P9BE,  MKTAG('I', '4', '9', 'B') },
+    { AV_PIX_FMT_YUV420P10LE, MKTAG('I', '0', 'A', 'L') },
+    { AV_PIX_FMT_YUV420P10BE, MKTAG('I', '0', 'A', 'B') },
+    { AV_PIX_FMT_YUV422P10LE, MKTAG('I', '2', 'A', 'L') },
+    { AV_PIX_FMT_YUV422P10BE, MKTAG('I', '2', 'A', 'B') },
+    { AV_PIX_FMT_YUV444P10LE, MKTAG('I', '4', 'A', 'L') },
+    { AV_PIX_FMT_YUV444P10BE, MKTAG('I', '4', 'A', 'B') },
+    { AV_PIX_FMT_YUV444P16LE, MKTAG('I', '4', 'F', 'L') },
+    { AV_PIX_FMT_YUV444P16BE, MKTAG('I', '4', 'F', 'B') },
+
     /* special */
     { AV_PIX_FMT_RGB565LE,MKTAG( 3 ,  0 ,  0 ,  0 ) }, /* flipped RGB565LE */
     { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */
@@ -173,6 +266,11 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
     { AV_PIX_FMT_NONE, 0 },
 };
 
+const struct PixelFormatTag *avpriv_get_raw_pix_fmt_tags(void)
+{
+    return ff_raw_pix_fmt_tags; /* accessor: hands out the ff_raw_pix_fmt_tags[] table of this file */
+}
+
 unsigned int avcodec_pix_fmt_to_codec_tag(enum AVPixelFormat fmt)
 {
     const PixelFormatTag *tags = ff_raw_pix_fmt_tags;
@@ -183,3 +281,28 @@ unsigned int avcodec_pix_fmt_to_codec_tag(enum AVPixelFormat fmt)
     }
     return 0;
 }
+
+const PixelFormatTag avpriv_pix_fmt_bps_avi[] = { /* { pix_fmt, bits per coded sample }; the depth is stored in the .fourcc field */
+    { AV_PIX_FMT_PAL8,    1 },
+    { AV_PIX_FMT_PAL8,    2 },
+    { AV_PIX_FMT_PAL8,    4 },
+    { AV_PIX_FMT_PAL8,    8 },
+    { AV_PIX_FMT_RGB444LE, 12 },
+    { AV_PIX_FMT_RGB555LE, 15 },
+    { AV_PIX_FMT_RGB555LE, 16 },
+    { AV_PIX_FMT_BGR24,  24 },
+    { AV_PIX_FMT_BGRA,   32 },
+    { AV_PIX_FMT_NONE,    0 }, /* end-of-table entry (presumably what avpriv_find_pix_fmt() stops on - confirm) */
+};
+
+const PixelFormatTag avpriv_pix_fmt_bps_mov[] = { /* { pix_fmt, bits per coded sample }; the depth is stored in the .fourcc field */
+    { AV_PIX_FMT_PAL8,      1 },
+    { AV_PIX_FMT_PAL8,      2 },
+    { AV_PIX_FMT_PAL8,      4 },
+    { AV_PIX_FMT_PAL8,      8 },
+    { AV_PIX_FMT_RGB555BE, 16 },
+    { AV_PIX_FMT_RGB24,    24 },
+    { AV_PIX_FMT_ARGB,     32 },
+    { AV_PIX_FMT_PAL8,     33 },
+    { AV_PIX_FMT_NONE,      0 }, /* end-of-table entry (presumably what avpriv_find_pix_fmt() stops on - confirm) */
+};
diff --git a/libavcodec/raw.h b/libavcodec/raw.h
index bf66671..24bf4cc 100644
--- a/libavcodec/raw.h
+++ b/libavcodec/raw.h
@@ -2,20 +2,20 @@
  * Raw Video Codec
  * Copyright (c) 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,12 +28,20 @@
 #define AVCODEC_RAW_H
 
 #include "avcodec.h"
+#include "libavutil/internal.h"
 
 typedef struct PixelFormatTag {
     enum AVPixelFormat pix_fmt;
     unsigned int fourcc;
 } PixelFormatTag;
 
-extern const PixelFormatTag ff_raw_pix_fmt_tags[];
+extern const PixelFormatTag ff_raw_pix_fmt_tags[]; // exposed through avpriv_get_raw_pix_fmt_tags()
+
+const struct PixelFormatTag *avpriv_get_raw_pix_fmt_tags(void);
+
+enum AVPixelFormat avpriv_find_pix_fmt(const PixelFormatTag *tags, unsigned int fourcc);
+
+extern av_export const PixelFormatTag avpriv_pix_fmt_bps_avi[];
+extern av_export const PixelFormatTag avpriv_pix_fmt_bps_mov[];
 
 #endif /* AVCODEC_RAW_H */
diff --git a/libavcodec/rawdec.c b/libavcodec/rawdec.c
index 284c345..765e567 100644
--- a/libavcodec/rawdec.c
+++ b/libavcodec/rawdec.c
@@ -2,20 +2,20 @@
  * Raw Video Decoder
  * Copyright (c) 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,72 +25,65 @@
  */
 
 #include "avcodec.h"
+#include "bswapdsp.h"
+#include "get_bits.h"
 #include "internal.h"
 #include "raw.h"
+#include "libavutil/avassert.h"
 #include "libavutil/buffer.h"
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/imgutils.h"
+#include "libavutil/opt.h"
 
 typedef struct RawVideoContext {
+    AVClass *av_class; // class pointer for the private options (rawdec_class); presumably must stay the first member - confirm
     AVBufferRef *palette;
     int frame_size;  /* size of the frame in bytes */
     int flip;
-    int is_2_4_bpp; // 2 or 4 bpp raw in avi/mov
+    int is_1_2_4_8_bpp; // 1, 2, 4 and 8 bpp in avi/mov, 1 and 8 bpp in nut
+    int is_mono;        // pix_fmt is MONOWHITE or MONOBLACK
+    int is_pal8;        // pix_fmt is PAL8
+    int is_nut_mono;    // codec_tag is 'B1W0' or 'B0W1'
+    int is_nut_pal8;    // codec_tag is MKTAG('P','A','L',8)
     int is_yuv2;
+    int is_lt_16bpp; // 16bpp pixfmt and bits_per_coded_sample < 16
+    int tff;         // "top" option: negative = not set, otherwise the frame is flagged interlaced with this top_field_first
+
+    BswapDSPContext bbdsp;           // byte-swap routines, used on packed input that carries a swap tag
+    void *bitstream_buf;             // scratch buffer holding the byte-swapped copy of the packet
+    unsigned int bitstream_buf_size; // size bookkeeping for av_fast_padded_malloc() on bitstream_buf
 } RawVideoContext;
 
-static const PixelFormatTag pix_fmt_bps_avi[] = {
-    { AV_PIX_FMT_PAL8,    4 },
-    { AV_PIX_FMT_PAL8,    8 },
-    { AV_PIX_FMT_RGB444, 12 },
-    { AV_PIX_FMT_RGB555, 15 },
-    { AV_PIX_FMT_RGB555, 16 },
-    { AV_PIX_FMT_BGR24,  24 },
-    { AV_PIX_FMT_RGB32,  32 },
-    { AV_PIX_FMT_NONE,    0 },
+static const AVOption options[]={ /* private decoder options; "top" is stored in RawVideoContext.tff, default -1, range -1..1 */
+{"top", "top field first", offsetof(RawVideoContext, tff), AV_OPT_TYPE_BOOL, {.i64 = -1}, -1, 1, AV_OPT_FLAG_DECODING_PARAM|AV_OPT_FLAG_VIDEO_PARAM},
+{NULL}
 };
 
-static const PixelFormatTag pix_fmt_bps_mov[] = {
-    { AV_PIX_FMT_MONOWHITE, 1 },
-    { AV_PIX_FMT_PAL8,      2 },
-    { AV_PIX_FMT_PAL8,      4 },
-    { AV_PIX_FMT_PAL8,      8 },
-    // FIXME swscale does not support 16 bit in .mov, sample 16bit.mov
-    // http://developer.apple.com/documentation/QuickTime/QTFF/QTFFChap3/qtff3.html
-    { AV_PIX_FMT_RGB555BE, 16 },
-    { AV_PIX_FMT_RGB24,    24 },
-    { AV_PIX_FMT_ARGB,     32 },
-    { AV_PIX_FMT_MONOWHITE,33 },
-    { AV_PIX_FMT_NONE,      0 },
+static const AVClass rawdec_class = { /* ties options[] to the decoder; installed as ff_rawvideo_decoder.priv_class */
+    .class_name = "rawdec",
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
 };
 
-static enum AVPixelFormat find_pix_fmt(const PixelFormatTag *tags,
-                                       unsigned int fourcc)
-{
-    while (tags->pix_fmt >= 0) {
-        if (tags->fourcc == fourcc)
-            return tags->pix_fmt;
-        tags++;
-    }
-    return AV_PIX_FMT_YUV420P;
-}
-
 static av_cold int raw_init_decoder(AVCodecContext *avctx)
 {
     RawVideoContext *context = avctx->priv_data;
     const AVPixFmtDescriptor *desc;
 
-    if (avctx->codec_tag == MKTAG('r', 'a', 'w', ' '))
-        avctx->pix_fmt = find_pix_fmt(pix_fmt_bps_mov,
+    ff_bswapdsp_init(&context->bbdsp);
+
+    if (   avctx->codec_tag == MKTAG('r','a','w',' ')
+        || avctx->codec_tag == MKTAG('N','O','1','6'))
+        avctx->pix_fmt = avpriv_find_pix_fmt(avpriv_pix_fmt_bps_mov,
                                       avctx->bits_per_coded_sample);
     else if (avctx->codec_tag == MKTAG('W', 'R', 'A', 'W'))
-        avctx->pix_fmt = find_pix_fmt(pix_fmt_bps_avi,
+        avctx->pix_fmt = avpriv_find_pix_fmt(avpriv_pix_fmt_bps_avi,
                                       avctx->bits_per_coded_sample);
-    else if (avctx->codec_tag)
-        avctx->pix_fmt = find_pix_fmt(ff_raw_pix_fmt_tags, avctx->codec_tag);
+    else if (avctx->codec_tag && (avctx->codec_tag & 0xFFFFFF) != MKTAG('B','I','T', 0))
+        avctx->pix_fmt = avpriv_find_pix_fmt(ff_raw_pix_fmt_tags, avctx->codec_tag);
     else if (avctx->pix_fmt == AV_PIX_FMT_NONE && avctx->bits_per_coded_sample)
-        avctx->pix_fmt = find_pix_fmt(pix_fmt_bps_avi,
+        avctx->pix_fmt = avpriv_find_pix_fmt(avpriv_pix_fmt_bps_avi,
                                       avctx->bits_per_coded_sample);
 
     desc = av_pix_fmt_desc_get(avctx->pix_fmt);
@@ -105,25 +98,32 @@ static av_cold int raw_init_decoder(AVCodecContext *avctx)
             return AVERROR(ENOMEM);
         if (desc->flags & AV_PIX_FMT_FLAG_PSEUDOPAL)
             avpriv_set_systematic_pal2((uint32_t*)context->palette->data, avctx->pix_fmt);
-        else
+        else {
             memset(context->palette->data, 0, AVPALETTE_SIZE);
+            if (avctx->bits_per_coded_sample == 1)
+                memset(context->palette->data, 0xff, 4);
+        }
     }
 
-    context->frame_size = av_image_get_buffer_size(avctx->pix_fmt,
-                                                   avctx->width,
-                                                   avctx->height, 1);
-
-    if ((avctx->bits_per_coded_sample == 4 || avctx->bits_per_coded_sample == 2) &&
-        avctx->pix_fmt == AV_PIX_FMT_PAL8 &&
-       (!avctx->codec_tag || avctx->codec_tag == MKTAG('r','a','w',' ')))
-        context->is_2_4_bpp = 1;
-
     if ((avctx->extradata_size >= 9 &&
          !memcmp(avctx->extradata + avctx->extradata_size - 9, "BottomUp", 9)) ||
+        avctx->codec_tag == MKTAG('c','y','u','v') ||
         avctx->codec_tag == MKTAG(3, 0, 0, 0) ||
         avctx->codec_tag == MKTAG('W','R','A','W'))
         context->flip = 1;
 
+    if (avctx->pix_fmt == AV_PIX_FMT_MONOWHITE ||
+        avctx->pix_fmt == AV_PIX_FMT_MONOBLACK)
+        context->is_mono = 1;
+    else if (avctx->pix_fmt == AV_PIX_FMT_PAL8)
+        context->is_pal8 = 1;
+
+    if (avctx->codec_tag == MKTAG('B','1','W','0') ||
+        avctx->codec_tag == MKTAG('B','0','W','1'))
+        context->is_nut_mono = 1;
+    else if (avctx->codec_tag == MKTAG('P','A','L',8))
+        context->is_nut_pal8 = 1;
+
     if (avctx->codec_tag == AV_RL32("yuv2") &&
         avctx->pix_fmt   == AV_PIX_FMT_YUYV422)
         context->is_yuv2 = 1;
@@ -137,18 +137,98 @@ static void flip(AVCodecContext *avctx, AVFrame *frame)
     frame->linesize[0] *= -1;
 }
 
+/*
+ * Scale a 'bits'-wide sample (bits in 1..15) to 16 bits: bit replication for bits >= 8, plain left shift below (replication would need a negative shift count = UB)
+ */
+#define SCALE16(x, bits) ((bits) >= 8 ? (((x) << (16 - (bits))) | ((x) >> (2 * (bits) - 16))) : ((x) << (16 - (bits))))
+
+/** Generate name(): widen avctx->bits_per_coded_sample-bit samples from buf into 16-bit samples at dst.
+ *  !packed: every 16-bit word of buf is read with r16, rescaled with SCALE16 and stored with w16 at the same offset.
+ *  packed:  avctx->width * avctx->height samples are pulled from a bit reader over buf and stored with w16. */
+#define MKSCALE16(name, r16, w16) \
+static void name(AVCodecContext *avctx, uint8_t * dst, const uint8_t *buf, int buf_size, int packed) \
+{ \
+    int i; \
+    if (!packed) { \
+        for (i = 0; i + 1 < buf_size; i += 2) \
+            w16(dst + i, SCALE16(r16(buf + i), avctx->bits_per_coded_sample)); \
+    } else { \
+        GetBitContext gb; \
+        init_get_bits(&gb, buf, buf_size * 8); \
+        for (i = 0; i < avctx->width * avctx->height; i++) { \
+            int sample = get_bits(&gb, avctx->bits_per_coded_sample); \
+            w16(dst + i*2, SCALE16(sample, avctx->bits_per_coded_sample)); \
+        } \
+   } \
+}
+
+MKSCALE16(scale16be, AV_RB16, AV_WB16) /* called by raw_decode when the pix_fmt descriptor has AV_PIX_FMT_FLAG_BE */
+MKSCALE16(scale16le, AV_RL16, AV_WL16) /* called by raw_decode otherwise */
+
 static int raw_decode(AVCodecContext *avctx, void *data, int *got_frame,
                       AVPacket *avpkt)
 {
-    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt);
+    const AVPixFmtDescriptor *desc;
     RawVideoContext *context       = avctx->priv_data;
     const uint8_t *buf             = avpkt->data;
     int buf_size                   = avpkt->size;
-    int need_copy                  = !avpkt->buf || context->is_2_4_bpp || context->is_yuv2;
-    int res;
+    int linesize_align             = 4;
+    int stride;
+    int res, len;
+    int need_copy;
 
     AVFrame   *frame   = data;
 
+    if (avctx->width <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "width is not set\n");
+        return AVERROR_INVALIDDATA;
+    }
+    if (avctx->height <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "height is not set\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (context->is_nut_mono)
+        stride = avctx->width / 8 + (avctx->width & 7 ? 1 : 0);
+    else if (context->is_nut_pal8)
+        stride = avctx->width;
+    else
+        stride = avpkt->size / avctx->height;
+
+    av_log(avctx, AV_LOG_DEBUG, "PACKET SIZE: %d, STRIDE: %d\n", avpkt->size, stride);
+
+    if (stride == 0 || avpkt->size < stride * avctx->height) {
+        av_log(avctx, AV_LOG_ERROR, "Packet too small (%d)\n", avpkt->size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    desc = av_pix_fmt_desc_get(avctx->pix_fmt);
+
+    if ((avctx->bits_per_coded_sample == 8 || avctx->bits_per_coded_sample == 4
+            || avctx->bits_per_coded_sample <= 2) &&
+        (context->is_mono || context->is_pal8) &&
+        (!avctx->codec_tag || avctx->codec_tag == MKTAG('r','a','w',' ') ||
+                context->is_nut_mono || context->is_nut_pal8)) {
+        context->is_1_2_4_8_bpp = 1;
+        if (context->is_mono) {
+            int row_bytes = avctx->width / 8 + (avctx->width & 7 ? 1 : 0);
+            context->frame_size = av_image_get_buffer_size(avctx->pix_fmt,
+                                                           FFALIGN(row_bytes, 16) * 8,
+                                                           avctx->height, 1);
+        } else
+            context->frame_size = av_image_get_buffer_size(avctx->pix_fmt,
+                                                           FFALIGN(avctx->width, 16),
+                                                           avctx->height, 1);
+    } else {
+        context->is_lt_16bpp = av_get_bits_per_pixel(desc) == 16 && avctx->bits_per_coded_sample && avctx->bits_per_coded_sample < 16;
+        context->frame_size = av_image_get_buffer_size(avctx->pix_fmt, avctx->width,
+                                                       avctx->height, 1);
+    }
+    if (context->frame_size < 0)
+        return context->frame_size;
+
+    need_copy = !avpkt->buf || context->is_1_2_4_8_bpp || context->is_yuv2 || context->is_lt_16bpp;
+
     frame->pict_type        = AV_PICTURE_TYPE_I;
     frame->key_frame        = 1;
 
@@ -156,38 +236,111 @@ static int raw_decode(AVCodecContext *avctx, void *data, int *got_frame,
     if (res < 0)
         return res;
 
-    if (buf_size < context->frame_size - (avctx->pix_fmt == AV_PIX_FMT_PAL8 ?
-                                          AVPALETTE_SIZE : 0))
-        return -1;
+    av_frame_set_pkt_pos     (frame, avctx->internal->pkt->pos);
+    av_frame_set_pkt_duration(frame, avctx->internal->pkt->duration);
+
+    if (context->tff >= 0) {
+        frame->interlaced_frame = 1;
+        frame->top_field_first  = context->tff;
+    }
+
+    if ((res = av_image_check_size(avctx->width, avctx->height, 0, avctx)) < 0)
+        return res;
 
     if (need_copy)
-        frame->buf[0] = av_buffer_alloc(context->frame_size);
+        frame->buf[0] = av_buffer_alloc(FFMAX(context->frame_size, buf_size));
     else
         frame->buf[0] = av_buffer_ref(avpkt->buf);
     if (!frame->buf[0])
         return AVERROR(ENOMEM);
 
-    //2bpp and 4bpp raw in avi and mov (yes this is ugly ...)
-    if (context->is_2_4_bpp) {
-        int i;
+    // 1, 2, 4 and 8 bpp in avi/mov, 1 and 8 bpp in nut
+    if (context->is_1_2_4_8_bpp) {
+        int i, j, row_pix = 0;
         uint8_t *dst = frame->buf[0]->data;
-        buf_size = context->frame_size - AVPALETTE_SIZE;
-        if (avctx->bits_per_coded_sample == 4) {
-            for (i = 0; 2 * i + 1 < buf_size; i++) {
-                dst[2 * i + 0] = buf[i] >> 4;
-                dst[2 * i + 1] = buf[i] & 15;
+        buf_size = context->frame_size - (context->is_pal8 ? AVPALETTE_SIZE : 0);
+        if (avctx->bits_per_coded_sample == 8 || context->is_nut_pal8 || context->is_mono) {
+            int pix_per_byte = context->is_mono ? 8 : 1;
+            for (i = 0, j = 0; j < buf_size && i<avpkt->size; i++, j++) {
+                dst[j] = buf[i];
+                row_pix += pix_per_byte;
+                if (row_pix >= avctx->width) {
+                    i += stride - (i % stride) - 1;
+                    j += 16 - (j % 16) - 1;
+                    row_pix = 0;
+                }
+            }
+        } else if (avctx->bits_per_coded_sample == 4) {
+            for (i = 0, j = 0; 2 * j + 1 < buf_size && i<avpkt->size; i++, j++) {
+                dst[2 * j + 0] = buf[i] >> 4;
+                dst[2 * j + 1] = buf[i] & 15;
+                row_pix += 2;
+                if (row_pix >= avctx->width) {
+                    i += stride - (i % stride) - 1;
+                    j += 8 - (j % 8) - 1;
+                    row_pix = 0;
+                }
+            }
+        } else if (avctx->bits_per_coded_sample == 2) {
+            for (i = 0, j = 0; 4 * j + 3 < buf_size && i<avpkt->size; i++, j++) {
+                dst[4 * j + 0] = buf[i] >> 6;
+                dst[4 * j + 1] = buf[i] >> 4 & 3;
+                dst[4 * j + 2] = buf[i] >> 2 & 3;
+                dst[4 * j + 3] = buf[i]      & 3;
+                row_pix += 4;
+                if (row_pix >= avctx->width) {
+                    i += stride - (i % stride) - 1;
+                    j += 4 - (j % 4) - 1;
+                    row_pix = 0;
+                }
             }
         } else {
-            for (i = 0; 4 * i + 3 < buf_size; i++) {
-                dst[4 * i + 0] = buf[i] >> 6;
-                dst[4 * i + 1] = buf[i] >> 4 & 3;
-                dst[4 * i + 2] = buf[i] >> 2 & 3;
-                dst[4 * i + 3] = buf[i]      & 3;
+            av_assert0(avctx->bits_per_coded_sample == 1);
+            for (i = 0, j = 0; 8 * j + 7 < buf_size && i<avpkt->size; i++, j++) {
+                dst[8 * j + 0] = buf[i] >> 7;
+                dst[8 * j + 1] = buf[i] >> 6 & 1;
+                dst[8 * j + 2] = buf[i] >> 5 & 1;
+                dst[8 * j + 3] = buf[i] >> 4 & 1;
+                dst[8 * j + 4] = buf[i] >> 3 & 1;
+                dst[8 * j + 5] = buf[i] >> 2 & 1;
+                dst[8 * j + 6] = buf[i] >> 1 & 1;
+                dst[8 * j + 7] = buf[i]      & 1;
+                row_pix += 8;
+                if (row_pix >= avctx->width) {
+                    i += stride - (i % stride) - 1;
+                    j += 2 - (j % 2) - 1;
+                    row_pix = 0;
+                }
             }
         }
+        linesize_align = 16;
+        buf = dst;
+    } else if (context->is_lt_16bpp) {
+        uint8_t *dst = frame->buf[0]->data;
+        int packed = (avctx->codec_tag & 0xFFFFFF) == MKTAG('B','I','T', 0);
+        int swap   =  avctx->codec_tag >> 24;
+
+        if (packed && swap) {
+            av_fast_padded_malloc(&context->bitstream_buf, &context->bitstream_buf_size, buf_size);
+            if (!context->bitstream_buf)
+                return AVERROR(ENOMEM);
+            if (swap == 16)
+                context->bbdsp.bswap16_buf(context->bitstream_buf, (const uint16_t*)buf, buf_size / 2);
+            else if (swap == 32)
+                context->bbdsp.bswap_buf(context->bitstream_buf, (const uint32_t*)buf, buf_size / 4);
+            else
+                return AVERROR_INVALIDDATA;
+            buf = context->bitstream_buf;
+        }
+
+        if (desc->flags & AV_PIX_FMT_FLAG_BE)
+            scale16be(avctx, dst, buf, buf_size, packed);
+        else
+            scale16le(avctx, dst, buf, buf_size, packed);
+
         buf = dst;
     } else if (need_copy) {
-        memcpy(frame->buf[0]->data, buf, FFMIN(buf_size, context->frame_size));
+        memcpy(frame->buf[0]->data, buf, buf_size);
         buf = frame->buf[0]->data;
     }
 
@@ -195,32 +348,73 @@ static int raw_decode(AVCodecContext *avctx, void *data, int *got_frame,
         avctx->codec_tag == MKTAG('A', 'V', 'u', 'p'))
         buf += buf_size - context->frame_size;
 
+    len = context->frame_size - (avctx->pix_fmt==AV_PIX_FMT_PAL8 ? AVPALETTE_SIZE : 0);
+    if (buf_size < len && ((avctx->codec_tag & 0xFFFFFF) != MKTAG('B','I','T', 0) || !need_copy)) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid buffer size, packet size %d < expected frame_size %d\n", buf_size, len);
+        av_buffer_unref(&frame->buf[0]);
+        return AVERROR(EINVAL);
+    }
+
     if ((res = av_image_fill_arrays(frame->data, frame->linesize,
                                     buf, avctx->pix_fmt,
-                                    avctx->width, avctx->height, 1)) < 0)
+                                    avctx->width, avctx->height, 1)) < 0) {
+        av_buffer_unref(&frame->buf[0]);
         return res;
+    }
 
     if (avctx->pix_fmt == AV_PIX_FMT_PAL8) {
         const uint8_t *pal = av_packet_get_side_data(avpkt, AV_PKT_DATA_PALETTE,
                                                      NULL);
-
         if (pal) {
             av_buffer_unref(&context->palette);
             context->palette = av_buffer_alloc(AVPALETTE_SIZE);
-            if (!context->palette)
+            if (!context->palette) {
+                av_buffer_unref(&frame->buf[0]);
                 return AVERROR(ENOMEM);
+            }
             memcpy(context->palette->data, pal, AVPALETTE_SIZE);
             frame->palette_has_changed = 1;
+        } else if (context->is_nut_pal8) {
+            int vid_size = avctx->width * avctx->height;
+            int pal_size = avpkt->size - vid_size; /* bytes trailing the picture carry the palette */
+            if (pal_size > 0 && pal_size <= AVPALETTE_SIZE) { /* never write past the AVPALETTE_SIZE-byte palette buffer */
+                memcpy(context->palette->data, avpkt->data + vid_size, pal_size);
+                frame->palette_has_changed = 1;
+            }
         }
     }
 
+    if ((avctx->pix_fmt==AV_PIX_FMT_RGB24    ||
+        avctx->pix_fmt==AV_PIX_FMT_BGR24     ||
+        avctx->pix_fmt==AV_PIX_FMT_GRAY8     ||
+        avctx->pix_fmt==AV_PIX_FMT_RGB555LE  ||
+        avctx->pix_fmt==AV_PIX_FMT_RGB555BE  ||
+        avctx->pix_fmt==AV_PIX_FMT_RGB565LE  ||
+        avctx->pix_fmt==AV_PIX_FMT_MONOWHITE ||
+        avctx->pix_fmt==AV_PIX_FMT_MONOBLACK ||
+        avctx->pix_fmt==AV_PIX_FMT_PAL8) &&
+        FFALIGN(frame->linesize[0], linesize_align) * avctx->height <= buf_size)
+        frame->linesize[0] = FFALIGN(frame->linesize[0], linesize_align);
+
+    if (avctx->pix_fmt == AV_PIX_FMT_NV12 && avctx->codec_tag == MKTAG('N', 'V', '1', '2') &&
+        FFALIGN(frame->linesize[0], linesize_align) * avctx->height +
+        FFALIGN(frame->linesize[1], linesize_align) * ((avctx->height + 1) / 2) <= buf_size) {
+        int la0 = FFALIGN(frame->linesize[0], linesize_align);
+        frame->data[1] += (la0 - frame->linesize[0]) * avctx->height;
+        frame->linesize[0] = la0;
+        frame->linesize[1] = FFALIGN(frame->linesize[1], linesize_align);
+    }
+
     if ((avctx->pix_fmt == AV_PIX_FMT_PAL8 && buf_size < context->frame_size) ||
         (desc->flags & AV_PIX_FMT_FLAG_PSEUDOPAL)) {
         frame->buf[1]  = av_buffer_ref(context->palette);
-        if (!frame->buf[1])
+        if (!frame->buf[1]) {
+            av_buffer_unref(&frame->buf[0]);
             return AVERROR(ENOMEM);
+        }
         frame->data[1] = frame->buf[1]->data;
     }
+
     if (avctx->pix_fmt == AV_PIX_FMT_BGR24 &&
         ((frame->linesize[0] + 3) & ~3) * avctx->height <= buf_size)
         frame->linesize[0] = (frame->linesize[0] + 3) & ~3;
@@ -234,6 +428,11 @@ static int raw_decode(AVCodecContext *avctx, void *data, int *got_frame,
         avctx->codec_tag == MKTAG('Y', 'V', 'U', '9'))
         FFSWAP(uint8_t *, frame->data[1], frame->data[2]);
 
+    if (avctx->codec_tag == AV_RL32("I420") && (avctx->width+1)*(avctx->height+1) * 3/2 == buf_size) {
+        frame->data[1] = frame->data[1] +  (avctx->width+1)*(avctx->height+1) -avctx->width*avctx->height;
+        frame->data[2] = frame->data[2] + ((avctx->width+1)*(avctx->height+1) -avctx->width*avctx->height)*5/4;
+    }
+
     if (avctx->codec_tag == AV_RL32("yuv2") &&
         avctx->pix_fmt   == AV_PIX_FMT_YUYV422) {
         int x, y;
@@ -245,6 +444,12 @@ static int raw_decode(AVCodecContext *avctx, void *data, int *got_frame,
         }
     }
 
+    if (avctx->field_order > AV_FIELD_PROGRESSIVE) { /* we have interlaced material flagged in container */
+        frame->interlaced_frame = 1;
+        if (avctx->field_order == AV_FIELD_TT || avctx->field_order == AV_FIELD_TB)
+            frame->top_field_first = 1;
+    }
+
     *got_frame = 1;
     return buf_size;
 }
@@ -266,4 +471,6 @@ AVCodec ff_rawvideo_decoder = {
     .init           = raw_init_decoder,
     .close          = raw_close_decoder,
     .decode         = raw_decode,
+    .priv_class     = &rawdec_class,
+    .capabilities   = AV_CODEC_CAP_PARAM_CHANGE,
 };
diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c
index 60bd0c7..d837056 100644
--- a/libavcodec/rawenc.c
+++ b/libavcodec/rawenc.c
@@ -2,20 +2,20 @@
  * Raw Video Encoder
  * Copyright (c) 2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,7 +39,6 @@ static av_cold int raw_encode_init(AVCodecContext *avctx)
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
     avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-    avctx->coded_frame->key_frame = 1;
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
     avctx->bits_per_coded_sample = av_get_bits_per_pixel(desc);
@@ -51,24 +50,24 @@ FF_ENABLE_DEPRECATION_WARNINGS
 static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
                       const AVFrame *frame, int *got_packet)
 {
-    int ret = av_image_get_buffer_size(avctx->pix_fmt,
-                                       avctx->width, avctx->height, 1);
+    int ret = av_image_get_buffer_size(frame->format,
+                                       frame->width, frame->height, 1);
 
     if (ret < 0)
         return ret;
 
-    if ((ret = ff_alloc_packet(pkt, ret)) < 0)
+    if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0)
         return ret;
     if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size,
-                                       frame->data, frame->linesize,
+                                       (const uint8_t **)frame->data, frame->linesize,
                                        frame->format,
                                        frame->width, frame->height, 1)) < 0)
         return ret;
 
     if(avctx->codec_tag == AV_RL32("yuv2") && ret > 0 &&
-       avctx->pix_fmt   == AV_PIX_FMT_YUYV422) {
+       frame->format   == AV_PIX_FMT_YUYV422) {
         int x;
-        for(x = 1; x < avctx->height*avctx->width*2; x += 2)
+        for(x = 1; x < frame->height*frame->width*2; x += 2)
             pkt->data[x] ^= 0x80;
     }
     pkt->flags |= AV_PKT_FLAG_KEY;
diff --git a/libavcodec/rdft.c b/libavcodec/rdft.c
index 1965253..c318aa8 100644
--- a/libavcodec/rdft.c
+++ b/libavcodec/rdft.c
@@ -2,20 +2,20 @@
  * (I)RDFT transforms
  * Copyright (c) 2009 Alex Converse <alex dot converse at gmail dot com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include <stdlib.h>
@@ -99,16 +99,17 @@ static void rdft_calc_c(RDFTContext *s, FFTSample *data)
 av_cold int ff_rdft_init(RDFTContext *s, int nbits, enum RDFTransformType trans)
 {
     int n = 1 << nbits;
+    int ret;
 
     s->nbits           = nbits;
     s->inverse         = trans == IDFT_C2R || trans == DFT_C2R;
     s->sign_convention = trans == IDFT_R2C || trans == DFT_C2R ? 1 : -1;
 
     if (nbits < 4 || nbits > 16)
-        return -1;
+        return AVERROR(EINVAL);
 
-    if (ff_fft_init(&s->fft, nbits-1, trans == IDFT_C2R || trans == IDFT_R2C) < 0)
-        return -1;
+    if ((ret = ff_fft_init(&s->fft, nbits-1, trans == IDFT_C2R || trans == IDFT_R2C)) < 0)
+        return ret;
 
     ff_init_ff_cos_tabs(nbits);
     s->tcos = ff_cos_tabs[nbits];
diff --git a/libavcodec/rdft.h b/libavcodec/rdft.h
index 8ff620f..37c40e7 100644
--- a/libavcodec/rdft.h
+++ b/libavcodec/rdft.h
@@ -2,24 +2,24 @@
  * (I)RDFT transforms
  * Copyright (c) 2009 Alex Converse <alex dot converse at gmail dot com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVCODEC_RDFT_H
+#if !defined(AVCODEC_RDFT_H) && (!defined(FFT_FLOAT) || FFT_FLOAT)
 #define AVCODEC_RDFT_H
 
 #include "config.h"
diff --git a/libavcodec/realtextdec.c b/libavcodec/realtextdec.c
new file mode 100644
index 0000000..5084781
--- /dev/null
+++ b/libavcodec/realtextdec.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * RealText subtitle decoder
+ * @see http://service.real.com/help/library/guides/ProductionGuide/prodguide/htmfiles/realtext.htm
+ */
+
+#include "avcodec.h"
+#include "ass.h"
+#include "libavutil/avstring.h"
+#include "libavutil/bprint.h"
+
+static int rt_event_to_ass(AVBPrint *buf, const char *p)
+{
+    int prev_chr_is_space = 1; /* starts at 1 so leading whitespace is dropped */
+    /* Copy the event text at p into buf: markup tags are skipped, whitespace runs collapse to one space. */
+    while (*p) {
+        if (*p != '<') {
+            if (!av_isspace(*p))
+                av_bprint_chars(buf, *p, 1);
+            else if (!prev_chr_is_space)
+                av_bprint_chars(buf, ' ', 1);
+            prev_chr_is_space = av_isspace(*p);
+        } else {
+            const char *end = strchr(p, '>');
+            if (!end)
+                break; /* unterminated tag: the rest of the event is discarded */
+            if (!av_strncasecmp(p, "<br/>", 5) ||
+                !av_strncasecmp(p, "<br>",  4)) {
+                av_bprintf(buf, "\\N"); /* line-break tag: emit a backslash followed by 'N' */
+            }
+            p = end; /* land on '>'; the p++ below steps past it */
+        }
+        p++;
+    }
+    return 0; /* there is no failure path: the result is always 0 */
+}
+
+static int realtext_decode_frame(AVCodecContext *avctx,
+                                 void *data, int *got_sub_ptr, AVPacket *avpkt)
+{
+    int ret = 0;
+    AVSubtitle *sub = data;
+    const char *ptr = avpkt->data; /* NOTE(review): scanned up to a NUL byte by rt_event_to_ass(); presumably relies on packet padding - confirm */
+    FFASSDecoderContext *s = avctx->priv_data;
+    AVBPrint buf;
+    /* Convert one packet into at most one ASS rect on sub; returns avpkt->size, or the ff_ass_add_rect() error. */
+    av_bprint_init(&buf, 0, 4096);
+    if (ptr && avpkt->size > 0 && !rt_event_to_ass(&buf, ptr))
+        ret = ff_ass_add_rect(sub, buf.str, s->readorder++, 0, NULL, NULL); /* every rect takes the next readorder value */
+    av_bprint_finalize(&buf, NULL); /* buf is released on both the success and the error path */
+    if (ret < 0)
+        return ret;
+    *got_sub_ptr = sub->num_rects > 0; /* empty or NULL packets leave ret at 0 and add no rect */
+    return avpkt->size;
+}
+
+AVCodec ff_realtext_decoder = { /* codec table entry for the RealText subtitle decoder */
+    .name           = "realtext",
+    .long_name      = NULL_IF_CONFIG_SMALL("RealText subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_REALTEXT,
+    .decode         = realtext_decode_frame,
+    .init           = ff_ass_subtitle_header_default,
+    .flush          = ff_ass_decoder_flush,
+    .priv_data_size = sizeof(FFASSDecoderContext), /* priv_data is read as FFASSDecoderContext in realtext_decode_frame() */
+};
diff --git a/libavcodec/rectangle.h b/libavcodec/rectangle.h
index 616a637..df7c18a 100644
--- a/libavcodec/rectangle.h
+++ b/libavcodec/rectangle.h
@@ -2,20 +2,20 @@
  * rectangle filling function
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,9 +28,9 @@
 #ifndef AVCODEC_RECTANGLE_H
 #define AVCODEC_RECTANGLE_H
 
-#include <assert.h>
 #include "config.h"
 #include "libavutil/common.h"
+#include "libavutil/avassert.h"
 
 /**
  * fill a rectangle.
@@ -40,13 +40,14 @@
  */
 static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
     uint8_t *p= (uint8_t*)vp;
-    assert(size==1 || size==2 || size==4);
-    assert(w<=4);
+    av_assert2(size==1 || size==2 || size==4);
+    av_assert2(w<=4);
 
     w      *= size;
     stride *= size;
 
-    assert((stride&(w-1))==0);
+    av_assert2((((long)vp)&(FFMIN(w, 8<<(HAVE_NEON|ARCH_PPC|HAVE_MMX))-1)) == 0);
+    av_assert2((stride&(w-1))==0);
     if(w==2){
         const uint16_t v= size==4 ? val : val*0x0101;
         *(uint16_t*)(p + 0*stride)= v;
@@ -116,8 +117,8 @@ static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride,
         *(uint32_t*)(p +12+3*stride)= val;
 #endif
     }else
-        assert(0);
-    assert(h==4);
+        av_assert2(0);
+    av_assert2(h==4);
 }
 
 #endif /* AVCODEC_RECTANGLE_H */
diff --git a/libavcodec/remove_extradata_bsf.c b/libavcodec/remove_extradata_bsf.c
index a89fa06..dd4cf17 100644
--- a/libavcodec/remove_extradata_bsf.c
+++ b/libavcodec/remove_extradata_bsf.c
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
 enum RemoveFreq {
     REMOVE_FREQ_KEYFRAME,
     REMOVE_FREQ_ALL,
+    REMOVE_FREQ_NONKEYFRAME, /* apply the parser split only to packets without AV_PKT_FLAG_KEY */
 };
 
 typedef struct RemoveExtradataContext {
@@ -50,6 +51,7 @@ static int remove_extradata(AVBSFContext *ctx, AVPacket *out)
 
     if (s->parser && s->parser->parser->split) {
         if (s->freq == REMOVE_FREQ_ALL ||
+            (s->freq == REMOVE_FREQ_NONKEYFRAME && !(in->flags & AV_PKT_FLAG_KEY)) ||
             (s->freq == REMOVE_FREQ_KEYFRAME && in->flags & AV_PKT_FLAG_KEY)) {
             int i = s->parser->parser->split(s->avctx, in->data, in->size);
             in->data += i;
@@ -94,7 +96,9 @@ static void remove_extradata_close(AVBSFContext *ctx)
 #define OFFSET(x) offsetof(RemoveExtradataContext, x)
 static const AVOption options[] = {
-    { "freq", NULL, OFFSET(freq), AV_OPT_TYPE_INT, { .i64 = REMOVE_FREQ_KEYFRAME }, REMOVE_FREQ_KEYFRAME, REMOVE_FREQ_ALL, 0, "freq" },
+    { "freq", NULL, OFFSET(freq), AV_OPT_TYPE_INT, { .i64 = REMOVE_FREQ_KEYFRAME }, REMOVE_FREQ_KEYFRAME, REMOVE_FREQ_NONKEYFRAME, 0, "freq" }, /* max must be the last RemoveFreq value, otherwise "k" is rejected as out of range */
+        { "k",        NULL, 0, AV_OPT_TYPE_CONST, { .i64 = REMOVE_FREQ_NONKEYFRAME }, .unit = "freq" },
         { "keyframe", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = REMOVE_FREQ_KEYFRAME }, .unit = "freq" },
+        { "e",        NULL, 0, AV_OPT_TYPE_CONST, { .i64 = REMOVE_FREQ_ALL      }, .unit = "freq" },
         { "all",      NULL, 0, AV_OPT_TYPE_CONST, { .i64 = REMOVE_FREQ_ALL      }, .unit = "freq" },
     { NULL },
 };
diff --git a/libavcodec/resample.c b/libavcodec/resample.c
new file mode 100644
index 0000000..4c5eb9f
--- /dev/null
+++ b/libavcodec/resample.c
@@ -0,0 +1,439 @@
+/*
+ * samplerate conversion for both audio and video
+ * Copyright (c) 2000 Fabrice Bellard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * samplerate conversion for both audio and video
+ */
+
+#include <string.h>
+
+#include "avcodec.h"
+#include "audioconvert.h"
+#include "libavutil/opt.h"
+#include "libavutil/mem.h"
+#include "libavutil/samplefmt.h"
+
+#if FF_API_AVCODEC_RESAMPLE
+FF_DISABLE_DEPRECATION_WARNINGS
+
+#define MAX_CHANNELS 8
+
+struct AVResampleContext;
+
+static const char *context_to_name(void *ptr)
+{
+    return "audioresample"; /* fixed name used as the AVClass item-name callback below; ptr is ignored */
+}
+
+static const AVOption options[] = {{NULL}}; /* empty option table: terminator entry only */
+static const AVClass audioresample_context_class = { /* installed on the AVResampleContext by av_audio_resample_init() */
+    "ReSampleContext", context_to_name, options, LIBAVUTIL_VERSION_INT
+};
+
+struct ReSampleContext {
+    struct AVResampleContext *resample_context;
+    short *temp[MAX_CHANNELS];         /* per channel: input samples not consumed by the previous audio_resample() call */
+    int temp_len;                      /* number of samples held in each temp[] buffer */
+    float ratio;                       /* output_rate / input_rate */
+    /* channel convert */
+    int input_channels, output_channels, filter_channels; /* filter_channels = the smaller of the two counts */
+    AVAudioConvert *convert_ctx[2];    /* [0] input format -> S16, [1] S16 -> output format; left NULL when that side is S16 */
+    enum AVSampleFormat sample_fmt[2]; ///< input and output sample format
+    unsigned sample_size[2];           ///< size of one sample in sample_fmt
+    short *buffer[2];                  ///< buffers used for conversion to S16
+    unsigned buffer_size[2];           ///< sizes of allocated buffers
+};
+
+/* Average n1 interleaved stereo pairs from input into n1 mono samples in output. */
+static void stereo_to_mono(short *output, short *input, int n1)
+{
+    short *p, *q;
+    int n = n1;
+
+    p = input;
+    q = output;
+    while (n >= 4) { /* unrolled: four output samples (eight input shorts) per pass */
+        q[0] = (p[0] + p[1]) >> 1;
+        q[1] = (p[2] + p[3]) >> 1;
+        q[2] = (p[4] + p[5]) >> 1;
+        q[3] = (p[6] + p[7]) >> 1;
+        q += 4;
+        p += 8;
+        n -= 4;
+    }
+    while (n > 0) { /* remaining 0-3 samples */
+        q[0] = (p[0] + p[1]) >> 1;
+        q++;
+        p += 2;
+        n--;
+    }
+}
+
+/* Duplicate each of the n1 mono samples in input into an interleaved stereo pair in output. */
+static void mono_to_stereo(short *output, short *input, int n1)
+{
+    short *p, *q;
+    int n = n1;
+    int v;
+
+    p = input;
+    q = output;
+    while (n >= 4) { /* unrolled: four input samples (eight output shorts) per pass */
+        v = p[0]; q[0] = v; q[1] = v;
+        v = p[1]; q[2] = v; q[3] = v;
+        v = p[2]; q[4] = v; q[5] = v;
+        v = p[3]; q[6] = v; q[7] = v;
+        q += 8;
+        p += 4;
+        n -= 4;
+    }
+    while (n > 0) { /* remaining 0-3 samples */
+        v = p[0]; q[0] = v; q[1] = v;
+        q += 2;
+        p += 1;
+        n--;
+    }
+}
+
+/*
+5.1 to stereo input: [fl, fr, c, lfe, rl, rr]
+- Left = front_left + rear_gain * rear_left + center_gain * center
+- Right = front_right + rear_gain * rear_right + center_gain * center
+Where rear_gain is usually around 0.5-1.0 (this code uses 0.5) and
+      center_gain is almost always 0.7 (-3 dB)
+*/
+static void surround_to_stereo(short **output, short *input, int channels, int samples)
+{
+    int i;
+    short l, r;
+    /* Writes through output[0] (left) and output[1] (right) and advances both pointers in the caller's array. */
+    for (i = 0; i < samples; i++) {
+        int fl,fr,c,rl,rr;
+        fl = input[0];
+        fr = input[1];
+        c = input[2];
+        // lfe = input[3];
+        rl = input[4];
+        rr = input[5];
+
+        l = av_clip_int16(fl + (0.5 * rl) + (0.7 * c)); /* clipped to the int16 range after mixing */
+        r = av_clip_int16(fr + (0.5 * rr) + (0.7 * c));
+
+        /* output l & r. */
+        *output[0]++ = l;
+        *output[1]++ = r;
+
+        /* increment input. */
+        input += channels; /* step one interleaved frame; only indices 0-5 of it are read */
+    }
+}
+
+static void deinterleave(short **output, short *input, int channels, int samples)
+{
+    int i, j;
+    /* Split `samples` interleaved frames of `channels` shorts into one plane per channel. */
+    for (i = 0; i < samples; i++) {
+        for (j = 0; j < channels; j++) {
+            *output[j]++ = *input++; /* advances the pointer stored in output[j] itself */
+        }
+    }
+}
+
+static void interleave(short *output, short **input, int channels, int samples)
+{
+    int i, j;
+    /* Merge one plane per channel into `samples` interleaved frames of `channels` shorts. */
+    for (i = 0; i < samples; i++) {
+        for (j = 0; j < channels; j++) {
+            *output++ = *input[j]++; /* advances the pointer stored in input[j] itself */
+        }
+    }
+}
+
+static void ac3_5p1_mux(short *output, short *input1, short *input2, int n)
+{
+    int i;
+    short l, r;
+    /* Expand n stereo samples (input1 = left, input2 = right) into n interleaved six-channel frames. */
+    for (i = 0; i < n; i++) {
+        l = *input1++;
+        r = *input2++;
+        *output++ = l;                  /* left */
+        *output++ = (l / 2) + (r / 2);  /* center */
+        *output++ = r;                  /* right */
+        *output++ = 0;                  /* left surround */
+        *output++ = 0;                  /* right surround */
+        *output++ = 0;                  /* low freq */
+    }
+}
+
+#define SUPPORT_RESAMPLE(ch1, ch2, ch3, ch4, ch5, ch6, ch7, ch8) \
+    ch8<<7 | ch7<<6 | ch6<<5 | ch5<<4 | ch4<<3 | ch3<<2 | ch2<<1 | ch1<<0
+/* Row index = input channels - 1; bit (k - 1) set means k output channels are accepted by av_audio_resample_init(). */
+static const uint8_t supported_resampling[MAX_CHANNELS] = {
+    // output ch:    1  2  3  4  5  6  7  8
+    SUPPORT_RESAMPLE(1, 1, 0, 0, 0, 0, 0, 0), // 1 input channel
+    SUPPORT_RESAMPLE(1, 1, 0, 0, 0, 1, 0, 0), // 2 input channels
+    SUPPORT_RESAMPLE(0, 0, 1, 0, 0, 0, 0, 0), // 3 input channels
+    SUPPORT_RESAMPLE(0, 0, 0, 1, 0, 0, 0, 0), // 4 input channels
+    SUPPORT_RESAMPLE(0, 0, 0, 0, 1, 0, 0, 0), // 5 input channels
+    SUPPORT_RESAMPLE(0, 1, 0, 0, 0, 1, 0, 0), // 6 input channels
+    SUPPORT_RESAMPLE(0, 0, 0, 0, 0, 0, 1, 0), // 7 input channels
+    SUPPORT_RESAMPLE(0, 0, 0, 0, 0, 0, 0, 1), // 8 input channels
+};
+
+ReSampleContext *av_audio_resample_init(int output_channels, int input_channels,
+                                        int output_rate, int input_rate,
+                                        enum AVSampleFormat sample_fmt_out,
+                                        enum AVSampleFormat sample_fmt_in,
+                                        int filter_length, int log2_phase_count,
+                                        int linear, double cutoff)
+{
+    ReSampleContext *s;
+    /* Returns a new context, or NULL (with nothing left allocated) on bad arguments or allocation failure. */
+    if (input_channels < 1 || input_channels > MAX_CHANNELS) {
+        av_log(NULL, AV_LOG_ERROR, "Resampling with input channels outside the "
+               "range 1-%d is unsupported.\n", MAX_CHANNELS);
+        return NULL;
+    }
+    if (output_channels < 1 || output_channels > MAX_CHANNELS || /* keeps the shift count below in 0..7 */
+        !(supported_resampling[input_channels-1] & (1<<(output_channels-1)))) {
+        int i;
+        av_log(NULL, AV_LOG_ERROR, "Unsupported audio resampling. Allowed "
+               "output channels for %d input channel%s", input_channels,
+               input_channels > 1 ? "s:" : ":");
+        for (i = 0; i < MAX_CHANNELS; i++)
+            if (supported_resampling[input_channels-1] & (1<<i))
+                av_log(NULL, AV_LOG_ERROR, " %d", i + 1);
+        av_log(NULL, AV_LOG_ERROR, "\n");
+        return NULL;
+    }
+
+    s = av_mallocz(sizeof(ReSampleContext));
+    if (!s) {
+        av_log(NULL, AV_LOG_ERROR, "Can't allocate memory for resample context.\n");
+        return NULL;
+    }
+
+    s->ratio = (float)output_rate / (float)input_rate;
+    s->input_channels = input_channels;
+    s->output_channels = output_channels;
+    s->filter_channels = FFMIN(input_channels, output_channels); /* only this many channels pass through the filter */
+
+    s->sample_fmt[0]  = sample_fmt_in;
+    s->sample_fmt[1]  = sample_fmt_out;
+    s->sample_size[0] = av_get_bytes_per_sample(s->sample_fmt[0]);
+    s->sample_size[1] = av_get_bytes_per_sample(s->sample_fmt[1]);
+
+    if (s->sample_fmt[0] != AV_SAMPLE_FMT_S16) {
+        if (!(s->convert_ctx[0] = av_audio_convert_alloc(AV_SAMPLE_FMT_S16, 1,
+                                                         s->sample_fmt[0], 1, NULL, 0))) {
+            av_log(s, AV_LOG_ERROR,
+                   "Cannot convert %s sample format to s16 sample format\n",
+                   av_get_sample_fmt_name(s->sample_fmt[0]));
+            goto fail;
+        }
+    }
+
+    if (s->sample_fmt[1] != AV_SAMPLE_FMT_S16) {
+        if (!(s->convert_ctx[1] = av_audio_convert_alloc(s->sample_fmt[1], 1,
+                                                         AV_SAMPLE_FMT_S16, 1, NULL, 0))) {
+            av_log(s, AV_LOG_ERROR,
+                   "Cannot convert s16 sample format to %s sample format\n",
+                   av_get_sample_fmt_name(s->sample_fmt[1]));
+            goto fail;
+        }
+    }
+
+    s->resample_context = av_resample_init(output_rate, input_rate,
+                                           filter_length, log2_phase_count,
+                                           linear, cutoff);
+    if (!s->resample_context) /* the store below writes through this pointer */
+        goto fail;
+    *(const AVClass**)s->resample_context = &audioresample_context_class;
+
+    return s;
+
+fail:
+    av_audio_convert_free(s->convert_ctx[0]); /* either converter may still be NULL here (s was zero-allocated) */
+    av_audio_convert_free(s->convert_ctx[1]);
+    av_free(s);
+    return NULL;
+}
+
+/* Resample 'nb_samples' input samples per channel from 'input' into 'output'. Returns the
+ * number of output samples produced per channel, or 0 on failure. XXX: optimize it ! */
+int audio_resample(ReSampleContext *s, short *output, short *input, int nb_samples)
+{
+    int i, nb_samples1;
+    short *bufin[MAX_CHANNELS] = { NULL }, *bufout[MAX_CHANNELS] = { NULL }; /* so 'fail' can free every slot */
+    short *buftmp2[MAX_CHANNELS], *buftmp3[MAX_CHANNELS];
+    short *output_bak = NULL;
+    int lenout;
+
+    if (s->sample_fmt[0] != AV_SAMPLE_FMT_S16) {
+        int istride[1] = { s->sample_size[0] };
+        int ostride[1] = { 2 };
+        const void *ibuf[1] = { input };
+        void       *obuf[1];
+        unsigned input_size = nb_samples * s->input_channels * 2;
+
+        if (!s->buffer_size[0] || s->buffer_size[0] < input_size) {
+            av_free(s->buffer[0]);
+            s->buffer[0]      = av_malloc(input_size);
+            s->buffer_size[0] = s->buffer[0] ? input_size : 0; /* never record a size for a missing buffer */
+            if (!s->buffer[0]) {
+                av_log(s->resample_context, AV_LOG_ERROR, "Could not allocate buffer\n");
+                return 0;
+            }
+        }
+
+        obuf[0] = s->buffer[0];
+
+        if (av_audio_convert(s->convert_ctx[0], obuf, ostride,
+                             ibuf, istride, nb_samples * s->input_channels) < 0) {
+            av_log(s->resample_context, AV_LOG_ERROR,
+                   "Audio sample format conversion failed\n");
+            return 0;
+        }
+
+        input = s->buffer[0];
+    }
+
+    lenout= 2*s->output_channels*nb_samples * s->ratio + 16;
+
+    if (s->sample_fmt[1] != AV_SAMPLE_FMT_S16) {
+        int out_size = lenout * av_get_bytes_per_sample(s->sample_fmt[1]) *
+                       s->output_channels;
+        output_bak = output;
+
+        if (!s->buffer_size[1] || s->buffer_size[1] < out_size) {
+            av_free(s->buffer[1]);
+            s->buffer[1]      = av_malloc(out_size);
+            s->buffer_size[1] = s->buffer[1] ? out_size : 0; /* same rule as buffer[0] above */
+            if (!s->buffer[1]) {
+                av_log(s->resample_context, AV_LOG_ERROR, "Could not allocate buffer\n");
+                return 0;
+            }
+        }
+
+        output = s->buffer[1];
+    }
+
+    /* XXX: move those malloc to resample init code */
+    for (i = 0; i < s->filter_channels; i++) {
+        bufin[i] = av_malloc_array((nb_samples + s->temp_len), sizeof(short));
+        bufout[i] = av_malloc_array(lenout, sizeof(short));
+
+        if (!bufin[i] || !bufout[i]) {
+            av_log(s->resample_context, AV_LOG_ERROR, "Could not allocate buffer\n");
+            nb_samples1 = 0;
+            goto fail;
+        }
+
+        memcpy(bufin[i], s->temp[i], s->temp_len * sizeof(short));
+        buftmp2[i] = bufin[i] + s->temp_len;
+    }
+
+    if (s->input_channels == 2 && s->output_channels == 1) {
+        buftmp3[0] = output;
+        stereo_to_mono(buftmp2[0], input, nb_samples);
+    } else if (s->output_channels >= 2 && s->input_channels == 1) {
+        buftmp3[0] = bufout[0];
+        memcpy(buftmp2[0], input, nb_samples * sizeof(short));
+    } else if (s->input_channels == 6 && s->output_channels ==2) {
+        buftmp3[0] = bufout[0];
+        buftmp3[1] = bufout[1];
+        surround_to_stereo(buftmp2, input, s->input_channels, nb_samples);
+    } else if (s->output_channels >= s->input_channels && s->input_channels >= 2) {
+        for (i = 0; i < s->input_channels; i++) {
+            buftmp3[i] = bufout[i];
+        }
+        deinterleave(buftmp2, input, s->input_channels, nb_samples);
+    } else {
+        buftmp3[0] = output;
+        memcpy(buftmp2[0], input, nb_samples * sizeof(short));
+    }
+
+    nb_samples += s->temp_len;
+    /* resample each channel */
+    nb_samples1 = 0; /* avoid warning */
+    for (i = 0; i < s->filter_channels; i++) {
+        int consumed, is_last = i + 1 == s->filter_channels;
+        nb_samples1 = av_resample(s->resample_context, buftmp3[i], bufin[i],
+                                  &consumed, nb_samples, lenout, is_last);
+        s->temp_len = nb_samples - consumed;
+        s->temp[i] = av_realloc_f(s->temp[i], s->temp_len, sizeof(short));
+        if (!s->temp[i]) { /* av_realloc_f() has already released the old block */
+            s->temp_len = nb_samples1 = 0;
+            goto fail;
+        }
+        memcpy(s->temp[i], bufin[i] + consumed, s->temp_len * sizeof(short));
+    }
+
+    if (s->output_channels == 2 && s->input_channels == 1) {
+        mono_to_stereo(output, buftmp3[0], nb_samples1);
+    } else if (s->output_channels == 6 && s->input_channels == 2) {
+        ac3_5p1_mux(output, buftmp3[0], buftmp3[1], nb_samples1);
+    } else if ((s->output_channels == s->input_channels && s->input_channels >= 2) ||
+               (s->output_channels == 2 && s->input_channels == 6)) {
+        interleave(output, buftmp3, s->output_channels, nb_samples1);
+    }
+
+    if (s->sample_fmt[1] != AV_SAMPLE_FMT_S16) {
+        int istride[1] = { 2 };
+        int ostride[1] = { s->sample_size[1] };
+        const void *ibuf[1] = { output };
+        void       *obuf[1] = { output_bak };
+
+        if (av_audio_convert(s->convert_ctx[1], obuf, ostride,
+                             ibuf, istride, nb_samples1 * s->output_channels) < 0) {
+            av_log(s->resample_context, AV_LOG_ERROR,
+                   "Audio sample format conversion failed\n");
+            nb_samples1 = 0; /* fall through to 'fail' so bufin/bufout are released */
+        }
+    }
+
+fail:
+    for (i = 0; i < s->filter_channels; i++) {
+        av_free(bufin[i]);
+        av_free(bufout[i]);
+    }
+
+    return nb_samples1;
+}
+
+void audio_resample_close(ReSampleContext *s)
+{
+    int i;
+    av_resample_close(s->resample_context);
+    for (i = 0; i < s->filter_channels; i++) /* audio_resample() only fills temp[] for the first filter_channels entries */
+        av_freep(&s->temp[i]);
+    av_freep(&s->buffer[0]);
+    av_freep(&s->buffer[1]);
+    av_audio_convert_free(s->convert_ctx[0]);
+    av_audio_convert_free(s->convert_ctx[1]);
+    av_free(s); /* the context itself is released last; s must not be used afterwards */
+}
+
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
diff --git a/libavcodec/resample2.c b/libavcodec/resample2.c
new file mode 100644
index 0000000..56ae9f7
--- /dev/null
+++ b/libavcodec/resample2.c
@@ -0,0 +1,319 @@
+/*
+ * audio resampling
+ * Copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * audio resampling
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#include "libavutil/avassert.h"
+#include "avcodec.h"
+#include "libavutil/common.h"
+
+#if FF_API_AVCODEC_RESAMPLE
+
+#ifndef CONFIG_RESAMPLE_HP
+#define FILTER_SHIFT 15
+
+typedef int16_t FELEM;
+typedef int32_t FELEM2;
+typedef int64_t FELEML;
+#define FELEM_MAX INT16_MAX
+#define FELEM_MIN INT16_MIN
+#define WINDOW_TYPE 9
+#elif !defined(CONFIG_RESAMPLE_AUDIOPHILE_KIDDY_MODE)
+#define FILTER_SHIFT 30
+
+#define FELEM int32_t
+#define FELEM2 int64_t
+#define FELEML int64_t
+#define FELEM_MAX INT32_MAX
+#define FELEM_MIN INT32_MIN
+#define WINDOW_TYPE 12
+#else
+#define FILTER_SHIFT 0
+
+typedef double FELEM;
+typedef double FELEM2;
+typedef double FELEML;
+#define WINDOW_TYPE 24
+#endif
+
+
+typedef struct AVResampleContext{
+    const AVClass *av_class;
+    FELEM *filter_bank;          /* phase_count+1 filters of filter_length coefficients each */
+    int filter_length;           /* number of taps per filter */
+    int ideal_dst_incr;          /* dst_incr as computed at init, i.e. without compensation */
+    int dst_incr;                /* position advance per output sample, in units of 1/src_incr */
+    int index;                   /* read position: source sample index above, filter phase in the low phase_shift bits */
+    int frac;                    /* remainder of the read position, in units of 1/src_incr */
+    int src_incr;                /* denominator of the dst_incr/src_incr step */
+    int compensation_distance;   /* output samples left before dst_incr reverts to ideal_dst_incr, 0 if inactive */
+    int phase_shift;             /* log2 of the number of filter phases */
+    int phase_mask;              /* number of filter phases minus one */
+    int linear;                  /* nonzero: interpolate linearly between a phase filter and the next one */
+}AVResampleContext;
+
+/**
+ * 0th order modified bessel function of the first kind, summed as a power series in x*x/4.
+ */
+static double bessel(double x){
+    double v=1;      /* running sum, starts with the k=0 term */
+    double lastv=0;  /* sum before the latest term was added */
+    double t=1;      /* current term */
+    int i;
+
+    x= x*x/4;
+    for(i=1; v != lastv; i++){ /* stop once a further term no longer changes the sum */
+        lastv=v;
+        t *= x/(i*i); /* term i = term i-1 * (x*x/4) / (i*i) */
+        v += t;
+    }
+    return v;
+}
+
+/**
+ * Build a polyphase filterbank: phase_count filters of tap_count coefficients each, written to filter.
+ * @param factor resampling factor, values above 1.0 are treated as 1.0
+ * @param scale wanted sum of coefficients for each filter
+ * @param type 0->cubic, 1->blackman nuttall windowed sinc, any other value->kaiser windowed sinc with beta=type
+ * @return 0 on success, negative on error
+ */
+static int build_filter(FELEM *filter, double factor, int tap_count, int phase_count, int scale, int type){
+    int ph, i;
+    double x, y, w;
+    double *tab = av_malloc_array(tap_count, sizeof(*tab)); /* unnormalized coefficients of the current phase */
+    const int center= (tap_count-1)/2;
+
+    if (!tab)
+        return AVERROR(ENOMEM);
+
+    /* if upsampling, only need to interpolate, no filter */
+    if (factor > 1.0)
+        factor = 1.0;
+
+    for(ph=0;ph<phase_count;ph++) {
+        double norm = 0;
+        for(i=0;i<tap_count;i++) {
+            x = M_PI * ((double)(i - center) - (double)ph / phase_count) * factor;
+            if (x == 0) y = 1.0; /* sinc(0) */
+            else        y = sin(x) / x;
+            switch(type){
+            case 0:{
+                const float d= -0.5; //first order derivative = -0.5
+                x = fabs(((double)(i - center) - (double)ph / phase_count) * factor);
+                if(x<1.0) y= 1 - 3*x*x + 2*x*x*x + d*(            -x*x + x*x*x);
+                else      y=                       d*(-4 + 8*x - 5*x*x + x*x*x);
+                break;}
+            case 1:
+                w = 2.0*x / (factor*tap_count) + M_PI;
+                y *= 0.3635819 - 0.4891775 * cos(w) + 0.1365995 * cos(2*w) - 0.0106411 * cos(3*w);
+                break;
+            default:
+                w = 2.0*x / (factor*tap_count*M_PI); /* tap offset from the center divided by tap_count/2 */
+                y *= bessel(type*sqrt(FFMAX(1-w*w, 0)));
+                break;
+            }
+
+            tab[i] = y;
+            norm += y;
+        }
+
+        /* normalize every phase to the same coefficient sum so that a constant signal keeps its level */
+        for(i=0;i<tap_count;i++) {
+#ifdef CONFIG_RESAMPLE_AUDIOPHILE_KIDDY_MODE
+            filter[ph * tap_count + i] = tab[i] / norm;
+#else
+            filter[ph * tap_count + i] = av_clip(lrintf(tab[i] * scale / norm), FELEM_MIN, FELEM_MAX);
+#endif
+        }
+    }
+#if 0
+    {
+#define LEN 1024
+        int j,k;
+        double sine[LEN + tap_count];
+        double filtered[LEN];
+        double maxff=-2, minff=2, maxsf=-2, minsf=2;
+        for(i=0; i<LEN; i++){
+            double ss=0, sf=0, ff=0;
+            for(j=0; j<LEN+tap_count; j++)
+                sine[j]= cos(i*j*M_PI/LEN);
+            for(j=0; j<LEN; j++){
+                double sum=0;
+                ph=0;
+                for(k=0; k<tap_count; k++)
+                    sum += filter[ph * tap_count + k] * sine[k+j];
+                filtered[j]= sum / (1<<FILTER_SHIFT);
+                ss+= sine[j + center] * sine[j + center];
+                ff+= filtered[j] * filtered[j];
+                sf+= sine[j + center] * filtered[j];
+            }
+            ss= sqrt(2*ss/LEN);
+            ff= sqrt(2*ff/LEN);
+            sf= 2*sf/LEN;
+            maxff= FFMAX(maxff, ff);
+            minff= FFMIN(minff, ff);
+            maxsf= FFMAX(maxsf, sf);
+            minsf= FFMIN(minsf, sf);
+            if(i%11==0){
+                av_log(NULL, AV_LOG_ERROR, "i:%4d ss:%f ff:%13.6e-%13.6e sf:%13.6e-%13.6e\n", i, ss, maxff, minff, maxsf, minsf);
+                minff=minsf= 2;
+                maxff=maxsf= -2;
+            }
+        }
+    }
+#endif
+
+    av_free(tab);
+    return 0;
+}
+
+AVResampleContext *av_resample_init(int out_rate, int in_rate, int filter_size, int phase_shift, int linear, double cutoff){
+    AVResampleContext *c= av_mallocz(sizeof(AVResampleContext));
+    double factor= FFMIN(out_rate * cutoff / in_rate, 1.0); /* output/input rate scaled by cutoff, capped at 1 */
+    int phase_count= 1<<phase_shift;
+
+    if (!c)
+        return NULL;
+
+    c->phase_shift= phase_shift;
+    c->phase_mask= phase_count-1;
+    c->linear= linear;
+
+    c->filter_length= FFMAX((int)ceil(filter_size/factor), 1); /* filter_size stretched by 1/factor, at least one tap */
+    c->filter_bank= av_mallocz_array(c->filter_length, (phase_count+1)*sizeof(FELEM)); /* one filter more than phase_count, filled in below */
+    if (!c->filter_bank)
+        goto error;
+    if (build_filter(c->filter_bank, factor, c->filter_length, phase_count, 1<<FILTER_SHIFT, WINDOW_TYPE))
+        goto error;
+    /* the extra filter is filter 0 rotated by one tap; av_resample() reads it as the filter after the last phase when linear is set */
+    memcpy(&c->filter_bank[c->filter_length*phase_count+1], c->filter_bank, (c->filter_length-1)*sizeof(FELEM));
+    c->filter_bank[c->filter_length*phase_count]= c->filter_bank[c->filter_length - 1];
+    /* src_incr/dst_incr = out_rate/(in_rate*phase_count) in lowest terms; a zero return from av_reduce() is treated as failure */
+    if(!av_reduce(&c->src_incr, &c->dst_incr, out_rate, in_rate * (int64_t)phase_count, INT32_MAX/2))
+        goto error;
+    c->ideal_dst_incr= c->dst_incr;
+
+    c->index= -phase_count*((c->filter_length-1)/2); /* start (filter_length-1)/2 samples before the first input sample */
+
+    return c;
+error: /* free whatever was allocated and report failure as NULL */
+    av_free(c->filter_bank);
+    av_free(c);
+    return NULL;
+}
+
+void av_resample_close(AVResampleContext *c){
+    if (c) av_free(c->filter_bank); /* accept NULL like av_free() instead of dereferencing it */
+    av_free(c);                     /* c is passed by value, so there is no caller pointer to reset */
+}
+
+void av_resample_compensate(AVResampleContext *c, int sample_delta, int compensation_distance){
+    /* Scale dst_incr by (1 - sample_delta/compensation_distance); a distance of 0 restores ideal_dst_incr instead of dividing by zero. */
+    c->compensation_distance= compensation_distance;
+    c->dst_incr = c->ideal_dst_incr - (compensation_distance ? c->ideal_dst_incr * (int64_t)sample_delta / compensation_distance : 0);
+}
+
+int av_resample(AVResampleContext *c, short *dst, short *src, int *consumed, int src_size, int dst_size, int update_ctx){
+    int dst_index, i;
+    int index= c->index;
+    int frac= c->frac;
+    int dst_incr_frac= c->dst_incr % c->src_incr; /* per-sample step split into a fractional part ... */
+    int dst_incr=      c->dst_incr / c->src_incr; /* ... and a whole part added to index */
+    int compensation_distance= c->compensation_distance;
+
+  if(compensation_distance == 0 && c->filter_length == 1 && c->phase_shift==0){ /* single tap, single phase: copy samples, no filtering */
+        int64_t index2= ((int64_t)index)<<32; /* read position in 32.32 fixed point */
+        int64_t incr= (1LL<<32) * c->dst_incr / c->src_incr;
+        dst_size= FFMIN(dst_size, (src_size-1-index) * (int64_t)c->src_incr / c->dst_incr); /* limit output to what the input can supply */
+
+        for(dst_index=0; dst_index < dst_size; dst_index++){
+            dst[dst_index] = src[index2>>32];
+            index2 += incr;
+        }
+        index += dst_index * dst_incr; /* advance index/frac as dst_index individual steps would have */
+        index += (frac + dst_index * (int64_t)dst_incr_frac) / c->src_incr;
+        frac   = (frac + dst_index * (int64_t)dst_incr_frac) % c->src_incr;
+  }else{
+    for(dst_index=0; dst_index < dst_size; dst_index++){
+        FELEM *filter= c->filter_bank + c->filter_length*(index & c->phase_mask); /* filter of the current phase */
+        int sample_index= index >> c->phase_shift; /* first source sample under the filter */
+        FELEM2 val=0;
+
+        if(sample_index < 0){
+            for(i=0; i<c->filter_length; i++)
+                val += src[FFABS(sample_index + i) % src_size] * filter[i]; /* negative positions are mirrored into src */
+        }else if(sample_index + c->filter_length > src_size){
+            break; /* not enough input left for a full filter */
+        }else if(c->linear){
+            FELEM2 v2=0;
+            for(i=0; i<c->filter_length; i++){
+                val += src[sample_index + i] * (FELEM2)filter[i];
+                v2  += src[sample_index + i] * (FELEM2)filter[i + c->filter_length]; /* same samples through the next filter */
+            }
+            val+=(v2-val)*(FELEML)frac / c->src_incr; /* blend the two results by frac/src_incr */
+        }else{
+            for(i=0; i<c->filter_length; i++){
+                val += src[sample_index + i] * (FELEM2)filter[i];
+            }
+        }
+
+#ifdef CONFIG_RESAMPLE_AUDIOPHILE_KIDDY_MODE
+        dst[dst_index] = av_clip_int16(lrintf(val));
+#else
+        val = (val + (1<<(FILTER_SHIFT-1)))>>FILTER_SHIFT; /* remove the filter scale with rounding */
+        dst[dst_index] = (unsigned)(val + 32768) > 65535 ? (val>>31) ^ 32767 : val; /* clip to the int16 range */
+#endif
+
+        frac += dst_incr_frac;
+        index += dst_incr;
+        if(frac >= c->src_incr){ /* carry the fractional overflow into index */
+            frac -= c->src_incr;
+            index++;
+        }
+
+        if(dst_index + 1 == compensation_distance){ /* compensation period over: back to the ideal step */
+            compensation_distance= 0;
+            dst_incr_frac= c->ideal_dst_incr % c->src_incr;
+            dst_incr=      c->ideal_dst_incr / c->src_incr;
+        }
+    }
+  }
+    *consumed= FFMAX(index, 0) >> c->phase_shift; /* whole source samples the read position has passed */
+    if(index>=0) index &= c->phase_mask; /* keep only the phase; a still-negative index is kept as is */
+
+    if(compensation_distance){
+        compensation_distance -= dst_index; /* output samples still to go in the compensation period */
+        av_assert2(compensation_distance > 0);
+    }
+    if(update_ctx){ /* otherwise the context is left untouched */
+        c->frac= frac;
+        c->index= index;
+        c->dst_incr= dst_incr_frac + c->src_incr*dst_incr;
+        c->compensation_distance= compensation_distance;
+    }
+
+    return dst_index; /* number of samples written to dst */
+}
+
+#endif
diff --git a/libavcodec/reverse.c b/libavcodec/reverse.c
new file mode 100644
index 0000000..440bada
--- /dev/null
+++ b/libavcodec/reverse.c
@@ -0,0 +1 @@
+#include "libavutil/reverse.c"
diff --git a/libavcodec/rl.c b/libavcodec/rl.c
index 5bc1f91..b206c6f 100644
--- a/libavcodec/rl.c
+++ b/libavcodec/rl.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -100,9 +100,13 @@ fail:
     return AVERROR(ENOMEM);
 }
 
-av_cold void ff_rl_init_vlc(RLTable *rl)
+av_cold void ff_rl_init_vlc(RLTable *rl, unsigned static_size)
 {
     int i, q;
+    VLC_TYPE table[1500][2] = {{0}};
+    VLC vlc = { .table = table, .table_allocated = static_size };
+    av_assert0(static_size <= FF_ARRAY_ELEMS(table));
+    init_vlc(&vlc, 9, rl->n + 1, &rl->table_vlc[0][1], 4, 2, &rl->table_vlc[0][0], 4, 2, INIT_VLC_USE_NEW_STATIC);
 
     for (q = 0; q < 32; q++) {
         int qmul = q * 2;
@@ -112,9 +116,9 @@ av_cold void ff_rl_init_vlc(RLTable *rl)
             qmul = 1;
             qadd = 0;
         }
-        for (i = 0; i < rl->vlc.table_size; i++) {
-            int code = rl->vlc.table[i][0];
-            int len  = rl->vlc.table[i][1];
+        for (i = 0; i < vlc.table_size; i++) {
+            int code = vlc.table[i][0];
+            int len  = vlc.table[i][1];
             int level, run;
 
             if (len == 0) { // illegal code
diff --git a/libavcodec/rl.h b/libavcodec/rl.h
index e4a622f..af525ef 100644
--- a/libavcodec/rl.h
+++ b/libavcodec/rl.h
@@ -2,20 +2,20 @@
  * Copyright (c) 2000-2002 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -44,7 +44,6 @@ typedef struct RLTable {
     uint8_t *index_run[2];         ///< encoding only
     int8_t *max_level[2];          ///< encoding & decoding
     int8_t *max_run[2];            ///< encoding & decoding
-    VLC vlc;                       ///< decoding only deprecated FIXME remove
     RL_VLC_ELEM *rl_vlc[32];       ///< decoding only
 } RLTable;
 
@@ -53,7 +52,7 @@ typedef struct RLTable {
  *                     the level and run tables, if this is NULL av_malloc() will be used
  */
 int ff_rl_init(RLTable *rl, uint8_t static_store[2][2*MAX_RUN + MAX_LEVEL + 3]);
-void ff_rl_init_vlc(RLTable *rl);
+void ff_rl_init_vlc(RLTable *rl, unsigned static_size);
 
 /**
  * Free the contents of a dynamically allocated table.
@@ -64,15 +63,12 @@ void ff_rl_free(RLTable *rl);
 {\
     int q;\
     static RL_VLC_ELEM rl_vlc_table[32][static_size];\
-    INIT_VLC_STATIC(&rl.vlc, 9, rl.n + 1,\
-             &rl.table_vlc[0][1], 4, 2,\
-             &rl.table_vlc[0][0], 4, 2, static_size);\
 \
     if(!rl.rl_vlc[0]){\
         for(q=0; q<32; q++)\
             rl.rl_vlc[q]= rl_vlc_table[q];\
 \
-        ff_rl_init_vlc(&rl);\
+        ff_rl_init_vlc(&rl, static_size);\
     }\
 }
 
diff --git a/libavcodec/rl2.c b/libavcodec/rl2.c
index c42a1cd..6662979 100644
--- a/libavcodec/rl2.c
+++ b/libavcodec/rl2.c
@@ -2,20 +2,20 @@
  * RL2 Video Decoder
  * Copyright (C) 2008 Sascha Sommer (saschasommer@freenet.de)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -155,7 +155,7 @@ static av_cold int rl2_decode_init(AVCodecContext *avctx)
 
     /** initialize palette */
     for (i = 0; i < AVPALETTE_COUNT; i++)
-        s->palette[i] = AV_RB24(&avctx->extradata[6 + i * 3]);
+        s->palette[i] = 0xFFU << 24 | AV_RB24(&avctx->extradata[6 + i * 3]);
 
     /** decode background frame if present */
     back_size = avctx->extradata_size - EXTRADATA1_SIZE;
@@ -181,10 +181,8 @@ static int rl2_decode_frame(AVCodecContext *avctx,
     int ret, buf_size  = avpkt->size;
     Rl2Context *s = avctx->priv_data;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     /** run length decode */
     rl2_rle_decode(s, buf, buf_size, frame->data[0], frame->linesize[0],
@@ -209,7 +207,7 @@ static av_cold int rl2_decode_end(AVCodecContext *avctx)
 {
     Rl2Context *s = avctx->priv_data;
 
-    av_free(s->back_frame);
+    av_freep(&s->back_frame);
 
     return 0;
 }
diff --git a/libavcodec/rle.c b/libavcodec/rle.c
index 8a2d922..7924ea7 100644
--- a/libavcodec/rle.c
+++ b/libavcodec/rle.c
@@ -2,20 +2,20 @@
  * RLE encoder
  * Copyright (c) 2007 Bobby Bingham
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include "avcodec.h"
diff --git a/libavcodec/rle.h b/libavcodec/rle.h
index c967764..cb51624 100644
--- a/libavcodec/rle.h
+++ b/libavcodec/rle.h
@@ -1,20 +1,20 @@
 /*
  * RLE encoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rnd_avg.h b/libavcodec/rnd_avg.h
index 412cda5..344775e 100644
--- a/libavcodec/rnd_avg.h
+++ b/libavcodec/rnd_avg.h
@@ -1,18 +1,21 @@
 /*
- * This file is part of Libav.
+ * Copyright (c) 2001-2003 BERO <bero@geocities.co.jp>
+ * Copyright (c) 2011 Oskar Arvidsson
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/roqaudioenc.c b/libavcodec/roqaudioenc.c
index f687f5c..5154604 100644
--- a/libavcodec/roqaudioenc.c
+++ b/libavcodec/roqaudioenc.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2005 Eric Lasota
  *    Based on RoQ specs (c)2001 Tim Ferguson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -160,10 +160,8 @@ static int roq_dpcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
     else
         data_size = avctx->channels * avctx->frame_size;
 
-    if ((ret = ff_alloc_packet(avpkt, ROQ_HEADER_SIZE + data_size))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, ROQ_HEADER_SIZE + data_size, 0)) < 0)
         return ret;
-    }
     out = avpkt->data;
 
     bytestream_put_byte(&out, stereo ? 0x21 : 0x20);
diff --git a/libavcodec/roqvideo.c b/libavcodec/roqvideo.c
index b0fd6ba..8eda93c 100644
--- a/libavcodec/roqvideo.c
+++ b/libavcodec/roqvideo.c
@@ -2,20 +2,20 @@
  * Copyright (C) 2003 Mike Melanson
  * Copyright (C) 2003 Dr. Tim Ferguson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/roqvideo.h b/libavcodec/roqvideo.h
index 3f00022..3da6eaa 100644
--- a/libavcodec/roqvideo.h
+++ b/libavcodec/roqvideo.h
@@ -2,20 +2,20 @@
  * Copyright (C) 2003 Mike Melanson
  * Copyright (C) 2003 Dr. Tim Ferguson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,6 +43,7 @@ struct RoqTempData;
 
 typedef struct RoqContext {
 
+    const AVClass *class;
     AVCodecContext *avctx;
     AVFrame *last_frame;
     AVFrame *current_frame;
@@ -69,6 +70,9 @@ typedef struct RoqContext {
     const AVFrame *frame_to_enc;
     uint8_t *out_buf;
     struct RoqTempData *tmpData;
+
+    int quake3_compat; // Quake 3 compatibility option
+
 } RoqContext;
 
 #define RoQ_INFO              0x1001
diff --git a/libavcodec/roqvideodec.c b/libavcodec/roqvideodec.c
index 4f778dc..4c5dec5 100644
--- a/libavcodec/roqvideodec.c
+++ b/libavcodec/roqvideodec.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,10 +25,7 @@
  *   http://www.csse.monash.edu.au/~timf/
  */
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
+#include "libavutil/avassert.h"
 #include "libavutil/imgutils.h"
 
 #include "avcodec.h"
@@ -74,9 +71,19 @@ static void roqvideo_decode_frame(RoqContext *ri)
 
     chunk_start = bytestream2_tell(&ri->gb);
     xpos = ypos = 0;
+
+    if (chunk_size > bytestream2_get_bytes_left(&ri->gb)) {
+        av_log(ri->avctx, AV_LOG_ERROR, "Chunk does not fit in input buffer\n");
+        chunk_size = bytestream2_get_bytes_left(&ri->gb);
+    }
+
     while (bytestream2_tell(&ri->gb) < chunk_start + chunk_size) {
         for (yp = ypos; yp < ypos + 16; yp += 8)
             for (xp = xpos; xp < xpos + 16; xp += 8) {
+                if (bytestream2_tell(&ri->gb) >= chunk_start + chunk_size) {
+                    av_log(ri->avctx, AV_LOG_VERBOSE, "Chunk is too short\n");
+                    return;
+                }
                 if (vqflg_pos < 0) {
                     vqflg = bytestream2_get_le16(&ri->gb);
                     vqflg_pos = 7;
@@ -108,6 +115,10 @@ static void roqvideo_decode_frame(RoqContext *ri)
                         if(k & 0x01) x += 4;
                         if(k & 0x02) y += 4;
 
+                        if (bytestream2_tell(&ri->gb) >= chunk_start + chunk_size) {
+                            av_log(ri->avctx, AV_LOG_VERBOSE, "Chunk is too short\n");
+                            return;
+                        }
                         if (vqflg_pos < 0) {
                             vqflg = bytestream2_get_le16(&ri->gb);
                             vqflg_pos = 7;
@@ -142,7 +153,7 @@ static void roqvideo_decode_frame(RoqContext *ri)
                     }
                     break;
                 default:
-                    av_log(ri->avctx, AV_LOG_ERROR, "Unknown vq code: %d\n", vqid);
+                    av_assert2(0);
             }
         }
 
@@ -180,7 +191,8 @@ static av_cold int roq_decode_init(AVCodecContext *avctx)
         return AVERROR(ENOMEM);
     }
 
-    avctx->pix_fmt = AV_PIX_FMT_YUV444P;
+    avctx->pix_fmt = AV_PIX_FMT_YUVJ444P;
+    avctx->color_range = AVCOL_RANGE_JPEG;
 
     return 0;
 }
@@ -195,10 +207,8 @@ static int roq_decode_frame(AVCodecContext *avctx,
     int copy = !s->current_frame->data[0] && s->last_frame->data[0];
     int ret;
 
-    if ((ret = ff_reget_buffer(avctx, s->current_frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "  RoQ: get_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->current_frame)) < 0)
         return ret;
-    }
 
     if (copy) {
         ret = av_frame_copy(s->current_frame, s->last_frame);
diff --git a/libavcodec/roqvideoenc.c b/libavcodec/roqvideoenc.c
index eb02166..038a63d 100644
--- a/libavcodec/roqvideoenc.c
+++ b/libavcodec/roqvideoenc.c
@@ -5,27 +5,27 @@
  * Copyright (C) 2004-2007 Eric Lasota
  *    Based on RoQ specs (C) 2001 Tim Ferguson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
  * id RoQ encoder by Vitor. Based on the Switchblade3 library and the
- * Switchblade3 Libav glue by Eric Lasota.
+ * Switchblade3 FFmpeg glue by Eric Lasota.
  */
 
 /*
@@ -57,6 +57,7 @@
 #include <string.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/opt.h"
 #include "roqvideo.h"
 #include "bytestream.h"
 #include "elbg.h"
@@ -69,7 +70,7 @@
  * Maximum number of generated 4x4 codebooks. Can't be 256 to workaround a
  * Quake 3 bug.
  */
-#define MAX_CBS_4x4 255
+#define MAX_CBS_4x4 256
 
 #define MAX_CBS_2x2 256 ///< Maximum number of 2x2 codebooks.
 
@@ -245,7 +246,7 @@ static int create_cel_evals(RoqContext *enc, RoqTempdata *tempData)
 {
     int n=0, x, y, i;
 
-    tempData->cel_evals = av_malloc(enc->width*enc->height/64 * sizeof(CelEvaluation));
+    tempData->cel_evals = av_malloc_array(enc->width*enc->height/64, sizeof(CelEvaluation));
     if (!tempData->cel_evals)
         return AVERROR(ENOMEM);
 
@@ -541,7 +542,7 @@ static void remap_codebooks(RoqContext *enc, RoqTempdata *tempData)
     int i, j, idx=0;
 
     /* Make remaps for the final codebook usage */
-    for (i=0; i<MAX_CBS_4x4; i++) {
+    for (i=0; i<(enc->quake3_compat ? MAX_CBS_4x4-1 : MAX_CBS_4x4); i++) {
         if (tempData->codebooks.usedCB4[i]) {
             tempData->i2f4[i] = idx;
             tempData->f2i4[idx] = i;
@@ -798,14 +799,14 @@ static int generate_codebook(RoqContext *enc, RoqTempdata *tempdata,
     int i, j, k, ret = 0;
     int c_size = size*size/4;
     int *buf;
-    int *codebook = av_malloc(6*c_size*cbsize*sizeof(int));
+    int *codebook = av_malloc_array(6*c_size, cbsize*sizeof(int));
     int *closest_cb;
 
     if (!codebook)
         return AVERROR(ENOMEM);
 
     if (size == 4) {
-        closest_cb = av_malloc(6*c_size*inputCount*sizeof(int));
+        closest_cb = av_malloc_array(6*c_size, inputCount*sizeof(int));
         if (!closest_cb) {
             ret = AVERROR(ENOMEM);
             goto out;
@@ -813,11 +814,11 @@ static int generate_codebook(RoqContext *enc, RoqTempdata *tempdata,
     } else
         closest_cb = tempdata->closest_cb2;
 
-    ret = ff_init_elbg(points, 6 * c_size, inputCount, codebook,
+    ret = avpriv_init_elbg(points, 6 * c_size, inputCount, codebook,
                        cbsize, 1, closest_cb, &enc->randctx);
     if (ret < 0)
         goto out;
-    ret = ff_do_elbg(points, 6 * c_size, inputCount, codebook,
+    ret = avpriv_do_elbg(points, 6 * c_size, inputCount, codebook,
                      cbsize, 1, closest_cb, &enc->randctx);
     if (ret < 0)
         goto out;
@@ -846,8 +847,8 @@ static int generate_new_codebooks(RoqContext *enc, RoqTempdata *tempData)
     int max = enc->width*enc->height/16;
     uint8_t mb2[3*4];
     roq_cell *results4 = av_malloc(sizeof(roq_cell)*MAX_CBS_4x4*4);
-    uint8_t *yuvClusters=av_malloc(sizeof(int)*max*6*4);
-    int *points = av_malloc(max*6*4*sizeof(int));
+    uint8_t *yuvClusters=av_malloc_array(max, sizeof(int)*6*4);
+    int *points = av_malloc_array(max, 6*4*sizeof(int));
     int bias;
 
     if (!results4 || !yuvClusters || !points) {
@@ -866,12 +867,12 @@ static int generate_new_codebooks(RoqContext *enc, RoqTempdata *tempData)
 
     /* Create 4x4 codebooks */
     if ((ret = generate_codebook(enc, tempData, points, max,
-                                 results4, 4, MAX_CBS_4x4)) < 0)
+                                 results4, 4, (enc->quake3_compat ? MAX_CBS_4x4-1 : MAX_CBS_4x4))) < 0)
         goto out;
 
-    codebooks->numCB4 = MAX_CBS_4x4;
+    codebooks->numCB4 = (enc->quake3_compat ? MAX_CBS_4x4-1 : MAX_CBS_4x4);
 
-    tempData->closest_cb2 = av_malloc(max*4*sizeof(int));
+    tempData->closest_cb2 = av_malloc_array(max, 4*sizeof(int));
     if (!tempData->closest_cb2) {
         ret = AVERROR(ENOMEM);
         goto out;
@@ -932,10 +933,14 @@ static int roq_encode_video(RoqContext *enc)
         gather_data_for_cel(tempData->cel_evals + i, enc, tempData);
 
     /* Quake 3 can't handle chunks bigger than 65535 bytes */
-    if (tempData->mainChunkSize/8 > 65535) {
+    if (tempData->mainChunkSize/8 > 65535 && enc->quake3_compat) {
+        if (enc->lambda > 100000) {
+            av_log(enc->avctx, AV_LOG_ERROR, "Cannot encode video in Quake compatible form\n");
+            return AVERROR(EINVAL);
+        }
         av_log(enc->avctx, AV_LOG_ERROR,
-               "Warning, generated a frame too big (%d > 65535), "
-               "try using a smaller qscale value.\n",
+               "Warning, generated a frame too big for Quake (%d > 65535), "
+               "now switching to a bigger qscale value.\n",
                tempData->mainChunkSize/8);
         enc->lambda *= 1.5;
         tempData->mainChunkSize = 0;
@@ -960,8 +965,8 @@ static int roq_encode_video(RoqContext *enc)
     FFSWAP(motion_vect *, enc->last_motion4, enc->this_motion4);
     FFSWAP(motion_vect *, enc->last_motion8, enc->this_motion8);
 
-    av_free(tempData->cel_evals);
-    av_free(tempData->closest_cb2);
+    av_freep(&tempData->cel_evals);
+    av_freep(&tempData->closest_cb2);
 
     enc->framesSinceKeyframe++;
 
@@ -975,11 +980,11 @@ static av_cold int roq_encode_end(AVCodecContext *avctx)
     av_frame_free(&enc->current_frame);
     av_frame_free(&enc->last_frame);
 
-    av_free(enc->tmpData);
-    av_free(enc->this_motion4);
-    av_free(enc->last_motion4);
-    av_free(enc->this_motion8);
-    av_free(enc->last_motion8);
+    av_freep(&enc->tmpData);
+    av_freep(&enc->this_motion4);
+    av_freep(&enc->last_motion4);
+    av_freep(&enc->this_motion8);
+    av_freep(&enc->last_motion8);
 
     return 0;
 }
@@ -995,11 +1000,16 @@ static av_cold int roq_encode_init(AVCodecContext *avctx)
     enc->framesSinceKeyframe = 0;
     if ((avctx->width & 0xf) || (avctx->height & 0xf)) {
         av_log(avctx, AV_LOG_ERROR, "Dimensions must be divisible by 16\n");
-        return -1;
+        return AVERROR(EINVAL);
+    }
+
+    if (avctx->width > 65535 || avctx->height > 65535) {
+        av_log(avctx, AV_LOG_ERROR, "Dimensions are max %d\n", enc->quake3_compat ? 32768 : 65535);
+        return AVERROR(EINVAL);
     }
 
     if (((avctx->width)&(avctx->width-1))||((avctx->height)&(avctx->height-1)))
-        av_log(avctx, AV_LOG_ERROR, "Warning: dimensions not power of two\n");
+        av_log(avctx, AV_LOG_ERROR, "Warning: dimensions not power of two, this is not supported by quake\n");
 
     enc->width = avctx->width;
     enc->height = avctx->height;
@@ -1017,16 +1027,22 @@ static av_cold int roq_encode_init(AVCodecContext *avctx)
     enc->tmpData      = av_malloc(sizeof(RoqTempdata));
 
     enc->this_motion4 =
-        av_mallocz((enc->width*enc->height/16)*sizeof(motion_vect));
+        av_mallocz_array((enc->width*enc->height/16), sizeof(motion_vect));
 
     enc->last_motion4 =
-        av_malloc ((enc->width*enc->height/16)*sizeof(motion_vect));
+        av_malloc_array ((enc->width*enc->height/16), sizeof(motion_vect));
 
     enc->this_motion8 =
-        av_mallocz((enc->width*enc->height/64)*sizeof(motion_vect));
+        av_mallocz_array((enc->width*enc->height/64), sizeof(motion_vect));
 
     enc->last_motion8 =
-        av_malloc ((enc->width*enc->height/64)*sizeof(motion_vect));
+        av_malloc_array ((enc->width*enc->height/64), sizeof(motion_vect));
+
+    if (!enc->tmpData || !enc->this_motion4 || !enc->last_motion4 ||
+        !enc->this_motion8 || !enc->last_motion8) {
+        roq_encode_end(avctx);
+        return AVERROR(ENOMEM);
+    }
 
     return 0;
 }
@@ -1074,10 +1090,8 @@ static int roq_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     /* 138 bits max per 8x8 block +
      *     256 codebooks*(6 bytes 2x2 + 4 bytes 4x4) + 8 bytes frame header */
     size = ((enc->width * enc->height / 64) * 138 + 7) / 8 + 256 * (6 + 4) + 8;
-    if ((ret = ff_alloc_packet(pkt, size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet with size %d.\n", size);
+    if ((ret = ff_alloc_packet2(avctx, pkt, size, 0)) < 0)
         return ret;
-    }
     enc->out_buf = pkt->data;
 
     /* Check for I-frame */
@@ -1087,11 +1101,9 @@ static int roq_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     if (enc->first_frame) {
         /* Alloc memory for the reconstruction data (we must know the stride
          for that) */
-        if (ff_get_buffer(avctx, enc->current_frame, 0) ||
-            ff_get_buffer(avctx, enc->last_frame, 0)) {
-            av_log(avctx, AV_LOG_ERROR, "  RoQ: get_buffer() failed\n");
-            return -1;
-        }
+        if ((ret = ff_get_buffer(avctx, enc->current_frame, 0)) < 0 ||
+            (ret = ff_get_buffer(avctx, enc->last_frame,    0)) < 0)
+            return ret;
 
         /* Before the first video frame, write a "video info" chunk */
         roq_write_video_info_chunk(enc);
@@ -1112,6 +1124,20 @@ static int roq_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     return 0;
 }
 
+#define OFFSET(x) offsetof(RoqContext, x)
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption options[] = {
+    { "quake3_compat", "Whether to respect known limitations in Quake 3 decoder", OFFSET(quake3_compat), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, VE },
+    { NULL },
+};
+
+static const AVClass roq_class = {
+    .class_name = "RoQ",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_roq_encoder = {
     .name                 = "roqvideo",
     .long_name            = NULL_IF_CONFIG_SMALL("id RoQ video"),
@@ -1121,7 +1147,7 @@ AVCodec ff_roq_encoder = {
     .init                 = roq_encode_init,
     .encode2              = roq_encode_frame,
     .close                = roq_encode_end,
-    .supported_framerates = (const AVRational[]){ {30,1}, {0,0} },
-    .pix_fmts             = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUV444P,
+    .pix_fmts             = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUVJ444P,
                                                         AV_PIX_FMT_NONE },
+    .priv_class     = &roq_class,
 };
diff --git a/libavcodec/rpza.c b/libavcodec/rpza.c
index f3f3fbc..b71ebd1 100644
--- a/libavcodec/rpza.c
+++ b/libavcodec/rpza.c
@@ -2,20 +2,20 @@
  * Quicktime Video (RPZA) Video Decoder
  * Copyright (C) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -96,8 +96,12 @@ static int rpza_decode_stream(RpzaContext *s)
     chunk_size = bytestream2_get_be32(&s->gb) & 0x00FFFFFF;
 
     /* If length mismatch use size from MOV file and try to decode anyway */
-    if (chunk_size != bytestream2_get_bytes_left(&s->gb) - 4)
-        av_log(s->avctx, AV_LOG_WARNING, "MOV chunk size != encoded chunk size\n");
+    if (chunk_size != bytestream2_get_bytes_left(&s->gb) + 4)
+        av_log(s->avctx, AV_LOG_WARNING,
+               "MOV chunk size %d != encoded chunk size %d\n",
+               chunk_size,
+               bytestream2_get_bytes_left(&s->gb) + 4
+              );
 
     /* Number of 4x4 blocks in frame. */
     total_blocks = ((s->avctx->width + 3) / 4) * ((s->avctx->height + 3) / 4);
@@ -252,10 +256,8 @@ static int rpza_decode_frame(AVCodecContext *avctx,
 
     bytestream2_init(&s->gb, avpkt->data, avpkt->size);
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
     ret = rpza_decode_stream(s);
     if (ret < 0)
diff --git a/libavcodec/rscc.c b/libavcodec/rscc.c
index 4a91783..fe0df2e 100644
--- a/libavcodec/rscc.c
+++ b/libavcodec/rscc.c
@@ -2,20 +2,20 @@
  * innoHeim/Rsupport Screen Capture Codec
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -106,8 +106,9 @@ static av_cold int rscc_init(AVCodecContext *avctx)
             return AVERROR_INVALIDDATA;
         }
     } else {
-        av_log(avctx, AV_LOG_ERROR, "Invalid codec tag\n");
-        return AVERROR_INVALIDDATA;
+        avctx->pix_fmt = AV_PIX_FMT_BGR0;
+        ctx->component_size = 4;
+        av_log(avctx, AV_LOG_WARNING, "Invalid codec tag\n");
     }
 
     /* Store the value to check for keyframes */
@@ -247,11 +248,27 @@ static int rscc_decode_frame(AVCodecContext *avctx, void *data,
 
     ff_dlog(avctx, "pixel_size %d packed_size %d.\n", pixel_size, packed_size);
 
+    if (packed_size < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid tile size %d\n", packed_size);
+        ret = AVERROR_INVALIDDATA;
+        goto end;
+    }
+
     /* Get pixels buffer, it may be deflated or just raw */
     if (pixel_size == packed_size) {
+        if (bytestream2_get_bytes_left(gbc) < pixel_size) {
+            av_log(avctx, AV_LOG_ERROR, "Insufficient input for %d\n", pixel_size);
+            ret = AVERROR_INVALIDDATA;
+            goto end;
+        }
         pixels = gbc->buffer;
     } else {
         uLongf len = ctx->inflated_size;
+        if (bytestream2_get_bytes_left(gbc) < packed_size) {
+            av_log(avctx, AV_LOG_ERROR, "Insufficient input for %d\n", packed_size);
+            ret = AVERROR_INVALIDDATA;
+            goto end;
+        }
         ret = uncompress(ctx->inflated_buf, &len, gbc->buffer, packed_size);
         if (ret) {
             av_log(avctx, AV_LOG_ERROR, "Pixel deflate error %d.\n", ret);
diff --git a/libavcodec/rtjpeg.c b/libavcodec/rtjpeg.c
index 67eeff8..8e02bce 100644
--- a/libavcodec/rtjpeg.c
+++ b/libavcodec/rtjpeg.c
@@ -2,20 +2,20 @@
  * RTJpeg decoding functions
  * Copyright (c) 2006 Reimar Doeffinger
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include "libavutil/common.h"
diff --git a/libavcodec/rtjpeg.h b/libavcodec/rtjpeg.h
index cd30079..d22ff40 100644
--- a/libavcodec/rtjpeg.h
+++ b/libavcodec/rtjpeg.h
@@ -2,20 +2,20 @@
  * RTJpeg decoding functions
  * copyright (c) 2006 Reimar Doeffinger
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv10.c b/libavcodec/rv10.c
index 3b5f4df..81aa9ae 100644
--- a/libavcodec/rv10.c
+++ b/libavcodec/rv10.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000,2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -309,7 +309,7 @@ static int rv20_decode_picture_header(RVDecContext *rv)
 {
     MpegEncContext *s = &rv->m;
     int seq, mb_pos, i, ret;
-    int rpr_bits;
+    int rpr_max;
 
     i = get_bits(&s->gb, 2);
     switch (i) {
@@ -330,6 +330,10 @@ static int rv20_decode_picture_header(RVDecContext *rv)
         return AVERROR_INVALIDDATA;
     }
 
+    if (s->low_delay && s->pict_type == AV_PICTURE_TYPE_B) {
+        av_log(s->avctx, AV_LOG_ERROR, "low delay B\n");
+        return -1;
+    }
     if (!s->last_picture_ptr && s->pict_type == AV_PICTURE_TYPE_B) {
         av_log(s->avctx, AV_LOG_ERROR, "early B-frame\n");
         return AVERROR_INVALIDDATA;
@@ -347,17 +351,17 @@ static int rv20_decode_picture_header(RVDecContext *rv)
     }
 
     if (RV_GET_MINOR_VER(rv->sub_id) >= 2)
-        s->loop_filter = get_bits1(&s->gb);
+        s->loop_filter = get_bits1(&s->gb) && !s->avctx->lowres;
 
     if (RV_GET_MINOR_VER(rv->sub_id) <= 1)
         seq = get_bits(&s->gb, 8) << 7;
     else
         seq = get_bits(&s->gb, 13) << 2;
 
-    rpr_bits = s->avctx->extradata[1] & 7;
-    if (rpr_bits) {
+    rpr_max = s->avctx->extradata[1] & 7;
+    if (rpr_max) {
         int f, new_w, new_h;
-        rpr_bits = FFMIN((rpr_bits >> 1) + 1, 3);
+        int rpr_bits = av_log2(rpr_max) + 1;
 
         f = get_bits(&s->gb, rpr_bits);
 
@@ -374,10 +378,21 @@ static int rv20_decode_picture_header(RVDecContext *rv)
             new_h = rv->orig_height;
         }
         if (new_w != s->width || new_h != s->height) {
+            AVRational old_aspect = s->avctx->sample_aspect_ratio;
             av_log(s->avctx, AV_LOG_DEBUG,
                    "attempting to change resolution to %dx%d\n", new_w, new_h);
+            if (av_image_check_size(new_w, new_h, 0, s->avctx) < 0)
+                return AVERROR_INVALIDDATA;
             ff_mpv_common_end(s);
 
+            // attempt to keep aspect during typical resolution switches
+            if (!old_aspect.num)
+                old_aspect = (AVRational){1, 1};
+            if (2 * new_w * s->height == new_h * s->width)
+                s->avctx->sample_aspect_ratio = av_mul_q(old_aspect, (AVRational){2, 1});
+            if (new_w * s->height == 2 * new_h * s->width)
+                s->avctx->sample_aspect_ratio = av_mul_q(old_aspect, (AVRational){1, 2});
+
             ret = ff_set_dimensions(s->avctx, new_w, new_h);
             if (ret < 0)
                 return ret;
@@ -389,9 +404,10 @@ static int rv20_decode_picture_header(RVDecContext *rv)
         }
 
         if (s->avctx->debug & FF_DEBUG_PICT_INFO) {
-            av_log(s->avctx, AV_LOG_DEBUG, "F %d/%d\n", f, rpr_bits);
+            av_log(s->avctx, AV_LOG_DEBUG, "F %d/%d/%d\n", f, rpr_bits, rpr_max);
         }
-    } else if (av_image_check_size(s->width, s->height, 0, s->avctx) < 0)
+    }
+    if (av_image_check_size(s->width, s->height, 0, s->avctx) < 0)
         return AVERROR_INVALIDDATA;
 
     mb_pos = ff_h263_decode_mba(s);
@@ -410,15 +426,17 @@ static int rv20_decode_picture_header(RVDecContext *rv)
         } else {
             s->time    = seq;
             s->pb_time = s->pp_time - (s->last_non_b_time - s->time);
-            if (s->pp_time <= s->pb_time ||
-                s->pp_time <= s->pp_time - s->pb_time || s->pp_time <= 0) {
-                av_log(s->avctx, AV_LOG_DEBUG, "messed up order, possible "
-                       "from seeking? skipping current B-frame\n");
-                return FRAME_SKIPPED;
-            }
-            ff_mpeg4_init_direct_mv(s);
         }
     }
+    if (s->pict_type == AV_PICTURE_TYPE_B) {
+        if (s->pp_time <=s->pb_time || s->pp_time <= s->pp_time - s->pb_time || s->pp_time<=0) {
+            av_log(s->avctx, AV_LOG_DEBUG,
+                   "messed up order, possible from seeking? skipping current B-frame\n");
+#define ERROR_SKIP_FRAME -123
+            return ERROR_SKIP_FRAME;
+        }
+        ff_mpeg4_init_direct_mv(s);
+    }
 
     s->no_rounding = get_bits1(&s->gb);
 
@@ -430,7 +448,8 @@ static int rv20_decode_picture_header(RVDecContext *rv)
     s->unrestricted_mv = 1;
     s->h263_aic        = s->pict_type == AV_PICTURE_TYPE_I;
     s->modified_quant  = 1;
-    s->loop_filter     = 1;
+    if (!s->avctx->lowres)
+        s->loop_filter = 1;
 
     if (s->avctx->debug & FF_DEBUG_PICT_INFO) {
         av_log(s->avctx, AV_LOG_INFO,
@@ -439,7 +458,7 @@ static int rv20_decode_picture_header(RVDecContext *rv)
                s->no_rounding);
     }
 
-    assert(s->pict_type != AV_PICTURE_TYPE_B || !s->low_delay);
+    av_assert0(s->pict_type != AV_PICTURE_TYPE_B || !s->low_delay);
 
     return s->mb_width * s->mb_height - mb_pos;
 }
@@ -460,10 +479,9 @@ static av_cold int rv10_decode_init(AVCodecContext *avctx)
         return ret;
 
     ff_mpv_decode_defaults(s);
+    ff_mpv_decode_init(s, avctx);
 
-    s->avctx       = avctx;
     s->out_format  = FMT_H263;
-    s->codec_id    = avctx->codec_id;
 
     rv->orig_width  =
     s->width        = avctx->coded_width;
@@ -496,8 +514,8 @@ static av_cold int rv10_decode_init(AVCodecContext *avctx)
     }
 
     if (avctx->debug & FF_DEBUG_PICT_INFO) {
-        av_log(avctx, AV_LOG_DEBUG, "ver:%X ver0:%X\n", rv->sub_id,
-               avctx->extradata_size >= 4 ? ((int *) avctx->extradata)[0] : -1);
+        av_log(avctx, AV_LOG_DEBUG, "ver:%X ver0:%"PRIX32"\n", rv->sub_id,
+               ((uint32_t *) avctx->extradata)[0]);
     }
 
     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
@@ -545,7 +563,8 @@ static int rv10_decode_packet(AVCodecContext *avctx, const uint8_t *buf,
     else
         mb_count = rv20_decode_picture_header(rv);
     if (mb_count < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "HEADER ERROR\n");
+        if (mb_count != ERROR_SKIP_FRAME)
+            av_log(s->avctx, AV_LOG_ERROR, "HEADER ERROR\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -578,6 +597,7 @@ static int rv10_decode_packet(AVCodecContext *avctx, const uint8_t *buf,
         }
     }
 
+
     ff_dlog(avctx, "qscale=%d\n", s->qscale);
 
     /* default quantization values */
@@ -618,7 +638,7 @@ static int rv10_decode_packet(AVCodecContext *avctx, const uint8_t *buf,
     for (s->mb_num_left = mb_count; s->mb_num_left > 0; s->mb_num_left--) {
         int ret;
         ff_update_block_index(s);
-        ff_dlog(avctx, "**mb x=%d y=%d\n", s->mb_x, s->mb_y);
+        ff_tlog(avctx, "**mb x=%d y=%d\n", s->mb_x, s->mb_y);
 
         s->mv_dir  = MV_DIR_FORWARD;
         s->mv_type = MV_TYPE_16X16;
@@ -748,11 +768,13 @@ static int rv10_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         if (s->pict_type == AV_PICTURE_TYPE_B || s->low_delay) {
             if ((ret = av_frame_ref(pict, s->current_picture_ptr->f)) < 0)
                 return ret;
-            ff_print_debug_info(s, s->current_picture_ptr);
+            ff_print_debug_info(s, s->current_picture_ptr, pict);
+            ff_mpv_export_qp_table(s, pict, s->current_picture_ptr, FF_QSCALE_TYPE_MPEG1);
         } else if (s->last_picture_ptr) {
             if ((ret = av_frame_ref(pict, s->last_picture_ptr->f)) < 0)
                 return ret;
-            ff_print_debug_info(s, s->last_picture_ptr);
+            ff_print_debug_info(s, s->last_picture_ptr, pict);
+            ff_mpv_export_qp_table(s, pict,s->last_picture_ptr, FF_QSCALE_TYPE_MPEG1);
         }
 
         if (s->last_picture_ptr || s->low_delay) {
@@ -776,6 +798,7 @@ AVCodec ff_rv10_decoder = {
     .close          = rv10_decode_end,
     .decode         = rv10_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_YUV420P,
         AV_PIX_FMT_NONE
@@ -793,6 +816,7 @@ AVCodec ff_rv20_decoder = {
     .decode         = rv10_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY,
     .flush          = ff_mpeg_flush,
+    .max_lowres     = 3,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_YUV420P,
         AV_PIX_FMT_NONE
diff --git a/libavcodec/rv10.h b/libavcodec/rv10.h
index b44bc1f..364270e 100644
--- a/libavcodec/rv10.h
+++ b/libavcodec/rv10.h
@@ -1,20 +1,20 @@
 /*
  * RV10/RV20 decoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv10enc.c b/libavcodec/rv10enc.c
index 765c57d..8691d18 100644
--- a/libavcodec/rv10enc.c
+++ b/libavcodec/rv10enc.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000,2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv20enc.c b/libavcodec/rv20enc.c
index 20090b1..81fb4fc 100644
--- a/libavcodec/rv20enc.c
+++ b/libavcodec/rv20enc.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000,2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,12 +43,12 @@ void ff_rv20_encode_picture_header(MpegEncContext *s, int picture_number){
 
     put_bits(&s->pb, 1, s->no_rounding);
 
-    assert(s->f_code == 1);
-    assert(s->unrestricted_mv == 0);
-    assert(s->alt_inter_vlc == 0);
-    assert(s->umvplus == 0);
-    assert(s->modified_quant==1);
-    assert(s->loop_filter==1);
+    av_assert0(s->f_code == 1);
+    av_assert0(s->unrestricted_mv == 0);
+    av_assert0(s->alt_inter_vlc == 0);
+    av_assert0(s->umvplus == 0);
+    av_assert0(s->modified_quant==1);
+    av_assert0(s->loop_filter==1);
 
     s->h263_aic= s->pict_type == AV_PICTURE_TYPE_I;
     if(s->h263_aic){
diff --git a/libavcodec/rv30.c b/libavcodec/rv30.c
index bf22df5..3b9868c 100644
--- a/libavcodec/rv30.c
+++ b/libavcodec/rv30.c
@@ -2,20 +2,20 @@
  * RV30 decoder
  * Copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -51,8 +51,13 @@ static int rv30_parse_slice_header(RV34DecContext *r, GetBitContext *gb, SliceIn
     si->quant = get_bits(gb, 5);
     skip_bits1(gb);
     si->pts = get_bits(gb, 13);
-    rpr = get_bits(gb, r->rpr);
+    rpr = get_bits(gb, av_log2(r->max_rpr) + 1);
     if(rpr){
+        if (rpr > r->max_rpr) {
+            av_log(avctx, AV_LOG_ERROR, "rpr too large\n");
+            return AVERROR_INVALIDDATA;
+        }
+
         if (avctx->extradata_size < rpr * 2 + 8) {
             av_log(avctx, AV_LOG_ERROR,
                    "Insufficient extradata - need at least %d bytes, got %d\n",
@@ -62,6 +67,9 @@ static int rv30_parse_slice_header(RV34DecContext *r, GetBitContext *gb, SliceIn
 
         w = r->s.avctx->extradata[6 + rpr*2] << 2;
         h = r->s.avctx->extradata[7 + rpr*2] << 2;
+    } else {
+        w = r->orig_width;
+        h = r->orig_height;
     }
     si->width  = w;
     si->height = h;
@@ -82,7 +90,7 @@ static int rv30_decode_intra_types(RV34DecContext *r, GetBitContext *gb, int8_t
     for(i = 0; i < 4; i++, dst += r->intra_types_stride - 4){
         for(j = 0; j < 4; j+= 2){
             unsigned code = svq3_get_ue_golomb(gb) << 1;
-            if(code >= 81*2){
+            if (code > 80U*2U) {
                 av_log(r->s.avctx, AV_LOG_ERROR, "Incorrect intra prediction code\n");
                 return -1;
             }
@@ -254,15 +262,22 @@ static av_cold int rv30_decode_init(AVCodecContext *avctx)
     RV34DecContext *r = avctx->priv_data;
     int ret;
 
+    r->orig_width  = avctx->coded_width;
+    r->orig_height = avctx->coded_height;
+
+    if (avctx->extradata_size < 2) {
+        av_log(avctx, AV_LOG_ERROR, "Extradata is too small.\n");
+        return AVERROR(EINVAL);
+    }
     r->rv30 = 1;
     if ((ret = ff_rv34_decode_init(avctx)) < 0)
         return ret;
-    if(avctx->extradata_size < 2){
-        av_log(avctx, AV_LOG_ERROR, "Extradata is too small.\n");
-        return -1;
+
+    r->max_rpr = avctx->extradata[1] & 7;
+    if(avctx->extradata_size < 2*r->max_rpr + 8){
+        av_log(avctx, AV_LOG_WARNING, "Insufficient extradata - need at least %d bytes, got %d\n",
+               2*r->max_rpr + 8, avctx->extradata_size);
     }
-    r->rpr = (avctx->extradata[1] & 7) >> 1;
-    r->rpr = FFMIN(r->rpr + 1, 3);
 
     r->parse_slice_header = rv30_parse_slice_header;
     r->decode_intra_types = rv30_decode_intra_types;
diff --git a/libavcodec/rv30data.h b/libavcodec/rv30data.h
index 079204d..5c4cb97 100644
--- a/libavcodec/rv30data.h
+++ b/libavcodec/rv30data.h
@@ -2,20 +2,20 @@
  * RealVideo 3 decoder
  * copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv30dsp.c b/libavcodec/rv30dsp.c
index 50f4186..8b205e0 100644
--- a/libavcodec/rv30dsp.c
+++ b/libavcodec/rv30dsp.c
@@ -2,20 +2,20 @@
  * RV30 decoder motion compensation functions
  * Copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv34.c b/libavcodec/rv34.c
index 5fa71d8..18f1de7 100644
--- a/libavcodec/rv34.c
+++ b/libavcodec/rv34.c
@@ -2,20 +2,20 @@
  * RV30/40 decoder common data
  * Copyright (c) 2007 Mike Melanson, Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@
  * RV30/40 decoder common data
  */
 
+#include "libavutil/imgutils.h"
 #include "libavutil/internal.h"
 
 #include "avcodec.h"
@@ -510,7 +511,7 @@ static void rv34_pred_mv(RV34DecContext *r, int block_type, int subblock_no, int
     }
 }
 
-#define GET_PTS_DIFF(a, b) ((a - b + 8192) & 0x1FFF)
+#define GET_PTS_DIFF(a, b) (((a) - (b) + 8192) & 0x1FFF)
 
 /**
  * Calculate motion vector component that should be added for direct blocks.
@@ -672,6 +673,7 @@ static inline void rv34_mc(RV34DecContext *r, const int block_type,
     int dxy, mx, my, umx, umy, lx, ly, uvmx, uvmy, src_x, src_y, uvsrc_x, uvsrc_y;
     int mv_pos = s->mb_x * 2 + s->mb_y * 2 * s->b8_stride + mv_off;
     int is16x16 = 1;
+    int emu = 0;
 
     if(thirdpel){
         int chroma_mx, chroma_my;
@@ -723,24 +725,14 @@ static inline void rv34_mc(RV34DecContext *r, const int block_type,
     if(s->h_edge_pos - (width << 3) < 6 || s->v_edge_pos - (height << 3) < 6 ||
        (unsigned)(src_x - !!lx*2) > s->h_edge_pos - !!lx*2 - (width <<3) - 4 ||
        (unsigned)(src_y - !!ly*2) > s->v_edge_pos - !!ly*2 - (height<<3) - 4) {
-        uint8_t *uvbuf = s->sc.edge_emu_buffer + 22 * s->linesize;
-
         srcY -= 2 + 2*s->linesize;
         s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, srcY,
                                  s->linesize, s->linesize,
                                  (width << 3) + 6, (height << 3) + 6,
-                            src_x - 2, src_y - 2, s->h_edge_pos, s->v_edge_pos);
+                                 src_x - 2, src_y - 2,
+                                 s->h_edge_pos, s->v_edge_pos);
         srcY = s->sc.edge_emu_buffer + 2 + 2*s->linesize;
-        s->vdsp.emulated_edge_mc(uvbuf, srcU,
-                                 s->uvlinesize,s->uvlinesize,
-                                 (width << 2) + 1, (height << 2) + 1,
-                            uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, s->v_edge_pos >> 1);
-        s->vdsp.emulated_edge_mc(uvbuf + 16, srcV,
-                                 s->uvlinesize, s->uvlinesize,
-                                 (width << 2) + 1, (height << 2) + 1,
-                            uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, s->v_edge_pos >> 1);
-        srcU = uvbuf;
-        srcV = uvbuf + 16;
+        emu = 1;
     }
     if(!weighted){
         Y = s->dest[0] + xoff      + yoff     *s->linesize;
@@ -763,6 +755,24 @@ static inline void rv34_mc(RV34DecContext *r, const int block_type,
     }
     is16x16 = (block_type != RV34_MB_P_8x8) && (block_type != RV34_MB_P_16x8) && (block_type != RV34_MB_P_8x16);
     qpel_mc[!is16x16][dxy](Y, srcY, s->linesize);
+    if (emu) {
+        uint8_t *uvbuf = s->sc.edge_emu_buffer;
+
+        s->vdsp.emulated_edge_mc(uvbuf, srcU,
+                                 s->uvlinesize, s->uvlinesize,
+                                 (width << 2) + 1, (height << 2) + 1,
+                                 uvsrc_x, uvsrc_y,
+                                 s->h_edge_pos >> 1, s->v_edge_pos >> 1);
+        srcU = uvbuf;
+        uvbuf += 9*s->uvlinesize;
+
+        s->vdsp.emulated_edge_mc(uvbuf, srcV,
+                                 s->uvlinesize, s->uvlinesize,
+                                 (width << 2) + 1, (height << 2) + 1,
+                                 uvsrc_x, uvsrc_y,
+                                 s->h_edge_pos >> 1, s->v_edge_pos >> 1);
+        srcV = uvbuf;
+    }
     chroma_mc[2-width]   (U, srcU, s->uvlinesize, height*4, uvmx, uvmy);
     chroma_mc[2-width]   (V, srcV, s->uvlinesize, height*4, uvmx, uvmy);
 }
@@ -1339,7 +1349,7 @@ static int check_slice_end(RV34DecContext *r, MpegEncContext *s)
     if(r->s.mb_skip_run > 1)
         return 0;
     bits = get_bits_left(&s->gb);
-    if(bits < 0 || (bits < 8 && !show_bits(&s->gb, bits)))
+    if(bits <= 0 || (bits < 8 && !show_bits(&s->gb, bits)))
         return 1;
     return 0;
 }
@@ -1361,11 +1371,11 @@ static int rv34_decoder_alloc(RV34DecContext *r)
 {
     r->intra_types_stride = r->s.mb_width * 4 + 4;
 
-    r->cbp_chroma       = av_malloc(r->s.mb_stride * r->s.mb_height *
+    r->cbp_chroma       = av_mallocz(r->s.mb_stride * r->s.mb_height *
                                     sizeof(*r->cbp_chroma));
-    r->cbp_luma         = av_malloc(r->s.mb_stride * r->s.mb_height *
+    r->cbp_luma         = av_mallocz(r->s.mb_stride * r->s.mb_height *
                                     sizeof(*r->cbp_luma));
-    r->deblock_coefs    = av_malloc(r->s.mb_stride * r->s.mb_height *
+    r->deblock_coefs    = av_mallocz(r->s.mb_stride * r->s.mb_height *
                                     sizeof(*r->deblock_coefs));
     r->intra_types_hist = av_malloc(r->intra_types_stride * 4 * 2 *
                                     sizeof(*r->intra_types_hist));
@@ -1410,6 +1420,10 @@ static int rv34_decode_slice(RV34DecContext *r, int end, const uint8_t* buf, int
         av_log(s->avctx, AV_LOG_ERROR, "Slice type mismatch\n");
         return AVERROR_INVALIDDATA;
     }
+    if (s->width != r->si.width || s->height != r->si.height) {
+        av_log(s->avctx, AV_LOG_ERROR, "Size mismatch\n");
+        return AVERROR_INVALIDDATA;
+    }
 
     r->si.end = end;
     s->qscale = r->si.quant;
@@ -1476,14 +1490,9 @@ av_cold int ff_rv34_decode_init(AVCodecContext *avctx)
     int ret;
 
     ff_mpv_decode_defaults(s);
-    s->avctx      = avctx;
+    ff_mpv_decode_init(s, avctx);
     s->out_format = FMT_H263;
-    s->codec_id   = avctx->codec_id;
-
-    s->width  = avctx->width;
-    s->height = avctx->height;
 
-    r->s.avctx = avctx;
     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
     avctx->has_b_frames = 1;
     s->low_delay = 0;
@@ -1525,7 +1534,14 @@ int ff_rv34_decode_init_thread_copy(AVCodecContext *avctx)
 
     if (avctx->internal->is_copy) {
         r->tmp_b_block_base = NULL;
+        r->cbp_chroma       = NULL;
+        r->cbp_luma         = NULL;
+        r->deblock_coefs    = NULL;
+        r->intra_types_hist = NULL;
+        r->mb_type          = NULL;
+
         ff_mpv_idct_init(&r->s);
+
         if ((err = ff_mpv_common_init(&r->s)) < 0)
             return err;
         if ((err = rv34_decoder_alloc(r)) < 0) {
@@ -1591,18 +1607,30 @@ static int finish_frame(AVCodecContext *avctx, AVFrame *pict)
     if (s->pict_type == AV_PICTURE_TYPE_B || s->low_delay) {
         if ((ret = av_frame_ref(pict, s->current_picture_ptr->f)) < 0)
             return ret;
-        ff_print_debug_info(s, s->current_picture_ptr);
+        ff_print_debug_info(s, s->current_picture_ptr, pict);
+        ff_mpv_export_qp_table(s, pict, s->current_picture_ptr, FF_QSCALE_TYPE_MPEG1);
         got_picture = 1;
     } else if (s->last_picture_ptr) {
         if ((ret = av_frame_ref(pict, s->last_picture_ptr->f)) < 0)
             return ret;
-        ff_print_debug_info(s, s->last_picture_ptr);
+        ff_print_debug_info(s, s->last_picture_ptr, pict);
+        ff_mpv_export_qp_table(s, pict, s->last_picture_ptr, FF_QSCALE_TYPE_MPEG1);
         got_picture = 1;
     }
 
     return got_picture;
 }
 
+static AVRational update_sar(int old_w, int old_h, AVRational sar, int new_w, int new_h)
+{
+    // Attempt to keep the displayed aspect during typical resolution switches: a SAR with a zero numerator is replaced by 1:1, then the SAR is scaled by (new_h * old_w) / (new_w * old_h) so that sar * width / height is unchanged.
+    if (!sar.num)
+        sar = (AVRational){1, 1};
+
+    sar = av_mul_q(sar, (AVRational){new_h * old_w, new_w * old_h});
+    return sar;
+}
+
 int ff_rv34_decode_frame(AVCodecContext *avctx,
                             void *data, int *got_picture_ptr,
                             AVPacket *avpkt)
@@ -1617,6 +1645,7 @@ int ff_rv34_decode_frame(AVCodecContext *avctx,
     int slice_count;
     const uint8_t *slices_hdr = NULL;
     int last = 0;
+    int faulty_b = 0;
 
     /* no supplementary picture */
     if (buf_size == 0) {
@@ -1654,7 +1683,7 @@ int ff_rv34_decode_frame(AVCodecContext *avctx,
         si.type == AV_PICTURE_TYPE_B) {
         av_log(avctx, AV_LOG_ERROR, "Invalid decoder state: B-frame without "
                "reference data.\n");
-        return AVERROR_INVALIDDATA;
+        faulty_b = 1;
     }
     if(   (avctx->skip_frame >= AVDISCARD_NONREF && si.type==AV_PICTURE_TYPE_B)
        || (avctx->skip_frame >= AVDISCARD_NONKEY && si.type!=AV_PICTURE_TYPE_I)
@@ -1663,8 +1692,8 @@ int ff_rv34_decode_frame(AVCodecContext *avctx,
 
     /* first slice */
     if (si.start == 0) {
-        if (s->mb_num_left > 0) {
-            av_log(avctx, AV_LOG_ERROR, "New frame but still %d MB left.",
+        if (s->mb_num_left > 0 && s->current_picture_ptr) {
+            av_log(avctx, AV_LOG_ERROR, "New frame but still %d MB left.\n",
                    s->mb_num_left);
             ff_er_frame_end(&s->er);
             ff_mpv_frame_end(s);
@@ -1676,6 +1705,12 @@ int ff_rv34_decode_frame(AVCodecContext *avctx,
             av_log(s->avctx, AV_LOG_WARNING, "Changing dimensions to %dx%d\n",
                    si.width, si.height);
 
+            if (av_image_check_size(si.width, si.height, 0, s->avctx))
+                return AVERROR_INVALIDDATA;
+
+            s->avctx->sample_aspect_ratio = update_sar(
+                s->width, s->height, s->avctx->sample_aspect_ratio,
+                si.width, si.height);
             s->width  = si.width;
             s->height = si.height;
 
@@ -1738,6 +1773,8 @@ int ff_rv34_decode_frame(AVCodecContext *avctx,
                "multithreading mode (start MB is %d).\n", si.start);
         return AVERROR_INVALIDDATA;
     }
+    if (faulty_b)
+        return AVERROR_INVALIDDATA;
 
     for(i = 0; i < slice_count; i++){
         int offset = get_slice_offset(avctx, slices_hdr, i);
diff --git a/libavcodec/rv34.h b/libavcodec/rv34.h
index 0ac24bf..efff94a 100644
--- a/libavcodec/rv34.h
+++ b/libavcodec/rv34.h
@@ -2,20 +2,20 @@
  * RV30/40 decoder common data declarations
  * Copyright (c) 2007 Mike Melanson, Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -102,13 +102,15 @@ typedef struct RV34DecContext{
     int dmv[4][2];           ///< differential motion vectors for the current macroblock
 
     int rv30;                ///< indicates which RV variant is currently decoded
-    int rpr;                 ///< one field size in RV30 slice header
+    int max_rpr;
 
     int cur_pts, last_pts, next_pts;
     int scaled_weight;
     int weight1, weight2;    ///< B-frame distance fractions (0.14) used in motion compensation
     int mv_weight1, mv_weight2;
 
+    int orig_width, orig_height;
+
     uint16_t *cbp_luma;      ///< CBP values for luma subblocks
     uint8_t  *cbp_chroma;    ///< CBP values for chroma subblocks
     uint16_t *deblock_coefs; ///< deblock coefficients for each macroblock
diff --git a/libavcodec/rv34_parser.c b/libavcodec/rv34_parser.c
index ec6d3a5..765d390 100644
--- a/libavcodec/rv34_parser.c
+++ b/libavcodec/rv34_parser.c
@@ -2,20 +2,20 @@
  * RV30/40 parser
  * Copyright (c) 2011 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv34data.h b/libavcodec/rv34data.h
index 3064124..4b2701f 100644
--- a/libavcodec/rv34data.h
+++ b/libavcodec/rv34data.h
@@ -2,20 +2,20 @@
  * RealVideo 4 decoder
  * copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv34dsp.c b/libavcodec/rv34dsp.c
index 7234ee8..c3f245e 100644
--- a/libavcodec/rv34dsp.c
+++ b/libavcodec/rv34dsp.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Mike Melanson, Konstantin Shishkov
  * Copyright (c) 2011 Janne Grunau
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv34dsp.h b/libavcodec/rv34dsp.h
index 1aa80cf..2e9ec4e 100644
--- a/libavcodec/rv34dsp.h
+++ b/libavcodec/rv34dsp.h
@@ -2,20 +2,20 @@
  * RV30/40 decoder motion compensation functions
  * Copyright (c) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv34vlc.h b/libavcodec/rv34vlc.h
index f4670c1..aa29357 100644
--- a/libavcodec/rv34vlc.h
+++ b/libavcodec/rv34vlc.h
@@ -2,20 +2,20 @@
  * RealVideo 3/4 decoder
  * Copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv40.c b/libavcodec/rv40.c
index e6c77e8..3ff1554 100644
--- a/libavcodec/rv40.c
+++ b/libavcodec/rv40.c
@@ -2,20 +2,20 @@
  * RV40 decoder
  * Copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -230,8 +230,11 @@ static int rv40_decode_mb_info(RV34DecContext *r)
     int prev_type = 0;
     int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
 
-    if(!r->s.mb_skip_run)
+    if(!r->s.mb_skip_run) {
         r->s.mb_skip_run = svq3_get_ue_golomb(gb) + 1;
+        if(r->s.mb_skip_run > (unsigned)s->mb_num)
+            return -1;
+    }
 
     if(--r->s.mb_skip_run)
          return RV34_MB_SKIP;
@@ -358,7 +361,7 @@ static void rv40_loop_filter(RV34DecContext *r, int row)
     int uvcbp[4][2];
     /**
      * This mask represents the pattern of luma subblocks that should be filtered
-     * in addition to the coded ones because because they lie at the edge of
+     * in addition to the coded ones because they lie at the edge of
      * 8x8 block with different enough motion vectors
      */
     unsigned mvmasks[4];
diff --git a/libavcodec/rv40data.h b/libavcodec/rv40data.h
index 42328af..36f9f91 100644
--- a/libavcodec/rv40data.h
+++ b/libavcodec/rv40data.h
@@ -2,20 +2,20 @@
  * RealVideo 4 decoder
  * copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/rv40dsp.c b/libavcodec/rv40dsp.c
index da3efb4..19b0e93 100644
--- a/libavcodec/rv40dsp.c
+++ b/libavcodec/rv40dsp.c
@@ -2,20 +2,20 @@
  * RV40 decoder motion compensation functions
  * Copyright (c) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,7 @@
 #include "pixels.h"
 #include "rnd_avg.h"
 #include "rv34dsp.h"
+#include "libavutil/avassert.h"
 
 #define RV40_LOWPASS(OPNAME, OP) \
 static void OPNAME ## rv40_qpel8_h_lowpass(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride,\
@@ -299,7 +300,7 @@ static void OPNAME ## rv40_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*a
     int i;\
     int bias = rv40_bias[y>>1][x>>1];\
     \
-    assert(x<8 && y<8 && x>=0 && y>=0);\
+    av_assert2(x<8 && y<8 && x>=0 && y>=0);\
 \
     if(D){\
         for(i = 0; i < h; i++){\
@@ -332,7 +333,7 @@ static void OPNAME ## rv40_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*a
     int i;\
     int bias = rv40_bias[y>>1][x>>1];\
     \
-    assert(x<8 && y<8 && x>=0 && y>=0);\
+    av_assert2(x<8 && y<8 && x>=0 && y>=0);\
 \
     if(D){\
         for(i = 0; i < h; i++){\
diff --git a/libavcodec/rv40vlc2.h b/libavcodec/rv40vlc2.h
index 2f63fc2..15119a1 100644
--- a/libavcodec/rv40vlc2.h
+++ b/libavcodec/rv40vlc2.h
@@ -2,20 +2,20 @@
  * RealVideo 4 decoder
  * copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/s302m.c b/libavcodec/s302m.c
index 635f697..ccfb591 100644
--- a/libavcodec/s302m.c
+++ b/libavcodec/s302m.c
@@ -3,24 +3,25 @@
  * Copyright (c) 2008 Laurent Aimar <fenrir@videolan.org>
  * Copyright (c) 2009 Baptiste Coudurier <baptiste.coudurier@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/intreadwrite.h"
+#include "libavutil/opt.h"
 #include "libavutil/log.h"
 #include "avcodec.h"
 #include "internal.h"
@@ -28,6 +29,11 @@
 
 #define AES3_HEADER_LEN 4
 
+typedef struct S302Context {
+    AVClass *class;   // NOTE(review): presumably the AVOptions class pointer (libavutil/opt.h is newly included above) — confirm
+    int non_pcm_mode; // NOTE(review): looks like a setting for how non-PCM payloads are handled — confirm against the option table
+} S302Context;
+
 static int s302m_parse_frame_header(AVCodecContext *avctx, const uint8_t *buf,
                                     int buf_size)
 {
@@ -59,18 +65,26 @@ static int s302m_parse_frame_header(AVCodecContext *avctx, const uint8_t *buf,
     }
 
     /* Set output properties */
-    avctx->bits_per_coded_sample = bits;
+    avctx->bits_per_raw_sample = bits;
     if (bits > 16)
         avctx->sample_fmt = AV_SAMPLE_FMT_S32;
     else
         avctx->sample_fmt = AV_SAMPLE_FMT_S16;
 
     avctx->channels    = channels;
-    avctx->sample_rate = 48000;
-    avctx->bit_rate    = 48000 * avctx->channels * (avctx->bits_per_coded_sample + 4) +
-                         32 * (48000 / (buf_size * 8 /
-                                        (avctx->channels *
-                                         (avctx->bits_per_coded_sample + 4))));
+    switch(channels) {
+        case 2:
+            avctx->channel_layout = AV_CH_LAYOUT_STEREO;
+            break;
+        case 4:
+            avctx->channel_layout = AV_CH_LAYOUT_QUAD;
+            break;
+        case 6:
+            avctx->channel_layout = AV_CH_LAYOUT_5POINT1_BACK;
+            break;
+        case 8:
+            avctx->channel_layout = AV_CH_LAYOUT_5POINT1_BACK | AV_CH_LAYOUT_STEREO_DOWNMIX;
+    }
 
     return frame_size;
 }
@@ -78,10 +92,13 @@ static int s302m_parse_frame_header(AVCodecContext *avctx, const uint8_t *buf,
 static int s302m_decode_frame(AVCodecContext *avctx, void *data,
                               int *got_frame_ptr, AVPacket *avpkt)
 {
+    S302Context *s = avctx->priv_data;
     AVFrame *frame     = data;
     const uint8_t *buf = avpkt->data;
     int buf_size       = avpkt->size;
     int block_size, ret;
+    int i;
+    int non_pcm_data_type = -1;
 
     int frame_size = s302m_parse_frame_header(avctx, buf, buf_size);
     if (frame_size < 0)
@@ -91,16 +108,16 @@ static int s302m_decode_frame(AVCodecContext *avctx, void *data,
     buf      += AES3_HEADER_LEN;
 
     /* get output buffer */
-    block_size = (avctx->bits_per_coded_sample + 4) / 4;
+    block_size = (avctx->bits_per_raw_sample + 4) / 4;
     frame->nb_samples = 2 * (buf_size / block_size) / avctx->channels;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
+    avctx->bit_rate = 48000 * avctx->channels * (avctx->bits_per_raw_sample + 4) +
+                      32 * 48000 / frame->nb_samples;
     buf_size = (frame->nb_samples * avctx->channels / 2) * block_size;
 
-    if (avctx->bits_per_coded_sample == 24) {
+    if (avctx->bits_per_raw_sample == 24) {
         uint32_t *o = (uint32_t *)frame->data[0];
         for (; buf_size > 6; buf_size -= 7) {
             *o++ = (ff_reverse[buf[2]]        << 24) |
@@ -112,7 +129,17 @@ static int s302m_decode_frame(AVCodecContext *avctx, void *data,
                    (ff_reverse[buf[3] & 0x0f] <<  4);
             buf += 7;
         }
-    } else if (avctx->bits_per_coded_sample == 20) {
+        o = (uint32_t *)frame->data[0];
+        if (avctx->channels == 2)
+            for (i=0; i<frame->nb_samples * 2 - 6; i+=2) {
+                if (o[i] || o[i+1] || o[i+2] || o[i+3])
+                    break;
+                if (o[i+4] == 0x96F87200U && o[i+5] == 0xA54E1F00) {
+                    non_pcm_data_type = (o[i+6] >> 16) & 0x1F;
+                    break;
+                }
+            }
+    } else if (avctx->bits_per_raw_sample == 20) {
         uint32_t *o = (uint32_t *)frame->data[0];
         for (; buf_size > 5; buf_size -= 6) {
             *o++ = (ff_reverse[buf[2] & 0xf0] << 28) |
@@ -123,6 +150,16 @@ static int s302m_decode_frame(AVCodecContext *avctx, void *data,
                    (ff_reverse[buf[3]]        << 12);
             buf += 6;
         }
+        o = (uint32_t *)frame->data[0];
+        if (avctx->channels == 2)
+            for (i=0; i<frame->nb_samples * 2 - 6; i+=2) {
+                if (o[i] || o[i+1] || o[i+2] || o[i+3])
+                    break;
+                if (o[i+4] == 0x6F872000U && o[i+5] == 0x54E1F000) {
+                    non_pcm_data_type = (o[i+6] >> 16) & 0x1F;
+                    break;
+                }
+            }
     } else {
         uint16_t *o = (uint16_t *)frame->data[0];
         for (; buf_size > 4; buf_size -= 5) {
@@ -133,18 +170,61 @@ static int s302m_decode_frame(AVCodecContext *avctx, void *data,
                    (ff_reverse[buf[2]]        >>  4);
             buf += 5;
         }
+        o = (uint16_t *)frame->data[0];
+        if (avctx->channels == 2)
+            for (i=0; i<frame->nb_samples * 2 - 6; i+=2) {
+                if (o[i] || o[i+1] || o[i+2] || o[i+3])
+                    break;
+                if (o[i+4] == 0xF872U && o[i+5] == 0x4E1F) {
+                    non_pcm_data_type = (o[i+6] & 0x1F);
+                    break;
+                }
+            }
+    }
+
+    if (non_pcm_data_type != -1) {
+        if (s->non_pcm_mode == 3) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "S302 non PCM mode with data type %d not supported\n",
+                   non_pcm_data_type);
+            return AVERROR_PATCHWELCOME;
+        }
+        if (s->non_pcm_mode & 1) {
+            return avpkt->size;
+        }
     }
 
+    avctx->sample_rate = 48000;
+
     *got_frame_ptr = 1;
 
     return avpkt->size;
 }
 
+#define FLAGS AV_OPT_FLAG_AUDIO_PARAM|AV_OPT_FLAG_DECODING_PARAM
+static const AVOption s302m_options[] = { /* private options of the S302M audio decoder */
+    {"non_pcm_mode", "Chooses what to do with NON-PCM", offsetof(S302Context, non_pcm_mode), AV_OPT_TYPE_INT, {.i64 = 3}, 0, 3, FLAGS, "non_pcm_mode"},
+    {"copy"        , "Pass NON-PCM through unchanged"     , 0, AV_OPT_TYPE_CONST, {.i64 = 0}, 0, 3, FLAGS, "non_pcm_mode"},
+    {"drop"        , "Drop NON-PCM"                       , 0, AV_OPT_TYPE_CONST, {.i64 = 1}, 0, 3, FLAGS, "non_pcm_mode"},
+    {"decode_copy" , "Decode if possible else passthrough", 0, AV_OPT_TYPE_CONST, {.i64 = 2}, 0, 3, FLAGS, "non_pcm_mode"},
+    {"decode_drop" , "Decode if possible else drop"       , 0, AV_OPT_TYPE_CONST, {.i64 = 3}, 0, 3, FLAGS, "non_pcm_mode"},
+    {NULL} /* end of table */
+};
+
+static const AVClass s302m_class = { /* AVClass publishing s302m_options */
+    .class_name = "SMPTE 302M Decoder",
+    .item_name  = av_default_item_name,
+    .option     = s302m_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_s302m_decoder = {
     .name           = "s302m",
     .long_name      = NULL_IF_CONFIG_SMALL("SMPTE 302M"),
     .type           = AVMEDIA_TYPE_AUDIO,
     .id             = AV_CODEC_ID_S302M,
+    .priv_data_size = sizeof(S302Context),
     .decode         = s302m_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .priv_class     = &s302m_class,
 };
diff --git a/libavcodec/s302menc.c b/libavcodec/s302menc.c
new file mode 100644
index 0000000..b04a54e
--- /dev/null
+++ b/libavcodec/s302menc.c
@@ -0,0 +1,188 @@
+/*
+ * SMPTE 302M encoder
+ * Copyright (c) 2010 Google, Inc.
+ * Copyright (c) 2013 Darryl Wallace <wallacdj@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+#include "mathops.h"
+#include "put_bits.h"
+
+#define AES3_HEADER_LEN 4
+
+typedef struct S302MEncContext {
+    uint8_t framing_index; /* per-sample counter that wraps at 192; the vucf flag bit is written for samples where it is 0 */
+} S302MEncContext;
+
+/* Validate the channel count and settle bits_per_raw_sample, frame_size and bit_rate. */
+static av_cold int s302m_encode_init(AVCodecContext *avctx)
+{
+    S302MEncContext *enc = avctx->priv_data;
+    const int nb_ch      = avctx->channels;
+
+    /* only an even channel count of at most 8 is accepted */
+    if ((nb_ch & 1) || nb_ch > 8) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Encoding %d channel(s) is not allowed. Only 2, 4, 6 and 8 channels are supported.\n",
+               nb_ch);
+        return AVERROR(EINVAL);
+    }
+
+    /* pick 16, 20 or 24 coded bits depending on the sample format */
+    if (avctx->sample_fmt == AV_SAMPLE_FMT_S16) {
+        avctx->bits_per_raw_sample = 16;
+    } else if (avctx->sample_fmt == AV_SAMPLE_FMT_S32) {
+        const int requested = avctx->bits_per_raw_sample;
+
+        if (requested > 24)
+            av_log(avctx, AV_LOG_WARNING, "encoding as 24 bits-per-sample\n");
+        /* unset (0) or more than 20 bits -> 24 bits, anything else -> 20 bits */
+        avctx->bits_per_raw_sample = (!requested || requested > 20) ? 24 : 20;
+    }
+
+    avctx->frame_size = 0;
+    /* every sample takes bits_per_raw_sample + 4 bits, at 48000 samples per second */
+    avctx->bit_rate   = 48000 * nb_ch *
+                       (avctx->bits_per_raw_sample + 4);
+    enc->framing_index = 0;
+
+    return 0;
+}
+
+static int s302m_encode2_frame(AVCodecContext *avctx, AVPacket *avpkt,
+                               const AVFrame *frame, int *got_packet_ptr)
+{
+    S302MEncContext *s = avctx->priv_data;
+    const int buf_size = AES3_HEADER_LEN +
+                        (frame->nb_samples *
+                         avctx->channels *
+                        (avctx->bits_per_raw_sample + 4)) / 8;
+    int ret, c, channels;
+    uint8_t *o;
+    PutBitContext pb;
+    /* Build one packet: AES3_HEADER_LEN header bytes, then the samples packed per channel pair. */
+    if (buf_size - AES3_HEADER_LEN > UINT16_MAX) { /* the size field written below is only 16 bits wide */
+        av_log(avctx, AV_LOG_ERROR, "number of samples in frame too big\n");
+        return AVERROR(EINVAL);
+    }
+
+    if ((ret = ff_alloc_packet2(avctx, avpkt, buf_size, 0)) < 0)
+        return ret;
+
+    o = avpkt->data;
+    init_put_bits(&pb, o, buf_size);
+    put_bits(&pb, 16, buf_size - AES3_HEADER_LEN);  // payload size in bytes (header excluded)
+    put_bits(&pb, 2, (avctx->channels - 2) >> 1);   // number of channels
+    put_bits(&pb, 8, 0);                            // channel ID
+    put_bits(&pb, 2, (avctx->bits_per_raw_sample - 16) / 4); // bits per samples (0 = 16bit, 1 = 20bit, 2 = 24bit)
+    put_bits(&pb, 4, 0);                            // alignments
+    flush_put_bits(&pb);
+    o += AES3_HEADER_LEN; // sample data is written right after the header
+
+    if (avctx->bits_per_raw_sample == 24) {
+        const uint32_t *samples = (uint32_t *)frame->data[0];
+
+        for (c = 0; c < frame->nb_samples; c++) {
+            uint8_t vucf = s->framing_index == 0 ? 0x10: 0; /* flag is nonzero only when framing_index is 0 */
+
+            for (channels = 0; channels < avctx->channels; channels += 2) { /* 7 output bytes per channel pair */
+                o[0] = ff_reverse[(samples[0] & 0x0000FF00) >> 8];
+                o[1] = ff_reverse[(samples[0] & 0x00FF0000) >> 16];
+                o[2] = ff_reverse[(samples[0] & 0xFF000000) >> 24];
+                o[3] = ff_reverse[(samples[1] & 0x00000F00) >> 4] | vucf;
+                o[4] = ff_reverse[(samples[1] & 0x000FF000) >> 12];
+                o[5] = ff_reverse[(samples[1] & 0x0FF00000) >> 20];
+                o[6] = ff_reverse[(samples[1] & 0xF0000000) >> 28];
+                o += 7;
+                samples += 2;
+            }
+
+            s->framing_index++; /* counts samples modulo 192 across calls */
+            if (s->framing_index >= 192)
+                s->framing_index = 0;
+        }
+    } else if (avctx->bits_per_raw_sample == 20) {
+        const uint32_t *samples = (uint32_t *)frame->data[0];
+
+        for (c = 0; c < frame->nb_samples; c++) {
+            uint8_t vucf = s->framing_index == 0 ? 0x80: 0; /* here the flag is merged in before the ff_reverse lookup */
+
+            for (channels = 0; channels < avctx->channels; channels += 2) { /* 6 output bytes per channel pair */
+                o[0] = ff_reverse[ (samples[0] & 0x000FF000) >> 12];
+                o[1] = ff_reverse[ (samples[0] & 0x0FF00000) >> 20];
+                o[2] = ff_reverse[((samples[0] & 0xF0000000) >> 28) | vucf];
+                o[3] = ff_reverse[ (samples[1] & 0x000FF000) >> 12];
+                o[4] = ff_reverse[ (samples[1] & 0x0FF00000) >> 20];
+                o[5] = ff_reverse[ (samples[1] & 0xF0000000) >> 28];
+                o += 6;
+                samples += 2;
+            }
+
+            s->framing_index++;
+            if (s->framing_index >= 192)
+                s->framing_index = 0;
+        }
+    } else if (avctx->bits_per_raw_sample == 16) {
+        const uint16_t *samples = (uint16_t *)frame->data[0];
+
+        for (c = 0; c < frame->nb_samples; c++) {
+            uint8_t vucf = s->framing_index == 0 ? 0x10 : 0;
+
+            for (channels = 0; channels < avctx->channels; channels += 2) { /* 5 output bytes per channel pair */
+                o[0] = ff_reverse[ samples[0] & 0xFF];
+                o[1] = ff_reverse[(samples[0] & 0xFF00) >>  8];
+                o[2] = ff_reverse[(samples[1] & 0x0F)   <<  4] | vucf;
+                o[3] = ff_reverse[(samples[1] & 0x0FF0) >>  4];
+                o[4] = ff_reverse[(samples[1] & 0xF000) >> 12];
+                o += 5;
+                samples += 2;
+
+            }
+
+            s->framing_index++;
+            if (s->framing_index >= 192)
+                s->framing_index = 0;
+        }
+    }
+
+    *got_packet_ptr = 1; /* a packet is produced for every frame that reaches this point */
+
+    return 0;
+}
+
+AVCodec ff_s302m_encoder = {
+    .name                  = "s302m",
+    .long_name             = NULL_IF_CONFIG_SMALL("SMPTE 302M"),
+    .type                  = AVMEDIA_TYPE_AUDIO,
+    .id                    = AV_CODEC_ID_S302M,
+    .priv_data_size        = sizeof(S302MEncContext), /* holds the framing_index counter */
+    .init                  = s302m_encode_init,
+    .encode2               = s302m_encode2_frame,
+    .sample_fmts           = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S32,
+                                                            AV_SAMPLE_FMT_S16,
+                                                            AV_SAMPLE_FMT_NONE },
+    .capabilities          = AV_CODEC_CAP_VARIABLE_FRAME_SIZE | AV_CODEC_CAP_EXPERIMENTAL,
+    .supported_samplerates = (const int[]) { 48000, 0 }, /* 48000 Hz is the only listed rate */
+ /* .channel_layouts       = (const uint64_t[]) { AV_CH_LAYOUT_STEREO,
+                                                  AV_CH_LAYOUT_QUAD,
+                                                  AV_CH_LAYOUT_5POINT1_BACK,
+                                                  AV_CH_LAYOUT_5POINT1_BACK | AV_CH_LAYOUT_STEREO_DOWNMIX,
+                                                  0 }, NOTE(review): list left disabled; only the channel count is checked, in s302m_encode_init() */
+};
diff --git a/libavcodec/samidec.c b/libavcodec/samidec.c
new file mode 100644
index 0000000..16f3f58
--- /dev/null
+++ b/libavcodec/samidec.c
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SAMI subtitle decoder
+ * @see http://msdn.microsoft.com/en-us/library/ms971327.aspx
+ */
+
+#include "ass.h"
+#include "libavutil/avstring.h"
+#include "libavutil/bprint.h"
+#include "htmlsubtitles.h"
+
+typedef struct {
+    AVBPrint source;          /* text of the paragraph tagged ID=Source (speaker name) */
+    AVBPrint content;         /* text gathered from all other paragraphs */
+    AVBPrint encoded_source;  /* source after ff_htmlmarkup_to_ass() */
+    AVBPrint encoded_content; /* content after ff_htmlmarkup_to_ass() */
+    AVBPrint full;            /* complete event text handed to ff_ass_add_rect() */
+    int readorder;            /* incremented for every rect added; reset by sami_flush() */
+} SAMIContext;
+
+/* Convert the SAMI paragraphs found in src into ASS text stored in sami->full. Returns 0 on
+ * success, -1 for an empty ("&nbsp;") event, AVERROR(ENOMEM) if src cannot be duplicated. */
+static int sami_paragraph_to_ass(AVCodecContext *avctx, const char *src)
+{
+    SAMIContext *sami = avctx->priv_data;
+    int ret = 0;
+    char *tag = NULL;
+    char *dupsrc = av_strdup(src);
+    char *p = dupsrc;
+
+    if (!dupsrc) // the parsing below works on (and modifies) the copy, so it must exist
+        return AVERROR(ENOMEM);
+
+    av_bprint_clear(&sami->encoded_content);
+    av_bprint_clear(&sami->content);
+    av_bprint_clear(&sami->encoded_source);
+    for (;;) {
+        char *saveptr = NULL;
+        int prev_chr_is_space = 0;
+        AVBPrint *dst = &sami->content;
+
+        /* parse & extract paragraph tag */
+        p = av_stristr(p, "<P");
+        if (!p)
+            break;
+        if (p[2] != '>' && !av_isspace(p[2])) { // avoid confusion with tags such as <PRE>
+            p++;
+            continue;
+        }
+        if (dst->len) // add a separator with the previous paragraph if there was one
+            av_bprintf(dst, "\\N");
+        tag = av_strtok(p, ">", &saveptr);
+        if (!tag || !saveptr)
+            break;
+        p = saveptr;
+
+        /* check if the current paragraph is the "source" (speaker name) */
+        if (av_stristr(tag, "ID=Source") || av_stristr(tag, "ID=\"Source\"")) {
+            dst = &sami->source;
+            av_bprint_clear(dst);
+        }
+
+        /* if empty event -> skip subtitle */
+        while (av_isspace(*p))
+            p++;
+        if (!strncmp(p, "&nbsp;", 6)) {
+            ret = -1;
+            goto end;
+        }
+
+        /* copy the text up to the next <P> tag: <BR> becomes \N, whitespace runs collapse to one space */
+        while (*p) {
+            if (*p == '<' && !av_strncasecmp(p, "<P", 2) && (p[2] == '>' || av_isspace(p[2])))
+                break;
+            if (!av_strncasecmp(p, "<BR", 3)) {
+                av_bprintf(dst, "\\N");
+                p++;
+                while (*p && *p != '>')
+                    p++;
+                if (!*p)
+                    break;
+                p++; // *p is '>' here: step past the end of the <BR> tag
+                continue;
+            }
+            if (!av_isspace(*p))
+                av_bprint_chars(dst, *p, 1);
+            else if (!prev_chr_is_space)
+                av_bprint_chars(dst, ' ', 1);
+            prev_chr_is_space = av_isspace(*p);
+            p++;
+        }
+    }
+
+    av_bprint_clear(&sami->full);
+    if (sami->source.len) {
+        ff_htmlmarkup_to_ass(avctx, &sami->encoded_source, sami->source.str);
+        av_bprintf(&sami->full, "{\\i1}%s{\\i0}\\N", sami->encoded_source.str);
+    }
+    ff_htmlmarkup_to_ass(avctx, &sami->encoded_content, sami->content.str);
+    av_bprintf(&sami->full, "%s", sami->encoded_content.str);
+
+end:
+    av_free(dupsrc);
+    return ret;
+}
+
+static int sami_decode_frame(AVCodecContext *avctx,
+                             void *data, int *got_sub_ptr, AVPacket *avpkt)
+{
+    AVSubtitle *sub = data;
+    const char *ptr = avpkt->data;
+    SAMIContext *sami = avctx->priv_data;
+
+    if (ptr && avpkt->size > 0 && !sami_paragraph_to_ass(avctx, ptr)) { /* a nonzero conversion result skips the event */
+        // TODO: pass escaped sami->encoded_source.str as source
+        int ret = ff_ass_add_rect(sub, sami->full.str, sami->readorder++, 0, NULL, NULL);
+        if (ret < 0)
+            return ret;
+    }
+    *got_sub_ptr = sub->num_rects > 0; /* set only when sub holds at least one rect */
+    return avpkt->size; /* the whole packet is reported as consumed */
+}
+
+static av_cold int sami_init(AVCodecContext *avctx)
+{
+    SAMIContext *sami = avctx->priv_data;
+    av_bprint_init(&sami->source,  0, 2048); /* all five text buffers are set up with the same (0, 2048) arguments */
+    av_bprint_init(&sami->content, 0, 2048);
+    av_bprint_init(&sami->encoded_source,  0, 2048);
+    av_bprint_init(&sami->encoded_content, 0, 2048);
+    av_bprint_init(&sami->full,    0, 2048);
+    return ff_ass_subtitle_header_default(avctx); /* its result is the init status */
+}
+
+static av_cold int sami_close(AVCodecContext *avctx)
+{
+    SAMIContext *sami = avctx->priv_data;
+    av_bprint_finalize(&sami->source,  NULL); /* finalizes every buffer that sami_init() set up; NULL: no string is requested back */
+    av_bprint_finalize(&sami->content, NULL);
+    av_bprint_finalize(&sami->encoded_source,  NULL);
+    av_bprint_finalize(&sami->encoded_content, NULL);
+    av_bprint_finalize(&sami->full,    NULL);
+    return 0; /* always reports success */
+}
+
+static void sami_flush(AVCodecContext *avctx)
+{
+    SAMIContext *sami = avctx->priv_data;
+    if (!(avctx->flags2 & AV_CODEC_FLAG2_RO_FLUSH_NOOP)) /* the flag turns this flush into a no-op */
+        sami->readorder = 0; /* restart the counter passed to ff_ass_add_rect() */
+}
+
+AVCodec ff_sami_decoder = {
+    .name           = "sami",
+    .long_name      = NULL_IF_CONFIG_SMALL("SAMI subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_SAMI,
+    .priv_data_size = sizeof(SAMIContext),
+    .init           = sami_init,         /* sets up the text buffers and the ASS header */
+    .close          = sami_close,        /* finalizes the text buffers */
+    .decode         = sami_decode_frame, /* converts one packet into at most one ASS rect */
+    .flush          = sami_flush,        /* resets readorder unless AV_CODEC_FLAG2_RO_FLUSH_NOOP is set */
+};
diff --git a/libavcodec/sanm.c b/libavcodec/sanm.c
index 6436f84..1aa002b 100644
--- a/libavcodec/sanm.c
+++ b/libavcodec/sanm.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2006 Cyril Zorin
  * Copyright (c) 2011 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -105,108 +105,159 @@ static const int8_t motion_vectors[256][2] = {
 };
 
 static const int8_t c37_mv[] = {
-    0,   0,   1,   0,   2,   0,   3,   0,   5,   0,   8,   0,  13,   0,  21,
-    0,  -1,   0,  -2,   0,  -3,   0,  -5,   0,  -8,   0, -13,   0, -17,   0,
-  -21,   0,   0,   1,   1,   1,   2,   1,   3,   1,   5,   1,   8,   1,  13,
-    1,  21,   1,  -1,   1,  -2,   1,  -3,   1,  -5,   1,  -8,   1, -13,   1,
-  -17,   1, -21,   1,   0,   2,   1,   2,   2,   2,   3,   2,   5,   2,   8,
-    2,  13,   2,  21,   2,  -1,   2,  -2,   2,  -3,   2,  -5,   2,  -8,   2,
-  -13,   2, -17,   2, -21,   2,   0,   3,   1,   3,   2,   3,   3,   3,   5,
-    3,   8,   3,  13,   3,  21,   3,  -1,   3,  -2,   3,  -3,   3,  -5,   3,
-   -8,   3, -13,   3, -17,   3, -21,   3,   0,   5,   1,   5,   2,   5,   3,
-    5,   5,   5,   8,   5,  13,   5,  21,   5,  -1,   5,  -2,   5,  -3,   5,
-   -5,   5,  -8,   5, -13,   5, -17,   5, -21,   5,   0,   8,   1,   8,   2,
-    8,   3,   8,   5,   8,   8,   8,  13,   8,  21,   8,  -1,   8,  -2,   8,
-   -3,   8,  -5,   8,  -8,   8, -13,   8, -17,   8, -21,   8,   0,  13,   1,
-   13,   2,  13,   3,  13,   5,  13,   8,  13,  13,  13,  21,  13,  -1,  13,
-   -2,  13,  -3,  13,  -5,  13,  -8,  13, -13,  13, -17,  13, -21,  13,   0,
-   21,   1,  21,   2,  21,   3,  21,   5,  21,   8,  21,  13,  21,  21,  21,
-   -1,  21,  -2,  21,  -3,  21,  -5,  21,  -8,  21, -13,  21, -17,  21, -21,
-   21,   0,  -1,   1,  -1,   2,  -1,   3,  -1,   5,  -1,   8,  -1,  13,  -1,
-   21,  -1,  -1,  -1,  -2,  -1,  -3,  -1,  -5,  -1,  -8,  -1, -13,  -1, -17,
-   -1, -21,  -1,   0,  -2,   1,  -2,   2,  -2,   3,  -2,   5,  -2,   8,  -2,
-   13,  -2,  21,  -2,  -1,  -2,  -2,  -2,  -3,  -2,  -5,  -2,  -8,  -2, -13,
-   -2, -17,  -2, -21,  -2,   0,  -3,   1,  -3,   2,  -3,   3,  -3,   5,  -3,
-    8,  -3,  13,  -3,  21,  -3,  -1,  -3,  -2,  -3,  -3,  -3,  -5,  -3,  -8,
-   -3, -13,  -3, -17,  -3, -21,  -3,   0,  -5,   1,  -5,   2,  -5,   3,  -5,
-    5,  -5,   8,  -5,  13,  -5,  21,  -5,  -1,  -5,  -2,  -5,  -3,  -5,  -5,
-   -5,  -8,  -5, -13,  -5, -17,  -5, -21,  -5,   0,  -8,   1,  -8,   2,  -8,
-    3,  -8,   5,  -8,   8,  -8,  13,  -8,  21,  -8,  -1,  -8,  -2,  -8,  -3,
-   -8,  -5,  -8,  -8,  -8, -13,  -8, -17,  -8, -21,  -8,   0, -13,   1, -13,
-    2, -13,   3, -13,   5, -13,   8, -13,  13, -13,  21, -13,  -1, -13,  -2,
-  -13,  -3, -13,  -5, -13,  -8, -13, -13, -13, -17, -13, -21, -13,   0, -17,
-    1, -17,   2, -17,   3, -17,   5, -17,   8, -17,  13, -17,  21, -17,  -1,
-  -17,  -2, -17,  -3, -17,  -5, -17,  -8, -17, -13, -17, -17, -17, -21, -17,
-    0, -21,   1, -21,   2, -21,   3, -21,   5, -21,   8, -21,  13, -21,  21,
-  -21,  -1, -21,  -2, -21,  -3, -21,  -5, -21,  -8, -21, -13, -21, -17, -21,
-    0,   0,  -8, -29,   8, -29, -18, -25,  17, -25,   0, -23,  -6, -22,   6,
-  -22, -13, -19,  12, -19,   0, -18,  25, -18, -25, -17,  -5, -17,   5, -17,
-  -10, -15,  10, -15,   0, -14,  -4, -13,   4, -13,  19, -13, -19, -12,  -8,
-  -11,  -2, -11,   0, -11,   2, -11,   8, -11, -15, -10,  -4, -10,   4, -10,
-   15, -10,  -6,  -9,  -1,  -9,   1,  -9,   6,  -9, -29,  -8, -11,  -8,  -8,
-   -8,  -3,  -8,   3,  -8,   8,  -8,  11,  -8,  29,  -8,  -5,  -7,  -2,  -7,
-    0,  -7,   2,  -7,   5,  -7, -22,  -6,  -9,  -6,  -6,  -6,  -3,  -6,  -1,
-   -6,   1,  -6,   3,  -6,   6,  -6,   9,  -6,  22,  -6, -17,  -5,  -7,  -5,
-   -4,  -5,  -2,  -5,   0,  -5,   2,  -5,   4,  -5,   7,  -5,  17,  -5, -13,
-   -4, -10,  -4,  -5,  -4,  -3,  -4,  -1,  -4,   0,  -4,   1,  -4,   3,  -4,
-    5,  -4,  10,  -4,  13,  -4,  -8,  -3,  -6,  -3,  -4,  -3,  -3,  -3,  -2,
-   -3,  -1,  -3,   0,  -3,   1,  -3,   2,  -3,   4,  -3,   6,  -3,   8,  -3,
-  -11,  -2,  -7,  -2,  -5,  -2,  -3,  -2,  -2,  -2,  -1,  -2,   0,  -2,   1,
-   -2,   2,  -2,   3,  -2,   5,  -2,   7,  -2,  11,  -2,  -9,  -1,  -6,  -1,
-   -4,  -1,  -3,  -1,  -2,  -1,  -1,  -1,   0,  -1,   1,  -1,   2,  -1,   3,
-   -1,   4,  -1,   6,  -1,   9,  -1, -31,   0, -23,   0, -18,   0, -14,   0,
-  -11,   0,  -7,   0,  -5,   0,  -4,   0,  -3,   0,  -2,   0,  -1,   0,   0,
-  -31,   1,   0,   2,   0,   3,   0,   4,   0,   5,   0,   7,   0,  11,   0,
-   14,   0,  18,   0,  23,   0,  31,   0,  -9,   1,  -6,   1,  -4,   1,  -3,
-    1,  -2,   1,  -1,   1,   0,   1,   1,   1,   2,   1,   3,   1,   4,   1,
-    6,   1,   9,   1, -11,   2,  -7,   2,  -5,   2,  -3,   2,  -2,   2,  -1,
-    2,   0,   2,   1,   2,   2,   2,   3,   2,   5,   2,   7,   2,  11,   2,
-   -8,   3,  -6,   3,  -4,   3,  -2,   3,  -1,   3,   0,   3,   1,   3,   2,
-    3,   3,   3,   4,   3,   6,   3,   8,   3, -13,   4, -10,   4,  -5,   4,
-   -3,   4,  -1,   4,   0,   4,   1,   4,   3,   4,   5,   4,  10,   4,  13,
-    4, -17,   5,  -7,   5,  -4,   5,  -2,   5,   0,   5,   2,   5,   4,   5,
-    7,   5,  17,   5, -22,   6,  -9,   6,  -6,   6,  -3,   6,  -1,   6,   1,
-    6,   3,   6,   6,   6,   9,   6,  22,   6,  -5,   7,  -2,   7,   0,   7,
-    2,   7,   5,   7, -29,   8, -11,   8,  -8,   8,  -3,   8,   3,   8,   8,
-    8,  11,   8,  29,   8,  -6,   9,  -1,   9,   1,   9,   6,   9, -15,  10,
-   -4,  10,   4,  10,  15,  10,  -8,  11,  -2,  11,   0,  11,   2,  11,   8,
-   11,  19,  12, -19,  13,  -4,  13,   4,  13,   0,  14, -10,  15,  10,  15,
-   -5,  17,   5,  17,  25,  17, -25,  18,   0,  18, -12,  19,  13,  19,  -6,
-   22,   6,  22,   0,  23, -17,  25,  18,  25,  -8,  29,   8,  29,   0,  31,
-    0,   0,  -6, -22,   6, -22, -13, -19,  12, -19,   0, -18,  -5, -17,   5,
-  -17, -10, -15,  10, -15,   0, -14,  -4, -13,   4, -13,  19, -13, -19, -12,
-   -8, -11,  -2, -11,   0, -11,   2, -11,   8, -11, -15, -10,  -4, -10,   4,
-  -10,  15, -10,  -6,  -9,  -1,  -9,   1,  -9,   6,  -9, -11,  -8,  -8,  -8,
-   -3,  -8,   0,  -8,   3,  -8,   8,  -8,  11,  -8,  -5,  -7,  -2,  -7,   0,
-   -7,   2,  -7,   5,  -7, -22,  -6,  -9,  -6,  -6,  -6,  -3,  -6,  -1,  -6,
-    1,  -6,   3,  -6,   6,  -6,   9,  -6,  22,  -6, -17,  -5,  -7,  -5,  -4,
-   -5,  -2,  -5,  -1,  -5,   0,  -5,   1,  -5,   2,  -5,   4,  -5,   7,  -5,
-   17,  -5, -13,  -4, -10,  -4,  -5,  -4,  -3,  -4,  -2,  -4,  -1,  -4,   0,
-   -4,   1,  -4,   2,  -4,   3,  -4,   5,  -4,  10,  -4,  13,  -4,  -8,  -3,
-   -6,  -3,  -4,  -3,  -3,  -3,  -2,  -3,  -1,  -3,   0,  -3,   1,  -3,   2,
-   -3,   3,  -3,   4,  -3,   6,  -3,   8,  -3, -11,  -2,  -7,  -2,  -5,  -2,
-   -4,  -2,  -3,  -2,  -2,  -2,  -1,  -2,   0,  -2,   1,  -2,   2,  -2,   3,
-   -2,   4,  -2,   5,  -2,   7,  -2,  11,  -2,  -9,  -1,  -6,  -1,  -5,  -1,
-   -4,  -1,  -3,  -1,  -2,  -1,  -1,  -1,   0,  -1,   1,  -1,   2,  -1,   3,
-   -1,   4,  -1,   5,  -1,   6,  -1,   9,  -1, -23,   0, -18,   0, -14,   0,
-  -11,   0,  -7,   0,  -5,   0,  -4,   0,  -3,   0,  -2,   0,  -1,   0,   0,
-  -23,   1,   0,   2,   0,   3,   0,   4,   0,   5,   0,   7,   0,  11,   0,
-   14,   0,  18,   0,  23,   0,  -9,   1,  -6,   1,  -5,   1,  -4,   1,  -3,
-    1,  -2,   1,  -1,   1,   0,   1,   1,   1,   2,   1,   3,   1,   4,   1,
-    5,   1,   6,   1,   9,   1, -11,   2,  -7,   2,  -5,   2,  -4,   2,  -3,
-    2,  -2,   2,  -1,   2,   0,   2,   1,   2,   2,   2,   3,   2,   4,   2,
-    5,   2,   7,   2,  11,   2,  -8,   3,  -6,   3,  -4,   3,  -3,   3,  -2,
-    3,  -1,   3,   0,   3,   1,   3,   2,   3,   3,   3,   4,   3,   6,   3,
-    8,   3, -13,   4, -10,   4,  -5,   4,  -3,   4,  -2,   4,  -1,   4,   0,
-    4,   1,   4,   2,   4,   3,   4,   5,   4,  10,   4,  13,   4, -17,   5,
-   -7,   5,  -4,   5,  -2,   5,  -1,   5,   0,   5,   1,   5,   2,   5,   4,
-    5,   7,   5,  17,   5, -22,   6,  -9,   6,  -6,   6,  -3,   6,  -1,   6,
-    1,   6,   3,   6,   6,   6,   9,   6,  22,   6,  -5,   7,  -2,   7,   0,
-    7,   2,   7,   5,   7, -11,   8,  -8,   8,  -3,   8,   0,   8,   3,   8,
-    8,   8,  11,   8,  -6,   9,  -1,   9,   1,   9,   6,   9, -15,  10,  -4,
-   10,   4,  10,  15,  10,  -8,  11,  -2,  11,   0,  11,   2,  11,   8,  11,
-   19,  12, -19,  13,  -4,  13,   4,  13,   0,  14, -10,  15,  10,  15,  -5,
-   17,   5,  17,   0,  18, -12,  19,  13,  19,  -6,  22,   6,  22,   0,  23,
+    0,   0,   1,   0,   2,   0,   3,   0,   5,   0,
+    8,   0,  13,   0,  21,   0,  -1,   0,  -2,   0,
+   -3,   0,  -5,   0,  -8,   0, -13,   0, -17,   0,
+  -21,   0,   0,   1,   1,   1,   2,   1,   3,   1,
+    5,   1,   8,   1,  13,   1,  21,   1,  -1,   1,
+   -2,   1,  -3,   1,  -5,   1,  -8,   1, -13,   1,
+  -17,   1, -21,   1,   0,   2,   1,   2,   2,   2,
+    3,   2,   5,   2,   8,   2,  13,   2,  21,   2,
+   -1,   2,  -2,   2,  -3,   2,  -5,   2,  -8,   2,
+  -13,   2, -17,   2, -21,   2,   0,   3,   1,   3,
+    2,   3,   3,   3,   5,   3,   8,   3,  13,   3,
+   21,   3,  -1,   3,  -2,   3,  -3,   3,  -5,   3,
+   -8,   3, -13,   3, -17,   3, -21,   3,   0,   5,
+    1,   5,   2,   5,   3,   5,   5,   5,   8,   5,
+   13,   5,  21,   5,  -1,   5,  -2,   5,  -3,   5,
+   -5,   5,  -8,   5, -13,   5, -17,   5, -21,   5,
+    0,   8,   1,   8,   2,   8,   3,   8,   5,   8,
+    8,   8,  13,   8,  21,   8,  -1,   8,  -2,   8,
+   -3,   8,  -5,   8,  -8,   8, -13,   8, -17,   8,
+  -21,   8,   0,  13,   1,  13,   2,  13,   3,  13,
+    5,  13,   8,  13,  13,  13,  21,  13,  -1,  13,
+   -2,  13,  -3,  13,  -5,  13,  -8,  13, -13,  13,
+  -17,  13, -21,  13,   0,  21,   1,  21,   2,  21,
+    3,  21,   5,  21,   8,  21,  13,  21,  21,  21,
+   -1,  21,  -2,  21,  -3,  21,  -5,  21,  -8,  21,
+  -13,  21, -17,  21, -21,  21,   0,  -1,   1,  -1,
+    2,  -1,   3,  -1,   5,  -1,   8,  -1,  13,  -1,
+   21,  -1,  -1,  -1,  -2,  -1,  -3,  -1,  -5,  -1,
+   -8,  -1, -13,  -1, -17,  -1, -21,  -1,   0,  -2,
+    1,  -2,   2,  -2,   3,  -2,   5,  -2,   8,  -2,
+   13,  -2,  21,  -2,  -1,  -2,  -2,  -2,  -3,  -2,
+   -5,  -2,  -8,  -2, -13,  -2, -17,  -2, -21,  -2,
+    0,  -3,   1,  -3,   2,  -3,   3,  -3,   5,  -3,
+    8,  -3,  13,  -3,  21,  -3,  -1,  -3,  -2,  -3,
+   -3,  -3,  -5,  -3,  -8,  -3, -13,  -3, -17,  -3,
+  -21,  -3,   0,  -5,   1,  -5,   2,  -5,   3,  -5,
+    5,  -5,   8,  -5,  13,  -5,  21,  -5,  -1,  -5,
+   -2,  -5,  -3,  -5,  -5,  -5,  -8,  -5, -13,  -5,
+  -17,  -5, -21,  -5,   0,  -8,   1,  -8,   2,  -8,
+    3,  -8,   5,  -8,   8,  -8,  13,  -8,  21,  -8,
+   -1,  -8,  -2,  -8,  -3,  -8,  -5,  -8,  -8,  -8,
+  -13,  -8, -17,  -8, -21,  -8,   0, -13,   1, -13,
+    2, -13,   3, -13,   5, -13,   8, -13,  13, -13,
+   21, -13,  -1, -13,  -2, -13,  -3, -13,  -5, -13,
+   -8, -13, -13, -13, -17, -13, -21, -13,   0, -17,
+    1, -17,   2, -17,   3, -17,   5, -17,   8, -17,
+   13, -17,  21, -17,  -1, -17,  -2, -17,  -3, -17,
+   -5, -17,  -8, -17, -13, -17, -17, -17, -21, -17,
+    0, -21,   1, -21,   2, -21,   3, -21,   5, -21,
+    8, -21,  13, -21,  21, -21,  -1, -21,  -2, -21,
+   -3, -21,  -5, -21,  -8, -21, -13, -21, -17, -21,
+    0,   0,  -8, -29,   8, -29, -18, -25,  17, -25,
+    0, -23,  -6, -22,   6, -22, -13, -19,  12, -19,
+    0, -18,  25, -18, -25, -17,  -5, -17,   5, -17,
+  -10, -15,  10, -15,   0, -14,  -4, -13,   4, -13,
+   19, -13, -19, -12,  -8, -11,  -2, -11,   0, -11,
+    2, -11,   8, -11, -15, -10,  -4, -10,   4, -10,
+   15, -10,  -6,  -9,  -1,  -9,   1,  -9,   6,  -9,
+  -29,  -8, -11,  -8,  -8,  -8,  -3,  -8,   3,  -8,
+    8,  -8,  11,  -8,  29,  -8,  -5,  -7,  -2,  -7,
+    0,  -7,   2,  -7,   5,  -7, -22,  -6,  -9,  -6,
+   -6,  -6,  -3,  -6,  -1,  -6,   1,  -6,   3,  -6,
+    6,  -6,   9,  -6,  22,  -6, -17,  -5,  -7,  -5,
+   -4,  -5,  -2,  -5,   0,  -5,   2,  -5,   4,  -5,
+    7,  -5,  17,  -5, -13,  -4, -10,  -4,  -5,  -4,
+   -3,  -4,  -1,  -4,   0,  -4,   1,  -4,   3,  -4,
+    5,  -4,  10,  -4,  13,  -4,  -8,  -3,  -6,  -3,
+   -4,  -3,  -3,  -3,  -2,  -3,  -1,  -3,   0,  -3,
+    1,  -3,   2,  -3,   4,  -3,   6,  -3,   8,  -3,
+  -11,  -2,  -7,  -2,  -5,  -2,  -3,  -2,  -2,  -2,
+   -1,  -2,   0,  -2,   1,  -2,   2,  -2,   3,  -2,
+    5,  -2,   7,  -2,  11,  -2,  -9,  -1,  -6,  -1,
+   -4,  -1,  -3,  -1,  -2,  -1,  -1,  -1,   0,  -1,
+    1,  -1,   2,  -1,   3,  -1,   4,  -1,   6,  -1,
+    9,  -1, -31,   0, -23,   0, -18,   0, -14,   0,
+  -11,   0,  -7,   0,  -5,   0,  -4,   0,  -3,   0,
+   -2,   0,  -1,   0,   0, -31,   1,   0,   2,   0,
+    3,   0,   4,   0,   5,   0,   7,   0,  11,   0,
+   14,   0,  18,   0,  23,   0,  31,   0,  -9,   1,
+   -6,   1,  -4,   1,  -3,   1,  -2,   1,  -1,   1,
+    0,   1,   1,   1,   2,   1,   3,   1,   4,   1,
+    6,   1,   9,   1, -11,   2,  -7,   2,  -5,   2,
+   -3,   2,  -2,   2,  -1,   2,   0,   2,   1,   2,
+    2,   2,   3,   2,   5,   2,   7,   2,  11,   2,
+   -8,   3,  -6,   3,  -4,   3,  -2,   3,  -1,   3,
+    0,   3,   1,   3,   2,   3,   3,   3,   4,   3,
+    6,   3,   8,   3, -13,   4, -10,   4,  -5,   4,
+   -3,   4,  -1,   4,   0,   4,   1,   4,   3,   4,
+    5,   4,  10,   4,  13,   4, -17,   5,  -7,   5,
+   -4,   5,  -2,   5,   0,   5,   2,   5,   4,   5,
+    7,   5,  17,   5, -22,   6,  -9,   6,  -6,   6,
+   -3,   6,  -1,   6,   1,   6,   3,   6,   6,   6,
+    9,   6,  22,   6,  -5,   7,  -2,   7,   0,   7,
+    2,   7,   5,   7, -29,   8, -11,   8,  -8,   8,
+   -3,   8,   3,   8,   8,   8,  11,   8,  29,   8,
+   -6,   9,  -1,   9,   1,   9,   6,   9, -15,  10,
+   -4,  10,   4,  10,  15,  10,  -8,  11,  -2,  11,
+    0,  11,   2,  11,   8,  11,  19,  12, -19,  13,
+   -4,  13,   4,  13,   0,  14, -10,  15,  10,  15,
+   -5,  17,   5,  17,  25,  17, -25,  18,   0,  18,
+  -12,  19,  13,  19,  -6,  22,   6,  22,   0,  23,
+  -17,  25,  18,  25,  -8,  29,   8,  29,   0,  31,
+    0,   0,  -6, -22,   6, -22, -13, -19,  12, -19,
+    0, -18,  -5, -17,   5, -17, -10, -15,  10, -15,
+    0, -14,  -4, -13,   4, -13,  19, -13, -19, -12,
+   -8, -11,  -2, -11,   0, -11,   2, -11,   8, -11,
+  -15, -10,  -4, -10,   4, -10,  15, -10,  -6,  -9,
+   -1,  -9,   1,  -9,   6,  -9, -11,  -8,  -8,  -8,
+   -3,  -8,   0,  -8,   3,  -8,   8,  -8,  11,  -8,
+   -5,  -7,  -2,  -7,   0,  -7,   2,  -7,   5,  -7,
+  -22,  -6,  -9,  -6,  -6,  -6,  -3,  -6,  -1,  -6,
+    1,  -6,   3,  -6,   6,  -6,   9,  -6,  22,  -6,
+  -17,  -5,  -7,  -5,  -4,  -5,  -2,  -5,  -1,  -5,
+    0,  -5,   1,  -5,   2,  -5,   4,  -5,   7,  -5,
+   17,  -5, -13,  -4, -10,  -4,  -5,  -4,  -3,  -4,
+   -2,  -4,  -1,  -4,   0,  -4,   1,  -4,   2,  -4,
+    3,  -4,   5,  -4,  10,  -4,  13,  -4,  -8,  -3,
+   -6,  -3,  -4,  -3,  -3,  -3,  -2,  -3,  -1,  -3,
+    0,  -3,   1,  -3,   2,  -3,   3,  -3,   4,  -3,
+    6,  -3,   8,  -3, -11,  -2,  -7,  -2,  -5,  -2,
+   -4,  -2,  -3,  -2,  -2,  -2,  -1,  -2,   0,  -2,
+    1,  -2,   2,  -2,   3,  -2,   4,  -2,   5,  -2,
+    7,  -2,  11,  -2,  -9,  -1,  -6,  -1,  -5,  -1,
+   -4,  -1,  -3,  -1,  -2,  -1,  -1,  -1,   0,  -1,
+    1,  -1,   2,  -1,   3,  -1,   4,  -1,   5,  -1,
+    6,  -1,   9,  -1, -23,   0, -18,   0, -14,   0,
+  -11,   0,  -7,   0,  -5,   0,  -4,   0,  -3,   0,
+   -2,   0,  -1,   0,   0, -23,   1,   0,   2,   0,
+    3,   0,   4,   0,   5,   0,   7,   0,  11,   0,
+   14,   0,  18,   0,  23,   0,  -9,   1,  -6,   1,
+   -5,   1,  -4,   1,  -3,   1,  -2,   1,  -1,   1,
+    0,   1,   1,   1,   2,   1,   3,   1,   4,   1,
+    5,   1,   6,   1,   9,   1, -11,   2,  -7,   2,
+   -5,   2,  -4,   2,  -3,   2,  -2,   2,  -1,   2,
+    0,   2,   1,   2,   2,   2,   3,   2,   4,   2,
+    5,   2,   7,   2,  11,   2,  -8,   3,  -6,   3,
+   -4,   3,  -3,   3,  -2,   3,  -1,   3,   0,   3,
+    1,   3,   2,   3,   3,   3,   4,   3,   6,   3,
+    8,   3, -13,   4, -10,   4,  -5,   4,  -3,   4,
+   -2,   4,  -1,   4,   0,   4,   1,   4,   2,   4,
+    3,   4,   5,   4,  10,   4,  13,   4, -17,   5,
+   -7,   5,  -4,   5,  -2,   5,  -1,   5,   0,   5,
+    1,   5,   2,   5,   4,   5,   7,   5,  17,   5,
+  -22,   6,  -9,   6,  -6,   6,  -3,   6,  -1,   6,
+    1,   6,   3,   6,   6,   6,   9,   6,  22,   6,
+   -5,   7,  -2,   7,   0,   7,   2,   7,   5,   7,
+  -11,   8,  -8,   8,  -3,   8,   0,   8,   3,   8,
+    8,   8,  11,   8,  -6,   9,  -1,   9,   1,   9,
+    6,   9, -15,  10,  -4,  10,   4,  10,  15,  10,
+   -8,  11,  -2,  11,   0,  11,   2,  11,   8,  11,
+   19,  12, -19,  13,  -4,  13,   4,  13,   0,  14,
+  -10,  15,  10,  15,  -5,  17,   5,  17,   0,  18,
+  -12,  19,  13,  19,  -6,  22,   6,  22,   0,  23,
 };
 
 typedef struct SANMVideoContext {
@@ -406,6 +457,7 @@ static void destroy_buffers(SANMVideoContext *ctx)
     ctx->frm0_size =
     ctx->frm1_size =
     ctx->frm2_size = 0;
+    init_sizes(ctx, 0, 0);
 }
 
 static av_cold int init_buffers(SANMVideoContext *ctx)
@@ -460,7 +512,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
         }
 
         ctx->subversion = AV_RL16(avctx->extradata);
-        for (i = 0; i < 256; i++)
+        for (i = 0; i < PALETTE_SIZE; i++)
             ctx->pal[i] = 0xFFU << 24 | AV_RL32(avctx->extradata + 2 + i * 4);
     }
 
@@ -1466,7 +1518,7 @@ static int decode_frame(AVCodecContext *avctx, void *data,
 
 AVCodec ff_sanm_decoder = {
     .name           = "sanm",
-    .long_name      = NULL_IF_CONFIG_SMALL("LucasArts SANM video"),
+    .long_name      = NULL_IF_CONFIG_SMALL("LucasArts SANM/Smush video"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_SANM,
     .priv_data_size = sizeof(SANMVideoContext),
diff --git a/libavcodec/sbr.h b/libavcodec/sbr.h
index a47ad6e..eb7d1ae 100644
--- a/libavcodec/sbr.h
+++ b/libavcodec/sbr.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2008-2009 Robert Swain ( rob opendot cl )
  * Copyright (c) 2010      Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,6 +34,8 @@
 #include "aacps.h"
 #include "sbrdsp.h"
 
+typedef struct AACContext AACContext;
+
 /**
  * Spectral Band Replication header - spectrum parameters that invoke a reset if they differ from the previous header.
  */
@@ -64,9 +66,9 @@ typedef struct SBRData {
      */
     unsigned           bs_frame_class;
     unsigned           bs_add_harmonic_flag;
-    unsigned           bs_num_env;
+    AAC_SIGNE          bs_num_env;
     uint8_t            bs_freq_res[7];
-    unsigned           bs_num_noise;
+    AAC_SIGNE          bs_num_noise;
     uint8_t            bs_df_env[5];
     uint8_t            bs_df_noise[2];
     uint8_t            bs_invf_mode[2][5];
@@ -78,25 +80,27 @@ typedef struct SBRData {
      * @name State variables
      * @{
      */
-    DECLARE_ALIGNED(32, float, synthesis_filterbank_samples)[SBR_SYNTHESIS_BUF_SIZE];
-    DECLARE_ALIGNED(32, float, analysis_filterbank_samples) [1312];
+    DECLARE_ALIGNED(32, INTFLOAT, synthesis_filterbank_samples)[SBR_SYNTHESIS_BUF_SIZE];
+    DECLARE_ALIGNED(32, INTFLOAT, analysis_filterbank_samples) [1312];
     int                synthesis_filterbank_samples_offset;
     ///l_APrev and l_A
     int                e_a[2];
     ///Chirp factors
-    float              bw_array[5];
+    INTFLOAT              bw_array[5];
     ///QMF values of the original signal
-    float              W[2][32][32][2];
+    INTFLOAT              W[2][32][32][2];
     ///QMF output of the HF adjustor
     int                Ypos;
-    DECLARE_ALIGNED(16, float, Y)[2][38][64][2];
-    DECLARE_ALIGNED(16, float, g_temp)[42][48];
-    float              q_temp[42][48];
+    DECLARE_ALIGNED(16, INTFLOAT, Y)[2][38][64][2];
+    DECLARE_ALIGNED(16, AAC_FLOAT, g_temp)[42][48];
+    AAC_FLOAT          q_temp[42][48];
     uint8_t            s_indexmapped[8][48];
     ///Envelope scalefactors
-    float              env_facs[6][48];
+    uint8_t            env_facs_q[6][48];
+    AAC_FLOAT          env_facs[6][48];
     ///Noise scalefactors
-    float              noise_facs[3][5];
+    uint8_t            noise_facs_q[3][5];
+    AAC_FLOAT          noise_facs[3][5];
     ///Envelope time borders
     uint8_t            t_env[8];
     ///Envelope time border of the last envelope of the previous frame
@@ -108,12 +112,35 @@ typedef struct SBRData {
     /** @} */
 } SBRData;
 
+typedef struct SpectralBandReplication SpectralBandReplication;
+
+/**
+ * aacsbr function pointers
+ */
+typedef struct AACSBRContext { /* embedded as member 'c' of struct SpectralBandReplication */
+    int (*sbr_lf_gen)(AACContext *ac, SpectralBandReplication *sbr,
+                      INTFLOAT X_low[32][40][2], const INTFLOAT W[2][32][32][2],
+                      int buf_idx); /* X_low is the writable array, W is read-only */
+    void (*sbr_hf_assemble)(INTFLOAT Y1[38][64][2],
+                            const INTFLOAT X_high[64][40][2],
+                            SpectralBandReplication *sbr, SBRData *ch_data,
+                            const int e_a[2]); /* Y1 is the only writable sample array */
+    int (*sbr_x_gen)(SpectralBandReplication *sbr, INTFLOAT X[2][38][64],
+                     const INTFLOAT Y0[38][64][2], const INTFLOAT Y1[38][64][2],
+                     const INTFLOAT X_low[32][40][2], int ch); /* X is writable; Y0, Y1, X_low are read-only */
+    void (*sbr_hf_inverse_filter)(SBRDSPContext *dsp,
+                                  INTFLOAT (*alpha0)[2], INTFLOAT (*alpha1)[2],
+                                  const INTFLOAT X_low[32][40][2], int k0); /* alpha0/alpha1 are writable; X_low is read-only */
+} AACSBRContext;
+
 /**
  * Spectral Band Replication
  */
-typedef struct SpectralBandReplication {
+struct SpectralBandReplication {
     int                sample_rate;
     int                start;
+    int                ready_for_dequant;
+    int                id_aac;
     int                reset;
     SpectrumParameters spectrum_params;
     int                bs_amp_res_header;
@@ -127,23 +154,23 @@ typedef struct SpectralBandReplication {
     unsigned           bs_smoothing_mode;
     /** @} */
     unsigned           bs_coupling;
-    unsigned           k[5]; ///< k0, k1, k2
+    AAC_SIGNE          k[5]; ///< k0, k1, k2
     ///kx', and kx respectively, kx is the first QMF subband where SBR is used.
     ///kx' is its value from the previous frame
-    unsigned           kx[2];
+    AAC_SIGNE          kx[2];
     ///M' and M respectively, M is the number of QMF subbands that use SBR.
-    unsigned           m[2];
+    AAC_SIGNE          m[2];
     unsigned           kx_and_m_pushed;
     ///The number of frequency bands in f_master
-    unsigned           n_master;
+    AAC_SIGNE          n_master;
     SBRData            data[2];
     PSContext          ps;
     ///N_Low and N_High respectively, the number of frequency bands for low and high resolution
-    unsigned           n[2];
+    AAC_SIGNE          n[2];
     ///Number of noise floor bands
-    unsigned           n_q;
+    AAC_SIGNE          n_q;
     ///Number of limiter bands
-    unsigned           n_lim;
+    AAC_SIGNE          n_lim;
     ///The master QMF frequency grouping
     uint16_t           f_master[49];
     ///Frequency borders for low resolution SBR
@@ -153,37 +180,38 @@ typedef struct SpectralBandReplication {
     ///Frequency borders for noise floors
     uint16_t           f_tablenoise[6];
     ///Frequency borders for the limiter
-    uint16_t           f_tablelim[29];
-    unsigned           num_patches;
+    uint16_t           f_tablelim[30];
+    AAC_SIGNE          num_patches;
     uint8_t            patch_num_subbands[6];
     uint8_t            patch_start_subband[6];
     ///QMF low frequency input to the HF generator
-    DECLARE_ALIGNED(16, float, X_low)[32][40][2];
+    DECLARE_ALIGNED(16, INTFLOAT, X_low)[32][40][2];
     ///QMF output of the HF generator
-    DECLARE_ALIGNED(16, float, X_high)[64][40][2];
+    DECLARE_ALIGNED(16, INTFLOAT, X_high)[64][40][2];
     ///QMF values of the reconstructed signal
-    DECLARE_ALIGNED(16, float, X)[2][2][38][64];
+    DECLARE_ALIGNED(16, INTFLOAT, X)[2][2][38][64];
     ///Zeroth coefficient used to filter the subband signals
-    DECLARE_ALIGNED(16, float, alpha0)[64][2];
+    DECLARE_ALIGNED(16, INTFLOAT, alpha0)[64][2];
     ///First coefficient used to filter the subband signals
-    DECLARE_ALIGNED(16, float, alpha1)[64][2];
+    DECLARE_ALIGNED(16, INTFLOAT, alpha1)[64][2];
     ///Dequantized envelope scalefactors, remapped
-    float              e_origmapped[7][48];
+    AAC_FLOAT          e_origmapped[7][48];
     ///Dequantized noise scalefactors, remapped
-    float              q_mapped[7][48];
+    AAC_FLOAT          q_mapped[7][48];
     ///Sinusoidal presence, remapped
     uint8_t            s_mapped[7][48];
     ///Estimated envelope
-    float              e_curr[7][48];
+    AAC_FLOAT          e_curr[7][48];
     ///Amplitude adjusted noise scalefactors
-    float              q_m[7][48];
+    AAC_FLOAT          q_m[7][48];
     ///Sinusoidal levels
-    float              s_m[7][48];
-    float              gain[7][48];
-    DECLARE_ALIGNED(32, float, qmf_filter_scratch)[5][64];
+    AAC_FLOAT          s_m[7][48];
+    AAC_FLOAT          gain[7][48];
+    DECLARE_ALIGNED(32, INTFLOAT, qmf_filter_scratch)[5][64];
     FFTContext         mdct_ana;
     FFTContext         mdct;
     SBRDSPContext      dsp;
-} SpectralBandReplication;
+    AACSBRContext      c;
+};
 
 #endif /* AVCODEC_SBR_H */
diff --git a/libavcodec/sbrdsp.c b/libavcodec/sbrdsp.c
index b7917dc..cc432b6 100644
--- a/libavcodec/sbrdsp.c
+++ b/libavcodec/sbrdsp.c
@@ -3,37 +3,31 @@
  * Copyright (c) 2008-2009 Robert Swain ( rob opendot cl )
  * Copyright (c) 2009-2010 Alex Converse <alex.converse@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#define USE_FIXED 0
+
+#include "aac.h"
 #include "config.h"
 #include "libavutil/attributes.h"
 #include "libavutil/intfloat.h"
 #include "sbrdsp.h"
 
-static void sbr_sum64x5_c(float *z)
-{
-    int k;
-    for (k = 0; k < 64; k++) {
-        float f = z[k] + z[k + 64] + z[k + 128] + z[k + 192] + z[k + 256];
-        z[k] = f;
-    }
-}
-
 static float sbr_sum_square_c(float (*x)[2], int n)
 {
     float sum0 = 0.0f, sum1 = 0.0f;
@@ -72,6 +66,7 @@ static void sbr_qmf_pre_shuffle_c(float *z)
         zi[64 + 2 * k + 2].i = zi[63 - k].i ^ (1U << 31);
         zi[64 + 2 * k + 3].i = zi[ k + 2].i;
     }
+
     zi[64 + 2 * 31 + 0].i = zi[64 - 31].i ^ (1U << 31);
     zi[64 + 2 * 31 + 1].i = zi[31 +  1].i;
 }
@@ -100,16 +95,6 @@ static void sbr_qmf_deint_neg_c(float *v, const float *src)
     }
 }
 
-static void sbr_qmf_deint_bfly_c(float *v, const float *src0, const float *src1)
-{
-    int i;
-    for (i = 0; i < 64; i++) {
-        v[      i] = src0[i] - src1[63 - i];
-        v[127 - i] = src0[i] + src1[63 - i];
-    }
-}
-
-
 #if 0
     /* This code is slower because it multiplies memory accesses.
      * It is left for educational purposes and because it may offer
@@ -237,56 +222,4 @@ static av_always_inline void sbr_hf_apply_noise(float (*Y)[2],
     }
 }
 
-static void sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
-                                 const float *q_filt, int noise,
-                                 int kx, int m_max)
-{
-    sbr_hf_apply_noise(Y, s_m, q_filt, noise, 1.0, 0.0, m_max);
-}
-
-static void sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
-                                 const float *q_filt, int noise,
-                                 int kx, int m_max)
-{
-    float phi_sign = 1 - 2 * (kx & 1);
-    sbr_hf_apply_noise(Y, s_m, q_filt, noise, 0.0, phi_sign, m_max);
-}
-
-static void sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
-                                 const float *q_filt, int noise,
-                                 int kx, int m_max)
-{
-    sbr_hf_apply_noise(Y, s_m, q_filt, noise, -1.0, 0.0, m_max);
-}
-
-static void sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
-                                 const float *q_filt, int noise,
-                                 int kx, int m_max)
-{
-    float phi_sign = 1 - 2 * (kx & 1);
-    sbr_hf_apply_noise(Y, s_m, q_filt, noise, 0.0, -phi_sign, m_max);
-}
-
-av_cold void ff_sbrdsp_init(SBRDSPContext *s)
-{
-    s->sum64x5 = sbr_sum64x5_c;
-    s->sum_square = sbr_sum_square_c;
-    s->neg_odd_64 = sbr_neg_odd_64_c;
-    s->qmf_pre_shuffle = sbr_qmf_pre_shuffle_c;
-    s->qmf_post_shuffle = sbr_qmf_post_shuffle_c;
-    s->qmf_deint_neg = sbr_qmf_deint_neg_c;
-    s->qmf_deint_bfly = sbr_qmf_deint_bfly_c;
-    s->autocorrelate = sbr_autocorrelate_c;
-    s->hf_gen = sbr_hf_gen_c;
-    s->hf_g_filt = sbr_hf_g_filt_c;
-
-    s->hf_apply_noise[0] = sbr_hf_apply_noise_0;
-    s->hf_apply_noise[1] = sbr_hf_apply_noise_1;
-    s->hf_apply_noise[2] = sbr_hf_apply_noise_2;
-    s->hf_apply_noise[3] = sbr_hf_apply_noise_3;
-
-    if (ARCH_ARM)
-        ff_sbrdsp_init_arm(s);
-    if (ARCH_X86)
-        ff_sbrdsp_init_x86(s);
-}
+#include "sbrdsp_template.c"
diff --git a/libavcodec/sbrdsp.h b/libavcodec/sbrdsp.h
index 07235c6..66852de 100644
--- a/libavcodec/sbrdsp.h
+++ b/libavcodec/sbrdsp.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Mans Rullgard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -22,30 +22,33 @@
 #define AVCODEC_SBRDSP_H
 
 #include <stdint.h>
+#include "aac_defines.h"
+#include "libavutil/softfloat.h"
 
 typedef struct SBRDSPContext {
-    void (*sum64x5)(float *z);
-    float (*sum_square)(float (*x)[2], int n);
-    void (*neg_odd_64)(float *x);
-    void (*qmf_pre_shuffle)(float *z);
-    void (*qmf_post_shuffle)(float W[32][2], const float *z);
-    void (*qmf_deint_neg)(float *v, const float *src);
-    void (*qmf_deint_bfly)(float *v, const float *src0, const float *src1);
-    void (*autocorrelate)(const float x[40][2], float phi[3][2][2]);
-    void (*hf_gen)(float (*X_high)[2], const float (*X_low)[2],
-                   const float alpha0[2], const float alpha1[2],
-                   float bw, int start, int end);
-    void (*hf_g_filt)(float (*Y)[2], const float (*X_high)[40][2],
-                      const float *g_filt, int m_max, intptr_t ixh);
-    void (*hf_apply_noise[4])(float (*Y)[2], const float *s_m,
-                              const float *q_filt, int noise,
+    void (*sum64x5)(INTFLOAT *z);
+    AAC_FLOAT (*sum_square)(INTFLOAT (*x)[2], int n);
+    void (*neg_odd_64)(INTFLOAT *x);
+    void (*qmf_pre_shuffle)(INTFLOAT *z);
+    void (*qmf_post_shuffle)(INTFLOAT W[32][2], const INTFLOAT *z);
+    void (*qmf_deint_neg)(INTFLOAT *v, const INTFLOAT *src);
+    void (*qmf_deint_bfly)(INTFLOAT *v, const INTFLOAT *src0, const INTFLOAT *src1);
+    void (*autocorrelate)(const INTFLOAT x[40][2], AAC_FLOAT phi[3][2][2]);
+    void (*hf_gen)(INTFLOAT (*X_high)[2], const INTFLOAT (*X_low)[2],
+                   const INTFLOAT alpha0[2], const INTFLOAT alpha1[2],
+                   INTFLOAT bw, int start, int end);
+    void (*hf_g_filt)(INTFLOAT (*Y)[2], const INTFLOAT (*X_high)[40][2],
+                      const AAC_FLOAT *g_filt, int m_max, intptr_t ixh);
+    void (*hf_apply_noise[4])(INTFLOAT (*Y)[2], const AAC_FLOAT *s_m,
+                              const AAC_FLOAT *q_filt, int noise,
                               int kx, int m_max);
 } SBRDSPContext;
 
-extern const float ff_sbr_noise_table[][2];
+extern const INTFLOAT AAC_RENAME(ff_sbr_noise_table)[][2];
 
-void ff_sbrdsp_init(SBRDSPContext *s);
+void AAC_RENAME(ff_sbrdsp_init)(SBRDSPContext *s);
 void ff_sbrdsp_init_arm(SBRDSPContext *s);
 void ff_sbrdsp_init_x86(SBRDSPContext *s);
+void ff_sbrdsp_init_mips(SBRDSPContext *s);
 
 #endif /* AVCODEC_SBRDSP_H */
diff --git a/libavcodec/sbrdsp_fixed.c b/libavcodec/sbrdsp_fixed.c
new file mode 100644
index 0000000..f4e3de0
--- /dev/null
+++ b/libavcodec/sbrdsp_fixed.c
@@ -0,0 +1,291 @@
+/*
+ * AAC Spectral Band Replication decoding functions
+ * Copyright (c) 2008-2009 Robert Swain ( rob opendot cl )
+ * Copyright (c) 2009-2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Note: Rounding-to-nearest used unless otherwise stated
+ *
+ */
+
+#define USE_FIXED 1
+
+#include "aac.h"
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/intfloat.h"
+#include "sbrdsp.h"
+
+/* Return the sum of x[i][0]^2 + x[i][1]^2 for i in [0, n) as a SoftFloat.
+ * The loop consumes two pairs per step, so n is expected to be even
+ * (an odd n would also read the pair x[n]). */
+static SoftFloat sbr_sum_square_c(int (*x)[2], int n)
+{
+    int64_t accu = 0, round;
+    int i, nz;
+
+    for (i = 0; i < n; i += 2) {
+        // Larger values are invalid and could cause overflows of accu.
+        av_assert2(FFABS(x[i + 0][0]) >> 29 == 0);
+        accu += (int64_t)x[i + 0][0] * x[i + 0][0];
+        av_assert2(FFABS(x[i + 0][1]) >> 29 == 0);
+        accu += (int64_t)x[i + 0][1] * x[i + 0][1];
+        av_assert2(FFABS(x[i + 1][0]) >> 29 == 0);
+        accu += (int64_t)x[i + 1][0] * x[i + 1][0];
+        av_assert2(FFABS(x[i + 1][1]) >> 29 == 0);
+        accu += (int64_t)x[i + 1][1] * x[i + 1][1];
+    }
+
+    i = (int)(accu >> 32);
+    if (i == 0) {
+        nz = 1;
+    } else {
+        nz = 0;
+        while (FFABS(i) < 0x40000000) {
+            i <<= 1;
+            nz++;
+        }
+        nz = 32 - nz;
+    }
+    /* 64-bit rounding term: nz may be 32, and 1 << 31 overflows int. */
+    round = (int64_t)1 << (nz - 1);
+    i = (int)((accu + round) >> nz);
+    i >>= 1;
+    return av_int2sf(i, 15 - nz);
+}
+
+static void sbr_neg_odd_64_c(int *x)
+{   /* Negate the odd-indexed entries x[1], x[3], ..., x[63] in place. */
+    int i;
+    for (i = 1; i < 64; i += 2)
+        x[i] = -(unsigned)x[i];   /* unsigned negate: INT_MIN wraps, no signed overflow */
+}
+
+static void sbr_qmf_pre_shuffle_c(int *z)
+{   /* Fills z[64..127] from z[0..63], so z must hold at least 128 ints. */
+    int k;
+    z[64] = z[0];
+    z[65] = z[1];
+    for (k = 1; k < 32; k++) {
+        z[64+2*k  ] = -z[64 - k];   /* even slots: negated z[63] down to z[33] */
+        z[64+2*k+1] =  z[ k + 1];   /* odd slots: z[2] up to z[32] */
+    }
+}
+
+static void sbr_qmf_post_shuffle_c(int W[32][2], const int *z)
+{
+    int lo, hi;   /* lo walks z forwards from 0, hi walks it backwards from 63 */
+    for (lo = 0, hi = 63; lo < 32; lo++, hi--) {
+        W[lo][0] = -z[hi];   /* negated mirrored sample z[63 - lo] */
+        W[lo][1] =  z[lo];
+    }
+}
+
+static void sbr_qmf_deint_neg_c(int *v, const int *src)
+{   /* v[i] = (src[63 - 2i] + 16) >> 5 and v[63 - i] = (16 - src[62 - 2i]) >> 5, i = 0..31 */
+    int i;
+    for (i = 0; i < 32; i++) {
+        v[     i] = (int)(0x10U + src[63 - 2*i    ]) >> 5;   /* unsigned add wraps instead of overflowing */
+        v[63 - i] = (int)(0x10U - src[63 - 2*i - 1]) >> 5;   /* avoids the signed negation of INT_MIN */
+    }
+}
+
+/* Convert a 64-bit accumulator to a SoftFloat via av_int2sf(), rounding to nearest. */
+static av_always_inline SoftFloat autocorr_calc(int64_t accu)
+{
+        int nz, mant, i = (int)(accu >> 32);
+        int64_t round;
+        if (i == 0) {
+            nz = 1;
+        } else {
+            nz = 0;
+            while (FFABS(i) < 0x40000000) {
+                i *= 2;     /* not <<=: i is negative whenever accu is */
+                nz++;
+            }
+            nz = 32-nz;
+        }
+        /* 64-bit rounding term: nz may be 32, and 1 << 31 overflows int. */
+        round = (int64_t)1 << (nz-1);
+        mant = (int)((accu + round) >> nz);
+        mant = (int)((mant + 0x40LL) >> 7);     /* add in 64 bits so mant near INT_MAX is safe */
+        mant *= 64;                             /* same as <<= 6, but defined for negative mant */
+        return av_int2sf(mant, 15 - nz);        /* 15 - nz == 30 - (nz + 15) */
+}
+
+static av_always_inline void autocorrelate(const int x[40][2], SoftFloat phi[3][2][2], int lag)
+{   /* Accumulates products of x[i] and x[i+lag] in 64 bits and stores them in phi via autocorr_calc(). */
+    int i;
+    int64_t real_sum, imag_sum;
+    int64_t accu_re = 0, accu_im = 0;
+
+    if (lag) {   /* lag 1 or 2: cross terms, keeping separate [0]- and [1]-style sums */
+        for (i = 1; i < 38; i++) {
+            accu_re += (int64_t)x[i][0] * x[i+lag][0];
+            accu_re += (int64_t)x[i][1] * x[i+lag][1];
+            accu_im += (int64_t)x[i][0] * x[i+lag][1];
+            accu_im -= (int64_t)x[i][1] * x[i+lag][0];
+        }
+
+        real_sum = accu_re;   /* partial sums over i = 1..37, reused for lag == 1 below */
+        imag_sum = accu_im;
+
+        accu_re += (int64_t)x[ 0][0] * x[lag][0];   /* add the i = 0 term: sums now cover i = 0..37 */
+        accu_re += (int64_t)x[ 0][1] * x[lag][1];
+        accu_im += (int64_t)x[ 0][0] * x[lag][1];
+        accu_im -= (int64_t)x[ 0][1] * x[lag][0];
+
+        phi[2-lag][1][0] = autocorr_calc(accu_re);   /* lag 1 -> phi[1][1], lag 2 -> phi[0][1] */
+        phi[2-lag][1][1] = autocorr_calc(accu_im);
+
+        if (lag == 1) {   /* second window: restart from the partial sums and add i = 38 (i = 1..38) */
+            accu_re = real_sum;
+            accu_im = imag_sum;
+            accu_re += (int64_t)x[38][0] * x[39][0];
+            accu_re += (int64_t)x[38][1] * x[39][1];
+            accu_im += (int64_t)x[38][0] * x[39][1];
+            accu_im -= (int64_t)x[38][1] * x[39][0];
+
+            phi[0][0][0] = autocorr_calc(accu_re);
+            phi[0][0][1] = autocorr_calc(accu_im);
+        }
+    } else {   /* lag 0: sums of squares only, accu_im is not used */
+        for (i = 1; i < 38; i++) {
+            accu_re += (int64_t)x[i][0] * x[i][0];
+            accu_re += (int64_t)x[i][1] * x[i][1];
+        }
+        real_sum = accu_re;   /* partial sum over i = 1..37 */
+        accu_re += (int64_t)x[ 0][0] * x[ 0][0];
+        accu_re += (int64_t)x[ 0][1] * x[ 0][1];
+
+        phi[2][1][0] = autocorr_calc(accu_re);   /* window i = 0..37 */
+
+        accu_re = real_sum;
+        accu_re += (int64_t)x[38][0] * x[38][0];
+        accu_re += (int64_t)x[38][1] * x[38][1];
+
+        phi[1][0][0] = autocorr_calc(accu_re);   /* window i = 1..38 */
+    }
+}
+
+static void sbr_autocorrelate_c(const int x[40][2], SoftFloat phi[3][2][2])
+{   /* Runs the helper for lags 0, 1 and 2; phi[2][0][*], phi[2][1][1] and phi[1][0][1] are never written. */
+    autocorrelate(x, phi, 0);   /* writes phi[2][1][0] and phi[1][0][0] */
+    autocorrelate(x, phi, 1);   /* writes phi[1][1][*] and phi[0][0][*] */
+    autocorrelate(x, phi, 2);   /* writes phi[0][1][*] */
+}
+
+static void sbr_hf_gen_c(int (*X_high)[2], const int (*X_low)[2],
+                       const int alpha0[2], const int alpha1[2],
+                       int bw, int start, int end)
+{   /* Writes X_high[start..end-1] from X_low[i], X_low[i-1] and X_low[i-2]; X_low[start-2] must be readable. */
+    int alpha[4];
+    int i;
+    int64_t accu;
+
+    accu = (int64_t)alpha0[0] * bw;              /* each product below is rounded and shifted right by 31 */
+    alpha[2] = (int)((accu + 0x40000000) >> 31);
+    accu = (int64_t)alpha0[1] * bw;
+    alpha[3] = (int)((accu + 0x40000000) >> 31);
+    accu = (int64_t)bw * bw;
+    bw = (int)((accu + 0x40000000) >> 31);       /* bw now holds the rounded bw * bw, used for alpha1 */
+    accu = (int64_t)alpha1[0] * bw;
+    alpha[0] = (int)((accu + 0x40000000) >> 31);
+    accu = (int64_t)alpha1[1] * bw;
+    alpha[1] = (int)((accu + 0x40000000) >> 31);
+
+    for (i = start; i < end; i++) {
+        accu  = (int64_t)X_low[i][0] * 0x20000000;   /* scale by 2^29 to match the >> 29 below */
+        accu += (int64_t)X_low[i - 2][0] * alpha[0];
+        accu -= (int64_t)X_low[i - 2][1] * alpha[1];
+        accu += (int64_t)X_low[i - 1][0] * alpha[2];
+        accu -= (int64_t)X_low[i - 1][1] * alpha[3];
+        X_high[i][0] = (int)((accu + 0x10000000) >> 29);   /* 0x10000000 is half of 2^29: round to nearest */
+
+        accu  = (int64_t)X_low[i][1] * 0x20000000;
+        accu += (int64_t)X_low[i - 2][1] * alpha[0];
+        accu += (int64_t)X_low[i - 2][0] * alpha[1];
+        accu += (int64_t)X_low[i - 1][1] * alpha[2];
+        accu += (int64_t)X_low[i - 1][0] * alpha[3];
+        X_high[i][1] = (int)((accu + 0x10000000) >> 29);
+    }
+}
+
+/* Scale X_high[m][ixh] by g_filt[m] into Y[m] for m in [0, m_max), rounding to nearest. */
+static void sbr_hf_g_filt_c(int (*Y)[2], const int (*X_high)[40][2],
+                          const SoftFloat *g_filt, int m_max, intptr_t ixh)
+{
+    int m;
+    int64_t accu, r;   /* r is 64-bit: 1 << (22 - exp) overflows int once the shift reaches 31 */
+
+    for (m = 0; m < m_max; m++) {
+        r = (int64_t)1 << (22-g_filt[m].exp);   /* half of the final 23 - exp shift */
+        accu = (int64_t)X_high[m][ixh][0] * ((g_filt[m].mant + 0x40)>>7);
+        Y[m][0] = (int)((accu + r) >> (23-g_filt[m].exp));
+        accu = (int64_t)X_high[m][ixh][1] * ((g_filt[m].mant + 0x40)>>7);
+        Y[m][1] = (int)((accu + r) >> (23-g_filt[m].exp));
+    }
+}
+
+static av_always_inline void sbr_hf_apply_noise(int (*Y)[2],
+                                                const SoftFloat *s_m,
+                                                const SoftFloat *q_filt,
+                                                int noise,
+                                                int phi_sign0,
+                                                int phi_sign1,
+                                                int m_max)
+{   /* For each m in [0, m_max): add either the s_m[m] term or a scaled noise-table entry to Y[m]. */
+    int m;
+
+    for (m = 0; m < m_max; m++) {
+        int y0 = Y[m][0];
+        int y1 = Y[m][1];
+        noise = (noise + 1) & 0x1ff;   /* advance the table index on every m, wrapping at 512 */
+        if (s_m[m].mant) {             /* non-zero s_m: add it with the phi signs, no noise */
+            int shift, round;
+
+            shift = 22 - s_m[m].exp;
+            if (shift < 30) {          /* NOTE(review): no lower bound; shift < 1 makes 1 << (shift-1) invalid - confirm exp <= 21 */
+                round = 1 << (shift-1);
+                y0 += (s_m[m].mant * phi_sign0 + round) >> shift;
+                y1 += (s_m[m].mant * phi_sign1 + round) >> shift;
+            }
+        } else {                       /* zero s_m: add q_filt[m] times the noise table entry */
+            int shift, round, tmp;
+            int64_t accu;
+
+            shift = 22 - q_filt[m].exp;
+            if (shift < 30) {          /* NOTE(review): same missing lower bound on shift as above */
+                round = 1 << (shift-1);
+
+                accu = (int64_t)q_filt[m].mant * ff_sbr_noise_table_fixed[noise][0];
+                tmp = (int)((accu + 0x40000000) >> 31);   /* rounded product shifted right by 31 */
+                y0 += (tmp + round) >> shift;
+
+                accu = (int64_t)q_filt[m].mant * ff_sbr_noise_table_fixed[noise][1];
+                tmp = (int)((accu + 0x40000000) >> 31);
+                y1 += (tmp + round) >> shift;
+            }
+        }
+        Y[m][0] = y0;
+        Y[m][1] = y1;
+        phi_sign1 = -phi_sign1;        /* the sign of the [1] term alternates with m */
+    }
+}
+
+#include "sbrdsp_template.c"
diff --git a/libavcodec/sbrdsp_template.c b/libavcodec/sbrdsp_template.c
new file mode 100644
index 0000000..b649dfd
--- /dev/null
+++ b/libavcodec/sbrdsp_template.c
@@ -0,0 +1,97 @@
+/*
+ * AAC Spectral Band Replication decoding functions
+ * Copyright (c) 2008-2009 Robert Swain ( rob opendot cl )
+ * Copyright (c) 2009-2010 Alex Converse <alex.converse@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+static void sbr_sum64x5_c(INTFLOAT *z)
+{
+    int col; /* position inside the first 64-element run */
+    for (col = 0; col < 64; col++) { /* fold the four following runs into the first, in place */
+        INTFLOAT *cell = z + col;
+        *cell = cell[0] + cell[64] + cell[128] + cell[192] + cell[256];
+    }
+}
+
+static void sbr_qmf_deint_bfly_c(INTFLOAT *v, const INTFLOAT *src0, const INTFLOAT *src1)
+{
+    int lo, hi; /* lo walks src0 upwards while hi walks src1 downwards */
+    for (lo = 0, hi = 63; lo < 64; lo++, hi--) {
+        v[lo]      = AAC_SRA_R((src0[lo] - src1[hi]), 5); /* differences fill v[0..63] */
+        v[64 + hi] = AAC_SRA_R((src0[lo] + src1[hi]), 5); /* sums fill v[127] down to v[64] */
+    }
+}
+
+static void sbr_hf_apply_noise_0(INTFLOAT (*Y)[2], const AAC_FLOAT *s_m,
+                                 const AAC_FLOAT *q_filt, int noise, int kx, int m_max)
+{
+    const INTFLOAT phi0 = (INTFLOAT)1.0, phi1 = (INTFLOAT)0.0; /* constant signs; kx is not read */
+    sbr_hf_apply_noise(Y, s_m, q_filt, noise, phi0, phi1, m_max);
+}
+
+static void sbr_hf_apply_noise_1(INTFLOAT (*Y)[2], const AAC_FLOAT *s_m,
+                                 const AAC_FLOAT *q_filt, int noise, int kx, int m_max)
+{
+    /* Second sign starts at +1 when kx is even and at -1 when kx is odd. */
+    const INTFLOAT phi0 = (INTFLOAT)0.0, phi1 = (kx & 1) ? -1 : 1;
+    sbr_hf_apply_noise(Y, s_m, q_filt, noise, phi0, phi1, m_max);
+}
+
+static void sbr_hf_apply_noise_2(INTFLOAT (*Y)[2], const AAC_FLOAT *s_m,
+                                 const AAC_FLOAT *q_filt, int noise, int kx, int m_max)
+{
+    const INTFLOAT phi0 = (INTFLOAT)-1.0, phi1 = (INTFLOAT)0.0; /* constant signs; kx is not read */
+    sbr_hf_apply_noise(Y, s_m, q_filt, noise, phi0, phi1, m_max);
+}
+
+static void sbr_hf_apply_noise_3(INTFLOAT (*Y)[2], const AAC_FLOAT *s_m,
+                                 const AAC_FLOAT *q_filt, int noise, int kx, int m_max)
+{
+    /* Second sign starts at -1 when kx is even and at +1 when kx is odd. */
+    const INTFLOAT phi0 = (INTFLOAT)0.0, phi1 = (kx & 1) ? 1 : -1;
+    sbr_hf_apply_noise(Y, s_m, q_filt, noise, phi0, phi1, m_max);
+}
+
+av_cold void AAC_RENAME(ff_sbrdsp_init)(SBRDSPContext *s)
+{
+    s->hf_apply_noise[0] = sbr_hf_apply_noise_0;
+    s->hf_apply_noise[1] = sbr_hf_apply_noise_1;
+    s->hf_apply_noise[2] = sbr_hf_apply_noise_2;
+    s->hf_apply_noise[3] = sbr_hf_apply_noise_3;
+    /* Remaining entry points, all set to the C implementations. */
+    s->hf_g_filt        = sbr_hf_g_filt_c;
+    s->hf_gen           = sbr_hf_gen_c;
+    s->autocorrelate    = sbr_autocorrelate_c;
+    s->qmf_deint_bfly   = sbr_qmf_deint_bfly_c;
+    s->qmf_deint_neg    = sbr_qmf_deint_neg_c;
+    s->qmf_post_shuffle = sbr_qmf_post_shuffle_c;
+    s->qmf_pre_shuffle  = sbr_qmf_pre_shuffle_c;
+    s->neg_odd_64       = sbr_neg_odd_64_c;
+    s->sum_square       = sbr_sum_square_c;
+    s->sum64x5          = sbr_sum64x5_c;
+    /* Arch init runs after the defaults, and only when USE_FIXED is 0; presumably it overrides them. */
+#if !USE_FIXED
+    if (ARCH_ARM)
+        ff_sbrdsp_init_arm(s);
+    if (ARCH_X86)
+        ff_sbrdsp_init_x86(s);
+    if (ARCH_MIPS)
+        ff_sbrdsp_init_mips(s);
+#endif /* !USE_FIXED */
+}
diff --git a/libavcodec/screenpresso.c b/libavcodec/screenpresso.c
index eae0ae7..34efb03 100644
--- a/libavcodec/screenpresso.c
+++ b/libavcodec/screenpresso.c
@@ -2,20 +2,20 @@
  * Screenpresso decoder
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/sgi.h b/libavcodec/sgi.h
index 3c47d3a..5ec891e 100644
--- a/libavcodec/sgi.h
+++ b/libavcodec/sgi.h
@@ -2,20 +2,20 @@
  * SGI image encoder
  * Xiaohui Sun <tjnksxh@hotmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/sgidec.c b/libavcodec/sgidec.c
index 6f93a30..02ad1e1 100644
--- a/libavcodec/sgidec.c
+++ b/libavcodec/sgidec.c
@@ -2,24 +2,25 @@
  * SGI image decoder
  * Todd Kirby <doubleshot@pacbell.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/imgutils.h"
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "bytestream.h"
 #include "internal.h"
@@ -41,7 +42,7 @@ typedef struct SgiState {
  * @param out_buf Points to one line after the output buffer.
  * @param len length of out_buf in bytes
  * @param pixelstride pixel stride of input buffer
- * @return size of output in bytes, -1 if buffer overflows
+ * @return size of output in bytes, else return error code.
  */
 static int expand_rle_row8(SgiState *s, uint8_t *out_buf,
                            int len, int pixelstride)
@@ -59,7 +60,7 @@ static int expand_rle_row8(SgiState *s, uint8_t *out_buf,
         }
 
         /* Check for buffer overflow. */
-        if (pixelstride * (count - 1) >= len) {
+        if (out_end - out_buf <= pixelstride * (count - 1)) {
             av_log(s->avctx, AV_LOG_ERROR, "Invalid pixel count.\n");
             return AVERROR_INVALIDDATA;
         }
@@ -97,7 +98,7 @@ static int expand_rle_row16(SgiState *s, uint16_t *out_buf,
             break;
 
         /* Check for buffer overflow. */
-        if (pixelstride * (count - 1) >= len) {
+        if (out_end - out_buf <= pixelstride * (count - 1)) {
             av_log(s->avctx, AV_LOG_ERROR, "Invalid pixel count.\n");
             return AVERROR_INVALIDDATA;
         }
@@ -125,7 +126,7 @@ static int expand_rle_row16(SgiState *s, uint16_t *out_buf,
  * Read a run length encoded SGI image.
  * @param out_buf output buffer
  * @param s the current image state
- * @return 0 if no error, else return error number.
+ * @return 0 if no error, else return error code.
  */
 static int read_rle_sgi(uint8_t *out_buf, SgiState *s)
 {
@@ -144,7 +145,7 @@ static int read_rle_sgi(uint8_t *out_buf, SgiState *s)
     for (z = 0; z < s->depth; z++) {
         dest_row = out_buf;
         for (y = 0; y < s->height; y++) {
-            linesize = s->width * s->depth * s->bytes_per_channel;
+            linesize = s->width * s->depth;
             dest_row -= s->linesize;
             start_offset = bytestream2_get_be32(&g_table);
             bytestream2_seek(&s->g, start_offset, SEEK_SET);
@@ -163,7 +164,7 @@ static int read_rle_sgi(uint8_t *out_buf, SgiState *s)
  * Read an uncompressed SGI image.
  * @param out_buf output buffer
  * @param s the current image state
- * @return 0 if read success, otherwise return -1.
+ * @return 0 if read success, else return error code.
  */
 static int read_uncompressed_sgi(unsigned char *out_buf, SgiState *s)
 {
@@ -215,27 +216,27 @@ static int decode_frame(AVCodecContext *avctx,
     }
 
     /* Test for SGI magic. */
-    if (bytestream2_get_be16(&s->g) != SGI_MAGIC) {
+    if (bytestream2_get_be16u(&s->g) != SGI_MAGIC) {
         av_log(avctx, AV_LOG_ERROR, "bad magic number\n");
         return AVERROR_INVALIDDATA;
     }
 
-    rle                  = bytestream2_get_byte(&s->g);
-    s->bytes_per_channel = bytestream2_get_byte(&s->g);
-    dimension            = bytestream2_get_be16(&s->g);
-    s->width             = bytestream2_get_be16(&s->g);
-    s->height            = bytestream2_get_be16(&s->g);
-    s->depth             = bytestream2_get_be16(&s->g);
+    rle                  = bytestream2_get_byteu(&s->g);
+    s->bytes_per_channel = bytestream2_get_byteu(&s->g);
+    dimension            = bytestream2_get_be16u(&s->g);
+    s->width             = bytestream2_get_be16u(&s->g);
+    s->height            = bytestream2_get_be16u(&s->g);
+    s->depth             = bytestream2_get_be16u(&s->g);
 
     if (s->bytes_per_channel != 1 && s->bytes_per_channel != 2) {
         av_log(avctx, AV_LOG_ERROR, "wrong channel number\n");
-        return AVERROR(EINVAL);
+        return AVERROR_INVALIDDATA;
     }
 
     /* Check for supported image dimensions. */
     if (dimension != 2 && dimension != 3) {
         av_log(avctx, AV_LOG_ERROR, "wrong dimension number\n");
-        return AVERROR(EINVAL);
+        return AVERROR_INVALIDDATA;
     }
 
     if (s->depth == SGI_GRAYSCALE) {
@@ -246,18 +247,15 @@ static int decode_frame(AVCodecContext *avctx,
         avctx->pix_fmt = s->bytes_per_channel == 2 ? AV_PIX_FMT_RGBA64BE : AV_PIX_FMT_RGBA;
     } else {
         av_log(avctx, AV_LOG_ERROR, "wrong picture format\n");
-        return AVERROR(EINVAL);
+        return AVERROR_INVALIDDATA;
     }
 
     ret = ff_set_dimensions(avctx, s->width, s->height);
     if (ret < 0)
         return ret;
 
-    ret = ff_get_buffer(avctx, p, 0);
-    if (ret < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed.\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
 
     p->pict_type = AV_PICTURE_TYPE_I;
     p->key_frame = 1;
@@ -274,13 +272,11 @@ static int decode_frame(AVCodecContext *avctx,
     } else {
         ret = read_uncompressed_sgi(out_buf, s);
     }
-
-    if (ret == 0) {
-        *got_frame = 1;
-        return avpkt->size;
-    } else {
+    if (ret)
         return ret;
-    }
+
+    *got_frame = 1;
+    return avpkt->size;
 }
 
 static av_cold int sgi_decode_init(AVCodecContext *avctx)
diff --git a/libavcodec/sgienc.c b/libavcodec/sgienc.c
index 07e224c..13756f1 100644
--- a/libavcodec/sgienc.c
+++ b/libavcodec/sgienc.c
@@ -2,20 +2,20 @@
  * SGI image encoder
  * Todd Kirby <doubleshot@pacbell.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -41,6 +41,7 @@ static av_cold int encode_init(AVCodecContext *avctx)
     if (avctx->width > 65535 || avctx->height > 65535) {
         av_log(avctx, AV_LOG_ERROR,
                "Unsupported resolution %dx%d.\n", avctx->width, avctx->height);
+        av_log(avctx, AV_LOG_ERROR, "SGI does not support resolutions above 65535x65535\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -167,10 +168,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
     else // assume sgi_rle_encode() produces at most 2x size of input
         length += tablesize * 2 + depth * height * (2 * width + 1);
 
-    if ((ret = ff_alloc_packet(pkt, bytes_per_channel * length)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n", length);
+    if ((ret = ff_alloc_packet2(avctx, pkt, bytes_per_channel * length, 0)) < 0)
         return ret;
-    }
 
     bytestream2_init_writer(&pbc, pkt->data, pkt->size);
 
diff --git a/libavcodec/sgirledec.c b/libavcodec/sgirledec.c
index 67d79c8..aa4f0e7 100644
--- a/libavcodec/sgirledec.c
+++ b/libavcodec/sgirledec.c
@@ -2,20 +2,20 @@
  * Silicon Graphics RLE 8-bit video decoder
  * Copyright (c) 2012 Peter Ross
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,17 +31,9 @@
 #include "avcodec.h"
 #include "internal.h"
 
-typedef struct SGIRLEContext {
-    AVFrame *frame;
-} SGIRLEContext;
-
 static av_cold int sgirle_decode_init(AVCodecContext *avctx)
 {
-    SGIRLEContext *s = avctx->priv_data;
     avctx->pix_fmt = AV_PIX_FMT_BGR8;
-    s->frame = av_frame_alloc();
-    if (!s->frame)
-        return AVERROR(ENOMEM);
     return 0;
 }
 
@@ -49,9 +41,9 @@ static av_cold int sgirle_decode_init(AVCodecContext *avctx)
  * Convert SGI RBG323 pixel into AV_PIX_FMT_BGR8
  * SGI RGB data is packed as 8bpp, (msb)3R 2B 3G(lsb)
  */
-#define RBG323_TO_BGR8(x) (((x << 3) & 0xC0) |                                \
-                           ((x << 3) & 0x38) |                                \
-                           ((x >> 5) & 7))
+#define RBG323_TO_BGR8(x) ((((x) << 3) & 0xC0) |                                \
+                           (((x) << 3) & 0x38) |                                \
+                           (((x) >> 5) & 7))
 static av_always_inline
 void rbg323_to_bgr8(uint8_t *dst, const uint8_t *src, int size)
 {
@@ -110,8 +102,8 @@ static int decode_sgirle8(AVCodecContext *avctx, uint8_t *dst,
                 v   -= length;
             } while (v > 0);
         } else {
-            av_log(avctx, AV_LOG_ERROR, "Invalid opcode %d.\n", v);
-            return AVERROR_INVALIDDATA;
+            avpriv_request_sample(avctx, "opcode %d", v);
+            return AVERROR_PATCHWELCOME;
         }
     }
     return 0;
@@ -120,41 +112,31 @@ static int decode_sgirle8(AVCodecContext *avctx, uint8_t *dst,
 static int sgirle_decode_frame(AVCodecContext *avctx, void *data,
                                int *got_frame, AVPacket *avpkt)
 {
-    SGIRLEContext *s = avctx->priv_data;
+    AVFrame *frame = data;
     int ret;
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
 
-    ret = decode_sgirle8(avctx, s->frame->data[0], avpkt->data, avpkt->size,
-                         avctx->width, avctx->height, s->frame->linesize[0]);
+    ret = decode_sgirle8(avctx, frame->data[0], avpkt->data, avpkt->size,
+                         avctx->width, avctx->height, frame->linesize[0]);
     if (ret < 0)
         return ret;
 
+    frame->pict_type = AV_PICTURE_TYPE_I;
+    frame->key_frame = 1;
+
     *got_frame = 1;
-    if ((ret = av_frame_ref(data, s->frame)) < 0)
-        return ret;
 
     return avpkt->size;
 }
 
-static av_cold int sgirle_decode_end(AVCodecContext *avctx)
-{
-    SGIRLEContext *s = avctx->priv_data;
-
-    av_frame_free(&s->frame);
-
-    return 0;
-}
-
 AVCodec ff_sgirle_decoder = {
     .name           = "sgirle",
     .long_name      = NULL_IF_CONFIG_SMALL("Silicon Graphics RLE 8-bit video"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_SGIRLE,
-    .priv_data_size = sizeof(SGIRLEContext),
     .init           = sgirle_decode_init,
-    .close          = sgirle_decode_end,
     .decode         = sgirle_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
 };
diff --git a/libavcodec/sh4/README b/libavcodec/sh4/README
new file mode 100644
index 0000000..8dd61fe
--- /dev/null
+++ b/libavcodec/sh4/README
@@ -0,0 +1,6 @@
+SH4 optimizations have been removed in
+commit d6096a67422534918405abb46dafbbac4608cbc3
+The last revision with the optimizations is cbfc9046e1c7e295b74f252902ae6f255eef4e78
+
+If you want to maintain these (or other) SH4 optimizations in ffmpeg, then please
+contact ffmpeg-devel@ffmpeg.org
diff --git a/libavcodec/sheervideo.c b/libavcodec/sheervideo.c
new file mode 100644
index 0000000..7ae5479
--- /dev/null
+++ b/libavcodec/sheervideo.c
@@ -0,0 +1,3156 @@
+/*
+ * BitJazz SheerVideo decoder
+ * Copyright (c) 2016 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "libavutil/intreadwrite.h"
+#include "avcodec.h"
+#include "get_bits.h"
+#include "internal.h"
+#include "thread.h"
+
+typedef struct SheerVideoContext {
+    unsigned format; /* NOTE(review): presumably identifies the stream variant being decoded - confirm */
+    VLC vlc[2];      /* two VLC tables; NOTE(review): likely built from the l_* tables below - confirm */
+    void (*decode_frame)(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb); /* decode callback stored per context */
+} SheerVideoContext;
+
+static const uint8_t l_r_rgb[256] = { /* 256 entries, none above 16; NOTE(review): presumably VLC code lengths - confirm */
+     3,  3,  4,  4,  5,  5,  5,  6,  6,  6,  7,  7,  7,  7,  7,  8,
+     8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10,
+    10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10,
+    10, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,  9,  9,  8,  8,  8,
+     8,  8,  8,  7,  7,  7,  7,  7,  6,  6,  6,  5,  5,  4,  4,  4,
+};
+
+static const uint8_t l_r_rgbi[256] = { /* 256 entries, none above 16; NOTE(review): presumably VLC code lengths - confirm */
+     3,  4,  4,  4,  5,  5,  5,  6,  6,  6,  7,  7,  7,  7,  7,  7,
+     8,  8,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10,  9,  9,  9,  9,  9,  9,  9,  8,  8,  8,  8,  8,  8,  8,
+     8,  7,  7,  7,  7,  7,  7,  6,  6,  6,  6,  5,  5,  4,  4,  4,
+};
+
+static const uint8_t l_g_rgbi[256] = { /* 256 entries, none above 16; NOTE(review): presumably VLC code lengths - confirm */
+     1,  3,  4,  5,  6,  7,  7,  8,  9,  9, 10, 10, 10, 10, 11, 11,
+    11, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14,
+    14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 14,
+    14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 12,
+    11, 11, 11, 10, 10, 10,  9,  9,  9,  8,  8,  7,  6,  5,  5,  3,
+};
+
+static const uint8_t l_g_rgb[256] = { /* 256 entries, none above 16; NOTE(review): presumably VLC code lengths - confirm */
+     2,  2,  4,  4,  6,  7,  9,  9, 10, 11, 11, 11, 12, 12, 12, 13,
+    13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13,
+    13, 13, 12, 12, 12, 11, 11, 11, 10,  9,  9,  8,  6,  4,  3,  3,
+};
+
+static const uint8_t l_y_ybr[256] = { /* 256 entries, none above 16; NOTE(review): presumably VLC code lengths - confirm */
+     3,  3,  4,  4,  5,  5,  6,  6,  6,  7,  7,  7,  7,  7,  8,  8,
+     8,  8,  8,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10,
+    10, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 10, 10, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,  9,  8,  8,
+     8,  8,  8,  8,  7,  7,  7,  7,  6,  6,  6,  6,  5,  4,  4,  3,
+};
+
+static const uint8_t l_u_ybr[256] = { /* 256 entries, none above 16; NOTE(review): presumably VLC code lengths - confirm */
+     1,  2,  4,  6,  9, 10, 11, 11, 12, 12, 13, 13, 13, 14, 14, 14,
+    14, 14, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15,
+    14, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10,  8,  5,  3,
+};
+
+static const uint8_t l_y_ybyr[256] = { /* 256 entries, none above 16; NOTE(review): presumably VLC code lengths - confirm */
+     3,  3,  4,  4,  5,  5,  5,  6,  6,  6,  7,  7,  7,  7,  7,  8,
+     8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10,
+    10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10,
+    10, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,  9,  9,  8,  8,  8,
+     8,  8,  8,  7,  7,  7,  7,  7,  6,  6,  6,  5,  5,  4,  4,  4,
+};
+
+static const uint8_t l_u_ybyr[256] = { /* 256 entries, none above 16; NOTE(review): presumably VLC code lengths - confirm */
+     1,  2,  4,  6,  8,  9, 10, 10, 11, 11, 12, 12, 12, 13, 13, 14,
+    14, 14, 14, 14, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14,
+    14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10,  9,  8,  7,  6,  3,
+};
+
+static const uint8_t l_y_byry[256] = { /* 256 entries, none above 16; NOTE(review): presumably VLC code lengths - confirm */
+     3,  3,  4,  4,  5,  5,  6,  6,  6,  7,  7,  7,  7,  7,  8,  8,
+     8,  8,  8,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10,
+    10, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11,
+    11, 10, 10, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,  9,  8,  8,
+     8,  8,  8,  8,  7,  7,  7,  7,  6,  6,  6,  6,  5,  4,  4,  3,
+};
+
+static const uint8_t l_u_byry[256] = { /* 256 entries, none above 16; NOTE(review): presumably VLC code lengths - confirm */
+     1,  2,  4,  6,  8,  9,  9, 10, 11, 11, 12, 12, 13, 13, 13, 14,
+    14, 14, 14, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 14,
+    14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10,  9,  8,  7,  6,  3,
+};
+
+static const uint8_t l_y_ybr10i[1024] = { /* 1024-entry table: values run from 3 (first entry) up to 16 in the middle and back down to 5 */
+     3,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,  7,  7,  7,  7,
+     7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+     8,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+     9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  9,  9,  9,  9,
+     9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+     8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,  7,  7,
+     7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,
+}; /* NOTE(review): the "l_" prefix presumably marks per-index code lengths -- confirm against the code that reads this table */
+
+static const uint8_t l_y_ybr10[1024] = { /* 1024-entry table: values run from 4 (first entry) up to 16 in the middle and back down to 5 */
+     4,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  7,  7,  7,
+     7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+     8,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+     9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  9,
+     9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+     8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,  7,
+     7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,
+}; /* NOTE(review): the "l_" prefix presumably marks per-index code lengths -- confirm against the code that reads this table */
+
+static const uint8_t l_u_ybr10i[1024] = { /* 1024-entry table: starts at 2, holds 16 for most of its length, ends at 3 */
+     2,  3,  4,  4,  5,  5,  6,  7,  7,  8,  8,  9,  9,  9,  9, 10,
+    10, 10, 10, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 10, 10,
+    10, 10, 10,  9,  9,  9,  8,  8,  8,  7,  6,  5,  5,  4,  4,  3,
+}; /* NOTE(review): the "l_" prefix presumably marks per-index code lengths -- confirm against the code that reads this table */
+
+static const uint8_t l_u_ybr10[1024] = { /* 1024-entry table: starts at 2, holds 16 for most of its length, ends at 3 */
+     2,  3,  3,  4,  5,  5,  6,  7,  8,  9,  9, 10, 10, 10, 11, 11,
+    12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 12, 12, 12,
+    12, 11, 11, 11, 10, 10,  9,  9,  8,  8,  7,  6,  5,  4,  4,  3,
+}; /* NOTE(review): the "l_" prefix presumably marks per-index code lengths -- confirm against the code that reads this table */
+
+static const uint8_t l_r_rgbx[1024] = { /* 1024-entry table: values run from 4 (first entry) up to 16 in the middle and back down to 5 */
+     4,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,  6,  7,  7,  7,
+     7,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+     8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+     9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,  9,
+     9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  8,  8,  8,
+     8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,  7,  7,  7,  7,
+     7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,
+}; /* NOTE(review): the "l_" prefix presumably marks per-index code lengths -- confirm against the code that reads this table */
+
+static const uint8_t l_g_rgbx[1024] = { /* 1024-entry table: starts at 3, holds 16 for most of its length, ends at 4 */
+     3,  4,  4,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,
+     8,  8,  9,  9,  9, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 12,
+    12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12,
+    12, 12, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10,  9,  9,  9,  9,
+     8,  7,  7,  7,  6,  6,  6,  6,  6,  5,  5,  5,  5,  4,  4,  4,
+}; /* NOTE(review): the "l_" prefix presumably marks per-index code lengths -- confirm against the code that reads this table */
+
+static const uint8_t l_y_yry10[1024] = { /* 1024-entry table: values run from 4 (first entry) up to 16 in the middle and back down to 5 */
+     4,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  7,  7,  7,
+     7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+     8,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+     9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  9,
+     9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+     8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,  7,
+     7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,
+}; /* NOTE(review): contents look value-for-value identical to l_y_ybr10 -- confirm before sharing one table; "l_" presumably marks code lengths, unverified here */
+
+static const uint8_t l_y_yry10i[1024] = { /* 1024 entries, all in 3..16; presumably per-symbol VLC code lengths (name suggests luma, "yry10i" variant) -- TODO confirm at the point of use */
+     3,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,  7,  7,  7,  7,
+     7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+     8,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+     9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  9,  9,  9,  9,
+     9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+     8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,  7,  7,
+     7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,
+};
+
+static const uint8_t l_u_yry10[1024] = { /* 1024 entries, all in 2..16; presumably per-symbol VLC code lengths (name suggests chroma, "yry10" variant) -- TODO confirm at the point of use */
+     2,  3,  3,  4,  5,  6,  7,  7,  8,  8,  8,  9,  9, 10, 10, 10,
+    10, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13,
+    13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13,
+    13, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11,
+    10, 10, 10, 10,  9,  9,  9,  8,  8,  7,  7,  6,  5,  4,  4,  3,
+};
+
+static const uint8_t l_u_yry10i[1024] = { /* 1024 entries, all in 2..16; presumably per-symbol VLC code lengths (name suggests chroma, "yry10i" variant) -- TODO confirm at the point of use */
+     2,  4,  4,  4,  5,  6,  6,  6,  7,  7,  7,  8,  8,  8,  9,  9,
+     9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11,
+    11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11,
+    11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10,  9,  9,  9,
+     9,  9,  8,  8,  8,  8,  7,  7,  7,  6,  6,  5,  5,  4,  4,  3,
+};
+
+static const uint8_t l_y_ybri[256] = { /* 256 entries, all in 3..16; presumably per-symbol VLC code lengths (name suggests luma, "ybri" variant) -- TODO confirm at the point of use */
+     3,  3,  4,  4,  5,  5,  6,  6,  6,  7,  7,  7,  7,  7,  8,  8,
+     8,  8,  8,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10,
+    10, 10, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,  9,  9,  8,  8,
+     8,  8,  8,  7,  7,  7,  7,  7,  6,  6,  6,  5,  5,  5,  4,  3,
+};
+
+static const uint8_t l_u_ybri[256] = { /* 256 entries, all in 1..16; presumably per-symbol VLC code lengths (name suggests chroma, "ybri" variant) -- TODO confirm at the point of use */
+     1,  3,  5,  6,  8,  8,  9, 10, 10, 11, 11, 12, 12, 13, 13, 13,
+    14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 14, 14, 14,
+    14, 13, 13, 13, 12, 12, 11, 11, 10, 10,  9,  8,  8,  6,  5,  2,
+};
+
+static const uint8_t l_y_byryi[256] = { /* 256 entries, all in 3..16; presumably per-symbol VLC code lengths (name suggests luma, "byryi" variant) -- TODO confirm at the point of use */
+     3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  7,  7,  7,  7,  8,  8,
+     8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10,
+    10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10,
+    10, 10, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,  9,  9,  8,  8,
+     8,  8,  8,  7,  7,  7,  7,  7,  7,  6,  6,  6,  5,  4,  4,  3,
+};
+
+static const uint8_t l_u_byryi[256] = { /* 256 entries, all in 1..16; presumably per-symbol VLC code lengths (name suggests chroma, "byryi" variant) -- TODO confirm at the point of use */
+     1,  3,  4,  6,  6,  7,  8,  8,  9,  9, 10, 10, 10, 11, 11, 11,
+    12, 12, 12, 12, 13, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15,
+    15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15,
+    15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12,
+    12, 11, 11, 11, 10, 10, 10,  9,  9,  8,  8,  7,  7,  5,  4,  3,
+};
+
+static const uint8_t l_r_rgbxi[1024] = { /* 1024 entries, all in 3..16; presumably per-symbol VLC code lengths (name suggests the red channel, "rgbxi" variant) -- TODO confirm at the point of use */
+     3,  4,  4,  4,  5,  5,  6,  6,  6,  7,  7,  7,  7,  8,  8,  8,
+     8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
+     9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  9,  9,
+     9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  8,  8,  8,
+     8,  8,  8,  8,  7,  7,  7,  7,  6,  6,  6,  5,  5,  4,  4,  4,
+};
+
+static const uint8_t l_g_rgbxi[1024] = { /* 1024 entries, all in 2..16; presumably per-symbol VLC code lengths (name suggests the green channel, "rgbxi" variant) -- TODO confirm at the point of use */
+     2,  3,  4,  4,  6,  6,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,
+     9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11,
+    11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11,
+    11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10,  9,  9,  9,
+     9,  8,  8,  8,  8,  7,  7,  7,  7,  7,  7,  6,  6,  4,  4,  3,
+};
+
+/* Decode a 10-bit four-plane frame whose rows are coded independently: a
+ * one-bit flag per row selects raw 10-bit samples (set) or VLC residuals
+ * added to the previous sample of the same row, wrapped to 10 bits (clear). */
+static void decode_ca4i(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    uint16_t *row_y = (uint16_t *)p->data[0];
+    uint16_t *row_u = (uint16_t *)p->data[1];
+    uint16_t *row_v = (uint16_t *)p->data[2];
+    uint16_t *row_a = (uint16_t *)p->data[3];
+    int row, col;
+
+    for (row = 0; row < avctx->height; row++) {
+        if (!get_bits1(gb)) {
+            /* Residual row (A, Y, U, V per pixel); planes restart from seeds. */
+            int left_y = 502, left_u = 512, left_v = 512, left_a = 502;
+
+            for (col = 0; col < avctx->width; col++) {
+                int res_a = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                int res_y = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                int res_u = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                int res_v = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                row_a[col] = left_a = (left_a + res_a) & 0x3ff;
+                row_y[col] = left_y = (left_y + res_y) & 0x3ff;
+                row_u[col] = left_u = (left_u + res_u) & 0x3ff;
+                row_v[col] = left_v = (left_v + res_v) & 0x3ff;
+            }
+        } else {
+            for (col = 0; col < avctx->width; col++) {
+                row_a[col] = get_bits(gb, 10);
+                row_y[col] = get_bits(gb, 10);
+                row_u[col] = get_bits(gb, 10);
+                row_v[col] = get_bits(gb, 10);
+            }
+        }
+
+        row_y += p->linesize[0] / 2;
+        row_u += p->linesize[1] / 2;
+        row_v += p->linesize[2] / 2;
+        row_a += p->linesize[3] / 2;
+    }
+}
+
+/* Decode a 10-bit four-plane frame using two-dimensional prediction. Every
+ * row begins with a one-bit flag: set = raw 10-bit samples, clear = VLC
+ * residuals. Row 0 predicts from the left sample only; later rows combine the
+ * left (L), top (T) and top-left (TL) samples as (3 * (T + L) - 2 * TL) >> 2. */
+static void decode_ca4p(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    uint16_t *row_y = (uint16_t *)p->data[0];
+    uint16_t *row_u = (uint16_t *)p->data[1];
+    uint16_t *row_v = (uint16_t *)p->data[2];
+    uint16_t *row_a = (uint16_t *)p->data[3];
+    int row, col;
+
+    if (!get_bits1(gb)) {
+        /* Residual first row: left prediction starting from fixed seeds. */
+        int left_y = 502, left_u = 512, left_v = 512, left_a = 502;
+
+        for (col = 0; col < avctx->width; col++) {
+            int res_a = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            int res_y = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            int res_u = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            int res_v = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+            row_a[col] = left_a = (left_a + res_a) & 0x3ff;
+            row_y[col] = left_y = (left_y + res_y) & 0x3ff;
+            row_u[col] = left_u = (left_u + res_u) & 0x3ff;
+            row_v[col] = left_v = (left_v + res_v) & 0x3ff;
+        }
+    } else {
+        /* Raw first row: samples are stored as-is in A, Y, U, V order. */
+        for (col = 0; col < avctx->width; col++) {
+            row_a[col] = get_bits(gb, 10);
+            row_y[col] = get_bits(gb, 10);
+            row_u[col] = get_bits(gb, 10);
+            row_v[col] = get_bits(gb, 10);
+        }
+    }
+
+    row_y += p->linesize[0] / 2;
+    row_u += p->linesize[1] / 2;
+    row_v += p->linesize[2] / 2;
+    row_a += p->linesize[3] / 2;
+
+    for (row = 1; row < avctx->height; row++) {
+        if (!get_bits1(gb)) {
+            const uint16_t *up_y = row_y - p->linesize[0] / 2;
+            const uint16_t *up_u = row_u - p->linesize[1] / 2;
+            const uint16_t *up_v = row_v - p->linesize[2] / 2;
+            const uint16_t *up_a = row_a - p->linesize[3] / 2;
+            /* Left and top-left both start from the first sample of the row above. */
+            int left_y = up_y[0], left_u = up_u[0], left_v = up_v[0], left_a = up_a[0];
+            int diag_y = left_y, diag_u = left_u, diag_v = left_v, diag_a = left_a;
+
+            for (col = 0; col < avctx->width; col++) {
+                const int top_y = up_y[col], top_u = up_u[col];
+                const int top_v = up_v[col], top_a = up_a[col];
+                int res_a = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                int res_y = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                int res_u = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                int res_v = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                row_a[col] = left_a = (res_a + ((3 * (top_a + left_a) - 2 * diag_a) >> 2)) & 0x3ff;
+                row_y[col] = left_y = (res_y + ((3 * (top_y + left_y) - 2 * diag_y) >> 2)) & 0x3ff;
+                row_u[col] = left_u = (res_u + ((3 * (top_u + left_u) - 2 * diag_u) >> 2)) & 0x3ff;
+                row_v[col] = left_v = (res_v + ((3 * (top_v + left_v) - 2 * diag_v) >> 2)) & 0x3ff;
+
+                diag_y = top_y;
+                diag_u = top_u;
+                diag_v = top_v;
+                diag_a = top_a;
+            }
+        } else {
+            /* Raw row: samples are stored as-is in A, Y, U, V order. */
+            for (col = 0; col < avctx->width; col++) {
+                row_a[col] = get_bits(gb, 10);
+                row_y[col] = get_bits(gb, 10);
+                row_u[col] = get_bits(gb, 10);
+                row_v[col] = get_bits(gb, 10);
+            }
+        }
+
+        row_y += p->linesize[0] / 2;
+        row_u += p->linesize[1] / 2;
+        row_v += p->linesize[2] / 2;
+        row_a += p->linesize[3] / 2;
+    }
+}
+
+/* Decode a 10-bit three-plane frame whose rows are coded independently: each
+ * row is either raw 10-bit samples or VLC residuals with left prediction. */
+static void decode_ybr10i(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    uint16_t *row_y = (uint16_t *)p->data[0];
+    uint16_t *row_u = (uint16_t *)p->data[1];
+    uint16_t *row_v = (uint16_t *)p->data[2];
+    int row, col;
+
+    for (row = 0; row < avctx->height; row++) {
+        if (!get_bits1(gb)) {
+            /* Residual row (Y, U, V per pixel); planes restart from seeds. */
+            int left_y = 502, left_u = 512, left_v = 512;
+
+            for (col = 0; col < avctx->width; col++) {
+                int res_y = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                int res_u = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                int res_v = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                row_y[col] = left_y = (left_y + res_y) & 0x3ff;
+                row_u[col] = left_u = (left_u + res_u) & 0x3ff;
+                row_v[col] = left_v = (left_v + res_v) & 0x3ff;
+            }
+        } else {
+            /* Raw row: the flag bit was set, samples are stored verbatim. */
+            for (col = 0; col < avctx->width; col++) {
+                row_y[col] = get_bits(gb, 10);
+                row_u[col] = get_bits(gb, 10);
+                row_v[col] = get_bits(gb, 10);
+            }
+        }
+
+        row_y += p->linesize[0] / 2;
+        row_u += p->linesize[1] / 2;
+        row_v += p->linesize[2] / 2;
+    }
+}
+
+static void decode_ybr10(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{ /* Fills planes 0..2 of p from gb: row 0 uses left prediction from constants, later rows predict from the left, top and top-left samples. */
+    SheerVideoContext *s = avctx->priv_data;
+    uint16_t *dst_y, *dst_u, *dst_v;
+    int x, y;
+
+    dst_y = (uint16_t *)p->data[0];
+    dst_u = (uint16_t *)p->data[1];
+    dst_v = (uint16_t *)p->data[2];
+
+    if (get_bits1(gb)) { /* row 0, flag set: raw 10-bit samples */
+        for (x = 0; x < avctx->width; x++) {
+            dst_y[x] = get_bits(gb, 10);
+            dst_u[x] = get_bits(gb, 10);
+            dst_v[x] = get_bits(gb, 10);
+        }
+    } else { /* row 0, flag clear: VLC residuals, no row above to predict from */
+        int pred[4] = { 502, 512, 512, 512 }; /* starting predictors; pred[3] is never read here */
+
+        for (x = 0; x < avctx->width; x++) {
+            int y, u, v; /* this y shadows the function-scope row counter */
+
+            y = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2); /* plane 0 reads with s->vlc[0] */
+            u = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2); /* planes 1 and 2 share s->vlc[1] */
+            v = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+            dst_y[x] = pred[0] = (y + pred[0]) & 0x3ff; /* masked to 10 bits; the stored value is the next predictor */
+            dst_u[x] = pred[1] = (u + pred[1]) & 0x3ff;
+            dst_v[x] = pred[2] = (v + pred[2]) & 0x3ff;
+        }
+    }
+
+    dst_y += p->linesize[0] / 2; /* step to row 1; the / 2 presumably turns a byte stride into uint16_t elements - confirm */
+    dst_u += p->linesize[1] / 2;
+    dst_v += p->linesize[2] / 2;
+
+    for (y = 1; y < avctx->height; y++) { /* remaining rows, each with its own 1-bit raw/VLC flag */
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x++) {
+                dst_y[x] = get_bits(gb, 10);
+                dst_u[x] = get_bits(gb, 10);
+                dst_v[x] = get_bits(gb, 10);
+            }
+        } else {
+            int pred_TL[4], pred_L[4], pred_T[4]; /* top-left, left and top neighbours; only indices 0..2 are used */
+            int y, u, v; /* this y shadows the row counter of the enclosing loop */
+
+            pred_TL[0] = pred_L[0] = dst_y[-p->linesize[0] / 2]; /* left and top-left both start as the first sample of the row above */
+            pred_TL[1] = pred_L[1] = dst_u[-p->linesize[1] / 2];
+            pred_TL[2] = pred_L[2] = dst_v[-p->linesize[2] / 2];
+
+            for (x = 0; x < avctx->width; x++) {
+                pred_T[0] = dst_y[-p->linesize[0] / 2 + x]; /* sample directly above the current one */
+                pred_T[1] = dst_u[-p->linesize[1] / 2 + x];
+                pred_T[2] = dst_v[-p->linesize[2] / 2 + x];
+
+                y = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                u = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                v = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_y[x] = pred_L[0] = (y + ((3 * (pred_T[0] + pred_L[0]) - 2 * pred_TL[0]) >> 2)) & 0x3ff; /* prediction is (3 * (T + L) - 2 * TL) >> 2; sum masked to 10 bits */
+                dst_u[x] = pred_L[1] = (u + ((3 * (pred_T[1] + pred_L[1]) - 2 * pred_TL[1]) >> 2)) & 0x3ff;
+                dst_v[x] = pred_L[2] = (v + ((3 * (pred_T[2] + pred_L[2]) - 2 * pred_TL[2]) >> 2)) & 0x3ff;
+
+                pred_TL[0] = pred_T[0]; /* the current top sample is the top-left of the next column */
+                pred_TL[1] = pred_T[1];
+                pred_TL[2] = pred_T[2];
+            }
+        }
+
+        dst_y += p->linesize[0] / 2;
+        dst_u += p->linesize[1] / 2;
+        dst_v += p->linesize[2] / 2;
+    }
+}
+
+static void decode_yry10i(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{ /* Fills plane 0 at full width and planes 1, 2 at one sample per two columns; every row is raw 10-bit samples or left-predicted VLC residuals. */
+    SheerVideoContext *s = avctx->priv_data;
+    uint16_t *dst_y, *dst_u, *dst_v;
+    int x, y;
+
+    dst_y = (uint16_t *)p->data[0];
+    dst_u = (uint16_t *)p->data[1];
+    dst_v = (uint16_t *)p->data[2];
+
+    for (y = 0; y < avctx->height; y++) {
+        if (get_bits1(gb)) { /* 1-bit row flag set: samples are stored raw */
+            for (x = 0; x < avctx->width; x += 2) { /* NOTE(review): with an odd width, x + 1 reaches index width - confirm the caller rules that out */
+                dst_y[x    ] = get_bits(gb, 10);
+                dst_u[x / 2] = get_bits(gb, 10);
+                dst_y[x + 1] = get_bits(gb, 10);
+                dst_v[x / 2] = get_bits(gb, 10);
+            }
+        } else { /* flag clear: VLC-coded residuals */
+            int pred[4] = { 502, 512, 512, 0 }; /* predictors restart from these constants on every row; pred[3] is never read here */
+
+            for (x = 0; x < avctx->width; x += 2) {
+                int y1, y2, u, v;
+
+                y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2); /* read order is y1, u, y2, v; plane 0 uses s->vlc[0] */
+                u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2); /* planes 1 and 2 share s->vlc[1] */
+                y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_y[x    ] = pred[0] = (y1 + pred[0]) & 0x3ff; /* masked to 10 bits; the stored value is the next predictor */
+                dst_u[x / 2] = pred[1] = (u  + pred[1]) & 0x3ff;
+                dst_y[x + 1] = pred[0] = (y2 + pred[0]) & 0x3ff; /* second column predicts from the sample just written */
+                dst_v[x / 2] = pred[2] = (v  + pred[2]) & 0x3ff;
+            }
+        }
+
+        dst_y += p->linesize[0] / 2; /* next row; the / 2 presumably turns a byte stride into uint16_t elements - confirm */
+        dst_u += p->linesize[1] / 2;
+        dst_v += p->linesize[2] / 2;
+    }
+}
+
+static void decode_yry10(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{ /* Fills plane 0 at full width and planes 1, 2 at one sample per two columns; row 0 uses left prediction, later rows also use the row above. */
+    SheerVideoContext *s = avctx->priv_data;
+    uint16_t *dst_y, *dst_u, *dst_v;
+    int x, y;
+
+    dst_y = (uint16_t *)p->data[0];
+    dst_u = (uint16_t *)p->data[1];
+    dst_v = (uint16_t *)p->data[2];
+
+    if (get_bits1(gb)) { /* row 0, flag set: raw 10-bit samples */
+        for (x = 0; x < avctx->width; x += 2) { /* NOTE(review): with an odd width, x + 1 reaches index width - confirm the caller rules that out */
+            dst_y[x    ] = get_bits(gb, 10);
+            dst_u[x / 2] = get_bits(gb, 10);
+            dst_y[x + 1] = get_bits(gb, 10);
+            dst_v[x / 2] = get_bits(gb, 10);
+        }
+    } else { /* row 0, flag clear: VLC residuals, no row above to predict from */
+        int pred[4] = { 502, 512, 512, 0 }; /* starting predictors; pred[3] is never read here */
+
+        for (x = 0; x < avctx->width; x += 2) {
+            int y1, y2, u, v;
+
+            y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2); /* read order is y1, u, y2, v; plane 0 uses s->vlc[0] */
+            u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2); /* planes 1 and 2 share s->vlc[1] */
+            y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+            dst_y[x    ] = pred[0] = (y1 + pred[0]) & 0x3ff; /* masked to 10 bits; the stored value is the next predictor */
+            dst_u[x / 2] = pred[1] = (u  + pred[1]) & 0x3ff;
+            dst_y[x + 1] = pred[0] = (y2 + pred[0]) & 0x3ff;
+            dst_v[x / 2] = pred[2] = (v  + pred[2]) & 0x3ff;
+        }
+    }
+
+    dst_y += p->linesize[0] / 2; /* step to row 1; the / 2 presumably turns a byte stride into uint16_t elements - confirm */
+    dst_u += p->linesize[1] / 2;
+    dst_v += p->linesize[2] / 2;
+
+    for (y = 1; y < avctx->height; y++) { /* remaining rows, each with its own 1-bit raw/VLC flag */
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x += 2) {
+                dst_y[x    ] = get_bits(gb, 10);
+                dst_u[x / 2] = get_bits(gb, 10);
+                dst_y[x + 1] = get_bits(gb, 10);
+                dst_v[x / 2] = get_bits(gb, 10);
+            }
+        } else {
+            int pred_TL[6], pred_L[6], pred_T[6]; /* top-left, left, top; this function touches indices 0..2 plus pred_T[3] only */
+            int y1, y2, u, v;
+
+            pred_TL[0] = pred_L[0] = dst_y[-p->linesize[0] / 2]; /* left and top-left both start as the first sample of the row above */
+            pred_TL[1] = pred_L[1] = dst_u[-p->linesize[1] / 2];
+            pred_TL[2] = pred_L[2] = dst_v[-p->linesize[2] / 2];
+
+            for (x = 0; x < avctx->width; x += 2) {
+                pred_T[0] = dst_y[-p->linesize[0] / 2 + x]; /* plane-0 sample above the first column of the pair */
+                pred_T[3] = dst_y[-p->linesize[0] / 2 + x + 1]; /* plane-0 sample above the second column */
+                pred_T[1] = dst_u[-p->linesize[1] / 2 + x / 2];
+                pred_T[2] = dst_v[-p->linesize[2] / 2 + x / 2];
+
+                y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_y[x    ] = pred_L[0] = (y1 + ((3 * (pred_T[0] + pred_L[0]) - 2 * pred_TL[0]) >> 2)) & 0x3ff; /* plane 0 predicts as (3 * (T + L) - 2 * TL) >> 2 */
+                dst_u[x / 2] = pred_L[1] = (u + (((pred_L[1] - pred_TL[1]) >> 1) + pred_T[1])) & 0x3ff; /* planes 1 and 2 predict as T + ((L - TL) >> 1) */
+                dst_y[x + 1] = pred_L[0] = (y2 + ((3 * (pred_T[3] + pred_L[0]) - 2 * pred_T[0]) >> 2)) & 0x3ff; /* second column: left is the sample just decoded, top-left is pred_T[0] */
+                dst_v[x / 2] = pred_L[2] = (v + (((pred_L[2] - pred_TL[2]) >> 1) + pred_T[2])) & 0x3ff;
+
+                pred_TL[0] = pred_T[3]; /* top of the second column is the top-left of the next pair */
+                pred_TL[1] = pred_T[1];
+                pred_TL[2] = pred_T[2];
+            }
+        }
+
+        dst_y += p->linesize[0] / 2;
+        dst_u += p->linesize[1] / 2;
+        dst_v += p->linesize[2] / 2;
+    }
+}
+
+static void decode_ca2i(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{ /* Fills planes 0 and 3 at full width and planes 1, 2 at one sample per two columns; every row is raw 10-bit samples or left-predicted VLC residuals. */
+    SheerVideoContext *s = avctx->priv_data;
+    uint16_t *dst_y, *dst_u, *dst_v, *dst_a;
+    int x, y;
+
+    dst_y = (uint16_t *)p->data[0];
+    dst_u = (uint16_t *)p->data[1];
+    dst_v = (uint16_t *)p->data[2];
+    dst_a = (uint16_t *)p->data[3];
+
+    for (y = 0; y < avctx->height; y++) {
+        if (get_bits1(gb)) { /* 1-bit row flag set: samples are stored raw */
+            for (x = 0; x < avctx->width; x += 2) { /* NOTE(review): with an odd width, x + 1 reaches index width - confirm the caller rules that out */
+                dst_a[x    ] = get_bits(gb, 10);
+                dst_y[x    ] = get_bits(gb, 10);
+                dst_u[x / 2] = get_bits(gb, 10);
+                dst_a[x + 1] = get_bits(gb, 10);
+                dst_y[x + 1] = get_bits(gb, 10);
+                dst_v[x / 2] = get_bits(gb, 10);
+            }
+        } else { /* flag clear: VLC-coded residuals */
+            int pred[4] = { 502, 512, 512, 502 }; /* predictors for planes 0..3, restarted on every row */
+
+            for (x = 0; x < avctx->width; x += 2) {
+                int y1, y2, u, v, a1, a2;
+
+                a1 = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2); /* read order is a1, y1, u, a2, y2, v */
+                y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2); /* only plane 0 uses s->vlc[0]; planes 1..3 use s->vlc[1] */
+                u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                a2 = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_y[x    ] = pred[0] = (y1 + pred[0]) & 0x3ff; /* masked to 10 bits; the stored value is the next predictor */
+                dst_u[x / 2] = pred[1] = (u  + pred[1]) & 0x3ff;
+                dst_y[x + 1] = pred[0] = (y2 + pred[0]) & 0x3ff;
+                dst_a[x    ] = pred[3] = (a1 + pred[3]) & 0x3ff;
+                dst_v[x / 2] = pred[2] = (v  + pred[2]) & 0x3ff;
+                dst_a[x + 1] = pred[3] = (a2 + pred[3]) & 0x3ff;
+            }
+        }
+
+        dst_y += p->linesize[0] / 2; /* next row; the / 2 presumably turns a byte stride into uint16_t elements - confirm */
+        dst_u += p->linesize[1] / 2;
+        dst_v += p->linesize[2] / 2;
+        dst_a += p->linesize[3] / 2;
+    }
+}
+
+static void decode_ca2p(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{ /* Fills planes 0 and 3 at full width and planes 1, 2 at one sample per two columns; row 0 uses left prediction, later rows also use the row above. */
+    SheerVideoContext *s = avctx->priv_data;
+    uint16_t *dst_y, *dst_u, *dst_v, *dst_a;
+    int x, y;
+
+    dst_y = (uint16_t *)p->data[0];
+    dst_u = (uint16_t *)p->data[1];
+    dst_v = (uint16_t *)p->data[2];
+    dst_a = (uint16_t *)p->data[3];
+
+    if (get_bits1(gb)) { /* row 0, flag set: raw 10-bit samples */
+        for (x = 0; x < avctx->width; x += 2) { /* NOTE(review): with an odd width, x + 1 reaches index width - confirm the caller rules that out */
+            dst_a[x    ] = get_bits(gb, 10);
+            dst_y[x    ] = get_bits(gb, 10);
+            dst_u[x / 2] = get_bits(gb, 10);
+            dst_a[x + 1] = get_bits(gb, 10);
+            dst_y[x + 1] = get_bits(gb, 10);
+            dst_v[x / 2] = get_bits(gb, 10);
+        }
+    } else { /* row 0, flag clear: VLC residuals, no row above to predict from */
+        int pred[4] = { 502, 512, 512, 502 }; /* starting predictors for planes 0..3 */
+
+        for (x = 0; x < avctx->width; x += 2) {
+            int y1, y2, u, v, a1, a2;
+
+            a1 = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2); /* read order is a1, y1, u, a2, y2, v */
+            y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2); /* only plane 0 uses s->vlc[0]; planes 1..3 use s->vlc[1] */
+            u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            a2 = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+            dst_y[x    ] = pred[0] = (y1 + pred[0]) & 0x3ff; /* masked to 10 bits; the stored value is the next predictor */
+            dst_u[x / 2] = pred[1] = (u  + pred[1]) & 0x3ff;
+            dst_y[x + 1] = pred[0] = (y2 + pred[0]) & 0x3ff;
+            dst_a[x    ] = pred[3] = (a1 + pred[3]) & 0x3ff;
+            dst_v[x / 2] = pred[2] = (v  + pred[2]) & 0x3ff;
+            dst_a[x + 1] = pred[3] = (a2 + pred[3]) & 0x3ff;
+        }
+    }
+
+    dst_y += p->linesize[0] / 2; /* step to row 1; the / 2 presumably turns a byte stride into uint16_t elements - confirm */
+    dst_u += p->linesize[1] / 2;
+    dst_v += p->linesize[2] / 2;
+    dst_a += p->linesize[3] / 2;
+
+    for (y = 1; y < avctx->height; y++) { /* remaining rows, each with its own 1-bit raw/VLC flag */
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x += 2) {
+                dst_a[x    ] = get_bits(gb, 10);
+                dst_y[x    ] = get_bits(gb, 10);
+                dst_u[x / 2] = get_bits(gb, 10);
+                dst_a[x + 1] = get_bits(gb, 10);
+                dst_y[x + 1] = get_bits(gb, 10);
+                dst_v[x / 2] = get_bits(gb, 10);
+            }
+        } else {
+            int pred_TL[6], pred_L[6], pred_T[6]; /* top-left, left, top; slots 0..2 serve planes 0..2, slots 4 and 5 serve plane 3 */
+            int y1, y2, u, v, a1, a2;
+
+            pred_TL[0] = pred_L[0] = dst_y[-p->linesize[0] / 2]; /* left and top-left both start as the first sample of the row above */
+            pred_TL[1] = pred_L[1] = dst_u[-p->linesize[1] / 2];
+            pred_TL[2] = pred_L[2] = dst_v[-p->linesize[2] / 2];
+            pred_TL[4] = pred_L[4] = dst_a[-p->linesize[3] / 2];
+
+            for (x = 0; x < avctx->width; x += 2) {
+                pred_T[0] = dst_y[-p->linesize[0] / 2 + x]; /* plane-0 sample above the first column of the pair */
+                pred_T[3] = dst_y[-p->linesize[0] / 2 + x + 1]; /* plane-0 sample above the second column */
+                pred_T[1] = dst_u[-p->linesize[1] / 2 + x / 2];
+                pred_T[2] = dst_v[-p->linesize[2] / 2 + x / 2];
+                pred_T[4] = dst_a[-p->linesize[3] / 2 + x]; /* plane-3 samples above the first and second column */
+                pred_T[5] = dst_a[-p->linesize[3] / 2 + x + 1];
+
+                a1 = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                a2 = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_y[x    ] = pred_L[0] = (y1 + ((3 * (pred_T[0] + pred_L[0]) - 2 * pred_TL[0]) >> 2)) & 0x3ff; /* planes 0 and 3 predict as (3 * (T + L) - 2 * TL) >> 2 */
+                dst_u[x / 2] = pred_L[1] = (u + (((pred_L[1] - pred_TL[1]) >> 1) + pred_T[1])) & 0x3ff; /* planes 1 and 2 predict as T + ((L - TL) >> 1) */
+                dst_y[x + 1] = pred_L[0] = (y2 + ((3 * (pred_T[3] + pred_L[0]) - 2 * pred_T[0]) >> 2)) & 0x3ff; /* second column: left is the sample just decoded, top-left is pred_T[0] */
+                dst_v[x / 2] = pred_L[2] = (v + (((pred_L[2] - pred_TL[2]) >> 1) + pred_T[2])) & 0x3ff;
+                dst_a[x    ] = pred_L[4] = (a1 + ((3 * (pred_T[4] + pred_L[4]) - 2 * pred_TL[4]) >> 2)) & 0x3ff;
+                dst_a[x + 1] = pred_L[4] = (a2 + ((3 * (pred_T[5] + pred_L[4]) - 2 * pred_T[4]) >> 2)) & 0x3ff;
+
+                pred_TL[0] = pred_T[3]; /* top of the second column is the top-left of the next pair */
+                pred_TL[1] = pred_T[1];
+                pred_TL[2] = pred_T[2];
+                pred_TL[4] = pred_T[5];
+            }
+        }
+
+        dst_y += p->linesize[0] / 2;
+        dst_u += p->linesize[1] / 2;
+        dst_v += p->linesize[2] / 2;
+        dst_a += p->linesize[3] / 2;
+    }
+}
+
+static void decode_c82i(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{ /* Fills 8-bit planes 0 and 3 at full width and planes 1, 2 at one sample per two columns; every row is raw bytes or left-predicted VLC residuals. */
+    SheerVideoContext *s = avctx->priv_data;
+    uint8_t *dst_y, *dst_u, *dst_v, *dst_a;
+    int x, y;
+
+    dst_y = p->data[0];
+    dst_u = p->data[1];
+    dst_v = p->data[2];
+    dst_a = p->data[3];
+
+    for (y = 0; y < avctx->height; y += 1) {
+        if (get_bits1(gb)) { /* 1-bit row flag set: samples are stored raw */
+            for (x = 0; x < avctx->width; x += 2) { /* NOTE(review): with an odd width, x + 1 reaches index width - confirm the caller rules that out */
+                dst_a[x    ] = get_bits(gb, 8);
+                dst_y[x    ] = get_bits(gb, 8);
+                dst_u[x / 2] = get_bits(gb, 8);
+                dst_a[x + 1] = get_bits(gb, 8);
+                dst_y[x + 1] = get_bits(gb, 8);
+                dst_v[x / 2] = get_bits(gb, 8);
+            }
+        } else { /* flag clear: VLC-coded residuals */
+            int pred[4] = { 125, -128, -128, 125 }; /* predictors for planes 0..3, restarted on every row; results are masked with 0xff below */
+
+            for (x = 0; x < avctx->width; x += 2) {
+                int y1, y2, u, v, a1, a2;
+
+                a1 = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2); /* read order is a1, y1, u, a2, y2, v */
+                y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2); /* only plane 0 uses s->vlc[0]; planes 1..3 use s->vlc[1] */
+                u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                a2 = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_y[x    ] = pred[0] = (y1 + pred[0]) & 0xff; /* masked to 8 bits; the stored value is the next predictor */
+                dst_y[x + 1] = pred[0] = (y2 + pred[0]) & 0xff;
+                dst_u[x / 2] = pred[1] = (u  + pred[1]) & 0xff;
+                dst_v[x / 2] = pred[2] = (v  + pred[2]) & 0xff;
+                dst_a[x    ] = pred[3] = (a1 + pred[3]) & 0xff;
+                dst_a[x + 1] = pred[3] = (a2 + pred[3]) & 0xff;
+            }
+        }
+
+        dst_y += p->linesize[0]; /* next row of each plane */
+        dst_u += p->linesize[1];
+        dst_v += p->linesize[2];
+        dst_a += p->linesize[3];
+    }
+}
+
+static void decode_c82p(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{ /* Fills 8-bit planes 0 and 3 at full width and planes 1, 2 at one sample per two columns; row 0 uses left prediction, later rows also use the row above. */
+    SheerVideoContext *s = avctx->priv_data;
+    uint8_t *dst_y, *dst_u, *dst_v, *dst_a;
+    int x, y;
+
+    dst_y = p->data[0];
+    dst_u = p->data[1];
+    dst_v = p->data[2];
+    dst_a = p->data[3];
+
+    if (get_bits1(gb)) { /* row 0, flag set: raw 8-bit samples */
+        for (x = 0; x < avctx->width; x += 2) { /* NOTE(review): with an odd width, x + 1 reaches index width - confirm the caller rules that out */
+            dst_a[x    ] = get_bits(gb, 8);
+            dst_y[x    ] = get_bits(gb, 8);
+            dst_u[x / 2] = get_bits(gb, 8);
+            dst_a[x + 1] = get_bits(gb, 8);
+            dst_y[x + 1] = get_bits(gb, 8);
+            dst_v[x / 2] = get_bits(gb, 8);
+        }
+    } else { /* row 0, flag clear: VLC residuals, no row above to predict from */
+        int pred[4] = { 125, -128, -128, 125 }; /* starting predictors for planes 0..3; results are masked with 0xff below */
+
+        for (x = 0; x < avctx->width; x += 2) {
+            int y1, y2, u, v, a1, a2;
+
+            a1 = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2); /* read order is a1, y1, u, a2, y2, v */
+            y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2); /* only plane 0 uses s->vlc[0]; planes 1..3 use s->vlc[1] */
+            u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            a2 = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+            y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+            dst_y[x    ] = pred[0] = (y1 + pred[0]) & 0xff; /* masked to 8 bits; the stored value is the next predictor */
+            dst_u[x / 2] = pred[1] = (u  + pred[1]) & 0xff;
+            dst_y[x + 1] = pred[0] = (y2 + pred[0]) & 0xff;
+            dst_a[x    ] = pred[3] = (a1 + pred[3]) & 0xff;
+            dst_v[x / 2] = pred[2] = (v  + pred[2]) & 0xff;
+            dst_a[x + 1] = pred[3] = (a2 + pred[3]) & 0xff;
+        }
+    }
+
+    dst_y += p->linesize[0]; /* step to row 1 of each plane */
+    dst_u += p->linesize[1];
+    dst_v += p->linesize[2];
+    dst_a += p->linesize[3];
+
+    for (y = 1; y < avctx->height; y++) { /* remaining rows, each with its own 1-bit raw/VLC flag */
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x += 2) {
+                dst_a[x    ] = get_bits(gb, 8);
+                dst_y[x    ] = get_bits(gb, 8);
+                dst_u[x / 2] = get_bits(gb, 8);
+                dst_a[x + 1] = get_bits(gb, 8);
+                dst_y[x + 1] = get_bits(gb, 8);
+                dst_v[x / 2] = get_bits(gb, 8);
+            }
+        } else {
+            int pred_TL[6], pred_L[6], pred_T[6]; /* top-left, left, top; slots 0..2 serve planes 0..2, slots 4 and 5 serve plane 3 */
+            int y1, y2, u, v, a1, a2;
+
+            pred_TL[0] = pred_L[0] = dst_y[-p->linesize[0]]; /* left and top-left both start as the first sample of the row above */
+            pred_TL[1] = pred_L[1] = dst_u[-p->linesize[1]];
+            pred_TL[2] = pred_L[2] = dst_v[-p->linesize[2]];
+            pred_TL[4] = pred_L[4] = dst_a[-p->linesize[3]];
+
+            for (x = 0; x < avctx->width; x += 2) {
+                pred_T[0] = dst_y[-p->linesize[0] + x]; /* plane-0 sample above the first column of the pair */
+                pred_T[3] = dst_y[-p->linesize[0] + x + 1]; /* plane-0 sample above the second column */
+                pred_T[1] = dst_u[-p->linesize[1] + x / 2];
+                pred_T[2] = dst_v[-p->linesize[2] + x / 2];
+                pred_T[4] = dst_a[-p->linesize[3] + x]; /* plane-3 samples above the first and second column */
+                pred_T[5] = dst_a[-p->linesize[3] + x + 1];
+
+                a1 = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                a2 = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_y[x    ] = pred_L[0] = (y1 + ((3 * (pred_T[0] + pred_L[0]) - 2 * pred_TL[0]) >> 2)) & 0xff; /* planes 0 and 3 predict as (3 * (T + L) - 2 * TL) >> 2 */
+                dst_u[x / 2] = pred_L[1] = (u + (((pred_L[1] - pred_TL[1]) >> 1) + pred_T[1])) & 0xff; /* planes 1 and 2 predict as T + ((L - TL) >> 1) */
+                dst_y[x + 1] = pred_L[0] = (y2 + ((3 * (pred_T[3] + pred_L[0]) - 2 * pred_T[0]) >> 2)) & 0xff; /* second column: left is the sample just decoded, top-left is pred_T[0] */
+                dst_v[x / 2] = pred_L[2] = (v + (((pred_L[2] - pred_TL[2]) >> 1) + pred_T[2])) & 0xff;
+                dst_a[x    ] = pred_L[4] = (a1 + ((3 * (pred_T[4] + pred_L[4]) - 2 * pred_TL[4]) >> 2)) & 0xff;
+                dst_a[x + 1] = pred_L[4] = (a2 + ((3 * (pred_T[5] + pred_L[4]) - 2 * pred_T[4]) >> 2)) & 0xff;
+
+                pred_TL[0] = pred_T[3]; /* top of the second column is the top-left of the next pair */
+                pred_TL[1] = pred_T[1];
+                pred_TL[2] = pred_T[2];
+                pred_TL[4] = pred_T[5];
+            }
+        }
+
+        dst_y += p->linesize[0];
+        dst_u += p->linesize[1];
+        dst_v += p->linesize[2];
+        dst_a += p->linesize[3];
+    }
+}
+
+static void decode_ybyr(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{ /* Fills 8-bit plane 0 at full width and planes 1, 2 at one sample per two columns; row 0 uses left prediction, later rows also use the row above. */
+    SheerVideoContext *s = avctx->priv_data;
+    uint8_t *dst_y, *dst_u, *dst_v;
+    int x, y;
+
+    dst_y = p->data[0];
+    dst_u = p->data[1];
+    dst_v = p->data[2];
+
+    if (get_bits1(gb)) { /* row 0, flag set: raw 8-bit samples */
+        for (x = 0; x < avctx->width; x += 2) { /* NOTE(review): with an odd width, x + 1 reaches index width - confirm the caller rules that out */
+            dst_y[x    ] = get_bits(gb, 8);
+            dst_u[x / 2] = get_bits(gb, 8);
+            dst_y[x + 1] = get_bits(gb, 8);
+            dst_v[x / 2] = get_bits(gb, 8);
+        }
+    } else { /* row 0, flag clear: VLC residuals, no row above to predict from */
+        int pred[4] = { -128, 128, 128, 0 }; /* starting predictors; pred[3] is never read here; results are masked with 0xff below */
+
+        for (x = 0; x < avctx->width; x += 2) {
+            int y1, y2, u, v;
+
+            y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2); /* read order is y1, u, y2, v; plane 0 uses s->vlc[0] */
+            u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2); /* planes 1 and 2 share s->vlc[1] */
+            y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+            dst_y[x    ] = pred[0] = (y1 + pred[0]) & 0xff; /* masked to 8 bits; the stored value is the next predictor */
+            dst_u[x / 2] = pred[1] = (u  + pred[1]) & 0xff;
+            dst_y[x + 1] = pred[0] = (y2 + pred[0]) & 0xff;
+            dst_v[x / 2] = pred[2] = (v  + pred[2]) & 0xff;
+        }
+    }
+
+    dst_y += p->linesize[0]; /* step to row 1 of each plane */
+    dst_u += p->linesize[1];
+    dst_v += p->linesize[2];
+
+    for (y = 1; y < avctx->height; y++) { /* remaining rows, each with its own 1-bit raw/VLC flag */
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x += 2) {
+                dst_y[x    ] = get_bits(gb, 8);
+                dst_u[x / 2] = get_bits(gb, 8);
+                dst_y[x + 1] = get_bits(gb, 8);
+                dst_v[x / 2] = get_bits(gb, 8);
+            }
+        } else {
+            int pred_TL[4], pred_L[4], pred_T[4]; /* top-left, left, top; pred_T[3] holds the top of the second plane-0 column */
+            int y1, y2, u, v;
+
+            pred_TL[0] = pred_L[0] = dst_y[-p->linesize[0]]; /* left and top-left both start as the first sample of the row above */
+            pred_TL[1] = pred_L[1] = dst_u[-p->linesize[1]];
+            pred_TL[2] = pred_L[2] = dst_v[-p->linesize[2]];
+
+            for (x = 0; x < avctx->width; x += 2) {
+                pred_T[0] = dst_y[-p->linesize[0] + x]; /* plane-0 sample above the first column of the pair */
+                pred_T[3] = dst_y[-p->linesize[0] + x + 1]; /* plane-0 sample above the second column */
+                pred_T[1] = dst_u[-p->linesize[1] + x / 2];
+                pred_T[2] = dst_v[-p->linesize[2] + x / 2];
+
+                y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_y[x    ] = pred_L[0] = (y1 + ((3 * (pred_T[0] + pred_L[0]) - 2 * pred_TL[0]) >> 2)) & 0xff; /* plane 0 predicts as (3 * (T + L) - 2 * TL) >> 2 */
+                dst_u[x / 2] = pred_L[1] = (u + (((pred_L[1] - pred_TL[1]) >> 1) + pred_T[1])) & 0xff; /* planes 1 and 2 predict as T + ((L - TL) >> 1) */
+                dst_y[x + 1] = pred_L[0] = (y2 + ((3 * (pred_T[3] + pred_L[0]) - 2 * pred_T[0]) >> 2)) & 0xff; /* second column: left is the sample just decoded, top-left is pred_T[0] */
+                dst_v[x / 2] = pred_L[2] = (v + (((pred_L[2] - pred_TL[2]) >> 1) + pred_T[2])) & 0xff;
+
+                pred_TL[0] = pred_T[3]; /* top of the second column is the top-left of the next pair */
+                pred_TL[1] = pred_T[1];
+                pred_TL[2] = pred_T[2];
+            }
+        }
+
+        dst_y += p->linesize[0];
+        dst_u += p->linesize[1];
+        dst_v += p->linesize[2];
+    }
+}
+
+static void decode_byryi(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{ /* Fills 8-bit plane 0 at full width and planes 1, 2 at one sample per two columns; all rows use left prediction only, seeded differently for row 0 and later rows. */
+    SheerVideoContext *s = avctx->priv_data;
+    uint8_t *dst_y, *dst_u, *dst_v;
+    int x, y;
+
+    dst_y = p->data[0];
+    dst_u = p->data[1];
+    dst_v = p->data[2];
+
+    if (get_bits1(gb)) { /* row 0, flag set: raw 8-bit samples */
+        for (x = 0; x < avctx->width; x += 2) { /* NOTE(review): with an odd width, x + 1 reaches index width - confirm the caller rules that out */
+            dst_y[x    ] = get_bits(gb, 8);
+            dst_u[x / 2] = get_bits(gb, 8);
+            dst_y[x + 1] = get_bits(gb, 8);
+            dst_v[x / 2] = get_bits(gb, 8);
+        }
+    } else { /* row 0, flag clear: VLC residuals predicted from constants */
+        int pred[4] = { 125, -128, -128, 0 }; /* starting predictors; pred[3] is never read here; results are masked with 0xff below */
+
+        for (x = 0; x < avctx->width; x += 2) {
+            int y1, y2, u, v;
+
+            y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2); /* read order is y1, u, y2, v; plane 0 uses s->vlc[0] */
+            u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2); /* planes 1 and 2 share s->vlc[1] */
+            y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+            dst_y[x    ] = pred[0] = (y1 + pred[0]) & 0xff; /* masked to 8 bits; the stored value is the next predictor */
+            dst_u[x / 2] = pred[1] = (u  + pred[1]) & 0xff;
+            dst_y[x + 1] = pred[0] = (y2 + pred[0]) & 0xff;
+            dst_v[x / 2] = pred[2] = (v  + pred[2]) & 0xff;
+        }
+    }
+
+    dst_y += p->linesize[0]; /* step to row 1 of each plane */
+    dst_u += p->linesize[1];
+    dst_v += p->linesize[2];
+
+    for (y = 1; y < avctx->height; y++) { /* remaining rows, each with its own 1-bit raw/VLC flag */
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x += 2) {
+                dst_y[x    ] = get_bits(gb, 8);
+                dst_u[x / 2] = get_bits(gb, 8);
+                dst_y[x + 1] = get_bits(gb, 8);
+                dst_v[x / 2] = get_bits(gb, 8);
+            }
+        } else {
+            int pred_L[4]; /* left predictors; only indices 0..2 are used */
+            int y1, y2, u, v;
+
+            pred_L[0] = dst_y[-p->linesize[0]]; /* seed with the first sample of the row above instead of constants */
+            pred_L[1] = dst_u[-p->linesize[1]];
+            pred_L[2] = dst_v[-p->linesize[2]];
+
+            for (x = 0; x < avctx->width; x += 2) {
+                y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_y[x    ] = pred_L[0] = (y1 + pred_L[0]) & 0xff; /* plain left prediction; the row above is not consulted past the seed */
+                dst_u[x / 2] = pred_L[1] = (u  + pred_L[1]) & 0xff;
+                dst_y[x + 1] = pred_L[0] = (y2 + pred_L[0]) & 0xff;
+                dst_v[x / 2] = pred_L[2] = (v +  pred_L[2]) & 0xff;
+            }
+        }
+
+        dst_y += p->linesize[0];
+        dst_u += p->linesize[1];
+        dst_v += p->linesize[2];
+    }
+}
+
+static void decode_byry(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{ /* Fills 8-bit plane 0 at full width and planes 1, 2 at one sample per two columns; row 0 uses left prediction, later rows also use the row above. */
+    SheerVideoContext *s = avctx->priv_data;
+    uint8_t *dst_y, *dst_u, *dst_v;
+    int x, y;
+
+    dst_y = p->data[0];
+    dst_u = p->data[1];
+    dst_v = p->data[2];
+
+    if (get_bits1(gb)) { /* row 0, flag set: raw 8-bit samples */
+        for (x = 0; x < avctx->width; x += 2) { /* NOTE(review): with an odd width, x + 1 reaches index width - confirm the caller rules that out */
+            dst_y[x    ] = get_bits(gb, 8);
+            dst_u[x / 2] = get_bits(gb, 8);
+            dst_y[x + 1] = get_bits(gb, 8);
+            dst_v[x / 2] = get_bits(gb, 8);
+        }
+    } else { /* row 0, flag clear: VLC residuals, no row above to predict from */
+        int pred[4] = { 125, -128, -128, 0 }; /* starting predictors; pred[3] is never read here. NOTE(review): appears to differ from decode_ybyr only in these values */
+
+        for (x = 0; x < avctx->width; x += 2) {
+            int y1, y2, u, v;
+
+            y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2); /* read order is y1, u, y2, v; plane 0 uses s->vlc[0] */
+            u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2); /* planes 1 and 2 share s->vlc[1] */
+            y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+            v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+            dst_y[x    ] = pred[0] = (y1 + pred[0]) & 0xff; /* masked to 8 bits; the stored value is the next predictor */
+            dst_u[x / 2] = pred[1] = (u  + pred[1]) & 0xff;
+            dst_y[x + 1] = pred[0] = (y2 + pred[0]) & 0xff;
+            dst_v[x / 2] = pred[2] = (v  + pred[2]) & 0xff;
+        }
+    }
+
+    dst_y += p->linesize[0]; /* step to row 1 of each plane */
+    dst_u += p->linesize[1];
+    dst_v += p->linesize[2];
+
+    for (y = 1; y < avctx->height; y++) { /* remaining rows, each with its own 1-bit raw/VLC flag */
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x += 2) {
+                dst_y[x    ] = get_bits(gb, 8);
+                dst_u[x / 2] = get_bits(gb, 8);
+                dst_y[x + 1] = get_bits(gb, 8);
+                dst_v[x / 2] = get_bits(gb, 8);
+            }
+        } else {
+            int pred_TL[4], pred_L[4], pred_T[4]; /* top-left, left, top; pred_T[3] holds the top of the second plane-0 column */
+            int y1, y2, u, v;
+
+            pred_TL[0] = pred_L[0] = dst_y[-p->linesize[0]]; /* left and top-left both start as the first sample of the row above */
+            pred_TL[1] = pred_L[1] = dst_u[-p->linesize[1]];
+            pred_TL[2] = pred_L[2] = dst_v[-p->linesize[2]];
+
+            for (x = 0; x < avctx->width; x += 2) {
+                pred_T[0] = dst_y[-p->linesize[0] + x]; /* plane-0 sample above the first column of the pair */
+                pred_T[3] = dst_y[-p->linesize[0] + x + 1]; /* plane-0 sample above the second column */
+                pred_T[1] = dst_u[-p->linesize[1] + x / 2];
+                pred_T[2] = dst_v[-p->linesize[2] + x / 2];
+
+                y1 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                u  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                y2 = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                v  = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_y[x    ] = pred_L[0] = (y1 + ((3 * (pred_T[0] + pred_L[0]) - 2 * pred_TL[0]) >> 2)) & 0xff; /* plane 0 predicts as (3 * (T + L) - 2 * TL) >> 2 */
+                dst_u[x / 2] = pred_L[1] = (u + (((pred_L[1] - pred_TL[1]) >> 1) + pred_T[1])) & 0xff; /* planes 1 and 2 predict as T + ((L - TL) >> 1) */
+                dst_y[x + 1] = pred_L[0] = (y2 + ((3 * (pred_T[3] + pred_L[0]) - 2 * pred_T[0]) >> 2)) & 0xff; /* second column: left is the sample just decoded, top-left is pred_T[0] */
+                dst_v[x / 2] = pred_L[2] = (v + (((pred_L[2] - pred_TL[2]) >> 1) + pred_T[2])) & 0xff;
+
+                pred_TL[0] = pred_T[3]; /* top of the second column is the top-left of the next pair */
+                pred_TL[1] = pred_T[1];
+                pred_TL[2] = pred_T[2];
+            }
+        }
+
+        dst_y += p->linesize[0];
+        dst_u += p->linesize[1];
+        dst_v += p->linesize[2];
+    }
+}
+
+static void decode_ybri(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{ /* Fills three full-width 8-bit planes of p from gb; all rows use left prediction only, seeded differently for row 0 and later rows. */
+    SheerVideoContext *s = avctx->priv_data;
+    uint8_t *dst_y, *dst_u, *dst_v;
+    int x, y;
+
+    dst_y = p->data[0];
+    dst_u = p->data[1];
+    dst_v = p->data[2];
+
+    if (get_bits1(gb)) { /* row 0, flag set: raw 8-bit samples */
+        for (x = 0; x < avctx->width; x++) {
+            dst_y[x] = get_bits(gb, 8);
+            dst_u[x] = get_bits(gb, 8);
+            dst_v[x] = get_bits(gb, 8);
+        }
+    } else { /* row 0, flag clear: VLC residuals predicted from constants */
+        int pred[4] = { 125, -128, -128, -128 }; /* starting predictors; pred[3] is never read here; results are masked with 0xff below */
+
+        for (x = 0; x < avctx->width; x++) {
+            int y, u, v; /* this y shadows the function-scope row counter */
+
+            y = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2); /* plane 0 reads with s->vlc[0] */
+            u = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2); /* planes 1 and 2 share s->vlc[1] */
+            v = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+            dst_y[x] = pred[0] = (y + pred[0]) & 0xff; /* masked to 8 bits; the stored value is the next predictor */
+            dst_u[x] = pred[1] = (u + pred[1]) & 0xff;
+            dst_v[x] = pred[2] = (v + pred[2]) & 0xff;
+        }
+    }
+
+    dst_y += p->linesize[0]; /* step to row 1 of each plane */
+    dst_u += p->linesize[1];
+    dst_v += p->linesize[2];
+
+    for (y = 1; y < avctx->height; y++) { /* remaining rows, each with its own 1-bit raw/VLC flag */
+        if (get_bits1(gb)) {
+            for (x = 0; x < avctx->width; x++) {
+                dst_y[x] = get_bits(gb, 8);
+                dst_u[x] = get_bits(gb, 8);
+                dst_v[x] = get_bits(gb, 8);
+            }
+        } else {
+            int pred_L[4]; /* left predictors; only indices 0..2 are used */
+            int y, u, v; /* this y shadows the row counter of the enclosing loop */
+
+            pred_L[0] = dst_y[-p->linesize[0]]; /* seed with the first sample of the row above instead of constants */
+            pred_L[1] = dst_u[-p->linesize[1]];
+            pred_L[2] = dst_v[-p->linesize[2]];
+
+            for (x = 0; x < avctx->width; x++) {
+                y = get_vlc2(gb, s->vlc[0].table, s->vlc[0].bits, 2);
+                u = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+                v = get_vlc2(gb, s->vlc[1].table, s->vlc[1].bits, 2);
+
+                dst_y[x] = pred_L[0] = (y + pred_L[0]) & 0xff; /* plain left prediction; the row above is not consulted past the seed */
+                dst_u[x] = pred_L[1] = (u + pred_L[1]) & 0xff;
+                dst_v[x] = pred_L[2] = (v + pred_L[2]) & 0xff;
+            }
+        }
+
+        dst_y += p->linesize[0];
+        dst_u += p->linesize[1];
+        dst_v += p->linesize[2];
+    }
+}
+
+/**
+ * Decode one frame into three 8-bit planes: Y, U, V in p->data[0], [1], [2].
+ *
+ * Every row starts with a one-bit flag. When it is set, the row is stored as
+ * raw 8-bit samples, interleaved Y, U, V per pixel. When it is clear, each
+ * sample is a VLC residual (s->vlc[0] for Y, s->vlc[1] for U and V) that is
+ * added to a prediction and wrapped to 8 bits. The first row predicts from
+ * the left neighbour (seeds 125, -128, -128); later rows predict with
+ * (3 * (top + left) - 2 * topleft) >> 2.
+ */
+static void decode_ybr(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    VLC *vlc0 = &s->vlc[0], *vlc1 = &s->vlc[1];
+    const int width = avctx->width, height = avctx->height;
+    const int ls_y = p->linesize[0], ls_u = p->linesize[1], ls_v = p->linesize[2];
+    uint8_t *row_y = p->data[0], *row_u = p->data[1], *row_v = p->data[2];
+    int col, line;
+
+    if (!get_bits1(gb)) {
+        /* coded first row: plain left prediction from fixed seeds */
+        int left_y = 125, left_u = -128, left_v = -128;
+
+        for (col = 0; col < width; col++) {
+            const int dy = get_vlc2(gb, vlc0->table, vlc0->bits, 2);
+            const int du = get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+            const int dv = get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+
+            row_y[col] = left_y = (dy + left_y) & 0xff;
+            row_u[col] = left_u = (du + left_u) & 0xff;
+            row_v[col] = left_v = (dv + left_v) & 0xff;
+        }
+    } else {
+        for (col = 0; col < width; col++) {
+            row_y[col] = get_bits(gb, 8);
+            row_u[col] = get_bits(gb, 8);
+            row_v[col] = get_bits(gb, 8);
+        }
+    }
+
+    for (line = 1; line < height; line++) {
+        const uint8_t *top_y = row_y, *top_u = row_u, *top_v = row_v;
+
+        row_y += ls_y;
+        row_u += ls_u;
+        row_v += ls_v;
+
+        if (!get_bits1(gb)) {
+            /* left and top-left both start as the sample above column 0 */
+            int left_y = top_y[0], left_u = top_u[0], left_v = top_v[0];
+            int diag_y = left_y, diag_u = left_u, diag_v = left_v;
+
+            for (col = 0; col < width; col++) {
+                const int up_y = top_y[col], up_u = top_u[col], up_v = top_v[col];
+                const int dy = get_vlc2(gb, vlc0->table, vlc0->bits, 2);
+                const int du = get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+                const int dv = get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+
+                row_y[col] = left_y = (dy + ((3 * (up_y + left_y) - 2 * diag_y) >> 2)) & 0xff;
+                row_u[col] = left_u = (du + ((3 * (up_u + left_u) - 2 * diag_u) >> 2)) & 0xff;
+                row_v[col] = left_v = (dv + ((3 * (up_v + left_v) - 2 * diag_v) >> 2)) & 0xff;
+
+                diag_y = up_y;
+                diag_u = up_u;
+                diag_v = up_v;
+            }
+        } else {
+            for (col = 0; col < width; col++) {
+                row_y[col] = get_bits(gb, 8);
+                row_u[col] = get_bits(gb, 8);
+                row_v[col] = get_bits(gb, 8);
+            }
+        }
+    }
+}
+
+/**
+ * Decode one frame into four 8-bit planes: A in p->data[3] and Y, U, V in
+ * p->data[0], [1], [2]. Only the left neighbour is used for prediction.
+ *
+ * Every row starts with a one-bit flag. When it is set, the row is stored as
+ * raw 8-bit samples, interleaved A, Y, U, V per pixel. When it is clear, each
+ * sample is a VLC residual (s->vlc[0] for Y, s->vlc[1] for A, U and V) added
+ * to the sample on its left and wrapped to 8 bits. Column 0 predicts from a
+ * fixed seed on the first row and from the sample directly above afterwards.
+ */
+static void decode_aybri(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    VLC *vlc0 = &s->vlc[0], *vlc1 = &s->vlc[1];
+    const int width = avctx->width, height = avctx->height;
+    const int ls_a = p->linesize[3], ls_y = p->linesize[0];
+    const int ls_u = p->linesize[1], ls_v = p->linesize[2];
+    uint8_t *row_a = p->data[3], *row_y = p->data[0];
+    uint8_t *row_u = p->data[1], *row_v = p->data[2];
+    int col, line;
+
+    if (!get_bits1(gb)) {
+        /* coded first row: left prediction from fixed seeds */
+        int left_a = 125, left_y = 125, left_u = -128, left_v = -128;
+
+        for (col = 0; col < width; col++) {
+            const int da = get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+            const int dy = get_vlc2(gb, vlc0->table, vlc0->bits, 2);
+            const int du = get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+            const int dv = get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+
+            row_a[col] = left_a = (da + left_a) & 0xff;
+            row_y[col] = left_y = (dy + left_y) & 0xff;
+            row_u[col] = left_u = (du + left_u) & 0xff;
+            row_v[col] = left_v = (dv + left_v) & 0xff;
+        }
+    } else {
+        for (col = 0; col < width; col++) {
+            row_a[col] = get_bits(gb, 8);
+            row_y[col] = get_bits(gb, 8);
+            row_u[col] = get_bits(gb, 8);
+            row_v[col] = get_bits(gb, 8);
+        }
+    }
+
+    for (line = 1; line < height; line++) {
+        row_a += ls_a;
+        row_y += ls_y;
+        row_u += ls_u;
+        row_v += ls_v;
+
+        if (!get_bits1(gb)) {
+            /* coded row: column 0 predicts from the samples directly above */
+            int left_a = row_a[-ls_a], left_y = row_y[-ls_y];
+            int left_u = row_u[-ls_u], left_v = row_v[-ls_v];
+
+            for (col = 0; col < width; col++) {
+                const int da = get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+                const int dy = get_vlc2(gb, vlc0->table, vlc0->bits, 2);
+                const int du = get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+                const int dv = get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+
+                row_a[col] = left_a = (da + left_a) & 0xff;
+                row_y[col] = left_y = (dy + left_y) & 0xff;
+                row_u[col] = left_u = (du + left_u) & 0xff;
+                row_v[col] = left_v = (dv + left_v) & 0xff;
+            }
+        } else {
+            for (col = 0; col < width; col++) {
+                row_a[col] = get_bits(gb, 8);
+                row_y[col] = get_bits(gb, 8);
+                row_u[col] = get_bits(gb, 8);
+                row_v[col] = get_bits(gb, 8);
+            }
+        }
+    }
+}
+
+/**
+ * Decode one frame into four 8-bit planes: A in p->data[3] and Y, U, V in
+ * p->data[0], [1], [2]. Rows start with a one-bit flag: set means raw 8-bit
+ * samples interleaved A, Y, U, V; clear means VLC residuals (s->vlc[0] for
+ * Y, s->vlc[1] for A, U and V) added to a prediction and wrapped to 8 bits.
+ * The first row predicts from the left neighbour (seeds 125, 125, -128,
+ * -128); later rows predict with (3 * (top + left) - 2 * topleft) >> 2.
+ */
+static void decode_aybr(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    VLC *vlc0 = &s->vlc[0], *vlc1 = &s->vlc[1];
+    const int width = avctx->width, height = avctx->height;
+    const int ls_a = p->linesize[3], ls_y = p->linesize[0];
+    const int ls_u = p->linesize[1], ls_v = p->linesize[2];
+    uint8_t *row_a = p->data[3], *row_y = p->data[0];
+    uint8_t *row_u = p->data[1], *row_v = p->data[2];
+    int col, line;
+
+    if (!get_bits1(gb)) {
+        /* coded first row: plain left prediction from fixed seeds */
+        int left_a = 125, left_y = 125, left_u = -128, left_v = -128;
+
+        for (col = 0; col < width; col++) {
+            const int da = get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+            const int dy = get_vlc2(gb, vlc0->table, vlc0->bits, 2);
+            const int du = get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+            const int dv = get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+
+            row_a[col] = left_a = (da + left_a) & 0xff;
+            row_y[col] = left_y = (dy + left_y) & 0xff;
+            row_u[col] = left_u = (du + left_u) & 0xff;
+            row_v[col] = left_v = (dv + left_v) & 0xff;
+        }
+    } else {
+        for (col = 0; col < width; col++) {
+            row_a[col] = get_bits(gb, 8);
+            row_y[col] = get_bits(gb, 8);
+            row_u[col] = get_bits(gb, 8);
+            row_v[col] = get_bits(gb, 8);
+        }
+    }
+
+    for (line = 1; line < height; line++) {
+        const uint8_t *top_a = row_a, *top_y = row_y;
+        const uint8_t *top_u = row_u, *top_v = row_v;
+
+        row_a += ls_a;
+        row_y += ls_y;
+        row_u += ls_u;
+        row_v += ls_v;
+
+        if (!get_bits1(gb)) {
+            /* left and top-left both start as the sample above column 0 */
+            int left_a = top_a[0], left_y = top_y[0];
+            int left_u = top_u[0], left_v = top_v[0];
+            int diag_a = left_a, diag_y = left_y;
+            int diag_u = left_u, diag_v = left_v;
+
+            for (col = 0; col < width; col++) {
+                const int up_a = top_a[col], up_y = top_y[col];
+                const int up_u = top_u[col], up_v = top_v[col];
+                const int da = get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+                const int dy = get_vlc2(gb, vlc0->table, vlc0->bits, 2);
+                const int du = get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+                const int dv = get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+
+                row_a[col] = left_a = (da + ((3 * (up_a + left_a) - 2 * diag_a) >> 2)) & 0xff;
+                row_y[col] = left_y = (dy + ((3 * (up_y + left_y) - 2 * diag_y) >> 2)) & 0xff;
+                row_u[col] = left_u = (du + ((3 * (up_u + left_u) - 2 * diag_u) >> 2)) & 0xff;
+                row_v[col] = left_v = (dv + ((3 * (up_v + left_v) - 2 * diag_v) >> 2)) & 0xff;
+
+                diag_a = up_a;
+                diag_y = up_y;
+                diag_u = up_u;
+                diag_v = up_v;
+            }
+        } else {
+            for (col = 0; col < width; col++) {
+                row_a[col] = get_bits(gb, 8);
+                row_y[col] = get_bits(gb, 8);
+                row_u[col] = get_bits(gb, 8);
+                row_v[col] = get_bits(gb, 8);
+            }
+        }
+    }
+}
+
+/* Decode 10-bit planar A, R, G, B (p->data[3], [2], [0], [1]). Each row is
+ * either raw or left-predicted from 512 with no reference to other rows;
+ * the G delta is the R + G codes, the B delta the R + G + B codes. */
+static void decode_argxi(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    VLC *vlc0 = &s->vlc[0], *vlc1 = &s->vlc[1];
+    const int width = avctx->width, height = avctx->height;
+    const int st_r = p->linesize[2] / 2, st_g = p->linesize[0] / 2;
+    const int st_b = p->linesize[1] / 2, st_a = p->linesize[3] / 2;
+    uint16_t *row_r = (uint16_t *)p->data[2], *row_g = (uint16_t *)p->data[0];
+    uint16_t *row_b = (uint16_t *)p->data[1], *row_a = (uint16_t *)p->data[3];
+    int col, line;
+
+    for (line = 0; line < height; line++) {
+        if (!get_bits1(gb)) {
+            int left_a = 512, left_r = 512, left_g = 512, left_b = 512;
+
+            for (col = 0; col < width; col++) {
+                const int da = get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+                const int dr = get_vlc2(gb, vlc0->table, vlc0->bits, 2);
+                const int dg = dr + get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+                const int db = dg + get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+
+                row_a[col] = left_a = (da + left_a) & 0x3ff;
+                row_r[col] = left_r = (dr + left_r) & 0x3ff;
+                row_g[col] = left_g = (dg + left_g) & 0x3ff;
+                row_b[col] = left_b = (db + left_b) & 0x3ff;
+            }
+        } else {
+            for (col = 0; col < width; col++) {
+                row_a[col] = get_bits(gb, 10);
+                row_r[col] = get_bits(gb, 10);
+                row_g[col] = get_bits(gb, 10);
+                row_b[col] = get_bits(gb, 10);
+            }
+        }
+        row_r += st_r;
+        row_g += st_g;
+        row_b += st_b;
+        row_a += st_a;
+    }
+}
+
+/**
+ * Decode one frame into four 10-bit uint16_t planes: A in p->data[3], R in
+ * p->data[2], G in p->data[0], B in p->data[1]. Rows start with a one-bit
+ * flag: set means raw 10-bit samples interleaved A, R, G, B; clear means VLC
+ * residuals added to a prediction and wrapped to 10 bits (the G delta sums
+ * the R and G codes, the B delta the R, G and B codes). The first row predicts
+ * from the left, seeded with 512; later rows use (3*(T + L) - 2*TL) >> 2.
+ */
+static void decode_argx(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    VLC *vlc0 = &s->vlc[0], *vlc1 = &s->vlc[1];
+    const int width = avctx->width, height = avctx->height;
+    const int st_r = p->linesize[2] / 2, st_g = p->linesize[0] / 2;
+    const int st_b = p->linesize[1] / 2, st_a = p->linesize[3] / 2;
+    uint16_t *row_r = (uint16_t *)p->data[2], *row_g = (uint16_t *)p->data[0];
+    uint16_t *row_b = (uint16_t *)p->data[1], *row_a = (uint16_t *)p->data[3];
+    int col, line;
+
+    if (!get_bits1(gb)) {
+        /* coded first row: plain left prediction from mid-range seeds */
+        int left_a = 512, left_r = 512, left_g = 512, left_b = 512;
+
+        for (col = 0; col < width; col++) {
+            const int da = get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+            const int dr = get_vlc2(gb, vlc0->table, vlc0->bits, 2);
+            const int dg = dr + get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+            const int db = dg + get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+
+            row_a[col] = left_a = (da + left_a) & 0x3ff;
+            row_r[col] = left_r = (dr + left_r) & 0x3ff;
+            row_g[col] = left_g = (dg + left_g) & 0x3ff;
+            row_b[col] = left_b = (db + left_b) & 0x3ff;
+        }
+    } else {
+        for (col = 0; col < width; col++) {
+            row_a[col] = get_bits(gb, 10);
+            row_r[col] = get_bits(gb, 10);
+            row_g[col] = get_bits(gb, 10);
+            row_b[col] = get_bits(gb, 10);
+        }
+    }
+
+    for (line = 1; line < height; line++) {
+        const uint16_t *top_r = row_r, *top_g = row_g;
+        const uint16_t *top_b = row_b, *top_a = row_a;
+
+        row_r += st_r;
+        row_g += st_g;
+        row_b += st_b;
+        row_a += st_a;
+
+        if (!get_bits1(gb)) {
+            /* left and top-left both start as the sample above column 0 */
+            int left_r = top_r[0], left_g = top_g[0];
+            int left_b = top_b[0], left_a = top_a[0];
+            int diag_r = left_r, diag_g = left_g;
+            int diag_b = left_b, diag_a = left_a;
+
+            for (col = 0; col < width; col++) {
+                const int up_r = top_r[col], up_g = top_g[col];
+                const int up_b = top_b[col], up_a = top_a[col];
+                const int da = get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+                const int dr = get_vlc2(gb, vlc0->table, vlc0->bits, 2);
+                const int dg = dr + get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+                const int db = dg + get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+
+                row_a[col] = left_a = (da + ((3 * (up_a + left_a) - 2 * diag_a) >> 2)) & 0x3ff;
+                row_r[col] = left_r = (dr + ((3 * (up_r + left_r) - 2 * diag_r) >> 2)) & 0x3ff;
+                row_g[col] = left_g = (dg + ((3 * (up_g + left_g) - 2 * diag_g) >> 2)) & 0x3ff;
+                row_b[col] = left_b = (db + ((3 * (up_b + left_b) - 2 * diag_b) >> 2)) & 0x3ff;
+
+                diag_r = up_r;
+                diag_g = up_g;
+                diag_b = up_b;
+                diag_a = up_a;
+            }
+        } else {
+            for (col = 0; col < width; col++) {
+                row_a[col] = get_bits(gb, 10);
+                row_r[col] = get_bits(gb, 10);
+                row_g[col] = get_bits(gb, 10);
+                row_b[col] = get_bits(gb, 10);
+            }
+        }
+    }
+}
+
+/* Decode 10-bit planar R, G, B (p->data[2], [0], [1]). Rows are independent:
+ * raw, or left-predicted from 512 with each delta summing the codes so far. */
+static void decode_rgbxi(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    VLC *vlc0 = &s->vlc[0], *vlc1 = &s->vlc[1];
+    const int width = avctx->width, height = avctx->height;
+    const int st_r = p->linesize[2] / 2, st_g = p->linesize[0] / 2;
+    const int st_b = p->linesize[1] / 2;
+    uint16_t *row_r = (uint16_t *)p->data[2], *row_g = (uint16_t *)p->data[0];
+    uint16_t *row_b = (uint16_t *)p->data[1];
+    int col, line;
+
+    for (line = 0; line < height; line++) {
+        if (!get_bits1(gb)) {
+            int left_r = 512, left_g = 512, left_b = 512;
+
+            for (col = 0; col < width; col++) {
+                const int dr = get_vlc2(gb, vlc0->table, vlc0->bits, 2);
+                const int dg = dr + get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+                const int db = dg + get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+
+                row_r[col] = left_r = (dr + left_r) & 0x3ff;
+                row_g[col] = left_g = (dg + left_g) & 0x3ff;
+                row_b[col] = left_b = (db + left_b) & 0x3ff;
+            }
+        } else {
+            for (col = 0; col < width; col++) {
+                row_r[col] = get_bits(gb, 10);
+                row_g[col] = get_bits(gb, 10);
+                row_b[col] = get_bits(gb, 10);
+            }
+        }
+        row_r += st_r;
+        row_g += st_g;
+        row_b += st_b;
+    }
+}
+
+/**
+ * Decode one frame into three 10-bit uint16_t planes: R in p->data[2], G in
+ * p->data[0], B in p->data[1]. Rows start with a one-bit flag: set means raw
+ * 10-bit samples interleaved R, G, B; clear means VLC residuals added to a
+ * prediction and wrapped to 10 bits (the G delta sums the R and G codes, the
+ * B delta the R, G and B codes). The first row predicts from the left, seeded
+ * with 512; later rows use (3 * (top + left) - 2 * topleft) >> 2.
+ */
+static void decode_rgbx(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    VLC *vlc0 = &s->vlc[0], *vlc1 = &s->vlc[1];
+    const int width = avctx->width, height = avctx->height;
+    const int st_r = p->linesize[2] / 2, st_g = p->linesize[0] / 2;
+    const int st_b = p->linesize[1] / 2;
+    uint16_t *row_r = (uint16_t *)p->data[2], *row_g = (uint16_t *)p->data[0];
+    uint16_t *row_b = (uint16_t *)p->data[1];
+    int col, line;
+
+    if (!get_bits1(gb)) {
+        /* coded first row: plain left prediction from mid-range seeds */
+        int left_r = 512, left_g = 512, left_b = 512;
+
+        for (col = 0; col < width; col++) {
+            const int dr = get_vlc2(gb, vlc0->table, vlc0->bits, 2);
+            const int dg = dr + get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+            const int db = dg + get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+
+            row_r[col] = left_r = (dr + left_r) & 0x3ff;
+            row_g[col] = left_g = (dg + left_g) & 0x3ff;
+            row_b[col] = left_b = (db + left_b) & 0x3ff;
+        }
+    } else {
+        for (col = 0; col < width; col++) {
+            row_r[col] = get_bits(gb, 10);
+            row_g[col] = get_bits(gb, 10);
+            row_b[col] = get_bits(gb, 10);
+        }
+    }
+
+    for (line = 1; line < height; line++) {
+        const uint16_t *top_r = row_r, *top_g = row_g, *top_b = row_b;
+
+        row_r += st_r;
+        row_g += st_g;
+        row_b += st_b;
+
+        if (!get_bits1(gb)) {
+            /* left and top-left both start as the sample above column 0 */
+            int left_r = top_r[0], left_g = top_g[0], left_b = top_b[0];
+            int diag_r = left_r, diag_g = left_g, diag_b = left_b;
+
+            for (col = 0; col < width; col++) {
+                const int up_r = top_r[col], up_g = top_g[col], up_b = top_b[col];
+                const int dr = get_vlc2(gb, vlc0->table, vlc0->bits, 2);
+                const int dg = dr + get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+                const int db = dg + get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+
+                row_r[col] = left_r = (dr + ((3 * (up_r + left_r) - 2 * diag_r) >> 2)) & 0x3ff;
+                row_g[col] = left_g = (dg + ((3 * (up_g + left_g) - 2 * diag_g) >> 2)) & 0x3ff;
+                row_b[col] = left_b = (db + ((3 * (up_b + left_b) - 2 * diag_b) >> 2)) & 0x3ff;
+
+                diag_r = up_r;
+                diag_g = up_g;
+                diag_b = up_b;
+            }
+        } else {
+            for (col = 0; col < width; col++) {
+                row_r[col] = get_bits(gb, 10);
+                row_g[col] = get_bits(gb, 10);
+                row_b[col] = get_bits(gb, 10);
+            }
+        }
+    }
+}
+
+/**
+ * Decode one frame into packed 8-bit pixels in p->data[0], four bytes each in
+ * the order A, R, G, B. Rows start with a one-bit flag: set means raw bytes;
+ * clear means VLC residuals added to the pixel on the left and wrapped to 8
+ * bits. Column 0 predicts from -128 on the first row, else from the pixel above.
+ */
+static void decode_argbi(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    VLC *vlc0 = &s->vlc[0], *vlc1 = &s->vlc[1];
+    const int width = avctx->width, height = avctx->height;
+    const int stride = p->linesize[0];
+    uint8_t *row = p->data[0], *px;
+    int col, line;
+
+    if (!get_bits1(gb)) {
+        int left_a = -128, left_r = -128, left_g = -128, left_b = -128;
+
+        for (px = row, col = 0; col < width; col++, px += 4) {
+            const int da = get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+            const int dr = get_vlc2(gb, vlc0->table, vlc0->bits, 2);
+            const int dg = dr + get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+            const int db = dg + get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+
+            px[0] = left_a = (da + left_a) & 0xff;
+            px[1] = left_r = (dr + left_r) & 0xff;
+            px[2] = left_g = (dg + left_g) & 0xff;
+            px[3] = left_b = (db + left_b) & 0xff;
+        }
+    } else {
+        for (px = row, col = 0; col < width; col++, px += 4) {
+            px[0] = get_bits(gb, 8);
+            px[1] = get_bits(gb, 8);
+            px[2] = get_bits(gb, 8);
+            px[3] = get_bits(gb, 8);
+        }
+    }
+
+    for (line = 1; line < height; line++) {
+        row += stride;
+        if (!get_bits1(gb)) {
+            int left_a = row[-stride], left_r = row[1 - stride];
+            int left_g = row[2 - stride], left_b = row[3 - stride];
+
+            for (px = row, col = 0; col < width; col++, px += 4) {
+                const int da = get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+                const int dr = get_vlc2(gb, vlc0->table, vlc0->bits, 2);
+                const int dg = dr + get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+                const int db = dg + get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+
+                px[0] = left_a = (da + left_a) & 0xff;
+                px[1] = left_r = (dr + left_r) & 0xff;
+                px[2] = left_g = (dg + left_g) & 0xff;
+                px[3] = left_b = (db + left_b) & 0xff;
+            }
+        } else {
+            for (px = row, col = 0; col < width; col++, px += 4) {
+                px[0] = get_bits(gb, 8);
+                px[1] = get_bits(gb, 8);
+                px[2] = get_bits(gb, 8);
+                px[3] = get_bits(gb, 8);
+            }
+        }
+    }
+}
+
+/**
+ * Decode one frame into packed 8-bit pixels in p->data[0], four bytes each in
+ * the order A, R, G, B. Rows start with a one-bit flag: set means raw bytes;
+ * clear means VLC residuals added to a prediction and wrapped to 8 bits. The
+ * first row predicts from the pixel on the left, seeded with -128; later rows
+ * use (3 * (top + left) - 2 * topleft) >> 2 per byte.
+ */
+static void decode_argb(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    VLC *vlc0 = &s->vlc[0], *vlc1 = &s->vlc[1];
+    const int width = avctx->width, height = avctx->height;
+    const int stride = p->linesize[0];
+    uint8_t *row = p->data[0], *px;
+    int col, line;
+
+    if (!get_bits1(gb)) {
+        int left_a = -128, left_r = -128, left_g = -128, left_b = -128;
+
+        for (px = row, col = 0; col < width; col++, px += 4) {
+            const int da = get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+            const int dr = get_vlc2(gb, vlc0->table, vlc0->bits, 2);
+            const int dg = dr + get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+            const int db = dg + get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+
+            px[0] = left_a = (da + left_a) & 0xff;
+            px[1] = left_r = (dr + left_r) & 0xff;
+            px[2] = left_g = (dg + left_g) & 0xff;
+            px[3] = left_b = (db + left_b) & 0xff;
+        }
+    } else {
+        for (px = row, col = 0; col < width; col++, px += 4) {
+            px[0] = get_bits(gb, 8);
+            px[1] = get_bits(gb, 8);
+            px[2] = get_bits(gb, 8);
+            px[3] = get_bits(gb, 8);
+        }
+    }
+
+    for (line = 1; line < height; line++) {
+        row += stride;
+        if (!get_bits1(gb)) {
+            int left_a = row[-stride], left_r = row[1 - stride];
+            int left_g = row[2 - stride], left_b = row[3 - stride];
+            int diag_a = left_a, diag_r = left_r;
+            int diag_g = left_g, diag_b = left_b;
+
+            for (px = row, col = 0; col < width; col++, px += 4) {
+                const int up_a = px[-stride], up_r = px[1 - stride];
+                const int up_g = px[2 - stride], up_b = px[3 - stride];
+                const int da = get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+                const int dr = get_vlc2(gb, vlc0->table, vlc0->bits, 2);
+                const int dg = dr + get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+                const int db = dg + get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+
+                px[0] = left_a = (da + ((3 * (up_a + left_a) - 2 * diag_a) >> 2)) & 0xff;
+                px[1] = left_r = (dr + ((3 * (up_r + left_r) - 2 * diag_r) >> 2)) & 0xff;
+                px[2] = left_g = (dg + ((3 * (up_g + left_g) - 2 * diag_g) >> 2)) & 0xff;
+                px[3] = left_b = (db + ((3 * (up_b + left_b) - 2 * diag_b) >> 2)) & 0xff;
+
+                diag_a = up_a;
+                diag_r = up_r;
+                diag_g = up_g;
+                diag_b = up_b;
+            }
+        } else {
+            for (px = row, col = 0; col < width; col++, px += 4) {
+                px[0] = get_bits(gb, 8);
+                px[1] = get_bits(gb, 8);
+                px[2] = get_bits(gb, 8);
+                px[3] = get_bits(gb, 8);
+            }
+        }
+    }
+}
+
+/**
+ * Decode one frame into packed 8-bit pixels in p->data[0]: bytes 0..2 of each
+ * four-byte pixel receive R, G, B and byte 3 is left untouched. Rows start with
+ * a one-bit flag: set means raw bytes; clear means VLC residuals added to the
+ * pixel on the left (-128 on the first row, else the pixel above for column 0).
+ */
+static void decode_rgbi(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    VLC *vlc0 = &s->vlc[0], *vlc1 = &s->vlc[1];
+    const int width = avctx->width, height = avctx->height;
+    const int stride = p->linesize[0];
+    uint8_t *row = p->data[0], *px;
+    int col, line;
+
+    if (!get_bits1(gb)) {
+        int left_r = -128, left_g = -128, left_b = -128;
+
+        for (px = row, col = 0; col < width; col++, px += 4) {
+            const int dr = get_vlc2(gb, vlc0->table, vlc0->bits, 2);
+            const int dg = dr + get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+            const int db = dg + get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+
+            px[0] = left_r = (dr + left_r) & 0xff;
+            px[1] = left_g = (dg + left_g) & 0xff;
+            px[2] = left_b = (db + left_b) & 0xff;
+        }
+    } else {
+        for (px = row, col = 0; col < width; col++, px += 4) {
+            px[0] = get_bits(gb, 8);
+            px[1] = get_bits(gb, 8);
+            px[2] = get_bits(gb, 8);
+        }
+    }
+
+    for (line = 1; line < height; line++) {
+        row += stride;
+        if (!get_bits1(gb)) {
+            int left_r = row[-stride], left_g = row[1 - stride], left_b = row[2 - stride];
+
+            for (px = row, col = 0; col < width; col++, px += 4) {
+                const int dr = get_vlc2(gb, vlc0->table, vlc0->bits, 2);
+                const int dg = dr + get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+                const int db = dg + get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+
+                px[0] = left_r = (dr + left_r) & 0xff;
+                px[1] = left_g = (dg + left_g) & 0xff;
+                px[2] = left_b = (db + left_b) & 0xff;
+            }
+        } else {
+            for (px = row, col = 0; col < width; col++, px += 4) {
+                px[0] = get_bits(gb, 8);
+                px[1] = get_bits(gb, 8);
+                px[2] = get_bits(gb, 8);
+            }
+        }
+    }
+}
+
+/**
+ * Decode one frame into packed 8-bit pixels in p->data[0]: bytes 0..2 of each
+ * four-byte pixel receive R, G, B and byte 3 is left untouched. Rows start with
+ * a one-bit flag: set means raw bytes; clear means VLC residuals added to a
+ * prediction and wrapped to 8 bits (the G delta sums the R and G codes, the B
+ * delta the R, G and B codes). The first row predicts from the pixel on the
+ * left, seeded with -128; later rows use (3 * (top + left) - 2 * topleft) >> 2.
+ */
+static void decode_rgb(AVCodecContext *avctx, AVFrame *p, GetBitContext *gb)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    VLC *vlc0 = &s->vlc[0], *vlc1 = &s->vlc[1];
+    const int width = avctx->width, height = avctx->height;
+    const int stride = p->linesize[0];
+    uint8_t *row = p->data[0], *px;
+    int col, line;
+
+    if (!get_bits1(gb)) {
+        int left_r = -128, left_g = -128, left_b = -128;
+
+        for (px = row, col = 0; col < width; col++, px += 4) {
+            const int dr = get_vlc2(gb, vlc0->table, vlc0->bits, 2);
+            const int dg = dr + get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+            const int db = dg + get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+
+            px[0] = left_r = (dr + left_r) & 0xff;
+            px[1] = left_g = (dg + left_g) & 0xff;
+            px[2] = left_b = (db + left_b) & 0xff;
+        }
+    } else {
+        for (px = row, col = 0; col < width; col++, px += 4) {
+            px[0] = get_bits(gb, 8);
+            px[1] = get_bits(gb, 8);
+            px[2] = get_bits(gb, 8);
+        }
+    }
+
+    for (line = 1; line < height; line++) {
+        row += stride;
+        if (!get_bits1(gb)) {
+            int left_r = row[-stride], left_g = row[1 - stride], left_b = row[2 - stride];
+            int diag_r = left_r, diag_g = left_g, diag_b = left_b;
+
+            for (px = row, col = 0; col < width; col++, px += 4) {
+                const int up_r = px[-stride], up_g = px[1 - stride], up_b = px[2 - stride];
+                const int dr = get_vlc2(gb, vlc0->table, vlc0->bits, 2);
+                const int dg = dr + get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+                const int db = dg + get_vlc2(gb, vlc1->table, vlc1->bits, 2);
+
+                px[0] = left_r = (dr + ((3 * (up_r + left_r) - 2 * diag_r) >> 2)) & 0xff;
+                px[1] = left_g = (dg + ((3 * (up_g + left_g) - 2 * diag_g) >> 2)) & 0xff;
+                px[2] = left_b = (db + ((3 * (up_b + left_b) - 2 * diag_b) >> 2)) & 0xff;
+
+                diag_r = up_r;
+                diag_g = up_g;
+                diag_b = up_b;
+            }
+        } else {
+            for (px = row, col = 0; col < width; col++, px += 4) {
+                px[0] = get_bits(gb, 8);
+                px[1] = get_bits(gb, 8);
+                px[2] = get_bits(gb, 8);
+            }
+        }
+    }
+}
+
+/* Give symbol i the top len[i] bits of a running 32-bit left-aligned counter,
+ * then re-create vlc from these codes; returns ff_init_vlc_sparse()'s result. */
+static int build_vlc(VLC *vlc, const uint8_t *len, int count)
+{
+    uint32_t codes[1024];
+    uint16_t syms[1024];
+    uint8_t bits[1024];
+    uint64_t next = 0;
+    int n;
+
+    for (n = 0; n < count; n++) {
+        const int shift = 32 - len[n];
+        bits[n]  = len[n];
+        syms[n]  = n;
+        codes[n] = next >> shift;
+        next    += 1ULL << shift;
+    }
+    ff_free_vlc(vlc);
+    return ff_init_vlc_sparse(vlc, 16, count, bits, sizeof(bits[0]), sizeof(bits[0]),
+                              codes, sizeof(codes[0]), sizeof(codes[0]),
+                              syms, sizeof(syms[0]), sizeof(syms[0]), 0);
+}
+
+static int decode_frame(AVCodecContext *avctx,
+                        void *data, int *got_frame,
+                        AVPacket *avpkt)
+{
+    SheerVideoContext *s = avctx->priv_data;
+    ThreadFrame frame = { .f = data };
+    AVFrame *p = data;
+    GetBitContext gb;
+    unsigned format;
+    int ret;
+
+    if (avpkt->size <= 20)
+        return AVERROR_INVALIDDATA;
+
+    if (AV_RL32(avpkt->data) != MKTAG('S','h','i','r') &&
+        AV_RL32(avpkt->data) != MKTAG('Z','w','a','k'))
+        return AVERROR_INVALIDDATA;
+
+    format = AV_RL32(avpkt->data + 16);
+    switch (format) {
+    case MKTAG(' ', 'R', 'G', 'B'):
+        avctx->pix_fmt = AV_PIX_FMT_RGB0;
+        s->decode_frame = decode_rgb;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_r_rgb, 256);
+            ret |= build_vlc(&s->vlc[1], l_g_rgb, 256);
+        }
+        break;
+    case MKTAG(' ', 'r', 'G', 'B'):
+        avctx->pix_fmt = AV_PIX_FMT_RGB0;
+        s->decode_frame = decode_rgbi;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_r_rgbi, 256);
+            ret |= build_vlc(&s->vlc[1], l_g_rgbi, 256);
+        }
+        break;
+    case MKTAG('A', 'R', 'G', 'X'):
+        avctx->pix_fmt = AV_PIX_FMT_GBRAP10;
+        s->decode_frame = decode_argx;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_r_rgbx, 1024);
+            ret |= build_vlc(&s->vlc[1], l_g_rgbx, 1024);
+        }
+        break;
+    case MKTAG('A', 'r', 'G', 'X'):
+        avctx->pix_fmt = AV_PIX_FMT_GBRAP10;
+        s->decode_frame = decode_argxi;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_r_rgbxi, 1024);
+            ret |= build_vlc(&s->vlc[1], l_g_rgbxi, 1024);
+        }
+        break;
+    case MKTAG('R', 'G', 'B', 'X'):
+        avctx->pix_fmt = AV_PIX_FMT_GBRP10;
+        s->decode_frame = decode_rgbx;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_r_rgbx, 1024);
+            ret |= build_vlc(&s->vlc[1], l_g_rgbx, 1024);
+        }
+        break;
+    case MKTAG('r', 'G', 'B', 'X'):
+        avctx->pix_fmt = AV_PIX_FMT_GBRP10;
+        s->decode_frame = decode_rgbxi;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_r_rgbxi, 1024);
+            ret |= build_vlc(&s->vlc[1], l_g_rgbxi, 1024);
+        }
+        break;
+    case MKTAG('A', 'R', 'G', 'B'):
+        avctx->pix_fmt = AV_PIX_FMT_ARGB;
+        s->decode_frame = decode_argb;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_r_rgb, 256);
+            ret |= build_vlc(&s->vlc[1], l_g_rgb, 256);
+        }
+        break;
+    case MKTAG('A', 'r', 'G', 'B'):
+        avctx->pix_fmt = AV_PIX_FMT_ARGB;
+        s->decode_frame = decode_argbi;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_r_rgbi, 256);
+            ret |= build_vlc(&s->vlc[1], l_g_rgbi, 256);
+        }
+        break;
+    case MKTAG('A', 'Y', 'B', 'R'):
+    case MKTAG('A', 'Y', 'b', 'R'):
+        avctx->pix_fmt = AV_PIX_FMT_YUVA444P;
+        s->decode_frame = decode_aybr;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_ybr, 256);
+            ret |= build_vlc(&s->vlc[1], l_u_ybr, 256);
+        }
+        break;
+    case MKTAG('A', 'y', 'B', 'R'):
+    case MKTAG('A', 'y', 'b', 'R'):
+        avctx->pix_fmt = AV_PIX_FMT_YUVA444P;
+        s->decode_frame = decode_aybri;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_ybri, 256);
+            ret |= build_vlc(&s->vlc[1], l_u_ybri, 256);
+        }
+        break;
+    case MKTAG(' ', 'Y', 'B', 'R'):
+    case MKTAG(' ', 'Y', 'b', 'R'):
+        avctx->pix_fmt = AV_PIX_FMT_YUV444P;
+        s->decode_frame = decode_ybr;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_ybr, 256);
+            ret |= build_vlc(&s->vlc[1], l_u_ybr, 256);
+        }
+        break;
+    case MKTAG(' ', 'y', 'B', 'R'):
+    case MKTAG(' ', 'y', 'b', 'R'):
+        avctx->pix_fmt = AV_PIX_FMT_YUV444P;
+        s->decode_frame = decode_ybri;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_ybri, 256);
+            ret |= build_vlc(&s->vlc[1], l_u_ybri, 256);
+        }
+        break;
+    case MKTAG('Y', 'B', 'R', 0x0a):
+        avctx->pix_fmt = AV_PIX_FMT_YUV444P10;
+        s->decode_frame = decode_ybr10;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_ybr10, 1024);
+            ret |= build_vlc(&s->vlc[1], l_u_ybr10, 1024);
+        }
+        break;
+    case MKTAG('y', 'B', 'R', 0x0a):
+        avctx->pix_fmt = AV_PIX_FMT_YUV444P10;
+        s->decode_frame = decode_ybr10i;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_ybr10i, 1024);
+            ret |= build_vlc(&s->vlc[1], l_u_ybr10i, 1024);
+        }
+        break;
+    case MKTAG('C', 'A', '4', 'p'):
+        avctx->pix_fmt = AV_PIX_FMT_YUVA444P10;
+        s->decode_frame = decode_ca4p;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_ybr10, 1024);
+            ret |= build_vlc(&s->vlc[1], l_u_ybr10, 1024);
+        }
+        break;
+    case MKTAG('C', 'A', '4', 'i'):
+        avctx->pix_fmt = AV_PIX_FMT_YUVA444P10;
+        s->decode_frame = decode_ca4i;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_ybr10i, 1024);
+            ret |= build_vlc(&s->vlc[1], l_u_ybr10i, 1024);
+        }
+        break;
+    case MKTAG('B', 'Y', 'R', 'Y'):
+        avctx->pix_fmt = AV_PIX_FMT_YUV422P;
+        s->decode_frame = decode_byry;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_byry, 256);
+            ret |= build_vlc(&s->vlc[1], l_u_byry, 256);
+        }
+        break;
+    case MKTAG('B', 'Y', 'R', 'y'):
+        avctx->pix_fmt = AV_PIX_FMT_YUV422P;
+        s->decode_frame = decode_byryi;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_byryi, 256);
+            ret |= build_vlc(&s->vlc[1], l_u_byryi, 256);
+        }
+        break;
+    case MKTAG('Y', 'b', 'Y', 'r'):
+        avctx->pix_fmt = AV_PIX_FMT_YUV422P;
+        s->decode_frame = decode_ybyr;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_ybyr, 256);
+            ret |= build_vlc(&s->vlc[1], l_u_ybyr, 256);
+        }
+        break;
+    case MKTAG('C', '8', '2', 'p'):
+        avctx->pix_fmt = AV_PIX_FMT_YUVA422P;
+        s->decode_frame = decode_c82p;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_byry, 256);
+            ret |= build_vlc(&s->vlc[1], l_u_byry, 256);
+        }
+        break;
+    case MKTAG('C', '8', '2', 'i'):
+        avctx->pix_fmt = AV_PIX_FMT_YUVA422P;
+        s->decode_frame = decode_c82i;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_byryi, 256);
+            ret |= build_vlc(&s->vlc[1], l_u_byryi, 256);
+        }
+        break;
+    case MKTAG(0xa2, 'Y', 'R', 'Y'):
+        avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
+        s->decode_frame = decode_yry10;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_yry10, 1024);
+            ret |= build_vlc(&s->vlc[1], l_u_yry10, 1024);
+        }
+        break;
+    case MKTAG(0xa2, 'Y', 'R', 'y'):
+        avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
+        s->decode_frame = decode_yry10i;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_yry10i, 1024);
+            ret |= build_vlc(&s->vlc[1], l_u_yry10i, 1024);
+        }
+        break;
+    case MKTAG('C', 'A', '2', 'p'):
+        avctx->pix_fmt = AV_PIX_FMT_YUVA422P10;
+        s->decode_frame = decode_ca2p;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_yry10, 1024);
+            ret |= build_vlc(&s->vlc[1], l_u_yry10, 1024);
+        }
+        break;
+    case MKTAG('C', 'A', '2', 'i'):
+        avctx->pix_fmt = AV_PIX_FMT_YUVA422P10;
+        s->decode_frame = decode_ca2i;
+        if (s->format != format) {
+            ret  = build_vlc(&s->vlc[0], l_y_yry10i, 1024);
+            ret |= build_vlc(&s->vlc[1], l_u_yry10i, 1024);
+        }
+        break;
+    default:
+        avpriv_request_sample(avctx, "unsupported format: 0x%X", format);
+        return AVERROR_PATCHWELCOME;
+    }
+
+    if (s->format != format) {
+        if (ret < 0)
+            return ret;
+        s->format = format;
+    }
+
+    p->pict_type = AV_PICTURE_TYPE_I;
+    p->key_frame = 1;
+
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
+        return ret;
+
+    if ((ret = init_get_bits8(&gb, avpkt->data + 20, avpkt->size - 20)) < 0)
+        return ret;
+
+    s->decode_frame(avctx, p, &gb);
+
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+#if HAVE_THREADS
+/* Thread-copy init: zero both VLC structs and the cached format. With format 0,
+ * the "s->format != format" checks in the frame decoder rebuild the tables. */
+static int decode_init_thread_copy(AVCodecContext *avctx)
+{
+    SheerVideoContext *ctx = avctx->priv_data;
+
+    memset(ctx->vlc, 0, 2 * sizeof(ctx->vlc[0]));
+    ctx->format = 0;
+    return 0;
+}
+#endif
+
+/* Codec close callback: frees the two VLC tables kept in the private context. */
+static av_cold int decode_end(AVCodecContext *avctx)
+{
+    SheerVideoContext *const ctx = avctx->priv_data;
+
+    ff_free_vlc(ctx->vlc);
+    ff_free_vlc(ctx->vlc + 1);
+    return 0;
+}
+
+AVCodec ff_sheervideo_decoder = { /* codec table entry for the SheerVideo decoder */
+    .name             = "sheervideo",
+    .long_name        = NULL_IF_CONFIG_SMALL("BitJazz SheerVideo"),
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_SHEERVIDEO,
+    .priv_data_size   = sizeof(SheerVideoContext), /* state reached through avctx->priv_data */
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(decode_init_thread_copy), /* callee is only defined under #if HAVE_THREADS */
+    .close            = decode_end, /* frees vlc[0] and vlc[1] */
+    .decode           = decode_frame,
+    .capabilities     = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
+};
diff --git a/libavcodec/shorten.c b/libavcodec/shorten.c
index ee6dc70..e4cef61 100644
--- a/libavcodec/shorten.c
+++ b/libavcodec/shorten.c
@@ -2,20 +2,20 @@
  * Shorten decoder
  * Copyright (c) 2005 Jeff Muizelaar
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,8 +49,12 @@
 #define ENERGYSIZE 3
 #define BITSHIFTSIZE 2
 
+#define TYPE_S8    1
+#define TYPE_U8    2
 #define TYPE_S16HL 3
+#define TYPE_U16HL 4
 #define TYPE_S16LH 5
+#define TYPE_U16LH 6
 
 #define NWRAP 3
 #define NSKIPSIZE 1
@@ -111,7 +115,6 @@ static av_cold int shorten_decode_init(AVCodecContext *avctx)
 {
     ShortenContext *s = avctx->priv_data;
     s->avctx          = avctx;
-    avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
 
     return 0;
 }
@@ -125,19 +128,18 @@ static int allocate_buffers(ShortenContext *s)
             av_log(s->avctx, AV_LOG_ERROR, "nmean too large\n");
             return AVERROR_INVALIDDATA;
         }
-        if (s->blocksize + s->nwrap >= UINT_MAX / sizeof(int32_t) ||
-            s->blocksize + s->nwrap <= (unsigned)s->nwrap) {
+        if (s->blocksize + (uint64_t)s->nwrap >= UINT_MAX / sizeof(int32_t)) {
             av_log(s->avctx, AV_LOG_ERROR,
                    "s->blocksize + s->nwrap too large\n");
             return AVERROR_INVALIDDATA;
         }
 
-        if ((err = av_reallocp(&s->offset[chan],
-                               sizeof(int32_t) *
+        if ((err = av_reallocp_array(&s->offset[chan],
+                               sizeof(int32_t),
                                FFMAX(1, s->nmean))) < 0)
             return err;
 
-        if ((err = av_reallocp(&s->decoded_base[chan], (s->blocksize + s->nwrap) *
+        if ((err = av_reallocp_array(&s->decoded_base[chan], (s->blocksize + s->nwrap),
                                sizeof(s->decoded_base[0][0]))) < 0)
             return err;
         for (i = 0; i < s->nwrap; i++)
@@ -145,7 +147,7 @@ static int allocate_buffers(ShortenContext *s)
         s->decoded[chan] = s->decoded_base[chan] + s->nwrap;
     }
 
-    if ((err = av_reallocp(&s->coeffs, s->nwrap * sizeof(*s->coeffs))) < 0)
+    if ((err = av_reallocp_array(&s->coeffs, s->nwrap, sizeof(*s->coeffs))) < 0)
         return err;
 
     return 0;
@@ -162,9 +164,13 @@ static void fix_bitshift(ShortenContext *s, int32_t *buffer)
 {
     int i;
 
-    if (s->bitshift != 0)
+    if (s->bitshift == 32) {
+        for (i = 0; i < s->blocksize; i++)
+            buffer[i] = 0;
+    } else if (s->bitshift != 0) {
         for (i = 0; i < s->blocksize; i++)
             buffer[i] <<= s->bitshift;
+    }
 }
 
 static int init_offset(ShortenContext *s)
@@ -174,13 +180,17 @@ static int init_offset(ShortenContext *s)
     int nblock = FFMAX(1, s->nmean);
     /* initialise offset */
     switch (s->internal_ftype) {
+    case TYPE_U8:
+        s->avctx->sample_fmt = AV_SAMPLE_FMT_U8P;
+        mean = 0x80;
+        break;
     case TYPE_S16HL:
     case TYPE_S16LH:
-        mean = 0;
+        s->avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
         break;
     default:
-        av_log(s->avctx, AV_LOG_ERROR, "unknown audio type");
-        return AVERROR_INVALIDDATA;
+        av_log(s->avctx, AV_LOG_ERROR, "unknown audio type\n");
+        return AVERROR_PATCHWELCOME;
     }
 
     for (chan = 0; chan < s->channels; chan++)
@@ -189,10 +199,74 @@ static int init_offset(ShortenContext *s)
     return 0;
 }
 
+/* Parse the AIFF header in header[0..header_size): fills avctx->sample_rate
+ * and avctx->bits_per_coded_sample; returns 0 or a negative AVERROR code. */
+static int decode_aiff_header(AVCodecContext *avctx, const uint8_t *header,
+                              int header_size)
+{
+    int len, bps, exp;
+    GetByteContext gb;
+    uint64_t val;
+    uint32_t tag;
+
+    bytestream2_init(&gb, header, header_size);
+    if (bytestream2_get_le32(&gb) != MKTAG('F', 'O', 'R', 'M')) {
+        av_log(avctx, AV_LOG_ERROR, "missing FORM tag\n");
+        return AVERROR_INVALIDDATA;
+    }
+    bytestream2_skip(&gb, 4); /* chunk size */
+
+    tag = bytestream2_get_le32(&gb);
+    if (tag != MKTAG('A', 'I', 'F', 'F')) {
+        av_log(avctx, AV_LOG_ERROR, "missing AIFF tag\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    /* Walk the chunk list until COMM; odd chunk sizes carry one pad byte. */
+    while (bytestream2_get_le32(&gb) != MKTAG('C', 'O', 'M', 'M')) {
+        len = bytestream2_get_be32(&gb);
+        /* Check in 64 bits before skipping: len + pad overflows int at INT_MAX. */
+        if (len < 0 || bytestream2_get_bytes_left(&gb) < 18LL + len + (len & 1)) {
+            av_log(avctx, AV_LOG_ERROR, "no COMM chunk found\n");
+            return AVERROR_INVALIDDATA;
+        }
+        bytestream2_skip(&gb, len + (len & 1));
+    }
+    len = bytestream2_get_be32(&gb);
+    if (len < 18) {
+        av_log(avctx, AV_LOG_ERROR, "COMM chunk was too short\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    bytestream2_skip(&gb, 6);
+    bps = bytestream2_get_be16(&gb);
+    avctx->bits_per_coded_sample = bps;
+    if (bps != 16 && bps != 8) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported number of bits per sample: %d\n", bps);
+        return AVERROR(ENOSYS);
+    }
+
+    /* Sample rate: 16-bit exponent (bias 16383) followed by a 64-bit mantissa. */
+    exp = bytestream2_get_be16(&gb) - 16383 - 63;
+    val = bytestream2_get_be64(&gb);
+    if (exp < -63 || exp > 63) {
+        av_log(avctx, AV_LOG_ERROR, "exp %d is out of range\n", exp);
+        return AVERROR_INVALIDDATA;
+    }
+    if (exp >= 0)
+        avctx->sample_rate = val << exp;
+    else
+        avctx->sample_rate = (val + (1ULL<<(-exp-1))) >> -exp; /* rounds to nearest */
+    len -= 18;
+    if (len > 0)
+        av_log(avctx, AV_LOG_INFO, "%d header bytes unparsed\n", len);
+    return 0;
+}
+
 static int decode_wave_header(AVCodecContext *avctx, const uint8_t *header,
                               int header_size)
 {
-    int len;
+    int len, bps;
     short wave_format;
     GetByteContext gb;
 
@@ -213,7 +287,7 @@ static int decode_wave_header(AVCodecContext *avctx, const uint8_t *header,
     while (bytestream2_get_le32(&gb) != MKTAG('f', 'm', 't', ' ')) {
         len = bytestream2_get_le32(&gb);
         bytestream2_skip(&gb, len);
-        if (bytestream2_get_bytes_left(&gb) < 16) {
+        if (len < 0 || bytestream2_get_bytes_left(&gb) < 16) {
             av_log(avctx, AV_LOG_ERROR, "no fmt chunk found\n");
             return AVERROR_INVALIDDATA;
         }
@@ -239,10 +313,11 @@ static int decode_wave_header(AVCodecContext *avctx, const uint8_t *header,
     avctx->sample_rate = bytestream2_get_le32(&gb);
     bytestream2_skip(&gb, 4); // skip bit rate    (represents original uncompressed bit rate)
     bytestream2_skip(&gb, 2); // skip block align (not needed)
-    avctx->bits_per_coded_sample = bytestream2_get_le16(&gb);
+    bps = bytestream2_get_le16(&gb);
+    avctx->bits_per_coded_sample = bps;
 
-    if (avctx->bits_per_coded_sample != 16) {
-        av_log(avctx, AV_LOG_ERROR, "unsupported number of bits per sample\n");
+    if (bps != 16 && bps != 8) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported number of bits per sample: %d\n", bps);
         return AVERROR(ENOSYS);
     }
 
@@ -253,18 +328,6 @@ static int decode_wave_header(AVCodecContext *avctx, const uint8_t *header,
     return 0;
 }
 
-static void output_buffer(int16_t **samples, int nchan, int blocksize,
-                          int32_t **buffer)
-{
-    int i, ch;
-    for (ch = 0; ch < nchan; ch++) {
-        int32_t *in  = buffer[ch];
-        int16_t *out = samples[ch];
-        for (i = 0; i < blocksize; i++)
-            out[i] = av_clip_int16(in[i]);
-    }
-}
-
 static const int fixed_coeffs[][3] = {
     { 0,  0,  0 },
     { 1,  0,  0 },
@@ -281,7 +344,7 @@ static int decode_subframe_lpc(ShortenContext *s, int command, int channel,
     if (command == FN_QLPC) {
         /* read/validate prediction order */
         pred_order = get_ur_golomb_shorten(&s->gb, LPCQSIZE);
-        if (pred_order > s->nwrap) {
+        if ((unsigned)pred_order > s->nwrap) {
             av_log(s->avctx, AV_LOG_ERROR, "invalid pred_order %d\n",
                    pred_order);
             return AVERROR(EINVAL);
@@ -373,6 +436,11 @@ static int read_header(ShortenContext *s)
         s->nmean = get_uint(s, 0);
 
         skip_bytes = get_uint(s, NSKIPSIZE);
+        if ((unsigned)skip_bytes > get_bits_left(&s->gb)/8) {
+            av_log(s->avctx, AV_LOG_ERROR, "invalid skip_bytes: %d\n", skip_bytes);
+            return AVERROR_INVALIDDATA;
+        }
+
         for (i = 0; i < skip_bytes; i++)
             skip_bits(&s->gb, 8);
     }
@@ -387,6 +455,9 @@ static int read_header(ShortenContext *s)
     if (s->version > 1)
         s->lpcqoffset = V2LPCQOFFSET;
 
+    if (s->avctx->extradata_size > 0)
+        goto end;
+
     if (get_ur_golomb_shorten(&s->gb, FNSIZE) != FN_VERBATIM) {
         av_log(s->avctx, AV_LOG_ERROR,
                "missing verbatim section at beginning of stream\n");
@@ -404,9 +475,18 @@ static int read_header(ShortenContext *s)
     for (i = 0; i < s->header_size; i++)
         s->header[i] = (char)get_ur_golomb_shorten(&s->gb, VERBATIM_BYTE_SIZE);
 
-    if ((ret = decode_wave_header(s->avctx, s->header, s->header_size)) < 0)
-        return ret;
+    if (AV_RL32(s->header) == MKTAG('R','I','F','F')) {
+        if ((ret = decode_wave_header(s->avctx, s->header, s->header_size)) < 0)
+            return ret;
+    } else if (AV_RL32(s->header) == MKTAG('F','O','R','M')) {
+        if ((ret = decode_aiff_header(s->avctx, s->header, s->header_size)) < 0)
+            return ret;
+    } else {
+        avpriv_report_missing_feature(s->avctx, "unsupported bit packing %X", AV_RL32(s->header));
+        return AVERROR_PATCHWELCOME;
+    }
 
+end:
     s->cur_chan = 0;
     s->bitshift = 0;
 
@@ -428,51 +508,68 @@ static int shorten_decode_frame(AVCodecContext *avctx, void *data,
     /* allocate internal bitstream buffer */
     if (s->max_framesize == 0) {
         void *tmp_ptr;
-        s->max_framesize = 1024; // should hopefully be enough for the first header
+        s->max_framesize = 8192; // should hopefully be enough for the first header
         tmp_ptr = av_fast_realloc(s->bitstream, &s->allocated_bitstream_size,
                                   s->max_framesize + AV_INPUT_BUFFER_PADDING_SIZE);
         if (!tmp_ptr) {
+            s->max_framesize = 0;
             av_log(avctx, AV_LOG_ERROR, "error allocating bitstream buffer\n");
             return AVERROR(ENOMEM);
         }
+        memset(tmp_ptr, 0, s->allocated_bitstream_size);
         s->bitstream = tmp_ptr;
     }
 
     /* append current packet data to bitstream buffer */
-    if (1 && s->max_framesize) { //FIXME truncated
-        buf_size       = FFMIN(buf_size, s->max_framesize - s->bitstream_size);
-        input_buf_size = buf_size;
-
-        if (s->bitstream_index + s->bitstream_size + buf_size >
-            s->allocated_bitstream_size) {
-            memmove(s->bitstream, &s->bitstream[s->bitstream_index],
-                    s->bitstream_size);
-            s->bitstream_index = 0;
-        }
-        if (buf)
-            memcpy(&s->bitstream[s->bitstream_index + s->bitstream_size], buf,
-                   buf_size);
-        buf               = &s->bitstream[s->bitstream_index];
-        buf_size         += s->bitstream_size;
-        s->bitstream_size = buf_size;
-
-        /* do not decode until buffer has at least max_framesize bytes or
-         * the end of the file has been reached */
-        if (buf_size < s->max_framesize && avpkt->data) {
-            *got_frame_ptr = 0;
-            return input_buf_size;
-        }
+    buf_size       = FFMIN(buf_size, s->max_framesize - s->bitstream_size);
+    input_buf_size = buf_size;
+
+    if (s->bitstream_index + s->bitstream_size + buf_size + AV_INPUT_BUFFER_PADDING_SIZE >
+        s->allocated_bitstream_size) {
+        memmove(s->bitstream, &s->bitstream[s->bitstream_index],
+                s->bitstream_size);
+        s->bitstream_index = 0;
+    }
+    if (buf)
+        memcpy(&s->bitstream[s->bitstream_index + s->bitstream_size], buf,
+               buf_size);
+    buf               = &s->bitstream[s->bitstream_index];
+    buf_size         += s->bitstream_size;
+    s->bitstream_size = buf_size;
+
+    /* do not decode until buffer has at least max_framesize bytes or
+     * the end of the file has been reached */
+    if (buf_size < s->max_framesize && avpkt->data) {
+        *got_frame_ptr = 0;
+        return input_buf_size;
     }
     /* init and position bitstream reader */
-    init_get_bits(&s->gb, buf, buf_size * 8);
+    if ((ret = init_get_bits8(&s->gb, buf, buf_size)) < 0)
+        return ret;
     skip_bits(&s->gb, s->bitindex);
 
     /* process header or next subblock */
     if (!s->got_header) {
+
         if ((ret = read_header(s)) < 0)
             return ret;
-        *got_frame_ptr = 0;
-        goto finish_frame;
+
+        if (avpkt->size) {
+            int max_framesize;
+            void *tmp_ptr;
+
+            max_framesize = FFMAX(s->max_framesize, s->blocksize * s->channels * 8);
+            tmp_ptr = av_fast_realloc(s->bitstream, &s->allocated_bitstream_size,
+                                      max_framesize + AV_INPUT_BUFFER_PADDING_SIZE);
+            if (!tmp_ptr) {
+                av_log(avctx, AV_LOG_ERROR, "error allocating bitstream buffer\n");
+                return AVERROR(ENOMEM);
+            }
+            s->bitstream = tmp_ptr;
+            s->max_framesize = max_framesize;
+            *got_frame_ptr = 0;
+            goto finish_frame;
+        }
     }
 
     /* if quit command was read previously, don't decode anything */
@@ -507,11 +604,16 @@ static int shorten_decode_frame(AVCodecContext *avctx, void *data,
                 while (len--)
                     get_ur_golomb_shorten(&s->gb, VERBATIM_BYTE_SIZE);
                 break;
-            case FN_BITSHIFT:
-                s->bitshift = get_ur_golomb_shorten(&s->gb, BITSHIFTSIZE);
-                if (s->bitshift < 0)
+            case FN_BITSHIFT: {
+                unsigned bitshift = get_ur_golomb_shorten(&s->gb, BITSHIFTSIZE);
+                if (bitshift > 32) {
+                    av_log(avctx, AV_LOG_ERROR, "bitshift %d is invalid\n",
+                           bitshift);
                     return AVERROR_INVALIDDATA;
+                }
+                s->bitshift = bitshift;
                 break;
+            }
             case FN_BLOCKSIZE: {
                 unsigned blocksize = get_uint(s, av_log2(s->blocksize));
                 if (blocksize > s->blocksize) {
@@ -531,10 +633,8 @@ static int shorten_decode_frame(AVCodecContext *avctx, void *data,
                 s->got_quit_command = 1;
                 break;
             }
-            if (cmd == FN_BLOCKSIZE || cmd == FN_QUIT) {
-                *got_frame_ptr = 0;
+            if (cmd == FN_QUIT)
                 break;
-            }
         } else {
             /* process audio command */
             int residual_size = 0;
@@ -559,7 +659,7 @@ static int shorten_decode_frame(AVCodecContext *avctx, void *data,
                     sum += s->offset[channel][i];
                 coffset = sum / s->nmean;
                 if (s->version >= 2)
-                    coffset >>= FFMIN(1, s->bitshift);
+                    coffset = s->bitshift == 0 ? coffset : coffset >> s->bitshift - 1 >> 1;
             }
 
             /* decode samples for this channel */
@@ -584,7 +684,7 @@ static int shorten_decode_frame(AVCodecContext *avctx, void *data,
                 if (s->version < 2)
                     s->offset[channel][s->nmean - 1] = sum / s->blocksize;
                 else
-                    s->offset[channel][s->nmean - 1] = (sum / s->blocksize) << s->bitshift;
+                    s->offset[channel][s->nmean - 1] = s->bitshift == 32 ? 0 : (sum / s->blocksize) << s->bitshift;
             }
 
             /* copy wrap samples for use with next block */
@@ -598,15 +698,30 @@ static int shorten_decode_frame(AVCodecContext *avctx, void *data,
             /* if this is the last channel in the block, output the samples */
             s->cur_chan++;
             if (s->cur_chan == s->channels) {
+                uint8_t *samples_u8;
+                int16_t *samples_s16;
+                int chan;
+
                 /* get output buffer */
                 frame->nb_samples = s->blocksize;
-                if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-                    av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+                if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
                     return ret;
+
+                for (chan = 0; chan < s->channels; chan++) {
+                    samples_u8  = ((uint8_t **)frame->extended_data)[chan];
+                    samples_s16 = ((int16_t **)frame->extended_data)[chan];
+                    for (i = 0; i < s->blocksize; i++) {
+                        switch (s->internal_ftype) {
+                        case TYPE_U8:
+                            *samples_u8++ = av_clip_uint8(s->decoded[chan][i]);
+                            break;
+                        case TYPE_S16HL:
+                        case TYPE_S16LH:
+                            *samples_s16++ = av_clip_int16(s->decoded[chan][i]);
+                            break;
+                        }
+                    }
                 }
-                /* interleave output */
-                output_buffer((int16_t **)frame->extended_data, s->channels,
-                              s->blocksize, s->decoded);
 
                 *got_frame_ptr = 1;
             }
@@ -657,7 +772,8 @@ AVCodec ff_shorten_decoder = {
     .init           = shorten_decode_init,
     .close          = shorten_decode_close,
     .decode         = shorten_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1,
+    .capabilities   = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1,
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S16P,
+                                                      AV_SAMPLE_FMT_U8P,
                                                       AV_SAMPLE_FMT_NONE },
 };
diff --git a/libavcodec/simple_idct.c b/libavcodec/simple_idct.c
index 6ee1320..0711e16 100644
--- a/libavcodec/simple_idct.c
+++ b/libavcodec/simple_idct.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,6 +36,15 @@
 
 #define BIT_DEPTH 10
 #include "simple_idct_template.c"
+
+#define EXTRA_SHIFT  2
+#include "simple_idct_template.c"
+
+#undef EXTRA_SHIFT
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 12
+#include "simple_idct_template.c"
 #undef BIT_DEPTH
 
 /* 2x4x8 idct */
@@ -123,7 +132,7 @@ void ff_simple_idct248_put(uint8_t *dest, int line_size, int16_t *block)
 #undef C1
 #undef C2
 #define CN_SHIFT 12
-#define C_FIX(x) ((int)((x) * 1.414213562 * (1 << CN_SHIFT) + 0.5))
+#define C_FIX(x) ((int)((x) * M_SQRT2 * (1 << CN_SHIFT) + 0.5))
 #define C1 C_FIX(0.6532814824)
 #define C2 C_FIX(0.2705980501)
 #define C3 C_FIX(0.5)
@@ -150,7 +159,7 @@ static inline void idct4col_add(uint8_t *dest, int line_size, const int16_t *col
 }
 
 #define RN_SHIFT 15
-#define R_FIX(x) ((int)((x) * 1.414213562 * (1 << RN_SHIFT) + 0.5))
+#define R_FIX(x) ((int)((x) * M_SQRT2 * (1 << RN_SHIFT) + 0.5))
 #define R1 R_FIX(0.6532814824)
 #define R2 R_FIX(0.2705980501)
 #define R3 R_FIX(0.5)
@@ -218,7 +227,6 @@ void ff_simple_idct44_add(uint8_t *dest, int line_size, int16_t *block)
     }
 }
 
-#if CONFIG_PRORES_DECODER
 void ff_prores_idct(int16_t *block, const int16_t *qmat)
 {
     int i;
@@ -227,9 +235,10 @@ void ff_prores_idct(int16_t *block, const int16_t *qmat)
         block[i] *= qmat[i];
 
     for (i = 0; i < 8; i++)
-        idctRowCondDC_10(block + i*8, 2);
+        idctRowCondDC_extrashift_10(block + i*8, 2);
 
-    for (i = 0; i < 8; i++)
-        idctSparseCol_10(block + i);
+    for (i = 0; i < 8; i++) {
+        block[i] += 8192;
+        idctSparseCol_extrashift_10(block + i);
+    }
 }
-#endif /* CONFIG_PRORES_DECODER */
diff --git a/libavcodec/simple_idct.h b/libavcodec/simple_idct.h
index 7f14aae..154e297 100644
--- a/libavcodec/simple_idct.h
+++ b/libavcodec/simple_idct.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,6 +37,11 @@ void ff_simple_idct_8(int16_t *block);
 void ff_simple_idct_put_10(uint8_t *dest, int line_size, int16_t *block);
 void ff_simple_idct_add_10(uint8_t *dest, int line_size, int16_t *block);
 void ff_simple_idct_10(int16_t *block);
+
+void ff_simple_idct_put_12(uint8_t *dest, int line_size, int16_t *block);
+void ff_simple_idct_add_12(uint8_t *dest, int line_size, int16_t *block);
+void ff_simple_idct_12(int16_t *block);
+
 /**
  * Special version of ff_simple_idct_10() which does dequantization
  * and scales by a factor of 2 more between the two IDCTs to account
diff --git a/libavcodec/simple_idct_template.c b/libavcodec/simple_idct_template.c
index 9a5e38a..b73614b 100644
--- a/libavcodec/simple_idct_template.c
+++ b/libavcodec/simple_idct_template.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -62,19 +62,40 @@
 #define MUL(a, b)    MUL16(a, b)
 #define MAC(a, b, c) MAC16(a, b, c)
 
-#elif BIT_DEPTH == 10
-
-#define W1 90901
-#define W2 85627
-#define W3 77062
-#define W4 65535
-#define W5 51491
-#define W6 35468
-#define W7 18081
-
-#define ROW_SHIFT 15
-#define COL_SHIFT 20
-#define DC_SHIFT 1
+#elif BIT_DEPTH == 10 || BIT_DEPTH == 12
+
+# if BIT_DEPTH == 10
+#define W1 22725 // 90901
+#define W2 21407 //  85627
+#define W3 19265 //  77062
+#define W4 16384 //  65535
+#define W5 12873 //  51491
+#define W6  8867 //  35468
+#define W7  4520 //  18081
+
+#   ifdef EXTRA_SHIFT
+#define ROW_SHIFT 13
+#define COL_SHIFT 18
+#define DC_SHIFT  1
+#   else
+#define ROW_SHIFT 12
+#define COL_SHIFT 19
+#define DC_SHIFT  2
+#   endif
+
+# else
+#define W1 45451
+#define W2 42813
+#define W3 38531
+#define W4 32767
+#define W5 25746
+#define W6 17734
+#define W7 9041
+
+#define ROW_SHIFT 16
+#define COL_SHIFT 17
+#define DC_SHIFT -1
+# endif
 
 #define MUL(a, b)    ((a) * (b))
 #define MAC(a, b, c) ((a) += (b) * (c))
@@ -85,44 +106,50 @@
 
 #endif
 
+#ifdef EXTRA_SHIFT
+static inline void FUNC(idctRowCondDC_extrashift)(int16_t *row, int extra_shift)
+#else
 static inline void FUNC(idctRowCondDC)(int16_t *row, int extra_shift)
+#endif
 {
     int a0, a1, a2, a3, b0, b1, b2, b3;
 
 #if HAVE_FAST_64BIT
 #define ROW0_MASK (0xffffLL << 48 * HAVE_BIGENDIAN)
-    if (((((uint64_t *)row)[0] & ~ROW0_MASK) | ((uint64_t *)row)[1]) == 0) {
+    if (((AV_RN64A(row) & ~ROW0_MASK) | AV_RN64A(row+4)) == 0) {
         uint64_t temp;
-        if (DC_SHIFT - extra_shift > 0) {
+        if (DC_SHIFT - extra_shift >= 0) {
             temp = (row[0] * (1 << (DC_SHIFT - extra_shift))) & 0xffff;
         } else {
-            temp = (row[0] >> (extra_shift - DC_SHIFT)) & 0xffff;
+            temp = ((row[0] + (1<<(extra_shift - DC_SHIFT-1))) >> (extra_shift - DC_SHIFT)) & 0xffff;
         }
         temp += temp * (1 << 16);
         temp += temp * ((uint64_t) 1 << 32);
-        ((uint64_t *)row)[0] = temp;
-        ((uint64_t *)row)[1] = temp;
+        AV_WN64A(row, temp);
+        AV_WN64A(row+4, temp);
         return;
     }
 #else
-    if (!(((uint32_t*)row)[1] |
-          ((uint32_t*)row)[2] |
-          ((uint32_t*)row)[3] |
+    if (!(AV_RN32A(row+2) |
+          AV_RN32A(row+4) |
+          AV_RN32A(row+6) |
           row[1])) {
         uint32_t temp;
-        if (DC_SHIFT - extra_shift > 0) {
+        if (DC_SHIFT - extra_shift >= 0) {
             temp = (row[0] * (1 << (DC_SHIFT - extra_shift))) & 0xffff;
         } else {
-            temp = (row[0] >> (extra_shift - DC_SHIFT)) & 0xffff;
+            temp = ((row[0] + (1<<(extra_shift - DC_SHIFT-1))) >> (extra_shift - DC_SHIFT)) & 0xffff;
         }
         temp += temp * (1 << 16);
-        ((uint32_t*)row)[0]=((uint32_t*)row)[1] =
-            ((uint32_t*)row)[2]=((uint32_t*)row)[3] = temp;
+        AV_WN32A(row, temp);
+        AV_WN32A(row+2, temp);
+        AV_WN32A(row+4, temp);
+        AV_WN32A(row+6, temp);
         return;
     }
 #endif
 
-    a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
+    a0 = (W4 * row[0]) + (1 << (ROW_SHIFT + extra_shift - 1));
     a1 = a0;
     a2 = a0;
     a3 = a0;
@@ -220,6 +247,9 @@ static inline void FUNC(idctRowCondDC)(int16_t *row, int extra_shift)
         }                                               \
     } while (0)
 
+#ifdef EXTRA_SHIFT
+static inline void FUNC(idctSparseCol_extrashift)(int16_t *col)
+#else
 static inline void FUNC(idctSparseColPut)(pixel *dest, int line_size,
                                           int16_t *col)
 {
@@ -269,6 +299,7 @@ static inline void FUNC(idctSparseColAdd)(pixel *dest, int line_size,
 }
 
 static inline void FUNC(idctSparseCol)(int16_t *col)
+#endif
 {
     int a0, a1, a2, a3, b0, b1, b2, b3;
 
@@ -284,6 +315,7 @@ static inline void FUNC(idctSparseCol)(int16_t *col)
     col[56] = ((a0 - b0) >> COL_SHIFT);
 }
 
+#ifndef EXTRA_SHIFT
 void FUNC(ff_simple_idct_put)(uint8_t *dest_, int line_size, int16_t *block)
 {
     pixel *dest = (pixel *)dest_;
@@ -322,3 +354,4 @@ void FUNC(ff_simple_idct)(int16_t *block)
     for (i = 0; i < 8; i++)
         FUNC(idctSparseCol)(block + i);
 }
+#endif
diff --git a/libavcodec/sinewin.c b/libavcodec/sinewin.c
index be38dbc..4532dc7 100644
--- a/libavcodec/sinewin.c
+++ b/libavcodec/sinewin.c
@@ -1,20 +1,21 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#define USE_FIXED 0
 #include "sinewin.h"
 #include "sinewin_tablegen.h"
diff --git a/libavcodec/sinewin.h b/libavcodec/sinewin.h
index 478036d..27c107c 100644
--- a/libavcodec/sinewin.h
+++ b/libavcodec/sinewin.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 Robert Swain
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,6 +23,7 @@
 
 #include "config.h"
 #include "libavutil/mem.h"
+#include "libavcodec/aac_defines.h"
 
 #if CONFIG_HARDCODED_TABLES
 #   define SINETABLE_CONST const
@@ -30,20 +31,24 @@
 #   define SINETABLE_CONST
 #endif
 
+#ifndef USE_FIXED
+#define USE_FIXED 0
+#endif
+
 #define SINETABLE(size) \
-    SINETABLE_CONST DECLARE_ALIGNED(32, float, ff_sine_##size)[size]
+    SINETABLE_CONST DECLARE_ALIGNED(32, INTFLOAT, AAC_RENAME(ff_sine_##size))[size]
 
 /**
  * Generate a sine window.
  * @param   window  pointer to half window
  * @param   n       size of half window
  */
-void ff_sine_window_init(float *window, int n);
+void AAC_RENAME(ff_sine_window_init)(INTFLOAT *window, int n);
 
 /**
  * initialize the specified entry of ff_sine_windows
  */
-void ff_init_ff_sine_windows(int index);
+void AAC_RENAME(ff_init_ff_sine_windows)(int index);
 
 extern SINETABLE(  32);
 extern SINETABLE(  64);
@@ -55,6 +60,6 @@ extern SINETABLE(2048);
 extern SINETABLE(4096);
 extern SINETABLE(8192);
 
-extern SINETABLE_CONST float * const ff_sine_windows[14];
+extern SINETABLE_CONST INTFLOAT * const AAC_RENAME(ff_sine_windows)[14];
 
 #endif /* AVCODEC_SINEWIN_H */
diff --git a/libavcodec/sinewin_fixed.c b/libavcodec/sinewin_fixed.c
new file mode 100644
index 0000000..27ead29
--- /dev/null
+++ b/libavcodec/sinewin_fixed.c
@@ -0,0 +1,21 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define USE_FIXED 1
+#include "sinewin.h"
+#include "sinewin_tablegen.h"
diff --git a/libavcodec/sinewin_fixed_tablegen.c b/libavcodec/sinewin_fixed_tablegen.c
new file mode 100644
index 0000000..977e6f3c
--- /dev/null
+++ b/libavcodec/sinewin_fixed_tablegen.c
@@ -0,0 +1,24 @@
+/*
+ * Generate a header file for hardcoded sine windows
+ *
+ * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define USE_FIXED 1
+#include "sinewin_tablegen_template.c"
diff --git a/libavcodec/sinewin_tablegen.c b/libavcodec/sinewin_tablegen.c
index 90a75c2..dd60266 100644
--- a/libavcodec/sinewin_tablegen.c
+++ b/libavcodec/sinewin_tablegen.c
@@ -3,44 +3,22 @@
  *
  * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <stdlib.h>
-#define CONFIG_HARDCODED_TABLES 0
-#define SINETABLE_CONST
-#define SINETABLE(size) \
-    float ff_sine_##size[size]
-#define FF_ARRAY_ELEMS(a) (sizeof(a) / sizeof((a)[0]))
-#include "sinewin_tablegen.h"
-#include "tableprint.h"
-
-int main(void)
-{
-    int i;
-
-    write_fileheader();
-
-    for (i = 5; i <= 13; i++) {
-        ff_init_ff_sine_windows(i);
-        printf("SINETABLE(%4i) = {\n", 1 << i);
-        write_float_array(ff_sine_windows[i], 1 << i);
-        printf("};\n");
-    }
-
-    return 0;
-}
+#define USE_FIXED 0
+#include "sinewin_tablegen_template.c"
diff --git a/libavcodec/sinewin_tablegen.h b/libavcodec/sinewin_tablegen.h
index 1ee225b..4432135 100644
--- a/libavcodec/sinewin_tablegen.h
+++ b/libavcodec/sinewin_tablegen.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
 // do not use libavutil/libm.h since this is compiled both
 // for the host and the target and config.h is only valid for the target
 #include <math.h>
+#include "libavcodec/aac_defines.h"
 #include "libavutil/attributes.h"
 #include "libavutil/common.h"
 
@@ -41,26 +42,37 @@ SINETABLE(2048);
 SINETABLE(4096);
 SINETABLE(8192);
 #else
+#if USE_FIXED
+#include "libavcodec/sinewin_fixed_tables.h"
+#else
 #include "libavcodec/sinewin_tables.h"
 #endif
+#endif
+
+#if USE_FIXED // fixed-point build: convert each sine value to an int
+#define SIN_FIX(a) (int)floor((a) * 0x80000000 + 0.5) // scale by 2^31, round half up; NOTE(review): a == 1.0 yields 2^31, which does not fit a 32-bit int
+#else
+#define SIN_FIX(a) a // otherwise the value is passed through unchanged
+#endif
 
-SINETABLE_CONST float * const ff_sine_windows[] = {
+SINETABLE_CONST INTFLOAT * const AAC_RENAME(ff_sine_windows)[] = {
     NULL, NULL, NULL, NULL, NULL, // unused
-    ff_sine_32 , ff_sine_64 ,
-    ff_sine_128, ff_sine_256, ff_sine_512, ff_sine_1024, ff_sine_2048, ff_sine_4096, ff_sine_8192
+    AAC_RENAME(ff_sine_32) , AAC_RENAME(ff_sine_64), AAC_RENAME(ff_sine_128),
+    AAC_RENAME(ff_sine_256), AAC_RENAME(ff_sine_512), AAC_RENAME(ff_sine_1024),
+    AAC_RENAME(ff_sine_2048), AAC_RENAME(ff_sine_4096), AAC_RENAME(ff_sine_8192)
 };
 
 // Generate a sine window.
-av_cold void ff_sine_window_init(float *window, int n) {
+av_cold void AAC_RENAME(ff_sine_window_init)(INTFLOAT *window, int n) { // writes window[0 .. n-1]
     int i;
     for(i = 0; i < n; i++)
-        window[i] = sinf((i + 0.5) * (M_PI / (2.0 * n)));
+        window[i] = SIN_FIX(sinf((i + 0.5) * (M_PI / (2.0 * n)))); // angle (i + 0.5) * pi / (2n) stays inside (0, pi/2); SIN_FIX then converts the result for the build type
 }
 
-av_cold void ff_init_ff_sine_windows(int index) {
-    assert(index >= 0 && index < FF_ARRAY_ELEMS(ff_sine_windows));
+av_cold void AAC_RENAME(ff_init_ff_sine_windows)(int index) { // index selects one entry of the window pointer table
+    assert(index >= 0 && index < FF_ARRAY_ELEMS(AAC_RENAME(ff_sine_windows))); // NOTE(review): range check only; entries 0-4 of the table are NULL and are not rejected here
 #if !CONFIG_HARDCODED_TABLES
-    ff_sine_window_init(ff_sine_windows[index], 1 << index);
+    AAC_RENAME(ff_sine_window_init)(AAC_RENAME(ff_sine_windows)[index], 1 << index); // computed at run time only when tables are not hardcoded; fills 1 << index entries
 #endif
 }
 
diff --git a/libavcodec/sinewin_tablegen_template.c b/libavcodec/sinewin_tablegen_template.c
new file mode 100644
index 0000000..43ce1ba
--- /dev/null
+++ b/libavcodec/sinewin_tablegen_template.c
@@ -0,0 +1,54 @@
+/*
+ * Generate a header file for hardcoded sine windows
+ *
+ * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdlib.h>
+#include "libavcodec/aac_defines.h"
+#define CONFIG_HARDCODED_TABLES 0
+
+#if USE_FIXED
+#define WRITE_FUNC write_int32_t_array
+#else
+#define WRITE_FUNC write_float_array
+#endif
+
+#define SINETABLE_CONST
+#define SINETABLE(size) \
+    INTFLOAT AAC_RENAME(ff_sine_##size)[size]
+#define FF_ARRAY_ELEMS(a) (sizeof(a) / sizeof((a)[0]))
+#include "sinewin_tablegen.h"
+#include "tableprint.h"
+
+int main(void) /* prints every sine window table as a SINETABLE() initializer using printf */
+{
+    int i; /* log2 of the window length; the table for i has 1 << i entries */
+
+    write_fileheader(); /* from tableprint.h; presumably emits the generated-file preamble -- confirm there */
+
+    for (i = 5; i <= 13; i++) { /* window lengths 32 .. 8192 */
+        AAC_RENAME(ff_init_ff_sine_windows)(i); /* fill table i (CONFIG_HARDCODED_TABLES is forced to 0 in this file) */
+        printf("SINETABLE(%4i) = {\n", 1 << i); /* open the initializer for this table */
+        WRITE_FUNC(AAC_RENAME(ff_sine_windows)[i], 1 << i); /* element writer selected by USE_FIXED */
+        printf("};\n");
+    }
+
+    return 0;
+}
diff --git a/libavcodec/sipr.c b/libavcodec/sipr.c
index 3f17686..595097a 100644
--- a/libavcodec/sipr.c
+++ b/libavcodec/sipr.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2008 Vladimir Voroshilov
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -139,7 +139,7 @@ const float ff_pow_0_5[] = {
     1.0/(1 << 13), 1.0/(1 << 14), 1.0/(1 << 15), 1.0/(1 << 16)
 };
 
-static void dequant(float *out, const int *idx, const float *cbs[])
+static void dequant(float *out, const int *idx, const float * const cbs[])
 {
     int i;
     int stride  = 2;
@@ -493,8 +493,8 @@ static av_cold int sipr_decoder_init(AVCodecContext * avctx)
         else if (avctx->bit_rate > 5750 ) ctx->mode = MODE_6k5;
         else                              ctx->mode = MODE_5k0;
         av_log(avctx, AV_LOG_WARNING,
-               "Invalid block_align: %d. Mode %s guessed based on bitrate: %d\n",
-               avctx->block_align, modes[ctx->mode].mode_name, avctx->bit_rate);
+               "Invalid block_align: %d. Mode %s guessed based on bitrate: %"PRId64"\n",
+               avctx->block_align, modes[ctx->mode].mode_name, (int64_t)avctx->bit_rate);
     }
 
     av_log(avctx, AV_LOG_DEBUG, "Mode: %s\n", modes[ctx->mode].mode_name);
@@ -537,16 +537,14 @@ static int sipr_decode_frame(AVCodecContext *avctx, void *data,
         av_log(avctx, AV_LOG_ERROR,
                "Error processing packet: packet size (%d) too small\n",
                avpkt->size);
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
 
     /* get output buffer */
     frame->nb_samples = mode_par->frames_per_packet * subframe_size *
                         mode_par->subframe_count;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples = (float *)frame->data[0];
 
     init_get_bits(&gb, buf, mode_par->bits_per_frame);
diff --git a/libavcodec/sipr.h b/libavcodec/sipr.h
index 4cdea67..34f7f99 100644
--- a/libavcodec/sipr.h
+++ b/libavcodec/sipr.h
@@ -4,20 +4,20 @@
  * Copyright (c) 2008 Vladimir Voroshilov
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/sipr16k.c b/libavcodec/sipr16k.c
index f7fcb34..9c8f684 100644
--- a/libavcodec/sipr16k.c
+++ b/libavcodec/sipr16k.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2008 Vladimir Voroshilov
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,7 +29,6 @@
 #include "libavutil/float_dsp.h"
 #include "libavutil/mathematics.h"
 #include "lsp.h"
-#include "celp_filters.h"
 #include "acelp_vectors.h"
 #include "acelp_pitch_delay.h"
 #include "acelp_filters.h"
@@ -51,7 +50,7 @@ static void lsf2lsp(const float *lsf, double *lsp)
         lsp[i] = cosf(lsf[i]);
 }
 
-static void dequant(float *out, const int *idx, const float *cbs[])
+static void dequant(float *out, const int *idx, const float * const cbs[])
 {
     int i;
 
diff --git a/libavcodec/sipr16kdata.h b/libavcodec/sipr16kdata.h
index ec60c29..16a653d 100644
--- a/libavcodec/sipr16kdata.h
+++ b/libavcodec/sipr16kdata.h
@@ -4,20 +4,20 @@
  * Copyright (c) 2008 Vladimir Voroshilov
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -525,7 +525,7 @@ static const float lsf_cb5_16k[128][4] = {
     { 0.124405,  0.009943, -0.148477, -0.205184}
 };
 
-static const float *lsf_codebooks_16k[] = {
+static const float * const lsf_codebooks_16k[] = {
     lsf_cb1_16k[0], lsf_cb2_16k[0], lsf_cb3_16k[0], lsf_cb4_16k[0],
     lsf_cb5_16k[0]
 };
diff --git a/libavcodec/siprdata.h b/libavcodec/siprdata.h
index 92037a4..0dbc113 100644
--- a/libavcodec/siprdata.h
+++ b/libavcodec/siprdata.h
@@ -4,20 +4,20 @@
  * Copyright (c) 2008 Vladimir Voroshilov
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -206,7 +206,7 @@ static const float lsf_cb5[32][2] = {
     { 0.150514,  0.034366}, { 0.186092, -0.069272}
 };
 
-static const float *lsf_codebooks[] = {
+static const float * const lsf_codebooks[] = {
     lsf_cb1[0], lsf_cb2[0], lsf_cb3[0], lsf_cb4[0], lsf_cb5[0]
 };
 
diff --git a/libavcodec/smacker.c b/libavcodec/smacker.c
index 1ec957a..f4fc16c 100644
--- a/libavcodec/smacker.c
+++ b/libavcodec/smacker.c
@@ -2,20 +2,20 @@
  * Smacker decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -95,10 +95,14 @@ enum SmkBlockTypes {
  */
 static int smacker_decode_tree(GetBitContext *gb, HuffContext *hc, uint32_t prefix, int length)
 {
+    if(length > 32 || length > 3*SMKTREE_BITS) {
+        av_log(NULL, AV_LOG_ERROR, "length too long\n");
+        return AVERROR_INVALIDDATA;
+    }
     if(!get_bits1(gb)){ //Leaf
-        if(hc->current >= 256){
+        if(hc->current >= hc->length){
             av_log(NULL, AV_LOG_ERROR, "Tree size exceeded!\n");
-            return -1;
+            return AVERROR_INVALIDDATA;
         }
         if(length){
             hc->bits[hc->current] = prefix;
@@ -129,14 +133,14 @@ static int smacker_decode_bigtree(GetBitContext *gb, HuffContext *hc, DBCtx *ctx
 {
     if (hc->current + 1 >= hc->length) {
         av_log(NULL, AV_LOG_ERROR, "Tree size exceeded!\n");
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
     if(!get_bits1(gb)){ //Leaf
         int val, i1, i2;
         i1 = ctx->v1->table ? get_vlc2(gb, ctx->v1->table, SMKTREE_BITS, 3) : 0;
         i2 = ctx->v2->table ? get_vlc2(gb, ctx->v2->table, SMKTREE_BITS, 3) : 0;
         if (i1 < 0 || i2 < 0)
-            return -1;
+            return AVERROR_INVALIDDATA;
         val = ctx->recode1[i1] | (ctx->recode2[i2] << 8);
         if(val == ctx->escapes[0]) {
             ctx->last[0] = hc->current;
@@ -168,7 +172,7 @@ static int smacker_decode_bigtree(GetBitContext *gb, HuffContext *hc, DBCtx *ctx
 }
 
 /**
- * Store large tree as Libav's vlc codes
+ * Store large tree as FFmpeg's vlc codes
  */
 static int smacker_decode_header_tree(SmackVContext *smk, GetBitContext *gb, int **recodes, int *last, int size)
 {
@@ -182,7 +186,7 @@ static int smacker_decode_header_tree(SmackVContext *smk, GetBitContext *gb, int
 
     if(size >= UINT_MAX>>4){ // (((size + 3) >> 2) + 3) << 2 must not overflow
         av_log(smk->avctx, AV_LOG_ERROR, "size too large\n");
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
 
     tmp1.length = 256;
@@ -205,40 +209,51 @@ static int smacker_decode_header_tree(SmackVContext *smk, GetBitContext *gb, int
     }
 
     if(get_bits1(gb)) {
-        smacker_decode_tree(gb, &tmp1, 0, 0);
-        skip_bits1(gb);
-        res = init_vlc(&vlc[0], SMKTREE_BITS, tmp1.length,
-                    tmp1.lengths, sizeof(int), sizeof(int),
-                    tmp1.bits, sizeof(uint32_t), sizeof(uint32_t), INIT_VLC_LE);
-        if(res < 0) {
-            av_log(smk->avctx, AV_LOG_ERROR, "Cannot build VLC table\n");
+        res = smacker_decode_tree(gb, &tmp1, 0, 0);
+        if (res < 0) {
             err = res;
             goto error;
         }
-    } else {
+        skip_bits1(gb);
+        if(tmp1.current > 1) {
+            res = init_vlc(&vlc[0], SMKTREE_BITS, tmp1.length,
+                        tmp1.lengths, sizeof(int), sizeof(int),
+                        tmp1.bits, sizeof(uint32_t), sizeof(uint32_t), INIT_VLC_LE);
+            if(res < 0) {
+                av_log(smk->avctx, AV_LOG_ERROR, "Cannot build VLC table\n");
+                err = res;
+                goto error;
+            }
+        }
+    }
+    if (!vlc[0].table) {
         av_log(smk->avctx, AV_LOG_ERROR, "Skipping low bytes tree\n");
     }
     if(get_bits1(gb)){
-        smacker_decode_tree(gb, &tmp2, 0, 0);
-        skip_bits1(gb);
-        res = init_vlc(&vlc[1], SMKTREE_BITS, tmp2.length,
-                    tmp2.lengths, sizeof(int), sizeof(int),
-                    tmp2.bits, sizeof(uint32_t), sizeof(uint32_t), INIT_VLC_LE);
-        if(res < 0) {
-            av_log(smk->avctx, AV_LOG_ERROR, "Cannot build VLC table\n");
+        res = smacker_decode_tree(gb, &tmp2, 0, 0);
+        if (res < 0) {
             err = res;
             goto error;
         }
-    } else {
+        skip_bits1(gb);
+        if(tmp2.current > 1) {
+            res = init_vlc(&vlc[1], SMKTREE_BITS, tmp2.length,
+                        tmp2.lengths, sizeof(int), sizeof(int),
+                        tmp2.bits, sizeof(uint32_t), sizeof(uint32_t), INIT_VLC_LE);
+            if(res < 0) {
+                av_log(smk->avctx, AV_LOG_ERROR, "Cannot build VLC table\n");
+                err = res;
+                goto error;
+            }
+        }
+    }
+    if (!vlc[1].table) {
         av_log(smk->avctx, AV_LOG_ERROR, "Skipping high bytes tree\n");
     }
 
-    escapes[0]  = get_bits(gb, 8);
-    escapes[0] |= get_bits(gb, 8) << 8;
-    escapes[1]  = get_bits(gb, 8);
-    escapes[1] |= get_bits(gb, 8) << 8;
-    escapes[2]  = get_bits(gb, 8);
-    escapes[2] |= get_bits(gb, 8) << 8;
+    escapes[0]  = get_bits(gb, 16);
+    escapes[1]  = get_bits(gb, 16);
+    escapes[2]  = get_bits(gb, 16);
 
     last[0] = last[1] = last[2] = -1;
 
@@ -254,7 +269,7 @@ static int smacker_decode_header_tree(SmackVContext *smk, GetBitContext *gb, int
     huff.length = ((size + 3) >> 2) + 4;
     huff.maxlength = 0;
     huff.current = 0;
-    huff.values = av_mallocz(huff.length * sizeof(int));
+    huff.values = av_mallocz_array(huff.length, sizeof(int));
     if (!huff.values) {
         err = AVERROR(ENOMEM);
         goto error;
@@ -292,14 +307,16 @@ error:
 
 static int decode_header_trees(SmackVContext *smk) {
     GetBitContext gb;
-    int mmap_size, mclr_size, full_size, type_size;
+    int mmap_size, mclr_size, full_size, type_size, ret;
 
     mmap_size = AV_RL32(smk->avctx->extradata);
     mclr_size = AV_RL32(smk->avctx->extradata + 4);
     full_size = AV_RL32(smk->avctx->extradata + 8);
     type_size = AV_RL32(smk->avctx->extradata + 12);
 
-    init_get_bits(&gb, smk->avctx->extradata + 16, (smk->avctx->extradata_size - 16) * 8);
+    ret = init_get_bits8(&gb, smk->avctx->extradata + 16, smk->avctx->extradata_size - 16);
+    if (ret < 0)
+        return ret;
 
     if(!get_bits1(&gb)) {
         av_log(smk->avctx, AV_LOG_INFO, "Skipping MMAP tree\n");
@@ -309,8 +326,9 @@ static int decode_header_trees(SmackVContext *smk) {
         smk->mmap_tbl[0] = 0;
         smk->mmap_last[0] = smk->mmap_last[1] = smk->mmap_last[2] = 1;
     } else {
-        if (smacker_decode_header_tree(smk, &gb, &smk->mmap_tbl, smk->mmap_last, mmap_size))
-            return -1;
+        ret = smacker_decode_header_tree(smk, &gb, &smk->mmap_tbl, smk->mmap_last, mmap_size);
+        if (ret < 0)
+            return ret;
     }
     if(!get_bits1(&gb)) {
         av_log(smk->avctx, AV_LOG_INFO, "Skipping MCLR tree\n");
@@ -320,8 +338,9 @@ static int decode_header_trees(SmackVContext *smk) {
         smk->mclr_tbl[0] = 0;
         smk->mclr_last[0] = smk->mclr_last[1] = smk->mclr_last[2] = 1;
     } else {
-        if (smacker_decode_header_tree(smk, &gb, &smk->mclr_tbl, smk->mclr_last, mclr_size))
-            return -1;
+        ret = smacker_decode_header_tree(smk, &gb, &smk->mclr_tbl, smk->mclr_last, mclr_size);
+        if (ret < 0)
+            return ret;
     }
     if(!get_bits1(&gb)) {
         av_log(smk->avctx, AV_LOG_INFO, "Skipping FULL tree\n");
@@ -331,8 +350,9 @@ static int decode_header_trees(SmackVContext *smk) {
         smk->full_tbl[0] = 0;
         smk->full_last[0] = smk->full_last[1] = smk->full_last[2] = 1;
     } else {
-        if (smacker_decode_header_tree(smk, &gb, &smk->full_tbl, smk->full_last, full_size))
-            return -1;
+        ret = smacker_decode_header_tree(smk, &gb, &smk->full_tbl, smk->full_last, full_size);
+        if (ret < 0)
+            return ret;
     }
     if(!get_bits1(&gb)) {
         av_log(smk->avctx, AV_LOG_INFO, "Skipping TYPE tree\n");
@@ -342,8 +362,9 @@ static int decode_header_trees(SmackVContext *smk) {
         smk->type_tbl[0] = 0;
         smk->type_last[0] = smk->type_last[1] = smk->type_last[2] = 1;
     } else {
-        if (smacker_decode_header_tree(smk, &gb, &smk->type_tbl, smk->type_last, type_size))
-            return -1;
+        ret = smacker_decode_header_tree(smk, &gb, &smk->type_tbl, smk->type_last, type_size);
+        if (ret < 0)
+            return ret;
     }
 
     return 0;
@@ -387,12 +408,10 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     int flags;
 
     if (avpkt->size <= 769)
-        return 0;
+        return AVERROR_INVALIDDATA;
 
-    if ((ret = ff_reget_buffer(avctx, smk->pic)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, smk->pic)) < 0)
         return ret;
-    }
 
     /* make the palette available on the way out */
     pal = (uint32_t*)smk->pic->data[1];
@@ -400,25 +419,25 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     flags = bytestream2_get_byteu(&gb2);
     smk->pic->palette_has_changed = flags & 1;
     smk->pic->key_frame = !!(flags & 2);
-    if(smk->pic->key_frame)
+    if (smk->pic->key_frame)
         smk->pic->pict_type = AV_PICTURE_TYPE_I;
     else
         smk->pic->pict_type = AV_PICTURE_TYPE_P;
 
     for(i = 0; i < 256; i++)
-        *pal++ = bytestream2_get_be24u(&gb2);
+        *pal++ = 0xFFU << 24 | bytestream2_get_be24u(&gb2);
 
     last_reset(smk->mmap_tbl, smk->mmap_last);
     last_reset(smk->mclr_tbl, smk->mclr_last);
     last_reset(smk->full_tbl, smk->full_last);
     last_reset(smk->type_tbl, smk->type_last);
-    init_get_bits(&gb, avpkt->data + 769, (avpkt->size - 769) * 8);
+    if ((ret = init_get_bits8(&gb, avpkt->data + 769, avpkt->size - 769)) < 0)
+        return ret;
 
     blk = 0;
     bw = avctx->width >> 2;
     bh = avctx->height >> 2;
     blocks = bw * bh;
-    out = smk->pic->data[0];
     stride = smk->pic->linesize[0];
     while(blk < blocks) {
         int type, run, mode;
@@ -479,7 +498,6 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                     out += stride;
                     out[0] = out[1] = pix & 0xFF;
                     out[2] = out[3] = pix >> 8;
-                    out += stride;
                     break;
                 case 2:
                     for(i = 0; i < 2; i++) {
@@ -547,6 +565,7 @@ static av_cold int decode_end(AVCodecContext *avctx)
 static av_cold int decode_init(AVCodecContext *avctx)
 {
     SmackVContext * const c = avctx->priv_data;
+    int ret;
 
     c->avctx = avctx;
 
@@ -559,19 +578,20 @@ static av_cold int decode_init(AVCodecContext *avctx)
     /* decode huffman trees from extradata */
     if(avctx->extradata_size < 16){
         av_log(avctx, AV_LOG_ERROR, "Extradata missing!\n");
-        return -1;
+        decode_end(avctx);
+        return AVERROR(EINVAL);
     }
 
-    if (decode_header_trees(c)) {
+    ret = decode_header_trees(c);
+    if (ret < 0) {
         decode_end(avctx);
-        return -1;
+        return ret;
     }
 
     return 0;
 }
 
 
-
 static av_cold int smka_decode_init(AVCodecContext *avctx)
 {
     if (avctx->channels < 1 || avctx->channels > 2) {
@@ -611,7 +631,13 @@ static int smka_decode_frame(AVCodecContext *avctx, void *data,
 
     unp_size = AV_RL32(buf);
 
-    init_get_bits(&gb, buf + 4, (buf_size - 4) * 8);
+    if (unp_size > (1U<<24)) {
+        av_log(avctx, AV_LOG_ERROR, "packet is too big\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if ((ret = init_get_bits8(&gb, buf + 4, buf_size - 4)) < 0)
+        return ret;
 
     if(!get_bits1(&gb)){
         av_log(avctx, AV_LOG_INFO, "Sound: no data\n");
@@ -624,17 +650,19 @@ static int smka_decode_frame(AVCodecContext *avctx, void *data,
         av_log(avctx, AV_LOG_ERROR, "channels mismatch\n");
         return AVERROR(EINVAL);
     }
-    if (bits && avctx->sample_fmt == AV_SAMPLE_FMT_U8) {
+    if (bits == (avctx->sample_fmt == AV_SAMPLE_FMT_U8)) {
         av_log(avctx, AV_LOG_ERROR, "sample format mismatch\n");
         return AVERROR(EINVAL);
     }
 
     /* get output buffer */
     frame->nb_samples = unp_size / (avctx->channels * (bits + 1));
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return ret;
+    if (unp_size % (avctx->channels * (bits + 1))) {
+        av_log(avctx, AV_LOG_ERROR, "unp_size %d is odd\n", unp_size);
+        return AVERROR(EINVAL);
     }
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
     samples  = (int16_t *)frame->data[0];
     samples8 =            frame->data[0];
 
@@ -674,16 +702,26 @@ static int smka_decode_frame(AVCodecContext *avctx, void *data,
         for(i = 0; i <= stereo; i++)
             *samples++ = pred[i];
         for(; i < unp_size / 2; i++) {
+            if(get_bits_left(&gb)<0)
+                return AVERROR_INVALIDDATA;
             if(i & stereo) {
                 if(vlc[2].table)
                     res = get_vlc2(&gb, vlc[2].table, SMKTREE_BITS, 3);
                 else
                     res = 0;
+                if (res < 0) {
+                    av_log(avctx, AV_LOG_ERROR, "invalid vlc\n");
+                    return AVERROR_INVALIDDATA;
+                }
                 val  = h[2].values[res];
                 if(vlc[3].table)
                     res = get_vlc2(&gb, vlc[3].table, SMKTREE_BITS, 3);
                 else
                     res = 0;
+                if (res < 0) {
+                    av_log(avctx, AV_LOG_ERROR, "invalid vlc\n");
+                    return AVERROR_INVALIDDATA;
+                }
                 val |= h[3].values[res] << 8;
                 pred[1] += sign_extend(val, 16);
                 *samples++ = pred[1];
@@ -692,11 +730,19 @@ static int smka_decode_frame(AVCodecContext *avctx, void *data,
                     res = get_vlc2(&gb, vlc[0].table, SMKTREE_BITS, 3);
                 else
                     res = 0;
+                if (res < 0) {
+                    av_log(avctx, AV_LOG_ERROR, "invalid vlc\n");
+                    return AVERROR_INVALIDDATA;
+                }
                 val  = h[0].values[res];
                 if(vlc[1].table)
                     res = get_vlc2(&gb, vlc[1].table, SMKTREE_BITS, 3);
                 else
                     res = 0;
+                if (res < 0) {
+                    av_log(avctx, AV_LOG_ERROR, "invalid vlc\n");
+                    return AVERROR_INVALIDDATA;
+                }
                 val |= h[1].values[res] << 8;
                 pred[0] += sign_extend(val, 16);
                 *samples++ = pred[0];
@@ -708,11 +754,17 @@ static int smka_decode_frame(AVCodecContext *avctx, void *data,
         for(i = 0; i <= stereo; i++)
             *samples8++ = pred[i];
         for(; i < unp_size; i++) {
+            if(get_bits_left(&gb)<0)
+                return AVERROR_INVALIDDATA;
             if(i & stereo){
                 if(vlc[1].table)
                     res = get_vlc2(&gb, vlc[1].table, SMKTREE_BITS, 3);
                 else
                     res = 0;
+                if (res < 0) {
+                    av_log(avctx, AV_LOG_ERROR, "invalid vlc\n");
+                    return AVERROR_INVALIDDATA;
+                }
                 pred[1] += sign_extend(h[1].values[res], 8);
                 *samples8++ = pred[1];
             } else {
@@ -720,6 +772,10 @@ static int smka_decode_frame(AVCodecContext *avctx, void *data,
                     res = get_vlc2(&gb, vlc[0].table, SMKTREE_BITS, 3);
                 else
                     res = 0;
+                if (res < 0) {
+                    av_log(avctx, AV_LOG_ERROR, "invalid vlc\n");
+                    return AVERROR_INVALIDDATA;
+                }
                 pred[0] += sign_extend(h[0].values[res], 8);
                 *samples8++ = pred[0];
             }
diff --git a/libavcodec/smc.c b/libavcodec/smc.c
index 92e522b..69d78ad 100644
--- a/libavcodec/smc.c
+++ b/libavcodec/smc.c
@@ -2,20 +2,20 @@
  * Quicktime Graphics (SMC) Video Decoder
  * Copyright (C) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -84,7 +84,7 @@ static void smc_decode_stream(SmcContext *s)
     int stride = s->frame->linesize[0];
     int i;
     int chunk_size;
-    int buf_size = (int) (s->gb.buffer_end - s->gb.buffer_start);
+    int buf_size = bytestream2_size(&s->gb);
     unsigned char opcode;
     int n_blocks;
     unsigned int color_flags;
@@ -92,7 +92,7 @@ static void smc_decode_stream(SmcContext *s)
     unsigned int color_flags_b;
     unsigned int flag_mask;
 
-    unsigned char *pixels = s->frame->data[0];
+    unsigned char * const pixels = s->frame->data[0];
 
     int image_size = height * s->frame->linesize[0];
     int row_ptr = 0;
@@ -436,10 +436,8 @@ static int smc_decode_frame(AVCodecContext *avctx,
 
     bytestream2_init(&s->gb, buf, buf_size);
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
     if (pal) {
         s->frame->palette_has_changed = 1;
diff --git a/libavcodec/smvjpegdec.c b/libavcodec/smvjpegdec.c
new file mode 100644
index 0000000..9057e86
--- /dev/null
+++ b/libavcodec/smvjpegdec.c
@@ -0,0 +1,220 @@
+/*
+ * SMV JPEG decoder
+ * Copyright (c) 2013 Ash Hughes
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SMV JPEG decoder.
+ */
+
+// #define DEBUG
+#include "avcodec.h"
+#include "libavutil/opt.h"
+#include "libavutil/imgutils.h"
+#include "mjpegdec.h"
+#include "internal.h"
+
+typedef struct SMVJpegDecodeContext { /* private state of the SMV JPEG decoder */
+    MJpegDecodeContext jpg; /* only its picture_ptr member is touched in this file */
+    AVFrame *picture[2]; /* [0]: image produced by the inner MJPEG decoder, [1]: view of one sub-frame of it */
+    AVCodecContext* avctx; /* inner MJPEG decoder context, allocated in init and freed in close */
+    int frames_per_jpeg; /* sub-frames per JPEG image, read little-endian from extradata */
+    int mjpeg_data_size; /* got-picture output of the last inner decode call; 0 means no image is available */
+} SMVJpegDecodeContext;
+
+/* Store in *dst the address of sub-image number nlines inside plane src, where
+ * each sub-image spans height rows of src_linesize bytes; no-op on NULL input. */
+static inline void smv_img_pnt_plane(uint8_t      **dst, uint8_t *src,
+                                     int src_linesize, int height, int nlines)
+{
+    if (dst && src)
+        *dst = src + nlines * src_linesize * height;
+}
+
+/* Point dst_data[] at sub-image number nlines of a vertically stacked picture
+ * described by src_data[]/src_linesizes[]; height is the height of one
+ * sub-image. Planes 1 and 2 use the chroma-subsampled height. Does nothing
+ * for hwaccel pixel formats. width is unused. */
+static inline void smv_img_pnt(uint8_t *dst_data[4], uint8_t *src_data[4],
+                               const int src_linesizes[4],
+                               enum AVPixelFormat pix_fmt, int width, int height,
+                               int nlines)
+{
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
+    const int chroma_height = AV_CEIL_RSHIFT(height, desc->log2_chroma_h);
+    int plane, comp, plane_count = 0;
+
+    if (desc->flags & AV_PIX_FMT_FLAG_HWACCEL)
+        return;
+    /* the number of planes is the highest plane index of any component, plus one */
+    for (comp = 0; comp < desc->nb_components; comp++)
+        if (desc->comp[comp].plane >= plane_count)
+            plane_count = desc->comp[comp].plane + 1;
+    for (plane = 0; plane < plane_count; plane++)
+        smv_img_pnt_plane(&dst_data[plane], src_data[plane], src_linesizes[plane],
+                          (plane == 1 || plane == 2) ? chroma_height : height, nlines);
+    /* for (pseudo-)paletted formats data[1] is passed through without an offset */
+    if (desc->flags & (AV_PIX_FMT_FLAG_PAL | AV_PIX_FMT_FLAG_PSEUDOPAL))
+        dst_data[1] = src_data[1];
+}
+
+/* Free both frames and the inner MJPEG context (init also calls this on failure). */
+static av_cold int smvjpeg_decode_end(AVCodecContext *avctx)
+{
+    SMVJpegDecodeContext *ctx = avctx->priv_data;
+    int close_status;
+
+    ctx->jpg.picture_ptr = NULL; /* it pointed at picture[0], which is freed next */
+    av_frame_free(&ctx->picture[0]);
+    av_frame_free(&ctx->picture[1]);
+    close_status = avcodec_close(ctx->avctx);
+    av_freep(&ctx->avctx);
+    return close_status; /* result of avcodec_close() on the inner context */
+}
+
+/* Allocate the work frames, read frames_per_jpeg from extradata and open the
+ * inner MJPEG decoder. Returns 0, or a negative AVERROR after cleaning up. */
+static av_cold int smvjpeg_decode_init(AVCodecContext *avctx)
+{
+    SMVJpegDecodeContext *s = avctx->priv_data;
+    AVCodec *codec;
+    AVDictionary *thread_opt = NULL;
+    int ret = 0, r;
+
+    s->frames_per_jpeg = 0;
+    s->picture[0] = av_frame_alloc();
+    s->picture[1] = av_frame_alloc();
+    if (!s->picture[0] || !s->picture[1]) {
+        av_frame_free(&s->picture[0]);
+        av_frame_free(&s->picture[1]);
+        return AVERROR(ENOMEM);
+    }
+    s->jpg.picture_ptr      = s->picture[0];
+    if (avctx->extradata_size >= 4)
+        s->frames_per_jpeg = AV_RL32(avctx->extradata);
+    if (s->frames_per_jpeg <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid number of frames per jpeg.\n");
+        ret = AVERROR_INVALIDDATA; /* returned by the common exit at the end */
+    }
+
+    codec = avcodec_find_decoder(AV_CODEC_ID_MJPEG);
+    if (!codec) {
+        av_log(avctx, AV_LOG_ERROR, "MJPEG codec not found\n");
+        smvjpeg_decode_end(avctx);
+        return AVERROR_DECODER_NOT_FOUND;
+    }
+
+    s->avctx = avcodec_alloc_context3(codec);
+    if (!s->avctx) { /* out of memory: the context must not be dereferenced */
+        smvjpeg_decode_end(avctx);
+        return AVERROR(ENOMEM);
+    }
+
+    av_dict_set(&thread_opt, "threads", "1", 0); /* the inner decoder is opened with one thread */
+    s->avctx->refcounted_frames = 1;
+    s->avctx->flags = avctx->flags;
+    s->avctx->idct_algo = avctx->idct_algo;
+    if ((r = ff_codec_open2_recursive(s->avctx, codec, &thread_opt)) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "MJPEG codec failed to open\n");
+        ret = r;
+    }
+    av_dict_free(&thread_opt);
+
+    if (ret < 0)
+        smvjpeg_decode_end(avctx);
+    return ret;
+}
+
+/* Output sub-frame (pts % frames_per_jpeg) of the current JPEG image; the JPEG
+ * is decoded from avpkt only when that index is 0. Returns <0 on error. */
+static int smvjpeg_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
+                            AVPacket *avpkt)
+{
+    const AVPixFmtDescriptor *desc;
+    SMVJpegDecodeContext *s = avctx->priv_data;
+    AVFrame* mjpeg_data = s->picture[0];
+    int i, cur_frame = 0, ret = 0;
+
+    cur_frame = avpkt->pts % s->frames_per_jpeg;
+    if (cur_frame < 0) /* negative pts: the sub-frame offset would precede the image */
+        return AVERROR_INVALIDDATA;
+
+    /* Are we at the start of a block? */
+    if (!cur_frame) {
+        av_frame_unref(mjpeg_data);
+        ret = avcodec_decode_video2(s->avctx, mjpeg_data, &s->mjpeg_data_size, avpkt);
+        if (ret < 0) {
+            s->mjpeg_data_size = 0;
+            return ret;
+        }
+    } else if (!s->mjpeg_data_size)
+        return AVERROR(EINVAL);
+
+    desc = av_pix_fmt_desc_get(s->avctx->pix_fmt);
+    av_assert0(desc);
+
+    if (mjpeg_data->height % (s->frames_per_jpeg << desc->log2_chroma_h)) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid height\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    /* report the got-picture state of the most recently decoded JPEG */
+    *data_size = s->mjpeg_data_size;
+    avctx->pix_fmt = s->avctx->pix_fmt;
+
+    /* frames_per_jpeg is > 0 here: other values were rejected in init */
+    ret = ff_set_dimensions(avctx, mjpeg_data->width, mjpeg_data->height / s->frames_per_jpeg);
+    if (ret < 0) {
+        av_log(s, AV_LOG_ERROR, "Failed to set dimensions\n");
+        return ret;
+    }
+
+    if (*data_size) {
+        s->picture[1]->extended_data = NULL;
+        s->picture[1]->width         = avctx->width;
+        s->picture[1]->height        = avctx->height;
+        s->picture[1]->format        = avctx->pix_fmt;
+        smv_img_pnt(s->picture[1]->data, mjpeg_data->data, mjpeg_data->linesize,
+                    avctx->pix_fmt, avctx->width, avctx->height, cur_frame);
+        for (i = 0; i < AV_NUM_DATA_POINTERS; i++)
+            s->picture[1]->linesize[i] = mjpeg_data->linesize[i];
+        ret = av_frame_ref(data, s->picture[1]);
+    }
+
+    return ret;
+}
+
+static const AVClass smvjpegdec_class = { /* installed as .priv_class of ff_smvjpeg_decoder */
+    .class_name = "SMVJPEG decoder",
+    .item_name  = av_default_item_name,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_smvjpeg_decoder = { /* codec entry for the SMV JPEG video decoder */
+    .name           = "smvjpeg",
+    .long_name      = NULL_IF_CONFIG_SMALL("SMV JPEG"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_SMVJPEG,
+    .priv_data_size = sizeof(SMVJpegDecodeContext), /* the callbacks read this context from avctx->priv_data */
+    .init           = smvjpeg_decode_init,
+    .close          = smvjpeg_decode_end, /* also called by init itself when setup fails */
+    .decode         = smvjpeg_decode_frame,
+    .priv_class     = &smvjpegdec_class,
+};
diff --git a/libavcodec/snappy.c b/libavcodec/snappy.c
index df6c6b3..7900b0f 100644
--- a/libavcodec/snappy.c
+++ b/libavcodec/snappy.c
@@ -2,20 +2,20 @@
  * Snappy decompression algorithm
  * Copyright (c) 2015 Luca Barbato
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -148,7 +148,7 @@ int ff_snappy_uncompress(GetByteContext *gb, uint8_t *buf, int64_t *size)
         return len;
 
     if (len > *size)
-        return AVERROR_BUG;
+        return AVERROR_BUFFER_TOO_SMALL;
 
     *size = len;
     p     = buf;
diff --git a/libavcodec/snappy.h b/libavcodec/snappy.h
index 8d365c0..a65cb3a 100644
--- a/libavcodec/snappy.h
+++ b/libavcodec/snappy.h
@@ -2,20 +2,20 @@
  * Snappy module
  * Copyright (c) Luca Barbato
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/snow.c b/libavcodec/snow.c
new file mode 100644
index 0000000..a3e6afc
--- /dev/null
+++ b/libavcodec/snow.c
@@ -0,0 +1,733 @@
+/*
+ * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intmath.h"
+#include "libavutil/log.h"
+#include "libavutil/opt.h"
+#include "avcodec.h"
+#include "me_cmp.h"
+#include "snow_dwt.h"
+#include "internal.h"
+#include "snow.h"
+#include "snowdata.h"
+
+#include "rangecoder.h"
+#include "mathops.h"
+#include "h263.h"
+
+
+void ff_snow_inner_add_yblock(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, /* blend four b_w x b_h predictions using the obmc weight table */
+                              int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){ /* add != 0: sum with the slice buffer and write pixels to dst8; add == 0: subtract from the slice buffer */
+    int y, x;
+    IDWTELEM * dst;
+    for(y=0; y<b_h; y++){
+        //FIXME ugly misuse of obmc_stride
+        const uint8_t *obmc1= obmc + y*obmc_stride; /* row y of the weight table */
+        const uint8_t *obmc2= obmc1+ (obmc_stride>>1); /* same row, half a stride to the right */
+        const uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1); /* obmc_stride/2 rows further down */
+        const uint8_t *obmc4= obmc3+ (obmc_stride>>1); /* further down and half a stride to the right */
+        dst = slice_buffer_get_line(sb, src_y + y); /* slice-buffer line for row src_y + y */
+        for(x=0; x<b_w; x++){
+            int v=   obmc1[x] * block[3][x + y*src_stride] /* weighted sum: each weight pointer is paired with one of the four blocks */
+                    +obmc2[x] * block[2][x + y*src_stride]
+                    +obmc3[x] * block[1][x + y*src_stride]
+                    +obmc4[x] * block[0][x + y*src_stride];
+
+            v <<= 8 - LOG2_OBMC_MAX; /* presumably rescales the sum to 8 fractional bits -- confirm against LOG2_OBMC_MAX */
+            if(FRAC_BITS != 8){
+                v >>= 8 - FRAC_BITS;
+            }
+            if(add){
+                v += dst[x + src_x];
+                v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS; /* round to nearest while dropping FRAC_BITS low bits */
+                if(v&(~255)) v= ~(v>>31); /* clip once stored as uint8_t: negative -> 0, above 255 -> 255 (relies on arithmetic shift of a 32-bit int) */
+                dst8[x + y*src_stride] = v;
+            }else{
+                dst[x + src_x] -= v;
+            }
+        }
+    }
+}
+
+/* Get a reference-flagged buffer for frame. For encoders the buffer is
+ * requested 2*EDGE_WIDTH larger in each dimension and the data pointers are
+ * then moved past the top/left border. Returns 0 or a negative error code. */
+int ff_snow_get_buffer(SnowContext *s, AVFrame *frame)
+{
+    const int with_edges = av_codec_is_encoder(s->avctx->codec);
+    const int border     = with_edges ? 2 * EDGE_WIDTH : 0;
+    int plane, err;
+
+    frame->width  = s->avctx->width  + border;
+    frame->height = s->avctx->height + border;
+    err = ff_get_buffer(s->avctx, frame, AV_GET_BUFFER_FLAG_REF);
+    if (err < 0)
+        return err;
+    if (!with_edges)
+        return 0;
+    for (plane = 0; frame->data[plane]; plane++) {
+        const int vshift = plane ? s->chroma_v_shift : 0;
+        const int hshift = plane ? s->chroma_h_shift : 0;
+        frame->data[plane] += (EDGE_WIDTH >> vshift) * frame->linesize[plane] +
+                              (EDGE_WIDTH >> hshift);
+    }
+    frame->width  = s->avctx->width; /* back to the size without the border */
+    frame->height = s->avctx->height;
+    return 0;
+}
+
+/* Reset the subband, header and block state arrays to MID_STATE. */
+void ff_snow_reset_contexts(SnowContext *s){ //FIXME better initial contexts
+    int p, lvl, orient;
+
+    memset(s->header_state, MID_STATE, sizeof(s->header_state));
+    memset(s->block_state, MID_STATE, sizeof(s->block_state));
+    for(p=0; p<3; p++)
+        for(lvl=0; lvl<MAX_DECOMPOSITIONS; lvl++)
+            /* orientation 0 is only reset at level 0 */
+            for(orient=!!lvl; orient<4; orient++)
+                memset(s->plane[p].band[lvl][orient].state, MID_STATE,
+                       sizeof(s->plane[p].band[lvl][orient].state));
+}
+
+/* (Re)allocate the zeroed block array: ceil(width / 2^LOG2_MB_SIZE) times
+ * ceil(height / 2^LOG2_MB_SIZE) entries of sizeof(BlockNode) * 4^block_max_depth. */
+int ff_snow_alloc_blocks(SnowContext *s){
+    const int mb_cols = AV_CEIL_RSHIFT(s->avctx->width,  LOG2_MB_SIZE);
+    const int mb_rows = AV_CEIL_RSHIFT(s->avctx->height, LOG2_MB_SIZE);
+    const size_t node_size = sizeof(BlockNode) << (s->block_max_depth*2);
+
+    s->b_width = mb_cols;
+    s->b_height= mb_rows;
+
+    av_free(s->block); /* drop any earlier array before replacing it */
+    s->block= av_mallocz_array(mb_cols * mb_rows, node_size);
+    return s->block ? 0 : AVERROR(ENOMEM); /* 0 on success */
+}
+
+/* Fill ff_qexp[] with 128 * 2^(i/QROOT) for i = 0 .. QROOT-1, built up by
+ * repeated multiplication and rounded with lrintf(). */
+static av_cold void init_qexp(void){
+    double scale = 128;
+    int idx = 0;
+
+    for(; idx<QROOT; idx++, scale *= pow(2, 1.0 / QROOT))
+        ff_qexp[idx]= lrintf(scale);
+}
+static void mc_block(Plane *p, uint8_t *dst, const uint8_t *src, int stride, int b_w, int b_h, int dx, int dy){
+    static const uint8_t weight[64]={
+    8,7,6,5,4,3,2,1,
+    7,7,0,0,0,0,0,1,
+    6,0,6,0,0,0,2,0,
+    5,0,0,5,0,3,0,0,
+    4,0,0,0,4,0,0,0,
+    3,0,0,5,0,3,0,0,
+    2,0,6,0,0,0,2,0,
+    1,7,0,0,0,0,0,1,
+    };
+
+    static const uint8_t brane[256]={
+    0x00,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x11,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
+    0x04,0x05,0xcc,0xcc,0xcc,0xcc,0xcc,0x41,0x15,0x16,0xcc,0xcc,0xcc,0xcc,0xcc,0x52,
+    0x04,0xcc,0x05,0xcc,0xcc,0xcc,0x41,0xcc,0x15,0xcc,0x16,0xcc,0xcc,0xcc,0x52,0xcc,
+    0x04,0xcc,0xcc,0x05,0xcc,0x41,0xcc,0xcc,0x15,0xcc,0xcc,0x16,0xcc,0x52,0xcc,0xcc,
+    0x04,0xcc,0xcc,0xcc,0x41,0xcc,0xcc,0xcc,0x15,0xcc,0xcc,0xcc,0x16,0xcc,0xcc,0xcc,
+    0x04,0xcc,0xcc,0x41,0xcc,0x05,0xcc,0xcc,0x15,0xcc,0xcc,0x52,0xcc,0x16,0xcc,0xcc,
+    0x04,0xcc,0x41,0xcc,0xcc,0xcc,0x05,0xcc,0x15,0xcc,0x52,0xcc,0xcc,0xcc,0x16,0xcc,
+    0x04,0x41,0xcc,0xcc,0xcc,0xcc,0xcc,0x05,0x15,0x52,0xcc,0xcc,0xcc,0xcc,0xcc,0x16,
+    0x44,0x45,0x45,0x45,0x45,0x45,0x45,0x45,0x55,0x56,0x56,0x56,0x56,0x56,0x56,0x56,
+    0x48,0x49,0xcc,0xcc,0xcc,0xcc,0xcc,0x85,0x59,0x5A,0xcc,0xcc,0xcc,0xcc,0xcc,0x96,
+    0x48,0xcc,0x49,0xcc,0xcc,0xcc,0x85,0xcc,0x59,0xcc,0x5A,0xcc,0xcc,0xcc,0x96,0xcc,
+    0x48,0xcc,0xcc,0x49,0xcc,0x85,0xcc,0xcc,0x59,0xcc,0xcc,0x5A,0xcc,0x96,0xcc,0xcc,
+    0x48,0xcc,0xcc,0xcc,0x49,0xcc,0xcc,0xcc,0x59,0xcc,0xcc,0xcc,0x96,0xcc,0xcc,0xcc,
+    0x48,0xcc,0xcc,0x85,0xcc,0x49,0xcc,0xcc,0x59,0xcc,0xcc,0x96,0xcc,0x5A,0xcc,0xcc,
+    0x48,0xcc,0x85,0xcc,0xcc,0xcc,0x49,0xcc,0x59,0xcc,0x96,0xcc,0xcc,0xcc,0x5A,0xcc,
+    0x48,0x85,0xcc,0xcc,0xcc,0xcc,0xcc,0x49,0x59,0x96,0xcc,0xcc,0xcc,0xcc,0xcc,0x5A,
+    };
+
+    static const uint8_t needs[16]={
+    0,1,0,0,
+    2,4,2,0,
+    0,1,0,0,
+    15
+    };
+
+    int x, y, b, r, l;
+    int16_t tmpIt   [64*(32+HTAPS_MAX)];
+    uint8_t tmp2t[3][64*(32+HTAPS_MAX)];
+    int16_t *tmpI= tmpIt;
+    uint8_t *tmp2= tmp2t[0];
+    const uint8_t *hpel[11];
+    av_assert2(dx<16 && dy<16);
+    r= brane[dx + 16*dy]&15;
+    l= brane[dx + 16*dy]>>4;
+
+    b= needs[l] | needs[r];
+    if(p && !p->diag_mc)
+        b= 15;
+
+    if(b&5){
+        for(y=0; y < b_h+HTAPS_MAX-1; y++){
+            for(x=0; x < b_w; x++){
+                int a_1=src[x + HTAPS_MAX/2-4];
+                int a0= src[x + HTAPS_MAX/2-3];
+                int a1= src[x + HTAPS_MAX/2-2];
+                int a2= src[x + HTAPS_MAX/2-1];
+                int a3= src[x + HTAPS_MAX/2+0];
+                int a4= src[x + HTAPS_MAX/2+1];
+                int a5= src[x + HTAPS_MAX/2+2];
+                int a6= src[x + HTAPS_MAX/2+3];
+                int am=0;
+                if(!p || p->fast_mc){
+                    am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
+                    tmpI[x]= am;
+                    am= (am+16)>>5;
+                }else{
+                    am= p->hcoeff[0]*(a2+a3) + p->hcoeff[1]*(a1+a4) + p->hcoeff[2]*(a0+a5) + p->hcoeff[3]*(a_1+a6);
+                    tmpI[x]= am;
+                    am= (am+32)>>6;
+                }
+
+                if(am&(~255)) am= ~(am>>31);
+                tmp2[x]= am;
+            }
+            tmpI+= 64;
+            tmp2+= 64;
+            src += stride;
+        }
+        src -= stride*y;
+    }
+    src += HTAPS_MAX/2 - 1;
+    tmp2= tmp2t[1];
+
+    if(b&2){
+        for(y=0; y < b_h; y++){
+            for(x=0; x < b_w+1; x++){
+                int a_1=src[x + (HTAPS_MAX/2-4)*stride];
+                int a0= src[x + (HTAPS_MAX/2-3)*stride];
+                int a1= src[x + (HTAPS_MAX/2-2)*stride];
+                int a2= src[x + (HTAPS_MAX/2-1)*stride];
+                int a3= src[x + (HTAPS_MAX/2+0)*stride];
+                int a4= src[x + (HTAPS_MAX/2+1)*stride];
+                int a5= src[x + (HTAPS_MAX/2+2)*stride];
+                int a6= src[x + (HTAPS_MAX/2+3)*stride];
+                int am=0;
+                if(!p || p->fast_mc)
+                    am= (20*(a2+a3) - 5*(a1+a4) + (a0+a5) + 16)>>5;
+                else
+                    am= (p->hcoeff[0]*(a2+a3) + p->hcoeff[1]*(a1+a4) + p->hcoeff[2]*(a0+a5) + p->hcoeff[3]*(a_1+a6) + 32)>>6;
+
+                if(am&(~255)) am= ~(am>>31);
+                tmp2[x]= am;
+            }
+            src += stride;
+            tmp2+= 64;
+        }
+        src -= stride*y;
+    }
+    src += stride*(HTAPS_MAX/2 - 1);
+    tmp2= tmp2t[2];
+    tmpI= tmpIt;
+    if(b&4){
+        for(y=0; y < b_h; y++){
+            for(x=0; x < b_w; x++){
+                int a_1=tmpI[x + (HTAPS_MAX/2-4)*64];
+                int a0= tmpI[x + (HTAPS_MAX/2-3)*64];
+                int a1= tmpI[x + (HTAPS_MAX/2-2)*64];
+                int a2= tmpI[x + (HTAPS_MAX/2-1)*64];
+                int a3= tmpI[x + (HTAPS_MAX/2+0)*64];
+                int a4= tmpI[x + (HTAPS_MAX/2+1)*64];
+                int a5= tmpI[x + (HTAPS_MAX/2+2)*64];
+                int a6= tmpI[x + (HTAPS_MAX/2+3)*64];
+                int am=0;
+                if(!p || p->fast_mc)
+                    am= (20*(a2+a3) - 5*(a1+a4) + (a0+a5) + 512)>>10;
+                else
+                    am= (p->hcoeff[0]*(a2+a3) + p->hcoeff[1]*(a1+a4) + p->hcoeff[2]*(a0+a5) + p->hcoeff[3]*(a_1+a6) + 2048)>>12;
+                if(am&(~255)) am= ~(am>>31);
+                tmp2[x]= am;
+            }
+            tmpI+= 64;
+            tmp2+= 64;
+        }
+    }
+
+    hpel[ 0]= src;
+    hpel[ 1]= tmp2t[0] + 64*(HTAPS_MAX/2-1);
+    hpel[ 2]= src + 1;
+
+    hpel[ 4]= tmp2t[1];
+    hpel[ 5]= tmp2t[2];
+    hpel[ 6]= tmp2t[1] + 1;
+
+    hpel[ 8]= src + stride;
+    hpel[ 9]= hpel[1] + 64;
+    hpel[10]= hpel[8] + 1;
+
+#define MC_STRIDE(x) (needs[x] ? 64 : stride)
+
+    if(b==15){
+        int dxy = dx / 8 + dy / 8 * 4;
+        const uint8_t *src1 = hpel[dxy    ];
+        const uint8_t *src2 = hpel[dxy + 1];
+        const uint8_t *src3 = hpel[dxy + 4];
+        const uint8_t *src4 = hpel[dxy + 5];
+        int stride1 = MC_STRIDE(dxy);
+        int stride2 = MC_STRIDE(dxy + 1);
+        int stride3 = MC_STRIDE(dxy + 4);
+        int stride4 = MC_STRIDE(dxy + 5);
+        dx&=7;
+        dy&=7;
+        for(y=0; y < b_h; y++){
+            for(x=0; x < b_w; x++){
+                dst[x]= ((8-dx)*(8-dy)*src1[x] + dx*(8-dy)*src2[x]+
+                         (8-dx)*   dy *src3[x] + dx*   dy *src4[x]+32)>>6;
+            }
+            src1+=stride1;
+            src2+=stride2;
+            src3+=stride3;
+            src4+=stride4;
+            dst +=stride;
+        }
+    }else{
+        const uint8_t *src1= hpel[l];
+        const uint8_t *src2= hpel[r];
+        int stride1 = MC_STRIDE(l);
+        int stride2 = MC_STRIDE(r);
+        int a= weight[((dx&7) + (8*(dy&7)))];
+        int b= 8-a;
+        for(y=0; y < b_h; y++){
+            for(x=0; x < b_w; x++){
+                dst[x]= (a*src1[x] + b*src2[x] + 4)>>3;
+            }
+            src1+=stride1;
+            src2+=stride2;
+            dst +=stride;
+        }
+    }
+}
+/** Predict one b_w x b_h block into dst: a flat colour fill for intra blocks, otherwise motion compensation from s->last_picture[block->ref]. */
+void ff_snow_pred_block(SnowContext *s, uint8_t *dst, uint8_t *tmp, ptrdiff_t stride, int sx, int sy, int b_w, int b_h, const BlockNode *block, int plane_index, int w, int h){
+    if(block->type & BLOCK_INTRA){
+        int x, y;
+        const unsigned color  = block->color[plane_index];
+        const unsigned color4 = color*0x01010101; // colour byte repeated in all four bytes of a 32-bit word
+        if(b_w==32){ // widths 32/16/8/4 are filled with 32-bit stores, four pixels per store
+            for(y=0; y < b_h; y++){
+                *(uint32_t*)&dst[0 + y*stride]= color4;
+                *(uint32_t*)&dst[4 + y*stride]= color4;
+                *(uint32_t*)&dst[8 + y*stride]= color4;
+                *(uint32_t*)&dst[12+ y*stride]= color4;
+                *(uint32_t*)&dst[16+ y*stride]= color4;
+                *(uint32_t*)&dst[20+ y*stride]= color4;
+                *(uint32_t*)&dst[24+ y*stride]= color4;
+                *(uint32_t*)&dst[28+ y*stride]= color4;
+            }
+        }else if(b_w==16){
+            for(y=0; y < b_h; y++){
+                *(uint32_t*)&dst[0 + y*stride]= color4;
+                *(uint32_t*)&dst[4 + y*stride]= color4;
+                *(uint32_t*)&dst[8 + y*stride]= color4;
+                *(uint32_t*)&dst[12+ y*stride]= color4;
+            }
+        }else if(b_w==8){
+            for(y=0; y < b_h; y++){
+                *(uint32_t*)&dst[0 + y*stride]= color4;
+                *(uint32_t*)&dst[4 + y*stride]= color4;
+            }
+        }else if(b_w==4){
+            for(y=0; y < b_h; y++){
+                *(uint32_t*)&dst[0 + y*stride]= color4;
+            }
+        }else{ // any other width: plain per-pixel fill
+            for(y=0; y < b_h; y++){
+                for(x=0; x < b_w; x++){
+                    dst[x + y*stride]= color;
+                }
+            }
+        }
+    }else{
+        uint8_t *src= s->last_picture[block->ref]->data[plane_index]; // reference plane selected by the block
+        const int scale= plane_index ?  (2*s->mv_scale)>>s->chroma_h_shift : 2*s->mv_scale; // planes other than 0 shift the scale right by chroma_h_shift
+        int mx= block->mx*scale;
+        int my= block->my*scale;
+        const int dx= mx&15; // low 4 bits of the scaled vector: sub-pixel position
+        const int dy= my&15;
+        const int tab_index= 3 - (b_w>>2) + (b_w>>4); // b_w 16,8,4,2 -> 0,1,2,3
+        sx += (mx>>4) - (HTAPS_MAX/2-1); // integer part of the vector, minus a HTAPS_MAX/2-1 pixel margin
+        sy += (my>>4) - (HTAPS_MAX/2-1);
+        src += sx + sy*stride;
+        if(   (unsigned)sx >= FFMAX(w - b_w - (HTAPS_MAX-2), 0)
+           || (unsigned)sy >= FFMAX(h - b_h - (HTAPS_MAX-2), 0)){ // the unsigned casts also catch negative sx/sy
+            s->vdsp.emulated_edge_mc(tmp + MB_SIZE, src,
+                                     stride, stride,
+                                     b_w+HTAPS_MAX-1, b_h+HTAPS_MAX-1,
+                                     sx, sy, w, h);
+            src= tmp + MB_SIZE; // continue from the copy written by emulated_edge_mc()
+        }
+
+        av_assert2(s->chroma_h_shift == s->chroma_v_shift); // only one mv_scale
+
+        av_assert2((tab_index>=0 && tab_index<4) || b_w==32);
+        if(    (dx&3) || (dy&3) // mc_block() handles: positions that are not a multiple of 4,
+            || !(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h) // shapes other than 1:1, 1:2 or 2:1,
+            || (b_w&(b_w-1)) // widths that are not a power of two,
+            || b_w == 1
+            || b_h == 1
+            || !s->plane[plane_index].fast_mc ) // and every block when fast_mc is off
+            mc_block(&s->plane[plane_index], dst, src, stride, b_w, b_h, dx, dy);
+        else if(b_w==32){ // two calls through table 0 (left and right half) per 16 rows
+            int y;
+            for(y=0; y<b_h; y+=16){
+                s->h264qpel.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + y*stride, src + 3 + (y+3)*stride,stride);
+                s->h264qpel.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + 16 + y*stride, src + 19 + (y+3)*stride,stride);
+            }
+        }else if(b_w==b_h) // square block: one call; the +3 offsets equal the HTAPS_MAX/2-1 margin subtracted above
+            s->h264qpel.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst,src + 3 + 3*stride,stride);
+        else if(b_w==2*b_h){ // wide block: left half, then right half
+            s->h264qpel.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst    ,src + 3       + 3*stride,stride);
+            s->h264qpel.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst+b_h,src + 3 + b_h + 3*stride,stride);
+        }else{ // tall block: top half, then bottom half
+            av_assert2(2*b_w==b_h);
+            s->h264qpel.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst           ,src + 3 + 3*stride           ,stride);
+            s->h264qpel.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst+b_w*stride,src + 3 + 3*stride+b_w*stride,stride);
+        }
+    }
+}
+/* mca(dx,dy,b_w) defines mc_block_hpel<dx><dy><b_w>(): a b_w x b_w wrapper that calls mc_block() with p==NULL and src moved back by HTAPS_MAX/2-1 in x and y. */
+#define mca(dx,dy,b_w)\
+static void mc_block_hpel ## dx ## dy ## b_w(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h){\
+    av_assert2(h==b_w);\
+    mc_block(NULL, dst, src-(HTAPS_MAX/2-1)-(HTAPS_MAX/2-1)*stride, stride, b_w, b_w, dx, dy);\
+}
+// one wrapper per (dx,dy) in {0,8}x{0,8}, for block sizes 16 and 8
+mca( 0, 0,16)
+mca( 8, 0,16)
+mca( 0, 8,16)
+mca( 8, 8,16)
+mca( 0, 0,8)
+mca( 8, 0,8)
+mca( 0, 8,8)
+mca( 8, 8,8)
+/** Initialise DSP contexts, motion-compensation function tables, DWT work buffers and frame holders; returns 0, or AVERROR(ENOMEM) via the fail label. */
+av_cold int ff_snow_common_init(AVCodecContext *avctx){
+    SnowContext *s = avctx->priv_data;
+    int width, height;
+    int i, j;
+
+    s->avctx= avctx;
+    s->max_ref_frames=1; //just make sure it's not an invalid value in case of no initial keyframe
+    s->spatial_decomposition_count = 1;
+
+    ff_me_cmp_init(&s->mecc, avctx);
+    ff_hpeldsp_init(&s->hdsp, avctx->flags);
+    ff_videodsp_init(&s->vdsp, 8);
+    ff_dwt_init(&s->dwt);
+    ff_h264qpel_init(&s->h264qpel, 8);
+// mcf(dx,dy): copy the h264 qpel put functions of tables 0 and 1 into both the rnd and no_rnd qpel tables
+#define mcf(dx,dy)\
+    s->qdsp.put_qpel_pixels_tab       [0][dy+dx/4]=\
+    s->qdsp.put_no_rnd_qpel_pixels_tab[0][dy+dx/4]=\
+        s->h264qpel.put_h264_qpel_pixels_tab[0][dy+dx/4];\
+    s->qdsp.put_qpel_pixels_tab       [1][dy+dx/4]=\
+    s->qdsp.put_no_rnd_qpel_pixels_tab[1][dy+dx/4]=\
+        s->h264qpel.put_h264_qpel_pixels_tab[1][dy+dx/4];
+
+    mcf( 0, 0)
+    mcf( 4, 0)
+    mcf( 8, 0)
+    mcf(12, 0)
+    mcf( 0, 4)
+    mcf( 4, 4)
+    mcf( 8, 4)
+    mcf(12, 4)
+    mcf( 0, 8)
+    mcf( 4, 8)
+    mcf( 8, 8)
+    mcf(12, 8)
+    mcf( 0,12)
+    mcf( 4,12)
+    mcf( 8,12)
+    mcf(12,12)
+// mcfh(dx,dy): point the hpel put tables (rnd and no_rnd) at the mc_block_hpel* wrappers, 16-wide in table 0 and 8-wide in table 1
+#define mcfh(dx,dy)\
+    s->hdsp.put_pixels_tab       [0][dy/4+dx/8]=\
+    s->hdsp.put_no_rnd_pixels_tab[0][dy/4+dx/8]=\
+        mc_block_hpel ## dx ## dy ## 16;\
+    s->hdsp.put_pixels_tab       [1][dy/4+dx/8]=\
+    s->hdsp.put_no_rnd_pixels_tab[1][dy/4+dx/8]=\
+        mc_block_hpel ## dx ## dy ## 8;
+
+    mcfh(0, 0)
+    mcfh(8, 0)
+    mcfh(0, 8)
+    mcfh(8, 8)
+
+    init_qexp();
+
+//    dec += FFMAX(s->chroma_h_shift, s->chroma_v_shift);
+
+    width= s->avctx->width;
+    height= s->avctx->height;
+// work buffers sized from the codec dimensions; each macro is given the fail label as its failure target
+    FF_ALLOCZ_ARRAY_OR_GOTO(avctx, s->spatial_idwt_buffer, width, height * sizeof(IDWTELEM), fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(avctx, s->spatial_dwt_buffer,  width, height * sizeof(DWTELEM),  fail); //FIXME this does not belong here
+    FF_ALLOCZ_ARRAY_OR_GOTO(avctx, s->temp_dwt_buffer,     width, sizeof(DWTELEM),  fail);
+    FF_ALLOCZ_ARRAY_OR_GOTO(avctx, s->temp_idwt_buffer,    width, sizeof(IDWTELEM), fail);
+    FF_ALLOC_ARRAY_OR_GOTO(avctx,  s->run_buffer,          ((width + 1) >> 1), ((height + 1) >> 1) * sizeof(*s->run_buffer), fail);
+
+    for(i=0; i<MAX_REF_FRAMES; i++) {
+        for(j=0; j<MAX_REF_FRAMES; j++)
+            ff_scale_mv_ref[i][j] = 256*(i+1)/(j+1); // ratio (i+1)/(j+1) scaled by 256; pred_mv() applies it with (+128)>>8
+        s->last_picture[i] = av_frame_alloc();
+        if (!s->last_picture[i])
+            goto fail;
+    }
+
+    s->mconly_picture = av_frame_alloc();
+    s->current_picture = av_frame_alloc();
+    if (!s->mconly_picture || !s->current_picture)
+        goto fail;
+
+    return 0;
+fail: // nothing is freed here; NOTE(review): presumably the caller cleans up through ff_snow_common_end() - confirm
+    return AVERROR(ENOMEM);
+}
+/** Allocate the scratch buffers on first use, then (re)compute per-plane and per-subband geometry and reallocate every band's x_coeff array. */
+int ff_snow_common_init_after_header(AVCodecContext *avctx) {
+    SnowContext *s = avctx->priv_data;
+    int plane_index, level, orientation;
+    int ret, emu_buf_size;
+
+    if(!s->scratchbuf) { // only while no scratch buffer exists yet
+        if ((ret = ff_get_buffer(s->avctx, s->mconly_picture,
+                                 AV_GET_BUFFER_FLAG_REF)) < 0)
+            return ret;
+        FF_ALLOCZ_ARRAY_OR_GOTO(avctx, s->scratchbuf, FFMAX(s->mconly_picture->linesize[0], 2*avctx->width+256), 7*MB_SIZE, fail);
+        emu_buf_size = FFMAX(s->mconly_picture->linesize[0], 2*avctx->width+256) * (2 * MB_SIZE + HTAPS_MAX - 1);
+        FF_ALLOC_OR_GOTO(avctx, s->emu_edge_buffer, emu_buf_size, fail);
+    }
+
+    if(s->mconly_picture->format != avctx->pix_fmt) { // the buffer obtained above no longer matches the codec pixel format
+        av_log(avctx, AV_LOG_ERROR, "pixel format changed\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    for(plane_index=0; plane_index < s->nb_planes; plane_index++){
+        int w= s->avctx->width;
+        int h= s->avctx->height;
+
+        if(plane_index){ // planes other than 0 use the chroma-shifted size
+            w = AV_CEIL_RSHIFT(w, s->chroma_h_shift);
+            h = AV_CEIL_RSHIFT(h, s->chroma_v_shift);
+        }
+        s->plane[plane_index].width = w;
+        s->plane[plane_index].height= h;
+
+        for(level=s->spatial_decomposition_count-1; level>=0; level--){ // w and h are halved (rounding up) after each level
+            for(orientation=level ? 1 : 0; orientation<4; orientation++){ // orientation 0 exists only at level 0
+                SubBand *b= &s->plane[plane_index].band[level][orientation];
+
+                b->buf= s->spatial_dwt_buffer;
+                b->level= level;
+                b->stride= s->plane[plane_index].width << (s->spatial_decomposition_count - level);
+                b->width = (w + !(orientation&1))>>1; // even orientations take the extra column when w is odd
+                b->height= (h + !(orientation>1))>>1; // orientations 0 and 1 take the extra row when h is odd
+
+                b->stride_line = 1 << (s->spatial_decomposition_count - level);
+                b->buf_x_offset = 0;
+                b->buf_y_offset = 0;
+
+                if(orientation&1){ // odd orientations start (w+1)>>1 elements to the right
+                    b->buf += (w+1)>>1;
+                    b->buf_x_offset = (w+1)>>1;
+                }
+                if(orientation>1){ // orientations 2 and 3 start half a stride further on
+                    b->buf += b->stride>>1;
+                    b->buf_y_offset = b->stride_line >> 1;
+                }
+                b->ibuf= s->spatial_idwt_buffer + (b->buf - s->spatial_dwt_buffer); // same element offset in the idwt buffer
+
+                if(level)
+                    b->parent= &s->plane[plane_index].band[level-1][orientation];
+                //FIXME avoid this realloc
+                av_freep(&b->x_coeff);
+                b->x_coeff=av_mallocz_array(((b->width+1) * b->height+1), sizeof(x_and_coeff)); // (width+1)*height+1 entries
+                if (!b->x_coeff)
+                    goto fail;
+            }
+            w= (w+1)>>1;
+            h= (h+1)>>1;
+        }
+    }
+
+    return 0;
+fail:
+    return AVERROR(ENOMEM);
+}
+
+#define USE_HALFPEL_PLANE 0 // while 0, ff_snow_frame_start() never calls halfpel_interpol()
+/* Allocate and fill three filtered copies of every plane of frame: [1] horizontal, [2] vertical, [3] vertical filter applied to [1]; [0] aliases the frame data. */
+static int halfpel_interpol(SnowContext *s, uint8_t *halfpel[4][4], AVFrame *frame){
+    int p,x,y;
+
+    for(p=0; p < s->nb_planes; p++){
+        int is_chroma= !!p;
+        int w= is_chroma ? AV_CEIL_RSHIFT(s->avctx->width,  s->chroma_h_shift) : s->avctx->width;
+        int h= is_chroma ? AV_CEIL_RSHIFT(s->avctx->height, s->chroma_v_shift) : s->avctx->height;
+        int ls= frame->linesize[p];
+        uint8_t *src= frame->data[p];
+
+        halfpel[1][p] = av_malloc_array(ls, (h + 2 * EDGE_WIDTH)); // h rows plus EDGE_WIDTH rows above and below
+        halfpel[2][p] = av_malloc_array(ls, (h + 2 * EDGE_WIDTH));
+        halfpel[3][p] = av_malloc_array(ls, (h + 2 * EDGE_WIDTH));
+        if (!halfpel[1][p] || !halfpel[2][p] || !halfpel[3][p]) { // only this plane's three buffers are released here
+            av_freep(&halfpel[1][p]);
+            av_freep(&halfpel[2][p]);
+            av_freep(&halfpel[3][p]);
+            return AVERROR(ENOMEM);
+        }
+        halfpel[1][p] += EDGE_WIDTH * (1 + ls); // skip EDGE_WIDTH rows and EDGE_WIDTH columns; undone before av_free() in ff_snow_release_buffer()
+        halfpel[2][p] += EDGE_WIDTH * (1 + ls);
+        halfpel[3][p] += EDGE_WIDTH * (1 + ls);
+
+        halfpel[0][p]= src;
+        for(y=0; y<h; y++){ // horizontal 6-tap filter (1,-5,20,20,-5,1), rounded and shifted by 5; the result is stored without clipping
+            for(x=0; x<w; x++){
+                int i= y*ls + x;
+
+                halfpel[1][p][i]= (20*(src[i] + src[i+1]) - 5*(src[i-1] + src[i+2]) + (src[i-2] + src[i+3]) + 16 )>>5;
+            }
+        }
+        for(y=0; y<h; y++){ // same filter applied vertically (steps of ls)
+            for(x=0; x<w; x++){
+                int i= y*ls + x;
+
+                halfpel[2][p][i]= (20*(src[i] + src[i+ls]) - 5*(src[i-ls] + src[i+2*ls]) + (src[i-2*ls] + src[i+3*ls]) + 16 )>>5;
+            }
+        }
+        src= halfpel[1][p]; // third plane: vertical filter over the horizontally filtered plane
+        for(y=0; y<h; y++){
+            for(x=0; x<w; x++){
+                int i= y*ls + x;
+
+                halfpel[3][p][i]= (20*(src[i] + src[i+ls]) - 5*(src[i-ls] + src[i+2*ls]) + (src[i-2*ls] + src[i+3*ls]) + 16 )>>5;
+            }
+        }
+
+//FIXME border!
+    }
+    return 0;
+}
+/** If the last reference slot (index max_ref_frames-1) holds data, unreference that picture and free its halfpel planes. */
+void ff_snow_release_buffer(AVCodecContext *avctx)
+{
+    SnowContext *s = avctx->priv_data;
+    int i;
+
+    if(s->last_picture[s->max_ref_frames-1]->data[0]){
+        av_frame_unref(s->last_picture[s->max_ref_frames-1]);
+        for(i=0; i<9; i++) // i/3 selects halfpel plane 1..3, i%3 selects image plane 0..2
+            if(s->halfpel_plane[s->max_ref_frames-1][1+i/3][i%3]) {
+                av_free(s->halfpel_plane[s->max_ref_frames-1][1+i/3][i%3] - EDGE_WIDTH*(1+s->current_picture->linesize[i%3])); // step back to the allocation start (see the offset added in halfpel_interpol())
+                s->halfpel_plane[s->max_ref_frames-1][1+i/3][i%3] = NULL;
+            }
+    }
+}
+/** Rotate the reference list for a new frame, recompute s->ref_frames and obtain a buffer for the new current picture; returns 0 or a negative error code. */
+int ff_snow_frame_start(SnowContext *s){
+   AVFrame *tmp;
+   int i, ret;
+
+    ff_snow_release_buffer(s->avctx);
+
+    tmp= s->last_picture[s->max_ref_frames-1]; // the frame in the last slot is recycled as the new current picture below
+    for(i=s->max_ref_frames-1; i>0; i--)
+        s->last_picture[i] = s->last_picture[i-1];
+    memmove(s->halfpel_plane+1, s->halfpel_plane, (s->max_ref_frames-1)*sizeof(void*)*4*4); // shift the halfpel entries by one slot as well
+    if(USE_HALFPEL_PLANE && s->current_picture->data[0]) {
+        if((ret = halfpel_interpol(s, s->halfpel_plane[0], s->current_picture)) < 0)
+            return ret;
+    }
+    s->last_picture[0] = s->current_picture;
+    s->current_picture = tmp;
+
+    if(s->keyframe){
+        s->ref_frames= 0;
+    }else{
+        int i;
+        for(i=0; i<s->max_ref_frames && s->last_picture[i]->data[0]; i++) // count pictures that hold data ...
+            if(i && s->last_picture[i-1]->key_frame) // ... stopping once the previous entry was a keyframe
+                break;
+        s->ref_frames= i;
+        if(s->ref_frames==0){
+            av_log(s->avctx,AV_LOG_ERROR, "No reference frames\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+    if ((ret = ff_snow_get_buffer(s, s->current_picture)) < 0)
+        return ret;
+
+    s->current_picture->key_frame= s->keyframe;
+
+    return 0;
+}
+/** Free the buffers and frames owned by the context: DWT buffers, motion-estimation scratch, block array, scratch buffers, reference data and per-band x_coeff arrays. */
+av_cold void ff_snow_common_end(SnowContext *s)
+{
+    int plane_index, level, orientation, i;
+
+    av_freep(&s->spatial_dwt_buffer);
+    av_freep(&s->temp_dwt_buffer);
+    av_freep(&s->spatial_idwt_buffer);
+    av_freep(&s->temp_idwt_buffer);
+    av_freep(&s->run_buffer);
+
+    s->m.me.temp= NULL; // cleared without freeing
+    av_freep(&s->m.me.scratchpad);
+    av_freep(&s->m.me.map);
+    av_freep(&s->m.me.score_map);
+    av_freep(&s->m.sc.obmc_scratchpad);
+
+    av_freep(&s->block);
+    av_freep(&s->scratchbuf);
+    av_freep(&s->emu_edge_buffer);
+
+    for(i=0; i<MAX_REF_FRAMES; i++){
+        av_freep(&s->ref_mvs[i]);
+        av_freep(&s->ref_scores[i]);
+        if(s->last_picture[i] && s->last_picture[i]->data[0]) { // a reference must not share its data pointer with the current picture
+            av_assert0(s->last_picture[i]->data[0] != s->current_picture->data[0]);
+        }
+        av_frame_free(&s->last_picture[i]);
+    }
+
+    for(plane_index=0; plane_index < MAX_PLANES; plane_index++){ // visit every possible band, not only those of the active configuration
+        for(level=MAX_DECOMPOSITIONS-1; level>=0; level--){
+            for(orientation=level ? 1 : 0; orientation<4; orientation++){
+                SubBand *b= &s->plane[plane_index].band[level][orientation];
+
+                av_freep(&b->x_coeff);
+            }
+        }
+    }
+    av_frame_free(&s->mconly_picture);
+    av_frame_free(&s->current_picture);
+}
diff --git a/libavcodec/snow.h b/libavcodec/snow.h
new file mode 100644
index 0000000..59c710b
--- /dev/null
+++ b/libavcodec/snow.h
@@ -0,0 +1,708 @@
+/*
+ * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2006 Robert Edele <yartrebo@earthlink.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_SNOW_H
+#define AVCODEC_SNOW_H
+
+#include "libavutil/motion_vector.h"
+
+#include "hpeldsp.h"
+#include "me_cmp.h"
+#include "qpeldsp.h"
+#include "snow_dwt.h"
+
+#include "rangecoder.h"
+#include "mathops.h"
+
+#define FF_MPV_OFFSET(x) (offsetof(MpegEncContext, x) + offsetof(SnowContext, m))
+#include "mpegvideo.h"
+#include "h264qpel.h"
+
+#define MID_STATE 128
+
+#define MAX_PLANES 4 // size of SnowContext.plane[]
+#define QSHIFT 5
+#define QROOT (1<<QSHIFT) // 32; size of ff_qexp[]
+#define LOSSLESS_QLOG -128
+#define FRAC_BITS 4 // fractional bits dropped by the rounding shifts in add_yblock() and predict_slice()
+#define MAX_REF_FRAMES 8 // size of last_picture[], ref_mvs[], ref_scores[] and ff_scale_mv_ref
+
+#define LOG2_OBMC_MAX 8
+#define OBMC_MAX (1<<(LOG2_OBMC_MAX)) // 256
+typedef struct BlockNode{
+    int16_t mx;                 ///< Motion vector component X, see mv_scale
+    int16_t my;                 ///< Motion vector component Y, see mv_scale
+    uint8_t ref;                ///< Reference frame index
+    uint8_t color[3];           ///< Color for intra
+    uint8_t type;               ///< Bitfield of BLOCK_*
+//#define TYPE_SPLIT    1
+#define BLOCK_INTRA   1         ///< Intra block, inter otherwise
+#define BLOCK_OPT     2         ///< Block needs no checks in this round of iterative motion estimation
+//#define TYPE_NOCOLOR  4
+    uint8_t level; //FIXME merge into type?
+}BlockNode;
+/** Default inter block: zero motion vector, reference 0, level 0 and colour 128 in all three components. */
+static const BlockNode null_block= { //FIXME add border maybe
+    .color= {128,128,128},
+    .mx= 0,
+    .my= 0,
+    .ref= 0,
+    .type= 0,
+    .level= 0,
+};
+
+#define LOG2_MB_SIZE 4
+#define MB_SIZE (1<<LOG2_MB_SIZE) // 16
+#define ENCODER_EXTRA_BITS 4
+#define HTAPS_MAX 8 // hcoeff[] holds HTAPS_MAX/2 entries; HTAPS_MAX/2-1 is the source margin used by ff_snow_pred_block()
+/* Element type of SubBand.x_coeff. NOTE(review): presumably an x position paired with a coefficient value - confirm against the users. */
+typedef struct x_and_coeff{
+    int16_t x;
+    uint16_t coeff;
+} x_and_coeff;
+/** One subband of one decomposition level of a plane; its geometry fields are filled in by ff_snow_common_init_after_header(). */
+typedef struct SubBand{
+    int level;
+    int stride;      ///< plane width << (spatial_decomposition_count - level)
+    int width;
+    int height;
+    int qlog;        ///< log(qscale)/log[2^(1/6)]
+    DWTELEM *buf;    ///< start of this band inside spatial_dwt_buffer
+    IDWTELEM *ibuf;  ///< same element offset inside spatial_idwt_buffer
+    int buf_x_offset;
+    int buf_y_offset;
+    int stride_line; ///< Stride measured in lines, not pixels.
+    x_and_coeff * x_coeff; ///< (width+1)*height+1 entries, reallocated in ff_snow_common_init_after_header()
+    struct SubBand *parent; ///< same orientation at level-1; only assigned when level is non-zero
+    uint8_t state[/*7*2*/ 7 + 512][32];
+}SubBand;
+/** Per-plane state: dimensions, subbands and the interpolation filter parameters used for motion compensation. */
+typedef struct Plane{
+    int width;
+    int height;
+    SubBand band[MAX_DECOMPOSITIONS][4]; ///< indexed [level][orientation]
+
+    int htaps;
+    int8_t hcoeff[HTAPS_MAX/2]; ///< filter coefficients read by mc_block() when fast_mc is 0
+    int diag_mc;
+    int fast_mc; ///< non-zero: mc_block() uses its fixed 20/-5/1 filter and ff_snow_pred_block() may take the h264 qpel path
+
+    int last_htaps;
+    int8_t last_hcoeff[HTAPS_MAX/2];
+    int last_diag_mc;
+}Plane;
+/** Codec context holding DSP tables, frames, DWT buffers, header parameters and the block array. */
+typedef struct SnowContext{
+    AVClass *class;
+    AVCodecContext *avctx;
+    RangeCoder c;
+    MECmpContext mecc;
+    HpelDSPContext hdsp;
+    QpelDSPContext qdsp;
+    VideoDSPContext vdsp;
+    H264QpelContext h264qpel;
+    MpegvideoEncDSPContext mpvencdsp;
+    SnowDWTContext dwt;
+    AVFrame *input_picture;              ///< new_picture with the internal linesizes
+    AVFrame *current_picture;
+    AVFrame *last_picture[MAX_REF_FRAMES]; ///< reference frames; ff_snow_frame_start() puts the previous current picture at index 0
+    uint8_t *halfpel_plane[MAX_REF_FRAMES][4][4]; ///< filled by halfpel_interpol(), freed in ff_snow_release_buffer()
+    AVFrame *mconly_picture;             ///< gets its buffer in ff_snow_common_init_after_header()
+//     uint8_t q_context[16];
+    uint8_t header_state[32];
+    uint8_t block_state[128 + 32*128];
+    int keyframe;
+    int always_reset;
+    int version;
+    int spatial_decomposition_type;
+    int last_spatial_decomposition_type;
+    int temporal_decomposition_type;
+    int spatial_decomposition_count;
+    int last_spatial_decomposition_count;
+    int temporal_decomposition_count;
+    int max_ref_frames;
+    int ref_frames;                      ///< recomputed per frame in ff_snow_frame_start(); 0 for keyframes
+    int16_t (*ref_mvs[MAX_REF_FRAMES])[2];
+    uint32_t *ref_scores[MAX_REF_FRAMES];
+    DWTELEM *spatial_dwt_buffer;         ///< width*height elements, allocated in ff_snow_common_init()
+    DWTELEM *temp_dwt_buffer;            ///< width elements
+    IDWTELEM *spatial_idwt_buffer;       ///< width*height elements
+    IDWTELEM *temp_idwt_buffer;          ///< width elements
+    int *run_buffer;
+    int colorspace_type;
+    int chroma_h_shift;
+    int chroma_v_shift;
+    int spatial_scalability;
+    int qlog;
+    int last_qlog;
+    int lambda;
+    int lambda2;
+    int pass1_rc;
+    int mv_scale;
+    int last_mv_scale;
+    int qbias;
+    int last_qbias;
+#define QBIAS_SHIFT 3
+    int b_width;                         ///< shifted left by block_max_depth wherever the block grid is indexed
+    int b_height;
+    int block_max_depth;
+    int last_block_max_depth;
+    int nb_planes;
+    Plane plane[MAX_PLANES];
+    BlockNode *block;                    ///< indexed x + y*(b_width << block_max_depth)
+#define ME_CACHE_SIZE 1024
+    unsigned me_cache[ME_CACHE_SIZE];
+    unsigned me_cache_generation;
+    slice_buffer sb;
+    int memc_only;
+    int no_bitstream;
+    int intra_penalty;
+    int motion_est;
+    int iterative_dia_size;
+    int scenechange_threshold;
+
+    MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to eventually make the motion estimation independent of MpegEncContext, so this will be removed then (FIXME/XXX)
+
+    uint8_t *scratchbuf;                 ///< allocated once in ff_snow_common_init_after_header(); add_yblock() builds its predictions here
+    uint8_t *emu_edge_buffer;            ///< allocated together with scratchbuf
+
+    AVMotionVector *avmv;
+    int avmv_index;
+    uint64_t encoding_error[AV_NUM_DATA_POINTERS];
+
+    int pred;
+}SnowContext;
+
+/* Tables */
+extern const uint8_t * const ff_obmc_tab[4]; // predict_slice() indexes it by block_max_depth (plus chroma_h_shift for chroma)
+extern uint8_t ff_qexp[QROOT];
+extern int ff_scale_mv_ref[MAX_REF_FRAMES][MAX_REF_FRAMES]; // [i][j] = 256*(i+1)/(j+1), filled in ff_snow_common_init()
+
+/* C bits used by mmx/sse2/altivec */
+/* Set *i to width-2; for odd widths first copy low[(*i+1)>>1] to low[*i+1] and step *i down by one. The high argument is not used. */
+static av_always_inline void snow_interleave_line_header(int * i, int width, IDWTELEM * low, IDWTELEM * high){
+    (*i) = (width) - 2;
+
+    if (width & 1){
+        low[(*i)+1] = low[((*i)+1)>>1];
+        (*i)--;
+    }
+}
+/* Walking *i down in steps of 2 until it is negative, store high[*i>>1] at low[*i+1] and low[*i>>1] at low[*i], all inside the low array. */
+static av_always_inline void snow_interleave_line_footer(int * i, IDWTELEM * low, IDWTELEM * high){
+    for (; (*i)>=0; (*i)-=2){
+        low[(*i)+1] = high[(*i)>>1];
+        low[*i] = low[(*i)>>1];
+    }
+}
+/* Finish a lifting step from index i up to w: dst = src - ((mul*(ref[i]+ref[i+1]) + add) >> shift), plus an optional last element at w. */
+static av_always_inline void snow_horizontal_compose_lift_lead_out(int i, IDWTELEM * dst, IDWTELEM * src, IDWTELEM * ref, int width, int w, int lift_high, int mul, int add, int shift){
+    for(; i<w; i++){
+        dst[i] = src[i] - ((mul * (ref[i] + ref[i + 1]) + add) >> shift);
+    }
+
+    if((width^lift_high)&1){ // last element: ref[w] counted twice in place of ref[w] + ref[w+1]
+        dst[w] = src[w] - ((mul * 2 * ref[w] + add) >> shift);
+    }
+}
+/* Finish the liftS step from index i up to w: dst = src + ((ref[i] + ref[i+1] + W_BO + 4*src[i]) >> W_BS), plus a last element at w for odd widths. */
+static av_always_inline void snow_horizontal_compose_liftS_lead_out(int i, IDWTELEM * dst, IDWTELEM * src, IDWTELEM * ref, int width, int w){
+        for(; i<w; i++){
+            dst[i] = src[i] + ((ref[i] + ref[(i+1)]+W_BO + 4 * src[i]) >> W_BS);
+        }
+
+        if(width&1){ // odd width: ref[w] counted twice in place of ref[w] + ref[w+1]
+            dst[w] = src[w] + ((2 * ref[w] + W_BO + 4 * src[w]) >> W_BS);
+        }
+}
+
+/* common code */
+
+int ff_snow_common_init(AVCodecContext *avctx); // DSP tables, DWT buffers, frame holders; 0 or AVERROR(ENOMEM)
+int ff_snow_common_init_after_header(AVCodecContext *avctx); // scratch buffers and subband geometry; 0 or a negative error code
+void ff_snow_common_end(SnowContext *s); // frees the buffers and frames owned by the context
+void ff_snow_release_buffer(AVCodecContext *avctx); // unreferences the picture in the last reference slot
+void ff_snow_reset_contexts(SnowContext *s);
+int ff_snow_alloc_blocks(SnowContext *s);
+int ff_snow_frame_start(SnowContext *s); // rotates the references and gets the new current picture; 0 or a negative error code
+void ff_snow_pred_block(SnowContext *s, uint8_t *dst, uint8_t *tmp, ptrdiff_t stride,
+                     int sx, int sy, int b_w, int b_h, const BlockNode *block,
+                     int plane_index, int w, int h); // intra fill or motion-compensated prediction of one block into dst
+int ff_snow_get_buffer(SnowContext *s, AVFrame *frame);
+/* common inline functions */
+//XXX doublecheck all of them should stay inlined
+/** Predict a motion vector into *mx, *my as mid_pred() of the left, top and top-right neighbours, rescaling each to reference ref when several references are in use. */
+static inline void pred_mv(SnowContext *s, int *mx, int *my, int ref,
+                           const BlockNode *left, const BlockNode *top, const BlockNode *tr){
+    if(s->ref_frames == 1){ // single reference: the neighbours' vectors are used unscaled
+        *mx = mid_pred(left->mx, top->mx, tr->mx);
+        *my = mid_pred(left->my, top->my, tr->my);
+    }else{
+        const int *scale = ff_scale_mv_ref[ref]; // scale[r] = 256*(ref+1)/(r+1), see ff_snow_common_init()
+        *mx = mid_pred((left->mx * scale[left->ref] + 128) >>8,
+                       (top ->mx * scale[top ->ref] + 128) >>8,
+                       (tr  ->mx * scale[tr  ->ref] + 128) >>8);
+        *my = mid_pred((left->my * scale[left->ref] + 128) >>8,
+                       (top ->my * scale[top ->ref] + 128) >>8,
+                       (tr  ->my * scale[tr  ->ref] + 128) >>8);
+    }
+}
+/** Return 1 when a and b would produce the same prediction, 0 otherwise. */
+static av_always_inline int same_block(BlockNode *a, BlockNode *b){
+    const int a_intra = a->type & BLOCK_INTRA, b_intra = b->type & BLOCK_INTRA;
+    if(a_intra && b_intra) // both intra: only the three colours matter
+        return a->color[0] == b->color[0] && a->color[1] == b->color[1] && a->color[2] == b->color[2];
+    // otherwise the motion vector, the reference and the intra flag must all match
+    return a->mx == b->mx && a->my == b->my && a->ref == b->ref && a_intra == b_intra;
+}
+/** Blend the predictions of the four blocks around (b_x,b_y) with the obmc weights, then add the result to (add!=0) or subtract it from (add==0) dst. */
+//FIXME name cleanup (b_w, block_w, b_width stuff)
+//XXX should we really inline it?
+static av_always_inline void add_yblock(SnowContext *s, int sliced, slice_buffer *sb, IDWTELEM *dst, uint8_t *dst8, const uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int offset_dst, int plane_index){
+    const int b_width = s->b_width  << s->block_max_depth;
+    const int b_height= s->b_height << s->block_max_depth;
+    const int b_stride= b_width;
+    BlockNode *lt= &s->block[b_x + b_y*b_stride]; // left-top, right-top, left-bottom, right-bottom blocks
+    BlockNode *rt= lt+1;
+    BlockNode *lb= lt+b_stride;
+    BlockNode *rb= lb+1;
+    uint8_t *block[4];
+    // When src_stride is large enough, it is possible to interleave the blocks.
+    // Otherwise the blocks are written sequentially in the tmp buffer.
+    int tmp_step= src_stride >= 7*MB_SIZE ? MB_SIZE : MB_SIZE*src_stride;
+    uint8_t *tmp = s->scratchbuf;
+    uint8_t *ptmp;
+    int x,y;
+// outside the block grid, reuse the neighbouring column/row of blocks
+    if(b_x<0){
+        lt= rt;
+        lb= rb;
+    }else if(b_x + 1 >= b_width){
+        rt= lt;
+        rb= lb;
+    }
+    if(b_y<0){
+        lt= lb;
+        rt= rb;
+    }else if(b_y + 1 >= b_height){
+        lb= lt;
+        rb= rt;
+    }
+// clip the block rectangle to the w x h plane, moving the obmc (and dst) pointers along
+    if(src_x<0){ //FIXME merge with prev & always round internal width up to *16
+        obmc -= src_x;
+        b_w += src_x;
+        if(!sliced && !offset_dst)
+            dst -= src_x;
+        src_x=0;
+    }
+    if(src_x + b_w > w){
+        b_w = w - src_x;
+    }
+    if(src_y<0){
+        obmc -= src_y*obmc_stride;
+        b_h += src_y;
+        if(!sliced && !offset_dst)
+            dst -= src_y*dst_stride;
+        src_y=0;
+    }
+    if(src_y + b_h> h){
+        b_h = h - src_y;
+    }
+
+    if(b_w<=0 || b_h<=0) return; // nothing left after clipping
+
+    if(!sliced && offset_dst)
+        dst += src_x + src_y*dst_stride;
+    dst8+= src_x + src_y*src_stride;
+//    src += src_x + src_y*src_stride;
+// the predictions are written from tmp + 3*tmp_step onwards; tmp itself is passed to ff_snow_pred_block() as its work buffer
+    ptmp= tmp + 3*tmp_step;
+    block[0]= ptmp;
+    ptmp+=tmp_step;
+    ff_snow_pred_block(s, block[0], tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);
+// blocks for which same_block() is true share one prediction
+    if(same_block(lt, rt)){
+        block[1]= block[0];
+    }else{
+        block[1]= ptmp;
+        ptmp+=tmp_step;
+        ff_snow_pred_block(s, block[1], tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
+    }
+
+    if(same_block(lt, lb)){
+        block[2]= block[0];
+    }else if(same_block(rt, lb)){
+        block[2]= block[1];
+    }else{
+        block[2]= ptmp;
+        ptmp+=tmp_step;
+        ff_snow_pred_block(s, block[2], tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
+    }
+
+    if(same_block(lt, rb) ){
+        block[3]= block[0];
+    }else if(same_block(rt, rb)){
+        block[3]= block[1];
+    }else if(same_block(lb, rb)){
+        block[3]= block[2];
+    }else{
+        block[3]= ptmp;
+        ff_snow_pred_block(s, block[3], tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
+    }
+    if(sliced){ // slice-buffer variant: handed to the DWT context's implementation
+        s->dwt.inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+    }else{
+        for(y=0; y<b_h; y++){
+            //FIXME ugly misuse of obmc_stride
+            const uint8_t *obmc1= obmc + y*obmc_stride; // four weight pointers: obmc1, +half a row, +half the rows, +both
+            const uint8_t *obmc2= obmc1+ (obmc_stride>>1);
+            const uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
+            const uint8_t *obmc4= obmc3+ (obmc_stride>>1);
+            for(x=0; x<b_w; x++){
+                int v=   obmc1[x] * block[3][x + y*src_stride]
+                        +obmc2[x] * block[2][x + y*src_stride]
+                        +obmc3[x] * block[1][x + y*src_stride]
+                        +obmc4[x] * block[0][x + y*src_stride];
+
+                v <<= 8 - LOG2_OBMC_MAX; // shift by 0 with LOG2_OBMC_MAX == 8
+                if(FRAC_BITS != 8){
+                    v >>= 8 - FRAC_BITS; // leave FRAC_BITS fractional bits
+                }
+                if(add){ // add dst, round away the fractional bits and store the clipped pixel in dst8
+                    v += dst[x + y*dst_stride];
+                    v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
+                    if(v&(~255)) v= ~(v>>31); // clip: negative -> 0, above 255 -> all ones (255 once stored as uint8_t)
+                    dst8[x + y*src_stride] = v;
+                }else{ // subtract the prediction from dst
+                    dst[x + y*dst_stride] -= v;
+                }
+            }
+        }
+    }
+}
+/** Process block row mb_y of one plane: a constant 128 offset for keyframes (or when debug bit 512 is set), otherwise add_yblock() for every block column. */
+static av_always_inline void predict_slice(SnowContext *s, IDWTELEM *buf, int plane_index, int add, int mb_y){
+    Plane *p= &s->plane[plane_index];
+    const int mb_w= s->b_width  << s->block_max_depth;
+    const int mb_h= s->b_height << s->block_max_depth;
+    int x, y, mb_x;
+    int block_size = MB_SIZE >> s->block_max_depth;
+    int block_w    = plane_index ? block_size>>s->chroma_h_shift : block_size;
+    int block_h    = plane_index ? block_size>>s->chroma_v_shift : block_size;
+    const uint8_t *obmc  = plane_index ? ff_obmc_tab[s->block_max_depth+s->chroma_h_shift] : ff_obmc_tab[s->block_max_depth];
+    const int obmc_stride= plane_index ? (2*block_size)>>s->chroma_h_shift : 2*block_size;
+    int ref_stride= s->current_picture->linesize[plane_index];
+    uint8_t *dst8= s->current_picture->data[plane_index];
+    int w= p->width;
+    int h= p->height;
+    av_assert2(s->chroma_h_shift == s->chroma_v_shift); // obmc params assume squares
+    if(s->keyframe || (s->avctx->debug&512)){
+        if(mb_y==mb_h) // the extra row index mb_h has nothing to do on this path
+            return;
+
+        if(add){ // add 128 (in FRAC_BITS fixed point), round, clip and write the pixels to dst8
+            for(y=block_h*mb_y; y<FFMIN(h,block_h*(mb_y+1)); y++){
+                for(x=0; x<w; x++){
+                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
+                    v >>= FRAC_BITS;
+                    if(v&(~255)) v= ~(v>>31);
+                    dst8[x + y*ref_stride]= v;
+                }
+            }
+        }else{ // subtract 128 (in FRAC_BITS fixed point) from buf in place
+            for(y=block_h*mb_y; y<FFMIN(h,block_h*(mb_y+1)); y++){
+                for(x=0; x<w; x++){
+                    buf[x + y*w]-= 128<<FRAC_BITS;
+                }
+            }
+        }
+
+        return;
+    }
+// block columns 0..mb_w inclusive; each area starts half a block before the grid position and uses blocks from (mb_x-1, mb_y-1)
+    for(mb_x=0; mb_x<=mb_w; mb_x++){
+        add_yblock(s, 0, NULL, buf, dst8, obmc,
+                   block_w*mb_x - block_w/2,
+                   block_h*mb_y - block_h/2,
+                   block_w, block_h,
+                   w, h,
+                   w, ref_stride, obmc_stride,
+                   mb_x - 1, mb_y - 1,
+                   add, 1, plane_index);
+    }
+}
+
+static av_always_inline void predict_plane(SnowContext *s, IDWTELEM *buf, int plane_index, int add){ // run predict_slice() over every block row of one plane
+    const int mb_h= s->b_height << s->block_max_depth;
+    int mb_y;
+    for(mb_y=0; mb_y<=mb_h; mb_y++) // inclusive upper bound: mb_h+1 calls
+        predict_slice(s, buf, plane_index, add, mb_y);
+}
+
+static inline void set_blocks(SnowContext *s, int level, int x, int y, int l, int cb, int cr, int mx, int my, int ref, int type){ // store one BlockNode value into a square of s->block[] entries
+    const int w= s->b_width << s->block_max_depth; // row stride of s->block[] as used below
+    const int rem_depth= s->block_max_depth - level; // split levels remaining below this block
+    const int index= (x + y*w) << rem_depth; // first s->block[] entry written
+    const int block_w= 1<<rem_depth; // entries written per row
+    const int block_h= 1<<rem_depth; //FIXME "w!=h"
+    BlockNode block;
+    int i,j;
+
+    block.color[0]= l; // color[] order: l, cb, cr
+    block.color[1]= cb;
+    block.color[2]= cr;
+    block.mx= mx;
+    block.my= my;
+    block.ref= ref;
+    block.type= type;
+    block.level= level;
+
+    for(j=0; j<block_h; j++){ // replicate the node over block_h rows of block_w entries
+        for(i=0; i<block_w; i++){
+            s->block[index + i + j*w]= block;
+        }
+    }
+}
+
+static inline void init_ref(MotionEstContext *c, uint8_t *src[3], uint8_t *ref[3], uint8_t *ref2[3], int x, int y, int ref_index){ // fill c->src[0][] and c->ref[0][] for the three planes; ref2 is not used in this body
+    SnowContext *s = c->avctx->priv_data;
+    const int offset[3]= { // per-plane offset, added to the ref pointers only
+          y*c->  stride + x,
+        ((y*c->uvstride + x)>>s->chroma_h_shift), // the combined offset is shifted as a whole
+        ((y*c->uvstride + x)>>s->chroma_h_shift),
+    };
+    int i;
+    for(i=0; i<3; i++){
+        c->src[0][i]= src [i]; // src planes are stored without any offset
+        c->ref[0][i]= ref [i] + offset[i];
+    }
+    av_assert2(!ref_index); // only ref_index 0 is expected here
+}
+
+
+/* bitstream functions */
+
+extern const int8_t ff_quant3bA[256];
+
+#define QEXPSHIFT (7-FRAC_BITS+8) //FIXME try to change this to 0
+
+static inline void put_symbol(RangeCoder *c, uint8_t *state, int v, int is_signed){ // range-code integer v: zero flag, unary exponent, low bits of |v|, optional sign
+    int i;
+
+    if(v){
+        const int a= FFABS(v);
+        const int e= av_log2(a); // e unary ones and e low bits of a are coded below
+        const int el= FFMIN(e, 10); // exponents beyond this share contexts
+        put_rac(c, state+0, 0); // state[0] is the zero flag: 0 = value is non-zero
+
+        for(i=0; i<el; i++){
+            put_rac(c, state+1+i, 1);  //1..10
+        }
+        for(; i<e; i++){ // remaining unary ones all reuse state[10]
+            put_rac(c, state+1+9, 1);  //1..10
+        }
+        put_rac(c, state+1+FFMIN(i,9), 0); // terminating zero of the unary exponent
+
+        for(i=e-1; i>=el; i--){ // bits at positions >= el all reuse state[31]
+            put_rac(c, state+22+9, (a>>i)&1); //22..31
+        }
+        for(; i>=0; i--){ // remaining bits, most significant first, one context per bit position
+            put_rac(c, state+22+i, (a>>i)&1); //22..31
+        }
+
+        if(is_signed) // sign is only coded on request; context chosen by the capped exponent
+            put_rac(c, state+11 + el, v < 0); //11..21
+    }else{
+        put_rac(c, state+0, 1); // zero flag set, nothing else is coded
+    }
+}
+
+static inline int get_symbol(RangeCoder *c, uint8_t *state, int is_signed){ // decode an integer using the same state[] layout as put_symbol()
+    if(get_rac(c, state+0)) // zero flag set
+        return 0;
+    else{
+        int i, e, a;
+        e= 0; // counts the unary ones of the exponent
+        while(get_rac(c, state+1 + FFMIN(e,9))){ //1..10
+            e++;
+            if (e > 31) // exponent too large; NOTE(review): the error code is returned in-band like a decoded value
+                return AVERROR_INVALIDDATA;
+        }
+
+        a= 1; // implicit leading one
+        for(i=e-1; i>=0; i--){ // append e bits, most significant first
+            a += a + get_rac(c, state+22 + FFMIN(i,9)); //22..31
+        }
+
+        e= -(is_signed && get_rac(c, state+11 + FFMIN(e,10))); //11..21
+        return (a^e)-e; // e is now 0 or -1; -1 turns a into -a
+    }
+}
+
+static inline void put_symbol2(RangeCoder *c, uint8_t *state, int v, int log2){ // range-code v >= 0 as escape steps of growing size followed by log2 low bits
+    int i;
+    int r= log2>=0 ? 1<<log2 : 1; // current step size; stays 1 while log2 is negative
+
+    av_assert2(v>=0);
+    av_assert2(log2>=-4); // keeps the index 4+log2 below non-negative
+
+    while(v >= r){ // one "1" per step subtracted from v
+        put_rac(c, state+4+log2, 1);
+        v -= r;
+        log2++;
+        if(log2>0) r+=r; // step doubles only once log2 is positive
+    }
+    put_rac(c, state+4+log2, 0); // end of the escape prefix
+
+    for(i=log2-1; i>=0; i--){ // remainder: log2 bits, most significant first (none if log2 <= 0)
+        put_rac(c, state+31-i, (v>>i)&1);
+    }
+}
+
+static inline int get_symbol2(RangeCoder *c, uint8_t *state, int log2){ // decode a value using the same state[] layout as put_symbol2()
+    int i;
+    int r= log2>=0 ? 1<<log2 : 1; // current step size; stays 1 while log2 is negative
+    int v=0;
+
+    av_assert2(log2>=-4); // keeps the index 4+log2 below non-negative
+
+    while(log2<28 && get_rac(c, state+4+log2)){ // prefix reading stops at log2 == 28 even without a terminating 0
+        v+= r;
+        log2++;
+        if(log2>0) r+=r; // step doubles only once log2 is positive
+    }
+
+    for(i=log2-1; i>=0; i--){ // remainder: log2 bits, most significant first
+        v+= get_rac(c, state+31-i)<<i;
+    }
+
+    return v;
+}
+
+static inline void unpack_coeffs(SnowContext *s, SubBand *b, SubBand * parent, int orientation){ // decode subband b into the sparse list b->x_coeff; orientation is not used in this body
+    const int w= b->width;
+    const int h= b->height;
+    int x,y;
+
+    int run, runs;
+    x_and_coeff *xc= b->x_coeff; // write cursor into the output list
+    x_and_coeff *prev_xc= NULL; // read cursor into the previous row's entries (set after row 0)
+    x_and_coeff *prev2_xc= xc; // start of the row currently being written
+    x_and_coeff *parent_xc= parent ? parent->x_coeff : NULL; // read cursor into the parent band's list, if any
+    x_and_coeff *prev_parent_xc= parent_xc; // start of the parent row in use
+
+    runs= get_symbol2(&s->c, b->state[30], 0); // number of run lengths still to be read
+    if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
+    else           run= INT_MAX; // no runs coded: the run counter never reaches 0 in practice
+
+    for(y=0; y<h; y++){
+        int v=0; // value decoded at the current position; becomes the left neighbour l
+        int lt=0, t=0, rt=0; // top-left, top and top-right neighbours from the previous row
+
+        if(y && prev_xc->x == 0){
+            rt= prev_xc->coeff;
+        }
+        for(x=0; x<w; x++){
+            int p=0; // parent coefficient at column x>>1, 0 if there is none
+            const int l= v;
+
+            lt= t; t= rt; // slide the neighbour window one column right
+
+            if(y){
+                if(prev_xc->x <= x)
+                    prev_xc++;
+                if(prev_xc->x == x + 1)
+                    rt= prev_xc->coeff;
+                else
+                    rt=0;
+            }
+            if(parent_xc){
+                if(x>>1 > parent_xc->x){
+                    parent_xc++;
+                }
+                if(x>>1 == parent_xc->x){
+                    p= parent_xc->coeff;
+                }
+            }
+            if(/*ll|*/l|lt|t|rt|p){ // some neighbour is non-zero: code a significance bit with a context
+                int context= av_log2(/*FFABS(ll) + */3*(l>>1) + (lt>>1) + (t&~1) + (rt>>1) + (p>>1));
+
+                v=get_rac(&s->c, &b->state[0][context]);
+                if(v){
+                    v= 2*(get_symbol2(&s->c, b->state[context + 2], context-4) + 1); // decoded value + 1 goes into the bits above bit 0
+                    v+=get_rac(&s->c, &b->state[0][16 + 1 + 3 + ff_quant3bA[l&0xFF] + 3*ff_quant3bA[t&0xFF]]); // bit 0; presumably the sign flag -- confirm against the consumer of x_coeff
+                    if ((uint16_t)v != v) { // value does not fit in 16 unsigned bits: report and substitute 1
+                        av_log(s->avctx, AV_LOG_ERROR, "Coefficient damaged\n");
+                        v = 1;
+                    }
+                    xc->x=x;
+                    (xc++)->coeff= v;
+                }
+            }else{ // all neighbours zero: positions are covered by run lengths
+                if(!run){ // run exhausted: a non-zero coefficient sits here
+                    if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
+                    else           run= INT_MAX;
+                    v= 2*(get_symbol2(&s->c, b->state[0 + 2], 0-4) + 1); // same layout as above with context 0
+                    v+=get_rac(&s->c, &b->state[0][16 + 1 + 3]);
+                    if ((uint16_t)v != v) {
+                        av_log(s->avctx, AV_LOG_ERROR, "Coefficient damaged\n");
+                        v = 1;
+                    }
+
+                    xc->x=x;
+                    (xc++)->coeff= v;
+                }else{ // inside a run: this position is zero, skip further zeros in one step where possible
+                    int max_run;
+                    run--;
+                    v=0;
+                    av_assert2(run >= 0);
+                    if(y) max_run= FFMIN(run, prev_xc->x - x - 2); // stop short of the previous row's next entry
+                    else  max_run= FFMIN(run, w-x-1); // first row: limited by the row end only
+                    if(parent_xc)
+                        max_run= FFMIN(max_run, 2*parent_xc->x - x - 1); // and short of the next parent entry
+                    av_assert2(max_run >= 0 && max_run <= run);
+
+                    x+= max_run;
+                    run-= max_run;
+                }
+            }
+        }
+        (xc++)->x= w+1; //end marker
+        prev_xc= prev2_xc; // the row just written becomes the previous row
+        prev2_xc= xc;
+
+        if(parent_xc){
+            if(y&1){ // after an odd row: advance past the parent row's end marker
+                while(parent_xc->x != parent->width+1)
+                    parent_xc++;
+                parent_xc++;
+                prev_parent_xc= parent_xc;
+            }else{ // after an even row: rewind so the next row reuses the same parent row
+                parent_xc= prev_parent_xc;
+            }
+        }
+    }
+
+    (xc++)->x= w+1; //end marker
+}
+
+#endif /* AVCODEC_SNOW_H */
diff --git a/libavcodec/snow_dwt.c b/libavcodec/snow_dwt.c
new file mode 100644
index 0000000..25681e7
--- /dev/null
+++ b/libavcodec/snow_dwt.c
@@ -0,0 +1,860 @@
+/*
+ * Copyright (C) 2004-2010 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2008 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
+#include "libavutil/common.h"
+#include "me_cmp.h"
+#include "snow_dwt.h"
+
+int ff_slice_buffer_init(slice_buffer *buf, int line_count,
+                         int max_allocated_lines, int line_width,
+                         IDWTELEM *base_buffer)
+{ /* set up buf: a line table plus a stack of preallocated line buffers; returns 0 or AVERROR(ENOMEM) */
+    int i;
+
+    buf->base_buffer = base_buffer; /* stored only, not accessed in this function */
+    buf->line_count  = line_count;
+    buf->line_width  = line_width;
+    buf->data_count  = max_allocated_lines;
+    buf->line        = av_mallocz_array(line_count, sizeof(IDWTELEM *)); /* zeroed: no line has a buffer yet */
+    if (!buf->line)
+        return AVERROR(ENOMEM);
+    buf->data_stack  = av_malloc_array(max_allocated_lines, sizeof(IDWTELEM *));
+    if (!buf->data_stack) {
+        av_freep(&buf->line); /* undo the first allocation */
+        return AVERROR(ENOMEM);
+    }
+
+    for (i = 0; i < max_allocated_lines; i++) {
+        buf->data_stack[i] = av_malloc_array(line_width, sizeof(IDWTELEM)); /* one line of line_width elements */
+        if (!buf->data_stack[i]) {
+            for (i--; i >=0; i--) /* free the lines allocated so far, then both tables */
+                av_freep(&buf->data_stack[i]);
+            av_freep(&buf->data_stack);
+            av_freep(&buf->line);
+            return AVERROR(ENOMEM);
+        }
+    }
+
+    buf->data_stack_top = max_allocated_lines - 1; /* stack starts full */
+    return 0;
+}
+
+IDWTELEM *ff_slice_buffer_load_line(slice_buffer *buf, int line)
+{ /* return the buffer bound to `line`, taking one from the free stack if none is bound yet */
+    IDWTELEM *buffer;
+
+    av_assert0(buf->data_stack_top >= 0); /* checked before the lookup, so it also fires when the line is already bound */
+//  av_assert1(!buf->line[line]);
+    if (buf->line[line])
+        return buf->line[line];
+
+    buffer = buf->data_stack[buf->data_stack_top]; /* pop; the buffer contents are not cleared here */
+    buf->data_stack_top--;
+    buf->line[line] = buffer;
+
+    return buffer;
+}
+
+void ff_slice_buffer_release(slice_buffer *buf, int line)
+{ /* unbind the buffer of `line` and push it back onto the free stack */
+    IDWTELEM *buffer;
+
+    av_assert1(line >= 0 && line < buf->line_count);
+    av_assert1(buf->line[line]); /* the line must currently have a buffer */
+
+    buffer = buf->line[line];
+    buf->data_stack_top++;
+    buf->data_stack[buf->data_stack_top] = buffer;
+    buf->line[line]                      = NULL;
+}
+
+void ff_slice_buffer_flush(slice_buffer *buf)
+{ /* release every line that currently has a buffer */
+    int i;
+
+    if (!buf->line) /* no line table: nothing to do */
+        return;
+
+    for (i = 0; i < buf->line_count; i++)
+        if (buf->line[i])
+            ff_slice_buffer_release(buf, i);
+}
+
+void ff_slice_buffer_destroy(slice_buffer *buf)
+{ /* flush, then free all line buffers and both tables */
+    int i;
+    ff_slice_buffer_flush(buf); /* returns bound buffers to data_stack so the loop below can free them */
+
+    if (buf->data_stack) /* may be unset, e.g. after a failed init */
+        for (i = buf->data_count - 1; i >= 0; i--)
+            av_freep(&buf->data_stack[i]);
+    av_freep(&buf->data_stack);
+    av_freep(&buf->line);
+}
+
+static av_always_inline void lift(DWTELEM *dst, DWTELEM *src, DWTELEM *ref,
+                                  int dst_step, int src_step, int ref_step,
+                                  int width, int mul, int add, int shift,
+                                  int highpass, int inverse)
+{ /* one lifting step: dst = src +/- ((mul * (two adjacent ref samples) + add) >> shift); inverse selects '-' */
+    const int mirror_left  = !highpass; /* first output has a single ref neighbour, which is counted twice */
+    const int mirror_right = (width & 1) ^ highpass; /* same for the last output */
+    const int w            = (width >> 1) - 1 + (highpass & width); /* outputs that have two ref neighbours */
+    int i;
+
+#define LIFT(src, ref, inv) ((src) + ((inv) ? -(ref) : +(ref)))
+    if (mirror_left) {
+        dst[0] = LIFT(src[0], ((mul * 2 * ref[0] + add) >> shift), inverse);
+        dst   += dst_step; /* dst and src advance, ref does not: output i then pairs ref[i] with ref[i + 1] */
+        src   += src_step;
+    }
+
+    for (i = 0; i < w; i++)
+        dst[i * dst_step] = LIFT(src[i * src_step],
+                                 ((mul * (ref[i * ref_step] +
+                                          ref[(i + 1) * ref_step]) +
+                                   add) >> shift),
+                                 inverse);
+
+    if (mirror_right)
+        dst[w * dst_step] = LIFT(src[w * src_step],
+                                 ((mul * 2 * ref[w * ref_step] + add) >> shift),
+                                 inverse);
+}
+
+static av_always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref,
+                                   int dst_step, int src_step, int ref_step,
+                                   int width, int mul, int add, int shift,
+                                   int highpass, int inverse)
+{ /* lifting step with the same boundary handling as lift(), but the update formula of LIFTS below */
+    const int mirror_left  = !highpass; /* first output has a single ref neighbour, which is counted twice */
+    const int mirror_right = (width & 1) ^ highpass; /* same for the last output */
+    const int w            = (width >> 1) - 1 + (highpass & width); /* outputs that have two ref neighbours */
+    int i;
+
+    av_assert1(shift == 4); /* the non-inverse formula hard-codes its divisors instead of using shift */
+#define LIFTS(src, ref, inv)                                            \
+    ((inv) ? (src) + (((ref) + 4 * (src)) >> shift)                     \
+           : -((-16 * (src) + (ref) + add /                             \
+                4 + 1 + (5 << 25)) / (5 * 4) - (1 << 23)))
+    if (mirror_left) { /* LIFTS also reads add and shift from this scope; the ref argument already includes + add */
+        dst[0] = LIFTS(src[0], mul * 2 * ref[0] + add, inverse);
+        dst   += dst_step; /* dst and src advance, ref does not */
+        src   += src_step;
+    }
+
+    for (i = 0; i < w; i++)
+        dst[i * dst_step] = LIFTS(src[i * src_step],
+                                  mul * (ref[i * ref_step] +
+                                         ref[(i + 1) * ref_step]) + add,
+                                  inverse);
+
+    if (mirror_right)
+        dst[w * dst_step] = LIFTS(src[w * src_step],
+                                  mul * 2 * ref[w * ref_step] + add,
+                                  inverse);
+}
+
+static void horizontal_decompose53i(DWTELEM *b, DWTELEM *temp, int width)
+{ /* forward "53" transform of one row, in place in b; temp is scratch of at least width elements */
+    const int width2 = width >> 1;
+    int x;
+    const int w2 = (width + 1) >> 1; /* number of even samples; odd samples are stored from this index on */
+
+    for (x = 0; x < width2; x++) { /* de-interleave: even samples to temp[0..), odd samples to temp[w2..) */
+        temp[x]      = b[2 * x];
+        temp[x + w2] = b[2 * x + 1];
+    }
+    if (width & 1) /* odd width: one trailing even sample */
+        temp[x] = b[2 * x];
+    lift(b + w2, temp + w2, temp,   1, 1, 1, width, -1, 0, 1, 1, 0); /* high half: odd - ((even neighbours) >> 1) */
+    lift(b,      temp,      b + w2, 1, 1, 1, width,  1, 2, 2, 0, 0); /* low half: even + ((high neighbours + 2) >> 2) */
+}
+
+static void vertical_decompose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2,
+                                    int width)
+{ /* per column: subtract the halved sum of rows b0 and b2 from row b1 */
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] -= (b0[i] + b2[i]) >> 1;
+}
+
+static void vertical_decompose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2,
+                                    int width)
+{ /* per column: add the rounded quarter of the sum of rows b0 and b2 to row b1 */
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] += (b0[i] + b2[i] + 2) >> 2;
+}
+
+static void spatial_decompose53i(DWTELEM *buffer, DWTELEM *temp,
+                                 int width, int height, int stride)
+{ /* one level of the forward "53" transform over a width x height area, two rows per iteration */
+    int y;
+    DWTELEM *b0 = buffer + avpriv_mirror(-2 - 1, height - 1) * stride; /* row indices outside 0..height-1 go through avpriv_mirror() */
+    DWTELEM *b1 = buffer + avpriv_mirror(-2,     height - 1) * stride;
+
+    for (y = -2; y < height; y += 2) { /* starts above the picture so the row window fills up */
+        DWTELEM *b2 = buffer + avpriv_mirror(y + 1, height - 1) * stride;
+        DWTELEM *b3 = buffer + avpriv_mirror(y + 2, height - 1) * stride;
+
+        if (y + 1 < (unsigned)height) /* unsigned compare: false for negative rows as well as rows >= height */
+            horizontal_decompose53i(b2, temp, width);
+        if (y + 2 < (unsigned)height)
+            horizontal_decompose53i(b3, temp, width);
+
+        if (y + 1 < (unsigned)height) /* vertical steps run only after the rows they read were filtered horizontally */
+            vertical_decompose53iH0(b1, b2, b3, width);
+        if (y + 0 < (unsigned)height)
+            vertical_decompose53iL0(b0, b1, b2, width);
+
+        b0 = b2; /* slide the row window down by two */
+        b1 = b3;
+    }
+}
+
+static void horizontal_decompose97i(DWTELEM *b, DWTELEM *temp, int width)
+{ /* forward "97" transform of one row, in place in b; temp is scratch of at least width elements */
+    const int w2 = (width + 1) >> 1; /* start of the high half in b and temp */
+
+    lift(temp + w2, b + 1, b,         1, 2, 2, width, W_AM, W_AO, W_AS, 1, 1); /* reads interleaved b (step 2), writes the high half of temp */
+    liftS(temp,     b,     temp + w2, 1, 2, 1, width, W_BM, W_BO, W_BS, 0, 0); /* reads even samples of b, writes the low half of temp */
+    lift(b + w2, temp + w2, temp,     1, 1, 1, width, W_CM, W_CO, W_CS, 1, 0); /* high half back into b */
+    lift(b,      temp,      b + w2,   1, 1, 1, width, W_DM, W_DO, W_DS, 0, 0); /* low half back into b */
+}
+
+static void vertical_decompose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2,
+                                    int width)
+{ /* per column: b1 -= (W_AM * (b0 + b2) + W_AO) >> W_AS */
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] -= (W_AM * (b0[i] + b2[i]) + W_AO) >> W_AS;
+}
+
+static void vertical_decompose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2,
+                                    int width)
+{ /* per column: b1 += (W_CM * (b0 + b2) + W_CO) >> W_CS */
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] += (W_CM * (b0[i] + b2[i]) + W_CO) >> W_CS;
+}
+
+static void vertical_decompose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2,
+                                    int width)
+{ /* per column: replace b1 by a division-based combination of b0, b1 and b2 (no W_BM or W_BS used here) */
+    int i;
+
+    for (i = 0; i < width; i++) /* (5 << 27) / (5 * 16) == (1 << 23): the bias added before the division is removed after it */
+        b1[i] = (16 * 4 * b1[i] - 4 * (b0[i] + b2[i]) + W_BO * 5 + (5 << 27)) /
+                (5 * 16) - (1 << 23);
+}
+
+static void vertical_decompose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2,
+                                    int width)
+{ /* per column: b1 += (W_DM * (b0 + b2) + W_DO) >> W_DS */
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] += (W_DM * (b0[i] + b2[i]) + W_DO) >> W_DS;
+}
+
+static void spatial_decompose97i(DWTELEM *buffer, DWTELEM *temp,
+                                 int width, int height, int stride)
+{ /* one level of the forward "97" transform over a width x height area, two rows per iteration */
+    int y;
+    DWTELEM *b0 = buffer + avpriv_mirror(-4 - 1, height - 1) * stride; /* row indices outside 0..height-1 go through avpriv_mirror() */
+    DWTELEM *b1 = buffer + avpriv_mirror(-4,     height - 1) * stride;
+    DWTELEM *b2 = buffer + avpriv_mirror(-4 + 1, height - 1) * stride;
+    DWTELEM *b3 = buffer + avpriv_mirror(-4 + 2, height - 1) * stride;
+
+    for (y = -4; y < height; y += 2) { /* starts above the picture so the six-row window fills up */
+        DWTELEM *b4 = buffer + avpriv_mirror(y + 3, height - 1) * stride;
+        DWTELEM *b5 = buffer + avpriv_mirror(y + 4, height - 1) * stride;
+
+        if (y + 3 < (unsigned)height) /* unsigned compare: false for negative rows as well as rows >= height */
+            horizontal_decompose97i(b4, temp, width);
+        if (y + 4 < (unsigned)height)
+            horizontal_decompose97i(b5, temp, width);
+
+        if (y + 3 < (unsigned)height) /* four vertical steps, each one row pair behind the previous */
+            vertical_decompose97iH0(b3, b4, b5, width);
+        if (y + 2 < (unsigned)height)
+            vertical_decompose97iL0(b2, b3, b4, width);
+        if (y + 1 < (unsigned)height)
+            vertical_decompose97iH1(b1, b2, b3, width);
+        if (y + 0 < (unsigned)height)
+            vertical_decompose97iL1(b0, b1, b2, width);
+
+        b0 = b2; /* slide the row window down by two */
+        b1 = b3;
+        b2 = b4;
+        b3 = b5;
+    }
+}
+
+void ff_spatial_dwt(DWTELEM *buffer, DWTELEM *temp, int width, int height,
+                    int stride, int type, int decomposition_count)
+{ /* forward transform: apply decomposition_count levels of the selected wavelet to buffer in place */
+    int level;
+
+    for (level = 0; level < decomposition_count; level++) { /* each level halves width/height and doubles the row stride */
+        switch (type) { /* any other type value leaves the buffer untouched */
+        case DWT_97:
+            spatial_decompose97i(buffer, temp,
+                                 width >> level, height >> level,
+                                 stride << level);
+            break;
+        case DWT_53:
+            spatial_decompose53i(buffer, temp,
+                                 width >> level, height >> level,
+                                 stride << level);
+            break;
+        }
+    }
+}
+
+static void horizontal_compose53i(IDWTELEM *b, IDWTELEM *temp, int width)
+{ /* inverse "53" transform of one row, in place in b; temp is scratch of at least width elements */
+    const int width2 = width >> 1;
+    const int w2     = (width + 1) >> 1; /* start of the high half in b */
+    int x;
+
+    for (x = 0; x < width2; x++) { /* interleave: low half to even positions, high half to odd positions */
+        temp[2 * x]     = b[x];
+        temp[2 * x + 1] = b[x + w2];
+    }
+    if (width & 1) /* odd width: one trailing low sample */
+        temp[2 * x] = b[x];
+
+    b[0] = temp[0] - ((temp[1] + 1) >> 1); /* left edge: single neighbour counted twice */
+    for (x = 2; x < width - 1; x += 2) { /* each even output is needed by the odd output to its left */
+        b[x]     = temp[x]     - ((temp[x - 1] + temp[x + 1] + 2) >> 2);
+        b[x - 1] = temp[x - 1] + ((b[x - 2]    + b[x]        + 1) >> 1);
+    }
+    if (width & 1) { /* right edge, odd width: last sample is even */
+        b[x]     = temp[x]     - ((temp[x - 1]     + 1) >> 1);
+        b[x - 1] = temp[x - 1] + ((b[x - 2] + b[x] + 1) >> 1);
+    } else /* right edge, even width: last sample is odd with a single even neighbour */
+        b[x - 1] = temp[x - 1] + b[x - 2];
+}
+
+static void vertical_compose53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                  int width)
+{ /* per column: add the halved sum of rows b0 and b2 to row b1 (opposite sign of vertical_decompose53iH0) */
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] += (b0[i] + b2[i]) >> 1;
+}
+
+static void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                  int width)
+{ /* per column: subtract the rounded quarter of the sum of rows b0 and b2 from row b1 */
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] -= (b0[i] + b2[i] + 2) >> 2;
+}
+
+static void spatial_compose53i_buffered_init(DWTCompose *cs, slice_buffer *sb,
+                                             int height, int stride_line)
+{ /* prime cs for the slice-buffered inverse "53" transform: first two rows of the window and start row -1 */
+    cs->b0 = slice_buffer_get_line(sb,
+                                   avpriv_mirror(-1 - 1, height - 1) * stride_line);
+    cs->b1 = slice_buffer_get_line(sb, avpriv_mirror(-1, height - 1) * stride_line);
+    cs->y  = -1;
+}
+
+static void spatial_compose53i_init(DWTCompose *cs, IDWTELEM *buffer,
+                                    int height, int stride)
+{ /* prime cs for the inverse "53" transform on a flat buffer: first two rows of the window and start row -1 */
+    cs->b0 = buffer + avpriv_mirror(-1 - 1, height - 1) * stride;
+    cs->b1 = buffer + avpriv_mirror(-1,     height - 1) * stride;
+    cs->y  = -1;
+}
+
+static void spatial_compose53i_dy_buffered(DWTCompose *cs, slice_buffer *sb,
+                                           IDWTELEM *temp,
+                                           int width, int height,
+                                           int stride_line)
+{ /* advance the slice-buffered inverse "53" transform by two rows and update cs */
+    int y = cs->y;
+
+    IDWTELEM *b0 = cs->b0;
+    IDWTELEM *b1 = cs->b1;
+    IDWTELEM *b2 = slice_buffer_get_line(sb,
+                                         avpriv_mirror(y + 1, height - 1) *
+                                         stride_line);
+    IDWTELEM *b3 = slice_buffer_get_line(sb,
+                                         avpriv_mirror(y + 2, height - 1) *
+                                         stride_line);
+
+    if (y + 1 < (unsigned)height && y < (unsigned)height) { /* both vertical steps apply: do them in one pass over the columns */
+        int x;
+
+        for (x = 0; x < width; x++) { /* same arithmetic and order as vertical_compose53iL0 then vertical_compose53iH0 */
+            b2[x] -= (b1[x] + b3[x] + 2) >> 2;
+            b1[x] += (b0[x] + b2[x])     >> 1;
+        }
+    } else { /* at the top or bottom edge only one of the steps may apply */
+        if (y + 1 < (unsigned)height)
+            vertical_compose53iL0(b1, b2, b3, width);
+        if (y + 0 < (unsigned)height)
+            vertical_compose53iH0(b0, b1, b2, width);
+    }
+
+    if (y - 1 < (unsigned)height) /* rows whose vertical pass is complete get their horizontal pass */
+        horizontal_compose53i(b0, temp, width);
+    if (y + 0 < (unsigned)height)
+        horizontal_compose53i(b1, temp, width);
+
+    cs->b0  = b2; /* slide the row window down by two */
+    cs->b1  = b3;
+    cs->y  += 2;
+}
+
+static void spatial_compose53i_dy(DWTCompose *cs, IDWTELEM *buffer,
+                                  IDWTELEM *temp, int width, int height,
+                                  int stride)
+{ /* advance the inverse "53" transform on a flat buffer by two rows and update cs */
+    int y        = cs->y;
+    IDWTELEM *b0 = cs->b0;
+    IDWTELEM *b1 = cs->b1;
+    IDWTELEM *b2 = buffer + avpriv_mirror(y + 1, height - 1) * stride;
+    IDWTELEM *b3 = buffer + avpriv_mirror(y + 2, height - 1) * stride;
+
+    if (y + 1 < (unsigned)height) /* unsigned compare: false for negative rows as well as rows >= height */
+        vertical_compose53iL0(b1, b2, b3, width);
+    if (y + 0 < (unsigned)height)
+        vertical_compose53iH0(b0, b1, b2, width);
+
+    if (y - 1 < (unsigned)height) /* rows whose vertical pass is complete get their horizontal pass */
+        horizontal_compose53i(b0, temp, width);
+    if (y + 0 < (unsigned)height)
+        horizontal_compose53i(b1, temp, width);
+
+    cs->b0  = b2; /* slide the row window down by two */
+    cs->b1  = b3;
+    cs->y  += 2;
+}
+
+void ff_snow_horizontal_compose97i(IDWTELEM *b, IDWTELEM *temp, int width)
+{ /* inverse "97" transform of one row, in place in b; temp is scratch of at least width elements */
+    const int w2 = (width + 1) >> 1; /* start of the high half in b */
+    int x;
+
+    temp[0] = b[0] - ((3 * b[w2] + 2) >> 2); /* pass 1: b (low half, high half) -> interleaved temp; left edge first */
+    for (x = 1; x < (width >> 1); x++) { /* each even temp sample is needed by the odd one to its left */
+        temp[2 * x]     = b[x] - ((3 * (b[x + w2 - 1] + b[x + w2]) + 4) >> 3);
+        temp[2 * x - 1] = b[x + w2 - 1] - temp[2 * x - 2] - temp[2 * x];
+    }
+    if (width & 1) { /* right edge, odd width: last sample is even */
+        temp[2 * x]     = b[x] - ((3 * b[x + w2 - 1] + 2) >> 2);
+        temp[2 * x - 1] = b[x + w2 - 1] - temp[2 * x - 2] - temp[2 * x];
+    } else /* right edge, even width: single even neighbour counted twice */
+        temp[2 * x - 1] = b[x + w2 - 1] - 2 * temp[2 * x - 2];
+
+    b[0] = temp[0] + ((2 * temp[0] + temp[1] + 4) >> 3); /* pass 2: temp -> final samples in b; left edge first */
+    for (x = 2; x < width - 1; x += 2) {
+        b[x]     = temp[x] + ((4 * temp[x] + temp[x - 1] + temp[x + 1] + 8) >> 4);
+        b[x - 1] = temp[x - 1] + ((3 * (b[x - 2] + b[x])) >> 1);
+    }
+    if (width & 1) { /* right edge, odd width */
+        b[x]     = temp[x] + ((2 * temp[x] + temp[x - 1] + 4) >> 3);
+        b[x - 1] = temp[x - 1] + ((3 * (b[x - 2] + b[x])) >> 1);
+    } else /* right edge, even width */
+        b[x - 1] = temp[x - 1] + 3 * b[x - 2];
+}
+
+static void vertical_compose97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                  int width)
+{ /* per column: b1 += (W_AM * (b0 + b2) + W_AO) >> W_AS (opposite sign of vertical_decompose97iH0) */
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] += (W_AM * (b0[i] + b2[i]) + W_AO) >> W_AS;
+}
+
+static void vertical_compose97iH1(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                  int width)
+{ /* per column: b1 -= (W_CM * (b0 + b2) + W_CO) >> W_CS (opposite sign of vertical_decompose97iH1) */
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] -= (W_CM * (b0[i] + b2[i]) + W_CO) >> W_CS;
+}
+
+static void vertical_compose97iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                  int width)
+{ /* per column: b1 += (W_BM * (b0 + b2) + 4 * b1 + W_BO) >> W_BS; the update also depends on b1 itself */
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] += (W_BM * (b0[i] + b2[i]) + 4 * b1[i] + W_BO) >> W_BS;
+}
+
+static void vertical_compose97iL1(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                  int width)
+{ /* per column: b1 -= (W_DM * (b0 + b2) + W_DO) >> W_DS (opposite sign of vertical_decompose97iL1) */
+    int i;
+
+    for (i = 0; i < width; i++)
+        b1[i] -= (W_DM * (b0[i] + b2[i]) + W_DO) >> W_DS;
+}
+
+void ff_snow_vertical_compose97i(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                 IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5,
+                                 int width)
+{ /* all four vertical inverse "97" steps over a six-row window in one pass; modifies rows b1..b4 */
+    int i;
+
+    for (i = 0; i < width; i++) { /* order matters: each line reads the row updated by the line above it */
+        b4[i] -= (W_DM * (b3[i] + b5[i]) + W_DO) >> W_DS; /* same formula as vertical_compose97iL1(b3, b4, b5) */
+        b3[i] -= (W_CM * (b2[i] + b4[i]) + W_CO) >> W_CS; /* same formula as vertical_compose97iH1(b2, b3, b4) */
+        b2[i] += (W_BM * (b1[i] + b3[i]) + 4 * b2[i] + W_BO) >> W_BS; /* same formula as vertical_compose97iL0(b1, b2, b3) */
+        b1[i] += (W_AM * (b0[i] + b2[i]) + W_AO) >> W_AS; /* same formula as vertical_compose97iH0(b0, b1, b2) */
+    }
+}
+
+static void spatial_compose97i_buffered_init(DWTCompose *cs, slice_buffer *sb,
+                                             int height, int stride_line)
+{ /* prime cs for the slice-buffered inverse "97" transform: first four rows of the window and start row -3 */
+    cs->b0 = slice_buffer_get_line(sb, avpriv_mirror(-3 - 1, height - 1) * stride_line);
+    cs->b1 = slice_buffer_get_line(sb, avpriv_mirror(-3,     height - 1) * stride_line);
+    cs->b2 = slice_buffer_get_line(sb, avpriv_mirror(-3 + 1, height - 1) * stride_line);
+    cs->b3 = slice_buffer_get_line(sb, avpriv_mirror(-3 + 2, height - 1) * stride_line);
+    cs->y  = -3;
+}
+
+static void spatial_compose97i_init(DWTCompose *cs, IDWTELEM *buffer, int height,
+                                    int stride)
+{ /* prime cs for the inverse "97" transform on a flat buffer: first four rows of the window and start row -3 */
+    cs->b0 = buffer + avpriv_mirror(-3 - 1, height - 1) * stride;
+    cs->b1 = buffer + avpriv_mirror(-3,     height - 1) * stride;
+    cs->b2 = buffer + avpriv_mirror(-3 + 1, height - 1) * stride;
+    cs->b3 = buffer + avpriv_mirror(-3 + 2, height - 1) * stride;
+    cs->y  = -3;
+}
+
+static void spatial_compose97i_dy_buffered(SnowDWTContext *dsp, DWTCompose *cs,
+                                           slice_buffer * sb, IDWTELEM *temp,
+                                           int width, int height,
+                                           int stride_line)
+{ /* advance the slice-buffered inverse "97" transform by two rows and update cs */
+    int y = cs->y;
+
+    IDWTELEM *b0 = cs->b0;
+    IDWTELEM *b1 = cs->b1;
+    IDWTELEM *b2 = cs->b2;
+    IDWTELEM *b3 = cs->b3;
+    IDWTELEM *b4 = slice_buffer_get_line(sb,
+                                         avpriv_mirror(y + 3, height - 1) *
+                                         stride_line);
+    IDWTELEM *b5 = slice_buffer_get_line(sb,
+                                         avpriv_mirror(y + 4, height - 1) *
+                                         stride_line);
+
+    if (y > 0 && y + 4 < height) { /* whole window inside the picture: one call through dsp (presumably equivalent to the four steps below -- confirm) */
+        dsp->vertical_compose97i(b0, b1, b2, b3, b4, b5, width);
+    } else { /* near the top or bottom edge: apply only the steps whose target row exists */
+        if (y + 3 < (unsigned)height)
+            vertical_compose97iL1(b3, b4, b5, width);
+        if (y + 2 < (unsigned)height)
+            vertical_compose97iH1(b2, b3, b4, width);
+        if (y + 1 < (unsigned)height)
+            vertical_compose97iL0(b1, b2, b3, width);
+        if (y + 0 < (unsigned)height)
+            vertical_compose97iH0(b0, b1, b2, width);
+    }
+
+    if (y - 1 < (unsigned)height) /* rows whose vertical pass is complete get their horizontal pass */
+        dsp->horizontal_compose97i(b0, temp, width);
+    if (y + 0 < (unsigned)height)
+        dsp->horizontal_compose97i(b1, temp, width);
+
+    cs->b0  = b2; /* slide the row window down by two */
+    cs->b1  = b3;
+    cs->b2  = b4;
+    cs->b3  = b5;
+    cs->y  += 2;
+}
+
+static void spatial_compose97i_dy(DWTCompose *cs, IDWTELEM *buffer,
+                                  IDWTELEM *temp, int width, int height,
+                                  int stride)
+{ /* advance the inverse "97" transform on a flat buffer by two rows and update cs */
+    int y        = cs->y;
+    IDWTELEM *b0 = cs->b0;
+    IDWTELEM *b1 = cs->b1;
+    IDWTELEM *b2 = cs->b2;
+    IDWTELEM *b3 = cs->b3;
+    IDWTELEM *b4 = buffer + avpriv_mirror(y + 3, height - 1) * stride;
+    IDWTELEM *b5 = buffer + avpriv_mirror(y + 4, height - 1) * stride;
+
+    if (y + 3 < (unsigned)height) /* unsigned compare: false for negative rows as well as rows >= height */
+        vertical_compose97iL1(b3, b4, b5, width);
+    if (y + 2 < (unsigned)height)
+        vertical_compose97iH1(b2, b3, b4, width);
+    if (y + 1 < (unsigned)height)
+        vertical_compose97iL0(b1, b2, b3, width);
+    if (y + 0 < (unsigned)height)
+        vertical_compose97iH0(b0, b1, b2, width);
+
+    if (y - 1 < (unsigned)height) /* rows whose vertical pass is complete get their horizontal pass */
+        ff_snow_horizontal_compose97i(b0, temp, width);
+    if (y + 0 < (unsigned)height)
+        ff_snow_horizontal_compose97i(b1, temp, width);
+
+    cs->b0  = b2; /* slide the row window down by two */
+    cs->b1  = b3;
+    cs->b2  = b4;
+    cs->b3  = b5;
+    cs->y  += 2;
+}
+
+void ff_spatial_idwt_buffered_init(DWTCompose *cs, slice_buffer *sb, int width,
+                                   int height, int stride_line, int type,
+                                   int decomposition_count)
+{ /* initialise cs[0..decomposition_count-1] for the slice-buffered inverse transform; width is not used here */
+    int level;
+    for (level = decomposition_count - 1; level >= 0; level--) { /* each level uses height >> level and stride_line << level */
+        switch (type) { /* any other type value leaves cs[level] untouched */
+        case DWT_97:
+            spatial_compose97i_buffered_init(cs + level, sb, height >> level,
+                                             stride_line << level);
+            break;
+        case DWT_53:
+            spatial_compose53i_buffered_init(cs + level, sb, height >> level,
+                                             stride_line << level);
+            break;
+        }
+    }
+}
+
+void ff_spatial_idwt_buffered_slice(SnowDWTContext *dsp, DWTCompose *cs,
+                                    slice_buffer *slice_buf, IDWTELEM *temp,
+                                    int width, int height, int stride_line,
+                                    int type, int decomposition_count, int y)
+{ /* run the slice-buffered inverse transform on every level as far as output row y requires */
+    const int support = type == 1 ? 3 : 5; /* extra rows to run ahead of y; presumably type 1 is DWT_53 -- confirm in snow_dwt.h */
+    int level;
+    if (type == 2) /* type 2: nothing is done */
+        return;
+
+    for (level = decomposition_count - 1; level >= 0; level--) /* highest level index first */
+        while (cs[level].y <= FFMIN((y >> level) + support, height >> level)) { /* each dy call advances cs[level].y by 2 */
+            switch (type) {
+            case DWT_97:
+                spatial_compose97i_dy_buffered(dsp, cs + level, slice_buf, temp,
+                                               width >> level,
+                                               height >> level,
+                                               stride_line << level);
+                break;
+            case DWT_53:
+                spatial_compose53i_dy_buffered(cs + level, slice_buf, temp,
+                                               width >> level,
+                                               height >> level,
+                                               stride_line << level);
+                break;
+            }
+        }
+}
+
+static void spatial_idwt_init(DWTCompose *cs, IDWTELEM *buffer, int width,
+                              int height, int stride, int type,
+                              int decomposition_count)
+{
+    /* Initialise cs[lvl] on the flat buffer for each level, highest first;
+     * height and stride are shifted per level, width is not used here. */
+    int lvl = decomposition_count;
+
+    while (lvl-- > 0) {
+        DWTCompose *state = cs + lvl;
+        if (type == DWT_97)
+            spatial_compose97i_init(state, buffer, height >> lvl,
+                                    stride << lvl);
+        else if (type == DWT_53)
+            spatial_compose53i_init(state, buffer, height >> lvl,
+                                    stride << lvl);
+    }
+}
+
+static void spatial_idwt_slice(DWTCompose *cs, IDWTELEM *buffer,
+                               IDWTELEM *temp, int width, int height,
+                               int stride, int type,
+                               int decomposition_count, int y)
+{
+    /* Runs the per-level compose step until each level has caught up with row y. */
+    const int support = type == DWT_53 ? 3 : 5; /* rows composed beyond y >> level */
+    int level;
+    /* Types other than DWT_97/DWT_53 have no compose step: without this guard
+     * cs[level].y would never move and the while loop below could spin forever.
+     * (type == 2 was already rejected here before.) */
+    if (type != DWT_97 && type != DWT_53)
+        return;
+
+    for (level = decomposition_count - 1; level >= 0; level--)
+        while (cs[level].y <= FFMIN((y >> level) + support, height >> level)) {
+            if (type == DWT_97)
+                spatial_compose97i_dy(cs + level, buffer, temp, width >> level,
+                                      height >> level, stride << level);
+            else
+                spatial_compose53i_dy(cs + level, buffer, temp, width >> level,
+                                      height >> level, stride << level);
+        }
+}
+
+void ff_spatial_idwt(IDWTELEM *buffer, IDWTELEM *temp, int width, int height,
+                     int stride, int type, int decomposition_count)
+{
+    DWTCompose state[MAX_DECOMPOSITIONS]; /* one entry per decomposition level */
+    int row = 0;
+    spatial_idwt_init(state, buffer, width, height, stride, type,
+                      decomposition_count);
+    for (; row < height; row += 4) /* drive the slice composer in steps of 4 rows */
+        spatial_idwt_slice(state, buffer, temp, width, height, stride, type,
+                           decomposition_count, row);
+}
+
+/* Weighted sum of |DWT coefficients| of (pix1 - pix2) over a w x h block, >> 9. */
+static inline int w_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t line_size,
+                      int w, int h, int type)
+{
+    int s, i, j, level, ori;
+    const int dec_count = w == 8 ? 3 : 4;
+    int tmp[32 * 32], tmp2[32];
+    static const int scale[2][2][4][4] = { /* [type][dec_count - 3][level][ori] */
+        {
+            { // 9/7 8x8 dec=3
+                { 268, 239, 239, 213 },
+                { 0,   224, 224, 152 },
+                { 0,   135, 135, 110 },
+            },
+            { // 9/7 16x16 or 32x32 dec=4
+                { 344, 310, 310, 280 },
+                { 0,   320, 320, 228 },
+                { 0,   175, 175, 136 },
+                { 0,   129, 129, 102 },
+            }
+        },
+        {
+            { // 5/3 8x8 dec=3
+                { 275, 245, 245, 218 },
+                { 0,   230, 230, 156 },
+                { 0,   138, 138, 113 },
+            },
+            { // 5/3 16x16 or 32x32 dec=4
+                { 352, 317, 317, 286 },
+                { 0,   328, 328, 233 },
+                { 0,   180, 180, 140 },
+                { 0,   132, 132, 105 },
+            }
+        }
+    };
+    /* Row-by-row difference scaled by 16; multiply, as << on a negative is UB. */
+    for (i = 0; i < h; i++) {
+        for (j = 0; j < w; j += 4) {
+            tmp[32 * i + j + 0] = (pix1[j + 0] - pix2[j + 0]) * (1 << 4);
+            tmp[32 * i + j + 1] = (pix1[j + 1] - pix2[j + 1]) * (1 << 4);
+            tmp[32 * i + j + 2] = (pix1[j + 2] - pix2[j + 2]) * (1 << 4);
+            tmp[32 * i + j + 3] = (pix1[j + 3] - pix2[j + 3]) * (1 << 4);
+        }
+        pix1 += line_size;
+        pix2 += line_size;
+    }
+    /* Transform tmp (row stride 32); tmp2 is passed as the temp buffer. */
+    ff_spatial_dwt(tmp, tmp2, w, h, 32, type, dec_count);
+
+    s = 0;
+    av_assert1(w == h);
+    for (level = 0; level < dec_count; level++)
+        for (ori = level ? 1 : 0; ori < 4; ori++) {
+            int size   = w >> (dec_count - level);
+            int sx     = (ori & 1) ? size : 0;
+            int stride = 32 << (dec_count - level);
+            int sy     = (ori & 2) ? stride >> 1 : 0;
+            /* Accumulate this band's coefficients times its scale[] weight. */
+            for (i = 0; i < size; i++)
+                for (j = 0; j < size; j++) {
+                    int weighted = tmp[sx + sy + i * stride + j] *
+                                   scale[type][dec_count - 3][level][ori];
+                    s += FFABS(weighted);
+                }
+        }
+    av_assert1(s >= 0);
+    return s >> 9;
+}
+
+static int w53_8_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t line_size, int h)
+{
+    return w_c(v, pix1, pix2, line_size, 8, h, DWT_53); /* w = 8, 5/3 weights */
+}
+
+static int w97_8_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t line_size, int h)
+{
+    return w_c(v, pix1, pix2, line_size, 8, h, DWT_97); /* w = 8, 9/7 weights */
+}
+
+static int w53_16_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t line_size, int h)
+{
+    return w_c(v, pix1, pix2, line_size, 16, h, DWT_53); /* w = 16, 5/3 weights */
+}
+
+static int w97_16_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t line_size, int h)
+{
+    return w_c(v, pix1, pix2, line_size, 16, h, DWT_97); /* w = 16, 9/7 weights */
+}
+
+int ff_w53_32_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t line_size, int h)
+{
+    return w_c(v, pix1, pix2, line_size, 32, h, DWT_53); /* w = 32, 5/3 weights */
+}
+
+int ff_w97_32_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t line_size, int h)
+{
+    return w_c(v, pix1, pix2, line_size, 32, h, DWT_97); /* w = 32, 9/7 weights */
+}
+
+av_cold void ff_dsputil_init_dwt(MECmpContext *c)
+{
+    c->w97[0] = w97_16_c; /* slot 0 holds the 16-wide comparators */
+    c->w53[0] = w53_16_c;
+    c->w97[1] = w97_8_c;  /* slot 1 holds the 8-wide comparators */
+    c->w53[1] = w53_8_c;
+}
+
+av_cold void ff_dwt_init(SnowDWTContext *c)
+{
+    /* Fill in the C routines, then pass the table to the x86 init if HAVE_MMX. */
+    c->inner_add_yblock      = ff_snow_inner_add_yblock;
+    c->horizontal_compose97i = ff_snow_horizontal_compose97i;
+    c->vertical_compose97i   = ff_snow_vertical_compose97i;
+    if (HAVE_MMX)
+        ff_dwt_init_x86(c);
+}
+
+
diff --git a/libavcodec/snow_dwt.h b/libavcodec/snow_dwt.h
new file mode 100644
index 0000000..e2d7528
--- /dev/null
+++ b/libavcodec/snow_dwt.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (C) 2004-2010 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_SNOW_DWT_H
+#define AVCODEC_SNOW_DWT_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+typedef int DWTELEM;
+typedef short IDWTELEM;
+
+#define MAX_DECOMPOSITIONS 8
+
+typedef struct DWTCompose {
+    IDWTELEM *b0;
+    IDWTELEM *b1;
+    IDWTELEM *b2;
+    IDWTELEM *b3;
+    int y;
+} DWTCompose;
+
+/** Used to minimize the amount of memory used in order to
+ *  optimize cache performance. **/
+typedef struct slice_buffer_s {
+    IDWTELEM **line;   ///< For use by idwt and predict_slices.
+    IDWTELEM **data_stack;   ///< Used for internal purposes.
+    int data_stack_top;
+    int line_count;
+    int line_width;
+    int data_count;
+    IDWTELEM *base_buffer;  ///< Buffer that this structure is caching.
+} slice_buffer;
+
+struct SnowDWTContext;
+struct MpegEncContext; /* named by the ff_w53_32_c()/ff_w97_32_c() prototypes below */
+typedef struct SnowDWTContext {
+    void (*vertical_compose97i)(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5,
+                                int width);
+    void (*horizontal_compose97i)(IDWTELEM *b, IDWTELEM *temp, int width);
+    void (*inner_add_yblock)(const uint8_t *obmc, const int obmc_stride,
+                             uint8_t **block, int b_w, int b_h, int src_x,
+                             int src_y, int src_stride, slice_buffer *sb,
+                             int add, uint8_t *dst8);
+} SnowDWTContext;
+
+
+#define DWT_97 0
+#define DWT_53 1
+
+#define liftS lift
+#define W_AM 3
+#define W_AO 0
+#define W_AS 1
+
+#undef liftS
+#define W_BM 1
+#define W_BO 8
+#define W_BS 4
+
+#define W_CM 1
+#define W_CO 0
+#define W_CS 0
+
+#define W_DM 3
+#define W_DO 4
+#define W_DS 3
+
+#define slice_buffer_get_line(slice_buf, line_num)                          \
+    ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num]              \
+                                 : ff_slice_buffer_load_line((slice_buf),   \
+                                                             (line_num)))
+
+int ff_slice_buffer_init(slice_buffer *buf, int line_count,
+                         int max_allocated_lines, int line_width,
+                         IDWTELEM *base_buffer);
+void ff_slice_buffer_release(slice_buffer *buf, int line);
+void ff_slice_buffer_flush(slice_buffer *buf);
+void ff_slice_buffer_destroy(slice_buffer *buf);
+IDWTELEM *ff_slice_buffer_load_line(slice_buffer *buf, int line);
+
+void ff_snow_vertical_compose97i(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+                                 IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5,
+                                 int width);
+void ff_snow_horizontal_compose97i(IDWTELEM *b, IDWTELEM *temp, int width);
+void ff_snow_inner_add_yblock(const uint8_t *obmc, const int obmc_stride,
+                              uint8_t **block, int b_w, int b_h, int src_x,
+                              int src_y, int src_stride, slice_buffer *sb,
+                              int add, uint8_t *dst8);
+
+int ff_w53_32_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t line_size, int h);
+int ff_w97_32_c(struct MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t line_size, int h);
+
+void ff_spatial_dwt(int *buffer, int *temp, int width, int height, int stride,
+                    int type, int decomposition_count);
+
+void ff_spatial_idwt_buffered_init(DWTCompose *cs, slice_buffer *sb, int width,
+                                   int height, int stride_line, int type,
+                                   int decomposition_count);
+void ff_spatial_idwt_buffered_slice(SnowDWTContext *dsp, DWTCompose *cs,
+                                    slice_buffer *slice_buf, IDWTELEM *temp,
+                                    int width, int height, int stride_line,
+                                    int type, int decomposition_count, int y);
+void ff_spatial_idwt(IDWTELEM *buffer, IDWTELEM *temp, int width, int height,
+                     int stride, int type, int decomposition_count);
+
+void ff_dwt_init(SnowDWTContext *c);
+void ff_dwt_init_x86(SnowDWTContext *c);
+
+#endif /* AVCODEC_SNOW_DWT_H */
diff --git a/libavcodec/snowdata.h b/libavcodec/snowdata.h
new file mode 100644
index 0000000..490fdf8
--- /dev/null
+++ b/libavcodec/snowdata.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2006 Robert Edele <yartrebo@earthlink.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_SNOWDATA_H
+#define AVCODEC_SNOWDATA_H
+
+#include "snow.h"
+
+static const uint8_t obmc32[1024]={
+  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
+  0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
+  0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
+  0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
+  4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
+  4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
+  4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
+  4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
+  4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
+  4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
+  4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
+  4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
+  8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
+  8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
+  8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
+  8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
+  8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
+  8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
+  8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
+  8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
+  4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
+  4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
+  4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
+  4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
+  4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
+  4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
+  4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
+  4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
+  0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
+  0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
+  0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
+  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
+ //error:0.000020
+};
+static const uint8_t obmc16[256]={
+  0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
+  4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
+  4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
+  8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
+  8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
+ 12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
+ 12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
+ 16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
+ 16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
+ 12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
+ 12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
+  8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
+  8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
+  4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
+  4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
+  0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
+//error:0.000015
+};
+
+//linear *64
+static const uint8_t obmc8[64]={
+  4, 12, 20, 28, 28, 20, 12,  4,
+ 12, 36, 60, 84, 84, 60, 36, 12,
+ 20, 60,100,140,140,100, 60, 20,
+ 28, 84,140,196,196,140, 84, 28,
+ 28, 84,140,196,196,140, 84, 28,
+ 20, 60,100,140,140,100, 60, 20,
+ 12, 36, 60, 84, 84, 60, 36, 12,
+  4, 12, 20, 28, 28, 20, 12,  4,
+//error:0.000000
+};
+
+//linear *64
+static const uint8_t obmc4[16]={
+ 16, 48, 48, 16,
+ 48,144,144, 48,
+ 48,144,144, 48,
+ 16, 48, 48, 16,
+//error:0.000000
+};
+
+const int8_t ff_quant3bA[256]={
+ 0, 0, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+};
+
+const uint8_t * const ff_obmc_tab[4]= {
+    obmc32, obmc16, obmc8, obmc4
+};
+
+/* runtime generated tables */
+uint8_t ff_qexp[QROOT];
+int ff_scale_mv_ref[MAX_REF_FRAMES][MAX_REF_FRAMES];
+
+
+#endif /* AVCODEC_SNOWDATA_H */
diff --git a/libavcodec/snowdec.c b/libavcodec/snowdec.c
new file mode 100644
index 0000000..042aecb
--- /dev/null
+++ b/libavcodec/snowdec.c
@@ -0,0 +1,651 @@
+/*
+ * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intmath.h"
+#include "libavutil/log.h"
+#include "libavutil/opt.h"
+#include "avcodec.h"
+#include "snow_dwt.h"
+#include "internal.h"
+#include "snow.h"
+
+#include "rangecoder.h"
+#include "mathops.h"
+
+#include "mpegvideo.h"
+#include "h263.h"
+
+static av_always_inline void predict_slice_buffered(SnowContext *s, slice_buffer * sb, IDWTELEM * old_buffer, int plane_index, int add, int mb_y){
+    Plane *p= &s->plane[plane_index];
+    const int mb_w= s->b_width  << s->block_max_depth;
+    const int mb_h= s->b_height << s->block_max_depth;
+    int x, y, mb_x;
+    int block_size = MB_SIZE >> s->block_max_depth;
+    int block_w    = plane_index ? block_size>>s->chroma_h_shift : block_size;
+    int block_h    = plane_index ? block_size>>s->chroma_v_shift : block_size;
+    const uint8_t *obmc  = plane_index ? ff_obmc_tab[s->block_max_depth+s->chroma_h_shift] : ff_obmc_tab[s->block_max_depth];
+    int obmc_stride= plane_index ? (2*block_size)>>s->chroma_h_shift : 2*block_size;
+    int ref_stride= s->current_picture->linesize[plane_index];
+    uint8_t *dst8= s->current_picture->data[plane_index];
+    int w= p->width;
+    int h= p->height;
+    /* Handles block row mb_y of one plane; keyframes (or debug bit 512) only bias/convert that row's lines. */
+    if(s->keyframe || (s->avctx->debug&512)){
+        if(mb_y==mb_h)
+            return;
+
+        if(add){
+            for(y=block_h*mb_y; y<FFMIN(h,block_h*(mb_y+1)); y++){
+                /* with add: offset by 128, round away FRAC_BITS and store clipped pixels in dst8 */
+                IDWTELEM * line = sb->line[y];
+                for(x=0; x<w; x++){
+                    /* 128 offset plus half a unit for rounding, both in FRAC_BITS fixed point */
+                    int v= line[x] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
+                    v >>= FRAC_BITS;
+                    if(v&(~255)) v= ~(v>>31); // outside 0..255: negative -> 0, otherwise all ones (255 once stored in dst8)
+                    dst8[x + y*ref_stride]= v;
+                }
+            }
+        }else{
+            for(y=block_h*mb_y; y<FFMIN(h,block_h*(mb_y+1)); y++){
+                /* without add: remove the 128 offset from the line held in the slice buffer */
+                IDWTELEM * line = sb->line[y];
+                for(x=0; x<w; x++){
+                    line[x] -= 128 << FRAC_BITS;
+                    /* NOTE(review): sb->line[y] is read directly, so the line is presumably loaded already - confirm */
+                }
+            }
+        }
+
+        return;
+    }
+    /* One add_yblock() per column, mb_x = 0..mb_w inclusive, placed half a block up/left, for block (mb_x-1, mb_y-1). */
+    for(mb_x=0; mb_x<=mb_w; mb_x++){
+        add_yblock(s, 1, sb, old_buffer, dst8, obmc,
+                   block_w*mb_x - block_w/2,
+                   block_h*mb_y - block_h/2,
+                   block_w, block_h,
+                   w, h,
+                   w, ref_stride, obmc_stride,
+                   mb_x - 1, mb_y - 1,
+                   add, 0, plane_index);
+    }
+    /* When s->avmv is set, append one AVMotionVector per block of this row (plane 0 only) whose type is 0. */
+    if(s->avmv && mb_y < mb_h && plane_index == 0)
+        for(mb_x=0; mb_x<mb_w; mb_x++){
+            AVMotionVector *avmv = s->avmv + s->avmv_index;
+            const int b_width = s->b_width  << s->block_max_depth;
+            const int b_stride= b_width;
+            BlockNode *bn= &s->block[mb_x + mb_y*b_stride];
+
+            if (bn->type)
+                continue;
+
+            s->avmv_index++;
+
+            avmv->w = block_w;
+            avmv->h = block_h;
+            avmv->dst_x = block_w*mb_x - block_w/2;
+            avmv->dst_y = block_h*mb_y - block_h/2;
+            avmv->motion_scale = 8;
+            avmv->motion_x = bn->mx * s->mv_scale;
+            avmv->motion_y = bn->my * s->mv_scale;
+            avmv->src_x = avmv->dst_x + avmv->motion_x / 8;
+            avmv->src_y = avmv->dst_y + avmv->motion_y / 8;
+            avmv->source= -1 - bn->ref;
+            avmv->flags = 0;
+        }
+}
+
+/* Expand rows [start_y, h) of band b from its (x, coeff) list into the slice
+ * buffer, dequantizing on the way; save_state[0] keeps the list position. */
+static inline void decode_subband_slice_buffered(SnowContext *s, SubBand *b, slice_buffer * sb, int start_y, int h, int save_state[1]){
+    const int w= b->width;
+    int y;
+    const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
+    int qmul= ff_qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
+    int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
+    int new_index = 0;
+
+    /* Identity scaling for the band whose ibuf is the start of the IDWT buffer, and when lossless. */
+    if(b->ibuf == s->spatial_idwt_buffer || s->qlog == LOSSLESS_QLOG){
+        qadd= 0;
+        qmul= 1<<QEXPSHIFT;
+    }
+
+    /* If we are on the second or later slice, restore our index. */
+    if (start_y != 0)
+        new_index = save_state[0];
+
+    for(y=start_y; y<h; y++){
+        int x, v;
+        IDWTELEM * line = slice_buffer_get_line(sb, y * b->stride_line + b->buf_y_offset) + b->buf_x_offset;
+        memset(line, 0, b->width*sizeof(IDWTELEM));
+        v = b->x_coeff[new_index].coeff;
+        x = b->x_coeff[new_index++].x;
+        while(x < w){ /* an entry with x >= w ends the row */
+            /* Bit 0 of v is the sign, the rest the magnitude. Multiply as
+             * unsigned: (v>>1)*qmul can exceed INT_MAX, which is UB for int. */
+            int t= (int)((v>>1)*(unsigned)qmul + qadd)>>QEXPSHIFT;
+            int u= -(v&1);
+            line[x] = (t^u) - u;
+            v = b->x_coeff[new_index].coeff;
+            x = b->x_coeff[new_index++].x;
+        }
+    }
+
+    /* Save our variables for the next slice. */
+    save_state[0] = new_index;
+}
+
+/* Decode the block at (x, y) on quadtree depth `level`: either one leaf (intra
+ * colour deltas, or ref + motion vector) or four children. Returns 0 or <0. */
+static int decode_q_branch(SnowContext *s, int level, int x, int y){
+    const int w= s->b_width << s->block_max_depth;
+    const int rem_depth= s->block_max_depth - level;
+    const int index= (x + y*w) << rem_depth;
+    int trx= (x+1)<<rem_depth;
+    const BlockNode *left  = x ? &s->block[index-1] : &null_block;
+    const BlockNode *top   = y ? &s->block[index-w] : &null_block;
+    const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
+    const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
+    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
+    int res;
+
+    if(s->keyframe){ /* keyframes: every block is intra with null_block's values, nothing is read */
+        set_blocks(s, level, x, y, null_block.color[0], null_block.color[1], null_block.color[2], null_block.mx, null_block.my, null_block.ref, BLOCK_INTRA);
+        return 0;
+    }
+
+    if(level==s->block_max_depth || get_rac(&s->c, &s->block_state[4 + s_context])){
+        int type, mx, my;
+        int l = left->color[0], cb= left->color[1], cr= left->color[2];
+        unsigned ref = 0;
+        int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
+        int mx_context= av_log2(2*FFABS(left->mx - top->mx)) + 0*av_log2(2*FFABS(tr->mx - top->mx));
+        int my_context= av_log2(2*FFABS(left->my - top->my)) + 0*av_log2(2*FFABS(tr->my - top->my));
+        /* Deltas come from the bitstream: add them as unsigned so a huge symbol wraps instead of overflowing int (UB). */
+        type= get_rac(&s->c, &s->block_state[1 + left->type + top->type]) ? BLOCK_INTRA : 0;
+
+        if(type){
+            pred_mv(s, &mx, &my, 0, left, top, tr);
+            l += (unsigned)get_symbol(&s->c, &s->block_state[32], 1);
+            if (s->nb_planes > 2) {
+                cb+= (unsigned)get_symbol(&s->c, &s->block_state[64], 1);
+                cr+= (unsigned)get_symbol(&s->c, &s->block_state[96], 1);
+            }
+        }else{
+            if(s->ref_frames > 1)
+                ref= get_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], 0);
+            if (ref >= s->ref_frames) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid ref\n");
+                return AVERROR_INVALIDDATA;
+            }
+            pred_mv(s, &mx, &my, ref, left, top, tr);
+            mx+= (unsigned)get_symbol(&s->c, &s->block_state[128 + 32*(mx_context + 16*!!ref)], 1);
+            my+= (unsigned)get_symbol(&s->c, &s->block_state[128 + 32*(my_context + 16*!!ref)], 1);
+        }
+        set_blocks(s, level, x, y, l, cb, cr, mx, my, ref, type);
+    }else{
+        if ((res = decode_q_branch(s, level+1, 2*x+0, 2*y+0)) < 0 ||
+            (res = decode_q_branch(s, level+1, 2*x+1, 2*y+0)) < 0 ||
+            (res = decode_q_branch(s, level+1, 2*x+0, 2*y+1)) < 0 ||
+            (res = decode_q_branch(s, level+1, 2*x+1, 2*y+1)) < 0)
+            return res;
+    }
+    return 0;
+}
+
+static void dequantize_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, IDWTELEM *src, int stride, int start_y, int end_y){
+    const int w= b->width;
+    const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
+    const int qmul= ff_qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
+    const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
+    int x,y;
+    /* Scales rows [start_y, end_y) of band b in place; src and stride are unused. Lossless data is left alone. */
+    if(s->qlog == LOSSLESS_QLOG) return;
+
+    for(y=start_y; y<end_y; y++){
+        /* Multiply as unsigned below: |i|*qmul may not fit in int, and signed overflow is UB. */
+        IDWTELEM * line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
+        for(x=0; x<w; x++){
+            int i= line[x];
+            if(i<0){
+                line[x]= -((int)(-i*(unsigned)qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
+            }else if(i>0){
+                line[x]=  ((int)( i*(unsigned)qmul + qadd)>>(QEXPSHIFT));
+            }
+        }
+    }
+}
+
+static void correlate_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, IDWTELEM *src, int stride, int inverse, int use_median, int start_y, int end_y){
+    const int w= b->width;
+    int x,y;
+    /* Adds a spatial prediction to every sample of rows [start_y, end_y) in place; src, stride and inverse are unused. */
+    IDWTELEM * line=0; // silence silly "could be used without having been initialized" warning
+    IDWTELEM * prev;
+    /* A slice that does not start at row 0 fetches the previous row so it can serve as the first "prev". */
+    if (start_y != 0)
+        line = slice_buffer_get_line(sb, ((start_y - 1) * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
+
+    for(y=start_y; y<end_y; y++){
+        prev = line;
+        /* prev is now the row above; it is only dereferenced on paths guarded by y != 0 */
+        line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
+        for(x=0; x<w; x++){
+            if(x){
+                if(use_median){
+                    if(y && x+1<w) line[x] += mid_pred(line[x - 1], prev[x], prev[x + 1]); // left, top, top-right
+                    else  line[x] += line[x - 1];
+                }else{
+                    if(y) line[x] += mid_pred(line[x - 1], prev[x], line[x - 1] + prev[x] - prev[x - 1]); // left, top, left+top-topleft
+                    else  line[x] += line[x - 1];
+                }
+            }else{
+                if(y) line[x] += prev[x]; // first column: only the sample above is available
+            }
+        }
+    }
+}
+
+static void decode_qlogs(SnowContext *s){
+    int p, lvl, ori;
+
+    /* Plane 2 copies plane 1 and orientation 2 copies orientation 1 of the
+     * same level; every other band reads its qlog with get_symbol(). */
+    for(p=0; p < s->nb_planes; p++)
+        for(lvl=0; lvl<s->spatial_decomposition_count; lvl++)
+            for(ori= lvl ? 1 : 0; ori<4; ori++){
+                int q;
+                if(p==2)        q= s->plane[1].band[lvl][ori].qlog;
+                else if(ori==2) q= s->plane[p].band[lvl][1].qlog;
+                else            q= get_symbol(&s->c, s->header_state, 1);
+                s->plane[p].band[lvl][ori].qlog= q;
+            }
+}
+
+#define GET_S(dst, check) /* relies on `s` and an int `tmp` being in scope at the call site */ \
+    tmp= get_symbol(&s->c, s->header_state, 0);\
+    if(!(check)){\
+        av_log(s->avctx, AV_LOG_ERROR, "Error " #dst " is %d\n", tmp);\
+        return AVERROR_INVALIDDATA;\
+    }\
+    dst= tmp; /* expands to several statements and already ends in ';': never use it as the body of an unbraced if/else */
+
+/**
+ * Parse the per-frame header from the range coder in s->c.
+ *
+ * Keyframes carry the full description: version, temporal/spatial
+ * decomposition settings, colorspace and chroma subsampling (which select
+ * avctx->pix_fmt and nb_planes), the reference frame count and the qlog
+ * tables. Non-keyframes may optionally update the per-plane filter taps and
+ * the spatial decomposition count. The trailing fields
+ * (spatial_decomposition_type, qlog, mv_scale, qbias, block_max_depth) are
+ * read as deltas added to the stored values, which are zeroed first on
+ * keyframes or when always_reset is set.
+ *
+ * @param s decoder context; s->c must already be initialised on the packet
+ * @return 0 on success, AVERROR_INVALIDDATA when a field is out of range
+ */
+static int decode_header(SnowContext *s){
+    int plane_index, tmp;
+    uint8_t kstate[32];
+
+    memset(kstate, MID_STATE, sizeof(kstate));
+
+    s->keyframe= get_rac(&s->c, kstate);
+    if(s->keyframe || s->always_reset){
+        ff_snow_reset_contexts(s);
+        s->spatial_decomposition_type=
+        s->qlog=
+        s->qbias=
+        s->mv_scale=
+        s->block_max_depth= 0;
+    }
+    if(s->keyframe){
+        GET_S(s->version, tmp <= 0U)
+        s->always_reset= get_rac(&s->c, s->header_state);
+        s->temporal_decomposition_type= get_symbol(&s->c, s->header_state, 0);
+        s->temporal_decomposition_count= get_symbol(&s->c, s->header_state, 0);
+        GET_S(s->spatial_decomposition_count, 0 < tmp && tmp <= MAX_DECOMPOSITIONS)
+        s->colorspace_type= get_symbol(&s->c, s->header_state, 0);
+        if (s->colorspace_type == 1) {
+            s->avctx->pix_fmt= AV_PIX_FMT_GRAY8;
+            s->nb_planes = 1;
+        } else if(s->colorspace_type == 0) {
+            s->chroma_h_shift= get_symbol(&s->c, s->header_state, 0);
+            s->chroma_v_shift= get_symbol(&s->c, s->header_state, 0);
+
+            if(s->chroma_h_shift == 1 && s->chroma_v_shift==1){
+                s->avctx->pix_fmt= AV_PIX_FMT_YUV420P;
+            }else if(s->chroma_h_shift == 0 && s->chroma_v_shift==0){
+                s->avctx->pix_fmt= AV_PIX_FMT_YUV444P;
+            }else if(s->chroma_h_shift == 2 && s->chroma_v_shift==2){
+                s->avctx->pix_fmt= AV_PIX_FMT_YUV410P;
+            } else {
+                av_log(s->avctx, AV_LOG_ERROR, "unsupported color subsample mode %d %d\n", s->chroma_h_shift, s->chroma_v_shift);
+                // leave the context on a supported layout before bailing out
+                s->chroma_h_shift = s->chroma_v_shift = 1;
+                s->avctx->pix_fmt= AV_PIX_FMT_YUV420P;
+                return AVERROR_INVALIDDATA;
+            }
+            s->nb_planes = 3;
+        } else {
+            av_log(s->avctx, AV_LOG_ERROR, "unsupported color space\n");
+            s->chroma_h_shift = s->chroma_v_shift = 1;
+            s->avctx->pix_fmt= AV_PIX_FMT_YUV420P;
+            return AVERROR_INVALIDDATA;
+        }
+
+
+        s->spatial_scalability= get_rac(&s->c, s->header_state);
+//        s->rate_scalability= get_rac(&s->c, s->header_state);
+        GET_S(s->max_ref_frames, tmp < (unsigned)MAX_REF_FRAMES)
+        s->max_ref_frames++;
+
+        decode_qlogs(s);
+    }
+
+    if(!s->keyframe){
+        if(get_rac(&s->c, s->header_state)){
+            for(plane_index=0; plane_index<FFMIN(s->nb_planes, 2); plane_index++){
+                int htaps, i, sum=0;
+                Plane *p= &s->plane[plane_index];
+                p->diag_mc= get_rac(&s->c, s->header_state);
+                htaps= get_symbol(&s->c, s->header_state, 0)*2 + 2;
+                // The loop below writes hcoeff[htaps/2] .. hcoeff[1].
+                // NOTE(review): hcoeff is expected to hold HTAPS_MAX/2 entries
+                // (declared in snow.h, confirm), so htaps == HTAPS_MAX must be
+                // rejected as well or the first write lands one past the end.
+                if((unsigned)htaps >= HTAPS_MAX || htaps==0)
+                    return AVERROR_INVALIDDATA;
+                p->htaps= htaps;
+                for(i= htaps/2; i; i--){
+                    p->hcoeff[i]= get_symbol(&s->c, s->header_state, 0) * (1-2*(i&1));
+                    sum += p->hcoeff[i];
+                }
+                // centre tap is chosen so that hcoeff[0] plus the coded taps equals 32
+                p->hcoeff[0]= 32-sum;
+            }
+            // the third plane always mirrors the second plane's filter
+            s->plane[2].diag_mc= s->plane[1].diag_mc;
+            s->plane[2].htaps  = s->plane[1].htaps;
+            memcpy(s->plane[2].hcoeff, s->plane[1].hcoeff, sizeof(s->plane[1].hcoeff));
+        }
+        if(get_rac(&s->c, s->header_state)){
+            GET_S(s->spatial_decomposition_count, 0 < tmp && tmp <= MAX_DECOMPOSITIONS)
+            decode_qlogs(s);
+        }
+    }
+
+    s->spatial_decomposition_type+= get_symbol(&s->c, s->header_state, 1);
+    if(s->spatial_decomposition_type > 1U){
+        av_log(s->avctx, AV_LOG_ERROR, "spatial_decomposition_type %d not supported\n", s->spatial_decomposition_type);
+        return AVERROR_INVALIDDATA;
+    }
+    // The count is only ever assigned through GET_S above; if no header has
+    // set it yet (e.g. the stream does not start with a keyframe) it can still
+    // be 0, and count-1 must not be used as a shift amount.
+    if(s->spatial_decomposition_count < 1){
+        av_log(s->avctx, AV_LOG_ERROR, "spatial_decomposition_count %d is invalid\n", s->spatial_decomposition_count);
+        return AVERROR_INVALIDDATA;
+    }
+    if(FFMIN(s->avctx-> width>>s->chroma_h_shift,
+             s->avctx->height>>s->chroma_v_shift) >> (s->spatial_decomposition_count-1) <= 1){
+        av_log(s->avctx, AV_LOG_ERROR, "spatial_decomposition_count %d too large for size\n", s->spatial_decomposition_count);
+        return AVERROR_INVALIDDATA;
+    }
+
+
+    s->qlog           += get_symbol(&s->c, s->header_state, 1);
+    s->mv_scale       += get_symbol(&s->c, s->header_state, 1);
+    s->qbias          += get_symbol(&s->c, s->header_state, 1);
+    s->block_max_depth+= get_symbol(&s->c, s->header_state, 1);
+    if(s->block_max_depth > 1 || s->block_max_depth < 0){
+        av_log(s->avctx, AV_LOG_ERROR, "block_max_depth= %d is too large\n", s->block_max_depth);
+        s->block_max_depth= 0;
+        return AVERROR_INVALIDDATA;
+    }
+
+    return 0;
+}
+
+/**
+ * Decoder init callback: all the work is done by ff_snow_common_init().
+ *
+ * @return 0 on success, otherwise the negative value reported by
+ *         ff_snow_common_init()
+ */
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    const int err = ff_snow_common_init(avctx);
+
+    return err < 0 ? err : 0;
+}
+
+/**
+ * Call decode_q_branch() at level 0 for every position of the
+ * b_width x b_height grid, row by row.
+ *
+ * @return 0 on success, or the first negative value returned by
+ *         decode_q_branch()
+ */
+static int decode_blocks(SnowContext *s){
+    const int cols = s->b_width;
+    const int rows = s->b_height;
+    int bx, by;
+
+    for (by = 0; by < rows; by++) {
+        for (bx = 0; bx < cols; bx++) {
+            const int err = decode_q_branch(s, 0, bx, by);
+
+            if (err < 0)
+                return err;
+        }
+    }
+
+    return 0;
+}
+
+/**
+ * Decode one packet into a picture.
+ *
+ * Sets up the range decoder on the packet, parses the header, rebuilds the
+ * slice buffer, decodes the blocks and then, for every plane, unpacks the
+ * subband coefficients and runs the buffered inverse DWT and prediction one
+ * macroblock row at a time. The result is handed out as a new reference to
+ * s->current_picture (or s->mconly_picture when debug bit 2048 is set).
+ *
+ * If AV_CODEC_FLAG2_EXPORT_MVS is set, the first s->avmv_index entries of
+ * s->avmv are copied into AV_FRAME_DATA_MOTION_VECTORS side data. s->avmv is
+ * per-call scratch memory: it has to be released on every exit path taken
+ * after its allocation, otherwise it leaks and the av_assert0(!s->avmv) at
+ * the start of the next call aborts.
+ *
+ * @param avctx     codec context; priv_data is the SnowContext
+ * @param data      AVFrame receiving the output picture reference
+ * @param got_frame set to 1 once a picture has been output
+ * @param avpkt     input packet
+ * @return number of bytes the range coder consumed, or a negative AVERROR
+ */
+static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
+                        AVPacket *avpkt)
+{
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
+    SnowContext *s = avctx->priv_data;
+    RangeCoder * const c= &s->c;
+    int bytes_read;
+    AVFrame *picture = data;
+    int level, orientation, plane_index;
+    int res;
+
+    ff_init_range_decoder(c, buf, buf_size);
+    ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
+
+    s->current_picture->pict_type= AV_PICTURE_TYPE_I; //FIXME I vs. P
+    if ((res = decode_header(s)) < 0)
+        return res;
+    if ((res=ff_snow_common_init_after_header(avctx)) < 0)
+        return res;
+
+    // realloc slice buffer for the case that spatial_decomposition_count changed
+    ff_slice_buffer_destroy(&s->sb);
+    if ((res = ff_slice_buffer_init(&s->sb, s->plane[0].height,
+                                    (MB_SIZE >> s->block_max_depth) +
+                                    s->spatial_decomposition_count * 11 + 1,
+                                    s->plane[0].width,
+                                    s->spatial_idwt_buffer)) < 0)
+        return res;
+
+    // fast_mc is only valid for the exact 6-tap filter {40, -10, 2} with diag_mc
+    for(plane_index=0; plane_index < s->nb_planes; plane_index++){
+        Plane *p= &s->plane[plane_index];
+        p->fast_mc= p->diag_mc && p->htaps==6 && p->hcoeff[0]==40
+                                              && p->hcoeff[1]==-10
+                                              && p->hcoeff[2]==2;
+    }
+
+    ff_snow_alloc_blocks(s);
+
+    if((res = ff_snow_frame_start(s)) < 0)
+        return res;
+
+    s->current_picture->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
+
+    //keyframe flag duplication mess FIXME
+    if(avctx->debug&FF_DEBUG_PICT_INFO)
+        av_log(avctx, AV_LOG_ERROR,
+               "keyframe:%d qlog:%d qbias: %d mvscale: %d "
+               "decomposition_type:%d decomposition_count:%d\n",
+               s->keyframe, s->qlog, s->qbias, s->mv_scale,
+               s->spatial_decomposition_type,
+               s->spatial_decomposition_count
+              );
+
+    av_assert0(!s->avmv);
+    if (s->avctx->flags2 & AV_CODEC_FLAG2_EXPORT_MVS) {
+        s->avmv = av_malloc_array(s->b_width * s->b_height, sizeof(AVMotionVector) << (s->block_max_depth*2));
+    }
+    s->avmv_index = 0;
+
+    if ((res = decode_blocks(s)) < 0) {
+        // release the scratch array so the next call's assert holds
+        av_freep(&s->avmv);
+        return res;
+    }
+
+    for(plane_index=0; plane_index < s->nb_planes; plane_index++){
+        Plane *p= &s->plane[plane_index];
+        int w= p->width;
+        int h= p->height;
+        int x, y;
+        int decode_state[MAX_DECOMPOSITIONS][4][1]; /* Stored state info for unpack_coeffs. 1 variable per instance. */
+
+        // debug bit 2048: keep a copy of the prediction-only picture
+        if(s->avctx->debug&2048){
+            memset(s->spatial_dwt_buffer, 0, sizeof(DWTELEM)*w*h);
+            predict_plane(s, s->spatial_idwt_buffer, plane_index, 1);
+
+            for(y=0; y<h; y++){
+                for(x=0; x<w; x++){
+                    int v= s->current_picture->data[plane_index][y*s->current_picture->linesize[plane_index] + x];
+                    s->mconly_picture->data[plane_index][y*s->mconly_picture->linesize[plane_index] + x]= v;
+                }
+            }
+        }
+
+        {
+        for(level=0; level<s->spatial_decomposition_count; level++){
+            for(orientation=level ? 1 : 0; orientation<4; orientation++){
+                SubBand *b= &p->band[level][orientation];
+                unpack_coeffs(s, b, b->parent, orientation);
+            }
+        }
+        }
+
+        {
+        const int mb_h= s->b_height << s->block_max_depth;
+        const int block_size = MB_SIZE >> s->block_max_depth;
+        const int block_h    = plane_index ? block_size>>s->chroma_v_shift : block_size;
+        int mb_y;
+        DWTCompose cs[MAX_DECOMPOSITIONS];
+        int yd=0, yq=0;
+        int y;
+        int end_y;
+
+        ff_spatial_idwt_buffered_init(cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count);
+        for(mb_y=0; mb_y<=mb_h; mb_y++){
+
+            int slice_starty = block_h*mb_y;
+            int slice_h = block_h*(mb_y+1);
+
+            // unless keyframe or debug bit 512, the slice is moved up by half a block
+            if (!(s->keyframe || s->avctx->debug&512)){
+                slice_starty = FFMAX(0, slice_starty - (block_h >> 1));
+                slice_h -= (block_h >> 1);
+            }
+
+            for(level=0; level<s->spatial_decomposition_count; level++){
+                for(orientation=level ? 1 : 0; orientation<4; orientation++){
+                    SubBand *b= &p->band[level][orientation];
+                    int start_y;
+                    int end_y;
+                    int our_mb_start = mb_y;
+                    int our_mb_end = (mb_y + 1);
+                    const int extra= 3;
+                    start_y = (mb_y ? ((block_h * our_mb_start) >> (s->spatial_decomposition_count - level)) + s->spatial_decomposition_count - level + extra: 0);
+                    end_y = (((block_h * our_mb_end) >> (s->spatial_decomposition_count - level)) + s->spatial_decomposition_count - level + extra);
+                    if (!(s->keyframe || s->avctx->debug&512)){
+                        start_y = FFMAX(0, start_y - (block_h >> (1+s->spatial_decomposition_count - level)));
+                        end_y = FFMAX(0, end_y - (block_h >> (1+s->spatial_decomposition_count - level)));
+                    }
+                    start_y = FFMIN(b->height, start_y);
+                    end_y = FFMIN(b->height, end_y);
+
+                    if (start_y != end_y){
+                        if (orientation == 0){
+                            SubBand * correlate_band = &p->band[0][0];
+                            int correlate_end_y = FFMIN(b->height, end_y + 1);
+                            int correlate_start_y = FFMIN(b->height, (start_y ? start_y + 1 : 0));
+                            decode_subband_slice_buffered(s, correlate_band, &s->sb, correlate_start_y, correlate_end_y, decode_state[0][0]);
+                            correlate_slice_buffered(s, &s->sb, correlate_band, correlate_band->ibuf, correlate_band->stride, 1, 0, correlate_start_y, correlate_end_y);
+                            dequantize_slice_buffered(s, &s->sb, correlate_band, correlate_band->ibuf, correlate_band->stride, start_y, end_y);
+                        }
+                        else
+                            decode_subband_slice_buffered(s, b, &s->sb, start_y, end_y, decode_state[level][orientation]);
+                    }
+                }
+            }
+
+            for(; yd<slice_h; yd+=4){
+                ff_spatial_idwt_buffered_slice(&s->dwt, cs, &s->sb, s->temp_idwt_buffer, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count, yd);
+            }
+
+            if(s->qlog == LOSSLESS_QLOG){
+                for(; yq<slice_h && yq<h; yq++){
+                    IDWTELEM * line = slice_buffer_get_line(&s->sb, yq);
+                    for(x=0; x<w; x++){
+                        line[x] <<= FRAC_BITS;
+                    }
+                }
+            }
+
+            predict_slice_buffered(s, &s->sb, s->spatial_idwt_buffer, plane_index, 1, mb_y);
+
+            // lines of this slice are finished; hand them back to the slice buffer
+            y = FFMIN(p->height, slice_starty);
+            end_y = FFMIN(p->height, slice_h);
+            while(y < end_y)
+                ff_slice_buffer_release(&s->sb, y++);
+        }
+
+        ff_slice_buffer_flush(&s->sb);
+        }
+
+    }
+
+    emms_c();
+
+    ff_snow_release_buffer(avctx);
+
+    if(!(s->avctx->debug&2048))
+        res = av_frame_ref(picture, s->current_picture);
+    else
+        res = av_frame_ref(picture, s->mconly_picture);
+    if (res >= 0 && s->avmv_index) {
+        AVFrameSideData *sd;
+
+        sd = av_frame_new_side_data(picture, AV_FRAME_DATA_MOTION_VECTORS, s->avmv_index * sizeof(AVMotionVector));
+        if (!sd) {
+            // same cleanup as the regular path below, which is skipped here
+            av_freep(&s->avmv);
+            return AVERROR(ENOMEM);
+        }
+        memcpy(sd->data, s->avmv, s->avmv_index * sizeof(AVMotionVector));
+    }
+
+    av_freep(&s->avmv);
+
+    if (res < 0)
+        return res;
+
+    *got_frame = 1;
+
+    bytes_read= c->bytestream - c->bytestream_start;
+    if(bytes_read ==0) av_log(s->avctx, AV_LOG_ERROR, "error at end of frame\n"); //FIXME
+
+    return bytes_read;
+}
+
+/**
+ * Decoder close callback: destroys the slice buffer, then runs
+ * ff_snow_common_end() on the context. Always returns 0.
+ */
+static av_cold int decode_end(AVCodecContext *avctx)
+{
+    SnowContext *snow = avctx->priv_data;
+
+    ff_slice_buffer_destroy(&snow->sb);
+    ff_snow_common_end(snow);
+
+    return 0;
+}
+
+/*
+ * Codec table entry for the Snow video decoder. It binds decode_init,
+ * decode_frame and decode_end as the init/decode/close callbacks and sizes
+ * the private context as a SnowContext.
+ */
+AVCodec ff_snow_decoder = {
+    .name           = "snow",
+    .long_name      = NULL_IF_CONFIG_SMALL("Snow"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_SNOW,
+    .priv_data_size = sizeof(SnowContext),
+    .init           = decode_init,
+    .close          = decode_end,
+    .decode         = decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1 /*| AV_CODEC_CAP_DRAW_HORIZ_BAND*/,
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE |
+                      FF_CODEC_CAP_INIT_CLEANUP,
+};
diff --git a/libavcodec/snowenc-test.c b/libavcodec/snowenc-test.c
new file mode 100644
index 0000000..e1ed86f
--- /dev/null
+++ b/libavcodec/snowenc-test.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "snowenc.c"
+
+#undef malloc
+#undef free
+#undef printf
+
+#include "libavutil/lfg.h"
+#include "libavutil/mathematics.h"
+
+/*
+ * Stand-alone self test for the Snow wavelet transforms.
+ *
+ * 1. Round-trips pseudo-random data through ff_spatial_dwt()/ff_spatial_idwt()
+ *    with decomposition type 1 ("5/3") and reports every sample that differs.
+ * 2. Does the same with type 0 ("9/7"), reporting differences larger than 20.
+ * 3. Feeds a single impulse into each subband, inverse transforms it and
+ *    prints the resulting norms (divided by their gcd) as a C table named
+ *    visual_weight, followed by a dump of a forward transform of a fixed
+ *    2x2 pattern.
+ *
+ * Returns 0 after printing, 1 if the scratch buffers cannot be allocated.
+ * Only the SnowContext fields assigned below are used.
+ */
+int main(void){
+#define width  256
+#define height 256
+    int buffer[2][width*height];
+    SnowContext s;
+    int i;
+    AVLFG prng;
+    s.spatial_decomposition_count=6;
+    s.spatial_decomposition_type=1;
+
+    s.temp_dwt_buffer  = av_mallocz_array(width, sizeof(DWTELEM));
+    s.temp_idwt_buffer = av_mallocz_array(width, sizeof(IDWTELEM));
+
+    if (!s.temp_dwt_buffer || !s.temp_idwt_buffer) {
+        fprintf(stderr, "Failed to allocate memory\n");
+        // one of the two may have succeeded; av_freep() accepts NULL
+        av_freep(&s.temp_dwt_buffer);
+        av_freep(&s.temp_idwt_buffer);
+        return 1;
+    }
+
+    av_lfg_init(&prng, 1);
+
+    printf("testing 5/3 DWT\n");
+    for(i=0; i<width*height; i++)
+        buffer[0][i] = buffer[1][i] = av_lfg_get(&prng) % 54321 - 12345;
+
+    ff_spatial_dwt(buffer[0], s.temp_dwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
+    ff_spatial_idwt((IDWTELEM*)buffer[0], s.temp_idwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
+
+    // this round trip is expected to reproduce the input exactly
+    for(i=0; i<width*height; i++)
+        if(buffer[0][i]!= buffer[1][i]) printf("fsck: %6d %12d %7d\n",i, buffer[0][i], buffer[1][i]);
+
+    printf("testing 9/7 DWT\n");
+    s.spatial_decomposition_type=0;
+    for(i=0; i<width*height; i++)
+        buffer[0][i] = buffer[1][i] = av_lfg_get(&prng) % 54321 - 12345;
+
+    ff_spatial_dwt(buffer[0], s.temp_dwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
+    ff_spatial_idwt((IDWTELEM*)buffer[0], s.temp_idwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
+
+    // here only deviations above 20 are reported
+    for(i=0; i<width*height; i++)
+        if(FFABS(buffer[0][i] - buffer[1][i])>20) printf("fsck: %6d %12d %7d\n",i, buffer[0][i], buffer[1][i]);
+
+    {
+    int level, orientation, x, y;
+    int64_t errors[8][4];
+    int64_t g=0;
+
+        memset(errors, 0, sizeof(errors));
+        s.spatial_decomposition_count=3;
+        s.spatial_decomposition_type=0;
+        for(level=0; level<s.spatial_decomposition_count; level++){
+            for(orientation=level ? 1 : 0; orientation<4; orientation++){
+                int w= width  >> (s.spatial_decomposition_count-level);
+                int h= height >> (s.spatial_decomposition_count-level);
+                int stride= width  << (s.spatial_decomposition_count-level);
+                DWTELEM *buf= buffer[0];
+                int64_t error=0;
+
+                // move buf to the start of the selected subband
+                if(orientation&1) buf+=w;
+                if(orientation>1) buf+=stride>>1;
+
+                // single impulse in the middle of the subband, everything else zero
+                memset(buffer[0], 0, sizeof(int)*width*height);
+                buf[w/2 + h/2*stride]= 256*256;
+                ff_spatial_idwt((IDWTELEM*)buffer[0], s.temp_idwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
+                for(y=0; y<height; y++){
+                    for(x=0; x<width; x++){
+                        int64_t d= buffer[0][x + y*width];
+                        error += d*d;
+                        if(FFABS(width/2-x)<9 && FFABS(height/2-y)<9 && level==2) printf("%8"PRId64" ", d);
+                    }
+                    if(FFABS(height/2-y)<9 && level==2) printf("\n");
+                }
+                error= (int)(sqrt(error)+0.5);
+                errors[level][orientation]= error;
+                if(g) g=av_gcd(g, error);
+                else g= error;
+            }
+        }
+        printf("static int const visual_weight[][4]={\n");
+        for(level=0; level<s.spatial_decomposition_count; level++){
+            printf("  {");
+            for(orientation=0; orientation<4; orientation++){
+                printf("%8"PRId64",", errors[level][orientation]/g);
+            }
+            printf("},\n");
+        }
+        printf("};\n");
+        {
+            int level=2;
+            int w= width  >> (s.spatial_decomposition_count-level);
+            //int h= height >> (s.spatial_decomposition_count-level);
+            int stride= width  << (s.spatial_decomposition_count-level);
+            DWTELEM *buf= buffer[0];
+            int64_t error=0;
+
+            buf+=w;
+            buf+=stride>>1;
+
+            memset(buffer[0], 0, sizeof(int)*width*height);
+            for(y=0; y<height; y++){
+                for(x=0; x<width; x++){
+                    int tab[4]={0,2,3,1};
+                    buffer[0][x+width*y]= 256*256*tab[(x&1) + 2*(y&1)];
+                }
+            }
+            ff_spatial_dwt(buffer[0], s.temp_dwt_buffer, width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
+            for(y=0; y<height; y++){
+                for(x=0; x<width; x++){
+                    int64_t d= buffer[0][x + y*width];
+                    error += d*d;
+                    if(FFABS(width/2-x)<9 && FFABS(height/2-y)<9) printf("%8"PRId64" ", d);
+                }
+                if(FFABS(height/2-y)<9) printf("\n");
+            }
+        }
+
+    }
+
+    // release the scratch lines allocated at the top
+    av_freep(&s.temp_dwt_buffer);
+    av_freep(&s.temp_idwt_buffer);
+    return 0;
+}
diff --git a/libavcodec/snowenc.c b/libavcodec/snowenc.c
new file mode 100644
index 0000000..00aef57
--- /dev/null
+++ b/libavcodec/snowenc.c
@@ -0,0 +1,1957 @@
+/*
+ * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intmath.h"
+#include "libavutil/libm.h"
+#include "libavutil/log.h"
+#include "libavutil/opt.h"
+#include "avcodec.h"
+#include "internal.h"
+#include "snow_dwt.h"
+#include "snow.h"
+
+#include "rangecoder.h"
+#include "mathops.h"
+
+#include "mpegvideo.h"
+#include "h263.h"
+
+#define FF_ME_ITER 50
+
+/**
+ * Encoder init callback.
+ *
+ * Validates the option combination, sets the default filter (6 taps,
+ * {40, -10, 2}, diag_mc, fast_mc) on all three planes, runs the common Snow
+ * init, allocates the motion estimation scratch buffers, initialises rate
+ * control when it is needed, derives nb_planes / colorspace_type / chroma
+ * shifts from avctx->pix_fmt and allocates the input picture. With
+ * FF_ME_ITER motion estimation the per-reference mv and score arrays are
+ * allocated as well.
+ *
+ * @return 0 on success, a negative AVERROR code otherwise
+ */
+static av_cold int encode_init(AVCodecContext *avctx)
+{
+    SnowContext *s = avctx->priv_data;
+    int plane_index, ret;
+    int i;
+
+#if FF_API_PRIVATE_OPT
+FF_DISABLE_DEPRECATION_WARNINGS
+    if (avctx->prediction_method)
+        s->pred = avctx->prediction_method;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+
+    // qscale mode with global_quality 0 is treated as lossless here
+    if(s->pred == DWT_97
+       && (avctx->flags & AV_CODEC_FLAG_QSCALE)
+       && avctx->global_quality == 0){
+        av_log(avctx, AV_LOG_ERROR, "The 9/7 wavelet is incompatible with lossless mode.\n");
+        return AVERROR(EINVAL);
+    }
+#if FF_API_MOTION_EST
+FF_DISABLE_DEPRECATION_WARNINGS
+    if (avctx->me_method == ME_ITER)
+        s->motion_est = FF_ME_ITER;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+
+    s->spatial_decomposition_type= s->pred; //FIXME add decorrelator type r transform_type
+
+    s->mv_scale       = (avctx->flags & AV_CODEC_FLAG_QPEL) ? 2 : 4;
+    s->block_max_depth= (avctx->flags & AV_CODEC_FLAG_4MV ) ? 1 : 0;
+
+    for(plane_index=0; plane_index<3; plane_index++){
+        s->plane[plane_index].diag_mc= 1;
+        s->plane[plane_index].htaps= 6;
+        s->plane[plane_index].hcoeff[0]=  40;
+        s->plane[plane_index].hcoeff[1]= -10;
+        s->plane[plane_index].hcoeff[2]=   2;
+        s->plane[plane_index].fast_mc= 1;
+    }
+
+    if ((ret = ff_snow_common_init(avctx)) < 0) {
+        return ret;
+    }
+    ff_mpegvideoencdsp_init(&s->mpvencdsp, avctx);
+
+    ff_snow_alloc_blocks(s);
+
+    s->version=0;
+
+    s->m.avctx   = avctx;
+    s->m.bit_rate= avctx->bit_rate;
+
+    // temp and scratchpad share one allocation
+    s->m.me.temp      =
+    s->m.me.scratchpad= av_mallocz_array((avctx->width+64), 2*16*2*sizeof(uint8_t));
+    s->m.me.map       = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
+    s->m.me.score_map = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
+    s->m.sc.obmc_scratchpad= av_mallocz(MB_SIZE*MB_SIZE*12*sizeof(uint32_t));
+    if (!s->m.me.scratchpad || !s->m.me.map || !s->m.me.score_map || !s->m.sc.obmc_scratchpad)
+        return AVERROR(ENOMEM);
+
+    ff_h263_encode_init(&s->m); //mv_penalty
+
+    s->max_ref_frames = av_clip(avctx->refs, 1, MAX_REF_FRAMES);
+
+    if(avctx->flags&AV_CODEC_FLAG_PASS1){
+        if(!avctx->stats_out)
+            avctx->stats_out = av_mallocz(256);
+
+        if (!avctx->stats_out)
+            return AVERROR(ENOMEM);
+    }
+    // rate control is needed for the second pass and whenever qscale is not fixed
+    if((avctx->flags&AV_CODEC_FLAG_PASS2) || !(avctx->flags&AV_CODEC_FLAG_QSCALE)){
+        if((ret = ff_rate_control_init(&s->m)) < 0)
+            return ret;
+    }
+    s->pass1_rc= !(avctx->flags & (AV_CODEC_FLAG_QSCALE|AV_CODEC_FLAG_PASS2));
+
+    switch(avctx->pix_fmt){
+    case AV_PIX_FMT_YUV444P:
+//    case AV_PIX_FMT_YUV422P:
+    case AV_PIX_FMT_YUV420P:
+//    case AV_PIX_FMT_YUV411P:
+    case AV_PIX_FMT_YUV410P:
+        s->nb_planes = 3;
+        s->colorspace_type= 0;
+        break;
+    case AV_PIX_FMT_GRAY8:
+        s->nb_planes = 1;
+        s->colorspace_type = 1;
+        break;
+/*    case AV_PIX_FMT_RGB32:
+        s->colorspace= 1;
+        break;*/
+    default:
+        av_log(avctx, AV_LOG_ERROR, "pixel format not supported\n");
+        return AVERROR(EINVAL);
+    }
+    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_h_shift, &s->chroma_v_shift);
+
+    ff_set_cmp(&s->mecc, s->mecc.me_cmp, s->avctx->me_cmp);
+    ff_set_cmp(&s->mecc, s->mecc.me_sub_cmp, s->avctx->me_sub_cmp);
+
+    s->input_picture = av_frame_alloc();
+    if (!s->input_picture)
+        return AVERROR(ENOMEM);
+
+    if ((ret = ff_snow_get_buffer(s, s->input_picture)) < 0)
+        return ret;
+
+    if(s->motion_est == FF_ME_ITER){
+        // one entry per block at the deepest split level
+        int size= s->b_width * s->b_height << 2*s->block_max_depth;
+        for(i=0; i<s->max_ref_frames; i++){
+            s->ref_mvs[i]= av_mallocz_array(size, sizeof(int16_t[2]));
+            s->ref_scores[i]= av_mallocz_array(size, sizeof(uint32_t));
+            if (!s->ref_mvs[i] || !s->ref_scores[i])
+                return AVERROR(ENOMEM);
+        }
+    }
+
+    return 0;
+}
+
+//FIXME near duplicate of the dsputil routine
+/**
+ * Sum all samples of a w x h rectangle.
+ *
+ * @param pix       top-left sample of the rectangle
+ * @param line_size distance between the starts of two rows, in samples
+ * @param w         rectangle width
+ * @param h         rectangle height
+ * @return sum of the w*h samples
+ */
+static int pix_sum(uint8_t * pix, int line_size, int w, int h)
+{
+    int total = 0;
+    int row, col;
+
+    for (row = 0; row < h; row++, pix += line_size) {
+        for (col = 0; col < w; col++)
+            total += pix[col];
+    }
+
+    return total;
+}
+
+//FIXME near duplicate of the dsputil routine
+/**
+ * Sum ff_square_tab[256 + sample] over a w x w square of samples.
+ *
+ * @param pix       top-left sample of the square
+ * @param line_size distance between the starts of two rows, in samples
+ * @param w         side length of the square
+ * @return accumulated table values
+ */
+static int pix_norm1(uint8_t * pix, int line_size, int w)
+{
+    uint32_t *sq = ff_square_tab + 256;
+    int total = 0;
+    int row, col;
+
+    for (row = 0; row < w; row++, pix += line_size) {
+        for (col = 0; col < w; col++)
+            total += sq[pix[col]];
+    }
+
+    return total;
+}
+
+/**
+ * Derive a penalty factor from lambda / lambda2 for a comparison function.
+ *
+ * Only the low 8 bits of type select the function. Ids that are not listed
+ * are treated like FF_CMP_SAD.
+ *
+ * @param lambda  scaled by a per-function factor for SAD-like metrics
+ * @param lambda2 used for the RD / PSNR / SSE / NSSE metrics
+ * @param type    FF_CMP_* id, possibly with flag bits above bit 7
+ */
+static inline int get_penalty_factor(int lambda, int lambda2, int type){
+    const int cmp = type & 0xFF;
+
+    if (cmp == FF_CMP_BIT)
+        return 1;
+
+    if (cmp == FF_CMP_RD || cmp == FF_CMP_PSNR ||
+        cmp == FF_CMP_SSE || cmp == FF_CMP_NSSE)
+        return lambda2>>FF_LAMBDA_SHIFT;
+
+    if (cmp == FF_CMP_DCT)
+        return (3*lambda)>>(FF_LAMBDA_SHIFT+1);
+
+    if (cmp == FF_CMP_W53)
+        return (4*lambda)>>(FF_LAMBDA_SHIFT);
+
+    if (cmp == FF_CMP_W97 || cmp == FF_CMP_SATD || cmp == FF_CMP_DCT264)
+        return (2*lambda)>>FF_LAMBDA_SHIFT;
+
+    /* FF_CMP_SAD and every other id */
+    return lambda>>FF_LAMBDA_SHIFT;
+}
+
+//FIXME copy&paste
+/*
+ * Named slots of the local predictor table `int P[10][2]` used by
+ * encode_q_branch(); element [0] of a slot is the x component and [1] the y
+ * component. The table is handed to ff_epzs_motion_search().
+ */
+#define P_LEFT P[1]
+#define P_TOP P[2]
+#define P_TOPRIGHT P[3]
+#define P_MEDIAN P[4]
+#define P_MV1 P[9]
+#define FLAG_QPEL   1 //must be 1
+
+static int encode_q_branch(SnowContext *s, int level, int x, int y){
+    uint8_t p_buffer[1024];
+    uint8_t i_buffer[1024];
+    uint8_t p_state[sizeof(s->block_state)];
+    uint8_t i_state[sizeof(s->block_state)];
+    RangeCoder pc, ic;
+    uint8_t *pbbak= s->c.bytestream;
+    uint8_t *pbbak_start= s->c.bytestream_start;
+    int score, score2, iscore, i_len, p_len, block_s, sum, base_bits;
+    const int w= s->b_width  << s->block_max_depth;
+    const int h= s->b_height << s->block_max_depth;
+    const int rem_depth= s->block_max_depth - level;
+    const int index= (x + y*w) << rem_depth;
+    const int block_w= 1<<(LOG2_MB_SIZE - level);
+    int trx= (x+1)<<rem_depth;
+    int try= (y+1)<<rem_depth;
+    const BlockNode *left  = x ? &s->block[index-1] : &null_block;
+    const BlockNode *top   = y ? &s->block[index-w] : &null_block;
+    const BlockNode *right = trx<w ? &s->block[index+1] : &null_block;
+    const BlockNode *bottom= try<h ? &s->block[index+w] : &null_block;
+    const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
+    const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
+    int pl = left->color[0];
+    int pcb= left->color[1];
+    int pcr= left->color[2];
+    int pmx, pmy;
+    int mx=0, my=0;
+    int l,cr,cb;
+    const int stride= s->current_picture->linesize[0];
+    const int uvstride= s->current_picture->linesize[1];
+    uint8_t *current_data[3]= { s->input_picture->data[0] + (x + y*  stride)*block_w,
+                                s->input_picture->data[1] + ((x*block_w)>>s->chroma_h_shift) + ((y*uvstride*block_w)>>s->chroma_v_shift),
+                                s->input_picture->data[2] + ((x*block_w)>>s->chroma_h_shift) + ((y*uvstride*block_w)>>s->chroma_v_shift)};
+    int P[10][2];
+    int16_t last_mv[3][2];
+    int qpel= !!(s->avctx->flags & AV_CODEC_FLAG_QPEL); //unused
+    const int shift= 1+qpel;
+    MotionEstContext *c= &s->m.me;
+    int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
+    int mx_context= av_log2(2*FFABS(left->mx - top->mx));
+    int my_context= av_log2(2*FFABS(left->my - top->my));
+    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
+    int ref, best_ref, ref_score, ref_mx, ref_my;
+
+    av_assert0(sizeof(s->block_state) >= 256);
+    if(s->keyframe){
+        set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
+        return 0;
+    }
+
+//    clip predictors / edge ?
+
+    P_LEFT[0]= left->mx;
+    P_LEFT[1]= left->my;
+    P_TOP [0]= top->mx;
+    P_TOP [1]= top->my;
+    P_TOPRIGHT[0]= tr->mx;
+    P_TOPRIGHT[1]= tr->my;
+
+    last_mv[0][0]= s->block[index].mx;
+    last_mv[0][1]= s->block[index].my;
+    last_mv[1][0]= right->mx;
+    last_mv[1][1]= right->my;
+    last_mv[2][0]= bottom->mx;
+    last_mv[2][1]= bottom->my;
+
+    s->m.mb_stride=2;
+    s->m.mb_x=
+    s->m.mb_y= 0;
+    c->skip= 0;
+
+    av_assert1(c->  stride ==   stride);
+    av_assert1(c->uvstride == uvstride);
+
+    c->penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_cmp);
+    c->sub_penalty_factor= get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_sub_cmp);
+    c->mb_penalty_factor = get_penalty_factor(s->lambda, s->lambda2, c->avctx->mb_cmp);
+    c->current_mv_penalty= c->mv_penalty[s->m.f_code=1] + MAX_DMV;
+
+    c->xmin = - x*block_w - 16+3;
+    c->ymin = - y*block_w - 16+3;
+    c->xmax = - (x+1)*block_w + (w<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-3;
+    c->ymax = - (y+1)*block_w + (h<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-3;
+
+    if(P_LEFT[0]     > (c->xmax<<shift)) P_LEFT[0]    = (c->xmax<<shift);
+    if(P_LEFT[1]     > (c->ymax<<shift)) P_LEFT[1]    = (c->ymax<<shift);
+    if(P_TOP[0]      > (c->xmax<<shift)) P_TOP[0]     = (c->xmax<<shift);
+    if(P_TOP[1]      > (c->ymax<<shift)) P_TOP[1]     = (c->ymax<<shift);
+    if(P_TOPRIGHT[0] < (c->xmin<<shift)) P_TOPRIGHT[0]= (c->xmin<<shift);
+    if(P_TOPRIGHT[0] > (c->xmax<<shift)) P_TOPRIGHT[0]= (c->xmax<<shift); //due to pmx no clip
+    if(P_TOPRIGHT[1] > (c->ymax<<shift)) P_TOPRIGHT[1]= (c->ymax<<shift);
+
+    P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
+    P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
+
+    if (!y) {
+        c->pred_x= P_LEFT[0];
+        c->pred_y= P_LEFT[1];
+    } else {
+        c->pred_x = P_MEDIAN[0];
+        c->pred_y = P_MEDIAN[1];
+    }
+
+    score= INT_MAX;
+    best_ref= 0;
+    for(ref=0; ref<s->ref_frames; ref++){
+        init_ref(c, current_data, s->last_picture[ref]->data, NULL, block_w*x, block_w*y, 0);
+
+        ref_score= ff_epzs_motion_search(&s->m, &ref_mx, &ref_my, P, 0, /*ref_index*/ 0, last_mv,
+                                         (1<<16)>>shift, level-LOG2_MB_SIZE+4, block_w);
+
+        av_assert2(ref_mx >= c->xmin);
+        av_assert2(ref_mx <= c->xmax);
+        av_assert2(ref_my >= c->ymin);
+        av_assert2(ref_my <= c->ymax);
+
+        ref_score= c->sub_motion_search(&s->m, &ref_mx, &ref_my, ref_score, 0, 0, level-LOG2_MB_SIZE+4, block_w);
+        ref_score= ff_get_mb_score(&s->m, ref_mx, ref_my, 0, 0, level-LOG2_MB_SIZE+4, block_w, 0);
+        ref_score+= 2*av_log2(2*ref)*c->penalty_factor;
+        if(s->ref_mvs[ref]){
+            s->ref_mvs[ref][index][0]= ref_mx;
+            s->ref_mvs[ref][index][1]= ref_my;
+            s->ref_scores[ref][index]= ref_score;
+        }
+        if(score > ref_score){
+            score= ref_score;
+            best_ref= ref;
+            mx= ref_mx;
+            my= ref_my;
+        }
+    }
+    //FIXME if mb_cmp != SSE then intra cannot be compared currently and mb_penalty vs. lambda2
+
+  //  subpel search
+    base_bits= get_rac_count(&s->c) - 8*(s->c.bytestream - s->c.bytestream_start);
+    pc= s->c;
+    pc.bytestream_start=
+    pc.bytestream= p_buffer; //FIXME end/start? and at the other stoo
+    memcpy(p_state, s->block_state, sizeof(s->block_state));
+
+    if(level!=s->block_max_depth)
+        put_rac(&pc, &p_state[4 + s_context], 1);
+    put_rac(&pc, &p_state[1 + left->type + top->type], 0);
+    if(s->ref_frames > 1)
+        put_symbol(&pc, &p_state[128 + 1024 + 32*ref_context], best_ref, 0);
+    pred_mv(s, &pmx, &pmy, best_ref, left, top, tr);
+    put_symbol(&pc, &p_state[128 + 32*(mx_context + 16*!!best_ref)], mx - pmx, 1);
+    put_symbol(&pc, &p_state[128 + 32*(my_context + 16*!!best_ref)], my - pmy, 1);
+    p_len= pc.bytestream - pc.bytestream_start;
+    score += (s->lambda2*(get_rac_count(&pc)-base_bits))>>FF_LAMBDA_SHIFT;
+
+    block_s= block_w*block_w;
+    sum = pix_sum(current_data[0], stride, block_w, block_w);
+    l= (sum + block_s/2)/block_s;
+    iscore = pix_norm1(current_data[0], stride, block_w) - 2*l*sum + l*l*block_s;
+
+    if (s->nb_planes > 2) {
+        block_s= block_w*block_w>>(s->chroma_h_shift + s->chroma_v_shift);
+        sum = pix_sum(current_data[1], uvstride, block_w>>s->chroma_h_shift, block_w>>s->chroma_v_shift);
+        cb= (sum + block_s/2)/block_s;
+    //    iscore += pix_norm1(&current_mb[1][0], uvstride, block_w>>1) - 2*cb*sum + cb*cb*block_s;
+        sum = pix_sum(current_data[2], uvstride, block_w>>s->chroma_h_shift, block_w>>s->chroma_v_shift);
+        cr= (sum + block_s/2)/block_s;
+    //    iscore += pix_norm1(&current_mb[2][0], uvstride, block_w>>1) - 2*cr*sum + cr*cr*block_s;
+    }else
+        cb = cr = 0;
+
+    ic= s->c;
+    ic.bytestream_start=
+    ic.bytestream= i_buffer; //FIXME end/start? and at the other stoo
+    memcpy(i_state, s->block_state, sizeof(s->block_state));
+    if(level!=s->block_max_depth)
+        put_rac(&ic, &i_state[4 + s_context], 1);
+    put_rac(&ic, &i_state[1 + left->type + top->type], 1);
+    put_symbol(&ic, &i_state[32],  l-pl , 1);
+    if (s->nb_planes > 2) {
+        put_symbol(&ic, &i_state[64], cb-pcb, 1);
+        put_symbol(&ic, &i_state[96], cr-pcr, 1);
+    }
+    i_len= ic.bytestream - ic.bytestream_start;
+    iscore += (s->lambda2*(get_rac_count(&ic)-base_bits))>>FF_LAMBDA_SHIFT;
+
+    av_assert1(iscore < 255*255*256 + s->lambda2*10);
+    av_assert1(iscore >= 0);
+    av_assert1(l>=0 && l<=255);
+    av_assert1(pl>=0 && pl<=255);
+
+    if(level==0){
+        int varc= iscore >> 8;
+        int vard= score >> 8;
+        if (vard <= 64 || vard < varc)
+            c->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
+        else
+            c->scene_change_score+= s->m.qscale;
+    }
+
+    if(level!=s->block_max_depth){
+        put_rac(&s->c, &s->block_state[4 + s_context], 0);
+        score2 = encode_q_branch(s, level+1, 2*x+0, 2*y+0);
+        score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+0);
+        score2+= encode_q_branch(s, level+1, 2*x+0, 2*y+1);
+        score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+1);
+        score2+= s->lambda2>>FF_LAMBDA_SHIFT; //FIXME exact split overhead
+
+        if(score2 < score && score2 < iscore)
+            return score2;
+    }
+
+    if(iscore < score){
+        pred_mv(s, &pmx, &pmy, 0, left, top, tr);
+        memcpy(pbbak, i_buffer, i_len);
+        s->c= ic;
+        s->c.bytestream_start= pbbak_start;
+        s->c.bytestream= pbbak + i_len;
+        set_blocks(s, level, x, y, l, cb, cr, pmx, pmy, 0, BLOCK_INTRA);
+        memcpy(s->block_state, i_state, sizeof(s->block_state));
+        return iscore;
+    }else{
+        memcpy(pbbak, p_buffer, p_len);
+        s->c= pc;
+        s->c.bytestream_start= pbbak_start;
+        s->c.bytestream= pbbak + p_len;
+        set_blocks(s, level, x, y, pl, pcb, pcr, mx, my, best_ref, 0);
+        memcpy(s->block_state, p_state, sizeof(s->block_state));
+        return score;
+    }
+}
+
+static void encode_q_branch2(SnowContext *s, int level, int x, int y){ // writes the stored block-tree node (x,y) of depth 'level' to s->c, recursing into its four children when they differ
+    const int w= s->b_width  << s->block_max_depth; // used below as the row stride of s->block[]
+    const int rem_depth= s->block_max_depth - level; // quadtree levels remaining below this node
+    const int index= (x + y*w) << rem_depth;
+    int trx= (x+1)<<rem_depth;
+    BlockNode *b= &s->block[index];
+    const BlockNode *left  = x ? &s->block[index-1] : &null_block; // no left/top neighbour in the first column/row: use null_block
+    const BlockNode *top   = y ? &s->block[index-w] : &null_block;
+    const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
+    const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
+    int pl = left->color[0]; // the left neighbour's colour is the predictor for the coded colour deltas
+    int pcb= left->color[1];
+    int pcr= left->color[2];
+    int pmx, pmy;
+    int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
+    int mx_context= av_log2(2*FFABS(left->mx - top->mx)) + 16*!!b->ref;
+    int my_context= av_log2(2*FFABS(left->my - top->my)) + 16*!!b->ref;
+    int s_context= 2*left->level + 2*top->level + tl->level + tr->level; // selects the context of the split flag written below
+
+    if(s->keyframe){ // keyframe: nothing is written, the node is only filled in as intra with the left neighbour's colour
+        set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
+        return;
+    }
+
+    if(level!=s->block_max_depth){
+        if(same_block(b,b+1) && same_block(b,b+w) && same_block(b,b+w+1)){
+            put_rac(&s->c, &s->block_state[4 + s_context], 1); // flag 1: b, b+1, b+w and b+w+1 compare equal, code one node below
+        }else{
+            put_rac(&s->c, &s->block_state[4 + s_context], 0); // flag 0: code the four children instead and stop here
+            encode_q_branch2(s, level+1, 2*x+0, 2*y+0);
+            encode_q_branch2(s, level+1, 2*x+1, 2*y+0);
+            encode_q_branch2(s, level+1, 2*x+0, 2*y+1);
+            encode_q_branch2(s, level+1, 2*x+1, 2*y+1);
+            return;
+        }
+    }
+    if(b->type & BLOCK_INTRA){ // intra: type flag 1, then colour deltas against the left neighbour
+        pred_mv(s, &pmx, &pmy, 0, left, top, tr);
+        put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 1);
+        put_symbol(&s->c, &s->block_state[32], b->color[0]-pl , 1);
+        if (s->nb_planes > 2) { // the two chroma deltas are only written when there are more than 2 planes
+            put_symbol(&s->c, &s->block_state[64], b->color[1]-pcb, 1);
+            put_symbol(&s->c, &s->block_state[96], b->color[2]-pcr, 1);
+        }
+        set_blocks(s, level, x, y, b->color[0], b->color[1], b->color[2], pmx, pmy, 0, BLOCK_INTRA);
+    }else{ // inter: type flag 0, optional reference index, then the motion vector as a delta against pred_mv()
+        pred_mv(s, &pmx, &pmy, b->ref, left, top, tr);
+        put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 0);
+        if(s->ref_frames > 1) // the reference index is only written when more than one reference frame is in use
+            put_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], b->ref, 0);
+        put_symbol(&s->c, &s->block_state[128 + 32*mx_context], b->mx - pmx, 1);
+        put_symbol(&s->c, &s->block_state[128 + 32*my_context], b->my - pmy, 1);
+        set_blocks(s, level, x, y, pl, pcb, pcr, b->mx, b->my, b->ref, 0);
+    }
+}
+
+static int get_dc(SnowContext *s, int mb_x, int mb_y, int plane_index){ // returns a 0..255 colour for block (mb_x,mb_y) of plane plane_index, derived from the input picture and the accumulated prediction
+    int i, x2, y2;
+    Plane *p= &s->plane[plane_index];
+    const int block_size = MB_SIZE >> s->block_max_depth;
+    const int block_w    = plane_index ? block_size>>s->chroma_h_shift : block_size;
+    const int block_h    = plane_index ? block_size>>s->chroma_v_shift : block_size;
+    const uint8_t *obmc  = plane_index ? ff_obmc_tab[s->block_max_depth+s->chroma_h_shift] : ff_obmc_tab[s->block_max_depth];
+    const int obmc_stride= plane_index ? (2*block_size)>>s->chroma_h_shift : 2*block_size;
+    const int ref_stride= s->current_picture->linesize[plane_index];
+    uint8_t *src= s-> input_picture->data[plane_index];
+    IDWTELEM *dst= (IDWTELEM*)s->m.sc.obmc_scratchpad + plane_index*block_size*block_size*4; //FIXME change to unsigned
+    const int b_stride = s->b_width << s->block_max_depth;
+    const int w= p->width;
+    const int h= p->height;
+    int index= mb_x + mb_y*b_stride;
+    BlockNode *b= &s->block[index];
+    BlockNode backup= *b; // the block is modified temporarily and restored before returning
+    int ab=0; // sum of (input - prediction) * weight over the covered pixels
+    int aa=0; // sum of weight^2 over the same pixels
+
+    av_assert2(s->chroma_h_shift == s->chroma_v_shift); //obmc stuff above
+    // Temporarily make the block intra with colour 0 in this plane and clear the scratch window before accumulating into it.
+    b->type|= BLOCK_INTRA;
+    b->color[plane_index]= 0;
+    memset(dst, 0, obmc_stride*obmc_stride*sizeof(IDWTELEM));
+
+    for(i=0; i<4; i++){ // the four block-sized quadrants of the 2x2-block window around this block
+        int mb_x2= mb_x + (i &1) - 1;
+        int mb_y2= mb_y + (i>>1) - 1;
+        int x= block_w*mb_x2 + block_w/2;
+        int y= block_h*mb_y2 + block_h/2;
+
+        add_yblock(s, 0, NULL, dst + (i&1)*block_w + (i>>1)*obmc_stride*block_h, NULL, obmc,
+                    x, y, block_w, block_h, w, h, obmc_stride, ref_stride, obmc_stride, mb_x2, mb_y2, 0, 0, plane_index);
+        // only pixels of the quadrant that lie inside the plane are visited
+        for(y2= FFMAX(y, 0); y2<FFMIN(h, y+block_h); y2++){
+            for(x2= FFMAX(x, 0); x2<FFMIN(w, x+block_w); x2++){
+                int index= x2-(block_w*mb_x - block_w/2) + (y2-(block_h*mb_y - block_h/2))*obmc_stride; // offset inside the window (shadows the outer index)
+                int obmc_v= obmc[index];
+                int d;
+                if(y<0) obmc_v += obmc[index + block_h*obmc_stride]; // quadrant crosses a picture edge: add the weight one block further in that direction
+                if(x<0) obmc_v += obmc[index + block_w];
+                if(y+block_h>h) obmc_v += obmc[index - block_h*obmc_stride];
+                if(x+block_w>w) obmc_v += obmc[index - block_w];
+                //FIXME precalculate this or simplify it somehow else
+
+                d = -dst[index] + (1<<(FRAC_BITS-1)); // negate the accumulated value and add the rounding offset
+                dst[index] = d; // left in the scratchpad; get_block_rd() addresses the same region as 'pred'
+                ab += (src[x2 + y2*ref_stride] - (d>>FRAC_BITS)) * obmc_v;
+                aa += obmc_v * obmc_v; //FIXME precalculate this
+            }
+        }
+    }
+    *b= backup;
+    // ab may be negative, and left-shifting a negative int is undefined behaviour, so scale by multiplication instead
+    return av_clip_uint8( ROUNDED_DIV(ab * (1<<LOG2_OBMC_MAX), aa) ); //FIXME we should not need clipping
+}
+
+static inline int get_block_bits(SnowContext *s, int x, int y, int w){
+    /* Rough estimate of the bits needed for block (x,y); w is the block's
+     * width in block units and only selects the top-right neighbour. */
+    const int stride = s->b_width  << s->block_max_depth;
+    const int rows   = s->b_height << s->block_max_depth;
+    const int pos    = x + y*stride;
+    const BlockNode *cur     = &s->block[pos];
+    const BlockNode *west    = x ? &s->block[pos-1] : &null_block;
+    const BlockNode *north   = y ? &s->block[pos-stride] : &null_block;
+    const BlockNode *north_w = y && x ? &s->block[pos-stride-1] : west;
+    const BlockNode *north_e = y && x+w<stride ? &s->block[pos-stride+w] : north_w;
+    int pred_x, pred_y, cost;
+    if(!(x>=0 && x<stride && y<rows))
+        return 0; /* positions outside the block grid cost nothing */
+/*
+1            0      0
+01X          1-2    1
+001XX        3-6    2-3
+0001XXX      7-14   4-7
+00001XXXX   15-30   8-15
+*/
+//FIXME try accurate rate
+//FIXME intra and inter predictors if surrounding blocks are not the same type
+    if(!(cur->type & BLOCK_INTRA)){
+        /* inter: motion vector residual against pred_mv() plus the reference index */
+        pred_mv(s, &pred_x, &pred_y, cur->ref, west, north, north_e);
+        cost  = av_log2(2*FFABS(pred_x - cur->mx));
+        cost += av_log2(2*FFABS(pred_y - cur->my));
+        cost += av_log2(2*cur->ref);
+        return 2*(1 + cost); //FIXME kill the 2* can be merged in lambda
+    }
+    /* intra: the three colour residuals against the left neighbour */
+    cost  = av_log2(2*FFABS(west->color[0] - cur->color[0]));
+    cost += av_log2(2*FFABS(west->color[1] - cur->color[1]));
+    cost += av_log2(2*FFABS(west->color[2] - cur->color[2]));
+    return 3 + 2*cost;
+}
+
+static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, uint8_t (*obmc_edged)[MB_SIZE * 2]){
+    Plane *p= &s->plane[plane_index];
+    const int block_size = MB_SIZE >> s->block_max_depth;
+    const int block_w    = plane_index ? block_size>>s->chroma_h_shift : block_size;
+    const int block_h    = plane_index ? block_size>>s->chroma_v_shift : block_size;
+    const int obmc_stride= plane_index ? (2*block_size)>>s->chroma_h_shift : 2*block_size;
+    const int ref_stride= s->current_picture->linesize[plane_index];
+    uint8_t *dst= s->current_picture->data[plane_index];
+    uint8_t *src= s->  input_picture->data[plane_index];
+    IDWTELEM *pred= (IDWTELEM*)s->m.sc.obmc_scratchpad + plane_index*block_size*block_size*4; // same scratchpad region that get_dc() fills
+    uint8_t *cur = s->scratchbuf;
+    uint8_t *tmp = s->emu_edge_buffer;
+    const int b_stride = s->b_width << s->block_max_depth;
+    const int b_height = s->b_height<< s->block_max_depth;
+    const int w= p->width;
+    const int h= p->height;
+    int distortion;
+    int rate= 0;
+    const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
+    int sx= block_w*mb_x - block_w/2; // top-left corner of the 2*block_w x 2*block_h window centred on the block
+    int sy= block_h*mb_y - block_h/2;
+    int x0= FFMAX(0,-sx); // [x0,x1) x [y0,y1): part of the window that lies inside the plane
+    int y0= FFMAX(0,-sy);
+    int x1= FFMIN(block_w*2, w-sx);
+    int y1= FFMIN(block_h*2, h-sy);
+    int i,x,y;
+    // Score of block (mb_x,mb_y) in one plane: renders its weighted prediction into current_picture, compares that with input_picture and, for plane 0, adds an estimated bit cost. Returns distortion + rate*penalty_factor.
+    av_assert2(s->chroma_h_shift == s->chroma_v_shift); //obmc and square assumptions below, checking only block_w
+
+    ff_snow_pred_block(s, cur, tmp, ref_stride, sx, sy, block_w*2, block_h*2, &s->block[mb_x + mb_y*b_stride], plane_index, w, h);
+    // blend: this block's prediction (cur) weighted by obmc_edged, plus the value already in the scratchpad (pred), written to dst
+    for(y=y0; y<y1; y++){
+        const uint8_t *obmc1= obmc_edged[y];
+        const IDWTELEM *pred1 = pred + y*obmc_stride;
+        uint8_t *cur1 = cur + y*ref_stride;
+        uint8_t *dst1 = dst + sx + (sy+y)*ref_stride;
+        for(x=x0; x<x1; x++){
+#if FRAC_BITS >= LOG2_OBMC_MAX
+            int v = (cur1[x] * obmc1[x]) << (FRAC_BITS - LOG2_OBMC_MAX);
+#else
+            int v = (cur1[x] * obmc1[x] + (1<<(LOG2_OBMC_MAX - FRAC_BITS-1))) >> (LOG2_OBMC_MAX - FRAC_BITS);
+#endif
+            v = (v + pred1[x]) >> FRAC_BITS;
+            if(v&(~255)) v= ~(v>>31); // clip to 0..255: negative gives 0, too large gives all ones (255 once stored as uint8_t)
+            dst1[x] = v;
+        }
+    }
+
+    /* copy the regions where obmc[] = (uint8_t)256 */
+    if(LOG2_OBMC_MAX == 8
+        && (mb_x == 0 || mb_x == b_stride-1)
+        && (mb_y == 0 || mb_y == b_height-1)){ // only the four corner blocks of the block grid
+        if(mb_x == 0)
+            x1 = block_w;
+        else
+            x0 = block_w;
+        if(mb_y == 0)
+            y1 = block_h;
+        else
+            y0 = block_h;
+        for(y=y0; y<y1; y++)
+            memcpy(dst + sx+x0 + (sy+y)*ref_stride, cur + x0 + y*ref_stride, x1-x0);
+    }
+
+    if(block_w==16){ // 32x32 window: wavelet compare if selected, otherwise four 16x16 compares
+        /* FIXME rearrange dsputil to fit 32x32 cmp functions */
+        /* FIXME check alignment of the cmp wavelet vs the encoding wavelet */
+        /* FIXME cmps overlap but do not cover the wavelet's whole support.
+         * So improving the score of one block is not strictly guaranteed
+         * to improve the score of the whole frame, thus iterative motion
+         * estimation does not always converge. */
+        if(s->avctx->me_cmp == FF_CMP_W97)
+            distortion = ff_w97_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
+        else if(s->avctx->me_cmp == FF_CMP_W53)
+            distortion = ff_w53_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
+        else{
+            distortion = 0;
+            for(i=0; i<4; i++){
+                int off = sx+16*(i&1) + (sy+16*(i>>1))*ref_stride;
+                distortion += s->mecc.me_cmp[0](&s->m, src + off, dst + off, ref_stride, 16);
+            }
+        }
+    }else{
+        av_assert2(block_w==8);
+        distortion = s->mecc.me_cmp[0](&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, block_w*2);
+    }
+
+    if(plane_index==0){ // the bit-cost estimate is only added for plane 0
+        for(i=0; i<4; i++){
+/* ..RRr
+ * .RXx.
+ * rxx..
+ */
+            rate += get_block_bits(s, mb_x + (i&1) - (i>>1), mb_y + (i>>1), 1);
+        }
+        if(mb_x == b_stride-2)
+            rate += get_block_bits(s, mb_x + 1, mb_y + 1, 1);
+    }
+    return distortion + rate*penalty_factor;
+}
+
+static int get_4block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index){ // score (distortion + rate*penalty_factor) of the 2x2 block group whose top-left block is (mb_x,mb_y)
+    int i, y2;
+    Plane *p= &s->plane[plane_index];
+    const int block_size = MB_SIZE >> s->block_max_depth;
+    const int block_w    = plane_index ? block_size>>s->chroma_h_shift : block_size;
+    const int block_h    = plane_index ? block_size>>s->chroma_v_shift : block_size;
+    const uint8_t *obmc  = plane_index ? ff_obmc_tab[s->block_max_depth+s->chroma_h_shift] : ff_obmc_tab[s->block_max_depth];
+    const int obmc_stride= plane_index ? (2*block_size)>>s->chroma_h_shift : 2*block_size;
+    const int ref_stride= s->current_picture->linesize[plane_index];
+    uint8_t *dst= s->current_picture->data[plane_index];
+    uint8_t *src= s-> input_picture->data[plane_index];
+    //FIXME zero_dst is const but add_yblock changes dst if add is 0 (this is never the case for dst=zero_dst)
+    // const has only been removed from zero_dst to suppress a warning
+    static IDWTELEM zero_dst[4096]; //FIXME
+    const int b_stride = s->b_width << s->block_max_depth;
+    const int w= p->width;
+    const int h= p->height;
+    int distortion= 0;
+    int rate= 0;
+    const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
+
+    av_assert2(s->chroma_h_shift == s->chroma_v_shift); //obmc and square assumptions below
+
+    for(i=0; i<9; i++){ // 3x3 block-sized cells starting half a block before (mb_x,mb_y)
+        int mb_x2= mb_x + (i%3) - 1;
+        int mb_y2= mb_y + (i/3) - 1;
+        int x= block_w*mb_x2 + block_w/2;
+        int y= block_h*mb_y2 + block_h/2;
+
+        add_yblock(s, 0, NULL, zero_dst, dst, obmc,
+                   x, y, block_w, block_h, w, h, /*dst_stride*/0, ref_stride, obmc_stride, mb_x2, mb_y2, 1, 1, plane_index);
+
+        //FIXME find a cleaner/simpler way to skip the outside stuff
+        for(y2= y; y2<0; y2++) // parts of the cell outside the plane get the input copied into dst (presumably so they add no distortion)
+            memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
+        for(y2= h; y2<y+block_h; y2++)
+            memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
+        if(x<0){
+            for(y2= y; y2<y+block_h; y2++)
+                memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, -x);
+        }
+        if(x+block_w > w){
+            for(y2= y; y2<y+block_h; y2++)
+                memcpy(dst + w + y2*ref_stride, src + w + y2*ref_stride, x+block_w - w);
+        }
+
+        av_assert1(block_w== 8 || block_w==16);
+        distortion += s->mecc.me_cmp[block_w==8](&s->m, src + x + y*ref_stride, dst + x + y*ref_stride, ref_stride, block_h);
+    }
+
+    if(plane_index==0){ // the bit-cost estimate is only added for plane 0
+        BlockNode *b= &s->block[mb_x+mb_y*b_stride];
+        int merged= same_block(b,b+1) && same_block(b,b+b_stride) && same_block(b,b+b_stride+1);
+
+/* ..RRRr
+ * .RXXx.
+ * .RXXx.
+ * rxxx.
+ */
+        if(merged) // four equal blocks are costed once as a block of width 2; their entries dxy[0..3] are then skipped
+            rate = get_block_bits(s, mb_x, mb_y, 2);
+        for(i=merged?4:0; i<9; i++){
+            static const int dxy[9][2] = {{0,0},{1,0},{0,1},{1,1},{2,0},{2,1},{-1,2},{0,2},{1,2}};
+            rate += get_block_bits(s, mb_x + dxy[i][0], mb_y + dxy[i][1], 1);
+        }
+    }
+    return distortion + rate*penalty_factor;
+}
+
+static int encode_subband_c0run(SnowContext *s, SubBand *b, const IDWTELEM *src, const IDWTELEM *parent, int stride, int orientation){ // codes the w x h coefficients at src into s->c; returns 0, or -1 if the output buffer is nearly full
+    const int w= b->width;
+    const int h= b->height;
+    int x, y;
+
+    if(1){
+        int run=0;
+        int *runs = s->run_buffer;
+        int run_index=0;
+        int max_index;
+        // Pass 1: collect the lengths of zero runs among coefficients whose neighbours (l, lt, t, rt) and parent sample are all zero.
+        for(y=0; y<h; y++){
+            for(x=0; x<w; x++){
+                int v, p=0;
+                int /*ll=0, */l=0, lt=0, t=0, rt=0;
+                v= src[x + y*stride];
+
+                if(y){
+                    t= src[x + (y-1)*stride];
+                    if(x){
+                        lt= src[x - 1 + (y-1)*stride];
+                    }
+                    if(x + 1 < w){
+                        rt= src[x + 1 + (y-1)*stride];
+                    }
+                }
+                if(x){
+                    l= src[x - 1 + y*stride];
+                    /*if(x > 1){
+                        if(orientation==1) ll= src[y + (x-2)*stride];
+                        else               ll= src[x - 2 + y*stride];
+                    }*/
+                }
+                if(parent){
+                    int px= x>>1;
+                    int py= y>>1;
+                    if(px<b->parent->width && py<b->parent->height)
+                        p= parent[px + py*2*stride]; // parent rows are addressed with twice this band's stride
+                }
+                if(!(/*ll|*/l|lt|t|rt|p)){
+                    if(v){
+                        runs[run_index++]= run; // a nonzero coefficient ends the current run
+                        run=0;
+                    }else{
+                        run++;
+                    }
+                }
+            }
+        }
+        max_index= run_index; // number of runs that were ended by a nonzero coefficient
+        runs[run_index++]= run; // trailing run; stored, but the run_index <= max_index tests below never write it
+        run_index=0;
+        run= runs[run_index++];
+
+        put_symbol2(&s->c, b->state[30], max_index, 0);
+        if(run_index <= max_index)
+            put_symbol2(&s->c, b->state[1], run, 3);
+        // Pass 2: same scan; emit a significance flag or consume the run, then magnitude and sign of nonzero values.
+        for(y=0; y<h; y++){
+            if(s->c.bytestream_end - s->c.bytestream < w*40){ // checked once per row: require w*40 bytes of output space
+                av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
+                return -1;
+            }
+            for(x=0; x<w; x++){
+                int v, p=0;
+                int /*ll=0, */l=0, lt=0, t=0, rt=0;
+                v= src[x + y*stride];
+
+                if(y){
+                    t= src[x + (y-1)*stride];
+                    if(x){
+                        lt= src[x - 1 + (y-1)*stride];
+                    }
+                    if(x + 1 < w){
+                        rt= src[x + 1 + (y-1)*stride];
+                    }
+                }
+                if(x){
+                    l= src[x - 1 + y*stride];
+                    /*if(x > 1){
+                        if(orientation==1) ll= src[y + (x-2)*stride];
+                        else               ll= src[x - 2 + y*stride];
+                    }*/
+                }
+                if(parent){
+                    int px= x>>1;
+                    int py= y>>1;
+                    if(px<b->parent->width && py<b->parent->height)
+                        p= parent[px + py*2*stride];
+                }
+                if(/*ll|*/l|lt|t|rt|p){ // some neighbour is nonzero: write an explicit zero/nonzero flag
+                    int context= av_log2(/*FFABS(ll) + */3*FFABS(l) + FFABS(lt) + 2*FFABS(t) + FFABS(rt) + FFABS(p));
+
+                    put_rac(&s->c, &b->state[0][context], !!v);
+                }else{ // all-zero neighbourhood: zero/nonzero is implied by the run lengths from pass 1
+                    if(!run){
+                        run= runs[run_index++];
+
+                        if(run_index <= max_index)
+                            put_symbol2(&s->c, b->state[1], run, 3);
+                        av_assert2(v);
+                    }else{
+                        run--;
+                        av_assert2(!v);
+                    }
+                }
+                if(v){ // nonzero value: magnitude minus one, then the sign with a context built from l and t
+                    int context= av_log2(/*FFABS(ll) + */3*FFABS(l) + FFABS(lt) + 2*FFABS(t) + FFABS(rt) + FFABS(p));
+                    int l2= 2*FFABS(l) + (l<0);
+                    int t2= 2*FFABS(t) + (t<0);
+
+                    put_symbol2(&s->c, b->state[context + 2], FFABS(v)-1, context-4);
+                    put_rac(&s->c, &b->state[0][16 + 1 + 3 + ff_quant3bA[l2&0xFF] + 3*ff_quant3bA[t2&0xFF]], v<0);
+                }
+            }
+        }
+    }
+    return 0;
+}
+
+static int encode_subband(SnowContext *s, SubBand *b, const IDWTELEM *src, const IDWTELEM *parent, int stride, int orientation){
+    /* Single entry point for subband coefficient coding: forwards every
+     * argument to the c0run coder and hands its result back unchanged. */
+    const int status = encode_subband_c0run(s, b, src, parent, stride, orientation);
+    return status;
+}
+
+static av_always_inline int check_block(SnowContext *s, int mb_x, int mb_y, int p[3], int intra, uint8_t (*obmc_edged)[MB_SIZE * 2], int *best_rd){
+    const int b_stride= s->b_width << s->block_max_depth;
+    BlockNode *block= &s->block[mb_x + mb_y * b_stride];
+    BlockNode backup= *block; // restored below if the candidate is rejected
+    unsigned value;
+    int rd, index;
+    // Tries one candidate for block (mb_x,mb_y): p[0..2] are the colours if intra, otherwise p[0],p[1] is the motion vector. Returns 1 (candidate kept, *best_rd updated) or 0 (block left unchanged).
+    av_assert2(mb_x>=0 && mb_y>=0);
+    av_assert2(mb_x<b_stride);
+
+    if(intra){
+        block->color[0] = p[0];
+        block->color[1] = p[1];
+        block->color[2] = p[2];
+        block->type |= BLOCK_INTRA;
+    }else{
+        index= (p[0] + 31*p[1]) & (ME_CACHE_SIZE-1);
+        value= s->me_cache_generation + (p[0]>>10) + p[1]*(1<<6) + (block->ref<<12); // multiply, not <<: p[1] may be negative and shifting a negative int left is undefined
+        if(s->me_cache[index] == value) // this vector was already tried in the current cache generation
+            return 0;
+        s->me_cache[index]= value;
+
+        block->mx= p[0];
+        block->my= p[1];
+        block->type &= ~BLOCK_INTRA;
+    }
+
+    rd= get_block_rd(s, mb_x, mb_y, 0, obmc_edged) + s->intra_penalty * !!intra; // plane 0 only; intra candidates pay s->intra_penalty on top
+
+//FIXME chroma
+    if(rd < *best_rd){
+        *best_rd= rd;
+        return 1;
+    }else{
+        *block= backup;
+        return 0;
+    }
+}
+
+/* Inter-only front end to check_block(): takes the vector as two scalars and packs it into a temporary array (gcc 2.95 compilation workaround). */
+static av_always_inline int check_block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, uint8_t (*obmc_edged)[MB_SIZE * 2], int *best_rd){
+    int mv[2];
+    mv[0] = p0; mv[1] = p1;
+    return check_block(s, mb_x, mb_y, mv, /*intra*/ 0, obmc_edged, best_rd);
+}
+
+static av_always_inline int check_4block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, int ref, int *best_rd){ // tries vector (p0,p1) with reference 'ref' on all four blocks of a 2x2 group; returns 1 if kept, 0 if the group is left unchanged
+    const int b_stride= s->b_width << s->block_max_depth;
+    BlockNode *block= &s->block[mb_x + mb_y * b_stride];
+    BlockNode backup[4];
+    unsigned value;
+    int rd, index;
+
+    /* We don't initialize backup[] during variable declaration, because
+     * that fails to compile on MSVC: "cannot convert from 'BlockNode' to
+     * 'int16_t'". */
+    backup[0] = block[0];
+    backup[1] = block[1];
+    backup[2] = block[b_stride];
+    backup[3] = block[b_stride + 1];
+
+    av_assert2(mb_x>=0 && mb_y>=0);
+    av_assert2(mb_x<b_stride);
+    av_assert2(((mb_x|mb_y)&1) == 0);
+
+    index= (p0 + 31*p1) & (ME_CACHE_SIZE-1);
+    value= s->me_cache_generation + (p0>>10) + p1*(1<<6) + (block->ref<<12); // multiply, not <<: p1 may be negative and shifting a negative int left is undefined
+    if(s->me_cache[index] == value) // this vector was already tried in the current cache generation
+        return 0;
+    s->me_cache[index]= value;
+
+    block->mx= p0;
+    block->my= p1;
+    block->ref= ref;
+    block->type &= ~BLOCK_INTRA;
+    block[1]= block[b_stride]= block[b_stride+1]= *block; // copy the candidate to the other three blocks of the group
+
+    rd= get_4block_rd(s, mb_x, mb_y, 0); // plane 0 only
+
+//FIXME chroma
+    if(rd < *best_rd){
+        *best_rd= rd;
+        return 1;
+    }else{
+        block[0]= backup[0];
+        block[1]= backup[1];
+        block[b_stride]= backup[2];
+        block[b_stride+1]= backup[3];
+        return 0;
+    }
+}
+
+static void iterative_me(SnowContext *s){
+    int pass, mb_x, mb_y;
+    const int b_width = s->b_width  << s->block_max_depth;
+    const int b_height= s->b_height << s->block_max_depth;
+    const int b_stride= b_width;
+    int color[3];
+
+    {
+        RangeCoder r = s->c;
+        uint8_t state[sizeof(s->block_state)];
+        memcpy(state, s->block_state, sizeof(s->block_state));
+        for(mb_y= 0; mb_y<s->b_height; mb_y++)
+            for(mb_x= 0; mb_x<s->b_width; mb_x++)
+                encode_q_branch(s, 0, mb_x, mb_y);
+        s->c = r;
+        memcpy(s->block_state, state, sizeof(s->block_state));
+    }
+
+    for(pass=0; pass<25; pass++){
+        int change= 0;
+
+        for(mb_y= 0; mb_y<b_height; mb_y++){
+            for(mb_x= 0; mb_x<b_width; mb_x++){
+                int dia_change, i, j, ref;
+                int best_rd= INT_MAX, ref_rd;
+                BlockNode backup, ref_b;
+                const int index= mb_x + mb_y * b_stride;
+                BlockNode *block= &s->block[index];
+                BlockNode *tb =                   mb_y            ? &s->block[index-b_stride  ] : NULL;
+                BlockNode *lb = mb_x                              ? &s->block[index         -1] : NULL;
+                BlockNode *rb = mb_x+1<b_width                    ? &s->block[index         +1] : NULL;
+                BlockNode *bb =                   mb_y+1<b_height ? &s->block[index+b_stride  ] : NULL;
+                BlockNode *tlb= mb_x           && mb_y            ? &s->block[index-b_stride-1] : NULL;
+                BlockNode *trb= mb_x+1<b_width && mb_y            ? &s->block[index-b_stride+1] : NULL;
+                BlockNode *blb= mb_x           && mb_y+1<b_height ? &s->block[index+b_stride-1] : NULL;
+                BlockNode *brb= mb_x+1<b_width && mb_y+1<b_height ? &s->block[index+b_stride+1] : NULL;
+                const int b_w= (MB_SIZE >> s->block_max_depth);
+                uint8_t obmc_edged[MB_SIZE * 2][MB_SIZE * 2];
+
+                if(pass && (block->type & BLOCK_OPT))
+                    continue;
+                block->type |= BLOCK_OPT;
+
+                backup= *block;
+
+                if(!s->me_cache_generation)
+                    memset(s->me_cache, 0, sizeof(s->me_cache));
+                s->me_cache_generation += 1<<22;
+
+                //FIXME precalculate
+                {
+                    int x, y;
+                    for (y = 0; y < b_w * 2; y++)
+                        memcpy(obmc_edged[y], ff_obmc_tab[s->block_max_depth] + y * b_w * 2, b_w * 2);
+                    if(mb_x==0)
+                        for(y=0; y<b_w*2; y++)
+                            memset(obmc_edged[y], obmc_edged[y][0] + obmc_edged[y][b_w-1], b_w);
+                    if(mb_x==b_stride-1)
+                        for(y=0; y<b_w*2; y++)
+                            memset(obmc_edged[y]+b_w, obmc_edged[y][b_w] + obmc_edged[y][b_w*2-1], b_w);
+                    if(mb_y==0){
+                        for(x=0; x<b_w*2; x++)
+                            obmc_edged[0][x] += obmc_edged[b_w-1][x];
+                        for(y=1; y<b_w; y++)
+                            memcpy(obmc_edged[y], obmc_edged[0], b_w*2);
+                    }
+                    if(mb_y==b_height-1){
+                        for(x=0; x<b_w*2; x++)
+                            obmc_edged[b_w*2-1][x] += obmc_edged[b_w][x];
+                        for(y=b_w; y<b_w*2-1; y++)
+                            memcpy(obmc_edged[y], obmc_edged[b_w*2-1], b_w*2);
+                    }
+                }
+
+                //skip stuff outside the picture
+                if(mb_x==0 || mb_y==0 || mb_x==b_width-1 || mb_y==b_height-1){
+                    uint8_t *src= s->  input_picture->data[0];
+                    uint8_t *dst= s->current_picture->data[0];
+                    const int stride= s->current_picture->linesize[0];
+                    const int block_w= MB_SIZE >> s->block_max_depth;
+                    const int block_h= MB_SIZE >> s->block_max_depth;
+                    const int sx= block_w*mb_x - block_w/2;
+                    const int sy= block_h*mb_y - block_h/2;
+                    const int w= s->plane[0].width;
+                    const int h= s->plane[0].height;
+                    int y;
+
+                    for(y=sy; y<0; y++)
+                        memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
+                    for(y=h; y<sy+block_h*2; y++)
+                        memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
+                    if(sx<0){
+                        for(y=sy; y<sy+block_h*2; y++)
+                            memcpy(dst + sx + y*stride, src + sx + y*stride, -sx);
+                    }
+                    if(sx+block_w*2 > w){
+                        for(y=sy; y<sy+block_h*2; y++)
+                            memcpy(dst + w + y*stride, src + w + y*stride, sx+block_w*2 - w);
+                    }
+                }
+
+                // intra(black) = neighbors' contribution to the current block
+                for(i=0; i < s->nb_planes; i++)
+                    color[i]= get_dc(s, mb_x, mb_y, i);
+
+                // get previous score (cannot be cached due to OBMC)
+                if(pass > 0 && (block->type&BLOCK_INTRA)){
+                    int color0[3]= {block->color[0], block->color[1], block->color[2]};
+                    check_block(s, mb_x, mb_y, color0, 1, obmc_edged, &best_rd);
+                }else
+                    check_block_inter(s, mb_x, mb_y, block->mx, block->my, obmc_edged, &best_rd);
+
+                ref_b= *block;
+                ref_rd= best_rd;
+                for(ref=0; ref < s->ref_frames; ref++){
+                    int16_t (*mvr)[2]= &s->ref_mvs[ref][index];
+                    if(s->ref_scores[ref][index] > s->ref_scores[ref_b.ref][index]*3/2) //FIXME tune threshold
+                        continue;
+                    block->ref= ref;
+                    best_rd= INT_MAX;
+
+                    check_block_inter(s, mb_x, mb_y, mvr[0][0], mvr[0][1], obmc_edged, &best_rd);
+                    check_block_inter(s, mb_x, mb_y, 0, 0, obmc_edged, &best_rd);
+                    if(tb)
+                        check_block_inter(s, mb_x, mb_y, mvr[-b_stride][0], mvr[-b_stride][1], obmc_edged, &best_rd);
+                    if(lb)
+                        check_block_inter(s, mb_x, mb_y, mvr[-1][0], mvr[-1][1], obmc_edged, &best_rd);
+                    if(rb)
+                        check_block_inter(s, mb_x, mb_y, mvr[1][0], mvr[1][1], obmc_edged, &best_rd);
+                    if(bb)
+                        check_block_inter(s, mb_x, mb_y, mvr[b_stride][0], mvr[b_stride][1], obmc_edged, &best_rd);
+
+                    /* fullpel ME */
+                    //FIXME avoid subpel interpolation / round to nearest integer
+                    do{
+                        int newx = block->mx;
+                        int newy = block->my;
+                        int dia_size = s->iterative_dia_size ? s->iterative_dia_size : FFMAX(s->avctx->dia_size, 1);
+                        dia_change=0;
+                        for(i=0; i < dia_size; i++){
+                            for(j=0; j<i; j++){
+                                dia_change |= check_block_inter(s, mb_x, mb_y, newx+4*(i-j), newy+(4*j), obmc_edged, &best_rd);
+                                dia_change |= check_block_inter(s, mb_x, mb_y, newx-4*(i-j), newy-(4*j), obmc_edged, &best_rd);
+                                dia_change |= check_block_inter(s, mb_x, mb_y, newx-(4*j), newy+4*(i-j), obmc_edged, &best_rd);
+                                dia_change |= check_block_inter(s, mb_x, mb_y, newx+(4*j), newy-4*(i-j), obmc_edged, &best_rd);
+                            }
+                        }
+                    }while(dia_change);
+                    /* subpel ME */
+                    do{
+                        static const int square[8][2]= {{+1, 0},{-1, 0},{ 0,+1},{ 0,-1},{+1,+1},{-1,-1},{+1,-1},{-1,+1},};
+                        dia_change=0;
+                        for(i=0; i<8; i++)
+                            dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+square[i][0], block->my+square[i][1], obmc_edged, &best_rd);
+                    }while(dia_change);
+                    //FIXME or try the standard 2 pass qpel or similar
+
+                    mvr[0][0]= block->mx;
+                    mvr[0][1]= block->my;
+                    if(ref_rd > best_rd){
+                        ref_rd= best_rd;
+                        ref_b= *block;
+                    }
+                }
+                best_rd= ref_rd;
+                *block= ref_b;
+                check_block(s, mb_x, mb_y, color, 1, obmc_edged, &best_rd);
+                //FIXME RD style color selection
+                if(!same_block(block, &backup)){
+                    if(tb ) tb ->type &= ~BLOCK_OPT;
+                    if(lb ) lb ->type &= ~BLOCK_OPT;
+                    if(rb ) rb ->type &= ~BLOCK_OPT;
+                    if(bb ) bb ->type &= ~BLOCK_OPT;
+                    if(tlb) tlb->type &= ~BLOCK_OPT;
+                    if(trb) trb->type &= ~BLOCK_OPT;
+                    if(blb) blb->type &= ~BLOCK_OPT;
+                    if(brb) brb->type &= ~BLOCK_OPT;
+                    change ++;
+                }
+            }
+        }
+        av_log(s->avctx, AV_LOG_DEBUG, "pass:%d changed:%d\n", pass, change);
+        if(!change)
+            break;
+    }
+
+    if(s->block_max_depth == 1){
+        int change= 0;
+        for(mb_y= 0; mb_y<b_height; mb_y+=2){
+            for(mb_x= 0; mb_x<b_width; mb_x+=2){
+                int i;
+                int best_rd, init_rd;
+                const int index= mb_x + mb_y * b_stride;
+                BlockNode *b[4];
+
+                b[0]= &s->block[index];
+                b[1]= b[0]+1;
+                b[2]= b[0]+b_stride;
+                b[3]= b[2]+1;
+                if(same_block(b[0], b[1]) &&
+                   same_block(b[0], b[2]) &&
+                   same_block(b[0], b[3]))
+                    continue;
+
+                if(!s->me_cache_generation)
+                    memset(s->me_cache, 0, sizeof(s->me_cache));
+                s->me_cache_generation += 1<<22;
+
+                init_rd= best_rd= get_4block_rd(s, mb_x, mb_y, 0);
+
+                //FIXME more multiref search?
+                check_4block_inter(s, mb_x, mb_y,
+                                   (b[0]->mx + b[1]->mx + b[2]->mx + b[3]->mx + 2) >> 2,
+                                   (b[0]->my + b[1]->my + b[2]->my + b[3]->my + 2) >> 2, 0, &best_rd);
+
+                for(i=0; i<4; i++)
+                    if(!(b[i]->type&BLOCK_INTRA))
+                        check_4block_inter(s, mb_x, mb_y, b[i]->mx, b[i]->my, b[i]->ref, &best_rd);
+
+                if(init_rd != best_rd)
+                    change++;
+            }
+        }
+        av_log(s->avctx, AV_LOG_ERROR, "pass:4mv changed:%d\n", change*4);
+    }
+}
+
+/* Emit the block data of the whole frame, one top-level block row after another. */
+static void encode_blocks(SnowContext *s, int search){
+    const int cols= s->b_width;
+    const int rows= s->b_height;
+    int bx, by;
+    // iterative ME runs only when a search is requested on a non-keyframe
+    if(search && !s->keyframe && s->motion_est == FF_ME_ITER)
+        iterative_me(s);
+    for(by=0; by<rows; by++){
+        if(s->c.bytestream_end - s->c.bytestream < cols*MB_SIZE*MB_SIZE*3){ //FIXME nicer limit
+            av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
+            return;
+        }
+        for(bx=0; bx<cols; bx++){
+            if(search && s->motion_est != FF_ME_ITER)
+                encode_q_branch (s, 0, bx, by);
+            else
+                encode_q_branch2(s, 0, bx, by);
+        }
+    }
+}
+
+/* Quantize the b->width x b->height coefficients of band b from src into dst with
+ * a dead zone around zero. A non-zero bias argument selects plain truncation, a
+ * zero one adds 3*qmul/8 before the division. With s->qlog == LOSSLESS_QLOG the
+ * values are copied instead. */
+static void quantize(SnowContext *s, SubBand *b, IDWTELEM *dst, DWTELEM *src, int stride, int bias){
+    const int cols= b->width;
+    const int rows= b->height;
+    const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
+    const int qmul= ff_qexp[qlog&(QROOT-1)]<<((qlog>>QSHIFT) + ENCODER_EXTRA_BITS);
+    int col, row, pos, mag;
+    int zone, zone2;
+
+    if(s->qlog == LOSSLESS_QLOG){
+        // lossless mode: no scaling, the values are stored as they are
+        for(row=0; row<rows; row++)
+            for(col=0; col<cols; col++)
+                dst[col + row*stride]= src[col + row*stride];
+        return;
+    }
+
+    // magnitudes up to zone become 0; for zone >= 0 the unsigned compare below
+    // checks -zone <= mag <= zone in one operation
+    bias= bias ? 0 : (3*qmul)>>3;
+    zone = ((qmul - bias)>>QEXPSHIFT) - 1;
+    zone2= 2*zone;
+
+    if(bias){ // rounding offset present
+        for(row=0; row<rows; row++){
+            for(col=0; col<cols; col++){
+                pos= col + row*stride;
+                mag= src[pos];
+                if((unsigned)(mag+zone) <= zone2){
+                    dst[pos]= 0;
+                }else if(mag < 0){
+                    mag= (((-mag) << QEXPSHIFT) + bias) / qmul; //FIXME optimize
+                    dst[pos]= -mag;
+                }else{
+                    mag= ((mag << QEXPSHIFT) + bias) / qmul; //FIXME optimize
+                    dst[pos]=  mag;
+                }
+            }
+        }
+    }else{ // bias == 0: same loop without the addition
+        for(row=0; row<rows; row++){
+            for(col=0; col<cols; col++){
+                pos= col + row*stride;
+                mag= src[pos];
+                if((unsigned)(mag+zone) <= zone2){
+                    dst[pos]= 0;
+                }else if(mag < 0){
+                    mag= ((-mag) << QEXPSHIFT) / qmul; //FIXME optimize
+                    dst[pos]= -mag;
+                }else{
+                    mag= (mag << QEXPSHIFT) / qmul; //FIXME optimize
+                    dst[pos]=  mag;
+                }
+            }
+        }
+    }
+}
+
+/* Scale the quantized levels of band b back to coefficient values, in place. */
+static void dequantize(SnowContext *s, SubBand *b, IDWTELEM *src, int stride){
+    const int cols= b->width;
+    const int rows= b->height;
+    const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
+    const int qmul= ff_qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
+    const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
+    int col, row, pos, level;
+    if(s->qlog == LOSSLESS_QLOG) // lossless mode: src is left unchanged
+        return;
+    for(row=0; row<rows; row++){
+        for(col=0; col<cols; col++){
+            pos= col + row*stride;
+            level= src[pos]; // zero levels are not rewritten
+            if(level > 0)
+                src[pos]=  (( level*qmul + qadd)>>(QEXPSHIFT));
+            else if(level < 0)
+                src[pos]= -((-level*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
+        }
+    }
+}
+
+/* Subtract from every band sample a prediction built from its left/top neighbours,
+ * walking from the last sample to the first so predictions read original values. */
+static void decorrelate(SnowContext *s, SubBand *b, IDWTELEM *src, int stride, int inverse, int use_median){
+    const int cols= b->width;
+    const int rows= b->height;
+    int col, row, pos, pred; // s and inverse are unused here
+    for(row=rows-1; row>=0; row--){
+        for(col=cols-1; col>=0; col--){
+            pos= col + row*stride;
+            if(!col && !row)
+                continue; // first sample: nothing to predict from
+            if(!col)
+                pred= src[pos - stride];
+            else if(!row || (use_median && col+1 >= cols))
+                pred= src[pos - 1];
+            else if(use_median)
+                pred= mid_pred(src[pos - 1], src[pos - stride], src[pos - stride + 1]);
+            else
+                pred= mid_pred(src[pos - 1], src[pos - stride], src[pos - 1] + src[pos - stride] - src[pos - 1 - stride]);
+            src[pos] -= pred;
+        }
+    }
+}
+
+/* Add to every band sample the prediction built from its left/top neighbours,
+ * walking from the first sample to the last so neighbours are already updated. */
+static void correlate(SnowContext *s, SubBand *b, IDWTELEM *src, int stride, int inverse, int use_median){
+    const int cols= b->width;
+    const int rows= b->height;
+    int col, row, pos, pred; // s and inverse are unused here
+    for(row=0; row<rows; row++){
+        for(col=0; col<cols; col++){
+            pos= col + row*stride;
+            if(!col && !row)
+                continue; // first sample: nothing to predict from
+            if(!col)
+                pred= src[pos - stride];
+            else if(!row || (use_median && col+1 >= cols))
+                pred= src[pos - 1];
+            else if(use_median)
+                pred= mid_pred(src[pos - 1], src[pos - stride], src[pos - stride + 1]);
+            else
+                pred= mid_pred(src[pos - 1], src[pos - stride], src[pos - 1] + src[pos - stride] - src[pos - 1 - stride]);
+            src[pos] += pred;
+        }
+    }
+}
+
+/* Write the qlog of every band except orientation 2, for at most the first two planes. */
+static void encode_qlogs(SnowContext *s){
+    int p, lvl, dir;
+
+    for(p=0; p<FFMIN(s->nb_planes, 2); p++){
+        for(lvl=0; lvl<s->spatial_decomposition_count; lvl++){
+            for(dir=!!lvl; dir<4; dir++) // orientation 0 is visited at level 0 only
+                if(dir!=2)
+                    put_symbol(&s->c, s->header_state, s->plane[p].band[lvl][dir].qlog, 1);
+        }
+    }
+}
+
+static void encode_header(SnowContext *s){ // write the frame header symbols into the range coder s->c
+    int plane_index, i;
+    uint8_t kstate[32]; // separate coder state for the keyframe flag, reinitialised for every header
+
+    memset(kstate, MID_STATE, sizeof(kstate));
+
+    put_rac(&s->c, kstate, s->keyframe);
+    if(s->keyframe || s->always_reset){ // reset the contexts and the last_* values the deltas below refer to
+        ff_snow_reset_contexts(s);
+        s->last_spatial_decomposition_type=
+        s->last_qlog=
+        s->last_qbias=
+        s->last_mv_scale=
+        s->last_block_max_depth= 0;
+        for(plane_index=0; plane_index<2; plane_index++){
+            Plane *p= &s->plane[plane_index];
+            p->last_htaps=0;
+            p->last_diag_mc=0;
+            memset(p->last_hcoeff, 0, sizeof(p->last_hcoeff));
+        }
+    }
+    if(s->keyframe){ // these fields are written on keyframes only
+        put_symbol(&s->c, s->header_state, s->version, 0);
+        put_rac(&s->c, s->header_state, s->always_reset);
+        put_symbol(&s->c, s->header_state, s->temporal_decomposition_type, 0);
+        put_symbol(&s->c, s->header_state, s->temporal_decomposition_count, 0);
+        put_symbol(&s->c, s->header_state, s->spatial_decomposition_count, 0);
+        put_symbol(&s->c, s->header_state, s->colorspace_type, 0);
+        if (s->nb_planes > 2) { // chroma shifts are written only with more than two planes
+            put_symbol(&s->c, s->header_state, s->chroma_h_shift, 0);
+            put_symbol(&s->c, s->header_state, s->chroma_v_shift, 0);
+        }
+        put_rac(&s->c, s->header_state, s->spatial_scalability);
+//        put_rac(&s->c, s->header_state, s->rate_scalability);
+        put_symbol(&s->c, s->header_state, s->max_ref_frames-1, 0);
+
+        encode_qlogs(s);
+    }
+
+    if(!s->keyframe){
+        int update_mc=0; // becomes 1 if htaps, diag_mc or hcoeff differ from their last_* copies
+        for(plane_index=0; plane_index<FFMIN(s->nb_planes, 2); plane_index++){
+            Plane *p= &s->plane[plane_index];
+            update_mc |= p->last_htaps   != p->htaps;
+            update_mc |= p->last_diag_mc != p->diag_mc;
+            update_mc |= !!memcmp(p->last_hcoeff, p->hcoeff, sizeof(p->hcoeff));
+        }
+        put_rac(&s->c, s->header_state, update_mc);
+        if(update_mc){
+            for(plane_index=0; plane_index<FFMIN(s->nb_planes, 2); plane_index++){
+                Plane *p= &s->plane[plane_index];
+                put_rac(&s->c, s->header_state, p->diag_mc);
+                put_symbol(&s->c, s->header_state, p->htaps/2-1, 0);
+                for(i= p->htaps/2; i; i--) // indices htaps/2 down to 1; hcoeff[0] is not written
+                    put_symbol(&s->c, s->header_state, FFABS(p->hcoeff[i]), 0);
+            }
+        }
+        if(s->last_spatial_decomposition_count != s->spatial_decomposition_count){ // flag 1, then the new count and the qlogs
+            put_rac(&s->c, s->header_state, 1);
+            put_symbol(&s->c, s->header_state, s->spatial_decomposition_count, 0);
+            encode_qlogs(s);
+        }else
+            put_rac(&s->c, s->header_state, 0);
+    }
+
+    put_symbol(&s->c, s->header_state, s->spatial_decomposition_type - s->last_spatial_decomposition_type, 1); // from here on: differences to the last_* values
+    put_symbol(&s->c, s->header_state, s->qlog            - s->last_qlog    , 1);
+    put_symbol(&s->c, s->header_state, s->mv_scale        - s->last_mv_scale, 1);
+    put_symbol(&s->c, s->header_state, s->qbias           - s->last_qbias   , 1);
+    put_symbol(&s->c, s->header_state, s->block_max_depth - s->last_block_max_depth, 1);
+
+}
+
+/* Remember the header values of this frame as the reference for the next header. */
+static void update_last_header_values(SnowContext *s){
+    Plane *p;
+
+    // frame-level values are stored for every frame type
+    s->last_spatial_decomposition_count = s->spatial_decomposition_count;
+    s->last_spatial_decomposition_type  = s->spatial_decomposition_type;
+    s->last_block_max_depth             = s->block_max_depth;
+    s->last_mv_scale                    = s->mv_scale;
+    s->last_qbias                       = s->qbias;
+    s->last_qlog                        = s->qlog;
+    if(s->keyframe)
+        return; // the per-plane values below are not stored on keyframes
+    for(p= s->plane; p < s->plane + 2; p++){
+        memcpy(p->last_hcoeff, p->hcoeff, sizeof(p->hcoeff));
+        p->last_htaps  = p->htaps;
+        p->last_diag_mc= p->diag_mc;
+    }
+}
+
+static int qscale2qlog(int qscale){ // qscale (in FF_QP2LAMBDA units) -> qlog: QROOT*log2() plus a fixed offset
+    const int offset= 61*QROOT/8; ///< 64 > 60
+    return lrint(QROOT*log2(qscale / (float)FF_QP2LAMBDA)) + offset;
+}
+
+static int ratecontrol_1pass(SnowContext *s, AVFrame *pict)
+{
+    /* Estimate the frame's complexity as a sum of weighted dwt coefficients.
+     * FIXME we know exact mv bits at this point,
+     * but ratecontrol isn't set up to include them.
+     * Returns the change applied to s->qlog, or INT_MIN when
+     * ff_rate_estimate_qscale() reports a negative qscale. */
+    const int is_intra= pict->pict_type == AV_PICTURE_TYPE_I;
+    uint32_t coef_sum= 0;
+    int lvl, dir, delta_qlog;
+
+    for(lvl=0; lvl<s->spatial_decomposition_count; lvl++){
+        for(dir=!!lvl; dir<4; dir++){
+            SubBand *b= &s->plane[0].band[lvl][dir];
+            IDWTELEM *buf= b->ibuf;
+            const int cols= b->width;
+            const int rows= b->height;
+            const int stride= b->stride;
+            const int qlog= av_clip(2*QROOT + b->qlog, 0, QROOT*16);
+            const int qmul= ff_qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
+            const int qdiv= (1<<16)/qmul;
+            int col, row;
+            //FIXME this is ugly
+            for(row=0; row<rows; row++)
+                for(col=0; col<cols; col++)
+                    buf[col + row*stride]= b->buf[col + row*stride];
+            if(!dir) // orientation 0, which is only visited at level 0
+                decorrelate(s, b, buf, stride, 1, 0);
+            // add |coefficient|/qmul, using the 16-bit fixed-point reciprocal qdiv
+            for(row=0; row<rows; row++)
+                for(col=0; col<cols; col++)
+                    coef_sum+= abs(buf[col + row*stride]) * qdiv >> 16;
+        }
+    }
+
+    /* ugly, ratecontrol just takes a sqrt again */
+    av_assert0(coef_sum < INT_MAX);
+    coef_sum = (uint64_t)coef_sum * coef_sum >> 16;
+
+    // the estimate goes into the field matching the picture type, the other is zeroed
+    s->m.current_picture.mb_var_sum   = is_intra ? coef_sum : 0;
+    s->m.current_picture.mc_mb_var_sum= is_intra ? 0 : coef_sum;
+
+    pict->quality= ff_rate_estimate_qscale(&s->m, 1);
+    if (pict->quality < 0)
+        return INT_MIN;
+    s->lambda= pict->quality * 3/2;
+    delta_qlog= qscale2qlog(pict->quality) - s->qlog;
+    s->qlog+= delta_qlog;
+    return delta_qlog;
+}
+
+/* Set the qlog of every band of plane p from the energy that a single coefficient
+ * of value 256*16 in that band produces after the inverse DWT.
+ * NOTE(review): presumably b->ibuf points into s->spatial_idwt_buffer - confirm. */
+static void calculate_visual_weight(SnowContext *s, Plane *p){
+    const int width = p->width;
+    const int height= p->height;
+    int lvl, dir, n;
+
+    for(lvl=0; lvl<s->spatial_decomposition_count; lvl++){
+        for(dir=!!lvl; dir<4; dir++){
+            SubBand *band= &p->band[lvl][dir];
+            int64_t energy= 0;
+
+            memset(s->spatial_idwt_buffer, 0, sizeof(*s->spatial_idwt_buffer)*width*height);
+            band->ibuf[band->width/2 + band->height/2*band->stride]= 256*16; // one non-zero coefficient at the band centre
+            ff_spatial_idwt(s->spatial_idwt_buffer, s->temp_idwt_buffer, width, height, width, s->spatial_decomposition_type, s->spatial_decomposition_count);
+            for(n=0; n<width*height; n++){ // rows are contiguous here (stride == width)
+                const int64_t d= s->spatial_idwt_buffer[n]*16;
+                energy += d*d;
+            }
+
+            band->qlog= (int)(QROOT * log2(352256.0/sqrt(energy)) + 0.5);
+        }
+    }
+}
+
+static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                        const AVFrame *pict, int *got_packet)
+{ // encode2 callback: codes pict into pkt; returns 0 with *got_packet = 1 on success, a negative value on error
+    SnowContext *s = avctx->priv_data;
+    RangeCoder * const c= &s->c;
+    AVFrame *pic;
+    const int width= s->avctx->width;
+    const int height= s->avctx->height;
+    int level, orientation, plane_index, i, y, ret;
+    uint8_t rc_header_bak[sizeof(s->header_state)]; // snapshots restored when 1-pass rate control re-writes header and blocks
+    uint8_t rc_block_bak[sizeof(s->block_state)];
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, s->b_width*s->b_height*MB_SIZE*MB_SIZE*3 + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
+        return ret;
+
+    ff_init_range_encoder(c, pkt->data, pkt->size);
+    ff_build_rac_states(c, (1LL<<32)/20, 256-8);
+
+    for(i=0; i < s->nb_planes; i++){ // copy each plane of pict into s->input_picture, then draw its top/bottom edges
+        int hshift= i ? s->chroma_h_shift : 0;
+        int vshift= i ? s->chroma_v_shift : 0;
+        for(y=0; y<AV_CEIL_RSHIFT(height, vshift); y++)
+            memcpy(&s->input_picture->data[i][y * s->input_picture->linesize[i]],
+                   &pict->data[i][y * pict->linesize[i]],
+                   AV_CEIL_RSHIFT(width, hshift));
+        s->mpvencdsp.draw_edges(s->input_picture->data[i], s->input_picture->linesize[i],
+                                AV_CEIL_RSHIFT(width, hshift), AV_CEIL_RSHIFT(height, vshift),
+                                EDGE_WIDTH >> hshift, EDGE_WIDTH >> vshift,
+                                EDGE_TOP | EDGE_BOTTOM);
+
+    }
+    emms_c();
+    pic = s->input_picture;
+    pic->pict_type = pict->pict_type;
+    pic->quality = pict->quality;
+
+    s->m.picture_number= avctx->frame_number;
+    if(avctx->flags&AV_CODEC_FLAG_PASS2){ // second pass: the picture type comes from the rate-control entries
+        s->m.pict_type = pic->pict_type = s->m.rc_context.entry[avctx->frame_number].new_pict_type;
+        s->keyframe = pic->pict_type == AV_PICTURE_TYPE_I;
+        if(!(avctx->flags&AV_CODEC_FLAG_QSCALE)) {
+            pic->quality = ff_rate_estimate_qscale(&s->m, 0);
+            if (pic->quality < 0)
+                return -1; // NOTE(review): bare -1 rather than an AVERROR code
+        }
+    }else{ // otherwise a keyframe every gop_size frames (every frame if gop_size is 0)
+        s->keyframe= avctx->gop_size==0 || avctx->frame_number % avctx->gop_size == 0;
+        s->m.pict_type = pic->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
+    }
+
+    if(s->pass1_rc && avctx->frame_number == 0)
+        pic->quality = 2*FF_QP2LAMBDA;
+    if (pic->quality) {
+        s->qlog   = qscale2qlog(pic->quality);
+        s->lambda = pic->quality * 3/2;
+    }
+    if (s->qlog < 0 || (!pic->quality && (avctx->flags & AV_CODEC_FLAG_QSCALE))) { // negative qlog, or quality 0 with fixed qscale, selects lossless
+        s->qlog= LOSSLESS_QLOG;
+        s->lambda = 0;
+    }//else keep previous frame's qlog until after motion estimation
+
+    if (s->current_picture->data[0]
+#if FF_API_EMU_EDGE
+        && !(s->avctx->flags&CODEC_FLAG_EMU_EDGE)
+#endif
+        ) {
+        int w = s->avctx->width;
+        int h = s->avctx->height;
+
+        s->mpvencdsp.draw_edges(s->current_picture->data[0],
+                                s->current_picture->linesize[0], w   , h   ,
+                                EDGE_WIDTH  , EDGE_WIDTH  , EDGE_TOP | EDGE_BOTTOM);
+        if (s->current_picture->data[2]) { // chroma planes are handled only when a third plane exists
+            s->mpvencdsp.draw_edges(s->current_picture->data[1],
+                                    s->current_picture->linesize[1], w>>s->chroma_h_shift, h>>s->chroma_v_shift,
+                                    EDGE_WIDTH>>s->chroma_h_shift, EDGE_WIDTH>>s->chroma_v_shift, EDGE_TOP | EDGE_BOTTOM);
+            s->mpvencdsp.draw_edges(s->current_picture->data[2],
+                                    s->current_picture->linesize[2], w>>s->chroma_h_shift, h>>s->chroma_v_shift,
+                                    EDGE_WIDTH>>s->chroma_h_shift, EDGE_WIDTH>>s->chroma_v_shift, EDGE_TOP | EDGE_BOTTOM);
+        }
+    }
+
+    ff_snow_frame_start(s);
+    av_frame_unref(avctx->coded_frame);
+    ret = av_frame_ref(avctx->coded_frame, s->current_picture);
+    if (ret < 0)
+        return ret;
+
+    s->m.current_picture_ptr= &s->m.current_picture;
+    s->m.current_picture.f = s->current_picture;
+    s->m.current_picture.f->pts = pict->pts;
+    if(pic->pict_type == AV_PICTURE_TYPE_P){ // fill the MpegEncContext s->m that ff_init_me() below works on
+        int block_width = (width +15)>>4;
+        int block_height= (height+15)>>4;
+        int stride= s->current_picture->linesize[0];
+
+        av_assert0(s->current_picture->data[0]);
+        av_assert0(s->last_picture[0]->data[0]);
+
+        s->m.avctx= s->avctx;
+        s->m.   last_picture.f = s->last_picture[0];
+        s->m.    new_picture.f = s->input_picture;
+        s->m.   last_picture_ptr= &s->m.   last_picture;
+        s->m.linesize = stride;
+        s->m.uvlinesize= s->current_picture->linesize[1];
+        s->m.width = width;
+        s->m.height= height;
+        s->m.mb_width = block_width;
+        s->m.mb_height= block_height;
+        s->m.mb_stride=   s->m.mb_width+1;
+        s->m.b8_stride= 2*s->m.mb_width+1;
+        s->m.f_code=1;
+        s->m.pict_type = pic->pict_type;
+#if FF_API_MOTION_EST
+        s->m.me_method= s->avctx->me_method;
+#endif
+        s->m.motion_est= s->motion_est;
+        s->m.me.scene_change_score=0;
+        s->m.me.dia_size = avctx->dia_size;
+        s->m.quarter_sample= (s->avctx->flags & AV_CODEC_FLAG_QPEL)!=0;
+        s->m.out_format= FMT_H263;
+        s->m.unrestricted_mv= 1;
+
+        s->m.lambda = s->lambda;
+        s->m.qscale= (s->m.lambda*139 + FF_LAMBDA_SCALE*64) >> (FF_LAMBDA_SHIFT + 7);
+        s->lambda2= s->m.lambda2= (s->m.lambda*s->m.lambda + FF_LAMBDA_SCALE/2) >> FF_LAMBDA_SHIFT;
+
+        s->m.mecc= s->mecc; //move
+        s->m.qdsp= s->qdsp; //move
+        s->m.hdsp = s->hdsp;
+        ff_init_me(&s->m);
+        s->hdsp = s->m.hdsp;
+        s->mecc= s->m.mecc;
+    }
+
+    if(s->pass1_rc){ // save the coder states so that the header and blocks can be written a second time
+        memcpy(rc_header_bak, s->header_state, sizeof(s->header_state));
+        memcpy(rc_block_bak, s->block_state, sizeof(s->block_state));
+    }
+
+redo_frame:
+
+    s->spatial_decomposition_count= 5;
+
+    while(   !(width >>(s->chroma_h_shift + s->spatial_decomposition_count)) // lower the count until both shifted dimensions are non-zero
+          || !(height>>(s->chroma_v_shift + s->spatial_decomposition_count)))
+        s->spatial_decomposition_count--;
+
+    if (s->spatial_decomposition_count <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "Resolution too low\n");
+        return AVERROR(EINVAL);
+    }
+
+    s->m.pict_type = pic->pict_type;
+    s->qbias = pic->pict_type == AV_PICTURE_TYPE_P ? 2 : 0;
+
+    ff_snow_common_init_after_header(avctx);
+
+    if(s->last_spatial_decomposition_count != s->spatial_decomposition_count){ // band qlogs are recomputed only when the count changed
+        for(plane_index=0; plane_index < s->nb_planes; plane_index++){
+            calculate_visual_weight(s, &s->plane[plane_index]);
+        }
+    }
+
+    encode_header(s);
+    s->m.misc_bits = 8*(s->c.bytestream - s->c.bytestream_start);
+    encode_blocks(s, 1);
+    s->m.mv_bits = 8*(s->c.bytestream - s->c.bytestream_start) - s->m.misc_bits;
+
+    for(plane_index=0; plane_index < s->nb_planes; plane_index++){
+        Plane *p= &s->plane[plane_index];
+        int w= p->width;
+        int h= p->height;
+        int x, y;
+//        int bits= put_bits_count(&s->c.pb);
+
+        if (!s->memc_only) {
+            //FIXME optimize
+            if(pict->data[plane_index]) //FIXME gray hack
+                for(y=0; y<h; y++){
+                    for(x=0; x<w; x++){
+                        s->spatial_idwt_buffer[y*w + x]= pict->data[plane_index][y*pict->linesize[plane_index] + x]<<FRAC_BITS;
+                    }
+                }
+            predict_plane(s, s->spatial_idwt_buffer, plane_index, 0);
+
+#if FF_API_PRIVATE_OPT
+FF_DISABLE_DEPRECATION_WARNINGS
+            if(s->avctx->scenechange_threshold)
+                s->scenechange_threshold = s->avctx->scenechange_threshold;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+
+            if(   plane_index==0 // scene change on a P frame outside pass 2: restart this frame as a keyframe
+               && pic->pict_type == AV_PICTURE_TYPE_P
+               && !(avctx->flags&AV_CODEC_FLAG_PASS2)
+               && s->m.me.scene_change_score > s->scenechange_threshold){
+                ff_init_range_encoder(c, pkt->data, pkt->size);
+                ff_build_rac_states(c, (1LL<<32)/20, 256-8);
+                pic->pict_type= AV_PICTURE_TYPE_I;
+                s->keyframe=1;
+                s->current_picture->key_frame=1;
+                goto redo_frame;
+            }
+
+            if(s->qlog == LOSSLESS_QLOG){ // lossless: drop the FRAC_BITS fraction; otherwise add ENCODER_EXTRA_BITS of precision
+                for(y=0; y<h; y++){
+                    for(x=0; x<w; x++){
+                        s->spatial_dwt_buffer[y*w + x]= (s->spatial_idwt_buffer[y*w + x] + (1<<(FRAC_BITS-1))-1)>>FRAC_BITS;
+                    }
+                }
+            }else{
+                for(y=0; y<h; y++){
+                    for(x=0; x<w; x++){
+                        s->spatial_dwt_buffer[y*w + x]=s->spatial_idwt_buffer[y*w + x]<<ENCODER_EXTRA_BITS;
+                    }
+                }
+            }
+
+            ff_spatial_dwt(s->spatial_dwt_buffer, s->temp_dwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
+
+            if(s->pass1_rc && plane_index==0){ // 1-pass rate control picks the qlog from plane 0
+                int delta_qlog = ratecontrol_1pass(s, pic);
+                if (delta_qlog <= INT_MIN)
+                    return -1;
+                if(delta_qlog){ // qlog changed: restore the saved states and write header and blocks again
+                    //reordering qlog in the bitstream would eliminate this reset
+                    ff_init_range_encoder(c, pkt->data, pkt->size);
+                    memcpy(s->header_state, rc_header_bak, sizeof(s->header_state));
+                    memcpy(s->block_state, rc_block_bak, sizeof(s->block_state));
+                    encode_header(s);
+                    encode_blocks(s, 0);
+                }
+            }
+
+            for(level=0; level<s->spatial_decomposition_count; level++){ // quantize each band and, unless no_bitstream is set, code it
+                for(orientation=level ? 1 : 0; orientation<4; orientation++){
+                    SubBand *b= &p->band[level][orientation];
+
+                    quantize(s, b, b->ibuf, b->buf, b->stride, s->qbias);
+                    if(orientation==0)
+                        decorrelate(s, b, b->ibuf, b->stride, pic->pict_type == AV_PICTURE_TYPE_P, 0);
+                    if (!s->no_bitstream)
+                    encode_subband(s, b, b->ibuf, b->parent ? b->parent->ibuf : NULL, b->stride, orientation);
+                    av_assert0(b->parent==NULL || b->parent->stride == b->stride*2);
+                    if(orientation==0)
+                        correlate(s, b, b->ibuf, b->stride, 1, 0);
+                }
+            }
+
+            for(level=0; level<s->spatial_decomposition_count; level++){ // dequantize the coded levels again before the inverse DWT
+                for(orientation=level ? 1 : 0; orientation<4; orientation++){
+                    SubBand *b= &p->band[level][orientation];
+
+                    dequantize(s, b, b->ibuf, b->stride);
+                }
+            }
+
+            ff_spatial_idwt(s->spatial_idwt_buffer, s->temp_idwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
+            if(s->qlog == LOSSLESS_QLOG){
+                for(y=0; y<h; y++){
+                    for(x=0; x<w; x++){
+                        s->spatial_idwt_buffer[y*w + x]<<=FRAC_BITS;
+                    }
+                }
+            }
+            predict_plane(s, s->spatial_idwt_buffer, plane_index, 1);
+        }else{
+            //ME/MC only
+            if(pic->pict_type == AV_PICTURE_TYPE_I){ // I frame: the input samples are copied into current_picture
+                for(y=0; y<h; y++){
+                    for(x=0; x<w; x++){
+                        s->current_picture->data[plane_index][y*s->current_picture->linesize[plane_index] + x]=
+                            pict->data[plane_index][y*pict->linesize[plane_index] + x];
+                    }
+                }
+            }else{
+                memset(s->spatial_idwt_buffer, 0, sizeof(IDWTELEM)*w*h);
+                predict_plane(s, s->spatial_idwt_buffer, plane_index, 1);
+            }
+        }
+        if(s->avctx->flags&AV_CODEC_FLAG_PSNR){ // sum of squared differences between current_picture and the input plane
+            int64_t error= 0;
+
+            if(pict->data[plane_index]) //FIXME gray hack
+                for(y=0; y<h; y++){
+                    for(x=0; x<w; x++){
+                        int d= s->current_picture->data[plane_index][y*s->current_picture->linesize[plane_index] + x] - pict->data[plane_index][y*pict->linesize[plane_index] + x];
+                        error += d*d;
+                    }
+                }
+            s->avctx->error[plane_index] += error;
+            s->encoding_error[plane_index] = error;
+        }
+
+    }
+
+    update_last_header_values(s);
+
+    ff_snow_release_buffer(avctx);
+
+    s->current_picture->coded_picture_number = avctx->frame_number;
+    s->current_picture->pict_type = pic->pict_type;
+    s->current_picture->quality = pic->quality;
+    s->m.frame_bits = 8*(s->c.bytestream - s->c.bytestream_start); // bit counts are derived from the coder's output position
+    s->m.p_tex_bits = s->m.frame_bits - s->m.misc_bits - s->m.mv_bits;
+    s->m.current_picture.f->display_picture_number =
+    s->m.current_picture.f->coded_picture_number   = avctx->frame_number;
+    s->m.current_picture.f->quality                = pic->quality;
+    s->m.total_bits += 8*(s->c.bytestream - s->c.bytestream_start);
+    if(s->pass1_rc)
+        if (ff_rate_estimate_qscale(&s->m, 0) < 0)
+            return -1;
+    if(avctx->flags&AV_CODEC_FLAG_PASS1)
+        ff_write_pass1_stats(&s->m);
+    s->m.last_pict_type = s->m.pict_type;
+    avctx->frame_bits = s->m.frame_bits;
+    avctx->mv_bits = s->m.mv_bits;
+    avctx->misc_bits = s->m.misc_bits;
+    avctx->p_tex_bits = s->m.p_tex_bits;
+
+    emms_c();
+
+    ff_side_data_set_encoder_stats(pkt, s->current_picture->quality,
+                                   s->encoding_error,
+                                   (s->avctx->flags&AV_CODEC_FLAG_PSNR) ? 4 : 0,
+                                   s->current_picture->pict_type);
+
+#if FF_API_ERROR_FRAME
+FF_DISABLE_DEPRECATION_WARNINGS
+    memcpy(s->current_picture->error, s->encoding_error, sizeof(s->encoding_error));
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+
+    pkt->size = ff_rac_terminate(c); // the packet size is the value returned when terminating the range coder
+    if (s->current_picture->key_frame)
+        pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+
+    return 0;
+}
+
+/* Encoder close callback: shut down the Snow and rate-control state, free owned buffers. */
+static av_cold int encode_end(AVCodecContext *avctx)
+{
+    SnowContext *const snow = avctx->priv_data;
+
+    ff_snow_common_end(snow);
+    ff_rate_control_uninit(&snow->m);
+    av_frame_free(&snow->input_picture); // the frame encode_frame() copies its input into
+    av_freep(&avctx->stats_out);
+    return 0;
+}
+
+#define OFFSET(x) offsetof(SnowContext, x) // byte offset of an option field inside the private context
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption options[] = { // private options of the Snow encoder, ended by the { NULL } entry
+    FF_MPV_COMMON_OPTS
+    { "iter",           NULL, 0, AV_OPT_TYPE_CONST, { .i64 = FF_ME_ITER }, 0, 0, FF_MPV_OPT_FLAGS, "motion_est" },
+    { "memc_only",      "Only do ME/MC (I frames -> ref, P frame -> ME+MC).",   OFFSET(memc_only), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
+    { "no_bitstream",   "Skip final bitstream writeout.",                    OFFSET(no_bitstream), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
+    { "intra_penalty",  "Penalty for intra blocks in block decision",       OFFSET(intra_penalty), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },
+    { "iterative_dia_size",  "Dia size for the iterative ME",          OFFSET(iterative_dia_size), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },
+    { "sc_threshold",   "Scene change threshold",                   OFFSET(scenechange_threshold), AV_OPT_TYPE_INT, { .i64 = 0 }, INT_MIN, INT_MAX, VE },
+    { "pred",           "Spatial decomposition type",                                OFFSET(pred), AV_OPT_TYPE_INT, { .i64 = 0 }, DWT_97, DWT_53, VE, "pred" },
+        { "dwt97", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 0 }, INT_MIN, INT_MAX, VE, "pred" },
+        { "dwt53", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = 1 }, INT_MIN, INT_MAX, VE, "pred" },
+    { NULL },
+};
+
+static const AVClass snowenc_class = { // AVClass that exposes the options[] table of this encoder
+    .version    = LIBAVUTIL_VERSION_INT,
+    .class_name = "snow encoder",
+    .option     = options,
+    .item_name  = av_default_item_name,
+};
+
+AVCodec ff_snow_encoder = { // codec descriptor: identification, private data, formats, callbacks
+    .name           = "snow",
+    .long_name      = NULL_IF_CONFIG_SMALL("Snow"),
+    .id             = AV_CODEC_ID_SNOW,
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .priv_class     = &snowenc_class,
+    .priv_data_size = sizeof(SnowContext),
+    .pix_fmts       = (const enum AVPixelFormat[]){
+        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV444P,
+        AV_PIX_FMT_GRAY8,
+        AV_PIX_FMT_NONE
+    },
+    .init           = encode_init,
+    .encode2        = encode_frame,
+    .close          = encode_end,
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE |
+                      FF_CODEC_CAP_INIT_CLEANUP,
+};
diff --git a/libavcodec/sonic.c b/libavcodec/sonic.c
new file mode 100644
index 0000000..2e3ca79
--- /dev/null
+++ b/libavcodec/sonic.c
@@ -0,0 +1,1126 @@
+/*
+ * Simple free lossless/lossy audio codec
+ * Copyright (c) 2004 Alex Beregszaszi
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "avcodec.h"
+#include "get_bits.h"
+#include "golomb.h"
+#include "internal.h"
+#include "rangecoder.h"
+
+
+/**
+ * @file
+ * Simple free lossless/lossy audio codec
+ * Based on Paul Francis Harrison's Bonk (http://www.logarithmic.net/pfh/bonk)
+ * Written and designed by Alex Beregszaszi
+ *
+ * TODO:
+ *  - CABAC put/get_symbol
+ *  - independent quantizer for channels
+ *  - >2 channels support
+ *  - more decorrelation types
+ *  - more tap_quant tests
+ *  - selectable intlist writers/readers (bonk-style, golomb, cabac)
+ */
+
+#define MAX_CHANNELS 2
+
+#define MID_SIDE 0
+#define LEFT_SIDE 1
+#define RIGHT_SIDE 2
+
+typedef struct SonicContext { /* state shared by the Sonic encoder and decoder */
+    int version;                 // bitstream version; the encoder writes 2, the decoder accepts only 2
+    int minor_version;           // stored in the header next to version
+    int lossless, decorrelation; // decorrelation: MID_SIDE, LEFT_SIDE, RIGHT_SIDE, or 3 (none applied)
+
+    int num_taps, downsampling;  // predictor order (multiple of 32) and samples merged per coded value
+    double quantization;         // scales the quantizer chosen by the encoder's rate control
+
+    int channels, samplerate, block_align, frame_size; // frame_size = channels*block_align*downsampling
+
+    int *tap_quant;                   // num_taps quantizer steps, ff_sqrt(i+1)
+    int *int_samples;                 // frame_size interleaved working samples
+    int *coded_samples[MAX_CHANNELS]; // block_align coded values per channel
+
+    // for encoding
+    int *tail;       // last tail_size samples of the previous frame
+    int tail_size;   // num_taps*channels
+    int *window;     // analysis window: tail, current frame, zero padding
+    int window_size; // 2*tail_size + frame_size
+
+    // for decoding
+    int *predictor_k;                   // num_taps coefficients (the encoder allocates and uses it too)
+    int *predictor_state[MAX_CHANNELS]; // num_taps filter state values per channel
+} SonicContext;
+
+#define LATTICE_SHIFT   10
+#define SAMPLE_SHIFT    4
+#define LATTICE_FACTOR  (1 << LATTICE_SHIFT)
+#define SAMPLE_FACTOR   (1 << SAMPLE_SHIFT)
+
+#define BASE_QUANT      0.6
+#define RATE_VARIATION  3.0
+
+static inline int shift(int a,int b) /* rounding arithmetic shift right: (a + 2^(b-1)) >> b, b >= 1 */
+{
+    return (int)(a+(1U<<(b-1))) >> b; /* bias added as unsigned: wraps instead of signed-overflow UB */
+}
+
+static inline int shift_down(int a,int b) /* a >> b, plus one when a is negative */
+{
+    return a < 0 ? (a >> b) + 1 : a >> b;
+}
+
+static av_always_inline av_flatten void put_symbol(RangeCoder *c, uint8_t *state, int v, int is_signed, uint64_t rc_stat[256][2], uint64_t rc_stat2[32][2]){
+    int i;
+    /* Range-code v as: zero flag (state 0), unary exponent, mantissa bits, then the sign if is_signed. */
+#define put_rac(C,S,B) \
+do{\
+    if(rc_stat){\
+        rc_stat[*(S)][B]++;\
+        rc_stat2[(S)-state][B]++;\
+    }\
+    put_rac(C,S,B);\
+}while(0)
+    /* The wrapper above also counts every coded bit in rc_stat/rc_stat2 when rc_stat is non-NULL. */
+    if(v){
+        const int a= FFABS(v);
+        const int e= av_log2(a);
+        put_rac(c, state+0, 0); /* "not zero" */
+        if(e<=9){
+            for(i=0; i<e; i++){
+                put_rac(c, state+1+i, 1);  //1..10
+            }
+            put_rac(c, state+1+i, 0); /* terminates the unary exponent */
+            /* bits of a below its leading one, most significant first */
+            for(i=e-1; i>=0; i--){
+                put_rac(c, state+22+i, (a>>i)&1); //22..31
+            }
+
+            if(is_signed)
+                put_rac(c, state+11 + e, v < 0); //11..21
+        }else{
+            for(i=0; i<e; i++){
+                put_rac(c, state+1+FFMIN(i,9), 1);  //1..10
+            }
+            put_rac(c, state+1+9, 0);
+            /* same layout as above, but every context index is clamped to its last slot */
+            for(i=e-1; i>=0; i--){
+                put_rac(c, state+22+FFMIN(i,9), (a>>i)&1); //22..31
+            }
+
+            if(is_signed)
+                put_rac(c, state+11 + 10, v < 0); //11..21
+        }
+    }else{
+        put_rac(c, state+0, 1); /* zero is a single flag */
+    }
+#undef put_rac
+}
+
+/* Decode one integer written by put_symbol(); returns AVERROR_INVALIDDATA on a corrupt exponent. */
+static inline av_flatten int get_symbol(RangeCoder *c, uint8_t *state, int is_signed){
+    int i, e = 0;
+    unsigned a = 1; /* unsigned: the doubling below must wrap, not overflow a signed int */
+
+    if(get_rac(c, state+0)) /* zero flag */
+        return 0;
+
+    while(get_rac(c, state+1 + FFMIN(e,9))){ //1..10, unary exponent
+        if (++e > 31) /* more mantissa bits than a 32-bit value can hold: damaged stream */
+            return AVERROR_INVALIDDATA;
+    }
+
+    for(i=e-1; i>=0; i--) /* mantissa bits, most significant first */
+        a += a + get_rac(c, state+22 + FFMIN(i,9)); //22..31
+
+    e= -(is_signed && get_rac(c, state+11 + FFMIN(e, 10))); //11..21, sign
+    return (a^e)-e; /* negates a when e == -1 */
+}
+
+#if 1
+/* Range-code the `entries` ints of buf as signed symbols; base_2_part is unused here. Always returns 1. */
+static inline int intlist_write(RangeCoder *c, uint8_t *state, int *buf, int entries, int base_2_part)
+{
+    int remaining;
+
+    for (remaining = entries; remaining > 0; remaining--)
+        put_symbol(c, state, *buf++, 1, NULL, NULL);
+    return 1;
+}
+
+/* Read `entries` signed range-coded symbols into buf; base_2_part is unused here. Always returns 1. */
+static inline int intlist_read(RangeCoder *c, uint8_t *state, int *buf, int entries, int base_2_part)
+{
+    int remaining;
+
+    for (remaining = entries; remaining > 0; remaining--)
+        *buf++ = get_symbol(c, state, 1);
+    return 1;
+}
+#elif 1
+/* Signed exp-Golomb list writer; not compiled while the "#if 1" branch before it is active. Returns 1. */
+static inline int intlist_write(PutBitContext *pb, int *buf, int entries, int base_2_part)
+{
+    int remaining;
+
+    for (remaining = entries; remaining > 0; remaining--)
+        set_se_golomb(pb, *buf++);
+    return 1;
+}
+
+/* Signed exp-Golomb list reader; not compiled while the "#if 1" branch before it is active. Returns 1. */
+static inline int intlist_read(GetBitContext *gb, int *buf, int entries, int base_2_part)
+{
+    int remaining;
+
+    for (remaining = entries; remaining > 0; remaining--)
+        *buf++ = get_se_golomb(gb);
+    return 1;
+}
+
+#else
+
+#define ADAPT_LEVEL 8
+
+/* Count the bits needed to represent x, i.e. the position of its highest
+ * set bit plus one; yields 0 for x == 0. */
+static int bits_to_store(uint64_t x)
+{
+    int width;
+
+    for (width = 0; x != 0; x >>= 1)
+        width++;
+
+    return width;
+}
+
+/* Write value (0..max) LSB first, omitting the top bit when setting it would exceed max. */
+static void write_uint_max(PutBitContext *pb, unsigned int value, unsigned int max)
+{
+    int i, bits;
+
+    if (!max)
+        return;
+    bits = bits_to_store(max);
+
+    for (i = 0; i < bits-1; i++)
+        put_bits(pb, 1, (value >> i) & 1); /* a 1-bit field must be given 0 or 1, not the masked bit */
+
+    if ( (value | (1U << (bits-1))) <= max)
+        put_bits(pb, 1, (value >> (bits-1)) & 1);
+}
+
+/* Read a value in 0..max stored LSB first; the top bit is only present in the
+ * stream when setting it would still keep the value within max. */
+static unsigned int read_uint_max(GetBitContext *gb, int max)
+{
+    int bit, top, result = 0;
+
+    if (max == 0)
+        return 0;
+
+    top = bits_to_store(max) - 1;
+    for (bit = 0; bit < top; bit++) {
+        if (get_bits1(gb))
+            result += 1 << bit;
+    }
+    if ((result | (1 << top)) <= max && get_bits1(gb))
+        result += 1 << top;
+
+    return result;
+}
+
+/* Bonk-style adaptive run-length list writer (in the "#else" branch). Returns 0 or AVERROR(ENOMEM). */
+static int intlist_write(PutBitContext *pb, int *buf, int entries, int base_2_part)
+{
+    int i, j, x = 0, low_bits = 0, max = 0;
+    int step = 256, pos = 0, dominant = 0, any;
+    int *copy, *bits;
+
+    copy = av_calloc(entries, sizeof(*copy));
+    if (!copy)
+        return AVERROR(ENOMEM);
+
+    if (base_2_part)
+    {
+        int energy = 0;
+        for (i = 0; i < entries; i++)
+            energy += abs(buf[i]);
+
+        low_bits = bits_to_store(energy / (entries * 2));
+        if (low_bits > 15)
+            low_bits = 15;
+
+        put_bits(pb, 4, low_bits);
+    }
+
+    for (i = 0; i < entries; i++)
+    {
+        put_bits(pb, low_bits, abs(buf[i]) & ((1 << low_bits) - 1)); // only the low part fits this field
+        copy[i] = abs(buf[i]) >> low_bits;
+        if (copy[i] > max)
+            max = abs(copy[i]);
+    }
+
+    bits = av_calloc(entries, ((size_t)max + 1) * sizeof(*bits)); // each value emits up to max+1 flags
+    if (!bits)
+    {
+        av_free(copy);
+        return AVERROR(ENOMEM);
+    }
+
+    for (i = 0; i <= max; i++)
+    {
+        for (j = 0; j < entries; j++)
+            if (copy[j] >= i)
+                bits[x++] = copy[j] > i;
+    }
+
+    // store bitstream
+    while (pos < x)
+    {
+        int steplet = step >> 8;
+        any = 0; // must be decided afresh for every step
+        if (pos + steplet > x)
+            steplet = x - pos;
+
+        for (i = 0; i < steplet; i++)
+            if (bits[i+pos] != dominant)
+                any = 1;
+
+        put_bits(pb, 1, any);
+
+        if (!any)
+        {
+            pos += steplet;
+            step += step / ADAPT_LEVEL;
+        }
+        else
+        {
+            int interloper = 0;
+
+            while (((pos + interloper) < x) && (bits[pos + interloper] == dominant))
+                interloper++;
+
+            // note change
+            write_uint_max(pb, interloper, (step >> 8) - 1);
+
+            pos += interloper + 1;
+            step -= step / ADAPT_LEVEL;
+        }
+
+        if (step < 256)
+        {
+            step = 65536 / step;
+            dominant = !dominant;
+        }
+    }
+
+    // store signs
+    for (i = 0; i < entries; i++)
+        if (buf[i])
+            put_bits(pb, 1, buf[i] < 0);
+
+    av_free(bits);
+    av_free(copy);
+
+    return 0;
+}
+
+static int intlist_read(GetBitContext *gb, int *buf, int entries, int base_2_part)
+{
+    int i, low_bits = 0, x = 0;
+    int n_zeros = 0, step = 256, dominant = 0;
+    int pos = 0, level = 0;
+    int *bits = av_calloc(entries, sizeof(*bits));
+    /* bits[] collects flags: a set flag raises one value by 1 << low_bits, a clear one ends it */
+    if (!bits)
+        return AVERROR(ENOMEM);
+
+    if (base_2_part)
+    {
+        low_bits = get_bits(gb, 4);
+
+        if (low_bits)
+            for (i = 0; i < entries; i++)
+                buf[i] = get_bits(gb, low_bits);
+    }
+    /* NOTE(review): buf[] is only assigned above when low_bits != 0, yet it is read below - presumably callers zero it; verify */
+//    av_log(NULL, AV_LOG_INFO, "entries: %d, low bits: %d\n", entries, low_bits);
+
+    while (n_zeros < entries) /* stop once one clear flag per entry has been decoded */
+    {
+        int steplet = step >> 8;
+
+        if (!get_bits1(gb)) /* 0: a full run of steplet dominant flags */
+        {
+            for (i = 0; i < steplet; i++)
+                bits[x++] = dominant; /* NOTE(review): x is never checked against the `entries` elements of bits[] - confirm */
+
+            if (!dominant)
+                n_zeros += steplet;
+
+            step += step / ADAPT_LEVEL;
+        }
+        else /* 1: a shorter run followed by one non-dominant flag */
+        {
+            int actual_run = read_uint_max(gb, steplet-1);
+
+//            av_log(NULL, AV_LOG_INFO, "actual run: %d\n", actual_run);
+
+            for (i = 0; i < actual_run; i++)
+                bits[x++] = dominant;
+
+            bits[x++] = !dominant;
+
+            if (!dominant)
+                n_zeros += actual_run;
+            else
+                n_zeros++;
+
+            step -= step / ADAPT_LEVEL;
+        }
+
+        if (step < 256) /* runs got short: switch which flag value is treated as dominant */
+        {
+            step = 65536 / step;
+            dominant = !dominant;
+        }
+    }
+
+    // reconstruct unsigned values
+    n_zeros = 0;
+    for (i = 0; n_zeros < entries; i++)
+    {
+        while(1) /* find the next entry that has reached the current level */
+        {
+            if (pos >= entries)
+            {
+                pos = 0;
+                level += 1 << low_bits;
+            }
+
+            if (buf[pos] >= level)
+                break;
+
+            pos++;
+        }
+
+        if (bits[i])
+            buf[pos] += 1 << low_bits;
+        else
+            n_zeros++;
+
+        pos++;
+    }
+    av_free(bits);
+
+    // read signs
+    for (i = 0; i < entries; i++)
+        if (buf[i] && get_bits1(gb)) /* a sign bit is only present for non-zero values */
+            buf[i] = -buf[i];
+
+//    av_log(NULL, AV_LOG_INFO, "zeros: %d pos: %d\n", n_zeros, pos);
+
+    return 0;
+}
+#endif
+
+/* In-place pass over state[0..order-1] using the coefficients k[] (LATTICE_SHIFT fixed point). */
+static void predictor_init_state(int *k, int *state, int order)
+{
+    int start, pos;
+
+    for (start = order - 2; start >= 0; start--) {
+        int acc = state[start];
+
+        for (pos = start + 1; pos < order; pos++) {
+            const int coef = k[pos - start - 1];
+            const int next = acc + shift_down(coef * state[pos], LATTICE_SHIFT);
+            state[pos] += shift_down(coef * acc, LATTICE_SHIFT);
+            acc = next;
+        }
+    }
+}
+
+/**
+ * Advance the lattice predictor by one sample.
+ *
+ * @param k      filter coefficients, `order` entries (LATTICE_SHIFT fixed point)
+ * @param state  filter state, `order` entries, updated in place
+ * @param order  number of filter stages
+ * @param error  residual the prediction is subtracted from
+ * @return the new value, clamped to +-(SAMPLE_FACTOR<<16); it is also stored in state[0]
+ */
+static int predictor_calc_error(int *k, int *state, int order, int error)
+{
+    const int limit = SAMPLE_FACTOR << 16;
+    int stage;
+    int out = error - shift_down(k[order-1] * state[order-1], LATTICE_SHIFT);
+
+    /* walk the remaining stages downwards, moving the state up one slot as we go */
+    for (stage = order - 2; stage >= 0; stage--) {
+        const int coef = k[stage];
+        const int prev = state[stage];
+
+        out -= shift_down(coef * prev, LATTICE_SHIFT);
+        state[stage + 1] = prev + shift_down(coef * out, LATTICE_SHIFT);
+    }
+
+    // don't drift too far, to avoid overflows
+    out = out > limit ? limit : out < -limit ? -limit : out;
+    state[0] = out;
+    return out;
+}
+
+#if CONFIG_SONIC_ENCODER || CONFIG_SONIC_LS_ENCODER
+// Heavily modified Levinson-Durbin algorithm which
+// copes better with quantization, and calculates the
+// actual whitened result as it goes.
+
+/**
+ * Derive quantized lattice coefficients from a window of samples and whiten
+ * the window in place.
+ *
+ * For stage i, xx is the energy of the state and xy its correlation with the
+ * window delayed by (i+1)*channels; k = round(-xy/xx * LATTICE_FACTOR / tap_quant[i]).
+ *
+ * @param window          samples; rewritten as each stage is applied
+ * @param window_entries  number of ints in window
+ * @param out             receives out_entries coefficients, in units of tap_quant
+ * @param out_entries     number of coefficients (stages) to compute
+ * @param channels        interleave step; stage i looks (i+1)*channels entries ahead
+ * @param tap_quant       per-stage quantizer step, out_entries entries
+ * @return 0 on success, AVERROR(ENOMEM) if the scratch buffer cannot be allocated
+ */
+static int modified_levinson_durbin(int *window, int window_entries,
+        int *out, int out_entries, int channels, int *tap_quant)
+{
+    int i;
+    int *state = av_calloc(window_entries, sizeof(*state));
+
+    if (!state)
+        return AVERROR(ENOMEM);
+
+    /* size the copy from the element type instead of assuming 4-byte ints */
+    memcpy(state, window, sizeof(*state) * window_entries);
+
+    for (i = 0; i < out_entries; i++)
+    {
+        int step = (i+1)*channels, k, j;
+        double xx = 0.0, xy = 0.0;
+        int *x_ptr = &(window[step]);
+        int *state_ptr = &(state[0]);
+
+        /* correlate the window delayed by `step` against the state */
+        j = window_entries - step;
+        for (;j>0;j--,x_ptr++,state_ptr++)
+        {
+            double x_value = *x_ptr;
+            double state_value = *state_ptr;
+            xx += state_value*state_value;
+            xy += x_value*state_value;
+        }
+
+        /* coefficient in units of tap_quant[i], rounded to nearest */
+        if (xx == 0.0)
+            k = 0;
+        else
+            k = (int)(floor(-xy/xx * (double)LATTICE_FACTOR / (double)(tap_quant[i]) + 0.5));
+
+        /* keep |k * tap_quant[i]| <= LATTICE_FACTOR */
+        if (k > (LATTICE_FACTOR/tap_quant[i]))
+            k = LATTICE_FACTOR/tap_quant[i];
+        if (-k > (LATTICE_FACTOR/tap_quant[i]))
+            k = -(LATTICE_FACTOR/tap_quant[i]);
+
+        out[i] = k;
+        k *= tap_quant[i]; /* continue with the dequantized value the decoder will use */
+
+        /* apply this stage to both the window and the state */
+        x_ptr = &(window[step]);
+        state_ptr = &(state[0]);
+        j = window_entries - step;
+        for (;j>0;j--,x_ptr++,state_ptr++)
+        {
+            int x_value = *x_ptr;
+            int state_value = *state_ptr;
+            *x_ptr = x_value + shift_down(k*state_value,LATTICE_SHIFT);
+            *state_ptr = state_value + shift_down(k*x_value, LATTICE_SHIFT);
+        }
+    }
+
+    av_free(state);
+    return 0;
+}
+
+/* Map a sample rate in Hz to its header code (0..8), or AVERROR(EINVAL) if it has none. */
+static inline int code_samplerate(int samplerate)
+{
+    static const int rates[] = {
+        44100, 22050, 11025, 96000, 48000, 32000, 24000, 16000, 8000
+    };
+    int code;
+
+    for (code = 0; code < (int)(sizeof(rates) / sizeof(rates[0])); code++) {
+        if (rates[code] == samplerate)
+            return code;
+    }
+
+    /* not one of the rates the header can express */
+    return AVERROR(EINVAL);
+}
+
+/* Encoder init: pick the coding parameters, allocate the work buffers, write the header to extradata.
+ * Returns 0 on success or a negative AVERROR code (unsupported input, out of memory). */
+static av_cold int sonic_encode_init(AVCodecContext *avctx)
+{
+    SonicContext *s = avctx->priv_data;
+    PutBitContext pb;
+    int i, rate_code;
+
+    s->version = 2;
+
+    if (avctx->channels > MAX_CHANNELS) {
+        av_log(avctx, AV_LOG_ERROR, "Only mono and stereo streams are supported by now\n");
+        return AVERROR(EINVAL); /* only stereo or mono for now */
+    }
+
+    /* the header stores the rate as a 4-bit table index: reject rates without one before allocating */
+    rate_code = code_samplerate(avctx->sample_rate);
+    if (rate_code < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Unsupported sample rate %d\n", avctx->sample_rate);
+        return rate_code;
+    }
+
+    if (avctx->channels == 2)
+        s->decorrelation = MID_SIDE;
+    else
+        s->decorrelation = 3; /* matches no decorrelation case, so none is applied */
+
+    if (avctx->codec->id == AV_CODEC_ID_SONIC_LS) {
+        s->lossless = 1;
+        s->num_taps = 32;
+        s->downsampling = 1;
+        s->quantization = 0.0;
+    } else {
+        s->num_taps = 128;
+        s->downsampling = 2;
+        s->quantization = 1.0;
+    }
+
+    // the header field is 5 bits of (num_taps/32 - 1): 32..1024 in steps of 32
+    if (s->num_taps < 32 || s->num_taps > 1024 || s->num_taps % 32) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid number of taps\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    // generate taps
+    s->tap_quant = av_calloc(s->num_taps, sizeof(*s->tap_quant));
+    if (!s->tap_quant)
+        return AVERROR(ENOMEM);
+
+    for (i = 0; i < s->num_taps; i++)
+        s->tap_quant[i] = ff_sqrt(i+1);
+
+    s->channels = avctx->channels;
+    s->samplerate = avctx->sample_rate;
+
+    s->block_align = 2048LL*s->samplerate/(44100*s->downsampling);
+    s->frame_size = s->channels*s->block_align*s->downsampling;
+
+    s->tail_size = s->num_taps*s->channels;
+    s->tail = av_calloc(s->tail_size, sizeof(*s->tail));
+    if (!s->tail)
+        return AVERROR(ENOMEM);
+
+    s->predictor_k = av_calloc(s->num_taps, sizeof(*s->predictor_k) );
+    if (!s->predictor_k)
+        return AVERROR(ENOMEM);
+
+    for (i = 0; i < s->channels; i++) {
+        s->coded_samples[i] = av_calloc(s->block_align, sizeof(**s->coded_samples));
+        if (!s->coded_samples[i])
+            return AVERROR(ENOMEM);
+    }
+
+    s->int_samples = av_calloc(s->frame_size, sizeof(*s->int_samples));
+    s->window_size = ((2*s->tail_size)+s->frame_size);
+    s->window = av_calloc(s->window_size, sizeof(*s->window));
+    if (!s->window || !s->int_samples)
+        return AVERROR(ENOMEM);
+
+    avctx->extradata = av_mallocz(16);
+    if (!avctx->extradata)
+        return AVERROR(ENOMEM);
+    init_put_bits(&pb, avctx->extradata, 16*8);
+
+    put_bits(&pb, 2, s->version); // version
+    if (s->version >= 1) {
+        if (s->version >= 2) {
+            put_bits(&pb, 8, s->version);
+            put_bits(&pb, 8, s->minor_version);
+        }
+        put_bits(&pb, 2, s->channels);
+        put_bits(&pb, 4, rate_code);
+    }
+    put_bits(&pb, 1, s->lossless);
+    if (!s->lossless)
+        put_bits(&pb, 3, SAMPLE_SHIFT); // XXX FIXME: sample precision
+    put_bits(&pb, 2, s->decorrelation);
+    put_bits(&pb, 2, s->downsampling);
+    put_bits(&pb, 5, (s->num_taps >> 5)-1); // 32..1024
+    put_bits(&pb, 1, 0); // XXX FIXME: no custom tap quant table
+
+    flush_put_bits(&pb);
+    avctx->extradata_size = put_bits_count(&pb)/8;
+
+    av_log(avctx, AV_LOG_INFO, "Sonic: ver: %d.%d ls: %d dr: %d taps: %d block: %d frame: %d downsamp: %d\n",
+        s->version, s->minor_version, s->lossless, s->decorrelation, s->num_taps, s->block_align, s->frame_size, s->downsampling);
+    avctx->frame_size = s->block_align*s->downsampling;
+    return 0;
+}
+
+/* Release every buffer owned by the encoder context; always returns 0. */
+static av_cold int sonic_encode_close(AVCodecContext *avctx)
+{
+    SonicContext *ctx = avctx->priv_data;
+    int ch = ctx->channels;
+
+    /* shared buffers first, then the per-channel ones */
+    av_freep(&ctx->int_samples);
+    av_freep(&ctx->window);
+    av_freep(&ctx->tap_quant);
+    av_freep(&ctx->tail);
+    av_freep(&ctx->predictor_k);
+    while (ch-- > 0)
+        av_freep(&ctx->coded_samples[ch]);
+    return 0;
+}
+
+/**
+ * Encode one frame of interleaved 16-bit samples into a range-coded packet:
+ * quantized predictor taps, then (lossy only) the quantizer, then one residual list per channel.
+ * @return 0 on success (with *got_packet_ptr set to 1), a negative AVERROR code otherwise
+ */
+static int sonic_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
+                              const AVFrame *frame, int *got_packet_ptr)
+{
+    SonicContext *s = avctx->priv_data;
+    RangeCoder c;
+    int i, j, ch, quant = 0, x = 0;
+    int ret;
+    const short *samples = (const int16_t*)frame->data[0];
+    uint8_t state[32];
+
+    if ((ret = ff_alloc_packet2(avctx, avpkt, s->frame_size * 5 + 1000, 0)) < 0)
+        return ret;
+
+    ff_init_range_encoder(&c, avpkt->data, avpkt->size);
+    ff_build_rac_states(&c, 0.05*(1LL<<32), 256-8);
+    memset(state, 128, sizeof(state));
+
+    // short -> internal; lossy mode works with SAMPLE_SHIFT extra low bits.
+    // Scale by multiplying: left-shifting a negative sample is undefined behavior.
+    for (i = 0; i < s->frame_size; i++)
+        s->int_samples[i] = s->lossless ? samples[i] : samples[i] * SAMPLE_FACTOR;
+
+    switch(s->decorrelation)
+    {
+        case MID_SIDE:
+            for (i = 0; i < s->frame_size; i += s->channels)
+            {
+                s->int_samples[i] += s->int_samples[i+1];
+                s->int_samples[i+1] -= shift(s->int_samples[i], 1);
+            }
+            break;
+        case LEFT_SIDE:
+            for (i = 0; i < s->frame_size; i += s->channels)
+                s->int_samples[i+1] -= s->int_samples[i];
+            break;
+        case RIGHT_SIDE:
+            for (i = 0; i < s->frame_size; i += s->channels)
+                s->int_samples[i] -= s->int_samples[i+1];
+            break;
+    }
+
+    // analysis window = previous tail | current frame | zero padding
+    memset(s->window, 0, sizeof(*s->window) * s->window_size);
+
+    for (i = 0; i < s->tail_size; i++)
+        s->window[x++] = s->tail[i];
+
+    for (i = 0; i < s->frame_size; i++)
+        s->window[x++] = s->int_samples[i];
+
+    for (i = 0; i < s->tail_size; i++)
+        s->window[x++] = 0;
+
+    // remember the end of this frame for the next call
+    for (i = 0; i < s->tail_size; i++)
+        s->tail[i] = s->int_samples[s->frame_size - s->tail_size + i];
+
+    // generate taps
+    ret = modified_levinson_durbin(s->window, s->window_size,
+                s->predictor_k, s->num_taps, s->channels, s->tap_quant);
+    if (ret < 0)
+        return ret;
+
+    if ((ret = intlist_write(&c, state, s->predictor_k, s->num_taps, 0)) < 0)
+        return ret;
+
+    for (ch = 0; ch < s->channels; ch++)
+    {
+        x = s->tail_size+ch;
+        for (i = 0; i < s->block_align; i++)
+        {
+            int sum = 0;
+            for (j = 0; j < s->downsampling; j++, x += s->channels)
+                sum += s->window[x];
+            s->coded_samples[ch][i] = sum;
+        }
+    }
+
+    // simple rate control code
+    if (!s->lossless)
+    {
+        double energy1 = 0.0, energy2 = 0.0;
+        for (ch = 0; ch < s->channels; ch++)
+        {
+            for (i = 0; i < s->block_align; i++)
+            {
+                double sample = s->coded_samples[ch][i];
+                energy2 += sample*sample;
+                energy1 += fabs(sample);
+            }
+        }
+
+        energy2 = sqrt(energy2/(s->channels*s->block_align));
+        energy1 = M_SQRT2*energy1/(s->channels*s->block_align);
+
+        // increase bitrate when samples are like a gaussian distribution
+        // reduce bitrate when samples are like a two-tailed exponential distribution
+
+        if (energy2 > energy1)
+            energy2 += (energy2-energy1)*RATE_VARIATION;
+
+        quant = (int)(BASE_QUANT*s->quantization*energy2/SAMPLE_FACTOR);
+
+        quant = av_clip(quant, 1, 65534);
+
+        put_symbol(&c, state, quant, 0, NULL, NULL);
+
+        quant *= SAMPLE_FACTOR;
+    }
+
+    // write out coded samples
+    for (ch = 0; ch < s->channels; ch++)
+    {
+        if (!s->lossless)
+            for (i = 0; i < s->block_align; i++)
+                s->coded_samples[ch][i] = ROUNDED_DIV(s->coded_samples[ch][i], quant);
+
+        if ((ret = intlist_write(&c, state, s->coded_samples[ch], s->block_align, 1)) < 0)
+            return ret;
+    }
+
+    avpkt->size = ff_rac_terminate(&c);
+    *got_packet_ptr = 1;
+    return 0;
+}
+#endif /* CONFIG_SONIC_ENCODER || CONFIG_SONIC_LS_ENCODER */
+
+#if CONFIG_SONIC_DECODER
+static const int samplerate_table[] = /* indexed by the 4-bit sample rate code read from the header; values in Hz */
+    { 44100, 22050, 11025, 96000, 48000, 32000, 24000, 16000, 8000 };
+
+/* Decoder init: parse the stream header from extradata, validate it and allocate the work buffers.
+ * Returns 0 on success or a negative AVERROR code (bad header, out of memory). */
+static av_cold int sonic_decode_init(AVCodecContext *avctx)
+{
+    SonicContext *s = avctx->priv_data;
+    GetBitContext gb;
+    int i;
+    int ret;
+
+    s->channels = avctx->channels;
+    s->samplerate = avctx->sample_rate;
+
+    if (!avctx->extradata) {
+        av_log(avctx, AV_LOG_ERROR, "No mandatory headers present\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    ret = init_get_bits8(&gb, avctx->extradata, avctx->extradata_size);
+    if (ret < 0)
+        return ret;
+
+    // 2-bit version field; values >= 2 are followed by explicit 8-bit major and minor versions
+    s->version = get_bits(&gb, 2);
+    if (s->version >= 2) {
+        s->version       = get_bits(&gb, 8);
+        s->minor_version = get_bits(&gb, 8);
+    }
+    if (s->version != 2) {
+        av_log(avctx, AV_LOG_ERROR, "Unsupported Sonic version, please report\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (s->version >= 1) {
+        int sample_rate_index;
+        s->channels = get_bits(&gb, 2);
+        sample_rate_index = get_bits(&gb, 4);
+        if (sample_rate_index >= FF_ARRAY_ELEMS(samplerate_table)) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid sample_rate_index %d\n", sample_rate_index);
+            return AVERROR_INVALIDDATA;
+        }
+        s->samplerate = samplerate_table[sample_rate_index];
+        av_log(avctx, AV_LOG_INFO, "Sonicv2 chans: %d samprate: %d\n",
+            s->channels, s->samplerate);
+    }
+
+    if (s->channels > MAX_CHANNELS || s->channels < 1) {
+        av_log(avctx, AV_LOG_ERROR, "Only mono and stereo streams are supported by now\n");
+        return AVERROR_INVALIDDATA;
+    }
+    avctx->channels = s->channels;
+
+    s->lossless = get_bits1(&gb);
+    if (!s->lossless)
+        skip_bits(&gb, 3); // XXX FIXME
+    s->decorrelation = get_bits(&gb, 2);
+    if (s->decorrelation != 3 && s->channels != 2) {
+        av_log(avctx, AV_LOG_ERROR, "invalid decorrelation %d\n", s->decorrelation);
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->downsampling = get_bits(&gb, 2);
+    if (!s->downsampling) {
+        av_log(avctx, AV_LOG_ERROR, "invalid downsampling value\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->num_taps = (get_bits(&gb, 5)+1)<<5;
+    if (get_bits1(&gb)) // XXX FIXME
+        av_log(avctx, AV_LOG_INFO, "Custom quant table\n");
+
+    // block_align coded values per channel expand to block_align*downsampling output samples
+    s->block_align = 2048LL*s->samplerate/(44100*s->downsampling);
+    s->frame_size = s->channels*s->block_align*s->downsampling;
+
+    if (s->num_taps * s->channels > s->frame_size) {
+        av_log(avctx, AV_LOG_ERROR,
+               "number of taps times channels (%d * %d) larger than frame size %d\n",
+               s->num_taps, s->channels, s->frame_size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    av_log(avctx, AV_LOG_INFO, "Sonic: ver: %d.%d ls: %d dr: %d taps: %d block: %d frame: %d downsamp: %d\n",
+        s->version, s->minor_version, s->lossless, s->decorrelation, s->num_taps, s->block_align, s->frame_size, s->downsampling);
+
+    // generate taps
+    s->tap_quant = av_calloc(s->num_taps, sizeof(*s->tap_quant));
+    if (!s->tap_quant)
+        return AVERROR(ENOMEM);
+
+    for (i = 0; i < s->num_taps; i++)
+        s->tap_quant[i] = ff_sqrt(i+1);
+
+    // every decoded frame writes into predictor_k, so a failed allocation must be caught here
+    s->predictor_k = av_calloc(s->num_taps, sizeof(*s->predictor_k));
+    if (!s->predictor_k)
+        return AVERROR(ENOMEM);
+
+    for (i = 0; i < s->channels; i++) {
+        s->predictor_state[i] = av_calloc(s->num_taps, sizeof(**s->predictor_state));
+        if (!s->predictor_state[i])
+            return AVERROR(ENOMEM);
+    }
+
+    for (i = 0; i < s->channels; i++) {
+        s->coded_samples[i] = av_calloc(s->block_align, sizeof(**s->coded_samples));
+        if (!s->coded_samples[i])
+            return AVERROR(ENOMEM);
+    }
+    s->int_samples = av_calloc(s->frame_size, sizeof(*s->int_samples));
+    if (!s->int_samples)
+        return AVERROR(ENOMEM);
+
+    avctx->sample_fmt = AV_SAMPLE_FMT_S16;
+    return 0;
+}
+
+/* Release every buffer owned by the decoder context; always returns 0. */
+static av_cold int sonic_decode_close(AVCodecContext *avctx)
+{
+    SonicContext *ctx = avctx->priv_data;
+    int ch = ctx->channels;
+
+    /* per-channel buffers */
+    while (ch-- > 0) {
+        av_freep(&ctx->coded_samples[ch]);
+        av_freep(&ctx->predictor_state[ch]);
+    }
+    /* shared buffers */
+    av_freep(&ctx->predictor_k);
+    av_freep(&ctx->tap_quant);
+    av_freep(&ctx->int_samples);
+    return 0;
+}
+
+/**
+ * Decode one packet into frame_size interleaved 16-bit samples.
+ * @return the packet size on success, 0 for an empty packet, or a negative AVERROR code
+ */
+static int sonic_decode_frame(AVCodecContext *avctx,
+                            void *data, int *got_frame_ptr,
+                            AVPacket *avpkt)
+{
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
+    SonicContext *s = avctx->priv_data;
+    RangeCoder c;
+    uint8_t state[32];
+    int i, quant, ch, j, ret;
+    int16_t *samples;
+    AVFrame *frame = data;
+
+    if (buf_size == 0) return 0;
+
+    frame->nb_samples = s->frame_size / avctx->channels;
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+    samples = (int16_t *)frame->data[0];
+
+    memset(state, 128, sizeof(state));
+    ff_init_range_decoder(&c, buf, buf_size);
+    ff_build_rac_states(&c, 0.05*(1LL<<32), 256-8);
+
+    intlist_read(&c, state, s->predictor_k, s->num_taps, 0);
+
+    // dequantize; packet-derived products are formed unsigned so they wrap instead of overflowing
+    for (i = 0; i < s->num_taps; i++)
+        s->predictor_k[i] *= (unsigned)s->tap_quant[i];
+
+    if (s->lossless)
+        quant = 1;
+    else
+        quant = get_symbol(&c, state, 0) * (unsigned)SAMPLE_FACTOR;
+
+    for (ch = 0; ch < s->channels; ch++)
+    {
+        int x = ch;
+
+        predictor_init_state(s->predictor_k, s->predictor_state[ch], s->num_taps);
+
+        intlist_read(&c, state, s->coded_samples[ch], s->block_align, 1);
+
+        for (i = 0; i < s->block_align; i++)
+        {
+            for (j = 0; j < s->downsampling - 1; j++)
+            {
+                s->int_samples[x] = predictor_calc_error(s->predictor_k, s->predictor_state[ch], s->num_taps, 0);
+                x += s->channels;
+            }
+
+            s->int_samples[x] = predictor_calc_error(s->predictor_k, s->predictor_state[ch], s->num_taps, s->coded_samples[ch][i] * (unsigned)quant);
+            x += s->channels;
+        }
+
+        for (i = 0; i < s->num_taps; i++)
+            s->predictor_state[ch][i] = s->int_samples[s->frame_size - s->channels + ch - i*s->channels];
+    }
+
+    switch(s->decorrelation)
+    {
+        case MID_SIDE:
+            for (i = 0; i < s->frame_size; i += s->channels)
+            {
+                s->int_samples[i+1] += shift(s->int_samples[i], 1);
+                s->int_samples[i] -= s->int_samples[i+1];
+            }
+            break;
+        case LEFT_SIDE:
+            for (i = 0; i < s->frame_size; i += s->channels)
+                s->int_samples[i+1] += s->int_samples[i];
+            break;
+        case RIGHT_SIDE:
+            for (i = 0; i < s->frame_size; i += s->channels)
+                s->int_samples[i] += s->int_samples[i+1];
+            break;
+    }
+
+    if (!s->lossless)
+        for (i = 0; i < s->frame_size; i++)
+            s->int_samples[i] = shift(s->int_samples[i], SAMPLE_SHIFT);
+
+    // internal -> short
+    for (i = 0; i < s->frame_size; i++)
+        samples[i] = av_clip_int16(s->int_samples[i]);
+
+    *got_frame_ptr = 1;
+
+    return buf_size;
+}
+
+AVCodec ff_sonic_decoder = { /* codec registration entry for the Sonic audio decoder */
+    .id             = AV_CODEC_ID_SONIC,
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .name           = "sonic",
+    .long_name      = NULL_IF_CONFIG_SMALL("Sonic"),
+    .capabilities   = AV_CODEC_CAP_EXPERIMENTAL | AV_CODEC_CAP_DR1,
+    .priv_data_size = sizeof(SonicContext),
+    .init           = sonic_decode_init,
+    .decode         = sonic_decode_frame,
+    .close          = sonic_decode_close,
+};
+#endif /* CONFIG_SONIC_DECODER */
+
+#if CONFIG_SONIC_ENCODER
+AVCodec ff_sonic_encoder = { /* lossy Sonic encoder; INIT_CLEANUP makes a failed .init run .close, freeing partial allocations */
+    .name           = "sonic",
+    .long_name      = NULL_IF_CONFIG_SMALL("Sonic"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_SONIC,
+    .priv_data_size = sizeof(SonicContext),
+    .init           = sonic_encode_init,
+    .encode2        = sonic_encode_frame,
+    .close          = sonic_encode_close,
+    .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_NONE },
+    .capabilities   = AV_CODEC_CAP_EXPERIMENTAL, .caps_internal = FF_CODEC_CAP_INIT_CLEANUP,
+};
+#endif
+
+#if CONFIG_SONIC_LS_ENCODER
+AVCodec ff_sonic_ls_encoder = { /* lossless Sonic encoder; INIT_CLEANUP makes a failed .init run .close, freeing partial allocations */
+    .name           = "sonicls",
+    .long_name      = NULL_IF_CONFIG_SMALL("Sonic lossless"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_SONIC_LS,
+    .priv_data_size = sizeof(SonicContext),
+    .init           = sonic_encode_init,
+    .encode2        = sonic_encode_frame,
+    .close          = sonic_encode_close,
+    .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_NONE },
+    .capabilities   = AV_CODEC_CAP_EXPERIMENTAL, .caps_internal = FF_CODEC_CAP_INIT_CLEANUP,
+};
+#endif
diff --git a/libavcodec/sp5x.h b/libavcodec/sp5x.h
index 090662b..21c4571 100644
--- a/libavcodec/sp5x.h
+++ b/libavcodec/sp5x.h
@@ -2,20 +2,20 @@
  * Sunplus JPEG tables
  * Copyright (c) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/sp5xdec.c b/libavcodec/sp5xdec.c
index 7f57b63..815f9ad 100644
--- a/libavcodec/sp5xdec.c
+++ b/libavcodec/sp5xdec.c
@@ -2,20 +2,20 @@
  * Sunplus JPEG decoder (SP5X)
  * Copyright (c) 2003 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -73,7 +73,7 @@ static int sp5x_decode_frame(AVCodecContext *avctx,
         for (i = 2; i < buf_size-2 && j < buf_size+1024-2; i++)
             recoded[j++] = buf[i];
     else
-    for (i = 14; i < buf_size && j < buf_size+1024-2; i++)
+    for (i = 14; i < buf_size && j < buf_size+1024-3; i++)
     {
         recoded[j++] = buf[i];
         if (buf[i] == 0xff)
@@ -91,9 +91,10 @@ static int sp5x_decode_frame(AVCodecContext *avctx,
 
     av_free(recoded);
 
-    return i;
+    return i < 0 ? i : avpkt->size;
 }
 
+#if CONFIG_SP5X_DECODER
 AVCodec ff_sp5x_decoder = {
     .name           = "sp5x",
     .long_name      = NULL_IF_CONFIG_SMALL("Sunplus JPEG (SP5X)"),
@@ -104,9 +105,11 @@ AVCodec ff_sp5x_decoder = {
     .close          = ff_mjpeg_decode_end,
     .decode         = sp5x_decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .max_lowres     = 3,
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
 };
-
+#endif
+#if CONFIG_AMV_DECODER
 AVCodec ff_amv_decoder = {
     .name           = "amv",
     .long_name      = NULL_IF_CONFIG_SMALL("AMV Video"),
@@ -116,6 +119,8 @@ AVCodec ff_amv_decoder = {
     .init           = ff_mjpeg_decode_init,
     .close          = ff_mjpeg_decode_end,
     .decode         = sp5x_decode_frame,
+    .max_lowres     = 3,
     .capabilities   = AV_CODEC_CAP_DR1,
     .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
 };
+#endif
diff --git a/libavcodec/sparc/README b/libavcodec/sparc/README
new file mode 100644
index 0000000..f9f2349
--- /dev/null
+++ b/libavcodec/sparc/README
@@ -0,0 +1,6 @@
+SPARC optimizations have been removed in
+commit b4dd424d96f09f9bafb88e47f37df65dc4529143
+The last revision with the optimizations is fb1b70c1ed50951c5fc1a309c3c446b2eaaf564b
+
+If you want to maintain these (or other) SPARC optimizations in ffmpeg, then please
+contact ffmpeg-devel@ffmpeg.org
diff --git a/libavcodec/srtdec.c b/libavcodec/srtdec.c
index 3bee3c7..30930c8 100644
--- a/libavcodec/srtdec.c
+++ b/libavcodec/srtdec.c
@@ -2,241 +2,111 @@
  * SubRip subtitle decoder
  * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/avstring.h"
 #include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
 #include "libavutil/parseutils.h"
 #include "avcodec.h"
 #include "ass.h"
+#include "htmlsubtitles.h"
 
-static int html_color_parse(AVCodecContext *avctx, const char *str)
+static void srt_to_ass(AVCodecContext *avctx, AVBPrint *dst,
+                       const char *in, int x1, int y1, int x2, int y2)
 {
-    uint8_t rgba[4];
-    if (av_parse_color(rgba, str, strcspn(str, "\" >"), avctx) < 0)
-        return -1;
-    return rgba[0] | rgba[1] << 8 | rgba[2] << 16;
-}
-
-enum {
-    PARAM_UNKNOWN = -1,
-    PARAM_SIZE,
-    PARAM_COLOR,
-    PARAM_FACE,
-    PARAM_NUMBER
-};
-
-typedef struct SrtStack {
-    char tag[128];
-    char param[PARAM_NUMBER][128];
-} SrtStack;
-
-static const char *srt_to_ass(AVCodecContext *avctx, char *out, char *out_end,
-                              const char *in, int x1, int y1, int x2, int y2)
-{
-    char c, *param, buffer[128], tmp[128];
-    int len, tag_close, sptr = 1, line_start = 1, an = 0, end = 0;
-    SrtStack stack[16];
-
-    stack[0].tag[0] = 0;
-    strcpy(stack[0].param[PARAM_SIZE],  "{\\fs}");
-    strcpy(stack[0].param[PARAM_COLOR], "{\\c}");
-    strcpy(stack[0].param[PARAM_FACE],  "{\\fn}");
-
     if (x1 >= 0 && y1 >= 0) {
-        if (x2 >= 0 && y2 >= 0 && (x2 != x1 || y2 != y1))
-            out += snprintf(out, out_end-out,
-                            "{\\an1}{\\move(%d,%d,%d,%d)}", x1, y1, x2, y2);
-        else
-            out += snprintf(out, out_end-out, "{\\an1}{\\pos(%d,%d)}", x1, y1);
-    }
-
-    for (; out < out_end && !end && *in; in++) {
-        switch (*in) {
-        case '\r':
-            break;
-        case '\n':
-            if (line_start) {
-                end = 1;
-                break;
-            }
-            while (out[-1] == ' ')
-                out--;
-            out += snprintf(out, out_end-out, "\\N");
-            line_start = 1;
-            break;
-        case ' ':
-            if (!line_start)
-                *out++ = *in;
-            break;
-        case '{':    /* skip all {\xxx} substrings except for {\an%d}
-                        and all microdvd like styles such as {Y:xxx} */
-            an += sscanf(in, "{\\an%*1u}%c", &c) == 1;
-            if ((an != 1 && sscanf(in, "{\\%*[^}]}%n%c", &len, &c) > 0) ||
-                sscanf(in, "{%*1[CcFfoPSsYy]:%*[^}]}%n%c", &len, &c) > 0) {
-                in += len - 1;
-            } else
-                *out++ = *in;
-            break;
-        case '<':
-            tag_close = in[1] == '/';
-            if (sscanf(in+tag_close+1, "%127[^>]>%n%c", buffer, &len,&c) >= 2) {
-                if ((param = strchr(buffer, ' ')))
-                    *param++ = 0;
-                if ((!tag_close && sptr < FF_ARRAY_ELEMS(stack)) ||
-                    ( tag_close && sptr > 0 && !strcmp(stack[sptr-1].tag, buffer))) {
-                    int i, j, unknown = 0;
-                    in += len + tag_close;
-                    if (!tag_close)
-                        memset(stack+sptr, 0, sizeof(*stack));
-                    if (!strcmp(buffer, "font")) {
-                        if (tag_close) {
-                            for (i=PARAM_NUMBER-1; i>=0; i--)
-                                if (stack[sptr-1].param[i][0])
-                                    for (j=sptr-2; j>=0; j--)
-                                        if (stack[j].param[i][0]) {
-                                            out += snprintf(out, out_end-out,
-                                                            "%s", stack[j].param[i]);
-                                            break;
-                                        }
-                        } else {
-                            while (param) {
-                                if (!strncmp(param, "size=", 5)) {
-                                    unsigned font_size;
-                                    param += 5 + (param[5] == '"');
-                                    if (sscanf(param, "%u", &font_size) == 1) {
-                                        snprintf(stack[sptr].param[PARAM_SIZE],
-                                             sizeof(stack[0].param[PARAM_SIZE]),
-                                             "{\\fs%u}", font_size);
-                                    }
-                                } else if (!strncmp(param, "color=", 6)) {
-                                    param += 6 + (param[6] == '"');
-                                    snprintf(stack[sptr].param[PARAM_COLOR],
-                                         sizeof(stack[0].param[PARAM_COLOR]),
-                                         "{\\c&H%X&}",
-                                         html_color_parse(avctx, param));
-                                } else if (!strncmp(param, "face=", 5)) {
-                                    param += 5 + (param[5] == '"');
-                                    len = strcspn(param,
-                                                  param[-1] == '"' ? "\"" :" ");
-                                    av_strlcpy(tmp, param,
-                                               FFMIN(sizeof(tmp), len+1));
-                                    param += len;
-                                    snprintf(stack[sptr].param[PARAM_FACE],
-                                             sizeof(stack[0].param[PARAM_FACE]),
-                                             "{\\fn%s}", tmp);
-                                }
-                                if ((param = strchr(param, ' ')))
-                                    param++;
-                            }
-                            for (i=0; i<PARAM_NUMBER; i++)
-                                if (stack[sptr].param[i][0])
-                                    out += snprintf(out, out_end-out,
-                                                    "%s", stack[sptr].param[i]);
-                        }
-                    } else if (!buffer[1] && strspn(buffer, "bisu") == 1) {
-                        out += snprintf(out, out_end-out,
-                                        "{\\%c%d}", buffer[0], !tag_close);
-                    } else {
-                        unknown = 1;
-                        snprintf(tmp, sizeof(tmp), "</%s>", buffer);
-                    }
-                    if (tag_close) {
-                        sptr--;
-                    } else if (unknown && !strstr(in, tmp)) {
-                        in -= len + tag_close;
-                        *out++ = *in;
-                    } else
-                        av_strlcpy(stack[sptr++].tag, buffer,
-                                   sizeof(stack[0].tag));
-                    break;
-                }
-            }
-        default:
-            *out++ = *in;
-            break;
+        /* XXX: here we rescale coordinate assuming they are in DVD resolution
+         * (720x480) since we don't have anything better */
+
+        if (x2 >= 0 && y2 >= 0 && (x2 != x1 || y2 != y1) && x2 >= x1 && y2 >= y1) {
+            /* text rectangle defined, write the text at the center of the rectangle */
+            const int cx = x1 + (x2 - x1)/2;
+            const int cy = y1 + (y2 - y1)/2;
+            const int scaled_x = cx * ASS_DEFAULT_PLAYRESX / 720;
+            const int scaled_y = cy * ASS_DEFAULT_PLAYRESY / 480;
+            av_bprintf(dst, "{\\an5}{\\pos(%d,%d)}", scaled_x, scaled_y);
+        } else {
+            /* only the top left corner, assume the text starts in that corner */
+            const int scaled_x = x1 * ASS_DEFAULT_PLAYRESX / 720;
+            const int scaled_y = y1 * ASS_DEFAULT_PLAYRESY / 480;
+            av_bprintf(dst, "{\\an1}{\\pos(%d,%d)}", scaled_x, scaled_y);
         }
-        if (*in != ' ' && *in != '\r' && *in != '\n')
-            line_start = 0;
     }
 
-    out = FFMIN(out, out_end-3);
-    while (!strncmp(out-2, "\\N", 2))
-        out -= 2;
-    while (out[-1] == ' ')
-        out--;
-    out += snprintf(out, out_end-out, "\r\n");
-    return in;
-}
-
-static const char *read_ts(const char *buf, int *ts_start, int *ts_end,
-                           int *x1, int *y1, int *x2, int *y2)
-{
-    int i, hs, ms, ss, he, me, se;
-
-    for (i=0; i<2; i++) {
-        /* try to read timestamps in either the first or second line */
-        int c = sscanf(buf, "%d:%2d:%2d%*1[,.]%3d --> %d:%2d:%2d%*1[,.]%3d"
-                       "%*[ ]X1:%u X2:%u Y1:%u Y2:%u",
-                       &hs, &ms, &ss, ts_start, &he, &me, &se, ts_end,
-                       x1, x2, y1, y2);
-        buf += strcspn(buf, "\n") + 1;
-        if (c >= 8) {
-            *ts_start = 100*(ss + 60*(ms + 60*hs)) + *ts_start/10;
-            *ts_end   = 100*(se + 60*(me + 60*he)) + *ts_end  /10;
-            return buf;
-        }
-    }
-    return NULL;
+    ff_htmlmarkup_to_ass(avctx, dst, in);
 }
 
 static int srt_decode_frame(AVCodecContext *avctx,
                             void *data, int *got_sub_ptr, AVPacket *avpkt)
 {
     AVSubtitle *sub = data;
-    int ts_start, ts_end, x1 = -1, y1 = -1, x2 = -1, y2 = -1;
-    char buffer[2048];
-    const char *ptr = avpkt->data;
-    const char *end = avpkt->data + avpkt->size;
+    AVBPrint buffer;
+    int x1 = -1, y1 = -1, x2 = -1, y2 = -1;
+    int size, ret;
+    const uint8_t *p = av_packet_get_side_data(avpkt, AV_PKT_DATA_SUBTITLE_POSITION, &size);
+    FFASSDecoderContext *s = avctx->priv_data;
+
+    if (p && size == 16) {
+        x1 = AV_RL32(p     );
+        y1 = AV_RL32(p +  4);
+        x2 = AV_RL32(p +  8);
+        y2 = AV_RL32(p + 12);
+    }
 
     if (avpkt->size <= 0)
         return avpkt->size;
 
-    ff_ass_init(sub);
+    av_bprint_init(&buffer, 0, AV_BPRINT_SIZE_UNLIMITED);
 
-    while (ptr < end && *ptr) {
-        ptr = read_ts(ptr, &ts_start, &ts_end, &x1, &y1, &x2, &y2);
-        if (!ptr)
-            break;
-        ptr = srt_to_ass(avctx, buffer, buffer+sizeof(buffer), ptr,
-                         x1, y1, x2, y2);
-        ff_ass_add_rect(sub, buffer, ts_start, ts_end, 0);
-    }
+    srt_to_ass(avctx, &buffer, avpkt->data, x1, y1, x2, y2);
+    ret = ff_ass_add_rect(sub, buffer.str, s->readorder++, 0, NULL, NULL);
+    av_bprint_finalize(&buffer, NULL);
+    if (ret < 0)
+        return ret;
 
     *got_sub_ptr = sub->num_rects > 0;
     return avpkt->size;
 }
 
+#if CONFIG_SRT_DECODER
+/* deprecated decoder */
 AVCodec ff_srt_decoder = {
     .name         = "srt",
     .long_name    = NULL_IF_CONFIG_SMALL("SubRip subtitle"),
     .type         = AVMEDIA_TYPE_SUBTITLE,
-    .id           = AV_CODEC_ID_SRT,
+    .id           = AV_CODEC_ID_SUBRIP,
+    .init         = ff_ass_subtitle_header_default,
+    .decode       = srt_decode_frame,
+    .flush        = ff_ass_decoder_flush,
+    .priv_data_size = sizeof(FFASSDecoderContext),
+};
+#endif
+
+#if CONFIG_SUBRIP_DECODER
+AVCodec ff_subrip_decoder = {
+    .name         = "subrip",
+    .long_name    = NULL_IF_CONFIG_SMALL("SubRip subtitle"),
+    .type         = AVMEDIA_TYPE_SUBTITLE,
+    .id           = AV_CODEC_ID_SUBRIP,
     .init         = ff_ass_subtitle_header_default,
     .decode       = srt_decode_frame,
+    .flush        = ff_ass_decoder_flush,
+    .priv_data_size = sizeof(FFASSDecoderContext),
 };
+#endif
diff --git a/libavcodec/srtenc.c b/libavcodec/srtenc.c
new file mode 100644
index 0000000..34f0f0d
--- /dev/null
+++ b/libavcodec/srtenc.c
@@ -0,0 +1,344 @@
+/*
+ * SubRip subtitle encoder
+ * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdarg.h>
+#include "avcodec.h"
+#include "libavutil/avstring.h"
+#include "libavutil/bprint.h"
+#include "ass_split.h"
+#include "ass.h"
+
+
+#define SRT_STACK_SIZE 64   /* capacity of the open-tag stack in SRTContext */
+
+typedef struct {
+    AVCodecContext *avctx;       /* codec context, kept for av_log() */
+    ASSSplitContext *ass_ctx;    /* result of ff_ass_split() on the subtitle header */
+    AVBPrint buffer;             /* text produced for the frame being encoded */
+    char stack[SRT_STACK_SIZE];  /* one letter per tag that is currently open */
+    int stack_ptr;               /* number of used entries in stack */
+    int alignment_applied;       /* nonzero once an {\an} code was written for the dialog */
+} SRTContext;
+
+/* printf-style helper: formats its arguments and appends them to s->buffer. */
+#ifdef __GNUC__
+__attribute__ ((__format__ (__printf__, 2, 3)))
+#endif
+static void srt_print(SRTContext *s, const char *str, ...)
+{
+    va_list ap;
+    va_start(ap, str);
+    av_vbprintf(&s->buffer, str, ap);
+    va_end(ap);
+}
+/* Push tag letter c; returns 0, or -1 when all SRT_STACK_SIZE slots are used. */
+static int srt_stack_push(SRTContext *s, const char c)
+{
+    const int full = s->stack_ptr >= SRT_STACK_SIZE;
+    if (!full)
+        s->stack[s->stack_ptr++] = c;
+    return full ? -1 : 0;
+}
+
+/* Pop and return the most recently pushed tag letter;
+ * returns 0 when the stack is empty. */
+static char srt_stack_pop(SRTContext *s)
+{
+    return s->stack_ptr > 0 ? s->stack[--s->stack_ptr] : 0;
+}
+
+/* Index of the topmost stack entry equal to c, or -1 when c is not on the stack. */
+static int srt_stack_find(SRTContext *s, const char c)
+{
+    int pos = s->stack_ptr - 1;
+    while (pos >= 0 && s->stack[pos] != c)
+        pos--;
+    return pos;
+}
+/* Write the closing tag for a one-letter tag; 'f' is spelled out as "</font>". */
+static void srt_close_tag(SRTContext *s, char tag)
+{
+    srt_print(s, "</%c%s>", tag, tag == 'f' ? "ont" : "");
+}
+/* close: shut all tags down to and including the topmost c (every tag if c is 0); else push c. */
+static void srt_stack_push_pop(SRTContext *s, const char c, int close)
+{
+    if (!close) {
+        if (srt_stack_push(s, c) < 0)
+            av_log(s->avctx, AV_LOG_ERROR, "tag stack overflow\n");
+    } else {
+        const int target = c ? srt_stack_find(s, c) : 0;
+        while (target >= 0 && s->stack_ptr != target)
+            srt_close_tag(s, srt_stack_pop(s));
+    }
+}
+/* Emit opening tags for every property of the named style that differs from the ASS defaults. */
+static void srt_style_apply(SRTContext *s, const char *style)
+{
+    ASSStyle *st = ff_ass_style_get(s->ass_ctx, style);
+    int rgb, custom_face;
+    if (!st)
+        return;
+    rgb         = st->primary_color & 0xFFFFFF;
+    custom_face = st->font_name && strcmp(st->font_name, ASS_DEFAULT_FONT);
+    if (custom_face || st->font_size != ASS_DEFAULT_FONT_SIZE || rgb != ASS_DEFAULT_COLOR) {
+        srt_print(s, "<font");
+        if (custom_face)
+            srt_print(s, " face=\"%s\"", st->font_name);
+        if (st->font_size != ASS_DEFAULT_FONT_SIZE)
+            srt_print(s, " size=\"%d\"", st->font_size);
+        if (rgb != ASS_DEFAULT_COLOR) /* bytes 0 and 2 are swapped for the printed value */
+            srt_print(s, " color=\"#%06x\"",
+                      (rgb & 0xFF0000) >> 16 | (rgb & 0xFF00) | (rgb & 0xFF) << 16);
+        srt_print(s, ">");
+        srt_stack_push(s, 'f');
+    }
+    if (st->bold != ASS_DEFAULT_BOLD) {
+        srt_print(s, "<b>");
+        srt_stack_push(s, 'b');
+    }
+    if (st->italic != ASS_DEFAULT_ITALIC) {
+        srt_print(s, "<i>");
+        srt_stack_push(s, 'i');
+    }
+    if (st->underline != ASS_DEFAULT_UNDERLINE) {
+        srt_print(s, "<u>");
+        srt_stack_push(s, 'u');
+    }
+    if (st->alignment != ASS_DEFAULT_ALIGNMENT) {
+        srt_print(s, "{\\an%d}", st->alignment);
+        s->alignment_applied = 1;
+    }
+}
+
+/* Split the ASS subtitle header and set up the output buffer; fails if the split yields nothing. */
+static av_cold int srt_encode_init(AVCodecContext *avctx)
+{
+    SRTContext *ctx = avctx->priv_data;
+    av_bprint_init(&ctx->buffer, 0, AV_BPRINT_SIZE_UNLIMITED);
+    ctx->avctx   = avctx;
+    ctx->ass_ctx = ff_ass_split(avctx->subtitle_header);
+    return ctx->ass_ctx ? 0 : AVERROR_INVALIDDATA;
+}
+
+/* Text callback: append len bytes of text to the output buffer unchanged. */
+static void srt_text_cb(void *priv, const char *text, int len)
+{
+    av_bprint_append_data(&((SRTContext *)priv)->buffer, text, len);
+}
+/* New-line callback: every line break is written as CRLF; forced is not used. */
+static void srt_new_line_cb(void *priv, int forced)
+{
+    srt_print(priv, "\r\n");
+}
+/* Style callback for a one-letter tag: close it (and tags opened after it), or push and open it. */
+static void srt_style_cb(void *priv, char style, int close)
+{
+    srt_stack_push_pop(priv, style, close);
+    if (!close)
+        srt_print(priv, "<%c>", style);
+}
+/* Colour callback: ids above 1 are ignored; 0xFFFFFFFF closes the font tag, else a new one opens. */
+static void srt_color_cb(void *priv, unsigned int color, unsigned int color_id)
+{
+    const unsigned int rgb = (color & 0xFF0000) >> 16 | (color & 0xFF00) | (color & 0xFF) << 16;
+    if (color_id > 1)
+        return;
+    srt_stack_push_pop(priv, 'f', color == 0xFFFFFFFF);
+    if (color != 0xFFFFFFFF)
+        srt_print(priv, "<font color=\"#%06x\">", rgb);
+}
+/* Font-name callback: a NULL name closes down to the last font tag, otherwise <font face> opens. */
+static void srt_font_name_cb(void *priv, const char *name)
+{
+    srt_stack_push_pop(priv, 'f', !name);
+    if (name)
+        srt_print(priv, "<font face=\"%s\">", name);
+}
+/* Font-size callback: a negative size closes down to the last font tag, otherwise <font size> opens. */
+static void srt_font_size_cb(void *priv, int size)
+{
+    srt_stack_push_pop(priv, 'f', size < 0);
+    if (size >= 0)
+        srt_print(priv, "<font size=\"%d\">", size);
+}
+/* Alignment callback: only the first non-negative alignment of a dialog is written as {\an<n>}. */
+static void srt_alignment_cb(void *priv, int alignment)
+{
+    SRTContext *ctx = priv;
+    if (ctx->alignment_applied || alignment < 0)
+        return;
+    srt_print(ctx, "{\\an%d}", alignment);
+    ctx->alignment_applied = 1;
+}
+/* Cancel-overrides callback: close every open tag, then re-apply the named style. */
+static void srt_cancel_overrides_cb(void *priv, const char *style)
+{
+    srt_stack_push_pop(priv, 0, 1);
+    srt_style_apply(priv, style);
+}
+
+static void srt_move_cb(void *priv, int x1, int y1, int x2, int y2,
+                        int t1, int t2)
+{
+    // Intentionally empty: all arguments are ignored and nothing is written.
+    // TODO: add AV_PKT_DATA_SUBTITLE_POSITION side data once a subtitle encoding API passing the AVPacket exists.
+}
+/* End callback: close every tag that is still on the stack. */
+static void srt_end_cb(void *priv)
+{
+    srt_stack_push_pop(priv, 0, 1);
+}
+
+static const ASSCodesCallbacks srt_callbacks = { /* callback set passed by srt_encode_frame() */
+    .text             = srt_text_cb,
+    .new_line         = srt_new_line_cb,
+    .style            = srt_style_cb,
+    .color            = srt_color_cb,
+    .font_name        = srt_font_name_cb,
+    .font_size        = srt_font_size_cb,
+    .alignment        = srt_alignment_cb,
+    .cancel_overrides = srt_cancel_overrides_cb,
+    .move             = srt_move_cb,   /* currently a no-op */
+    .end              = srt_end_cb,
+};
+
+static const ASSCodesCallbacks text_callbacks = { /* used by text_encode_frame(): no styling callbacks registered */
+    .text             = srt_text_cb,
+    .new_line         = srt_new_line_cb,
+};
+/* Render every ASS rect of sub through cb into buf; returns bytes written (0 if none) or an AVERROR. */
+static int encode_frame(AVCodecContext *avctx,
+                        unsigned char *buf, int bufsize, const AVSubtitle *sub,
+                        const ASSCodesCallbacks *cb)
+{
+    SRTContext *s = avctx->priv_data;
+    ASSDialog *dialog;
+    int i;
+
+    av_bprint_clear(&s->buffer);
+    /* All rects are appended to s->buffer; it is copied to buf once, after the loop. */
+    for (i=0; i<sub->num_rects; i++) {
+        const char *ass = sub->rects[i]->ass;
+
+        if (sub->rects[i]->type != SUBTITLE_ASS) {
+            av_log(avctx, AV_LOG_ERROR, "Only SUBTITLE_ASS type supported.\n");
+            return AVERROR(ENOSYS);
+        }
+
+#if FF_API_ASS_TIMING
+        if (!strncmp(ass, "Dialogue: ", 10)) { /* "Dialogue: "-prefixed text may split into num dialogs */
+            int num;
+            dialog = ff_ass_split_dialog(s->ass_ctx, ass, 0, &num);
+            for (; dialog && num--; dialog++) {
+                s->alignment_applied = 0;
+                if (avctx->codec_id == AV_CODEC_ID_SUBRIP)
+                    srt_style_apply(s, dialog->style);
+                ff_ass_split_override_codes(cb, s, dialog->text);
+            }
+        } else {
+#endif
+            dialog = ff_ass_split_dialog2(s->ass_ctx, ass);
+            if (!dialog)
+                return AVERROR(ENOMEM);
+            s->alignment_applied = 0;
+            if (avctx->codec_id == AV_CODEC_ID_SUBRIP) /* style tags are only written for SubRip output */
+                srt_style_apply(s, dialog->style);
+            ff_ass_split_override_codes(cb, s, dialog->text);
+            ff_ass_free_dialog(&dialog);
+#if FF_API_ASS_TIMING
+        }
+#endif
+    }
+
+    if (!av_bprint_is_complete(&s->buffer))
+        return AVERROR(ENOMEM);
+    if (!s->buffer.len)
+        return 0;
+
+    if (s->buffer.len > bufsize) {
+        av_log(avctx, AV_LOG_ERROR, "Buffer too small for ASS event.\n");
+        return AVERROR_BUFFER_TOO_SMALL; /* a bare -1 would be reported as AVERROR(EPERM) */
+    }
+    memcpy(buf, s->buffer.str, s->buffer.len);
+
+    return s->buffer.len;
+}
+/* encode_sub entry point of the "srt" and "subrip" encoders: encodes with the full srt_callbacks set. */
+static int srt_encode_frame(AVCodecContext *avctx,
+                               unsigned char *buf, int bufsize, const AVSubtitle *sub)
+{
+    return encode_frame(avctx, buf, bufsize, sub, &srt_callbacks);
+}
+/* encode_sub entry point of the "text" encoder: encodes with text_callbacks (text and line breaks). */
+static int text_encode_frame(AVCodecContext *avctx,
+                             unsigned char *buf, int bufsize, const AVSubtitle *sub)
+{
+    return encode_frame(avctx, buf, bufsize, sub, &text_callbacks);
+}
+/* Release the output buffer and the split ASS header; always returns 0. */
+static int srt_encode_close(AVCodecContext *avctx)
+{
+    SRTContext *ctx = avctx->priv_data;
+    av_bprint_finalize(&ctx->buffer, NULL);
+    ff_ass_split_free(ctx->ass_ctx);
+    return 0;
+}
+
+#if CONFIG_SRT_ENCODER
+/* deprecated encoder */
+AVCodec ff_srt_encoder = { /* same callbacks and codec id as ff_subrip_encoder, registered as "srt" */
+    .name           = "srt",
+    .long_name      = NULL_IF_CONFIG_SMALL("SubRip subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_SUBRIP,
+    .priv_data_size = sizeof(SRTContext),
+    .init           = srt_encode_init,
+    .encode_sub     = srt_encode_frame,
+    .close          = srt_encode_close,
+};
+#endif
+
+#if CONFIG_SUBRIP_ENCODER
+AVCodec ff_subrip_encoder = { /* SubRip subtitle encoder, registered as "subrip" */
+    .name           = "subrip",
+    .long_name      = NULL_IF_CONFIG_SMALL("SubRip subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_SUBRIP,
+    .priv_data_size = sizeof(SRTContext),
+    .init           = srt_encode_init,
+    .encode_sub     = srt_encode_frame,
+    .close          = srt_encode_close,
+};
+#endif
+
+#if CONFIG_TEXT_ENCODER
+AVCodec ff_text_encoder = { /* raw text encoder: shares init/close with SubRip, encodes via text_encode_frame */
+    .name           = "text",
+    .long_name      = NULL_IF_CONFIG_SMALL("Raw text subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_TEXT,
+    .priv_data_size = sizeof(SRTContext),
+    .init           = srt_encode_init,
+    .encode_sub     = text_encode_frame,
+    .close          = srt_encode_close,
+};
+#endif
diff --git a/libavcodec/startcode.c b/libavcodec/startcode.c
index 5c1ec84..9efdffe 100644
--- a/libavcodec/startcode.c
+++ b/libavcodec/startcode.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/startcode.h b/libavcodec/startcode.h
index f38ce54..cfa02b0 100644
--- a/libavcodec/startcode.h
+++ b/libavcodec/startcode.h
@@ -1,21 +1,27 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+/**
+ * @file
+ * Accelerated start code search function for start codes common to
+ * MPEG-1/2/4 video, VC-1, H.264/5
+ */
+
 #ifndef AVCODEC_STARTCODE_H
 #define AVCODEC_STARTCODE_H
 
diff --git a/libavcodec/subviewerdec.c b/libavcodec/subviewerdec.c
new file mode 100644
index 0000000..805c7dd
--- /dev/null
+++ b/libavcodec/subviewerdec.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SubViewer subtitle decoder
+ * @see https://en.wikipedia.org/wiki/SubViewer
+ */
+
+#include "avcodec.h"
+#include "ass.h"
+#include "libavutil/bprint.h"
+
+static int subviewer_event_to_ass(AVBPrint *buf, const char *p)
+{
+    /* Copy the event text, turning "[br]" and inner newlines into ASS "\N" breaks. */
+    for (; *p; p++) {
+        if (!strncmp(p, "[br]", 4)) {
+            av_bprintf(buf, "\\N");
+            p += 3; /* the loop increment skips the closing ']' */
+        } else if (*p == '\n') {
+            if (p[1]) /* a newline that ends the text is dropped */
+                av_bprintf(buf, "\\N");
+        } else if (*p != '\r') {
+            av_bprint_chars(buf, *p, 1);
+        }
+    }
+
+    return 0;
+}
+
+static int subviewer_decode_frame(AVCodecContext *avctx,
+                                  void *data, int *got_sub_ptr, AVPacket *avpkt)
+{
+    FFASSDecoderContext *ctx = avctx->priv_data;
+    AVSubtitle *sub = data;
+    const char *text = avpkt->data;
+    AVBPrint event;
+    int err = 0;
+
+    /* A non-empty packet is converted to ASS text and added to sub as one rect. */
+    av_bprint_init(&event, 0, AV_BPRINT_SIZE_UNLIMITED);
+    if (text && avpkt->size > 0 && !subviewer_event_to_ass(&event, text))
+        err = ff_ass_add_rect(sub, event.str, ctx->readorder++, 0, NULL, NULL);
+    av_bprint_finalize(&event, NULL);
+    if (err >= 0)
+        *got_sub_ptr = sub->num_rects > 0;
+    return err < 0 ? err : avpkt->size;
+}
+
+AVCodec ff_subviewer_decoder = { /* registration entry for the SubViewer subtitle decoder */
+    .name           = "subviewer",
+    .long_name      = NULL_IF_CONFIG_SMALL("SubViewer subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_SUBVIEWER,
+    .decode         = subviewer_decode_frame,
+    .init           = ff_ass_subtitle_header_default,
+    .flush          = ff_ass_decoder_flush,
+    .priv_data_size = sizeof(FFASSDecoderContext), /* the decoder reads and increments its readorder field */
+};
diff --git a/libavcodec/sunrast.c b/libavcodec/sunrast.c
index 6a928bb..25e11f6 100644
--- a/libavcodec/sunrast.c
+++ b/libavcodec/sunrast.c
@@ -2,20 +2,20 @@
  * Sun Rasterfile (.sun/.ras/im{1,8,24}/.sunras) image decoder
  * Copyright (c) 2007, 2008 Ivo van Poorten
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,7 +33,7 @@ static int sunrast_decode_frame(AVCodecContext *avctx, void *data,
     const uint8_t *buf_end   = avpkt->data + avpkt->size;
     AVFrame * const p        = data;
     unsigned int w, h, depth, type, maptype, maplength, stride, x, y, len, alen;
-    uint8_t *ptr;
+    uint8_t *ptr, *ptr2 = NULL;
     const uint8_t *bufstart = buf;
     int ret;
 
@@ -53,7 +53,7 @@ static int sunrast_decode_frame(AVCodecContext *avctx, void *data,
     maplength = AV_RB32(buf + 28);
     buf      += 32;
 
-    if (type == RT_FORMAT_TIFF || type == RT_FORMAT_IFF || type == RT_EXPERIMENTAL) {
+    if (type == RT_EXPERIMENTAL) {
         avpriv_request_sample(avctx, "TIFF/IFF/EXPERIMENTAL (compression) type");
         return AVERROR_PATCHWELCOME;
     }
@@ -70,10 +70,17 @@ static int sunrast_decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR_INVALIDDATA;
     }
 
+    if (type == RT_FORMAT_TIFF || type == RT_FORMAT_IFF) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported (compression) type\n");
+        return -1;
+    }
 
     switch (depth) {
         case 1:
-            avctx->pix_fmt = AV_PIX_FMT_MONOWHITE;
+            avctx->pix_fmt = maplength ? AV_PIX_FMT_PAL8 : AV_PIX_FMT_MONOWHITE;
+            break;
+        case 4:
+            avctx->pix_fmt = maplength ? AV_PIX_FMT_PAL8 : AV_PIX_FMT_NONE;
             break;
         case 8:
             avctx->pix_fmt = maplength ? AV_PIX_FMT_PAL8 : AV_PIX_FMT_GRAY8;
@@ -81,6 +88,9 @@ static int sunrast_decode_frame(AVCodecContext *avctx, void *data,
         case 24:
             avctx->pix_fmt = (type == RT_FORMAT_RGB) ? AV_PIX_FMT_RGB24 : AV_PIX_FMT_BGR24;
             break;
+        case 32:
+            avctx->pix_fmt = (type == RT_FORMAT_RGB) ? AV_PIX_FMT_0RGB : AV_PIX_FMT_0BGR;
+            break;
         default:
             av_log(avctx, AV_LOG_ERROR, "invalid depth\n");
             return AVERROR_INVALIDDATA;
@@ -90,17 +100,15 @@ static int sunrast_decode_frame(AVCodecContext *avctx, void *data,
     if (ret < 0)
         return ret;
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
 
     p->pict_type = AV_PICTURE_TYPE_I;
 
     if (buf_end - buf < maplength)
         return AVERROR_INVALIDDATA;
 
-    if (depth != 8 && maplength) {
+    if (depth > 8 && maplength) {
         av_log(avctx, AV_LOG_WARNING, "useless colormap found or file is corrupted, trying to recover\n");
 
     } else if (maplength) {
@@ -113,13 +121,20 @@ static int sunrast_decode_frame(AVCodecContext *avctx, void *data,
 
         ptr = p->data[1];
         for (x = 0; x < len; x++, ptr += 4)
-            *(uint32_t *)ptr = (buf[x] << 16) + (buf[len + x] << 8) + buf[len + len + x];
+            *(uint32_t *)ptr = (0xFFU<<24) + (buf[x]<<16) + (buf[len+x]<<8) + buf[len+len+x];
     }
 
     buf += maplength;
 
+    if (maplength && depth < 8) {
+        ptr = ptr2 = av_malloc_array((w + 15), h);
+        if (!ptr)
+            return AVERROR(ENOMEM);
+        stride = (w + 15 >> 3) * depth;
+    } else {
     ptr    = p->data[0];
     stride = p->linesize[0];
+    }
 
     /* scanlines are aligned on 16 bit boundaries */
     len  = (depth * w + 7) >> 3;
@@ -160,6 +175,30 @@ static int sunrast_decode_frame(AVCodecContext *avctx, void *data,
             buf += alen;
         }
     }
+    if (avctx->pix_fmt == AV_PIX_FMT_PAL8 && depth < 8) {
+        uint8_t *ptr_free = ptr2;
+        ptr = p->data[0];
+        for (y=0; y<h; y++) {
+            for (x = 0; x < (w + 7 >> 3) * depth; x++) {
+                if (depth == 1) {
+                    ptr[8*x]   = ptr2[x] >> 7;
+                    ptr[8*x+1] = ptr2[x] >> 6 & 1;
+                    ptr[8*x+2] = ptr2[x] >> 5 & 1;
+                    ptr[8*x+3] = ptr2[x] >> 4 & 1;
+                    ptr[8*x+4] = ptr2[x] >> 3 & 1;
+                    ptr[8*x+5] = ptr2[x] >> 2 & 1;
+                    ptr[8*x+6] = ptr2[x] >> 1 & 1;
+                    ptr[8*x+7] = ptr2[x]      & 1;
+                } else {
+                    ptr[2*x]   = ptr2[x] >> 4;
+                    ptr[2*x+1] = ptr2[x] & 0xF;
+                }
+            }
+            ptr  += p->linesize[0];
+            ptr2 += (w + 15 >> 3) * depth;
+        }
+        av_freep(&ptr_free);
+    }
 
     *got_frame = 1;
 
diff --git a/libavcodec/sunrast.h b/libavcodec/sunrast.h
index d9fe307..d162e63 100644
--- a/libavcodec/sunrast.h
+++ b/libavcodec/sunrast.h
@@ -2,20 +2,20 @@
  * Sun Rasterfile Image Format
  * Copyright (c) 2007, 2008 Ivo van Poorten
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/sunrastenc.c b/libavcodec/sunrastenc.c
index 3a5f410..97b2242 100644
--- a/libavcodec/sunrastenc.c
+++ b/libavcodec/sunrastenc.c
@@ -2,20 +2,20 @@
  * Sun Rasterfile (.sun/.ras/im{1,8,24}/.sunras) image encoder
  * Copyright (c) 2012 Aneesh Dogra (lionaneesh) <lionaneesh@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -59,7 +59,7 @@ static void sunrast_image_write_image(AVCodecContext *avctx,
 {
     SUNRASTContext *s = avctx->priv_data;
     const uint8_t *ptr;
-    int len, alen, x;
+    int len, alen, x, y;
 
     if (s->maplength) {     // palettized
         PutByteContext pb_r, pb_g;
@@ -86,33 +86,29 @@ static void sunrast_image_write_image(AVCodecContext *avctx,
      if (s->type == RT_BYTE_ENCODED) {
         uint8_t value, value2;
         int run;
-        const uint8_t *start = linesize < 0 ? pixels + (avctx->height - 1) * linesize
-                                            : pixels;
-        const uint8_t *end   = linesize < 0 ? pixels - linesize
-                                            : pixels + avctx->height * linesize;
 
         ptr = pixels;
 
-#define GET_VALUE ptr >= end || ptr < start ? 0 : x >= len ? ptr[len-1] : ptr[x]
+#define GET_VALUE y >= avctx->height ? 0 : x >= len ? ptr[len-1] : ptr[x]
 
-        x = 0;
+        x = 0, y = 0;
         value2 = GET_VALUE;
-        while (ptr < end && ptr >= start) {
+        while (y < avctx->height) {
             run = 1;
             value = value2;
             x++;
             if (x >= alen) {
                 x = 0;
-                ptr += linesize;
+                ptr += linesize, y++;
             }
 
             value2 = GET_VALUE;
-            while (value2 == value && run < 256 && ptr < end && ptr >= start) {
+            while (value2 == value && run < 256 && y < avctx->height) {
                 x++;
                 run++;
                 if (x >= alen) {
                     x = 0;
-                    ptr += linesize;
+                    ptr += linesize, y++;
                 }
                 value2 = GET_VALUE;
             }
@@ -131,7 +127,6 @@ static void sunrast_image_write_image(AVCodecContext *avctx,
         // update data length for header
         s->length = bytestream2_tell_p(&s->p) - 32 - s->maplength;
     } else {
-        int y;
         for (y = 0; y < avctx->height; y++) {
             bytestream2_put_buffer(&s->p, ptr, len);
             if (len < alen)
@@ -164,12 +159,6 @@ FF_ENABLE_DEPRECATION_WARNINGS
     // adjust boolean option to RT equivalent
     s->type++;
 
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-    avctx->coded_frame->key_frame = 1;
-    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
     s->maptype                    = RMT_NONE;
     s->maplength                  = 0;
 
@@ -202,7 +191,7 @@ static int sunrast_encode_frame(AVCodecContext *avctx,  AVPacket *avpkt,
     SUNRASTContext *s = avctx->priv_data;
     int ret;
 
-    if ((ret = ff_alloc_packet(avpkt, s->size)) < 0)
+    if ((ret = ff_alloc_packet2(avctx, avpkt, s->size, 0)) < 0)
         return ret;
 
     bytestream2_init_writer(&s->p, avpkt->data, avpkt->size);
@@ -248,12 +237,12 @@ AVCodec ff_sunrast_encoder = {
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_SUNRAST,
     .priv_data_size = sizeof(SUNRASTContext),
-    .priv_class     = &sunrast_class,
     .init           = sunrast_encode_init,
     .encode2        = sunrast_encode_frame,
 #if FF_API_CODER_TYPE
     .defaults       = sunrast_defaults,
 #endif
+    .priv_class     = &sunrast_class,
     .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_BGR24,
                                                   AV_PIX_FMT_PAL8,
                                                   AV_PIX_FMT_GRAY8,
diff --git a/libavcodec/svq1.c b/libavcodec/svq1.c
index b9922a7..cc214f9 100644
--- a/libavcodec/svq1.c
+++ b/libavcodec/svq1.c
@@ -3,25 +3,25 @@
  * ported to MPlayer by Arpi <arpi@thot.banki.hu>
  * ported to libavcodec by Nick Kurshev <nickols_k@mail.ru>
  *
- * Copyright (C) 2002 the xine project
- * Copyright (C) 2002 The FFmpeg project
+ * Copyright (c) 2002 The Xine project
+ * Copyright (c) 2002 The FFmpeg project
  *
  * SVQ1 Encoder (c) 2004 Mike Melanson <melanson@pcisys.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/svq1.h b/libavcodec/svq1.h
index 988a0a0..63c0479 100644
--- a/libavcodec/svq1.h
+++ b/libavcodec/svq1.h
@@ -3,25 +3,25 @@
  * ported to MPlayer by Arpi <arpi@thot.banki.hu>
  * ported to libavcodec by Nick Kurshev <nickols_k@mail.ru>
  *
- * Copyright (C) 2002 the xine project
- * Copyright (C) 2002 The FFmpeg project
+ * Copyright (c) 2002 The Xine project
+ * Copyright (c) 2002 The FFmpeg project
  *
  * SVQ1 Encoder (c) 2004 Mike Melanson <melanson@pcisys.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/svq13.c b/libavcodec/svq13.c
index e0d2154..b821a44 100644
--- a/libavcodec/svq13.c
+++ b/libavcodec/svq13.c
@@ -1,20 +1,20 @@
 /*
  * SVQ1/SVQ3 decoder common code
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/svq1_cb.h b/libavcodec/svq1_cb.h
index 396cdf7..11f7969 100644
--- a/libavcodec/svq1_cb.h
+++ b/libavcodec/svq1_cb.h
@@ -3,23 +3,23 @@
  * ported to MPlayer by Arpi <arpi@thot.banki.hu>
  * ported to libavcodec by Nick Kurshev <nickols_k@mail.ru>
  *
- * Copyright (C) 2002 the xine project
- * Copyright (C) 2002 The FFmpeg project
+ * Copyright (c) 2002 The Xine project
+ * Copyright (c) 2002 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/svq1_vlc.h b/libavcodec/svq1_vlc.h
index 073bb6d..06e3509 100644
--- a/libavcodec/svq1_vlc.h
+++ b/libavcodec/svq1_vlc.h
@@ -1,20 +1,20 @@
 /*
- * copyright (C) 2003 The FFmpeg project
+ * Copyright (C) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/svq1dec.c b/libavcodec/svq1dec.c
index d6f4b43..2b72e08 100644
--- a/libavcodec/svq1dec.c
+++ b/libavcodec/svq1dec.c
@@ -3,25 +3,25 @@
  * ported to MPlayer by Arpi <arpi@thot.banki.hu>
  * ported to libavcodec by Nick Kurshev <nickols_k@mail.ru>
  *
- * Copyright (C) 2002 the xine project
- * Copyright (C) 2002 The FFmpeg project
+ * Copyright (c) 2002 The Xine project
+ * Copyright (c) 2002 The FFmpeg project
  *
  * SVQ1 Encoder (c) 2004 Mike Melanson <melanson@pcisys.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -111,12 +111,11 @@ static const uint8_t string_table[256] = {
                 break;                                                  \
         }                                                               \
         /* divide block if next bit set */                              \
-        if (get_bits1(bitbuf) == 0)                                     \
+        if (!get_bits1(bitbuf))                                         \
             break;                                                      \
         /* add child nodes */                                           \
         list[n++] = list[i];                                            \
-        list[n++] = list[i] +                                           \
-                    (((level & 1) ? pitch : 1) << (level / 2 + 1));     \
+        list[n++] = list[i] + (((level & 1) ? pitch : 1) << ((level >> 1) + 1));\
     }
 
 #define SVQ1_ADD_CODEBOOK()                                             \
@@ -152,7 +151,7 @@ static const uint8_t string_table[256] = {
                       16 * j) << (level + 1);                           \
     }                                                                   \
     mean -= stages * 128;                                               \
-    n4    = mean + (mean >> 31) << 16 | (mean & 0xFFFF);
+    n4    = (mean << 16) + mean;
 
 static int svq1_decode_block_intra(GetBitContext *bitbuf, uint8_t *pixels,
                                    int pitch)
@@ -163,7 +162,8 @@ static int svq1_decode_block_intra(GetBitContext *bitbuf, uint8_t *pixels,
     const uint32_t *codebook;
     int entries[6];
     int i, j, m, n;
-    int mean, stages;
+    int stages;
+    unsigned mean;
     unsigned x, y, width, height, level;
     uint32_t n1, n2, n3, n4;
 
@@ -188,12 +188,13 @@ static int svq1_decode_block_intra(GetBitContext *bitbuf, uint8_t *pixels,
             continue;   /* skip vector */
         }
 
-        if ((stages > 0 && level >= 4) || stages < 0) {
+        if ((stages > 0 && level >= 4)) {
             ff_dlog(NULL,
                     "Error (svq1_decode_block_intra): invalid vector: stages=%i level=%i\n",
                     stages, level);
             return AVERROR_INVALIDDATA;  /* invalid vector */
         }
+        av_assert0(stages >= 0);
 
         mean = get_vlc2(bitbuf, svq1_intra_mean.table, 8, 3);
 
@@ -228,7 +229,8 @@ static int svq1_decode_block_non_intra(GetBitContext *bitbuf, uint8_t *pixels,
     const uint32_t *codebook;
     int entries[6];
     int i, j, m, n;
-    int mean, stages;
+    int stages;
+    unsigned mean;
     int x, y, width, height, level;
     uint32_t n1, n2, n3, n4;
 
@@ -250,12 +252,13 @@ static int svq1_decode_block_non_intra(GetBitContext *bitbuf, uint8_t *pixels,
         if (stages == -1)
             continue;           /* skip vector */
 
-        if ((stages > 0 && level >= 4) || stages < 0) {
+        if ((stages > 0 && level >= 4)) {
             ff_dlog(NULL,
                     "Error (svq1_decode_block_non_intra): invalid vector: stages=%i level=%i\n",
                     stages, level);
             return AVERROR_INVALIDDATA;  /* invalid vector */
         }
+        av_assert0(stages >= 0);
 
         mean = get_vlc2(bitbuf, svq1_inter_mean.table, 9, 3) - 256;
 
@@ -342,8 +345,7 @@ static int svq1_motion_inter_block(HpelDSPContext *hdsp, GetBitContext *bitbuf,
     }
 
     result = svq1_decode_motion_vector(bitbuf, &mv, pmv);
-
-    if (result != 0)
+    if (result)
         return result;
 
     motion[0].x         =
@@ -386,8 +388,7 @@ static int svq1_motion_inter_4v_block(HpelDSPContext *hdsp, GetBitContext *bitbu
     }
 
     result = svq1_decode_motion_vector(bitbuf, &mv, pmv);
-
-    if (result != 0)
+    if (result)
         return result;
 
     /* predict and decode motion vector (1) */
@@ -399,8 +400,7 @@ static int svq1_motion_inter_4v_block(HpelDSPContext *hdsp, GetBitContext *bitbu
         pmv[1] = &motion[(x / 8) + 3];
     }
     result = svq1_decode_motion_vector(bitbuf, &motion[0], pmv);
-
-    if (result != 0)
+    if (result)
         return result;
 
     /* predict and decode motion vector (2) */
@@ -408,8 +408,7 @@ static int svq1_motion_inter_4v_block(HpelDSPContext *hdsp, GetBitContext *bitbu
     pmv[2] = &motion[(x / 8) + 1];
 
     result = svq1_decode_motion_vector(bitbuf, &motion[(x / 8) + 2], pmv);
-
-    if (result != 0)
+    if (result)
         return result;
 
     /* predict and decode motion vector (3) */
@@ -417,8 +416,7 @@ static int svq1_motion_inter_4v_block(HpelDSPContext *hdsp, GetBitContext *bitbu
     pmv[3] = &motion[(x / 8) + 3];
 
     result = svq1_decode_motion_vector(bitbuf, pmv[3], pmv);
-
-    if (result != 0)
+    if (result)
         return result;
 
     /* form predictions */
@@ -502,7 +500,7 @@ static int svq1_decode_delta_block(AVCodecContext *avctx, HpelDSPContext *hdsp,
     return result;
 }
 
-static void svq1_parse_string(GetBitContext *bitbuf, uint8_t *out)
+static void svq1_parse_string(GetBitContext *bitbuf, uint8_t out[257])
 {
     uint8_t seed;
     int i;
@@ -514,6 +512,7 @@ static void svq1_parse_string(GetBitContext *bitbuf, uint8_t *out)
         out[i] = get_bits(bitbuf, 8) ^ seed;
         seed   = string_table[out[i] ^ seed];
     }
+    out[i] = 0;
 }
 
 static int svq1_decode_frame_header(AVCodecContext *avctx, AVFrame *frame)
@@ -521,6 +520,8 @@ static int svq1_decode_frame_header(AVCodecContext *avctx, AVFrame *frame)
     SVQ1Context *s = avctx->priv_data;
     GetBitContext *bitbuf = &s->gb;
     int frame_size_code;
+    int width  = s->width;
+    int height = s->height;
 
     skip_bits(bitbuf, 8); /* temporal_reference */
 
@@ -554,12 +555,12 @@ static int svq1_decode_frame_header(AVCodecContext *avctx, AVFrame *frame)
         }
 
         if ((s->frame_code ^ 0x10) >= 0x50) {
-            uint8_t msg[256];
+            uint8_t msg[257];
 
             svq1_parse_string(bitbuf, msg);
 
             av_log(avctx, AV_LOG_INFO,
-                   "embedded message: \"%s\"\n", (char *)msg);
+                   "embedded message:\n%s\n", ((char *)msg) + 1);
         }
 
         skip_bits(bitbuf, 2);
@@ -571,20 +572,20 @@ static int svq1_decode_frame_header(AVCodecContext *avctx, AVFrame *frame)
 
         if (frame_size_code == 7) {
             /* load width, height (12 bits each) */
-            s->width  = get_bits(bitbuf, 12);
-            s->height = get_bits(bitbuf, 12);
+            width  = get_bits(bitbuf, 12);
+            height = get_bits(bitbuf, 12);
 
-            if (!s->width || !s->height)
+            if (!width || !height)
                 return AVERROR_INVALIDDATA;
         } else {
             /* get width, height from table */
-            s->width  = ff_svq1_frame_size_table[frame_size_code][0];
-            s->height = ff_svq1_frame_size_table[frame_size_code][1];
+            width  = ff_svq1_frame_size_table[frame_size_code][0];
+            height = ff_svq1_frame_size_table[frame_size_code][1];
         }
     }
 
     /* unknown fields */
-    if (get_bits1(bitbuf) == 1) {
+    if (get_bits1(bitbuf)) {
         skip_bits1(bitbuf);    /* use packet checksum if (1) */
         skip_bits1(bitbuf);    /* component checksums after image data if (1) */
 
@@ -592,16 +593,18 @@ static int svq1_decode_frame_header(AVCodecContext *avctx, AVFrame *frame)
             return AVERROR_INVALIDDATA;
     }
 
-    if (get_bits1(bitbuf) == 1) {
+    if (get_bits1(bitbuf)) {
         skip_bits1(bitbuf);
         skip_bits(bitbuf, 4);
         skip_bits1(bitbuf);
         skip_bits(bitbuf, 2);
 
-        while (get_bits1(bitbuf) == 1)
-            skip_bits(bitbuf, 8);
+        if (skip_1stop_8data_bits(bitbuf) < 0)
+            return AVERROR_INVALIDDATA;
     }
 
+    s->width  = width;
+    s->height = height;
     return 0;
 }
 
@@ -615,9 +618,12 @@ static int svq1_decode_frame(AVCodecContext *avctx, void *data,
     uint8_t *current;
     int result, i, x, y, width, height;
     svq1_pmv *pmv;
+    int ret;
 
     /* initialize bit buffer */
-    init_get_bits(&s->gb, buf, buf_size * 8);
+    ret = init_get_bits8(&s->gb, buf, buf_size);
+    if (ret < 0)
+        return ret;
 
     /* decode frame header */
     s->frame_code = get_bits(&s->gb, 22);
@@ -652,7 +658,6 @@ static int svq1_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     result = svq1_decode_frame_header(avctx, cur);
-
     if (result != 0) {
         ff_dlog(avctx, "Error in svq1_decode_frame_header %i\n", result);
         return result;
@@ -697,8 +702,8 @@ static int svq1_decode_frame(AVCodecContext *avctx, void *data,
                 for (x = 0; x < width; x += 16) {
                     result = svq1_decode_block_intra(&s->gb, &current[x],
                                                      linesize);
-                    if (result != 0) {
-                        av_log(avctx, AV_LOG_INFO,
+                    if (result) {
+                        av_log(avctx, AV_LOG_ERROR,
                                "Error in svq1_decode_block %i (keyframe)\n",
                                result);
                         goto err;
@@ -816,6 +821,7 @@ static av_cold int svq1_decode_end(AVCodecContext *avctx)
 
     av_frame_free(&s->prev);
     av_freep(&s->pkt_swapped);
+    s->pkt_swapped_allocated = 0;
 
     return 0;
 }
diff --git a/libavcodec/svq1enc.c b/libavcodec/svq1enc.c
index 12a23d0..cb7c4a1 100644
--- a/libavcodec/svq1enc.c
+++ b/libavcodec/svq1enc.c
@@ -2,20 +2,20 @@
  * SVQ1 Encoder
  * Copyright (C) 2004 Mike Melanson <melanson@pcisys.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,9 +36,8 @@
 #include "svq1.h"
 #include "svq1enc.h"
 #include "svq1enc_cb.h"
+#include "libavutil/avassert.h"
 
-#undef NDEBUG
-#include <assert.h>
 
 static void svq1_write_header(SVQ1EncContext *s, int frame_type)
 {
@@ -59,7 +58,7 @@ static void svq1_write_header(SVQ1EncContext *s, int frame_type)
         /* output 5 unknown bits (2 + 2 + 1) */
         put_bits(&s->pb, 5, 2); /* 2 needed by quicktime decoder */
 
-        i = ff_match_2uint16(ff_svq1_frame_size_table,
+        i = ff_match_2uint16((void*)ff_svq1_frame_size_table,
                              FF_ARRAY_ELEMS(ff_svq1_frame_size_table),
                              s->frame_width, s->frame_height);
         put_bits(&s->pb, 3, i);
@@ -78,7 +77,7 @@ static void svq1_write_header(SVQ1EncContext *s, int frame_type)
 #define THRESHOLD_MULTIPLIER 0.6
 
 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
-                               int size)
+                               intptr_t size)
 {
     int score = 0, i;
 
@@ -97,7 +96,7 @@ static int encode_block(SVQ1EncContext *s, uint8_t *src, uint8_t *ref,
     int w            = 2 << (level + 2 >> 1);
     int h            = 2 << (level + 1 >> 1);
     int size         = w * h;
-    int16_t block[7][256];
+    int16_t (*block)[256] = s->encoded_block_levels[level];
     const int8_t *codebook_sum, *codebook;
     const uint16_t(*mean_vlc)[2];
     const uint8_t(*multistage_vlc)[2];
@@ -105,7 +104,9 @@ static int encode_block(SVQ1EncContext *s, uint8_t *src, uint8_t *ref,
     best_score = 0;
     // FIXME: Optimize, this does not need to be done multiple times.
     if (intra) {
-        codebook_sum   = svq1_intra_codebook_sum[level];
+        // level is 5 when encode_block is called from svq1_encode_plane
+        // and always < 4 when called recursively from this function.
+        codebook_sum   = level < 4 ? svq1_intra_codebook_sum[level] : NULL;
         codebook       = ff_svq1_intra_codebooks[level];
         mean_vlc       = ff_svq1_intra_mean_vlc;
         multistage_vlc = ff_svq1_intra_multistage_vlc[level];
@@ -118,7 +119,8 @@ static int encode_block(SVQ1EncContext *s, uint8_t *src, uint8_t *ref,
             }
         }
     } else {
-        codebook_sum   = svq1_inter_codebook_sum[level];
+        // level is 5 or < 4, see above for details.
+        codebook_sum   = level < 4 ? svq1_inter_codebook_sum[level] : NULL;
         codebook       = ff_svq1_inter_codebooks[level];
         mean_vlc       = ff_svq1_inter_mean_vlc + 256;
         multistage_vlc = ff_svq1_inter_multistage_vlc[level];
@@ -153,7 +155,7 @@ static int encode_block(SVQ1EncContext *s, uint8_t *src, uint8_t *ref,
                 score  = sqr - (diff * (int64_t)diff >> (level + 3)); // FIXME: 64 bits slooow
                 if (score < best_vector_score) {
                     int mean = diff + (size >> 1) >> (level + 3);
-                    assert(mean > -300 && mean < 300);
+                    av_assert2(mean > -300 && mean < 300);
                     mean               = av_clip(mean, intra ? 0 : -256, 255);
                     best_vector_score  = score;
                     best_vector[stage] = i;
@@ -161,7 +163,7 @@ static int encode_block(SVQ1EncContext *s, uint8_t *src, uint8_t *ref,
                     best_vector_mean   = mean;
                 }
             }
-            assert(best_vector_mean != -999);
+            av_assert0(best_vector_mean != -999);
             vector = codebook + stage * size * 16 + best_vector[stage] * size;
             for (j = 0; j < size; j++)
                 block[stage + 1][j] = block[stage][j] - vector[j];
@@ -205,10 +207,10 @@ static int encode_block(SVQ1EncContext *s, uint8_t *src, uint8_t *ref,
         put_bits(&s->reorder_pb[level], 1, split);
 
     if (!split) {
-        assert(best_mean >= 0 && best_mean < 256 || !intra);
-        assert(best_mean >= -256 && best_mean < 256);
-        assert(best_count >= 0 && best_count < 7);
-        assert(level < 4 || best_count == 0);
+        av_assert1(best_mean >= 0 && best_mean < 256 || !intra);
+        av_assert1(best_mean >= -256 && best_mean < 256);
+        av_assert1(best_count >= 0 && best_count < 7);
+        av_assert1(level < 4 || best_count == 0);
 
         /* output the encoding */
         put_bits(&s->reorder_pb[level],
@@ -218,7 +220,7 @@ static int encode_block(SVQ1EncContext *s, uint8_t *src, uint8_t *ref,
                  mean_vlc[best_mean][0]);
 
         for (i = 0; i < best_count; i++) {
-            assert(best_vector[i] >= 0 && best_vector[i] < 16);
+            av_assert2(best_vector[i] >= 0 && best_vector[i] < 16);
             put_bits(&s->reorder_pb[level], 4, best_vector[i]);
         }
 
@@ -232,6 +234,15 @@ static int encode_block(SVQ1EncContext *s, uint8_t *src, uint8_t *ref,
     return best_score;
 }
 
+static void init_block_index(MpegEncContext *s){ /* Fill s->block_index[0..5] from the current s->mb_x / s->mb_y. */
+    s->block_index[0]= s->b8_stride*(s->mb_y*2    )     + s->mb_x*2; /* entries 0-3: a 2x2 group addressed with b8_stride; row mb_y*2, column mb_x*2 */
+    s->block_index[1]= s->b8_stride*(s->mb_y*2    ) + 1 + s->mb_x*2; /* same row, next column */
+    s->block_index[2]= s->b8_stride*(s->mb_y*2 + 1)     + s->mb_x*2; /* next row, first column */
+    s->block_index[3]= s->b8_stride*(s->mb_y*2 + 1) + 1 + s->mb_x*2; /* next row, next column */
+    s->block_index[4]= s->mb_stride*(s->mb_y + 1)                + s->b8_stride*s->mb_height*2 + s->mb_x; /* entries 4-5 use mb_stride and start after b8_stride*mb_height*2 entries; presumably the two chroma blocks -- TODO confirm */
+    s->block_index[5]= s->mb_stride*(s->mb_y + s->mb_height + 2) + s->b8_stride*s->mb_height*2 + s->mb_x; /* same as entry 4, shifted by a further (mb_height + 1) rows of mb_stride */
+}
+
 static int svq1_encode_plane(SVQ1EncContext *s, int plane,
                              unsigned char *src_plane,
                              unsigned char *ref_plane,
@@ -243,7 +254,7 @@ static int svq1_encode_plane(SVQ1EncContext *s, int plane,
     int block_width, block_height;
     int level;
     int threshold[6];
-    uint8_t *src     = s->scratchbuf + stride * 16;
+    uint8_t *src     = s->scratchbuf + stride * 32;
     const int lambda = (s->quality * s->quality) >>
                        (2 * FF_LAMBDA_SHIFT);
 
@@ -340,8 +351,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
             for (x = 0; x < block_width; x++) {
                 s->m.mb_x = x;
-                ff_init_block_index(&s->m);
-                ff_update_block_index(&s->m);
+                init_block_index(&s->m);
 
                 ff_estimate_p_frame_motion(&s->m, x, y);
             }
@@ -366,8 +376,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
         s->m.mb_y = y;
         for (x = 0; x < block_width; x++) {
-            uint8_t reorder_buffer[3][6][7 * 32];
-            int count[3][6];
+            uint8_t reorder_buffer[2][6][7 * 32];
+            int count[2][6];
             int offset       = y * 16 * stride + x * 16;
             uint8_t *decoded = decoded_plane + offset;
             uint8_t *ref     = ref_plane + offset;
@@ -381,8 +391,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
             }
 
             s->m.mb_x = x;
-            ff_init_block_index(&s->m);
-            ff_update_block_index(&s->m);
+            init_block_index(&s->m);
 
             if (s->pict_type == AV_PICTURE_TYPE_I ||
                 (s->m.mb_type[x + y * s->m.mb_stride] &
@@ -423,23 +432,23 @@ FF_ENABLE_DEPRECATION_WARNINGS
                     s->m.pb = s->reorder_pb[5];
                     mx      = motion_ptr[0];
                     my      = motion_ptr[1];
-                    assert(mx     >= -32 && mx     <= 31);
-                    assert(my     >= -32 && my     <= 31);
-                    assert(pred_x >= -32 && pred_x <= 31);
-                    assert(pred_y >= -32 && pred_y <= 31);
-                    ff_h263_encode_motion(&s->m, mx - pred_x, 1);
-                    ff_h263_encode_motion(&s->m, my - pred_y, 1);
+                    av_assert1(mx     >= -32 && mx     <= 31);
+                    av_assert1(my     >= -32 && my     <= 31);
+                    av_assert1(pred_x >= -32 && pred_x <= 31);
+                    av_assert1(pred_y >= -32 && pred_y <= 31);
+                    ff_h263_encode_motion(&s->m.pb, mx - pred_x, 1);
+                    ff_h263_encode_motion(&s->m.pb, my - pred_y, 1);
                     s->reorder_pb[5] = s->m.pb;
                     score[1]        += lambda * put_bits_count(&s->reorder_pb[5]);
 
                     dxy = (mx & 1) + 2 * (my & 1);
 
-                    s->hdsp.put_pixels_tab[0][dxy](temp + 16,
+                    s->hdsp.put_pixels_tab[0][dxy](temp + 16*stride,
                                                    ref + (mx >> 1) +
                                                    stride * (my >> 1),
                                                    stride, 16);
 
-                    score[1] += encode_block(s, src + 16 * x, temp + 16,
+                    score[1] += encode_block(s, src + 16 * x, temp + 16*stride,
                                              decoded, stride, 5, 64, lambda, 0);
                     best      = score[1] <= score[0];
 
@@ -450,8 +459,6 @@ FF_ENABLE_DEPRECATION_WARNINGS
                     if (score[2] < score[best] && mx == 0 && my == 0) {
                         best = 2;
                         s->hdsp.put_pixels_tab[0][0](decoded, ref, stride, 16);
-                        for (i = 0; i < 6; i++)
-                            count[2][i] = 0;
                         put_bits(&s->pb, vlc[1], vlc[0]);
                     }
                 }
@@ -475,6 +482,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
             s->rd_total += score[best];
 
+            if (best != 2)
             for (i = 5; i >= 0; i--)
                 avpriv_copy_bits(&s->pb, reorder_buffer[best][i],
                                  count[best][i]);
@@ -521,6 +529,11 @@ static av_cold int svq1_encode_init(AVCodecContext *avctx)
     SVQ1EncContext *const s = avctx->priv_data;
     int ret;
 
+    if (avctx->width >= 4096 || avctx->height >= 4096) {
+        av_log(avctx, AV_LOG_ERROR, "Dimensions too large, maximum is 4095x4095\n");
+        return AVERROR(EINVAL);
+    }
+
     ff_hpeldsp_init(&s->hdsp, avctx->flags);
     ff_me_cmp_init(&s->mecc, avctx);
     ff_mpegvideoencdsp_init(&s->m.mpvencdsp, avctx);
@@ -582,14 +595,10 @@ static int svq1_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 {
     SVQ1EncContext *const s = avctx->priv_data;
     int i, ret;
-    uint8_t *sd;
 
-    if (!pkt->data &&
-        (ret = av_new_packet(pkt, s->y_block_width * s->y_block_height *
-                             MAX_MB_BYTES * 3 + AV_INPUT_BUFFER_MIN_SIZE)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, s->y_block_width * s->y_block_height *
+                             MAX_MB_BYTES*3 + AV_INPUT_BUFFER_MIN_SIZE, 0)) < 0)
         return ret;
-    }
 
     if (avctx->pix_fmt != AV_PIX_FMT_YUV410P) {
         av_log(avctx, AV_LOG_ERROR, "unsupported pixel format\n");
@@ -597,9 +606,9 @@ static int svq1_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     }
 
     if (!s->current_picture->data[0]) {
-        ret = ff_get_buffer(avctx, s->current_picture, 0);
-        if (ret < 0)
+        if ((ret = ff_get_buffer(avctx, s->current_picture, 0)) < 0) {
             return ret;
+        }
     }
     if (!s->last_picture->data[0]) {
         ret = ff_get_buffer(avctx, s->last_picture, 0);
@@ -607,7 +616,7 @@ static int svq1_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
             return ret;
     }
     if (!s->scratchbuf) {
-        s->scratchbuf = av_malloc(s->current_picture->linesize[0] * 16 * 2);
+        s->scratchbuf = av_malloc_array(s->current_picture->linesize[0], 16 * 3);
         if (!s->scratchbuf)
             return AVERROR(ENOMEM);
     }
@@ -629,10 +638,7 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
-    sd = av_packet_new_side_data(pkt, AV_PKT_DATA_QUALITY_FACTOR, sizeof(int));
-    if (!sd)
-        return AVERROR(ENOMEM);
-    *(int *)sd = pict->quality;
+    ff_side_data_set_encoder_stats(pkt, pict->quality, NULL, 0, s->pict_type);
 
     svq1_write_header(s, s->pict_type);
     for (i = 0; i < 3; i++)
diff --git a/libavcodec/svq1enc.h b/libavcodec/svq1enc.h
index 62e8bb2..37f05a0 100644
--- a/libavcodec/svq1enc.h
+++ b/libavcodec/svq1enc.h
@@ -1,20 +1,20 @@
 /*
  * SVQ1 encoder
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -63,6 +63,8 @@ typedef struct SVQ1EncContext {
     int c_block_width;
     int c_block_height;
 
+    DECLARE_ALIGNED(16, int16_t, encoded_block_levels)[6][7][256];
+
     uint16_t *mb_type;
     uint32_t *dummy;
     int16_t (*motion_val8[3])[2];
@@ -75,7 +77,7 @@ typedef struct SVQ1EncContext {
     int motion_est;
 
     int (*ssd_int8_vs_int16)(const int8_t *pix1, const int16_t *pix2,
-                             int size);
+                             intptr_t size);
 } SVQ1EncContext;
 
 void ff_svq1enc_init_ppc(SVQ1EncContext *c);
diff --git a/libavcodec/svq1enc_cb.h b/libavcodec/svq1enc_cb.h
index a5cd179..1edb4ec 100644
--- a/libavcodec/svq1enc_cb.h
+++ b/libavcodec/svq1enc_cb.h
@@ -2,20 +2,20 @@
  * SVQ1 Encoder
  * Copyright (C) 2004 Mike Melanson <melanson@pcisys.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/svq3.c b/libavcodec/svq3.c
index b11e6ff..557f63a 100644
--- a/libavcodec/svq3.c
+++ b/libavcodec/svq3.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (c) 2003 The Libav Project
+ * Copyright (c) 2003 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,7 +37,7 @@
  *
  * You will know you have these parameters passed correctly when the decoder
  * correctly decodes this file:
- *  http://samples.libav.org/V-codecs/SVQ3/Vertical400kbit.sorenson3.mov
+ *  http://samples.mplayerhq.hu/V-codecs/SVQ3/Vertical400kbit.sorenson3.mov
  */
 
 #include <inttypes.h>
@@ -53,6 +53,7 @@
 #include "mathops.h"
 #include "rectangle.h"
 #include "tpeldsp.h"
+#include "vdpau_internal.h"
 
 #if CONFIG_ZLIB
 #include <zlib.h>
@@ -83,8 +84,10 @@ typedef struct SVQ3Context {
     int slice_size;
     int halfpel_flag;
     int thirdpel_flag;
-    int unknown_flag;
+    int has_watermark;
     uint32_t watermark_key;
+    uint8_t *buf;
+    int buf_size;
     int adaptive_quant;
     int next_p_frame_damaged;
     int h_edge_pos;
@@ -202,6 +205,8 @@ static const uint32_t svq3_dequant_coeff[32] = {
     61694, 68745, 77615, 89113, 100253, 109366, 126635, 141533
 };
 
+static int svq3_decode_end(AVCodecContext *avctx);
+
 static void svq3_luma_dc_dequant_idct_c(int16_t *output, int16_t *input, int qp)
 {
     const int qmul = svq3_dequant_coeff[qp];
@@ -284,14 +289,17 @@ static inline int svq3_decode_block(GetBitContext *gb, int16_t *block,
         luma_dc_zigzag_scan, ff_zigzag_scan, svq3_scan, ff_h264_chroma_dc_scan
     };
 
-    int run, level, limit;
+    int run, level, sign, limit;
     unsigned vlc;
     const int intra           = 3 * type >> 2;
     const uint8_t *const scan = scan_patterns[type];
 
     for (limit = (16 >> intra); index < 16; index = limit, limit += 8) {
         for (; (vlc = svq3_get_ue_golomb(gb)) != 0; index++) {
-            int sign = (vlc & 1) ? 0 : -1;
+            if ((int32_t)vlc < 0)
+                return -1;
+
+            sign     = (vlc & 1) ? 0 : -1;
             vlc      = vlc + 1 >> 1;
 
             if (type == 3) {
@@ -306,20 +314,19 @@ static inline int svq3_decode_block(GetBitContext *gb, int16_t *block,
                     level = (vlc + 9 >> 2) - run;
                 }
             } else {
-                if (vlc < 16) {
+                if (vlc < 16U) {
                     run   = svq3_dct_tables[intra][vlc].run;
                     level = svq3_dct_tables[intra][vlc].level;
                 } else if (intra) {
                     run   = vlc & 0x7;
-                    level = (vlc >> 3) +
-                            ((run == 0) ? 8 : ((run < 2) ? 2 : ((run < 5) ? 0 : -1)));
+                    level = (vlc >> 3) + ((run == 0) ? 8 : ((run < 2) ? 2 : ((run < 5) ? 0 : -1)));
                 } else {
                     run   = vlc & 0xF;
-                    level = (vlc >> 4) +
-                            ((run == 0) ? 4 : ((run < 3) ? 2 : ((run < 10) ? 1 : 0)));
+                    level = (vlc >> 4) + ((run == 0) ? 4 : ((run < 3) ? 2 : ((run < 10) ? 1 : 0)));
                 }
             }
 
+
             if ((index += run) >= limit)
                 return -1;
 
@@ -631,7 +638,7 @@ static av_always_inline void hl_decode_mb_predict_luma(SVQ3Context *s,
             int nnz, tr;
             if (dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED) {
                 const int topright_avail = (s->topright_samples_available << i) & 0x8000;
-                assert(s->mb_y || linesize <= block_offset[i]);
+                av_assert2(s->mb_y || linesize <= block_offset[i]);
                 if (!topright_avail) {
                     tr       = ptr[3 - linesize] * 0x01010101u;
                     topright = (uint8_t *)&tr;
@@ -841,7 +848,7 @@ static int svq3_decode_mb(SVQ3Context *s, unsigned int mb_type)
             for (i = 0; i < 16; i += 2) {
                 vlc = svq3_get_ue_golomb(&s->gb_slice);
 
-                if (vlc >= 25) {
+                if (vlc >= 25U) {
                     av_log(s->avctx, AV_LOG_ERROR,
                            "luma prediction:%"PRIu32"\n", vlc);
                     return -1;
@@ -917,7 +924,7 @@ static int svq3_decode_mb(SVQ3Context *s, unsigned int mb_type)
 
     if (!IS_INTRA16x16(mb_type) &&
         (!IS_SKIP(mb_type) || s->pict_type == AV_PICTURE_TYPE_B)) {
-        if ((vlc = svq3_get_ue_golomb(&s->gb_slice)) >= 48) {
+        if ((vlc = svq3_get_ue_golomb(&s->gb_slice)) >= 48U){
             av_log(s->avctx, AV_LOG_ERROR, "cbp_vlc=%"PRIu32"\n", vlc);
             return -1;
         }
@@ -1040,7 +1047,7 @@ static int svq3_decode_slice_header(AVCodecContext *avctx)
             AV_WL32(&s->gb_slice.buffer[1], header ^ s->watermark_key);
         }
         if (length > 0) {
-            memcpy(s->slice_buf, &s->slice_buf[slice_length], length - 1);
+            memmove(s->slice_buf, &s->slice_buf[slice_length], length - 1);
         }
         skip_bits_long(&s->gb, slice_bytes * 8);
     }
@@ -1066,14 +1073,14 @@ static int svq3_decode_slice_header(AVCodecContext *avctx)
     /* unknown fields */
     skip_bits1(&s->gb_slice);
 
-    if (s->unknown_flag)
+    if (s->has_watermark)
         skip_bits1(&s->gb_slice);
 
     skip_bits1(&s->gb_slice);
     skip_bits(&s->gb_slice, 2);
 
-    while (get_bits1(&s->gb_slice))
-        skip_bits(&s->gb_slice, 8);
+    if (skip_1stop_8data_bits(&s->gb_slice) < 0)
+        return AVERROR_INVALIDDATA;
 
     /* reset intra predictors and invalidate motion vector references */
     if (s->mb_x > 0) {
@@ -1115,15 +1122,14 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx)
     unsigned char *extradata_end;
     unsigned int size;
     int marker_found = 0;
+    int ret;
 
     s->cur_pic  = av_mallocz(sizeof(*s->cur_pic));
     s->last_pic = av_mallocz(sizeof(*s->last_pic));
     s->next_pic = av_mallocz(sizeof(*s->next_pic));
     if (!s->next_pic || !s->last_pic || !s->cur_pic) {
-        av_freep(&s->cur_pic);
-        av_freep(&s->last_pic);
-        av_freep(&s->next_pic);
-        return AVERROR(ENOMEM);
+        ret = AVERROR(ENOMEM);
+        goto fail;
     }
 
     s->cur_pic->f  = av_frame_alloc();
@@ -1136,6 +1142,9 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx)
     ff_h264_pred_init(&s->hpc, AV_CODEC_ID_SVQ3, 8, 1);
     ff_videodsp_init(&s->vdsp, 8);
 
+
+    avctx->bits_per_raw_sample = 8;
+
     ff_hpeldsp_init(&s->hdsp, avctx->flags);
     ff_tpeldsp_init(&s->tdsp);
 
@@ -1145,7 +1154,7 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx)
     s->avctx         = avctx;
     s->halfpel_flag  = 1;
     s->thirdpel_flag = 1;
-    s->unknown_flag  = 0;
+    s->has_watermark = 0;
 
     /* prowl for the "SEQH" marker in the extradata */
     extradata     = (unsigned char *)avctx->extradata;
@@ -1164,10 +1173,13 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx)
     if (marker_found) {
         GetBitContext gb;
         int frame_size_code;
+        int unk0, unk1, unk2, unk3, unk4;
 
         size = AV_RB32(&extradata[4]);
-        if (size > extradata_end - extradata - 8)
-            return AVERROR_INVALIDDATA;
+        if (size > extradata_end - extradata - 8) {
+            ret = AVERROR_INVALIDDATA;
+            goto fail;
+        }
         init_get_bits(&gb, extradata + 8, size * 8);
 
         /* 'frame size code' and optional 'width, height' */
@@ -1211,22 +1223,27 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx)
         s->thirdpel_flag = get_bits1(&gb);
 
         /* unknown fields */
-        skip_bits1(&gb);
-        skip_bits1(&gb);
-        skip_bits1(&gb);
-        skip_bits1(&gb);
+        unk0 = get_bits1(&gb);
+        unk1 = get_bits1(&gb);
+        unk2 = get_bits1(&gb);
+        unk3 = get_bits1(&gb);
 
         s->low_delay = get_bits1(&gb);
 
         /* unknown field */
-        skip_bits1(&gb);
+        unk4 = get_bits1(&gb);
 
-        while (get_bits1(&gb))
-            skip_bits(&gb, 8);
+        av_log(avctx, AV_LOG_DEBUG, "Unknown fields %d %d %d %d %d\n",
+               unk0, unk1, unk2, unk3, unk4);
 
-        s->unknown_flag  = get_bits1(&gb);
+        if (skip_1stop_8data_bits(&gb) < 0) {
+            ret = AVERROR_INVALIDDATA;
+            goto fail;
+        }
+
+        s->has_watermark  = get_bits1(&gb);
         avctx->has_b_frames = !s->low_delay;
-        if (s->unknown_flag) {
+        if (s->has_watermark) {
 #if CONFIG_ZLIB
             unsigned watermark_width  = svq3_get_ue_golomb(&gb);
             unsigned watermark_height = svq3_get_ue_golomb(&gb);
@@ -1239,11 +1256,17 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx)
             int offset                = get_bits_count(&gb) + 7 >> 3;
             uint8_t *buf;
 
-            if (watermark_height > 0 &&
-                (uint64_t)watermark_width * 4 > UINT_MAX / watermark_height)
-                return -1;
+            if (watermark_height <= 0 ||
+                (uint64_t)watermark_width * 4 > UINT_MAX / watermark_height) {
+                ret = -1;
+                goto fail;
+            }
 
             buf = av_malloc(buf_len);
+            if (!buf) {
+                ret = AVERROR(ENOMEM);
+                goto fail;
+            }
             av_log(avctx, AV_LOG_DEBUG, "watermark size: %ux%u\n",
                    watermark_width, watermark_height);
             av_log(avctx, AV_LOG_DEBUG,
@@ -1254,7 +1277,8 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx)
                 av_log(avctx, AV_LOG_ERROR,
                        "could not uncompress watermark logo\n");
                 av_free(buf);
-                return -1;
+                ret = -1;
+                goto fail;
             }
             s->watermark_key = ff_svq1_packet_checksum(buf, buf_len, 0);
             s->watermark_key = s->watermark_key << 16 | s->watermark_key;
@@ -1264,7 +1288,8 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx)
 #else
             av_log(avctx, AV_LOG_ERROR,
                    "this svq3 file contains watermark which need zlib support compiled in\n");
-            return -1;
+            ret = -1;
+            goto fail;
 #endif
         }
     }
@@ -1296,6 +1321,9 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx)
     init_dequant4_coeff_table(s);
 
     return 0;
+fail:
+    svq3_decode_end(avctx);
+    return ret;
 }
 
 static void free_picture(AVCodecContext *avctx, H264Picture *pic)
@@ -1347,7 +1375,7 @@ static int get_buffer(AVCodecContext *avctx, H264Picture *pic)
         goto fail;
 
     if (!s->edge_emu_buffer) {
-        s->edge_emu_buffer = av_mallocz(pic->f->linesize[0] * 17);
+        s->edge_emu_buffer = av_mallocz_array(pic->f->linesize[0], 17);
         if (!s->edge_emu_buffer)
             return AVERROR(ENOMEM);
     }
@@ -1361,9 +1389,10 @@ fail:
 static int svq3_decode_frame(AVCodecContext *avctx, void *data,
                              int *got_frame, AVPacket *avpkt)
 {
-    const uint8_t *buf = avpkt->data;
     SVQ3Context *s     = avctx->priv_data;
     int buf_size       = avpkt->size;
+    int left;
+    uint8_t *buf;
     int ret, m, i;
 
     /* special case for last picture */
@@ -1378,12 +1407,22 @@ static int svq3_decode_frame(AVCodecContext *avctx, void *data,
         return 0;
     }
 
+    s->mb_x = s->mb_y = s->mb_xy = 0;
+
+    if (s->watermark_key) {
+        av_fast_padded_malloc(&s->buf, &s->buf_size, buf_size);
+        if (!s->buf)
+            return AVERROR(ENOMEM);
+        memcpy(s->buf, avpkt->data, buf_size);
+        buf = s->buf;
+    } else {
+        buf = avpkt->data;
+    }
+
     ret = init_get_bits(&s->gb, buf, 8 * buf_size);
     if (ret < 0)
         return ret;
 
-    s->mb_x = s->mb_y = s->mb_xy = 0;
-
     if (svq3_decode_slice_header(avctx))
         return -1;
 
@@ -1414,6 +1453,7 @@ static int svq3_decode_frame(AVCodecContext *avctx, void *data,
     if (s->pict_type != AV_PICTURE_TYPE_I) {
         if (!s->last_pic->f->data[0]) {
             av_log(avctx, AV_LOG_ERROR, "Missing reference frame.\n");
+            av_frame_unref(s->last_pic->f);
             ret = get_buffer(avctx, s->last_pic);
             if (ret < 0)
                 return ret;
@@ -1426,6 +1466,7 @@ static int svq3_decode_frame(AVCodecContext *avctx, void *data,
 
         if (s->pict_type == AV_PICTURE_TYPE_B && !s->next_pic->f->data[0]) {
             av_log(avctx, AV_LOG_ERROR, "Missing reference frame.\n");
+            av_frame_unref(s->next_pic->f);
             ret = get_buffer(avctx, s->next_pic);
             if (ret < 0)
                 return ret;
@@ -1513,7 +1554,7 @@ static int svq3_decode_frame(AVCodecContext *avctx, void *data,
                 return -1;
             }
 
-            if (mb_type != 0)
+            if (mb_type != 0 || s->cbp)
                 hl_decode_mb(s);
 
             if (s->pict_type != AV_PICTURE_TYPE_B && !s->low_delay)
@@ -1527,6 +1568,18 @@ static int svq3_decode_frame(AVCodecContext *avctx, void *data,
                            s->low_delay);
     }
 
+    left = buf_size*8 - get_bits_count(&s->gb_slice);
+
+    if (s->mb_y != s->mb_height || s->mb_x != s->mb_width) { /* decode position is not (mb_width, mb_height); values are logged in the order of the "x ... y ..." labels */
+        av_log(avctx, AV_LOG_INFO, "frame num %d incomplete pic x %d y %d left %d\n", avctx->frame_number, s->mb_x, s->mb_y, left);
+        //av_hex_dump(stderr, buf+buf_size-8, 8);
+    }
+
+    if (left < 0) {
+        av_log(avctx, AV_LOG_ERROR, "frame num %d left %d\n", avctx->frame_number, left);
+        return -1;
+    }
+
     if (s->pict_type == AV_PICTURE_TYPE_B || s->low_delay)
         ret = av_frame_ref(data, s->cur_pic->f);
     else if (s->last_pic->f->data[0])
@@ -1565,6 +1618,10 @@ static av_cold int svq3_decode_end(AVCodecContext *avctx)
     av_freep(&s->edge_emu_buffer);
     av_freep(&s->mb2br_xy);
 
+
+    av_freep(&s->buf);
+    s->buf_size = 0;
+
     return 0;
 }
 
diff --git a/libavcodec/synth_filter.c b/libavcodec/synth_filter.c
index 708bd4e..1c5dab5 100644
--- a/libavcodec/synth_filter.c
+++ b/libavcodec/synth_filter.c
@@ -1,64 +1,179 @@
 /*
  * copyright (c) 2008 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2016 foo86
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "fft.h"
+#include "dcadct.h"
+#include "dcamath.h"
 #include "synth_filter.h"
 
 static void synth_filter_float(FFTContext *imdct,
-                           float *synth_buf_ptr, int *synth_buf_offset,
-                           float synth_buf2[32], const float window[512],
-                           float out[32], const float in[32], float scale)
+                               float *synth_buf_ptr, int *synth_buf_offset,
+                               float synth_buf2[32], const float window[512],
+                               float out[32], const float in[32], float scale) /* 32 values in, 32 values out per call */
 {
-    float *synth_buf= synth_buf_ptr + *synth_buf_offset;
+    float *synth_buf = synth_buf_ptr + *synth_buf_offset; /* slot at the current offset in the 512-entry history */
     int i, j;
 
     imdct->imdct_half(imdct, synth_buf, in);
 
-    for (i = 0; i < 16; i++){
-        float a= synth_buf2[i     ];
-        float b= synth_buf2[i + 16];
-        float c= 0;
-        float d= 0;
-        for (j = 0; j < 512 - *synth_buf_offset; j += 64){
-            a += window[i + j     ]*(-synth_buf[15 - i + j      ]);
-            b += window[i + j + 16]*( synth_buf[     i + j      ]);
-            c += window[i + j + 32]*( synth_buf[16 + i + j      ]);
-            d += window[i + j + 48]*( synth_buf[31 - i + j      ]);
+    for (i = 0; i < 16; i++) { /* each pass produces out[i] and out[i + 16] */
+        float a = synth_buf2[i     ]; /* a, b start from the carry stored by the previous call */
+        float b = synth_buf2[i + 16];
+        float c = 0; /* c, d build the carry for the next call */
+        float d = 0;
+        for (j = 0; j < 512 - *synth_buf_offset; j += 64) { /* taps read without wrapping */
+            a += window[i + j     ] * (-synth_buf[15 - i + j      ]);
+            b += window[i + j + 16] * ( synth_buf[     i + j      ]);
+            c += window[i + j + 32] * ( synth_buf[16 + i + j      ]);
+            d += window[i + j + 48] * ( synth_buf[31 - i + j      ]);
         }
-        for (     ; j < 512; j += 64){
-            a += window[i + j     ]*(-synth_buf[15 - i + j - 512]);
-            b += window[i + j + 16]*( synth_buf[     i + j - 512]);
-            c += window[i + j + 32]*( synth_buf[16 + i + j - 512]);
-            d += window[i + j + 48]*( synth_buf[31 - i + j - 512]);
+        for (     ; j < 512; j += 64) { /* remaining taps wrap: index - 512 */
+            a += window[i + j     ] * (-synth_buf[15 - i + j - 512]);
+            b += window[i + j + 16] * ( synth_buf[     i + j - 512]);
+            c += window[i + j + 32] * ( synth_buf[16 + i + j - 512]);
+            d += window[i + j + 48] * ( synth_buf[31 - i + j - 512]);
         }
-        out[i     ] = a*scale;
-        out[i + 16] = b*scale;
+        out[i     ] = a * scale;
+        out[i + 16] = b * scale;
         synth_buf2[i     ] = c;
         synth_buf2[i + 16] = d;
     }
-    *synth_buf_offset= (*synth_buf_offset - 32)&511;
+
+    *synth_buf_offset = (*synth_buf_offset - 32) & 511; /* step back 32 entries, modulo 512 */
+}
+
+static void synth_filter_float_64(FFTContext *imdct,
+                                  float *synth_buf_ptr, int *synth_buf_offset,
+                                  float synth_buf2[64], const float window[1024],
+                                  float out[64], const float in[64], float scale)
+{ /* 64-value counterpart of synth_filter_float(): 1024-entry history and window */
+    float *synth_buf = synth_buf_ptr + *synth_buf_offset; /* slot at the current offset in the history */
+    int i, j;
+
+    imdct->imdct_half(imdct, synth_buf, in);
+
+    for (i = 0; i < 32; i++) { /* each pass produces out[i] and out[i + 32] */
+        float a = synth_buf2[i     ]; /* a, b start from the carry stored by the previous call */
+        float b = synth_buf2[i + 32];
+        float c = 0; /* c, d build the carry for the next call */
+        float d = 0;
+        for (j = 0; j < 1024 - *synth_buf_offset; j += 128) { /* taps read without wrapping */
+            a += window[i + j     ] * (-synth_buf[31 - i + j       ]);
+            b += window[i + j + 32] * ( synth_buf[     i + j       ]);
+            c += window[i + j + 64] * ( synth_buf[32 + i + j       ]);
+            d += window[i + j + 96] * ( synth_buf[63 - i + j       ]);
+        }
+        for (     ; j < 1024; j += 128) { /* remaining taps wrap: index - 1024 */
+            a += window[i + j     ] * (-synth_buf[31 - i + j - 1024]);
+            b += window[i + j + 32] * ( synth_buf[     i + j - 1024]);
+            c += window[i + j + 64] * ( synth_buf[32 + i + j - 1024]);
+            d += window[i + j + 96] * ( synth_buf[63 - i + j - 1024]);
+        }
+        out[i     ] = a * scale;
+        out[i + 32] = b * scale;
+        synth_buf2[i     ] = c;
+        synth_buf2[i + 32] = d;
+    }
+
+    *synth_buf_offset = (*synth_buf_offset - 64) & 1023; /* step back 64 entries, modulo 1024 */
+}
+
+static void synth_filter_fixed(DCADCTContext *imdct,
+                               int32_t *synth_buf_ptr, int *synth_buf_offset,
+                               int32_t synth_buf2[32], const int32_t window[512],
+                               int32_t out[32], const int32_t in[32])
+{ /* integer variant: 32 values in, 32 out, sums kept in 64 bits */
+    int32_t *synth_buf = synth_buf_ptr + *synth_buf_offset; /* slot at the current offset in the 512-entry history */
+    int i, j;
+
+    imdct->imdct_half[0](synth_buf, in); /* entry 0 of the imdct_half table; the 64-value variant uses entry 1 */
+
+    for (i = 0; i < 16; i++) { /* each pass produces out[i] and out[i + 16] */
+        int64_t a = synth_buf2[i     ] * (INT64_C(1) << 21); /* previous carry, scaled up by 2^21 */
+        int64_t b = synth_buf2[i + 16] * (INT64_C(1) << 21);
+        int64_t c = 0; /* c, d build the carry for the next call */
+        int64_t d = 0;
+        for (j = 0; j < 512 - *synth_buf_offset; j += 64) { /* taps read without wrapping */
+            a += (int64_t)window[i + j     ] * synth_buf[     i + j      ];
+            b += (int64_t)window[i + j + 16] * synth_buf[15 - i + j      ];
+            c += (int64_t)window[i + j + 32] * synth_buf[16 + i + j      ];
+            d += (int64_t)window[i + j + 48] * synth_buf[31 - i + j      ];
+        }
+        for (     ; j < 512; j += 64) { /* remaining taps wrap: index - 512 */
+            a += (int64_t)window[i + j     ] * synth_buf[     i + j - 512];
+            b += (int64_t)window[i + j + 16] * synth_buf[15 - i + j - 512];
+            c += (int64_t)window[i + j + 32] * synth_buf[16 + i + j - 512];
+            d += (int64_t)window[i + j + 48] * synth_buf[31 - i + j - 512];
+        }
+        out[i     ] = clip23(norm21(a)); /* NOTE(review): norm21/clip23 presumably from dcamath.h; assumed to undo the 2^21 scale and clip - confirm */
+        out[i + 16] = clip23(norm21(b));
+        synth_buf2[i     ] = norm21(c); /* carry is stored normalised but not clipped */
+        synth_buf2[i + 16] = norm21(d);
+    }
+
+    *synth_buf_offset = (*synth_buf_offset - 32) & 511; /* step back 32 entries, modulo 512 */
+}
+
+static void synth_filter_fixed_64(DCADCTContext *imdct,
+                                  int32_t *synth_buf_ptr, int *synth_buf_offset,
+                                  int32_t synth_buf2[64], const int32_t window[1024],
+                                  int32_t out[64], const int32_t in[64])
+{ /* integer variant: 64 values in, 64 out, 1024-entry history and window */
+    int32_t *synth_buf = synth_buf_ptr + *synth_buf_offset; /* slot at the current offset in the history */
+    int i, j;
+
+    imdct->imdct_half[1](synth_buf, in); /* entry 1 of the imdct_half table; the 32-value variant uses entry 0 */
+
+    for (i = 0; i < 32; i++) { /* each pass produces out[i] and out[i + 32] */
+        int64_t a = synth_buf2[i     ] * (INT64_C(1) << 20); /* previous carry, scaled up by 2^20 */
+        int64_t b = synth_buf2[i + 32] * (INT64_C(1) << 20);
+        int64_t c = 0; /* c, d build the carry for the next call */
+        int64_t d = 0;
+        for (j = 0; j < 1024 - *synth_buf_offset; j += 128) { /* taps read without wrapping */
+            a += (int64_t)window[i + j     ] * synth_buf[     i + j       ];
+            b += (int64_t)window[i + j + 32] * synth_buf[31 - i + j       ];
+            c += (int64_t)window[i + j + 64] * synth_buf[32 + i + j       ];
+            d += (int64_t)window[i + j + 96] * synth_buf[63 - i + j       ];
+        }
+        for (     ; j < 1024; j += 128) { /* remaining taps wrap: index - 1024 */
+            a += (int64_t)window[i + j     ] * synth_buf[     i + j - 1024];
+            b += (int64_t)window[i + j + 32] * synth_buf[31 - i + j - 1024];
+            c += (int64_t)window[i + j + 64] * synth_buf[32 + i + j - 1024];
+            d += (int64_t)window[i + j + 96] * synth_buf[63 - i + j - 1024];
+        }
+        out[i     ] = clip23(norm20(a)); /* NOTE(review): norm20/clip23 presumably from dcamath.h; assumed to undo the 2^20 scale and clip - confirm */
+        out[i + 32] = clip23(norm20(b));
+        synth_buf2[i     ] = norm20(c); /* carry is stored normalised but not clipped */
+        synth_buf2[i + 32] = norm20(d);
+    }
+
+    *synth_buf_offset = (*synth_buf_offset - 64) & 1023; /* step back 64 entries, modulo 1024 */
 }
 
 av_cold void ff_synth_filter_init(SynthFilterContext *c)
 {
-    c->synth_filter_float = synth_filter_float;
+    c->synth_filter_float    = synth_filter_float;
+    c->synth_filter_float_64 = synth_filter_float_64;
+    c->synth_filter_fixed    = synth_filter_fixed;
+    c->synth_filter_fixed_64 = synth_filter_fixed_64;
 
     if (ARCH_AARCH64)
         ff_synth_filter_init_aarch64(c);
diff --git a/libavcodec/synth_filter.h b/libavcodec/synth_filter.h
index a93dc4f..df3589a 100644
--- a/libavcodec/synth_filter.h
+++ b/libavcodec/synth_filter.h
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2008 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -22,6 +22,7 @@
 #define AVCODEC_SYNTH_FILTER_H
 
 #include "fft.h"
+#include "dcadct.h"
 
 typedef struct SynthFilterContext {
     void (*synth_filter_float)(FFTContext *imdct,
@@ -29,6 +30,18 @@ typedef struct SynthFilterContext {
                                float synth_buf2[32], const float window[512],
                                float out[32], const float in[32],
                                float scale);
+    void (*synth_filter_float_64)(FFTContext *imdct,
+                                  float *synth_buf_ptr, int *synth_buf_offset,
+                                  float synth_buf2[64], const float window[1024],
+                                  float out[64], const float in[64], float scale);
+    void (*synth_filter_fixed)(DCADCTContext *imdct,
+                               int32_t *synth_buf_ptr, int *synth_buf_offset,
+                               int32_t synth_buf2[32], const int32_t window[512],
+                               int32_t out[32], const int32_t in[32]);
+    void (*synth_filter_fixed_64)(DCADCTContext *imdct,
+                                  int32_t *synth_buf_ptr, int *synth_buf_offset,
+                                  int32_t synth_buf2[64], const int32_t window[1024],
+                                  int32_t out[64], const int32_t in[64]);
 } SynthFilterContext;
 
 void ff_synth_filter_init(SynthFilterContext *c);
diff --git a/libavcodec/tableprint.h b/libavcodec/tableprint.h
index daa89fe..6f61c71 100644
--- a/libavcodec/tableprint.h
+++ b/libavcodec/tableprint.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -64,6 +64,7 @@ void write_int8_t_array     (const int8_t   *, int);
 void write_uint8_t_array    (const uint8_t  *, int);
 void write_uint16_t_array   (const uint16_t *, int);
 void write_uint32_t_array   (const uint32_t *, int);
+void write_int32_t_array    (const int32_t  *, int);
 void write_float_array      (const float    *, int);
 void write_int8_t_2d_array  (const void *, int, int);
 void write_uint8_t_2d_array (const void *, int, int);
@@ -81,6 +82,16 @@ void write_float_2d_array   (const void *, int, int);
 #define FMT "zu"
 #endif
 
+#define WRITE_ARRAY_ALIGNED(prefix, align, type, name)  \
+    do { /* emit name as a DECLARE_ALIGNED table */     \
+        const size_t array_size = FF_ARRAY_ELEMS(name); \
+        printf(prefix" DECLARE_ALIGNED("#align", "      \
+               #type", "#name")[%"FMT"] = {\n",         \
+               array_size);                             \
+        write_##type##_array(name, array_size);         \
+        printf("};\n");                                 \
+    } while(0)
+
 #define WRITE_ARRAY(prefix, type, name)                 \
     do {                                                \
         const size_t array_size = FF_ARRAY_ELEMS(name); \
@@ -104,7 +115,9 @@ void write_float_2d_array   (const void *, int, int);
 WRITE_1D_FUNC(int8_t,   "%3"PRIi8, 15)
 WRITE_1D_FUNC(uint8_t,  "0x%02"PRIx8, 15)
 WRITE_1D_FUNC(uint16_t, "0x%08"PRIx16, 7)
+WRITE_1D_FUNC(int16_t,  "%5"PRIi16, 7)
 WRITE_1D_FUNC(uint32_t, "0x%08"PRIx32, 7)
+WRITE_1D_FUNC(int32_t,  "0x%08"PRIx32, 7)
 WRITE_1D_FUNC(float,    "%.18e", 3)
 
 WRITE_2D_FUNC(int8_t)
diff --git a/libavcodec/tableprint_vlc.h b/libavcodec/tableprint_vlc.h
new file mode 100644
index 0000000..675251a
--- /dev/null
+++ b/libavcodec/tableprint_vlc.h
@@ -0,0 +1,82 @@
+/*
+ * Helpers for generating hard-coded VLC tables
+ *
+ * Copyright (c) 2014 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_TABLEPRINT_VLC_H
+#define AVCODEC_TABLEPRINT_VLC_H
+
+#define FFMPEG_CONFIG_H
+#define AVUTIL_LOG_H
+#define av_log(a, ...) while(0)
+#define ff_dlog(a, ...) while(0)
+#define AVUTIL_MEM_H
+#define av_malloc(s) NULL
+#define av_malloc_array(a, b) NULL
+#define av_realloc_f(p, o, n) NULL
+#define av_free(p) while(0)
+#define av_freep(p) while(0)
+#define AVCODEC_AVCODEC_H
+#define AVCODEC_INTERNAL_H
+#include "tableprint.h"
+#include "get_bits.h"
+#include "mathtables.c"
+#include "libavutil/reverse.c"
+#include "bitstream.c"
+
+#define REPLACE_DEFINE2(type) write_##type##_array /* pastes the type name into a writer function name */
+#define REPLACE_DEFINE(type) REPLACE_DEFINE2(type) /* extra level so a macro argument is expanded before pasting */
+static void write_VLC_TYPE_array(const VLC_TYPE *p, int s) { /* writer looked up by WRITE_2D_FUNC(VLC_TYPE) under the unexpanded name */
+    REPLACE_DEFINE(VLC_TYPE)(p, s); /* forwards to write_<T>_array; assumes VLC_TYPE is a macro for a type T that tableprint.h has a writer for - confirm */
+}
+
+WRITE_2D_FUNC(VLC_TYPE)
+
+static void write_vlc_type(const VLC *vlc, VLC_TYPE (*base_table)[2], const char *base_table_name)
+{ /* prints the designated-initializer fields of one VLC via printf; braces are printed by the caller */
+    printf("    .bits = %i,\n", vlc->bits);
+    // Unfortunately need to cast away const currently; the table is written as base_table_name plus the row offset of vlc->table inside base_table
+    printf("    .table = (VLC_TYPE (*)[2])(%s + 0x%x),\n", base_table_name, (int)(vlc->table - base_table));
+    printf("    .table_size = 0x%x,\n", vlc->table_size);
+    printf("    .table_allocated = 0x%x,\n", vlc->table_allocated);
+}
+
+#define WRITE_VLC_TYPE(prefix, name, base_table)        \
+    do { /* emit: prefix VLC name = { ... }; */         \
+        printf(prefix" VLC "#name" = {\n");             \
+        write_vlc_type(&name, base_table, #base_table); \
+        printf("};\n");                                 \
+    } while(0)
+
+#define WRITE_VLC_ARRAY(prefix, name, base_table)       \
+    do { /* emit an array of VLC initializers */        \
+        int i;                                          \
+        const size_t array_size = FF_ARRAY_ELEMS(name); \
+        printf(prefix" VLC "#name"[%"FMT"] = {{\n",     \
+               array_size);                             \
+        for (i = 0; i < array_size; i++) {              \
+            write_vlc_type(name + i,                    \
+                           base_table, #base_table);    \
+            if (i != array_size - 1) printf("}, {\n");  \
+        }                                               \
+        printf("}};\n");                                \
+    } while(0)
+
+#endif /* AVCODEC_TABLEPRINT_VLC_H */
diff --git a/libavcodec/tak.c b/libavcodec/tak.c
index 867a84b..ed41ca8 100644
--- a/libavcodec/tak.c
+++ b/libavcodec/tak.c
@@ -2,28 +2,49 @@
  * TAK common code
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "libavutil/bswap.h"
 #include "libavutil/crc.h"
 #include "libavutil/intreadwrite.h"
 #include "tak.h"
 
+static const int64_t tak_channel_layouts[] = { /* indexed by the per-channel layout value that avpriv_tak_parse_streaminfo() reads from the stream */
+    0, /* value 0 adds no bit to the channel mask */
+    AV_CH_FRONT_LEFT,
+    AV_CH_FRONT_RIGHT,
+    AV_CH_FRONT_CENTER,
+    AV_CH_LOW_FREQUENCY,
+    AV_CH_BACK_LEFT,
+    AV_CH_BACK_RIGHT,
+    AV_CH_FRONT_LEFT_OF_CENTER,
+    AV_CH_FRONT_RIGHT_OF_CENTER,
+    AV_CH_BACK_CENTER,
+    AV_CH_SIDE_LEFT,
+    AV_CH_SIDE_RIGHT,
+    AV_CH_TOP_CENTER,
+    AV_CH_TOP_FRONT_LEFT,
+    AV_CH_TOP_FRONT_CENTER,
+    AV_CH_TOP_FRONT_RIGHT,
+    AV_CH_TOP_BACK_LEFT,
+    AV_CH_TOP_BACK_CENTER,
+    AV_CH_TOP_BACK_RIGHT, /* values past this entry are ignored by the bounds check at the lookup site */
+};
+
 static const uint16_t frame_duration_type_quants[] = {
     3, 4, 6, 8, 4096, 8192, 16384, 512, 1024, 2048,
 };
@@ -51,22 +72,6 @@ static int tak_get_nb_samples(int sample_rate, enum TAKFrameSizeType type)
     return nb_samples;
 }
 
-static int crc_init = 0;
-#if CONFIG_SMALL
-#define CRC_TABLE_SIZE 257
-#else
-#define CRC_TABLE_SIZE 1024
-#endif
-static AVCRC crc_24[CRC_TABLE_SIZE];
-
-av_cold void ff_tak_init_crc(void)
-{
-    if (!crc_init) {
-        av_crc_init(crc_24, 0, 24, 0x864CFBU, sizeof(crc_24));
-        crc_init = 1;
-    }
-}
-
 int ff_tak_check_crc(const uint8_t *buf, unsigned int buf_size)
 {
     uint32_t crc, CRC;
@@ -75,8 +80,8 @@ int ff_tak_check_crc(const uint8_t *buf, unsigned int buf_size)
         return AVERROR_INVALIDDATA;
     buf_size -= 3;
 
-    CRC = av_bswap32(AV_RL24(buf + buf_size)) >> 8;
-    crc = av_crc(crc_24, 0xCE04B7U, buf, buf_size);
+    CRC = AV_RB24(buf + buf_size);
+    crc = av_crc(av_crc_get_table(AV_CRC_24_IEEE), 0xCE04B7U, buf, buf_size);
     if (CRC != crc)
         return AVERROR_INVALIDDATA;
 
@@ -108,8 +113,8 @@ void avpriv_tak_parse_streaminfo(GetBitContext *gb, TAKStreamInfo *s)
             for (i = 0; i < s->channels; i++) {
                 int value = get_bits(gb, TAK_FORMAT_CH_LAYOUT_BITS);
 
-                if (value > 0 && value <= 18)
-                    channel_mask |= 1 << (value - 1);
+                if (value < FF_ARRAY_ELEMS(tak_channel_layouts))
+                    channel_mask |= tak_channel_layouts[value];
             }
         }
     }
@@ -144,6 +149,9 @@ int ff_tak_decode_frame_header(AVCodecContext *avctx, GetBitContext *gb,
         align_get_bits(gb);
     }
 
+    if (ti->flags & TAK_FRAME_FLAG_HAS_METADATA)
+        return AVERROR_INVALIDDATA;
+
     skip_bits(gb, 24);
 
     return 0;
diff --git a/libavcodec/tak.h b/libavcodec/tak.h
index fa91149..e8e2dac 100644
--- a/libavcodec/tak.h
+++ b/libavcodec/tak.h
@@ -2,20 +2,20 @@
  * TAK decoder/demuxer common code
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -99,7 +99,7 @@
 
 enum TAKCodecType {
     TAK_CODEC_MONO_STEREO  = 2,
-    TAK_CODEC_MULTICHANNEL = 4
+    TAK_CODEC_MULTICHANNEL = 4,
 };
 
 enum TAKMetaDataType {
@@ -140,8 +140,6 @@ typedef struct TAKStreamInfo {
     int64_t           samples;
 } TAKStreamInfo;
 
-void ff_tak_init_crc(void);
-
 int ff_tak_check_crc(const uint8_t *buf, unsigned int buf_size);
 
 /**
@@ -162,5 +160,4 @@ void avpriv_tak_parse_streaminfo(GetBitContext *gb, TAKStreamInfo *s);
  */
 int ff_tak_decode_frame_header(AVCodecContext *avctx, GetBitContext *gb,
                                TAKStreamInfo *s, int log_level_offset);
-
 #endif /* AVCODEC_TAK_H */
diff --git a/libavcodec/tak_parser.c b/libavcodec/tak_parser.c
index 295df24..1417fb4 100644
--- a/libavcodec/tak_parser.c
+++ b/libavcodec/tak_parser.c
@@ -2,20 +2,20 @@
  * TAK parser
  * Copyright (c) 2012 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,12 +33,6 @@ typedef struct TAKParseContext {
     int           index;
 } TAKParseContext;
 
-static av_cold int tak_init(AVCodecParserContext *s)
-{
-    ff_tak_init_crc();
-    return 0;
-}
-
 static int tak_parse(AVCodecParserContext *s, AVCodecContext *avctx,
                      const uint8_t **poutbuf, int *poutbuf_size,
                      const uint8_t *buf, int buf_size)
@@ -49,10 +43,12 @@ static int tak_parse(AVCodecParserContext *s, AVCodecContext *avctx,
     GetBitContext gb;
     int consumed = 0;
     int needed   = buf_size ? TAK_MAX_FRAME_HEADER_BYTES : 8;
+    int ret;
 
     if (s->flags & PARSER_FLAG_COMPLETE_FRAMES) {
         TAKStreamInfo ti;
-        init_get_bits(&gb, buf, buf_size);
+        if ((ret = init_get_bits8(&gb, buf, buf_size)) < 0)
+            return ret;
         if (!ff_tak_decode_frame_header(avctx, &gb, &ti, 127))
             s->duration = t->ti.last_frame_samples ? t->ti.last_frame_samples
                                                    : t->ti.frame_samples;
@@ -63,27 +59,27 @@ static int tak_parse(AVCodecParserContext *s, AVCodecContext *avctx,
 
     while (buf_size || t->index + needed <= pc->index) {
         if (buf_size && t->index + TAK_MAX_FRAME_HEADER_BYTES > pc->index) {
-            int tmp_buf_size       = FFMIN(2 * TAK_MAX_FRAME_HEADER_BYTES,
+            int tmp_buf_size       = FFMIN(TAK_MAX_FRAME_HEADER_BYTES,
                                            buf_size);
             const uint8_t *tmp_buf = buf;
 
-            ff_combine_frame(pc, END_NOT_FOUND, &tmp_buf, &tmp_buf_size);
+            if (ff_combine_frame(pc, END_NOT_FOUND, &tmp_buf, &tmp_buf_size) != -1)
+                return AVERROR(ENOMEM);
             consumed += tmp_buf_size;
             buf      += tmp_buf_size;
             buf_size -= tmp_buf_size;
         }
 
-        for (; t->index + needed <= pc->index; t->index++)
-            if (pc->buffer[t->index]     == 0xFF &&
-                pc->buffer[t->index + 1] == 0xA0) {
+        for (; t->index + needed <= pc->index; t->index++) {
+            if (pc->buffer[ t->index     ] == 0xFF &&
+                pc->buffer[ t->index + 1 ] == 0xA0) {
                 TAKStreamInfo ti;
 
-                init_get_bits(&gb, pc->buffer + t->index,
-                              8 * (pc->index - t->index));
+                if ((ret = init_get_bits8(&gb, pc->buffer + t->index,
+                                          pc->index - t->index)) < 0)
+                    return ret;
                 if (!ff_tak_decode_frame_header(avctx, &gb,
-                                                pc->frame_start_found ? &ti
-                                                                      : &t->ti,
-                                                127) &&
+                        pc->frame_start_found ? &ti : &t->ti, 127) &&
                     !ff_tak_check_crc(pc->buffer + t->index,
                                       get_bits_count(&gb) / 8)) {
                     if (!pc->frame_start_found) {
@@ -91,6 +87,7 @@ static int tak_parse(AVCodecParserContext *s, AVCodecContext *avctx,
                         s->duration           = t->ti.last_frame_samples ?
                                                 t->ti.last_frame_samples :
                                                 t->ti.frame_samples;
+                        s->key_frame          = !!(t->ti.flags & TAK_FRAME_FLAG_HAS_INFO);
                     } else {
                         pc->frame_start_found = 0;
                         next                  = t->index - pc->index;
@@ -99,9 +96,10 @@ static int tak_parse(AVCodecParserContext *s, AVCodecContext *avctx,
                     }
                 }
             }
+        }
     }
-
 found:
+
     if (consumed && !buf_size && next == END_NOT_FOUND ||
         ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
         *poutbuf      = NULL;
@@ -122,7 +120,6 @@ found:
 AVCodecParser ff_tak_parser = {
     .codec_ids      = { AV_CODEC_ID_TAK },
     .priv_data_size = sizeof(TAKParseContext),
-    .parser_init    = tak_init,
     .parser_parse   = tak_parse,
     .parser_close   = ff_parse_close,
 };
diff --git a/libavcodec/takdec.c b/libavcodec/takdec.c
index 93098be..023bc87 100644
--- a/libavcodec/takdec.c
+++ b/libavcodec/takdec.c
@@ -2,20 +2,20 @@
  * TAK decoder
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,43 +28,50 @@
 #include "libavutil/internal.h"
 #include "libavutil/samplefmt.h"
 #include "tak.h"
+#include "takdsp.h"
 #include "audiodsp.h"
+#include "thread.h"
 #include "avcodec.h"
 #include "internal.h"
 #include "unary.h"
 
-#define MAX_SUBFRAMES     8                         // max number of subframes per channel
+#define MAX_SUBFRAMES     8                         ///< max number of subframes per channel
 #define MAX_PREDICTORS  256
 
 typedef struct MCDParam {
-    int8_t present;                                 // decorrelation parameter availability for this channel
-    int8_t index;                                   // index into array of decorrelation types
+    int8_t present;                                 ///< decorrelation parameter availability for this channel
+    int8_t index;                                   ///< index into array of decorrelation types
     int8_t chan1;
     int8_t chan2;
 } MCDParam;
 
 typedef struct TAKDecContext {
-    AVCodecContext *avctx;                          // parent AVCodecContext
+    AVCodecContext *avctx;                          ///< parent AVCodecContext
     AudioDSPContext adsp;
+    TAKDSPContext   tdsp;                           ///< set up by ff_takdsp_init() in tak_decode_init()
     TAKStreamInfo   ti;
-    GetBitContext   gb;                             // bitstream reader initialized to start at the current frame
+    GetBitContext   gb;                             ///< bitstream reader initialized to start at the current frame
 
     int             uval;
-    int             nb_samples;                     // number of samples in the current frame
+    int             nb_samples;                     ///< number of samples in the current frame
     uint8_t        *decode_buffer;
     unsigned int    decode_buffer_size;
-    int32_t        *decoded[TAK_MAX_CHANNELS];      // decoded samples for each channel
+    int32_t        *decoded[TAK_MAX_CHANNELS];      ///< decoded samples for each channel
 
     int8_t          lpc_mode[TAK_MAX_CHANNELS];
-    int8_t          sample_shift[TAK_MAX_CHANNELS]; // shift applied to every sample in the channel
+    int8_t          sample_shift[TAK_MAX_CHANNELS]; ///< shift applied to every sample in the channel
+    int16_t         predictors[MAX_PREDICTORS];     ///< NOTE(review): presumably per-subframe predictor coefficients - confirm against the decoder
+    int             nb_subframes;                   ///< number of subframes in the current frame
+    int16_t         subframe_len[MAX_SUBFRAMES];    ///< subframe length in samples
     int             subframe_scale;
 
-    int8_t          dmode;                          // channel decorrelation type in the current frame
+    int8_t          dmode;                          ///< channel decorrelation type in the current frame
 
-    MCDParam        mcdparams[TAK_MAX_CHANNELS];    // multichannel decorrelation parameters
+    MCDParam        mcdparams[TAK_MAX_CHANNELS];    ///< multichannel decorrelation parameters
 
-    int16_t        *residues;
-    unsigned int    residues_buf_size;
+    int8_t          coding_mode[128];
+    DECLARE_ALIGNED(16, int16_t, filter)[MAX_PREDICTORS];
+    DECLARE_ALIGNED(16, int16_t, residues)[544];    ///< fixed-size array; replaces the former residues pointer + residues_buf_size pair
 } TAKDecContext;
 
 static const int8_t mc_dmodes[] = { 1, 3, 4, 6, };
@@ -132,14 +139,9 @@ static const struct CParam {
     { 0x1A, 0x1800000, 0x1800000, 0x6800000, 0xC000000 },
 };
 
-static av_cold void tak_init_static_data(AVCodec *codec)
-{
-    ff_tak_init_crc();
-}
-
 static int set_bps_params(AVCodecContext *avctx)
 {
-    switch (avctx->bits_per_coded_sample) {
+    switch (avctx->bits_per_raw_sample) {
     case 8:
         avctx->sample_fmt = AV_SAMPLE_FMT_U8P;
         break;
@@ -150,11 +152,10 @@ static int set_bps_params(AVCodecContext *avctx)
         avctx->sample_fmt = AV_SAMPLE_FMT_S32P;
         break;
     default:
-        av_log(avctx, AV_LOG_ERROR, "unsupported bits per sample: %d\n",
-               avctx->bits_per_coded_sample);
+        av_log(avctx, AV_LOG_ERROR, "invalid/unsupported bits per sample: %d\n",
+               avctx->bits_per_raw_sample);
         return AVERROR_INVALIDDATA;
     }
-    avctx->bits_per_raw_sample = avctx->bits_per_coded_sample;
 
     return 0;
 }
@@ -162,8 +163,17 @@ static int set_bps_params(AVCodecContext *avctx)
 static void set_sample_rate_params(AVCodecContext *avctx)
 {
     TAKDecContext *s  = avctx->priv_data;
-    int shift         = 3 - (avctx->sample_rate / 11025);
-    shift             = FFMAX(0, shift);
+    int shift; /* exponent applied to the uval base below; larger for lower sample rates */
+
+    if (avctx->sample_rate < 11025) {
+        shift = 3;
+    } else if (avctx->sample_rate < 22050) {
+        shift = 2;
+    } else if (avctx->sample_rate < 44100) {
+        shift = 1; /* the replaced formula yielded 0 for rates in [33075, 44100) */
+    } else {
+        shift = 0;
+    }
     s->uval           = FFALIGN(avctx->sample_rate + 511 >> 9, 4) << shift;
     s->subframe_scale = FFALIGN(avctx->sample_rate + 511 >> 9, 4) << 1;
 }
@@ -173,8 +183,10 @@ static av_cold int tak_decode_init(AVCodecContext *avctx)
     TAKDecContext *s = avctx->priv_data;
 
     ff_audiodsp_init(&s->adsp);
+    ff_takdsp_init(&s->tdsp);
 
     s->avctx = avctx;
+    avctx->bits_per_raw_sample = avctx->bits_per_coded_sample;
 
     set_sample_rate_params(avctx);
 
@@ -224,6 +236,7 @@ static void decode_lpc(int32_t *coeffs, int mode, int length)
             int a3  = coeffs[2];
             int a4  = a3 + a1;
             int a5  = a4 + a2;
+            coeffs[2] = a5;
             coeffs += 3;
             for (i = 0; i < length - 3; i++) {
                 a3     += *coeffs;
@@ -236,10 +249,10 @@ static void decode_lpc(int32_t *coeffs, int mode, int length)
     }
 }
 
-static int decode_segment(GetBitContext *gb, int mode, int32_t *decoded,
-                          int len)
+static int decode_segment(TAKDecContext *s, int8_t mode, int32_t *decoded, int len)
 {
     struct CParam code;
+    GetBitContext *gb = &s->gb;
     int i;
 
     if (!mode) {
@@ -290,7 +303,6 @@ static int decode_residues(TAKDecContext *s, int32_t *decoded, int length)
 
     if (get_bits1(gb)) {
         int wlength, rval;
-        int coding_mode[128];
 
         wlength = length / s->uval;
 
@@ -304,7 +316,7 @@ static int decode_residues(TAKDecContext *s, int32_t *decoded, int length)
         if (wlength <= 1 || wlength > 128)
             return AVERROR_INVALIDDATA;
 
-        coding_mode[0] = mode = get_bits(gb, 6);
+        s->coding_mode[0] = mode = get_bits(gb, 6);
 
         for (i = 1; i < wlength; i++) {
             int c = get_unary(gb, 1, 6);
@@ -328,14 +340,14 @@ static int decode_residues(TAKDecContext *s, int32_t *decoded, int length)
                 mode--;
                 break;
             }
-            coding_mode[i] = mode;
+            s->coding_mode[i] = mode;
         }
 
         i = 0;
         while (i < wlength) {
             int len = 0;
 
-            mode = coding_mode[i];
+            mode = s->coding_mode[i];
             do {
                 if (i >= wlength - 1)
                     len += rval;
@@ -345,15 +357,15 @@ static int decode_residues(TAKDecContext *s, int32_t *decoded, int length)
 
                 if (i == wlength)
                     break;
-            } while (coding_mode[i] == mode);
+            } while (s->coding_mode[i] == mode);
 
-            if ((ret = decode_segment(gb, mode, decoded, len)) < 0)
+            if ((ret = decode_segment(s, mode, decoded, len)) < 0)
                 return ret;
             decoded += len;
         }
     } else {
         mode = get_bits(gb, 6);
-        if ((ret = decode_segment(gb, mode, decoded, length)) < 0)
+        if ((ret = decode_segment(s, mode, decoded, length)) < 0)
             return ret;
     }
 
@@ -368,62 +380,13 @@ static int get_bits_esc4(GetBitContext *gb)
         return 0;
 }
 
-static void decode_filter_coeffs(TAKDecContext *s, int filter_order, int size,
-                                 int filter_quant, int16_t *filter)
-{
-    GetBitContext *gb = &s->gb;
-    int i, j, a, b;
-    int filter_tmp[MAX_PREDICTORS];
-    int16_t predictors[MAX_PREDICTORS];
-
-    predictors[0] = get_sbits(gb, 10);
-    predictors[1] = get_sbits(gb, 10);
-    predictors[2] = get_sbits(gb, size) << (10 - size);
-    predictors[3] = get_sbits(gb, size) << (10 - size);
-    if (filter_order > 4) {
-        int av_uninit(code_size);
-        int code_size_base = size - get_bits1(gb);
-
-        for (i = 4; i < filter_order; i++) {
-            if (!(i & 3))
-                code_size = code_size_base - get_bits(gb, 2);
-            predictors[i] = get_sbits(gb, code_size) << (10 - size);
-        }
-    }
-
-    filter_tmp[0] = predictors[0] << 6;
-    for (i = 1; i < filter_order; i++) {
-        int *p1 = &filter_tmp[0];
-        int *p2 = &filter_tmp[i - 1];
-
-        for (j = 0; j < (i + 1) / 2; j++) {
-            int tmp = *p1 + (predictors[i] * *p2 + 256 >> 9);
-            *p2     = *p2 + (predictors[i] * *p1 + 256 >> 9);
-            *p1     = tmp;
-            p1++;
-            p2--;
-        }
-
-        filter_tmp[i] = predictors[i] << 6;
-    }
-
-    a = 1 << (32 - (15 - filter_quant));
-    b = 1 << ((15 - filter_quant) - 1);
-    for (i = 0, j = filter_order - 1; i < filter_order / 2; i++, j--) {
-        filter[j] = a - ((filter_tmp[i] + b) >> (15 - filter_quant));
-        filter[i] = a - ((filter_tmp[j] + b) >> (15 - filter_quant));
-    }
-}
-
 static int decode_subframe(TAKDecContext *s, int32_t *decoded,
                            int subframe_size, int prev_subframe_size)
 {
-    LOCAL_ALIGNED_16(int16_t, filter, [MAX_PREDICTORS]);
     GetBitContext *gb = &s->gb;
-    int i, ret;
+    int x, y, i, j, ret = 0;
     int dshift, size, filter_quant, filter_order;
-
-    memset(filter, 0, MAX_PREDICTORS * sizeof(*filter));
+    int tfilter[MAX_PREDICTORS];
 
     if (!get_bits1(gb))
         return decode_residues(s, decoded, subframe_size);
@@ -466,30 +429,74 @@ static int decode_subframe(TAKDecContext *s, int32_t *decoded,
             return AVERROR_INVALIDDATA;
     }
 
-    decode_filter_coeffs(s, filter_order, size, filter_quant, filter);
+    s->predictors[0] = get_sbits(gb, 10);
+    s->predictors[1] = get_sbits(gb, 10);
+    s->predictors[2] = get_sbits(gb, size) << (10 - size);
+    s->predictors[3] = get_sbits(gb, size) << (10 - size);
+    if (filter_order > 4) {
+        int tmp = size - get_bits1(gb);
+
+        for (i = 4; i < filter_order; i++) {
+            if (!(i & 3))
+                x = tmp - get_bits(gb, 2);
+            s->predictors[i] = get_sbits(gb, x) << (10 - size);
+        }
+    }
+
+    tfilter[0] = s->predictors[0] << 6;
+    for (i = 1; i < filter_order; i++) {
+        int32_t *p1 = &tfilter[0];
+        int32_t *p2 = &tfilter[i - 1];
+
+        for (j = 0; j < (i + 1) / 2; j++) {
+            x     = *p1 + (s->predictors[i] * *p2 + 256 >> 9);
+            *p2  += s->predictors[i] * *p1 + 256 >> 9;
+            *p1++ = x;
+            p2--;
+        }
+
+        tfilter[i] = s->predictors[i] << 6;
+    }
+
+    x = 1 << (32 - (15 - filter_quant));
+    y = 1 << ((15 - filter_quant) - 1);
+    for (i = 0, j = filter_order - 1; i < filter_order / 2; i++, j--) {
+        s->filter[j] = x - ((tfilter[i] + y) >> (15 - filter_quant));
+        s->filter[i] = x - ((tfilter[j] + y) >> (15 - filter_quant));
+    }
 
     if ((ret = decode_residues(s, &decoded[filter_order],
                                subframe_size - filter_order)) < 0)
         return ret;
 
-    av_fast_malloc(&s->residues, &s->residues_buf_size,
-                   FFALIGN(subframe_size + 16, 16) * sizeof(*s->residues));
-    if (!s->residues)
-        return AVERROR(ENOMEM);
-    memset(s->residues, 0, s->residues_buf_size);
-
     for (i = 0; i < filter_order; i++)
         s->residues[i] = *decoded++ >> dshift;
 
-    for (i = 0; i < subframe_size - filter_order; i++) {
-        int v = 1 << (filter_quant - 1);
-
-        v += s->adsp.scalarproduct_int16(&s->residues[i], filter,
-                                         FFALIGN(filter_order, 16));
+    y    = FF_ARRAY_ELEMS(s->residues) - filter_order;
+    x    = subframe_size - filter_order;
+    while (x > 0) {
+        int tmp = FFMIN(y, x);
+
+        for (i = 0; i < tmp; i++) {
+            int v = 1 << (filter_quant - 1);
+
+            if (filter_order & -16)
+                v += s->adsp.scalarproduct_int16(&s->residues[i], s->filter,
+                                                 filter_order & -16);
+            for (j = filter_order & -16; j < filter_order; j += 4) {
+                v += s->residues[i + j + 3] * s->filter[j + 3] +
+                     s->residues[i + j + 2] * s->filter[j + 2] +
+                     s->residues[i + j + 1] * s->filter[j + 1] +
+                     s->residues[i + j    ] * s->filter[j    ];
+            }
+            v = (av_clip_intp2(v >> filter_quant, 13) << dshift) - *decoded;
+            *decoded++ = v;
+            s->residues[filter_order + i] = v >> dshift;
+        }
 
-        v = (av_clip_intp2(v >> filter_quant, 13) << dshift) - *decoded;
-        *decoded++ = v;
-        s->residues[filter_order + i] = v >> dshift;
+        x -= tmp;
+        if (x > 0)
+            memcpy(s->residues, &s->residues[y], 2 * filter_order);
     }
 
     emms_c();
@@ -503,50 +510,42 @@ static int decode_channel(TAKDecContext *s, int chan)
     GetBitContext *gb     = &s->gb;
     int32_t *decoded      = s->decoded[chan];
     int left              = s->nb_samples - 1;
-    int i, prev, ret, nb_subframes;
-    int subframe_len[MAX_SUBFRAMES];
+    int i = 0, ret, prev = 0;
 
     s->sample_shift[chan] = get_bits_esc4(gb);
-    if (s->sample_shift[chan] >= avctx->bits_per_coded_sample)
+    if (s->sample_shift[chan] >= avctx->bits_per_raw_sample)
         return AVERROR_INVALIDDATA;
 
-    /* NOTE: TAK 2.2.0 appears to set the sample value to 0 if
-     *       bits_per_coded_sample - sample_shift is 1, but this produces
-     *       non-bit-exact output. Reading the 1 bit using get_sbits() instead
-     *       of skipping it produces bit-exact output. This has been reported
-     *       to the TAK author. */
-    *decoded++        = get_sbits(gb,
-                                  avctx->bits_per_coded_sample -
-                                  s->sample_shift[chan]);
+    *decoded++ = get_sbits(gb, avctx->bits_per_raw_sample - s->sample_shift[chan]);
     s->lpc_mode[chan] = get_bits(gb, 2);
-    nb_subframes      = get_bits(gb, 3) + 1;
+    s->nb_subframes   = get_bits(gb, 3) + 1;
 
-    i = 0;
-    if (nb_subframes > 1) {
-        if (get_bits_left(gb) < (nb_subframes - 1) * 6)
+    if (s->nb_subframes > 1) {
+        if (get_bits_left(gb) < (s->nb_subframes - 1) * 6)
             return AVERROR_INVALIDDATA;
 
-        prev = 0;
-        for (; i < nb_subframes - 1; i++) {
-            int subframe_end = get_bits(gb, 6) * s->subframe_scale;
-            if (subframe_end <= prev)
+        for (; i < s->nb_subframes - 1; i++) {
+            int v = get_bits(gb, 6);
+
+            s->subframe_len[i] = (v - prev) * s->subframe_scale;
+            if (s->subframe_len[i] <= 0)
                 return AVERROR_INVALIDDATA;
-            subframe_len[i] = subframe_end - prev;
-            left           -= subframe_len[i];
-            prev            = subframe_end;
+
+            left -= s->subframe_len[i];
+            prev  = v;
         }
 
         if (left <= 0)
             return AVERROR_INVALIDDATA;
     }
-    subframe_len[i] = left;
+    s->subframe_len[i] = left;
 
     prev = 0;
-    for (i = 0; i < nb_subframes; i++) {
-        if ((ret = decode_subframe(s, decoded, subframe_len[i], prev)) < 0)
+    for (i = 0; i < s->nb_subframes; i++) {
+        if ((ret = decode_subframe(s, decoded, s->subframe_len[i], prev)) < 0)
             return ret;
-        decoded += subframe_len[i];
-        prev     = subframe_len[i];
+        decoded += s->subframe_len[i];
+        prev     = s->subframe_len[i];
     }
 
     return 0;
@@ -555,55 +554,38 @@ static int decode_channel(TAKDecContext *s, int chan)
 static int decorrelate(TAKDecContext *s, int c1, int c2, int length)
 {
     GetBitContext *gb = &s->gb;
-    int32_t *p1       = s->decoded[c1] + 1;
-    int32_t *p2       = s->decoded[c2] + 1;
+    int32_t *p1       = s->decoded[c1] + (s->dmode > 5);
+    int32_t *p2       = s->decoded[c2] + (s->dmode > 5);
+    int32_t bp1       = p1[0];
+    int32_t bp2       = p2[0];
     int i;
     int dshift, dfactor;
 
+    length += s->dmode < 6;
+
     switch (s->dmode) {
     case 1: /* left/side */
-        for (i = 0; i < length; i++) {
-            int32_t a = p1[i];
-            int32_t b = p2[i];
-            p2[i]     = a + b;
-        }
+        s->tdsp.decorrelate_ls(p1, p2, length);
         break;
     case 2: /* side/right */
-        for (i = 0; i < length; i++) {
-            int32_t a = p1[i];
-            int32_t b = p2[i];
-            p1[i]     = b - a;
-        }
+        s->tdsp.decorrelate_sr(p1, p2, length);
         break;
     case 3: /* side/mid */
-        for (i = 0; i < length; i++) {
-            int32_t a = p1[i];
-            int32_t b = p2[i];
-            a        -= b >> 1;
-            p1[i]     = a;
-            p2[i]     = a + b;
-        }
+        s->tdsp.decorrelate_sm(p1, p2, length);
         break;
     case 4: /* side/left with scale factor */
         FFSWAP(int32_t*, p1, p2);
+        FFSWAP(int32_t, bp1, bp2);
     case 5: /* side/right with scale factor */
         dshift  = get_bits_esc4(gb);
         dfactor = get_sbits(gb, 10);
-        for (i = 0; i < length; i++) {
-            int32_t a = p1[i];
-            int32_t b = p2[i];
-            b         = dfactor * (b >> dshift) + 128 >> 8 << dshift;
-            p1[i]     = b - a;
-        }
+        s->tdsp.decorrelate_sf(p1, p2, length, dshift, dfactor);
         break;
     case 6:
         FFSWAP(int32_t*, p1, p2);
     case 7: {
-        LOCAL_ALIGNED_16(int16_t, filter, [MAX_PREDICTORS]);
         int length2, order_half, filter_order, dval1, dval2;
-        int av_uninit(code_size);
-
-        memset(filter, 0, MAX_PREDICTORS * sizeof(*filter));
+        int tmp, x, code_size;
 
         if (length < 256)
             return AVERROR_INVALIDDATA;
@@ -616,7 +598,7 @@ static int decorrelate(TAKDecContext *s, int c1, int c2, int length)
         for (i = 0; i < filter_order; i++) {
             if (!(i & 3))
                 code_size = 14 - get_bits(gb, 3);
-            filter[i] = get_sbits(gb, code_size);
+            s->filter[i] = get_sbits(gb, code_size);
         }
 
         order_half = filter_order / 2;
@@ -640,24 +622,40 @@ static int decorrelate(TAKDecContext *s, int c1, int c2, int length)
             }
         }
 
-        av_fast_malloc(&s->residues, &s->residues_buf_size,
-                       FFALIGN(length + 16, 16) * sizeof(*s->residues));
-        if (!s->residues)
-            return AVERROR(ENOMEM);
-        memset(s->residues, 0, s->residues_buf_size);
 
-        for (i = 0; i < length; i++)
-            s->residues[i] = p2[i] >> dshift;
+        for (i = 0; i < filter_order; i++)
+            s->residues[i] = *p2++ >> dshift;
 
         p1 += order_half;
+        x = FF_ARRAY_ELEMS(s->residues) - filter_order;
+        for (; length2 > 0; length2 -= tmp) {
+            tmp = FFMIN(length2, x);
+
+            for (i = 0; i < tmp - (tmp == length2); i++)
+                s->residues[filter_order + i] = *p2++ >> dshift;
+
+            for (i = 0; i < tmp; i++) {
+                int v = 1 << 9;
+
+                if (filter_order == 16) {
+                    v += s->adsp.scalarproduct_int16(&s->residues[i], s->filter,
+                                                     filter_order);
+                } else {
+                    v += s->residues[i + 7] * s->filter[7] +
+                         s->residues[i + 6] * s->filter[6] +
+                         s->residues[i + 5] * s->filter[5] +
+                         s->residues[i + 4] * s->filter[4] +
+                         s->residues[i + 3] * s->filter[3] +
+                         s->residues[i + 2] * s->filter[2] +
+                         s->residues[i + 1] * s->filter[1] +
+                         s->residues[i    ] * s->filter[0];
+                }
 
-        for (i = 0; i < length2; i++) {
-            int v = 1 << 9;
-
-            v += s->adsp.scalarproduct_int16(&s->residues[i], filter,
-                                             FFALIGN(filter_order, 16));
+                v = (av_clip_intp2(v >> 10, 13) << dshift) - *p1;
+                *p1++ = v;
+            }
 
-            p1[i] = (av_clip_intp2(v >> 10, 13) << dshift) - p1[i];
+            memmove(s->residues, &s->residues[tmp], 2 * filter_order);
         }
 
         emms_c();
@@ -665,6 +663,11 @@ static int decorrelate(TAKDecContext *s, int c1, int c2, int length)
     }
     }
 
+    if (s->dmode > 0 && s->dmode < 6) {
+        p1[0] = bp1;
+        p2[0] = bp2;
+    }
+
     return 0;
 }
 
@@ -673,24 +676,21 @@ static int tak_decode_frame(AVCodecContext *avctx, void *data,
 {
     TAKDecContext *s  = avctx->priv_data;
     AVFrame *frame    = data;
+    ThreadFrame tframe = { .f = data };
     GetBitContext *gb = &s->gb;
     int chan, i, ret, hsize;
 
     if (pkt->size < TAK_MIN_FRAME_HEADER_BYTES)
         return AVERROR_INVALIDDATA;
 
-    init_get_bits(gb, pkt->data, pkt->size * 8);
+    if ((ret = init_get_bits8(gb, pkt->data, pkt->size)) < 0)
+        return ret;
 
     if ((ret = ff_tak_decode_frame_header(avctx, gb, &s->ti, 0)) < 0)
         return ret;
 
-    if (s->ti.flags & TAK_FRAME_FLAG_HAS_METADATA) {
-        avpriv_request_sample(avctx, "Frame metadata");
-        return AVERROR_PATCHWELCOME;
-    }
-
     hsize = get_bits_count(gb) / 8;
-    if (avctx->err_recognition & AV_EF_CRCCHECK) {
+    if (avctx->err_recognition & (AV_EF_CRCCHECK|AV_EF_COMPLIANT)) {
         if (ff_tak_check_crc(pkt->data, hsize)) {
             av_log(avctx, AV_LOG_ERROR, "CRC error\n");
             if (avctx->err_recognition & AV_EF_EXPLODE)
@@ -724,11 +724,9 @@ static int tak_decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR_INVALIDDATA;
     }
 
-    if (s->ti.bps != avctx->bits_per_coded_sample) {
-        avctx->bits_per_coded_sample = s->ti.bps;
-        if ((ret = set_bps_params(avctx)) < 0)
-            return ret;
-    }
+    avctx->bits_per_raw_sample = s->ti.bps;
+    if ((ret = set_bps_params(avctx)) < 0)
+        return ret;
     if (s->ti.sample_rate != avctx->sample_rate) {
         avctx->sample_rate = s->ti.sample_rate;
         set_sample_rate_params(avctx);
@@ -741,10 +739,11 @@ static int tak_decode_frame(AVCodecContext *avctx, void *data,
                                              : s->ti.frame_samples;
 
     frame->nb_samples = s->nb_samples;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+    if ((ret = ff_thread_get_buffer(avctx, &tframe, 0)) < 0)
         return ret;
+    ff_thread_finish_setup(avctx);
 
-    if (avctx->bits_per_coded_sample <= 16) {
+    if (avctx->bits_per_raw_sample <= 16) {
         int buf_size = av_samples_get_buffer_size(NULL, avctx->channels,
                                                   s->nb_samples,
                                                   AV_SAMPLE_FMT_S32P, 0);
@@ -767,7 +766,7 @@ static int tak_decode_frame(AVCodecContext *avctx, void *data,
         for (chan = 0; chan < avctx->channels; chan++) {
             int32_t *decoded = s->decoded[chan];
             for (i = 0; i < s->nb_samples; i++)
-                decoded[i] = get_sbits(gb, avctx->bits_per_coded_sample);
+                decoded[i] = get_sbits(gb, avctx->bits_per_raw_sample);
         }
     } else {
         if (s->ti.codec == TAK_CODEC_MONO_STEREO) {
@@ -776,9 +775,9 @@ static int tak_decode_frame(AVCodecContext *avctx, void *data,
                     return ret;
 
             if (avctx->channels == 2) {
-                if (get_bits1(gb)) {
-                    // some kind of subframe length, but it seems to be unused
-                    skip_bits(gb, 6);
+                s->nb_subframes = get_bits(gb, 1) + 1;
+                if (s->nb_subframes > 1) {
+                    s->subframe_len[1] = get_bits(gb, 6);
                 }
 
                 s->dmode = get_bits(gb, 3);
@@ -872,7 +871,7 @@ static int tak_decode_frame(AVCodecContext *avctx, void *data,
     else if (get_bits_left(gb) > 0)
         av_log(avctx, AV_LOG_DEBUG, "underread\n");
 
-    if (avctx->err_recognition & AV_EF_CRCCHECK) {
+    if (avctx->err_recognition & (AV_EF_CRCCHECK | AV_EF_COMPLIANT)) {
         if (ff_tak_check_crc(pkt->data + hsize,
                              get_bits_count(gb) / 8 - hsize)) {
             av_log(avctx, AV_LOG_ERROR, "CRC error\n");
@@ -913,12 +912,32 @@ static int tak_decode_frame(AVCodecContext *avctx, void *data,
     return pkt->size;
 }
 
+#if HAVE_THREADS
+static int init_thread_copy(AVCodecContext *avctx) /* registered as .init_thread_copy */
+{
+    TAKDecContext *s = avctx->priv_data;
+    s->avctx = avctx; /* make the back-reference point at this copy's own codec context */
+    return 0;         /* nothing in here can fail */
+}
+
+static int update_thread_context(AVCodecContext *dst,
+                                 const AVCodecContext *src) /* registered as .update_thread_context */
+{
+    TAKDecContext *tsrc = src->priv_data;
+    TAKDecContext *tdst = dst->priv_data;
+
+    if (dst == src) /* same context on both sides: nothing to carry over */
+        return 0;
+    memcpy(&tdst->ti, &tsrc->ti, sizeof(TAKStreamInfo)); /* only the stream info (ti) is copied */
+    return 0;
+}
+#endif
+
 static av_cold int tak_decode_close(AVCodecContext *avctx)
 {
     TAKDecContext *s = avctx->priv_data;
 
     av_freep(&s->decode_buffer);
-    av_freep(&s->residues);
 
     return 0;
 }
@@ -930,10 +949,11 @@ AVCodec ff_tak_decoder = {
     .id               = AV_CODEC_ID_TAK,
     .priv_data_size   = sizeof(TAKDecContext),
     .init             = tak_decode_init,
-    .init_static_data = tak_init_static_data,
     .close            = tak_decode_close,
     .decode           = tak_decode_frame,
-    .capabilities     = AV_CODEC_CAP_DR1,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(init_thread_copy),
+    .update_thread_context = ONLY_IF_THREADS_ENABLED(update_thread_context),
+    .capabilities     = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
     .sample_fmts      = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_U8P,
                                                         AV_SAMPLE_FMT_S16P,
                                                         AV_SAMPLE_FMT_S32P,
diff --git a/libavcodec/takdsp.c b/libavcodec/takdsp.c
new file mode 100644
index 0000000..2441c2b
--- /dev/null
+++ b/libavcodec/takdsp.c
@@ -0,0 +1,82 @@
+/*
+ * TAK decoder
+ * Copyright (c) 2015 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "takdsp.h"
+#include "config.h"
+
+static void decorrelate_ls(int32_t *p1, int32_t *p2, int length)
+{
+    int i;
+    /* "left/side" mode: p2[i] becomes p1[i] + p2[i] for i < length; p1 is only read. */
+    for (i = 0; i < length; i++) {
+        uint32_t a = p1[i]; /* unsigned so an out-of-range sum wraps instead of being UB */
+        uint32_t b = p2[i];
+        p2[i]     = a + b;
+    }
+}
+
+static void decorrelate_sr(int32_t *p1, int32_t *p2, int length)
+{
+    int i;
+    /* "side/right" mode: p1[i] becomes p2[i] - p1[i] for i < length; p2 is only read. */
+    for (i = 0; i < length; i++) {
+        uint32_t a = p1[i]; /* unsigned so an out-of-range difference wraps instead of being UB */
+        uint32_t b = p2[i];
+        p1[i]     = b - a;
+    }
+}
+
+static void decorrelate_sm(int32_t *p1, int32_t *p2, int length)
+{
+    int i;
+    /* "side/mid" mode: p1[i] -= p2[i] >> 1, then p2[i] += the updated p1[i]. */
+    for (i = 0; i < length; i++) {
+        uint32_t a = p1[i]; /* accumulate unsigned: wraps instead of signed-overflow UB */
+        int32_t b = p2[i];  /* stays signed: the >> 1 below must be an arithmetic shift */
+        a        -= b >> 1;
+        p1[i]     = a;
+        p2[i]     = a + b;
+    }
+}
+
+static void decorrelate_sf(int32_t *p1, int32_t *p2, int length, int dshift, int dfactor)
+{
+    int i;
+    /* Scaled modes: p1[i] = ((dfactor * (p2[i] >> dshift) + 128) >> 8 << dshift) - p1[i]. */
+    for (i = 0; i < length; i++) {
+        uint32_t a = p1[i]; /* unsigned: the product, << and final subtraction wrap, no UB */
+        int32_t b = p2[i];  /* signed: both >> below must be arithmetic shifts */
+        b         = (unsigned)((int)(dfactor * (unsigned)(b >> dshift) + 128) >> 8) << dshift;
+        p1[i]     = b - a;
+    }
+}
+
+av_cold void ff_takdsp_init(TAKDSPContext *c)
+{
+    c->decorrelate_ls = decorrelate_ls; /* start with the C implementations from this file */
+    c->decorrelate_sr = decorrelate_sr;
+    c->decorrelate_sm = decorrelate_sm;
+    c->decorrelate_sf = decorrelate_sf;
+
+    if (ARCH_X86) /* x86 builds may then replace entries; presumably with SIMD versions, TODO confirm */
+        ff_takdsp_init_x86(c);
+}
diff --git a/libavcodec/takdsp.h b/libavcodec/takdsp.h
new file mode 100644
index 0000000..c05b574
--- /dev/null
+++ b/libavcodec/takdsp.h
@@ -0,0 +1,34 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_TAKDSP_H
+#define AVCODEC_TAKDSP_H
+
+#include <stdint.h>
+
+typedef struct TAKDSPContext { /* channel-decorrelation kernels, filled in by ff_takdsp_init() */
+    void (*decorrelate_ls)(int32_t *p1, int32_t *p2, int length); /* C version: p2 += p1 */
+    void (*decorrelate_sr)(int32_t *p1, int32_t *p2, int length); /* C version: p1 = p2 - p1 */
+    void (*decorrelate_sm)(int32_t *p1, int32_t *p2, int length); /* C version: p1 -= p2 >> 1; p2 += p1 */
+    void (*decorrelate_sf)(int32_t *p1, int32_t *p2, int length, int dshift, int dfactor); /* scaled variant */
+} TAKDSPContext;
+
+void ff_takdsp_init(TAKDSPContext *c);     /* installs the C kernels, then the x86 init when ARCH_X86 */
+void ff_takdsp_init_x86(TAKDSPContext *c); /* defined elsewhere; invoked from ff_takdsp_init() */
+
+#endif /* AVCODEC_TAKDSP_H */
diff --git a/libavcodec/targa.c b/libavcodec/targa.c
index ef8565f..215c0f5 100644
--- a/libavcodec/targa.c
+++ b/libavcodec/targa.c
@@ -2,20 +2,20 @@
  * Targa (.tga) image decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,22 +28,37 @@
 
 typedef struct TargaContext {
     GetByteContext gb;
-
-    int color_type;
-    int compression_type;
 } TargaContext;
 
+static uint8_t *advance_line(uint8_t *start, uint8_t *line,
+                             int stride, int *y, int h, int interleave)
+{ /* Returns the start of the next row to fill and updates *y, or NULL when no row is left. */
+    *y += interleave; /* rows of one pass are interleave apart */
+
+    if (*y < h) {
+        return line + interleave * stride;
+    } else {
+        *y = (*y + 1) & (interleave - 1); /* next pass start; mask assumes power-of-two interleave, TODO confirm */
+        if (*y && *y < h) {
+            return start + *y * stride;
+        } else {
+            return NULL; /* wrapped to row 0 or past the last row: every pass is finished */
+        }
+    }
+}
+
 static int targa_decode_rle(AVCodecContext *avctx, TargaContext *s,
-                            uint8_t *dst, int w, int h, int stride, int bpp)
+                            uint8_t *start, int w, int h, int stride,
+                            int bpp, int interleave)
 {
     int x, y;
     int depth = (bpp + 1) >> 3;
     int type, count;
-    int diff;
+    uint8_t *line = start;
+    uint8_t *dst  = line;
 
-    diff = stride - w * depth;
-    x = y = 0;
-    while (y < h) {
+    x = y = count = 0;
+    while (dst) {
         if (bytestream2_get_bytes_left(&s->gb) <= 0) {
             av_log(avctx, AV_LOG_ERROR,
                    "Ran ouf of data before end-of-image\n");
@@ -52,12 +67,6 @@ static int targa_decode_rle(AVCodecContext *avctx, TargaContext *s,
         type  = bytestream2_get_byteu(&s->gb);
         count = (type & 0x7F) + 1;
         type &= 0x80;
-        if (x + count > w && x + count + 1 > (h - y) * w) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Packet went out of bounds: position (%i,%i) size %i\n",
-                   x, y, count);
-            return AVERROR_INVALIDDATA;
-        }
         if (!type) {
             do {
                 int n  = FFMIN(count, w - x);
@@ -67,10 +76,9 @@ static int targa_decode_rle(AVCodecContext *avctx, TargaContext *s,
                 x     += n;
                 if (x == w) {
                     x    = 0;
-                    y++;
-                    dst += diff;
+                    dst = line = advance_line(start, line, stride, &y, h, interleave);
                 }
-            } while (count > 0);
+            } while (dst && count > 0);
         } else {
             uint8_t tmp[4];
             bytestream2_get_buffer(&s->gb, tmp, depth);
@@ -84,12 +92,17 @@ static int targa_decode_rle(AVCodecContext *avctx, TargaContext *s,
                 } while (--n);
                 if (x == w) {
                     x    = 0;
-                    y++;
-                    dst += diff;
+                    dst = line = advance_line(start, line, stride, &y, h, interleave);
                 }
-            } while (count > 0);
+            } while (dst && count > 0);
         }
     }
+
+    if (count) {
+        av_log(avctx, AV_LOG_ERROR, "Packet went out of bounds\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     return 0;
 }
 
@@ -101,14 +114,15 @@ static int decode_frame(AVCodecContext *avctx,
     AVFrame * const p = data;
     uint8_t *dst;
     int stride;
-    int idlen, compr, y, w, h, bpp, flags, ret;
+    int idlen, pal, compr, y, w, h, bpp, flags, ret;
     int first_clr, colors, csize;
+    int interleave;
 
     bytestream2_init(&s->gb, avpkt->data, avpkt->size);
 
     /* parse image header */
     idlen     = bytestream2_get_byte(&s->gb);
-    bytestream2_skip(&s->gb, 1); /* pal */
+    pal       = bytestream2_get_byte(&s->gb);
     compr     = bytestream2_get_byte(&s->gb);
     first_clr = bytestream2_get_le16(&s->gb);
     colors    = bytestream2_get_le16(&s->gb);
@@ -117,17 +131,29 @@ static int decode_frame(AVCodecContext *avctx,
     w         = bytestream2_get_le16(&s->gb);
     h         = bytestream2_get_le16(&s->gb);
     bpp       = bytestream2_get_byte(&s->gb);
+
+    if (bytestream2_get_bytes_left(&s->gb) <= idlen) {
+        av_log(avctx, AV_LOG_ERROR,
+                "Not enough data to read header\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     flags     = bytestream2_get_byte(&s->gb);
+
+    if (!pal && (first_clr || colors || csize)) {
+        av_log(avctx, AV_LOG_WARNING, "File without colormap has colormap information set.\n");
+        // specification says we should ignore those value in this case
+        first_clr = colors = csize = 0;
+    }
+
     // skip identifier if any
     bytestream2_skip(&s->gb, idlen);
 
-    switch(bpp){
+    switch (bpp) {
     case 8:
         avctx->pix_fmt = ((compr & (~TGA_RLE)) == TGA_BW) ? AV_PIX_FMT_GRAY8 : AV_PIX_FMT_PAL8;
         break;
     case 15:
-        avctx->pix_fmt = AV_PIX_FMT_RGB555LE;
-        break;
     case 16:
         avctx->pix_fmt = AV_PIX_FMT_RGB555LE;
         break;
@@ -142,28 +168,34 @@ static int decode_frame(AVCodecContext *avctx,
         return AVERROR_INVALIDDATA;
     }
 
+    if (colors && (colors + first_clr) > 256) {
+        av_log(avctx, AV_LOG_ERROR, "Incorrect palette: %i colors with offset %i\n", colors, first_clr);
+        return AVERROR_INVALIDDATA;
+    }
+
     if ((ret = ff_set_dimensions(avctx, w, h)) < 0)
         return ret;
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0){
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
-    if(flags & 0x20){
+    p->pict_type = AV_PICTURE_TYPE_I;
+
+    if (flags & TGA_TOPTOBOTTOM) {
         dst = p->data[0];
         stride = p->linesize[0];
-    }else{ //image is upside-down
+    } else { //image is upside-down
         dst = p->data[0] + p->linesize[0] * (h - 1);
         stride = -p->linesize[0];
     }
 
-    if(colors){
+    interleave = flags & TGA_INTERLEAVE2 ? 2 :
+                 flags & TGA_INTERLEAVE4 ? 4 : 1;
+
+    if (colors) {
         int pal_size, pal_sample_size;
-        if((colors + first_clr) > 256){
-            av_log(avctx, AV_LOG_ERROR, "Incorrect palette: %i colors with offset %i\n", colors, first_clr);
-            return AVERROR_INVALIDDATA;
-        }
+
         switch (csize) {
+        case 32: pal_sample_size = 4; break;
         case 24: pal_sample_size = 3; break;
         case 16:
         case 15: pal_sample_size = 2; break;
@@ -172,9 +204,9 @@ static int decode_frame(AVCodecContext *avctx,
             return AVERROR_INVALIDDATA;
         }
         pal_size = colors * pal_sample_size;
-        if(avctx->pix_fmt != AV_PIX_FMT_PAL8)//should not occur but skip palette anyway
+        if (avctx->pix_fmt != AV_PIX_FMT_PAL8) //should not occur but skip palette anyway
             bytestream2_skip(&s->gb, pal_size);
-        else{
+        else {
             int t;
             uint32_t *pal = ((uint32_t *)p->data[1]) + first_clr;
 
@@ -184,10 +216,14 @@ static int decode_frame(AVCodecContext *avctx,
                 return AVERROR_INVALIDDATA;
             }
             switch (pal_sample_size) {
+            case 4:
+                for (t = 0; t < colors; t++)
+                    *pal++ = bytestream2_get_le32u(&s->gb);
+                break;
             case 3:
                 /* RGB24 */
                 for (t = 0; t < colors; t++)
-                    *pal++ = bytestream2_get_le24u(&s->gb);
+                    *pal++ = (0xffU<<24) | bytestream2_get_le24u(&s->gb);
                 break;
             case 2:
                 /* RGB555 */
@@ -198,30 +234,59 @@ static int decode_frame(AVCodecContext *avctx,
                         ((v & 0x001F) <<  3);
                     /* left bit replication */
                     v |= (v & 0xE0E0E0U) >> 5;
-                    *pal++ = v;
+                    *pal++ = (0xffU<<24) | v;
                 }
                 break;
             }
             p->palette_has_changed = 1;
         }
     }
+
     if ((compr & (~TGA_RLE)) == TGA_NODATA) {
         memset(p->data[0], 0, p->linesize[0] * h);
     } else {
-        if(compr & TGA_RLE){
-            int res = targa_decode_rle(avctx, s, dst, w, h, stride, bpp);
+        if (compr & TGA_RLE) {
+            int res = targa_decode_rle(avctx, s, dst, w, h, stride, bpp, interleave);
             if (res < 0)
                 return res;
         } else {
             size_t img_size = w * ((bpp + 1) >> 3);
+            uint8_t *line;
             if (bytestream2_get_bytes_left(&s->gb) < img_size * h) {
                 av_log(avctx, AV_LOG_ERROR,
                        "Not enough data available for image\n");
                 return AVERROR_INVALIDDATA;
             }
-            for (y = 0; y < h; y++) {
-                bytestream2_get_bufferu(&s->gb, dst, img_size);
-                dst += stride;
+
+            line = dst;
+            y = 0;
+            do {
+                bytestream2_get_buffer(&s->gb, line, img_size);
+                line = advance_line(dst, line, stride, &y, h, interleave);
+            } while (line);
+        }
+    }
+
+    if (flags & TGA_RIGHTTOLEFT) { // right-to-left, needs horizontal flip
+        int x;
+        for (y = 0; y < h; y++) {
+            void *line = &p->data[0][y * p->linesize[0]];
+            for (x = 0; x < w >> 1; x++) {
+                switch (bpp) {
+                case 32:
+                    FFSWAP(uint32_t, ((uint32_t *)line)[x], ((uint32_t *)line)[w - x - 1]);
+                    break;
+                case 24:
+                    FFSWAP(uint8_t, ((uint8_t *)line)[3 * x    ], ((uint8_t *)line)[3 * w - 3 * x - 3]);
+                    FFSWAP(uint8_t, ((uint8_t *)line)[3 * x + 1], ((uint8_t *)line)[3 * w - 3 * x - 2]);
+                    FFSWAP(uint8_t, ((uint8_t *)line)[3 * x + 2], ((uint8_t *)line)[3 * w - 3 * x - 1]);
+                    break;
+                case 16:
+                    FFSWAP(uint16_t, ((uint16_t *)line)[x], ((uint16_t *)line)[w - x - 1]);
+                    break;
+                case 8:
+                    FFSWAP(uint8_t, ((uint8_t *)line)[x], ((uint8_t *)line)[w - x - 1]);
+                }
             }
         }
     }
diff --git a/libavcodec/targa.h b/libavcodec/targa.h
index f4ef553..c2f5224 100644
--- a/libavcodec/targa.h
+++ b/libavcodec/targa.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,4 +38,11 @@ enum TargaCompr {
     TGA_RLE    = 8, // flag pointing that data is RLE-coded
 };
 
+enum TargaFlags { // bit masks for the image descriptor ("flags") header byte
+    TGA_RIGHTTOLEFT = 0x10, // right-to-left (flipped horizontally)
+    TGA_TOPTOBOTTOM = 0x20, // top-to-bottom (NOT flipped vertically)
+    TGA_INTERLEAVE2 = 0x40, // 2-way interleave, odd then even lines
+    TGA_INTERLEAVE4 = 0x80, // 4-way interleave
+};
+
 #endif /* AVCODEC_TARGA_H */
diff --git a/libavcodec/targa_y216dec.c b/libavcodec/targa_y216dec.c
new file mode 100644
index 0000000..21b3d35
--- /dev/null
+++ b/libavcodec/targa_y216dec.c
@@ -0,0 +1,83 @@
+/*
+ * Pinnacle TARGA CineWave YUV16 decoder
+ * Copyright (c) 2012 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+
+static av_cold int y216_decode_init(AVCodecContext *avctx)
+{
+    /* Fixed output description; nothing in this init step can fail. */
+    avctx->bits_per_raw_sample = 14;
+    avctx->pix_fmt             = AV_PIX_FMT_YUV422P16;
+    return 0;
+}
+
+static int y216_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_frame, AVPacket *avpkt)
+{
+    AVFrame *pic = data;
+    const uint16_t *src = (uint16_t *)avpkt->data;
+    uint16_t *y, *u, *v;
+    int i, j, ret, aligned_width = FFALIGN(avctx->width, 4); /* int: uint16_t wrapped for widths above 65532 */
+    /* Every source row occupies 4 * aligned_width bytes; refuse packets too short to hold all rows. */
+    if (avpkt->size < (int64_t)4 * avctx->height * aligned_width) { /* 64-bit product, no int overflow */
+        av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
+        return AVERROR(EINVAL);
+    }
+
+    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
+        return ret;
+
+    pic->key_frame = 1;
+    pic->pict_type = AV_PICTURE_TYPE_I;
+
+    y = (uint16_t *)pic->data[0];
+    u = (uint16_t *)pic->data[1];
+    v = (uint16_t *)pic->data[2];
+    /* Source order per pixel pair is U, Y, V, Y; each 16-bit sample is rotated left by two bits. */
+    for (i = 0; i < avctx->height; i++) {
+        for (j = 0; j < avctx->width >> 1; j++) {
+            u[    j    ] = src[4 * j    ] << 2 | src[4 * j    ] >> 14;
+            y[2 * j    ] = src[4 * j + 1] << 2 | src[4 * j + 1] >> 14;
+            v[    j    ] = src[4 * j + 2] << 2 | src[4 * j + 2] >> 14;
+            y[2 * j + 1] = src[4 * j + 3] << 2 | src[4 * j + 3] >> 14;
+        }
+
+        y += pic->linesize[0] >> 1;
+        u += pic->linesize[1] >> 1;
+        v += pic->linesize[2] >> 1;
+        src += aligned_width << 1;
+    }
+
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+AVCodec ff_targa_y216_decoder = { /* codec table entry wiring up the two callbacks defined above */
+    .name         = "targa_y216",
+    .long_name    = NULL_IF_CONFIG_SMALL("Pinnacle TARGA CineWave YUV16"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_TARGA_Y216,
+    .init         = y216_decode_init,
+    .decode       = y216_decode_frame,
+    .capabilities = AV_CODEC_CAP_DR1, /* NOTE(review): presumably because frames come from ff_get_buffer() - confirm */
+};
diff --git a/libavcodec/targaenc.c b/libavcodec/targaenc.c
index 204ecfe..66bc55c 100644
--- a/libavcodec/targaenc.c
+++ b/libavcodec/targaenc.c
@@ -2,20 +2,20 @@
  * Targa (.tga) image encoder
  * Copyright (c) 2007 Bobby Bingham
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -86,7 +86,7 @@ static int targa_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                               const AVFrame *p, int *got_packet)
 {
     TargaContext *s = avctx->priv_data;
-    int bpp, picsize, datasize = -1, ret;
+    int bpp, picsize, datasize = -1, ret, i;
     uint8_t *out;
 
     if(avctx->width > 0xffff || avctx->height > 0xffff) {
@@ -95,10 +95,8 @@ static int targa_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     }
     picsize = av_image_get_buffer_size(avctx->pix_fmt,
                                        avctx->width, avctx->height, 1);
-    if ((ret = ff_alloc_packet(pkt, picsize + 45)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "encoded frame too large\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, picsize + 45, 0)) < 0)
         return ret;
-    }
 
     /* zero out the header and only set applicable fields */
     memset(pkt->data, 0, 12);
@@ -107,13 +105,39 @@ static int targa_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     /* image descriptor byte: origin is always top-left, bits 0-3 specify alpha */
     pkt->data[17] = 0x20 | (avctx->pix_fmt == AV_PIX_FMT_BGRA ? 8 : 0);
 
+    out = pkt->data + 18;  /* skip past the header we write */
+
+    avctx->bits_per_coded_sample = av_get_bits_per_pixel(av_pix_fmt_desc_get(avctx->pix_fmt));
     switch(avctx->pix_fmt) {
+    case AV_PIX_FMT_PAL8: {
+        int pal_bpp = 24; /* Only write 32bit palette if there is transparency information */
+        for (i = 0; i < 256; i++)
+            if (AV_RN32(p->data[1] + 4 * i) >> 24 != 0xFF) {
+                pal_bpp = 32;
+                break;
+            }
+        pkt->data[1]  = 1;          /* palette present */
+        pkt->data[2]  = TGA_PAL;    /* uncompressed palettised image */
+        pkt->data[6]  = 1;          /* palette contains 256 entries */
+        pkt->data[7]  = pal_bpp;    /* palette contains pal_bpp bit entries */
+        pkt->data[16] = 8;          /* bpp */
+        for (i = 0; i < 256; i++)
+            if (pal_bpp == 32) {
+                AV_WL32(pkt->data + 18 + 4 * i, *(uint32_t *)(p->data[1] + i * 4));
+            } else {
+            AV_WL24(pkt->data + 18 + 3 * i, *(uint32_t *)(p->data[1] + i * 4));
+            }
+        out += 32 * pal_bpp;        /* skip past the palette we just output */
+        break;
+        }
     case AV_PIX_FMT_GRAY8:
         pkt->data[2]  = TGA_BW;     /* uncompressed grayscale image */
+        avctx->bits_per_coded_sample = 0x28;
         pkt->data[16] = 8;          /* bpp */
         break;
     case AV_PIX_FMT_RGB555LE:
-        pkt->data[2]  = TGA_RGB;    /* uncompresses true-color image */
+        pkt->data[2]  = TGA_RGB;    /* uncompressed true-color image */
+        avctx->bits_per_coded_sample =
         pkt->data[16] = 16;         /* bpp */
         break;
     case AV_PIX_FMT_BGR24:
@@ -131,7 +155,6 @@ static int targa_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     }
     bpp = pkt->data[16] >> 3;
 
-    out = pkt->data + 18;  /* skip past the header we just output */
 
 #if FF_API_CODER_TYPE
 FF_DISABLE_DEPRECATION_WARNINGS
@@ -146,7 +169,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 
     /* if that worked well, mark the picture as RLE compressed */
     if(datasize >= 0)
-        pkt->data[2] |= 8;
+        pkt->data[2] |= TGA_RLE;
 
     /* if RLE didn't make it smaller, go back to no compression */
     else datasize = targa_encode_normal(out, p, bpp, avctx->width, avctx->height);
@@ -202,7 +225,7 @@ AVCodec ff_targa_encoder = {
     .init           = targa_encode_init,
     .encode2        = targa_encode_frame,
     .pix_fmts       = (const enum AVPixelFormat[]){
-        AV_PIX_FMT_BGR24, AV_PIX_FMT_BGRA, AV_PIX_FMT_RGB555LE, AV_PIX_FMT_GRAY8,
+        AV_PIX_FMT_BGR24, AV_PIX_FMT_BGRA, AV_PIX_FMT_RGB555LE, AV_PIX_FMT_GRAY8, AV_PIX_FMT_PAL8,
         AV_PIX_FMT_NONE
     },
 };
diff --git a/libavcodec/tdsc.c b/libavcodec/tdsc.c
index c70c77f..d1265a0 100644
--- a/libavcodec/tdsc.c
+++ b/libavcodec/tdsc.c
@@ -2,20 +2,20 @@
  * TDSC decoder
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -124,8 +124,8 @@ static av_cold int tdsc_init(AVCodecContext *avctx)
     ctx->jpeg_avctx->flags = avctx->flags;
     ctx->jpeg_avctx->flags2 = avctx->flags2;
     ctx->jpeg_avctx->dct_algo = avctx->dct_algo;
-    ctx->jpeg_avctx->idct_algo = avctx->idct_algo;;
-    ret = avcodec_open2(ctx->jpeg_avctx, codec, NULL);
+    ctx->jpeg_avctx->idct_algo = avctx->idct_algo;
+    ret = ff_codec_open2_recursive(ctx->jpeg_avctx, codec, NULL);
     if (ret < 0)
         return ret;
 
diff --git a/libavcodec/textdec.c b/libavcodec/textdec.c
new file mode 100644
index 0000000..964da72
--- /dev/null
+++ b/libavcodec/textdec.c
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Raw subtitles decoder
+ */
+
+#include "avcodec.h"
+#include "ass.h"
+#include "libavutil/bprint.h"
+#include "libavutil/opt.h"
+
+typedef struct {
+    AVClass *class;
+    const char *linebreaks;  /* forwarded to ff_ass_bprint_text_event(); "|" when linebreak_init() is used */
+    int keep_ass_markup;     /* value of the "keep_ass_markup" option, forwarded to ff_ass_bprint_text_event() */
+    int readorder;           /* counter passed to ff_ass_add_rect() per event; reset by text_flush() */
+} TextContext;
+
+#define OFFSET(x) offsetof(TextContext, x)
+#define SD (AV_OPT_FLAG_SUBTITLE_PARAM | AV_OPT_FLAG_DECODING_PARAM) /* parenthesized: safe inside any expression */
+static const AVOption options[] = {
+    { "keep_ass_markup", "Set if ASS tags must be escaped", OFFSET(keep_ass_markup), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, .flags=SD },
+    { NULL }
+};
+
+static int text_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_sub_ptr, AVPacket *avpkt)
+{
+    TextContext *ctx     = avctx->priv_data;
+    AVSubtitle  *out     = data;
+    const char  *payload = avpkt->data;
+    AVBPrint     event;
+    int          status  = 0;
+    /* Convert the packet text into one ASS event; an absent or empty payload adds no rect. */
+    av_bprint_init(&event, 0, AV_BPRINT_SIZE_UNLIMITED);
+    if (payload && avpkt->size > 0 && *payload) {
+        ff_ass_bprint_text_event(&event, payload, avpkt->size, ctx->linebreaks, ctx->keep_ass_markup);
+        status = ff_ass_add_rect(out, event.str, ctx->readorder++, 0, NULL, NULL);
+    }
+    av_bprint_finalize(&event, NULL); /* finalized before either return below */
+    if (status < 0)
+        return status;
+    *got_sub_ptr = out->num_rects > 0;
+    return avpkt->size;
+}
+
+static void text_flush(AVCodecContext *avctx)
+{
+    const int keep_order = avctx->flags2 & AV_CODEC_FLAG2_RO_FLUSH_NOOP;
+    if (!keep_order) /* restart event numbering unless the no-op flag is set */
+        ((TextContext *)avctx->priv_data)->readorder = 0;
+}
+
+#define DECLARE_CLASS(decname) static const AVClass decname ## _decoder_class = {   \
+    .class_name = #decname " decoder",      \
+    .item_name  = av_default_item_name,     \
+    .option     = decname ## _options, /* users #define <decname>_options before invoking */ \
+    .version    = LIBAVUTIL_VERSION_INT,    \
+}
+
+#if CONFIG_TEXT_DECODER
+#define text_options options /* the name DECLARE_CLASS(text) refers to */
+DECLARE_CLASS(text);
+
+AVCodec ff_text_decoder = {
+    .name           = "text",
+    .long_name      = NULL_IF_CONFIG_SMALL("Raw text subtitle"),
+    .priv_data_size = sizeof(TextContext),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_TEXT,
+    .decode         = text_decode_frame,
+    .init           = ff_ass_subtitle_header_default, /* no linebreaks string is set for this decoder */
+    .priv_class     = &text_decoder_class,
+    .flush          = text_flush,
+};
+#endif /* CONFIG_TEXT_DECODER */
+
+#if CONFIG_VPLAYER_DECODER || CONFIG_PJS_DECODER || CONFIG_SUBVIEWER1_DECODER || CONFIG_STL_DECODER
+
+static int linebreak_init(AVCodecContext *avctx)
+{
+    /* Decoders using this init treat '|' in the text as a line break. */
+    ((TextContext *)avctx->priv_data)->linebreaks = "|";
+    return ff_ass_subtitle_header_default(avctx);
+}
+
+#if CONFIG_VPLAYER_DECODER
+#define vplayer_options options /* the name DECLARE_CLASS(vplayer) refers to */
+DECLARE_CLASS(vplayer);
+
+AVCodec ff_vplayer_decoder = {
+    .name           = "vplayer",
+    .long_name      = NULL_IF_CONFIG_SMALL("VPlayer subtitle"),
+    .priv_data_size = sizeof(TextContext),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_VPLAYER,
+    .decode         = text_decode_frame,
+    .init           = linebreak_init, /* sets linebreaks to "|" */
+    .priv_class     = &vplayer_decoder_class,
+    .flush          = text_flush,
+};
+#endif /* CONFIG_VPLAYER_DECODER */
+
+#if CONFIG_STL_DECODER
+#define stl_options options /* the name DECLARE_CLASS(stl) refers to */
+DECLARE_CLASS(stl);
+
+AVCodec ff_stl_decoder = {
+    .name           = "stl",
+    .long_name      = NULL_IF_CONFIG_SMALL("Spruce subtitle format"),
+    .priv_data_size = sizeof(TextContext),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_STL,
+    .decode         = text_decode_frame,
+    .init           = linebreak_init, /* sets linebreaks to "|" */
+    .priv_class     = &stl_decoder_class,
+    .flush          = text_flush,
+};
+#endif /* CONFIG_STL_DECODER */
+
+#if CONFIG_PJS_DECODER
+#define pjs_options options /* the name DECLARE_CLASS(pjs) refers to */
+DECLARE_CLASS(pjs);
+
+AVCodec ff_pjs_decoder = {
+    .name           = "pjs",
+    .long_name      = NULL_IF_CONFIG_SMALL("PJS subtitle"),
+    .priv_data_size = sizeof(TextContext),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_PJS,
+    .decode         = text_decode_frame,
+    .init           = linebreak_init, /* sets linebreaks to "|" */
+    .priv_class     = &pjs_decoder_class,
+    .flush          = text_flush,
+};
+#endif /* CONFIG_PJS_DECODER */
+
+#if CONFIG_SUBVIEWER1_DECODER
+#define subviewer1_options options /* the name DECLARE_CLASS(subviewer1) refers to */
+DECLARE_CLASS(subviewer1);
+
+AVCodec ff_subviewer1_decoder = {
+    .name           = "subviewer1",
+    .long_name      = NULL_IF_CONFIG_SMALL("SubViewer1 subtitle"),
+    .priv_data_size = sizeof(TextContext),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_SUBVIEWER1,
+    .decode         = text_decode_frame,
+    .init           = linebreak_init, /* sets linebreaks to "|" */
+    .priv_class     = &subviewer1_decoder_class,
+    .flush          = text_flush,
+};
+#endif /* CONFIG_SUBVIEWER1_DECODER */
+
+#endif /* text subtitles with '|' line break */
diff --git a/libavcodec/texturedsp.c b/libavcodec/texturedsp.c
index 7b54a5d..5012245 100644
--- a/libavcodec/texturedsp.c
+++ b/libavcodec/texturedsp.c
@@ -28,13 +28,14 @@
 #include "libavutil/attributes.h"
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
+#include "libavutil/libm.h"
 
 #include "texturedsp.h"
 
-#define RGBA(r, g, b, a) ((uint8_t)(r) <<  0) | \
-                         ((uint8_t)(g) <<  8) | \
-                         ((uint8_t)(b) << 16) | \
-                         ((uint8_t)(a) << 24)
+#define RGBA(r, g, b, a) (((uint8_t)(r) <<  0) | \
+                          ((uint8_t)(g) <<  8) | \
+                          ((uint8_t)(b) << 16) | \
+                          ((unsigned)(uint8_t)(a) << 24)) /* unsigned: uint8_t promotes to int and 255 << 24 does not fit */
 
 static av_always_inline void extract_color(uint32_t colors[4],
                                            uint16_t color0,
@@ -428,7 +429,7 @@ static inline void rgtc_block_internal(uint8_t *dst, ptrdiff_t stride,
             int i = indices[x + y * 4];
             /* Interval expansion from [-1 1] or [0 1] to [0 255]. */
             int c = color_tab[i];
-            uint32_t pixel = RGBA(c, c, c, 255);
+            uint32_t pixel = RGBA(c, c, c, 255U);
             AV_WL32(dst + x * 4 + y * stride, pixel);
         }
     }
@@ -528,7 +529,7 @@ static inline void rgtc2_block_internal(uint8_t *dst, ptrdiff_t stride,
 
             int d = (255 * 255 - r * r - g * g) / 2;
             if (d > 0)
-                b = rint(sqrtf(d));
+                b = lrint(sqrtf(d));
 
             p[0] = r;
             p[1] = g;
diff --git a/libavcodec/texturedsp.h b/libavcodec/texturedsp.h
index fcbe7a4..26f3b64 100644
--- a/libavcodec/texturedsp.h
+++ b/libavcodec/texturedsp.h
@@ -2,20 +2,20 @@
  * Texture block module
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/texturedspenc.c b/libavcodec/texturedspenc.c
index 6fdf9c8..36506a6 100644
--- a/libavcodec/texturedspenc.c
+++ b/libavcodec/texturedspenc.c
@@ -3,7 +3,7 @@
  * Copyright (C) 2015 Vittorio Giovara <vittorio.giovara@gmail.com>
  * Based on public domain code by Fabian Giesen, Sean Barrett and Yann Collet.
  *
- * This file is part of Libav
+ * This file is part of FFmpeg
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -140,14 +140,14 @@ static const uint8_t match6[256][2] = {
 };
 
 /* Multiplication over 8 bit emulation */
-#define mul8(a, b) (a * b + 128 + ((a * b + 128) >> 8)) >> 8
+#define mul8(a, b) (((a) * (b) + 128 + (((a) * (b) + 128) >> 8)) >> 8)
 
 /* Conversion from rgb24 to rgb565 */
 #define rgb2rgb565(r, g, b) \
-    (mul8(r, 31) << 11) | (mul8(g, 63) << 5) | (mul8(b, 31) << 0)
+    ((mul8(r, 31) << 11) | (mul8(g, 63) << 5) | (mul8(b, 31) << 0))
 
 /* Linear interpolation at 1/3 point between a and b */
-#define lerp13(a, b) (2 * a + b) / 3
+#define lerp13(a, b) ((2 * (a) + (b)) / 3)
 
 /* Linear interpolation on an RGB pixel */
 static inline void lerp13rgb(uint8_t *out, uint8_t *p1, uint8_t *p2)
diff --git a/libavcodec/thread.h b/libavcodec/thread.h
index 864e67e..c848d7a 100644
--- a/libavcodec/thread.h
+++ b/libavcodec/thread.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 Alexander Strange <astrange@ithinksw.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -98,6 +98,16 @@ void ff_thread_report_progress(ThreadFrame *f, int progress, int field);
 void ff_thread_await_progress(ThreadFrame *f, int progress, int field);
 
 /**
+ * Wrapper around get_format() for frame-multithreaded codecs.
+ * Call this function instead of avctx->get_format().
+ * Cannot be called after the codec has called ff_thread_finish_setup().
+ *
+ * @param avctx The current context.
+ * @param fmt The list of available formats.
+ */
+enum AVPixelFormat ff_thread_get_format(AVCodecContext *avctx, const enum AVPixelFormat *fmt);
+
+/**
  * Wrapper around get_buffer() for frame-multithreaded codecs.
  * Call this function instead of ff_get_buffer(f).
  * Cannot be called after the codec has called ff_thread_finish_setup().
@@ -125,4 +135,9 @@ int ff_thread_ref_frame(ThreadFrame *dst, ThreadFrame *src);
 int ff_thread_init(AVCodecContext *s);
 void ff_thread_free(AVCodecContext *s);
 
+int ff_alloc_entries(AVCodecContext *avctx, int count);
+void ff_reset_entries(AVCodecContext *avctx);
+void ff_thread_report_progress2(AVCodecContext *avctx, int field, int thread, int n);
+void ff_thread_await_progress2(AVCodecContext *avctx,  int field, int thread, int shift);
+
 #endif /* AVCODEC_THREAD_H */
diff --git a/libavcodec/tiertexseqv.c b/libavcodec/tiertexseqv.c
index 626324a..df12ee3 100644
--- a/libavcodec/tiertexseqv.c
+++ b/libavcodec/tiertexseqv.c
@@ -2,20 +2,20 @@
  * Tiertex Limited SEQ Video Decoder
  * Copyright (c) 2006 Gregory Montoir (cyx@users.sourceforge.net)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -179,7 +179,7 @@ static int seqvideo_decode(SeqVideoContext *seq, const unsigned char *data, int
         for (i = 0; i < 256; i++) {
             for (j = 0; j < 3; j++, data++)
                 c[j] = (*data << 2) | (*data >> 4);
-            palette[i] = AV_RB24(c);
+            palette[i] = 0xFFU << 24 | AV_RB24(c);
         }
         seq->frame->palette_has_changed = 1;
     }
@@ -234,10 +234,8 @@ static int seqvideo_decode_frame(AVCodecContext *avctx,
 
     SeqVideoContext *seq = avctx->priv_data;
 
-    if ((ret = ff_reget_buffer(avctx, seq->frame)) < 0) {
-        av_log(seq->avctx, AV_LOG_ERROR, "tiertexseqvideo: reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, seq->frame)) < 0)
         return ret;
-    }
 
     if (seqvideo_decode(seq, buf, buf_size))
         return AVERROR_INVALIDDATA;
diff --git a/libavcodec/tiff.c b/libavcodec/tiff.c
index 97b9d6f..4be587d 100644
--- a/libavcodec/tiff.c
+++ b/libavcodec/tiff.c
@@ -1,21 +1,20 @@
 /*
- * TIFF image decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,8 +28,13 @@
 #if CONFIG_ZLIB
 #include <zlib.h>
 #endif
+#if CONFIG_LZMA
+#define LZMA_API_STATIC
+#include <lzma.h>
+#endif
 
 #include "libavutil/attributes.h"
+#include "libavutil/avstring.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/imgutils.h"
 #include "avcodec.h"
@@ -40,6 +44,8 @@
 #include "lzw.h"
 #include "mathops.h"
 #include "tiff.h"
+#include "tiff_data.h"
+#include "thread.h"
 
 typedef struct TiffContext {
     AVCodecContext *avctx;
@@ -53,33 +59,259 @@ typedef struct TiffContext {
     enum TiffCompr compr;
     enum TiffPhotometric photometric;
     int planar;
+    int subsampling[2];
     int fax_opts;
     int predictor;
     int fill_order;
+    uint32_t res[4];
 
     int strips, rps, sstype;
     int sot;
     int stripsizesoff, stripsize, stripoff, strippos;
     LZWState *lzw;
+
+    uint8_t *deinvert_buf;
+    int deinvert_buf_size;
+    uint8_t *yuv_line;
+    unsigned int yuv_line_size;
+
+    int geotag_count;
+    TiffGeoTag *geotags;
 } TiffContext;
 
-static unsigned tget_short(GetByteContext *gb, int le)
+/* Free each stored geotag value, then the geotag array, and reset the count. */
+static void free_geotags(TiffContext *const s)
+{
+    int remaining = s->geotag_count;
+
+    while (remaining-- > 0)
+        av_freep(&s->geotags[remaining].val);
+    av_freep(&s->geotags);
+    s->geotag_count = 0;
+}
+
+#define RET_GEOKEY(TYPE, array, element) /* needs `key` in scope; returns the map entry's element when key is in range */\
+    if (key >= TIFF_##TYPE##_KEY_ID_OFFSET &&\
+        key - TIFF_##TYPE##_KEY_ID_OFFSET < FF_ARRAY_ELEMS(ff_tiff_##array##_name_type_map))\
+        return ff_tiff_##array##_name_type_map[key - TIFF_##TYPE##_KEY_ID_OFFSET].element;
+
+static const char *get_geokey_name(int key) /* name field for GeoKey ID `key`, or NULL */
+{
+    RET_GEOKEY(VERT, vert, name);
+    RET_GEOKEY(PROJ, proj, name);
+    RET_GEOKEY(GEOG, geog, name);
+    RET_GEOKEY(CONF, conf, name);
+    /* key lies outside all four name/type tables */
+    return NULL;
+}
+
+static int get_geokey_type(int key) /* type field for GeoKey ID `key`, or a negative error */
+{
+    RET_GEOKEY(VERT, vert, type);
+    RET_GEOKEY(PROJ, proj, type);
+    RET_GEOKEY(GEOG, geog, type);
+    RET_GEOKEY(CONF, conf, type);
+    /* key lies outside all four name/type tables */
+    return AVERROR_INVALIDDATA;
+}
+
+static int cmp_id_key(const void *id, const void *k) /* bsearch() comparator: int *id vs. entry key */
+{
+    return *(const int*)id - ((const TiffGeoTagKeyName*)k)->key;
+}
+
+/* bsearch() keys[0..n-1] for id (table must be ordered by key); NULL when absent. */
+static const char *search_keyval(const TiffGeoTagKeyName *keys, int n, int id)
+{
+    const TiffGeoTagKeyName *hit;
+
+    hit = bsearch(&id, keys, n, sizeof(*keys), cmp_id_key);
+    return hit ? hit->name : NULL;
+}
+
+static char *get_geokey_val(int key, int val)
+{
+    char *ap;
+    /* Returns a newly allocated text for value val of GeoKey key; NULL if allocation fails. */
+    if (val == TIFF_GEO_KEY_UNDEFINED)
+        return av_strdup("undefined");
+    if (val == TIFF_GEO_KEY_USER_DEFINED)
+        return av_strdup("User-Defined");
+    /* Return a copy of the code-table entry when val lies inside that table's range. */
+#define RET_GEOKEY_VAL(TYPE, array)\
+    if (val >= TIFF_##TYPE##_OFFSET &&\
+        val - TIFF_##TYPE##_OFFSET < FF_ARRAY_ELEMS(ff_tiff_##array##_codes))\
+        return av_strdup(ff_tiff_##array##_codes[val - TIFF_##TYPE##_OFFSET]);
+
+    switch (key) {
+    case TIFF_GT_MODEL_TYPE_GEOKEY:
+        RET_GEOKEY_VAL(GT_MODEL_TYPE, gt_model_type);
+        break;
+    case TIFF_GT_RASTER_TYPE_GEOKEY:
+        RET_GEOKEY_VAL(GT_RASTER_TYPE, gt_raster_type);
+        break;
+    case TIFF_GEOG_LINEAR_UNITS_GEOKEY:
+    case TIFF_PROJ_LINEAR_UNITS_GEOKEY:
+    case TIFF_VERTICAL_UNITS_GEOKEY:
+        RET_GEOKEY_VAL(LINEAR_UNIT, linear_unit);
+        break;
+    case TIFF_GEOG_ANGULAR_UNITS_GEOKEY:
+    case TIFF_GEOG_AZIMUTH_UNITS_GEOKEY:
+        RET_GEOKEY_VAL(ANGULAR_UNIT, angular_unit);
+        break;
+    case TIFF_GEOGRAPHIC_TYPE_GEOKEY:
+        RET_GEOKEY_VAL(GCS_TYPE, gcs_type);
+        RET_GEOKEY_VAL(GCSE_TYPE, gcse_type);
+        break;
+    case TIFF_GEOG_GEODETIC_DATUM_GEOKEY:
+        RET_GEOKEY_VAL(GEODETIC_DATUM, geodetic_datum);
+        RET_GEOKEY_VAL(GEODETIC_DATUM_E, geodetic_datum_e);
+        break;
+    case TIFF_GEOG_ELLIPSOID_GEOKEY:
+        RET_GEOKEY_VAL(ELLIPSOID, ellipsoid);
+        break;
+    case TIFF_GEOG_PRIME_MERIDIAN_GEOKEY:
+        RET_GEOKEY_VAL(PRIME_MERIDIAN, prime_meridian);
+        break;
+    case TIFF_PROJECTED_CS_TYPE_GEOKEY:
+        ap = av_strdup(search_keyval(ff_tiff_proj_cs_type_codes, FF_ARRAY_ELEMS(ff_tiff_proj_cs_type_codes), val));
+        if(ap) return ap;
+        break;
+    case TIFF_PROJECTION_GEOKEY:
+        ap = av_strdup(search_keyval(ff_tiff_projection_codes, FF_ARRAY_ELEMS(ff_tiff_projection_codes), val));
+        if(ap) return ap;
+        break;
+    case TIFF_PROJ_COORD_TRANS_GEOKEY:
+        RET_GEOKEY_VAL(COORD_TRANS, coord_trans);
+        break;
+    case TIFF_VERTICAL_CS_TYPE_GEOKEY:
+        RET_GEOKEY_VAL(VERT_CS, vert_cs);
+        RET_GEOKEY_VAL(ORTHO_VERT_CS, ortho_vert_cs);
+        break;
+
+    }
+    /* No table produced a string: fall back to "Unknown-<val>" (snprintf truncates to 13 chars). */
+    ap = av_malloc(14);
+    if (ap)
+        snprintf(ap, 14, "Unknown-%d", val);
+    return ap;
+}
+
+/* Returns an av_malloc()ed string of count doubles ("%.15g") joined by sep (", " if NULL), or NULL on error. */
+static char *doubles2str(double *dp, int count, const char *sep)
 {
-    return le ? bytestream2_get_le16(gb) : bytestream2_get_be16(gb);
+    char *ap, *ap0;
+    uint64_t component_len;
+    int i;
+    if (!sep) sep = ", ";
+    component_len = 24LL + strlen(sep);
+    if (count <= 0 || count >= (INT_MAX - 1)/component_len) /* <= 0 would underflow the final trim */
+        return NULL;
+    ap0 = ap = av_malloc(component_len * count + 1);
+    if (!ap)
+        return NULL;
+    ap[0] = '\0';
+    for (i = 0; i < count; i++) {
+        unsigned l = snprintf(ap, component_len, "%.15g%s", dp[i], sep);
+        if (l >= component_len) {
+            av_free(ap0);
+            return NULL;
+        }
+        ap += l;
+    }
+    ap0[strlen(ap0) - strlen(sep)] = '\0';
+    return ap0;
 }
 
-static unsigned tget_long(GetByteContext *gb, int le)
+static int add_metadata(int count, int type,
+                        const char *name, const char *sep, TiffContext *s, AVFrame *frame)
 {
-    return le ? bytestream2_get_le32(gb) : bytestream2_get_be32(gb);
+    switch(type) { /* forward to the ff_tadd_*_metadata() helper matching the TIFF field type */
+    case TIFF_DOUBLE: return ff_tadd_doubles_metadata(count, name, sep, &s->gb, s->le, avpriv_frame_get_metadatap(frame));
+    case TIFF_SHORT : return ff_tadd_shorts_metadata(count, name, sep, &s->gb, s->le, 0, avpriv_frame_get_metadatap(frame));
+    case TIFF_STRING: return ff_tadd_string_metadata(count, name, &s->gb, s->le, avpriv_frame_get_metadatap(frame));
+    default         : return AVERROR_INVALIDDATA; /* any other field type is rejected */
+    };
 }
 
-static unsigned tget(GetByteContext *gb, int type, int le)
+static void av_always_inline horizontal_fill(unsigned int bpp, uint8_t* dst,
+                                             int usePtr, const uint8_t *src,
+                                             uint8_t c, int width, int offset)
 {
-    switch (type) {
-    case TIFF_BYTE:  return bytestream2_get_byte(gb);
-    case TIFF_SHORT: return tget_short(gb, le);
-    case TIFF_LONG:  return tget_long(gb, le);
-    default:         return UINT_MAX;
+    switch (bpp) { /* expand `width` packed bytes (src[] if usePtr, else the constant c) into dst */
+    case 1: /* 1 bpp: each input byte becomes 8 output bytes, most significant bit first */
+        while (--width >= 0) { /* back to front, so in-place use (src == dst, offset 0) reads before it overwrites */
+            dst[(width+offset)*8+7] = (usePtr ? src[width] : c)      & 0x1;
+            dst[(width+offset)*8+6] = (usePtr ? src[width] : c) >> 1 & 0x1;
+            dst[(width+offset)*8+5] = (usePtr ? src[width] : c) >> 2 & 0x1;
+            dst[(width+offset)*8+4] = (usePtr ? src[width] : c) >> 3 & 0x1;
+            dst[(width+offset)*8+3] = (usePtr ? src[width] : c) >> 4 & 0x1;
+            dst[(width+offset)*8+2] = (usePtr ? src[width] : c) >> 5 & 0x1;
+            dst[(width+offset)*8+1] = (usePtr ? src[width] : c) >> 6 & 0x1;
+            dst[(width+offset)*8+0] = (usePtr ? src[width] : c) >> 7;
+        }
+        break;
+    case 2: /* 2 bpp: four output bytes per input byte, top bit pair first */
+        while (--width >= 0) {
+            dst[(width+offset)*4+3] = (usePtr ? src[width] : c) & 0x3;
+            dst[(width+offset)*4+2] = (usePtr ? src[width] : c) >> 2 & 0x3;
+            dst[(width+offset)*4+1] = (usePtr ? src[width] : c) >> 4 & 0x3;
+            dst[(width+offset)*4+0] = (usePtr ? src[width] : c) >> 6;
+        }
+        break;
+    case 4: /* 4 bpp: high nibble first, then low nibble */
+        while (--width >= 0) {
+            dst[(width+offset)*2+1] = (usePtr ? src[width] : c) & 0xF;
+            dst[(width+offset)*2+0] = (usePtr ? src[width] : c) >> 4;
+        }
+        break;
+    default: /* any other bpp value: bytes are copied, or filled with c, unchanged */
+        if (usePtr) {
+            memcpy(dst + offset, src, width);
+        } else {
+            memset(dst + offset, c, width);
+        }
+    }
+}
+
+/* Fill s->deinvert_buf with src[0..size-1], each byte mapped through ff_reverse[]. */
+static int deinvert_buffer(TiffContext *s, const uint8_t *src, int size)
+{
+    uint8_t *out;
+    int pos = 0;
+    av_fast_padded_malloc(&s->deinvert_buf, &s->deinvert_buf_size, size);
+    if (!(out = s->deinvert_buf))
+        return AVERROR(ENOMEM);
+    while (pos < size)
+        *out++ = ff_reverse[src[pos++]];
+    return 0;
+}
+
+static void unpack_yuv(TiffContext *s, AVFrame *p,
+                       const uint8_t *src, int lnum)
+{
+    int i, j, k; /* scatters one packed row from src into the Y/U/V planes of p, starting at line lnum */
+    int w       = (s->width - 1) / s->subsampling[0] + 1; /* chroma samples per row (width rounded up) */
+    uint8_t *pu = &p->data[1][lnum / s->subsampling[1] * p->linesize[1]];
+    uint8_t *pv = &p->data[2][lnum / s->subsampling[1] * p->linesize[2]];
+    if (s->width % s->subsampling[0] || s->height % s->subsampling[1]) { /* size not a multiple of the subsampling: clamp luma coordinates */
+        for (i = 0; i < w; i++) {
+            for (j = 0; j < s->subsampling[1]; j++)
+                for (k = 0; k < s->subsampling[0]; k++)
+                    p->data[0][FFMIN(lnum + j, s->height-1) * p->linesize[0] +
+                               FFMIN(i * s->subsampling[0] + k, s->width-1)] = *src++;
+            *pu++ = *src++; /* each luma group is followed by one U byte and one V byte */
+            *pv++ = *src++;
+        }
+    }else{ /* exact multiple: same walk without the clamping */
+        for (i = 0; i < w; i++) {
+            for (j = 0; j < s->subsampling[1]; j++)
+                for (k = 0; k < s->subsampling[0]; k++)
+                    p->data[0][(lnum + j) * p->linesize[0] +
+                               i * s->subsampling[0] + k] = *src++;
+            *pu++ = *src++;
+            *pv++ = *src++;
+        }
     }
 }
 
@@ -90,7 +322,7 @@ static int tiff_uncompress(uint8_t *dst, unsigned long *len, const uint8_t *src,
     z_stream zstream = { 0 };
     int zret;
 
-    zstream.next_in   = src;
+    zstream.next_in   = (uint8_t *)src;
     zstream.avail_in  = size;
     zstream.next_out  = dst;
     zstream.avail_out = *len;
@@ -105,9 +337,9 @@ static int tiff_uncompress(uint8_t *dst, unsigned long *len, const uint8_t *src,
     return zret == Z_STREAM_END ? Z_OK : zret;
 }
 
-static int tiff_unpack_zlib(TiffContext *s, uint8_t *dst, int stride,
-                            const uint8_t *src, int size,
-                            int width, int lines)
+static int tiff_unpack_zlib(TiffContext *s, AVFrame *p, uint8_t *dst, int stride,
+                            const uint8_t *src, int size, int width, int lines,
+                            int strip_start, int is_yuv)
 {
     uint8_t *zbuf;
     unsigned long outlen;
@@ -116,6 +348,13 @@ static int tiff_unpack_zlib(TiffContext *s, uint8_t *dst, int stride,
     zbuf   = av_malloc(outlen);
     if (!zbuf)
         return AVERROR(ENOMEM);
+    if (s->fill_order) {
+        if ((ret = deinvert_buffer(s, src, size)) < 0) {
+            av_free(zbuf);
+            return ret;
+        }
+        src = s->deinvert_buf;
+    }
     ret = tiff_uncompress(zbuf, &outlen, src, size);
     if (ret != Z_OK) {
         av_log(s->avctx, AV_LOG_ERROR,
@@ -126,7 +365,15 @@ static int tiff_unpack_zlib(TiffContext *s, uint8_t *dst, int stride,
     }
     src = zbuf;
     for (line = 0; line < lines; line++) {
-        memcpy(dst, src, width);
+        if (s->bpp < 8 && s->avctx->pix_fmt == AV_PIX_FMT_PAL8) {
+            horizontal_fill(s->bpp, dst, 1, src, 0, width, 0);
+        } else {
+            memcpy(dst, src, width);
+        }
+        if (is_yuv) {
+            unpack_yuv(s, p, dst, strip_start + line);
+            line += s->subsampling[1] - 1;
+        }
         dst += stride;
         src += width;
     }
@@ -135,11 +382,76 @@ static int tiff_unpack_zlib(TiffContext *s, uint8_t *dst, int stride,
 }
 #endif
 
+#if CONFIG_LZMA
+/* Decode src into dst with one lzma_code() call; *len is dst capacity on entry, bytes produced after decoding. */
+static int tiff_uncompress_lzma(uint8_t *dst, uint64_t *len, const uint8_t *src,
+                                int size)
+{
+    lzma_stream strm = LZMA_STREAM_INIT;
+    lzma_ret rc;
+    strm.next_out  = dst;
+    strm.avail_out = *len;
+    strm.next_in   = (uint8_t *)src;
+    strm.avail_in  = size;
+    rc = lzma_stream_decoder(&strm, UINT64_MAX, 0);
+    if (rc != LZMA_OK) {
+        av_log(NULL, AV_LOG_ERROR, "LZMA init error: %d\n", rc);
+        return rc;
+    }
+    rc = lzma_code(&strm, LZMA_RUN);
+    lzma_end(&strm);
+    *len = strm.total_out;
+    return rc != LZMA_STREAM_END ? rc : LZMA_OK;
+}
+
+/* Decode one LZMA-compressed strip into a scratch buffer, then copy it row by row into dst. */
+static int tiff_unpack_lzma(TiffContext *s, AVFrame *p, uint8_t *dst, int stride,
+                            const uint8_t *src, int size, int width, int lines,
+                            int strip_start, int is_yuv)
+{
+    uint64_t outlen = (uint64_t)width * lines; /* widen first: the int product can overflow */
+    int ret, line;
+    uint8_t *buf = outlen <= INT_MAX ? av_malloc(outlen) : NULL; /* never truncate the size */
+    if (!buf)
+        return AVERROR(ENOMEM);
+    if (s->fill_order) {
+        if ((ret = deinvert_buffer(s, src, size)) < 0) {
+            av_free(buf);
+            return ret;
+        }
+        src = s->deinvert_buf;
+    }
+    ret = tiff_uncompress_lzma(buf, &outlen, src, size);
+    if (ret != LZMA_OK) {
+        av_log(s->avctx, AV_LOG_ERROR, "Uncompressing failed (%"PRIu64" of %"PRIu64
+               ") with error %d\n", outlen, (uint64_t)width * lines, ret);
+        av_free(buf);
+        return AVERROR_UNKNOWN;
+    }
+    src = buf;
+    for (line = 0; line < lines; line++) {
+        if (s->bpp < 8 && s->avctx->pix_fmt == AV_PIX_FMT_PAL8) {
+            horizontal_fill(s->bpp, dst, 1, src, 0, width, 0);
+        } else {
+            memcpy(dst, src, width);
+        }
+        if (is_yuv) {
+            unpack_yuv(s, p, dst, strip_start + line);
+            line += s->subsampling[1] - 1;
+        }
+        dst += stride;
+        src += width;
+    }
+    av_free(buf);
+    return 0;
+}
+#endif
 
 static int tiff_unpack_fax(TiffContext *s, uint8_t *dst, int stride,
-                           const uint8_t *src, int size, int lines)
+                           const uint8_t *src, int size, int width, int lines)
 {
     int i, ret = 0;
+    int line;
     uint8_t *src2 = av_malloc((unsigned)size +
                               AV_INPUT_BUFFER_PADDING_SIZE);
 
@@ -148,11 +460,7 @@ static int tiff_unpack_fax(TiffContext *s, uint8_t *dst, int stride,
                "Error allocating temporary buffer\n");
         return AVERROR(ENOMEM);
     }
-    if (s->fax_opts & 2) {
-        avpriv_request_sample(s->avctx, "Uncompressed fax mode");
-        av_free(src2);
-        return AVERROR_PATCHWELCOME;
-    }
+
     if (!s->fill_order) {
         memcpy(src2, src, size);
     } else {
@@ -162,16 +470,26 @@ static int tiff_unpack_fax(TiffContext *s, uint8_t *dst, int stride,
     memset(src2 + size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
     ret = ff_ccitt_unpack(s->avctx, src2, size, dst, lines, stride,
                           s->compr, s->fax_opts);
+    if (s->bpp < 8 && s->avctx->pix_fmt == AV_PIX_FMT_PAL8)
+        for (line = 0; line < lines; line++) {
+            horizontal_fill(s->bpp, dst, 1, dst, 0, width, 0);
+            dst += stride;
+        }
     av_free(src2);
     return ret;
 }
 
-static int tiff_unpack_strip(TiffContext *s, uint8_t *dst, int stride,
-                             const uint8_t *src, int size, int lines)
+static int tiff_unpack_strip(TiffContext *s, AVFrame *p, uint8_t *dst, int stride,
+                             const uint8_t *src, int size, int strip_start, int lines)
 {
     PutByteContext pb;
     int c, line, pixels, code, ret;
+    const uint8_t *ssrc = src;
     int width = ((s->width * s->bpp) + 7) >> 3;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(p->format);
+    int is_yuv = !(desc->flags & AV_PIX_FMT_FLAG_RGB) &&
+                 (desc->flags & AV_PIX_FMT_FLAG_PLANAR) &&
+                 desc->nb_components >= 3;
 
     if (s->planar)
         width /= s->bppcount;
@@ -179,9 +497,27 @@ static int tiff_unpack_strip(TiffContext *s, uint8_t *dst, int stride,
     if (size <= 0)
         return AVERROR_INVALIDDATA;
 
+    if (is_yuv) {
+        int bytes_per_row = (((s->width - 1) / s->subsampling[0] + 1) * s->bpp *
+                            s->subsampling[0] * s->subsampling[1] + 7) >> 3;
+        av_fast_padded_malloc(&s->yuv_line, &s->yuv_line_size, bytes_per_row);
+        if (s->yuv_line == NULL) {
+            av_log(s->avctx, AV_LOG_ERROR, "Not enough memory\n");
+            return AVERROR(ENOMEM);
+        }
+        dst = s->yuv_line;
+        stride = 0;
+
+        width = (s->width - 1) / s->subsampling[0] + 1;
+        width = width * s->subsampling[0] * s->subsampling[1] + 2*width;
+        av_assert0(width <= bytes_per_row);
+        av_assert0(s->bpp == 24);
+    }
+
     if (s->compr == TIFF_DEFLATE || s->compr == TIFF_ADOBE_DEFLATE) {
 #if CONFIG_ZLIB
-        return tiff_unpack_zlib(s, dst, stride, src, size, width, lines);
+        return tiff_unpack_zlib(s, p, dst, stride, src, size, width, lines,
+                                strip_start, is_yuv);
 #else
         av_log(s->avctx, AV_LOG_ERROR,
                "zlib support not enabled, "
@@ -189,7 +525,25 @@ static int tiff_unpack_strip(TiffContext *s, uint8_t *dst, int stride,
         return AVERROR(ENOSYS);
 #endif
     }
+    if (s->compr == TIFF_LZMA) {
+#if CONFIG_LZMA
+        return tiff_unpack_lzma(s, p, dst, stride, src, size, width, lines,
+                                strip_start, is_yuv);
+#else
+        av_log(s->avctx, AV_LOG_ERROR,
+               "LZMA support not enabled\n");
+        return AVERROR(ENOSYS);
+#endif
+    }
     if (s->compr == TIFF_LZW) {
+        if (s->fill_order) {
+            if ((ret = deinvert_buffer(s, src, size)) < 0)
+                return ret;
+            ssrc = src = s->deinvert_buf;
+        }
+        if (size > 1 && !src[0] && (src[1]&1)) {
+            av_log(s->avctx, AV_LOG_ERROR, "Old style LZW is unsupported\n");
+        }
         if ((ret = ff_lzw_decode_init(s->lzw, 8, src, size, FF_LZW_TIFF)) < 0) {
             av_log(s->avctx, AV_LOG_ERROR, "Error initializing LZW decoder\n");
             return ret;
@@ -201,6 +555,12 @@ static int tiff_unpack_strip(TiffContext *s, uint8_t *dst, int stride,
                        pixels, width);
                 return AVERROR_INVALIDDATA;
             }
+            if (s->bpp < 8 && s->avctx->pix_fmt == AV_PIX_FMT_PAL8)
+                horizontal_fill(s->bpp, dst, 1, dst, 0, width, 0);
+            if (is_yuv) {
+                unpack_yuv(s, p, dst, strip_start + line);
+                line += s->subsampling[1] - 1;
+            }
             dst += stride;
         }
         return 0;
@@ -208,49 +568,91 @@ static int tiff_unpack_strip(TiffContext *s, uint8_t *dst, int stride,
     if (s->compr == TIFF_CCITT_RLE ||
         s->compr == TIFF_G3        ||
         s->compr == TIFF_G4) {
-        return tiff_unpack_fax(s, dst, stride, src, size, lines);
+        if (is_yuv)
+            return AVERROR_INVALIDDATA;
+
+        return tiff_unpack_fax(s, dst, stride, src, size, width, lines);
     }
 
     bytestream2_init(&s->gb, src, size);
-    bytestream2_init_writer(&pb, dst, stride * lines);
+    bytestream2_init_writer(&pb, dst, is_yuv ? s->yuv_line_size : (stride * lines));
 
     for (line = 0; line < lines; line++) {
+        if (src - ssrc > size) {
+            av_log(s->avctx, AV_LOG_ERROR, "Source data overread\n");
+            return AVERROR_INVALIDDATA;
+        }
+
         if (bytestream2_get_bytes_left(&s->gb) == 0 || bytestream2_get_eof(&pb))
             break;
         bytestream2_seek_p(&pb, stride * line, SEEK_SET);
         switch (s->compr) {
         case TIFF_RAW:
+            if (ssrc + size - src < width)
+                return AVERROR_INVALIDDATA;
+
             if (!s->fill_order) {
-                bytestream2_copy_buffer(&pb, &s->gb, width);
+                horizontal_fill(s->bpp * (s->avctx->pix_fmt == AV_PIX_FMT_PAL8),
+                                dst, 1, src, 0, width, 0);
             } else {
                 int i;
                 for (i = 0; i < width; i++)
-                    bytestream2_put_byte(&pb, ff_reverse[bytestream2_get_byte(&s->gb)]);
+                    dst[i] = ff_reverse[src[i]];
             }
+            src += width;
             break;
         case TIFF_PACKBITS:
             for (pixels = 0; pixels < width;) {
-                code = ff_u8_to_s8(bytestream2_get_byte(&s->gb));
+                if (ssrc + size - src < 2) {
+                    av_log(s->avctx, AV_LOG_ERROR, "Read went out of bounds\n");
+                    return AVERROR_INVALIDDATA;
+                }
+                code = s->fill_order ? (int8_t) ff_reverse[*src++]: (int8_t) *src++;
                 if (code >= 0) {
                     code++;
-                    bytestream2_copy_buffer(&pb, &s->gb, code);
+                    if (pixels + code > width ||
+                        ssrc + size - src < code) {
+                        av_log(s->avctx, AV_LOG_ERROR,
+                               "Copy went out of bounds\n");
+                        return AVERROR_INVALIDDATA;
+                    }
+                    horizontal_fill(s->bpp * (s->avctx->pix_fmt == AV_PIX_FMT_PAL8),
+                                    dst, 1, src, 0, code, pixels);
+                    src    += code;
                     pixels += code;
                 } else if (code != -128) { // -127..-1
                     code = (-code) + 1;
-                    c    = bytestream2_get_byte(&s->gb);
-                    bytestream2_set_buffer(&pb, c, code);
+                    if (pixels + code > width) {
+                        av_log(s->avctx, AV_LOG_ERROR,
+                               "Run went out of bounds\n");
+                        return AVERROR_INVALIDDATA;
+                    }
+                    c = *src++;
+                    horizontal_fill(s->bpp * (s->avctx->pix_fmt == AV_PIX_FMT_PAL8),
+                                    dst, 0, NULL, c, code, pixels);
                     pixels += code;
                 }
             }
+            if (s->fill_order) {
+                int i;
+                for (i = 0; i < width; i++)
+                    dst[i] = ff_reverse[dst[i]];
+            }
             break;
         }
+        if (is_yuv) {
+            unpack_yuv(s, p, dst, strip_start + line);
+            line += s->subsampling[1] - 1;
+        }
+        dst += stride;
     }
     return 0;
 }
 
-static int init_image(TiffContext *s, AVFrame *frame)
+static int init_image(TiffContext *s, ThreadFrame *frame)
 {
     int ret;
+    int create_gray_palette = 0;
 
     // make sure there is no aliasing in the following switch
     if (s->bpp >= 100 || s->bppcount >= 10) {
@@ -262,13 +664,40 @@ static int init_image(TiffContext *s, AVFrame *frame)
 
     switch (s->planar * 1000 + s->bpp * 10 + s->bppcount) {
     case 11:
-        s->avctx->pix_fmt = AV_PIX_FMT_MONOBLACK;
+        if (!s->palette_is_set) {
+            s->avctx->pix_fmt = AV_PIX_FMT_MONOBLACK;
+            break;
+        }
+    case 21:
+    case 41:
+        s->avctx->pix_fmt = AV_PIX_FMT_PAL8;
+        if (!s->palette_is_set) {
+            create_gray_palette = 1;
+        }
         break;
     case 81:
         s->avctx->pix_fmt = s->palette_is_set ? AV_PIX_FMT_PAL8 : AV_PIX_FMT_GRAY8;
         break;
     case 243:
-        s->avctx->pix_fmt = AV_PIX_FMT_RGB24;
+        if (s->photometric == TIFF_PHOTOMETRIC_YCBCR) {
+            if (s->subsampling[0] == 1 && s->subsampling[1] == 1) {
+                s->avctx->pix_fmt = AV_PIX_FMT_YUV444P;
+            } else if (s->subsampling[0] == 2 && s->subsampling[1] == 1) {
+                s->avctx->pix_fmt = AV_PIX_FMT_YUV422P;
+            } else if (s->subsampling[0] == 4 && s->subsampling[1] == 1) {
+                s->avctx->pix_fmt = AV_PIX_FMT_YUV411P;
+            } else if (s->subsampling[0] == 1 && s->subsampling[1] == 2) {
+                s->avctx->pix_fmt = AV_PIX_FMT_YUV440P;
+            } else if (s->subsampling[0] == 2 && s->subsampling[1] == 2) {
+                s->avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+            } else if (s->subsampling[0] == 4 && s->subsampling[1] == 4) {
+                s->avctx->pix_fmt = AV_PIX_FMT_YUV410P;
+            } else {
+                av_log(s->avctx, AV_LOG_ERROR, "Unsupported YCbCr subsampling\n");
+                return AVERROR_PATCHWELCOME;
+            }
+        } else
+            s->avctx->pix_fmt = AV_PIX_FMT_RGB24;
         break;
     case 161:
         s->avctx->pix_fmt = s->le ? AV_PIX_FMT_GRAY16LE : AV_PIX_FMT_GRAY16BE;
@@ -283,10 +712,10 @@ static int init_image(TiffContext *s, AVFrame *frame)
         s->avctx->pix_fmt = AV_PIX_FMT_RGBA;
         break;
     case 483:
-        s->avctx->pix_fmt = s->le ? AV_PIX_FMT_RGB48LE : AV_PIX_FMT_RGB48BE;
+        s->avctx->pix_fmt = s->le ? AV_PIX_FMT_RGB48LE  : AV_PIX_FMT_RGB48BE;
         break;
     case 644:
-        s->avctx->pix_fmt = s->le ? AV_PIX_FMT_RGBA64LE : AV_PIX_FMT_RGBA64BE;
+        s->avctx->pix_fmt = s->le ? AV_PIX_FMT_RGBA64LE  : AV_PIX_FMT_RGBA64BE;
         break;
     case 1243:
         s->avctx->pix_fmt = AV_PIX_FMT_GBRP;
@@ -306,64 +735,80 @@ static int init_image(TiffContext *s, AVFrame *frame)
                s->bpp, s->bppcount);
         return AVERROR_INVALIDDATA;
     }
+
+    if (s->photometric == TIFF_PHOTOMETRIC_YCBCR) {
+        const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(s->avctx->pix_fmt);
+        if((desc->flags & AV_PIX_FMT_FLAG_RGB) ||
+           !(desc->flags & AV_PIX_FMT_FLAG_PLANAR) ||
+           desc->nb_components < 3) {
+            av_log(s->avctx, AV_LOG_ERROR, "Unsupported YCbCr variant\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
     if (s->width != s->avctx->width || s->height != s->avctx->height) {
         ret = ff_set_dimensions(s->avctx, s->width, s->height);
         if (ret < 0)
             return ret;
     }
-    if ((ret = ff_get_buffer(s->avctx, frame, 0)) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_thread_get_buffer(s->avctx, frame, 0)) < 0)
         return ret;
-    }
     if (s->avctx->pix_fmt == AV_PIX_FMT_PAL8) {
-        memcpy(frame->data[1], s->palette, sizeof(s->palette));
+        if (!create_gray_palette)
+            memcpy(frame->f->data[1], s->palette, sizeof(s->palette));
+        else {
+            /* make default grayscale pal */
+            int i;
+            uint32_t *pal = (uint32_t *)frame->f->data[1];
+            for (i = 0; i < 1<<s->bpp; i++)
+                pal[i] = 0xFFU << 24 | i * 255 / ((1<<s->bpp) - 1) * 0x010101;
+        }
     }
     return 0;
 }
 
-static int tiff_decode_tag(TiffContext *s)
+static void set_sar(TiffContext *s, unsigned tag, unsigned num, unsigned den)
 {
-    unsigned tag, type, count, off, value = 0;
+    int offset = tag == TIFF_YRES ? 2 : 0; /* res[2..3] hold the TIFF_YRES num/den, res[0..1] the other (X) pair */
+    s->res[offset++] = num;
+    s->res[offset]   = den;
+    if (s->res[0] && s->res[1] && s->res[2] && s->res[3]) /* only once all four terms are non-zero */
+        av_reduce(&s->avctx->sample_aspect_ratio.num, &s->avctx->sample_aspect_ratio.den,
+                  s->res[2] * (uint64_t)s->res[1], s->res[0] * (uint64_t)s->res[3], INT32_MAX);
+}
+
+static int tiff_decode_tag(TiffContext *s, AVFrame *frame)
+{
+    unsigned tag, type, count, off, value = 0, value2 = 0;
     int i, start;
+    int pos;
+    int ret;
+    double *dp;
 
-    if (bytestream2_get_bytes_left(&s->gb) < 12)
-        return AVERROR_INVALIDDATA;
-    tag   = tget_short(&s->gb, s->le);
-    type  = tget_short(&s->gb, s->le);
-    count = tget_long(&s->gb, s->le);
-    off   = tget_long(&s->gb, s->le);
-    start = bytestream2_tell(&s->gb);
-
-    if (type == 0 || type >= FF_ARRAY_ELEMS(type_sizes)) {
-        av_log(s->avctx, AV_LOG_DEBUG, "Unknown tiff type (%u) encountered\n",
-               type);
-        return 0;
+    ret = ff_tread_tag(&s->gb, s->le, &tag, &type, &count, &start);
+    if (ret < 0) {
+        goto end;
     }
 
+    off = bytestream2_tell(&s->gb);
     if (count == 1) {
         switch (type) {
         case TIFF_BYTE:
         case TIFF_SHORT:
-            bytestream2_seek(&s->gb, -4, SEEK_CUR);
-            value = tget(&s->gb, type, s->le);
-            break;
         case TIFF_LONG:
-            value = off;
+            value = ff_tget(&s->gb, type, s->le);
+            break;
+        case TIFF_RATIONAL:
+            value  = ff_tget(&s->gb, TIFF_LONG, s->le);
+            value2 = ff_tget(&s->gb, TIFF_LONG, s->le);
             break;
         case TIFF_STRING:
             if (count <= 4) {
-                bytestream2_seek(&s->gb, -4, SEEK_CUR);
                 break;
             }
         default:
             value = UINT_MAX;
-            bytestream2_seek(&s->gb, off, SEEK_SET);
         }
-    } else {
-        if (count <= 4 && type_sizes[type] * count <= 4)
-            bytestream2_seek(&s->gb, -4, SEEK_CUR);
-        else
-            bytestream2_seek(&s->gb, off, SEEK_SET);
     }
 
     switch (tag) {
@@ -374,26 +819,25 @@ static int tiff_decode_tag(TiffContext *s)
         s->height = value;
         break;
     case TIFF_BPP:
-        s->bppcount = count;
-        if (count > 4) {
+        if (count > 4U) {
             av_log(s->avctx, AV_LOG_ERROR,
                    "This format is not supported (bpp=%d, %d components)\n",
-                   s->bpp, count);
+                   value, count);
             return AVERROR_INVALIDDATA;
         }
+        s->bppcount = count;
         if (count == 1)
             s->bpp = value;
         else {
             switch (type) {
             case TIFF_BYTE:
-                s->bpp = (off & 0xFF) + ((off >> 8) & 0xFF) +
-                         ((off >> 16) & 0xFF) + ((off >> 24) & 0xFF);
-                break;
             case TIFF_SHORT:
             case TIFF_LONG:
                 s->bpp = 0;
+                if (bytestream2_get_bytes_left(&s->gb) < type_sizes[type] * count)
+                    return AVERROR_INVALIDDATA;
                 for (i = 0; i < count; i++)
-                    s->bpp += tget(&s->gb, type, s->le);
+                    s->bpp += ff_tget(&s->gb, type, s->le);
                 break;
             default:
                 s->bpp = -1;
@@ -406,6 +850,11 @@ static int tiff_decode_tag(TiffContext *s)
                    "Samples per pixel requires a single value, many provided\n");
             return AVERROR_INVALIDDATA;
         }
+        if (value > 4U) {
+            av_log(s->avctx, AV_LOG_ERROR,
+                   "Samples per pixel %d is too large\n", value);
+            return AVERROR_INVALIDDATA;
+        }
         if (s->bppcount == 1)
             s->bpp *= value;
         s->bppcount = value;
@@ -436,8 +885,12 @@ static int tiff_decode_tag(TiffContext *s)
             avpriv_report_missing_feature(s->avctx, "JPEG compression");
             return AVERROR_PATCHWELCOME;
         case TIFF_LZMA:
-            avpriv_report_missing_feature(s->avctx, "LZMA compression");
-            return AVERROR_PATCHWELCOME;
+#if CONFIG_LZMA
+            break;
+#else
+            av_log(s->avctx, AV_LOG_ERROR, "LZMA not compiled in\n");
+            return AVERROR(ENOSYS);
+#endif
         default:
             av_log(s->avctx, AV_LOG_ERROR, "Unknown compression method %i\n",
                    s->compr);
@@ -471,6 +924,17 @@ static int tiff_decode_tag(TiffContext *s)
         s->strips = count;
         s->sstype = type;
         break;
+    case TIFF_XRES:
+    case TIFF_YRES:
+        set_sar(s, tag, value, value2);
+        break;
+    case TIFF_TILE_BYTE_COUNTS:
+    case TIFF_TILE_LENGTH:
+    case TIFF_TILE_OFFSETS:
+    case TIFF_TILE_WIDTH:
+        av_log(s->avctx, AV_LOG_ERROR, "Tiled images are not supported\n");
+        return AVERROR_PATCHWELCOME;
+        break;
     case TIFF_PREDICTOR:
         s->predictor = value;
         break;
@@ -480,11 +944,11 @@ static int tiff_decode_tag(TiffContext *s)
         case TIFF_PHOTOMETRIC_BLACK_IS_ZERO:
         case TIFF_PHOTOMETRIC_RGB:
         case TIFF_PHOTOMETRIC_PALETTE:
+        case TIFF_PHOTOMETRIC_YCBCR:
             s->photometric = value;
             break;
         case TIFF_PHOTOMETRIC_ALPHA_MASK:
         case TIFF_PHOTOMETRIC_SEPARATED:
-        case TIFF_PHOTOMETRIC_YCBCR:
         case TIFF_PHOTOMETRIC_CIE_LAB:
         case TIFF_PHOTOMETRIC_ICC_LAB:
         case TIFF_PHOTOMETRIC_ITU_LAB:
@@ -516,15 +980,17 @@ static int tiff_decode_tag(TiffContext *s)
         if (count / 3 > 256 ||
             bytestream2_get_bytes_left(&s->gb) < count / 3 * off * 3)
             return AVERROR_INVALIDDATA;
+
         pal_gb[0] = pal_gb[1] = pal_gb[2] = s->gb;
         bytestream2_skip(&pal_gb[1], count / 3 * off);
         bytestream2_skip(&pal_gb[2], count / 3 * off * 2);
+
         off = (type_sizes[type] - 1) << 3;
         for (i = 0; i < count / 3; i++) {
             uint32_t p = 0xFF000000;
-            p |= (tget(&pal_gb[0], type, s->le) >> off) << 16;
-            p |= (tget(&pal_gb[1], type, s->le) >> off) << 8;
-            p |=  tget(&pal_gb[2], type, s->le) >> off;
+            p |= (ff_tget(&pal_gb[0], type, s->le) >> off) << 16;
+            p |= (ff_tget(&pal_gb[1], type, s->le) >> off) << 8;
+            p |=  ff_tget(&pal_gb[2], type, s->le) >> off;
             s->palette[i] = p;
         }
         s->palette_is_set = 1;
@@ -533,6 +999,19 @@ static int tiff_decode_tag(TiffContext *s)
     case TIFF_PLANAR:
         s->planar = value == 2;
         break;
+    case TIFF_YCBCR_SUBSAMPLING:
+        if (count != 2) {
+            av_log(s->avctx, AV_LOG_ERROR, "subsample count invalid\n");
+            return AVERROR_INVALIDDATA;
+        }
+        for (i = 0; i < count; i++) {
+            s->subsampling[i] = ff_tget(&s->gb, type, s->le);
+            if (s->subsampling[i] <= 0) {
+                av_log(s->avctx, AV_LOG_ERROR, "subsampling %d is invalid\n", s->subsampling[i]);
+                return AVERROR_INVALIDDATA;
+            }
+        }
+        break;
     case TIFF_T4OPTIONS:
         if (s->compr == TIFF_G3)
             s->fax_opts = value;
@@ -541,6 +1020,137 @@ static int tiff_decode_tag(TiffContext *s)
         if (s->compr == TIFF_G4)
             s->fax_opts = value;
         break;
+#define ADD_METADATA(count, name, sep)\
+    if ((ret = add_metadata(count, type, name, sep, s, frame)) < 0) {\
+        av_log(s->avctx, AV_LOG_ERROR, "Error allocating temporary buffer\n");\
+        goto end;\
+    }
+    case TIFF_MODEL_PIXEL_SCALE:
+        ADD_METADATA(count, "ModelPixelScaleTag", NULL);
+        break;
+    case TIFF_MODEL_TRANSFORMATION:
+        ADD_METADATA(count, "ModelTransformationTag", NULL);
+        break;
+    case TIFF_MODEL_TIEPOINT:
+        ADD_METADATA(count, "ModelTiepointTag", NULL);
+        break;
+    case TIFF_GEO_KEY_DIRECTORY:
+        ADD_METADATA(1, "GeoTIFF_Version", NULL);
+        ADD_METADATA(2, "GeoTIFF_Key_Revision", ".");
+        s->geotag_count   = ff_tget_short(&s->gb, s->le);
+        if (s->geotag_count > count / 4 - 1) {
+            s->geotag_count = count / 4 - 1;
+            av_log(s->avctx, AV_LOG_WARNING, "GeoTIFF key directory buffer shorter than specified\n");
+        }
+        if (bytestream2_get_bytes_left(&s->gb) < s->geotag_count * sizeof(int16_t) * 4) {
+            s->geotag_count = 0;
+            return -1;
+        }
+        s->geotags = av_mallocz_array(s->geotag_count, sizeof(TiffGeoTag));
+        if (!s->geotags) {
+            av_log(s->avctx, AV_LOG_ERROR, "Error allocating temporary buffer\n");
+            s->geotag_count = 0;
+            goto end;
+        }
+        for (i = 0; i < s->geotag_count; i++) {
+            s->geotags[i].key    = ff_tget_short(&s->gb, s->le);
+            s->geotags[i].type   = ff_tget_short(&s->gb, s->le);
+            s->geotags[i].count  = ff_tget_short(&s->gb, s->le);
+
+            if (!s->geotags[i].type)
+                s->geotags[i].val  = get_geokey_val(s->geotags[i].key, ff_tget_short(&s->gb, s->le));
+            else
+                s->geotags[i].offset = ff_tget_short(&s->gb, s->le);
+        }
+        break;
+    case TIFF_GEO_DOUBLE_PARAMS:
+        if (count >= INT_MAX / sizeof(int64_t))
+            return AVERROR_INVALIDDATA;
+        if (bytestream2_get_bytes_left(&s->gb) < count * sizeof(int64_t))
+            return AVERROR_INVALIDDATA;
+        dp = av_malloc_array(count, sizeof(double));
+        if (!dp) {
+            av_log(s->avctx, AV_LOG_ERROR, "Error allocating temporary buffer\n");
+            goto end;
+        }
+        for (i = 0; i < count; i++)
+            dp[i] = ff_tget_double(&s->gb, s->le);
+        for (i = 0; i < s->geotag_count; i++) {
+            if (s->geotags[i].type == TIFF_GEO_DOUBLE_PARAMS) {
+                if (s->geotags[i].count == 0
+                    || s->geotags[i].offset + s->geotags[i].count > count) {
+                    av_log(s->avctx, AV_LOG_WARNING, "Invalid GeoTIFF key %d\n", s->geotags[i].key);
+                } else {
+                    char *ap = doubles2str(&dp[s->geotags[i].offset], s->geotags[i].count, ", ");
+                    if (!ap) {
+                        av_log(s->avctx, AV_LOG_ERROR, "Error allocating temporary buffer\n");
+                        av_freep(&dp);
+                        return AVERROR(ENOMEM);
+                    }
+                    s->geotags[i].val = ap;
+                }
+            }
+        }
+        av_freep(&dp);
+        break;
+    case TIFF_GEO_ASCII_PARAMS:
+        pos = bytestream2_tell(&s->gb);
+        for (i = 0; i < s->geotag_count; i++) {
+            if (s->geotags[i].type == TIFF_GEO_ASCII_PARAMS) {
+                if (s->geotags[i].count == 0
+                    || s->geotags[i].offset +  s->geotags[i].count > count) {
+                    av_log(s->avctx, AV_LOG_WARNING, "Invalid GeoTIFF key %d\n", s->geotags[i].key);
+                } else {
+                    char *ap;
+
+                    bytestream2_seek(&s->gb, pos + s->geotags[i].offset, SEEK_SET);
+                    if (bytestream2_get_bytes_left(&s->gb) < s->geotags[i].count)
+                        return AVERROR_INVALIDDATA;
+                    ap = av_malloc(s->geotags[i].count);
+                    if (!ap) {
+                        av_log(s->avctx, AV_LOG_ERROR, "Error allocating temporary buffer\n");
+                        return AVERROR(ENOMEM);
+                    }
+                    bytestream2_get_bufferu(&s->gb, ap, s->geotags[i].count);
+                    ap[s->geotags[i].count - 1] = '\0'; //replace the "|" delimiter with a 0 byte
+                    s->geotags[i].val = ap;
+                }
+            }
+        }
+        break;
+    case TIFF_ARTIST:
+        ADD_METADATA(count, "artist", NULL);
+        break;
+    case TIFF_COPYRIGHT:
+        ADD_METADATA(count, "copyright", NULL);
+        break;
+    case TIFF_DATE:
+        ADD_METADATA(count, "date", NULL);
+        break;
+    case TIFF_DOCUMENT_NAME:
+        ADD_METADATA(count, "document_name", NULL);
+        break;
+    case TIFF_HOST_COMPUTER:
+        ADD_METADATA(count, "computer", NULL);
+        break;
+    case TIFF_IMAGE_DESCRIPTION:
+        ADD_METADATA(count, "description", NULL);
+        break;
+    case TIFF_MAKE:
+        ADD_METADATA(count, "make", NULL);
+        break;
+    case TIFF_MODEL:
+        ADD_METADATA(count, "model", NULL);
+        break;
+    case TIFF_PAGE_NAME:
+        ADD_METADATA(count, "page_name", NULL);
+        break;
+    case TIFF_PAGE_NUMBER:
+        ADD_METADATA(count, "page_number", " / ");
+        break;
+    case TIFF_SOFTWARE_NAME:
+        ADD_METADATA(count, "software", NULL);
+        break;
     default:
         if (s->avctx->err_recognition & AV_EF_EXPLODE) {
             av_log(s->avctx, AV_LOG_ERROR,
@@ -549,6 +1159,14 @@ static int tiff_decode_tag(TiffContext *s)
             return AVERROR_INVALIDDATA;
         }
     }
+end:
+    if (s->bpp > 64U) {
+        av_log(s->avctx, AV_LOG_ERROR,
+                "This format is not supported (bpp=%d, %d components)\n",
+                s->bpp, count);
+        s->bpp = 0;
+        return AVERROR_INVALIDDATA;
+    }
     bytestream2_seek(&s->gb, start, SEEK_SET);
     return 0;
 }
@@ -558,8 +1176,9 @@ static int decode_frame(AVCodecContext *avctx,
 {
     TiffContext *const s = avctx->priv_data;
     AVFrame *const p = data;
+    ThreadFrame frame = { .f = data };
     unsigned off;
-    int id, le, ret, plane, planes;
+    int le, ret, plane, planes;
     int i, j, entries, stride;
     unsigned soff, ssize;
     uint8_t *dst;
@@ -569,48 +1188,56 @@ static int decode_frame(AVCodecContext *avctx,
     bytestream2_init(&s->gb, avpkt->data, avpkt->size);
 
     // parse image header
-    if (avpkt->size < 8)
-        return AVERROR_INVALIDDATA;
-    id = bytestream2_get_le16(&s->gb);
-    if (id == 0x4949)
-        le = 1;
-    else if (id == 0x4D4D)
-        le = 0;
-    else {
-        av_log(avctx, AV_LOG_ERROR, "TIFF header not found\n");
+    if ((ret = ff_tdecode_header(&s->gb, &le, &off))) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid TIFF header\n");
+        return ret;
+    } else if (off >= UINT_MAX - 14 || avpkt->size < off + 14) {
+        av_log(avctx, AV_LOG_ERROR, "IFD offset is greater than image size\n");
         return AVERROR_INVALIDDATA;
     }
     s->le          = le;
+    // TIFF_BPP is not a required tag and defaults to 1
+    s->bppcount    = s->bpp = 1;
     s->photometric = TIFF_PHOTOMETRIC_NONE;
     s->compr       = TIFF_RAW;
     s->fill_order  = 0;
-    // As TIFF 6.0 specification puts it "An arbitrary but carefully chosen number
-    // that further identifies the file as a TIFF file"
-    if (tget_short(&s->gb, le) != 42) {
-        av_log(avctx, AV_LOG_ERROR,
-               "The answer to life, universe and everything is not correct!\n");
-        return AVERROR_INVALIDDATA;
-    }
+    free_geotags(s);
+
     // Reset these offsets so we can tell if they were set this frame
     s->stripsizesoff = s->strippos = 0;
     /* parse image file directory */
-    off = tget_long(&s->gb, le);
-    if (off >= UINT_MAX - 14 || avpkt->size < off + 14) {
-        av_log(avctx, AV_LOG_ERROR, "IFD offset is greater than image size\n");
-        return AVERROR_INVALIDDATA;
-    }
     bytestream2_seek(&s->gb, off, SEEK_SET);
-    entries = tget_short(&s->gb, le);
+    entries = ff_tget_short(&s->gb, le);
+    if (bytestream2_get_bytes_left(&s->gb) < entries * 12)
+        return AVERROR_INVALIDDATA;
     for (i = 0; i < entries; i++) {
-        if ((ret = tiff_decode_tag(s)) < 0)
+        if ((ret = tiff_decode_tag(s, p)) < 0)
+            return ret;
+    }
+
+    for (i = 0; i<s->geotag_count; i++) {
+        const char *keyname = get_geokey_name(s->geotags[i].key);
+        if (!keyname) {
+            av_log(avctx, AV_LOG_WARNING, "Unknown or unsupported GeoTIFF key %d\n", s->geotags[i].key);
+            continue;
+        }
+        if (get_geokey_type(s->geotags[i].key) != s->geotags[i].type) {
+            av_log(avctx, AV_LOG_WARNING, "Type of GeoTIFF key %d is wrong\n", s->geotags[i].key);
+            continue;
+        }
+        ret = av_dict_set(avpriv_frame_get_metadatap(p), keyname, s->geotags[i].val, 0);
+        if (ret<0) {
+            av_log(avctx, AV_LOG_ERROR, "Writing metadata with key '%s' failed\n", keyname);
             return ret;
+        }
     }
+
     if (!s->strippos && !s->stripoff) {
         av_log(avctx, AV_LOG_ERROR, "Image data is missing\n");
         return AVERROR_INVALIDDATA;
     }
     /* now we have the data and may start decoding */
-    if ((ret = init_image(s, p)) < 0)
+    if ((ret = init_image(s, &frame)) < 0)
         return ret;
 
     if (s->strips == 1 && !s->stripsize) {
@@ -619,30 +1246,35 @@ static int decode_frame(AVCodecContext *avctx,
     }
 
     if (s->stripsizesoff) {
-        if (s->stripsizesoff >= avpkt->size)
+        if (s->stripsizesoff >= (unsigned)avpkt->size)
             return AVERROR_INVALIDDATA;
         bytestream2_init(&stripsizes, avpkt->data + s->stripsizesoff,
                          avpkt->size - s->stripsizesoff);
     }
     if (s->strippos) {
-        if (s->strippos >= avpkt->size)
+        if (s->strippos >= (unsigned)avpkt->size)
             return AVERROR_INVALIDDATA;
         bytestream2_init(&stripdata, avpkt->data + s->strippos,
                          avpkt->size - s->strippos);
     }
 
+    if (s->rps <= 0 || s->rps % s->subsampling[1]) {
+        av_log(avctx, AV_LOG_ERROR, "rps %d invalid\n", s->rps);
+        return AVERROR_INVALIDDATA;
+    }
+
     planes = s->planar ? s->bppcount : 1;
     for (plane = 0; plane < planes; plane++) {
         stride = p->linesize[plane];
         dst = p->data[plane];
         for (i = 0; i < s->height; i += s->rps) {
             if (s->stripsizesoff)
-                ssize = tget(&stripsizes, s->sstype, le);
+                ssize = ff_tget(&stripsizes, s->sstype, le);
             else
                 ssize = s->stripsize;
 
             if (s->strippos)
-                soff = tget(&stripdata, s->sot, le);
+                soff = ff_tget(&stripdata, s->sot, le);
             else
                 soff = s->stripoff;
 
@@ -650,7 +1282,7 @@ static int decode_frame(AVCodecContext *avctx,
                 av_log(avctx, AV_LOG_ERROR, "Invalid strip size/offset\n");
                 return AVERROR_INVALIDDATA;
             }
-            if ((ret = tiff_unpack_strip(s, dst, stride, avpkt->data + soff, ssize,
+            if ((ret = tiff_unpack_strip(s, p, dst, stride, avpkt->data + soff, ssize, i,
                                          FFMIN(s->rps, s->height - i))) < 0) {
                 if (avctx->err_recognition & AV_EF_EXPLODE)
                     return ret;
@@ -659,18 +1291,32 @@ static int decode_frame(AVCodecContext *avctx,
             dst += s->rps * stride;
         }
         if (s->predictor == 2) {
+            if (s->photometric == TIFF_PHOTOMETRIC_YCBCR) {
+                av_log(s->avctx, AV_LOG_ERROR, "predictor == 2 with YUV is unsupported");
+                return AVERROR_PATCHWELCOME;
+            }
             dst   = p->data[plane];
             soff  = s->bpp >> 3;
+            if (s->planar)
+                soff  = FFMAX(soff / s->bppcount, 1);
             ssize = s->width * soff;
             if (s->avctx->pix_fmt == AV_PIX_FMT_RGB48LE ||
-                s->avctx->pix_fmt == AV_PIX_FMT_RGBA64LE) {
+                s->avctx->pix_fmt == AV_PIX_FMT_RGBA64LE ||
+                s->avctx->pix_fmt == AV_PIX_FMT_GRAY16LE ||
+                s->avctx->pix_fmt == AV_PIX_FMT_YA16LE ||
+                s->avctx->pix_fmt == AV_PIX_FMT_GBRP16LE ||
+                s->avctx->pix_fmt == AV_PIX_FMT_GBRAP16LE) {
                 for (i = 0; i < s->height; i++) {
                     for (j = soff; j < ssize; j += 2)
                         AV_WL16(dst + j, AV_RL16(dst + j) + AV_RL16(dst + j - soff));
                     dst += stride;
                 }
             } else if (s->avctx->pix_fmt == AV_PIX_FMT_RGB48BE ||
-                       s->avctx->pix_fmt == AV_PIX_FMT_RGBA64BE) {
+                       s->avctx->pix_fmt == AV_PIX_FMT_RGBA64BE ||
+                       s->avctx->pix_fmt == AV_PIX_FMT_GRAY16BE ||
+                       s->avctx->pix_fmt == AV_PIX_FMT_YA16BE ||
+                       s->avctx->pix_fmt == AV_PIX_FMT_GBRP16BE ||
+                       s->avctx->pix_fmt == AV_PIX_FMT_GBRAP16BE) {
                 for (i = 0; i < s->height; i++) {
                     for (j = soff; j < ssize; j += 2)
                         AV_WB16(dst + j, AV_RB16(dst + j) + AV_RB16(dst + j - soff));
@@ -689,7 +1335,7 @@ static int decode_frame(AVCodecContext *avctx,
             dst = p->data[plane];
             for (i = 0; i < s->height; i++) {
                 for (j = 0; j < stride; j++)
-                    dst[j] = 255 - dst[j];
+                    dst[j] = (s->avctx->pix_fmt == AV_PIX_FMT_PAL8 ? (1<<s->bpp) - 1 : 255) - dst[j];
                 dst += stride;
             }
         }
@@ -713,6 +1359,8 @@ static av_cold int tiff_init(AVCodecContext *avctx)
 
     s->width  = 0;
     s->height = 0;
+    s->subsampling[0] =
+    s->subsampling[1] = 1;
     s->avctx  = avctx;
     ff_lzw_decode_open(&s->lzw);
     ff_ccitt_unpack_init();
@@ -724,7 +1372,10 @@ static av_cold int tiff_end(AVCodecContext *avctx)
 {
     TiffContext *const s = avctx->priv_data;
 
+    free_geotags(s);
+
     ff_lzw_decode_close(&s->lzw);
+    av_freep(&s->deinvert_buf);
     return 0;
 }
 
@@ -737,5 +1388,6 @@ AVCodec ff_tiff_decoder = {
     .init           = tiff_init,
     .close          = tiff_end,
     .decode         = decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(tiff_init),
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
 };
diff --git a/libavcodec/tiff.h b/libavcodec/tiff.h
index 68ac695..3f692af 100644
--- a/libavcodec/tiff.h
+++ b/libavcodec/tiff.h
@@ -1,27 +1,29 @@
 /*
- * TIFF tables
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
  * TIFF tables
+ *
+ * For more information about the TIFF format, check the official docs at:
+ * http://partners.adobe.com/public/developer/tiff/index.html
  * @author Konstantin Shishkov
  */
 
@@ -29,6 +31,7 @@
 #define AVCODEC_TIFF_H
 
 #include <stdint.h>
+#include "tiff_common.h"
 
 /** abridged list of TIFF tags */
 enum TiffTags {
@@ -39,6 +42,10 @@ enum TiffTags {
     TIFF_COMPR,
     TIFF_PHOTOMETRIC        = 0x106,
     TIFF_FILL_ORDER         = 0x10A,
+    TIFF_DOCUMENT_NAME      = 0x10D,
+    TIFF_IMAGE_DESCRIPTION  = 0x10E,
+    TIFF_MAKE               = 0x10F,
+    TIFF_MODEL              = 0x110,
     TIFF_STRIP_OFFS         = 0x111,
     TIFF_SAMPLES_PER_PIXEL  = 0x115,
     TIFF_ROWSPERSTRIP       = 0x116,
@@ -46,18 +53,35 @@ enum TiffTags {
     TIFF_XRES               = 0x11A,
     TIFF_YRES               = 0x11B,
     TIFF_PLANAR             = 0x11C,
+    TIFF_PAGE_NAME          = 0x11D,
     TIFF_XPOS               = 0x11E,
     TIFF_YPOS               = 0x11F,
     TIFF_T4OPTIONS          = 0x124,
     TIFF_T6OPTIONS,
     TIFF_RES_UNIT           = 0x128,
+    TIFF_PAGE_NUMBER        = 0x129,
     TIFF_SOFTWARE_NAME      = 0x131,
+    TIFF_DATE               = 0x132,
+    TIFF_ARTIST             = 0x13B,
+    TIFF_HOST_COMPUTER      = 0x13C,
     TIFF_PREDICTOR          = 0x13D,
     TIFF_PAL                = 0x140,
+    TIFF_TILE_WIDTH         = 0x142,
+    TIFF_TILE_LENGTH        = 0x143,
+    TIFF_TILE_OFFSETS       = 0x144,
+    TIFF_TILE_BYTE_COUNTS   = 0x145,
+    TIFF_EXTRASAMPLES       = 0x152,
     TIFF_YCBCR_COEFFICIENTS = 0x211,
     TIFF_YCBCR_SUBSAMPLING  = 0x212,
     TIFF_YCBCR_POSITIONING  = 0x213,
     TIFF_REFERENCE_BW       = 0x214,
+    TIFF_COPYRIGHT          = 0x8298,
+    TIFF_MODEL_TIEPOINT     = 0x8482,
+    TIFF_MODEL_PIXEL_SCALE  = 0x830E,
+    TIFF_MODEL_TRANSFORMATION= 0x8480,
+    TIFF_GEO_KEY_DIRECTORY  = 0x87AF,
+    TIFF_GEO_DOUBLE_PARAMS  = 0x87B0,
+    TIFF_GEO_ASCII_PARAMS   = 0x87B1
 };
 
 /** list of TIFF compression types */
@@ -75,12 +99,52 @@ enum TiffCompr {
     TIFF_LZMA     = 0x886D,
 };
 
-enum TiffTypes {
-    TIFF_BYTE = 1,
-    TIFF_STRING,
-    TIFF_SHORT,
-    TIFF_LONG,
-    TIFF_RATIONAL,
+enum TiffGeoTagKey {
+    TIFF_GT_MODEL_TYPE_GEOKEY                = 1024,
+    TIFF_GT_RASTER_TYPE_GEOKEY               = 1025,
+    TIFF_GT_CITATION_GEOKEY                  = 1026,
+    TIFF_GEOGRAPHIC_TYPE_GEOKEY              = 2048,
+    TIFF_GEOG_CITATION_GEOKEY                = 2049,
+    TIFF_GEOG_GEODETIC_DATUM_GEOKEY          = 2050,
+    TIFF_GEOG_PRIME_MERIDIAN_GEOKEY          = 2051,
+    TIFF_GEOG_LINEAR_UNITS_GEOKEY            = 2052,
+    TIFF_GEOG_LINEAR_UNIT_SIZE_GEOKEY        = 2053,
+    TIFF_GEOG_ANGULAR_UNITS_GEOKEY           = 2054,
+    TIFF_GEOG_ANGULAR_UNIT_SIZE_GEOKEY       = 2055,
+    TIFF_GEOG_ELLIPSOID_GEOKEY               = 2056,
+    TIFF_GEOG_SEMI_MAJOR_AXIS_GEOKEY         = 2057,
+    TIFF_GEOG_SEMI_MINOR_AXIS_GEOKEY         = 2058,
+    TIFF_GEOG_INV_FLATTENING_GEOKEY          = 2059,
+    TIFF_GEOG_AZIMUTH_UNITS_GEOKEY           = 2060,
+    TIFF_GEOG_PRIME_MERIDIAN_LONG_GEOKEY     = 2061,
+    TIFF_PROJECTED_CS_TYPE_GEOKEY            = 3072,
+    TIFF_PCS_CITATION_GEOKEY                 = 3073,
+    TIFF_PROJECTION_GEOKEY                   = 3074,
+    TIFF_PROJ_COORD_TRANS_GEOKEY             = 3075,
+    TIFF_PROJ_LINEAR_UNITS_GEOKEY            = 3076,
+    TIFF_PROJ_LINEAR_UNIT_SIZE_GEOKEY        = 3077,
+    TIFF_PROJ_STD_PARALLEL1_GEOKEY           = 3078,
+    TIFF_PROJ_STD_PARALLEL2_GEOKEY           = 3079,
+    TIFF_PROJ_NAT_ORIGIN_LONG_GEOKEY         = 3080,
+    TIFF_PROJ_NAT_ORIGIN_LAT_GEOKEY          = 3081,
+    TIFF_PROJ_FALSE_EASTING_GEOKEY           = 3082,
+    TIFF_PROJ_FALSE_NORTHING_GEOKEY          = 3083,
+    TIFF_PROJ_FALSE_ORIGIN_LONG_GEOKEY       = 3084,
+    TIFF_PROJ_FALSE_ORIGIN_LAT_GEOKEY        = 3085,
+    TIFF_PROJ_FALSE_ORIGIN_EASTING_GEOKEY    = 3086,
+    TIFF_PROJ_FALSE_ORIGIN_NORTHING_GEOKEY   = 3087,
+    TIFF_PROJ_CENTER_LONG_GEOKEY             = 3088,
+    TIFF_PROJ_CENTER_LAT_GEOKEY              = 3089,
+    TIFF_PROJ_CENTER_EASTING_GEOKEY          = 3090,
+    TIFF_PROJ_CENTER_NORTHING_GEOKEY         = 3091,
+    TIFF_PROJ_SCALE_AT_NAT_ORIGIN_GEOKEY     = 3092,
+    TIFF_PROJ_SCALE_AT_CENTER_GEOKEY         = 3093,
+    TIFF_PROJ_AZIMUTH_ANGLE_GEOKEY           = 3094,
+    TIFF_PROJ_STRAIGHT_VERT_POLE_LONG_GEOKEY = 3095,
+    TIFF_VERTICAL_CS_TYPE_GEOKEY             = 4096,
+    TIFF_VERTICAL_CITATION_GEOKEY            = 4097,
+    TIFF_VERTICAL_DATUM_GEOKEY               = 4098,
+    TIFF_VERTICAL_UNITS_GEOKEY               = 4099
 };
 
 enum TiffPhotometric {
@@ -101,9 +165,28 @@ enum TiffPhotometric {
     TIFF_PHOTOMETRIC_LINEAR_RAW = 34892, /* Linear Raw (DNG) */
 };
 
-/** sizes of various TIFF field types (string size = 100)*/
-static const uint8_t type_sizes[6] = {
-    0, 1, 100, 2, 4, 8
+enum TiffGeoTagType {
+    GEOTIFF_SHORT  = 0,
+    GEOTIFF_DOUBLE = 34736,
+    GEOTIFF_STRING = 34737
 };
 
+typedef struct TiffGeoTag {
+    enum TiffGeoTagKey key;
+    enum TiffTags type;
+    int count;
+    int offset;
+    char *val;
+} TiffGeoTag;
+
+typedef struct TiffGeoTagKeyName {
+    const enum TiffGeoTagKey key;
+    const char *const name;
+} TiffGeoTagKeyName;
+
+typedef struct TiffGeoTagNameType {
+    const char *const name;
+    const enum TiffGeoTagType type;
+} TiffGeoTagNameType;
+
 #endif /* AVCODEC_TIFF_H */
diff --git a/libavcodec/tiff_common.c b/libavcodec/tiff_common.c
new file mode 100644
index 0000000..35119af
--- /dev/null
+++ b/libavcodec/tiff_common.c
@@ -0,0 +1,313 @@
+/*
+ * TIFF Common Routines
+ * Copyright (c) 2013 Thilo Borgmann <thilo.borgmann _at_ mail.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * TIFF Common Routines
+ * @author Thilo Borgmann <thilo.borgmann _at_ mail.de>
+ */
+
+#include "tiff_common.h"
+
+
+int ff_tis_ifd(unsigned tag)
+{
+    /* 1-based position of tag within ifd_tags[]; 0 when it is not listed */
+    int pos;
+    for (pos = 1; pos <= FF_ARRAY_ELEMS(ifd_tags); pos++)
+        if (ifd_tags[pos - 1] == tag)
+            return pos;
+
+    return 0;
+}
+
+/* Fetch a 16-bit value from gb: little endian if le is nonzero, big endian otherwise. */
+unsigned ff_tget_short(GetByteContext *gb, int le)
+{
+    return le ? bytestream2_get_le16(gb) : bytestream2_get_be16(gb);
+}
+
+/* Fetch a 32-bit value from gb: little endian if le is nonzero, big endian otherwise. */
+unsigned ff_tget_long(GetByteContext *gb, int le)
+{
+    return le ? bytestream2_get_le32(gb) : bytestream2_get_be32(gb);
+}
+
+/* Fetch 64 bits from gb in the byte order selected by le and return them reinterpreted as a double. */
+double ff_tget_double(GetByteContext *gb, int le)
+{
+    av_alias64 i = { .u64 = le ? bytestream2_get_le64(gb) : bytestream2_get_be64(gb)};
+    return i.f64;
+}
+
+
+unsigned ff_tget(GetByteContext *gb, int type, int le)
+{
+    /* only BYTE, SHORT and LONG can be read; any other type yields UINT_MAX */
+    if (type == TIFF_BYTE)
+        return bytestream2_get_byte(gb);
+    if (type == TIFF_SHORT)
+        return ff_tget_short(gb, le);
+    return type == TIFF_LONG ? ff_tget_long(gb, le) : UINT_MAX;
+}
+
+static const char *auto_sep(int count, const char *sep, int i, int columns)
+{
+    if (sep) /* explicit separator: placed between elements only */
+        return i ? sep : "";
+    /* automatic layout: "\n" opens each row (only when count exceeds one row), ", " joins within a row */
+    if (!i || i % columns == 0)
+        return columns < count ? "\n" : "";
+    return ", ";
+}
+
+int ff_tadd_rational_metadata(int count, const char *name, const char *sep,
+                              GetByteContext *gb, int le, AVDictionary **metadata)
+{
+    /* Format count rationals read from gb as "num:denom" text and store it
+     * in *metadata under name; returns 0 or a negative AVERROR code. */
+    AVBPrint bp;
+    char *ap;
+    int32_t nom, denom;
+    int i;
+
+    /* 8 bytes per rational: refuse counts that overflow or exceed gb */
+    if (count >= INT_MAX / sizeof(int64_t) || count <= 0)
+        return AVERROR_INVALIDDATA;
+    if (bytestream2_get_bytes_left(gb) < count * sizeof(int64_t))
+        return AVERROR_INVALIDDATA;
+
+    av_bprint_init(&bp, 10 * count, AV_BPRINT_SIZE_UNLIMITED);
+    for (i = 0; i < count; i++) {
+        nom   = ff_tget_long(gb, le);
+        denom = ff_tget_long(gb, le);
+        av_bprintf(&bp, "%s%7i:%-7i", auto_sep(count, sep, i, 4), nom, denom);
+    }
+
+    if ((i = av_bprint_finalize(&bp, &ap)))
+        return i;
+    if (!ap)
+        return AVERROR(ENOMEM);
+
+    /* AV_DICT_DONT_STRDUP_VAL hands ap to the dictionary; report a failed insert */
+    i = av_dict_set(metadata, name, ap, AV_DICT_DONT_STRDUP_VAL);
+    return i < 0 ? i : 0;
+}
+
+
+int ff_tadd_long_metadata(int count, const char *name, const char *sep,
+                          GetByteContext *gb, int le, AVDictionary **metadata)
+{
+    /* Format count 32-bit values read from gb as text and store it in
+     * *metadata under name; returns 0 or a negative AVERROR code. */
+    AVBPrint bp;
+    char *ap;
+    int i;
+
+    /* 4 bytes per value: refuse counts that overflow or exceed gb */
+    if (count >= INT_MAX / sizeof(int32_t) || count <= 0)
+        return AVERROR_INVALIDDATA;
+    if (bytestream2_get_bytes_left(gb) < count * sizeof(int32_t))
+        return AVERROR_INVALIDDATA;
+
+    av_bprint_init(&bp, 10 * count, AV_BPRINT_SIZE_UNLIMITED);
+    for (i = 0; i < count; i++) {
+        av_bprintf(&bp, "%s%7i", auto_sep(count, sep, i, 8), ff_tget_long(gb, le));
+    }
+
+    if ((i = av_bprint_finalize(&bp, &ap)))
+        return i;
+    if (!ap)
+        return AVERROR(ENOMEM);
+
+    /* AV_DICT_DONT_STRDUP_VAL hands ap to the dictionary; report a failed insert */
+    i = av_dict_set(metadata, name, ap, AV_DICT_DONT_STRDUP_VAL);
+    return i < 0 ? i : 0;
+}
+
+
+int ff_tadd_doubles_metadata(int count, const char *name, const char *sep,
+                             GetByteContext *gb, int le, AVDictionary **metadata)
+{
+    /* Format count doubles read from gb as "%.15g" text and store it in
+     * *metadata under name; returns 0 or a negative AVERROR code. */
+    AVBPrint bp;
+    char *ap;
+    int i;
+
+    /* 8 bytes per double: refuse counts that overflow or exceed gb */
+    if (count >= INT_MAX / sizeof(int64_t) || count <= 0)
+        return AVERROR_INVALIDDATA;
+    if (bytestream2_get_bytes_left(gb) < count * sizeof(int64_t))
+        return AVERROR_INVALIDDATA;
+
+    av_bprint_init(&bp, 10 * count, 100 * count);
+    for (i = 0; i < count; i++) {
+        av_bprintf(&bp, "%s%.15g", auto_sep(count, sep, i, 4), ff_tget_double(gb, le));
+    }
+
+    if ((i = av_bprint_finalize(&bp, &ap)))
+        return i;
+    if (!ap)
+        return AVERROR(ENOMEM);
+
+    /* AV_DICT_DONT_STRDUP_VAL hands ap to the dictionary; report a failed insert */
+    i = av_dict_set(metadata, name, ap, AV_DICT_DONT_STRDUP_VAL);
+    return i < 0 ? i : 0;
+}
+
+
+int ff_tadd_shorts_metadata(int count, const char *name, const char *sep,
+                            GetByteContext *gb, int le, int is_signed, AVDictionary **metadata)
+{
+    /* Format count 16-bit values read from gb as text and store it in
+     * *metadata under name; returns 0 or a negative AVERROR code. */
+    AVBPrint bp;
+    char *ap;
+    int i;
+
+    /* 2 bytes per value: refuse counts that overflow or exceed gb */
+    if (count >= INT_MAX / sizeof(int16_t) || count <= 0)
+        return AVERROR_INVALIDDATA;
+    if (bytestream2_get_bytes_left(gb) < count * sizeof(int16_t))
+        return AVERROR_INVALIDDATA;
+
+    av_bprint_init(&bp, 10 * count, AV_BPRINT_SIZE_UNLIMITED);
+    for (i = 0; i < count; i++) {
+        int v = is_signed ? (int16_t)ff_tget_short(gb, le) :  ff_tget_short(gb, le);
+        av_bprintf(&bp, "%s%5i", auto_sep(count, sep, i, 8), v);
+    }
+
+    if ((i = av_bprint_finalize(&bp, &ap)))
+        return i;
+    if (!ap)
+        return AVERROR(ENOMEM);
+
+    /* AV_DICT_DONT_STRDUP_VAL hands ap to the dictionary; report a failed insert */
+    i = av_dict_set(metadata, name, ap, AV_DICT_DONT_STRDUP_VAL);
+    return i < 0 ? i : 0;
+}
+
+
+int ff_tadd_bytes_metadata(int count, const char *name, const char *sep,
+                           GetByteContext *gb, int le, int is_signed, AVDictionary **metadata)
+{
+    /* Format count bytes read from gb as decimal text and store it in
+     * *metadata under name; returns 0 or a negative AVERROR code. */
+    AVBPrint bp;
+    char *ap;
+    int i;
+
+    /* unlike the wider variants, a count of zero is accepted here */
+    if (count >= INT_MAX / sizeof(int8_t) || count < 0)
+        return AVERROR_INVALIDDATA;
+    if (bytestream2_get_bytes_left(gb) < count * sizeof(int8_t))
+        return AVERROR_INVALIDDATA;
+
+    av_bprint_init(&bp, 10 * count, AV_BPRINT_SIZE_UNLIMITED);
+    for (i = 0; i < count; i++) {
+        int v = is_signed ? (int8_t)bytestream2_get_byte(gb) :  bytestream2_get_byte(gb);
+        av_bprintf(&bp, "%s%3i", auto_sep(count, sep, i, 16), v);
+    }
+
+    if ((i = av_bprint_finalize(&bp, &ap)))
+        return i;
+    if (!ap)
+        return AVERROR(ENOMEM);
+
+    /* AV_DICT_DONT_STRDUP_VAL hands ap to the dictionary; report a failed insert */
+    i = av_dict_set(metadata, name, ap, AV_DICT_DONT_STRDUP_VAL);
+    return i < 0 ? i : 0;
+}
+
+int ff_tadd_string_metadata(int count, const char *name,
+                            GetByteContext *gb, int le, AVDictionary **metadata)
+{
+    /* Copy count bytes from gb into a NUL-terminated string and store it in
+     * *metadata under name; returns 0 or a negative AVERROR code. */
+    char *value;
+    int ret;
+
+    if (bytestream2_get_bytes_left(gb) < count || count < 0)
+        return AVERROR_INVALIDDATA;
+    if (!(value = av_malloc(count + 1)))
+        return AVERROR(ENOMEM);
+    bytestream2_get_bufferu(gb, value, count);
+    value[count] = 0;
+    /* AV_DICT_DONT_STRDUP_VAL hands value to the dictionary; report a failed insert */
+    ret = av_dict_set(metadata, name, value, AV_DICT_DONT_STRDUP_VAL);
+    return ret < 0 ? ret : 0;
+}
+
+
+int ff_tdecode_header(GetByteContext *gb, int *le, int *ifd_offset)
+{
+    /* Parse the 8-byte TIFF header (byte-order mark, magic 42, IFD offset);
+     * returns 0 on success and AVERROR_INVALIDDATA otherwise. */
+    if (bytestream2_get_bytes_left(gb) < 8)
+        return AVERROR_INVALIDDATA;
+
+    /* "II" selects little endian, "MM" big endian; each mark is one byte
+     * repeated, so it reads the same whichever 16-bit order is used */
+    *le = bytestream2_get_le16u(gb);
+    if (*le != AV_RB16("II") && *le != AV_RB16("MM"))
+        return AVERROR_INVALIDDATA;
+    *le = *le == AV_RB16("II");
+
+    /* magic number, stored in the byte order just selected */
+    if (ff_tget_short(gb, *le) != 42)
+        return AVERROR_INVALIDDATA;
+
+    /* the last four header bytes go to *ifd_offset */
+    *ifd_offset = ff_tget_long(gb, *le);
+
+    return 0;
+}
+
+/* Read one IFD entry (tag, type, count); leaves gb at its value: the inline 4-byte field or the offset it names. */
+int ff_tread_tag(GetByteContext *gb, int le, unsigned *tag, unsigned *type,
+                 unsigned *count, int *next)
+{
+    int ifd_tag;
+    int valid_type;
+
+    *tag    = ff_tget_short(gb, le);
+    *type   = ff_tget_short(gb, le);
+    *count  = ff_tget_long (gb, le);
+
+    ifd_tag    = ff_tis_ifd(*tag);
+    valid_type = *type != 0 && *type < FF_ARRAY_ELEMS(type_sizes); /* type must index type_sizes[] */
+
+    *next = bytestream2_tell(gb) + 4; /* position just past the 4-byte value/offset field */
+
+    // check for valid type
+    if (!valid_type) {
+        return AVERROR_INVALIDDATA; /* *next has already been set at this point */
+    }
+
+    // seek to offset if this is an IFD-tag or
+    // if count values do not fit into the offset value
+    if (ifd_tag || (*count > 4 || !(type_sizes[*type] * (*count) <= 4 || *type == TIFF_STRING))) {
+        bytestream2_seek(gb, ff_tget_long (gb, le), SEEK_SET);
+    }
+
+    return 0;
+}
diff --git a/libavcodec/tiff_common.h b/libavcodec/tiff_common.h
new file mode 100644
index 0000000..03558c3
--- /dev/null
+++ b/libavcodec/tiff_common.h
@@ -0,0 +1,152 @@
+/*
+ * TIFF Common Routines
+ * Copyright (c) 2013 Thilo Borgmann <thilo.borgmann _at_ mail.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * TIFF Common Routines
+ * @author Thilo Borgmann <thilo.borgmann _at_ mail.de>
+ */
+
+#ifndef AVCODEC_TIFF_COMMON_H
+#define AVCODEC_TIFF_COMMON_H
+
+#include "avcodec.h"
+#include "tiff.h"
+#include "bytestream.h"
+#include "libavutil/bprint.h"
+
+/** data type identifiers for TIFF tags; the values also index type_sizes[] */
+enum TiffTypes {
+    TIFF_BYTE = 1,  ///< 1 (ff_tread_tag() rejects type 0)
+    TIFF_STRING,    ///< 2
+    TIFF_SHORT,     ///< 3
+    TIFF_LONG,      ///< 4
+    TIFF_RATIONAL,  ///< 5
+    TIFF_SBYTE,     ///< 6
+    TIFF_UNDEFINED, ///< 7
+    TIFF_SSHORT,    ///< 8
+    TIFF_SLONG,     ///< 9
+    TIFF_SRATIONAL, ///< 10
+    TIFF_FLOAT,     ///< 11
+    TIFF_DOUBLE,    ///< 12
+    TIFF_IFD        ///< 13
+};
+
+/** sizes in bytes of the TIFF field types (string size = 100), indexed by enum TiffTypes value */
+static const uint8_t type_sizes[14] = {
+    0, 1, 100, 2, 4, 8, 1, 1, 2, 4, 8, 4, 8, 4 // index 0 is a placeholder: type ids start at 1
+};
+
+static const uint16_t ifd_tags[] = { // tags that ff_tis_ifd() reports as IFD-tags (result = index + 1)
+    0x8769, // EXIF IFD
+    0x8825, // GPS IFD
+    0xA005  // Interoperability IFD
+};
+
+
+/** Returns a value > 0 if the tag is a known IFD-tag.
+ *  The return value is the array index + 1 within ifd_tags[].
+ */
+int ff_tis_ifd(unsigned tag);
+
+/** Reads a short from the bytestream using given endianness (le as set by ff_tdecode_header()). */
+unsigned ff_tget_short(GetByteContext *gb, int le);
+
+/** Reads a long from the bytestream using given endianness. */
+unsigned ff_tget_long(GetByteContext *gb, int le);
+
+/** Reads a double from the bytestream using given endianness. */
+double   ff_tget_double(GetByteContext *gb, int le);
+
+/** Reads a value of the given type (presumably an enum TiffTypes id - confirm) using given endianness. */
+unsigned ff_tget(GetByteContext *gb, int type, int le);
+
+/** Returns an allocated string containing count rational values using the
+ *  given separator. NOTE(review): presumably the caller owns the result - confirm.
+ */
+char *ff_trationals2str(int *rp, int count, const char *sep);
+
+/** Returns an allocated string containing count
+ *  long values using the given separator.
+ */
+char *ff_tlongs2str(int32_t *lp, int count, const char *sep);
+
+/** Returns an allocated string containing count
+ *  double values using the given separator.
+ */
+char *ff_tdoubles2str(double *dp, int count, const char *sep);
+
+/** Returns an allocated string containing count
+ *  short values using the given separator.
+ */
+char *ff_tshorts2str(int16_t *sp, int count, const char *sep);
+
+/** Adds count rationals converted to a string
+ *  into the metadata dictionary.
+ */
+int ff_tadd_rational_metadata(int count, const char *name, const char *sep,
+                              GetByteContext *gb, int le, AVDictionary **metadata);
+
+/** Adds count longs converted to a string
+ *  into the metadata dictionary.
+ */
+int ff_tadd_long_metadata(int count, const char *name, const char *sep,
+                          GetByteContext *gb, int le, AVDictionary **metadata);
+
+/** Adds count doubles converted to a string
+ *  into the metadata dictionary.
+ */
+int ff_tadd_doubles_metadata(int count, const char *name, const char *sep,
+                             GetByteContext *gb, int le, AVDictionary **metadata);
+
+/** Adds count shorts converted to a string into the metadata dictionary.
+ *  NOTE(review): is_signed presumably selects signed interpretation - confirm.
+ */
+int ff_tadd_shorts_metadata(int count, const char *name, const char *sep,
+                            GetByteContext *gb, int le, int is_signed, AVDictionary **metadata);
+
+/** Adds count bytes converted to a string into the metadata dictionary.
+ *  NOTE(review): is_signed presumably selects signed interpretation - confirm.
+ */
+int ff_tadd_bytes_metadata(int count, const char *name, const char *sep,
+                           GetByteContext *gb, int le, int is_signed, AVDictionary **metadata);
+
+/** Adds a string of count characters into the metadata dictionary under name.
+ *  Returns 0 on success, AVERROR_INVALIDDATA if count is negative or exceeds
+ *  the bytes left in gb, and AVERROR(ENOMEM) if allocation fails. */
+int ff_tadd_string_metadata(int count, const char *name,
+                            GetByteContext *gb, int le, AVDictionary **metadata);
+
+/** Decodes a TIFF header from the input bytestream and sets the endianness
+ *  in *le (1 for "II", 0 for "MM") and the offset to the first IFD in
+ *  *ifd_offset accordingly. Returns 0 on success, AVERROR_INVALIDDATA if fewer
+ *  than 8 bytes are left or the byte-order mark or magic number is wrong. */
+int ff_tdecode_header(GetByteContext *gb, int *le, int *ifd_offset);
+
+/** Reads the first 3 fields of a TIFF tag: the tag id, the tag type and the
+ *  count of values for that tag. Afterwards the bytestream is located at the
+ *  first value to read and *next holds the bytestream offset of the following
+ *  tag. Returns 0 on success or AVERROR_INVALIDDATA for an unknown tag type.
+ */
+int ff_tread_tag(GetByteContext *gb, int le, unsigned *tag, unsigned *type,
+                 unsigned *count, int *next);
+
+#endif /* AVCODEC_TIFF_COMMON_H */
diff --git a/libavcodec/tiff_data.c b/libavcodec/tiff_data.c
new file mode 100644
index 0000000..88c2256
--- /dev/null
+++ b/libavcodec/tiff_data.c
@@ -0,0 +1,1870 @@
+/*
+ * TIFF data tables
+ * Copyright (c) 2011 Thomas Kuehnel
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * TIFF data tables
+ * @author Thomas Kuehnel
+ * @see GeoTIFF specification at
+ * http://www.remotesensing.org/geotiff/spec/geotiffhome.html
+ */
+
+#include "tiff_data.h"
+
+const TiffGeoTagNameType ff_tiff_conf_name_type_map[] = { // GeoKey name + value type; order presumably follows key ids - confirm
+    {"GTModelTypeGeoKey",              GEOTIFF_SHORT },
+    {"GTRasterTypeGeoKey",             GEOTIFF_SHORT },
+    {"GTCitationGeoKey",               GEOTIFF_STRING}
+};
+
+const TiffGeoTagNameType ff_tiff_geog_name_type_map[] = { // GeoKey name + value type; order presumably follows key ids - confirm
+    {"GeographicTypeGeoKey",           GEOTIFF_SHORT },
+    {"GeogCitationGeoKey",             GEOTIFF_STRING},
+    {"GeogGeodeticDatumGeoKey",        GEOTIFF_SHORT },
+    {"GeogPrimeMeridianGeoKey",        GEOTIFF_SHORT },
+    {"GeogLinearUnitsGeoKey",          GEOTIFF_SHORT },
+    {"GeogLinearUnitSizeGeoKey",       GEOTIFF_DOUBLE},
+    {"GeogAngularUnitsGeoKey",         GEOTIFF_SHORT },
+    {"GeogAngularUnitSizeGeoKey",      GEOTIFF_DOUBLE},
+    {"GeogEllipsoidGeoKey",            GEOTIFF_SHORT },
+    {"GeogSemiMajorAxisGeoKey",        GEOTIFF_DOUBLE},
+    {"GeogSemiMinorAxisGeoKey",        GEOTIFF_DOUBLE},
+    {"GeogInvFlatteningGeoKey",        GEOTIFF_DOUBLE},
+    {"GeogAzimuthUnitsGeoKey",         GEOTIFF_SHORT },
+    {"GeogPrimeMeridianLongGeoKey",    GEOTIFF_DOUBLE}
+};
+
+const TiffGeoTagNameType ff_tiff_proj_name_type_map[] = { // GeoKey name + value type; order presumably follows key ids - confirm
+    {"ProjectedCSTypeGeoKey",          GEOTIFF_SHORT },
+    {"PCSCitationGeoKey",              GEOTIFF_STRING},
+    {"ProjectionGeoKey",               GEOTIFF_SHORT },
+    {"ProjCoordTransGeoKey",           GEOTIFF_SHORT },
+    {"ProjLinearUnitsGeoKey",          GEOTIFF_SHORT },
+    {"ProjLinearUnitSizeGeoKey",       GEOTIFF_DOUBLE},
+    {"ProjStdParallel1GeoKey",         GEOTIFF_DOUBLE},
+    {"ProjStdParallel2GeoKey",         GEOTIFF_DOUBLE},
+    {"ProjNatOriginLongGeoKey",        GEOTIFF_DOUBLE},
+    {"ProjNatOriginLatGeoKey",         GEOTIFF_DOUBLE},
+    {"ProjFalseEastingGeoKey",         GEOTIFF_DOUBLE},
+    {"ProjFalseNorthingGeoKey",        GEOTIFF_DOUBLE},
+    {"ProjFalseOriginLongGeoKey",      GEOTIFF_DOUBLE},
+    {"ProjFalseOriginLatGeoKey",       GEOTIFF_DOUBLE},
+    {"ProjFalseOriginEastingGeoKey",   GEOTIFF_DOUBLE},
+    {"ProjFalseOriginNorthingGeoKey",  GEOTIFF_DOUBLE},
+    {"ProjCenterLongGeoKey",           GEOTIFF_DOUBLE},
+    {"ProjCenterLatGeoKey",            GEOTIFF_DOUBLE},
+    {"ProjCenterEastingGeoKey",        GEOTIFF_DOUBLE},
+    {"ProjCenterNorthingGeoKey",       GEOTIFF_DOUBLE},
+    {"ProjScaleAtNatOriginGeoKey",     GEOTIFF_DOUBLE},
+    {"ProjScaleAtCenterGeoKey",        GEOTIFF_DOUBLE},
+    {"ProjAzimuthAngleGeoKey",         GEOTIFF_DOUBLE},
+    {"ProjStraightVertPoleLongGeoKey", GEOTIFF_DOUBLE}
+};
+
+const TiffGeoTagNameType ff_tiff_vert_name_type_map[] = { // GeoKey name + value type; order presumably follows key ids - confirm
+    {"VerticalCSTypeGeoKey",           GEOTIFF_SHORT },
+    {"VerticalCitationGeoKey",         GEOTIFF_STRING},
+    {"VerticalDatumGeoKey",            GEOTIFF_SHORT },
+    {"VerticalUnitsGeoKey",            GEOTIFF_SHORT }
+};
+
+const char *const ff_tiff_gt_model_type_codes[] = { // model type names; position presumably maps to the code value - confirm
+    "ModelTypeProjected",
+    "ModelTypeGeographic",
+    "ModelTypeGeocentric"
+};
+
+const char *const ff_tiff_gt_raster_type_codes[] = { // raster type names; position presumably maps to the code value - confirm
+    "RasterPixelIsArea",
+    "RasterPixelIsPoint"
+};
+
+const char *const ff_tiff_linear_unit_codes[] = { // linear unit names; position presumably maps to the code value - confirm
+    "Linear_Meter",
+    "Linear_Foot",
+    "Linear_Foot_US_Survey",
+    "Linear_Foot_Modified_American",
+    "Linear_Foot_Clarke",
+    "Linear_Foot_Indian",
+    "Linear_Link",
+    "Linear_Link_Benoit",
+    "Linear_Link_Sears",
+    "Linear_Chain_Benoit",
+    "Linear_Chain_Sears",
+    "Linear_Yard_Sears",
+    "Linear_Yard_Indian",
+    "Linear_Fathom",
+    "Linear_Mile_International_Nautical"
+};
+
+const char *const ff_tiff_angular_unit_codes[] = { // angular unit names; position presumably maps to the code value - confirm
+    "Angular_Radian",
+    "Angular_Degree",
+    "Angular_Arc_Minute",
+    "Angular_Arc_Second",
+    "Angular_Grad",
+    "Angular_Gon",
+    "Angular_DMS",
+    "Angular_DMS_Hemisphere"
+};
+
+const char *const ff_tiff_gcs_type_codes[] = { // GCS names; position presumably maps to the code value - confirm
+    "GCS_Adindan",
+    "GCS_AGD66",
+    "GCS_AGD84",
+    "GCS_Ain_el_Abd",
+    "GCS_Afgooye",
+    "GCS_Agadez",
+    "GCS_Lisbon",
+    "GCS_Aratu",
+    "GCS_Arc_1950",
+    "GCS_Arc_1960",
+    "GCS_Batavia",
+    "GCS_Barbados",
+    "GCS_Beduaram",
+    "GCS_Beijing_1954",
+    "GCS_Belge_1950",
+    "GCS_Bermuda_1957",
+    "GCS_Bern_1898",
+    "GCS_Bogota",
+    "GCS_Bukit_Rimpah",
+    "GCS_Camacupa",
+    "GCS_Campo_Inchauspe",
+    "GCS_Cape",
+    "GCS_Carthage",
+    "GCS_Chua",
+    "GCS_Corrego_Alegre",
+    "GCS_Cote_d_Ivoire",
+    "GCS_Deir_ez_Zor",
+    "GCS_Douala",
+    "GCS_Egypt_1907",
+    "GCS_ED50",
+    "GCS_ED87",
+    "GCS_Fahud",
+    "GCS_Gandajika_1970",
+    "GCS_Garoua",
+    "GCS_Guyane_Francaise",
+    "GCS_Hu_Tzu_Shan",
+    "GCS_HD72",
+    "GCS_ID74",
+    "GCS_Indian_1954",
+    "GCS_Indian_1975",
+    "GCS_Jamaica_1875",
+    "GCS_JAD69",
+    "GCS_Kalianpur",
+    "GCS_Kandawala",
+    "GCS_Kertau",
+    "GCS_KOC",
+    "GCS_La_Canoa",
+    "GCS_PSAD56",
+    "GCS_Lake",
+    "GCS_Leigon",
+    "GCS_Liberia_1964",
+    "GCS_Lome",
+    "GCS_Luzon_1911",
+    "GCS_Hito_XVIII_1963",
+    "GCS_Herat_North",
+    "GCS_Mahe_1971",
+    "GCS_Makassar",
+    "GCS_EUREF89",
+    "GCS_Malongo_1987",
+    "GCS_Manoca",
+    "GCS_Merchich",
+    "GCS_Massawa",
+    "GCS_Minna",
+    "GCS_Mhast",
+    "GCS_Monte_Mario",
+    "GCS_M_poraloko",
+    "GCS_NAD27",
+    "GCS_NAD_Michigan",
+    "GCS_NAD83",
+    "GCS_Nahrwan_1967",
+    "GCS_Naparima_1972",
+    "GCS_GD49",
+    "GCS_NGO_1948",
+    "GCS_Datum_73",
+    "GCS_NTF",
+    "GCS_NSWC_9Z_2",
+    "GCS_OSGB_1936",
+    "GCS_OSGB70",
+    "GCS_OS_SN80",
+    "GCS_Padang",
+    "GCS_Palestine_1923",
+    "GCS_Pointe_Noire",
+    "GCS_GDA94",
+    "GCS_Pulkovo_1942",
+    "GCS_Qatar",
+    "GCS_Qatar_1948",
+    "GCS_Qornoq",
+    "GCS_Loma_Quintana",
+    "GCS_Amersfoort",
+    "GCS_RT38",
+    "GCS_SAD69",
+    "GCS_Sapper_Hill_1943",
+    "GCS_Schwarzeck",
+    "GCS_Segora",
+    "GCS_Serindung",
+    "GCS_Sudan",
+    "GCS_Tananarive",
+    "GCS_Timbalai_1948",
+    "GCS_TM65",
+    "GCS_TM75",
+    "GCS_Tokyo",
+    "GCS_Trinidad_1903",
+    "GCS_TC_1948",
+    "GCS_Voirol_1875",
+    "GCS_Voirol_Unifie",
+    "GCS_Bern_1938",
+    "GCS_Nord_Sahara_1959",
+    "GCS_Stockholm_1938",
+    "GCS_Yacare",
+    "GCS_Yoff",
+    "GCS_Zanderij",
+    "GCS_MGI",
+    "GCS_Belge_1972",
+    "GCS_DHDN",
+    "GCS_Conakry_1905",
+    "GCS_WGS_72",
+    "GCS_WGS_72BE",
+    "GCS_WGS_84",
+    "GCS_Bern_1898_Bern",
+    "GCS_Bogota_Bogota",
+    "GCS_Lisbon_Lisbon",
+    "GCS_Makassar_Jakarta",
+    "GCS_MGI_Ferro",
+    "GCS_Monte_Mario_Rome",
+    "GCS_NTF_Paris",
+    "GCS_Padang_Jakarta",
+    "GCS_Belge_1950_Brussels",
+    "GCS_Tananarive_Paris",
+    "GCS_Voirol_1875_Paris",
+    "GCS_Voirol_Unifie_Paris",
+    "GCS_Batavia_Jakarta",
+    "GCS_ATF_Paris",
+    "GCS_NDG_Paris"
+};
+
+const char *const ff_tiff_gcse_type_codes[] = { // GCSE names; position presumably maps to the code value - confirm
+    "GCSE_Airy1830",
+    "GCSE_AiryModified1849",
+    "GCSE_AustralianNationalSpheroid",
+    "GCSE_Bessel1841",
+    "GCSE_BesselModified",
+    "GCSE_BesselNamibia",
+    "GCSE_Clarke1858",
+    "GCSE_Clarke1866",
+    "GCSE_Clarke1866Michigan",
+    "GCSE_Clarke1880_Benoit",
+    "GCSE_Clarke1880_IGN",
+    "GCSE_Clarke1880_RGS",
+    "GCSE_Clarke1880_Arc",
+    "GCSE_Clarke1880_SGA1922",
+    "GCSE_Everest1830_1937Adjustment",
+    "GCSE_Everest1830_1967Definition",
+    "GCSE_Everest1830_1975Definition",
+    "GCSE_Everest1830Modified",
+    "GCSE_GRS1980",
+    "GCSE_Helmert1906",
+    "GCSE_IndonesianNationalSpheroid",
+    "GCSE_International1924",
+    "GCSE_International1967",
+    "GCSE_Krassowsky1940",
+    "GCSE_NWL9D",
+    "GCSE_NWL10D",
+    "GCSE_Plessis1817",
+    "GCSE_Struve1860",
+    "GCSE_WarOffice",
+    "GCSE_WGS84",
+    "GCSE_GEM10C",
+    "GCSE_OSU86F",
+    "GCSE_OSU91A",
+    "GCSE_Clarke1880",
+    "GCSE_Sphere"
+};
+
+const char *const ff_tiff_geodetic_datum_codes[] = { // datum names; position presumably maps to the code value - confirm
+    "Datum_Adindan",
+    "Datum_Australian_Geodetic_Datum_1966",
+    "Datum_Australian_Geodetic_Datum_1984",
+    "Datum_Ain_el_Abd_1970",
+    "Datum_Afgooye",
+    "Datum_Agadez",
+    "Datum_Lisbon",
+    "Datum_Aratu",
+    "Datum_Arc_1950",
+    "Datum_Arc_1960",
+    "Datum_Batavia",
+    "Datum_Barbados",
+    "Datum_Beduaram",
+    "Datum_Beijing_1954",
+    "Datum_Reseau_National_Belge_1950",
+    "Datum_Bermuda_1957",
+    "Datum_Bern_1898",
+    "Datum_Bogota",
+    "Datum_Bukit_Rimpah",
+    "Datum_Camacupa",
+    "Datum_Campo_Inchauspe",
+    "Datum_Cape",
+    "Datum_Carthage",
+    "Datum_Chua",
+    "Datum_Corrego_Alegre",
+    "Datum_Cote_d_Ivoire",
+    "Datum_Deir_ez_Zor",
+    "Datum_Douala",
+    "Datum_Egypt_1907",
+    "Datum_European_Datum_1950",
+    "Datum_European_Datum_1987",
+    "Datum_Fahud",
+    "Datum_Gandajika_1970",
+    "Datum_Garoua",
+    "Datum_Guyane_Francaise",
+    "Datum_Hu_Tzu_Shan",
+    "Datum_Hungarian_Datum_1972",
+    "Datum_Indonesian_Datum_1974",
+    "Datum_Indian_1954",
+    "Datum_Indian_1975",
+    "Datum_Jamaica_1875",
+    "Datum_Jamaica_1969",
+    "Datum_Kalianpur",
+    "Datum_Kandawala",
+    "Datum_Kertau",
+    "Datum_Kuwait_Oil_Company",
+    "Datum_La_Canoa",
+    "Datum_Provisional_S_American_Datum_1956",
+    "Datum_Lake",
+    "Datum_Leigon",
+    "Datum_Liberia_1964",
+    "Datum_Lome",
+    "Datum_Luzon_1911",
+    "Datum_Hito_XVIII_1963",
+    "Datum_Herat_North",
+    "Datum_Mahe_1971",
+    "Datum_Makassar",
+    "Datum_European_Reference_System_1989",
+    "Datum_Malongo_1987",
+    "Datum_Manoca",
+    "Datum_Merchich",
+    "Datum_Massawa",
+    "Datum_Minna",
+    "Datum_Mhast",
+    "Datum_Monte_Mario",
+    "Datum_M_poraloko",
+    "Datum_North_American_Datum_1927",
+    "Datum_NAD_Michigan",
+    "Datum_North_American_Datum_1983",
+    "Datum_Nahrwan_1967",
+    "Datum_Naparima_1972",
+    "Datum_New_Zealand_Geodetic_Datum_1949",
+    "Datum_NGO_1948",
+    "Datum_Datum_73",
+    "Datum_Nouvelle_Triangulation_Francaise",
+    "Datum_NSWC_9Z_2",
+    "Datum_OSGB_1936",
+    "Datum_OSGB_1970_SN",
+    "Datum_OS_SN_1980",
+    "Datum_Padang_1884",
+    "Datum_Palestine_1923",
+    "Datum_Pointe_Noire",
+    "Datum_Geocentric_Datum_of_Australia_1994",
+    "Datum_Pulkovo_1942",
+    "Datum_Qatar",
+    "Datum_Qatar_1948",
+    "Datum_Qornoq",
+    "Datum_Loma_Quintana",
+    "Datum_Amersfoort",
+    "Datum_RT38",
+    "Datum_South_American_Datum_1969",
+    "Datum_Sapper_Hill_1943",
+    "Datum_Schwarzeck",
+    "Datum_Segora",
+    "Datum_Serindung",
+    "Datum_Sudan",
+    "Datum_Tananarive_1925",
+    "Datum_Timbalai_1948",
+    "Datum_TM65",
+    "Datum_TM75",
+    "Datum_Tokyo",
+    "Datum_Trinidad_1903",
+    "Datum_Trucial_Coast_1948",
+    "Datum_Voirol_1875",
+    "Datum_Voirol_Unifie_1960",
+    "Datum_Bern_1938",
+    "Datum_Nord_Sahara_1959",
+    "Datum_Stockholm_1938",
+    "Datum_Yacare",
+    "Datum_Yoff",
+    "Datum_Zanderij",
+    "Datum_Militar_Geographische_Institut",
+    "Datum_Reseau_National_Belge_1972",
+    "Datum_Deutsche_Hauptdreiecksnetz",
+    "Datum_Conakry_1905",
+    "Datum_WGS72",
+    "Datum_WGS72_Transit_Broadcast_Ephemeris",
+    "Datum_WGS84",
+    "Datum_Ancienne_Triangulation_Francaise",
+    "Datum_Nord_de_Guerre"
+};
+
+const char *const ff_tiff_geodetic_datum_e_codes[] = { // DatumE names; position presumably maps to the code value - confirm
+    "DatumE_Airy1830",
+    "DatumE_AiryModified1849",
+    "DatumE_AustralianNationalSpheroid",
+    "DatumE_Bessel1841",
+    "DatumE_BesselModified",
+    "DatumE_BesselNamibia",
+    "DatumE_Clarke1858",
+    "DatumE_Clarke1866",
+    "DatumE_Clarke1866Michigan",
+    "DatumE_Clarke1880_Benoit",
+    "DatumE_Clarke1880_IGN",
+    "DatumE_Clarke1880_RGS",
+    "DatumE_Clarke1880_Arc",
+    "DatumE_Clarke1880_SGA1922",
+    "DatumE_Everest1830_1937Adjustment",
+    "DatumE_Everest1830_1967Definition",
+    "DatumE_Everest1830_1975Definition",
+    "DatumE_Everest1830Modified",
+    "DatumE_GRS1980",
+    "DatumE_Helmert1906",
+    "DatumE_IndonesianNationalSpheroid",
+    "DatumE_International1924",
+    "DatumE_International1967",
+    "DatumE_Krassowsky1960",
+    "DatumE_NWL9D",
+    "DatumE_NWL10D",
+    "DatumE_Plessis1817",
+    "DatumE_Struve1860",
+    "DatumE_WarOffice",
+    "DatumE_WGS84",
+    "DatumE_GEM10C",
+    "DatumE_OSU86F",
+    "DatumE_OSU91A",
+    "DatumE_Clarke1880",
+    "DatumE_Sphere"
+};
+
+const char *const ff_tiff_ellipsoid_codes[] = { // ellipsoid names; position presumably maps to the code value - confirm
+    "Ellipse_Airy_1830",
+    "Ellipse_Airy_Modified_1849",
+    "Ellipse_Australian_National_Spheroid",
+    "Ellipse_Bessel_1841",
+    "Ellipse_Bessel_Modified",
+    "Ellipse_Bessel_Namibia",
+    "Ellipse_Clarke_1858",
+    "Ellipse_Clarke_1866",
+    "Ellipse_Clarke_1866_Michigan",
+    "Ellipse_Clarke_1880_Benoit",
+    "Ellipse_Clarke_1880_IGN",
+    "Ellipse_Clarke_1880_RGS",
+    "Ellipse_Clarke_1880_Arc",
+    "Ellipse_Clarke_1880_SGA_1922",
+    "Ellipse_Everest_1830_1937_Adjustment",
+    "Ellipse_Everest_1830_1967_Definition",
+    "Ellipse_Everest_1830_1975_Definition",
+    "Ellipse_Everest_1830_Modified",
+    "Ellipse_GRS_1980",
+    "Ellipse_Helmert_1906",
+    "Ellipse_Indonesian_National_Spheroid",
+    "Ellipse_International_1924",
+    "Ellipse_International_1967",
+    "Ellipse_Krassowsky_1940",
+    "Ellipse_NWL_9D",
+    "Ellipse_NWL_10D",
+    "Ellipse_Plessis_1817",
+    "Ellipse_Struve_1860",
+    "Ellipse_War_Office",
+    "Ellipse_WGS_84",
+    "Ellipse_GEM_10C",
+    "Ellipse_OSU86F",
+    "Ellipse_OSU91A",
+    "Ellipse_Clarke_1880",
+    "Ellipse_Sphere"
+};
+
+const char *const ff_tiff_prime_meridian_codes[] = { // prime meridian names; position presumably maps to the code value - confirm
+    "PM_Greenwich",
+    "PM_Lisbon",
+    "PM_Paris",
+    "PM_Bogota",
+    "PM_Madrid",
+    "PM_Rome",
+    "PM_Bern",
+    "PM_Jakarta",
+    "PM_Ferro",
+    "PM_Brussels",
+    "PM_Stockholm"
+};
+
+const TiffGeoTagKeyName ff_tiff_proj_cs_type_codes[] = {
+    {20137, "PCS_Adindan_UTM_zone_37N"},
+    {20138, "PCS_Adindan_UTM_zone_38N"},
+    {20248, "PCS_AGD66_AMG_zone_48"},
+    {20249, "PCS_AGD66_AMG_zone_49"},
+    {20250, "PCS_AGD66_AMG_zone_50"},
+    {20251, "PCS_AGD66_AMG_zone_51"},
+    {20252, "PCS_AGD66_AMG_zone_52"},
+    {20253, "PCS_AGD66_AMG_zone_53"},
+    {20254, "PCS_AGD66_AMG_zone_54"},
+    {20255, "PCS_AGD66_AMG_zone_55"},
+    {20256, "PCS_AGD66_AMG_zone_56"},
+    {20257, "PCS_AGD66_AMG_zone_57"},
+    {20258, "PCS_AGD66_AMG_zone_58"},
+    {20348, "PCS_AGD84_AMG_zone_48"},
+    {20349, "PCS_AGD84_AMG_zone_49"},
+    {20350, "PCS_AGD84_AMG_zone_50"},
+    {20351, "PCS_AGD84_AMG_zone_51"},
+    {20352, "PCS_AGD84_AMG_zone_52"},
+    {20353, "PCS_AGD84_AMG_zone_53"},
+    {20354, "PCS_AGD84_AMG_zone_54"},
+    {20355, "PCS_AGD84_AMG_zone_55"},
+    {20356, "PCS_AGD84_AMG_zone_56"},
+    {20357, "PCS_AGD84_AMG_zone_57"},
+    {20358, "PCS_AGD84_AMG_zone_58"},
+    {20437, "PCS_Ain_el_Abd_UTM_zone_37N"},
+    {20438, "PCS_Ain_el_Abd_UTM_zone_38N"},
+    {20439, "PCS_Ain_el_Abd_UTM_zone_39N"},
+    {20499, "PCS_Ain_el_Abd_Bahrain_Grid"},
+    {20538, "PCS_Afgooye_UTM_zone_38N"},
+    {20539, "PCS_Afgooye_UTM_zone_39N"},
+    {20700, "PCS_Lisbon_Portugese_Grid"},
+    {20822, "PCS_Aratu_UTM_zone_22S"},
+    {20823, "PCS_Aratu_UTM_zone_23S"},
+    {20824, "PCS_Aratu_UTM_zone_24S"},
+    {20973, "PCS_Arc_1950_Lo13"},
+    {20975, "PCS_Arc_1950_Lo15"},
+    {20977, "PCS_Arc_1950_Lo17"},
+    {20979, "PCS_Arc_1950_Lo19"},
+    {20981, "PCS_Arc_1950_Lo21"},
+    {20983, "PCS_Arc_1950_Lo23"},
+    {20985, "PCS_Arc_1950_Lo25"},
+    {20987, "PCS_Arc_1950_Lo27"},
+    {20989, "PCS_Arc_1950_Lo29"},
+    {20991, "PCS_Arc_1950_Lo31"},
+    {20993, "PCS_Arc_1950_Lo33"},
+    {20995, "PCS_Arc_1950_Lo35"},
+    {21100, "PCS_Batavia_NEIEZ"},
+    {21148, "PCS_Batavia_UTM_zone_48S"},
+    {21149, "PCS_Batavia_UTM_zone_49S"},
+    {21150, "PCS_Batavia_UTM_zone_50S"},
+    {21413, "PCS_Beijing_Gauss_zone_13"},
+    {21414, "PCS_Beijing_Gauss_zone_14"},
+    {21415, "PCS_Beijing_Gauss_zone_15"},
+    {21416, "PCS_Beijing_Gauss_zone_16"},
+    {21417, "PCS_Beijing_Gauss_zone_17"},
+    {21418, "PCS_Beijing_Gauss_zone_18"},
+    {21419, "PCS_Beijing_Gauss_zone_19"},
+    {21420, "PCS_Beijing_Gauss_zone_20"},
+    {21421, "PCS_Beijing_Gauss_zone_21"},
+    {21422, "PCS_Beijing_Gauss_zone_22"},
+    {21423, "PCS_Beijing_Gauss_zone_23"},
+    {21473, "PCS_Beijing_Gauss_13N"},
+    {21474, "PCS_Beijing_Gauss_14N"},
+    {21475, "PCS_Beijing_Gauss_15N"},
+    {21476, "PCS_Beijing_Gauss_16N"},
+    {21477, "PCS_Beijing_Gauss_17N"},
+    {21478, "PCS_Beijing_Gauss_18N"},
+    {21479, "PCS_Beijing_Gauss_19N"},
+    {21480, "PCS_Beijing_Gauss_20N"},
+    {21481, "PCS_Beijing_Gauss_21N"},
+    {21482, "PCS_Beijing_Gauss_22N"},
+    {21483, "PCS_Beijing_Gauss_23N"},
+    {21500, "PCS_Belge_Lambert_50"},
+    {21790, "PCS_Bern_1898_Swiss_Old"},
+    {21817, "PCS_Bogota_UTM_zone_17N"},
+    {21818, "PCS_Bogota_UTM_zone_18N"},
+    {21891, "PCS_Bogota_Colombia_3W"},
+    {21892, "PCS_Bogota_Colombia_Bogota"},
+    {21893, "PCS_Bogota_Colombia_3E"},
+    {21894, "PCS_Bogota_Colombia_6E"},
+    {22032, "PCS_Camacupa_UTM_32S"},
+    {22033, "PCS_Camacupa_UTM_33S"},
+    {22191, "PCS_C_Inchauspe_Argentina_1"},
+    {22192, "PCS_C_Inchauspe_Argentina_2"},
+    {22193, "PCS_C_Inchauspe_Argentina_3"},
+    {22194, "PCS_C_Inchauspe_Argentina_4"},
+    {22195, "PCS_C_Inchauspe_Argentina_5"},
+    {22196, "PCS_C_Inchauspe_Argentina_6"},
+    {22197, "PCS_C_Inchauspe_Argentina_7"},
+    {22332, "PCS_Carthage_UTM_zone_32N"},
+    {22391, "PCS_Carthage_Nord_Tunisie"},
+    {22392, "PCS_Carthage_Sud_Tunisie"},
+    {22523, "PCS_Corrego_Alegre_UTM_23S"},
+    {22524, "PCS_Corrego_Alegre_UTM_24S"},
+    {22832, "PCS_Douala_UTM_zone_32N"},
+    {22992, "PCS_Egypt_1907_Red_Belt"},
+    {22993, "PCS_Egypt_1907_Purple_Belt"},
+    {22994, "PCS_Egypt_1907_Ext_Purple"},
+    {23028, "PCS_ED50_UTM_zone_28N"},
+    {23029, "PCS_ED50_UTM_zone_29N"},
+    {23030, "PCS_ED50_UTM_zone_30N"},
+    {23031, "PCS_ED50_UTM_zone_31N"},
+    {23032, "PCS_ED50_UTM_zone_32N"},
+    {23033, "PCS_ED50_UTM_zone_33N"},
+    {23034, "PCS_ED50_UTM_zone_34N"},
+    {23035, "PCS_ED50_UTM_zone_35N"},
+    {23036, "PCS_ED50_UTM_zone_36N"},
+    {23037, "PCS_ED50_UTM_zone_37N"},
+    {23038, "PCS_ED50_UTM_zone_38N"},
+    {23239, "PCS_Fahud_UTM_zone_39N"},
+    {23240, "PCS_Fahud_UTM_zone_40N"},
+    {23433, "PCS_Garoua_UTM_zone_33N"},
+    {23846, "PCS_ID74_UTM_zone_46N"},
+    {23847, "PCS_ID74_UTM_zone_47N"},
+    {23848, "PCS_ID74_UTM_zone_48N"},
+    {23849, "PCS_ID74_UTM_zone_49N"},
+    {23850, "PCS_ID74_UTM_zone_50N"},
+    {23851, "PCS_ID74_UTM_zone_51N"},
+    {23852, "PCS_ID74_UTM_zone_52N"},
+    {23853, "PCS_ID74_UTM_zone_53N"},
+    {23886, "PCS_ID74_UTM_zone_46S"},
+    {23887, "PCS_ID74_UTM_zone_47S"},
+    {23888, "PCS_ID74_UTM_zone_48S"},
+    {23889, "PCS_ID74_UTM_zone_49S"},
+    {23890, "PCS_ID74_UTM_zone_50S"},
+    {23891, "PCS_ID74_UTM_zone_51S"},
+    {23892, "PCS_ID74_UTM_zone_52S"},
+    {23893, "PCS_ID74_UTM_zone_53S"},
+    {23894, "PCS_ID74_UTM_zone_54S"},
+    {23947, "PCS_Indian_1954_UTM_47N"},
+    {23948, "PCS_Indian_1954_UTM_48N"},
+    {24047, "PCS_Indian_1975_UTM_47N"},
+    {24048, "PCS_Indian_1975_UTM_48N"},
+    {24100, "PCS_Jamaica_1875_Old_Grid"},
+    {24200, "PCS_JAD69_Jamaica_Grid"},
+    {24370, "PCS_Kalianpur_India_0"},
+    {24371, "PCS_Kalianpur_India_I"},
+    {24372, "PCS_Kalianpur_India_IIa"},
+    {24373, "PCS_Kalianpur_India_IIIa"},
+    {24374, "PCS_Kalianpur_India_IVa"},
+    {24382, "PCS_Kalianpur_India_IIb"},
+    {24383, "PCS_Kalianpur_India_IIIb"},
+    {24384, "PCS_Kalianpur_India_IVb"},
+    {24500, "PCS_Kertau_Singapore_Grid"},
+    {24547, "PCS_Kertau_UTM_zone_47N"},
+    {24548, "PCS_Kertau_UTM_zone_48N"},
+    {24720, "PCS_La_Canoa_UTM_zone_20N"},
+    {24721, "PCS_La_Canoa_UTM_zone_21N"},
+    {24818, "PCS_PSAD56_UTM_zone_18N"},
+    {24819, "PCS_PSAD56_UTM_zone_19N"},
+    {24820, "PCS_PSAD56_UTM_zone_20N"},
+    {24821, "PCS_PSAD56_UTM_zone_21N"},
+    {24877, "PCS_PSAD56_UTM_zone_17S"},
+    {24878, "PCS_PSAD56_UTM_zone_18S"},
+    {24879, "PCS_PSAD56_UTM_zone_19S"},
+    {24880, "PCS_PSAD56_UTM_zone_20S"},
+    {24891, "PCS_PSAD56_Peru_west_zone"},
+    {24892, "PCS_PSAD56_Peru_central"},
+    {24893, "PCS_PSAD56_Peru_east_zone"},
+    {25000, "PCS_Leigon_Ghana_Grid"},
+    {25231, "PCS_Lome_UTM_zone_31N"},
+    {25391, "PCS_Luzon_Philippines_I"},
+    {25392, "PCS_Luzon_Philippines_II"},
+    {25393, "PCS_Luzon_Philippines_III"},
+    {25394, "PCS_Luzon_Philippines_IV"},
+    {25395, "PCS_Luzon_Philippines_V"},
+    {25700, "PCS_Makassar_NEIEZ"},
+    {25932, "PCS_Malongo_1987_UTM_32S"},
+    {26191, "PCS_Merchich_Nord_Maroc"},
+    {26192, "PCS_Merchich_Sud_Maroc"},
+    {26193, "PCS_Merchich_Sahara"},
+    {26237, "PCS_Massawa_UTM_zone_37N"},
+    {26331, "PCS_Minna_UTM_zone_31N"},
+    {26332, "PCS_Minna_UTM_zone_32N"},
+    {26391, "PCS_Minna_Nigeria_West"},
+    {26392, "PCS_Minna_Nigeria_Mid_Belt"},
+    {26393, "PCS_Minna_Nigeria_East"},
+    {26432, "PCS_Mhast_UTM_zone_32S"},
+    {26591, "PCS_Monte_Mario_Italy_1"},
+    {26592, "PCS_Monte_Mario_Italy_2"},
+    {26632, "PCS_M_poraloko_UTM_32N"},
+    {26692, "PCS_M_poraloko_UTM_32S"},
+    {26703, "PCS_NAD27_UTM_zone_3N"},
+    {26704, "PCS_NAD27_UTM_zone_4N"},
+    {26705, "PCS_NAD27_UTM_zone_5N"},
+    {26706, "PCS_NAD27_UTM_zone_6N"},
+    {26707, "PCS_NAD27_UTM_zone_7N"},
+    {26708, "PCS_NAD27_UTM_zone_8N"},
+    {26709, "PCS_NAD27_UTM_zone_9N"},
+    {26710, "PCS_NAD27_UTM_zone_10N"},
+    {26711, "PCS_NAD27_UTM_zone_11N"},
+    {26712, "PCS_NAD27_UTM_zone_12N"},
+    {26713, "PCS_NAD27_UTM_zone_13N"},
+    {26714, "PCS_NAD27_UTM_zone_14N"},
+    {26715, "PCS_NAD27_UTM_zone_15N"},
+    {26716, "PCS_NAD27_UTM_zone_16N"},
+    {26717, "PCS_NAD27_UTM_zone_17N"},
+    {26718, "PCS_NAD27_UTM_zone_18N"},
+    {26719, "PCS_NAD27_UTM_zone_19N"},
+    {26720, "PCS_NAD27_UTM_zone_20N"},
+    {26721, "PCS_NAD27_UTM_zone_21N"},
+    {26722, "PCS_NAD27_UTM_zone_22N"},
+    {26729, "PCS_NAD27_Alabama_East"},
+    {26730, "PCS_NAD27_Alabama_West"},
+    {26731, "PCS_NAD27_Alaska_zone_1"},
+    {26732, "PCS_NAD27_Alaska_zone_2"},
+    {26733, "PCS_NAD27_Alaska_zone_3"},
+    {26734, "PCS_NAD27_Alaska_zone_4"},
+    {26735, "PCS_NAD27_Alaska_zone_5"},
+    {26736, "PCS_NAD27_Alaska_zone_6"},
+    {26737, "PCS_NAD27_Alaska_zone_7"},
+    {26738, "PCS_NAD27_Alaska_zone_8"},
+    {26739, "PCS_NAD27_Alaska_zone_9"},
+    {26740, "PCS_NAD27_Alaska_zone_10"},
+    {26741, "PCS_NAD27_California_I"},
+    {26742, "PCS_NAD27_California_II"},
+    {26743, "PCS_NAD27_California_III"},
+    {26744, "PCS_NAD27_California_IV"},
+    {26745, "PCS_NAD27_California_V"},
+    {26746, "PCS_NAD27_California_VI"},
+    {26747, "PCS_NAD27_California_VII"},
+    {26748, "PCS_NAD27_Arizona_East"},
+    {26749, "PCS_NAD27_Arizona_Central"},
+    {26750, "PCS_NAD27_Arizona_West"},
+    {26751, "PCS_NAD27_Arkansas_North"},
+    {26752, "PCS_NAD27_Arkansas_South"},
+    {26753, "PCS_NAD27_Colorado_North"},
+    {26754, "PCS_NAD27_Colorado_Central"},
+    {26755, "PCS_NAD27_Colorado_South"},
+    {26756, "PCS_NAD27_Connecticut"},
+    {26757, "PCS_NAD27_Delaware"},
+    {26758, "PCS_NAD27_Florida_East"},
+    {26759, "PCS_NAD27_Florida_West"},
+    {26760, "PCS_NAD27_Florida_North"},
+    {26761, "PCS_NAD27_Hawaii_zone_1"},
+    {26762, "PCS_NAD27_Hawaii_zone_2"},
+    {26763, "PCS_NAD27_Hawaii_zone_3"},
+    {26764, "PCS_NAD27_Hawaii_zone_4"},
+    {26765, "PCS_NAD27_Hawaii_zone_5"},
+    {26766, "PCS_NAD27_Georgia_East"},
+    {26767, "PCS_NAD27_Georgia_West"},
+    {26768, "PCS_NAD27_Idaho_East"},
+    {26769, "PCS_NAD27_Idaho_Central"},
+    {26770, "PCS_NAD27_Idaho_West"},
+    {26771, "PCS_NAD27_Illinois_East"},
+    {26772, "PCS_NAD27_Illinois_West"},
+    {26773, "PCS_NAD27_Indiana_East"},
+    {26774, "PCS_NAD27_BLM_14N_feet"},
+    {26774, "PCS_NAD27_Indiana_West"},
+    {26775, "PCS_NAD27_BLM_15N_feet"},
+    {26775, "PCS_NAD27_Iowa_North"},
+    {26776, "PCS_NAD27_BLM_16N_feet"},
+    {26776, "PCS_NAD27_Iowa_South"},
+    {26777, "PCS_NAD27_BLM_17N_feet"},
+    {26777, "PCS_NAD27_Kansas_North"},
+    {26778, "PCS_NAD27_Kansas_South"},
+    {26779, "PCS_NAD27_Kentucky_North"},
+    {26780, "PCS_NAD27_Kentucky_South"},
+    {26781, "PCS_NAD27_Louisiana_North"},
+    {26782, "PCS_NAD27_Louisiana_South"},
+    {26783, "PCS_NAD27_Maine_East"},
+    {26784, "PCS_NAD27_Maine_West"},
+    {26785, "PCS_NAD27_Maryland"},
+    {26786, "PCS_NAD27_Massachusetts"},
+    {26787, "PCS_NAD27_Massachusetts_Is"},
+    {26788, "PCS_NAD27_Michigan_North"},
+    {26789, "PCS_NAD27_Michigan_Central"},
+    {26790, "PCS_NAD27_Michigan_South"},
+    {26791, "PCS_NAD27_Minnesota_North"},
+    {26792, "PCS_NAD27_Minnesota_Cent"},
+    {26793, "PCS_NAD27_Minnesota_South"},
+    {26794, "PCS_NAD27_Mississippi_East"},
+    {26795, "PCS_NAD27_Mississippi_West"},
+    {26796, "PCS_NAD27_Missouri_East"},
+    {26797, "PCS_NAD27_Missouri_Central"},
+    {26798, "PCS_NAD27_Missouri_West"},
+    {26801, "PCS_NAD_Michigan_Michigan_East"},
+    {26802, "PCS_NAD_Michigan_Michigan_Old_Central"},
+    {26803, "PCS_NAD_Michigan_Michigan_West"},
+    {26903, "PCS_NAD83_UTM_zone_3N"},
+    {26904, "PCS_NAD83_UTM_zone_4N"},
+    {26905, "PCS_NAD83_UTM_zone_5N"},
+    {26906, "PCS_NAD83_UTM_zone_6N"},
+    {26907, "PCS_NAD83_UTM_zone_7N"},
+    {26908, "PCS_NAD83_UTM_zone_8N"},
+    {26909, "PCS_NAD83_UTM_zone_9N"},
+    {26910, "PCS_NAD83_UTM_zone_10N"},
+    {26911, "PCS_NAD83_UTM_zone_11N"},
+    {26912, "PCS_NAD83_UTM_zone_12N"},
+    {26913, "PCS_NAD83_UTM_zone_13N"},
+    {26914, "PCS_NAD83_UTM_zone_14N"},
+    {26915, "PCS_NAD83_UTM_zone_15N"},
+    {26916, "PCS_NAD83_UTM_zone_16N"},
+    {26917, "PCS_NAD83_UTM_zone_17N"},
+    {26918, "PCS_NAD83_UTM_zone_18N"},
+    {26919, "PCS_NAD83_UTM_zone_19N"},
+    {26920, "PCS_NAD83_UTM_zone_20N"},
+    {26921, "PCS_NAD83_UTM_zone_21N"},
+    {26922, "PCS_NAD83_UTM_zone_22N"},
+    {26923, "PCS_NAD83_UTM_zone_23N"},
+    {26929, "PCS_NAD83_Alabama_East"},
+    {26930, "PCS_NAD83_Alabama_West"},
+    {26931, "PCS_NAD83_Alaska_zone_1"},
+    {26932, "PCS_NAD83_Alaska_zone_2"},
+    {26933, "PCS_NAD83_Alaska_zone_3"},
+    {26934, "PCS_NAD83_Alaska_zone_4"},
+    {26935, "PCS_NAD83_Alaska_zone_5"},
+    {26936, "PCS_NAD83_Alaska_zone_6"},
+    {26937, "PCS_NAD83_Alaska_zone_7"},
+    {26938, "PCS_NAD83_Alaska_zone_8"},
+    {26939, "PCS_NAD83_Alaska_zone_9"},
+    {26940, "PCS_NAD83_Alaska_zone_10"},
+    {26941, "PCS_NAD83_California_1"},
+    {26942, "PCS_NAD83_California_2"},
+    {26943, "PCS_NAD83_California_3"},
+    {26944, "PCS_NAD83_California_4"},
+    {26945, "PCS_NAD83_California_5"},
+    {26946, "PCS_NAD83_California_6"},
+    {26948, "PCS_NAD83_Arizona_East"},
+    {26949, "PCS_NAD83_Arizona_Central"},
+    {26950, "PCS_NAD83_Arizona_West"},
+    {26951, "PCS_NAD83_Arkansas_North"},
+    {26952, "PCS_NAD83_Arkansas_South"},
+    {26953, "PCS_NAD83_Colorado_North"},
+    {26954, "PCS_NAD83_Colorado_Central"},
+    {26955, "PCS_NAD83_Colorado_South"},
+    {26956, "PCS_NAD83_Connecticut"},
+    {26957, "PCS_NAD83_Delaware"},
+    {26958, "PCS_NAD83_Florida_East"},
+    {26959, "PCS_NAD83_Florida_West"},
+    {26960, "PCS_NAD83_Florida_North"},
+    {26961, "PCS_NAD83_Hawaii_zone_1"},
+    {26962, "PCS_NAD83_Hawaii_zone_2"},
+    {26963, "PCS_NAD83_Hawaii_zone_3"},
+    {26964, "PCS_NAD83_Hawaii_zone_4"},
+    {26965, "PCS_NAD83_Hawaii_zone_5"},
+    {26966, "PCS_NAD83_Georgia_East"},
+    {26967, "PCS_NAD83_Georgia_West"},
+    {26968, "PCS_NAD83_Idaho_East"},
+    {26969, "PCS_NAD83_Idaho_Central"},
+    {26970, "PCS_NAD83_Idaho_West"},
+    {26971, "PCS_NAD83_Illinois_East"},
+    {26972, "PCS_NAD83_Illinois_West"},
+    {26973, "PCS_NAD83_Indiana_East"},
+    {26974, "PCS_NAD83_Indiana_West"},
+    {26975, "PCS_NAD83_Iowa_North"},
+    {26976, "PCS_NAD83_Iowa_South"},
+    {26977, "PCS_NAD83_Kansas_North"},
+    {26978, "PCS_NAD83_Kansas_South"},
+    {26979, "PCS_NAD83_Kentucky_North"},
+    {26980, "PCS_NAD83_Kentucky_South"},
+    {26981, "PCS_NAD83_Louisiana_North"},
+    {26982, "PCS_NAD83_Louisiana_South"},
+    {26983, "PCS_NAD83_Maine_East"},
+    {26984, "PCS_NAD83_Maine_West"},
+    {26985, "PCS_NAD83_Maryland"},
+    {26986, "PCS_NAD83_Massachusetts"},
+    {26987, "PCS_NAD83_Massachusetts_Is"},
+    {26988, "PCS_NAD83_Michigan_North"},
+    {26989, "PCS_NAD83_Michigan_Central"},
+    {26990, "PCS_NAD83_Michigan_South"},
+    {26991, "PCS_NAD83_Minnesota_North"},
+    {26992, "PCS_NAD83_Minnesota_Cent"},
+    {26993, "PCS_NAD83_Minnesota_South"},
+    {26994, "PCS_NAD83_Mississippi_East"},
+    {26995, "PCS_NAD83_Mississippi_West"},
+    {26996, "PCS_NAD83_Missouri_East"},
+    {26997, "PCS_NAD83_Missouri_Central"},
+    {26998, "PCS_NAD83_Missouri_West"},
+    {27038, "PCS_Nahrwan_1967_UTM_38N"},
+    {27039, "PCS_Nahrwan_1967_UTM_39N"},
+    {27040, "PCS_Nahrwan_1967_UTM_40N"},
+    {27120, "PCS_Naparima_UTM_20N"},
+    {27200, "PCS_GD49_NZ_Map_Grid"},
+    {27291, "PCS_GD49_North_Island_Grid"},
+    {27292, "PCS_GD49_South_Island_Grid"},
+    {27429, "PCS_Datum_73_UTM_zone_29N"},
+    {27500, "PCS_ATF_Nord_de_Guerre"},
+    {27581, "PCS_NTF_France_I"},
+    {27582, "PCS_NTF_France_II"},
+    {27583, "PCS_NTF_France_III"},
+    {27591, "PCS_NTF_Nord_France"},
+    {27592, "PCS_NTF_Centre_France"},
+    {27593, "PCS_NTF_Sud_France"},
+    {27700, "PCS_British_National_Grid"},
+    {28232, "PCS_Point_Noire_UTM_32S"},
+    {28348, "PCS_GDA94_MGA_zone_48"},
+    {28349, "PCS_GDA94_MGA_zone_49"},
+    {28350, "PCS_GDA94_MGA_zone_50"},
+    {28351, "PCS_GDA94_MGA_zone_51"},
+    {28352, "PCS_GDA94_MGA_zone_52"},
+    {28353, "PCS_GDA94_MGA_zone_53"},
+    {28354, "PCS_GDA94_MGA_zone_54"},
+    {28355, "PCS_GDA94_MGA_zone_55"},
+    {28356, "PCS_GDA94_MGA_zone_56"},
+    {28357, "PCS_GDA94_MGA_zone_57"},
+    {28358, "PCS_GDA94_MGA_zone_58"},
+    {28404, "PCS_Pulkovo_Gauss_zone_4"},
+    {28405, "PCS_Pulkovo_Gauss_zone_5"},
+    {28406, "PCS_Pulkovo_Gauss_zone_6"},
+    {28407, "PCS_Pulkovo_Gauss_zone_7"},
+    {28408, "PCS_Pulkovo_Gauss_zone_8"},
+    {28409, "PCS_Pulkovo_Gauss_zone_9"},
+    {28410, "PCS_Pulkovo_Gauss_zone_10"},
+    {28411, "PCS_Pulkovo_Gauss_zone_11"},
+    {28412, "PCS_Pulkovo_Gauss_zone_12"},
+    {28413, "PCS_Pulkovo_Gauss_zone_13"},
+    {28414, "PCS_Pulkovo_Gauss_zone_14"},
+    {28415, "PCS_Pulkovo_Gauss_zone_15"},
+    {28416, "PCS_Pulkovo_Gauss_zone_16"},
+    {28417, "PCS_Pulkovo_Gauss_zone_17"},
+    {28418, "PCS_Pulkovo_Gauss_zone_18"},
+    {28419, "PCS_Pulkovo_Gauss_zone_19"},
+    {28420, "PCS_Pulkovo_Gauss_zone_20"},
+    {28421, "PCS_Pulkovo_Gauss_zone_21"},
+    {28422, "PCS_Pulkovo_Gauss_zone_22"},
+    {28423, "PCS_Pulkovo_Gauss_zone_23"},
+    {28424, "PCS_Pulkovo_Gauss_zone_24"},
+    {28425, "PCS_Pulkovo_Gauss_zone_25"},
+    {28426, "PCS_Pulkovo_Gauss_zone_26"},
+    {28427, "PCS_Pulkovo_Gauss_zone_27"},
+    {28428, "PCS_Pulkovo_Gauss_zone_28"},
+    {28429, "PCS_Pulkovo_Gauss_zone_29"},
+    {28430, "PCS_Pulkovo_Gauss_zone_30"},
+    {28431, "PCS_Pulkovo_Gauss_zone_31"},
+    {28432, "PCS_Pulkovo_Gauss_zone_32"},
+    {28464, "PCS_Pulkovo_Gauss_4N"},
+    {28465, "PCS_Pulkovo_Gauss_5N"},
+    {28466, "PCS_Pulkovo_Gauss_6N"},
+    {28467, "PCS_Pulkovo_Gauss_7N"},
+    {28468, "PCS_Pulkovo_Gauss_8N"},
+    {28469, "PCS_Pulkovo_Gauss_9N"},
+    {28470, "PCS_Pulkovo_Gauss_10N"},
+    {28471, "PCS_Pulkovo_Gauss_11N"},
+    {28472, "PCS_Pulkovo_Gauss_12N"},
+    {28473, "PCS_Pulkovo_Gauss_13N"},
+    {28474, "PCS_Pulkovo_Gauss_14N"},
+    {28475, "PCS_Pulkovo_Gauss_15N"},
+    {28476, "PCS_Pulkovo_Gauss_16N"},
+    {28477, "PCS_Pulkovo_Gauss_17N"},
+    {28478, "PCS_Pulkovo_Gauss_18N"},
+    {28479, "PCS_Pulkovo_Gauss_19N"},
+    {28480, "PCS_Pulkovo_Gauss_20N"},
+    {28481, "PCS_Pulkovo_Gauss_21N"},
+    {28482, "PCS_Pulkovo_Gauss_22N"},
+    {28483, "PCS_Pulkovo_Gauss_23N"},
+    {28484, "PCS_Pulkovo_Gauss_24N"},
+    {28485, "PCS_Pulkovo_Gauss_25N"},
+    {28486, "PCS_Pulkovo_Gauss_26N"},
+    {28487, "PCS_Pulkovo_Gauss_27N"},
+    {28488, "PCS_Pulkovo_Gauss_28N"},
+    {28489, "PCS_Pulkovo_Gauss_29N"},
+    {28490, "PCS_Pulkovo_Gauss_30N"},
+    {28491, "PCS_Pulkovo_Gauss_31N"},
+    {28492, "PCS_Pulkovo_Gauss_32N"},
+    {28600, "PCS_Qatar_National_Grid"},
+    {28991, "PCS_RD_Netherlands_Old"},
+    {28992, "PCS_RD_Netherlands_New"},
+    {29118, "PCS_SAD69_UTM_zone_18N"},
+    {29119, "PCS_SAD69_UTM_zone_19N"},
+    {29120, "PCS_SAD69_UTM_zone_20N"},
+    {29121, "PCS_SAD69_UTM_zone_21N"},
+    {29122, "PCS_SAD69_UTM_zone_22N"},
+    {29177, "PCS_SAD69_UTM_zone_17S"},
+    {29178, "PCS_SAD69_UTM_zone_18S"},
+    {29179, "PCS_SAD69_UTM_zone_19S"},
+    {29180, "PCS_SAD69_UTM_zone_20S"},
+    {29181, "PCS_SAD69_UTM_zone_21S"},
+    {29182, "PCS_SAD69_UTM_zone_22S"},
+    {29183, "PCS_SAD69_UTM_zone_23S"},
+    {29184, "PCS_SAD69_UTM_zone_24S"},
+    {29185, "PCS_SAD69_UTM_zone_25S"},
+    {29220, "PCS_Sapper_Hill_UTM_20S"},
+    {29221, "PCS_Sapper_Hill_UTM_21S"},
+    {29333, "PCS_Schwarzeck_UTM_33S"},
+    {29635, "PCS_Sudan_UTM_zone_35N"},
+    {29636, "PCS_Sudan_UTM_zone_36N"},
+    {29700, "PCS_Tananarive_Laborde"},
+    {29738, "PCS_Tananarive_UTM_38S"},
+    {29739, "PCS_Tananarive_UTM_39S"},
+    {29800, "PCS_Timbalai_1948_Borneo"},
+    {29849, "PCS_Timbalai_1948_UTM_49N"},
+    {29850, "PCS_Timbalai_1948_UTM_50N"},
+    {29900, "PCS_TM65_Irish_Nat_Grid"},
+    {30200, "PCS_Trinidad_1903_Trinidad"},
+    {30339, "PCS_TC_1948_UTM_zone_39N"},
+    {30340, "PCS_TC_1948_UTM_zone_40N"},
+    {30491, "PCS_Voirol_N_Algerie_ancien"},
+    {30492, "PCS_Voirol_S_Algerie_ancien"},
+    {30591, "PCS_Voirol_Unifie_N_Algerie"},
+    {30592, "PCS_Voirol_Unifie_S_Algerie"},
+    {30600, "PCS_Bern_1938_Swiss_New"},
+    {30729, "PCS_Nord_Sahara_UTM_29N"},
+    {30730, "PCS_Nord_Sahara_UTM_30N"},
+    {30731, "PCS_Nord_Sahara_UTM_31N"},
+    {30732, "PCS_Nord_Sahara_UTM_32N"},
+    {31028, "PCS_Yoff_UTM_zone_28N"},
+    {31121, "PCS_Zanderij_UTM_zone_21N"},
+    {31291, "PCS_MGI_Austria_West"},
+    {31292, "PCS_MGI_Austria_Central"},
+    {31293, "PCS_MGI_Austria_East"},
+    {31300, "PCS_Belge_Lambert_72"},
+    {31491, "PCS_DHDN_Germany_zone_1"},
+    {31492, "PCS_DHDN_Germany_zone_2"},
+    {31493, "PCS_DHDN_Germany_zone_3"},
+    {31494, "PCS_DHDN_Germany_zone_4"},
+    {31495, "PCS_DHDN_Germany_zone_5"},
+    {32001, "PCS_NAD27_Montana_North"},
+    {32002, "PCS_NAD27_Montana_Central"},
+    {32003, "PCS_NAD27_Montana_South"},
+    {32005, "PCS_NAD27_Nebraska_North"},
+    {32006, "PCS_NAD27_Nebraska_South"},
+    {32007, "PCS_NAD27_Nevada_East"},
+    {32008, "PCS_NAD27_Nevada_Central"},
+    {32009, "PCS_NAD27_Nevada_West"},
+    {32010, "PCS_NAD27_New_Hampshire"},
+    {32011, "PCS_NAD27_New_Jersey"},
+    {32012, "PCS_NAD27_New_Mexico_East"},
+    {32013, "PCS_NAD27_New_Mexico_Cent"},
+    {32014, "PCS_NAD27_New_Mexico_West"},
+    {32015, "PCS_NAD27_New_York_East"},
+    {32016, "PCS_NAD27_New_York_Central"},
+    {32017, "PCS_NAD27_New_York_West"},
+    {32018, "PCS_NAD27_New_York_Long_Is"},
+    {32019, "PCS_NAD27_North_Carolina"},
+    {32020, "PCS_NAD27_North_Dakota_N"},
+    {32021, "PCS_NAD27_North_Dakota_S"},
+    {32022, "PCS_NAD27_Ohio_North"},
+    {32023, "PCS_NAD27_Ohio_South"},
+    {32024, "PCS_NAD27_Oklahoma_North"},
+    {32025, "PCS_NAD27_Oklahoma_South"},
+    {32026, "PCS_NAD27_Oregon_North"},
+    {32027, "PCS_NAD27_Oregon_South"},
+    {32028, "PCS_NAD27_Pennsylvania_N"},
+    {32029, "PCS_NAD27_Pennsylvania_S"},
+    {32030, "PCS_NAD27_Rhode_Island"},
+    {32031, "PCS_NAD27_South_Carolina_N"},
+    {32033, "PCS_NAD27_South_Carolina_S"},
+    {32034, "PCS_NAD27_South_Dakota_N"},
+    {32035, "PCS_NAD27_South_Dakota_S"},
+    {32036, "PCS_NAD27_Tennessee"},
+    {32037, "PCS_NAD27_Texas_North"},
+    {32038, "PCS_NAD27_Texas_North_Cen"},
+    {32039, "PCS_NAD27_Texas_Central"},
+    {32040, "PCS_NAD27_Texas_South_Cen"},
+    {32041, "PCS_NAD27_Texas_South"},
+    {32042, "PCS_NAD27_Utah_North"},
+    {32043, "PCS_NAD27_Utah_Central"},
+    {32044, "PCS_NAD27_Utah_South"},
+    {32045, "PCS_NAD27_Vermont"},
+    {32046, "PCS_NAD27_Virginia_North"},
+    {32047, "PCS_NAD27_Virginia_South"},
+    {32048, "PCS_NAD27_Washington_North"},
+    {32049, "PCS_NAD27_Washington_South"},
+    {32050, "PCS_NAD27_West_Virginia_N"},
+    {32051, "PCS_NAD27_West_Virginia_S"},
+    {32052, "PCS_NAD27_Wisconsin_North"},
+    {32053, "PCS_NAD27_Wisconsin_Cen"},
+    {32054, "PCS_NAD27_Wisconsin_South"},
+    {32055, "PCS_NAD27_Wyoming_East"},
+    {32056, "PCS_NAD27_Wyoming_E_Cen"},
+    {32057, "PCS_NAD27_Wyoming_W_Cen"},
+    {32058, "PCS_NAD27_Wyoming_West"},
+    {32059, "PCS_NAD27_Puerto_Rico"},
+    {32060, "PCS_NAD27_St_Croix"},
+    {32100, "PCS_NAD83_Montana"},
+    {32104, "PCS_NAD83_Nebraska"},
+    {32107, "PCS_NAD83_Nevada_East"},
+    {32108, "PCS_NAD83_Nevada_Central"},
+    {32109, "PCS_NAD83_Nevada_West"},
+    {32110, "PCS_NAD83_New_Hampshire"},
+    {32111, "PCS_NAD83_New_Jersey"},
+    {32112, "PCS_NAD83_New_Mexico_East"},
+    {32113, "PCS_NAD83_New_Mexico_Cent"},
+    {32114, "PCS_NAD83_New_Mexico_West"},
+    {32115, "PCS_NAD83_New_York_East"},
+    {32116, "PCS_NAD83_New_York_Central"},
+    {32117, "PCS_NAD83_New_York_West"},
+    {32118, "PCS_NAD83_New_York_Long_Is"},
+    {32119, "PCS_NAD83_North_Carolina"},
+    {32120, "PCS_NAD83_North_Dakota_N"},
+    {32121, "PCS_NAD83_North_Dakota_S"},
+    {32122, "PCS_NAD83_Ohio_North"},
+    {32123, "PCS_NAD83_Ohio_South"},
+    {32124, "PCS_NAD83_Oklahoma_North"},
+    {32125, "PCS_NAD83_Oklahoma_South"},
+    {32126, "PCS_NAD83_Oregon_North"},
+    {32127, "PCS_NAD83_Oregon_South"},
+    {32128, "PCS_NAD83_Pennsylvania_N"},
+    {32129, "PCS_NAD83_Pennsylvania_S"},
+    {32130, "PCS_NAD83_Rhode_Island"},
+    {32133, "PCS_NAD83_South_Carolina"},
+    {32134, "PCS_NAD83_South_Dakota_N"},
+    {32135, "PCS_NAD83_South_Dakota_S"},
+    {32136, "PCS_NAD83_Tennessee"},
+    {32137, "PCS_NAD83_Texas_North"},
+    {32138, "PCS_NAD83_Texas_North_Cen"},
+    {32139, "PCS_NAD83_Texas_Central"},
+    {32140, "PCS_NAD83_Texas_South_Cen"},
+    {32141, "PCS_NAD83_Texas_South"},
+    {32142, "PCS_NAD83_Utah_North"},
+    {32143, "PCS_NAD83_Utah_Central"},
+    {32144, "PCS_NAD83_Utah_South"},
+    {32145, "PCS_NAD83_Vermont"},
+    {32146, "PCS_NAD83_Virginia_North"},
+    {32147, "PCS_NAD83_Virginia_South"},
+    {32148, "PCS_NAD83_Washington_North"},
+    {32149, "PCS_NAD83_Washington_South"},
+    {32150, "PCS_NAD83_West_Virginia_N"},
+    {32151, "PCS_NAD83_West_Virginia_S"},
+    {32152, "PCS_NAD83_Wisconsin_North"},
+    {32153, "PCS_NAD83_Wisconsin_Cen"},
+    {32154, "PCS_NAD83_Wisconsin_South"},
+    {32155, "PCS_NAD83_Wyoming_East"},
+    {32156, "PCS_NAD83_Wyoming_E_Cen"},
+    {32157, "PCS_NAD83_Wyoming_W_Cen"},
+    {32158, "PCS_NAD83_Wyoming_West"},
+    {32161, "PCS_NAD83_Puerto_Rico_Virgin_Is"},
+    {32201, "PCS_WGS72_UTM_zone_1N"},
+    {32202, "PCS_WGS72_UTM_zone_2N"},
+    {32203, "PCS_WGS72_UTM_zone_3N"},
+    {32204, "PCS_WGS72_UTM_zone_4N"},
+    {32205, "PCS_WGS72_UTM_zone_5N"},
+    {32206, "PCS_WGS72_UTM_zone_6N"},
+    {32207, "PCS_WGS72_UTM_zone_7N"},
+    {32208, "PCS_WGS72_UTM_zone_8N"},
+    {32209, "PCS_WGS72_UTM_zone_9N"},
+    {32210, "PCS_WGS72_UTM_zone_10N"},
+    {32211, "PCS_WGS72_UTM_zone_11N"},
+    {32212, "PCS_WGS72_UTM_zone_12N"},
+    {32213, "PCS_WGS72_UTM_zone_13N"},
+    {32214, "PCS_WGS72_UTM_zone_14N"},
+    {32215, "PCS_WGS72_UTM_zone_15N"},
+    {32216, "PCS_WGS72_UTM_zone_16N"},
+    {32217, "PCS_WGS72_UTM_zone_17N"},
+    {32218, "PCS_WGS72_UTM_zone_18N"},
+    {32219, "PCS_WGS72_UTM_zone_19N"},
+    {32220, "PCS_WGS72_UTM_zone_20N"},
+    {32221, "PCS_WGS72_UTM_zone_21N"},
+    {32222, "PCS_WGS72_UTM_zone_22N"},
+    {32223, "PCS_WGS72_UTM_zone_23N"},
+    {32224, "PCS_WGS72_UTM_zone_24N"},
+    {32225, "PCS_WGS72_UTM_zone_25N"},
+    {32226, "PCS_WGS72_UTM_zone_26N"},
+    {32227, "PCS_WGS72_UTM_zone_27N"},
+    {32228, "PCS_WGS72_UTM_zone_28N"},
+    {32229, "PCS_WGS72_UTM_zone_29N"},
+    {32230, "PCS_WGS72_UTM_zone_30N"},
+    {32231, "PCS_WGS72_UTM_zone_31N"},
+    {32232, "PCS_WGS72_UTM_zone_32N"},
+    {32233, "PCS_WGS72_UTM_zone_33N"},
+    {32234, "PCS_WGS72_UTM_zone_34N"},
+    {32235, "PCS_WGS72_UTM_zone_35N"},
+    {32236, "PCS_WGS72_UTM_zone_36N"},
+    {32237, "PCS_WGS72_UTM_zone_37N"},
+    {32238, "PCS_WGS72_UTM_zone_38N"},
+    {32239, "PCS_WGS72_UTM_zone_39N"},
+    {32240, "PCS_WGS72_UTM_zone_40N"},
+    {32241, "PCS_WGS72_UTM_zone_41N"},
+    {32242, "PCS_WGS72_UTM_zone_42N"},
+    {32243, "PCS_WGS72_UTM_zone_43N"},
+    {32244, "PCS_WGS72_UTM_zone_44N"},
+    {32245, "PCS_WGS72_UTM_zone_45N"},
+    {32246, "PCS_WGS72_UTM_zone_46N"},
+    {32247, "PCS_WGS72_UTM_zone_47N"},
+    {32248, "PCS_WGS72_UTM_zone_48N"},
+    {32249, "PCS_WGS72_UTM_zone_49N"},
+    {32250, "PCS_WGS72_UTM_zone_50N"},
+    {32251, "PCS_WGS72_UTM_zone_51N"},
+    {32252, "PCS_WGS72_UTM_zone_52N"},
+    {32253, "PCS_WGS72_UTM_zone_53N"},
+    {32254, "PCS_WGS72_UTM_zone_54N"},
+    {32255, "PCS_WGS72_UTM_zone_55N"},
+    {32256, "PCS_WGS72_UTM_zone_56N"},
+    {32257, "PCS_WGS72_UTM_zone_57N"},
+    {32258, "PCS_WGS72_UTM_zone_58N"},
+    {32259, "PCS_WGS72_UTM_zone_59N"},
+    {32260, "PCS_WGS72_UTM_zone_60N"},
+    {32301, "PCS_WGS72_UTM_zone_1S"},
+    {32302, "PCS_WGS72_UTM_zone_2S"},
+    {32303, "PCS_WGS72_UTM_zone_3S"},
+    {32304, "PCS_WGS72_UTM_zone_4S"},
+    {32305, "PCS_WGS72_UTM_zone_5S"},
+    {32306, "PCS_WGS72_UTM_zone_6S"},
+    {32307, "PCS_WGS72_UTM_zone_7S"},
+    {32308, "PCS_WGS72_UTM_zone_8S"},
+    {32309, "PCS_WGS72_UTM_zone_9S"},
+    {32310, "PCS_WGS72_UTM_zone_10S"},
+    {32311, "PCS_WGS72_UTM_zone_11S"},
+    {32312, "PCS_WGS72_UTM_zone_12S"},
+    {32313, "PCS_WGS72_UTM_zone_13S"},
+    {32314, "PCS_WGS72_UTM_zone_14S"},
+    {32315, "PCS_WGS72_UTM_zone_15S"},
+    {32316, "PCS_WGS72_UTM_zone_16S"},
+    {32317, "PCS_WGS72_UTM_zone_17S"},
+    {32318, "PCS_WGS72_UTM_zone_18S"},
+    {32319, "PCS_WGS72_UTM_zone_19S"},
+    {32320, "PCS_WGS72_UTM_zone_20S"},
+    {32321, "PCS_WGS72_UTM_zone_21S"},
+    {32322, "PCS_WGS72_UTM_zone_22S"},
+    {32323, "PCS_WGS72_UTM_zone_23S"},
+    {32324, "PCS_WGS72_UTM_zone_24S"},
+    {32325, "PCS_WGS72_UTM_zone_25S"},
+    {32326, "PCS_WGS72_UTM_zone_26S"},
+    {32327, "PCS_WGS72_UTM_zone_27S"},
+    {32328, "PCS_WGS72_UTM_zone_28S"},
+    {32329, "PCS_WGS72_UTM_zone_29S"},
+    {32330, "PCS_WGS72_UTM_zone_30S"},
+    {32331, "PCS_WGS72_UTM_zone_31S"},
+    {32332, "PCS_WGS72_UTM_zone_32S"},
+    {32333, "PCS_WGS72_UTM_zone_33S"},
+    {32334, "PCS_WGS72_UTM_zone_34S"},
+    {32335, "PCS_WGS72_UTM_zone_35S"},
+    {32336, "PCS_WGS72_UTM_zone_36S"},
+    {32337, "PCS_WGS72_UTM_zone_37S"},
+    {32338, "PCS_WGS72_UTM_zone_38S"},
+    {32339, "PCS_WGS72_UTM_zone_39S"},
+    {32340, "PCS_WGS72_UTM_zone_40S"},
+    {32341, "PCS_WGS72_UTM_zone_41S"},
+    {32342, "PCS_WGS72_UTM_zone_42S"},
+    {32343, "PCS_WGS72_UTM_zone_43S"},
+    {32344, "PCS_WGS72_UTM_zone_44S"},
+    {32345, "PCS_WGS72_UTM_zone_45S"},
+    {32346, "PCS_WGS72_UTM_zone_46S"},
+    {32347, "PCS_WGS72_UTM_zone_47S"},
+    {32348, "PCS_WGS72_UTM_zone_48S"},
+    {32349, "PCS_WGS72_UTM_zone_49S"},
+    {32350, "PCS_WGS72_UTM_zone_50S"},
+    {32351, "PCS_WGS72_UTM_zone_51S"},
+    {32352, "PCS_WGS72_UTM_zone_52S"},
+    {32353, "PCS_WGS72_UTM_zone_53S"},
+    {32354, "PCS_WGS72_UTM_zone_54S"},
+    {32355, "PCS_WGS72_UTM_zone_55S"},
+    {32356, "PCS_WGS72_UTM_zone_56S"},
+    {32357, "PCS_WGS72_UTM_zone_57S"},
+    {32358, "PCS_WGS72_UTM_zone_58S"},
+    {32359, "PCS_WGS72_UTM_zone_59S"},
+    {32360, "PCS_WGS72_UTM_zone_60S"},
+    {32401, "PCS_WGS72BE_UTM_zone_1N"},
+    {32402, "PCS_WGS72BE_UTM_zone_2N"},
+    {32403, "PCS_WGS72BE_UTM_zone_3N"},
+    {32404, "PCS_WGS72BE_UTM_zone_4N"},
+    {32405, "PCS_WGS72BE_UTM_zone_5N"},
+    {32406, "PCS_WGS72BE_UTM_zone_6N"},
+    {32407, "PCS_WGS72BE_UTM_zone_7N"},
+    {32408, "PCS_WGS72BE_UTM_zone_8N"},
+    {32409, "PCS_WGS72BE_UTM_zone_9N"},
+    {32410, "PCS_WGS72BE_UTM_zone_10N"},
+    {32411, "PCS_WGS72BE_UTM_zone_11N"},
+    {32412, "PCS_WGS72BE_UTM_zone_12N"},
+    {32413, "PCS_WGS72BE_UTM_zone_13N"},
+    {32414, "PCS_WGS72BE_UTM_zone_14N"},
+    {32415, "PCS_WGS72BE_UTM_zone_15N"},
+    {32416, "PCS_WGS72BE_UTM_zone_16N"},
+    {32417, "PCS_WGS72BE_UTM_zone_17N"},
+    {32418, "PCS_WGS72BE_UTM_zone_18N"},
+    {32419, "PCS_WGS72BE_UTM_zone_19N"},
+    {32420, "PCS_WGS72BE_UTM_zone_20N"},
+    {32421, "PCS_WGS72BE_UTM_zone_21N"},
+    {32422, "PCS_WGS72BE_UTM_zone_22N"},
+    {32423, "PCS_WGS72BE_UTM_zone_23N"},
+    {32424, "PCS_WGS72BE_UTM_zone_24N"},
+    {32425, "PCS_WGS72BE_UTM_zone_25N"},
+    {32426, "PCS_WGS72BE_UTM_zone_26N"},
+    {32427, "PCS_WGS72BE_UTM_zone_27N"},
+    {32428, "PCS_WGS72BE_UTM_zone_28N"},
+    {32429, "PCS_WGS72BE_UTM_zone_29N"},
+    {32430, "PCS_WGS72BE_UTM_zone_30N"},
+    {32431, "PCS_WGS72BE_UTM_zone_31N"},
+    {32432, "PCS_WGS72BE_UTM_zone_32N"},
+    {32433, "PCS_WGS72BE_UTM_zone_33N"},
+    {32434, "PCS_WGS72BE_UTM_zone_34N"},
+    {32435, "PCS_WGS72BE_UTM_zone_35N"},
+    {32436, "PCS_WGS72BE_UTM_zone_36N"},
+    {32437, "PCS_WGS72BE_UTM_zone_37N"},
+    {32438, "PCS_WGS72BE_UTM_zone_38N"},
+    {32439, "PCS_WGS72BE_UTM_zone_39N"},
+    {32440, "PCS_WGS72BE_UTM_zone_40N"},
+    {32441, "PCS_WGS72BE_UTM_zone_41N"},
+    {32442, "PCS_WGS72BE_UTM_zone_42N"},
+    {32443, "PCS_WGS72BE_UTM_zone_43N"},
+    {32444, "PCS_WGS72BE_UTM_zone_44N"},
+    {32445, "PCS_WGS72BE_UTM_zone_45N"},
+    {32446, "PCS_WGS72BE_UTM_zone_46N"},
+    {32447, "PCS_WGS72BE_UTM_zone_47N"},
+    {32448, "PCS_WGS72BE_UTM_zone_48N"},
+    {32449, "PCS_WGS72BE_UTM_zone_49N"},
+    {32450, "PCS_WGS72BE_UTM_zone_50N"},
+    {32451, "PCS_WGS72BE_UTM_zone_51N"},
+    {32452, "PCS_WGS72BE_UTM_zone_52N"},
+    {32453, "PCS_WGS72BE_UTM_zone_53N"},
+    {32454, "PCS_WGS72BE_UTM_zone_54N"},
+    {32455, "PCS_WGS72BE_UTM_zone_55N"},
+    {32456, "PCS_WGS72BE_UTM_zone_56N"},
+    {32457, "PCS_WGS72BE_UTM_zone_57N"},
+    {32458, "PCS_WGS72BE_UTM_zone_58N"},
+    {32459, "PCS_WGS72BE_UTM_zone_59N"},
+    {32460, "PCS_WGS72BE_UTM_zone_60N"},
+    {32501, "PCS_WGS72BE_UTM_zone_1S"},
+    {32502, "PCS_WGS72BE_UTM_zone_2S"},
+    {32503, "PCS_WGS72BE_UTM_zone_3S"},
+    {32504, "PCS_WGS72BE_UTM_zone_4S"},
+    {32505, "PCS_WGS72BE_UTM_zone_5S"},
+    {32506, "PCS_WGS72BE_UTM_zone_6S"},
+    {32507, "PCS_WGS72BE_UTM_zone_7S"},
+    {32508, "PCS_WGS72BE_UTM_zone_8S"},
+    {32509, "PCS_WGS72BE_UTM_zone_9S"},
+    {32510, "PCS_WGS72BE_UTM_zone_10S"},
+    {32511, "PCS_WGS72BE_UTM_zone_11S"},
+    {32512, "PCS_WGS72BE_UTM_zone_12S"},
+    {32513, "PCS_WGS72BE_UTM_zone_13S"},
+    {32514, "PCS_WGS72BE_UTM_zone_14S"},
+    {32515, "PCS_WGS72BE_UTM_zone_15S"},
+    {32516, "PCS_WGS72BE_UTM_zone_16S"},
+    {32517, "PCS_WGS72BE_UTM_zone_17S"},
+    {32518, "PCS_WGS72BE_UTM_zone_18S"},
+    {32519, "PCS_WGS72BE_UTM_zone_19S"},
+    {32520, "PCS_WGS72BE_UTM_zone_20S"},
+    {32521, "PCS_WGS72BE_UTM_zone_21S"},
+    {32522, "PCS_WGS72BE_UTM_zone_22S"},
+    {32523, "PCS_WGS72BE_UTM_zone_23S"},
+    {32524, "PCS_WGS72BE_UTM_zone_24S"},
+    {32525, "PCS_WGS72BE_UTM_zone_25S"},
+    {32526, "PCS_WGS72BE_UTM_zone_26S"},
+    {32527, "PCS_WGS72BE_UTM_zone_27S"},
+    {32528, "PCS_WGS72BE_UTM_zone_28S"},
+    {32529, "PCS_WGS72BE_UTM_zone_29S"},
+    {32530, "PCS_WGS72BE_UTM_zone_30S"},
+    {32531, "PCS_WGS72BE_UTM_zone_31S"},
+    {32532, "PCS_WGS72BE_UTM_zone_32S"},
+    {32533, "PCS_WGS72BE_UTM_zone_33S"},
+    {32534, "PCS_WGS72BE_UTM_zone_34S"},
+    {32535, "PCS_WGS72BE_UTM_zone_35S"},
+    {32536, "PCS_WGS72BE_UTM_zone_36S"},
+    {32537, "PCS_WGS72BE_UTM_zone_37S"},
+    {32538, "PCS_WGS72BE_UTM_zone_38S"},
+    {32539, "PCS_WGS72BE_UTM_zone_39S"},
+    {32540, "PCS_WGS72BE_UTM_zone_40S"},
+    {32541, "PCS_WGS72BE_UTM_zone_41S"},
+    {32542, "PCS_WGS72BE_UTM_zone_42S"},
+    {32543, "PCS_WGS72BE_UTM_zone_43S"},
+    {32544, "PCS_WGS72BE_UTM_zone_44S"},
+    {32545, "PCS_WGS72BE_UTM_zone_45S"},
+    {32546, "PCS_WGS72BE_UTM_zone_46S"},
+    {32547, "PCS_WGS72BE_UTM_zone_47S"},
+    {32548, "PCS_WGS72BE_UTM_zone_48S"},
+    {32549, "PCS_WGS72BE_UTM_zone_49S"},
+    {32550, "PCS_WGS72BE_UTM_zone_50S"},
+    {32551, "PCS_WGS72BE_UTM_zone_51S"},
+    {32552, "PCS_WGS72BE_UTM_zone_52S"},
+    {32553, "PCS_WGS72BE_UTM_zone_53S"},
+    {32554, "PCS_WGS72BE_UTM_zone_54S"},
+    {32555, "PCS_WGS72BE_UTM_zone_55S"},
+    {32556, "PCS_WGS72BE_UTM_zone_56S"},
+    {32557, "PCS_WGS72BE_UTM_zone_57S"},
+    {32558, "PCS_WGS72BE_UTM_zone_58S"},
+    {32559, "PCS_WGS72BE_UTM_zone_59S"},
+    {32560, "PCS_WGS72BE_UTM_zone_60S"},
+    {32601, "PCS_WGS84_UTM_zone_1N"},
+    {32602, "PCS_WGS84_UTM_zone_2N"},
+    {32603, "PCS_WGS84_UTM_zone_3N"},
+    {32604, "PCS_WGS84_UTM_zone_4N"},
+    {32605, "PCS_WGS84_UTM_zone_5N"},
+    {32606, "PCS_WGS84_UTM_zone_6N"},
+    {32607, "PCS_WGS84_UTM_zone_7N"},
+    {32608, "PCS_WGS84_UTM_zone_8N"},
+    {32609, "PCS_WGS84_UTM_zone_9N"},
+    {32610, "PCS_WGS84_UTM_zone_10N"},
+    {32611, "PCS_WGS84_UTM_zone_11N"},
+    {32612, "PCS_WGS84_UTM_zone_12N"},
+    {32613, "PCS_WGS84_UTM_zone_13N"},
+    {32614, "PCS_WGS84_UTM_zone_14N"},
+    {32615, "PCS_WGS84_UTM_zone_15N"},
+    {32616, "PCS_WGS84_UTM_zone_16N"},
+    {32617, "PCS_WGS84_UTM_zone_17N"},
+    {32618, "PCS_WGS84_UTM_zone_18N"},
+    {32619, "PCS_WGS84_UTM_zone_19N"},
+    {32620, "PCS_WGS84_UTM_zone_20N"},
+    {32621, "PCS_WGS84_UTM_zone_21N"},
+    {32622, "PCS_WGS84_UTM_zone_22N"},
+    {32623, "PCS_WGS84_UTM_zone_23N"},
+    {32624, "PCS_WGS84_UTM_zone_24N"},
+    {32625, "PCS_WGS84_UTM_zone_25N"},
+    {32626, "PCS_WGS84_UTM_zone_26N"},
+    {32627, "PCS_WGS84_UTM_zone_27N"},
+    {32628, "PCS_WGS84_UTM_zone_28N"},
+    {32629, "PCS_WGS84_UTM_zone_29N"},
+    {32630, "PCS_WGS84_UTM_zone_30N"},
+    {32631, "PCS_WGS84_UTM_zone_31N"},
+    {32632, "PCS_WGS84_UTM_zone_32N"},
+    {32633, "PCS_WGS84_UTM_zone_33N"},
+    {32634, "PCS_WGS84_UTM_zone_34N"},
+    {32635, "PCS_WGS84_UTM_zone_35N"},
+    {32636, "PCS_WGS84_UTM_zone_36N"},
+    {32637, "PCS_WGS84_UTM_zone_37N"},
+    {32638, "PCS_WGS84_UTM_zone_38N"},
+    {32639, "PCS_WGS84_UTM_zone_39N"},
+    {32640, "PCS_WGS84_UTM_zone_40N"},
+    {32641, "PCS_WGS84_UTM_zone_41N"},
+    {32642, "PCS_WGS84_UTM_zone_42N"},
+    {32643, "PCS_WGS84_UTM_zone_43N"},
+    {32644, "PCS_WGS84_UTM_zone_44N"},
+    {32645, "PCS_WGS84_UTM_zone_45N"},
+    {32646, "PCS_WGS84_UTM_zone_46N"},
+    {32647, "PCS_WGS84_UTM_zone_47N"},
+    {32648, "PCS_WGS84_UTM_zone_48N"},
+    {32649, "PCS_WGS84_UTM_zone_49N"},
+    {32650, "PCS_WGS84_UTM_zone_50N"},
+    {32651, "PCS_WGS84_UTM_zone_51N"},
+    {32652, "PCS_WGS84_UTM_zone_52N"},
+    {32653, "PCS_WGS84_UTM_zone_53N"},
+    {32654, "PCS_WGS84_UTM_zone_54N"},
+    {32655, "PCS_WGS84_UTM_zone_55N"},
+    {32656, "PCS_WGS84_UTM_zone_56N"},
+    {32657, "PCS_WGS84_UTM_zone_57N"},
+    {32658, "PCS_WGS84_UTM_zone_58N"},
+    {32659, "PCS_WGS84_UTM_zone_59N"},
+    {32660, "PCS_WGS84_UTM_zone_60N"},
+    {32701, "PCS_WGS84_UTM_zone_1S"},
+    {32702, "PCS_WGS84_UTM_zone_2S"},
+    {32703, "PCS_WGS84_UTM_zone_3S"},
+    {32704, "PCS_WGS84_UTM_zone_4S"},
+    {32705, "PCS_WGS84_UTM_zone_5S"},
+    {32706, "PCS_WGS84_UTM_zone_6S"},
+    {32707, "PCS_WGS84_UTM_zone_7S"},
+    {32708, "PCS_WGS84_UTM_zone_8S"},
+    {32709, "PCS_WGS84_UTM_zone_9S"},
+    {32710, "PCS_WGS84_UTM_zone_10S"},
+    {32711, "PCS_WGS84_UTM_zone_11S"},
+    {32712, "PCS_WGS84_UTM_zone_12S"},
+    {32713, "PCS_WGS84_UTM_zone_13S"},
+    {32714, "PCS_WGS84_UTM_zone_14S"},
+    {32715, "PCS_WGS84_UTM_zone_15S"},
+    {32716, "PCS_WGS84_UTM_zone_16S"},
+    {32717, "PCS_WGS84_UTM_zone_17S"},
+    {32718, "PCS_WGS84_UTM_zone_18S"},
+    {32719, "PCS_WGS84_UTM_zone_19S"},
+    {32720, "PCS_WGS84_UTM_zone_20S"},
+    {32721, "PCS_WGS84_UTM_zone_21S"},
+    {32722, "PCS_WGS84_UTM_zone_22S"},
+    {32723, "PCS_WGS84_UTM_zone_23S"},
+    {32724, "PCS_WGS84_UTM_zone_24S"},
+    {32725, "PCS_WGS84_UTM_zone_25S"},
+    {32726, "PCS_WGS84_UTM_zone_26S"},
+    {32727, "PCS_WGS84_UTM_zone_27S"},
+    {32728, "PCS_WGS84_UTM_zone_28S"},
+    {32729, "PCS_WGS84_UTM_zone_29S"},
+    {32730, "PCS_WGS84_UTM_zone_30S"},
+    {32731, "PCS_WGS84_UTM_zone_31S"},
+    {32732, "PCS_WGS84_UTM_zone_32S"},
+    {32733, "PCS_WGS84_UTM_zone_33S"},
+    {32734, "PCS_WGS84_UTM_zone_34S"},
+    {32735, "PCS_WGS84_UTM_zone_35S"},
+    {32736, "PCS_WGS84_UTM_zone_36S"},
+    {32737, "PCS_WGS84_UTM_zone_37S"},
+    {32738, "PCS_WGS84_UTM_zone_38S"},
+    {32739, "PCS_WGS84_UTM_zone_39S"},
+    {32740, "PCS_WGS84_UTM_zone_40S"},
+    {32741, "PCS_WGS84_UTM_zone_41S"},
+    {32742, "PCS_WGS84_UTM_zone_42S"},
+    {32743, "PCS_WGS84_UTM_zone_43S"},
+    {32744, "PCS_WGS84_UTM_zone_44S"},
+    {32745, "PCS_WGS84_UTM_zone_45S"},
+    {32746, "PCS_WGS84_UTM_zone_46S"},
+    {32747, "PCS_WGS84_UTM_zone_47S"},
+    {32748, "PCS_WGS84_UTM_zone_48S"},
+    {32749, "PCS_WGS84_UTM_zone_49S"},
+    {32750, "PCS_WGS84_UTM_zone_50S"},
+    {32751, "PCS_WGS84_UTM_zone_51S"},
+    {32752, "PCS_WGS84_UTM_zone_52S"},
+    {32753, "PCS_WGS84_UTM_zone_53S"},
+    {32754, "PCS_WGS84_UTM_zone_54S"},
+    {32755, "PCS_WGS84_UTM_zone_55S"},
+    {32756, "PCS_WGS84_UTM_zone_56S"},
+    {32757, "PCS_WGS84_UTM_zone_57S"},
+    {32758, "PCS_WGS84_UTM_zone_58S"},
+    {32759, "PCS_WGS84_UTM_zone_59S"},
+    {32760, "PCS_WGS84_UTM_zone_60S"}
+};
+
+const TiffGeoTagKeyName ff_tiff_projection_codes[] = {
+    {10101, "Proj_Alabama_CS27_East"},
+    {10102, "Proj_Alabama_CS27_West"},
+    {10131, "Proj_Alabama_CS83_East"},
+    {10132, "Proj_Alabama_CS83_West"},
+    {10201, "Proj_Arizona_Coordinate_System_east"},
+    {10202, "Proj_Arizona_Coordinate_System_Central"},
+    {10203, "Proj_Arizona_Coordinate_System_west"},
+    {10231, "Proj_Arizona_CS83_east"},
+    {10232, "Proj_Arizona_CS83_Central"},
+    {10233, "Proj_Arizona_CS83_west"},
+    {10301, "Proj_Arkansas_CS27_North"},
+    {10302, "Proj_Arkansas_CS27_South"},
+    {10331, "Proj_Arkansas_CS83_North"},
+    {10332, "Proj_Arkansas_CS83_South"},
+    {10401, "Proj_California_CS27_I"},
+    {10402, "Proj_California_CS27_II"},
+    {10403, "Proj_California_CS27_III"},
+    {10404, "Proj_California_CS27_IV"},
+    {10405, "Proj_California_CS27_V"},
+    {10406, "Proj_California_CS27_VI"},
+    {10407, "Proj_California_CS27_VII"},
+    {10431, "Proj_California_CS83_1"},
+    {10432, "Proj_California_CS83_2"},
+    {10433, "Proj_California_CS83_3"},
+    {10434, "Proj_California_CS83_4"},
+    {10435, "Proj_California_CS83_5"},
+    {10436, "Proj_California_CS83_6"},
+    {10501, "Proj_Colorado_CS27_North"},
+    {10502, "Proj_Colorado_CS27_Central"},
+    {10503, "Proj_Colorado_CS27_South"},
+    {10531, "Proj_Colorado_CS83_North"},
+    {10532, "Proj_Colorado_CS83_Central"},
+    {10533, "Proj_Colorado_CS83_South"},
+    {10600, "Proj_Connecticut_CS27"},
+    {10630, "Proj_Connecticut_CS83"},
+    {10700, "Proj_Delaware_CS27"},
+    {10730, "Proj_Delaware_CS83"},
+    {10901, "Proj_Florida_CS27_East"},
+    {10902, "Proj_Florida_CS27_West"},
+    {10903, "Proj_Florida_CS27_North"},
+    {10931, "Proj_Florida_CS83_East"},
+    {10932, "Proj_Florida_CS83_West"},
+    {10933, "Proj_Florida_CS83_North"},
+    {11001, "Proj_Georgia_CS27_East"},
+    {11002, "Proj_Georgia_CS27_West"},
+    {11031, "Proj_Georgia_CS83_East"},
+    {11032, "Proj_Georgia_CS83_West"},
+    {11101, "Proj_Idaho_CS27_East"},
+    {11102, "Proj_Idaho_CS27_Central"},
+    {11103, "Proj_Idaho_CS27_West"},
+    {11131, "Proj_Idaho_CS83_East"},
+    {11132, "Proj_Idaho_CS83_Central"},
+    {11133, "Proj_Idaho_CS83_West"},
+    {11201, "Proj_Illinois_CS27_East"},
+    {11202, "Proj_Illinois_CS27_West"},
+    {11231, "Proj_Illinois_CS83_East"},
+    {11232, "Proj_Illinois_CS83_West"},
+    {11301, "Proj_Indiana_CS27_East"},
+    {11302, "Proj_Indiana_CS27_West"},
+    {11331, "Proj_Indiana_CS83_East"},
+    {11332, "Proj_Indiana_CS83_West"},
+    {11401, "Proj_Iowa_CS27_North"},
+    {11402, "Proj_Iowa_CS27_South"},
+    {11431, "Proj_Iowa_CS83_North"},
+    {11432, "Proj_Iowa_CS83_South"},
+    {11501, "Proj_Kansas_CS27_North"},
+    {11502, "Proj_Kansas_CS27_South"},
+    {11531, "Proj_Kansas_CS83_North"},
+    {11532, "Proj_Kansas_CS83_South"},
+    {11601, "Proj_Kentucky_CS27_North"},
+    {11602, "Proj_Kentucky_CS27_South"},
+    {11631, "Proj_Kentucky_CS83_North"},
+    {11632, "Proj_Kentucky_CS83_South"},
+    {11701, "Proj_Louisiana_CS27_North"},
+    {11702, "Proj_Louisiana_CS27_South"},
+    {11731, "Proj_Louisiana_CS83_North"},
+    {11732, "Proj_Louisiana_CS83_South"},
+    {11801, "Proj_Maine_CS27_East"},
+    {11802, "Proj_Maine_CS27_West"},
+    {11831, "Proj_Maine_CS83_East"},
+    {11832, "Proj_Maine_CS83_West"},
+    {11900, "Proj_Maryland_CS27"},
+    {11930, "Proj_Maryland_CS83"},
+    {12001, "Proj_Massachusetts_CS27_Mainland"},
+    {12002, "Proj_Massachusetts_CS27_Island"},
+    {12031, "Proj_Massachusetts_CS83_Mainland"},
+    {12032, "Proj_Massachusetts_CS83_Island"},
+    {12101, "Proj_Michigan_State_Plane_East"},
+    {12102, "Proj_Michigan_State_Plane_Old_Central"},
+    {12103, "Proj_Michigan_State_Plane_West"},
+    {12111, "Proj_Michigan_CS27_North"},
+    {12112, "Proj_Michigan_CS27_Central"},
+    {12113, "Proj_Michigan_CS27_South"},
+    {12141, "Proj_Michigan_CS83_North"},
+    {12142, "Proj_Michigan_CS83_Central"},
+    {12143, "Proj_Michigan_CS83_South"},
+    {12201, "Proj_Minnesota_CS27_North"},
+    {12202, "Proj_Minnesota_CS27_Central"},
+    {12203, "Proj_Minnesota_CS27_South"},
+    {12231, "Proj_Minnesota_CS83_North"},
+    {12232, "Proj_Minnesota_CS83_Central"},
+    {12233, "Proj_Minnesota_CS83_South"},
+    {12301, "Proj_Mississippi_CS27_East"},
+    {12302, "Proj_Mississippi_CS27_West"},
+    {12331, "Proj_Mississippi_CS83_East"},
+    {12332, "Proj_Mississippi_CS83_West"},
+    {12401, "Proj_Missouri_CS27_East"},
+    {12402, "Proj_Missouri_CS27_Central"},
+    {12403, "Proj_Missouri_CS27_West"},
+    {12431, "Proj_Missouri_CS83_East"},
+    {12432, "Proj_Missouri_CS83_Central"},
+    {12433, "Proj_Missouri_CS83_West"},
+    {12501, "Proj_Montana_CS27_North"},
+    {12502, "Proj_Montana_CS27_Central"},
+    {12503, "Proj_Montana_CS27_South"},
+    {12530, "Proj_Montana_CS83"},
+    {12601, "Proj_Nebraska_CS27_North"},
+    {12602, "Proj_Nebraska_CS27_South"},
+    {12630, "Proj_Nebraska_CS83"},
+    {12701, "Proj_Nevada_CS27_East"},
+    {12702, "Proj_Nevada_CS27_Central"},
+    {12703, "Proj_Nevada_CS27_West"},
+    {12731, "Proj_Nevada_CS83_East"},
+    {12732, "Proj_Nevada_CS83_Central"},
+    {12733, "Proj_Nevada_CS83_West"},
+    {12800, "Proj_New_Hampshire_CS27"},
+    {12830, "Proj_New_Hampshire_CS83"},
+    {12900, "Proj_New_Jersey_CS27"},
+    {12930, "Proj_New_Jersey_CS83"},
+    {13001, "Proj_New_Mexico_CS27_East"},
+    {13002, "Proj_New_Mexico_CS27_Central"},
+    {13003, "Proj_New_Mexico_CS27_West"},
+    {13031, "Proj_New_Mexico_CS83_East"},
+    {13032, "Proj_New_Mexico_CS83_Central"},
+    {13033, "Proj_New_Mexico_CS83_West"},
+    {13101, "Proj_New_York_CS27_East"},
+    {13102, "Proj_New_York_CS27_Central"},
+    {13103, "Proj_New_York_CS27_West"},
+    {13104, "Proj_New_York_CS27_Long_Island"},
+    {13131, "Proj_New_York_CS83_East"},
+    {13132, "Proj_New_York_CS83_Central"},
+    {13133, "Proj_New_York_CS83_West"},
+    {13134, "Proj_New_York_CS83_Long_Island"},
+    {13200, "Proj_North_Carolina_CS27"},
+    {13230, "Proj_North_Carolina_CS83"},
+    {13301, "Proj_North_Dakota_CS27_North"},
+    {13302, "Proj_North_Dakota_CS27_South"},
+    {13331, "Proj_North_Dakota_CS83_North"},
+    {13332, "Proj_North_Dakota_CS83_South"},
+    {13401, "Proj_Ohio_CS27_North"},
+    {13402, "Proj_Ohio_CS27_South"},
+    {13431, "Proj_Ohio_CS83_North"},
+    {13432, "Proj_Ohio_CS83_South"},
+    {13501, "Proj_Oklahoma_CS27_North"},
+    {13502, "Proj_Oklahoma_CS27_South"},
+    {13531, "Proj_Oklahoma_CS83_North"},
+    {13532, "Proj_Oklahoma_CS83_South"},
+    {13601, "Proj_Oregon_CS27_North"},
+    {13602, "Proj_Oregon_CS27_South"},
+    {13631, "Proj_Oregon_CS83_North"},
+    {13632, "Proj_Oregon_CS83_South"},
+    {13701, "Proj_Pennsylvania_CS27_North"},
+    {13702, "Proj_Pennsylvania_CS27_South"},
+    {13731, "Proj_Pennsylvania_CS83_North"},
+    {13732, "Proj_Pennsylvania_CS83_South"},
+    {13800, "Proj_Rhode_Island_CS27"},
+    {13830, "Proj_Rhode_Island_CS83"},
+    {13901, "Proj_South_Carolina_CS27_North"},
+    {13902, "Proj_South_Carolina_CS27_South"},
+    {13930, "Proj_South_Carolina_CS83"},
+    {14001, "Proj_South_Dakota_CS27_North"},
+    {14002, "Proj_South_Dakota_CS27_South"},
+    {14031, "Proj_South_Dakota_CS83_North"},
+    {14032, "Proj_South_Dakota_CS83_South"},
+    {14100, "Proj_Tennessee_CS27"},
+    {14130, "Proj_Tennessee_CS83"},
+    {14201, "Proj_Texas_CS27_North"},
+    {14202, "Proj_Texas_CS27_North_Central"},
+    {14203, "Proj_Texas_CS27_Central"},
+    {14204, "Proj_Texas_CS27_South_Central"},
+    {14205, "Proj_Texas_CS27_South"},
+    {14231, "Proj_Texas_CS83_North"},
+    {14232, "Proj_Texas_CS83_North_Central"},
+    {14233, "Proj_Texas_CS83_Central"},
+    {14234, "Proj_Texas_CS83_South_Central"},
+    {14235, "Proj_Texas_CS83_South"},
+    {14301, "Proj_Utah_CS27_North"},
+    {14302, "Proj_Utah_CS27_Central"},
+    {14303, "Proj_Utah_CS27_South"},
+    {14331, "Proj_Utah_CS83_North"},
+    {14332, "Proj_Utah_CS83_Central"},
+    {14333, "Proj_Utah_CS83_South"},
+    {14400, "Proj_Vermont_CS27"},
+    {14430, "Proj_Vermont_CS83"},
+    {14501, "Proj_Virginia_CS27_North"},
+    {14502, "Proj_Virginia_CS27_South"},
+    {14531, "Proj_Virginia_CS83_North"},
+    {14532, "Proj_Virginia_CS83_South"},
+    {14601, "Proj_Washington_CS27_North"},
+    {14602, "Proj_Washington_CS27_South"},
+    {14631, "Proj_Washington_CS83_North"},
+    {14632, "Proj_Washington_CS83_South"},
+    {14701, "Proj_West_Virginia_CS27_North"},
+    {14702, "Proj_West_Virginia_CS27_South"},
+    {14731, "Proj_West_Virginia_CS83_North"},
+    {14732, "Proj_West_Virginia_CS83_South"},
+    {14801, "Proj_Wisconsin_CS27_North"},
+    {14802, "Proj_Wisconsin_CS27_Central"},
+    {14803, "Proj_Wisconsin_CS27_South"},
+    {14831, "Proj_Wisconsin_CS83_North"},
+    {14832, "Proj_Wisconsin_CS83_Central"},
+    {14833, "Proj_Wisconsin_CS83_South"},
+    {14901, "Proj_Wyoming_CS27_East"},
+    {14902, "Proj_Wyoming_CS27_East_Central"},
+    {14903, "Proj_Wyoming_CS27_West_Central"},
+    {14904, "Proj_Wyoming_CS27_West"},
+    {14931, "Proj_Wyoming_CS83_East"},
+    {14932, "Proj_Wyoming_CS83_East_Central"},
+    {14933, "Proj_Wyoming_CS83_West_Central"},
+    {14934, "Proj_Wyoming_CS83_West"},
+    {15001, "Proj_Alaska_CS27_1"},
+    {15002, "Proj_Alaska_CS27_2"},
+    {15003, "Proj_Alaska_CS27_3"},
+    {15004, "Proj_Alaska_CS27_4"},
+    {15005, "Proj_Alaska_CS27_5"},
+    {15006, "Proj_Alaska_CS27_6"},
+    {15007, "Proj_Alaska_CS27_7"},
+    {15008, "Proj_Alaska_CS27_8"},
+    {15009, "Proj_Alaska_CS27_9"},
+    {15010, "Proj_Alaska_CS27_10"},
+    {15031, "Proj_Alaska_CS83_1"},
+    {15032, "Proj_Alaska_CS83_2"},
+    {15033, "Proj_Alaska_CS83_3"},
+    {15034, "Proj_Alaska_CS83_4"},
+    {15035, "Proj_Alaska_CS83_5"},
+    {15036, "Proj_Alaska_CS83_6"},
+    {15037, "Proj_Alaska_CS83_7"},
+    {15038, "Proj_Alaska_CS83_8"},
+    {15039, "Proj_Alaska_CS83_9"},
+    {15040, "Proj_Alaska_CS83_10"},
+    {15101, "Proj_Hawaii_CS27_1"},
+    {15102, "Proj_Hawaii_CS27_2"},
+    {15103, "Proj_Hawaii_CS27_3"},
+    {15104, "Proj_Hawaii_CS27_4"},
+    {15105, "Proj_Hawaii_CS27_5"},
+    {15131, "Proj_Hawaii_CS83_1"},
+    {15132, "Proj_Hawaii_CS83_2"},
+    {15133, "Proj_Hawaii_CS83_3"},
+    {15134, "Proj_Hawaii_CS83_4"},
+    {15135, "Proj_Hawaii_CS83_5"},
+    {15201, "Proj_Puerto_Rico_CS27"},
+    {15202, "Proj_St_Croix"},
+    {15230, "Proj_Puerto_Rico_Virgin_Is"},
+    {15914, "Proj_BLM_14N_feet"},
+    {15915, "Proj_BLM_15N_feet"},
+    {15916, "Proj_BLM_16N_feet"},
+    {15917, "Proj_BLM_17N_feet"},
+    {17348, "Proj_Map_Grid_of_Australia_48"},
+    {17349, "Proj_Map_Grid_of_Australia_49"},
+    {17350, "Proj_Map_Grid_of_Australia_50"},
+    {17351, "Proj_Map_Grid_of_Australia_51"},
+    {17352, "Proj_Map_Grid_of_Australia_52"},
+    {17353, "Proj_Map_Grid_of_Australia_53"},
+    {17354, "Proj_Map_Grid_of_Australia_54"},
+    {17355, "Proj_Map_Grid_of_Australia_55"},
+    {17356, "Proj_Map_Grid_of_Australia_56"},
+    {17357, "Proj_Map_Grid_of_Australia_57"},
+    {17358, "Proj_Map_Grid_of_Australia_58"},
+    {17448, "Proj_Australian_Map_Grid_48"},
+    {17449, "Proj_Australian_Map_Grid_49"},
+    {17450, "Proj_Australian_Map_Grid_50"},
+    {17451, "Proj_Australian_Map_Grid_51"},
+    {17452, "Proj_Australian_Map_Grid_52"},
+    {17453, "Proj_Australian_Map_Grid_53"},
+    {17454, "Proj_Australian_Map_Grid_54"},
+    {17455, "Proj_Australian_Map_Grid_55"},
+    {17456, "Proj_Australian_Map_Grid_56"},
+    {17457, "Proj_Australian_Map_Grid_57"},
+    {17458, "Proj_Australian_Map_Grid_58"},
+    {18031, "Proj_Argentina_1"},
+    {18032, "Proj_Argentina_2"},
+    {18033, "Proj_Argentina_3"},
+    {18034, "Proj_Argentina_4"},
+    {18035, "Proj_Argentina_5"},
+    {18036, "Proj_Argentina_6"},
+    {18037, "Proj_Argentina_7"},
+    {18051, "Proj_Colombia_3W"},
+    {18052, "Proj_Colombia_Bogota"},
+    {18053, "Proj_Colombia_3E"},
+    {18054, "Proj_Colombia_6E"},
+    {18072, "Proj_Egypt_Red_Belt"},
+    {18073, "Proj_Egypt_Purple_Belt"},
+    {18074, "Proj_Extended_Purple_Belt"},
+    {18141, "Proj_New_Zealand_North_Island_Nat_Grid"},
+    {18142, "Proj_New_Zealand_South_Island_Nat_Grid"},
+    {19900, "Proj_Bahrain_Grid"},
+    {19905, "Proj_Netherlands_E_Indies_Equatorial"},
+    {19912, "Proj_RSO_Borneo"}
+};
+
+const char *const ff_tiff_coord_trans_codes[] = { /* 27 names, matching the [27] extern in tiff_data.h; presumably entry i names code i + TIFF_COORD_TRANS_OFFSET -- confirm at the lookup site */
+    "CT_TransverseMercator",
+    "CT_TransvMercator_Modified_Alaska",
+    "CT_ObliqueMercator",
+    "CT_ObliqueMercator_Laborde",
+    "CT_ObliqueMercator_Rosenmund",
+    "CT_ObliqueMercator_Spherical",
+    "CT_Mercator",
+    "CT_LambertConfConic_2SP",
+    "CT_LambertConfConic_Helmert",
+    "CT_LambertAzimEqualArea",
+    "CT_AlbersEqualArea",
+    "CT_AzimuthalEquidistant",
+    "CT_EquidistantConic",
+    "CT_Stereographic",
+    "CT_PolarStereographic",
+    "CT_ObliqueStereographic",
+    "CT_Equirectangular",
+    "CT_CassiniSoldner",
+    "CT_Gnomonic",
+    "CT_MillerCylindrical",
+    "CT_Orthographic",
+    "CT_Polyconic",
+    "CT_Robinson",
+    "CT_Sinusoidal",
+    "CT_VanDerGrinten",
+    "CT_NewZealandMapGrid",
+    "CT_TransvMercator_SouthOriented"
+};
+
+const char *const ff_tiff_vert_cs_codes[] = { /* 32 names, matching the [32] extern in tiff_data.h; presumably entry i names code i + TIFF_VERT_CS_OFFSET -- confirm at the lookup site */
+    "VertCS_Airy_1830_ellipsoid",
+    "VertCS_Airy_Modified_1849_ellipsoid",
+    "VertCS_ANS_ellipsoid",
+    "VertCS_Bessel_1841_ellipsoid",
+    "VertCS_Bessel_Modified_ellipsoid",
+    "VertCS_Bessel_Namibia_ellipsoid",
+    "VertCS_Clarke_1858_ellipsoid",
+    "VertCS_Clarke_1866_ellipsoid",
+    "VertCS_Clarke_1880_Benoit_ellipsoid",
+    "VertCS_Clarke_1880_IGN_ellipsoid",
+    "VertCS_Clarke_1880_RGS_ellipsoid",
+    "VertCS_Clarke_1880_Arc_ellipsoid",
+    "VertCS_Clarke_1880_SGA_1922_ellipsoid",
+    "VertCS_Everest_1830_1937_Adjustment_ellipsoid",
+    "VertCS_Everest_1830_1967_Definition_ellipsoid",
+    "VertCS_Everest_1830_1975_Definition_ellipsoid",
+    "VertCS_Everest_1830_Modified_ellipsoid",
+    "VertCS_GRS_1980_ellipsoid",
+    "VertCS_Helmert_1906_ellipsoid",
+    "VertCS_INS_ellipsoid",
+    "VertCS_International_1924_ellipsoid",
+    "VertCS_International_1967_ellipsoid",
+    "VertCS_Krassowsky_1940_ellipsoid",
+    "VertCS_NWL_9D_ellipsoid",
+    "VertCS_NWL_10D_ellipsoid",
+    "VertCS_Plessis_1817_ellipsoid",
+    "VertCS_Struve_1860_ellipsoid",
+    "VertCS_War_Office_ellipsoid",
+    "VertCS_WGS_84_ellipsoid",
+    "VertCS_GEM_10C_ellipsoid",
+    "VertCS_OSU86F_ellipsoid",
+    "VertCS_OSU91A_ellipsoid"
+};
+
+const char *const ff_tiff_ortho_vert_cs_codes[] = { /* 6 names, matching the [6] extern in tiff_data.h; presumably entry i names code i + TIFF_ORTHO_VERT_CS_OFFSET -- confirm at the lookup site */
+    "VertCS_Newlyn",
+    "VertCS_North_American_Vertical_Datum_1929",
+    "VertCS_North_American_Vertical_Datum_1988",
+    "VertCS_Yellow_Sea_1956",
+    "VertCS_Baltic_Sea",
+    "VertCS_Caspian_Sea"
+};
diff --git a/libavcodec/tiff_data.h b/libavcodec/tiff_data.h
new file mode 100644
index 0000000..57515f9
--- /dev/null
+++ b/libavcodec/tiff_data.h
@@ -0,0 +1,92 @@
+/*
+ * TIFF data tables
+ * Copyright (c) 2011 Thomas Kuehnel
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * TIFF data tables
+ * @author Thomas Kuehnel
+ * @see GeoTIFF specification at
+ * http://www.remotesensing.org/geotiff/spec/geotiffhome.html
+ */
+
+#ifndef AVCODEC_TIFF_DATA_H
+#define AVCODEC_TIFF_DATA_H
+
+#include "tiff.h" /* presumably declares TiffGeoTagNameType and TiffGeoTagKeyName used below -- confirm */
+
+#define TIFF_CONF_KEY_ID_OFFSET 1024 /* NOTE(review): each *_OFFSET here presumably is the ID/code of entry 0 of the table declared right after it -- confirm at the lookup site */
+extern const TiffGeoTagNameType ff_tiff_conf_name_type_map[3];
+
+#define TIFF_GEOG_KEY_ID_OFFSET 2048
+extern const TiffGeoTagNameType ff_tiff_geog_name_type_map[14];
+
+#define TIFF_PROJ_KEY_ID_OFFSET 3072
+extern const TiffGeoTagNameType ff_tiff_proj_name_type_map[24];
+
+#define TIFF_VERT_KEY_ID_OFFSET 4096
+extern const TiffGeoTagNameType ff_tiff_vert_name_type_map[4];
+
+#define TIFF_GEO_KEY_UNDEFINED    0
+#define TIFF_GEO_KEY_USER_DEFINED 32767
+
+#define TIFF_GT_MODEL_TYPE_OFFSET 1
+extern const char *const ff_tiff_gt_model_type_codes[3];
+
+#define TIFF_GT_RASTER_TYPE_OFFSET 1
+extern const char *const ff_tiff_gt_raster_type_codes[2];
+
+#define TIFF_LINEAR_UNIT_OFFSET 9001
+extern const char *const ff_tiff_linear_unit_codes[15];
+
+#define TIFF_ANGULAR_UNIT_OFFSET 9101
+extern const char *const ff_tiff_angular_unit_codes[8];
+
+#define TIFF_GCS_TYPE_OFFSET 4201
+extern const char *const ff_tiff_gcs_type_codes[133];
+
+#define TIFF_GCSE_TYPE_OFFSET 4001
+extern const char *const ff_tiff_gcse_type_codes[35];
+
+#define TIFF_GEODETIC_DATUM_OFFSET 6201
+extern const char *const ff_tiff_geodetic_datum_codes[120];
+
+#define TIFF_GEODETIC_DATUM_E_OFFSET 6001
+extern const char *const ff_tiff_geodetic_datum_e_codes[35];
+
+#define TIFF_ELLIPSOID_OFFSET 7001
+extern const char *const ff_tiff_ellipsoid_codes[35];
+
+#define TIFF_PRIME_MERIDIAN_OFFSET 8901
+extern const char *const ff_tiff_prime_meridian_codes[11];
+
+extern const TiffGeoTagKeyName ff_tiff_proj_cs_type_codes[978]; /* TiffGeoTagKeyName entries; no *_OFFSET macro accompanies this table */
+
+extern const TiffGeoTagKeyName ff_tiff_projection_codes[298]; /* TiffGeoTagKeyName entries; no *_OFFSET macro accompanies this table */
+
+#define TIFF_COORD_TRANS_OFFSET 1
+extern const char *const ff_tiff_coord_trans_codes[27];
+
+#define TIFF_VERT_CS_OFFSET 5001
+extern const char *const ff_tiff_vert_cs_codes[32];
+
+#define TIFF_ORTHO_VERT_CS_OFFSET 5101
+extern const char *const ff_tiff_ortho_vert_cs_codes[6];
+#endif /* AVCODEC_TIFF_DATA_H */
diff --git a/libavcodec/tiffenc.c b/libavcodec/tiffenc.c
index 7c23ee2..3d37d2e 100644
--- a/libavcodec/tiffenc.c
+++ b/libavcodec/tiffenc.c
@@ -2,20 +2,20 @@
  * TIFF image encoder
  * Copyright (c) 2007 Bartlomiej Wolowiec
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,11 +30,13 @@
 #include <zlib.h>
 #endif
 
+#include "libavutil/imgutils.h"
 #include "libavutil/log.h"
 #include "libavutil/opt.h"
 #include "libavutil/pixdesc.h"
 #include "avcodec.h"
 #include "bytestream.h"
+#include "internal.h"
 #include "lzw.h"
 #include "put_bits.h"
 #include "rle.h"
@@ -43,8 +45,8 @@
 #define TIFF_MAX_ENTRY 32
 
 /** sizes of various TIFF field types (string size = 1)*/
-static const uint8_t type_sizes2[6] = {
-    0, 1, 1, 2, 4, 8
+static const uint8_t type_sizes2[14] = { /* indexed by an enum TiffTypes value, as add_entry() does with type_sizes2[type] */
+    0, 1, 1, 2, 4, 8, 1, 1, 2, 4, 8, 4, 8, 4
 };
 
 typedef struct TiffEncoderContext {
@@ -58,6 +60,12 @@ typedef struct TiffEncoderContext {
     int bpp_tab_size;                       ///< bpp_tab size
     enum TiffPhotometric photometric_interpretation;  ///< photometric interpretation
     int strips;                             ///< number of strips
+    uint32_t *strip_sizes;
+    unsigned int strip_sizes_size;
+    uint32_t *strip_offsets;
+    unsigned int strip_offsets_size;
+    uint8_t *yuv_line;
+    unsigned int yuv_line_size;
     int rps;                                ///< row per strip
     uint8_t entries[TIFF_MAX_ENTRY * 12];   ///< entries in header
     int num_entries;                        ///< number of entries
@@ -66,10 +74,12 @@ typedef struct TiffEncoderContext {
     int buf_size;                           ///< buffer size
     uint16_t subsampling[2];                ///< YUV subsampling factors
     struct LZWEncodeState *lzws;            ///< LZW encode state
+    uint32_t dpi;                           ///< image resolution in DPI
 } TiffEncoderContext;
 
 /**
- * Check free space in buffer
+ * Check free space in buffer.
+ *
  * @param s Tiff context
  * @param need Needed bytes
  * @return 0 - ok, 1 - no free space
@@ -85,13 +95,13 @@ static inline int check_size(TiffEncoderContext *s, uint64_t need)
 }
 
 /**
- * Put n values to buffer
+ * Put n values to buffer.
  *
- * @param p Pointer to pointer to output buffer
- * @param n Number of values
- * @param val Pointer to values
- * @param type Type of values
- * @param flip =0 - normal copy, >0 - flip
+ * @param p pointer to pointer to output buffer
+ * @param n number of values
+ * @param val pointer to values
+ * @param type type of values
+ * @param flip = 0 - normal copy, >0 - flip
  */
 static void tnput(uint8_t **p, int n, const uint8_t *val, enum TiffTypes type,
                   int flip)
@@ -106,28 +116,29 @@ static void tnput(uint8_t **p, int n, const uint8_t *val, enum TiffTypes type,
 
 /**
  * Add entry to directory in tiff header.
+ *
  * @param s Tiff context
- * @param tag Tag that identifies the entry
- * @param type Entry type
- * @param count The number of values
- * @param ptr_val Pointer to values
+ * @param tag tag that identifies the entry
+ * @param type entry type
+ * @param count the number of values
+ * @param ptr_val pointer to values
  */
 static int add_entry(TiffEncoderContext *s, enum TiffTags tag,
                      enum TiffTypes type, int count, const void *ptr_val)
 {
     uint8_t *entries_ptr = s->entries + 12 * s->num_entries;
 
-    assert(s->num_entries < TIFF_MAX_ENTRY);
+    av_assert0(s->num_entries < TIFF_MAX_ENTRY);
 
     bytestream_put_le16(&entries_ptr, tag);
     bytestream_put_le16(&entries_ptr, type);
     bytestream_put_le32(&entries_ptr, count);
 
-    if (type_sizes[type] * count <= 4) {
+    if (type_sizes[type] * (int64_t)count <= 4) {
         tnput(&entries_ptr, count, ptr_val, type, 0);
     } else {
         bytestream_put_le32(&entries_ptr, *s->buf - s->buf_start);
-        if (check_size(s, count * type_sizes2[type]))
+        if (check_size(s, count * (int64_t)type_sizes2[type]))
             return AVERROR_INVALIDDATA;
         tnput(s->buf, count, ptr_val, type, 0);
     }
@@ -146,14 +157,14 @@ static int add_entry1(TiffEncoderContext *s,
 }
 
 /**
- * Encode one strip in tiff file
+ * Encode one strip in tiff file.
  *
  * @param s Tiff context
- * @param src Input buffer
- * @param dst Output buffer
- * @param n Size of input buffer
- * @param compr Compression method
- * @return Number of output bytes. If an output error is encountered, a negative
+ * @param src input buffer
+ * @param dst output buffer
+ * @param n size of input buffer
+ * @param compr compression method
+ * @return number of output bytes. If an output error is encountered, a negative
  * value corresponding to an AVERROR error code is returned.
  */
 static int encode_strip(TiffEncoderContext *s, const int8_t *src,
@@ -167,7 +178,7 @@ static int encode_strip(TiffEncoderContext *s, const int8_t *src,
         unsigned long zlen = s->buf_size - (*s->buf - s->buf_start);
         if (compress(dst, &zlen, src, n) != Z_OK) {
             av_log(s->avctx, AV_LOG_ERROR, "Compressing failed\n");
-            return AVERROR_UNKNOWN;
+            return AVERROR_EXTERNAL;
         }
         return zlen;
     }
@@ -183,6 +194,8 @@ static int encode_strip(TiffEncoderContext *s, const int8_t *src,
     case TIFF_LZW:
         return ff_lzw_encode(s->lzws, src, n);
     default:
+        av_log(s->avctx, AV_LOG_ERROR, "Unsupported compression method: %d\n",
+               compr);
         return AVERROR(EINVAL);
     }
 }
@@ -194,13 +207,24 @@ static void pack_yuv(TiffEncoderContext *s, const AVFrame *p,
     int w       = (s->width - 1) / s->subsampling[0] + 1;
     uint8_t *pu = &p->data[1][lnum / s->subsampling[1] * p->linesize[1]];
     uint8_t *pv = &p->data[2][lnum / s->subsampling[1] * p->linesize[2]];
-    for (i = 0; i < w; i++) {
-        for (j = 0; j < s->subsampling[1]; j++)
-            for (k = 0; k < s->subsampling[0]; k++)
-                *dst++ = p->data[0][(lnum + j) * p->linesize[0] +
-                                    i * s->subsampling[0] + k];
-        *dst++ = *pu++;
-        *dst++ = *pv++;
+    if (s->width % s->subsampling[0] || s->height % s->subsampling[1]) {
+        for (i = 0; i < w; i++) {
+            for (j = 0; j < s->subsampling[1]; j++)
+                for (k = 0; k < s->subsampling[0]; k++)
+                    *dst++ = p->data[0][FFMIN(lnum + j, s->height-1) * p->linesize[0] +
+                                        FFMIN(i * s->subsampling[0] + k, s->width-1)];
+            *dst++ = *pu++;
+            *dst++ = *pv++;
+        }
+    }else{
+        for (i = 0; i < w; i++) {
+            for (j = 0; j < s->subsampling[1]; j++)
+                for (k = 0; k < s->subsampling[0]; k++)
+                    *dst++ = p->data[0][(lnum + j) * p->linesize[0] +
+                                        i * s->subsampling[0] + k];
+            *dst++ = *pu++;
+            *dst++ = *pv++;
+        }
     }
 }
 
@@ -209,84 +233,75 @@ static void pack_yuv(TiffEncoderContext *s, const AVFrame *p,
         ret = add_entry(s, tag, type, count, ptr_val);  \
         if (ret < 0)                                    \
             goto fail;                                  \
-    } while(0);
+    } while (0)
 
 #define ADD_ENTRY1(s, tag, type, val)           \
     do {                                        \
         ret = add_entry1(s, tag, type, val);    \
         if (ret < 0)                            \
             goto fail;                          \
-    } while(0);
+    } while (0) /* no ';' here: the call site supplies it, so "if (x) ADD_ENTRY1(...);" stays a single statement */
 
 static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                         const AVFrame *pict, int *got_packet)
 {
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt);
     TiffEncoderContext *s = avctx->priv_data;
     const AVFrame *const p = pict;
     int i;
     uint8_t *ptr;
     uint8_t *offset;
     uint32_t strips;
-    uint32_t *strip_sizes   = NULL;
-    uint32_t *strip_offsets = NULL;
     int bytes_per_row;
-    uint32_t res[2]    = { 72, 1 };     // image resolution (72/1)
-    uint16_t bpp_tab[] = { 8, 8, 8, 8 };
+    uint32_t res[2] = { s->dpi, 1 };    // image resolution (s->dpi/1; the "dpi" option defaults to 72)
+    uint16_t bpp_tab[4];
     int ret = 0;
-    int is_yuv = 0;
-    uint8_t *yuv_line = NULL;
+    int is_yuv = 0, alpha = 0;
     int shift_h, shift_v;
     int packet_size;
-    const AVPixFmtDescriptor *pfd;
-
-    s->avctx = avctx;
 
     s->width          = avctx->width;
     s->height         = avctx->height;
     s->subsampling[0] = 1;
     s->subsampling[1] = 1;
 
+    avctx->bits_per_coded_sample =
+    s->bpp          = av_get_bits_per_pixel(desc);
+    s->bpp_tab_size = desc->nb_components;
+
     switch (avctx->pix_fmt) {
     case AV_PIX_FMT_RGBA64LE:
-    case AV_PIX_FMT_RGB48LE:
-    case AV_PIX_FMT_GRAY16LE:
     case AV_PIX_FMT_RGBA:
+        alpha = 1;
+    case AV_PIX_FMT_RGB48LE:
     case AV_PIX_FMT_RGB24:
-    case AV_PIX_FMT_GRAY8:
-    case AV_PIX_FMT_PAL8:
-        pfd    = av_pix_fmt_desc_get(avctx->pix_fmt);
-        s->bpp = av_get_bits_per_pixel(pfd);
-        if (pfd->flags & AV_PIX_FMT_FLAG_PAL)
-            s->photometric_interpretation = TIFF_PHOTOMETRIC_PALETTE;
-        else if (pfd->flags & AV_PIX_FMT_FLAG_RGB)
-            s->photometric_interpretation = TIFF_PHOTOMETRIC_RGB;
-        else
-            s->photometric_interpretation = TIFF_PHOTOMETRIC_BLACK_IS_ZERO;
-        s->bpp_tab_size = pfd->nb_components;
-        for (i = 0; i < s->bpp_tab_size; i++)
-            bpp_tab[i] = s->bpp / s->bpp_tab_size;
+        s->photometric_interpretation = TIFF_PHOTOMETRIC_RGB;
         break;
+    case AV_PIX_FMT_GRAY8:
+        avctx->bits_per_coded_sample = 0x28;
+    case AV_PIX_FMT_GRAY8A:
+    case AV_PIX_FMT_YA16LE:
+        alpha = avctx->pix_fmt == AV_PIX_FMT_GRAY8A || avctx->pix_fmt == AV_PIX_FMT_YA16LE;
+    case AV_PIX_FMT_GRAY16LE:
     case AV_PIX_FMT_MONOBLACK:
-        s->bpp                        = 1;
         s->photometric_interpretation = TIFF_PHOTOMETRIC_BLACK_IS_ZERO;
-        s->bpp_tab_size               = 0;
+        break;
+    case AV_PIX_FMT_PAL8:
+        s->photometric_interpretation = TIFF_PHOTOMETRIC_PALETTE;
         break;
     case AV_PIX_FMT_MONOWHITE:
-        s->bpp                        = 1;
         s->photometric_interpretation = TIFF_PHOTOMETRIC_WHITE_IS_ZERO;
-        s->bpp_tab_size               = 0;
         break;
     case AV_PIX_FMT_YUV420P:
     case AV_PIX_FMT_YUV422P:
+    case AV_PIX_FMT_YUV440P:
     case AV_PIX_FMT_YUV444P:
     case AV_PIX_FMT_YUV410P:
     case AV_PIX_FMT_YUV411P:
         av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &shift_h, &shift_v);
         s->photometric_interpretation = TIFF_PHOTOMETRIC_YCBCR;
-        s->bpp                        = 8 + (16 >> (shift_h + shift_v));
         s->subsampling[0]             = 1 << shift_h;
         s->subsampling[1]             = 1 << shift_v;
-        s->bpp_tab_size               = 3;
         is_yuv                        = 1;
         break;
     default:
@@ -295,6 +310,9 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         return AVERROR(EINVAL);
     }
 
+    for (i = 0; i < s->bpp_tab_size; i++)
+        bpp_tab[i] = desc->comp[i].depth;
+
     if (s->compr == TIFF_DEFLATE       ||
         s->compr == TIFF_ADOBE_DEFLATE ||
         s->compr == TIFF_LZW)
@@ -308,14 +326,13 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 
     strips = (s->height - 1) / s->rps + 1;
 
-    packet_size = avctx->height * ((avctx->width * s->bpp + 7) >> 3) * 2 +
+    bytes_per_row = (((s->width - 1) / s->subsampling[0] + 1) * s->bpp *
+                     s->subsampling[0] * s->subsampling[1] + 7) >> 3;
+    packet_size = avctx->height * bytes_per_row * 2 +
                   avctx->height * 4 + AV_INPUT_BUFFER_MIN_SIZE;
 
-    if (!pkt->data &&
-        (ret = av_new_packet(pkt, packet_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, packet_size, 0)) < 0)
         return ret;
-    }
     ptr          = pkt->data;
     s->buf_start = pkt->data;
     s->buf       = &ptr;
@@ -333,18 +350,21 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     offset = ptr;
     bytestream_put_le32(&ptr, 0);
 
-    strip_sizes   = av_mallocz_array(strips, sizeof(*strip_sizes));
-    strip_offsets = av_mallocz_array(strips, sizeof(*strip_offsets));
-    if (!strip_sizes || !strip_offsets) {
+    if (strips > INT_MAX / FFMAX(sizeof(s->strip_sizes[0]), sizeof(s->strip_offsets[0]))) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+    av_fast_padded_mallocz(&s->strip_sizes  , &s->strip_sizes_size  , sizeof(s->strip_sizes  [0]) * strips);
+    av_fast_padded_mallocz(&s->strip_offsets, &s->strip_offsets_size, sizeof(s->strip_offsets[0]) * strips);
+
+    if (!s->strip_sizes || !s->strip_offsets) {
         ret = AVERROR(ENOMEM);
         goto fail;
     }
 
-    bytes_per_row = (((s->width - 1) / s->subsampling[0] + 1) * s->bpp *
-                     s->subsampling[0] * s->subsampling[1] + 7) >> 3;
     if (is_yuv) {
-        yuv_line = av_malloc(bytes_per_row);
-        if (!yuv_line) {
+        av_fast_padded_malloc(&s->yuv_line, &s->yuv_line_size, bytes_per_row);
+        if (s->yuv_line == NULL) {
             av_log(s->avctx, AV_LOG_ERROR, "Not enough memory\n");
             ret = AVERROR(ENOMEM);
             goto fail;
@@ -363,12 +383,12 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
             ret = AVERROR(ENOMEM);
             goto fail;
         }
-        strip_offsets[0] = ptr - pkt->data;
+        s->strip_offsets[0] = ptr - pkt->data;
         zn               = 0;
         for (j = 0; j < s->rps; j++) {
             if (is_yuv) {
-                pack_yuv(s, p, yuv_line, j);
-                memcpy(zbuf + zn, yuv_line, bytes_per_row);
+                pack_yuv(s, p, s->yuv_line, j);
+                memcpy(zbuf + zn, s->yuv_line, bytes_per_row);
                 j += s->subsampling[1] - 1;
             } else
                 memcpy(zbuf + j * bytes_per_row,
@@ -382,9 +402,10 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
             goto fail;
         }
         ptr           += ret;
-        strip_sizes[0] = ptr - pkt->data - strip_offsets[0];
+        s->strip_sizes[0] = ptr - pkt->data - s->strip_offsets[0];
     } else
 #endif
+    {
     if (s->compr == TIFF_LZW) {
         s->lzws = av_malloc(ff_lzw_encode_state_size);
         if (!s->lzws) {
@@ -393,17 +414,17 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         }
     }
     for (i = 0; i < s->height; i++) {
-        if (strip_sizes[i / s->rps] == 0) {
+        if (s->strip_sizes[i / s->rps] == 0) {
             if (s->compr == TIFF_LZW) {
                 ff_lzw_encode_init(s->lzws, ptr,
                                    s->buf_size - (*s->buf - s->buf_start),
                                    12, FF_LZW_TIFF, put_bits);
             }
-            strip_offsets[i / s->rps] = ptr - pkt->data;
+            s->strip_offsets[i / s->rps] = ptr - pkt->data;
         }
         if (is_yuv) {
-            pack_yuv(s, p, yuv_line, i);
-            ret = encode_strip(s, yuv_line, ptr, bytes_per_row, s->compr);
+            pack_yuv(s, p, s->yuv_line, i);
+            ret = encode_strip(s, s->yuv_line, ptr, bytes_per_row, s->compr);
             i  += s->subsampling[1] - 1;
         } else
             ret = encode_strip(s, p->data[0] + i * p->linesize[0],
@@ -412,17 +433,18 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
             av_log(s->avctx, AV_LOG_ERROR, "Encode strip failed\n");
             goto fail;
         }
-        strip_sizes[i / s->rps] += ret;
+        s->strip_sizes[i / s->rps] += ret;
         ptr                     += ret;
         if (s->compr == TIFF_LZW &&
             (i == s->height - 1 || i % s->rps == s->rps - 1)) {
             ret = ff_lzw_encode_flush(s->lzws, flush_put_bits);
-            strip_sizes[(i / s->rps)] += ret;
-            ptr                       += ret;
+            s->strip_sizes[(i / s->rps)] += ret;
+            ptr                          += ret;
         }
     }
     if (s->compr == TIFF_LZW)
-        av_free(s->lzws);
+        av_freep(&s->lzws);
+    }
 
     s->num_entries = 0;
 
@@ -435,14 +457,21 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 
     ADD_ENTRY1(s, TIFF_COMPR,       TIFF_SHORT, s->compr);
     ADD_ENTRY1(s, TIFF_PHOTOMETRIC, TIFF_SHORT, s->photometric_interpretation);
-    ADD_ENTRY(s,  TIFF_STRIP_OFFS,  TIFF_LONG,  strips, strip_offsets);
+    ADD_ENTRY(s,  TIFF_STRIP_OFFS,  TIFF_LONG,  strips, s->strip_offsets);
 
     if (s->bpp_tab_size)
         ADD_ENTRY1(s, TIFF_SAMPLES_PER_PIXEL, TIFF_SHORT, s->bpp_tab_size);
 
     ADD_ENTRY1(s, TIFF_ROWSPERSTRIP, TIFF_LONG,     s->rps);
-    ADD_ENTRY(s,  TIFF_STRIP_SIZE,   TIFF_LONG,     strips, strip_sizes);
+    ADD_ENTRY(s,  TIFF_STRIP_SIZE,   TIFF_LONG,     strips, s->strip_sizes);
     ADD_ENTRY(s,  TIFF_XRES,         TIFF_RATIONAL, 1,      res);
+    if (avctx->sample_aspect_ratio.num > 0 &&
+        avctx->sample_aspect_ratio.den > 0) {
+        AVRational y = av_mul_q(av_make_q(s->dpi, 1),
+                                avctx->sample_aspect_ratio);
+        res[0] = y.num;
+        res[1] = y.den;
+    }
     ADD_ENTRY(s,  TIFF_YRES,         TIFF_RATIONAL, 1,      res);
     ADD_ENTRY1(s, TIFF_RES_UNIT,     TIFF_SHORT,    2);
 
@@ -460,10 +489,14 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         }
         ADD_ENTRY(s, TIFF_PAL, TIFF_SHORT, 256 * 3, pal);
     }
+    if (alpha)
+        ADD_ENTRY1(s,TIFF_EXTRASAMPLES,      TIFF_SHORT,            2);
     if (is_yuv) {
         /** according to CCIR Recommendation 601.1 */
         uint32_t refbw[12] = { 15, 1, 235, 1, 128, 1, 240, 1, 128, 1, 240, 1 };
         ADD_ENTRY(s, TIFF_YCBCR_SUBSAMPLING, TIFF_SHORT,    2, s->subsampling);
+        if (avctx->chroma_sample_location == AVCHROMA_LOC_TOPLEFT)
+            ADD_ENTRY1(s, TIFF_YCBCR_POSITIONING, TIFF_SHORT, 2);
         ADD_ENTRY(s, TIFF_REFERENCE_BW,      TIFF_RATIONAL, 6, refbw);
     }
     // write offset to dir
@@ -482,20 +515,30 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     *got_packet = 1;
 
 fail:
-    av_free(strip_sizes);
-    av_free(strip_offsets);
-    av_free(yuv_line);
-    return ret;
+    return ret < 0 ? ret : 0;
 }
 
 static av_cold int encode_init(AVCodecContext *avctx)
 {
+    TiffEncoderContext *s = avctx->priv_data;
 #if FF_API_CODED_FRAME
 FF_DISABLE_DEPRECATION_WARNINGS
     avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
     avctx->coded_frame->key_frame = 1;
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
+    s->avctx = avctx;
+
+    return 0;
+}
+
+static av_cold int encode_close(AVCodecContext *avctx)
+{
+    TiffEncoderContext *s = avctx->priv_data;
+
+    av_freep(&s->strip_sizes);
+    av_freep(&s->strip_offsets);
+    av_freep(&s->yuv_line);
 
     return 0;
 }
@@ -503,6 +546,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 #define OFFSET(x) offsetof(TiffEncoderContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
+    {"dpi", "set the image resolution (in dpi)", OFFSET(dpi), AV_OPT_TYPE_INT, {.i64 = 72}, 1, 0x10000, AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_ENCODING_PARAM},
     { "compression_algo", NULL, OFFSET(compr), AV_OPT_TYPE_INT,   { .i64 = TIFF_PACKBITS }, TIFF_RAW, TIFF_DEFLATE, VE, "compression_algo" },
     { "packbits",         NULL, 0,             AV_OPT_TYPE_CONST, { .i64 = TIFF_PACKBITS }, 0,        0,            VE, "compression_algo" },
     { "raw",              NULL, 0,             AV_OPT_TYPE_CONST, { .i64 = TIFF_RAW      }, 0,        0,            VE, "compression_algo" },
@@ -527,13 +571,15 @@ AVCodec ff_tiff_encoder = {
     .id             = AV_CODEC_ID_TIFF,
     .priv_data_size = sizeof(TiffEncoderContext),
     .init           = encode_init,
+    .close          = encode_close,
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
     .encode2        = encode_frame,
     .pix_fmts       = (const enum AVPixelFormat[]) {
         AV_PIX_FMT_RGB24, AV_PIX_FMT_RGB48LE, AV_PIX_FMT_PAL8,
         AV_PIX_FMT_RGBA, AV_PIX_FMT_RGBA64LE,
-        AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAY16LE,
+        AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAY8A, AV_PIX_FMT_GRAY16LE, AV_PIX_FMT_YA16LE,
         AV_PIX_FMT_MONOBLACK, AV_PIX_FMT_MONOWHITE,
-        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV444P,
+        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV444P,
         AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,
         AV_PIX_FMT_NONE
     },
diff --git a/libavcodec/tmv.c b/libavcodec/tmv.c
index a9fcdf3..b738fcb 100644
--- a/libavcodec/tmv.c
+++ b/libavcodec/tmv.c
@@ -2,20 +2,20 @@
  * 8088flex TMV video decoder
  * Copyright (c) 2009 Daniel Verkamp <daniel at drv.nu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,6 +31,7 @@
 #include "avcodec.h"
 #include "internal.h"
 #include "libavutil/internal.h"
+#include "libavutil/xga_font_data.h"
 
 #include "cga_data.h"
 
@@ -45,10 +46,8 @@ static int tmv_decode_frame(AVCodecContext *avctx, void *data,
     unsigned x, y, fg, bg, c;
     int ret;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     if (avpkt->size < 2*char_rows*char_cols) {
         av_log(avctx, AV_LOG_ERROR,
@@ -63,6 +62,7 @@ static int tmv_decode_frame(AVCodecContext *avctx, void *data,
 
     frame->palette_has_changed = 1;
     memcpy(frame->data[1], ff_cga_palette, 16 * 4);
+    memset(frame->data[1] + 16 * 4, 0, AVPALETTE_SIZE - 16 * 4);
 
     for (y = 0; y < char_rows; y++) {
         for (x = 0; x < char_cols; x++) {
@@ -70,7 +70,7 @@ static int tmv_decode_frame(AVCodecContext *avctx, void *data,
             bg = *src  >> 4;
             fg = *src++ & 0xF;
             ff_draw_pc_font(dst + x * 8, frame->linesize[0],
-                            ff_cga_font, 8, c, fg, bg);
+                            avpriv_cga_font, 8, c, fg, bg);
         }
         dst += frame->linesize[0] * 8;
     }
diff --git a/libavcodec/tpeldsp.c b/libavcodec/tpeldsp.c
index 7ea1da4..cc4fed3 100644
--- a/libavcodec/tpeldsp.c
+++ b/libavcodec/tpeldsp.c
@@ -1,20 +1,20 @@
 /*
  * thirdpel DSP functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/tpeldsp.h b/libavcodec/tpeldsp.h
index 9c67d60..3732f17 100644
--- a/libavcodec/tpeldsp.h
+++ b/libavcodec/tpeldsp.h
@@ -1,20 +1,20 @@
 /*
  * thirdpel DSP functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/truemotion1.c b/libavcodec/truemotion1.c
index 3eab33a..da843c4 100644
--- a/libavcodec/truemotion1.c
+++ b/libavcodec/truemotion1.c
@@ -2,20 +2,20 @@
  * Duck TrueMotion 1.0 Decoder
  * Copyright (C) 2003 Alex Beregszaszi & Mike Melanson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -215,7 +215,7 @@ static int make_cdt16_entry(int p1, int p2, int16_t *cdt)
     b = cdt[p2];
     r = cdt[p1] << 11;
     lo = b + r;
-    return (lo + (lo << 16)) << 1;
+    return (lo + (lo * (1 << 16))) * 2;
 }
 
 static int make_ydt24_entry(int p1, int p2, int16_t *ydt)
@@ -224,7 +224,7 @@ static int make_ydt24_entry(int p1, int p2, int16_t *ydt)
 
     lo = ydt[p1];
     hi = ydt[p2];
-    return (lo + (hi << 8) + (hi << 16)) << 1;
+    return (lo + (hi * (1 << 8)) + (hi * (1 << 16))) * 2;
 }
 
 static int make_cdt24_entry(int p1, int p2, int16_t *cdt)
@@ -232,8 +232,8 @@ static int make_cdt24_entry(int p1, int p2, int16_t *cdt)
     int r, b;
 
     b = cdt[p2];
-    r = cdt[p1]<<16;
-    return (b+r) << 1;
+    r = cdt[p1] * (1 << 16);
+    return (b+r) * 2;
 }
 
 static void gen_vector_table15(TrueMotion1Context *s, const uint8_t *sel_vector_table)
@@ -396,12 +396,16 @@ static int truemotion1_decode_header(TrueMotion1Context *s)
     }
 
     if (compression_types[header.compression].algorithm == ALGO_RGB24H) {
-        new_pix_fmt = AV_PIX_FMT_RGB32;
+        new_pix_fmt = AV_PIX_FMT_0RGB32;
         width_shift = 1;
     } else
         new_pix_fmt = AV_PIX_FMT_RGB555; // RGB565 is supported as well
 
     s->w >>= width_shift;
+    if (s->w & 1) {
+        avpriv_request_sample(s->avctx, "Frame with odd width");
+        return AVERROR_PATCHWELCOME;
+    }
 
     if (s->w != s->avctx->width || s->h != s->avctx->height ||
         new_pix_fmt != s->avctx->pix_fmt) {
@@ -415,6 +419,8 @@ static int truemotion1_decode_header(TrueMotion1Context *s)
         ff_set_sar(s->avctx, s->avctx->sample_aspect_ratio);
 
         av_fast_malloc(&s->vert_pred, &s->vert_pred_size, s->avctx->width * sizeof(unsigned int));
+        if (!s->vert_pred)
+            return AVERROR(ENOMEM);
     }
 
     /* There is 1 change bit per 4 pixels, so each change byte represents
@@ -483,6 +489,8 @@ static av_cold int truemotion1_decode_init(AVCodecContext *avctx)
     /* there is a vertical predictor for each pixel in a line; each vertical
      * predictor is 0 to start with */
     av_fast_malloc(&s->vert_pred, &s->vert_pred_size, s->avctx->width * sizeof(unsigned int));
+    if (!s->vert_pred)
+        return AVERROR(ENOMEM);
 
     return 0;
 }
@@ -637,7 +645,8 @@ static void truemotion1_decode_16bit(TrueMotion1Context *s)
         current_pixel_pair = (unsigned int *)current_line;
         vert_pred = s->vert_pred;
         mb_change_index = 0;
-        mb_change_byte = mb_change_bits[mb_change_index++];
+        if (!keyframe)
+            mb_change_byte = mb_change_bits[mb_change_index++];
         mb_change_byte_mask = 0x01;
         pixels_left = s->avctx->width;
 
@@ -871,10 +880,8 @@ static int truemotion1_decode_frame(AVCodecContext *avctx,
     if ((ret = truemotion1_decode_header(s)) < 0)
         return ret;
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
     if (compression_types[s->compression].algorithm == ALGO_RGB24H) {
         truemotion1_decode_24bit(s);
@@ -896,7 +903,7 @@ static av_cold int truemotion1_decode_end(AVCodecContext *avctx)
     TrueMotion1Context *s = avctx->priv_data;
 
     av_frame_free(&s->frame);
-    av_free(s->vert_pred);
+    av_freep(&s->vert_pred);
 
     return 0;
 }
diff --git a/libavcodec/truemotion1data.h b/libavcodec/truemotion1data.h
index e950450..3e58143 100644
--- a/libavcodec/truemotion1data.h
+++ b/libavcodec/truemotion1data.h
@@ -6,20 +6,20 @@
  * the GNU LGPL using the common understanding that data tables necessary
  * for decoding algorithms are not necessarily copyrightable.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #ifndef AVCODEC_TRUEMOTION1DATA_H
diff --git a/libavcodec/truemotion2.c b/libavcodec/truemotion2.c
index 1726440..245a32a 100644
--- a/libavcodec/truemotion2.c
+++ b/libavcodec/truemotion2.c
@@ -2,20 +2,20 @@
  * Duck/ON2 TrueMotion 2 Decoder
  * Copyright (c) 2005 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -65,6 +65,9 @@ typedef struct TM2Context {
     GetBitContext gb;
     BswapDSPContext bdsp;
 
+    uint8_t *buffer;
+    int buffer_size;
+
     /* TM2 streams */
     int *tokens[TM2_NUM_STREAMS];
     int tok_lens[TM2_NUM_STREAMS];
@@ -87,7 +90,7 @@ typedef struct TM2Context {
 * Huffman codes for each of streams
 */
 typedef struct TM2Codes {
-    VLC vlc; ///< table for Libav bitstream reader
+    VLC vlc; ///< table for FFmpeg bitstream reader
     int bits;
     int *recode; ///< table for converting from code indexes to values
     int length;
@@ -168,9 +171,10 @@ static int tm2_build_huff_table(TM2Context *ctx, TM2Codes *code)
 
     /* allocate space for codes - it is exactly ceil(nodes / 2) entries */
     huff.max_num = (huff.nodes + 1) >> 1;
-    huff.nums    = av_mallocz(huff.max_num * sizeof(int));
-    huff.bits    = av_mallocz(huff.max_num * sizeof(uint32_t));
-    huff.lens    = av_mallocz(huff.max_num * sizeof(int));
+    huff.nums    = av_calloc(huff.max_num, sizeof(int));
+    huff.bits    = av_calloc(huff.max_num, sizeof(uint32_t));
+    huff.lens    = av_calloc(huff.max_num, sizeof(int));
+
     if (!huff.nums || !huff.bits || !huff.lens) {
         res = AVERROR(ENOMEM);
         goto out;
@@ -196,7 +200,7 @@ static int tm2_build_huff_table(TM2Context *ctx, TM2Codes *code)
         else {
             code->bits = huff.max_bits;
             code->length = huff.max_num;
-            code->recode = av_malloc(code->length * sizeof(int));
+            code->recode = av_malloc_array(code->length, sizeof(int));
             if (!code->recode) {
                 res = AVERROR(ENOMEM);
                 goto out;
@@ -226,6 +230,8 @@ static inline int tm2_get_token(GetBitContext *gb, TM2Codes *code)
 {
     int val;
     val = get_vlc2(gb, code->vlc.table, code->bits, 1);
+    if(val<0)
+        return -1;
     return code->recode[val];
 }
 
@@ -257,7 +263,8 @@ static int tm2_read_deltas(TM2Context *ctx, int stream_id)
     d  = get_bits(&ctx->gb, 9);
     mb = get_bits(&ctx->gb, 5);
 
-    if ((d < 1) || (d > TM2_DELTAS) || (mb < 1) || (mb > 32)) {
+    av_assert2(mb < 32);
+    if ((d < 1) || (d > TM2_DELTAS) || (mb < 1)) {
         av_log(ctx->avctx, AV_LOG_ERROR, "Incorrect delta table: %i deltas x %i bits\n", d, mb);
         return AVERROR_INVALIDDATA;
     }
@@ -283,6 +290,11 @@ static int tm2_read_stream(TM2Context *ctx, const uint8_t *buf, int stream_id, i
     TM2Codes codes;
     GetByteContext gb;
 
+    if (buf_size < 4) {
+        av_log(ctx->avctx, AV_LOG_ERROR, "not enough space for len left\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     /* get stream length in dwords */
     bytestream2_init(&gb, buf, buf_size);
     len  = bytestream2_get_be32(&gb);
@@ -332,27 +344,35 @@ static int tm2_read_stream(TM2Context *ctx, const uint8_t *buf, int stream_id, i
     /* check if we have sane number of tokens */
     if ((toks < 0) || (toks > 0xFFFFFF)) {
         av_log(ctx->avctx, AV_LOG_ERROR, "Incorrect number of tokens: %i\n", toks);
-        tm2_free_codes(&codes);
-        return AVERROR_INVALIDDATA;
+        ret = AVERROR_INVALIDDATA;
+        goto end;
+    }
+    ret = av_reallocp_array(&ctx->tokens[stream_id], toks, sizeof(int));
+    if (ret < 0) {
+        ctx->tok_lens[stream_id] = 0;
+        goto end;
     }
-    ctx->tokens[stream_id]   = av_realloc(ctx->tokens[stream_id], toks * sizeof(int));
     ctx->tok_lens[stream_id] = toks;
     len = bytestream2_get_be32(&gb);
     if (len > 0) {
         pos = bytestream2_tell(&gb);
-        if (skip <= pos)
-            return AVERROR_INVALIDDATA;
+        if (skip <= pos) {
+            ret = AVERROR_INVALIDDATA;
+            goto end;
+        }
         init_get_bits(&ctx->gb, buf + pos, (skip - pos) * 8);
         for (i = 0; i < toks; i++) {
             if (get_bits_left(&ctx->gb) <= 0) {
                 av_log(ctx->avctx, AV_LOG_ERROR, "Incorrect number of tokens: %i\n", toks);
-                return AVERROR_INVALIDDATA;
+                ret = AVERROR_INVALIDDATA;
+                goto end;
             }
             ctx->tokens[stream_id][i] = tm2_get_token(&ctx->gb, &codes);
-            if (stream_id <= TM2_MOT && ctx->tokens[stream_id][i] >= TM2_DELTAS) {
+            if (stream_id <= TM2_MOT && ctx->tokens[stream_id][i] >= TM2_DELTAS || ctx->tokens[stream_id][i]<0) {
                 av_log(ctx->avctx, AV_LOG_ERROR, "Invalid delta token index %d for type %d, n=%d\n",
                        ctx->tokens[stream_id][i], stream_id, i);
-                return AVERROR_INVALIDDATA;
+                ret = AVERROR_INVALIDDATA;
+                goto end;
             }
         }
     } else {
@@ -361,13 +381,17 @@ static int tm2_read_stream(TM2Context *ctx, const uint8_t *buf, int stream_id, i
             if (stream_id <= TM2_MOT && ctx->tokens[stream_id][i] >= TM2_DELTAS) {
                 av_log(ctx->avctx, AV_LOG_ERROR, "Invalid delta token index %d for type %d, n=%d\n",
                        ctx->tokens[stream_id][i], stream_id, i);
-                return AVERROR_INVALIDDATA;
+                ret = AVERROR_INVALIDDATA;
+                goto end;
             }
         }
     }
-    tm2_free_codes(&codes);
 
-    return skip;
+    ret = skip;
+
+end:
+    tm2_free_codes(&codes);
+    return ret;
 }
 
 static inline int GET_TOK(TM2Context *ctx,int type)
@@ -376,8 +400,13 @@ static inline int GET_TOK(TM2Context *ctx,int type)
         av_log(ctx->avctx, AV_LOG_ERROR, "Read token from stream %i out of bounds (%i>=%i)\n", type, ctx->tok_ptrs[type], ctx->tok_lens[type]);
         return 0;
     }
-    if (type <= TM2_MOT)
+    if (type <= TM2_MOT) {
+        if (ctx->tokens[type][ctx->tok_ptrs[type]] >= TM2_DELTAS) {
+            av_log(ctx->avctx, AV_LOG_ERROR, "token %d is too large\n", ctx->tokens[type][ctx->tok_ptrs[type]]);
+            return 0;
+        }
         return ctx->deltas[type][ctx->tokens[type][ctx->tok_ptrs[type]++]];
+    }
     return ctx->tokens[type][ctx->tok_ptrs[type]++];
 }
 
@@ -684,6 +713,11 @@ static inline void tm2_motion_block(TM2Context *ctx, AVFrame *pic, int bx, int b
     mx = av_clip(mx, -(bx * 4 + 4), ctx->avctx->width  - bx * 4);
     my = av_clip(my, -(by * 4 + 4), ctx->avctx->height - by * 4);
 
+    if (4*bx+mx<0 || 4*by+my<0 || 4*bx+mx+4 > ctx->avctx->width || 4*by+my+4 > ctx->avctx->height) {
+        av_log(ctx->avctx, AV_LOG_ERROR, "MV out of picture\n");
+        return;
+    }
+
     Yo += my * oYstride + mx;
     Uo += (my >> 1) * oUstride + (mx >> 1);
     Vo += (my >> 1) * oVstride + (mx >> 1);
@@ -854,37 +888,34 @@ static int decode_frame(AVCodecContext *avctx,
     AVFrame * const p    = l->pic;
     int offset           = TM2_HEADER_SIZE;
     int i, t, ret;
-    uint8_t *swbuf;
 
-    swbuf = av_malloc(buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
-    if (!swbuf) {
+    av_fast_padded_malloc(&l->buffer, &l->buffer_size, buf_size);
+    if (!l->buffer) {
         av_log(avctx, AV_LOG_ERROR, "Cannot allocate temporary buffer\n");
         return AVERROR(ENOMEM);
     }
 
-    if ((ret = ff_reget_buffer(avctx, p)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        av_free(swbuf);
+    if ((ret = ff_reget_buffer(avctx, p)) < 0)
         return ret;
-    }
 
-    l->bdsp.bswap_buf((uint32_t *) swbuf, (const uint32_t *) buf,
+    l->bdsp.bswap_buf((uint32_t *) l->buffer, (const uint32_t *) buf,
                       buf_size >> 2);
 
-    if ((ret = tm2_read_header(l, swbuf)) < 0) {
-        av_free(swbuf);
+    if ((ret = tm2_read_header(l, l->buffer)) < 0) {
         return ret;
     }
 
     for (i = 0; i < TM2_NUM_STREAMS; i++) {
         if (offset >= buf_size) {
-            av_free(swbuf);
+            av_log(avctx, AV_LOG_ERROR, "no space for tm2_read_stream\n");
             return AVERROR_INVALIDDATA;
         }
-        t = tm2_read_stream(l, swbuf + offset, tm2_stream_order[i],
+
+        t = tm2_read_stream(l, l->buffer + offset, tm2_stream_order[i],
                             buf_size - offset);
         if (t < 0) {
-            av_free(swbuf);
+            int j = tm2_stream_order[i];
+            memset(l->tokens[j], 0, sizeof(**l->tokens) * l->tok_lens[j]);
             return t;
         }
         offset += t;
@@ -898,7 +929,6 @@ static int decode_frame(AVCodecContext *avctx,
     l->cur = !l->cur;
     *got_frame      = 1;
     ret = av_frame_ref(data, l->pic);
-    av_free(swbuf);
 
     return (ret < 0) ? ret : buf_size;
 }
@@ -922,8 +952,8 @@ static av_cold int decode_init(AVCodecContext *avctx)
 
     ff_bswapdsp_init(&l->bdsp);
 
-    l->last  = av_malloc(4 * sizeof(*l->last)  * (w >> 2));
-    l->clast = av_malloc(4 * sizeof(*l->clast) * (w >> 2));
+    l->last  = av_malloc_array(w >> 2, 4 * sizeof(*l->last) );
+    l->clast = av_malloc_array(w >> 2, 4 * sizeof(*l->clast));
 
     for (i = 0; i < TM2_NUM_STREAMS; i++) {
         l->tokens[i] = NULL;
@@ -932,15 +962,15 @@ static av_cold int decode_init(AVCodecContext *avctx)
 
     w += 8;
     h += 8;
-    l->Y1_base = av_malloc(sizeof(*l->Y1_base) * w * h);
-    l->Y2_base = av_malloc(sizeof(*l->Y2_base) * w * h);
+    l->Y1_base = av_calloc(w * h, sizeof(*l->Y1_base));
+    l->Y2_base = av_calloc(w * h, sizeof(*l->Y2_base));
     l->y_stride = w;
     w = (w + 1) >> 1;
     h = (h + 1) >> 1;
-    l->U1_base = av_malloc(sizeof(*l->U1_base) * w * h);
-    l->V1_base = av_malloc(sizeof(*l->V1_base) * w * h);
-    l->U2_base = av_malloc(sizeof(*l->U2_base) * w * h);
-    l->V2_base = av_malloc(sizeof(*l->V1_base) * w * h);
+    l->U1_base = av_calloc(w * h, sizeof(*l->U1_base));
+    l->V1_base = av_calloc(w * h, sizeof(*l->V1_base));
+    l->U2_base = av_calloc(w * h, sizeof(*l->U2_base));
+    l->V2_base = av_calloc(w * h, sizeof(*l->V1_base));
     l->uv_stride = w;
     l->cur = 0;
     if (!l->Y1_base || !l->Y2_base || !l->U1_base ||
@@ -954,6 +984,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
         av_freep(&l->V2_base);
         av_freep(&l->last);
         av_freep(&l->clast);
+        av_frame_free(&l->pic);
         return AVERROR(ENOMEM);
     }
     l->Y1 = l->Y1_base + l->y_stride  * 4 + 4;
@@ -974,15 +1005,17 @@ static av_cold int decode_end(AVCodecContext *avctx)
     av_free(l->last);
     av_free(l->clast);
     for (i = 0; i < TM2_NUM_STREAMS; i++)
-        av_free(l->tokens[i]);
+        av_freep(&l->tokens[i]);
     if (l->Y1) {
-        av_free(l->Y1_base);
-        av_free(l->U1_base);
-        av_free(l->V1_base);
-        av_free(l->Y2_base);
-        av_free(l->U2_base);
-        av_free(l->V2_base);
+        av_freep(&l->Y1_base);
+        av_freep(&l->U1_base);
+        av_freep(&l->V1_base);
+        av_freep(&l->Y2_base);
+        av_freep(&l->U2_base);
+        av_freep(&l->V2_base);
     }
+    av_freep(&l->buffer);
+    l->buffer_size = 0;
 
     av_frame_free(&l->pic);
 
diff --git a/libavcodec/truemotion2rt.c b/libavcodec/truemotion2rt.c
new file mode 100644
index 0000000..49f2114
--- /dev/null
+++ b/libavcodec/truemotion2rt.c
@@ -0,0 +1,221 @@
+/*
+ * Duck TrueMotion 2.0 Real Time Decoder
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "avcodec.h"
+#define BITSTREAM_READER_LE
+#include "get_bits.h"
+#include "internal.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/internal.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem.h"
+
+typedef struct TrueMotion2RTContext {
+    GetBitContext gb;       /* bit reader over the payload that follows the header */
+    const uint8_t *buf;     /* current packet data, set on every decode_frame() call */
+    int size;               /* size of buf in bytes */
+    int delta_size;         /* bits per delta code; the header parser accepts 2..4 */
+    int hscale;             /* horizontal sample step: 1, or 2 if header byte 3 is non-zero */
+} TrueMotion2RTContext;
+
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    avctx->pix_fmt = AV_PIX_FMT_YUV410P; /* the only output format this decoder sets */
+    return 0;
+}
+
+/* Returns the number of bytes consumed from the bytestream, or a negative
+ * AVERROR code if the header is malformed or its dimensions are rejected. */
+static int truemotion2rt_decode_header(AVCodecContext *avctx)
+{
+    TrueMotion2RTContext *s = avctx->priv_data;
+    uint8_t header_buffer[128] = { 0 };  /* logical maximum size of the header */
+    int i, header_size, ret;
+
+    header_size = ((s->buf[0] >> 5) | (s->buf[0] << 3)) & 0x7f;
+    if (header_size < 10) {
+        av_log(avctx, AV_LOG_ERROR, "invalid header size (%d)\n", header_size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (header_size + 1 > s->size) {
+        av_log(avctx, AV_LOG_ERROR, "Input packet too small.\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    /* unscramble the header bytes with a XOR operation */
+    for (i = 1; i < header_size; i++)
+        header_buffer[i - 1] = s->buf[i] ^ s->buf[i + 1];
+
+    s->delta_size = header_buffer[1];
+    s->hscale = 1 + !!header_buffer[3];
+    if (s->delta_size < 2 || s->delta_size > 4)
+        return AVERROR_INVALIDDATA;
+
+    /* height is stored at offset 5, width at 7; validate instead of storing raw */
+    ret = ff_set_dimensions(avctx, AV_RL16(header_buffer + 7),
+                            AV_RL16(header_buffer + 5));
+    return ret < 0 ? ret : header_size;
+}
+
+static const int16_t delta_tab4[] = { /* 16 deltas, selected by a 4-bit code */
+    1, -1, 2, -3, 8, -8, 18, -18, 36, -36, 54, -54, 96, -96, 144, -144
+};
+
+static const int16_t delta_tab3[] = { /* 8 deltas, selected by a 3-bit code */
+    2, -3, 8, -8, 18, -18, 36, -36
+};
+
+static const int16_t delta_tab2[] = { /* 4 deltas, selected by a 2-bit code */
+    5, -7, 36, -36
+};
+
+static const int16_t *const delta_tabs[] = { /* indexed by delta_size - 2 */
+    delta_tab2, delta_tab3, delta_tab4,
+};
+
+/* Decode one packet: header, then delta-coded Y, U and V planes (always intra). */
+static int decode_frame(AVCodecContext *avctx, void *data,
+                        int *got_frame, AVPacket *avpkt)
+{
+    TrueMotion2RTContext *s = avctx->priv_data;
+    const uint8_t *buf = avpkt->data;
+    int ret, buf_size = avpkt->size;
+    AVFrame * const p = data;
+    GetBitContext *gb = &s->gb;
+    uint8_t *dst;
+    int x, y, delta_mode;
+
+    s->buf = buf;
+    s->size = buf_size;
+
+    if ((ret = truemotion2rt_decode_header(avctx)) < 0)
+        return ret;
+    if ((ret = init_get_bits8(gb, buf + ret, buf_size - ret)) < 0)
+        return ret;
+
+    /* luma alone needs about w*h*delta_size/hscale bits; 4x slack tolerates truncation */
+    if ((int64_t)avctx->width * avctx->height * s->delta_size / s->hscale > buf_size * 32LL)
+        return AVERROR_INVALIDDATA;
+
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
+        return ret;
+
+    skip_bits(gb, 32); /* 32 payload bits precede the plane data and are ignored */
+    delta_mode = s->delta_size - 2;
+    dst = p->data[0]; /* Y: running row delta + sample above (0 on the first row) */
+    for (y = 0; y < avctx->height; y++) {
+        int diff = 0;
+        for (x = 0; x < avctx->width; x += s->hscale) {
+            diff  += delta_tabs[delta_mode][get_bits(gb, s->delta_size)];
+            dst[x] = av_clip_uint8((y ? dst[x - p->linesize[0]] : 0) + diff);
+        }
+        dst += p->linesize[0];
+    }
+
+    if (s->hscale > 1) { /* fill the skipped columns from their left neighbour */
+        dst = p->data[0];
+        for (y = 0; y < avctx->height; y++) {
+            for (x = 1; x < avctx->width; x += s->hscale)
+                dst[x] = dst[x - 1];
+            dst += p->linesize[0];
+        }
+    }
+
+    dst = p->data[0]; /* Y: widen the deviation from 128 by about 4/3, clipped */
+    for (y = 0; y < avctx->height; y++) {
+        for (x = 0; x < avctx->width; x++)
+            dst[x] = av_clip_uint8(dst[x] + (dst[x] - 128) / 3);
+        dst += p->linesize[0];
+    }
+
+    dst = p->data[1]; /* U: same scheme at quarter size, first row predicted from 128 */
+    for (y = 0; y < avctx->height >> 2; y++) {
+        int diff = 0;
+        for (x = 0; x < avctx->width >> 2; x += s->hscale) {
+            diff  += delta_tabs[delta_mode][get_bits(gb, s->delta_size)];
+            dst[x] = av_clip_uint8((y ? dst[x - p->linesize[1]] : 128) + diff);
+        }
+        dst += p->linesize[1];
+    }
+
+    if (s->hscale > 1) { /* fill the skipped U columns */
+        dst = p->data[1];
+        for (y = 0; y < avctx->height >> 2; y++) {
+            for (x = 1; x < avctx->width >> 2; x += s->hscale)
+                dst[x] = dst[x - 1];
+            dst += p->linesize[1];
+        }
+    }
+
+    dst = p->data[1]; /* U: widen the deviation from 128 by about 9/8 (not clipped) */
+    for (y = 0; y < avctx->height >> 2; y++) {
+        for (x = 0; x < avctx->width >> 2; x++)
+            dst[x] += (dst[x] - 128) / 8;
+        dst += p->linesize[1];
+    }
+
+    dst = p->data[2]; /* V: decoded exactly like U */
+    for (y = 0; y < avctx->height >> 2; y++) {
+        int diff = 0;
+        for (x = 0; x < avctx->width >> 2; x += s->hscale) {
+            diff  += delta_tabs[delta_mode][get_bits(gb, s->delta_size)];
+            dst[x] = av_clip_uint8((y ? dst[x - p->linesize[2]] : 128) + diff);
+        }
+        dst += p->linesize[2];
+    }
+
+    if (s->hscale > 1) { /* fill the skipped V columns */
+        dst = p->data[2];
+        for (y = 0; y < avctx->height >> 2; y++) {
+            for (x = 1; x < avctx->width >> 2; x += s->hscale)
+                dst[x] = dst[x - 1];
+            dst += p->linesize[2];
+        }
+    }
+
+    dst = p->data[2]; /* V: widen the deviation from 128 by about 9/8 (not clipped) */
+    for (y = 0; y < avctx->height >> 2; y++) {
+        for (x = 0; x < avctx->width >> 2; x++)
+            dst[x] += (dst[x] - 128) / 8;
+        dst += p->linesize[2];
+    }
+
+    p->pict_type = AV_PICTURE_TYPE_I;
+    p->key_frame = 1;
+    *got_frame   = 1;
+
+    return buf_size;
+}
+
+AVCodec ff_truemotion2rt_decoder = { /* decoder registration; no .close callback is set */
+    .name           = "truemotion2rt",
+    .long_name      = NULL_IF_CONFIG_SMALL("Duck TrueMotion 2.0 Real Time"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_TRUEMOTION2RT,
+    .priv_data_size = sizeof(TrueMotion2RTContext),
+    .init           = decode_init,
+    .decode         = decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1, /* decode_frame obtains its frame via ff_get_buffer() */
+};
diff --git a/libavcodec/truespeech.c b/libavcodec/truespeech.c
index b2195ba..d4ddfcb 100644
--- a/libavcodec/truespeech.c
+++ b/libavcodec/truespeech.c
@@ -2,20 +2,20 @@
  * DSP Group TrueSpeech compatible decoder
  * Copyright (c) 2005 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -325,10 +325,8 @@ static int truespeech_decode_frame(AVCodecContext *avctx, void *data,
 
     /* get output buffer */
     frame->nb_samples = iterations * 240;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples = (int16_t *)frame->data[0];
 
     memset(samples, 0, iterations * 240 * sizeof(*samples));
diff --git a/libavcodec/truespeech_data.h b/libavcodec/truespeech_data.h
index 6e9806a..73ebda5 100644
--- a/libavcodec/truespeech_data.h
+++ b/libavcodec/truespeech_data.h
@@ -2,20 +2,20 @@
  * DSP Group TrueSpeech compatible decoder
  * copyright (c) 2005 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/tscc.c b/libavcodec/tscc.c
index 7c54473..bd5fe03 100644
--- a/libavcodec/tscc.c
+++ b/libavcodec/tscc.c
@@ -2,20 +2,20 @@
  * TechSmith Camtasia decoder
  * Copyright (c) 2004 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -46,6 +46,7 @@
 typedef struct TsccContext {
 
     AVCodecContext *avctx;
+    AVFrame *frame;
 
     // Bits per pixel
     int bpp;
@@ -66,13 +67,11 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     CamtasiaContext * const c = avctx->priv_data;
-    AVFrame *frame = data;
+    AVFrame *frame = c->frame;
     int ret;
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0){
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, frame)) < 0)
         return ret;
-    }
 
     ret = inflateReset(&c->zstream);
     if (ret != Z_OK) {
@@ -108,6 +107,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         memcpy(frame->data[1], c->pal, AVPALETTE_SIZE);
     }
 
+    if ((ret = av_frame_ref(data, frame)) < 0)
+        return ret;
     *got_frame      = 1;
 
     /* always report that the buffer was completely consumed */
@@ -131,9 +132,9 @@ static av_cold int decode_init(AVCodecContext *avctx)
     case 24:
              avctx->pix_fmt = AV_PIX_FMT_BGR24;
              break;
-    case 32: avctx->pix_fmt = AV_PIX_FMT_RGB32; break;
+    case 32: avctx->pix_fmt = AV_PIX_FMT_0RGB32; break;
     default: av_log(avctx, AV_LOG_ERROR, "Camtasia error: unknown depth %i bpp\n", avctx->bits_per_coded_sample);
-             return AVERROR_INVALIDDATA;
+             return AVERROR_PATCHWELCOME;
     }
     c->bpp = avctx->bits_per_coded_sample;
     // buffer size for RLE 'best' case when 2-byte code precedes each pixel and there may be padding after it too
@@ -156,6 +157,14 @@ static av_cold int decode_init(AVCodecContext *avctx)
         return AVERROR_UNKNOWN;
     }
 
+    c->frame = av_frame_alloc();
+    if (!c->frame) {
+        /* decode_frame() hands c->frame to ff_reget_buffer() unconditionally */
+        inflateEnd(&c->zstream);
+        av_freep(&c->decomp_buf);
+        return AVERROR(ENOMEM);
+    }
+
     return 0;
 }
 
@@ -164,6 +173,7 @@ static av_cold int decode_end(AVCodecContext *avctx)
     CamtasiaContext * const c = avctx->priv_data;
 
     av_freep(&c->decomp_buf);
+    av_frame_free(&c->frame);
 
     inflateEnd(&c->zstream);
 
diff --git a/libavcodec/tscc2.c b/libavcodec/tscc2.c
index e830bf9..69a6fac 100644
--- a/libavcodec/tscc2.c
+++ b/libavcodec/tscc2.c
@@ -2,20 +2,20 @@
  * TechSmith Screen Codec 2 (aka Dora) decoder
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -91,14 +91,14 @@ static av_cold int init_vlcs(TSCC2Context *c)
     return 0;
 }
 
-#define DEQUANT(val, q) ((q * val + 0x80) >> 8)
+#define DEQUANT(val, q) (((q) * (val) + 0x80) >> 8)
 #define DCT1D(d0, d1, d2, d3, s0, s1, s2, s3, OP) \
     OP(d0, 5 * ((s0) + (s1) + (s2)) + 2 * (s3));  \
     OP(d1, 5 * ((s0) - (s2) - (s3)) + 2 * (s1));  \
     OP(d2, 5 * ((s0) - (s2) + (s3)) - 2 * (s1));  \
     OP(d3, 5 * ((s0) - (s1) + (s2)) - 2 * (s3));  \
 
-#define COL_OP(a, b)  a = b
+#define COL_OP(a, b)  a = (b)
 #define ROW_OP(a, b)  a = ((b) + 0x20) >> 6
 
 static void tscc2_idct4_put(int *in, int q[3], uint8_t *dst, int stride)
@@ -195,7 +195,8 @@ static int tscc2_decode_slice(TSCC2Context *c, int mb_y,
     int i, mb_x, q, ret;
     int off;
 
-    init_get_bits(&c->gb, buf, buf_size * 8);
+    if ((ret = init_get_bits8(&c->gb, buf, buf_size)) < 0)
+        return ret;
 
     for (mb_x = 0; mb_x < c->mb_width; mb_x++) {
         q = c->slice_quants[mb_x + c->mb_width * mb_y];
@@ -235,7 +236,6 @@ static int tscc2_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     if ((ret = ff_reget_buffer(avctx, c->pic)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
         return ret;
     }
 
diff --git a/libavcodec/tscc2data.h b/libavcodec/tscc2data.h
index ac0a898..7cd6f52 100644
--- a/libavcodec/tscc2data.h
+++ b/libavcodec/tscc2data.h
@@ -2,25 +2,25 @@
  * TechSmith Screen Codec 2 (aka Dora) decoder
  * Copyright (c) 2012 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVCODEC_TSCC2_DATA_H
-#define AVCODEC_TSCC2_DATA_H
+#ifndef AVCODEC_TSCC2DATA_H
+#define AVCODEC_TSCC2DATA_H
 
 #include <stdint.h>
 
@@ -925,4 +925,4 @@ static const uint8_t *tscc2_ac_vlc_bits[NUM_VLC_SETS] = {
     ac_vlc_descC_bits,
 };
 
-#endif /* AVCODEC_TSCC2_DATA_H */
+#endif /* AVCODEC_TSCC2DATA_H */
diff --git a/libavcodec/tta.c b/libavcodec/tta.c
index 7399827..1e2e9c4 100644
--- a/libavcodec/tta.c
+++ b/libavcodec/tta.c
@@ -2,20 +2,20 @@
  * TTA (The Lossless True Audio) decoder
  * Copyright (c) 2006 Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,35 +29,23 @@
 
 #define BITSTREAM_READER_LE
 #include <limits.h>
+#include "ttadata.h"
+#include "ttadsp.h"
 #include "avcodec.h"
 #include "get_bits.h"
+#include "thread.h"
+#include "unary.h"
 #include "internal.h"
 #include "libavutil/crc.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/opt.h"
 
 #define FORMAT_SIMPLE    1
 #define FORMAT_ENCRYPTED 2
 
-#define MAX_ORDER 16
-typedef struct TTAFilter {
-    int32_t shift, round, error;
-    int32_t qm[MAX_ORDER];
-    int32_t dx[MAX_ORDER];
-    int32_t dl[MAX_ORDER];
-} TTAFilter;
-
-typedef struct TTARice {
-    uint32_t k0, k1, sum0, sum1;
-} TTARice;
-
-typedef struct TTAChannel {
-    int32_t predictor;
-    TTAFilter filter;
-    TTARice rice;
-} TTAChannel;
-
 typedef struct TTAContext {
+    AVClass *class;
     AVCodecContext *avctx;
-    GetBitContext gb;
     const AVCRC *crc_table;
 
     int format, channels, bps;
@@ -66,128 +54,65 @@ typedef struct TTAContext {
 
     int32_t *decode_buffer;
 
+    uint8_t crc_pass[8];
+    uint8_t *pass;
     TTAChannel *ch_ctx;
+    TTADSPContext dsp;
 } TTAContext;
 
-static const uint32_t shift_1[] = {
-    0x00000001, 0x00000002, 0x00000004, 0x00000008,
-    0x00000010, 0x00000020, 0x00000040, 0x00000080,
-    0x00000100, 0x00000200, 0x00000400, 0x00000800,
-    0x00001000, 0x00002000, 0x00004000, 0x00008000,
-    0x00010000, 0x00020000, 0x00040000, 0x00080000,
-    0x00100000, 0x00200000, 0x00400000, 0x00800000,
-    0x01000000, 0x02000000, 0x04000000, 0x08000000,
-    0x10000000, 0x20000000, 0x40000000, 0x80000000,
-    0x80000000, 0x80000000, 0x80000000, 0x80000000,
-    0x80000000, 0x80000000, 0x80000000, 0x80000000
-};
-
-static const uint32_t * const shift_16 = shift_1 + 4;
-
-static const int32_t ttafilter_configs[4] = {
-    10,
-    9,
-    10,
-    12
+static const int64_t tta_channel_layouts[7] = { /* indexed by channel count - 2 (2..8 channels) */
+    AV_CH_LAYOUT_STEREO,
+    AV_CH_LAYOUT_STEREO|AV_CH_LOW_FREQUENCY,
+    AV_CH_LAYOUT_QUAD,
+    0, /* 5 channels: no layout assigned */
+    AV_CH_LAYOUT_5POINT1_BACK,
+    AV_CH_LAYOUT_5POINT1_BACK|AV_CH_BACK_CENTER,
+    AV_CH_LAYOUT_7POINT1_WIDE
 };
 
-static void ttafilter_init(TTAFilter *c, int32_t shift) {
-    memset(c, 0, sizeof(TTAFilter));
-    c->shift = shift;
-   c->round = shift_1[shift-1];
-//    c->round = 1 << (shift - 1);
-}
-
-// FIXME: copy paste from original
-static inline void memshl(register int32_t *a, register int32_t *b) {
-    *a++ = *b++;
-    *a++ = *b++;
-    *a++ = *b++;
-    *a++ = *b++;
-    *a++ = *b++;
-    *a++ = *b++;
-    *a++ = *b++;
-    *a = *b;
-}
-
-static inline void ttafilter_process(TTAFilter *c, int32_t *in)
+static int tta_check_crc(TTAContext *s, const uint8_t *buf, int buf_size)
 {
-    register int32_t *dl = c->dl, *qm = c->qm, *dx = c->dx, sum = c->round;
-
-    if (!c->error) {
-        sum += *dl++ * *qm, qm++;
-        sum += *dl++ * *qm, qm++;
-        sum += *dl++ * *qm, qm++;
-        sum += *dl++ * *qm, qm++;
-        sum += *dl++ * *qm, qm++;
-        sum += *dl++ * *qm, qm++;
-        sum += *dl++ * *qm, qm++;
-        sum += *dl++ * *qm, qm++;
-        dx += 8;
-    } else if(c->error < 0) {
-        sum += *dl++ * (*qm -= *dx++), qm++;
-        sum += *dl++ * (*qm -= *dx++), qm++;
-        sum += *dl++ * (*qm -= *dx++), qm++;
-        sum += *dl++ * (*qm -= *dx++), qm++;
-        sum += *dl++ * (*qm -= *dx++), qm++;
-        sum += *dl++ * (*qm -= *dx++), qm++;
-        sum += *dl++ * (*qm -= *dx++), qm++;
-        sum += *dl++ * (*qm -= *dx++), qm++;
-    } else {
-        sum += *dl++ * (*qm += *dx++), qm++;
-        sum += *dl++ * (*qm += *dx++), qm++;
-        sum += *dl++ * (*qm += *dx++), qm++;
-        sum += *dl++ * (*qm += *dx++), qm++;
-        sum += *dl++ * (*qm += *dx++), qm++;
-        sum += *dl++ * (*qm += *dx++), qm++;
-        sum += *dl++ * (*qm += *dx++), qm++;
-        sum += *dl++ * (*qm += *dx++), qm++;
-    }
-
-    *(dx-0) = ((*(dl-1) >> 30) | 1) << 2;
-    *(dx-1) = ((*(dl-2) >> 30) | 1) << 1;
-    *(dx-2) = ((*(dl-3) >> 30) | 1) << 1;
-    *(dx-3) = ((*(dl-4) >> 30) | 1);
-
-    c->error = *in;
-    *in += (sum >> c->shift);
-    *dl = *in;
-
-    *(dl-1) = *dl - *(dl-1);
-    *(dl-2) = *(dl-1) - *(dl-2);
-    *(dl-3) = *(dl-2) - *(dl-3);
+    uint32_t crc, CRC;
 
-    memshl(c->dl, c->dl + 1);
-    memshl(c->dx, c->dx + 1);
-}
+    CRC = AV_RL32(buf + buf_size);
+    crc = av_crc(s->crc_table, 0xFFFFFFFFU, buf, buf_size);
+    if (CRC != (crc ^ 0xFFFFFFFFU)) {
+        av_log(s->avctx, AV_LOG_ERROR, "CRC error\n");
+        return AVERROR_INVALIDDATA;
+    }
 
-static void rice_init(TTARice *c, uint32_t k0, uint32_t k1)
-{
-    c->k0 = k0;
-    c->k1 = k1;
-    c->sum0 = shift_16[k0];
-    c->sum1 = shift_16[k1];
+    return 0;
 }
 
-static int tta_get_unary(GetBitContext *gb)
+static uint64_t tta_check_crc64(uint8_t *pass) /* CRC-64 (poly 0x42F0E1EBA9EA3693, MSB first) over the NUL-terminated password */
 {
-    int ret = 0;
+    uint64_t crc = UINT64_MAX, poly = 0x42F0E1EBA9EA3693U;
+    uint8_t *end = pass + strlen((const char *)pass); /* strlen() takes char *, not uint8_t * */
+    int i;
+
+    while (pass < end) {
+        crc ^= (uint64_t)*pass++ << 56; /* feed the next byte into the top 8 bits */
+        for (i = 0; i < 8; i++)
+            crc = (crc << 1) ^ (poly & (((int64_t) crc) >> 63)); /* XOR in poly only when the top bit was set */
+    }
 
-    // count ones
-    while (get_bits_left(gb) > 0 && get_bits1(gb))
-        ret++;
-    return ret;
+    return crc ^ UINT64_MAX;
 }
 
-static int tta_check_crc(TTAContext *s, const uint8_t *buf, int buf_size)
+static int allocate_buffers(AVCodecContext *avctx)
 {
-    uint32_t crc, CRC;
+    TTAContext *s = avctx->priv_data;
 
-    CRC = AV_RL32(buf + buf_size);
-    crc = av_crc(s->crc_table, 0xFFFFFFFFU, buf, buf_size);
-    if (CRC != (crc ^ 0xFFFFFFFFU)) {
-        av_log(s->avctx, AV_LOG_ERROR, "CRC error\n");
-        return AVERROR_INVALIDDATA;
+    if (s->bps < 3) {
+        s->decode_buffer = av_mallocz_array(sizeof(int32_t)*s->frame_length, s->channels);
+        if (!s->decode_buffer)
+            return AVERROR(ENOMEM);
+    } else
+        s->decode_buffer = NULL;
+    s->ch_ctx = av_malloc_array(avctx->channels, sizeof(*s->ch_ctx));
+    if (!s->ch_ctx) {
+        av_freep(&s->decode_buffer);
+        return AVERROR(ENOMEM);
     }
 
     return 0;
@@ -196,58 +121,63 @@ static int tta_check_crc(TTAContext *s, const uint8_t *buf, int buf_size)
 static av_cold int tta_decode_init(AVCodecContext * avctx)
 {
     TTAContext *s = avctx->priv_data;
+    GetBitContext gb;
     int total_frames;
+    int ret;
 
     s->avctx = avctx;
 
-    // 30bytes includes a seektable with one frame
-    if (avctx->extradata_size < 30)
-        return -1;
+    // 22 bytes: the TTA1 header, including its trailing CRC32
+    if (avctx->extradata_size < 22)
+        return AVERROR_INVALIDDATA;
 
-    init_get_bits(&s->gb, avctx->extradata, avctx->extradata_size * 8);
-    if (show_bits_long(&s->gb, 32) == AV_RL32("TTA1"))
-    {
-        if (avctx->err_recognition & AV_EF_CRCCHECK) {
-            s->crc_table = av_crc_get_table(AV_CRC_32_IEEE_LE);
-            tta_check_crc(s, avctx->extradata, 18);
-        }
+    s->crc_table = av_crc_get_table(AV_CRC_32_IEEE_LE);
+    ret = init_get_bits8(&gb, avctx->extradata, avctx->extradata_size);
+    if (ret < 0)
+        return ret;
 
+    if (show_bits_long(&gb, 32) == AV_RL32("TTA1")) {
         /* signature */
-        skip_bits_long(&s->gb, 32);
+        skip_bits_long(&gb, 32);
 
-        s->format = get_bits(&s->gb, 16);
+        s->format = get_bits(&gb, 16);
         if (s->format > 2) {
-            av_log(s->avctx, AV_LOG_ERROR, "Invalid format\n");
-            return -1;
+            av_log(avctx, AV_LOG_ERROR, "Invalid format\n");
+            return AVERROR_INVALIDDATA;
         }
         if (s->format == FORMAT_ENCRYPTED) {
-            avpriv_report_missing_feature(s->avctx, "Encrypted TTA");
-            return AVERROR_PATCHWELCOME;
+            if (!s->pass) {
+                av_log(avctx, AV_LOG_ERROR, "Missing password for encrypted stream. Please use the -password option\n");
+                return AVERROR(EINVAL);
+            }
+            AV_WL64(s->crc_pass, tta_check_crc64(s->pass));
         }
-        avctx->channels = s->channels = get_bits(&s->gb, 16);
-        avctx->bits_per_coded_sample = get_bits(&s->gb, 16);
-        s->bps = (avctx->bits_per_coded_sample + 7) / 8;
-        avctx->sample_rate = get_bits_long(&s->gb, 32);
-        s->data_length = get_bits_long(&s->gb, 32);
-        skip_bits_long(&s->gb, 32); // CRC32 of header
+        avctx->channels = s->channels = get_bits(&gb, 16);
+        if (s->channels > 1 && s->channels < 9)
+            avctx->channel_layout = tta_channel_layouts[s->channels-2];
+        avctx->bits_per_raw_sample = get_bits(&gb, 16);
+        s->bps = (avctx->bits_per_raw_sample + 7) / 8;
+        avctx->sample_rate = get_bits_long(&gb, 32);
+        s->data_length = get_bits_long(&gb, 32);
+        skip_bits_long(&gb, 32); // CRC32 of header
 
         if (s->channels == 0) {
-            av_log(s->avctx, AV_LOG_ERROR, "Invalid number of channels\n");
+            av_log(avctx, AV_LOG_ERROR, "Invalid number of channels\n");
             return AVERROR_INVALIDDATA;
         } else if (avctx->sample_rate == 0) {
-            av_log(s->avctx, AV_LOG_ERROR, "Invalid samplerate\n");
+            av_log(avctx, AV_LOG_ERROR, "Invalid samplerate\n");
             return AVERROR_INVALIDDATA;
         }
 
         switch(s->bps) {
+        case 1: avctx->sample_fmt = AV_SAMPLE_FMT_U8; break;
         case 2:
             avctx->sample_fmt = AV_SAMPLE_FMT_S16;
-            avctx->bits_per_raw_sample = 16;
             break;
         case 3:
             avctx->sample_fmt = AV_SAMPLE_FMT_S32;
-            avctx->bits_per_raw_sample = 24;
             break;
+        //case 4: avctx->sample_fmt = AV_SAMPLE_FMT_S32; break;
         default:
             av_log(avctx, AV_LOG_ERROR, "Invalid/unsupported sample format.\n");
             return AVERROR_INVALIDDATA;
@@ -264,54 +194,35 @@ static av_cold int tta_decode_init(AVCodecContext * avctx)
         total_frames = s->data_length / s->frame_length +
                        (s->last_frame_length ? 1 : 0);
 
-        av_log(s->avctx, AV_LOG_DEBUG, "format: %d chans: %d bps: %d rate: %d block: %d\n",
+        av_log(avctx, AV_LOG_DEBUG, "format: %d chans: %d bps: %d rate: %d block: %d\n",
             s->format, avctx->channels, avctx->bits_per_coded_sample, avctx->sample_rate,
             avctx->block_align);
-        av_log(s->avctx, AV_LOG_DEBUG, "data_length: %d frame_length: %d last: %d total: %d\n",
+        av_log(avctx, AV_LOG_DEBUG, "data_length: %d frame_length: %d last: %d total: %d\n",
             s->data_length, s->frame_length, s->last_frame_length, total_frames);
 
-        // FIXME: seek table
-        if (avctx->extradata_size <= 26 || total_frames > INT_MAX / 4 ||
-            avctx->extradata_size - 26 < total_frames * 4)
-            av_log(avctx, AV_LOG_WARNING, "Seek table missing or too small\n");
-        else if (avctx->err_recognition & AV_EF_CRCCHECK) {
-            int ret = tta_check_crc(s, avctx->extradata + 22, total_frames * 4);
-            if (ret < 0 && avctx->err_recognition & AV_EF_EXPLODE)
-                return AVERROR_INVALIDDATA;
-        }
-        skip_bits_long(&s->gb, 32 * total_frames);
-        skip_bits_long(&s->gb, 32); // CRC32 of seektable
-
         if(s->frame_length >= UINT_MAX / (s->channels * sizeof(int32_t))){
             av_log(avctx, AV_LOG_ERROR, "frame_length too large\n");
-            return -1;
-        }
-
-        if (s->bps == 2) {
-            s->decode_buffer = av_mallocz(sizeof(int32_t)*s->frame_length*s->channels);
-            if (!s->decode_buffer)
-                return AVERROR(ENOMEM);
-        }
-        s->ch_ctx = av_malloc(avctx->channels * sizeof(*s->ch_ctx));
-        if (!s->ch_ctx) {
-            av_freep(&s->decode_buffer);
-            return AVERROR(ENOMEM);
+            return AVERROR_INVALIDDATA;
         }
     } else {
         av_log(avctx, AV_LOG_ERROR, "Wrong extradata present\n");
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
 
-    return 0;
+    ff_ttadsp_init(&s->dsp);
+
+    return allocate_buffers(avctx);
 }
 
 static int tta_decode_frame(AVCodecContext *avctx, void *data,
                             int *got_frame_ptr, AVPacket *avpkt)
 {
     AVFrame *frame     = data;
+    ThreadFrame tframe = { .f = data };
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     TTAContext *s = avctx->priv_data;
+    GetBitContext gb;
     int i, ret;
     int cur_chan = 0, framelen = s->frame_length;
     int32_t *p;
@@ -322,14 +233,13 @@ static int tta_decode_frame(AVCodecContext *avctx, void *data,
             return AVERROR_INVALIDDATA;
     }
 
-    init_get_bits(&s->gb, buf, buf_size*8);
+    if ((ret = init_get_bits8(&gb, avpkt->data, avpkt->size)) < 0)
+        return ret;
 
     /* get output buffer */
     frame->nb_samples = framelen;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_thread_get_buffer(avctx, &tframe, 0)) < 0)
         return ret;
-    }
 
     // decode directly to output buffer for 24-bit sample format
     if (s->bps == 3)
@@ -337,9 +247,15 @@ static int tta_decode_frame(AVCodecContext *avctx, void *data,
 
     // init per channel states
     for (i = 0; i < s->channels; i++) {
+        TTAFilter *filter = &s->ch_ctx[i].filter;
         s->ch_ctx[i].predictor = 0;
-        ttafilter_init(&s->ch_ctx[i].filter, ttafilter_configs[s->bps-1]);
-        rice_init(&s->ch_ctx[i].rice, 10, 10);
+        ff_tta_filter_init(filter, ff_tta_filter_configs[s->bps-1]);
+        if (s->format == FORMAT_ENCRYPTED) {
+            int i;
+            for (i = 0; i < 8; i++)
+                filter->qm[i] = sign_extend(s->crc_pass[i], 8);
+        }
+        ff_tta_rice_init(&s->ch_ctx[i].rice, 10, 10);
     }
 
     i = 0;
@@ -350,7 +266,7 @@ static int tta_decode_frame(AVCodecContext *avctx, void *data,
         uint32_t unary, depth, k;
         int32_t value;
 
-        unary = tta_get_unary(&s->gb);
+        unary = get_unary(&gb, 0, get_bits_left(&gb));
 
         if (unary == 0) {
             depth = 0;
@@ -361,7 +277,7 @@ static int tta_decode_frame(AVCodecContext *avctx, void *data,
             unary--;
         }
 
-        if (get_bits_left(&s->gb) < k) {
+        if (get_bits_left(&gb) < k) {
             ret = AVERROR_INVALIDDATA;
             goto error;
         }
@@ -371,7 +287,7 @@ static int tta_decode_frame(AVCodecContext *avctx, void *data,
                 ret = AVERROR_INVALIDDATA;
                 goto error;
             }
-            value = (unary << k) + get_bits(&s->gb, k);
+            value = (unary << k) + get_bits(&gb, k);
         } else
             value = unary;
 
@@ -379,16 +295,16 @@ static int tta_decode_frame(AVCodecContext *avctx, void *data,
         switch (depth) {
         case 1:
             rice->sum1 += value - (rice->sum1 >> 4);
-            if (rice->k1 > 0 && rice->sum1 < shift_16[rice->k1])
+            if (rice->k1 > 0 && rice->sum1 < ff_tta_shift_16[rice->k1])
                 rice->k1--;
-            else if(rice->sum1 > shift_16[rice->k1 + 1])
+            else if(rice->sum1 > ff_tta_shift_16[rice->k1 + 1])
                 rice->k1++;
-            value += shift_1[rice->k0];
+            value += ff_tta_shift_1[rice->k0];
         default:
             rice->sum0 += value - (rice->sum0 >> 4);
-            if (rice->k0 > 0 && rice->sum0 < shift_16[rice->k0])
+            if (rice->k0 > 0 && rice->sum0 < ff_tta_shift_16[rice->k0])
                 rice->k0--;
-            else if(rice->sum0 > shift_16[rice->k0 + 1])
+            else if(rice->sum0 > ff_tta_shift_16[rice->k0 + 1])
                 rice->k0++;
         }
 
@@ -396,10 +312,11 @@ static int tta_decode_frame(AVCodecContext *avctx, void *data,
         *p = 1 + ((value >> 1) ^ ((value & 1) - 1));
 
         // run hybrid filter
-        ttafilter_process(filter, p);
+        s->dsp.ttafilter_process_dec(filter->qm, filter->dx, filter->dl, &filter->error, p,
+                                     filter->shift, filter->round);
 
         // fixed order prediction
-#define PRED(x, k) (int32_t)((((uint64_t)x << k) - x) >> k)
+#define PRED(x, k) (int32_t)((((uint64_t)(x) << (k)) - (x)) >> (k))
         switch (s->bps) {
         case 1: *p += PRED(*predictor, 4); break;
         case 2:
@@ -421,32 +338,43 @@ static int tta_decode_frame(AVCodecContext *avctx, void *data,
             cur_chan = 0;
             i++;
             // check for last frame
-            if (i == s->last_frame_length && get_bits_left(&s->gb) / 8 == 4) {
+            if (i == s->last_frame_length && get_bits_left(&gb) / 8 == 4) {
                 frame->nb_samples = framelen = s->last_frame_length;
                 break;
             }
         }
     }
 
-    align_get_bits(&s->gb);
-    if (get_bits_left(&s->gb) < 32) {
+    align_get_bits(&gb);
+    if (get_bits_left(&gb) < 32) {
         ret = AVERROR_INVALIDDATA;
         goto error;
     }
-    skip_bits_long(&s->gb, 32); // frame crc
+    skip_bits_long(&gb, 32); // frame crc
 
     // convert to output buffer
-    if (s->bps == 2) {
+    switch (s->bps) {
+    case 1: {
+        uint8_t *samples = (uint8_t *)frame->data[0];
+        for (p = s->decode_buffer; p < s->decode_buffer + (framelen * s->channels); p++)
+            *samples++ = *p + 0x80;
+        break;
+        }
+    case 2: {
         int16_t *samples = (int16_t *)frame->data[0];
         for (p = s->decode_buffer; p < s->decode_buffer + (framelen * s->channels); p++)
             *samples++ = *p;
-    } else {
+        break;
+        }
+    case 3: {
         // shift samples for 24-bit sample format
         int32_t *samples = (int32_t *)frame->data[0];
         for (i = 0; i < framelen * s->channels; i++)
             *samples++ <<= 8;
         // reset decode buffer
         s->decode_buffer = NULL;
+        break;
+        }
     }
 
     *got_frame_ptr = 1;
@@ -459,15 +387,38 @@ error:
     return ret;
 }
 
+static int init_thread_copy(AVCodecContext *avctx)
+{
+    TTAContext *ctx = avctx->priv_data;
+    ctx->avctx = avctx; /* point the context at the AVCodecContext it belongs to */
+    return allocate_buffers(avctx); /* sets up decode_buffer / ch_ctx for this context */
+}
+
 static av_cold int tta_decode_close(AVCodecContext *avctx) {
     TTAContext *s = avctx->priv_data;
 
-    av_free(s->decode_buffer);
+    if (s->bps < 3)
+        av_freep(&s->decode_buffer);
+    s->decode_buffer = NULL;
     av_freep(&s->ch_ctx);
 
     return 0;
 }
 
+#define OFFSET(x) offsetof(TTAContext, x)
+#define DEC (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM)
+static const AVOption options[] = {
+    { "password", "Set decoding password", OFFSET(pass), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, DEC }, /* written to TTAContext.pass; tta_decode_init() rejects FORMAT_ENCRYPTED streams without it */
+    { NULL },
+};
+
+static const AVClass tta_decoder_class = { /* referenced by ff_tta_decoder.priv_class */
+    .class_name = "TTA Decoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_tta_decoder = {
     .name           = "tta",
     .long_name      = NULL_IF_CONFIG_SMALL("TTA (True Audio)"),
@@ -477,5 +428,7 @@ AVCodec ff_tta_decoder = {
     .init           = tta_decode_init,
     .close          = tta_decode_close,
     .decode         = tta_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(init_thread_copy),
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
+    .priv_class     = &tta_decoder_class,
 };
diff --git a/libavcodec/ttadata.c b/libavcodec/ttadata.c
new file mode 100644
index 0000000..bf793a4
--- /dev/null
+++ b/libavcodec/ttadata.c
@@ -0,0 +1,52 @@
+/*
+ * TTA (The Lossless True Audio) data
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "ttadata.h"
+
+const uint32_t ff_tta_shift_1[] = { /* entry i holds 1 << i for i = 0..31 */
+    0x00000001, 0x00000002, 0x00000004, 0x00000008,
+    0x00000010, 0x00000020, 0x00000040, 0x00000080,
+    0x00000100, 0x00000200, 0x00000400, 0x00000800,
+    0x00001000, 0x00002000, 0x00004000, 0x00008000,
+    0x00010000, 0x00020000, 0x00040000, 0x00080000,
+    0x00100000, 0x00200000, 0x00400000, 0x00800000,
+    0x01000000, 0x02000000, 0x04000000, 0x08000000,
+    0x10000000, 0x20000000, 0x40000000, 0x80000000,
+    0x80000000, 0x80000000, 0x80000000, 0x80000000, /* indices 32..39 stay at the top bit */
+    0x80000000, 0x80000000, 0x80000000, 0x80000000
+};
+
+const uint32_t * const ff_tta_shift_16 = ff_tta_shift_1 + 4; /* ff_tta_shift_16[k] == ff_tta_shift_1[k + 4], i.e. 16 << k for k <= 27 */
+
+const uint8_t ff_tta_filter_configs[] = { 10, 9, 10, 12 }; /* filter shift values; the encoder picks entry bps - 1 */
+
+void ff_tta_rice_init(TTARice *c, uint32_t k0, uint32_t k1)
+{
+    /* Both adaptive Rice states start from the given parameter, with the
+     * running sum preset to the matching ff_tta_shift_16 entry. */
+    *c = (TTARice) { .k0   = k0,                  .k1   = k1,
+                     .sum0 = ff_tta_shift_16[k0], .sum1 = ff_tta_shift_16[k1] };
+}
+
+void ff_tta_filter_init(TTAFilter *c, int32_t shift) {
+    memset(c, 0, sizeof(*c));             /* clears error, qm, dx and dl */
+    c->round = ff_tta_shift_1[shift - 1]; /* bias added before the >> shift in the filter */
+    c->shift = shift;
+}
diff --git a/libavcodec/ttadata.h b/libavcodec/ttadata.h
new file mode 100644
index 0000000..48c4cd0
--- /dev/null
+++ b/libavcodec/ttadata.h
@@ -0,0 +1,50 @@
+/*
+ * TTA (The Lossless True Audio) data
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_TTADATA_H
+#define AVCODEC_TTADATA_H
+
+#include "internal.h"
+
+#define MAX_ORDER 16
+typedef struct TTAFilter {
+    int32_t shift, round, error; /* output right-shift, bias added before the shift, last residual */
+    int32_t qm[MAX_ORDER];       /* weights, moved by +/-dx according to the sign of error */
+    int32_t dx[MAX_ORDER];       /* per-tap adaptation steps, derived from dl */
+    int32_t dl[MAX_ORDER];       /* history values that are multiplied with qm */
+} TTAFilter;
+
+typedef struct TTARice {
+    uint32_t k0, k1, sum0, sum1; /* two adaptive Rice parameters and the running sums that drive them */
+} TTARice;
+
+typedef struct TTAChannel {
+    int32_t predictor; /* previous value, fed to the fixed PRED() stage of the encoder */
+    TTAFilter filter;  /* adaptive filter state */
+    TTARice rice;      /* entropy coder state */
+} TTAChannel;
+
+extern const uint32_t ff_tta_shift_1[];          /* 1 << i for i < 32 */
+extern const uint32_t * const ff_tta_shift_16;   /* ff_tta_shift_1 + 4 */
+extern const uint8_t ff_tta_filter_configs[];    /* filter shift values */
+
+void ff_tta_rice_init(TTARice *c, uint32_t k0, uint32_t k1); /* set k0/k1 and preset sum0/sum1 from ff_tta_shift_16 */
+void ff_tta_filter_init(TTAFilter *c, int32_t shift);        /* zero *c, then set shift and round */
+#endif /* AVCODEC_TTADATA_H */
diff --git a/libavcodec/ttadsp.c b/libavcodec/ttadsp.c
new file mode 100644
index 0000000..30b7ab9
--- /dev/null
+++ b/libavcodec/ttadsp.c
@@ -0,0 +1,57 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "ttadsp.h"
+
+static void ttafilter_process_dec_c(int32_t *qm, int32_t *dx, int32_t *dl,
+                                    int32_t *error, int32_t *in, int32_t shift,
+                                    int32_t round) {
+    /* One decoding step of the 8-tap adaptive filter. Sums and products are
+     * formed in uint32_t so that overflow wraps instead of being undefined. */
+    uint32_t sum = round;
+    int i;
+    if (*error < 0) {          /* steer the weights by the previous residual's sign */
+        for (i = 0; i < 8; i++)
+            qm[i] -= (uint32_t)dx[i];
+    } else if (*error > 0) {
+        for (i = 0; i < 8; i++)
+            qm[i] += (uint32_t)dx[i];
+    }
+    for (i = 0; i < 8; i++)    /* sum = round + dot(dl, qm) */
+        sum += (uint32_t)dl[i] * qm[i];
+
+    dx[0] = dx[1]; dx[1] = dx[2]; dx[2] = dx[3]; dx[3] = dx[4];
+    dl[0] = dl[1]; dl[1] = dl[2]; dl[2] = dl[3]; dl[3] = dl[4];
+    dx[4] = ((dl[4] >> 30) | 1);       /* steps of +/-1, 2, 2, 4 by sign of dl */
+    dx[5] = ((dl[5] >> 30) | 2) & ~1;
+    dx[6] = ((dl[6] >> 30) | 2) & ~1;
+    dx[7] = ((dl[7] >> 30) | 4) & ~3;
+    *error = *in;                      /* the residual is kept for the next call */
+    *in   += (uint32_t)((int32_t)sum >> shift);
+    dl[4] = -(uint32_t)dl[5]; dl[5] = -(uint32_t)dl[6];
+    dl[6] = (uint32_t)*in - dl[7]; dl[7] = *in;
+    dl[5] += (uint32_t)dl[6]; dl[4] += (uint32_t)dl[5];
+}
+
+av_cold void ff_ttadsp_init(TTADSPContext *c)
+{
+    c->ttafilter_process_dec = ttafilter_process_dec_c; /* C implementation is the default */
+    /* the x86 init receives the context afterwards, presumably to replace the pointer */
+    if (ARCH_X86)
+        ff_ttadsp_init_x86(c);
+}
diff --git a/libavcodec/ttadsp.h b/libavcodec/ttadsp.h
new file mode 100644
index 0000000..56930f1
--- /dev/null
+++ b/libavcodec/ttadsp.h
@@ -0,0 +1,34 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_TTADSP_H
+#define AVCODEC_TTADSP_H
+
+#include <stdint.h>
+#include "ttadata.h"
+
+typedef struct TTADSPContext { /* function table filled in by ff_ttadsp_init() */
+    void (*ttafilter_process_dec)(int32_t *qm, int32_t *dx, int32_t *dl,
+                                  int32_t *error, int32_t *in, int32_t shift,
+                                  int32_t round); /* one filter step on the value at *in */
+} TTADSPContext;
+
+void ff_ttadsp_init(TTADSPContext *c);
+void ff_ttadsp_init_x86(TTADSPContext *c);
+
+#endif /* AVCODEC_TTADSP_H */
diff --git a/libavcodec/ttaenc.c b/libavcodec/ttaenc.c
new file mode 100644
index 0000000..2f1c8db
--- /dev/null
+++ b/libavcodec/ttaenc.c
@@ -0,0 +1,243 @@
+/*
+ * TTA (The Lossless True Audio) encoder
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BITSTREAM_WRITER_LE
+#include "ttadata.h"
+#include "avcodec.h"
+#include "put_bits.h"
+#include "internal.h"
+#include "libavutil/crc.h"
+
+typedef struct TTAEncContext {
+    const AVCRC *crc_table; /* AV_CRC_32_IEEE_LE table, used for the CRC appended to each packet */
+    int bps;                /* bits_per_raw_sample >> 3, i.e. bytes per sample */
+    TTAChannel *ch_ctx;     /* array with one entry per channel */
+} TTAEncContext;
+
+static av_cold int tta_encode_init(AVCodecContext *avctx)
+{
+    TTAEncContext *enc = avctx->priv_data;
+
+    /* Pin the coded depth to what the input sample format carries; S32 input
+     * is always written as 24 bits. Other formats leave the field untouched. */
+    if (avctx->sample_fmt == AV_SAMPLE_FMT_U8) {
+        avctx->bits_per_raw_sample = 8;
+    } else if (avctx->sample_fmt == AV_SAMPLE_FMT_S16) {
+        avctx->bits_per_raw_sample = 16;
+    } else if (avctx->sample_fmt == AV_SAMPLE_FMT_S32) {
+        if (avctx->bits_per_raw_sample > 24)
+            av_log(avctx, AV_LOG_WARNING, "encoding as 24 bits-per-sample\n");
+        avctx->bits_per_raw_sample = 24;
+    }
+
+    enc->crc_table    = av_crc_get_table(AV_CRC_32_IEEE_LE);
+    enc->bps          = avctx->bits_per_raw_sample >> 3; /* bytes per sample */
+    /* a frame spans 256/245 of a second worth of samples (integer division) */
+    avctx->frame_size = 256 * avctx->sample_rate / 245;
+
+    /* one predictor/filter/Rice state per channel */
+    enc->ch_ctx = av_malloc_array(avctx->channels, sizeof(*enc->ch_ctx));
+    if (!enc->ch_ctx)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+static inline void ttafilter_process(TTAFilter *c, int32_t *in)
+{
+    /* One encoding step of the 8-tap adaptive filter: *in is replaced by its
+     * prediction residual. Sums and products are formed in uint32_t so that
+     * overflow wraps instead of being undefined. */
+    int32_t *dl = c->dl, *qm = c->qm, *dx = c->dx;
+    uint32_t sum = c->round;
+    int i;
+    if (c->error < 0) {        /* steer the weights by the previous residual's sign */
+        for (i = 0; i < 8; i++)
+            qm[i] -= (uint32_t)dx[i];
+    } else if (c->error > 0) {
+        for (i = 0; i < 8; i++)
+            qm[i] += (uint32_t)dx[i];
+    }
+    for (i = 0; i < 8; i++)    /* sum = round + dot(dl, qm) */
+        sum += (uint32_t)dl[i] * qm[i];
+
+    dx[0] = dx[1]; dx[1] = dx[2]; dx[2] = dx[3]; dx[3] = dx[4];
+    dl[0] = dl[1]; dl[1] = dl[2]; dl[2] = dl[3]; dl[3] = dl[4];
+    dx[4] = ((dl[4] >> 30) | 1);       /* steps of +/-1, 2, 2, 4 by sign of dl */
+    dx[5] = ((dl[5] >> 30) | 2) & ~1;
+    dx[6] = ((dl[6] >> 30) | 2) & ~1;
+    dx[7] = ((dl[7] >> 30) | 4) & ~3;
+    dl[4] = -(uint32_t)dl[5]; dl[5] = -(uint32_t)dl[6];
+    dl[6] = (uint32_t)*in - dl[7]; dl[7] = *in;   /* history takes the unfiltered input */
+    dl[5] += (uint32_t)dl[6]; dl[4] += (uint32_t)dl[5];
+    *in -= (uint32_t)((int32_t)sum >> c->shift);
+    c->error = *in;
+}
+
+static int32_t get_sample(const AVFrame *frame, int sample,
+                          enum AVSampleFormat format)
+{
+    const uint8_t *base = frame->data[0];
+
+    /* Fetch element number `sample` of plane 0 and bring it into the
+     * signed range the coder works with. */
+    switch (format) {
+    case AV_SAMPLE_FMT_U8:
+        return base[sample] - 0x80;
+    case AV_SAMPLE_FMT_S16:
+        return ((const int16_t *)base)[sample];
+    default:
+        /* everything else is read as 32-bit and loses its low 8 bits */
+        return ((const int32_t *)base)[sample] >> 8;
+    }
+}
+
+static int tta_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
+                            const AVFrame *frame, int *got_packet_ptr)
+{
+    TTAEncContext *s = avctx->priv_data;
+    PutBitContext pb;
+    int ret, i, out_bytes, cur_chan, res, samples;
+    int64_t pkt_size =  frame->nb_samples * 2LL * avctx->channels * s->bps;
+    /* Encode one frame into avpkt: returns 0 with *got_packet_ptr set, else the error from allocation. */
+pkt_alloc:
+    cur_chan = 0, res = 0, samples = 0;
+    if ((ret = ff_alloc_packet2(avctx, avpkt, pkt_size, 0)) < 0)
+        return ret;
+    init_put_bits(&pb, avpkt->data, avpkt->size);
+
+    // init per channel states
+    for (i = 0; i < avctx->channels; i++) {
+        s->ch_ctx[i].predictor = 0;
+        ff_tta_filter_init(&s->ch_ctx[i].filter, ff_tta_filter_configs[s->bps - 1]);
+        ff_tta_rice_init(&s->ch_ctx[i].rice, 10, 10);
+    }
+
+    for (i = 0; i < frame->nb_samples * avctx->channels; i++) {
+        TTAChannel *c = &s->ch_ctx[cur_chan];
+        TTAFilter *filter = &c->filter;
+        TTARice *rice = &c->rice;
+        uint32_t k, unary, outval;
+        int32_t value, temp;
+
+        value = get_sample(frame, samples++, avctx->sample_fmt);
+        /* every channel but the last codes (next channel - itself); the last one subtracts half of that difference */
+        if (avctx->channels > 1) {
+            if (cur_chan < avctx->channels - 1)
+                value  = res = get_sample(frame, samples, avctx->sample_fmt) - value;
+            else
+                value -= res / 2;
+        }
+
+        temp = value;
+#define PRED(x, k) (int32_t)((((uint64_t)(x) << (k)) - (x)) >> (k))
+        switch (s->bps) {
+        case 1: value -= PRED(c->predictor, 4); break;
+        case 2:
+        case 3: value -= PRED(c->predictor, 5); break;
+        }
+        c->predictor = temp;
+        /* adaptive filter, then fold the signed residual into an unsigned code (unsigned math, no overflow UB) */
+        ttafilter_process(filter, &value);
+        outval = (value > 0) ? ((uint32_t)value << 1) - 1 : -(uint32_t)value << 1;
+
+        k = rice->k0;
+
+        rice->sum0 += outval - (rice->sum0 >> 4);
+        if (rice->k0 > 0 && rice->sum0 < ff_tta_shift_16[rice->k0])
+            rice->k0--;
+        else if (rice->sum0 > ff_tta_shift_16[rice->k0 + 1])
+            rice->k0++;
+
+        if (outval >= ff_tta_shift_1[k]) {
+            outval -= ff_tta_shift_1[k];
+            k = rice->k1;
+
+            rice->sum1 += outval - (rice->sum1 >> 4);
+            if (rice->k1 > 0 && rice->sum1 < ff_tta_shift_16[rice->k1])
+                rice->k1--;
+            else if (rice->sum1 > ff_tta_shift_16[rice->k1 + 1])
+                rice->k1++;
+
+            unary = 1 + (outval >> k);
+            if (unary + 100LL > put_bits_left(&pb)) {
+                if (pkt_size < INT_MAX/2) {
+                    pkt_size *= 2; /* start the whole frame over with a packet twice as large */
+                    av_packet_unref(avpkt);
+                    goto pkt_alloc;
+                } else
+                    return AVERROR(ENOMEM);
+            }
+            do {
+                if (unary > 31) {
+                    put_bits(&pb, 31, 0x7FFFFFFF);
+                    unary -= 31;
+                } else {
+                    put_bits(&pb, unary, (1U << unary) - 1); /* 1U: unary can be 31 here */
+                    unary = 0;
+                }
+            } while (unary);
+        }
+
+        put_bits(&pb, 1, 0);
+
+        if (k)
+            put_bits(&pb, k, outval & (ff_tta_shift_1[k] - 1));
+
+        if (cur_chan < avctx->channels - 1)
+            cur_chan++;
+        else
+            cur_chan = 0;
+    }
+
+    flush_put_bits(&pb);
+    out_bytes = put_bits_count(&pb) >> 3;
+    put_bits32(&pb, av_crc(s->crc_table, UINT32_MAX, avpkt->data, out_bytes) ^ UINT32_MAX);
+    flush_put_bits(&pb);
+
+    avpkt->pts      = frame->pts;
+    avpkt->size     = out_bytes + 4; /* payload plus the 4 CRC bytes written above */
+    avpkt->duration = ff_samples_to_time_base(avctx, frame->nb_samples);
+    *got_packet_ptr = 1;
+    return 0;
+}
+
+static av_cold int tta_encode_close(AVCodecContext *avctx) /* .close callback of ff_tta_encoder */
+{
+    TTAEncContext *s = avctx->priv_data;
+    av_freep(&s->ch_ctx); /* releases the per-channel array allocated in tta_encode_init() */
+    return 0;
+}
+
+AVCodec ff_tta_encoder = { /* codec table entry tying the TTA encoder callbacks together */
+    .name           = "tta",
+    .long_name      = NULL_IF_CONFIG_SMALL("TTA (True Audio)"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_TTA,
+    .priv_data_size = sizeof(TTAEncContext),
+    .init           = tta_encode_init,
+    .close          = tta_encode_close,
+    .encode2        = tta_encode_frame,
+    .capabilities   = AV_CODEC_CAP_SMALL_LAST_FRAME | AV_CODEC_CAP_LOSSLESS,
+    .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_U8, /* the formats tta_encode_init() assigns a bit depth to */
+                                                     AV_SAMPLE_FMT_S16,
+                                                     AV_SAMPLE_FMT_S32,
+                                                     AV_SAMPLE_FMT_NONE },
+};
diff --git a/libavcodec/twinvq.c b/libavcodec/twinvq.c
index 940def4..7b2e19e 100644
--- a/libavcodec/twinvq.c
+++ b/libavcodec/twinvq.c
@@ -2,20 +2,20 @@
  * TwinVQ decoder
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -358,7 +358,7 @@ static void imdct_and_window(TwinVQContext *tctx, enum TwinVQFrameType ftype,
 
         mdct->imdct_half(mdct, buf1 + bsize * j, in + bsize * j);
 
-        tctx->fdsp.vector_fmul_window(out2, prev_buf + (bsize - wsize) / 2,
+        tctx->fdsp->vector_fmul_window(out2, prev_buf + (bsize - wsize) / 2,
                                       buf1 + bsize * j,
                                       ff_sine_windows[av_log2(wsize)],
                                       wsize / 2);
@@ -405,7 +405,7 @@ static void imdct_output(TwinVQContext *tctx, enum TwinVQFrameType ftype,
                size1 * sizeof(*out2));
         memcpy(out2 + size1, &tctx->curr_frame[2 * mtab->size],
                size2 * sizeof(*out2));
-        tctx->fdsp.butterflies_float(out1, out2, mtab->size);
+        tctx->fdsp->butterflies_float(out1, out2, mtab->size);
     }
 }
 
@@ -446,7 +446,7 @@ static void read_and_decode_spectrum(TwinVQContext *tctx, float *out,
                                bits->bark_use_hist[i][j], i,
                                tctx->tmp_buf, gain[sub * i + j], ftype);
 
-            tctx->fdsp.vector_fmul(chunk + block_size * j,
+            tctx->fdsp->vector_fmul(chunk + block_size * j,
                                    chunk + block_size * j,
                                    tctx->tmp_buf, block_size);
         }
@@ -461,7 +461,7 @@ static void read_and_decode_spectrum(TwinVQContext *tctx, float *out,
         dec_lpc_spectrum_inv(tctx, lsp, ftype, tctx->tmp_buf);
 
         for (j = 0; j < mtab->fmode[ftype].sub; j++) {
-            tctx->fdsp.vector_fmul(chunk, chunk, tctx->tmp_buf, block_size);
+            tctx->fdsp->vector_fmul(chunk, chunk, tctx->tmp_buf, block_size);
             chunk += block_size;
         }
     }
@@ -487,10 +487,8 @@ int ff_twinvq_decode_frame(AVCodecContext *avctx, void *data,
     /* get output buffer */
     if (tctx->discarded_packets >= 2) {
         frame->nb_samples = mtab->size * tctx->frames_per_packet;
-        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
             return ret;
-        }
         out = (float **)frame->extended_data;
     }
 
@@ -548,24 +546,24 @@ static av_cold int init_mdct_win(TwinVQContext *tctx)
             return ret;
     }
 
-    FF_ALLOC_OR_GOTO(tctx->avctx, tctx->tmp_buf,
-                     mtab->size * sizeof(*tctx->tmp_buf), alloc_fail);
+    FF_ALLOC_ARRAY_OR_GOTO(tctx->avctx, tctx->tmp_buf,
+                     mtab->size, sizeof(*tctx->tmp_buf), alloc_fail);
 
-    FF_ALLOC_OR_GOTO(tctx->avctx, tctx->spectrum,
-                     2 * mtab->size * channels * sizeof(*tctx->spectrum),
+    FF_ALLOC_ARRAY_OR_GOTO(tctx->avctx, tctx->spectrum,
+                     2 * mtab->size, channels * sizeof(*tctx->spectrum),
                      alloc_fail);
-    FF_ALLOC_OR_GOTO(tctx->avctx, tctx->curr_frame,
-                     2 * mtab->size * channels * sizeof(*tctx->curr_frame),
+    FF_ALLOC_ARRAY_OR_GOTO(tctx->avctx, tctx->curr_frame,
+                     2 * mtab->size, channels * sizeof(*tctx->curr_frame),
                      alloc_fail);
-    FF_ALLOC_OR_GOTO(tctx->avctx, tctx->prev_frame,
-                     2 * mtab->size * channels * sizeof(*tctx->prev_frame),
+    FF_ALLOC_ARRAY_OR_GOTO(tctx->avctx, tctx->prev_frame,
+                     2 * mtab->size, channels * sizeof(*tctx->prev_frame),
                      alloc_fail);
 
     for (i = 0; i < 3; i++) {
         int m       = 4 * mtab->size / mtab->fmode[i].sub;
         double freq = 2 * M_PI / m;
-        FF_ALLOC_OR_GOTO(tctx->avctx, tctx->cos_tabs[i],
-                         (m / 4) * sizeof(*tctx->cos_tabs[i]), alloc_fail);
+        FF_ALLOC_ARRAY_OR_GOTO(tctx->avctx, tctx->cos_tabs[i],
+                         (m / 4), sizeof(*tctx->cos_tabs[i]), alloc_fail);
 
         for (j = 0; j <= m / 8; j++)
             tctx->cos_tabs[i][j] = cos((2 * j + 1) * freq);
@@ -757,13 +755,14 @@ av_cold int ff_twinvq_decode_close(AVCodecContext *avctx)
 
     for (i = 0; i < 3; i++) {
         ff_mdct_end(&tctx->mdct_ctx[i]);
-        av_free(tctx->cos_tabs[i]);
+        av_freep(&tctx->cos_tabs[i]);
     }
 
-    av_free(tctx->curr_frame);
-    av_free(tctx->spectrum);
-    av_free(tctx->prev_frame);
-    av_free(tctx->tmp_buf);
+    av_freep(&tctx->curr_frame);
+    av_freep(&tctx->spectrum);
+    av_freep(&tctx->prev_frame);
+    av_freep(&tctx->tmp_buf);
+    av_freep(&tctx->fdsp);
 
     return 0;
 }
@@ -790,7 +789,11 @@ av_cold int ff_twinvq_decode_init(AVCodecContext *avctx)
         return AVERROR_INVALIDDATA;
     }
 
-    avpriv_float_dsp_init(&tctx->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    tctx->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!tctx->fdsp) {
+        ff_twinvq_decode_close(avctx);
+        return AVERROR(ENOMEM);
+    }
     if ((ret = init_mdct_win(tctx))) {
         av_log(avctx, AV_LOG_ERROR, "Error initializing MDCT\n");
         ff_twinvq_decode_close(avctx);
diff --git a/libavcodec/twinvq.h b/libavcodec/twinvq.h
index e810565..24e5ebc 100644
--- a/libavcodec/twinvq.h
+++ b/libavcodec/twinvq.h
@@ -2,20 +2,20 @@
  * TwinVQ decoder
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -136,7 +136,7 @@ typedef struct TwinVQModeTab {
 
 typedef struct TwinVQContext {
     AVCodecContext *avctx;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     FFTContext mdct_ctx[3];
 
     const TwinVQModeTab *mtab;
diff --git a/libavcodec/twinvq_data.h b/libavcodec/twinvq_data.h
index 01a54a5..375acc2 100644
--- a/libavcodec/twinvq_data.h
+++ b/libavcodec/twinvq_data.h
@@ -2,20 +2,20 @@
  * TwinVQ decoder
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/twinvqdec.c b/libavcodec/twinvqdec.c
index 56df105..c2353f5 100644
--- a/libavcodec/twinvqdec.c
+++ b/libavcodec/twinvqdec.c
@@ -2,20 +2,20 @@
  * TwinVQ decoder
  * Copyright (c) 2009 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -256,9 +256,10 @@ static int twinvq_read_bitstream(AVCodecContext *avctx, TwinVQContext *tctx,
     int channels              = tctx->avctx->channels;
     int sub;
     GetBitContext gb;
-    int i, j, k;
+    int i, j, k, ret;
 
-    init_get_bits(&gb, buf, buf_size * 8);
+    if ((ret = init_get_bits8(&gb, buf, buf_size)) < 0)
+        return ret;
     skip_bits(&gb, get_bits(&gb, 8));
 
     bits->window_type = get_bits(&gb, TWINVQ_WINDOW_TYPE_BITS);
@@ -312,7 +313,7 @@ static int twinvq_read_bitstream(AVCodecContext *avctx, TwinVQContext *tctx,
         }
     }
 
-    return 0;
+    return (get_bits_count(&gb) + 7) / 8;
 }
 
 static av_cold int twinvq_decode_init(AVCodecContext *avctx)
diff --git a/libavcodec/txd.c b/libavcodec/txd.c
index 463223c..d7fdde0 100644
--- a/libavcodec/txd.c
+++ b/libavcodec/txd.c
@@ -4,27 +4,27 @@
  *
  * See also: http://wiki.multimedia.cx/index.php?title=TXD
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/intreadwrite.h"
 #include "libavutil/imgutils.h"
-#include "avcodec.h"
 #include "bytestream.h"
+#include "avcodec.h"
 #include "internal.h"
 #include "texturedsp.h"
 
@@ -76,10 +76,8 @@ static int txd_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     avctx->coded_width  = FFALIGN(w, 4);
     avctx->coded_height = FFALIGN(h, 4);
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
 
     p->pict_type = AV_PICTURE_TYPE_I;
 
@@ -92,6 +90,8 @@ static int txd_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             v = bytestream2_get_be32(&gb);
             pal[y] = (v >> 8) + (v << 24);
         }
+        if (bytestream2_get_bytes_left(&gb) < w * h)
+            return AVERROR_INVALIDDATA;
         bytestream2_skip(&gb, 4);
         for (y=0; y<h; y++) {
             bytestream2_get_buffer(&gb, ptr, w);
@@ -104,6 +104,8 @@ static int txd_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             if (!(flags & 1))
                 goto unsupported;
         case TXD_DXT1:
+            if (bytestream2_get_bytes_left(&gb) < AV_CEIL_RSHIFT(w, 2) * AV_CEIL_RSHIFT(h, 2) * 8)
+                return AVERROR_INVALIDDATA;
             for (j = 0; j < avctx->height; j += 4) {
                 for (i = 0; i < avctx->width; i += 4) {
                     uint8_t *p = ptr + i * 4 + j * stride;
@@ -113,6 +115,8 @@ static int txd_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             }
             break;
         case TXD_DXT3:
+            if (bytestream2_get_bytes_left(&gb) < AV_CEIL_RSHIFT(w, 2) * AV_CEIL_RSHIFT(h, 2) * 16)
+                return AVERROR_INVALIDDATA;
             for (j = 0; j < avctx->height; j += 4) {
                 for (i = 0; i < avctx->width; i += 4) {
                     uint8_t *p = ptr + i * 4 + j * stride;
@@ -128,6 +132,8 @@ static int txd_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         switch (d3d_format) {
         case 0x15:
         case 0x16:
+            if (bytestream2_get_bytes_left(&gb) < h * w * 4)
+                return AVERROR_INVALIDDATA;
             for (y=0; y<h; y++) {
                 bytestream2_get_buffer(&gb, ptr, w * 4);
                 ptr += stride;
diff --git a/libavcodec/ulti.c b/libavcodec/ulti.c
index 46aa27d..e6f4374 100644
--- a/libavcodec/ulti.c
+++ b/libavcodec/ulti.c
@@ -2,20 +2,20 @@
  * IBM Ultimotion Video Decoder
  * Copyright (C) 2004 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -227,10 +227,8 @@ static int ulti_decode_frame(AVCodecContext *avctx,
     int skip;
     int tmp;
 
-    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->frame)) < 0)
         return ret;
-    }
 
     bytestream2_init(&s->gb, buf, buf_size);
 
diff --git a/libavcodec/ulti_cb.h b/libavcodec/ulti_cb.h
index 0bd83ff..7061d83 100644
--- a/libavcodec/ulti_cb.h
+++ b/libavcodec/ulti_cb.h
@@ -2,20 +2,20 @@
  * IBM Ultimotion Video Decoder
  * copyright (C) 2004 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/unary.h b/libavcodec/unary.h
index d14929f..908dc93 100644
--- a/libavcodec/unary.h
+++ b/libavcodec/unary.h
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/utils-test.c b/libavcodec/utils-test.c
new file mode 100644
index 0000000..7b3b718
--- /dev/null
+++ b/libavcodec/utils-test.c
@@ -0,0 +1,37 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+
+/* Self-test: every registered audio encoder must declare its sample formats.
+ * Returns 0 when all do, 1 if any encoder lacks sample_fmts (each is logged). */
+int main(void){
+    AVCodec *codec = NULL;
+    int ret = 0;
+    avcodec_register_all();
+
+    /* extra parentheses mark the assignment as intentional (-Wparentheses) */
+    while ((codec = av_codec_next(codec))) {
+        if (av_codec_is_encoder(codec) && codec->type == AVMEDIA_TYPE_AUDIO &&
+            !codec->sample_fmts) {
+            av_log(NULL, AV_LOG_FATAL, "Encoder %s is missing the sample_fmts field\n", codec->name);
+            ret = 1;
+        }
+    }
+    return ret;
+}
diff --git a/libavcodec/utils.c b/libavcodec/utils.c
index d42885d..54a3e87 100644
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,52 +26,124 @@
  */
 
 #include "config.h"
+#include "libavutil/atomic.h"
 #include "libavutil/attributes.h"
 #include "libavutil/avassert.h"
 #include "libavutil/avstring.h"
+#include "libavutil/bprint.h"
 #include "libavutil/channel_layout.h"
 #include "libavutil/crc.h"
 #include "libavutil/frame.h"
 #include "libavutil/hwcontext.h"
 #include "libavutil/internal.h"
 #include "libavutil/mathematics.h"
+#include "libavutil/mem_internal.h"
 #include "libavutil/pixdesc.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/samplefmt.h"
 #include "libavutil/dict.h"
+#include "libavutil/thread.h"
 #include "avcodec.h"
 #include "libavutil/opt.h"
 #include "me_cmp.h"
 #include "mpegvideo.h"
 #include "thread.h"
+#include "frame_thread_encoder.h"
 #include "internal.h"
+#include "raw.h"
 #include "bytestream.h"
 #include "version.h"
 #include <stdlib.h>
 #include <stdarg.h>
 #include <limits.h>
 #include <float.h>
+#if CONFIG_ICONV
+# include <iconv.h>
+#endif
+
+#include "libavutil/ffversion.h"
+const char av_codec_ffversion[] = "FFmpeg version " FFMPEG_VERSION;
+
+#if HAVE_PTHREADS || HAVE_W32THREADS || HAVE_OS2THREADS
+static int default_lockmgr_cb(void **arg, enum AVLockOp op)
+{
+    void * volatile * mutex = arg;
+    int err;
+    /* *arg is the slot that holds the pthread mutex; op selects the action below. */
+    switch (op) {
+    case AV_LOCK_CREATE:
+        return 0; /* nothing to do: the mutex is created on the first AV_LOCK_OBTAIN */
+    case AV_LOCK_OBTAIN:
+        if (!*mutex) {
+            pthread_mutex_t *tmp = av_malloc(sizeof(pthread_mutex_t));
+            if (!tmp)
+                return AVERROR(ENOMEM);
+            if ((err = pthread_mutex_init(tmp, NULL))) {
+                av_free(tmp);
+                return AVERROR(err);
+            }
+            if (avpriv_atomic_ptr_cas(mutex, NULL, tmp)) { /* NOTE(review): presumably non-NULL means the slot was already taken -- confirm */
+                pthread_mutex_destroy(tmp); /* in that case drop our candidate */
+                av_free(tmp);
+            }
+        }
+
+        if ((err = pthread_mutex_lock(*mutex)))
+            return AVERROR(err);
+
+        return 0;
+    case AV_LOCK_RELEASE:
+        if ((err = pthread_mutex_unlock(*mutex))) /* no NULL check: *mutex is used as is */
+            return AVERROR(err);
+
+        return 0;
+    case AV_LOCK_DESTROY:
+        if (*mutex)
+            pthread_mutex_destroy(*mutex);
+        av_free(*mutex);
+        avpriv_atomic_ptr_cas(mutex, *mutex, NULL); /* presumably resets the slot to NULL -- confirm CAS semantics */
+        return 0;
+    }
+    return 1; /* op matched none of the four cases above */
+}
+static int (*lockmgr_cb)(void **mutex, enum AVLockOp op) = default_lockmgr_cb;
+#else
+static int (*lockmgr_cb)(void **mutex, enum AVLockOp op) = NULL;
+#endif
+
 
+volatile int ff_avcodec_locked;
 static int volatile entangled_thread_counter = 0;
-static int (*lockmgr_cb)(void **mutex, enum AVLockOp op);
 static void *codec_mutex;
 static void *avformat_mutex;
 
 void av_fast_padded_malloc(void *ptr, unsigned int *size, size_t min_size)
 {
-    void **p = ptr;
+    uint8_t **p = ptr;
+    if (min_size > SIZE_MAX - AV_INPUT_BUFFER_PADDING_SIZE) {
+        av_freep(p);
+        *size = 0;
+        return;
+    }
+    if (!ff_fast_malloc(p, size, min_size + AV_INPUT_BUFFER_PADDING_SIZE, 1))
+        memset(*p + min_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
+}
+
+void av_fast_padded_mallocz(void *ptr, unsigned int *size, size_t min_size)
+{
+    uint8_t **p = ptr;
     if (min_size > SIZE_MAX - AV_INPUT_BUFFER_PADDING_SIZE) {
         av_freep(p);
         *size = 0;
         return;
     }
-    av_fast_malloc(p, size, min_size + AV_INPUT_BUFFER_PADDING_SIZE);
-    if (*size)
-        memset((uint8_t *)*p + min_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
+    if (!ff_fast_malloc(p, size, min_size + AV_INPUT_BUFFER_PADDING_SIZE, 1))
+        memset(*p, 0, min_size + AV_INPUT_BUFFER_PADDING_SIZE);
 }
 
 /* encoder management */
 static AVCodec *first_avcodec = NULL;
+static AVCodec **last_avcodec = &first_avcodec;
 
 AVCodec *av_codec_next(const AVCodec *c)
 {
@@ -107,12 +179,13 @@ av_cold void avcodec_register(AVCodec *codec)
 {
     AVCodec **p;
     avcodec_init();
-    p = &first_avcodec;
-    while (*p)
-        p = &(*p)->next;
-    *p          = codec;
+    p = last_avcodec;
     codec->next = NULL;
 
+    while(*p || avpriv_atomic_ptr_cas((void * volatile *)p, NULL, codec))
+        p = &(*p)->next;
+    last_avcodec = &codec->next;
+
     if (codec->init_static_data)
         codec->init_static_data(codec);
 }
@@ -127,7 +200,10 @@ unsigned avcodec_get_edge_width(void)
 #if FF_API_SET_DIMENSIONS
 void avcodec_set_dimensions(AVCodecContext *s, int width, int height)
 {
-    ff_set_dimensions(s, width, height);
+    int ret = ff_set_dimensions(s, width, height);
+    if (ret < 0) {
+        av_log(s, AV_LOG_WARNING, "Failed to set dimensions %d %d\n", width, height);
+    }
 }
 #endif
 
@@ -137,8 +213,11 @@ int ff_set_dimensions(AVCodecContext *s, int width, int height)
 
     if (ret < 0)
         width = height = 0;
-    s->width  = s->coded_width  = width;
-    s->height = s->coded_height = height;
+
+    s->coded_width  = width;
+    s->coded_height = height;
+    s->width        = AV_CEIL_RSHIFT(width,  s->lowres);
+    s->height       = AV_CEIL_RSHIFT(height, s->lowres);
 
     return ret;
 }
@@ -178,18 +257,18 @@ int ff_side_data_update_matrix_encoding(AVFrame *frame,
     return 0;
 }
 
-#if HAVE_SIMD_ALIGN_16
-#   define STRIDE_ALIGN 16
-#else
-#   define STRIDE_ALIGN 8
-#endif
-
 void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height,
                                int linesize_align[AV_NUM_DATA_POINTERS])
 {
     int i;
     int w_align = 1;
     int h_align = 1;
+    AVPixFmtDescriptor const *desc = av_pix_fmt_desc_get(s->pix_fmt);
+
+    if (desc) {
+        w_align = 1 << desc->log2_chroma_w;
+        h_align = 1 << desc->log2_chroma_h;
+    }
 
     switch (s->pix_fmt) {
     case AV_PIX_FMT_YUV420P:
@@ -215,47 +294,101 @@ void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height,
     case AV_PIX_FMT_YUV420P9BE:
     case AV_PIX_FMT_YUV420P10LE:
     case AV_PIX_FMT_YUV420P10BE:
+    case AV_PIX_FMT_YUV420P12LE:
+    case AV_PIX_FMT_YUV420P12BE:
+    case AV_PIX_FMT_YUV420P14LE:
+    case AV_PIX_FMT_YUV420P14BE:
+    case AV_PIX_FMT_YUV420P16LE:
+    case AV_PIX_FMT_YUV420P16BE:
+    case AV_PIX_FMT_YUVA420P9LE:
+    case AV_PIX_FMT_YUVA420P9BE:
+    case AV_PIX_FMT_YUVA420P10LE:
+    case AV_PIX_FMT_YUVA420P10BE:
+    case AV_PIX_FMT_YUVA420P16LE:
+    case AV_PIX_FMT_YUVA420P16BE:
     case AV_PIX_FMT_YUV422P9LE:
     case AV_PIX_FMT_YUV422P9BE:
     case AV_PIX_FMT_YUV422P10LE:
     case AV_PIX_FMT_YUV422P10BE:
+    case AV_PIX_FMT_YUV422P12LE:
+    case AV_PIX_FMT_YUV422P12BE:
+    case AV_PIX_FMT_YUV422P14LE:
+    case AV_PIX_FMT_YUV422P14BE:
+    case AV_PIX_FMT_YUV422P16LE:
+    case AV_PIX_FMT_YUV422P16BE:
+    case AV_PIX_FMT_YUVA422P9LE:
+    case AV_PIX_FMT_YUVA422P9BE:
     case AV_PIX_FMT_YUVA422P10LE:
     case AV_PIX_FMT_YUVA422P10BE:
+    case AV_PIX_FMT_YUVA422P16LE:
+    case AV_PIX_FMT_YUVA422P16BE:
+    case AV_PIX_FMT_YUV440P10LE:
+    case AV_PIX_FMT_YUV440P10BE:
+    case AV_PIX_FMT_YUV440P12LE:
+    case AV_PIX_FMT_YUV440P12BE:
     case AV_PIX_FMT_YUV444P9LE:
     case AV_PIX_FMT_YUV444P9BE:
     case AV_PIX_FMT_YUV444P10LE:
     case AV_PIX_FMT_YUV444P10BE:
+    case AV_PIX_FMT_YUV444P12LE:
+    case AV_PIX_FMT_YUV444P12BE:
+    case AV_PIX_FMT_YUV444P14LE:
+    case AV_PIX_FMT_YUV444P14BE:
+    case AV_PIX_FMT_YUV444P16LE:
+    case AV_PIX_FMT_YUV444P16BE:
+    case AV_PIX_FMT_YUVA444P9LE:
+    case AV_PIX_FMT_YUVA444P9BE:
     case AV_PIX_FMT_YUVA444P10LE:
     case AV_PIX_FMT_YUVA444P10BE:
+    case AV_PIX_FMT_YUVA444P16LE:
+    case AV_PIX_FMT_YUVA444P16BE:
     case AV_PIX_FMT_GBRP9LE:
     case AV_PIX_FMT_GBRP9BE:
     case AV_PIX_FMT_GBRP10LE:
     case AV_PIX_FMT_GBRP10BE:
+    case AV_PIX_FMT_GBRP12LE:
+    case AV_PIX_FMT_GBRP12BE:
+    case AV_PIX_FMT_GBRP14LE:
+    case AV_PIX_FMT_GBRP14BE:
+    case AV_PIX_FMT_GBRP16LE:
+    case AV_PIX_FMT_GBRP16BE:
+    case AV_PIX_FMT_GBRAP12LE:
+    case AV_PIX_FMT_GBRAP12BE:
+    case AV_PIX_FMT_GBRAP16LE:
+    case AV_PIX_FMT_GBRAP16BE:
         w_align = 16; //FIXME assume 16 pixel per macroblock
         h_align = 16 * 2; // interlaced needs 2 macroblocks height
         break;
     case AV_PIX_FMT_YUV411P:
+    case AV_PIX_FMT_YUVJ411P:
     case AV_PIX_FMT_UYYVYY411:
         w_align = 32;
-        h_align = 8;
+        h_align = 16 * 2;
         break;
     case AV_PIX_FMT_YUV410P:
         if (s->codec_id == AV_CODEC_ID_SVQ1) {
             w_align = 64;
             h_align = 64;
         }
+        break;
     case AV_PIX_FMT_RGB555:
         if (s->codec_id == AV_CODEC_ID_RPZA) {
             w_align = 4;
             h_align = 4;
         }
+        break;
     case AV_PIX_FMT_PAL8:
     case AV_PIX_FMT_BGR8:
     case AV_PIX_FMT_RGB8:
-        if (s->codec_id == AV_CODEC_ID_SMC) {
+        if (s->codec_id == AV_CODEC_ID_SMC ||
+            s->codec_id == AV_CODEC_ID_CINEPAK) {
             w_align = 4;
             h_align = 4;
         }
+        if (s->codec_id == AV_CODEC_ID_JV) {
+            w_align = 8;
+            h_align = 8;
+        }
         break;
     case AV_PIX_FMT_BGR24:
         if ((s->codec_id == AV_CODEC_ID_MSZH) ||
@@ -264,18 +397,34 @@ void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height,
             h_align = 4;
         }
         break;
+    case AV_PIX_FMT_RGB24:
+        if (s->codec_id == AV_CODEC_ID_CINEPAK) {
+            w_align = 4;
+            h_align = 4;
+        }
+        break;
     default:
-        w_align = 1;
-        h_align = 1;
         break;
     }
 
+    if (s->codec_id == AV_CODEC_ID_IFF_ILBM) {
+        w_align = FFMAX(w_align, 8);
+    }
+
     *width  = FFALIGN(*width, w_align);
     *height = FFALIGN(*height, h_align);
-    if (s->codec_id == AV_CODEC_ID_H264)
+    if (s->codec_id == AV_CODEC_ID_H264 || s->lowres) {
         // some of the optimized chroma MC reads one line too much
+        // which is also done in mpeg decoders with lowres > 0
         *height += 2;
 
+        // H.264 uses edge emulation for out-of-frame motion vectors; for this
+        // it requires a temporary area large enough to hold a 21x21 block.
+        // Increasing the width ensures that the temporary area is large enough;
+        // the next rounded-up width is 32.
+        *width = FFMAX(*width, 32);
+    }
+
     for (i = 0; i < 4; i++)
         linesize_align[i] = STRIDE_ALIGN;
 }
@@ -295,6 +444,29 @@ void avcodec_align_dimensions(AVCodecContext *s, int *width, int *height)
     *width              = FFALIGN(*width, align);
 }
 
+int avcodec_enum_to_chroma_pos(int *xpos, int *ypos, enum AVChromaLocation pos)
+{
+    if (pos <= AVCHROMA_LOC_UNSPECIFIED || pos >= AVCHROMA_LOC_NB)
+        return AVERROR(EINVAL); /* unspecified or out-of-range location: outputs untouched */
+    pos--; /* shift the accepted range down by one */
+    /* Both outputs are small integers scaled by 128; returns 0 on success. */
+    *xpos = (pos&1) * 128; /* bit 0 of pos selects 0 or 128 */
+    *ypos = ((pos>>1)^(pos<4)) * 128; /* pos>>1 with its low bit flipped while pos < 4 */
+
+    return 0;
+}
+
+enum AVChromaLocation avcodec_chroma_pos_to_enum(int xpos, int ypos)
+{
+    int loc = AVCHROMA_LOC_UNSPECIFIED, x, y;
+
+    /* try each location after UNSPECIFIED and return the first whose position matches */
+    while (++loc < AVCHROMA_LOC_NB)
+        if (!avcodec_enum_to_chroma_pos(&x, &y, loc) && x == xpos && y == ypos)
+            return loc;
+    return AVCHROMA_LOC_UNSPECIFIED;
+}
+
 int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
                              enum AVSampleFormat sample_fmt, const uint8_t *buf,
                              int buf_size, int align)
@@ -309,7 +481,7 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
 
     planar = av_sample_fmt_is_planar(sample_fmt);
     if (planar && nb_channels > AV_NUM_DATA_POINTERS) {
-        if (!(frame->extended_data = av_mallocz(nb_channels *
+        if (!(frame->extended_data = av_mallocz_array(nb_channels,
                                                 sizeof(*frame->extended_data))))
             return AVERROR(ENOMEM);
     } else {
@@ -317,10 +489,10 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
     }
 
     if ((ret = av_samples_fill_arrays(frame->extended_data, &frame->linesize[0],
-                                      buf, nb_channels, frame->nb_samples,
+                                      (uint8_t *)(intptr_t)buf, nb_channels, frame->nb_samples,
                                       sample_fmt, align)) < 0) {
         if (frame->extended_data != frame->data)
-            av_free(frame->extended_data);
+            av_freep(&frame->extended_data);
         return ret;
     }
     if (frame->extended_data != frame->data) {
@@ -354,7 +526,9 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
         do {
             // NOTE: do not align linesizes individually, this breaks e.g. assumptions
             // that linesize[0] == 2*linesize[1] in the MPEG-encoder for 4:2:2
-            av_image_fill_linesizes(linesize, avctx->pix_fmt, w);
+            ret = av_image_fill_linesizes(linesize, avctx->pix_fmt, w);
+            if (ret < 0)
+                return ret;
             // increase alignment of w for next try (rhs gives the lowest bit set in w)
             w += w & ~(w - 1);
 
@@ -376,7 +550,10 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
             av_buffer_pool_uninit(&pool->pools[i]);
             pool->linesize[i] = linesize[i];
             if (size[i]) {
-                pool->pools[i] = av_buffer_pool_init(size[i] + 16, NULL);
+                pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
+                                                     CONFIG_MEMORY_POISONING ?
+                                                        NULL :
+                                                        av_buffer_allocz);
                 if (!pool->pools[i]) {
                     ret = AVERROR(ENOMEM);
                     goto fail;
@@ -390,7 +567,7 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
         break;
         }
     case AVMEDIA_TYPE_AUDIO: {
-        int ch     = av_get_channel_layout_nb_channels(frame->channel_layout);
+        int ch     = av_frame_get_channels(frame); //av_get_channel_layout_nb_channels(frame->channel_layout);
         int planar = av_sample_fmt_is_planar(frame->format);
         int planes = planar ? ch : 1;
 
@@ -437,17 +614,19 @@ static int audio_get_buffer(AVCodecContext *avctx, AVFrame *frame)
     frame->linesize[0] = pool->linesize[0];
 
     if (planes > AV_NUM_DATA_POINTERS) {
-        frame->extended_data = av_mallocz(planes * sizeof(*frame->extended_data));
+        frame->extended_data = av_mallocz_array(planes, sizeof(*frame->extended_data));
         frame->nb_extended_buf = planes - AV_NUM_DATA_POINTERS;
-        frame->extended_buf  = av_mallocz(frame->nb_extended_buf *
+        frame->extended_buf  = av_mallocz_array(frame->nb_extended_buf,
                                           sizeof(*frame->extended_buf));
         if (!frame->extended_data || !frame->extended_buf) {
             av_freep(&frame->extended_data);
             av_freep(&frame->extended_buf);
             return AVERROR(ENOMEM);
         }
-    } else
+    } else {
         frame->extended_data = frame->data;
+        av_assert0(frame->nb_extended_buf == 0);
+    }
 
     for (i = 0; i < FFMIN(planes, AV_NUM_DATA_POINTERS); i++) {
         frame->buf[i] = av_buffer_pool_get(pool->pools[0]);
@@ -474,13 +653,21 @@ fail:
 static int video_get_buffer(AVCodecContext *s, AVFrame *pic)
 {
     FramePool *pool = s->internal->pool;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pic->format);
     int i;
 
-    if (pic->data[0]) {
-        av_log(s, AV_LOG_ERROR, "pic->data[0]!=NULL in avcodec_default_get_buffer\n");
+    if (pic->data[0] || pic->data[1] || pic->data[2] || pic->data[3]) {
+        av_log(s, AV_LOG_ERROR, "pic->data[*]!=NULL in avcodec_default_get_buffer\n");
         return -1;
     }
 
+    if (!desc) {
+        av_log(s, AV_LOG_ERROR,
+            "Unable to get pixel format descriptor for format %s\n",
+            av_get_pix_fmt_name(pic->format));
+        return AVERROR(EINVAL);
+    }
+
     memset(pic->data, 0, sizeof(pic->data));
     pic->extended_data = pic->data;
 
@@ -497,8 +684,9 @@ static int video_get_buffer(AVCodecContext *s, AVFrame *pic)
         pic->data[i] = NULL;
         pic->linesize[i] = 0;
     }
-    if (pic->data[1] && !pic->data[2])
-        avpriv_set_systematic_pal2((uint32_t *)pic->data[1], s->pix_fmt);
+    if (desc->flags & AV_PIX_FMT_FLAG_PAL ||
+        desc->flags & AV_PIX_FMT_FLAG_PSEUDOPAL)
+        avpriv_set_systematic_pal2((uint32_t *)pic->data[1], pic->format);
 
     if (s->debug & FF_DEBUG_BUFFERS)
         av_log(s, AV_LOG_DEBUG, "default_get_buffer called on pic %p\n", pic);
@@ -509,6 +697,29 @@ fail:
     return AVERROR(ENOMEM);
 }
 
+void ff_color_frame(AVFrame *frame, const int c[4])
+{
+    /* Fill every plane p of a planar frame with the constant c[p]. */
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
+    int plane, row, col;
+
+    av_assert0(desc->flags & AV_PIX_FMT_FLAG_PLANAR);
+    for (plane = 0; plane < desc->nb_components; plane++) {
+        const int chroma = plane == 1 || plane == 2; /* planes 1 and 2 use the subsampled size */
+        const int count  = chroma ? AV_CEIL_RSHIFT(frame->width,  desc->log2_chroma_w) : frame->width;
+        const int rows   = chroma ? AV_CEIL_RSHIFT(frame->height, desc->log2_chroma_h) : frame->height;
+        uint8_t *line    = frame->data[plane];
+        for (row = 0; row < rows; row++, line += frame->linesize[plane]) {
+            if (desc->comp[0].depth < 9) {
+                memset(line, c[plane], count);
+                continue;
+            }
+            for (col = 0; col < count; col++) /* depth >= 9: one 16-bit word per sample */
+                ((uint16_t *)line)[col] = c[plane];
+        }
+    }
+}
+
 int avcodec_default_get_buffer2(AVCodecContext *avctx, AVFrame *frame, int flags)
 {
     int ret;
@@ -526,70 +737,80 @@ int avcodec_default_get_buffer2(AVCodecContext *avctx, AVFrame *frame, int flags
     }
 }
 
-int ff_decode_frame_props(AVCodecContext *avctx, AVFrame *frame)
+static int add_metadata_from_side_data(AVPacket *avpkt, AVFrame *frame)
+{
+    /* Unpack the packet's AV_PKT_DATA_STRINGS_METADATA side data into the
+     * dictionary returned by avpriv_frame_get_metadatap(frame). */
+    int md_size;
+    AVDictionary **dict = avpriv_frame_get_metadatap(frame);
+    const uint8_t *packed = av_packet_get_side_data(avpkt,
+                                                    AV_PKT_DATA_STRINGS_METADATA,
+                                                    &md_size);
+    return av_packet_unpack_dictionary(packed, md_size, dict);
+}
+
+int ff_init_buffer_info(AVCodecContext *avctx, AVFrame *frame)
 {
     AVPacket *pkt = avctx->internal->pkt;
     int i;
-    struct {
+    static const struct {
         enum AVPacketSideDataType packet;
         enum AVFrameSideDataType frame;
     } sd[] = {
-        { AV_PKT_DATA_REPLAYGAIN ,   AV_FRAME_DATA_REPLAYGAIN },
-        { AV_PKT_DATA_DISPLAYMATRIX, AV_FRAME_DATA_DISPLAYMATRIX },
-        { AV_PKT_DATA_STEREO3D,      AV_FRAME_DATA_STEREO3D },
-        { AV_PKT_DATA_AUDIO_SERVICE_TYPE, AV_FRAME_DATA_AUDIO_SERVICE_TYPE },
+        { AV_PKT_DATA_REPLAYGAIN ,                AV_FRAME_DATA_REPLAYGAIN },
+        { AV_PKT_DATA_DISPLAYMATRIX,              AV_FRAME_DATA_DISPLAYMATRIX },
+        { AV_PKT_DATA_STEREO3D,                   AV_FRAME_DATA_STEREO3D },
+        { AV_PKT_DATA_AUDIO_SERVICE_TYPE,         AV_FRAME_DATA_AUDIO_SERVICE_TYPE },
+        { AV_PKT_DATA_MASTERING_DISPLAY_METADATA, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA },
     };
 
-    frame->color_primaries = avctx->color_primaries;
-    frame->color_trc       = avctx->color_trc;
-    frame->colorspace      = avctx->colorspace;
-    frame->color_range     = avctx->color_range;
-    frame->chroma_location = avctx->chroma_sample_location;
-
-    frame->reordered_opaque = avctx->reordered_opaque;
-    if (!pkt) {
-        frame->pkt_pts = AV_NOPTS_VALUE;
-        return 0;
-    }
-
-    frame->pkt_pts = pkt->pts;
-
-    for (i = 0; i < FF_ARRAY_ELEMS(sd); i++) {
-        int size;
-        uint8_t *packet_sd = av_packet_get_side_data(pkt, sd[i].packet, &size);
-        if (packet_sd) {
-            AVFrameSideData *frame_sd = av_frame_new_side_data(frame,
-                                                               sd[i].frame,
-                                                               size);
-            if (!frame_sd)
-                return AVERROR(ENOMEM);
-
-            memcpy(frame_sd->data, packet_sd, size);
+    if (pkt) {
+        frame->pkt_pts = pkt->pts;
+        av_frame_set_pkt_pos     (frame, pkt->pos);
+        av_frame_set_pkt_duration(frame, pkt->duration);
+        av_frame_set_pkt_size    (frame, pkt->size);
+
+        for (i = 0; i < FF_ARRAY_ELEMS(sd); i++) {
+            int size;
+            uint8_t *packet_sd = av_packet_get_side_data(pkt, sd[i].packet, &size);
+            if (packet_sd) {
+                AVFrameSideData *frame_sd = av_frame_new_side_data(frame,
+                                                                   sd[i].frame,
+                                                                   size);
+                if (!frame_sd)
+                    return AVERROR(ENOMEM);
+
+                memcpy(frame_sd->data, packet_sd, size);
+            }
         }
+        add_metadata_from_side_data(pkt, frame);
+    } else {
+        frame->pkt_pts = AV_NOPTS_VALUE;
+        av_frame_set_pkt_pos     (frame, -1);
+        av_frame_set_pkt_duration(frame, 0);
+        av_frame_set_pkt_size    (frame, -1);
     }
+    frame->reordered_opaque = avctx->reordered_opaque;
 
-    return 0;
-}
-
-int ff_get_buffer(AVCodecContext *avctx, AVFrame *frame, int flags)
-{
-    const AVHWAccel *hwaccel = avctx->hwaccel;
-    int override_dimensions = 1;
-    int ret;
-
-    switch (avctx->codec_type) {
+    if (frame->color_primaries == AVCOL_PRI_UNSPECIFIED)
+        frame->color_primaries = avctx->color_primaries;
+    if (frame->color_trc == AVCOL_TRC_UNSPECIFIED)
+        frame->color_trc = avctx->color_trc;
+    if (av_frame_get_colorspace(frame) == AVCOL_SPC_UNSPECIFIED)
+        av_frame_set_colorspace(frame, avctx->colorspace);
+    if (av_frame_get_color_range(frame) == AVCOL_RANGE_UNSPECIFIED)
+        av_frame_set_color_range(frame, avctx->color_range);
+    if (frame->chroma_location == AVCHROMA_LOC_UNSPECIFIED)
+        frame->chroma_location = avctx->chroma_sample_location;
+
+    switch (avctx->codec->type) {
     case AVMEDIA_TYPE_VIDEO:
-        if (frame->width <= 0 || frame->height <= 0) {
-            frame->width  = FFMAX(avctx->width, avctx->coded_width);
-            frame->height = FFMAX(avctx->height, avctx->coded_height);
-            override_dimensions = 0;
-        }
-        if (frame->format < 0)
-            frame->format              = avctx->pix_fmt;
+        frame->format              = avctx->pix_fmt;
         if (!frame->sample_aspect_ratio.num)
             frame->sample_aspect_ratio = avctx->sample_aspect_ratio;
 
-        if (av_image_check_sar(frame->width, frame->height,
+        if (frame->width && frame->height &&
+            av_image_check_sar(frame->width, frame->height,
                                frame->sample_aspect_ratio) < 0) {
             av_log(avctx, AV_LOG_WARNING, "ignoring invalid SAR: %u/%u\n",
                    frame->sample_aspect_ratio.num,
@@ -597,8 +818,6 @@ int ff_get_buffer(AVCodecContext *avctx, AVFrame *frame, int flags)
             frame->sample_aspect_ratio = (AVRational){ 0, 1 };
         }
 
-        if ((ret = av_image_check_size(avctx->width, avctx->height, 0, avctx)) < 0)
-            return ret;
         break;
     case AVMEDIA_TYPE_AUDIO:
         if (!frame->sample_rate)
@@ -621,16 +840,66 @@ int ff_get_buffer(AVCodecContext *avctx, AVFrame *frame, int flags)
                            avctx->channels);
                     return AVERROR(ENOSYS);
                 }
-
-                frame->channel_layout = av_get_default_channel_layout(avctx->channels);
-                if (!frame->channel_layout)
-                    frame->channel_layout = (1ULL << avctx->channels) - 1;
             }
         }
+        av_frame_set_channels(frame, avctx->channels);
         break;
-    default: return AVERROR(EINVAL);
     }
+    return 0;
+}
+
+int ff_decode_frame_props(AVCodecContext *avctx, AVFrame *frame)
+{
+    return ff_init_buffer_info(avctx, frame); /* plain forwarder: result is passed through unchanged */
+}
+
+static void validate_avframe_allocation(AVCodecContext *avctx, AVFrame *frame)
+{
+    if (avctx->codec_type == AVMEDIA_TYPE_VIDEO) { /* only video frames are checked */
+        int i;
+        int num_planes = av_pix_fmt_count_planes(frame->format);
+        const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
+        int flags = desc ? desc->flags : 0; /* no descriptor: treat as having no flags */
+        if (num_planes == 1 && (flags & AV_PIX_FMT_FLAG_PAL))
+            num_planes = 2; /* palette formats must also provide data[1] */
+        for (i = 0; i < num_planes; i++) {
+            av_assert0(frame->data[i]); /* every expected plane pointer must be set */
+        }
+        // For now do not enforce anything for palette of pseudopal formats
+        if (num_planes == 1 && (flags & AV_PIX_FMT_FLAG_PSEUDOPAL))
+            num_planes = 2; /* keeps data[1] out of the clean-up loop below */
+        // For formats without data like hwaccel allow unused pointers to be non-NULL.
+        for (i = num_planes; num_planes > 0 && i < FF_ARRAY_ELEMS(frame->data); i++) {
+            if (frame->data[i])
+                av_log(avctx, AV_LOG_ERROR, "Buffer returned by get_buffer2() did not zero unused plane pointers\n");
+            frame->data[i] = NULL; /* reset the unused pointer whether or not it was set */
+        }
+    }
+}
+
+static int get_buffer_internal(AVCodecContext *avctx, AVFrame *frame, int flags)
+{
+    const AVHWAccel *hwaccel = avctx->hwaccel;
+    int override_dimensions = 1;
+    int ret;
+
+    if (avctx->codec_type == AVMEDIA_TYPE_VIDEO) {
+        if ((ret = av_image_check_size(avctx->width, avctx->height, 0, avctx)) < 0 || avctx->pix_fmt<0) {
+            av_log(avctx, AV_LOG_ERROR, "video_get_buffer: image parameters invalid\n");
+            return AVERROR(EINVAL);
+        }
+
+        if (frame->width <= 0 || frame->height <= 0) {
+            frame->width  = FFMAX(avctx->width,  AV_CEIL_RSHIFT(avctx->coded_width,  avctx->lowres));
+            frame->height = FFMAX(avctx->height, AV_CEIL_RSHIFT(avctx->coded_height, avctx->lowres));
+            override_dimensions = 0;
+        }
 
+        if (frame->data[0] || frame->data[1] || frame->data[2] || frame->data[3]) {
+            av_log(avctx, AV_LOG_ERROR, "pic->data[*]!=NULL in get_buffer_internal\n");
+            return AVERROR(EINVAL);
+        }
+    }
     ret = ff_decode_frame_props(avctx, frame);
     if (ret < 0)
         return ret;
@@ -644,6 +913,8 @@ int ff_get_buffer(AVCodecContext *avctx, AVFrame *frame, int flags)
         avctx->sw_pix_fmt = avctx->pix_fmt;
 
     ret = avctx->get_buffer2(avctx, frame, flags);
+    if (ret >= 0)
+        validate_avframe_allocation(avctx, frame);
 
 end:
     if (avctx->codec_type == AVMEDIA_TYPE_VIDEO && !override_dimensions) {
@@ -654,13 +925,31 @@ end:
     return ret;
 }
 
-int ff_reget_buffer(AVCodecContext *avctx, AVFrame *frame)
+int ff_get_buffer(AVCodecContext *avctx, AVFrame *frame, int flags)
+{
+    const int status = get_buffer_internal(avctx, frame, flags); /* does the actual allocation */
+    if (status >= 0)
+        return status; /* success: frame is returned as get_buffer_internal() left it */
+    av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    frame->width = frame->height = 0; /* on failure the frame dimensions are cleared */
+    return status;
+}
+
+static int reget_buffer_internal(AVCodecContext *avctx, AVFrame *frame)
 {
     AVFrame *tmp;
     int ret;
 
     av_assert0(avctx->codec_type == AVMEDIA_TYPE_VIDEO);
 
+    if (frame->data[0] && (frame->width != avctx->width || frame->height != avctx->height || frame->format != avctx->pix_fmt)) {
+        av_log(avctx, AV_LOG_WARNING, "Picture changed from size:%dx%d fmt:%s to size:%dx%d fmt:%s in reget buffer()\n",
+               frame->width, frame->height, av_get_pix_fmt_name(frame->format), avctx->width, avctx->height, av_get_pix_fmt_name(avctx->pix_fmt));
+        av_frame_unref(frame);
+    }
+
+    ff_init_buffer_info(avctx, frame);
+
     if (!frame->data[0])
         return ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF);
 
@@ -685,6 +974,14 @@ int ff_reget_buffer(AVCodecContext *avctx, AVFrame *frame)
     return 0;
 }
 
+int ff_reget_buffer(AVCodecContext *avctx, AVFrame *frame)
+{
+    const int status = reget_buffer_internal(avctx, frame); /* does the actual work */
+    if (status < 0) /* only failures are logged; the code itself is passed through */
+        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    return status;
+}
+
 int avcodec_default_execute(AVCodecContext *c, int (*func)(AVCodecContext *c2, void *arg2), void *arg, int *ret, int count, int size)
 {
     int i;
@@ -709,6 +1006,17 @@ int avcodec_default_execute2(AVCodecContext *c, int (*func)(AVCodecContext *c2,
     return 0;
 }
 
+enum AVPixelFormat avpriv_find_pix_fmt(const PixelFormatTag *tags,
+                                       unsigned int fourcc)
+{
+    const PixelFormatTag *entry; /* cursor over the caller's table */
+
+    for (entry = tags; entry->pix_fmt >= 0; entry++) /* a negative pix_fmt ends the table */
+        if (entry->fourcc == fourcc)
+            return entry->pix_fmt; /* first entry with a matching fourcc wins */
+    return AV_PIX_FMT_NONE; /* no entry carries this fourcc */
+}
+
 static int is_hwaccel_pix_fmt(enum AVPixelFormat pix_fmt)
 {
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
@@ -741,6 +1049,11 @@ static int setup_hwaccel(AVCodecContext *avctx,
     AVHWAccel *hwa = find_hwaccel(avctx->codec_id, fmt);
     int ret        = 0;
 
+    if (avctx->active_thread_type & FF_THREAD_FRAME) {
+        av_log(avctx, AV_LOG_WARNING,
+               "Hardware accelerated decoding with frame threading is known to be unstable and its use is discouraged.\n");
+    }
+
     if (!hwa) {
         av_log(avctx, AV_LOG_ERROR,
                "Could not find an AVHWAccel for the pixel format: %s",
@@ -748,6 +1061,13 @@ static int setup_hwaccel(AVCodecContext *avctx,
         return AVERROR(ENOENT);
     }
 
+    if (hwa->capabilities & HWACCEL_CODEC_CAP_EXPERIMENTAL &&
+        avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL) {
+        av_log(avctx, AV_LOG_WARNING, "Ignoring experimental hwaccel: %s\n",
+               hwa->name);
+        return AVERROR_PATCHWELCOME;
+    }
+
     if (hwa->priv_data_size) {
         avctx->internal->hwaccel_priv_data = av_mallocz(hwa->priv_data_size);
         if (!avctx->internal->hwaccel_priv_data)
@@ -803,6 +1123,10 @@ int ff_get_format(AVCodecContext *avctx, const enum AVPixelFormat *fmt)
 
         if (!(desc->flags & AV_PIX_FMT_FLAG_HWACCEL))
             break;
+#if FF_API_CAP_VDPAU
+        if (avctx->codec->capabilities&AV_CODEC_CAP_HWACCEL_VDPAU)
+            break;
+#endif
 
         if (!setup_hwaccel(avctx, ret, desc->name))
             break;
@@ -820,21 +1144,83 @@ int ff_get_format(AVCodecContext *avctx, const enum AVPixelFormat *fmt)
     return ret;
 }
 
+MAKE_ACCESSORS(AVCodecContext, codec, AVRational, pkt_timebase)
+MAKE_ACCESSORS(AVCodecContext, codec, const AVCodecDescriptor *, codec_descriptor)
+MAKE_ACCESSORS(AVCodecContext, codec, int, lowres)
+MAKE_ACCESSORS(AVCodecContext, codec, int, seek_preroll)
+MAKE_ACCESSORS(AVCodecContext, codec, uint16_t*, chroma_intra_matrix)
+
+unsigned av_codec_get_codec_properties(const AVCodecContext *codec)
+{
+    return codec->properties; /* plain read accessor; the context is not modified */
+}
+
+int av_codec_get_max_lowres(const AVCodec *codec)
+{
+    return codec->max_lowres; /* the same field avcodec_open2() compares avctx->lowres against */
+}
+
+int avpriv_codec_get_cap_skip_frame_fill_param(const AVCodec *codec){
+    return (codec->caps_internal & FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM) != 0; /* normalise the flag test to 0 or 1 */
+}
+
+static void get_subtitle_defaults(AVSubtitle *sub)
+{
+    memset(sub, 0, sizeof(*sub)); /* start from an all-zero subtitle */
+    sub->pts = AV_NOPTS_VALUE; /* pts is the one field not left at 0: it gets the AV_NOPTS_VALUE marker */
+}
+
+static int64_t get_bit_rate(AVCodecContext *ctx)
+{
+    int sample_bits; /* av_get_bits_per_sample() result; 0 falls back to ctx->bit_rate */
+
+    switch (ctx->codec_type) {
+    case AVMEDIA_TYPE_AUDIO:
+        sample_bits = av_get_bits_per_sample(ctx->codec_id);
+        if (!sample_bits)
+            return ctx->bit_rate;
+        /* the int64_t cast makes the whole product evaluate in 64 bits */
+        return ctx->sample_rate * (int64_t)ctx->channels * sample_bits;
+    case AVMEDIA_TYPE_VIDEO:
+    case AVMEDIA_TYPE_DATA:
+    case AVMEDIA_TYPE_SUBTITLE:
+    case AVMEDIA_TYPE_ATTACHMENT:
+        return ctx->bit_rate; /* these types simply report the configured rate */
+    default:
+        break;
+    }
+
+    return 0; /* any other codec type reports no bit rate */
+}
+
+int attribute_align_arg ff_codec_open2_recursive(AVCodecContext *avctx, const AVCodec *codec, AVDictionary **options)
+{
+    int ret = 0;
+
+    ff_unlock_avcodec(codec); /* avcodec_open2() calls ff_lock_avcodec() itself, so release before nesting */
+
+    ret = avcodec_open2(avctx, codec, options);
+
+    ff_lock_avcodec(avctx, codec); /* NOTE(review): return value ignored here, unlike in avcodec_open2() - confirm a failed re-lock is harmless */
+    return ret; /* result of the nested open, passed through unchanged */
+}
+
 int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *codec, AVDictionary **options)
 {
     int ret = 0;
     AVDictionary *tmp = NULL;
+    const AVPixFmtDescriptor *pixdesc;
 
     if (avcodec_is_open(avctx))
         return 0;
 
     if ((!codec && !avctx->codec)) {
-        av_log(avctx, AV_LOG_ERROR, "No codec provided to avcodec_open2().\n");
+        av_log(avctx, AV_LOG_ERROR, "No codec provided to avcodec_open2()\n");
         return AVERROR(EINVAL);
     }
     if ((codec && avctx->codec && codec != avctx->codec)) {
         av_log(avctx, AV_LOG_ERROR, "This AVCodecContext was allocated for %s, "
-                                    "but %s passed to avcodec_open2().\n", avctx->codec->name, codec->name);
+                                    "but %s passed to avcodec_open2()\n", avctx->codec->name, codec->name);
         return AVERROR(EINVAL);
     }
     if (!codec)
@@ -846,23 +1232,9 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
     if (options)
         av_dict_copy(&tmp, *options, 0);
 
-    /* If there is a user-supplied mutex locking routine, call it. */
-    if (!(codec->caps_internal & FF_CODEC_CAP_INIT_THREADSAFE) && codec->init) {
-        if (lockmgr_cb) {
-            if ((*lockmgr_cb)(&codec_mutex, AV_LOCK_OBTAIN))
-                return -1;
-        }
-
-        entangled_thread_counter++;
-        if (entangled_thread_counter != 1) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Insufficient thread locking. At least %d threads are "
-                   "calling avcodec_open2() at the same time right now.\n",
-                   entangled_thread_counter);
-            ret = -1;
-            goto end;
-        }
-    }
+    ret = ff_lock_avcodec(avctx, codec);
+    if (ret < 0)
+        return ret;
 
     avctx->internal = av_mallocz(sizeof(AVCodecInternal));
     if (!avctx->internal) {
@@ -914,17 +1286,27 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
     if ((ret = av_opt_set_dict(avctx, &tmp)) < 0)
         goto free_and_end;
 
-    if (avctx->coded_width && avctx->coded_height && !avctx->width && !avctx->height)
+    if (avctx->codec_whitelist && av_match_list(codec->name, avctx->codec_whitelist, ',') <= 0) {
+        av_log(avctx, AV_LOG_ERROR, "Codec (%s) not on whitelist \'%s\'\n", codec->name, avctx->codec_whitelist);
+        ret = AVERROR(EINVAL);
+        goto free_and_end;
+    }
+
+    // only call ff_set_dimensions() for non H.264/VP6F/DXV codecs so as not to overwrite previously setup dimensions
+    if (!(avctx->coded_width && avctx->coded_height && avctx->width && avctx->height &&
+          (avctx->codec_id == AV_CODEC_ID_H264 || avctx->codec_id == AV_CODEC_ID_VP6F || avctx->codec_id == AV_CODEC_ID_DXV))) {
+    if (avctx->coded_width && avctx->coded_height)
         ret = ff_set_dimensions(avctx, avctx->coded_width, avctx->coded_height);
     else if (avctx->width && avctx->height)
         ret = ff_set_dimensions(avctx, avctx->width, avctx->height);
     if (ret < 0)
         goto free_and_end;
+    }
 
     if ((avctx->coded_width || avctx->coded_height || avctx->width || avctx->height)
         && (  av_image_check_size(avctx->coded_width, avctx->coded_height, 0, avctx) < 0
            || av_image_check_size(avctx->width,       avctx->height,       0, avctx) < 0)) {
-        av_log(avctx, AV_LOG_WARNING, "ignoring invalid width/height values\n");
+        av_log(avctx, AV_LOG_WARNING, "Ignoring invalid width/height values\n");
         ff_set_dimensions(avctx, 0, 0);
     }
 
@@ -956,14 +1338,25 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
     }
     if (avctx->codec_id != codec->id || (avctx->codec_type != codec->type
                                          && avctx->codec_type != AVMEDIA_TYPE_ATTACHMENT)) {
-        av_log(avctx, AV_LOG_ERROR, "codec type or id mismatches\n");
+        av_log(avctx, AV_LOG_ERROR, "Codec type or id mismatches\n");
         ret = AVERROR(EINVAL);
         goto free_and_end;
     }
     avctx->frame_number = 0;
+    avctx->codec_descriptor = avcodec_descriptor_get(avctx->codec_id);
 
     if ((avctx->codec->capabilities & AV_CODEC_CAP_EXPERIMENTAL) &&
         avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL) {
+        const char *codec_string = av_codec_is_encoder(codec) ? "encoder" : "decoder";
+        AVCodec *codec2; /* codec the lookup by id returns; may differ from the one being opened */
+        av_log(avctx, AV_LOG_ERROR,
+               "The %s '%s' is experimental but experimental codecs are not enabled, "
+               "add '-strict %d' if you want to use it.\n",
+               codec_string, codec->name, FF_COMPLIANCE_EXPERIMENTAL);
+        codec2 = av_codec_is_encoder(codec) ? avcodec_find_encoder(codec->id) : avcodec_find_decoder(codec->id);
+        if (codec2 && !(codec2->capabilities & AV_CODEC_CAP_EXPERIMENTAL)) /* lookup may find nothing: no hint then */
+            av_log(avctx, AV_LOG_ERROR, "Alternatively use the non experimental %s '%s'.\n",
+                codec_string, codec2->name);
         ret = AVERROR_EXPERIMENTAL;
         goto free_and_end;
     }
@@ -974,7 +1367,19 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
         avctx->time_base.den = avctx->sample_rate;
     }
 
-    if (HAVE_THREADS) {
+    if (!HAVE_THREADS)
+        av_log(avctx, AV_LOG_WARNING, "Warning: not compiled with thread support, using thread emulation\n");
+
+    if (CONFIG_FRAME_THREAD_ENCODER && av_codec_is_encoder(avctx->codec)) {
+        ff_unlock_avcodec(codec); //we will instantiate a few encoders thus kick the counter to prevent false detection of a problem
+        ret = ff_frame_thread_encoder_init(avctx, options ? *options : NULL);
+        ff_lock_avcodec(avctx, codec);
+        if (ret < 0)
+            goto free_and_end;
+    }
+
+    if (HAVE_THREADS
+        && !(avctx->internal->frame_thread_encoder && (avctx->active_thread_type&FF_THREAD_FRAME))) {
         ret = ff_thread_init(avctx);
         if (ret < 0) {
             goto free_and_end;
@@ -983,6 +1388,19 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
     if (!HAVE_THREADS && !(codec->capabilities & AV_CODEC_CAP_AUTO_THREADS))
         avctx->thread_count = 1;
 
+    if (avctx->codec->max_lowres < avctx->lowres || avctx->lowres < 0) {
+        av_log(avctx, AV_LOG_ERROR, "The maximum value for lowres supported by the decoder is %d\n",
+               avctx->codec->max_lowres);
+        ret = AVERROR(EINVAL);
+        goto free_and_end;
+    }
+
+#if FF_API_VISMV
+    if (avctx->debug_mv)
+        av_log(avctx, AV_LOG_WARNING, "The 'vismv' option is deprecated, "
+               "see the codecview filter instead.\n");
+#endif
+
     if (av_codec_is_encoder(avctx->codec)) {
         int i;
 #if FF_API_CODED_FRAME
@@ -1006,7 +1424,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
                 }
             }
             if (avctx->codec->sample_fmts[i] == AV_SAMPLE_FMT_NONE) {
-                av_log(avctx, AV_LOG_ERROR, "Specified sample_fmt is not supported.\n");
+                char buf[128];
+                snprintf(buf, sizeof(buf), "%d", avctx->sample_fmt);
+                av_log(avctx, AV_LOG_ERROR, "Specified sample format %s is invalid or not supported\n",
+                       (char *)av_x_if_null(av_get_sample_fmt_name(avctx->sample_fmt), buf));
                 ret = AVERROR(EINVAL);
                 goto free_and_end;
             }
@@ -1015,12 +1436,18 @@ FF_ENABLE_DEPRECATION_WARNINGS
             for (i = 0; avctx->codec->pix_fmts[i] != AV_PIX_FMT_NONE; i++)
                 if (avctx->pix_fmt == avctx->codec->pix_fmts[i])
                     break;
-            if (avctx->codec->pix_fmts[i] == AV_PIX_FMT_NONE) {
-                av_log(avctx, AV_LOG_ERROR, "Specified pix_fmt is not supported\n");
+            if (avctx->codec->pix_fmts[i] == AV_PIX_FMT_NONE
+                && !((avctx->codec_id == AV_CODEC_ID_MJPEG || avctx->codec_id == AV_CODEC_ID_LJPEG)
+                     && avctx->strict_std_compliance <= FF_COMPLIANCE_UNOFFICIAL)) {
+                char buf[128];
+                snprintf(buf, sizeof(buf), "%d", avctx->pix_fmt);
+                av_log(avctx, AV_LOG_ERROR, "Specified pixel format %s is invalid or not supported\n",
+                       (char *)av_x_if_null(av_get_pix_fmt_name(avctx->pix_fmt), buf));
                 ret = AVERROR(EINVAL);
                 goto free_and_end;
             }
             if (avctx->codec->pix_fmts[i] == AV_PIX_FMT_YUVJ420P ||
+                avctx->codec->pix_fmts[i] == AV_PIX_FMT_YUVJ411P ||
                 avctx->codec->pix_fmts[i] == AV_PIX_FMT_YUVJ422P ||
                 avctx->codec->pix_fmts[i] == AV_PIX_FMT_YUVJ440P ||
                 avctx->codec->pix_fmts[i] == AV_PIX_FMT_YUVJ444P)
@@ -1031,39 +1458,77 @@ FF_ENABLE_DEPRECATION_WARNINGS
                 if (avctx->sample_rate == avctx->codec->supported_samplerates[i])
                     break;
             if (avctx->codec->supported_samplerates[i] == 0) {
-                av_log(avctx, AV_LOG_ERROR, "Specified sample_rate is not supported\n");
+                av_log(avctx, AV_LOG_ERROR, "Specified sample rate %d is not supported\n",
+                       avctx->sample_rate);
                 ret = AVERROR(EINVAL);
                 goto free_and_end;
             }
         }
+        if (avctx->sample_rate < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Specified sample rate %d is not supported\n",
+                    avctx->sample_rate);
+            ret = AVERROR(EINVAL);
+            goto free_and_end;
+        }
         if (avctx->codec->channel_layouts) {
             if (!avctx->channel_layout) {
-                av_log(avctx, AV_LOG_WARNING, "channel_layout not specified\n");
+                av_log(avctx, AV_LOG_WARNING, "Channel layout not specified\n");
             } else {
                 for (i = 0; avctx->codec->channel_layouts[i] != 0; i++)
                     if (avctx->channel_layout == avctx->codec->channel_layouts[i])
                         break;
                 if (avctx->codec->channel_layouts[i] == 0) {
-                    av_log(avctx, AV_LOG_ERROR, "Specified channel_layout is not supported\n");
+                    char buf[512];
+                    av_get_channel_layout_string(buf, sizeof(buf), -1, avctx->channel_layout);
+                    av_log(avctx, AV_LOG_ERROR, "Specified channel layout '%s' is not supported\n", buf);
                     ret = AVERROR(EINVAL);
                     goto free_and_end;
                 }
             }
         }
         if (avctx->channel_layout && avctx->channels) {
-            if (av_get_channel_layout_nb_channels(avctx->channel_layout) != avctx->channels) {
-                av_log(avctx, AV_LOG_ERROR, "channel layout does not match number of channels\n");
+            int channels = av_get_channel_layout_nb_channels(avctx->channel_layout);
+            if (channels != avctx->channels) {
+                char buf[512];
+                av_get_channel_layout_string(buf, sizeof(buf), -1, avctx->channel_layout);
+                av_log(avctx, AV_LOG_ERROR,
+                       "Channel layout '%s' with %d channels does not match number of specified channels %d\n",
+                       buf, channels, avctx->channels);
                 ret = AVERROR(EINVAL);
                 goto free_and_end;
             }
         } else if (avctx->channel_layout) {
             avctx->channels = av_get_channel_layout_nb_channels(avctx->channel_layout);
         }
+        if (avctx->channels < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Specified number of channels %d is not supported\n",
+                    avctx->channels);
+            ret = AVERROR(EINVAL);
+            goto free_and_end;
+        }
+        if(avctx->codec_type == AVMEDIA_TYPE_VIDEO) {
+            pixdesc = av_pix_fmt_desc_get(avctx->pix_fmt); /* NULL when pix_fmt has no descriptor */
+            if (pixdesc && (avctx->bits_per_raw_sample < 0 /* depth can only be checked against a known format */
+                || (avctx->bits_per_raw_sample > 8 && pixdesc->comp[0].depth <= 8))) {
+                av_log(avctx, AV_LOG_WARNING, "Specified bit depth %d not possible with the specified pixel formats depth %d\n",
+                    avctx->bits_per_raw_sample, pixdesc->comp[0].depth);
+                avctx->bits_per_raw_sample = pixdesc->comp[0].depth; /* fall back to the format's own depth */
+            }
+            if (avctx->width <= 0 || avctx->height <= 0) {
+                av_log(avctx, AV_LOG_ERROR, "dimensions not set\n");
+                ret = AVERROR(EINVAL);
+                goto free_and_end;
+            }
+        }
+        if (   (avctx->codec_type == AVMEDIA_TYPE_VIDEO || avctx->codec_type == AVMEDIA_TYPE_AUDIO)
+            && avctx->bit_rate>0 && avctx->bit_rate<1000) {
+            av_log(avctx, AV_LOG_WARNING, "Bitrate %"PRId64" is extremely low, maybe you mean %"PRId64"k\n", (int64_t)avctx->bit_rate, (int64_t)avctx->bit_rate);
+        }
 
         if (!avctx->rc_initial_buffer_occupancy)
             avctx->rc_initial_buffer_occupancy = avctx->rc_buffer_size * 3 / 4;
 
-        if (avctx->ticks_per_frame &&
+        if (avctx->ticks_per_frame && avctx->time_base.num &&
             avctx->ticks_per_frame > INT_MAX / avctx->time_base.num) {
             av_log(avctx, AV_LOG_ERROR,
                    "ticks_per_frame %d too large for the timebase %d/%d.",
@@ -1084,27 +1549,46 @@ FF_ENABLE_DEPRECATION_WARNINGS
         }
     }
 
-    if (avctx->codec->init && !(avctx->active_thread_type & FF_THREAD_FRAME)) {
+    avctx->pts_correction_num_faulty_pts =
+    avctx->pts_correction_num_faulty_dts = 0;
+    avctx->pts_correction_last_pts =
+    avctx->pts_correction_last_dts = INT64_MIN;
+
+    if (   !CONFIG_GRAY && avctx->flags & AV_CODEC_FLAG_GRAY
+        && avctx->codec_descriptor->type == AVMEDIA_TYPE_VIDEO)
+        av_log(avctx, AV_LOG_WARNING,
+               "gray decoding requested but not enabled at configuration time\n");
+
+    if (   avctx->codec->init && (!(avctx->active_thread_type&FF_THREAD_FRAME)
+        || avctx->internal->frame_thread_encoder)) {
         ret = avctx->codec->init(avctx);
         if (ret < 0) {
             goto free_and_end;
         }
     }
 
+    ret=0;
+
 #if FF_API_AUDIOENC_DELAY
     if (av_codec_is_encoder(avctx->codec))
         avctx->delay = avctx->initial_padding;
 #endif
 
     if (av_codec_is_decoder(avctx->codec)) {
+        if (!avctx->bit_rate)
+            avctx->bit_rate = get_bit_rate(avctx);
         /* validate channel layout from the decoder */
         if (avctx->channel_layout) {
             int channels = av_get_channel_layout_nb_channels(avctx->channel_layout);
             if (!avctx->channels)
                 avctx->channels = channels;
             else if (channels != avctx->channels) {
+                char buf[512];
+                av_get_channel_layout_string(buf, sizeof(buf), -1, avctx->channel_layout);
                 av_log(avctx, AV_LOG_WARNING,
-                       "channel layout does not match number of channels\n");
+                       "Channel layout '%s' with %d channels does not match specified number of channels %d: "
+                       "ignoring specified channel layout\n",
+                       buf, channels, avctx->channels);
                 avctx->channel_layout = 0;
             }
         }
@@ -1113,22 +1597,55 @@ FF_ENABLE_DEPRECATION_WARNINGS
             ret = AVERROR(EINVAL);
             goto free_and_end;
         }
+        if (avctx->sub_charenc) {
+            if (avctx->codec_type != AVMEDIA_TYPE_SUBTITLE) {
+                av_log(avctx, AV_LOG_ERROR, "Character encoding is only "
+                       "supported with subtitles codecs\n");
+                ret = AVERROR(EINVAL);
+                goto free_and_end;
+            } else if (avctx->codec_descriptor->props & AV_CODEC_PROP_BITMAP_SUB) {
+                av_log(avctx, AV_LOG_WARNING, "Codec '%s' is bitmap-based, "
+                       "subtitles character encoding will be ignored\n",
+                       avctx->codec_descriptor->name);
+                avctx->sub_charenc_mode = FF_SUB_CHARENC_MODE_DO_NOTHING;
+            } else {
+                /* input character encoding is set for a text based subtitle
+                 * codec at this point */
+                if (avctx->sub_charenc_mode == FF_SUB_CHARENC_MODE_AUTOMATIC)
+                    avctx->sub_charenc_mode = FF_SUB_CHARENC_MODE_PRE_DECODER;
+
+                if (avctx->sub_charenc_mode == FF_SUB_CHARENC_MODE_PRE_DECODER) {
+#if CONFIG_ICONV
+                    iconv_t cd = iconv_open("UTF-8", avctx->sub_charenc);
+                    if (cd == (iconv_t)-1) {
+                        ret = AVERROR(errno);
+                        av_log(avctx, AV_LOG_ERROR, "Unable to open iconv context "
+                               "with input character encoding \"%s\"\n", avctx->sub_charenc);
+                        goto free_and_end;
+                    }
+                    iconv_close(cd);
+#else
+                    av_log(avctx, AV_LOG_ERROR, "Character encoding subtitles "
+                           "conversion needs a libavcodec built with iconv support "
+                           "for this codec\n");
+                    ret = AVERROR(ENOSYS);
+                    goto free_and_end;
+#endif
+                }
+            }
+        }
 
 #if FF_API_AVCTX_TIMEBASE
         if (avctx->framerate.num > 0 && avctx->framerate.den > 0)
-            avctx->time_base = av_inv_q(avctx->framerate);
+            avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
 #endif
     }
-end:
-    if (!(codec->caps_internal & FF_CODEC_CAP_INIT_THREADSAFE) && codec->init) {
-        entangled_thread_counter--;
-
-        /* Release any user-supplied mutex. */
-        if (lockmgr_cb) {
-            (*lockmgr_cb)(&codec_mutex, AV_LOCK_RELEASE);
-        }
+    if (codec->priv_data_size > 0 && avctx->priv_data && codec->priv_class) {
+        av_assert0(*(const AVClass **)avctx->priv_data == codec->priv_class);
     }
 
+end:
+    ff_unlock_avcodec(codec);
     if (options) {
         av_dict_free(options);
         *options = tmp;
@@ -1140,7 +1657,7 @@ free_and_end:
         (avctx->codec->caps_internal & FF_CODEC_CAP_INIT_CLEANUP))
         avctx->codec->close(avctx);
 
-    if (avctx->priv_data && avctx->codec && avctx->codec->priv_class)
+    if (codec->priv_class && codec->priv_data_size)
         av_opt_free(avctx->priv_data);
     av_opt_free(avctx);
 
@@ -1153,6 +1670,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
     av_dict_free(&tmp);
     av_freep(&avctx->priv_data);
     if (avctx->internal) {
+        av_packet_free(&avctx->internal->buffer_pkt);
+        av_frame_free(&avctx->internal->buffer_frame);
         av_frame_free(&avctx->internal->to_free);
         av_freep(&avctx->internal->pool);
     }
@@ -1161,26 +1680,52 @@ FF_ENABLE_DEPRECATION_WARNINGS
     goto end;
 }
 
-int ff_alloc_packet(AVPacket *avpkt, int size)
+int ff_alloc_packet2(AVCodecContext *avctx, AVPacket *avpkt, int64_t size, int64_t min_size)
 {
-    if (size > INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE)
+    if (avpkt->size < 0) { /* reject a negative size on the incoming packet up front */
+        av_log(avctx, AV_LOG_ERROR, "Invalid negative user packet size %d\n", avpkt->size);
+        return AVERROR(EINVAL);
+    }
+    if (size < 0 || size > INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE) { /* size plus padding must still fit in an int */
+        av_log(avctx, AV_LOG_ERROR, "Invalid minimum required packet size %"PRId64" (max allowed is %d)\n",
+               size, INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE);
         return AVERROR(EINVAL);
+    }
+
+    if (avctx && 2*min_size < size) { // FIXME The factor needs to be finetuned
+        av_assert0(!avpkt->data || avpkt->data != avctx->internal->byte_buffer); /* caller must not pass the context's own byte_buffer back in */
+        if (!avpkt->data || avpkt->size < size) { /* no packet buffer, or one smaller than size: switch to byte_buffer */
+            av_fast_padded_malloc(&avctx->internal->byte_buffer, &avctx->internal->byte_buffer_size, size);
+            avpkt->data = avctx->internal->byte_buffer;
+            avpkt->size = avctx->internal->byte_buffer_size; /* full capacity for now; set to size further down */
+        }
+    }
 
     if (avpkt->data) {
         AVBufferRef *buf = avpkt->buf;
 
-        if (avpkt->size < size)
+        if (avpkt->size < size) {
+            av_log(avctx, AV_LOG_ERROR, "User packet is too small (%d < %"PRId64")\n", avpkt->size, size);
             return AVERROR(EINVAL);
+        }
 
         av_init_packet(avpkt);
         avpkt->buf      = buf;
         avpkt->size     = size;
         return 0;
     } else {
-        return av_new_packet(avpkt, size);
+        int ret = av_new_packet(avpkt, size); /* no buffer available at all: allocate a fresh packet */
+        if (ret < 0)
+            av_log(avctx, AV_LOG_ERROR, "Failed to allocate packet of size %"PRId64"\n", size);
+        return ret;
     }
 }
 
+int ff_alloc_packet(AVPacket *avpkt, int size)
+{
+    return ff_alloc_packet2(NULL, avpkt, size, 0); /* NULL avctx: ff_alloc_packet2() then skips its internal byte_buffer path */
+}
+
 /**
  * Pad last frame with silence.
  */
@@ -1194,6 +1739,7 @@ static int pad_last_frame(AVCodecContext *s, AVFrame **dst, const AVFrame *src)
 
     frame->format         = src->format;
     frame->channel_layout = src->channel_layout;
+    av_frame_set_channels(frame, av_frame_get_channels(src));
     frame->nb_samples     = s->frame_size;
     ret = av_frame_get_buffer(frame, 32);
     if (ret < 0)
@@ -1225,10 +1771,11 @@ int attribute_align_arg avcodec_encode_audio2(AVCodecContext *avctx,
                                               const AVFrame *frame,
                                               int *got_packet_ptr)
 {
-    AVFrame tmp;
+    AVFrame *extended_frame = NULL;
     AVFrame *padded_frame = NULL;
     int ret;
-    int user_packet = !!avpkt->data;
+    AVPacket user_pkt = *avpkt;
+    int needs_realloc = !user_pkt.data;
 
     *got_packet_ptr = 0;
 
@@ -1254,9 +1801,13 @@ int attribute_align_arg avcodec_encode_audio2(AVCodecContext *avctx,
         }
         av_log(avctx, AV_LOG_WARNING, "extended_data is not set.\n");
 
-        tmp = *frame;
-        tmp.extended_data = tmp.data;
-        frame = &tmp;
+        extended_frame = av_frame_alloc();
+        if (!extended_frame)
+            return AVERROR(ENOMEM);
+
+        memcpy(extended_frame, frame, sizeof(AVFrame));
+        extended_frame->extended_data = extended_frame->data;
+        frame = extended_frame;
     }
 
     /* extract audio service type metadata */
@@ -1269,26 +1820,32 @@ int attribute_align_arg avcodec_encode_audio2(AVCodecContext *avctx,
     /* check for valid frame size */
     if (frame) {
         if (avctx->codec->capabilities & AV_CODEC_CAP_SMALL_LAST_FRAME) {
-            if (frame->nb_samples > avctx->frame_size)
-                return AVERROR(EINVAL);
+            if (frame->nb_samples > avctx->frame_size) {
+                av_log(avctx, AV_LOG_ERROR, "more samples than frame size (avcodec_encode_audio2)\n");
+                ret = AVERROR(EINVAL);
+                goto end;
+            }
         } else if (!(avctx->codec->capabilities & AV_CODEC_CAP_VARIABLE_FRAME_SIZE)) {
             if (frame->nb_samples < avctx->frame_size &&
                 !avctx->internal->last_audio_frame) {
                 ret = pad_last_frame(avctx, &padded_frame, frame);
                 if (ret < 0)
-                    return ret;
+                    goto end;
 
                 frame = padded_frame;
                 avctx->internal->last_audio_frame = 1;
             }
 
             if (frame->nb_samples != avctx->frame_size) {
+                av_log(avctx, AV_LOG_ERROR, "nb_samples (%d) != frame_size (%d) (avcodec_encode_audio2)\n", frame->nb_samples, avctx->frame_size);
                 ret = AVERROR(EINVAL);
                 goto end;
             }
         }
     }
 
+    av_assert0(avctx->codec->encode2);
+
     ret = avctx->codec->encode2(avctx, avpkt, frame, got_packet_ptr);
     if (!ret) {
         if (*got_packet_ptr) {
@@ -1303,9 +1860,29 @@ int attribute_align_arg avcodec_encode_audio2(AVCodecContext *avctx,
         } else {
             avpkt->size = 0;
         }
+    }
+    if (avpkt->data && avpkt->data == avctx->internal->byte_buffer) {
+        needs_realloc = 0;
+        if (user_pkt.data) {
+            if (user_pkt.size >= avpkt->size) {
+                memcpy(user_pkt.data, avpkt->data, avpkt->size);
+            } else {
+                av_log(avctx, AV_LOG_ERROR, "Provided packet is too small, needs to be %d\n", avpkt->size);
+                avpkt->size = user_pkt.size;
+                ret = -1;
+            }
+            avpkt->buf      = user_pkt.buf;
+            avpkt->data     = user_pkt.data;
+        } else {
+            if (av_dup_packet(avpkt) < 0) {
+                ret = AVERROR(ENOMEM);
+            }
+        }
+    }
 
-        if (!user_packet && avpkt->size) {
-            ret = av_buffer_realloc(&avpkt->buf, avpkt->size);
+    if (!ret) {
+        if (needs_realloc && avpkt->data) {
+            ret = av_buffer_realloc(&avpkt->buf, avpkt->size + AV_INPUT_BUFFER_PADDING_SIZE);
             if (ret >= 0)
                 avpkt->data = avpkt->buf->data;
         }
@@ -1326,6 +1903,7 @@ int attribute_align_arg avcodec_encode_audio2(AVCodecContext *avctx,
 
 end:
     av_frame_free(&padded_frame);
+    av_free(extended_frame);
 
 #if FF_API_AUDIOENC_DELAY
     avctx->delay = avctx->initial_padding;
@@ -1340,7 +1918,8 @@ int attribute_align_arg avcodec_encode_video2(AVCodecContext *avctx,
                                               int *got_packet_ptr)
 {
     int ret;
-    int user_packet = !!avpkt->data;
+    AVPacket user_pkt = *avpkt;
+    int needs_realloc = !user_pkt.data;
 
     *got_packet_ptr = 0;
 
@@ -1349,6 +1928,13 @@ int attribute_align_arg avcodec_encode_video2(AVCodecContext *avctx,
         return AVERROR(ENOSYS);
     }
 
+    if(CONFIG_FRAME_THREAD_ENCODER &&
+       avctx->internal->frame_thread_encoder && (avctx->active_thread_type&FF_THREAD_FRAME))
+        return ff_thread_video_encode_frame(avctx, avpkt, frame, got_packet_ptr);
+
+    if ((avctx->flags&AV_CODEC_FLAG_PASS1) && avctx->stats_out)
+        avctx->stats_out[0] = '\0';
+
     if (!(avctx->codec->capabilities & AV_CODEC_CAP_DELAY) && !frame) {
         av_packet_unref(avpkt);
         av_init_packet(avpkt);
@@ -1359,17 +1945,43 @@ int attribute_align_arg avcodec_encode_video2(AVCodecContext *avctx,
     if (av_image_check_size(avctx->width, avctx->height, 0, avctx))
         return AVERROR(EINVAL);
 
+    if (frame && frame->format == AV_PIX_FMT_NONE)
+        av_log(avctx, AV_LOG_WARNING, "AVFrame.format is not set\n");
+    if (frame && (frame->width == 0 || frame->height == 0))
+        av_log(avctx, AV_LOG_WARNING, "AVFrame.width or height is not set\n");
+
     av_assert0(avctx->codec->encode2);
 
     ret = avctx->codec->encode2(avctx, avpkt, frame, got_packet_ptr);
+    av_assert0(ret <= 0);
+
+    if (avpkt->data && avpkt->data == avctx->internal->byte_buffer) {
+        needs_realloc = 0;
+        if (user_pkt.data) {
+            if (user_pkt.size >= avpkt->size) {
+                memcpy(user_pkt.data, avpkt->data, avpkt->size);
+            } else {
+                av_log(avctx, AV_LOG_ERROR, "Provided packet is too small, needs to be %d\n", avpkt->size);
+                avpkt->size = user_pkt.size;
+                ret = -1;
+            }
+            avpkt->buf      = user_pkt.buf;
+            avpkt->data     = user_pkt.data;
+        } else {
+            if (av_dup_packet(avpkt) < 0) {
+                ret = AVERROR(ENOMEM);
+            }
+        }
+    }
+
     if (!ret) {
         if (!*got_packet_ptr)
             avpkt->size = 0;
         else if (!(avctx->codec->capabilities & AV_CODEC_CAP_DELAY))
             avpkt->pts = avpkt->dts = frame->pts;
 
-        if (!user_packet && avpkt->size) {
-            ret = av_buffer_realloc(&avpkt->buf, avpkt->size);
+        if (needs_realloc && avpkt->data) {
+            ret = av_buffer_realloc(&avpkt->buf, avpkt->size + AV_INPUT_BUFFER_PADDING_SIZE);
             if (ret >= 0)
                 avpkt->data = avpkt->buf->data;
         }
@@ -1392,18 +2004,54 @@ int avcodec_encode_subtitle(AVCodecContext *avctx, uint8_t *buf, int buf_size,
         av_log(avctx, AV_LOG_ERROR, "start_display_time must be 0.\n");
         return -1;
     }
-    if (sub->num_rects == 0 || !sub->rects)
-        return -1;
+
     ret = avctx->codec->encode_sub(avctx, buf, buf_size, sub);
     avctx->frame_number++;
     return ret;
 }
 
+/**
+ * Attempt to guess proper monotonic timestamps for decoded frames
+ * which might have incorrect times. Input timestamps may wrap around, in
+ * which case the output will as well.
+ *
+ * @param ctx           codec context; its pts_correction_* fields are read and updated
+ * @param reordered_pts the pts field of the decoded AVPacket, as passed through AVFrame.pkt_pts
+ * @param dts           the dts field of the decoded AVPacket
+ * @return one of the input values, may be AV_NOPTS_VALUE
+ */
+static int64_t guess_correct_pts(AVCodecContext *ctx,
+                                 int64_t reordered_pts, int64_t dts)
+{
+    int64_t pts = AV_NOPTS_VALUE;
+
+    if (dts != AV_NOPTS_VALUE) {
+        ctx->pts_correction_num_faulty_dts += dts <= ctx->pts_correction_last_dts; /* count non-increasing dts */
+        ctx->pts_correction_last_dts = dts;
+    } else if (reordered_pts != AV_NOPTS_VALUE)
+        ctx->pts_correction_last_dts = reordered_pts; /* no dts: remember the pts in its place */
+
+    if (reordered_pts != AV_NOPTS_VALUE) {
+        ctx->pts_correction_num_faulty_pts += reordered_pts <= ctx->pts_correction_last_pts; /* count non-increasing pts */
+        ctx->pts_correction_last_pts = reordered_pts;
+    } else if(dts != AV_NOPTS_VALUE)
+        ctx->pts_correction_last_pts = dts; /* no pts: remember the dts in its place */
+
+    if ((ctx->pts_correction_num_faulty_pts<=ctx->pts_correction_num_faulty_dts || dts == AV_NOPTS_VALUE)
+       && reordered_pts != AV_NOPTS_VALUE)
+        pts = reordered_pts; /* pts has not been faulty more often than dts, or dts is missing */
+    else
+        pts = dts; /* may be AV_NOPTS_VALUE */
+
+    return pts;
+}
+
 static int apply_param_change(AVCodecContext *avctx, AVPacket *avpkt)
 {
     int size = 0, ret;
     const uint8_t *data;
     uint32_t flags;
+    int64_t val;
 
     data = av_packet_get_side_data(avpkt, AV_PKT_DATA_PARAM_CHANGE, &size);
     if (!data)
@@ -1425,7 +2073,13 @@ static int apply_param_change(AVCodecContext *avctx, AVPacket *avpkt)
     if (flags & AV_SIDE_DATA_PARAM_CHANGE_CHANNEL_COUNT) {
         if (size < 4)
             goto fail;
-        avctx->channels = bytestream_get_le32(&data);
+        val = bytestream_get_le32(&data);
+        if (val <= 0 || val > INT_MAX) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid channel count");
+            ret = AVERROR_INVALIDDATA;
+            goto fail2;
+        }
+        avctx->channels = val;
         size -= 4;
     }
     if (flags & AV_SIDE_DATA_PARAM_CHANGE_CHANNEL_LAYOUT) {
@@ -1437,7 +2091,13 @@ static int apply_param_change(AVCodecContext *avctx, AVPacket *avpkt)
     if (flags & AV_SIDE_DATA_PARAM_CHANGE_SAMPLE_RATE) {
         if (size < 4)
             goto fail;
-        avctx->sample_rate = bytestream_get_le32(&data);
+        val = bytestream_get_le32(&data);
+        if (val <= 0 || val > INT_MAX) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid sample rate");
+            ret = AVERROR_INVALIDDATA;
+            goto fail2;
+        }
+        avctx->sample_rate = val;
         size -= 4;
     }
     if (flags & AV_SIDE_DATA_PARAM_CHANGE_DIMENSIONS) {
@@ -1482,7 +2142,7 @@ static int unrefcount_frame(AVCodecInternal *avci, AVFrame *frame)
     memcpy(frame->data,     avci->to_free->data,     sizeof(frame->data));
     memcpy(frame->linesize, avci->to_free->linesize, sizeof(frame->linesize));
     if (avci->to_free->extended_data != avci->to_free->data) {
-        int planes = av_get_channel_layout_nb_channels(avci->to_free->channel_layout);
+        int planes = av_frame_get_channels(avci->to_free);
         int size   = planes * sizeof(*frame->extended_data);
 
         if (!size) {
@@ -1505,26 +2165,36 @@ static int unrefcount_frame(AVCodecInternal *avci, AVFrame *frame)
     frame->height         = avci->to_free->height;
     frame->channel_layout = avci->to_free->channel_layout;
     frame->nb_samples     = avci->to_free->nb_samples;
+    av_frame_set_channels(frame, av_frame_get_channels(avci->to_free));
 
     return 0;
 }
 
 int attribute_align_arg avcodec_decode_video2(AVCodecContext *avctx, AVFrame *picture,
                                               int *got_picture_ptr,
-                                              AVPacket *avpkt)
+                                              const AVPacket *avpkt)
 {
     AVCodecInternal *avci = avctx->internal;
     int ret;
+    // copy to ensure we do not change avpkt
+    AVPacket tmp = *avpkt;
 
-    *got_picture_ptr = 0;
-    if ((avctx->coded_width || avctx->coded_height) && av_image_check_size(avctx->coded_width, avctx->coded_height, 0, avctx))
-        return -1;
+    if (!avctx->codec)
+        return AVERROR(EINVAL);
+    if (avctx->codec->type != AVMEDIA_TYPE_VIDEO) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid media type for video\n");
+        return AVERROR(EINVAL);
+    }
 
     if (!avctx->codec->decode) {
         av_log(avctx, AV_LOG_ERROR, "This decoder requires using the avcodec_send_packet() API.\n");
         return AVERROR(ENOSYS);
     }
 
+    *got_picture_ptr = 0;
+    if ((avctx->coded_width || avctx->coded_height) && av_image_check_size(avctx->coded_width, avctx->coded_height, 0, avctx))
+        return AVERROR(EINVAL);
+
     avctx->internal->pkt = avpkt;
     ret = apply_param_change(avctx, avpkt);
     if (ret < 0)
@@ -1534,25 +2204,44 @@ int attribute_align_arg avcodec_decode_video2(AVCodecContext *avctx, AVFrame *pi
 
     if ((avctx->codec->capabilities & AV_CODEC_CAP_DELAY) || avpkt->size ||
         (avctx->active_thread_type & FF_THREAD_FRAME)) {
+        int did_split = av_packet_split_side_data(&tmp);
+        ret = apply_param_change(avctx, &tmp);
+        if (ret < 0)
+            goto fail;
+
+        avctx->internal->pkt = &tmp;
         if (HAVE_THREADS && avctx->active_thread_type & FF_THREAD_FRAME)
             ret = ff_thread_decode_frame(avctx, picture, got_picture_ptr,
-                                         avpkt);
+                                         &tmp);
         else {
             ret = avctx->codec->decode(avctx, picture, got_picture_ptr,
-                                       avpkt);
+                                       &tmp);
             if (!(avctx->codec->caps_internal & FF_CODEC_CAP_SETS_PKT_DTS))
                 picture->pkt_dts = avpkt->dts;
+
+            if(!avctx->has_b_frames){
+                av_frame_set_pkt_pos(picture, avpkt->pos);
+            }
+            //FIXME these should be under if(!avctx->has_b_frames)
             /* get_buffer is supposed to set frame parameters */
             if (!(avctx->codec->capabilities & AV_CODEC_CAP_DR1)) {
-                picture->sample_aspect_ratio = avctx->sample_aspect_ratio;
-                picture->width               = avctx->width;
-                picture->height              = avctx->height;
-                picture->format              = avctx->pix_fmt;
+                if (!picture->sample_aspect_ratio.num)    picture->sample_aspect_ratio = avctx->sample_aspect_ratio;
+                if (!picture->width)                      picture->width               = avctx->width;
+                if (!picture->height)                     picture->height              = avctx->height;
+                if (picture->format == AV_PIX_FMT_NONE)   picture->format              = avctx->pix_fmt;
             }
         }
 
+fail:
         emms_c(); //needed to avoid an emms_c() call before every return;
 
+        avctx->internal->pkt = NULL;
+        if (did_split) {
+            av_packet_free_side_data(&tmp);
+            if(ret == tmp.size)
+                ret = avpkt->size;
+        }
+
         if (*got_picture_ptr) {
             if (!avctx->refcounted_frames) {
                 int err = unrefcount_frame(avci, picture);
@@ -1561,14 +2250,22 @@ int attribute_align_arg avcodec_decode_video2(AVCodecContext *avctx, AVFrame *pi
             }
 
             avctx->frame_number++;
+            av_frame_set_best_effort_timestamp(picture,
+                                               guess_correct_pts(avctx,
+                                                                 picture->pkt_pts,
+                                                                 picture->pkt_dts));
         } else
             av_frame_unref(picture);
     } else
         ret = 0;
 
+    /* many decoders assign whole AVFrames, thus overwriting extended_data;
+     * make sure it's set correctly */
+    av_assert0(!picture->extended_data || picture->extended_data == picture->data);
+
 #if FF_API_AVCTX_TIMEBASE
     if (avctx->framerate.num > 0 && avctx->framerate.den > 0)
-        avctx->time_base = av_inv_q(avctx->framerate);
+        avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
 #endif
 
     return ret;
@@ -1577,39 +2274,146 @@ int attribute_align_arg avcodec_decode_video2(AVCodecContext *avctx, AVFrame *pi
 int attribute_align_arg avcodec_decode_audio4(AVCodecContext *avctx,
                                               AVFrame *frame,
                                               int *got_frame_ptr,
-                                              AVPacket *avpkt)
+                                              const AVPacket *avpkt)
 {
     AVCodecInternal *avci = avctx->internal;
     int ret = 0;
 
     *got_frame_ptr = 0;
 
+    if (!avctx->codec)
+        return AVERROR(EINVAL);
+
     if (!avctx->codec->decode) {
         av_log(avctx, AV_LOG_ERROR, "This decoder requires using the avcodec_send_packet() API.\n");
         return AVERROR(ENOSYS);
     }
 
-    avctx->internal->pkt = avpkt;
-
     if (!avpkt->data && avpkt->size) {
         av_log(avctx, AV_LOG_ERROR, "invalid packet: NULL data, size != 0\n");
         return AVERROR(EINVAL);
     }
-
-    ret = apply_param_change(avctx, avpkt);
-    if (ret < 0)
-        return ret;
+    if (avctx->codec->type != AVMEDIA_TYPE_AUDIO) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid media type for audio\n");
+        return AVERROR(EINVAL);
+    }
 
     av_frame_unref(frame);
 
-    if ((avctx->codec->capabilities & AV_CODEC_CAP_DELAY) || avpkt->size) {
-        ret = avctx->codec->decode(avctx, frame, got_frame_ptr, avpkt);
+    if ((avctx->codec->capabilities & AV_CODEC_CAP_DELAY) || avpkt->size || (avctx->active_thread_type & FF_THREAD_FRAME)) {
+        uint8_t *side;
+        int side_size;
+        uint32_t discard_padding = 0;
+        uint8_t skip_reason = 0;
+        uint8_t discard_reason = 0;
+        // copy to ensure we do not change avpkt
+        AVPacket tmp = *avpkt;
+        int did_split = av_packet_split_side_data(&tmp);
+        ret = apply_param_change(avctx, &tmp);
+        if (ret < 0)
+            goto fail;
+
+        avctx->internal->pkt = &tmp;
+        if (HAVE_THREADS && avctx->active_thread_type & FF_THREAD_FRAME)
+            ret = ff_thread_decode_frame(avctx, frame, got_frame_ptr, &tmp);
+        else {
+            ret = avctx->codec->decode(avctx, frame, got_frame_ptr, &tmp);
+            av_assert0(ret <= tmp.size);
+            frame->pkt_dts = avpkt->dts;
+        }
         if (ret >= 0 && *got_frame_ptr) {
             avctx->frame_number++;
-            frame->pkt_dts = avpkt->dts;
+            av_frame_set_best_effort_timestamp(frame,
+                                               guess_correct_pts(avctx,
+                                                                 frame->pkt_pts,
+                                                                 frame->pkt_dts));
             if (frame->format == AV_SAMPLE_FMT_NONE)
                 frame->format = avctx->sample_fmt;
+            if (!frame->channel_layout)
+                frame->channel_layout = avctx->channel_layout;
+            if (!av_frame_get_channels(frame))
+                av_frame_set_channels(frame, avctx->channels);
+            if (!frame->sample_rate)
+                frame->sample_rate = avctx->sample_rate;
+        }
+
+        side= av_packet_get_side_data(avctx->internal->pkt, AV_PKT_DATA_SKIP_SAMPLES, &side_size);
+        if(side && side_size>=10) {
+            avctx->internal->skip_samples = AV_RL32(side);
+            discard_padding = AV_RL32(side + 4);
+            av_log(avctx, AV_LOG_DEBUG, "skip %d / discard %d samples due to side data\n",
+                   avctx->internal->skip_samples, (int)discard_padding);
+            skip_reason = AV_RL8(side + 8);
+            discard_reason = AV_RL8(side + 9);
+        }
+        if (avctx->internal->skip_samples > 0 && *got_frame_ptr &&
+            !(avctx->flags2 & AV_CODEC_FLAG2_SKIP_MANUAL)) {
+            if(frame->nb_samples <= avctx->internal->skip_samples){
+                *got_frame_ptr = 0;
+                avctx->internal->skip_samples -= frame->nb_samples;
+                av_log(avctx, AV_LOG_DEBUG, "skip whole frame, skip left: %d\n",
+                       avctx->internal->skip_samples);
+            } else {
+                av_samples_copy(frame->extended_data, frame->extended_data, 0, avctx->internal->skip_samples,
+                                frame->nb_samples - avctx->internal->skip_samples, avctx->channels, frame->format);
+                if(avctx->pkt_timebase.num && avctx->sample_rate) {
+                    int64_t diff_ts = av_rescale_q(avctx->internal->skip_samples,
+                                                   (AVRational){1, avctx->sample_rate},
+                                                   avctx->pkt_timebase);
+                    if(frame->pkt_pts!=AV_NOPTS_VALUE)
+                        frame->pkt_pts += diff_ts;
+                    if(frame->pkt_dts!=AV_NOPTS_VALUE)
+                        frame->pkt_dts += diff_ts;
+                    if (av_frame_get_pkt_duration(frame) >= diff_ts)
+                        av_frame_set_pkt_duration(frame, av_frame_get_pkt_duration(frame) - diff_ts);
+                } else {
+                    av_log(avctx, AV_LOG_WARNING, "Could not update timestamps for skipped samples.\n");
+                }
+                av_log(avctx, AV_LOG_DEBUG, "skip %d/%d samples\n",
+                       avctx->internal->skip_samples, frame->nb_samples);
+                frame->nb_samples -= avctx->internal->skip_samples;
+                avctx->internal->skip_samples = 0;
+            }
+        }
+
+        if (discard_padding > 0 && discard_padding <= frame->nb_samples && *got_frame_ptr &&
+            !(avctx->flags2 & AV_CODEC_FLAG2_SKIP_MANUAL)) {
+            if (discard_padding == frame->nb_samples) {
+                *got_frame_ptr = 0;
+            } else {
+                if(avctx->pkt_timebase.num && avctx->sample_rate) {
+                    int64_t diff_ts = av_rescale_q(frame->nb_samples - discard_padding,
+                                                   (AVRational){1, avctx->sample_rate},
+                                                   avctx->pkt_timebase);
+                    av_frame_set_pkt_duration(frame, diff_ts);
+                } else {
+                    av_log(avctx, AV_LOG_WARNING, "Could not update timestamps for discarded samples.\n");
+                }
+                av_log(avctx, AV_LOG_DEBUG, "discard %d/%d samples\n",
+                       (int)discard_padding, frame->nb_samples);
+                frame->nb_samples -= discard_padding;
+            }
+        }
+
+        if ((avctx->flags2 & AV_CODEC_FLAG2_SKIP_MANUAL) && *got_frame_ptr) {
+            AVFrameSideData *fside = av_frame_new_side_data(frame, AV_FRAME_DATA_SKIP_SAMPLES, 10);
+            if (fside) {
+                AV_WL32(fside->data, avctx->internal->skip_samples);
+                AV_WL32(fside->data + 4, discard_padding);
+                AV_WL8(fside->data + 8, skip_reason);
+                AV_WL8(fside->data + 9, discard_reason);
+                avctx->internal->skip_samples = 0;
+            }
+        }
+fail:
+        avctx->internal->pkt = NULL;
+        if (did_split) {
+            av_packet_free_side_data(&tmp);
+            if(ret == tmp.size)
+                ret = avpkt->size;
+        }
 
+        if (ret >= 0 && *got_frame_ptr) {
             if (!avctx->refcounted_frames) {
                 int err = unrefcount_frame(avci, frame);
                 if (err < 0)
@@ -1619,21 +2423,264 @@ int attribute_align_arg avcodec_decode_audio4(AVCodecContext *avctx,
             av_frame_unref(frame);
     }
 
+    av_assert0(ret <= avpkt->size);
+
+    return ret;
+}
+
+#define UTF8_MAX_BYTES 4 /* 5 and 6 bytes sequences should not be used */
+static int recode_subtitle(AVCodecContext *avctx,
+                           AVPacket *outpkt, const AVPacket *inpkt)
+{ /* Recode inpkt's payload from avctx->sub_charenc to UTF-8 into a new buffer attached to outpkt; returns 0 or a negative error. */
+#if CONFIG_ICONV
+    iconv_t cd = (iconv_t)-1;
+    int ret = 0;
+    char *inb, *outb;
+    size_t inl, outl;
+    AVPacket tmp;
+#endif
+
+    if (avctx->sub_charenc_mode != FF_SUB_CHARENC_MODE_PRE_DECODER || inpkt->size == 0)
+        return 0; /* nothing to recode: outpkt is left exactly as the caller set it */
+
+#if CONFIG_ICONV
+    cd = iconv_open("UTF-8", avctx->sub_charenc);
+    av_assert0(cd != (iconv_t)-1);
+
+    inb = inpkt->data;
+    inl = inpkt->size;
+
+    if (inl >= INT_MAX / UTF8_MAX_BYTES - AV_INPUT_BUFFER_PADDING_SIZE) {
+        av_log(avctx, AV_LOG_ERROR, "Subtitles packet is too big for recoding\n");
+        ret = AVERROR(ENOMEM);
+        goto end;
+    }
+
+    ret = av_new_packet(&tmp, inl * UTF8_MAX_BYTES); /* worst case: UTF8_MAX_BYTES output bytes per input byte */
+    if (ret < 0)
+        goto end;
+    outpkt->buf  = tmp.buf;
+    outpkt->data = tmp.data;
+    outpkt->size = tmp.size;
+    outb = outpkt->data;
+    outl = outpkt->size;
+
+    if (iconv(cd, &inb, &inl, &outb, &outl) == (size_t)-1 ||
+        iconv(cd, NULL, NULL, &outb, &outl) == (size_t)-1 ||
+        outl >= outpkt->size || inl != 0) {
+        ret = FFMIN(AVERROR(errno), -1); /* guarantees a negative return even if errno is 0 */
+        av_log(avctx, AV_LOG_ERROR, "Unable to recode subtitle event \"%s\" "
+               "from %s to UTF-8\n", inpkt->data, avctx->sub_charenc);
+        av_packet_unref(&tmp); /* outpkt->buf/data now dangle; callers must not use outpkt after a failure */
+        goto end;
+    }
+    outpkt->size -= outl; /* outl is the number of output bytes iconv left unused */
+    memset(outpkt->data + outpkt->size, 0, outl); /* zero the unused tail of the buffer */
 
+end:
+    if (cd != (iconv_t)-1)
+        iconv_close(cd);
     return ret;
+#else
+    av_log(avctx, AV_LOG_ERROR, "requesting subtitles recoding without iconv\n");
+    return AVERROR(EINVAL);
+#endif
+}
+
+static int utf8_check(const uint8_t *str)
+{ /* Return 1 if the NUL-terminated string str passes the UTF-8 checks below, 0 otherwise. */
+    const uint8_t *byte;
+    uint32_t codepoint, min;
+
+    while (*str) {
+        byte = str;
+        GET_UTF8(codepoint, *(byte++), return 0;);
+        min = byte - str == 1 ? 0 : byte - str == 2 ? 0x80 : /* smallest code point allowed for this sequence length */
+              1 << (5 * (byte - str) - 4);                   /* 3 bytes -> 0x800, 4 bytes -> 0x10000 */
+        if (codepoint < min || codepoint >= 0x110000 ||      /* overlong encoding or beyond U+10FFFF */
+            codepoint == 0xFFFE /* byte-swapped BOM (noncharacter) */ ||
+            codepoint >= 0xD800 && codepoint <= 0xDFFF /* surrogates */)
+            return 0;
+        str = byte; /* advance past the sequence just decoded */
+    }
+    return 1;
+}
+
+#if FF_API_ASS_TIMING
+static void insert_ts(AVBPrint *buf, int ts)
+{ /* Append ts, counted in 1/100 s units, to buf as "H:MM:SS.CC,"; ts == -1 appends the fixed "9:59:59.99,". */
+    if (ts == -1) {
+        av_bprintf(buf, "9:59:59.99,");
+    } else {
+        int h, m, s;
+
+        h = ts/360000;  ts -= 360000*h; /* 360000 hundredths per hour */
+        m = ts/  6000;  ts -=   6000*m; /* 6000 hundredths per minute */
+        s = ts/   100;  ts -=    100*s; /* what remains in ts is the hundredths part */
+        av_bprintf(buf, "%d:%02d:%02d.%02d,", h, m, s, ts);
+    }
+}
+
+static int convert_sub_to_old_ass_form(AVSubtitle *sub, const AVPacket *pkt, AVRational tb)
+{ /* Rewrite each ASS rect of sub into a "Dialogue: ..." line with timestamps taken from pkt (in time base tb); 0 or AVERROR(ENOMEM). */
+    int i;
+    AVBPrint buf;
+
+    av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
+
+    for (i = 0; i < sub->num_rects; i++) {
+        char *final_dialog;
+        const char *dialog;
+        AVSubtitleRect *rect = sub->rects[i];
+        int ts_start, ts_duration = -1;
+        long int layer;
+
+        if (rect->type != SUBTITLE_ASS || !strncmp(rect->ass, "Dialogue: ", 10))
+            continue; /* not ASS, or already starts with "Dialogue: " */
+
+        av_bprint_clear(&buf);
+
+        /* skip ReadOrder */
+        dialog = strchr(rect->ass, ',');
+        if (!dialog)
+            continue;
+        dialog++;
+
+        /* extract Layer or Marked */
+        layer = strtol(dialog, (char**)&dialog, 10);
+        if (*dialog != ',')
+            continue;
+        dialog++;
+
+        /* rescale timing to a 1/100 s time base (centiseconds) */
+        ts_start = av_rescale_q(pkt->pts, tb, av_make_q(1, 100));
+        if (pkt->duration != -1)
+            ts_duration = av_rescale_q(pkt->duration, tb, av_make_q(1, 100));
+        sub->end_display_time = FFMAX(sub->end_display_time, 10 * ts_duration); /* 10 * centiseconds = milliseconds */
+
+        /* construct ASS (standalone file form with timestamps) string */
+        av_bprintf(&buf, "Dialogue: %ld,", layer);
+        insert_ts(&buf, ts_start);
+        insert_ts(&buf, ts_duration == -1 ? -1 : ts_start + ts_duration);
+        av_bprintf(&buf, "%s\r\n", dialog);
+
+        final_dialog = av_strdup(buf.str);
+        if (!av_bprint_is_complete(&buf) || !final_dialog) {
+            av_freep(&final_dialog);
+            av_bprint_finalize(&buf, NULL);
+            return AVERROR(ENOMEM);
+        }
+        av_freep(&rect->ass); /* replace the rect's text with the newly built line */
+        rect->ass = final_dialog;
+    }
+
+    av_bprint_finalize(&buf, NULL);
+    return 0;
 }
+#endif
 
 int avcodec_decode_subtitle2(AVCodecContext *avctx, AVSubtitle *sub,
                              int *got_sub_ptr,
                              AVPacket *avpkt)
 {
-    int ret;
+    int i, ret = 0; /* ret: value returned by the codec's decode(), or a negative AVERROR */
+
+    if (!avpkt->data && avpkt->size) {
+        av_log(avctx, AV_LOG_ERROR, "invalid packet: NULL data, size != 0\n");
+        return AVERROR(EINVAL);
+    }
+    if (!avctx->codec)
+        return AVERROR(EINVAL);
+    if (avctx->codec->type != AVMEDIA_TYPE_SUBTITLE) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid media type for subtitles\n");
+        return AVERROR(EINVAL);
+    }
 
-    avctx->internal->pkt = avpkt;
     *got_sub_ptr = 0;
-    ret = avctx->codec->decode(avctx, sub, got_sub_ptr, avpkt);
-    if (*got_sub_ptr)
-        avctx->frame_number++;
+    get_subtitle_defaults(sub);
+
+    if ((avctx->codec->capabilities & AV_CODEC_CAP_DELAY) || avpkt->size) {
+        AVPacket pkt_recoded;
+        AVPacket tmp = *avpkt; /* shallow copy: side data is split off the copy, not off avpkt */
+        int did_split = av_packet_split_side_data(&tmp);
+        //apply_param_change(avctx, &tmp);
+
+        if (did_split) {
+            /* FFMIN() prevents overflow in case the packet wasn't allocated with
+             * proper padding.
+             * If the side data is smaller than the buffer padding size, the
+             * remaining bytes should have already been filled with zeros by the
+             * original packet allocation anyway. */
+            memset(tmp.data + tmp.size, 0,
+                   FFMIN(avpkt->size - tmp.size, AV_INPUT_BUFFER_PADDING_SIZE));
+        }
+
+        pkt_recoded = tmp; /* stays identical to tmp unless recode_subtitle() attaches a new buffer */
+        ret = recode_subtitle(avctx, &pkt_recoded, &tmp);
+        if (ret < 0) {
+            *got_sub_ptr = 0;
+        } else {
+            avctx->internal->pkt = &pkt_recoded;
+
+            if (avctx->pkt_timebase.num && avpkt->pts != AV_NOPTS_VALUE)
+                sub->pts = av_rescale_q(avpkt->pts,
+                                        avctx->pkt_timebase, AV_TIME_BASE_Q);
+            ret = avctx->codec->decode(avctx, sub, got_sub_ptr, &pkt_recoded);
+            av_assert1((ret >= 0) >= !!*got_sub_ptr &&
+                       !!*got_sub_ptr >= !!sub->num_rects);
+
+#if FF_API_ASS_TIMING
+            if (avctx->sub_text_format == FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS
+                && *got_sub_ptr && sub->num_rects) {
+                const AVRational tb = avctx->pkt_timebase.num ? avctx->pkt_timebase
+                                                              : avctx->time_base;
+                int err = convert_sub_to_old_ass_form(sub, avpkt, tb);
+                if (err < 0)
+                    ret = err;
+            }
+#endif
+
+            if (sub->num_rects && !sub->end_display_time && avpkt->duration &&
+                avctx->pkt_timebase.num) {
+                AVRational ms = { 1, 1000 };
+                sub->end_display_time = av_rescale_q(avpkt->duration,
+                                                     avctx->pkt_timebase, ms);
+            }
+
+            for (i = 0; i < sub->num_rects; i++) {
+                if (sub->rects[i]->ass && !utf8_check(sub->rects[i]->ass)) {
+                    av_log(avctx, AV_LOG_ERROR, "Invalid UTF-8 in decoded subtitles text; "
+                           "maybe missing -sub_charenc option\n");
+                    avsubtitle_free(sub);
+                    ret = AVERROR_INVALIDDATA; /* no early return: the cleanup below must still run */
+                    break;
+                }
+            }
+
+            if (tmp.data != pkt_recoded.data) { // did we recode?
+                /* prevent from destroying side data from original packet */
+                pkt_recoded.side_data = NULL;
+                pkt_recoded.side_data_elems = 0;
+
+                av_packet_unref(&pkt_recoded);
+            }
+            if (avctx->codec_descriptor->props & AV_CODEC_PROP_BITMAP_SUB)
+                sub->format = 0;
+            else if (avctx->codec_descriptor->props & AV_CODEC_PROP_TEXT_SUB)
+                sub->format = 1;
+            avctx->internal->pkt = NULL; /* pkt_recoded is a local; do not leave a pointer to it behind */
+        }
+
+        if (did_split) {
+            av_packet_free_side_data(&tmp);
+            if(ret == tmp.size) /* whole split payload consumed: report the original packet size */
+                ret = avpkt->size;
+        }
+
+        if (*got_sub_ptr)
+            avctx->frame_number++;
+    }
+
     return ret;
 }
 
@@ -1688,6 +2735,9 @@ static int do_decode(AVCodecContext *avctx, AVPacket *pkt)
         ret = AVERROR(EINVAL);
     }
 
+    if (ret == AVERROR(EAGAIN))
+        ret = pkt->size;
+
     if (ret < 0)
         return ret;
 
@@ -1737,11 +2787,17 @@ int attribute_align_arg avcodec_send_packet(AVCodecContext *avctx, const AVPacke
 
     if (avctx->codec->send_packet) {
         if (avpkt) {
-            ret = apply_param_change(avctx, (AVPacket *)avpkt);
-            if (ret < 0)
-                return ret;
+            AVPacket tmp = *avpkt;
+            int did_split = av_packet_split_side_data(&tmp);
+            ret = apply_param_change(avctx, &tmp);
+            if (ret >= 0)
+                ret = avctx->codec->send_packet(avctx, &tmp);
+            if (did_split)
+                av_packet_free_side_data(&tmp);
+            return ret;
+        } else {
+            return avctx->codec->send_packet(avctx, NULL);
         }
-        return avctx->codec->send_packet(avctx, avpkt);
     }
 
     // Emulation via old API. Assume avpkt is likely not refcounted, while
@@ -1897,13 +2953,21 @@ av_cold int avcodec_close(AVCodecContext *avctx)
 {
     int i;
 
+    if (!avctx)
+        return 0;
+
     if (avcodec_is_open(avctx)) {
         FramePool *pool = avctx->internal->pool;
-
+        if (CONFIG_FRAME_THREAD_ENCODER &&
+            avctx->internal->frame_thread_encoder && avctx->thread_count > 1) {
+            ff_frame_thread_encoder_free(avctx);
+        }
         if (HAVE_THREADS && avctx->internal->thread_ctx)
             ff_thread_free(avctx);
         if (avctx->codec && avctx->codec->close)
             avctx->codec->close(avctx);
+        avctx->internal->byte_buffer_size = 0;
+        av_freep(&avctx->internal->byte_buffer);
         av_frame_free(&avctx->internal->to_free);
         av_frame_free(&avctx->internal->buffer_frame);
         av_packet_free(&avctx->internal->buffer_pkt);
@@ -1943,10 +3007,20 @@ FF_ENABLE_DEPRECATION_WARNINGS
     return 0;
 }
 
+static enum AVCodecID remap_deprecated_codec_id(enum AVCodecID id)
+{
+    switch(id){
+        //This is for future deprecated codec ids; it has been empty since the
+        //last major bump but will fill up again over time, please don't remove it
+        default                                         : return id; /* currently every id maps to itself */
+    }
+}
+
 static AVCodec *find_encdec(enum AVCodecID id, int encoder)
 {
     AVCodec *p, *experimental = NULL;
     p = first_avcodec;
+    id= remap_deprecated_codec_id(id);
     while (p) {
         if ((encoder ? av_codec_is_encoder(p) : av_codec_is_decoder(p)) &&
             p->id == id) {
@@ -1998,27 +3072,24 @@ AVCodec *avcodec_find_decoder_by_name(const char *name)
     return NULL;
 }
 
-static int get_bit_rate(AVCodecContext *ctx)
+const char *avcodec_get_name(enum AVCodecID id)
 {
-    int bit_rate;
-    int bits_per_sample;
-
-    switch (ctx->codec_type) {
-    case AVMEDIA_TYPE_VIDEO:
-    case AVMEDIA_TYPE_DATA:
-    case AVMEDIA_TYPE_SUBTITLE:
-    case AVMEDIA_TYPE_ATTACHMENT:
-        bit_rate = ctx->bit_rate;
-        break;
-    case AVMEDIA_TYPE_AUDIO:
-        bits_per_sample = av_get_bits_per_sample(ctx->codec_id);
-        bit_rate = bits_per_sample ? ctx->sample_rate * ctx->channels * bits_per_sample : ctx->bit_rate;
-        break;
-    default:
-        bit_rate = 0;
-        break;
-    }
-    return bit_rate;
+    const AVCodecDescriptor *cd; /* Return a name string for codec id; never returns NULL on the paths below. */
+    AVCodec *codec;              /* fallback lookup, used only when no descriptor exists */
+
+    if (id == AV_CODEC_ID_NONE)
+        return "none";
+    cd = avcodec_descriptor_get(id);
+    if (cd)
+        return cd->name;
+    av_log(NULL, AV_LOG_WARNING, "Codec 0x%x is not in the full list.\n", id);
+    codec = avcodec_find_decoder(id); /* no descriptor: try a decoder for this id first ... */
+    if (codec)
+        return codec->name;
+    codec = avcodec_find_encoder(id); /* ... then an encoder */
+    if (codec)
+        return codec->name;
+    return "unknown_codec";
 }
 
 size_t av_get_codec_tag_string(char *buf, size_t buf_size, unsigned int codec_tag)
@@ -2028,7 +3099,7 @@ size_t av_get_codec_tag_string(char *buf, size_t buf_size, unsigned int codec_ta
 #define TAG_PRINT(x)                                              \
     (((x) >= '0' && (x) <= '9') ||                                \
      ((x) >= 'a' && (x) <= 'z') || ((x) >= 'A' && (x) <= 'Z') ||  \
-     ((x) == '.' || (x) == ' '))
+     ((x) == '.' || (x) == ' ' || (x) == '-' || (x) == '_'))
 
     for (i = 0; i < 4; i++) {
         len = snprintf(buf, buf_size,
@@ -2043,68 +3114,88 @@ size_t av_get_codec_tag_string(char *buf, size_t buf_size, unsigned int codec_ta
 
 void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode)
 {
+    const char *codec_type;
     const char *codec_name;
     const char *profile = NULL;
-    char buf1[32];
-    int bitrate;
+    int64_t bitrate;
     int new_line = 0;
     AVRational display_aspect_ratio;
-    const AVCodecDescriptor *desc = avcodec_descriptor_get(enc->codec_id);
+    const char *separator = enc->dump_separator ? (const char *)enc->dump_separator : ", ";
 
-    if (desc) {
-        codec_name = desc->name;
-        profile = avcodec_profile_name(enc->codec_id, enc->profile);
-    } else if (enc->codec_id == AV_CODEC_ID_MPEG2TS) {
-        /* fake mpeg2 transport stream codec (currently not
-         * registered) */
-        codec_name = "mpeg2ts";
-    } else {
-        /* output avi tags */
+    if (!buf || buf_size <= 0)
+        return;
+    codec_type = av_get_media_type_string(enc->codec_type);
+    codec_name = avcodec_get_name(enc->codec_id);
+    profile = avcodec_profile_name(enc->codec_id, enc->profile);
+
+    snprintf(buf, buf_size, "%s: %s", codec_type ? codec_type : "unknown",
+             codec_name);
+    buf[0] ^= 'a' ^ 'A'; /* first letter in uppercase */
+
+    if (enc->codec && strcmp(enc->codec->name, codec_name))
+        snprintf(buf + strlen(buf), buf_size - strlen(buf), " (%s)", enc->codec->name);
+
+    if (profile)
+        snprintf(buf + strlen(buf), buf_size - strlen(buf), " (%s)", profile);
+    if (   enc->codec_type == AVMEDIA_TYPE_VIDEO
+        && av_log_get_level() >= AV_LOG_VERBOSE
+        && enc->refs)
+        snprintf(buf + strlen(buf), buf_size - strlen(buf),
+                 ", %d reference frame%s",
+                 enc->refs, enc->refs > 1 ? "s" : "");
+
+    if (enc->codec_tag) {
         char tag_buf[32];
         av_get_codec_tag_string(tag_buf, sizeof(tag_buf), enc->codec_tag);
-        snprintf(buf1, sizeof(buf1), "%s / 0x%04X", tag_buf, enc->codec_tag);
-        codec_name = buf1;
+        snprintf(buf + strlen(buf), buf_size - strlen(buf),
+                 " (%s / 0x%04X)", tag_buf, enc->codec_tag);
     }
 
     switch (enc->codec_type) {
     case AVMEDIA_TYPE_VIDEO:
-        snprintf(buf, buf_size,
-                 "Video: %s%s",
-                 codec_name, enc->mb_decision ? " (hq)" : "");
-        if (profile)
-            snprintf(buf + strlen(buf), buf_size - strlen(buf),
-                     " (%s)", profile);
-        if (enc->codec_tag) {
-            char tag_buf[32];
-            av_get_codec_tag_string(tag_buf, sizeof(tag_buf), enc->codec_tag);
-            snprintf(buf + strlen(buf), buf_size - strlen(buf),
-                     " [%s / 0x%04X]", tag_buf, enc->codec_tag);
-        }
+        {
+            char detail[256] = "(";
 
-        av_strlcat(buf, "\n      ", buf_size);
-        snprintf(buf + strlen(buf), buf_size - strlen(buf),
+            av_strlcat(buf, separator, buf_size);
+
+            snprintf(buf + strlen(buf), buf_size - strlen(buf),
                  "%s", enc->pix_fmt == AV_PIX_FMT_NONE ? "none" :
                      av_get_pix_fmt_name(enc->pix_fmt));
+            if (enc->bits_per_raw_sample && enc->pix_fmt != AV_PIX_FMT_NONE &&
+                enc->bits_per_raw_sample < av_pix_fmt_desc_get(enc->pix_fmt)->comp[0].depth)
+                av_strlcatf(detail, sizeof(detail), "%d bpc, ", enc->bits_per_raw_sample);
+            if (enc->color_range != AVCOL_RANGE_UNSPECIFIED)
+                av_strlcatf(detail, sizeof(detail), "%s, ",
+                            av_color_range_name(enc->color_range));
+
+            if (enc->colorspace != AVCOL_SPC_UNSPECIFIED ||
+                enc->color_primaries != AVCOL_PRI_UNSPECIFIED ||
+                enc->color_trc != AVCOL_TRC_UNSPECIFIED) {
+                if (enc->colorspace != (int)enc->color_primaries ||
+                    enc->colorspace != (int)enc->color_trc) {
+                    new_line = 1;
+                    av_strlcatf(detail, sizeof(detail), "%s/%s/%s, ",
+                                av_color_space_name(enc->colorspace),
+                                av_color_primaries_name(enc->color_primaries),
+                                av_color_transfer_name(enc->color_trc));
+                } else
+                    av_strlcatf(detail, sizeof(detail), "%s, ",
+                                av_get_colorspace_name(enc->colorspace));
+            }
 
-        if (enc->color_range != AVCOL_RANGE_UNSPECIFIED)
-            snprintf(buf + strlen(buf), buf_size - strlen(buf), ", %s",
-                     av_color_range_name(enc->color_range));
-        if (enc->colorspace != AVCOL_SPC_UNSPECIFIED ||
-            enc->color_primaries != AVCOL_PRI_UNSPECIFIED ||
-            enc->color_trc != AVCOL_TRC_UNSPECIFIED) {
-            new_line = 1;
-            snprintf(buf + strlen(buf), buf_size - strlen(buf), ", %s/%s/%s",
-                     av_color_space_name(enc->colorspace),
-                     av_color_primaries_name(enc->color_primaries),
-                     av_color_transfer_name(enc->color_trc));
-        }
-        if (av_log_get_level() >= AV_LOG_DEBUG &&
-            enc->chroma_sample_location != AVCHROMA_LOC_UNSPECIFIED)
-            snprintf(buf + strlen(buf), buf_size - strlen(buf), ", %s",
-                     av_chroma_location_name(enc->chroma_sample_location));
+            if (av_log_get_level() >= AV_LOG_DEBUG &&
+                enc->chroma_sample_location != AVCHROMA_LOC_UNSPECIFIED)
+                av_strlcatf(detail, sizeof(detail), "%s, ",
+                            av_chroma_location_name(enc->chroma_sample_location));
+
+            if (strlen(detail) > 1) {
+                detail[strlen(detail) - 2] = 0;
+                av_strlcatf(buf, buf_size, "%s)", detail);
+            }
+        }
 
         if (enc->width) {
-            av_strlcat(buf, new_line ? "\n      " : ", ", buf_size);
+            av_strlcat(buf, new_line ? separator : ", ", buf_size);
 
             snprintf(buf + strlen(buf), buf_size - strlen(buf),
                      "%dx%d",
@@ -2118,11 +3209,11 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode)
 
             if (enc->sample_aspect_ratio.num) {
                 av_reduce(&display_aspect_ratio.num, &display_aspect_ratio.den,
-                          enc->width * enc->sample_aspect_ratio.num,
-                          enc->height * enc->sample_aspect_ratio.den,
+                          enc->width * (int64_t)enc->sample_aspect_ratio.num,
+                          enc->height * (int64_t)enc->sample_aspect_ratio.den,
                           1024 * 1024);
                 snprintf(buf + strlen(buf), buf_size - strlen(buf),
-                         " [PAR %d:%d DAR %d:%d]",
+                         " [SAR %d:%d DAR %d:%d]",
                          enc->sample_aspect_ratio.num, enc->sample_aspect_ratio.den,
                          display_aspect_ratio.num, display_aspect_ratio.den);
             }
@@ -2136,23 +3227,18 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode)
         if (encode) {
             snprintf(buf + strlen(buf), buf_size - strlen(buf),
                      ", q=%d-%d", enc->qmin, enc->qmax);
+        } else {
+            if (enc->properties & FF_CODEC_PROPERTY_CLOSED_CAPTIONS)
+                snprintf(buf + strlen(buf), buf_size - strlen(buf),
+                         ", Closed Captions");
+            if (enc->properties & FF_CODEC_PROPERTY_LOSSLESS)
+                snprintf(buf + strlen(buf), buf_size - strlen(buf),
+                         ", lossless");
         }
         break;
     case AVMEDIA_TYPE_AUDIO:
-        snprintf(buf, buf_size,
-                 "Audio: %s",
-                 codec_name);
-        if (profile)
-            snprintf(buf + strlen(buf), buf_size - strlen(buf),
-                     " (%s)", profile);
-        if (enc->codec_tag) {
-            char tag_buf[32];
-            av_get_codec_tag_string(tag_buf, sizeof(tag_buf), enc->codec_tag);
-            snprintf(buf + strlen(buf), buf_size - strlen(buf),
-                     " [%s / 0x%04X]", tag_buf, enc->codec_tag);
-        }
+        av_strlcat(buf, separator, buf_size);
 
-        av_strlcat(buf, "\n      ", buf_size);
         if (enc->sample_rate) {
             snprintf(buf + strlen(buf), buf_size - strlen(buf),
                      "%d Hz, ", enc->sample_rate);
@@ -2162,18 +3248,26 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode)
             snprintf(buf + strlen(buf), buf_size - strlen(buf),
                      ", %s", av_get_sample_fmt_name(enc->sample_fmt));
         }
+        if (   enc->bits_per_raw_sample > 0
+            && enc->bits_per_raw_sample != av_get_bytes_per_sample(enc->sample_fmt) * 8)
+            snprintf(buf + strlen(buf), buf_size - strlen(buf),
+                     " (%d bit)", enc->bits_per_raw_sample);
         break;
     case AVMEDIA_TYPE_DATA:
-        snprintf(buf, buf_size, "Data: %s", codec_name);
+        if (av_log_get_level() >= AV_LOG_DEBUG) {
+            int g = av_gcd(enc->time_base.num, enc->time_base.den);
+            if (g)
+                snprintf(buf + strlen(buf), buf_size - strlen(buf),
+                         ", %d/%d",
+                         enc->time_base.num / g, enc->time_base.den / g);
+        }
         break;
     case AVMEDIA_TYPE_SUBTITLE:
-        snprintf(buf, buf_size, "Subtitle: %s", codec_name);
-        break;
-    case AVMEDIA_TYPE_ATTACHMENT:
-        snprintf(buf, buf_size, "Attachment: %s", codec_name);
+        if (enc->width)
+            snprintf(buf + strlen(buf), buf_size - strlen(buf),
+                     ", %dx%d", enc->width, enc->height);
         break;
     default:
-        snprintf(buf, buf_size, "Invalid Codec type %d", enc->codec_type);
         return;
     }
     if (encode) {
@@ -2187,7 +3281,10 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode)
     bitrate = get_bit_rate(enc);
     if (bitrate != 0) {
         snprintf(buf + strlen(buf), buf_size - strlen(buf),
-                 ", %d kb/s", bitrate / 1000);
+                 ", %"PRId64" kb/s", bitrate / 1000);
+    } else if (enc->rc_max_rate > 0) {
+        snprintf(buf + strlen(buf), buf_size - strlen(buf),
+                 ", max. %"PRId64" kb/s", (int64_t)enc->rc_max_rate / 1000);
     }
 }
 
@@ -2221,18 +3318,25 @@ const char *avcodec_profile_name(enum AVCodecID codec_id, int profile)
 
 unsigned avcodec_version(void)
 {
+//    av_assert0(AV_CODEC_ID_V410==164);
+    av_assert0(AV_CODEC_ID_PCM_S8_PLANAR==65563); /* pin the numeric values of selected codec IDs */
+    av_assert0(AV_CODEC_ID_ADPCM_G722==69660);
+//     av_assert0(AV_CODEC_ID_BMV_AUDIO==86071);
+    av_assert0(AV_CODEC_ID_SRT==94216);
+    av_assert0(LIBAVCODEC_VERSION_MICRO >= 100); /* the micro version is required to be at least 100 */
+    /* The checks above run on every call; the result is the LIBAVCODEC_VERSION_INT macro. */
     return LIBAVCODEC_VERSION_INT;
 }
 
 const char *avcodec_configuration(void)
 {
-    return LIBAV_CONFIGURATION;
+    return FFMPEG_CONFIGURATION; /* macro defined outside this hunk; presumably the configure-time option string - TODO confirm */
 }
 
 const char *avcodec_license(void)
 {
 #define LICENSE_PREFIX "libavcodec license: "
-    return LICENSE_PREFIX LIBAV_LICENSE + sizeof(LICENSE_PREFIX) - 1;
+    return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1; /* literal is prefix+license; the offset skips the prefix, so only the license text is returned */
 }
 
 void avcodec_flush_buffers(AVCodecContext *avctx)
@@ -2248,6 +3352,9 @@ void avcodec_flush_buffers(AVCodecContext *avctx)
     else if (avctx->codec->flush)
         avctx->codec->flush(avctx);
 
+    avctx->pts_correction_last_pts =
+    avctx->pts_correction_last_dts = INT64_MIN;
+
     if (!avctx->refcounted_frames)
         av_frame_unref(avctx->internal->to_free);
 }
@@ -2255,18 +3362,28 @@ void avcodec_flush_buffers(AVCodecContext *avctx)
 int av_get_exact_bits_per_sample(enum AVCodecID codec_id)
 {
     switch (codec_id) {
+    case AV_CODEC_ID_8SVX_EXP:
+    case AV_CODEC_ID_8SVX_FIB:
     case AV_CODEC_ID_ADPCM_CT:
     case AV_CODEC_ID_ADPCM_IMA_APC:
     case AV_CODEC_ID_ADPCM_IMA_EA_SEAD:
+    case AV_CODEC_ID_ADPCM_IMA_OKI:
     case AV_CODEC_ID_ADPCM_IMA_WS:
     case AV_CODEC_ID_ADPCM_G722:
     case AV_CODEC_ID_ADPCM_YAMAHA:
+    case AV_CODEC_ID_ADPCM_AICA:
         return 4;
+    case AV_CODEC_ID_DSD_LSBF:
+    case AV_CODEC_ID_DSD_MSBF:
+    case AV_CODEC_ID_DSD_LSBF_PLANAR:
+    case AV_CODEC_ID_DSD_MSBF_PLANAR:
     case AV_CODEC_ID_PCM_ALAW:
     case AV_CODEC_ID_PCM_MULAW:
     case AV_CODEC_ID_PCM_S8:
+    case AV_CODEC_ID_PCM_S8_PLANAR:
     case AV_CODEC_ID_PCM_U8:
     case AV_CODEC_ID_PCM_ZORK:
+    case AV_CODEC_ID_SDX2_DPCM:
         return 8;
     case AV_CODEC_ID_PCM_S16BE:
     case AV_CODEC_ID_PCM_S16BE_PLANAR:
@@ -2298,6 +3415,27 @@ int av_get_exact_bits_per_sample(enum AVCodecID codec_id)
     }
 }
 
+enum AVCodecID av_get_pcm_codec(enum AVSampleFormat fmt, int be)
+{
+    static const enum AVCodecID map[AV_SAMPLE_FMT_NB][2] = { /* [fmt][0] = little-endian ID, [fmt][1] = big-endian ID */
+        [AV_SAMPLE_FMT_U8  ] = { AV_CODEC_ID_PCM_U8,    AV_CODEC_ID_PCM_U8    },
+        [AV_SAMPLE_FMT_S16 ] = { AV_CODEC_ID_PCM_S16LE, AV_CODEC_ID_PCM_S16BE },
+        [AV_SAMPLE_FMT_S32 ] = { AV_CODEC_ID_PCM_S32LE, AV_CODEC_ID_PCM_S32BE },
+        [AV_SAMPLE_FMT_FLT ] = { AV_CODEC_ID_PCM_F32LE, AV_CODEC_ID_PCM_F32BE },
+        [AV_SAMPLE_FMT_DBL ] = { AV_CODEC_ID_PCM_F64LE, AV_CODEC_ID_PCM_F64BE },
+        [AV_SAMPLE_FMT_U8P ] = { AV_CODEC_ID_PCM_U8,    AV_CODEC_ID_PCM_U8    }, /* planar formats map to the same IDs as their packed counterparts */
+        [AV_SAMPLE_FMT_S16P] = { AV_CODEC_ID_PCM_S16LE, AV_CODEC_ID_PCM_S16BE },
+        [AV_SAMPLE_FMT_S32P] = { AV_CODEC_ID_PCM_S32LE, AV_CODEC_ID_PCM_S32BE },
+        [AV_SAMPLE_FMT_FLTP] = { AV_CODEC_ID_PCM_F32LE, AV_CODEC_ID_PCM_F32BE },
+        [AV_SAMPLE_FMT_DBLP] = { AV_CODEC_ID_PCM_F64LE, AV_CODEC_ID_PCM_F64BE },
+    };
+    if (fmt < 0 || fmt >= AV_SAMPLE_FMT_NB) /* reject formats that would index outside the table */
+        return AV_CODEC_ID_NONE;
+    if (be < 0 || be > 1)
+        be = AV_NE(1, 0); /* be is neither 0 nor 1: presumably falls back to the host's native endianness - TODO confirm AV_NE */
+    return map[fmt][be];
+}
+
 int av_get_bits_per_sample(enum AVCodecID codec_id)
 {
     switch (codec_id) {
@@ -2317,13 +3455,15 @@ int av_get_bits_per_sample(enum AVCodecID codec_id)
 }
 
 static int get_audio_frame_duration(enum AVCodecID id, int sr, int ch, int ba,
-                                    uint32_t tag, int bits_per_coded_sample, int frame_bytes)
+                                    uint32_t tag, int bits_per_coded_sample, int64_t bitrate,
+                                    uint8_t * extradata, int frame_size, int frame_bytes)
 {
     int bps = av_get_exact_bits_per_sample(id);
+    int framecount = (ba > 0 && frame_bytes / ba > 0) ? frame_bytes / ba : 1;
 
     /* codecs with an exact constant bits per sample */
-    if (bps > 0 && ch > 0 && frame_bytes > 0)
-        return (frame_bytes * 8) / (bps * ch);
+    if (bps > 0 && ch > 0 && frame_bytes > 0 && ch < 32768 && bps < 32768)
+        return (frame_bytes * 8LL) / (bps * ch);
     bps = bits_per_coded_sample;
 
     /* codecs with a fixed packet duration */
@@ -2332,16 +3472,16 @@ static int get_audio_frame_duration(enum AVCodecID id, int sr, int ch, int ba,
     case AV_CODEC_ID_ADPCM_IMA_QT: return   64;
     case AV_CODEC_ID_ADPCM_EA_XAS: return  128;
     case AV_CODEC_ID_AMR_NB:
+    case AV_CODEC_ID_EVRC:
     case AV_CODEC_ID_GSM:
     case AV_CODEC_ID_QCELP:
-    case AV_CODEC_ID_RA_144:
     case AV_CODEC_ID_RA_288:       return  160;
-    case AV_CODEC_ID_IMC:          return  256;
     case AV_CODEC_ID_AMR_WB:
     case AV_CODEC_ID_GSM_MS:       return  320;
     case AV_CODEC_ID_MP1:          return  384;
     case AV_CODEC_ID_ATRAC1:       return  512;
-    case AV_CODEC_ID_ATRAC3:       return 1024;
+    case AV_CODEC_ID_ATRAC3:       return 1024 * framecount;
+    case AV_CODEC_ID_ATRAC3P:      return 2048;
     case AV_CODEC_ID_MP2:
     case AV_CODEC_ID_MUSEPACK7:    return 1152;
     case AV_CODEC_ID_AC3:          return 1536;
@@ -2351,6 +3491,8 @@ static int get_audio_frame_duration(enum AVCodecID id, int sr, int ch, int ba,
         /* calc from sample rate */
         if (id == AV_CODEC_ID_TTA)
             return 256 * sr / 245;
+        else if (id == AV_CODEC_ID_DST)
+            return 588 * sr / 44100;
 
         if (ch > 0) {
             /* calc from sample rate and channels */
@@ -2382,6 +3524,10 @@ static int get_audio_frame_duration(enum AVCodecID id, int sr, int ch, int ba,
             return 240 * (frame_bytes / 32);
         if (id == AV_CODEC_ID_NELLYMOSER)
             return 256 * (frame_bytes / 64);
+        if (id == AV_CODEC_ID_RA_144)
+            return 160 * (frame_bytes / 20);
+        if (id == AV_CODEC_ID_G723_1)
+            return 240 * (frame_bytes / 24);
 
         if (bps > 0) {
             /* calc from frame_bytes and bits_per_coded_sample */
@@ -2389,16 +3535,27 @@ static int get_audio_frame_duration(enum AVCodecID id, int sr, int ch, int ba,
                 return frame_bytes * 8 / bps;
         }
 
-        if (ch > 0) {
+        if (ch > 0 && ch < INT_MAX/16) {
             /* calc from frame_bytes and channels */
             switch (id) {
+            case AV_CODEC_ID_ADPCM_AFC:
+                return frame_bytes / (9 * ch) * 16;
+            case AV_CODEC_ID_ADPCM_PSX:
+            case AV_CODEC_ID_ADPCM_DTK:
+                return frame_bytes / (16 * ch) * 28;
             case AV_CODEC_ID_ADPCM_4XM:
+            case AV_CODEC_ID_ADPCM_IMA_DAT4:
             case AV_CODEC_ID_ADPCM_IMA_ISS:
                 return (frame_bytes - 4 * ch) * 2 / ch;
             case AV_CODEC_ID_ADPCM_IMA_SMJPEG:
                 return (frame_bytes - 4) * 2 / ch;
             case AV_CODEC_ID_ADPCM_IMA_AMV:
                 return (frame_bytes - 8) * 2 / ch;
+            case AV_CODEC_ID_ADPCM_THP:
+            case AV_CODEC_ID_ADPCM_THP_LE:
+                if (extradata)
+                    return frame_bytes * 14 / (8 * ch);
+                break;
             case AV_CODEC_ID_ADPCM_XA:
                 return (frame_bytes / 128) * 224 / ch;
             case AV_CODEC_ID_INTERPLAY_DPCM:
@@ -2413,6 +3570,9 @@ static int get_audio_frame_duration(enum AVCodecID id, int sr, int ch, int ba,
                 return 6 * frame_bytes / ch;
             case AV_CODEC_ID_PCM_LXF:
                 return 2 * (frame_bytes / (5 * ch));
+            case AV_CODEC_ID_IAC:
+            case AV_CODEC_ID_IMC:
+                return 4 * frame_bytes / ch;
             }
 
             if (tag) {
@@ -2430,13 +3590,19 @@ static int get_audio_frame_duration(enum AVCodecID id, int sr, int ch, int ba,
                 int blocks = frame_bytes / ba;
                 switch (id) {
                 case AV_CODEC_ID_ADPCM_IMA_WAV:
-                    return blocks * (1 + (ba - 4 * ch) / (4 * ch) * 8);
+                    if (bps < 2 || bps > 5)
+                        return 0;
+                    return blocks * (1 + (ba - 4 * ch) / (bps * ch) * 8);
                 case AV_CODEC_ID_ADPCM_IMA_DK3:
                     return blocks * (((ba - 16) * 2 / 3 * 4) / ch);
                 case AV_CODEC_ID_ADPCM_IMA_DK4:
                     return blocks * (1 + (ba - 4 * ch) * 2 / ch);
+                case AV_CODEC_ID_ADPCM_IMA_RAD:
+                    return blocks * ((ba - 4 * ch) * 2 / ch);
                 case AV_CODEC_ID_ADPCM_MS:
                     return blocks * (2 + (ba - 7 * ch) * 2 / ch);
+                case AV_CODEC_ID_ADPCM_MTAF:
+                    return blocks * (ba - 16) * 2 / ch;
                 }
             }
 
@@ -2444,8 +3610,12 @@ static int get_audio_frame_duration(enum AVCodecID id, int sr, int ch, int ba,
                 /* calc from frame_bytes, channels, and bits_per_coded_sample */
                 switch (id) {
                 case AV_CODEC_ID_PCM_DVD:
+                    if(bps<4)
+                        return 0;
                     return 2 * (frame_bytes / ((bps * 2 / 8) * ch));
                 case AV_CODEC_ID_PCM_BLURAY:
+                    if(bps<4)
+                        return 0;
                     return frame_bytes / ((FFALIGN(ch, 2) * bps) / 8);
                 case AV_CODEC_ID_S302M:
                     return 2 * (frame_bytes / ((bps + 4) / 4)) / ch;
@@ -2454,6 +3624,17 @@ static int get_audio_frame_duration(enum AVCodecID id, int sr, int ch, int ba,
         }
     }
 
+    /* Fall back on using frame_size */
+    if (frame_size > 1 && frame_bytes)
+        return frame_size;
+
+    //For WMA we currently have no other means to calculate duration thus we
+    //do it here by assuming CBR, which is true for all known cases.
+    if (bitrate > 0 && frame_bytes > 0 && sr > 0 && ba > 1) {
+        if (id == AV_CODEC_ID_WMAV1 || id == AV_CODEC_ID_WMAV2)
+            return  (frame_bytes * 8LL * sr) / bitrate;
+    }
+
     return 0;
 }
 
@@ -2462,6 +3643,7 @@ int av_get_audio_frame_duration(AVCodecContext *avctx, int frame_bytes)
     return get_audio_frame_duration(avctx->codec_id, avctx->sample_rate,
                                     avctx->channels, avctx->block_align,
                                     avctx->codec_tag, avctx->bits_per_coded_sample,
+                                    avctx->bit_rate, avctx->extradata, avctx->frame_size,
                                     frame_bytes);
 }
 
@@ -2470,6 +3652,7 @@ int av_get_audio_frame_duration2(AVCodecParameters *par, int frame_bytes)
     return get_audio_frame_duration(par->codec_id, par->sample_rate,
                                     par->channels, par->block_align,
                                     par->codec_tag, par->bits_per_coded_sample,
+                                    par->bit_rate, par->extradata, par->frame_size,
                                     frame_bytes);
 }
 
@@ -2506,7 +3689,7 @@ int ff_match_2uint16(const uint16_t(*tab)[2], int size, int a, int b)
 FF_DISABLE_DEPRECATION_WARNINGS
 void av_log_missing_feature(void *avc, const char *feature, int want_sample)
 {
-    av_log(avc, AV_LOG_WARNING, "%s is not implemented. Update your Libav "
+    av_log(avc, AV_LOG_WARNING, "%s is not implemented. Update your FFmpeg "
             "version to the newest one from Git. If the problem still "
             "occurs, it means that your file has a feature which has not "
             "been implemented.\n", feature);
@@ -2523,8 +3706,8 @@ void av_log_ask_for_sample(void *avc, const char *msg, ...)
     if (msg)
         av_vlog(avc, AV_LOG_WARNING, msg, argument_list);
     av_log(avc, AV_LOG_WARNING, "If you want to help, upload a sample "
-            "of this file to ftp://upload.libav.org/incoming/ "
-            "and contact the libav-devel mailing list.\n");
+            "of this file to ftp://upload.ffmpeg.org/incoming/ "
+            "and contact the ffmpeg-devel mailing list. (ffmpeg-devel@ffmpeg.org)\n");
 
     va_end(argument_list);
 }
@@ -2532,14 +3715,15 @@ FF_ENABLE_DEPRECATION_WARNINGS
 #endif /* FF_API_MISSING_SAMPLE */
 
 static AVHWAccel *first_hwaccel = NULL;
+static AVHWAccel **last_hwaccel = &first_hwaccel; /* slot where the next registration starts looking for the list end */
 
 void av_register_hwaccel(AVHWAccel *hwaccel)
 {
-    AVHWAccel **p = &first_hwaccel;
-    while (*p)
-        p = &(*p)->next;
-    *p = hwaccel;
+    AVHWAccel **p = last_hwaccel; /* begin at the cached tail slot instead of walking from first_hwaccel */
     hwaccel->next = NULL;
+    while(*p || avpriv_atomic_ptr_cas((void * volatile *)p, NULL, hwaccel)) /* loop ends once the CAS on an empty slot yields NULL (assumes it returns the previous value - TODO confirm) */
+        p = &(*p)->next; /* slot already occupied: move on to the following link */
+    last_hwaccel = &hwaccel->next; /* remember the new tail slot for the next call */
 }
 
 AVHWAccel *av_hwaccel_next(const AVHWAccel *hwaccel)
@@ -2579,6 +3763,48 @@ int av_lockmgr_register(int (*cb)(void **mutex, enum AVLockOp op))
     return 0;
 }
 
+int ff_lock_avcodec(AVCodecContext *log_ctx, const AVCodec *codec)
+{
+    if (codec->caps_internal & FF_CODEC_CAP_INIT_THREADSAFE || !codec->init)
+        return 0; /* nothing to lock: init is flagged thread-safe or the codec has no init callback */
+
+    if (lockmgr_cb) {
+        if ((*lockmgr_cb)(&codec_mutex, AV_LOCK_OBTAIN))
+            return -1;
+    }
+    /* After our increment the counter must be exactly 1; any other value is treated as concurrent entry. */
+    if (avpriv_atomic_int_add_and_fetch(&entangled_thread_counter, 1) != 1) {
+        av_log(log_ctx, AV_LOG_ERROR,
+               "Insufficient thread locking. At least %d threads are "
+               "calling avcodec_open2() at the same time right now.\n",
+               entangled_thread_counter);
+        if (!lockmgr_cb)
+            av_log(log_ctx, AV_LOG_ERROR, "No lock manager is set, please see av_lockmgr_register()\n");
+        ff_avcodec_locked = 1; /* set so the av_assert0 in ff_unlock_avcodec() holds while it undoes the increment */
+        ff_unlock_avcodec(codec);
+        return AVERROR(EINVAL);
+    }
+    av_assert0(!ff_avcodec_locked);
+    ff_avcodec_locked = 1;
+    return 0;
+}
+
+int ff_unlock_avcodec(const AVCodec *codec)
+{
+    if (codec->caps_internal & FF_CODEC_CAP_INIT_THREADSAFE || !codec->init)
+        return 0; /* same skip condition as ff_lock_avcodec(), which takes no lock for such codecs */
+
+    av_assert0(ff_avcodec_locked);
+    ff_avcodec_locked = 0;
+    avpriv_atomic_int_add_and_fetch(&entangled_thread_counter, -1); /* undo the +1 applied in ff_lock_avcodec() */
+    if (lockmgr_cb) {
+        if ((*lockmgr_cb)(&codec_mutex, AV_LOCK_RELEASE))
+            return -1; /* the lock manager callback reported a failure on release */
+    }
+
+    return 0;
+}
+
 int avpriv_lock_avformat(void)
 {
     if (lockmgr_cb) {
@@ -2602,7 +3828,7 @@ unsigned int avpriv_toupper4(unsigned int x)
     return av_toupper(x & 0xFF) +
           (av_toupper((x >>  8) & 0xFF) << 8)  +
           (av_toupper((x >> 16) & 0xFF) << 16) +
-          (av_toupper((x >> 24) & 0xFF) << 24);
+((unsigned)av_toupper((x >> 24) & 0xFF) << 24);
 }
 
 int ff_thread_ref_frame(ThreadFrame *dst, ThreadFrame *src)
@@ -2615,6 +3841,8 @@ int ff_thread_ref_frame(ThreadFrame *dst, ThreadFrame *src)
     if (ret < 0)
         return ret;
 
+    av_assert0(!dst->progress);
+
     if (src->progress &&
         !(dst->progress = av_buffer_ref(src->progress))) {
         ff_thread_release_buffer(dst->owner, dst);
@@ -2626,6 +3854,11 @@ int ff_thread_ref_frame(ThreadFrame *dst, ThreadFrame *src)
 
 #if !HAVE_THREADS
 
+enum AVPixelFormat ff_thread_get_format(AVCodecContext *avctx, const enum AVPixelFormat *fmt)
+{
+    return ff_get_format(avctx, fmt); /* !HAVE_THREADS variant: forwards directly to ff_get_format() */
+}
+
 int ff_thread_get_buffer(AVCodecContext *avctx, ThreadFrame *f, int flags)
 {
     f->owner = avctx;
@@ -2650,6 +3883,28 @@ void ff_thread_await_progress(ThreadFrame *f, int progress, int field)
 {
 }
 
+int ff_thread_can_start_frame(AVCodecContext *avctx)
+{
+    return 1; /* stub that always returns 1; appears to belong to the !HAVE_THREADS section - TODO confirm */
+}
+
+int ff_alloc_entries(AVCodecContext *avctx, int count)
+{
+    return 0; /* stub: ignores avctx and count, allocates nothing and returns 0 */
+}
+
+void ff_reset_entries(AVCodecContext *avctx)
+{ /* intentionally empty stub: this variant does nothing with avctx */
+}
+
+void ff_thread_await_progress2(AVCodecContext *avctx, int field, int thread, int shift)
+{ /* intentionally empty stub: returns immediately without waiting */
+}
+
+void ff_thread_report_progress2(AVCodecContext *avctx, int field, int thread, int n)
+{ /* intentionally empty stub: the reported progress is discarded */
+}
+
 #endif
 
 int avcodec_is_open(AVCodecContext *s)
@@ -2657,13 +3912,36 @@ int avcodec_is_open(AVCodecContext *s)
     return !!s->internal;
 }
 
-const uint8_t *avpriv_find_start_code(const uint8_t *restrict p,
+int avpriv_bprint_to_extradata(AVCodecContext *avctx, struct AVBPrint *buf)
+{
+    int ret;
+    char *str;
+    /* Finalize buf into a string and install it as avctx->extradata; returns 0 or a negative error code. */
+    ret = av_bprint_finalize(buf, &str);
+    if (ret < 0)
+        return ret;
+    if (!av_bprint_is_complete(buf)) {
+        av_free(str); /* buffer reported as incomplete: drop the string and fail with ENOMEM */
+        return AVERROR(ENOMEM);
+    }
+
+    avctx->extradata = str; /* avctx now holds the finalized string; any previous extradata pointer is overwritten */
+    /* Note: the string is NUL terminated (so extradata can be read as a
+     * string), but the ending character is not accounted in the size (in
+     * binary formats you are likely not supposed to mux that character). When
+     * extradata is copied, it is also padded with AV_INPUT_BUFFER_PADDING_SIZE
+     * zeros. */
+    avctx->extradata_size = buf->len;
+    return 0;
+}
+
+const uint8_t *avpriv_find_start_code(const uint8_t *av_restrict p,
                                       const uint8_t *end,
-                                      uint32_t * restrict state)
+                                      uint32_t *av_restrict state)
 {
     int i;
 
-    assert(p <= end);
+    av_assert0(p <= end);
     if (p >= end)
         return end;
 
@@ -2746,6 +4024,8 @@ static void codec_parameters_reset(AVCodecParameters *par)
     par->color_space         = AVCOL_SPC_UNSPECIFIED;
     par->chroma_location     = AVCHROMA_LOC_UNSPECIFIED;
     par->sample_aspect_ratio = (AVRational){ 0, 1 };
+    par->profile             = FF_PROFILE_UNKNOWN;
+    par->level               = FF_LEVEL_UNKNOWN;
 }
 
 AVCodecParameters *avcodec_parameters_alloc(void)
@@ -2798,6 +4078,7 @@ int avcodec_parameters_from_context(AVCodecParameters *par,
 
     par->bit_rate              = codec->bit_rate;
     par->bits_per_coded_sample = codec->bits_per_coded_sample;
+    par->bits_per_raw_sample   = codec->bits_per_raw_sample;
     par->profile               = codec->profile;
     par->level                 = codec->level;
 
@@ -2813,6 +4094,7 @@ int avcodec_parameters_from_context(AVCodecParameters *par,
         par->color_space         = codec->colorspace;
         par->chroma_location     = codec->chroma_sample_location;
         par->sample_aspect_ratio = codec->sample_aspect_ratio;
+        par->video_delay         = codec->has_b_frames;
         break;
     case AVMEDIA_TYPE_AUDIO:
         par->format          = codec->sample_fmt;
@@ -2820,7 +4102,13 @@ int avcodec_parameters_from_context(AVCodecParameters *par,
         par->channels        = codec->channels;
         par->sample_rate     = codec->sample_rate;
         par->block_align     = codec->block_align;
+        par->frame_size      = codec->frame_size;
         par->initial_padding = codec->initial_padding;
+        par->seek_preroll    = codec->seek_preroll;
+        break;
+    case AVMEDIA_TYPE_SUBTITLE:
+        par->width  = codec->width;
+        par->height = codec->height;
         break;
     }
 
@@ -2844,6 +4132,7 @@ int avcodec_parameters_to_context(AVCodecContext *codec,
 
     codec->bit_rate              = par->bit_rate;
     codec->bits_per_coded_sample = par->bits_per_coded_sample;
+    codec->bits_per_raw_sample   = par->bits_per_raw_sample;
     codec->profile               = par->profile;
     codec->level                 = par->level;
 
@@ -2859,6 +4148,7 @@ int avcodec_parameters_to_context(AVCodecContext *codec,
         codec->colorspace             = par->color_space;
         codec->chroma_sample_location = par->chroma_location;
         codec->sample_aspect_ratio    = par->sample_aspect_ratio;
+        codec->has_b_frames           = par->video_delay;
         break;
     case AVMEDIA_TYPE_AUDIO:
         codec->sample_fmt      = par->format;
@@ -2866,7 +4156,14 @@ int avcodec_parameters_to_context(AVCodecContext *codec,
         codec->channels        = par->channels;
         codec->sample_rate     = par->sample_rate;
         codec->block_align     = par->block_align;
+        codec->frame_size      = par->frame_size;
+        codec->delay           =
         codec->initial_padding = par->initial_padding;
+        codec->seek_preroll    = par->seek_preroll;
+        break;
+    case AVMEDIA_TYPE_SUBTITLE:
+        codec->width  = par->width;
+        codec->height = par->height;
         break;
     }
 
@@ -2881,3 +4178,46 @@ int avcodec_parameters_to_context(AVCodecContext *codec,
 
     return 0;
 }
+
+int ff_alloc_a53_sei(const AVFrame *frame, size_t prefix_len,
+                     void **data, size_t *sei_size)
+{
+    AVFrameSideData *side_data = NULL;
+    uint8_t *sei_data;
+    /* Build a buffer from the frame's AV_FRAME_DATA_A53_CC side data; returns 0 or AVERROR(ENOMEM). */
+    if (frame)
+        side_data = av_frame_get_side_data(frame, AV_FRAME_DATA_A53_CC);
+
+    if (!side_data) {
+        *data = NULL;
+        return 0; /* no caption side data: *data is NULL and *sei_size is left unmodified */
+    }
+
+    *sei_size = side_data->size + 11; /* 10 header bytes + the side data + 1 trailing byte */
+    *data = av_mallocz(*sei_size + prefix_len); /* prefix_len extra zeroed bytes are reserved in front of the payload */
+    if (!*data)
+        return AVERROR(ENOMEM);
+    sei_data = (uint8_t*)*data + prefix_len;
+
+    // country code
+    sei_data[0] = 181;
+    sei_data[1] = 0;
+    sei_data[2] = 49;
+
+    /**
+     * 'GA94' is standard in North America for ATSC, but hard coding
+     * this style may not be the right thing to do -- other formats
+     * do exist. This information is not available in the side_data
+     * so we are going with this right now.
+     */
+    AV_WL32(sei_data + 3, MKTAG('G', 'A', '9', '4'));
+    sei_data[7] = 3;
+    sei_data[8] = ((side_data->size/3) & 0x1f) | 0x40; /* low 5 bits: count of 3-byte units in the side data (presumably cc_count - TODO confirm) */
+    sei_data[9] = 0;
+
+    memcpy(sei_data + 10, side_data->data, side_data->size);
+
+    sei_data[side_data->size+10] = 255; /* last byte of the *sei_size-byte payload */
+
+    return 0;
+}
diff --git a/libavcodec/utvideo.c b/libavcodec/utvideo.c
index 35e927c..5828d5e 100644
--- a/libavcodec/utvideo.c
+++ b/libavcodec/utvideo.c
@@ -2,20 +2,20 @@
  * Common Ut Video code
  * Copyright (c) 2011 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,3 +39,9 @@ int ff_ut_huff_cmp_len(const void *a, const void *b)
     const HuffEntry *aa = a, *bb = b;
     return (aa->len - bb->len)*256 + aa->sym - bb->sym;
 }
+
+int ff_ut10_huff_cmp_len(const void *a, const void *b)
+{   /* Comparator over HuffEntry: orders by len first, then by sym. */
+    const HuffEntry *aa = a, *bb = b;
+    return (aa->len - bb->len)*1024 + aa->sym - bb->sym; /* len difference is scaled by 1024 so it outweighs any sym difference below 1024 */
+}
diff --git a/libavcodec/utvideo.h b/libavcodec/utvideo.h
index 0035e9c..b785bf0 100644
--- a/libavcodec/utvideo.h
+++ b/libavcodec/utvideo.h
@@ -2,20 +2,20 @@
  * Common Ut Video header
  * Copyright (c) 2011 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -76,6 +76,7 @@ typedef struct UtvideoContext {
     int      compression;
     int      interlaced;
     int      frame_pred;
+    int      pro;
 
     int      slice_stride;
     uint8_t *slice_bits, *slice_buffer[4];
@@ -83,12 +84,13 @@ typedef struct UtvideoContext {
 } UtvideoContext;
 
 typedef struct HuffEntry {
-    uint8_t  sym;
+    uint16_t sym;
     uint8_t  len;
     uint32_t code;
 } HuffEntry;
 
 /* Compare huffman tree nodes */
 int ff_ut_huff_cmp_len(const void *a, const void *b);
+int ff_ut10_huff_cmp_len(const void *a, const void *b);
 
 #endif /* AVCODEC_UTVIDEO_H */
diff --git a/libavcodec/utvideodec.c b/libavcodec/utvideodec.c
index 30ca4d2..8798714 100644
--- a/libavcodec/utvideodec.c
+++ b/libavcodec/utvideodec.c
@@ -2,20 +2,20 @@
  * Ut Video decoder
  * Copyright (c) 2011 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,6 +35,50 @@
 #include "thread.h"
 #include "utvideo.h"
 
+static int build_huff10(const uint8_t *src, VLC *vlc, int *fsym)
+{   /* Build a VLC from 1024 one-byte code lengths at src, or report a single fill symbol in *fsym. */
+    int i;
+    HuffEntry he[1024];
+    int last;
+    uint32_t codes[1024];
+    uint8_t bits[1024];
+    uint16_t syms[1024];
+    uint32_t code;
+
+    *fsym = -1; /* -1 means "no fill symbol"; callers test fsym >= 0 */
+    for (i = 0; i < 1024; i++) {
+        he[i].sym = i;
+        he[i].len = *src++; /* src[i] is the code length of symbol i */
+    }
+    qsort(he, 1024, sizeof(*he), ff_ut10_huff_cmp_len); /* ascending by length, then by symbol */
+
+    if (!he[0].len) { /* shortest length is 0: report that symbol and leave vlc untouched */
+        *fsym = he[0].sym;
+        return 0;
+    }
+
+    last = 1023;
+    while (he[last].len == 255 && last) /* drop trailing entries with length 255 (presumably unused symbols - confirm) */
+        last--;
+
+    if (he[last].len > 32) { /* longest remaining code must fit the 32-bit code words below */
+        return -1;
+    }
+
+    code = 1;
+    for (i = last; i >= 0; i--) { /* assign codes from the longest length down to the shortest */
+        codes[i] = code >> (32 - he[i].len); /* code is kept left-aligned in 32 bits */
+        bits[i]  = he[i].len;
+        syms[i]  = he[i].sym;
+        code += 0x80000000u >> (he[i].len - 1); /* advance by one code of this length */
+    }
+
+    return ff_init_vlc_sparse(vlc, FFMIN(he[last].len, 11), last + 1, /* table bits capped at 11; last + 1 entries */
+                              bits,  sizeof(*bits),  sizeof(*bits),
+                              codes, sizeof(*codes), sizeof(*codes),
+                              syms,  sizeof(*syms),  sizeof(*syms), 0);
+}
+
 static int build_huff(const uint8_t *src, VLC *vlc, int *fsym)
 {
     int i;
@@ -56,13 +100,14 @@ static int build_huff(const uint8_t *src, VLC *vlc, int *fsym)
         *fsym = he[0].sym;
         return 0;
     }
-    if (he[0].len > 32)
-        return -1;
 
     last = 255;
     while (he[last].len == 255 && last)
         last--;
 
+    if (he[last].len > 32)
+        return -1;
+
     code = 1;
     for (i = last; i >= 0; i--) {
         codes[i] = code >> (32 - he[i].len);
@@ -71,12 +116,117 @@ static int build_huff(const uint8_t *src, VLC *vlc, int *fsym)
         code += 0x80000000u >> (he[i].len - 1);
     }
 
-    return ff_init_vlc_sparse(vlc, FFMIN(he[last].len, 9), last + 1,
+    return ff_init_vlc_sparse(vlc, FFMIN(he[last].len, 11), last + 1,
                               bits,  sizeof(*bits),  sizeof(*bits),
                               codes, sizeof(*codes), sizeof(*codes),
                               syms,  sizeof(*syms),  sizeof(*syms), 0);
 }
 
+static int decode_plane10(UtvideoContext *c, int plane_no,
+                          uint16_t *dst, int step, int stride,
+                          int width, int height,
+                          const uint8_t *src, const uint8_t *huff,
+                          int use_pred)
+{   /* Decode one plane of 10-bit samples into dst; returns 0, build_huff10()'s error, or AVERROR_INVALIDDATA. */
+    int i, j, slice, pix, ret;
+    int sstart, send;
+    VLC vlc;
+    GetBitContext gb;
+    int prev, fsym;
+    /* plane_no is not used in this function; step and stride are counted in uint16_t elements */
+    if ((ret = build_huff10(huff, &vlc, &fsym)) < 0) {
+        av_log(c->avctx, AV_LOG_ERROR, "Cannot build Huffman codes\n");
+        return ret;
+    }
+    if (fsym >= 0) { // build_huff reported a symbol to fill slices with
+        send = 0;
+        for (slice = 0; slice < c->slices; slice++) {
+            uint16_t *dest;
+
+            sstart = send;
+            send   = (height * (slice + 1) / c->slices); /* rows [sstart, send) belong to this slice */
+            dest   = dst + sstart * stride;
+
+            prev = 0x200; /* predictor restarts at 0x200 for every slice */
+            for (j = sstart; j < send; j++) {
+                for (i = 0; i < width * step; i += step) {
+                    pix = fsym;
+                    if (use_pred) {
+                        prev += pix;
+                        prev &= 0x3FF; /* keep the running sum within 10 bits */
+                        pix   = prev;
+                    }
+                    dest[i] = pix;
+                }
+                dest += stride;
+            }
+        }
+        return 0; /* vlc is not freed here: build_huff10() returns before initialising it when it sets fsym */
+    }
+
+    send = 0;
+    for (slice = 0; slice < c->slices; slice++) {
+        uint16_t *dest;
+        int slice_data_start, slice_data_end, slice_size;
+
+        sstart = send;
+        send   = (height * (slice + 1) / c->slices); /* rows [sstart, send) belong to this slice */
+        dest   = dst + sstart * stride;
+
+        // slice offset and size validation was done earlier
+        slice_data_start = slice ? AV_RL32(src + slice * 4 - 4) : 0; /* src begins with c->slices little-endian 32-bit end offsets */
+        slice_data_end   = AV_RL32(src + slice * 4);
+        slice_size       = slice_data_end - slice_data_start;
+
+        if (!slice_size) {
+            av_log(c->avctx, AV_LOG_ERROR, "Plane has more than one symbol "
+                   "yet a slice has a length of zero.\n");
+            goto fail;
+        }
+
+        memcpy(c->slice_bits, src + slice_data_start + c->slices * 4, /* slice payload follows the offset table */
+               slice_size);
+        memset(c->slice_bits + slice_size, 0, AV_INPUT_BUFFER_PADDING_SIZE); /* zero the padding after the copied payload */
+        c->bdsp.bswap_buf((uint32_t *) c->slice_bits, /* in-place pass over whole 32-bit words, length rounded up */
+                          (uint32_t *) c->slice_bits,
+                          (slice_data_end - slice_data_start + 3) >> 2);
+        init_get_bits(&gb, c->slice_bits, slice_size * 8);
+
+        prev = 0x200; /* predictor restarts at 0x200 for every slice */
+        for (j = sstart; j < send; j++) {
+            for (i = 0; i < width * step; i += step) {
+                if (get_bits_left(&gb) <= 0) {
+                    av_log(c->avctx, AV_LOG_ERROR,
+                           "Slice decoding ran out of bits\n");
+                    goto fail;
+                }
+                pix = get_vlc2(&gb, vlc.table, vlc.bits, 3);
+                if (pix < 0) {
+                    av_log(c->avctx, AV_LOG_ERROR, "Decoding error\n");
+                    goto fail;
+                }
+                if (use_pred) {
+                    prev += pix;
+                    prev &= 0x3FF; /* keep the running sum within 10 bits */
+                    pix   = prev;
+                }
+                dest[i] = pix;
+            }
+            dest += stride;
+        }
+        if (get_bits_left(&gb) > 32) /* leftover input is only reported, not treated as an error */
+            av_log(c->avctx, AV_LOG_WARNING,
+                   "%d bits left after decoding slice\n", get_bits_left(&gb));
+    }
+
+    ff_free_vlc(&vlc);
+
+    return 0;
+fail: /* common error exit: release the VLC built above */
+    ff_free_vlc(&vlc);
+    return AVERROR_INVALIDDATA;
+}
+
 static int decode_plane(UtvideoContext *c, int plane_no,
                         uint8_t *dst, int step, int stride,
                         int width, int height,
@@ -156,7 +306,7 @@ static int decode_plane(UtvideoContext *c, int plane_no,
                            "Slice decoding ran out of bits\n");
                     goto fail;
                 }
-                pix = get_vlc2(&gb, vlc.table, vlc.bits, 4);
+                pix = get_vlc2(&gb, vlc.table, vlc.bits, 3);
                 if (pix < 0) {
                     av_log(c->avctx, AV_LOG_ERROR, "Decoding error\n");
                     goto fail;
@@ -200,6 +350,28 @@ static void restore_rgb_planes(uint8_t *src, int step, int stride, int width,
     }
 }
 
+static void restore_rgb_planes10(AVFrame *frame, int width, int height)
+{   /* Rewrites the R and B planes in place as (sample + G - 0x200) & 0x3FF; G is only read. */
+    uint16_t *src_r = (uint16_t *)frame->data[2]; /* plane 2 is used as R */
+    uint16_t *src_g = (uint16_t *)frame->data[0]; /* plane 0 is used as G */
+    uint16_t *src_b = (uint16_t *)frame->data[1]; /* plane 1 is used as B */
+    int r, g, b;
+    int i, j;
+
+    for (j = 0; j < height; j++) {
+        for (i = 0; i < width; i++) {
+            r = src_r[i];
+            g = src_g[i];
+            b = src_b[i];
+            src_r[i] = (r + g - 0x200) & 0x3FF; /* add G, remove the 0x200 offset, wrap to 10 bits */
+            src_b[i] = (b + g - 0x200) & 0x3FF;
+        }
+        src_r += frame->linesize[2] / 2; /* / 2: advance in uint16_t elements (linesize presumably in bytes - confirm) */
+        src_g += frame->linesize[0] / 2;
+        src_b += frame->linesize[1] / 2;
+    }
+}
+
 static void restore_median(uint8_t *src, int step, int stride,
                            int width, int height, int slices, int rmode)
 {
@@ -213,9 +385,9 @@ static void restore_median(uint8_t *src, int step, int stride,
         slice_start  = ((slice * height) / slices) & cmask;
         slice_height = ((((slice + 1) * height) / slices) & cmask) -
                        slice_start;
+
         if (!slice_height)
             continue;
-
         bsrc = src + slice_start * stride;
 
         // first line - left neighbour prediction
@@ -226,7 +398,7 @@ static void restore_median(uint8_t *src, int step, int stride,
             A        = bsrc[i];
         }
         bsrc += stride;
-        if (slice_height == 1)
+        if (slice_height <= 1)
             continue;
         // second line - first element has top prediction, the rest uses median
         C        = bsrc[-stride];
@@ -288,7 +460,7 @@ static void restore_median_il(uint8_t *src, int step, int stride,
             A                 = bsrc[stride + i];
         }
         bsrc += stride2;
-        if (slice_height == 1)
+        if (slice_height <= 1)
             continue;
         // second line - first element has top prediction, the rest uses median
         C        = bsrc[-stride2];
@@ -339,44 +511,73 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     GetByteContext gb;
     ThreadFrame frame = { .f = data };
 
-    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
         return ret;
-    }
-
-    ff_thread_finish_setup(avctx);
 
     /* parse plane structure to get frame flags and validate slice offsets */
     bytestream2_init(&gb, buf, buf_size);
-    for (i = 0; i < c->planes; i++) {
-        plane_start[i] = gb.buffer;
-        if (bytestream2_get_bytes_left(&gb) < 256 + 4 * c->slices) {
-            av_log(avctx, AV_LOG_ERROR, "Insufficient data for a plane\n");
+    if (c->pro) {
+        if (bytestream2_get_bytes_left(&gb) < c->frame_info_size) {
+            av_log(avctx, AV_LOG_ERROR, "Not enough data for frame information\n");
             return AVERROR_INVALIDDATA;
         }
-        bytestream2_skipu(&gb, 256);
-        slice_start = 0;
-        slice_end   = 0;
-        for (j = 0; j < c->slices; j++) {
-            slice_end   = bytestream2_get_le32u(&gb);
-            slice_size  = slice_end - slice_start;
-            if (slice_end < 0 || slice_size < 0 ||
-                bytestream2_get_bytes_left(&gb) < slice_end) {
-                av_log(avctx, AV_LOG_ERROR, "Incorrect slice size\n");
+        c->frame_info = bytestream2_get_le32u(&gb);
+        c->slices = ((c->frame_info >> 16) & 0xff) + 1;
+        for (i = 0; i < c->planes; i++) {
+            plane_start[i] = gb.buffer;
+            if (bytestream2_get_bytes_left(&gb) < 1024 + 4 * c->slices) {
+                av_log(avctx, AV_LOG_ERROR, "Insufficient data for a plane\n");
                 return AVERROR_INVALIDDATA;
             }
-            slice_start = slice_end;
-            max_slice_size = FFMAX(max_slice_size, slice_size);
+            slice_start = 0;
+            slice_end   = 0;
+            for (j = 0; j < c->slices; j++) { /* validate the per-slice end offsets of this plane */
+                slice_end   = bytestream2_get_le32u(&gb);
+                if (slice_end < 0 || slice_end < slice_start ||
+                    bytestream2_get_bytes_left(&gb) < slice_end + 1024LL) { /* slice data AND the 1024 bytes skipped below must be present; 64-bit sum cannot overflow */
+                    av_log(avctx, AV_LOG_ERROR, "Incorrect slice size\n");
+                    return AVERROR_INVALIDDATA;
+                }
+                slice_size  = slice_end - slice_start;
+                slice_start = slice_end;
+                max_slice_size = FFMAX(max_slice_size, slice_size);
+            }
+            plane_size = slice_end;
+            bytestream2_skipu(&gb, plane_size); /* unchecked skips: both are covered by the bounds check above */
+            bytestream2_skipu(&gb, 1024);       /* 1024-byte block that trails the slice data */
         }
-        plane_size = slice_end;
-        bytestream2_skipu(&gb, plane_size);
-    }
-    plane_start[c->planes] = gb.buffer;
-    if (bytestream2_get_bytes_left(&gb) < c->frame_info_size) {
-        av_log(avctx, AV_LOG_ERROR, "Not enough data for frame information\n");
-        return AVERROR_INVALIDDATA;
+        plane_start[c->planes] = gb.buffer;
+    } else {
+        for (i = 0; i < c->planes; i++) {
+            plane_start[i] = gb.buffer;
+            if (bytestream2_get_bytes_left(&gb) < 256 + 4 * c->slices) {
+                av_log(avctx, AV_LOG_ERROR, "Insufficient data for a plane\n");
+                return AVERROR_INVALIDDATA;
+            }
+            bytestream2_skipu(&gb, 256);
+            slice_start = 0;
+            slice_end   = 0;
+            for (j = 0; j < c->slices; j++) {
+                slice_end   = bytestream2_get_le32u(&gb);
+                if (slice_end < 0 || slice_end < slice_start ||
+                    bytestream2_get_bytes_left(&gb) < slice_end) {
+                    av_log(avctx, AV_LOG_ERROR, "Incorrect slice size\n");
+                    return AVERROR_INVALIDDATA;
+                }
+                slice_size  = slice_end - slice_start;
+                slice_start = slice_end;
+                max_slice_size = FFMAX(max_slice_size, slice_size);
+            }
+            plane_size = slice_end;
+            bytestream2_skipu(&gb, plane_size);
+        }
+        plane_start[c->planes] = gb.buffer;
+        if (bytestream2_get_bytes_left(&gb) < c->frame_info_size) {
+            av_log(avctx, AV_LOG_ERROR, "Not enough data for frame information\n");
+            return AVERROR_INVALIDDATA;
+        }
+        c->frame_info = bytestream2_get_le32u(&gb);
     }
-    c->frame_info = bytestream2_get_le32u(&gb);
     av_log(avctx, AV_LOG_DEBUG, "frame information flags %"PRIX32"\n",
            c->frame_info);
 
@@ -421,6 +622,19 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         restore_rgb_planes(frame.f->data[0], c->planes, frame.f->linesize[0],
                            avctx->width, avctx->height);
         break;
+    case AV_PIX_FMT_GBRAP10:
+    case AV_PIX_FMT_GBRP10:
+        for (i = 0; i < c->planes; i++) {
+            ret = decode_plane10(c, i, (uint16_t *)frame.f->data[i], 1,
+                                 frame.f->linesize[i] / 2, avctx->width,
+                                 avctx->height, plane_start[i],
+                                 plane_start[i + 1] - 1024,
+                                 c->frame_pred == PRED_LEFT);
+            if (ret)
+                return ret;
+        }
+        restore_rgb_planes10(frame.f, avctx->width, avctx->height);
+        break;
     case AV_PIX_FMT_YUV420P:
         for (i = 0; i < 3; i++) {
             ret = decode_plane(c, i, frame.f->data[i], 1, frame.f->linesize[i],
@@ -462,6 +676,15 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             }
         }
         break;
+    case AV_PIX_FMT_YUV422P10:
+        for (i = 0; i < 3; i++) {
+            ret = decode_plane10(c, i, (uint16_t *)frame.f->data[i], 1, frame.f->linesize[i] / 2,
+                                 avctx->width >> !!i, avctx->height,
+                                 plane_start[i], plane_start[i + 1] - 1024, c->frame_pred == PRED_LEFT);
+            if (ret)
+                return ret;
+        }
+        break;
     }
 
     frame.f->key_frame = 1;
@@ -482,28 +705,37 @@ static av_cold int decode_init(AVCodecContext *avctx)
 
     ff_bswapdsp_init(&c->bdsp);
 
-    if (avctx->extradata_size < 16) {
+    if (avctx->extradata_size >= 16) {
+        av_log(avctx, AV_LOG_DEBUG, "Encoder version %d.%d.%d.%d\n",
+               avctx->extradata[3], avctx->extradata[2],
+               avctx->extradata[1], avctx->extradata[0]);
+        av_log(avctx, AV_LOG_DEBUG, "Original format %"PRIX32"\n",
+               AV_RB32(avctx->extradata + 4));
+        c->frame_info_size = AV_RL32(avctx->extradata + 8);
+        c->flags           = AV_RL32(avctx->extradata + 12);
+
+        if (c->frame_info_size != 4)
+            avpriv_request_sample(avctx, "Frame info not 4 bytes");
+        av_log(avctx, AV_LOG_DEBUG, "Encoding parameters %08"PRIX32"\n", c->flags);
+        c->slices      = (c->flags >> 24) + 1;
+        c->compression = c->flags & 1;
+        c->interlaced  = c->flags & 0x800;
+    } else if (avctx->extradata_size == 8) {
+        av_log(avctx, AV_LOG_DEBUG, "Encoder version %d.%d.%d.%d\n",
+               avctx->extradata[3], avctx->extradata[2],
+               avctx->extradata[1], avctx->extradata[0]);
+        av_log(avctx, AV_LOG_DEBUG, "Original format %"PRIX32"\n",
+               AV_RB32(avctx->extradata + 4));
+        c->interlaced  = 0;
+        c->pro         = 1;
+        c->frame_info_size = 4;
+    } else {
         av_log(avctx, AV_LOG_ERROR,
                "Insufficient extradata size %d, should be at least 16\n",
                avctx->extradata_size);
         return AVERROR_INVALIDDATA;
     }
 
-    av_log(avctx, AV_LOG_DEBUG, "Encoder version %d.%d.%d.%d\n",
-           avctx->extradata[3], avctx->extradata[2],
-           avctx->extradata[1], avctx->extradata[0]);
-    av_log(avctx, AV_LOG_DEBUG, "Original format %"PRIX32"\n",
-           AV_RB32(avctx->extradata + 4));
-    c->frame_info_size = AV_RL32(avctx->extradata + 8);
-    c->flags           = AV_RL32(avctx->extradata + 12);
-
-    if (c->frame_info_size != 4)
-        avpriv_request_sample(avctx, "Frame info not 4 bytes");
-    av_log(avctx, AV_LOG_DEBUG, "Encoding parameters %08"PRIX32"\n", c->flags);
-    c->slices      = (c->flags >> 24) + 1;
-    c->compression = c->flags & 1;
-    c->interlaced  = c->flags & 0x800;
-
     c->slice_bits_size = 0;
 
     switch (avctx->codec_tag) {
@@ -525,6 +757,18 @@ static av_cold int decode_init(AVCodecContext *avctx)
         avctx->pix_fmt = AV_PIX_FMT_YUV422P;
         avctx->colorspace = AVCOL_SPC_BT470BG;
         break;
+    case MKTAG('U', 'Q', 'Y', '2'):
+        c->planes      = 3;
+        avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
+        break;
+    case MKTAG('U', 'Q', 'R', 'G'):
+        c->planes      = 3;
+        avctx->pix_fmt = AV_PIX_FMT_GBRP10;
+        break;
+    case MKTAG('U', 'Q', 'R', 'A'):
+        c->planes      = 4;
+        avctx->pix_fmt = AV_PIX_FMT_GBRAP10;
+        break;
     case MKTAG('U', 'L', 'H', '0'):
         c->planes      = 3;
         avctx->pix_fmt = AV_PIX_FMT_YUV420P;
diff --git a/libavcodec/utvideoenc.c b/libavcodec/utvideoenc.c
index 4eddd98..6240413 100644
--- a/libavcodec/utvideoenc.c
+++ b/libavcodec/utvideoenc.c
@@ -2,20 +2,20 @@
  * Ut Video encoder
  * Copyright (c) 2012 Jan Ekström
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -387,7 +387,7 @@ static int write_huff_codes(uint8_t *src, uint8_t *dst, int dst_size,
 }
 
 static int encode_plane(AVCodecContext *avctx, uint8_t *src,
-                        uint8_t *dst, int stride,
+                        uint8_t *dst, int stride, int plane_no,
                         int width, int height, PutByteContext *pb)
 {
     UtvideoContext *c        = avctx->priv_data;
@@ -397,15 +397,17 @@ static int encode_plane(AVCodecContext *avctx, uint8_t *src,
     HuffEntry he[256];
 
     uint32_t offset = 0, slice_len = 0;
+    const int cmask = ~(!plane_no && avctx->pix_fmt == AV_PIX_FMT_YUV420P);
     int      i, sstart, send = 0;
     int      symbol;
+    int      ret;
 
     /* Do prediction / make planes */
     switch (c->frame_pred) {
     case PRED_NONE:
         for (i = 0; i < c->slices; i++) {
             sstart = send;
-            send   = height * (i + 1) / c->slices;
+            send   = height * (i + 1) / c->slices & cmask;
             av_image_copy_plane(dst + sstart * width, width,
                                 src + sstart * stride, stride,
                                 width, send - sstart);
@@ -414,7 +416,7 @@ static int encode_plane(AVCodecContext *avctx, uint8_t *src,
     case PRED_LEFT:
         for (i = 0; i < c->slices; i++) {
             sstart = send;
-            send   = height * (i + 1) / c->slices;
+            send   = height * (i + 1) / c->slices & cmask;
             left_predict(src + sstart * stride, dst + sstart * width,
                          stride, width, send - sstart);
         }
@@ -422,7 +424,7 @@ static int encode_plane(AVCodecContext *avctx, uint8_t *src,
     case PRED_MEDIAN:
         for (i = 0; i < c->slices; i++) {
             sstart = send;
-            send   = height * (i + 1) / c->slices;
+            send   = height * (i + 1) / c->slices & cmask;
             median_predict(c, src + sstart * stride, dst + sstart * width,
                            stride, width, send - sstart);
         }
@@ -441,7 +443,7 @@ static int encode_plane(AVCodecContext *avctx, uint8_t *src,
         /* If non-zero count is found, see if it matches width * height */
         if (counts[symbol]) {
             /* Special case if only one symbol was used */
-            if (counts[symbol] == width * height) {
+            if (counts[symbol] == width * (int64_t)height) {
                 /*
                  * Write a zero for the single symbol
                  * used in the plane, else 0xFF.
@@ -465,7 +467,8 @@ static int encode_plane(AVCodecContext *avctx, uint8_t *src,
     }
 
     /* Calculate huffman lengths */
-    ff_huff_gen_len_table(lengths, counts);
+    if ((ret = ff_huff_gen_len_table(lengths, counts, 256, 1)) < 0)
+        return ret;
 
     /*
      * Write the plane's header into the output packet:
@@ -485,14 +488,14 @@ static int encode_plane(AVCodecContext *avctx, uint8_t *src,
     send = 0;
     for (i = 0; i < c->slices; i++) {
         sstart  = send;
-        send    = height * (i + 1) / c->slices;
+        send    = height * (i + 1) / c->slices & cmask;
 
         /*
          * Write the huffman codes to a buffer,
          * get the offset in bits and convert to bytes.
          */
         offset += write_huff_codes(dst + sstart * width, c->slice_bits,
-                                   width * (send - sstart), width,
+                                   width * height + 4, width,
                                    send - sstart, he) >> 3;
 
         slice_len = offset - slice_len;
@@ -539,22 +542,17 @@ static int utvideo_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     int i, ret = 0;
 
     /* Allocate a new packet if needed, and set it to the pointer dst */
-    ret = ff_alloc_packet(pkt, (256 + 4 * c->slices + width * height) *
-                          c->planes + 4);
+    ret = ff_alloc_packet2(avctx, pkt, (256 + 4 * c->slices + width * height) *
+                           c->planes + 4, 0);
 
-    if (ret < 0) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Error allocating the output packet, or the provided packet "
-               "was too small.\n");
+    if (ret < 0)
         return ret;
-    }
 
     dst = pkt->data;
 
     bytestream2_init_writer(&pb, dst, pkt->size);
 
-    av_fast_malloc(&c->slice_bits, &c->slice_bits_size,
-                   width * height + AV_INPUT_BUFFER_PADDING_SIZE);
+    av_fast_padded_malloc(&c->slice_bits, &c->slice_bits_size, width * height + 4);
 
     if (!c->slice_bits) {
         av_log(avctx, AV_LOG_ERROR, "Cannot allocate temporary buffer 2.\n");
@@ -572,7 +570,7 @@ static int utvideo_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     case AV_PIX_FMT_RGBA:
         for (i = 0; i < c->planes; i++) {
             ret = encode_plane(avctx, c->slice_buffer[i] + 2 * c->slice_stride,
-                               c->slice_buffer[i], c->slice_stride,
+                               c->slice_buffer[i], c->slice_stride, i,
                                width, height, &pb);
 
             if (ret) {
@@ -584,7 +582,7 @@ static int utvideo_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     case AV_PIX_FMT_YUV422P:
         for (i = 0; i < c->planes; i++) {
             ret = encode_plane(avctx, pic->data[i], c->slice_buffer[0],
-                               pic->linesize[i], width >> !!i, height, &pb);
+                               pic->linesize[i], i, width >> !!i, height, &pb);
 
             if (ret) {
                 av_log(avctx, AV_LOG_ERROR, "Error encoding plane %d.\n", i);
@@ -595,7 +593,7 @@ static int utvideo_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     case AV_PIX_FMT_YUV420P:
         for (i = 0; i < c->planes; i++) {
             ret = encode_plane(avctx, pic->data[i], c->slice_buffer[0],
-                               pic->linesize[i], width >> !!i, height >> !!i,
+                               pic->linesize[i], i, width >> !!i, height >> !!i,
                                &pb);
 
             if (ret) {
@@ -667,6 +665,7 @@ AVCodec ff_utvideo_encoder = {
     .init           = utvideo_encode_init,
     .encode2        = utvideo_encode_frame,
     .close          = utvideo_encode_close,
+    .capabilities   = AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_INTRA_ONLY,
     .pix_fmts       = (const enum AVPixelFormat[]) {
                           AV_PIX_FMT_RGB24, AV_PIX_FMT_RGBA, AV_PIX_FMT_YUV422P,
                           AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE
diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
index 74301fe..9af9af6 100644
--- a/libavcodec/v210dec.c
+++ b/libavcodec/v210dec.c
@@ -4,54 +4,100 @@
  * Copyright (C) 2009 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2009 Baptiste Coudurier <baptiste dot coudurier at gmail dot com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "avcodec.h"
 #include "internal.h"
+#include "v210dec.h"
 #include "libavutil/bswap.h"
 #include "libavutil/internal.h"
 #include "libavutil/mem.h"
 
-static av_cold int decode_init(AVCodecContext *avctx)
+#define READ_PIXELS(a, b, c)         /* one 32-bit word -> three 10-bit samples */ \
+    do {                             \
+        val  = av_le2ne32(*src++);   /* uses `val` and `src` from the expansion site */ \
+        *a++ =  val & 0x3FF;         /* bits 0-9   */ \
+        *b++ = (val >> 10) & 0x3FF;  /* bits 10-19 */ \
+        *c++ = (val >> 20) & 0x3FF;  /* bits 20-29 */ \
+    } while (0)
+
+static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
 {
-    if (avctx->width & 1) {
-        av_log(avctx, AV_LOG_ERROR, "v210 needs even width\n");
-        return AVERROR_INVALIDDATA;
+    uint32_t val; /* scratch word that READ_PIXELS assigns to */
+    int i;
+
+    for( i = 0; i < width-5; i += 6 ){ /* whole groups of 6 pixels only; a shorter tail is not touched here */
+        READ_PIXELS(u, y, v); /* each group consumes 4 words and stores 6 y, 3 u and 3 v samples */
+        READ_PIXELS(y, u, y);
+        READ_PIXELS(v, y, u);
+        READ_PIXELS(y, v, y);
     }
+}
+
+static av_cold int decode_init(AVCodecContext *avctx)
+{
+    V210DecContext *s = avctx->priv_data;
+
     avctx->pix_fmt             = AV_PIX_FMT_YUV422P10;
     avctx->bits_per_raw_sample = 10;
 
+    s->unpack_frame            = v210_planar_unpack_c; /* portable C unpacker is installed first */
+
+    if (HAVE_MMX)
+        ff_v210_x86_init(s); /* NOTE(review): presumably replaces unpack_frame with an x86 version -- confirm in the x86 init code */
+
     return 0;
 }
 
 static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                         AVPacket *avpkt)
 {
-    int h, w, ret;
+    V210DecContext *s = avctx->priv_data;
+
+    int h, w, ret, stride, aligned_input;
     AVFrame *pic = data;
     const uint8_t *psrc = avpkt->data;
     uint16_t *y, *u, *v;
-    int aligned_width = ((avctx->width + 47) / 48) * 48;
-    int stride = aligned_width * 8 / 3;
+
+    if (s->custom_stride )
+        stride = s->custom_stride;
+    else {
+        int aligned_width = ((avctx->width + 47) / 48) * 48;
+        stride = aligned_width * 8 / 3;
+    }
 
     if (avpkt->size < stride * avctx->height) {
-        av_log(avctx, AV_LOG_ERROR, "packet too small\n");
-        return AVERROR_INVALIDDATA;
+        if ((((avctx->width + 23) / 24) * 24 * 8) / 3 * avctx->height == avpkt->size) {
+            stride = avpkt->size / avctx->height;
+            if (!s->stride_warning_shown)
+                av_log(avctx, AV_LOG_WARNING, "Broken v210 with too small padding (64 byte) detected\n");
+            s->stride_warning_shown = 1;
+        } else {
+            av_log(avctx, AV_LOG_ERROR, "packet too small\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    aligned_input = !((uintptr_t)psrc & 0xf) && !(stride & 0xf);
+    if (aligned_input != s->aligned_input) {
+        s->aligned_input = aligned_input;
+        if (HAVE_MMX)
+            ff_v210_x86_init(s);
     }
 
     if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
@@ -63,55 +109,73 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     pic->pict_type = AV_PICTURE_TYPE_I;
     pic->key_frame = 1;
 
-#define READ_PIXELS(a, b, c)         \
-    do {                             \
-        val  = av_le2ne32(*src++);   \
-        *a++ =  val & 0x3FF;         \
-        *b++ = (val >> 10) & 0x3FF;  \
-        *c++ = (val >> 20) & 0x3FF;  \
-    } while (0)
-
     for (h = 0; h < avctx->height; h++) {
         const uint32_t *src = (const uint32_t*)psrc;
         uint32_t val;
-        for (w = 0; w < avctx->width - 5; w += 6) {
-            READ_PIXELS(u, y, v);
-            READ_PIXELS(y, u, y);
-            READ_PIXELS(v, y, u);
-            READ_PIXELS(y, v, y);
-        }
+
+        w = (avctx->width / 6) * 6;
+        s->unpack_frame(src, y, u, v, w);
+
+        y += w;
+        u += w >> 1;
+        v += w >> 1;
+        src += (w << 1) / 3;
+
         if (w < avctx->width - 1) {
             READ_PIXELS(u, y, v);
 
             val  = av_le2ne32(*src++);
             *y++ =  val & 0x3FF;
-        }
-        if (w < avctx->width - 3) {
-            *u++ = (val >> 10) & 0x3FF;
-            *y++ = (val >> 20) & 0x3FF;
+            if (w < avctx->width - 3) {
+                *u++ = (val >> 10) & 0x3FF;
+                *y++ = (val >> 20) & 0x3FF;
 
-            val  = av_le2ne32(*src++);
-            *v++ =  val & 0x3FF;
-            *y++ = (val >> 10) & 0x3FF;
+                val  = av_le2ne32(*src++);
+                *v++ =  val & 0x3FF;
+                *y++ = (val >> 10) & 0x3FF;
+            }
         }
 
         psrc += stride;
-        y += pic->linesize[0] / 2 - avctx->width;
+        y += pic->linesize[0] / 2 - avctx->width + (avctx->width & 1);
         u += pic->linesize[1] / 2 - avctx->width / 2;
         v += pic->linesize[2] / 2 - avctx->width / 2;
     }
 
+    if (avctx->field_order > AV_FIELD_PROGRESSIVE) {
+        /* we have interlaced material flagged in container */
+        pic->interlaced_frame = 1;
+        if (avctx->field_order == AV_FIELD_TT || avctx->field_order == AV_FIELD_TB)
+            pic->top_field_first = 1;
+    }
+
     *got_frame      = 1;
 
     return avpkt->size;
 }
 
+#define V210DEC_FLAGS (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM) /* parenthesised so the OR stays intact in any expression */
+static const AVOption v210dec_options[] = {
+    {"custom_stride", "Custom V210 stride", offsetof(V210DecContext, custom_stride), AV_OPT_TYPE_INT,
+     {.i64 = 0}, 0, INT_MAX, V210DEC_FLAGS}, /* 0 = derive stride from width; negatives rejected: they pass the size check and read before the packet */
+    {NULL}
+};
+
+static const AVClass v210dec_class = {
+    .class_name = "V210 Decoder",
+    .item_name  = av_default_item_name,
+    .option     = v210dec_options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
 AVCodec ff_v210_decoder = {
     .name           = "v210",
     .long_name      = NULL_IF_CONFIG_SMALL("Uncompressed 4:2:2 10-bit"),
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_V210,
+    .priv_data_size = sizeof(V210DecContext), /* decode_init and decode_frame read avctx->priv_data as a V210DecContext */
     .init           = decode_init,
     .decode         = decode_frame,
     .capabilities   = AV_CODEC_CAP_DR1,
+    .priv_class     = &v210dec_class, /* class whose option table is v210dec_options (custom_stride) */
 };
diff --git a/libavcodec/v210dec.h b/libavcodec/v210dec.h
new file mode 100644
index 0000000..533afc4
--- /dev/null
+++ b/libavcodec/v210dec.h
@@ -0,0 +1,36 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_V210DEC_H
+#define AVCODEC_V210DEC_H
+
+#include "libavutil/log.h"
+#include "libavutil/opt.h"
+
+
+typedef struct {
+    AVClass *av_class;        /* NOTE(review): presumably has to stay the first member for the option system -- confirm */
+    int custom_stride;        /* "custom_stride" option; when non-zero the decoder uses it as the input row stride in bytes */
+    int aligned_input;        /* last result of the decoder's 16-byte alignment test on packet data and stride */
+    int stride_warning_shown; /* set to 1 once the too-small-padding warning has been logged */
+    void (*unpack_frame)(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); /* row unpacker; the decoder's init installs its C version */
+} V210DecContext;
+
+void ff_v210_x86_init(V210DecContext *s);
+
+#endif /* AVCODEC_V210DEC_H */
diff --git a/libavcodec/v210enc.c b/libavcodec/v210enc.c
index 51c182c..d3a8e26 100644
--- a/libavcodec/v210enc.c
+++ b/libavcodec/v210enc.c
@@ -4,20 +4,20 @@
  * Copyright (C) 2009 Michael Niedermayer <michaelni@gmx.at>
  * Copyright (c) 2009 Baptiste Coudurier <baptiste dot coudurier at gmail dot com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -123,7 +123,7 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     int h, w, ret;
     uint8_t *dst;
 
-    ret = ff_alloc_packet(pkt, avctx->height * stride);
+    ret = ff_alloc_packet2(avctx, pkt, avctx->height * stride, avctx->height * stride);
     if (ret < 0) {
         av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
         return ret;
diff --git a/libavcodec/v210enc.h b/libavcodec/v210enc.h
index ee3637a..51305c1 100644
--- a/libavcodec/v210enc.h
+++ b/libavcodec/v210enc.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/v210x.c b/libavcodec/v210x.c
index 3f220ff..f6a453a 100644
--- a/libavcodec/v210x.c
+++ b/libavcodec/v210x.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2009 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/v308dec.c b/libavcodec/v308dec.c
new file mode 100644
index 0000000..dd53fbd
--- /dev/null
+++ b/libavcodec/v308dec.c
@@ -0,0 +1,83 @@
+/*
+ * v308 decoder
+ * Copyright (c) 2011 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+
+static av_cold int v308_decode_init(AVCodecContext *avctx)
+{
+    /* An odd width is only reported; initialisation still succeeds. */
+    if (avctx->width % 2 != 0)
+        av_log(avctx, AV_LOG_WARNING, "v308 requires width to be even.\n");
+    /* Decoded pictures are delivered as planar YUV 4:4:4. */
+    avctx->pix_fmt = AV_PIX_FMT_YUV444P;
+    return 0;
+}
+
+static int v308_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_frame, AVPacket *avpkt)
+{
+    AVFrame *frame = data;
+    const uint8_t *in = avpkt->data;
+    uint8_t *dst_y, *dst_u, *dst_v;
+    int row, col, err;
+
+    /* Three bytes per pixel: refuse packets shorter than one full frame. */
+    if (3 * avctx->height * avctx->width > avpkt->size) {
+        av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
+        return AVERROR(EINVAL);
+    }
+
+    err = ff_get_buffer(avctx, frame, 0);
+    if (err < 0)
+        return err;
+
+    frame->pict_type = AV_PICTURE_TYPE_I;
+    frame->key_frame = 1;
+
+    dst_y = frame->data[0];
+    dst_u = frame->data[1];
+    dst_v = frame->data[2];
+    for (row = 0; row < avctx->height; row++) {
+        /* Packed byte order in the packet is V, Y, U. */
+        for (col = 0; col < avctx->width; col++, in += 3) {
+            dst_v[col] = in[0];
+            dst_y[col] = in[1];
+            dst_u[col] = in[2];
+        }
+        dst_y += frame->linesize[0];
+        dst_u += frame->linesize[1];
+        dst_v += frame->linesize[2];
+    }
+
+    *got_frame = 1;
+    return avpkt->size;
+}
+
+AVCodec ff_v308_decoder = {
+    .name         = "v308",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed packed 4:4:4"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_V308,
+    .init         = v308_decode_init,  /* selects YUV444P output and warns on odd width */
+    .decode       = v308_decode_frame, /* unpacks one packet into one planar picture */
+    .capabilities = AV_CODEC_CAP_DR1,  /* the frame buffer is obtained through ff_get_buffer() */
+};
diff --git a/libavcodec/v308enc.c b/libavcodec/v308enc.c
new file mode 100644
index 0000000..b60a72c
--- /dev/null
+++ b/libavcodec/v308enc.c
@@ -0,0 +1,83 @@
+/*
+ * v308 encoder
+ *
+ * Copyright (c) 2011 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intreadwrite.h"
+#include "avcodec.h"
+#include "internal.h"
+
+static av_cold int v308_encode_init(AVCodecContext *avctx)
+{
+    /* Nothing to set up; the only check is that the width is even. */
+    if (avctx->width % 2 == 0)
+        return 0;
+
+    av_log(avctx, AV_LOG_ERROR, "v308 requires width to be even.\n");
+    return AVERROR_INVALIDDATA;
+}
+
+static int v308_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                             const AVFrame *pic, int *got_packet)
+{
+    const uint8_t *src_y = pic->data[0];
+    const uint8_t *src_u = pic->data[1];
+    const uint8_t *src_v = pic->data[2];
+    uint8_t *out;
+    int row, col, err;
+
+    /* The whole frame goes into one packet at three bytes per pixel. */
+    err = ff_alloc_packet2(avctx, pkt, avctx->width * avctx->height * 3, 0);
+    if (err < 0)
+        return err;
+    out = pkt->data;
+
+    for (row = 0; row < avctx->height; row++) {
+        /* Interleave the planes as V, Y, U for each pixel. */
+        for (col = 0; col < avctx->width; col++, out += 3) {
+            out[0] = src_v[col];
+            out[1] = src_y[col];
+            out[2] = src_u[col];
+        }
+        src_y += pic->linesize[0];
+        src_u += pic->linesize[1];
+        src_v += pic->linesize[2];
+    }
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+    return 0;
+}
+
+static av_cold int v308_encode_close(AVCodecContext *avctx)
+{
+    return 0; /* v308_encode_init acquires nothing, so there is nothing to release */
+}
+
+AVCodec ff_v308_encoder = {
+    .name         = "v308",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed packed 4:4:4"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_V308,
+    .init         = v308_encode_init,  /* rejects odd widths */
+    .encode2      = v308_encode_frame, /* packs one YUV444P frame into one packet */
+    .close        = v308_encode_close, /* no-op */
+    .pix_fmts     = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUV444P, AV_PIX_FMT_NONE }, /* AV_PIX_FMT_NONE ends the list */
+    .capabilities = AV_CODEC_CAP_INTRA_ONLY, /* consistent with every packet being flagged AV_PKT_FLAG_KEY */
+};
diff --git a/libavcodec/v408dec.c b/libavcodec/v408dec.c
new file mode 100644
index 0000000..acff95d
--- /dev/null
+++ b/libavcodec/v408dec.c
@@ -0,0 +1,103 @@
+/*
+ * v408 decoder
+ * Copyright (c) 2012 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+
+static av_cold int v408_decode_init(AVCodecContext *avctx)
+{
+    avctx->pix_fmt = AV_PIX_FMT_YUVA444P; /* used by both the ayuv and v408 decoders; the frame code fills four planes */
+
+    return 0;
+}
+
+static int v408_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_frame, AVPacket *avpkt)
+{
+    /* Byte layout per pixel: ayuv is V,U,Y,A while v408 is U,Y,V,A. */
+    const int ayuv  = avctx->codec_id == AV_CODEC_ID_AYUV;
+    const int y_pos = ayuv ? 2 : 1;
+    const int u_pos = ayuv ? 1 : 0;
+    const int v_pos = ayuv ? 0 : 2;
+    AVFrame *frame = data;
+    const uint8_t *in = avpkt->data;
+    uint8_t *dst_y, *dst_u, *dst_v, *dst_a;
+    int row, col, err;
+
+    /* Four bytes per pixel: refuse packets shorter than one full frame. */
+    if (4 * avctx->height * avctx->width > avpkt->size) {
+        av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
+        return AVERROR(EINVAL);
+    }
+
+    err = ff_get_buffer(avctx, frame, 0);
+    if (err < 0)
+        return err;
+
+    /* Every decoded picture is flagged as an intra key frame. */
+    frame->pict_type = AV_PICTURE_TYPE_I;
+    frame->key_frame = 1;
+
+    dst_y = frame->data[0];
+    dst_u = frame->data[1];
+    dst_v = frame->data[2];
+    dst_a = frame->data[3];
+
+    for (row = 0; row < avctx->height; row++) {
+        for (col = 0; col < avctx->width; col++, in += 4) {
+            dst_y[col] = in[y_pos];
+            dst_u[col] = in[u_pos];
+            dst_v[col] = in[v_pos];
+            dst_a[col] = in[3]; /* alpha is the fourth byte in both layouts */
+        }
+
+        dst_y += frame->linesize[0];
+        dst_u += frame->linesize[1];
+        dst_v += frame->linesize[2];
+        dst_a += frame->linesize[3];
+    }
+
+    *got_frame = 1;
+    return avpkt->size;
+}
+
+#if CONFIG_AYUV_DECODER
+AVCodec ff_ayuv_decoder = {
+    .name         = "ayuv",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed packed MS 4:4:4:4"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_AYUV,  /* v408_decode_frame tests this id to pick the V,U,Y,A byte order */
+    .init         = v408_decode_init,  /* shared with the v408 decoder */
+    .decode       = v408_decode_frame, /* shared with the v408 decoder */
+    .capabilities = AV_CODEC_CAP_DR1,  /* the frame buffer is obtained through ff_get_buffer() */
+};
+#endif
+#if CONFIG_V408_DECODER
+AVCodec ff_v408_decoder = {
+    .name         = "v408",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed packed QT 4:4:4:4"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_V408,  /* any id other than AYUV makes v408_decode_frame read U,Y,V,A */
+    .init         = v408_decode_init,  /* shared with the ayuv decoder */
+    .decode       = v408_decode_frame, /* shared with the ayuv decoder */
+    .capabilities = AV_CODEC_CAP_DR1,  /* the frame buffer is obtained through ff_get_buffer() */
+};
+#endif
diff --git a/libavcodec/v408enc.c b/libavcodec/v408enc.c
new file mode 100644
index 0000000..f37f360
--- /dev/null
+++ b/libavcodec/v408enc.c
@@ -0,0 +1,104 @@
+/*
+ * v408 encoder
+ *
+ * Copyright (c) 2012 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intreadwrite.h"
+#include "avcodec.h"
+#include "internal.h"
+
+static av_cold int v408_encode_init(AVCodecContext *avctx)
+{
+
+    return 0; /* no state is set up and no restriction on the dimensions is checked here */
+}
+
+static int v408_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                             const AVFrame *pic, int *got_packet)
+{
+    /* Byte layout per pixel: ayuv is V,U,Y,A while v408 is U,Y,V,A. */
+    const int ayuv  = avctx->codec_id == AV_CODEC_ID_AYUV;
+    const int y_pos = ayuv ? 2 : 1;
+    const int u_pos = ayuv ? 1 : 0;
+    const int v_pos = ayuv ? 0 : 2;
+    const uint8_t *src_y = pic->data[0];
+    const uint8_t *src_u = pic->data[1];
+    const uint8_t *src_v = pic->data[2];
+    const uint8_t *src_a = pic->data[3];
+    uint8_t *out;
+    int row, col, err;
+
+    /* The whole frame goes into one packet at four bytes per pixel. */
+    err = ff_alloc_packet2(avctx, pkt, avctx->width * avctx->height * 4, 0);
+    if (err < 0)
+        return err;
+    out = pkt->data;
+
+    for (row = 0; row < avctx->height; row++) {
+        for (col = 0; col < avctx->width; col++, out += 4) {
+            out[y_pos] = src_y[col];
+            out[u_pos] = src_u[col];
+            out[v_pos] = src_v[col];
+            out[3]     = src_a[col]; /* alpha is the fourth byte in both layouts */
+        }
+
+        src_y += pic->linesize[0];
+        src_u += pic->linesize[1];
+        src_v += pic->linesize[2];
+        src_a += pic->linesize[3];
+    }
+
+    /* Every packet is flagged as a key frame. */
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+    return 0;
+}
+
+static av_cold int v408_encode_close(AVCodecContext *avctx)
+{
+    return 0; /* v408_encode_init acquires nothing, so there is nothing to release */
+}
+
+#if CONFIG_AYUV_ENCODER
+AVCodec ff_ayuv_encoder = {
+    .name         = "ayuv",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed packed MS 4:4:4:4"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_AYUV,  /* v408_encode_frame tests this id to pick the V,U,Y,A byte order */
+    .init         = v408_encode_init,  /* shared with the v408 encoder */
+    .encode2      = v408_encode_frame, /* shared with the v408 encoder */
+    .close        = v408_encode_close, /* no-op */
+    .pix_fmts     = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUVA444P, AV_PIX_FMT_NONE }, /* AV_PIX_FMT_NONE ends the list */
+    .capabilities = AV_CODEC_CAP_INTRA_ONLY, /* consistent with every packet being flagged AV_PKT_FLAG_KEY */
+};
+#endif
+#if CONFIG_V408_ENCODER
+AVCodec ff_v408_encoder = {
+    .name         = "v408",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed packed QT 4:4:4:4"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_V408,  /* any id other than AYUV makes v408_encode_frame write U,Y,V,A */
+    .init         = v408_encode_init,  /* shared with the ayuv encoder */
+    .encode2      = v408_encode_frame, /* shared with the ayuv encoder */
+    .close        = v408_encode_close, /* no-op */
+    .pix_fmts     = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUVA444P, AV_PIX_FMT_NONE }, /* AV_PIX_FMT_NONE ends the list */
+    .capabilities = AV_CODEC_CAP_INTRA_ONLY, /* consistent with every packet being flagged AV_PKT_FLAG_KEY */
+};
+#endif
diff --git a/libavcodec/v410dec.c b/libavcodec/v410dec.c
index ca68e0b..48fab68 100644
--- a/libavcodec/v410dec.c
+++ b/libavcodec/v410dec.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2011 Derek Buitenhuis
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -49,17 +49,15 @@ static int v410_decode_frame(AVCodecContext *avctx, void *data,
     uint8_t *src = avpkt->data;
     uint16_t *y, *u, *v;
     uint32_t val;
-    int i, j;
+    int i, j, ret;
 
     if (avpkt->size < 4 * avctx->height * avctx->width) {
         av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
         return AVERROR(EINVAL);
     }
 
-    if (ff_get_buffer(avctx, pic, 0) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
-        return AVERROR(ENOMEM);
-    }
+    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
+        return ret;
 
     pic->key_frame = 1;
     pic->pict_type = AV_PICTURE_TYPE_I;
diff --git a/libavcodec/v410enc.c b/libavcodec/v410enc.c
index 1e3f38f..f35ff75 100644
--- a/libavcodec/v410enc.c
+++ b/libavcodec/v410enc.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2011 Derek Buitenhuis
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,7 +28,7 @@
 static av_cold int v410_encode_init(AVCodecContext *avctx)
 {
     if (avctx->width & 1) {
-        av_log(avctx, AV_LOG_ERROR, "v410 requires even width.\n");
+        av_log(avctx, AV_LOG_ERROR, "v410 requires width to be even.\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -43,10 +43,9 @@ static int v410_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     uint32_t val;
     int i, j, ret;
 
-    if ((ret = ff_alloc_packet(pkt, avctx->width * avctx->height * 4)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, avctx->width * avctx->height * 4,
+                                            avctx->width * avctx->height * 4)) < 0)
         return ret;
-    }
     dst = pkt->data;
 
 #if FF_API_CODED_FRAME
diff --git a/libavcodec/vaapi.c b/libavcodec/vaapi.c
index 094692e..36db640 100644
--- a/libavcodec/vaapi.c
+++ b/libavcodec/vaapi.c
@@ -4,24 +4,24 @@
  *
  * Copyright (C) 2008-2009 Splitted-Desktop Systems
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "h264.h"
+#include "libavutil/log.h"
 #include "mpegvideo.h"
 #include "vaapi_internal.h"
 
@@ -35,27 +35,56 @@ static void destroy_buffers(VADisplay display, VABufferID *buffers, unsigned int
 {
     unsigned int i;
     for (i = 0; i < n_buffers; i++) {
-        if (buffers[i]) {
+        if (buffers[i] != VA_INVALID_ID) {
             vaDestroyBuffer(display, buffers[i]);
-            buffers[i] = 0;
+            buffers[i] = VA_INVALID_ID;
         }
     }
 }
 
-int ff_vaapi_render_picture(struct vaapi_context *vactx, VASurfaceID surface)
+int ff_vaapi_context_init(AVCodecContext *avctx)
+{
+    FFVAContext *const ctx = ff_vaapi_get_context(avctx);
+    const struct vaapi_context *const user = avctx->hwaccel_context;
+
+    /* The caller has to provide its VA-API handles through hwaccel_context. */
+    if (!user) {
+        av_log(avctx, AV_LOG_ERROR, "Hardware acceleration context (hwaccel_context) does not exist.\n");
+        return AVERROR(ENOSYS);
+    }
+    /* No parameter buffers exist yet: mark all three ids as invalid. */
+    ctx->bitplane_buf_id  = VA_INVALID_ID;
+    ctx->iq_matrix_buf_id = VA_INVALID_ID;
+    ctx->pic_param_buf_id = VA_INVALID_ID;
+    /* Copy the user's display, config and context handles. */
+    ctx->context_id       = user->context_id;
+    ctx->config_id        = user->config_id;
+    ctx->display          = user->display;
+    return 0;
+}
+
+int ff_vaapi_context_fini(AVCodecContext *avctx)
+{
+    return 0; /* ff_vaapi_context_init only copies handles and resets buffer ids, so nothing is torn down */
+}
+
+int ff_vaapi_render_picture(FFVAContext *vactx, VASurfaceID surface)
 {
     VABufferID va_buffers[3];
     unsigned int n_va_buffers = 0;
 
+    if (vactx->pic_param_buf_id == VA_INVALID_ID)
+        return 0;
+
     vaUnmapBuffer(vactx->display, vactx->pic_param_buf_id);
     va_buffers[n_va_buffers++] = vactx->pic_param_buf_id;
 
-    if (vactx->iq_matrix_buf_id) {
+    if (vactx->iq_matrix_buf_id != VA_INVALID_ID) {
         vaUnmapBuffer(vactx->display, vactx->iq_matrix_buf_id);
         va_buffers[n_va_buffers++] = vactx->iq_matrix_buf_id;
     }
 
-    if (vactx->bitplane_buf_id) {
+    if (vactx->bitplane_buf_id != VA_INVALID_ID) {
         vaUnmapBuffer(vactx->display, vactx->bitplane_buf_id);
         va_buffers[n_va_buffers++] = vactx->bitplane_buf_id;
     }
@@ -79,7 +108,7 @@ int ff_vaapi_render_picture(struct vaapi_context *vactx, VASurfaceID surface)
     return 0;
 }
 
-int ff_vaapi_commit_slices(struct vaapi_context *vactx)
+int ff_vaapi_commit_slices(FFVAContext *vactx)
 {
     VABufferID *slice_buf_ids;
     VABufferID slice_param_buf_id, slice_data_buf_id;
@@ -95,7 +124,7 @@ int ff_vaapi_commit_slices(struct vaapi_context *vactx)
         return -1;
     vactx->slice_buf_ids = slice_buf_ids;
 
-    slice_param_buf_id = 0;
+    slice_param_buf_id = VA_INVALID_ID;
     if (vaCreateBuffer(vactx->display, vactx->context_id,
                        VASliceParameterBufferType,
                        vactx->slice_param_size,
@@ -104,7 +133,7 @@ int ff_vaapi_commit_slices(struct vaapi_context *vactx)
         return -1;
     vactx->slice_count = 0;
 
-    slice_data_buf_id = 0;
+    slice_data_buf_id = VA_INVALID_ID;
     if (vaCreateBuffer(vactx->display, vactx->context_id,
                        VASliceDataBufferType,
                        vactx->slice_data_size,
@@ -119,11 +148,11 @@ int ff_vaapi_commit_slices(struct vaapi_context *vactx)
     return 0;
 }
 
-static void *alloc_buffer(struct vaapi_context *vactx, int type, unsigned int size, uint32_t *buf_id)
+static void *alloc_buffer(FFVAContext *vactx, int type, unsigned int size, uint32_t *buf_id)
 {
     void *data = NULL;
 
-    *buf_id = 0;
+    *buf_id = VA_INVALID_ID;
     if (vaCreateBuffer(vactx->display, vactx->context_id,
                        type, size, 1, NULL, buf_id) == VA_STATUS_SUCCESS)
         vaMapBuffer(vactx->display, *buf_id, &data);
@@ -131,22 +160,22 @@ static void *alloc_buffer(struct vaapi_context *vactx, int type, unsigned int si
     return data;
 }
 
-void *ff_vaapi_alloc_pic_param(struct vaapi_context *vactx, unsigned int size)
+void *ff_vaapi_alloc_pic_param(FFVAContext *vactx, unsigned int size)
 {
     return alloc_buffer(vactx, VAPictureParameterBufferType, size, &vactx->pic_param_buf_id);
 }
 
-void *ff_vaapi_alloc_iq_matrix(struct vaapi_context *vactx, unsigned int size)
+void *ff_vaapi_alloc_iq_matrix(FFVAContext *vactx, unsigned int size)
 {
     return alloc_buffer(vactx, VAIQMatrixBufferType, size, &vactx->iq_matrix_buf_id);
 }
 
-uint8_t *ff_vaapi_alloc_bitplane(struct vaapi_context *vactx, uint32_t size)
+uint8_t *ff_vaapi_alloc_bitplane(FFVAContext *vactx, uint32_t size)
 {
     return alloc_buffer(vactx, VABitPlaneBufferType, size, &vactx->bitplane_buf_id);
 }
 
-VASliceParameterBufferBase *ff_vaapi_alloc_slice(struct vaapi_context *vactx, const uint8_t *buffer, uint32_t size)
+VASliceParameterBufferBase *ff_vaapi_alloc_slice(FFVAContext *vactx, const uint8_t *buffer, uint32_t size)
 {
     uint8_t *slice_params;
     VASliceParameterBufferBase *slice_param;
@@ -179,7 +208,7 @@ VASliceParameterBufferBase *ff_vaapi_alloc_slice(struct vaapi_context *vactx, co
 
 void ff_vaapi_common_end_frame(AVCodecContext *avctx)
 {
-    struct vaapi_context * const vactx = avctx->hwaccel_context;
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
 
     destroy_buffers(vactx->display, &vactx->pic_param_buf_id, 1);
     destroy_buffers(vactx->display, &vactx->iq_matrix_buf_id, 1);
@@ -198,7 +227,7 @@ void ff_vaapi_common_end_frame(AVCodecContext *avctx)
     CONFIG_VC1_VAAPI_HWACCEL   || CONFIG_WMV3_VAAPI_HWACCEL
 int ff_vaapi_mpeg_end_frame(AVCodecContext *avctx)
 {
-    struct vaapi_context * const vactx = avctx->hwaccel_context;
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
     MpegEncContext *s = avctx->priv_data;
     int ret;
 
diff --git a/libavcodec/vaapi.h b/libavcodec/vaapi.h
index 39e8825..7a29f6f 100644
--- a/libavcodec/vaapi.h
+++ b/libavcodec/vaapi.h
@@ -1,23 +1,23 @@
 /*
- * Video Acceleration API (shared data between Libav and the video player)
+ * Video Acceleration API (shared data between FFmpeg and the video player)
  * HW decode acceleration for MPEG-2, MPEG-4, H.264 and VC-1
  *
  * Copyright (C) 2008-2009 Splitted-Desktop Systems
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,6 +31,8 @@
  */
 
 #include <stdint.h>
+#include "libavutil/attributes.h"
+#include "version.h"
 
 /**
  * @defgroup lavc_codec_hwaccel_vaapi VA API Decoding
@@ -39,7 +41,7 @@
  */
 
 /**
- * This structure is used to share data between the Libav library and
+ * This structure is used to share data between the FFmpeg library and
  * the client video application.
  * This shall be zero-allocated and available as
  * AVCodecContext.hwaccel_context. All user members can be set once
@@ -72,12 +74,14 @@ struct vaapi_context {
      */
     uint32_t context_id;
 
+#if FF_API_VAAPI_CONTEXT
     /**
      * VAPictureParameterBuffer ID
      *
      * - encoding: unused
      * - decoding: Set by libavcodec
      */
+    attribute_deprecated
     uint32_t pic_param_buf_id;
 
     /**
@@ -86,6 +90,7 @@ struct vaapi_context {
      * - encoding: unused
      * - decoding: Set by libavcodec
      */
+    attribute_deprecated
     uint32_t iq_matrix_buf_id;
 
     /**
@@ -94,6 +99,7 @@ struct vaapi_context {
      * - encoding: unused
      * - decoding: Set by libavcodec
      */
+    attribute_deprecated
     uint32_t bitplane_buf_id;
 
     /**
@@ -102,6 +108,7 @@ struct vaapi_context {
      * - encoding: unused
      * - decoding: Set by libavcodec
      */
+    attribute_deprecated
     uint32_t *slice_buf_ids;
 
     /**
@@ -110,6 +117,7 @@ struct vaapi_context {
      * - encoding: unused
      * - decoding: Set by libavcodec
      */
+    attribute_deprecated
     unsigned int n_slice_buf_ids;
 
     /**
@@ -118,6 +126,7 @@ struct vaapi_context {
      * - encoding: unused
      * - decoding: Set by libavcodec
      */
+    attribute_deprecated
     unsigned int slice_buf_ids_alloc;
 
     /**
@@ -126,6 +135,7 @@ struct vaapi_context {
      * - encoding: unused
      * - decoding: Set by libavcodec
      */
+    attribute_deprecated
     void *slice_params;
 
     /**
@@ -134,6 +144,7 @@ struct vaapi_context {
      * - encoding: unused
      * - decoding: Set by libavcodec
      */
+    attribute_deprecated
     unsigned int slice_param_size;
 
     /**
@@ -142,6 +153,7 @@ struct vaapi_context {
      * - encoding: unused
      * - decoding: Set by libavcodec
      */
+    attribute_deprecated
     unsigned int slice_params_alloc;
 
     /**
@@ -150,6 +162,7 @@ struct vaapi_context {
      * - encoding: unused
      * - decoding: Set by libavcodec
      */
+    attribute_deprecated
     unsigned int slice_count;
 
     /**
@@ -157,6 +170,7 @@ struct vaapi_context {
      * - encoding: unused
      * - decoding: Set by libavcodec
      */
+    attribute_deprecated
     const uint8_t *slice_data;
 
     /**
@@ -165,7 +179,9 @@ struct vaapi_context {
      * - encoding: unused
      * - decoding: Set by libavcodec
      */
+    attribute_deprecated
     uint32_t slice_data_size;
+#endif
 };
 
 /* @} */
diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
index 075c848..20b1e5e 100644
--- a/libavcodec/vaapi_encode.c
+++ b/libavcodec/vaapi_encode.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vaapi_encode.h b/libavcodec/vaapi_encode.h
index ec552da..b67168b 100644
--- a/libavcodec/vaapi_encode.h
+++ b/libavcodec/vaapi_encode.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vaapi_encode_h264.c b/libavcodec/vaapi_encode_h264.c
index 8690a85..4802929 100644
--- a/libavcodec/vaapi_encode_h264.c
+++ b/libavcodec/vaapi_encode_h264.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -731,6 +731,12 @@ static av_cold int vaapi_encode_h264_init_constant_bitrate(AVCodecContext *avctx
     int hrd_buffer_size;
     int hrd_initial_buffer_fullness;
 
+    if (avctx->bit_rate > INT32_MAX) {
+        av_log(avctx, AV_LOG_ERROR, "Target bitrate of 2^31 bps or "
+               "higher is not supported.\n");
+        return AVERROR(EINVAL);
+    }
+
     if (avctx->rc_buffer_size)
         hrd_buffer_size = avctx->rc_buffer_size;
     else
@@ -769,7 +775,7 @@ static av_cold int vaapi_encode_h264_init_constant_bitrate(AVCodecContext *avctx
     priv->fixed_qp_p   = 26;
     priv->fixed_qp_b   = 26;
 
-    av_log(avctx, AV_LOG_DEBUG, "Using constant-bitrate = %d bps.\n",
+    av_log(avctx, AV_LOG_DEBUG, "Using constant-bitrate = %"PRId64" bps.\n",
            avctx->bit_rate);
     return 0;
 }
diff --git a/libavcodec/vaapi_encode_h265.c b/libavcodec/vaapi_encode_h265.c
index 1b1cce7..c42c08e 100644
--- a/libavcodec/vaapi_encode_h265.c
+++ b/libavcodec/vaapi_encode_h265.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -1158,6 +1158,12 @@ static av_cold int vaapi_encode_h265_init_constant_bitrate(AVCodecContext *avctx
     int hrd_buffer_size;
     int hrd_initial_buffer_fullness;
 
+    if (avctx->bit_rate > INT32_MAX) {
+        av_log(avctx, AV_LOG_ERROR, "Target bitrate of 2^31 bps or "
+               "higher is not supported.\n");
+        return AVERROR(EINVAL);
+    }
+
     if (avctx->rc_buffer_size)
         hrd_buffer_size = avctx->rc_buffer_size;
     else
@@ -1196,7 +1202,7 @@ static av_cold int vaapi_encode_h265_init_constant_bitrate(AVCodecContext *avctx
     priv->fixed_qp_p   = 30;
     priv->fixed_qp_b   = 30;
 
-    av_log(avctx, AV_LOG_DEBUG, "Using constant-bitrate = %d bps.\n",
+    av_log(avctx, AV_LOG_DEBUG, "Using constant-bitrate = %"PRId64" bps.\n",
            avctx->bit_rate);
     return 0;
 }
diff --git a/libavcodec/vaapi_encode_h26x.c b/libavcodec/vaapi_encode_h26x.c
index bf9eb92..d806f9b 100644
--- a/libavcodec/vaapi_encode_h26x.c
+++ b/libavcodec/vaapi_encode_h26x.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vaapi_encode_h26x.h b/libavcodec/vaapi_encode_h26x.h
index d6db69a..f8c6e13 100644
--- a/libavcodec/vaapi_encode_h26x.h
+++ b/libavcodec/vaapi_encode_h26x.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vaapi_encode_mjpeg.c b/libavcodec/vaapi_encode_mjpeg.c
index e3bf191..316b359 100644
--- a/libavcodec/vaapi_encode_mjpeg.c
+++ b/libavcodec/vaapi_encode_mjpeg.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vaapi_h264.c b/libavcodec/vaapi_h264.c
index 0925985..92efb26 100644
--- a/libavcodec/vaapi_h264.c
+++ b/libavcodec/vaapi_h264.c
@@ -3,20 +3,20 @@
  *
  * Copyright (C) 2008-2009 Splitted-Desktop Systems
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,7 +26,7 @@
 
 /**
  * @file
- * This file implements the glue code between Libav's and VA API's
+ * This file implements the glue code between FFmpeg's and VA API's
  * structures for H.264 decoding.
  */
 
@@ -44,10 +44,10 @@ static void init_vaapi_pic(VAPictureH264 *va_pic)
 }
 
 /**
- * Translate an Libav Picture into its VA API form.
+ * Translate an FFmpeg Picture into its VA API form.
  *
  * @param[out] va_pic          A pointer to VA API's own picture struct
- * @param[in]  pic             A pointer to the Libav picture struct to convert
+ * @param[in]  pic             A pointer to the FFmpeg picture struct to convert
  * @param[in]  pic_structure   The picture field type (as defined in mpegvideo.h),
  *                             supersedes pic's field type if nonzero.
  */
@@ -148,11 +148,11 @@ static int fill_vaapi_ReferenceFrames(VAPictureParameterBufferH264 *pic_param,
 }
 
 /**
- * Fill in VA API reference picture lists from the Libav reference
+ * Fill in VA API reference picture lists from the FFmpeg reference
  * picture list.
  *
  * @param[out] RefPicList  VA API internal reference picture list
- * @param[in]  ref_list    A pointer to the Libav reference list
+ * @param[in]  ref_list    A pointer to the FFmpeg reference list
  * @param[in]  ref_count   The number of reference pictures in ref_list
  */
 static void fill_vaapi_RefPicList(VAPictureH264 RefPicList[32],
@@ -162,7 +162,8 @@ static void fill_vaapi_RefPicList(VAPictureH264 RefPicList[32],
     unsigned int i, n = 0;
     for (i = 0; i < ref_count; i++)
         if (ref_list[i].reference)
-            fill_vaapi_pic(&RefPicList[n++], ref_list[i].parent, 0);
+            fill_vaapi_pic(&RefPicList[n++], ref_list[i].parent,
+                           ref_list[i].reference);
 
     for (; n < 32; n++)
         init_vaapi_pic(&RefPicList[n]);
@@ -226,7 +227,7 @@ static int vaapi_h264_start_frame(AVCodecContext          *avctx,
                                   av_unused uint32_t       size)
 {
     H264Context * const h = avctx->priv_data;
-    struct vaapi_context * const vactx = avctx->hwaccel_context;
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
     const PPS *pps = h->ps.pps;
     const SPS *sps = h->ps.sps;
     VAPictureParameterBufferH264 *pic_param;
@@ -260,7 +261,7 @@ static int vaapi_h264_start_frame(AVCodecContext          *avctx,
     pic_param->seq_fields.bits.delta_pic_order_always_zero_flag = sps->delta_pic_order_always_zero_flag;
     pic_param->num_slice_groups_minus1                          = pps->slice_group_count - 1;
     pic_param->slice_group_map_type                             = pps->mb_slice_group_map_type;
-    pic_param->slice_group_change_rate_minus1                   = 0; /* XXX: unimplemented in Libav */
+    pic_param->slice_group_change_rate_minus1                   = 0; /* XXX: unimplemented in FFmpeg */
     pic_param->pic_init_qp_minus26                              = pps->init_qp - 26;
     pic_param->pic_init_qs_minus26                              = pps->init_qs - 26;
     pic_param->chroma_qp_index_offset                           = pps->chroma_qp_index_offset[0];
@@ -291,7 +292,7 @@ static int vaapi_h264_start_frame(AVCodecContext          *avctx,
 /** End a hardware decoding based frame. */
 static int vaapi_h264_end_frame(AVCodecContext *avctx)
 {
-    struct vaapi_context * const vactx = avctx->hwaccel_context;
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
     H264Context * const h = avctx->priv_data;
     H264SliceContext *sl = &h->slice_ctx[0];
     int ret;
@@ -316,12 +317,13 @@ static int vaapi_h264_decode_slice(AVCodecContext *avctx,
                                    const uint8_t  *buffer,
                                    uint32_t        size)
 {
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
     H264Context * const h = avctx->priv_data;
     H264SliceContext *sl  = &h->slice_ctx[0];
     VASliceParameterBufferH264 *slice_param;
 
     /* Fill in VASliceParameterBufferH264. */
-    slice_param = (VASliceParameterBufferH264 *)ff_vaapi_alloc_slice(avctx->hwaccel_context, buffer, size);
+    slice_param = (VASliceParameterBufferH264 *)ff_vaapi_alloc_slice(vactx, buffer, size);
     if (!slice_param)
         return -1;
     slice_param->slice_data_bit_offset          = get_bits_count(&sl->gb);
@@ -358,4 +360,7 @@ AVHWAccel ff_h264_vaapi_hwaccel = {
     .start_frame    = vaapi_h264_start_frame,
     .end_frame      = vaapi_h264_end_frame,
     .decode_slice   = vaapi_h264_decode_slice,
+    .init           = ff_vaapi_context_init,
+    .uninit         = ff_vaapi_context_fini,
+    .priv_data_size = sizeof(FFVAContext),
 };
diff --git a/libavcodec/vaapi_hevc.c b/libavcodec/vaapi_hevc.c
new file mode 100644
index 0000000..62f783e
--- /dev/null
+++ b/libavcodec/vaapi_hevc.c
@@ -0,0 +1,490 @@
+/*
+ * HEVC HW decode acceleration through VA API
+ *
+ * Copyright (C) 2015 Timo Rothenpieler <timo@rothenpieler.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "vaapi_internal.h"
+#include "hevc.h"
+#include "mpegutils.h"
+
+/**
+ * @file
+ * This file implements the glue code between FFmpeg's and VA API's
+ * structures for HEVC decoding.
+ */
+
+typedef struct vaapi_hevc_frame_data {
+    VAPictureParameterBufferHEVC *pic_param; /* read by get_ref_pic_index() via h->ref->hwaccel_picture_private */
+    VASliceParameterBufferHEVC *last_slice_param; /* NOTE(review): presumably the most recently filled slice buffer -- confirm */
+} vaapi_hevc_frame_data;
+
+/**
+ * Reset a VA API picture entry to its "unused" state.
+ *
+ * VA API requires a fixed-size reference picture array; this marks one slot as holding no picture.
+ */
+static void init_vaapi_pic(VAPictureHEVC *va_pic)
+{
+    va_pic->pic_order_cnt = 0;
+    va_pic->flags         = VA_PICTURE_HEVC_INVALID;
+    va_pic->picture_id    = VA_INVALID_ID;
+}
+
+/* Fill va_pic from pic: surface id, POC, and flags (rps_type plus long-term/field bits). */
+static void fill_vaapi_pic(VAPictureHEVC *va_pic, const HEVCFrame *pic, int rps_type)
+{
+    uint32_t flags = rps_type;
+
+    if (pic->flags & HEVC_FRAME_FLAG_LONG_REF)
+        flags |= VA_PICTURE_HEVC_LONG_TERM_REFERENCE;
+    if (pic->frame->interlaced_frame)
+        flags |= VA_PICTURE_HEVC_FIELD_PIC;
+    /* The bottom-field bit is only set for interlaced frames that are not top-field-first. */
+    if (pic->frame->interlaced_frame && !pic->frame->top_field_first)
+        flags |= VA_PICTURE_HEVC_BOTTOM_FIELD;
+
+    va_pic->picture_id    = ff_vaapi_get_surface_id(pic->frame);
+    va_pic->pic_order_cnt = pic->poc;
+    va_pic->flags         = flags;
+}
+
+/* Return the VA_PICTURE_HEVC_RPS_* flag of the first current RPS list holding pic's surface, else 0. */
+static int find_frame_rps_type(const HEVCContext *h, const HEVCFrame *pic)
+{
+    /* Lists are searched in this order; the first match wins. */
+    static const struct {
+        int list;
+        int va_flag;
+    } rps_map[] = {
+        { ST_CURR_BEF, VA_PICTURE_HEVC_RPS_ST_CURR_BEFORE },
+        { ST_CURR_AFT, VA_PICTURE_HEVC_RPS_ST_CURR_AFTER  },
+        { LT_CURR,     VA_PICTURE_HEVC_RPS_LT_CURR        },
+    };
+    const VASurfaceID wanted = ff_vaapi_get_surface_id(pic->frame);
+    int k, n;
+
+    for (k = 0; k < FF_ARRAY_ELEMS(rps_map); k++)
+        for (n = 0; n < h->rps[rps_map[k].list].nb_refs; n++)
+            if (ff_vaapi_get_surface_id(h->rps[rps_map[k].list].ref[n]->frame) == wanted)
+                return rps_map[k].va_flag;
+
+    return 0;
+}
+
+/* Fill pp->ReferenceFrames with the DPB's reference frames (current picture excluded); unused slots are invalidated. */
+static void fill_vaapi_ReferenceFrames(const HEVCContext *h, VAPictureParameterBufferHEVC *pp)
+{
+    const int ref_mask = HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF;
+    int slot, dpb_idx = 0;
+
+    for (slot = 0; slot < FF_ARRAY_ELEMS(pp->ReferenceFrames); slot++) {
+        VAPictureHEVC *dst = &pp->ReferenceFrames[slot];
+        const HEVCFrame *cand = NULL;
+
+        /* Resume the DPB scan where the previous slot stopped. */
+        for (; !cand && dpb_idx < FF_ARRAY_ELEMS(h->DPB); dpb_idx++)
+            if (&h->DPB[dpb_idx] != h->ref && (h->DPB[dpb_idx].flags & ref_mask))
+                cand = &h->DPB[dpb_idx];
+
+        /* Always reset first, so slots without a candidate stay invalid. */
+        init_vaapi_pic(dst);
+
+        if (cand)
+            fill_vaapi_pic(dst, cand, find_frame_rps_type(h, cand));
+    }
+}
+
+/* Return the index in the current picture's ReferenceFrames[] matching frame's surface and POC, or 0xff if none. */
+static uint8_t get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame)
+{
+    vaapi_hevc_frame_data *priv = h->ref->hwaccel_picture_private;
+    VAPictureParameterBufferHEVC *pic_param = priv->pic_param;
+    uint8_t idx = 0;
+
+    if (!frame)
+        return 0xff;
+    for (; idx < FF_ARRAY_ELEMS(pic_param->ReferenceFrames); ++idx) {
+        const VAPictureHEVC *ref = &pic_param->ReferenceFrames[idx];
+        if (ref->picture_id == VA_INVALID_ID || ref->pic_order_cnt != frame->poc)
+            continue; /* empty slot or POC mismatch */
+        if (ref->picture_id == ff_vaapi_get_surface_id(frame->frame))
+            return idx;
+    }
+    return 0xff;
+}
+
+static void fill_picture_parameters(const HEVCContext *h, VAPictureParameterBufferHEVC *pp)
+{
+    int i;
+
+    pp->pic_fields.value = 0;
+    pp->slice_parsing_fields.value = 0;
+
+    fill_vaapi_pic(&pp->CurrPic, h->ref, 0);
+    fill_vaapi_ReferenceFrames(h, pp);
+
+    pp->pic_width_in_luma_samples  = h->ps.sps->width;
+    pp->pic_height_in_luma_samples = h->ps.sps->height;
+
+    pp->log2_min_luma_coding_block_size_minus3 = h->ps.sps->log2_min_cb_size - 3;
+
+    pp->pic_fields.bits.chroma_format_idc = h->ps.sps->chroma_format_idc;
+
+    pp->sps_max_dec_pic_buffering_minus1 = h->ps.sps->temporal_layer[h->ps.sps->max_sub_layers - 1].max_dec_pic_buffering - 1;
+    pp->log2_diff_max_min_luma_coding_block_size = h->ps.sps->log2_diff_max_min_coding_block_size;
+    pp->log2_min_transform_block_size_minus2 = h->ps.sps->log2_min_tb_size - 2;
+    pp->log2_diff_max_min_transform_block_size = h->ps.sps->log2_max_trafo_size  - h->ps.sps->log2_min_tb_size;
+    pp->max_transform_hierarchy_depth_inter = h->ps.sps->max_transform_hierarchy_depth_inter;
+    pp->max_transform_hierarchy_depth_intra = h->ps.sps->max_transform_hierarchy_depth_intra;
+    pp->num_short_term_ref_pic_sets = h->ps.sps->nb_st_rps;
+    pp->num_long_term_ref_pic_sps = h->ps.sps->num_long_term_ref_pics_sps;
+
+    pp->num_ref_idx_l0_default_active_minus1 = h->ps.pps->num_ref_idx_l0_default_active - 1;
+    pp->num_ref_idx_l1_default_active_minus1 = h->ps.pps->num_ref_idx_l1_default_active - 1;
+    pp->init_qp_minus26 = h->ps.pps->pic_init_qp_minus26;
+
+    pp->pps_cb_qp_offset = h->ps.pps->cb_qp_offset;
+    pp->pps_cr_qp_offset = h->ps.pps->cr_qp_offset;
+
+    pp->pic_fields.bits.tiles_enabled_flag = h->ps.pps->tiles_enabled_flag;
+    pp->pic_fields.bits.separate_colour_plane_flag = h->ps.sps->separate_colour_plane_flag;
+    pp->pic_fields.bits.pcm_enabled_flag = h->ps.sps->pcm_enabled_flag;
+    pp->pic_fields.bits.scaling_list_enabled_flag = h->ps.sps->scaling_list_enable_flag;
+    pp->pic_fields.bits.transform_skip_enabled_flag = h->ps.pps->transform_skip_enabled_flag;
+    pp->pic_fields.bits.amp_enabled_flag = h->ps.sps->amp_enabled_flag;
+    pp->pic_fields.bits.strong_intra_smoothing_enabled_flag = h->ps.sps->sps_strong_intra_smoothing_enable_flag;
+    pp->pic_fields.bits.sign_data_hiding_enabled_flag = h->ps.pps->sign_data_hiding_flag;
+    pp->pic_fields.bits.constrained_intra_pred_flag = h->ps.pps->constrained_intra_pred_flag;
+    pp->pic_fields.bits.cu_qp_delta_enabled_flag = h->ps.pps->cu_qp_delta_enabled_flag;
+    pp->pic_fields.bits.weighted_pred_flag = h->ps.pps->weighted_pred_flag;
+    pp->pic_fields.bits.weighted_bipred_flag = h->ps.pps->weighted_bipred_flag;
+    pp->pic_fields.bits.transquant_bypass_enabled_flag = h->ps.pps->transquant_bypass_enable_flag;
+    pp->pic_fields.bits.entropy_coding_sync_enabled_flag = h->ps.pps->entropy_coding_sync_enabled_flag;
+    pp->pic_fields.bits.pps_loop_filter_across_slices_enabled_flag = h->ps.pps->seq_loop_filter_across_slices_enabled_flag;
+    pp->pic_fields.bits.loop_filter_across_tiles_enabled_flag = h->ps.pps->loop_filter_across_tiles_enabled_flag;
+
+    pp->pic_fields.bits.pcm_loop_filter_disabled_flag = h->ps.sps->pcm.loop_filter_disable_flag;
+    pp->pcm_sample_bit_depth_luma_minus1 = h->ps.sps->pcm.bit_depth - 1;
+    pp->pcm_sample_bit_depth_chroma_minus1 = h->ps.sps->pcm.bit_depth_chroma - 1;
+    pp->log2_min_pcm_luma_coding_block_size_minus3 = h->ps.sps->pcm.log2_min_pcm_cb_size - 3;
+    pp->log2_diff_max_min_pcm_luma_coding_block_size = h->ps.sps->pcm.log2_max_pcm_cb_size - h->ps.sps->pcm.log2_min_pcm_cb_size;
+
+    memset(pp->column_width_minus1, 0, sizeof(pp->column_width_minus1));
+    memset(pp->row_height_minus1, 0, sizeof(pp->row_height_minus1));
+
+    if (h->ps.pps->tiles_enabled_flag) {
+        pp->num_tile_columns_minus1 = h->ps.pps->num_tile_columns - 1;
+        pp->num_tile_rows_minus1 = h->ps.pps->num_tile_rows - 1;
+
+        for (i = 0; i < h->ps.pps->num_tile_columns; i++)
+            pp->column_width_minus1[i] = h->ps.pps->column_width[i] - 1;
+
+        for (i = 0; i < h->ps.pps->num_tile_rows; i++)
+            pp->row_height_minus1[i] = h->ps.pps->row_height[i] - 1;
+    }
+
+    pp->diff_cu_qp_delta_depth = h->ps.pps->diff_cu_qp_delta_depth;
+    pp->pps_beta_offset_div2 = h->ps.pps->beta_offset / 2;
+    pp->pps_tc_offset_div2 = h->ps.pps->tc_offset / 2;
+    pp->log2_parallel_merge_level_minus2 = h->ps.pps->log2_parallel_merge_level - 2;
+
+    /* Different chroma/luma bit depths are currently not supported by ffmpeg. */
+    pp->bit_depth_luma_minus8 = h->ps.sps->bit_depth - 8;
+    pp->bit_depth_chroma_minus8 = h->ps.sps->bit_depth - 8;
+
+    pp->slice_parsing_fields.bits.lists_modification_present_flag = h->ps.pps->lists_modification_present_flag;
+    pp->slice_parsing_fields.bits.long_term_ref_pics_present_flag = h->ps.sps->long_term_ref_pics_present_flag;
+    pp->slice_parsing_fields.bits.sps_temporal_mvp_enabled_flag = h->ps.sps->sps_temporal_mvp_enabled_flag;
+    pp->slice_parsing_fields.bits.cabac_init_present_flag = h->ps.pps->cabac_init_present_flag;
+    pp->slice_parsing_fields.bits.output_flag_present_flag = h->ps.pps->output_flag_present_flag;
+    pp->slice_parsing_fields.bits.dependent_slice_segments_enabled_flag = h->ps.pps->dependent_slice_segments_enabled_flag;
+    pp->slice_parsing_fields.bits.pps_slice_chroma_qp_offsets_present_flag = h->ps.pps->pic_slice_level_chroma_qp_offsets_present_flag;
+    pp->slice_parsing_fields.bits.sample_adaptive_offset_enabled_flag = h->ps.sps->sao_enabled;
+    pp->slice_parsing_fields.bits.deblocking_filter_override_enabled_flag = h->ps.pps->deblocking_filter_override_enabled_flag;
+    pp->slice_parsing_fields.bits.pps_disable_deblocking_filter_flag = h->ps.pps->disable_dbf;
+    pp->slice_parsing_fields.bits.slice_segment_header_extension_present_flag = h->ps.pps->slice_header_extension_present_flag;
+
+    pp->log2_max_pic_order_cnt_lsb_minus4 = h->ps.sps->log2_max_poc_lsb - 4;
+    pp->num_extra_slice_header_bits = h->ps.pps->num_extra_slice_header_bits;
+
+    if (h->nal_unit_type >= NAL_BLA_W_LP && h->nal_unit_type <= NAL_CRA_NUT) {
+        pp->slice_parsing_fields.bits.RapPicFlag = 1;
+    } else {
+        pp->slice_parsing_fields.bits.RapPicFlag = 0;
+    }
+
+    if (IS_IDR(h)) {
+        pp->slice_parsing_fields.bits.IdrPicFlag = 1;
+    } else {
+        pp->slice_parsing_fields.bits.IdrPicFlag = 0;
+    }
+
+    if (IS_IRAP(h)) {
+        pp->slice_parsing_fields.bits.IntraPicFlag = 1;
+    } else {
+        pp->slice_parsing_fields.bits.IntraPicFlag = 0;
+    }
+
+    if (h->sh.short_term_ref_pic_set_sps_flag == 0 && h->sh.short_term_rps) {
+        pp->st_rps_bits = h->sh.short_term_ref_pic_set_size;
+    } else {
+        pp->st_rps_bits = 0;
+    }
+
+    /* TODO */
+    pp->pic_fields.bits.NoPicReorderingFlag = 0;
+    pp->pic_fields.bits.NoBiPredFlag = 0;
+}
+
+
+/** Initialize and start decoding a frame with VA API: allocates and fills the picture parameter buffer and, when a scaling list is signalled, the IQ matrix buffer. Returns 0 on success, -1 if a buffer allocation fails. */
+static int vaapi_hevc_start_frame(AVCodecContext          *avctx,
+                                  av_unused const uint8_t *buffer,
+                                  av_unused uint32_t       size)
+{
+    HEVCContext * const h = avctx->priv_data;
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
+    vaapi_hevc_frame_data *frame_data = h->ref->hwaccel_picture_private;
+    VAPictureParameterBufferHEVC *pic_param;
+    VAIQMatrixBufferHEVC *iq_matrix;
+    ScalingList const * scaling_list;
+    int i, j, pos;
+
+    ff_dlog(avctx, "vaapi_hevc_start_frame()\n");
+
+    vactx->slice_param_size = sizeof(VASliceParameterBufferHEVC);
+
+    /* Fill in VAPictureParameterBufferHEVC and remember it in the per-frame private data. */
+    pic_param = ff_vaapi_alloc_pic_param(vactx, sizeof(VAPictureParameterBufferHEVC));
+    if (!pic_param)
+        return -1;
+    fill_picture_parameters(h, pic_param);
+    frame_data->pic_param = pic_param;
+
+    /* Fill in VAIQMatrixBufferHEVC. The PPS scaling list takes precedence over the SPS one. */
+    if (h->ps.pps->scaling_list_data_present_flag) {
+        scaling_list = &h->ps.pps->scaling_list;
+    } else if (h->ps.sps->scaling_list_enable_flag) {
+        scaling_list = &h->ps.sps->scaling_list;
+    } else {
+        return 0; /* neither flag is set: no IQ matrix buffer is allocated */
+    }
+
+    iq_matrix = ff_vaapi_alloc_iq_matrix(vactx, sizeof(VAIQMatrixBufferHEVC));
+    if (!iq_matrix)
+        return -1;
+
+    for (i = 0; i < 6; ++i) {
+        for (j = 0; j < 16; ++j) {
+            pos = 4 * ff_hevc_diag_scan4x4_y[j] + ff_hevc_diag_scan4x4_x[j]; /* source index 4*y + x taken from the 4x4 diag-scan tables */
+            iq_matrix->ScalingList4x4[i][j] = scaling_list->sl[0][i][pos];
+        }
+        for (j = 0; j < 64; ++j) {
+            pos = 8 * ff_hevc_diag_scan8x8_y[j] + ff_hevc_diag_scan8x8_x[j]; /* source index 8*y + x taken from the 8x8 diag-scan tables */
+            iq_matrix->ScalingList8x8[i][j] = scaling_list->sl[1][i][pos];
+            iq_matrix->ScalingList16x16[i][j] = scaling_list->sl[2][i][pos];
+            if (i < 2) {
+                iq_matrix->ScalingList32x32[i][j] = scaling_list->sl[3][i * 3][pos]; /* the two 32x32 lists are read from entries 0 and 3 of sl[3] */
+            }
+        }
+        iq_matrix->ScalingListDC16x16[i] = scaling_list->sl_dc[0][i];
+        if (i < 2) {
+            iq_matrix->ScalingListDC32x32[i] = scaling_list->sl_dc[1][i * 3]; /* same 0/3 entry selection as the 32x32 lists */
+        }
+    }
+
+    return 0;
+}
+
+/** End a hardware decoding based frame: flag the last slice, commit the slice buffers and render the picture. Returns 0 on success, a negative value on failure. */
+static int vaapi_hevc_end_frame(AVCodecContext *avctx)
+{
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
+    HEVCContext * const h = avctx->priv_data;
+    vaapi_hevc_frame_data *frame_data = h->ref->hwaccel_picture_private;
+    int ret = -1;
+
+    ff_dlog(avctx, "vaapi_hevc_end_frame()\n");
+
+    /* last_slice_param is only assigned by a successful decode_slice call; with none there is nothing to submit (assumes the per-frame private data starts zeroed - TODO confirm). */
+    if (!frame_data->last_slice_param)
+        goto finish;
+    frame_data->last_slice_param->LongSliceFlags.fields.LastSliceOfPic = 1;
+    ret = ff_vaapi_commit_slices(vactx);
+    if (ret < 0)
+        goto finish;
+
+    ret = ff_vaapi_render_picture(vactx, ff_vaapi_get_surface_id(h->ref->frame));
+
+finish:
+    ff_vaapi_common_end_frame(avctx); /* runs on success and on every error path */
+    return ret;
+}
+
+static int fill_pred_weight_table(HEVCContext * const h,
+                                  VASliceParameterBufferHEVC *slice_param,
+                                  SliceHeader * const sh)
+{ /* Copies the weighted-prediction tables from sh into slice_param; always returns 0. */
+    int i;
+
+    memset(slice_param->delta_luma_weight_l0, 0, sizeof(slice_param->delta_luma_weight_l0));
+    memset(slice_param->delta_luma_weight_l1, 0, sizeof(slice_param->delta_luma_weight_l1));
+    memset(slice_param->luma_offset_l0, 0, sizeof(slice_param->luma_offset_l0));
+    memset(slice_param->luma_offset_l1, 0, sizeof(slice_param->luma_offset_l1));
+    memset(slice_param->delta_chroma_weight_l0, 0, sizeof(slice_param->delta_chroma_weight_l0));
+    memset(slice_param->delta_chroma_weight_l1, 0, sizeof(slice_param->delta_chroma_weight_l1));
+    memset(slice_param->ChromaOffsetL0, 0, sizeof(slice_param->ChromaOffsetL0));
+    memset(slice_param->ChromaOffsetL1, 0, sizeof(slice_param->ChromaOffsetL1));
+
+    slice_param->delta_chroma_log2_weight_denom = 0;
+    slice_param->luma_log2_weight_denom = 0;
+
+    if (  sh->slice_type == I_SLICE
+      || (sh->slice_type == P_SLICE && !h->ps.pps->weighted_pred_flag)
+      || (sh->slice_type == B_SLICE && !h->ps.pps->weighted_bipred_flag)) {
+        return 0; /* weighted prediction not in use for this slice: all tables stay zeroed */
+    }
+
+    slice_param->luma_log2_weight_denom = sh->luma_log2_weight_denom;
+
+    if (h->ps.sps->chroma_format_idc) {
+        slice_param->delta_chroma_log2_weight_denom = sh->chroma_log2_weight_denom - sh->luma_log2_weight_denom; /* chroma denominator is passed as a delta against the luma one */
+    }
+
+    for (i = 0; i < 15 && i < sh->nb_refs[L0]; ++i) { /* at most 15 entries are written per list */
+        slice_param->delta_luma_weight_l0[i] = sh->luma_weight_l0[i] - (1 << sh->luma_log2_weight_denom); /* weights are passed as deltas from 1 << denom */
+        slice_param->luma_offset_l0[i] = sh->luma_offset_l0[i];
+        slice_param->delta_chroma_weight_l0[i][0] = sh->chroma_weight_l0[i][0] - (1 << sh->chroma_log2_weight_denom);
+        slice_param->delta_chroma_weight_l0[i][1] = sh->chroma_weight_l0[i][1] - (1 << sh->chroma_log2_weight_denom);
+        slice_param->ChromaOffsetL0[i][0] = sh->chroma_offset_l0[i][0];
+        slice_param->ChromaOffsetL0[i][1] = sh->chroma_offset_l0[i][1];
+    }
+
+    if (sh->slice_type == B_SLICE) { /* the L1 tables are only filled for B slices */
+        for (i = 0; i < 15 && i < sh->nb_refs[L1]; ++i) {
+            slice_param->delta_luma_weight_l1[i] = sh->luma_weight_l1[i] - (1 << sh->luma_log2_weight_denom);
+            slice_param->luma_offset_l1[i] = sh->luma_offset_l1[i];
+            slice_param->delta_chroma_weight_l1[i][0] = sh->chroma_weight_l1[i][0] - (1 << sh->chroma_log2_weight_denom);
+            slice_param->delta_chroma_weight_l1[i][1] = sh->chroma_weight_l1[i][1] - (1 << sh->chroma_log2_weight_denom);
+            slice_param->ChromaOffsetL1[i][0] = sh->chroma_offset_l1[i][0];
+            slice_param->ChromaOffsetL1[i][1] = sh->chroma_offset_l1[i][1];
+        }
+    }
+
+    return 0;
+}
+
+/** Decode the given HEVC slice with VA API: allocates a slice parameter buffer for buffer/size and fills it from the current slice header. Returns -1 if the allocation fails, otherwise the result of fill_pred_weight_table(). */
+static int vaapi_hevc_decode_slice(AVCodecContext *avctx,
+                                   const uint8_t  *buffer,
+                                   uint32_t        size)
+{
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
+    HEVCContext * const h = avctx->priv_data;
+    vaapi_hevc_frame_data *frame_data = h->ref->hwaccel_picture_private;
+    SliceHeader * const sh = &h->sh;
+    VASliceParameterBufferHEVC *slice_param;
+    int i, list_idx;
+    uint8_t nb_list = sh->slice_type == B_SLICE ? 2 : 1;
+
+    if (sh->slice_type == I_SLICE)
+        nb_list = 0;
+
+    ff_dlog(avctx, "vaapi_hevc_decode_slice(): buffer %p, size %d\n", buffer, size);
+
+    /* Fill in VASliceParameterBufferHEVC. */
+    slice_param = (VASliceParameterBufferHEVC *)ff_vaapi_alloc_slice(vactx, buffer, size);
+    if (!slice_param)
+        return -1;
+
+    frame_data->last_slice_param = slice_param;
+
+    /* The base structure changed, so this has to be re-set in order to be valid on every byte order. */
+    slice_param->slice_data_flag = VA_SLICE_DATA_FLAG_ALL;
+
+    /* Add 1 to the bits count here to account for the byte_alignment bit, which always is at least one bit and not accounted for otherwise. */
+    slice_param->slice_data_byte_offset = (get_bits_count(&h->HEVClc->gb) + 1 + 7) / 8;
+
+    slice_param->slice_segment_address = sh->slice_segment_addr;
+
+    slice_param->LongSliceFlags.value = 0;
+    slice_param->LongSliceFlags.fields.dependent_slice_segment_flag = sh->dependent_slice_segment_flag;
+    slice_param->LongSliceFlags.fields.slice_type = sh->slice_type;
+    slice_param->LongSliceFlags.fields.color_plane_id = sh->colour_plane_id;
+    slice_param->LongSliceFlags.fields.mvd_l1_zero_flag = sh->mvd_l1_zero_flag;
+    slice_param->LongSliceFlags.fields.cabac_init_flag = sh->cabac_init_flag;
+    slice_param->LongSliceFlags.fields.slice_temporal_mvp_enabled_flag = sh->slice_temporal_mvp_enabled_flag;
+    slice_param->LongSliceFlags.fields.slice_deblocking_filter_disabled_flag = sh->disable_deblocking_filter_flag;
+    slice_param->LongSliceFlags.fields.collocated_from_l0_flag = sh->collocated_list == L0 ? 1 : 0;
+    slice_param->LongSliceFlags.fields.slice_loop_filter_across_slices_enabled_flag = sh->slice_loop_filter_across_slices_enabled_flag;
+
+    slice_param->LongSliceFlags.fields.slice_sao_luma_flag = sh->slice_sample_adaptive_offset_flag[0];
+    if (h->ps.sps->chroma_format_idc) {
+        slice_param->LongSliceFlags.fields.slice_sao_chroma_flag = sh->slice_sample_adaptive_offset_flag[1];
+    }
+
+    if (sh->slice_temporal_mvp_enabled_flag) {
+        slice_param->collocated_ref_idx = sh->collocated_ref_idx;
+    } else {
+        slice_param->collocated_ref_idx = 0xFF; /* 0xFF is written whenever temporal MVP is disabled for the slice */
+    }
+
+    slice_param->slice_qp_delta = sh->slice_qp_delta;
+    slice_param->slice_cb_qp_offset = sh->slice_cb_qp_offset;
+    slice_param->slice_cr_qp_offset = sh->slice_cr_qp_offset;
+    slice_param->slice_beta_offset_div2 = sh->beta_offset / 2;
+    slice_param->slice_tc_offset_div2 = sh->tc_offset / 2;
+
+    if (sh->slice_type == I_SLICE) {
+        slice_param->five_minus_max_num_merge_cand = 0;
+    } else {
+        slice_param->five_minus_max_num_merge_cand = 5 - sh->max_num_merge_cand;
+    }
+
+    slice_param->num_ref_idx_l0_active_minus1 = sh->nb_refs[L0] ? sh->nb_refs[L0] - 1 : 0;
+    slice_param->num_ref_idx_l1_active_minus1 = sh->nb_refs[L1] ? sh->nb_refs[L1] - 1 : 0;
+
+    memset(slice_param->RefPicList, 0xFF, sizeof(slice_param->RefPicList)); /* entries not filled below stay 0xFF */
+
+    /* h->ref->refPicList is updated before calling each slice. The copy is clamped to the fixed row size of the VA RefPicList so an over-large nb_refs cannot write past it. */
+    for (list_idx = 0; list_idx < nb_list; ++list_idx) {
+        RefPicList *rpl = &h->ref->refPicList[list_idx];
+
+        for (i = 0; i < rpl->nb_refs && i < FF_ARRAY_ELEMS(slice_param->RefPicList[list_idx]); ++i) {
+            slice_param->RefPicList[list_idx][i] = get_ref_pic_index(h, rpl->ref[i]);
+        }
+    }
+
+    return fill_pred_weight_table(h, slice_param, sh);
+}
+
+AVHWAccel ff_hevc_vaapi_hwaccel = { /* HEVC hwaccel descriptor tying the VA API callbacks together */
+    .name                 = "hevc_vaapi",
+    .type                 = AVMEDIA_TYPE_VIDEO,
+    .id                   = AV_CODEC_ID_HEVC,
+    .pix_fmt              = AV_PIX_FMT_VAAPI,
+    .start_frame          = vaapi_hevc_start_frame,
+    .end_frame            = vaapi_hevc_end_frame,
+    .decode_slice         = vaapi_hevc_decode_slice,
+    .init                 = ff_vaapi_context_init, /* common init/uninit helpers declared in vaapi_internal.h */
+    .uninit               = ff_vaapi_context_fini,
+    .priv_data_size       = sizeof(FFVAContext),
+    .frame_priv_data_size = sizeof(vaapi_hevc_frame_data), /* type the callbacks read through h->ref->hwaccel_picture_private */
+};
diff --git a/libavcodec/vaapi_internal.h b/libavcodec/vaapi_internal.h
index 5e2a6ca..306ae13 100644
--- a/libavcodec/vaapi_internal.h
+++ b/libavcodec/vaapi_internal.h
@@ -4,20 +4,20 @@
  *
  * Copyright (C) 2008-2009 Splitted-Desktop Systems
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +27,7 @@
 #include <va/va.h>
 #include "vaapi.h"
 #include "avcodec.h"
+#include "internal.h"
 
 /**
  * @addtogroup VAAPI_Decoding
@@ -34,23 +35,53 @@
  * @{
  */
 
+typedef struct { /* Internal VA API decoding state; the hwaccels size their private data with sizeof(FFVAContext) */
+    VADisplay display;                  ///< Windowing system dependent handle
+    VAConfigID config_id;               ///< Configuration ID
+    VAContextID context_id;             ///< Context ID (video decode pipeline)
+    VABufferID pic_param_buf_id;        ///< Picture parameter buffer
+    VABufferID iq_matrix_buf_id;        ///< Inverse quantiser matrix buffer
+    VABufferID bitplane_buf_id;         ///< Bitplane buffer (for VC-1 decoding)
+    VABufferID *slice_buf_ids;          ///< Slice parameter/data buffers
+    unsigned int n_slice_buf_ids;       ///< Number of effective slice buffers
+    unsigned int slice_buf_ids_alloc;   ///< Number of allocated slice buffers
+    void *slice_params;                 ///< Pointer to slice parameter buffers
+    unsigned int slice_param_size;      ///< Size of a slice parameter element (set by each codec's start_frame)
+    unsigned int slice_params_alloc;    ///< Number of allocated slice parameters
+    unsigned int slice_count;           ///< Number of slices currently filled in
+    const uint8_t *slice_data;          ///< Pointer to slice data buffer base
+    unsigned int slice_data_size;       ///< Current size of slice data
+} FFVAContext;
+
+/** Extract the FFVAContext stored as hwaccel private data (avctx->internal->hwaccel_priv_data) of an AVCodecContext */
+static inline FFVAContext *ff_vaapi_get_context(AVCodecContext *avctx)
+{
+    return avctx->internal->hwaccel_priv_data;
+}
+
 /** Extract VASurfaceID from an AVFrame */
 static inline VASurfaceID ff_vaapi_get_surface_id(AVFrame *pic)
 {
     return (uintptr_t)pic->data[3];
 }
 
+/** Common AVHWAccel.init() implementation */
+int ff_vaapi_context_init(AVCodecContext *avctx);
+
+/** Common AVHWAccel.uninit() implementation */
+int ff_vaapi_context_fini(AVCodecContext *avctx);
+
 /** Common AVHWAccel.end_frame() implementation */
 void ff_vaapi_common_end_frame(AVCodecContext *avctx);
 
 /** Allocate a new picture parameter buffer */
-void *ff_vaapi_alloc_pic_param(struct vaapi_context *vactx, unsigned int size);
+void *ff_vaapi_alloc_pic_param(FFVAContext *vactx, unsigned int size);
 
 /** Allocate a new IQ matrix buffer */
-void *ff_vaapi_alloc_iq_matrix(struct vaapi_context *vactx, unsigned int size);
+void *ff_vaapi_alloc_iq_matrix(FFVAContext *vactx, unsigned int size);
 
 /** Allocate a new bit-plane buffer */
-uint8_t *ff_vaapi_alloc_bitplane(struct vaapi_context *vactx, uint32_t size);
+uint8_t *ff_vaapi_alloc_bitplane(FFVAContext *vactx, uint32_t size);
 
 /**
  * Allocate a new slice descriptor for the input slice.
@@ -60,11 +91,11 @@ uint8_t *ff_vaapi_alloc_bitplane(struct vaapi_context *vactx, uint32_t size);
  * @param size the size of the slice in bytes
  * @return the newly allocated slice parameter
  */
-VASliceParameterBufferBase *ff_vaapi_alloc_slice(struct vaapi_context *vactx, const uint8_t *buffer, uint32_t size);
+VASliceParameterBufferBase *ff_vaapi_alloc_slice(FFVAContext *vactx, const uint8_t *buffer, uint32_t size);
 
 int ff_vaapi_mpeg_end_frame(AVCodecContext *avctx);
-int ff_vaapi_commit_slices(struct vaapi_context *vactx);
-int ff_vaapi_render_picture(struct vaapi_context *vactx, VASurfaceID surface);
+int ff_vaapi_commit_slices(FFVAContext *vactx);
+int ff_vaapi_render_picture(FFVAContext *vactx, VASurfaceID surface);
 
 /* @} */
 
diff --git a/libavcodec/vaapi_mpeg2.c b/libavcodec/vaapi_mpeg2.c
index cb77745..9329e54 100644
--- a/libavcodec/vaapi_mpeg2.c
+++ b/libavcodec/vaapi_mpeg2.c
@@ -3,20 +3,20 @@
  *
  * Copyright (C) 2008-2009 Splitted-Desktop Systems
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -41,7 +41,7 @@ static inline int mpeg2_get_is_frame_start(MpegEncContext *s)
 static int vaapi_mpeg2_start_frame(AVCodecContext *avctx, av_unused const uint8_t *buffer, av_unused uint32_t size)
 {
     struct MpegEncContext * const s = avctx->priv_data;
-    struct vaapi_context * const vactx = avctx->hwaccel_context;
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
     VAPictureParameterBufferMPEG2 *pic_param;
     VAIQMatrixBufferMPEG2 *iq_matrix;
     int i;
@@ -102,6 +102,7 @@ static int vaapi_mpeg2_start_frame(AVCodecContext *avctx, av_unused const uint8_
 static int vaapi_mpeg2_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
 {
     MpegEncContext * const s = avctx->priv_data;
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
     VASliceParameterBufferMPEG2 *slice_param;
     GetBitContext gb;
     uint32_t quantiser_scale_code, intra_slice_flag, macroblock_offset;
@@ -114,13 +115,13 @@ static int vaapi_mpeg2_decode_slice(AVCodecContext *avctx, const uint8_t *buffer
     intra_slice_flag = get_bits1(&gb);
     if (intra_slice_flag) {
         skip_bits(&gb, 8);
-        while (get_bits1(&gb) != 0)
-            skip_bits(&gb, 8);
+        if (skip_1stop_8data_bits(&gb) < 0)
+            return AVERROR_INVALIDDATA;
     }
     macroblock_offset = get_bits_count(&gb);
 
     /* Fill in VASliceParameterBufferMPEG2 */
-    slice_param = (VASliceParameterBufferMPEG2 *)ff_vaapi_alloc_slice(avctx->hwaccel_context, buffer, size);
+    slice_param = (VASliceParameterBufferMPEG2 *)ff_vaapi_alloc_slice(vactx, buffer, size);
     if (!slice_param)
         return -1;
     slice_param->macroblock_offset              = macroblock_offset;
@@ -139,4 +140,7 @@ AVHWAccel ff_mpeg2_vaapi_hwaccel = {
     .start_frame    = vaapi_mpeg2_start_frame,
     .end_frame      = ff_vaapi_mpeg_end_frame,
     .decode_slice   = vaapi_mpeg2_decode_slice,
+    .init           = ff_vaapi_context_init,
+    .uninit         = ff_vaapi_context_fini,
+    .priv_data_size = sizeof(FFVAContext),
 };
diff --git a/libavcodec/vaapi_mpeg4.c b/libavcodec/vaapi_mpeg4.c
index 6743e2a..725f00b 100644
--- a/libavcodec/vaapi_mpeg4.c
+++ b/libavcodec/vaapi_mpeg4.c
@@ -3,20 +3,20 @@
  *
  * Copyright (C) 2008-2009 Splitted-Desktop Systems
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -46,7 +46,7 @@ static int vaapi_mpeg4_start_frame(AVCodecContext *avctx, av_unused const uint8_
 {
     Mpeg4DecContext *ctx = avctx->priv_data;
     MpegEncContext * const s = &ctx->m;
-    struct vaapi_context * const vactx = avctx->hwaccel_context;
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
     VAPictureParameterBufferMPEG4 *pic_param;
     VAIQMatrixBufferMPEG4 *iq_matrix;
     int i;
@@ -120,27 +120,17 @@ static int vaapi_mpeg4_start_frame(AVCodecContext *avctx, av_unused const uint8_
 static int vaapi_mpeg4_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
 {
     MpegEncContext * const s = avctx->priv_data;
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
     VASliceParameterBufferMPEG4 *slice_param;
 
-    /* video_plane_with_short_video_header() contains all GOBs
-     * in-order, and this is what VA API (Intel backend) expects: only
-     * a single slice param. So fake macroblock_number for Libav so
-     * that we don't call vaapi_mpeg4_decode_slice() again
-     */
-    if (avctx->codec->id == AV_CODEC_ID_H263)
-        size = s->gb.buffer_end - buffer;
-
     /* Fill in VASliceParameterBufferMPEG4 */
-    slice_param = (VASliceParameterBufferMPEG4 *)ff_vaapi_alloc_slice(avctx->hwaccel_context, buffer, size);
+    slice_param = (VASliceParameterBufferMPEG4 *)ff_vaapi_alloc_slice(vactx, buffer, size);
     if (!slice_param)
         return -1;
     slice_param->macroblock_offset      = get_bits_count(&s->gb) % 8;
-    slice_param->macroblock_number      = s->mb_y * s->mb_width + s->mb_x;
+    slice_param->macroblock_number      = 0;
     slice_param->quant_scale            = s->qscale;
 
-    if (avctx->codec->id == AV_CODEC_ID_H263)
-        s->mb_y = s->mb_height;
-
     return 0;
 }
 
@@ -153,6 +143,9 @@ AVHWAccel ff_mpeg4_vaapi_hwaccel = {
     .start_frame    = vaapi_mpeg4_start_frame,
     .end_frame      = ff_vaapi_mpeg_end_frame,
     .decode_slice   = vaapi_mpeg4_decode_slice,
+    .init           = ff_vaapi_context_init,
+    .uninit         = ff_vaapi_context_fini,
+    .priv_data_size = sizeof(FFVAContext),
 };
 #endif
 
@@ -165,5 +158,8 @@ AVHWAccel ff_h263_vaapi_hwaccel = {
     .start_frame    = vaapi_mpeg4_start_frame,
     .end_frame      = ff_vaapi_mpeg_end_frame,
     .decode_slice   = vaapi_mpeg4_decode_slice,
+    .init           = ff_vaapi_context_init,
+    .uninit         = ff_vaapi_context_fini,
+    .priv_data_size = sizeof(FFVAContext),
 };
 #endif
diff --git a/libavcodec/vaapi_vc1.c b/libavcodec/vaapi_vc1.c
index 4022549..4e82be1 100644
--- a/libavcodec/vaapi_vc1.c
+++ b/libavcodec/vaapi_vc1.c
@@ -3,20 +3,20 @@
  *
  * Copyright (C) 2008-2009 Splitted-Desktop Systems
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,7 +25,7 @@
 #include "vc1.h"
 #include "vc1data.h"
 
-/** Translate Libav MV modes to VA API */
+/** Translate FFmpeg MV modes to VA API */
 static int get_VAMvModeVC1(enum MVModes mv_mode)
 {
     switch (mv_mode) {
@@ -129,7 +129,7 @@ static inline int vc1_get_TTFRM(VC1Context *v)
     return 0;
 }
 
-/** Pack Libav bitplanes into a VABitPlaneBuffer element */
+/** Pack FFmpeg bitplanes into a VABitPlaneBuffer element */
 static inline void vc1_pack_bitplanes(uint8_t *bitplane, int n, const uint8_t *ff_bp[3], int x, int y, int stride)
 {
     const int bitplane_index = n / 2;
@@ -148,7 +148,7 @@ static int vaapi_vc1_start_frame(AVCodecContext *avctx, av_unused const uint8_t
 {
     VC1Context * const v = avctx->priv_data;
     MpegEncContext * const s = &v->s;
-    struct vaapi_context * const vactx = avctx->hwaccel_context;
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
     VAPictureParameterBufferVC1 *pic_param;
 
     vactx->slice_param_size = sizeof(VASliceParameterBufferVC1);
@@ -313,6 +313,7 @@ static int vaapi_vc1_decode_slice(AVCodecContext *avctx, const uint8_t *buffer,
 {
     VC1Context * const v = avctx->priv_data;
     MpegEncContext * const s = &v->s;
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
     VASliceParameterBufferVC1 *slice_param;
 
     /* Current bit buffer is beyond any marker for VC-1, so skip it */
@@ -322,7 +323,7 @@ static int vaapi_vc1_decode_slice(AVCodecContext *avctx, const uint8_t *buffer,
     }
 
     /* Fill in VASliceParameterBufferVC1 */
-    slice_param = (VASliceParameterBufferVC1 *)ff_vaapi_alloc_slice(avctx->hwaccel_context, buffer, size);
+    slice_param = (VASliceParameterBufferVC1 *)ff_vaapi_alloc_slice(vactx, buffer, size);
     if (!slice_param)
         return -1;
     slice_param->macroblock_offset       = get_bits_count(&s->gb);
@@ -339,6 +340,9 @@ AVHWAccel ff_wmv3_vaapi_hwaccel = {
     .start_frame    = vaapi_vc1_start_frame,
     .end_frame      = ff_vaapi_mpeg_end_frame,
     .decode_slice   = vaapi_vc1_decode_slice,
+    .init           = ff_vaapi_context_init,
+    .uninit         = ff_vaapi_context_fini,
+    .priv_data_size = sizeof(FFVAContext),
 };
 #endif
 
@@ -350,4 +354,7 @@ AVHWAccel ff_vc1_vaapi_hwaccel = {
     .start_frame    = vaapi_vc1_start_frame,
     .end_frame      = ff_vaapi_mpeg_end_frame,
     .decode_slice   = vaapi_vc1_decode_slice,
+    .init           = ff_vaapi_context_init,
+    .uninit         = ff_vaapi_context_fini,
+    .priv_data_size = sizeof(FFVAContext),
 };
diff --git a/libavcodec/vaapi_vp9.c b/libavcodec/vaapi_vp9.c
new file mode 100644
index 0000000..b360dcb
--- /dev/null
+++ b/libavcodec/vaapi_vp9.c
@@ -0,0 +1,168 @@
+/*
+ * VP9 HW decode acceleration through VA API
+ *
+ * Copyright (C) 2015 Timo Rothenpieler <timo@rothenpieler.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/pixdesc.h"
+#include "vaapi_internal.h"
+#include "vp9.h"
+
+static void fill_picture_parameters(AVCodecContext                 *avctx,
+                                    const VP9SharedContext         *h,
+                                    VADecPictureParameterBufferVP9 *pp)
+{
+    const AVPixFmtDescriptor *pixdesc = av_pix_fmt_desc_get(avctx->sw_pix_fmt);
+    int i;
+    /* Populate *pp from avctx, the parsed frame header h->h and the reference slots h->refs. */
+    pp->frame_width = avctx->width;
+    pp->frame_height = avctx->height;
+
+    pp->frame_header_length_in_bytes = h->h.uncompressed_header_size;
+    pp->first_partition_size = h->h.compressed_header_size;
+
+    pp->profile = h->h.profile;
+
+    pp->filter_level = h->h.filter.level;
+    pp->sharpness_level = h->h.filter.sharpness;
+    pp->log2_tile_rows = h->h.tiling.log2_tile_rows;
+    pp->log2_tile_columns = h->h.tiling.log2_tile_cols;
+    /* Chroma subsampling comes from the sw_pix_fmt descriptor; NOTE(review): pixdesc is used without a NULL check. */
+    pp->pic_fields.bits.subsampling_x = pixdesc->log2_chroma_w;
+    pp->pic_fields.bits.subsampling_y = pixdesc->log2_chroma_h;
+    pp->pic_fields.bits.frame_type = !h->h.keyframe; /* 0 for a keyframe, 1 otherwise */
+    pp->pic_fields.bits.show_frame = !h->h.invisible;
+    pp->pic_fields.bits.error_resilient_mode = h->h.errorres;
+    pp->pic_fields.bits.intra_only = h->h.intraonly;
+    pp->pic_fields.bits.allow_high_precision_mv = h->h.keyframe ? 0 : h->h.highprecisionmvs; /* forced to 0 on keyframes */
+    pp->pic_fields.bits.mcomp_filter_type = h->h.filtermode ^ (h->h.filtermode <= 1); /* swaps values 0 and 1, leaves larger values unchanged; presumably converts between two enum orderings - TODO confirm */
+    pp->pic_fields.bits.frame_parallel_decoding_mode = h->h.parallelmode;
+    pp->pic_fields.bits.reset_frame_context = h->h.resetctx;
+    pp->pic_fields.bits.refresh_frame_context = h->h.refreshctx;
+    pp->pic_fields.bits.frame_context_idx = h->h.framectxid;
+
+    pp->pic_fields.bits.segmentation_enabled = h->h.segmentation.enabled;
+    pp->pic_fields.bits.segmentation_temporal_update = h->h.segmentation.temporal;
+    pp->pic_fields.bits.segmentation_update_map = h->h.segmentation.update_map;
+    /* Entries 0, 1 and 2 of refidx[] and signbias[] feed the last, golden and alt-ref fields respectively. */
+    pp->pic_fields.bits.last_ref_frame = h->h.refidx[0];
+    pp->pic_fields.bits.last_ref_frame_sign_bias = h->h.signbias[0];
+    pp->pic_fields.bits.golden_ref_frame = h->h.refidx[1];
+    pp->pic_fields.bits.golden_ref_frame_sign_bias = h->h.signbias[1];
+    pp->pic_fields.bits.alt_ref_frame = h->h.refidx[2];
+    pp->pic_fields.bits.alt_ref_frame_sign_bias = h->h.signbias[2];
+    pp->pic_fields.bits.lossless_flag = h->h.lossless;
+    /* The 7 segmentation tree probabilities are copied unconditionally. */
+    for (i = 0; i < 7; i++)
+        pp->mb_segment_tree_probs[i] = h->h.segmentation.prob[i];
+    /* The 3 prediction probabilities are copied only with temporal segmentation; otherwise every byte is set to 255. */
+    if (h->h.segmentation.temporal) {
+        for (i = 0; i < 3; i++)
+            pp->segment_pred_probs[i] = h->h.segmentation.pred_prob[i];
+    } else {
+        memset(pp->segment_pred_probs, 255, sizeof(pp->segment_pred_probs));
+    }
+    /* Map each of the 8 reference slots to its surface ID, or to VA_INVALID_ID when the slot holds no buffer. */
+    for (i = 0; i < 8; i++) {
+        if (h->refs[i].f->buf[0]) {
+            pp->reference_frames[i] = ff_vaapi_get_surface_id(h->refs[i].f);
+        } else {
+            pp->reference_frames[i] = VA_INVALID_ID;
+        }
+    }
+}
+
+static int vaapi_vp9_start_frame(AVCodecContext          *avctx,
+                                 av_unused const uint8_t *buffer,
+                                 av_unused uint32_t       size)
+{
+    const VP9SharedContext *h = avctx->priv_data;
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
+    VADecPictureParameterBufferVP9 *pic_param;
+    /* Start-of-frame hook; buffer and size are unused. Returns 0, or -1 when no picture parameter buffer is obtained. */
+    vactx->slice_param_size = sizeof(VASliceParameterBufferVP9);
+    /* Request a VP9-sized picture parameter buffer from the VA-API context, then fill it from the frame header. */
+    pic_param = ff_vaapi_alloc_pic_param(vactx, sizeof(VADecPictureParameterBufferVP9));
+    if (!pic_param)
+        return -1;
+    fill_picture_parameters(avctx, h, pic_param);
+
+    return 0;
+}
+
+static int vaapi_vp9_end_frame(AVCodecContext *avctx)
+{
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
+    const VP9SharedContext *h = avctx->priv_data;
+    int ret;
+    /* End-of-frame hook: commit the slices, then render into the surface of the current frame. */
+    ret = ff_vaapi_commit_slices(vactx);
+    if (ret < 0)
+        goto finish;
+    /* Rendering is skipped when committing the slices failed. */
+    ret = ff_vaapi_render_picture(vactx, ff_vaapi_get_surface_id(h->frames[CUR_FRAME].tf.f));
+    if (ret < 0)
+        goto finish;
+    /* Reached on success and failure alike: ff_vaapi_common_end_frame() always runs and ret is returned unchanged. */
+finish:
+    ff_vaapi_common_end_frame(avctx);
+    return ret;
+}
+
+static int vaapi_vp9_decode_slice(AVCodecContext *avctx,
+                                  const uint8_t  *buffer,
+                                  uint32_t        size)
+{
+    FFVAContext * const vactx = ff_vaapi_get_context(avctx);
+    const VP9SharedContext *h = avctx->priv_data;
+    VASliceParameterBufferVP9 *slice_param;
+    int i;
+    /* Slice hook: obtain a slice parameter buffer for buffer/size and fill its per-segment data. Returns 0, or -1 when none is obtained. */
+    slice_param = (VASliceParameterBufferVP9*)ff_vaapi_alloc_slice(vactx, buffer, size);
+    if (!slice_param)
+        return -1;
+    /* Copy the reference, skip, loop-filter level and quantizer scale settings of each of the 8 segments. */
+    for (i = 0; i < 8; i++) {
+        slice_param->seg_param[i].segment_flags.fields.segment_reference_enabled = h->h.segmentation.feat[i].ref_enabled;
+        slice_param->seg_param[i].segment_flags.fields.segment_reference = h->h.segmentation.feat[i].ref_val;
+        slice_param->seg_param[i].segment_flags.fields.segment_reference_skipped = h->h.segmentation.feat[i].skip_enabled;
+        /* The copy length is the size of the destination field; lflvl is assumed to be at least that large - TODO confirm. */
+        memcpy(slice_param->seg_param[i].filter_level, h->h.segmentation.feat[i].lflvl, sizeof(slice_param->seg_param[i].filter_level));
+        /* As used here, qmul[p][c] is indexed by p = 0 luma / 1 chroma and c = 0 DC / 1 AC. */
+        slice_param->seg_param[i].luma_dc_quant_scale = h->h.segmentation.feat[i].qmul[0][0];
+        slice_param->seg_param[i].luma_ac_quant_scale = h->h.segmentation.feat[i].qmul[0][1];
+        slice_param->seg_param[i].chroma_dc_quant_scale = h->h.segmentation.feat[i].qmul[1][0];
+        slice_param->seg_param[i].chroma_ac_quant_scale = h->h.segmentation.feat[i].qmul[1][1];
+    }
+
+    return 0;
+}
+
+AVHWAccel ff_vp9_vaapi_hwaccel = { /* VP9 hwaccel entry for AV_PIX_FMT_VAAPI: this file's three frame hooks plus the shared VA-API context init/fini */
+    .name                 = "vp9_vaapi",
+    .type                 = AVMEDIA_TYPE_VIDEO,
+    .id                   = AV_CODEC_ID_VP9,
+    .pix_fmt              = AV_PIX_FMT_VAAPI,
+    .start_frame          = vaapi_vp9_start_frame,
+    .end_frame            = vaapi_vp9_end_frame,
+    .decode_slice         = vaapi_vp9_decode_slice,
+    .init                 = ff_vaapi_context_init,
+    .uninit               = ff_vaapi_context_fini,
+    .priv_data_size       = sizeof(FFVAContext),
+};
diff --git a/libavcodec/vb.c b/libavcodec/vb.c
index 43954c1..560165a 100644
--- a/libavcodec/vb.c
+++ b/libavcodec/vb.c
@@ -2,20 +2,20 @@
  * Beam Software VB decoder
  * Copyright (c) 2007 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -73,7 +73,7 @@ static void vb_decode_palette(VBDecContext *c, int data_size)
         return;
     }
     for (i = start; i <= start + size; i++)
-        c->pal[i] = bytestream2_get_be24(&c->stream);
+        c->pal[i] = 0xFFU << 24 | bytestream2_get_be24(&c->stream);
 }
 
 static inline int check_pixel(uint8_t *buf, uint8_t *start, uint8_t *end)
@@ -197,10 +197,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     bytestream2_init(&c->stream, avpkt->data, avpkt->size);
 
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     flags = bytestream2_get_le16(&c->stream);
 
@@ -211,6 +209,10 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     }
     if (flags & VB_HAS_VIDEO) {
         size = bytestream2_get_le32(&c->stream);
+        if(size > bytestream2_get_bytes_left(&c->stream)+4 || size<4){
+            av_log(avctx, AV_LOG_ERROR, "Frame size invalid\n");
+            return -1;
+        }
         vb_decode_framedata(c, offset);
         bytestream2_skip(&c->stream, size - 4);
     }
@@ -249,6 +251,12 @@ static av_cold int decode_init(AVCodecContext *avctx)
     c->frame      = av_mallocz(avctx->width * avctx->height);
     c->prev_frame = av_mallocz(avctx->width * avctx->height);
 
+    if (!c->frame || !c->prev_frame) {
+        av_freep(&c->frame);
+        av_freep(&c->prev_frame);
+        return AVERROR(ENOMEM);
+    }
+
     return 0;
 }
 
diff --git a/libavcodec/vble.c b/libavcodec/vble.c
index 7ce1aee..0340cad 100644
--- a/libavcodec/vble.c
+++ b/libavcodec/vble.c
@@ -2,20 +2,20 @@
  * VBLE Decoder
  * Copyright (c) 2011 Derek Buitenhuis
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,62 +33,59 @@
 #include "huffyuvdsp.h"
 #include "internal.h"
 #include "mathops.h"
+#include "thread.h"
 
 typedef struct VBLEContext {
     AVCodecContext *avctx;
     HuffYUVDSPContext hdsp;
 
     int            size;
-    uint8_t        *val; /* First holds the lengths of vlc symbols and then their values */
+    uint8_t        *val; ///< This array first holds the lengths of vlc symbols and then their value.
 } VBLEContext;
 
-static uint8_t vble_read_reverse_unary(GetBitContext *gb)
-{
-    /* At most we need to read 9 bits total to get indices up to 8 */
-    uint8_t val = show_bits(gb, 8);
-
-    if (val) {
-        val = 7 - av_log2_16bit(ff_reverse[val]);
-        skip_bits(gb, val + 1);
-        return val;
-    } else {
-        skip_bits(gb, 8);
-        if (get_bits1(gb))
-            return 8;
-    }
-
-    /* Return something larger than 8 on error */
-    return UINT8_MAX;
-}
-
 static int vble_unpack(VBLEContext *ctx, GetBitContext *gb)
 {
     int i;
+    int allbits = 0; /* running total of the lengths stored in ctx->val[] */
+    static const uint8_t LUT[256] = { /* LUT[v] is the number of trailing zero bits of v; entry 0 holds 8 */
+        8,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
+        5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
+        6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
+        5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
+        7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
+        5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
+        6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
+        5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
+    };
 
     /* Read all the lengths in first */
     for (i = 0; i < ctx->size; i++) {
-        ctx->val[i] = vble_read_reverse_unary(gb);
-
-        if (ctx->val[i] == UINT8_MAX)
-            return -1;
-    }
-
-    for (i = 0; i < ctx->size; i++) {
-        /* Check we have enough bits left */
-        if (get_bits_left(gb) < ctx->val[i])
-            return -1;
-
-        /* get_bits can't take a length of 0 */
-        if (ctx->val[i])
-            ctx->val[i] = (1 << ctx->val[i]) + get_bits(gb, ctx->val[i]) - 1;
+        /* At most we need to read 9 bits total to get indices up to 8 */
+        int val = show_bits(gb, 8);
+
+        // read reverse unary
+        if (val) {
+            val = LUT[val];
+            skip_bits(gb, val + 1); /* consume the val zero bits and the bit that terminates them */
+            ctx->val[i] = val;
+        } else {
+            skip_bits(gb, 8);
+            if (!get_bits1(gb))
+                return -1; /* nine zero bits in a row: no valid length */
+            ctx->val[i] = 8;
+        }
+        allbits += ctx->val[i];
     }
 
+    /* Check we have enough bits left */
+    if (get_bits_left(gb) < allbits)
+        return -1;
     return 0;
 }
 
 static void vble_restore_plane(VBLEContext *ctx, AVFrame *pic,
-                               int plane, int offset,
-                               int width, int height)
+                               GetBitContext *gb, int plane,
+                               int offset, int width, int height)
 {
     uint8_t *dst = pic->data[plane];
     uint8_t *val = ctx->val + offset;
@@ -96,9 +93,13 @@ static void vble_restore_plane(VBLEContext *ctx, AVFrame *pic,
     int i, j, left, left_top;
 
     for (i = 0; i < height; i++) {
-        for (j = 0; j < width; j++)
-            val[j] = (val[j] >> 1) ^ -(val[j] & 1);
-
+        for (j = 0; j < width; j++) {
+            /* get_bits can't take a length of 0 */
+            if (val[j]) {
+                int v = (1 << val[j]) + get_bits(gb, val[j]) - 1;
+                val[j] = (v >> 1) ^ -(v & 1);
+            }
+        }
         if (i) {
             left = 0;
             left_top = dst[-stride];
@@ -124,13 +125,18 @@ static int vble_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     int version;
     int offset = 0;
     int width_uv = avctx->width / 2, height_uv = avctx->height / 2;
+    int ret;
+    ThreadFrame frame = { .f = data };
 
-    /* Allocate buffer */
-    if (ff_get_buffer(avctx, pic, 0) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
-        return AVERROR(ENOMEM);
+    if (avpkt->size < 4 || avpkt->size - 4 > INT_MAX/8) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid packet size\n");
+        return AVERROR_INVALIDDATA;
     }
 
+    /* Allocate buffer */
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
+        return ret;
+
     /* Set flags */
     pic->key_frame = 1;
     pic->pict_type = AV_PICTURE_TYPE_I;
@@ -150,15 +156,15 @@ static int vble_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     }
 
     /* Restore planes. Should be almost identical to Huffyuv's. */
-    vble_restore_plane(ctx, pic, 0, offset, avctx->width, avctx->height);
+    vble_restore_plane(ctx, pic, &gb, 0, offset, avctx->width, avctx->height);
 
     /* Chroma */
     if (!(ctx->avctx->flags & AV_CODEC_FLAG_GRAY)) {
         offset += avctx->width * avctx->height;
-        vble_restore_plane(ctx, pic, 1, offset, width_uv, height_uv);
+        vble_restore_plane(ctx, pic, &gb, 1, offset, width_uv, height_uv);
 
         offset += width_uv * height_uv;
-        vble_restore_plane(ctx, pic, 2, offset, width_uv, height_uv);
+        vble_restore_plane(ctx, pic, &gb, 2, offset, width_uv, height_uv);
     }
 
     *got_frame       = 1;
@@ -188,7 +194,7 @@ static av_cold int vble_decode_init(AVCodecContext *avctx)
     ctx->size = av_image_get_buffer_size(avctx->pix_fmt,
                                          avctx->width, avctx->height, 1);
 
-    ctx->val = av_malloc(ctx->size * sizeof(*ctx->val));
+    ctx->val = av_malloc_array(ctx->size, sizeof(*ctx->val));
 
     if (!ctx->val) {
         av_log(avctx, AV_LOG_ERROR, "Could not allocate values buffer.\n");
@@ -208,5 +214,6 @@ AVCodec ff_vble_decoder = {
     .init           = vble_decode_init,
     .close          = vble_decode_close,
     .decode         = vble_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(vble_decode_init),
 };
diff --git a/libavcodec/vc1.c b/libavcodec/vc1.c
index 7a93e97..48a2cc1 100644
--- a/libavcodec/vc1.c
+++ b/libavcodec/vc1.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,21 +43,6 @@
  * @{
  */
 
-/**
- * Imode types
- * @{
- */
-enum Imode {
-    IMODE_RAW,
-    IMODE_NORM2,
-    IMODE_DIFF2,
-    IMODE_NORM6,
-    IMODE_DIFF6,
-    IMODE_ROWSKIP,
-    IMODE_COLSKIP
-};
-/** @} */ //imode defines
-
 /** Decode rows by checking if they are skipped
  * @param plane Buffer to store decoded bits
  * @param[in] width Width of this buffer
@@ -133,12 +118,16 @@ static int bitplane_decoding(uint8_t* data, int *raw_flag, VC1Context *v)
     case IMODE_NORM2:
         if ((height * width) & 1) {
             *planep++ = get_bits1(gb);
-            offset    = 1;
+            y = offset = 1;
+            if (offset == width) {
+                offset = 0;
+                planep += stride - width;
+            }
         }
         else
-            offset = 0;
+            y = offset = 0;
         // decode bitplane as one long line
-        for (y = offset; y < height * width; y += 2) {
+        for (; y < height * width; y += 2) {
             code = get_vlc2(gb, ff_vc1_norm2_vlc.table, VC1_NORM2_VLC_BITS, 1);
             *planep++ = code & 1;
             offset++;
@@ -244,37 +233,34 @@ static int vop_dquant_decoding(VC1Context *v)
     int pqdiff;
 
     //variable size
-    if (v->dquant == 2) {
-        pqdiff = get_bits(gb, 3);
-        if (pqdiff == 7)
-            v->altpq = get_bits(gb, 5);
-        else
-            v->altpq = v->pq + pqdiff + 1;
-    } else {
+    if (v->dquant != 2) {
         v->dquantfrm = get_bits1(gb);
-        if (v->dquantfrm) {
-            v->dqprofile = get_bits(gb, 2);
-            switch (v->dqprofile) {
-            case DQPROFILE_SINGLE_EDGE:
-            case DQPROFILE_DOUBLE_EDGES:
-                v->dqsbedge = get_bits(gb, 2);
-                break;
-            case DQPROFILE_ALL_MBS:
-                v->dqbilevel = get_bits1(gb);
-                if (!v->dqbilevel)
-                    v->halfpq = 0;
-            default:
-                break; //Forbidden ?
-            }
-            if (v->dqbilevel || v->dqprofile != DQPROFILE_ALL_MBS) {
-                pqdiff = get_bits(gb, 3);
-                if (pqdiff == 7)
-                    v->altpq = get_bits(gb, 5);
-                else
-                    v->altpq = v->pq + pqdiff + 1;
+        if (!v->dquantfrm)
+            return 0;
+
+        v->dqprofile = get_bits(gb, 2);
+        switch (v->dqprofile) {
+        case DQPROFILE_SINGLE_EDGE:
+        case DQPROFILE_DOUBLE_EDGES:
+            v->dqsbedge = get_bits(gb, 2);
+            break;
+        case DQPROFILE_ALL_MBS:
+            v->dqbilevel = get_bits1(gb);
+            if (!v->dqbilevel) {
+                v->halfpq = 0;
+                return 0;
             }
+        default:
+            break; //Forbidden ?
         }
     }
+
+    pqdiff = get_bits(gb, 3);
+    if (pqdiff == 7)
+        v->altpq = get_bits(gb, 5);
+    else
+        v->altpq = v->pq + pqdiff + 1;
+
     return 0;
 }
 
@@ -289,7 +275,7 @@ static int decode_sequence_header_adv(VC1Context *v, GetBitContext *gb);
  */
 int ff_vc1_decode_sequence_header(AVCodecContext *avctx, VC1Context *v, GetBitContext *gb)
 {
-    av_log(avctx, AV_LOG_DEBUG, "Header: %0X\n", show_bits(gb, 32));
+    av_log(avctx, AV_LOG_DEBUG, "Header: %0X\n", show_bits_long(gb, 32));
     v->profile = get_bits(gb, 2);
     if (v->profile == PROFILE_COMPLEX) {
         av_log(avctx, AV_LOG_WARNING, "WMV3 Complex Profile is not fully supported\n");
@@ -300,6 +286,7 @@ int ff_vc1_decode_sequence_header(AVCodecContext *avctx, VC1Context *v, GetBitCo
         v->zz_4x8 = ff_vc1_adv_progressive_4x8_zz;
         return decode_sequence_header_adv(v, gb);
     } else {
+        v->chromaformat = 1;
         v->zz_8x4 = ff_wmv2_scantableA;
         v->zz_4x8 = ff_wmv2_scantableB;
         v->res_y411   = get_bits1(gb);
@@ -344,8 +331,7 @@ int ff_vc1_decode_sequence_header(AVCodecContext *avctx, VC1Context *v, GetBitCo
         return -1;
     }
     v->extended_mv     = get_bits1(gb); //common
-    if (!v->profile && v->extended_mv)
-    {
+    if (!v->profile && v->extended_mv) {
         av_log(avctx, AV_LOG_ERROR,
                "Extended MVs unavailable in Simple Profile\n");
         return -1;
@@ -354,8 +340,7 @@ int ff_vc1_decode_sequence_header(AVCodecContext *avctx, VC1Context *v, GetBitCo
     v->vstransform     = get_bits1(gb); //common
 
     v->res_transtab    = get_bits1(gb);
-    if (v->res_transtab)
-    {
+    if (v->res_transtab) {
         av_log(avctx, AV_LOG_ERROR,
                "1 for reserved RES_TRANSTAB is forbidden\n");
         return -1;
@@ -376,8 +361,13 @@ int ff_vc1_decode_sequence_header(AVCodecContext *avctx, VC1Context *v, GetBitCo
     v->finterpflag = get_bits1(gb); //common
 
     if (v->res_sprite) {
-        v->s.avctx->width  = v->s.avctx->coded_width  = get_bits(gb, 11);
-        v->s.avctx->height = v->s.avctx->coded_height = get_bits(gb, 11);
+        int w = get_bits(gb, 11);
+        int h = get_bits(gb, 11);
+        int ret = ff_set_dimensions(v->s.avctx, w, h);
+        if (ret < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Failed to set dimensions %d %d\n", w, h);
+            return ret;
+        }
         skip_bits(gb, 5); //frame rate
         v->res_x8 = get_bits1(gb);
         if (get_bits1(gb)) { // something to do with DC VLC selection
@@ -429,10 +419,8 @@ static int decode_sequence_header_adv(VC1Context *v, GetBitContext *gb)
     v->bitrtq_postproc       = get_bits(gb, 5); //common
     v->postprocflag          = get_bits1(gb);   //common
 
-    v->s.avctx->coded_width  = (get_bits(gb, 12) + 1) << 1;
-    v->s.avctx->coded_height = (get_bits(gb, 12) + 1) << 1;
-    v->s.avctx->width        = v->s.avctx->coded_width;
-    v->s.avctx->height       = v->s.avctx->coded_height;
+    v->max_coded_width       = (get_bits(gb, 12) + 1) << 1;
+    v->max_coded_height      = (get_bits(gb, 12) + 1) << 1;
     v->broadcast             = get_bits1(gb);
     v->interlace             = get_bits1(gb);
     v->tfcntrflag            = get_bits1(gb);
@@ -493,7 +481,6 @@ static int decode_sequence_header_adv(VC1Context *v, GetBitContext *gb)
                 }
             }
             if (v->broadcast) { // Pulldown may be present
-                v->s.avctx->framerate.num  *= 2;
                 v->s.avctx->ticks_per_frame = 2;
             }
         }
@@ -522,6 +509,8 @@ static int decode_sequence_header_adv(VC1Context *v, GetBitContext *gb)
 int ff_vc1_decode_entry_point(AVCodecContext *avctx, VC1Context *v, GetBitContext *gb)
 {
     int i;
+    int w,h;
+    int ret;
 
     av_log(avctx, AV_LOG_DEBUG, "Entry point: %08X\n", show_bits_long(gb, 32));
     v->broken_link    = get_bits1(gb);
@@ -529,6 +518,8 @@ int ff_vc1_decode_entry_point(AVCodecContext *avctx, VC1Context *v, GetBitContex
     v->panscanflag    = get_bits1(gb);
     v->refdist_flag   = get_bits1(gb);
     v->s.loop_filter  = get_bits1(gb);
+    if (v->s.avctx->skip_loop_filter >= AVDISCARD_ALL)
+        v->s.loop_filter = 0;
     v->fastuvmc       = get_bits1(gb);
     v->extended_mv    = get_bits1(gb);
     v->dquant         = get_bits(gb, 2);
@@ -542,10 +533,18 @@ int ff_vc1_decode_entry_point(AVCodecContext *avctx, VC1Context *v, GetBitContex
         }
     }
 
-    if (get_bits1(gb)) {
-        avctx->width  = avctx->coded_width  = (get_bits(gb, 12) + 1) << 1;
-        avctx->height = avctx->coded_height = (get_bits(gb, 12) + 1) << 1;
+    if(get_bits1(gb)){
+        w = (get_bits(gb, 12)+1)<<1;
+        h = (get_bits(gb, 12)+1)<<1;
+    } else {
+        w = v->max_coded_width;
+        h = v->max_coded_height;
     }
+    if ((ret = ff_set_dimensions(avctx, w, h)) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to set dimensions %d %d\n", w, h);
+        return ret;
+    }
+
     if (v->extended_mv)
         v->extended_dmv = get_bits1(gb);
     if ((v->range_mapy_flag = get_bits1(gb))) {
@@ -572,13 +571,13 @@ int ff_vc1_decode_entry_point(AVCodecContext *avctx, VC1Context *v, GetBitContex
         int scale, shift, i;                                                  \
         if (!lumscale) {                                                      \
             scale = -64;                                                      \
-            shift = (255 - lumshift * 2) << 6;                                \
+            shift = (255 - lumshift * 2) * 64;                                \
             if (lumshift > 31)                                                \
                 shift += 128 << 6;                                            \
         } else {                                                              \
             scale = lumscale + 32;                                            \
             if (lumshift > 31)                                                \
-                shift = (lumshift - 64) << 6;                                 \
+                shift = (lumshift - 64) * 64;                                 \
             else                                                              \
                 shift = lumshift << 6;                                        \
         }                                                                     \
@@ -597,32 +596,44 @@ static void rotate_luts(VC1Context *v)
             C = A;                                            \
         } else {                                              \
             DEF;                                              \
-            memcpy(&tmp, &L  , sizeof(tmp));                  \
-            memcpy(&L  , &N  , sizeof(tmp));                  \
-            memcpy(&N  , &tmp, sizeof(tmp));                  \
+            memcpy(&tmp, L   , sizeof(tmp));                  \
+            memcpy(L   , N   , sizeof(tmp));                  \
+            memcpy(N   , &tmp, sizeof(tmp));                  \
             C = N;                                            \
         }                                                     \
     } while(0)
 
-    ROTATE(int tmp,             v->last_use_ic, v->next_use_ic, v->curr_use_ic, v->aux_use_ic);
+    ROTATE(int tmp,             &v->last_use_ic, &v->next_use_ic, v->curr_use_ic, &v->aux_use_ic);
     ROTATE(uint8_t tmp[2][256], v->last_luty,   v->next_luty,   v->curr_luty,   v->aux_luty);
     ROTATE(uint8_t tmp[2][256], v->last_lutuv,  v->next_lutuv,  v->curr_lutuv,  v->aux_lutuv);
 
     INIT_LUT(32, 0, v->curr_luty[0], v->curr_lutuv[0], 0);
     INIT_LUT(32, 0, v->curr_luty[1], v->curr_lutuv[1], 0);
-    v->curr_use_ic = 0;
-    if (v->curr_luty == v->next_luty) {
-        // If we just initialized next_lut, clear next_use_ic to match.
-        v->next_use_ic = 0;
+    *v->curr_use_ic = 0;
+}
+
+static int read_bfraction(VC1Context *v, GetBitContext* gb) {
+    int bfraction_lut_index = get_vlc2(gb, ff_vc1_bfraction_vlc.table, VC1_BFRACTION_VLC_BITS, 1);
+    /* Index 21 and negative results are rejected before v is modified; otherwise the index and its ff_vc1_bfraction_lut[] entry are stored and 0 is returned. */
+    if (bfraction_lut_index == 21 || bfraction_lut_index < 0) {
+        av_log(v->s.avctx, AV_LOG_ERROR, "bfraction invalid\n");
+        return AVERROR_INVALIDDATA;
     }
+    v->bfraction_lut_index = bfraction_lut_index;
+    v->bfraction           = ff_vc1_bfraction_lut[v->bfraction_lut_index];
+    return 0;
 }
 
 int ff_vc1_parse_frame_header(VC1Context *v, GetBitContext* gb)
 {
     int pqindex, lowquant, status;
 
+    v->field_mode = 0;
+    v->fcm = 0;
     if (v->finterpflag)
         v->interpfrm = get_bits1(gb);
+    if (!v->s.avctx->codec)
+        return -1;
     if (v->s.avctx->codec_id == AV_CODEC_ID_MSS2)
         v->respic   =
         v->rangered =
@@ -632,22 +643,19 @@ int ff_vc1_parse_frame_header(VC1Context *v, GetBitContext* gb)
     v->rangeredfrm = 0;
     if (v->rangered)
         v->rangeredfrm = get_bits1(gb);
-    v->s.pict_type = get_bits1(gb);
-    if (v->s.avctx->max_b_frames) {
-        if (!v->s.pict_type) {
-            if (get_bits1(gb))
-                v->s.pict_type = AV_PICTURE_TYPE_I;
-            else
-                v->s.pict_type = AV_PICTURE_TYPE_B;
+    if (get_bits1(gb)) {
+        v->s.pict_type = AV_PICTURE_TYPE_P;
+    } else {
+        if (v->s.avctx->max_b_frames && !get_bits1(gb)) {
+            v->s.pict_type = AV_PICTURE_TYPE_B;
         } else
-            v->s.pict_type = AV_PICTURE_TYPE_P;
-    } else
-        v->s.pict_type = v->s.pict_type ? AV_PICTURE_TYPE_P : AV_PICTURE_TYPE_I;
+            v->s.pict_type = AV_PICTURE_TYPE_I;
+    }
 
     v->bi_type = 0;
     if (v->s.pict_type == AV_PICTURE_TYPE_B) {
-        v->bfraction_lut_index = get_vlc2(gb, ff_vc1_bfraction_vlc.table, VC1_BFRACTION_VLC_BITS, 1);
-        v->bfraction           = ff_vc1_bfraction_lut[v->bfraction_lut_index];
+        if (read_bfraction(v, gb) < 0)
+            return AVERROR_INVALIDDATA;
         if (v->bfraction == 0) {
             v->s.pict_type = AV_PICTURE_TYPE_BI;
         }
@@ -672,19 +680,25 @@ int ff_vc1_parse_frame_header(VC1Context *v, GetBitContext* gb)
         v->pq = ff_vc1_pquant_table[0][pqindex];
     else
         v->pq = ff_vc1_pquant_table[1][pqindex];
-
-    v->pquantizer = 1;
-    if (v->quantizer_mode == QUANT_FRAME_IMPLICIT)
-        v->pquantizer = pqindex < 9;
-    if (v->quantizer_mode == QUANT_NON_UNIFORM)
-        v->pquantizer = 0;
     v->pqindex = pqindex;
     if (pqindex < 9)
         v->halfpq = get_bits1(gb);
     else
         v->halfpq = 0;
-    if (v->quantizer_mode == QUANT_FRAME_EXPLICIT)
+    switch (v->quantizer_mode) {
+    case QUANT_FRAME_IMPLICIT:
+        v->pquantizer = pqindex < 9;
+        break;
+    case QUANT_NON_UNIFORM:
+        v->pquantizer = 0;
+        break;
+    case QUANT_FRAME_EXPLICIT:
         v->pquantizer = get_bits1(gb);
+        break;
+    default:
+        v->pquantizer = 1;
+        break;
+    }
     v->dquantfrm = 0;
     if (v->extended_mv == 1)
         v->mvrange = get_unary(gb, 0, 3);
@@ -708,9 +722,7 @@ int ff_vc1_parse_frame_header(VC1Context *v, GetBitContext* gb)
 
     switch (v->s.pict_type) {
     case AV_PICTURE_TYPE_P:
-        if (v->pq < 5)       v->tt_index = 0;
-        else if (v->pq < 13) v->tt_index = 1;
-        else                 v->tt_index = 2;
+        v->tt_index = (v->pq > 4) + (v->pq > 12);
 
         lowquant = (v->pq > 12) ? 0 : 1;
         v->mv_mode = ff_vc1_mv_pmode_table[lowquant][get_unary(gb, 1, 4)];
@@ -724,16 +736,15 @@ int ff_vc1_parse_frame_header(VC1Context *v, GetBitContext* gb)
             INIT_LUT(v->lumscale, v->lumshift, v->last_luty[1], v->last_lutuv[1], 1);
         }
         v->qs_last = v->s.quarter_sample;
-        if (v->mv_mode == MV_PMODE_1MV_HPEL || v->mv_mode == MV_PMODE_1MV_HPEL_BILIN)
-            v->s.quarter_sample = 0;
-        else if (v->mv_mode == MV_PMODE_INTENSITY_COMP) {
-            if (v->mv_mode2 == MV_PMODE_1MV_HPEL || v->mv_mode2 == MV_PMODE_1MV_HPEL_BILIN)
-                v->s.quarter_sample = 0;
-            else
-                v->s.quarter_sample = 1;
-        } else
-            v->s.quarter_sample = 1;
-        v->s.mspel = !(v->mv_mode == MV_PMODE_1MV_HPEL_BILIN || (v->mv_mode == MV_PMODE_INTENSITY_COMP && v->mv_mode2 == MV_PMODE_1MV_HPEL_BILIN));
+        if (v->mv_mode == MV_PMODE_INTENSITY_COMP) {
+            v->s.quarter_sample = (v->mv_mode2 != MV_PMODE_1MV_HPEL &&
+                                   v->mv_mode2 != MV_PMODE_1MV_HPEL_BILIN);
+            v->s.mspel          = (v->mv_mode2 != MV_PMODE_1MV_HPEL_BILIN);
+        } else {
+            v->s.quarter_sample = (v->mv_mode != MV_PMODE_1MV_HPEL &&
+                                   v->mv_mode != MV_PMODE_1MV_HPEL_BILIN);
+            v->s.mspel          = (v->mv_mode != MV_PMODE_1MV_HPEL_BILIN);
+        }
 
         if ((v->mv_mode  == MV_PMODE_INTENSITY_COMP &&
              v->mv_mode2 == MV_PMODE_MIXED_MV)      ||
@@ -762,21 +773,19 @@ int ff_vc1_parse_frame_header(VC1Context *v, GetBitContext* gb)
             vop_dquant_decoding(v);
         }
 
-        v->ttfrm = 0; //FIXME Is that so ?
         if (v->vstransform) {
             v->ttmbf = get_bits1(gb);
             if (v->ttmbf) {
                 v->ttfrm = ff_vc1_ttfrm_to_tt[get_bits(gb, 2)];
-            }
+            } else
+                v->ttfrm = 0; //FIXME Is that so ?
         } else {
             v->ttmbf = 1;
             v->ttfrm = TT_8X8;
         }
         break;
     case AV_PICTURE_TYPE_B:
-        if (v->pq < 5)       v->tt_index = 0;
-        else if (v->pq < 13) v->tt_index = 1;
-        else                 v->tt_index = 2;
+        v->tt_index = (v->pq > 4) + (v->pq > 12);
 
         v->mv_mode          = get_bits1(gb) ? MV_PMODE_1MV : MV_PMODE_1MV_HPEL_BILIN;
         v->qs_last          = v->s.quarter_sample;
@@ -802,12 +811,12 @@ int ff_vc1_parse_frame_header(VC1Context *v, GetBitContext* gb)
             vop_dquant_decoding(v);
         }
 
-        v->ttfrm = 0;
         if (v->vstransform) {
             v->ttmbf = get_bits1(gb);
             if (v->ttmbf) {
                 v->ttfrm = ff_vc1_ttfrm_to_tt[get_bits(gb, 2)];
-            }
+            } else
+                v->ttfrm = 0;
         } else {
             v->ttmbf = 1;
             v->ttfrm = TT_8X8;
@@ -842,9 +851,12 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
     v->numref          = 0;
     v->p_frame_skipped = 0;
     if (v->second_field) {
-        v->s.pict_type = (v->fptype & 1) ? AV_PICTURE_TYPE_P : AV_PICTURE_TYPE_I;
+        if (v->fcm != ILACE_FIELD || v->field_mode!=1)
+            return -1;
         if (v->fptype & 4)
             v->s.pict_type = (v->fptype & 1) ? AV_PICTURE_TYPE_BI : AV_PICTURE_TYPE_B;
+        else
+            v->s.pict_type = (v->fptype & 1) ? AV_PICTURE_TYPE_P : AV_PICTURE_TYPE_I;
         v->s.current_picture_ptr->f->pict_type = v->s.pict_type;
         if (!v->pic_header_flag)
             goto parse_common_info;
@@ -865,12 +877,15 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
     v->field_mode = field_mode;
     v->fcm = fcm;
 
+    av_assert0(    v->s.mb_height == v->s.height + 15 >> 4
+                || v->s.mb_height == FFALIGN(v->s.height + 15 >> 4, 2));
     if (v->field_mode) {
         v->s.mb_height = FFALIGN(v->s.height + 15 >> 4, 2);
         v->fptype = get_bits(gb, 3);
-        v->s.pict_type = (v->fptype & 2) ? AV_PICTURE_TYPE_P : AV_PICTURE_TYPE_I;
         if (v->fptype & 4) // B-picture
             v->s.pict_type = (v->fptype & 2) ? AV_PICTURE_TYPE_BI : AV_PICTURE_TYPE_B;
+        else
+            v->s.pict_type = (v->fptype & 2) ? AV_PICTURE_TYPE_P : AV_PICTURE_TYPE_I;
     } else {
         v->s.mb_height = v->s.height + 15 >> 4;
         switch (get_unary(gb, 0, 4)) {
@@ -901,6 +916,8 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
             v->tff = get_bits1(gb);
             v->rff = get_bits1(gb);
         }
+    } else {
+        v->tff = 1;
     }
     if (v->panscanflag) {
         avpriv_report_missing_feature(v->s.avctx, "Pan-scan");
@@ -912,6 +929,8 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
     v->rnd = get_bits1(gb);
     if (v->interlace)
         v->uvsamp = get_bits1(gb);
+    if(!ff_vc1_bfraction_vlc.table)
+        return 0; //parsing only, vlc tables haven't been allocated
     if (v->field_mode) {
         if (!v->refdist_flag)
             v->refdist = 0;
@@ -921,8 +940,8 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
                 v->refdist += get_unary(gb, 0, 16);
         }
         if ((v->s.pict_type == AV_PICTURE_TYPE_B) || (v->s.pict_type == AV_PICTURE_TYPE_BI)) {
-            v->bfraction_lut_index = get_vlc2(gb, ff_vc1_bfraction_vlc.table, VC1_BFRACTION_VLC_BITS, 1);
-            v->bfraction           = ff_vc1_bfraction_lut[v->bfraction_lut_index];
+            if (read_bfraction(v, gb) < 0)
+                return AVERROR_INVALIDDATA;
             v->frfd = (v->bfraction * v->refdist) >> 8;
             v->brfd = v->refdist - v->frfd - 1;
             if (v->brfd < 0)
@@ -934,8 +953,8 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
         if (v->finterpflag)
             v->interpfrm = get_bits1(gb);
         if (v->s.pict_type == AV_PICTURE_TYPE_B) {
-            v->bfraction_lut_index = get_vlc2(gb, ff_vc1_bfraction_vlc.table, VC1_BFRACTION_VLC_BITS, 1);
-            v->bfraction           = ff_vc1_bfraction_lut[v->bfraction_lut_index];
+            if (read_bfraction(v, gb) < 0)
+                return AVERROR_INVALIDDATA;
             if (v->bfraction == 0) {
                 v->s.pict_type = AV_PICTURE_TYPE_BI; /* XXX: should not happen here */
             }
@@ -948,24 +967,29 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
     pqindex = get_bits(gb, 5);
     if (!pqindex)
         return -1;
-    v->pqindex = pqindex;
     if (v->quantizer_mode == QUANT_FRAME_IMPLICIT)
         v->pq = ff_vc1_pquant_table[0][pqindex];
     else
         v->pq = ff_vc1_pquant_table[1][pqindex];
-
-    v->pquantizer = 1;
-    if (v->quantizer_mode == QUANT_FRAME_IMPLICIT)
-        v->pquantizer = pqindex < 9;
-    if (v->quantizer_mode == QUANT_NON_UNIFORM)
-        v->pquantizer = 0;
     v->pqindex = pqindex;
     if (pqindex < 9)
         v->halfpq = get_bits1(gb);
     else
         v->halfpq = 0;
-    if (v->quantizer_mode == QUANT_FRAME_EXPLICIT)
+    switch (v->quantizer_mode) {
+    case QUANT_FRAME_IMPLICIT:
+        v->pquantizer = pqindex < 9;
+        break;
+    case QUANT_NON_UNIFORM:
+        v->pquantizer = 0;
+        break;
+    case QUANT_FRAME_EXPLICIT:
         v->pquantizer = get_bits1(gb);
+        break;
+    default:
+        v->pquantizer = 1;
+        break;
+    }
     if (v->postprocflag)
         v->postproc = get_bits(gb, 2);
 
@@ -1055,12 +1079,7 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
         v->range_x = 1 << (v->k_x - 1);
         v->range_y = 1 << (v->k_y - 1);
 
-        if (v->pq < 5)
-            v->tt_index = 0;
-        else if (v->pq < 13)
-            v->tt_index = 1;
-        else
-            v->tt_index = 2;
+        v->tt_index = (v->pq > 4) + (v->pq > 12);
         if (v->fcm != ILACE_FRAME) {
             int mvmode;
             mvmode     = get_unary(gb, 1, 4);
@@ -1096,7 +1115,7 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
                         INIT_LUT(v->lumscale2, v->lumshift2, v->curr_luty[v->cur_field_type^1], v->curr_lutuv[v->cur_field_type^1], 0);
                         INIT_LUT(v->lumscale , v->lumshift , v->last_luty[v->cur_field_type  ], v->last_lutuv[v->cur_field_type  ], 1);
                     }
-                    v->next_use_ic = v->curr_use_ic = 1;
+                    v->next_use_ic = *v->curr_use_ic = 1;
                 } else {
                     INIT_LUT(v->lumscale , v->lumshift , v->last_luty[0], v->last_lutuv[0], 1);
                     INIT_LUT(v->lumscale2, v->lumshift2, v->last_luty[1], v->last_lutuv[1], 1);
@@ -1104,18 +1123,15 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
                 v->last_use_ic = 1;
             }
             v->qs_last = v->s.quarter_sample;
-            if (v->mv_mode == MV_PMODE_1MV_HPEL || v->mv_mode == MV_PMODE_1MV_HPEL_BILIN)
-                v->s.quarter_sample = 0;
-            else if (v->mv_mode == MV_PMODE_INTENSITY_COMP) {
-                if (v->mv_mode2 == MV_PMODE_1MV_HPEL || v->mv_mode2 == MV_PMODE_1MV_HPEL_BILIN)
-                    v->s.quarter_sample = 0;
-                else
-                    v->s.quarter_sample = 1;
-            } else
-                v->s.quarter_sample = 1;
-            v->s.mspel = !(v->mv_mode == MV_PMODE_1MV_HPEL_BILIN
-                           || (v->mv_mode == MV_PMODE_INTENSITY_COMP
-                               && v->mv_mode2 == MV_PMODE_1MV_HPEL_BILIN));
+            if (v->mv_mode == MV_PMODE_INTENSITY_COMP) {
+                v->s.quarter_sample = (v->mv_mode2 != MV_PMODE_1MV_HPEL &&
+                                       v->mv_mode2 != MV_PMODE_1MV_HPEL_BILIN);
+                v->s.mspel          = (v->mv_mode2 != MV_PMODE_1MV_HPEL_BILIN);
+            } else {
+                v->s.quarter_sample = (v->mv_mode != MV_PMODE_1MV_HPEL &&
+                                       v->mv_mode != MV_PMODE_1MV_HPEL_BILIN);
+                v->s.mspel          = (v->mv_mode != MV_PMODE_1MV_HPEL_BILIN);
+            }
         }
         if (v->fcm == PROGRESSIVE) { // progressive
             if ((v->mv_mode == MV_PMODE_INTENSITY_COMP &&
@@ -1166,12 +1182,12 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
             vop_dquant_decoding(v);
         }
 
-        v->ttfrm = 0; //FIXME Is that so ?
         if (v->vstransform) {
             v->ttmbf = get_bits1(gb);
             if (v->ttmbf) {
                 v->ttfrm = ff_vc1_ttfrm_to_tt[get_bits(gb, 2)];
-            }
+            } else
+                v->ttfrm = 0; //FIXME Is that so ?
         } else {
             v->ttmbf = 1;
             v->ttfrm = TT_8X8;
@@ -1179,8 +1195,8 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
         break;
     case AV_PICTURE_TYPE_B:
         if (v->fcm == ILACE_FRAME) {
-            v->bfraction_lut_index = get_vlc2(gb, ff_vc1_bfraction_vlc.table, VC1_BFRACTION_VLC_BITS, 1);
-            v->bfraction           = ff_vc1_bfraction_lut[v->bfraction_lut_index];
+            if (read_bfraction(v, gb) < 0)
+                return AVERROR_INVALIDDATA;
             if (v->bfraction == 0) {
                 return -1;
             }
@@ -1194,15 +1210,11 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
         v->range_x = 1 << (v->k_x - 1);
         v->range_y = 1 << (v->k_y - 1);
 
-        if (v->pq < 5)
-            v->tt_index = 0;
-        else if (v->pq < 13)
-            v->tt_index = 1;
-        else
-            v->tt_index = 2;
+        v->tt_index = (v->pq > 4) + (v->pq > 12);
 
         if (v->field_mode) {
             int mvmode;
+            av_log(v->s.avctx, AV_LOG_DEBUG, "B Fields\n");
             if (v->extended_dmv)
                 v->dmvrange = get_unary(gb, 0, 3);
             mvmode = get_unary(gb, 1, 3);
@@ -1286,12 +1298,12 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
             vop_dquant_decoding(v);
         }
 
-        v->ttfrm = 0;
         if (v->vstransform) {
             v->ttmbf = get_bits1(gb);
             if (v->ttmbf) {
                 v->ttfrm = ff_vc1_ttfrm_to_tt[get_bits(gb, 2)];
-            }
+            } else
+                v->ttfrm = 0;
         } else {
             v->ttmbf = 1;
             v->ttfrm = TT_8X8;
@@ -1317,11 +1329,10 @@ int ff_vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
         vop_dquant_decoding(v);
     }
 
-    v->bi_type = 0;
-    if (v->s.pict_type == AV_PICTURE_TYPE_BI) {
+    v->bi_type = (v->s.pict_type == AV_PICTURE_TYPE_BI);
+    if (v->bi_type)
         v->s.pict_type = AV_PICTURE_TYPE_B;
-        v->bi_type = 1;
-    }
+
     return 0;
 }
 
diff --git a/libavcodec/vc1.h b/libavcodec/vc1.h
index 5087b7c..556906d 100644
--- a/libavcodec/vc1.h
+++ b/libavcodec/vc1.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -151,6 +151,21 @@ enum FrameCodingMode {
     ILACE_FIELD         ///<  in the bitstream is reported as 11b
 };
 
+/**
+ * Imode types
+ * @{
+ */
+enum Imode {
+    IMODE_RAW,
+    IMODE_NORM2,
+    IMODE_DIFF2,
+    IMODE_NORM6,
+    IMODE_DIFF6,
+    IMODE_ROWSKIP,
+    IMODE_COLSKIP
+};
+/** @} */ //imode defines
+
 /** The VC1 Context
  * @todo Change size wherever another size is more efficient
  * Many members are only used for Advanced Profile
@@ -201,8 +216,9 @@ typedef struct VC1Context{
      */
     //@{
     int profile;          ///< 2 bits, Profile
-    int frmrtq_postproc;  ///< 3 bits
+    int frmrtq_postproc;  ///< 3 bits,
     int bitrtq_postproc;  ///< 5 bits, quantized framerate-based postprocessing strength
+    int max_coded_width, max_coded_height;
     int fastuvmc;         ///< Rounding of qpel vector to hpel ? (not in Simple)
     int extended_mv;      ///< Ext MV in P/B (not in Simple)
     int dquant;           ///< How qscale varies with MBs, 2 bits (not in Simple)
@@ -278,7 +294,7 @@ typedef struct VC1Context{
     uint8_t  aux_luty[2][256],  aux_lutuv[2][256];  ///< lookup tables used for intensity compensation
     uint8_t next_luty[2][256], next_lutuv[2][256];  ///< lookup tables used for intensity compensation
     uint8_t (*curr_luty)[256]  ,(*curr_lutuv)[256];
-    int last_use_ic, curr_use_ic, next_use_ic, aux_use_ic;
+    int last_use_ic, *curr_use_ic, next_use_ic, aux_use_ic;
     int rnd;                        ///< rounding control
 
     /** Frame decoding info for S/M profiles only */
@@ -329,7 +345,7 @@ typedef struct VC1Context{
     uint8_t fourmvbp;
     uint8_t* fieldtx_plane;
     int fieldtx_is_raw;
-    int8_t zzi_8x8[64];
+    uint8_t zzi_8x8[64];
     uint8_t *blk_mv_type_base, *blk_mv_type;    ///< 0: frame MV, 1: field MV (interlaced frame)
     uint8_t *mv_f_base, *mv_f[2];               ///< 0: MV obtained from same field, 1: opposite field
     uint8_t *mv_f_next_base, *mv_f_next[2];
diff --git a/libavcodec/vc1_block.c b/libavcodec/vc1_block.c
index 0e1018c..f9f26f7 100644
--- a/libavcodec/vc1_block.c
+++ b/libavcodec/vc1_block.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,8 +40,10 @@
 #define DC_VLC_BITS 9
 
 // offset tables for interlaced picture MVDATA decoding
-static const int offset_table1[9] = {  0,  1,  2,  4,  8, 16, 32,  64, 128 };
-static const int offset_table2[9] = {  0,  1,  3,  7, 15, 31, 63, 127, 255 };
+static const uint8_t offset_table[2][9] = {
+    {  0,  1,  2,  4,  8, 16, 32,  64, 128 },
+    {  0,  1,  3,  7, 15, 31, 63, 127, 255 },
+};
 
 /***********************************************************************/
 /**
@@ -50,22 +52,8 @@ static const int offset_table2[9] = {  0,  1,  3,  7, 15, 31, 63, 127, 255 };
  * @{
  */
 
-/**
- * Imode types
- * @{
- */
-enum Imode {
-    IMODE_RAW,
-    IMODE_NORM2,
-    IMODE_DIFF2,
-    IMODE_NORM6,
-    IMODE_DIFF6,
-    IMODE_ROWSKIP,
-    IMODE_COLSKIP
-};
-/** @} */ //imode defines
 
-static void init_block_index(VC1Context *v)
+static inline void init_block_index(VC1Context *v)
 {
     MpegEncContext *s = &v->s;
     ff_init_block_index(s);
@@ -111,12 +99,14 @@ static void vc1_put_signed_blocks_clamped(VC1Context *v)
             s->idsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][3],
                                               s->dest[0] - v_dist * s->linesize - 8,
                                               stride_y);
+            if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
             s->idsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][4],
                                               s->dest[1] - 8 * s->uvlinesize - 8,
                                               s->uvlinesize);
             s->idsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][5],
                                               s->dest[2] - 8 * s->uvlinesize - 8,
                                               s->uvlinesize);
+            }
         }
         if (s->mb_x == s->mb_width - 1) {
             top_mb_pos = (s->mb_y - 1) * s->mb_stride + s->mb_x;
@@ -136,12 +126,14 @@ static void vc1_put_signed_blocks_clamped(VC1Context *v)
             s->idsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][3],
                                               s->dest[0] - v_dist * s->linesize + 8,
                                               stride_y);
+            if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
             s->idsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][4],
                                               s->dest[1] - 8 * s->uvlinesize,
                                               s->uvlinesize);
             s->idsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][5],
                                               s->dest[2] - 8 * s->uvlinesize,
                                               s->uvlinesize);
+            }
         }
     }
 
@@ -230,33 +222,32 @@ static void vc1_put_signed_blocks_clamped(VC1Context *v)
         s->mb_intra = 1;                                                \
     } else {                                                            \
         index1 = index % 6;                                             \
-        if (!s->quarter_sample && index1 == 5) val = 1;                 \
-        else                                   val = 0;                 \
-        if (size_table[index1] - val > 0)                               \
-            val = get_bits(gb, size_table[index1] - val);               \
-        else                                   val = 0;                 \
-        sign = 0 - (val&1);                                             \
-        _dmv_x = (sign ^ ((val>>1) + offset_table[index1])) - sign;     \
+        _dmv_x = offset_table[1][index1];                               \
+        val = size_table[index1] - (!s->quarter_sample && index1 == 5); \
+        if (val > 0) {                                                  \
+            val = get_bits(gb, val);                                    \
+            sign = 0 - (val & 1);                                       \
+            _dmv_x = (sign ^ ((val >> 1) + _dmv_x)) - sign;             \
+        }                                                               \
                                                                         \
         index1 = index / 6;                                             \
-        if (!s->quarter_sample && index1 == 5) val = 1;                 \
-        else                                   val = 0;                 \
-        if (size_table[index1] - val > 0)                               \
-            val = get_bits(gb, size_table[index1] - val);               \
-        else                                   val = 0;                 \
-        sign = 0 - (val & 1);                                           \
-        _dmv_y = (sign ^ ((val >> 1) + offset_table[index1])) - sign;   \
+        _dmv_y = offset_table[1][index1];                               \
+        val = size_table[index1] - (!s->quarter_sample && index1 == 5); \
+        if (val > 0) {                                                  \
+            val = get_bits(gb, val);                                    \
+            sign = 0 - (val & 1);                                       \
+            _dmv_y = (sign ^ ((val >> 1) + _dmv_y)) - sign;             \
+        }                                                               \
     }
 
 static av_always_inline void get_mvdata_interlaced(VC1Context *v, int *dmv_x,
                                                    int *dmv_y, int *pred_flag)
 {
     int index, index1;
-    int extend_x = 0, extend_y = 0;
+    int extend_x, extend_y;
     GetBitContext *gb = &v->s.gb;
     int bits, esc;
     int val, sign;
-    const int* offs_tab;
 
     if (v->numref) {
         bits = VC1_2REF_MVDATA_VLC_BITS;
@@ -265,51 +256,32 @@ static av_always_inline void get_mvdata_interlaced(VC1Context *v, int *dmv_x,
         bits = VC1_1REF_MVDATA_VLC_BITS;
         esc  = 71;
     }
-    switch (v->dmvrange) {
-    case 1:
-        extend_x = 1;
-        break;
-    case 2:
-        extend_y = 1;
-        break;
-    case 3:
-        extend_x = extend_y = 1;
-        break;
-    }
+    extend_x = v->dmvrange & 1;
+    extend_y = (v->dmvrange >> 1) & 1;
     index = get_vlc2(gb, v->imv_vlc->table, bits, 3);
     if (index == esc) {
         *dmv_x = get_bits(gb, v->k_x);
         *dmv_y = get_bits(gb, v->k_y);
         if (v->numref) {
-            if (pred_flag) {
+            if (pred_flag)
                 *pred_flag = *dmv_y & 1;
-                *dmv_y     = (*dmv_y + *pred_flag) >> 1;
-            } else {
-                *dmv_y     = (*dmv_y + (*dmv_y & 1)) >> 1;
-            }
+            *dmv_y = (*dmv_y + (*dmv_y & 1)) >> 1;
         }
     }
     else {
-        if (extend_x)
-            offs_tab = offset_table2;
-        else
-            offs_tab = offset_table1;
+        av_assert0(index < esc);
         index1 = (index + 1) % 9;
         if (index1 != 0) {
             val    = get_bits(gb, index1 + extend_x);
-            sign   = 0 -(val & 1);
-            *dmv_x = (sign ^ ((val >> 1) + offs_tab[index1])) - sign;
+            sign   = 0 - (val & 1);
+            *dmv_x = (sign ^ ((val >> 1) + offset_table[extend_x][index1])) - sign;
         } else
             *dmv_x = 0;
-        if (extend_y)
-            offs_tab = offset_table2;
-        else
-            offs_tab = offset_table1;
         index1 = (index + 1) / 9;
         if (index1 > v->numref) {
-            val    = get_bits(gb, (index1 + (extend_y << v->numref)) >> v->numref);
+            val    = get_bits(gb, (index1 >> v->numref) + extend_y);
             sign   = 0 - (val & 1);
-            *dmv_y = (sign ^ ((val >> 1) + offs_tab[index1 >> v->numref])) - sign;
+            *dmv_y = (sign ^ ((val >> 1) + offset_table[extend_y][index1 >> v->numref])) - sign;
         } else
             *dmv_y = 0;
         if (v->numref && pred_flag)
@@ -420,6 +392,12 @@ static inline int ff_vc1_pred_dc(MpegEncContext *s, int overlap, int pq, int n,
     int q1, q2 = 0;
     int dqscale_index;
 
+    /* scale predictors if needed */
+    q1 = s->current_picture.qscale_table[mb_pos];
+    dqscale_index = s->y_dc_scale_table[q1] - 1;
+    if (dqscale_index < 0)
+        return 0;
+
     wrap = s->block_wrap[n];
     dc_val = s->dc_val[0] + s->block_index[n];
 
@@ -429,11 +407,7 @@ static inline int ff_vc1_pred_dc(MpegEncContext *s, int overlap, int pq, int n,
     c = dc_val[ - 1];
     b = dc_val[ - 1 - wrap];
     a = dc_val[ - wrap];
-    /* scale predictors if needed */
-    q1 = s->current_picture.qscale_table[mb_pos];
-    dqscale_index = s->y_dc_scale_table[q1] - 1;
-    if (dqscale_index < 0)
-        return 0;
+
     if (c_avail && (n != 1 && n != 3)) {
         q2 = s->current_picture.qscale_table[mb_pos - 1];
         if (q2 && q2 != q1)
@@ -455,20 +429,12 @@ static inline int ff_vc1_pred_dc(MpegEncContext *s, int overlap, int pq, int n,
             b = (b * s->y_dc_scale_table[q2] * ff_vc1_dqscale[dqscale_index] + 0x20000) >> 18;
     }
 
-    if (a_avail && c_avail) {
-        if (abs(a - b) <= abs(b - c)) {
-            pred     = c;
-            *dir_ptr = 1; // left
-        } else {
-            pred     = a;
-            *dir_ptr = 0; // top
-        }
+    if (c_avail && (!a_avail || abs(a - b) <= abs(b - c))) {
+        pred     = c;
+        *dir_ptr = 1; // left
     } else if (a_avail) {
         pred     = a;
         *dir_ptr = 0; // top
-    } else if (c_avail) {
-        pred     = c;
-        *dir_ptr = 1; // left
     } else {
         pred     = 0;
         *dir_ptr = 1; // left
@@ -527,17 +493,16 @@ static void vc1_decode_ac_coeff(VC1Context *v, int *last, int *skip,
                                 int *value, int codingset)
 {
     GetBitContext *gb = &v->s.gb;
-    int index, escape, run = 0, level = 0, lst = 0;
+    int index, run, level, lst, sign;
 
     index = get_vlc2(gb, ff_vc1_ac_coeff_table[codingset].table, AC_VLC_BITS, 3);
     if (index != ff_vc1_ac_sizes[codingset] - 1) {
         run   = vc1_index_decode_table[codingset][index][0];
         level = vc1_index_decode_table[codingset][index][1];
         lst   = index >= vc1_last_decode_table[codingset] || get_bits_left(gb) < 0;
-        if (get_bits1(gb))
-            level = -level;
+        sign  = get_bits1(gb);
     } else {
-        escape = decode210(gb);
+        int escape = decode210(gb);
         if (escape != 2) {
             index = get_vlc2(gb, ff_vc1_ac_coeff_table[codingset].table, AC_VLC_BITS, 3);
             run   = vc1_index_decode_table[codingset][index][0];
@@ -554,10 +519,8 @@ static void vc1_decode_ac_coeff(VC1Context *v, int *last, int *skip,
                 else
                     run += vc1_delta_run_table[codingset][level] + 1;
             }
-            if (get_bits1(gb))
-                level = -level;
+            sign = get_bits1(gb);
         } else {
-            int sign;
             lst = get_bits1(gb);
             if (v->s.esc3_level_length == 0) {
                 if (v->pq < 8 || v->dquantfrm) { // table 59
@@ -572,14 +535,12 @@ static void vc1_decode_ac_coeff(VC1Context *v, int *last, int *skip,
             run   = get_bits(gb, v->s.esc3_run_length);
             sign  = get_bits1(gb);
             level = get_bits(gb, v->s.esc3_level_length);
-            if (sign)
-                level = -level;
         }
     }
 
     *last  = lst;
     *skip  = run;
-    *value = level;
+    *value = (level ^ -sign) + sign;
 }
 
 /** Decode intra block in intra frames - should be faster than decode_intra_block
@@ -598,7 +559,7 @@ static int vc1_decode_i_block(VC1Context *v, int16_t block[64], int n,
     int i;
     int16_t *dc_val;
     int16_t *ac_val, *ac_val2;
-    int dcdiff;
+    int dcdiff, scale;
 
     /* Get DC differential */
     if (n < 4) {
@@ -611,16 +572,12 @@ static int vc1_decode_i_block(VC1Context *v, int16_t block[64], int n,
         return -1;
     }
     if (dcdiff) {
+        const int m = (v->pq == 1 || v->pq == 2) ? 3 - v->pq : 0;
         if (dcdiff == 119 /* ESC index value */) {
-            /* TODO: Optimize */
-            if (v->pq == 1)      dcdiff = get_bits(gb, 10);
-            else if (v->pq == 2) dcdiff = get_bits(gb, 9);
-            else                 dcdiff = get_bits(gb, 8);
+            dcdiff = get_bits(gb, 8 + m);
         } else {
-            if (v->pq == 1)
-                dcdiff = (dcdiff << 2) + get_bits(gb, 2) - 3;
-            else if (v->pq == 2)
-                dcdiff = (dcdiff << 1) + get_bits1(gb)   - 1;
+            if (m)
+                dcdiff = (dcdiff << m) + get_bits(gb, m) - ((1 << m) - 1);
         }
         if (get_bits1(gb))
             dcdiff = -dcdiff;
@@ -631,27 +588,29 @@ static int vc1_decode_i_block(VC1Context *v, int16_t block[64], int n,
     *dc_val = dcdiff;
 
     /* Store the quantized DC coeff, used for prediction */
-    if (n < 4) {
-        block[0] = dcdiff * s->y_dc_scale;
-    } else {
-        block[0] = dcdiff * s->c_dc_scale;
-    }
-    /* Skip ? */
-    if (!coded) {
-        goto not_coded;
-    }
+    if (n < 4)
+        scale = s->y_dc_scale;
+    else
+        scale = s->c_dc_scale;
+    block[0] = dcdiff * scale;
 
-    // AC Decoding
-    i = 1;
+    ac_val  = s->ac_val[0][0] + s->block_index[n] * 16;
+    ac_val2 = ac_val;
+    if (dc_pred_dir) // left
+        ac_val -= 16;
+    else // top
+        ac_val -= 16 * s->block_wrap[n];
+
+    scale = v->pq * 2 + v->halfpq;
+
+    //AC Decoding
+    i = !!coded;
 
-    {
+    if (coded) {
         int last = 0, skip, value;
         const uint8_t *zz_table;
-        int scale;
         int k;
 
-        scale = v->pq * 2 + v->halfpq;
-
         if (v->s.ac_pred) {
             if (!dc_pred_dir)
                 zz_table = v->zz_8x8[2];
@@ -660,13 +619,6 @@ static int vc1_decode_i_block(VC1Context *v, int16_t block[64], int n,
         } else
             zz_table = v->zz_8x8[1];
 
-        ac_val  = s->ac_val[0][0] + s->block_index[n] * 16;
-        ac_val2 = ac_val;
-        if (dc_pred_dir) // left
-            ac_val -= 16;
-        else // top
-            ac_val -= 16 * s->block_wrap[n];
-
         while (!last) {
             vc1_decode_ac_coeff(v, &last, &skip, &value, codingset);
             i += skip;
@@ -677,13 +629,15 @@ static int vc1_decode_i_block(VC1Context *v, int16_t block[64], int n,
 
         /* apply AC prediction if needed */
         if (s->ac_pred) {
+            int sh;
             if (dc_pred_dir) { // left
-                for (k = 1; k < 8; k++)
-                    block[k << v->left_blk_sh] += ac_val[k];
+                sh = v->left_blk_sh;
             } else { // top
-                for (k = 1; k < 8; k++)
-                    block[k << v->top_blk_sh] += ac_val[k + 8];
+                sh = v->top_blk_sh;
+                ac_val += 8;
             }
+            for (k = 1; k < 8; k++)
+                block[k << sh] += ac_val[k];
         }
         /* save AC coeffs for further prediction */
         for (k = 1; k < 8; k++) {
@@ -699,46 +653,30 @@ static int vc1_decode_i_block(VC1Context *v, int16_t block[64], int n,
                     block[k] += (block[k] < 0) ? -v->pq : v->pq;
             }
 
-        if (s->ac_pred) i = 63;
-    }
-
-not_coded:
-    if (!coded) {
-        int k, scale;
-        ac_val  = s->ac_val[0][0] + s->block_index[n] * 16;
-        ac_val2 = ac_val;
+    } else {
+        int k;
 
-        i = 0;
-        scale = v->pq * 2 + v->halfpq;
         memset(ac_val2, 0, 16 * 2);
-        if (dc_pred_dir) { // left
-            ac_val -= 16;
-            if (s->ac_pred)
-                memcpy(ac_val2, ac_val, 8 * 2);
-        } else { // top
-            ac_val -= 16 * s->block_wrap[n];
-            if (s->ac_pred)
-                memcpy(ac_val2 + 8, ac_val + 8, 8 * 2);
-        }
 
         /* apply AC prediction if needed */
         if (s->ac_pred) {
+            int sh;
             if (dc_pred_dir) { //left
-                for (k = 1; k < 8; k++) {
-                    block[k << v->left_blk_sh] = ac_val[k] * scale;
-                    if (!v->pquantizer && block[k << v->left_blk_sh])
-                        block[k << v->left_blk_sh] += (block[k << v->left_blk_sh] < 0) ? -v->pq : v->pq;
-                }
+                sh = v->left_blk_sh;
             } else { // top
-                for (k = 1; k < 8; k++) {
-                    block[k << v->top_blk_sh] = ac_val[k + 8] * scale;
-                    if (!v->pquantizer && block[k << v->top_blk_sh])
-                        block[k << v->top_blk_sh] += (block[k << v->top_blk_sh] < 0) ? -v->pq : v->pq;
-                }
+                sh = v->top_blk_sh;
+                ac_val  += 8;
+                ac_val2 += 8;
+            }
+            memcpy(ac_val2, ac_val, 8 * 2);
+            for (k = 1; k < 8; k++) {
+                block[k << sh] = ac_val[k] * scale;
+                if (!v->pquantizer && block[k << sh])
+                    block[k << sh] += (block[k << sh] < 0) ? -v->pq : v->pq;
             }
-            i = 63;
         }
     }
+    if (s->ac_pred) i = 63;
     s->block_last_index[n] = i;
 
     return 0;
@@ -759,7 +697,7 @@ static int vc1_decode_i_block_adv(VC1Context *v, int16_t block[64], int n,
     MpegEncContext *s = &v->s;
     int dc_pred_dir = 0; /* Direction of the DC prediction used */
     int i;
-    int16_t *dc_val;
+    int16_t *dc_val = NULL;
     int16_t *ac_val, *ac_val2;
     int dcdiff;
     int a_avail = v->a_avail, c_avail = v->c_avail;
@@ -779,16 +717,12 @@ static int vc1_decode_i_block_adv(VC1Context *v, int16_t block[64], int n,
         return -1;
     }
     if (dcdiff) {
+        const int m = (mquant == 1 || mquant == 2) ? 3 - mquant : 0;
         if (dcdiff == 119 /* ESC index value */) {
-            /* TODO: Optimize */
-            if (mquant == 1)      dcdiff = get_bits(gb, 10);
-            else if (mquant == 2) dcdiff = get_bits(gb, 9);
-            else                  dcdiff = get_bits(gb, 8);
+            dcdiff = get_bits(gb, 8 + m);
         } else {
-            if (mquant == 1)
-                dcdiff = (dcdiff << 2) + get_bits(gb, 2) - 3;
-            else if (mquant == 2)
-                dcdiff = (dcdiff << 1) + get_bits1(gb)   - 1;
+            if (m)
+                dcdiff = (dcdiff << m) + get_bits(gb, m) - ((1 << m) - 1);
         }
         if (get_bits1(gb))
             dcdiff = -dcdiff;
@@ -799,39 +733,42 @@ static int vc1_decode_i_block_adv(VC1Context *v, int16_t block[64], int n,
     *dc_val = dcdiff;
 
     /* Store the quantized DC coeff, used for prediction */
-    if (n < 4) {
-        block[0] = dcdiff * s->y_dc_scale;
-    } else {
-        block[0] = dcdiff * s->c_dc_scale;
-    }
-
-    //AC Decoding
-    i = 1;
+    if (n < 4)
+        scale = s->y_dc_scale;
+    else
+        scale = s->c_dc_scale;
+    block[0] = dcdiff * scale;
 
     /* check if AC is needed at all */
     if (!a_avail && !c_avail)
         use_pred = 0;
-    ac_val  = s->ac_val[0][0] + s->block_index[n] * 16;
-    ac_val2 = ac_val;
 
     scale = mquant * 2 + ((mquant == v->pq) ? v->halfpq : 0);
 
+    ac_val  = s->ac_val[0][0] + s->block_index[n] * 16;
+    ac_val2 = ac_val;
     if (dc_pred_dir) // left
         ac_val -= 16;
     else // top
         ac_val -= 16 * s->block_wrap[n];
 
     q1 = s->current_picture.qscale_table[mb_pos];
-    if (dc_pred_dir && c_avail && mb_pos)
-        q2 = s->current_picture.qscale_table[mb_pos - 1];
-    if (!dc_pred_dir && a_avail && mb_pos >= s->mb_stride)
-        q2 = s->current_picture.qscale_table[mb_pos - s->mb_stride];
-    if (dc_pred_dir && n == 1)
-        q2 = q1;
-    if (!dc_pred_dir && n == 2)
-        q2 = q1;
     if (n == 3)
         q2 = q1;
+    else if (dc_pred_dir) {
+        if (n == 1)
+            q2 = q1;
+        else if (c_avail && mb_pos)
+            q2 = s->current_picture.qscale_table[mb_pos - 1];
+    } else {
+        if (n == 2)
+            q2 = q1;
+        else if (a_avail && mb_pos >= s->mb_stride)
+            q2 = s->current_picture.qscale_table[mb_pos - s->mb_stride];
+    }
+
+    //AC Decoding
+    i = 1;
 
     if (coded) {
         int last = 0, skip, value;
@@ -864,28 +801,24 @@ static int vc1_decode_i_block_adv(VC1Context *v, int16_t block[64], int n,
 
         /* apply AC prediction if needed */
         if (use_pred) {
+            int sh;
+            if (dc_pred_dir) { // left
+                sh = v->left_blk_sh;
+            } else { // top
+                sh = v->top_blk_sh;
+                ac_val += 8;
+            }
             /* scale predictors if needed*/
             if (q2 && q1 != q2) {
                 q1 = q1 * 2 + ((q1 == v->pq) ? v->halfpq : 0) - 1;
-                q2 = q2 * 2 + ((q2 == v->pq) ? v->halfpq : 0) - 1;
-
                 if (q1 < 1)
                     return AVERROR_INVALIDDATA;
-                if (dc_pred_dir) { // left
-                    for (k = 1; k < 8; k++)
-                        block[k << v->left_blk_sh] += (ac_val[k] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
-                } else { // top
-                    for (k = 1; k < 8; k++)
-                        block[k << v->top_blk_sh] += (ac_val[k + 8] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
-                }
+                q2 = q2 * 2 + ((q2 == v->pq) ? v->halfpq : 0) - 1;
+                for (k = 1; k < 8; k++)
+                    block[k << sh] += (ac_val[k] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
             } else {
-                if (dc_pred_dir) { //left
-                    for (k = 1; k < 8; k++)
-                        block[k << v->left_blk_sh] += ac_val[k];
-                } else { //top
-                    for (k = 1; k < 8; k++)
-                        block[k << v->top_blk_sh] += ac_val[k + 8];
-                }
+                for (k = 1; k < 8; k++)
+                    block[k << sh] += ac_val[k];
             }
         }
         /* save AC coeffs for further prediction */
@@ -902,55 +835,38 @@ static int vc1_decode_i_block_adv(VC1Context *v, int16_t block[64], int n,
                     block[k] += (block[k] < 0) ? -mquant : mquant;
             }
 
-        if (use_pred) i = 63;
     } else { // no AC coeffs
         int k;
 
         memset(ac_val2, 0, 16 * 2);
-        if (dc_pred_dir) { // left
-            if (use_pred) {
-                memcpy(ac_val2, ac_val, 8 * 2);
-                if (q2 && q1 != q2) {
-                    q1 = q1 * 2 + ((q1 == v->pq) ? v->halfpq : 0) - 1;
-                    q2 = q2 * 2 + ((q2 == v->pq) ? v->halfpq : 0) - 1;
-                    if (q1 < 1)
-                        return AVERROR_INVALIDDATA;
-                    for (k = 1; k < 8; k++)
-                        ac_val2[k] = (ac_val2[k] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
-                }
-            }
-        } else { // top
-            if (use_pred) {
-                memcpy(ac_val2 + 8, ac_val + 8, 8 * 2);
-                if (q2 && q1 != q2) {
-                    q1 = q1 * 2 + ((q1 == v->pq) ? v->halfpq : 0) - 1;
-                    q2 = q2 * 2 + ((q2 == v->pq) ? v->halfpq : 0) - 1;
-                    if (q1 < 1)
-                        return AVERROR_INVALIDDATA;
-                    for (k = 1; k < 8; k++)
-                        ac_val2[k + 8] = (ac_val2[k + 8] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
-                }
-            }
-        }
 
         /* apply AC prediction if needed */
         if (use_pred) {
+            int sh;
             if (dc_pred_dir) { // left
-                for (k = 1; k < 8; k++) {
-                    block[k << v->left_blk_sh] = ac_val2[k] * scale;
-                    if (!v->pquantizer && block[k << v->left_blk_sh])
-                        block[k << v->left_blk_sh] += (block[k << v->left_blk_sh] < 0) ? -mquant : mquant;
-                }
+                sh = v->left_blk_sh;
             } else { // top
-                for (k = 1; k < 8; k++) {
-                    block[k << v->top_blk_sh] = ac_val2[k + 8] * scale;
-                    if (!v->pquantizer && block[k << v->top_blk_sh])
-                        block[k << v->top_blk_sh] += (block[k << v->top_blk_sh] < 0) ? -mquant : mquant;
-                }
+                sh = v->top_blk_sh;
+                ac_val  += 8;
+                ac_val2 += 8;
+            }
+            memcpy(ac_val2, ac_val, 8 * 2);
+            if (q2 && q1 != q2) {
+                q1 = q1 * 2 + ((q1 == v->pq) ? v->halfpq : 0) - 1;
+                q2 = q2 * 2 + ((q2 == v->pq) ? v->halfpq : 0) - 1;
+                if (q1 < 1)
+                    return AVERROR_INVALIDDATA;
+                for (k = 1; k < 8; k++)
+                    ac_val2[k] = (ac_val2[k] * q2 * ff_vc1_dqscale[q1 - 1] + 0x20000) >> 18;
+            }
+            for (k = 1; k < 8; k++) {
+                block[k << sh] = ac_val2[k] * scale;
+                if (!v->pquantizer && block[k << sh])
+                    block[k << sh] += (block[k << sh] < 0) ? -mquant : mquant;
             }
-            i = 63;
         }
     }
+    if (use_pred) i = 63;
     s->block_last_index[n] = i;
 
     return 0;
@@ -971,7 +887,7 @@ static int vc1_decode_intra_block(VC1Context *v, int16_t block[64], int n,
     MpegEncContext *s = &v->s;
     int dc_pred_dir = 0; /* Direction of the DC prediction used */
     int i;
-    int16_t *dc_val;
+    int16_t *dc_val = NULL;
     int16_t *ac_val, *ac_val2;
     int dcdiff;
     int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
@@ -983,7 +899,7 @@ static int vc1_decode_intra_block(VC1Context *v, int16_t block[64], int n,
     s->bdsp.clear_block(block);
 
     /* XXX: Guard against dumb values of mquant */
-    mquant = (mquant < 1) ? 0 : ((mquant > 31) ? 31 : mquant);
+    mquant = av_clip_uintp2(mquant, 5);
 
     /* Set DC scale - y and c use the same */
     s->y_dc_scale = s->y_dc_scale_table[mquant];
@@ -1000,16 +916,12 @@ static int vc1_decode_intra_block(VC1Context *v, int16_t block[64], int n,
         return -1;
     }
     if (dcdiff) {
+        const int m = (mquant == 1 || mquant == 2) ? 3 - mquant : 0;
         if (dcdiff == 119 /* ESC index value */) {
-            /* TODO: Optimize */
-            if (mquant == 1)      dcdiff = get_bits(gb, 10);
-            else if (mquant == 2) dcdiff = get_bits(gb, 9);
-            else                  dcdiff = get_bits(gb, 8);
+            dcdiff = get_bits(gb, 8 + m);
         } else {
-            if (mquant == 1)
-                dcdiff = (dcdiff << 2) + get_bits(gb, 2) - 3;
-            else if (mquant == 2)
-                dcdiff = (dcdiff << 1) + get_bits1(gb)   - 1;
+            if (m)
+                dcdiff = (dcdiff << m) + get_bits(gb, m) - ((1 << m) - 1);
         }
         if (get_bits1(gb))
             dcdiff = -dcdiff;
@@ -1333,8 +1245,7 @@ static int vc1_decode_p_block(VC1Context *v, int16_t block[64], int n,
 
 /** @} */ // Macroblock group
 
-static const int size_table  [6] = { 0, 2, 3, 4,  5,  8 };
-static const int offset_table[6] = { 0, 1, 3, 7, 15, 31 };
+static const uint8_t size_table[6] = { 0, 2, 3, 4,  5,  8 };
 
 /** Decode one P-frame MB
  */
@@ -1416,7 +1327,7 @@ static int vc1_decode_p_mb(VC1Context *v)
 
                     vc1_decode_intra_block(v, s->block[i], i, val, mquant,
                                            (i & 4) ? v->codingset2 : v->codingset);
-                    if ((i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
+                    if (CONFIG_GRAY && (i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                         continue;
                     v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
                     if (v->rangeredfrm)
@@ -1437,7 +1348,7 @@ static int vc1_decode_p_mb(VC1Context *v)
                 } else if (val) {
                     pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb, first_block,
                                              s->dest[dst_idx] + off, (i & 4) ? s->uvlinesize : s->linesize,
-                                             (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), &block_tt);
+                                             CONFIG_GRAY && (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), &block_tt);
                     block_cbp |= pat << (i << 2);
                     if (!v->ttmbf && ttmb < 8)
                         ttmb = -1;
@@ -1527,7 +1438,7 @@ static int vc1_decode_p_mb(VC1Context *v)
 
                     vc1_decode_intra_block(v, s->block[i], i, is_coded[i], mquant,
                                            (i & 4) ? v->codingset2 : v->codingset);
-                    if ((i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
+                    if (CONFIG_GRAY && (i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                         continue;
                     v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
                     if (v->rangeredfrm)
@@ -1549,7 +1460,7 @@ static int vc1_decode_p_mb(VC1Context *v)
                     pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
                                              first_block, s->dest[dst_idx] + off,
                                              (i & 4) ? s->uvlinesize : s->linesize,
-                                             (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY),
+                                             CONFIG_GRAY && (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY),
                                              &block_tt);
                     block_cbp |= pat << (i << 2);
                     if (!v->ttmbf && ttmb < 8)
@@ -1675,7 +1586,7 @@ static int vc1_decode_p_mb_intfr(VC1Context *v)
 
                 vc1_decode_intra_block(v, s->block[i], i, val, mquant,
                                        (i & 4) ? v->codingset2 : v->codingset);
-                if ((i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
+                if (CONFIG_GRAY && (i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                     continue;
                 v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
                 if (i < 4) {
@@ -1711,19 +1622,14 @@ static int vc1_decode_p_mb_intfr(VC1Context *v)
             dst_idx = 0;
             if (fourmv) {
                 mvbp = v->fourmvbp;
-                for (i = 0; i < 6; i++) {
-                    if (i < 4) {
-                        dmv_x = dmv_y = 0;
-                        val   = ((mvbp >> (3 - i)) & 1);
-                        if (val) {
-                            get_mvdata_interlaced(v, &dmv_x, &dmv_y, 0);
-                        }
-                        ff_vc1_pred_mv_intfr(v, i, dmv_x, dmv_y, 0, v->range_x, v->range_y, v->mb_type[0], 0);
-                        ff_vc1_mc_4mv_luma(v, i, 0, 0);
-                    } else if (i == 4) {
-                        ff_vc1_mc_4mv_chroma4(v, 0, 0, 0);
-                    }
+                for (i = 0; i < 4; i++) {
+                    dmv_x = dmv_y = 0;
+                    if (mvbp & (8 >> i))
+                        get_mvdata_interlaced(v, &dmv_x, &dmv_y, 0);
+                    ff_vc1_pred_mv_intfr(v, i, dmv_x, dmv_y, 0, v->range_x, v->range_y, v->mb_type[0], 0);
+                    ff_vc1_mc_4mv_luma(v, i, 0, 0);
                 }
+                ff_vc1_mc_4mv_chroma4(v, 0, 0, 0);
             } else if (twomv) {
                 mvbp  = v->twomvbp;
                 dmv_x = dmv_y = 0;
@@ -1767,7 +1673,7 @@ static int vc1_decode_p_mb_intfr(VC1Context *v)
                     pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
                                              first_block, s->dest[dst_idx] + off,
                                              (i & 4) ? s->uvlinesize : (s->linesize << fieldtx),
-                                             (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), &block_tt);
+                                             CONFIG_GRAY && (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), &block_tt);
                     block_cbp |= pat << (i << 2);
                     if (!v->ttmbf && ttmb < 8)
                         ttmb = -1;
@@ -1810,7 +1716,7 @@ static int vc1_decode_p_mb_intfi(VC1Context *v)
     int val; /* temp values */
     int first_block = 1;
     int dst_idx, off;
-    int pred_flag;
+    int pred_flag = 0;
     int block_cbp = 0, pat, block_tt = 0;
     int idx_mbmode = 0;
 
@@ -1846,7 +1752,7 @@ static int vc1_decode_p_mb_intfi(VC1Context *v)
 
             vc1_decode_intra_block(v, s->block[i], i, val, mquant,
                                    (i & 4) ? v->codingset2 : v->codingset);
-            if ((i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
+            if (CONFIG_GRAY && (i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                 continue;
             v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
             off  = (i & 4) ? 0 : ((i & 1) * 8 + (i & 2) * 4 * s->linesize);
@@ -1859,7 +1765,8 @@ static int vc1_decode_p_mb_intfi(VC1Context *v)
     } else {
         s->mb_intra = v->is_intra[s->mb_x] = 0;
         s->current_picture.mb_type[mb_pos + v->mb_off] = MB_TYPE_16x16;
-        for (i = 0; i < 6; i++) v->mb_type[0][s->block_index[i]] = 0;
+        for (i = 0; i < 6; i++)
+            v->mb_type[0][s->block_index[i]] = 0;
         if (idx_mbmode <= 5) { // 1-MV
             dmv_x = dmv_y = pred_flag = 0;
             if (idx_mbmode & 1) {
@@ -1870,18 +1777,14 @@ static int vc1_decode_p_mb_intfi(VC1Context *v)
             mb_has_coeffs = !(idx_mbmode & 2);
         } else { // 4-MV
             v->fourmvbp = get_vlc2(gb, v->fourmvbp_vlc->table, VC1_4MV_BLOCK_PATTERN_VLC_BITS, 1);
-            for (i = 0; i < 6; i++) {
-                if (i < 4) {
-                    dmv_x = dmv_y = pred_flag = 0;
-                    val   = ((v->fourmvbp >> (3 - i)) & 1);
-                    if (val) {
-                        get_mvdata_interlaced(v, &dmv_x, &dmv_y, &pred_flag);
-                    }
-                    ff_vc1_pred_mv(v, i, dmv_x, dmv_y, 0, v->range_x, v->range_y, v->mb_type[0], pred_flag, 0);
-                    ff_vc1_mc_4mv_luma(v, i, 0, 0);
-                } else if (i == 4)
-                    ff_vc1_mc_4mv_chroma(v, 0);
+            for (i = 0; i < 4; i++) {
+                dmv_x = dmv_y = pred_flag = 0;
+                if (v->fourmvbp & (8 >> i))
+                    get_mvdata_interlaced(v, &dmv_x, &dmv_y, &pred_flag);
+                ff_vc1_pred_mv(v, i, dmv_x, dmv_y, 0, v->range_x, v->range_y, v->mb_type[0], pred_flag, 0);
+                ff_vc1_mc_4mv_luma(v, i, 0, 0);
             }
+            ff_vc1_mc_4mv_chroma(v, 0);
             mb_has_coeffs = idx_mbmode & 1;
         }
         if (mb_has_coeffs)
@@ -1903,10 +1806,11 @@ static int vc1_decode_p_mb_intfi(VC1Context *v)
                 pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
                                          first_block, s->dest[dst_idx] + off,
                                          (i & 4) ? s->uvlinesize : s->linesize,
-                                         (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY),
+                                         CONFIG_GRAY && (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY),
                                          &block_tt);
                 block_cbp |= pat << (i << 2);
-                if (!v->ttmbf && ttmb < 8) ttmb = -1;
+                if (!v->ttmbf && ttmb < 8)
+                    ttmb = -1;
                 first_block = 0;
             }
         }
@@ -2049,7 +1953,7 @@ static void vc1_decode_b_mb(VC1Context *v)
 
             vc1_decode_intra_block(v, s->block[i], i, val, mquant,
                                    (i & 4) ? v->codingset2 : v->codingset);
-            if ((i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
+            if (CONFIG_GRAY && (i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                 continue;
             v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
             if (v->rangeredfrm)
@@ -2063,7 +1967,7 @@ static void vc1_decode_b_mb(VC1Context *v)
             vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
                                first_block, s->dest[dst_idx] + off,
                                (i & 4) ? s->uvlinesize : s->linesize,
-                               (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), NULL);
+                               CONFIG_GRAY && (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), NULL);
             if (!v->ttmbf && ttmb < 8)
                 ttmb = -1;
             first_block = 0;
@@ -2089,7 +1993,7 @@ static void vc1_decode_b_mb_intfi(VC1Context *v)
     int fwd;
     int dmv_x[2], dmv_y[2], pred_flag[2];
     int bmvtype = BMV_TYPE_BACKWARD;
-    int idx_mbmode, interpmvp;
+    int idx_mbmode;
 
     mquant      = v->pq; /* Lossy initialization */
     s->mb_intra = 0;
@@ -2124,7 +2028,7 @@ static void vc1_decode_b_mb_intfi(VC1Context *v)
 
             vc1_decode_intra_block(v, s->block[i], i, val, mquant,
                                    (i & 4) ? v->codingset2 : v->codingset);
-            if ((i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
+            if (CONFIG_GRAY && (i > 3) && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                 continue;
             v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
             if (v->rangeredfrm)
@@ -2140,12 +2044,14 @@ static void vc1_decode_b_mb_intfi(VC1Context *v)
     } else {
         s->mb_intra = v->is_intra[s->mb_x] = 0;
         s->current_picture.mb_type[mb_pos + v->mb_off] = MB_TYPE_16x16;
-        for (i = 0; i < 6; i++) v->mb_type[0][s->block_index[i]] = 0;
+        for (i = 0; i < 6; i++)
+            v->mb_type[0][s->block_index[i]] = 0;
         if (v->fmb_is_raw)
             fwd = v->forward_mb_plane[mb_pos] = get_bits1(gb);
         else
             fwd = v->forward_mb_plane[mb_pos];
         if (idx_mbmode <= 5) { // 1-MV
+            int interpmvp = 0;
             dmv_x[0]     = dmv_x[1] = dmv_y[0] = dmv_y[1] = 0;
             pred_flag[0] = pred_flag[1] = 0;
             if (fwd)
@@ -2168,12 +2074,16 @@ static void vc1_decode_b_mb_intfi(VC1Context *v)
             if (bmvtype != BMV_TYPE_DIRECT && idx_mbmode & 1) {
                 get_mvdata_interlaced(v, &dmv_x[bmvtype == BMV_TYPE_BACKWARD], &dmv_y[bmvtype == BMV_TYPE_BACKWARD], &pred_flag[bmvtype == BMV_TYPE_BACKWARD]);
             }
-            if (bmvtype == BMV_TYPE_INTERPOLATED && interpmvp) {
+            if (interpmvp) {
                 get_mvdata_interlaced(v, &dmv_x[1], &dmv_y[1], &pred_flag[1]);
             }
             if (bmvtype == BMV_TYPE_DIRECT) {
                 dmv_x[0] = dmv_y[0] = pred_flag[0] = 0;
                 dmv_x[1] = dmv_y[1] = pred_flag[0] = 0;
+                if (!s->next_picture_ptr->field_picture) {
+                    av_log(s->avctx, AV_LOG_ERROR, "Mixed field/frame direct mode not supported\n");
+                    return;
+                }
             }
             ff_vc1_pred_b_mv_intfi(v, 0, dmv_x, dmv_y, 1, pred_flag);
             vc1_b_mc(v, dmv_x, dmv_y, (bmvtype == BMV_TYPE_DIRECT), bmvtype);
@@ -2183,21 +2093,18 @@ static void vc1_decode_b_mb_intfi(VC1Context *v)
                 bmvtype = BMV_TYPE_FORWARD;
             v->bmvtype  = bmvtype;
             v->fourmvbp = get_vlc2(gb, v->fourmvbp_vlc->table, VC1_4MV_BLOCK_PATTERN_VLC_BITS, 1);
-            for (i = 0; i < 6; i++) {
-                if (i < 4) {
-                    dmv_x[0] = dmv_y[0] = pred_flag[0] = 0;
-                    dmv_x[1] = dmv_y[1] = pred_flag[1] = 0;
-                    val = ((v->fourmvbp >> (3 - i)) & 1);
-                    if (val) {
-                        get_mvdata_interlaced(v, &dmv_x[bmvtype == BMV_TYPE_BACKWARD],
-                                                 &dmv_y[bmvtype == BMV_TYPE_BACKWARD],
-                                             &pred_flag[bmvtype == BMV_TYPE_BACKWARD]);
-                    }
-                    ff_vc1_pred_b_mv_intfi(v, i, dmv_x, dmv_y, 0, pred_flag);
-                    ff_vc1_mc_4mv_luma(v, i, bmvtype == BMV_TYPE_BACKWARD, 0);
-                } else if (i == 4)
-                    ff_vc1_mc_4mv_chroma(v, bmvtype == BMV_TYPE_BACKWARD);
+            for (i = 0; i < 4; i++) {
+                dmv_x[0] = dmv_y[0] = pred_flag[0] = 0;
+                dmv_x[1] = dmv_y[1] = pred_flag[1] = 0;
+                if (v->fourmvbp & (8 >> i)) {
+                    get_mvdata_interlaced(v, &dmv_x[bmvtype == BMV_TYPE_BACKWARD],
+                                             &dmv_y[bmvtype == BMV_TYPE_BACKWARD],
+                                         &pred_flag[bmvtype == BMV_TYPE_BACKWARD]);
+                }
+                ff_vc1_pred_b_mv_intfi(v, i, dmv_x, dmv_y, 0, pred_flag);
+                ff_vc1_mc_4mv_luma(v, i, bmvtype == BMV_TYPE_BACKWARD, 0);
             }
+            ff_vc1_mc_4mv_chroma(v, bmvtype == BMV_TYPE_BACKWARD);
             mb_has_coeffs = idx_mbmode & 1;
         }
         if (mb_has_coeffs)
@@ -2219,7 +2126,7 @@ static void vc1_decode_b_mb_intfi(VC1Context *v)
                 vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
                                    first_block, s->dest[dst_idx] + off,
                                    (i & 4) ? s->uvlinesize : s->linesize,
-                                   (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), NULL);
+                                   CONFIG_GRAY && (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), NULL);
                 if (!v->ttmbf && ttmb < 8)
                     ttmb = -1;
                 first_block = 0;
@@ -2281,6 +2188,8 @@ static int vc1_decode_b_mb_intfr(VC1Context *v)
         direct = v->direct_mb_plane[mb_pos];
 
     if (direct) {
+        if (s->next_picture_ptr->field_picture)
+            av_log(s->avctx, AV_LOG_WARNING, "Mixed frame/field direct mode not supported\n");
         s->mv[0][0][0] = s->current_picture.motion_val[0][s->block_index[0]][0] = scale_mv(s->next_picture.motion_val[1][s->block_index[0]][0], v->bfraction, 0, s->quarter_sample);
         s->mv[0][0][1] = s->current_picture.motion_val[0][s->block_index[0]][1] = scale_mv(s->next_picture.motion_val[1][s->block_index[0]][1], v->bfraction, 0, s->quarter_sample);
         s->mv[1][0][0] = s->current_picture.motion_val[1][s->block_index[0]][0] = scale_mv(s->next_picture.motion_val[1][s->block_index[0]][0], v->bfraction, 1, s->quarter_sample);
@@ -2342,7 +2251,7 @@ static int vc1_decode_b_mb_intfr(VC1Context *v)
 
             vc1_decode_intra_block(v, s->block[i], i, val, mquant,
                                    (i & 4) ? v->codingset2 : v->codingset);
-            if (i > 3 && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
+            if (CONFIG_GRAY && i > 3 && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                 continue;
             v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
             if (i < 4) {
@@ -2508,7 +2417,7 @@ static int vc1_decode_b_mb_intfr(VC1Context *v)
                     pat = vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
                                              first_block, s->dest[dst_idx] + off,
                                              (i & 4) ? s->uvlinesize : (s->linesize << fieldtx),
-                                             (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), &block_tt);
+                                             CONFIG_GRAY && (i & 4) && (s->avctx->flags & AV_CODEC_FLAG_GRAY), &block_tt);
                     block_cbp |= pat << (i << 2);
                     if (!v->ttmbf && ttmb < 8)
                         ttmb = -1;
@@ -2651,7 +2560,7 @@ static void vc1_decode_i_blocks(VC1Context *v)
 
                 vc1_decode_i_block(v, s->block[k], k, val, (k < 4) ? v->codingset : v->codingset2);
 
-                if (k > 3 && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
+                if (CONFIG_GRAY && k > 3 && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                     continue;
                 v->vc1dsp.vc1_inv_trans_8x8(s->block[k]);
                 if (v->pq >= 9 && v->overlap) {
@@ -2675,7 +2584,7 @@ static void vc1_decode_i_blocks(VC1Context *v)
                 if (s->mb_x) {
                     v->vc1dsp.vc1_h_overlap(s->dest[0], s->linesize);
                     v->vc1dsp.vc1_h_overlap(s->dest[0] + 8 * s->linesize, s->linesize);
-                    if (!(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
+                    if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
                         v->vc1dsp.vc1_h_overlap(s->dest[1], s->uvlinesize);
                         v->vc1dsp.vc1_h_overlap(s->dest[2], s->uvlinesize);
                     }
@@ -2685,7 +2594,7 @@ static void vc1_decode_i_blocks(VC1Context *v)
                 if (!s->first_slice_line) {
                     v->vc1dsp.vc1_v_overlap(s->dest[0], s->linesize);
                     v->vc1dsp.vc1_v_overlap(s->dest[0] + 8, s->linesize);
-                    if (!(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
+                    if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
                         v->vc1dsp.vc1_v_overlap(s->dest[1], s->uvlinesize);
                         v->vc1dsp.vc1_v_overlap(s->dest[2], s->uvlinesize);
                     }
@@ -2814,7 +2723,7 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
                 vc1_decode_i_block_adv(v, block[k], k, val,
                                        (k < 4) ? v->codingset : v->codingset2, mquant);
 
-                if (k > 3 && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
+                if (CONFIG_GRAY && k > 3 && (s->avctx->flags & AV_CODEC_FLAG_GRAY))
                     continue;
                 v->vc1dsp.vc1_inv_trans_8x8(block[k]);
             }
@@ -2842,15 +2751,14 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
     /* raw bottom MB row */
     s->mb_x = 0;
     init_block_index(v);
-
-    for (;s->mb_x < s->mb_width; s->mb_x++) {
+    for (; s->mb_x < s->mb_width; s->mb_x++) {
         ff_update_block_index(s);
         vc1_put_signed_blocks_clamped(v);
         if (v->s.loop_filter)
             ff_vc1_loop_filter_iblk_delayed(v, v->pq);
     }
     if (v->s.loop_filter)
-        ff_mpeg_draw_horiz_band(s, (s->end_mb_y-1)*16, 16);
+        ff_mpeg_draw_horiz_band(s, (s->end_mb_y - 1) * 16, 16);
     ff_er_add_slice(&s->er, 0, s->start_mb_y << v->field_mode, s->mb_width - 1,
                     (s->end_mb_y << v->field_mode) - 1, ER_MB_END);
 }
@@ -2914,7 +2822,8 @@ static void vc1_decode_p_blocks(VC1Context *v)
         memmove(v->ttblk_base,    v->ttblk,    sizeof(v->ttblk_base[0])    * s->mb_stride);
         memmove(v->is_intra_base, v->is_intra, sizeof(v->is_intra_base[0]) * s->mb_stride);
         memmove(v->luma_mv_base,  v->luma_mv,  sizeof(v->luma_mv_base[0])  * s->mb_stride);
-        if (s->mb_y != s->start_mb_y) ff_mpeg_draw_horiz_band(s, (s->mb_y - 1) * 16, 16);
+        if (s->mb_y != s->start_mb_y)
+            ff_mpeg_draw_horiz_band(s, (s->mb_y - 1) * 16, 16);
         s->first_slice_line = 0;
     }
     if (apply_loop_filter) {
diff --git a/libavcodec/vc1_common.h b/libavcodec/vc1_common.h
index 788d324..b46c33f 100644
--- a/libavcodec/vc1_common.h
+++ b/libavcodec/vc1_common.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
 #include <stdint.h>
 
 #include "libavutil/attributes.h"
+#include "internal.h"
 
 /** Markers used in VC-1 AP frame data */
 //@{
@@ -57,12 +58,9 @@ enum Profile {
  */
 static av_always_inline const uint8_t* find_next_marker(const uint8_t *src, const uint8_t *end)
 {
-    uint32_t mrk = 0xFFFFFFFF;
-
-    if (end-src < 4)
-        return end;
-    while (src < end) {
-        mrk = (mrk << 8) | *src++;
+    if (end - src >= 4) {
+        uint32_t mrk = 0xFFFFFFFF;
+        src = avpriv_find_start_code(src, end, &mrk);
         if (IS_MARKER(mrk))
             return src - 4;
     }
diff --git a/libavcodec/vc1_loopfilter.c b/libavcodec/vc1_loopfilter.c
index 52cff1e..025776b 100644
--- a/libavcodec/vc1_loopfilter.c
+++ b/libavcodec/vc1_loopfilter.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -40,6 +40,7 @@ void ff_vc1_loop_filter_iblk(VC1Context *v, int pq)
         if (s->mb_x)
             v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 16 * s->linesize, s->linesize, pq);
         v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 16 * s->linesize + 8, s->linesize, pq);
+        if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY))
         for (j = 0; j < 2; j++) {
             v->vc1dsp.vc1_v_loop_filter8(s->dest[j + 1], s->uvlinesize, pq);
             if (s->mb_x)
@@ -51,8 +52,10 @@ void ff_vc1_loop_filter_iblk(VC1Context *v, int pq)
     if (s->mb_y == s->end_mb_y - 1) {
         if (s->mb_x) {
             v->vc1dsp.vc1_h_loop_filter16(s->dest[0], s->linesize, pq);
+            if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
             v->vc1dsp.vc1_h_loop_filter8(s->dest[1], s->uvlinesize, pq);
             v->vc1dsp.vc1_h_loop_filter8(s->dest[2], s->uvlinesize, pq);
+            }
         }
         v->vc1dsp.vc1_h_loop_filter16(s->dest[0] + 8, s->linesize, pq);
     }
@@ -73,6 +76,7 @@ void ff_vc1_loop_filter_iblk_delayed(VC1Context *v, int pq)
                 if (s->mb_x >= 2)
                     v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 32 * s->linesize - 16, s->linesize, pq);
                 v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 32 * s->linesize - 8, s->linesize, pq);
+                if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY))
                 for (j = 0; j < 2; j++) {
                     v->vc1dsp.vc1_v_loop_filter8(s->dest[j + 1] - 8 * s->uvlinesize - 8, s->uvlinesize, pq);
                     if (s->mb_x >= 2) {
@@ -90,6 +94,7 @@ void ff_vc1_loop_filter_iblk_delayed(VC1Context *v, int pq)
                 if (s->mb_x)
                     v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 32 * s->linesize, s->linesize, pq);
                 v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 32 * s->linesize + 8, s->linesize, pq);
+                if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY))
                 for (j = 0; j < 2; j++) {
                     v->vc1dsp.vc1_v_loop_filter8(s->dest[j + 1] - 8 * s->uvlinesize, s->uvlinesize, pq);
                     if (s->mb_x >= 2) {
@@ -105,7 +110,7 @@ void ff_vc1_loop_filter_iblk_delayed(VC1Context *v, int pq)
                 if (s->mb_x >= 2)
                     v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 16 * s->linesize - 16, s->linesize, pq);
                 v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 16 * s->linesize - 8, s->linesize, pq);
-                if (s->mb_x >= 2) {
+                if (s->mb_x >= 2 && (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY))) {
                     for (j = 0; j < 2; j++) {
                         v->vc1dsp.vc1_h_loop_filter8(s->dest[j + 1] - 8 * s->uvlinesize - 8, s->uvlinesize, pq);
                     }
@@ -116,7 +121,7 @@ void ff_vc1_loop_filter_iblk_delayed(VC1Context *v, int pq)
                 if (s->mb_x)
                     v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 16 * s->linesize, s->linesize, pq);
                 v->vc1dsp.vc1_h_loop_filter16(s->dest[0] - 16 * s->linesize + 8, s->linesize, pq);
-                if (s->mb_x) {
+                if (s->mb_x && (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY))) {
                     for (j = 0; j < 2; j++) {
                         v->vc1dsp.vc1_h_loop_filter8(s->dest[j + 1] - 8 * s->uvlinesize, s->uvlinesize, pq);
                     }
@@ -150,7 +155,7 @@ void ff_vc1_smooth_overlap_filter_iblk(VC1Context *v)
                                       v->block[v->cur_blk_idx][0]);
             v->vc1dsp.vc1_h_s_overlap(v->block[v->left_blk_idx][3],
                                       v->block[v->cur_blk_idx][2]);
-            if (!(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
+            if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
                 v->vc1dsp.vc1_h_s_overlap(v->block[v->left_blk_idx][4],
                                           v->block[v->cur_blk_idx][4]);
                 v->vc1dsp.vc1_h_s_overlap(v->block[v->left_blk_idx][5],
@@ -169,7 +174,7 @@ void ff_vc1_smooth_overlap_filter_iblk(VC1Context *v)
                                           v->block[v->cur_blk_idx][0]);
                 v->vc1dsp.vc1_v_s_overlap(v->block[v->top_blk_idx][3],
                                           v->block[v->cur_blk_idx][1]);
-                if (!(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
+                if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
                     v->vc1dsp.vc1_v_s_overlap(v->block[v->top_blk_idx][4],
                                               v->block[v->cur_blk_idx][4]);
                     v->vc1dsp.vc1_v_s_overlap(v->block[v->top_blk_idx][5],
@@ -189,7 +194,7 @@ void ff_vc1_smooth_overlap_filter_iblk(VC1Context *v)
                                       v->block[v->left_blk_idx][0]);
             v->vc1dsp.vc1_v_s_overlap(v->block[v->topleft_blk_idx][3],
                                       v->block[v->left_blk_idx][1]);
-            if (!(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
+            if (!CONFIG_GRAY || !(s->avctx->flags & AV_CODEC_FLAG_GRAY)) {
                 v->vc1dsp.vc1_v_s_overlap(v->block[v->topleft_blk_idx][4],
                                           v->block[v->left_blk_idx][4]);
                 v->vc1dsp.vc1_v_s_overlap(v->block[v->topleft_blk_idx][5],
@@ -209,7 +214,7 @@ static av_always_inline void vc1_apply_p_v_loop_filter(VC1Context *v, int block_
     int mb_cbp         = v->cbp[s->mb_x - s->mb_stride],
         block_cbp      = mb_cbp      >> (block_num * 4), bottom_cbp,
         mb_is_intra    = v->is_intra[s->mb_x - s->mb_stride],
-        block_is_intra = mb_is_intra >> (block_num * 4), bottom_is_intra;
+        block_is_intra = mb_is_intra >> block_num, bottom_is_intra;
     int idx, linesize  = block_num > 3 ? s->uvlinesize : s->linesize, ttblk;
     uint8_t *dst;
 
@@ -331,21 +336,22 @@ void ff_vc1_apply_p_loop_filter(VC1Context *v)
 {
     MpegEncContext *s = &v->s;
     int i;
+    int block_count = CONFIG_GRAY && (s->avctx->flags & AV_CODEC_FLAG_GRAY) ? 4 : 6;
 
-    for (i = 0; i < 6; i++) {
+    for (i = 0; i < block_count; i++) {
         vc1_apply_p_v_loop_filter(v, i);
     }
 
     /* V always precedes H, therefore we run H one MB before V;
      * at the end of a row, we catch up to complete the row */
     if (s->mb_x) {
-        for (i = 0; i < 6; i++) {
+        for (i = 0; i < block_count; i++) {
             vc1_apply_p_h_loop_filter(v, i);
         }
         if (s->mb_x == s->mb_width - 1) {
             s->mb_x++;
             ff_update_block_index(s);
-            for (i = 0; i < 6; i++) {
+            for (i = 0; i < block_count; i++) {
                 vc1_apply_p_h_loop_filter(v, i);
             }
         }
diff --git a/libavcodec/vc1_mc.c b/libavcodec/vc1_mc.c
index f4632d6..75c74ca 100644
--- a/libavcodec/vc1_mc.c
+++ b/libavcodec/vc1_mc.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,140 @@
 #include "mpegvideo.h"
 #include "vc1.h"
 
+static av_always_inline void vc1_scale_luma(uint8_t *srcY,
+                                            int k, int linesize)
+{ /* Rescale, in place, a k x k block of samples at srcY; consecutive rows are linesize bytes apart. */
+    int i, j;
+    for (j = 0; j < k; j++) {
+        for (i = 0; i < k; i++)
+            srcY[i] = ((srcY[i] - 128) >> 1) + 128; /* halve the sample's offset from 128 */
+        srcY += linesize; /* move to the next row */
+    }
+}
+
+static av_always_inline void vc1_scale_chroma(uint8_t *srcU, uint8_t *srcV,
+                                              int k, int uvlinesize)
+{ /* Rescale, in place, two k x k blocks (srcU and srcV) that share the row stride uvlinesize. */
+    int i, j;
+    for (j = 0; j < k; j++) {
+        for (i = 0; i < k; i++) {
+            srcU[i] = ((srcU[i] - 128) >> 1) + 128; /* halve the sample's offset from 128 */
+            srcV[i] = ((srcV[i] - 128) >> 1) + 128; /* same operation on the second plane */
+        }
+        srcU += uvlinesize; /* both planes advance one row together */
+        srcV += uvlinesize;
+    }
+}
+
+static av_always_inline void vc1_lut_scale_luma(uint8_t *srcY,
+                                                uint8_t *lut1, uint8_t *lut2,
+                                                int k, int linesize)
+{ /* Remap, in place, a k x k block through lookup tables: rows 0, 2, 4, ... use lut1, rows 1, 3, 5, ... use lut2. */
+    int i, j;
+
+    for (j = 0; j < k; j += 2) { /* each iteration handles a pair of rows */
+        for (i = 0; i < k; i++)
+            srcY[i] = lut1[srcY[i]]; /* the table is indexed by the current sample value */
+        srcY += linesize;
+
+        if (j + 1 == k) /* k is odd and the last row was just done: there is no partner row */
+            break;
+
+        for (i = 0; i < k; i++)
+            srcY[i] = lut2[srcY[i]];
+        srcY += linesize;
+    }
+}
+
+static av_always_inline void vc1_lut_scale_chroma(uint8_t *srcU, uint8_t *srcV,
+                                                  uint8_t *lut1, uint8_t *lut2,
+                                                  int k, int uvlinesize)
+{ /* Remap, in place, two k x k blocks (srcU, srcV): rows 0, 2, 4, ... use lut1, rows 1, 3, 5, ... use lut2. */
+    int i, j;
+
+    for (j = 0; j < k; j += 2) { /* each iteration handles a pair of rows */
+        for (i = 0; i < k; i++) {
+            srcU[i] = lut1[srcU[i]]; /* the table is indexed by the current sample value */
+            srcV[i] = lut1[srcV[i]];
+        }
+        srcU += uvlinesize; /* both planes advance one row together */
+        srcV += uvlinesize;
+
+        if (j + 1 == k) /* k is odd and the last row was just done: there is no partner row */
+            break;
+
+        for (i = 0; i < k; i++) {
+            srcU[i] = lut2[srcU[i]];
+            srcV[i] = lut2[srcV[i]];
+        }
+        srcU += uvlinesize;
+        srcV += uvlinesize;
+    }
+}
+
+static const uint8_t popcount4[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; /* popcount4[i] = number of set bits in the 4-bit value i */
+
+static av_always_inline int get_luma_mv(VC1Context *v, int dir, int16_t *tx, int16_t *ty)
+{ /* Derive one vector (*tx, *ty) from the four s->mv[dir][0..3] vectors, grouped by their v->mv_f[dir] flags; returns the number of set flags. */
+    MpegEncContext *s = &v->s;
+    int idx = v->mv_f[dir][s->block_index[0] + v->blocks_off] |
+             (v->mv_f[dir][s->block_index[1] + v->blocks_off] << 1) |
+             (v->mv_f[dir][s->block_index[2] + v->blocks_off] << 2) |
+             (v->mv_f[dir][s->block_index[3] + v->blocks_off] << 3); /* bit n = flag of block n; assumes each flag is 0 or 1 - TODO confirm */
+    static const uint8_t index2[16] = { 0, 0, 0, 0x23, 0, 0x13, 0x03, 0, 0, 0x12, 0x02, 0, 0x01, 0, 0, 0 };
+    int opp_count = popcount4[idx]; /* how many of the four flags are set */
+
+    switch (opp_count) {
+    case 0: /* no flag set ... */
+    case 4: /* ... or all four set: median4() of all four vectors, per component */
+        *tx = median4(s->mv[dir][0][0], s->mv[dir][1][0], s->mv[dir][2][0], s->mv[dir][3][0]);
+        *ty = median4(s->mv[dir][0][1], s->mv[dir][1][1], s->mv[dir][2][1], s->mv[dir][3][1]);
+        break;
+    case 1: /* one flag set: mid_pred() of the three blocks whose flag is clear; the comparisons on idx skip the set one */
+        *tx = mid_pred(s->mv[dir][idx < 2][0], s->mv[dir][1 + (idx < 4)][0], s->mv[dir][2 + (idx < 8)][0]);
+        *ty = mid_pred(s->mv[dir][idx < 2][1], s->mv[dir][1 + (idx < 4)][1], s->mv[dir][2 + (idx < 8)][1]);
+        break;
+    case 3: /* three flags set: mid_pred() of those three blocks; the comparisons on idx skip the clear one */
+        *tx = mid_pred(s->mv[dir][idx > 0xd][0], s->mv[dir][1 + (idx > 0xb)][0], s->mv[dir][2 + (idx > 0x7)][0]);
+        *ty = mid_pred(s->mv[dir][idx > 0xd][1], s->mv[dir][1 + (idx > 0xb)][1], s->mv[dir][2 + (idx > 0x7)][1]);
+        break;
+    case 2: /* two flags set: average the two blocks whose flag is clear; index2[idx] packs their indices, one per nibble */
+        *tx = (s->mv[dir][index2[idx] >> 4][0] + s->mv[dir][index2[idx] & 0xf][0]) / 2;
+        *ty = (s->mv[dir][index2[idx] >> 4][1] + s->mv[dir][index2[idx] & 0xf][1]) / 2;
+        break;
+    } /* popcount4[] only yields 0..4, so every value is handled above */
+    return opp_count;
+}
+
+static av_always_inline int get_chroma_mv(VC1Context *v, int dir, int16_t *tx, int16_t *ty)
+{ /* Derive one vector (*tx, *ty) from the s->mv[dir][n] of blocks whose v->mb_type[0] entry is zero; returns 4, 3, 2 or 0. */
+    MpegEncContext *s = &v->s;
+    int idx = !v->mb_type[0][s->block_index[0]] |
+             (!v->mb_type[0][s->block_index[1]] << 1) |
+             (!v->mb_type[0][s->block_index[2]] << 2) |
+             (!v->mb_type[0][s->block_index[3]] << 3); /* bit n set when block n's mb_type[0] entry is zero (presumably "not intra" - verify against caller) */
+    static const uint8_t index2[16] = { 0, 0, 0, 0x01, 0, 0x02, 0x12, 0, 0, 0x03, 0x13, 0, 0x23, 0, 0, 0 };
+    int valid_count = popcount4[idx]; /* how many of the four blocks have their bit set */
+
+    switch (valid_count) {
+    case 4: /* all four usable: median4() of all four vectors, per component */
+        *tx = median4(s->mv[dir][0][0], s->mv[dir][1][0], s->mv[dir][2][0], s->mv[dir][3][0]);
+        *ty = median4(s->mv[dir][0][1], s->mv[dir][1][1], s->mv[dir][2][1], s->mv[dir][3][1]);
+        break;
+    case 3: /* three usable: mid_pred() of those three; the comparisons on idx skip the block whose bit is clear */
+        *tx = mid_pred(s->mv[dir][idx > 0xd][0], s->mv[dir][1 + (idx > 0xb)][0], s->mv[dir][2 + (idx > 0x7)][0]);
+        *ty = mid_pred(s->mv[dir][idx > 0xd][1], s->mv[dir][1 + (idx > 0xb)][1], s->mv[dir][2 + (idx > 0x7)][1]);
+        break;
+    case 2: /* two usable: average them; index2[idx] packs the indices of the two set bits, one per nibble */
+        *tx = (s->mv[dir][index2[idx] >> 4][0] + s->mv[dir][index2[idx] & 0xf][0]) / 2;
+        *ty = (s->mv[dir][index2[idx] >> 4][1] + s->mv[dir][index2[idx] & 0xf][1]) / 2;
+        break;
+    default: /* zero or one usable block: *tx and *ty are left unwritten */
+        return 0;
+    }
+    return valid_count;
+}
+
 /** Do motion compensation over 1 macroblock
  * Mostly adapted hpel_motion and qpel_motion from mpegvideo.c
  */
@@ -85,7 +219,7 @@ void ff_vc1_mc_1mv(VC1Context *v, int dir)
             srcV = s->current_picture.f->data[2];
             luty  = v->curr_luty;
             lutuv = v->curr_lutuv;
-            use_ic = v->curr_use_ic;
+            use_ic = *v->curr_use_ic;
         } else {
             srcY = s->last_picture.f->data[0];
             srcU = s->last_picture.f->data[1];
@@ -136,7 +270,7 @@ void ff_vc1_mc_1mv(VC1Context *v, int dir)
     }
 
     /* for grayscale we should not try to read from unknown area */
-    if (s->avctx->flags & AV_CODEC_FLAG_GRAY) {
+    if (CONFIG_GRAY && s->avctx->flags & AV_CODEC_FLAG_GRAY) {
         srcU = s->sc.edge_emu_buffer + 18 * s->linesize;
         srcV = s->sc.edge_emu_buffer + 18 * s->linesize;
     }
@@ -145,81 +279,51 @@ void ff_vc1_mc_1mv(VC1Context *v, int dir)
         || s->h_edge_pos < 22 || v_edge_pos < 22
         || (unsigned)(src_x - s->mspel) > s->h_edge_pos - (mx&3) - 16 - s->mspel * 3
         || (unsigned)(src_y - 1)        > v_edge_pos    - (my&3) - 16 - 3) {
-        uint8_t *uvbuf = s->sc.edge_emu_buffer + 19 * s->linesize;
+        uint8_t *ubuf = s->sc.edge_emu_buffer + 19 * s->linesize;
+        uint8_t *vbuf = ubuf + 9 * s->uvlinesize;
+        const int k = 17 + s->mspel * 2;
 
         srcY -= s->mspel * (1 + s->linesize);
         s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, srcY,
                                  s->linesize, s->linesize,
-                                 17 + s->mspel * 2, 17 + s->mspel * 2,
+                                 k, k,
                                  src_x - s->mspel, src_y - s->mspel,
                                  s->h_edge_pos, v_edge_pos);
         srcY = s->sc.edge_emu_buffer;
-        s->vdsp.emulated_edge_mc(uvbuf, srcU,
+        s->vdsp.emulated_edge_mc(ubuf, srcU,
                                  s->uvlinesize, s->uvlinesize,
                                  8 + 1, 8 + 1,
-                                 uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, v_edge_pos >> 1);
-        s->vdsp.emulated_edge_mc(uvbuf + 16, srcV,
+                                 uvsrc_x, uvsrc_y,
+                                 s->h_edge_pos >> 1, v_edge_pos >> 1);
+        s->vdsp.emulated_edge_mc(vbuf, srcV,
                                  s->uvlinesize, s->uvlinesize,
                                  8 + 1, 8 + 1,
-                                 uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, v_edge_pos >> 1);
-        srcU = uvbuf;
-        srcV = uvbuf + 16;
+                                 uvsrc_x, uvsrc_y,
+                                 s->h_edge_pos >> 1, v_edge_pos >> 1);
+        srcU = ubuf;
+        srcV = vbuf;
         /* if we deal with range reduction we need to scale source blocks */
         if (v->rangeredfrm) {
-            int i, j;
-            uint8_t *src, *src2;
-
-            src = srcY;
-            for (j = 0; j < 17 + s->mspel * 2; j++) {
-                for (i = 0; i < 17 + s->mspel * 2; i++)
-                    src[i] = ((src[i] - 128) >> 1) + 128;
-                src += s->linesize;
-            }
-            src  = srcU;
-            src2 = srcV;
-            for (j = 0; j < 9; j++) {
-                for (i = 0; i < 9; i++) {
-                    src[i]  = ((src[i]  - 128) >> 1) + 128;
-                    src2[i] = ((src2[i] - 128) >> 1) + 128;
-                }
-                src  += s->uvlinesize;
-                src2 += s->uvlinesize;
-            }
+            vc1_scale_luma(srcY, k, s->linesize);
+            vc1_scale_chroma(srcU, srcV, 9, s->uvlinesize);
         }
         /* if we deal with intensity compensation we need to scale source blocks */
         if (use_ic) {
-            int i, j;
-            uint8_t *src, *src2;
-
-            src = srcY;
-            for (j = 0; j < 17 + s->mspel * 2; j++) {
-                int f = v->field_mode ? v->ref_field_type[dir] : ((j + src_y - s->mspel) & 1) ;
-                for (i = 0; i < 17 + s->mspel * 2; i++)
-                    src[i] = luty[f][src[i]];
-                src += s->linesize;
-            }
-            src  = srcU;
-            src2 = srcV;
-            for (j = 0; j < 9; j++) {
-                int f = v->field_mode ? v->ref_field_type[dir] : ((j + uvsrc_y) & 1);
-                for (i = 0; i < 9; i++) {
-                    src[i]  = lutuv[f][src[i]];
-                    src2[i] = lutuv[f][src2[i]];
-                }
-                src  += s->uvlinesize;
-                src2 += s->uvlinesize;
-            }
+            vc1_lut_scale_luma(srcY,
+                               luty[v->field_mode ? v->ref_field_type[dir] : ((0 + src_y - s->mspel) & 1)],
+                               luty[v->field_mode ? v->ref_field_type[dir] : ((1 + src_y - s->mspel) & 1)],
+                               k, s->linesize);
+            vc1_lut_scale_chroma(srcU, srcV,
+                                 lutuv[v->field_mode ? v->ref_field_type[dir] : ((0 + uvsrc_y) & 1)],
+                                 lutuv[v->field_mode ? v->ref_field_type[dir] : ((1 + uvsrc_y) & 1)],
+                                 9, s->uvlinesize);
         }
         srcY += s->mspel * (1 + s->linesize);
     }
 
     if (s->mspel) {
         dxy = ((my & 3) << 2) | (mx & 3);
-        v->vc1dsp.put_vc1_mspel_pixels_tab[dxy](s->dest[0]    , srcY    , s->linesize, v->rnd);
-        v->vc1dsp.put_vc1_mspel_pixels_tab[dxy](s->dest[0] + 8, srcY + 8, s->linesize, v->rnd);
-        srcY += s->linesize * 8;
-        v->vc1dsp.put_vc1_mspel_pixels_tab[dxy](s->dest[0] + 8 * s->linesize    , srcY    , s->linesize, v->rnd);
-        v->vc1dsp.put_vc1_mspel_pixels_tab[dxy](s->dest[0] + 8 * s->linesize + 8, srcY + 8, s->linesize, v->rnd);
+        v->vc1dsp.put_vc1_mspel_pixels_tab[0][dxy](s->dest[0], srcY, s->linesize, v->rnd);
     } else { // hpel mc - always used for luma
         dxy = (my & 2) | ((mx & 2) >> 1);
         if (!v->rnd)
@@ -228,7 +332,7 @@ void ff_vc1_mc_1mv(VC1Context *v, int dir)
             s->hdsp.put_no_rnd_pixels_tab[0][dxy](s->dest[0], srcY, s->linesize, 16);
     }
 
-    if (s->avctx->flags & AV_CODEC_FLAG_GRAY)
+    if (CONFIG_GRAY && s->avctx->flags & AV_CODEC_FLAG_GRAY)
         return;
     /* Chroma MC always uses qpel bilinear */
     uvmx = (uvmx & 3) << 1;
@@ -242,17 +346,6 @@ void ff_vc1_mc_1mv(VC1Context *v, int dir)
     }
 }
 
-static inline int median4(int a, int b, int c, int d)
-{
-    if (a < b) {
-        if (c < d) return (FFMIN(b, d) + FFMAX(a, c)) / 2;
-        else       return (FFMIN(b, c) + FFMAX(a, d)) / 2;
-    } else {
-        if (c < d) return (FFMIN(a, d) + FFMAX(b, c)) / 2;
-        else       return (FFMIN(a, c) + FFMAX(b, d)) / 2;
-    }
-}
-
 /** Do motion compensation for 4-MV macroblock - luminance block
  */
 void ff_vc1_mc_4mv_luma(VC1Context *v, int n, int dir, int avg)
@@ -278,7 +371,7 @@ void ff_vc1_mc_4mv_luma(VC1Context *v, int n, int dir, int avg)
         if (v->field_mode && (v->cur_field_type != v->ref_field_type[dir]) && v->second_field) {
             srcY = s->current_picture.f->data[0];
             luty = v->curr_luty;
-            use_ic = v->curr_use_ic;
+            use_ic = *v->curr_use_ic;
         } else {
             srcY = s->last_picture.f->data[0];
             luty = v->last_luty;
@@ -301,35 +394,10 @@ void ff_vc1_mc_4mv_luma(VC1Context *v, int n, int dir, int avg)
     }
 
     if (s->pict_type == AV_PICTURE_TYPE_P && n == 3 && v->field_mode) {
-        int same_count = 0, opp_count = 0, k;
-        int chosen_mv[2][4][2], f;
-        int tx = 0, ty = 0;
-        for (k = 0; k < 4; k++) {
-            f = v->mv_f[0][s->block_index[k] + v->blocks_off];
-            chosen_mv[f][f ? opp_count : same_count][0] = s->mv[0][k][0];
-            chosen_mv[f][f ? opp_count : same_count][1] = s->mv[0][k][1];
-            opp_count  += f;
-            same_count += 1 - f;
-        }
-        f = opp_count > same_count;
-        switch (f ? opp_count : same_count) {
-        case 4:
-            tx = median4(chosen_mv[f][0][0], chosen_mv[f][1][0],
-                         chosen_mv[f][2][0], chosen_mv[f][3][0]);
-            ty = median4(chosen_mv[f][0][1], chosen_mv[f][1][1],
-                         chosen_mv[f][2][1], chosen_mv[f][3][1]);
-            break;
-        case 3:
-            tx = mid_pred(chosen_mv[f][0][0], chosen_mv[f][1][0], chosen_mv[f][2][0]);
-            ty = mid_pred(chosen_mv[f][0][1], chosen_mv[f][1][1], chosen_mv[f][2][1]);
-            break;
-        case 2:
-            tx = (chosen_mv[f][0][0] + chosen_mv[f][1][0]) / 2;
-            ty = (chosen_mv[f][0][1] + chosen_mv[f][1][1]) / 2;
-            break;
-        }
-        s->current_picture.motion_val[1][s->block_index[0] + v->blocks_off][0] = tx;
-        s->current_picture.motion_val[1][s->block_index[0] + v->blocks_off][1] = ty;
+        int opp_count = get_luma_mv(v, 0,
+                                    &s->current_picture.motion_val[1][s->block_index[0] + v->blocks_off][0],
+                                    &s->current_picture.motion_val[1][s->block_index[0] + v->blocks_off][1]);
+        int k, f = opp_count > 2;
         for (k = 0; k < 4; k++)
             v->mv_f[1][s->block_index[k] + v->blocks_off] = f;
     }
@@ -385,46 +453,36 @@ void ff_vc1_mc_4mv_luma(VC1Context *v, int n, int dir, int avg)
     if (v->field_mode && v->ref_field_type[dir])
         srcY += s->current_picture_ptr->f->linesize[0];
 
-    if (fieldmv && !(src_y & 1))
-        v_edge_pos--;
-    if (fieldmv && (src_y & 1) && src_y < 4)
-        src_y--;
+    if (fieldmv) {
+        if (!(src_y & 1))
+            v_edge_pos--;
+        else
+            src_y -= (src_y < 4);
+    }
     if (v->rangeredfrm || use_ic
         || s->h_edge_pos < 13 || v_edge_pos < 23
         || (unsigned)(src_x - s->mspel) > s->h_edge_pos - (mx & 3) - 8 - s->mspel * 2
         || (unsigned)(src_y - (s->mspel << fieldmv)) > v_edge_pos - (my & 3) - ((8 + s->mspel * 2) << fieldmv)) {
+        const int k = 9 + s->mspel * 2;
+
         srcY -= s->mspel * (1 + (s->linesize << fieldmv));
         /* check emulate edge stride and offset */
         s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, srcY,
                                  s->linesize, s->linesize,
-                                 9 + s->mspel * 2, (9 + s->mspel * 2) << fieldmv,
+                                 k, k << fieldmv,
                                  src_x - s->mspel, src_y - (s->mspel << fieldmv),
                                  s->h_edge_pos, v_edge_pos);
         srcY = s->sc.edge_emu_buffer;
         /* if we deal with range reduction we need to scale source blocks */
         if (v->rangeredfrm) {
-            int i, j;
-            uint8_t *src;
-
-            src = srcY;
-            for (j = 0; j < 9 + s->mspel * 2; j++) {
-                for (i = 0; i < 9 + s->mspel * 2; i++)
-                    src[i] = ((src[i] - 128) >> 1) + 128;
-                src += s->linesize << fieldmv;
-            }
+            vc1_scale_luma(srcY, k, s->linesize << fieldmv);
         }
         /* if we deal with intensity compensation we need to scale source blocks */
         if (use_ic) {
-            int i, j;
-            uint8_t *src;
-
-            src = srcY;
-            for (j = 0; j < 9 + s->mspel * 2; j++) {
-                int f = v->field_mode ? v->ref_field_type[dir] : (((j<<fieldmv)+src_y - (s->mspel << fieldmv)) & 1);
-                for (i = 0; i < 9 + s->mspel * 2; i++)
-                    src[i] = luty[f][src[i]];
-                src += s->linesize << fieldmv;
-            }
+            vc1_lut_scale_luma(srcY,
+                               luty[v->field_mode ? v->ref_field_type[dir] : (((0<<fieldmv)+src_y - (s->mspel << fieldmv)) & 1)],
+                               luty[v->field_mode ? v->ref_field_type[dir] : (((1<<fieldmv)+src_y - (s->mspel << fieldmv)) & 1)],
+                               k, s->linesize << fieldmv);
         }
         srcY += s->mspel * (1 + (s->linesize << fieldmv));
     }
@@ -432,9 +490,9 @@ void ff_vc1_mc_4mv_luma(VC1Context *v, int n, int dir, int avg)
     if (s->mspel) {
         dxy = ((my & 3) << 2) | (mx & 3);
         if (avg)
-            v->vc1dsp.avg_vc1_mspel_pixels_tab[dxy](s->dest[0] + off, srcY, s->linesize << fieldmv, v->rnd);
+            v->vc1dsp.avg_vc1_mspel_pixels_tab[1][dxy](s->dest[0] + off, srcY, s->linesize << fieldmv, v->rnd);
         else
-            v->vc1dsp.put_vc1_mspel_pixels_tab[dxy](s->dest[0] + off, srcY, s->linesize << fieldmv, v->rnd);
+            v->vc1dsp.put_vc1_mspel_pixels_tab[1][dxy](s->dest[0] + off, srcY, s->linesize << fieldmv, v->rnd);
     } else { // hpel mc - always used for luma
         dxy = (my & 2) | ((mx & 2) >> 1);
         if (!v->rnd)
@@ -444,59 +502,6 @@ void ff_vc1_mc_4mv_luma(VC1Context *v, int n, int dir, int avg)
     }
 }
 
-static av_always_inline int get_chroma_mv(int *mvx, int *mvy, int *a, int flag, int *tx, int *ty)
-{
-    int idx, i;
-    static const int count[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
-
-    idx =  ((a[3] != flag) << 3)
-         | ((a[2] != flag) << 2)
-         | ((a[1] != flag) << 1)
-         |  (a[0] != flag);
-    if (!idx) {
-        *tx = median4(mvx[0], mvx[1], mvx[2], mvx[3]);
-        *ty = median4(mvy[0], mvy[1], mvy[2], mvy[3]);
-        return 4;
-    } else if (count[idx] == 1) {
-        switch (idx) {
-        case 0x1:
-            *tx = mid_pred(mvx[1], mvx[2], mvx[3]);
-            *ty = mid_pred(mvy[1], mvy[2], mvy[3]);
-            return 3;
-        case 0x2:
-            *tx = mid_pred(mvx[0], mvx[2], mvx[3]);
-            *ty = mid_pred(mvy[0], mvy[2], mvy[3]);
-            return 3;
-        case 0x4:
-            *tx = mid_pred(mvx[0], mvx[1], mvx[3]);
-            *ty = mid_pred(mvy[0], mvy[1], mvy[3]);
-            return 3;
-        case 0x8:
-            *tx = mid_pred(mvx[0], mvx[1], mvx[2]);
-            *ty = mid_pred(mvy[0], mvy[1], mvy[2]);
-            return 3;
-        }
-    } else if (count[idx] == 2) {
-        int t1 = 0, t2 = 0;
-        for (i = 0; i < 3; i++)
-            if (!a[i]) {
-                t1 = i;
-                break;
-            }
-        for (i = t1 + 1; i < 4; i++)
-            if (!a[i]) {
-                t2 = i;
-                break;
-            }
-        *tx = (mvx[t1] + mvx[t2]) / 2;
-        *ty = (mvy[t1] + mvy[t2]) / 2;
-        return 2;
-    } else {
-        return 0;
-    }
-    return -1;
-}
-
 /** Do motion compensation for 4-MV macroblock - both chroma blocks
  */
 void ff_vc1_mc_4mv_chroma(VC1Context *v, int dir)
@@ -505,44 +510,30 @@ void ff_vc1_mc_4mv_chroma(VC1Context *v, int dir)
     H264ChromaContext *h264chroma = &v->h264chroma;
     uint8_t *srcU, *srcV;
     int uvmx, uvmy, uvsrc_x, uvsrc_y;
-    int k, tx = 0, ty = 0;
-    int mvx[4], mvy[4], intra[4], mv_f[4];
-    int valid_count;
-    int chroma_ref_type = v->cur_field_type;
+    int16_t tx, ty;
+    int chroma_ref_type;
     int v_edge_pos = s->v_edge_pos >> v->field_mode;
     uint8_t (*lutuv)[256];
     int use_ic;
 
     if (!v->field_mode && !v->s.last_picture.f->data[0])
         return;
-    if (s->avctx->flags & AV_CODEC_FLAG_GRAY)
+    if (CONFIG_GRAY && s->avctx->flags & AV_CODEC_FLAG_GRAY)
         return;
 
-    for (k = 0; k < 4; k++) {
-        mvx[k] = s->mv[dir][k][0];
-        mvy[k] = s->mv[dir][k][1];
-        intra[k] = v->mb_type[0][s->block_index[k]];
-        if (v->field_mode)
-            mv_f[k] = v->mv_f[dir][s->block_index[k] + v->blocks_off];
-    }
-
     /* calculate chroma MV vector from four luma MVs */
-    if (!v->field_mode || (v->field_mode && !v->numref)) {
-        valid_count = get_chroma_mv(mvx, mvy, intra, 0, &tx, &ty);
-        chroma_ref_type = v->reffield;
+    if (!v->field_mode || !v->numref) {
+        int valid_count = get_chroma_mv(v, dir, &tx, &ty);
         if (!valid_count) {
             s->current_picture.motion_val[1][s->block_index[0] + v->blocks_off][0] = 0;
             s->current_picture.motion_val[1][s->block_index[0] + v->blocks_off][1] = 0;
             v->luma_mv[s->mb_x][0] = v->luma_mv[s->mb_x][1] = 0;
             return; //no need to do MC for intra blocks
         }
+        chroma_ref_type = v->ref_field_type[dir];
     } else {
-        int dominant = 0;
-        if (mv_f[0] + mv_f[1] + mv_f[2] + mv_f[3] > 2)
-            dominant = 1;
-        valid_count = get_chroma_mv(mvx, mvy, mv_f, dominant, &tx, &ty);
-        if (dominant)
-            chroma_ref_type = !v->cur_field_type;
+        int opp_count = get_luma_mv(v, dir, &tx, &ty);
+        chroma_ref_type = v->cur_field_type ^ (opp_count > 2);
     }
     if (v->field_mode && chroma_ref_type == 1 && v->cur_field_type == 1 && !v->s.last_picture.f->data[0])
         return;
@@ -578,7 +569,7 @@ void ff_vc1_mc_4mv_chroma(VC1Context *v, int dir)
             srcU = s->current_picture.f->data[1];
             srcV = s->current_picture.f->data[2];
             lutuv = v->curr_lutuv;
-            use_ic = v->curr_use_ic;
+            use_ic = *v->curr_use_ic;
         } else {
             srcU = s->last_picture.f->data[1];
             srcV = s->last_picture.f->data[2];
@@ -624,36 +615,14 @@ void ff_vc1_mc_4mv_chroma(VC1Context *v, int dir)
 
         /* if we deal with range reduction we need to scale source blocks */
         if (v->rangeredfrm) {
-            int i, j;
-            uint8_t *src, *src2;
-
-            src  = srcU;
-            src2 = srcV;
-            for (j = 0; j < 9; j++) {
-                for (i = 0; i < 9; i++) {
-                    src[i]  = ((src[i]  - 128) >> 1) + 128;
-                    src2[i] = ((src2[i] - 128) >> 1) + 128;
-                }
-                src  += s->uvlinesize;
-                src2 += s->uvlinesize;
-            }
+            vc1_scale_chroma(srcU, srcV, 9, s->uvlinesize);
         }
         /* if we deal with intensity compensation we need to scale source blocks */
         if (use_ic) {
-            int i, j;
-            uint8_t *src, *src2;
-
-            src  = srcU;
-            src2 = srcV;
-            for (j = 0; j < 9; j++) {
-                int f = v->field_mode ? chroma_ref_type : ((j + uvsrc_y) & 1);
-                for (i = 0; i < 9; i++) {
-                    src[i]  = lutuv[f][src[i]];
-                    src2[i] = lutuv[f][src2[i]];
-                }
-                src  += s->uvlinesize;
-                src2 += s->uvlinesize;
-            }
+            vc1_lut_scale_chroma(srcU, srcV,
+                                 lutuv[v->field_mode ? chroma_ref_type : ((0 + uvsrc_y) & 1)],
+                                 lutuv[v->field_mode ? chroma_ref_type : ((1 + uvsrc_y) & 1)],
+                                 9, s->uvlinesize);
         }
     }
 
@@ -680,13 +649,13 @@ void ff_vc1_mc_4mv_chroma4(VC1Context *v, int dir, int dir2, int avg)
     int uvmx_field[4], uvmy_field[4];
     int i, off, tx, ty;
     int fieldmv = v->blk_mv_type[s->block_index[0]];
-    static const int s_rndtblfield[16] = { 0, 0, 1, 2, 4, 4, 5, 6, 2, 2, 3, 8, 6, 6, 7, 12 };
+    static const uint8_t s_rndtblfield[16] = { 0, 0, 1, 2, 4, 4, 5, 6, 2, 2, 3, 8, 6, 6, 7, 12 };
     int v_dist = fieldmv ? 1 : 4; // vertical offset for lower sub-blocks
     int v_edge_pos = s->v_edge_pos >> 1;
     int use_ic;
     uint8_t (*lutuv)[256];
 
-    if (s->avctx->flags & AV_CODEC_FLAG_GRAY)
+    if (CONFIG_GRAY && s->avctx->flags & AV_CODEC_FLAG_GRAY)
         return;
 
     for (i = 0; i < 4; i++) {
@@ -708,23 +677,29 @@ void ff_vc1_mc_4mv_chroma4(VC1Context *v, int dir, int dir2, int avg)
         uvsrc_x = av_clip(uvsrc_x, -8, s->avctx->coded_width  >> 1);
         uvsrc_y = av_clip(uvsrc_y, -8, s->avctx->coded_height >> 1);
         if (i < 2 ? dir : dir2) {
-            srcU = s->next_picture.f->data[1] + uvsrc_y * s->uvlinesize + uvsrc_x;
-            srcV = s->next_picture.f->data[2] + uvsrc_y * s->uvlinesize + uvsrc_x;
+            srcU = s->next_picture.f->data[1];
+            srcV = s->next_picture.f->data[2];
             lutuv  = v->next_lutuv;
             use_ic = v->next_use_ic;
         } else {
-            srcU = s->last_picture.f->data[1] + uvsrc_y * s->uvlinesize + uvsrc_x;
-            srcV = s->last_picture.f->data[2] + uvsrc_y * s->uvlinesize + uvsrc_x;
+            srcU = s->last_picture.f->data[1];
+            srcV = s->last_picture.f->data[2];
             lutuv  = v->last_lutuv;
             use_ic = v->last_use_ic;
         }
+        if (!srcU)
+            return;
+        srcU += uvsrc_y * s->uvlinesize + uvsrc_x;
+        srcV += uvsrc_y * s->uvlinesize + uvsrc_x;
         uvmx_field[i] = (uvmx_field[i] & 3) << 1;
         uvmy_field[i] = (uvmy_field[i] & 3) << 1;
 
-        if (fieldmv && !(uvsrc_y & 1))
-            v_edge_pos--;
-        if (fieldmv && (uvsrc_y & 1) && uvsrc_y < 2)
-            uvsrc_y--;
+        if (fieldmv) {
+            if (!(uvsrc_y & 1))
+                v_edge_pos = (s->v_edge_pos >> 1) - 1;
+            else
+                uvsrc_y -= (uvsrc_y < 2);
+        }
         if (use_ic
             || s->h_edge_pos < 10 || v_edge_pos < (5 << fieldmv)
             || (unsigned)uvsrc_x > (s->h_edge_pos >> 1) - 5
@@ -742,20 +717,10 @@ void ff_vc1_mc_4mv_chroma4(VC1Context *v, int dir, int dir2, int avg)
 
             /* if we deal with intensity compensation we need to scale source blocks */
             if (use_ic) {
-                int i, j;
-                uint8_t *src, *src2;
-
-                src  = srcU;
-                src2 = srcV;
-                for (j = 0; j < 5; j++) {
-                    int f = (uvsrc_y + (j << fieldmv)) & 1;
-                    for (i = 0; i < 5; i++) {
-                        src[i]  = lutuv[f][src[i]];
-                        src2[i] = lutuv[f][src2[i]];
-                    }
-                    src  += s->uvlinesize << fieldmv;
-                    src2 += s->uvlinesize << fieldmv;
-                }
+                vc1_lut_scale_chroma(srcU, srcV,
+                                     lutuv[(uvsrc_y + (0 << fieldmv)) & 1],
+                                     lutuv[(uvsrc_y + (1 << fieldmv)) & 1],
+                                     5, s->uvlinesize << fieldmv);
             }
         }
         if (avg) {
@@ -786,7 +751,6 @@ void ff_vc1_interp_mc(VC1Context *v)
     H264ChromaContext *h264chroma = &v->h264chroma;
     uint8_t *srcY, *srcU, *srcV;
     int dxy, mx, my, uvmx, uvmy, src_x, src_y, uvsrc_x, uvsrc_y;
-    int off, off_uv;
     int v_edge_pos = s->v_edge_pos >> v->field_mode;
     int use_ic = v->next_use_ic;
 
@@ -837,7 +801,7 @@ void ff_vc1_interp_mc(VC1Context *v)
     }
 
     /* for grayscale we should not try to read from unknown area */
-    if (s->avctx->flags & AV_CODEC_FLAG_GRAY) {
+    if (CONFIG_GRAY && s->avctx->flags & AV_CODEC_FLAG_GRAY) {
         srcU = s->sc.edge_emu_buffer + 18 * s->linesize;
         srcV = s->sc.edge_emu_buffer + 18 * s->linesize;
     }
@@ -845,105 +809,72 @@ void ff_vc1_interp_mc(VC1Context *v)
     if (v->rangeredfrm || s->h_edge_pos < 22 || v_edge_pos < 22 || use_ic
         || (unsigned)(src_x - 1) > s->h_edge_pos - (mx & 3) - 16 - 3
         || (unsigned)(src_y - 1) > v_edge_pos    - (my & 3) - 16 - 3) {
-        uint8_t *uvbuf = s->sc.edge_emu_buffer + 19 * s->linesize;
+        uint8_t *ubuf = s->sc.edge_emu_buffer + 19 * s->linesize;
+        uint8_t *vbuf = ubuf + 9 * s->uvlinesize;
+        const int k = 17 + s->mspel * 2;
 
         srcY -= s->mspel * (1 + s->linesize);
         s->vdsp.emulated_edge_mc(s->sc.edge_emu_buffer, srcY,
                                  s->linesize, s->linesize,
-                                 17 + s->mspel * 2, 17 + s->mspel * 2,
+                                 k, k,
                                  src_x - s->mspel, src_y - s->mspel,
                                  s->h_edge_pos, v_edge_pos);
         srcY = s->sc.edge_emu_buffer;
-        s->vdsp.emulated_edge_mc(uvbuf, srcU,
+        s->vdsp.emulated_edge_mc(ubuf, srcU,
                                  s->uvlinesize, s->uvlinesize,
                                  8 + 1, 8 + 1,
-                                 uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, v_edge_pos >> 1);
-        s->vdsp.emulated_edge_mc(uvbuf + 16, srcV,
+                                 uvsrc_x, uvsrc_y,
+                                 s->h_edge_pos >> 1, v_edge_pos >> 1);
+        s->vdsp.emulated_edge_mc(vbuf, srcV,
                                  s->uvlinesize, s->uvlinesize,
                                  8 + 1, 8 + 1,
-                                 uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, v_edge_pos >> 1);
-        srcU = uvbuf;
-        srcV = uvbuf + 16;
+                                 uvsrc_x, uvsrc_y,
+                                 s->h_edge_pos >> 1, v_edge_pos >> 1);
+        srcU = ubuf;
+        srcV = vbuf;
         /* if we deal with range reduction we need to scale source blocks */
         if (v->rangeredfrm) {
-            int i, j;
-            uint8_t *src, *src2;
-
-            src = srcY;
-            for (j = 0; j < 17 + s->mspel * 2; j++) {
-                for (i = 0; i < 17 + s->mspel * 2; i++)
-                    src[i] = ((src[i] - 128) >> 1) + 128;
-                src += s->linesize;
-            }
-            src = srcU;
-            src2 = srcV;
-            for (j = 0; j < 9; j++) {
-                for (i = 0; i < 9; i++) {
-                    src[i]  = ((src[i]  - 128) >> 1) + 128;
-                    src2[i] = ((src2[i] - 128) >> 1) + 128;
-                }
-                src  += s->uvlinesize;
-                src2 += s->uvlinesize;
-            }
+            vc1_scale_luma(srcY, k, s->linesize);
+            vc1_scale_chroma(srcU, srcV, 9, s->uvlinesize);
         }
 
         if (use_ic) {
             uint8_t (*luty )[256] = v->next_luty;
             uint8_t (*lutuv)[256] = v->next_lutuv;
-            int i, j;
-            uint8_t *src, *src2;
-
-            src = srcY;
-            for (j = 0; j < 17 + s->mspel * 2; j++) {
-                int f = v->field_mode ? v->ref_field_type[1] : ((j+src_y - s->mspel) & 1);
-                for (i = 0; i < 17 + s->mspel * 2; i++)
-                    src[i] = luty[f][src[i]];
-                src += s->linesize;
-            }
-            src  = srcU;
-            src2 = srcV;
-            for (j = 0; j < 9; j++) {
-                int f = v->field_mode ? v->ref_field_type[1] : ((j+uvsrc_y) & 1);
-                for (i = 0; i < 9; i++) {
-                    src[i]  = lutuv[f][src[i]];
-                    src2[i] = lutuv[f][src2[i]];
-                }
-                src  += s->uvlinesize;
-                src2 += s->uvlinesize;
-            }
+            vc1_lut_scale_luma(srcY,
+                               luty[v->field_mode ? v->ref_field_type[1] : ((0+src_y - s->mspel) & 1)],
+                               luty[v->field_mode ? v->ref_field_type[1] : ((1+src_y - s->mspel) & 1)],
+                               k, s->linesize);
+            vc1_lut_scale_chroma(srcU, srcV,
+                                 lutuv[v->field_mode ? v->ref_field_type[1] : ((0+uvsrc_y) & 1)],
+                                 lutuv[v->field_mode ? v->ref_field_type[1] : ((1+uvsrc_y) & 1)],
+                                 9, s->uvlinesize);
         }
         srcY += s->mspel * (1 + s->linesize);
     }
 
-    off    = 0;
-    off_uv = 0;
-
     if (s->mspel) {
         dxy = ((my & 3) << 2) | (mx & 3);
-        v->vc1dsp.avg_vc1_mspel_pixels_tab[dxy](s->dest[0] + off    , srcY    , s->linesize, v->rnd);
-        v->vc1dsp.avg_vc1_mspel_pixels_tab[dxy](s->dest[0] + off + 8, srcY + 8, s->linesize, v->rnd);
-        srcY += s->linesize * 8;
-        v->vc1dsp.avg_vc1_mspel_pixels_tab[dxy](s->dest[0] + off + 8 * s->linesize    , srcY    , s->linesize, v->rnd);
-        v->vc1dsp.avg_vc1_mspel_pixels_tab[dxy](s->dest[0] + off + 8 * s->linesize + 8, srcY + 8, s->linesize, v->rnd);
+        v->vc1dsp.avg_vc1_mspel_pixels_tab[0][dxy](s->dest[0], srcY, s->linesize, v->rnd);
     } else { // hpel mc
         dxy = (my & 2) | ((mx & 2) >> 1);
 
         if (!v->rnd)
-            s->hdsp.avg_pixels_tab[0][dxy](s->dest[0] + off, srcY, s->linesize, 16);
+            s->hdsp.avg_pixels_tab[0][dxy](s->dest[0], srcY, s->linesize, 16);
         else
-            s->hdsp.avg_no_rnd_pixels_tab[dxy](s->dest[0] + off, srcY, s->linesize, 16);
+            s->hdsp.avg_no_rnd_pixels_tab[dxy](s->dest[0], srcY, s->linesize, 16);
     }
 
-    if (s->avctx->flags & AV_CODEC_FLAG_GRAY)
+    if (CONFIG_GRAY && s->avctx->flags & AV_CODEC_FLAG_GRAY)
         return;
     /* Chroma MC always uses qpel bilinear */
     uvmx = (uvmx & 3) << 1;
     uvmy = (uvmy & 3) << 1;
     if (!v->rnd) {
-        h264chroma->avg_h264_chroma_pixels_tab[0](s->dest[1] + off_uv, srcU, s->uvlinesize, 8, uvmx, uvmy);
-        h264chroma->avg_h264_chroma_pixels_tab[0](s->dest[2] + off_uv, srcV, s->uvlinesize, 8, uvmx, uvmy);
+        h264chroma->avg_h264_chroma_pixels_tab[0](s->dest[1], srcU, s->uvlinesize, 8, uvmx, uvmy);
+        h264chroma->avg_h264_chroma_pixels_tab[0](s->dest[2], srcV, s->uvlinesize, 8, uvmx, uvmy);
     } else {
-        v->vc1dsp.avg_no_rnd_vc1_chroma_pixels_tab[0](s->dest[1] + off_uv, srcU, s->uvlinesize, 8, uvmx, uvmy);
-        v->vc1dsp.avg_no_rnd_vc1_chroma_pixels_tab[0](s->dest[2] + off_uv, srcV, s->uvlinesize, 8, uvmx, uvmy);
+        v->vc1dsp.avg_no_rnd_vc1_chroma_pixels_tab[0](s->dest[1], srcU, s->uvlinesize, 8, uvmx, uvmy);
+        v->vc1dsp.avg_no_rnd_vc1_chroma_pixels_tab[0](s->dest[2], srcV, s->uvlinesize, 8, uvmx, uvmy);
     }
 }
diff --git a/libavcodec/vc1_parser.c b/libavcodec/vc1_parser.c
index 38b62f7..9ca6154 100644
--- a/libavcodec/vc1_parser.c
+++ b/libavcodec/vc1_parser.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,6 +29,7 @@
 #include "parser.h"
 #include "vc1.h"
 #include "get_bits.h"
+#include "internal.h"
 
 /** The maximum number of bytes of a sequence, entry point or
  *  frame header whose values we pay any attention to */
@@ -63,6 +64,7 @@ static void vc1_extract_header(AVCodecParserContext *s, AVCodecContext *avctx,
     /* Parse the header we just finished unescaping */
     VC1ParseContext *vpc = s->priv_data;
     GetBitContext gb;
+    int ret;
     vpc->v.s.avctx = avctx;
     vpc->v.parse_only = 1;
     init_get_bits(&gb, buf, buf_size * 8);
@@ -75,9 +77,12 @@ static void vc1_extract_header(AVCodecParserContext *s, AVCodecContext *avctx,
         break;
     case VC1_CODE_FRAME & 0xFF:
         if(vpc->v.profile < PROFILE_ADVANCED)
-            ff_vc1_parse_frame_header    (&vpc->v, &gb);
+            ret = ff_vc1_parse_frame_header    (&vpc->v, &gb);
         else
-            ff_vc1_parse_frame_header_adv(&vpc->v, &gb);
+            ret = ff_vc1_parse_frame_header_adv(&vpc->v, &gb);
+
+        if (ret < 0)
+            break;
 
         /* keep AV_PICTURE_TYPE_BI internal to VC1 */
         if (vpc->v.s.pict_type == AV_PICTURE_TYPE_BI)
@@ -108,6 +113,8 @@ static void vc1_extract_header(AVCodecParserContext *s, AVCodecContext *avctx,
 
         break;
     }
+    if (avctx->framerate.num)
+        avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
 }
 
 static int vc1_parse(AVCodecParserContext *s,
@@ -233,7 +240,7 @@ static int vc1_parse(AVCodecParserContext *s,
      * the start code we've already seen, or cause extra bytes to be
      * inserted at the start of the unescaped buffer. */
     vpc->bytes_to_skip = 4;
-    if (next < 0 && start_code_found)
+    if (next < 0 && next != END_NOT_FOUND)
         vpc->bytes_to_skip += next;
 
     *poutbuf = buf;
@@ -244,20 +251,18 @@ static int vc1_parse(AVCodecParserContext *s,
 static int vc1_split(AVCodecContext *avctx,
                            const uint8_t *buf, int buf_size)
 {
-    int i;
-    uint32_t state= -1;
-    int charged=0;
+    uint32_t state = -1;
+    int charged = 0;
+    const uint8_t *ptr = buf, *end = buf + buf_size;
 
-    for(i=0; i<buf_size; i++){
-        state= (state<<8) | buf[i];
-        if(IS_MARKER(state)){
-            if(state == VC1_CODE_SEQHDR || state == VC1_CODE_ENTRYPOINT){
-                charged=1;
-            }else if(charged){
-                return i-3;
-            }
-        }
+    while (ptr < end) {
+        ptr = avpriv_find_start_code(ptr, end, &state);
+        if (state == VC1_CODE_SEQHDR || state == VC1_CODE_ENTRYPOINT) {
+            charged = 1;
+        } else if (charged && IS_MARKER(state))
+            return ptr - 4 - buf;
     }
+
     return 0;
 }
 
@@ -265,6 +270,7 @@ static av_cold int vc1_parse_init(AVCodecParserContext *s)
 {
     VC1ParseContext *vpc = s->priv_data;
     vpc->v.s.slice_context_count = 1;
+    vpc->v.first_pic_header_flag = 1;
     vpc->prev_start_code = 0;
     vpc->bytes_to_skip = 0;
     vpc->unesc_index = 0;
diff --git a/libavcodec/vc1_pred.c b/libavcodec/vc1_pred.c
index 25be787..54712f6 100644
--- a/libavcodec/vc1_pred.c
+++ b/libavcodec/vc1_pred.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -170,9 +170,9 @@ static av_always_inline int scaleforsame(VC1Context *v, int i, int n /* MV */,
     n >>= hpel;
     if (v->s.pict_type != AV_PICTURE_TYPE_B || v->second_field || !dir) {
         if (dim)
-            n = scaleforsame_y(v, i, n, dir) << hpel;
+            n = scaleforsame_y(v, i, n, dir) * (1 << hpel);
         else
-            n = scaleforsame_x(v, n, dir) << hpel;
+            n = scaleforsame_x(v, n, dir) * (1 << hpel);
         return n;
     }
     brfd      = FFMIN(v->brfd, 3);
@@ -202,7 +202,7 @@ static av_always_inline int scaleforopp(VC1Context *v, int n /* MV */,
         refdist = dir ? v->brfd : v->frfd;
     scaleopp = ff_vc1_field_mvpred_scales[dir ^ v->second_field][0][refdist];
 
-    n = (n * scaleopp >> 8) << hpel;
+    n = (n * scaleopp >> 8) * (1 << hpel);
     return n;
 }
 
@@ -231,8 +231,10 @@ void ff_vc1_pred_mv(VC1Context *v, int n, int dmv_x, int dmv_y,
     else
         mixedmv_pic = 0;
     /* scale MV difference to be quad-pel */
-    dmv_x <<= 1 - s->quarter_sample;
-    dmv_y <<= 1 - s->quarter_sample;
+    if (!s->quarter_sample) {
+        dmv_x *= 2;
+        dmv_y *= 2;
+    }
 
     wrap = s->b8_stride;
     xy   = s->block_index[n];
@@ -392,17 +394,13 @@ void ff_vc1_pred_mv(VC1Context *v, int n, int dmv_x, int dmv_y,
     /* Pullback MV as specified in 8.3.5.3.4 */
     if (!v->field_mode) {
         int qx, qy, X, Y;
+        int MV = mv1 ? -60 : -28;
         qx = (s->mb_x << 6) + ((n == 1 || n == 3) ? 32 : 0);
         qy = (s->mb_y << 6) + ((n == 2 || n == 3) ? 32 : 0);
         X  = (s->mb_width  << 6) - 4;
         Y  = (s->mb_height << 6) - 4;
-        if (mv1) {
-            if (qx + px < -60) px = -60 - qx;
-            if (qy + py < -60) py = -60 - qy;
-        } else {
-            if (qx + px < -28) px = -28 - qx;
-            if (qy + py < -28) py = -28 - qy;
-        }
+        if (qx + px < MV) px = MV - qx;
+        if (qy + py < MV) py = MV - qy;
         if (qx + px > X) px = X - qx;
         if (qy + py > Y) py = Y - qy;
     }
@@ -602,9 +600,9 @@ void ff_vc1_pred_mv_intfr(VC1Context *v, int n, int dmv_x, int dmv_y,
                 px = mid_pred(A[0], B[0], C[0]);
                 py = mid_pred(A[1], B[1], C[1]);
             } else if (total_valid) {
-                if (a_valid) { px = A[0]; py = A[1]; }
-                if (b_valid) { px = B[0]; py = B[1]; }
-                if (c_valid) { px = C[0]; py = C[1]; }
+                if      (a_valid) { px = A[0]; py = A[1]; }
+                else if (b_valid) { px = B[0]; py = B[1]; }
+                else              { px = C[0]; py = C[1]; }
             }
         }
     } else {
@@ -644,7 +642,8 @@ void ff_vc1_pred_mv_intfr(VC1Context *v, int n, int dmv_x, int dmv_y,
                 } else if (!field_b && b_valid) {
                     px = B[0];
                     py = B[1];
-                } else if (c_valid) {
+                } else /*if (c_valid)*/ {
+                    av_assert1(c_valid);
                     px = C[0];
                     py = C[1];
                 }
@@ -652,7 +651,8 @@ void ff_vc1_pred_mv_intfr(VC1Context *v, int n, int dmv_x, int dmv_y,
                 if (field_a && a_valid) {
                     px = A[0];
                     py = A[1];
-                } else if (field_b && b_valid) {
+                } else /*if (field_b && b_valid)*/ {
+                    av_assert1(field_b && b_valid);
                     px = B[0];
                     py = B[1];
                 }
@@ -692,25 +692,31 @@ void ff_vc1_pred_b_mv(VC1Context *v, int dmv_x[2], int dmv_y[2],
     int r_x, r_y;
     const uint8_t *is_intra = v->mb_type[0];
 
+    av_assert0(!v->field_mode);
+
     r_x = v->range_x;
     r_y = v->range_y;
     /* scale MV difference to be quad-pel */
-    dmv_x[0] <<= 1 - s->quarter_sample;
-    dmv_y[0] <<= 1 - s->quarter_sample;
-    dmv_x[1] <<= 1 - s->quarter_sample;
-    dmv_y[1] <<= 1 - s->quarter_sample;
+    if (!s->quarter_sample) {
+        dmv_x[0] *= 2;
+        dmv_y[0] *= 2;
+        dmv_x[1] *= 2;
+        dmv_y[1] *= 2;
+    }
 
     wrap = s->b8_stride;
     xy = s->block_index[0];
 
     if (s->mb_intra) {
-        s->current_picture.motion_val[0][xy + v->blocks_off][0] =
-        s->current_picture.motion_val[0][xy + v->blocks_off][1] =
-        s->current_picture.motion_val[1][xy + v->blocks_off][0] =
-        s->current_picture.motion_val[1][xy + v->blocks_off][1] = 0;
+        s->current_picture.motion_val[0][xy][0] =
+        s->current_picture.motion_val[0][xy][1] =
+        s->current_picture.motion_val[1][xy][0] =
+        s->current_picture.motion_val[1][xy][1] = 0;
         return;
     }
-    if (!v->field_mode) {
+        if (direct && s->next_picture_ptr->field_picture)
+            av_log(s->avctx, AV_LOG_WARNING, "Mixed frame/field direct mode not supported\n");
+
         s->mv[0][0][0] = scale_mv(s->next_picture.motion_val[1][xy][0], v->bfraction, 0, s->quarter_sample);
         s->mv[0][0][1] = scale_mv(s->next_picture.motion_val[1][xy][1], v->bfraction, 0, s->quarter_sample);
         s->mv[1][0][0] = scale_mv(s->next_picture.motion_val[1][xy][0], v->bfraction, 1, s->quarter_sample);
@@ -721,12 +727,11 @@ void ff_vc1_pred_b_mv(VC1Context *v, int dmv_x[2], int dmv_y[2],
         s->mv[0][0][1] = av_clip(s->mv[0][0][1], -60 - (s->mb_y << 6), (s->mb_height << 6) - 4 - (s->mb_y << 6));
         s->mv[1][0][0] = av_clip(s->mv[1][0][0], -60 - (s->mb_x << 6), (s->mb_width  << 6) - 4 - (s->mb_x << 6));
         s->mv[1][0][1] = av_clip(s->mv[1][0][1], -60 - (s->mb_y << 6), (s->mb_height << 6) - 4 - (s->mb_y << 6));
-    }
     if (direct) {
-        s->current_picture.motion_val[0][xy + v->blocks_off][0] = s->mv[0][0][0];
-        s->current_picture.motion_val[0][xy + v->blocks_off][1] = s->mv[0][0][1];
-        s->current_picture.motion_val[1][xy + v->blocks_off][0] = s->mv[1][0][0];
-        s->current_picture.motion_val[1][xy + v->blocks_off][1] = s->mv[1][0][1];
+        s->current_picture.motion_val[0][xy][0] = s->mv[0][0][0];
+        s->current_picture.motion_val[0][xy][1] = s->mv[0][0][1];
+        s->current_picture.motion_val[1][xy][0] = s->mv[1][0][0];
+        s->current_picture.motion_val[1][xy][1] = s->mv[1][0][1];
         return;
     }
 
@@ -754,25 +759,16 @@ void ff_vc1_pred_b_mv(VC1Context *v, int dmv_x[2], int dmv_y[2],
         /* Pullback MV as specified in 8.3.5.3.4 */
         {
             int qx, qy, X, Y;
-            if (v->profile < PROFILE_ADVANCED) {
-                qx = (s->mb_x << 5);
-                qy = (s->mb_y << 5);
-                X  = (s->mb_width  << 5) - 4;
-                Y  = (s->mb_height << 5) - 4;
-                if (qx + px < -28) px = -28 - qx;
-                if (qy + py < -28) py = -28 - qy;
-                if (qx + px > X) px = X - qx;
-                if (qy + py > Y) py = Y - qy;
-            } else {
-                qx = (s->mb_x << 6);
-                qy = (s->mb_y << 6);
-                X  = (s->mb_width  << 6) - 4;
-                Y  = (s->mb_height << 6) - 4;
-                if (qx + px < -60) px = -60 - qx;
-                if (qy + py < -60) py = -60 - qy;
-                if (qx + px > X) px = X - qx;
-                if (qy + py > Y) py = Y - qy;
-            }
+            int sh = v->profile < PROFILE_ADVANCED ? 5 : 6;
+            int MV = 4 - (1 << sh);
+            qx = (s->mb_x << sh);
+            qy = (s->mb_y << sh);
+            X  = (s->mb_width  << sh) - 4;
+            Y  = (s->mb_height << sh) - 4;
+            if (qx + px < MV) px = MV - qx;
+            if (qy + py < MV) py = MV - qy;
+            if (qx + px > X) px = X - qx;
+            if (qy + py > Y) py = Y - qy;
         }
         /* Calculate hybrid prediction as specified in 8.3.5.3.5 */
         if (0 && !s->first_slice_line && s->mb_x) {
@@ -833,25 +829,16 @@ void ff_vc1_pred_b_mv(VC1Context *v, int dmv_x[2], int dmv_y[2],
         /* Pullback MV as specified in 8.3.5.3.4 */
         {
             int qx, qy, X, Y;
-            if (v->profile < PROFILE_ADVANCED) {
-                qx = (s->mb_x << 5);
-                qy = (s->mb_y << 5);
-                X  = (s->mb_width  << 5) - 4;
-                Y  = (s->mb_height << 5) - 4;
-                if (qx + px < -28) px = -28 - qx;
-                if (qy + py < -28) py = -28 - qy;
-                if (qx + px > X) px = X - qx;
-                if (qy + py > Y) py = Y - qy;
-            } else {
-                qx = (s->mb_x << 6);
-                qy = (s->mb_y << 6);
-                X  = (s->mb_width  << 6) - 4;
-                Y  = (s->mb_height << 6) - 4;
-                if (qx + px < -60) px = -60 - qx;
-                if (qy + py < -60) py = -60 - qy;
-                if (qx + px > X) px = X - qx;
-                if (qy + py > Y) py = Y - qy;
-            }
+            int sh = v->profile < PROFILE_ADVANCED ? 5 : 6;
+            int MV = 4 - (1 << sh);
+            qx = (s->mb_x << sh);
+            qy = (s->mb_y << sh);
+            X  = (s->mb_width  << sh) - 4;
+            Y  = (s->mb_height << sh) - 4;
+            if (qx + px < MV) px = MV - qx;
+            if (qy + py < MV) py = MV - qy;
+            if (qx + px > X) px = X - qx;
+            if (qy + py > Y) py = Y - qy;
         }
         /* Calculate hybrid prediction as specified in 8.3.5.3.5 */
         if (0 && !s->first_slice_line && s->mb_x) {
diff --git a/libavcodec/vc1_pred.h b/libavcodec/vc1_pred.h
index 34c9c1a..4d47f86 100644
--- a/libavcodec/vc1_pred.h
+++ b/libavcodec/vc1_pred.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vc1acdata.h b/libavcodec/vc1acdata.h
index 73ebe40..a70b44a 100644
--- a/libavcodec/vc1acdata.h
+++ b/libavcodec/vc1acdata.h
@@ -2,20 +2,20 @@
  * VC-1 and WMV3 decoder
  * copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vc1data.c b/libavcodec/vc1data.c
index 70cead8..fc9ba6d 100644
--- a/libavcodec/vc1data.c
+++ b/libavcodec/vc1data.c
@@ -4,20 +4,20 @@
  * copyright (c) 2006 Konstantin Shishkov
  * (c) 2005 anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -1019,21 +1019,21 @@ const uint8_t ff_vc1_mv_diff_bits[4][73] = {
 /* DC differentials low+hi-mo, p217 are the same as in msmpeg4data .h */
 
 /* Table 232 */
-const int8_t ff_vc1_simple_progressive_4x4_zz [16] = {
+const uint8_t ff_vc1_simple_progressive_4x4_zz [16] = {
      0,     8,    16,     1,
      9,    24,    17,     2,
     10,    18,    25,     3,
     11,    26,    19,    27
 };
 
-const int8_t ff_vc1_adv_progressive_8x4_zz [32] = { /* Table 233 */
+const uint8_t ff_vc1_adv_progressive_8x4_zz [32] = { /* Table 233 */
      0,     8,     1,    16,     2,     9,    10,     3,
     24,    17,     4,    11,    18,    12,     5,    19,
     25,    13,    20,    26,    27,     6,    21,    28,
     14,    22,    29,     7,    30,    15,    23,    31
 };
 
-const int8_t ff_vc1_adv_progressive_4x8_zz [32] = { /* Table 234 */
+const uint8_t ff_vc1_adv_progressive_4x8_zz [32] = { /* Table 234 */
      0,     1,     8,     2,
      9,    16,    17,    24,
     10,    32,    25,    18,
@@ -1044,7 +1044,7 @@ const int8_t ff_vc1_adv_progressive_4x8_zz [32] = { /* Table 234 */
     35,    43,    51,    59
 };
 
-const int8_t ff_vc1_adv_interlaced_8x8_zz [64] = { /* Table 235 */
+const uint8_t ff_vc1_adv_interlaced_8x8_zz [64] = { /* Table 235 */
      0,     8,     1,    16,    24,     9,     2,    32,
     40,    48,    56,    17,    10,     3,    25,    18,
     11,     4,    33,    41,    49,    57,    26,    34,
@@ -1055,14 +1055,14 @@ const int8_t ff_vc1_adv_interlaced_8x8_zz [64] = { /* Table 235 */
     61,    62,    54,    46,    39,    47,    55,    63
 };
 
-const int8_t ff_vc1_adv_interlaced_8x4_zz [32] = { /* Table 236 */
+const uint8_t ff_vc1_adv_interlaced_8x4_zz [32] = { /* Table 236 */
      0,     8,    16,    24,     1,     9,     2,    17,
     25,    10,     3,    18,    26,     4,    11,    19,
     12,     5,    13,    20,    27,     6,    21,    28,
     14,    22,    29,     7,    30,    15,    23,    31
 };
 
-const int8_t ff_vc1_adv_interlaced_4x8_zz [32] = { /* Table 237 */
+const uint8_t ff_vc1_adv_interlaced_4x8_zz [32] = { /* Table 237 */
      0,     1,     2,     8,
     16,     9,    24,    17,
     10,     3,    32,    40,
@@ -1073,7 +1073,7 @@ const int8_t ff_vc1_adv_interlaced_4x8_zz [32] = { /* Table 237 */
     35,    43,    51,    59
 };
 
-const int8_t ff_vc1_adv_interlaced_4x4_zz [16] = { /* Table 238 */
+const uint8_t ff_vc1_adv_interlaced_4x4_zz [16] = { /* Table 238 */
      0,     8,    16,    24,
      1,     9,    17,     2,
     25,    10,    18,     3,
diff --git a/libavcodec/vc1data.h b/libavcodec/vc1data.h
index 66c569b..763cd48 100644
--- a/libavcodec/vc1data.h
+++ b/libavcodec/vc1data.h
@@ -3,20 +3,20 @@
  * copyright (c) 2006 Konstantin Shishkov
  * (c) 2005 anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -181,15 +181,15 @@ extern const uint8_t ff_vc1_2ref_mvdata_bits[8][126];
 /* DC differentials low+hi-mo, p217 are the same as in msmpeg4data .h */
 
 /* Scantables/ZZ scan are at 11.9 (p262) and 8.1.1.12 (p10) */
-extern const int8_t ff_vc1_simple_progressive_4x4_zz [16];
-extern const int8_t ff_vc1_adv_progressive_8x4_zz [32];
-extern const int8_t ff_vc1_adv_progressive_4x8_zz [32];
-extern const int8_t ff_vc1_adv_interlaced_8x8_zz [64];
-extern const int8_t ff_vc1_adv_interlaced_8x4_zz [32];
-extern const int8_t ff_vc1_adv_interlaced_4x8_zz [32];
-extern const int8_t ff_vc1_adv_interlaced_4x4_zz [16];
-extern const int8_t ff_vc1_intra_horz_8x8_zz [64];
-extern const int8_t ff_vc1_intra_vert_8x8_zz [64];
+extern const uint8_t ff_vc1_simple_progressive_4x4_zz [16];
+extern const uint8_t ff_vc1_adv_progressive_8x4_zz [32];
+extern const uint8_t ff_vc1_adv_progressive_4x8_zz [32];
+extern const uint8_t ff_vc1_adv_interlaced_8x8_zz [64];
+extern const uint8_t ff_vc1_adv_interlaced_8x4_zz [32];
+extern const uint8_t ff_vc1_adv_interlaced_4x8_zz [32];
+extern const uint8_t ff_vc1_adv_interlaced_4x4_zz [16];
+extern const uint8_t ff_vc1_intra_horz_8x8_zz [64];
+extern const uint8_t ff_vc1_intra_vert_8x8_zz [64];
 
 /* DQScale as specified in 8.1.3.9 - almost identical to 0x40000/i */
 extern const int32_t ff_vc1_dqscale[63];
diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c
index 3d042d0..4f78aa8 100644
--- a/libavcodec/vc1dec.c
+++ b/libavcodec/vc1dec.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2006-2007 Konstantin Shishkov
  * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,6 +37,9 @@
 #include "profiles.h"
 #include "vc1.h"
 #include "vc1data.h"
+#include "vdpau_compat.h"
+#include "libavutil/avassert.h"
+
 
 #if CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER
 
@@ -98,7 +101,7 @@ static void vc1_sprite_parse_transform(GetBitContext* gb, int c[7])
         c[6] = 1 << 16;
 }
 
-static void vc1_parse_sprites(VC1Context *v, GetBitContext* gb, SpriteData* sd)
+static int vc1_parse_sprites(VC1Context *v, GetBitContext* gb, SpriteData* sd)
 {
     AVCodecContext *avctx = v->s.avctx;
     int sprite, i;
@@ -142,7 +145,7 @@ static void vc1_parse_sprites(VC1Context *v, GetBitContext* gb, SpriteData* sd)
         sd->effect_pcount2 = get_bits(gb, 16);
         if (sd->effect_pcount2 > 10) {
             av_log(avctx, AV_LOG_ERROR, "Too many effect parameters\n");
-            return;
+            return AVERROR_INVALIDDATA;
         } else if (sd->effect_pcount2) {
             i = -1;
             av_log(avctx, AV_LOG_DEBUG, "Effect params 2: ");
@@ -159,10 +162,14 @@ static void vc1_parse_sprites(VC1Context *v, GetBitContext* gb, SpriteData* sd)
         av_log(avctx, AV_LOG_DEBUG, "Effect flag set\n");
 
     if (get_bits_count(gb) >= gb->size_in_bits +
-       (avctx->codec_id == AV_CODEC_ID_WMV3IMAGE ? 64 : 0))
+       (avctx->codec_id == AV_CODEC_ID_WMV3IMAGE ? 64 : 0)) {
         av_log(avctx, AV_LOG_ERROR, "Buffer overrun\n");
+        return AVERROR_INVALIDDATA;
+    }
     if (get_bits_count(gb) < gb->size_in_bits - 8)
         av_log(avctx, AV_LOG_WARNING, "Buffer not fully read\n");
+
+    return 0;
 }
 
 static void vc1_draw_sprites(VC1Context *v, SpriteData* sd)
@@ -174,7 +181,7 @@ static void vc1_draw_sprites(VC1Context *v, SpriteData* sd)
     int ysub[2];
     MpegEncContext *s = &v->s;
 
-    for (i = 0; i < 2; i++) {
+    for (i = 0; i <= v->two_sprites; i++) {
         xoff[i] = av_clip(sd->coefs[i][2], 0, v->sprite_width-1 << 16);
         xadv[i] = sd->coefs[i][0];
         if (xadv[i] != 1<<16 || (v->sprite_width << 16) - (v->output_width << 16) - xoff[i])
@@ -185,7 +192,7 @@ static void vc1_draw_sprites(VC1Context *v, SpriteData* sd)
     }
     alpha = av_clip_uint16(sd->coefs[1][6]);
 
-    for (plane = 0; plane < (s->avctx->flags & AV_CODEC_FLAG_GRAY ? 1 : 3); plane++) {
+    for (plane = 0; plane < (CONFIG_GRAY && s->avctx->flags & AV_CODEC_FLAG_GRAY ? 1 : 3); plane++) {
         int width = v->output_width>>!!plane;
 
         for (row = 0; row < v->output_height>>!!plane; row++) {
@@ -252,7 +259,7 @@ static void vc1_draw_sprites(VC1Context *v, SpriteData* sd)
         }
 
         if (!plane) {
-            for (i = 0; i < 2; i++) {
+            for (i = 0; i <= v->two_sprites; i++) {
                 xoff[i] >>= 1;
                 yoff[i] >>= 1;
             }
@@ -264,15 +271,20 @@ static void vc1_draw_sprites(VC1Context *v, SpriteData* sd)
 
 static int vc1_decode_sprites(VC1Context *v, GetBitContext* gb)
 {
+    int ret;
     MpegEncContext *s     = &v->s;
     AVCodecContext *avctx = s->avctx;
     SpriteData sd;
 
-    vc1_parse_sprites(v, gb, &sd);
+    memset(&sd, 0, sizeof(sd));
 
-    if (!s->current_picture.f->data[0]) {
+    ret = vc1_parse_sprites(v, gb, &sd);
+    if (ret < 0)
+        return ret;
+
+    if (!s->current_picture.f || !s->current_picture.f->data[0]) {
         av_log(avctx, AV_LOG_ERROR, "Got no sprites\n");
-        return -1;
+        return AVERROR_UNKNOWN;
     }
 
     if (v->two_sprites && (!s->last_picture_ptr || !s->last_picture.f->data[0])) {
@@ -281,10 +293,8 @@ static int vc1_decode_sprites(VC1Context *v, GetBitContext* gb)
     }
 
     av_frame_unref(v->sprite_output_frame);
-    if (ff_get_buffer(avctx, v->sprite_output_frame, 0) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return -1;
-    }
+    if ((ret = ff_get_buffer(avctx, v->sprite_output_frame, 0)) < 0)
+        return ret;
 
     vc1_draw_sprites(v, &sd);
 
@@ -303,7 +313,7 @@ static void vc1_sprite_flush(AVCodecContext *avctx)
        wrong but it looks better than doing nothing. */
 
     if (f && f->data[0])
-        for (plane = 0; plane < (s->avctx->flags & AV_CODEC_FLAG_GRAY ? 1 : 3); plane++)
+        for (plane = 0; plane < (CONFIG_GRAY && s->avctx->flags & AV_CODEC_FLAG_GRAY ? 1 : 3); plane++)
             for (i = 0; i < v->sprite_height>>!!plane; i++)
                 memset(f->data[plane] + i * f->linesize[plane],
                        plane ? 128 : 0, f->linesize[plane]);
@@ -342,7 +352,7 @@ av_cold int ff_vc1_decode_init_alloc_tables(VC1Context *v)
     if (!v->is_intra_base)
         goto error;
     v->is_intra         = v->is_intra_base + s->mb_stride;
-    v->luma_mv_base     = av_malloc(sizeof(v->luma_mv_base[0]) * 2 * s->mb_stride);
+    v->luma_mv_base     = av_mallocz(sizeof(v->luma_mv_base[0]) * 2 * s->mb_stride);
     if (!v->luma_mv_base)
         goto error;
     v->luma_mv          = v->luma_mv_base + s->mb_stride;
@@ -372,11 +382,9 @@ av_cold int ff_vc1_decode_init_alloc_tables(VC1Context *v)
     v->mv_f_next[1]     = v->mv_f_next[0] + (s->b8_stride * (mb_height * 2 + 1) + s->mb_stride * (mb_height + 1) * 2);
 
     if (s->avctx->codec_id == AV_CODEC_ID_WMV3IMAGE || s->avctx->codec_id == AV_CODEC_ID_VC1IMAGE) {
-        for (i = 0; i < 4; i++) {
-            v->sr_rows[i >> 1][i & 1] = av_malloc(v->output_width);
-            if (!v->sr_rows[i >> 1][i & 1])
-                goto error;
-        }
+        for (i = 0; i < 4; i++)
+            if (!(v->sr_rows[i >> 1][i & 1] = av_malloc(v->output_width)))
+                return AVERROR(ENOMEM);
     }
 
     ret = ff_intrax8_common_init(s->avctx, &v->x8, &s->idsp,
@@ -396,7 +404,7 @@ av_cold void ff_vc1_init_transposed_scantables(VC1Context *v)
 {
     int i;
     for (i = 0; i < 64; i++) {
-#define transpose(x) ((x >> 3) | ((x & 7) << 3))
+#define transpose(x) (((x) >> 3) | (((x) & 7) << 3))
         v->zz_8x8[0][i] = transpose(ff_wmv1_scantable[0][i]);
         v->zz_8x8[1][i] = transpose(ff_wmv1_scantable[1][i]);
         v->zz_8x8[2][i] = transpose(ff_wmv1_scantable[2][i]);
@@ -416,6 +424,7 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
     VC1Context *v = avctx->priv_data;
     MpegEncContext *s = &v->s;
     GetBitContext gb;
+    int ret;
 
     /* save the container output size for WMImage */
     v->output_width  = avctx->width;
@@ -423,17 +432,10 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
 
     if (!avctx->extradata_size || !avctx->extradata)
         return -1;
-    if (!(avctx->flags & AV_CODEC_FLAG_GRAY))
-        avctx->pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts);
-    else
-        avctx->pix_fmt = AV_PIX_FMT_GRAY8;
     v->s.avctx = avctx;
 
-    if (ff_vc1_init_common(v) < 0)
-        return -1;
-    ff_blockdsp_init(&s->bdsp, avctx);
-    ff_h264chroma_init(&v->h264chroma, 8);
-    ff_qpeldsp_init(&s->qdsp);
+    if ((ret = ff_vc1_init_common(v)) < 0)
+        return ret;
 
     if (avctx->codec_id == AV_CODEC_ID_WMV3 || avctx->codec_id == AV_CODEC_ID_WMV3IMAGE) {
         int count = 0;
@@ -445,8 +447,8 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
 
         init_get_bits(&gb, avctx->extradata, avctx->extradata_size*8);
 
-        if (ff_vc1_decode_sequence_header(avctx, v, &gb) < 0)
-          return -1;
+        if ((ret = ff_vc1_decode_sequence_header(avctx, v, &gb)) < 0)
+          return ret;
 
         count = avctx->extradata_size*8 - get_bits_count(&gb);
         if (count > 0) {
@@ -469,6 +471,9 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
         }
 
         buf2  = av_mallocz(avctx->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE);
+        if (!buf2)
+            return AVERROR(ENOMEM);
+
         start = find_next_marker(start, end); // in WVC1 extradata first byte is its size, but can be 0 in mkv
         next  = start;
         for (; next < end; start = next) {
@@ -480,16 +485,16 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
             init_get_bits(&gb, buf2, buf2_size * 8);
             switch (AV_RB32(start)) {
             case VC1_CODE_SEQHDR:
-                if (ff_vc1_decode_sequence_header(avctx, v, &gb) < 0) {
+                if ((ret = ff_vc1_decode_sequence_header(avctx, v, &gb)) < 0) {
                     av_free(buf2);
-                    return -1;
+                    return ret;
                 }
                 seq_initialized = 1;
                 break;
             case VC1_CODE_ENTRYPOINT:
-                if (ff_vc1_decode_entry_point(avctx, v, &gb) < 0) {
+                if ((ret = ff_vc1_decode_entry_point(avctx, v, &gb)) < 0) {
                     av_free(buf2);
-                    return -1;
+                    return ret;
                 }
                 ep_initialized = 1;
                 break;
@@ -503,14 +508,38 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
         v->res_sprite = (avctx->codec_id == AV_CODEC_ID_VC1IMAGE);
     }
 
-    v->sprite_output_frame = av_frame_alloc();
-    if (!v->sprite_output_frame)
-        return AVERROR(ENOMEM);
-
     avctx->profile = v->profile;
     if (v->profile == PROFILE_ADVANCED)
         avctx->level = v->level;
 
+    if (!CONFIG_GRAY || !(avctx->flags & AV_CODEC_FLAG_GRAY))
+        avctx->pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts);
+    else {
+        avctx->pix_fmt = AV_PIX_FMT_GRAY8;
+        if (avctx->color_range == AVCOL_RANGE_UNSPECIFIED)
+            avctx->color_range = AVCOL_RANGE_MPEG;
+    }
+
+    // ensure static VLC tables are initialized
+    if ((ret = ff_msmpeg4_decode_init(avctx)) < 0)
+        return ret;
+    if ((ret = ff_vc1_decode_init_alloc_tables(v)) < 0)
+        return ret;
+    // Hack to ensure the above functions will be called
+    // again once we know all necessary settings.
+    // That this is necessary might indicate a bug.
+    ff_vc1_decode_end(avctx);
+
+    ff_blockdsp_init(&s->bdsp, avctx);
+    ff_h264chroma_init(&v->h264chroma, 8);
+    ff_qpeldsp_init(&s->qdsp);
+
+    // Must happen after calling ff_vc1_decode_end
+    // to avoid de-allocating the sprite_output_frame
+    v->sprite_output_frame = av_frame_alloc();
+    if (!v->sprite_output_frame)
+        return AVERROR(ENOMEM);
+
     avctx->has_b_frames = !!avctx->max_b_frames;
 
     if (v->color_prim == 1 || v->color_prim == 5 || v->color_prim == 6)
@@ -543,6 +572,11 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
             v->sprite_height > 1 << 14 ||
             v->output_width  > 1 << 14 ||
             v->output_height > 1 << 14) return -1;
+
+        if ((v->sprite_width&1) || (v->sprite_height&1)) {
+            avpriv_request_sample(avctx, "odd sprites support");
+            return AVERROR_PATCHWELCOME;
+        }
     }
     return 0;
 }
@@ -594,14 +628,19 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
     MpegEncContext *s = &v->s;
     AVFrame *pict = data;
     uint8_t *buf2 = NULL;
-    const uint8_t *buf_start = buf;
-    int mb_height, n_slices1;
+    const uint8_t *buf_start = buf, *buf_start_second_field = NULL;
+    int mb_height, n_slices1=-1;
     struct {
         uint8_t *buf;
         GetBitContext gb;
         int mby_start;
     } *slices = NULL, *tmp;
 
+    v->second_field = 0;
+
+    if(s->avctx->flags & AV_CODEC_FLAG_LOW_DELAY)
+        s->low_delay = 1;
+
     /* no supplementary picture */
     if (buf_size == 0 || (buf_size == 4 && AV_RB32(buf) == VC1_CODE_ENDOFSEQ)) {
         /* special case for last picture */
@@ -613,13 +652,24 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
             *got_frame = 1;
         }
 
-        return 0;
+        return buf_size;
     }
 
+#if FF_API_CAP_VDPAU
+    if (s->avctx->codec->capabilities&AV_CODEC_CAP_HWACCEL_VDPAU) {
+        if (v->profile < PROFILE_ADVANCED)
+            avctx->pix_fmt = AV_PIX_FMT_VDPAU_WMV3;
+        else
+            avctx->pix_fmt = AV_PIX_FMT_VDPAU_VC1;
+    }
+#endif
+
     //for advanced profile we may need to parse and unescape data
     if (avctx->codec_id == AV_CODEC_ID_VC1 || avctx->codec_id == AV_CODEC_ID_VC1IMAGE) {
         int buf_size2 = 0;
         buf2 = av_mallocz(buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
+        if (!buf2)
+            return AVERROR(ENOMEM);
 
         if (IS_MARKER(AV_RB32(buf))) { /* frame starts with marker and needs to be parsed */
             const uint8_t *start, *end, *next;
@@ -632,26 +682,40 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                 if (size <= 0) continue;
                 switch (AV_RB32(start)) {
                 case VC1_CODE_FRAME:
-                    if (avctx->hwaccel)
+                    if (avctx->hwaccel
+#if FF_API_CAP_VDPAU
+                        || s->avctx->codec->capabilities&AV_CODEC_CAP_HWACCEL_VDPAU
+#endif
+                        )
                         buf_start = start;
                     buf_size2 = vc1_unescape_buffer(start + 4, size, buf2);
                     break;
                 case VC1_CODE_FIELD: {
                     int buf_size3;
-                    tmp = av_realloc(slices, sizeof(*slices) * (n_slices+1));
-                    if (!tmp)
+                    if (avctx->hwaccel
+#if FF_API_CAP_VDPAU
+                        || s->avctx->codec->capabilities&AV_CODEC_CAP_HWACCEL_VDPAU
+#endif
+                        )
+                        buf_start_second_field = start;
+                    tmp = av_realloc_array(slices, sizeof(*slices), (n_slices+1));
+                    if (!tmp) {
+                        ret = AVERROR(ENOMEM);
                         goto err;
+                    }
                     slices = tmp;
                     slices[n_slices].buf = av_mallocz(buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
-                    if (!slices[n_slices].buf)
+                    if (!slices[n_slices].buf) {
+                        ret = AVERROR(ENOMEM);
                         goto err;
+                    }
                     buf_size3 = vc1_unescape_buffer(start + 4, size,
                                                     slices[n_slices].buf);
                     init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
                                   buf_size3 << 3);
                     /* assuming that the field marker is at the exact middle,
                        hope it's correct */
-                    slices[n_slices].mby_start = s->mb_height >> 1;
+                    slices[n_slices].mby_start = s->mb_height + 1 >> 1;
                     n_slices1 = n_slices - 1; // index of the last slice of the first field
                     n_slices++;
                     break;
@@ -663,13 +727,17 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                     break;
                 case VC1_CODE_SLICE: {
                     int buf_size3;
-                    tmp = av_realloc(slices, sizeof(*slices) * (n_slices+1));
-                    if (!tmp)
+                    tmp = av_realloc_array(slices, sizeof(*slices), (n_slices+1));
+                    if (!tmp) {
+                        ret = AVERROR(ENOMEM);
                         goto err;
+                    }
                     slices = tmp;
                     slices[n_slices].buf = av_mallocz(buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
-                    if (!slices[n_slices].buf)
+                    if (!slices[n_slices].buf) {
+                        ret = AVERROR(ENOMEM);
                         goto err;
+                    }
                     buf_size3 = vc1_unescape_buffer(start + 4, size,
                                                     slices[n_slices].buf);
                     init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
@@ -687,19 +755,30 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
             divider = find_next_marker(buf, buf + buf_size);
             if ((divider == (buf + buf_size)) || AV_RB32(divider) != VC1_CODE_FIELD) {
                 av_log(avctx, AV_LOG_ERROR, "Error in WVC1 interlaced frame\n");
+                ret = AVERROR_INVALIDDATA;
                 goto err;
             } else { // found field marker, unescape second field
-                tmp = av_realloc(slices, sizeof(*slices) * (n_slices+1));
-                if (!tmp)
+                if (avctx->hwaccel
+#if FF_API_CAP_VDPAU
+                    || s->avctx->codec->capabilities&AV_CODEC_CAP_HWACCEL_VDPAU
+#endif
+                    )
+                    buf_start_second_field = divider;
+                tmp = av_realloc_array(slices, sizeof(*slices), (n_slices+1));
+                if (!tmp) {
+                    ret = AVERROR(ENOMEM);
                     goto err;
+                }
                 slices = tmp;
                 slices[n_slices].buf = av_mallocz(buf_size + AV_INPUT_BUFFER_PADDING_SIZE);
-                if (!slices[n_slices].buf)
+                if (!slices[n_slices].buf) {
+                    ret = AVERROR(ENOMEM);
                     goto err;
+                }
                 buf_size3 = vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf);
                 init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
                               buf_size3 << 3);
-                slices[n_slices].mby_start = s->mb_height >> 1;
+                slices[n_slices].mby_start = s->mb_height + 1 >> 1;
                 n_slices1 = n_slices - 1;
                 n_slices++;
             }
@@ -736,9 +815,9 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     if (!s->context_initialized) {
-        if (ff_msmpeg4_decode_init(avctx) < 0)
+        if ((ret = ff_msmpeg4_decode_init(avctx)) < 0)
             goto err;
-        if (ff_vc1_decode_init_alloc_tables(v) < 0) {
+        if ((ret = ff_vc1_decode_init_alloc_tables(v)) < 0) {
             ff_mpv_common_end(s);
             goto err;
         }
@@ -746,6 +825,10 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
         s->low_delay = !avctx->has_b_frames || v->res_sprite;
 
         if (v->profile == PROFILE_ADVANCED) {
+            if(avctx->coded_width<=1 || avctx->coded_height<=1) {
+                ret = AVERROR_INVALIDDATA;
+                goto err;
+            }
             s->h_edge_pos = avctx->coded_width;
             s->v_edge_pos = avctx->coded_height;
         }
@@ -755,19 +838,29 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
     v->pic_header_flag = 0;
     v->first_pic_header_flag = 1;
     if (v->profile < PROFILE_ADVANCED) {
-        if (ff_vc1_parse_frame_header(v, &s->gb) < 0) {
+        if ((ret = ff_vc1_parse_frame_header(v, &s->gb)) < 0) {
             goto err;
         }
     } else {
-        if (ff_vc1_parse_frame_header_adv(v, &s->gb) < 0) {
+        if ((ret = ff_vc1_parse_frame_header_adv(v, &s->gb)) < 0) {
             goto err;
         }
     }
     v->first_pic_header_flag = 0;
 
+    if (avctx->debug & FF_DEBUG_PICT_INFO)
+        av_log(v->s.avctx, AV_LOG_DEBUG, "pict_type: %c\n", av_get_picture_type_char(s->pict_type));
+
     if ((avctx->codec_id == AV_CODEC_ID_WMV3IMAGE || avctx->codec_id == AV_CODEC_ID_VC1IMAGE)
         && s->pict_type != AV_PICTURE_TYPE_I) {
         av_log(v->s.avctx, AV_LOG_ERROR, "Sprite decoder: expected I-frame\n");
+        ret = AVERROR_INVALIDDATA;
+        goto err;
+    }
+
+    if ((s->mb_height >> v->field_mode) == 0) {
+        av_log(v->s.avctx, AV_LOG_ERROR, "image too short\n");
+        ret = AVERROR_INVALIDDATA;
         goto err;
     }
 
@@ -777,6 +870,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
 
     /* skip B-frames if we don't have reference frames */
     if (!s->last_picture_ptr && (s->pict_type == AV_PICTURE_TYPE_B || s->droppable)) {
+        av_log(v->s.avctx, AV_LOG_DEBUG, "Skipping B frame without reference frames\n");
         goto end;
     }
     if ((avctx->skip_frame >= AVDISCARD_NONREF && s->pict_type == AV_PICTURE_TYPE_B) ||
@@ -792,10 +886,14 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
             s->next_p_frame_damaged = 0;
     }
 
-    if (ff_mpv_frame_start(s, avctx) < 0) {
+    if ((ret = ff_mpv_frame_start(s, avctx)) < 0) {
         goto err;
     }
 
+    v->s.current_picture_ptr->field_picture = v->field_mode;
+    v->s.current_picture_ptr->f->interlaced_frame = (v->fcm != PROGRESSIVE);
+    v->s.current_picture_ptr->f->top_field_first  = v->tff;
+
     // process pulldown flags
     s->current_picture_ptr->f->repeat_pict = 0;
     // Pulldown flags are only valid when 'broadcast' has been set.
@@ -811,13 +909,55 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
     s->me.qpel_put = s->qdsp.put_qpel_pixels_tab;
     s->me.qpel_avg = s->qdsp.avg_qpel_pixels_tab;
 
+#if FF_API_CAP_VDPAU
+    if ((CONFIG_VC1_VDPAU_DECODER)
+        &&s->avctx->codec->capabilities&AV_CODEC_CAP_HWACCEL_VDPAU) {
+        if (v->field_mode && buf_start_second_field) {
+            ff_vdpau_vc1_decode_picture(s, buf_start, buf_start_second_field - buf_start);
+            ff_vdpau_vc1_decode_picture(s, buf_start_second_field, (buf + buf_size) - buf_start_second_field);
+        } else {
+            ff_vdpau_vc1_decode_picture(s, buf_start, (buf + buf_size) - buf_start);
+        }
+    } else
+#endif
     if (avctx->hwaccel) {
-        if (avctx->hwaccel->start_frame(avctx, buf, buf_size) < 0)
-            goto err;
-        if (avctx->hwaccel->decode_slice(avctx, buf_start, (buf + buf_size) - buf_start) < 0)
-            goto err;
-        if (avctx->hwaccel->end_frame(avctx) < 0)
-            goto err;
+        if (v->field_mode && buf_start_second_field) {
+            // decode first field
+            s->picture_structure = PICT_BOTTOM_FIELD - v->tff;
+            if ((ret = avctx->hwaccel->start_frame(avctx, buf_start, buf_start_second_field - buf_start)) < 0)
+                goto err;
+            if ((ret = avctx->hwaccel->decode_slice(avctx, buf_start, buf_start_second_field - buf_start)) < 0)
+                goto err;
+            if ((ret = avctx->hwaccel->end_frame(avctx)) < 0)
+                goto err;
+
+            // decode second field
+            s->gb = slices[n_slices1 + 1].gb;
+            s->picture_structure = PICT_TOP_FIELD + v->tff;
+            v->second_field = 1;
+            v->pic_header_flag = 0;
+            if (ff_vc1_parse_frame_header_adv(v, &s->gb) < 0) {
+                av_log(avctx, AV_LOG_ERROR, "parsing header for second field failed");
+                ret = AVERROR_INVALIDDATA;
+                goto err;
+            }
+            v->s.current_picture_ptr->f->pict_type = v->s.pict_type;
+
+            if ((ret = avctx->hwaccel->start_frame(avctx, buf_start_second_field, (buf + buf_size) - buf_start_second_field)) < 0)
+                goto err;
+            if ((ret = avctx->hwaccel->decode_slice(avctx, buf_start_second_field, (buf + buf_size) - buf_start_second_field)) < 0)
+                goto err;
+            if ((ret = avctx->hwaccel->end_frame(avctx)) < 0)
+                goto err;
+        } else {
+            s->picture_structure = PICT_FRAME;
+            if ((ret = avctx->hwaccel->start_frame(avctx, buf_start, (buf + buf_size) - buf_start)) < 0)
+                goto err;
+            if ((ret = avctx->hwaccel->decode_slice(avctx, buf_start, (buf + buf_size) - buf_start)) < 0)
+                goto err;
+            if ((ret = avctx->hwaccel->end_frame(avctx)) < 0)
+                goto err;
+        }
     } else {
         int header_ret = 0;
 
@@ -834,10 +974,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
         }
         mb_height = s->mb_height >> v->field_mode;
 
-        if (!mb_height) {
-            av_log(v->s.avctx, AV_LOG_ERROR, "Invalid mb_height.\n");
-            goto err;
-        }
+        av_assert0 (mb_height > 0);
 
         for (i = 0; i <= n_slices; i++) {
             if (i > 0 &&  slices[i - 1].mby_start >= mb_height) {
@@ -848,7 +985,8 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                     continue;
                 }
                 v->second_field = 1;
-                v->blocks_off   = s->mb_width  * s->mb_height << 1;
+                av_assert0((s->mb_height & 1) == 0);
+                v->blocks_off   = s->b8_stride * (s->mb_height&~1);
                 v->mb_off       = s->mb_stride * s->mb_height >> 1;
             } else {
                 v->second_field = 0;
@@ -860,6 +998,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                 if (v->field_mode && i == n_slices1 + 2) {
                     if ((header_ret = ff_vc1_parse_frame_header_adv(v, &s->gb)) < 0) {
                         av_log(v->s.avctx, AV_LOG_ERROR, "Field header damaged\n");
+                        ret = AVERROR_INVALIDDATA;
                         if (avctx->err_recognition & AV_EF_EXPLODE)
                             goto err;
                         continue;
@@ -868,6 +1007,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                     v->pic_header_flag = 1;
                     if ((header_ret = ff_vc1_parse_frame_header_adv(v, &s->gb)) < 0) {
                         av_log(v->s.avctx, AV_LOG_ERROR, "Slice header damaged\n");
+                        ret = AVERROR_INVALIDDATA;
                         if (avctx->err_recognition & AV_EF_EXPLODE)
                             goto err;
                         continue;
@@ -879,8 +1019,21 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
             s->start_mb_y = (i == 0) ? 0 : FFMAX(0, slices[i-1].mby_start % mb_height);
             if (!v->field_mode || v->second_field)
                 s->end_mb_y = (i == n_slices     ) ? mb_height : FFMIN(mb_height, slices[i].mby_start % mb_height);
-            else
+            else {
+                if (i >= n_slices) {
+                    av_log(v->s.avctx, AV_LOG_ERROR, "first field slice count too large\n");
+                    continue;
+                }
                 s->end_mb_y = (i <= n_slices1 + 1) ? mb_height : FFMIN(mb_height, slices[i].mby_start % mb_height);
+            }
+            if (s->end_mb_y <= s->start_mb_y) {
+                av_log(v->s.avctx, AV_LOG_ERROR, "end mb y %d %d invalid\n", s->end_mb_y, s->start_mb_y);
+                continue;
+            }
+            if (!v->p_frame_skipped && s->pict_type != AV_PICTURE_TYPE_I && !v->cbpcy_vlc) {
+                av_log(v->s.avctx, AV_LOG_ERROR, "missing cbpcy_vlc\n");
+                continue;
+            }
             ff_vc1_decode_blocks(v);
             if (i != n_slices)
                 s->gb = slices[i].gb;
@@ -901,6 +1054,10 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
                 get_bits_count(&s->gb), s->gb.size_in_bits);
 //  if (get_bits_count(&s->gb) > buf_size * 8)
 //      return -1;
+        if(s->er.error_occurred && s->pict_type == AV_PICTURE_TYPE_B) {
+            ret = AVERROR_INVALIDDATA;
+            goto err;
+        }
         if (!v->field_mode)
             ff_er_frame_end(&s->er);
     }
@@ -914,7 +1071,7 @@ image:
         if (avctx->skip_frame >= AVDISCARD_NONREF)
             goto end;
 #if CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER
-        if (vc1_decode_sprites(v, &s->gb))
+        if ((ret = vc1_decode_sprites(v, &s->gb)) < 0)
             goto err;
 #endif
         if ((ret = av_frame_ref(pict, v->sprite_output_frame)) < 0)
@@ -924,12 +1081,12 @@ image:
         if (s->pict_type == AV_PICTURE_TYPE_B || s->low_delay) {
             if ((ret = av_frame_ref(pict, s->current_picture_ptr->f)) < 0)
                 goto err;
-            ff_print_debug_info(s, s->current_picture_ptr);
+            ff_print_debug_info(s, s->current_picture_ptr, pict);
             *got_frame = 1;
         } else if (s->last_picture_ptr) {
             if ((ret = av_frame_ref(pict, s->last_picture_ptr->f)) < 0)
                 goto err;
-            ff_print_debug_info(s, s->last_picture_ptr);
+            ff_print_debug_info(s, s->last_picture_ptr, pict);
             *got_frame = 1;
         }
     }
@@ -946,7 +1103,7 @@ err:
     for (i = 0; i < n_slices; i++)
         av_free(slices[i].buf);
     av_free(slices);
-    return -1;
+    return ret;
 }
 
 
@@ -999,6 +1156,38 @@ AVCodec ff_wmv3_decoder = {
 };
 #endif
 
+#if CONFIG_WMV3_VDPAU_DECODER && FF_API_VDPAU /* compiled only when both are non-zero */
+AVCodec ff_wmv3_vdpau_decoder = { /* AV_CODEC_ID_WMV3 entry whose only pixel format is AV_PIX_FMT_VDPAU_WMV3 */
+    .name           = "wmv3_vdpau",
+    .long_name      = NULL_IF_CONFIG_SMALL("Windows Media Video 9 VDPAU"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_WMV3,
+    .priv_data_size = sizeof(VC1Context),
+    .init           = vc1_decode_init,   /* same init/close/decode callbacks as ff_vc1_vdpau_decoder */
+    .close          = ff_vc1_decode_end,
+    .decode         = vc1_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_HWACCEL_VDPAU,
+    .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_VDPAU_WMV3, AV_PIX_FMT_NONE }, /* AV_PIX_FMT_NONE ends the list */
+    .profiles       = NULL_IF_CONFIG_SMALL(ff_vc1_profiles)
+};
+#endif /* CONFIG_WMV3_VDPAU_DECODER && FF_API_VDPAU */
+
+#if CONFIG_VC1_VDPAU_DECODER && FF_API_VDPAU /* compiled only when both are non-zero */
+AVCodec ff_vc1_vdpau_decoder = { /* AV_CODEC_ID_VC1 entry whose only pixel format is AV_PIX_FMT_VDPAU_VC1 */
+    .name           = "vc1_vdpau",
+    .long_name      = NULL_IF_CONFIG_SMALL("SMPTE VC-1 VDPAU"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_VC1,
+    .priv_data_size = sizeof(VC1Context),
+    .init           = vc1_decode_init,   /* same init/close/decode callbacks as ff_wmv3_vdpau_decoder */
+    .close          = ff_vc1_decode_end,
+    .decode         = vc1_decode_frame,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_HWACCEL_VDPAU,
+    .pix_fmts       = (const enum AVPixelFormat[]){ AV_PIX_FMT_VDPAU_VC1, AV_PIX_FMT_NONE }, /* AV_PIX_FMT_NONE ends the list */
+    .profiles       = NULL_IF_CONFIG_SMALL(ff_vc1_profiles)
+};
+#endif /* CONFIG_VC1_VDPAU_DECODER && FF_API_VDPAU */
+
 #if CONFIG_WMV3IMAGE_DECODER
 AVCodec ff_wmv3image_decoder = {
     .name           = "wmv3image",
diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c
index cbcc8d2..2d7e9b7 100644
--- a/libavcodec/vc1dsp.c
+++ b/libavcodec/vc1dsp.c
@@ -2,20 +2,20 @@
  * VC-1 and WMV3 decoder - DSP functions
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,9 +24,12 @@
  * VC-1 and WMV3 decoder
  */
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
 #include "h264chroma.h"
 #include "qpeldsp.h"
+#include "rnd_avg.h"
 #include "vc1dsp.h"
 #include "startcode.h"
 
@@ -581,10 +584,10 @@ static av_always_inline int vc1_mspel_filter(const uint8_t *src, int stride,
 }
 
 /* Function used to do motion compensation with bicubic interpolation */
-#define VC1_MSPEL_MC(OP, OPNAME)                                              \
+#define VC1_MSPEL_MC(OP, OP4, OPNAME)                                         \
 static av_always_inline void OPNAME ## vc1_mspel_mc(uint8_t *dst,             \
                                                     const uint8_t *src,       \
-                                                    int stride,               \
+                                                    ptrdiff_t stride,         \
                                                     int hmode,                \
                                                     int vmode,                \
                                                     int rnd)                  \
@@ -639,13 +642,93 @@ static av_always_inline void OPNAME ## vc1_mspel_mc(uint8_t *dst,             \
         dst += stride;                                                        \
         src += stride;                                                        \
     }                                                                         \
+}\
+static av_always_inline void OPNAME ## vc1_mspel_mc_16(uint8_t *dst,          \
+                                                       const uint8_t *src,    \
+                                                       ptrdiff_t stride,      \
+                                                       int hmode,             \
+                                                       int vmode,             \
+                                                       int rnd)               \
+{                                                                             \
+    int i, j;                                                                 \
+                                                                              \
+    if (vmode) { /* Horizontal filter to apply */                             \
+        int r;                                                                \
+                                                                              \
+        if (hmode) { /* Vertical filter to apply, output to tmp */            \
+            static const int shift_value[] = { 0, 5, 1, 5 };                  \
+            int shift = (shift_value[hmode] + shift_value[vmode]) >> 1;       \
+            int16_t tmp[19 * 16], *tptr = tmp;                                \
+                                                                              \
+            r = (1 << (shift - 1)) + rnd - 1;                                 \
+                                                                              \
+            src -= 1;                                                         \
+            for (j = 0; j < 16; j++) {                                        \
+                for (i = 0; i < 19; i++)                                      \
+                    tptr[i] = (vc1_mspel_ver_filter_16bits(src + i, stride, vmode) + r) >> shift; \
+                src  += stride;                                               \
+                tptr += 19;                                                   \
+            }                                                                 \
+                                                                              \
+            r    = 64 - rnd;                                                  \
+            tptr = tmp + 1;                                                   \
+            for (j = 0; j < 16; j++) {                                        \
+                for (i = 0; i < 16; i++)                                      \
+                    OP(dst[i], (vc1_mspel_hor_filter_16bits(tptr + i, 1, hmode) + r) >> 7); \
+                dst  += stride;                                               \
+                tptr += 19;                                                   \
+            }                                                                 \
+                                                                              \
+            return;                                                           \
+        } else { /* No horizontal filter, output 8 lines to dst */            \
+            r = 1 - rnd;                                                      \
+                                                                              \
+            for (j = 0; j < 16; j++) {                                        \
+                for (i = 0; i < 16; i++)                                      \
+                    OP(dst[i], vc1_mspel_filter(src + i, stride, vmode, r));  \
+                src += stride;                                                \
+                dst += stride;                                                \
+            }                                                                 \
+            return;                                                           \
+        }                                                                     \
+    }                                                                         \
+                                                                              \
+    /* Horizontal mode with no vertical mode */                               \
+    for (j = 0; j < 16; j++) {                                                \
+        for (i = 0; i < 16; i++)                                              \
+            OP(dst[i], vc1_mspel_filter(src + i, 1, hmode, rnd));             \
+        dst += stride;                                                        \
+        src += stride;                                                        \
+    }                                                                         \
+}\
+static void OPNAME ## pixels8x8_c(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int rnd){\
+    int i;\
+    for(i=0; i<8; i++){\
+        OP4(*(uint32_t*)(block  ), AV_RN32(pixels  ));\
+        OP4(*(uint32_t*)(block+4), AV_RN32(pixels+4));\
+        pixels+=line_size;\
+        block +=line_size;\
+    }\
+}\
+static void OPNAME ## pixels16x16_c(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int rnd){\
+    int i;\
+    for(i=0; i<16; i++){\
+        OP4(*(uint32_t*)(block   ), AV_RN32(pixels   ));\
+        OP4(*(uint32_t*)(block+ 4), AV_RN32(pixels+ 4));\
+        OP4(*(uint32_t*)(block+ 8), AV_RN32(pixels+ 8));\
+        OP4(*(uint32_t*)(block+12), AV_RN32(pixels+12));\
+        pixels+=line_size;\
+        block +=line_size;\
+    }\
 }
 
-#define op_put(a, b) a = av_clip_uint8(b)
-#define op_avg(a, b) a = (a + av_clip_uint8(b) + 1) >> 1
+#define op_put(a, b) (a) = av_clip_uint8(b) /* OP for put_*: overwrite a */
+#define op_avg(a, b) (a) = ((a) + av_clip_uint8(b) + 1) >> 1 /* OP for avg_*: mean with the old a, halves rounded up */
+#define op4_avg(a, b) (a) = rnd_avg32(a, b) /* OP4 for avg_*: applied to 32-bit words read with AV_RN32 */
+#define op4_put(a, b) (a) = (b) /* OP4 for put_*: plain 32-bit copy */
 
-VC1_MSPEL_MC(op_put, put_)
-VC1_MSPEL_MC(op_avg, avg_)
+VC1_MSPEL_MC(op_put, op4_put, put_)
+VC1_MSPEL_MC(op_avg, op4_avg, avg_)
 
 /* pixel functions - really are entry points to vc1_mspel_mc */
 
@@ -661,6 +744,18 @@ static void avg_vc1_mspel_mc ## a ## b ## _c(uint8_t *dst,                    \
                                              ptrdiff_t stride, int rnd)       \
 {                                                                             \
     avg_vc1_mspel_mc(dst, src, stride, a, b, rnd);                            \
+}                                                                             \
+static void put_vc1_mspel_mc ## a ## b ## _16_c(uint8_t *dst,                 \
+                                                const uint8_t *src,           \
+                                                ptrdiff_t stride, int rnd)    \
+{                                                                             \
+    put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                         \
+}                                                                             \
+static void avg_vc1_mspel_mc ## a ## b ## _16_c(uint8_t *dst,                 \
+                                                const uint8_t *src,           \
+                                                ptrdiff_t stride, int rnd)    \
+{                                                                             \
+    avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                         \
 }
 
 PUT_VC1_MSPEL(1, 0)
@@ -682,19 +777,6 @@ PUT_VC1_MSPEL(1, 3)
 PUT_VC1_MSPEL(2, 3)
 PUT_VC1_MSPEL(3, 3)
 
-
-static void put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride, int rnd)
-{
-    ff_put_pixels8x8_c(dst, src, stride);
-}
-
-static void avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride, int rnd)
-{
-    ff_avg_pixels8x8_c(dst, src, stride);
-}
-
 #define chroma_mc(a) \
     ((A * src[a] + B * src[a + 1] + \
       C * src[stride + a] + D * src[stride + a + 1] + 32 - 4) >> 6)
@@ -708,7 +790,7 @@ static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst /* align 8 */,
     const int D =     (x) *     (y);
     int i;
 
-    assert(x < 8 && y < 8 && x >= 0 && y >= 0);
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
     for (i = 0; i < h; i++) {
         dst[0] = chroma_mc(0);
@@ -733,7 +815,7 @@ static void put_no_rnd_vc1_chroma_mc4_c(uint8_t *dst, uint8_t *src,
     const int D =     (x) *     (y);
     int i;
 
-    assert(x < 8 && y < 8 && x >= 0 && y >= 0);
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
     for (i = 0; i < h; i++) {
         dst[0] = chroma_mc(0);
@@ -756,7 +838,7 @@ static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst /* align 8 */,
     const int D =     (x) *     (y);
     int i;
 
-    assert(x < 8 && y < 8 && x >= 0 && y >= 0);
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
     for (i = 0; i < h; i++) {
         dst[0] = avg2(dst[0], chroma_mc(0));
@@ -782,7 +864,7 @@ static void avg_no_rnd_vc1_chroma_mc4_c(uint8_t *dst /* align 8 */,
     const int D = (    x) * (    y);
     int i;
 
-    assert(x < 8 && y < 8 && x >= 0 && y >= 0);
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
     for (i = 0; i < h; i++) {
         dst[0] = avg2(dst[0], chroma_mc(0));
@@ -877,6 +959,11 @@ static void sprite_v_double_twoscale_c(uint8_t *dst,
 }
 
 #endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */
+#define FN_ASSIGN(X, Y) do { /* fill the put/avg slots for mspel position (X, Y); [1] = 8x8, [0] = 16x16 */ \
+    dsp->put_vc1_mspel_pixels_tab[1][(X)+4*(Y)] = put_vc1_mspel_mc##X##Y##_c; \
+    dsp->put_vc1_mspel_pixels_tab[0][(X)+4*(Y)] = put_vc1_mspel_mc##X##Y##_16_c; \
+    dsp->avg_vc1_mspel_pixels_tab[1][(X)+4*(Y)] = avg_vc1_mspel_mc##X##Y##_c; \
+    dsp->avg_vc1_mspel_pixels_tab[0][(X)+4*(Y)] = avg_vc1_mspel_mc##X##Y##_16_c; } while (0)
 
 av_cold void ff_vc1dsp_init(VC1DSPContext *dsp)
 {
@@ -901,39 +988,28 @@ av_cold void ff_vc1dsp_init(VC1DSPContext *dsp)
     dsp->vc1_v_loop_filter16  = vc1_v_loop_filter16_c;
     dsp->vc1_h_loop_filter16  = vc1_h_loop_filter16_c;
 
-    dsp->put_vc1_mspel_pixels_tab[0]  = put_vc1_mspel_mc00_c;
-    dsp->put_vc1_mspel_pixels_tab[1]  = put_vc1_mspel_mc10_c;
-    dsp->put_vc1_mspel_pixels_tab[2]  = put_vc1_mspel_mc20_c;
-    dsp->put_vc1_mspel_pixels_tab[3]  = put_vc1_mspel_mc30_c;
-    dsp->put_vc1_mspel_pixels_tab[4]  = put_vc1_mspel_mc01_c;
-    dsp->put_vc1_mspel_pixels_tab[5]  = put_vc1_mspel_mc11_c;
-    dsp->put_vc1_mspel_pixels_tab[6]  = put_vc1_mspel_mc21_c;
-    dsp->put_vc1_mspel_pixels_tab[7]  = put_vc1_mspel_mc31_c;
-    dsp->put_vc1_mspel_pixels_tab[8]  = put_vc1_mspel_mc02_c;
-    dsp->put_vc1_mspel_pixels_tab[9]  = put_vc1_mspel_mc12_c;
-    dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_c;
-    dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_c;
-    dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_c;
-    dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_c;
-    dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_c;
-    dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_c;
-
-    dsp->avg_vc1_mspel_pixels_tab[0]  = avg_vc1_mspel_mc00_c;
-    dsp->avg_vc1_mspel_pixels_tab[1]  = avg_vc1_mspel_mc10_c;
-    dsp->avg_vc1_mspel_pixels_tab[2]  = avg_vc1_mspel_mc20_c;
-    dsp->avg_vc1_mspel_pixels_tab[3]  = avg_vc1_mspel_mc30_c;
-    dsp->avg_vc1_mspel_pixels_tab[4]  = avg_vc1_mspel_mc01_c;
-    dsp->avg_vc1_mspel_pixels_tab[5]  = avg_vc1_mspel_mc11_c;
-    dsp->avg_vc1_mspel_pixels_tab[6]  = avg_vc1_mspel_mc21_c;
-    dsp->avg_vc1_mspel_pixels_tab[7]  = avg_vc1_mspel_mc31_c;
-    dsp->avg_vc1_mspel_pixels_tab[8]  = avg_vc1_mspel_mc02_c;
-    dsp->avg_vc1_mspel_pixels_tab[9]  = avg_vc1_mspel_mc12_c;
-    dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_c;
-    dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_c;
-    dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_c;
-    dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_c;
-    dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_c;
-    dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_c;
+    dsp->put_vc1_mspel_pixels_tab[0][0] = put_pixels16x16_c;
+    dsp->avg_vc1_mspel_pixels_tab[0][0] = avg_pixels16x16_c;
+    dsp->put_vc1_mspel_pixels_tab[1][0] = put_pixels8x8_c;
+    dsp->avg_vc1_mspel_pixels_tab[1][0] = avg_pixels8x8_c;
+    FN_ASSIGN(0, 1);
+    FN_ASSIGN(0, 2);
+    FN_ASSIGN(0, 3);
+
+    FN_ASSIGN(1, 0);
+    FN_ASSIGN(1, 1);
+    FN_ASSIGN(1, 2);
+    FN_ASSIGN(1, 3);
+
+    FN_ASSIGN(2, 0);
+    FN_ASSIGN(2, 1);
+    FN_ASSIGN(2, 2);
+    FN_ASSIGN(2, 3);
+
+    FN_ASSIGN(3, 0);
+    FN_ASSIGN(3, 1);
+    FN_ASSIGN(3, 2);
+    FN_ASSIGN(3, 3);
 
     dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = put_no_rnd_vc1_chroma_mc8_c;
     dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = avg_no_rnd_vc1_chroma_mc8_c;
diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h
index f2b0c70..3d48f09 100644
--- a/libavcodec/vc1dsp.h
+++ b/libavcodec/vc1dsp.h
@@ -2,20 +2,20 @@
  * VC-1 and WMV3 decoder - DSP functions
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,6 +30,8 @@
 #include "hpeldsp.h"
 #include "h264chroma.h"
 
+typedef void (*vc1op_pixels_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, ptrdiff_t line_size, int rnd/*round value, not a height*/);
+
 typedef struct VC1DSPContext {
     /* vc1 functions */
     void (*vc1_inv_trans_8x8)(int16_t *b);
@@ -54,8 +56,8 @@ typedef struct VC1DSPContext {
     /* put 8x8 block with bicubic interpolation and quarterpel precision
      * last argument is actually round value instead of height
      */
-    op_pixels_func put_vc1_mspel_pixels_tab[16];
-    op_pixels_func avg_vc1_mspel_pixels_tab[16];
+    vc1op_pixels_func put_vc1_mspel_pixels_tab[2][16];
+    vc1op_pixels_func avg_vc1_mspel_pixels_tab[2][16];
 
     /* This is really one func used in VC-1 decoding */
     h264_chroma_mc_func put_no_rnd_vc1_chroma_pixels_tab[3];
diff --git a/libavcodec/vc2enc.c b/libavcodec/vc2enc.c
new file mode 100644
index 0000000..bbbeaa0
--- /dev/null
+++ b/libavcodec/vc2enc.c
@@ -0,0 +1,1292 @@
+/*
+ * Copyright (C) 2016 Open Broadcast Systems Ltd.
+ * Author        2016 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/pixdesc.h"
+#include "libavutil/opt.h"
+#include "dirac.h"
+#include "put_bits.h"
+#include "internal.h"
+#include "version.h"
+
+#include "vc2enc_dwt.h"
+#include "diractab.h"
+
+/* Quantizations above this usually zero coefficients and lower the quality */
+#define MAX_QUANT_INDEX FF_ARRAY_ELEMS(ff_dirac_qscale_tab)
+
+/* Total range is -COEF_LUT_TAB to +COEF_LUT_TAB, but total tab size is half
+ * (COEF_LUT_TAB*MAX_QUANT_INDEX) since the sign is appended during encoding */
+#define COEF_LUT_TAB 2048
+
+/* The limited size resolution of each slice forces us to do this (uses the VC2EncContext *s of the calling scope) */
+#define SSIZE_ROUND(b) (FFALIGN((b), s->size_scaler) + 4 + s->prefix_bytes)
+
+/* Decides the cutoff point in # of slices to distribute the leftover bytes */
+#define SLICE_REDIST_TOTAL 150
+
+typedef struct VC2BaseVideoFormat { /* one row of base_video_fmts[] */
+    enum AVPixelFormat pix_fmt;
+    AVRational time_base; /* e.g. { 1001, 60000 } in the "HD720P-60" row */
+    int width, height, interlaced, level; /* interlaced is 1 in the rows named ...I-50/I-60 */
+    const char *name;
+} VC2BaseVideoFormat;
+
+static const VC2BaseVideoFormat base_video_fmts[] = { /* pix_fmt, time_base, width, height, interlaced, level, name */
+    { 0 }, /* Custom format, here just to make indexing equal to base_vf */
+    { AV_PIX_FMT_YUV420P,   { 1001, 15000 },  176,  120, 0, 1,     "QSIF525" },
+    { AV_PIX_FMT_YUV420P,   {    2,    25 },  176,  144, 0, 1,     "QCIF"    },
+    { AV_PIX_FMT_YUV420P,   { 1001, 15000 },  352,  240, 0, 1,     "SIF525"  },
+    { AV_PIX_FMT_YUV420P,   {    2,    25 },  352,  288, 0, 1,     "CIF"     },
+    { AV_PIX_FMT_YUV420P,   { 1001, 15000 },  704,  480, 0, 1,     "4SIF525" },
+    { AV_PIX_FMT_YUV420P,   {    2,    25 },  704,  576, 0, 1,     "4CIF"    },
+
+    { AV_PIX_FMT_YUV422P10, { 1001, 30000 },  720,  480, 1, 2,   "SD480I-60" },
+    { AV_PIX_FMT_YUV422P10, {    1,    25 },  720,  576, 1, 2,   "SD576I-50" },
+
+    { AV_PIX_FMT_YUV422P10, { 1001, 60000 }, 1280,  720, 0, 3,  "HD720P-60"  },
+    { AV_PIX_FMT_YUV422P10, {    1,    50 }, 1280,  720, 0, 3,  "HD720P-50"  },
+    { AV_PIX_FMT_YUV422P10, { 1001, 30000 }, 1920, 1080, 1, 3,  "HD1080I-60" },
+    { AV_PIX_FMT_YUV422P10, {    1,    25 }, 1920, 1080, 1, 3,  "HD1080I-50" },
+    { AV_PIX_FMT_YUV422P10, { 1001, 60000 }, 1920, 1080, 0, 3,  "HD1080P-60" },
+    { AV_PIX_FMT_YUV422P10, {    1,    50 }, 1920, 1080, 0, 3,  "HD1080P-50" },
+
+    { AV_PIX_FMT_YUV444P12, {    1,    24 }, 2048, 1080, 0, 4,        "DC2K" },
+    { AV_PIX_FMT_YUV444P12, {    1,    24 }, 4096, 2160, 0, 5,        "DC4K" },
+
+    { AV_PIX_FMT_YUV422P10, { 1001, 60000 }, 3840, 2160, 0, 6, "UHDTV 4K-60" },
+    { AV_PIX_FMT_YUV422P10, {    1,    50 }, 3840, 2160, 0, 6, "UHDTV 4K-50" },
+
+    { AV_PIX_FMT_YUV422P10, { 1001, 60000 }, 7680, 4320, 0, 7, "UHDTV 8K-60" },
+    { AV_PIX_FMT_YUV422P10, {    1,    50 }, 7680, 4320, 0, 7, "UHDTV 8K-50" },
+
+    { AV_PIX_FMT_YUV422P10, { 1001, 24000 }, 1920, 1080, 0, 3,  "HD1080P-24" },
+    { AV_PIX_FMT_YUV422P10, { 1001, 30000 },  720,  486, 1, 2,  "SD Pro486"  },
+};
+static const int base_video_fmts_len = FF_ARRAY_ELEMS(base_video_fmts); /* counts the placeholder row at index 0 too */
+
+enum VC2_QM { /* type of VC2EncContext.quant_matrix */
+    VC2_QM_DEF = 0,
+    VC2_QM_COL,
+    VC2_QM_FLAT,
+
+    VC2_QM_NB /* number of enumerators above (3) */
+};
+
+typedef struct SubBand { /* element type of Plane.band[][] */
+    dwtcoef *buf;
+    ptrdiff_t stride; /* NOTE(review): unit (bytes or dwtcoef elements) is not visible here - confirm */
+    int width;
+    int height;
+} SubBand;
+
+typedef struct Plane { /* VC2EncContext holds three of these (plane[3]) */
+    SubBand band[MAX_DWT_LEVELS][4]; /* presumably [transform level][orientation] - confirm */
+    dwtcoef *coef_buf;
+    int width;
+    int height;
+    int dwt_width;  /* NOTE(review): presumably the dimensions used by the transform - confirm */
+    int dwt_height;
+    ptrdiff_t coef_stride;
+} Plane;
+
+typedef struct SliceArgs { /* element type of VC2EncContext.slice_args */
+    PutBitContext pb;
+    int cache[MAX_QUANT_INDEX]; /* one slot per entry of ff_dirac_qscale_tab */
+    void *ctx; /* presumably the owning VC2EncContext - confirm */
+    int x; /* presumably the slice coordinates - confirm */
+    int y;
+    int quant_idx;
+    int bits_ceil;
+    int bits_floor;
+    int bytes;
+} SliceArgs;
+
+typedef struct TransformArgs { /* element type of VC2EncContext.transform_args[3] */
+    void *ctx; /* presumably the owning VC2EncContext - confirm */
+    Plane *plane;
+    void *idata; /* untyped on purpose or by necessity; pixel layout not visible here */
+    ptrdiff_t istride;
+    int field;
+    VC2TransformContext t;
+} TransformArgs;
+
+typedef struct VC2EncContext { /* encoder state passed to the encode_*() helpers as s */
+    AVClass *av_class;
+    PutBitContext pb; /* bit writer all header helpers append to */
+    Plane plane[3];
+    AVCodecContext *avctx;
+    DiracVersionInfo ver; /* major/minor written by encode_parse_params() */
+
+    SliceArgs *slice_args;
+    TransformArgs transform_args[3]; /* same count as plane[] */
+
+    /* For conversion from unsigned pixel values to signed */
+    int diff_offset;
+    int bpp;
+    int bpp_idx;
+
+    /* Picture number */
+    uint32_t picture_number;
+
+    /* Base video format */
+    int base_vf; /* index into base_video_fmts[]; 0 is the custom-format placeholder */
+    int level;   /* written by encode_parse_params() */
+    int profile; /* written by encode_parse_params() */
+
+    /* Quantization matrix */
+    uint8_t quant[MAX_DWT_LEVELS][4];
+    int custom_quant_matrix;
+
+    /* Coefficient LUT */
+    uint32_t *coef_lut_val;
+    uint8_t  *coef_lut_len;
+
+    int num_x; /* #slices horizontally */
+    int num_y; /* #slices vertically */
+    int prefix_bytes; /* added to every size by SSIZE_ROUND() */
+    int size_scaler;  /* alignment used by SSIZE_ROUND() */
+    int chroma_x_shift; /* with chroma_y_shift selects 444/422/420 in encode_sample_fmt() */
+    int chroma_y_shift;
+
+    /* Rate control stuff */
+    int slice_max_bytes;
+    int slice_min_bytes;
+    int q_ceil;
+    int q_avg;
+
+    /* Options */
+    double tolerance;
+    int wavelet_idx;
+    int wavelet_depth;
+    int strict_compliance; /* non-zero: header helpers write a 0 flag and no custom values */
+    int slice_height;
+    int slice_width;
+    int interlaced;
+    enum VC2_QM quant_matrix;
+
+    /* Parse code state */
+    uint32_t next_parse_offset; /* byte position of the latest parse info header, see encode_parse_info() */
+    enum DiracParseCodes last_parse_code;
+} VC2EncContext;
+
+/* Write val to pb as a VC-2 unsigned interleaved exp-Golomb code.
+ *
+ * For each bit of (val + 1) below its leading one, MSB first, a '0' follow
+ * bit and then the bit itself are written; a single '1' closes the code,
+ * so val == 0 is just that '1'.
+ *
+ * pb  - bit writer to append to
+ * val - value to encode; the whole uint32_t range is handled */
+static av_always_inline void put_vc2_ue_uint(PutBitContext *pb, uint32_t val)
+{
+    /* Widened before the increment so that UINT32_MAX does not wrap to 0 */
+    const uint64_t code = (uint64_t)val + 1;
+    int bits = 0, i;
+
+    /* bits = index of the leading one of code = number of (follow, data) pairs */
+    while (code >> (bits + 1))
+        bits++;
+
+    /* The pairs are emitted one at a time rather than packed into one word:
+     * a full code is up to 2*32 + 1 bits long, more than an int accumulator
+     * can collect without overflowing. The resulting bitstream is the same. */
+    for (i = bits - 1; i >= 0; i--)
+        put_bits(pb, 2, (unsigned)((code >> i) & 1));
+
+    /* Terminating '1' */
+    put_bits(pb, 1, 1);
+}
+
+/* Bit length of the code put_vc2_ue_uint() writes for val: two bits for each
+ * bit of (val + 1) below its leading one, plus the closing bit.
+ * Returns 1 for val == 0 and at most 65 (for val == UINT32_MAX). */
+static av_always_inline int count_vc2_ue_uint(uint32_t val)
+{
+    /* Widened before the increment so that UINT32_MAX does not wrap to 0 */
+    const uint64_t code = (uint64_t)val + 1;
+    int bits = 0;
+
+    /* bits = index of the leading one of code */
+    while (code >> (bits + 1))
+        bits++;
+
+    return bits*2 + 1;
+}
+
+/* Build, without writing anything, the interleaved code for val: below the
+ * leading one of (val + 1) every bit becomes the pair (0, bit), MSB first,
+ * and a final 1 is appended.
+ *
+ * val   - value to code; any val <= 0 gives the one-bit code 1
+ * nbits - receives the code length in bits (2 * pairs + 1)
+ * eval  - receives the code bits, right-aligned */
+static av_always_inline void get_vc2_ue_uint(int val, uint8_t *nbits,
+                                             uint32_t *eval)
+{
+    const int num = val + 1;
+    int lead = 1, n_pairs = 0, packed = 0;
+    int k;
+
+    /* Move lead up to the highest set bit of num, counting the steps */
+    while ((num >> 1) >= lead) {
+        lead <<= 1;
+        n_pairs++;
+    }
+
+    /* Walk back down, appending one (0, bit) pair per step */
+    for (k = n_pairs; k > 0; k--) {
+        lead >>= 1;
+        packed = (packed << 2) | !!(num & lead);
+    }
+
+    /* Length first, then the bits with the closing 1 */
+    *nbits = 2*n_pairs + 1;
+    *eval  = (packed << 1) | 1;
+}
+
+/* VC-2 10.4 - parse_info(): writes magic, parse code and two 32-bit offsets, and back-patches the previous header */
+static void encode_parse_info(VC2EncContext *s, enum DiracParseCodes pcode)
+{
+    uint32_t cur_pos, dist;
+
+    avpriv_align_put_bits(&s->pb);
+
+    cur_pos = put_bits_count(&s->pb) >> 3; /* bit count -> byte position of this header */
+
+    /* Magic string (4 bytes, assuming the 0 argument means no terminator is written) */
+    avpriv_put_string(&s->pb, "BBCD", 0);
+
+    /* Parse code (1 byte) */
+    put_bits(&s->pb, 8, pcode);
+
+    /* Next parse offset: patch the previous header's field (5 = magic + parse code), then write ours */
+    dist = cur_pos - s->next_parse_offset; /* next_parse_offset still holds the previous header position */
+    AV_WB32(s->pb.buf + s->next_parse_offset + 5, dist);
+    s->next_parse_offset = cur_pos;
+    put_bits32(&s->pb, pcode == DIRAC_PCODE_END_SEQ ? 13 : 0); /* 13 = 4 + 1 + 4 + 4, the fields written here */
+
+    /* Last parse offset: distance back to the previous header, or 13 right after an end of sequence */
+    put_bits32(&s->pb, s->last_parse_code == DIRAC_PCODE_END_SEQ ? 13 : dist);
+
+    s->last_parse_code = pcode;
+}
+
+/* VC-2 11.1 - parse_parameters()
+ * Writes version, profile and level. The level tells the decoder what
+ * resolution to expect so it can quickly reject what it can't support
+ * (this codec kinda targets cheapo FPGAs without much memory). It also
+ * greatly limits our choice of formats, hence the flag to disable
+ * strict_compliance */
+static void encode_parse_params(VC2EncContext *s)
+{
+    PutBitContext *pb = &s->pb;
+
+    put_vc2_ue_uint(pb, s->ver.major); /* VC-2 demands this to be 2 */
+    put_vc2_ue_uint(pb, s->ver.minor); /* ^^ and this to be 0       */
+    put_vc2_ue_uint(pb, s->profile);   /* 3 to signal HQ profile    */
+    put_vc2_ue_uint(pb, s->level);     /* 3 - 1080/720, 6 - 4K      */
+}
+
+/* VC-2 11.3 - frame_size()
+ * One flag bit, followed by width and height only when strict compliance
+ * is disabled */
+static void encode_frame_size(VC2EncContext *s)
+{
+    const int custom = !s->strict_compliance;
+
+    put_bits(&s->pb, 1, custom);
+    if (!custom)
+        return;
+
+    put_vc2_ue_uint(&s->pb, s->avctx->width);
+    put_vc2_ue_uint(&s->pb, s->avctx->height);
+}
+
+/* VC-2 11.3.3 - color_diff_sampling_format()
+ * One flag bit, followed by the chroma format index when strict compliance
+ * is disabled */
+static void encode_sample_fmt(VC2EncContext *s)
+{
+    int idx = 0; /* 444 */
+
+    put_bits(&s->pb, 1, !s->strict_compliance);
+    if (s->strict_compliance)
+        return;
+
+    if (s->chroma_x_shift == 1) {
+        if (s->chroma_y_shift == 0)
+            idx = 1; /* 422 */
+        else if (s->chroma_y_shift == 1)
+            idx = 2; /* 420 */
+    }
+    put_vc2_ue_uint(&s->pb, idx);
+}
+
+/* VC-2 11.3.4 - scan_format()
+ * One flag bit, followed by the interlaced flag when strict compliance
+ * is disabled */
+static void encode_scan_format(VC2EncContext *s)
+{
+    PutBitContext *pb = &s->pb;
+
+    put_bits(pb, 1, !s->strict_compliance);
+    if (s->strict_compliance)
+        return;
+
+    put_vc2_ue_uint(pb, s->interlaced);
+}
+
+/* VC-2 11.3.5 - frame_rate()
+ * One flag bit; when strict compliance is disabled it is followed by index 0
+ * and then time_base.den and time_base.num, in that order */
+static void encode_frame_rate(VC2EncContext *s)
+{
+    PutBitContext *pb = &s->pb;
+
+    put_bits(pb, 1, !s->strict_compliance);
+    if (s->strict_compliance)
+        return;
+
+    put_vc2_ue_uint(pb, 0);
+    put_vc2_ue_uint(pb, s->avctx->time_base.den);
+    put_vc2_ue_uint(pb, s->avctx->time_base.num);
+}
+
+/* VC-2 11.3.6 - aspect_ratio()
+ * One flag bit; when strict compliance is disabled it is followed by index 0
+ * and the sample aspect ratio numerator and denominator */
+static void encode_aspect_ratio(VC2EncContext *s)
+{
+    PutBitContext *pb = &s->pb;
+
+    put_bits(pb, 1, !s->strict_compliance);
+    if (s->strict_compliance)
+        return;
+
+    put_vc2_ue_uint(pb, 0);
+    put_vc2_ue_uint(pb, s->avctx->sample_aspect_ratio.num);
+    put_vc2_ue_uint(pb, s->avctx->sample_aspect_ratio.den);
+}
+
+/* VC-2 11.3.7 - clean_area()
+ * The flag is always written as 0 and nothing follows it */
+static void encode_clean_area(VC2EncContext *s)
+{
+    const int custom_clean_area = 0;
+
+    put_bits(&s->pb, 1, custom_clean_area);
+}
+
+/* VC-2 11.3.8 - signal_range()
+ * One flag bit, followed by the signal range index (bpp_idx) when strict
+ * compliance is disabled */
+static void encode_signal_range(VC2EncContext *s)
+{
+    PutBitContext *pb = &s->pb;
+
+    put_bits(pb, 1, !s->strict_compliance);
+    if (s->strict_compliance)
+        return;
+
+    put_vc2_ue_uint(pb, s->bpp_idx);
+}
+
+/* VC-2 11.3.9 - color_spec()
+ * One flag bit; when strict compliance is disabled it is followed by index 0
+ * and three flagged fields: primaries, color matrix and transfer function */
+static void encode_color_spec(VC2EncContext *s)
+{
+    AVCodecContext *avctx = s->avctx;
+    PutBitContext *pb = &s->pb;
+    int val;
+
+    put_bits(pb, 1, !s->strict_compliance);
+    if (s->strict_compliance)
+        return;
+
+    put_vc2_ue_uint(pb, 0);
+
+    /* primaries */
+    put_bits(pb, 1, 1);
+    switch (avctx->color_primaries) {
+    case AVCOL_PRI_BT470BG:
+        val = 2;
+        break;
+    case AVCOL_PRI_SMPTE170M:
+    case AVCOL_PRI_SMPTE240M:
+        val = 1;
+        break;
+    default:
+        val = 0;
+        break;
+    }
+    put_vc2_ue_uint(pb, val);
+
+    /* color matrix */
+    put_bits(pb, 1, 1);
+    switch (avctx->colorspace) {
+    case AVCOL_SPC_RGB:
+        val = 3;
+        break;
+    case AVCOL_SPC_YCOCG:
+        val = 2;
+        break;
+    case AVCOL_SPC_BT470BG:
+        val = 1;
+        break;
+    default:
+        val = 0;
+        break;
+    }
+    put_vc2_ue_uint(pb, val);
+
+    /* transfer function */
+    put_bits(pb, 1, 1);
+    switch (avctx->color_trc) {
+    case AVCOL_TRC_LINEAR:
+        val = 2;
+        break;
+    case AVCOL_TRC_BT1361_ECG:
+        val = 1;
+        break;
+    default:
+        val = 0;
+        break;
+    }
+    put_vc2_ue_uint(pb, val);
+}
+
+/* VC-2 11.3 - source_parameters()
+ * Runs the individual source parameter writers in bitstream order */
+static void encode_source_params(VC2EncContext *s)
+{
+    static void (*const writers[])(VC2EncContext *) = {
+        encode_frame_size,
+        encode_sample_fmt,
+        encode_scan_format,
+        encode_frame_rate,
+        encode_aspect_ratio,
+        encode_clean_area,
+        encode_signal_range,
+        encode_color_spec,
+    };
+    unsigned n;
+
+    for (n = 0; n < sizeof(writers)/sizeof(writers[0]); n++)
+        writers[n](s);
+}
+
+/* VC-2 11 - sequence_header()
+ * Byte-aligns, then writes the parse parameters, the base video format
+ * index, the source parameters and the picture coding mode */
+static void encode_seq_header(VC2EncContext *s)
+{
+    PutBitContext *pb = &s->pb;
+
+    avpriv_align_put_bits(pb);
+
+    encode_parse_params(s);
+    put_vc2_ue_uint(pb, s->base_vf);
+    encode_source_params(s);
+
+    /* Frames or fields coding */
+    put_vc2_ue_uint(pb, s->interlaced);
+}
+
+/* VC-2 12.1 - picture_header()
+ * Byte-aligns and writes the 32 bit picture number, then advances it */
+static void encode_picture_header(VC2EncContext *s)
+{
+    PutBitContext *pb = &s->pb;
+
+    avpriv_align_put_bits(pb);
+    put_bits32(pb, s->picture_number);
+    s->picture_number++;
+}
+
+/* VC-2 12.3.4.1 - slice_parameters()
+ * Slice counts in both directions, then the prefix size and size scaler */
+static void encode_slice_params(VC2EncContext *s)
+{
+    PutBitContext *pb = &s->pb;
+
+    put_vc2_ue_uint(pb, s->num_x);
+    put_vc2_ue_uint(pb, s->num_y);
+    put_vc2_ue_uint(pb, s->prefix_bytes);
+    put_vc2_ue_uint(pb, s->size_scaler);
+}
+
+/* Quantisation matrix used for VC2_QM_COL, and for levels above 3 with
+ * VC2_QM_DEF; indexed [level][orientation] by init_quant_matrix().
+ * 1st idx = LL, second - vertical, third - horizontal, fourth - total */
+const uint8_t vc2_qm_col_tab[][4] = {
+    {20,  9, 15,  4},
+    { 0,  6,  6,  4},
+    { 0,  3,  3,  5},
+    { 0,  3,  5,  1},
+    { 0, 11, 10, 11}
+};
+
+/* All-zero quantisation matrix, same [level][orientation] layout as
+ * vc2_qm_col_tab; init_quant_matrix() uses it when quant_matrix is neither
+ * VC2_QM_DEF nor VC2_QM_COL */
+const uint8_t vc2_qm_flat_tab[][4] = {
+    { 0,  0,  0,  0},
+    { 0,  0,  0,  0},
+    { 0,  0,  0,  0},
+    { 0,  0,  0,  0},
+    { 0,  0,  0,  0}
+};
+
+/* Fills s->quant[level][orientation] for every active wavelet level and
+ * decides whether the matrix has to be signalled as custom. The default
+ * table only covers levels 0-3, so it can be used unsignalled only for
+ * depths up to 4 */
+static void init_quant_matrix(VC2EncContext *s)
+{
+    const int use_default = s->quant_matrix == VC2_QM_DEF;
+    const int use_col     = s->quant_matrix == VC2_QM_COL;
+    int lvl, o;
+
+    s->custom_quant_matrix = !(use_default && s->wavelet_depth <= 4);
+
+    for (lvl = 0; lvl < s->wavelet_depth; lvl++) {
+        for (o = 0; o < 4; o++) {
+            if (use_default && lvl <= 3)
+                s->quant[lvl][o] = ff_dirac_default_qmat[s->wavelet_idx][lvl][o];
+            else if (use_default || use_col)
+                s->quant[lvl][o] = vc2_qm_col_tab[lvl][o];
+            else
+                s->quant[lvl][o] = vc2_qm_flat_tab[lvl][o];
+        }
+    }
+}
+
+/* VC-2 12.3.4.2 - quant_matrix()
+ * One flag bit; a custom matrix follows as the LL value of level 0 and then
+ * orientations 1-3 of every level */
+static void encode_quant_matrix(VC2EncContext *s)
+{
+    PutBitContext *pb = &s->pb;
+    int lvl, o;
+
+    put_bits(pb, 1, s->custom_quant_matrix);
+    if (!s->custom_quant_matrix)
+        return;
+
+    put_vc2_ue_uint(pb, s->quant[0][0]);
+    for (lvl = 0; lvl < s->wavelet_depth; lvl++)
+        for (o = 1; o < 4; o++)
+            put_vc2_ue_uint(pb, s->quant[lvl][o]);
+}
+
+/* VC-2 12.3 - transform_parameters()
+ * Wavelet index and depth, then the slice parameters and quant matrix */
+static void encode_transform_params(VC2EncContext *s)
+{
+    PutBitContext *pb = &s->pb;
+
+    put_vc2_ue_uint(pb, s->wavelet_idx);
+    put_vc2_ue_uint(pb, s->wavelet_depth);
+
+    encode_slice_params(s);
+    encode_quant_matrix(s);
+}
+
+/* VC-2 12.2 - wavelet_transform()
+ * Transform parameters followed by byte alignment */
+static void encode_wavelet_transform(VC2EncContext *s)
+{
+    PutBitContext *pb = &s->pb;
+
+    encode_transform_params(s);
+    avpriv_align_put_bits(pb);
+}
+
+/* VC-2 12 - picture_parse()
+ * Picture header and wavelet transform parameters, each preceded by
+ * byte alignment */
+static void encode_picture_start(VC2EncContext *s)
+{
+    PutBitContext *pb = &s->pb;
+
+    avpriv_align_put_bits(pb);
+    encode_picture_header(s);
+
+    avpriv_align_put_bits(pb);
+    encode_wavelet_transform(s);
+}
+
+#define QUANT(c, qf) (((c) << 2)/(qf))
+
+/* VC-2 13.5.5.2 - slice_band()
+ * Writes the coefficients of subband b that belong to slice (sx, sy) using
+ * quantizer index quant. Magnitudes below COEF_LUT_TAB come straight from the
+ * precomputed tables, larger ones are quantized and coded on the fly */
+static void encode_subband(VC2EncContext *s, PutBitContext *pb, int sx, int sy,
+                           SubBand *b, int quant)
+{
+    /* Slice rectangle inside the subband */
+    const int x0 = b->width  * (sx+0) / s->num_x;
+    const int x1 = b->width  * (sx+1) / s->num_x;
+    const int y0 = b->height * (sy+0) / s->num_y;
+    const int y1 = b->height * (sy+1) / s->num_y;
+
+    const int scale = ff_dirac_qscale_tab[quant];
+    const uint8_t  *lens  = &s->coef_lut_len[quant*COEF_LUT_TAB];
+    const uint32_t *codes = &s->coef_lut_val[quant*COEF_LUT_TAB];
+
+    dwtcoef *row = b->buf + y0 * b->stride;
+    int col, line;
+
+    for (line = y0; line < y1; line++, row += b->stride) {
+        for (col = x0; col < x1; col++) {
+            const int sign = row[col] < 0;
+            uint32_t mag = FFABS(row[col]);
+
+            if (mag >= COEF_LUT_TAB) {
+                mag = QUANT(mag, scale);
+                put_vc2_ue_uint(pb, mag);
+                /* The sign is only coded for non-zero values */
+                if (mag)
+                    put_bits(pb, 1, sign);
+                continue;
+            }
+
+            /* The table entry leaves the lowest bit free for the sign */
+            put_bits(pb, lens[mag], codes[mag] | sign);
+        }
+    }
+}
+
+/* Returns the size in bits of the given slice when coded with quantizer
+ * index quant_idx, following the same layout encode_hq_slice() writes
+ * (prefix, quantizer byte, then per plane a length byte, coefficients and
+ * padding). Results are memoized in slice->cache[quant_idx]; a cached 0 is
+ * treated as "not computed yet". */
+static int count_hq_slice(SliceArgs *slice, int quant_idx)
+{
+    int x, y;
+    uint8_t quants[MAX_DWT_LEVELS][4];
+    int bits = 0, p, level, orientation;
+    VC2EncContext *s = slice->ctx;
+
+    if (slice->cache[quant_idx])
+        return slice->cache[quant_idx];
+
+    bits += 8*s->prefix_bytes;
+    bits += 8; /* quant_idx */
+
+    /* Per-subband quantizers, clamped at 0; level 0 also has the LL band */
+    for (level = 0; level < s->wavelet_depth; level++)
+        for (orientation = !!level; orientation < 4; orientation++)
+            quants[level][orientation] = FFMAX(quant_idx - s->quant[level][orientation], 0);
+
+    for (p = 0; p < 3; p++) {
+        int bytes_start, bytes_len, pad_s, pad_c;
+        bytes_start = bits >> 3;
+        bits += 8; /* length byte of this plane */
+        for (level = 0; level < s->wavelet_depth; level++) {
+            for (orientation = !!level; orientation < 4; orientation++) {
+                SubBand *b = &s->plane[p].band[level][orientation];
+
+                const int q_idx = quants[level][orientation];
+                const uint8_t *len_lut = &s->coef_lut_len[q_idx*COEF_LUT_TAB];
+                const int qfactor = ff_dirac_qscale_tab[q_idx];
+
+                const int left   = b->width  * slice->x    / s->num_x;
+                const int right  = b->width  *(slice->x+1) / s->num_x;
+                const int top    = b->height * slice->y    / s->num_y;
+                const int bottom = b->height *(slice->y+1) / s->num_y;
+
+                dwtcoef *buf = b->buf + top * b->stride;
+
+                for (y = top; y < bottom; y++) {
+                    for (x = left; x < right; x++) {
+                        uint32_t c_abs = FFABS(buf[x]);
+                        if (c_abs < COEF_LUT_TAB) {
+                            /* LUT length already includes the sign bit */
+                            bits += len_lut[c_abs];
+                        } else {
+                            c_abs = QUANT(c_abs, qfactor);
+                            bits += count_vc2_ue_uint(c_abs);
+                            bits += !!c_abs; /* sign bit for non-zero values */
+                        }
+                    }
+                    buf += b->stride;
+                }
+            }
+        }
+        /* Round up to a whole byte, then pad the plane's payload to a
+         * multiple of size_scaler bytes */
+        bits += FFALIGN(bits, 8) - bits;
+        bytes_len = (bits >> 3) - bytes_start - 1;
+        pad_s = FFALIGN(bytes_len, s->size_scaler)/s->size_scaler;
+        pad_c = (pad_s*s->size_scaler) - bytes_len;
+        bits += pad_c*8;
+    }
+
+    slice->cache[quant_idx] = bits;
+
+    return bits;
+}
+
+/* Approaches the best possible quantizer asymptotically, it's kinda exhaustive
+ * but we have a LUT to get the coefficient size in bits. Guaranteed to never
+ * overshoot, which is apparently very important when streaming.
+ *
+ * Job callback for avctx->execute(): arg is the SliceArgs of one slice. Starts
+ * from slice_dat->quant_idx and moves the quantizer until the slice size lies
+ * within [bits_floor, bits_ceil] or the search starts to oscillate. Stores the
+ * chosen quantizer in quant_idx and the slice size in bytes; always returns 0 */
+static int rate_control(AVCodecContext *avctx, void *arg)
+{
+    SliceArgs *slice_dat = arg;
+    VC2EncContext *s = slice_dat->ctx;
+    const int top = slice_dat->bits_ceil;
+    const int bottom = slice_dat->bits_floor;
+    int quant_buf[2] = {-1, -1}; /* the two most recently tried quantizers */
+    int quant = slice_dat->quant_idx, step = 1;
+    int bits_last, bits = count_hq_slice(slice_dat, quant);
+    while ((bits > top) || (bits < bottom)) {
+        /* Too big: raise the quantizer, too small: lower it */
+        const int signed_step = bits > top ? +step : -step;
+        quant  = av_clip(quant + signed_step, 0, s->q_ceil-1);
+        bits   = count_hq_slice(slice_dat, quant);
+        /* Back at the quantizer tried two steps ago: the search oscillates,
+         * so settle on the larger of the two candidates */
+        if (quant_buf[1] == quant) {
+            quant = FFMAX(quant_buf[0], quant);
+            bits  = quant == quant_buf[0] ? bits_last : bits;
+            break;
+        }
+        step         = av_clip(step/2, 1, (s->q_ceil-1)/2);
+        quant_buf[1] = quant_buf[0];
+        quant_buf[0] = quant;
+        bits_last    = bits;
+    }
+    slice_dat->quant_idx = av_clip(quant, 0, s->q_ceil-1);
+    slice_dat->bytes = SSIZE_ROUND(bits >> 3);
+    return 0;
+}
+
+/* Chooses a quantizer and a byte size for every slice of the current picture.
+ * Pass one runs rate_control() on all slices; pass two spends the bytes left
+ * below slice_max_bytes on lowering the quantizer of a few tracked slices.
+ * Returns the sum of all slice sizes in bytes and updates s->q_avg. */
+static int calc_slice_sizes(VC2EncContext *s)
+{
+    int i, j, slice_x, slice_y, bytes_left = 0;
+    int bytes_top[SLICE_REDIST_TOTAL] = {0};
+    int64_t total_bytes_needed = 0;
+    int slice_redist_range = FFMIN(SLICE_REDIST_TOTAL, s->num_x*s->num_y);
+    SliceArgs *enc_args = s->slice_args;
+    SliceArgs *top_loc[SLICE_REDIST_TOTAL] = {NULL};
+
+    init_quant_matrix(s);
+
+    /* Reset the per-slice job arguments and the bit count cache */
+    for (slice_y = 0; slice_y < s->num_y; slice_y++) {
+        for (slice_x = 0; slice_x < s->num_x; slice_x++) {
+            SliceArgs *args = &enc_args[s->num_x*slice_y + slice_x];
+            args->ctx = s;
+            args->x   = slice_x;
+            args->y   = slice_y;
+            args->bits_ceil  = s->slice_max_bytes << 3;
+            args->bits_floor = s->slice_min_bytes << 3;
+            memset(args->cache, 0, s->q_ceil*sizeof(*args->cache));
+        }
+    }
+
+    /* First pass - determine baseline slice sizes w.r.t. max_slice_size */
+    s->avctx->execute(s->avctx, rate_control, enc_args, NULL, s->num_x*s->num_y,
+                      sizeof(SliceArgs));
+
+    /* Sum up the unused bytes and remember candidate slices: each slice
+     * replaces the first tracked entry that it is larger than */
+    for (i = 0; i < s->num_x*s->num_y; i++) {
+        SliceArgs *args = &enc_args[i];
+        bytes_left += s->slice_max_bytes - args->bytes;
+        for (j = 0; j < slice_redist_range; j++) {
+            if (args->bytes > bytes_top[j]) {
+                bytes_top[j] = args->bytes;
+                top_loc[j]   = args;
+                break;
+            }
+        }
+    }
+
+    /* Second pass - distribute leftover bytes */
+    while (1) {
+        int distributed = 0;
+        for (i = 0; i < slice_redist_range; i++) {
+            SliceArgs *args;
+            int bits, bytes, diff, prev_bytes, new_idx;
+            if (bytes_left <= 0)
+                break;
+            /* Stop at an empty entry or a slice already at quantizer 0 */
+            if (!top_loc[i] || !top_loc[i]->quant_idx)
+                break;
+            args = top_loc[i];
+            prev_bytes = args->bytes;
+            new_idx = FFMAX(args->quant_idx - 1, 0);
+            bits  = count_hq_slice(args, new_idx);
+            bytes = SSIZE_ROUND(bits >> 3);
+            diff  = bytes - prev_bytes;
+            /* Only accept the lower quantizer if the budget still covers it */
+            if ((bytes_left - diff) > 0) {
+                args->quant_idx = new_idx;
+                args->bytes = bytes;
+                bytes_left -= diff;
+                distributed++;
+            }
+        }
+        if (!distributed)
+            break;
+    }
+
+    for (i = 0; i < s->num_x*s->num_y; i++) {
+        SliceArgs *args = &enc_args[i];
+        total_bytes_needed += args->bytes;
+        /* Running mean of the previous value and this slice's quantizer */
+        s->q_avg = (s->q_avg + args->quant_idx)/2;
+    }
+
+    return total_bytes_needed;
+}
+
+/* VC-2 13.5.3 - hq_slice
+ * Job callback for avctx->execute(): arg is the SliceArgs of one slice, whose
+ * pb must already point at the slice's output area. Writes the prefix, the
+ * quantizer index and, per plane, a length byte, the coefficients and the
+ * padding. Always returns 0. */
+static int encode_hq_slice(AVCodecContext *avctx, void *arg)
+{
+    SliceArgs *slice_dat = arg;
+    VC2EncContext *s = slice_dat->ctx;
+    PutBitContext *pb = &slice_dat->pb;
+    const int slice_x = slice_dat->x;
+    const int slice_y = slice_dat->y;
+    const int quant_idx = slice_dat->quant_idx;
+    const int slice_bytes_max = slice_dat->bytes;
+    uint8_t quants[MAX_DWT_LEVELS][4];
+    int p, level, orientation;
+
+    /* The reference decoder ignores it, and its typical length is 0 */
+    memset(put_bits_ptr(pb), 0, s->prefix_bytes);
+    skip_put_bytes(pb, s->prefix_bytes);
+
+    put_bits(pb, 8, quant_idx);
+
+    /* Slice quantization (slice_quantizers() in the specs) */
+    for (level = 0; level < s->wavelet_depth; level++)
+        for (orientation = !!level; orientation < 4; orientation++)
+            quants[level][orientation] = FFMAX(quant_idx - s->quant[level][orientation], 0);
+
+    /* Luma + 2 Chroma planes */
+    for (p = 0; p < 3; p++) {
+        int bytes_start, bytes_len, pad_s, pad_c;
+        bytes_start = put_bits_count(pb) >> 3;
+        /* Placeholder for the plane length, filled in below */
+        put_bits(pb, 8, 0);
+        for (level = 0; level < s->wavelet_depth; level++) {
+            for (orientation = !!level; orientation < 4; orientation++) {
+                encode_subband(s, pb, slice_x, slice_y,
+                               &s->plane[p].band[level][orientation],
+                               quants[level][orientation]);
+            }
+        }
+        avpriv_align_put_bits(pb);
+        bytes_len = (put_bits_count(pb) >> 3) - bytes_start - 1;
+        if (p == 2) {
+            /* The last plane also takes the bytes still unused out of the
+             * slice size chosen by rate control */
+            int len_diff = slice_bytes_max - (put_bits_count(pb) >> 3);
+            pad_s = FFALIGN((bytes_len + len_diff), s->size_scaler)/s->size_scaler;
+            pad_c = (pad_s*s->size_scaler) - bytes_len;
+        } else {
+            pad_s = FFALIGN(bytes_len, s->size_scaler)/s->size_scaler;
+            pad_c = (pad_s*s->size_scaler) - bytes_len;
+        }
+        /* Length is signalled in units of size_scaler bytes */
+        pb->buf[bytes_start] = pad_s;
+        flush_put_bits(pb);
+        /* vc2-reference uses that padding that decodes to '0' coeffs */
+        memset(put_bits_ptr(pb), 0xFF, pad_c);
+        skip_put_bytes(pb, pad_c);
+    }
+
+    return 0;
+}
+
+/* VC-2 13.5.1 - low_delay_transform_data()
+ * Gives every slice its own bit writer over consecutive regions of the output
+ * buffer, runs encode_hq_slice() on all of them through avctx->execute() and
+ * then advances the main writer past the slice data. Always returns 0. */
+static int encode_slices(VC2EncContext *s)
+{
+    const int nb_slices = s->num_x*s->num_y;
+    SliceArgs *cur = s->slice_args;
+    uint8_t *base;
+    int sx, sy, used = 0;
+
+    avpriv_align_put_bits(&s->pb);
+    flush_put_bits(&s->pb);
+    base = put_bits_ptr(&s->pb);
+
+    /* Slices are laid out back to back in raster order */
+    for (sy = 0; sy < s->num_y; sy++) {
+        for (sx = 0; sx < s->num_x; sx++) {
+            init_put_bits(&cur->pb, base + used, cur->bytes+s->prefix_bytes);
+            used += cur->bytes;
+            cur++;
+        }
+    }
+
+    s->avctx->execute(s->avctx, encode_hq_slice, s->slice_args, NULL, nb_slices,
+                      sizeof(SliceArgs));
+
+    skip_put_bytes(&s->pb, used);
+
+    return 0;
+}
+
+/*
+ * Transform basics for a 3 level transform
+ * |---------------------------------------------------------------------|
+ * |  LL-0  | HL-0  |                 |                                  |
+ * |--------|-------|      HL-1       |                                  |
+ * |  LH-0  | HH-0  |                 |                                  |
+ * |----------------|-----------------|              HL-2                |
+ * |                |                 |                                  |
+ * |     LH-1       |      HH-1       |                                  |
+ * |                |                 |                                  |
+ * |----------------------------------|----------------------------------|
+ * |                                  |                                  |
+ * |                                  |                                  |
+ * |                                  |                                  |
+ * |              LH-2                |              HH-2                |
+ * |                                  |                                  |
+ * |                                  |                                  |
+ * |                                  |                                  |
+ * |---------------------------------------------------------------------|
+ *
+ * DWT transforms are generally applied by splitting the image in two vertically
+ * and applying a low pass transform on the left part and a corresponding high
+ * pass transform on the right hand side. This is known as the horizontal filter
+ * stage.
+ * After that, the same operation is performed except the image is divided
+ * horizontally, with the high pass on the lower and the low pass on the higher
+ * side.
+ * Therefore, you're left with 4 subdivisions - known as  low-low, low-high,
+ * high-low and high-high. They're referred to as orientations in the decoder
+ * and encoder.
+ *
+ * The LL (low-low) area contains the original image downsampled by the amount
+ * of levels. The rest of the areas can be thought as the details needed
+ * to restore the image perfectly to its original size.
+ */
+/* Job callback for avctx->execute(): arg is the TransformArgs of one plane.
+ * Copies the plane (or one field of it) into the coefficient buffer with
+ * diff_offset subtracted, zeroes all padding and runs the forward transform
+ * once per wavelet level. Always returns 0. */
+static int dwt_plane(AVCodecContext *avctx, void *arg)
+{
+    TransformArgs *transform_dat = arg;
+    VC2EncContext *s = transform_dat->ctx;
+    const void *frame_data = transform_dat->idata;
+    const ptrdiff_t linesize = transform_dat->istride;
+    const int field = transform_dat->field;
+    const Plane *p = transform_dat->plane;
+    VC2TransformContext *t = &transform_dat->t;
+    dwtcoef *buf = p->coef_buf;
+    const int idx = s->wavelet_idx;
+    const int skip = 1 + s->interlaced;
+
+    int x, y, level, offset;
+    /* Input stride in samples rather than bytes */
+    ptrdiff_t pix_stride = linesize >> (s->bpp - 1);
+
+    /* Fields read every other line; field 2 starts one line down */
+    if (field == 1) {
+        offset = 0;
+        pix_stride <<= 1;
+    } else if (field == 2) {
+        offset = pix_stride;
+        pix_stride <<= 1;
+    } else {
+        offset = 0;
+    }
+
+    if (s->bpp == 1) {
+        const uint8_t *pix = (const uint8_t *)frame_data + offset;
+        for (y = 0; y < p->height*skip; y+=skip) {
+            for (x = 0; x < p->width; x++) {
+                buf[x] = pix[x] - s->diff_offset;
+            }
+            /* Clear the horizontal padding: the transform reads it, and it
+             * would otherwise hold uninitialized data or coefficients left
+             * over from the previous picture */
+            memset(buf + p->width, 0,
+                   (p->coef_stride - p->width) * sizeof(dwtcoef));
+            buf += p->coef_stride;
+            pix += pix_stride;
+        }
+    } else {
+        const uint16_t *pix = (const uint16_t *)frame_data + offset;
+        for (y = 0; y < p->height*skip; y+=skip) {
+            for (x = 0; x < p->width; x++) {
+                buf[x] = pix[x] - s->diff_offset;
+            }
+            /* Same horizontal padding cleanup as in the 8 bit path */
+            memset(buf + p->width, 0,
+                   (p->coef_stride - p->width) * sizeof(dwtcoef));
+            buf += p->coef_stride;
+            pix += pix_stride;
+        }
+    }
+
+    /* Clear the rows below the picture */
+    memset(buf, 0, p->coef_stride * (p->dwt_height - p->height) * sizeof(dwtcoef));
+
+    for (level = s->wavelet_depth-1; level >= 0; level--) {
+        const SubBand *b = &p->band[level][0];
+        t->vc2_subband_dwt[idx](t, p->coef_buf, p->coef_stride,
+                                b->width, b->height);
+    }
+
+    return 0;
+}
+
+/* Encodes one picture (a frame, or one field when field is 1 or 2) into the
+ * packet: transform, slice size calculation, then sequence header, optional
+ * auxiliary data, picture header, slices and end of sequence.
+ * The packet is allocated and s->pb initialized only when field < 2; the
+ * second field is appended to the existing writer.
+ * Returns 0 on success or the error from ff_alloc_packet2(). */
+static int encode_frame(VC2EncContext *s, AVPacket *avpkt, const AVFrame *frame,
+                        const char *aux_data, const int header_size, int field)
+{
+    int i, ret;
+    int64_t max_frame_bytes;
+
+     /* Threaded DWT transform */
+    for (i = 0; i < 3; i++) {
+        s->transform_args[i].ctx   = s;
+        s->transform_args[i].field = field;
+        s->transform_args[i].plane = &s->plane[i];
+        s->transform_args[i].idata = frame->data[i];
+        s->transform_args[i].istride = frame->linesize[i];
+    }
+    s->avctx->execute(s->avctx, dwt_plane, s->transform_args, NULL, 3,
+                      sizeof(TransformArgs));
+
+    /* Calculate per-slice quantizers and sizes */
+    max_frame_bytes = header_size + calc_slice_sizes(s);
+
+    if (field < 2) {
+        /* Interlaced content gets room for two pictures of this size */
+        ret = ff_alloc_packet2(s->avctx, avpkt,
+                               max_frame_bytes << s->interlaced,
+                               max_frame_bytes << s->interlaced);
+        if (ret) {
+            av_log(s->avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+            return ret;
+        }
+        init_put_bits(&s->pb, avpkt->data, avpkt->size);
+    }
+
+    /* Sequence header */
+    encode_parse_info(s, DIRAC_PCODE_SEQ_HEADER);
+    encode_seq_header(s);
+
+    /* Encoder version */
+    if (aux_data) {
+        encode_parse_info(s, DIRAC_PCODE_AUX);
+        avpriv_put_string(&s->pb, aux_data, 1);
+    }
+
+    /* Picture header */
+    encode_parse_info(s, DIRAC_PCODE_PICTURE_HQ);
+    encode_picture_start(s);
+
+    /* Encode slices */
+    encode_slices(s);
+
+    /* End sequence */
+    encode_parse_info(s, DIRAC_PCODE_END_SEQ);
+
+    return 0;
+}
+
+/* Per-frame encode entry point. Derives the slice byte budget from the
+ * bitrate and time base, picks a size scaler, then encodes the frame (or both
+ * fields when interlaced) into avpkt.
+ * Returns 0 and sets *got_packet to 1 on success, or the error code from
+ * encode_frame().
+ * Not marked av_cold: unlike init/close this runs for every frame. */
+static int vc2_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
+                            const AVFrame *frame, int *got_packet)
+{
+    int ret = 0;
+    int sig_size = 256;
+    VC2EncContext *s = avctx->priv_data;
+    const int bitexact = avctx->flags & AV_CODEC_FLAG_BITEXACT;
+    const char *aux_data = bitexact ? "Lavc" : LIBAVCODEC_IDENT;
+    const int aux_data_size = bitexact ? sizeof("Lavc") : sizeof(LIBAVCODEC_IDENT);
+    const int header_size = 100 + aux_data_size;
+    /* Each field of an interlaced stream gets half the bitrate */
+    int64_t max_frame_bytes, r_bitrate = avctx->bit_rate >> (s->interlaced);
+
+    s->avctx = avctx;
+    s->size_scaler = 2;
+    s->prefix_bytes = 0;
+    s->last_parse_code = 0;
+    s->next_parse_offset = 0;
+
+    /* Rate control */
+    max_frame_bytes = (av_rescale(r_bitrate, s->avctx->time_base.num,
+                                  s->avctx->time_base.den) >> 3) - header_size;
+    s->slice_max_bytes = av_rescale(max_frame_bytes, 1, s->num_x*s->num_y);
+
+    /* Find an appropriate size scaler */
+    while (sig_size > 255) {
+        int r_size = SSIZE_ROUND(s->slice_max_bytes);
+        sig_size = r_size/s->size_scaler; /* Signalled slice size */
+        s->size_scaler <<= 1;
+    }
+
+    s->slice_max_bytes = SSIZE_ROUND(s->slice_max_bytes);
+    /* Lower bound: the maximum minus the allowed undershoot in percent */
+    s->slice_min_bytes = s->slice_max_bytes - s->slice_max_bytes*(s->tolerance/100.0f);
+
+    /* field is 0 for progressive input and 1 for the first field */
+    ret = encode_frame(s, avpkt, frame, aux_data, header_size, s->interlaced);
+    if (ret)
+        return ret;
+    if (s->interlaced) {
+        ret = encode_frame(s, avpkt, frame, aux_data, header_size, 2);
+        if (ret)
+            return ret;
+    }
+
+    flush_put_bits(&s->pb);
+    avpkt->size = put_bits_count(&s->pb) >> 3;
+
+    *got_packet = 1;
+
+    return 0;
+}
+
+/* Logs the average quantizer and releases everything vc2_encode_init()
+ * allocated. Always returns 0. */
+static av_cold int vc2_encode_end(AVCodecContext *avctx)
+{
+    VC2EncContext *s = avctx->priv_data;
+    int plane = 0;
+
+    av_log(avctx, AV_LOG_INFO, "Qavg: %i\n", s->q_avg);
+
+    /* Per-plane transform state and coefficient buffers */
+    while (plane < 3) {
+        ff_vc2enc_free_transforms(&s->transform_args[plane].t);
+        av_freep(&s->plane[plane].coef_buf);
+        plane++;
+    }
+
+    /* Slice job arguments and coefficient lookup tables */
+    av_freep(&s->slice_args);
+    av_freep(&s->coef_lut_len);
+    av_freep(&s->coef_lut_val);
+
+    return 0;
+}
+
+/* Encoder init: selects a base video format, validates the slice dimensions,
+ * derives the bit depth settings, allocates the per-plane coefficient
+ * buffers, transforms and slice arguments, and precomputes the coefficient
+ * lookup tables.
+ * Returns 0 on success, AVERROR_UNKNOWN for unsupported settings and
+ * AVERROR(ENOMEM) when an allocation fails (after calling vc2_encode_end()). */
+static av_cold int vc2_encode_init(AVCodecContext *avctx)
+{
+    Plane *p;
+    SubBand *b;
+    int i, j, level, o, shift;
+    const AVPixFmtDescriptor *fmt = av_pix_fmt_desc_get(avctx->pix_fmt);
+    const int depth = fmt->comp[0].depth;
+    VC2EncContext *s = avctx->priv_data;
+
+    s->picture_number = 0;
+
+    /* Total allowed quantization range */
+    s->q_ceil    = MAX_QUANT_INDEX;
+
+    s->ver.major = 2;
+    s->ver.minor = 0;
+    s->profile   = 3;
+    s->level     = 3;
+
+    s->base_vf   = -1;
+    s->strict_compliance = 1;
+
+    s->q_avg = 0;
+    s->slice_max_bytes = 0;
+    s->slice_min_bytes = 0;
+
+    /* Mark unknown as progressive */
+    s->interlaced = !((avctx->field_order == AV_FIELD_UNKNOWN) ||
+                      (avctx->field_order == AV_FIELD_PROGRESSIVE));
+
+    /* Look for a base video format matching pixel format, time base,
+     * dimensions and scan type exactly */
+    for (i = 0; i < base_video_fmts_len; i++) {
+        const VC2BaseVideoFormat *fmt = &base_video_fmts[i];
+        if (avctx->pix_fmt != fmt->pix_fmt)
+            continue;
+        if (avctx->time_base.num != fmt->time_base.num)
+            continue;
+        if (avctx->time_base.den != fmt->time_base.den)
+            continue;
+        if (avctx->width != fmt->width)
+            continue;
+        if (avctx->height != fmt->height)
+            continue;
+        if (s->interlaced != fmt->interlaced)
+            continue;
+        s->base_vf = i;
+        s->level   = base_video_fmts[i].level;
+        break;
+    }
+
+    if (s->interlaced)
+        av_log(avctx, AV_LOG_WARNING, "Interlacing enabled!\n");
+
+    if ((s->slice_width  & (s->slice_width  - 1)) ||
+        (s->slice_height & (s->slice_height - 1))) {
+        av_log(avctx, AV_LOG_ERROR, "Slice size is not a power of two!\n");
+        return AVERROR_UNKNOWN;
+    }
+
+    if ((s->slice_width > avctx->width) ||
+        (s->slice_height > avctx->height)) {
+        av_log(avctx, AV_LOG_ERROR, "Slice size is bigger than the image!\n");
+        return AVERROR_UNKNOWN;
+    }
+
+    /* No usable base format: fall back to explicitly signalled source
+     * parameters, which is only allowed at unofficial compliance or below */
+    if (s->base_vf <= 0) {
+        if (avctx->strict_std_compliance <= FF_COMPLIANCE_UNOFFICIAL) {
+            s->strict_compliance = s->base_vf = 0;
+            av_log(avctx, AV_LOG_WARNING, "Disabling strict compliance\n");
+        } else {
+            av_log(avctx, AV_LOG_WARNING, "Given format does not strictly comply with "
+                   "the specifications, please add a -strict -1 flag to use it\n");
+            return AVERROR_UNKNOWN;
+        }
+    } else {
+        av_log(avctx, AV_LOG_INFO, "Selected base video format = %i (%s)\n",
+               s->base_vf, base_video_fmts[s->base_vf].name);
+    }
+
+    /* Chroma subsampling */
+    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_x_shift, &s->chroma_y_shift);
+
+    /* Bit depth and color range index */
+    if (depth == 8 && avctx->color_range == AVCOL_RANGE_JPEG) {
+        s->bpp = 1;
+        s->bpp_idx = 1;
+        s->diff_offset = 128;
+    } else if (depth == 8 && (avctx->color_range == AVCOL_RANGE_MPEG ||
+               avctx->color_range == AVCOL_RANGE_UNSPECIFIED)) {
+        s->bpp = 1;
+        s->bpp_idx = 2;
+        s->diff_offset = 128;
+    } else if (depth == 10) {
+        s->bpp = 2;
+        s->bpp_idx = 3;
+        s->diff_offset = 512;
+    } else {
+        /* Every other depth uses index 4 and an offset of 2048 */
+        s->bpp = 2;
+        s->bpp_idx = 4;
+        s->diff_offset = 2048;
+    }
+
+    /* Planes initialization */
+    for (i = 0; i < 3; i++) {
+        int w, h;
+        p = &s->plane[i];
+        p->width      = avctx->width  >> (i ? s->chroma_x_shift : 0);
+        p->height     = avctx->height >> (i ? s->chroma_y_shift : 0);
+        if (s->interlaced)
+            p->height >>= 1;
+        /* Transform dimensions are padded to a multiple of 2^depth */
+        p->dwt_width  = w = FFALIGN(p->width,  (1 << s->wavelet_depth));
+        p->dwt_height = h = FFALIGN(p->height, (1 << s->wavelet_depth));
+        p->coef_stride = FFALIGN(p->dwt_width, 32);
+        p->coef_buf = av_malloc(p->coef_stride*p->dwt_height*sizeof(dwtcoef));
+        if (!p->coef_buf)
+            goto alloc_fail;
+        /* Subband sizes halve with every level; all four orientations of a
+         * level are quadrants of the same coefficient buffer */
+        for (level = s->wavelet_depth-1; level >= 0; level--) {
+            w = w >> 1;
+            h = h >> 1;
+            for (o = 0; o < 4; o++) {
+                b = &p->band[level][o];
+                b->width  = w;
+                b->height = h;
+                b->stride = p->coef_stride;
+                shift = (o > 1)*b->height*b->stride + (o & 1)*b->width;
+                b->buf = p->coef_buf + shift;
+            }
+        }
+
+        /* DWT init */
+        if (ff_vc2enc_init_transforms(&s->transform_args[i].t,
+                                      s->plane[i].coef_stride,
+                                      s->plane[i].dwt_height))
+            goto alloc_fail;
+    }
+
+    /* Slices */
+    s->num_x = s->plane[0].dwt_width/s->slice_width;
+    s->num_y = s->plane[0].dwt_height/s->slice_height;
+
+    s->slice_args = av_calloc(s->num_x*s->num_y, sizeof(SliceArgs));
+    if (!s->slice_args)
+        goto alloc_fail;
+
+    /* Lookup tables */
+    s->coef_lut_len = av_malloc(COEF_LUT_TAB*(s->q_ceil+1)*sizeof(*s->coef_lut_len));
+    if (!s->coef_lut_len)
+        goto alloc_fail;
+
+    s->coef_lut_val = av_malloc(COEF_LUT_TAB*(s->q_ceil+1)*sizeof(*s->coef_lut_val));
+    if (!s->coef_lut_val)
+        goto alloc_fail;
+
+    /* For every quantizer and every magnitude below COEF_LUT_TAB store the
+     * code of the quantized value. Codes longer than one bit get an extra low
+     * bit which encode_subband() fills with the sign */
+    for (i = 0; i < s->q_ceil; i++) {
+        uint8_t  *len_lut = &s->coef_lut_len[i*COEF_LUT_TAB];
+        uint32_t *val_lut = &s->coef_lut_val[i*COEF_LUT_TAB];
+        for (j = 0; j < COEF_LUT_TAB; j++) {
+            get_vc2_ue_uint(QUANT(j, ff_dirac_qscale_tab[i]),
+                            &len_lut[j], &val_lut[j]);
+            if (len_lut[j] != 1) {
+                len_lut[j] += 1;
+                val_lut[j] <<= 1;
+            } else {
+                val_lut[j] = 1;
+            }
+        }
+    }
+
+    return 0;
+
+alloc_fail:
+    vc2_encode_end(avctx);
+    av_log(avctx, AV_LOG_ERROR, "Unable to allocate memory!\n");
+    return AVERROR(ENOMEM);
+}
+
+#define VC2ENC_FLAGS (AV_OPT_FLAG_ENCODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
+static const AVOption vc2enc_options[] = { /* private options of the VC-2 encoder, stored in VC2EncContext */
+    {"tolerance",     "Max undershoot in percent", offsetof(VC2EncContext, tolerance), AV_OPT_TYPE_DOUBLE, {.dbl = 5.0f}, 0.0f, 45.0f, VC2ENC_FLAGS, "tolerance"},
+    {"slice_width",   "Slice width",  offsetof(VC2EncContext, slice_width), AV_OPT_TYPE_INT, {.i64 = 32}, 32, 1024, VC2ENC_FLAGS, "slice_width"},
+    {"slice_height",  "Slice height", offsetof(VC2EncContext, slice_height), AV_OPT_TYPE_INT, {.i64 = 16}, 8, 1024, VC2ENC_FLAGS, "slice_height"},
+    {"wavelet_depth", "Transform depth", offsetof(VC2EncContext, wavelet_depth), AV_OPT_TYPE_INT, {.i64 = 4}, 1, 5, VC2ENC_FLAGS, "wavelet_depth"},
+    {"wavelet_type",  "Transform type",  offsetof(VC2EncContext, wavelet_idx), AV_OPT_TYPE_INT, {.i64 = VC2_TRANSFORM_9_7}, 0, VC2_TRANSFORMS_NB - 1, VC2ENC_FLAGS, "wavelet_idx"}, /* range is inclusive: the max must be the last valid index of the transform table */
+        {"9_7",          "Deslauriers-Dubuc (9,7)", 0, AV_OPT_TYPE_CONST, {.i64 = VC2_TRANSFORM_9_7},    INT_MIN, INT_MAX, VC2ENC_FLAGS, "wavelet_idx"},
+        {"5_3",          "LeGall (5,3)",            0, AV_OPT_TYPE_CONST, {.i64 = VC2_TRANSFORM_5_3},    INT_MIN, INT_MAX, VC2ENC_FLAGS, "wavelet_idx"},
+        {"haar",         "Haar (with shift)",       0, AV_OPT_TYPE_CONST, {.i64 = VC2_TRANSFORM_HAAR_S}, INT_MIN, INT_MAX, VC2ENC_FLAGS, "wavelet_idx"},
+        {"haar_noshift", "Haar (without shift)",    0, AV_OPT_TYPE_CONST, {.i64 = VC2_TRANSFORM_HAAR},   INT_MIN, INT_MAX, VC2ENC_FLAGS, "wavelet_idx"},
+    {"qm", "Custom quantization matrix", offsetof(VC2EncContext, quant_matrix), AV_OPT_TYPE_INT, {.i64 = VC2_QM_DEF}, 0, VC2_QM_NB, VC2ENC_FLAGS, "quant_matrix"}, /* NOTE(review): if VC2_QM_NB is a count, this max is one past the last matrix - confirm */
+        {"default",   "Default from the specifications", 0, AV_OPT_TYPE_CONST, {.i64 = VC2_QM_DEF}, INT_MIN, INT_MAX, VC2ENC_FLAGS, "quant_matrix"},
+        {"color",     "Prevents low bitrate discoloration", 0, AV_OPT_TYPE_CONST, {.i64 = VC2_QM_COL}, INT_MIN, INT_MAX, VC2ENC_FLAGS, "quant_matrix"},
+        {"flat",      "Optimize for PSNR", 0, AV_OPT_TYPE_CONST, {.i64 = VC2_QM_FLAT}, INT_MIN, INT_MAX, VC2ENC_FLAGS, "quant_matrix"},
+    {NULL}
+};
+
+static const AVClass vc2enc_class = {         /* AVClass describing the encoder's private context */
+    .class_name = "SMPTE VC-2 encoder",
+    .item_name  = av_default_item_name,
+    .option     = vc2enc_options,              /* option table defined in this file */
+    .category   = AV_CLASS_CATEGORY_ENCODER,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+static const AVCodecDefault vc2enc_defaults[] = { /* per-codec default option values */
+    { "b",              "600000000"   }, /* default for option "b" (presumably the bitrate, in bit/s - confirm) */
+    { NULL }, /* end of list */
+};
+
+static const enum AVPixelFormat allowed_pix_fmts[] = { /* input formats advertised via AVCodec.pix_fmts */
+    AV_PIX_FMT_YUV420P,   AV_PIX_FMT_YUV422P,   AV_PIX_FMT_YUV444P,   /* 8-bit  */
+    AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV444P10, /* 10-bit */
+    AV_PIX_FMT_YUV420P12, AV_PIX_FMT_YUV422P12, AV_PIX_FMT_YUV444P12, /* 12-bit */
+    AV_PIX_FMT_NONE /* list terminator */
+};
+
+AVCodec ff_vc2_encoder = {                     /* codec registration entry for the VC-2 encoder */
+    .name           = "vc2",
+    .long_name      = NULL_IF_CONFIG_SMALL("SMPTE VC-2"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_DIRAC,
+    .pix_fmts       = allowed_pix_fmts,
+    .capabilities   = AV_CODEC_CAP_SLICE_THREADS,
+    .caps_internal  = FF_CODEC_CAP_INIT_THREADSAFE,
+    .priv_data_size = sizeof(VC2EncContext),
+    .priv_class     = &vc2enc_class,
+    .defaults       = vc2enc_defaults,
+    .init           = vc2_encode_init,         /* lifecycle callbacks */
+    .encode2        = vc2_encode_frame,
+    .close          = vc2_encode_end,
+};
diff --git a/libavcodec/vc2enc_dwt.c b/libavcodec/vc2enc_dwt.c
new file mode 100644
index 0000000..c60b003
--- /dev/null
+++ b/libavcodec/vc2enc_dwt.c
@@ -0,0 +1,275 @@
+/*
+ * Copyright (C) 2007 Marco Gerards <marco@gnu.org>
+ * Copyright (C) 2016 Open Broadcast Systems Ltd.
+ * Author        2016 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/mem.h"
+#include "vc2enc_dwt.h"
+
+/* The transforms leave their four outputs interleaved in a 2x2 pattern in the
+ * scratch buffer. Copy them back into the coefficient plane as four separate
+ * width x height quadrants so they can be encoded or transformed again. */
+static av_always_inline void deinterleave(dwtcoef *linell, ptrdiff_t stride,
+                                          int width, int height, dwtcoef *synthl)
+{
+    int col, row;
+    const ptrdiff_t src_stride = 2*width;          /* scratch rows are tightly packed */
+    dwtcoef *linehl = &linell[width];              /* quadrant to the right of LL */
+    dwtcoef *linelh = &linell[height*stride];      /* quadrant below LL */
+    dwtcoef *linehh = &linelh[width];              /* quadrant below and right of LL */
+
+    /* Every output row consumes two consecutive rows of the scratch buffer. */
+    for (row = 0; row < height; row++) {
+        for (col = 0; col < width; col++) {
+            linell[col] = synthl[2*col];
+            linehl[col] = synthl[2*col + 1];
+            linelh[col] = synthl[2*col + src_stride];
+            linehh[col] = synthl[2*col + src_stride + 1];
+        }
+        synthl += 2*src_stride;
+        linell += stride;
+        linelh += stride;
+        linehl += stride;
+        linehh += stride;
+    }
+}
+
+/* One level of the Deslauriers-Dubuc (9,7) lifting transform over the top-left
+ * (2*width) x (2*height) area of data; width/height are the subband sizes. */
+static void vc2_subband_dwt_97(VC2TransformContext *t, dwtcoef *data,
+                               ptrdiff_t stride, int width, int height)
+{
+    int x, y;
+    dwtcoef *datal = data, *synth = t->buffer, *synthl = synth;
+    const ptrdiff_t synth_width  = width  << 1;
+    const ptrdiff_t synth_height = height << 1;
+
+    /* Copy into the scratch buffer with one extra bit of precision; multiply
+     * rather than shift, since left-shifting a negative value is undefined. */
+    for (y = 0; y < synth_height; y++) {
+        for (x = 0; x < synth_width; x++)
+            synthl[x] = datal[x] * 2;
+        synthl += synth_width;
+        datal += stride;
+    }
+
+    /* Horizontal synthesis. */
+    synthl = synth;
+    for (y = 0; y < synth_height; y++) {
+        /* Lifting stage 2. */
+        synthl[1] -= (8*synthl[0] + 9*synthl[2] - synthl[4] + 8) >> 4;
+        for (x = 1; x < width - 2; x++)
+            synthl[2*x + 1] -= (9*synthl[2*x] + 9*synthl[2*x + 2] - synthl[2*x + 4] -
+                                synthl[2 * x - 2] + 8) >> 4;
+        synthl[synth_width - 1] -= (17*synthl[synth_width - 2] -
+                                    synthl[synth_width - 4] + 8) >> 4;
+        synthl[synth_width - 3] -= (8*synthl[synth_width - 2] +
+                                    9*synthl[synth_width - 4] -
+                                    synthl[synth_width - 6] + 8) >> 4;
+        /* Lifting stage 1. */
+        synthl[0] += (synthl[1] + synthl[1] + 2) >> 2;
+        for (x = 1; x < width - 1; x++)
+            synthl[2*x] += (synthl[2*x - 1] + synthl[2*x + 1] + 2) >> 2;
+        synthl[synth_width - 2] += (synthl[synth_width - 3] +
+                                    synthl[synth_width - 1] + 2) >> 2;
+        synthl += synth_width;
+    }
+
+    /* Vertical synthesis: Lifting stage 2. */
+    synthl = synth + synth_width;
+    for (x = 0; x < synth_width; x++)
+        synthl[x] -= (8*synthl[x - synth_width] + 9*synthl[x + synth_width] -
+                      synthl[x + 3 * synth_width] + 8) >> 4;
+
+    synthl = synth + (synth_width << 1);
+    for (y = 1; y < height - 2; y++) {
+        for (x = 0; x < synth_width; x++)
+            synthl[x + synth_width] -= (9*synthl[x] +
+                                        9*synthl[x + 2 * synth_width] -
+                                        synthl[x - 2 * synth_width] -
+                                        synthl[x + 4 * synth_width] + 8) >> 4;
+        synthl += synth_width << 1;
+    }
+
+    synthl = synth + (synth_height - 1) * synth_width;
+    for (x = 0; x < synth_width; x++) {
+        synthl[x] -= (17*synthl[x - synth_width] -
+                      synthl[x - 3*synth_width] + 8) >> 4;
+        synthl[x - 2*synth_width] -= (9*synthl[x - 3*synth_width] +
+                                      8*synthl[x - 1*synth_width] -
+                                      synthl[x - 5*synth_width] + 8) >> 4;
+    }
+
+    /* Vertical synthesis: Lifting stage 1. */
+    synthl = synth;
+    for (x = 0; x < synth_width; x++)
+        synthl[x] += (synthl[x + synth_width] + synthl[x + synth_width] + 2) >> 2;
+
+    synthl = synth + (synth_width << 1);
+    for (y = 1; y < height - 1; y++) {
+        for (x = 0; x < synth_width; x++)
+            synthl[x] += (synthl[x - synth_width] + synthl[x + synth_width] + 2) >> 2;
+        synthl += synth_width << 1;
+    }
+
+    synthl = synth + (synth_height - 2) * synth_width;
+    for (x = 0; x < synth_width; x++)
+        synthl[x] += (synthl[x - synth_width] + synthl[x + synth_width] + 2) >> 2;
+
+    deinterleave(data, stride, width, height, synth);
+}
+
+/* One level of the LeGall (5,3) lifting transform over the top-left
+ * (2*width) x (2*height) area of data; width/height are the subband sizes. */
+static void vc2_subband_dwt_53(VC2TransformContext *t, dwtcoef *data,
+                               ptrdiff_t stride, int width, int height)
+{
+    int x, y;
+    dwtcoef *synth = t->buffer, *synthl = synth, *datal = data;
+    const ptrdiff_t synth_width  = width  << 1;
+    const ptrdiff_t synth_height = height << 1;
+
+    /* Copy into the scratch buffer with one extra bit of precision; multiply
+     * rather than shift, since left-shifting a negative value is undefined. */
+    for (y = 0; y < synth_height; y++) {
+        for (x = 0; x < synth_width; x++)
+            synthl[x] = datal[x] * 2;
+        synthl += synth_width;
+        datal  += stride;
+    }
+
+    /* Horizontal synthesis. */
+    synthl = synth;
+    for (y = 0; y < synth_height; y++) {
+        /* Lifting stage 2. */
+        for (x = 0; x < width - 1; x++)
+            synthl[2 * x + 1] -= (synthl[2 * x] + synthl[2 * x + 2] + 1) >> 1;
+
+        synthl[synth_width - 1] -= (2*synthl[synth_width - 2] + 1) >> 1;
+
+        /* Lifting stage 1. */
+        synthl[0] += (2*synthl[1] + 2) >> 2;
+        for (x = 1; x < width - 1; x++)
+            synthl[2 * x] += (synthl[2 * x - 1] + synthl[2 * x + 1] + 2) >> 2;
+
+        synthl[synth_width - 2] += (synthl[synth_width - 3] + synthl[synth_width - 1] + 2) >> 2;
+
+        synthl += synth_width;
+    }
+
+    /* Vertical synthesis: Lifting stage 2. */
+    synthl = synth + synth_width;
+    for (x = 0; x < synth_width; x++)
+        synthl[x] -= (synthl[x - synth_width] + synthl[x + synth_width] + 1) >> 1;
+
+    synthl = synth + (synth_width << 1);
+    for (y = 1; y < height - 1; y++) {
+        for (x = 0; x < synth_width; x++)
+            synthl[x + synth_width] -= (synthl[x] + synthl[x + synth_width * 2] + 1) >> 1;
+        synthl += (synth_width << 1);
+    }
+
+    synthl = synth + (synth_height - 1) * synth_width;
+    for (x = 0; x < synth_width; x++)
+        synthl[x] -= (2*synthl[x - synth_width] + 1) >> 1;
+
+    /* Vertical synthesis: Lifting stage 1. */
+    synthl = synth;
+    for (x = 0; x < synth_width; x++)
+        synthl[x] += (2*synthl[synth_width + x] + 2) >> 2;
+
+    synthl = synth + (synth_width << 1);
+    for (y = 1; y < height - 1; y++) {
+        for (x = 0; x < synth_width; x++)
+            synthl[x] += (synthl[x + synth_width] + synthl[x - synth_width] + 2) >> 2;
+        synthl += (synth_width << 1);
+    }
+
+    synthl = synth + (synth_height - 2)*synth_width;
+    for (x = 0; x < synth_width; x++)
+        synthl[x] += (synthl[x - synth_width] + synthl[x + synth_width] + 2) >> 2;
+
+    /* Move the interleaved result back into data as four separate quadrants. */
+    deinterleave(data, stride, width, height, synth);
+}
+
+/* One level of the Haar transform with inputs pre-scaled by 2^s. Scaling uses
+ * a multiply because left-shifting a negative coefficient is undefined. */
+static av_always_inline void dwt_haar(VC2TransformContext *t, dwtcoef *data,
+                                      ptrdiff_t stride, int width, int height,
+                                      const int s)
+{
+    int x, y;
+    dwtcoef *synth = t->buffer, *synthl = synth, *datal = data;
+    const ptrdiff_t synth_width  = width  << 1;
+    const ptrdiff_t synth_height = height << 1;
+
+    /* Horizontal synthesis: reads data, writes the scratch buffer. */
+    for (y = 0; y < synth_height; y++) {
+        for (x = 0; x < synth_width; x += 2) {
+            synthl[y*synth_width + x + 1] = datal[y*stride + x + 1] * (1 << s) -
+                                            datal[y*stride + x] * (1 << s);
+            synthl[y*synth_width + x] = datal[y*stride + x + 0] * (1 << s) +
+                                        ((synthl[y*synth_width + x + 1] + 1) >> 1);
+        }
+    }
+
+    /* Vertical synthesis: in place on the scratch buffer, two rows at a time. */
+    for (x = 0; x < synth_width; x++) {
+        for (y = 0; y < synth_height; y += 2) {
+            synthl[(y + 1)*synth_width + x] -= synthl[y*synth_width + x];
+            synthl[y*synth_width + x] += (synthl[(y + 1)*synth_width + x] + 1) >> 1;
+        }
+    }
+
+    deinterleave(data, stride, width, height, synth);
+}
+
+static void vc2_subband_dwt_haar(VC2TransformContext *t, dwtcoef *data,
+                                 ptrdiff_t stride, int width, int height)
+{
+    dwt_haar(t, data, stride, width, height, 0); /* s = 0: Haar variant without the input shift */
+}
+
+static void vc2_subband_dwt_haar_shift(VC2TransformContext *t, dwtcoef *data,
+                                       ptrdiff_t stride, int width, int height)
+{
+    dwt_haar(t, data, stride, width, height, 1); /* s = 1: inputs are shifted up by one bit first */
+}
+
+/* Fills in the transform routine table and allocates the scratch buffer.
+ * Returns 0 on success, 1 if the buffer could not be allocated. */
+av_cold int ff_vc2enc_init_transforms(VC2TransformContext *s, int p_width, int p_height)
+{
+    s->vc2_subband_dwt[VC2_TRANSFORM_9_7]    = vc2_subband_dwt_97;
+    s->vc2_subband_dwt[VC2_TRANSFORM_5_3]    = vc2_subband_dwt_53;
+    s->vc2_subband_dwt[VC2_TRANSFORM_HAAR]   = vc2_subband_dwt_haar;
+    s->vc2_subband_dwt[VC2_TRANSFORM_HAAR_S] = vc2_subband_dwt_haar_shift;
+
+    /* Same size as before (2*p_width*p_height coefficients), but computed in
+     * size_t with an overflow check instead of plain int arithmetic. */
+    s->buffer = av_malloc_array((size_t)p_width * 2, (size_t)p_height * sizeof(dwtcoef));
+    return !s->buffer;
+}
+
+av_cold void ff_vc2enc_free_transforms(VC2TransformContext *s)
+{
+    av_freep(&s->buffer); /* releases the scratch buffer allocated by ff_vc2enc_init_transforms() */
+}
diff --git a/libavcodec/vc2enc_dwt.h b/libavcodec/vc2enc_dwt.h
new file mode 100644
index 0000000..7fbbfbe
--- /dev/null
+++ b/libavcodec/vc2enc_dwt.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2016 Open Broadcast Systems Ltd.
+ * Author        2016 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_VC2ENC_DWT_H
+#define AVCODEC_VC2ENC_DWT_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+typedef int32_t dwtcoef;
+
+enum VC2TransformType { /* values index VC2TransformContext.vc2_subband_dwt */
+    VC2_TRANSFORM_9_7    = 0,   /* Deslauriers-Dubuc (9,7)  */
+    VC2_TRANSFORM_5_3    = 1,   /* LeGall (5,3)             */
+    VC2_TRANSFORM_13_7   = 2,   /* Deslauriers-Dubuc (13,7) */
+    VC2_TRANSFORM_HAAR   = 3,   /* Haar without shift       */
+    VC2_TRANSFORM_HAAR_S = 4,   /* Haar with 1 shift/lvl    */
+    VC2_TRANSFORM_FIDEL  = 5,   /* Fidelity filter          */
+    VC2_TRANSFORM_9_7_I  = 6,   /* Daubechies (9,7)         */
+
+    VC2_TRANSFORMS_NB           /* number of transform types, not a transform itself */
+};
+
+typedef struct VC2TransformContext {
+    dwtcoef *buffer; /* scratch area the transform routines work in before writing results back */
+    void (*vc2_subband_dwt[VC2_TRANSFORMS_NB])(struct VC2TransformContext *t,
+                                               dwtcoef *data, ptrdiff_t stride,
+                                               int width, int height); /* one routine slot per enum VC2TransformType value */
+} VC2TransformContext;
+
+int  ff_vc2enc_init_transforms(VC2TransformContext *t, int p_width, int p_height);
+void ff_vc2enc_free_transforms(VC2TransformContext *t);
+
+#endif /* AVCODEC_VC2ENC_DWT_H */
diff --git a/libavcodec/vcr1.c b/libavcodec/vcr1.c
index 76c47eb..28a5eec 100644
--- a/libavcodec/vcr1.c
+++ b/libavcodec/vcr1.c
@@ -2,20 +2,20 @@
  * ATI VCR1 codec
  * Copyright (c) 2003 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
 
 #include "avcodec.h"
 #include "internal.h"
+#include "libavutil/avassert.h"
 #include "libavutil/internal.h"
 
 typedef struct VCR1Context {
@@ -37,8 +38,8 @@ static av_cold int vcr1_decode_init(AVCodecContext *avctx)
 {
     avctx->pix_fmt = AV_PIX_FMT_YUV410P;
 
-    if (avctx->width & 7) {
-        av_log(avctx, AV_LOG_ERROR, "Width %d is not divisble by 8.\n", avctx->width);
+    if (avctx->width % 8 || avctx->height%4) {
+        avpriv_request_sample(avctx, "odd dimensions (%d x %d) support", avctx->width, avctx->height);
         return AVERROR_INVALIDDATA;
     }
 
@@ -48,27 +49,25 @@ static av_cold int vcr1_decode_init(AVCodecContext *avctx)
 static int vcr1_decode_frame(AVCodecContext *avctx, void *data,
                              int *got_frame, AVPacket *avpkt)
 {
-    const uint8_t *buf        = avpkt->data;
-    int buf_size              = avpkt->size;
     VCR1Context *const a      = avctx->priv_data;
     AVFrame *const p          = data;
-    const uint8_t *bytestream = buf;
+    const uint8_t *bytestream = avpkt->data;
+    const uint8_t *bytestream_end = bytestream + avpkt->size;
     int i, x, y, ret;
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return ret;
+    if(avpkt->size < 32 + avctx->height + avctx->width*avctx->height*5/8){
+        av_log(avctx, AV_LOG_ERROR, "Insufficient input data. %d < %d\n", avpkt->size ,  32 + avctx->height + avctx->width*avctx->height*5/8);
+        return AVERROR(EINVAL);
     }
+
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
+        return ret;
     p->pict_type = AV_PICTURE_TYPE_I;
     p->key_frame = 1;
 
-    if (buf_size < 32)
-        goto packet_small;
-
     for (i = 0; i < 16; i++) {
         a->delta[i] = *bytestream++;
         bytestream++;
-        buf_size--;
     }
 
     for (y = 0; y < avctx->height; y++) {
@@ -79,12 +78,10 @@ static int vcr1_decode_frame(AVCodecContext *avctx, void *data,
             uint8_t *cb = &p->data[1][(y >> 2) * p->linesize[1]];
             uint8_t *cr = &p->data[2][(y >> 2) * p->linesize[2]];
 
-            if (buf_size < 4 + avctx->width)
-                goto packet_small;
+            av_assert0 (bytestream_end - bytestream >= 4 + avctx->width);
 
             for (i = 0; i < 4; i++)
                 a->offset[i] = *bytestream++;
-            buf_size -= 4;
 
             offset = a->offset[0] - a->delta[bytestream[2] & 0xF];
             for (x = 0; x < avctx->width; x += 4) {
@@ -98,11 +95,9 @@ static int vcr1_decode_frame(AVCodecContext *avctx, void *data,
                 *cr++       = bytestream[1];
 
                 bytestream += 4;
-                buf_size   -= 4;
             }
         } else {
-            if (buf_size < avctx->width / 2)
-                goto packet_small;
+            av_assert0 (bytestream_end - bytestream >= avctx->width / 2);
 
             offset = a->offset[y & 3] - a->delta[bytestream[2] & 0xF];
 
@@ -117,17 +112,13 @@ static int vcr1_decode_frame(AVCodecContext *avctx, void *data,
                 luma[7]     = offset += a->delta[bytestream[1] >>  4];
                 luma       += 8;
                 bytestream += 4;
-                buf_size   -= 4;
             }
         }
     }
 
     *got_frame = 1;
 
-    return buf_size;
-packet_small:
-    av_log(avctx, AV_LOG_ERROR, "Input packet too small.\n");
-    return AVERROR_INVALIDDATA;
+    return bytestream - avpkt->data;
 }
 
 AVCodec ff_vcr1_decoder = {
diff --git a/libavcodec/vda.c b/libavcodec/vda.c
index eb4b998..4670140 100644
--- a/libavcodec/vda.c
+++ b/libavcodec/vda.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -21,7 +21,7 @@
 #include "libavutil/mem.h"
 
 #include "vda.h"
-#include "vda_internal.h"
+#include "vda_vt_internal.h"
 
 #if CONFIG_H264_VDA_HWACCEL
 AVVDAContext *av_vda_alloc_context(void)
diff --git a/libavcodec/vda.h b/libavcodec/vda.h
index 5e7228c..bde14e3 100644
--- a/libavcodec/vda.h
+++ b/libavcodec/vda.h
@@ -3,20 +3,20 @@
  *
  * copyright (c) 2011 Sebastien Zwickert
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,7 +30,6 @@
  */
 
 #include "libavcodec/avcodec.h"
-#include "libavcodec/version.h"
 
 #include <stdint.h>
 
@@ -42,6 +41,14 @@
 #include <VideoDecodeAcceleration/VDADecoder.h>
 #undef Picture
 
+#include "libavcodec/version.h"
+
+// extra decode-info flag bits not defined in VDADecoder.h
+enum {
+    kVDADecodeInfo_Asynchronous = 1UL << 0, // bit 0
+    kVDADecodeInfo_FrameDropped = 1UL << 1  // bit 1
+};
+
 /**
  * @defgroup lavc_codec_hwaccel_vda VDA
  * @ingroup lavc_codec_hwaccel
@@ -51,7 +58,7 @@
 
 /**
  * This structure is used to provide the necessary configurations and data
- * to the VDA Libav HWAccel implementation.
+ * to the VDA FFmpeg HWAccel implementation.
  *
  * The application must make it available as AVCodecContext.hwaccel_context.
  */
@@ -126,6 +133,17 @@ struct vda_context {
      * unused
      */
     int                 priv_allocated_size;
+
+    /**
+     * Use av_buffer to manage buffer.
+     * When the flag is set, the CVPixelBuffers returned by the decoder will
+     * be released automatically, so you have to retain them if necessary.
+     * Not setting this flag may cause memory leak.
+     *
+     * encoding: unused
+     * decoding: Set by user.
+     */
+    int                 use_ref_buffer;
 };
 
 /** Create the video decoder. */
diff --git a/libavcodec/vda_h264.c b/libavcodec/vda_h264.c
index 8ae9792..8c526c0 100644
--- a/libavcodec/vda_h264.c
+++ b/libavcodec/vda_h264.c
@@ -1,49 +1,40 @@
 /*
- * VDA H.264 hardware acceleration
+ * VDA H264 HW acceleration.
  *
  * copyright (c) 2011 Sebastien Zwickert
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include <CoreFoundation/CFDictionary.h>
 #include <CoreFoundation/CFNumber.h>
 #include <CoreFoundation/CFData.h>
-#include <CoreFoundation/CFString.h>
 
+#include "vda.h"
 #include "libavutil/avutil.h"
 #include "h264.h"
-#include "internal.h"
-#include "vda.h"
-#include "vda_internal.h"
-
-typedef struct VDAContext {
-    // The current bitstream buffer.
-    uint8_t             *bitstream;
-
-    // The current size of the bitstream.
-    int                  bitstream_size;
-
-    // The reference size used for fast reallocation.
-    int                  allocated_size;
 
-    CVImageBufferRef frame;
-} VDAContext;
+struct vda_buffer {
+    CVPixelBufferRef cv_buffer;
+};
+#include "internal.h"
+#include "vda_vt_internal.h"
 
-/* Decoder callback that adds the VDA frame to the queue in display order. */
+/* Decoder callback that adds the vda frame to the queue in display order. */
 static void vda_decoder_callback(void *vda_hw_ctx,
                                  CFDictionaryRef user_info,
                                  OSStatus status,
@@ -52,6 +43,9 @@ static void vda_decoder_callback(void *vda_hw_ctx,
 {
     struct vda_context *vda_ctx = vda_hw_ctx;
 
+    if (infoFlags & kVDADecodeInfo_FrameDropped)
+        vda_ctx->cv_buffer = NULL;
+
     if (!image_buffer)
         return;
 
@@ -61,7 +55,7 @@ static void vda_decoder_callback(void *vda_hw_ctx,
     vda_ctx->cv_buffer = CVPixelBufferRetain(image_buffer);
 }
 
-static int vda_sync_decode(VDAContext *ctx, struct vda_context *vda_ctx)
+static int vda_sync_decode(VTContext *ctx, struct vda_context *vda_ctx)
 {
     OSStatus status;
     CFDataRef coded_frame;
@@ -86,8 +80,8 @@ static int vda_old_h264_start_frame(AVCodecContext *avctx,
                                 av_unused const uint8_t *buffer,
                                 av_unused uint32_t size)
 {
-    VDAContext *vda = avctx->internal->hwaccel_priv_data;
-    struct vda_context *vda_ctx         = avctx->hwaccel_context;
+    VTContext *vda = avctx->internal->hwaccel_priv_data;
+    struct vda_context *vda_ctx = avctx->hwaccel_context;
 
     if (!vda_ctx->decoder)
         return -1;
@@ -101,8 +95,8 @@ static int vda_old_h264_decode_slice(AVCodecContext *avctx,
                                  const uint8_t *buffer,
                                  uint32_t size)
 {
-    VDAContext *vda                     = avctx->internal->hwaccel_priv_data;
-    struct vda_context *vda_ctx         = avctx->hwaccel_context;
+    VTContext *vda              = avctx->internal->hwaccel_priv_data;
+    struct vda_context *vda_ctx = avctx->hwaccel_context;
     void *tmp;
 
     if (!vda_ctx->decoder)
@@ -124,12 +118,21 @@ static int vda_old_h264_decode_slice(AVCodecContext *avctx,
     return 0;
 }
 
+static void vda_h264_release_buffer(void *opaque, uint8_t *data) /* free callback handed to av_buffer_create() */
+{
+    struct vda_buffer *context = opaque; /* the vda_buffer allocated when the AVBufferRef was created */
+    CVPixelBufferRelease(context->cv_buffer); /* drop the pixel buffer reference stored in the wrapper */
+    av_free(context);
+}
+
 static int vda_old_h264_end_frame(AVCodecContext *avctx)
 {
     H264Context *h                      = avctx->priv_data;
-    VDAContext *vda                     = avctx->internal->hwaccel_priv_data;
+    VTContext *vda                      = avctx->internal->hwaccel_priv_data;
     struct vda_context *vda_ctx         = avctx->hwaccel_context;
     AVFrame *frame                      = h->cur_pic_ptr->f;
+    struct vda_buffer *context;
+    AVBufferRef *buffer;
     int status;
 
     if (!vda_ctx->decoder || !vda->bitstream)
@@ -141,6 +144,20 @@ static int vda_old_h264_end_frame(AVCodecContext *avctx)
     if (status)
         av_log(avctx, AV_LOG_ERROR, "Failed to decode frame (%d)\n", status);
 
+    if (!vda_ctx->use_ref_buffer || status)
+        return status;
+
+    context = av_mallocz(sizeof(*context));
+    buffer = av_buffer_create(NULL, 0, vda_h264_release_buffer, context, 0);
+    if (!context || !buffer) {
+        CVPixelBufferRelease(vda_ctx->cv_buffer);
+        av_free(context);
+        return -1;
+    }
+
+    context->cv_buffer = vda_ctx->cv_buffer;
+    frame->buf[3] = buffer;
+
     return status;
 }
 
@@ -148,7 +165,7 @@ int ff_vda_create_decoder(struct vda_context *vda_ctx,
                           uint8_t *extradata,
                           int extradata_size)
 {
-    OSStatus status = kVDADecoderNoErr;
+    OSStatus status;
     CFNumberRef height;
     CFNumberRef width;
     CFNumberRef format;
@@ -158,6 +175,9 @@ int ff_vda_create_decoder(struct vda_context *vda_ctx,
     CFMutableDictionaryRef io_surface_properties;
     CFNumberRef cv_pix_fmt;
 
+    vda_ctx->priv_bitstream = NULL;
+    vda_ctx->priv_allocated_size = 0;
+
     /* Each VCL NAL in the bitstream sent to the decoder
      * is preceded by a 4 bytes length header.
      * Change the avcC atom header if needed, to signal headers of 4 bytes. */
@@ -200,9 +220,9 @@ int ff_vda_create_decoder(struct vda_context *vda_ctx,
                                                       0,
                                                       &kCFTypeDictionaryKeyCallBacks,
                                                       &kCFTypeDictionaryValueCallBacks);
-    cv_pix_fmt      = CFNumberCreate(kCFAllocatorDefault,
-                                     kCFNumberSInt32Type,
-                                     &vda_ctx->cv_pix_fmt_type);
+    cv_pix_fmt  = CFNumberCreate(kCFAllocatorDefault,
+                                 kCFNumberSInt32Type,
+                                 &vda_ctx->cv_pix_fmt_type);
     CFDictionarySetValue(buffer_attributes,
                          kCVPixelBufferPixelFormatTypeKey,
                          cv_pix_fmt);
@@ -238,15 +258,6 @@ int ff_vda_destroy_decoder(struct vda_context *vda_ctx)
     return status;
 }
 
-static int vda_h264_uninit(AVCodecContext *avctx)
-{
-    VDAContext *vda = avctx->internal->hwaccel_priv_data;
-    av_freep(&vda->bitstream);
-    if (vda->frame)
-        CVPixelBufferRelease(vda->frame);
-    return 0;
-}
-
 AVHWAccel ff_h264_vda_old_hwaccel = {
     .name           = "h264_vda",
     .type           = AVMEDIA_TYPE_VIDEO,
@@ -255,8 +266,8 @@ AVHWAccel ff_h264_vda_old_hwaccel = {
     .start_frame    = vda_old_h264_start_frame,
     .decode_slice   = vda_old_h264_decode_slice,
     .end_frame      = vda_old_h264_end_frame,
-    .uninit         = vda_h264_uninit,
-    .priv_data_size = sizeof(VDAContext),
+    .uninit         = ff_videotoolbox_uninit,
+    .priv_data_size = sizeof(VTContext),
 };
 
 void ff_vda_output_callback(void *opaque,
@@ -266,7 +277,7 @@ void ff_vda_output_callback(void *opaque,
                             CVImageBufferRef image_buffer)
 {
     AVCodecContext *ctx = opaque;
-    VDAContext *vda = ctx->internal->hwaccel_priv_data;
+    VTContext *vda = ctx->internal->hwaccel_priv_data;
 
 
     if (vda->frame) {
@@ -280,50 +291,10 @@ void ff_vda_output_callback(void *opaque,
     vda->frame = CVPixelBufferRetain(image_buffer);
 }
 
-static int vda_h264_start_frame(AVCodecContext *avctx,
-                                const uint8_t *buffer,
-                                uint32_t size)
-{
-    VDAContext *vda = avctx->internal->hwaccel_priv_data;
-
-    vda->bitstream_size = 0;
-
-    return 0;
-}
-
-static int vda_h264_decode_slice(AVCodecContext *avctx,
-                                 const uint8_t *buffer,
-                                 uint32_t size)
-{
-    VDAContext *vda       = avctx->internal->hwaccel_priv_data;
-    void *tmp;
-
-    tmp = av_fast_realloc(vda->bitstream,
-                          &vda->allocated_size,
-                          vda->bitstream_size + size + 4);
-    if (!tmp)
-        return AVERROR(ENOMEM);
-
-    vda->bitstream = tmp;
-
-    AV_WB32(vda->bitstream + vda->bitstream_size, size);
-    memcpy(vda->bitstream + vda->bitstream_size + 4, buffer, size);
-
-    vda->bitstream_size += size + 4;
-
-    return 0;
-}
-
-static void release_buffer(void *opaque, uint8_t *data)
-{
-    CVImageBufferRef frame = (CVImageBufferRef)data;
-    CVPixelBufferRelease(frame);
-}
-
 static int vda_h264_end_frame(AVCodecContext *avctx)
 {
     H264Context *h        = avctx->priv_data;
-    VDAContext *vda       = avctx->internal->hwaccel_priv_data;
+    VTContext *vda        = avctx->internal->hwaccel_priv_data;
     AVVDAContext *vda_ctx = avctx->hwaccel_context;
     AVFrame *frame        = h->cur_pic_ptr->f;
     uint32_t flush_flags  = 1 << 0; ///< kVDADecoderFlush_emitFrames
@@ -353,19 +324,7 @@ static int vda_h264_end_frame(AVCodecContext *avctx)
         return AVERROR_UNKNOWN;
     }
 
-    av_buffer_unref(&frame->buf[0]);
-
-    frame->buf[0] = av_buffer_create((uint8_t*)vda->frame,
-                                     sizeof(vda->frame),
-                                     release_buffer, NULL,
-                                     AV_BUFFER_FLAG_READONLY);
-    if (!frame->buf[0])
-        return AVERROR(ENOMEM);
-
-    frame->data[3] = (uint8_t*)vda->frame;
-    vda->frame = NULL;
-
-    return 0;
+    return ff_videotoolbox_buffer_create(vda, frame);
 }
 
 int ff_vda_default_init(AVCodecContext *avctx)
@@ -384,26 +343,7 @@ int ff_vda_default_init(AVCodecContext *avctx)
 
     // kCVPixelFormatType_420YpCbCr8Planar;
 
-    /* Each VCL NAL in the bitstream sent to the decoder
-     * is preceded by a 4 bytes length header.
-     * Change the avcC atom header if needed, to signal headers of 4 bytes. */
-    if (avctx->extradata_size >= 4 && (avctx->extradata[4] & 0x03) != 0x03) {
-        uint8_t *rw_extradata;
-
-        if (!(rw_extradata = av_malloc(avctx->extradata_size)))
-            return AVERROR(ENOMEM);
-
-        memcpy(rw_extradata, avctx->extradata, avctx->extradata_size);
-
-        rw_extradata[4] |= 0x03;
-
-        avc_data = CFDataCreate(kCFAllocatorDefault, rw_extradata, avctx->extradata_size);
-
-        av_freep(&rw_extradata);
-    } else {
-        avc_data = CFDataCreate(kCFAllocatorDefault,
-                                avctx->extradata, avctx->extradata_size);
-    }
+    avc_data = ff_videotoolbox_avcc_extradata_create(avctx);
 
     config_info = CFDictionaryCreateMutable(kCFAllocatorDefault,
                                             4,
@@ -471,27 +411,15 @@ int ff_vda_default_init(AVCodecContext *avctx)
     }
 }
 
-static int vda_h264_alloc_frame(AVCodecContext *avctx, AVFrame *frame)
-{
-    frame->width  = avctx->width;
-    frame->height = avctx->height;
-    frame->format = avctx->pix_fmt;
-    frame->buf[0] = av_buffer_alloc(1);
-
-    if (!frame->buf[0])
-        return AVERROR(ENOMEM);
-    return 0;
-}
-
 AVHWAccel ff_h264_vda_hwaccel = {
     .name           = "h264_vda",
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = AV_CODEC_ID_H264,
     .pix_fmt        = AV_PIX_FMT_VDA,
-    .alloc_frame    = vda_h264_alloc_frame,
-    .start_frame    = vda_h264_start_frame,
-    .decode_slice   = vda_h264_decode_slice,
+    .alloc_frame    = ff_videotoolbox_alloc_frame,
+    .start_frame    = ff_videotoolbox_h264_start_frame,
+    .decode_slice   = ff_videotoolbox_h264_decode_slice,
     .end_frame      = vda_h264_end_frame,
-    .uninit         = vda_h264_uninit,
-    .priv_data_size = sizeof(VDAContext),
+    .uninit         = ff_videotoolbox_uninit,
+    .priv_data_size = sizeof(VTContext),
 };
diff --git a/libavcodec/vda_h264_dec.c b/libavcodec/vda_h264_dec.c
new file mode 100644
index 0000000..a196eb7
--- /dev/null
+++ b/libavcodec/vda_h264_dec.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2012, Xidorn Quan
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * H.264 decoder via VDA
+ * @author Xidorn Quan <quanxunzhen@gmail.com>
+ */
+
+#include <string.h>
+#include <CoreFoundation/CoreFoundation.h>
+
+#include "vda.h"
+#include "h264.h"
+#include "avcodec.h"
+
+#ifndef kCFCoreFoundationVersionNumber10_7
+#define kCFCoreFoundationVersionNumber10_7      635.00
+#endif
+
+extern AVCodec ff_h264_decoder, ff_h264_vda_decoder;
+
+static const enum AVPixelFormat vda_pixfmts_prior_10_7[] = { /* installed by vdadec_init() when CoreFoundation is older than 10.7 */
+    AV_PIX_FMT_UYVY422,
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_NONE
+};
+
+static const enum AVPixelFormat vda_pixfmts[] = { /* installed by vdadec_init() on CoreFoundation 10.7 or newer */
+    AV_PIX_FMT_UYVY422,
+    AV_PIX_FMT_YUYV422,
+    AV_PIX_FMT_NV12,
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_NONE
+};
+
+typedef struct { /* Private data (avctx->priv_data) of the h264_vda wrapper decoder. */
+    H264Context h264ctx; /* NOTE(review): presumably must stay first, since ff_h264_decoder runs on the same avctx - confirm */
+    int h264_initialized; /* set to 1 by vdadec_init() once ff_h264_decoder.init() has succeeded */
+    struct vda_context vda_ctx; /* installed as avctx->hwaccel_context by set_context() */
+    enum AVPixelFormat pix_fmt; /* format obtained from the user's get_format callback in vdadec_init() */
+
+    /* Backups of the fields the user set on avctx; set_context() saves them here
+     * and restore_context() puts them back, so this wrapper controls them in between. */
+    void *hwaccel_context;
+    enum AVPixelFormat (*get_format)(struct AVCodecContext *s, const enum AVPixelFormat * fmt);
+    int (*get_buffer2)(struct AVCodecContext *s, AVFrame *frame, int flags);
+} VDADecoderContext;
+
+static enum AVPixelFormat get_format(struct AVCodecContext *avctx,
+        const enum AVPixelFormat *fmt)
+{   /* get_format override installed by set_context(): ignores the offered list. */
+    return AV_PIX_FMT_VDA_VLD;
+}
+
+typedef struct { /* Opaque data attached to every AVBufferRef created by get_buffer2(). */
+    CVPixelBufferRef cv_buffer; /* zeroed by av_mallocz(); vdadec_decode() stores the retained, locked buffer here */
+} VDABufferContext;
+
+static void release_buffer(void *opaque, uint8_t *data)
+{   /* AVBuffer free callback for the refs created in get_buffer2(). */
+    CVPixelBufferRef pixbuf = ((VDABufferContext *)opaque)->cv_buffer;
+    CVPixelBufferUnlockBaseAddress(pixbuf, 0); /* pairs with the lock taken in vdadec_decode() */
+    CVPixelBufferRelease(pixbuf);              /* pairs with the retain taken in vdadec_decode() */
+    av_free(opaque);
+}
+
+static int get_buffer2(AVCodecContext *avctx, AVFrame *pic, int flag)
+{   /* get_buffer2 override installed by set_context(); returns 0 or AVERROR(ENOMEM). */
+    VDABufferContext *context = av_mallocz(sizeof(*context));
+    AVBufferRef *buffer = !context ? NULL : /* never build a ref without its opaque */
+        av_buffer_create(NULL, 0, release_buffer, context, 0);
+    if (!buffer) {
+        av_free(context); /* no-op if the context allocation was what failed */
+        return AVERROR(ENOMEM);
+    }
+    pic->buf[0] = buffer;     /* release_buffer() frees context when the ref is released */
+    pic->data[0] = (void *)1; /* dummy non-NULL value; vdadec_decode() stores the real plane pointers */
+    return 0;
+}
+
+static inline void set_context(AVCodecContext *avctx)
+{   /* Save the caller's three hooks in the private context, then install this wrapper's own. */
+    VDADecoderContext *priv = avctx->priv_data;
+    priv->hwaccel_context  = avctx->hwaccel_context;
+    priv->get_format       = avctx->get_format;
+    priv->get_buffer2      = avctx->get_buffer2;
+    avctx->hwaccel_context = &priv->vda_ctx;
+    avctx->get_format      = get_format;
+    avctx->get_buffer2     = get_buffer2;
+}
+
+static inline void restore_context(AVCodecContext *avctx)
+{   /* Undo set_context(): put back the three fields it saved. */
+    const VDADecoderContext *saved = avctx->priv_data;
+    avctx->get_buffer2     = saved->get_buffer2;
+    avctx->get_format      = saved->get_format;
+    avctx->hwaccel_context = saved->hwaccel_context;
+}
+
+static int vdadec_decode(AVCodecContext *avctx,
+        void *data, int *got_frame, AVPacket *avpkt)
+{   /* Decode through the wrapped H.264 decoder, then expose the pixel buffer's planes in the frame. */
+    VDADecoderContext *priv = avctx->priv_data;
+    AVFrame *frame = data;
+    int result;
+
+    set_context(avctx);
+    result = ff_h264_decoder.decode(avctx, data, got_frame, avpkt);
+    restore_context(avctx);
+    if (*got_frame) {
+        VDABufferContext *buf_ctx = av_buffer_get_opaque(frame->buf[0]);
+        CVPixelBufferRef pixbuf = (CVPixelBufferRef)frame->data[3];
+        int plane, nb_planes;
+        /* Retain and lock here; release_buffer() undoes both when the frame buffer is freed. */
+        CVPixelBufferRetain(pixbuf);
+        CVPixelBufferLockBaseAddress(pixbuf, 0);
+        buf_ctx->cv_buffer = pixbuf;
+        frame->format = priv->pix_fmt;
+        if (!CVPixelBufferIsPlanar(pixbuf)) {
+            frame->data[0] = CVPixelBufferGetBaseAddress(pixbuf);
+            frame->linesize[0] = CVPixelBufferGetBytesPerRow(pixbuf);
+        } else {
+            nb_planes = CVPixelBufferGetPlaneCount(pixbuf);
+            av_assert0(nb_planes < 4);
+            for (plane = 0; plane < nb_planes; plane++) {
+                frame->data[plane] = CVPixelBufferGetBaseAddressOfPlane(pixbuf, plane);
+                frame->linesize[plane] = CVPixelBufferGetBytesPerRowOfPlane(pixbuf, plane);
+            }
+        }
+    }
+    avctx->pix_fmt = priv->pix_fmt;
+
+    return result;
+}
+
+static av_cold int vdadec_close(AVCodecContext *avctx)
+{   /* Codec close callback, also used as the failure path of vdadec_init(); always returns 0. */
+    VDADecoderContext *priv = avctx->priv_data;
+    /* destroy the VDA decoder first */
+    ff_vda_destroy_decoder(&priv->vda_ctx);
+    if (!priv->h264_initialized)
+        return 0; /* the wrapped H.264 decoder was never opened */
+    /* close the wrapped decoder with this wrapper's hooks installed */
+    set_context(avctx);
+    ff_h264_decoder.close(avctx);
+    restore_context(avctx);
+    return 0;
+}
+
+/** Codec init: set up the VDA decoder, then the wrapped H.264 decoder. Returns 0, or -1 after vdadec_close(). */
+static av_cold int vdadec_init(AVCodecContext *avctx)
+{
+    VDADecoderContext *ctx = avctx->priv_data;
+    struct vda_context *vda_ctx = &ctx->vda_ctx;
+    OSStatus status;
+    int ret, i;
+
+    ctx->h264_initialized = 0;
+    /* init pix_fmts of codec (done once; the AVCodec is shared by all instances) */
+    if (!ff_h264_vda_decoder.pix_fmts) {
+        if (kCFCoreFoundationVersionNumber < kCFCoreFoundationVersionNumber10_7)
+            ff_h264_vda_decoder.pix_fmts = vda_pixfmts_prior_10_7;
+        else
+            ff_h264_vda_decoder.pix_fmts = vda_pixfmts;
+    }
+
+    /* init vda */
+    memset(vda_ctx, 0, sizeof(struct vda_context));
+    vda_ctx->width = avctx->width;
+    vda_ctx->height = avctx->height;
+    vda_ctx->format = 'avc1';
+    vda_ctx->use_sync_decoding = 1;
+    vda_ctx->use_ref_buffer = 1;
+    ctx->pix_fmt = avctx->get_format(avctx, avctx->codec->pix_fmts);
+    switch (ctx->pix_fmt) {
+    case AV_PIX_FMT_UYVY422:
+        vda_ctx->cv_pix_fmt_type = '2vuy';
+        break;
+    case AV_PIX_FMT_YUYV422:
+        vda_ctx->cv_pix_fmt_type = 'yuvs';
+        break;
+    case AV_PIX_FMT_NV12:
+        vda_ctx->cv_pix_fmt_type = '420v';
+        break;
+    case AV_PIX_FMT_YUV420P:
+        vda_ctx->cv_pix_fmt_type = 'y420';
+        break;
+    default: /* report the format that was actually rejected, not avctx->pix_fmt */
+        av_log(avctx, AV_LOG_ERROR, "Unsupported pixel format: %d\n", ctx->pix_fmt);
+        goto failed;
+    }
+    status = ff_vda_create_decoder(vda_ctx, avctx->extradata, avctx->extradata_size);
+    if (status != kVDADecoderNoErr) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to init VDA decoder: %d.\n", status);
+        goto failed;
+    }
+
+    /* init H.264 decoder */
+    set_context(avctx);
+    ret = ff_h264_decoder.init(avctx);
+    restore_context(avctx);
+    if (ret < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to open H.264 decoder.\n");
+        goto failed;
+    }
+    ctx->h264_initialized = 1;
+
+    for (i = 0; i < MAX_SPS_COUNT; i++) {
+        /* A slot with no SPS is expected to be NULL, so test the ref before reading ->data. */
+        const SPS *sps = ctx->h264ctx.ps.sps_list[i] ?
+            (const SPS*)ctx->h264ctx.ps.sps_list[i]->data : NULL;
+        if (sps && (sps->bit_depth_luma != 8 ||
+                sps->chroma_format_idc == 2 ||
+                sps->chroma_format_idc == 3)) {
+            av_log(avctx, AV_LOG_ERROR, "Format is not supported.\n");
+            goto failed;
+        }
+    }
+
+    return 0;
+
+failed:
+    vdadec_close(avctx);
+    return -1;
+}
+
+static void vdadec_flush(AVCodecContext *avctx)
+{   /* Flush callback: forwards to the wrapped H.264 decoder's flush. */
+    set_context(avctx);     /* install this wrapper's hooks for the duration of the call */
+    ff_h264_decoder.flush(avctx);
+    restore_context(avctx); /* hand the user's hooks back */
+}
+
+AVCodec ff_h264_vda_decoder = { /* not const and no .pix_fmts here: vdadec_init() fills .pix_fmts in when it is still NULL */
+    .name           = "h264_vda",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_H264,
+    .priv_data_size = sizeof(VDADecoderContext),
+    .init           = vdadec_init,
+    .close          = vdadec_close,
+    .decode         = vdadec_decode,
+    .capabilities   = AV_CODEC_CAP_DELAY,
+    .flush          = vdadec_flush,
+    .long_name      = NULL_IF_CONFIG_SMALL("H.264 (VDA acceleration)"),
+};
diff --git a/libavcodec/vda_internal.h b/libavcodec/vda_internal.h
deleted file mode 100644
index 9d0ed80..0000000
--- a/libavcodec/vda_internal.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_VDA_INTERNAL_H
-#define AVCODEC_VDA_INTERNAL_H
-
-#include "vda.h"
-
-void ff_vda_output_callback(void *vda_hw_ctx,
-                            CFDictionaryRef user_info,
-                            OSStatus status,
-                            uint32_t infoFlags,
-                            CVImageBufferRef image_buffer);
-
-int ff_vda_default_init(AVCodecContext *avctx);
-void ff_vda_default_free(AVCodecContext *avctx);
-
-#endif /* AVCODEC_VDA_INTERNAL_H */
diff --git a/libavcodec/vda_vt_internal.h b/libavcodec/vda_vt_internal.h
new file mode 100644
index 0000000..9ff63cc
--- /dev/null
+++ b/libavcodec/vda_vt_internal.h
@@ -0,0 +1,55 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_VDA_VT_INTERNAL_H
+#define AVCODEC_VDA_VT_INTERNAL_H
+
+void ff_vda_output_callback(void *vda_hw_ctx,
+                            CFDictionaryRef user_info,
+                            OSStatus status,
+                            uint32_t infoFlags,
+                            CVImageBufferRef image_buffer);
+
+int ff_vda_default_init(AVCodecContext *avctx);
+void ff_vda_default_free(AVCodecContext *avctx);
+
+typedef struct VTContext {
+    // Buffer holding the bitstream accumulated for the current frame.
+    uint8_t                     *bitstream;
+
+    // Number of bytes of bitstream currently in use.
+    int                         bitstream_size;
+
+    // Allocated size of bitstream, kept for fast reallocation.
+    int                         allocated_size;
+
+    // CoreVideo image buffer; ff_vda_output_callback() stores a retained reference here.
+    CVImageBufferRef            frame;
+} VTContext;
+
+int ff_videotoolbox_alloc_frame(AVCodecContext *avctx, AVFrame *frame);
+int ff_videotoolbox_uninit(AVCodecContext *avctx);
+int ff_videotoolbox_buffer_create(VTContext *vtctx, AVFrame *frame);
+int ff_videotoolbox_h264_start_frame(AVCodecContext *avctx,
+                                     const uint8_t *buffer,
+                                     uint32_t size);
+int ff_videotoolbox_h264_decode_slice(AVCodecContext *avctx,
+                                      const uint8_t *buffer,
+                                      uint32_t size);
+CFDataRef ff_videotoolbox_avcc_extradata_create(AVCodecContext *avctx);
+#endif /* AVCODEC_VDA_VT_INTERNAL_H */
diff --git a/libavcodec/vdpau.c b/libavcodec/vdpau.c
index b778439..1d4c1ce 100644
--- a/libavcodec/vdpau.c
+++ b/libavcodec/vdpau.c
@@ -4,20 +4,20 @@
  *
  * Copyright (c) 2008 NVIDIA
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,8 +28,20 @@
 #include "h264.h"
 #include "vc1.h"
 #include "vdpau.h"
+#include "vdpau_compat.h"
 #include "vdpau_internal.h"
 
+// XXX: at the time of adding this ifdefery, av_assert* wasn't used outside of it.
+// When dropping it, make sure no other av_assert* calls were added since then.
+#if FF_API_BUFS_VDPAU
+#include "libavutil/avassert.h"
+#endif
+
+#if FF_API_VDPAU
+#undef NDEBUG
+#include <assert.h>
+#endif
+
 /**
  * @addtogroup VDPAU_Decoding
  *
@@ -60,6 +72,13 @@ static int vdpau_error(VdpStatus status)
     }
 }
 
+AVVDPAUContext *av_alloc_vdpaucontext(void)
+{   /* Alternate entry point: forwards to av_vdpau_alloc_context() and returns its result. */
+    return av_vdpau_alloc_context();
+}
+
+MAKE_ACCESSORS(AVVDPAUContext, vdpau_hwaccel, AVVDPAU_Render2, render2)
+
 int av_vdpau_get_surface_parameters(AVCodecContext *avctx,
                                     VdpChromaType *type,
                                     uint32_t *width, uint32_t *height)
@@ -118,7 +137,12 @@ int ff_vdpau_common_init(AVCodecContext *avctx, VdpDecoderProfile profile,
 
     vdctx->width            = UINT32_MAX;
     vdctx->height           = UINT32_MAX;
-    hwctx->reset            = 0;
+
+    if (!hwctx) {
+        vdctx->device  = VDP_INVALID_HANDLE;
+        av_log(avctx, AV_LOG_WARNING, "hwaccel_context has not been setup by the user application, cannot initialize\n");
+        return 0;
+    }
 
     if (hwctx->context.decoder != VDP_INVALID_HANDLE) {
         vdctx->decoder = hwctx->context.decoder;
@@ -126,6 +150,7 @@ int ff_vdpau_common_init(AVCodecContext *avctx, VdpDecoderProfile profile,
         vdctx->device  = VDP_INVALID_HANDLE;
         return 0; /* Decoder created by user */
     }
+    hwctx->reset            = 0;
 
     vdctx->device           = hwctx->device;
     vdctx->get_proc_address = hwctx->get_proc_address;
@@ -259,6 +284,7 @@ int ff_vdpau_common_end_frame(AVCodecContext *avctx, AVFrame *frame,
                               struct vdpau_picture_context *pic_ctx)
 {
     VDPAUContext *vdctx = avctx->internal->hwaccel_priv_data;
+    AVVDPAUContext *hwctx = avctx->hwaccel_context;
     VdpVideoSurface surf = ff_vdpau_get_surface_id(frame);
     VdpStatus status;
     int val;
@@ -267,11 +293,34 @@ int ff_vdpau_common_end_frame(AVCodecContext *avctx, AVFrame *frame,
     if (val < 0)
         return val;
 
+#if FF_API_BUFS_VDPAU
+FF_DISABLE_DEPRECATION_WARNINGS
+    av_assert0(sizeof(hwctx->info) <= sizeof(pic_ctx->info));
+    memcpy(&hwctx->info, &pic_ctx->info, sizeof(hwctx->info));
+    hwctx->bitstream_buffers = pic_ctx->bitstream_buffers;
+    hwctx->bitstream_buffers_used = pic_ctx->bitstream_buffers_used;
+    hwctx->bitstream_buffers_allocated = pic_ctx->bitstream_buffers_allocated;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+
+    if (!hwctx->render && hwctx->render2) {
+        status = hwctx->render2(avctx, frame, (void *)&pic_ctx->info,
+                                pic_ctx->bitstream_buffers_used, pic_ctx->bitstream_buffers);
+    } else
     status = vdctx->render(vdctx->decoder, surf, (void *)&pic_ctx->info,
                            pic_ctx->bitstream_buffers_used,
                            pic_ctx->bitstream_buffers);
 
     av_freep(&pic_ctx->bitstream_buffers);
+
+#if FF_API_BUFS_VDPAU
+FF_DISABLE_DEPRECATION_WARNINGS
+    hwctx->bitstream_buffers = NULL;
+    hwctx->bitstream_buffers_used = 0;
+    hwctx->bitstream_buffers_allocated = 0;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+
     return vdpau_error(status);
 }
 
@@ -313,6 +362,345 @@ int ff_vdpau_add_buffer(struct vdpau_picture_context *pic_ctx,
     return 0;
 }
 
+/* Obsolete non-hwaccel VDPAU support below... */
+
+#if FF_API_VDPAU
+void ff_vdpau_add_data_chunk(uint8_t *data, const uint8_t *buf, int buf_size)
+{   /* Append one bitstream chunk (buf, buf_size) to the render state that data points to. */
+    struct vdpau_render_state *render = (struct vdpau_render_state*)data;
+    void *grown; /* realloc result kept separate so the old array survives a failure */
+    assert(render);
+    grown = av_fast_realloc(render->bitstream_buffers, &render->bitstream_buffers_allocated,
+                            sizeof(*render->bitstream_buffers)*(render->bitstream_buffers_used + 1));
+    if (!grown)
+        return; /* out of memory: drop this chunk instead of writing through NULL */
+    render->bitstream_buffers = grown;
+
+    render->bitstream_buffers[render->bitstream_buffers_used].struct_version  = VDP_BITSTREAM_BUFFER_VERSION;
+    render->bitstream_buffers[render->bitstream_buffers_used].bitstream       = buf;
+    render->bitstream_buffers[render->bitstream_buffers_used].bitstream_bytes = buf_size;
+    render->bitstream_buffers_used++;
+}
+
+#if CONFIG_H264_VDPAU_DECODER
+void ff_vdpau_h264_set_reference_frames(H264Context *h)
+{   /* Fill the current picture's referenceFrames table from h->short_ref and h->long_ref. */
+    struct vdpau_render_state *render, *render_ref;
+    VdpReferenceFrameH264 *rf, *rf2;
+    H264Picture *pic;
+    int i, list, pic_frame_idx;
+
+    render = (struct vdpau_render_state *)h->cur_pic_ptr->f->data[0];
+    assert(render);
+
+    rf = &render->info.h264.referenceFrames[0]; /* rf always points at the next free table entry */
+#define H264_RF_COUNT FF_ARRAY_ELEMS(render->info.h264.referenceFrames)
+
+    for (list = 0; list < 2; ++list) { /* pass 0: short-term list, pass 1: long-term list */
+        H264Picture **lp = list ? h->long_ref : h->short_ref;
+        int ls = list ? 16 : h->short_ref_count;
+
+        for (i = 0; i < ls; ++i) {
+            pic = lp[i];
+            if (!pic || !pic->reference)
+                continue;
+            pic_frame_idx = pic->long_ref ? pic->pic_id : pic->frame_num;
+
+            render_ref = (struct vdpau_render_state *)pic->f->data[0];
+            assert(render_ref);
+
+            /* look for an entry already written for the same surface/long-term flag/frame index */
+            rf2 = &render->info.h264.referenceFrames[0];
+            while (rf2 != rf) {
+                if (
+                    (rf2->surface == render_ref->surface)
+                    && (rf2->is_long_term == pic->long_ref)
+                    && (rf2->frame_idx == pic_frame_idx)
+                )
+                    break;
+                ++rf2;
+            }
+            if (rf2 != rf) { /* found: only merge this picture's field flags into that entry */
+                rf2->top_is_reference    |= (pic->reference & PICT_TOP_FIELD)    ? VDP_TRUE : VDP_FALSE;
+                rf2->bottom_is_reference |= (pic->reference & PICT_BOTTOM_FIELD) ? VDP_TRUE : VDP_FALSE;
+                continue;
+            }
+
+            if (rf >= &render->info.h264.referenceFrames[H264_RF_COUNT])
+                continue; /* table is full: further new pictures are skipped */
+
+            rf->surface             = render_ref->surface;
+            rf->is_long_term        = pic->long_ref;
+            rf->top_is_reference    = (pic->reference & PICT_TOP_FIELD)    ? VDP_TRUE : VDP_FALSE;
+            rf->bottom_is_reference = (pic->reference & PICT_BOTTOM_FIELD) ? VDP_TRUE : VDP_FALSE;
+            rf->field_order_cnt[0]  = pic->field_poc[0];
+            rf->field_order_cnt[1]  = pic->field_poc[1];
+            rf->frame_idx           = pic_frame_idx;
+
+            ++rf;
+        }
+    }
+
+    for (; rf < &render->info.h264.referenceFrames[H264_RF_COUNT]; ++rf) { /* blank every unused entry */
+        rf->surface             = VDP_INVALID_HANDLE;
+        rf->is_long_term        = 0;
+        rf->top_is_reference    = 0;
+        rf->bottom_is_reference = 0;
+        rf->field_order_cnt[0]  = 0;
+        rf->field_order_cnt[1]  = 0;
+        rf->frame_idx           = 0;
+    }
+}
+
+void ff_vdpau_h264_picture_start(H264Context *h)
+{   /* Store the current picture's frame_num and both field order counts in its VDPAU info. */
+    struct vdpau_render_state *state;
+    int field;
+
+    state = (struct vdpau_render_state *)h->cur_pic_ptr->f->data[0];
+    assert(state);
+
+    state->info.h264.frame_num = h->poc.frame_num;
+
+    for (field = 0; field < 2; ++field) {
+        const int poc = h->cur_pic_ptr->field_poc[field];
+
+        /* a field_poc of INT_MAX is passed on as 0 */
+        state->info.h264.field_order_cnt[field] = poc == INT_MAX ? 0 : poc;
+    }
+}
+
+void ff_vdpau_h264_picture_complete(H264Context *h)
+{   /* Fill the current picture's H.264 info from the active SPS/PPS, then draw the whole picture. */
+    struct vdpau_render_state *render;
+
+    render = (struct vdpau_render_state *)h->cur_pic_ptr->f->data[0];
+    assert(render);
+
+    render->info.h264.slice_count = h->current_slice;
+    if (render->info.h264.slice_count < 1)
+        return; /* no slices: leave the rest untouched and draw nothing */
+
+    render->info.h264.is_reference                           = (h->cur_pic_ptr->reference & 3) ? VDP_TRUE : VDP_FALSE;
+    render->info.h264.field_pic_flag                         = h->picture_structure != PICT_FRAME;
+    render->info.h264.bottom_field_flag                      = h->picture_structure == PICT_BOTTOM_FIELD;
+    render->info.h264.num_ref_frames                         = h->ps.sps->ref_frame_count;
+    render->info.h264.mb_adaptive_frame_field_flag           = h->ps.sps->mb_aff && !render->info.h264.field_pic_flag;
+    render->info.h264.constrained_intra_pred_flag            = h->ps.pps->constrained_intra_pred;
+    render->info.h264.weighted_pred_flag                     = h->ps.pps->weighted_pred;
+    render->info.h264.weighted_bipred_idc                    = h->ps.pps->weighted_bipred_idc;
+    render->info.h264.frame_mbs_only_flag                    = h->ps.sps->frame_mbs_only_flag;
+    render->info.h264.transform_8x8_mode_flag                = h->ps.pps->transform_8x8_mode;
+    render->info.h264.chroma_qp_index_offset                 = h->ps.pps->chroma_qp_index_offset[0];
+    render->info.h264.second_chroma_qp_index_offset          = h->ps.pps->chroma_qp_index_offset[1];
+    render->info.h264.pic_init_qp_minus26                    = h->ps.pps->init_qp - 26;
+    render->info.h264.num_ref_idx_l0_active_minus1           = h->ps.pps->ref_count[0] - 1;
+    render->info.h264.num_ref_idx_l1_active_minus1           = h->ps.pps->ref_count[1] - 1;
+    render->info.h264.log2_max_frame_num_minus4              = h->ps.sps->log2_max_frame_num - 4;
+    render->info.h264.pic_order_cnt_type                     = h->ps.sps->poc_type;
+    render->info.h264.log2_max_pic_order_cnt_lsb_minus4      = h->ps.sps->poc_type ? 0 : h->ps.sps->log2_max_poc_lsb - 4;
+    render->info.h264.delta_pic_order_always_zero_flag       = h->ps.sps->delta_pic_order_always_zero_flag;
+    render->info.h264.direct_8x8_inference_flag              = h->ps.sps->direct_8x8_inference_flag;
+    render->info.h264.entropy_coding_mode_flag               = h->ps.pps->cabac;
+    render->info.h264.pic_order_present_flag                 = h->ps.pps->pic_order_present;
+    render->info.h264.deblocking_filter_control_present_flag = h->ps.pps->deblocking_filter_parameters_present;
+    render->info.h264.redundant_pic_cnt_present_flag         = h->ps.pps->redundant_pic_cnt_present;
+    memcpy(render->info.h264.scaling_lists_4x4, h->ps.pps->scaling_matrix4, sizeof(render->info.h264.scaling_lists_4x4));
+    memcpy(render->info.h264.scaling_lists_8x8[0], h->ps.pps->scaling_matrix8[0], sizeof(render->info.h264.scaling_lists_8x8[0]));
+    memcpy(render->info.h264.scaling_lists_8x8[1], h->ps.pps->scaling_matrix8[3], sizeof(render->info.h264.scaling_lists_8x8[0])); /* note: source index is 3, not 1 */
+
+    ff_h264_draw_horiz_band(h, &h->slice_ctx[0], 0, h->avctx->height);
+    render->bitstream_buffers_used = 0; /* chunk list is reset only after the draw call */
+}
+#endif /* CONFIG_H264_VDPAU_DECODER */
+
+#if CONFIG_MPEG_VDPAU_DECODER || CONFIG_MPEG1_VDPAU_DECODER
+void ff_vdpau_mpeg_picture_complete(MpegEncContext *s, const uint8_t *buf,
+                                    int buf_size, int slice_count)
+{   /* Fill the current picture's MPEG-1/2 info, queue buf as a bitstream chunk and draw the picture. */
+    struct vdpau_render_state *render, *last, *next;
+    int i;
+
+    if (!s->current_picture_ptr) return; /* nothing to complete without a current picture */
+
+    render = (struct vdpau_render_state *)s->current_picture_ptr->f->data[0];
+    assert(render);
+
+    /* fill VdpPictureInfoMPEG1Or2 struct */
+    render->info.mpeg.picture_structure          = s->picture_structure;
+    render->info.mpeg.picture_coding_type        = s->pict_type;
+    render->info.mpeg.intra_dc_precision         = s->intra_dc_precision;
+    render->info.mpeg.frame_pred_frame_dct       = s->frame_pred_frame_dct;
+    render->info.mpeg.concealment_motion_vectors = s->concealment_motion_vectors;
+    render->info.mpeg.intra_vlc_format           = s->intra_vlc_format;
+    render->info.mpeg.alternate_scan             = s->alternate_scan;
+    render->info.mpeg.q_scale_type               = s->q_scale_type;
+    render->info.mpeg.top_field_first            = s->top_field_first;
+    render->info.mpeg.full_pel_forward_vector    = s->full_pel[0]; // MPEG-1 only.  Set 0 for MPEG-2
+    render->info.mpeg.full_pel_backward_vector   = s->full_pel[1]; // MPEG-1 only.  Set 0 for MPEG-2
+    render->info.mpeg.f_code[0][0]               = s->mpeg_f_code[0][0]; // For MPEG-1 fill both horiz. & vert.
+    render->info.mpeg.f_code[0][1]               = s->mpeg_f_code[0][1];
+    render->info.mpeg.f_code[1][0]               = s->mpeg_f_code[1][0];
+    render->info.mpeg.f_code[1][1]               = s->mpeg_f_code[1][1];
+    for (i = 0; i < 64; ++i) {
+        render->info.mpeg.intra_quantizer_matrix[i]     = s->intra_matrix[i];
+        render->info.mpeg.non_intra_quantizer_matrix[i] = s->inter_matrix[i];
+    }
+
+    render->info.mpeg.forward_reference          = VDP_INVALID_HANDLE;
+    render->info.mpeg.backward_reference         = VDP_INVALID_HANDLE;
+
+    switch(s->pict_type){ /* other picture types keep both references invalid */
+    case  AV_PICTURE_TYPE_B:
+        next = (struct vdpau_render_state *)s->next_picture.f->data[0];
+        assert(next);
+        render->info.mpeg.backward_reference     = next->surface;
+        // no return here, going to set forward prediction
+    case  AV_PICTURE_TYPE_P: /* reached directly for P pictures and by fallthrough from B */
+        last = (struct vdpau_render_state *)s->last_picture.f->data[0];
+        if (!last) // FIXME: Does this test make sense?
+            last = render; // predict second field from the first
+        render->info.mpeg.forward_reference      = last->surface;
+    }
+
+    ff_vdpau_add_data_chunk(s->current_picture_ptr->f->data[0], buf, buf_size);
+
+    render->info.mpeg.slice_count                = slice_count;
+
+    if (slice_count) /* draw only when at least one slice was given */
+        ff_mpeg_draw_horiz_band(s, 0, s->avctx->height);
+    render->bitstream_buffers_used               = 0; /* chunk list is reset after the optional draw */
+}
+#endif /* CONFIG_MPEG_VDPAU_DECODER || CONFIG_MPEG1_VDPAU_DECODER */
+
+#if CONFIG_VC1_VDPAU_DECODER
+void ff_vdpau_vc1_decode_picture(MpegEncContext *s, const uint8_t *buf,
+                                 int buf_size)
+{
+    VC1Context *v = s->avctx->priv_data;
+    struct vdpau_render_state *render, *last, *next;
+
+    render = (struct vdpau_render_state *)s->current_picture.f->data[0];
+    assert(render); // data[0] of the current frame is used as the render state
+
+    /* fill VdpPictureInfoVC1 struct */
+    render->info.vc1.frame_coding_mode  = v->fcm ? v->fcm + 1 : 0; // nonzero fcm is passed on incremented by one
+    render->info.vc1.postprocflag       = v->postprocflag;
+    render->info.vc1.pulldown           = v->broadcast;
+    render->info.vc1.interlace          = v->interlace;
+    render->info.vc1.tfcntrflag         = v->tfcntrflag;
+    render->info.vc1.finterpflag        = v->finterpflag;
+    render->info.vc1.psf                = v->psf;
+    render->info.vc1.dquant             = v->dquant;
+    render->info.vc1.panscan_flag       = v->panscanflag;
+    render->info.vc1.refdist_flag       = v->refdist_flag;
+    render->info.vc1.quantizer          = v->quantizer_mode;
+    render->info.vc1.extended_mv        = v->extended_mv;
+    render->info.vc1.extended_dmv       = v->extended_dmv;
+    render->info.vc1.overlap            = v->overlap;
+    render->info.vc1.vstransform        = v->vstransform;
+    render->info.vc1.loopfilter         = v->s.loop_filter;
+    render->info.vc1.fastuvmc           = v->fastuvmc;
+    render->info.vc1.range_mapy_flag    = v->range_mapy_flag;
+    render->info.vc1.range_mapy         = v->range_mapy;
+    render->info.vc1.range_mapuv_flag   = v->range_mapuv_flag;
+    render->info.vc1.range_mapuv        = v->range_mapuv;
+    /* Specific to simple/main profile only */
+    render->info.vc1.multires           = v->multires;
+    render->info.vc1.syncmarker         = v->resync_marker;
+    render->info.vc1.rangered           = v->rangered | (v->rangeredfrm << 1);
+    render->info.vc1.maxbframes         = v->s.max_b_frames;
+
+    render->info.vc1.deblockEnable      = v->postprocflag & 1;
+    render->info.vc1.pquant             = v->pq;
+
+    render->info.vc1.forward_reference  = VDP_INVALID_HANDLE; // defaults; the switch below overrides them
+    render->info.vc1.backward_reference = VDP_INVALID_HANDLE;
+
+    if (v->bi_type)
+        render->info.vc1.picture_type = 4;
+    else
+        render->info.vc1.picture_type = s->pict_type - 1 + s->pict_type / 3;
+
+    switch(s->pict_type){
+    case  AV_PICTURE_TYPE_B:
+        next = (struct vdpau_render_state *)s->next_picture.f->data[0];
+        assert(next);
+        render->info.vc1.backward_reference = next->surface;
+        // no break here, going to set forward prediction
+    case  AV_PICTURE_TYPE_P:
+        last = (struct vdpau_render_state *)s->last_picture.f->data[0];
+        if (!last) // FIXME: Does this test make sense?
+            last = render; // predict second field from the first
+        render->info.vc1.forward_reference = last->surface;
+    }
+
+    ff_vdpau_add_data_chunk(s->current_picture_ptr->f->data[0], buf, buf_size);
+
+    render->info.vc1.slice_count          = 1;
+
+    ff_mpeg_draw_horiz_band(s, 0, s->avctx->height);
+    render->bitstream_buffers_used        = 0;
+}
+#endif /* CONFIG_VC1_VDPAU_DECODER */
+
+#if CONFIG_MPEG4_VDPAU_DECODER
+void ff_vdpau_mpeg4_decode_picture(Mpeg4DecContext *ctx, const uint8_t *buf,
+                                   int buf_size)
+{
+    MpegEncContext *s = &ctx->m;
+    struct vdpau_render_state *render, *last, *next;
+    int i;
+
+    if (!s->current_picture_ptr) return; // no current picture, nothing to do
+
+    render = (struct vdpau_render_state *)s->current_picture_ptr->f->data[0];
+    assert(render); // data[0] of the current frame is used as the render state
+
+    /* fill VdpPictureInfoMPEG4Part2 struct */
+    render->info.mpeg4.trd[0]                            = s->pp_time;
+    render->info.mpeg4.trb[0]                            = s->pb_time;
+    render->info.mpeg4.trd[1]                            = s->pp_field_time >> 1;
+    render->info.mpeg4.trb[1]                            = s->pb_field_time >> 1;
+    render->info.mpeg4.vop_time_increment_resolution     = s->avctx->time_base.den;
+    render->info.mpeg4.vop_coding_type                   = 0; // only B pictures change this (to 2) below; NOTE(review): P keeps 0 - confirm intended
+    render->info.mpeg4.vop_fcode_forward                 = s->f_code;
+    render->info.mpeg4.vop_fcode_backward                = s->b_code;
+    render->info.mpeg4.resync_marker_disable             = !ctx->resync_marker;
+    render->info.mpeg4.interlaced                        = !s->progressive_sequence;
+    render->info.mpeg4.quant_type                        = s->mpeg_quant;
+    render->info.mpeg4.quarter_sample                    = s->quarter_sample;
+    render->info.mpeg4.short_video_header                = s->avctx->codec->id == AV_CODEC_ID_H263;
+    render->info.mpeg4.rounding_control                  = s->no_rounding;
+    render->info.mpeg4.alternate_vertical_scan_flag      = s->alternate_scan;
+    render->info.mpeg4.top_field_first                   = s->top_field_first;
+    for (i = 0; i < 64; ++i) {
+        render->info.mpeg4.intra_quantizer_matrix[i]     = s->intra_matrix[i];
+        render->info.mpeg4.non_intra_quantizer_matrix[i] = s->inter_matrix[i];
+    }
+    render->info.mpeg4.forward_reference                 = VDP_INVALID_HANDLE; // defaults; the switch below overrides them
+    render->info.mpeg4.backward_reference                = VDP_INVALID_HANDLE;
+
+    switch (s->pict_type) {
+    case AV_PICTURE_TYPE_B:
+        next = (struct vdpau_render_state *)s->next_picture.f->data[0];
+        assert(next);
+        render->info.mpeg4.backward_reference     = next->surface;
+        render->info.mpeg4.vop_coding_type        = 2;
+        // no break here, going to set forward prediction
+    case AV_PICTURE_TYPE_P:
+        last = (struct vdpau_render_state *)s->last_picture.f->data[0];
+        assert(last);
+        render->info.mpeg4.forward_reference      = last->surface;
+    }
+
+    ff_vdpau_add_data_chunk(s->current_picture_ptr->f->data[0], buf, buf_size);
+
+    ff_mpeg_draw_horiz_band(s, 0, s->avctx->height);
+    render->bitstream_buffers_used = 0;
+}
+#endif /* CONFIG_MPEG4_VDPAU_DECODER */
+#endif /* FF_API_VDPAU */
+
 #if FF_API_VDPAU_PROFILE
 int av_vdpau_get_profile(AVCodecContext *avctx, VdpDecoderProfile *profile)
 {
diff --git a/libavcodec/vdpau.h b/libavcodec/vdpau.h
index 967c728..e85e4d9 100644
--- a/libavcodec/vdpau.h
+++ b/libavcodec/vdpau.h
@@ -4,20 +4,20 @@
  *
  * Copyright (C) 2008 NVIDIA
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -39,7 +39,7 @@
  * - VDPAU decoding
  * - VDPAU presentation
  *
- * The VDPAU decoding module parses all headers using Libav
+ * The VDPAU decoding module parses all headers using FFmpeg
  * parsing mechanisms and uses VDPAU for the actual decoding.
  *
  * As per the current implementation, the actual decoding
@@ -51,7 +51,7 @@
 
 #include <vdpau/vdpau.h>
 #include <vdpau/vdpau_x11.h>
-
+#include "libavutil/avconfig.h"
 #include "libavutil/attributes.h"
 
 #include "avcodec.h"
@@ -66,10 +66,18 @@ union AVVDPAUPictureInfo {
 };
 #endif
 
+struct AVCodecContext;
+struct AVFrame;
+
+typedef int (*AVVDPAU_Render2)(struct AVCodecContext *, struct AVFrame *,
+                               const VdpPictureInfo *, uint32_t,
+                               const VdpBitstreamBuffer *);
+
 /**
  * This structure is used to share data between the libavcodec library and
  * the client video application.
- * The user shall zero-allocate the structure and make it available as
+ * The user shall allocate the structure via the av_alloc_vdpaucontext
+ * function and make it available as
  * AVCodecContext.hwaccel_context. Members can be set by the user once
  * during initialization or through each AVCodecContext.get_buffer()
  * function call. In any case, they must be valid prior to calling
@@ -128,9 +136,20 @@ typedef struct AVVDPAUContext {
     attribute_deprecated
     VdpBitstreamBuffer *bitstream_buffers;
 #endif
+    AVVDPAU_Render2 render2;
 } AVVDPAUContext;
 
 /**
+ * @brief allocation function for AVVDPAUContext
+ *
+ * Allows extending the struct without breaking API/ABI
+ */
+AVVDPAUContext *av_alloc_vdpaucontext(void);
+
+AVVDPAU_Render2 av_vdpau_hwaccel_get_render2(const AVVDPAUContext *);
+void av_vdpau_hwaccel_set_render2(AVVDPAUContext *, AVVDPAU_Render2);
+
+/**
  * Associate a VDPAU device with a codec context for hardware acceleration.
  * This function is meant to be called from the get_format() codec callback,
  * or earlier. It can also be called after avcodec_flush_buffers() to change
@@ -206,11 +225,11 @@ int av_vdpau_get_profile(AVCodecContext *avctx, VdpDecoderProfile *profile);
 #define FF_VDPAU_STATE_USED_FOR_REFERENCE 2
 
 /**
- * @brief This structure is used as a callback between the Libav
+ * @brief This structure is used as a callback between the FFmpeg
  * decoder (vd_) and presentation (vo_) module.
  * This is used for defining a video frame containing surface,
  * picture parameter, bitstream information etc which are passed
- * between the Libav decoder and its clients.
+ * between the FFmpeg decoder and its clients.
  */
 struct vdpau_render_state {
     VdpVideoSurface surface; ///< Used as rendered surface, never changed.
diff --git a/libavcodec/vdpau_compat.h b/libavcodec/vdpau_compat.h
new file mode 100644
index 0000000..6b4b086
--- /dev/null
+++ b/libavcodec/vdpau_compat.h
@@ -0,0 +1,48 @@
+/*
+ * Video Decode and Presentation API for UNIX (VDPAU) is used for
+ * HW decode acceleration for MPEG-1/2, H.264 and VC-1.
+ *
+ * Copyright (C) 2008 NVIDIA
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_VDPAU_COMPAT_H
+#define AVCODEC_VDPAU_COMPAT_H
+
+#include <stdint.h>
+
+#include "h264.h"
+#include "mpeg4video.h"
+
+void ff_vdpau_add_data_chunk(uint8_t *data, const uint8_t *buf,
+                             int buf_size);
+
+void ff_vdpau_mpeg_picture_complete(MpegEncContext *s, const uint8_t *buf,
+                                    int buf_size, int slice_count);
+
+void ff_vdpau_h264_picture_start(H264Context *h);
+void ff_vdpau_h264_set_reference_frames(H264Context *h);
+void ff_vdpau_h264_picture_complete(H264Context *h);
+
+void ff_vdpau_vc1_decode_picture(MpegEncContext *s, const uint8_t *buf,
+                                 int buf_size);
+
+void ff_vdpau_mpeg4_decode_picture(Mpeg4DecContext *s, const uint8_t *buf,
+                                   int buf_size);
+
+#endif /* AVCODEC_VDPAU_COMPAT_H */
diff --git a/libavcodec/vdpau_h264.c b/libavcodec/vdpau_h264.c
index c34e323..5ae3449 100644
--- a/libavcodec/vdpau_h264.c
+++ b/libavcodec/vdpau_h264.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2008 NVIDIA
  * Copyright (c) 2013 Rémi Denis-Courmont
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software Foundation,
+ * License along with FFmpeg; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vdpau_hevc.c b/libavcodec/vdpau_hevc.c
index 1e5fb71..03c61dc 100644
--- a/libavcodec/vdpau_hevc.c
+++ b/libavcodec/vdpau_hevc.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2013 Philip Langdale
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software Foundation,
+ * License along with FFmpeg; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vdpau_internal.h b/libavcodec/vdpau_internal.h
index c7281a6..8a63733 100644
--- a/libavcodec/vdpau_internal.h
+++ b/libavcodec/vdpau_internal.h
@@ -4,33 +4,35 @@
  *
  * Copyright (C) 2008 NVIDIA
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_VDPAU_INTERNAL_H
 #define AVCODEC_VDPAU_INTERNAL_H
 
+#include "config.h"
 #include <stdint.h>
+#if CONFIG_VDPAU
 #include <vdpau/vdpau.h>
+#endif
 
 #include "libavutil/frame.h"
 
 #include "avcodec.h"
-#include "vdpau.h"
 
 /** Extract VdpVideoSurface from an AVFrame */
 static inline uintptr_t ff_vdpau_get_surface_id(AVFrame *pic)
@@ -38,6 +40,8 @@ static inline uintptr_t ff_vdpau_get_surface_id(AVFrame *pic)
     return (uintptr_t)pic->data[3];
 }
 
+struct vdpau_picture_context;
+#if CONFIG_VDPAU
 union VDPAUPictureInfo {
     VdpPictureInfoH264        h264;
     VdpPictureInfoMPEG1Or2    mpeg;
@@ -51,6 +55,8 @@ union VDPAUPictureInfo {
 #endif
 };
 
+#include "vdpau.h"
+
 typedef struct VDPAUHWContext {
     AVVDPAUContext context;
     VdpDevice device;
@@ -108,6 +114,8 @@ struct vdpau_picture_context {
 
 int ff_vdpau_common_init(AVCodecContext *avctx, VdpDecoderProfile profile,
                          int level);
+#endif //CONFIG_VDPAU
+
 int ff_vdpau_common_uninit(AVCodecContext *avctx);
 
 int ff_vdpau_common_start_frame(struct vdpau_picture_context *pic,
diff --git a/libavcodec/vdpau_mpeg12.c b/libavcodec/vdpau_mpeg12.c
index cb6f81a..3ac2cb8 100644
--- a/libavcodec/vdpau_mpeg12.c
+++ b/libavcodec/vdpau_mpeg12.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2008 NVIDIA
  * Copyright (c) 2013 Rémi Denis-Courmont
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software Foundation,
+ * License along with FFmpeg; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vdpau_mpeg4.c b/libavcodec/vdpau_mpeg4.c
index fcad42f..46a00cb 100644
--- a/libavcodec/vdpau_mpeg4.c
+++ b/libavcodec/vdpau_mpeg4.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2008 NVIDIA
  * Copyright (c) 2013 Rémi Denis-Courmont
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software Foundation,
+ * License along with FFmpeg; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -96,6 +96,9 @@ static int vdpau_mpeg4_init(AVCodecContext *avctx)
     case FF_PROFILE_MPEG4_SIMPLE:
         profile = VDP_DECODER_PROFILE_MPEG4_PART2_SP;
         break;
+    // As any ASP decoder must be able to decode SP, this
+    // should be a safe fallback if profile is unknown/unspecified.
+    case FF_PROFILE_UNKNOWN:
     case FF_PROFILE_MPEG4_ADVANCED_SIMPLE:
         profile = VDP_DECODER_PROFILE_MPEG4_PART2_ASP;
         break;
diff --git a/libavcodec/vdpau_vc1.c b/libavcodec/vdpau_vc1.c
index 4f87c52..ffd6505 100644
--- a/libavcodec/vdpau_vc1.c
+++ b/libavcodec/vdpau_vc1.c
@@ -4,20 +4,20 @@
  * Copyright (c) 2008 NVIDIA
  * Copyright (c) 2013 Rémi Denis-Courmont
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software Foundation,
+ * License along with FFmpeg; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -44,14 +44,18 @@ static int vdpau_vc1_start_frame(AVCodecContext *avctx,
 
     switch (s->pict_type) {
     case AV_PICTURE_TYPE_B:
+        if (s->next_picture_ptr) {
         ref = ff_vdpau_get_surface_id(s->next_picture.f);
         assert(ref != VDP_INVALID_HANDLE);
         info->backward_reference = ref;
+        }
         /* fall-through */
     case AV_PICTURE_TYPE_P:
+        if (s->last_picture_ptr) {
         ref = ff_vdpau_get_surface_id(s->last_picture.f);
         assert(ref != VDP_INVALID_HANDLE);
         info->forward_reference  = ref;
+        }
     }
 
     info->slice_count       = 0;
diff --git a/libavcodec/version.h b/libavcodec/version.h
index db3f33a..0852b43 100644
--- a/libavcodec/version.h
+++ b/libavcodec/version.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,9 +27,9 @@
 
 #include "libavutil/version.h"
 
-#define LIBAVCODEC_VERSION_MAJOR 57
-#define LIBAVCODEC_VERSION_MINOR 19
-#define LIBAVCODEC_VERSION_MICRO  0
+#define LIBAVCODEC_VERSION_MAJOR  57
+#define LIBAVCODEC_VERSION_MINOR  46
+#define LIBAVCODEC_VERSION_MICRO 100
 
 #define LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
                                                LIBAVCODEC_VERSION_MINOR, \
@@ -45,8 +45,21 @@
  * FF_API_* defines may be placed below to indicate public API that will be
  * dropped at a future version bump. The defines themselves are not part of
  * the public API and may change, break or disappear at any time.
+ *
+ * @note, when bumping the major version it is recommended to manually
+ * disable each FF_API_* in its own commit instead of disabling them all
+ * at once through the bump. This improves the git bisect-ability of the change.
  */
 
+#ifndef FF_API_VIMA_DECODER
+#define FF_API_VIMA_DECODER     (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_AUDIO_CONVERT
+#define FF_API_AUDIO_CONVERT     (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_AVCODEC_RESAMPLE
+#define FF_API_AVCODEC_RESAMPLE  FF_API_AUDIO_CONVERT
+#endif
 #ifndef FF_API_GETCHROMA
 #define FF_API_GETCHROMA         (LIBAVCODEC_VERSION_MAJOR < 58)
 #endif
@@ -98,9 +111,6 @@
 #ifndef FF_API_MAX_BFRAMES
 #define FF_API_MAX_BFRAMES       (LIBAVCODEC_VERSION_MAJOR < 58)
 #endif
-#ifndef FF_API_FAST_MALLOC
-#define FF_API_FAST_MALLOC       (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
 #ifndef FF_API_NEG_LINESIZES
 #define FF_API_NEG_LINESIZES     (LIBAVCODEC_VERSION_MAJOR < 58)
 #endif
@@ -137,9 +147,16 @@
 #ifndef FF_API_AFD
 #define FF_API_AFD               (LIBAVCODEC_VERSION_MAJOR < 58)
 #endif
+#ifndef FF_API_VISMV
+/* XXX: don't forget to drop the -vismv documentation */
+#define FF_API_VISMV             (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
 #ifndef FF_API_AUDIOENC_DELAY
 #define FF_API_AUDIOENC_DELAY    (LIBAVCODEC_VERSION_MAJOR < 58)
 #endif
+#ifndef FF_API_VAAPI_CONTEXT
+#define FF_API_VAAPI_CONTEXT     (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
 #ifndef FF_API_AVCTX_TIMEBASE
 #define FF_API_AVCTX_TIMEBASE    (LIBAVCODEC_VERSION_MAJOR < 59)
 #endif
@@ -191,12 +208,12 @@
 #ifndef FF_API_STAT_BITS
 #define FF_API_STAT_BITS         (LIBAVCODEC_VERSION_MAJOR < 59)
 #endif
-#ifndef FF_API_NVENC_OLD_NAME
-#define FF_API_NVENC_OLD_NAME    (LIBAVCODEC_VERSION_MAJOR < 59)
-#endif
 #ifndef FF_API_PRIVATE_OPT
 #define FF_API_PRIVATE_OPT      (LIBAVCODEC_VERSION_MAJOR < 59)
 #endif
+#ifndef FF_API_ASS_TIMING
+#define FF_API_ASS_TIMING       (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
 #ifndef FF_API_OLD_BSF
 #define FF_API_OLD_BSF          (LIBAVCODEC_VERSION_MAJOR < 59)
 #endif
diff --git a/libavcodec/videodsp.c b/libavcodec/videodsp.c
index e6d9303..ba618a7 100644
--- a/libavcodec/videodsp.c
+++ b/libavcodec/videodsp.c
@@ -1,24 +1,25 @@
 /*
  * Copyright (C) 2012 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "videodsp.h"
 
diff --git a/libavcodec/videodsp.h b/libavcodec/videodsp.h
index 04c012a..fc01a31 100644
--- a/libavcodec/videodsp.h
+++ b/libavcodec/videodsp.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2012 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,14 +29,25 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#define EMULATED_EDGE(depth) \
+void ff_emulated_edge_mc_ ## depth(uint8_t *dst, const uint8_t *src, \
+                                   ptrdiff_t dst_stride, ptrdiff_t src_stride, \
+                                   int block_w, int block_h,\
+                                   int src_x, int src_y, int w, int h);
+
+EMULATED_EDGE(8)
+EMULATED_EDGE(16)
+
 typedef struct VideoDSPContext {
     /**
      * Copy a rectangular area of samples to a temporary buffer and replicate
      * the border samples.
      *
-     * @param buf destination buffer
+     * @param dst destination buffer
+     * @param dst_stride number of bytes between 2 vertically adjacent samples
+     *                   in destination buffer
      * @param src source buffer
-     * @param buf_linesize number of bytes between 2 vertically adjacent
+     * @param dst_linesize number of bytes between 2 vertically adjacent
      *                     samples in the destination buffer
      * @param src_linesize number of bytes between 2 vertically adjacent
      *                     samples in both the source buffer
@@ -49,8 +60,8 @@ typedef struct VideoDSPContext {
      * @param w width of the source buffer
      * @param h height of the source buffer
      */
-    void (*emulated_edge_mc)(uint8_t *buf, const uint8_t *src,
-                             ptrdiff_t buf_linesize,
+    void (*emulated_edge_mc)(uint8_t *dst, const uint8_t *src,
+                             ptrdiff_t dst_linesize,
                              ptrdiff_t src_linesize,
                              int block_w, int block_h,
                              int src_x, int src_y, int w, int h);
diff --git a/libavcodec/videodsp_template.c b/libavcodec/videodsp_template.c
index 28b8c32..94c1b71 100644
--- a/libavcodec/videodsp_template.c
+++ b/libavcodec/videodsp_template.c
@@ -1,42 +1,46 @@
 /*
- * Copyright (c) 2002-2004 Michael Niedermayer
+ * Copyright (c) 2002-2012 Michael Niedermayer
  * Copyright (C) 2012 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <assert.h>
-
 #include "bit_depth_template.c"
-
-static void FUNC(ff_emulated_edge_mc)(uint8_t *buf, const uint8_t *src,
-                                      ptrdiff_t buf_linesize,
-                                      ptrdiff_t src_linesize,
-                                      int block_w, int block_h,
-                                      int src_x, int src_y, int w, int h)
+void FUNC(ff_emulated_edge_mc)(uint8_t *buf, const uint8_t *src,
+                               ptrdiff_t buf_linesize,
+                               ptrdiff_t src_linesize,
+                               int block_w, int block_h,
+                               int src_x, int src_y, int w, int h)
 {
     int x, y;
     int start_y, start_x, end_y, end_x;
 
+    if (!w || !h)
+        return;
+
+    av_assert2(block_w * sizeof(pixel) <= FFABS(buf_linesize));
+
     if (src_y >= h) {
-        src  += (h - 1 - src_y) * src_linesize;
+        src -= src_y * src_linesize;
+        src += (h - 1) * src_linesize;
         src_y = h - 1;
     } else if (src_y <= -block_h) {
-        src  += (1 - block_h - src_y) * src_linesize;
+        src -= src_y * src_linesize;
+        src += (1 - block_h) * src_linesize;
         src_y = 1 - block_h;
     }
     if (src_x >= w) {
@@ -51,8 +55,8 @@ static void FUNC(ff_emulated_edge_mc)(uint8_t *buf, const uint8_t *src,
     start_x = FFMAX(0, -src_x);
     end_y = FFMIN(block_h, h-src_y);
     end_x = FFMIN(block_w, w-src_x);
-    assert(start_y < end_y && block_h);
-    assert(start_x < end_x && block_w);
+    av_assert2(start_y < end_y && block_h);
+    av_assert2(start_x < end_x && block_w);
 
     w    = end_x - start_x;
     src += start_y * src_linesize + start_x * sizeof(pixel);
diff --git a/libavcodec/videotoolbox.c b/libavcodec/videotoolbox.c
new file mode 100644
index 0000000..7b5245a
--- /dev/null
+++ b/libavcodec/videotoolbox.c
@@ -0,0 +1,739 @@
+/*
+ * Videotoolbox hardware acceleration
+ *
+ * copyright (c) 2012 Sebastien Zwickert
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#if CONFIG_VIDEOTOOLBOX
+#  include "videotoolbox.h"
+#else
+#  include "vda.h"
+#endif
+#include "vda_vt_internal.h"
+#include "libavutil/avutil.h"
+#include "bytestream.h"
+#include "h264.h"
+#include "mpegvideo.h"
+
+#ifndef kVTVideoDecoderSpecification_RequireHardwareAcceleratedVideoDecoder
+#  define kVTVideoDecoderSpecification_RequireHardwareAcceleratedVideoDecoder CFSTR("RequireHardwareAcceleratedVideoDecoder")
+#endif
+
+#define VIDEOTOOLBOX_ESDS_EXTRADATA_PADDING  12
+
+static void videotoolbox_buffer_release(void *opaque, uint8_t *data)
+{ /* Free callback installed by ff_videotoolbox_buffer_create(); opaque is not used. */
+    CVPixelBufferRef cv_buffer = (CVImageBufferRef)data; /* data is the pixel buffer handle itself, not pixel memory */
+    CVPixelBufferRelease(cv_buffer);
+}
+
+static int videotoolbox_buffer_copy(VTContext *vtctx,
+                                    const uint8_t *buffer,
+                                    uint32_t size)
+{ /* Replace the stored bitstream with a copy of buffer; returns 0 or AVERROR(ENOMEM). */
+    void *tmp;
+
+    tmp = av_fast_realloc(vtctx->bitstream,
+                         &vtctx->allocated_size,
+                         size);
+
+    if (!tmp) /* vtctx->bitstream is not overwritten on failure */
+        return AVERROR(ENOMEM);
+
+    vtctx->bitstream = tmp;
+    memcpy(vtctx->bitstream, buffer, size);
+    vtctx->bitstream_size = size; /* previous contents are discarded, not appended to */
+
+    return 0;
+}
+
+int ff_videotoolbox_alloc_frame(AVCodecContext *avctx, AVFrame *frame)
+{ /* Fill frame geometry/format from avctx and give it a 1-byte buf[0]; returns 0 or AVERROR(ENOMEM). */
+    frame->width  = avctx->width;
+    frame->height = avctx->height;
+    frame->format = avctx->pix_fmt;
+    frame->buf[0] = av_buffer_alloc(1); /* placeholder; ff_videotoolbox_buffer_create() unrefs and replaces it */
+
+    if (!frame->buf[0])
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+#define AV_W8(p, v) *(p) = (v) /* store v at *p; expands to a bare assignment, so use it only as a full statement */
+
+CFDataRef ff_videotoolbox_avcc_extradata_create(AVCodecContext *avctx)
+{ /* Serialize h->ps.sps and h->ps.pps into one blob; returns the CFDataCreate() result, or NULL if the scratch buffer cannot be allocated. */
+    H264Context *h     = avctx->priv_data;
+    CFDataRef data = NULL;
+    uint8_t *p;
+    int vt_extradata_size = 6 + 3 + h->ps.sps->data_size + 4 + h->ps.pps->data_size; /* 6 header bytes, 3 before the SPS payload, 4 before the PPS payload */
+    uint8_t *vt_extradata = av_malloc(vt_extradata_size);
+    if (!vt_extradata)
+        return NULL;
+
+    p = vt_extradata;
+
+    AV_W8(p + 0, 1); /* version */
+    AV_W8(p + 1, h->ps.sps->data[0]); /* profile */
+    AV_W8(p + 2, h->ps.sps->data[1]); /* profile compat */
+    AV_W8(p + 3, h->ps.sps->data[2]); /* level */
+    AV_W8(p + 4, 0xff); /* 6 bits reserved (111111) + 2 bits nal size length - 3 (11) */
+    AV_W8(p + 5, 0xe1); /* 3 bits reserved (111) + 5 bits number of sps (00001) */
+    AV_WB16(p + 6, h->ps.sps->data_size + 1); /* SPS length, +1 for the NAL header byte written next */
+    AV_W8(p + 8, NAL_SPS | (3 << 5)); // NAL unit header
+    memcpy(p + 9, h->ps.sps->data, h->ps.sps->data_size);
+    p += 9 + h->ps.sps->data_size;
+    AV_W8(p + 0, 1); /* number of pps */
+    AV_WB16(p + 1, h->ps.pps->data_size + 1); /* PPS length, +1 for the NAL header byte written next */
+    AV_W8(p + 3, NAL_PPS | (3 << 5)); // NAL unit header
+    memcpy(p + 4, h->ps.pps->data, h->ps.pps->data_size);
+
+    p += 4 + h->ps.pps->data_size;
+    av_assert0(p - vt_extradata == vt_extradata_size); /* the write cursor must land exactly on the computed size */
+
+    data = CFDataCreate(kCFAllocatorDefault, vt_extradata, vt_extradata_size);
+    av_free(vt_extradata); /* scratch buffer is released here; CFDataCreate() is relied on to have copied the bytes */
+    return data;
+}
+
+int ff_videotoolbox_buffer_create(VTContext *vtctx, AVFrame *frame)
+{ /* Move vtctx->frame into frame as buf[0] and data[3]; returns 0 or AVERROR(ENOMEM). */
+    av_buffer_unref(&frame->buf[0]);
+
+    frame->buf[0] = av_buffer_create((uint8_t*)vtctx->frame, /* the buffer "data" is the pixel buffer handle itself */
+                                     sizeof(vtctx->frame),
+                                     videotoolbox_buffer_release,
+                                     NULL,
+                                     AV_BUFFER_FLAG_READONLY);
+    if (!frame->buf[0]) { /* vtctx->frame is left in place on this path */
+        return AVERROR(ENOMEM);
+    }
+
+    frame->data[3] = (uint8_t*)vtctx->frame;
+    vtctx->frame = NULL; /* the reference is now dropped by videotoolbox_buffer_release(), not by vtctx */
+
+    return 0;
+}
+
+int ff_videotoolbox_h264_start_frame(AVCodecContext *avctx,
+                                     const uint8_t *buffer,
+                                     uint32_t size)
+{ /* Reset the stored bitstream; when is_avc == 1 the whole input buffer is copied here. */
+    VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+    H264Context *h  = avctx->priv_data;
+
+    vtctx->bitstream_size = 0;
+
+    if (h->is_avc == 1) {
+        return videotoolbox_buffer_copy(vtctx, buffer, size);
+    }
+
+    return 0; /* otherwise ff_videotoolbox_h264_decode_slice() appends the data slice by slice */
+}
+
+int ff_videotoolbox_h264_decode_slice(AVCodecContext *avctx,
+                                      const uint8_t *buffer,
+                                      uint32_t size)
+{ /* Append one slice to the stored bitstream as a 4-byte length plus payload; returns 0 or AVERROR(ENOMEM). */
+    VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+    H264Context *h  = avctx->priv_data;
+    void *tmp;
+
+    if (h->is_avc == 1) /* input was already copied whole by ff_videotoolbox_h264_start_frame() */
+        return 0;
+
+    tmp = av_fast_realloc(vtctx->bitstream,
+                          &vtctx->allocated_size,
+                          vtctx->bitstream_size+size+4);
+    if (!tmp)
+        return AVERROR(ENOMEM);
+
+    vtctx->bitstream = tmp;
+
+    AV_WB32(vtctx->bitstream + vtctx->bitstream_size, size); /* 4-byte big-endian slice length */
+    memcpy(vtctx->bitstream + vtctx->bitstream_size + 4, buffer, size);
+
+    vtctx->bitstream_size += size + 4;
+
+    return 0;
+}
+
+int ff_videotoolbox_uninit(AVCodecContext *avctx)
+{ /* Free the stored bitstream and release any pixel buffer still held; always returns 0. */
+    VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+    if (vtctx) {
+        av_freep(&vtctx->bitstream);
+        if (vtctx->frame) /* a decoded frame that was never handed to an AVFrame */
+            CVPixelBufferRelease(vtctx->frame);
+    }
+
+    return 0;
+}
+
+#if CONFIG_VIDEOTOOLBOX
+static void videotoolbox_write_mp4_descr_length(PutByteContext *pb, int length)
+{ /* Write length as exactly four bytes carrying 7 bits each, most significant group first. */
+    int i;
+    uint8_t b;
+
+    for (i = 3; i >= 0; i--) {
+        b = (length >> (i * 7)) & 0x7F;
+        if (i != 0) /* every byte except the last has bit 0x80 set */
+            b |= 0x80;
+
+        bytestream2_put_byteu(pb, b);
+    }
+}
+
+static CFDataRef videotoolbox_esds_extradata_create(AVCodecContext *avctx)
+{ /* Wrap avctx->extradata in a chain of descriptors; returns the CFDataCreate() result, or NULL if allocation fails. */
+    CFDataRef data;
+    uint8_t *rw_extradata;
+    PutByteContext pb;
+    int full_size = 3 + 5 + 13 + 5 + avctx->extradata_size + 3;
+    // ES_DescrTag data + DecoderConfigDescrTag + data + DecSpecificInfoTag + size + SLConfigDescriptor
+    int config_size = 13 + 5 + avctx->extradata_size;
+    int s;
+
+    if (!(rw_extradata = av_mallocz(full_size + VIDEOTOOLBOX_ESDS_EXTRADATA_PADDING))) /* zero-filled, so unwritten tail bytes are 0 */
+        return NULL;
+
+    bytestream2_init_writer(&pb, rw_extradata, full_size + VIDEOTOOLBOX_ESDS_EXTRADATA_PADDING);
+    bytestream2_put_byteu(&pb, 0);        // version
+    bytestream2_put_ne24(&pb, 0);         // flags
+
+    // elementary stream descriptor
+    bytestream2_put_byteu(&pb, 0x03);     // ES_DescrTag
+    videotoolbox_write_mp4_descr_length(&pb, full_size);
+    bytestream2_put_ne16(&pb, 0);         // esid
+    bytestream2_put_byteu(&pb, 0);        // stream priority (0-32)
+
+    // decoder configuration descriptor
+    bytestream2_put_byteu(&pb, 0x04);     // DecoderConfigDescrTag
+    videotoolbox_write_mp4_descr_length(&pb, config_size);
+    bytestream2_put_byteu(&pb, 32);       // object type indication. 32 = AV_CODEC_ID_MPEG4
+    bytestream2_put_byteu(&pb, 0x11);     // stream type
+    bytestream2_put_ne24(&pb, 0);         // buffer size
+    bytestream2_put_ne32(&pb, 0);         // max bitrate
+    bytestream2_put_ne32(&pb, 0);         // avg bitrate
+
+    // decoder specific descriptor
+    bytestream2_put_byteu(&pb, 0x05);     ///< DecSpecificInfoTag
+    videotoolbox_write_mp4_descr_length(&pb, avctx->extradata_size);
+
+    bytestream2_put_buffer(&pb, avctx->extradata, avctx->extradata_size);
+
+    // SLConfigDescriptor
+    bytestream2_put_byteu(&pb, 0x06);     // SLConfigDescrTag
+    bytestream2_put_byteu(&pb, 0x01);     // length
+    bytestream2_put_byteu(&pb, 0x02);     //
+
+    s = bytestream2_size_p(&pb); /* NOTE(review): presumably the writer's total buffer size, not the bytes written -- confirm */
+
+    data = CFDataCreate(kCFAllocatorDefault, rw_extradata, s);
+
+    av_freep(&rw_extradata); /* scratch buffer is released whether or not CFDataCreate() succeeded */
+    return data;
+}
+
+static CMSampleBufferRef videotoolbox_sample_buffer_create(CMFormatDescriptionRef fmt_desc,
+                                                           void *buffer,
+                                                           int size)
+{ /* Wrap buffer in a block buffer and then a one-sample CMSampleBuffer; returns NULL if block buffer creation fails. */
+    OSStatus status;
+    CMBlockBufferRef  block_buf;
+    CMSampleBufferRef sample_buf;
+
+    block_buf  = NULL;
+    sample_buf = NULL;
+
+    status = CMBlockBufferCreateWithMemoryBlock(kCFAllocatorDefault,// structureAllocator
+                                                buffer,             // memoryBlock
+                                                size,               // blockLength
+                                                kCFAllocatorNull,   // blockAllocator
+                                                NULL,               // customBlockSource
+                                                0,                  // offsetToData
+                                                size,               // dataLength
+                                                0,                  // flags
+                                                &block_buf);
+
+    if (!status) { /* only build the sample buffer when the block buffer exists */
+        status = CMSampleBufferCreate(kCFAllocatorDefault,  // allocator
+                                      block_buf,            // dataBuffer
+                                      TRUE,                 // dataReady
+                                      0,                    // makeDataReadyCallback
+                                      0,                    // makeDataReadyRefcon
+                                      fmt_desc,             // formatDescription
+                                      1,                    // numSamples
+                                      0,                    // numSampleTimingEntries
+                                      NULL,                 // sampleTimingArray
+                                      0,                    // numSampleSizeEntries
+                                      NULL,                 // sampleSizeArray
+                                      &sample_buf);
+    }
+
+    if (block_buf) /* drop the local reference on both the success and the failure path */
+        CFRelease(block_buf);
+
+    return sample_buf; /* the status of CMSampleBufferCreate() itself is not inspected */
+}
+
+static void videotoolbox_decoder_callback(void *opaque,
+                                          void *sourceFrameRefCon,
+                                          OSStatus status,
+                                          VTDecodeInfoFlags flags,
+                                          CVImageBufferRef image_buffer,
+                                          CMTime pts,
+                                          CMTime duration)
+{ /* Session output callback: keep a retained reference to the newest decoded image in vtctx->frame. */
+    AVCodecContext *avctx = opaque; /* opaque is the AVCodecContext registered with the session */
+    VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+
+    if (vtctx->frame) { /* a previous output was never collected; release it before replacing it */
+        CVPixelBufferRelease(vtctx->frame);
+        vtctx->frame = NULL;
+    }
+
+    if (!image_buffer) { /* vtctx->frame stays NULL, which the end_frame path treats as a failure */
+        av_log(avctx, AV_LOG_DEBUG, "vt decoder cb: output image buffer is null\n");
+        return;
+    }
+
+    vtctx->frame = CVPixelBufferRetain(image_buffer); /* released later by ff_videotoolbox_uninit() or the AVBuffer free callback */
+}
+
+static OSStatus videotoolbox_session_decode_frame(AVCodecContext *avctx)
+{ /* Submit the stored bitstream to the session and wait for it; returns -1 if no sample buffer could be built, else the VideoToolbox status. */
+    OSStatus status;
+    CMSampleBufferRef sample_buf;
+    AVVideotoolboxContext *videotoolbox = avctx->hwaccel_context;
+    VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+
+    sample_buf = videotoolbox_sample_buffer_create(videotoolbox->cm_fmt_desc,
+                                                   vtctx->bitstream,
+                                                   vtctx->bitstream_size);
+
+    if (!sample_buf)
+        return -1;
+
+    status = VTDecompressionSessionDecodeFrame(videotoolbox->session,
+                                               sample_buf,
+                                               0,       // decodeFlags
+                                               NULL,    // sourceFrameRefCon
+                                               0);      // infoFlagsOut
+    if (status == noErr) /* only wait when the frame was accepted */
+        status = VTDecompressionSessionWaitForAsynchronousFrames(videotoolbox->session);
+
+    CFRelease(sample_buf);
+
+    return status;
+}
+
+static int videotoolbox_common_end_frame(AVCodecContext *avctx, AVFrame *frame)
+{ /* Decode the stored bitstream and attach the resulting pixel buffer to frame; returns 0 or a negative AVERROR. */
+    int status;
+    AVVideotoolboxContext *videotoolbox = avctx->hwaccel_context;
+    VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+
+    av_buffer_unref(&frame->buf[0]); /* frame->buf[0] stays unset on every error return below */
+
+    if (!videotoolbox->session || !vtctx->bitstream)
+        return AVERROR_INVALIDDATA;
+
+    status = videotoolbox_session_decode_frame(avctx);
+
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to decode frame (%d)\n", status);
+        return AVERROR_UNKNOWN;
+    }
+
+    if (!vtctx->frame) /* the output callback stored no image */
+        return AVERROR_UNKNOWN;
+
+    return ff_videotoolbox_buffer_create(vtctx, frame);
+}
+
+static int videotoolbox_h264_end_frame(AVCodecContext *avctx)
+{ /* H.264 end_frame hook: run the common decode path on the current picture's frame. */
+    H264Context *h = avctx->priv_data;
+    AVFrame *frame = h->cur_pic_ptr->f;
+
+    return videotoolbox_common_end_frame(avctx, frame);
+}
+
+static int videotoolbox_mpeg_start_frame(AVCodecContext *avctx,
+                                         const uint8_t *buffer,
+                                         uint32_t size)
+{ /* start_frame hook shared by the H.263/MPEG-1/2/4 tables: copy the whole input buffer. */
+    VTContext *vtctx = avctx->internal->hwaccel_priv_data;
+
+    return videotoolbox_buffer_copy(vtctx, buffer, size);
+}
+
+static int videotoolbox_mpeg_decode_slice(AVCodecContext *avctx,
+                                          const uint8_t *buffer,
+                                          uint32_t size)
+{ /* Intentionally a no-op: videotoolbox_mpeg_start_frame() already copied the input. */
+    return 0;
+}
+
+static int videotoolbox_mpeg_end_frame(AVCodecContext *avctx)
+{ /* end_frame hook shared by the H.263/MPEG-1/2/4 tables: run the common decode path on the current picture's frame. */
+    MpegEncContext *s = avctx->priv_data;
+    AVFrame *frame = s->current_picture_ptr->f;
+
+    return videotoolbox_common_end_frame(avctx, frame);
+}
+
+static CFDictionaryRef videotoolbox_decoder_config_create(CMVideoCodecType codec_type,
+                                                          AVCodecContext *avctx)
+{ /* Build the decoder specification dictionary; the caller releases the returned dictionary. */
+    CFMutableDictionaryRef config_info = CFDictionaryCreateMutable(kCFAllocatorDefault,
+                                                                   0,
+                                                                   &kCFTypeDictionaryKeyCallBacks,
+                                                                   &kCFTypeDictionaryValueCallBacks);
+
+    CFDictionarySetValue(config_info,
+                         kVTVideoDecoderSpecification_RequireHardwareAcceleratedVideoDecoder,
+                         kCFBooleanTrue);
+
+    if (avctx->extradata_size) { /* codec configuration atoms are added only when extradata is present */
+        CFMutableDictionaryRef avc_info;
+        CFDataRef data = NULL;
+
+        avc_info = CFDictionaryCreateMutable(kCFAllocatorDefault,
+                                             1,
+                                             &kCFTypeDictionaryKeyCallBacks,
+                                             &kCFTypeDictionaryValueCallBacks);
+
+        switch (codec_type) {
+        case kCMVideoCodecType_MPEG4Video :
+            data = videotoolbox_esds_extradata_create(avctx);
+            if (data)
+                CFDictionarySetValue(avc_info, CFSTR("esds"), data);
+            break;
+        case kCMVideoCodecType_H264 :
+            data = ff_videotoolbox_avcc_extradata_create(avctx);
+            if (data)
+                CFDictionarySetValue(avc_info, CFSTR("avcC"), data);
+            break;
+        default: /* other codecs get an empty atom dictionary */
+            break;
+        }
+
+        CFDictionarySetValue(config_info,
+                kCMFormatDescriptionExtension_SampleDescriptionExtensionAtoms,
+                avc_info);
+
+        if (data) /* local references are dropped once the dictionaries have been populated */
+            CFRelease(data);
+
+        CFRelease(avc_info);
+    }
+    return config_info;
+}
+
+static CFDictionaryRef videotoolbox_buffer_attributes_create(int width,
+                                                             int height,
+                                                             OSType pix_fmt)
+{ /* Build the output pixel buffer attribute dictionary; the caller releases the returned dictionary. */
+    CFMutableDictionaryRef buffer_attributes;
+    CFMutableDictionaryRef io_surface_properties;
+    CFNumberRef cv_pix_fmt;
+    CFNumberRef w;
+    CFNumberRef h;
+
+    w = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &width);
+    h = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &height);
+    cv_pix_fmt = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &pix_fmt);
+
+    buffer_attributes = CFDictionaryCreateMutable(kCFAllocatorDefault,
+                                                  4,
+                                                  &kCFTypeDictionaryKeyCallBacks,
+                                                  &kCFTypeDictionaryValueCallBacks);
+    io_surface_properties = CFDictionaryCreateMutable(kCFAllocatorDefault, /* left empty; stored under the IOSurface properties key */
+                                                      0,
+                                                      &kCFTypeDictionaryKeyCallBacks,
+                                                      &kCFTypeDictionaryValueCallBacks);
+
+    CFDictionarySetValue(buffer_attributes, kCVPixelBufferPixelFormatTypeKey, cv_pix_fmt);
+    CFDictionarySetValue(buffer_attributes, kCVPixelBufferIOSurfacePropertiesKey, io_surface_properties);
+    CFDictionarySetValue(buffer_attributes, kCVPixelBufferWidthKey, w);
+    CFDictionarySetValue(buffer_attributes, kCVPixelBufferHeightKey, h);
+
+    CFRelease(io_surface_properties); /* local references to the four values are dropped after insertion */
+    CFRelease(cv_pix_fmt);
+    CFRelease(w);
+    CFRelease(h);
+
+    return buffer_attributes;
+}
+
+static CMVideoFormatDescriptionRef videotoolbox_format_desc_create(AVCodecContext *avctx,
+                                                                   CMVideoCodecType codec_type,
+                                                                   CFDictionaryRef decoder_spec,
+                                                                   int width,
+                                                                   int height)
+{ /* Create the format description for the session; returns NULL (after logging) on failure. */
+    CMFormatDescriptionRef cm_fmt_desc = NULL;
+    int status;
+
+#if TARGET_OS_IPHONE || defined(__MAC_10_9)
+    H264Context *h = codec_type == kCMVideoCodecType_H264 ? avctx->priv_data : NULL;
+
+    if (h && h->ps.sps->data_size && h->ps.pps->data_size) {
+        /* SPS and PPS are handed over as two parallel fixed-size tables kept
+         * on the stack, so there is no allocation that could fail or leak. */
+        const uint8_t *ps_data[2];
+        size_t ps_sizes[2];
+        const int ps_count = 2;
+
+        ps_data[0]  = h->ps.sps->data;
+        ps_sizes[0] = h->ps.sps->data_size;
+
+        ps_data[1]  = h->ps.pps->data;
+        ps_sizes[1] = h->ps.pps->data_size;
+
+        status = CMVideoFormatDescriptionCreateFromH264ParameterSets(NULL,
+                                                                     ps_count,
+                                                                     ps_data,
+                                                                     ps_sizes,
+                                                                     4, /* NAL unit length field size in bytes */
+                                                                     &cm_fmt_desc);
+
+        if (status) {
+            av_log(avctx, AV_LOG_ERROR, "Error creating H.264 format description: %d\n", status);
+            return NULL;
+        }
+    } else {
+#endif
+        status = CMVideoFormatDescriptionCreate(kCFAllocatorDefault, /* generic path: codec type, dimensions and extension dictionary */
+                                                codec_type,
+                                                width,
+                                                height,
+                                                decoder_spec, // Dictionary of extension
+                                                &cm_fmt_desc);
+
+        if (status) {
+            av_log(avctx, AV_LOG_ERROR, "Error creating format description: %d\n", status);
+            return NULL;
+        }
+#if TARGET_OS_IPHONE || defined(__MAC_10_9)
+    }
+#endif
+
+    return cm_fmt_desc;
+}
+
+static int videotoolbox_default_init(AVCodecContext *avctx)
+{ /* Create the format description and decompression session in avctx->hwaccel_context; returns 0, -1 or a negative AVERROR. */
+    AVVideotoolboxContext *videotoolbox = avctx->hwaccel_context;
+    OSStatus status;
+    VTDecompressionOutputCallbackRecord decoder_cb;
+    CFDictionaryRef decoder_spec;
+    CFDictionaryRef buf_attr;
+
+    if (!videotoolbox) {
+        av_log(avctx, AV_LOG_ERROR, "hwaccel context is not set\n");
+        return -1;
+    }
+
+    switch( avctx->codec_id ) {
+    case AV_CODEC_ID_H263 :
+        videotoolbox->cm_codec_type = kCMVideoCodecType_H263;
+        break;
+    case AV_CODEC_ID_H264 :
+        videotoolbox->cm_codec_type = kCMVideoCodecType_H264;
+        break;
+    case AV_CODEC_ID_MPEG1VIDEO :
+        videotoolbox->cm_codec_type = kCMVideoCodecType_MPEG1Video;
+        break;
+    case AV_CODEC_ID_MPEG2VIDEO :
+        videotoolbox->cm_codec_type = kCMVideoCodecType_MPEG2Video;
+        break;
+    case AV_CODEC_ID_MPEG4 :
+        videotoolbox->cm_codec_type = kCMVideoCodecType_MPEG4Video;
+        break;
+    default : /* any other codec_id leaves cm_codec_type at its existing value */
+        break;
+    }
+
+    decoder_spec = videotoolbox_decoder_config_create(videotoolbox->cm_codec_type, avctx);
+
+    videotoolbox->cm_fmt_desc = videotoolbox_format_desc_create(avctx,
+                                                                videotoolbox->cm_codec_type,
+                                                                decoder_spec,
+                                                                avctx->width,
+                                                                avctx->height);
+    if (!videotoolbox->cm_fmt_desc) {
+        if (decoder_spec)
+            CFRelease(decoder_spec);
+
+        av_log(avctx, AV_LOG_ERROR, "format description creation failed\n");
+        return -1;
+    }
+
+    buf_attr = videotoolbox_buffer_attributes_create(avctx->width,
+                                                     avctx->height,
+                                                     videotoolbox->cv_pix_fmt_type);
+
+    decoder_cb.decompressionOutputCallback = videotoolbox_decoder_callback;
+    decoder_cb.decompressionOutputRefCon   = avctx; /* arrives as the callback's opaque argument */
+
+    status = VTDecompressionSessionCreate(NULL,                      // allocator
+                                          videotoolbox->cm_fmt_desc, // videoFormatDescription
+                                          decoder_spec,              // videoDecoderSpecification
+                                          buf_attr,                  // destinationImageBufferAttributes
+                                          &decoder_cb,               // outputCallback
+                                          &videotoolbox->session);   // decompressionSessionOut
+
+    if (decoder_spec) /* both dictionaries are released here regardless of the session status */
+        CFRelease(decoder_spec);
+    if (buf_attr)
+        CFRelease(buf_attr);
+
+    switch (status) { /* map the session creation status onto AVERROR codes */
+    case kVTVideoDecoderNotAvailableNowErr:
+    case kVTVideoDecoderUnsupportedDataFormatErr:
+        return AVERROR(ENOSYS);
+    case kVTVideoDecoderMalfunctionErr:
+        return AVERROR(EINVAL);
+    case kVTVideoDecoderBadDataErr :
+        return AVERROR_INVALIDDATA;
+    case 0:
+        return 0;
+    default:
+        return AVERROR_UNKNOWN;
+    }
+}
+
+static void videotoolbox_default_free(AVCodecContext *avctx)
+{ /* Release the format description and session stored in avctx->hwaccel_context; the context struct itself is not freed here. */
+    AVVideotoolboxContext *videotoolbox = avctx->hwaccel_context;
+
+    if (videotoolbox) {
+        if (videotoolbox->cm_fmt_desc)
+            CFRelease(videotoolbox->cm_fmt_desc);
+
+        if (videotoolbox->session) { /* invalidate first, then drop the reference */
+            VTDecompressionSessionInvalidate(videotoolbox->session);
+            CFRelease(videotoolbox->session);
+        }
+    }
+}
+
+AVHWAccel ff_h263_videotoolbox_hwaccel = { /* H.263, wired to the shared videotoolbox_mpeg_* callbacks */
+    .name           = "h263_videotoolbox",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_H263,
+    .pix_fmt        = AV_PIX_FMT_VIDEOTOOLBOX,
+    .alloc_frame    = ff_videotoolbox_alloc_frame,
+    .start_frame    = videotoolbox_mpeg_start_frame,
+    .decode_slice   = videotoolbox_mpeg_decode_slice,
+    .end_frame      = videotoolbox_mpeg_end_frame,
+    .uninit         = ff_videotoolbox_uninit,
+    .priv_data_size = sizeof(VTContext),
+};
+
+AVHWAccel ff_h264_videotoolbox_hwaccel = { /* H.264, with its own start_frame/decode_slice/end_frame callbacks */
+    .name           = "h264_videotoolbox",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_H264,
+    .pix_fmt        = AV_PIX_FMT_VIDEOTOOLBOX,
+    .alloc_frame    = ff_videotoolbox_alloc_frame,
+    .start_frame    = ff_videotoolbox_h264_start_frame,
+    .decode_slice   = ff_videotoolbox_h264_decode_slice,
+    .end_frame      = videotoolbox_h264_end_frame,
+    .uninit         = ff_videotoolbox_uninit,
+    .priv_data_size = sizeof(VTContext),
+};
+
+AVHWAccel ff_mpeg1_videotoolbox_hwaccel = { /* MPEG-1, wired to the shared videotoolbox_mpeg_* callbacks */
+    .name           = "mpeg1_videotoolbox",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MPEG1VIDEO,
+    .pix_fmt        = AV_PIX_FMT_VIDEOTOOLBOX,
+    .alloc_frame    = ff_videotoolbox_alloc_frame,
+    .start_frame    = videotoolbox_mpeg_start_frame,
+    .decode_slice   = videotoolbox_mpeg_decode_slice,
+    .end_frame      = videotoolbox_mpeg_end_frame,
+    .uninit         = ff_videotoolbox_uninit,
+    .priv_data_size = sizeof(VTContext),
+};
+
+AVHWAccel ff_mpeg2_videotoolbox_hwaccel = { /* MPEG-2, wired to the shared videotoolbox_mpeg_* callbacks */
+    .name           = "mpeg2_videotoolbox",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MPEG2VIDEO,
+    .pix_fmt        = AV_PIX_FMT_VIDEOTOOLBOX,
+    .alloc_frame    = ff_videotoolbox_alloc_frame,
+    .start_frame    = videotoolbox_mpeg_start_frame,
+    .decode_slice   = videotoolbox_mpeg_decode_slice,
+    .end_frame      = videotoolbox_mpeg_end_frame,
+    .uninit         = ff_videotoolbox_uninit,
+    .priv_data_size = sizeof(VTContext),
+};
+
+AVHWAccel ff_mpeg4_videotoolbox_hwaccel = { /* MPEG-4, wired to the shared videotoolbox_mpeg_* callbacks */
+    .name           = "mpeg4_videotoolbox",
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_MPEG4,
+    .pix_fmt        = AV_PIX_FMT_VIDEOTOOLBOX,
+    .alloc_frame    = ff_videotoolbox_alloc_frame,
+    .start_frame    = videotoolbox_mpeg_start_frame,
+    .decode_slice   = videotoolbox_mpeg_decode_slice,
+    .end_frame      = videotoolbox_mpeg_end_frame,
+    .uninit         = ff_videotoolbox_uninit,
+    .priv_data_size = sizeof(VTContext),
+};
+
+AVVideotoolboxContext *av_videotoolbox_alloc_context(void)
+{ /* Allocate a zeroed context with the default output callback and pixel format; returns NULL on allocation failure. */
+    AVVideotoolboxContext *ret = av_mallocz(sizeof(*ret));
+
+    if (ret) {
+        ret->output_callback = videotoolbox_decoder_callback;
+        ret->cv_pix_fmt_type = kCVPixelFormatType_420YpCbCr8BiPlanarVideoRange; /* default output format; callers may overwrite it */
+    }
+
+    return ret;
+}
+
+int av_videotoolbox_default_init(AVCodecContext *avctx)
+{ /* Convenience wrapper: initialize with a freshly allocated default context. */
+    return av_videotoolbox_default_init2(avctx, NULL);
+}
+
+int av_videotoolbox_default_init2(AVCodecContext *avctx, AVVideotoolboxContext *vtctx)
+{ /* Install vtctx (or a newly allocated default context) on avctx and create the session; returns 0 or a negative value. */
+    avctx->hwaccel_context = vtctx ?: av_videotoolbox_alloc_context(); /* GNU "?:" shorthand: use vtctx when it is non-NULL */
+    if (!avctx->hwaccel_context)
+        return AVERROR(ENOMEM);
+    return videotoolbox_default_init(avctx); /* on failure the context stays attached to avctx */
+}
+
+void av_videotoolbox_default_free(AVCodecContext *avctx)
+{ /* Tear down the session and format description, then free the context struct itself. */
+
+    videotoolbox_default_free(avctx);
+    av_freep(&avctx->hwaccel_context); /* also frees a context that was supplied through av_videotoolbox_default_init2() */
+}
+#endif /* CONFIG_VIDEOTOOLBOX */
diff --git a/libavcodec/videotoolbox.h b/libavcodec/videotoolbox.h
new file mode 100644
index 0000000..a48638e
--- /dev/null
+++ b/libavcodec/videotoolbox.h
@@ -0,0 +1,126 @@
+/*
+ * Videotoolbox hardware acceleration
+ *
+ * copyright (c) 2012 Sebastien Zwickert
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_VIDEOTOOLBOX_H
+#define AVCODEC_VIDEOTOOLBOX_H
+
+/**
+ * @file
+ * @ingroup lavc_codec_hwaccel_videotoolbox
+ * Public libavcodec Videotoolbox header.
+ */
+
+#include <stdint.h>
+
+#define Picture QuickdrawPicture
+#include <VideoToolbox/VideoToolbox.h>
+#undef Picture
+
+#include "libavcodec/avcodec.h"
+
+/**
+ * This struct holds all the information that needs to be passed
+ * between the caller and libavcodec for initializing Videotoolbox decoding.
+ * Its size is not a part of the public ABI, it must be allocated with
+ * av_videotoolbox_alloc_context() and freed with av_free().
+ */
+typedef struct AVVideotoolboxContext {
+    /**
+     * Videotoolbox decompression session object.
+     * Created and freed by the caller.
+     */
+    VTDecompressionSessionRef session;
+
+    /**
+     * The output callback that must be passed to the session.
+     * Set by av_videotoolbox_alloc_context().
+     */
+    VTDecompressionOutputCallback output_callback;
+
+    /**
+     * CVPixelBuffer Format Type that Videotoolbox will use for decoded frames.
+     * Set by the caller.
+     */
+    OSType cv_pix_fmt_type;
+
+    /**
+     * CoreMedia Format Description that Videotoolbox will use to create the decompression session.
+     * Set by the caller.
+     */
+    CMVideoFormatDescriptionRef cm_fmt_desc;
+
+    /**
+     * CoreMedia codec type that Videotoolbox will use to create the decompression session.
+     * Set by the caller.
+     */
+    int cm_codec_type;
+} AVVideotoolboxContext;
+
+/**
+ * Allocate and initialize a Videotoolbox context.
+ *
+ * This function should be called from the get_format() callback when the caller
+ * selects the AV_PIX_FMT_VIDEOTOOLBOX format. The caller must then create
+ * the decoder object (using the output callback provided by libavcodec) that
+ * will be used for Videotoolbox-accelerated decoding.
+ *
+ * When decoding with Videotoolbox is finished, the caller must destroy the decoder
+ * object and free the Videotoolbox context using av_free().
+ *
+ * @return the newly allocated context or NULL on failure
+ */
+AVVideotoolboxContext *av_videotoolbox_alloc_context(void);
+
+/**
+ * This is a convenience function that creates and sets up the Videotoolbox context using
+ * an internal implementation.
+ *
+ * @param avctx the corresponding codec context
+ *
+ * @return >= 0 on success, a negative AVERROR code on failure
+ */
+int av_videotoolbox_default_init(AVCodecContext *avctx);
+
+/**
+ * This is a convenience function that creates and sets up the Videotoolbox context using
+ * an internal implementation.
+ *
+ * @param avctx the corresponding codec context
+ * @param vtctx the Videotoolbox context to use; if NULL, a default context is allocated
+ *
+ * @return >= 0 on success, a negative AVERROR code on failure
+ */
+int av_videotoolbox_default_init2(AVCodecContext *avctx, AVVideotoolboxContext *vtctx);
+
+/**
+ * This function must be called to free the Videotoolbox context initialized with
+ * av_videotoolbox_default_init() or av_videotoolbox_default_init2().
+ *
+ * @param avctx the corresponding codec context
+ */
+void av_videotoolbox_default_free(AVCodecContext *avctx);
+
+/**
+ * @}
+ */
+
+#endif /* AVCODEC_VIDEOTOOLBOX_H */
diff --git a/libavcodec/videotoolboxenc.c b/libavcodec/videotoolboxenc.c
new file mode 100644
index 0000000..4345ca3
--- /dev/null
+++ b/libavcodec/videotoolboxenc.c
@@ -0,0 +1,2003 @@
+/*
+ * copyright (c) 2015 Rick Kern <kernrj@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <VideoToolbox/VideoToolbox.h>
+#include <CoreVideo/CoreVideo.h>
+#include <CoreMedia/CoreMedia.h>
+#include <TargetConditionals.h>
+#include <Availability.h>
+#include "avcodec.h"
+#include "libavutil/opt.h"
+#include "libavutil/avassert.h"
+#include "libavutil/atomic.h"
+#include "libavutil/avstring.h"
+#include "libavcodec/avcodec.h"
+#include "libavutil/pixdesc.h"
+#include "internal.h"
+#include <pthread.h>
+
+#if !CONFIG_VT_BT2020
+# define kCVImageBufferColorPrimaries_ITU_R_2020   CFSTR("ITU_R_2020")
+# define kCVImageBufferTransferFunction_ITU_R_2020 CFSTR("ITU_R_2020")
+# define kCVImageBufferYCbCrMatrix_ITU_R_2020      CFSTR("ITU_R_2020")
+#endif
+
+/*
+ * H.264 profile choices for the encoder (compared against
+ * VTEncContext.profile). H264_PROF_AUTO means no explicit profile was
+ * requested; H264_PROF_COUNT is the number of real entries.
+ */
+typedef enum VT_H264Profile {
+    H264_PROF_AUTO,
+    H264_PROF_BASELINE,
+    H264_PROF_MAIN,
+    H264_PROF_HIGH,
+    H264_PROF_COUNT
+} VT_H264Profile;
+
+/*
+ * H.264 entropy coding selection (CAVLC or CABAC); VT_ENTROPY_NOT_SET is the
+ * zero value. Presumably stored in VTEncContext.entropy - confirm against
+ * the option table, which is not visible here.
+ */
+typedef enum VTH264Entropy{
+    VT_ENTROPY_NOT_SET,
+    VT_CAVLC,
+    VT_CABAC
+} VTH264Entropy;
+
+/* Four-byte start code written in front of every parameter set in the extradata. */
+static const uint8_t start_code[] = { 0, 0, 0, 1 };
+
+/*
+ * Node of the singly linked queue of encoded sample buffers
+ * (VTEncContext.q_head / q_tail).
+ */
+typedef struct BufNode {
+    CMSampleBufferRef cm_buffer; /* retained by vtenc_q_push() */
+    struct BufNode* next;        /* next node, NULL for the tail */
+    int error;
+} BufNode;
+
+/* Private encoder state (avctx->priv_data) of the VideoToolbox encoder. */
+typedef struct VTEncContext {
+    AVClass *class;
+    VTCompressionSessionRef session;
+    /* Colour attribute strings; add_color_attr() copies the non-NULL ones. */
+    CFStringRef ycbcr_matrix;
+    CFStringRef color_primaries;
+    CFStringRef transfer_function;
+
+    /* lock guards the output queue and async_error. */
+    pthread_mutex_t lock;
+    /* Signalled by vtenc_q_push(); vtenc_q_pop() waits on it. */
+    pthread_cond_t  cv_sample_sent;
+
+    /* Non-zero once set_async_error() has recorded a failure. */
+    int async_error;
+
+    /* Queue of encoded buffers: popped from q_head, pushed at q_tail. */
+    BufNode *q_head;
+    BufNode *q_tail;
+
+    /* frame_ct_out is incremented for every buffer returned by vtenc_q_pop(). */
+    int64_t frame_ct_out;
+    int64_t frame_ct_in;
+
+    int64_t first_pts;
+    int64_t dts_delta;
+
+    /* profile and level are read by get_vt_profile_level(); level 0 selects the AutoLevel constants. */
+    int64_t profile;
+    int64_t level;
+    int64_t entropy;
+    int64_t realtime;
+    int64_t frames_before;
+    int64_t frames_after;
+
+    int64_t allow_sw;
+
+    bool flushing;
+    bool has_b_frames;
+    bool warned_color_range;
+} VTEncContext;
+
+/* Forward declaration; being static, the definition lives elsewhere in this file. */
+static int vtenc_populate_extradata(AVCodecContext   *avctx,
+                                    CMVideoCodecType codec_type,
+                                    CFStringRef      profile_level,
+                                    CFNumberRef      gamma_level,
+                                    CFDictionaryRef  enc_info,
+                                    CFDictionaryRef  pixel_buffer_info);
+
+/**
+ * Release the CFNumber referenced by *refPtr, if there is one, and reset
+ * *refPtr to NULL. A NULL *refPtr is left untouched.
+ */
+static void vt_release_num(CFNumberRef* refPtr){
+    if (*refPtr) {
+        CFRelease(*refPtr);
+        *refPtr = NULL;
+    }
+}
+
+/**
+ * Record an asynchronous encoder error and discard all queued output.
+ *
+ * Under vtctx->lock this stores err in vtctx->async_error, releases every
+ * queued sample buffer and frees its node, and wakes any thread blocked in
+ * vtenc_q_pop() so that it can observe the error instead of waiting forever.
+ *
+ * @param vtctx encoder context
+ * @param err   error code to store
+ */
+static void set_async_error(VTEncContext *vtctx, int err)
+{
+    BufNode *info;
+
+    pthread_mutex_lock(&vtctx->lock);
+
+    vtctx->async_error = err;
+
+    info = vtctx->q_head;
+    vtctx->q_head = vtctx->q_tail = NULL;
+
+    while (info) {
+        BufNode *next = info->next;
+        CFRelease(info->cm_buffer);
+        av_free(info);
+        info = next;
+    }
+
+    /* The wait loop in vtenc_q_pop() tests async_error; wake it up. */
+    pthread_cond_broadcast(&vtctx->cv_sample_sent);
+
+    pthread_mutex_unlock(&vtctx->lock);
+}
+
+/**
+ * Take the oldest encoded sample buffer off the output queue.
+ *
+ * @param vtctx encoder context
+ * @param wait  when true, block until a buffer is queued or an async error
+ *              is recorded
+ * @param buf   receives the buffer, or NULL when none is available. It is
+ *              not written when an error is already pending on entry.
+ * @return 0 on success (including "no buffer"), otherwise the stored
+ *         async error code
+ *
+ * All shared state, including async_error and frame_ct_out, is accessed
+ * while holding vtctx->lock.
+ */
+static int vtenc_q_pop(VTEncContext *vtctx, bool wait, CMSampleBufferRef *buf)
+{
+    BufNode *info;
+    int err;
+
+    pthread_mutex_lock(&vtctx->lock);
+
+    /* Read the error while still holding the lock. */
+    err = vtctx->async_error;
+    if (err) {
+        pthread_mutex_unlock(&vtctx->lock);
+        return err;
+    }
+
+    if (vtctx->flushing && vtctx->frame_ct_in == vtctx->frame_ct_out) {
+        *buf = NULL;
+
+        pthread_mutex_unlock(&vtctx->lock);
+        return 0;
+    }
+
+    while (!vtctx->q_head && !vtctx->async_error && wait) {
+        pthread_cond_wait(&vtctx->cv_sample_sent, &vtctx->lock);
+    }
+
+    /* An error raised while waiting must not be reported as success. */
+    err = vtctx->async_error;
+    if (err) {
+        pthread_mutex_unlock(&vtctx->lock);
+        *buf = NULL;
+        return err;
+    }
+
+    if (!vtctx->q_head) {
+        pthread_mutex_unlock(&vtctx->lock);
+        *buf = NULL;
+        return 0;
+    }
+
+    info = vtctx->q_head;
+    vtctx->q_head = vtctx->q_head->next;
+    if (!vtctx->q_head) {
+        vtctx->q_tail = NULL;
+    }
+
+    vtctx->frame_ct_out++;
+
+    pthread_mutex_unlock(&vtctx->lock);
+
+    *buf = info->cm_buffer;
+    av_free(info);
+
+    return 0;
+}
+
+/**
+ * Append an encoded sample buffer to the output queue.
+ *
+ * The buffer is retained for as long as it sits in the queue. If the queue
+ * node cannot be allocated, AVERROR(ENOMEM) is recorded through
+ * set_async_error() and the buffer is not queued.
+ *
+ * @param vtctx  encoder context
+ * @param buffer sample buffer to enqueue
+ */
+static void vtenc_q_push(VTEncContext *vtctx, CMSampleBufferRef buffer)
+{
+    BufNode *info = av_malloc(sizeof(*info));
+    if (!info) {
+        set_async_error(vtctx, AVERROR(ENOMEM));
+        return;
+    }
+
+    CFRetain(buffer);
+    info->cm_buffer = buffer;
+    info->next = NULL;
+    /* av_malloc() does not clear memory; do not leave this field undefined. */
+    info->error = 0;
+
+    pthread_mutex_lock(&vtctx->lock);
+
+    if (!vtctx->q_head) {
+        vtctx->q_head = info;
+    } else {
+        vtctx->q_tail->next = info;
+    }
+
+    vtctx->q_tail = info;
+
+    /* Signalled under the lock, so a waiter only runs once the node is linked. */
+    pthread_cond_signal(&vtctx->cv_sample_sent);
+
+    pthread_mutex_unlock(&vtctx->lock);
+}
+
+/**
+ * Count the length-prefixed NAL units stored in a sample buffer.
+ *
+ * @param length_code_size size in bytes of the big-endian length prefix in
+ *                         front of each NAL unit; must be 1 to 4
+ * @param sample_buffer    buffer whose data block is scanned
+ * @param count            receives the number of NAL units on success
+ * @return 0 on success, AVERROR_INVALIDDATA for an unusable length code
+ *         size, AVERROR_EXTERNAL if a length prefix cannot be read
+ */
+static int count_nalus(size_t length_code_size,
+                       CMSampleBufferRef sample_buffer,
+                       int *count)
+{
+    size_t offset = 0;
+    int status;
+    int nalu_ct = 0;
+    uint8_t size_buf[4];
+    size_t src_size = CMSampleBufferGetTotalSampleSize(sample_buffer);
+    CMBlockBufferRef block = CMSampleBufferGetDataBuffer(sample_buffer);
+
+    /* A zero-sized prefix would never advance offset below. */
+    if (length_code_size == 0 || length_code_size > sizeof(size_buf))
+        return AVERROR_INVALIDDATA;
+
+    while (offset < src_size) {
+        size_t curr_src_len;
+        size_t box_len = 0;
+        size_t i;
+
+        status = CMBlockBufferCopyDataBytes(block,
+                                            offset,
+                                            length_code_size,
+                                            size_buf);
+        /* On failure size_buf is not filled in; do not parse it. */
+        if (status)
+            return AVERROR_EXTERNAL;
+
+        for (i = 0; i < length_code_size; i++) {
+            box_len <<= 8;
+            box_len |= size_buf[i];
+        }
+
+        curr_src_len = box_len + length_code_size;
+        offset += curr_src_len;
+
+        nalu_ct++;
+    }
+
+    *count = nalu_ct;
+    return 0;
+}
+
+/* Map a libavcodec codec id to its CoreMedia codec type; 0 for anything but H.264. */
+static CMVideoCodecType get_cm_codec_type(enum AVCodecID id)
+{
+    if (id == AV_CODEC_ID_H264)
+        return kCMVideoCodecType_H264;
+
+    return 0;
+}
+
+/**
+ * Compute how many bytes are needed to store all H.264 parameter sets of a
+ * format description, each one preceded by start_code.
+ *
+ * If the parameter set count cannot be queried, the sets are enumerated
+ * until the first lookup failure; that failure is only an error when not
+ * even the first set could be read.
+ *
+ * @param avctx   codec context, used for logging
+ * @param vid_fmt format description to query
+ * @param size    receives the total size in bytes on success
+ * @return 0 on success, AVERROR_EXTERNAL on failure
+ */
+static int get_params_size(
+    AVCodecContext              *avctx,
+    CMVideoFormatDescriptionRef vid_fmt,
+    size_t                      *size)
+{
+    size_t ps_count;
+    size_t bytes_needed = 0;
+    size_t idx = 0;
+    int count_unknown;
+    int status;
+
+    status = CMVideoFormatDescriptionGetH264ParameterSetAtIndex(vid_fmt,
+                                                                0,
+                                                                NULL,
+                                                                NULL,
+                                                                &ps_count,
+                                                                NULL);
+    count_unknown = status != 0;
+    if (count_unknown) {
+        ps_count = 0;
+        status   = 0;
+    }
+
+    while (count_unknown || idx < ps_count) {
+        const uint8_t *ps;
+        size_t ps_size;
+
+        status = CMVideoFormatDescriptionGetH264ParameterSetAtIndex(vid_fmt,
+                                                                    idx,
+                                                                    &ps,
+                                                                    &ps_size,
+                                                                    NULL,
+                                                                    NULL);
+        if (status) {
+            /* With an unknown count, a failure after the first set just ends the list. */
+            if (count_unknown && idx > 0)
+                status = 0;
+            break;
+        }
+
+        bytes_needed += ps_size + sizeof(start_code);
+        idx++;
+    }
+
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "Error getting parameter set sizes: %d\n", status);
+        return AVERROR_EXTERNAL;
+    }
+
+    *size = bytes_needed;
+    return 0;
+}
+
+/**
+ * Write every H.264 parameter set of a format description into dst, each
+ * one preceded by start_code.
+ *
+ * @param avctx    codec context, used for logging
+ * @param vid_fmt  format description to read the parameter sets from
+ * @param dst      destination buffer
+ * @param dst_size capacity of dst in bytes
+ * @return 0 on success, AVERROR_BUFFER_TOO_SMALL if dst cannot hold the
+ *         data, AVERROR_EXTERNAL if the parameter sets cannot be read
+ */
+static int copy_param_sets(
+    AVCodecContext              *avctx,
+    CMVideoFormatDescriptionRef vid_fmt,
+    uint8_t                     *dst,
+    size_t                      dst_size)
+{
+    size_t ps_count;
+    size_t written = 0;
+    size_t idx = 0;
+    int count_unknown;
+    int status;
+
+    status = CMVideoFormatDescriptionGetH264ParameterSetAtIndex(vid_fmt,
+                                                                0,
+                                                                NULL,
+                                                                NULL,
+                                                                &ps_count,
+                                                                NULL);
+    count_unknown = status != 0;
+    if (count_unknown) {
+        ps_count = 0;
+        status   = 0;
+    }
+
+    while (count_unknown || idx < ps_count) {
+        const uint8_t *ps;
+        size_t ps_size;
+        size_t end;
+        uint8_t *out;
+
+        status = CMVideoFormatDescriptionGetH264ParameterSetAtIndex(vid_fmt,
+                                                                    idx,
+                                                                    &ps,
+                                                                    &ps_size,
+                                                                    NULL,
+                                                                    NULL);
+        if (status) {
+            /* With an unknown count, a failure after the first set just ends the list. */
+            if (count_unknown && idx > 0)
+                status = 0;
+            break;
+        }
+
+        end = written + sizeof(start_code) + ps_size;
+        if (end > dst_size) {
+            av_log(avctx, AV_LOG_ERROR, "Error: buffer too small for parameter sets.\n");
+            return AVERROR_BUFFER_TOO_SMALL;
+        }
+
+        out = dst + written;
+        memcpy(out, start_code, sizeof(start_code));
+        memcpy(out + sizeof(start_code), ps, ps_size);
+
+        written = end;
+        idx++;
+    }
+
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "Error getting parameter set data: %d\n", status);
+        return AVERROR_EXTERNAL;
+    }
+
+    return 0;
+}
+
+/**
+ * Fill avctx->extradata with the parameter sets of the sample buffer's
+ * format description, each preceded by start_code.
+ *
+ * The buffer is allocated with AV_INPUT_BUFFER_PADDING_SIZE zeroed padding
+ * bytes. On any failure avctx->extradata is left NULL and extradata_size 0,
+ * so a later call can retry and no half-initialized extradata is exposed.
+ *
+ * @param avctx         codec context receiving the extradata
+ * @param sample_buffer encoded sample carrying the format description
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+static int set_extradata(AVCodecContext *avctx, CMSampleBufferRef sample_buffer)
+{
+    CMVideoFormatDescriptionRef vid_fmt;
+    size_t total_size;
+    int status;
+
+    vid_fmt = CMSampleBufferGetFormatDescription(sample_buffer);
+    if (!vid_fmt) {
+        av_log(avctx, AV_LOG_ERROR, "No video format.\n");
+        return AVERROR_EXTERNAL;
+    }
+
+    status = get_params_size(avctx, vid_fmt, &total_size);
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "Could not get parameter sets.\n");
+        return status;
+    }
+
+    avctx->extradata = av_mallocz(total_size + AV_INPUT_BUFFER_PADDING_SIZE);
+    if (!avctx->extradata) {
+        return AVERROR(ENOMEM);
+    }
+
+    status = copy_param_sets(avctx, vid_fmt, avctx->extradata, total_size);
+
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "Could not copy param sets.\n");
+        /* Do not keep a non-NULL but incomplete extradata buffer around. */
+        av_freep(&avctx->extradata);
+        avctx->extradata_size = 0;
+        return status;
+    }
+
+    avctx->extradata_size = total_size;
+
+    return 0;
+}
+
+/*
+ * Output callback handed to VTCompressionSessionCreate().
+ *
+ * ctx is the AVCodecContext. Once an async error is pending, the buffer (if
+ * any) is released and nothing else is done. A failed encode is recorded as
+ * an async error. Otherwise the extradata is generated on first use when
+ * global headers are requested, and the sample buffer is queued for
+ * vtenc_q_pop().
+ */
+static void vtenc_output_callback(
+    void *ctx,
+    void *sourceFrameCtx,
+    OSStatus status,
+    VTEncodeInfoFlags flags,
+    CMSampleBufferRef sample_buffer)
+{
+    AVCodecContext *avctx = ctx;
+    VTEncContext   *vtctx = avctx->priv_data;
+    int need_extradata;
+
+    if (vtctx->async_error) {
+        if (sample_buffer)
+            CFRelease(sample_buffer);
+        return;
+    }
+
+    if (status || !sample_buffer) {
+        av_log(avctx, AV_LOG_ERROR, "Error encoding frame: %d\n", (int)status);
+        set_async_error(vtctx, AVERROR_EXTERNAL);
+        return;
+    }
+
+    need_extradata = !avctx->extradata &&
+                     (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER);
+    if (need_extradata) {
+        int err = set_extradata(avctx, sample_buffer);
+        if (err) {
+            set_async_error(vtctx, err);
+            return;
+        }
+    }
+
+    vtenc_q_push(vtctx, sample_buffer);
+}
+
+/**
+ * Query the size of the NAL unit length prefix used by a sample buffer.
+ *
+ * @param avctx         codec context, used for logging
+ * @param sample_buffer buffer whose format description is inspected
+ * @param size          receives the length prefix size in bytes on success
+ * @return 0 on success, AVERROR_EXTERNAL on failure
+ */
+static int get_length_code_size(
+    AVCodecContext    *avctx,
+    CMSampleBufferRef sample_buffer,
+    size_t            *size)
+{
+    int header_len;
+    int status;
+    CMVideoFormatDescriptionRef vid_fmt =
+        CMSampleBufferGetFormatDescription(sample_buffer);
+
+    if (!vid_fmt) {
+        av_log(avctx, AV_LOG_ERROR, "Error getting buffer format description.\n");
+        return AVERROR_EXTERNAL;
+    }
+
+    /* Only the NAL unit header length output is requested here. */
+    status = CMVideoFormatDescriptionGetH264ParameterSetAtIndex(vid_fmt,
+                                                                0,
+                                                                NULL,
+                                                                NULL,
+                                                                NULL,
+                                                                &header_len);
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "Error getting length code size: %d\n", status);
+        return AVERROR_EXTERNAL;
+    }
+
+    *size = header_len;
+    return 0;
+}
+
+/*
+ * Translate the profile/level stored in the encoder context into the
+ * matching kVTProfileLevel_H264_* constant.
+ *
+ * Returns true on success.
+ *
+ * If *profile_level_val is NULL and this method returns true, don't specify
+ * the profile/level to the encoder.
+ *
+ * Returns false, after logging an error, when the profile/level combination
+ * has no entry in the tables below.
+ */
+static bool get_vt_profile_level(AVCodecContext *avctx,
+                                 CFStringRef    *profile_level_val)
+{
+    VTEncContext *vtctx = avctx->priv_data;
+    int64_t profile = vtctx->profile;
+
+    if (profile == H264_PROF_AUTO && vtctx->level) {
+        //Need to pick a profile if level is not auto-selected.
+        profile = vtctx->has_b_frames ? H264_PROF_MAIN : H264_PROF_BASELINE;
+    }
+
+    *profile_level_val = NULL;
+
+    switch (profile) {
+        case H264_PROF_AUTO:
+            /* Neither profile nor level requested: leave the output NULL. */
+            return true;
+
+        case H264_PROF_BASELINE:
+            /* Level 0 selects the AutoLevel constant; 13 is only listed for Baseline. */
+            switch (vtctx->level) {
+                case  0: *profile_level_val = kVTProfileLevel_H264_Baseline_AutoLevel; break;
+                case 13: *profile_level_val = kVTProfileLevel_H264_Baseline_1_3;       break;
+                case 30: *profile_level_val = kVTProfileLevel_H264_Baseline_3_0;       break;
+                case 31: *profile_level_val = kVTProfileLevel_H264_Baseline_3_1;       break;
+                case 32: *profile_level_val = kVTProfileLevel_H264_Baseline_3_2;       break;
+                case 40: *profile_level_val = kVTProfileLevel_H264_Baseline_4_0;       break;
+                case 41: *profile_level_val = kVTProfileLevel_H264_Baseline_4_1;       break;
+                case 42: *profile_level_val = kVTProfileLevel_H264_Baseline_4_2;       break;
+                case 50: *profile_level_val = kVTProfileLevel_H264_Baseline_5_0;       break;
+                case 51: *profile_level_val = kVTProfileLevel_H264_Baseline_5_1;       break;
+                case 52: *profile_level_val = kVTProfileLevel_H264_Baseline_5_2;       break;
+            }
+            break;
+
+        case H264_PROF_MAIN:
+            switch (vtctx->level) {
+                case  0: *profile_level_val = kVTProfileLevel_H264_Main_AutoLevel; break;
+                case 30: *profile_level_val = kVTProfileLevel_H264_Main_3_0;       break;
+                case 31: *profile_level_val = kVTProfileLevel_H264_Main_3_1;       break;
+                case 32: *profile_level_val = kVTProfileLevel_H264_Main_3_2;       break;
+                case 40: *profile_level_val = kVTProfileLevel_H264_Main_4_0;       break;
+                case 41: *profile_level_val = kVTProfileLevel_H264_Main_4_1;       break;
+                case 42: *profile_level_val = kVTProfileLevel_H264_Main_4_2;       break;
+                case 50: *profile_level_val = kVTProfileLevel_H264_Main_5_0;       break;
+                case 51: *profile_level_val = kVTProfileLevel_H264_Main_5_1;       break;
+                case 52: *profile_level_val = kVTProfileLevel_H264_Main_5_2;       break;
+            }
+            break;
+
+        case H264_PROF_HIGH:
+            switch (vtctx->level) {
+                case  0: *profile_level_val = kVTProfileLevel_H264_High_AutoLevel; break;
+                case 30: *profile_level_val = kVTProfileLevel_H264_High_3_0;       break;
+                case 31: *profile_level_val = kVTProfileLevel_H264_High_3_1;       break;
+                case 32: *profile_level_val = kVTProfileLevel_H264_High_3_2;       break;
+                case 40: *profile_level_val = kVTProfileLevel_H264_High_4_0;       break;
+                case 41: *profile_level_val = kVTProfileLevel_H264_High_4_1;       break;
+                case 42: *profile_level_val = kVTProfileLevel_H264_High_4_2;       break;
+                case 50: *profile_level_val = kVTProfileLevel_H264_High_5_0;       break;
+                case 51: *profile_level_val = kVTProfileLevel_H264_High_5_1;       break;
+                case 52: *profile_level_val = kVTProfileLevel_H264_High_5_2;       break;
+            }
+            break;
+    }
+
+    /* Still NULL here means the profile or level had no table entry. */
+    if (!*profile_level_val) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid Profile/Level.\n");
+        return false;
+    }
+
+    return true;
+}
+
+/**
+ * Pick the CoreVideo pixel format matching a libavutil pixel format and
+ * colour range.
+ *
+ * @param avctx           codec context (unused here)
+ * @param fmt             AV_PIX_FMT_NV12 or AV_PIX_FMT_YUV420P
+ * @param range           colour range; anything but AVCOL_RANGE_JPEG is
+ *                        treated as video (MPEG) range
+ * @param av_pixel_format receives the kCVPixelFormatType_* value
+ * @param range_guessed   optional; set to non-zero when range was neither
+ *                        MPEG nor JPEG
+ * @return 0 on success, AVERROR(EINVAL) for an unsupported pixel format
+ */
+static int get_cv_pixel_format(AVCodecContext* avctx,
+                               enum AVPixelFormat fmt,
+                               enum AVColorRange range,
+                               int* av_pixel_format,
+                               int* range_guessed)
+{
+    const int full_range = range == AVCOL_RANGE_JPEG;
+
+    if (range_guessed)
+        *range_guessed = !(range == AVCOL_RANGE_MPEG || full_range);
+
+    switch (fmt) {
+    case AV_PIX_FMT_NV12:
+        if (full_range)
+            *av_pixel_format = kCVPixelFormatType_420YpCbCr8BiPlanarFullRange;
+        else
+            *av_pixel_format = kCVPixelFormatType_420YpCbCr8BiPlanarVideoRange;
+        break;
+
+    case AV_PIX_FMT_YUV420P:
+        if (full_range)
+            *av_pixel_format = kCVPixelFormatType_420YpCbCr8PlanarFullRange;
+        else
+            *av_pixel_format = kCVPixelFormatType_420YpCbCr8Planar;
+        break;
+
+    default:
+        return AVERROR(EINVAL);
+    }
+
+    return 0;
+}
+
+/*
+ * Copy the colour primaries, transfer function and YCbCr matrix strings
+ * held in the encoder context into dict. Attributes that are NULL are
+ * skipped.
+ */
+static void add_color_attr(AVCodecContext *avctx, CFMutableDictionaryRef dict) {
+    VTEncContext *vtctx = avctx->priv_data;
+    CFStringRef primaries = vtctx->color_primaries;
+    CFStringRef transfer  = vtctx->transfer_function;
+    CFStringRef matrix    = vtctx->ycbcr_matrix;
+
+    if (primaries)
+        CFDictionarySetValue(dict, kCVImageBufferColorPrimariesKey, primaries);
+
+    if (transfer)
+        CFDictionarySetValue(dict, kCVImageBufferTransferFunctionKey, transfer);
+
+    if (matrix)
+        CFDictionarySetValue(dict, kCVImageBufferYCbCrMatrixKey, matrix);
+}
+
+/**
+ * Build the pixel buffer attribute dictionary for the encoder input.
+ *
+ * The dictionary carries the CoreVideo pixel format derived from
+ * avctx->pix_fmt and avctx->color_range, the frame width and height, and
+ * the colour attributes added by add_color_attr().
+ *
+ * @param avctx codec context
+ * @param dict  receives the newly created dictionary on success; it is not
+ *              written on failure
+ * @return 0 on success, the error from get_cv_pixel_format() for an
+ *         unsupported pixel format, AVERROR(ENOMEM) on allocation failure
+ *
+ * Every allocation failure goes through pbinfo_nomem so that the partially
+ * built dictionary is released.
+ */
+static int create_cv_pixel_buffer_info(AVCodecContext* avctx,
+                                       CFMutableDictionaryRef* dict)
+{
+    CFNumberRef cv_color_format_num = NULL;
+    CFNumberRef width_num = NULL;
+    CFNumberRef height_num = NULL;
+    CFMutableDictionaryRef pixel_buffer_info = NULL;
+    int cv_color_format;
+    int status = get_cv_pixel_format(avctx,
+                                     avctx->pix_fmt,
+                                     avctx->color_range,
+                                     &cv_color_format,
+                                     NULL);
+    if (status) return status;
+
+    pixel_buffer_info = CFDictionaryCreateMutable(
+                            kCFAllocatorDefault,
+                            20,
+                            &kCFCopyStringDictionaryKeyCallBacks,
+                            &kCFTypeDictionaryValueCallBacks);
+
+    if (!pixel_buffer_info) goto pbinfo_nomem;
+
+    cv_color_format_num = CFNumberCreate(kCFAllocatorDefault,
+                                         kCFNumberSInt32Type,
+                                         &cv_color_format);
+    if (!cv_color_format_num) goto pbinfo_nomem;
+
+    CFDictionarySetValue(pixel_buffer_info,
+                         kCVPixelBufferPixelFormatTypeKey,
+                         cv_color_format_num);
+    vt_release_num(&cv_color_format_num);
+
+    width_num = CFNumberCreate(kCFAllocatorDefault,
+                               kCFNumberSInt32Type,
+                               &avctx->width);
+    /* Must use the cleanup path: a plain return would leak pixel_buffer_info. */
+    if (!width_num) goto pbinfo_nomem;
+
+    CFDictionarySetValue(pixel_buffer_info,
+                         kCVPixelBufferWidthKey,
+                         width_num);
+    vt_release_num(&width_num);
+
+    height_num = CFNumberCreate(kCFAllocatorDefault,
+                                kCFNumberSInt32Type,
+                                &avctx->height);
+    if (!height_num) goto pbinfo_nomem;
+
+    CFDictionarySetValue(pixel_buffer_info,
+                         kCVPixelBufferHeightKey,
+                         height_num);
+    vt_release_num(&height_num);
+
+    add_color_attr(avctx, pixel_buffer_info);
+
+    *dict = pixel_buffer_info;
+    return 0;
+
+pbinfo_nomem:
+    vt_release_num(&cv_color_format_num);
+    vt_release_num(&width_num);
+    vt_release_num(&height_num);
+    if (pixel_buffer_info) CFRelease(pixel_buffer_info);
+
+    return AVERROR(ENOMEM);
+}
+
+/**
+ * Map avctx->color_primaries to the CoreVideo colour primaries string.
+ *
+ * @param avctx     codec context
+ * @param primaries receives the CoreVideo constant, or NULL when the
+ *                  primaries are unspecified or unsupported
+ * @return 0 on success, -1 (after logging) for unsupported primaries
+ */
+static int get_cv_color_primaries(AVCodecContext *avctx,
+                                  CFStringRef *primaries)
+{
+    enum AVColorPrimaries pri = avctx->color_primaries;
+
+    /* NULL covers both the unspecified and the unsupported case. */
+    *primaries = NULL;
+
+    switch (pri) {
+        case AVCOL_PRI_UNSPECIFIED:
+            return 0;
+
+        case AVCOL_PRI_BT709:
+            *primaries = kCVImageBufferColorPrimaries_ITU_R_709_2;
+            return 0;
+
+        case AVCOL_PRI_BT2020:
+            *primaries = kCVImageBufferColorPrimaries_ITU_R_2020;
+            return 0;
+
+        default:
+            break;
+    }
+
+    av_log(avctx, AV_LOG_ERROR, "Color primaries %s is not supported.\n", av_color_primaries_name(pri));
+    return -1;
+}
+
+/**
+ * Map avctx->color_trc to the CoreVideo transfer function string.
+ *
+ * @param avctx        codec context
+ * @param transfer_fnc receives the CoreVideo constant, or NULL when the
+ *                     transfer characteristic is unspecified or unsupported
+ * @param gamma_level  receives a newly created CFNumber holding the gamma
+ *                     value for the pure gamma curves (2.2 and 2.8), NULL
+ *                     otherwise
+ * @return 0 on success, -1 (after logging) for an unsupported transfer
+ *         characteristic
+ *
+ * Both outputs are always written, so callers never see stale values on
+ * the failure path (same contract as get_cv_color_primaries()).
+ */
+static int get_cv_transfer_function(AVCodecContext *avctx,
+                                    CFStringRef *transfer_fnc,
+                                    CFNumberRef *gamma_level)
+{
+    enum AVColorTransferCharacteristic trc = avctx->color_trc;
+    Float32 gamma;
+    *gamma_level = NULL;
+
+    switch (trc) {
+        case AVCOL_TRC_UNSPECIFIED:
+            *transfer_fnc = NULL;
+            break;
+
+        case AVCOL_TRC_BT709:
+            *transfer_fnc = kCVImageBufferTransferFunction_ITU_R_709_2;
+            break;
+
+        case AVCOL_TRC_SMPTE240M:
+            *transfer_fnc = kCVImageBufferTransferFunction_SMPTE_240M_1995;
+            break;
+
+        case AVCOL_TRC_GAMMA22:
+            gamma = 2.2;
+            *transfer_fnc = kCVImageBufferTransferFunction_UseGamma;
+            *gamma_level = CFNumberCreate(NULL, kCFNumberFloat32Type, &gamma);
+            break;
+
+        case AVCOL_TRC_GAMMA28:
+            gamma = 2.8;
+            *transfer_fnc = kCVImageBufferTransferFunction_UseGamma;
+            *gamma_level = CFNumberCreate(NULL, kCFNumberFloat32Type, &gamma);
+            break;
+
+        case AVCOL_TRC_BT2020_10:
+        case AVCOL_TRC_BT2020_12:
+            *transfer_fnc = kCVImageBufferTransferFunction_ITU_R_2020;
+            break;
+
+        default:
+            av_log(avctx, AV_LOG_ERROR, "Transfer function %s is not supported.\n", av_color_transfer_name(trc));
+            *transfer_fnc = NULL;
+            return -1;
+    }
+
+    return 0;
+}
+
+/**
+ * Map avctx->colorspace to the CoreVideo YCbCr matrix string.
+ *
+ * @param avctx  codec context
+ * @param matrix receives the CoreVideo constant, or NULL when the colour
+ *               space is unspecified or unsupported
+ * @return 0 on success, -1 (after logging) for an unsupported colour space
+ *
+ * The output is always written, so callers never see a stale value on the
+ * failure path (same contract as get_cv_color_primaries()).
+ */
+static int get_cv_ycbcr_matrix(AVCodecContext *avctx, CFStringRef *matrix) {
+    switch(avctx->colorspace) {
+        case AVCOL_SPC_BT709:
+            *matrix = kCVImageBufferYCbCrMatrix_ITU_R_709_2;
+            break;
+
+        case AVCOL_SPC_UNSPECIFIED:
+            *matrix = NULL;
+            break;
+
+        case AVCOL_SPC_BT470BG:
+        case AVCOL_SPC_SMPTE170M:
+            *matrix = kCVImageBufferYCbCrMatrix_ITU_R_601_4;
+            break;
+
+        case AVCOL_SPC_SMPTE240M:
+            *matrix = kCVImageBufferYCbCrMatrix_SMPTE_240M_1995;
+            break;
+
+        case AVCOL_SPC_BT2020_NCL:
+            *matrix = kCVImageBufferYCbCrMatrix_ITU_R_2020;
+            break;
+
+        default:
+            av_log(avctx, AV_LOG_ERROR, "Color space %s is not supported.\n", av_color_space_name(avctx->colorspace));
+            *matrix = NULL;
+            return -1;
+    }
+
+    return 0;
+}
+
+static int vtenc_create_encoder(AVCodecContext   *avctx,
+                                CMVideoCodecType codec_type,
+                                CFStringRef      profile_level,
+                                CFNumberRef      gamma_level,
+                                CFDictionaryRef  enc_info,
+                                CFDictionaryRef  pixel_buffer_info,
+                                VTCompressionSessionRef *session)
+{
+    VTEncContext *vtctx = avctx->priv_data;
+    SInt32       bit_rate = avctx->bit_rate;
+    CFNumberRef  bit_rate_num;
+
+    int status = VTCompressionSessionCreate(kCFAllocatorDefault,
+                                            avctx->width,
+                                            avctx->height,
+                                            codec_type,
+                                            enc_info,
+                                            pixel_buffer_info,
+                                            kCFAllocatorDefault,
+                                            vtenc_output_callback,
+                                            avctx,
+                                            session);
+
+    if (status || !vtctx->session) {
+        av_log(avctx, AV_LOG_ERROR, "Error: cannot create compression session: %d\n", status);
+
+#if !TARGET_OS_IPHONE
+        if (!vtctx->allow_sw) {
+            av_log(avctx, AV_LOG_ERROR, "Try -allow_sw 1. The hardware encoder may be busy, or not supported.\n");
+        }
+#endif
+
+        return AVERROR_EXTERNAL;
+    }
+
+    bit_rate_num = CFNumberCreate(kCFAllocatorDefault,
+                                  kCFNumberSInt32Type,
+                                  &bit_rate);
+    if (!bit_rate_num) return AVERROR(ENOMEM);
+
+    status = VTSessionSetProperty(vtctx->session,
+                                  kVTCompressionPropertyKey_AverageBitRate,
+                                  bit_rate_num);
+    CFRelease(bit_rate_num);
+
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "Error setting bitrate property: %d\n", status);
+        return AVERROR_EXTERNAL;
+    }
+
+    if (profile_level) {
+        status = VTSessionSetProperty(vtctx->session,
+                                      kVTCompressionPropertyKey_ProfileLevel,
+                                      profile_level);
+        if (status) {
+            av_log(avctx, AV_LOG_ERROR, "Error setting profile/level property: %d\n", status);
+            return AVERROR_EXTERNAL;
+        }
+    }
+
+    if (avctx->gop_size > 0) {
+        CFNumberRef interval = CFNumberCreate(kCFAllocatorDefault,
+                                              kCFNumberIntType,
+                                              &avctx->gop_size);
+        if (!interval) {
+            return AVERROR(ENOMEM);
+        }
+
+        status = VTSessionSetProperty(vtctx->session,
+                                      kVTCompressionPropertyKey_MaxKeyFrameInterval,
+                                      interval);
+        CFRelease(interval);
+
+        if (status) {
+            av_log(avctx, AV_LOG_ERROR, "Error setting 'max key-frame interval' property: %d\n", status);
+            return AVERROR_EXTERNAL;
+        }
+    }
+
+    if (vtctx->frames_before) {
+        status = VTSessionSetProperty(vtctx->session,
+                                      kVTCompressionPropertyKey_MoreFramesBeforeStart,
+                                      kCFBooleanTrue);
+
+        if (status == kVTPropertyNotSupportedErr) {
+            av_log(avctx, AV_LOG_WARNING, "frames_before property is not supported on this device. Ignoring.\n");
+        } else if (status) {
+            av_log(avctx, AV_LOG_ERROR, "Error setting frames_before property: %d\n", status);
+        }
+    }
+
+    if (vtctx->frames_after) {
+        status = VTSessionSetProperty(vtctx->session,
+                                      kVTCompressionPropertyKey_MoreFramesAfterEnd,
+                                      kCFBooleanTrue);
+
+        if (status == kVTPropertyNotSupportedErr) {
+            av_log(avctx, AV_LOG_WARNING, "frames_after property is not supported on this device. Ignoring.\n");
+        } else if (status) {
+            av_log(avctx, AV_LOG_ERROR, "Error setting frames_after property: %d\n", status);
+        }
+    }
+
+    if (avctx->sample_aspect_ratio.num != 0) {
+        CFNumberRef num;
+        CFNumberRef den;
+        CFMutableDictionaryRef par;
+        AVRational *avpar = &avctx->sample_aspect_ratio;
+
+        av_reduce(&avpar->num, &avpar->den,
+                   avpar->num,  avpar->den,
+                  0xFFFFFFFF);
+
+        num = CFNumberCreate(kCFAllocatorDefault,
+                             kCFNumberIntType,
+                             &avpar->num);
+
+        den = CFNumberCreate(kCFAllocatorDefault,
+                             kCFNumberIntType,
+                             &avpar->den);
+
+
+
+        par = CFDictionaryCreateMutable(kCFAllocatorDefault,
+                                        2,
+                                        &kCFCopyStringDictionaryKeyCallBacks,
+                                        &kCFTypeDictionaryValueCallBacks);
+
+        if (!par || !num || !den) {
+            if (par) CFRelease(par);
+            if (num) CFRelease(num);
+            if (den) CFRelease(den);
+
+            return AVERROR(ENOMEM);
+        }
+
+        CFDictionarySetValue(
+            par,
+            kCMFormatDescriptionKey_PixelAspectRatioHorizontalSpacing,
+            num);
+
+        CFDictionarySetValue(
+            par,
+            kCMFormatDescriptionKey_PixelAspectRatioVerticalSpacing,
+            den);
+
+        status = VTSessionSetProperty(vtctx->session,
+                                      kVTCompressionPropertyKey_PixelAspectRatio,
+                                      par);
+
+        CFRelease(par);
+        CFRelease(num);
+        CFRelease(den);
+
+        if (status) {
+            av_log(avctx,
+                   AV_LOG_ERROR,
+                   "Error setting pixel aspect ratio to %d:%d: %d.\n",
+                   avctx->sample_aspect_ratio.num,
+                   avctx->sample_aspect_ratio.den,
+                   status);
+
+            return AVERROR_EXTERNAL;
+        }
+    }
+
+
+    if (vtctx->transfer_function) {
+        status = VTSessionSetProperty(vtctx->session,
+                                      kVTCompressionPropertyKey_TransferFunction,
+                                      vtctx->transfer_function);
+
+        if (status) {
+            av_log(avctx, AV_LOG_WARNING, "Could not set transfer function: %d\n", status);
+        }
+    }
+
+
+    if (vtctx->ycbcr_matrix) {
+        status = VTSessionSetProperty(vtctx->session,
+                                      kVTCompressionPropertyKey_YCbCrMatrix,
+                                      vtctx->ycbcr_matrix);
+
+        if (status) {
+            av_log(avctx, AV_LOG_WARNING, "Could not set ycbcr matrix: %d\n", status);
+        }
+    }
+
+
+    if (vtctx->color_primaries) {
+        status = VTSessionSetProperty(vtctx->session,
+                                      kVTCompressionPropertyKey_ColorPrimaries,
+                                      vtctx->color_primaries);
+
+        if (status) {
+            av_log(avctx, AV_LOG_WARNING, "Could not set color primaries: %d\n", status);
+        }
+    }
+
+    if (gamma_level) {
+        status = VTSessionSetProperty(vtctx->session,
+                                      kCVImageBufferGammaLevelKey,
+                                      gamma_level);
+
+        if (status) {
+            av_log(avctx, AV_LOG_WARNING, "Could not set gamma level: %d\n", status);
+        }
+    }
+
+    if (!vtctx->has_b_frames) {
+        status = VTSessionSetProperty(vtctx->session,
+                                      kVTCompressionPropertyKey_AllowFrameReordering,
+                                      kCFBooleanFalse);
+
+        if (status) {
+            av_log(avctx, AV_LOG_ERROR, "Error setting 'allow frame reordering' property: %d\n", status);
+            return AVERROR_EXTERNAL;
+        }
+    }
+
+    if (vtctx->entropy != VT_ENTROPY_NOT_SET) {
+        CFStringRef entropy = vtctx->entropy == VT_CABAC ?
+                                kVTH264EntropyMode_CABAC:
+                                kVTH264EntropyMode_CAVLC;
+
+        status = VTSessionSetProperty(vtctx->session,
+                                      kVTCompressionPropertyKey_H264EntropyMode,
+                                      entropy);
+
+        if (status) {
+            av_log(avctx, AV_LOG_ERROR, "Error setting entropy property: %d\n", status);
+            return AVERROR_EXTERNAL;
+        }
+    }
+
+    if (vtctx->realtime) {
+        status = VTSessionSetProperty(vtctx->session,
+                                      kVTCompressionPropertyKey_RealTime,
+                                      kCFBooleanTrue);
+
+        if (status) {
+            av_log(avctx, AV_LOG_ERROR, "Error setting realtime property: %d\n", status);
+        }
+    }
+
+    status = VTCompressionSessionPrepareToEncodeFrames(vtctx->session);
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "Error: cannot prepare encoder: %d\n", status);
+        return AVERROR_EXTERNAL;
+    }
+
+    return 0;
+}
+
+/**
+ * Encoder init callback: checks the B-frame and entropy options against the
+ * profile, builds the encoder specification, optionally generates global
+ * extradata and then creates the compression session.
+ * @return 0 on success, otherwise the error code of the step that failed.
+ */
+static av_cold int vtenc_init(AVCodecContext *avctx)
+{
+    CFMutableDictionaryRef enc_info;
+    CFMutableDictionaryRef pixel_buffer_info = NULL; // released in init_cleanup when set
+    CMVideoCodecType       codec_type;
+    VTEncContext           *vtctx = avctx->priv_data;
+    CFStringRef            profile_level;
+    CFBooleanRef           has_b_frames_cfbool = NULL;
+    CFNumberRef            gamma_level = NULL;
+    int                    status;
+
+    codec_type = get_cm_codec_type(avctx->codec_id);
+    if (!codec_type) {
+        av_log(avctx, AV_LOG_ERROR, "Error: no mapping for AVCodecID %d\n", avctx->codec_id);
+        return AVERROR(EINVAL);
+    }
+
+    vtctx->has_b_frames = avctx->max_b_frames > 0;
+    if(vtctx->has_b_frames && vtctx->profile == H264_PROF_BASELINE){
+        av_log(avctx, AV_LOG_WARNING, "Cannot use B-frames with baseline profile. Output will not contain B-frames.\n");
+        vtctx->has_b_frames = false;
+    }
+
+    if (vtctx->entropy == VT_CABAC && vtctx->profile == H264_PROF_BASELINE) {
+        av_log(avctx, AV_LOG_WARNING, "CABAC entropy requires 'main' or 'high' profile, but baseline was requested. Encode will not use CABAC entropy.\n");
+        vtctx->entropy = VT_ENTROPY_NOT_SET;
+    }
+
+    if (!get_vt_profile_level(avctx, &profile_level)) return AVERROR(EINVAL);
+
+    vtctx->session = NULL;
+
+    enc_info = CFDictionaryCreateMutable(kCFAllocatorDefault, 20,
+                                         &kCFCopyStringDictionaryKeyCallBacks,
+                                         &kCFTypeDictionaryValueCallBacks);
+    if (!enc_info) return AVERROR(ENOMEM);
+
+#if !TARGET_OS_IPHONE
+    if (!vtctx->allow_sw) {
+        CFDictionarySetValue(enc_info, kVTVideoEncoderSpecification_RequireHardwareAcceleratedVideoEncoder, kCFBooleanTrue);
+    } else {
+        CFDictionarySetValue(enc_info, kVTVideoEncoderSpecification_EnableHardwareAcceleratedVideoEncoder,  kCFBooleanTrue);
+    }
+#endif
+
+    if (avctx->pix_fmt != AV_PIX_FMT_VIDEOTOOLBOX) {
+        status = create_cv_pixel_buffer_info(avctx, &pixel_buffer_info);
+        if (status)
+            goto init_cleanup;
+    }
+
+    pthread_mutex_init(&vtctx->lock, NULL);
+    pthread_cond_init(&vtctx->cv_sample_sent, NULL);
+    vtctx->dts_delta = vtctx->has_b_frames ? -1 : 0;
+
+    get_cv_transfer_function(avctx, &vtctx->transfer_function, &gamma_level);
+    get_cv_ycbcr_matrix(avctx, &vtctx->ycbcr_matrix);
+    get_cv_color_primaries(avctx, &vtctx->color_primaries);
+
+    if (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) {
+        status = vtenc_populate_extradata(avctx,
+                                          codec_type,
+                                          profile_level,
+                                          gamma_level,
+                                          enc_info,
+                                          pixel_buffer_info);
+        if (status)
+            goto init_cleanup;
+    }
+
+    status = vtenc_create_encoder(avctx,
+                                  codec_type,
+                                  profile_level,
+                                  gamma_level,
+                                  enc_info,
+                                  pixel_buffer_info,
+                                  &vtctx->session);
+
+    if (status < 0)
+        goto init_cleanup;
+
+    status = VTSessionCopyProperty(vtctx->session,
+                                   kVTCompressionPropertyKey_AllowFrameReordering,
+                                   kCFAllocatorDefault,
+                                   &has_b_frames_cfbool);
+
+    if (!status && has_b_frames_cfbool) {
+        //Some devices don't output B-frames for main profile, even if requested.
+        vtctx->has_b_frames = CFBooleanGetValue(has_b_frames_cfbool);
+        CFRelease(has_b_frames_cfbool);
+    }
+    avctx->has_b_frames = vtctx->has_b_frames;
+    status = 0; // failing to query the reordering property is not an init error
+
+init_cleanup:
+    if (gamma_level)
+        CFRelease(gamma_level);
+
+    if (pixel_buffer_info)
+        CFRelease(pixel_buffer_info);
+
+    CFRelease(enc_info);
+
+    return status;
+}
+
+/*
+ * Reports through is_key_frame whether buffer is a sync sample. A sample is
+ * treated as a key frame unless its first attachment dictionary carries
+ * kCMSampleAttachmentKey_NotSync with a true value.
+ */
+static void vtenc_get_frame_info(CMSampleBufferRef buffer, bool *is_key_frame)
+{
+    CFArrayRef      attach_array;
+    CFDictionaryRef first_attach;
+    CFBooleanRef    not_sync_flag;
+
+    // Default answer; only an explicit NotSync attachment overrides it.
+    *is_key_frame = true;
+
+    attach_array = CMSampleBufferGetSampleAttachmentsArray(buffer, false);
+    if (!attach_array || CFArrayGetCount(attach_array) == 0)
+        return;
+
+    first_attach = CFArrayGetValueAtIndex(attach_array, 0);
+    if (!CFDictionaryGetValueIfPresent(first_attach,
+                                       kCMSampleAttachmentKey_NotSync,
+                                       (const void **)&not_sync_flag))
+        return;
+
+    *is_key_frame = !CFBooleanGetValue(not_sync_flag);
+}
+
+/**
+ * Copies the NAL units of sample_buffer and replaces their length codes
+ * with H.264 Annex B start codes. On failure, the contents of
+ * dst_data may have been modified.
+ *
+ * @param avctx            Used for logging only.
+ * @param length_code_size Byte length of each length code (at most 4).
+ * @param sample_buffer    Sample whose data buffer holds NAL units
+ *                         prefixed with big-endian length codes.
+ * @param dst_data Must be zeroed before calling this function.
+ *                 Contains the copied NAL units prefixed with start
+ *                 codes when the function returns successfully.
+ * @param dst_size Length of dst_data
+ * @return 0 on success
+ *         AVERROR_INVALIDDATA if length_code_size is invalid
+ *         AVERROR_BUFFER_TOO_SMALL if dst_data is too small or a length
+ *         code specifies data beyond the end of the sample
+ *         AVERROR_EXTERNAL if the block buffer cannot be read
+ */
+static int copy_replace_length_codes(
+    AVCodecContext *avctx,
+    size_t        length_code_size,
+    CMSampleBufferRef sample_buffer,
+    uint8_t       *dst_data,
+    size_t        dst_size)
+{
+    size_t src_size = CMSampleBufferGetTotalSampleSize(sample_buffer);
+    size_t remaining_src_size = src_size;
+    size_t remaining_dst_size = dst_size;
+    size_t src_offset = 0;
+    int status;
+    uint8_t size_buf[4];
+    CMBlockBufferRef block = CMSampleBufferGetDataBuffer(sample_buffer);
+
+    if (length_code_size > 4) { // size_buf holds at most 4 bytes
+        return AVERROR_INVALIDDATA;
+    }
+
+    while (remaining_src_size > 0) { // one NAL unit per iteration
+        size_t curr_src_len;
+        size_t curr_dst_len;
+        size_t box_len = 0;
+        size_t i;
+
+        uint8_t       *dst_box;
+
+        status = CMBlockBufferCopyDataBytes(block,
+                                            src_offset,
+                                            length_code_size,
+                                            size_buf);
+        if (status) {
+            av_log(avctx, AV_LOG_ERROR, "Cannot copy length: %d\n", status);
+            return AVERROR_EXTERNAL;
+        }
+
+        for (i = 0; i < length_code_size; i++) { // most significant byte first
+            box_len <<= 8;
+            box_len |= size_buf[i];
+        }
+
+        curr_src_len = box_len + length_code_size;   // bytes consumed from the sample
+        curr_dst_len = box_len + sizeof(start_code); // bytes produced in dst_data
+
+        if (remaining_src_size < curr_src_len) {
+            return AVERROR_BUFFER_TOO_SMALL;
+        }
+
+        if (remaining_dst_size < curr_dst_len) {
+            return AVERROR_BUFFER_TOO_SMALL;
+        }
+
+        dst_box = dst_data + sizeof(start_code); // payload goes right after the start code
+
+        memcpy(dst_data, start_code, sizeof(start_code));
+        status = CMBlockBufferCopyDataBytes(block,
+                                            src_offset + length_code_size,
+                                            box_len,
+                                            dst_box);
+
+        if (status) {
+            av_log(avctx, AV_LOG_ERROR, "Cannot copy data: %d\n", status);
+            return AVERROR_EXTERNAL;
+        }
+
+        src_offset += curr_src_len;
+        dst_data += curr_dst_len;
+
+        remaining_src_size -= curr_src_len;
+        remaining_dst_size -= curr_dst_len;
+    }
+
+    return 0;
+}
+
+/**
+ * Converts an encoded sample into an AVPacket: optionally prepends the
+ * parameter sets, rewrites length codes as Annex B start codes and fills
+ * in the key-frame flag and timestamps.
+ * @return 0 on success, an error code otherwise.
+ */
+static int vtenc_cm_to_avpacket(
+    AVCodecContext    *avctx,
+    CMSampleBufferRef sample_buffer,
+    AVPacket          *pkt)
+{
+    VTEncContext *vtctx = avctx->priv_data;
+
+    int     status;
+    bool    is_key_frame;
+    bool    add_header;
+    size_t  length_code_size;
+    size_t  header_size = 0;
+    size_t  in_buf_size;
+    size_t  out_buf_size;
+    int64_t dts_delta;
+    int64_t time_base_num;
+    int nalu_count;
+    CMTime  pts;
+    CMTime  dts;
+    CMVideoFormatDescriptionRef vid_fmt = NULL;
+
+    vtenc_get_frame_info(sample_buffer, &is_key_frame);
+    status = get_length_code_size(avctx, sample_buffer, &length_code_size);
+    if (status) return status;
+
+    add_header = is_key_frame && !(avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER);
+
+    if (add_header) {
+        vid_fmt = CMSampleBufferGetFormatDescription(sample_buffer);
+        if (!vid_fmt) {
+            av_log(avctx, AV_LOG_ERROR, "Cannot get format description.\n");
+            return AVERROR_EXTERNAL;
+        }
+
+        status = get_params_size(avctx, vid_fmt, &header_size);
+        if (status) return status;
+    }
+
+    status = count_nalus(length_code_size, sample_buffer, &nalu_count);
+    if (status) return status;
+
+    // Every NAL unit trades its length code for a start code.
+    in_buf_size = CMSampleBufferGetTotalSampleSize(sample_buffer);
+    out_buf_size = header_size +
+                   in_buf_size +
+                   nalu_count * ((int)sizeof(start_code) - (int)length_code_size);
+
+    status = ff_alloc_packet2(avctx, pkt, out_buf_size, out_buf_size);
+    if (status < 0)
+        return status;
+
+    if (add_header) {
+        status = copy_param_sets(avctx, vid_fmt, pkt->data, out_buf_size);
+        if(status) return status;
+    }
+
+    status = copy_replace_length_codes(avctx, length_code_size, sample_buffer,
+                                       pkt->data + header_size,
+                                       pkt->size - header_size);
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "Error copying packet data: %d\n", status);
+        return status;
+    }
+
+    if (is_key_frame) {
+        pkt->flags |= AV_PKT_FLAG_KEY;
+    }
+
+    pts = CMSampleBufferGetPresentationTimeStamp(sample_buffer);
+    dts = CMSampleBufferGetDecodeTimeStamp      (sample_buffer);
+
+    if (CMTIME_IS_INVALID(dts)) {
+        if (!vtctx->has_b_frames) {
+            dts = pts;
+        } else {
+            av_log(avctx, AV_LOG_ERROR, "DTS is invalid.\n");
+            return AVERROR_EXTERNAL;
+        }
+    }
+
+    dts_delta = vtctx->dts_delta >= 0 ? vtctx->dts_delta : 0;
+    time_base_num = avctx->time_base.num;
+    pkt->pts = pts.value / time_base_num;
+    pkt->dts = dts.value / time_base_num - dts_delta;
+    pkt->size = out_buf_size;
+
+    return 0;
+}
+
+/*
+ * Looks up the CoreVideo pixel format for frame and fills in the width, height
+ * and stride of each plane (NV12 and YUV420P only).
+ * contiguous_buf_size is 0 if not contiguous, and the size of the buffer
+ * containing all planes if so.
+ */
+static int get_cv_pixel_info(
+    AVCodecContext *avctx,
+    const AVFrame  *frame,
+    int            *color,
+    int            *plane_count,
+    size_t         *widths,
+    size_t         *heights,
+    size_t         *strides,
+    size_t         *contiguous_buf_size)
+{
+    VTEncContext *vtctx = avctx->priv_data;
+    int av_format       = frame->format;
+    int av_color_range  = av_frame_get_color_range(frame);
+    int i;
+    int range_guessed;
+    int status;
+
+    status = get_cv_pixel_format(avctx, av_format, av_color_range, color, &range_guessed);
+    if (status) {
+        av_log(avctx,
+            AV_LOG_ERROR,
+            "Could not get pixel format for color format '%s' range '%s'.\n",
+            av_get_pix_fmt_name(av_format),
+            av_color_range > AVCOL_RANGE_UNSPECIFIED &&
+            av_color_range < AVCOL_RANGE_NB ?
+               av_color_range_name(av_color_range) :
+               "Unknown");
+
+        return AVERROR(EINVAL);
+    }
+
+    if (range_guessed) {
+        if (!vtctx->warned_color_range) { // warn once per encoder, not per frame
+            vtctx->warned_color_range = true;
+            av_log(avctx,
+                   AV_LOG_WARNING,
+                   "Color range not set for %s. Using MPEG range.\n",
+                   av_get_pix_fmt_name(av_format));
+        }
+    }
+
+    switch (av_format) {
+    case AV_PIX_FMT_NV12:
+        *plane_count = 2;
+
+        widths [0] = avctx->width;
+        heights[0] = avctx->height;
+        strides[0] = frame->linesize[0];
+
+        widths [1] = (avctx->width  + 1) / 2;
+        heights[1] = (avctx->height + 1) / 2;
+        strides[1] = frame->linesize[1];
+        break;
+
+    case AV_PIX_FMT_YUV420P:
+        *plane_count = 3;
+
+        widths [0] = avctx->width;
+        heights[0] = avctx->height;
+        strides[0] = frame->linesize[0];
+
+        widths [1] = (avctx->width  + 1) / 2;
+        heights[1] = (avctx->height + 1) / 2;
+        strides[1] = frame->linesize[1];
+
+        widths [2] = (avctx->width  + 1) / 2;
+        heights[2] = (avctx->height + 1) / 2;
+        strides[2] = frame->linesize[2];
+        break;
+
+    default:
+        av_log(
+               avctx,
+               AV_LOG_ERROR,
+               "Could not get frame format info for color %d range %d.\n",
+               av_format,
+               av_color_range);
+
+        return AVERROR(EINVAL);
+    }
+
+    *contiguous_buf_size = 0;
+    for (i = 0; i < *plane_count; i++) {
+        if (i < *plane_count - 1 &&
+            frame->data[i] + strides[i] * heights[i] != frame->data[i + 1]) {
+            *contiguous_buf_size = 0;
+            break;
+        }
+
+        *contiguous_buf_size += strides[i] * heights[i];
+    }
+
+    return 0;
+}
+
+#if !TARGET_OS_IPHONE
+//Not used on iOS - frame is always copied.
+//Release callback: frees the AVFrame passed as release_ctx.
+static void free_avframe(void       *release_ctx,
+                         const void *data,
+                         size_t      size,
+                         size_t      plane_count,
+                         const void *plane_addresses[])
+{
+    AVFrame *owned_frame = release_ctx;
+    av_frame_free(&owned_frame);
+}
+#else
+//Not used on OSX - frame is never copied.
+/**
+ * Copies the planes of frame into cv_img, row by row when the source and
+ * destination strides differ.
+ * @param plane_strides Source stride in bytes of each plane.
+ * @param plane_rows    Number of rows of each plane.
+ * @return 0 on success, AVERROR_EXTERNAL on failure.
+ */
+static int copy_avframe_to_pixel_buffer(AVCodecContext   *avctx,
+                                        const AVFrame    *frame,
+                                        CVPixelBufferRef cv_img,
+                                        const size_t     *plane_strides,
+                                        const size_t     *plane_rows)
+{
+    int i, j;
+    size_t plane_count;
+    int status;
+    int rows;
+    int src_stride;
+    int dst_stride;
+    uint8_t *src_addr;
+    uint8_t *dst_addr;
+    size_t copy_bytes;
+
+    status = CVPixelBufferLockBaseAddress(cv_img, 0);
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Error: Could not lock base address of CVPixelBuffer: %d.\n", status);
+        return AVERROR_EXTERNAL; // do not touch the buffer without the lock
+    }
+
+    if (CVPixelBufferIsPlanar(cv_img)) {
+        plane_count = CVPixelBufferGetPlaneCount(cv_img);
+        for (i = 0; frame->data[i]; i++) {
+            if (i == plane_count) {
+                CVPixelBufferUnlockBaseAddress(cv_img, 0);
+                av_log(avctx, AV_LOG_ERROR,
+                       "Error: different number of planes in AVFrame and CVPixelBuffer.\n");
+
+                return AVERROR_EXTERNAL;
+            }
+
+            dst_addr = (uint8_t*)CVPixelBufferGetBaseAddressOfPlane(cv_img, i);
+            src_addr = (uint8_t*)frame->data[i];
+            dst_stride = CVPixelBufferGetBytesPerRowOfPlane(cv_img, i);
+            src_stride = plane_strides[i];
+            rows = plane_rows[i];
+
+            if (dst_stride == src_stride) {
+                memcpy(dst_addr, src_addr, src_stride * rows);
+            } else {
+                copy_bytes = dst_stride < src_stride ? dst_stride : src_stride;
+
+                for (j = 0; j < rows; j++) {
+                    memcpy(dst_addr + j * dst_stride, src_addr + j * src_stride, copy_bytes);
+                }
+            }
+        }
+    } else {
+        if (frame->data[1]) {
+            CVPixelBufferUnlockBaseAddress(cv_img, 0);
+            av_log(avctx, AV_LOG_ERROR,
+                   "Error: different number of planes in AVFrame and non-planar CVPixelBuffer.\n");
+
+            return AVERROR_EXTERNAL;
+        }
+
+        dst_addr = (uint8_t*)CVPixelBufferGetBaseAddress(cv_img);
+        src_addr = (uint8_t*)frame->data[0];
+        dst_stride = CVPixelBufferGetBytesPerRow(cv_img);
+        src_stride = plane_strides[0];
+        rows = plane_rows[0];
+
+        if (dst_stride == src_stride) {
+            memcpy(dst_addr, src_addr, src_stride * rows);
+        } else {
+            copy_bytes = dst_stride < src_stride ? dst_stride : src_stride;
+
+            for (j = 0; j < rows; j++) {
+                memcpy(dst_addr + j * dst_stride, src_addr + j * src_stride, copy_bytes);
+            }
+        }
+    }
+
+    status = CVPixelBufferUnlockBaseAddress(cv_img, 0);
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "Error: Could not unlock CVPixelBuffer base address: %d.\n", status);
+        return AVERROR_EXTERNAL;
+    }
+
+    return 0;
+}
+#endif //!TARGET_OS_IPHONE
+
+/**
+ * Produces the CVPixelBuffer to encode for frame and stores it in *cv_img.
+ * AV_PIX_FMT_VIDEOTOOLBOX input retains the buffer found in frame->data[3].
+ * Otherwise the frame is copied into a pool buffer (iOS) or wrapped in place
+ * through a new frame reference that free_avframe() releases (elsewhere).
+ *
+ * @return 0 on success, an AVERROR code otherwise.
+ */
+static int create_cv_pixel_buffer(AVCodecContext   *avctx,
+                                  const AVFrame    *frame,
+                                  CVPixelBufferRef *cv_img)
+{
+    int plane_count;
+    int color;
+    size_t widths [AV_NUM_DATA_POINTERS];
+    size_t heights[AV_NUM_DATA_POINTERS];
+    size_t strides[AV_NUM_DATA_POINTERS];
+    int status;
+    size_t contiguous_buf_size;
+#if TARGET_OS_IPHONE
+    CVPixelBufferPoolRef pix_buf_pool;
+    VTEncContext* vtctx = avctx->priv_data;
+#else
+    // Created only once the pixel buffer exists, so early returns cannot leak it.
+    CFMutableDictionaryRef pix_buf_attachments;
+    AVFrame *enc_frame;
+#endif
+
+    if (avctx->pix_fmt == AV_PIX_FMT_VIDEOTOOLBOX) {
+        av_assert0(frame->format == AV_PIX_FMT_VIDEOTOOLBOX);
+
+        *cv_img = (CVPixelBufferRef)frame->data[3];
+        av_assert0(*cv_img);
+
+        CFRetain(*cv_img);
+        return 0;
+    }
+
+    memset(widths,  0, sizeof(widths));
+    memset(heights, 0, sizeof(heights));
+    memset(strides, 0, sizeof(strides));
+
+    status = get_cv_pixel_info(avctx, frame, &color, &plane_count,
+                               widths, heights, strides, &contiguous_buf_size);
+
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Error: Cannot convert format %d color_range %d: %d\n",
+               frame->format, av_frame_get_color_range(frame), status);
+
+        return AVERROR_EXTERNAL;
+    }
+
+#if TARGET_OS_IPHONE
+    pix_buf_pool = VTCompressionSessionGetPixelBufferPool(vtctx->session);
+    if (!pix_buf_pool) {
+        av_log(avctx, AV_LOG_ERROR, "Could not get pixel buffer pool.\n");
+        return AVERROR_EXTERNAL;
+    }
+
+    status = CVPixelBufferPoolCreatePixelBuffer(NULL,
+                                                pix_buf_pool,
+                                                cv_img);
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "Could not create pixel buffer from pool: %d.\n", status);
+        return AVERROR_EXTERNAL;
+    }
+
+    status = copy_avframe_to_pixel_buffer(avctx, frame, *cv_img, strides, heights);
+    if (status) {
+        CFRelease(*cv_img);
+        *cv_img = NULL;
+        return status;
+    }
+#else
+    enc_frame = av_frame_alloc();
+    if (!enc_frame) return AVERROR(ENOMEM);
+
+    status = av_frame_ref(enc_frame, frame);
+    if (status) {
+        av_frame_free(&enc_frame);
+        return status;
+    }
+
+    status = CVPixelBufferCreateWithPlanarBytes(
+        kCFAllocatorDefault,
+        enc_frame->width,
+        enc_frame->height,
+        color,
+        NULL,
+        contiguous_buf_size,
+        plane_count,
+        (void **)enc_frame->data,
+        widths,
+        heights,
+        strides,
+        free_avframe,
+        enc_frame,
+        NULL,
+        cv_img
+    );
+
+    if (status) {
+        // NOTE(review): enc_frame is not freed here, as in the original; confirm
+        // whether the release callback runs when creation fails.
+        av_log(avctx, AV_LOG_ERROR, "Error: Could not create CVPixelBuffer: %d\n", status);
+        return AVERROR_EXTERNAL;
+    }
+
+    pix_buf_attachments = CFDictionaryCreateMutable(kCFAllocatorDefault, 10,
+                                                    &kCFCopyStringDictionaryKeyCallBacks,
+                                                    &kCFTypeDictionaryValueCallBacks);
+    if (!pix_buf_attachments) {
+        CFRelease(*cv_img); // destroying the buffer is what invokes free_avframe(enc_frame)
+        *cv_img = NULL;
+        return AVERROR(ENOMEM);
+    }
+
+    add_color_attr(avctx, pix_buf_attachments);
+    CVBufferSetAttachments(*cv_img, pix_buf_attachments, kCVAttachmentMode_ShouldPropagate);
+    CFRelease(pix_buf_attachments);
+#endif
+
+    return 0;
+}
+
+/* Per-frame options: force a key frame for I pictures, else *dict_out = NULL. */
+static int create_encoder_dict_h264(const AVFrame *frame,
+                                    CFDictionaryRef* dict_out)
+{
+    const void *opt_keys[] = { kVTEncodeFrameOptionKey_ForceKeyFrame };
+    const void *opt_vals[] = { kCFBooleanTrue };
+    CFDictionaryRef options = NULL;
+
+    if (frame->pict_type == AV_PICTURE_TYPE_I) {
+        options = CFDictionaryCreate(NULL, opt_keys, opt_vals, 1, NULL, NULL);
+        if (!options) return AVERROR(ENOMEM);
+    }
+    *dict_out = options;
+    return 0;
+}
+
+/* Obtains a CVPixelBuffer for frame and submits it to the compression
+ * session, using frame->pts * time_base.num over time_base.den as its
+ * presentation time. Returns 0 on success, an error code otherwise. */
+static int vtenc_send_frame(AVCodecContext *avctx,
+                            VTEncContext   *vtctx,
+                            const AVFrame  *frame)
+{
+    CVPixelBufferRef pixbuf = NULL;
+    CFDictionaryRef  enc_opts;
+    CMTime           pres_time;
+    int              ret;
+
+    ret = create_cv_pixel_buffer(avctx, frame, &pixbuf);
+    if (ret)
+        return ret;
+
+    ret = create_encoder_dict_h264(frame, &enc_opts);
+    if (ret) {
+        CFRelease(pixbuf);
+        return ret;
+    }
+
+    pres_time = CMTimeMake(frame->pts * avctx->time_base.num,
+                           avctx->time_base.den);
+    ret = VTCompressionSessionEncodeFrame(vtctx->session, pixbuf, pres_time,
+                                          kCMTimeInvalid, enc_opts, NULL, NULL);
+
+    // Both objects are released here regardless of the encode result.
+    if (enc_opts)
+        CFRelease(enc_opts);
+    CFRelease(pixbuf);
+
+    if (!ret)
+        return 0;
+
+    av_log(avctx, AV_LOG_ERROR, "Error: cannot encode frame: %d\n", ret);
+    return AVERROR_EXTERNAL;
+}
+
+/**
+ * Sends frame to the encoder, or starts flushing when frame is NULL, and
+ * returns at most one finished packet in pkt.
+ * @param got_packet Set to 1 only when pkt holds a packet, 0 otherwise.
+ * @return 0 on success (with or without a packet), an error code otherwise.
+ */
+static int vtenc_frame(AVCodecContext *avctx, AVPacket *pkt,
+                       const AVFrame *frame, int *got_packet)
+{
+    VTEncContext *vtctx = avctx->priv_data;
+    bool get_frame;
+    int status;
+    CMSampleBufferRef buf = NULL;
+
+    *got_packet = 0; // holds on every early exit as well
+
+    if (frame) {
+        status = vtenc_send_frame(avctx, vtctx, frame);
+        if (status)
+            goto end_nopkt; // keep the specific error code
+
+        if (vtctx->frame_ct_in == 0) {
+            vtctx->first_pts = frame->pts;
+        } else if(vtctx->frame_ct_in == 1 && vtctx->has_b_frames) {
+            vtctx->dts_delta = frame->pts - vtctx->first_pts;
+        }
+
+        vtctx->frame_ct_in++;
+    } else if(!vtctx->flushing) {
+        vtctx->flushing = true;
+
+        status = VTCompressionSessionCompleteFrames(vtctx->session, kCMTimeIndefinite);
+        if (status) {
+            av_log(avctx, AV_LOG_ERROR, "Error flushing frames: %d\n", status);
+            status = AVERROR_EXTERNAL;
+            goto end_nopkt;
+        }
+    }
+
+    // While dts_delta is still negative no packet is fetched unless flushing.
+    get_frame = vtctx->dts_delta >= 0 || !frame;
+    if (!get_frame) {
+        status = 0;
+        goto end_nopkt;
+    }
+
+    status = vtenc_q_pop(vtctx, !frame, &buf);
+    if (status) goto end_nopkt;
+    if (!buf)   goto end_nopkt;
+
+    status = vtenc_cm_to_avpacket(avctx, buf, pkt);
+    CFRelease(buf);
+    if (status) goto end_nopkt;
+
+    *got_packet = 1;
+    return 0;
+
+end_nopkt:
+    av_packet_unref(pkt);
+    return status;
+}
+
+/**
+ * Encodes one dummy frame with a temporary session so that the parameter
+ * sets become available as extradata; the session is released afterwards.
+ * @return 0 on success, an error code otherwise.
+ */
+static int vtenc_populate_extradata(AVCodecContext   *avctx,
+                                    CMVideoCodecType codec_type,
+                                    CFStringRef      profile_level,
+                                    CFNumberRef      gamma_level,
+                                    CFDictionaryRef  enc_info,
+                                    CFDictionaryRef  pixel_buffer_info)
+{
+    VTEncContext *vtctx = avctx->priv_data;
+    AVFrame *frame = av_frame_alloc();
+    // Chroma dimensions round up, like the plane sizes in get_cv_pixel_info().
+    int chroma_w = (avctx->width  + 1) / 2;
+    int chroma_h = (avctx->height + 1) / 2;
+    int y_size = avctx->width * avctx->height;
+    int chroma_size = chroma_w * chroma_h;
+    CMSampleBufferRef buf = NULL;
+    int status;
+
+    if (!frame) return AVERROR(ENOMEM);
+
+    frame->buf[0] = av_buffer_alloc(y_size + 2 * chroma_size);
+    if(!frame->buf[0]){
+        status = AVERROR(ENOMEM);
+        goto pe_cleanup;
+    }
+
+    status = vtenc_create_encoder(avctx,
+                                  codec_type,
+                                  profile_level,
+                                  gamma_level,
+                                  enc_info,
+                                  pixel_buffer_info,
+                                  &vtctx->session);
+    if (status)
+        goto pe_cleanup;
+
+    frame->data[0]     = frame->buf[0]->data;
+    frame->linesize[0] = avctx->width;
+    memset(frame->data[0], 0, y_size);
+
+    frame->data[1] = frame->buf[0]->data + y_size;
+    if (avctx->pix_fmt == AV_PIX_FMT_YUV420P) {
+        frame->data[2]     = frame->data[1] + chroma_size;
+        frame->linesize[1] =
+        frame->linesize[2] = chroma_w;
+        memset(frame->data[1], 128, chroma_size);
+        memset(frame->data[2], 128, chroma_size);
+    } else {
+        // Any other format is laid out as NV12: one plane of interleaved chroma.
+        frame->linesize[1] = 2 * chroma_w;
+        memset(frame->data[1], 128, 2 * chroma_size);
+    }
+
+    frame->format          = avctx->pix_fmt;
+    frame->width           = avctx->width;
+    frame->height          = avctx->height;
+    av_frame_set_colorspace(frame, avctx->colorspace);
+    av_frame_set_color_range(frame, avctx->color_range);
+    frame->color_trc       = avctx->color_trc;
+    frame->color_primaries = avctx->color_primaries;
+
+    frame->pts = 0;
+    status = vtenc_send_frame(avctx, vtctx, frame);
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "Error sending frame: %d\n", status);
+        goto pe_cleanup;
+    }
+
+    //Populates extradata - output frames are flushed and param sets are available.
+    status = VTCompressionSessionCompleteFrames(vtctx->session, kCMTimeIndefinite);
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "Error flushing frames: %d\n", status);
+        status = AVERROR_EXTERNAL;
+        goto pe_cleanup;
+    }
+
+    status = vtenc_q_pop(vtctx, 0, &buf);
+    if (status) {
+        av_log(avctx, AV_LOG_ERROR, "popping: %d\n", status);
+        goto pe_cleanup;
+    }
+
+    if (buf) CFRelease(buf); // CFRelease must not be handed NULL
+
+pe_cleanup:
+    if(vtctx->session)
+        CFRelease(vtctx->session);
+
+    vtctx->session = NULL;
+    vtctx->frame_ct_out = 0;
+
+    av_frame_unref(frame);
+    av_frame_free(&frame);
+
+    av_assert0(status != 0 || (avctx->extradata && avctx->extradata_size > 0));
+    return status;
+}
+
+/**
+ * Release the session (with its lock/condition variable) if one exists, and
+ * always drop the cached colour objects so they are not leaked without a session.
+ */
+static av_cold int vtenc_close(AVCodecContext *avctx)
+{
+    VTEncContext *vtctx = avctx->priv_data;
+
+    if (vtctx->session) { /* as before, lock/cond are only destroyed together with a session */
+        pthread_cond_destroy(&vtctx->cv_sample_sent);
+        pthread_mutex_destroy(&vtctx->lock);
+        CFRelease(vtctx->session);
+        vtctx->session = NULL;
+    }
+    if (vtctx->color_primaries) {
+        CFRelease(vtctx->color_primaries);
+        vtctx->color_primaries = NULL;
+    }
+    if (vtctx->transfer_function) {
+        CFRelease(vtctx->transfer_function);
+        vtctx->transfer_function = NULL;
+    }
+    if (vtctx->ycbcr_matrix) {
+        CFRelease(vtctx->ycbcr_matrix);
+        vtctx->ycbcr_matrix = NULL;
+    }
+    return 0;
+}
+
+static const enum AVPixelFormat pix_fmts[] = { /* exported through ff_h264_videotoolbox_encoder.pix_fmts */
+    AV_PIX_FMT_VIDEOTOOLBOX,
+    AV_PIX_FMT_NV12,
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_NONE /* last entry of the list */
+};
+
+#define OFFSET(x) offsetof(VTEncContext, x) /* byte offset of option field x inside VTEncContext */
+#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM /* flag set used by every option below */
+static const AVOption options[] = { /* NOTE(review): "profile" accepts H264_PROF_COUNT itself as maximum - confirm it is not one past the last valid profile */
+    { "profile", "Profile", OFFSET(profile), AV_OPT_TYPE_INT, { .i64 = H264_PROF_AUTO }, H264_PROF_AUTO, H264_PROF_COUNT, VE, "profile" },
+    { "baseline", "Baseline Profile", 0, AV_OPT_TYPE_CONST, { .i64 = H264_PROF_BASELINE }, INT_MIN, INT_MAX, VE, "profile" },
+    { "main",     "Main Profile",     0, AV_OPT_TYPE_CONST, { .i64 = H264_PROF_MAIN     }, INT_MIN, INT_MAX, VE, "profile" },
+    { "high",     "High Profile",     0, AV_OPT_TYPE_CONST, { .i64 = H264_PROF_HIGH     }, INT_MIN, INT_MAX, VE, "profile" },
+    /* "level" is stored as major * 10 + minor (e.g. 31 for "3.1"); default 0, accepted range 0..52 */
+    { "level", "Level", OFFSET(level), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 52, VE, "level" },
+    { "1.3", "Level 1.3, only available with Baseline Profile", 0, AV_OPT_TYPE_CONST, { .i64 = 13 }, INT_MIN, INT_MAX, VE, "level" },
+    { "3.0", "Level 3.0", 0, AV_OPT_TYPE_CONST, { .i64 = 30 }, INT_MIN, INT_MAX, VE, "level" },
+    { "3.1", "Level 3.1", 0, AV_OPT_TYPE_CONST, { .i64 = 31 }, INT_MIN, INT_MAX, VE, "level" },
+    { "3.2", "Level 3.2", 0, AV_OPT_TYPE_CONST, { .i64 = 32 }, INT_MIN, INT_MAX, VE, "level" },
+    { "4.0", "Level 4.0", 0, AV_OPT_TYPE_CONST, { .i64 = 40 }, INT_MIN, INT_MAX, VE, "level" },
+    { "4.1", "Level 4.1", 0, AV_OPT_TYPE_CONST, { .i64 = 41 }, INT_MIN, INT_MAX, VE, "level" },
+    { "4.2", "Level 4.2", 0, AV_OPT_TYPE_CONST, { .i64 = 42 }, INT_MIN, INT_MAX, VE, "level" },
+    { "5.0", "Level 5.0", 0, AV_OPT_TYPE_CONST, { .i64 = 50 }, INT_MIN, INT_MAX, VE, "level" },
+    { "5.1", "Level 5.1", 0, AV_OPT_TYPE_CONST, { .i64 = 51 }, INT_MIN, INT_MAX, VE, "level" },
+    { "5.2", "Level 5.2", 0, AV_OPT_TYPE_CONST, { .i64 = 52 }, INT_MIN, INT_MAX, VE, "level" },
+    /* Boolean switch, off (0) by default. */
+    { "allow_sw", "Allow software encoding", OFFSET(allow_sw), AV_OPT_TYPE_BOOL,
+        { .i64 = 0 }, 0, 1, VE },
+    /* "coder": each entropy mode has two accepted spellings (cavlc/vlc and cabac/ac). */
+    { "coder", "Entropy coding", OFFSET(entropy), AV_OPT_TYPE_INT, { .i64 = VT_ENTROPY_NOT_SET }, VT_ENTROPY_NOT_SET, VT_CABAC, VE, "coder" },
+    { "cavlc", "CAVLC entropy coding", 0, AV_OPT_TYPE_CONST, { .i64 = VT_CAVLC }, INT_MIN, INT_MAX, VE, "coder" },
+    { "vlc",   "CAVLC entropy coding", 0, AV_OPT_TYPE_CONST, { .i64 = VT_CAVLC }, INT_MIN, INT_MAX, VE, "coder" },
+    { "cabac", "CABAC entropy coding", 0, AV_OPT_TYPE_CONST, { .i64 = VT_CABAC }, INT_MIN, INT_MAX, VE, "coder" },
+    { "ac",    "CABAC entropy coding", 0, AV_OPT_TYPE_CONST, { .i64 = VT_CABAC }, INT_MIN, INT_MAX, VE, "coder" },
+
+    { "realtime", "Hint that encoding should happen in real-time if not faster (e.g. capturing from camera).",
+        OFFSET(realtime), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
+
+    { "frames_before", "Other frames will come before the frames in this session. This helps smooth concatenation issues.",
+        OFFSET(frames_before), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
+    { "frames_after", "Other frames will come after the frames in this session. This helps smooth concatenation issues.",
+        OFFSET(frames_after), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
+
+    { NULL },
+};
+
+static const AVClass h264_videotoolbox_class = { /* referenced as .priv_class by the encoder descriptor */
+    .class_name = "h264_videotoolbox",
+    .item_name  = av_default_item_name,
+    .option     = options, /* the private option table defined above */
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+AVCodec ff_h264_videotoolbox_encoder = { /* codec descriptor for the "h264_videotoolbox" encoder */
+    .name             = "h264_videotoolbox",
+    .long_name        = NULL_IF_CONFIG_SMALL("VideoToolbox H.264 Encoder"),
+    .type             = AVMEDIA_TYPE_VIDEO,
+    .id               = AV_CODEC_ID_H264,
+    .priv_data_size   = sizeof(VTEncContext), /* per-instance state, read back through avctx->priv_data */
+    .pix_fmts         = pix_fmts,
+    .init             = vtenc_init,
+    .encode2          = vtenc_frame,
+    .close            = vtenc_close,
+    .capabilities     = AV_CODEC_CAP_DELAY, /* NOTE(review): presumably because vtenc_frame can return without a packet until flushed - confirm */
+    .priv_class       = &h264_videotoolbox_class, /* exposes the options[] table */
+    .caps_internal    = FF_CODEC_CAP_INIT_THREADSAFE |
+                        FF_CODEC_CAP_INIT_CLEANUP, /* NOTE(review): presumably lets vtenc_close run after a failed vtenc_init - confirm */
+};
diff --git a/libavcodec/vima.c b/libavcodec/vima.c
index 6f539a8..b4620ac 100644
--- a/libavcodec/vima.c
+++ b/libavcodec/vima.c
@@ -2,20 +2,20 @@
  * LucasArts VIMA decoder
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vmdaudio.c b/libavcodec/vmdaudio.c
index 9e02ba7..e8c8a06 100644
--- a/libavcodec/vmdaudio.c
+++ b/libavcodec/vmdaudio.c
@@ -1,20 +1,21 @@
 /*
  * Sierra VMD audio decoder
+ * Copyright (c) 2004 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,6 +35,7 @@
 
 #include <string.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/channel_layout.h"
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
@@ -74,7 +76,7 @@ static av_cold int vmdaudio_decode_init(AVCodecContext *avctx)
         av_log(avctx, AV_LOG_ERROR, "invalid number of channels\n");
         return AVERROR(EINVAL);
     }
-    if (avctx->block_align < 1) {
+    if (avctx->block_align < 1 || avctx->block_align % avctx->channels) {
         av_log(avctx, AV_LOG_ERROR, "invalid block align\n");
         return AVERROR(EINVAL);
     }
@@ -180,17 +182,16 @@ static int vmdaudio_decode_frame(AVCodecContext *avctx, void *data,
     /* get output buffer */
     frame->nb_samples = ((silent_chunks + audio_chunks) * avctx->block_align) /
                         avctx->channels;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     output_samples_u8  =            frame->data[0];
     output_samples_s16 = (int16_t *)frame->data[0];
 
     /* decode silent chunks */
     if (silent_chunks > 0) {
-        int silent_size = FFMIN(avctx->block_align * silent_chunks,
-                                frame->nb_samples * avctx->channels);
+        int silent_size = avctx->block_align * silent_chunks;
+        av_assert0(avctx->block_align * silent_chunks <= frame->nb_samples * avctx->channels);
+
         if (s->out_bps == 2) {
             memset(output_samples_s16, 0x00, silent_size * 2);
             output_samples_s16 += silent_size;
@@ -202,8 +203,9 @@ static int vmdaudio_decode_frame(AVCodecContext *avctx, void *data,
 
     /* decode audio chunks */
     if (audio_chunks > 0) {
-        buf_end = buf + (buf_size & ~(avctx->channels > 1));
-        while (buf + s->chunk_size <= buf_end) {
+        buf_end = buf + buf_size;
+        av_assert0((buf_size & (avctx->channels > 1)) == 0);
+        while (buf_end - buf >= s->chunk_size) {
             if (s->out_bps == 2) {
                 decode_audio_s16(output_samples_s16, buf, s->chunk_size,
                                  avctx->channels);
diff --git a/libavcodec/vmdvideo.c b/libavcodec/vmdvideo.c
index 2e91c06..b97032f 100644
--- a/libavcodec/vmdvideo.c
+++ b/libavcodec/vmdvideo.c
@@ -1,20 +1,21 @@
 /*
  * Sierra VMD video decoder
+ * Copyright (c) 2004 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -62,7 +63,7 @@ typedef struct VmdVideoContext {
 #define QUEUE_SIZE 0x1000
 #define QUEUE_MASK 0x0FFF
 
-static void lz_unpack(const unsigned char *src, int src_len,
+static int lz_unpack(const unsigned char *src, int src_len,
                       unsigned char *dest, int dest_len)
 {
     unsigned char *d;
@@ -83,9 +84,9 @@ static void lz_unpack(const unsigned char *src, int src_len,
     dataleft = bytestream2_get_le32(&gb);
     memset(queue, 0x20, QUEUE_SIZE);
     if (bytestream2_get_bytes_left(&gb) < 4)
-        return;
+        return AVERROR_INVALIDDATA;
     if (bytestream2_peek_le32(&gb) == 0x56781234) {
-        bytestream2_get_le32(&gb);
+        bytestream2_skipu(&gb, 4);
         qpos = 0x111;
         speclen = 0xF + 3;
     } else {
@@ -96,8 +97,8 @@ static void lz_unpack(const unsigned char *src, int src_len,
     while (dataleft > 0 && bytestream2_get_bytes_left(&gb) > 0) {
         tag = bytestream2_get_byteu(&gb);
         if ((tag == 0xFF) && (dataleft > 8)) {
-            if (d + 8 > d_end || bytestream2_get_bytes_left(&gb) < 8)
-                return;
+            if (d_end - d < 8 || bytestream2_get_bytes_left(&gb) < 8)
+                return AVERROR_INVALIDDATA;
             for (i = 0; i < 8; i++) {
                 queue[qpos++] = *d++ = bytestream2_get_byteu(&gb);
                 qpos &= QUEUE_MASK;
@@ -108,9 +109,9 @@ static void lz_unpack(const unsigned char *src, int src_len,
                 if (dataleft == 0)
                     break;
                 if (tag & 0x01) {
-                    if (d + 1 > d_end || bytestream2_get_bytes_left(&gb) < 1)
-                        return;
-                    queue[qpos++] = *d++ = bytestream2_get_byte(&gb);
+                    if (d_end - d < 1 || bytestream2_get_bytes_left(&gb) < 1)
+                        return AVERROR_INVALIDDATA;
+                    queue[qpos++] = *d++ = bytestream2_get_byteu(&gb);
                     qpos &= QUEUE_MASK;
                     dataleft--;
                 } else {
@@ -120,8 +121,8 @@ static void lz_unpack(const unsigned char *src, int src_len,
                     if (chainlen == speclen) {
                         chainlen = bytestream2_get_byte(&gb) + 0xF + 3;
                     }
-                    if (d + chainlen > d_end)
-                        return;
+                    if (d_end - d < chainlen)
+                        return AVERROR_INVALIDDATA;
                     for (j = 0; j < chainlen; j++) {
                         *d = queue[chainofs++ & QUEUE_MASK];
                         queue[qpos++] = *d++;
@@ -133,10 +134,10 @@ static void lz_unpack(const unsigned char *src, int src_len,
             }
         }
     }
+    return d - dest;
 }
-
 static int rle_unpack(const unsigned char *src, unsigned char *dest,
-    int src_count, int src_size, int dest_len)
+                      int src_count, int src_size, int dest_len)
 {
     unsigned char *pd;
     int i, l, used = 0;
@@ -159,12 +160,12 @@ static int rle_unpack(const unsigned char *src, unsigned char *dest,
         l = bytestream2_get_byteu(&gb);
         if (l & 0x80) {
             l = (l & 0x7F) * 2;
-            if (pd + l > dest_end || bytestream2_get_bytes_left(&gb) < l)
+            if (dest_end - pd < l || bytestream2_get_bytes_left(&gb) < l)
                 return bytestream2_tell(&gb);
-            bytestream2_get_buffer(&gb, pd, l);
+            bytestream2_get_bufferu(&gb, pd, l);
             pd += l;
         } else {
-            if (pd + l > dest_end || bytestream2_get_bytes_left(&gb) < 2)
+            if (dest_end - pd < 2*l || bytestream2_get_bytes_left(&gb) < 2)
                 return bytestream2_tell(&gb);
             run_val = bytestream2_get_ne16(&gb);
             for (i = 0; i < l; i++) {
@@ -200,6 +201,16 @@ static int vmd_decode(VmdVideoContext *s, AVFrame *frame)
     frame_y = AV_RL16(&s->buf[8]);
     frame_width = AV_RL16(&s->buf[10]) - frame_x + 1;
     frame_height = AV_RL16(&s->buf[12]) - frame_y + 1;
+
+    if ((frame_width == s->avctx->width && frame_height == s->avctx->height) &&
+        (frame_x || frame_y)) {
+
+        s->x_off = frame_x;
+        s->y_off = frame_y;
+    }
+    frame_x -= s->x_off;
+    frame_y -= s->y_off;
+
     if (frame_x < 0 || frame_width < 0 ||
         frame_x >= s->avctx->width ||
         frame_width > s->avctx->width ||
@@ -219,15 +230,6 @@ static int vmd_decode(VmdVideoContext *s, AVFrame *frame)
         return AVERROR_INVALIDDATA;
     }
 
-    if ((frame_width == s->avctx->width && frame_height == s->avctx->height) &&
-        (frame_x || frame_y)) {
-
-        s->x_off = frame_x;
-        s->y_off = frame_y;
-    }
-    frame_x -= s->x_off;
-    frame_y -= s->y_off;
-
     /* if only a certain region will be updated, copy the entire previous
      * frame before the decode */
     if (s->prev_frame->data[0] &&
@@ -248,13 +250,13 @@ static int vmd_decode(VmdVideoContext *s, AVFrame *frame)
                 r = bytestream2_get_byteu(&gb) * 4;
                 g = bytestream2_get_byteu(&gb) * 4;
                 b = bytestream2_get_byteu(&gb) * 4;
-                palette32[i] = (r << 16) | (g << 8) | (b);
+                palette32[i] = 0xFFU << 24 | (r << 16) | (g << 8) | (b);
+                palette32[i] |= palette32[i] >> 6 & 0x30303;
             }
         } else {
             av_log(s->avctx, AV_LOG_ERROR, "Incomplete palette\n");
             return AVERROR_INVALIDDATA;
         }
-        s->size -= PALETTE_COUNT * 3 + 2;
     }
 
     if (!s->size)
@@ -265,15 +267,18 @@ static int vmd_decode(VmdVideoContext *s, AVFrame *frame)
         return AVERROR_INVALIDDATA;
     meth = bytestream2_get_byteu(&gb);
     if (meth & 0x80) {
+        int size;
         if (!s->unpack_buffer_size) {
             av_log(s->avctx, AV_LOG_ERROR,
                    "Trying to unpack LZ-compressed frame with no LZ buffer\n");
             return AVERROR_INVALIDDATA;
         }
-        lz_unpack(gb.buffer, bytestream2_get_bytes_left(&gb),
-                  s->unpack_buffer, s->unpack_buffer_size);
+        size = lz_unpack(gb.buffer, bytestream2_get_bytes_left(&gb),
+                         s->unpack_buffer, s->unpack_buffer_size);
+        if (size < 0)
+            return size;
         meth &= 0x7F;
-        bytestream2_init(&gb, s->unpack_buffer, s->unpack_buffer_size);
+        bytestream2_init(&gb, s->unpack_buffer, size);
     }
 
     dp = &frame->data[0][frame_y * frame->linesize[0] + frame_x];
@@ -289,7 +294,7 @@ static int vmd_decode(VmdVideoContext *s, AVFrame *frame)
                     if (ofs + len > frame_width ||
                         bytestream2_get_bytes_left(&gb) < len)
                         return AVERROR_INVALIDDATA;
-                    bytestream2_get_buffer(&gb, &dp[ofs], len);
+                    bytestream2_get_bufferu(&gb, &dp[ofs], len);
                     ofs += len;
                 } else {
                     /* interframe pixel copy */
@@ -301,7 +306,7 @@ static int vmd_decode(VmdVideoContext *s, AVFrame *frame)
             } while (ofs < frame_width);
             if (ofs > frame_width) {
                 av_log(s->avctx, AV_LOG_ERROR,
-                       "VMD video: offset > width (%d > %d)\n",
+                       "offset > width (%d > %d)\n",
                        ofs, frame_width);
                 return AVERROR_INVALIDDATA;
             }
@@ -334,6 +339,9 @@ static int vmd_decode(VmdVideoContext *s, AVFrame *frame)
                         ofs += slen;
                         bytestream2_skip(&gb, len);
                     } else {
+                        if (ofs + len > frame_width ||
+                            bytestream2_get_bytes_left(&gb) < len)
+                            return AVERROR_INVALIDDATA;
                         bytestream2_get_buffer(&gb, &dp[ofs], len);
                         ofs += len;
                     }
@@ -347,7 +355,7 @@ static int vmd_decode(VmdVideoContext *s, AVFrame *frame)
             } while (ofs < frame_width);
             if (ofs > frame_width) {
                 av_log(s->avctx, AV_LOG_ERROR,
-                       "VMD video: offset > width (%d > %d)\n",
+                       "offset > width (%d > %d)\n",
                        ofs, frame_width);
                 return AVERROR_INVALIDDATA;
             }
@@ -364,7 +372,8 @@ static av_cold int vmdvideo_decode_end(AVCodecContext *avctx)
     VmdVideoContext *s = avctx->priv_data;
 
     av_frame_free(&s->prev_frame);
-    av_free(s->unpack_buffer);
+    av_freep(&s->unpack_buffer);
+    s->unpack_buffer_size = 0;
 
     return 0;
 }
@@ -384,9 +393,9 @@ static av_cold int vmdvideo_decode_init(AVCodecContext *avctx)
 
     /* make sure the VMD header made it */
     if (s->avctx->extradata_size != VMD_HEADER_SIZE) {
-        av_log(s->avctx, AV_LOG_ERROR, "VMD video: expected extradata size of %d\n",
+        av_log(s->avctx, AV_LOG_ERROR, "expected extradata size of %d\n",
             VMD_HEADER_SIZE);
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
     vmd_header = (unsigned char *)avctx->extradata;
 
@@ -404,7 +413,8 @@ static av_cold int vmdvideo_decode_init(AVCodecContext *avctx)
         r = raw_palette[palette_index++] * 4;
         g = raw_palette[palette_index++] * 4;
         b = raw_palette[palette_index++] * 4;
-        palette32[i] = (r << 16) | (g << 8) | (b);
+        palette32[i] = 0xFFU << 24 | (r << 16) | (g << 8) | (b);
+        palette32[i] |= palette32[i] >> 6 & 0x30303;
     }
 
     s->prev_frame = av_frame_alloc();
@@ -432,10 +442,8 @@ static int vmdvideo_decode_frame(AVCodecContext *avctx,
     if (buf_size < 16)
         return AVERROR_INVALIDDATA;
 
-    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "VMD Video: get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
         return ret;
-    }
 
     if ((ret = vmd_decode(s, frame)) < 0)
         return ret;
diff --git a/libavcodec/vmnc.c b/libavcodec/vmnc.c
index 3ef2134..49abb77 100644
--- a/libavcodec/vmnc.c
+++ b/libavcodec/vmnc.c
@@ -2,20 +2,20 @@
  * VMware Screen Codec (VMnc) decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -291,6 +291,11 @@ static int decode_hextile(VmncContext *c, uint8_t* dst, GetByteContext *gb,
                         fg = vmnc_get_pixel(gb, bpp, c->bigendian);
                     xy = bytestream2_get_byte(gb);
                     wh = bytestream2_get_byte(gb);
+                    if (   (xy >> 4) + (wh >> 4) + 1 > w - i
+                        || (xy & 0xF) + (wh & 0xF)+1 > h - j) {
+                        av_log(c->avctx, AV_LOG_ERROR, "Rectangle outside picture\n");
+                        return AVERROR_INVALIDDATA;
+                    }
                     paint_rect(dst2, xy >> 4, xy & 0xF,
                                (wh>>4)+1, (wh & 0xF)+1, fg, bpp, stride);
                 }
@@ -307,6 +312,8 @@ static void reset_buffers(VmncContext *c)
     av_freep(&c->curmask);
     av_freep(&c->screendta);
     c->cur_w = c->cur_h = 0;
+    c->cur_hx = c->cur_hy = 0;
+
 }
 
 static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
@@ -319,10 +326,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     uint8_t *outptr;
     int dx, dy, w, h, depth, enc, chunks, res, size_left, ret;
 
-    if ((ret = ff_reget_buffer(avctx, c->pic)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, c->pic)) < 0)
         return ret;
-    }
 
     bytestream2_init(gb, buf, buf_size);
 
@@ -360,6 +365,10 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     bytestream2_skip(gb, 2);
     chunks = bytestream2_get_be16(gb);
     while (chunks--) {
+        if (bytestream2_get_bytes_left(gb) < 12) {
+            av_log(avctx, AV_LOG_ERROR, "Premature end of data!\n");
+            return -1;
+        }
         dx  = bytestream2_get_be16(gb);
         dy  = bytestream2_get_be16(gb);
         w   = bytestream2_get_be16(gb);
@@ -369,6 +378,10 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         size_left = bytestream2_get_bytes_left(gb);
         switch (enc) {
         case MAGIC_WMVd: // cursor
+            if (w*(int64_t)h*c->bpp2 > INT_MAX/2 - 2) {
+                av_log(avctx, AV_LOG_ERROR, "dimensions too large\n");
+                return AVERROR_INVALIDDATA;
+            }
             if (size_left < 2 + w * h * c->bpp2 * 2) {
                 av_log(avctx, AV_LOG_ERROR,
                        "Premature end of data! (need %i got %i)\n",
@@ -419,18 +432,10 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             c->pic->pict_type = AV_PICTURE_TYPE_I;
             depth = bytestream2_get_byte(gb);
             if (depth != c->bpp) {
-                av_log(avctx, AV_LOG_WARNING, "Depth mismatch. "
-                       "Container %i bpp / Codec %i bpp\n", c->bpp, depth);
-
-                if (depth != 8 && depth != 16 && depth != 32) {
-                    av_log(avctx, AV_LOG_ERROR,
-                           "Unsupported codec bitdepth %i\n", depth);
-                    return AVERROR_INVALIDDATA;
-                }
-
-                /* reset values */
-                c->bpp  = depth;
-                c->bpp2 = c->bpp / 8;
+                av_log(avctx, AV_LOG_INFO,
+                       "Depth mismatch. Container %i bpp, "
+                       "Frame data: %i bpp\n",
+                       c->bpp, depth);
             }
             bytestream2_skip(gb, 1);
             c->bigendian = bytestream2_get_byte(gb);
@@ -523,7 +528,6 @@ static av_cold int decode_init(AVCodecContext *avctx)
     c->width  = avctx->width;
     c->height = avctx->height;
     c->bpp    = avctx->bits_per_coded_sample;
-    c->bpp2   = c->bpp / 8;
 
     switch (c->bpp) {
     case 8:
@@ -534,14 +538,16 @@ static av_cold int decode_init(AVCodecContext *avctx)
         break;
     case 24:
         /* 24 bits is not technically supported, but some clients might
-         * mistakenly set it -- delay the actual check until decode_frame() */
+         * mistakenly set it, so let's assume they actually meant 32 bits */
+        c->bpp = 32;
     case 32:
-        avctx->pix_fmt = AV_PIX_FMT_RGB32;
+        avctx->pix_fmt = AV_PIX_FMT_0RGB32;
         break;
     default:
         av_log(avctx, AV_LOG_ERROR, "Unsupported bitdepth %i\n", c->bpp);
         return AVERROR_INVALIDDATA;
     }
+    c->bpp2 = c->bpp / 8;
 
     c->pic = av_frame_alloc();
     if (!c->pic)
@@ -556,9 +562,9 @@ static av_cold int decode_end(AVCodecContext *avctx)
 
     av_frame_free(&c->pic);
 
-    av_free(c->curbits);
-    av_free(c->curmask);
-    av_free(c->screendta);
+    av_freep(&c->curbits);
+    av_freep(&c->curmask);
+    av_freep(&c->screendta);
     return 0;
 }
 
diff --git a/libavcodec/vorbis.c b/libavcodec/vorbis.c
index 66fa21b..86d1040 100644
--- a/libavcodec/vorbis.c
+++ b/libavcodec/vorbis.c
@@ -1,18 +1,22 @@
-/*
- * This file is part of Libav.
+/**
+ * @file
+ * Common code for Vorbis I encoder and decoder
+ * @author Denes Balatoni  ( dbalatoni programozo hu )
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -67,7 +71,7 @@ int ff_vorbis_len2vlc(uint8_t *bits, uint32_t *codes, unsigned num)
 
     codes[p] = 0;
     if (bits[p] > 32)
-        return 1;
+        return AVERROR_INVALIDDATA;
     for (i = 0; i < bits[p]; ++i)
         exit_at_level[i+1] = 1 << i;
 
@@ -81,9 +85,14 @@ int ff_vorbis_len2vlc(uint8_t *bits, uint32_t *codes, unsigned num)
 
     ++p;
 
+    for (i = p; (i < num) && (bits[i] == 0); ++i)
+        ; /* scan past zero-length entries; the bound is tested before bits[i] is read */
+    if (i == num)
+        return 0;
+
     for (; p < num; ++p) {
         if (bits[p] > 32)
-             return 1;
+             return AVERROR_INVALIDDATA;
         if (bits[p] == 0)
              continue;
         // find corresponding exit(node which the tree can grow further from)
@@ -91,7 +100,7 @@ int ff_vorbis_len2vlc(uint8_t *bits, uint32_t *codes, unsigned num)
             if (exit_at_level[i])
                 break;
         if (!i) // overspecified tree
-             return 1;
+             return AVERROR_INVALIDDATA;
         code = exit_at_level[i];
         exit_at_level[i] = 0;
         // construct code (append 0s to end) and introduce new exits
@@ -112,7 +121,7 @@ int ff_vorbis_len2vlc(uint8_t *bits, uint32_t *codes, unsigned num)
     //no exits should be left (underspecified tree - ie. unused valid vlcs - not allowed by SPEC)
     for (p = 1; p < 33; p++)
         if (exit_at_level[p])
-            return 1;
+            return AVERROR_INVALIDDATA;
 
     return 0;
 }
diff --git a/libavcodec/vorbis.h b/libavcodec/vorbis.h
index 5ae20ac..98dd14f 100644
--- a/libavcodec/vorbis.h
+++ b/libavcodec/vorbis.h
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2006 Oded Shimon <ods15@ods15.dyndns.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vorbis_data.c b/libavcodec/vorbis_data.c
index bafb77b..063a075 100644
--- a/libavcodec/vorbis_data.c
+++ b/libavcodec/vorbis_data.c
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2005 Denes Balatoni ( dbalatoni programozo hu )
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vorbis_enc_data.h b/libavcodec/vorbis_enc_data.h
index a1e743e..a51aaec 100644
--- a/libavcodec/vorbis_enc_data.h
+++ b/libavcodec/vorbis_enc_data.h
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2006 Oded Shimon <ods15@ods15.dyndns.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -492,13 +492,13 @@ static const struct {
     int dim;
     int subclass;
     int masterbook;
-    const int *nbooks;
+    const int nbooks[4];
 } floor_classes[] = {
-    { 3, 0, 0, (const int[]){  4             } },
-    { 4, 1, 0, (const int[]){  5,  6         } },
-    { 3, 1, 1, (const int[]){  7,  8         } },
-    { 4, 2, 2, (const int[]){ -1,  9, 10, 11 } },
-    { 3, 2, 3, (const int[]){ -1, 12, 13, 14 } },
+    { 3, 0, 0, {  4             } },
+    { 4, 1, 0, {  5,  6         } },
+    { 3, 1, 1, {  7,  8         } },
+    { 4, 2, 2, { -1,  9, 10, 11 } },
+    { 3, 2, 3, { -1, 12, 13, 14 } },
 };
 
 #endif /* AVCODEC_VORBIS_ENC_DATA_H */
diff --git a/libavcodec/vorbis_parser.c b/libavcodec/vorbis_parser.c
index 054635d..0b2c97c 100644
--- a/libavcodec/vorbis_parser.c
+++ b/libavcodec/vorbis_parser.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -173,7 +173,7 @@ static int parse_setup_header(AVVorbisParseContext *s,
     skip_bits_long(&gb, got_framing_bit);
     for (i = mode_count - 1; i >= 0; i--) {
         skip_bits_long(&gb, 40);
-        s->mode_blocksize[i] = s->blocksize[get_bits1(&gb)];
+        s->mode_blocksize[i] = get_bits1(&gb);
     }
 
 bad_header:
@@ -184,7 +184,7 @@ bad_header:
 static int vorbis_parse_init(AVVorbisParseContext *s,
                              const uint8_t *extradata, int extradata_size)
 {
-    uint8_t *header_start[3];
+    const uint8_t *header_start[3];
     int header_len[3];
     int ret;
 
@@ -205,13 +205,13 @@ static int vorbis_parse_init(AVVorbisParseContext *s,
         return ret;
 
     s->valid_extradata = 1;
-    s->previous_blocksize = s->mode_blocksize[0];
+    s->previous_blocksize = s->blocksize[s->mode_blocksize[0]];
 
     return 0;
 }
 
-int av_vorbis_parse_frame(AVVorbisParseContext *s, const uint8_t *buf,
-                          int buf_size)
+int av_vorbis_parse_frame_flags(AVVorbisParseContext *s, const uint8_t *buf,
+                                int buf_size, int *flags)
 {
     int duration = 0;
 
@@ -220,6 +220,24 @@ int av_vorbis_parse_frame(AVVorbisParseContext *s, const uint8_t *buf,
         int previous_blocksize = s->previous_blocksize;
 
         if (buf[0] & 1) {
+            /* If the user doesn't care about special packets, it's a bad one. */
+            if (!flags)
+                goto bad_packet;
+
+            /* Set the flag for which kind of special packet it is. */
+            if (buf[0] == 1)
+                *flags |= VORBIS_FLAG_HEADER;
+            else if (buf[0] == 3)
+                *flags |= VORBIS_FLAG_COMMENT;
+            else if (buf[0] == 5)
+                *flags |= VORBIS_FLAG_SETUP;
+            else
+                goto bad_packet;
+
+            /* Special packets have no duration. */
+            return 0;
+
+bad_packet:
             av_log(s, AV_LOG_ERROR, "Invalid packet\n");
             return AVERROR_INVALIDDATA;
         }
@@ -231,11 +249,11 @@ int av_vorbis_parse_frame(AVVorbisParseContext *s, const uint8_t *buf,
             av_log(s, AV_LOG_ERROR, "Invalid mode in packet\n");
             return AVERROR_INVALIDDATA;
         }
-        if (mode) {
+        if(s->mode_blocksize[mode]){
             int flag = !!(buf[0] & s->prev_mask);
             previous_blocksize = s->blocksize[flag];
         }
-        current_blocksize     = s->mode_blocksize[mode];
+        current_blocksize     = s->blocksize[s->mode_blocksize[mode]];
         duration              = (previous_blocksize + current_blocksize) >> 2;
         s->previous_blocksize = current_blocksize;
     }
@@ -243,10 +261,16 @@ int av_vorbis_parse_frame(AVVorbisParseContext *s, const uint8_t *buf,
     return duration;
 }
 
+int av_vorbis_parse_frame(AVVorbisParseContext *s, const uint8_t *buf,
+                          int buf_size)
+{
+    return av_vorbis_parse_frame_flags(s, buf, buf_size, NULL);
+}
+
 void av_vorbis_parse_reset(AVVorbisParseContext *s)
 {
     if (s->valid_extradata)
-        s->previous_blocksize = s->mode_blocksize[0];
+        s->previous_blocksize = s->blocksize[0];
 }
 
 void av_vorbis_parse_free(AVVorbisParseContext **s)
@@ -272,22 +296,6 @@ AVVorbisParseContext *av_vorbis_parse_init(const uint8_t *extradata,
     return s;
 }
 
-#if LIBAVCODEC_VERSION_MAJOR < 57
-int avpriv_vorbis_parse_extradata(AVCodecContext *avctx, AVVorbisParseContext *s)
-{
-    return vorbis_parse_init(s, avctx->extradata, avctx->extradata_size);
-}
-void avpriv_vorbis_parse_reset(AVVorbisParseContext *s)
-{
-    av_vorbis_parse_reset(s);
-}
-int avpriv_vorbis_parse_frame(AVVorbisParseContext *s, const uint8_t *buf,
-                              int buf_size)
-{
-    return av_vorbis_parse_frame(s, buf, buf_size);
-}
-#endif
-
 #if CONFIG_VORBIS_PARSER
 
 typedef struct VorbisParseContext {
diff --git a/libavcodec/vorbis_parser.h b/libavcodec/vorbis_parser.h
index f97a523..9205027 100644
--- a/libavcodec/vorbis_parser.h
+++ b/libavcodec/vorbis_parser.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,8 +23,8 @@
  * Determines the duration for each packet.
  */
 
-#ifndef AVCODEC_VORBIS_PARSE_H
-#define AVCODEC_VORBIS_PARSE_H
+#ifndef AVCODEC_VORBIS_PARSER_H
+#define AVCODEC_VORBIS_PARSER_H
 
 #include <stdint.h>
 
@@ -44,6 +44,24 @@ AVVorbisParseContext *av_vorbis_parse_init(const uint8_t *extradata,
  */
 void av_vorbis_parse_free(AVVorbisParseContext **s);
 
+#define VORBIS_FLAG_HEADER  0x00000001
+#define VORBIS_FLAG_COMMENT 0x00000002
+#define VORBIS_FLAG_SETUP   0x00000004
+
+/**
+ * Get the duration for a Vorbis packet.
+ *
+ * If @p flags is @c NULL, special frames are considered invalid.
+ *
+ * @param s        Vorbis parser context
+ * @param buf      buffer containing a Vorbis frame
+ * @param buf_size size of the buffer
+ * @param flags    if non-NULL, a VORBIS_FLAG_* bit is ORed in for a special frame
+ * @return packet duration, 0 for special frames, AVERROR_INVALIDDATA if invalid
+ */
+int av_vorbis_parse_frame_flags(AVVorbisParseContext *s, const uint8_t *buf,
+                                int buf_size, int *flags);
+
 /**
  * Get the duration for a Vorbis packet.
  *
@@ -56,4 +74,4 @@ int av_vorbis_parse_frame(AVVorbisParseContext *s, const uint8_t *buf,
 
 void av_vorbis_parse_reset(AVVorbisParseContext *s);
 
-#endif /* AVCODEC_VORBIS_PARSE_H */
+#endif /* AVCODEC_VORBIS_PARSER_H */
diff --git a/libavcodec/vorbis_parser_internal.h b/libavcodec/vorbis_parser_internal.h
index 8f76af7..691a842 100644
--- a/libavcodec/vorbis_parser_internal.h
+++ b/libavcodec/vorbis_parser_internal.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,8 +25,8 @@
  * Determines the duration for each packet.
  */
 
-#ifndef AVCODEC_VORBIS_PARSER_H
-#define AVCODEC_VORBIS_PARSER_H
+#ifndef AVCODEC_VORBIS_PARSER_INTERNAL_H
+#define AVCODEC_VORBIS_PARSER_INTERNAL_H
 
 #include "avcodec.h"
 #include "vorbis_parser.h"
@@ -43,29 +43,4 @@ struct AVVorbisParseContext {
     int prev_mask;              ///< bitmask used to get the previous mode flag in each packet
 };
 
-#if LIBAVCODEC_VERSION_MAJOR < 57
-/**
- * Initialize the Vorbis parser using headers in the extradata.
- *
- * @param avctx codec context
- * @param s     Vorbis parser context
- */
-int avpriv_vorbis_parse_extradata(AVCodecContext *avctx, AVVorbisParseContext *s);
-
-/**
- * Get the duration for a Vorbis packet.
- *
- * avpriv_vorbis_parse_extradata() must have been successfully called prior to
- * this in order for a correct duration to be returned.
- *
- * @param s        Vorbis parser context
- * @param buf      buffer containing a Vorbis frame
- * @param buf_size size of the buffer
- */
-int avpriv_vorbis_parse_frame(AVVorbisParseContext *s, const uint8_t *buf,
-                              int buf_size);
-
-void avpriv_vorbis_parse_reset(AVVorbisParseContext *s);
-#endif
-
-#endif /* AVCODEC_VORBIS_PARSER_H */
+#endif /* AVCODEC_VORBIS_PARSER_INTERNAL_H */
diff --git a/libavcodec/vorbisdec.c b/libavcodec/vorbisdec.c
index abc01c8..225f1e9 100644
--- a/libavcodec/vorbisdec.c
+++ b/libavcodec/vorbisdec.c
@@ -1,18 +1,22 @@
-/*
- * This file is part of Libav.
+/**
+ * @file
+ * Vorbis I decoder
+ * @author Denes Balatoni  ( dbalatoni programozo hu )
+ *
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,6 +31,7 @@
 
 #define BITSTREAM_READER_LE
 #include "libavutil/float_dsp.h"
+#include "libavutil/avassert.h"
 #include "avcodec.h"
 #include "get_bits.h"
 #include "fft.h"
@@ -121,7 +126,7 @@ typedef struct vorbis_context_s {
     AVCodecContext *avctx;
     GetBitContext gb;
     VorbisDSPContext dsp;
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
 
     FFTContext mdct[2];
     uint8_t       first_frame;
@@ -144,7 +149,7 @@ typedef struct vorbis_context_s {
     uint8_t       mode_count;
     vorbis_mode  *modes;
     uint8_t       mode_number; // mode number for the current packet
-    uint8_t       previous_window;
+    int8_t       previous_window;
     float        *channel_residues;
     float        *saved;
 } vorbis_context;
@@ -186,38 +191,43 @@ static void vorbis_free(vorbis_context *vc)
 
     av_freep(&vc->channel_residues);
     av_freep(&vc->saved);
+    av_freep(&vc->fdsp);
 
-    for (i = 0; i < vc->residue_count; i++)
-        av_free(vc->residues[i].classifs);
+    if (vc->residues)
+        for (i = 0; i < vc->residue_count; i++)
+            av_freep(&vc->residues[i].classifs);
     av_freep(&vc->residues);
     av_freep(&vc->modes);
 
     ff_mdct_end(&vc->mdct[0]);
     ff_mdct_end(&vc->mdct[1]);
 
-    for (i = 0; i < vc->codebook_count; ++i) {
-        av_free(vc->codebooks[i].codevectors);
-        ff_free_vlc(&vc->codebooks[i].vlc);
-    }
+    if (vc->codebooks)
+        for (i = 0; i < vc->codebook_count; ++i) {
+            av_freep(&vc->codebooks[i].codevectors);
+            ff_free_vlc(&vc->codebooks[i].vlc);
+        }
     av_freep(&vc->codebooks);
 
-    for (i = 0; i < vc->floor_count; ++i) {
-        if (vc->floors[i].floor_type == 0) {
-            av_free(vc->floors[i].data.t0.map[0]);
-            av_free(vc->floors[i].data.t0.map[1]);
-            av_free(vc->floors[i].data.t0.book_list);
-            av_free(vc->floors[i].data.t0.lsp);
-        } else {
-            av_free(vc->floors[i].data.t1.list);
+    if (vc->floors)
+        for (i = 0; i < vc->floor_count; ++i) {
+            if (vc->floors[i].floor_type == 0) {
+                av_freep(&vc->floors[i].data.t0.map[0]);
+                av_freep(&vc->floors[i].data.t0.map[1]);
+                av_freep(&vc->floors[i].data.t0.book_list);
+                av_freep(&vc->floors[i].data.t0.lsp);
+            } else {
+                av_freep(&vc->floors[i].data.t1.list);
+            }
         }
-    }
     av_freep(&vc->floors);
 
-    for (i = 0; i < vc->mapping_count; ++i) {
-        av_free(vc->mappings[i].magnitude);
-        av_free(vc->mappings[i].angle);
-        av_free(vc->mappings[i].mux);
-    }
+    if (vc->mappings)
+        for (i = 0; i < vc->mapping_count; ++i) {
+            av_freep(&vc->mappings[i].magnitude);
+            av_freep(&vc->mappings[i].angle);
+            av_freep(&vc->mappings[i].mux);
+        }
     av_freep(&vc->mappings);
 }
 
@@ -369,10 +379,12 @@ static int vorbis_parse_setup_hdr_codebooks(vorbis_context *vc)
 // Weed out unused vlcs and build codevector vector
             if (used_entries) {
                 codebook_setup->codevectors =
-                    av_mallocz(used_entries * codebook_setup->dimensions *
+                    av_mallocz_array(used_entries, codebook_setup->dimensions *
                                sizeof(*codebook_setup->codevectors));
-                if (!codebook_setup->codevectors)
-                    return AVERROR(ENOMEM);
+                if (!codebook_setup->codevectors) {
+                    ret = AVERROR(ENOMEM);
+                    goto error;
+                }
             } else
                 codebook_setup->codevectors = NULL;
 
@@ -555,12 +567,17 @@ static int vorbis_parse_setup_hdr_floors(vorbis_context *vc)
             for (j = 0; j < floor_setup->data.t1.partitions; ++j)
                 floor_setup->data.t1.x_list_dim+=floor_setup->data.t1.class_dimensions[floor_setup->data.t1.partition_class[j]];
 
-            floor_setup->data.t1.list = av_mallocz(floor_setup->data.t1.x_list_dim *
+            floor_setup->data.t1.list = av_mallocz_array(floor_setup->data.t1.x_list_dim,
                                                    sizeof(*floor_setup->data.t1.list));
             if (!floor_setup->data.t1.list)
                 return AVERROR(ENOMEM);
 
             rangebits = get_bits(gb, 4);
+            if (!rangebits && floor_setup->data.t1.partitions) {
+                av_log(vc->avctx, AV_LOG_ERROR,
+                       "A rangebits value of 0 is not compliant with the Vorbis I specification.\n");
+                return AVERROR_INVALIDDATA;
+            }
             rangemax = (1 << rangebits);
             if (rangemax > vc->blocksize[1] / 2) {
                 av_log(vc->avctx, AV_LOG_ERROR,
@@ -634,8 +651,8 @@ static int vorbis_parse_setup_hdr_floors(vorbis_context *vc)
             /* codebook dim is for padding if codebook dim doesn't *
              * divide order+1 then we need to read more data       */
             floor_setup->data.t0.lsp =
-                av_malloc((floor_setup->data.t0.order + 1 + max_codebook_dim)
-                          * sizeof(*floor_setup->data.t0.lsp));
+                av_malloc_array((floor_setup->data.t0.order + 1 + max_codebook_dim),
+                                sizeof(*floor_setup->data.t0.lsp));
             if (!floor_setup->data.t0.lsp)
                 return AVERROR(ENOMEM);
 
@@ -695,8 +712,7 @@ static int vorbis_parse_setup_hdr_residues(vorbis_context *vc)
         res_setup->partition_size = get_bits(gb, 24) + 1;
         /* Validations to prevent a buffer overflow later. */
         if (res_setup->begin>res_setup->end ||
-            res_setup->end > (res_setup->type == 2 ? vc->avctx->channels : 1) * vc->blocksize[1] / 2 ||
-            (res_setup->end-res_setup->begin) / res_setup->partition_size > V_MAX_PARTITIONS) {
+            (res_setup->end-res_setup->begin) / res_setup->partition_size > FFMIN(V_MAX_PARTITIONS, 65535)) {
             av_log(vc->avctx, AV_LOG_ERROR,
                    "partition out of bounds: type, begin, end, size, blocksize: %"PRIu16", %"PRIu32", %"PRIu32", %u, %"PRIu32"\n",
                    res_setup->type, res_setup->begin, res_setup->end,
@@ -709,7 +725,7 @@ static int vorbis_parse_setup_hdr_residues(vorbis_context *vc)
 
         res_setup->ptns_to_read =
             (res_setup->end - res_setup->begin) / res_setup->partition_size;
-        res_setup->classifs = av_malloc(res_setup->ptns_to_read *
+        res_setup->classifs = av_malloc_array(res_setup->ptns_to_read,
                                         vc->audio_channels *
                                         sizeof(*res_setup->classifs));
         if (!res_setup->classifs)
@@ -778,6 +794,11 @@ static int vorbis_parse_setup_hdr_mappings(vorbis_context *vc)
 
         if (get_bits1(gb)) {
             mapping_setup->coupling_steps = get_bits(gb, 8) + 1;
+            if (vc->audio_channels < 2) {
+                av_log(vc->avctx, AV_LOG_ERROR,
+                       "Square polar channel mapping with less than two channels is not compliant with the Vorbis I specification.\n");
+                return AVERROR_INVALIDDATA;
+            }
             mapping_setup->magnitude      = av_mallocz(mapping_setup->coupling_steps *
                                                        sizeof(*mapping_setup->magnitude));
             mapping_setup->angle          = av_mallocz(mapping_setup->coupling_steps *
@@ -802,7 +823,7 @@ static int vorbis_parse_setup_hdr_mappings(vorbis_context *vc)
         }
 
         if (mapping_setup->submaps>1) {
-            mapping_setup->mux = av_mallocz(vc->audio_channels *
+            mapping_setup->mux = av_mallocz_array(vc->audio_channels,
                                             sizeof(*mapping_setup->mux));
             if (!mapping_setup->mux)
                 return AVERROR(ENOMEM);
@@ -837,7 +858,7 @@ static int create_map(vorbis_context *vc, unsigned floor_number)
     for (blockflag = 0; blockflag < 2; ++blockflag) {
         n = vc->blocksize[blockflag] / 2;
         floors[floor_number].data.t0.map[blockflag] =
-            av_malloc((n + 1) * sizeof(int32_t)); // n + sentinel
+            av_malloc_array(n + 1, sizeof(int32_t)); // n + sentinel
         if (!floors[floor_number].data.t0.map[blockflag])
             return AVERROR(ENOMEM);
 
@@ -964,12 +985,12 @@ static int vorbis_parse_id_hdr(vorbis_context *vc)
     vc->bitrate_minimum = get_bits_long(gb, 32);
     bl0 = get_bits(gb, 4);
     bl1 = get_bits(gb, 4);
-    vc->blocksize[0] = (1 << bl0);
-    vc->blocksize[1] = (1 << bl1);
     if (bl0 > 13 || bl0 < 6 || bl1 > 13 || bl1 < 6 || bl1 < bl0) {
         av_log(vc->avctx, AV_LOG_ERROR, " Vorbis id header packet corrupt (illegal blocksize). \n");
         return AVERROR_INVALIDDATA;
     }
+    vc->blocksize[0] = (1 << bl0);
+    vc->blocksize[1] = (1 << bl1);
     vc->win[0] = ff_vorbis_vwin[bl0 - 6];
     vc->win[1] = ff_vorbis_vwin[bl1 - 6];
 
@@ -978,15 +999,18 @@ static int vorbis_parse_id_hdr(vorbis_context *vc)
         return AVERROR_INVALIDDATA;
     }
 
-    vc->channel_residues =  av_malloc((vc->blocksize[1]  / 2) * vc->audio_channels * sizeof(*vc->channel_residues));
-    vc->saved            =  av_mallocz((vc->blocksize[1] / 4) * vc->audio_channels * sizeof(*vc->saved));
+    vc->channel_residues =  av_malloc_array(vc->blocksize[1]  / 2, vc->audio_channels * sizeof(*vc->channel_residues));
+    vc->saved            =  av_mallocz_array(vc->blocksize[1] / 4, vc->audio_channels * sizeof(*vc->saved));
     if (!vc->channel_residues || !vc->saved)
         return AVERROR(ENOMEM);
 
-    vc->previous_window  = 0;
+    vc->previous_window  = -1;
 
     ff_mdct_init(&vc->mdct[0], bl0, 1, -1.0);
     ff_mdct_init(&vc->mdct[1], bl1, 1, -1.0);
+    vc->fdsp = avpriv_float_dsp_alloc(vc->avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!vc->fdsp)
+        return AVERROR(ENOMEM);
 
     ff_dlog(NULL, " vorbis version %d \n audio_channels %d \n audio_samplerate %d \n bitrate_max %d \n bitrate_nom %d \n bitrate_min %d \n blk_0 %d blk_1 %d \n ",
             vc->version, vc->audio_channels, vc->audio_samplerate, vc->bitrate_maximum, vc->bitrate_nominal, vc->bitrate_minimum, vc->blocksize[0], vc->blocksize[1]);
@@ -1008,14 +1032,13 @@ static av_cold int vorbis_decode_init(AVCodecContext *avctx)
     vorbis_context *vc = avctx->priv_data;
     uint8_t *headers   = avctx->extradata;
     int headers_len    = avctx->extradata_size;
-    uint8_t *header_start[3];
+    const uint8_t *header_start[3];
     int header_len[3];
     GetBitContext *gb = &vc->gb;
     int hdr_type, ret;
 
     vc->avctx = avctx;
     ff_vorbisdsp_init(&vc->dsp);
-    avpriv_float_dsp_init(&vc->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
 
     avctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
 
@@ -1187,7 +1210,7 @@ static int vorbis_floor1_decode(vorbis_context *vc,
     uint16_t floor1_Y[258];
     uint16_t floor1_Y_final[258];
     int floor1_flag[258];
-    unsigned class, cdim, cbits, csub, cval, offset, i, j;
+    unsigned partition_class, cdim, cbits, csub, cval, offset, i, j;
     int book, adx, ady, dy, off, predicted, err;
 
 
@@ -1203,28 +1226,31 @@ static int vorbis_floor1_decode(vorbis_context *vc,
 
     offset = 2;
     for (i = 0; i < vf->partitions; ++i) {
-        class = vf->partition_class[i];
-        cdim   = vf->class_dimensions[class];
-        cbits  = vf->class_subclasses[class];
+        partition_class = vf->partition_class[i];
+        cdim   = vf->class_dimensions[partition_class];
+        cbits  = vf->class_subclasses[partition_class];
         csub = (1 << cbits) - 1;
         cval = 0;
 
         ff_dlog(NULL, "Cbits %u\n", cbits);
 
         if (cbits) // this reads all subclasses for this partition's class
-            cval = get_vlc2(gb, vc->codebooks[vf->class_masterbook[class]].vlc.table,
-                            vc->codebooks[vf->class_masterbook[class]].nb_bits, 3);
+            cval = get_vlc2(gb, vc->codebooks[vf->class_masterbook[partition_class]].vlc.table,
+                            vc->codebooks[vf->class_masterbook[partition_class]].nb_bits, 3);
 
         for (j = 0; j < cdim; ++j) {
-            book = vf->subclass_books[class][cval & csub];
+            book = vf->subclass_books[partition_class][cval & csub];
 
             ff_dlog(NULL, "book %d Cbits %u cval %u  bits:%d\n",
                     book, cbits, cval, get_bits_count(gb));
 
             cval = cval >> cbits;
             if (book > -1) {
-                floor1_Y[offset+j] = get_vlc2(gb, vc->codebooks[book].vlc.table,
-                vc->codebooks[book].nb_bits, 3);
+                int v = get_vlc2(gb, vc->codebooks[book].vlc.table,
+                                 vc->codebooks[book].nb_bits, 3);
+                if (v < 0)
+                    return AVERROR_INVALIDDATA;
+                floor1_Y[offset+j] = v;
             } else {
                 floor1_Y[offset+j] = 0;
             }
@@ -1305,7 +1331,9 @@ static av_always_inline int setup_classifs(vorbis_context *vc,
                                            vorbis_residue *vr,
                                            uint8_t *do_not_decode,
                                            unsigned ch_used,
-                                           int partition_count)
+                                           int partition_count,
+                                           int ptns_to_read
+                                          )
 {
     vorbis_codebook *codebook = vc->codebooks + vr->classbook;
     int p, j, i;
@@ -1319,21 +1347,25 @@ static av_always_inline int setup_classifs(vorbis_context *vc,
 
             ff_dlog(NULL, "Classword: %u\n", temp);
 
+            av_assert0(temp < 65536);
+
             if (temp < 0) {
                 av_log(vc->avctx, AV_LOG_ERROR,
                        "Invalid vlc code decoding %d channel.", j);
                 return AVERROR_INVALIDDATA;
             }
 
+            av_assert0(vr->classifications > 1); //needed for inverse[]
+
             for (i = partition_count + c_p_c - 1; i >= partition_count; i--) {
                 temp2 = (((uint64_t)temp) * inverse_class) >> 32;
 
-                if (i < vr->ptns_to_read)
+                if (i < ptns_to_read)
                     vr->classifs[p + i] = temp - temp2 * vr->classifications;
                 temp = temp2;
             }
         }
-        p += vr->ptns_to_read;
+        p += ptns_to_read;
     }
     return 0;
 }
@@ -1354,6 +1386,7 @@ static av_always_inline int vorbis_residue_decode_internal(vorbis_context *vc,
     unsigned pass, ch_used, i, j, k, l;
     unsigned max_output = (ch - 1) * vlen;
     int ptns_to_read = vr->ptns_to_read;
+    int libvorbis_bug = 0;
 
     if (vr_type == 2) {
         for (j = 1; j < ch; ++j)
@@ -1368,8 +1401,13 @@ static av_always_inline int vorbis_residue_decode_internal(vorbis_context *vc,
     }
 
     if (max_output > ch_left * vlen) {
-        av_log(vc->avctx, AV_LOG_ERROR, "Insufficient output buffer\n");
-        return AVERROR_INVALIDDATA;
+        if (max_output <= ch_left * vlen + vr->partition_size*ch_used/ch) {
+            ptns_to_read--;
+            libvorbis_bug = 1;
+        } else {
+            av_log(vc->avctx, AV_LOG_ERROR, "Insufficient output buffer\n");
+            return AVERROR_INVALIDDATA;
+        }
     }
 
     ff_dlog(NULL, " residue type 0/1/2 decode begin, ch: %d  cpc %d  \n", ch, c_p_c);
@@ -1380,7 +1418,7 @@ static av_always_inline int vorbis_residue_decode_internal(vorbis_context *vc,
         voffset = vr->begin;
         for (partition_count = 0; partition_count < ptns_to_read;) {  // SPEC        error
             if (!pass) {
-                int ret = setup_classifs(vc, vr, do_not_decode, ch_used, partition_count);
+                int ret = setup_classifs(vc, vr, do_not_decode, ch_used, partition_count, ptns_to_read);
                 if (ret < 0)
                     return ret;
             }
@@ -1478,6 +1516,14 @@ static av_always_inline int vorbis_residue_decode_internal(vorbis_context *vc,
                 voffset += vr->partition_size;
             }
         }
+        if (libvorbis_bug && !pass) {
+            for (j = 0; j < ch_used; ++j) {
+                if (!do_not_decode[j]) {
+                    get_vlc2(&vc->gb, vc->codebooks[vr->classbook].vlc.table,
+                                vc->codebooks[vr->classbook].nb_bits, 3);
+                }
+            }
+        }
     }
     return 0;
 }
@@ -1530,7 +1576,7 @@ static int vorbis_parse_audio_packet(vorbis_context *vc, float **floor_ptr)
 {
     GetBitContext *gb = &vc->gb;
     FFTContext *mdct;
-    unsigned previous_window = vc->previous_window;
+    int previous_window = vc->previous_window;
     unsigned mode_number, blockflag, blocksize;
     int i, j;
     uint8_t no_residue[255];
@@ -1563,9 +1609,11 @@ static int vorbis_parse_audio_packet(vorbis_context *vc, float **floor_ptr)
     blocksize = vc->blocksize[blockflag];
     vlen = blocksize / 2;
     if (blockflag) {
-        previous_window = get_bits(gb, 1);
-        skip_bits1(gb); // next_window
-    }
+        int code = get_bits(gb, 2);
+        if (previous_window < 0)
+            previous_window = code>>1;
+    } else if (previous_window < 0)
+        previous_window = 0;
 
     memset(ch_res_ptr,   0, sizeof(float) * vc->audio_channels * vlen); //FIXME can this be removed ?
     for (i = 0; i < vc->audio_channels; ++i)
@@ -1653,7 +1701,7 @@ static int vorbis_parse_audio_packet(vorbis_context *vc, float **floor_ptr)
 
     for (j = vc->audio_channels-1;j >= 0; j--) {
         ch_res_ptr   = vc->channel_residues + res_chan[j] * blocksize / 2;
-        vc->fdsp.vector_fmul(floor_ptr[j], floor_ptr[j], ch_res_ptr, blocksize / 2);
+        vc->fdsp->vector_fmul(floor_ptr[j], floor_ptr[j], ch_res_ptr, blocksize / 2);
         mdct->imdct_half(mdct, ch_res_ptr, floor_ptr[j]);
     }
 
@@ -1670,13 +1718,13 @@ static int vorbis_parse_audio_packet(vorbis_context *vc, float **floor_ptr)
         const float *win  = vc->win[blockflag & previous_window];
 
         if (blockflag == previous_window) {
-            vc->fdsp.vector_fmul_window(ret, saved, buf, win, blocksize / 4);
+            vc->fdsp->vector_fmul_window(ret, saved, buf, win, blocksize / 4);
         } else if (blockflag > previous_window) {
-            vc->fdsp.vector_fmul_window(ret, saved, buf, win, bs0 / 4);
+            vc->fdsp->vector_fmul_window(ret, saved, buf, win, bs0 / 4);
             memcpy(ret+bs0/2, buf+bs0/4, ((bs1-bs0)/4) * sizeof(float));
         } else {
             memcpy(ret, saved, ((bs1 - bs0) / 4) * sizeof(float));
-            vc->fdsp.vector_fmul_window(ret + (bs1 - bs0) / 4, saved + (bs1 - bs0) / 4, buf, win, bs0 / 4);
+            vc->fdsp->vector_fmul_window(ret + (bs1 - bs0) / 4, saved + (bs1 - bs0) / 4, buf, win, bs0 / 4);
         }
         memcpy(saved, buf + blocksize / 4, blocksize / 4 * sizeof(float));
     }
@@ -1700,12 +1748,53 @@ static int vorbis_decode_frame(AVCodecContext *avctx, void *data,
 
     ff_dlog(NULL, "packet length %d \n", buf_size);
 
+    if (*buf == 1 && buf_size > 7) {
+        if ((ret = init_get_bits8(gb, buf + 1, buf_size - 1)) < 0)
+            return ret;
+
+        vorbis_free(vc);
+        if ((ret = vorbis_parse_id_hdr(vc))) {
+            av_log(avctx, AV_LOG_ERROR, "Id header corrupt.\n");
+            vorbis_free(vc);
+            return ret;
+        }
+
+        if (vc->audio_channels > 8)
+            avctx->channel_layout = 0;
+        else
+            avctx->channel_layout = ff_vorbis_channel_layouts[vc->audio_channels - 1];
+
+        avctx->channels    = vc->audio_channels;
+        avctx->sample_rate = vc->audio_samplerate;
+        return buf_size;
+    }
+
+    if (*buf == 3 && buf_size > 7) {
+        av_log(avctx, AV_LOG_DEBUG, "Ignoring comment header\n");
+        return buf_size;
+    }
+
+    if (*buf == 5 && buf_size > 7 && vc->channel_residues && !vc->modes) {
+        if ((ret = init_get_bits8(gb, buf + 1, buf_size - 1)) < 0)
+            return ret;
+
+        if ((ret = vorbis_parse_setup_hdr(vc))) {
+            av_log(avctx, AV_LOG_ERROR, "Setup header corrupt.\n");
+            vorbis_free(vc);
+            return ret;
+        }
+        return buf_size;
+    }
+
+    if (!vc->channel_residues || !vc->modes) {
+        av_log(avctx, AV_LOG_ERROR, "Data packet before valid headers\n");
+        return AVERROR_INVALIDDATA;
+    }
+
     /* get output buffer */
     frame->nb_samples = vc->blocksize[1] / 2;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
 
     if (vc->audio_channels > 8) {
         for (i = 0; i < vc->audio_channels; i++)
@@ -1717,7 +1806,8 @@ static int vorbis_decode_frame(AVCodecContext *avctx, void *data,
         }
     }
 
-    init_get_bits(gb, buf, buf_size*8);
+    if ((ret = init_get_bits8(gb, buf, buf_size)) < 0)
+        return ret;
 
     if ((len = vorbis_parse_audio_packet(vc, channel_ptrs)) <= 0)
         return len;
@@ -1757,7 +1847,8 @@ static av_cold void vorbis_decode_flush(AVCodecContext *avctx)
         memset(vc->saved, 0, (vc->blocksize[1] / 4) * vc->audio_channels *
                              sizeof(*vc->saved));
     }
-    vc->previous_window = 0;
+    vc->previous_window = -1;
+    vc->first_frame = 0;
 }
 
 AVCodec ff_vorbis_decoder = {
diff --git a/libavcodec/vorbisdsp.c b/libavcodec/vorbisdsp.c
index c37e2c4..362a276 100644
--- a/libavcodec/vorbisdsp.c
+++ b/libavcodec/vorbisdsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vorbisdsp.h b/libavcodec/vorbisdsp.h
index ea41c40..7abec4e 100644
--- a/libavcodec/vorbisdsp.h
+++ b/libavcodec/vorbisdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vorbisenc.c b/libavcodec/vorbisenc.c
index 35bdd57..2974ca2 100644
--- a/libavcodec/vorbisenc.c
+++ b/libavcodec/vorbisenc.c
@@ -1,20 +1,20 @@
 /*
  * copyright (c) 2006 Oded Shimon <ods15@ods15.dyndns.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -142,9 +142,9 @@ typedef struct vorbis_enc_context {
 static inline int put_codeword(PutBitContext *pb, vorbis_enc_codebook *cb,
                                int entry)
 {
-    assert(entry >= 0);
-    assert(entry < cb->nentries);
-    assert(cb->lens[entry]);
+    av_assert2(entry >= 0);
+    av_assert2(entry < cb->nentries);
+    av_assert2(cb->lens[entry]);
     if (pb->size_in_bits - put_bits_count(pb) < cb->lens[entry])
         return AVERROR(EINVAL);
     put_bits(pb, cb->lens[entry], cb->codewords[entry]);
@@ -170,8 +170,8 @@ static int ready_codebook(vorbis_enc_codebook *cb)
         cb->pow2 = cb->dimensions = NULL;
     } else {
         int vals = cb_lookup_vals(cb->lookup, cb->ndimensions, cb->nentries);
-        cb->dimensions = av_malloc(sizeof(float) * cb->nentries * cb->ndimensions);
-        cb->pow2 = av_mallocz(sizeof(float) * cb->nentries);
+        cb->dimensions = av_malloc_array(cb->nentries, sizeof(float) * cb->ndimensions);
+        cb->pow2 = av_mallocz_array(cb->nentries, sizeof(float));
         if (!cb->dimensions || !cb->pow2)
             return AVERROR(ENOMEM);
         for (i = 0; i < cb->nentries; i++) {
@@ -200,8 +200,8 @@ static int ready_codebook(vorbis_enc_codebook *cb)
 static int ready_residue(vorbis_enc_residue *rc, vorbis_enc_context *venc)
 {
     int i;
-    assert(rc->type == 2);
-    rc->maxes = av_mallocz(sizeof(float[2]) * rc->classifications);
+    av_assert0(rc->type == 2);
+    rc->maxes = av_mallocz_array(rc->classifications, sizeof(float[2]));
     if (!rc->maxes)
         return AVERROR(ENOMEM);
     for (i = 0; i < rc->classifications; i++) {
@@ -266,8 +266,8 @@ static int create_vorbis_context(vorbis_enc_context *venc,
         cb->lookup      = cvectors[book].lookup;
         cb->seq_p       = 0;
 
-        cb->lens      = av_malloc(sizeof(uint8_t)  * cb->nentries);
-        cb->codewords = av_malloc(sizeof(uint32_t) * cb->nentries);
+        cb->lens      = av_malloc_array(cb->nentries, sizeof(uint8_t));
+        cb->codewords = av_malloc_array(cb->nentries, sizeof(uint32_t));
         if (!cb->lens || !cb->codewords)
             return AVERROR(ENOMEM);
         memcpy(cb->lens, cvectors[book].clens, cvectors[book].len);
@@ -275,7 +275,7 @@ static int create_vorbis_context(vorbis_enc_context *venc,
 
         if (cb->lookup) {
             vals = cb_lookup_vals(cb->lookup, cb->ndimensions, cb->nentries);
-            cb->quantlist = av_malloc(sizeof(int) * vals);
+            cb->quantlist = av_malloc_array(vals, sizeof(int));
             if (!cb->quantlist)
                 return AVERROR(ENOMEM);
             for (i = 0; i < vals; i++)
@@ -305,7 +305,7 @@ static int create_vorbis_context(vorbis_enc_context *venc,
         fc->nclasses = FFMAX(fc->nclasses, fc->partition_to_class[i]);
     }
     fc->nclasses++;
-    fc->classes = av_malloc(sizeof(vorbis_enc_floor_class) * fc->nclasses);
+    fc->classes = av_malloc_array(fc->nclasses, sizeof(vorbis_enc_floor_class));
     if (!fc->classes)
         return AVERROR(ENOMEM);
     for (i = 0; i < fc->nclasses; i++) {
@@ -315,7 +315,7 @@ static int create_vorbis_context(vorbis_enc_context *venc,
         c->subclass   = floor_classes[i].subclass;
         c->masterbook = floor_classes[i].masterbook;
         books         = (1 << c->subclass);
-        c->books      = av_malloc(sizeof(int) * books);
+        c->books      = av_malloc_array(books, sizeof(int));
         if (!c->books)
             return AVERROR(ENOMEM);
         for (j = 0; j < books; j++)
@@ -328,7 +328,7 @@ static int create_vorbis_context(vorbis_enc_context *venc,
     for (i = 0; i < fc->partitions; i++)
         fc->values += fc->classes[fc->partition_to_class[i]].dim;
 
-    fc->list = av_malloc(sizeof(vorbis_floor1_entry) * fc->values);
+    fc->list = av_malloc_array(fc->values, sizeof(vorbis_floor1_entry));
     if (!fc->list)
         return AVERROR(ENOMEM);
     fc->list[0].x = 0;
@@ -419,10 +419,10 @@ static int create_vorbis_context(vorbis_enc_context *venc,
     venc->modes[0].mapping   = 0;
 
     venc->have_saved = 0;
-    venc->saved      = av_malloc(sizeof(float) * venc->channels * (1 << venc->log2_blocksize[1]) / 2);
-    venc->samples    = av_malloc(sizeof(float) * venc->channels * (1 << venc->log2_blocksize[1]));
-    venc->floor      = av_malloc(sizeof(float) * venc->channels * (1 << venc->log2_blocksize[1]) / 2);
-    venc->coeffs     = av_malloc(sizeof(float) * venc->channels * (1 << venc->log2_blocksize[1]) / 2);
+    venc->saved      = av_malloc_array(sizeof(float) * venc->channels, (1 << venc->log2_blocksize[1]) / 2);
+    venc->samples    = av_malloc_array(sizeof(float) * venc->channels, (1 << venc->log2_blocksize[1]));
+    venc->floor      = av_malloc_array(sizeof(float) * venc->channels, (1 << venc->log2_blocksize[1]) / 2);
+    venc->coeffs     = av_malloc_array(sizeof(float) * venc->channels, (1 << venc->log2_blocksize[1]) / 2);
     if (!venc->saved || !venc->samples || !venc->floor || !venc->coeffs)
         return AVERROR(ENOMEM);
 
@@ -585,9 +585,11 @@ static int put_main_header(vorbis_enc_context *venc, uint8_t **out)
 {
     int i;
     PutBitContext pb;
-    uint8_t buffer[50000] = {0}, *p = buffer;
-    int buffer_len = sizeof buffer;
     int len, hlens[3];
+    int buffer_len = 50000;
+    uint8_t *buffer = av_mallocz(buffer_len), *p = buffer;
+    if (!buffer)
+        return AVERROR(ENOMEM);
 
     // identification header
     init_put_bits(&pb, p, buffer_len);
@@ -710,6 +712,7 @@ static int put_main_header(vorbis_enc_context *venc, uint8_t **out)
         buffer_len += hlens[i];
     }
 
+    av_freep(&buffer);
     return p - *out;
 }
 
@@ -880,8 +883,8 @@ static int residue_encode(vorbis_enc_context *venc, vorbis_enc_residue *rc,
     int classes[MAX_CHANNELS][NUM_RESIDUE_PARTITIONS];
     int classwords = venc->codebooks[rc->classbook].ndimensions;
 
-    assert(rc->type == 2);
-    assert(real_ch == 2);
+    av_assert0(rc->type == 2);
+    av_assert0(real_ch == 2);
     for (p = 0; p < partitions; p++) {
         float max1 = 0.0, max2 = 0.0;
         int s = rc->begin + p * psize;
@@ -1015,7 +1018,6 @@ static int apply_window_and_mdct(vorbis_enc_context *venc,
     return 1;
 }
 
-
 static int vorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
                                const AVFrame *frame, int *got_packet_ptr)
 {
@@ -1031,10 +1033,8 @@ static int vorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
         return 0;
     samples = 1 << (venc->log2_blocksize[0] - 1);
 
-    if ((ret = ff_alloc_packet(avpkt, 8192))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, 8192, 0)) < 0)
         return ret;
-    }
 
     init_put_bits(&pb, avpkt->data, avpkt->size);
 
@@ -1170,7 +1170,7 @@ static av_cold int vorbis_encode_init(AVCodecContext *avctx)
     int ret;
 
     if (avctx->channels != 2) {
-        av_log(avctx, AV_LOG_ERROR, "Current Libav Vorbis encoder only supports 2 channels.\n");
+        av_log(avctx, AV_LOG_ERROR, "Current FFmpeg Vorbis encoder only supports 2 channels.\n");
         return -1;
     }
 
@@ -1181,7 +1181,7 @@ static av_cold int vorbis_encode_init(AVCodecContext *avctx)
     if (avctx->flags & AV_CODEC_FLAG_QSCALE)
         venc->quality = avctx->global_quality / (float)FF_QP2LAMBDA;
     else
-        venc->quality = 3.0;
+        venc->quality = 8;
     venc->quality *= venc->quality;
 
     if ((ret = put_main_header(venc, (uint8_t**)&avctx->extradata)) < 0)
diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c
index 26374cc..fa749be 100644
--- a/libavcodec/vp3.c
+++ b/libavcodec/vp3.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2003-2004 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -77,6 +77,10 @@ typedef struct Vp3Fragment {
 /* special internal mode */
 #define MODE_COPY             8
 
+static int theora_decode_header(AVCodecContext *avctx, GetBitContext *gb);
+static int theora_decode_tables(AVCodecContext *avctx, GetBitContext *gb);
+
+
 /* There are 6 preset schemes, plus a free-form scheme */
 static const int ModeAlphabet[6][CODING_MODE_COUNT] = {
     /* scheme 1: Last motion vector dominates */
@@ -127,7 +131,7 @@ static const uint8_t hilbert_offset[16][2] = {
 
 typedef struct Vp3DecodeContext {
     AVCodecContext *avctx;
-    int theora, theora_tables;
+    int theora, theora_tables, theora_header;
     int version;
     int width, height;
     int chroma_x_shift, chroma_y_shift;
@@ -173,6 +177,7 @@ typedef struct Vp3DecodeContext {
     int data_offset[3];
     uint8_t offset_x;
     uint8_t offset_y;
+    int offset_x_warned;
 
     int8_t (*motion_val[2])[2];
 
@@ -204,8 +209,8 @@ typedef struct Vp3DecodeContext {
     int16_t *dct_tokens[3][64];
     int16_t *dct_tokens_base;
 #define TOKEN_EOB(eob_run)              ((eob_run) << 2)
-#define TOKEN_ZERO_RUN(coeff, zero_run) (((coeff) << 9) + ((zero_run) << 2) + 1)
-#define TOKEN_COEFF(coeff)              (((coeff) << 2) + 2)
+#define TOKEN_ZERO_RUN(coeff, zero_run) (((coeff) * 512) + ((zero_run) << 2) + 1)
+#define TOKEN_COEFF(coeff)              (((coeff) * 4) + 2)
 
     /**
      * number of blocks that contain DCT coefficients at
@@ -260,6 +265,20 @@ typedef struct Vp3DecodeContext {
  * VP3 specific functions
  ************************************************************************/
 
+static av_cold void free_tables(AVCodecContext *avctx) /* release the decoder tables reachable from avctx->priv_data */
+{
+    Vp3DecodeContext *s = avctx->priv_data;
+    /* av_freep() frees the buffer and resets the pointer to NULL, so this helper can be called repeatedly */
+    av_freep(&s->superblock_coding);
+    av_freep(&s->all_fragments);
+    av_freep(&s->coded_fragment_list[0]); /* only entry [0] is freed - presumably the other entries alias this allocation; TODO confirm */
+    av_freep(&s->dct_tokens_base);
+    av_freep(&s->superblock_fragments);
+    av_freep(&s->macroblock_coding);
+    av_freep(&s->motion_val[0]);
+    av_freep(&s->motion_val[1]); /* edge_emu_buffer is not handled here; vp3_decode_end() frees it itself */
+}
+
 static void vp3_decode_flush(AVCodecContext *avctx)
 {
     Vp3DecodeContext *s = avctx->priv_data;
@@ -277,16 +296,11 @@ static av_cold int vp3_decode_end(AVCodecContext *avctx)
     Vp3DecodeContext *s = avctx->priv_data;
     int i;
 
-    av_freep(&s->superblock_coding);
-    av_freep(&s->all_fragments);
-    av_freep(&s->coded_fragment_list[0]);
-    av_freep(&s->dct_tokens_base);
-    av_freep(&s->superblock_fragments);
-    av_freep(&s->macroblock_coding);
-    av_freep(&s->motion_val[0]);
-    av_freep(&s->motion_val[1]);
+    free_tables(avctx);
     av_freep(&s->edge_emu_buffer);
 
+    s->theora_tables = 0;
+
     /* release all frames */
     vp3_decode_flush(avctx);
     av_frame_free(&s->current_frame.f);
@@ -312,7 +326,7 @@ static av_cold int vp3_decode_end(AVCodecContext *avctx)
     return 0;
 }
 
-/*
+/**
  * This function sets up all of the various blocks mappings:
  * superblocks <-> fragments, macroblocks <-> fragments,
  * superblocks <-> macroblocks
@@ -403,7 +417,7 @@ static void init_loop_filter(Vp3DecodeContext *s)
     int value;
 
     filter_limit = s->filter_limit_values[s->qps[0]];
-    assert(filter_limit < 128);
+    av_assert0(filter_limit < 128U);
 
     /* set up the bounding values */
     memset(s->bounding_values_array, 0, 256 * sizeof(int));
@@ -456,7 +470,7 @@ static int unpack_superblocks(Vp3DecodeContext *s, GetBitContext *gb)
             if (current_run == 34)
                 current_run += get_bits(gb, 12);
 
-            if (current_superblock + current_run > s->superblock_count) {
+            if (current_run > s->superblock_count - current_superblock) {
                 av_log(s->avctx, AV_LOG_ERROR,
                        "Invalid partially coded superblock run length\n");
                 return -1;
@@ -1600,20 +1614,14 @@ static void render_slice(Vp3DecodeContext *s, int slice)
                         /* invert DCT and place (or add) in final output */
 
                         if (s->all_fragments[i].coding_method == MODE_INTRA) {
-                            int index;
-                            index = vp3_dequant(s, s->all_fragments + i,
-                                                plane, 0, block);
-                            if (index > 63)
-                                continue;
+                            vp3_dequant(s, s->all_fragments + i,
+                                        plane, 0, block);
                             s->vp3dsp.idct_put(output_plane + first_pixel,
                                                stride,
                                                block);
                         } else {
-                            int index = vp3_dequant(s, s->all_fragments + i,
-                                                    plane, 1, block);
-                            if (index > 63)
-                                continue;
-                            if (index > 0) {
+                            if (vp3_dequant(s, s->all_fragments + i,
+                                            plane, 1, block)) {
                                 s->vp3dsp.idct_add(output_plane + first_pixel,
                                                    stride,
                                                    block);
@@ -1657,22 +1665,24 @@ static av_cold int allocate_tables(AVCodecContext *avctx)
     Vp3DecodeContext *s = avctx->priv_data;
     int y_fragment_count, c_fragment_count;
 
+    free_tables(avctx);
+
     y_fragment_count = s->fragment_width[0] * s->fragment_height[0];
     c_fragment_count = s->fragment_width[1] * s->fragment_height[1];
 
-    s->superblock_coding = av_malloc(s->superblock_count);
-    s->all_fragments     = av_malloc(s->fragment_count * sizeof(Vp3Fragment));
+    s->superblock_coding = av_mallocz(s->superblock_count);
+    s->all_fragments     = av_mallocz_array(s->fragment_count, sizeof(Vp3Fragment));
 
-    s->coded_fragment_list[0] = av_malloc(s->fragment_count * sizeof(int));
+    s->coded_fragment_list[0] = av_mallocz_array(s->fragment_count, sizeof(int));
 
-    s->dct_tokens_base = av_malloc(64 * s->fragment_count *
-                                   sizeof(*s->dct_tokens_base));
-    s->motion_val[0] = av_malloc(y_fragment_count * sizeof(*s->motion_val[0]));
-    s->motion_val[1] = av_malloc(c_fragment_count * sizeof(*s->motion_val[1]));
+    s->dct_tokens_base = av_mallocz_array(s->fragment_count,
+                                          64 * sizeof(*s->dct_tokens_base));
+    s->motion_val[0] = av_mallocz_array(y_fragment_count, sizeof(*s->motion_val[0]));
+    s->motion_val[1] = av_mallocz_array(c_fragment_count, sizeof(*s->motion_val[1]));
 
     /* work out the block mapping tables */
-    s->superblock_fragments = av_malloc(s->superblock_count * 16 * sizeof(int));
-    s->macroblock_coding    = av_malloc(s->macroblock_count + 1);
+    s->superblock_fragments = av_mallocz_array(s->superblock_count, 16 * sizeof(int));
+    s->macroblock_coding    = av_mallocz(s->macroblock_count + 1);
 
     if (!s->superblock_coding    || !s->all_fragments          ||
         !s->dct_tokens_base      || !s->coded_fragment_list[0] ||
@@ -1725,7 +1735,7 @@ static av_cold int vp3_decode_init(AVCodecContext *avctx)
     s->avctx  = avctx;
     s->width  = FFALIGN(avctx->coded_width, 16);
     s->height = FFALIGN(avctx->coded_height, 16);
-    if (avctx->pix_fmt == AV_PIX_FMT_NONE)
+    if (avctx->codec_id != AV_CODEC_ID_THEORA)
         avctx->pix_fmt = AV_PIX_FMT_YUV420P;
     avctx->chroma_sample_location = AVCHROMA_LOC_CENTER;
     ff_hpeldsp_init(&s->hdsp, avctx->flags | AV_CODEC_FLAG_BITEXACT);
@@ -1733,7 +1743,7 @@ static av_cold int vp3_decode_init(AVCodecContext *avctx)
     ff_vp3dsp_init(&s->vp3dsp, avctx->flags);
 
     for (i = 0; i < 64; i++) {
-#define TRANSPOSE(x) (x >> 3) | ((x & 7) << 3)
+#define TRANSPOSE(x) (((x) >> 3) | (((x) & 7) << 3))
         s->idct_permutation[i] = TRANSPOSE(i);
         s->idct_scantable[i]   = TRANSPOSE(ff_zigzag_direct[i]);
 #undef TRANSPOSE
@@ -1744,8 +1754,7 @@ static av_cold int vp3_decode_init(AVCodecContext *avctx)
     for (i = 0; i < 3; i++)
         s->qps[i] = -1;
 
-    av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_x_shift,
-                                     &s->chroma_y_shift);
+    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_x_shift, &s->chroma_y_shift);
 
     s->y_superblock_width  = (s->width  + 31) / 32;
     s->y_superblock_height = (s->height + 31) / 32;
@@ -1921,6 +1930,7 @@ static int ref_frames(Vp3DecodeContext *dst, Vp3DecodeContext *src)
     return 0;
 }
 
+#if HAVE_THREADS
 static int vp3_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
 {
     Vp3DecodeContext *s = dst->priv_data, *s1 = src->priv_data;
@@ -1938,6 +1948,8 @@ static int vp3_update_thread_context(AVCodecContext *dst, const AVCodecContext *
     }
 
     if (s != s1) {
+        if (!s->current_frame.f)
+            return AVERROR(ENOMEM);
         // init tables if the first frame hasn't been decoded
         if (!s->current_frame.f->data[0]) {
             int y_fragment_count, c_fragment_count;
@@ -1978,6 +1990,7 @@ static int vp3_update_thread_context(AVCodecContext *dst, const AVCodecContext *
 
     return update_frames(dst);
 }
+#endif
 
 static int vp3_decode_frame(AVCodecContext *avctx,
                             void *data, int *got_frame,
@@ -1989,15 +2002,50 @@ static int vp3_decode_frame(AVCodecContext *avctx,
     GetBitContext gb;
     int i, ret;
 
-    init_get_bits(&gb, buf, buf_size * 8);
+    if ((ret = init_get_bits8(&gb, buf, buf_size)) < 0)
+        return ret;
 
+#if CONFIG_THEORA_DECODER
     if (s->theora && get_bits1(&gb)) {
+        int type = get_bits(&gb, 7);
+        skip_bits_long(&gb, 6*8); /* "theora" */
+
+        if (s->avctx->active_thread_type&FF_THREAD_FRAME) {
+            av_log(avctx, AV_LOG_ERROR, "midstream reconfiguration with multithreading is unsupported, try -threads 1\n");
+            return AVERROR_PATCHWELCOME;
+        }
+        if (type == 0) {
+            vp3_decode_end(avctx);
+            ret = theora_decode_header(avctx, &gb);
+
+            if (ret >= 0)
+                ret = vp3_decode_init(avctx);
+            if (ret < 0) {
+                vp3_decode_end(avctx);
+            }
+            return ret;
+        } else if (type == 2) {
+            vp3_decode_end(avctx);
+            ret = theora_decode_tables(avctx, &gb);
+            if (ret >= 0)
+                ret = vp3_decode_init(avctx);
+            if (ret < 0) {
+                vp3_decode_end(avctx);
+            }
+            return ret;
+        }
+
         av_log(avctx, AV_LOG_ERROR,
                "Header packet passed to frame decoder, skipping\n");
         return -1;
     }
+#endif
 
     s->keyframe = !get_bits1(&gb);
+    if (!s->all_fragments) {
+        av_log(avctx, AV_LOG_ERROR, "Data packet without prior valid headers\n");
+        return -1;
+    }
     if (!s->theora)
         skip_bits(&gb, 1);
     for (i = 0; i < 3; i++)
@@ -2032,10 +2080,9 @@ static int vp3_decode_frame(AVCodecContext *avctx,
 
     s->current_frame.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
                                                 : AV_PICTURE_TYPE_P;
-    if (ff_thread_get_buffer(avctx, &s->current_frame, AV_GET_BUFFER_FLAG_REF) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    s->current_frame.f->key_frame = s->keyframe;
+    if (ff_thread_get_buffer(avctx, &s->current_frame, AV_GET_BUFFER_FLAG_REF) < 0)
         goto error;
-    }
 
     if (!s->edge_emu_buffer)
         s->edge_emu_buffer = av_malloc(9 * FFABS(s->current_frame.f->linesize[0]));
@@ -2064,10 +2111,8 @@ static int vp3_decode_frame(AVCodecContext *avctx,
 
             s->golden_frame.f->pict_type = AV_PICTURE_TYPE_I;
             if (ff_thread_get_buffer(avctx, &s->golden_frame,
-                                     AV_GET_BUFFER_FLAG_REF) < 0) {
-                av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+                                     AV_GET_BUFFER_FLAG_REF) < 0)
                 goto error;
-            }
             ff_thread_release_buffer(avctx, &s->last_frame);
             if ((ret = ff_thread_ref_frame(&s->last_frame,
                                            &s->golden_frame)) < 0)
@@ -2181,6 +2226,7 @@ static int read_huffman_tree(AVCodecContext *avctx, GetBitContext *gb)
     return 0;
 }
 
+#if HAVE_THREADS
 static int vp3_init_thread_copy(AVCodecContext *avctx)
 {
     Vp3DecodeContext *s = avctx->priv_data;
@@ -2197,6 +2243,7 @@ static int vp3_init_thread_copy(AVCodecContext *avctx)
 
     return init_frames(s);
 }
+#endif
 
 #if CONFIG_THEORA_DECODER
 static const enum AVPixelFormat theora_pix_fmts[4] = {
@@ -2211,6 +2258,7 @@ static int theora_decode_header(AVCodecContext *avctx, GetBitContext *gb)
     int ret;
     AVRational fps, aspect;
 
+    s->theora_header = 0;
     s->theora = get_bits_long(gb, 24);
     av_log(avctx, AV_LOG_DEBUG, "Theora bitstream version %X\n", s->theora);
 
@@ -2239,7 +2287,7 @@ static int theora_decode_header(AVCodecContext *avctx, GetBitContext *gb)
     if (av_image_check_size(visible_width, visible_height, 0, avctx) < 0 ||
         visible_width  + offset_x > s->width ||
         visible_height + offset_y > s->height) {
-        av_log(s, AV_LOG_ERROR,
+        av_log(avctx, AV_LOG_ERROR,
                "Invalid frame dimensions - w:%d h:%d x:%d y:%d (%dx%d).\n",
                visible_width, visible_height, offset_x, offset_y,
                s->width, s->height);
@@ -2276,14 +2324,18 @@ static int theora_decode_header(AVCodecContext *avctx, GetBitContext *gb)
     if (s->theora >= 0x030200) {
         skip_bits(gb, 5); /* keyframe frequency force */
         avctx->pix_fmt = theora_pix_fmts[get_bits(gb, 2)];
+        if (avctx->pix_fmt == AV_PIX_FMT_NONE) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid pixel format\n");
+            return AVERROR_INVALIDDATA;
+        }
         skip_bits(gb, 3); /* reserved */
-    }
+    } else
+        avctx->pix_fmt = AV_PIX_FMT_YUV420P;
 
     ret = ff_set_dimensions(avctx, s->width, s->height);
     if (ret < 0)
         return ret;
-    if (!(avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) &&
-        (visible_width != s->width || visible_height != s->height)) {
+    if (!(avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP)) {
         avctx->width  = visible_width;
         avctx->height = visible_height;
         // translate offsets from theora axis ([0,0] lower left)
@@ -2293,9 +2345,12 @@ static int theora_decode_header(AVCodecContext *avctx, GetBitContext *gb)
 
         if ((s->offset_x & 0x1F) && !(avctx->flags & AV_CODEC_FLAG_UNALIGNED)) {
             s->offset_x &= ~0x1F;
-            av_log(avctx, AV_LOG_WARNING, "Reducing offset_x from %d to %d"
-                   "chroma samples to preserve alignment.\n",
-                   offset_x, s->offset_x);
+            if (!s->offset_x_warned) {
+                s->offset_x_warned = 1;
+                av_log(avctx, AV_LOG_WARNING, "Reducing offset_x from %d to %d"
+                    "chroma samples to preserve alignment.\n",
+                    offset_x, s->offset_x);
+            }
         }
     }
 
@@ -2309,6 +2364,7 @@ static int theora_decode_header(AVCodecContext *avctx, GetBitContext *gb)
         avctx->color_trc  = AVCOL_TRC_BT709;
     }
 
+    s->theora_header = 1;
     return 0;
 }
 
@@ -2317,6 +2373,9 @@ static int theora_decode_tables(AVCodecContext *avctx, GetBitContext *gb)
     Vp3DecodeContext *s = avctx->priv_data;
     int i, n, matrices, inter, plane;
 
+    if (!s->theora_header)
+        return AVERROR_INVALIDDATA;
+
     if (s->theora >= 0x030200) {
         n = get_bits(gb, 3);
         /* loop filter limit values table */
@@ -2426,9 +2485,12 @@ static av_cold int theora_decode_init(AVCodecContext *avctx)
     Vp3DecodeContext *s = avctx->priv_data;
     GetBitContext gb;
     int ptype;
-    uint8_t *header_start[3];
+    const uint8_t *header_start[3];
     int header_len[3];
     int i;
+    int ret;
+
+    avctx->pix_fmt = AV_PIX_FMT_YUV420P;
 
     s->theora = 1;
 
@@ -2446,7 +2508,9 @@ static av_cold int theora_decode_init(AVCodecContext *avctx)
     for (i = 0; i < 3; i++) {
         if (header_len[i] <= 0)
             continue;
-        init_get_bits(&gb, header_start[i], header_len[i] * 8);
+        ret = init_get_bits8(&gb, header_start[i], header_len[i]);
+        if (ret < 0)
+            return ret;
 
         ptype = get_bits(&gb, 8);
 
@@ -2460,7 +2524,8 @@ static av_cold int theora_decode_init(AVCodecContext *avctx)
 
         switch (ptype) {
         case 0x80:
-            theora_decode_header(avctx, &gb);
+            if (theora_decode_header(avctx, &gb) < 0)
+                return -1;
             break;
         case 0x81:
 // FIXME: is this needed? it breaks sometimes
diff --git a/libavcodec/vp3_parser.c b/libavcodec/vp3_parser.c
index e8fdcca..7ee046c 100644
--- a/libavcodec/vp3_parser.c
+++ b/libavcodec/vp3_parser.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2008 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp3data.h b/libavcodec/vp3data.h
index 5603d3b..3884bca 100644
--- a/libavcodec/vp3data.h
+++ b/libavcodec/vp3data.h
@@ -1,20 +1,20 @@
 /*
- * copyright (C) 2003 The FFmpeg project
+ * Copyright (C) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,7 +26,7 @@
 
 /* these coefficients dequantize intraframe Y plane coefficients
  * (note: same as JPEG) */
-static const int16_t vp31_intra_y_dequant[64] = {
+static const int8_t vp31_intra_y_dequant[64] = {
     16, 11, 10, 16,  24,  40,  51,  61,
     12, 12, 14, 19,  26,  58,  60,  55,
     14, 13, 16, 24,  40,  57,  69,  56,
@@ -39,7 +39,7 @@ static const int16_t vp31_intra_y_dequant[64] = {
 
 /* these coefficients dequantize intraframe C plane coefficients
  * (note: same as JPEG) */
-static const int16_t vp31_intra_c_dequant[64] = {
+static const int8_t vp31_intra_c_dequant[64] = {
     17, 18, 24, 47, 99, 99, 99, 99,
     18, 21, 26, 66, 99, 99, 99, 99,
     24, 26, 56, 99, 99, 99, 99, 99,
@@ -51,7 +51,7 @@ static const int16_t vp31_intra_c_dequant[64] = {
 };
 
 /* these coefficients dequantize interframe coefficients (all planes) */
-static const int16_t vp31_inter_dequant[64] = {
+static const int8_t vp31_inter_dequant[64] = {
     16, 16, 16, 20, 24, 28,  32,  40,
     16, 16, 20, 24, 28, 32,  40,  48,
     16, 20, 24, 28, 32, 40,  48,  64,
@@ -62,7 +62,7 @@ static const int16_t vp31_inter_dequant[64] = {
     40, 48, 64, 64, 64, 96, 128, 128
 };
 
-static const int16_t vp31_dc_scale_factor[64] = {
+static const uint8_t vp31_dc_scale_factor[64] = {
     220, 200, 190, 180, 170, 170, 160, 160,
     150, 150, 140, 140, 130, 130, 120, 120,
     110, 110, 100, 100,  90,  90,  90,  80,
@@ -176,7 +176,7 @@ static const uint8_t motion_vector_vlc_table[63][2] = {
     { 0xFC, 8 }, { 0xFD, 8 }, { 0xFE, 8 }, { 0xFF, 8 }
 };
 
-static const int motion_vector_table[63] = {
+static const int8_t motion_vector_table[63] = {
      0,   1, -1,
      2,  -2,
      3,  -3,
@@ -198,21 +198,21 @@ static const int8_t fixed_motion_vector_table[64] = {
 };
 
 /* only tokens 0..6 indicate eob runs */
-static const int eob_run_base[7] = {
+static const uint8_t eob_run_base[7] = {
     1, 2, 3, 4, 8, 16, 0
 };
-static const int eob_run_get_bits[7] = {
+static const uint8_t eob_run_get_bits[7] = {
     0, 0, 0, 2, 3, 4, 12
 };
 
-static const int zero_run_base[32] = {
+static const uint8_t zero_run_base[32] = {
     0,  0, 0, 0, 0, 0, 0,   /* 0..6 are never used */
     0,  0,                  /* 7..8 */
     0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 9..22 */
     1,  2, 3, 4, 5,         /* 23..27 */
     6, 10, 1, 2             /* 28..31 */
 };
-static const int zero_run_get_bits[32] = {
+static const uint8_t zero_run_get_bits[32] = {
     0, 0, 0, 0, 0, 0, 0,    /* 0..6 are never used */
     3, 6,                   /* 7..8 */
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 9..22 */
@@ -220,7 +220,7 @@ static const int zero_run_get_bits[32] = {
     2, 3, 0, 1              /* 28..31 */
 };
 
-static const int coeff_get_bits[32] = {
+static const uint8_t coeff_get_bits[32] = {
     0, 0, 0, 0, 0, 0, 0,    /* 0..6 are never used */
     0, 0, 0, 0, 0, 0,       /* 7..12 use constant coeffs */
     1, 1, 1, 1,             /* 13..16 are constants but still need sign bit */
diff --git a/libavcodec/vp3dsp.c b/libavcodec/vp3dsp.c
index ab65f2b..814c78e 100644
--- a/libavcodec/vp3dsp.c
+++ b/libavcodec/vp3dsp.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2004 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,7 +25,6 @@
  */
 
 #include "libavutil/attributes.h"
-#include "libavutil/intreadwrite.h"
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
 
diff --git a/libavcodec/vp3dsp.h b/libavcodec/vp3dsp.h
index 3099a7e..b95adae 100644
--- a/libavcodec/vp3dsp.h
+++ b/libavcodec/vp3dsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp5.c b/libavcodec/vp5.c
index 81c725d..5bcf9b6 100644
--- a/libavcodec/vp5.c
+++ b/libavcodec/vp5.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -35,8 +35,7 @@
 #include "vp5data.h"
 
 
-static int vp5_parse_header(VP56Context *s, const uint8_t *buf, int buf_size,
-                            int *golden_frame)
+static int vp5_parse_header(VP56Context *s, const uint8_t *buf, int buf_size)
 {
     VP56RangeCoder *c = &s->c;
     int rows, cols;
@@ -86,7 +85,7 @@ static void vp5_parse_vector_adjustment(VP56Context *s, VP56mv *vect)
 
     for (comp=0; comp<2; comp++) {
         int delta = 0;
-        if (vp56_rac_get_prob(c, model->vector_dct[comp])) {
+        if (vp56_rac_get_prob_branchy(c, model->vector_dct[comp])) {
             int sign = vp56_rac_get_prob(c, model->vector_sig[comp]);
             di  = vp56_rac_get_prob(c, model->vector_pdi[comp][0]);
             di |= vp56_rac_get_prob(c, model->vector_pdi[comp][1]) << 1;
@@ -109,19 +108,19 @@ static void vp5_parse_vector_models(VP56Context *s)
     int comp, node;
 
     for (comp=0; comp<2; comp++) {
-        if (vp56_rac_get_prob(c, vp5_vmc_pct[comp][0]))
+        if (vp56_rac_get_prob_branchy(c, vp5_vmc_pct[comp][0]))
             model->vector_dct[comp] = vp56_rac_gets_nn(c, 7);
-        if (vp56_rac_get_prob(c, vp5_vmc_pct[comp][1]))
+        if (vp56_rac_get_prob_branchy(c, vp5_vmc_pct[comp][1]))
             model->vector_sig[comp] = vp56_rac_gets_nn(c, 7);
-        if (vp56_rac_get_prob(c, vp5_vmc_pct[comp][2]))
+        if (vp56_rac_get_prob_branchy(c, vp5_vmc_pct[comp][2]))
             model->vector_pdi[comp][0] = vp56_rac_gets_nn(c, 7);
-        if (vp56_rac_get_prob(c, vp5_vmc_pct[comp][3]))
+        if (vp56_rac_get_prob_branchy(c, vp5_vmc_pct[comp][3]))
             model->vector_pdi[comp][1] = vp56_rac_gets_nn(c, 7);
     }
 
     for (comp=0; comp<2; comp++)
         for (node=0; node<7; node++)
-            if (vp56_rac_get_prob(c, vp5_vmc_pct[comp][4 + node]))
+            if (vp56_rac_get_prob_branchy(c, vp5_vmc_pct[comp][4 + node]))
                 model->vector_pdv[comp][node] = vp56_rac_gets_nn(c, 7);
 }
 
@@ -138,7 +137,7 @@ static int vp5_parse_coeff_models(VP56Context *s)
 
     for (pt=0; pt<2; pt++)
         for (node=0; node<11; node++)
-            if (vp56_rac_get_prob(c, vp5_dccv_pct[pt][node])) {
+            if (vp56_rac_get_prob_branchy(c, vp5_dccv_pct[pt][node])) {
                 def_prob[node] = vp56_rac_gets_nn(c, 7);
                 model->coeff_dccv[pt][node] = def_prob[node];
             } else if (s->frames[VP56_FRAME_CURRENT]->key_frame) {
@@ -149,7 +148,7 @@ static int vp5_parse_coeff_models(VP56Context *s)
         for (pt=0; pt<2; pt++)
             for (cg=0; cg<6; cg++)
                 for (node=0; node<11; node++)
-                    if (vp56_rac_get_prob(c, vp5_ract_pct[ct][pt][cg][node])) {
+                    if (vp56_rac_get_prob_branchy(c, vp5_ract_pct[ct][pt][cg][node])) {
                         def_prob[node] = vp56_rac_gets_nn(c, 7);
                         model->coeff_ract[pt][ct][cg][node] = def_prob[node];
                     } else if (s->frames[VP56_FRAME_CURRENT]->key_frame) {
@@ -194,9 +193,9 @@ static void vp5_parse_coeff(VP56Context *s)
 
         coeff_idx = 0;
         for (;;) {
-            if (vp56_rac_get_prob(c, model2[0])) {
-                if (vp56_rac_get_prob(c, model2[2])) {
-                    if (vp56_rac_get_prob(c, model2[3])) {
+            if (vp56_rac_get_prob_branchy(c, model2[0])) {
+                if (vp56_rac_get_prob_branchy(c, model2[2])) {
+                    if (vp56_rac_get_prob_branchy(c, model2[3])) {
                         s->coeff_ctx[ff_vp56_b6to4[b]][coeff_idx] = 4;
                         idx = vp56_rac_get_tree(c, ff_vp56_pc_tree, model1);
                         sign = vp56_rac_get(c);
@@ -204,7 +203,7 @@ static void vp5_parse_coeff(VP56Context *s)
                         for (i=ff_vp56_coeff_bit_length[idx]; i>=0; i--)
                             coeff += vp56_rac_get_prob(c, ff_vp56_coeff_parse_table[idx][i]) << i;
                     } else {
-                        if (vp56_rac_get_prob(c, model2[4])) {
+                        if (vp56_rac_get_prob_branchy(c, model2[4])) {
                             coeff = 3 + vp56_rac_get_prob(c, model1[5]);
                             s->coeff_ctx[ff_vp56_b6to4[b]][coeff_idx] = 3;
                         } else {
@@ -225,7 +224,7 @@ static void vp5_parse_coeff(VP56Context *s)
                     coeff *= s->dequant_ac;
                 s->block_coeff[b][permute[coeff_idx]] = coeff;
             } else {
-                if (ct && !vp56_rac_get_prob(c, model2[1]))
+                if (ct && !vp56_rac_get_prob_branchy(c, model2[1]))
                     break;
                 ct = 0;
                 s->coeff_ctx[ff_vp56_b6to4[b]][coeff_idx] = 0;
diff --git a/libavcodec/vp56.c b/libavcodec/vp56.c
index 2f1de5a..6319248 100644
--- a/libavcodec/vp56.c
+++ b/libavcodec/vp56.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -83,16 +83,16 @@ static void vp56_parse_mb_type_models(VP56Context *s)
     int i, ctx, type;
 
     for (ctx=0; ctx<3; ctx++) {
-        if (vp56_rac_get_prob(c, 174)) {
+        if (vp56_rac_get_prob_branchy(c, 174)) {
             int idx = vp56_rac_gets(c, 4);
             memcpy(model->mb_types_stats[ctx],
                    ff_vp56_pre_def_mb_type_stats[idx][ctx],
                    sizeof(model->mb_types_stats[ctx]));
         }
-        if (vp56_rac_get_prob(c, 254)) {
+        if (vp56_rac_get_prob_branchy(c, 254)) {
             for (type=0; type<10; type++) {
                 for(i=0; i<2; i++) {
-                    if (vp56_rac_get_prob(c, 205)) {
+                    if (vp56_rac_get_prob_branchy(c, 205)) {
                         int delta, sign = vp56_rac_get(c);
 
                         delta = vp56_rac_get_tree(c, ff_vp56_pmbtm_tree,
@@ -153,7 +153,7 @@ static VP56mb vp56_parse_mb_type(VP56Context *s,
     uint8_t *mb_type_model = s->modelp->mb_type[ctx][prev_type];
     VP56RangeCoder *c = &s->c;
 
-    if (vp56_rac_get_prob(c, mb_type_model[0]))
+    if (vp56_rac_get_prob_branchy(c, mb_type_model[0]))
         return prev_type;
     else
         return vp56_rac_get_tree(c, ff_vp56_pmbt_tree, mb_type_model);
@@ -340,11 +340,11 @@ static void vp56_mc(VP56Context *s, int b, int plane, uint8_t *src,
     if (x<0 || x+12>=s->plane_width[plane] ||
         y<0 || y+12>=s->plane_height[plane]) {
         s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
-                            src + s->block_offset[b] + (dy-2)*stride + (dx-2),
-                            stride, stride,
-                            12, 12, x, y,
-                            s->plane_width[plane],
-                            s->plane_height[plane]);
+                                 src + s->block_offset[b] + (dy-2)*stride + (dx-2),
+                                 stride, stride,
+                                 12, 12, x, y,
+                                 s->plane_width[plane],
+                                 s->plane_height[plane]);
         src_block = s->edge_emu_buffer;
         src_offset = 2 + 2*stride;
     } else if (deblock_filtering) {
@@ -453,9 +453,9 @@ static void vp56_decode_mb(VP56Context *s, int row, int col, int is_alpha)
     }
 }
 
-static int vp56_size_changed(AVCodecContext *avctx)
+static int vp56_size_changed(VP56Context *s)
 {
-    VP56Context *s = avctx->priv_data;
+    AVCodecContext *avctx = s->avctx;
     int stride = s->frames[VP56_FRAME_CURRENT]->linesize[0];
     int i;
 
@@ -476,19 +476,26 @@ static int vp56_size_changed(AVCodecContext *avctx)
         return AVERROR_INVALIDDATA;
     }
 
-    s->above_blocks = av_realloc(s->above_blocks,
-                                 (4*s->mb_width+6) * sizeof(*s->above_blocks));
-    s->macroblocks = av_realloc(s->macroblocks,
-                                s->mb_width*s->mb_height*sizeof(*s->macroblocks));
+    av_reallocp_array(&s->above_blocks, 4*s->mb_width+6,
+                      sizeof(*s->above_blocks));
+    av_reallocp_array(&s->macroblocks, s->mb_width*s->mb_height,
+                      sizeof(*s->macroblocks));
     av_free(s->edge_emu_buffer_alloc);
     s->edge_emu_buffer_alloc = av_malloc(16*stride);
     s->edge_emu_buffer = s->edge_emu_buffer_alloc;
+    if (!s->above_blocks || !s->macroblocks || !s->edge_emu_buffer_alloc)
+        return AVERROR(ENOMEM);
     if (s->flip < 0)
         s->edge_emu_buffer += 15 * stride;
 
+    if (s->alpha_context)
+        return vp56_size_changed(s->alpha_context);
+
     return 0;
 }
 
+static int ff_vp56_decode_mbs(AVCodecContext *avctx, void *, int, int);
+
 int ff_vp56_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                          AVPacket *avpkt)
 {
@@ -496,8 +503,9 @@ int ff_vp56_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     VP56Context *s = avctx->priv_data;
     AVFrame *const p = s->frames[VP56_FRAME_CURRENT];
     int remaining_buf_size = avpkt->size;
-    int is_alpha, av_uninit(alpha_offset);
-    int res;
+    int av_uninit(alpha_offset);
+    int i, res;
+    int ret;
 
     if (s->has_alpha) {
         if (remaining_buf_size < 3)
@@ -508,156 +516,184 @@ int ff_vp56_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
             return AVERROR_INVALIDDATA;
     }
 
-    for (is_alpha=0; is_alpha < 1+s->has_alpha; is_alpha++) {
-        int mb_row, mb_col, mb_row_flip, mb_offset = 0;
-        int block, y, uv;
-        ptrdiff_t stride_y, stride_uv;
-        int golden_frame = 0;
+    res = s->parse_header(s, buf, remaining_buf_size);
+    if (res < 0)
+        return res;
 
-        s->modelp = &s->models[is_alpha];
+    if (res == VP56_SIZE_CHANGE) {
+        for (i = 0; i < 4; i++) {
+            av_frame_unref(s->frames[i]);
+            if (s->alpha_context)
+                av_frame_unref(s->alpha_context->frames[i]);
+        }
+    }
 
-        res = s->parse_header(s, buf, remaining_buf_size, &golden_frame);
-        if (res < 0) {
-            int i;
-            for (i = 0; i < 4; i++)
-                av_frame_unref(s->frames[i]);
-            return res;
+    ret = ff_get_buffer(avctx, p, AV_GET_BUFFER_FLAG_REF);
+    if (ret < 0)
+        return ret;
+
+    if (avctx->pix_fmt == AV_PIX_FMT_YUVA420P) {
+        av_frame_unref(s->alpha_context->frames[VP56_FRAME_CURRENT]);
+        if ((ret = av_frame_ref(s->alpha_context->frames[VP56_FRAME_CURRENT], p)) < 0) {
+            av_frame_unref(p);
+            return ret;
         }
+    }
 
-        if (res == VP56_SIZE_CHANGE) {
-            int i;
-            for (i = 0; i < 4; i++)
-                av_frame_unref(s->frames[i]);
-            if (is_alpha) {
-                ff_set_dimensions(avctx, 0, 0);
-                return AVERROR_INVALIDDATA;
-            }
+    if (res == VP56_SIZE_CHANGE) {
+        if (vp56_size_changed(s)) {
+            av_frame_unref(p);
+            return AVERROR_INVALIDDATA;
         }
+    }
 
-        if (!is_alpha) {
-            int ret = ff_get_buffer(avctx, p, AV_GET_BUFFER_FLAG_REF);
-            if (ret < 0) {
-                av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-                return ret;
+    if (avctx->pix_fmt == AV_PIX_FMT_YUVA420P) {
+        int bak_w = avctx->width;
+        int bak_h = avctx->height;
+        int bak_cw = avctx->coded_width;
+        int bak_ch = avctx->coded_height;
+        buf += alpha_offset;
+        remaining_buf_size -= alpha_offset;
+
+        res = s->alpha_context->parse_header(s->alpha_context, buf, remaining_buf_size);
+        if (res != 0) {
+            if(res==VP56_SIZE_CHANGE) {
+                av_log(avctx, AV_LOG_ERROR, "Alpha reconfiguration\n");
+                avctx->width  = bak_w;
+                avctx->height = bak_h;
+                avctx->coded_width  = bak_cw;
+                avctx->coded_height = bak_ch;
             }
-
-            if (res == VP56_SIZE_CHANGE)
-                if (vp56_size_changed(avctx)) {
-                    av_frame_unref(p);
-                    return AVERROR_INVALIDDATA;
-                }
+            av_frame_unref(p);
+            return AVERROR_INVALIDDATA;
         }
+    }
 
-        if (p->key_frame) {
-            p->pict_type = AV_PICTURE_TYPE_I;
-            s->default_models_init(s);
-            for (block=0; block<s->mb_height*s->mb_width; block++)
-                s->macroblocks[block].type = VP56_MB_INTRA;
-        } else {
-            p->pict_type = AV_PICTURE_TYPE_P;
-            vp56_parse_mb_type_models(s);
-            s->parse_vector_models(s);
-            s->mb_type = VP56_MB_INTER_NOVEC_PF;
-        }
+    avctx->execute2(avctx, ff_vp56_decode_mbs, 0, 0, (avctx->pix_fmt == AV_PIX_FMT_YUVA420P) + 1);
 
-        if (s->parse_coeff_models(s))
-            goto next;
+    if ((res = av_frame_ref(data, p)) < 0)
+        return res;
+    *got_frame = 1;
 
-        memset(s->prev_dc, 0, sizeof(s->prev_dc));
-        s->prev_dc[1][VP56_FRAME_CURRENT] = 128;
-        s->prev_dc[2][VP56_FRAME_CURRENT] = 128;
+    return avpkt->size;
+}
 
-        for (block=0; block < 4*s->mb_width+6; block++) {
-            s->above_blocks[block].ref_frame = VP56_FRAME_NONE;
-            s->above_blocks[block].dc_coeff = 0;
-            s->above_blocks[block].not_null_dc = 0;
-        }
-        s->above_blocks[2*s->mb_width + 2].ref_frame = VP56_FRAME_CURRENT;
-        s->above_blocks[3*s->mb_width + 4].ref_frame = VP56_FRAME_CURRENT;
+static int ff_vp56_decode_mbs(AVCodecContext *avctx, void *data,
+                              int jobnr, int threadnr)
+{
+    VP56Context *s0 = avctx->priv_data;
+    int is_alpha = (jobnr == 1);
+    VP56Context *s = is_alpha ? s0->alpha_context : s0;
+    AVFrame *const p = s->frames[VP56_FRAME_CURRENT];
+    int mb_row, mb_col, mb_row_flip, mb_offset = 0;
+    int block, y, uv;
+    ptrdiff_t stride_y, stride_uv;
+    int res;
+
+    if (p->key_frame) {
+        p->pict_type = AV_PICTURE_TYPE_I;
+        s->default_models_init(s);
+        for (block=0; block<s->mb_height*s->mb_width; block++)
+            s->macroblocks[block].type = VP56_MB_INTRA;
+    } else {
+        p->pict_type = AV_PICTURE_TYPE_P;
+        vp56_parse_mb_type_models(s);
+        s->parse_vector_models(s);
+        s->mb_type = VP56_MB_INTER_NOVEC_PF;
+    }
+
+    if (s->parse_coeff_models(s))
+        goto next;
 
-        stride_y  = p->linesize[0];
-        stride_uv = p->linesize[1];
+    memset(s->prev_dc, 0, sizeof(s->prev_dc));
+    s->prev_dc[1][VP56_FRAME_CURRENT] = 128;
+    s->prev_dc[2][VP56_FRAME_CURRENT] = 128;
 
+    for (block=0; block < 4*s->mb_width+6; block++) {
+        s->above_blocks[block].ref_frame = VP56_FRAME_NONE;
+        s->above_blocks[block].dc_coeff = 0;
+        s->above_blocks[block].not_null_dc = 0;
+    }
+    s->above_blocks[2*s->mb_width + 2].ref_frame = VP56_FRAME_CURRENT;
+    s->above_blocks[3*s->mb_width + 4].ref_frame = VP56_FRAME_CURRENT;
+
+    stride_y  = p->linesize[0];
+    stride_uv = p->linesize[1];
+
+    if (s->flip < 0)
+        mb_offset = 7;
+
+    /* main macroblocks loop */
+    for (mb_row=0; mb_row<s->mb_height; mb_row++) {
         if (s->flip < 0)
-            mb_offset = 7;
-
-        /* main macroblocks loop */
-        for (mb_row=0; mb_row<s->mb_height; mb_row++) {
-            if (s->flip < 0)
-                mb_row_flip = s->mb_height - mb_row - 1;
-            else
-                mb_row_flip = mb_row;
-
-            for (block=0; block<4; block++) {
-                s->left_block[block].ref_frame = VP56_FRAME_NONE;
-                s->left_block[block].dc_coeff = 0;
-                s->left_block[block].not_null_dc = 0;
-            }
-            memset(s->coeff_ctx, 0, sizeof(s->coeff_ctx));
-            memset(s->coeff_ctx_last, 24, sizeof(s->coeff_ctx_last));
-
-            s->above_block_idx[0] = 1;
-            s->above_block_idx[1] = 2;
-            s->above_block_idx[2] = 1;
-            s->above_block_idx[3] = 2;
-            s->above_block_idx[4] = 2*s->mb_width + 2 + 1;
-            s->above_block_idx[5] = 3*s->mb_width + 4 + 1;
-
-            s->block_offset[s->frbi] = (mb_row_flip*16 + mb_offset) * stride_y;
-            s->block_offset[s->srbi] = s->block_offset[s->frbi] + 8*stride_y;
-            s->block_offset[1] = s->block_offset[0] + 8;
-            s->block_offset[3] = s->block_offset[2] + 8;
-            s->block_offset[4] = (mb_row_flip*8 + mb_offset) * stride_uv;
-            s->block_offset[5] = s->block_offset[4];
-
-            for (mb_col=0; mb_col<s->mb_width; mb_col++) {
-                vp56_decode_mb(s, mb_row, mb_col, is_alpha);
-
-                for (y=0; y<4; y++) {
-                    s->above_block_idx[y] += 2;
-                    s->block_offset[y] += 16;
-                }
+            mb_row_flip = s->mb_height - mb_row - 1;
+        else
+            mb_row_flip = mb_row;
 
-                for (uv=4; uv<6; uv++) {
-                    s->above_block_idx[uv] += 1;
-                    s->block_offset[uv] += 8;
-                }
-            }
+        for (block=0; block<4; block++) {
+            s->left_block[block].ref_frame = VP56_FRAME_NONE;
+            s->left_block[block].dc_coeff = 0;
+            s->left_block[block].not_null_dc = 0;
         }
+        memset(s->coeff_ctx, 0, sizeof(s->coeff_ctx));
+        memset(s->coeff_ctx_last, 24, sizeof(s->coeff_ctx_last));
+
+        s->above_block_idx[0] = 1;
+        s->above_block_idx[1] = 2;
+        s->above_block_idx[2] = 1;
+        s->above_block_idx[3] = 2;
+        s->above_block_idx[4] = 2*s->mb_width + 2 + 1;
+        s->above_block_idx[5] = 3*s->mb_width + 4 + 1;
+
+        s->block_offset[s->frbi] = (mb_row_flip*16 + mb_offset) * stride_y;
+        s->block_offset[s->srbi] = s->block_offset[s->frbi] + 8*stride_y;
+        s->block_offset[1] = s->block_offset[0] + 8;
+        s->block_offset[3] = s->block_offset[2] + 8;
+        s->block_offset[4] = (mb_row_flip*8 + mb_offset) * stride_uv;
+        s->block_offset[5] = s->block_offset[4];
+
+        for (mb_col=0; mb_col<s->mb_width; mb_col++) {
+            vp56_decode_mb(s, mb_row, mb_col, is_alpha);
+
+            for (y=0; y<4; y++) {
+                s->above_block_idx[y] += 2;
+                s->block_offset[y] += 16;
+            }
 
-    next:
-        if (p->key_frame || golden_frame) {
-            av_frame_unref(s->frames[VP56_FRAME_GOLDEN]);
-            if ((res = av_frame_ref(s->frames[VP56_FRAME_GOLDEN], p)) < 0)
-                return res;
+            for (uv=4; uv<6; uv++) {
+                s->above_block_idx[uv] += 1;
+                s->block_offset[uv] += 8;
+            }
         }
+    }
 
-        if (s->has_alpha) {
-            FFSWAP(AVFrame *, s->frames[VP56_FRAME_GOLDEN],
-                              s->frames[VP56_FRAME_GOLDEN2]);
-            buf += alpha_offset;
-            remaining_buf_size -= alpha_offset;
-        }
+next:
+    if (p->key_frame || s->golden_frame) {
+        av_frame_unref(s->frames[VP56_FRAME_GOLDEN]);
+        if ((res = av_frame_ref(s->frames[VP56_FRAME_GOLDEN], p)) < 0)
+            return res;
     }
 
     av_frame_unref(s->frames[VP56_FRAME_PREVIOUS]);
     FFSWAP(AVFrame *, s->frames[VP56_FRAME_CURRENT],
                       s->frames[VP56_FRAME_PREVIOUS]);
-
-    if ((res = av_frame_ref(data, p)) < 0)
-        return res;
-    *got_frame = 1;
-
-    return avpkt->size;
+    return 0;
 }
 
 av_cold int ff_vp56_init(AVCodecContext *avctx, int flip, int has_alpha)
 {
     VP56Context *s = avctx->priv_data;
+    return ff_vp56_init_context(avctx, s, flip, has_alpha);
+}
+
+av_cold int ff_vp56_init_context(AVCodecContext *avctx, VP56Context *s,
+                                  int flip, int has_alpha)
+{
     int i;
 
     s->avctx = avctx;
     avctx->pix_fmt = has_alpha ? AV_PIX_FMT_YUVA420P : AV_PIX_FMT_YUV420P;
+    if (avctx->skip_alpha) avctx->pix_fmt = AV_PIX_FMT_YUV420P;
 
     ff_h264chroma_init(&s->h264chroma, 8);
     ff_hpeldsp_init(&s->hdsp, avctx->flags);
@@ -665,7 +701,7 @@ av_cold int ff_vp56_init(AVCodecContext *avctx, int flip, int has_alpha)
     ff_vp3dsp_init(&s->vp3dsp, avctx->flags);
     ff_vp56dsp_init(&s->vp56dsp, avctx->codec->id);
     for (i = 0; i < 64; i++) {
-#define TRANSPOSE(x) (x >> 3) | ((x & 7) << 3)
+#define TRANSPOSE(x) (((x) >> 3) | (((x) & 7) << 3))
         s->idct_scantable[i] = TRANSPOSE(ff_zigzag_direct[i]);
 #undef TRANSPOSE
     }
@@ -683,10 +719,14 @@ av_cold int ff_vp56_init(AVCodecContext *avctx, int flip, int has_alpha)
     s->macroblocks = NULL;
     s->quantizer = -1;
     s->deblock_filtering = 1;
+    s->golden_frame = 0;
 
     s->filter = NULL;
 
     s->has_alpha = has_alpha;
+
+    s->modelp = &s->model;
+
     if (flip) {
         s->flip = -1;
         s->frbi = 2;
@@ -703,6 +743,11 @@ av_cold int ff_vp56_init(AVCodecContext *avctx, int flip, int has_alpha)
 av_cold int ff_vp56_free(AVCodecContext *avctx)
 {
     VP56Context *s = avctx->priv_data;
+    return ff_vp56_free_context(s);
+}
+
+av_cold int ff_vp56_free_context(VP56Context *s)
+{
     int i;
 
     av_freep(&s->above_blocks);
diff --git a/libavcodec/vp56.h b/libavcodec/vp56.h
index f2ed770..56c3091 100644
--- a/libavcodec/vp56.h
+++ b/libavcodec/vp56.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -79,7 +79,7 @@ typedef void (*VP56DefaultModelsInit)(VP56Context *s);
 typedef void (*VP56ParseVectorModels)(VP56Context *s);
 typedef int  (*VP56ParseCoeffModels)(VP56Context *s);
 typedef int  (*VP56ParseHeader)(VP56Context *s, const uint8_t *buf,
-                                int buf_size, int *golden_frame);
+                                int buf_size);
 
 typedef struct VP56RangeCoder {
     int high;
@@ -135,6 +135,7 @@ struct vp56_context {
     int sub_version;
 
     /* frame info */
+    int golden_frame;
     int plane_width[4];
     int plane_height[4];
     int mb_width;   /* number of horizontal MB */
@@ -189,8 +190,11 @@ struct vp56_context {
     VP56ParseCoeffModels parse_coeff_models;
     VP56ParseHeader parse_header;
 
+    /* separate context for the alpha plane: "slice" parallelism between YUV and A */
+    VP56Context *alpha_context;
+
     VP56Model *modelp;
-    VP56Model models[2];
+    VP56Model model;
 
     /* huffman decoding */
     int use_huffman;
@@ -203,7 +207,10 @@ struct vp56_context {
 
 
 int ff_vp56_init(AVCodecContext *avctx, int flip, int has_alpha);
+int ff_vp56_init_context(AVCodecContext *avctx, VP56Context *s,
+                          int flip, int has_alpha);
 int ff_vp56_free(AVCodecContext *avctx);
+int ff_vp56_free_context(VP56Context *s);
 void ff_vp56_init_dequant(VP56Context *s, int quantizer);
 int ff_vp56_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                          AVPacket *avpkt);
@@ -356,7 +363,7 @@ int vp56_rac_get_tree(VP56RangeCoder *c,
                       const uint8_t *probs)
 {
     while (tree->val > 0) {
-        if (vp56_rac_get_prob(c, probs[tree->prob_idx]))
+        if (vp56_rac_get_prob_branchy(c, probs[tree->prob_idx]))
             tree += tree->val;
         else
             tree++;
@@ -364,15 +371,13 @@ int vp56_rac_get_tree(VP56RangeCoder *c,
     return -tree->val;
 }
 
-/**
- * This is identical to vp8_rac_get_tree except for the possibility of starting
- * on a node other than the root node, needed for coeff decode where this is
- * used to save a bit after a 0 token (by disallowing EOB to immediately follow.)
- */
-static av_always_inline
-int vp8_rac_get_tree_with_offset(VP56RangeCoder *c, const int8_t (*tree)[2],
-                                 const uint8_t *probs, int i)
+// Unlike vp56_rac_get_tree(), probabilities here are indexed by tree node (probs[i]).
+// The vp56 scheme could express this too, but this form needs one fewer branch per decision.
+static av_always_inline int vp8_rac_get_tree(VP56RangeCoder *c, const int8_t (*tree)[2],
+                                   const uint8_t *probs)
 {
+    int i = 0;
+
     do {
         i = tree[i][vp56_rac_get_prob(c, probs[i])];
     } while (i > 0);
@@ -380,15 +385,6 @@ int vp8_rac_get_tree_with_offset(VP56RangeCoder *c, const int8_t (*tree)[2],
     return -i;
 }
 
-// how probabilities are associated with decisions is different I think
-// well, the new scheme fits in the old but this way has one fewer branches per decision
-static av_always_inline
-int vp8_rac_get_tree(VP56RangeCoder *c, const int8_t (*tree)[2],
-                     const uint8_t *probs)
-{
-    return vp8_rac_get_tree_with_offset(c, tree, probs, 0);
-}
-
 // DCTextra
 static av_always_inline int vp8_rac_get_coeff(VP56RangeCoder *c, const uint8_t *prob)
 {
diff --git a/libavcodec/vp56data.c b/libavcodec/vp56data.c
index 989c76a..0080370 100644
--- a/libavcodec/vp56data.c
+++ b/libavcodec/vp56data.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp56data.h b/libavcodec/vp56data.h
index 21907bd..3be268c 100644
--- a/libavcodec/vp56data.h
+++ b/libavcodec/vp56data.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp56dsp.c b/libavcodec/vp56dsp.c
index 5e09d24..fa533ec 100644
--- a/libavcodec/vp56dsp.c
+++ b/libavcodec/vp56dsp.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2006 Aurelien Jacobs <aurel@gnuage.org>
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp56dsp.h b/libavcodec/vp56dsp.h
index 389d359..7807baa 100644
--- a/libavcodec/vp56dsp.h
+++ b/libavcodec/vp56dsp.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp56rac.c b/libavcodec/vp56rac.c
index 270a3ca..6061b7e 100644
--- a/libavcodec/vp56rac.c
+++ b/libavcodec/vp56rac.c
@@ -2,20 +2,20 @@
  * VP5/6/8 decoder
  * Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp5data.h b/libavcodec/vp5data.h
index b11b99d..e16ff2d 100644
--- a/libavcodec/vp5data.h
+++ b/libavcodec/vp5data.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp6.c b/libavcodec/vp6.c
index c48c2b8..a2bb457 100644
--- a/libavcodec/vp6.c
+++ b/libavcodec/vp6.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,8 +43,7 @@
 static void vp6_parse_coeff(VP56Context *s);
 static void vp6_parse_coeff_huffman(VP56Context *s);
 
-static int vp6_parse_header(VP56Context *s, const uint8_t *buf, int buf_size,
-                            int *golden_frame)
+static int vp6_parse_header(VP56Context *s, const uint8_t *buf, int buf_size)
 {
     VP56RangeCoder *c = &s->c;
     int parse_filter_info = 0;
@@ -113,6 +112,7 @@ static int vp6_parse_header(VP56Context *s, const uint8_t *buf, int buf_size,
         if (sub_version < 8)
             vrt_shift = 5;
         s->sub_version = sub_version;
+        s->golden_frame = 0;
     } else {
         if (!s->sub_version || !s->avctx->coded_width || !s->avctx->coded_height)
             return AVERROR_INVALIDDATA;
@@ -124,7 +124,7 @@ static int vp6_parse_header(VP56Context *s, const uint8_t *buf, int buf_size,
         }
         ff_vp56_init_range_decoder(c, buf+1, buf_size-1);
 
-        *golden_frame = vp56_rac_get(c);
+        s->golden_frame = vp56_rac_get(c);
         if (s->filter_header) {
             s->deblock_filtering = vp56_rac_get(c);
             if (s->deblock_filtering)
@@ -211,20 +211,20 @@ static void vp6_parse_vector_models(VP56Context *s)
     int comp, node;
 
     for (comp=0; comp<2; comp++) {
-        if (vp56_rac_get_prob(c, vp6_sig_dct_pct[comp][0]))
+        if (vp56_rac_get_prob_branchy(c, vp6_sig_dct_pct[comp][0]))
             model->vector_dct[comp] = vp56_rac_gets_nn(c, 7);
-        if (vp56_rac_get_prob(c, vp6_sig_dct_pct[comp][1]))
+        if (vp56_rac_get_prob_branchy(c, vp6_sig_dct_pct[comp][1]))
             model->vector_sig[comp] = vp56_rac_gets_nn(c, 7);
     }
 
     for (comp=0; comp<2; comp++)
         for (node=0; node<7; node++)
-            if (vp56_rac_get_prob(c, vp6_pdv_pct[comp][node]))
+            if (vp56_rac_get_prob_branchy(c, vp6_pdv_pct[comp][node]))
                 model->vector_pdv[comp][node] = vp56_rac_gets_nn(c, 7);
 
     for (comp=0; comp<2; comp++)
         for (node=0; node<8; node++)
-            if (vp56_rac_get_prob(c, vp6_fdv_pct[comp][node]))
+            if (vp56_rac_get_prob_branchy(c, vp6_fdv_pct[comp][node]))
                 model->vector_fdv[comp][node] = vp56_rac_gets_nn(c, 7);
 }
 
@@ -270,7 +270,7 @@ static int vp6_parse_coeff_models(VP56Context *s)
 
     for (pt=0; pt<2; pt++)
         for (node=0; node<11; node++)
-            if (vp56_rac_get_prob(c, vp6_dccv_pct[pt][node])) {
+            if (vp56_rac_get_prob_branchy(c, vp6_dccv_pct[pt][node])) {
                 def_prob[node] = vp56_rac_gets_nn(c, 7);
                 model->coeff_dccv[pt][node] = def_prob[node];
             } else if (s->frames[VP56_FRAME_CURRENT]->key_frame) {
@@ -279,21 +279,21 @@ static int vp6_parse_coeff_models(VP56Context *s)
 
     if (vp56_rac_get(c)) {
         for (pos=1; pos<64; pos++)
-            if (vp56_rac_get_prob(c, vp6_coeff_reorder_pct[pos]))
+            if (vp56_rac_get_prob_branchy(c, vp6_coeff_reorder_pct[pos]))
                 model->coeff_reorder[pos] = vp56_rac_gets(c, 4);
         vp6_coeff_order_table_init(s);
     }
 
     for (cg=0; cg<2; cg++)
         for (node=0; node<14; node++)
-            if (vp56_rac_get_prob(c, vp6_runv_pct[cg][node]))
+            if (vp56_rac_get_prob_branchy(c, vp6_runv_pct[cg][node]))
                 model->coeff_runv[cg][node] = vp56_rac_gets_nn(c, 7);
 
     for (ct=0; ct<3; ct++)
         for (pt=0; pt<2; pt++)
             for (cg=0; cg<6; cg++)
                 for (node=0; node<11; node++)
-                    if (vp56_rac_get_prob(c, vp6_ract_pct[ct][pt][cg][node])) {
+                    if (vp56_rac_get_prob_branchy(c, vp6_ract_pct[ct][pt][cg][node])) {
                         def_prob[node] = vp56_rac_gets_nn(c, 7);
                         model->coeff_ract[pt][ct][cg][node] = def_prob[node];
                     } else if (s->frames[VP56_FRAME_CURRENT]->key_frame) {
@@ -339,7 +339,7 @@ static void vp6_parse_vector_adjustment(VP56Context *s, VP56mv *vect)
     for (comp=0; comp<2; comp++) {
         int i, delta = 0;
 
-        if (vp56_rac_get_prob(c, model->vector_dct[comp])) {
+        if (vp56_rac_get_prob_branchy(c, model->vector_dct[comp])) {
             static const uint8_t prob_order[] = {0, 1, 2, 7, 6, 5, 4};
             for (i=0; i<sizeof(prob_order); i++) {
                 int j = prob_order[i];
@@ -354,7 +354,7 @@ static void vp6_parse_vector_adjustment(VP56Context *s, VP56mv *vect)
                                       model->vector_pdv[comp]);
         }
 
-        if (delta && vp56_rac_get_prob(c, model->vector_sig[comp]))
+        if (delta && vp56_rac_get_prob_branchy(c, model->vector_sig[comp]))
             delta = -delta;
 
         if (!comp)
@@ -462,16 +462,16 @@ static void vp6_parse_coeff(VP56Context *s)
 
         coeff_idx = 0;
         for (;;) {
-            if ((coeff_idx>1 && ct==0) || vp56_rac_get_prob(c, model2[0])) {
+            if ((coeff_idx>1 && ct==0) || vp56_rac_get_prob_branchy(c, model2[0])) {
                 /* parse a coeff */
-                if (vp56_rac_get_prob(c, model2[2])) {
-                    if (vp56_rac_get_prob(c, model2[3])) {
+                if (vp56_rac_get_prob_branchy(c, model2[2])) {
+                    if (vp56_rac_get_prob_branchy(c, model2[3])) {
                         idx = vp56_rac_get_tree(c, ff_vp56_pc_tree, model1);
                         coeff = ff_vp56_coeff_bias[idx+5];
                         for (i=ff_vp56_coeff_bit_length[idx]; i>=0; i--)
                             coeff += vp56_rac_get_prob(c, ff_vp56_coeff_parse_table[idx][i]) << i;
                     } else {
-                        if (vp56_rac_get_prob(c, model2[4]))
+                        if (vp56_rac_get_prob_branchy(c, model2[4]))
                             coeff = 3 + vp56_rac_get_prob(c, model1[5]);
                         else
                             coeff = 2;
@@ -492,7 +492,7 @@ static void vp6_parse_coeff(VP56Context *s)
                 /* parse a run */
                 ct = 0;
                 if (coeff_idx > 0) {
-                    if (!vp56_rac_get_prob(c, model2[1]))
+                    if (!vp56_rac_get_prob_branchy(c, model2[1]))
                         break;
 
                     model3 = model->coeff_runv[coeff_idx >= 6];
@@ -604,6 +604,8 @@ static void vp6_filter(VP56Context *s, uint8_t *dst, uint8_t *src,
     }
 }
 
+static av_cold void vp6_decode_init_context(VP56Context *s);
+
 static av_cold int vp6_decode_init(AVCodecContext *avctx)
 {
     VP56Context *s = avctx->priv_data;
@@ -613,6 +615,21 @@ static av_cold int vp6_decode_init(AVCodecContext *avctx)
                             avctx->codec->id == AV_CODEC_ID_VP6A)) < 0)
         return ret;
 
+    vp6_decode_init_context(s);
+
+    if (s->has_alpha) {
+        s->alpha_context = av_mallocz(sizeof(VP56Context));
+        ff_vp56_init_context(avctx, s->alpha_context,
+                             s->flip == -1, s->has_alpha);
+        vp6_decode_init_context(s->alpha_context);
+    }
+
+    return 0;
+}
+
+static av_cold void vp6_decode_init_context(VP56Context *s)
+{
+    s->deblock_filtering = 0;
     s->vp56_coord_div = vp6_coord_div;
     s->parse_vector_adjustment = vp6_parse_vector_adjustment;
     s->filter = vp6_filter;
@@ -620,16 +637,29 @@ static av_cold int vp6_decode_init(AVCodecContext *avctx)
     s->parse_vector_models = vp6_parse_vector_models;
     s->parse_coeff_models = vp6_parse_coeff_models;
     s->parse_header = vp6_parse_header;
-
-    return 0;
 }
 
+static av_cold void vp6_decode_free_context(VP56Context *s);
+
 static av_cold int vp6_decode_free(AVCodecContext *avctx)
 {
     VP56Context *s = avctx->priv_data;
-    int pt, ct, cg;
 
     ff_vp56_free(avctx);
+    vp6_decode_free_context(s);
+
+    if (s->alpha_context) {
+        ff_vp56_free_context(s->alpha_context);
+        vp6_decode_free_context(s->alpha_context);
+        av_freep(&s->alpha_context);
+    }
+
+    return 0;
+}
+
+static av_cold void vp6_decode_free_context(VP56Context *s)
+{
+    int pt, ct, cg;
 
     for (pt=0; pt<2; pt++) {
         ff_free_vlc(&s->dccv_vlc[pt]);
@@ -638,7 +668,6 @@ static av_cold int vp6_decode_free(AVCodecContext *avctx)
             for (cg=0; cg<6; cg++)
                 ff_free_vlc(&s->ract_vlc[pt][ct][cg]);
     }
-    return 0;
 }
 
 AVCodec ff_vp6_decoder = {
@@ -676,5 +705,5 @@ AVCodec ff_vp6a_decoder = {
     .init           = vp6_decode_init,
     .close          = vp6_decode_free,
     .decode         = ff_vp56_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS,
 };
diff --git a/libavcodec/vp6data.h b/libavcodec/vp6data.h
index 2de90e7..539e19a 100644
--- a/libavcodec/vp6data.h
+++ b/libavcodec/vp6data.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp6dsp.c b/libavcodec/vp6dsp.c
index 54a96ed..67c6be0 100644
--- a/libavcodec/vp6dsp.c
+++ b/libavcodec/vp6dsp.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index d5f8fbb..e60705a 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -7,20 +7,20 @@
  * Copyright (C) 2012 Daniel Kang
  * Copyright (C) 2014 Peter Ross
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,6 +38,14 @@
 #   include "arm/vp8.h"
 #endif
 
+#if CONFIG_VP7_DECODER && CONFIG_VP8_DECODER
+#define VPX(vp7, f) (vp7 ? vp7_ ## f : vp8_ ## f)
+#elif CONFIG_VP7_DECODER
+#define VPX(vp7, f) vp7_ ## f
+#else // CONFIG_VP8_DECODER
+#define VPX(vp7, f) vp8_ ## f
+#endif
+
 static void free_buffers(VP8Context *s)
 {
     int i;
@@ -144,7 +152,7 @@ int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
     AVCodecContext *avctx = s->avctx;
     int i, ret;
 
-    if (width  != s->avctx->width ||
+    if (width  != s->avctx->width || ((width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) && s->macroblocks_base ||
         height != s->avctx->height) {
         vp8_decode_flush_impl(s->avctx, 1);
 
@@ -157,7 +165,7 @@ int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
     s->mb_height = (s->avctx->coded_height + 15) / 16;
 
     s->mb_layout = is_vp7 || avctx->active_thread_type == FF_THREAD_SLICE &&
-                   FFMIN(s->num_coeff_partitions, avctx->thread_count) > 1;
+                   avctx->thread_count > 1;
     if (!s->mb_layout) { // Frame threading and one thread
         s->macroblocks_base       = av_mallocz((s->mb_width + s->mb_height * 2 + 1) *
                                                sizeof(*s->macroblocks));
@@ -203,6 +211,7 @@ static int vp8_update_dimensions(VP8Context *s, int width, int height)
     return update_dimensions(s, width, height, IS_VP8);
 }
 
+
 static void parse_segment_info(VP8Context *s)
 {
     VP56RangeCoder *c = &s->c;
@@ -293,7 +302,7 @@ static void vp7_get_quants(VP8Context *s)
     s->qmat[0].chroma_qmul[1]  =       vp7_yac_qlookup[uvac_qi];
 }
 
-static void get_quants(VP8Context *s)
+static void vp8_get_quants(VP8Context *s)
 {
     VP56RangeCoder *c = &s->c;
     int i, base_qi;
@@ -414,7 +423,7 @@ static void update_refs(VP8Context *s)
     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 }
 
-static void copy_luma(AVFrame *dst, AVFrame *src, int width, int height)
+static void copy_chroma(AVFrame *dst, AVFrame *src, int width, int height)
 {
     int i, j;
 
@@ -425,16 +434,16 @@ static void copy_luma(AVFrame *dst, AVFrame *src, int width, int height)
     }
 }
 
-static void fade(uint8_t *dst, uint8_t *src,
-                 int width, int height, int linesize,
+static void fade(uint8_t *dst, int dst_linesize,
+                 const uint8_t *src, int src_linesize,
+                 int width, int height,
                  int alpha, int beta)
 {
     int i, j;
-
     for (j = 0; j < height; j++) {
         for (i = 0; i < width; i++) {
-            uint8_t y = src[j * linesize + i];
-            dst[j * linesize + i] = av_clip_uint8(y + ((y * beta) >> 8) + alpha);
+            uint8_t y = src[j * src_linesize + i];
+            dst[j * dst_linesize + i] = av_clip_uint8(y + ((y * beta) >> 8) + alpha);
         }
     }
 }
@@ -450,8 +459,11 @@ static int vp7_fade_frame(VP8Context *s, VP56RangeCoder *c)
         int height = s->mb_height * 16;
         AVFrame *src, *dst;
 
-        if (!s->framep[VP56_FRAME_PREVIOUS])
+        if (!s->framep[VP56_FRAME_PREVIOUS] ||
+            !s->framep[VP56_FRAME_GOLDEN]) {
+            av_log(s->avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
             return AVERROR_INVALIDDATA;
+        }
 
         dst =
         src = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
@@ -460,15 +472,16 @@ static int vp7_fade_frame(VP8Context *s, VP56RangeCoder *c)
         if (s->framep[VP56_FRAME_GOLDEN] == s->framep[VP56_FRAME_PREVIOUS]) {
             s->framep[VP56_FRAME_PREVIOUS] = vp8_find_free_buffer(s);
             if ((ret = vp8_alloc_frame(s, s->framep[VP56_FRAME_PREVIOUS], 1)) < 0)
-               return ret;
+                return ret;
 
             dst = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 
-            copy_luma(dst, src, width, height);
+            copy_chroma(dst, src, width, height);
         }
 
-        fade(dst->data[0], src->data[0],
-             width, height, dst->linesize[0], alpha, beta);
+        fade(dst->data[0], dst->linesize[0],
+             src->data[0], src->linesize[0],
+             width, height, alpha, beta);
     }
 
     return 0;
@@ -495,13 +508,14 @@ static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_si
     s->invisible = 0;
     part1_size   = AV_RL24(buf) >> 4;
 
-    buf      += 4 - s->profile;
-    buf_size -= 4 - s->profile;
-
-    if (buf_size < part1_size) {
+    if (buf_size < 4 - s->profile + part1_size) {
+        av_log(s->avctx, AV_LOG_ERROR, "Buffer size %d is too small, needed : %d\n", buf_size, 4 - s->profile + part1_size);
         return AVERROR_INVALIDDATA;
     }
 
+    buf      += 4 - s->profile;
+    buf_size -= 4 - s->profile;
+
     memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 
     ff_vp56_init_range_decoder(c, buf, part1_size);
@@ -547,7 +561,7 @@ static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_si
              if (vp7_feature_value_size[s->profile][i])
                  for (j = 0; j < 4; j++)
                      s->feature_value[i][j] =
-                         vp8_rac_get(c) ? vp8_rac_get_uint(c, vp7_feature_value_size[s->profile][i]) : 0;
+                        vp8_rac_get(c) ? vp8_rac_get_uint(c, vp7_feature_value_size[s->profile][i]) : 0;
         }
     }
 
@@ -630,6 +644,11 @@ static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_si
     int width  = s->avctx->width;
     int height = s->avctx->height;
 
+    if (buf_size < 3) {
+        av_log(s->avctx, AV_LOG_ERROR, "Insufficent data (%d) for header\n", buf_size);
+        return AVERROR_INVALIDDATA;
+    }
+
     s->keyframe  = !(buf[0] & 1);
     s->profile   =  (buf[0]>>1) & 7;
     s->invisible = !(buf[0] & 0x10);
@@ -710,11 +729,12 @@ static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_si
     }
 
     if (!s->macroblocks_base || /* first frame */
-        width != s->avctx->width || height != s->avctx->height)
+        width != s->avctx->width || height != s->avctx->height ||
+        (width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height)
         if ((ret = vp8_update_dimensions(s, width, height)) < 0)
             return ret;
 
-    get_quants(s);
+    vp8_get_quants(s);
 
     if (!s->keyframe) {
         update_refs(s);
@@ -747,14 +767,16 @@ static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_si
 static av_always_inline
 void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
 {
-    dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
-    dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
+    dst->x = av_clip(src->x, av_clip(s->mv_min.x, INT16_MIN, INT16_MAX), /* the limits themselves are clipped to the int16_t range first */
+                             av_clip(s->mv_max.x, INT16_MIN, INT16_MAX));
+    dst->y = av_clip(src->y, av_clip(s->mv_min.y, INT16_MIN, INT16_MAX), /* same treatment for the y component */
+                             av_clip(s->mv_max.y, INT16_MIN, INT16_MAX));
 }
 
 /**
  * Motion vector coding, 17.1.
  */
-static int read_mv_component(VP56RangeCoder *c, const uint8_t *p, int vp7)
+static av_always_inline int read_mv_component(VP56RangeCoder *c, const uint8_t *p, int vp7)
 {
     int bit, x = 0;
 
@@ -782,6 +804,16 @@ static int read_mv_component(VP56RangeCoder *c, const uint8_t *p, int vp7)
     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 }
 
+static int vp7_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
+{ /* non-inline VP7 entry point for the always-inline read_mv_component() */
+    return read_mv_component(c, p, 1); /* vp7 = 1 */
+}
+
+static int vp8_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
+{ /* non-inline VP8 entry point for the always-inline read_mv_component() */
+    return read_mv_component(c, p, 0); /* vp7 = 0 */
+}
+
 static av_always_inline
 const uint8_t *get_submv_prob(uint32_t left, uint32_t top, int is_vp7)
 {
@@ -972,8 +1004,8 @@ void vp7_decode_mvs(VP8Context *s, VP8Macroblock *mb,
                     mb->mode = VP8_MVMODE_SPLIT;
                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP7) - 1];
                 } else {
-                    mb->mv.y += read_mv_component(c, s->prob->mvc[0], IS_VP7);
-                    mb->mv.x += read_mv_component(c, s->prob->mvc[1], IS_VP7);
+                    mb->mv.y += vp7_read_mv_component(c, s->prob->mvc[0]);
+                    mb->mv.x += vp7_read_mv_component(c, s->prob->mvc[1]);
                     mb->bmv[0] = mb->mv;
                 }
             } else {
@@ -1072,8 +1104,8 @@ void vp8_decode_mvs(VP8Context *s, VP8Macroblock *mb,
                     mb->mode = VP8_MVMODE_SPLIT;
                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP8) - 1];
                 } else {
-                    mb->mv.y  += read_mv_component(c, s->prob->mvc[0], IS_VP8);
-                    mb->mv.x  += read_mv_component(c, s->prob->mvc[1], IS_VP8);
+                    mb->mv.y  += vp8_read_mv_component(c, s->prob->mvc[0]);
+                    mb->mv.x  += vp8_read_mv_component(c, s->prob->mvc[1]);
                     mb->bmv[0] = mb->mv;
                 }
             } else {
@@ -1097,7 +1129,7 @@ void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 {
     uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
 
-    if (layout == 1) {
+    if (layout) {
         VP8Macroblock *mb_top = mb - s->mb_width - 1;
         memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
     }
@@ -1105,7 +1137,7 @@ void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
         int x, y;
         uint8_t *top;
         uint8_t *const left = s->intra4x4_pred_mode_left;
-        if (layout == 1)
+        if (layout)
             top = mb->intra4x4_pred_mode_top;
         else
             top = s->intra4x4_pred_mode_top + 4 * mb_x;
@@ -1140,7 +1172,7 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
         *segment = 0;
         for (i = 0; i < 4; i++) {
             if (s->feature_enabled[i]) {
-                if (vp56_rac_get_prob(c, s->feature_present_prob[i])) {
+                if (vp56_rac_get_prob_branchy(c, s->feature_present_prob[i])) {
                       int index = vp8_rac_get_tree(c, vp7_feature_index_tree,
                                                    s->feature_index_prob[i]);
                       av_log(s->avctx, AV_LOG_WARNING,
@@ -1149,9 +1181,10 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
                 }
            }
         }
-    } else if (s->segmentation.update_map)
-        *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
-    else if (s->segmentation.enabled)
+    } else if (s->segmentation.update_map) {
+        int bit  = vp56_rac_get_prob(c, s->prob->segmentid[0]);
+        *segment = vp56_rac_get_prob(c, s->prob->segmentid[1+bit]) + 2*bit;
+    } else if (s->segmentation.enabled)
         *segment = ref ? *ref : *segment;
     mb->segment = *segment;
 
@@ -1166,7 +1199,7 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
         } else {
             const uint32_t modes = (is_vp7 ? vp7_pred4x4_mode
                                            : vp8_pred4x4_mode)[mb->mode] * 0x01010101u;
-            if (s->mb_layout == 1)
+            if (s->mb_layout)
                 AV_WN32A(mb->intra4x4_pred_mode_top, modes);
             else
                 AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
@@ -1330,6 +1363,7 @@ static int vp8_decode_block_coeffs_internal(VP56RangeCoder *r,
  * @param zero_nhood the initial prediction context for number of surrounding
  *                   all-zero blocks (only left/top, so 0-2)
  * @param qmul       array holding the dc/ac dequant factor at position 0/1
+ * @param scan       scan pattern (VP7 only)
  *
  * @return 0 if no coeffs were decoded
  *         otherwise, the index of the last coeff decoded plus one
@@ -1593,7 +1627,7 @@ void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
             for (x = 0; x < 4; x++) {
                 int copy = 0, linesize = s->linesize;
                 uint8_t *dst = ptr + 4 * x;
-                DECLARE_ALIGNED(4, uint8_t, copy_dst)[5 * 8];
+                LOCAL_ALIGNED(4, uint8_t, copy_dst, [5 * 8]);
 
                 if ((y == 0 || x == 3) && mb_y == 0) {
                     topright = tr_top;
@@ -1699,8 +1733,8 @@ void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
     if (AV_RN32A(mv)) {
         int src_linesize = linesize;
 
-        int mx = (mv->x << 1) & 7, mx_idx = subpel_idx[0][mx];
-        int my = (mv->y << 1) & 7, my_idx = subpel_idx[0][my];
+        int mx = (mv->x * 2) & 7, mx_idx = subpel_idx[0][mx];
+        int my = (mv->y * 2) & 7, my_idx = subpel_idx[0][my];
 
         x_off += mv->x >> 2;
         y_off += mv->y >> 2;
@@ -1770,7 +1804,8 @@ void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1,
             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
                                      src1 - my_idx * linesize - mx_idx,
                                      EDGE_EMU_LINESIZE, linesize,
-                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
+                                     block_w + subpel_idx[1][mx],
+                                     block_h + subpel_idx[1][my],
                                      x_off - mx_idx, y_off - my_idx, width, height);
             src1 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
             mc_func[my_idx][mx_idx](dst1, linesize, src1, EDGE_EMU_LINESIZE, block_h, mx, my);
@@ -1778,7 +1813,8 @@ void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1,
             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
                                      src2 - my_idx * linesize - mx_idx,
                                      EDGE_EMU_LINESIZE, linesize,
-                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
+                                     block_w + subpel_idx[1][mx],
+                                     block_h + subpel_idx[1][my],
                                      x_off - mx_idx, y_off - my_idx, width, height);
             src2 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
             mc_func[my_idx][mx_idx](dst2, linesize, src2, EDGE_EMU_LINESIZE, block_h, mx, my);
@@ -2214,7 +2250,7 @@ static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
             td->wait_mb_pos = INT_MAX;                                        \
             pthread_mutex_unlock(&otd->lock);                                 \
         }                                                                     \
-    } while (0);
+    } while (0)
 
 #define update_pos(td, mb_y, mb_x)                                            \
     do {                                                                      \
@@ -2233,13 +2269,13 @@ static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
             pthread_cond_broadcast(&td->cond);                                \
             pthread_mutex_unlock(&td->lock);                                  \
         }                                                                     \
-    } while (0);
+    } while (0)
 #else
-#define check_thread_pos(td, otd, mb_x_check, mb_y_check)
-#define update_pos(td, mb_y, mb_x)
+#define check_thread_pos(td, otd, mb_x_check, mb_y_check) while(0) /* no-op; the caller's trailing ';' completes the statement */
+#define update_pos(td, mb_y, mb_x) while(0) /* no-op; the caller's trailing ';' completes the statement */
 #endif
 
-static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
+static av_always_inline void decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
                                         int jobnr, int threadnr, int is_vp7)
 {
     VP8Context *s = avctx->priv_data;
@@ -2360,7 +2396,19 @@ static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
     }
 }
 
-static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
+static void vp7_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
+                                        int jobnr, int threadnr)
+{ /* VP7 instantiation of the always-inline decode_mb_row_no_filter() */
+    decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 1); /* is_vp7 = 1 */
+}
+
+static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
+                                        int jobnr, int threadnr)
+{ /* VP8 instantiation of the always-inline decode_mb_row_no_filter() */
+    decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 0); /* is_vp7 = 0 */
+}
+
+static av_always_inline void filter_mb_row(AVCodecContext *avctx, void *tdata,
                               int jobnr, int threadnr, int is_vp7)
 {
     VP8Context *s = avctx->priv_data;
@@ -2419,6 +2467,18 @@ static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
     }
 }
 
+static void vp7_filter_mb_row(AVCodecContext *avctx, void *tdata,
+                              int jobnr, int threadnr)
+{ /* VP7 instantiation of the always-inline filter_mb_row() */
+    filter_mb_row(avctx, tdata, jobnr, threadnr, 1); /* is_vp7 = 1 */
+}
+
+static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
+                              int jobnr, int threadnr)
+{ /* VP8 instantiation of the always-inline filter_mb_row() */
+    filter_mb_row(avctx, tdata, jobnr, threadnr, 0); /* is_vp7 = 0 */
+}
+
 static av_always_inline
 int vp78_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, int jobnr,
                               int threadnr, int is_vp7)
@@ -2434,9 +2494,9 @@ int vp78_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, int jobnr,
         if (mb_y >= s->mb_height)
             break;
         td->thread_mb_pos = mb_y << 16;
-        vp8_decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, is_vp7);
+        s->decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
         if (s->deblock_filter)
-            vp8_filter_mb_row(avctx, tdata, jobnr, threadnr, is_vp7);
+            s->filter_mb_row(avctx, tdata, jobnr, threadnr);
         update_pos(td, mb_y, INT_MAX & 0xFFFF);
 
         s->mv_min.y -= 64;
@@ -2529,10 +2589,8 @@ int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     curframe->tf.f->key_frame = s->keyframe;
     curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
                                             : AV_PICTURE_TYPE_P;
-    if ((ret = vp8_alloc_frame(s, curframe, referenced))) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
+    if ((ret = vp8_alloc_frame(s, curframe, referenced)) < 0)
         goto err;
-    }
 
     // check if golden and altref are swapped
     if (s->update_altref != VP56_FRAME_NONE)
@@ -2552,7 +2610,8 @@ int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
     s->next_framep[VP56_FRAME_CURRENT] = curframe;
 
-    ff_thread_finish_setup(avctx);
+    if (avctx->codec->update_thread_context)
+        ff_thread_finish_setup(avctx);
 
     s->linesize   = curframe->tf.f->linesize[0];
     s->uvlinesize = curframe->tf.f->linesize[1];
@@ -2640,6 +2699,9 @@ av_cold int ff_vp8_decode_free(AVCodecContext *avctx)
     VP8Context *s = avctx->priv_data;
     int i;
 
+    if (!s)
+        return 0;
+
     vp8_decode_flush_impl(avctx, 1);
     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
         av_frame_free(&s->frames[i].tf.f);
@@ -2665,6 +2727,7 @@ int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
     int ret;
 
     s->avctx = avctx;
+    s->vp7   = avctx->codec->id == AV_CODEC_ID_VP7;
     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
     avctx->internal->allocate_progress = 1;
 
@@ -2674,9 +2737,13 @@ int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
     if (CONFIG_VP7_DECODER && is_vp7) {
         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP7, 8, 1);
         ff_vp7dsp_init(&s->vp8dsp);
+        s->decode_mb_row_no_filter = vp7_decode_mb_row_no_filter;
+        s->filter_mb_row           = vp7_filter_mb_row;
     } else if (CONFIG_VP8_DECODER && !is_vp7) {
         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
         ff_vp8dsp_init(&s->vp8dsp);
+        s->decode_mb_row_no_filter = vp8_decode_mb_row_no_filter;
+        s->filter_mb_row           = vp8_filter_mb_row;
     }
 
     /* does not change for VP8 */
@@ -2703,6 +2770,7 @@ av_cold int ff_vp8_decode_init(AVCodecContext *avctx)
 }
 
 #if CONFIG_VP8_DECODER
+#if HAVE_THREADS
 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
 {
     VP8Context *s = avctx->priv_data;
@@ -2718,7 +2786,7 @@ static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
     return 0;
 }
 
-#define REBASE(pic) pic ? pic - &s_src->frames[0] + &s->frames[0] : NULL
+#define REBASE(pic) ((pic) ? (pic) - &s_src->frames[0] + &s->frames[0] : NULL) /* entry of s->frames[] at the index pic has in s_src->frames[]; NULL stays NULL */
 
 static int vp8_decode_update_thread_context(AVCodecContext *dst,
                                             const AVCodecContext *src)
@@ -2753,6 +2821,7 @@ static int vp8_decode_update_thread_context(AVCodecContext *dst,
 
     return 0;
 }
+#endif /* HAVE_THREADS */
 #endif /* CONFIG_VP8_DECODER */
 
 #if CONFIG_VP7_DECODER
diff --git a/libavcodec/vp8.h b/libavcodec/vp8.h
index ba5e62a..374e138 100644
--- a/libavcodec/vp8.h
+++ b/libavcodec/vp8.h
@@ -6,20 +6,20 @@
  * Copyright (C) 2010 Fiona Glaser
  * Copyright (C) 2012 Daniel Kang
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,18 +27,13 @@
 #define AVCODEC_VP8_H
 
 #include "libavutil/buffer.h"
+#include "libavutil/thread.h"
 
 #include "h264pred.h"
 #include "thread.h"
 #include "vp56.h"
 #include "vp8dsp.h"
 
-#if HAVE_PTHREADS
-#   include <pthread.h>
-#elif HAVE_W32THREADS
-#   include "compat/w32pthreads.h"
-#endif
-
 #define VP8_MAX_QUANT 127
 
 enum dct_token {
@@ -132,6 +127,11 @@ typedef struct VP8Frame {
     AVBufferRef *seg_map;
 } VP8Frame;
 
+typedef struct VP8intmv { /* x/y pair with plain int components; type of VP8Context.mv_min / mv_max */
+    int x;
+    int y;
+} VP8intmv;
+
 #define MAX_THREADS 8
 typedef struct VP8Context {
     VP8ThreadData *thread_data;
@@ -150,8 +150,8 @@ typedef struct VP8Context {
     uint8_t deblock_filter;
     uint8_t mbskip_enabled;
     uint8_t profile;
-    VP56mv mv_min;
-    VP56mv mv_max;
+    VP8intmv mv_min;
+    VP8intmv mv_max;
 
     int8_t sign_bias[4]; ///< one state [0, 1] per ref frame type
     int ref_count[3];
@@ -275,6 +275,11 @@ typedef struct VP8Context {
      */
     int mb_layout;
 
+    void (*decode_mb_row_no_filter)(AVCodecContext *avctx, void *tdata, int jobnr, int threadnr);
+    void (*filter_mb_row)(AVCodecContext *avctx, void *tdata, int jobnr, int threadnr);
+
+    int vp7;
+
     /**
      * Fade bit present in bitstream (VP7)
      */
diff --git a/libavcodec/vp8_parser.c b/libavcodec/vp8_parser.c
index 8f6459c..afc7f99 100644
--- a/libavcodec/vp8_parser.c
+++ b/libavcodec/vp8_parser.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2008 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp8data.h b/libavcodec/vp8data.h
index f8f9fff..5e6dea7 100644
--- a/libavcodec/vp8data.h
+++ b/libavcodec/vp8data.h
@@ -2,20 +2,20 @@
  * Copyright (C) 2010 David Conrad
  * Copyright (C) 2010 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/vp8dsp.c b/libavcodec/vp8dsp.c
index 4e4012f..07bea69 100644
--- a/libavcodec/vp8dsp.c
+++ b/libavcodec/vp8dsp.c
@@ -3,20 +3,20 @@
  * Copyright (C) 2010 Ronald S. Bultje
  * Copyright (C) 2014 Peter Ross
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
  */
 
 #include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
 
 #include "mathops.h"
 #include "vp8dsp.h"
@@ -71,10 +72,7 @@ static void vp7_luma_dc_wht_c(int16_t block[4][4][16], int16_t dc[16])
         b1 = (tmp[i + 0] - tmp[i + 8]) * 23170;
         c1 = tmp[i + 4] * 12540 - tmp[i + 12] * 30274;
         d1 = tmp[i + 4] * 30274 + tmp[i + 12] * 12540;
-        dc[i * 4 + 0] = 0;
-        dc[i * 4 + 1] = 0;
-        dc[i * 4 + 2] = 0;
-        dc[i * 4 + 3] = 0;
+        AV_ZERO64(dc + i * 4);
         block[0][i][0] = (a1 + d1 + 0x20000) >> 18;
         block[3][i][0] = (a1 - d1 + 0x20000) >> 18;
         block[1][i][0] = (b1 + c1 + 0x20000) >> 18;
@@ -105,10 +103,7 @@ static void vp7_idct_add_c(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
         b1 = (block[i * 4 + 0] - block[i * 4 + 2]) * 23170;
         c1 = block[i * 4 + 1] * 12540 - block[i * 4 + 3] * 30274;
         d1 = block[i * 4 + 1] * 30274 + block[i * 4 + 3] * 12540;
-        block[i * 4 + 0] = 0;
-        block[i * 4 + 1] = 0;
-        block[i * 4 + 2] = 0;
-        block[i * 4 + 3] = 0;
+        AV_ZERO64(block + i * 4);
         tmp[i * 4 + 0] = (a1 + d1) >> 14;
         tmp[i * 4 + 3] = (a1 - d1) >> 14;
         tmp[i * 4 + 1] = (b1 + c1) >> 14;
@@ -171,10 +166,7 @@ static void vp8_luma_dc_wht_c(int16_t block[4][4][16], int16_t dc[16])
         t1 = dc[i * 4 + 1] + dc[i * 4 + 2];
         t2 = dc[i * 4 + 1] - dc[i * 4 + 2];
         t3 = dc[i * 4 + 0] - dc[i * 4 + 3] + 3; // rounding
-        dc[i * 4 + 0] = 0;
-        dc[i * 4 + 1] = 0;
-        dc[i * 4 + 2] = 0;
-        dc[i * 4 + 3] = 0;
+        AV_ZERO64(dc + i * 4);
 
         block[i][0][0] = (t0 + t1) >> 3;
         block[i][1][0] = (t3 + t2) >> 3;
@@ -262,7 +254,7 @@ MK_IDCT_DC_ADD4_C(vp8)
     int av_unused q2 = p[ 2 * stride];                                        \
     int av_unused q3 = p[ 3 * stride];
 
-#define clip_int8(n) (cm[n + 0x80] - 0x80)
+#define clip_int8(n) (cm[(n) + 0x80] - 0x80) /* looks up cm[] at n biased by 0x80, then removes the bias from the result */
 
 static av_always_inline void filter_common(uint8_t *p, ptrdiff_t stride,
                                            int is4tap, int is_vp7)
@@ -743,5 +735,7 @@ av_cold void ff_vp8dsp_init(VP8DSPContext *dsp)
         ff_vp8dsp_init_arm(dsp);
     if (ARCH_X86)
         ff_vp8dsp_init_x86(dsp);
+    if (ARCH_MIPS)
+        ff_vp8dsp_init_mips(dsp);
 }
 #endif /* CONFIG_VP8_DECODER */
diff --git a/libavcodec/vp8dsp.h b/libavcodec/vp8dsp.h
index 4864cf7..0401c92 100644
--- a/libavcodec/vp8dsp.h
+++ b/libavcodec/vp8dsp.h
@@ -2,20 +2,20 @@
  * Copyright (C) 2010 David Conrad
  * Copyright (C) 2010 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -98,6 +98,7 @@ void ff_vp78dsp_init_x86(VP8DSPContext *c);
 void ff_vp8dsp_init(VP8DSPContext *c);
 void ff_vp8dsp_init_arm(VP8DSPContext *c);
 void ff_vp8dsp_init_x86(VP8DSPContext *c);
+void ff_vp8dsp_init_mips(VP8DSPContext *c);
 
 #define IS_VP7 1
 #define IS_VP8 0
diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c
index 45f0771..cb2a4a2 100644
--- a/libavcodec/vp9.c
+++ b/libavcodec/vp9.c
@@ -4,99 +4,381 @@
  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
  * Copyright (C) 2013 Clément Bœsch <u pkh me>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "libavutil/avassert.h"
-
 #include "avcodec.h"
 #include "get_bits.h"
 #include "internal.h"
+#include "profiles.h"
+#include "thread.h"
 #include "videodsp.h"
 #include "vp56.h"
 #include "vp9.h"
 #include "vp9data.h"
+#include "vp9dsp.h"
+#include "libavutil/avassert.h"
+#include "libavutil/pixdesc.h"
 
 #define VP9_SYNCCODE 0x498342
-#define MAX_PROB 255
 
-static void vp9_decode_flush(AVCodecContext *avctx)
+struct VP9Filter {
+    uint8_t level[8 * 8];
+    uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
+                              [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
+};
+
+typedef struct VP9Block { /* per-block fields; VP9Context keeps b_base and b pointers of this type */
+    uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
+    enum FilterMode filter;
+    VP56mv mv[4 /* b_idx */][2 /* ref */];
+    enum BlockSize bs;
+    enum TxfmMode tx, uvtx; /* presumably luma and chroma transform modes -- TODO confirm */
+    enum BlockLevel bl;
+    enum BlockPartition bp;
+} VP9Block;
+
+typedef struct VP9Context { // decoder private data; vp9_alloc_frame() reads it from ctx->priv_data
+    VP9SharedContext s;
+
+    VP9DSPContext dsp;
+    VideoDSPContext vdsp;
+    GetBitContext gb;
+    VP56RangeCoder c;
+    VP56RangeCoder *c_b;
+    unsigned c_b_size;
+    VP9Block *b_base, *b;
+    int pass;
+    int row, row7, col, col7;
+    uint8_t *dst[3];
+    ptrdiff_t y_stride, uv_stride;
+
+    uint8_t ss_h, ss_v; // presumably chroma subsampling shifts -- TODO confirm
+    uint8_t last_bpp, bpp, bpp_index, bytesperpixel;
+    uint8_t last_keyframe;
+    // sb_cols/rows, rows/cols and last_fmt are used for allocating all internal
+    // arrays, and are thus per-thread. w/h and gf_fmt are synced between threads
+    // and are therefore per-stream. pix_fmt represents the value in the header
+    // of the currently processed frame.
+    int w, h;
+    enum AVPixelFormat pix_fmt, last_fmt, gf_fmt;
+    unsigned sb_cols, sb_rows, rows, cols; // sb_cols * sb_rows sizes VP9Frame.extradata in vp9_alloc_frame()
+    ThreadFrame next_refs[8];
+
+    struct {
+        uint8_t lim_lut[64];
+        uint8_t mblim_lut[64];
+    } filter_lut;
+    unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
+    struct {
+        prob_context p;
+        uint8_t coef[4][2][2][6][6][3];
+    } prob_ctx[4];
+    struct {
+        prob_context p;
+        uint8_t coef[4][2][2][6][6][11];
+    } prob;
+    struct {
+        unsigned y_mode[4][10];
+        unsigned uv_mode[10][10];
+        unsigned filter[4][3];
+        unsigned mv_mode[7][4];
+        unsigned intra[4][2];
+        unsigned comp[5][2];
+        unsigned single_ref[5][2][2];
+        unsigned comp_ref[5][2];
+        unsigned tx32p[2][4];
+        unsigned tx16p[2][3];
+        unsigned tx8p[2][2];
+        unsigned skip[3][2];
+        unsigned mv_joint[4];
+        struct {
+            unsigned sign[2];
+            unsigned classes[11];
+            unsigned class0[2];
+            unsigned bits[10][2];
+            unsigned class0_fp[2][4];
+            unsigned fp[4];
+            unsigned class0_hp[2];
+            unsigned hp[2];
+        } mv_comp[2];
+        unsigned partition[4][4][4];
+        unsigned coef[4][2][2][6][6][3];
+        unsigned eob[4][2][2][6][6][2];
+    } counts;
+
+    // contextual (left/above) cache
+    DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
+    DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
+    DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
+    DECLARE_ALIGNED(16, uint8_t, left_uv_nnz_ctx)[2][16];
+    DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
+    DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
+    DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
+    DECLARE_ALIGNED(8, uint8_t, left_segpred_ctx)[8];
+    DECLARE_ALIGNED(8, uint8_t, left_intra_ctx)[8];
+    DECLARE_ALIGNED(8, uint8_t, left_comp_ctx)[8];
+    DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8];
+    DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8];
+    uint8_t *above_partition_ctx;
+    uint8_t *above_mode_ctx;
+    // FIXME maybe merge some of the below in a flags field?
+    uint8_t *above_y_nnz_ctx;
+    uint8_t *above_uv_nnz_ctx[2];
+    uint8_t *above_skip_ctx; // 1bit
+    uint8_t *above_txfm_ctx; // 2bit
+    uint8_t *above_segpred_ctx; // 1bit
+    uint8_t *above_intra_ctx; // 1bit
+    uint8_t *above_comp_ctx; // 1bit
+    uint8_t *above_ref_ctx; // 2bit
+    uint8_t *above_filter_ctx;
+    VP56mv (*above_mv_ctx)[2];
+
+    // whole-frame cache
+    uint8_t *intra_pred_data[3];
+    struct VP9Filter *lflvl;
+    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[135 * 144 * 2];
+
+    // block reconstruction intermediates
+    int block_alloc_using_2pass;
+    int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
+    uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
+    struct { int x, y; } min_mv, max_mv;
+    DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64 * 2];
+    DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][64 * 64 * 2];
+    uint16_t mvscale[3][2];
+    uint8_t mvstep[3][2];
+} VP9Context;
+
+static const uint8_t bwh_tab[2][N_BS_SIZES][2] = { /* presumably { width, height } per block size -- TODO confirm units */
+    {
+        { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
+        { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
+    }, { /* each entry is the matching entry above halved, with a minimum of 1 */
+        { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
+        { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
+    }
+};
+
+static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f) /* releases f's picture buffer and drops its side-data references */
 {
-    VP9Context *s = avctx->priv_data;
-    int i;
+    ff_thread_release_buffer(ctx, &f->tf);
+    av_buffer_unref(&f->extradata);
+    av_buffer_unref(&f->hwaccel_priv_buf);
+    f->segmentation_map = NULL; /* was set from f->extradata->data in vp9_alloc_frame() */
+    f->hwaccel_picture_private = NULL; /* was set from f->hwaccel_priv_buf->data */
+}
+
+static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f) // returns 0, the error from ff_thread_get_buffer, or AVERROR(ENOMEM)
+{
+    VP9Context *s = ctx->priv_data;
+    int ret, sz;
+
+    if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
+        return ret;
+    sz = 64 * s->sb_cols * s->sb_rows; // 64 entries for each of the sb_cols x sb_rows cells
+    if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) { // one buffer: sz bytes, then sz VP9mvrefPair entries
+        goto fail;
+    }
+
+    f->segmentation_map = f->extradata->data; // first sz bytes of the buffer
+    f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz); // rest of the same buffer
+
+    if (ctx->hwaccel) {
+        const AVHWAccel *hwaccel = ctx->hwaccel;
+        av_assert0(!f->hwaccel_picture_private);
+        if (hwaccel->frame_priv_data_size) { // private data is only allocated when the hwaccel asks for a non-zero size
+            f->hwaccel_priv_buf = av_buffer_allocz(hwaccel->frame_priv_data_size);
+            if (!f->hwaccel_priv_buf)
+                goto fail;
+            f->hwaccel_picture_private = f->hwaccel_priv_buf->data;
+        }
+    }
 
-    for (i = 0; i < FF_ARRAY_ELEMS(s->refs); i++)
-        av_frame_unref(s->refs[i]);
+    return 0;
+
+fail:
+    vp9_unref_frame(ctx, f); // releases whatever was acquired before the failure
+    return AVERROR(ENOMEM);
+}
+
+static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src) // makes dst reference src's buffers; returns 0 or a negative error
+{
+    int res;
+
+    if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
+        return res; // nothing has been acquired for dst yet, so no cleanup
+    } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
+        goto fail;
+    }
+
+    dst->segmentation_map = src->segmentation_map; // plain pointer copies, no new storage
+    dst->mv = src->mv;
+    dst->uses_2pass = src->uses_2pass;
+
+    if (src->hwaccel_picture_private) { // only reference the hwaccel buffer when src has private data
+        dst->hwaccel_priv_buf = av_buffer_ref(src->hwaccel_priv_buf);
+        if (!dst->hwaccel_priv_buf)
+            goto fail;
+        dst->hwaccel_picture_private = dst->hwaccel_priv_buf->data;
+    }
+
+    return 0;
+
+fail:
+    vp9_unref_frame(ctx, dst); // undoes the references taken above
+    return AVERROR(ENOMEM);
 }
 
-static int update_size(AVCodecContext *avctx, int w, int h)
+static int update_size(AVCodecContext *ctx, int w, int h) // negotiates the output format and (re)allocates the per-column context buffers; returns 0 or a negative error
 {
-    VP9Context *s = avctx->priv_data;
+#define HWACCEL_MAX (CONFIG_VP9_DXVA2_HWACCEL + CONFIG_VP9_D3D11VA_HWACCEL + CONFIG_VP9_VAAPI_HWACCEL)
+    enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmtp = pix_fmts; // up to HWACCEL_MAX hwaccel formats, then s->pix_fmt, then AV_PIX_FMT_NONE
+    VP9Context *s = ctx->priv_data;
     uint8_t *p;
+    int bytesperpixel = s->bytesperpixel, res, cols, rows;
 
-    if (s->above_partition_ctx && w == avctx->width && h == avctx->height)
-        return 0;
+    av_assert0(w > 0 && h > 0); // non-positive dimensions are asserted against, not reported as an error
 
-    vp9_decode_flush(avctx);
+    if (!(s->pix_fmt == s->gf_fmt && w == s->w && h == s->h)) { // format or size differs from the last negotiated one
+        if ((res = ff_set_dimensions(ctx, w, h)) < 0)
+            return res;
 
-    if (w <= 0 || h <= 0)
-        return AVERROR_INVALIDDATA;
+        if (s->pix_fmt == AV_PIX_FMT_YUV420P) { // hwaccel formats are only offered for this format
+#if CONFIG_VP9_DXVA2_HWACCEL
+            *fmtp++ = AV_PIX_FMT_DXVA2_VLD;
+#endif
+#if CONFIG_VP9_D3D11VA_HWACCEL
+            *fmtp++ = AV_PIX_FMT_D3D11VA_VLD;
+#endif
+#if CONFIG_VP9_VAAPI_HWACCEL
+            *fmtp++ = AV_PIX_FMT_VAAPI;
+#endif
+        }
+
+        *fmtp++ = s->pix_fmt; // the stream's own format is always the last candidate
+        *fmtp = AV_PIX_FMT_NONE;
+
+        res = ff_thread_get_format(ctx, pix_fmts);
+        if (res < 0)
+            return res;
+
+        ctx->pix_fmt = res; // format selected by ff_thread_get_format
+        s->gf_fmt  = s->pix_fmt; // remember what the negotiation was done for
+        s->w = w;
+        s->h = h;
+    }
+
+    cols = (w + 7) >> 3; // size in 8-pixel units, rounded up
+    rows = (h + 7) >> 3;
 
-    avctx->width  = w;
-    avctx->height = h;
-    s->sb_cols    = (w + 63) >> 6;
-    s->sb_rows    = (h + 63) >> 6;
-    s->cols       = (w +  7) >> 3;
-    s->rows       = (h +  7) >> 3;
-
-#define assign(var, type, n) var = (type)p; p += s->sb_cols * n * sizeof(*var)
-    av_free(s->above_partition_ctx);
-    p = av_malloc(s->sb_cols *
-                  (240 + sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx) +
-                   64 * s->sb_rows * (1 + sizeof(*s->mv[0]) * 2)));
+    if (s->intra_pred_data[0] && cols == s->cols && rows == s->rows && s->pix_fmt == s->last_fmt) // buffers already match this grid and format
+        return 0;
+
+    s->last_fmt  = s->pix_fmt;
+    s->sb_cols   = (w + 63) >> 6; // size in 64-pixel units, rounded up
+    s->sb_rows   = (h + 63) >> 6;
+    s->cols      = (w + 7) >> 3;
+    s->rows      = (h + 7) >> 3;
+
+#define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
+    av_freep(&s->intra_pred_data[0]); // frees the whole previous allocation (its first carve-out)
+    // FIXME we slightly over-allocate here for subsampled chroma, but a little
+    // bit of padding shouldn't affect performance...
+    p = av_malloc(s->sb_cols * (128 + 192 * bytesperpixel + // size must equal the sum of the assign() carve-outs below
+                                sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
     if (!p)
         return AVERROR(ENOMEM);
-    assign(s->above_partition_ctx, uint8_t *,     8);
-    assign(s->above_skip_ctx,      uint8_t *,     8);
-    assign(s->above_txfm_ctx,      uint8_t *,     8);
-    assign(s->above_mode_ctx,      uint8_t *,    16);
-    assign(s->above_y_nnz_ctx,     uint8_t *,    16);
-    assign(s->above_uv_nnz_ctx[0], uint8_t *,     8);
-    assign(s->above_uv_nnz_ctx[1], uint8_t *,     8);
-    assign(s->intra_pred_data[0],  uint8_t *,    64);
-    assign(s->intra_pred_data[1],  uint8_t *,    32);
-    assign(s->intra_pred_data[2],  uint8_t *,    32);
-    assign(s->above_segpred_ctx,   uint8_t *,     8);
-    assign(s->above_intra_ctx,     uint8_t *,     8);
-    assign(s->above_comp_ctx,      uint8_t *,     8);
-    assign(s->above_ref_ctx,       uint8_t *,     8);
-    assign(s->above_filter_ctx,    uint8_t *,     8);
-    assign(s->lflvl,               VP9Filter *,   1);
-    assign(s->above_mv_ctx,        VP56mv(*)[2], 16);
-    assign(s->segmentation_map,    uint8_t *,      64 * s->sb_rows);
-    assign(s->mv[0],               VP9MVRefPair *, 64 * s->sb_rows);
-    assign(s->mv[1],               VP9MVRefPair *, 64 * s->sb_rows);
+    assign(s->intra_pred_data[0],  uint8_t *,             64 * bytesperpixel); // must stay first: this pointer is the one passed to av_freep
+    assign(s->intra_pred_data[1],  uint8_t *,             64 * bytesperpixel);
+    assign(s->intra_pred_data[2],  uint8_t *,             64 * bytesperpixel);
+    assign(s->above_y_nnz_ctx,     uint8_t *,             16);
+    assign(s->above_mode_ctx,      uint8_t *,             16);
+    assign(s->above_mv_ctx,        VP56mv(*)[2],          16);
+    assign(s->above_uv_nnz_ctx[0], uint8_t *,             16);
+    assign(s->above_uv_nnz_ctx[1], uint8_t *,             16);
+    assign(s->above_partition_ctx, uint8_t *,              8);
+    assign(s->above_skip_ctx,      uint8_t *,              8);
+    assign(s->above_txfm_ctx,      uint8_t *,              8);
+    assign(s->above_segpred_ctx,   uint8_t *,              8);
+    assign(s->above_intra_ctx,     uint8_t *,              8);
+    assign(s->above_comp_ctx,      uint8_t *,              8);
+    assign(s->above_ref_ctx,       uint8_t *,              8);
+    assign(s->above_filter_ctx,    uint8_t *,              8);
+    assign(s->lflvl,               struct VP9Filter *,     1);
 #undef assign
 
+    // these will be re-allocated a little later
+    av_freep(&s->b_base);
+    av_freep(&s->block_base);
+
+    if (s->bpp != s->last_bpp) { // DSP contexts are re-initialised only when the bit depth changes
+        ff_vp9dsp_init(&s->dsp, s->bpp, ctx->flags & AV_CODEC_FLAG_BITEXACT);
+        ff_videodsp_init(&s->vdsp, s->bpp);
+        s->last_bpp = s->bpp;
+    }
+
     return 0;
 }
 
-// The sign bit is at the end, not the start, of a bit sequence
-static av_always_inline int get_bits_with_sign(GetBitContext *gb, int n)
+static int update_block_buffers(AVCodecContext *ctx) // (re)allocates s->b_base and s->block_base; returns 0 or AVERROR(ENOMEM)
+{
+    VP9Context *s = ctx->priv_data;
+    int chroma_blocks, chroma_eobs, bytesperpixel = s->bytesperpixel;
+
+    if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->s.frames[CUR_FRAME].uses_2pass) // existing buffers match the current 2pass mode
+        return 0;
+
+    av_free(s->b_base);
+    av_free(s->block_base);
+    chroma_blocks = 64 * 64 >> (s->ss_h + s->ss_v); // luma count shifted down by the chroma subsampling
+    chroma_eobs   = 16 * 16 >> (s->ss_h + s->ss_v);
+    if (s->s.frames[CUR_FRAME].uses_2pass) { // 2pass: storage sized for the whole frame
+        int sbs = s->sb_cols * s->sb_rows;
+
+        s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
+        s->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) + // int16_t area, then one byte per eob entry
+                                    16 * 16 + 2 * chroma_eobs) * sbs);
+        if (!s->b_base || !s->block_base)
+            return AVERROR(ENOMEM);
+        s->uvblock_base[0] = s->block_base + sbs * 64 * 64 * bytesperpixel; // offsets here are in int16_t units
+        s->uvblock_base[1] = s->uvblock_base[0] + sbs * chroma_blocks * bytesperpixel;
+        s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * chroma_blocks * bytesperpixel); // eob bytes start right after the int16_t area
+        s->uveob_base[0] = s->eob_base + 16 * 16 * sbs;
+        s->uveob_base[1] = s->uveob_base[0] + chroma_eobs * sbs;
+    } else { // otherwise: a single VP9Block and the same layout without the sbs factor
+        s->b_base = av_malloc(sizeof(VP9Block));
+        s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
+                                   16 * 16 + 2 * chroma_eobs);
+        if (!s->b_base || !s->block_base)
+            return AVERROR(ENOMEM);
+        s->uvblock_base[0] = s->block_base + 64 * 64 * bytesperpixel;
+        s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks * bytesperpixel;
+        s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks * bytesperpixel);
+        s->uveob_base[0] = s->eob_base + 16 * 16;
+        s->uveob_base[1] = s->uveob_base[0] + chroma_eobs;
+    }
+    s->block_alloc_using_2pass = s->s.frames[CUR_FRAME].uses_2pass; // remembered for the early-out check above
+
+    return 0;
+}
+
+// reads an n-bit magnitude followed by one sign bit (1 = negative): the sign bit is at the end, not the start, of the bit sequence
+static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
 {
     int v = get_bits(gb, n);
     return get_bits1(gb) ? -v : v;
@@ -104,17 +386,13 @@ static av_always_inline int get_bits_with_sign(GetBitContext *gb, int n)
 
 static av_always_inline int inv_recenter_nonneg(int v, int m)
 {
-    if (v > 2 * m)
-        return v;
-    if (v & 1)
-        return m - ((v + 1) >> 1);
-    return m + (v >> 1);
+    return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1); // v > 2m: v unchanged; odd v: m - (v + 1) / 2; even v: m + v / 2
 }
 
 // differential forward probability updates
 static int update_prob(VP56RangeCoder *c, int p)
 {
-    static const int inv_map_table[MAX_PROB - 1] = {
+    static const int inv_map_table[255] = { // indexed by the decoded delta index d below
           7,  20,  33,  46,  59,  72,  85,  98, 111, 124, 137, 150, 163, 176,
         189, 202, 215, 228, 241, 254,   1,   2,   3,   4,   5,   6,   8,   9,
          10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  21,  22,  23,  24,
@@ -133,19 +411,19 @@ static int update_prob(VP56RangeCoder *c, int p)
         207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
         222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
         237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
-        252, 253,
+        252, 253, 253, // last value repeated so that index 254 exists now that d is no longer clipped
     };
     int d;
 
     /* This code is trying to do a differential probability update. For a
      * current probability A in the range [1, 255], the difference to a new
-     * probability of any value can be expressed differentially as 1-A, 255-A
+     * probability of any value can be expressed differentially as 1-A,255-A
      * where some part of this (absolute range) exists both in positive as
      * well as the negative part, whereas another part only exists in one
      * half. We're trying to code this shared part differentially, i.e.
      * times two where the value of the lowest bit specifies the sign, and
      * the single part is then coded on top of this. This absolute difference
-     * then again has a value of [0, 254], but a bigger value in this range
+     * then again has a value of [0,254], but a bigger value in this range
      * indicates that we're further away from the original value A, so we
      * can code this as a VLC code, since higher values are increasingly
      * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
@@ -160,346 +438,438 @@ static int update_prob(VP56RangeCoder *c, int p)
         d = vp8_rac_get_uint(c, 5) + 32;
     } else {
         d = vp8_rac_get_uint(c, 7);
-        if (d >= 65) {
+        if (d >= 65)
             d = (d << 1) - 65 + vp8_rac_get(c);
-            d = av_clip(d, 0, MAX_PROB - 65 - 1);
-        }
         d += 64;
+        av_assert2(d < FF_ARRAY_ELEMS(inv_map_table)); // assuming a 7-bit read, the maximum is 64 + (127 << 1) - 65 + 1 = 254
+    }
+
+    return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) : // p <= 128: result is measured upwards from 1
+                    255 - inv_recenter_nonneg(inv_map_table[d], 255 - p); // otherwise mirrored: measured downwards from 255
+}
+
+static int read_colorspace_details(AVCodecContext *ctx) // reads bit depth, colorspace, range and subsampling from s->gb; returns 0 or AVERROR_INVALIDDATA
+{
+    static const enum AVColorSpace colorspaces[8] = {
+        AVCOL_SPC_UNSPECIFIED, AVCOL_SPC_BT470BG, AVCOL_SPC_BT709, AVCOL_SPC_SMPTE170M,
+        AVCOL_SPC_SMPTE240M, AVCOL_SPC_BT2020_NCL, AVCOL_SPC_RESERVED, AVCOL_SPC_RGB,
+    };
+    VP9Context *s = ctx->priv_data;
+    int bits = ctx->profile <= 1 ? 0 : 1 + get_bits1(&s->gb); // 0:8, 1:10, 2:12
+
+    s->bpp_index = bits;
+    s->bpp = 8 + bits * 2;
+    s->bytesperpixel = (7 + s->bpp) >> 3; // 1 for 8 bpp, 2 for 10 and 12 bpp
+    ctx->colorspace = colorspaces[get_bits(&s->gb, 3)]; // 3-bit value indexes the 8-entry table above
+    if (ctx->colorspace == AVCOL_SPC_RGB) { // RGB = profile 1
+        static const enum AVPixelFormat pix_fmt_rgb[3] = {
+            AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12
+        };
+        s->ss_h = s->ss_v = 0; // RGB is stored without chroma subsampling
+        ctx->color_range = AVCOL_RANGE_JPEG;
+        s->pix_fmt = pix_fmt_rgb[bits];
+        if (ctx->profile & 1) { // odd profiles: RGB allowed, followed by one reserved bit that must be 0
+            if (get_bits1(&s->gb)) {
+                av_log(ctx, AV_LOG_ERROR, "Reserved bit set in RGB\n");
+                return AVERROR_INVALIDDATA;
+            }
+        } else {
+            av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile %d\n",
+                   ctx->profile);
+            return AVERROR_INVALIDDATA;
+        }
+    } else {
+        static const enum AVPixelFormat pix_fmt_for_ss[3][2 /* v */][2 /* h */] = {
+            { { AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV422P },
+              { AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV420P } },
+            { { AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUV422P10 },
+              { AV_PIX_FMT_YUV440P10, AV_PIX_FMT_YUV420P10 } },
+            { { AV_PIX_FMT_YUV444P12, AV_PIX_FMT_YUV422P12 },
+              { AV_PIX_FMT_YUV440P12, AV_PIX_FMT_YUV420P12 } }
+        };
+        ctx->color_range = get_bits1(&s->gb) ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
+        if (ctx->profile & 1) { // odd profiles carry explicit subsampling bits
+            s->ss_h = get_bits1(&s->gb);
+            s->ss_v = get_bits1(&s->gb);
+            s->pix_fmt = pix_fmt_for_ss[bits][s->ss_v][s->ss_h];
+            if (s->pix_fmt == AV_PIX_FMT_YUV420P) { // NOTE(review): matches the 8-bit format only; 10/12-bit 4:2:0 is not rejected here - confirm intended
+                av_log(ctx, AV_LOG_ERROR, "YUV 4:2:0 not supported in profile %d\n",
+                       ctx->profile);
+                return AVERROR_INVALIDDATA;
+            } else if (get_bits1(&s->gb)) {
+                av_log(ctx, AV_LOG_ERROR, "Profile %d color details reserved bit set\n",
+                       ctx->profile);
+                return AVERROR_INVALIDDATA;
+            }
+        } else {
+            s->ss_h = s->ss_v = 1; // even profiles: always subsampled in both directions (4:2:0)
+            s->pix_fmt = pix_fmt_for_ss[bits][1][1];
+        }
     }
 
-    return p <= 128
-           ?   1 + inv_recenter_nonneg(inv_map_table[d], p - 1)
-           : 255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
+    return 0;
 }
 
-static int decode_frame_header(AVCodecContext *avctx,
+static int decode_frame_header(AVCodecContext *ctx,
                                const uint8_t *data, int size, int *ref)
 {
-    VP9Context *s = avctx->priv_data;
-    int c, i, j, k, l, m, n, w, h, max, size2, ret, sharp;
+    VP9Context *s = ctx->priv_data;
+    int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
     int last_invisible;
     const uint8_t *data2;
 
     /* general header */
-    if ((ret = init_get_bits8(&s->gb, data, size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
-        return ret;
+    if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
+        return res;
     }
     if (get_bits(&s->gb, 2) != 0x2) { // frame marker
-        av_log(avctx, AV_LOG_ERROR, "Invalid frame marker\n");
+        av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
         return AVERROR_INVALIDDATA;
     }
-    s->profile = get_bits1(&s->gb);
-    if (get_bits1(&s->gb)) { // reserved bit
-        av_log(avctx, AV_LOG_ERROR, "Reserved bit should be zero\n");
+    ctx->profile  = get_bits1(&s->gb);
+    ctx->profile |= get_bits1(&s->gb) << 1;
+    if (ctx->profile == 3) ctx->profile += get_bits1(&s->gb);
+    if (ctx->profile > 3) {
+        av_log(ctx, AV_LOG_ERROR, "Profile %d is not yet supported\n", ctx->profile);
         return AVERROR_INVALIDDATA;
     }
+    s->s.h.profile = ctx->profile;
     if (get_bits1(&s->gb)) {
         *ref = get_bits(&s->gb, 3);
         return 0;
     }
-
-    s->last_keyframe = s->keyframe;
-    s->keyframe      = !get_bits1(&s->gb);
-
-    last_invisible = s->invisible;
-    s->invisible   = !get_bits1(&s->gb);
-    s->errorres    = get_bits1(&s->gb);
-    // FIXME disable this upon resolution change
-    s->use_last_frame_mvs = !s->errorres && !last_invisible;
-
-    if (s->keyframe) {
+    s->last_keyframe  = s->s.h.keyframe;
+    s->s.h.keyframe     = !get_bits1(&s->gb);
+    last_invisible    = s->s.h.invisible;
+    s->s.h.invisible    = !get_bits1(&s->gb);
+    s->s.h.errorres     = get_bits1(&s->gb);
+    s->s.h.use_last_frame_mvs = !s->s.h.errorres && !last_invisible;
+    if (s->s.h.keyframe) {
         if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
-            av_log(avctx, AV_LOG_ERROR, "Invalid sync code\n");
-            return AVERROR_INVALIDDATA;
-        }
-        s->colorspace = get_bits(&s->gb, 3);
-        if (s->colorspace == 7) { // RGB = profile 1
-            av_log(avctx, AV_LOG_ERROR, "RGB not supported in profile 0\n");
+            av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
             return AVERROR_INVALIDDATA;
         }
-        s->fullrange = get_bits1(&s->gb);
-
-        // subsampling bits
-        if (s->profile == 1 || s->profile == 3) {
-            s->sub_x = get_bits1(&s->gb);
-            s->sub_y = get_bits1(&s->gb);
-            if (s->sub_x && s->sub_y) {
-                av_log(avctx, AV_LOG_ERROR,
-                       "4:2:0 color not supported in profile 1 or 3\n");
-                return AVERROR_INVALIDDATA;
-            }
-            if (get_bits1(&s->gb)) { // reserved bit
-                av_log(avctx, AV_LOG_ERROR, "Reserved bit should be zero\n");
-                return AVERROR_INVALIDDATA;
-            }
-        } else {
-            s->sub_x = s->sub_y = 1;
-        }
-        if (!s->sub_x || !s->sub_y) {
-            avpriv_report_missing_feature(avctx, "Subsampling %d:%d",
-                                          s->sub_x, s->sub_y);
-            return AVERROR_PATCHWELCOME;
-        }
-
-        s->refreshrefmask = 0xff;
+        if ((res = read_colorspace_details(ctx)) < 0)
+            return res;
+        // for profile 1, here follows the subsampling bits
+        s->s.h.refreshrefmask = 0xff;
         w = get_bits(&s->gb, 16) + 1;
         h = get_bits(&s->gb, 16) + 1;
         if (get_bits1(&s->gb)) // display size
             skip_bits(&s->gb, 32);
     } else {
-        s->intraonly = s->invisible ? get_bits1(&s->gb) : 0;
-        s->resetctx  = s->errorres ? 0 : get_bits(&s->gb, 2);
-        if (s->intraonly) {
+        s->s.h.intraonly  = s->s.h.invisible ? get_bits1(&s->gb) : 0;
+        s->s.h.resetctx   = s->s.h.errorres ? 0 : get_bits(&s->gb, 2);
+        if (s->s.h.intraonly) {
             if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
-                av_log(avctx, AV_LOG_ERROR, "Invalid sync code\n");
+                av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
                 return AVERROR_INVALIDDATA;
             }
-            s->refreshrefmask = get_bits(&s->gb, 8);
+            if (ctx->profile >= 1) {
+                if ((res = read_colorspace_details(ctx)) < 0)
+                    return res;
+            } else {
+                s->ss_h = s->ss_v = 1;
+                s->bpp = 8;
+                s->bpp_index = 0;
+                s->bytesperpixel = 1;
+                s->pix_fmt = AV_PIX_FMT_YUV420P;
+                ctx->colorspace = AVCOL_SPC_BT470BG;
+                ctx->color_range = AVCOL_RANGE_JPEG;
+            }
+            s->s.h.refreshrefmask = get_bits(&s->gb, 8);
             w = get_bits(&s->gb, 16) + 1;
             h = get_bits(&s->gb, 16) + 1;
             if (get_bits1(&s->gb)) // display size
                 skip_bits(&s->gb, 32);
         } else {
-            s->refreshrefmask = get_bits(&s->gb, 8);
-            s->refidx[0]      = get_bits(&s->gb, 3);
-            s->signbias[0]    = get_bits1(&s->gb);
-            s->refidx[1]      = get_bits(&s->gb, 3);
-            s->signbias[1]    = get_bits1(&s->gb);
-            s->refidx[2]      = get_bits(&s->gb, 3);
-            s->signbias[2]    = get_bits1(&s->gb);
-            if (!s->refs[s->refidx[0]]->buf[0] ||
-                !s->refs[s->refidx[1]]->buf[0] ||
-                !s->refs[s->refidx[2]]->buf[0]) {
-                av_log(avctx, AV_LOG_ERROR,
-                       "Not all references are available\n");
+            s->s.h.refreshrefmask = get_bits(&s->gb, 8);
+            s->s.h.refidx[0]      = get_bits(&s->gb, 3);
+            s->s.h.signbias[0]    = get_bits1(&s->gb) && !s->s.h.errorres;
+            s->s.h.refidx[1]      = get_bits(&s->gb, 3);
+            s->s.h.signbias[1]    = get_bits1(&s->gb) && !s->s.h.errorres;
+            s->s.h.refidx[2]      = get_bits(&s->gb, 3);
+            s->s.h.signbias[2]    = get_bits1(&s->gb) && !s->s.h.errorres;
+            if (!s->s.refs[s->s.h.refidx[0]].f->buf[0] ||
+                !s->s.refs[s->s.h.refidx[1]].f->buf[0] ||
+                !s->s.refs[s->s.h.refidx[2]].f->buf[0]) {
+                av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
                 return AVERROR_INVALIDDATA;
             }
             if (get_bits1(&s->gb)) {
-                w = s->refs[s->refidx[0]]->width;
-                h = s->refs[s->refidx[0]]->height;
+                w = s->s.refs[s->s.h.refidx[0]].f->width;
+                h = s->s.refs[s->s.h.refidx[0]].f->height;
             } else if (get_bits1(&s->gb)) {
-                w = s->refs[s->refidx[1]]->width;
-                h = s->refs[s->refidx[1]]->height;
+                w = s->s.refs[s->s.h.refidx[1]].f->width;
+                h = s->s.refs[s->s.h.refidx[1]].f->height;
             } else if (get_bits1(&s->gb)) {
-                w = s->refs[s->refidx[2]]->width;
-                h = s->refs[s->refidx[2]]->height;
+                w = s->s.refs[s->s.h.refidx[2]].f->width;
+                h = s->s.refs[s->s.h.refidx[2]].f->height;
             } else {
                 w = get_bits(&s->gb, 16) + 1;
                 h = get_bits(&s->gb, 16) + 1;
             }
+            // Note that in this code, "CUR_FRAME" is actually before we
+            // have formally allocated a frame, and thus actually represents
+            // the _last_ frame
+            s->s.h.use_last_frame_mvs &= s->s.frames[CUR_FRAME].tf.f->width == w &&
+                                       s->s.frames[CUR_FRAME].tf.f->height == h;
             if (get_bits1(&s->gb)) // display size
                 skip_bits(&s->gb, 32);
-            s->highprecisionmvs = get_bits1(&s->gb);
-            s->filtermode       = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
-                                  get_bits(&s->gb, 2);
-            s->allowcompinter   = s->signbias[0] != s->signbias[1] ||
-                                  s->signbias[0] != s->signbias[2];
-            if (s->allowcompinter) {
-                if (s->signbias[0] == s->signbias[1]) {
-                    s->fixcompref    = 2;
-                    s->varcompref[0] = 0;
-                    s->varcompref[1] = 1;
-                } else if (s->signbias[0] == s->signbias[2]) {
-                    s->fixcompref    = 1;
-                    s->varcompref[0] = 0;
-                    s->varcompref[1] = 2;
+            s->s.h.highprecisionmvs = get_bits1(&s->gb);
+            s->s.h.filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
+                                                  get_bits(&s->gb, 2);
+            s->s.h.allowcompinter = s->s.h.signbias[0] != s->s.h.signbias[1] ||
+                                  s->s.h.signbias[0] != s->s.h.signbias[2];
+            if (s->s.h.allowcompinter) {
+                if (s->s.h.signbias[0] == s->s.h.signbias[1]) {
+                    s->s.h.fixcompref    = 2;
+                    s->s.h.varcompref[0] = 0;
+                    s->s.h.varcompref[1] = 1;
+                } else if (s->s.h.signbias[0] == s->s.h.signbias[2]) {
+                    s->s.h.fixcompref    = 1;
+                    s->s.h.varcompref[0] = 0;
+                    s->s.h.varcompref[1] = 2;
                 } else {
-                    s->fixcompref    = 0;
-                    s->varcompref[0] = 1;
-                    s->varcompref[1] = 2;
+                    s->s.h.fixcompref    = 0;
+                    s->s.h.varcompref[0] = 1;
+                    s->s.h.varcompref[1] = 2;
                 }
             }
         }
     }
-
-    s->refreshctx   = s->errorres ? 0 : get_bits1(&s->gb);
-    s->parallelmode = s->errorres ? 1 : get_bits1(&s->gb);
-    s->framectxid   = c = get_bits(&s->gb, 2);
+    s->s.h.refreshctx   = s->s.h.errorres ? 0 : get_bits1(&s->gb);
+    s->s.h.parallelmode = s->s.h.errorres ? 1 : get_bits1(&s->gb);
+    s->s.h.framectxid   = c = get_bits(&s->gb, 2);
+    if (s->s.h.keyframe || s->s.h.intraonly)
+        s->s.h.framectxid = 0; // BUG: libvpx ignores this field in keyframes
 
     /* loopfilter header data */
-    s->filter.level = get_bits(&s->gb, 6);
-    sharp           = get_bits(&s->gb, 3);
-    /* If sharpness changed, reinit lim/mblim LUTs. if it didn't change,
-     * keep the old cache values since they are still valid. */
-    if (s->filter.sharpness != sharp)
-        memset(s->filter.lim_lut, 0, sizeof(s->filter.lim_lut));
-    s->filter.sharpness = sharp;
-    if ((s->lf_delta.enabled = get_bits1(&s->gb))) {
-        if (get_bits1(&s->gb)) {
+    if (s->s.h.keyframe || s->s.h.errorres || s->s.h.intraonly) {
+        // reset loopfilter defaults
+        s->s.h.lf_delta.ref[0] = 1;
+        s->s.h.lf_delta.ref[1] = 0;
+        s->s.h.lf_delta.ref[2] = -1;
+        s->s.h.lf_delta.ref[3] = -1;
+        s->s.h.lf_delta.mode[0] = 0;
+        s->s.h.lf_delta.mode[1] = 0;
+        memset(s->s.h.segmentation.feat, 0, sizeof(s->s.h.segmentation.feat));
+    }
+    s->s.h.filter.level = get_bits(&s->gb, 6);
+    sharp = get_bits(&s->gb, 3);
+    // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
+    // the old cache values since they are still valid
+    if (s->s.h.filter.sharpness != sharp)
+        memset(s->filter_lut.lim_lut, 0, sizeof(s->filter_lut.lim_lut));
+    s->s.h.filter.sharpness = sharp;
+    if ((s->s.h.lf_delta.enabled = get_bits1(&s->gb))) {
+        if ((s->s.h.lf_delta.updated = get_bits1(&s->gb))) {
             for (i = 0; i < 4; i++)
                 if (get_bits1(&s->gb))
-                    s->lf_delta.ref[i] = get_bits_with_sign(&s->gb, 6);
+                    s->s.h.lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
             for (i = 0; i < 2; i++)
                 if (get_bits1(&s->gb))
-                    s->lf_delta.mode[i] = get_bits_with_sign(&s->gb, 6);
+                    s->s.h.lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
         }
-    } else {
-        memset(&s->lf_delta, 0, sizeof(s->lf_delta));
     }
 
     /* quantization header data */
-    s->yac_qi      = get_bits(&s->gb, 8);
-    s->ydc_qdelta  = get_bits1(&s->gb) ? get_bits_with_sign(&s->gb, 4) : 0;
-    s->uvdc_qdelta = get_bits1(&s->gb) ? get_bits_with_sign(&s->gb, 4) : 0;
-    s->uvac_qdelta = get_bits1(&s->gb) ? get_bits_with_sign(&s->gb, 4) : 0;
-    s->lossless    = s->yac_qi == 0 && s->ydc_qdelta == 0 &&
-                     s->uvdc_qdelta == 0 && s->uvac_qdelta == 0;
+    s->s.h.yac_qi      = get_bits(&s->gb, 8);
+    s->s.h.ydc_qdelta  = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
+    s->s.h.uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
+    s->s.h.uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
+    s->s.h.lossless    = s->s.h.yac_qi == 0 && s->s.h.ydc_qdelta == 0 &&
+                       s->s.h.uvdc_qdelta == 0 && s->s.h.uvac_qdelta == 0;
+    if (s->s.h.lossless)
+        ctx->properties |= FF_CODEC_PROPERTY_LOSSLESS;
 
     /* segmentation header info */
-    if ((s->segmentation.enabled = get_bits1(&s->gb))) {
-        if ((s->segmentation.update_map = get_bits1(&s->gb))) {
+    if ((s->s.h.segmentation.enabled = get_bits1(&s->gb))) {
+        if ((s->s.h.segmentation.update_map = get_bits1(&s->gb))) {
             for (i = 0; i < 7; i++)
-                s->prob.seg[i] = get_bits1(&s->gb) ?
+                s->s.h.segmentation.prob[i] = get_bits1(&s->gb) ?
                                  get_bits(&s->gb, 8) : 255;
-            if ((s->segmentation.temporal = get_bits1(&s->gb)))
+            if ((s->s.h.segmentation.temporal = get_bits1(&s->gb))) {
                 for (i = 0; i < 3; i++)
-                    s->prob.segpred[i] = get_bits1(&s->gb) ?
+                    s->s.h.segmentation.pred_prob[i] = get_bits1(&s->gb) ?
                                          get_bits(&s->gb, 8) : 255;
+            }
         }
 
         if (get_bits1(&s->gb)) {
-            s->segmentation.absolute_vals = get_bits1(&s->gb);
+            s->s.h.segmentation.absolute_vals = get_bits1(&s->gb);
             for (i = 0; i < 8; i++) {
-                if ((s->segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
-                    s->segmentation.feat[i].q_val = get_bits_with_sign(&s->gb, 8);
-                if ((s->segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
-                    s->segmentation.feat[i].lf_val = get_bits_with_sign(&s->gb, 6);
-                if ((s->segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
-                    s->segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
-                s->segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
+                if ((s->s.h.segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
+                    s->s.h.segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
+                if ((s->s.h.segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
+                    s->s.h.segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
+                if ((s->s.h.segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
+                    s->s.h.segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
+                s->s.h.segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
             }
         }
-    } else {
-        s->segmentation.feat[0].q_enabled    = 0;
-        s->segmentation.feat[0].lf_enabled   = 0;
-        s->segmentation.feat[0].skip_enabled = 0;
-        s->segmentation.feat[0].ref_enabled  = 0;
     }
 
     // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
-    for (i = 0; i < (s->segmentation.enabled ? 8 : 1); i++) {
+    for (i = 0; i < (s->s.h.segmentation.enabled ? 8 : 1); i++) {
         int qyac, qydc, quvac, quvdc, lflvl, sh;
 
-        if (s->segmentation.feat[i].q_enabled) {
-            if (s->segmentation.absolute_vals)
-                qyac = s->segmentation.feat[i].q_val;
+        if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[i].q_enabled) {
+            if (s->s.h.segmentation.absolute_vals)
+                qyac = av_clip_uintp2(s->s.h.segmentation.feat[i].q_val, 8);
             else
-                qyac = s->yac_qi + s->segmentation.feat[i].q_val;
+                qyac = av_clip_uintp2(s->s.h.yac_qi + s->s.h.segmentation.feat[i].q_val, 8);
         } else {
-            qyac = s->yac_qi;
+            qyac  = s->s.h.yac_qi;
         }
-        qydc  = av_clip_uintp2(qyac + s->ydc_qdelta, 8);
-        quvdc = av_clip_uintp2(qyac + s->uvdc_qdelta, 8);
-        quvac = av_clip_uintp2(qyac + s->uvac_qdelta, 8);
+        qydc  = av_clip_uintp2(qyac + s->s.h.ydc_qdelta, 8);
+        quvdc = av_clip_uintp2(qyac + s->s.h.uvdc_qdelta, 8);
+        quvac = av_clip_uintp2(qyac + s->s.h.uvac_qdelta, 8);
         qyac  = av_clip_uintp2(qyac, 8);
 
-        s->segmentation.feat[i].qmul[0][0] = ff_vp9_dc_qlookup[qydc];
-        s->segmentation.feat[i].qmul[0][1] = ff_vp9_ac_qlookup[qyac];
-        s->segmentation.feat[i].qmul[1][0] = ff_vp9_dc_qlookup[quvdc];
-        s->segmentation.feat[i].qmul[1][1] = ff_vp9_ac_qlookup[quvac];
+        s->s.h.segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[s->bpp_index][qydc];
+        s->s.h.segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[s->bpp_index][qyac];
+        s->s.h.segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[s->bpp_index][quvdc];
+        s->s.h.segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[s->bpp_index][quvac];
 
-        sh = s->filter.level >= 32;
-        if (s->segmentation.feat[i].lf_enabled) {
-            if (s->segmentation.absolute_vals)
-                lflvl = s->segmentation.feat[i].lf_val;
+        sh = s->s.h.filter.level >= 32;
+        if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[i].lf_enabled) {
+            if (s->s.h.segmentation.absolute_vals)
+                lflvl = av_clip_uintp2(s->s.h.segmentation.feat[i].lf_val, 6);
             else
-                lflvl = s->filter.level + s->segmentation.feat[i].lf_val;
+                lflvl = av_clip_uintp2(s->s.h.filter.level + s->s.h.segmentation.feat[i].lf_val, 6);
         } else {
-            lflvl = s->filter.level;
+            lflvl  = s->s.h.filter.level;
         }
-        s->segmentation.feat[i].lflvl[0][0] =
-        s->segmentation.feat[i].lflvl[0][1] =
-            av_clip_uintp2(lflvl + (s->lf_delta.ref[0] << sh), 6);
-        for (j = 1; j < 4; j++) {
-            s->segmentation.feat[i].lflvl[j][0] =
-                av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
-                                         s->lf_delta.mode[0]) << sh), 6);
-            s->segmentation.feat[i].lflvl[j][1] =
-                av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
-                                         s->lf_delta.mode[1]) << sh), 6);
+        if (s->s.h.lf_delta.enabled) {
+            s->s.h.segmentation.feat[i].lflvl[0][0] =
+            s->s.h.segmentation.feat[i].lflvl[0][1] =
+                av_clip_uintp2(lflvl + (s->s.h.lf_delta.ref[0] * (1 << sh)), 6);
+            for (j = 1; j < 4; j++) {
+                s->s.h.segmentation.feat[i].lflvl[j][0] =
+                    av_clip_uintp2(lflvl + ((s->s.h.lf_delta.ref[j] +
+                                             s->s.h.lf_delta.mode[0]) * (1 << sh)), 6);
+                s->s.h.segmentation.feat[i].lflvl[j][1] =
+                    av_clip_uintp2(lflvl + ((s->s.h.lf_delta.ref[j] +
+                                             s->s.h.lf_delta.mode[1]) * (1 << sh)), 6);
+            }
+        } else {
+            memset(s->s.h.segmentation.feat[i].lflvl, lflvl,
+                   sizeof(s->s.h.segmentation.feat[i].lflvl));
         }
     }
 
     /* tiling info */
-    if ((ret = update_size(avctx, w, h)) < 0) {
-        av_log(avctx, AV_LOG_ERROR,
-               "Failed to initialize decoder for %dx%d\n", w, h);
-        return ret;
+    if ((res = update_size(ctx, w, h)) < 0) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d @ %d\n",
+               w, h, s->pix_fmt);
+        return res;
     }
-    for (s->tiling.log2_tile_cols = 0;
-         (s->sb_cols >> s->tiling.log2_tile_cols) > 64;
-         s->tiling.log2_tile_cols++) ;
+    for (s->s.h.tiling.log2_tile_cols = 0;
+         s->sb_cols > (64 << s->s.h.tiling.log2_tile_cols);
+         s->s.h.tiling.log2_tile_cols++) ;
     for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
     max = FFMAX(0, max - 1);
-    while (max > s->tiling.log2_tile_cols) {
+    while (max > s->s.h.tiling.log2_tile_cols) {
         if (get_bits1(&s->gb))
-            s->tiling.log2_tile_cols++;
+            s->s.h.tiling.log2_tile_cols++;
         else
             break;
     }
-    s->tiling.log2_tile_rows = decode012(&s->gb);
-    s->tiling.tile_rows      = 1 << s->tiling.log2_tile_rows;
-    if (s->tiling.tile_cols != (1 << s->tiling.log2_tile_cols)) {
-        s->tiling.tile_cols = 1 << s->tiling.log2_tile_cols;
-        s->c_b              = av_fast_realloc(s->c_b, &s->c_b_size,
-                                              sizeof(VP56RangeCoder) *
-                                              s->tiling.tile_cols);
+    s->s.h.tiling.log2_tile_rows = decode012(&s->gb);
+    s->s.h.tiling.tile_rows = 1 << s->s.h.tiling.log2_tile_rows;
+    if (s->s.h.tiling.tile_cols != (1 << s->s.h.tiling.log2_tile_cols)) {
+        s->s.h.tiling.tile_cols = 1 << s->s.h.tiling.log2_tile_cols;
+        s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
+                                 sizeof(VP56RangeCoder) * s->s.h.tiling.tile_cols);
         if (!s->c_b) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Ran out of memory during range coder init\n");
+            av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
             return AVERROR(ENOMEM);
         }
     }
 
-    if (s->keyframe || s->errorres || s->intraonly) {
-        s->prob_ctx[0].p =
-        s->prob_ctx[1].p =
-        s->prob_ctx[2].p =
-        s->prob_ctx[3].p = ff_vp9_default_probs;
-        memcpy(s->prob_ctx[0].coef, ff_vp9_default_coef_probs,
-               sizeof(ff_vp9_default_coef_probs));
-        memcpy(s->prob_ctx[1].coef, ff_vp9_default_coef_probs,
-               sizeof(ff_vp9_default_coef_probs));
-        memcpy(s->prob_ctx[2].coef, ff_vp9_default_coef_probs,
-               sizeof(ff_vp9_default_coef_probs));
-        memcpy(s->prob_ctx[3].coef, ff_vp9_default_coef_probs,
-               sizeof(ff_vp9_default_coef_probs));
+    /* check reference frames */
+    if (!s->s.h.keyframe && !s->s.h.intraonly) {
+        for (i = 0; i < 3; i++) {
+            AVFrame *ref = s->s.refs[s->s.h.refidx[i]].f;
+            int refw = ref->width, refh = ref->height;
+
+            if (ref->format != ctx->pix_fmt) {
+                av_log(ctx, AV_LOG_ERROR,
+                       "Ref pixfmt (%s) did not match current frame (%s)",
+                       av_get_pix_fmt_name(ref->format),
+                       av_get_pix_fmt_name(ctx->pix_fmt));
+                return AVERROR_INVALIDDATA;
+            } else if (refw == w && refh == h) {
+                s->mvscale[i][0] = s->mvscale[i][1] = 0;
+            } else {
+                if (w * 2 < refw || h * 2 < refh || w > 16 * refw || h > 16 * refh) {
+                    av_log(ctx, AV_LOG_ERROR,
+                           "Invalid ref frame dimensions %dx%d for frame size %dx%d\n",
+                           refw, refh, w, h);
+                    return AVERROR_INVALIDDATA;
+                }
+                s->mvscale[i][0] = (refw << 14) / w;
+                s->mvscale[i][1] = (refh << 14) / h;
+                s->mvstep[i][0] = 16 * s->mvscale[i][0] >> 14;
+                s->mvstep[i][1] = 16 * s->mvscale[i][1] >> 14;
+            }
+        }
+    }
+
+    if (s->s.h.keyframe || s->s.h.errorres || (s->s.h.intraonly && s->s.h.resetctx == 3)) {
+        s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
+                           s->prob_ctx[3].p = vp9_default_probs;
+        memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
+               sizeof(vp9_default_coef_probs));
+        memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
+               sizeof(vp9_default_coef_probs));
+        memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
+               sizeof(vp9_default_coef_probs));
+        memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
+               sizeof(vp9_default_coef_probs));
+    } else if (s->s.h.intraonly && s->s.h.resetctx == 2) {
+        s->prob_ctx[c].p = vp9_default_probs;
+        memcpy(s->prob_ctx[c].coef, vp9_default_coef_probs,
+               sizeof(vp9_default_coef_probs));
     }
 
     // next 16 bits is size of the rest of the header (arith-coded)
-    size2 = get_bits(&s->gb, 16);
+    s->s.h.compressed_header_size = size2 = get_bits(&s->gb, 16);
+    s->s.h.uncompressed_header_size = (get_bits_count(&s->gb) + 7) / 8;
+
     data2 = align_get_bits(&s->gb);
     if (size2 > size - (data2 - data)) {
-        av_log(avctx, AV_LOG_ERROR, "Invalid compressed header size\n");
+        av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
         return AVERROR_INVALIDDATA;
     }
     ff_vp56_init_range_decoder(&s->c, data2, size2);
     if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
-        av_log(avctx, AV_LOG_ERROR, "Marker bit was set\n");
+        av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
         return AVERROR_INVALIDDATA;
     }
 
-    if (s->keyframe || s->intraonly)
-        memset(s->counts.coef, 0,
-               sizeof(s->counts.coef) + sizeof(s->counts.eob));
-    else
+    if (s->s.h.keyframe || s->s.h.intraonly) {
+        memset(s->counts.coef, 0, sizeof(s->counts.coef));
+        memset(s->counts.eob,  0, sizeof(s->counts.eob));
+    } else {
         memset(&s->counts, 0, sizeof(s->counts));
-
-    /* FIXME is it faster to not copy here, but do it down in the fw updates
-     * as explicit copies if the fw update is missing (and skip the copy upon
-     * fw update)? */
+    }
+    // FIXME is it faster to not copy here, but do it down in the fw updates
+    // as explicit copies if the fw update is missing (and skip the copy upon
+    // fw update)?
     s->prob.p = s->prob_ctx[c].p;
 
     // txfm updates
-    if (s->lossless) {
-        s->txfmmode = TX_4X4;
+    if (s->s.h.lossless) {
+        s->s.h.txfmmode = TX_4X4;
     } else {
-        s->txfmmode = vp8_rac_get_uint(&s->c, 2);
-        if (s->txfmmode == 3)
-            s->txfmmode += vp8_rac_get(&s->c);
+        s->s.h.txfmmode = vp8_rac_get_uint(&s->c, 2);
+        if (s->s.h.txfmmode == 3)
+            s->s.h.txfmmode += vp8_rac_get(&s->c);
 
-        if (s->txfmmode == TX_SWITCHABLE) {
+        if (s->s.h.txfmmode == TX_SWITCHABLE) {
             for (i = 0; i < 2; i++)
                 if (vp56_rac_get_prob_branchy(&s->c, 252))
                     s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
@@ -529,10 +899,11 @@ static int decode_frame_header(AVCodecContext *avctx,
                             if (m >= 3 && l == 0) // dc only has 3 pt
                                 break;
                             for (n = 0; n < 3; n++) {
-                                if (vp56_rac_get_prob_branchy(&s->c, 252))
+                                if (vp56_rac_get_prob_branchy(&s->c, 252)) {
                                     p[n] = update_prob(&s->c, r[n]);
-                                else
+                                } else {
                                     p[n] = r[n];
+                                }
                             }
                             p[3] = 0;
                         }
@@ -549,7 +920,7 @@ static int decode_frame_header(AVCodecContext *avctx,
                             p[3] = 0;
                         }
         }
-        if (s->txfmmode == i)
+        if (s->s.h.txfmmode == i)
             break;
     }
 
@@ -557,14 +928,14 @@ static int decode_frame_header(AVCodecContext *avctx,
     for (i = 0; i < 3; i++)
         if (vp56_rac_get_prob_branchy(&s->c, 252))
             s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
-    if (!s->keyframe && !s->intraonly) {
+    if (!s->s.h.keyframe && !s->s.h.intraonly) {
         for (i = 0; i < 7; i++)
             for (j = 0; j < 3; j++)
                 if (vp56_rac_get_prob_branchy(&s->c, 252))
                     s->prob.p.mv_mode[i][j] =
                         update_prob(&s->c, s->prob.p.mv_mode[i][j]);
 
-        if (s->filtermode == FILTER_SWITCHABLE)
+        if (s->s.h.filtermode == FILTER_SWITCHABLE)
             for (i = 0; i < 4; i++)
                 for (j = 0; j < 2; j++)
                     if (vp56_rac_get_prob_branchy(&s->c, 252))
@@ -575,20 +946,20 @@ static int decode_frame_header(AVCodecContext *avctx,
             if (vp56_rac_get_prob_branchy(&s->c, 252))
                 s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
 
-        if (s->allowcompinter) {
-            s->comppredmode = vp8_rac_get(&s->c);
-            if (s->comppredmode)
-                s->comppredmode += vp8_rac_get(&s->c);
-            if (s->comppredmode == PRED_SWITCHABLE)
+        if (s->s.h.allowcompinter) {
+            s->s.h.comppredmode = vp8_rac_get(&s->c);
+            if (s->s.h.comppredmode)
+                s->s.h.comppredmode += vp8_rac_get(&s->c);
+            if (s->s.h.comppredmode == PRED_SWITCHABLE)
                 for (i = 0; i < 5; i++)
                     if (vp56_rac_get_prob_branchy(&s->c, 252))
                         s->prob.p.comp[i] =
                             update_prob(&s->c, s->prob.p.comp[i]);
         } else {
-            s->comppredmode = PRED_SINGLEREF;
+            s->s.h.comppredmode = PRED_SINGLEREF;
         }
 
-        if (s->comppredmode != PRED_COMPREF) {
+        if (s->s.h.comppredmode != PRED_COMPREF) {
             for (i = 0; i < 5; i++) {
                 if (vp56_rac_get_prob_branchy(&s->c, 252))
                     s->prob.p.single_ref[i][0] =
@@ -599,7 +970,7 @@ static int decode_frame_header(AVCodecContext *avctx,
             }
         }
 
-        if (s->comppredmode != PRED_SINGLEREF) {
+        if (s->s.h.comppredmode != PRED_SINGLEREF) {
             for (i = 0; i < 5; i++)
                 if (vp56_rac_get_prob_branchy(&s->c, 252))
                     s->prob.p.comp_ref[i] =
@@ -617,8 +988,7 @@ static int decode_frame_header(AVCodecContext *avctx,
                 for (k = 0; k < 3; k++)
                     if (vp56_rac_get_prob_branchy(&s->c, 252))
                         s->prob.p.partition[3 - i][j][k] =
-                            update_prob(&s->c,
-                                        s->prob.p.partition[3 - i][j][k]);
+                            update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
 
         // mv fields don't use the update_prob subexp model for some reason
         for (i = 0; i < 3; i++)
@@ -627,8 +997,7 @@ static int decode_frame_header(AVCodecContext *avctx,
 
         for (i = 0; i < 2; i++) {
             if (vp56_rac_get_prob_branchy(&s->c, 252))
-                s->prob.p.mv_comp[i].sign =
-                    (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
+                s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 
             for (j = 0; j < 10; j++)
                 if (vp56_rac_get_prob_branchy(&s->c, 252))
@@ -636,8 +1005,7 @@ static int decode_frame_header(AVCodecContext *avctx,
                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 
             if (vp56_rac_get_prob_branchy(&s->c, 252))
-                s->prob.p.mv_comp[i].class0 =
-                    (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
+                s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
 
             for (j = 0; j < 10; j++)
                 if (vp56_rac_get_prob_branchy(&s->c, 252))
@@ -658,7 +1026,7 @@ static int decode_frame_header(AVCodecContext *avctx,
                         (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
         }
 
-        if (s->highprecisionmvs) {
+        if (s->s.h.highprecisionmvs) {
             for (i = 0; i < 2; i++) {
                 if (vp56_rac_get_prob_branchy(&s->c, 252))
                     s->prob.p.mv_comp[i].class0_hp =
@@ -674,640 +1042,3325 @@ static int decode_frame_header(AVCodecContext *avctx,
     return (data2 - data) + size2;
 }
 
-static int decode_subblock(AVCodecContext *avctx, int row, int col,
-                           VP9Filter *lflvl,
-                           ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
+// Clip both components of *src into [s->min_mv, s->max_mv] and store them in *dst.
+static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src, VP9Context *s)
+{
+    dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
+    dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
+}
+
+// Predict the MV for reference frame `ref` into *pmv: the first candidate found if
+// idx == 0, otherwise the first candidate that differs from the first one; a clamped
+// zero MV if the search runs dry. z indexes b->mv[][z]; sb < 0 skips the sub-block path.
+static void find_ref_mvs(VP9Context *s, VP56mv *pmv, int ref, int z, int idx, int sb)
+{
+    static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
+        [BS_64x64] = {{  3, -1 }, { -1,  3 }, {  4, -1 }, { -1,  4 },
+                      { -1, -1 }, {  0, -1 }, { -1,  0 }, {  6, -1 }},
+        [BS_64x32] = {{  0, -1 }, { -1,  0 }, {  4, -1 }, { -1,  2 },
+                      { -1, -1 }, {  0, -3 }, { -3,  0 }, {  2, -1 }},
+        [BS_32x64] = {{ -1,  0 }, {  0, -1 }, { -1,  4 }, {  2, -1 },
+                      { -1, -1 }, { -3,  0 }, {  0, -3 }, { -1,  2 }},
+        [BS_32x32] = {{  1, -1 }, { -1,  1 }, {  2, -1 }, { -1,  2 },
+                      { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
+        [BS_32x16] = {{  0, -1 }, { -1,  0 }, {  2, -1 }, { -1, -1 },
+                      { -1,  1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
+        [BS_16x32] = {{ -1,  0 }, {  0, -1 }, { -1,  2 }, { -1, -1 },
+                      {  1, -1 }, { -3,  0 }, {  0, -3 }, { -3, -3 }},
+        [BS_16x16] = {{  0, -1 }, { -1,  0 }, {  1, -1 }, { -1,  1 },
+                      { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
+        [BS_16x8]  = {{  0, -1 }, { -1,  0 }, {  1, -1 }, { -1, -1 },
+                      {  0, -2 }, { -2,  0 }, { -2, -1 }, { -1, -2 }},
+        [BS_8x16]  = {{ -1,  0 }, {  0, -1 }, { -1,  1 }, { -1, -1 },
+                      { -2,  0 }, {  0, -2 }, { -1, -2 }, { -2, -1 }},
+        [BS_8x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
+                      { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
+        [BS_8x4]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
+                      { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
+        [BS_4x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
+                      { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
+        [BS_4x4]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
+                      { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
+    };
+    VP9Block *b = s->b;
+    int row = s->row, col = s->col, row7 = s->row7, i = 0;
+    const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
+#define INVALID_MV 0x80008000U
+    uint32_t mem = INVALID_MV, mem_sub8x8 = INVALID_MV; // first candidates seen so far
+
+#define RETURN_DIRECT_MV(mv) \
+    do { \
+        uint32_t m = AV_RN32A(&mv); \
+        if (!idx) { \
+            AV_WN32A(pmv, m); \
+            return; \
+        } else if (mem == INVALID_MV) { \
+            mem = m; \
+        } else if (m != mem) { \
+            AV_WN32A(pmv, m); \
+            return; \
+        } \
+    } while (0)
+
+    if (sb >= 0) {
+        if (sb == 2 || sb == 1) {
+            RETURN_DIRECT_MV(b->mv[0][z]);
+        } else if (sb == 3) {
+            RETURN_DIRECT_MV(b->mv[2][z]);
+            RETURN_DIRECT_MV(b->mv[1][z]);
+            RETURN_DIRECT_MV(b->mv[0][z]);
+        }
+
+#define RETURN_MV(mv) \
+    do { \
+        if (sb > 0) { \
+            VP56mv tmp; \
+            uint32_t m; \
+            av_assert2(idx == 1); \
+            av_assert2(mem != INVALID_MV); \
+            if (mem_sub8x8 == INVALID_MV) { \
+                clamp_mv(&tmp, &mv, s); \
+                m = AV_RN32A(&tmp); \
+                if (m != mem) { \
+                    AV_WN32A(pmv, m); \
+                    return; \
+                } \
+                mem_sub8x8 = AV_RN32A(&mv); \
+            } else if (mem_sub8x8 != AV_RN32A(&mv)) { \
+                clamp_mv(&tmp, &mv, s); \
+                m = AV_RN32A(&tmp); \
+                if (m != mem) { \
+                    AV_WN32A(pmv, m); \
+                } else { \
+                    /* BUG I'm pretty sure this isn't the intention */ \
+                    AV_WN32A(pmv, 0); \
+                } \
+                return; \
+            } \
+        } else { \
+            uint32_t m = AV_RN32A(&mv); \
+            if (!idx) { \
+                clamp_mv(pmv, &mv, s); \
+                return; \
+            } else if (mem == INVALID_MV) { \
+                mem = m; \
+            } else if (m != mem) { \
+                clamp_mv(pmv, &mv, s); \
+                return; \
+            } \
+        } \
+    } while (0)
+
+        if (row > 0) {
+            struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
+            if (mv->ref[0] == ref) {
+                RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
+            } else if (mv->ref[1] == ref) {
+                RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
+            }
+        }
+        if (col > s->tile_col_start) {
+            struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
+            if (mv->ref[0] == ref) {
+                RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
+            } else if (mv->ref[1] == ref) {
+                RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
+            }
+        }
+        i = 2; // skip p[0] and p[1] below (presumably the above/left neighbours just tried)
+    }
+
+    // previously coded MVs in this neighbourhood, using same reference frame
+    for (; i < 8; i++) {
+        int c = p[i][0] + col, r = p[i][1] + row;
+
+        if (c >= s->tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
+            struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
+
+            if (mv->ref[0] == ref) {
+                RETURN_MV(mv->mv[0]);
+            } else if (mv->ref[1] == ref) {
+                RETURN_MV(mv->mv[1]);
+            }
+        }
+    }
+
+    // MV at this position in previous frame, using same reference frame
+    if (s->s.h.use_last_frame_mvs) {
+        struct VP9mvrefPair *mv = &s->s.frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
+
+        if (!s->s.frames[REF_FRAME_MVPAIR].uses_2pass)
+            ff_thread_await_progress(&s->s.frames[REF_FRAME_MVPAIR].tf, row >> 3, 0);
+        if (mv->ref[0] == ref) {
+            RETURN_MV(mv->mv[0]);
+        } else if (mv->ref[1] == ref) {
+            RETURN_MV(mv->mv[1]);
+        }
+    }
+
+#define RETURN_SCALE_MV(mv, scale) \
+    do { \
+        if (scale) { \
+            VP56mv mv_temp = { -mv.x, -mv.y }; \
+            RETURN_MV(mv_temp); \
+        } else { \
+            RETURN_MV(mv); \
+        } \
+    } while (0)
+
+    // previously coded MVs in this neighbourhood, using different reference frame
+    for (i = 0; i < 8; i++) {
+        int c = p[i][0] + col, r = p[i][1] + row;
+
+        if (c >= s->tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
+            struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
+
+            if (mv->ref[0] != ref && mv->ref[0] >= 0) {
+                RETURN_SCALE_MV(mv->mv[0], s->s.h.signbias[mv->ref[0]] != s->s.h.signbias[ref]);
+            }
+            if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
+                // BUG - libvpx has this condition regardless of whether
+                // we used the first ref MV and pre-scaling
+                AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
+                RETURN_SCALE_MV(mv->mv[1], s->s.h.signbias[mv->ref[1]] != s->s.h.signbias[ref]);
+            }
+        }
+    }
+
+    // MV at this position in previous frame, using different reference frame
+    if (s->s.h.use_last_frame_mvs) {
+        struct VP9mvrefPair *mv = &s->s.frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
+
+        // no need to await_progress, because we already did that above
+        if (mv->ref[0] != ref && mv->ref[0] >= 0) {
+            RETURN_SCALE_MV(mv->mv[0], s->s.h.signbias[mv->ref[0]] != s->s.h.signbias[ref]);
+        }
+        if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
+            // BUG - libvpx has this condition regardless of whether
+            // we used the first ref MV and pre-scaling
+            AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
+            RETURN_SCALE_MV(mv->mv[1], s->s.h.signbias[mv->ref[1]] != s->s.h.signbias[ref]);
+        }
+    }
+
+    AV_ZERO32(pmv);
+    clamp_mv(pmv, pmv, s);
+#undef INVALID_MV
+#undef RETURN_DIRECT_MV
+#undef RETURN_MV
+#undef RETURN_SCALE_MV
+}
+
+/**
+ * Decode one motion vector component from the range coder and update the
+ * matching s->counts.mv_comp[idx] statistics.
+ *
+ * @param s   decoder context (range coder s->c, probabilities s->prob.p, counts)
+ * @param idx selects the s->prob.p.mv_comp[] set (fill_mv() uses 0 for y, 1 for x)
+ * @param hp  non-zero if the high-precision bit is coded; otherwise it is taken as 1
+ * @return the signed component, never zero: +/-(magnitude + 1)
+ */
+static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
+{
+    int neg = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
+    int cls = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
+                               s->prob.p.mv_comp[idx].classes);
+    int mag = 0, frac, hbit = 1, k;
+
+    s->counts.mv_comp[idx].sign[neg]++;
+    s->counts.mv_comp[idx].classes[cls]++;
+    if (!cls) {
+        // class 0: a single integer bit, with its own fp/hp probabilities
+        mag = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
+        s->counts.mv_comp[idx].class0[mag]++;
+        frac = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
+                                s->prob.p.mv_comp[idx].class0_fp[mag]);
+        s->counts.mv_comp[idx].class0_fp[mag][frac]++;
+        if (hp)
+            hbit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
+        // bug in libvpx - we count for bw entropy purposes even if the
+        // bit wasn't coded (hbit is still 1 then)
+        s->counts.mv_comp[idx].class0_hp[hbit]++;
+        mag = (mag << 3) | (frac << 1) | hbit;
+    } else {
+        // class cls > 0: cls integer bits, least significant first
+        for (k = 0; k < cls; k++) {
+            int b = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[k]);
+
+            mag |= b << k;
+            s->counts.mv_comp[idx].bits[k][b]++;
+        }
+        frac = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
+        s->counts.mv_comp[idx].fp[frac]++;
+        if (hp)
+            hbit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
+        // bug in libvpx - we count for bw entropy purposes even if the
+        // bit wasn't coded (hbit is still 1 then)
+        s->counts.mv_comp[idx].hp[hbit]++;
+        // the bits read above sit over the low 3 (fp, hp) bits; 8 << cls is added on top
+        mag = ((mag << 3) | (frac << 1) | hbit) + (8 << cls);
+    }
+
+    return neg ? -(mag + 1) : mag + 1;
+}
+
+/**
+ * Predict one motion vector of the current block and, for NEWMV, add the
+ * delta coded in the bitstream.
+ *
+ * @param s    decoder context; s->b is the block being decoded
+ * @param mv   vector to fill in
+ * @param z    which reference of the block this vector belongs to:
+ *             0 for b->ref[0], 1 for b->ref[1]
+ * @param mode inter prediction mode (NEARMV passes idx = 1 to
+ *             find_ref_mvs(), every other mode idx = 0)
+ * @param sb   sub-block index forwarded to find_ref_mvs(), or -1; NEWMV
+ *             always searches with -1
+ */
+static av_always_inline void fill_mv_for_ref(VP9Context *s, VP56mv *mv, int z,
+                                             int mode, int sb)
+{
+    VP9Block *b = s->b;
+    int hp = 0;
+
+    // FIXME cache this value and reuse for other subblocks
+    find_ref_mvs(s, mv, b->ref[z], z, mode == NEARMV,
+                 mode == NEWMV ? -1 : sb);
+    // FIXME maybe move this code into find_ref_mvs()
+    if (mode == NEWMV || sb == -1) {
+        // high precision is only kept when both predicted components are below 64
+        hp = s->s.h.highprecisionmvs && abs(mv->x) < 64 && abs(mv->y) < 64;
+        if (!hp) {
+            // otherwise odd components are moved one step towards zero
+            if (mv->y & 1) {
+                if (mv->y < 0)
+                    mv->y++;
+                else
+                    mv->y--;
+            }
+            if (mv->x & 1) {
+                if (mv->x < 0)
+                    mv->x++;
+                else
+                    mv->x--;
+            }
+        }
+    }
+    if (mode == NEWMV) {
+        enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
+                                          s->prob.p.mv_joint);
+
+        s->counts.mv_joint[j]++;
+        if (j >= MV_JOINT_V)
+            mv->y += read_mv_component(s, 0, hp);
+        if (j & 1)
+            mv->x += read_mv_component(s, 1, hp);
+    }
+}
+
+/**
+ * Fill in the motion vector(s) of the current block (s->b) for the given mode.
+ *
+ * ZEROMV clears 64 bits at mv (presumably both mv[0] and mv[1]); any other
+ * mode fills mv[0] and, for compound blocks (b->comp), mv[1] as well.
+ */
+static void fill_mv(VP9Context *s,
+                    VP56mv *mv, int mode, int sb)
+{
+    if (mode == ZEROMV) {
+        AV_ZERO64(mv);
+        return;
+    }
+
+    fill_mv_for_ref(s, &mv[0], 0, mode, sb);
+    if (s->b->comp)
+        fill_mv_for_ref(s, &mv[1], 1, mode, sb);
+}
+
+// Fill a w x h byte block at ptr (rows stride bytes apart) with byte v; w in {1,2,4,8}, h >= 1.
+static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
+                                       ptrdiff_t stride, int v)
+{
+    const uint32_t v32 = (uint32_t)v * 0x01010101U; // unsigned: no int overflow for v >= 128
+
+    switch (w) {
+    case 1:
+        do {
+            *ptr = v;
+            ptr += stride;
+        } while (--h);
+        break;
+    case 2: {
+        int v16 = v * 0x0101;
+        do {
+            AV_WN16A(ptr, v16);
+            ptr += stride;
+        } while (--h);
+        break;
+    }
+    case 4:
+        do {
+            AV_WN32A(ptr, v32);
+            ptr += stride;
+        } while (--h);
+        break;
+    case 8: {
+#if HAVE_FAST_64BIT
+        uint64_t v64 = v * 0x0101010101010101ULL;
+        do {
+            AV_WN64A(ptr, v64);
+            ptr += stride;
+        } while (--h);
+#else
+        do {
+            AV_WN32A(ptr,     v32);
+            AV_WN32A(ptr + 4, v32);
+            ptr += stride;
+        } while (--h);
+#endif
+        break;
+    }
+    }
+}
+
+static void decode_mode(AVCodecContext *ctx) // parse the mode info of block s->b from range coder s->c; updates the above/left contexts, s->counts and the current frame's maps
+{
+    static const uint8_t left_ctx[N_BS_SIZES] = { // per block size; stored to s->left_partition_ctx by SET_CTXS below
+        0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
+    };
+    static const uint8_t above_ctx[N_BS_SIZES] = { // per block size; stored to s->above_partition_ctx by SET_CTXS below
+        0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
+    };
+    static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = { // upper bound on b->tx for each block size
+        TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
+        TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
+    };
+    VP9Context *s = ctx->priv_data;
+    VP9Block *b = s->b; // block being decoded; its fields are filled in below
+    int row = s->row, col = s->col, row7 = s->row7; // col indexes the above_* contexts, row7 the left_* contexts
+    enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
+    int bw4 = bwh_tab[1][b->bs][0], w4 = FFMIN(s->cols - col, bw4); // w4: block width clipped to s->cols - col
+    int bh4 = bwh_tab[1][b->bs][1], h4 = FFMIN(s->rows - row, bh4), y; // h4: block height clipped to s->rows - row
+    int have_a = row > 0, have_l = col > s->tile_col_start; // whether the above / left neighbour is used for context derivation
+    int vref, filter_id; // only assigned on the inter path (filter_id only with FILTER_SWITCHABLE); SET_CTXS reads them under the same conditions
+    // Segment id: 0 when segmentation is off, otherwise read explicitly or predicted from the reference segmentation map.
+    if (!s->s.h.segmentation.enabled) {
+        b->seg_id = 0;
+    } else if (s->s.h.keyframe || s->s.h.intraonly) { // intra frames: explicit id, and only if the map is being updated
+        b->seg_id = !s->s.h.segmentation.update_map ? 0 :
+                    vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->s.h.segmentation.prob);
+    } else if (!s->s.h.segmentation.update_map || // map not updated, or temporal prediction is on and the flag read below is set
+               (s->s.h.segmentation.temporal &&
+                vp56_rac_get_prob_branchy(&s->c,
+                    s->s.h.segmentation.pred_prob[s->above_segpred_ctx[col] +
+                                    s->left_segpred_ctx[row7]]))) {
+        if (!s->s.h.errorres && s->s.frames[REF_FRAME_SEGMAP].segmentation_map) { // predict: smallest id the reference map holds in the covered area
+            int pred = 8, x;
+            uint8_t *refsegmap = s->s.frames[REF_FRAME_SEGMAP].segmentation_map;
+
+            if (!s->s.frames[REF_FRAME_SEGMAP].uses_2pass)
+                ff_thread_await_progress(&s->s.frames[REF_FRAME_SEGMAP].tf, row >> 3, 0); // NOTE(review): presumably waits until the reference frame's decoder reached progress row >> 3 - confirm
+            for (y = 0; y < h4; y++) {
+                int idx_base = (y + row) * 8 * s->sb_cols + col;
+                for (x = 0; x < w4; x++)
+                    pred = FFMIN(pred, refsegmap[idx_base + x]);
+            }
+            av_assert1(pred < 8); // pred starts at 8, so at least one map entry below 8 must have been seen
+            b->seg_id = pred;
+        } else { // error-resilient mode or no reference map: fall back to segment 0
+            b->seg_id = 0;
+        }
+        // mark the covered positions; the sum of both contexts selects pred_prob for later blocks
+        memset(&s->above_segpred_ctx[col], 1, w4);
+        memset(&s->left_segpred_ctx[row7], 1, h4);
+    } else {
+        b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
+                                     s->s.h.segmentation.prob);
+        // explicitly coded id: clear the prediction contexts
+        memset(&s->above_segpred_ctx[col], 0, w4);
+        memset(&s->left_segpred_ctx[row7], 0, h4);
+    }
+    if (s->s.h.segmentation.enabled &&
+        (s->s.h.segmentation.update_map || s->s.h.keyframe || s->s.h.intraonly)) {
+        setctx_2d(&s->s.frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
+                  bw4, bh4, 8 * s->sb_cols, b->seg_id); // NOTE(review): presumably fills a bw4 x bh4 area of the current map with seg_id - confirm against setctx_2d
+    }
+    // Skip flag: forced by the segment's skip feature, otherwise coded with a context built from the neighbours' skip flags.
+    b->skip = s->s.h.segmentation.enabled &&
+        s->s.h.segmentation.feat[b->seg_id].skip_enabled;
+    if (!b->skip) {
+        int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
+        b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
+        s->counts.skip[c][b->skip]++;
+    }
+    // Intra/inter decision: always intra on keyframe/intra-only frames, else from the segment's ref feature or the bitstream.
+    if (s->s.h.keyframe || s->s.h.intraonly) {
+        b->intra = 1;
+    } else if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
+        b->intra = !s->s.h.segmentation.feat[b->seg_id].ref_val; // ref_val 0 selects intra; the inter path below uses ref_val - 1 as reference
+    } else {
+        int c, bit;
+
+        if (have_a && have_l) {
+            c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
+            c += (c == 2); // a sum of 2 is mapped to context 3
+        } else {
+            c = have_a ? 2 * s->above_intra_ctx[col] :
+                have_l ? 2 * s->left_intra_ctx[row7] : 0;
+        }
+        bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
+        s->counts.intra[c][bit]++;
+        b->intra = !bit; // the coded bit is the inverse of the intra flag
+    }
+    // Transform size: coded only for intra or non-skipped blocks in TX_SWITCHABLE mode, with max_tx selecting the code used.
+    if ((b->intra || !b->skip) && s->s.h.txfmmode == TX_SWITCHABLE) {
+        int c;
+        if (have_a) {
+            if (have_l) {
+                c = (s->above_skip_ctx[col] ? max_tx :
+                     s->above_txfm_ctx[col]) +
+                    (s->left_skip_ctx[row7] ? max_tx :
+                     s->left_txfm_ctx[row7]) > max_tx; // '+' binds tighter than '>': c = (above + left) > max_tx, i.e. 0 or 1
+            } else {
+                c = s->above_skip_ctx[col] ? 1 :
+                    (s->above_txfm_ctx[col] * 2 > max_tx);
+            }
+        } else if (have_l) {
+            c = s->left_skip_ctx[row7] ? 1 :
+                (s->left_txfm_ctx[row7] * 2 > max_tx);
+        } else {
+            c = 1;
+        }
+        switch (max_tx) {
+        case TX_32X32:
+            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]); // up to three bits; each set bit raises b->tx by one, starting from 0
+            if (b->tx) {
+                b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
+                if (b->tx == 2)
+                    b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
+            }
+            s->counts.tx32p[c][b->tx]++;
+            break;
+        case TX_16X16:
+            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
+            if (b->tx)
+                b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
+            s->counts.tx16p[c][b->tx]++;
+            break;
+        case TX_8X8:
+            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
+            s->counts.tx8p[c][b->tx]++;
+            break;
+        case TX_4X4:
+            b->tx = TX_4X4; // nothing to code when only one size is possible
+            break;
+        }
+    } else {
+        b->tx = FFMIN(max_tx, s->s.h.txfmmode); // nothing coded: frame-level mode clamped to the block's maximum
+    }
+    // Prediction modes. Keyframe/intra-only frames use the fixed vp9_default_kf_* probabilities indexed by neighbouring modes.
+    if (s->s.h.keyframe || s->s.h.intraonly) {
+        uint8_t *a = &s->above_mode_ctx[col * 2]; // this path indexes the mode contexts at twice col / row7 and uses entries [0] and [1]
+        uint8_t *l = &s->left_mode_ctx[(row7) << 1];
+
+        b->comp = 0;
+        if (b->bs > BS_8x8) { // sizes ordered after BS_8x8 (presumably the sub-8x8 ones): one mode per sub-block, contexts updated as we go
+            // FIXME the memory storage intermediates here aren't really
+            // necessary, they're just there to make the code slightly
+            // simpler for now
+            b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
+                                    vp9_default_kf_ymode_probs[a[0]][l[0]]);
+            if (b->bs != BS_8x4) { // mode[1] is coded separately unless the block is BS_8x4
+                b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
+                                 vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
+                l[0] = a[1] = b->mode[1];
+            } else {
+                l[0] = a[1] = b->mode[1] = b->mode[0];
+            }
+            if (b->bs != BS_4x8) { // modes 2 (and possibly 3) are coded unless the block is BS_4x8
+                b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
+                                        vp9_default_kf_ymode_probs[a[0]][l[1]]);
+                if (b->bs != BS_8x4) {
+                    b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
+                                  vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
+                    l[1] = a[1] = b->mode[3];
+                } else {
+                    l[1] = a[1] = b->mode[3] = b->mode[2];
+                }
+            } else {
+                b->mode[2] = b->mode[0];
+                l[1] = a[1] = b->mode[3] = b->mode[1];
+            }
+        } else { // single mode, replicated to all four mode slots and into the contexts
+            b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
+                                          vp9_default_kf_ymode_probs[*a][*l]);
+            b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
+            // FIXME this can probably be optimized
+            memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
+            memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
+        }
+        b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
+                                     vp9_default_kf_uvmode_probs[b->mode[3]]);
+    } else if (b->intra) { // intra block in an inter frame: adaptive probabilities from s->prob.p, symbol counts updated
+        b->comp = 0;
+        if (b->bs > BS_8x8) { // same sub-block layout as above, but every mode uses y_mode[0]
+            b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
+                                          s->prob.p.y_mode[0]);
+            s->counts.y_mode[0][b->mode[0]]++;
+            if (b->bs != BS_8x4) {
+                b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
+                                              s->prob.p.y_mode[0]);
+                s->counts.y_mode[0][b->mode[1]]++;
+            } else {
+                b->mode[1] = b->mode[0];
+            }
+            if (b->bs != BS_4x8) {
+                b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
+                                              s->prob.p.y_mode[0]);
+                s->counts.y_mode[0][b->mode[2]]++;
+                if (b->bs != BS_8x4) {
+                    b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
+                                                  s->prob.p.y_mode[0]);
+                    s->counts.y_mode[0][b->mode[3]]++;
+                } else {
+                    b->mode[3] = b->mode[2];
+                }
+            } else {
+                b->mode[2] = b->mode[0];
+                b->mode[3] = b->mode[1];
+            }
+        } else {
+            static const uint8_t size_group[10] = { // selects the y_mode probability set from the block size
+                3, 3, 3, 3, 2, 2, 2, 1, 1, 1
+            };
+            int sz = size_group[b->bs];
+
+            b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
+                                          s->prob.p.y_mode[sz]);
+            b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
+            s->counts.y_mode[sz][b->mode[3]]++;
+        }
+        b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
+                                     s->prob.p.uv_mode[b->mode[3]]);
+        s->counts.uv_mode[b->mode[3]][b->uvmode]++;
+    } else { // inter block
+        static const uint8_t inter_mode_ctx_lut[14][14] = { // mv_mode context, indexed [above mode ctx][left mode ctx]
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
+            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
+            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
+            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
+        };
+        // Reference frame(s): fixed by the segment's ref feature, or coded as a compound flag followed by reference bits.
+        if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
+            av_assert2(s->s.h.segmentation.feat[b->seg_id].ref_val != 0);
+            b->comp = 0;
+            b->ref[0] = s->s.h.segmentation.feat[b->seg_id].ref_val - 1;
+        } else {
+            // read comp_pred flag
+            if (s->s.h.comppredmode != PRED_SWITCHABLE) {
+                b->comp = s->s.h.comppredmode == PRED_COMPREF; // frame-level choice, nothing coded
+            } else {
+                int c;
+                // context for the compound flag, derived from the neighbours' comp/intra/ref contexts
+                // FIXME add intra as ref=0xff (or -1) to make these easier?
+                if (have_a) {
+                    if (have_l) {
+                        if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
+                            c = 4;
+                        } else if (s->above_comp_ctx[col]) {
+                            c = 2 + (s->left_intra_ctx[row7] ||
+                                     s->left_ref_ctx[row7] == s->s.h.fixcompref);
+                        } else if (s->left_comp_ctx[row7]) {
+                            c = 2 + (s->above_intra_ctx[col] ||
+                                     s->above_ref_ctx[col] == s->s.h.fixcompref);
+                        } else {
+                            c = (!s->above_intra_ctx[col] &&
+                                 s->above_ref_ctx[col] == s->s.h.fixcompref) ^
+                            (!s->left_intra_ctx[row7] &&
+                             s->left_ref_ctx[row & 7] == s->s.h.fixcompref); // NOTE(review): only use of row & 7 instead of row7 - presumably identical, confirm
+                        }
+                    } else {
+                        c = s->above_comp_ctx[col] ? 3 :
+                        (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->s.h.fixcompref);
+                    }
+                } else if (have_l) {
+                    c = s->left_comp_ctx[row7] ? 3 :
+                    (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->s.h.fixcompref);
+                } else {
+                    c = 1;
+                }
+                b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
+                s->counts.comp[c][b->comp]++;
+            }
+
+            // read actual references
+            // FIXME probably cache a few variables here to prevent repetitive
+            // memory accesses below
+            if (b->comp) /* two references */ {
+                int fix_idx = s->s.h.signbias[s->s.h.fixcompref], var_idx = !fix_idx, c, bit; // the fixed reference goes to slot signbias[fixcompref] of b->ref[], the coded one to the other slot
+
+                b->ref[fix_idx] = s->s.h.fixcompref;
+                // FIXME can this codeblob be replaced by some sort of LUT?
+                if (have_a) {
+                    if (have_l) {
+                        if (s->above_intra_ctx[col]) {
+                            if (s->left_intra_ctx[row7]) {
+                                c = 2;
+                            } else {
+                                c = 1 + 2 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
+                            }
+                        } else if (s->left_intra_ctx[row7]) {
+                            c = 1 + 2 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
+                        } else {
+                            int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
+
+                            if (refl == refa && refa == s->s.h.varcompref[1]) {
+                                c = 0;
+                            } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
+                                if ((refa == s->s.h.fixcompref && refl == s->s.h.varcompref[0]) ||
+                                    (refl == s->s.h.fixcompref && refa == s->s.h.varcompref[0])) {
+                                    c = 4;
+                                } else {
+                                    c = (refa == refl) ? 3 : 1;
+                                }
+                            } else if (!s->left_comp_ctx[row7]) {
+                                if (refa == s->s.h.varcompref[1] && refl != s->s.h.varcompref[1]) {
+                                    c = 1;
+                                } else {
+                                    c = (refl == s->s.h.varcompref[1] &&
+                                         refa != s->s.h.varcompref[1]) ? 2 : 4;
+                                }
+                            } else if (!s->above_comp_ctx[col]) {
+                                if (refl == s->s.h.varcompref[1] && refa != s->s.h.varcompref[1]) {
+                                    c = 1;
+                                } else {
+                                    c = (refa == s->s.h.varcompref[1] &&
+                                         refl != s->s.h.varcompref[1]) ? 2 : 4;
+                                }
+                            } else {
+                                c = (refl == refa) ? 4 : 2;
+                            }
+                        }
+                    } else {
+                        if (s->above_intra_ctx[col]) {
+                            c = 2;
+                        } else if (s->above_comp_ctx[col]) {
+                            c = 4 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
+                        } else {
+                            c = 3 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
+                        }
+                    }
+                } else if (have_l) {
+                    if (s->left_intra_ctx[row7]) {
+                        c = 2;
+                    } else if (s->left_comp_ctx[row7]) {
+                        c = 4 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
+                    } else {
+                        c = 3 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
+                    }
+                } else {
+                    c = 2;
+                }
+                bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
+                b->ref[var_idx] = s->s.h.varcompref[bit]; // the bit picks one of the two variable references
+                s->counts.comp_ref[c][bit]++;
+            } else /* single reference */ {
+                int bit, c;
+                // context for the first bit, which separates reference 0 from references 1 and 2
+                if (have_a && !s->above_intra_ctx[col]) {
+                    if (have_l && !s->left_intra_ctx[row7]) {
+                        if (s->left_comp_ctx[row7]) {
+                            if (s->above_comp_ctx[col]) {
+                                c = 1 + (!s->s.h.fixcompref || !s->left_ref_ctx[row7] ||
+                                         !s->above_ref_ctx[col]);
+                            } else {
+                                c = (3 * !s->above_ref_ctx[col]) +
+                                    (!s->s.h.fixcompref || !s->left_ref_ctx[row7]);
+                            }
+                        } else if (s->above_comp_ctx[col]) {
+                            c = (3 * !s->left_ref_ctx[row7]) +
+                                (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
+                        } else {
+                            c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
+                        }
+                    } else if (s->above_intra_ctx[col]) { // never taken: the enclosing branch already requires !above_intra_ctx[col]
+                        c = 2;
+                    } else if (s->above_comp_ctx[col]) {
+                        c = 1 + (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
+                    } else {
+                        c = 4 * (!s->above_ref_ctx[col]);
+                    }
+                } else if (have_l && !s->left_intra_ctx[row7]) {
+                    if (s->left_intra_ctx[row7]) { // never taken: the enclosing condition requires !left_intra_ctx[row7]
+                        c = 2;
+                    } else if (s->left_comp_ctx[row7]) {
+                        c = 1 + (!s->s.h.fixcompref || !s->left_ref_ctx[row7]);
+                    } else {
+                        c = 4 * (!s->left_ref_ctx[row7]);
+                    }
+                } else {
+                    c = 2;
+                }
+                bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
+                s->counts.single_ref[c][0][bit]++;
+                if (!bit) {
+                    b->ref[0] = 0;
+                } else { // a second bit picks between references 1 and 2; its context is derived next
+                    // FIXME can this codeblob be replaced by some sort of LUT?
+                    if (have_a) {
+                        if (have_l) {
+                            if (s->left_intra_ctx[row7]) {
+                                if (s->above_intra_ctx[col]) {
+                                    c = 2;
+                                } else if (s->above_comp_ctx[col]) {
+                                    c = 1 + 2 * (s->s.h.fixcompref == 1 ||
+                                                 s->above_ref_ctx[col] == 1);
+                                } else if (!s->above_ref_ctx[col]) {
+                                    c = 3;
+                                } else {
+                                    c = 4 * (s->above_ref_ctx[col] == 1);
+                                }
+                            } else if (s->above_intra_ctx[col]) {
+                                if (s->left_intra_ctx[row7]) { // never taken: left_intra_ctx[row7] was already handled by the first branch
+                                    c = 2;
+                                } else if (s->left_comp_ctx[row7]) {
+                                    c = 1 + 2 * (s->s.h.fixcompref == 1 ||
+                                                 s->left_ref_ctx[row7] == 1);
+                                } else if (!s->left_ref_ctx[row7]) {
+                                    c = 3;
+                                } else {
+                                    c = 4 * (s->left_ref_ctx[row7] == 1);
+                                }
+                            } else if (s->above_comp_ctx[col]) {
+                                if (s->left_comp_ctx[row7]) {
+                                    if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
+                                        c = 3 * (s->s.h.fixcompref == 1 ||
+                                                 s->left_ref_ctx[row7] == 1);
+                                    } else {
+                                        c = 2;
+                                    }
+                                } else if (!s->left_ref_ctx[row7]) {
+                                    c = 1 + 2 * (s->s.h.fixcompref == 1 ||
+                                                 s->above_ref_ctx[col] == 1);
+                                } else {
+                                    c = 3 * (s->left_ref_ctx[row7] == 1) +
+                                    (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
+                                }
+                            } else if (s->left_comp_ctx[row7]) {
+                                if (!s->above_ref_ctx[col]) {
+                                    c = 1 + 2 * (s->s.h.fixcompref == 1 ||
+                                                 s->left_ref_ctx[row7] == 1);
+                                } else {
+                                    c = 3 * (s->above_ref_ctx[col] == 1) +
+                                    (s->s.h.fixcompref == 1 || s->left_ref_ctx[row7] == 1);
+                                }
+                            } else if (!s->above_ref_ctx[col]) {
+                                if (!s->left_ref_ctx[row7]) {
+                                    c = 3;
+                                } else {
+                                    c = 4 * (s->left_ref_ctx[row7] == 1);
+                                }
+                            } else if (!s->left_ref_ctx[row7]) {
+                                c = 4 * (s->above_ref_ctx[col] == 1);
+                            } else {
+                                c = 2 * (s->left_ref_ctx[row7] == 1) +
+                                2 * (s->above_ref_ctx[col] == 1);
+                            }
+                        } else {
+                            if (s->above_intra_ctx[col] ||
+                                (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
+                                c = 2;
+                            } else if (s->above_comp_ctx[col]) {
+                                c = 3 * (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
+                            } else {
+                                c = 4 * (s->above_ref_ctx[col] == 1);
+                            }
+                        }
+                    } else if (have_l) {
+                        if (s->left_intra_ctx[row7] ||
+                            (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
+                            c = 2;
+                        } else if (s->left_comp_ctx[row7]) {
+                            c = 3 * (s->s.h.fixcompref == 1 || s->left_ref_ctx[row7] == 1);
+                        } else {
+                            c = 4 * (s->left_ref_ctx[row7] == 1);
+                        }
+                    } else {
+                        c = 2;
+                    }
+                    bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
+                    s->counts.single_ref[c][1][bit]++;
+                    b->ref[0] = 1 + bit;
+                }
+            }
+        }
+        // Sizes up to BS_8x8 in enum order carry a single inter mode, read before the filter; the remaining sizes read theirs after it.
+        if (b->bs <= BS_8x8) {
+            if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].skip_enabled) {
+                b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV; // segment skip feature forces ZEROMV, nothing coded
+            } else {
+                static const uint8_t off[10] = { // per block size offset added to col / row7 when fetching the neighbour mode contexts
+                    3, 0, 0, 1, 0, 0, 0, 0, 0, 0
+                };
+
+                // FIXME this needs to use the LUT tables from find_ref_mvs
+                // because not all are -1,0/0,-1
+                int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
+                                          [s->left_mode_ctx[row7 + off[b->bs]]];
+
+                b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
+                                              s->prob.p.mv_mode[c]);
+                b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
+                s->counts.mv_mode[c][b->mode[0] - 10]++; // NOTE(review): 10 is presumably the value of the first inter mode (NEARESTMV) - confirm
+            }
+        }
+        // Interpolation filter: coded only in FILTER_SWITCHABLE mode; context from neighbours whose mode context is >= NEARESTMV, else 3.
+        if (s->s.h.filtermode == FILTER_SWITCHABLE) {
+            int c;
+
+            if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
+                if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
+                    c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
+                        s->left_filter_ctx[row7] : 3; // both qualify: shared value if they agree, otherwise 3
+                } else {
+                    c = s->above_filter_ctx[col];
+                }
+            } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
+                c = s->left_filter_ctx[row7];
+            } else {
+                c = 3;
+            }
+
+            filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
+                                         s->prob.p.filter[c]);
+            s->counts.filter[c][filter_id]++;
+            b->filter = vp9_filter_lut[filter_id]; // the raw filter_id (not b->filter) is what SET_CTXS stores as context
+        } else {
+            b->filter = s->s.h.filtermode;
+        }
+        // Motion vectors: one mode + mv per sub-block for the sizes after BS_8x8, else a single mv replicated to all four slots.
+        if (b->bs > BS_8x8) {
+            int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
+
+            b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
+                                          s->prob.p.mv_mode[c]);
+            s->counts.mv_mode[c][b->mode[0] - 10]++;
+            fill_mv(s, b->mv[0], b->mode[0], 0); // last argument is the sub-block index here; -1 is passed on the whole-block path below
+
+            if (b->bs != BS_8x4) {
+                b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
+                                              s->prob.p.mv_mode[c]);
+                s->counts.mv_mode[c][b->mode[1] - 10]++;
+                fill_mv(s, b->mv[1], b->mode[1], 1);
+            } else { // BS_8x4: sub-block 1 duplicates sub-block 0 (mode and both mvs)
+                b->mode[1] = b->mode[0];
+                AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
+                AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
+            }
+
+            if (b->bs != BS_4x8) {
+                b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
+                                              s->prob.p.mv_mode[c]);
+                s->counts.mv_mode[c][b->mode[2] - 10]++;
+                fill_mv(s, b->mv[2], b->mode[2], 2);
+
+                if (b->bs != BS_8x4) {
+                    b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
+                                                  s->prob.p.mv_mode[c]);
+                    s->counts.mv_mode[c][b->mode[3] - 10]++;
+                    fill_mv(s, b->mv[3], b->mode[3], 3);
+                } else {
+                    b->mode[3] = b->mode[2];
+                    AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
+                    AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
+                }
+            } else { // BS_4x8: sub-blocks 2 and 3 duplicate sub-blocks 0 and 1
+                b->mode[2] = b->mode[0];
+                AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
+                AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
+                b->mode[3] = b->mode[1];
+                AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
+                AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
+            }
+        } else {
+            fill_mv(s, b->mv[0], b->mode[0], -1);
+            AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
+            AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
+            AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
+            AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
+            AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
+            AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
+        }
+        // reference stored in the above/left ref contexts: for compound blocks the one in slot signbias[varcompref[0]], else ref[0]
+        vref = b->ref[b->comp ? s->s.h.signbias[s->s.h.varcompref[0]] : 0];
+    }
+    // SPLAT_CTX(var, val, n): store val into each of the n consecutive bytes starting at var (val replicated per byte, written via AV_WN*A).
+#if HAVE_FAST_64BIT
+#define SPLAT_CTX(var, val, n) \
+    switch (n) { \
+    case 1:  var = val;                                    break; \
+    case 2:  AV_WN16A(&var, val *             0x0101);     break; \
+    case 4:  AV_WN32A(&var, val *         0x01010101);     break; \
+    case 8:  AV_WN64A(&var, val * 0x0101010101010101ULL);  break; \
+    case 16: { \
+        uint64_t v64 = val * 0x0101010101010101ULL; \
+        AV_WN64A(              &var,     v64); \
+        AV_WN64A(&((uint8_t *) &var)[8], v64); \
+        break; \
+    } \
+    }
+#else
+#define SPLAT_CTX(var, val, n) \
+    switch (n) { \
+    case 1:  var = val;                         break; \
+    case 2:  AV_WN16A(&var, val *     0x0101);  break; \
+    case 4:  AV_WN32A(&var, val * 0x01010101);  break; \
+    case 8: { \
+        uint32_t v32 = val * 0x01010101; \
+        AV_WN32A(              &var,     v32); \
+        AV_WN32A(&((uint8_t *) &var)[4], v32); \
+        break; \
+    } \
+    case 16: { \
+        uint32_t v32 = val * 0x01010101; \
+        AV_WN32A(              &var,      v32); \
+        AV_WN32A(&((uint8_t *) &var)[4],  v32); \
+        AV_WN32A(&((uint8_t *) &var)[8],  v32); \
+        AV_WN32A(&((uint8_t *) &var)[12], v32); \
+        break; \
+    } \
+    }
+#endif
+    // SET_CTXS(dir, off, n): publish this block's skip/tx/partition values (on inter frames also intra/comp/mode/ref/filter) to one context direction.
+    switch (bwh_tab[1][b->bs][0]) { // above contexts, over the unclipped block width; n is passed as a literal to every expansion
+#define SET_CTXS(dir, off, n) \
+    do { \
+        SPLAT_CTX(s->dir##_skip_ctx[off],      b->skip,          n); \
+        SPLAT_CTX(s->dir##_txfm_ctx[off],      b->tx,            n); \
+        SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
+        if (!s->s.h.keyframe && !s->s.h.intraonly) { \
+            SPLAT_CTX(s->dir##_intra_ctx[off], b->intra,   n); \
+            SPLAT_CTX(s->dir##_comp_ctx[off],  b->comp,    n); \
+            SPLAT_CTX(s->dir##_mode_ctx[off],  b->mode[3], n); \
+            if (!b->intra) { \
+                SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
+                if (s->s.h.filtermode == FILTER_SWITCHABLE) { \
+                    SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
+                } \
+            } \
+        } \
+    } while (0)
+    case 1: SET_CTXS(above, col, 1); break;
+    case 2: SET_CTXS(above, col, 2); break;
+    case 4: SET_CTXS(above, col, 4); break;
+    case 8: SET_CTXS(above, col, 8); break;
+    }
+    switch (bwh_tab[1][b->bs][1]) { // left contexts, over the unclipped block height
+    case 1: SET_CTXS(left, row7, 1); break;
+    case 2: SET_CTXS(left, row7, 2); break;
+    case 4: SET_CTXS(left, row7, 4); break;
+    case 8: SET_CTXS(left, row7, 8); break;
+    }
+#undef SPLAT_CTX
+#undef SET_CTXS
+    // Motion vector contexts for later blocks (inter frames only).
+    if (!s->s.h.keyframe && !s->s.h.intraonly) { // NOTE(review): also runs for intra blocks of inter frames, for which this function does not write b->mv
+        if (b->bs > BS_8x8) { // left context gets sub-block mvs 1 and 3, above context gets 2 and 3
+            int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
+
+            AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
+            AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
+            AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
+            AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
+            AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
+            AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
+            AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
+            AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
+        } else { // one mv pair for the whole block: replicate b->mv[3] over twice the clipped width / height
+            int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
+
+            for (n = 0; n < w4 * 2; n++) {
+                AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
+                AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
+            }
+            for (n = 0; n < h4 * 2; n++) {
+                AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
+                AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
+            }
+        }
+    }
+    // Record references and motion vectors in the current frame's mv array for every position the clipped block covers.
+    // FIXME kinda ugly
+    for (y = 0; y < h4; y++) {
+        int x, o = (row + y) * s->sb_cols * 8 + col; // one array row is 8 * sb_cols entries long
+        struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[o];
+
+        if (b->intra) { // intra: both refs are set to -1, the mv fields are left untouched
+            for (x = 0; x < w4; x++) {
+                mv[x].ref[0] =
+                mv[x].ref[1] = -1;
+            }
+        } else if (b->comp) { // compound: both refs and both mvs of the last sub-block (b->mv[3])
+            for (x = 0; x < w4; x++) {
+                mv[x].ref[0] = b->ref[0];
+                mv[x].ref[1] = b->ref[1];
+                AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
+                AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
+            }
+        } else { // single reference: second ref is set to -1 and only mv[0] is written
+            for (x = 0; x < w4; x++) {
+                mv[x].ref[0] = b->ref[0];
+                mv[x].ref[1] = -1;
+                AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
+            }
+        }
+    }
+}
+
+// FIXME merge cnt/eob arguments?
+static av_always_inline int
+decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
+                        int is_tx32x32, int is8bitsperpixel, int bpp, unsigned (*cnt)[6][3],
+                        unsigned (*eob)[6][2], uint8_t (*p)[6][11],
+                        int nnz, const int16_t *scan, const int16_t (*nb)[2],
+                        const int16_t *band_counts, const int16_t *qmul)
+{ /* decode one block's tokens into coef[], updating cnt/eob; returns the scan index where decoding stopped */
+    int i = 0, band = 0, band_left = band_counts[band];
+    uint8_t *tp = p[0][nnz];
+    uint8_t cache[1024]; /* token class (0..5) stored at position scan[i]; read back through nb[] to form nnz */
+
+    do {
+        int val, rc;
+
+        val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
+        eob[band][nnz][val]++;
+        if (!val)
+            break;
+
+    skip_eob: /* re-entered after a zero token: the EOB flag is not read again */
+        if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
+            cnt[band][nnz][0]++;
+            if (!--band_left)
+                band_left = band_counts[++band];
+            cache[scan[i]] = 0;
+            nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
+            tp = p[band][nnz];
+            if (++i == n_coeffs)
+                break; //invalid input; blocks should end with EOB
+            goto skip_eob;
+        }
+
+        rc = scan[i];
+        if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
+            cnt[band][nnz][1]++;
+            val = 1;
+            cache[rc] = 1;
+        } else {
+            // fill in p[3-10] (model fill) - only once per frame for each pos
+            if (!tp[3])
+                memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
+
+            cnt[band][nnz][2]++;
+            if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
+                if (!vp56_rac_get_prob_branchy(c, tp[4])) {
+                    cache[rc] = val = 2;
+                } else {
+                    val = 3 + vp56_rac_get_prob(c, tp[5]);
+                    cache[rc] = 3;
+                }
+            } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
+                cache[rc] = 4;
+                if (!vp56_rac_get_prob_branchy(c, tp[7])) {
+                    val = 5 + vp56_rac_get_prob(c, 159);
+                } else {
+                    val  = 7 + (vp56_rac_get_prob(c, 165) << 1);
+                    val +=      vp56_rac_get_prob(c, 145);
+                }
+            } else { // cat 3-6
+                cache[rc] = 5;
+                if (!vp56_rac_get_prob_branchy(c, tp[8])) {
+                    if (!vp56_rac_get_prob_branchy(c, tp[9])) {
+                        val  = 11 + (vp56_rac_get_prob(c, 173) << 2);
+                        val +=      (vp56_rac_get_prob(c, 148) << 1);
+                        val +=       vp56_rac_get_prob(c, 140);
+                    } else {
+                        val  = 19 + (vp56_rac_get_prob(c, 176) << 3);
+                        val +=      (vp56_rac_get_prob(c, 155) << 2);
+                        val +=      (vp56_rac_get_prob(c, 140) << 1);
+                        val +=       vp56_rac_get_prob(c, 135);
+                    }
+                } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
+                    val  = 35 + (vp56_rac_get_prob(c, 180) << 4);
+                    val +=      (vp56_rac_get_prob(c, 157) << 3);
+                    val +=      (vp56_rac_get_prob(c, 141) << 2);
+                    val +=      (vp56_rac_get_prob(c, 134) << 1);
+                    val +=       vp56_rac_get_prob(c, 130);
+                } else {
+                    val = 67; /* largest category: 14 extra bits, plus 2 (or 4 when bpp == 12) if not 8 bpp */
+                    if (!is8bitsperpixel) {
+                        if (bpp == 12) {
+                            val += vp56_rac_get_prob(c, 255) << 17;
+                            val += vp56_rac_get_prob(c, 255) << 16;
+                        }
+                        val +=  (vp56_rac_get_prob(c, 255) << 15);
+                        val +=  (vp56_rac_get_prob(c, 255) << 14);
+                    }
+                    val +=      (vp56_rac_get_prob(c, 254) << 13);
+                    val +=      (vp56_rac_get_prob(c, 254) << 12);
+                    val +=      (vp56_rac_get_prob(c, 254) << 11);
+                    val +=      (vp56_rac_get_prob(c, 252) << 10);
+                    val +=      (vp56_rac_get_prob(c, 249) << 9);
+                    val +=      (vp56_rac_get_prob(c, 243) << 8);
+                    val +=      (vp56_rac_get_prob(c, 230) << 7);
+                    val +=      (vp56_rac_get_prob(c, 196) << 6);
+                    val +=      (vp56_rac_get_prob(c, 177) << 5);
+                    val +=      (vp56_rac_get_prob(c, 153) << 4);
+                    val +=      (vp56_rac_get_prob(c, 140) << 3);
+                    val +=      (vp56_rac_get_prob(c, 133) << 2);
+                    val +=      (vp56_rac_get_prob(c, 130) << 1);
+                    val +=       vp56_rac_get_prob(c, 129);
+                }
+            }
+        }
+#define STORE_COEF(c, i, v) do { \
+    if (is8bitsperpixel) { \
+        c[i] = v; \
+    } else { \
+        AV_WN32A(&c[i * 2], v); \
+    } \
+} while (0)
+        if (!--band_left) /* current band used up: move on to the next band's probabilities */
+            band_left = band_counts[++band];
+        if (is_tx32x32) /* multiply as unsigned: val (up to ~2^18) * qmul (int16_t) can exceed INT_MAX, which is UB for int */
+            STORE_COEF(coef, rc, (int)((vp8_rac_get(c) ? -val : val) * (unsigned)qmul[!!i]) / 2);
+        else
+            STORE_COEF(coef, rc, (int)((vp8_rac_get(c) ? -val : val) * (unsigned)qmul[!!i]));
+        nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1; /* next context: rounded mean of the two neighbours' classes */
+        tp = p[band][nnz];
+    } while (++i < n_coeffs);
+
+    return i;
+}
+
+static int decode_coeffs_b_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
+                                unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
+                                uint8_t (*p)[6][11], int nnz, const int16_t *scan,
+                                const int16_t (*nb)[2], const int16_t *band_counts,
+                                const int16_t *qmul)
+{ /* instance of the generic decoder with is_tx32x32 = 0, is8bitsperpixel = 1, bpp = 8 */
+    return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 1, 8, cnt, eob, p,
+                                   nnz, scan, nb, band_counts, qmul);
+}
+
+static int decode_coeffs_b32_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
+                                  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
+                                  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
+                                  const int16_t (*nb)[2], const int16_t *band_counts,
+                                  const int16_t *qmul)
+{ /* instance with is_tx32x32 = 1 (dequantized value is halved), is8bitsperpixel = 1, bpp = 8 */
+    return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 1, 8, cnt, eob, p,
+                                   nnz, scan, nb, band_counts, qmul);
+}
+
+static int decode_coeffs_b_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
+                                 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
+                                 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
+                                 const int16_t (*nb)[2], const int16_t *band_counts,
+                                 const int16_t *qmul)
+{ /* instance with is_tx32x32 = 0, is8bitsperpixel = 0 (coefficients stored as 32 bit), bpp = s->bpp */
+    return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 0, s->bpp, cnt, eob, p,
+                                   nnz, scan, nb, band_counts, qmul);
+}
+
+static int decode_coeffs_b32_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
+                                   unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
+                                   uint8_t (*p)[6][11], int nnz, const int16_t *scan,
+                                   const int16_t (*nb)[2], const int16_t *band_counts,
+                                   const int16_t *qmul)
+{ /* instance with is_tx32x32 = 1 (dequantized value is halved), is8bitsperpixel = 0, bpp = s->bpp */
+    return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 0, s->bpp, cnt, eob, p,
+                                   nnz, scan, nb, band_counts, qmul);
+}
+
+static av_always_inline int decode_coeffs(AVCodecContext *ctx, int is8bitsperpixel)
+{ /* decode luma, then both chroma planes, of the current block (s->b); returns 1 if any transform block had coefficients, else 0 */
+    VP9Context *s = ctx->priv_data;
+    VP9Block *b = s->b;
+    int row = s->row, col = s->col;
+    uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
+    unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
+    unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
+    int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
+    int end_x = FFMIN(2 * (s->cols - col), w4); /* clip the block extent to what remains before s->cols */
+    int end_y = FFMIN(2 * (s->rows - row), h4); /* likewise against s->rows */
+    int n, pl, x, y, res;
+    int16_t (*qmul)[2] = s->s.h.segmentation.feat[b->seg_id].qmul; /* qmul[0] is used for luma, qmul[1] for chroma below */
+    int tx = 4 * s->s.h.lossless + b->tx; /* lossless selects a second set of scan tables, offset by 4 */
+    const int16_t * const *yscans = vp9_scans[tx];
+    const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
+    const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
+    const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
+    uint8_t *a = &s->above_y_nnz_ctx[col * 2];
+    uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
+    static const int16_t band_counts[4][8] = { /* one row per transform size: coefficients per band; 6th entry = total minus the first five */
+        { 1, 2, 3, 4,  3,   16 - 13 },
+        { 1, 2, 3, 4, 11,   64 - 21 },
+        { 1, 2, 3, 4, 11,  256 - 21 },
+        { 1, 2, 3, 4, 11, 1024 - 21 },
+    };
+    const int16_t *y_band_counts = band_counts[b->tx];
+    const int16_t *uv_band_counts = band_counts[b->uvtx];
+    int bytesperpixel = is8bitsperpixel ? 1 : 2;
+    int total_coeff = 0; /* OR of "block had coefficients" over every transform block decoded */
+    /* MERGE: collapse each group of `step` context bytes into one 0/1 flag stored in the group's first byte */
+#define MERGE(la, end, step, rd) \
+    for (n = 0; n < end; n += step) \
+        la[n] = !!rd(&la[n])
+#define MERGE_CTX(step, rd) \
+    do { \
+        MERGE(l, end_y, step, rd); \
+        MERGE(a, end_x, step, rd); \
+    } while (0)
+    /* DECODE_Y_COEF_LOOP: decode every luma transform block, storing its end position in s->eob[] and a 0/1 flag in a[x]/l[y] */
+#define DECODE_Y_COEF_LOOP(step, mode_index, v) \
+    for (n = 0, y = 0; y < end_y; y += step) { \
+        for (x = 0; x < end_x; x += step, n += step * step) { \
+            enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
+            res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
+                                    (s, s->block + 16 * n * bytesperpixel, 16 * step * step, \
+                                     c, e, p, a[x] + l[y], yscans[txtp], \
+                                     ynbs[txtp], y_band_counts, qmul[0]); \
+            a[x] = l[y] = !!res; \
+            total_coeff |= !!res; \
+            if (step >= 4) { \
+                AV_WN16A(&s->eob[n], res); \
+            } else { \
+                s->eob[n] = res; \
+            } \
+        } \
+    }
+    /* SPLAT: copy each group's flag back over all `step` bytes; `cond` allows full-width stores, else memset is bounded by `end` */
+#define SPLAT(la, end, step, cond) \
+    if (step == 2) { \
+        for (n = 1; n < end; n += step) \
+            la[n] = la[n - 1]; \
+    } else if (step == 4) { \
+        if (cond) { \
+            for (n = 0; n < end; n += step) \
+                AV_WN32A(&la[n], la[n] * 0x01010101); \
+        } else { \
+            for (n = 0; n < end; n += step) \
+                memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
+        } \
+    } else /* step == 8 */ { \
+        if (cond) { \
+            if (HAVE_FAST_64BIT) { \
+                for (n = 0; n < end; n += step) \
+                    AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
+            } else { \
+                for (n = 0; n < end; n += step) { \
+                    uint32_t v32 = la[n] * 0x01010101; \
+                    AV_WN32A(&la[n],     v32); \
+                    AV_WN32A(&la[n + 4], v32); \
+                } \
+            } \
+        } else { \
+            for (n = 0; n < end; n += step) \
+                memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
+        } \
+    }
+#define SPLAT_CTX(step) \
+    do { \
+        SPLAT(a, end_x, step, end_x == w4); \
+        SPLAT(l, end_y, step, end_y == h4); \
+    } while (0)
+
+    /* y tokens */
+    switch (b->tx) {
+    case TX_4X4:
+        DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,); /* per-transform mode index n only when b->bs > BS_8x8 */
+        break;
+    case TX_8X8:
+        MERGE_CTX(2, AV_RN16A);
+        DECODE_Y_COEF_LOOP(2, 0,);
+        SPLAT_CTX(2);
+        break;
+    case TX_16X16:
+        MERGE_CTX(4, AV_RN32A);
+        DECODE_Y_COEF_LOOP(4, 0,);
+        SPLAT_CTX(4);
+        break;
+    case TX_32X32:
+        MERGE_CTX(8, AV_RN64A);
+        DECODE_Y_COEF_LOOP(8, 0, 32); /* v = 32 selects the decode_coeffs_b32_* variants */
+        SPLAT_CTX(8);
+        break;
+    }
+
+#define DECODE_UV_COEF_LOOP(step, v) \
+    for (n = 0, y = 0; y < end_y; y += step) { \
+        for (x = 0; x < end_x; x += step, n += step * step) { \
+            res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
+                                    (s, s->uvblock[pl] + 16 * n * bytesperpixel, \
+                                     16 * step * step, c, e, p, a[x] + l[y], \
+                                     uvscan, uvnb, uv_band_counts, qmul[1]); \
+            a[x] = l[y] = !!res; \
+            total_coeff |= !!res; \
+            if (step >= 4) { \
+                AV_WN16A(&s->uveob[pl][n], res); \
+            } else { \
+                s->uveob[pl][n] = res; \
+            } \
+        } \
+    }
+
+    p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra]; /* switch probabilities and counters to the chroma tables */
+    c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
+    e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
+    w4 >>= s->ss_h; /* scale the extents down by the ss_h / ss_v shifts for the chroma planes */
+    end_x >>= s->ss_h;
+    h4 >>= s->ss_v;
+    end_y >>= s->ss_v;
+    for (pl = 0; pl < 2; pl++) { /* same procedure for each of the two chroma planes */
+        a = &s->above_uv_nnz_ctx[pl][col << !s->ss_h];
+        l = &s->left_uv_nnz_ctx[pl][(row & 7) << !s->ss_v];
+        switch (b->uvtx) {
+        case TX_4X4:
+            DECODE_UV_COEF_LOOP(1,);
+            break;
+        case TX_8X8:
+            MERGE_CTX(2, AV_RN16A);
+            DECODE_UV_COEF_LOOP(2,);
+            SPLAT_CTX(2);
+            break;
+        case TX_16X16:
+            MERGE_CTX(4, AV_RN32A);
+            DECODE_UV_COEF_LOOP(4,);
+            SPLAT_CTX(4);
+            break;
+        case TX_32X32:
+            MERGE_CTX(8, AV_RN64A);
+            DECODE_UV_COEF_LOOP(8, 32);
+            SPLAT_CTX(8);
+            break;
+        }
+    }
+
+    return total_coeff;
+}
+
+static int decode_coeffs_8bpp(AVCodecContext *ctx)
+{ /* non-inline entry point: decode_coeffs() with is8bitsperpixel = 1 */
+    return decode_coeffs(ctx, 1);
+}
+
+static int decode_coeffs_16bpp(AVCodecContext *ctx)
+{ /* non-inline entry point: decode_coeffs() with is8bitsperpixel = 0 */
+    return decode_coeffs(ctx, 0);
+}
+
+static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
+                                             uint8_t *dst_edge, ptrdiff_t stride_edge,
+                                             uint8_t *dst_inner, ptrdiff_t stride_inner,
+                                             uint8_t *l, int col, int x, int w,
+                                             int row, int y, enum TxfmMode tx,
+                                             int p, int ss_h, int ss_v, int bytesperpixel)
+{ /* convert `mode` to one usable with the available edges and prepare the top (*a) and left (l) edge pixels; returns the converted mode */
+    int have_top = row > 0 || y > 0;
+    int have_left = col > s->tile_col_start || x > 0; /* nothing to the left at the first column of the tile */
+    int have_right = x < w - 1;
+    int bpp = s->bpp;
+    static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
+        [VERT_PRED]            = { { DC_127_PRED,          VERT_PRED },
+                                   { DC_127_PRED,          VERT_PRED } },
+        [HOR_PRED]             = { { DC_129_PRED,          DC_129_PRED },
+                                   { HOR_PRED,             HOR_PRED } },
+        [DC_PRED]              = { { DC_128_PRED,          TOP_DC_PRED },
+                                   { LEFT_DC_PRED,         DC_PRED } },
+        [DIAG_DOWN_LEFT_PRED]  = { { DC_127_PRED,          DIAG_DOWN_LEFT_PRED },
+                                   { DC_127_PRED,          DIAG_DOWN_LEFT_PRED } },
+        [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
+                                   { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
+        [VERT_RIGHT_PRED]      = { { VERT_RIGHT_PRED,      VERT_RIGHT_PRED },
+                                   { VERT_RIGHT_PRED,      VERT_RIGHT_PRED } },
+        [HOR_DOWN_PRED]        = { { HOR_DOWN_PRED,        HOR_DOWN_PRED },
+                                   { HOR_DOWN_PRED,        HOR_DOWN_PRED } },
+        [VERT_LEFT_PRED]       = { { DC_127_PRED,          VERT_LEFT_PRED },
+                                   { DC_127_PRED,          VERT_LEFT_PRED } },
+        [HOR_UP_PRED]          = { { DC_129_PRED,          DC_129_PRED },
+                                   { HOR_UP_PRED,          HOR_UP_PRED } },
+        [TM_VP8_PRED]          = { { DC_129_PRED,          VERT_PRED },
+                                   { HOR_PRED,             TM_VP8_PRED } },
+    };
+    static const struct { /* which edge pixels each (converted) mode reads */
+        uint8_t needs_left:1;
+        uint8_t needs_top:1;
+        uint8_t needs_topleft:1;
+        uint8_t needs_topright:1;
+        uint8_t invert_left:1; /* left column is stored top-to-bottom instead of bottom-to-top (see the needs_left code below) */
+    } edges[N_INTRA_PRED_MODES] = {
+        [VERT_PRED]            = { .needs_top  = 1 },
+        [HOR_PRED]             = { .needs_left = 1 },
+        [DC_PRED]              = { .needs_top  = 1, .needs_left = 1 },
+        [DIAG_DOWN_LEFT_PRED]  = { .needs_top  = 1, .needs_topright = 1 },
+        [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+        [VERT_RIGHT_PRED]      = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+        [HOR_DOWN_PRED]        = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+        [VERT_LEFT_PRED]       = { .needs_top  = 1, .needs_topright = 1 },
+        [HOR_UP_PRED]          = { .needs_left = 1, .invert_left = 1 },
+        [TM_VP8_PRED]          = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+        [LEFT_DC_PRED]         = { .needs_left = 1 },
+        [TOP_DC_PRED]          = { .needs_top  = 1 },
+        [DC_128_PRED]          = { 0 },
+        [DC_127_PRED]          = { 0 },
+        [DC_129_PRED]          = { 0 }
+    };
+
+    av_assert2(mode >= 0 && mode < 10);
+    mode = mode_conv[mode][have_left][have_top]; /* may substitute a DC or single-edge mode when an edge is missing */
+    if (edges[mode].needs_top) {
+        uint8_t *top, *topleft; /* only assigned when have_top (and have_left for topleft); every use below is guarded accordingly */
+        int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !ss_h) - x) * 4;
+        int n_px_need_tr = 0;
+
+        if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
+            n_px_need_tr = 4;
+
+        // if top of sb64-row, use s->intra_pred_data[] instead of
+        // dst[-stride] for intra prediction (it contains pre- instead of
+        // post-loopfilter data)
+        if (have_top) {
+            top = !(row & 7) && !y ?
+                s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
+                y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
+            if (have_left)
+                topleft = !(row & 7) && !y ?
+                    s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
+                    y == 0 || x == 0 ? &dst_edge[-stride_edge] :
+                    &dst_inner[-stride_inner];
+        }
+
+        if (have_top &&
+            (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
+            (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
+            n_px_need + n_px_need_tr <= n_px_have) {
+            *a = top; /* everything needed is readable in place: point at it, no copy */
+        } else {
+            if (have_top) {
+                if (n_px_need <= n_px_have) {
+                    memcpy(*a, top, n_px_need * bytesperpixel);
+                } else {
+#define memset_bpp(c, i1, v, i2, num) do { \
+    if (bytesperpixel == 1) { \
+        memset(&(c)[(i1)], (v)[(i2)], (num)); \
+    } else { \
+        int n, val = AV_RN16A(&(v)[(i2) * 2]); \
+        for (n = 0; n < (num); n++) { \
+            AV_WN16A(&(c)[((i1) + n) * 2], val); \
+        } \
+    } \
+} while (0)
+                    memcpy(*a, top, n_px_have * bytesperpixel);
+                    memset_bpp(*a, n_px_have, (*a), n_px_have - 1, n_px_need - n_px_have); /* pad with the last available pixel */
+                }
+            } else {
+#define memset_val(c, val, num) do { \
+    if (bytesperpixel == 1) { \
+        memset((c), (val), (num)); \
+    } else { \
+        int n; \
+        for (n = 0; n < (num); n++) { \
+            AV_WN16A(&(c)[n * 2], (val)); \
+        } \
+    } \
+} while (0)
+                memset_val(*a, (128 << (bpp - 8)) - 1, n_px_need); /* no top row: constant, one below half of the bpp-bit range */
+            }
+            if (edges[mode].needs_topleft) {
+                if (have_left && have_top) {
+#define assign_bpp(c, i1, v, i2) do { \
+    if (bytesperpixel == 1) { \
+        (c)[(i1)] = (v)[(i2)]; \
+    } else { \
+        AV_COPY16(&(c)[(i1) * 2], &(v)[(i2) * 2]); \
+    } \
+} while (0)
+                    assign_bpp(*a, -1, topleft, -1); /* top-left pixel is written just before the top row */
+                } else {
+#define assign_val(c, i, v) do { \
+    if (bytesperpixel == 1) { \
+        (c)[(i)] = (v); \
+    } else { \
+        AV_WN16A(&(c)[(i) * 2], (v)); \
+    } \
+} while (0)
+                    assign_val((*a), -1, (128 << (bpp - 8)) + (have_top ? +1 : -1));
+                }
+            }
+            if (tx == TX_4X4 && edges[mode].needs_topright) {
+                if (have_top && have_right &&
+                    n_px_need + n_px_need_tr <= n_px_have) {
+                    memcpy(&(*a)[4 * bytesperpixel], &top[4 * bytesperpixel], 4 * bytesperpixel);
+                } else {
+                    memset_bpp(*a, 4, *a, 3, 4); /* top-right unavailable: repeat pixel 3 into positions 4..7 */
+                }
+            }
+        }
+    }
+    if (edges[mode].needs_left) {
+        if (have_left) {
+            int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !ss_v) - y) * 4;
+            uint8_t *dst = x == 0 ? dst_edge : dst_inner; /* at x == 0 the left neighbour is read through the dst_edge pointer */
+            ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
+
+            if (edges[mode].invert_left) {
+                if (n_px_need <= n_px_have) {
+                    for (i = 0; i < n_px_need; i++)
+                        assign_bpp(l, i, &dst[i * stride], -1); /* l[i] = pixel left of row i */
+                } else {
+                    for (i = 0; i < n_px_have; i++)
+                        assign_bpp(l, i, &dst[i * stride], -1);
+                    memset_bpp(l, n_px_have, l, n_px_have - 1, n_px_need - n_px_have); /* pad with the last available row's pixel */
+                }
+            } else {
+                if (n_px_need <= n_px_have) {
+                    for (i = 0; i < n_px_need; i++)
+                        assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1); /* reversed: row 0 goes to the end of l[] */
+                } else {
+                    for (i = 0; i < n_px_have; i++)
+                        assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
+                    memset_bpp(l, 0, l, n_px_need - n_px_have, n_px_need - n_px_have); /* pad the front of l[] with the last available row's pixel */
+                }
+            }
+        } else {
+            memset_val(l, (128 << (bpp - 8)) + 1, 4 << tx); /* no left column: constant, one above half of the bpp-bit range */
+        }
+    }
+
+    return mode;
+}
+
+static av_always_inline void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off,
+                                         ptrdiff_t uv_off, int bytesperpixel)
+{ /* for every transform block of the current block: intra-predict, then add the inverse transform if eob != 0; luma first, then both chroma planes */
+    VP9Context *s = ctx->priv_data;
+    VP9Block *b = s->b;
+    int row = s->row, col = s->col;
+    int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
+    int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
+    int end_x = FFMIN(2 * (s->cols - col), w4);
+    int end_y = FFMIN(2 * (s->rows - row), h4);
+    int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
+    int uvstep1d = 1 << b->uvtx, p;
+    uint8_t *dst = s->dst[0], *dst_r = s->s.frames[CUR_FRAME].tf.f->data[0] + y_off; /* dst: write target; dst_r: same position in the current frame buffer, used for edge reads */
+    LOCAL_ALIGNED_32(uint8_t, a_buf, [96]); /* scratch for the top edge; `a` below points 32 bytes into it */
+    LOCAL_ALIGNED_32(uint8_t, l, [64]); /* scratch for the left edge */
+
+    for (n = 0, y = 0; y < end_y; y += step1d) {
+        uint8_t *ptr = dst, *ptr_r = dst_r;
+        for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d * bytesperpixel,
+                               ptr_r += 4 * step1d * bytesperpixel, n += step) {
+            int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
+                               y * 2 + x : 0];
+            uint8_t *a = &a_buf[32]; /* leaves room before a[0]: check_intra_mode() may write the top-left pixel at index -1 */
+            enum TxfmType txtp = vp9_intra_txfm_type[mode];
+            int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n]; /* eob is read as 16 bit for transforms above 8x8 */
+
+            mode = check_intra_mode(s, mode, &a, ptr_r,
+                                    s->s.frames[CUR_FRAME].tf.f->linesize[0],
+                                    ptr, s->y_stride, l,
+                                    col, x, w4, row, y, b->tx, 0, 0, 0, bytesperpixel);
+            s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
+            if (eob)
+                s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
+                                           s->block + 16 * n * bytesperpixel, eob);
+        }
+        dst_r += 4 * step1d * s->s.frames[CUR_FRAME].tf.f->linesize[0];
+        dst   += 4 * step1d * s->y_stride;
+    }
+
+    // U/V
+    w4 >>= s->ss_h;
+    end_x >>= s->ss_h;
+    end_y >>= s->ss_v;
+    step = 1 << (b->uvtx * 2);
+    for (p = 0; p < 2; p++) {
+        dst   = s->dst[1 + p];
+        dst_r = s->s.frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
+        for (n = 0, y = 0; y < end_y; y += uvstep1d) {
+            uint8_t *ptr = dst, *ptr_r = dst_r;
+            for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d * bytesperpixel,
+                                   ptr_r += 4 * uvstep1d * bytesperpixel, n += step) {
+                int mode = b->uvmode; /* one mode shared by all chroma transform blocks */
+                uint8_t *a = &a_buf[32];
+                int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
+
+                mode = check_intra_mode(s, mode, &a, ptr_r,
+                                        s->s.frames[CUR_FRAME].tf.f->linesize[1],
+                                        ptr, s->uv_stride, l, col, x, w4, row, y,
+                                        b->uvtx, p + 1, s->ss_h, s->ss_v, bytesperpixel);
+                s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
+                if (eob)
+                    s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
+                                                    s->uvblock[p] + 16 * n * bytesperpixel, eob); /* chroma always uses the DCT_DCT transform entry */
+            }
+            dst_r += 4 * uvstep1d * s->s.frames[CUR_FRAME].tf.f->linesize[1];
+            dst   += 4 * uvstep1d * s->uv_stride;
+        }
+    }
+}
+
+static void intra_recon_8bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
+{ /* non-inline entry point: intra_recon() with bytesperpixel = 1 */
+    intra_recon(ctx, y_off, uv_off, 1);
+}
+
+static void intra_recon_16bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
 {
-    VP9Context *s = avctx->priv_data;
-    int c = ((s->above_partition_ctx[col]       >> (3 - bl)) & 1) |
+    intra_recon(ctx, y_off, uv_off, 2); /* non-inline entry point: intra_recon() with bytesperpixel = 2 */
+}
+
+static av_always_inline void mc_luma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
+                                              uint8_t *dst, ptrdiff_t dst_stride,
+                                              const uint8_t *ref, ptrdiff_t ref_stride,
+                                              ThreadFrame *ref_frame,
+                                              ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
+                                              int bw, int bh, int w, int h, int bytesperpixel)
+{ /* motion-compensate one bw x bh block from `ref` into `dst`, going through edge_emu_buffer when the read area leaves the w x h reference */
+    int mx = mv->x, my = mv->y, th;
+
+    y += my >> 3; /* whole-sample part of the vector; the low 3 bits are the fraction */
+    x += mx >> 3;
+    ref += y * ref_stride + x * bytesperpixel;
+    mx &= 7;
+    my &= 7;
+    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
+    // we use +7 because the last 7 pixels of each sbrow can be changed in
+    // the longest loopfilter of the next sbrow
+    th = (y + bh + 4 * !!my + 7) >> 6;
+    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0); /* wait until the reference has reported progress up to row index th */
+    if (x < !!mx * 3 || y < !!my * 3 ||
+        x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) { /* a 3-before / 4-after margin is only required in a direction with a non-zero fraction */
+        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
+                                 ref - !!my * 3 * ref_stride - !!mx * 3 * bytesperpixel,
+                                 160, ref_stride,
+                                 bw + !!mx * 7, bh + !!my * 7,
+                                 x - !!mx * 3, y - !!my * 3, w, h);
+        ref = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel; /* skip the margin that was copied in front of the block */
+        ref_stride = 160; /* line size used for edge_emu_buffer in the call above */
+    }
+    mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1); /* function picked by which fractions are non-zero; fractions passed doubled */
+}
+
+static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
+                                                uint8_t *dst_u, uint8_t *dst_v,
+                                                ptrdiff_t dst_stride,
+                                                const uint8_t *ref_u, ptrdiff_t src_stride_u,
+                                                const uint8_t *ref_v, ptrdiff_t src_stride_v,
+                                                ThreadFrame *ref_frame,
+                                                ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
+                                                int bw, int bh, int w, int h, int bytesperpixel)
+{ /* motion-compensate one bw x bh block for both chroma planes with the same vector */
+    int mx = mv->x * (1 << !s->ss_h), my = mv->y * (1 << !s->ss_v), th; /* doubled where ss_h / ss_v is 0, so 4 fraction bits are used below either way */
+
+    y += my >> 4;
+    x += mx >> 4;
+    ref_u += y * src_stride_u + x * bytesperpixel;
+    ref_v += y * src_stride_v + x * bytesperpixel;
+    mx &= 15;
+    my &= 15;
+    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
+    // we use +7 because the last 7 pixels of each sbrow can be changed in
+    // the longest loopfilter of the next sbrow
+    th = (y + bh + 4 * !!my + 7) >> (6 - s->ss_v);
+    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
+    if (x < !!mx * 3 || y < !!my * 3 ||
+        x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) { /* read area leaves the reference: each plane goes through edge_emu_buffer in turn */
+        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
+                                 ref_u - !!my * 3 * src_stride_u - !!mx * 3 * bytesperpixel,
+                                 160, src_stride_u,
+                                 bw + !!mx * 7, bh + !!my * 7,
+                                 x - !!mx * 3, y - !!my * 3, w, h);
+        ref_u = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
+        mc[!!mx][!!my](dst_u, dst_stride, ref_u, 160, bh, mx, my); /* U is finished before the buffer is overwritten for V */
+
+        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
+                                 ref_v - !!my * 3 * src_stride_v - !!mx * 3 * bytesperpixel,
+                                 160, src_stride_v,
+                                 bw + !!mx * 7, bh + !!my * 7,
+                                 x - !!mx * 3, y - !!my * 3, w, h);
+        ref_v = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
+        mc[!!mx][!!my](dst_v, dst_stride, ref_v, 160, bh, mx, my);
+    } else {
+        mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
+        mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
+    }
+}
+
+#define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
+                    px, py, pw, ph, bw, bh, w, h, i) \
+    mc_luma_unscaled(s, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
+                     mv, bw, bh, w, h, bytesperpixel)
+#define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
+                      row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
+    mc_chroma_unscaled(s, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
+                       row, col, mv, bw, bh, w, h, bytesperpixel)
+#define SCALED 0 /* the two *_dir macros above ignore their mc, px, py, pw, ph and i arguments and forward to the *_unscaled helpers */
+#define FN(x) x##_8bpp /* suffix applied to names passed through FN() in the template (template body not visible here) */
+#define BYTES_PER_PIXEL 1
+#include "vp9_mc_template.c" /* first inclusion, with the 8bpp settings */
+#undef FN
+#undef BYTES_PER_PIXEL
+#define FN(x) x##_16bpp
+#define BYTES_PER_PIXEL 2
+#include "vp9_mc_template.c" /* second inclusion, with the 16bpp settings */
+#undef mc_luma_dir
+#undef mc_chroma_dir
+#undef FN
+#undef BYTES_PER_PIXEL
+#undef SCALED
+
+static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
+                                            vp9_mc_func (*mc)[2],
+                                            uint8_t *dst, ptrdiff_t dst_stride,
+                                            const uint8_t *ref, ptrdiff_t ref_stride,
+                                            ThreadFrame *ref_frame,
+                                            ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
+                                            int px, int py, int pw, int ph,
+                                            int bw, int bh, int w, int h, int bytesperpixel,
+                                            const uint16_t *scale, const uint8_t *step)
+{ /* luma motion compensation from a reference of a different size; same-size references are handed to mc_luma_unscaled() */
+    if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
+        s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
+        mc_luma_unscaled(s, mc, dst, dst_stride, ref, ref_stride, ref_frame,
+                         y, x, in_mv, bw, bh, w, h, bytesperpixel);
+    } else {
+#define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14) /* scale[] acts as a factor with 14 fraction bits; macro stays defined after this function */
+    int mx, my;
+    int refbw_m1, refbh_m1;
+    int th;
+    VP56mv mv;
+
+    mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 8, (s->cols * 8 - x + px + 3) * 8); /* clamp the vector before scaling */
+    mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 8, (s->rows * 8 - y + py + 3) * 8);
+    // BUG libvpx seems to scale the two components separately. This introduces
+    // rounding errors but we have to reproduce them to be exactly compatible
+    // with the output from libvpx...
+    mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
+    my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
+
+    y = my >> 4; /* scaled position: whole part here, low 4 bits kept as the fraction */
+    x = mx >> 4;
+    ref += y * ref_stride + x * bytesperpixel;
+    mx &= 15;
+    my &= 15;
+    refbw_m1 = ((bw - 1) * step[0] + mx) >> 4; /* offset of the last sample read, with step[] in 1/16 units */
+    refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
+    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
+    // we use +7 because the last 7 pixels of each sbrow can be changed in
+    // the longest loopfilter of the next sbrow
+    th = (y + refbh_m1 + 4 + 7) >> 6;
+    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
+    if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) { /* unlike the unscaled path, the 3/4 margin is always required here */
+        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
+                                 ref - 3 * ref_stride - 3 * bytesperpixel,
+                                 288, ref_stride,
+                                 refbw_m1 + 8, refbh_m1 + 8,
+                                 x - 3, y - 3, w, h);
+        ref = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
+        ref_stride = 288; /* line size used for edge_emu_buffer in the call above */
+    }
+    smc(dst, dst_stride, ref, ref_stride, bh, mx, my, step[0], step[1]);
+    }
+}
+
+static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
+                                              vp9_mc_func (*mc)[2],
+                                              uint8_t *dst_u, uint8_t *dst_v,
+                                              ptrdiff_t dst_stride,
+                                              const uint8_t *ref_u, ptrdiff_t src_stride_u,
+                                              const uint8_t *ref_v, ptrdiff_t src_stride_v,
+                                              ThreadFrame *ref_frame,
+                                              ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
+                                              int px, int py, int pw, int ph,
+                                              int bw, int bh, int w, int h, int bytesperpixel,
+                                              const uint16_t *scale, const uint8_t *step)
+{ // U/V motion compensation; uses smc when ref_frame's size differs from the current frame's
+    if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
+        s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
+        mc_chroma_unscaled(s, mc, dst_u, dst_v, dst_stride, ref_u, src_stride_u, // same size: plain path
+                           ref_v, src_stride_v, ref_frame,
+                           y, x, in_mv, bw, bh, w, h, bytesperpixel);
+    } else {
+    int mx, my; // scaled position; reduced to the low 4 bits (sub-position) further down
+    int refbw_m1, refbh_m1; // extent (minus one) of the source area read in the reference
+    int th; // row index passed to ff_thread_await_progress()
+    VP56mv mv; // clipped copy of *in_mv
+
+    if (s->ss_h) {
+        // BUG https://code.google.com/p/webm/issues/detail?id=820
+        mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 16, (s->cols * 4 - x + px + 3) * 16);
+        mx = scale_mv(mv.x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
+    } else {
+        mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 8, (s->cols * 8 - x + px + 3) * 8);
+        mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
+    }
+    if (s->ss_v) {
+        // BUG https://code.google.com/p/webm/issues/detail?id=820
+        mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 16, (s->rows * 4 - y + py + 3) * 16);
+        my = scale_mv(mv.y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
+    } else {
+        mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 8, (s->rows * 8 - y + py + 3) * 8);
+        my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
+    }
+#undef scale_mv // scale_mv is #defined inside mc_luma_scaled() above; this is its last use
+    y = my >> 4; // integer part of the scaled position (x and y are reused for it)
+    x = mx >> 4;
+    ref_u += y * src_stride_u + x * bytesperpixel;
+    ref_v += y * src_stride_v + x * bytesperpixel;
+    mx &= 15; // keep only the fractional low 4 bits, handed to smc below
+    my &= 15;
+    refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
+    refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
+    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
+    // we use +7 because the last 7 pixels of each sbrow can be changed in
+    // the longest loopfilter of the next sbrow
+    th = (y + refbh_m1 + 4 + 7) >> (6 - s->ss_v);
+    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0); // negative th is clamped to 0
+    if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) { // source area plus margin leaves the w x h plane
+        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
+                                 ref_u - 3 * src_stride_u - 3 * bytesperpixel,
+                                 288, src_stride_u,
+                                 refbw_m1 + 8, refbh_m1 + 8,
+                                 x - 3, y - 3, w, h);
+        ref_u = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel; // skip the 3-row/3-pixel margin; buffer stride is 288
+        smc(dst_u, dst_stride, ref_u, 288, bh, mx, my, step[0], step[1]); // U is finished before the buffer is reused for V
+
+        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
+                                 ref_v - 3 * src_stride_v - 3 * bytesperpixel,
+                                 288, src_stride_v,
+                                 refbw_m1 + 8, refbh_m1 + 8,
+                                 x - 3, y - 3, w, h);
+        ref_v = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
+        smc(dst_v, dst_stride, ref_v, 288, bh, mx, my, step[0], step[1]);
+    } else {
+        smc(dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my, step[0], step[1]); // read directly from the reference planes
+        smc(dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my, step[0], step[1]);
+    }
+    }
+}
+
+#define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
+                    px, py, pw, ph, bw, bh, w, h, i) \
+    mc_luma_scaled(s, s->dsp.s##mc, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
+                   mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
+                   s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
+#define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
+                      row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
+    mc_chroma_scaled(s, s->dsp.s##mc, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
+                     row, col, mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
+                     s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
+#define SCALED 1 // stays defined across both inclusions of the template below
+#define FN(x) x##_scaled_8bpp // first instantiation: names get the _scaled_8bpp suffix
+#define BYTES_PER_PIXEL 1
+#include "vp9_mc_template.c" // presumably where inter_pred_scaled_8bpp() (called by inter_recon) comes from
+#undef FN
+#undef BYTES_PER_PIXEL
+#define FN(x) x##_scaled_16bpp // second instantiation: names get the _scaled_16bpp suffix
+#define BYTES_PER_PIXEL 2
+#include "vp9_mc_template.c" // same template again, now with BYTES_PER_PIXEL 2
+#undef mc_luma_dir
+#undef mc_chroma_dir
+#undef FN
+#undef BYTES_PER_PIXEL
+#undef SCALED
+
+static av_always_inline void inter_recon(AVCodecContext *ctx, int bytesperpixel)
+{ // inter prediction for the current block (s->b), then residual add for Y, U and V unless b->skip
+    VP9Context *s = ctx->priv_data;
+    VP9Block *b = s->b;
+    int row = s->row, col = s->col;
+
+    if (s->mvscale[b->ref[0]][0] || (b->comp && s->mvscale[b->ref[1]][0])) { // non-zero mvscale for a used reference selects the scaled predictor
+        if (bytesperpixel == 1) {
+            inter_pred_scaled_8bpp(ctx);
+        } else {
+            inter_pred_scaled_16bpp(ctx);
+        }
+    } else {
+        if (bytesperpixel == 1) {
+            inter_pred_8bpp(ctx);
+        } else {
+            inter_pred_16bpp(ctx);
+        }
+    }
+    if (!b->skip) {
+        /* mostly copied intra_recon() */
+
+        int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n; // step1d: transform size in units of 4 pixels
+        int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2); // step: eob/coefficient index advance per transform
+        int end_x = FFMIN(2 * (s->cols - col), w4); // limit the loops using s->cols / s->rows
+        int end_y = FFMIN(2 * (s->rows - row), h4);
+        int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless; // lossless offsets the itxfm_add index by 4
+        int uvstep1d = 1 << b->uvtx, p;
+        uint8_t *dst = s->dst[0];
+
+        // y itxfm add
+        for (n = 0, y = 0; y < end_y; y += step1d) {
+            uint8_t *ptr = dst;
+            for (x = 0; x < end_x; x += step1d,
+                 ptr += 4 * step1d * bytesperpixel, n += step) {
+                int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n]; // AV_RN16A read above 8x8, presumably because eob can exceed 255 there
+
+                if (eob) // eob == 0: nothing is added for this transform block
+                    s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
+                                                  s->block + 16 * n * bytesperpixel, eob);
+            }
+            dst += 4 * s->y_stride * step1d;
+        }
+
+        // uv itxfm add
+        end_x >>= s->ss_h; // chroma loop limits follow the subsampling shifts
+        end_y >>= s->ss_v;
+        step = 1 << (b->uvtx * 2);
+        for (p = 0; p < 2; p++) { // p = 0 and 1 index s->dst[1]/s->dst[2], uveob[] and uvblock[]
+            dst = s->dst[p + 1];
+            for (n = 0, y = 0; y < end_y; y += uvstep1d) {
+                uint8_t *ptr = dst;
+                for (x = 0; x < end_x; x += uvstep1d,
+                     ptr += 4 * uvstep1d * bytesperpixel, n += step) {
+                    int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
+
+                    if (eob)
+                        s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
+                                                        s->uvblock[p] + 16 * n * bytesperpixel, eob);
+                }
+                dst += 4 * uvstep1d * s->uv_stride;
+            }
+        }
+    }
+}
+
+static void inter_recon_8bpp(AVCodecContext *ctx)
+{ // non-inline entry point: inter_recon() with bytesperpixel fixed to 1
+    inter_recon(ctx, 1);
+}
+
+static void inter_recon_16bpp(AVCodecContext *ctx)
+{ // non-inline entry point: inter_recon() with bytesperpixel fixed to 2
+    inter_recon(ctx, 2);
+}
+
+static av_always_inline void mask_edges(uint8_t (*mask)[8][4], int ss_h, int ss_v,
+                                        int row_and_7, int col_and_7,
+                                        int w, int h, int col_end, int row_end,
+                                        enum TxfmMode tx, int skip_inter)
+{ // ORs the loopfilter edge bits of one block into mask[0..1][row][0..3]; only ever sets bits
+    static const unsigned wide_filter_col_mask[2] = { 0x11, 0x01 }; // indexed by ss_h
+    static const unsigned wide_filter_row_mask[2] = { 0x03, 0x07 }; // indexed by ss_v
+
+    // FIXME I'm pretty sure all loops can be replaced by a single LUT if
+    // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
+    // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
+    // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
+
+    // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
+    // edges. This means that for UV, we work on two subsampled blocks at
+    // a time, and we only use the topleft block's mode information to set
+    // things like block strength. Thus, for any block size smaller than
+    // 16x16, ignore the odd portion of the block.
+    if (tx == TX_4X4 && (ss_v | ss_h)) {
+        if (h == ss_v) {
+            if (row_and_7 & 1)
+                return; // odd row: nothing is recorded for this block
+            if (!row_end)
+                h += 1;
+        }
+        if (w == ss_h) {
+            if (col_and_7 & 1)
+                return; // odd column: nothing is recorded for this block
+            if (!col_end)
+                w += 1;
+        }
+    }
+
+    if (tx == TX_4X4 && !skip_inter) {
+        int t = 1 << col_and_7, m_col = (t << w) - t, y; // t: bit of the first column; m_col: w consecutive bits from there
+        // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
+        int m_row_8 = m_col & wide_filter_col_mask[ss_h], m_row_4 = m_col - m_row_8;
+
+        for (y = row_and_7; y < h + row_and_7; y++) {
+            int col_mask_id = 2 - !(y & wide_filter_row_mask[ss_v]); // 1 when the masked row bits are all zero, else 2
+
+            mask[0][y][1] |= m_row_8;
+            mask[0][y][2] |= m_row_4;
+            // for odd lines, if the odd col is not being filtered,
+            // skip odd row also:
+            // .---. <-- a
+            // |   |
+            // |___| <-- b
+            // ^   ^
+            // c   d
+            //
+            // if a/c are even row/col and b/d are odd, and d is skipped,
+            // e.g. right edge of size-66x66.webm, then skip b also (bug)
+            if ((ss_h & ss_v) && (col_end & 1) && (y & 1)) {
+                mask[1][y][col_mask_id] |= (t << (w - 1)) - t; // same as m_col without its highest bit
+            } else {
+                mask[1][y][col_mask_id] |= m_col;
+            }
+            if (!ss_h)
+                mask[0][y][3] |= m_col;
+            if (!ss_v) {
+                if (ss_h && (col_end & 1))
+                    mask[1][y][3] |= (t << (w - 1)) - t;
+                else
+                    mask[1][y][3] |= m_col;
+            }
+        }
+    } else {
+        int y, t = 1 << col_and_7, m_col = (t << w) - t; // same t / m_col meaning as in the branch above
+
+        if (!skip_inter) {
+            int mask_id = (tx == TX_8X8); // third mask index: 1 for TX_8X8, otherwise 0
+            static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 }; // keep every 1st/2nd/4th/8th column bit
+            int l2 = tx + ss_h - 1, step1d;
+            int m_row = m_col & masks[l2];
+
+            // at odd UV col/row edges tx16/tx32 loopfilter edges, force
+            // 8wd loopfilter to prevent going off the visible edge.
+            if (ss_h && tx > TX_8X8 && (w ^ (w - 1)) == 1) { // (w ^ (w - 1)) == 1 holds exactly when w is odd
+                int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
+                int m_row_8 = m_row - m_row_16;
+
+                for (y = row_and_7; y < h + row_and_7; y++) {
+                    mask[0][y][0] |= m_row_16;
+                    mask[0][y][1] |= m_row_8;
+                }
+            } else {
+                for (y = row_and_7; y < h + row_and_7; y++)
+                    mask[0][y][mask_id] |= m_row;
+            }
+
+            l2 = tx + ss_v - 1;
+            step1d = 1 << l2;
+            if (ss_v && tx > TX_8X8 && (h ^ (h - 1)) == 1) { // likewise: true exactly when h is odd
+                for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
+                    mask[1][y][0] |= m_col;
+                if (y - row_and_7 == h - 1) // the loop stopped exactly on the last row of the block
+                    mask[1][y][1] |= m_col;
+            } else {
+                for (y = row_and_7; y < h + row_and_7; y += step1d)
+                    mask[1][y][mask_id] |= m_col;
+            }
+        } else if (tx != TX_4X4) { // skipped inter block: only the first row and the first column bit are marked
+            int mask_id;
+
+            mask_id = (tx == TX_8X8) || (h == ss_v);
+            mask[1][row_and_7][mask_id] |= m_col;
+            mask_id = (tx == TX_8X8) || (w == ss_h);
+            for (y = row_and_7; y < h + row_and_7; y++)
+                mask[0][y][mask_id] |= t;
+        } else { // skipped inter block with TX_4X4
+            int t8 = t & wide_filter_col_mask[ss_h], t4 = t - t8; // exactly one of t8 / t4 is non-zero
+
+            for (y = row_and_7; y < h + row_and_7; y++) {
+                mask[0][y][2] |= t4;
+                mask[0][y][1] |= t8;
+            }
+            mask[1][row_and_7][2 - !(row_and_7 & wide_filter_row_mask[ss_v])] |= m_col;
+        }
+    }
+}
+
+static void decode_b(AVCodecContext *ctx, int row, int col,
+                     struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
+                     enum BlockLevel bl, enum BlockPartition bp)
+{ // handles one block at (row, col): parsing when s->pass < 2, reconstruction + loopfilter bookkeeping unless s->pass == 1
+    VP9Context *s = ctx->priv_data;
+    VP9Block *b = s->b;
+    enum BlockSize bs = bl * 3 + bp;
+    int bytesperpixel = s->bytesperpixel;
+    int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl; // block width/height in the same units as row/col
+    int emu[2]; // [0] luma, [1] chroma: non-zero when reconstruction goes through the tmp buffers
+    AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
+
+    s->row = row;
+    s->row7 = row & 7;
+    s->col = col;
+    s->col7 = col & 7;
+    s->min_mv.x = -(128 + col * 64); // min_mv/max_mv depend only on the block position and size
+    s->min_mv.y = -(128 + row * 64);
+    s->max_mv.x = 128 + (s->cols - col - w4) * 64;
+    s->max_mv.y = 128 + (s->rows - row - h4) * 64;
+    if (s->pass < 2) { // passes 0 and 1 parse the block; pass 2 reuses what is already stored in *b
+        b->bs = bs;
+        b->bl = bl;
+        b->bp = bp;
+        decode_mode(ctx);
+        b->uvtx = b->tx - ((s->ss_h && w4 * 2 == (1 << b->tx)) || // one transform size smaller when the subsampled block cannot hold b->tx
+                           (s->ss_v && h4 * 2 == (1 << b->tx)));
+
+        if (!b->skip) {
+            int has_coeffs;
+
+            if (bytesperpixel == 1) {
+                has_coeffs = decode_coeffs_8bpp(ctx);
+            } else {
+                has_coeffs = decode_coeffs_16bpp(ctx);
+            }
+            if (!has_coeffs && b->bs <= BS_8x8 && !b->intra) { // inter block without coefficients is turned into a skip block
+                b->skip = 1;
+                memset(&s->above_skip_ctx[col], 1, w4);
+                memset(&s->left_skip_ctx[s->row7], 1, h4);
+            }
+        } else {
+            int row7 = s->row7;
+
+#define SPLAT_ZERO_CTX(v, n) \
+    switch (n) { \
+    case 1:  v = 0;          break; \
+    case 2:  AV_ZERO16(&v);  break; \
+    case 4:  AV_ZERO32(&v);  break; \
+    case 8:  AV_ZERO64(&v);  break; \
+    case 16: AV_ZERO128(&v); break; \
+    }
+#define SPLAT_ZERO_YUV(dir, var, off, n, dir2) \
+    do { \
+        SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
+        if (s->ss_##dir2) { \
+            SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
+            SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
+        } else { \
+            SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off * 2], n * 2); \
+            SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off * 2], n * 2); \
+        } \
+    } while (0)
+
+            switch (w4) { // skip block: zero the above nnz contexts it covers
+            case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1, h); break;
+            case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2, h); break;
+            case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4, h); break;
+            case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8, h); break;
+            }
+            switch (h4) { // and the left nnz contexts
+            case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1, v); break;
+            case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2, v); break;
+            case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4, v); break;
+            case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8, v); break;
+            }
+        }
+
+        if (s->pass == 1) { // pass 1 only parses: advance the per-block storage and return before reconstruction
+            s->b++;
+            s->block += w4 * h4 * 64 * bytesperpixel;
+            s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
+            s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
+            s->eob += 4 * w4 * h4;
+            s->uveob[0] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
+            s->uveob[1] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
+
+            return;
+        }
+    }
+
+    // emulated overhangs if the stride of the target buffer can't hold. This
+    // makes it possible to support emu-edge and so on even if we have large block
+    // overhangs
+    emu[0] = (col + w4) * 8 * bytesperpixel > f->linesize[0] ||
+             (row + h4) > s->rows;
+    emu[1] = ((col + w4) * 8 >> s->ss_h) * bytesperpixel > f->linesize[1] ||
+             (row + h4) > s->rows;
+    if (emu[0]) {
+        s->dst[0] = s->tmp_y; // reconstruct into the temporary buffer, which uses a fixed stride of 128
+        s->y_stride = 128;
+    } else {
+        s->dst[0] = f->data[0] + yoff;
+        s->y_stride = f->linesize[0];
+    }
+    if (emu[1]) {
+        s->dst[1] = s->tmp_uv[0];
+        s->dst[2] = s->tmp_uv[1];
+        s->uv_stride = 128;
+    } else {
+        s->dst[1] = f->data[1] + uvoff;
+        s->dst[2] = f->data[2] + uvoff;
+        s->uv_stride = f->linesize[1];
+    }
+    if (b->intra) {
+        if (s->bpp > 8) {
+            intra_recon_16bpp(ctx, yoff, uvoff);
+        } else {
+            intra_recon_8bpp(ctx, yoff, uvoff);
+        }
+    } else {
+        if (s->bpp > 8) {
+            inter_recon_16bpp(ctx);
+        } else {
+            inter_recon_8bpp(ctx);
+        }
+    }
+    if (emu[0]) { // copy the part of tmp_y bounded by s->cols / s->rows back into the frame
+        int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
+
+        for (n = 0; o < w; n++) { // try widths 64, 32, 16, ...; one call per set bit of w
+            int bw = 64 >> n;
+
+            av_assert2(n <= 4);
+            if (w & bw) {
+                s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o * bytesperpixel, f->linesize[0], // mc[n][0][0][0][0] with mx = my = 0; presumably a plain copy
+                                         s->tmp_y + o * bytesperpixel, 128, h, 0, 0);
+                o += bw;
+            }
+        }
+    }
+    if (emu[1]) { // same copy-back for both chroma planes
+        int w = FFMIN(s->cols - col, w4) * 8 >> s->ss_h;
+        int h = FFMIN(s->rows - row, h4) * 8 >> s->ss_v, n, o = 0;
+
+        for (n = s->ss_h; o < w; n++) { // starts at width 64 >> ss_h
+            int bw = 64 >> n;
+
+            av_assert2(n <= 4);
+            if (w & bw) {
+                s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o * bytesperpixel, f->linesize[1],
+                                         s->tmp_uv[0] + o * bytesperpixel, 128, h, 0, 0);
+                s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o * bytesperpixel, f->linesize[2],
+                                         s->tmp_uv[1] + o * bytesperpixel, 128, h, 0, 0);
+                o += bw;
+            }
+        }
+    }
+
+    // pick filter level and find edges to apply filter to
+    if (s->s.h.filter.level &&
+        (lvl = s->s.h.segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
+                                                      [b->mode[3] != ZEROMV]) > 0) {
+        int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
+        int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;
+
+        setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
+        mask_edges(lflvl->mask[0], 0, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter); // mask[0]: called without subsampling, with b->tx
+        if (s->ss_h || s->ss_v) // mask[1] is filled only when chroma is subsampled
+            mask_edges(lflvl->mask[1], s->ss_h, s->ss_v, row7, col7, x_end, y_end,
+                       s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
+                       s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
+                       b->uvtx, skip_inter);
+
+        if (!s->filter_lut.lim_lut[lvl]) { // 0 means this level has not been computed yet (stored values are >= 1)
+            int sharp = s->s.h.filter.sharpness;
+            int limit = lvl;
+
+            if (sharp > 0) {
+                limit >>= (sharp + 3) >> 2;
+                limit = FFMIN(limit, 9 - sharp);
+            }
+            limit = FFMAX(limit, 1);
+
+            s->filter_lut.lim_lut[lvl] = limit;
+            s->filter_lut.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
+        }
+    }
+
+    if (s->pass == 2) { // pass 2 consumes stored block data: advance by the same amounts as pass 1 did
+        s->b++;
+        s->block += w4 * h4 * 64 * bytesperpixel;
+        s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
+        s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
+        s->eob += 4 * w4 * h4;
+        s->uveob[0] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
+        s->uveob[1] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
+    }
+}
+
+static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
+                      ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
+{ // reads the partition for level bl at (row, col) from s->c and decodes its blocks, recursing on SPLIT
+    VP9Context *s = ctx->priv_data;
+    int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
             (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
-    int ret;
-    const uint8_t *p = s->keyframe ? ff_vp9_default_kf_partition_probs[bl][c]
-                                   : s->prob.p.partition[bl][c];
+    const uint8_t *p = s->s.h.keyframe || s->s.h.intraonly ? vp9_default_kf_partition_probs[bl][c] :
+                                                     s->prob.p.partition[bl][c];
     enum BlockPartition bp;
     ptrdiff_t hbs = 4 >> bl;
+    AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
+    ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
+    int bytesperpixel = s->bytesperpixel;
 
     if (bl == BL_8X8) {
-        bp  = vp8_rac_get_tree(&s->c, ff_vp9_partition_tree, p);
-        ret = ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, bl, bp);
-    } else if (col + hbs < s->cols) {
-        if (row + hbs < s->rows) {
-            bp = vp8_rac_get_tree(&s->c, ff_vp9_partition_tree, p);
+        bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
+        decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp); // smallest level: no further recursion, bp goes to decode_b
+    } else if (col + hbs < s->cols) { // FIXME why not <=?
+        if (row + hbs < s->rows) { // FIXME why not <=?
+            bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
             switch (bp) {
             case PARTITION_NONE:
-                ret = ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff,
-                                          bl, bp);
+                decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
                 break;
             case PARTITION_H:
-                ret = ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff,
-                                          bl, bp);
-                if (!ret) {
-                    yoff  += hbs * 8 * s->cur_frame->linesize[0];
-                    uvoff += hbs * 4 * s->cur_frame->linesize[1];
-                    ret    = ff_vp9_decode_block(avctx, row + hbs, col, lflvl,
-                                                 yoff, uvoff, bl, bp);
-                }
+                decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
+                yoff  += hbs * 8 * y_stride; // move down hbs * 8 luma rows to the lower half
+                uvoff += hbs * 8 * uv_stride >> s->ss_v;
+                decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
                 break;
             case PARTITION_V:
-                ret = ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff,
-                                          bl, bp);
-                if (!ret) {
-                    yoff  += hbs * 8;
-                    uvoff += hbs * 4;
-                    ret    = ff_vp9_decode_block(avctx, row, col + hbs, lflvl,
-                                                 yoff, uvoff, bl, bp);
-                }
+                decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
+                yoff  += hbs * 8 * bytesperpixel; // move right hbs * 8 luma pixels to the right half
+                uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
+                decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
                 break;
             case PARTITION_SPLIT:
-                ret = decode_subblock(avctx, row, col, lflvl,
-                                      yoff, uvoff, bl + 1);
-                if (!ret) {
-                    ret = decode_subblock(avctx, row, col + hbs, lflvl,
-                                          yoff + 8 * hbs, uvoff + 4 * hbs,
-                                          bl + 1);
-                    if (!ret) {
-                        yoff  += hbs * 8 * s->cur_frame->linesize[0];
-                        uvoff += hbs * 4 * s->cur_frame->linesize[1];
-                        ret    = decode_subblock(avctx, row + hbs, col, lflvl,
-                                                 yoff, uvoff, bl + 1);
-                        if (!ret) {
-                            ret = decode_subblock(avctx, row + hbs, col + hbs,
-                                                  lflvl, yoff + 8 * hbs,
-                                                  uvoff + 4 * hbs, bl + 1);
-                        }
-                    }
-                }
+                decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1); // quadrant order: top-left, top-right, bottom-left, bottom-right
+                decode_sb(ctx, row, col + hbs, lflvl,
+                          yoff + 8 * hbs * bytesperpixel,
+                          uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
+                yoff  += hbs * 8 * y_stride;
+                uvoff += hbs * 8 * uv_stride >> s->ss_v;
+                decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
+                decode_sb(ctx, row + hbs, col + hbs, lflvl,
+                          yoff + 8 * hbs * bytesperpixel,
+                          uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
                 break;
             default:
-                av_log(avctx, AV_LOG_ERROR, "Unexpected partition %d.", bp);
-                return AVERROR_INVALIDDATA;
+                av_assert0(0); // any other bp value is treated as impossible here
             }
         } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
-            bp  = PARTITION_SPLIT;
-            ret = decode_subblock(avctx, row, col, lflvl, yoff, uvoff, bl + 1);
-            if (!ret)
-                ret = decode_subblock(avctx, row, col + hbs, lflvl,
-                                      yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
+            bp = PARTITION_SPLIT; // row + hbs >= s->rows: only the two top quadrants are decoded
+            decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
+            decode_sb(ctx, row, col + hbs, lflvl,
+                      yoff + 8 * hbs * bytesperpixel,
+                      uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
         } else {
-            bp  = PARTITION_H;
-            ret = ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff,
-                                      bl, bp);
+            bp = PARTITION_H; // only the upper block is decoded
+            decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
         }
-    } else if (row + hbs < s->rows) {
+    } else if (row + hbs < s->rows) { // FIXME why not <=?
         if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
-            bp  = PARTITION_SPLIT;
-            ret = decode_subblock(avctx, row, col, lflvl, yoff, uvoff, bl + 1);
-            if (!ret) {
-                yoff  += hbs * 8 * s->cur_frame->linesize[0];
-                uvoff += hbs * 4 * s->cur_frame->linesize[1];
-                ret    = decode_subblock(avctx, row + hbs, col, lflvl,
-                                         yoff, uvoff, bl + 1);
-            }
+            bp = PARTITION_SPLIT; // col + hbs >= s->cols: only the two left quadrants are decoded
+            decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
+            yoff  += hbs * 8 * y_stride;
+            uvoff += hbs * 8 * uv_stride >> s->ss_v;
+            decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
         } else {
-            bp  = PARTITION_V;
-            ret = ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff,
-                                      bl, bp);
+            bp = PARTITION_V; // only the left block is decoded
+            decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
         }
     } else {
-        bp  = PARTITION_SPLIT;
-        ret = decode_subblock(avctx, row, col, lflvl, yoff, uvoff, bl + 1);
+        bp = PARTITION_SPLIT; // nothing is read from s->c in this branch; only the top-left quadrant is decoded
+        decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
     }
     s->counts.partition[bl][c][bp]++;
+}
 
-    return ret;
+static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
+                          ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
+{ // like decode_sb(), but takes level/partition from the stored block at s->b instead of reading s->c
+    VP9Context *s = ctx->priv_data;
+    VP9Block *b = s->b; // captured on entry: b->bl / b->bp below stay those of this block even if decode_b() advances s->b
+    ptrdiff_t hbs = 4 >> bl;
+    AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
+    ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
+    int bytesperpixel = s->bytesperpixel;
+
+    if (bl == BL_8X8) {
+        av_assert2(b->bl == BL_8X8);
+        decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
+    } else if (s->b->bl == bl) { // the stored block sits at this level: no deeper recursion
+        decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
+        if (b->bp == PARTITION_H && row + hbs < s->rows) { // second (lower) block only if the bounds check passes
+            yoff  += hbs * 8 * y_stride;
+            uvoff += hbs * 8 * uv_stride >> s->ss_v;
+            decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
+        } else if (b->bp == PARTITION_V && col + hbs < s->cols) { // second (right) block only if the bounds check passes
+            yoff  += hbs * 8 * bytesperpixel;
+            uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
+            decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
+        }
+    } else { // stored block is at a deeper level: handle as a split, visiting quadrants that pass the bounds checks
+        decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
+        if (col + hbs < s->cols) { // FIXME why not <=?
+            if (row + hbs < s->rows) {
+                decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs * bytesperpixel,
+                              uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
+                yoff  += hbs * 8 * y_stride;
+                uvoff += hbs * 8 * uv_stride >> s->ss_v;
+                decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
+                decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
+                              yoff + 8 * hbs * bytesperpixel,
+                              uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
+            } else {
+                yoff  += hbs * 8 * bytesperpixel;
+                uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
+                decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
+            }
+        } else if (row + hbs < s->rows) {
+            yoff  += hbs * 8 * y_stride;
+            uvoff += hbs * 8 * uv_stride >> s->ss_v;
+            decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
+        }
+    }
 }
 
-static void loopfilter_subblock(AVCodecContext *avctx, VP9Filter *lflvl,
-                                int row, int col,
-                                ptrdiff_t yoff, ptrdiff_t uvoff)
+static av_always_inline void filter_plane_cols(VP9Context *s, int col, int ss_h, int ss_v,
+                                               uint8_t *lvl, uint8_t (*mask)[4],
+                                               uint8_t *dst, ptrdiff_t ls)
 {
-    VP9Context *s = avctx->priv_data;
-    uint8_t *dst   = s->cur_frame->data[0] + yoff, *lvl = lflvl->level;
-    ptrdiff_t ls_y = s->cur_frame->linesize[0], ls_uv = s->cur_frame->linesize[1];
-    int y, x, p;
-
-    /* FIXME: In how far can we interleave the v/h loopfilter calls? E.g.
-     * if you think of them as acting on a 8x8 block max, we can interleave
-     * each v/h within the single x loop, but that only works if we work on
-     * 8 pixel blocks, and we won't always do that (we want at least 16px
-     * to use SSE2 optimizations, perhaps 32 for AVX2). */
-
-    // filter edges between columns, Y plane (e.g. block1 | block2)
-    for (y = 0; y < 8; y += 2, dst += 16 * ls_y, lvl += 16) {
-        uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[0][0][y];
-        uint8_t *hmask2 = lflvl->mask[0][0][y + 1];
+    int y, x, bytesperpixel = s->bytesperpixel;
+    // Runs the s->dsp loop filters on the edges flagged in mask[]; lvl[] indexes the mblim/lim LUTs.
+    // filter edges between columns (e.g. block1 | block2)
+    for (y = 0; y < 8; y += 2 << ss_v, dst += 16 * ls, lvl += 16 << ss_v) {
+        uint8_t *ptr = dst, *l = lvl, *hmask1 = mask[y], *hmask2 = mask[y + 1 + ss_v];
         unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
         unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
-        unsigned hm  = hm1 | hm2 | hm13 | hm23;
+        unsigned hm = hm1 | hm2 | hm13 | hm23;
 
-        for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8, l++) {
-            if (hm1 & x) {
-                int L = *l, H = L >> 4;
-                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+        for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8 * bytesperpixel >> ss_h) {
+            if (col || x > 1) { // the x == 1 edge is skipped when col is 0
+                if (hm1 & x) {
+                    int L = *l, H = L >> 4;
+                    int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
 
-                if (col || x > 1) {
                     if (hmask1[0] & x) {
                         if (hmask2[0] & x) {
-                            av_assert2(l[8] == L);
-                            s->dsp.loop_filter_16[0](ptr, ls_y, E, I, H);
+                            av_assert2(l[8 << ss_v] == L);
+                            s->dsp.loop_filter_16[0](ptr, ls, E, I, H);
                         } else {
-                            s->dsp.loop_filter_8[2][0](ptr, ls_y, E, I, H);
+                            s->dsp.loop_filter_8[2][0](ptr, ls, E, I, H);
                         }
                     } else if (hm2 & x) {
-                        L  = l[8];
+                        L = l[8 << ss_v];
                         H |= (L >> 4) << 8;
-                        E |= s->filter.mblim_lut[L] << 8;
-                        I |= s->filter.lim_lut[L] << 8;
+                        E |= s->filter_lut.mblim_lut[L] << 8;
+                        I |= s->filter_lut.lim_lut[L] << 8;
                         s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
                                                [!!(hmask2[1] & x)]
-                                               [0](ptr, ls_y, E, I, H);
+                                               [0](ptr, ls, E, I, H);
                     } else {
                         s->dsp.loop_filter_8[!!(hmask1[1] & x)]
-                                            [0](ptr, ls_y, E, I, H);
+                                            [0](ptr, ls, E, I, H);
                     }
-                }
-            } else if (hm2 & x) {
-                int L = l[8], H = L >> 4;
-                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+                } else if (hm2 & x) {
+                    int L = l[8 << ss_v], H = L >> 4;
+                    int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
 
-                if (col || x > 1) {
                     s->dsp.loop_filter_8[!!(hmask2[1] & x)]
-                                        [0](ptr + 8 * ls_y, ls_y, E, I, H);
+                                        [0](ptr + 8 * ls, ls, E, I, H);
                 }
             }
-            if (hm13 & x) {
-                int L = *l, H = L >> 4;
-                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+            if (ss_h) { // with ss_h set the hm13/hm23 pass is skipped; l only moves on 0xAA bits
+                if (x & 0xAA)
+                    l += 2;
+            } else {
+                if (hm13 & x) {
+                    int L = *l, H = L >> 4;
+                    int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
 
-                if (hm23 & x) {
-                    L  = l[8];
-                    H |= (L >> 4) << 8;
-                    E |= s->filter.mblim_lut[L] << 8;
-                    I |= s->filter.lim_lut[L] << 8;
-                    s->dsp.loop_filter_mix2[0][0][0](ptr + 4, ls_y, E, I, H);
-                } else {
-                    s->dsp.loop_filter_8[0][0](ptr + 4, ls_y, E, I, H);
-                }
-            } else if (hm23 & x) {
-                int L = l[8], H = L >> 4;
-                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+                    if (hm23 & x) {
+                        L = l[8 << ss_v];
+                        H |= (L >> 4) << 8;
+                        E |= s->filter_lut.mblim_lut[L] << 8;
+                        I |= s->filter_lut.lim_lut[L] << 8;
+                        s->dsp.loop_filter_mix2[0][0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
+                    } else {
+                        s->dsp.loop_filter_8[0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
+                    }
+                } else if (hm23 & x) {
+                    int L = l[8 << ss_v], H = L >> 4;
+                    int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
 
-                s->dsp.loop_filter_8[0][0](ptr + 8 * ls_y + 4, ls_y, E, I, H);
+                    s->dsp.loop_filter_8[0][0](ptr + 8 * ls + 4 * bytesperpixel, ls, E, I, H);
+                }
+                l++; // without ss_h, one level entry is consumed per mask bit
             }
         }
     }
+}
 
-    //                                          block1
-    // filter edges between rows, Y plane (e.g. ------)
-    //                                          block2
-    dst = s->cur_frame->data[0] + yoff;
-    lvl = lflvl->level;
-    for (y = 0; y < 8; y++, dst += 8 * ls_y, lvl += 8) {
-        uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[0][1][y];
+static av_always_inline void filter_plane_rows(VP9Context *s, int row, int ss_h, int ss_v,
+                                               uint8_t *lvl, uint8_t (*mask)[4],
+                                               uint8_t *dst, ptrdiff_t ls)
+{
+    int y, x, bytesperpixel = s->bytesperpixel;
+    // Runs the s->dsp loop filters on the edges flagged in mask[]; lvl[] indexes the mblim/lim LUTs.
+    //                                 block1
+    // filter edges between rows (e.g. ------)
+    //                                 block2
+    for (y = 0; y < 8; y++, dst += 8 * ls >> ss_v) {
+        uint8_t *ptr = dst, *l = lvl, *vmask = mask[y];
         unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
 
-        for (x = 1; vm & ~(x - 1); x <<= 2, ptr += 16, l += 2) {
+        for (x = 1; vm & ~(x - 1); x <<= (2 << ss_h), ptr += 16 * bytesperpixel, l += 2 << ss_h) {
             if (row || y) {
                 if (vm & x) {
                     int L = *l, H = L >> 4;
-                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+                    int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
 
                     if (vmask[0] & x) {
-                        if (vmask[0] & (x << 1)) {
-                            av_assert2(l[1] == L);
-                            s->dsp.loop_filter_16[1](ptr, ls_y, E, I, H);
+                        if (vmask[0] & (x << (1 + ss_h))) {
+                            av_assert2(l[1 + ss_h] == L);
+                            s->dsp.loop_filter_16[1](ptr, ls, E, I, H);
                         } else {
-                            s->dsp.loop_filter_8[2][1](ptr, ls_y, E, I, H);
+                            s->dsp.loop_filter_8[2][1](ptr, ls, E, I, H);
                         }
-                    } else if (vm & (x << 1)) {
-                        L  = l[1];
+                    } else if (vm & (x << (1 + ss_h))) {
+                        L = l[1 + ss_h];
                         H |= (L >> 4) << 8;
-                        E |= s->filter.mblim_lut[L] << 8;
-                        I |= s->filter.lim_lut[L] << 8;
+                        E |= s->filter_lut.mblim_lut[L] << 8;
+                        I |= s->filter_lut.lim_lut[L] << 8;
                         s->dsp.loop_filter_mix2[!!(vmask[1] &  x)]
-                                               [!!(vmask[1] & (x << 1))]
-                                               [1](ptr, ls_y, E, I, H);
+                                               [!!(vmask[1] & (x << (1 + ss_h)))]
+                                               [1](ptr, ls, E, I, H);
                     } else {
                         s->dsp.loop_filter_8[!!(vmask[1] & x)]
-                                            [1](ptr, ls_y, E, I, H);
+                                            [1](ptr, ls, E, I, H);
                     }
-                } else if (vm & (x << 1)) {
-                    int L = l[1], H = L >> 4;
-                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+                } else if (vm & (x << (1 + ss_h))) {
+                    int L = l[1 + ss_h], H = L >> 4;
+                    int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
 
-                    s->dsp.loop_filter_8[!!(vmask[1] & (x << 1))]
-                                        [1](ptr + 8, ls_y, E, I, H);
+                    s->dsp.loop_filter_8[!!(vmask[1] & (x << (1 + ss_h)))]
+                                        [1](ptr + 8 * bytesperpixel, ls, E, I, H);
                 }
             }
-            if (vm3 & x) {
-                int L = *l, H = L >> 4;
-                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+            if (!ss_v) { // the vm3 (mask[y][3]) pass only runs when ss_v is 0
+                if (vm3 & x) {
+                    int L = *l, H = L >> 4;
+                    int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
 
-                if (vm3 & (x << 1)) {
-                    L  = l[1];
-                    H |= (L >> 4) << 8;
-                    E |= s->filter.mblim_lut[L] << 8;
-                    I |= s->filter.lim_lut[L] << 8;
-                    s->dsp.loop_filter_mix2[0][0][1](ptr + ls_y * 4, ls_y, E, I, H);
-                } else {
-                    s->dsp.loop_filter_8[0][1](ptr + ls_y * 4, ls_y, E, I, H);
-                }
-            } else if (vm3 & (x << 1)) {
-                int L = l[1], H = L >> 4;
-                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+                    if (vm3 & (x << (1 + ss_h))) {
+                        L = l[1 + ss_h];
+                        H |= (L >> 4) << 8;
+                        E |= s->filter_lut.mblim_lut[L] << 8;
+                        I |= s->filter_lut.lim_lut[L] << 8;
+                        s->dsp.loop_filter_mix2[0][0][1](ptr + ls * 4, ls, E, I, H);
+                    } else {
+                        s->dsp.loop_filter_8[0][1](ptr + ls * 4, ls, E, I, H);
+                    }
+                } else if (vm3 & (x << (1 + ss_h))) {
+                    int L = l[1 + ss_h], H = L >> 4;
+                    int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
 
-                s->dsp.loop_filter_8[0][1](ptr + ls_y * 4 + 8, ls_y, E, I, H);
+                    s->dsp.loop_filter_8[0][1](ptr + ls * 4 + 8 * bytesperpixel, ls, E, I, H);
+                }
             }
         }
+        if (ss_v) { // with ss_v set, lvl advances by 16 after every odd y; otherwise by 8 per y
+            if (y & 1)
+                lvl += 16;
+        } else {
+            lvl += 8;
+        }
     }
+}
+
+static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
+                          int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
+{
+    VP9Context *s = ctx->priv_data;
+    AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
+    uint8_t *dst = f->data[0] + yoff;
+    ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
+    uint8_t (*uv_masks)[8][4] = lflvl->mask[s->ss_h | s->ss_v]; // presumably index 0 or 1; confirm ss_* are flags
+    int p;
+    // Column pass then row pass on data[0] (Y), then the same pair on data[1] and data[2] at uvoff.
+    // FIXME in how far can we interleave the v/h loopfilter calls? E.g.
+    // if you think of them as acting on a 8x8 block max, we can interleave
+    // each v/h within the single x loop, but that only works if we work on
+    // 8 pixel blocks, and we won't always do that (we want at least 16px
+    // to use SSE2 optimizations, perhaps 32 for AVX2)
+
+    filter_plane_cols(s, col, 0, 0, lflvl->level, lflvl->mask[0][0], dst, ls_y);
+    filter_plane_rows(s, row, 0, 0, lflvl->level, lflvl->mask[0][1], dst, ls_y);
 
-    // same principle but for U/V planes
     for (p = 0; p < 2; p++) {
-        lvl = lflvl->level;
-        dst = s->cur_frame->data[1 + p] + uvoff;
-        for (y = 0; y < 8; y += 4, dst += 16 * ls_uv, lvl += 32) {
-            uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[1][0][y];
-            uint8_t *hmask2 = lflvl->mask[1][0][y + 2];
-            unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2];
-            unsigned hm2 = hmask2[1] | hmask2[2], hm = hm1 | hm2;
-
-            for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 4) {
-                if (col || x > 1) {
-                    if (hm1 & x) {
-                        int L = *l, H = L >> 4;
-                        int E = s->filter.mblim_lut[L];
-                        int I = s->filter.lim_lut[L];
-
-                        if (hmask1[0] & x) {
-                            if (hmask2[0] & x) {
-                                av_assert2(l[16] == L);
-                                s->dsp.loop_filter_16[0](ptr, ls_uv, E, I, H);
-                            } else {
-                                s->dsp.loop_filter_8[2][0](ptr, ls_uv, E, I, H);
-                            }
-                        } else if (hm2 & x) {
-                            L  = l[16];
-                            H |= (L >> 4) << 8;
-                            E |= s->filter.mblim_lut[L] << 8;
-                            I |= s->filter.lim_lut[L] << 8;
-                            s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
-                                                   [!!(hmask2[1] & x)]
-                                                   [0](ptr, ls_uv, E, I, H);
-                        } else {
-                            s->dsp.loop_filter_8[!!(hmask1[1] & x)]
-                                                [0](ptr, ls_uv, E, I, H);
-                        }
-                    } else if (hm2 & x) {
-                        int L = l[16], H = L >> 4;
-                        int E = s->filter.mblim_lut[L];
-                        int I = s->filter.lim_lut[L];
+        dst = f->data[1 + p] + uvoff;
+        filter_plane_cols(s, col, s->ss_h, s->ss_v, lflvl->level, uv_masks[0], dst, ls_uv);
+        filter_plane_rows(s, row, s->ss_h, s->ss_v, lflvl->level, uv_masks[1], dst, ls_uv);
+    }
+}
 
-                        s->dsp.loop_filter_8[!!(hmask2[1] & x)]
-                                            [0](ptr + 8 * ls_uv, ls_uv, E, I, H);
+static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
+{
+    // tile idx spans [idx * n >> log2_n, (idx + 1) * n >> log2_n), each bound clamped to n;
+    // the << 3 scales both bounds by 8 (presumably superblocks to 8x8-block units)
+    *start = FFMIN(( idx      * n) >> log2_n, n) << 3;
+    *end   = FFMIN(((idx + 1) * n) >> log2_n, n) << 3;
+}
+
+static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
+                                        int max_count, int update_factor)
+{
+    unsigned ct = ct0 + ct1, p2, p1;
+    // Adapt probability *p towards the ratio observed in the branch counts ct0/ct1.
+    if (!ct) // nothing was counted: leave *p untouched
+        return;
+    // p2 = round(256 * ct0 / ct); widened to 64 bit so ct0 << 8 cannot wrap once ct0 >= 1 << 24
+    p1 = *p;
+    p2 = (((int64_t) ct0 << 8) + (ct >> 1)) / ct;
+    p2 = av_clip(p2, 1, 255);
+    ct = FFMIN(ct, max_count);
+    update_factor = FASTDIV(update_factor * ct, max_count); // weight scaled by the clamped count
+    // move *p from p1 towards p2 by update_factor / 256:
+    // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
+    *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
+}
+
+static void adapt_probs(VP9Context *s)
+{
+    int i, j, k, l, m;
+    prob_context *p = &s->prob_ctx[s->s.h.framectxid].p;
+    int uf = (s->s.h.keyframe || s->s.h.intraonly || !s->last_keyframe) ? 112 : 128;
+    // Adapt the probabilities stored in prob_ctx[framectxid] from the counters in s->counts.
+    // coefficients
+    for (i = 0; i < 4; i++)
+        for (j = 0; j < 2; j++)
+            for (k = 0; k < 2; k++)
+                for (l = 0; l < 6; l++)
+                    for (m = 0; m < 6; m++) {
+                        uint8_t *pp = s->prob_ctx[s->s.h.framectxid].coef[i][j][k][l][m];
+                        unsigned *e = s->counts.eob[i][j][k][l][m];
+                        unsigned *c = s->counts.coef[i][j][k][l][m];
+
+                        if (l == 0 && m >= 3) // dc only has 3 pt
+                            break;
+
+                        adapt_prob(&pp[0], e[0], e[1], 24, uf);
+                        adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
+                        adapt_prob(&pp[2], c[1], c[2], 24, uf);
                     }
-                }
-                if (x & 0xAA)
-                    l += 2;
-            }
+
+    if (s->s.h.keyframe || s->s.h.intraonly) { // copy skip/tx probs from s->prob.p, then stop
+        memcpy(p->skip,  s->prob.p.skip,  sizeof(p->skip));
+        memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
+        memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
+        memcpy(p->tx8p,  s->prob.p.tx8p,  sizeof(p->tx8p));
+        return;
+    }
+
+    // skip flag
+    for (i = 0; i < 3; i++)
+        adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
+
+    // intra/inter flag
+    for (i = 0; i < 4; i++)
+        adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
+
+    // comppred flag
+    if (s->s.h.comppredmode == PRED_SWITCHABLE) {
+      for (i = 0; i < 5; i++)
+          adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
+    }
+
+    // reference frames
+    if (s->s.h.comppredmode != PRED_SINGLEREF) {
+      for (i = 0; i < 5; i++)
+          adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
+                     s->counts.comp_ref[i][1], 20, 128);
+    }
+
+    if (s->s.h.comppredmode != PRED_COMPREF) {
+      for (i = 0; i < 5; i++) {
+          uint8_t *pp = p->single_ref[i];
+          unsigned (*c)[2] = s->counts.single_ref[i];
+
+          adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
+          adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
+      }
+    }
+
+    // block partitioning
+    for (i = 0; i < 4; i++)
+        for (j = 0; j < 4; j++) {
+            uint8_t *pp = p->partition[i][j];
+            unsigned *c = s->counts.partition[i][j];
+
+            adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
+            adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
+            adapt_prob(&pp[2], c[2], c[3], 20, 128);
         }
-        lvl = lflvl->level;
-        dst = s->cur_frame->data[1 + p] + uvoff;
-        for (y = 0; y < 8; y++, dst += 4 * ls_uv) {
-            uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[1][1][y];
-            unsigned vm = vmask[0] | vmask[1] | vmask[2];
 
-            for (x = 1; vm & ~(x - 1); x <<= 4, ptr += 16, l += 4) {
-                if (row || y) {
-                    if (vm & x) {
-                        int L = *l, H = L >> 4;
-                        int E = s->filter.mblim_lut[L];
-                        int I = s->filter.lim_lut[L];
+    // tx size
+    if (s->s.h.txfmmode == TX_SWITCHABLE) {
+      for (i = 0; i < 2; i++) {
+          unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
 
-                        if (vmask[0] & x) {
-                            if (vmask[0] & (x << 2)) {
-                                av_assert2(l[2] == L);
-                                s->dsp.loop_filter_16[1](ptr, ls_uv, E, I, H);
-                            } else {
-                                s->dsp.loop_filter_8[2][1](ptr, ls_uv, E, I, H);
-                            }
-                        } else if (vm & (x << 2)) {
-                            L  = l[2];
-                            H |= (L >> 4) << 8;
-                            E |= s->filter.mblim_lut[L] << 8;
-                            I |= s->filter.lim_lut[L] << 8;
-                            s->dsp.loop_filter_mix2[!!(vmask[1] &  x)]
-                                                   [!!(vmask[1] & (x << 2))]
-                                                   [1](ptr, ls_uv, E, I, H);
-                        } else {
-                            s->dsp.loop_filter_8[!!(vmask[1] & x)]
-                                                [1](ptr, ls_uv, E, I, H);
-                        }
-                    } else if (vm & (x << 2)) {
-                        int L = l[2], H = L >> 4;
-                        int E = s->filter.mblim_lut[L];
-                        int I = s->filter.lim_lut[L];
+          adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
+          adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
+          adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
+          adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
+          adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
+          adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
+      }
+    }
 
-                        s->dsp.loop_filter_8[!!(vmask[1] & (x << 2))]
-                                            [1](ptr + 8, ls_uv, E, I, H);
-                    }
-                }
-            }
-            if (y & 1)
-                lvl += 16;
+    // interpolation filter
+    if (s->s.h.filtermode == FILTER_SWITCHABLE) {
+        for (i = 0; i < 4; i++) {
+            uint8_t *pp = p->filter[i];
+            unsigned *c = s->counts.filter[i];
+
+            adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
+            adapt_prob(&pp[1], c[1], c[2], 20, 128);
         }
     }
+
+    // inter modes
+    for (i = 0; i < 7; i++) {
+        uint8_t *pp = p->mv_mode[i];
+        unsigned *c = s->counts.mv_mode[i];
+
+        adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
+        adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
+        adapt_prob(&pp[2], c[1], c[3], 20, 128);
+    }
+
+    // mv joints
+    {
+        uint8_t *pp = p->mv_joint;
+        unsigned *c = s->counts.mv_joint;
+
+        adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
+        adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
+        adapt_prob(&pp[2], c[2], c[3], 20, 128);
+    }
+
+    // mv components
+    for (i = 0; i < 2; i++) {
+        uint8_t *pp;
+        unsigned *c, (*c2)[2], sum;
+
+        adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
+                   s->counts.mv_comp[i].sign[1], 20, 128);
+
+        pp = p->mv_comp[i].classes;
+        c = s->counts.mv_comp[i].classes;
+        sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
+        adapt_prob(&pp[0], c[0], sum, 20, 128);
+        sum -= c[1];
+        adapt_prob(&pp[1], c[1], sum, 20, 128);
+        sum -= c[2] + c[3];
+        adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
+        adapt_prob(&pp[3], c[2], c[3], 20, 128);
+        sum -= c[4] + c[5];
+        adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
+        adapt_prob(&pp[5], c[4], c[5], 20, 128);
+        sum -= c[6];
+        adapt_prob(&pp[6], c[6], sum, 20, 128);
+        adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
+        adapt_prob(&pp[8], c[7], c[8], 20, 128);
+        adapt_prob(&pp[9], c[9], c[10], 20, 128);
+
+        adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
+                   s->counts.mv_comp[i].class0[1], 20, 128);
+        pp = p->mv_comp[i].bits;
+        c2 = s->counts.mv_comp[i].bits;
+        for (j = 0; j < 10; j++)
+            adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
+
+        for (j = 0; j < 2; j++) {
+            pp = p->mv_comp[i].class0_fp[j];
+            c = s->counts.mv_comp[i].class0_fp[j];
+            adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
+            adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
+            adapt_prob(&pp[2], c[2], c[3], 20, 128);
+        }
+        pp = p->mv_comp[i].fp;
+        c = s->counts.mv_comp[i].fp;
+        adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
+        adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
+        adapt_prob(&pp[2], c[2], c[3], 20, 128);
+
+        if (s->s.h.highprecisionmvs) {
+            adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
+                       s->counts.mv_comp[i].class0_hp[1], 20, 128);
+            adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
+                       s->counts.mv_comp[i].hp[1], 20, 128);
+        }
+    }
+
+    // y intra modes
+    for (i = 0; i < 4; i++) {
+        uint8_t *pp = p->y_mode[i];
+        unsigned *c = s->counts.y_mode[i], sum, s2;
+
+        sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9]; // all but c[2] (presumably DC_PRED; confirm)
+        adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
+        sum -= c[TM_VP8_PRED];
+        adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
+        sum -= c[VERT_PRED];
+        adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
+        s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
+        sum -= s2;
+        adapt_prob(&pp[3], s2, sum, 20, 128);
+        s2 -= c[HOR_PRED];
+        adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
+        adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
+        sum -= c[DIAG_DOWN_LEFT_PRED];
+        adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
+        sum -= c[VERT_LEFT_PRED];
+        adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
+        adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
+    }
+
+    // uv intra modes
+    for (i = 0; i < 10; i++) {
+        uint8_t *pp = p->uv_mode[i];
+        unsigned *c = s->counts.uv_mode[i], sum, s2;
+
+        sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9]; // all but c[2] (presumably DC_PRED; confirm)
+        adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
+        sum -= c[TM_VP8_PRED];
+        adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
+        sum -= c[VERT_PRED];
+        adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
+        s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
+        sum -= s2;
+        adapt_prob(&pp[3], s2, sum, 20, 128);
+        s2 -= c[HOR_PRED];
+        adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
+        adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
+        sum -= c[DIAG_DOWN_LEFT_PRED];
+        adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
+        sum -= c[VERT_LEFT_PRED];
+        adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
+        adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
+    }
 }
 
-static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
+static void free_buffers(VP9Context *s) // releases intra_pred_data[0], b_base and block_base
 {
-    int sb_start =  (idx      * n) >> log2_n;
-    int sb_end   = ((idx + 1) * n) >> log2_n;
-    *start = FFMIN(sb_start, n) << 3;
-    *end   = FFMIN(sb_end,   n) << 3;
+    av_freep(&s->intra_pred_data[0]); // av_freep() frees the buffer and resets the pointer to NULL
+    av_freep(&s->b_base);
+    av_freep(&s->block_base);
 }
 
-static int vp9_decode_frame(AVCodecContext *avctx, AVFrame *frame,
-                            int *got_frame, const uint8_t *data, int size)
+static av_cold int vp9_decode_free(AVCodecContext *ctx) // releases frames, reference slots and buffers
 {
-    VP9Context *s = avctx->priv_data;
-    int ret, tile_row, tile_col, i, ref = -1, row, col;
-    ptrdiff_t yoff = 0, uvoff = 0;
+    VP9Context *s = ctx->priv_data;
+    int i;
 
-    ret = decode_frame_header(avctx, data, size, &ref);
-    if (ret < 0) {
-        return ret;
-    } else if (!ret) {
-        if (!s->refs[ref]->buf[0]) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Requested reference %d not available\n", ref);
+    for (i = 0; i < 3; i++) { // NOTE(review): tf.f is dereferenced without a NULL check; confirm it is always allocated
+        if (s->s.frames[i].tf.f->buf[0])
+            vp9_unref_frame(ctx, &s->s.frames[i]);
+        av_frame_free(&s->s.frames[i].tf.f);
+    }
+    for (i = 0; i < 8; i++) { // release and free both s.refs[i] and next_refs[i]
+        if (s->s.refs[i].f->buf[0])
+            ff_thread_release_buffer(ctx, &s->s.refs[i]);
+        av_frame_free(&s->s.refs[i].f);
+        if (s->next_refs[i].f->buf[0])
+            ff_thread_release_buffer(ctx, &s->next_refs[i]);
+        av_frame_free(&s->next_refs[i].f);
+    }
+    free_buffers(s);
+    av_freep(&s->c_b);
+    s->c_b_size = 0;
+    // always reports success
+    return 0;
+}
+
+
+static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
+                            int *got_frame, AVPacket *pkt)
+{
+    const uint8_t *data = pkt->data;
+    int size = pkt->size;
+    VP9Context *s = ctx->priv_data;
+    int res, tile_row, tile_col, i, ref, row, col;
+    int retain_segmap_ref = s->s.frames[REF_FRAME_SEGMAP].segmentation_map &&
+                            (!s->s.h.segmentation.enabled || !s->s.h.segmentation.update_map);
+    ptrdiff_t yoff, uvoff, ls_y, ls_uv;
+    AVFrame *f;
+    int bytesperpixel;
+
+    if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
+        return res;
+    } else if (res == 0) {
+        if (!s->s.refs[ref].f->buf[0]) {
+            av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
             return AVERROR_INVALIDDATA;
         }
-
-        ret = av_frame_ref(frame, s->refs[ref]);
-        if (ret < 0)
-            return ret;
+        if ((res = av_frame_ref(frame, s->s.refs[ref].f)) < 0)
+            return res;
+        ((AVFrame *)frame)->pkt_pts = pkt->pts;
+        ((AVFrame *)frame)->pkt_dts = pkt->dts;
+        for (i = 0; i < 8; i++) {
+            if (s->next_refs[i].f->buf[0])
+                ff_thread_release_buffer(ctx, &s->next_refs[i]);
+            if (s->s.refs[i].f->buf[0] &&
+                (res = ff_thread_ref_frame(&s->next_refs[i], &s->s.refs[i])) < 0)
+                return res;
+        }
         *got_frame = 1;
-        return 0;
+        return pkt->size;
     }
-    data += ret;
-    size -= ret;
+    data += res;
+    size -= res;
 
-    s->cur_frame = frame;
+    if (!retain_segmap_ref || s->s.h.keyframe || s->s.h.intraonly) {
+        if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0])
+            vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP]);
+        if (!s->s.h.keyframe && !s->s.h.intraonly && !s->s.h.errorres && s->s.frames[CUR_FRAME].tf.f->buf[0] &&
+            (res = vp9_ref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP], &s->s.frames[CUR_FRAME])) < 0)
+            return res;
+    }
+    if (s->s.frames[REF_FRAME_MVPAIR].tf.f->buf[0])
+        vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_MVPAIR]);
+    if (!s->s.h.intraonly && !s->s.h.keyframe && !s->s.h.errorres && s->s.frames[CUR_FRAME].tf.f->buf[0] &&
+        (res = vp9_ref_frame(ctx, &s->s.frames[REF_FRAME_MVPAIR], &s->s.frames[CUR_FRAME])) < 0)
+        return res;
+    if (s->s.frames[CUR_FRAME].tf.f->buf[0])
+        vp9_unref_frame(ctx, &s->s.frames[CUR_FRAME]);
+    if ((res = vp9_alloc_frame(ctx, &s->s.frames[CUR_FRAME])) < 0)
+        return res;
+    f = s->s.frames[CUR_FRAME].tf.f;
+    f->key_frame = s->s.h.keyframe;
+    f->pict_type = (s->s.h.keyframe || s->s.h.intraonly) ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
+    ls_y = f->linesize[0];
+    ls_uv =f->linesize[1];
 
-    av_frame_unref(s->cur_frame);
-    if ((ret = ff_get_buffer(avctx, s->cur_frame,
-                             s->refreshrefmask ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
-        return ret;
-    s->cur_frame->key_frame = s->keyframe;
-    s->cur_frame->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
-                                          : AV_PICTURE_TYPE_P;
+    if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0] &&
+        (s->s.frames[REF_FRAME_MVPAIR].tf.f->width  != s->s.frames[CUR_FRAME].tf.f->width ||
+         s->s.frames[REF_FRAME_MVPAIR].tf.f->height != s->s.frames[CUR_FRAME].tf.f->height)) {
+        vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP]);
+    }
 
-    if (s->fullrange)
-        avctx->color_range = AVCOL_RANGE_JPEG;
-    else
-        avctx->color_range = AVCOL_RANGE_MPEG;
+    // ref frame setup
+    for (i = 0; i < 8; i++) {
+        if (s->next_refs[i].f->buf[0])
+            ff_thread_release_buffer(ctx, &s->next_refs[i]);
+        if (s->s.h.refreshrefmask & (1 << i)) {
+            res = ff_thread_ref_frame(&s->next_refs[i], &s->s.frames[CUR_FRAME].tf);
+        } else if (s->s.refs[i].f->buf[0]) {
+            res = ff_thread_ref_frame(&s->next_refs[i], &s->s.refs[i]);
+        }
+        if (res < 0)
+            return res;
+    }
 
-    switch (s->colorspace) {
-    case 1: avctx->colorspace = AVCOL_SPC_BT470BG; break;
-    case 2: avctx->colorspace = AVCOL_SPC_BT709; break;
-    case 3: avctx->colorspace = AVCOL_SPC_SMPTE170M; break;
-    case 4: avctx->colorspace = AVCOL_SPC_SMPTE240M; break;
+    if (ctx->hwaccel) {
+        res = ctx->hwaccel->start_frame(ctx, NULL, 0);
+        if (res < 0)
+            return res;
+        res = ctx->hwaccel->decode_slice(ctx, pkt->data, pkt->size);
+        if (res < 0)
+            return res;
+        res = ctx->hwaccel->end_frame(ctx);
+        if (res < 0)
+            return res;
+        goto finish;
     }
 
     // main tile decode loop
+    bytesperpixel = s->bytesperpixel;
     memset(s->above_partition_ctx, 0, s->cols);
     memset(s->above_skip_ctx, 0, s->cols);
-    if (s->keyframe || s->intraonly)
+    if (s->s.h.keyframe || s->s.h.intraonly) {
         memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
-    else
+    } else {
         memset(s->above_mode_ctx, NEARESTMV, s->cols);
+    }
     memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
-    memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 8);
-    memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 8);
+    memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 16 >> s->ss_h);
+    memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 16 >> s->ss_h);
     memset(s->above_segpred_ctx, 0, s->cols);
-    for (tile_row = 0; tile_row < s->tiling.tile_rows; tile_row++) {
-        set_tile_offset(&s->tiling.tile_row_start, &s->tiling.tile_row_end,
-                        tile_row, s->tiling.log2_tile_rows, s->sb_rows);
-        for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
-            int64_t tile_size;
-
-            if (tile_col == s->tiling.tile_cols - 1 &&
-                tile_row == s->tiling.tile_rows - 1) {
-                tile_size = size;
-            } else {
-                tile_size = AV_RB32(data);
-                data     += 4;
-                size     -= 4;
-            }
-            if (tile_size > size)
-                return AVERROR_INVALIDDATA;
-            ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
-            if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) // marker bit
-                return AVERROR_INVALIDDATA;
-            data += tile_size;
-            size -= tile_size;
-        }
-
-        for (row = s->tiling.tile_row_start;
-             row < s->tiling.tile_row_end;
-             row += 8, yoff += s->cur_frame->linesize[0] * 64,
-             uvoff += s->cur_frame->linesize[1] * 32) {
-            VP9Filter *lflvl = s->lflvl;
-            ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
-
-            for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
-                set_tile_offset(&s->tiling.tile_col_start,
-                                &s->tiling.tile_col_end,
-                                tile_col, s->tiling.log2_tile_cols, s->sb_cols);
-
-                memset(s->left_partition_ctx, 0, 8);
-                memset(s->left_skip_ctx, 0, 8);
-                if (s->keyframe || s->intraonly)
-                    memset(s->left_mode_ctx, DC_PRED, 16);
-                else
-                    memset(s->left_mode_ctx, NEARESTMV, 8);
-                memset(s->left_y_nnz_ctx, 0, 16);
-                memset(s->left_uv_nnz_ctx, 0, 16);
-                memset(s->left_segpred_ctx, 0, 8);
-
-                memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
-                for (col = s->tiling.tile_col_start;
-                     col < s->tiling.tile_col_end;
-                     col += 8, yoff2 += 64, uvoff2 += 32, lflvl++) {
-                    // FIXME integrate with lf code (i.e. zero after each
-                    // use, similar to invtxfm coefficients, or similar)
-                    memset(lflvl->mask, 0, sizeof(lflvl->mask));
-
-                    if ((ret = decode_subblock(avctx, row, col, lflvl,
-                                               yoff2, uvoff2, BL_64X64)) < 0)
-                        return ret;
-                }
-                memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
-            }
-
-            // backup pre-loopfilter reconstruction data for intra
-            // prediction of next row of sb64s
-            if (row + 8 < s->rows) {
-                memcpy(s->intra_pred_data[0],
-                       s->cur_frame->data[0] + yoff +
-                       63 * s->cur_frame->linesize[0],
-                       8 * s->cols);
-                memcpy(s->intra_pred_data[1],
-                       s->cur_frame->data[1] + uvoff +
-                       31 * s->cur_frame->linesize[1],
-                       4 * s->cols);
-                memcpy(s->intra_pred_data[2],
-                       s->cur_frame->data[2] + uvoff +
-                       31 * s->cur_frame->linesize[2],
-                       4 * s->cols);
-            }
-
-            // loopfilter one row
-            if (s->filter.level) {
-                yoff2  = yoff;
-                uvoff2 = uvoff;
-                lflvl  = s->lflvl;
-                for (col = 0; col < s->cols;
-                     col += 8, yoff2 += 64, uvoff2 += 32, lflvl++)
-                    loopfilter_subblock(avctx, lflvl, row, col, yoff2, uvoff2);
-            }
-        }
-    }
-
-    // bw adaptivity (or in case of parallel decoding mode, fw adaptivity
-    // probability maintenance between frames)
-    if (s->refreshctx) {
-        if (s->parallelmode) {
-            int j, k, l, m;
-            for (i = 0; i < 4; i++) {
-                for (j = 0; j < 2; j++)
-                    for (k = 0; k < 2; k++)
-                        for (l = 0; l < 6; l++)
-                            for (m = 0; m < 6; m++)
-                                memcpy(s->prob_ctx[s->framectxid].coef[i][j][k][l][m],
-                                       s->prob.coef[i][j][k][l][m], 3);
-                if (s->txfmmode == i)
-                    break;
-            }
-            s->prob_ctx[s->framectxid].p = s->prob.p;
-        } else {
-            ff_vp9_adapt_probs(s);
-        }
+    s->pass = s->s.frames[CUR_FRAME].uses_2pass =
+        ctx->active_thread_type == FF_THREAD_FRAME && s->s.h.refreshctx && !s->s.h.parallelmode;
+    if ((res = update_block_buffers(ctx)) < 0) {
+        av_log(ctx, AV_LOG_ERROR,
+               "Failed to allocate block buffers\n");
+        return res;
     }
-    FFSWAP(VP9MVRefPair *, s->mv[0], s->mv[1]);
+    if (s->s.h.refreshctx && s->s.h.parallelmode) {
+        int j, k, l, m;
 
-    // ref frame setup
-    for (i = 0; i < 8; i++)
-        if (s->refreshrefmask & (1 << i)) {
-            av_frame_unref(s->refs[i]);
-            ret = av_frame_ref(s->refs[i], s->cur_frame);
-            if (ret < 0)
-                return ret;
+        for (i = 0; i < 4; i++) {
+            for (j = 0; j < 2; j++)
+                for (k = 0; k < 2; k++)
+                    for (l = 0; l < 6; l++)
+                        for (m = 0; m < 6; m++)
+                            memcpy(s->prob_ctx[s->s.h.framectxid].coef[i][j][k][l][m],
+                                   s->prob.coef[i][j][k][l][m], 3);
+            if (s->s.h.txfmmode == i)
+                break;
         }
+        s->prob_ctx[s->s.h.framectxid].p = s->prob.p;
+        ff_thread_finish_setup(ctx);
+    } else if (!s->s.h.refreshctx) {
+        ff_thread_finish_setup(ctx);
+    }
 
-    if (s->invisible)
-        av_frame_unref(s->cur_frame);
-    else
-        *got_frame = 1;
+    do {
+        yoff = uvoff = 0;
+        s->b = s->b_base;
+        s->block = s->block_base;
+        s->uvblock[0] = s->uvblock_base[0];
+        s->uvblock[1] = s->uvblock_base[1];
+        s->eob = s->eob_base;
+        s->uveob[0] = s->uveob_base[0];
+        s->uveob[1] = s->uveob_base[1];
 
-    return 0;
-}
+        for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
+            set_tile_offset(&s->tile_row_start, &s->tile_row_end,
+                            tile_row, s->s.h.tiling.log2_tile_rows, s->sb_rows);
+            if (s->pass != 2) {
+                for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
+                    int64_t tile_size;
 
-static int vp9_decode_packet(AVCodecContext *avctx, void *frame,
-                             int *got_frame, AVPacket *avpkt)
-{
-    const uint8_t *data = avpkt->data;
-    int size            = avpkt->size;
-    int marker, ret;
-
-    /* Read superframe index - this is a collection of individual frames
-     * that together lead to one visible frame */
-    marker = data[size - 1];
-    if ((marker & 0xe0) == 0xc0) {
-        int nbytes   = 1 + ((marker >> 3) & 0x3);
-        int n_frames = 1 + (marker & 0x7);
-        int idx_sz   = 2 + n_frames * nbytes;
-
-        if (size >= idx_sz && data[size - idx_sz] == marker) {
-            const uint8_t *idx = data + size + 1 - idx_sz;
-
-            while (n_frames--) {
-                unsigned sz = AV_RL32(idx);
-
-                if (nbytes < 4)
-                    sz &= (1 << (8 * nbytes)) - 1;
-                idx += nbytes;
-
-                if (sz > size) {
-                    av_log(avctx, AV_LOG_ERROR,
-                           "Superframe packet size too big: %u > %d\n",
-                           sz, size);
-                    return AVERROR_INVALIDDATA;
+                    if (tile_col == s->s.h.tiling.tile_cols - 1 &&
+                        tile_row == s->s.h.tiling.tile_rows - 1) {
+                        tile_size = size;
+                    } else {
+                        tile_size = AV_RB32(data);
+                        data += 4;
+                        size -= 4;
+                    }
+                    if (tile_size > size) {
+                        ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
+                        return AVERROR_INVALIDDATA;
+                    }
+                    ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
+                    if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
+                        ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
+                        return AVERROR_INVALIDDATA;
+                    }
+                    data += tile_size;
+                    size -= tile_size;
                 }
+            }
+
+            for (row = s->tile_row_start; row < s->tile_row_end;
+                 row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
+                struct VP9Filter *lflvl_ptr = s->lflvl;
+                ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
+
+                for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
+                    set_tile_offset(&s->tile_col_start, &s->tile_col_end,
+                                    tile_col, s->s.h.tiling.log2_tile_cols, s->sb_cols);
+
+                    if (s->pass != 2) {
+                        memset(s->left_partition_ctx, 0, 8);
+                        memset(s->left_skip_ctx, 0, 8);
+                        if (s->s.h.keyframe || s->s.h.intraonly) {
+                            memset(s->left_mode_ctx, DC_PRED, 16);
+                        } else {
+                            memset(s->left_mode_ctx, NEARESTMV, 8);
+                        }
+                        memset(s->left_y_nnz_ctx, 0, 16);
+                        memset(s->left_uv_nnz_ctx, 0, 32);
+                        memset(s->left_segpred_ctx, 0, 8);
+
+                        memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
+                    }
+
+                    for (col = s->tile_col_start;
+                         col < s->tile_col_end;
+                         col += 8, yoff2 += 64 * bytesperpixel,
+                         uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
+                        // FIXME integrate with lf code (i.e. zero after each
+                        // use, similar to invtxfm coefficients, or similar)
+                        if (s->pass != 1) {
+                            memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
+                        }
 
-                ret = vp9_decode_frame(avctx, frame, got_frame, data, sz);
-                if (ret < 0)
-                    return ret;
-                data += sz;
-                size -= sz;
+                        if (s->pass == 2) {
+                            decode_sb_mem(ctx, row, col, lflvl_ptr,
+                                          yoff2, uvoff2, BL_64X64);
+                        } else {
+                            decode_sb(ctx, row, col, lflvl_ptr,
+                                      yoff2, uvoff2, BL_64X64);
+                        }
+                    }
+                    if (s->pass != 2) {
+                        memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
+                    }
+                }
+
+                if (s->pass == 1) {
+                    continue;
+                }
+
+                // backup pre-loopfilter reconstruction data for intra
+                // prediction of next row of sb64s
+                if (row + 8 < s->rows) {
+                    memcpy(s->intra_pred_data[0],
+                           f->data[0] + yoff + 63 * ls_y,
+                           8 * s->cols * bytesperpixel);
+                    memcpy(s->intra_pred_data[1],
+                           f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
+                           8 * s->cols * bytesperpixel >> s->ss_h);
+                    memcpy(s->intra_pred_data[2],
+                           f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
+                           8 * s->cols * bytesperpixel >> s->ss_h);
+                }
+
+                // loopfilter one row
+                if (s->s.h.filter.level) {
+                    yoff2 = yoff;
+                    uvoff2 = uvoff;
+                    lflvl_ptr = s->lflvl;
+                    for (col = 0; col < s->cols;
+                         col += 8, yoff2 += 64 * bytesperpixel,
+                         uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
+                        loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
+                    }
+                }
+
+                // FIXME maybe we can make this more finegrained by running the
+                // loopfilter per-block instead of after each sbrow
+                // In fact that would also make intra pred left preparation easier?
+                ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, row >> 3, 0);
             }
-            return size;
         }
+
+        if (s->pass < 2 && s->s.h.refreshctx && !s->s.h.parallelmode) {
+            adapt_probs(s);
+            ff_thread_finish_setup(ctx);
+        }
+    } while (s->pass++ == 1);
+    ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
+
+finish:
+    // ref frame setup
+    for (i = 0; i < 8; i++) {
+        if (s->s.refs[i].f->buf[0])
+            ff_thread_release_buffer(ctx, &s->s.refs[i]);
+        if (s->next_refs[i].f->buf[0] &&
+            (res = ff_thread_ref_frame(&s->s.refs[i], &s->next_refs[i])) < 0)
+            return res;
     }
 
-    /* If we get here, there was no valid superframe index, i.e. this is just
-     * one whole single frame. Decode it as such from the complete input buf. */
-    if ((ret = vp9_decode_frame(avctx, frame, got_frame, data, size)) < 0)
-        return ret;
-    return size;
+    if (!s->s.h.invisible) {
+        if ((res = av_frame_ref(frame, s->s.frames[CUR_FRAME].tf.f)) < 0)
+            return res;
+        *got_frame = 1;
+    }
+
+    return pkt->size;
 }
 
-static av_cold int vp9_decode_free(AVCodecContext *avctx)
+static void vp9_decode_flush(AVCodecContext *ctx) // .flush callback of ff_vp9_decoder: releases the pictures held in s.frames[] and s.refs[]
 {
-    VP9Context *s = avctx->priv_data;
+    VP9Context *s = ctx->priv_data;
     int i;
 
-    for (i = 0; i < FF_ARRAY_ELEMS(s->refs); i++)
-        av_frame_free(&s->refs[i]);
+    for (i = 0; i < 3; i++) // 3 == entries of s.frames[]: CUR_FRAME, REF_FRAME_MVPAIR, REF_FRAME_SEGMAP
+        vp9_unref_frame(ctx, &s->s.frames[i]);
+    for (i = 0; i < 8; i++) // 8 == entries of s.refs[]
+        ff_thread_release_buffer(ctx, &s->s.refs[i]); // NOTE(review): s->next_refs[] is not released here - confirm that is intended
+}
 
-    av_freep(&s->c_b);
-    av_freep(&s->above_partition_ctx);
+static int init_frames(AVCodecContext *ctx) // allocates the AVFrame holders for s.frames[], s.refs[] and next_refs[]; returns 0 or AVERROR(ENOMEM)
+{
+    VP9Context *s = ctx->priv_data;
+    int i;
+
+    for (i = 0; i < 3; i++) { // the three VP9Frame slots (CUR_FRAME, REF_FRAME_MVPAIR, REF_FRAME_SEGMAP)
+        s->s.frames[i].tf.f = av_frame_alloc();
+        if (!s->s.frames[i].tf.f) {
+            vp9_decode_free(ctx); // NOTE(review): defined outside this excerpt; presumably frees what was allocated so far - confirm it tolerates NULL frames
+            av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
+            return AVERROR(ENOMEM);
+        }
+    }
+    for (i = 0; i < 8; i++) { // the eight reference slots, plus their next_refs[] counterparts
+        s->s.refs[i].f = av_frame_alloc();
+        s->next_refs[i].f = av_frame_alloc();
+        if (!s->s.refs[i].f || !s->next_refs[i].f) { // either allocation failing aborts the whole init
+            vp9_decode_free(ctx);
+            av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
+            return AVERROR(ENOMEM);
+        }
+    }
 
     return 0;
 }
 
-static av_cold int vp9_decode_init(AVCodecContext *avctx)
+static av_cold int vp9_decode_init(AVCodecContext *ctx) // .init callback of ff_vp9_decoder; returns init_frames()'s result
 {
-    VP9Context *s = avctx->priv_data;
-    int i;
+    VP9Context *s = ctx->priv_data;
 
-    avctx->pix_fmt = AV_PIX_FMT_YUV420P;
+    ctx->internal->allocate_progress = 1; // NOTE(review): presumably enables the per-frame progress state that ff_thread_report_progress() updates - confirm
+    s->last_bpp = 0; // NOTE(review): 0 presumably means "no bit depth seen yet" - confirm where last_bpp is compared
+    s->s.h.filter.sharpness = -1; // NOTE(review): -1 looks like a "not set yet" sentinel - confirm against the header parser
 
-    ff_vp9dsp_init(&s->dsp);
-    ff_videodsp_init(&s->vdsp, 8);
+    return init_frames(ctx); // pix_fmt and DSP setup are no longer done here (see the removed lines)
+}
 
-    for (i = 0; i < FF_ARRAY_ELEMS(s->refs); i++) {
-        s->refs[i] = av_frame_alloc();
-        if (!s->refs[i]) {
-            vp9_decode_free(avctx);
-            return AVERROR(ENOMEM);
+#if HAVE_THREADS
+static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx) // .init_thread_copy callback: only allocates the frame holders
+{
+    return init_frames(avctx); // unlike vp9_decode_init(), sets no allocate_progress/last_bpp/sharpness defaults
+}
+
+static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src) // .update_thread_context callback: copies cross-frame state from src's VP9Context into dst's; returns 0 or a negative error
+{
+    int i, res;
+    VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
+
+    for (i = 0; i < 3; i++) { // mirror src's three VP9Frame slots
+        if (s->s.frames[i].tf.f->buf[0]) // drop whatever dst still holds in this slot
+            vp9_unref_frame(dst, &s->s.frames[i]);
+        if (ssrc->s.frames[i].tf.f->buf[0]) { // slots that are empty in src stay empty in dst
+            if ((res = vp9_ref_frame(dst, &s->s.frames[i], &ssrc->s.frames[i])) < 0)
+                return res;
+        }
+    }
+    for (i = 0; i < 8; i++) { // rebuild dst's reference slots
+        if (s->s.refs[i].f->buf[0])
+            ff_thread_release_buffer(dst, &s->s.refs[i]);
+        if (ssrc->next_refs[i].f->buf[0]) { // source is next_refs[] (the slots as they will be after src's frame), not src's s.refs[]
+            if ((res = ff_thread_ref_frame(&s->s.refs[i], &ssrc->next_refs[i])) < 0)
+                return res;
         }
     }
 
-    s->filter.sharpness = -1;
+    s->s.h.invisible = ssrc->s.h.invisible; // from here on: plain field-by-field copies of header and format state
+    s->s.h.keyframe = ssrc->s.h.keyframe;
+    s->s.h.intraonly = ssrc->s.h.intraonly;
+    s->ss_v = ssrc->ss_v;
+    s->ss_h = ssrc->ss_h;
+    s->s.h.segmentation.enabled = ssrc->s.h.segmentation.enabled;
+    s->s.h.segmentation.update_map = ssrc->s.h.segmentation.update_map;
+    s->s.h.segmentation.absolute_vals = ssrc->s.h.segmentation.absolute_vals;
+    s->bytesperpixel = ssrc->bytesperpixel;
+    s->gf_fmt = ssrc->gf_fmt;
+    s->w = ssrc->w;
+    s->h = ssrc->h;
+    s->bpp = ssrc->bpp;
+    s->bpp_index = ssrc->bpp_index;
+    s->pix_fmt = ssrc->pix_fmt;
+    memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx)); // the saved probability contexts (indexed by framectxid in vp9_decode_frame)
+    memcpy(&s->s.h.lf_delta, &ssrc->s.h.lf_delta, sizeof(s->s.h.lf_delta));
+    memcpy(&s->s.h.segmentation.feat, &ssrc->s.h.segmentation.feat, // only the feat[] array is copied wholesale; other segmentation fields are copied individually above
+           sizeof(s->s.h.segmentation.feat));
 
     return 0;
 }
+#endif
 
 AVCodec ff_vp9_decoder = {
-    .name           = "vp9",
-    .long_name      = NULL_IF_CONFIG_SMALL("Google VP9"),
-    .type           = AVMEDIA_TYPE_VIDEO,
-    .id             = AV_CODEC_ID_VP9,
-    .priv_data_size = sizeof(VP9Context),
-    .init           = vp9_decode_init,
-    .decode         = vp9_decode_packet,
-    .flush          = vp9_decode_flush,
-    .close          = vp9_decode_free,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .name                  = "vp9", // registration entry for the VP9 decoder
+    .long_name             = NULL_IF_CONFIG_SMALL("Google VP9"),
+    .type                  = AVMEDIA_TYPE_VIDEO,
+    .id                    = AV_CODEC_ID_VP9,
+    .priv_data_size        = sizeof(VP9Context),
+    .init                  = vp9_decode_init,
+    .close                 = vp9_decode_free,
+    .decode                = vp9_decode_frame, // returns pkt->size on success (see the end of vp9_decode_frame)
+    .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS, // frame threading is advertised, hence the two thread hooks below
+    .flush                 = vp9_decode_flush,
+    .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy), // both hooks are only defined under #if HAVE_THREADS
+    .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
+    .profiles              = NULL_IF_CONFIG_SMALL(ff_vp9_profiles),
 };
diff --git a/libavcodec/vp9.h b/libavcodec/vp9.h
index b83bd61..df5bd4d 100644
--- a/libavcodec/vp9.h
+++ b/libavcodec/vp9.h
@@ -4,34 +4,62 @@
  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
  * Copyright (C) 2013 Clément Bœsch <u pkh me>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_VP9_H
 #define AVCODEC_VP9_H
 
-#include <stddef.h>
 #include <stdint.h>
 
-#include "libavutil/internal.h"
-
-#include "avcodec.h"
+#include "thread.h"
 #include "vp56.h"
 
+enum BlockLevel { // NOTE(review): names read as the block edge length in pixels at each partitioning level - confirm
+    BL_64X64, // the level vp9.c passes to decode_sb()/decode_sb_mem() for every step of its superblock loop
+    BL_32X32,
+    BL_16X16,
+    BL_8X8,
+};
+
+enum BlockPartition { // how one block is divided; the bracket sketch on each line draws the split
+    PARTITION_NONE,    // [ ] <-.
+    PARTITION_H,       // [-]   |
+    PARTITION_V,       // [|]   |
+    PARTITION_SPLIT,   // [+] --'
+};
+
+enum BlockSize { // declaration order matters: values are compared (e.g. "b->bs > BS_8x8" in inter_pred) and index lookup tables
+    BS_64x64, // NOTE(review): names presumably read WIDTHxHEIGHT in pixels - confirm
+    BS_64x32,
+    BS_32x64,
+    BS_32x32,
+    BS_32x16,
+    BS_16x32,
+    BS_16x16,
+    BS_16x8,
+    BS_8x16,
+    BS_8x8,
+    BS_8x4,
+    BS_4x8,
+    BS_4x4,
+    N_BS_SIZES, // count of the entries above; sizes per-block-size tables such as bwlog_tab[2][N_BS_SIZES]
+};
+
 enum TxfmMode {
     TX_4X4,
     TX_8X8,
@@ -69,6 +97,13 @@ enum IntraPredMode {
     N_INTRA_PRED_MODES
 };
 
+enum InterPredMode { // NOTE(review): numbering starts at 10, presumably so these never collide with IntraPredMode values kept in the same mode_ctx arrays - confirm
+    NEARESTMV = 10, // value vp9.c memsets into above_mode_ctx/left_mode_ctx for frames that are neither key nor intra-only
+    NEARMV = 11,
+    ZEROMV = 12,
+    NEWMV = 13,
+};
+
 enum FilterMode {
     FILTER_8TAP_SMOOTH,
     FILTER_8TAP_REGULAR,
@@ -77,245 +112,69 @@ enum FilterMode {
     FILTER_SWITCHABLE,
 };
 
-enum BlockPartition {
-    PARTITION_NONE,    // [ ] <-.
-    PARTITION_H,       // [-]   |
-    PARTITION_V,       // [|]   |
-    PARTITION_SPLIT,   // [+] --'
-};
-
-enum InterPredMode {
-    NEARESTMV = 10,
-    NEARMV    = 11,
-    ZEROMV    = 12,
-    NEWMV     = 13,
-};
-
-enum MVJoint {
-    MV_JOINT_ZERO,
-    MV_JOINT_H,
-    MV_JOINT_V,
-    MV_JOINT_HV,
-};
-
-typedef struct ProbContext {
-    uint8_t y_mode[4][9];
-    uint8_t uv_mode[10][9];
-    uint8_t filter[4][2];
-    uint8_t mv_mode[7][3];
-    uint8_t intra[4];
-    uint8_t comp[5];
-    uint8_t single_ref[5][2];
-    uint8_t comp_ref[5];
-    uint8_t tx32p[2][3];
-    uint8_t tx16p[2][2];
-    uint8_t tx8p[2];
-    uint8_t skip[3];
-    uint8_t mv_joint[3];
-    struct {
-        uint8_t sign;
-        uint8_t classes[10];
-        uint8_t class0;
-        uint8_t bits[10];
-        uint8_t class0_fp[2][3];
-        uint8_t fp[3];
-        uint8_t class0_hp;
-        uint8_t hp;
-    } mv_comp[2];
-    uint8_t partition[4][4][3];
-} ProbContext;
-
-typedef void (*vp9_mc_func)(uint8_t *dst, const uint8_t *ref,
-                            ptrdiff_t dst_stride,
-                            ptrdiff_t ref_stride,
-                            int h, int mx, int my);
-
-typedef struct VP9DSPContext {
-    /*
-     * dimension 1: 0=4x4, 1=8x8, 2=16x16, 3=32x32
-     * dimension 2: intra prediction modes
-     *
-     * dst/left/top is aligned by transform-size (i.e. 4, 8, 16 or 32 pixels)
-     * stride is aligned by 16 pixels
-     * top[-1] is top/left; top[4,7] is top-right for 4x4
-     */
-    // FIXME(rbultje) maybe replace left/top pointers with HAVE_TOP/
-    // HAVE_LEFT/HAVE_TOPRIGHT flags instead, and then handle it in-place?
-    // also needs to fit in with what H.264/VP8/etc do
-    void (*intra_pred[N_TXFM_SIZES][N_INTRA_PRED_MODES])(uint8_t *dst,
-                                                         ptrdiff_t stride,
-                                                         const uint8_t *left,
-                                                         const uint8_t *top);
-
-    /*
-     * dimension 1: 0=4x4, 1=8x8, 2=16x16, 3=32x32, 4=lossless (3-4=dct only)
-     * dimension 2: 0=dct/dct, 1=dct/adst, 2=adst/dct, 3=adst/adst
-     *
-     * dst is aligned by transform-size (i.e. 4, 8, 16 or 32 pixels)
-     * stride is aligned by 16 pixels
-     * block is 16-byte aligned
-     * eob indicates the position (+1) of the last non-zero coefficient,
-     * in scan-order. This can be used to write faster versions, e.g. a
-     * dc-only 4x4/8x8/16x16/32x32, or a 4x4-only (eob<10) 8x8/16x16/32x32,
-     * etc.
-     */
-    // FIXME also write idct_add_block() versions for whole (inter) pred
-    // blocks, so we can do 2 4x4s at once
-    void (*itxfm_add[N_TXFM_SIZES + 1][N_TXFM_TYPES])(uint8_t *dst,
-                                                      ptrdiff_t stride,
-                                                      int16_t *block, int eob);
-
-    /*
-     * dimension 1: width of filter (0=4, 1=8, 2=16)
-     * dimension 2: 0=col-edge filter (h), 1=row-edge filter (v)
-     *
-     * dst/stride are aligned by 8
-     */
-    void (*loop_filter_8[3][2])(uint8_t *dst, ptrdiff_t stride,
-                                int mb_lim, int lim, int hev_thr);
-
-    /*
-     * dimension 1: 0=col-edge filter (h), 1=row-edge filter (v)
-     *
-     * The width of filter is assumed to be 16; dst/stride are aligned by 16
-     */
-    void (*loop_filter_16[2])(uint8_t *dst, ptrdiff_t stride,
-                              int mb_lim, int lim, int hev_thr);
-
-    /*
-     * dimension 1/2: width of filter (0=4, 1=8) for each filter half
-     * dimension 3: 0=col-edge filter (h), 1=row-edge filter (v)
-     *
-     * dst/stride are aligned by operation size
-     * this basically calls loop_filter[d1][d3][0](), followed by
-     * loop_filter[d2][d3][0]() on the next 8 pixels
-     * mb_lim/lim/hev_thr contain two values in the lowest two bytes of the
-     * integer.
-     */
-    // FIXME perhaps a mix4 that operates on 32px (for AVX2)
-    void (*loop_filter_mix2[2][2][2])(uint8_t *dst, ptrdiff_t stride,
-                                      int mb_lim, int lim, int hev_thr);
-
-    /*
-     * dimension 1: hsize (0: 64, 1: 32, 2: 16, 3: 8, 4: 4)
-     * dimension 2: filter type (0: smooth, 1: regular, 2: sharp, 3: bilin)
-     * dimension 3: averaging type (0: put, 1: avg)
-     * dimension 4: x subpel interpolation (0: none, 1: 8tap/bilin)
-     * dimension 5: y subpel interpolation (1: none, 1: 8tap/bilin)
-     *
-     * dst/stride are aligned by hsize
-     */
-    vp9_mc_func mc[5][4][2][2][2];
-} VP9DSPContext;
-
 enum CompPredMode {
     PRED_SINGLEREF,
     PRED_COMPREF,
     PRED_SWITCHABLE,
 };
 
-typedef struct VP9MVRefPair {
+struct VP9mvrefPair { // two motion vectors, each with a matching ref[] entry; VP9Frame.mv points at these
     VP56mv mv[2];
     int8_t ref[2];
-} VP9MVRefPair;
-
-typedef struct VP9Filter {
-    uint8_t level[8 * 8];
-    uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
-                              [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
-} VP9Filter;
-
-enum BlockLevel {
-    BL_64X64,
-    BL_32X32,
-    BL_16X16,
-    BL_8X8,
 };
 
-enum BlockSize {
-    BS_64x64,
-    BS_64x32,
-    BS_32x64,
-    BS_32x32,
-    BS_32x16,
-    BS_16x32,
-    BS_16x16,
-    BS_16x8,
-    BS_8x16,
-    BS_8x8,
-    BS_8x4,
-    BS_4x8,
-    BS_4x4,
-    N_BS_SIZES,
-};
-
-typedef struct VP9Block {
-    uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
-    enum FilterMode filter;
-    VP56mv mv[4 /* b_idx */][2 /* ref */];
-    enum BlockSize bs;
-    enum TxfmMode tx, uvtx;
-
-    int row, row7, col, col7;
-    uint8_t *dst[3];
-    ptrdiff_t y_stride, uv_stride;
-} VP9Block;
+typedef struct VP9Frame { // one picture plus the per-frame side data kept with it
+    ThreadFrame tf; // tf.f is the AVFrame, allocated in init_frames() in vp9.c
+    AVBufferRef *extradata; // NOTE(review): presumably the buffer backing segmentation_map and mv - confirm where it is allocated
+    uint8_t *segmentation_map;
+    struct VP9mvrefPair *mv;
+    int uses_2pass; // set in vp9_decode_frame() together with s->pass: frame threading && refreshctx && !parallelmode
 
-typedef struct VP9Context {
-    VP9DSPContext dsp;
-    VideoDSPContext vdsp;
-    GetBitContext gb;
-    VP56RangeCoder c;
-    VP56RangeCoder *c_b;
-    unsigned c_b_size;
-    VP9Block b;
+    AVBufferRef *hwaccel_priv_buf; // NOTE(review): presumably owns hwaccel_picture_private - confirm
+    void *hwaccel_picture_private;
+} VP9Frame;
 
+typedef struct VP9BitstreamHeader {
     // bitstream header
     uint8_t profile;
-    uint8_t keyframe, last_keyframe;
+    uint8_t keyframe;
     uint8_t invisible;
-    uint8_t use_last_frame_mvs;
     uint8_t errorres;
-    uint8_t colorspace;
-    uint8_t sub_x;
-    uint8_t sub_y;
-    uint8_t fullrange;
     uint8_t intraonly;
     uint8_t resetctx;
     uint8_t refreshrefmask;
     uint8_t highprecisionmvs;
     enum FilterMode filtermode;
     uint8_t allowcompinter;
-    uint8_t fixcompref;
     uint8_t refreshctx;
     uint8_t parallelmode;
     uint8_t framectxid;
+    uint8_t use_last_frame_mvs;
     uint8_t refidx[3];
     uint8_t signbias[3];
+    uint8_t fixcompref;
     uint8_t varcompref[2];
-    AVFrame *refs[8];
-    AVFrame *cur_frame;
-
     struct {
         uint8_t level;
         int8_t sharpness;
-        uint8_t lim_lut[64];
-        uint8_t mblim_lut[64];
     } filter;
     struct {
         uint8_t enabled;
+        uint8_t updated;
         int8_t mode[2];
         int8_t ref[4];
     } lf_delta;
     uint8_t yac_qi;
     int8_t ydc_qdelta, uvdc_qdelta, uvac_qdelta;
     uint8_t lossless;
+#define MAX_SEGMENT 8
     struct {
         uint8_t enabled;
         uint8_t temporal;
         uint8_t absolute_vals;
         uint8_t update_map;
-        #define MAX_SEGMENT 8
+        uint8_t prob[7];
+        uint8_t pred_prob[3];
         struct {
             uint8_t q_enabled;
             uint8_t lf_enabled;
@@ -328,95 +187,25 @@ typedef struct VP9Context {
             uint8_t lflvl[4][2];
         } feat[MAX_SEGMENT];
     } segmentation;
+    enum TxfmMode txfmmode;
+    enum CompPredMode comppredmode;
     struct {
         unsigned log2_tile_cols, log2_tile_rows;
         unsigned tile_cols, tile_rows;
-        unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
     } tiling;
-    unsigned sb_cols, sb_rows, rows, cols;
-    struct {
-        ProbContext p;
-        uint8_t coef[4][2][2][6][6][3];
-    } prob_ctx[4];
-    struct {
-        ProbContext p;
-        uint8_t coef[4][2][2][6][6][11];
-        uint8_t seg[7];
-        uint8_t segpred[3];
-    } prob;
-    struct {
-        unsigned y_mode[4][10];
-        unsigned uv_mode[10][10];
-        unsigned filter[4][3];
-        unsigned mv_mode[7][4];
-        unsigned intra[4][2];
-        unsigned comp[5][2];
-        unsigned single_ref[5][2][2];
-        unsigned comp_ref[5][2];
-        unsigned tx32p[2][4];
-        unsigned tx16p[2][3];
-        unsigned tx8p[2][2];
-        unsigned skip[3][2];
-        unsigned mv_joint[4];
-        struct {
-            unsigned sign[2];
-            unsigned classes[11];
-            unsigned class0[2];
-            unsigned bits[10][2];
-            unsigned class0_fp[2][4];
-            unsigned fp[4];
-            unsigned class0_hp[2];
-            unsigned hp[2];
-        } mv_comp[2];
-        unsigned partition[4][4][4];
-        unsigned coef[4][2][2][6][6][3];
-        unsigned eob[4][2][2][6][6][2];
-    } counts;
-    enum TxfmMode txfmmode;
-    enum CompPredMode comppredmode;
-
-    // contextual (left/above) cache
-    uint8_t left_partition_ctx[8], *above_partition_ctx;
-    uint8_t left_mode_ctx[16], *above_mode_ctx;
-    // FIXME maybe merge some of the below in a flags field?
-    uint8_t left_y_nnz_ctx[16], *above_y_nnz_ctx;
-    uint8_t left_uv_nnz_ctx[2][8], *above_uv_nnz_ctx[2];
-    uint8_t left_skip_ctx[8], *above_skip_ctx; // 1bit
-    uint8_t left_txfm_ctx[8], *above_txfm_ctx; // 2bit
-    uint8_t left_segpred_ctx[8], *above_segpred_ctx; // 1bit
-    uint8_t left_intra_ctx[8], *above_intra_ctx; // 1bit
-    uint8_t left_comp_ctx[8], *above_comp_ctx; // 1bit
-    uint8_t left_ref_ctx[8], *above_ref_ctx; // 2bit
-    uint8_t left_filter_ctx[8], *above_filter_ctx;
-    VP56mv left_mv_ctx[16][2], (*above_mv_ctx)[2];
-
-    // whole-frame cache
-    uint8_t *intra_pred_data[3];
-    uint8_t *segmentation_map;
-    VP9MVRefPair *mv[2];
-    VP9Filter *lflvl;
-    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[71 * 80];
-
-    // block reconstruction intermediates
-    DECLARE_ALIGNED(32, int16_t, block)[4096];
-    DECLARE_ALIGNED(32, int16_t, uvblock)[2][1024];
-    uint8_t eob[256];
-    uint8_t uveob[2][64];
-    VP56mv min_mv, max_mv;
-    DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64];
-    DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][32 * 32];
-} VP9Context;
-
-void ff_vp9dsp_init(VP9DSPContext *dsp);
-
-void ff_vp9dsp_init_x86(VP9DSPContext *dsp);
 
-void ff_vp9_fill_mv(VP9Context *s, VP56mv *mv, int mode, int sb);
+    int uncompressed_header_size;
+    int compressed_header_size;
+} VP9BitstreamHeader;
 
-void ff_vp9_adapt_probs(VP9Context *s);
+typedef struct VP9SharedContext { // decoder state exposed through this header; vp9.c reaches it as s->s
+    VP9BitstreamHeader h; // frame header fields (s->s.h.* in vp9.c)
 
-int ff_vp9_decode_block(AVCodecContext *avctx, int row, int col,
-                        VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
-                        enum BlockLevel bl, enum BlockPartition bp);
+    ThreadFrame refs[8]; // reference slots; vp9_decode_frame() re-references them from next_refs[] once a frame is done
+#define CUR_FRAME 0 // index into frames[]: the picture being decoded
+#define REF_FRAME_MVPAIR 1 // index into frames[]; NOTE(review): presumably the earlier frame kept for its mv pairs - confirm
+#define REF_FRAME_SEGMAP 2 // index into frames[]; NOTE(review): presumably the earlier frame kept for its segmentation map - confirm
+    VP9Frame frames[3];
+} VP9SharedContext;
 
 #endif /* AVCODEC_VP9_H */
diff --git a/libavcodec/vp9_mc_template.c b/libavcodec/vp9_mc_template.c
new file mode 100644
index 0000000..38d9a6d
--- /dev/null
+++ b/libavcodec/vp9_mc_template.c
@@ -0,0 +1,435 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define ROUNDED_DIV_MVx2(a, b) /* per component: ROUNDED_DIV of the sum of two MVs by 2 */ \
+    (VP56mv) { .x = ROUNDED_DIV((a).x + (b).x, 2), .y = ROUNDED_DIV((a).y + (b).y, 2) }
+#define ROUNDED_DIV_MVx4(a, b, c, d) /* per component: ROUNDED_DIV of the sum of four MVs by 4 */ \
+    (VP56mv) { .x = ROUNDED_DIV((a).x + (b).x + (c).x + (d).x, 4), \
+               .y = ROUNDED_DIV((a).y + (b).y + (c).y + (d).y, 4) }
+
+// Inter prediction of the current block (s->b) into s->dst[]: one pass with the
+// first reference and, for compound blocks (b->comp), a second pass with the other.
+static void FN(inter_pred)(AVCodecContext *ctx)
+{
+    static const uint8_t bwlog_tab[2][N_BS_SIZES] = { // [row][b->bs]; only row 0 is read here
+        { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 },
+        { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 },
+    };
+    VP9Context *s = ctx->priv_data;
+    VP9Block *b = s->b;
+    int row = s->row, col = s->col;
+    ThreadFrame *tref1 = &s->s.refs[s->s.h.refidx[b->ref[0]]], *tref2;
+    AVFrame *ref1 = tref1->f, *ref2;
+    int w1 = ref1->width, h1 = ref1->height, w2, h2;
+    ptrdiff_t ls_y = s->y_stride, ls_uv = s->uv_stride;
+    int bytesperpixel = BYTES_PER_PIXEL;
+    // tref2/ref2/w2/h2 are set, and later read, only when b->comp is set
+    if (b->comp) {
+        tref2 = &s->s.refs[s->s.h.refidx[b->ref[1]]];
+        ref2 = tref2->f;
+        w2 = ref2->width;
+        h2 = ref2->height;
+    }
+    // block sizes past BS_8x8 (8x4, 4x8, 4x4): one MC call per sub-block MV
+    if (b->bs > BS_8x8) {
+        VP56mv uvmv;
+#if SCALED == 0 // 8x4/4x8 shortcuts; when compiled out those sizes take the 4x4 path
+        if (b->bs == BS_8x4) {
+            mc_luma_dir(s, mc[3][b->filter][0], s->dst[0], ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        row << 3, col << 3, &b->mv[0][0],,,,, 8, 4, w1, h1, 0);
+            mc_luma_dir(s, mc[3][b->filter][0],
+                        s->dst[0] + 4 * ls_y, ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        (row << 3) + 4, col << 3, &b->mv[2][0],,,,, 8, 4, w1, h1, 0);
+            w1 = (w1 + s->ss_h) >> s->ss_h; // reference width for the chroma calls, rounded up
+            if (s->ss_v) {
+                h1 = (h1 + 1) >> 1;
+                uvmv = ROUNDED_DIV_MVx2(b->mv[0][0], b->mv[2][0]);
+                mc_chroma_dir(s, mc[3 + s->ss_h][b->filter][0],
+                              s->dst[1], s->dst[2], ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              row << 2, col << (3 - s->ss_h),
+                              &uvmv,,,,, 8 >> s->ss_h, 4, w1, h1, 0);
+            } else {
+                mc_chroma_dir(s, mc[3 + s->ss_h][b->filter][0],
+                              s->dst[1], s->dst[2], ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              row << 3, col << (3 - s->ss_h),
+                              &b->mv[0][0],,,,, 8 >> s->ss_h, 4, w1, h1, 0);
+                // BUG for 4:2:2 bs=8x4, libvpx uses the wrong block index
+                // to get the motion vector for the bottom 4x4 block
+                // https://code.google.com/p/webm/issues/detail?id=993
+                if (s->ss_h == 0) {
+                    uvmv = b->mv[2][0];
+                } else {
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[0][0], b->mv[2][0]);
+                }
+                mc_chroma_dir(s, mc[3 + s->ss_h][b->filter][0],
+                              s->dst[1] + 4 * ls_uv, s->dst[2] + 4 * ls_uv, ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              (row << 3) + 4, col << (3 - s->ss_h),
+                              &uvmv,,,,, 8 >> s->ss_h, 4, w1, h1, 0);
+            }
+
+            if (b->comp) { // compound: the same calls again with the second reference
+                mc_luma_dir(s, mc[3][b->filter][1], s->dst[0], ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            row << 3, col << 3, &b->mv[0][1],,,,, 8, 4, w2, h2, 1);
+                mc_luma_dir(s, mc[3][b->filter][1],
+                            s->dst[0] + 4 * ls_y, ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            (row << 3) + 4, col << 3, &b->mv[2][1],,,,, 8, 4, w2, h2, 1);
+                w2 = (w2 + s->ss_h) >> s->ss_h;
+                if (s->ss_v) {
+                    h2 = (h2 + 1) >> 1;
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[0][1], b->mv[2][1]);
+                    mc_chroma_dir(s, mc[3 + s->ss_h][b->filter][1],
+                                  s->dst[1], s->dst[2], ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  row << 2, col << (3 - s->ss_h),
+                                  &uvmv,,,,, 8 >> s->ss_h, 4, w2, h2, 1);
+                } else {
+                    mc_chroma_dir(s, mc[3 + s->ss_h][b->filter][1],
+                                  s->dst[1], s->dst[2], ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  row << 3, col << (3 - s->ss_h),
+                                  &b->mv[0][1],,,,, 8 >> s->ss_h, 4, w2, h2, 1);
+                    // BUG for 4:2:2 bs=8x4, libvpx uses the wrong block index
+                    // to get the motion vector for the bottom 4x4 block
+                    // https://code.google.com/p/webm/issues/detail?id=993
+                    if (s->ss_h == 0) {
+                        uvmv = b->mv[2][1];
+                    } else {
+                        uvmv = ROUNDED_DIV_MVx2(b->mv[0][1], b->mv[2][1]);
+                    }
+                    mc_chroma_dir(s, mc[3 + s->ss_h][b->filter][1],
+                                  s->dst[1] + 4 * ls_uv, s->dst[2] + 4 * ls_uv, ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  (row << 3) + 4, col << (3 - s->ss_h),
+                                  &uvmv,,,,, 8 >> s->ss_h, 4, w2, h2, 1);
+                }
+            }
+        } else if (b->bs == BS_4x8) {
+            mc_luma_dir(s, mc[4][b->filter][0], s->dst[0], ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        row << 3, col << 3, &b->mv[0][0],,,,, 4, 8, w1, h1, 0);
+            mc_luma_dir(s, mc[4][b->filter][0], s->dst[0] + 4 * bytesperpixel, ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        row << 3, (col << 3) + 4, &b->mv[1][0],,,,, 4, 8, w1, h1, 0);
+            h1 = (h1 + s->ss_v) >> s->ss_v; // reference height for the chroma calls, rounded up
+            if (s->ss_h) {
+                w1 = (w1 + 1) >> 1;
+                uvmv = ROUNDED_DIV_MVx2(b->mv[0][0], b->mv[1][0]);
+                mc_chroma_dir(s, mc[4][b->filter][0],
+                              s->dst[1], s->dst[2], ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              row << (3 - s->ss_v), col << 2,
+                              &uvmv,,,,, 4, 8 >> s->ss_v, w1, h1, 0);
+            } else {
+                mc_chroma_dir(s, mc[4][b->filter][0],
+                              s->dst[1], s->dst[2], ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              row << (3 - s->ss_v), col << 3,
+                              &b->mv[0][0],,,,, 4, 8 >> s->ss_v, w1, h1, 0);
+                mc_chroma_dir(s, mc[4][b->filter][0],
+                              s->dst[1] + 4 * bytesperpixel,
+                              s->dst[2] + 4 * bytesperpixel, ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              row << (3 - s->ss_v), (col << 3) + 4,
+                              &b->mv[1][0],,,,, 4, 8 >> s->ss_v, w1, h1, 0);
+            }
+
+            if (b->comp) { // compound: the same calls again with the second reference
+                mc_luma_dir(s, mc[4][b->filter][1], s->dst[0], ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            row << 3, col << 3, &b->mv[0][1],,,,, 4, 8, w2, h2, 1);
+                mc_luma_dir(s, mc[4][b->filter][1], s->dst[0] + 4 * bytesperpixel, ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            row << 3, (col << 3) + 4, &b->mv[1][1],,,,, 4, 8, w2, h2, 1);
+                h2 = (h2 + s->ss_v) >> s->ss_v;
+                if (s->ss_h) {
+                    w2 = (w2 + 1) >> 1;
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[0][1], b->mv[1][1]);
+                    mc_chroma_dir(s, mc[4][b->filter][1],
+                                  s->dst[1], s->dst[2], ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  row << (3 - s->ss_v), col << 2,
+                                  &uvmv,,,,, 4, 8 >> s->ss_v, w2, h2, 1);
+                } else {
+                    mc_chroma_dir(s, mc[4][b->filter][1],
+                                  s->dst[1], s->dst[2], ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  row << (3 - s->ss_v), col << 3,
+                                  &b->mv[0][1],,,,, 4, 8 >> s->ss_v, w2, h2, 1);
+                    mc_chroma_dir(s, mc[4][b->filter][1],
+                                  s->dst[1] + 4 * bytesperpixel,
+                                  s->dst[2] + 4 * bytesperpixel, ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  row << (3 - s->ss_v), (col << 3) + 4,
+                                  &b->mv[1][1],,,,, 4, 8 >> s->ss_v, w2, h2, 1);
+                }
+            }
+        } else
+#endif
+        {
+#if SCALED == 0 // with the shortcuts compiled out, 8x4 and 4x8 blocks get here too
+            av_assert2(b->bs == BS_4x4);
+#endif
+            // FIXME if two horizontally adjacent blocks have the same MV,
+            // do a w8 instead of a w4 call
+            mc_luma_dir(s, mc[4][b->filter][0], s->dst[0], ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        row << 3, col << 3, &b->mv[0][0],
+                        0, 0, 8, 8, 4, 4, w1, h1, 0);
+            mc_luma_dir(s, mc[4][b->filter][0], s->dst[0] + 4 * bytesperpixel, ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        row << 3, (col << 3) + 4, &b->mv[1][0],
+                        4, 0, 8, 8, 4, 4, w1, h1, 0);
+            mc_luma_dir(s, mc[4][b->filter][0],
+                        s->dst[0] + 4 * ls_y, ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        (row << 3) + 4, col << 3, &b->mv[2][0],
+                        0, 4, 8, 8, 4, 4, w1, h1, 0);
+            mc_luma_dir(s, mc[4][b->filter][0],
+                        s->dst[0] + 4 * ls_y + 4 * bytesperpixel, ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        (row << 3) + 4, (col << 3) + 4, &b->mv[3][0],
+                        4, 4, 8, 8, 4, 4, w1, h1, 0);
+            if (s->ss_v) {
+                h1 = (h1 + 1) >> 1;
+                if (s->ss_h) {
+                    w1 = (w1 + 1) >> 1;
+                    uvmv = ROUNDED_DIV_MVx4(b->mv[0][0], b->mv[1][0],
+                                            b->mv[2][0], b->mv[3][0]);
+                    mc_chroma_dir(s, mc[4][b->filter][0],
+                                  s->dst[1], s->dst[2], ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 2, col << 2,
+                                  &uvmv, 0, 0, 4, 4, 4, 4, w1, h1, 0);
+                } else {
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[0][0], b->mv[2][0]);
+                    mc_chroma_dir(s, mc[4][b->filter][0],
+                                  s->dst[1], s->dst[2], ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 2, col << 3,
+                                  &uvmv, 0, 0, 8, 4, 4, 4, w1, h1, 0);
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[1][0], b->mv[3][0]);
+                    mc_chroma_dir(s, mc[4][b->filter][0],
+                                  s->dst[1] + 4 * bytesperpixel,
+                                  s->dst[2] + 4 * bytesperpixel, ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 2, (col << 3) + 4,
+                                  &uvmv, 4, 0, 8, 4, 4, 4, w1, h1, 0);
+                }
+            } else {
+                if (s->ss_h) {
+                    w1 = (w1 + 1) >> 1;
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[0][0], b->mv[1][0]);
+                    mc_chroma_dir(s, mc[4][b->filter][0],
+                                  s->dst[1], s->dst[2], ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 3, col << 2,
+                                  &uvmv, 0, 0, 4, 8, 4, 4, w1, h1, 0);
+                    // BUG libvpx uses wrong block index for 4:2:2 bs=4x4
+                    // bottom block
+                    // https://code.google.com/p/webm/issues/detail?id=993
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[1][0], b->mv[2][0]);
+                    mc_chroma_dir(s, mc[4][b->filter][0],
+                                  s->dst[1] + 4 * ls_uv, s->dst[2] + 4 * ls_uv, ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  (row << 3) + 4, col << 2,
+                                  &uvmv, 0, 4, 4, 8, 4, 4, w1, h1, 0);
+                } else {
+                    mc_chroma_dir(s, mc[4][b->filter][0],
+                                  s->dst[1], s->dst[2], ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 3, col << 3,
+                                  &b->mv[0][0], 0, 0, 8, 8, 4, 4, w1, h1, 0);
+                    mc_chroma_dir(s, mc[4][b->filter][0],
+                                  s->dst[1] + 4 * bytesperpixel,
+                                  s->dst[2] + 4 * bytesperpixel, ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 3, (col << 3) + 4,
+                                  &b->mv[1][0], 4, 0, 8, 8, 4, 4, w1, h1, 0);
+                    mc_chroma_dir(s, mc[4][b->filter][0],
+                                  s->dst[1] + 4 * ls_uv, s->dst[2] + 4 * ls_uv, ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  (row << 3) + 4, col << 3,
+                                  &b->mv[2][0], 0, 4, 8, 8, 4, 4, w1, h1, 0);
+                    mc_chroma_dir(s, mc[4][b->filter][0],
+                                  s->dst[1] + 4 * ls_uv + 4 * bytesperpixel,
+                                  s->dst[2] + 4 * ls_uv + 4 * bytesperpixel, ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  (row << 3) + 4, (col << 3) + 4,
+                                  &b->mv[3][0], 4, 4, 8, 8, 4, 4, w1, h1, 0);
+                }
+            }
+
+            if (b->comp) { // compound: the same calls again with the second reference
+                mc_luma_dir(s, mc[4][b->filter][1], s->dst[0], ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            row << 3, col << 3, &b->mv[0][1], 0, 0, 8, 8, 4, 4, w2, h2, 1);
+                mc_luma_dir(s, mc[4][b->filter][1], s->dst[0] + 4 * bytesperpixel, ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 0, 8, 8, 4, 4, w2, h2, 1);
+                mc_luma_dir(s, mc[4][b->filter][1],
+                            s->dst[0] + 4 * ls_y, ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            (row << 3) + 4, col << 3, &b->mv[2][1], 0, 4, 8, 8, 4, 4, w2, h2, 1);
+                mc_luma_dir(s, mc[4][b->filter][1],
+                            s->dst[0] + 4 * ls_y + 4 * bytesperpixel, ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, 8, 8, 4, 4, w2, h2, 1);
+                if (s->ss_v) {
+                    h2 = (h2 + 1) >> 1;
+                    if (s->ss_h) {
+                        w2 = (w2 + 1) >> 1;
+                        uvmv = ROUNDED_DIV_MVx4(b->mv[0][1], b->mv[1][1],
+                                                b->mv[2][1], b->mv[3][1]);
+                        mc_chroma_dir(s, mc[4][b->filter][1],
+                                      s->dst[1], s->dst[2], ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 2, col << 2,
+                                      &uvmv, 0, 0, 4, 4, 4, 4, w2, h2, 1);
+                    } else {
+                        uvmv = ROUNDED_DIV_MVx2(b->mv[0][1], b->mv[2][1]);
+                        mc_chroma_dir(s, mc[4][b->filter][1],
+                                      s->dst[1], s->dst[2], ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 2, col << 3,
+                                      &uvmv, 0, 0, 8, 4, 4, 4, w2, h2, 1);
+                        uvmv = ROUNDED_DIV_MVx2(b->mv[1][1], b->mv[3][1]);
+                        mc_chroma_dir(s, mc[4][b->filter][1],
+                                      s->dst[1] + 4 * bytesperpixel,
+                                      s->dst[2] + 4 * bytesperpixel, ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 2, (col << 3) + 4,
+                                      &uvmv, 4, 0, 8, 4, 4, 4, w2, h2, 1);
+                    }
+                } else {
+                    if (s->ss_h) {
+                        w2 = (w2 + 1) >> 1;
+                        uvmv = ROUNDED_DIV_MVx2(b->mv[0][1], b->mv[1][1]);
+                        mc_chroma_dir(s, mc[4][b->filter][1],
+                                      s->dst[1], s->dst[2], ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 3, col << 2,
+                                      &uvmv, 0, 0, 4, 8, 4, 4, w2, h2, 1);
+                        // BUG libvpx uses wrong block index for 4:2:2 bs=4x4
+                        // bottom block
+                        // https://code.google.com/p/webm/issues/detail?id=993
+                        uvmv = ROUNDED_DIV_MVx2(b->mv[1][1], b->mv[2][1]);
+                        mc_chroma_dir(s, mc[4][b->filter][1],
+                                      s->dst[1] + 4 * ls_uv, s->dst[2] + 4 * ls_uv, ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      (row << 3) + 4, col << 2,
+                                      &uvmv, 0, 4, 4, 8, 4, 4, w2, h2, 1);
+                    } else {
+                        mc_chroma_dir(s, mc[4][b->filter][1],
+                                      s->dst[1], s->dst[2], ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 3, col << 3,
+                                      &b->mv[0][1], 0, 0, 8, 8, 4, 4, w2, h2, 1);
+                        mc_chroma_dir(s, mc[4][b->filter][1],
+                                      s->dst[1] + 4 * bytesperpixel,
+                                      s->dst[2] + 4 * bytesperpixel, ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 3, (col << 3) + 4,
+                                      &b->mv[1][1], 4, 0, 8, 8, 4, 4, w2, h2, 1);
+                        mc_chroma_dir(s, mc[4][b->filter][1],
+                                      s->dst[1] + 4 * ls_uv, s->dst[2] + 4 * ls_uv, ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      (row << 3) + 4, col << 3,
+                                      &b->mv[2][1], 0, 4, 8, 8, 4, 4, w2, h2, 1);
+                        mc_chroma_dir(s, mc[4][b->filter][1],
+                                      s->dst[1] + 4 * ls_uv + 4 * bytesperpixel,
+                                      s->dst[2] + 4 * ls_uv + 4 * bytesperpixel, ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      (row << 3) + 4, (col << 3) + 4,
+                                      &b->mv[3][1], 4, 4, 8, 8, 4, 4, w2, h2, 1);
+                    }
+                }
+            }
+        }
+    } else { // b->bs <= BS_8x8: a single MV per reference (b->mv[0]) covers the block
+        int bwl = bwlog_tab[0][b->bs];
+        int bw = bwh_tab[0][b->bs][0] * 4, bh = bwh_tab[0][b->bs][1] * 4;
+        int uvbw = bwh_tab[s->ss_h][b->bs][0] * 4, uvbh = bwh_tab[s->ss_v][b->bs][1] * 4;
+        mc_luma_dir(s, mc[bwl][b->filter][0], s->dst[0], ls_y,
+                    ref1->data[0], ref1->linesize[0], tref1,
+                    row << 3, col << 3, &b->mv[0][0], 0, 0, bw, bh, bw, bh, w1, h1, 0);
+        w1 = (w1 + s->ss_h) >> s->ss_h;
+        h1 = (h1 + s->ss_v) >> s->ss_v;
+        mc_chroma_dir(s, mc[bwl + s->ss_h][b->filter][0],
+                      s->dst[1], s->dst[2], ls_uv,
+                      ref1->data[1], ref1->linesize[1],
+                      ref1->data[2], ref1->linesize[2], tref1,
+                      row << (3 - s->ss_v), col << (3 - s->ss_h),
+                      &b->mv[0][0], 0, 0, uvbw, uvbh, uvbw, uvbh, w1, h1, 0);
+
+        if (b->comp) {
+            mc_luma_dir(s, mc[bwl][b->filter][1], s->dst[0], ls_y,
+                        ref2->data[0], ref2->linesize[0], tref2,
+                        row << 3, col << 3, &b->mv[0][1], 0, 0, bw, bh, bw, bh, w2, h2, 1);
+            w2 = (w2 + s->ss_h) >> s->ss_h;
+            h2 = (h2 + s->ss_v) >> s->ss_v;
+            mc_chroma_dir(s, mc[bwl + s->ss_h][b->filter][1],
+                          s->dst[1], s->dst[2], ls_uv,
+                          ref2->data[1], ref2->linesize[1],
+                          ref2->data[2], ref2->linesize[2], tref2,
+                          row << (3 - s->ss_v), col << (3 - s->ss_h),
+                          &b->mv[0][1], 0, 0, uvbw, uvbh, uvbw, uvbh, w2, h2, 1);
+        }
+    }
+}
diff --git a/libavcodec/vp9_parser.c b/libavcodec/vp9_parser.c
new file mode 100644
index 0000000..2e9235e
--- /dev/null
+++ b/libavcodec/vp9_parser.c
@@ -0,0 +1,156 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intreadwrite.h"
+#include "libavcodec/get_bits.h"
+#include "parser.h"
+
+typedef struct VP9ParseContext {
+    int n_frames; // superframe frames still to be output by later parse() calls; 0 when none are pending
+    int size[8]; // sizes of the pending frames; parse() takes size[--n_frames] on each call
+    int64_t pts; // pts saved from an invisible frame, handed to the next visible frame that has none
+} VP9ParseContext;
+
+// Read the first header bits of one frame and update the parser's picture
+// type, key-frame flag and pts; fails only if the bit reader cannot be set up.
+static int parse_frame(AVCodecParserContext *ctx, const uint8_t *buf, int size)
+{
+    VP9ParseContext *pc = ctx->priv_data;
+    GetBitContext bits;
+    int err, profile, is_key = 0, hidden = 0;
+
+    err = init_get_bits8(&bits, buf, size);
+    if (err < 0)
+        return err;
+
+    get_bits(&bits, 2); // frame marker
+    profile  = get_bits1(&bits);
+    profile |= get_bits1(&bits) << 1;
+    if (profile == 3)
+        profile += get_bits1(&bits);
+
+    // a set bit leaves both flags 0; else each flag negates the next bit read
+    if (!get_bits1(&bits)) {
+        is_key = !get_bits1(&bits);
+        hidden = !get_bits1(&bits);
+    }
+    ctx->pict_type = is_key ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
+    ctx->key_frame = is_key;
+
+    if (hidden) {
+        // park the pts of an invisible frame for the next visible one
+        if (ctx->pts != AV_NOPTS_VALUE) {
+            pc->pts  = ctx->pts;
+            ctx->pts = AV_NOPTS_VALUE;
+        }
+    } else {
+        if (ctx->pts == AV_NOPTS_VALUE)
+            ctx->pts = pc->pts;
+        pc->pts = AV_NOPTS_VALUE;
+    }
+
+    return 0;
+}
+
+// Output one frame per call, splitting superframes; returns the bytes consumed.
+static int parse(AVCodecParserContext *ctx, AVCodecContext *avctx,
+                 const uint8_t **out_data, int *out_size,
+                 const uint8_t *data, int size)
+{
+    VP9ParseContext *s = ctx->priv_data;
+    int full_size = size;
+    int marker;
+
+    if (size <= 0) {
+        *out_size = 0;
+        *out_data = data;
+        return 0;
+    }
+
+    if (s->n_frames > 0 && s->size[s->n_frames - 1] > size) {
+        // a size kept from an earlier call does not fit this buffer: start over
+        av_log(avctx, AV_LOG_ERROR, "Pending superframe frame size %d exceeds buffer size %d\n",
+               s->size[s->n_frames - 1], size);
+        s->n_frames = 0;
+    }
+    if (s->n_frames > 0) { // hand out the next frame of a superframe
+        *out_data = data;
+        *out_size = s->size[--s->n_frames];
+        parse_frame(ctx, *out_data, *out_size);
+        return s->n_frames > 0 ? *out_size : size /* i.e. include idx tail */;
+    }
+
+    marker = data[size - 1];
+    if ((marker & 0xe0) == 0xc0) { // possible superframe index at the end
+        int nbytes = 1 + ((marker >> 3) & 0x3);
+        int n_frames = 1 + (marker & 0x7), idx_sz = 2 + n_frames * nbytes;
+        if (size >= idx_sz && data[size - idx_sz] == marker) {
+            const uint8_t *idx = data + size + 1 - idx_sz;
+            int first = 1;
+            switch (nbytes) {
+#define case_n(a, rd) \
+            case a: \
+                while (n_frames--) { \
+                    unsigned sz = rd; \
+                    idx += a; \
+                    if (sz == 0 || sz > size) { \
+                        s->n_frames = 0; \
+                        *out_size = size; \
+                        *out_data = data; \
+                        av_log(avctx, AV_LOG_ERROR, \
+                               "Invalid superframe packet size: %u frame size: %d\n", \
+                               sz, size); \
+                        return full_size; \
+                    } \
+                    if (first) { \
+                        first = 0; \
+                        *out_data = data; \
+                        *out_size = sz; \
+                        s->n_frames = n_frames; \
+                    } else { \
+                        s->size[n_frames] = sz; \
+                    } \
+                    data += sz; \
+                    size -= sz; \
+                } \
+                parse_frame(ctx, *out_data, *out_size); \
+                return s->n_frames > 0 ? *out_size : full_size
+                case_n(1, *idx);
+                case_n(2, AV_RL16(idx));
+                case_n(3, AV_RL24(idx));
+                case_n(4, AV_RL32(idx));
+            }
+        }
+    }
+
+    *out_data = data;
+    *out_size = size;
+    parse_frame(ctx, data, size);
+    return size;
+}
+
+AVCodecParser ff_vp9_parser = { // parser description: VP9 codec id, private context size, parse callback
+    .parser_parse   = parse,
+    .priv_data_size = sizeof(VP9ParseContext),
+    .codec_ids      = { AV_CODEC_ID_VP9 },
+};
diff --git a/libavcodec/vp9_superframe_bsf.c b/libavcodec/vp9_superframe_bsf.c
new file mode 100644
index 0000000..b686adb
--- /dev/null
+++ b/libavcodec/vp9_superframe_bsf.c
@@ -0,0 +1,205 @@
+/*
+ * VP9 invisible (alt-ref) frame to superframe merge bitstream filter
+ * Copyright (c) 2016 Ronald S. Bultje <rsbultje@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+#include "avcodec.h"
+#include "bsf.h"
+#include "get_bits.h"
+
+#define MAX_CACHE 8 // number of slots in VP9BSFContext.cache[]
+typedef struct VP9BSFContext {
+    int n_cache; // number of invisible frames currently held in cache[]
+    struct CachedBuf {
+        uint8_t *data; // frame bytes; entries below n_cache are av_malloc()ed copies
+        int size;      // length of data in bytes
+    } cache[MAX_CACHE];
+} VP9BSFContext;
+
+/* Report the largest frame size and the 64-bit total size of n_in buffers. */
+static void stats(const struct CachedBuf *in, int n_in,
+                  unsigned *_max, uint64_t *_sum)
+{
+    unsigned max = 0;
+    uint64_t sum = 0;
+    int n;
+
+    for (n = 0; n < n_in; n++) {
+        unsigned sz = in[n].size;
+        if (sz > max)
+            max = sz;
+        sum += sz;
+    }
+    *_max = max;
+    *_sum = sum;
+}
+
+/*
+ * Concatenate n_in frames into a new packet and append the superframe index:
+ * marker byte, one little-endian (mag + 1)-byte size per frame, marker byte.
+ * Returns 0 on success or a negative AVERROR code (ERANGE if it cannot fit).
+ */
+static int merge_superframe(const struct CachedBuf *in, int n_in, AVPacket *out)
+{
+    unsigned max, mag, marker, n;
+    uint64_t sum, sz; // 64 bits so that the summed frame sizes cannot wrap
+    uint8_t *ptr;
+    int res;
+
+    stats(in, n_in, &max, &sum);
+    mag = av_log2(max) >> 3;
+    marker = 0xC0 + (mag << 3) + (n_in - 1);
+    sz = sum + 2 + (mag + 1) * n_in;
+    if (sz > INT_MAX) // av_new_packet() takes an int size
+        return AVERROR(ERANGE);
+    res = av_new_packet(out, sz);
+    if (res < 0)
+        return res;
+    ptr = out->data;
+    for (n = 0; n < n_in; n++) {
+        memcpy(ptr, in[n].data, in[n].size);
+        ptr += in[n].size;
+    }
+
+#define wloop(mag, wr) \
+    for (n = 0; n < n_in; n++) { \
+        wr; \
+        ptr += mag + 1; \
+    }
+
+    // write superframe with marker 110[mag:2][nframes:3]
+    *ptr++ = marker;
+    switch (mag) {
+    case 0: wloop(mag, *ptr = in[n].size);        break;
+    case 1: wloop(mag, AV_WL16(ptr, in[n].size)); break;
+    case 2: wloop(mag, AV_WL24(ptr, in[n].size)); break;
+    case 3: wloop(mag, AV_WL32(ptr, in[n].size)); break;
+    }
+    *ptr++ = marker;
+    av_assert0(ptr == &out->data[out->size]);
+
+    return 0;
+}
+
+/* Cache naked invisible (alt-ref) frames and emit them merged with the next
+ * visible frame as one superframe; with an empty cache, packets pass through. */
+static int vp9_superframe_filter(AVBSFContext *ctx, AVPacket *out)
+{
+    GetBitContext gb;
+    VP9BSFContext *s = ctx->priv_data;
+    AVPacket *in;
+    int res, invisible, profile, marker, uses_superframe_syntax = 0, n;
+
+    res = ff_bsf_get_packet(ctx, &in);
+    if (res < 0)
+        return res;
+    if (in->size <= 0) { // no last byte to read the marker from
+        res = AVERROR_INVALIDDATA;
+        goto done;
+    }
+
+    marker = in->data[in->size - 1];
+    if ((marker & 0xe0) == 0xc0) {
+        int nbytes = 1 + ((marker >> 3) & 0x3);
+        int n_frames = 1 + (marker & 0x7), idx_sz = 2 + n_frames * nbytes;
+        uses_superframe_syntax = in->size >= idx_sz && in->data[in->size - idx_sz] == marker;
+    }
+
+    if ((res = init_get_bits8(&gb, in->data, in->size)) < 0)
+        goto done;
+
+    get_bits(&gb, 2); // frame marker
+    profile  = get_bits1(&gb);
+    profile |= get_bits1(&gb) << 1;
+    if (profile == 3) profile += get_bits1(&gb);
+
+    if (get_bits1(&gb)) {
+        invisible = 0;
+    } else {
+        get_bits1(&gb); // keyframe
+        invisible = !get_bits1(&gb);
+    }
+
+    if (uses_superframe_syntax && s->n_cache > 0) {
+        av_log(ctx, AV_LOG_ERROR,
+               "Mixing of superframe syntax and naked VP9 frames not supported\n");
+        res = AVERROR_INVALIDDATA;
+        goto done;
+    } else if ((!invisible || uses_superframe_syntax) && !s->n_cache) {
+        // passthrough
+        av_packet_move_ref(out, in);
+        goto done;
+    } else if (s->n_cache + 1 >= MAX_CACHE) {
+        av_log(ctx, AV_LOG_ERROR, "Too many invisible frames\n");
+        res = AVERROR_INVALIDDATA;
+        goto done;
+    }
+
+    s->cache[s->n_cache].size = in->size;
+    if (invisible && !uses_superframe_syntax) {
+        s->cache[s->n_cache].data = av_malloc(in->size);
+        if (!s->cache[s->n_cache].data) {
+            res = AVERROR(ENOMEM);
+            goto done;
+        }
+        memcpy(s->cache[s->n_cache++].data, in->data, in->size);
+        res = AVERROR(EAGAIN); // no output yet: the copy waits in the cache
+        goto done;
+    }
+    av_assert0(s->n_cache > 0);
+
+    // build superframe; the last slot borrows the input packet's buffer
+    s->cache[s->n_cache].data = in->data;
+    if ((res = merge_superframe(s->cache, s->n_cache + 1, out)) < 0)
+        goto done;
+
+    for (n = 0; n < s->n_cache; n++)
+        av_freep(&s->cache[n].data);
+    s->n_cache = 0;
+    res = av_packet_copy_props(out, in); // on failure, done: unrefs out
+
+done:
+    if (res < 0)
+        av_packet_unref(out);
+    av_packet_free(&in);
+    return res;
+}
+
+/* Drop every frame copy still cached when the filter is torn down. */
+static void vp9_superframe_close(AVBSFContext *ctx)
+{
+    VP9BSFContext *bsf = ctx->priv_data;
+    int left = bsf->n_cache;
+
+    while (left-- > 0)
+        av_freep(&bsf->cache[left].data);
+}
+
+static const enum AVCodecID codec_ids[] = { // codec list referenced by ff_vp9_superframe_bsf
+    AV_CODEC_ID_VP9, AV_CODEC_ID_NONE,
+};
+
+const AVBitStreamFilter ff_vp9_superframe_bsf = { // descriptor of the "vp9_superframe" filter
+    .name           = "vp9_superframe",
+    .priv_data_size = sizeof(VP9BSFContext),
+    .filter         = vp9_superframe_filter, // caches invisible frames, outputs superframes
+    .close          = vp9_superframe_close,  // frees frames still cached
+    .codec_ids      = codec_ids,             // lists AV_CODEC_ID_VP9
+};
diff --git a/libavcodec/vp9block.c b/libavcodec/vp9block.c
deleted file mode 100644
index a92c794..0000000
--- a/libavcodec/vp9block.c
+++ /dev/null
@@ -1,1685 +0,0 @@
-/*
- * VP9 compatible video decoder
- *
- * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
- * Copyright (C) 2013 Clément Bœsch <u pkh me>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/avassert.h"
-
-#include "avcodec.h"
-#include "get_bits.h"
-#include "internal.h"
-#include "videodsp.h"
-#include "vp56.h"
-#include "vp9.h"
-#include "vp9data.h"
-
-static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
-    {
-        { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
-        {  4,  4 }, {  4, 2 }, { 2,  4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
-    },  {
-        {  8,  8 }, {  8, 4 }, { 4,  8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
-        {  2,  2 }, {  2, 1 }, { 1,  2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
-    }
-};
-
-// differential forward probability updates
-static void decode_mode(VP9Context *s, VP9Block *const b)
-{
-    static const uint8_t left_ctx[N_BS_SIZES] = {
-        0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
-    };
-    static const uint8_t above_ctx[N_BS_SIZES] = {
-        0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
-    };
-    static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
-        TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
-        TX_16X16, TX_8X8,   TX_8X8,   TX_8X8,   TX_4X4,   TX_4X4,  TX_4X4
-    };
-    int row = b->row, col = b->col, row7 = b->row7;
-    enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
-    int w4 = FFMIN(s->cols - col, bwh_tab[1][b->bs][0]);
-    int h4 = FFMIN(s->rows - row, bwh_tab[1][b->bs][1]);
-    int have_a = row > 0, have_l = col > s->tiling.tile_col_start;
-    int y;
-
-    if (!s->segmentation.enabled) {
-        b->seg_id = 0;
-    } else if (s->keyframe || s->intraonly) {
-        b->seg_id = s->segmentation.update_map ?
-                    vp8_rac_get_tree(&s->c, ff_vp9_segmentation_tree, s->prob.seg) : 0;
-    } else if (!s->segmentation.update_map ||
-               (s->segmentation.temporal &&
-                vp56_rac_get_prob_branchy(&s->c,
-                                          s->prob.segpred[s->above_segpred_ctx[col] +
-                                                          s->left_segpred_ctx[row7]]))) {
-        int pred = MAX_SEGMENT - 1;
-        int x;
-
-        for (y = 0; y < h4; y++)
-            for (x = 0; x < w4; x++)
-                pred = FFMIN(pred,
-                             s->segmentation_map[(y + row) * 8 * s->sb_cols + x + col]);
-        b->seg_id = pred;
-
-        memset(&s->above_segpred_ctx[col], 1, w4);
-        memset(&s->left_segpred_ctx[row7], 1, h4);
-    } else {
-        b->seg_id = vp8_rac_get_tree(&s->c, ff_vp9_segmentation_tree,
-                                     s->prob.seg);
-
-        memset(&s->above_segpred_ctx[col], 0, w4);
-        memset(&s->left_segpred_ctx[row7], 0, h4);
-    }
-    if ((s->segmentation.enabled && s->segmentation.update_map) || s->keyframe) {
-        for (y = 0; y < h4; y++)
-            memset(&s->segmentation_map[(y + row) * 8 * s->sb_cols + col],
-                   b->seg_id, w4);
-    }
-
-    b->skip = s->segmentation.enabled &&
-              s->segmentation.feat[b->seg_id].skip_enabled;
-    if (!b->skip) {
-        int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
-        b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
-        s->counts.skip[c][b->skip]++;
-    }
-
-    if (s->keyframe || s->intraonly) {
-        b->intra = 1;
-    } else if (s->segmentation.feat[b->seg_id].ref_enabled) {
-        b->intra = !s->segmentation.feat[b->seg_id].ref_val;
-    } else {
-        int c, bit;
-
-        if (have_a && have_l) {
-            c  = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
-            c += (c == 2);
-        } else {
-            c = have_a ? 2 * s->above_intra_ctx[col] :
-                have_l ? 2 * s->left_intra_ctx[row7] : 0;
-        }
-        bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
-        s->counts.intra[c][bit]++;
-        b->intra = !bit;
-    }
-
-    if ((b->intra || !b->skip) && s->txfmmode == TX_SWITCHABLE) {
-        int c;
-        if (have_a) {
-            if (have_l) {
-                c = (s->above_skip_ctx[col] ? max_tx :
-                     s->above_txfm_ctx[col]) +
-                    (s->left_skip_ctx[row7] ? max_tx :
-                     s->left_txfm_ctx[row7]) > max_tx;
-            } else {
-                c = s->above_skip_ctx[col] ? 1 :
-                    (s->above_txfm_ctx[col] * 2 > max_tx);
-            }
-        } else if (have_l) {
-            c = s->left_skip_ctx[row7] ? 1 :
-                (s->left_txfm_ctx[row7] * 2 > max_tx);
-        } else {
-            c = 1;
-        }
-        switch (max_tx) {
-        case TX_32X32:
-            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
-            if (b->tx) {
-                b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
-                if (b->tx == 2)
-                    b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
-            }
-            s->counts.tx32p[c][b->tx]++;
-            break;
-        case TX_16X16:
-            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
-            if (b->tx)
-                b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
-            s->counts.tx16p[c][b->tx]++;
-            break;
-        case TX_8X8:
-            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
-            s->counts.tx8p[c][b->tx]++;
-            break;
-        case TX_4X4:
-            b->tx = TX_4X4;
-            break;
-        }
-    } else {
-        b->tx = FFMIN(max_tx, s->txfmmode);
-    }
-
-    if (s->keyframe || s->intraonly) {
-        uint8_t *a = &s->above_mode_ctx[col * 2];
-        uint8_t *l = &s->left_mode_ctx[(row7) << 1];
-
-        b->comp = 0;
-        if (b->bs > BS_8x8) {
-            // FIXME the memory storage intermediates here aren't really
-            // necessary, they're just there to make the code slightly
-            // simpler for now
-            b->mode[0] =
-            a[0]       = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
-                                          ff_vp9_default_kf_ymode_probs[a[0]][l[0]]);
-            if (b->bs != BS_8x4) {
-                b->mode[1] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
-                                              ff_vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
-                l[0]       =
-                a[1]       = b->mode[1];
-            } else {
-                l[0]       =
-                a[1]       =
-                b->mode[1] = b->mode[0];
-            }
-            if (b->bs != BS_4x8) {
-                b->mode[2] =
-                a[0]       = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
-                                              ff_vp9_default_kf_ymode_probs[a[0]][l[1]]);
-                if (b->bs != BS_8x4) {
-                    b->mode[3] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
-                                                  ff_vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
-                    l[1]       =
-                    a[1]       = b->mode[3];
-                } else {
-                    l[1]       =
-                    a[1]       =
-                    b->mode[3] = b->mode[2];
-                }
-            } else {
-                b->mode[2] = b->mode[0];
-                l[1]       =
-                a[1]       =
-                b->mode[3] = b->mode[1];
-            }
-        } else {
-            b->mode[0] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
-                                          ff_vp9_default_kf_ymode_probs[*a][*l]);
-            b->mode[3] =
-            b->mode[2] =
-            b->mode[1] = b->mode[0];
-            // FIXME this can probably be optimized
-            memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
-            memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
-        }
-        b->uvmode = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
-                                     ff_vp9_default_kf_uvmode_probs[b->mode[3]]);
-    } else if (b->intra) {
-        b->comp = 0;
-        if (b->bs > BS_8x8) {
-            b->mode[0] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
-                                          s->prob.p.y_mode[0]);
-            s->counts.y_mode[0][b->mode[0]]++;
-            if (b->bs != BS_8x4) {
-                b->mode[1] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
-                                              s->prob.p.y_mode[0]);
-                s->counts.y_mode[0][b->mode[1]]++;
-            } else {
-                b->mode[1] = b->mode[0];
-            }
-            if (b->bs != BS_4x8) {
-                b->mode[2] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
-                                              s->prob.p.y_mode[0]);
-                s->counts.y_mode[0][b->mode[2]]++;
-                if (b->bs != BS_8x4) {
-                    b->mode[3] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
-                                                  s->prob.p.y_mode[0]);
-                    s->counts.y_mode[0][b->mode[3]]++;
-                } else {
-                    b->mode[3] = b->mode[2];
-                }
-            } else {
-                b->mode[2] = b->mode[0];
-                b->mode[3] = b->mode[1];
-            }
-        } else {
-            static const uint8_t size_group[10] = {
-                3, 3, 3, 3, 2, 2, 2, 1, 1, 1
-            };
-            int sz = size_group[b->bs];
-
-            b->mode[0] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
-                                          s->prob.p.y_mode[sz]);
-            b->mode[1] =
-            b->mode[2] =
-            b->mode[3] = b->mode[0];
-            s->counts.y_mode[sz][b->mode[3]]++;
-        }
-        b->uvmode = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
-                                     s->prob.p.uv_mode[b->mode[3]]);
-        s->counts.uv_mode[b->mode[3]][b->uvmode]++;
-    } else {
-        static const uint8_t inter_mode_ctx_lut[14][14] = {
-            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
-            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
-            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
-            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
-            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
-            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
-            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
-            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
-            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
-            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
-            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
-            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
-            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
-            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
-        };
-
-        if (s->segmentation.feat[b->seg_id].ref_enabled) {
-            av_assert2(s->segmentation.feat[b->seg_id].ref_val != 0);
-            b->comp   = 0;
-            b->ref[0] = s->segmentation.feat[b->seg_id].ref_val - 1;
-        } else {
-            // read comp_pred flag
-            if (s->comppredmode != PRED_SWITCHABLE) {
-                b->comp = s->comppredmode == PRED_COMPREF;
-            } else {
-                int c;
-
-                // FIXME add intra as ref=0xff (or -1) to make these easier?
-                if (have_a) {
-                    if (have_l) {
-                        if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
-                            c = 4;
-                        } else if (s->above_comp_ctx[col]) {
-                            c = 2 + (s->left_intra_ctx[row7] ||
-                                     s->left_ref_ctx[row7] == s->fixcompref);
-                        } else if (s->left_comp_ctx[row7]) {
-                            c = 2 + (s->above_intra_ctx[col] ||
-                                     s->above_ref_ctx[col] == s->fixcompref);
-                        } else {
-                            c = (!s->above_intra_ctx[col] &&
-                                 s->above_ref_ctx[col] == s->fixcompref) ^
-                                (!s->left_intra_ctx[row7] &&
-                                 s->left_ref_ctx[row & 7] == s->fixcompref);
-                        }
-                    } else {
-                        c = s->above_comp_ctx[col] ? 3 :
-                            (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->fixcompref);
-                    }
-                } else if (have_l) {
-                    c = s->left_comp_ctx[row7] ? 3 :
-                        (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->fixcompref);
-                } else {
-                    c = 1;
-                }
-                b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
-                s->counts.comp[c][b->comp]++;
-            }
-
-            // read actual references
-            // FIXME probably cache a few variables here to prevent repetitive
-            // memory accesses below
-            if (b->comp) { /* two references */
-                int fix_idx = s->signbias[s->fixcompref], var_idx = !fix_idx, c, bit;
-
-                b->ref[fix_idx] = s->fixcompref;
-                // FIXME can this codeblob be replaced by some sort of LUT?
-                if (have_a) {
-                    if (have_l) {
-                        if (s->above_intra_ctx[col]) {
-                            if (s->left_intra_ctx[row7]) {
-                                c = 2;
-                            } else {
-                                c = 1 + 2 * (s->left_ref_ctx[row7] != s->varcompref[1]);
-                            }
-                        } else if (s->left_intra_ctx[row7]) {
-                            c = 1 + 2 * (s->above_ref_ctx[col] != s->varcompref[1]);
-                        } else {
-                            int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
-
-                            if (refl == refa && refa == s->varcompref[1]) {
-                                c = 0;
-                            } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
-                                if ((refa == s->fixcompref && refl == s->varcompref[0]) ||
-                                    (refl == s->fixcompref && refa == s->varcompref[0])) {
-                                    c = 4;
-                                } else {
-                                    c = (refa == refl) ? 3 : 1;
-                                }
-                            } else if (!s->left_comp_ctx[row7]) {
-                                if (refa == s->varcompref[1] && refl != s->varcompref[1]) {
-                                    c = 1;
-                                } else {
-                                    c = (refl == s->varcompref[1] &&
-                                         refa != s->varcompref[1]) ? 2 : 4;
-                                }
-                            } else if (!s->above_comp_ctx[col]) {
-                                if (refl == s->varcompref[1] && refa != s->varcompref[1]) {
-                                    c = 1;
-                                } else {
-                                    c = (refa == s->varcompref[1] &&
-                                         refl != s->varcompref[1]) ? 2 : 4;
-                                }
-                            } else {
-                                c = (refl == refa) ? 4 : 2;
-                            }
-                        }
-                    } else {
-                        if (s->above_intra_ctx[col]) {
-                            c = 2;
-                        } else if (s->above_comp_ctx[col]) {
-                            c = 4 * (s->above_ref_ctx[col] != s->varcompref[1]);
-                        } else {
-                            c = 3 * (s->above_ref_ctx[col] != s->varcompref[1]);
-                        }
-                    }
-                } else if (have_l) {
-                    if (s->left_intra_ctx[row7]) {
-                        c = 2;
-                    } else if (s->left_comp_ctx[row7]) {
-                        c = 4 * (s->left_ref_ctx[row7] != s->varcompref[1]);
-                    } else {
-                        c = 3 * (s->left_ref_ctx[row7] != s->varcompref[1]);
-                    }
-                } else {
-                    c = 2;
-                }
-                bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
-                b->ref[var_idx] = s->varcompref[bit];
-                s->counts.comp_ref[c][bit]++;
-            } else { /* single reference */
-                int bit, c;
-
-                if (have_a && !s->above_intra_ctx[col]) {
-                    if (have_l && !s->left_intra_ctx[row7]) {
-                        if (s->left_comp_ctx[row7]) {
-                            if (s->above_comp_ctx[col]) {
-                                c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7] ||
-                                         !s->above_ref_ctx[col]);
-                            } else {
-                                c = (3 * !s->above_ref_ctx[col]) +
-                                    (!s->fixcompref || !s->left_ref_ctx[row7]);
-                            }
-                        } else if (s->above_comp_ctx[col]) {
-                            c = (3 * !s->left_ref_ctx[row7]) +
-                                (!s->fixcompref || !s->above_ref_ctx[col]);
-                        } else {
-                            c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
-                        }
-                    } else if (s->above_intra_ctx[col]) {
-                        c = 2;
-                    } else if (s->above_comp_ctx[col]) {
-                        c = 1 + (!s->fixcompref || !s->above_ref_ctx[col]);
-                    } else {
-                        c = 4 * (!s->above_ref_ctx[col]);
-                    }
-                } else if (have_l && !s->left_intra_ctx[row7]) {
-                    if (s->left_intra_ctx[row7]) {
-                        c = 2;
-                    } else if (s->left_comp_ctx[row7]) {
-                        c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7]);
-                    } else {
-                        c = 4 * (!s->left_ref_ctx[row7]);
-                    }
-                } else {
-                    c = 2;
-                }
-                bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
-                s->counts.single_ref[c][0][bit]++;
-                if (!bit) {
-                    b->ref[0] = 0;
-                } else {
-                    // FIXME can this codeblob be replaced by some sort of LUT?
-                    if (have_a) {
-                        if (have_l) {
-                            if (s->left_intra_ctx[row7]) {
-                                if (s->above_intra_ctx[col]) {
-                                    c = 2;
-                                } else if (s->above_comp_ctx[col]) {
-                                    c = 1 + 2 * (s->fixcompref == 1 ||
-                                                 s->above_ref_ctx[col] == 1);
-                                } else if (!s->above_ref_ctx[col]) {
-                                    c = 3;
-                                } else {
-                                    c = 4 * (s->above_ref_ctx[col] == 1);
-                                }
-                            } else if (s->above_intra_ctx[col]) {
-                                if (s->left_intra_ctx[row7]) {
-                                    c = 2;
-                                } else if (s->left_comp_ctx[row7]) {
-                                    c = 1 + 2 * (s->fixcompref == 1 ||
-                                                 s->left_ref_ctx[row7] == 1);
-                                } else if (!s->left_ref_ctx[row7]) {
-                                    c = 3;
-                                } else {
-                                    c = 4 * (s->left_ref_ctx[row7] == 1);
-                                }
-                            } else if (s->above_comp_ctx[col]) {
-                                if (s->left_comp_ctx[row7]) {
-                                    if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
-                                        c = 3 * (s->fixcompref == 1 ||
-                                                 s->left_ref_ctx[row7] == 1);
-                                    } else {
-                                        c = 2;
-                                    }
-                                } else if (!s->left_ref_ctx[row7]) {
-                                    c = 1 + 2 * (s->fixcompref == 1 ||
-                                                 s->above_ref_ctx[col] == 1);
-                                } else {
-                                    c = 3 * (s->left_ref_ctx[row7] == 1) +
-                                        (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
-                                }
-                            } else if (s->left_comp_ctx[row7]) {
-                                if (!s->above_ref_ctx[col]) {
-                                    c = 1 + 2 * (s->fixcompref == 1 ||
-                                                 s->left_ref_ctx[row7] == 1);
-                                } else {
-                                    c = 3 * (s->above_ref_ctx[col] == 1) +
-                                        (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
-                                }
-                            } else if (!s->above_ref_ctx[col]) {
-                                if (!s->left_ref_ctx[row7]) {
-                                    c = 3;
-                                } else {
-                                    c = 4 * (s->left_ref_ctx[row7] == 1);
-                                }
-                            } else if (!s->left_ref_ctx[row7]) {
-                                c = 4 * (s->above_ref_ctx[col] == 1);
-                            } else {
-                                c = 2 * (s->left_ref_ctx[row7] == 1) +
-                                    2 * (s->above_ref_ctx[col] == 1);
-                            }
-                        } else {
-                            if (s->above_intra_ctx[col] ||
-                                (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
-                                c = 2;
-                            } else if (s->above_comp_ctx[col]) {
-                                c = 3 * (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
-                            } else {
-                                c = 4 * (s->above_ref_ctx[col] == 1);
-                            }
-                        }
-                    } else if (have_l) {
-                        if (s->left_intra_ctx[row7] ||
-                            (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
-                            c = 2;
-                        } else if (s->left_comp_ctx[row7]) {
-                            c = 3 * (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
-                        } else {
-                            c = 4 * (s->left_ref_ctx[row7] == 1);
-                        }
-                    } else {
-                        c = 2;
-                    }
-                    bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
-                    s->counts.single_ref[c][1][bit]++;
-                    b->ref[0] = 1 + bit;
-                }
-            }
-        }
-
-        if (b->bs <= BS_8x8) {
-            if (s->segmentation.feat[b->seg_id].skip_enabled) {
-                b->mode[0] =
-                b->mode[1] =
-                b->mode[2] =
-                b->mode[3] = ZEROMV;
-            } else {
-                static const uint8_t off[10] = {
-                    3, 0, 0, 1, 0, 0, 0, 0, 0, 0
-                };
-
-                // FIXME this needs to use the LUT tables from find_ref_mvs
-                // because not all are -1,0/0,-1
-                int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
-                                          [s->left_mode_ctx[row7 + off[b->bs]]];
-
-                b->mode[0] = vp8_rac_get_tree(&s->c, ff_vp9_inter_mode_tree,
-                                              s->prob.p.mv_mode[c]);
-                b->mode[1] =
-                b->mode[2] =
-                b->mode[3] = b->mode[0];
-                s->counts.mv_mode[c][b->mode[0] - 10]++;
-            }
-        }
-
-        if (s->filtermode == FILTER_SWITCHABLE) {
-            int c;
-
-            if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
-                if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
-                    c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
-                        s->left_filter_ctx[row7] : 3;
-                } else {
-                    c = s->above_filter_ctx[col];
-                }
-            } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
-                c = s->left_filter_ctx[row7];
-            } else {
-                c = 3;
-            }
-
-            b->filter = vp8_rac_get_tree(&s->c, ff_vp9_filter_tree,
-                                         s->prob.p.filter[c]);
-            s->counts.filter[c][b->filter]++;
-        } else {
-            b->filter = s->filtermode;
-        }
-
-        if (b->bs > BS_8x8) {
-            int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
-
-            b->mode[0] = vp8_rac_get_tree(&s->c, ff_vp9_inter_mode_tree,
-                                          s->prob.p.mv_mode[c]);
-            s->counts.mv_mode[c][b->mode[0] - 10]++;
-            ff_vp9_fill_mv(s, b->mv[0], b->mode[0], 0);
-
-            if (b->bs != BS_8x4) {
-                b->mode[1] = vp8_rac_get_tree(&s->c, ff_vp9_inter_mode_tree,
-                                              s->prob.p.mv_mode[c]);
-                s->counts.mv_mode[c][b->mode[1] - 10]++;
-                ff_vp9_fill_mv(s, b->mv[1], b->mode[1], 1);
-            } else {
-                b->mode[1] = b->mode[0];
-                AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
-                AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
-            }
-
-            if (b->bs != BS_4x8) {
-                b->mode[2] = vp8_rac_get_tree(&s->c, ff_vp9_inter_mode_tree,
-                                              s->prob.p.mv_mode[c]);
-                s->counts.mv_mode[c][b->mode[2] - 10]++;
-                ff_vp9_fill_mv(s, b->mv[2], b->mode[2], 2);
-
-                if (b->bs != BS_8x4) {
-                    b->mode[3] = vp8_rac_get_tree(&s->c, ff_vp9_inter_mode_tree,
-                                                  s->prob.p.mv_mode[c]);
-                    s->counts.mv_mode[c][b->mode[3] - 10]++;
-                    ff_vp9_fill_mv(s, b->mv[3], b->mode[3], 3);
-                } else {
-                    b->mode[3] = b->mode[2];
-                    AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
-                    AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
-                }
-            } else {
-                b->mode[2] = b->mode[0];
-                AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
-                AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
-                b->mode[3] = b->mode[1];
-                AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
-                AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
-            }
-        } else {
-            ff_vp9_fill_mv(s, b->mv[0], b->mode[0], -1);
-            AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
-            AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
-            AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
-            AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
-            AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
-            AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
-        }
-    }
-
-    // FIXME this can probably be optimized
-    memset(&s->above_skip_ctx[col], b->skip, w4);
-    memset(&s->left_skip_ctx[row7], b->skip, h4);
-    memset(&s->above_txfm_ctx[col], b->tx, w4);
-    memset(&s->left_txfm_ctx[row7], b->tx, h4);
-    memset(&s->above_partition_ctx[col], above_ctx[b->bs], w4);
-    memset(&s->left_partition_ctx[row7], left_ctx[b->bs], h4);
-    if (!s->keyframe && !s->intraonly) {
-        memset(&s->above_intra_ctx[col], b->intra, w4);
-        memset(&s->left_intra_ctx[row7], b->intra, h4);
-        memset(&s->above_comp_ctx[col], b->comp, w4);
-        memset(&s->left_comp_ctx[row7], b->comp, h4);
-        memset(&s->above_mode_ctx[col], b->mode[3], w4);
-        memset(&s->left_mode_ctx[row7], b->mode[3], h4);
-        if (s->filtermode == FILTER_SWITCHABLE && !b->intra) {
-            memset(&s->above_filter_ctx[col], b->filter, w4);
-            memset(&s->left_filter_ctx[row7], b->filter, h4);
-            b->filter = ff_vp9_filter_lut[b->filter];
-        }
-        if (b->bs > BS_8x8) {
-            int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
-
-            AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
-            AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
-            AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
-            AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
-            AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
-            AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
-            AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
-            AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
-        } else {
-            int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
-
-            for (n = 0; n < w4 * 2; n++) {
-                AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
-                AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
-            }
-            for (n = 0; n < h4 * 2; n++) {
-                AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
-                AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
-            }
-        }
-
-        if (!b->intra) { // FIXME write 0xff or -1 if intra, so we can use this
-                         // as a direct check in above branches
-            int vref = b->ref[b->comp ? s->signbias[s->varcompref[0]] : 0];
-
-            memset(&s->above_ref_ctx[col], vref, w4);
-            memset(&s->left_ref_ctx[row7], vref, h4);
-        }
-    }
-
-    // FIXME kinda ugly
-    for (y = 0; y < h4; y++) {
-        int x, o = (row + y) * s->sb_cols * 8 + col;
-
-        if (b->intra) {
-            for (x = 0; x < w4; x++) {
-                s->mv[0][o + x].ref[0] =
-                s->mv[0][o + x].ref[1] = -1;
-            }
-        } else if (b->comp) {
-            for (x = 0; x < w4; x++) {
-                s->mv[0][o + x].ref[0] = b->ref[0];
-                s->mv[0][o + x].ref[1] = b->ref[1];
-                AV_COPY32(&s->mv[0][o + x].mv[0], &b->mv[3][0]);
-                AV_COPY32(&s->mv[0][o + x].mv[1], &b->mv[3][1]);
-            }
-        } else {
-            for (x = 0; x < w4; x++) {
-                s->mv[0][o + x].ref[0] = b->ref[0];
-                s->mv[0][o + x].ref[1] = -1;
-                AV_COPY32(&s->mv[0][o + x].mv[0], &b->mv[3][0]);
-            }
-        }
-    }
-}
-
-// FIXME remove tx argument, and merge cnt/eob arguments?
-static int decode_block_coeffs(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
-                               enum TxfmMode tx, unsigned (*cnt)[6][3],
-                               unsigned (*eob)[6][2], uint8_t(*p)[6][11],
-                               int nnz, const int16_t *scan,
-                               const int16_t(*nb)[2],
-                               const int16_t *band_counts, const int16_t *qmul)
-{
-    int i = 0, band = 0, band_left = band_counts[band];
-    uint8_t *tp = p[0][nnz];
-    uint8_t cache[1024];
-
-    do {
-        int val, rc;
-
-        val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
-        eob[band][nnz][val]++;
-        if (!val)
-            break;
-
-skip_eob:
-        if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
-            cnt[band][nnz][0]++;
-            if (!--band_left)
-                band_left = band_counts[++band];
-            cache[scan[i]] = 0;
-            nnz            = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
-            tp             = p[band][nnz];
-            if (++i == n_coeffs)
-                break;  //invalid input; blocks should end with EOB
-            goto skip_eob;
-        }
-
-        rc = scan[i];
-        if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
-            cnt[band][nnz][1]++;
-            val       = 1;
-            cache[rc] = 1;
-        } else {
-            // fill in p[3-10] (model fill) - only once per frame for each pos
-            if (!tp[3])
-                memcpy(&tp[3], ff_vp9_model_pareto8[tp[2]], 8);
-
-            cnt[band][nnz][2]++;
-            if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
-                if (!vp56_rac_get_prob_branchy(c, tp[4])) {
-                    cache[rc] = val = 2;
-                } else {
-                    val       = 3 + vp56_rac_get_prob(c, tp[5]);
-                    cache[rc] = 3;
-                }
-            } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
-                cache[rc] = 4;
-                if (!vp56_rac_get_prob_branchy(c, tp[7])) {
-                    val  =  vp56_rac_get_prob(c, 159) + 5;
-                } else {
-                    val  = (vp56_rac_get_prob(c, 165) << 1) + 7;
-                    val +=  vp56_rac_get_prob(c, 145);
-                }
-            } else { // cat 3-6
-                cache[rc] = 5;
-                if (!vp56_rac_get_prob_branchy(c, tp[8])) {
-                    if (!vp56_rac_get_prob_branchy(c, tp[9])) {
-                        val  = (vp56_rac_get_prob(c, 173) << 2) + 11;
-                        val += (vp56_rac_get_prob(c, 148) << 1);
-                        val +=  vp56_rac_get_prob(c, 140);
-                    } else {
-                        val  = (vp56_rac_get_prob(c, 176) << 3) + 19;
-                        val += (vp56_rac_get_prob(c, 155) << 2);
-                        val += (vp56_rac_get_prob(c, 140) << 1);
-                        val +=  vp56_rac_get_prob(c, 135);
-                    }
-                } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
-                    val  = (vp56_rac_get_prob(c, 180) << 4) + 35;
-                    val += (vp56_rac_get_prob(c, 157) << 3);
-                    val += (vp56_rac_get_prob(c, 141) << 2);
-                    val += (vp56_rac_get_prob(c, 134) << 1);
-                    val +=  vp56_rac_get_prob(c, 130);
-                } else {
-                    val  = (vp56_rac_get_prob(c, 254) << 13) + 67;
-                    val += (vp56_rac_get_prob(c, 254) << 12);
-                    val += (vp56_rac_get_prob(c, 254) << 11);
-                    val += (vp56_rac_get_prob(c, 252) << 10);
-                    val += (vp56_rac_get_prob(c, 249) << 9);
-                    val += (vp56_rac_get_prob(c, 243) << 8);
-                    val += (vp56_rac_get_prob(c, 230) << 7);
-                    val += (vp56_rac_get_prob(c, 196) << 6);
-                    val += (vp56_rac_get_prob(c, 177) << 5);
-                    val += (vp56_rac_get_prob(c, 153) << 4);
-                    val += (vp56_rac_get_prob(c, 140) << 3);
-                    val += (vp56_rac_get_prob(c, 133) << 2);
-                    val += (vp56_rac_get_prob(c, 130) << 1);
-                    val +=  vp56_rac_get_prob(c, 129);
-                }
-            }
-        }
-        if (!--band_left)
-            band_left = band_counts[++band];
-        if (tx == TX_32X32) // FIXME slow
-            coef[rc] = ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2;
-        else
-            coef[rc] = (vp8_rac_get(c) ? -val : val) * qmul[!!i];
-        nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
-        tp  = p[band][nnz];
-    } while (++i < n_coeffs);
-
-    return i;
-}
-
-static int decode_coeffs(AVCodecContext *avctx)
-{
-    VP9Context *s = avctx->priv_data;
-    VP9Block *const b = &s->b;
-    int row = b->row, col = b->col;
-    uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
-    unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
-    unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
-    int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
-    int end_x = FFMIN(2 * (s->cols - col), w4);
-    int end_y = FFMIN(2 * (s->rows - row), h4);
-    int n, pl, x, y, step1d = 1 << b->tx, step = 1 << (b->tx * 2);
-    int uvstep1d = 1 << b->uvtx, uvstep = 1 << (b->uvtx * 2), ret;
-    int16_t (*qmul)[2] = s->segmentation.feat[b->seg_id].qmul;
-    int tx = 4 * s->lossless + b->tx;
-    const int16_t **yscans = ff_vp9_scans[tx];
-    const int16_t (**ynbs)[2] = ff_vp9_scans_nb[tx];
-    const int16_t *uvscan = ff_vp9_scans[b->uvtx][DCT_DCT];
-    const int16_t (*uvnb)[2] = ff_vp9_scans_nb[b->uvtx][DCT_DCT];
-    uint8_t *a = &s->above_y_nnz_ctx[col * 2];
-    uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
-    static const int16_t band_counts[4][8] = {
-        { 1, 2, 3, 4,  3,   16 - 13, 0 },
-        { 1, 2, 3, 4, 11,   64 - 21, 0 },
-        { 1, 2, 3, 4, 11,  256 - 21, 0 },
-        { 1, 2, 3, 4, 11, 1024 - 21, 0 },
-    };
-    const int16_t *y_band_counts  = band_counts[b->tx];
-    const int16_t *uv_band_counts = band_counts[b->uvtx];
-
-    /* y tokens */
-    if (b->tx > TX_4X4) { // FIXME slow
-        for (y = 0; y < end_y; y += step1d)
-            for (x = 1; x < step1d; x++)
-                l[y] |= l[y + x];
-        for (x = 0; x < end_x; x += step1d)
-            for (y = 1; y < step1d; y++)
-                a[x] |= a[x + y];
-    }
-    for (n = 0, y = 0; y < end_y; y += step1d) {
-        for (x = 0; x < end_x; x += step1d, n += step) {
-            enum TxfmType txtp = ff_vp9_intra_txfm_type[b->mode[b->tx == TX_4X4 &&
-                                                                b->bs > BS_8x8 ?
-                                                                n : 0]];
-            int nnz = a[x] + l[y];
-            if ((ret = decode_block_coeffs(&s->c, s->block + 16 * n, 16 * step,
-                                           b->tx, c, e, p, nnz, yscans[txtp],
-                                           ynbs[txtp], y_band_counts,
-                                           qmul[0])) < 0)
-                return ret;
-            a[x] = l[y] = !!ret;
-            if (b->tx > TX_8X8)
-                AV_WN16A(&s->eob[n], ret);
-            else
-                s->eob[n] = ret;
-        }
-    }
-    if (b->tx > TX_4X4) { // FIXME slow
-        for (y = 0; y < end_y; y += step1d)
-            memset(&l[y + 1], l[y], FFMIN(end_y - y - 1, step1d - 1));
-        for (x = 0; x < end_x; x += step1d)
-            memset(&a[x + 1], a[x], FFMIN(end_x - x - 1, step1d - 1));
-    }
-
-    p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
-    c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
-    e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
-    w4    >>= 1;
-    h4    >>= 1;
-    end_x >>= 1;
-    end_y >>= 1;
-    for (pl = 0; pl < 2; pl++) {
-        a = &s->above_uv_nnz_ctx[pl][col];
-        l = &s->left_uv_nnz_ctx[pl][row & 7];
-        if (b->uvtx > TX_4X4) { // FIXME slow
-            for (y = 0; y < end_y; y += uvstep1d)
-                for (x = 1; x < uvstep1d; x++)
-                    l[y] |= l[y + x];
-            for (x = 0; x < end_x; x += uvstep1d)
-                for (y = 1; y < uvstep1d; y++)
-                    a[x] |= a[x + y];
-        }
-        for (n = 0, y = 0; y < end_y; y += uvstep1d) {
-            for (x = 0; x < end_x; x += uvstep1d, n += uvstep) {
-                int nnz = a[x] + l[y];
-                if ((ret = decode_block_coeffs(&s->c, s->uvblock[pl] + 16 * n,
-                                               16 * uvstep, b->uvtx, c, e, p,
-                                               nnz, uvscan, uvnb,
-                                               uv_band_counts, qmul[1])) < 0)
-                    return ret;
-                a[x] = l[y] = !!ret;
-                if (b->uvtx > TX_8X8)
-                    AV_WN16A(&s->uveob[pl][n], ret);
-                else
-                    s->uveob[pl][n] = ret;
-            }
-        }
-        if (b->uvtx > TX_4X4) { // FIXME slow
-            for (y = 0; y < end_y; y += uvstep1d)
-                memset(&l[y + 1], l[y], FFMIN(end_y - y - 1, uvstep1d - 1));
-            for (x = 0; x < end_x; x += uvstep1d)
-                memset(&a[x + 1], a[x], FFMIN(end_x - x - 1, uvstep1d - 1));
-        }
-    }
-
-    return 0;
-}
-
-static av_always_inline int check_intra_mode(VP9Context *s, int mode,
-                                             uint8_t **a,
-                                             uint8_t *dst_edge,
-                                             ptrdiff_t stride_edge,
-                                             uint8_t *dst_inner,
-                                             ptrdiff_t stride_inner,
-                                             uint8_t *l, int col, int x, int w,
-                                             int row, int y, enum TxfmMode tx,
-                                             int p)
-{
-    int have_top   = row > 0 || y > 0;
-    int have_left  = col > s->tiling.tile_col_start || x > 0;
-    int have_right = x < w - 1;
-    static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
-        [VERT_PRED]            = { { DC_127_PRED,          VERT_PRED            },
-                                   { DC_127_PRED,          VERT_PRED            } },
-        [HOR_PRED]             = { { DC_129_PRED,          DC_129_PRED          },
-                                   { HOR_PRED,             HOR_PRED             } },
-        [DC_PRED]              = { { DC_128_PRED,          TOP_DC_PRED          },
-                                   { LEFT_DC_PRED,         DC_PRED              } },
-        [DIAG_DOWN_LEFT_PRED]  = { { DC_127_PRED,          DIAG_DOWN_LEFT_PRED  },
-                                   { DC_127_PRED,          DIAG_DOWN_LEFT_PRED  } },
-        [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
-                                   { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
-        [VERT_RIGHT_PRED]      = { { VERT_RIGHT_PRED,      VERT_RIGHT_PRED      },
-                                   { VERT_RIGHT_PRED,      VERT_RIGHT_PRED      } },
-        [HOR_DOWN_PRED]        = { { HOR_DOWN_PRED,        HOR_DOWN_PRED        },
-                                   { HOR_DOWN_PRED,        HOR_DOWN_PRED        } },
-        [VERT_LEFT_PRED]       = { { DC_127_PRED,          VERT_LEFT_PRED       },
-                                   { DC_127_PRED,          VERT_LEFT_PRED       } },
-        [HOR_UP_PRED]          = { { DC_129_PRED,          DC_129_PRED          },
-                                   { HOR_UP_PRED,          HOR_UP_PRED          } },
-        [TM_VP8_PRED]          = { { DC_129_PRED,          VERT_PRED            },
-                                   { HOR_PRED,             TM_VP8_PRED          } },
-    };
-    static const struct {
-        uint8_t needs_left:1;
-        uint8_t needs_top:1;
-        uint8_t needs_topleft:1;
-        uint8_t needs_topright:1;
-    } edges[N_INTRA_PRED_MODES] = {
-        [VERT_PRED]            = { .needs_top  = 1 },
-        [HOR_PRED]             = { .needs_left = 1 },
-        [DC_PRED]              = { .needs_top  = 1, .needs_left = 1 },
-        [DIAG_DOWN_LEFT_PRED]  = { .needs_top  = 1, .needs_topright = 1 },
-        [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1,
-                                   .needs_topleft = 1 },
-        [VERT_RIGHT_PRED]      = { .needs_left = 1, .needs_top = 1,
-                                   .needs_topleft = 1 },
-        [HOR_DOWN_PRED]        = { .needs_left = 1, .needs_top = 1,
-                                   .needs_topleft = 1 },
-        [VERT_LEFT_PRED]       = { .needs_top  = 1, .needs_topright = 1 },
-        [HOR_UP_PRED]          = { .needs_left = 1 },
-        [TM_VP8_PRED]          = { .needs_left = 1, .needs_top = 1,
-                                   .needs_topleft = 1 },
-        [LEFT_DC_PRED]         = { .needs_left = 1 },
-        [TOP_DC_PRED]          = { .needs_top  = 1 },
-        [DC_128_PRED]          = { 0 },
-        [DC_127_PRED]          = { 0 },
-        [DC_129_PRED]          = { 0 }
-    };
-
-    av_assert2(mode >= 0 && mode < 10);
-    mode = mode_conv[mode][have_left][have_top];
-    if (edges[mode].needs_top) {
-        uint8_t *top = NULL, *topleft = NULL;
-        int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !p) - x) * 4;
-        int n_px_need_tr = 0;
-
-        if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
-            n_px_need_tr = 4;
-
-        // if top of sb64-row, use s->intra_pred_data[] instead of
-        // dst[-stride] for intra prediction (it contains pre- instead of
-        // post-loopfilter data)
-        if (have_top) {
-            top = !(row & 7) && !y ?
-                  s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
-                  y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
-            if (have_left)
-                topleft = !(row & 7) && !y ?
-                          s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
-                          y == 0 || x == 0 ? &dst_edge[-stride_edge] :
-                          &dst_inner[-stride_inner];
-        }
-
-        if (have_top &&
-            (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
-            (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
-            n_px_need + n_px_need_tr <= n_px_have) {
-            *a = top;
-        } else {
-            if (have_top) {
-                if (n_px_need <= n_px_have) {
-                    memcpy(*a, top, n_px_need);
-                } else {
-                    memcpy(*a, top, n_px_have);
-                    memset(&(*a)[n_px_have], (*a)[n_px_have - 1],
-                           n_px_need - n_px_have);
-                }
-            } else {
-                memset(*a, 127, n_px_need);
-            }
-            if (edges[mode].needs_topleft) {
-                if (have_left && have_top)
-                    (*a)[-1] = topleft[-1];
-                else
-                    (*a)[-1] = have_top ? 129 : 127;
-            }
-            if (tx == TX_4X4 && edges[mode].needs_topright) {
-                if (have_top && have_right &&
-                    n_px_need + n_px_need_tr <= n_px_have) {
-                    memcpy(&(*a)[4], &top[4], 4);
-                } else {
-                    memset(&(*a)[4], (*a)[3], 4);
-                }
-            }
-        }
-    }
-    if (edges[mode].needs_left) {
-        if (have_left) {
-            int i;
-            int n_px_need = 4 << tx;
-            int n_px_have = (((s->rows - row) << !p) - y) * 4;
-            uint8_t *dst     = x == 0 ? dst_edge : dst_inner;
-            ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
-
-            if (n_px_need <= n_px_have) {
-                for (i = 0; i < n_px_need; i++)
-                    l[i] = dst[i * stride - 1];
-            } else {
-                for (i = 0; i < n_px_have; i++)
-                    l[i] = dst[i * stride - 1];
-                memset(&l[i], l[i - 1], n_px_need - n_px_have);
-            }
-        } else {
-            memset(l, 129, 4 << tx);
-        }
-    }
-
-    return mode;
-}
-
-static void intra_recon(AVCodecContext *avctx, ptrdiff_t y_off, ptrdiff_t uv_off)
-{
-    VP9Context *s = avctx->priv_data;
-    VP9Block *const b = &s->b;
-    int row = b->row, col = b->col;
-    int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
-    int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
-    int end_x = FFMIN(2 * (s->cols - col), w4);
-    int end_y = FFMIN(2 * (s->rows - row), h4);
-    int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
-    int uvstep1d = 1 << b->uvtx, p;
-    uint8_t *dst = b->dst[0], *dst_r = s->cur_frame->data[0] + y_off;
-
-    for (n = 0, y = 0; y < end_y; y += step1d) {
-        uint8_t *ptr = dst, *ptr_r = dst_r;
-        for (x = 0; x < end_x;
-             x += step1d, ptr += 4 * step1d, ptr_r += 4 * step1d, n += step) {
-            int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
-                               y * 2 + x : 0];
-            LOCAL_ALIGNED_16(uint8_t, a_buf, [48]);
-            uint8_t *a = &a_buf[16], l[32];
-            enum TxfmType txtp = ff_vp9_intra_txfm_type[mode];
-            int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
-
-            mode = check_intra_mode(s, mode, &a, ptr_r,
-                                    s->cur_frame->linesize[0],
-                                    ptr, b->y_stride, l,
-                                    col, x, w4, row, y, b->tx, 0);
-            s->dsp.intra_pred[b->tx][mode](ptr, b->y_stride, l, a);
-            if (eob)
-                s->dsp.itxfm_add[tx][txtp](ptr, b->y_stride,
-                                           s->block + 16 * n, eob);
-        }
-        dst_r += 4 * s->cur_frame->linesize[0] * step1d;
-        dst   += 4 * b->y_stride * step1d;
-    }
-
-    // U/V
-    h4    >>= 1;
-    w4    >>= 1;
-    end_x >>= 1;
-    end_y >>= 1;
-    step    = 1 << (b->uvtx * 2);
-    for (p = 0; p < 2; p++) {
-        dst   = b->dst[1 + p];
-        dst_r = s->cur_frame->data[1 + p] + uv_off;
-        for (n = 0, y = 0; y < end_y; y += uvstep1d) {
-            uint8_t *ptr = dst, *ptr_r = dst_r;
-            for (x = 0; x < end_x;
-                 x += uvstep1d, ptr += 4 * uvstep1d,
-                 ptr_r += 4 * uvstep1d, n += step) {
-                int mode = b->uvmode;
-                LOCAL_ALIGNED_16(uint8_t, a_buf, [48]);
-                uint8_t *a = &a_buf[16], l[32];
-                int eob    = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n])
-                                              : s->uveob[p][n];
-
-                mode = check_intra_mode(s, mode, &a, ptr_r,
-                                        s->cur_frame->linesize[1],
-                                        ptr, b->uv_stride, l,
-                                        col, x, w4, row, y, b->uvtx, p + 1);
-                s->dsp.intra_pred[b->uvtx][mode](ptr, b->uv_stride, l, a);
-                if (eob)
-                    s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, b->uv_stride,
-                                                    s->uvblock[p] + 16 * n,
-                                                    eob);
-            }
-            dst_r += 4 * uvstep1d * s->cur_frame->linesize[1];
-            dst   += 4 * uvstep1d * b->uv_stride;
-        }
-    }
-}
-
-static av_always_inline void mc_luma_dir(VP9Context *s, vp9_mc_func(*mc)[2],
-                                         uint8_t *dst, ptrdiff_t dst_stride,
-                                         const uint8_t *ref,
-                                         ptrdiff_t ref_stride,
-                                         ptrdiff_t y, ptrdiff_t x,
-                                         const VP56mv *mv,
-                                         int bw, int bh, int w, int h)
-{
-    int mx = mv->x, my = mv->y;
-
-    y   += my >> 3;
-    x   += mx >> 3;
-    ref += y * ref_stride + x;
-    mx  &= 7;
-    my  &= 7;
-    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
-    if (x < !!mx * 3 || y < !!my * 3 ||
-        x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
-        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
-                                 ref - !!my * 3 * ref_stride - !!mx * 3,
-                                 80,
-                                 ref_stride,
-                                 bw + !!mx * 7, bh + !!my * 7,
-                                 x - !!mx * 3, y - !!my * 3, w, h);
-        ref        = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
-        ref_stride = 80;
-    }
-    mc[!!mx][!!my](dst, ref, dst_stride, ref_stride, bh, mx << 1, my << 1);
-}
-
-static av_always_inline void mc_chroma_dir(VP9Context *s, vp9_mc_func(*mc)[2],
-                                           uint8_t *dst_u, uint8_t *dst_v,
-                                           ptrdiff_t dst_stride,
-                                           const uint8_t *ref_u,
-                                           ptrdiff_t src_stride_u,
-                                           const uint8_t *ref_v,
-                                           ptrdiff_t src_stride_v,
-                                           ptrdiff_t y, ptrdiff_t x,
-                                           const VP56mv *mv,
-                                           int bw, int bh, int w, int h)
-{
-    int mx = mv->x, my = mv->y;
-
-    y     += my >> 4;
-    x     += mx >> 4;
-    ref_u += y * src_stride_u + x;
-    ref_v += y * src_stride_v + x;
-    mx    &= 15;
-    my    &= 15;
-    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
-    if (x < !!mx * 3 || y < !!my * 3 ||
-        x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
-        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
-                                 ref_u - !!my * 3 * src_stride_u - !!mx * 3,
-                                 80,
-                                 src_stride_u,
-                                 bw + !!mx * 7, bh + !!my * 7,
-                                 x - !!mx * 3, y - !!my * 3, w, h);
-        ref_u = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
-        mc[!!mx][!!my](dst_u, ref_u, dst_stride, 80, bh, mx, my);
-
-        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
-                                 ref_v - !!my * 3 * src_stride_v - !!mx * 3,
-                                 80,
-                                 src_stride_v,
-                                 bw + !!mx * 7, bh + !!my * 7,
-                                 x - !!mx * 3, y - !!my * 3, w, h);
-        ref_v = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
-        mc[!!mx][!!my](dst_v, ref_v, dst_stride, 80, bh, mx, my);
-    } else {
-        mc[!!mx][!!my](dst_u, ref_u, dst_stride, src_stride_u, bh, mx, my);
-        mc[!!mx][!!my](dst_v, ref_v, dst_stride, src_stride_v, bh, mx, my);
-    }
-}
-
-static int inter_recon(AVCodecContext *avctx)
-{
-    static const uint8_t bwlog_tab[2][N_BS_SIZES] = {
-        { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 },
-        { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 },
-    };
-    VP9Context *s = avctx->priv_data;
-    VP9Block *const b = &s->b;
-    int row = b->row, col = b->col;
-    AVFrame *ref1 = s->refs[s->refidx[b->ref[0]]];
-    AVFrame *ref2 = b->comp ? s->refs[s->refidx[b->ref[1]]] : NULL;
-    int w = avctx->width, h = avctx->height;
-    ptrdiff_t ls_y = b->y_stride, ls_uv = b->uv_stride;
-
-    if (!ref1->data[0] || (b->comp && !ref2->data[0]))
-        return AVERROR_INVALIDDATA;
-
-    // y inter pred
-    if (b->bs > BS_8x8) {
-        if (b->bs == BS_8x4) {
-            mc_luma_dir(s, s->dsp.mc[3][b->filter][0], b->dst[0], ls_y,
-                        ref1->data[0], ref1->linesize[0],
-                        row << 3, col << 3, &b->mv[0][0], 8, 4, w, h);
-            mc_luma_dir(s, s->dsp.mc[3][b->filter][0],
-                        b->dst[0] + 4 * ls_y, ls_y,
-                        ref1->data[0], ref1->linesize[0],
-                        (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w, h);
-
-            if (b->comp) {
-                mc_luma_dir(s, s->dsp.mc[3][b->filter][1], b->dst[0], ls_y,
-                            ref2->data[0], ref2->linesize[0],
-                            row << 3, col << 3, &b->mv[0][1], 8, 4, w, h);
-                mc_luma_dir(s, s->dsp.mc[3][b->filter][1],
-                            b->dst[0] + 4 * ls_y, ls_y,
-                            ref2->data[0], ref2->linesize[0],
-                            (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w, h);
-            }
-        } else if (b->bs == BS_4x8) {
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], b->dst[0], ls_y,
-                        ref1->data[0], ref1->linesize[0],
-                        row << 3, col << 3, &b->mv[0][0], 4, 8, w, h);
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], b->dst[0] + 4, ls_y,
-                        ref1->data[0], ref1->linesize[0],
-                        row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w, h);
-
-            if (b->comp) {
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], b->dst[0], ls_y,
-                            ref2->data[0], ref2->linesize[0],
-                            row << 3, col << 3, &b->mv[0][1], 4, 8, w, h);
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], b->dst[0] + 4, ls_y,
-                            ref2->data[0], ref2->linesize[0],
-                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w, h);
-            }
-        } else {
-            av_assert2(b->bs == BS_4x4);
-
-            // FIXME if two horizontally adjacent blocks have the same MV,
-            // do a w8 instead of a w4 call
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], b->dst[0], ls_y,
-                        ref1->data[0], ref1->linesize[0],
-                        row << 3, col << 3, &b->mv[0][0], 4, 4, w, h);
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], b->dst[0] + 4, ls_y,
-                        ref1->data[0], ref1->linesize[0],
-                        row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w, h);
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
-                        b->dst[0] + 4 * ls_y, ls_y,
-                        ref1->data[0], ref1->linesize[0],
-                        (row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w, h);
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
-                        b->dst[0] + 4 * ls_y + 4, ls_y,
-                        ref1->data[0], ref1->linesize[0],
-                        (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w, h);
-
-            if (b->comp) {
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], b->dst[0], ls_y,
-                            ref2->data[0], ref2->linesize[0],
-                            row << 3, col << 3, &b->mv[0][1], 4, 4, w, h);
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], b->dst[0] + 4, ls_y,
-                            ref2->data[0], ref2->linesize[0],
-                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w, h);
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
-                            b->dst[0] + 4 * ls_y, ls_y,
-                            ref2->data[0], ref2->linesize[0],
-                            (row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w, h);
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
-                            b->dst[0] + 4 * ls_y + 4, ls_y,
-                            ref2->data[0], ref2->linesize[0],
-                            (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w, h);
-            }
-        }
-    } else {
-        int bwl = bwlog_tab[0][b->bs];
-        int bw  = bwh_tab[0][b->bs][0] * 4;
-        int bh  = bwh_tab[0][b->bs][1] * 4;
-
-        mc_luma_dir(s, s->dsp.mc[bwl][b->filter][0], b->dst[0], ls_y,
-                    ref1->data[0], ref1->linesize[0],
-                    row << 3, col << 3, &b->mv[0][0], bw, bh, w, h);
-
-        if (b->comp)
-            mc_luma_dir(s, s->dsp.mc[bwl][b->filter][1], b->dst[0], ls_y,
-                        ref2->data[0], ref2->linesize[0],
-                        row << 3, col << 3, &b->mv[0][1], bw, bh, w, h);
-    }
-
-    // uv inter pred
-    {
-        int bwl = bwlog_tab[1][b->bs];
-        int bw  = bwh_tab[1][b->bs][0] * 4, bh = bwh_tab[1][b->bs][1] * 4;
-        VP56mv mvuv;
-
-        w = (w + 1) >> 1;
-        h = (h + 1) >> 1;
-        if (b->bs > BS_8x8) {
-            mvuv.x = ROUNDED_DIV(b->mv[0][0].x + b->mv[1][0].x +
-                                 b->mv[2][0].x + b->mv[3][0].x, 4);
-            mvuv.y = ROUNDED_DIV(b->mv[0][0].y + b->mv[1][0].y +
-                                 b->mv[2][0].y + b->mv[3][0].y, 4);
-        } else {
-            mvuv = b->mv[0][0];
-        }
-
-        mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][0],
-                      b->dst[1], b->dst[2], ls_uv,
-                      ref1->data[1], ref1->linesize[1],
-                      ref1->data[2], ref1->linesize[2],
-                      row << 2, col << 2, &mvuv, bw, bh, w, h);
-
-        if (b->comp) {
-            if (b->bs > BS_8x8) {
-                mvuv.x = ROUNDED_DIV(b->mv[0][1].x + b->mv[1][1].x +
-                                     b->mv[2][1].x + b->mv[3][1].x, 4);
-                mvuv.y = ROUNDED_DIV(b->mv[0][1].y + b->mv[1][1].y +
-                                     b->mv[2][1].y + b->mv[3][1].y, 4);
-            } else {
-                mvuv = b->mv[0][1];
-            }
-            mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][1],
-                          b->dst[1], b->dst[2], ls_uv,
-                          ref2->data[1], ref2->linesize[1],
-                          ref2->data[2], ref2->linesize[2],
-                          row << 2, col << 2, &mvuv, bw, bh, w, h);
-        }
-    }
-
-    if (!b->skip) {
-        /* mostly copied intra_reconn() */
-
-        int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
-        int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
-        int end_x = FFMIN(2 * (s->cols - col), w4);
-        int end_y = FFMIN(2 * (s->rows - row), h4);
-        int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
-        int uvstep1d = 1 << b->uvtx, p;
-        uint8_t *dst = b->dst[0];
-
-        // y itxfm add
-        for (n = 0, y = 0; y < end_y; y += step1d) {
-            uint8_t *ptr = dst;
-            for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d, n += step) {
-                int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
-
-                if (eob)
-                    s->dsp.itxfm_add[tx][DCT_DCT](ptr, b->y_stride,
-                                                  s->block + 16 * n, eob);
-            }
-            dst += 4 * b->y_stride * step1d;
-        }
-
-        // uv itxfm add
-        h4    >>= 1;
-        w4    >>= 1;
-        end_x >>= 1;
-        end_y >>= 1;
-        step    = 1 << (b->uvtx * 2);
-        for (p = 0; p < 2; p++) {
-            dst = b->dst[p + 1];
-            for (n = 0, y = 0; y < end_y; y += uvstep1d) {
-                uint8_t *ptr = dst;
-                for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d, n += step) {
-                    int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n])
-                                               : s->uveob[p][n];
-                    if (eob)
-                        s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, b->uv_stride,
-                                                        s->uvblock[p] + 16 * n, eob);
-                }
-                dst += 4 * uvstep1d * b->uv_stride;
-            }
-        }
-    }
-    return 0;
-}
-
-static av_always_inline void mask_edges(VP9Filter *lflvl, int is_uv,
-                                        int row_and_7, int col_and_7,
-                                        int w, int h, int col_end, int row_end,
-                                        enum TxfmMode tx, int skip_inter)
-{
-    // FIXME I'm pretty sure all loops can be replaced by a single LUT if
-    // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
-    // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
-    // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
-
-    // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
-    // edges. This means that for UV, we work on two subsampled blocks at
-    // a time, and we only use the topleft block's mode information to set
-    // things like block strength. Thus, for any block size smaller than
-    // 16x16, ignore the odd portion of the block.
-    if (tx == TX_4X4 && is_uv) {
-        if (h == 1) {
-            if (row_and_7 & 1)
-                return;
-            if (!row_end)
-                h += 1;
-        }
-        if (w == 1) {
-            if (col_and_7 & 1)
-                return;
-            if (!col_end)
-                w += 1;
-        }
-    }
-
-    if (tx == TX_4X4 && !skip_inter) {
-        int t = 1 << col_and_7, m_col = (t << w) - t, y;
-        int m_col_odd = (t << (w - 1)) - t;
-
-        // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
-        if (is_uv) {
-            int m_row_8 = m_col & 0x01, m_row_4 = m_col - m_row_8;
-
-            for (y = row_and_7; y < h + row_and_7; y++) {
-                int col_mask_id = 2 - !(y & 7);
-
-                lflvl->mask[is_uv][0][y][1] |= m_row_8;
-                lflvl->mask[is_uv][0][y][2] |= m_row_4;
-                // for odd lines, if the odd col is not being filtered,
-                // skip odd row also:
-                // .---. <-- a
-                // |   |
-                // |___| <-- b
-                // ^   ^
-                // c   d
-                //
-                // if a/c are even row/col and b/d are odd, and d is skipped,
-                // e.g. right edge of size-66x66.webm, then skip b also (bug)
-                if ((col_end & 1) && (y & 1)) {
-                    lflvl->mask[is_uv][1][y][col_mask_id] |= m_col_odd;
-                } else {
-                    lflvl->mask[is_uv][1][y][col_mask_id] |= m_col;
-                }
-            }
-        } else {
-            int m_row_8 = m_col & 0x11, m_row_4 = m_col - m_row_8;
-
-            for (y = row_and_7; y < h + row_and_7; y++) {
-                int col_mask_id = 2 - !(y & 3);
-
-                lflvl->mask[is_uv][0][y][1]           |= m_row_8; // row edge
-                lflvl->mask[is_uv][0][y][2]           |= m_row_4;
-                lflvl->mask[is_uv][1][y][col_mask_id] |= m_col; // col edge
-                lflvl->mask[is_uv][0][y][3]           |= m_col;
-                lflvl->mask[is_uv][1][y][3]           |= m_col;
-            }
-        }
-    } else {
-        int y, t = 1 << col_and_7, m_col = (t << w) - t;
-
-        if (!skip_inter) {
-            int mask_id = (tx == TX_8X8);
-            int l2 = tx + is_uv - 1, step1d = 1 << l2;
-            static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
-            int m_row = m_col & masks[l2];
-
-            // at odd UV col/row edges tx16/tx32 loopfilter edges, force
-            // 8wd loopfilter to prevent going off the visible edge.
-            if (is_uv && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
-                int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
-                int m_row_8  = m_row - m_row_16;
-
-                for (y = row_and_7; y < h + row_and_7; y++) {
-                    lflvl->mask[is_uv][0][y][0] |= m_row_16;
-                    lflvl->mask[is_uv][0][y][1] |= m_row_8;
-                }
-            } else {
-                for (y = row_and_7; y < h + row_and_7; y++)
-                    lflvl->mask[is_uv][0][y][mask_id] |= m_row;
-            }
-
-            if (is_uv && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
-                for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
-                    lflvl->mask[is_uv][1][y][0] |= m_col;
-                if (y - row_and_7 == h - 1)
-                    lflvl->mask[is_uv][1][y][1] |= m_col;
-            } else {
-                for (y = row_and_7; y < h + row_and_7; y += step1d)
-                    lflvl->mask[is_uv][1][y][mask_id] |= m_col;
-            }
-        } else if (tx != TX_4X4) {
-            int mask_id;
-
-            mask_id = (tx == TX_8X8) || (is_uv && h == 1);
-            lflvl->mask[is_uv][1][row_and_7][mask_id] |= m_col;
-            mask_id = (tx == TX_8X8) || (is_uv && w == 1);
-            for (y = row_and_7; y < h + row_and_7; y++)
-                lflvl->mask[is_uv][0][y][mask_id] |= t;
-        } else if (is_uv) {
-            int t8 = t & 0x01, t4 = t - t8;
-
-            for (y = row_and_7; y < h + row_and_7; y++) {
-                lflvl->mask[is_uv][0][y][2] |= t4;
-                lflvl->mask[is_uv][0][y][1] |= t8;
-            }
-            lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 7)] |= m_col;
-        } else {
-            int t8 = t & 0x11, t4 = t - t8;
-
-            for (y = row_and_7; y < h + row_and_7; y++) {
-                lflvl->mask[is_uv][0][y][2] |= t4;
-                lflvl->mask[is_uv][0][y][1] |= t8;
-            }
-            lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 3)] |= m_col;
-        }
-    }
-}
-
-int ff_vp9_decode_block(AVCodecContext *avctx, int row, int col,
-                        VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
-                        enum BlockLevel bl, enum BlockPartition bp)
-{
-    VP9Context *s = avctx->priv_data;
-    VP9Block *const b = &s->b;
-    enum BlockSize bs = bl * 3 + bp;
-    int ret, y, w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
-    int emu[2];
-
-    b->row  = row;
-    b->row7 = row & 7;
-    b->col  = col;
-    b->col7 = col & 7;
-
-    s->min_mv.x = -(128 + col * 64);
-    s->min_mv.y = -(128 + row * 64);
-    s->max_mv.x = 128 + (s->cols - col - w4) * 64;
-    s->max_mv.y = 128 + (s->rows - row - h4) * 64;
-
-    b->bs = bs;
-    decode_mode(s, b);
-    b->uvtx = b->tx - (w4 * 2 == (1 << b->tx) || h4 * 2 == (1 << b->tx));
-
-    if (!b->skip) {
-        if ((ret = decode_coeffs(avctx)) < 0)
-            return ret;
-    } else {
-        int pl;
-
-        memset(&s->above_y_nnz_ctx[col * 2], 0, w4 * 2);
-        memset(&s->left_y_nnz_ctx[(row & 7) << 1], 0, h4 * 2);
-        for (pl = 0; pl < 2; pl++) {
-            memset(&s->above_uv_nnz_ctx[pl][col], 0, w4);
-            memset(&s->left_uv_nnz_ctx[pl][row & 7], 0, h4);
-        }
-    }
-
-    /* Emulated overhangs if the stride of the target buffer can't hold.
-     * This allows to support emu-edge and so on even if we have large
-     * block overhangs. */
-    emu[0] = (col + w4) * 8 > s->cur_frame->linesize[0] ||
-             (row + h4) > s->rows;
-    emu[1] = (col + w4) * 4 > s->cur_frame->linesize[1] ||
-             (row + h4) > s->rows;
-    if (emu[0]) {
-        b->dst[0]   = s->tmp_y;
-        b->y_stride = 64;
-    } else {
-        b->dst[0]   = s->cur_frame->data[0] + yoff;
-        b->y_stride = s->cur_frame->linesize[0];
-    }
-    if (emu[1]) {
-        b->dst[1]    = s->tmp_uv[0];
-        b->dst[2]    = s->tmp_uv[1];
-        b->uv_stride = 32;
-    } else {
-        b->dst[1]    = s->cur_frame->data[1] + uvoff;
-        b->dst[2]    = s->cur_frame->data[2] + uvoff;
-        b->uv_stride = s->cur_frame->linesize[1];
-    }
-    if (b->intra) {
-        intra_recon(avctx, yoff, uvoff);
-    } else {
-        if ((ret = inter_recon(avctx)) < 0)
-            return ret;
-    }
-    if (emu[0]) {
-        int w = FFMIN(s->cols - col, w4) * 8;
-        int h = FFMIN(s->rows - row, h4) * 8;
-        int n, o = 0;
-
-        for (n = 0; o < w; n++) {
-            int bw = 64 >> n;
-
-            av_assert2(n <= 4);
-            if (w & bw) {
-                s->dsp.mc[n][0][0][0][0](s->cur_frame->data[0] + yoff + o,
-                                         s->tmp_y + o,
-                                         s->cur_frame->linesize[0],
-                                         64, h, 0, 0);
-                o += bw;
-            }
-        }
-    }
-    if (emu[1]) {
-        int w = FFMIN(s->cols - col, w4) * 4;
-        int h = FFMIN(s->rows - row, h4) * 4;
-        int n, o = 0;
-
-        for (n = 1; o < w; n++) {
-            int bw = 64 >> n;
-
-            av_assert2(n <= 4);
-            if (w & bw) {
-                s->dsp.mc[n][0][0][0][0](s->cur_frame->data[1] + uvoff + o,
-                                         s->tmp_uv[0] + o,
-                                         s->cur_frame->linesize[1],
-                                         32, h, 0, 0);
-                s->dsp.mc[n][0][0][0][0](s->cur_frame->data[2] + uvoff + o,
-                                         s->tmp_uv[1] + o,
-                                         s->cur_frame->linesize[2],
-                                         32, h, 0, 0);
-                o += bw;
-            }
-        }
-    }
-
-    // pick filter level and find edges to apply filter to
-    if (s->filter.level &&
-        (lvl = s->segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
-                                                    [b->mode[3] != ZEROMV]) > 0) {
-        int x_end = FFMIN(s->cols - col, w4);
-        int y_end = FFMIN(s->rows - row, h4);
-        int skip_inter = !b->intra && b->skip;
-
-        for (y = 0; y < h4; y++)
-            memset(&lflvl->level[((row & 7) + y) * 8 + (col & 7)], lvl, w4);
-        mask_edges(lflvl, 0, row & 7, col & 7, x_end, y_end, 0, 0, b->tx, skip_inter);
-        mask_edges(lflvl, 1, row & 7, col & 7, x_end, y_end,
-                   s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
-                   s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
-                   b->uvtx, skip_inter);
-
-        if (!s->filter.lim_lut[lvl]) {
-            int sharp = s->filter.sharpness;
-            int limit = lvl;
-
-            if (sharp > 0) {
-                limit >>= (sharp + 3) >> 2;
-                limit   = FFMIN(limit, 9 - sharp);
-            }
-            limit = FFMAX(limit, 1);
-
-            s->filter.lim_lut[lvl]   = limit;
-            s->filter.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
-        }
-    }
-
-    return 0;
-}
diff --git a/libavcodec/vp9data.c b/libavcodec/vp9data.c
deleted file mode 100644
index 374fa8b..0000000
--- a/libavcodec/vp9data.c
+++ /dev/null
@@ -1,2133 +0,0 @@
-/*
- * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
- * Copyright (C) 2013 Clément Bœsch <u pkh me>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "vp9.h"
-#include "vp9data.h"
-
-const int8_t ff_vp9_partition_tree[3][2] = {
-    { -PARTITION_NONE,                1 }, // '0'
-    {    -PARTITION_H,                2 }, // '10'
-    {    -PARTITION_V, -PARTITION_SPLIT }, // '110', '111'
-};
-
-const uint8_t ff_vp9_default_kf_partition_probs[4][4][3] = {
-    { /* 64x64 -> 32x32 */
-        { 174,  35,  49 } /* a/l both not split */,
-        {  68,  11,  27 } /* a split, l not split */,
-        {  57,  15,   9 } /* l split, a not split */,
-        {  12,   3,   3 } /* a/l both split */
-    }, { /* 32x32 -> 16x16 */
-        { 150,  40,  39 } /* a/l both not split */,
-        {  78,  12,  26 } /* a split, l not split */,
-        {  67,  33,  11 } /* l split, a not split */,
-        {  24,   7,   5 } /* a/l both split */,
-    }, { /* 16x16 -> 8x8 */
-        { 149,  53,  53 } /* a/l both not split */,
-        {  94,  20,  48 } /* a split, l not split */,
-        {  83,  53,  24 } /* l split, a not split */,
-        {  52,  18,  18 } /* a/l both split */,
-    }, { /* 8x8 -> 4x4 */
-        { 158,  97,  94 } /* a/l both not split */,
-        {  93,  24,  99 } /* a split, l not split */,
-        {  85, 119,  44 } /* l split, a not split */,
-        {  62,  59,  67 } /* a/l both split */,
-    },
-};
-
-const int8_t ff_vp9_segmentation_tree[7][2] = {
-    {  1,  2 },
-    {  3,  4 },
-    {  5,  6 },
-    { -0, -1 }, // '00x'
-    { -2, -3 }, // '01x'
-    { -4, -5 }, // '10x'
-    { -6, -7 }, // '11x'
-};
-
-const int8_t ff_vp9_intramode_tree[9][2] = {
-    {              -DC_PRED,                1 }, // '0'
-    {          -TM_VP8_PRED,                2 }, // '10'
-    {            -VERT_PRED,                3 }, // '110'
-    {                     4,                6 },
-    {             -HOR_PRED,                5 }, // '11100'
-    { -DIAG_DOWN_RIGHT_PRED, -VERT_RIGHT_PRED }, // '11101x'
-    {  -DIAG_DOWN_LEFT_PRED,                7 }, // '11110'
-    {       -VERT_LEFT_PRED,                8 }, // '111110'
-    {        -HOR_DOWN_PRED,     -HOR_UP_PRED }, // '111111x'
-};
-
-const uint8_t ff_vp9_default_kf_ymode_probs[10][10][9] = {
-    { /* above = v */
-        {  43,  46, 168, 134, 107, 128,  69, 142,  92 } /* left = v */,
-        {  44,  29,  68, 159, 201, 177,  50,  57,  77 } /* left = h */,
-        {  63,  36, 126, 146, 123, 158,  60,  90,  96 } /* left = dc */,
-        {  58,  38,  76, 114,  97, 172,  78, 133,  92 } /* left = d45 */,
-        {  46,  41,  76, 140,  63, 184,  69, 112,  57 } /* left = d135 */,
-        {  38,  32,  85, 140,  46, 112,  54, 151, 133 } /* left = d117 */,
-        {  39,  27,  61, 131, 110, 175,  44,  75, 136 } /* left = d153 */,
-        {  47,  35,  80, 100,  74, 143,  64, 163,  74 } /* left = d63 */,
-        {  52,  30,  74, 113, 130, 175,  51,  64,  58 } /* left = d27 */,
-        {  36,  61, 116, 114, 128, 162,  80, 125,  82 } /* left = tm */
-    }, { /* above = h */
-        {  55,  44,  68, 166, 179, 192,  57,  57, 108 } /* left = v */,
-        {  42,  26,  11, 199, 241, 228,  23,  15,  85 } /* left = h */,
-        {  82,  26,  26, 171, 208, 204,  44,  32, 105 } /* left = dc */,
-        {  68,  42,  19, 131, 160, 199,  55,  52,  83 } /* left = d45 */,
-        {  58,  50,  25, 139, 115, 232,  39,  52, 118 } /* left = d135 */,
-        {  50,  35,  33, 153, 104, 162,  64,  59, 131 } /* left = d117 */,
-        {  44,  24,  16, 150, 177, 202,  33,  19, 156 } /* left = d153 */,
-        {  53,  49,  21, 110, 116, 168,  59,  80,  76 } /* left = d63 */,
-        {  55,  27,  12, 153, 203, 218,  26,  27,  49 } /* left = d27 */,
-        {  38,  72,  19, 168, 203, 212,  50,  50, 107 } /* left = tm */
-    }, { /* above = dc */
-        {  92,  45, 102, 136, 116, 180,  74,  90, 100 } /* left = v */,
-        {  73,  32,  19, 187, 222, 215,  46,  34, 100 } /* left = h */,
-        { 137,  30,  42, 148, 151, 207,  70,  52,  91 } /* left = dc */,
-        {  91,  30,  32, 116, 121, 186,  93,  86,  94 } /* left = d45 */,
-        {  72,  35,  36, 149,  68, 206,  68,  63, 105 } /* left = d135 */,
-        {  73,  31,  28, 138,  57, 124,  55, 122, 151 } /* left = d117 */,
-        {  67,  23,  21, 140, 126, 197,  40,  37, 171 } /* left = d153 */,
-        {  74,  32,  27, 107,  86, 160,  63, 134, 102 } /* left = d63 */,
-        {  86,  27,  28, 128, 154, 212,  45,  43,  53 } /* left = d27 */,
-        {  59,  67,  44, 140, 161, 202,  78,  67, 119 } /* left = tm */
-    }, { /* above = d45 */
-        {  59,  38,  83, 112, 103, 162,  98, 136,  90 } /* left = v */,
-        {  62,  30,  23, 158, 200, 207,  59,  57,  50 } /* left = h */,
-        { 103,  26,  36, 129, 132, 201,  83,  80,  93 } /* left = dc */,
-        {  67,  30,  29,  84,  86, 191, 102,  91,  59 } /* left = d45 */,
-        {  60,  32,  33, 112,  71, 220,  64,  89, 104 } /* left = d135 */,
-        {  53,  26,  34, 130,  56, 149,  84, 120, 103 } /* left = d117 */,
-        {  53,  21,  23, 133, 109, 210,  56,  77, 172 } /* left = d153 */,
-        {  61,  29,  29,  93,  97, 165,  83, 175, 162 } /* left = d63 */,
-        {  77,  19,  29, 112, 142, 228,  55,  66,  36 } /* left = d27 */,
-        {  47,  47,  43, 114, 137, 181, 100,  99,  95 } /* left = tm */
-    }, { /* above = d135 */
-        {  53,  40,  55, 139,  69, 183,  61,  80, 110 } /* left = v */,
-        {  40,  29,  19, 161, 180, 207,  43,  24,  91 } /* left = h */,
-        {  69,  23,  29, 128,  83, 199,  46,  44, 101 } /* left = dc */,
-        {  60,  34,  19, 105,  61, 198,  53,  64,  89 } /* left = d45 */,
-        {  52,  31,  22, 158,  40, 209,  58,  62,  89 } /* left = d135 */,
-        {  44,  31,  29, 147,  46, 158,  56, 102, 198 } /* left = d117 */,
-        {  35,  19,  12, 135,  87, 209,  41,  45, 167 } /* left = d153 */,
-        {  51,  38,  25, 113,  58, 164,  70,  93,  97 } /* left = d63 */,
-        {  55,  25,  21, 118,  95, 215,  38,  39,  66 } /* left = d27 */,
-        {  47,  54,  34, 146, 108, 203,  72, 103, 151 } /* left = tm */
-    }, { /* above = d117 */
-        {  46,  27,  80, 150,  55, 124,  55, 121, 135 } /* left = v */,
-        {  36,  23,  27, 165, 149, 166,  54,  64, 118 } /* left = h */,
-        {  64,  19,  37, 156,  66, 138,  49,  95, 133 } /* left = dc */,
-        {  53,  21,  36, 131,  63, 163,  60, 109,  81 } /* left = d45 */,
-        {  40,  26,  35, 154,  40, 185,  51,  97, 123 } /* left = d135 */,
-        {  35,  19,  34, 179,  19,  97,  48, 129, 124 } /* left = d117 */,
-        {  36,  20,  26, 136,  62, 164,  33,  77, 154 } /* left = d153 */,
-        {  45,  26,  28, 129,  45, 129,  49, 147, 123 } /* left = d63 */,
-        {  45,  18,  32, 130,  90, 157,  40,  79,  91 } /* left = d27 */,
-        {  38,  44,  51, 136,  74, 162,  57,  97, 121 } /* left = tm */
-    }, { /* above = d153 */
-        {  56,  39,  58, 133, 117, 173,  48,  53, 187 } /* left = v */,
-        {  35,  21,  12, 161, 212, 207,  20,  23, 145 } /* left = h */,
-        {  75,  17,  22, 136, 138, 185,  32,  34, 166 } /* left = dc */,
-        {  56,  29,  19, 117, 109, 181,  55,  68, 112 } /* left = d45 */,
-        {  47,  29,  17, 153,  64, 220,  59,  51, 114 } /* left = d135 */,
-        {  46,  16,  24, 136,  76, 147,  41,  64, 172 } /* left = d117 */,
-        {  34,  17,  11, 108, 152, 187,  13,  15, 209 } /* left = d153 */,
-        {  55,  30,  18, 122,  79, 179,  44,  88, 116 } /* left = d63 */,
-        {  51,  24,  14, 115, 133, 209,  32,  26, 104 } /* left = d27 */,
-        {  37,  49,  25, 129, 168, 164,  41,  54, 148 } /* left = tm */
-    }, { /* above = d63 */
-        {  48,  34,  86, 101,  92, 146,  78, 179, 134 } /* left = v */,
-        {  47,  22,  24, 138, 187, 178,  68,  69,  59 } /* left = h */,
-        {  78,  23,  39, 111, 117, 170,  74, 124,  94 } /* left = dc */,
-        {  56,  25,  33, 105, 112, 187,  95, 177, 129 } /* left = d45 */,
-        {  48,  31,  27, 114,  63, 183,  82, 116,  56 } /* left = d135 */,
-        {  43,  28,  37, 121,  63, 123,  61, 192, 169 } /* left = d117 */,
-        {  42,  17,  24, 109,  97, 177,  56,  76, 122 } /* left = d153 */,
-        {  46,  23,  32,  74,  86, 150,  67, 183,  88 } /* left = d63 */,
-        {  58,  18,  28, 105, 139, 182,  70,  92,  63 } /* left = d27 */,
-        {  36,  38,  48,  92, 122, 165,  88, 137,  91 } /* left = tm */
-    }, { /* above = d27 */
-        {  62,  44,  61, 123, 105, 189,  48,  57,  64 } /* left = v */,
-        {  47,  25,  17, 175, 222, 220,  24,  30,  86 } /* left = h */,
-        {  82,  22,  32, 127, 143, 213,  39,  41,  70 } /* left = dc */,
-        {  68,  36,  17, 106, 102, 206,  59,  74,  74 } /* left = d45 */,
-        {  57,  39,  23, 151,  68, 216,  55,  63,  58 } /* left = d135 */,
-        {  49,  30,  35, 141,  70, 168,  82,  40, 115 } /* left = d117 */,
-        {  51,  25,  15, 136, 129, 202,  38,  35, 139 } /* left = d153 */,
-        {  59,  39,  19, 114,  75, 180,  77, 104,  42 } /* left = d63 */,
-        {  68,  26,  16, 111, 141, 215,  29,  28,  28 } /* left = d27 */,
-        {  40,  61,  26, 126, 152, 206,  61,  59,  93 } /* left = tm */
-    }, { /* above = tm */
-        {  44,  78, 115, 132, 119, 173,  71, 112,  93 } /* left = v */,
-        {  39,  38,  21, 184, 227, 206,  42,  32,  64 } /* left = h */,
-        {  65,  70,  60, 155, 159, 199,  61,  60,  81 } /* left = dc */,
-        {  58,  47,  36, 124, 137, 193,  80,  82,  78 } /* left = d45 */,
-        {  49,  50,  35, 144,  95, 205,  63,  78,  59 } /* left = d135 */,
-        {  41,  53,  52, 148,  71, 142,  65, 128,  51 } /* left = d117 */,
-        {  40,  36,  28, 143, 143, 202,  40,  55, 137 } /* left = d153 */,
-        {  42,  44,  44, 104, 105, 164,  64, 130,  80 } /* left = d63 */,
-        {  52,  34,  29, 129, 183, 227,  42,  35,  43 } /* left = d27 */,
-        {  43,  81,  53, 140, 169, 204,  68,  84,  72 } /* left = tm */
-    }
-};
-
-const uint8_t ff_vp9_default_kf_uvmode_probs[10][9] = {
-    { 118,  15, 123, 148, 131, 101,  44,  93, 131 } /* y = v */,
-    { 113,  12,  23, 188, 226, 142,  26,  32, 125 } /* y = h */,
-    { 144,  11,  54, 157, 195, 130,  46,  58, 108 } /* y = dc */,
-    { 120,  11,  50, 123, 163, 135,  64,  77, 103 } /* y = d45 */,
-    { 113,   9,  36, 155, 111, 157,  32,  44, 161 } /* y = d135 */,
-    { 116,   9,  55, 176,  76,  96,  37,  61, 149 } /* y = d117 */,
-    { 115,   9,  28, 141, 161, 167,  21,  25, 193 } /* y = d153 */,
-    { 116,  12,  64, 120, 140, 125,  49, 115, 121 } /* y = d63 */,
-    { 120,  12,  32, 145, 195, 142,  32,  38,  86 } /* y = d27 */,
-    { 102,  19,  66, 162, 182, 122,  35,  59, 128 } /* y = tm */
-};
-
-const int8_t ff_vp9_inter_mode_tree[3][2] = {
-    {    -ZEROMV,      1 }, // '0'
-    { -NEARESTMV,      2 }, // '10'
-    {    -NEARMV, -NEWMV }, // '11x'
-};
-
-const int8_t ff_vp9_filter_tree[2][2] = {
-    { -0,  1 },  // '0'
-    { -1, -2 },  // '1x'
-};
-
-const enum FilterMode ff_vp9_filter_lut[3] = {
-    FILTER_8TAP_REGULAR,
-    FILTER_8TAP_SMOOTH,
-    FILTER_8TAP_SHARP,
-};
-
-const int16_t ff_vp9_dc_qlookup[256] = {
-       4,    8,    8,    9,   10,   11,   12,   12,
-      13,   14,   15,   16,   17,   18,   19,   19,
-      20,   21,   22,   23,   24,   25,   26,   26,
-      27,   28,   29,   30,   31,   32,   32,   33,
-      34,   35,   36,   37,   38,   38,   39,   40,
-      41,   42,   43,   43,   44,   45,   46,   47,
-      48,   48,   49,   50,   51,   52,   53,   53,
-      54,   55,   56,   57,   57,   58,   59,   60,
-      61,   62,   62,   63,   64,   65,   66,   66,
-      67,   68,   69,   70,   70,   71,   72,   73,
-      74,   74,   75,   76,   77,   78,   78,   79,
-      80,   81,   81,   82,   83,   84,   85,   85,
-      87,   88,   90,   92,   93,   95,   96,   98,
-      99,  101,  102,  104,  105,  107,  108,  110,
-     111,  113,  114,  116,  117,  118,  120,  121,
-     123,  125,  127,  129,  131,  134,  136,  138,
-     140,  142,  144,  146,  148,  150,  152,  154,
-     156,  158,  161,  164,  166,  169,  172,  174,
-     177,  180,  182,  185,  187,  190,  192,  195,
-     199,  202,  205,  208,  211,  214,  217,  220,
-     223,  226,  230,  233,  237,  240,  243,  247,
-     250,  253,  257,  261,  265,  269,  272,  276,
-     280,  284,  288,  292,  296,  300,  304,  309,
-     313,  317,  322,  326,  330,  335,  340,  344,
-     349,  354,  359,  364,  369,  374,  379,  384,
-     389,  395,  400,  406,  411,  417,  423,  429,
-     435,  441,  447,  454,  461,  467,  475,  482,
-     489,  497,  505,  513,  522,  530,  539,  549,
-     559,  569,  579,  590,  602,  614,  626,  640,
-     654,  668,  684,  700,  717,  736,  755,  775,
-     796,  819,  843,  869,  896,  925,  955,  988,
-    1022, 1058, 1098, 1139, 1184, 1232, 1282, 1336,
-};
-
-const int16_t ff_vp9_ac_qlookup[256] = {
-       4,    8,    9,   10,   11,   12,   13,   14,
-      15,   16,   17,   18,   19,   20,   21,   22,
-      23,   24,   25,   26,   27,   28,   29,   30,
-      31,   32,   33,   34,   35,   36,   37,   38,
-      39,   40,   41,   42,   43,   44,   45,   46,
-      47,   48,   49,   50,   51,   52,   53,   54,
-      55,   56,   57,   58,   59,   60,   61,   62,
-      63,   64,   65,   66,   67,   68,   69,   70,
-      71,   72,   73,   74,   75,   76,   77,   78,
-      79,   80,   81,   82,   83,   84,   85,   86,
-      87,   88,   89,   90,   91,   92,   93,   94,
-      95,   96,   97,   98,   99,  100,  101,  102,
-     104,  106,  108,  110,  112,  114,  116,  118,
-     120,  122,  124,  126,  128,  130,  132,  134,
-     136,  138,  140,  142,  144,  146,  148,  150,
-     152,  155,  158,  161,  164,  167,  170,  173,
-     176,  179,  182,  185,  188,  191,  194,  197,
-     200,  203,  207,  211,  215,  219,  223,  227,
-     231,  235,  239,  243,  247,  251,  255,  260,
-     265,  270,  275,  280,  285,  290,  295,  300,
-     305,  311,  317,  323,  329,  335,  341,  347,
-     353,  359,  366,  373,  380,  387,  394,  401,
-     408,  416,  424,  432,  440,  448,  456,  465,
-     474,  483,  492,  501,  510,  520,  530,  540,
-     550,  560,  571,  582,  593,  604,  615,  627,
-     639,  651,  663,  676,  689,  702,  715,  729,
-     743,  757,  771,  786,  801,  816,  832,  848,
-     864,  881,  898,  915,  933,  951,  969,  988,
-    1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151,
-    1173, 1196, 1219, 1243, 1267, 1292, 1317, 1343,
-    1369, 1396, 1423, 1451, 1479, 1508, 1537, 1567,
-    1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828,
-};
-
-const enum TxfmType ff_vp9_intra_txfm_type[14] = {
-    [VERT_PRED]            = ADST_DCT,
-    [HOR_PRED]             = DCT_ADST,
-    [DC_PRED]              = DCT_DCT,
-    [DIAG_DOWN_LEFT_PRED]  = DCT_DCT,
-    [DIAG_DOWN_RIGHT_PRED] = ADST_ADST,
-    [VERT_RIGHT_PRED]      = ADST_DCT,
-    [HOR_DOWN_PRED]        = DCT_ADST,
-    [VERT_LEFT_PRED]       = ADST_DCT,
-    [HOR_UP_PRED]          = DCT_ADST,
-    [TM_VP8_PRED]          = ADST_ADST,
-    [NEARESTMV]            = DCT_DCT,
-    [NEARMV]               = DCT_DCT,
-    [ZEROMV]               = DCT_DCT,
-    [NEWMV]                = DCT_DCT,
-};
-
-const int16_t ff_vp9_default_scan_4x4[16] = {
-     0,  1,  4,  5,
-     2,  8,  3,  6,
-    12,  9,  7, 10,
-    13, 11, 14, 15,
-};
-
-const int16_t ff_vp9_col_scan_4x4[16] = {
-     0,  1,  2,  4,
-     3,  5,  6,  8,
-     7,  9, 10, 12,
-    13, 11, 14, 15,
-};
-
-const int16_t ff_vp9_row_scan_4x4[16] = {
-     0,  4,  1,  8,
-     5, 12,  9,  2,
-     6, 13,  3, 10,
-     7, 14, 11, 15,
-};
-
-const int16_t ff_vp9_default_scan_8x8[64] = {
-     0,  1,  8,  2,  9, 16, 10,  3,
-    17, 24, 18, 11,  4, 25, 32, 19,
-    12, 26,  5, 33, 20, 27, 40, 13,
-    34,  6, 41, 28, 21, 35, 42, 48,
-    14,  7, 36, 29, 43, 56, 49, 22,
-    15, 37, 50, 44, 57, 30, 23, 51,
-    45, 58, 38, 31, 52, 59, 39, 46,
-    53, 60, 47, 54, 61, 55, 62, 63,
-};
-
-const int16_t ff_vp9_col_scan_8x8[64] = {
-     0,  1,  2,  8,  3,  9,  4, 10,
-    16,  5, 11, 17, 12, 18,  6, 24,
-    19, 13, 25,  7, 26, 20, 32, 14,
-    27, 21, 33, 28, 34, 15, 22, 35,
-    40, 29, 41, 36, 23, 30, 42, 37,
-    48, 43, 31, 44, 49, 38, 50, 56,
-    45, 39, 51, 57, 52, 46, 58, 53,
-    59, 47, 60, 54, 61, 55, 62, 63,
-};
-
-const int16_t ff_vp9_row_scan_8x8[64] = {
-     0,  8, 16,  1,  9, 24,  2, 17,
-    32, 10, 25,  3, 40, 18, 11, 33,
-    26, 19,  4, 48, 41, 34, 12, 27,
-    56, 20,  5, 42, 35, 13, 49, 28,
-     6, 21, 43, 36, 14, 50, 29, 57,
-     7, 44, 22, 37, 51, 15, 58, 30,
-    23, 45, 52, 38, 59, 31, 46, 53,
-    39, 60, 47, 61, 54, 62, 55, 63,
-};
-
-const int16_t ff_vp9_default_scan_16x16[256] = {
-      0,   1,  16,   2,  17,  32,   3,  18,  33,  48,   4,  34,  19,  49,  20,   5,
-     35,  64,  50,  36,  65,  21,   6,  51,  80,  66,  37,  22,  52,   7,  81,  67,
-     38,  82,  53,  23,  96,  68,   8,  83,  97,  54,  39,  69, 112,  24,  98,  84,
-     70,  55,   9,  40,  85,  99, 113, 128,  25, 114, 100,  71,  86,  56,  10,  41,
-    115, 101, 129, 116,  72,  87,  26, 130, 144, 102,  57,  11,  42, 117, 131, 145,
-     88, 103,  27,  73, 132, 118, 146,  58, 160,  12,  43, 133, 147, 104,  89, 119,
-    161,  74, 148, 134,  28, 162,  59,  13, 176, 120, 149,  90, 135, 105, 163,  44,
-     75, 177, 164,  29, 150, 121, 136, 178, 165,  14, 106,  60,  91, 151,  45, 179,
-    192, 137, 166, 122,  76, 180, 152,  30,  61,  15, 107, 167, 181, 193,  92, 208,
-     46, 138, 123, 153, 194,  77, 168, 182,  31, 195, 209, 183, 108, 139,  62, 154,
-     47, 196,  93, 169, 210, 197, 224, 124, 184, 211,  78, 109, 170, 155,  63, 198,
-    212, 185, 225, 240, 140,  94, 199, 125,  79, 213, 226, 171, 186, 156, 214, 200,
-    110, 227, 141,  95, 241, 215, 228, 201, 126, 242, 187, 172, 157, 229, 111, 216,
-    243, 142, 202, 230, 127, 217, 244, 173, 188, 231, 158, 203, 143, 245, 218, 232,
-    189, 246, 159, 174, 233, 247, 219, 204, 175, 190, 248, 234, 205, 220, 249, 191,
-    235, 221, 250, 206, 222, 251, 236, 207, 237, 223, 252, 238, 253, 239, 254, 255,
-};
-
-const int16_t ff_vp9_col_scan_16x16[256] = {
-      0,   1,   2,   3,  16,   4,  17,   5,  18,   6,  19,  32,  20,   7,  33,  21,
-     34,   8,  35,  22,  48,  36,   9,  49,  23,  50,  37,  10,  38,  51,  24,  64,
-     52,  11,  65,  39,  25,  53,  66,  54,  40,  67,  12,  80,  26,  68,  55,  81,
-     41,  69,  13,  27,  82,  56,  70,  83,  42,  14,  84,  96,  71,  28,  57,  85,
-     97,  15,  72,  98,  43,  86,  58,  99,  29,  87, 100, 112,  73,  44, 101,  59,
-     30, 113,  88, 114,  74, 128, 102,  45,  31, 115,  60, 103,  89, 116,  75, 129,
-    117,  46, 104,  90,  61, 130, 118, 131, 132, 105,  76,  47, 119, 144,  91,  62,
-    133, 106, 145, 120, 146, 134,  77, 147, 121,  92, 135, 148,  63, 107, 136, 122,
-     93, 149, 160,  78, 150, 137, 108, 161, 162, 151, 123,  79, 138, 163, 152,  94,
-    164, 109, 165, 153, 124, 139, 176, 166,  95, 177, 167, 110, 154, 178, 125, 179,
-    140, 168, 155, 111, 180, 192, 181, 169, 141, 126, 182, 193, 194, 156, 183, 170,
-    195, 127, 142, 196, 184, 208, 197, 157, 171, 143, 185, 198, 209, 199, 210, 172,
-    158, 186, 211, 224, 212, 200, 240, 159, 213, 225, 187, 201, 173, 226, 214, 215,
-    227, 202, 228, 188, 241, 216, 174, 229, 242, 203, 243, 217, 230, 175, 189, 244,
-    231, 204, 218, 232, 245, 219, 246, 190, 233, 205, 191, 247, 234, 248, 220, 206,
-    249, 235, 221, 207, 250, 236, 222, 251, 223, 237, 238, 252, 239, 253, 254, 255,
-};
-
-const int16_t ff_vp9_row_scan_16x16[256] = {
-      0,  16,  32,   1,  48,  17,  64,  33,   2,  80,  18,  49,  96,  34,   3,  65,
-     19, 112,  50,  81,  35,   4, 128,  66,  20,  97,  51,  82,   5, 144,  36,  67,
-    113,  98,  21,  52, 160,  83, 129,  37,  68,   6, 114, 176,  99,  53,  22,  84,
-    145,  38,  69, 130,   7, 115, 192, 100,  54,  23,  85, 161, 146, 131,  39,  70,
-    208, 116,   8, 101, 177,  55,  86,  24, 162, 147, 132,  71, 224, 117,  40, 102,
-      9, 148,  56,  87, 193, 163, 240, 133, 178,  25, 118,  72,  41, 103, 164,  10,
-    149,  88, 134, 209, 179,  57, 119, 194,  26,  73, 165, 150, 104,  42, 135,  11,
-    180, 120,  89, 225, 195,  58,  27, 210, 151, 181, 166,  74,  43, 105,  12, 136,
-     90,  59, 241, 121,  28, 196, 167, 211, 152,  44, 182, 137,  75,  13, 226, 106,
-    122,  60, 197,  91, 168,  29, 183, 153,  14,  76, 212, 138,  45, 107,  15, 198,
-     92, 227, 169,  30, 123, 154,  61, 242, 184, 213, 139,  46,  77,  31, 108, 170,
-    199, 185, 124, 228,  93, 155, 214,  62, 140, 243,  78,  47, 200, 109, 186, 171,
-    201,  94,  63, 215, 229, 156,  79, 125, 141, 110, 216, 187, 172, 244, 202, 230,
-    217,  95, 157, 126, 245, 111, 142, 231, 188, 127, 158, 218, 173, 232, 246, 233,
-    203, 143, 247, 174, 189, 159, 219, 204, 248, 234, 249, 175, 190, 220, 205, 250,
-    235, 191, 221, 251, 236, 206, 252, 222, 207, 237, 223, 253, 238, 254, 239, 255,
-};
-
-const int16_t ff_vp9_default_scan_32x32[1024] = {
-       0,    1,   32,    2,   33,   64,    3,   34,   65,    4,   96,   35,   66,    5,   36,   97,
-      67,  128,   98,   68,   37,    6,  129,   99,    7,  160,   69,   38,  130,  100,  161,  131,
-      39,   70,    8,  101,  162,  132,  192,   71,   40,    9,  102,  163,  133,  193,   72,  224,
-     103,   41,  164,   10,  194,  134,  165,   73,  104,  135,  225,   42,  195,   11,  256,  166,
-     226,  196,   74,  105,  136,   43,   12,  167,  197,  227,  257,   75,  106,  137,  228,   44,
-     198,  168,  258,  288,   13,  229,   76,  107,  199,  138,  259,  169,  289,   45,  230,  260,
-     200,  108,   14,  170,  139,  320,  290,   77,  231,  261,   46,  201,  140,  291,  109,  232,
-     321,  262,  171,   78,  292,   15,  322,  202,  263,  352,  172,  293,  233,  141,  323,  110,
-      47,  203,  264,  234,  294,  353,  324,   16,   79,  204,  265,  295,  325,  173,  354,  142,
-     235,  384,   48,  296,  111,  266,  355,  326,   80,   17,  205,  236,  174,  356,  385,  327,
-     143,  297,  267,  357,  386,  112,   49,  328,  298,  206,  416,  237,  358,  387,   81,  175,
-      18,  329,  359,  388,  299,  330,  389,  113,  417,  238,  360,   50,  207,  418,  390,  331,
-      19,  448,  361,   82,  419,  391,  239,   51,  362,  420,  114,  449,  480,  421,   83,  363,
-     450,  422,  512,  451,  423,  115,  452,  481,  453,  482,  454,  544,  483,  455,  513,  484,
-     514,  485,  515,  486,  545,  576,  487,  546,  547,  608,  577,  578,  579,  609,  610,  611,
-      20,  144,  268,  392,  516,  640,   21,   52,  145,  176,  269,  300,  393,  424,  517,  548,
-     641,  672,   22,   53,   84,  146,  177,  208,  270,  301,  332,  394,  425,  456,  518,  549,
-     580,  642,  673,  704,   23,   54,   85,  116,  147,  178,  209,  240,  271,  302,  333,  364,
-     395,  426,  457,  488,  519,  550,  581,  612,  643,  674,  705,  736,   55,   86,  117,  179,
-     210,  241,  303,  334,  365,  427,  458,  489,  551,  582,  613,  675,  706,  737,   87,  118,
-     211,  242,  335,  366,  459,  490,  583,  614,  707,  738,  119,  243,  367,  491,  615,  739,
-      24,  148,  272,  396,  520,  644,  768,   25,   56,  149,  180,  273,  304,  397,  428,  521,
-     552,  645,  676,  769,  800,   26,   57,   88,  150,  181,  212,  274,  305,  336,  398,  429,
-     460,  522,  553,  584,  646,  677,  708,  770,  801,  832,   27,   58,   89,  120,  151,  182,
-     213,  244,  275,  306,  337,  368,  399,  430,  461,  492,  523,  554,  585,  616,  647,  678,
-     709,  740,  771,  802,  833,  864,   59,   90,  121,  183,  214,  245,  307,  338,  369,  431,
-     462,  493,  555,  586,  617,  679,  710,  741,  803,  834,  865,   91,  122,  215,  246,  339,
-     370,  463,  494,  587,  618,  711,  742,  835,  866,  123,  247,  371,  495,  619,  743,  867,
-      28,  152,  276,  400,  524,  648,  772,  896,   29,   60,  153,  184,  277,  308,  401,  432,
-     525,  556,  649,  680,  773,  804,  897,  928,   30,   61,   92,  154,  185,  216,  278,  309,
-     340,  402,  433,  464,  526,  557,  588,  650,  681,  712,  774,  805,  836,  898,  929,  960,
-      31,   62,   93,  124,  155,  186,  217,  248,  279,  310,  341,  372,  403,  434,  465,  496,
-     527,  558,  589,  620,  651,  682,  713,  744,  775,  806,  837,  868,  899,  930,  961,  992,
-      63,   94,  125,  187,  218,  249,  311,  342,  373,  435,  466,  497,  559,  590,  621,  683,
-     714,  745,  807,  838,  869,  931,  962,  993,   95,  126,  219,  250,  343,  374,  467,  498,
-     591,  622,  715,  746,  839,  870,  963,  994,  127,  251,  375,  499,  623,  747,  871,  995,
-     156,  280,  404,  528,  652,  776,  900,  157,  188,  281,  312,  405,  436,  529,  560,  653,
-     684,  777,  808,  901,  932,  158,  189,  220,  282,  313,  344,  406,  437,  468,  530,  561,
-     592,  654,  685,  716,  778,  809,  840,  902,  933,  964,  159,  190,  221,  252,  283,  314,
-     345,  376,  407,  438,  469,  500,  531,  562,  593,  624,  655,  686,  717,  748,  779,  810,
-     841,  872,  903,  934,  965,  996,  191,  222,  253,  315,  346,  377,  439,  470,  501,  563,
-     594,  625,  687,  718,  749,  811,  842,  873,  935,  966,  997,  223,  254,  347,  378,  471,
-     502,  595,  626,  719,  750,  843,  874,  967,  998,  255,  379,  503,  627,  751,  875,  999,
-     284,  408,  532,  656,  780,  904,  285,  316,  409,  440,  533,  564,  657,  688,  781,  812,
-     905,  936,  286,  317,  348,  410,  441,  472,  534,  565,  596,  658,  689,  720,  782,  813,
-     844,  906,  937,  968,  287,  318,  349,  380,  411,  442,  473,  504,  535,  566,  597,  628,
-     659,  690,  721,  752,  783,  814,  845,  876,  907,  938,  969, 1000,  319,  350,  381,  443,
-     474,  505,  567,  598,  629,  691,  722,  753,  815,  846,  877,  939,  970, 1001,  351,  382,
-     475,  506,  599,  630,  723,  754,  847,  878,  971, 1002,  383,  507,  631,  755,  879, 1003,
-     412,  536,  660,  784,  908,  413,  444,  537,  568,  661,  692,  785,  816,  909,  940,  414,
-     445,  476,  538,  569,  600,  662,  693,  724,  786,  817,  848,  910,  941,  972,  415,  446,
-     477,  508,  539,  570,  601,  632,  663,  694,  725,  756,  787,  818,  849,  880,  911,  942,
-     973, 1004,  447,  478,  509,  571,  602,  633,  695,  726,  757,  819,  850,  881,  943,  974,
-    1005,  479,  510,  603,  634,  727,  758,  851,  882,  975, 1006,  511,  635,  759,  883, 1007,
-     540,  664,  788,  912,  541,  572,  665,  696,  789,  820,  913,  944,  542,  573,  604,  666,
-     697,  728,  790,  821,  852,  914,  945,  976,  543,  574,  605,  636,  667,  698,  729,  760,
-     791,  822,  853,  884,  915,  946,  977, 1008,  575,  606,  637,  699,  730,  761,  823,  854,
-     885,  947,  978, 1009,  607,  638,  731,  762,  855,  886,  979, 1010,  639,  763,  887, 1011,
-     668,  792,  916,  669,  700,  793,  824,  917,  948,  670,  701,  732,  794,  825,  856,  918,
-     949,  980,  671,  702,  733,  764,  795,  826,  857,  888,  919,  950,  981, 1012,  703,  734,
-     765,  827,  858,  889,  951,  982, 1013,  735,  766,  859,  890,  983, 1014,  767,  891, 1015,
-     796,  920,  797,  828,  921,  952,  798,  829,  860,  922,  953,  984,  799,  830,  861,  892,
-     923,  954,  985, 1016,  831,  862,  893,  955,  986, 1017,  863,  894,  987, 1018,  895, 1019,
-     924,  925,  956,  926,  957,  988,  927,  958,  989, 1020,  959,  990, 1021,  991, 1022, 1023,
-};
-
-const int16_t *ff_vp9_scans[5][4] = {
-    {
-        ff_vp9_default_scan_4x4, ff_vp9_col_scan_4x4,
-        ff_vp9_row_scan_4x4, ff_vp9_default_scan_4x4
-    }, {
-        ff_vp9_default_scan_8x8, ff_vp9_col_scan_8x8,
-        ff_vp9_row_scan_8x8, ff_vp9_default_scan_8x8
-    }, {
-        ff_vp9_default_scan_16x16, ff_vp9_col_scan_16x16,
-        ff_vp9_row_scan_16x16, ff_vp9_default_scan_16x16
-    }, {
-        ff_vp9_default_scan_32x32, ff_vp9_default_scan_32x32,
-        ff_vp9_default_scan_32x32, ff_vp9_default_scan_32x32
-    }, { // lossless
-        ff_vp9_default_scan_4x4, ff_vp9_default_scan_4x4,
-        ff_vp9_default_scan_4x4, ff_vp9_default_scan_4x4
-    }
-};
-
-const int16_t ff_vp9_default_scan_4x4_nb[16][2] = {
-    {  0,  0 }, {  0,  0 }, {  4,  1 }, {  1,  1 },
-    {  4,  4 }, {  2,  2 }, {  5,  2 }, {  8,  8 },
-    {  8,  5 }, {  6,  3 }, {  9,  6 }, { 12,  9 },
-    { 10,  7 }, { 13, 10 }, { 14, 11 }, {  0,  0 },
-};
-
-const int16_t ff_vp9_col_scan_4x4_nb[16][2] = {
-    {  0,  0 }, {  1,  1 }, {  0,  0 }, {  2,  2 },
-    {  4,  4 }, {  5,  5 }, {  4,  4 }, {  6,  6 },
-    {  8,  8 }, {  9,  9 }, {  8,  8 }, { 12, 12 },
-    { 10, 10 }, { 13, 13 }, { 14, 14 }, {  0,  0 },
-};
-
-const int16_t ff_vp9_row_scan_4x4_nb[16][2] = {
-    {  0,  0 }, {  0,  0 }, {  4,  4 }, {  1,  1 },
-    {  8,  8 }, {  5,  5 }, {  1,  1 }, {  2,  2 },
-    {  9,  9 }, {  2,  2 }, {  6,  6 }, {  3,  3 },
-    { 10, 10 }, {  7,  7 }, { 11, 11 }, {  0,  0 },
-};
-
-const int16_t ff_vp9_default_scan_8x8_nb[64][2] = {
-    {  0,  0 }, {  0,  0 }, {  1,  1 }, {  8,  1 },
-    {  8,  8 }, {  9,  2 }, {  2,  2 }, { 16,  9 },
-    { 16, 16 }, { 17, 10 }, { 10,  3 }, {  3,  3 },
-    { 24, 17 }, { 24, 24 }, { 18, 11 }, { 11,  4 },
-    { 25, 18 }, {  4,  4 }, { 32, 25 }, { 19, 12 },
-    { 26, 19 }, { 32, 32 }, { 12,  5 }, { 33, 26 },
-    {  5,  5 }, { 40, 33 }, { 27, 20 }, { 20, 13 },
-    { 34, 27 }, { 41, 34 }, { 40, 40 }, { 13,  6 },
-    {  6,  6 }, { 35, 28 }, { 28, 21 }, { 42, 35 },
-    { 48, 48 }, { 48, 41 }, { 21, 14 }, { 14,  7 },
-    { 36, 29 }, { 49, 42 }, { 43, 36 }, { 56, 49 },
-    { 29, 22 }, { 22, 15 }, { 50, 43 }, { 44, 37 },
-    { 57, 50 }, { 37, 30 }, { 30, 23 }, { 51, 44 },
-    { 58, 51 }, { 38, 31 }, { 45, 38 }, { 52, 45 },
-    { 59, 52 }, { 46, 39 }, { 53, 46 }, { 60, 53 },
-    { 54, 47 }, { 61, 54 }, { 62, 55 }, {  0,  0 },
-};
-
-const int16_t ff_vp9_col_scan_8x8_nb[64][2] = {
-    {  0,  0 }, {  1,  1 }, {  0,  0 }, {  2,  2 },
-    {  8,  8 }, {  3,  3 }, {  9,  9 }, {  8,  8 },
-    {  4,  4 }, { 10, 10 }, { 16, 16 }, { 11, 11 },
-    { 17, 17 }, {  5,  5 }, { 16, 16 }, { 18, 18 },
-    { 12, 12 }, { 24, 24 }, {  6,  6 }, { 25, 25 },
-    { 19, 19 }, { 24, 24 }, { 13, 13 }, { 26, 26 },
-    { 20, 20 }, { 32, 32 }, { 27, 27 }, { 33, 33 },
-    { 14, 14 }, { 21, 21 }, { 34, 34 }, { 32, 32 },
-    { 28, 28 }, { 40, 40 }, { 35, 35 }, { 22, 22 },
-    { 29, 29 }, { 41, 41 }, { 36, 36 }, { 40, 40 },
-    { 42, 42 }, { 30, 30 }, { 43, 43 }, { 48, 48 },
-    { 37, 37 }, { 49, 49 }, { 48, 48 }, { 44, 44 },
-    { 38, 38 }, { 50, 50 }, { 56, 56 }, { 51, 51 },
-    { 45, 45 }, { 57, 57 }, { 52, 52 }, { 58, 58 },
-    { 46, 46 }, { 59, 59 }, { 53, 53 }, { 60, 60 },
-    { 54, 54 }, { 61, 61 }, { 62, 62 }, {  0,  0 },
-};
-
-const int16_t ff_vp9_row_scan_8x8_nb[64][2] = {
-    {  0,  0 }, {  8,  8 }, {  0,  0 }, {  1,  1 },
-    { 16, 16 }, {  1,  1 }, {  9,  9 }, { 24, 24 },
-    {  2,  2 }, { 17, 17 }, {  2,  2 }, { 32, 32 },
-    { 10, 10 }, {  3,  3 }, { 25, 25 }, { 18, 18 },
-    { 11, 11 }, {  3,  3 }, { 40, 40 }, { 33, 33 },
-    { 26, 26 }, {  4,  4 }, { 19, 19 }, { 48, 48 },
-    { 12, 12 }, {  4,  4 }, { 34, 34 }, { 27, 27 },
-    {  5,  5 }, { 41, 41 }, { 20, 20 }, {  5,  5 },
-    { 13, 13 }, { 35, 35 }, { 28, 28 }, {  6,  6 },
-    { 42, 42 }, { 21, 21 }, { 49, 49 }, {  6,  6 },
-    { 36, 36 }, { 14, 14 }, { 29, 29 }, { 43, 43 },
-    {  7,  7 }, { 50, 50 }, { 22, 22 }, { 15, 15 },
-    { 37, 37 }, { 44, 44 }, { 30, 30 }, { 51, 51 },
-    { 23, 23 }, { 38, 38 }, { 45, 45 }, { 31, 31 },
-    { 52, 52 }, { 39, 39 }, { 53, 53 }, { 46, 46 },
-    { 54, 54 }, { 47, 47 }, { 55, 55 }, {  0,  0 },
-};
-
-const int16_t ff_vp9_default_scan_16x16_nb[256][2] = {
-    {   0,   0 }, {   0,   0 }, {   1,   1 }, {  16,   1 },
-    {  16,  16 }, {   2,   2 }, {  17,   2 }, {  32,  17 },
-    {  32,  32 }, {   3,   3 }, {  33,  18 }, {  18,   3 },
-    {  48,  33 }, {  19,   4 }, {   4,   4 }, {  34,  19 },
-    {  48,  48 }, {  49,  34 }, {  35,  20 }, {  64,  49 },
-    {  20,   5 }, {   5,   5 }, {  50,  35 }, {  64,  64 },
-    {  65,  50 }, {  36,  21 }, {  21,   6 }, {  51,  36 },
-    {   6,   6 }, {  80,  65 }, {  66,  51 }, {  37,  22 },
-    {  81,  66 }, {  52,  37 }, {  22,   7 }, {  80,  80 },
-    {  67,  52 }, {   7,   7 }, {  82,  67 }, {  96,  81 },
-    {  53,  38 }, {  38,  23 }, {  68,  53 }, {  96,  96 },
-    {  23,   8 }, {  97,  82 }, {  83,  68 }, {  69,  54 },
-    {  54,  39 }, {   8,   8 }, {  39,  24 }, {  84,  69 },
-    {  98,  83 }, { 112,  97 }, { 112, 112 }, {  24,   9 },
-    { 113,  98 }, {  99,  84 }, {  70,  55 }, {  85,  70 },
-    {  55,  40 }, {   9,   9 }, {  40,  25 }, { 114,  99 },
-    { 100,  85 }, { 128, 113 }, { 115, 100 }, {  71,  56 },
-    {  86,  71 }, {  25,  10 }, { 129, 114 }, { 128, 128 },
-    { 101,  86 }, {  56,  41 }, {  10,  10 }, {  41,  26 },
-    { 116, 101 }, { 130, 115 }, { 144, 129 }, {  87,  72 },
-    { 102,  87 }, {  26,  11 }, {  72,  57 }, { 131, 116 },
-    { 117, 102 }, { 145, 130 }, {  57,  42 }, { 144, 144 },
-    {  11,  11 }, {  42,  27 }, { 132, 117 }, { 146, 131 },
-    { 103,  88 }, {  88,  73 }, { 118, 103 }, { 160, 145 },
-    {  73,  58 }, { 147, 132 }, { 133, 118 }, {  27,  12 },
-    { 161, 146 }, {  58,  43 }, {  12,  12 }, { 160, 160 },
-    { 119, 104 }, { 148, 133 }, {  89,  74 }, { 134, 119 },
-    { 104,  89 }, { 162, 147 }, {  43,  28 }, {  74,  59 },
-    { 176, 161 }, { 163, 148 }, {  28,  13 }, { 149, 134 },
-    { 120, 105 }, { 135, 120 }, { 177, 162 }, { 164, 149 },
-    {  13,  13 }, { 105,  90 }, {  59,  44 }, {  90,  75 },
-    { 150, 135 }, {  44,  29 }, { 178, 163 }, { 176, 176 },
-    { 136, 121 }, { 165, 150 }, { 121, 106 }, {  75,  60 },
-    { 179, 164 }, { 151, 136 }, {  29,  14 }, {  60,  45 },
-    {  14,  14 }, { 106,  91 }, { 166, 151 }, { 180, 165 },
-    { 192, 177 }, {  91,  76 }, { 192, 192 }, {  45,  30 },
-    { 137, 122 }, { 122, 107 }, { 152, 137 }, { 193, 178 },
-    {  76,  61 }, { 167, 152 }, { 181, 166 }, {  30,  15 },
-    { 194, 179 }, { 208, 193 }, { 182, 167 }, { 107,  92 },
-    { 138, 123 }, {  61,  46 }, { 153, 138 }, {  46,  31 },
-    { 195, 180 }, {  92,  77 }, { 168, 153 }, { 209, 194 },
-    { 196, 181 }, { 208, 208 }, { 123, 108 }, { 183, 168 },
-    { 210, 195 }, {  77,  62 }, { 108,  93 }, { 169, 154 },
-    { 154, 139 }, {  62,  47 }, { 197, 182 }, { 211, 196 },
-    { 184, 169 }, { 224, 209 }, { 224, 224 }, { 139, 124 },
-    {  93,  78 }, { 198, 183 }, { 124, 109 }, {  78,  63 },
-    { 212, 197 }, { 225, 210 }, { 170, 155 }, { 185, 170 },
-    { 155, 140 }, { 213, 198 }, { 199, 184 }, { 109,  94 },
-    { 226, 211 }, { 140, 125 }, {  94,  79 }, { 240, 225 },
-    { 214, 199 }, { 227, 212 }, { 200, 185 }, { 125, 110 },
-    { 241, 226 }, { 186, 171 }, { 171, 156 }, { 156, 141 },
-    { 228, 213 }, { 110,  95 }, { 215, 200 }, { 242, 227 },
-    { 141, 126 }, { 201, 186 }, { 229, 214 }, { 126, 111 },
-    { 216, 201 }, { 243, 228 }, { 172, 157 }, { 187, 172 },
-    { 230, 215 }, { 157, 142 }, { 202, 187 }, { 142, 127 },
-    { 244, 229 }, { 217, 202 }, { 231, 216 }, { 188, 173 },
-    { 245, 230 }, { 158, 143 }, { 173, 158 }, { 232, 217 },
-    { 246, 231 }, { 218, 203 }, { 203, 188 }, { 174, 159 },
-    { 189, 174 }, { 247, 232 }, { 233, 218 }, { 204, 189 },
-    { 219, 204 }, { 248, 233 }, { 190, 175 }, { 234, 219 },
-    { 220, 205 }, { 249, 234 }, { 205, 190 }, { 221, 206 },
-    { 250, 235 }, { 235, 220 }, { 206, 191 }, { 236, 221 },
-    { 222, 207 }, { 251, 236 }, { 237, 222 }, { 252, 237 },
-    { 238, 223 }, { 253, 238 }, { 254, 239 }, {   0,   0 },
-};
-
-const int16_t ff_vp9_col_scan_16x16_nb[256][2] = {
-    {   0,   0 }, {   1,   1 }, {   2,   2 }, {   0,   0 },
-    {   3,   3 }, {  16,  16 }, {   4,   4 }, {  17,  17 },
-    {   5,   5 }, {  18,  18 }, {  16,  16 }, {  19,  19 },
-    {   6,   6 }, {  32,  32 }, {  20,  20 }, {  33,  33 },
-    {   7,   7 }, {  34,  34 }, {  21,  21 }, {  32,  32 },
-    {  35,  35 }, {   8,   8 }, {  48,  48 }, {  22,  22 },
-    {  49,  49 }, {  36,  36 }, {   9,   9 }, {  37,  37 },
-    {  50,  50 }, {  23,  23 }, {  48,  48 }, {  51,  51 },
-    {  10,  10 }, {  64,  64 }, {  38,  38 }, {  24,  24 },
-    {  52,  52 }, {  65,  65 }, {  53,  53 }, {  39,  39 },
-    {  66,  66 }, {  11,  11 }, {  64,  64 }, {  25,  25 },
-    {  67,  67 }, {  54,  54 }, {  80,  80 }, {  40,  40 },
-    {  68,  68 }, {  12,  12 }, {  26,  26 }, {  81,  81 },
-    {  55,  55 }, {  69,  69 }, {  82,  82 }, {  41,  41 },
-    {  13,  13 }, {  83,  83 }, {  80,  80 }, {  70,  70 },
-    {  27,  27 }, {  56,  56 }, {  84,  84 }, {  96,  96 },
-    {  14,  14 }, {  71,  71 }, {  97,  97 }, {  42,  42 },
-    {  85,  85 }, {  57,  57 }, {  98,  98 }, {  28,  28 },
-    {  86,  86 }, {  99,  99 }, {  96,  96 }, {  72,  72 },
-    {  43,  43 }, { 100, 100 }, {  58,  58 }, {  29,  29 },
-    { 112, 112 }, {  87,  87 }, { 113, 113 }, {  73,  73 },
-    { 112, 112 }, { 101, 101 }, {  44,  44 }, {  30,  30 },
-    { 114, 114 }, {  59,  59 }, { 102, 102 }, {  88,  88 },
-    { 115, 115 }, {  74,  74 }, { 128, 128 }, { 116, 116 },
-    {  45,  45 }, { 103, 103 }, {  89,  89 }, {  60,  60 },
-    { 129, 129 }, { 117, 117 }, { 130, 130 }, { 131, 131 },
-    { 104, 104 }, {  75,  75 }, {  46,  46 }, { 118, 118 },
-    { 128, 128 }, {  90,  90 }, {  61,  61 }, { 132, 132 },
-    { 105, 105 }, { 144, 144 }, { 119, 119 }, { 145, 145 },
-    { 133, 133 }, {  76,  76 }, { 146, 146 }, { 120, 120 },
-    {  91,  91 }, { 134, 134 }, { 147, 147 }, {  62,  62 },
-    { 106, 106 }, { 135, 135 }, { 121, 121 }, {  92,  92 },
-    { 148, 148 }, { 144, 144 }, {  77,  77 }, { 149, 149 },
-    { 136, 136 }, { 107, 107 }, { 160, 160 }, { 161, 161 },
-    { 150, 150 }, { 122, 122 }, {  78,  78 }, { 137, 137 },
-    { 162, 162 }, { 151, 151 }, {  93,  93 }, { 163, 163 },
-    { 108, 108 }, { 164, 164 }, { 152, 152 }, { 123, 123 },
-    { 138, 138 }, { 160, 160 }, { 165, 165 }, {  94,  94 },
-    { 176, 176 }, { 166, 166 }, { 109, 109 }, { 153, 153 },
-    { 177, 177 }, { 124, 124 }, { 178, 178 }, { 139, 139 },
-    { 167, 167 }, { 154, 154 }, { 110, 110 }, { 179, 179 },
-    { 176, 176 }, { 180, 180 }, { 168, 168 }, { 140, 140 },
-    { 125, 125 }, { 181, 181 }, { 192, 192 }, { 193, 193 },
-    { 155, 155 }, { 182, 182 }, { 169, 169 }, { 194, 194 },
-    { 126, 126 }, { 141, 141 }, { 195, 195 }, { 183, 183 },
-    { 192, 192 }, { 196, 196 }, { 156, 156 }, { 170, 170 },
-    { 142, 142 }, { 184, 184 }, { 197, 197 }, { 208, 208 },
-    { 198, 198 }, { 209, 209 }, { 171, 171 }, { 157, 157 },
-    { 185, 185 }, { 210, 210 }, { 208, 208 }, { 211, 211 },
-    { 199, 199 }, { 224, 224 }, { 158, 158 }, { 212, 212 },
-    { 224, 224 }, { 186, 186 }, { 200, 200 }, { 172, 172 },
-    { 225, 225 }, { 213, 213 }, { 214, 214 }, { 226, 226 },
-    { 201, 201 }, { 227, 227 }, { 187, 187 }, { 240, 240 },
-    { 215, 215 }, { 173, 173 }, { 228, 228 }, { 241, 241 },
-    { 202, 202 }, { 242, 242 }, { 216, 216 }, { 229, 229 },
-    { 174, 174 }, { 188, 188 }, { 243, 243 }, { 230, 230 },
-    { 203, 203 }, { 217, 217 }, { 231, 231 }, { 244, 244 },
-    { 218, 218 }, { 245, 245 }, { 189, 189 }, { 232, 232 },
-    { 204, 204 }, { 190, 190 }, { 246, 246 }, { 233, 233 },
-    { 247, 247 }, { 219, 219 }, { 205, 205 }, { 248, 248 },
-    { 234, 234 }, { 220, 220 }, { 206, 206 }, { 249, 249 },
-    { 235, 235 }, { 221, 221 }, { 250, 250 }, { 222, 222 },
-    { 236, 236 }, { 237, 237 }, { 251, 251 }, { 238, 238 },
-    { 252, 252 }, { 253, 253 }, { 254, 254 }, {   0,   0 },
-};
-
-const int16_t ff_vp9_row_scan_16x16_nb[256][2] = {
-    {   0,   0 }, {  16,  16 }, {   0,   0 }, {  32,  32 },
-    {   1,   1 }, {  48,  48 }, {  17,  17 }, {   1,   1 },
-    {  64,  64 }, {   2,   2 }, {  33,  33 }, {  80,  80 },
-    {  18,  18 }, {   2,   2 }, {  49,  49 }, {   3,   3 },
-    {  96,  96 }, {  34,  34 }, {  65,  65 }, {  19,  19 },
-    {   3,   3 }, { 112, 112 }, {  50,  50 }, {   4,   4 },
-    {  81,  81 }, {  35,  35 }, {  66,  66 }, {   4,   4 },
-    { 128, 128 }, {  20,  20 }, {  51,  51 }, {  97,  97 },
-    {  82,  82 }, {   5,   5 }, {  36,  36 }, { 144, 144 },
-    {  67,  67 }, { 113, 113 }, {  21,  21 }, {  52,  52 },
-    {   5,   5 }, {  98,  98 }, { 160, 160 }, {  83,  83 },
-    {  37,  37 }, {   6,   6 }, {  68,  68 }, { 129, 129 },
-    {  22,  22 }, {  53,  53 }, { 114, 114 }, {   6,   6 },
-    {  99,  99 }, { 176, 176 }, {  84,  84 }, {  38,  38 },
-    {   7,   7 }, {  69,  69 }, { 145, 145 }, { 130, 130 },
-    { 115, 115 }, {  23,  23 }, {  54,  54 }, { 192, 192 },
-    { 100, 100 }, {   7,   7 }, {  85,  85 }, { 161, 161 },
-    {  39,  39 }, {  70,  70 }, {   8,   8 }, { 146, 146 },
-    { 131, 131 }, { 116, 116 }, {  55,  55 }, { 208, 208 },
-    { 101, 101 }, {  24,  24 }, {  86,  86 }, {   8,   8 },
-    { 132, 132 }, {  40,  40 }, {  71,  71 }, { 177, 177 },
-    { 147, 147 }, { 224, 224 }, { 117, 117 }, { 162, 162 },
-    {   9,   9 }, { 102, 102 }, {  56,  56 }, {  25,  25 },
-    {  87,  87 }, { 148, 148 }, {   9,   9 }, { 133, 133 },
-    {  72,  72 }, { 118, 118 }, { 193, 193 }, { 163, 163 },
-    {  41,  41 }, { 103, 103 }, { 178, 178 }, {  10,  10 },
-    {  57,  57 }, { 149, 149 }, { 134, 134 }, {  88,  88 },
-    {  26,  26 }, { 119, 119 }, {  10,  10 }, { 164, 164 },
-    { 104, 104 }, {  73,  73 }, { 209, 209 }, { 179, 179 },
-    {  42,  42 }, {  11,  11 }, { 194, 194 }, { 135, 135 },
-    { 165, 165 }, { 150, 150 }, {  58,  58 }, {  27,  27 },
-    {  89,  89 }, {  11,  11 }, { 120, 120 }, {  74,  74 },
-    {  43,  43 }, { 225, 225 }, { 105, 105 }, {  12,  12 },
-    { 180, 180 }, { 151, 151 }, { 195, 195 }, { 136, 136 },
-    {  28,  28 }, { 166, 166 }, { 121, 121 }, {  59,  59 },
-    {  12,  12 }, { 210, 210 }, {  90,  90 }, { 106, 106 },
-    {  44,  44 }, { 181, 181 }, {  75,  75 }, { 152, 152 },
-    {  13,  13 }, { 167, 167 }, { 137, 137 }, {  13,  13 },
-    {  60,  60 }, { 196, 196 }, { 122, 122 }, {  29,  29 },
-    {  91,  91 }, {  14,  14 }, { 182, 182 }, {  76,  76 },
-    { 211, 211 }, { 153, 153 }, {  14,  14 }, { 107, 107 },
-    { 138, 138 }, {  45,  45 }, { 226, 226 }, { 168, 168 },
-    { 197, 197 }, { 123, 123 }, {  30,  30 }, {  61,  61 },
-    {  15,  15 }, {  92,  92 }, { 154, 154 }, { 183, 183 },
-    { 169, 169 }, { 108, 108 }, { 212, 212 }, {  77,  77 },
-    { 139, 139 }, { 198, 198 }, {  46,  46 }, { 124, 124 },
-    { 227, 227 }, {  62,  62 }, {  31,  31 }, { 184, 184 },
-    {  93,  93 }, { 170, 170 }, { 155, 155 }, { 185, 185 },
-    {  78,  78 }, {  47,  47 }, { 199, 199 }, { 213, 213 },
-    { 140, 140 }, {  63,  63 }, { 109, 109 }, { 125, 125 },
-    {  94,  94 }, { 200, 200 }, { 171, 171 }, { 156, 156 },
-    { 228, 228 }, { 186, 186 }, { 214, 214 }, { 201, 201 },
-    {  79,  79 }, { 141, 141 }, { 110, 110 }, { 229, 229 },
-    {  95,  95 }, { 126, 126 }, { 215, 215 }, { 172, 172 },
-    { 111, 111 }, { 142, 142 }, { 202, 202 }, { 157, 157 },
-    { 216, 216 }, { 230, 230 }, { 217, 217 }, { 187, 187 },
-    { 127, 127 }, { 231, 231 }, { 158, 158 }, { 173, 173 },
-    { 143, 143 }, { 203, 203 }, { 188, 188 }, { 232, 232 },
-    { 218, 218 }, { 233, 233 }, { 159, 159 }, { 174, 174 },
-    { 204, 204 }, { 189, 189 }, { 234, 234 }, { 219, 219 },
-    { 175, 175 }, { 205, 205 }, { 235, 235 }, { 220, 220 },
-    { 190, 190 }, { 236, 236 }, { 206, 206 }, { 191, 191 },
-    { 221, 221 }, { 207, 207 }, { 237, 237 }, { 222, 222 },
-    { 238, 238 }, { 223, 223 }, { 239, 239 }, {   0,   0 },
-};
-
-const int16_t ff_vp9_default_scan_32x32_nb[1024][2] = {
-    {    0,    0 }, {    0,    0 }, {    1,    1 }, {   32,    1 },
-    {   32,   32 }, {    2,    2 }, {   33,    2 }, {   64,   33 },
-    {    3,    3 }, {   64,   64 }, {   34,    3 }, {   65,   34 },
-    {    4,    4 }, {   35,    4 }, {   96,   65 }, {   66,   35 },
-    {   96,   96 }, {   97,   66 }, {   67,   36 }, {   36,    5 },
-    {    5,    5 }, {  128,   97 }, {   98,   67 }, {    6,    6 },
-    {  128,  128 }, {   68,   37 }, {   37,    6 }, {  129,   98 },
-    {   99,   68 }, {  160,  129 }, {  130,   99 }, {   38,    7 },
-    {   69,   38 }, {    7,    7 }, {  100,   69 }, {  161,  130 },
-    {  131,  100 }, {  160,  160 }, {   70,   39 }, {   39,    8 },
-    {    8,    8 }, {  101,   70 }, {  162,  131 }, {  132,  101 },
-    {  192,  161 }, {   71,   40 }, {  192,  192 }, {  102,   71 },
-    {   40,    9 }, {  163,  132 }, {    9,    9 }, {  193,  162 },
-    {  133,  102 }, {  164,  133 }, {   72,   41 }, {  103,   72 },
-    {  134,  103 }, {  224,  193 }, {   41,   10 }, {  194,  163 },
-    {   10,   10 }, {  224,  224 }, {  165,  134 }, {  225,  194 },
-    {  195,  164 }, {   73,   42 }, {  104,   73 }, {  135,  104 },
-    {   42,   11 }, {   11,   11 }, {  166,  135 }, {  196,  165 },
-    {  226,  195 }, {  256,  225 }, {   74,   43 }, {  105,   74 },
-    {  136,  105 }, {  227,  196 }, {   43,   12 }, {  197,  166 },
-    {  167,  136 }, {  257,  226 }, {  256,  256 }, {   12,   12 },
-    {  228,  197 }, {   75,   44 }, {  106,   75 }, {  198,  167 },
-    {  137,  106 }, {  258,  227 }, {  168,  137 }, {  288,  257 },
-    {   44,   13 }, {  229,  198 }, {  259,  228 }, {  199,  168 },
-    {  107,   76 }, {   13,   13 }, {  169,  138 }, {  138,  107 },
-    {  288,  288 }, {  289,  258 }, {   76,   45 }, {  230,  199 },
-    {  260,  229 }, {   45,   14 }, {  200,  169 }, {  139,  108 },
-    {  290,  259 }, {  108,   77 }, {  231,  200 }, {  320,  289 },
-    {  261,  230 }, {  170,  139 }, {   77,   46 }, {  291,  260 },
-    {   14,   14 }, {  321,  290 }, {  201,  170 }, {  262,  231 },
-    {  320,  320 }, {  171,  140 }, {  292,  261 }, {  232,  201 },
-    {  140,  109 }, {  322,  291 }, {  109,   78 }, {   46,   15 },
-    {  202,  171 }, {  263,  232 }, {  233,  202 }, {  293,  262 },
-    {  352,  321 }, {  323,  292 }, {   15,   15 }, {   78,   47 },
-    {  203,  172 }, {  264,  233 }, {  294,  263 }, {  324,  293 },
-    {  172,  141 }, {  353,  322 }, {  141,  110 }, {  234,  203 },
-    {  352,  352 }, {   47,   16 }, {  295,  264 }, {  110,   79 },
-    {  265,  234 }, {  354,  323 }, {  325,  294 }, {   79,   48 },
-    {   16,   16 }, {  204,  173 }, {  235,  204 }, {  173,  142 },
-    {  355,  324 }, {  384,  353 }, {  326,  295 }, {  142,  111 },
-    {  296,  265 }, {  266,  235 }, {  356,  325 }, {  385,  354 },
-    {  111,   80 }, {   48,   17 }, {  327,  296 }, {  297,  266 },
-    {  205,  174 }, {  384,  384 }, {  236,  205 }, {  357,  326 },
-    {  386,  355 }, {   80,   49 }, {  174,  143 }, {   17,   17 },
-    {  328,  297 }, {  358,  327 }, {  387,  356 }, {  298,  267 },
-    {  329,  298 }, {  388,  357 }, {  112,   81 }, {  416,  385 },
-    {  237,  206 }, {  359,  328 }, {   49,   18 }, {  206,  175 },
-    {  417,  386 }, {  389,  358 }, {  330,  299 }, {   18,   18 },
-    {  416,  416 }, {  360,  329 }, {   81,   50 }, {  418,  387 },
-    {  390,  359 }, {  238,  207 }, {   50,   19 }, {  361,  330 },
-    {  419,  388 }, {  113,   82 }, {  448,  417 }, {  448,  448 },
-    {  420,  389 }, {   82,   51 }, {  362,  331 }, {  449,  418 },
-    {  421,  390 }, {  480,  480 }, {  450,  419 }, {  422,  391 },
-    {  114,   83 }, {  451,  420 }, {  480,  449 }, {  452,  421 },
-    {  481,  450 }, {  453,  422 }, {  512,  512 }, {  482,  451 },
-    {  454,  423 }, {  512,  481 }, {  483,  452 }, {  513,  482 },
-    {  484,  453 }, {  514,  483 }, {  485,  454 }, {  544,  513 },
-    {  544,  544 }, {  486,  455 }, {  545,  514 }, {  546,  515 },
-    {  576,  576 }, {  576,  545 }, {  577,  546 }, {  578,  547 },
-    {  608,  577 }, {  609,  578 }, {  610,  579 }, {   19,   19 },
-    {  143,  112 }, {  267,  236 }, {  391,  360 }, {  515,  484 },
-    {  608,  608 }, {   20,   20 }, {   51,   20 }, {  144,  113 },
-    {  175,  144 }, {  268,  237 }, {  299,  268 }, {  392,  361 },
-    {  423,  392 }, {  516,  485 }, {  547,  516 }, {  640,  609 },
-    {  640,  640 }, {   21,   21 }, {   52,   21 }, {   83,   52 },
-    {  145,  114 }, {  176,  145 }, {  207,  176 }, {  269,  238 },
-    {  300,  269 }, {  331,  300 }, {  393,  362 }, {  424,  393 },
-    {  455,  424 }, {  517,  486 }, {  548,  517 }, {  579,  548 },
-    {  641,  610 }, {  672,  641 }, {  672,  672 }, {   22,   22 },
-    {   53,   22 }, {   84,   53 }, {  115,   84 }, {  146,  115 },
-    {  177,  146 }, {  208,  177 }, {  239,  208 }, {  270,  239 },
-    {  301,  270 }, {  332,  301 }, {  363,  332 }, {  394,  363 },
-    {  425,  394 }, {  456,  425 }, {  487,  456 }, {  518,  487 },
-    {  549,  518 }, {  580,  549 }, {  611,  580 }, {  642,  611 },
-    {  673,  642 }, {  704,  673 }, {  704,  704 }, {   54,   23 },
-    {   85,   54 }, {  116,   85 }, {  178,  147 }, {  209,  178 },
-    {  240,  209 }, {  302,  271 }, {  333,  302 }, {  364,  333 },
-    {  426,  395 }, {  457,  426 }, {  488,  457 }, {  550,  519 },
-    {  581,  550 }, {  612,  581 }, {  674,  643 }, {  705,  674 },
-    {  736,  705 }, {   86,   55 }, {  117,   86 }, {  210,  179 },
-    {  241,  210 }, {  334,  303 }, {  365,  334 }, {  458,  427 },
-    {  489,  458 }, {  582,  551 }, {  613,  582 }, {  706,  675 },
-    {  737,  706 }, {  118,   87 }, {  242,  211 }, {  366,  335 },
-    {  490,  459 }, {  614,  583 }, {  738,  707 }, {   23,   23 },
-    {  147,  116 }, {  271,  240 }, {  395,  364 }, {  519,  488 },
-    {  643,  612 }, {  736,  736 }, {   24,   24 }, {   55,   24 },
-    {  148,  117 }, {  179,  148 }, {  272,  241 }, {  303,  272 },
-    {  396,  365 }, {  427,  396 }, {  520,  489 }, {  551,  520 },
-    {  644,  613 }, {  675,  644 }, {  768,  737 }, {  768,  768 },
-    {   25,   25 }, {   56,   25 }, {   87,   56 }, {  149,  118 },
-    {  180,  149 }, {  211,  180 }, {  273,  242 }, {  304,  273 },
-    {  335,  304 }, {  397,  366 }, {  428,  397 }, {  459,  428 },
-    {  521,  490 }, {  552,  521 }, {  583,  552 }, {  645,  614 },
-    {  676,  645 }, {  707,  676 }, {  769,  738 }, {  800,  769 },
-    {  800,  800 }, {   26,   26 }, {   57,   26 }, {   88,   57 },
-    {  119,   88 }, {  150,  119 }, {  181,  150 }, {  212,  181 },
-    {  243,  212 }, {  274,  243 }, {  305,  274 }, {  336,  305 },
-    {  367,  336 }, {  398,  367 }, {  429,  398 }, {  460,  429 },
-    {  491,  460 }, {  522,  491 }, {  553,  522 }, {  584,  553 },
-    {  615,  584 }, {  646,  615 }, {  677,  646 }, {  708,  677 },
-    {  739,  708 }, {  770,  739 }, {  801,  770 }, {  832,  801 },
-    {  832,  832 }, {   58,   27 }, {   89,   58 }, {  120,   89 },
-    {  182,  151 }, {  213,  182 }, {  244,  213 }, {  306,  275 },
-    {  337,  306 }, {  368,  337 }, {  430,  399 }, {  461,  430 },
-    {  492,  461 }, {  554,  523 }, {  585,  554 }, {  616,  585 },
-    {  678,  647 }, {  709,  678 }, {  740,  709 }, {  802,  771 },
-    {  833,  802 }, {  864,  833 }, {   90,   59 }, {  121,   90 },
-    {  214,  183 }, {  245,  214 }, {  338,  307 }, {  369,  338 },
-    {  462,  431 }, {  493,  462 }, {  586,  555 }, {  617,  586 },
-    {  710,  679 }, {  741,  710 }, {  834,  803 }, {  865,  834 },
-    {  122,   91 }, {  246,  215 }, {  370,  339 }, {  494,  463 },
-    {  618,  587 }, {  742,  711 }, {  866,  835 }, {   27,   27 },
-    {  151,  120 }, {  275,  244 }, {  399,  368 }, {  523,  492 },
-    {  647,  616 }, {  771,  740 }, {  864,  864 }, {   28,   28 },
-    {   59,   28 }, {  152,  121 }, {  183,  152 }, {  276,  245 },
-    {  307,  276 }, {  400,  369 }, {  431,  400 }, {  524,  493 },
-    {  555,  524 }, {  648,  617 }, {  679,  648 }, {  772,  741 },
-    {  803,  772 }, {  896,  865 }, {  896,  896 }, {   29,   29 },
-    {   60,   29 }, {   91,   60 }, {  153,  122 }, {  184,  153 },
-    {  215,  184 }, {  277,  246 }, {  308,  277 }, {  339,  308 },
-    {  401,  370 }, {  432,  401 }, {  463,  432 }, {  525,  494 },
-    {  556,  525 }, {  587,  556 }, {  649,  618 }, {  680,  649 },
-    {  711,  680 }, {  773,  742 }, {  804,  773 }, {  835,  804 },
-    {  897,  866 }, {  928,  897 }, {  928,  928 }, {   30,   30 },
-    {   61,   30 }, {   92,   61 }, {  123,   92 }, {  154,  123 },
-    {  185,  154 }, {  216,  185 }, {  247,  216 }, {  278,  247 },
-    {  309,  278 }, {  340,  309 }, {  371,  340 }, {  402,  371 },
-    {  433,  402 }, {  464,  433 }, {  495,  464 }, {  526,  495 },
-    {  557,  526 }, {  588,  557 }, {  619,  588 }, {  650,  619 },
-    {  681,  650 }, {  712,  681 }, {  743,  712 }, {  774,  743 },
-    {  805,  774 }, {  836,  805 }, {  867,  836 }, {  898,  867 },
-    {  929,  898 }, {  960,  929 }, {  960,  960 }, {   62,   31 },
-    {   93,   62 }, {  124,   93 }, {  186,  155 }, {  217,  186 },
-    {  248,  217 }, {  310,  279 }, {  341,  310 }, {  372,  341 },
-    {  434,  403 }, {  465,  434 }, {  496,  465 }, {  558,  527 },
-    {  589,  558 }, {  620,  589 }, {  682,  651 }, {  713,  682 },
-    {  744,  713 }, {  806,  775 }, {  837,  806 }, {  868,  837 },
-    {  930,  899 }, {  961,  930 }, {  992,  961 }, {   94,   63 },
-    {  125,   94 }, {  218,  187 }, {  249,  218 }, {  342,  311 },
-    {  373,  342 }, {  466,  435 }, {  497,  466 }, {  590,  559 },
-    {  621,  590 }, {  714,  683 }, {  745,  714 }, {  838,  807 },
-    {  869,  838 }, {  962,  931 }, {  993,  962 }, {  126,   95 },
-    {  250,  219 }, {  374,  343 }, {  498,  467 }, {  622,  591 },
-    {  746,  715 }, {  870,  839 }, {  994,  963 }, {  155,  124 },
-    {  279,  248 }, {  403,  372 }, {  527,  496 }, {  651,  620 },
-    {  775,  744 }, {  899,  868 }, {  156,  125 }, {  187,  156 },
-    {  280,  249 }, {  311,  280 }, {  404,  373 }, {  435,  404 },
-    {  528,  497 }, {  559,  528 }, {  652,  621 }, {  683,  652 },
-    {  776,  745 }, {  807,  776 }, {  900,  869 }, {  931,  900 },
-    {  157,  126 }, {  188,  157 }, {  219,  188 }, {  281,  250 },
-    {  312,  281 }, {  343,  312 }, {  405,  374 }, {  436,  405 },
-    {  467,  436 }, {  529,  498 }, {  560,  529 }, {  591,  560 },
-    {  653,  622 }, {  684,  653 }, {  715,  684 }, {  777,  746 },
-    {  808,  777 }, {  839,  808 }, {  901,  870 }, {  932,  901 },
-    {  963,  932 }, {  158,  127 }, {  189,  158 }, {  220,  189 },
-    {  251,  220 }, {  282,  251 }, {  313,  282 }, {  344,  313 },
-    {  375,  344 }, {  406,  375 }, {  437,  406 }, {  468,  437 },
-    {  499,  468 }, {  530,  499 }, {  561,  530 }, {  592,  561 },
-    {  623,  592 }, {  654,  623 }, {  685,  654 }, {  716,  685 },
-    {  747,  716 }, {  778,  747 }, {  809,  778 }, {  840,  809 },
-    {  871,  840 }, {  902,  871 }, {  933,  902 }, {  964,  933 },
-    {  995,  964 }, {  190,  159 }, {  221,  190 }, {  252,  221 },
-    {  314,  283 }, {  345,  314 }, {  376,  345 }, {  438,  407 },
-    {  469,  438 }, {  500,  469 }, {  562,  531 }, {  593,  562 },
-    {  624,  593 }, {  686,  655 }, {  717,  686 }, {  748,  717 },
-    {  810,  779 }, {  841,  810 }, {  872,  841 }, {  934,  903 },
-    {  965,  934 }, {  996,  965 }, {  222,  191 }, {  253,  222 },
-    {  346,  315 }, {  377,  346 }, {  470,  439 }, {  501,  470 },
-    {  594,  563 }, {  625,  594 }, {  718,  687 }, {  749,  718 },
-    {  842,  811 }, {  873,  842 }, {  966,  935 }, {  997,  966 },
-    {  254,  223 }, {  378,  347 }, {  502,  471 }, {  626,  595 },
-    {  750,  719 }, {  874,  843 }, {  998,  967 }, {  283,  252 },
-    {  407,  376 }, {  531,  500 }, {  655,  624 }, {  779,  748 },
-    {  903,  872 }, {  284,  253 }, {  315,  284 }, {  408,  377 },
-    {  439,  408 }, {  532,  501 }, {  563,  532 }, {  656,  625 },
-    {  687,  656 }, {  780,  749 }, {  811,  780 }, {  904,  873 },
-    {  935,  904 }, {  285,  254 }, {  316,  285 }, {  347,  316 },
-    {  409,  378 }, {  440,  409 }, {  471,  440 }, {  533,  502 },
-    {  564,  533 }, {  595,  564 }, {  657,  626 }, {  688,  657 },
-    {  719,  688 }, {  781,  750 }, {  812,  781 }, {  843,  812 },
-    {  905,  874 }, {  936,  905 }, {  967,  936 }, {  286,  255 },
-    {  317,  286 }, {  348,  317 }, {  379,  348 }, {  410,  379 },
-    {  441,  410 }, {  472,  441 }, {  503,  472 }, {  534,  503 },
-    {  565,  534 }, {  596,  565 }, {  627,  596 }, {  658,  627 },
-    {  689,  658 }, {  720,  689 }, {  751,  720 }, {  782,  751 },
-    {  813,  782 }, {  844,  813 }, {  875,  844 }, {  906,  875 },
-    {  937,  906 }, {  968,  937 }, {  999,  968 }, {  318,  287 },
-    {  349,  318 }, {  380,  349 }, {  442,  411 }, {  473,  442 },
-    {  504,  473 }, {  566,  535 }, {  597,  566 }, {  628,  597 },
-    {  690,  659 }, {  721,  690 }, {  752,  721 }, {  814,  783 },
-    {  845,  814 }, {  876,  845 }, {  938,  907 }, {  969,  938 },
-    { 1000,  969 }, {  350,  319 }, {  381,  350 }, {  474,  443 },
-    {  505,  474 }, {  598,  567 }, {  629,  598 }, {  722,  691 },
-    {  753,  722 }, {  846,  815 }, {  877,  846 }, {  970,  939 },
-    { 1001,  970 }, {  382,  351 }, {  506,  475 }, {  630,  599 },
-    {  754,  723 }, {  878,  847 }, { 1002,  971 }, {  411,  380 },
-    {  535,  504 }, {  659,  628 }, {  783,  752 }, {  907,  876 },
-    {  412,  381 }, {  443,  412 }, {  536,  505 }, {  567,  536 },
-    {  660,  629 }, {  691,  660 }, {  784,  753 }, {  815,  784 },
-    {  908,  877 }, {  939,  908 }, {  413,  382 }, {  444,  413 },
-    {  475,  444 }, {  537,  506 }, {  568,  537 }, {  599,  568 },
-    {  661,  630 }, {  692,  661 }, {  723,  692 }, {  785,  754 },
-    {  816,  785 }, {  847,  816 }, {  909,  878 }, {  940,  909 },
-    {  971,  940 }, {  414,  383 }, {  445,  414 }, {  476,  445 },
-    {  507,  476 }, {  538,  507 }, {  569,  538 }, {  600,  569 },
-    {  631,  600 }, {  662,  631 }, {  693,  662 }, {  724,  693 },
-    {  755,  724 }, {  786,  755 }, {  817,  786 }, {  848,  817 },
-    {  879,  848 }, {  910,  879 }, {  941,  910 }, {  972,  941 },
-    { 1003,  972 }, {  446,  415 }, {  477,  446 }, {  508,  477 },
-    {  570,  539 }, {  601,  570 }, {  632,  601 }, {  694,  663 },
-    {  725,  694 }, {  756,  725 }, {  818,  787 }, {  849,  818 },
-    {  880,  849 }, {  942,  911 }, {  973,  942 }, { 1004,  973 },
-    {  478,  447 }, {  509,  478 }, {  602,  571 }, {  633,  602 },
-    {  726,  695 }, {  757,  726 }, {  850,  819 }, {  881,  850 },
-    {  974,  943 }, { 1005,  974 }, {  510,  479 }, {  634,  603 },
-    {  758,  727 }, {  882,  851 }, { 1006,  975 }, {  539,  508 },
-    {  663,  632 }, {  787,  756 }, {  911,  880 }, {  540,  509 },
-    {  571,  540 }, {  664,  633 }, {  695,  664 }, {  788,  757 },
-    {  819,  788 }, {  912,  881 }, {  943,  912 }, {  541,  510 },
-    {  572,  541 }, {  603,  572 }, {  665,  634 }, {  696,  665 },
-    {  727,  696 }, {  789,  758 }, {  820,  789 }, {  851,  820 },
-    {  913,  882 }, {  944,  913 }, {  975,  944 }, {  542,  511 },
-    {  573,  542 }, {  604,  573 }, {  635,  604 }, {  666,  635 },
-    {  697,  666 }, {  728,  697 }, {  759,  728 }, {  790,  759 },
-    {  821,  790 }, {  852,  821 }, {  883,  852 }, {  914,  883 },
-    {  945,  914 }, {  976,  945 }, { 1007,  976 }, {  574,  543 },
-    {  605,  574 }, {  636,  605 }, {  698,  667 }, {  729,  698 },
-    {  760,  729 }, {  822,  791 }, {  853,  822 }, {  884,  853 },
-    {  946,  915 }, {  977,  946 }, { 1008,  977 }, {  606,  575 },
-    {  637,  606 }, {  730,  699 }, {  761,  730 }, {  854,  823 },
-    {  885,  854 }, {  978,  947 }, { 1009,  978 }, {  638,  607 },
-    {  762,  731 }, {  886,  855 }, { 1010,  979 }, {  667,  636 },
-    {  791,  760 }, {  915,  884 }, {  668,  637 }, {  699,  668 },
-    {  792,  761 }, {  823,  792 }, {  916,  885 }, {  947,  916 },
-    {  669,  638 }, {  700,  669 }, {  731,  700 }, {  793,  762 },
-    {  824,  793 }, {  855,  824 }, {  917,  886 }, {  948,  917 },
-    {  979,  948 }, {  670,  639 }, {  701,  670 }, {  732,  701 },
-    {  763,  732 }, {  794,  763 }, {  825,  794 }, {  856,  825 },
-    {  887,  856 }, {  918,  887 }, {  949,  918 }, {  980,  949 },
-    { 1011,  980 }, {  702,  671 }, {  733,  702 }, {  764,  733 },
-    {  826,  795 }, {  857,  826 }, {  888,  857 }, {  950,  919 },
-    {  981,  950 }, { 1012,  981 }, {  734,  703 }, {  765,  734 },
-    {  858,  827 }, {  889,  858 }, {  982,  951 }, { 1013,  982 },
-    {  766,  735 }, {  890,  859 }, { 1014,  983 }, {  795,  764 },
-    {  919,  888 }, {  796,  765 }, {  827,  796 }, {  920,  889 },
-    {  951,  920 }, {  797,  766 }, {  828,  797 }, {  859,  828 },
-    {  921,  890 }, {  952,  921 }, {  983,  952 }, {  798,  767 },
-    {  829,  798 }, {  860,  829 }, {  891,  860 }, {  922,  891 },
-    {  953,  922 }, {  984,  953 }, { 1015,  984 }, {  830,  799 },
-    {  861,  830 }, {  892,  861 }, {  954,  923 }, {  985,  954 },
-    { 1016,  985 }, {  862,  831 }, {  893,  862 }, {  986,  955 },
-    { 1017,  986 }, {  894,  863 }, { 1018,  987 }, {  923,  892 },
-    {  924,  893 }, {  955,  924 }, {  925,  894 }, {  956,  925 },
-    {  987,  956 }, {  926,  895 }, {  957,  926 }, {  988,  957 },
-    { 1019,  988 }, {  958,  927 }, {  989,  958 }, { 1020,  989 },
-    {  990,  959 }, { 1021,  990 }, { 1022,  991 }, {    0,    0 },
-};
-
-const int16_t (*ff_vp9_scans_nb[5][4])[2] = {
-    {
-        ff_vp9_default_scan_4x4_nb, ff_vp9_col_scan_4x4_nb,
-        ff_vp9_row_scan_4x4_nb, ff_vp9_default_scan_4x4_nb
-    }, {
-        ff_vp9_default_scan_8x8_nb, ff_vp9_col_scan_8x8_nb,
-        ff_vp9_row_scan_8x8_nb, ff_vp9_default_scan_8x8_nb
-    }, {
-        ff_vp9_default_scan_16x16_nb, ff_vp9_col_scan_16x16_nb,
-        ff_vp9_row_scan_16x16_nb, ff_vp9_default_scan_16x16_nb
-    }, {
-        ff_vp9_default_scan_32x32_nb, ff_vp9_default_scan_32x32_nb,
-        ff_vp9_default_scan_32x32_nb, ff_vp9_default_scan_32x32_nb
-    }, { // lossless
-        ff_vp9_default_scan_4x4_nb, ff_vp9_default_scan_4x4_nb,
-        ff_vp9_default_scan_4x4_nb, ff_vp9_default_scan_4x4_nb
-    }
-};
-
-const uint8_t ff_vp9_model_pareto8[256][8] = {
-    {   6,  86, 128,  11,  87,  42,  91,  52 },
-    {   3,  86, 128,   6,  86,  23,  88,  29 },
-    {   6,  86, 128,  11,  87,  42,  91,  52 },
-    {   9,  86, 129,  17,  88,  61,  94,  76 },
-    {  12,  86, 129,  22,  88,  77,  97,  93 },
-    {  15,  87, 129,  28,  89,  93, 100, 110 },
-    {  17,  87, 129,  33,  90, 105, 103, 123 },
-    {  20,  88, 130,  38,  91, 118, 106, 136 },
-    {  23,  88, 130,  43,  91, 128, 108, 146 },
-    {  26,  89, 131,  48,  92, 139, 111, 156 },
-    {  28,  89, 131,  53,  93, 147, 114, 163 },
-    {  31,  90, 131,  58,  94, 156, 117, 171 },
-    {  34,  90, 131,  62,  94, 163, 119, 177 },
-    {  37,  90, 132,  66,  95, 171, 122, 184 },
-    {  39,  90, 132,  70,  96, 177, 124, 189 },
-    {  42,  91, 132,  75,  97, 183, 127, 194 },
-    {  44,  91, 132,  79,  97, 188, 129, 198 },
-    {  47,  92, 133,  83,  98, 193, 132, 202 },
-    {  49,  92, 133,  86,  99, 197, 134, 205 },
-    {  52,  93, 133,  90, 100, 201, 137, 208 },
-    {  54,  93, 133,  94, 100, 204, 139, 211 },
-    {  57,  94, 134,  98, 101, 208, 142, 214 },
-    {  59,  94, 134, 101, 102, 211, 144, 216 },
-    {  62,  94, 135, 105, 103, 214, 146, 218 },
-    {  64,  94, 135, 108, 103, 216, 148, 220 },
-    {  66,  95, 135, 111, 104, 219, 151, 222 },
-    {  68,  95, 135, 114, 105, 221, 153, 223 },
-    {  71,  96, 136, 117, 106, 224, 155, 225 },
-    {  73,  96, 136, 120, 106, 225, 157, 226 },
-    {  76,  97, 136, 123, 107, 227, 159, 228 },
-    {  78,  97, 136, 126, 108, 229, 160, 229 },
-    {  80,  98, 137, 129, 109, 231, 162, 231 },
-    {  82,  98, 137, 131, 109, 232, 164, 232 },
-    {  84,  98, 138, 134, 110, 234, 166, 233 },
-    {  86,  98, 138, 137, 111, 235, 168, 234 },
-    {  89,  99, 138, 140, 112, 236, 170, 235 },
-    {  91,  99, 138, 142, 112, 237, 171, 235 },
-    {  93, 100, 139, 145, 113, 238, 173, 236 },
-    {  95, 100, 139, 147, 114, 239, 174, 237 },
-    {  97, 101, 140, 149, 115, 240, 176, 238 },
-    {  99, 101, 140, 151, 115, 241, 177, 238 },
-    { 101, 102, 140, 154, 116, 242, 179, 239 },
-    { 103, 102, 140, 156, 117, 242, 180, 239 },
-    { 105, 103, 141, 158, 118, 243, 182, 240 },
-    { 107, 103, 141, 160, 118, 243, 183, 240 },
-    { 109, 104, 141, 162, 119, 244, 185, 241 },
-    { 111, 104, 141, 164, 119, 244, 186, 241 },
-    { 113, 104, 142, 166, 120, 245, 187, 242 },
-    { 114, 104, 142, 168, 121, 245, 188, 242 },
-    { 116, 105, 143, 170, 122, 246, 190, 243 },
-    { 118, 105, 143, 171, 122, 246, 191, 243 },
-    { 120, 106, 143, 173, 123, 247, 192, 244 },
-    { 121, 106, 143, 175, 124, 247, 193, 244 },
-    { 123, 107, 144, 177, 125, 248, 195, 244 },
-    { 125, 107, 144, 178, 125, 248, 196, 244 },
-    { 127, 108, 145, 180, 126, 249, 197, 245 },
-    { 128, 108, 145, 181, 127, 249, 198, 245 },
-    { 130, 109, 145, 183, 128, 249, 199, 245 },
-    { 132, 109, 145, 184, 128, 249, 200, 245 },
-    { 134, 110, 146, 186, 129, 250, 201, 246 },
-    { 135, 110, 146, 187, 130, 250, 202, 246 },
-    { 137, 111, 147, 189, 131, 251, 203, 246 },
-    { 138, 111, 147, 190, 131, 251, 204, 246 },
-    { 140, 112, 147, 192, 132, 251, 205, 247 },
-    { 141, 112, 147, 193, 132, 251, 206, 247 },
-    { 143, 113, 148, 194, 133, 251, 207, 247 },
-    { 144, 113, 148, 195, 134, 251, 207, 247 },
-    { 146, 114, 149, 197, 135, 252, 208, 248 },
-    { 147, 114, 149, 198, 135, 252, 209, 248 },
-    { 149, 115, 149, 199, 136, 252, 210, 248 },
-    { 150, 115, 149, 200, 137, 252, 210, 248 },
-    { 152, 115, 150, 201, 138, 252, 211, 248 },
-    { 153, 115, 150, 202, 138, 252, 212, 248 },
-    { 155, 116, 151, 204, 139, 253, 213, 249 },
-    { 156, 116, 151, 205, 139, 253, 213, 249 },
-    { 158, 117, 151, 206, 140, 253, 214, 249 },
-    { 159, 117, 151, 207, 141, 253, 215, 249 },
-    { 161, 118, 152, 208, 142, 253, 216, 249 },
-    { 162, 118, 152, 209, 142, 253, 216, 249 },
-    { 163, 119, 153, 210, 143, 253, 217, 249 },
-    { 164, 119, 153, 211, 143, 253, 217, 249 },
-    { 166, 120, 153, 212, 144, 254, 218, 250 },
-    { 167, 120, 153, 212, 145, 254, 219, 250 },
-    { 168, 121, 154, 213, 146, 254, 220, 250 },
-    { 169, 121, 154, 214, 146, 254, 220, 250 },
-    { 171, 122, 155, 215, 147, 254, 221, 250 },
-    { 172, 122, 155, 216, 147, 254, 221, 250 },
-    { 173, 123, 155, 217, 148, 254, 222, 250 },
-    { 174, 123, 155, 217, 149, 254, 222, 250 },
-    { 176, 124, 156, 218, 150, 254, 223, 250 },
-    { 177, 124, 156, 219, 150, 254, 223, 250 },
-    { 178, 125, 157, 220, 151, 254, 224, 251 },
-    { 179, 125, 157, 220, 151, 254, 224, 251 },
-    { 180, 126, 157, 221, 152, 254, 225, 251 },
-    { 181, 126, 157, 221, 152, 254, 225, 251 },
-    { 183, 127, 158, 222, 153, 254, 226, 251 },
-    { 184, 127, 158, 223, 154, 254, 226, 251 },
-    { 185, 128, 159, 224, 155, 255, 227, 251 },
-    { 186, 128, 159, 224, 155, 255, 227, 251 },
-    { 187, 129, 160, 225, 156, 255, 228, 251 },
-    { 188, 130, 160, 225, 156, 255, 228, 251 },
-    { 189, 131, 160, 226, 157, 255, 228, 251 },
-    { 190, 131, 160, 226, 158, 255, 228, 251 },
-    { 191, 132, 161, 227, 159, 255, 229, 251 },
-    { 192, 132, 161, 227, 159, 255, 229, 251 },
-    { 193, 133, 162, 228, 160, 255, 230, 252 },
-    { 194, 133, 162, 229, 160, 255, 230, 252 },
-    { 195, 134, 163, 230, 161, 255, 231, 252 },
-    { 196, 134, 163, 230, 161, 255, 231, 252 },
-    { 197, 135, 163, 231, 162, 255, 231, 252 },
-    { 198, 135, 163, 231, 162, 255, 231, 252 },
-    { 199, 136, 164, 232, 163, 255, 232, 252 },
-    { 200, 136, 164, 232, 164, 255, 232, 252 },
-    { 201, 137, 165, 233, 165, 255, 233, 252 },
-    { 201, 137, 165, 233, 165, 255, 233, 252 },
-    { 202, 138, 166, 233, 166, 255, 233, 252 },
-    { 203, 138, 166, 233, 166, 255, 233, 252 },
-    { 204, 139, 166, 234, 167, 255, 234, 252 },
-    { 205, 139, 166, 234, 167, 255, 234, 252 },
-    { 206, 140, 167, 235, 168, 255, 235, 252 },
-    { 206, 140, 167, 235, 168, 255, 235, 252 },
-    { 207, 141, 168, 236, 169, 255, 235, 252 },
-    { 208, 141, 168, 236, 170, 255, 235, 252 },
-    { 209, 142, 169, 237, 171, 255, 236, 252 },
-    { 209, 143, 169, 237, 171, 255, 236, 252 },
-    { 210, 144, 169, 237, 172, 255, 236, 252 },
-    { 211, 144, 169, 237, 172, 255, 236, 252 },
-    { 212, 145, 170, 238, 173, 255, 237, 252 },
-    { 213, 145, 170, 238, 173, 255, 237, 252 },
-    { 214, 146, 171, 239, 174, 255, 237, 253 },
-    { 214, 146, 171, 239, 174, 255, 237, 253 },
-    { 215, 147, 172, 240, 175, 255, 238, 253 },
-    { 215, 147, 172, 240, 175, 255, 238, 253 },
-    { 216, 148, 173, 240, 176, 255, 238, 253 },
-    { 217, 148, 173, 240, 176, 255, 238, 253 },
-    { 218, 149, 173, 241, 177, 255, 239, 253 },
-    { 218, 149, 173, 241, 178, 255, 239, 253 },
-    { 219, 150, 174, 241, 179, 255, 239, 253 },
-    { 219, 151, 174, 241, 179, 255, 239, 253 },
-    { 220, 152, 175, 242, 180, 255, 240, 253 },
-    { 221, 152, 175, 242, 180, 255, 240, 253 },
-    { 222, 153, 176, 242, 181, 255, 240, 253 },
-    { 222, 153, 176, 242, 181, 255, 240, 253 },
-    { 223, 154, 177, 243, 182, 255, 240, 253 },
-    { 223, 154, 177, 243, 182, 255, 240, 253 },
-    { 224, 155, 178, 244, 183, 255, 241, 253 },
-    { 224, 155, 178, 244, 183, 255, 241, 253 },
-    { 225, 156, 178, 244, 184, 255, 241, 253 },
-    { 225, 157, 178, 244, 184, 255, 241, 253 },
-    { 226, 158, 179, 244, 185, 255, 242, 253 },
-    { 227, 158, 179, 244, 185, 255, 242, 253 },
-    { 228, 159, 180, 245, 186, 255, 242, 253 },
-    { 228, 159, 180, 245, 186, 255, 242, 253 },
-    { 229, 160, 181, 245, 187, 255, 242, 253 },
-    { 229, 160, 181, 245, 187, 255, 242, 253 },
-    { 230, 161, 182, 246, 188, 255, 243, 253 },
-    { 230, 162, 182, 246, 188, 255, 243, 253 },
-    { 231, 163, 183, 246, 189, 255, 243, 253 },
-    { 231, 163, 183, 246, 189, 255, 243, 253 },
-    { 232, 164, 184, 247, 190, 255, 243, 253 },
-    { 232, 164, 184, 247, 190, 255, 243, 253 },
-    { 233, 165, 185, 247, 191, 255, 244, 253 },
-    { 233, 165, 185, 247, 191, 255, 244, 253 },
-    { 234, 166, 185, 247, 192, 255, 244, 253 },
-    { 234, 167, 185, 247, 192, 255, 244, 253 },
-    { 235, 168, 186, 248, 193, 255, 244, 253 },
-    { 235, 168, 186, 248, 193, 255, 244, 253 },
-    { 236, 169, 187, 248, 194, 255, 244, 253 },
-    { 236, 169, 187, 248, 194, 255, 244, 253 },
-    { 236, 170, 188, 248, 195, 255, 245, 253 },
-    { 236, 170, 188, 248, 195, 255, 245, 253 },
-    { 237, 171, 189, 249, 196, 255, 245, 254 },
-    { 237, 172, 189, 249, 196, 255, 245, 254 },
-    { 238, 173, 190, 249, 197, 255, 245, 254 },
-    { 238, 173, 190, 249, 197, 255, 245, 254 },
-    { 239, 174, 191, 249, 198, 255, 245, 254 },
-    { 239, 174, 191, 249, 198, 255, 245, 254 },
-    { 240, 175, 192, 249, 199, 255, 246, 254 },
-    { 240, 176, 192, 249, 199, 255, 246, 254 },
-    { 240, 177, 193, 250, 200, 255, 246, 254 },
-    { 240, 177, 193, 250, 200, 255, 246, 254 },
-    { 241, 178, 194, 250, 201, 255, 246, 254 },
-    { 241, 178, 194, 250, 201, 255, 246, 254 },
-    { 242, 179, 195, 250, 202, 255, 246, 254 },
-    { 242, 180, 195, 250, 202, 255, 246, 254 },
-    { 242, 181, 196, 250, 203, 255, 247, 254 },
-    { 242, 181, 196, 250, 203, 255, 247, 254 },
-    { 243, 182, 197, 251, 204, 255, 247, 254 },
-    { 243, 183, 197, 251, 204, 255, 247, 254 },
-    { 244, 184, 198, 251, 205, 255, 247, 254 },
-    { 244, 184, 198, 251, 205, 255, 247, 254 },
-    { 244, 185, 199, 251, 206, 255, 247, 254 },
-    { 244, 185, 199, 251, 206, 255, 247, 254 },
-    { 245, 186, 200, 251, 207, 255, 247, 254 },
-    { 245, 187, 200, 251, 207, 255, 247, 254 },
-    { 246, 188, 201, 252, 207, 255, 248, 254 },
-    { 246, 188, 201, 252, 207, 255, 248, 254 },
-    { 246, 189, 202, 252, 208, 255, 248, 254 },
-    { 246, 190, 202, 252, 208, 255, 248, 254 },
-    { 247, 191, 203, 252, 209, 255, 248, 254 },
-    { 247, 191, 203, 252, 209, 255, 248, 254 },
-    { 247, 192, 204, 252, 210, 255, 248, 254 },
-    { 247, 193, 204, 252, 210, 255, 248, 254 },
-    { 248, 194, 205, 252, 211, 255, 248, 254 },
-    { 248, 194, 205, 252, 211, 255, 248, 254 },
-    { 248, 195, 206, 252, 212, 255, 249, 254 },
-    { 248, 196, 206, 252, 212, 255, 249, 254 },
-    { 249, 197, 207, 253, 213, 255, 249, 254 },
-    { 249, 197, 207, 253, 213, 255, 249, 254 },
-    { 249, 198, 208, 253, 214, 255, 249, 254 },
-    { 249, 199, 209, 253, 214, 255, 249, 254 },
-    { 250, 200, 210, 253, 215, 255, 249, 254 },
-    { 250, 200, 210, 253, 215, 255, 249, 254 },
-    { 250, 201, 211, 253, 215, 255, 249, 254 },
-    { 250, 202, 211, 253, 215, 255, 249, 254 },
-    { 250, 203, 212, 253, 216, 255, 249, 254 },
-    { 250, 203, 212, 253, 216, 255, 249, 254 },
-    { 251, 204, 213, 253, 217, 255, 250, 254 },
-    { 251, 205, 213, 253, 217, 255, 250, 254 },
-    { 251, 206, 214, 254, 218, 255, 250, 254 },
-    { 251, 206, 215, 254, 218, 255, 250, 254 },
-    { 252, 207, 216, 254, 219, 255, 250, 254 },
-    { 252, 208, 216, 254, 219, 255, 250, 254 },
-    { 252, 209, 217, 254, 220, 255, 250, 254 },
-    { 252, 210, 217, 254, 220, 255, 250, 254 },
-    { 252, 211, 218, 254, 221, 255, 250, 254 },
-    { 252, 212, 218, 254, 221, 255, 250, 254 },
-    { 253, 213, 219, 254, 222, 255, 250, 254 },
-    { 253, 213, 220, 254, 222, 255, 250, 254 },
-    { 253, 214, 221, 254, 223, 255, 250, 254 },
-    { 253, 215, 221, 254, 223, 255, 250, 254 },
-    { 253, 216, 222, 254, 224, 255, 251, 254 },
-    { 253, 217, 223, 254, 224, 255, 251, 254 },
-    { 253, 218, 224, 254, 225, 255, 251, 254 },
-    { 253, 219, 224, 254, 225, 255, 251, 254 },
-    { 254, 220, 225, 254, 225, 255, 251, 254 },
-    { 254, 221, 226, 254, 225, 255, 251, 254 },
-    { 254, 222, 227, 255, 226, 255, 251, 254 },
-    { 254, 223, 227, 255, 226, 255, 251, 254 },
-    { 254, 224, 228, 255, 227, 255, 251, 254 },
-    { 254, 225, 229, 255, 227, 255, 251, 254 },
-    { 254, 226, 230, 255, 228, 255, 251, 254 },
-    { 254, 227, 230, 255, 229, 255, 251, 254 },
-    { 255, 228, 231, 255, 230, 255, 251, 254 },
-    { 255, 229, 232, 255, 230, 255, 251, 254 },
-    { 255, 230, 233, 255, 231, 255, 252, 254 },
-    { 255, 231, 234, 255, 231, 255, 252, 254 },
-    { 255, 232, 235, 255, 232, 255, 252, 254 },
-    { 255, 233, 236, 255, 232, 255, 252, 254 },
-    { 255, 235, 237, 255, 233, 255, 252, 254 },
-    { 255, 236, 238, 255, 234, 255, 252, 254 },
-    { 255, 238, 240, 255, 235, 255, 252, 255 },
-    { 255, 239, 241, 255, 235, 255, 252, 254 },
-    { 255, 241, 243, 255, 236, 255, 252, 254 },
-    { 255, 243, 245, 255, 237, 255, 252, 254 },
-    { 255, 246, 247, 255, 239, 255, 253, 255 },
-};
-
-const ProbContext ff_vp9_default_probs = {
-    { /* y_mode */
-        {  65,  32,  18, 144, 162, 194,  41,  51,  98 } /* bsize < 8x8 */,
-        { 132,  68,  18, 165, 217, 196,  45,  40,  78 } /* bsize < 16x16 */,
-        { 173,  80,  19, 176, 240, 193,  64,  35,  46 } /* bsize < 32x32 */,
-        { 221, 135,  38, 194, 248, 121,  96,  85,  29 } /* bsize >= 32x32 */
-    }, { /* uv_mode */
-        {  48,  12, 154, 155, 139,  90,  34, 117, 119 } /* y = v */,
-        {  67,   6,  25, 204, 243, 158,  13,  21,  96 } /* y = h */,
-        { 120,   7,  76, 176, 208, 126,  28,  54, 103 } /* y = dc */,
-        {  97,   5,  44, 131, 176, 139,  48,  68,  97 } /* y = d45 */,
-        {  83,   5,  42, 156, 111, 152,  26,  49, 152 } /* y = d135 */,
-        {  80,   5,  58, 178,  74,  83,  33,  62, 145 } /* y = d117 */,
-        {  86,   5,  32, 154, 192, 168,  14,  22, 163 } /* y = d153 */,
-        {  77,   7,  64, 116, 132, 122,  37, 126, 120 } /* y = d63 */,
-        {  85,   5,  32, 156, 216, 148,  19,  29,  73 } /* y = d27 */,
-        { 101,  21, 107, 181, 192, 103,  19,  67, 125 } /* y = tm */
-    }, { /* filter */
-        { 235, 162, },
-        {  36, 255, },
-        {  34,   3, },
-        { 149, 144, },
-    }, { /* mv_mode */
-        {  2, 173,  34 },  // 0 = both zero mv
-        {  7, 145,  85 },  // 1 = one zero mv + one a predicted mv
-        {  7, 166,  63 },  // 2 = two predicted mvs
-        {  7,  94,  66 },  // 3 = one predicted/zero and one new mv
-        {  8,  64,  46 },  // 4 = two new mvs
-        { 17,  81,  31 },  // 5 = one intra neighbor + x
-        { 25,  29,  30 },  // 6 = two intra neighbors
-    }, { /* intra */
-        9, 102, 187, 225
-    }, { /* comp */
-        239, 183, 119,  96,  41
-    }, { /* single_ref */
-        {  33,  16 },
-        {  77,  74 },
-        { 142, 142 },
-        { 172, 170 },
-        { 238, 247 }
-    }, { /* comp_ref */
-        50, 126, 123, 221, 226
-    }, { /* tx32p */
-        { 3, 136, 37, },
-        { 5,  52, 13, },
-    }, { /* tx16p */
-        { 20, 152, },
-        { 15, 101, },
-    }, { /* tx8p */
-        100, 66
-    }, { /* skip */
-        192, 128, 64
-    }, { /* mv_joint */
-        32, 64, 96
-    }, {
-        { /* mv vertical component */
-            128, /* sign */
-            { 224, 144, 192, 168, 192, 176, 192, 198, 198, 245 }, /* class */
-            216, /* class0 */
-            { 136, 140, 148, 160, 176, 192, 224, 234, 234, 240 }, /* bits */
-            { /* class0_fp */
-                { 128, 128, 64 },
-                {  96, 112, 64 }
-            },
-            { 64, 96, 64 }, /* fp */
-            160, /* class0_hp bit */
-            128, /* hp */
-        }, { /* mv horizontal component */
-            128, /* sign */
-            { 216, 128, 176, 160, 176, 176, 192, 198, 198, 208 }, /* class */
-            208, /* class0 */
-            { 136, 140, 148, 160, 176, 192, 224, 234, 234, 240 }, /* bits */
-            { /* class0_fp */
-                { 128, 128, 64 },
-                {  96, 112, 64 }
-            },
-            { 64, 96, 64 }, /* fp */
-            160, /* class0_hp bit */
-            128, /* hp */
-        }
-    }, { /* partition */
-        { /* 64x64 -> 32x32 */
-            { 222,  34,  30 } /* a/l both not split */,
-            {  72,  16,  44 } /* a split, l not split */,
-            {  58,  32,  12 } /* l split, a not split */,
-            {  10,   7,   6 } /* a/l both split */,
-        }, { /* 32x32 -> 16x16 */
-            { 177,  58,  59 } /* a/l both not split */,
-            {  68,  26,  63 } /* a split, l not split */,
-            {  52,  79,  25 } /* l split, a not split */,
-            {  17,  14,  12 } /* a/l both split */,
-        }, { /* 16x16 -> 8x8 */
-            { 174,  73,  87 } /* a/l both not split */,
-            {  92,  41,  83 } /* a split, l not split */,
-            {  82,  99,  50 } /* l split, a not split */,
-            {  53,  39,  39 } /* a/l both split */,
-        }, { /* 8x8 -> 4x4 */
-            { 199, 122, 141 } /* a/l both not split */,
-            { 147,  63, 159 } /* a split, l not split */,
-            { 148, 133, 118 } /* l split, a not split */,
-            { 121, 104, 114 } /* a/l both split */,
-        }
-    },
-};
-
-const uint8_t ff_vp9_default_coef_probs[4][2][2][6][6][3] = {
-    { /* tx = 4x4 */
-        { /* block Type 0 */
-            { /* Intra */
-                { /* Coeff Band 0 */
-                    { 195,  29, 183 },
-                    {  84,  49, 136 },
-                    {   8,  42,  71 }
-                }, { /* Coeff Band 1 */
-                    {  31, 107, 169 },
-                    {  35,  99, 159 },
-                    {  17,  82, 140 },
-                    {   8,  66, 114 },
-                    {   2,  44,  76 },
-                    {   1,  19,  32 }
-                }, { /* Coeff Band 2 */
-                    {  40, 132, 201 },
-                    {  29, 114, 187 },
-                    {  13,  91, 157 },
-                    {   7,  75, 127 },
-                    {   3,  58,  95 },
-                    {   1,  28,  47 }
-                }, { /* Coeff Band 3 */
-                    {  69, 142, 221 },
-                    {  42, 122, 201 },
-                    {  15,  91, 159 },
-                    {   6,  67, 121 },
-                    {   1,  42,  77 },
-                    {   1,  17,  31 }
-                }, { /* Coeff Band 4 */
-                    { 102, 148, 228 },
-                    {  67, 117, 204 },
-                    {  17,  82, 154 },
-                    {   6,  59, 114 },
-                    {   2,  39,  75 },
-                    {   1,  15,  29 }
-                }, { /* Coeff Band 5 */
-                    { 156,  57, 233 },
-                    { 119,  57, 212 },
-                    {  58,  48, 163 },
-                    {  29,  40, 124 },
-                    {  12,  30,  81 },
-                    {   3,  12,  31 }
-                }
-            }, { /* Inter */
-                { /* Coeff Band 0 */
-                    { 191, 107, 226 },
-                    { 124, 117, 204 },
-                    {  25,  99, 155 }
-                }, { /* Coeff Band 1 */
-                    {  29, 148, 210 },
-                    {  37, 126, 194 },
-                    {   8,  93, 157 },
-                    {   2,  68, 118 },
-                    {   1,  39,  69 },
-                    {   1,  17,  33 }
-                }, { /* Coeff Band 2 */
-                    {  41, 151, 213 },
-                    {  27, 123, 193 },
-                    {   3,  82, 144 },
-                    {   1,  58, 105 },
-                    {   1,  32,  60 },
-                    {   1,  13,  26 }
-                }, { /* Coeff Band 3 */
-                    {  59, 159, 220 },
-                    {  23, 126, 198 },
-                    {   4,  88, 151 },
-                    {   1,  66, 114 },
-                    {   1,  38,  71 },
-                    {   1,  18,  34 }
-                }, { /* Coeff Band 4 */
-                    { 114, 136, 232 },
-                    {  51, 114, 207 },
-                    {  11,  83, 155 },
-                    {   3,  56, 105 },
-                    {   1,  33,  65 },
-                    {   1,  17,  34 }
-                }, { /* Coeff Band 5 */
-                    { 149,  65, 234 },
-                    { 121,  57, 215 },
-                    {  61,  49, 166 },
-                    {  28,  36, 114 },
-                    {  12,  25,  76 },
-                    {   3,  16,  42 }
-                }
-            }
-        }, { /* block Type 1 */
-            { /* Intra */
-                { /* Coeff Band 0 */
-                    { 214,  49, 220 },
-                    { 132,  63, 188 },
-                    {  42,  65, 137 }
-                }, { /* Coeff Band 1 */
-                    {  85, 137, 221 },
-                    { 104, 131, 216 },
-                    {  49, 111, 192 },
-                    {  21,  87, 155 },
-                    {   2,  49,  87 },
-                    {   1,  16,  28 }
-                }, { /* Coeff Band 2 */
-                    {  89, 163, 230 },
-                    {  90, 137, 220 },
-                    {  29, 100, 183 },
-                    {  10,  70, 135 },
-                    {   2,  42,  81 },
-                    {   1,  17,  33 }
-                }, { /* Coeff Band 3 */
-                    { 108, 167, 237 },
-                    {  55, 133, 222 },
-                    {  15,  97, 179 },
-                    {   4,  72, 135 },
-                    {   1,  45,  85 },
-                    {   1,  19,  38 }
-                }, { /* Coeff Band 4 */
-                    { 124, 146, 240 },
-                    {  66, 124, 224 },
-                    {  17,  88, 175 },
-                    {   4,  58, 122 },
-                    {   1,  36,  75 },
-                    {   1,  18,  37 }
-                }, { /* Coeff Band 5 */
-                    { 141,  79, 241 },
-                    { 126,  70, 227 },
-                    {  66,  58, 182 },
-                    {  30,  44, 136 },
-                    {  12,  34,  96 },
-                    {   2,  20,  47 }
-                }
-            }, { /* Inter */
-                { /* Coeff Band 0 */
-                    { 229,  99, 249 },
-                    { 143, 111, 235 },
-                    {  46, 109, 192 }
-                }, { /* Coeff Band 1 */
-                    {  82, 158, 236 },
-                    {  94, 146, 224 },
-                    {  25, 117, 191 },
-                    {   9,  87, 149 },
-                    {   3,  56,  99 },
-                    {   1,  33,  57 }
-                }, { /* Coeff Band 2 */
-                    {  83, 167, 237 },
-                    {  68, 145, 222 },
-                    {  10, 103, 177 },
-                    {   2,  72, 131 },
-                    {   1,  41,  79 },
-                    {   1,  20,  39 }
-                }, { /* Coeff Band 3 */
-                    {  99, 167, 239 },
-                    {  47, 141, 224 },
-                    {  10, 104, 178 },
-                    {   2,  73, 133 },
-                    {   1,  44,  85 },
-                    {   1,  22,  47 }
-                }, { /* Coeff Band 4 */
-                    { 127, 145, 243 },
-                    {  71, 129, 228 },
-                    {  17,  93, 177 },
-                    {   3,  61, 124 },
-                    {   1,  41,  84 },
-                    {   1,  21,  52 }
-                }, { /* Coeff Band 5 */
-                    { 157,  78, 244 },
-                    { 140,  72, 231 },
-                    {  69,  58, 184 },
-                    {  31,  44, 137 },
-                    {  14,  38, 105 },
-                    {   8,  23,  61 }
-                }
-            }
-        }
-    }, { /* tx = 8x8 */
-        { /* block Type 0 */
-            { /* Intra */
-                { /* Coeff Band 0 */
-                    { 125,  34, 187 },
-                    {  52,  41, 133 },
-                    {   6,  31,  56 }
-                }, { /* Coeff Band 1 */
-                    {  37, 109, 153 },
-                    {  51, 102, 147 },
-                    {  23,  87, 128 },
-                    {   8,  67, 101 },
-                    {   1,  41,  63 },
-                    {   1,  19,  29 }
-                }, { /* Coeff Band 2 */
-                    {  31, 154, 185 },
-                    {  17, 127, 175 },
-                    {   6,  96, 145 },
-                    {   2,  73, 114 },
-                    {   1,  51,  82 },
-                    {   1,  28,  45 }
-                }, { /* Coeff Band 3 */
-                    {  23, 163, 200 },
-                    {  10, 131, 185 },
-                    {   2,  93, 148 },
-                    {   1,  67, 111 },
-                    {   1,  41,  69 },
-                    {   1,  14,  24 }
-                }, { /* Coeff Band 4 */
-                    {  29, 176, 217 },
-                    {  12, 145, 201 },
-                    {   3, 101, 156 },
-                    {   1,  69, 111 },
-                    {   1,  39,  63 },
-                    {   1,  14,  23 }
-                }, { /* Coeff Band 5 */
-                    {  57, 192, 233 },
-                    {  25, 154, 215 },
-                    {   6, 109, 167 },
-                    {   3,  78, 118 },
-                    {   1,  48,  69 },
-                    {   1,  21,  29 }
-                }
-            }, { /* Inter */
-                { /* Coeff Band 0 */
-                    { 202, 105, 245 },
-                    { 108, 106, 216 },
-                    {  18,  90, 144 }
-                }, { /* Coeff Band 1 */
-                    {  33, 172, 219 },
-                    {  64, 149, 206 },
-                    {  14, 117, 177 },
-                    {   5,  90, 141 },
-                    {   2,  61,  95 },
-                    {   1,  37,  57 }
-                }, { /* Coeff Band 2 */
-                    {  33, 179, 220 },
-                    {  11, 140, 198 },
-                    {   1,  89, 148 },
-                    {   1,  60, 104 },
-                    {   1,  33,  57 },
-                    {   1,  12,  21 }
-                }, { /* Coeff Band 3 */
-                    {  30, 181, 221 },
-                    {   8, 141, 198 },
-                    {   1,  87, 145 },
-                    {   1,  58, 100 },
-                    {   1,  31,  55 },
-                    {   1,  12,  20 }
-                }, { /* Coeff Band 4 */
-                    {  32, 186, 224 },
-                    {   7, 142, 198 },
-                    {   1,  86, 143 },
-                    {   1,  58, 100 },
-                    {   1,  31,  55 },
-                    {   1,  12,  22 }
-                }, { /* Coeff Band 5 */
-                    {  57, 192, 227 },
-                    {  20, 143, 204 },
-                    {   3,  96, 154 },
-                    {   1,  68, 112 },
-                    {   1,  42,  69 },
-                    {   1,  19,  32 }
-                }
-            }
-        }, { /* block Type 1 */
-            { /* Intra */
-                { /* Coeff Band 0 */
-                    { 212,  35, 215 },
-                    { 113,  47, 169 },
-                    {  29,  48, 105 }
-                }, { /* Coeff Band 1 */
-                    {  74, 129, 203 },
-                    { 106, 120, 203 },
-                    {  49, 107, 178 },
-                    {  19,  84, 144 },
-                    {   4,  50,  84 },
-                    {   1,  15,  25 }
-                }, { /* Coeff Band 2 */
-                    {  71, 172, 217 },
-                    {  44, 141, 209 },
-                    {  15, 102, 173 },
-                    {   6,  76, 133 },
-                    {   2,  51,  89 },
-                    {   1,  24,  42 }
-                }, { /* Coeff Band 3 */
-                    {  64, 185, 231 },
-                    {  31, 148, 216 },
-                    {   8, 103, 175 },
-                    {   3,  74, 131 },
-                    {   1,  46,  81 },
-                    {   1,  18,  30 }
-                }, { /* Coeff Band 4 */
-                    {  65, 196, 235 },
-                    {  25, 157, 221 },
-                    {   5, 105, 174 },
-                    {   1,  67, 120 },
-                    {   1,  38,  69 },
-                    {   1,  15,  30 }
-                }, { /* Coeff Band 5 */
-                    {  65, 204, 238 },
-                    {  30, 156, 224 },
-                    {   7, 107, 177 },
-                    {   2,  70, 124 },
-                    {   1,  42,  73 },
-                    {   1,  18,  34 }
-                }
-            }, { /* Inter */
-                { /* Coeff Band 0 */
-                    { 225,  86, 251 },
-                    { 144, 104, 235 },
-                    {  42,  99, 181 }
-                }, { /* Coeff Band 1 */
-                    {  85, 175, 239 },
-                    { 112, 165, 229 },
-                    {  29, 136, 200 },
-                    {  12, 103, 162 },
-                    {   6,  77, 123 },
-                    {   2,  53,  84 }
-                }, { /* Coeff Band 2 */
-                    {  75, 183, 239 },
-                    {  30, 155, 221 },
-                    {   3, 106, 171 },
-                    {   1,  74, 128 },
-                    {   1,  44,  76 },
-                    {   1,  17,  28 }
-                }, { /* Coeff Band 3 */
-                    {  73, 185, 240 },
-                    {  27, 159, 222 },
-                    {   2, 107, 172 },
-                    {   1,  75, 127 },
-                    {   1,  42,  73 },
-                    {   1,  17,  29 }
-                }, { /* Coeff Band 4 */
-                    {  62, 190, 238 },
-                    {  21, 159, 222 },
-                    {   2, 107, 172 },
-                    {   1,  72, 122 },
-                    {   1,  40,  71 },
-                    {   1,  18,  32 }
-                }, { /* Coeff Band 5 */
-                    {  61, 199, 240 },
-                    {  27, 161, 226 },
-                    {   4, 113, 180 },
-                    {   1,  76, 129 },
-                    {   1,  46,  80 },
-                    {   1,  23,  41 }
-                }
-            }
-        }
-    }, { /* tx = 16x16 */
-        { /* block Type 0 */
-            { /* Intra */
-                { /* Coeff Band 0 */
-                    {   7,  27, 153 },
-                    {   5,  30,  95 },
-                    {   1,  16,  30 }
-                }, { /* Coeff Band 1 */
-                    {  50,  75, 127 },
-                    {  57,  75, 124 },
-                    {  27,  67, 108 },
-                    {  10,  54,  86 },
-                    {   1,  33,  52 },
-                    {   1,  12,  18 }
-                }, { /* Coeff Band 2 */
-                    {  43, 125, 151 },
-                    {  26, 108, 148 },
-                    {   7,  83, 122 },
-                    {   2,  59,  89 },
-                    {   1,  38,  60 },
-                    {   1,  17,  27 }
-                }, { /* Coeff Band 3 */
-                    {  23, 144, 163 },
-                    {  13, 112, 154 },
-                    {   2,  75, 117 },
-                    {   1,  50,  81 },
-                    {   1,  31,  51 },
-                    {   1,  14,  23 }
-                }, { /* Coeff Band 4 */
-                    {  18, 162, 185 },
-                    {   6, 123, 171 },
-                    {   1,  78, 125 },
-                    {   1,  51,  86 },
-                    {   1,  31,  54 },
-                    {   1,  14,  23 }
-                }, { /* Coeff Band 5 */
-                    {  15, 199, 227 },
-                    {   3, 150, 204 },
-                    {   1,  91, 146 },
-                    {   1,  55,  95 },
-                    {   1,  30,  53 },
-                    {   1,  11,  20 }
-                }
-            }, { /* Inter */
-                { /* Coeff Band 0 */
-                    {  19,  55, 240 },
-                    {  19,  59, 196 },
-                    {   3,  52, 105 }
-                }, { /* Coeff Band 1 */
-                    {  41, 166, 207 },
-                    { 104, 153, 199 },
-                    {  31, 123, 181 },
-                    {  14, 101, 152 },
-                    {   5,  72, 106 },
-                    {   1,  36,  52 }
-                }, { /* Coeff Band 2 */
-                    {  35, 176, 211 },
-                    {  12, 131, 190 },
-                    {   2,  88, 144 },
-                    {   1,  60, 101 },
-                    {   1,  36,  60 },
-                    {   1,  16,  28 }
-                }, { /* Coeff Band 3 */
-                    {  28, 183, 213 },
-                    {   8, 134, 191 },
-                    {   1,  86, 142 },
-                    {   1,  56,  96 },
-                    {   1,  30,  53 },
-                    {   1,  12,  20 }
-                }, { /* Coeff Band 4 */
-                    {  20, 190, 215 },
-                    {   4, 135, 192 },
-                    {   1,  84, 139 },
-                    {   1,  53,  91 },
-                    {   1,  28,  49 },
-                    {   1,  11,  20 }
-                }, { /* Coeff Band 5 */
-                    {  13, 196, 216 },
-                    {   2, 137, 192 },
-                    {   1,  86, 143 },
-                    {   1,  57,  99 },
-                    {   1,  32,  56 },
-                    {   1,  13,  24 }
-                }
-            }
-        }, { /* block Type 1 */
-            { /* Intra */
-                { /* Coeff Band 0 */
-                    { 211,  29, 217 },
-                    {  96,  47, 156 },
-                    {  22,  43,  87 }
-                }, { /* Coeff Band 1 */
-                    {  78, 120, 193 },
-                    { 111, 116, 186 },
-                    {  46, 102, 164 },
-                    {  15,  80, 128 },
-                    {   2,  49,  76 },
-                    {   1,  18,  28 }
-                }, { /* Coeff Band 2 */
-                    {  71, 161, 203 },
-                    {  42, 132, 192 },
-                    {  10,  98, 150 },
-                    {   3,  69, 109 },
-                    {   1,  44,  70 },
-                    {   1,  18,  29 }
-                }, { /* Coeff Band 3 */
-                    {  57, 186, 211 },
-                    {  30, 140, 196 },
-                    {   4,  93, 146 },
-                    {   1,  62, 102 },
-                    {   1,  38,  65 },
-                    {   1,  16,  27 }
-                }, { /* Coeff Band 4 */
-                    {  47, 199, 217 },
-                    {  14, 145, 196 },
-                    {   1,  88, 142 },
-                    {   1,  57,  98 },
-                    {   1,  36,  62 },
-                    {   1,  15,  26 }
-                }, { /* Coeff Band 5 */
-                    {  26, 219, 229 },
-                    {   5, 155, 207 },
-                    {   1,  94, 151 },
-                    {   1,  60, 104 },
-                    {   1,  36,  62 },
-                    {   1,  16,  28 }
-                }
-            }, { /* Inter */
-                { /* Coeff Band 0 */
-                    { 233,  29, 248 },
-                    { 146,  47, 220 },
-                    {  43,  52, 140 }
-                }, { /* Coeff Band 1 */
-                    { 100, 163, 232 },
-                    { 179, 161, 222 },
-                    {  63, 142, 204 },
-                    {  37, 113, 174 },
-                    {  26,  89, 137 },
-                    {  18,  68,  97 }
-                }, { /* Coeff Band 2 */
-                    {  85, 181, 230 },
-                    {  32, 146, 209 },
-                    {   7, 100, 164 },
-                    {   3,  71, 121 },
-                    {   1,  45,  77 },
-                    {   1,  18,  30 }
-                }, { /* Coeff Band 3 */
-                    {  65, 187, 230 },
-                    {  20, 148, 207 },
-                    {   2,  97, 159 },
-                    {   1,  68, 116 },
-                    {   1,  40,  70 },
-                    {   1,  14,  29 }
-                }, { /* Coeff Band 4 */
-                    {  40, 194, 227 },
-                    {   8, 147, 204 },
-                    {   1,  94, 155 },
-                    {   1,  65, 112 },
-                    {   1,  39,  66 },
-                    {   1,  14,  26 }
-                }, { /* Coeff Band 5 */
-                    {  16, 208, 228 },
-                    {   3, 151, 207 },
-                    {   1,  98, 160 },
-                    {   1,  67, 117 },
-                    {   1,  41,  74 },
-                    {   1,  17,  31 }
-                }
-            }
-        }
-    }, { /* tx = 32x32 */
-        { /* block Type 0 */
-            { /* Intra */
-                { /* Coeff Band 0 */
-                    {  17,  38, 140 },
-                    {   7,  34,  80 },
-                    {   1,  17,  29 }
-                }, { /* Coeff Band 1 */
-                    {  37,  75, 128 },
-                    {  41,  76, 128 },
-                    {  26,  66, 116 },
-                    {  12,  52,  94 },
-                    {   2,  32,  55 },
-                    {   1,  10,  16 }
-                }, { /* Coeff Band 2 */
-                    {  50, 127, 154 },
-                    {  37, 109, 152 },
-                    {  16,  82, 121 },
-                    {   5,  59,  85 },
-                    {   1,  35,  54 },
-                    {   1,  13,  20 }
-                }, { /* Coeff Band 3 */
-                    {  40, 142, 167 },
-                    {  17, 110, 157 },
-                    {   2,  71, 112 },
-                    {   1,  44,  72 },
-                    {   1,  27,  45 },
-                    {   1,  11,  17 }
-                }, { /* Coeff Band 4 */
-                    {  30, 175, 188 },
-                    {   9, 124, 169 },
-                    {   1,  74, 116 },
-                    {   1,  48,  78 },
-                    {   1,  30,  49 },
-                    {   1,  11,  18 }
-                }, { /* Coeff Band 5 */
-                    {  10, 222, 223 },
-                    {   2, 150, 194 },
-                    {   1,  83, 128 },
-                    {   1,  48,  79 },
-                    {   1,  27,  45 },
-                    {   1,  11,  17 }
-                }
-            }, { /* Inter */
-                { /* Coeff Band 0 */
-                    {  36,  41, 235 },
-                    {  29,  36, 193 },
-                    {  10,  27, 111 }
-                }, { /* Coeff Band 1 */
-                    {  85, 165, 222 },
-                    { 177, 162, 215 },
-                    { 110, 135, 195 },
-                    {  57, 113, 168 },
-                    {  23,  83, 120 },
-                    {  10,  49,  61 }
-                }, { /* Coeff Band 2 */
-                    {  85, 190, 223 },
-                    {  36, 139, 200 },
-                    {   5,  90, 146 },
-                    {   1,  60, 103 },
-                    {   1,  38,  65 },
-                    {   1,  18,  30 }
-                }, { /* Coeff Band 3 */
-                    {  72, 202, 223 },
-                    {  23, 141, 199 },
-                    {   2,  86, 140 },
-                    {   1,  56,  97 },
-                    {   1,  36,  61 },
-                    {   1,  16,  27 }
-                }, { /* Coeff Band 4 */
-                    {  55, 218, 225 },
-                    {  13, 145, 200 },
-                    {   1,  86, 141 },
-                    {   1,  57,  99 },
-                    {   1,  35,  61 },
-                    {   1,  13,  22 }
-                }, { /* Coeff Band 5 */
-                    {  15, 235, 212 },
-                    {   1, 132, 184 },
-                    {   1,  84, 139 },
-                    {   1,  57,  97 },
-                    {   1,  34,  56 },
-                    {   1,  14,  23 }
-                }
-            }
-        }, { /* block Type 1 */
-            { /* Intra */
-                { /* Coeff Band 0 */
-                    { 181,  21, 201 },
-                    {  61,  37, 123 },
-                    {  10,  38,  71 }
-                }, { /* Coeff Band 1 */
-                    {  47, 106, 172 },
-                    {  95, 104, 173 },
-                    {  42,  93, 159 },
-                    {  18,  77, 131 },
-                    {   4,  50,  81 },
-                    {   1,  17,  23 }
-                }, { /* Coeff Band 2 */
-                    {  62, 147, 199 },
-                    {  44, 130, 189 },
-                    {  28, 102, 154 },
-                    {  18,  75, 115 },
-                    {   2,  44,  65 },
-                    {   1,  12,  19 }
-                }, { /* Coeff Band 3 */
-                    {  55, 153, 210 },
-                    {  24, 130, 194 },
-                    {   3,  93, 146 },
-                    {   1,  61,  97 },
-                    {   1,  31,  50 },
-                    {   1,  10,  16 }
-                }, { /* Coeff Band 4 */
-                    {  49, 186, 223 },
-                    {  17, 148, 204 },
-                    {   1,  96, 142 },
-                    {   1,  53,  83 },
-                    {   1,  26,  44 },
-                    {   1,  11,  17 }
-                }, { /* Coeff Band 5 */
-                    {  13, 217, 212 },
-                    {   2, 136, 180 },
-                    {   1,  78, 124 },
-                    {   1,  50,  83 },
-                    {   1,  29,  49 },
-                    {   1,  14,  23 }
-                }
-            }, { /* Inter */
-                { /* Coeff Band 0 */
-                    { 197,  13, 247 },
-                    {  82,  17, 222 },
-                    {  25,  17, 162 }
-                }, { /* Coeff Band 1 */
-                    { 126, 186, 247 },
-                    { 234, 191, 243 },
-                    { 176, 177, 234 },
-                    { 104, 158, 220 },
-                    {  66, 128, 186 },
-                    {  55,  90, 137 }
-                }, { /* Coeff Band 2 */
-                    { 111, 197, 242 },
-                    {  46, 158, 219 },
-                    {   9, 104, 171 },
-                    {   2,  65, 125 },
-                    {   1,  44,  80 },
-                    {   1,  17,  91 }
-                }, { /* Coeff Band 3 */
-                    { 104, 208, 245 },
-                    {  39, 168, 224 },
-                    {   3, 109, 162 },
-                    {   1,  79, 124 },
-                    {   1,  50, 102 },
-                    {   1,  43, 102 }
-                }, { /* Coeff Band 4 */
-                    {  84, 220, 246 },
-                    {  31, 177, 231 },
-                    {   2, 115, 180 },
-                    {   1,  79, 134 },
-                    {   1,  55,  77 },
-                    {   1,  60,  79 }
-                }, { /* Coeff Band 5 */
-                    {  43, 243, 240 },
-                    {   8, 180, 217 },
-                    {   1, 115, 166 },
-                    {   1,  84, 121 },
-                    {   1,  51,  67 },
-                    {   1,  16,   6 }
-                }
-            }
-        }
-    }
-};
-
-const int8_t ff_vp9_mv_joint_tree[3][2] = {
-    { -MV_JOINT_ZERO,            1 }, // '0'
-    {    -MV_JOINT_H,            2 }, // '10'
-    {    -MV_JOINT_V, -MV_JOINT_HV }, // '11x'
-};
-
-const int8_t ff_vp9_mv_class_tree[10][2] = {
-    { -0,   1 }, // '0'
-    { -1,   2 }, // '10'
-    {  3,   4 },
-    { -2,  -3 }, // '110x'
-    {  5,   6 },
-    { -4,  -5 }, // '1110x'
-    { -6,   7 }, // '11110'
-    {  8,   9 },
-    { -7,  -8 }, // '111110x'
-    { -9, -10 }, // '111111x'
-};
-
-const int8_t ff_vp9_mv_fp_tree[3][2] = {
-    { -0,  1 },   // '0'
-    { -1,  2 },   // '10'
-    { -2, -3 },   // '11x'
-};
diff --git a/libavcodec/vp9data.h b/libavcodec/vp9data.h
index a52cc0a..cb12e7e 100644
--- a/libavcodec/vp9data.h
+++ b/libavcodec/vp9data.h
@@ -2,20 +2,20 @@
  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
  * Copyright (C) 2013 Clément Bœsch <u pkh me>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,45 +26,2252 @@
 
 #include "vp9.h"
 
-extern const int8_t ff_vp9_partition_tree[3][2];
-extern const uint8_t ff_vp9_default_kf_partition_probs[4][4][3];
-extern const int8_t ff_vp9_segmentation_tree[7][2];
-extern const int8_t ff_vp9_intramode_tree[9][2];
-extern const uint8_t ff_vp9_default_kf_ymode_probs[10][10][9];
-extern const uint8_t ff_vp9_default_kf_uvmode_probs[10][9];
-extern const int8_t ff_vp9_inter_mode_tree[3][2];
-extern const int8_t ff_vp9_filter_tree[2][2];
-extern const enum FilterMode ff_vp9_filter_lut[3];
-extern const int16_t ff_vp9_dc_qlookup[256];
-extern const int16_t ff_vp9_ac_qlookup[256];
-extern const enum TxfmType ff_vp9_intra_txfm_type[14];
-extern const int16_t ff_vp9_default_scan_4x4[16];
-extern const int16_t ff_vp9_col_scan_4x4[16];
-extern const int16_t ff_vp9_row_scan_4x4[16];
-extern const int16_t ff_vp9_default_scan_8x8[64];
-extern const int16_t ff_vp9_col_scan_8x8[64];
-extern const int16_t ff_vp9_row_scan_8x8[64];
-extern const int16_t ff_vp9_default_scan_16x16[256];
-extern const int16_t ff_vp9_col_scan_16x16[256];
-extern const int16_t ff_vp9_row_scan_16x16[256];
-extern const int16_t ff_vp9_default_scan_32x32[1024];
-extern const int16_t *ff_vp9_scans[5][4];
-extern const int16_t ff_vp9_default_scan_4x4_nb[16][2];
-extern const int16_t ff_vp9_col_scan_4x4_nb[16][2];
-extern const int16_t ff_vp9_row_scan_4x4_nb[16][2];
-extern const int16_t ff_vp9_default_scan_8x8_nb[64][2];
-extern const int16_t ff_vp9_col_scan_8x8_nb[64][2];
-extern const int16_t ff_vp9_row_scan_8x8_nb[64][2];
-extern const int16_t ff_vp9_default_scan_16x16_nb[256][2];
-extern const int16_t ff_vp9_col_scan_16x16_nb[256][2];
-extern const int16_t ff_vp9_row_scan_16x16_nb[256][2];
-extern const int16_t ff_vp9_default_scan_32x32_nb[1024][2];
-extern const int16_t (*ff_vp9_scans_nb[5][4])[2];
-extern const uint8_t ff_vp9_model_pareto8[256][8];
-extern const ProbContext ff_vp9_default_probs;
-extern const uint8_t ff_vp9_default_coef_probs[4][2][2][6][6][3];
-extern const int8_t ff_vp9_mv_joint_tree[3][2];
-extern const int8_t ff_vp9_mv_class_tree[10][2];
-extern const int8_t ff_vp9_mv_fp_tree[3][2];
+static const int8_t vp9_partition_tree[3][2] = { // one tree node per row: column 0 is the '0' branch, column 1 the '1' branch; a positive entry is the row of the child node, a negated PARTITION_* entry is a leaf
+    { -PARTITION_NONE, 1 },               // '0'
+     { -PARTITION_H, 2 },                 // '10'
+      { -PARTITION_V, -PARTITION_SPLIT }, // '110', '111'
+};
+
+static const uint8_t vp9_default_kf_partition_probs[4][4][3] = { /* [block-size step][neighbour split context][3 probabilities]; NOTE(review): "a"/"l" in the labels presumably mean the above/left neighbours, and the 3 values presumably pair with the 3 rows of vp9_partition_tree - confirm against the reader */
+    { /* 64x64 -> 32x32 */
+        { 174,  35,  49 } /* a/l both not split */,
+        {  68,  11,  27 } /* a split, l not split */,
+        {  57,  15,   9 } /* l split, a not split */,
+        {  12,   3,   3 } /* a/l both split */
+    }, { /* 32x32 -> 16x16 */
+        { 150,  40,  39 } /* a/l both not split */,
+        {  78,  12,  26 } /* a split, l not split */,
+        {  67,  33,  11 } /* l split, a not split */,
+        {  24,   7,   5 } /* a/l both split */,
+    }, { /* 16x16 -> 8x8 */
+        { 149,  53,  53 } /* a/l both not split */,
+        {  94,  20,  48 } /* a split, l not split */,
+        {  83,  53,  24 } /* l split, a not split */,
+        {  52,  18,  18 } /* a/l both split */,
+    }, { /* 8x8 -> 4x4 */
+        { 158,  97,  94 } /* a/l both not split */,
+        {  93,  24,  99 } /* a split, l not split */,
+        {  85, 119,  44 } /* l split, a not split */,
+        {  62,  59,  67 } /* a/l both split */,
+    },
+};
+
+static const int8_t vp9_segmentation_tree[7][2] = { // balanced 3-level tree, one node per row: column 0 is the '0' branch, column 1 the '1' branch; positive entries are child rows, the leaves are written as negated values 0..7
+    { 1, 2 },     // root: '0' -> row 1, '1' -> row 2
+     { 3, 4 },    // '0' subtree: '00' -> row 3, '01' -> row 4
+     { 5, 6 },    // '1' subtree: '10' -> row 5, '11' -> row 6
+      { -0, -1 }, // '00x'
+      { -2, -3 }, // '01x'
+      { -4, -5 }, // '10x'
+      { -6, -7 }, // '11x'
+};
+
+static const int8_t vp9_intramode_tree[9][2] = { // one tree node per row: column 0 is the '0' branch, column 1 the '1' branch; positive entries are child rows, negated *_PRED entries are leaves
+    { -DC_PRED, 1 },                                  // '0'
+     { -TM_VP8_PRED, 2 },                             // '10'
+      { -VERT_PRED, 3 },                              // '110'
+       { 4, 6 },                                      // internal node at '111': '1110' -> row 4, '1111' -> row 6
+        { -HOR_PRED, 5 },                             // '11100'
+         { -DIAG_DOWN_RIGHT_PRED, -VERT_RIGHT_PRED }, // '11101x'
+        { -DIAG_DOWN_LEFT_PRED, 7 },                  // '11110'
+         { -VERT_LEFT_PRED, 8 },                      // '111110'
+          { -HOR_DOWN_PRED, -HOR_UP_PRED },           // '111111x'
+};
+
+static const uint8_t vp9_default_kf_ymode_probs[10][10][9] = { /* [above mode][left mode][9 probabilities]; both mode axes run v, h, dc, d45, d135, d117, d153, d63, d27, tm per the labels; NOTE(review): the 9 values presumably pair with the 9 rows of vp9_intramode_tree - confirm */
+    { /* above = v */
+        {  43,  46, 168, 134, 107, 128,  69, 142,  92 } /* left = v */,
+        {  44,  29,  68, 159, 201, 177,  50,  57,  77 } /* left = h */,
+        {  63,  36, 126, 146, 123, 158,  60,  90,  96 } /* left = dc */,
+        {  58,  38,  76, 114,  97, 172,  78, 133,  92 } /* left = d45 */,
+        {  46,  41,  76, 140,  63, 184,  69, 112,  57 } /* left = d135 */,
+        {  38,  32,  85, 140,  46, 112,  54, 151, 133 } /* left = d117 */,
+        {  39,  27,  61, 131, 110, 175,  44,  75, 136 } /* left = d153 */,
+        {  47,  35,  80, 100,  74, 143,  64, 163,  74 } /* left = d63 */,
+        {  52,  30,  74, 113, 130, 175,  51,  64,  58 } /* left = d27 */,
+        {  36,  61, 116, 114, 128, 162,  80, 125,  82 } /* left = tm */
+    }, { /* above = h */
+        {  55,  44,  68, 166, 179, 192,  57,  57, 108 } /* left = v */,
+        {  42,  26,  11, 199, 241, 228,  23,  15,  85 } /* left = h */,
+        {  82,  26,  26, 171, 208, 204,  44,  32, 105 } /* left = dc */,
+        {  68,  42,  19, 131, 160, 199,  55,  52,  83 } /* left = d45 */,
+        {  58,  50,  25, 139, 115, 232,  39,  52, 118 } /* left = d135 */,
+        {  50,  35,  33, 153, 104, 162,  64,  59, 131 } /* left = d117 */,
+        {  44,  24,  16, 150, 177, 202,  33,  19, 156 } /* left = d153 */,
+        {  53,  49,  21, 110, 116, 168,  59,  80,  76 } /* left = d63 */,
+        {  55,  27,  12, 153, 203, 218,  26,  27,  49 } /* left = d27 */,
+        {  38,  72,  19, 168, 203, 212,  50,  50, 107 } /* left = tm */
+    }, { /* above = dc */
+        {  92,  45, 102, 136, 116, 180,  74,  90, 100 } /* left = v */,
+        {  73,  32,  19, 187, 222, 215,  46,  34, 100 } /* left = h */,
+        { 137,  30,  42, 148, 151, 207,  70,  52,  91 } /* left = dc */,
+        {  91,  30,  32, 116, 121, 186,  93,  86,  94 } /* left = d45 */,
+        {  72,  35,  36, 149,  68, 206,  68,  63, 105 } /* left = d135 */,
+        {  73,  31,  28, 138,  57, 124,  55, 122, 151 } /* left = d117 */,
+        {  67,  23,  21, 140, 126, 197,  40,  37, 171 } /* left = d153 */,
+        {  74,  32,  27, 107,  86, 160,  63, 134, 102 } /* left = d63 */,
+        {  86,  27,  28, 128, 154, 212,  45,  43,  53 } /* left = d27 */,
+        {  59,  67,  44, 140, 161, 202,  78,  67, 119 } /* left = tm */
+    }, { /* above = d45 */
+        {  59,  38,  83, 112, 103, 162,  98, 136,  90 } /* left = v */,
+        {  62,  30,  23, 158, 200, 207,  59,  57,  50 } /* left = h */,
+        { 103,  26,  36, 129, 132, 201,  83,  80,  93 } /* left = dc */,
+        {  67,  30,  29,  84,  86, 191, 102,  91,  59 } /* left = d45 */,
+        {  60,  32,  33, 112,  71, 220,  64,  89, 104 } /* left = d135 */,
+        {  53,  26,  34, 130,  56, 149,  84, 120, 103 } /* left = d117 */,
+        {  53,  21,  23, 133, 109, 210,  56,  77, 172 } /* left = d153 */,
+        {  61,  29,  29,  93,  97, 165,  83, 175, 162 } /* left = d63 */,
+        {  77,  19,  29, 112, 142, 228,  55,  66,  36 } /* left = d27 */,
+        {  47,  47,  43, 114, 137, 181, 100,  99,  95 } /* left = tm */
+    }, { /* above = d135 */
+        {  53,  40,  55, 139,  69, 183,  61,  80, 110 } /* left = v */,
+        {  40,  29,  19, 161, 180, 207,  43,  24,  91 } /* left = h */,
+        {  69,  23,  29, 128,  83, 199,  46,  44, 101 } /* left = dc */,
+        {  60,  34,  19, 105,  61, 198,  53,  64,  89 } /* left = d45 */,
+        {  52,  31,  22, 158,  40, 209,  58,  62,  89 } /* left = d135 */,
+        {  44,  31,  29, 147,  46, 158,  56, 102, 198 } /* left = d117 */,
+        {  35,  19,  12, 135,  87, 209,  41,  45, 167 } /* left = d153 */,
+        {  51,  38,  25, 113,  58, 164,  70,  93,  97 } /* left = d63 */,
+        {  55,  25,  21, 118,  95, 215,  38,  39,  66 } /* left = d27 */,
+        {  47,  54,  34, 146, 108, 203,  72, 103, 151 } /* left = tm */
+    }, { /* above = d117 */
+        {  46,  27,  80, 150,  55, 124,  55, 121, 135 } /* left = v */,
+        {  36,  23,  27, 165, 149, 166,  54,  64, 118 } /* left = h */,
+        {  64,  19,  37, 156,  66, 138,  49,  95, 133 } /* left = dc */,
+        {  53,  21,  36, 131,  63, 163,  60, 109,  81 } /* left = d45 */,
+        {  40,  26,  35, 154,  40, 185,  51,  97, 123 } /* left = d135 */,
+        {  35,  19,  34, 179,  19,  97,  48, 129, 124 } /* left = d117 */,
+        {  36,  20,  26, 136,  62, 164,  33,  77, 154 } /* left = d153 */,
+        {  45,  26,  28, 129,  45, 129,  49, 147, 123 } /* left = d63 */,
+        {  45,  18,  32, 130,  90, 157,  40,  79,  91 } /* left = d27 */,
+        {  38,  44,  51, 136,  74, 162,  57,  97, 121 } /* left = tm */
+    }, { /* above = d153 */
+        {  56,  39,  58, 133, 117, 173,  48,  53, 187 } /* left = v */,
+        {  35,  21,  12, 161, 212, 207,  20,  23, 145 } /* left = h */,
+        {  75,  17,  22, 136, 138, 185,  32,  34, 166 } /* left = dc */,
+        {  56,  29,  19, 117, 109, 181,  55,  68, 112 } /* left = d45 */,
+        {  47,  29,  17, 153,  64, 220,  59,  51, 114 } /* left = d135 */,
+        {  46,  16,  24, 136,  76, 147,  41,  64, 172 } /* left = d117 */,
+        {  34,  17,  11, 108, 152, 187,  13,  15, 209 } /* left = d153 */,
+        {  55,  30,  18, 122,  79, 179,  44,  88, 116 } /* left = d63 */,
+        {  51,  24,  14, 115, 133, 209,  32,  26, 104 } /* left = d27 */,
+        {  37,  49,  25, 129, 168, 164,  41,  54, 148 } /* left = tm */
+    }, { /* above = d63 */
+        {  48,  34,  86, 101,  92, 146,  78, 179, 134 } /* left = v */,
+        {  47,  22,  24, 138, 187, 178,  68,  69,  59 } /* left = h */,
+        {  78,  23,  39, 111, 117, 170,  74, 124,  94 } /* left = dc */,
+        {  56,  25,  33, 105, 112, 187,  95, 177, 129 } /* left = d45 */,
+        {  48,  31,  27, 114,  63, 183,  82, 116,  56 } /* left = d135 */,
+        {  43,  28,  37, 121,  63, 123,  61, 192, 169 } /* left = d117 */,
+        {  42,  17,  24, 109,  97, 177,  56,  76, 122 } /* left = d153 */,
+        {  46,  23,  32,  74,  86, 150,  67, 183,  88 } /* left = d63 */,
+        {  58,  18,  28, 105, 139, 182,  70,  92,  63 } /* left = d27 */,
+        {  36,  38,  48,  92, 122, 165,  88, 137,  91 } /* left = tm */
+    }, { /* above = d27 */
+        {  62,  44,  61, 123, 105, 189,  48,  57,  64 } /* left = v */,
+        {  47,  25,  17, 175, 222, 220,  24,  30,  86 } /* left = h */,
+        {  82,  22,  32, 127, 143, 213,  39,  41,  70 } /* left = dc */,
+        {  68,  36,  17, 106, 102, 206,  59,  74,  74 } /* left = d45 */,
+        {  57,  39,  23, 151,  68, 216,  55,  63,  58 } /* left = d135 */,
+        {  49,  30,  35, 141,  70, 168,  82,  40, 115 } /* left = d117 */,
+        {  51,  25,  15, 136, 129, 202,  38,  35, 139 } /* left = d153 */,
+        {  59,  39,  19, 114,  75, 180,  77, 104,  42 } /* left = d63 */,
+        {  68,  26,  16, 111, 141, 215,  29,  28,  28 } /* left = d27 */,
+        {  40,  61,  26, 126, 152, 206,  61,  59,  93 } /* left = tm */
+    }, { /* above = tm */
+        {  44,  78, 115, 132, 119, 173,  71, 112,  93 } /* left = v */,
+        {  39,  38,  21, 184, 227, 206,  42,  32,  64 } /* left = h */,
+        {  65,  70,  60, 155, 159, 199,  61,  60,  81 } /* left = dc */,
+        {  58,  47,  36, 124, 137, 193,  80,  82,  78 } /* left = d45 */,
+        {  49,  50,  35, 144,  95, 205,  63,  78,  59 } /* left = d135 */,
+        {  41,  53,  52, 148,  71, 142,  65, 128,  51 } /* left = d117 */,
+        {  40,  36,  28, 143, 143, 202,  40,  55, 137 } /* left = d153 */,
+        {  42,  44,  44, 104, 105, 164,  64, 130,  80 } /* left = d63 */,
+        {  52,  34,  29, 129, 183, 227,  42,  35,  43 } /* left = d27 */,
+        {  43,  81,  53, 140, 169, 204,  68,  84,  72 } /* left = tm */
+    }
+};
+
+static const uint8_t vp9_default_kf_uvmode_probs[10][9] = { /* [y mode][9 probabilities]; the y-mode axis runs v, h, dc, d45, d135, d117, d153, d63, d27, tm per the labels; NOTE(review): the 9 values presumably pair with the 9 rows of vp9_intramode_tree - confirm */
+    { 118,  15, 123, 148, 131, 101,  44,  93, 131 } /* y = v */,
+    { 113,  12,  23, 188, 226, 142,  26,  32, 125 } /* y = h */,
+    { 144,  11,  54, 157, 195, 130,  46,  58, 108 } /* y = dc */,
+    { 120,  11,  50, 123, 163, 135,  64,  77, 103 } /* y = d45 */,
+    { 113,   9,  36, 155, 111, 157,  32,  44, 161 } /* y = d135 */,
+    { 116,   9,  55, 176,  76,  96,  37,  61, 149 } /* y = d117 */,
+    { 115,   9,  28, 141, 161, 167,  21,  25, 193 } /* y = d153 */,
+    { 116,  12,  64, 120, 140, 125,  49, 115, 121 } /* y = d63 */,
+    { 120,  12,  32, 145, 195, 142,  32,  38,  86 } /* y = d27 */,
+    { 102,  19,  66, 162, 182, 122,  35,  59, 128 } /* y = tm */
+};
+
+static const int8_t vp9_inter_mode_tree[3][2] = { // one tree node per row: column 0 is the '0' branch, column 1 the '1' branch; positive entries are child rows, negated *MV entries are leaves
+    { -ZEROMV, 1 },        // '0'
+     { -NEARESTMV, 2 },    // '10'
+      { -NEARMV, -NEWMV }, // '11x'
+};
+
+static const int8_t vp9_filter_tree[2][2] = { // one tree node per row: column 0 is the '0' branch, column 1 the '1' branch; the positive entry is a child row, the leaves are written as negated values 0..2
+    { -0, 1 },   // '0'
+     { -1, -2 }, // '1x'
+};
+
+static const enum FilterMode vp9_filter_lut[3] = { // maps an index 0..2 to a FilterMode; NOTE(review): the index is presumably the leaf value decoded with vp9_filter_tree - confirm against the caller
+    FILTER_8TAP_REGULAR, // index 0
+    FILTER_8TAP_SMOOTH,  // index 1
+    FILTER_8TAP_SHARP,   // index 2
+};
+
+static const int16_t vp9_dc_qlookup[3][256] = { /* three 256-entry lookup tables, each starting at 4; NOTE(review): the first index presumably selects the bit depth (8/10/12) and the second the quantizer index - confirm against the caller */
+    {
+            4,     8,     8,     9,    10,    11,    12,    12,
+           13,    14,    15,    16,    17,    18,    19,    19,
+           20,    21,    22,    23,    24,    25,    26,    26,
+           27,    28,    29,    30,    31,    32,    32,    33,
+           34,    35,    36,    37,    38,    38,    39,    40,
+           41,    42,    43,    43,    44,    45,    46,    47,
+           48,    48,    49,    50,    51,    52,    53,    53,
+           54,    55,    56,    57,    57,    58,    59,    60,
+           61,    62,    62,    63,    64,    65,    66,    66,
+           67,    68,    69,    70,    70,    71,    72,    73,
+           74,    74,    75,    76,    77,    78,    78,    79,
+           80,    81,    81,    82,    83,    84,    85,    85,
+           87,    88,    90,    92,    93,    95,    96,    98,
+           99,   101,   102,   104,   105,   107,   108,   110,
+          111,   113,   114,   116,   117,   118,   120,   121,
+          123,   125,   127,   129,   131,   134,   136,   138,
+          140,   142,   144,   146,   148,   150,   152,   154,
+          156,   158,   161,   164,   166,   169,   172,   174,
+          177,   180,   182,   185,   187,   190,   192,   195,
+          199,   202,   205,   208,   211,   214,   217,   220,
+          223,   226,   230,   233,   237,   240,   243,   247,
+          250,   253,   257,   261,   265,   269,   272,   276,
+          280,   284,   288,   292,   296,   300,   304,   309,
+          313,   317,   322,   326,   330,   335,   340,   344,
+          349,   354,   359,   364,   369,   374,   379,   384,
+          389,   395,   400,   406,   411,   417,   423,   429,
+          435,   441,   447,   454,   461,   467,   475,   482,
+          489,   497,   505,   513,   522,   530,   539,   549,
+          559,   569,   579,   590,   602,   614,   626,   640,
+          654,   668,   684,   700,   717,   736,   755,   775,
+          796,   819,   843,   869,   896,   925,   955,   988,
+         1022,  1058,  1098,  1139,  1184,  1232,  1282,  1336,
+    }, {
+            4,     9,    10,    13,    15,    17,    20,    22,
+           25,    28,    31,    34,    37,    40,    43,    47,
+           50,    53,    57,    60,    64,    68,    71,    75,
+           78,    82,    86,    90,    93,    97,   101,   105,
+          109,   113,   116,   120,   124,   128,   132,   136,
+          140,   143,   147,   151,   155,   159,   163,   166,
+          170,   174,   178,   182,   185,   189,   193,   197,
+          200,   204,   208,   212,   215,   219,   223,   226,
+          230,   233,   237,   241,   244,   248,   251,   255,
+          259,   262,   266,   269,   273,   276,   280,   283,
+          287,   290,   293,   297,   300,   304,   307,   310,
+          314,   317,   321,   324,   327,   331,   334,   337,
+          343,   350,   356,   362,   369,   375,   381,   387,
+          394,   400,   406,   412,   418,   424,   430,   436,
+          442,   448,   454,   460,   466,   472,   478,   484,
+          490,   499,   507,   516,   525,   533,   542,   550,
+          559,   567,   576,   584,   592,   601,   609,   617,
+          625,   634,   644,   655,   666,   676,   687,   698,
+          708,   718,   729,   739,   749,   759,   770,   782,
+          795,   807,   819,   831,   844,   856,   868,   880,
+          891,   906,   920,   933,   947,   961,   975,   988,
+         1001,  1015,  1030,  1045,  1061,  1076,  1090,  1105,
+         1120,  1137,  1153,  1170,  1186,  1202,  1218,  1236,
+         1253,  1271,  1288,  1306,  1323,  1342,  1361,  1379,
+         1398,  1416,  1436,  1456,  1476,  1496,  1516,  1537,
+         1559,  1580,  1601,  1624,  1647,  1670,  1692,  1717,
+         1741,  1766,  1791,  1817,  1844,  1871,  1900,  1929,
+         1958,  1990,  2021,  2054,  2088,  2123,  2159,  2197,
+         2236,  2276,  2319,  2363,  2410,  2458,  2508,  2561,
+         2616,  2675,  2737,  2802,  2871,  2944,  3020,  3102,
+         3188,  3280,  3375,  3478,  3586,  3702,  3823,  3953,
+         4089,  4236,  4394,  4559,  4737,  4929,  5130,  5347,
+    }, {
+            4,    12,    18,    25,    33,    41,    50,    60,
+           70,    80,    91,   103,   115,   127,   140,   153,
+          166,   180,   194,   208,   222,   237,   251,   266,
+          281,   296,   312,   327,   343,   358,   374,   390,
+          405,   421,   437,   453,   469,   484,   500,   516,
+          532,   548,   564,   580,   596,   611,   627,   643,
+          659,   674,   690,   706,   721,   737,   752,   768,
+          783,   798,   814,   829,   844,   859,   874,   889,
+          904,   919,   934,   949,   964,   978,   993,  1008,
+         1022,  1037,  1051,  1065,  1080,  1094,  1108,  1122,
+         1136,  1151,  1165,  1179,  1192,  1206,  1220,  1234,
+         1248,  1261,  1275,  1288,  1302,  1315,  1329,  1342,
+         1368,  1393,  1419,  1444,  1469,  1494,  1519,  1544,
+         1569,  1594,  1618,  1643,  1668,  1692,  1717,  1741,
+         1765,  1789,  1814,  1838,  1862,  1885,  1909,  1933,
+         1957,  1992,  2027,  2061,  2096,  2130,  2165,  2199,
+         2233,  2267,  2300,  2334,  2367,  2400,  2434,  2467,
+         2499,  2532,  2575,  2618,  2661,  2704,  2746,  2788,
+         2830,  2872,  2913,  2954,  2995,  3036,  3076,  3127,
+         3177,  3226,  3275,  3324,  3373,  3421,  3469,  3517,
+         3565,  3621,  3677,  3733,  3788,  3843,  3897,  3951,
+         4005,  4058,  4119,  4181,  4241,  4301,  4361,  4420,
+         4479,  4546,  4612,  4677,  4742,  4807,  4871,  4942,
+         5013,  5083,  5153,  5222,  5291,  5367,  5442,  5517,
+         5591,  5665,  5745,  5825,  5905,  5984,  6063,  6149,
+         6234,  6319,  6404,  6495,  6587,  6678,  6769,  6867,
+         6966,  7064,  7163,  7269,  7376,  7483,  7599,  7715,
+         7832,  7958,  8085,  8214,  8352,  8492,  8635,  8788,
+         8945,  9104,  9275,  9450,  9639,  9832, 10031, 10245,
+        10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409,
+        12750, 13118, 13501, 13913, 14343, 14807, 15290, 15812,
+        16356, 16943, 17575, 18237, 18949, 19718, 20521, 21387,
+    }
+};
+
+static const int16_t vp9_ac_qlookup[3][256] = { /* three 256-entry lookup tables, each starting at 4; NOTE(review): the first index presumably selects the bit depth (8/10/12) and the second the quantizer index - confirm against the caller */
+    {
+            4,     8,     9,    10,    11,    12,    13,    14,
+           15,    16,    17,    18,    19,    20,    21,    22,
+           23,    24,    25,    26,    27,    28,    29,    30,
+           31,    32,    33,    34,    35,    36,    37,    38,
+           39,    40,    41,    42,    43,    44,    45,    46,
+           47,    48,    49,    50,    51,    52,    53,    54,
+           55,    56,    57,    58,    59,    60,    61,    62,
+           63,    64,    65,    66,    67,    68,    69,    70,
+           71,    72,    73,    74,    75,    76,    77,    78,
+           79,    80,    81,    82,    83,    84,    85,    86,
+           87,    88,    89,    90,    91,    92,    93,    94,
+           95,    96,    97,    98,    99,   100,   101,   102,
+          104,   106,   108,   110,   112,   114,   116,   118,
+          120,   122,   124,   126,   128,   130,   132,   134,
+          136,   138,   140,   142,   144,   146,   148,   150,
+          152,   155,   158,   161,   164,   167,   170,   173,
+          176,   179,   182,   185,   188,   191,   194,   197,
+          200,   203,   207,   211,   215,   219,   223,   227,
+          231,   235,   239,   243,   247,   251,   255,   260,
+          265,   270,   275,   280,   285,   290,   295,   300,
+          305,   311,   317,   323,   329,   335,   341,   347,
+          353,   359,   366,   373,   380,   387,   394,   401,
+          408,   416,   424,   432,   440,   448,   456,   465,
+          474,   483,   492,   501,   510,   520,   530,   540,
+          550,   560,   571,   582,   593,   604,   615,   627,
+          639,   651,   663,   676,   689,   702,   715,   729,
+          743,   757,   771,   786,   801,   816,   832,   848,
+          864,   881,   898,   915,   933,   951,   969,   988,
+         1007,  1026,  1046,  1066,  1087,  1108,  1129,  1151,
+         1173,  1196,  1219,  1243,  1267,  1292,  1317,  1343,
+         1369,  1396,  1423,  1451,  1479,  1508,  1537,  1567,
+         1597,  1628,  1660,  1692,  1725,  1759,  1793,  1828,
+    }, {
+            4,     9,    11,    13,    16,    18,    21,    24,
+           27,    30,    33,    37,    40,    44,    48,    51,
+           55,    59,    63,    67,    71,    75,    79,    83,
+           88,    92,    96,   100,   105,   109,   114,   118,
+          122,   127,   131,   136,   140,   145,   149,   154,
+          158,   163,   168,   172,   177,   181,   186,   190,
+          195,   199,   204,   208,   213,   217,   222,   226,
+          231,   235,   240,   244,   249,   253,   258,   262,
+          267,   271,   275,   280,   284,   289,   293,   297,
+          302,   306,   311,   315,   319,   324,   328,   332,
+          337,   341,   345,   349,   354,   358,   362,   367,
+          371,   375,   379,   384,   388,   392,   396,   401,
+          409,   417,   425,   433,   441,   449,   458,   466,
+          474,   482,   490,   498,   506,   514,   523,   531,
+          539,   547,   555,   563,   571,   579,   588,   596,
+          604,   616,   628,   640,   652,   664,   676,   688,
+          700,   713,   725,   737,   749,   761,   773,   785,
+          797,   809,   825,   841,   857,   873,   889,   905,
+          922,   938,   954,   970,   986,  1002,  1018,  1038,
+         1058,  1078,  1098,  1118,  1138,  1158,  1178,  1198,
+         1218,  1242,  1266,  1290,  1314,  1338,  1362,  1386,
+         1411,  1435,  1463,  1491,  1519,  1547,  1575,  1603,
+         1631,  1663,  1695,  1727,  1759,  1791,  1823,  1859,
+         1895,  1931,  1967,  2003,  2039,  2079,  2119,  2159,
+         2199,  2239,  2283,  2327,  2371,  2415,  2459,  2507,
+         2555,  2603,  2651,  2703,  2755,  2807,  2859,  2915,
+         2971,  3027,  3083,  3143,  3203,  3263,  3327,  3391,
+         3455,  3523,  3591,  3659,  3731,  3803,  3876,  3952,
+         4028,  4104,  4184,  4264,  4348,  4432,  4516,  4604,
+         4692,  4784,  4876,  4972,  5068,  5168,  5268,  5372,
+         5476,  5584,  5692,  5804,  5916,  6032,  6148,  6268,
+         6388,  6512,  6640,  6768,  6900,  7036,  7172,  7312,
+    }, {
+            4,    13,    19,    27,    35,    44,    54,    64,
+           75,    87,    99,   112,   126,   139,   154,   168,
+          183,   199,   214,   230,   247,   263,   280,   297,
+          314,   331,   349,   366,   384,   402,   420,   438,
+          456,   475,   493,   511,   530,   548,   567,   586,
+          604,   623,   642,   660,   679,   698,   716,   735,
+          753,   772,   791,   809,   828,   846,   865,   884,
+          902,   920,   939,   957,   976,   994,  1012,  1030,
+         1049,  1067,  1085,  1103,  1121,  1139,  1157,  1175,
+         1193,  1211,  1229,  1246,  1264,  1282,  1299,  1317,
+         1335,  1352,  1370,  1387,  1405,  1422,  1440,  1457,
+         1474,  1491,  1509,  1526,  1543,  1560,  1577,  1595,
+         1627,  1660,  1693,  1725,  1758,  1791,  1824,  1856,
+         1889,  1922,  1954,  1987,  2020,  2052,  2085,  2118,
+         2150,  2183,  2216,  2248,  2281,  2313,  2346,  2378,
+         2411,  2459,  2508,  2556,  2605,  2653,  2701,  2750,
+         2798,  2847,  2895,  2943,  2992,  3040,  3088,  3137,
+         3185,  3234,  3298,  3362,  3426,  3491,  3555,  3619,
+         3684,  3748,  3812,  3876,  3941,  4005,  4069,  4149,
+         4230,  4310,  4390,  4470,  4550,  4631,  4711,  4791,
+         4871,  4967,  5064,  5160,  5256,  5352,  5448,  5544,
+         5641,  5737,  5849,  5961,  6073,  6185,  6297,  6410,
+         6522,  6650,  6778,  6906,  7034,  7162,  7290,  7435,
+         7579,  7723,  7867,  8011,  8155,  8315,  8475,  8635,
+         8795,  8956,  9132,  9308,  9484,  9660,  9836, 10028,
+        10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661,
+        11885, 12109, 12333, 12573, 12813, 13053, 13309, 13565,
+        13821, 14093, 14365, 14637, 14925, 15213, 15502, 15806,
+        16110, 16414, 16734, 17054, 17390, 17726, 18062, 18414,
+        18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486,
+        21902, 22334, 22766, 23214, 23662, 24126, 24590, 25070,
+        25551, 26047, 26559, 27071, 27599, 28143, 28687, 29247,
+    }
+};
+
+static const enum TxfmType vp9_intra_txfm_type[14] = { /* transform type per prediction mode, filled with designated initializers: ten *_PRED entries followed by four *MV entries */
+    [VERT_PRED]            = ADST_DCT,
+    [HOR_PRED]             = DCT_ADST,
+    [DC_PRED]              = DCT_DCT,
+    [DIAG_DOWN_LEFT_PRED]  = DCT_DCT,
+    [DIAG_DOWN_RIGHT_PRED] = ADST_ADST,
+    [VERT_RIGHT_PRED]      = ADST_DCT,
+    [HOR_DOWN_PRED]        = DCT_ADST,
+    [VERT_LEFT_PRED]       = ADST_DCT,
+    [HOR_UP_PRED]          = DCT_ADST,
+    [TM_VP8_PRED]          = ADST_ADST,
+    [NEARESTMV]            = DCT_DCT, /* all four *MV modes map to DCT_DCT */
+    [NEARMV]               = DCT_DCT,
+    [ZEROMV]               = DCT_DCT,
+    [NEWMV]                = DCT_DCT,
+};
+
+static const int16_t vp9_default_scan_4x4[16] = { /* a permutation of 0..15 (starts at 0, ends at 15); NOTE(review): presumably the order in which the 16 coefficient positions of a 4x4 block are visited - confirm */
+     0,  1,  4,  5,
+     2,  8,  3,  6,
+    12,  9,  7, 10,
+    13, 11, 14, 15,
+};
+
+static const int16_t vp9_col_scan_4x4[16] = { /* a permutation of 0..15 (starts at 0, ends at 15); NOTE(review): presumably the column-oriented visiting order of the 16 coefficient positions of a 4x4 block - confirm */
+     0,  1,  2,  4,
+     3,  5,  6,  8,
+     7,  9, 10, 12,
+    13, 11, 14, 15,
+};
+
+static const int16_t vp9_row_scan_4x4[16] = { /* a permutation of 0..15 (starts at 0, ends at 15); NOTE(review): presumably the row-oriented visiting order of the 16 coefficient positions of a 4x4 block - confirm */
+     0,  4,  1,  8,
+     5, 12,  9,  2,
+     6, 13,  3, 10,
+     7, 14, 11, 15,
+};
+
+static const int16_t vp9_default_scan_8x8[64] = { /* 64 entries, first 0 and last 63; NOTE(review): presumably a permutation of 0..63 giving the visiting order of the coefficient positions of an 8x8 block - confirm */
+     0,  1,  8,  2,  9, 16, 10,  3,
+    17, 24, 18, 11,  4, 25, 32, 19,
+    12, 26,  5, 33, 20, 27, 40, 13,
+    34,  6, 41, 28, 21, 35, 42, 48,
+    14,  7, 36, 29, 43, 56, 49, 22,
+    15, 37, 50, 44, 57, 30, 23, 51,
+    45, 58, 38, 31, 52, 59, 39, 46,
+    53, 60, 47, 54, 61, 55, 62, 63,
+};
+
+static const int16_t vp9_col_scan_8x8[64] = { /* 64 entries, first 0 and last 63; NOTE(review): presumably a permutation of 0..63 giving the column-oriented visiting order for an 8x8 block - confirm */
+     0,  1,  2,  8,  3,  9,  4, 10,
+    16,  5, 11, 17, 12, 18,  6, 24,
+    19, 13, 25,  7, 26, 20, 32, 14,
+    27, 21, 33, 28, 34, 15, 22, 35,
+    40, 29, 41, 36, 23, 30, 42, 37,
+    48, 43, 31, 44, 49, 38, 50, 56,
+    45, 39, 51, 57, 52, 46, 58, 53,
+    59, 47, 60, 54, 61, 55, 62, 63,
+};
+
+static const int16_t vp9_row_scan_8x8[64] = { /* 64 entries, first 0 and last 63; NOTE(review): presumably a permutation of 0..63 giving the row-oriented visiting order for an 8x8 block - confirm */
+     0,  8, 16,  1,  9, 24,  2, 17,
+    32, 10, 25,  3, 40, 18, 11, 33,
+    26, 19,  4, 48, 41, 34, 12, 27,
+    56, 20,  5, 42, 35, 13, 49, 28,
+     6, 21, 43, 36, 14, 50, 29, 57,
+     7, 44, 22, 37, 51, 15, 58, 30,
+    23, 45, 52, 38, 59, 31, 46, 53,
+    39, 60, 47, 61, 54, 62, 55, 63,
+};
+
+static const int16_t vp9_default_scan_16x16[256] = { /* 256 entries, first 0 and last 255; NOTE(review): presumably a permutation of 0..255 giving the visiting order of the coefficient positions of a 16x16 block - confirm */
+      0,   1,  16,   2,  17,  32,   3,  18,  33,  48,   4,  34,  19,  49,  20,   5,
+     35,  64,  50,  36,  65,  21,   6,  51,  80,  66,  37,  22,  52,   7,  81,  67,
+     38,  82,  53,  23,  96,  68,   8,  83,  97,  54,  39,  69, 112,  24,  98,  84,
+     70,  55,   9,  40,  85,  99, 113, 128,  25, 114, 100,  71,  86,  56,  10,  41,
+    115, 101, 129, 116,  72,  87,  26, 130, 144, 102,  57,  11,  42, 117, 131, 145,
+     88, 103,  27,  73, 132, 118, 146,  58, 160,  12,  43, 133, 147, 104,  89, 119,
+    161,  74, 148, 134,  28, 162,  59,  13, 176, 120, 149,  90, 135, 105, 163,  44,
+     75, 177, 164,  29, 150, 121, 136, 178, 165,  14, 106,  60,  91, 151,  45, 179,
+    192, 137, 166, 122,  76, 180, 152,  30,  61,  15, 107, 167, 181, 193,  92, 208,
+     46, 138, 123, 153, 194,  77, 168, 182,  31, 195, 209, 183, 108, 139,  62, 154,
+     47, 196,  93, 169, 210, 197, 224, 124, 184, 211,  78, 109, 170, 155,  63, 198,
+    212, 185, 225, 240, 140,  94, 199, 125,  79, 213, 226, 171, 186, 156, 214, 200,
+    110, 227, 141,  95, 241, 215, 228, 201, 126, 242, 187, 172, 157, 229, 111, 216,
+    243, 142, 202, 230, 127, 217, 244, 173, 188, 231, 158, 203, 143, 245, 218, 232,
+    189, 246, 159, 174, 233, 247, 219, 204, 175, 190, 248, 234, 205, 220, 249, 191,
+    235, 221, 250, 206, 222, 251, 236, 207, 237, 223, 252, 238, 253, 239, 254, 255,
+};
+
+static const int16_t vp9_col_scan_16x16[256] = { /* 256 entries, first 0 and last 255; NOTE(review): presumably a permutation of 0..255 giving the column-oriented visiting order for a 16x16 block - confirm */
+      0,   1,   2,   3,  16,   4,  17,   5,  18,   6,  19,  32,  20,   7,  33,  21,
+     34,   8,  35,  22,  48,  36,   9,  49,  23,  50,  37,  10,  38,  51,  24,  64,
+     52,  11,  65,  39,  25,  53,  66,  54,  40,  67,  12,  80,  26,  68,  55,  81,
+     41,  69,  13,  27,  82,  56,  70,  83,  42,  14,  84,  96,  71,  28,  57,  85,
+     97,  15,  72,  98,  43,  86,  58,  99,  29,  87, 100, 112,  73,  44, 101,  59,
+     30, 113,  88, 114,  74, 128, 102,  45,  31, 115,  60, 103,  89, 116,  75, 129,
+    117,  46, 104,  90,  61, 130, 118, 131, 132, 105,  76,  47, 119, 144,  91,  62,
+    133, 106, 145, 120, 146, 134,  77, 147, 121,  92, 135, 148,  63, 107, 136, 122,
+     93, 149, 160,  78, 150, 137, 108, 161, 162, 151, 123,  79, 138, 163, 152,  94,
+    164, 109, 165, 153, 124, 139, 176, 166,  95, 177, 167, 110, 154, 178, 125, 179,
+    140, 168, 155, 111, 180, 192, 181, 169, 141, 126, 182, 193, 194, 156, 183, 170,
+    195, 127, 142, 196, 184, 208, 197, 157, 171, 143, 185, 198, 209, 199, 210, 172,
+    158, 186, 211, 224, 212, 200, 240, 159, 213, 225, 187, 201, 173, 226, 214, 215,
+    227, 202, 228, 188, 241, 216, 174, 229, 242, 203, 243, 217, 230, 175, 189, 244,
+    231, 204, 218, 232, 245, 219, 246, 190, 233, 205, 191, 247, 234, 248, 220, 206,
+    249, 235, 221, 207, 250, 236, 222, 251, 223, 237, 238, 252, 239, 253, 254, 255,
+};
+
+static const int16_t vp9_row_scan_16x16[256] = { /* 256 entries, first 0 and last 255; NOTE(review): presumably a permutation of 0..255 giving the row-oriented visiting order for a 16x16 block - confirm */
+      0,  16,  32,   1,  48,  17,  64,  33,   2,  80,  18,  49,  96,  34,   3,  65,
+     19, 112,  50,  81,  35,   4, 128,  66,  20,  97,  51,  82,   5, 144,  36,  67,
+    113,  98,  21,  52, 160,  83, 129,  37,  68,   6, 114, 176,  99,  53,  22,  84,
+    145,  38,  69, 130,   7, 115, 192, 100,  54,  23,  85, 161, 146, 131,  39,  70,
+    208, 116,   8, 101, 177,  55,  86,  24, 162, 147, 132,  71, 224, 117,  40, 102,
+      9, 148,  56,  87, 193, 163, 240, 133, 178,  25, 118,  72,  41, 103, 164,  10,
+    149,  88, 134, 209, 179,  57, 119, 194,  26,  73, 165, 150, 104,  42, 135,  11,
+    180, 120,  89, 225, 195,  58,  27, 210, 151, 181, 166,  74,  43, 105,  12, 136,
+     90,  59, 241, 121,  28, 196, 167, 211, 152,  44, 182, 137,  75,  13, 226, 106,
+    122,  60, 197,  91, 168,  29, 183, 153,  14,  76, 212, 138,  45, 107,  15, 198,
+     92, 227, 169,  30, 123, 154,  61, 242, 184, 213, 139,  46,  77,  31, 108, 170,
+    199, 185, 124, 228,  93, 155, 214,  62, 140, 243,  78,  47, 200, 109, 186, 171,
+    201,  94,  63, 215, 229, 156,  79, 125, 141, 110, 216, 187, 172, 244, 202, 230,
+    217,  95, 157, 126, 245, 111, 142, 231, 188, 127, 158, 218, 173, 232, 246, 233,
+    203, 143, 247, 174, 189, 159, 219, 204, 248, 234, 249, 175, 190, 220, 205, 250,
+    235, 191, 221, 251, 236, 206, 252, 222, 207, 237, 223, 253, 238, 254, 239, 255,
+};
+
+static const int16_t vp9_default_scan_32x32[1024] = { /* scan-order table for 32x32 blocks: 1024 position indices; the order opens 0, 1, 32, 2, 33, 64 */
+       0,    1,   32,    2,   33,   64,    3,   34,   65,    4,   96,   35,   66,    5,   36,   97,   67,  128,   98,   68,   37,    6,  129,   99,    7,  160,   69,   38,  130,  100,  161,  131,
+      39,   70,    8,  101,  162,  132,  192,   71,   40,    9,  102,  163,  133,  193,   72,  224,  103,   41,  164,   10,  194,  134,  165,   73,  104,  135,  225,   42,  195,   11,  256,  166,
+     226,  196,   74,  105,  136,   43,   12,  167,  197,  227,  257,   75,  106,  137,  228,   44,  198,  168,  258,  288,   13,  229,   76,  107,  199,  138,  259,  169,  289,   45,  230,  260,
+     200,  108,   14,  170,  139,  320,  290,   77,  231,  261,   46,  201,  140,  291,  109,  232,  321,  262,  171,   78,  292,   15,  322,  202,  263,  352,  172,  293,  233,  141,  323,  110,
+      47,  203,  264,  234,  294,  353,  324,   16,   79,  204,  265,  295,  325,  173,  354,  142,  235,  384,   48,  296,  111,  266,  355,  326,   80,   17,  205,  236,  174,  356,  385,  327,
+     143,  297,  267,  357,  386,  112,   49,  328,  298,  206,  416,  237,  358,  387,   81,  175,   18,  329,  359,  388,  299,  330,  389,  113,  417,  238,  360,   50,  207,  418,  390,  331,
+      19,  448,  361,   82,  419,  391,  239,   51,  362,  420,  114,  449,  480,  421,   83,  363,  450,  422,  512,  451,  423,  115,  452,  481,  453,  482,  454,  544,  483,  455,  513,  484,
+     514,  485,  515,  486,  545,  576,  487,  546,  547,  608,  577,  578,  579,  609,  610,  611,   20,  144,  268,  392,  516,  640,   21,   52,  145,  176,  269,  300,  393,  424,  517,  548,
+     641,  672,   22,   53,   84,  146,  177,  208,  270,  301,  332,  394,  425,  456,  518,  549,  580,  642,  673,  704,   23,   54,   85,  116,  147,  178,  209,  240,  271,  302,  333,  364,
+     395,  426,  457,  488,  519,  550,  581,  612,  643,  674,  705,  736,   55,   86,  117,  179,  210,  241,  303,  334,  365,  427,  458,  489,  551,  582,  613,  675,  706,  737,   87,  118,
+     211,  242,  335,  366,  459,  490,  583,  614,  707,  738,  119,  243,  367,  491,  615,  739,   24,  148,  272,  396,  520,  644,  768,   25,   56,  149,  180,  273,  304,  397,  428,  521,
+     552,  645,  676,  769,  800,   26,   57,   88,  150,  181,  212,  274,  305,  336,  398,  429,  460,  522,  553,  584,  646,  677,  708,  770,  801,  832,   27,   58,   89,  120,  151,  182,
+     213,  244,  275,  306,  337,  368,  399,  430,  461,  492,  523,  554,  585,  616,  647,  678,  709,  740,  771,  802,  833,  864,   59,   90,  121,  183,  214,  245,  307,  338,  369,  431,
+     462,  493,  555,  586,  617,  679,  710,  741,  803,  834,  865,   91,  122,  215,  246,  339,  370,  463,  494,  587,  618,  711,  742,  835,  866,  123,  247,  371,  495,  619,  743,  867,
+      28,  152,  276,  400,  524,  648,  772,  896,   29,   60,  153,  184,  277,  308,  401,  432,  525,  556,  649,  680,  773,  804,  897,  928,   30,   61,   92,  154,  185,  216,  278,  309,
+     340,  402,  433,  464,  526,  557,  588,  650,  681,  712,  774,  805,  836,  898,  929,  960,   31,   62,   93,  124,  155,  186,  217,  248,  279,  310,  341,  372,  403,  434,  465,  496,
+     527,  558,  589,  620,  651,  682,  713,  744,  775,  806,  837,  868,  899,  930,  961,  992,   63,   94,  125,  187,  218,  249,  311,  342,  373,  435,  466,  497,  559,  590,  621,  683,
+     714,  745,  807,  838,  869,  931,  962,  993,   95,  126,  219,  250,  343,  374,  467,  498,  591,  622,  715,  746,  839,  870,  963,  994,  127,  251,  375,  499,  623,  747,  871,  995,
+     156,  280,  404,  528,  652,  776,  900,  157,  188,  281,  312,  405,  436,  529,  560,  653,  684,  777,  808,  901,  932,  158,  189,  220,  282,  313,  344,  406,  437,  468,  530,  561,
+     592,  654,  685,  716,  778,  809,  840,  902,  933,  964,  159,  190,  221,  252,  283,  314,  345,  376,  407,  438,  469,  500,  531,  562,  593,  624,  655,  686,  717,  748,  779,  810,
+     841,  872,  903,  934,  965,  996,  191,  222,  253,  315,  346,  377,  439,  470,  501,  563,  594,  625,  687,  718,  749,  811,  842,  873,  935,  966,  997,  223,  254,  347,  378,  471,
+     502,  595,  626,  719,  750,  843,  874,  967,  998,  255,  379,  503,  627,  751,  875,  999,  284,  408,  532,  656,  780,  904,  285,  316,  409,  440,  533,  564,  657,  688,  781,  812,
+     905,  936,  286,  317,  348,  410,  441,  472,  534,  565,  596,  658,  689,  720,  782,  813,  844,  906,  937,  968,  287,  318,  349,  380,  411,  442,  473,  504,  535,  566,  597,  628,
+     659,  690,  721,  752,  783,  814,  845,  876,  907,  938,  969, 1000,  319,  350,  381,  443,  474,  505,  567,  598,  629,  691,  722,  753,  815,  846,  877,  939,  970, 1001,  351,  382,
+     475,  506,  599,  630,  723,  754,  847,  878,  971, 1002,  383,  507,  631,  755,  879, 1003,  412,  536,  660,  784,  908,  413,  444,  537,  568,  661,  692,  785,  816,  909,  940,  414,
+     445,  476,  538,  569,  600,  662,  693,  724,  786,  817,  848,  910,  941,  972,  415,  446,  477,  508,  539,  570,  601,  632,  663,  694,  725,  756,  787,  818,  849,  880,  911,  942,
+     973, 1004,  447,  478,  509,  571,  602,  633,  695,  726,  757,  819,  850,  881,  943,  974, 1005,  479,  510,  603,  634,  727,  758,  851,  882,  975, 1006,  511,  635,  759,  883, 1007,
+     540,  664,  788,  912,  541,  572,  665,  696,  789,  820,  913,  944,  542,  573,  604,  666,  697,  728,  790,  821,  852,  914,  945,  976,  543,  574,  605,  636,  667,  698,  729,  760,
+     791,  822,  853,  884,  915,  946,  977, 1008,  575,  606,  637,  699,  730,  761,  823,  854,  885,  947,  978, 1009,  607,  638,  731,  762,  855,  886,  979, 1010,  639,  763,  887, 1011,
+     668,  792,  916,  669,  700,  793,  824,  917,  948,  670,  701,  732,  794,  825,  856,  918,  949,  980,  671,  702,  733,  764,  795,  826,  857,  888,  919,  950,  981, 1012,  703,  734,
+     765,  827,  858,  889,  951,  982, 1013,  735,  766,  859,  890,  983, 1014,  767,  891, 1015,  796,  920,  797,  828,  921,  952,  798,  829,  860,  922,  953,  984,  799,  830,  861,  892,
+     923,  954,  985, 1016,  831,  862,  893,  955,  986, 1017,  863,  894,  987, 1018,  895, 1019,  924,  925,  956,  926,  957,  988,  927,  958,  989, 1020,  959,  990, 1021,  991, 1022, 1023,
+}; /* NOTE(review): each value is presumably row * 32 + col inside the block - confirm against the code that consumes this table */
+
+static const int16_t * const vp9_scans[5][4] = { // scan-order table lookup: first index picks the row labelled below, second index picks one of four slots (default, col, row, default)
+    { // 4x4 tables
+        vp9_default_scan_4x4, vp9_col_scan_4x4,
+        vp9_row_scan_4x4, vp9_default_scan_4x4
+    }, { // 8x8 tables
+        vp9_default_scan_8x8, vp9_col_scan_8x8,
+        vp9_row_scan_8x8, vp9_default_scan_8x8
+    }, { // 16x16 tables
+        vp9_default_scan_16x16, vp9_col_scan_16x16,
+        vp9_row_scan_16x16, vp9_default_scan_16x16
+    }, { // 32x32: every slot uses the default scan
+        vp9_default_scan_32x32, vp9_default_scan_32x32,
+        vp9_default_scan_32x32, vp9_default_scan_32x32
+    }, { // lossless: every slot uses the 4x4 default scan
+        vp9_default_scan_4x4, vp9_default_scan_4x4,
+        vp9_default_scan_4x4, vp9_default_scan_4x4
+    }
+}; // NOTE(review): the second index is presumably the transform type - confirm against the caller
+
+static const int16_t vp9_default_scan_4x4_nb[16][2] = { /* 16 position pairs paired with the 4x4 default scan; NOTE(review): entry i presumably holds the two neighbour positions for scan index i + 1 - confirm */
+    {  0,  0 }, {  0,  0 }, {  4,  1 }, {  1,  1 },
+    {  4,  4 }, {  2,  2 }, {  5,  2 }, {  8,  8 },
+    {  8,  5 }, {  6,  3 }, {  9,  6 }, { 12,  9 },
+    { 10,  7 }, { 13, 10 }, { 14, 11 }, {  0,  0 },
+};
+
+static const int16_t vp9_col_scan_4x4_nb[16][2] = { /* 16 position pairs paired with the 4x4 col scan; both elements of each pair are equal; NOTE(review): entry i presumably belongs to scan index i + 1 - confirm */
+    {  0,  0 }, {  1,  1 }, {  0,  0 }, {  2,  2 },
+    {  4,  4 }, {  5,  5 }, {  4,  4 }, {  6,  6 },
+    {  8,  8 }, {  9,  9 }, {  8,  8 }, { 12, 12 },
+    { 10, 10 }, { 13, 13 }, { 14, 14 }, {  0,  0 },
+};
+
+static const int16_t vp9_row_scan_4x4_nb[16][2] = { /* 16 position pairs paired with the 4x4 row scan; both elements of each pair are equal; NOTE(review): entry i presumably belongs to scan index i + 1 - confirm */
+    {  0,  0 }, {  0,  0 }, {  4,  4 }, {  1,  1 },
+    {  8,  8 }, {  5,  5 }, {  1,  1 }, {  2,  2 },
+    {  9,  9 }, {  2,  2 }, {  6,  6 }, {  3,  3 },
+    { 10, 10 }, {  7,  7 }, { 11, 11 }, {  0,  0 },
+};
+
+static const int16_t vp9_default_scan_8x8_nb[64][2] = { /* 64 position pairs paired with the 8x8 default scan; NOTE(review): entry i presumably holds the two neighbour positions for scan index i + 1 - confirm */
+    {  0,  0 }, {  0,  0 }, {  1,  1 }, {  8,  1 },
+    {  8,  8 }, {  9,  2 }, {  2,  2 }, { 16,  9 },
+    { 16, 16 }, { 17, 10 }, { 10,  3 }, {  3,  3 },
+    { 24, 17 }, { 24, 24 }, { 18, 11 }, { 11,  4 },
+    { 25, 18 }, {  4,  4 }, { 32, 25 }, { 19, 12 },
+    { 26, 19 }, { 32, 32 }, { 12,  5 }, { 33, 26 },
+    {  5,  5 }, { 40, 33 }, { 27, 20 }, { 20, 13 },
+    { 34, 27 }, { 41, 34 }, { 40, 40 }, { 13,  6 },
+    {  6,  6 }, { 35, 28 }, { 28, 21 }, { 42, 35 },
+    { 48, 48 }, { 48, 41 }, { 21, 14 }, { 14,  7 },
+    { 36, 29 }, { 49, 42 }, { 43, 36 }, { 56, 49 },
+    { 29, 22 }, { 22, 15 }, { 50, 43 }, { 44, 37 },
+    { 57, 50 }, { 37, 30 }, { 30, 23 }, { 51, 44 },
+    { 58, 51 }, { 38, 31 }, { 45, 38 }, { 52, 45 },
+    { 59, 52 }, { 46, 39 }, { 53, 46 }, { 60, 53 },
+    { 54, 47 }, { 61, 54 }, { 62, 55 }, {  0,  0 },
+};
+
+static const int16_t vp9_col_scan_8x8_nb[64][2] = { /* 64 position pairs paired with the 8x8 col scan; both elements of each pair are equal; NOTE(review): entry i presumably belongs to scan index i + 1 - confirm */
+    {  0,  0 }, {  1,  1 }, {  0,  0 }, {  2,  2 },
+    {  8,  8 }, {  3,  3 }, {  9,  9 }, {  8,  8 },
+    {  4,  4 }, { 10, 10 }, { 16, 16 }, { 11, 11 },
+    { 17, 17 }, {  5,  5 }, { 16, 16 }, { 18, 18 },
+    { 12, 12 }, { 24, 24 }, {  6,  6 }, { 25, 25 },
+    { 19, 19 }, { 24, 24 }, { 13, 13 }, { 26, 26 },
+    { 20, 20 }, { 32, 32 }, { 27, 27 }, { 33, 33 },
+    { 14, 14 }, { 21, 21 }, { 34, 34 }, { 32, 32 },
+    { 28, 28 }, { 40, 40 }, { 35, 35 }, { 22, 22 },
+    { 29, 29 }, { 41, 41 }, { 36, 36 }, { 40, 40 },
+    { 42, 42 }, { 30, 30 }, { 43, 43 }, { 48, 48 },
+    { 37, 37 }, { 49, 49 }, { 48, 48 }, { 44, 44 },
+    { 38, 38 }, { 50, 50 }, { 56, 56 }, { 51, 51 },
+    { 45, 45 }, { 57, 57 }, { 52, 52 }, { 58, 58 },
+    { 46, 46 }, { 59, 59 }, { 53, 53 }, { 60, 60 },
+    { 54, 54 }, { 61, 61 }, { 62, 62 }, {  0,  0 },
+};
+
+static const int16_t vp9_row_scan_8x8_nb[64][2] = { /* 64 position pairs paired with the 8x8 row scan; both elements of each pair are equal; NOTE(review): entry i presumably belongs to scan index i + 1 - confirm */
+    {  0,  0 }, {  8,  8 }, {  0,  0 }, {  1,  1 },
+    { 16, 16 }, {  1,  1 }, {  9,  9 }, { 24, 24 },
+    {  2,  2 }, { 17, 17 }, {  2,  2 }, { 32, 32 },
+    { 10, 10 }, {  3,  3 }, { 25, 25 }, { 18, 18 },
+    { 11, 11 }, {  3,  3 }, { 40, 40 }, { 33, 33 },
+    { 26, 26 }, {  4,  4 }, { 19, 19 }, { 48, 48 },
+    { 12, 12 }, {  4,  4 }, { 34, 34 }, { 27, 27 },
+    {  5,  5 }, { 41, 41 }, { 20, 20 }, {  5,  5 },
+    { 13, 13 }, { 35, 35 }, { 28, 28 }, {  6,  6 },
+    { 42, 42 }, { 21, 21 }, { 49, 49 }, {  6,  6 },
+    { 36, 36 }, { 14, 14 }, { 29, 29 }, { 43, 43 },
+    {  7,  7 }, { 50, 50 }, { 22, 22 }, { 15, 15 },
+    { 37, 37 }, { 44, 44 }, { 30, 30 }, { 51, 51 },
+    { 23, 23 }, { 38, 38 }, { 45, 45 }, { 31, 31 },
+    { 52, 52 }, { 39, 39 }, { 53, 53 }, { 46, 46 },
+    { 54, 54 }, { 47, 47 }, { 55, 55 }, {  0,  0 },
+};
+
+static const int16_t vp9_default_scan_16x16_nb[256][2] = { /* 256 position pairs paired with the 16x16 default scan; NOTE(review): entry i presumably holds the two neighbour positions for scan index i + 1 - confirm */
+    {   0,   0 }, {   0,   0 }, {   1,   1 }, {  16,   1 },
+    {  16,  16 }, {   2,   2 }, {  17,   2 }, {  32,  17 },
+    {  32,  32 }, {   3,   3 }, {  33,  18 }, {  18,   3 },
+    {  48,  33 }, {  19,   4 }, {   4,   4 }, {  34,  19 },
+    {  48,  48 }, {  49,  34 }, {  35,  20 }, {  64,  49 },
+    {  20,   5 }, {   5,   5 }, {  50,  35 }, {  64,  64 },
+    {  65,  50 }, {  36,  21 }, {  21,   6 }, {  51,  36 },
+    {   6,   6 }, {  80,  65 }, {  66,  51 }, {  37,  22 },
+    {  81,  66 }, {  52,  37 }, {  22,   7 }, {  80,  80 },
+    {  67,  52 }, {   7,   7 }, {  82,  67 }, {  96,  81 },
+    {  53,  38 }, {  38,  23 }, {  68,  53 }, {  96,  96 },
+    {  23,   8 }, {  97,  82 }, {  83,  68 }, {  69,  54 },
+    {  54,  39 }, {   8,   8 }, {  39,  24 }, {  84,  69 },
+    {  98,  83 }, { 112,  97 }, { 112, 112 }, {  24,   9 },
+    { 113,  98 }, {  99,  84 }, {  70,  55 }, {  85,  70 },
+    {  55,  40 }, {   9,   9 }, {  40,  25 }, { 114,  99 },
+    { 100,  85 }, { 128, 113 }, { 115, 100 }, {  71,  56 },
+    {  86,  71 }, {  25,  10 }, { 129, 114 }, { 128, 128 },
+    { 101,  86 }, {  56,  41 }, {  10,  10 }, {  41,  26 },
+    { 116, 101 }, { 130, 115 }, { 144, 129 }, {  87,  72 },
+    { 102,  87 }, {  26,  11 }, {  72,  57 }, { 131, 116 },
+    { 117, 102 }, { 145, 130 }, {  57,  42 }, { 144, 144 },
+    {  11,  11 }, {  42,  27 }, { 132, 117 }, { 146, 131 },
+    { 103,  88 }, {  88,  73 }, { 118, 103 }, { 160, 145 },
+    {  73,  58 }, { 147, 132 }, { 133, 118 }, {  27,  12 },
+    { 161, 146 }, {  58,  43 }, {  12,  12 }, { 160, 160 },
+    { 119, 104 }, { 148, 133 }, {  89,  74 }, { 134, 119 },
+    { 104,  89 }, { 162, 147 }, {  43,  28 }, {  74,  59 },
+    { 176, 161 }, { 163, 148 }, {  28,  13 }, { 149, 134 },
+    { 120, 105 }, { 135, 120 }, { 177, 162 }, { 164, 149 },
+    {  13,  13 }, { 105,  90 }, {  59,  44 }, {  90,  75 },
+    { 150, 135 }, {  44,  29 }, { 178, 163 }, { 176, 176 },
+    { 136, 121 }, { 165, 150 }, { 121, 106 }, {  75,  60 },
+    { 179, 164 }, { 151, 136 }, {  29,  14 }, {  60,  45 },
+    {  14,  14 }, { 106,  91 }, { 166, 151 }, { 180, 165 },
+    { 192, 177 }, {  91,  76 }, { 192, 192 }, {  45,  30 },
+    { 137, 122 }, { 122, 107 }, { 152, 137 }, { 193, 178 },
+    {  76,  61 }, { 167, 152 }, { 181, 166 }, {  30,  15 },
+    { 194, 179 }, { 208, 193 }, { 182, 167 }, { 107,  92 },
+    { 138, 123 }, {  61,  46 }, { 153, 138 }, {  46,  31 },
+    { 195, 180 }, {  92,  77 }, { 168, 153 }, { 209, 194 },
+    { 196, 181 }, { 208, 208 }, { 123, 108 }, { 183, 168 },
+    { 210, 195 }, {  77,  62 }, { 108,  93 }, { 169, 154 },
+    { 154, 139 }, {  62,  47 }, { 197, 182 }, { 211, 196 },
+    { 184, 169 }, { 224, 209 }, { 224, 224 }, { 139, 124 },
+    {  93,  78 }, { 198, 183 }, { 124, 109 }, {  78,  63 },
+    { 212, 197 }, { 225, 210 }, { 170, 155 }, { 185, 170 },
+    { 155, 140 }, { 213, 198 }, { 199, 184 }, { 109,  94 },
+    { 226, 211 }, { 140, 125 }, {  94,  79 }, { 240, 225 },
+    { 214, 199 }, { 227, 212 }, { 200, 185 }, { 125, 110 },
+    { 241, 226 }, { 186, 171 }, { 171, 156 }, { 156, 141 },
+    { 228, 213 }, { 110,  95 }, { 215, 200 }, { 242, 227 },
+    { 141, 126 }, { 201, 186 }, { 229, 214 }, { 126, 111 },
+    { 216, 201 }, { 243, 228 }, { 172, 157 }, { 187, 172 },
+    { 230, 215 }, { 157, 142 }, { 202, 187 }, { 142, 127 },
+    { 244, 229 }, { 217, 202 }, { 231, 216 }, { 188, 173 },
+    { 245, 230 }, { 158, 143 }, { 173, 158 }, { 232, 217 },
+    { 246, 231 }, { 218, 203 }, { 203, 188 }, { 174, 159 },
+    { 189, 174 }, { 247, 232 }, { 233, 218 }, { 204, 189 },
+    { 219, 204 }, { 248, 233 }, { 190, 175 }, { 234, 219 },
+    { 220, 205 }, { 249, 234 }, { 205, 190 }, { 221, 206 },
+    { 250, 235 }, { 235, 220 }, { 206, 191 }, { 236, 221 },
+    { 222, 207 }, { 251, 236 }, { 237, 222 }, { 252, 237 },
+    { 238, 223 }, { 253, 238 }, { 254, 239 }, {   0,   0 },
+};
+
+static const int16_t vp9_col_scan_16x16_nb[256][2] = { /* 256 position pairs paired with the 16x16 col scan; both elements of each pair are equal; NOTE(review): entry i presumably belongs to scan index i + 1 - confirm */
+    {   0,   0 }, {   1,   1 }, {   2,   2 }, {   0,   0 },
+    {   3,   3 }, {  16,  16 }, {   4,   4 }, {  17,  17 },
+    {   5,   5 }, {  18,  18 }, {  16,  16 }, {  19,  19 },
+    {   6,   6 }, {  32,  32 }, {  20,  20 }, {  33,  33 },
+    {   7,   7 }, {  34,  34 }, {  21,  21 }, {  32,  32 },
+    {  35,  35 }, {   8,   8 }, {  48,  48 }, {  22,  22 },
+    {  49,  49 }, {  36,  36 }, {   9,   9 }, {  37,  37 },
+    {  50,  50 }, {  23,  23 }, {  48,  48 }, {  51,  51 },
+    {  10,  10 }, {  64,  64 }, {  38,  38 }, {  24,  24 },
+    {  52,  52 }, {  65,  65 }, {  53,  53 }, {  39,  39 },
+    {  66,  66 }, {  11,  11 }, {  64,  64 }, {  25,  25 },
+    {  67,  67 }, {  54,  54 }, {  80,  80 }, {  40,  40 },
+    {  68,  68 }, {  12,  12 }, {  26,  26 }, {  81,  81 },
+    {  55,  55 }, {  69,  69 }, {  82,  82 }, {  41,  41 },
+    {  13,  13 }, {  83,  83 }, {  80,  80 }, {  70,  70 },
+    {  27,  27 }, {  56,  56 }, {  84,  84 }, {  96,  96 },
+    {  14,  14 }, {  71,  71 }, {  97,  97 }, {  42,  42 },
+    {  85,  85 }, {  57,  57 }, {  98,  98 }, {  28,  28 },
+    {  86,  86 }, {  99,  99 }, {  96,  96 }, {  72,  72 },
+    {  43,  43 }, { 100, 100 }, {  58,  58 }, {  29,  29 },
+    { 112, 112 }, {  87,  87 }, { 113, 113 }, {  73,  73 },
+    { 112, 112 }, { 101, 101 }, {  44,  44 }, {  30,  30 },
+    { 114, 114 }, {  59,  59 }, { 102, 102 }, {  88,  88 },
+    { 115, 115 }, {  74,  74 }, { 128, 128 }, { 116, 116 },
+    {  45,  45 }, { 103, 103 }, {  89,  89 }, {  60,  60 },
+    { 129, 129 }, { 117, 117 }, { 130, 130 }, { 131, 131 },
+    { 104, 104 }, {  75,  75 }, {  46,  46 }, { 118, 118 },
+    { 128, 128 }, {  90,  90 }, {  61,  61 }, { 132, 132 },
+    { 105, 105 }, { 144, 144 }, { 119, 119 }, { 145, 145 },
+    { 133, 133 }, {  76,  76 }, { 146, 146 }, { 120, 120 },
+    {  91,  91 }, { 134, 134 }, { 147, 147 }, {  62,  62 },
+    { 106, 106 }, { 135, 135 }, { 121, 121 }, {  92,  92 },
+    { 148, 148 }, { 144, 144 }, {  77,  77 }, { 149, 149 },
+    { 136, 136 }, { 107, 107 }, { 160, 160 }, { 161, 161 },
+    { 150, 150 }, { 122, 122 }, {  78,  78 }, { 137, 137 },
+    { 162, 162 }, { 151, 151 }, {  93,  93 }, { 163, 163 },
+    { 108, 108 }, { 164, 164 }, { 152, 152 }, { 123, 123 },
+    { 138, 138 }, { 160, 160 }, { 165, 165 }, {  94,  94 },
+    { 176, 176 }, { 166, 166 }, { 109, 109 }, { 153, 153 },
+    { 177, 177 }, { 124, 124 }, { 178, 178 }, { 139, 139 },
+    { 167, 167 }, { 154, 154 }, { 110, 110 }, { 179, 179 },
+    { 176, 176 }, { 180, 180 }, { 168, 168 }, { 140, 140 },
+    { 125, 125 }, { 181, 181 }, { 192, 192 }, { 193, 193 },
+    { 155, 155 }, { 182, 182 }, { 169, 169 }, { 194, 194 },
+    { 126, 126 }, { 141, 141 }, { 195, 195 }, { 183, 183 },
+    { 192, 192 }, { 196, 196 }, { 156, 156 }, { 170, 170 },
+    { 142, 142 }, { 184, 184 }, { 197, 197 }, { 208, 208 },
+    { 198, 198 }, { 209, 209 }, { 171, 171 }, { 157, 157 },
+    { 185, 185 }, { 210, 210 }, { 208, 208 }, { 211, 211 },
+    { 199, 199 }, { 224, 224 }, { 158, 158 }, { 212, 212 },
+    { 224, 224 }, { 186, 186 }, { 200, 200 }, { 172, 172 },
+    { 225, 225 }, { 213, 213 }, { 214, 214 }, { 226, 226 },
+    { 201, 201 }, { 227, 227 }, { 187, 187 }, { 240, 240 },
+    { 215, 215 }, { 173, 173 }, { 228, 228 }, { 241, 241 },
+    { 202, 202 }, { 242, 242 }, { 216, 216 }, { 229, 229 },
+    { 174, 174 }, { 188, 188 }, { 243, 243 }, { 230, 230 },
+    { 203, 203 }, { 217, 217 }, { 231, 231 }, { 244, 244 },
+    { 218, 218 }, { 245, 245 }, { 189, 189 }, { 232, 232 },
+    { 204, 204 }, { 190, 190 }, { 246, 246 }, { 233, 233 },
+    { 247, 247 }, { 219, 219 }, { 205, 205 }, { 248, 248 },
+    { 234, 234 }, { 220, 220 }, { 206, 206 }, { 249, 249 },
+    { 235, 235 }, { 221, 221 }, { 250, 250 }, { 222, 222 },
+    { 236, 236 }, { 237, 237 }, { 251, 251 }, { 238, 238 },
+    { 252, 252 }, { 253, 253 }, { 254, 254 }, {   0,   0 },
+};
+
+static const int16_t vp9_row_scan_16x16_nb[256][2] = { /* 256 position pairs for vp9_row_scan_16x16; both elements of each pair are equal; the leading entries line up with scan index i + 1 (scan value 17 at index 5 -> pair { 1, 1 } at index 4) */
+    {   0,   0 }, {  16,  16 }, {   0,   0 }, {  32,  32 },
+    {   1,   1 }, {  48,  48 }, {  17,  17 }, {   1,   1 },
+    {  64,  64 }, {   2,   2 }, {  33,  33 }, {  80,  80 },
+    {  18,  18 }, {   2,   2 }, {  49,  49 }, {   3,   3 },
+    {  96,  96 }, {  34,  34 }, {  65,  65 }, {  19,  19 },
+    {   3,   3 }, { 112, 112 }, {  50,  50 }, {   4,   4 },
+    {  81,  81 }, {  35,  35 }, {  66,  66 }, {   4,   4 },
+    { 128, 128 }, {  20,  20 }, {  51,  51 }, {  97,  97 },
+    {  82,  82 }, {   5,   5 }, {  36,  36 }, { 144, 144 },
+    {  67,  67 }, { 113, 113 }, {  21,  21 }, {  52,  52 },
+    {   5,   5 }, {  98,  98 }, { 160, 160 }, {  83,  83 },
+    {  37,  37 }, {   6,   6 }, {  68,  68 }, { 129, 129 },
+    {  22,  22 }, {  53,  53 }, { 114, 114 }, {   6,   6 },
+    {  99,  99 }, { 176, 176 }, {  84,  84 }, {  38,  38 },
+    {   7,   7 }, {  69,  69 }, { 145, 145 }, { 130, 130 },
+    { 115, 115 }, {  23,  23 }, {  54,  54 }, { 192, 192 },
+    { 100, 100 }, {   7,   7 }, {  85,  85 }, { 161, 161 },
+    {  39,  39 }, {  70,  70 }, {   8,   8 }, { 146, 146 },
+    { 131, 131 }, { 116, 116 }, {  55,  55 }, { 208, 208 },
+    { 101, 101 }, {  24,  24 }, {  86,  86 }, {   8,   8 },
+    { 132, 132 }, {  40,  40 }, {  71,  71 }, { 177, 177 },
+    { 147, 147 }, { 224, 224 }, { 117, 117 }, { 162, 162 },
+    {   9,   9 }, { 102, 102 }, {  56,  56 }, {  25,  25 },
+    {  87,  87 }, { 148, 148 }, {   9,   9 }, { 133, 133 },
+    {  72,  72 }, { 118, 118 }, { 193, 193 }, { 163, 163 },
+    {  41,  41 }, { 103, 103 }, { 178, 178 }, {  10,  10 },
+    {  57,  57 }, { 149, 149 }, { 134, 134 }, {  88,  88 },
+    {  26,  26 }, { 119, 119 }, {  10,  10 }, { 164, 164 },
+    { 104, 104 }, {  73,  73 }, { 209, 209 }, { 179, 179 },
+    {  42,  42 }, {  11,  11 }, { 194, 194 }, { 135, 135 },
+    { 165, 165 }, { 150, 150 }, {  58,  58 }, {  27,  27 },
+    {  89,  89 }, {  11,  11 }, { 120, 120 }, {  74,  74 },
+    {  43,  43 }, { 225, 225 }, { 105, 105 }, {  12,  12 },
+    { 180, 180 }, { 151, 151 }, { 195, 195 }, { 136, 136 },
+    {  28,  28 }, { 166, 166 }, { 121, 121 }, {  59,  59 },
+    {  12,  12 }, { 210, 210 }, {  90,  90 }, { 106, 106 },
+    {  44,  44 }, { 181, 181 }, {  75,  75 }, { 152, 152 },
+    {  13,  13 }, { 167, 167 }, { 137, 137 }, {  13,  13 },
+    {  60,  60 }, { 196, 196 }, { 122, 122 }, {  29,  29 },
+    {  91,  91 }, {  14,  14 }, { 182, 182 }, {  76,  76 },
+    { 211, 211 }, { 153, 153 }, {  14,  14 }, { 107, 107 },
+    { 138, 138 }, {  45,  45 }, { 226, 226 }, { 168, 168 },
+    { 197, 197 }, { 123, 123 }, {  30,  30 }, {  61,  61 },
+    {  15,  15 }, {  92,  92 }, { 154, 154 }, { 183, 183 },
+    { 169, 169 }, { 108, 108 }, { 212, 212 }, {  77,  77 },
+    { 139, 139 }, { 198, 198 }, {  46,  46 }, { 124, 124 },
+    { 227, 227 }, {  62,  62 }, {  31,  31 }, { 184, 184 },
+    {  93,  93 }, { 170, 170 }, { 155, 155 }, { 185, 185 },
+    {  78,  78 }, {  47,  47 }, { 199, 199 }, { 213, 213 },
+    { 140, 140 }, {  63,  63 }, { 109, 109 }, { 125, 125 },
+    {  94,  94 }, { 200, 200 }, { 171, 171 }, { 156, 156 },
+    { 228, 228 }, { 186, 186 }, { 214, 214 }, { 201, 201 },
+    {  79,  79 }, { 141, 141 }, { 110, 110 }, { 229, 229 },
+    {  95,  95 }, { 126, 126 }, { 215, 215 }, { 172, 172 },
+    { 111, 111 }, { 142, 142 }, { 202, 202 }, { 157, 157 },
+    { 216, 216 }, { 230, 230 }, { 217, 217 }, { 187, 187 },
+    { 127, 127 }, { 231, 231 }, { 158, 158 }, { 173, 173 },
+    { 143, 143 }, { 203, 203 }, { 188, 188 }, { 232, 232 },
+    { 218, 218 }, { 233, 233 }, { 159, 159 }, { 174, 174 },
+    { 204, 204 }, { 189, 189 }, { 234, 234 }, { 219, 219 },
+    { 175, 175 }, { 205, 205 }, { 235, 235 }, { 220, 220 },
+    { 190, 190 }, { 236, 236 }, { 206, 206 }, { 191, 191 },
+    { 221, 221 }, { 207, 207 }, { 237, 237 }, { 222, 222 },
+    { 238, 238 }, { 223, 223 }, { 239, 239 }, {   0,   0 },
+};
+
+static const int16_t vp9_default_scan_32x32_nb[1024][2] = {
+    {    0,    0 }, {    0,    0 }, {    1,    1 }, {   32,    1 },
+    {   32,   32 }, {    2,    2 }, {   33,    2 }, {   64,   33 },
+    {    3,    3 }, {   64,   64 }, {   34,    3 }, {   65,   34 },
+    {    4,    4 }, {   35,    4 }, {   96,   65 }, {   66,   35 },
+    {   96,   96 }, {   97,   66 }, {   67,   36 }, {   36,    5 },
+    {    5,    5 }, {  128,   97 }, {   98,   67 }, {    6,    6 },
+    {  128,  128 }, {   68,   37 }, {   37,    6 }, {  129,   98 },
+    {   99,   68 }, {  160,  129 }, {  130,   99 }, {   38,    7 },
+    {   69,   38 }, {    7,    7 }, {  100,   69 }, {  161,  130 },
+    {  131,  100 }, {  160,  160 }, {   70,   39 }, {   39,    8 },
+    {    8,    8 }, {  101,   70 }, {  162,  131 }, {  132,  101 },
+    {  192,  161 }, {   71,   40 }, {  192,  192 }, {  102,   71 },
+    {   40,    9 }, {  163,  132 }, {    9,    9 }, {  193,  162 },
+    {  133,  102 }, {  164,  133 }, {   72,   41 }, {  103,   72 },
+    {  134,  103 }, {  224,  193 }, {   41,   10 }, {  194,  163 },
+    {   10,   10 }, {  224,  224 }, {  165,  134 }, {  225,  194 },
+    {  195,  164 }, {   73,   42 }, {  104,   73 }, {  135,  104 },
+    {   42,   11 }, {   11,   11 }, {  166,  135 }, {  196,  165 },
+    {  226,  195 }, {  256,  225 }, {   74,   43 }, {  105,   74 },
+    {  136,  105 }, {  227,  196 }, {   43,   12 }, {  197,  166 },
+    {  167,  136 }, {  257,  226 }, {  256,  256 }, {   12,   12 },
+    {  228,  197 }, {   75,   44 }, {  106,   75 }, {  198,  167 },
+    {  137,  106 }, {  258,  227 }, {  168,  137 }, {  288,  257 },
+    {   44,   13 }, {  229,  198 }, {  259,  228 }, {  199,  168 },
+    {  107,   76 }, {   13,   13 }, {  169,  138 }, {  138,  107 },
+    {  288,  288 }, {  289,  258 }, {   76,   45 }, {  230,  199 },
+    {  260,  229 }, {   45,   14 }, {  200,  169 }, {  139,  108 },
+    {  290,  259 }, {  108,   77 }, {  231,  200 }, {  320,  289 },
+    {  261,  230 }, {  170,  139 }, {   77,   46 }, {  291,  260 },
+    {   14,   14 }, {  321,  290 }, {  201,  170 }, {  262,  231 },
+    {  320,  320 }, {  171,  140 }, {  292,  261 }, {  232,  201 },
+    {  140,  109 }, {  322,  291 }, {  109,   78 }, {   46,   15 },
+    {  202,  171 }, {  263,  232 }, {  233,  202 }, {  293,  262 },
+    {  352,  321 }, {  323,  292 }, {   15,   15 }, {   78,   47 },
+    {  203,  172 }, {  264,  233 }, {  294,  263 }, {  324,  293 },
+    {  172,  141 }, {  353,  322 }, {  141,  110 }, {  234,  203 },
+    {  352,  352 }, {   47,   16 }, {  295,  264 }, {  110,   79 },
+    {  265,  234 }, {  354,  323 }, {  325,  294 }, {   79,   48 },
+    {   16,   16 }, {  204,  173 }, {  235,  204 }, {  173,  142 },
+    {  355,  324 }, {  384,  353 }, {  326,  295 }, {  142,  111 },
+    {  296,  265 }, {  266,  235 }, {  356,  325 }, {  385,  354 },
+    {  111,   80 }, {   48,   17 }, {  327,  296 }, {  297,  266 },
+    {  205,  174 }, {  384,  384 }, {  236,  205 }, {  357,  326 },
+    {  386,  355 }, {   80,   49 }, {  174,  143 }, {   17,   17 },
+    {  328,  297 }, {  358,  327 }, {  387,  356 }, {  298,  267 },
+    {  329,  298 }, {  388,  357 }, {  112,   81 }, {  416,  385 },
+    {  237,  206 }, {  359,  328 }, {   49,   18 }, {  206,  175 },
+    {  417,  386 }, {  389,  358 }, {  330,  299 }, {   18,   18 },
+    {  416,  416 }, {  360,  329 }, {   81,   50 }, {  418,  387 },
+    {  390,  359 }, {  238,  207 }, {   50,   19 }, {  361,  330 },
+    {  419,  388 }, {  113,   82 }, {  448,  417 }, {  448,  448 },
+    {  420,  389 }, {   82,   51 }, {  362,  331 }, {  449,  418 },
+    {  421,  390 }, {  480,  480 }, {  450,  419 }, {  422,  391 },
+    {  114,   83 }, {  451,  420 }, {  480,  449 }, {  452,  421 },
+    {  481,  450 }, {  453,  422 }, {  512,  512 }, {  482,  451 },
+    {  454,  423 }, {  512,  481 }, {  483,  452 }, {  513,  482 },
+    {  484,  453 }, {  514,  483 }, {  485,  454 }, {  544,  513 },
+    {  544,  544 }, {  486,  455 }, {  545,  514 }, {  546,  515 },
+    {  576,  576 }, {  576,  545 }, {  577,  546 }, {  578,  547 },
+    {  608,  577 }, {  609,  578 }, {  610,  579 }, {   19,   19 },
+    {  143,  112 }, {  267,  236 }, {  391,  360 }, {  515,  484 },
+    {  608,  608 }, {   20,   20 }, {   51,   20 }, {  144,  113 },
+    {  175,  144 }, {  268,  237 }, {  299,  268 }, {  392,  361 },
+    {  423,  392 }, {  516,  485 }, {  547,  516 }, {  640,  609 },
+    {  640,  640 }, {   21,   21 }, {   52,   21 }, {   83,   52 },
+    {  145,  114 }, {  176,  145 }, {  207,  176 }, {  269,  238 },
+    {  300,  269 }, {  331,  300 }, {  393,  362 }, {  424,  393 },
+    {  455,  424 }, {  517,  486 }, {  548,  517 }, {  579,  548 },
+    {  641,  610 }, {  672,  641 }, {  672,  672 }, {   22,   22 },
+    {   53,   22 }, {   84,   53 }, {  115,   84 }, {  146,  115 },
+    {  177,  146 }, {  208,  177 }, {  239,  208 }, {  270,  239 },
+    {  301,  270 }, {  332,  301 }, {  363,  332 }, {  394,  363 },
+    {  425,  394 }, {  456,  425 }, {  487,  456 }, {  518,  487 },
+    {  549,  518 }, {  580,  549 }, {  611,  580 }, {  642,  611 },
+    {  673,  642 }, {  704,  673 }, {  704,  704 }, {   54,   23 },
+    {   85,   54 }, {  116,   85 }, {  178,  147 }, {  209,  178 },
+    {  240,  209 }, {  302,  271 }, {  333,  302 }, {  364,  333 },
+    {  426,  395 }, {  457,  426 }, {  488,  457 }, {  550,  519 },
+    {  581,  550 }, {  612,  581 }, {  674,  643 }, {  705,  674 },
+    {  736,  705 }, {   86,   55 }, {  117,   86 }, {  210,  179 },
+    {  241,  210 }, {  334,  303 }, {  365,  334 }, {  458,  427 },
+    {  489,  458 }, {  582,  551 }, {  613,  582 }, {  706,  675 },
+    {  737,  706 }, {  118,   87 }, {  242,  211 }, {  366,  335 },
+    {  490,  459 }, {  614,  583 }, {  738,  707 }, {   23,   23 },
+    {  147,  116 }, {  271,  240 }, {  395,  364 }, {  519,  488 },
+    {  643,  612 }, {  736,  736 }, {   24,   24 }, {   55,   24 },
+    {  148,  117 }, {  179,  148 }, {  272,  241 }, {  303,  272 },
+    {  396,  365 }, {  427,  396 }, {  520,  489 }, {  551,  520 },
+    {  644,  613 }, {  675,  644 }, {  768,  737 }, {  768,  768 },
+    {   25,   25 }, {   56,   25 }, {   87,   56 }, {  149,  118 },
+    {  180,  149 }, {  211,  180 }, {  273,  242 }, {  304,  273 },
+    {  335,  304 }, {  397,  366 }, {  428,  397 }, {  459,  428 },
+    {  521,  490 }, {  552,  521 }, {  583,  552 }, {  645,  614 },
+    {  676,  645 }, {  707,  676 }, {  769,  738 }, {  800,  769 },
+    {  800,  800 }, {   26,   26 }, {   57,   26 }, {   88,   57 },
+    {  119,   88 }, {  150,  119 }, {  181,  150 }, {  212,  181 },
+    {  243,  212 }, {  274,  243 }, {  305,  274 }, {  336,  305 },
+    {  367,  336 }, {  398,  367 }, {  429,  398 }, {  460,  429 },
+    {  491,  460 }, {  522,  491 }, {  553,  522 }, {  584,  553 },
+    {  615,  584 }, {  646,  615 }, {  677,  646 }, {  708,  677 },
+    {  739,  708 }, {  770,  739 }, {  801,  770 }, {  832,  801 },
+    {  832,  832 }, {   58,   27 }, {   89,   58 }, {  120,   89 },
+    {  182,  151 }, {  213,  182 }, {  244,  213 }, {  306,  275 },
+    {  337,  306 }, {  368,  337 }, {  430,  399 }, {  461,  430 },
+    {  492,  461 }, {  554,  523 }, {  585,  554 }, {  616,  585 },
+    {  678,  647 }, {  709,  678 }, {  740,  709 }, {  802,  771 },
+    {  833,  802 }, {  864,  833 }, {   90,   59 }, {  121,   90 },
+    {  214,  183 }, {  245,  214 }, {  338,  307 }, {  369,  338 },
+    {  462,  431 }, {  493,  462 }, {  586,  555 }, {  617,  586 },
+    {  710,  679 }, {  741,  710 }, {  834,  803 }, {  865,  834 },
+    {  122,   91 }, {  246,  215 }, {  370,  339 }, {  494,  463 },
+    {  618,  587 }, {  742,  711 }, {  866,  835 }, {   27,   27 },
+    {  151,  120 }, {  275,  244 }, {  399,  368 }, {  523,  492 },
+    {  647,  616 }, {  771,  740 }, {  864,  864 }, {   28,   28 },
+    {   59,   28 }, {  152,  121 }, {  183,  152 }, {  276,  245 },
+    {  307,  276 }, {  400,  369 }, {  431,  400 }, {  524,  493 },
+    {  555,  524 }, {  648,  617 }, {  679,  648 }, {  772,  741 },
+    {  803,  772 }, {  896,  865 }, {  896,  896 }, {   29,   29 },
+    {   60,   29 }, {   91,   60 }, {  153,  122 }, {  184,  153 },
+    {  215,  184 }, {  277,  246 }, {  308,  277 }, {  339,  308 },
+    {  401,  370 }, {  432,  401 }, {  463,  432 }, {  525,  494 },
+    {  556,  525 }, {  587,  556 }, {  649,  618 }, {  680,  649 },
+    {  711,  680 }, {  773,  742 }, {  804,  773 }, {  835,  804 },
+    {  897,  866 }, {  928,  897 }, {  928,  928 }, {   30,   30 },
+    {   61,   30 }, {   92,   61 }, {  123,   92 }, {  154,  123 },
+    {  185,  154 }, {  216,  185 }, {  247,  216 }, {  278,  247 },
+    {  309,  278 }, {  340,  309 }, {  371,  340 }, {  402,  371 },
+    {  433,  402 }, {  464,  433 }, {  495,  464 }, {  526,  495 },
+    {  557,  526 }, {  588,  557 }, {  619,  588 }, {  650,  619 },
+    {  681,  650 }, {  712,  681 }, {  743,  712 }, {  774,  743 },
+    {  805,  774 }, {  836,  805 }, {  867,  836 }, {  898,  867 },
+    {  929,  898 }, {  960,  929 }, {  960,  960 }, {   62,   31 },
+    {   93,   62 }, {  124,   93 }, {  186,  155 }, {  217,  186 },
+    {  248,  217 }, {  310,  279 }, {  341,  310 }, {  372,  341 },
+    {  434,  403 }, {  465,  434 }, {  496,  465 }, {  558,  527 },
+    {  589,  558 }, {  620,  589 }, {  682,  651 }, {  713,  682 },
+    {  744,  713 }, {  806,  775 }, {  837,  806 }, {  868,  837 },
+    {  930,  899 }, {  961,  930 }, {  992,  961 }, {   94,   63 },
+    {  125,   94 }, {  218,  187 }, {  249,  218 }, {  342,  311 },
+    {  373,  342 }, {  466,  435 }, {  497,  466 }, {  590,  559 },
+    {  621,  590 }, {  714,  683 }, {  745,  714 }, {  838,  807 },
+    {  869,  838 }, {  962,  931 }, {  993,  962 }, {  126,   95 },
+    {  250,  219 }, {  374,  343 }, {  498,  467 }, {  622,  591 },
+    {  746,  715 }, {  870,  839 }, {  994,  963 }, {  155,  124 },
+    {  279,  248 }, {  403,  372 }, {  527,  496 }, {  651,  620 },
+    {  775,  744 }, {  899,  868 }, {  156,  125 }, {  187,  156 },
+    {  280,  249 }, {  311,  280 }, {  404,  373 }, {  435,  404 },
+    {  528,  497 }, {  559,  528 }, {  652,  621 }, {  683,  652 },
+    {  776,  745 }, {  807,  776 }, {  900,  869 }, {  931,  900 },
+    {  157,  126 }, {  188,  157 }, {  219,  188 }, {  281,  250 },
+    {  312,  281 }, {  343,  312 }, {  405,  374 }, {  436,  405 },
+    {  467,  436 }, {  529,  498 }, {  560,  529 }, {  591,  560 },
+    {  653,  622 }, {  684,  653 }, {  715,  684 }, {  777,  746 },
+    {  808,  777 }, {  839,  808 }, {  901,  870 }, {  932,  901 },
+    {  963,  932 }, {  158,  127 }, {  189,  158 }, {  220,  189 },
+    {  251,  220 }, {  282,  251 }, {  313,  282 }, {  344,  313 },
+    {  375,  344 }, {  406,  375 }, {  437,  406 }, {  468,  437 },
+    {  499,  468 }, {  530,  499 }, {  561,  530 }, {  592,  561 },
+    {  623,  592 }, {  654,  623 }, {  685,  654 }, {  716,  685 },
+    {  747,  716 }, {  778,  747 }, {  809,  778 }, {  840,  809 },
+    {  871,  840 }, {  902,  871 }, {  933,  902 }, {  964,  933 },
+    {  995,  964 }, {  190,  159 }, {  221,  190 }, {  252,  221 },
+    {  314,  283 }, {  345,  314 }, {  376,  345 }, {  438,  407 },
+    {  469,  438 }, {  500,  469 }, {  562,  531 }, {  593,  562 },
+    {  624,  593 }, {  686,  655 }, {  717,  686 }, {  748,  717 },
+    {  810,  779 }, {  841,  810 }, {  872,  841 }, {  934,  903 },
+    {  965,  934 }, {  996,  965 }, {  222,  191 }, {  253,  222 },
+    {  346,  315 }, {  377,  346 }, {  470,  439 }, {  501,  470 },
+    {  594,  563 }, {  625,  594 }, {  718,  687 }, {  749,  718 },
+    {  842,  811 }, {  873,  842 }, {  966,  935 }, {  997,  966 },
+    {  254,  223 }, {  378,  347 }, {  502,  471 }, {  626,  595 },
+    {  750,  719 }, {  874,  843 }, {  998,  967 }, {  283,  252 },
+    {  407,  376 }, {  531,  500 }, {  655,  624 }, {  779,  748 },
+    {  903,  872 }, {  284,  253 }, {  315,  284 }, {  408,  377 },
+    {  439,  408 }, {  532,  501 }, {  563,  532 }, {  656,  625 },
+    {  687,  656 }, {  780,  749 }, {  811,  780 }, {  904,  873 },
+    {  935,  904 }, {  285,  254 }, {  316,  285 }, {  347,  316 },
+    {  409,  378 }, {  440,  409 }, {  471,  440 }, {  533,  502 },
+    {  564,  533 }, {  595,  564 }, {  657,  626 }, {  688,  657 },
+    {  719,  688 }, {  781,  750 }, {  812,  781 }, {  843,  812 },
+    {  905,  874 }, {  936,  905 }, {  967,  936 }, {  286,  255 },
+    {  317,  286 }, {  348,  317 }, {  379,  348 }, {  410,  379 },
+    {  441,  410 }, {  472,  441 }, {  503,  472 }, {  534,  503 },
+    {  565,  534 }, {  596,  565 }, {  627,  596 }, {  658,  627 },
+    {  689,  658 }, {  720,  689 }, {  751,  720 }, {  782,  751 },
+    {  813,  782 }, {  844,  813 }, {  875,  844 }, {  906,  875 },
+    {  937,  906 }, {  968,  937 }, {  999,  968 }, {  318,  287 },
+    {  349,  318 }, {  380,  349 }, {  442,  411 }, {  473,  442 },
+    {  504,  473 }, {  566,  535 }, {  597,  566 }, {  628,  597 },
+    {  690,  659 }, {  721,  690 }, {  752,  721 }, {  814,  783 },
+    {  845,  814 }, {  876,  845 }, {  938,  907 }, {  969,  938 },
+    { 1000,  969 }, {  350,  319 }, {  381,  350 }, {  474,  443 },
+    {  505,  474 }, {  598,  567 }, {  629,  598 }, {  722,  691 },
+    {  753,  722 }, {  846,  815 }, {  877,  846 }, {  970,  939 },
+    { 1001,  970 }, {  382,  351 }, {  506,  475 }, {  630,  599 },
+    {  754,  723 }, {  878,  847 }, { 1002,  971 }, {  411,  380 },
+    {  535,  504 }, {  659,  628 }, {  783,  752 }, {  907,  876 },
+    {  412,  381 }, {  443,  412 }, {  536,  505 }, {  567,  536 },
+    {  660,  629 }, {  691,  660 }, {  784,  753 }, {  815,  784 },
+    {  908,  877 }, {  939,  908 }, {  413,  382 }, {  444,  413 },
+    {  475,  444 }, {  537,  506 }, {  568,  537 }, {  599,  568 },
+    {  661,  630 }, {  692,  661 }, {  723,  692 }, {  785,  754 },
+    {  816,  785 }, {  847,  816 }, {  909,  878 }, {  940,  909 },
+    {  971,  940 }, {  414,  383 }, {  445,  414 }, {  476,  445 },
+    {  507,  476 }, {  538,  507 }, {  569,  538 }, {  600,  569 },
+    {  631,  600 }, {  662,  631 }, {  693,  662 }, {  724,  693 },
+    {  755,  724 }, {  786,  755 }, {  817,  786 }, {  848,  817 },
+    {  879,  848 }, {  910,  879 }, {  941,  910 }, {  972,  941 },
+    { 1003,  972 }, {  446,  415 }, {  477,  446 }, {  508,  477 },
+    {  570,  539 }, {  601,  570 }, {  632,  601 }, {  694,  663 },
+    {  725,  694 }, {  756,  725 }, {  818,  787 }, {  849,  818 },
+    {  880,  849 }, {  942,  911 }, {  973,  942 }, { 1004,  973 },
+    {  478,  447 }, {  509,  478 }, {  602,  571 }, {  633,  602 },
+    {  726,  695 }, {  757,  726 }, {  850,  819 }, {  881,  850 },
+    {  974,  943 }, { 1005,  974 }, {  510,  479 }, {  634,  603 },
+    {  758,  727 }, {  882,  851 }, { 1006,  975 }, {  539,  508 },
+    {  663,  632 }, {  787,  756 }, {  911,  880 }, {  540,  509 },
+    {  571,  540 }, {  664,  633 }, {  695,  664 }, {  788,  757 },
+    {  819,  788 }, {  912,  881 }, {  943,  912 }, {  541,  510 },
+    {  572,  541 }, {  603,  572 }, {  665,  634 }, {  696,  665 },
+    {  727,  696 }, {  789,  758 }, {  820,  789 }, {  851,  820 },
+    {  913,  882 }, {  944,  913 }, {  975,  944 }, {  542,  511 },
+    {  573,  542 }, {  604,  573 }, {  635,  604 }, {  666,  635 },
+    {  697,  666 }, {  728,  697 }, {  759,  728 }, {  790,  759 },
+    {  821,  790 }, {  852,  821 }, {  883,  852 }, {  914,  883 },
+    {  945,  914 }, {  976,  945 }, { 1007,  976 }, {  574,  543 },
+    {  605,  574 }, {  636,  605 }, {  698,  667 }, {  729,  698 },
+    {  760,  729 }, {  822,  791 }, {  853,  822 }, {  884,  853 },
+    {  946,  915 }, {  977,  946 }, { 1008,  977 }, {  606,  575 },
+    {  637,  606 }, {  730,  699 }, {  761,  730 }, {  854,  823 },
+    {  885,  854 }, {  978,  947 }, { 1009,  978 }, {  638,  607 },
+    {  762,  731 }, {  886,  855 }, { 1010,  979 }, {  667,  636 },
+    {  791,  760 }, {  915,  884 }, {  668,  637 }, {  699,  668 },
+    {  792,  761 }, {  823,  792 }, {  916,  885 }, {  947,  916 },
+    {  669,  638 }, {  700,  669 }, {  731,  700 }, {  793,  762 },
+    {  824,  793 }, {  855,  824 }, {  917,  886 }, {  948,  917 },
+    {  979,  948 }, {  670,  639 }, {  701,  670 }, {  732,  701 },
+    {  763,  732 }, {  794,  763 }, {  825,  794 }, {  856,  825 },
+    {  887,  856 }, {  918,  887 }, {  949,  918 }, {  980,  949 },
+    { 1011,  980 }, {  702,  671 }, {  733,  702 }, {  764,  733 },
+    {  826,  795 }, {  857,  826 }, {  888,  857 }, {  950,  919 },
+    {  981,  950 }, { 1012,  981 }, {  734,  703 }, {  765,  734 },
+    {  858,  827 }, {  889,  858 }, {  982,  951 }, { 1013,  982 },
+    {  766,  735 }, {  890,  859 }, { 1014,  983 }, {  795,  764 },
+    {  919,  888 }, {  796,  765 }, {  827,  796 }, {  920,  889 },
+    {  951,  920 }, {  797,  766 }, {  828,  797 }, {  859,  828 },
+    {  921,  890 }, {  952,  921 }, {  983,  952 }, {  798,  767 },
+    {  829,  798 }, {  860,  829 }, {  891,  860 }, {  922,  891 },
+    {  953,  922 }, {  984,  953 }, { 1015,  984 }, {  830,  799 },
+    {  861,  830 }, {  892,  861 }, {  954,  923 }, {  985,  954 },
+    { 1016,  985 }, {  862,  831 }, {  893,  862 }, {  986,  955 },
+    { 1017,  986 }, {  894,  863 }, { 1018,  987 }, {  923,  892 },
+    {  924,  893 }, {  955,  924 }, {  925,  894 }, {  956,  925 },
+    {  987,  956 }, {  926,  895 }, {  957,  926 }, {  988,  957 },
+    { 1019,  988 }, {  958,  927 }, {  989,  958 }, { 1020,  989 },
+    {  990,  959 }, { 1021,  990 }, { 1022,  991 }, {    0,    0 },
+};
+
+static const int16_t (* const vp9_scans_nb[5][4])[2] = { /* [table family][slot 0..3] -> one of the *_nb tables of int16_t pairs */
+    { /* row 0: 4x4 tables, slots = default, col, row, default */
+        vp9_default_scan_4x4_nb, vp9_col_scan_4x4_nb,
+        vp9_row_scan_4x4_nb, vp9_default_scan_4x4_nb
+    }, { /* row 1: 8x8 tables, same slot layout */
+        vp9_default_scan_8x8_nb, vp9_col_scan_8x8_nb,
+        vp9_row_scan_8x8_nb, vp9_default_scan_8x8_nb
+    }, { /* row 2: 16x16 tables, same slot layout */
+        vp9_default_scan_16x16_nb, vp9_col_scan_16x16_nb,
+        vp9_row_scan_16x16_nb, vp9_default_scan_16x16_nb
+    }, { /* row 3: 32x32 has only a default table, repeated in every slot */
+        vp9_default_scan_32x32_nb, vp9_default_scan_32x32_nb,
+        vp9_default_scan_32x32_nb, vp9_default_scan_32x32_nb
+    }, { // lossless: the 4x4 default table is used in every slot
+        vp9_default_scan_4x4_nb, vp9_default_scan_4x4_nb,
+        vp9_default_scan_4x4_nb, vp9_default_scan_4x4_nb
+    }
+}; /* NOTE(review): what the slot index selects (presumably a transform type) is not visible here - confirm at the lookup site */
+
+static const uint8_t vp9_model_pareto8[256][8] = { /* 256 rows of 8 bytes; NOTE(review): presumably a Pareto-model lookup (per the name), row index semantics not visible here - confirm at the use site */
+    {   6,  86, 128,  11,  87,  42,  91,  52 }, /* NOTE(review): row 0 repeats the values of row 2 - presumably intentional, confirm */
+    {   3,  86, 128,   6,  86,  23,  88,  29 },
+    {   6,  86, 128,  11,  87,  42,  91,  52 },
+    {   9,  86, 129,  17,  88,  61,  94,  76 },
+    {  12,  86, 129,  22,  88,  77,  97,  93 },
+    {  15,  87, 129,  28,  89,  93, 100, 110 },
+    {  17,  87, 129,  33,  90, 105, 103, 123 },
+    {  20,  88, 130,  38,  91, 118, 106, 136 },
+    {  23,  88, 130,  43,  91, 128, 108, 146 },
+    {  26,  89, 131,  48,  92, 139, 111, 156 },
+    {  28,  89, 131,  53,  93, 147, 114, 163 },
+    {  31,  90, 131,  58,  94, 156, 117, 171 },
+    {  34,  90, 131,  62,  94, 163, 119, 177 },
+    {  37,  90, 132,  66,  95, 171, 122, 184 },
+    {  39,  90, 132,  70,  96, 177, 124, 189 },
+    {  42,  91, 132,  75,  97, 183, 127, 194 },
+    {  44,  91, 132,  79,  97, 188, 129, 198 },
+    {  47,  92, 133,  83,  98, 193, 132, 202 },
+    {  49,  92, 133,  86,  99, 197, 134, 205 },
+    {  52,  93, 133,  90, 100, 201, 137, 208 },
+    {  54,  93, 133,  94, 100, 204, 139, 211 },
+    {  57,  94, 134,  98, 101, 208, 142, 214 },
+    {  59,  94, 134, 101, 102, 211, 144, 216 },
+    {  62,  94, 135, 105, 103, 214, 146, 218 },
+    {  64,  94, 135, 108, 103, 216, 148, 220 },
+    {  66,  95, 135, 111, 104, 219, 151, 222 },
+    {  68,  95, 135, 114, 105, 221, 153, 223 },
+    {  71,  96, 136, 117, 106, 224, 155, 225 },
+    {  73,  96, 136, 120, 106, 225, 157, 226 },
+    {  76,  97, 136, 123, 107, 227, 159, 228 },
+    {  78,  97, 136, 126, 108, 229, 160, 229 },
+    {  80,  98, 137, 129, 109, 231, 162, 231 },
+    {  82,  98, 137, 131, 109, 232, 164, 232 },
+    {  84,  98, 138, 134, 110, 234, 166, 233 },
+    {  86,  98, 138, 137, 111, 235, 168, 234 },
+    {  89,  99, 138, 140, 112, 236, 170, 235 },
+    {  91,  99, 138, 142, 112, 237, 171, 235 },
+    {  93, 100, 139, 145, 113, 238, 173, 236 },
+    {  95, 100, 139, 147, 114, 239, 174, 237 },
+    {  97, 101, 140, 149, 115, 240, 176, 238 },
+    {  99, 101, 140, 151, 115, 241, 177, 238 },
+    { 101, 102, 140, 154, 116, 242, 179, 239 },
+    { 103, 102, 140, 156, 117, 242, 180, 239 },
+    { 105, 103, 141, 158, 118, 243, 182, 240 },
+    { 107, 103, 141, 160, 118, 243, 183, 240 },
+    { 109, 104, 141, 162, 119, 244, 185, 241 },
+    { 111, 104, 141, 164, 119, 244, 186, 241 },
+    { 113, 104, 142, 166, 120, 245, 187, 242 },
+    { 114, 104, 142, 168, 121, 245, 188, 242 },
+    { 116, 105, 143, 170, 122, 246, 190, 243 },
+    { 118, 105, 143, 171, 122, 246, 191, 243 },
+    { 120, 106, 143, 173, 123, 247, 192, 244 },
+    { 121, 106, 143, 175, 124, 247, 193, 244 },
+    { 123, 107, 144, 177, 125, 248, 195, 244 },
+    { 125, 107, 144, 178, 125, 248, 196, 244 },
+    { 127, 108, 145, 180, 126, 249, 197, 245 },
+    { 128, 108, 145, 181, 127, 249, 198, 245 },
+    { 130, 109, 145, 183, 128, 249, 199, 245 },
+    { 132, 109, 145, 184, 128, 249, 200, 245 },
+    { 134, 110, 146, 186, 129, 250, 201, 246 },
+    { 135, 110, 146, 187, 130, 250, 202, 246 },
+    { 137, 111, 147, 189, 131, 251, 203, 246 },
+    { 138, 111, 147, 190, 131, 251, 204, 246 },
+    { 140, 112, 147, 192, 132, 251, 205, 247 },
+    { 141, 112, 147, 193, 132, 251, 206, 247 },
+    { 143, 113, 148, 194, 133, 251, 207, 247 },
+    { 144, 113, 148, 195, 134, 251, 207, 247 },
+    { 146, 114, 149, 197, 135, 252, 208, 248 },
+    { 147, 114, 149, 198, 135, 252, 209, 248 },
+    { 149, 115, 149, 199, 136, 252, 210, 248 },
+    { 150, 115, 149, 200, 137, 252, 210, 248 },
+    { 152, 115, 150, 201, 138, 252, 211, 248 },
+    { 153, 115, 150, 202, 138, 252, 212, 248 },
+    { 155, 116, 151, 204, 139, 253, 213, 249 },
+    { 156, 116, 151, 205, 139, 253, 213, 249 },
+    { 158, 117, 151, 206, 140, 253, 214, 249 },
+    { 159, 117, 151, 207, 141, 253, 215, 249 },
+    { 161, 118, 152, 208, 142, 253, 216, 249 },
+    { 162, 118, 152, 209, 142, 253, 216, 249 },
+    { 163, 119, 153, 210, 143, 253, 217, 249 },
+    { 164, 119, 153, 211, 143, 253, 217, 249 },
+    { 166, 120, 153, 212, 144, 254, 218, 250 },
+    { 167, 120, 153, 212, 145, 254, 219, 250 },
+    { 168, 121, 154, 213, 146, 254, 220, 250 },
+    { 169, 121, 154, 214, 146, 254, 220, 250 },
+    { 171, 122, 155, 215, 147, 254, 221, 250 },
+    { 172, 122, 155, 216, 147, 254, 221, 250 },
+    { 173, 123, 155, 217, 148, 254, 222, 250 },
+    { 174, 123, 155, 217, 149, 254, 222, 250 },
+    { 176, 124, 156, 218, 150, 254, 223, 250 },
+    { 177, 124, 156, 219, 150, 254, 223, 250 },
+    { 178, 125, 157, 220, 151, 254, 224, 251 },
+    { 179, 125, 157, 220, 151, 254, 224, 251 },
+    { 180, 126, 157, 221, 152, 254, 225, 251 },
+    { 181, 126, 157, 221, 152, 254, 225, 251 },
+    { 183, 127, 158, 222, 153, 254, 226, 251 },
+    { 184, 127, 158, 223, 154, 254, 226, 251 },
+    { 185, 128, 159, 224, 155, 255, 227, 251 },
+    { 186, 128, 159, 224, 155, 255, 227, 251 },
+    { 187, 129, 160, 225, 156, 255, 228, 251 },
+    { 188, 130, 160, 225, 156, 255, 228, 251 },
+    { 189, 131, 160, 226, 157, 255, 228, 251 },
+    { 190, 131, 160, 226, 158, 255, 228, 251 },
+    { 191, 132, 161, 227, 159, 255, 229, 251 },
+    { 192, 132, 161, 227, 159, 255, 229, 251 },
+    { 193, 133, 162, 228, 160, 255, 230, 252 },
+    { 194, 133, 162, 229, 160, 255, 230, 252 },
+    { 195, 134, 163, 230, 161, 255, 231, 252 },
+    { 196, 134, 163, 230, 161, 255, 231, 252 },
+    { 197, 135, 163, 231, 162, 255, 231, 252 },
+    { 198, 135, 163, 231, 162, 255, 231, 252 },
+    { 199, 136, 164, 232, 163, 255, 232, 252 },
+    { 200, 136, 164, 232, 164, 255, 232, 252 },
+    { 201, 137, 165, 233, 165, 255, 233, 252 },
+    { 201, 137, 165, 233, 165, 255, 233, 252 },
+    { 202, 138, 166, 233, 166, 255, 233, 252 },
+    { 203, 138, 166, 233, 166, 255, 233, 252 },
+    { 204, 139, 166, 234, 167, 255, 234, 252 },
+    { 205, 139, 166, 234, 167, 255, 234, 252 },
+    { 206, 140, 167, 235, 168, 255, 235, 252 },
+    { 206, 140, 167, 235, 168, 255, 235, 252 },
+    { 207, 141, 168, 236, 169, 255, 235, 252 },
+    { 208, 141, 168, 236, 170, 255, 235, 252 },
+    { 209, 142, 169, 237, 171, 255, 236, 252 },
+    { 209, 143, 169, 237, 171, 255, 236, 252 },
+    { 210, 144, 169, 237, 172, 255, 236, 252 },
+    { 211, 144, 169, 237, 172, 255, 236, 252 },
+    { 212, 145, 170, 238, 173, 255, 237, 252 },
+    { 213, 145, 170, 238, 173, 255, 237, 252 },
+    { 214, 146, 171, 239, 174, 255, 237, 253 },
+    { 214, 146, 171, 239, 174, 255, 237, 253 },
+    { 215, 147, 172, 240, 175, 255, 238, 253 },
+    { 215, 147, 172, 240, 175, 255, 238, 253 },
+    { 216, 148, 173, 240, 176, 255, 238, 253 },
+    { 217, 148, 173, 240, 176, 255, 238, 253 },
+    { 218, 149, 173, 241, 177, 255, 239, 253 },
+    { 218, 149, 173, 241, 178, 255, 239, 253 },
+    { 219, 150, 174, 241, 179, 255, 239, 253 },
+    { 219, 151, 174, 241, 179, 255, 239, 253 },
+    { 220, 152, 175, 242, 180, 255, 240, 253 },
+    { 221, 152, 175, 242, 180, 255, 240, 253 },
+    { 222, 153, 176, 242, 181, 255, 240, 253 },
+    { 222, 153, 176, 242, 181, 255, 240, 253 },
+    { 223, 154, 177, 243, 182, 255, 240, 253 },
+    { 223, 154, 177, 243, 182, 255, 240, 253 },
+    { 224, 155, 178, 244, 183, 255, 241, 253 },
+    { 224, 155, 178, 244, 183, 255, 241, 253 },
+    { 225, 156, 178, 244, 184, 255, 241, 253 },
+    { 225, 157, 178, 244, 184, 255, 241, 253 },
+    { 226, 158, 179, 244, 185, 255, 242, 253 },
+    { 227, 158, 179, 244, 185, 255, 242, 253 },
+    { 228, 159, 180, 245, 186, 255, 242, 253 },
+    { 228, 159, 180, 245, 186, 255, 242, 253 },
+    { 229, 160, 181, 245, 187, 255, 242, 253 },
+    { 229, 160, 181, 245, 187, 255, 242, 253 },
+    { 230, 161, 182, 246, 188, 255, 243, 253 },
+    { 230, 162, 182, 246, 188, 255, 243, 253 },
+    { 231, 163, 183, 246, 189, 255, 243, 253 },
+    { 231, 163, 183, 246, 189, 255, 243, 253 },
+    { 232, 164, 184, 247, 190, 255, 243, 253 },
+    { 232, 164, 184, 247, 190, 255, 243, 253 },
+    { 233, 165, 185, 247, 191, 255, 244, 253 },
+    { 233, 165, 185, 247, 191, 255, 244, 253 },
+    { 234, 166, 185, 247, 192, 255, 244, 253 },
+    { 234, 167, 185, 247, 192, 255, 244, 253 },
+    { 235, 168, 186, 248, 193, 255, 244, 253 },
+    { 235, 168, 186, 248, 193, 255, 244, 253 },
+    { 236, 169, 187, 248, 194, 255, 244, 253 },
+    { 236, 169, 187, 248, 194, 255, 244, 253 },
+    { 236, 170, 188, 248, 195, 255, 245, 253 },
+    { 236, 170, 188, 248, 195, 255, 245, 253 },
+    { 237, 171, 189, 249, 196, 255, 245, 254 },
+    { 237, 172, 189, 249, 196, 255, 245, 254 },
+    { 238, 173, 190, 249, 197, 255, 245, 254 },
+    { 238, 173, 190, 249, 197, 255, 245, 254 },
+    { 239, 174, 191, 249, 198, 255, 245, 254 },
+    { 239, 174, 191, 249, 198, 255, 245, 254 },
+    { 240, 175, 192, 249, 199, 255, 246, 254 },
+    { 240, 176, 192, 249, 199, 255, 246, 254 },
+    { 240, 177, 193, 250, 200, 255, 246, 254 },
+    { 240, 177, 193, 250, 200, 255, 246, 254 },
+    { 241, 178, 194, 250, 201, 255, 246, 254 },
+    { 241, 178, 194, 250, 201, 255, 246, 254 },
+    { 242, 179, 195, 250, 202, 255, 246, 254 },
+    { 242, 180, 195, 250, 202, 255, 246, 254 },
+    { 242, 181, 196, 250, 203, 255, 247, 254 },
+    { 242, 181, 196, 250, 203, 255, 247, 254 },
+    { 243, 182, 197, 251, 204, 255, 247, 254 },
+    { 243, 183, 197, 251, 204, 255, 247, 254 },
+    { 244, 184, 198, 251, 205, 255, 247, 254 },
+    { 244, 184, 198, 251, 205, 255, 247, 254 },
+    { 244, 185, 199, 251, 206, 255, 247, 254 },
+    { 244, 185, 199, 251, 206, 255, 247, 254 },
+    { 245, 186, 200, 251, 207, 255, 247, 254 },
+    { 245, 187, 200, 251, 207, 255, 247, 254 },
+    { 246, 188, 201, 252, 207, 255, 248, 254 },
+    { 246, 188, 201, 252, 207, 255, 248, 254 },
+    { 246, 189, 202, 252, 208, 255, 248, 254 },
+    { 246, 190, 202, 252, 208, 255, 248, 254 },
+    { 247, 191, 203, 252, 209, 255, 248, 254 },
+    { 247, 191, 203, 252, 209, 255, 248, 254 },
+    { 247, 192, 204, 252, 210, 255, 248, 254 },
+    { 247, 193, 204, 252, 210, 255, 248, 254 },
+    { 248, 194, 205, 252, 211, 255, 248, 254 },
+    { 248, 194, 205, 252, 211, 255, 248, 254 },
+    { 248, 195, 206, 252, 212, 255, 249, 254 },
+    { 248, 196, 206, 252, 212, 255, 249, 254 },
+    { 249, 197, 207, 253, 213, 255, 249, 254 },
+    { 249, 197, 207, 253, 213, 255, 249, 254 },
+    { 249, 198, 208, 253, 214, 255, 249, 254 },
+    { 249, 199, 209, 253, 214, 255, 249, 254 },
+    { 250, 200, 210, 253, 215, 255, 249, 254 },
+    { 250, 200, 210, 253, 215, 255, 249, 254 },
+    { 250, 201, 211, 253, 215, 255, 249, 254 },
+    { 250, 202, 211, 253, 215, 255, 249, 254 },
+    { 250, 203, 212, 253, 216, 255, 249, 254 },
+    { 250, 203, 212, 253, 216, 255, 249, 254 },
+    { 251, 204, 213, 253, 217, 255, 250, 254 },
+    { 251, 205, 213, 253, 217, 255, 250, 254 },
+    { 251, 206, 214, 254, 218, 255, 250, 254 },
+    { 251, 206, 215, 254, 218, 255, 250, 254 },
+    { 252, 207, 216, 254, 219, 255, 250, 254 },
+    { 252, 208, 216, 254, 219, 255, 250, 254 },
+    { 252, 209, 217, 254, 220, 255, 250, 254 },
+    { 252, 210, 217, 254, 220, 255, 250, 254 },
+    { 252, 211, 218, 254, 221, 255, 250, 254 },
+    { 252, 212, 218, 254, 221, 255, 250, 254 },
+    { 253, 213, 219, 254, 222, 255, 250, 254 },
+    { 253, 213, 220, 254, 222, 255, 250, 254 },
+    { 253, 214, 221, 254, 223, 255, 250, 254 },
+    { 253, 215, 221, 254, 223, 255, 250, 254 },
+    { 253, 216, 222, 254, 224, 255, 251, 254 },
+    { 253, 217, 223, 254, 224, 255, 251, 254 },
+    { 253, 218, 224, 254, 225, 255, 251, 254 },
+    { 253, 219, 224, 254, 225, 255, 251, 254 },
+    { 254, 220, 225, 254, 225, 255, 251, 254 },
+    { 254, 221, 226, 254, 225, 255, 251, 254 },
+    { 254, 222, 227, 255, 226, 255, 251, 254 },
+    { 254, 223, 227, 255, 226, 255, 251, 254 },
+    { 254, 224, 228, 255, 227, 255, 251, 254 },
+    { 254, 225, 229, 255, 227, 255, 251, 254 },
+    { 254, 226, 230, 255, 228, 255, 251, 254 },
+    { 254, 227, 230, 255, 229, 255, 251, 254 },
+    { 255, 228, 231, 255, 230, 255, 251, 254 },
+    { 255, 229, 232, 255, 230, 255, 251, 254 },
+    { 255, 230, 233, 255, 231, 255, 252, 254 },
+    { 255, 231, 234, 255, 231, 255, 252, 254 },
+    { 255, 232, 235, 255, 232, 255, 252, 254 },
+    { 255, 233, 236, 255, 232, 255, 252, 254 },
+    { 255, 235, 237, 255, 233, 255, 252, 254 },
+    { 255, 236, 238, 255, 234, 255, 252, 254 },
+    { 255, 238, 240, 255, 235, 255, 252, 255 }, /* NOTE(review): last entry is 255 here but 254 in the rows on either side - verify against the reference table */
+    { 255, 239, 241, 255, 235, 255, 252, 254 },
+    { 255, 241, 243, 255, 236, 255, 252, 254 },
+    { 255, 243, 245, 255, 237, 255, 252, 254 },
+    { 255, 246, 247, 255, 239, 255, 253, 255 },
+};
+
+typedef struct { /* bundle of uint8_t tables (presumably probabilities, per the name); vp9_default_probs initializes it positionally, so field order must not change */
+    uint8_t y_mode[4][9];       /* rows by block size: < 8x8, < 16x16, < 32x32, >= 32x32 */
+    uint8_t uv_mode[10][9];     /* rows by y mode: v, h, dc, d45, d135, d117, d153, d63, d27, tm */
+    uint8_t filter[4][2];
+    uint8_t mv_mode[7][3];      /* rows 0..6: neighbour contexts as labelled in vp9_default_probs (zero/predicted/new mv, intra neighbours) */
+    uint8_t intra[4];
+    uint8_t comp[5];
+    uint8_t single_ref[5][2];
+    uint8_t comp_ref[5];
+    uint8_t tx32p[2][3];
+    uint8_t tx16p[2][2];
+    uint8_t tx8p[2];
+    uint8_t skip[3];
+    uint8_t mv_joint[3];
+    struct { /* tables for one motion vector component */
+        uint8_t sign;
+        uint8_t classes[10];
+        uint8_t class0;
+        uint8_t bits[10];
+        uint8_t class0_fp[2][3];
+        uint8_t fp[3];
+        uint8_t class0_hp;
+        uint8_t hp;
+    } mv_comp[2];               /* [0] = vertical component, [1] = horizontal component */
+    uint8_t partition[4][4][3]; /* [64x64->32x32, 32x32->16x16, 16x16->8x8, 8x8->4x4][a/l split state: neither, a only, l only, both] */
+} prob_context;
+
+static const prob_context vp9_default_probs = { /* positional initializer: groups below must stay in prob_context field order */
+    { /* y_mode */
+        {  65,  32,  18, 144, 162, 194,  41,  51,  98 } /* bsize < 8x8 */,
+        { 132,  68,  18, 165, 217, 196,  45,  40,  78 } /* bsize < 16x16 */,
+        { 173,  80,  19, 176, 240, 193,  64,  35,  46 } /* bsize < 32x32 */,
+        { 221, 135,  38, 194, 248, 121,  96,  85,  29 } /* bsize >= 32x32 */
+    }, { /* uv_mode */
+        {  48,  12, 154, 155, 139,  90,  34, 117, 119 } /* y = v */,
+        {  67,   6,  25, 204, 243, 158,  13,  21,  96 } /* y = h */,
+        { 120,   7,  76, 176, 208, 126,  28,  54, 103 } /* y = dc */,
+        {  97,   5,  44, 131, 176, 139,  48,  68,  97 } /* y = d45 */,
+        {  83,   5,  42, 156, 111, 152,  26,  49, 152 } /* y = d135 */,
+        {  80,   5,  58, 178,  74,  83,  33,  62, 145 } /* y = d117 */,
+        {  86,   5,  32, 154, 192, 168,  14,  22, 163 } /* y = d153 */,
+        {  77,   7,  64, 116, 132, 122,  37, 126, 120 } /* y = d63 */,
+        {  85,   5,  32, 156, 216, 148,  19,  29,  73 } /* y = d27 */,
+        { 101,  21, 107, 181, 192, 103,  19,  67, 125 } /* y = tm */
+    }, { /* filter */
+        { 235, 162, },
+        {  36, 255, },
+        {  34,   3, },
+        { 149, 144, },
+    }, { /* mv_mode */
+        {  2, 173,  34},  // 0 = both zero mv
+        {  7, 145,  85},  // 1 = one zero mv + one a predicted mv
+        {  7, 166,  63},  // 2 = two predicted mvs
+        {  7,  94,  66},  // 3 = one predicted/zero and one new mv
+        {  8,  64,  46},  // 4 = two new mvs
+        { 17,  81,  31},  // 5 = one intra neighbour + x
+        { 25,  29,  30},  // 6 = two intra neighbours
+    }, { /* intra */
+        9, 102, 187, 225
+    }, { /* comp */
+        239, 183, 119,  96,  41
+    }, { /* single_ref */
+        {  33,  16 },
+        {  77,  74 },
+        { 142, 142 },
+        { 172, 170 },
+        { 238, 247 }
+    }, { /* comp_ref */
+        50, 126, 123, 221, 226
+    }, { /* tx32p */
+        { 3, 136, 37, },
+        { 5,  52, 13, },
+    }, { /* tx16p */
+        { 20, 152, },
+        { 15, 101, },
+    }, { /* tx8p */
+        100, 66
+    }, { /* skip */
+        192, 128, 64
+    }, { /* mv_joint */
+        32, 64, 96
+    }, { /* mv_comp */
+        { /* mv vertical component */
+            128, /* sign */
+            { 224, 144, 192, 168, 192, 176, 192, 198, 198, 245 }, /* classes */
+            216, /* class0 */
+            { 136, 140, 148, 160, 176, 192, 224, 234, 234, 240}, /* bits */
+            { /* class0_fp */
+                { 128, 128, 64 },
+                {  96, 112, 64 }
+            },
+            { 64, 96, 64 }, /* fp */
+            160, /* class0_hp bit */
+            128, /* hp */
+        }, { /* mv horizontal component */
+            128, /* sign */
+            { 216, 128, 176, 160, 176, 176, 192, 198, 198, 208 }, /* classes */
+            208, /* class0 */
+            { 136, 140, 148, 160, 176, 192, 224, 234, 234, 240 }, /* bits */
+            { /* class0_fp */
+                { 128, 128, 64 },
+                {  96, 112, 64 }
+            },
+            { 64, 96, 64 }, /* fp */
+            160, /* class0_hp bit */
+            128, /* hp */
+        }
+    }, { /* partition */
+        { /* 64x64 -> 32x32 */
+            { 222,  34,  30 } /* a/l both not split */,
+            {  72,  16,  44 } /* a split, l not split */,
+            {  58,  32,  12 } /* l split, a not split */,
+            {  10,   7,   6 } /* a/l both split */,
+        }, { /* 32x32 -> 16x16 */
+            { 177,  58,  59 } /* a/l both not split */,
+            {  68,  26,  63 } /* a split, l not split */,
+            {  52,  79,  25 } /* l split, a not split */,
+            {  17,  14,  12 } /* a/l both split */,
+        }, { /* 16x16 -> 8x8 */
+            { 174,  73,  87 } /* a/l both not split */,
+            {  92,  41,  83 } /* a split, l not split */,
+            {  82,  99,  50 } /* l split, a not split */,
+            {  53,  39,  39 } /* a/l both split */,
+        }, { /* 8x8 -> 4x4 */
+            { 199, 122, 141 } /* a/l both not split */,
+            { 147,  63, 159 } /* a split, l not split */,
+            { 148, 133, 118 } /* l split, a not split */,
+            { 121, 104, 114 } /* a/l both split */,
+        }
+    },
+};
+
+static const uint8_t vp9_default_coef_probs[4][2][2][6][6][3] = {
+    { /* tx = 4x4 */
+        { /* block Type 0 */
+            { /* Intra */
+                { /* Coeff Band 0 */
+                    { 195,  29, 183 },
+                    {  84,  49, 136 },
+                    {   8,  42,  71 }
+                }, { /* Coeff Band 1 */
+                    {  31, 107, 169 },
+                    {  35,  99, 159 },
+                    {  17,  82, 140 },
+                    {   8,  66, 114 },
+                    {   2,  44,  76 },
+                    {   1,  19,  32 }
+                }, { /* Coeff Band 2 */
+                    {  40, 132, 201 },
+                    {  29, 114, 187 },
+                    {  13,  91, 157 },
+                    {   7,  75, 127 },
+                    {   3,  58,  95 },
+                    {   1,  28,  47 }
+                }, { /* Coeff Band 3 */
+                    {  69, 142, 221 },
+                    {  42, 122, 201 },
+                    {  15,  91, 159 },
+                    {   6,  67, 121 },
+                    {   1,  42,  77 },
+                    {   1,  17,  31 }
+                }, { /* Coeff Band 4 */
+                    { 102, 148, 228 },
+                    {  67, 117, 204 },
+                    {  17,  82, 154 },
+                    {   6,  59, 114 },
+                    {   2,  39,  75 },
+                    {   1,  15,  29 }
+                }, { /* Coeff Band 5 */
+                    { 156,  57, 233 },
+                    { 119,  57, 212 },
+                    {  58,  48, 163 },
+                    {  29,  40, 124 },
+                    {  12,  30,  81 },
+                    {   3,  12,  31 }
+                }
+            }, { /* Inter */
+                { /* Coeff Band 0 */
+                    { 191, 107, 226 },
+                    { 124, 117, 204 },
+                    {  25,  99, 155 }
+                }, { /* Coeff Band 1 */
+                    {  29, 148, 210 },
+                    {  37, 126, 194 },
+                    {   8,  93, 157 },
+                    {   2,  68, 118 },
+                    {   1,  39,  69 },
+                    {   1,  17,  33 }
+                }, { /* Coeff Band 2 */
+                    {  41, 151, 213 },
+                    {  27, 123, 193 },
+                    {   3,  82, 144 },
+                    {   1,  58, 105 },
+                    {   1,  32,  60 },
+                    {   1,  13,  26 }
+                }, { /* Coeff Band 3 */
+                    {  59, 159, 220 },
+                    {  23, 126, 198 },
+                    {   4,  88, 151 },
+                    {   1,  66, 114 },
+                    {   1,  38,  71 },
+                    {   1,  18,  34 }
+                }, { /* Coeff Band 4 */
+                    { 114, 136, 232 },
+                    {  51, 114, 207 },
+                    {  11,  83, 155 },
+                    {   3,  56, 105 },
+                    {   1,  33,  65 },
+                    {   1,  17,  34 }
+                }, { /* Coeff Band 5 */
+                    { 149,  65, 234 },
+                    { 121,  57, 215 },
+                    {  61,  49, 166 },
+                    {  28,  36, 114 },
+                    {  12,  25,  76 },
+                    {   3,  16,  42 }
+                }
+            }
+        }, { /* block Type 1 */
+            { /* Intra */
+                { /* Coeff Band 0 */
+                    { 214,  49, 220 },
+                    { 132,  63, 188 },
+                    {  42,  65, 137 }
+                }, { /* Coeff Band 1 */
+                    {  85, 137, 221 },
+                    { 104, 131, 216 },
+                    {  49, 111, 192 },
+                    {  21,  87, 155 },
+                    {   2,  49,  87 },
+                    {   1,  16,  28 }
+                }, { /* Coeff Band 2 */
+                    {  89, 163, 230 },
+                    {  90, 137, 220 },
+                    {  29, 100, 183 },
+                    {  10,  70, 135 },
+                    {   2,  42,  81 },
+                    {   1,  17,  33 }
+                }, { /* Coeff Band 3 */
+                    { 108, 167, 237 },
+                    {  55, 133, 222 },
+                    {  15,  97, 179 },
+                    {   4,  72, 135 },
+                    {   1,  45,  85 },
+                    {   1,  19,  38 }
+                }, { /* Coeff Band 4 */
+                    { 124, 146, 240 },
+                    {  66, 124, 224 },
+                    {  17,  88, 175 },
+                    {   4,  58, 122 },
+                    {   1,  36,  75 },
+                    {   1,  18,  37 }
+                }, { /* Coeff Band 5 */
+                    { 141,  79, 241 },
+                    { 126,  70, 227 },
+                    {  66,  58, 182 },
+                    {  30,  44, 136 },
+                    {  12,  34,  96 },
+                    {   2,  20,  47 }
+                }
+            }, { /* Inter */
+                { /* Coeff Band 0 */
+                    { 229,  99, 249 },
+                    { 143, 111, 235 },
+                    {  46, 109, 192 }
+                }, { /* Coeff Band 1 */
+                    {  82, 158, 236 },
+                    {  94, 146, 224 },
+                    {  25, 117, 191 },
+                    {   9,  87, 149 },
+                    {   3,  56,  99 },
+                    {   1,  33,  57 }
+                }, { /* Coeff Band 2 */
+                    {  83, 167, 237 },
+                    {  68, 145, 222 },
+                    {  10, 103, 177 },
+                    {   2,  72, 131 },
+                    {   1,  41,  79 },
+                    {   1,  20,  39 }
+                }, { /* Coeff Band 3 */
+                    {  99, 167, 239 },
+                    {  47, 141, 224 },
+                    {  10, 104, 178 },
+                    {   2,  73, 133 },
+                    {   1,  44,  85 },
+                    {   1,  22,  47 }
+                }, { /* Coeff Band 4 */
+                    { 127, 145, 243 },
+                    {  71, 129, 228 },
+                    {  17,  93, 177 },
+                    {   3,  61, 124 },
+                    {   1,  41,  84 },
+                    {   1,  21,  52 }
+                }, { /* Coeff Band 5 */
+                    { 157,  78, 244 },
+                    { 140,  72, 231 },
+                    {  69,  58, 184 },
+                    {  31,  44, 137 },
+                    {  14,  38, 105 },
+                    {   8,  23,  61 }
+                }
+            }
+        }
+    }, { /* tx = 8x8 */
+        { /* block Type 0 */
+            { /* Intra */
+                { /* Coeff Band 0 */
+                    { 125,  34, 187 },
+                    {  52,  41, 133 },
+                    {   6,  31,  56 }
+                }, { /* Coeff Band 1 */
+                    {  37, 109, 153 },
+                    {  51, 102, 147 },
+                    {  23,  87, 128 },
+                    {   8,  67, 101 },
+                    {   1,  41,  63 },
+                    {   1,  19,  29 }
+                }, { /* Coeff Band 2 */
+                    {  31, 154, 185 },
+                    {  17, 127, 175 },
+                    {   6,  96, 145 },
+                    {   2,  73, 114 },
+                    {   1,  51,  82 },
+                    {   1,  28,  45 }
+                }, { /* Coeff Band 3 */
+                    {  23, 163, 200 },
+                    {  10, 131, 185 },
+                    {   2,  93, 148 },
+                    {   1,  67, 111 },
+                    {   1,  41,  69 },
+                    {   1,  14,  24 }
+                }, { /* Coeff Band 4 */
+                    {  29, 176, 217 },
+                    {  12, 145, 201 },
+                    {   3, 101, 156 },
+                    {   1,  69, 111 },
+                    {   1,  39,  63 },
+                    {   1,  14,  23 }
+                }, { /* Coeff Band 5 */
+                    {  57, 192, 233 },
+                    {  25, 154, 215 },
+                    {   6, 109, 167 },
+                    {   3,  78, 118 },
+                    {   1,  48,  69 },
+                    {   1,  21,  29 }
+                }
+            }, { /* Inter */
+                { /* Coeff Band 0 */
+                    { 202, 105, 245 },
+                    { 108, 106, 216 },
+                    {  18,  90, 144 }
+                }, { /* Coeff Band 1 */
+                    {  33, 172, 219 },
+                    {  64, 149, 206 },
+                    {  14, 117, 177 },
+                    {   5,  90, 141 },
+                    {   2,  61,  95 },
+                    {   1,  37,  57 }
+                }, { /* Coeff Band 2 */
+                    {  33, 179, 220 },
+                    {  11, 140, 198 },
+                    {   1,  89, 148 },
+                    {   1,  60, 104 },
+                    {   1,  33,  57 },
+                    {   1,  12,  21 }
+                }, { /* Coeff Band 3 */
+                    {  30, 181, 221 },
+                    {   8, 141, 198 },
+                    {   1,  87, 145 },
+                    {   1,  58, 100 },
+                    {   1,  31,  55 },
+                    {   1,  12,  20 }
+                }, { /* Coeff Band 4 */
+                    {  32, 186, 224 },
+                    {   7, 142, 198 },
+                    {   1,  86, 143 },
+                    {   1,  58, 100 },
+                    {   1,  31,  55 },
+                    {   1,  12,  22 }
+                }, { /* Coeff Band 5 */
+                    {  57, 192, 227 },
+                    {  20, 143, 204 },
+                    {   3,  96, 154 },
+                    {   1,  68, 112 },
+                    {   1,  42,  69 },
+                    {   1,  19,  32 }
+                }
+            }
+        }, { /* block Type 1 */
+            { /* Intra */
+                { /* Coeff Band 0 */
+                    { 212,  35, 215 },
+                    { 113,  47, 169 },
+                    {  29,  48, 105 }
+                }, { /* Coeff Band 1 */
+                    {  74, 129, 203 },
+                    { 106, 120, 203 },
+                    {  49, 107, 178 },
+                    {  19,  84, 144 },
+                    {   4,  50,  84 },
+                    {   1,  15,  25 }
+                }, { /* Coeff Band 2 */
+                    {  71, 172, 217 },
+                    {  44, 141, 209 },
+                    {  15, 102, 173 },
+                    {   6,  76, 133 },
+                    {   2,  51,  89 },
+                    {   1,  24,  42 }
+                }, { /* Coeff Band 3 */
+                    {  64, 185, 231 },
+                    {  31, 148, 216 },
+                    {   8, 103, 175 },
+                    {   3,  74, 131 },
+                    {   1,  46,  81 },
+                    {   1,  18,  30 }
+                }, { /* Coeff Band 4 */
+                    {  65, 196, 235 },
+                    {  25, 157, 221 },
+                    {   5, 105, 174 },
+                    {   1,  67, 120 },
+                    {   1,  38,  69 },
+                    {   1,  15,  30 }
+                }, { /* Coeff Band 5 */
+                    {  65, 204, 238 },
+                    {  30, 156, 224 },
+                    {   7, 107, 177 },
+                    {   2,  70, 124 },
+                    {   1,  42,  73 },
+                    {   1,  18,  34 }
+                }
+            }, { /* Inter */
+                { /* Coeff Band 0 */
+                    { 225,  86, 251 },
+                    { 144, 104, 235 },
+                    {  42,  99, 181 }
+                }, { /* Coeff Band 1 */
+                    {  85, 175, 239 },
+                    { 112, 165, 229 },
+                    {  29, 136, 200 },
+                    {  12, 103, 162 },
+                    {   6,  77, 123 },
+                    {   2,  53,  84 }
+                }, { /* Coeff Band 2 */
+                    {  75, 183, 239 },
+                    {  30, 155, 221 },
+                    {   3, 106, 171 },
+                    {   1,  74, 128 },
+                    {   1,  44,  76 },
+                    {   1,  17,  28 }
+                }, { /* Coeff Band 3 */
+                    {  73, 185, 240 },
+                    {  27, 159, 222 },
+                    {   2, 107, 172 },
+                    {   1,  75, 127 },
+                    {   1,  42,  73 },
+                    {   1,  17,  29 }
+                }, { /* Coeff Band 4 */
+                    {  62, 190, 238 },
+                    {  21, 159, 222 },
+                    {   2, 107, 172 },
+                    {   1,  72, 122 },
+                    {   1,  40,  71 },
+                    {   1,  18,  32 }
+                }, { /* Coeff Band 5 */
+                    {  61, 199, 240 },
+                    {  27, 161, 226 },
+                    {   4, 113, 180 },
+                    {   1,  76, 129 },
+                    {   1,  46,  80 },
+                    {   1,  23,  41 }
+                }
+            }
+        }
+    }, { /* tx = 16x16 */
+        { /* block Type 0 */
+            { /* Intra */
+                { /* Coeff Band 0 */
+                    {   7,  27, 153 },
+                    {   5,  30,  95 },
+                    {   1,  16,  30 }
+                }, { /* Coeff Band 1 */
+                    {  50,  75, 127 },
+                    {  57,  75, 124 },
+                    {  27,  67, 108 },
+                    {  10,  54,  86 },
+                    {   1,  33,  52 },
+                    {   1,  12,  18 }
+                }, { /* Coeff Band 2 */
+                    {  43, 125, 151 },
+                    {  26, 108, 148 },
+                    {   7,  83, 122 },
+                    {   2,  59,  89 },
+                    {   1,  38,  60 },
+                    {   1,  17,  27 }
+                }, { /* Coeff Band 3 */
+                    {  23, 144, 163 },
+                    {  13, 112, 154 },
+                    {   2,  75, 117 },
+                    {   1,  50,  81 },
+                    {   1,  31,  51 },
+                    {   1,  14,  23 }
+                }, { /* Coeff Band 4 */
+                    {  18, 162, 185 },
+                    {   6, 123, 171 },
+                    {   1,  78, 125 },
+                    {   1,  51,  86 },
+                    {   1,  31,  54 },
+                    {   1,  14,  23 }
+                }, { /* Coeff Band 5 */
+                    {  15, 199, 227 },
+                    {   3, 150, 204 },
+                    {   1,  91, 146 },
+                    {   1,  55,  95 },
+                    {   1,  30,  53 },
+                    {   1,  11,  20 }
+                }
+            }, { /* Inter */
+                { /* Coeff Band 0 */
+                    {  19,  55, 240 },
+                    {  19,  59, 196 },
+                    {   3,  52, 105 }
+                }, { /* Coeff Band 1 */
+                    {  41, 166, 207 },
+                    { 104, 153, 199 },
+                    {  31, 123, 181 },
+                    {  14, 101, 152 },
+                    {   5,  72, 106 },
+                    {   1,  36,  52 }
+                }, { /* Coeff Band 2 */
+                    {  35, 176, 211 },
+                    {  12, 131, 190 },
+                    {   2,  88, 144 },
+                    {   1,  60, 101 },
+                    {   1,  36,  60 },
+                    {   1,  16,  28 }
+                }, { /* Coeff Band 3 */
+                    {  28, 183, 213 },
+                    {   8, 134, 191 },
+                    {   1,  86, 142 },
+                    {   1,  56,  96 },
+                    {   1,  30,  53 },
+                    {   1,  12,  20 }
+                }, { /* Coeff Band 4 */
+                    {  20, 190, 215 },
+                    {   4, 135, 192 },
+                    {   1,  84, 139 },
+                    {   1,  53,  91 },
+                    {   1,  28,  49 },
+                    {   1,  11,  20 }
+                }, { /* Coeff Band 5 */
+                    {  13, 196, 216 },
+                    {   2, 137, 192 },
+                    {   1,  86, 143 },
+                    {   1,  57,  99 },
+                    {   1,  32,  56 },
+                    {   1,  13,  24 }
+                }
+            }
+        }, { /* block Type 1 */
+            { /* Intra */
+                { /* Coeff Band 0 */
+                    { 211,  29, 217 },
+                    {  96,  47, 156 },
+                    {  22,  43,  87 }
+                }, { /* Coeff Band 1 */
+                    {  78, 120, 193 },
+                    { 111, 116, 186 },
+                    {  46, 102, 164 },
+                    {  15,  80, 128 },
+                    {   2,  49,  76 },
+                    {   1,  18,  28 }
+                }, { /* Coeff Band 2 */
+                    {  71, 161, 203 },
+                    {  42, 132, 192 },
+                    {  10,  98, 150 },
+                    {   3,  69, 109 },
+                    {   1,  44,  70 },
+                    {   1,  18,  29 }
+                }, { /* Coeff Band 3 */
+                    {  57, 186, 211 },
+                    {  30, 140, 196 },
+                    {   4,  93, 146 },
+                    {   1,  62, 102 },
+                    {   1,  38,  65 },
+                    {   1,  16,  27 }
+                }, { /* Coeff Band 4 */
+                    {  47, 199, 217 },
+                    {  14, 145, 196 },
+                    {   1,  88, 142 },
+                    {   1,  57,  98 },
+                    {   1,  36,  62 },
+                    {   1,  15,  26 }
+                }, { /* Coeff Band 5 */
+                    {  26, 219, 229 },
+                    {   5, 155, 207 },
+                    {   1,  94, 151 },
+                    {   1,  60, 104 },
+                    {   1,  36,  62 },
+                    {   1,  16,  28 }
+                }
+            }, { /* Inter */
+                { /* Coeff Band 0 */
+                    { 233,  29, 248 },
+                    { 146,  47, 220 },
+                    {  43,  52, 140 }
+                }, { /* Coeff Band 1 */
+                    { 100, 163, 232 },
+                    { 179, 161, 222 },
+                    {  63, 142, 204 },
+                    {  37, 113, 174 },
+                    {  26,  89, 137 },
+                    {  18,  68,  97 }
+                }, { /* Coeff Band 2 */
+                    {  85, 181, 230 },
+                    {  32, 146, 209 },
+                    {   7, 100, 164 },
+                    {   3,  71, 121 },
+                    {   1,  45,  77 },
+                    {   1,  18,  30 }
+                }, { /* Coeff Band 3 */
+                    {  65, 187, 230 },
+                    {  20, 148, 207 },
+                    {   2,  97, 159 },
+                    {   1,  68, 116 },
+                    {   1,  40,  70 },
+                    {   1,  14,  29 }
+                }, { /* Coeff Band 4 */
+                    {  40, 194, 227 },
+                    {   8, 147, 204 },
+                    {   1,  94, 155 },
+                    {   1,  65, 112 },
+                    {   1,  39,  66 },
+                    {   1,  14,  26 }
+                }, { /* Coeff Band 5 */
+                    {  16, 208, 228 },
+                    {   3, 151, 207 },
+                    {   1,  98, 160 },
+                    {   1,  67, 117 },
+                    {   1,  41,  74 },
+                    {   1,  17,  31 }
+                }
+            }
+        }
+    }, { /* tx = 32x32 */
+        { /* block Type 0 */
+            { /* Intra */
+                { /* Coeff Band 0 */
+                    {  17,  38, 140 },
+                    {   7,  34,  80 },
+                    {   1,  17,  29 }
+                }, { /* Coeff Band 1 */
+                    {  37,  75, 128 },
+                    {  41,  76, 128 },
+                    {  26,  66, 116 },
+                    {  12,  52,  94 },
+                    {   2,  32,  55 },
+                    {   1,  10,  16 }
+                }, { /* Coeff Band 2 */
+                    {  50, 127, 154 },
+                    {  37, 109, 152 },
+                    {  16,  82, 121 },
+                    {   5,  59,  85 },
+                    {   1,  35,  54 },
+                    {   1,  13,  20 }
+                }, { /* Coeff Band 3 */
+                    {  40, 142, 167 },
+                    {  17, 110, 157 },
+                    {   2,  71, 112 },
+                    {   1,  44,  72 },
+                    {   1,  27,  45 },
+                    {   1,  11,  17 }
+                }, { /* Coeff Band 4 */
+                    {  30, 175, 188 },
+                    {   9, 124, 169 },
+                    {   1,  74, 116 },
+                    {   1,  48,  78 },
+                    {   1,  30,  49 },
+                    {   1,  11,  18 }
+                }, { /* Coeff Band 5 */
+                    {  10, 222, 223 },
+                    {   2, 150, 194 },
+                    {   1,  83, 128 },
+                    {   1,  48,  79 },
+                    {   1,  27,  45 },
+                    {   1,  11,  17 }
+                }
+            }, { /* Inter */
+                { /* Coeff Band 0 */
+                    {  36,  41, 235 },
+                    {  29,  36, 193 },
+                    {  10,  27, 111 }
+                }, { /* Coeff Band 1 */
+                    {  85, 165, 222 },
+                    { 177, 162, 215 },
+                    { 110, 135, 195 },
+                    {  57, 113, 168 },
+                    {  23,  83, 120 },
+                    {  10,  49,  61 }
+                }, { /* Coeff Band 2 */
+                    {  85, 190, 223 },
+                    {  36, 139, 200 },
+                    {   5,  90, 146 },
+                    {   1,  60, 103 },
+                    {   1,  38,  65 },
+                    {   1,  18,  30 }
+                }, { /* Coeff Band 3 */
+                    {  72, 202, 223 },
+                    {  23, 141, 199 },
+                    {   2,  86, 140 },
+                    {   1,  56,  97 },
+                    {   1,  36,  61 },
+                    {   1,  16,  27 }
+                }, { /* Coeff Band 4 */
+                    {  55, 218, 225 },
+                    {  13, 145, 200 },
+                    {   1,  86, 141 },
+                    {   1,  57,  99 },
+                    {   1,  35,  61 },
+                    {   1,  13,  22 }
+                }, { /* Coeff Band 5 */
+                    {  15, 235, 212 },
+                    {   1, 132, 184 },
+                    {   1,  84, 139 },
+                    {   1,  57,  97 },
+                    {   1,  34,  56 },
+                    {   1,  14,  23 }
+                }
+            }
+        }, { /* block Type 1 */
+            { /* Intra */
+                { /* Coeff Band 0 */
+                    { 181,  21, 201 },
+                    {  61,  37, 123 },
+                    {  10,  38,  71 }
+                }, { /* Coeff Band 1 */
+                    {  47, 106, 172 },
+                    {  95, 104, 173 },
+                    {  42,  93, 159 },
+                    {  18,  77, 131 },
+                    {   4,  50,  81 },
+                    {   1,  17,  23 }
+                }, { /* Coeff Band 2 */
+                    {  62, 147, 199 },
+                    {  44, 130, 189 },
+                    {  28, 102, 154 },
+                    {  18,  75, 115 },
+                    {   2,  44,  65 },
+                    {   1,  12,  19 }
+                }, { /* Coeff Band 3 */
+                    {  55, 153, 210 },
+                    {  24, 130, 194 },
+                    {   3,  93, 146 },
+                    {   1,  61,  97 },
+                    {   1,  31,  50 },
+                    {   1,  10,  16 }
+                }, { /* Coeff Band 4 */
+                    {  49, 186, 223 },
+                    {  17, 148, 204 },
+                    {   1,  96, 142 },
+                    {   1,  53,  83 },
+                    {   1,  26,  44 },
+                    {   1,  11,  17 }
+                }, { /* Coeff Band 5 */
+                    {  13, 217, 212 },
+                    {   2, 136, 180 },
+                    {   1,  78, 124 },
+                    {   1,  50,  83 },
+                    {   1,  29,  49 },
+                    {   1,  14,  23 }
+                }
+            }, { /* Inter */
+                { /* Coeff Band 0 */
+                    { 197,  13, 247 },
+                    {  82,  17, 222 },
+                    {  25,  17, 162 }
+                }, { /* Coeff Band 1 */
+                    { 126, 186, 247 },
+                    { 234, 191, 243 },
+                    { 176, 177, 234 },
+                    { 104, 158, 220 },
+                    {  66, 128, 186 },
+                    {  55,  90, 137 }
+                }, { /* Coeff Band 2 */
+                    { 111, 197, 242 },
+                    {  46, 158, 219 },
+                    {   9, 104, 171 },
+                    {   2,  65, 125 },
+                    {   1,  44,  80 },
+                    {   1,  17,  91 }
+                }, { /* Coeff Band 3 */
+                    { 104, 208, 245 },
+                    {  39, 168, 224 },
+                    {   3, 109, 162 },
+                    {   1,  79, 124 },
+                    {   1,  50, 102 },
+                    {   1,  43, 102 }
+                }, { /* Coeff Band 4 */
+                    {  84, 220, 246 },
+                    {  31, 177, 231 },
+                    {   2, 115, 180 },
+                    {   1,  79, 134 },
+                    {   1,  55,  77 },
+                    {   1,  60,  79 }
+                }, { /* Coeff Band 5 */
+                    {  43, 243, 240 },
+                    {   8, 180, 217 },
+                    {   1, 115, 166 },
+                    {   1,  84, 121 },
+                    {   1,  51,  67 },
+                    {   1,  16,   6 }
+                }
+            }
+        }
+    }
+};
+
+enum MVJoint {      // values 0..3 in declaration order; used (negated) as the leaves of vp9_mv_joint_tree
+    MV_JOINT_ZERO,  // 0
+    MV_JOINT_H,     // 1 - NOTE(review): presumably "horizontal component only"; confirm against the MV reader
+    MV_JOINT_V,     // 2 - NOTE(review): presumably "vertical component only"; confirm against the MV reader
+    MV_JOINT_HV,    // 3 - NOTE(review): presumably "both components"; confirm against the MV reader
+};
+
+static const int8_t vp9_mv_joint_tree[3][2] = { // row = node, column = next bit; per the bit strings, entry <= 0 is a negated MVJoint leaf, > 0 the next row
+    { -MV_JOINT_ZERO, 1 },           // '0'   -> MV_JOINT_ZERO, otherwise row 1
+     { -MV_JOINT_H, 2 },             // '10'  -> MV_JOINT_H, otherwise row 2
+      { -MV_JOINT_V, -MV_JOINT_HV }, // '11x' -> MV_JOINT_V (x = 0) or MV_JOINT_HV (x = 1)
+};
+
+static const int8_t vp9_mv_class_tree[10][2] = { // 10 nodes, 11 leaves (0..10); row = node, column = next bit; entry <= 0 is a negated leaf, > 0 the next row
+    { -0, 1 },         // '0'       -> leaf 0, otherwise row 1
+     { -1, 2 },        // '10'      -> leaf 1, otherwise row 2
+      { 3, 4 },        // '11'      -> no leaf here: row 3 on 0, row 4 on 1
+       { -2, -3 },     // '110x'    -> leaf 2 or 3
+       { 5, 6 },       // '111'     -> no leaf here: row 5 on 0, row 6 on 1
+        { -4, -5 },    // '1110x'   -> leaf 4 or 5
+        { -6, 7 },     // '11110'   -> leaf 6, otherwise row 7
+         { 8, 9 },     // '11111'   -> no leaf here: row 8 on 0, row 9 on 1
+          { -7, -8 },  // '111110x' -> leaf 7 or 8
+          { -9, -10 }, // '111111x' -> leaf 9 or 10
+};
+
+static const int8_t vp9_mv_fp_tree[3][2] = { // 3 nodes, 4 leaves (0..3); row = node, column = next bit; NOTE(review): "fp" presumably = fractional-pel part, confirm
+    { -0, 1 },    // '0'   -> leaf 0, otherwise row 1
+     { -1, 2 },   // '10'  -> leaf 1, otherwise row 2
+      { -2, -3 }, // '11x' -> leaf 2 (x = 0) or leaf 3 (x = 1)
+};
 
 #endif /* AVCODEC_VP9DATA_H */
diff --git a/libavcodec/vp9dsp.c b/libavcodec/vp9dsp.c
index c83defe..54e77e2 100644
--- a/libavcodec/vp9dsp.c
+++ b/libavcodec/vp9dsp.c
@@ -4,2171 +4,38 @@
  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
  * Copyright (C) 2013 Clément Bœsch <u pkh me>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
-#include "libavutil/intreadwrite.h"
+#include "vp9dsp.h"
 
-#include "rnd_avg.h"
-#include "vp9.h"
-
-// FIXME see whether we can merge parts of this (perhaps at least 4x4 and 8x8)
-// back with h264pred.[ch]
-
-static void vert_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                       const uint8_t *left, const uint8_t *top)
-{
-    unsigned p4 = AV_RN32A(top);
-
-    AV_WN32A(dst + stride * 0, p4);
-    AV_WN32A(dst + stride * 1, p4);
-    AV_WN32A(dst + stride * 2, p4);
-    AV_WN32A(dst + stride * 3, p4);
-}
-
-static void vert_8x8_c(uint8_t *dst, ptrdiff_t stride,
-                       const uint8_t *left, const uint8_t *top)
-{
-    uint64_t p8 = AV_RN64A(top);
-    int y;
-
-    for (y = 0; y < 8; y++) {
-        AV_WN64A(dst, p8);
-        dst += stride;
-    }
-}
-
-static void vert_16x16_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    uint64_t p8a = AV_RN64A(top + 0), p8b = AV_RN64A(top + 8);
-    int y;
-
-    for (y = 0; y < 16; y++) {
-        AV_WN64A(dst + 0, p8a);
-        AV_WN64A(dst + 8, p8b);
-        dst += stride;
-    }
-}
-
-static void vert_32x32_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    uint64_t p8a = AV_RN64A(top + 0),  p8b = AV_RN64A(top + 8),
-             p8c = AV_RN64A(top + 16), p8d = AV_RN64A(top + 24);
-    int y;
-
-    for (y = 0; y < 32; y++) {
-        AV_WN64A(dst +  0, p8a);
-        AV_WN64A(dst +  8, p8b);
-        AV_WN64A(dst + 16, p8c);
-        AV_WN64A(dst + 24, p8d);
-        dst += stride;
-    }
-}
-
-static void hor_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                      const uint8_t *left, const uint8_t *top)
-{
-    AV_WN32A(dst + stride * 0, left[0] * 0x01010101U);
-    AV_WN32A(dst + stride * 1, left[1] * 0x01010101U);
-    AV_WN32A(dst + stride * 2, left[2] * 0x01010101U);
-    AV_WN32A(dst + stride * 3, left[3] * 0x01010101U);
-}
-
-static void hor_8x8_c(uint8_t *dst, ptrdiff_t stride,
-                      const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 8; y++) {
-        AV_WN64A(dst, left[y] * 0x0101010101010101ULL);
-        dst += stride;
-    }
-}
-
-static void hor_16x16_c(uint8_t *dst, ptrdiff_t stride,
-                        const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 16; y++) {
-        uint64_t p8 = left[y] * 0x0101010101010101ULL;
-
-        AV_WN64A(dst + 0, p8);
-        AV_WN64A(dst + 8, p8);
-        dst += stride;
-    }
-}
-
-static void hor_32x32_c(uint8_t *dst, ptrdiff_t stride,
-                        const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 32; y++) {
-        uint64_t p8 = left[y] * 0x0101010101010101ULL;
-
-        AV_WN64A(dst +  0, p8);
-        AV_WN64A(dst +  8, p8);
-        AV_WN64A(dst + 16, p8);
-        AV_WN64A(dst + 24, p8);
-        dst += stride;
-    }
-}
-
-static void tm_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                     const uint8_t *left, const uint8_t *top)
-{
-    int y, tl = top[-1];
-
-    for (y = 0; y < 4; y++) {
-        int l_m_tl = left[y] - tl;
-
-        dst[0] = av_clip_uint8(top[0] + l_m_tl);
-        dst[1] = av_clip_uint8(top[1] + l_m_tl);
-        dst[2] = av_clip_uint8(top[2] + l_m_tl);
-        dst[3] = av_clip_uint8(top[3] + l_m_tl);
-        dst   += stride;
-    }
-}
-
-static void tm_8x8_c(uint8_t *dst, ptrdiff_t stride,
-                     const uint8_t *left, const uint8_t *top)
-{
-    int y, tl = top[-1];
-
-    for (y = 0; y < 8; y++) {
-        int l_m_tl = left[y] - tl;
-
-        dst[0] = av_clip_uint8(top[0] + l_m_tl);
-        dst[1] = av_clip_uint8(top[1] + l_m_tl);
-        dst[2] = av_clip_uint8(top[2] + l_m_tl);
-        dst[3] = av_clip_uint8(top[3] + l_m_tl);
-        dst[4] = av_clip_uint8(top[4] + l_m_tl);
-        dst[5] = av_clip_uint8(top[5] + l_m_tl);
-        dst[6] = av_clip_uint8(top[6] + l_m_tl);
-        dst[7] = av_clip_uint8(top[7] + l_m_tl);
-        dst   += stride;
-    }
-}
-
-static void tm_16x16_c(uint8_t *dst, ptrdiff_t stride,
-                       const uint8_t *left, const uint8_t *top)
-{
-    int y, tl = top[-1];
-
-    for (y = 0; y < 16; y++) {
-        int l_m_tl = left[y] - tl;
-
-        dst[0]  = av_clip_uint8(top[0]  + l_m_tl);
-        dst[1]  = av_clip_uint8(top[1]  + l_m_tl);
-        dst[2]  = av_clip_uint8(top[2]  + l_m_tl);
-        dst[3]  = av_clip_uint8(top[3]  + l_m_tl);
-        dst[4]  = av_clip_uint8(top[4]  + l_m_tl);
-        dst[5]  = av_clip_uint8(top[5]  + l_m_tl);
-        dst[6]  = av_clip_uint8(top[6]  + l_m_tl);
-        dst[7]  = av_clip_uint8(top[7]  + l_m_tl);
-        dst[8]  = av_clip_uint8(top[8]  + l_m_tl);
-        dst[9]  = av_clip_uint8(top[9]  + l_m_tl);
-        dst[10] = av_clip_uint8(top[10] + l_m_tl);
-        dst[11] = av_clip_uint8(top[11] + l_m_tl);
-        dst[12] = av_clip_uint8(top[12] + l_m_tl);
-        dst[13] = av_clip_uint8(top[13] + l_m_tl);
-        dst[14] = av_clip_uint8(top[14] + l_m_tl);
-        dst[15] = av_clip_uint8(top[15] + l_m_tl);
-        dst    += stride;
-    }
-}
-
-static void tm_32x32_c(uint8_t *dst, ptrdiff_t stride,
-                       const uint8_t *left, const uint8_t *top)
-{
-    int y, tl = top[-1];
-
-    for (y = 0; y < 32; y++) {
-        int l_m_tl = left[y] - tl;
-
-        dst[0]  = av_clip_uint8(top[0]  + l_m_tl);
-        dst[1]  = av_clip_uint8(top[1]  + l_m_tl);
-        dst[2]  = av_clip_uint8(top[2]  + l_m_tl);
-        dst[3]  = av_clip_uint8(top[3]  + l_m_tl);
-        dst[4]  = av_clip_uint8(top[4]  + l_m_tl);
-        dst[5]  = av_clip_uint8(top[5]  + l_m_tl);
-        dst[6]  = av_clip_uint8(top[6]  + l_m_tl);
-        dst[7]  = av_clip_uint8(top[7]  + l_m_tl);
-        dst[8]  = av_clip_uint8(top[8]  + l_m_tl);
-        dst[9]  = av_clip_uint8(top[9]  + l_m_tl);
-        dst[10] = av_clip_uint8(top[10] + l_m_tl);
-        dst[11] = av_clip_uint8(top[11] + l_m_tl);
-        dst[12] = av_clip_uint8(top[12] + l_m_tl);
-        dst[13] = av_clip_uint8(top[13] + l_m_tl);
-        dst[14] = av_clip_uint8(top[14] + l_m_tl);
-        dst[15] = av_clip_uint8(top[15] + l_m_tl);
-        dst[16] = av_clip_uint8(top[16] + l_m_tl);
-        dst[17] = av_clip_uint8(top[17] + l_m_tl);
-        dst[18] = av_clip_uint8(top[18] + l_m_tl);
-        dst[19] = av_clip_uint8(top[19] + l_m_tl);
-        dst[20] = av_clip_uint8(top[20] + l_m_tl);
-        dst[21] = av_clip_uint8(top[21] + l_m_tl);
-        dst[22] = av_clip_uint8(top[22] + l_m_tl);
-        dst[23] = av_clip_uint8(top[23] + l_m_tl);
-        dst[24] = av_clip_uint8(top[24] + l_m_tl);
-        dst[25] = av_clip_uint8(top[25] + l_m_tl);
-        dst[26] = av_clip_uint8(top[26] + l_m_tl);
-        dst[27] = av_clip_uint8(top[27] + l_m_tl);
-        dst[28] = av_clip_uint8(top[28] + l_m_tl);
-        dst[29] = av_clip_uint8(top[29] + l_m_tl);
-        dst[30] = av_clip_uint8(top[30] + l_m_tl);
-        dst[31] = av_clip_uint8(top[31] + l_m_tl);
-        dst    += stride;
-    }
-}
-
-static void dc_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                     const uint8_t *left, const uint8_t *top)
-{
-    unsigned dc = 0x01010101U *
-                  ((left[0] + left[1] + left[2] + left[3] +
-                    top[0]  + top[1]  + top[2]  + top[3]  + 4) >> 3);
-
-    AV_WN32A(dst + stride * 0, dc);
-    AV_WN32A(dst + stride * 1, dc);
-    AV_WN32A(dst + stride * 2, dc);
-    AV_WN32A(dst + stride * 3, dc);
-}
-
-static void dc_8x8_c(uint8_t *dst, ptrdiff_t stride,
-                     const uint8_t *left, const uint8_t *top)
-{
-    uint64_t dc = 0x0101010101010101ULL *
-                  ((left[0] + left[1] + left[2] + left[3] +
-                    left[4] + left[5] + left[6] + left[7] +
-                    top[0]  + top[1]  + top[2]  + top[3]  +
-                    top[4]  + top[5]  + top[6]  + top[7]  + 8) >> 4);
-    int y;
-
-    for (y = 0; y < 8; y++) {
-        AV_WN64A(dst, dc);
-        dst += stride;
-    }
-}
-
-static void dc_16x16_c(uint8_t *dst, ptrdiff_t stride,
-                       const uint8_t *left, const uint8_t *top)
-{
-    uint64_t dc = 0x0101010101010101ULL *
-                  ((left[0]  + left[1]  + left[2]  + left[3]  +
-                    left[4]  + left[5]  + left[6]  + left[7]  +
-                    left[8]  + left[9]  + left[10] + left[11] +
-                    left[12] + left[13] + left[14] + left[15] +
-                    top[0]   + top[1]   + top[2]   + top[3]   +
-                    top[4]   + top[5]   + top[6]   + top[7]   +
-                    top[8]   + top[9]   + top[10]  + top[11]  +
-                    top[12]  + top[13]  + top[14]  + top[15]  + 16) >> 5);
-    int y;
-
-    for (y = 0; y < 16; y++) {
-        AV_WN64A(dst + 0, dc);
-        AV_WN64A(dst + 8, dc);
-        dst += stride;
-    }
-}
-
-static void dc_32x32_c(uint8_t *dst, ptrdiff_t stride,
-                       const uint8_t *left, const uint8_t *top)
-{
-    uint64_t dc = 0x0101010101010101ULL *
-                  ((left[0]  + left[1]  + left[2]  + left[3]  +
-                    left[4]  + left[5]  + left[6]  + left[7]  +
-                    left[8]  + left[9]  + left[10] + left[11] +
-                    left[12] + left[13] + left[14] + left[15] +
-                    left[16] + left[17] + left[18] + left[19] +
-                    left[20] + left[21] + left[22] + left[23] +
-                    left[24] + left[25] + left[26] + left[27] +
-                    left[28] + left[29] + left[30] + left[31] +
-                    top[0]   + top[1]   + top[2]   + top[3]   +
-                    top[4]   + top[5]   + top[6]   + top[7]   +
-                    top[8]   + top[9]   + top[10]  + top[11]  +
-                    top[12]  + top[13]  + top[14]  + top[15]  +
-                    top[16]  + top[17]  + top[18]  + top[19]  +
-                    top[20]  + top[21]  + top[22]  + top[23]  +
-                    top[24]  + top[25]  + top[26]  + top[27]  +
-                    top[28]  + top[29]  + top[30]  + top[31]  + 32) >> 6);
-    int y;
-
-    for (y = 0; y < 32; y++) {
-        AV_WN64A(dst +  0, dc);
-        AV_WN64A(dst +  8, dc);
-        AV_WN64A(dst + 16, dc);
-        AV_WN64A(dst + 24, dc);
-        dst += stride;
-    }
-}
-
-static void dc_left_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                          const uint8_t *left, const uint8_t *top)
-{
-    unsigned dc = 0x01010101U *
-                  ((left[0] + left[1] + left[2] + left[3] + 2) >> 2);
-
-    AV_WN32A(dst + stride * 0, dc);
-    AV_WN32A(dst + stride * 1, dc);
-    AV_WN32A(dst + stride * 2, dc);
-    AV_WN32A(dst + stride * 3, dc);
-}
-
-static void dc_left_8x8_c(uint8_t *dst, ptrdiff_t stride,
-                          const uint8_t *left, const uint8_t *top)
-{
-    uint64_t dc = 0x0101010101010101ULL *
-                  ((left[0] + left[1] + left[2] + left[3] +
-                    left[4] + left[5] + left[6] + left[7] + 4) >> 3);
-    int y;
-
-    for (y = 0; y < 8; y++) {
-        AV_WN64A(dst, dc);
-        dst += stride;
-    }
-}
-
-static void dc_left_16x16_c(uint8_t *dst, ptrdiff_t stride,
-                            const uint8_t *left, const uint8_t *top)
-{
-    uint64_t dc = 0x0101010101010101ULL *
-                  ((left[0]  + left[1]  + left[2]  + left[3]  +
-                    left[4]  + left[5]  + left[6]  + left[7]  +
-                    left[8]  + left[9]  + left[10] + left[11] +
-                    left[12] + left[13] + left[14] + left[15] + 8) >> 4);
-    int y;
-
-    for (y = 0; y < 16; y++) {
-        AV_WN64A(dst + 0, dc);
-        AV_WN64A(dst + 8, dc);
-        dst += stride;
-    }
-}
-
-static void dc_left_32x32_c(uint8_t *dst, ptrdiff_t stride,
-                            const uint8_t *left, const uint8_t *top)
-{
-    uint64_t dc = 0x0101010101010101ULL *
-                  ((left[0]  + left[1]  + left[2]  + left[3]  +
-                    left[4]  + left[5]  + left[6]  + left[7]  +
-                    left[8]  + left[9]  + left[10] + left[11] +
-                    left[12] + left[13] + left[14] + left[15] +
-                    left[16] + left[17] + left[18] + left[19] +
-                    left[20] + left[21] + left[22] + left[23] +
-                    left[24] + left[25] + left[26] + left[27] +
-                    left[28] + left[29] + left[30] + left[31] + 16) >> 5);
-    int y;
-
-    for (y = 0; y < 32; y++) {
-        AV_WN64A(dst +  0, dc);
-        AV_WN64A(dst +  8, dc);
-        AV_WN64A(dst + 16, dc);
-        AV_WN64A(dst + 24, dc);
-        dst += stride;
-    }
-}
-
-static void dc_top_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    unsigned dc = 0x01010101U * ((top[0] + top[1] + top[2] + top[3] + 2) >> 2);
-
-    AV_WN32A(dst + stride * 0, dc);
-    AV_WN32A(dst + stride * 1, dc);
-    AV_WN32A(dst + stride * 2, dc);
-    AV_WN32A(dst + stride * 3, dc);
-}
-
-static void dc_top_8x8_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    uint64_t dc = 0x0101010101010101ULL *
-                  ((top[0] + top[1] + top[2] + top[3] +
-                    top[4] + top[5] + top[6] + top[7] + 4) >> 3);
-    int y;
-
-    for (y = 0; y < 8; y++) {
-        AV_WN64A(dst, dc);
-        dst += stride;
-    }
-}
-
-static void dc_top_16x16_c(uint8_t *dst, ptrdiff_t stride,
-                           const uint8_t *left, const uint8_t *top)
-{
-    uint64_t dc = 0x0101010101010101ULL *
-                  ((top[0]  + top[1]  + top[2]  + top[3]  +
-                    top[4]  + top[5]  + top[6]  + top[7]  +
-                    top[8]  + top[9]  + top[10] + top[11] +
-                    top[12] + top[13] + top[14] + top[15] + 8) >> 4);
-    int y;
-
-    for (y = 0; y < 16; y++) {
-        AV_WN64A(dst + 0, dc);
-        AV_WN64A(dst + 8, dc);
-        dst += stride;
-    }
-}
-
-static void dc_top_32x32_c(uint8_t *dst, ptrdiff_t stride,
-                           const uint8_t *left, const uint8_t *top)
-{
-    uint64_t dc = 0x0101010101010101ULL *
-                  ((top[0]  + top[1]  + top[2]  + top[3]  +
-                    top[4]  + top[5]  + top[6]  + top[7]  +
-                    top[8]  + top[9]  + top[10] + top[11] +
-                    top[12] + top[13] + top[14] + top[15] +
-                    top[16] + top[17] + top[18] + top[19] +
-                    top[20] + top[21] + top[22] + top[23] +
-                    top[24] + top[25] + top[26] + top[27] +
-                    top[28] + top[29] + top[30] + top[31] + 16) >> 5);
-    int y;
-
-    for (y = 0; y < 32; y++) {
-        AV_WN64A(dst +  0, dc);
-        AV_WN64A(dst +  8, dc);
-        AV_WN64A(dst + 16, dc);
-        AV_WN64A(dst + 24, dc);
-        dst += stride;
-    }
-}
-
-static void dc_128_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    AV_WN32A(dst + stride * 0, 0x80808080U);
-    AV_WN32A(dst + stride * 1, 0x80808080U);
-    AV_WN32A(dst + stride * 2, 0x80808080U);
-    AV_WN32A(dst + stride * 3, 0x80808080U);
-}
-
-static void dc_128_8x8_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 8; y++) {
-        AV_WN64A(dst, 0x8080808080808080ULL);
-        dst += stride;
-    }
-}
-
-static void dc_128_16x16_c(uint8_t *dst, ptrdiff_t stride,
-                           const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 16; y++) {
-        AV_WN64A(dst + 0, 0x8080808080808080ULL);
-        AV_WN64A(dst + 8, 0x8080808080808080ULL);
-        dst += stride;
-    }
-}
-
-static void dc_128_32x32_c(uint8_t *dst, ptrdiff_t stride,
-                           const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 32; y++) {
-        AV_WN64A(dst +  0, 0x8080808080808080ULL);
-        AV_WN64A(dst +  8, 0x8080808080808080ULL);
-        AV_WN64A(dst + 16, 0x8080808080808080ULL);
-        AV_WN64A(dst + 24, 0x8080808080808080ULL);
-        dst += stride;
-    }
-}
-
-static void dc_127_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    AV_WN32A(dst + stride * 0, 0x7F7F7F7FU);
-    AV_WN32A(dst + stride * 1, 0x7F7F7F7FU);
-    AV_WN32A(dst + stride * 2, 0x7F7F7F7FU);
-    AV_WN32A(dst + stride * 3, 0x7F7F7F7FU);
-}
-
-static void dc_127_8x8_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 8; y++) {
-        AV_WN64A(dst, 0x7F7F7F7F7F7F7F7FULL);
-        dst += stride;
-    }
-}
-
-static void dc_127_16x16_c(uint8_t *dst, ptrdiff_t stride,
-                           const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 16; y++) {
-        AV_WN64A(dst + 0, 0x7F7F7F7F7F7F7F7FULL);
-        AV_WN64A(dst + 8, 0x7F7F7F7F7F7F7F7FULL);
-        dst += stride;
-    }
-}
-
-static void dc_127_32x32_c(uint8_t *dst, ptrdiff_t stride,
-                           const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 32; y++) {
-        AV_WN64A(dst +  0, 0x7F7F7F7F7F7F7F7FULL);
-        AV_WN64A(dst +  8, 0x7F7F7F7F7F7F7F7FULL);
-        AV_WN64A(dst + 16, 0x7F7F7F7F7F7F7F7FULL);
-        AV_WN64A(dst + 24, 0x7F7F7F7F7F7F7F7FULL);
-        dst += stride;
-    }
-}
-
-static void dc_129_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    AV_WN32A(dst + stride * 0, 0x81818181U);
-    AV_WN32A(dst + stride * 1, 0x81818181U);
-    AV_WN32A(dst + stride * 2, 0x81818181U);
-    AV_WN32A(dst + stride * 3, 0x81818181U);
-}
-
-static void dc_129_8x8_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 8; y++) {
-        AV_WN64A(dst, 0x8181818181818181ULL);
-        dst += stride;
-    }
-}
-
-static void dc_129_16x16_c(uint8_t *dst, ptrdiff_t stride,
-                           const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 16; y++) {
-        AV_WN64A(dst + 0, 0x8181818181818181ULL);
-        AV_WN64A(dst + 8, 0x8181818181818181ULL);
-        dst += stride;
-    }
-}
-
-static void dc_129_32x32_c(uint8_t *dst, ptrdiff_t stride,
-                           const uint8_t *left, const uint8_t *top)
-{
-    int y;
-
-    for (y = 0; y < 32; y++) {
-        AV_WN64A(dst +  0, 0x8181818181818181ULL);
-        AV_WN64A(dst +  8, 0x8181818181818181ULL);
-        AV_WN64A(dst + 16, 0x8181818181818181ULL);
-        AV_WN64A(dst + 24, 0x8181818181818181ULL);
-        dst += stride;
-    }
-}
-
-#define DST(x, y) dst[(x) + (y) * stride]
-
-static void diag_downleft_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *left, const uint8_t *top)
-{
-    int a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
-        a4 = top[4], a5 = top[5], a6 = top[6], a7 = top[7];
-
-    DST(0, 0) = (a0 + a1 * 2 + a2 + 2) >> 2;
-    DST(1, 0) =
-    DST(0, 1) = (a1 + a2 * 2 + a3 + 2) >> 2;
-    DST(2, 0) =
-    DST(1, 1) =
-    DST(0, 2) = (a2 + a3 * 2 + a4 + 2) >> 2;
-    DST(3, 0) =
-    DST(2, 1) =
-    DST(1, 2) =
-    DST(0, 3) = (a3 + a4 * 2 + a5 + 2) >> 2;
-    DST(3, 1) =
-    DST(2, 2) =
-    DST(1, 3) = (a4 + a5 * 2 + a6 + 2) >> 2;
-    DST(3, 2) =
-    DST(2, 3) = (a5 + a6 * 2 + a7 + 2) >> 2;
-    DST(3, 3) = a7;  // note: this is different from vp8 and such
-}
-
-#define def_diag_downleft(size)                                             \
-static void diag_downleft_ ## size ## x ## size ## _c(uint8_t *dst,         \
-                                                      ptrdiff_t stride,     \
-                                                      const uint8_t *left,  \
-                                                      const uint8_t *top)   \
-{                                                                           \
-    int i, j;                                                               \
-    uint8_t v[size - 1];                                                    \
-                                                                            \
-    for (i = 0; i < size - 2; i++)                                          \
-        v[i] = (top[i] + top[i + 1] * 2 + top[i + 2] + 2) >> 2;             \
-    v[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2;             \
-                                                                            \
-    for (j = 0; j < size; j++) {                                            \
-        memcpy(dst + j * stride, v + j, size - 1 - j);                      \
-        memset(dst + j * stride + size - 1 - j, top[size - 1], j + 1);      \
-    }                                                                       \
-}
-
-def_diag_downleft(8)
-def_diag_downleft(16)
-def_diag_downleft(32)
-
-static void diag_downright_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *left, const uint8_t *top)
-{
-    int tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
-        l0 = left[0], l1 = left[1], l2 = left[2], l3 = left[3];
-
-    DST(0, 3) = (l1 + l2 * 2 + l3 + 2) >> 2;
-    DST(0, 2) =
-    DST(1, 3) = (l0 + l1 * 2 + l2 + 2) >> 2;
-    DST(0, 1) =
-    DST(1, 2) =
-    DST(2, 3) = (tl + l0 * 2 + l1 + 2) >> 2;
-    DST(0, 0) =
-    DST(1, 1) =
-    DST(2, 2) =
-    DST(3, 3) = (l0 + tl * 2 + a0 + 2) >> 2;
-    DST(1, 0) =
-    DST(2, 1) =
-    DST(3, 2) = (tl + a0 * 2 + a1 + 2) >> 2;
-    DST(2, 0) =
-    DST(3, 1) = (a0 + a1 * 2 + a2 + 2) >> 2;
-    DST(3, 0) = (a1 + a2 * 2 + a3 + 2) >> 2;
-}
-
-#define def_diag_downright(size)                                            \
-static void diag_downright_ ## size ## x ## size ## _c(uint8_t *dst,        \
-                                                       ptrdiff_t stride,    \
-                                                       const uint8_t *left, \
-                                                       const uint8_t *top)  \
-{                                                                           \
-    int i, j;                                                               \
-    uint8_t v[size + size - 1];                                             \
-                                                                            \
-    for (i = 0; i < size - 2; i++) {                                        \
-        v[i]            = (left[size - 1 - i] +                             \
-                           left[size - 2 - i] * 2 +                         \
-                           left[size - 3 - i] + 2) >> 2;                    \
-        v[size + 1 + i] = (top[i]             +                             \
-                           top[i + 1]         * 2 +                         \
-                           top[i + 2]         + 2) >> 2;                    \
-    }                                                                       \
-    v[size - 2] = (left[1] + left[0] * 2 + top[-1] + 2) >> 2;               \
-    v[size - 1] = (left[0] + top[-1] * 2 + top[0]  + 2) >> 2;               \
-    v[size]     = (top[-1] + top[0]  * 2 + top[1]  + 2) >> 2;               \
-                                                                            \
-    for (j = 0; j < size; j++)                                              \
-        memcpy(dst + j * stride, v + size - 1 - j, size);                   \
-}
-
-def_diag_downright(8)
-def_diag_downright(16)
-def_diag_downright(32)
-
-static void vert_right_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                             const uint8_t *left, const uint8_t *top)
-{
-    int tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
-        l0 = left[0], l1 = left[1], l2 = left[2];
-
-    DST(0, 3) = (l0 + l1 * 2 + l2 + 2) >> 2;
-    DST(0, 2) = (tl + l0 * 2 + l1 + 2) >> 2;
-    DST(0, 0) =
-    DST(1, 2) = (tl + a0          + 1) >> 1;
-    DST(0, 1) =
-    DST(1, 3) = (l0 + tl * 2 + a0 + 2) >> 2;
-    DST(1, 0) =
-    DST(2, 2) = (a0 + a1          + 1) >> 1;
-    DST(1, 1) =
-    DST(2, 3) = (tl + a0 * 2 + a1 + 2) >> 2;
-    DST(2, 0) =
-    DST(3, 2) = (a1 + a2          + 1) >> 1;
-    DST(2, 1) =
-    DST(3, 3) = (a0 + a1 * 2 + a2 + 2) >> 2;
-    DST(3, 0) = (a2 + a3          + 1) >> 1;
-    DST(3, 1) = (a1 + a2 * 2 + a3 + 2) >> 2;
-}
-
-#define def_vert_right(size)                                                \
-static void vert_right_ ## size ## x ## size ## _c(uint8_t *dst,            \
-                                                   ptrdiff_t stride,        \
-                                                   const uint8_t *left,     \
-                                                   const uint8_t *top)      \
-{                                                                           \
-    int i, j;                                                               \
-    uint8_t ve[size + size / 2 - 1], vo[size + size / 2 - 1];               \
-                                                                            \
-    for (i = 0; i < size / 2 - 2; i++) {                                    \
-        vo[i] = (left[size - 4 - i * 2] +                                   \
-                 left[size - 3 - i * 2] * 2 +                               \
-                 left[size - 2 - i * 2] + 2) >> 2;                          \
-        ve[i] = (left[size - 5 - i * 2] +                                   \
-                 left[size - 4 - i * 2] * 2 +                               \
-                 left[size - 3 - i * 2] + 2) >> 2;                          \
-    }                                                                       \
-    vo[size / 2 - 2] = (left[0] + left[1] * 2 + left[2] + 2) >> 2;          \
-    ve[size / 2 - 2] = (top[-1] + left[0] * 2 + left[1] + 2) >> 2;          \
-                                                                            \
-    ve[size / 2 - 1] = (top[-1] + top[0] + 1) >> 1;                         \
-    vo[size / 2 - 1] = (left[0] + top[-1] * 2 + top[0] + 2) >> 2;           \
-    for (i = 0; i < size - 1; i++) {                                        \
-        ve[size / 2 + i] = (top[i] + top[i + 1] + 1) >> 1;                  \
-        vo[size / 2 + i] = (top[i - 1] + top[i] * 2 + top[i + 1] + 2) >> 2; \
-    }                                                                       \
-                                                                            \
-    for (j = 0; j < size / 2; j++) {                                        \
-        memcpy(dst +  j * 2      * stride, ve + size / 2 - 1 - j, size);    \
-        memcpy(dst + (j * 2 + 1) * stride, vo + size / 2 - 1 - j, size);    \
-    }                                                                       \
-}
-
-def_vert_right(8)
-def_vert_right(16)
-def_vert_right(32)
-
-static void hor_down_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                           const uint8_t *left, const uint8_t *top)
-{
-    int l0 = left[0], l1 = left[1], l2 = left[2], l3 = left[3],
-        tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2];
-
-    DST(2, 0) = (tl + a0 * 2 + a1 + 2) >> 2;
-    DST(3, 0) = (a0 + a1 * 2 + a2 + 2) >> 2;
-    DST(0, 0) =
-    DST(2, 1) = (tl + l0          + 1) >> 1;
-    DST(1, 0) =
-    DST(3, 1) = (a0 + tl * 2 + l0 + 2) >> 2;
-    DST(0, 1) =
-    DST(2, 2) = (l0 + l1          + 1) >> 1;
-    DST(1, 1) =
-    DST(3, 2) = (tl + l0 * 2 + l1 + 2) >> 2;
-    DST(0, 2) =
-    DST(2, 3) = (l1 + l2          + 1) >> 1;
-    DST(1, 2) =
-    DST(3, 3) = (l0 + l1 * 2 + l2 + 2) >> 2;
-    DST(0, 3) = (l2 + l3          + 1) >> 1;
-    DST(1, 3) = (l1 + l2 * 2 + l3 + 2) >> 2;
-}
-
-#define def_hor_down(size)                                              \
-static void hor_down_ ## size ## x ## size ## _c(uint8_t *dst,          \
-                                                 ptrdiff_t stride,      \
-                                                 const uint8_t *left,   \
-                                                 const uint8_t *top)    \
-{                                                                       \
-    int i, j;                                                           \
-    uint8_t v[size * 3 - 2];                                            \
-                                                                        \
-    for (i = 0; i < size - 2; i++) {                                    \
-        v[i * 2]        = (left[size - 2 - i] +                         \
-                           left[size - 1 - i] + 1) >> 1;                \
-        v[i * 2    + 1] = (left[size - 3 - i] +                         \
-                           left[size - 2 - i] * 2 +                     \
-                           left[size - 1 - i] + 2) >> 2;                \
-        v[size * 2 + i] = (top[i - 1] +                                 \
-                           top[i] * 2 +                                 \
-                           top[i + 1] + 2) >> 2;                        \
-    }                                                                   \
-    v[size * 2 - 2] = (top[-1] + left[0] + 1) >> 1;                     \
-    v[size * 2 - 4] = (left[0] + left[1] + 1) >> 1;                     \
-    v[size * 2 - 1] = (top[0]  + top[-1] * 2 + left[0] + 2) >> 2;       \
-    v[size * 2 - 3] = (top[-1] + left[0] * 2 + left[1] + 2) >> 2;       \
-                                                                        \
-    for (j = 0; j < size; j++)                                          \
-        memcpy(dst + j * stride, v + size * 2 - 2 - j * 2, size);       \
-}
-
-def_hor_down(8)
-def_hor_down(16)
-def_hor_down(32)
-
-static void vert_left_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                            const uint8_t *left, const uint8_t *top)
-{
-    int a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
-        a4 = top[4], a5 = top[5], a6 = top[6];
-
-    DST(0, 0) = (a0 + a1          + 1) >> 1;
-    DST(0, 1) = (a0 + a1 * 2 + a2 + 2) >> 2;
-    DST(1, 0) =
-    DST(0, 2) = (a1 + a2          + 1) >> 1;
-    DST(1, 1) =
-    DST(0, 3) = (a1 + a2 * 2 + a3 + 2) >> 2;
-    DST(2, 0) =
-    DST(1, 2) = (a2 + a3          + 1) >> 1;
-    DST(2, 1) =
-    DST(1, 3) = (a2 + a3 * 2 + a4 + 2) >> 2;
-    DST(3, 0) =
-    DST(2, 2) = (a3 + a4          + 1) >> 1;
-    DST(3, 1) =
-    DST(2, 3) = (a3 + a4 * 2 + a5 + 2) >> 2;
-    DST(3, 2) = (a4 + a5          + 1) >> 1;
-    DST(3, 3) = (a4 + a5 * 2 + a6 + 2) >> 2;
-}
-
-#define def_vert_left(size)                                             \
-static void vert_left_ ## size ## x ## size ## _c(uint8_t *dst,         \
-                                                  ptrdiff_t stride,     \
-                                                  const uint8_t *left,  \
-                                                  const uint8_t *top)   \
-{                                                                       \
-    int i, j;                                                           \
-    uint8_t ve[size - 1], vo[size - 1];                                 \
-                                                                        \
-    for (i = 0; i < size - 2; i++) {                                    \
-        ve[i] = (top[i] + top[i + 1] + 1) >> 1;                         \
-        vo[i] = (top[i] + top[i + 1] * 2 + top[i + 2] + 2) >> 2;        \
-    }                                                                   \
-    ve[size - 2] = (top[size - 2] + top[size - 1] + 1) >> 1;            \
-    vo[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2;        \
-                                                                        \
-    for (j = 0; j < size / 2; j++) {                                    \
-        memcpy(dst +  j * 2      * stride, ve + j, size - (j + 1));     \
-        memset(dst +  j * 2      * stride + size - j - 1,               \
-               top[size - 1], j + 1);                                   \
-        memcpy(dst + (j * 2 + 1) * stride, vo + j, size - (j + 1));     \
-        memset(dst + (j * 2 + 1) * stride + size - j - 1,               \
-               top[size - 1], j + 1);                                   \
-    }                                                                   \
-}
-
-def_vert_left(8)
-def_vert_left(16)
-def_vert_left(32)
-
-static void hor_up_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                         const uint8_t *left, const uint8_t *top)
-{
-    int l0 = left[0], l1 = left[1], l2 = left[2], l3 = left[3];
-
-    DST(0, 0) = (l0 + l1          + 1) >> 1;
-    DST(1, 0) = (l0 + l1 * 2 + l2 + 2) >> 2;
-    DST(0, 1) =
-    DST(2, 0) = (l1 + l2          + 1) >> 1;
-    DST(1, 1) =
-    DST(3, 0) = (l1 + l2 * 2 + l3 + 2) >> 2;
-    DST(0, 2) =
-    DST(2, 1) = (l2 + l3          + 1) >> 1;
-    DST(1, 2) =
-    DST(3, 1) = (l2 + l3 * 3      + 2) >> 2;
-    DST(0, 3) =
-    DST(1, 3) =
-    DST(2, 2) =
-    DST(2, 3) =
-    DST(3, 2) =
-    DST(3, 3) = l3;
-}
-
-#define def_hor_up(size)                                                    \
-static void hor_up_ ## size ## x ## size ## _c(uint8_t *dst,                \
-                                               ptrdiff_t stride,            \
-                                               const uint8_t *left,         \
-                                               const uint8_t *top)          \
-{                                                                           \
-    int i, j;                                                               \
-    uint8_t v[size * 2 - 2];                                                \
-                                                                            \
-    for (i = 0; i < size - 2; i++) {                                        \
-        v[i * 2]     = (left[i] + left[i + 1] + 1) >> 1;                    \
-        v[i * 2 + 1] = (left[i] + left[i + 1] * 2 + left[i + 2] + 2) >> 2;  \
-    }                                                                       \
-    v[size * 2 - 4] = (left[size - 2] + left[size - 1]     + 1) >> 1;       \
-    v[size * 2 - 3] = (left[size - 2] + left[size - 1] * 3 + 2) >> 2;       \
-                                                                            \
-    for (j = 0; j < size / 2; j++)                                          \
-        memcpy(dst + j * stride, v + j * 2, size);                          \
-    for (j = size / 2; j < size; j++) {                                     \
-        memcpy(dst + j * stride, v + j * 2, size * 2 - 2 - j * 2);          \
-        memset(dst + j * stride + size * 2 - 2 - j * 2, left[size - 1],     \
-               2 + j * 2 - size);                                           \
-    }                                                                       \
-}
-
-def_hor_up(8)
-def_hor_up(16)
-def_hor_up(32)
-
-#undef DST
-
-static av_cold void vp9dsp_intrapred_init(VP9DSPContext *dsp)
-{
-#define init_intra_pred(tx, sz)                                              \
-    dsp->intra_pred[tx][VERT_PRED]            = vert_           ## sz ## _c; \
-    dsp->intra_pred[tx][HOR_PRED]             = hor_            ## sz ## _c; \
-    dsp->intra_pred[tx][DC_PRED]              = dc_             ## sz ## _c; \
-    dsp->intra_pred[tx][DIAG_DOWN_LEFT_PRED]  = diag_downleft_  ## sz ## _c; \
-    dsp->intra_pred[tx][DIAG_DOWN_RIGHT_PRED] = diag_downright_ ## sz ## _c; \
-    dsp->intra_pred[tx][VERT_RIGHT_PRED]      = vert_right_     ## sz ## _c; \
-    dsp->intra_pred[tx][HOR_DOWN_PRED]        = hor_down_       ## sz ## _c; \
-    dsp->intra_pred[tx][VERT_LEFT_PRED]       = vert_left_      ## sz ## _c; \
-    dsp->intra_pred[tx][HOR_UP_PRED]          = hor_up_         ## sz ## _c; \
-    dsp->intra_pred[tx][TM_VP8_PRED]          = tm_             ## sz ## _c; \
-    dsp->intra_pred[tx][LEFT_DC_PRED]         = dc_left_        ## sz ## _c; \
-    dsp->intra_pred[tx][TOP_DC_PRED]          = dc_top_         ## sz ## _c; \
-    dsp->intra_pred[tx][DC_128_PRED]          = dc_128_         ## sz ## _c; \
-    dsp->intra_pred[tx][DC_127_PRED]          = dc_127_         ## sz ## _c; \
-    dsp->intra_pred[tx][DC_129_PRED]          = dc_129_         ## sz ## _c
-
-    init_intra_pred(TX_4X4,   4x4);
-    init_intra_pred(TX_8X8,   8x8);
-    init_intra_pred(TX_16X16, 16x16);
-    init_intra_pred(TX_32X32, 32x32);
-
-#undef init_intra_pred
-}
-
-#define itxfm_wrapper(type_a, type_b, sz, bits)                             \
-static void                                                                 \
-type_a ## _ ## type_b ## _ ## sz ## x ## sz ## _add_c(uint8_t *dst,         \
-                                                      ptrdiff_t stride,     \
-                                                      int16_t *block,       \
-                                                      int eob)              \
-{                                                                           \
-    int i, j;                                                               \
-    int16_t tmp[sz * sz], out[sz];                                          \
-    for (i = 0; i < sz; i++)                                                \
-        type_a ## sz ## _1d(tmp + i * sz, block + i, sz, 0);                \
-    memset(block, 0, sz * sz * sizeof(*block));                             \
-    for (i = 0; i < sz; i++) {                                              \
-        type_b ## sz ## _1d(out, tmp + i, sz, 1);                           \
-        for (j = 0; j < sz; j++)                                            \
-            dst[j * stride] =                                               \
-                av_clip_uint8(dst[j * stride] +                             \
-                              (bits ? (out[j] + (1 << (bits - 1))) >> bits  \
-                                    : out[j]));                             \
-        dst++;                                                              \
-    }                                                                       \
-}
-
-#define itxfm_wrap(sz, bits)             \
-    itxfm_wrapper(idct, idct, sz, bits)  \
-    itxfm_wrapper(iadst, idct, sz, bits) \
-    itxfm_wrapper(idct, iadst, sz, bits) \
-    itxfm_wrapper(iadst, iadst, sz, bits)
-
-#define IN(x) in[x * stride]
-
-static av_always_inline void idct4_1d(int16_t *out, const int16_t *in,
-                                      ptrdiff_t stride, int pass)
-{
-    int t0, t1, t2, t3;
-
-    t0 = ((IN(0)        + IN(2)) * 11585 + (1 << 13)) >> 14;
-    t1 = ((IN(0)        - IN(2)) * 11585 + (1 << 13)) >> 14;
-    t2 = (IN(1) *  6270 - IN(3)  * 15137 + (1 << 13)) >> 14;
-    t3 = (IN(1) * 15137 + IN(3)  *  6270 + (1 << 13)) >> 14;
-
-    out[0] = t0 + t3;
-    out[1] = t1 + t2;
-    out[2] = t1 - t2;
-    out[3] = t0 - t3;
-}
-
-static av_always_inline void iadst4_1d(int16_t *out, const int16_t *in,
-                                       ptrdiff_t stride, int pass)
-{
-    int t0, t1, t2, t3;
-
-    t0 =  5283 * IN(0) + 15212 * IN(2) +  9929 * IN(3);
-    t1 =  9929 * IN(0) -  5283 * IN(2) - 15212 * IN(3);
-    t2 = 13377 * (IN(0) - IN(2) + IN(3));
-    t3 = 13377 * IN(1);
-
-    out[0] = (t0 + t3      + (1 << 13)) >> 14;
-    out[1] = (t1 + t3      + (1 << 13)) >> 14;
-    out[2] = (t2           + (1 << 13)) >> 14;
-    out[3] = (t0 + t1 - t3 + (1 << 13)) >> 14;
-}
-
-itxfm_wrap(4, 4)
-
-static av_always_inline void idct8_1d(int16_t *out, const int16_t *in,
-                                      ptrdiff_t stride, int pass)
-{
-    int t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a;
-
-    t0a = ((IN(0)        + IN(4)) * 11585 + (1 << 13)) >> 14;
-    t1a = ((IN(0)        - IN(4)) * 11585 + (1 << 13)) >> 14;
-    t2a = (IN(2) *  6270 - IN(6)  * 15137 + (1 << 13)) >> 14;
-    t3a = (IN(2) * 15137 + IN(6)  *  6270 + (1 << 13)) >> 14;
-    t4a = (IN(1) *  3196 - IN(7)  * 16069 + (1 << 13)) >> 14;
-    t5a = (IN(5) * 13623 - IN(3)  *  9102 + (1 << 13)) >> 14;
-    t6a = (IN(5) *  9102 + IN(3)  * 13623 + (1 << 13)) >> 14;
-    t7a = (IN(1) * 16069 + IN(7)  *  3196 + (1 << 13)) >> 14;
-
-    t0  = t0a + t3a;
-    t1  = t1a + t2a;
-    t2  = t1a - t2a;
-    t3  = t0a - t3a;
-    t4  = t4a + t5a;
-    t5a = t4a - t5a;
-    t7  = t7a + t6a;
-    t6a = t7a - t6a;
-
-    t5  = ((t6a - t5a) * 11585 + (1 << 13)) >> 14;
-    t6  = ((t6a + t5a) * 11585 + (1 << 13)) >> 14;
-
-    out[0] = t0 + t7;
-    out[1] = t1 + t6;
-    out[2] = t2 + t5;
-    out[3] = t3 + t4;
-    out[4] = t3 - t4;
-    out[5] = t2 - t5;
-    out[6] = t1 - t6;
-    out[7] = t0 - t7;
-}
-
-static av_always_inline void iadst8_1d(int16_t *out, const int16_t *in,
-                                       ptrdiff_t stride, int pass)
-{
-    int t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a;
-
-    t0a = 16305 * IN(7) +  1606 * IN(0);
-    t1a =  1606 * IN(7) - 16305 * IN(0);
-    t2a = 14449 * IN(5) +  7723 * IN(2);
-    t3a =  7723 * IN(5) - 14449 * IN(2);
-    t4a = 10394 * IN(3) + 12665 * IN(4);
-    t5a = 12665 * IN(3) - 10394 * IN(4);
-    t6a =  4756 * IN(1) + 15679 * IN(6);
-    t7a = 15679 * IN(1) -  4756 * IN(6);
-
-    t0  = (t0a + t4a + (1 << 13)) >> 14;
-    t1  = (t1a + t5a + (1 << 13)) >> 14;
-    t2  = (t2a + t6a + (1 << 13)) >> 14;
-    t3  = (t3a + t7a + (1 << 13)) >> 14;
-    t4  = (t0a - t4a + (1 << 13)) >> 14;
-    t5  = (t1a - t5a + (1 << 13)) >> 14;
-    t6  = (t2a - t6a + (1 << 13)) >> 14;
-    t7  = (t3a - t7a + (1 << 13)) >> 14;
-
-    t4a = 15137 * t4 +  6270 * t5;
-    t5a =  6270 * t4 - 15137 * t5;
-    t6a = 15137 * t7 -  6270 * t6;
-    t7a =  6270 * t7 + 15137 * t6;
-
-    out[0] =   t0 + t2;
-    out[7] = -(t1 + t3);
-    t2     =   t0 - t2;
-    t3     =   t1 - t3;
-
-    out[1] = -((t4a + t6a + (1 << 13)) >> 14);
-    out[6] =   (t5a + t7a + (1 << 13)) >> 14;
-    t6     =   (t4a - t6a + (1 << 13)) >> 14;
-    t7     =   (t5a - t7a + (1 << 13)) >> 14;
-
-    out[3] = -(((t2 + t3) * 11585 + (1 << 13)) >> 14);
-    out[4] =   ((t2 - t3) * 11585 + (1 << 13)) >> 14;
-    out[2] =   ((t6 + t7) * 11585 + (1 << 13)) >> 14;
-    out[5] = -(((t6 - t7) * 11585 + (1 << 13)) >> 14);
-}
-
-itxfm_wrap(8, 5)
-
-static av_always_inline void idct16_1d(int16_t *out, const int16_t *in,
-                                       ptrdiff_t stride, int pass)
-{
-    int t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
-    int t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
-    int t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
-
-    t0a  = ((IN(0)         + IN(8)) * 11585 + (1 << 13)) >> 14;
-    t1a  = ((IN(0)         - IN(8)) * 11585 + (1 << 13)) >> 14;
-    t2a  = (IN(4)  *  6270 - IN(12) * 15137 + (1 << 13)) >> 14;
-    t3a  = (IN(4)  * 15137 + IN(12) *  6270 + (1 << 13)) >> 14;
-    t4a  = (IN(2)  *  3196 - IN(14) * 16069 + (1 << 13)) >> 14;
-    t7a  = (IN(2)  * 16069 + IN(14) *  3196 + (1 << 13)) >> 14;
-    t5a  = (IN(10) * 13623 - IN(6)  *  9102 + (1 << 13)) >> 14;
-    t6a  = (IN(10) *  9102 + IN(6)  * 13623 + (1 << 13)) >> 14;
-    t8a  = (IN(1)  *  1606 - IN(15) * 16305 + (1 << 13)) >> 14;
-    t15a = (IN(1)  * 16305 + IN(15) *  1606 + (1 << 13)) >> 14;
-    t9a  = (IN(9)  * 12665 - IN(7)  * 10394 + (1 << 13)) >> 14;
-    t14a = (IN(9)  * 10394 + IN(7)  * 12665 + (1 << 13)) >> 14;
-    t10a = (IN(5)  *  7723 - IN(11) * 14449 + (1 << 13)) >> 14;
-    t13a = (IN(5)  * 14449 + IN(11) *  7723 + (1 << 13)) >> 14;
-    t11a = (IN(13) * 15679 - IN(3)  *  4756 + (1 << 13)) >> 14;
-    t12a = (IN(13) *  4756 + IN(3)  * 15679 + (1 << 13)) >> 14;
-
-    t0   = t0a  + t3a;
-    t1   = t1a  + t2a;
-    t2   = t1a  - t2a;
-    t3   = t0a  - t3a;
-    t4   = t4a  + t5a;
-    t5   = t4a  - t5a;
-    t6   = t7a  - t6a;
-    t7   = t7a  + t6a;
-    t8   = t8a  + t9a;
-    t9   = t8a  - t9a;
-    t10  = t11a - t10a;
-    t11  = t11a + t10a;
-    t12  = t12a + t13a;
-    t13  = t12a - t13a;
-    t14  = t15a - t14a;
-    t15  = t15a + t14a;
-
-    t5a  =   ((t6         - t5) * 11585  + (1 << 13)) >> 14;
-    t6a  =   ((t6         + t5) * 11585  + (1 << 13)) >> 14;
-    t9a  =   (t14 *  6270 - t9  * 15137  + (1 << 13)) >> 14;
-    t14a =   (t14 * 15137 + t9  *  6270  + (1 << 13)) >> 14;
-    t10a = (-(t13 * 15137 + t10 *  6270) + (1 << 13)) >> 14;
-    t13a =   (t13 *  6270 - t10 * 15137  + (1 << 13)) >> 14;
-
-    t0a  = t0   + t7;
-    t1a  = t1   + t6a;
-    t2a  = t2   + t5a;
-    t3a  = t3   + t4;
-    t4   = t3   - t4;
-    t5   = t2   - t5a;
-    t6   = t1   - t6a;
-    t7   = t0   - t7;
-    t8a  = t8   + t11;
-    t9   = t9a  + t10a;
-    t10  = t9a  - t10a;
-    t11a = t8   - t11;
-    t12a = t15  - t12;
-    t13  = t14a - t13a;
-    t14  = t14a + t13a;
-    t15a = t15  + t12;
-
-    t10a = ((t13  - t10)  * 11585 + (1 << 13)) >> 14;
-    t13a = ((t13  + t10)  * 11585 + (1 << 13)) >> 14;
-    t11  = ((t12a - t11a) * 11585 + (1 << 13)) >> 14;
-    t12  = ((t12a + t11a) * 11585 + (1 << 13)) >> 14;
-
-    out[0]  = t0a + t15a;
-    out[1]  = t1a + t14;
-    out[2]  = t2a + t13a;
-    out[3]  = t3a + t12;
-    out[4]  = t4  + t11;
-    out[5]  = t5  + t10a;
-    out[6]  = t6  + t9;
-    out[7]  = t7  + t8a;
-    out[8]  = t7  - t8a;
-    out[9]  = t6  - t9;
-    out[10] = t5  - t10a;
-    out[11] = t4  - t11;
-    out[12] = t3a - t12;
-    out[13] = t2a - t13a;
-    out[14] = t1a - t14;
-    out[15] = t0a - t15a;
-}
-
-static av_always_inline void iadst16_1d(int16_t *out, const int16_t *in,
-                                        ptrdiff_t stride, int pass)
-{
-    int t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
-    int t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
-    int t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
-
-    t0  = IN(15) * 16364 + IN(0)  *   804;
-    t1  = IN(15) *   804 - IN(0)  * 16364;
-    t2  = IN(13) * 15893 + IN(2)  *  3981;
-    t3  = IN(13) *  3981 - IN(2)  * 15893;
-    t4  = IN(11) * 14811 + IN(4)  *  7005;
-    t5  = IN(11) *  7005 - IN(4)  * 14811;
-    t6  = IN(9)  * 13160 + IN(6)  *  9760;
-    t7  = IN(9)  *  9760 - IN(6)  * 13160;
-    t8  = IN(7)  * 11003 + IN(8)  * 12140;
-    t9  = IN(7)  * 12140 - IN(8)  * 11003;
-    t10 = IN(5)  *  8423 + IN(10) * 14053;
-    t11 = IN(5)  * 14053 - IN(10) *  8423;
-    t12 = IN(3)  *  5520 + IN(12) * 15426;
-    t13 = IN(3)  * 15426 - IN(12) *  5520;
-    t14 = IN(1)  *  2404 + IN(14) * 16207;
-    t15 = IN(1)  * 16207 - IN(14) *  2404;
-
-    t0a  = (t0 + t8  + (1 << 13)) >> 14;
-    t1a  = (t1 + t9  + (1 << 13)) >> 14;
-    t2a  = (t2 + t10 + (1 << 13)) >> 14;
-    t3a  = (t3 + t11 + (1 << 13)) >> 14;
-    t4a  = (t4 + t12 + (1 << 13)) >> 14;
-    t5a  = (t5 + t13 + (1 << 13)) >> 14;
-    t6a  = (t6 + t14 + (1 << 13)) >> 14;
-    t7a  = (t7 + t15 + (1 << 13)) >> 14;
-    t8a  = (t0 - t8  + (1 << 13)) >> 14;
-    t9a  = (t1 - t9  + (1 << 13)) >> 14;
-    t10a = (t2 - t10 + (1 << 13)) >> 14;
-    t11a = (t3 - t11 + (1 << 13)) >> 14;
-    t12a = (t4 - t12 + (1 << 13)) >> 14;
-    t13a = (t5 - t13 + (1 << 13)) >> 14;
-    t14a = (t6 - t14 + (1 << 13)) >> 14;
-    t15a = (t7 - t15 + (1 << 13)) >> 14;
-
-    t8   = t8a  * 16069 + t9a  *  3196;
-    t9   = t8a  *  3196 - t9a  * 16069;
-    t10  = t10a *  9102 + t11a * 13623;
-    t11  = t10a * 13623 - t11a *  9102;
-    t12  = t13a * 16069 - t12a *  3196;
-    t13  = t13a *  3196 + t12a * 16069;
-    t14  = t15a *  9102 - t14a * 13623;
-    t15  = t15a * 13623 + t14a *  9102;
-
-    t0   = t0a  + t4a;
-    t1   = t1a  + t5a;
-    t2   = t2a  + t6a;
-    t3   = t3a  + t7a;
-    t4   = t0a  - t4a;
-    t5   = t1a  - t5a;
-    t6   = t2a  - t6a;
-    t7   = t3a  - t7a;
-    t8a  = (t8  + t12 + (1 << 13)) >> 14;
-    t9a  = (t9  + t13 + (1 << 13)) >> 14;
-    t10a = (t10 + t14 + (1 << 13)) >> 14;
-    t11a = (t11 + t15 + (1 << 13)) >> 14;
-    t12a = (t8  - t12 + (1 << 13)) >> 14;
-    t13a = (t9  - t13 + (1 << 13)) >> 14;
-    t14a = (t10 - t14 + (1 << 13)) >> 14;
-    t15a = (t11 - t15 + (1 << 13)) >> 14;
-
-    t4a  = t4   * 15137 + t5   *  6270;
-    t5a  = t4   *  6270 - t5   * 15137;
-    t6a  = t7   * 15137 - t6   *  6270;
-    t7a  = t7   *  6270 + t6   * 15137;
-    t12  = t12a * 15137 + t13a *  6270;
-    t13  = t12a *  6270 - t13a * 15137;
-    t14  = t15a * 15137 - t14a *  6270;
-    t15  = t15a *  6270 + t14a * 15137;
-
-    out[0]  =     t0 + t2;
-    out[15] =   -(t1 + t3);
-    t2a     =     t0 - t2;
-    t3a     =     t1 - t3;
-    out[3]  = -((t4a + t6a + (1 << 13)) >> 14);
-    out[12] =   (t5a + t7a + (1 << 13)) >> 14;
-    t6      =   (t4a - t6a + (1 << 13)) >> 14;
-    t7      =   (t5a - t7a + (1 << 13)) >> 14;
-    out[1]  =  -(t8a + t10a);
-    out[14] =    t9a + t11a;
-    t10     =    t8a - t10a;
-    t11     =    t9a - t11a;
-    out[2]  =   (t12 + t14 + (1 << 13)) >> 14;
-    out[13] = -((t13 + t15 + (1 << 13)) >> 14);
-    t14a    =   (t12 - t14 + (1 << 13)) >> 14;
-    t15a    =   (t13 - t15 + (1 << 13)) >> 14;
-
-    out[7]  = ((t2a  + t3a)  * -11585 + (1 << 13)) >> 14;
-    out[8]  = ((t2a  - t3a)  *  11585 + (1 << 13)) >> 14;
-    out[4]  = ((t7   + t6)   *  11585 + (1 << 13)) >> 14;
-    out[11] = ((t7   - t6)   *  11585 + (1 << 13)) >> 14;
-    out[6]  = ((t11  + t10)  *  11585 + (1 << 13)) >> 14;
-    out[9]  = ((t11  - t10)  *  11585 + (1 << 13)) >> 14;
-    out[5]  = ((t14a + t15a) * -11585 + (1 << 13)) >> 14;
-    out[10] = ((t14a - t15a) *  11585 + (1 << 13)) >> 14;
-}
-
-itxfm_wrap(16, 6)
-
-static av_always_inline void idct32_1d(int16_t *out, const int16_t *in,
-                                       ptrdiff_t stride, int pass)
+av_cold void ff_vp9dsp_init(VP9DSPContext *dsp, int bpp, int bitexact)
 {
-    int t0a  = ((IN(0)         + IN(16)) * 11585 + (1 << 13)) >> 14;
-    int t1a  = ((IN(0)         - IN(16)) * 11585 + (1 << 13)) >> 14;
-    int t2a  = (IN(8)  *  6270 - IN(24)  * 15137 + (1 << 13)) >> 14;
-    int t3a  = (IN(8)  * 15137 + IN(24)  *  6270 + (1 << 13)) >> 14;
-    int t4a  = (IN(4)  *  3196 - IN(28)  * 16069 + (1 << 13)) >> 14;
-    int t7a  = (IN(4)  * 16069 + IN(28)  *  3196 + (1 << 13)) >> 14;
-    int t5a  = (IN(20) * 13623 - IN(12)  *  9102 + (1 << 13)) >> 14;
-    int t6a  = (IN(20) *  9102 + IN(12)  * 13623 + (1 << 13)) >> 14;
-    int t8a  = (IN(2)  *  1606 - IN(30)  * 16305 + (1 << 13)) >> 14;
-    int t15a = (IN(2)  * 16305 + IN(30)  *  1606 + (1 << 13)) >> 14;
-    int t9a  = (IN(18) * 12665 - IN(14)  * 10394 + (1 << 13)) >> 14;
-    int t14a = (IN(18) * 10394 + IN(14)  * 12665 + (1 << 13)) >> 14;
-    int t10a = (IN(10) *  7723 - IN(22)  * 14449 + (1 << 13)) >> 14;
-    int t13a = (IN(10) * 14449 + IN(22)  *  7723 + (1 << 13)) >> 14;
-    int t11a = (IN(26) * 15679 - IN(6)   *  4756 + (1 << 13)) >> 14;
-    int t12a = (IN(26) *  4756 + IN(6)   * 15679 + (1 << 13)) >> 14;
-    int t16a = (IN(1)  *   804 - IN(31)  * 16364 + (1 << 13)) >> 14;
-    int t31a = (IN(1)  * 16364 + IN(31)  *   804 + (1 << 13)) >> 14;
-    int t17a = (IN(17) * 12140 - IN(15)  * 11003 + (1 << 13)) >> 14;
-    int t30a = (IN(17) * 11003 + IN(15)  * 12140 + (1 << 13)) >> 14;
-    int t18a = (IN(9)  *  7005 - IN(23)  * 14811 + (1 << 13)) >> 14;
-    int t29a = (IN(9)  * 14811 + IN(23)  *  7005 + (1 << 13)) >> 14;
-    int t19a = (IN(25) * 15426 - IN(7)   *  5520 + (1 << 13)) >> 14;
-    int t28a = (IN(25) *  5520 + IN(7)   * 15426 + (1 << 13)) >> 14;
-    int t20a = (IN(5)  *  3981 - IN(27)  * 15893 + (1 << 13)) >> 14;
-    int t27a = (IN(5)  * 15893 + IN(27)  *  3981 + (1 << 13)) >> 14;
-    int t21a = (IN(21) * 14053 - IN(11)  *  8423 + (1 << 13)) >> 14;
-    int t26a = (IN(21) *  8423 + IN(11)  * 14053 + (1 << 13)) >> 14;
-    int t22a = (IN(13) *  9760 - IN(19)  * 13160 + (1 << 13)) >> 14;
-    int t25a = (IN(13) * 13160 + IN(19)  *  9760 + (1 << 13)) >> 14;
-    int t23a = (IN(29) * 16207 - IN(3)   *  2404 + (1 << 13)) >> 14;
-    int t24a = (IN(29) *  2404 + IN(3)   * 16207 + (1 << 13)) >> 14;
-
-    int t0  = t0a  + t3a;
-    int t1  = t1a  + t2a;
-    int t2  = t1a  - t2a;
-    int t3  = t0a  - t3a;
-    int t4  = t4a  + t5a;
-    int t5  = t4a  - t5a;
-    int t6  = t7a  - t6a;
-    int t7  = t7a  + t6a;
-    int t8  = t8a  + t9a;
-    int t9  = t8a  - t9a;
-    int t10 = t11a - t10a;
-    int t11 = t11a + t10a;
-    int t12 = t12a + t13a;
-    int t13 = t12a - t13a;
-    int t14 = t15a - t14a;
-    int t15 = t15a + t14a;
-    int t16 = t16a + t17a;
-    int t17 = t16a - t17a;
-    int t18 = t19a - t18a;
-    int t19 = t19a + t18a;
-    int t20 = t20a + t21a;
-    int t21 = t20a - t21a;
-    int t22 = t23a - t22a;
-    int t23 = t23a + t22a;
-    int t24 = t24a + t25a;
-    int t25 = t24a - t25a;
-    int t26 = t27a - t26a;
-    int t27 = t27a + t26a;
-    int t28 = t28a + t29a;
-    int t29 = t28a - t29a;
-    int t30 = t31a - t30a;
-    int t31 = t31a + t30a;
-
-    t5a  =   ((t6         - t5) * 11585  + (1 << 13)) >> 14;
-    t6a  =   ((t6         + t5) * 11585  + (1 << 13)) >> 14;
-    t9a  =   (t14 *  6270 - t9  * 15137  + (1 << 13)) >> 14;
-    t14a =   (t14 * 15137 + t9  *  6270  + (1 << 13)) >> 14;
-    t10a = (-(t13 * 15137 + t10 *  6270) + (1 << 13)) >> 14;
-    t13a =   (t13 *  6270 - t10 * 15137  + (1 << 13)) >> 14;
-    t17a =   (t30 *  3196 - t17 * 16069  + (1 << 13)) >> 14;
-    t30a =   (t30 * 16069 + t17 *  3196  + (1 << 13)) >> 14;
-    t18a = (-(t29 * 16069 + t18 *  3196) + (1 << 13)) >> 14;
-    t29a =   (t29 *  3196 - t18 * 16069  + (1 << 13)) >> 14;
-    t21a =   (t26 * 13623 - t21 *  9102  + (1 << 13)) >> 14;
-    t26a =   (t26 *  9102 + t21 * 13623  + (1 << 13)) >> 14;
-    t22a = (-(t25 *  9102 + t22 * 13623) + (1 << 13)) >> 14;
-    t25a =   (t25 * 13623 - t22 *  9102  + (1 << 13)) >> 14;
-
-    t0a  = t0   + t7;
-    t1a  = t1   + t6a;
-    t2a  = t2   + t5a;
-    t3a  = t3   + t4;
-    t4a  = t3   - t4;
-    t5   = t2   - t5a;
-    t6   = t1   - t6a;
-    t7a  = t0   - t7;
-    t8a  = t8   + t11;
-    t9   = t9a  + t10a;
-    t10  = t9a  - t10a;
-    t11a = t8   - t11;
-    t12a = t15  - t12;
-    t13  = t14a - t13a;
-    t14  = t14a + t13a;
-    t15a = t15  + t12;
-    t16a = t16  + t19;
-    t17  = t17a + t18a;
-    t18  = t17a - t18a;
-    t19a = t16  - t19;
-    t20a = t23  - t20;
-    t21  = t22a - t21a;
-    t22  = t22a + t21a;
-    t23a = t23  + t20;
-    t24a = t24  + t27;
-    t25  = t25a + t26a;
-    t26  = t25a - t26a;
-    t27a = t24  - t27;
-    t28a = t31  - t28;
-    t29  = t30a - t29a;
-    t30  = t30a + t29a;
-    t31a = t31  + t28;
-
-    t10a = ((t13           - t10)  * 11585  + (1 << 13)) >> 14;
-    t13a = ((t13           + t10)  * 11585  + (1 << 13)) >> 14;
-    t11  = ((t12a          - t11a) * 11585  + (1 << 13)) >> 14;
-    t12  = ((t12a          + t11a) * 11585  + (1 << 13)) >> 14;
-    t18a =   (t29  *  6270 - t18   * 15137  + (1 << 13)) >> 14;
-    t29a =   (t29  * 15137 + t18   *  6270  + (1 << 13)) >> 14;
-    t19  =   (t28a *  6270 - t19a  * 15137  + (1 << 13)) >> 14;
-    t28  =   (t28a * 15137 + t19a  *  6270  + (1 << 13)) >> 14;
-    t20  = (-(t27a * 15137 + t20a  *  6270) + (1 << 13)) >> 14;
-    t27  =   (t27a *  6270 - t20a  * 15137  + (1 << 13)) >> 14;
-    t21a = (-(t26  * 15137 + t21   *  6270) + (1 << 13)) >> 14;
-    t26a =   (t26  *  6270 - t21   * 15137  + (1 << 13)) >> 14;
-
-    t0   = t0a  + t15a;
-    t1   = t1a  + t14;
-    t2   = t2a  + t13a;
-    t3   = t3a  + t12;
-    t4   = t4a  + t11;
-    t5a  = t5   + t10a;
-    t6a  = t6   + t9;
-    t7   = t7a  + t8a;
-    t8   = t7a  - t8a;
-    t9a  = t6   - t9;
-    t10  = t5   - t10a;
-    t11a = t4a  - t11;
-    t12a = t3a  - t12;
-    t13  = t2a  - t13a;
-    t14a = t1a  - t14;
-    t15  = t0a  - t15a;
-    t16  = t16a + t23a;
-    t17a = t17  + t22;
-    t18  = t18a + t21a;
-    t19a = t19  + t20;
-    t20a = t19  - t20;
-    t21  = t18a - t21a;
-    t22a = t17  - t22;
-    t23  = t16a - t23a;
-    t24  = t31a - t24a;
-    t25a = t30  - t25;
-    t26  = t29a - t26a;
-    t27a = t28  - t27;
-    t28a = t28  + t27;
-    t29  = t29a + t26a;
-    t30a = t30  + t25;
-    t31  = t31a + t24a;
-
-    t20  = ((t27a - t20a) * 11585 + (1 << 13)) >> 14;
-    t27  = ((t27a + t20a) * 11585 + (1 << 13)) >> 14;
-    t21a = ((t26  - t21)  * 11585 + (1 << 13)) >> 14;
-    t26a = ((t26  + t21)  * 11585 + (1 << 13)) >> 14;
-    t22  = ((t25a - t22a) * 11585 + (1 << 13)) >> 14;
-    t25  = ((t25a + t22a) * 11585 + (1 << 13)) >> 14;
-    t23a = ((t24  - t23)  * 11585 + (1 << 13)) >> 14;
-    t24a = ((t24  + t23)  * 11585 + (1 << 13)) >> 14;
-
-    out[0]  = t0   + t31;
-    out[1]  = t1   + t30a;
-    out[2]  = t2   + t29;
-    out[3]  = t3   + t28a;
-    out[4]  = t4   + t27;
-    out[5]  = t5a  + t26a;
-    out[6]  = t6a  + t25;
-    out[7]  = t7   + t24a;
-    out[8]  = t8   + t23a;
-    out[9]  = t9a  + t22;
-    out[10] = t10  + t21a;
-    out[11] = t11a + t20;
-    out[12] = t12a + t19a;
-    out[13] = t13  + t18;
-    out[14] = t14a + t17a;
-    out[15] = t15  + t16;
-    out[16] = t15  - t16;
-    out[17] = t14a - t17a;
-    out[18] = t13  - t18;
-    out[19] = t12a - t19a;
-    out[20] = t11a - t20;
-    out[21] = t10  - t21a;
-    out[22] = t9a  - t22;
-    out[23] = t8   - t23a;
-    out[24] = t7   - t24a;
-    out[25] = t6a  - t25;
-    out[26] = t5a  - t26a;
-    out[27] = t4   - t27;
-    out[28] = t3   - t28a;
-    out[29] = t2   - t29;
-    out[30] = t1   - t30a;
-    out[31] = t0   - t31;
-}
-
-itxfm_wrapper(idct, idct, 32, 6)
-
-static av_always_inline void iwht4_1d(int16_t *out, const int16_t *in,
-                                      ptrdiff_t stride, int pass)
-{
-    int t0, t1, t2, t3, t4;
-
-    if (pass == 0) {
-        t0 = IN(0) >> 2;
-        t1 = IN(3) >> 2;
-        t2 = IN(1) >> 2;
-        t3 = IN(2) >> 2;
+    if (bpp == 8) {
+        ff_vp9dsp_init_8(dsp);
+    } else if (bpp == 10) {
+        ff_vp9dsp_init_10(dsp);
     } else {
-        t0 = IN(0);
-        t1 = IN(3);
-        t2 = IN(1);
-        t3 = IN(2);
+        av_assert0(bpp == 12);
+        ff_vp9dsp_init_12(dsp);
     }
 
-    t0 += t2;
-    t3 -= t1;
-    t4 = (t0 - t3) >> 1;
-    t1 = t4 - t1;
-    t2 = t4 - t2;
-    t0 -= t1;
-    t3 += t2;
-
-    out[0] = t0;
-    out[1] = t1;
-    out[2] = t2;
-    out[3] = t3;
-}
-
-itxfm_wrapper(iwht, iwht, 4, 0)
-
-#undef IN
-#undef itxfm_wrapper
-#undef itxfm_wrap
-
-static av_cold void vp9dsp_itxfm_init(VP9DSPContext *dsp)
-{
-#define init_itxfm(tx, sz)                                        \
-    dsp->itxfm_add[tx][DCT_DCT]   = idct_idct_   ## sz ## _add_c; \
-    dsp->itxfm_add[tx][DCT_ADST]  = iadst_idct_  ## sz ## _add_c; \
-    dsp->itxfm_add[tx][ADST_DCT]  = idct_iadst_  ## sz ## _add_c; \
-    dsp->itxfm_add[tx][ADST_ADST] = iadst_iadst_ ## sz ## _add_c
-
-#define init_idct(tx, nm)                               \
-    dsp->itxfm_add[tx][DCT_DCT]   =                     \
-    dsp->itxfm_add[tx][ADST_DCT]  =                     \
-    dsp->itxfm_add[tx][DCT_ADST]  =                     \
-    dsp->itxfm_add[tx][ADST_ADST] = nm ## _add_c
-
-    init_itxfm(TX_4X4, 4x4);
-    init_itxfm(TX_8X8, 8x8);
-    init_itxfm(TX_16X16, 16x16);
-    init_idct(TX_32X32, idct_idct_32x32);
-    init_idct(4 /* lossless */, iwht_iwht_4x4);
-
-#undef init_itxfm
-#undef init_idct
-}
-
-static av_always_inline void loop_filter(uint8_t *dst, ptrdiff_t stride,
-                                         int E, int I, int H,
-                                         ptrdiff_t stridea, ptrdiff_t strideb,
-                                         int wd)
-{
-    int i;
-
-    for (i = 0; i < 8; i++, dst += stridea) {
-        int p7, p6, p5, p4;
-        int p3 = dst[strideb * -4], p2 = dst[strideb * -3];
-        int p1 = dst[strideb * -2], p0 = dst[strideb * -1];
-        int q0 = dst[strideb * +0], q1 = dst[strideb * +1];
-        int q2 = dst[strideb * +2], q3 = dst[strideb * +3];
-        int q4, q5, q6, q7;
-        int fm = FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
-                 FFABS(p1 - p0) <= I && FFABS(q1 - q0) <= I &&
-                 FFABS(q2 - q1) <= I && FFABS(q3 - q2) <= I &&
-                 FFABS(p0 - q0) * 2 + (FFABS(p1 - q1) >> 1) <= E;
-        int flat8out, flat8in;
-
-        if (!fm)
-            continue;
-
-        if (wd >= 16) {
-            p7 = dst[strideb * -8];
-            p6 = dst[strideb * -7];
-            p5 = dst[strideb * -6];
-            p4 = dst[strideb * -5];
-            q4 = dst[strideb * +4];
-            q5 = dst[strideb * +5];
-            q6 = dst[strideb * +6];
-            q7 = dst[strideb * +7];
-
-            flat8out = FFABS(p7 - p0) <= 1 && FFABS(p6 - p0) <= 1 &&
-                       FFABS(p5 - p0) <= 1 && FFABS(p4 - p0) <= 1 &&
-                       FFABS(q4 - q0) <= 1 && FFABS(q5 - q0) <= 1 &&
-                       FFABS(q6 - q0) <= 1 && FFABS(q7 - q0) <= 1;
-        }
-
-        if (wd >= 8)
-            flat8in = FFABS(p3 - p0) <= 1 && FFABS(p2 - p0) <= 1 &&
-                      FFABS(p1 - p0) <= 1 && FFABS(q1 - q0) <= 1 &&
-                      FFABS(q2 - q0) <= 1 && FFABS(q3 - q0) <= 1;
-
-        if (wd >= 16 && flat8out && flat8in) {
-            dst[strideb * -7] = (p7 + p7 + p7 + p7 + p7 + p7 + p7 + p6 * 2 +
-                                 p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
-            dst[strideb * -6] = (p7 + p7 + p7 + p7 + p7 + p7 + p6 + p5 * 2 +
-                                 p4 + p3 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
-            dst[strideb * -5] = (p7 + p7 + p7 + p7 + p7 + p6 + p5 + p4 * 2 +
-                                 p3 + p2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
-            dst[strideb * -4] = (p7 + p7 + p7 + p7 + p6 + p5 + p4 + p3 * 2 +
-                                 p2 + p1 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
-            dst[strideb * -3] = (p7 + p7 + p7 + p6 + p5 + p4 + p3 + p2 * 2 +
-                                 p1 + p0 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
-            dst[strideb * -2] = (p7 + p7 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
-                                 p0 + q0 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
-            dst[strideb * -1] = (p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
-                                 q0 + q1 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
-            dst[strideb * +0] = (p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
-                                 q1 + q2 + q3 + q4 + q5 + q6 + q7 + 8) >> 4;
-            dst[strideb * +1] = (p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
-                                 q2 + q3 + q4 + q5 + q6 + q7 + q7 + 8) >> 4;
-            dst[strideb * +2] = (p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
-                                 q3 + q4 + q5 + q6 + q7 + q7 + q7 + 8) >> 4;
-            dst[strideb * +3] = (p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 +
-                                 q4 + q5 + q6 + q7 + q7 + q7 + q7 + 8) >> 4;
-            dst[strideb * +4] = (p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 +
-                                 q5 + q6 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
-            dst[strideb * +5] = (p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 +
-                                 q6 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
-            dst[strideb * +6] = (p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 +
-                                 q7 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
-        } else if (wd >= 8 && flat8in) {
-            dst[strideb * -3] = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
-            dst[strideb * -2] = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
-            dst[strideb * -1] = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
-            dst[strideb * +0] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
-            dst[strideb * +1] = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3;
-            dst[strideb * +2] = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3;
-        } else {
-            int hev = FFABS(p1 - p0) > H || FFABS(q1 - q0) > H;
-
-            if (hev) {
-                int f = av_clip_int8(3 * (q0 - p0) + av_clip_int8(p1 - q1));
-                int f1 = FFMIN(f + 4, 127) >> 3;
-                int f2 = FFMIN(f + 3, 127) >> 3;
-
-                dst[strideb * -1] = av_clip_uint8(p0 + f2);
-                dst[strideb * +0] = av_clip_uint8(q0 - f1);
-            } else {
-                int f = av_clip_int8(3 * (q0 - p0));
-                int f1 = FFMIN(f + 4, 127) >> 3;
-                int f2 = FFMIN(f + 3, 127) >> 3;
-
-                dst[strideb * -1] = av_clip_uint8(p0 + f2);
-                dst[strideb * +0] = av_clip_uint8(q0 - f1);
-
-                f = (f1 + 1) >> 1;
-                dst[strideb * -2] = av_clip_uint8(p1 + f);
-                dst[strideb * +1] = av_clip_uint8(q1 - f);
-            }
-        }
-    }
-}
-
-#define lf_8_fn(dir, wd, stridea, strideb)                                  \
-static void loop_filter_ ## dir ## _ ## wd  ## _8_c(uint8_t *dst,           \
-                                                    ptrdiff_t stride,       \
-                                                    int E, int I, int H)    \
-{                                                                           \
-    loop_filter(dst, stride, E, I, H, stridea, strideb, wd);                \
-}
-
-#define lf_8_fns(wd)          \
-    lf_8_fn(h, wd, stride, 1) \
-    lf_8_fn(v, wd, 1, stride)
-
-lf_8_fns(4)
-lf_8_fns(8)
-lf_8_fns(16)
-
-#undef lf_8_fn
-#undef lf_8_fns
-
-#define lf_16_fn(dir, stridea)                                          \
-static void loop_filter_ ## dir ## _16_16_c(uint8_t *dst,               \
-                                            ptrdiff_t stride,           \
-                                            int E, int I, int H)        \
-{                                                                       \
-    loop_filter_ ## dir ## _16_8_c(dst, stride, E, I, H);               \
-    loop_filter_ ## dir ## _16_8_c(dst + 8 * stridea, stride, E, I, H); \
-}
-
-lf_16_fn(h, stride)
-lf_16_fn(v, 1)
-
-#undef lf_16_fn
-
-#define lf_mix_fn(dir, wd1, wd2, stridea)                                     \
-static void loop_filter_ ## dir ## _ ## wd1 ## wd2 ## _16_c(uint8_t *dst,     \
-                                                            ptrdiff_t stride, \
-                                                            int E, int I,     \
-                                                            int H)            \
-{                                                                             \
-    loop_filter_ ## dir ## _ ## wd1 ## _8_c(dst, stride, E & 0xff,            \
-                                            I & 0xff, H & 0xff);              \
-    loop_filter_ ## dir ## _ ## wd2 ## _8_c(dst + 8 * stridea, stride,        \
-                                            E >> 8, I >> 8, H >> 8);          \
-}
-
-#define lf_mix_fns(wd1, wd2)       \
-    lf_mix_fn(h, wd1, wd2, stride) \
-    lf_mix_fn(v, wd1, wd2, 1)
-
-lf_mix_fns(4, 4)
-lf_mix_fns(4, 8)
-lf_mix_fns(8, 4)
-lf_mix_fns(8, 8)
-
-#undef lf_mix_fn
-#undef lf_mix_fns
-
-static av_cold void vp9dsp_loopfilter_init(VP9DSPContext *dsp)
-{
-    dsp->loop_filter_8[0][0] = loop_filter_h_4_8_c;
-    dsp->loop_filter_8[0][1] = loop_filter_v_4_8_c;
-    dsp->loop_filter_8[1][0] = loop_filter_h_8_8_c;
-    dsp->loop_filter_8[1][1] = loop_filter_v_8_8_c;
-    dsp->loop_filter_8[2][0] = loop_filter_h_16_8_c;
-    dsp->loop_filter_8[2][1] = loop_filter_v_16_8_c;
-
-    dsp->loop_filter_16[0] = loop_filter_h_16_16_c;
-    dsp->loop_filter_16[1] = loop_filter_v_16_16_c;
-
-    dsp->loop_filter_mix2[0][0][0] = loop_filter_h_44_16_c;
-    dsp->loop_filter_mix2[0][0][1] = loop_filter_v_44_16_c;
-    dsp->loop_filter_mix2[0][1][0] = loop_filter_h_48_16_c;
-    dsp->loop_filter_mix2[0][1][1] = loop_filter_v_48_16_c;
-    dsp->loop_filter_mix2[1][0][0] = loop_filter_h_84_16_c;
-    dsp->loop_filter_mix2[1][0][1] = loop_filter_v_84_16_c;
-    dsp->loop_filter_mix2[1][1][0] = loop_filter_h_88_16_c;
-    dsp->loop_filter_mix2[1][1][1] = loop_filter_v_88_16_c;
-}
-
-static av_always_inline void copy_c(uint8_t *dst, const uint8_t *src,
-                                    ptrdiff_t dst_stride,
-                                    ptrdiff_t src_stride,
-                                    int w, int h)
-{
-    do {
-        memcpy(dst, src, w);
-
-        dst += dst_stride;
-        src += src_stride;
-    } while (--h);
-}
-
-static av_always_inline void avg_c(uint8_t *dst, const uint8_t *src,
-                                   ptrdiff_t dst_stride,
-                                   ptrdiff_t src_stride,
-                                   int w, int h)
-{
-    do {
-        int x;
-
-        for (x = 0; x < w; x += 4)
-            AV_WN32A(&dst[x], rnd_avg32(AV_RN32A(&dst[x]), AV_RN32(&src[x])));
-
-        dst += dst_stride;
-        src += src_stride;
-    } while (--h);
-}
-
-#define fpel_fn(type, sz)                                      \
-static void type ## sz ## _c(uint8_t *dst, const uint8_t *src, \
-                             ptrdiff_t dst_stride,             \
-                             ptrdiff_t src_stride,             \
-                             int h, int mx, int my)            \
-{                                                              \
-    type ## _c(dst, src, dst_stride, src_stride, sz, h);       \
-}
-
-#define copy_avg_fn(sz) \
-    fpel_fn(copy, sz)   \
-    fpel_fn(avg, sz)
-
-copy_avg_fn(64)
-copy_avg_fn(32)
-copy_avg_fn(16)
-copy_avg_fn(8)
-copy_avg_fn(4)
-
-#undef fpel_fn
-#undef copy_avg_fn
-
-static const int8_t vp9_subpel_filters[3][15][8] = {
-    [FILTER_8TAP_REGULAR] = {
-        {  0,  1,  -5, 126,   8,  -3,  1,  0 },
-        { -1,  3, -10, 122,  18,  -6,  2,  0 },
-        { -1,  4, -13, 118,  27,  -9,  3, -1 },
-        { -1,  4, -16, 112,  37, -11,  4, -1 },
-        { -1,  5, -18, 105,  48, -14,  4, -1 },
-        { -1,  5, -19,  97,  58, -16,  5, -1 },
-        { -1,  6, -19,  88,  68, -18,  5, -1 },
-        { -1,  6, -19,  78,  78, -19,  6, -1 },
-        { -1,  5, -18,  68,  88, -19,  6, -1 },
-        { -1,  5, -16,  58,  97, -19,  5, -1 },
-        { -1,  4, -14,  48, 105, -18,  5, -1 },
-        { -1,  4, -11,  37, 112, -16,  4, -1 },
-        { -1,  3,  -9,  27, 118, -13,  4, -1 },
-        {  0,  2,  -6,  18, 122, -10,  3, -1 },
-        {  0,  1,  -3,   8, 126,  -5,  1,  0 },
-    }, [FILTER_8TAP_SHARP] = {
-        { -1,  3,  -7, 127,   8,  -3,  1,  0 },
-        { -2,  5, -13, 125,  17,  -6,  3, -1 },
-        { -3,  7, -17, 121,  27, -10,  5, -2 },
-        { -4,  9, -20, 115,  37, -13,  6, -2 },
-        { -4, 10, -23, 108,  48, -16,  8, -3 },
-        { -4, 10, -24, 100,  59, -19,  9, -3 },
-        { -4, 11, -24,  90,  70, -21, 10, -4 },
-        { -4, 11, -23,  80,  80, -23, 11, -4 },
-        { -4, 10, -21,  70,  90, -24, 11, -4 },
-        { -3,  9, -19,  59, 100, -24, 10, -4 },
-        { -3,  8, -16,  48, 108, -23, 10, -4 },
-        { -2,  6, -13,  37, 115, -20,  9, -4 },
-        { -2,  5, -10,  27, 121, -17,  7, -3 },
-        { -1,  3,  -6,  17, 125, -13,  5, -2 },
-        {  0,  1,  -3,   8, 127,  -7,  3, -1 },
-    }, [FILTER_8TAP_SMOOTH] = {
-        { -3, -1,  32,  64,  38,   1, -3,  0 },
-        { -2, -2,  29,  63,  41,   2, -3,  0 },
-        { -2, -2,  26,  63,  43,   4, -4,  0 },
-        { -2, -3,  24,  62,  46,   5, -4,  0 },
-        { -2, -3,  21,  60,  49,   7, -4,  0 },
-        { -1, -4,  18,  59,  51,   9, -4,  0 },
-        { -1, -4,  16,  57,  53,  12, -4, -1 },
-        { -1, -4,  14,  55,  55,  14, -4, -1 },
-        { -1, -4,  12,  53,  57,  16, -4, -1 },
-        {  0, -4,   9,  51,  59,  18, -4, -1 },
-        {  0, -4,   7,  49,  60,  21, -3, -2 },
-        {  0, -4,   5,  46,  62,  24, -3, -2 },
-        {  0, -4,   4,  43,  63,  26, -2, -2 },
-        {  0, -3,   2,  41,  63,  29, -2, -2 },
-        {  0, -3,   1,  38,  64,  32, -1, -3 },
-    }
-};
-
-#define FILTER_8TAP(src, x, F, stride)              \
-    av_clip_uint8((F[0] * src[x + -3 * stride] +    \
-                   F[1] * src[x + -2 * stride] +    \
-                   F[2] * src[x + -1 * stride] +    \
-                   F[3] * src[x + +0 * stride] +    \
-                   F[4] * src[x + +1 * stride] +    \
-                   F[5] * src[x + +2 * stride] +    \
-                   F[6] * src[x + +3 * stride] +    \
-                   F[7] * src[x + +4 * stride] + 64) >> 7)
-
-static av_always_inline void do_8tap_1d_c(uint8_t *dst, const uint8_t *src,
-                                          ptrdiff_t dst_stride,
-                                          ptrdiff_t src_stride,
-                                          int w, int h, ptrdiff_t ds,
-                                          const int8_t *filter, int avg)
-{
-    do {
-        int x;
-
-        for (x = 0; x < w; x++)
-            if (avg)
-                dst[x] = (dst[x] + FILTER_8TAP(src, x, filter, ds) + 1) >> 1;
-            else
-                dst[x] = FILTER_8TAP(src, x, filter, ds);
-
-        dst += dst_stride;
-        src += src_stride;
-    } while (--h);
-}
-
-#define filter_8tap_1d_fn(opn, opa, dir, ds)                                \
-static av_noinline void opn ## _8tap_1d_ ## dir ## _c(uint8_t *dst,         \
-                                                      const uint8_t *src,   \
-                                                      ptrdiff_t dst_stride, \
-                                                      ptrdiff_t src_stride, \
-                                                      int w, int h,         \
-                                                      const int8_t *filter) \
-{                                                                           \
-    do_8tap_1d_c(dst, src, dst_stride, src_stride, w, h, ds, filter, opa);  \
-}
-
-filter_8tap_1d_fn(put, 0, v, src_stride)
-filter_8tap_1d_fn(put, 0, h, 1)
-filter_8tap_1d_fn(avg, 1, v, src_stride)
-filter_8tap_1d_fn(avg, 1, h, 1)
-
-#undef filter_8tap_1d_fn
-
-static av_always_inline void do_8tap_2d_c(uint8_t *dst, const uint8_t *src,
-                                          ptrdiff_t dst_stride,
-                                          ptrdiff_t src_stride,
-                                          int w, int h, const int8_t *filterx,
-                                          const int8_t *filtery, int avg)
-{
-    int tmp_h = h + 7;
-    uint8_t tmp[64 * 71], *tmp_ptr = tmp;
-
-    src -= src_stride * 3;
-    do {
-        int x;
-
-        for (x = 0; x < w; x++)
-            tmp_ptr[x] = FILTER_8TAP(src, x, filterx, 1);
-
-        tmp_ptr += 64;
-        src     += src_stride;
-    } while (--tmp_h);
-
-    tmp_ptr = tmp + 64 * 3;
-    do {
-        int x;
-
-        for (x = 0; x < w; x++)
-            if (avg)
-                dst[x] = (dst[x] + FILTER_8TAP(tmp_ptr, x, filtery, 64) + 1) >> 1;
-            else
-                dst[x] = FILTER_8TAP(tmp_ptr, x, filtery, 64);
-
-        tmp_ptr += 64;
-        dst += dst_stride;
-    } while (--h);
-}
-
-#define filter_8tap_2d_fn(opn, opa)                                     \
-static av_noinline void opn ## _8tap_2d_hv_c(uint8_t *dst,              \
-                                             const uint8_t *src,        \
-                                             ptrdiff_t dst_stride,      \
-                                             ptrdiff_t src_stride,      \
-                                             int w, int h,              \
-                                             const int8_t *filterx,     \
-                                             const int8_t *filtery)     \
-{                                                                       \
-    do_8tap_2d_c(dst, src, dst_stride, src_stride,                      \
-                 w, h, filterx, filtery, opa);                          \
-}
-
-filter_8tap_2d_fn(put, 0)
-filter_8tap_2d_fn(avg, 1)
-
-#undef filter_8tap_2d_fn
-
-#undef FILTER_8TAP
-
-#define filter_fn_1d(sz, dir, dir_m, type, type_idx, avg)                   \
-static void                                                                 \
-avg ## _8tap_ ## type ## _ ## sz ## dir ## _c(uint8_t *dst,                 \
-                                              const uint8_t *src,           \
-                                              ptrdiff_t dst_stride,         \
-                                              ptrdiff_t src_stride,         \
-                                              int h, int mx, int my)        \
-{                                                                           \
-    avg ## _8tap_1d_ ## dir ## _c(dst, src, dst_stride, src_stride, sz, h,  \
-                                  vp9_subpel_filters[type_idx][dir_m - 1]); \
-}
-
-#define filter_fn_2d(sz, type, type_idx, avg)                               \
-static void avg ## _8tap_ ## type ## _ ## sz ## hv_c(uint8_t *dst,          \
-                                                     const uint8_t *src,    \
-                                                     ptrdiff_t dst_stride,  \
-                                                     ptrdiff_t src_stride,  \
-                                                     int h, int mx, int my) \
-{                                                                           \
-    avg ## _8tap_2d_hv_c(dst, src, dst_stride, src_stride, sz, h,           \
-                         vp9_subpel_filters[type_idx][mx - 1],              \
-                         vp9_subpel_filters[type_idx][my - 1]);             \
-}
-
-#define FILTER_BILIN(src, x, mxy, stride)                       \
-    (src[x] + ((mxy * (src[x + stride] - src[x]) + 8) >> 4))
-
-static av_always_inline void do_bilin_1d_c(uint8_t *dst,
-                                           const uint8_t *src,
-                                           ptrdiff_t dst_stride,
-                                           ptrdiff_t src_stride,
-                                           int w, int h, ptrdiff_t ds,
-                                           int mxy, int avg)
-{
-    do {
-        int x;
-
-        for (x = 0; x < w; x++)
-            if (avg)
-                dst[x] = (dst[x] + FILTER_BILIN(src, x, mxy, ds) + 1) >> 1;
-            else
-                dst[x] = FILTER_BILIN(src, x, mxy, ds);
-
-        dst += dst_stride;
-        src += src_stride;
-    } while (--h);
-}
-
-#define bilin_1d_fn(opn, opa, dir, ds)                                        \
-static av_noinline void opn ## _bilin_1d_ ## dir ## _c(uint8_t *dst,          \
-                                                       const uint8_t *src,    \
-                                                       ptrdiff_t dst_stride,  \
-                                                       ptrdiff_t src_stride,  \
-                                                       int w, int h, int mxy) \
-{                                                                             \
-    do_bilin_1d_c(dst, src, dst_stride, src_stride, w, h, ds, mxy, opa);      \
-}
-
-bilin_1d_fn(put, 0, v, src_stride)
-bilin_1d_fn(put, 0, h, 1)
-bilin_1d_fn(avg, 1, v, src_stride)
-bilin_1d_fn(avg, 1, h, 1)
-
-#undef bilin_1d_fn
-
-static av_always_inline void do_bilin_2d_c(uint8_t *dst,
-                                           const uint8_t *src,
-                                           ptrdiff_t dst_stride,
-                                           ptrdiff_t src_stride,
-                                           int w, int h, int mx, int my,
-                                           int avg)
-{
-    uint8_t tmp[64 * 65], *tmp_ptr = tmp;
-    int tmp_h = h + 1;
-
-    do {
-        int x;
-
-        for (x = 0; x < w; x++)
-            tmp_ptr[x] = FILTER_BILIN(src, x, mx, 1);
-
-        tmp_ptr += 64;
-        src     += src_stride;
-    } while (--tmp_h);
-
-    tmp_ptr = tmp;
-    do {
-        int x;
-
-        for (x = 0; x < w; x++)
-            if (avg)
-                dst[x] = (dst[x] + FILTER_BILIN(tmp_ptr, x, my, 64) + 1) >> 1;
-            else
-                dst[x] = FILTER_BILIN(tmp_ptr, x, my, 64);
-
-        tmp_ptr += 64;
-        dst += dst_stride;
-    } while (--h);
-}
-
-#define bilin_2d_fn(opn, opa)                                           \
-static av_noinline void opn ## _bilin_2d_hv_c(uint8_t *dst,             \
-                                              const uint8_t *src,       \
-                                              ptrdiff_t dst_stride,     \
-                                              ptrdiff_t src_stride,     \
-                                              int w, int h,             \
-                                              int mx, int my)           \
-{                                                                       \
-    do_bilin_2d_c(dst, src, dst_stride, src_stride, w, h, mx, my, opa); \
-}
-
-bilin_2d_fn(put, 0)
-bilin_2d_fn(avg, 1)
-
-#undef bilin_2d_fn
-
-#undef FILTER_BILIN
-
-#define bilinf_fn_1d(sz, dir, dir_m, avg)                               \
-static void avg ## _bilin_ ## sz ## dir ## _c(uint8_t *dst,             \
-                                              const uint8_t *src,       \
-                                              ptrdiff_t dst_stride,     \
-                                              ptrdiff_t src_stride,     \
-                                              int h, int mx, int my)    \
-{                                                                       \
-    avg ## _bilin_1d_ ## dir ## _c(dst, src, dst_stride, src_stride,    \
-                                   sz, h, dir_m);                       \
-}
-
-#define bilinf_fn_2d(sz, avg)                                        \
-static void avg ## _bilin_ ## sz ## hv_c(uint8_t *dst,               \
-                                         const uint8_t *src,         \
-                                         ptrdiff_t dst_stride,       \
-                                         ptrdiff_t src_stride,       \
-                                         int h, int mx, int my)      \
-{                                                                    \
-    avg ## _bilin_2d_hv_c(dst, src, dst_stride, src_stride,          \
-                          sz, h, mx, my);                            \
-}
-
-#define filter_fn(sz, avg)                                     \
-    filter_fn_1d(sz, h, mx, regular, FILTER_8TAP_REGULAR, avg) \
-    filter_fn_1d(sz, v, my, regular, FILTER_8TAP_REGULAR, avg) \
-    filter_fn_2d(sz, regular, FILTER_8TAP_REGULAR, avg)        \
-    filter_fn_1d(sz, h, mx, smooth, FILTER_8TAP_SMOOTH, avg)   \
-    filter_fn_1d(sz, v, my, smooth, FILTER_8TAP_SMOOTH, avg)   \
-    filter_fn_2d(sz, smooth, FILTER_8TAP_SMOOTH, avg)          \
-    filter_fn_1d(sz, h, mx, sharp, FILTER_8TAP_SHARP, avg)     \
-    filter_fn_1d(sz, v, my, sharp, FILTER_8TAP_SHARP, avg)     \
-    filter_fn_2d(sz, sharp, FILTER_8TAP_SHARP, avg)            \
-    bilinf_fn_1d(sz, h, mx, avg)                               \
-    bilinf_fn_1d(sz, v, my, avg)                               \
-    bilinf_fn_2d(sz, avg)
-
-#define filter_fn_set(avg) \
-    filter_fn(64, avg)     \
-    filter_fn(32, avg)     \
-    filter_fn(16, avg)     \
-    filter_fn(8, avg)      \
-    filter_fn(4, avg)
-
-filter_fn_set(put)
-filter_fn_set(avg)
-
-#undef filter_fn
-#undef filter_fn_set
-#undef filter_fn_1d
-#undef filter_fn_2d
-#undef bilinf_fn_1d
-#undef bilinf_fn_2d
-
-static av_cold void vp9dsp_mc_init(VP9DSPContext *dsp)
-{
-#define init_fpel(idx1, idx2, sz, type)                                \
-    dsp->mc[idx1][FILTER_8TAP_SMOOTH][idx2][0][0]  = type ## sz ## _c; \
-    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = type ## sz ## _c; \
-    dsp->mc[idx1][FILTER_8TAP_SHARP][idx2][0][0]   = type ## sz ## _c; \
-    dsp->mc[idx1][FILTER_BILINEAR][idx2][0][0]     = type ## sz ## _c
-
-#define init_copy_avg(idx, sz)          \
-    init_fpel(idx, 0, sz, copy);        \
-    init_fpel(idx, 1, sz, avg)
-
-    init_copy_avg(0, 64);
-    init_copy_avg(1, 32);
-    init_copy_avg(2, 16);
-    init_copy_avg(3,  8);
-    init_copy_avg(4,  4);
-
-#undef init_copy_avg
-#undef init_fpel
-
-#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type)             \
-    dsp->mc[idx1][FILTER_8TAP_SMOOTH][idx2][idxh][idxv]  = type ## _8tap_smooth_  ## sz ## dir ## _c; \
-    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type ## _8tap_regular_ ## sz ## dir ## _c; \
-    dsp->mc[idx1][FILTER_8TAP_SHARP][idx2][idxh][idxv]   = type ## _8tap_sharp_   ## sz ## dir ## _c; \
-    dsp->mc[idx1][FILTER_BILINEAR][idx2][idxh][idxv]     = type ## _bilin_        ## sz ## dir ## _c
-
-#define init_subpel2(idx, idxh, idxv, dir, type)     \
-    init_subpel1(0, idx, idxh, idxv, 64, dir, type); \
-    init_subpel1(1, idx, idxh, idxv, 32, dir, type); \
-    init_subpel1(2, idx, idxh, idxv, 16, dir, type); \
-    init_subpel1(3, idx, idxh, idxv,  8, dir, type); \
-    init_subpel1(4, idx, idxh, idxv,  4, dir, type)
-
-#define init_subpel3(idx, type)         \
-    init_subpel2(idx, 1, 1, hv, type);  \
-    init_subpel2(idx, 0, 1, v, type);   \
-    init_subpel2(idx, 1, 0, h, type)
-
-    init_subpel3(0, put);
-    init_subpel3(1, avg);
-
-#undef init_subpel1
-#undef init_subpel2
-#undef init_subpel3
-}
-
-av_cold void ff_vp9dsp_init(VP9DSPContext *dsp)
-{
-    vp9dsp_intrapred_init(dsp);
-    vp9dsp_itxfm_init(dsp);
-    vp9dsp_loopfilter_init(dsp);
-    vp9dsp_mc_init(dsp);
-
-    if (ARCH_X86)
-        ff_vp9dsp_init_x86(dsp);
+    if (ARCH_X86) ff_vp9dsp_init_x86(dsp, bpp, bitexact);
+    if (ARCH_MIPS) ff_vp9dsp_init_mips(dsp, bpp);
 }
diff --git a/libavcodec/vp9dsp.h b/libavcodec/vp9dsp.h
new file mode 100644
index 0000000..733f5bf
--- /dev/null
+++ b/libavcodec/vp9dsp.h
@@ -0,0 +1,132 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_VP9DSP_H
+#define AVCODEC_VP9DSP_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "vp9.h"
+
+typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
+                            const uint8_t *ref, ptrdiff_t ref_stride,
+                            int h, int mx, int my);
+typedef void (*vp9_scaled_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
+                                   const uint8_t *ref, ptrdiff_t ref_stride,
+                                   int h, int mx, int my, int dx, int dy);
+
+typedef struct VP9DSPContext {
+    /*
+     * dimension 1: 0=4x4, 1=8x8, 2=16x16, 3=32x32
+     * dimension 2: intra prediction modes
+     *
+     * dst/left/top is aligned by transform-size (i.e. 4, 8, 16 or 32 pixels)
+     * stride is aligned by 16 pixels
+     * top[-1] is top/left; top[4,7] is top-right for 4x4
+     */
+    // FIXME(rbultje) maybe replace left/top pointers with HAVE_TOP/
+    // HAVE_LEFT/HAVE_TOPRIGHT flags instead, and then handle it in-place?
+    // also needs to fit in with what H.264/VP8/etc do
+    void (*intra_pred[N_TXFM_SIZES][N_INTRA_PRED_MODES])(uint8_t *dst,
+                                                         ptrdiff_t stride,
+                                                         const uint8_t *left,
+                                                         const uint8_t *top);
+
+    /*
+     * dimension 1: 0=4x4, 1=8x8, 2=16x16, 3=32x32, 4=lossless (3-4=dct only)
+     * dimension 2: 0=dct/dct, 1=dct/adst, 2=adst/dct, 3=adst/adst
+     *
+     * dst is aligned by transform-size (i.e. 4, 8, 16 or 32 pixels)
+     * stride is aligned by 16 pixels
+     * block is 16-byte aligned
+     * eob indicates the position (+1) of the last non-zero coefficient,
+     * in scan-order. This can be used to write faster versions, e.g. a
+     * dc-only 4x4/8x8/16x16/32x32, or a 4x4-only (eob<10) 8x8/16x16/32x32,
+     * etc.
+     */
+    // FIXME also write idct_add_block() versions for whole (inter) pred
+    // blocks, so we can do 2 4x4s at once
+    void (*itxfm_add[N_TXFM_SIZES + 1][N_TXFM_TYPES])(uint8_t *dst,
+                                                      ptrdiff_t stride,
+                                                      int16_t *block, int eob);
+
+    /*
+     * dimension 1: width of filter (0=4, 1=8, 2=16)
+     * dimension 2: 0=col-edge filter (h), 1=row-edge filter (v)
+     *
+     * dst/stride are aligned by 8
+     */
+    void (*loop_filter_8[3][2])(uint8_t *dst, ptrdiff_t stride,
+                                int mb_lim, int lim, int hev_thr);
+
+    /*
+     * dimension 1: 0=col-edge filter (h), 1=row-edge filter (v)
+     *
+     * The width of filter is assumed to be 16; dst/stride are aligned by 16
+     */
+    void (*loop_filter_16[2])(uint8_t *dst, ptrdiff_t stride,
+                              int mb_lim, int lim, int hev_thr);
+
+    /*
+     * dimension 1/2: width of filter (0=4, 1=8) for each filter half
+     * dimension 3: 0=col-edge filter (h), 1=row-edge filter (v)
+     *
+     * dst/stride are aligned by operation size
+     * this basically calls loop_filter_8[d1][d3](), followed by
+     * loop_filter_8[d2][d3]() on the next 8 pixels
+     * mb_lim/lim/hev_thr contain two values in the lowest two bytes of the
+     * integer.
+     */
+    // FIXME perhaps a mix4 that operates on 32px (for AVX2)
+    void (*loop_filter_mix2[2][2][2])(uint8_t *dst, ptrdiff_t stride,
+                                      int mb_lim, int lim, int hev_thr);
+
+    /*
+     * dimension 1: hsize (0: 64, 1: 32, 2: 16, 3: 8, 4: 4)
+     * dimension 2: filter type (0: smooth, 1: regular, 2: sharp, 3: bilin)
+     * dimension 3: averaging type (0: put, 1: avg)
+     * dimension 4: x subpel interpolation (0: none, 1: 8tap/bilin)
+     * dimension 5: y subpel interpolation (0: none, 1: 8tap/bilin)
+     *
+     * dst/stride are aligned by hsize
+     */
+    vp9_mc_func mc[5][4][2][2][2];
+
+    /*
+     * for scaled MC, the first 3 dimensions are identical to above; the other
+     * two don't exist since subpel interpolation changes per step size.
+     */
+    vp9_scaled_mc_func smc[5][4][2];
+} VP9DSPContext;
+
+void ff_vp9dsp_init(VP9DSPContext *dsp, int bpp, int bitexact);
+
+void ff_vp9dsp_init_8(VP9DSPContext *dsp);
+void ff_vp9dsp_init_10(VP9DSPContext *dsp);
+void ff_vp9dsp_init_12(VP9DSPContext *dsp);
+
+void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact);
+void ff_vp9dsp_init_mips(VP9DSPContext *dsp, int bpp);
+
+#endif /* AVCODEC_VP9DSP_H */
diff --git a/libavcodec/vp9dsp_10bpp.c b/libavcodec/vp9dsp_10bpp.c
new file mode 100644
index 0000000..62ce182
--- /dev/null
+++ b/libavcodec/vp9dsp_10bpp.c
@@ -0,0 +1,26 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BIT_DEPTH 10
+#define dctint int64_t
+#include "vp9dsp_template.c"
diff --git a/libavcodec/vp9dsp_12bpp.c b/libavcodec/vp9dsp_12bpp.c
new file mode 100644
index 0000000..2f36471
--- /dev/null
+++ b/libavcodec/vp9dsp_12bpp.c
@@ -0,0 +1,26 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BIT_DEPTH 12
+#define dctint int64_t
+#include "vp9dsp_template.c"
diff --git a/libavcodec/vp9dsp_8bpp.c b/libavcodec/vp9dsp_8bpp.c
new file mode 100644
index 0000000..4b219b0
--- /dev/null
+++ b/libavcodec/vp9dsp_8bpp.c
@@ -0,0 +1,26 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BIT_DEPTH 8
+#define dctint int
+#include "vp9dsp_template.c"
diff --git a/libavcodec/vp9dsp_template.c b/libavcodec/vp9dsp_template.c
new file mode 100644
index 0000000..4d810fe
--- /dev/null
+++ b/libavcodec/vp9dsp_template.c
@@ -0,0 +1,2601 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+#include "bit_depth_template.c"
+#include "vp9dsp.h"
+
+#if BIT_DEPTH != 12
+
+// FIXME see whether we can merge parts of this (perhaps at least 4x4 and 8x8)
+// back with h264pred.[ch]
+
+static void vert_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                       const uint8_t *left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    pixel4 p4 = AV_RN4PA(top);
+
+    stride /= sizeof(pixel);
+    AV_WN4PA(dst + stride * 0, p4);
+    AV_WN4PA(dst + stride * 1, p4);
+    AV_WN4PA(dst + stride * 2, p4);
+    AV_WN4PA(dst + stride * 3, p4);
+}
+
+static void vert_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                       const uint8_t *left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    pixel4 p4a = AV_RN4PA(top + 0);
+    pixel4 p4b = AV_RN4PA(top + 4);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 8; y++) {
+        AV_WN4PA(dst + 0, p4a);
+        AV_WN4PA(dst + 4, p4b);
+        dst += stride;
+    }
+}
+
+static void vert_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    pixel4 p4a = AV_RN4PA(top +  0);
+    pixel4 p4b = AV_RN4PA(top +  4);
+    pixel4 p4c = AV_RN4PA(top +  8);
+    pixel4 p4d = AV_RN4PA(top + 12);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 16; y++) {
+        AV_WN4PA(dst +  0, p4a);
+        AV_WN4PA(dst +  4, p4b);
+        AV_WN4PA(dst +  8, p4c);
+        AV_WN4PA(dst + 12, p4d);
+        dst += stride;
+    }
+}
+
+static void vert_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    pixel4 p4a = AV_RN4PA(top +  0);
+    pixel4 p4b = AV_RN4PA(top +  4);
+    pixel4 p4c = AV_RN4PA(top +  8);
+    pixel4 p4d = AV_RN4PA(top + 12);
+    pixel4 p4e = AV_RN4PA(top + 16);
+    pixel4 p4f = AV_RN4PA(top + 20);
+    pixel4 p4g = AV_RN4PA(top + 24);
+    pixel4 p4h = AV_RN4PA(top + 28);
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 32; y++) {
+        AV_WN4PA(dst +  0, p4a);
+        AV_WN4PA(dst +  4, p4b);
+        AV_WN4PA(dst +  8, p4c);
+        AV_WN4PA(dst + 12, p4d);
+        AV_WN4PA(dst + 16, p4e);
+        AV_WN4PA(dst + 20, p4f);
+        AV_WN4PA(dst + 24, p4g);
+        AV_WN4PA(dst + 28, p4h);
+        dst += stride;
+    }
+}
+
+static void hor_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                      const uint8_t *_left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+
+    stride /= sizeof(pixel);
+    AV_WN4PA(dst + stride * 0, PIXEL_SPLAT_X4(left[3]));
+    AV_WN4PA(dst + stride * 1, PIXEL_SPLAT_X4(left[2]));
+    AV_WN4PA(dst + stride * 2, PIXEL_SPLAT_X4(left[1]));
+    AV_WN4PA(dst + stride * 3, PIXEL_SPLAT_X4(left[0]));
+}
+
+static void hor_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                      const uint8_t *_left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 8; y++) {
+        pixel4 p4 = PIXEL_SPLAT_X4(left[7 - y]);
+
+        AV_WN4PA(dst + 0, p4);
+        AV_WN4PA(dst + 4, p4);
+        dst += stride;
+    }
+}
+
+static void hor_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                        const uint8_t *_left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 16; y++) {
+        pixel4 p4 = PIXEL_SPLAT_X4(left[15 - y]);
+
+        AV_WN4PA(dst +  0, p4);
+        AV_WN4PA(dst +  4, p4);
+        AV_WN4PA(dst +  8, p4);
+        AV_WN4PA(dst + 12, p4);
+        dst += stride;
+    }
+}
+
+static void hor_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                        const uint8_t *_left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    int y;
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 32; y++) {
+        pixel4 p4 = PIXEL_SPLAT_X4(left[31 - y]);
+
+        AV_WN4PA(dst +  0, p4);
+        AV_WN4PA(dst +  4, p4);
+        AV_WN4PA(dst +  8, p4);
+        AV_WN4PA(dst + 12, p4);
+        AV_WN4PA(dst + 16, p4);
+        AV_WN4PA(dst + 20, p4);
+        AV_WN4PA(dst + 24, p4);
+        AV_WN4PA(dst + 28, p4);
+        dst += stride;
+    }
+}
+
+#endif /* BIT_DEPTH != 12 */
+
+static void tm_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                     const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    const pixel *top = (const pixel *) _top;
+    int y, tl = top[-1];
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 4; y++) {
+        int l_m_tl = left[3 - y] - tl;
+
+        dst[0] = av_clip_pixel(top[0] + l_m_tl);
+        dst[1] = av_clip_pixel(top[1] + l_m_tl);
+        dst[2] = av_clip_pixel(top[2] + l_m_tl);
+        dst[3] = av_clip_pixel(top[3] + l_m_tl);
+        dst += stride;
+    }
+}
+
+static void tm_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                     const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    const pixel *top = (const pixel *) _top;
+    int y, tl = top[-1];
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 8; y++) {
+        int l_m_tl = left[7 - y] - tl;
+
+        dst[0] = av_clip_pixel(top[0] + l_m_tl);
+        dst[1] = av_clip_pixel(top[1] + l_m_tl);
+        dst[2] = av_clip_pixel(top[2] + l_m_tl);
+        dst[3] = av_clip_pixel(top[3] + l_m_tl);
+        dst[4] = av_clip_pixel(top[4] + l_m_tl);
+        dst[5] = av_clip_pixel(top[5] + l_m_tl);
+        dst[6] = av_clip_pixel(top[6] + l_m_tl);
+        dst[7] = av_clip_pixel(top[7] + l_m_tl);
+        dst += stride;
+    }
+}
+
+static void tm_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                       const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    const pixel *top = (const pixel *) _top;
+    int y, tl = top[-1];
+
+    stride /= sizeof(pixel);
+    for (y = 0; y < 16; y++) {
+        int l_m_tl = left[15 - y] - tl;
+
+        dst[ 0] = av_clip_pixel(top[ 0] + l_m_tl);
+        dst[ 1] = av_clip_pixel(top[ 1] + l_m_tl);
+        dst[ 2] = av_clip_pixel(top[ 2] + l_m_tl);
+        dst[ 3] = av_clip_pixel(top[ 3] + l_m_tl);
+        dst[ 4] = av_clip_pixel(top[ 4] + l_m_tl);
+        dst[ 5] = av_clip_pixel(top[ 5] + l_m_tl);
+        dst[ 6] = av_clip_pixel(top[ 6] + l_m_tl);
+        dst[ 7] = av_clip_pixel(top[ 7] + l_m_tl);
+        dst[ 8] = av_clip_pixel(top[ 8] + l_m_tl);
+        dst[ 9] = av_clip_pixel(top[ 9] + l_m_tl);
+        dst[10] = av_clip_pixel(top[10] + l_m_tl);
+        dst[11] = av_clip_pixel(top[11] + l_m_tl);
+        dst[12] = av_clip_pixel(top[12] + l_m_tl);
+        dst[13] = av_clip_pixel(top[13] + l_m_tl);
+        dst[14] = av_clip_pixel(top[14] + l_m_tl);
+        dst[15] = av_clip_pixel(top[15] + l_m_tl);
+        dst += stride;
+    }
+}
+
+static void tm_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                       const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *row = (pixel *) _dst;
+    const pixel *above = (const pixel *) _top;
+    const pixel *side = (const pixel *) _left;
+    int r, corner = above[-1];
+    /* 32x32 TM: pixel (x, r) = av_clip_pixel(top[x] + left[31 - r] - top[-1]) */
+    stride /= sizeof(pixel);
+    for (r = 0; r < 32; r++) {
+        int delta = side[31 - r] - corner;
+
+        row[ 0] = av_clip_pixel(delta + above[ 0]);
+        row[ 1] = av_clip_pixel(delta + above[ 1]);
+        row[ 2] = av_clip_pixel(delta + above[ 2]);
+        row[ 3] = av_clip_pixel(delta + above[ 3]);
+        row[ 4] = av_clip_pixel(delta + above[ 4]);
+        row[ 5] = av_clip_pixel(delta + above[ 5]);
+        row[ 6] = av_clip_pixel(delta + above[ 6]);
+        row[ 7] = av_clip_pixel(delta + above[ 7]);
+        row[ 8] = av_clip_pixel(delta + above[ 8]);
+        row[ 9] = av_clip_pixel(delta + above[ 9]);
+        row[10] = av_clip_pixel(delta + above[10]);
+        row[11] = av_clip_pixel(delta + above[11]);
+        row[12] = av_clip_pixel(delta + above[12]);
+        row[13] = av_clip_pixel(delta + above[13]);
+        row[14] = av_clip_pixel(delta + above[14]);
+        row[15] = av_clip_pixel(delta + above[15]);
+        row[16] = av_clip_pixel(delta + above[16]);
+        row[17] = av_clip_pixel(delta + above[17]);
+        row[18] = av_clip_pixel(delta + above[18]);
+        row[19] = av_clip_pixel(delta + above[19]);
+        row[20] = av_clip_pixel(delta + above[20]);
+        row[21] = av_clip_pixel(delta + above[21]);
+        row[22] = av_clip_pixel(delta + above[22]);
+        row[23] = av_clip_pixel(delta + above[23]);
+        row[24] = av_clip_pixel(delta + above[24]);
+        row[25] = av_clip_pixel(delta + above[25]);
+        row[26] = av_clip_pixel(delta + above[26]);
+        row[27] = av_clip_pixel(delta + above[27]);
+        row[28] = av_clip_pixel(delta + above[28]);
+        row[29] = av_clip_pixel(delta + above[29]);
+        row[30] = av_clip_pixel(delta + above[30]);
+        row[31] = av_clip_pixel(delta + above[31]);
+        row += stride;
+    }
+}
+
+#if BIT_DEPTH != 12
+
+/* DC predictor, 4x4: every pixel is set to the rounded mean of the eight
+ * edge pixels, (left[0] + .. + left[3] + top[0] + .. + top[3] + 4) >> 3.
+ * stride is the row pitch of _dst in bytes. */
+static void dc_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                     const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *out = (pixel *) _dst;
+    const pixel *l = (const pixel *) _left, *t = (const pixel *) _top;
+    int i, sum = 4 + l[0] + l[1] + l[2] + l[3] + t[0] + t[1] + t[2] + t[3];
+    pixel4 fill = PIXEL_SPLAT_X4(sum >> 3);
+
+    stride /= sizeof(pixel);
+    for (i = 0; i < 4; i++)
+        AV_WN4PA(out + stride * i, fill);
+}
+
+/* DC predictor, 8x8: fill with (left[0..7] + top[0..7] summed + 8) >> 4. */
+static void dc_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                     const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *out = (pixel *) _dst;
+    const pixel *l = (const pixel *) _left, *t = (const pixel *) _top;
+    int i, sum = 8; /* rounding term */
+    pixel4 fill;
+
+    for (i = 0; i < 8; i++)
+        sum += l[i] + t[i];
+    fill = PIXEL_SPLAT_X4(sum >> 4);
+    stride /= sizeof(pixel);
+    for (i = 0; i < 8; i++) {
+        AV_WN4PA(out + 0, fill);
+        AV_WN4PA(out + 4, fill);
+        out += stride;
+    }
+}
+
+/* DC predictor, 16x16: every pixel is set to the rounded mean of the 32
+ * edge pixels, (left[0] + .. + left[15] + top[0] + .. + top[15] + 16) >> 5.
+ * stride is the row pitch of _dst in bytes. */
+static void dc_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                       const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *out = (pixel *) _dst;
+    const pixel *l = (const pixel *) _left, *t = (const pixel *) _top;
+    int i, sum = 16; /* rounding term */
+    pixel4 fill;
+
+    for (i = 0; i < 16; i++)
+        sum += l[i] + t[i];
+    fill = PIXEL_SPLAT_X4(sum >> 5);
+    stride /= sizeof(pixel);
+    for (i = 0; i < 16; i++) {
+        AV_WN4PA(out +  0, fill);
+        AV_WN4PA(out +  4, fill);
+        AV_WN4PA(out +  8, fill);
+        AV_WN4PA(out + 12, fill);
+        out += stride;
+    }
+}
+
+/*
+ * DC predictor, 32x32.
+ *
+ * Every pixel of the block is set to the rounded mean of the 64 edge
+ * pixels: (left[0] + .. + left[31] + top[0] + .. + top[31] + 32) >> 6.
+ * stride is the row pitch of _dst in bytes; it is converted to pixels
+ * before the rows are written.
+ */
+static void dc_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                       const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *out = (pixel *) _dst;
+    const pixel *l = (const pixel *) _left, *t = (const pixel *) _top;
+    int i, sum = 32; /* rounding term */
+    pixel4 fill;
+
+    for (i = 0; i < 32; i++)
+        sum += l[i] + t[i];
+    fill = PIXEL_SPLAT_X4(sum >> 6);
+    stride /= sizeof(pixel);
+    for (i = 0; i < 32; i++) {
+        AV_WN4PA(out +  0, fill);
+        AV_WN4PA(out +  4, fill);
+        AV_WN4PA(out +  8, fill);
+        AV_WN4PA(out + 12, fill);
+        AV_WN4PA(out + 16, fill);
+        AV_WN4PA(out + 20, fill);
+        AV_WN4PA(out + 24, fill);
+        AV_WN4PA(out + 28, fill);
+        out += stride;
+    }
+}
+
+/* Left-only DC predictor, 4x4: fill with (left[0..3] summed + 2) >> 2. */
+static void dc_left_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                          const uint8_t *_left, const uint8_t *top)
+{
+    pixel *out = (pixel *) _dst;
+    const pixel *l = (const pixel *) _left;
+    pixel4 fill = PIXEL_SPLAT_X4((2 + l[0] + l[1] + l[2] + l[3]) >> 2);
+    int i;
+
+    stride /= sizeof(pixel);
+    for (i = 0; i < 4; i++)
+        AV_WN4PA(out + stride * i, fill);
+}
+
+/* Left-only DC predictor, 8x8: every pixel is set to
+ * (left[0] + .. + left[7] + 4) >> 3; top is not read. */
+static void dc_left_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                          const uint8_t *_left, const uint8_t *top)
+{
+    pixel *out = (pixel *) _dst;
+    const pixel *l = (const pixel *) _left;
+    int i, sum = l[0] + l[1] + l[2] + l[3] + l[4] + l[5] + l[6] + l[7];
+    pixel4 fill = PIXEL_SPLAT_X4((sum + 4) >> 3);
+
+    stride /= sizeof(pixel);
+    for (i = 0; i < 8; i++) {
+        AV_WN4PA(out + 0, fill);
+        AV_WN4PA(out + 4, fill);
+        out += stride;
+    }
+}
+
+/* Left-only DC predictor, 16x16: fill with (left[0..15] summed + 8) >> 4. */
+static void dc_left_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                            const uint8_t *_left, const uint8_t *top)
+{
+    pixel *out = (pixel *) _dst;
+    const pixel *l = (const pixel *) _left;
+    int i, sum = 8; /* rounding term */
+    pixel4 fill;
+
+    for (i = 0; i < 16; i++)
+        sum += l[i];
+    fill = PIXEL_SPLAT_X4(sum >> 4);
+    stride /= sizeof(pixel);
+    for (i = 0; i < 16; i++, out += stride) {
+        AV_WN4PA(out +  0, fill);
+        AV_WN4PA(out +  4, fill);
+        AV_WN4PA(out +  8, fill);
+        AV_WN4PA(out + 12, fill);
+    }
+}
+
+/* Left-only DC predictor, 32x32: every pixel is set to
+ * (left[0] + .. + left[31] + 16) >> 5; top is not read.
+ * stride is the row pitch of _dst in bytes. */
+static void dc_left_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                            const uint8_t *_left, const uint8_t *top)
+{
+    pixel *out = (pixel *) _dst;
+    const pixel *l = (const pixel *) _left;
+    int i, sum = 16; /* rounding term */
+    pixel4 fill;
+
+    for (i = 0; i < 32; i++)
+        sum += l[i];
+    fill = PIXEL_SPLAT_X4(sum >> 5);
+    stride /= sizeof(pixel);
+    for (i = 0; i < 32; i++) {
+        AV_WN4PA(out +  0, fill);
+        AV_WN4PA(out +  4, fill);
+        AV_WN4PA(out +  8, fill);
+        AV_WN4PA(out + 12, fill);
+        AV_WN4PA(out + 16, fill);
+        AV_WN4PA(out + 20, fill);
+        AV_WN4PA(out + 24, fill);
+        AV_WN4PA(out + 28, fill);
+        out += stride;
+    }
+}
+
+/* Top-only DC predictor, 4x4: fill with (top[0..3] summed + 2) >> 2. */
+static void dc_top_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *_top)
+{
+    pixel *out = (pixel *) _dst;
+    const pixel *t = (const pixel *) _top;
+    pixel4 fill = PIXEL_SPLAT_X4((2 + t[0] + t[1] + t[2] + t[3]) >> 2);
+    int i;
+
+    stride /= sizeof(pixel);
+    for (i = 0; i < 4; i++)
+        AV_WN4PA(out + stride * i, fill);
+}
+
+/* Top-only DC predictor, 8x8: every pixel is set to
+ * (top[0] + .. + top[7] + 4) >> 3; left is not read. */
+static void dc_top_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *_top)
+{
+    pixel *out = (pixel *) _dst;
+    const pixel *t = (const pixel *) _top;
+    int i, sum = t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7];
+    pixel4 fill = PIXEL_SPLAT_X4((sum + 4) >> 3);
+
+    stride /= sizeof(pixel);
+    for (i = 0; i < 8; i++) {
+        AV_WN4PA(out + 0, fill);
+        AV_WN4PA(out + 4, fill);
+        out += stride;
+    }
+}
+
+/* Top-only DC predictor, 16x16: fill with (top[0..15] summed + 8) >> 4. */
+static void dc_top_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *_top)
+{
+    pixel *out = (pixel *) _dst;
+    const pixel *t = (const pixel *) _top;
+    int i, sum = 8; /* rounding term */
+    pixel4 fill;
+
+    for (i = 0; i < 16; i++)
+        sum += t[i];
+    fill = PIXEL_SPLAT_X4(sum >> 4);
+    stride /= sizeof(pixel);
+    for (i = 0; i < 16; i++, out += stride) {
+        AV_WN4PA(out +  0, fill);
+        AV_WN4PA(out +  4, fill);
+        AV_WN4PA(out +  8, fill);
+        AV_WN4PA(out + 12, fill);
+    }
+}
+
+/* Top-only DC predictor, 32x32: every pixel is set to
+ * (top[0] + .. + top[31] + 16) >> 5; left is not read.
+ * stride is the row pitch of _dst in bytes. */
+static void dc_top_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *_top)
+{
+    pixel *out = (pixel *) _dst;
+    const pixel *t = (const pixel *) _top;
+    int i, sum = 16; /* rounding term */
+    pixel4 fill;
+
+    for (i = 0; i < 32; i++)
+        sum += t[i];
+    fill = PIXEL_SPLAT_X4(sum >> 5);
+    stride /= sizeof(pixel);
+    for (i = 0; i < 32; i++) {
+        AV_WN4PA(out +  0, fill);
+        AV_WN4PA(out +  4, fill);
+        AV_WN4PA(out +  8, fill);
+        AV_WN4PA(out + 12, fill);
+        AV_WN4PA(out + 16, fill);
+        AV_WN4PA(out + 20, fill);
+        AV_WN4PA(out + 24, fill);
+        AV_WN4PA(out + 28, fill);
+        out += stride;
+    }
+}
+
+#endif /* BIT_DEPTH != 12 */
+
+/* Constant predictor, 4x4: every pixel is 128 << (BIT_DEPTH - 8). */
+static void dc_128_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    pixel *out = (pixel *) _dst;
+    pixel4 fill = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
+    int i;
+
+    stride /= sizeof(pixel);
+    for (i = 0; i < 4; i++)
+        AV_WN4PA(out + stride * i, fill);
+}
+
+/* Constant predictor, 8x8: every pixel is 128 << (BIT_DEPTH - 8). */
+static void dc_128_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    pixel *out = (pixel *) _dst;
+    pixel4 fill = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
+    int i;
+
+    stride /= sizeof(pixel);
+    for (i = 0; i < 8; i++, out += stride) {
+        AV_WN4PA(out + 0, fill);
+        AV_WN4PA(out + 4, fill);
+    }
+}
+
+/* Constant predictor, 16x16: every pixel is 128 << (BIT_DEPTH - 8). */
+static void dc_128_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    pixel *out = (pixel *) _dst;
+    pixel4 fill = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
+    int i;
+
+    stride /= sizeof(pixel);
+    for (i = 0; i < 16; i++, out += stride) {
+        AV_WN4PA(out +  0, fill);
+        AV_WN4PA(out +  4, fill);
+        AV_WN4PA(out +  8, fill);
+        AV_WN4PA(out + 12, fill);
+    }
+}
+
+/* Constant predictor, 32x32: every pixel is 128 << (BIT_DEPTH - 8). */
+static void dc_128_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    pixel *out = (pixel *) _dst;
+    pixel4 fill = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
+    int i;
+
+    stride /= sizeof(pixel);
+    for (i = 0; i < 32; i++, out += stride) {
+        AV_WN4PA(out +  0, fill);
+        AV_WN4PA(out +  4, fill);
+        AV_WN4PA(out +  8, fill);
+        AV_WN4PA(out + 12, fill);
+        AV_WN4PA(out + 16, fill);
+        AV_WN4PA(out + 20, fill);
+        AV_WN4PA(out + 24, fill);
+        AV_WN4PA(out + 28, fill);
+    }
+}
+
+static void dc_127_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
+    int y;
+    /* Constant predictor, 4x4: every pixel is (128 << (BIT_DEPTH - 8)) - 1. */
+    stride /= sizeof(pixel);
+    for (y = 0; y < 4; y++)
+        AV_WN4PA(dst + stride * y, val);
+}
+
+/* Constant predictor, 8x8: every pixel is (128 << (BIT_DEPTH - 8)) - 1. */
+static void dc_127_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    pixel *out = (pixel *) _dst;
+    pixel4 fill = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
+    int i;
+
+    stride /= sizeof(pixel);
+    for (i = 0; i < 8; i++, out += stride) {
+        AV_WN4PA(out + 0, fill);
+        AV_WN4PA(out + 4, fill);
+    }
+}
+
+/* Constant predictor, 16x16: every pixel is (128 << (BIT_DEPTH - 8)) - 1. */
+static void dc_127_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    pixel *out = (pixel *) _dst;
+    pixel4 fill = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
+    int i;
+
+    stride /= sizeof(pixel);
+    for (i = 0; i < 16; i++, out += stride) {
+        AV_WN4PA(out +  0, fill);
+        AV_WN4PA(out +  4, fill);
+        AV_WN4PA(out +  8, fill);
+        AV_WN4PA(out + 12, fill);
+    }
+}
+
+/* Constant predictor, 32x32: every pixel is (128 << (BIT_DEPTH - 8)) - 1. */
+static void dc_127_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    pixel *out = (pixel *) _dst;
+    pixel4 fill = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
+    int i;
+
+    stride /= sizeof(pixel);
+    for (i = 0; i < 32; i++, out += stride) {
+        AV_WN4PA(out +  0, fill);
+        AV_WN4PA(out +  4, fill);
+        AV_WN4PA(out +  8, fill);
+        AV_WN4PA(out + 12, fill);
+        AV_WN4PA(out + 16, fill);
+        AV_WN4PA(out + 20, fill);
+        AV_WN4PA(out + 24, fill);
+        AV_WN4PA(out + 28, fill);
+    }
+}
+
+/* Constant predictor, 4x4: every pixel is (128 << (BIT_DEPTH - 8)) + 1. */
+static void dc_129_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    pixel *out = (pixel *) _dst;
+    pixel4 fill = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
+    int i;
+
+    stride /= sizeof(pixel);
+    for (i = 0; i < 4; i++)
+        AV_WN4PA(out + stride * i, fill);
+}
+
+/* Constant predictor, 8x8: every pixel is (128 << (BIT_DEPTH - 8)) + 1. */
+static void dc_129_8x8_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    pixel *out = (pixel *) _dst;
+    pixel4 fill = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
+    int i;
+
+    stride /= sizeof(pixel);
+    for (i = 0; i < 8; i++, out += stride) {
+        AV_WN4PA(out + 0, fill);
+        AV_WN4PA(out + 4, fill);
+    }
+}
+
+/* Constant predictor, 16x16: every pixel is (128 << (BIT_DEPTH - 8)) + 1. */
+static void dc_129_16x16_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    pixel *out = (pixel *) _dst;
+    pixel4 fill = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
+    int i;
+
+    stride /= sizeof(pixel);
+    for (i = 0; i < 16; i++, out += stride) {
+        AV_WN4PA(out +  0, fill);
+        AV_WN4PA(out +  4, fill);
+        AV_WN4PA(out +  8, fill);
+        AV_WN4PA(out + 12, fill);
+    }
+}
+
+/* Constant predictor, 32x32: every pixel is (128 << (BIT_DEPTH - 8)) + 1. */
+static void dc_129_32x32_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    pixel *out = (pixel *) _dst;
+    pixel4 fill = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
+    int i;
+
+    stride /= sizeof(pixel);
+    for (i = 0; i < 32; i++, out += stride) {
+        AV_WN4PA(out +  0, fill);
+        AV_WN4PA(out +  4, fill);
+        AV_WN4PA(out +  8, fill);
+        AV_WN4PA(out + 12, fill);
+        AV_WN4PA(out + 16, fill);
+        AV_WN4PA(out + 20, fill);
+        AV_WN4PA(out + 24, fill);
+        AV_WN4PA(out + 28, fill);
+    }
+}
+
+#if BIT_DEPTH != 12
+
+#if BIT_DEPTH == 8
+#define memset_bpc memset
+#else
+/* Store val in dst[0] .. dst[len - 1]; stands in for memset() on uint16_t. */
+static inline void memset_bpc(uint16_t *dst, int val, int len) {
+    int i = 0;
+    while (i < len)
+        dst[i++] = val;
+}
+#endif
+
+#define DST(x, y) dst[(x) + (y) * stride] /* uses the caller's dst and stride */
+
+static void diag_downleft_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                                const uint8_t *left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    int a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
+        a4 = top[4], a5 = top[5], a6 = top[6], a7 = top[7];
+    /* Down-left 4x4: (1,2,1)/4 taps over top[0..7]; equal wherever x + y is. */
+    stride /= sizeof(pixel);
+    DST(0,0) = (a0 + a1 * 2 + a2 + 2) >> 2;
+    DST(1,0) = DST(0,1) = (a1 + a2 * 2 + a3 + 2) >> 2;
+    DST(2,0) = DST(1,1) = DST(0,2) = (a2 + a3 * 2 + a4 + 2) >> 2;
+    DST(3,0) = DST(2,1) = DST(1,2) = DST(0,3) = (a3 + a4 * 2 + a5 + 2) >> 2;
+    DST(3,1) = DST(2,2) = DST(1,3) = (a4 + a5 * 2 + a6 + 2) >> 2;
+    DST(3,2) = DST(2,3) = (a5 + a6 * 2 + a7 + 2) >> 2;
+    DST(3,3) = a7;  // note: this is different from vp8 and such
+}
+
+#define def_diag_downleft(size) \
+static void diag_downleft_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
+                                              const uint8_t *left, const uint8_t *_top) \
+{ \
+    pixel *dst = (pixel *) _dst; \
+    const pixel *top = (const pixel *) _top; \
+    int i, j; \
+    pixel v[size - 1]; \
+    /* v[i]: (1,2,1)/4 taps over top[]; the last tap repeats top[size - 1] */ \
+    stride /= sizeof(pixel); \
+    for (i = 0; i < size - 2; i++) \
+        v[i] = (top[i] + top[i + 1] * 2 + top[i + 2] + 2) >> 2; \
+    v[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2; \
+    /* row j = v[] from entry j on, right-padded with top[size - 1] */ \
+    for (j = 0; j < size; j++) { \
+        memcpy(dst + j*stride, v + j, (size - 1 - j) * sizeof(pixel)); \
+        memset_bpc(dst + j*stride + size - 1 - j, top[size - 1], j + 1); \
+    } \
+}
+
+def_diag_downleft(8)
+def_diag_downleft(16)
+def_diag_downleft(32)
+
+static void diag_downright_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                                 const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    const pixel *left = (const pixel *) _left;
+    int tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
+        l0 = left[3], l1 = left[2], l2 = left[1], l3 = left[0];
+    /* Down-right 4x4: (1,2,1)/4 taps over l3..l0, tl, a0..a3; equal along x - y. */
+    stride /= sizeof(pixel);
+    DST(0,3) = (l1 + l2 * 2 + l3 + 2) >> 2;
+    DST(0,2) = DST(1,3) = (l0 + l1 * 2 + l2 + 2) >> 2;
+    DST(0,1) = DST(1,2) = DST(2,3) = (tl + l0 * 2 + l1 + 2) >> 2;
+    DST(0,0) = DST(1,1) = DST(2,2) = DST(3,3) = (l0 + tl * 2 + a0 + 2) >> 2;
+    DST(1,0) = DST(2,1) = DST(3,2) = (tl + a0 * 2 + a1 + 2) >> 2;
+    DST(2,0) = DST(3,1) = (a0 + a1 * 2 + a2 + 2) >> 2;
+    DST(3,0) = (a1 + a2 * 2 + a3 + 2) >> 2;
+}
+
+#define def_diag_downright(size) \
+static void diag_downright_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
+                                               const uint8_t *_left, const uint8_t *_top) \
+{ \
+    pixel *dst = (pixel *) _dst; \
+    const pixel *top = (const pixel *) _top; \
+    const pixel *left = (const pixel *) _left; \
+    int i, j; \
+    pixel v[size + size - 1]; \
+    /* v[]: (1,2,1)/4-filtered edge in the order left[0..], top[-1], top[0..] */ \
+    stride /= sizeof(pixel); \
+    for (i = 0; i < size - 2; i++) { \
+        v[i           ] = (left[i] + left[i + 1] * 2 + left[i + 2] + 2) >> 2; \
+        v[size + 1 + i] = (top[i]  + top[i + 1]  * 2 + top[i + 2]  + 2) >> 2; \
+    } \
+    v[size - 2] = (left[size - 2] + left[size - 1] * 2 + top[-1] + 2) >> 2; \
+    v[size - 1] = (left[size - 1] + top[-1] * 2 + top[ 0] + 2) >> 2; \
+    v[size    ] = (top[-1] + top[0]  * 2 + top[ 1] + 2) >> 2; \
+    /* row j is the size-pixel window of v[] starting at v[size - 1 - j] */ \
+    for (j = 0; j < size; j++) \
+        memcpy(dst + j*stride, v + size - 1 - j, size * sizeof(pixel)); \
+}
+
+def_diag_downright(8)
+def_diag_downright(16)
+def_diag_downright(32)
+
+static void vert_right_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                             const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    const pixel *left = (const pixel *) _left;
+    int tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
+        l0 = left[3], l1 = left[2], l2 = left[1];
+    /* Vertical-right 4x4: 2-tap and (1,2,1)/4 taps over top[-1..3], left[1..3]. */
+    stride /= sizeof(pixel);
+    DST(0,3) = (l0 + l1 * 2 + l2 + 2) >> 2;
+    DST(0,2) = (tl + l0 * 2 + l1 + 2) >> 2;
+    DST(0,0) = DST(1,2) = (tl + a0 + 1) >> 1;
+    DST(0,1) = DST(1,3) = (l0 + tl * 2 + a0 + 2) >> 2;
+    DST(1,0) = DST(2,2) = (a0 + a1 + 1) >> 1;
+    DST(1,1) = DST(2,3) = (tl + a0 * 2 + a1 + 2) >> 2;
+    DST(2,0) = DST(3,2) = (a1 + a2 + 1) >> 1;
+    DST(2,1) = DST(3,3) = (a0 + a1 * 2 + a2 + 2) >> 2;
+    DST(3,0) = (a2 + a3 + 1) >> 1;
+    DST(3,1) = (a1 + a2 * 2 + a3 + 2) >> 2;
+}
+
+#define def_vert_right(size) \
+static void vert_right_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
+                                           const uint8_t *_left, const uint8_t *_top) \
+{ \
+    pixel *dst = (pixel *) _dst; \
+    const pixel *top = (const pixel *) _top; \
+    const pixel *left = (const pixel *) _left; \
+    int i, j; \
+    pixel ve[size + size/2 - 1], vo[size + size/2 - 1]; \
+    /* ve[] feeds even rows, vo[] odd rows; the first entries use left[] */ \
+    stride /= sizeof(pixel); \
+    for (i = 0; i < size/2 - 2; i++) { \
+        vo[i] = (left[i*2 + 3] + left[i*2 + 2] * 2 + left[i*2 + 1] + 2) >> 2; \
+        ve[i] = (left[i*2 + 4] + left[i*2 + 3] * 2 + left[i*2 + 2] + 2) >> 2; \
+    } \
+    vo[size/2 - 2] = (left[size - 1] + left[size - 2] * 2 + left[size - 3] + 2) >> 2; \
+    ve[size/2 - 2] = (top[-1] + left[size - 1] * 2 + left[size - 2] + 2) >> 2; \
+    /* remaining entries: 2-tap (ve) and 3-tap (vo) around top[-1], along top[] */ \
+    ve[size/2 - 1] = (top[-1] + top[0] + 1) >> 1; \
+    vo[size/2 - 1] = (left[size - 1] + top[-1] * 2 + top[0] + 2) >> 2; \
+    for (i = 0; i < size - 1; i++) { \
+        ve[size/2 + i] = (top[i] + top[i + 1] + 1) >> 1; \
+        vo[size/2 + i] = (top[i - 1] + top[i] * 2 + top[i + 1] + 2) >> 2; \
+    } \
+    /* each row pair (2j, 2j + 1) starts one entry earlier in ve[] / vo[] */ \
+    for (j = 0; j < size / 2; j++) { \
+        memcpy(dst +  j*2     *stride, ve + size/2 - 1 - j, size * sizeof(pixel)); \
+        memcpy(dst + (j*2 + 1)*stride, vo + size/2 - 1 - j, size * sizeof(pixel)); \
+    } \
+}
+
+def_vert_right(8)
+def_vert_right(16)
+def_vert_right(32)
+
+static void hor_down_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                           const uint8_t *_left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    const pixel *left = (const pixel *) _left;
+    int l0 = left[3], l1 = left[2], l2 = left[1], l3 = left[0],
+        tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2];
+    /* Horizontal-down 4x4: 2-tap and (1,2,1)/4 taps over left[0..3], top[-1..2]. */
+    stride /= sizeof(pixel);
+    DST(2,0) = (tl + a0 * 2 + a1 + 2) >> 2;
+    DST(3,0) = (a0 + a1 * 2 + a2 + 2) >> 2;
+    DST(0,0) = DST(2,1) = (tl + l0 + 1) >> 1;
+    DST(1,0) = DST(3,1) = (a0 + tl * 2 + l0 + 2) >> 2;
+    DST(0,1) = DST(2,2) = (l0 + l1 + 1) >> 1;
+    DST(1,1) = DST(3,2) = (tl + l0 * 2 + l1 + 2) >> 2;
+    DST(0,2) = DST(2,3) = (l1 + l2 + 1) >> 1;
+    DST(1,2) = DST(3,3) = (l0 + l1 * 2 + l2 + 2) >> 2;
+    DST(0,3) = (l2 + l3 + 1) >> 1;
+    DST(1,3) = (l1 + l2 * 2 + l3 + 2) >> 2;
+}
+
+#define def_hor_down(size) \
+static void hor_down_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
+                                         const uint8_t *_left, const uint8_t *_top) \
+{ \
+    pixel *dst = (pixel *) _dst; \
+    const pixel *top = (const pixel *) _top; \
+    const pixel *left = (const pixel *) _left; \
+    int i, j; \
+    pixel v[size * 3 - 2]; \
+    /* v[0 .. size*2 - 1]: alternating 2-tap / 3-tap values; above: top[] taps */ \
+    stride /= sizeof(pixel); \
+    for (i = 0; i < size - 2; i++) { \
+        v[i*2       ] = (left[i + 1] + left[i + 0] + 1) >> 1; \
+        v[i*2    + 1] = (left[i + 2] + left[i + 1] * 2 + left[i + 0] + 2) >> 2; \
+        v[size*2 + i] = (top[i - 1] + top[i] * 2 + top[i + 1] + 2) >> 2; \
+    } \
+    v[size*2 - 2] = (top[-1] + left[size - 1] + 1) >> 1; \
+    v[size*2 - 4] = (left[size - 1] + left[size - 2] + 1) >> 1; \
+    v[size*2 - 1] = (top[0]  + top[-1] * 2 + left[size - 1] + 2) >> 2; \
+    v[size*2 - 3] = (top[-1] + left[size - 1] * 2 + left[size - 2] + 2) >> 2; \
+    /* row j copies size pixels starting at v[size*2 - 2 - j*2] */ \
+    for (j = 0; j < size; j++) \
+        memcpy(dst + j*stride, v + size*2 - 2 - j*2, size * sizeof(pixel)); \
+}
+
+def_hor_down(8)
+def_hor_down(16)
+def_hor_down(32)
+
+static void vert_left_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                            const uint8_t *left, const uint8_t *_top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *top = (const pixel *) _top;
+    int a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
+        a4 = top[4], a5 = top[5], a6 = top[6];
+    /* Vertical-left 4x4 from top[0..6]: 2-tap on even rows, (1,2,1)/4 on odd. */
+    stride /= sizeof(pixel);
+    DST(0,0) = (a0 + a1 + 1) >> 1;
+    DST(0,1) = (a0 + a1 * 2 + a2 + 2) >> 2;
+    DST(1,0) = DST(0,2) = (a1 + a2 + 1) >> 1;
+    DST(1,1) = DST(0,3) = (a1 + a2 * 2 + a3 + 2) >> 2;
+    DST(2,0) = DST(1,2) = (a2 + a3 + 1) >> 1;
+    DST(2,1) = DST(1,3) = (a2 + a3 * 2 + a4 + 2) >> 2;
+    DST(3,0) = DST(2,2) = (a3 + a4 + 1) >> 1;
+    DST(3,1) = DST(2,3) = (a3 + a4 * 2 + a5 + 2) >> 2;
+    DST(3,2) = (a4 + a5 + 1) >> 1;
+    DST(3,3) = (a4 + a5 * 2 + a6 + 2) >> 2;
+}
+
+#define def_vert_left(size) \
+static void vert_left_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
+                                          const uint8_t *left, const uint8_t *_top) \
+{ \
+    pixel *dst = (pixel *) _dst; \
+    const pixel *top = (const pixel *) _top; \
+    int i, j; \
+    pixel ve[size - 1], vo[size - 1]; \
+    /* ve[]: 2-tap averages, vo[]: (1,2,1)/4 taps over top[]; top[size - 1] repeats */ \
+    stride /= sizeof(pixel); \
+    for (i = 0; i < size - 2; i++) { \
+        ve[i] = (top[i] + top[i + 1] + 1) >> 1; \
+        vo[i] = (top[i] + top[i + 1] * 2 + top[i + 2] + 2) >> 2; \
+    } \
+    ve[size - 2] = (top[size - 2] + top[size - 1] + 1) >> 1; \
+    vo[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2; \
+    /* rows 2j (ve) and 2j + 1 (vo) start at entry j; tail is top[size - 1] */ \
+    for (j = 0; j < size / 2; j++) { \
+        memcpy(dst +  j*2      * stride, ve + j, (size - j - 1) * sizeof(pixel)); \
+        memset_bpc(dst +  j*2      * stride + size - j - 1, top[size - 1], j + 1); \
+        memcpy(dst + (j*2 + 1) * stride, vo + j, (size - j - 1) * sizeof(pixel)); \
+        memset_bpc(dst + (j*2 + 1) * stride + size - j - 1, top[size - 1], j + 1); \
+    } \
+}
+
+def_vert_left(8)
+def_vert_left(16)
+def_vert_left(32)
+
+static void hor_up_4x4_c(uint8_t *_dst, ptrdiff_t stride,
+                         const uint8_t *_left, const uint8_t *top)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *left = (const pixel *) _left;
+    int l0 = left[0], l1 = left[1], l2 = left[2], l3 = left[3];
+    /* Horizontal-up 4x4: 2-tap and 3-tap filters of left[0..3]; the rest is l3. */
+    stride /= sizeof(pixel);
+    DST(0,0) = (l0 + l1 + 1) >> 1;
+    DST(1,0) = (l0 + l1 * 2 + l2 + 2) >> 2;
+    DST(0,1) = DST(2,0) = (l1 + l2 + 1) >> 1;
+    DST(1,1) = DST(3,0) = (l1 + l2 * 2 + l3 + 2) >> 2;
+    DST(0,2) = DST(2,1) = (l2 + l3 + 1) >> 1;
+    DST(1,2) = DST(3,1) = (l2 + l3 * 3 + 2) >> 2;
+    DST(0,3) = DST(1,3) = DST(2,2) = DST(2,3) = DST(3,2) = DST(3,3) = l3;
+}
+
+#define def_hor_up(size) \
+static void hor_up_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
+                                       const uint8_t *_left, const uint8_t *top) \
+{ \
+    pixel *dst = (pixel *) _dst; \
+    const pixel *left = (const pixel *) _left; \
+    int i, j; \
+    pixel v[size*2 - 2]; \
+    /* v[]: alternating 2-tap / 3-tap values along left[]; left[size - 1] repeats */ \
+    stride /= sizeof(pixel); \
+    for (i = 0; i < size - 2; i++) { \
+        v[i*2    ] = (left[i] + left[i + 1] + 1) >> 1; \
+        v[i*2 + 1] = (left[i] + left[i + 1] * 2 + left[i + 2] + 2) >> 2; \
+    } \
+    v[size*2 - 4] = (left[size - 2] + left[size - 1] + 1) >> 1; \
+    v[size*2 - 3] = (left[size - 2] + left[size - 1] * 3 + 2) >> 2; \
+    /* row j starts at v[j*2]; lower-half rows are padded with left[size - 1] */ \
+    for (j = 0; j < size / 2; j++) \
+        memcpy(dst + j*stride, v + j*2, size * sizeof(pixel)); \
+    for (j = size / 2; j < size; j++) { \
+        memcpy(dst + j*stride, v + j*2, (size*2 - 2 - j*2) * sizeof(pixel)); \
+        memset_bpc(dst + j*stride + size*2 - 2 - j*2, left[size - 1], \
+                   2 + j*2 - size); \
+    } \
+}
+
+def_hor_up(8)
+def_hor_up(16)
+def_hor_up(32)
+
+#undef DST
+
+#endif /* BIT_DEPTH != 12 */
+
+#if BIT_DEPTH != 8
+void ff_vp9dsp_intrapred_init_10(VP9DSPContext *dsp);
+#endif
+#if BIT_DEPTH != 10
+static /* file-local unless BIT_DEPTH == 10 */
+#endif
+av_cold void FUNC(ff_vp9dsp_intrapred_init)(VP9DSPContext *dsp)
+{
+#define init_intra_pred_bd_aware(tx, sz) \
+    dsp->intra_pred[tx][TM_VP8_PRED]          = tm_##sz##_c; \
+    dsp->intra_pred[tx][DC_128_PRED]          = dc_128_##sz##_c; \
+    dsp->intra_pred[tx][DC_127_PRED]          = dc_127_##sz##_c; \
+    dsp->intra_pred[tx][DC_129_PRED]          = dc_129_##sz##_c
+    /* 12-bit: run the _10 init first, then set only the four entries above. */
+#if BIT_DEPTH == 12
+    ff_vp9dsp_intrapred_init_10(dsp);
+#define init_intra_pred(tx, sz) \
+    init_intra_pred_bd_aware(tx, sz)
+#else
+    #define init_intra_pred(tx, sz) \
+    dsp->intra_pred[tx][VERT_PRED]            = vert_##sz##_c; \
+    dsp->intra_pred[tx][HOR_PRED]             = hor_##sz##_c; \
+    dsp->intra_pred[tx][DC_PRED]              = dc_##sz##_c; \
+    dsp->intra_pred[tx][DIAG_DOWN_LEFT_PRED]  = diag_downleft_##sz##_c; \
+    dsp->intra_pred[tx][DIAG_DOWN_RIGHT_PRED] = diag_downright_##sz##_c; \
+    dsp->intra_pred[tx][VERT_RIGHT_PRED]      = vert_right_##sz##_c; \
+    dsp->intra_pred[tx][HOR_DOWN_PRED]        = hor_down_##sz##_c; \
+    dsp->intra_pred[tx][VERT_LEFT_PRED]       = vert_left_##sz##_c; \
+    dsp->intra_pred[tx][HOR_UP_PRED]          = hor_up_##sz##_c; \
+    dsp->intra_pred[tx][LEFT_DC_PRED]         = dc_left_##sz##_c; \
+    dsp->intra_pred[tx][TOP_DC_PRED]          = dc_top_##sz##_c; \
+    init_intra_pred_bd_aware(tx, sz)
+#endif
+    /* Install the predictors for each of the four block sizes. */
+    init_intra_pred(TX_4X4,   4x4);
+    init_intra_pred(TX_8X8,   8x8);
+    init_intra_pred(TX_16X16, 16x16);
+    init_intra_pred(TX_32X32, 32x32);
+
+#undef init_intra_pred
+#undef init_intra_pred_bd_aware
+}
+
+#define itxfm_wrapper(type_a, type_b, sz, bits, has_dconly) \
+static void type_a##_##type_b##_##sz##x##sz##_add_c(uint8_t *_dst, \
+                                                    ptrdiff_t stride, \
+                                                    int16_t *_block, int eob) \
+{ \
+    int i, j; \
+    pixel *dst = (pixel *) _dst; \
+    dctcoef *block = (dctcoef *) _block, tmp[sz * sz], out[sz]; \
+    /* two 1-D passes (type_a, then type_b); the rounded result is added to dst */ \
+    stride /= sizeof(pixel); \
+    if (has_dconly && eob == 1) { /* only block[0] is used on this path */ \
+        const int t  = ((((dctint) block[0] * 11585 + (1 << 13)) >> 14) \
+                                            * 11585 + (1 << 13)) >> 14; \
+        block[0] = 0; \
+        for (i = 0; i < sz; i++) { \
+            for (j = 0; j < sz; j++) \
+                dst[j * stride] = av_clip_pixel(dst[j * stride] + \
+                                                (bits ? \
+                                                 (t + (1 << (bits - 1))) >> bits : \
+                                                 t)); \
+            dst++; \
+        } \
+        return; \
+    } \
+    /* general path: pass 0 fills tmp and block is cleared, pass 1 adds into dst */ \
+    for (i = 0; i < sz; i++) \
+        type_a##sz##_1d(block + i, sz, tmp + i * sz, 0); \
+    memset(block, 0, sz * sz * sizeof(*block)); \
+    for (i = 0; i < sz; i++) { \
+        type_b##sz##_1d(tmp + i, sz, out, 1); \
+        for (j = 0; j < sz; j++) \
+            dst[j * stride] = av_clip_pixel(dst[j * stride] + \
+                                            (bits ? \
+                                             (out[j] + (1 << (bits - 1))) >> bits : \
+                                             out[j])); \
+        dst++; \
+    } \
+}
+
+#define itxfm_wrap(sz, bits) /* only idct+idct gets has_dconly = 1 */ \
+itxfm_wrapper(idct,  idct,  sz, bits, 1) \
+itxfm_wrapper(iadst, idct,  sz, bits, 0) \
+itxfm_wrapper(idct,  iadst, sz, bits, 0) \
+itxfm_wrapper(iadst, iadst, sz, bits, 0)
+
+#define IN(x) ((dctint) in[(x) * stride]) /* strided read, cast to dctint */
+
+static av_always_inline void idct4_1d(const dctcoef *in, ptrdiff_t stride,
+                                      dctcoef *out, int pass)
+{
+    dctint t0, t1, t2, t3;
+    /* One 1-D pass over IN(0)..IN(3); the pass argument is not used here. */
+    t0 = ((IN(0) + IN(2)) * 11585 + (1 << 13)) >> 14; /* x * c / 2^14, rounded */
+    t1 = ((IN(0) - IN(2)) * 11585 + (1 << 13)) >> 14;
+    t2 = (IN(1) *  6270 - IN(3) * 15137 + (1 << 13)) >> 14;
+    t3 = (IN(1) * 15137 + IN(3) *  6270 + (1 << 13)) >> 14;
+
+    out[0] = t0 + t3;
+    out[1] = t1 + t2;
+    out[2] = t1 - t2;
+    out[3] = t0 - t3;
+}
+
+static av_always_inline void iadst4_1d(const dctcoef *in, ptrdiff_t stride,
+                                       dctcoef *out, int pass)
+{
+    dctint t0, t1, t2, t3;
+    /* One 1-D pass over IN(0)..IN(3); t0..t3 hold the unshifted products. */
+    t0 =  5283 * IN(0) + 15212 * IN(2) +  9929 * IN(3);
+    t1 =  9929 * IN(0) -  5283 * IN(2) - 15212 * IN(3);
+    t2 = 13377 * (IN(0) - IN(2) + IN(3));
+    t3 = 13377 * IN(1);
+    /* rounding and the >> 14 are applied once, when the outputs are formed */
+    out[0] = (t0 + t3      + (1 << 13)) >> 14;
+    out[1] = (t1 + t3      + (1 << 13)) >> 14;
+    out[2] = (t2           + (1 << 13)) >> 14;
+    out[3] = (t0 + t1 - t3 + (1 << 13)) >> 14;
+}
+
+itxfm_wrap(4, 4)
+
+static av_always_inline void idct8_1d(const dctcoef *in, ptrdiff_t stride,
+                                      dctcoef *out, int pass)
+{
+    dctint t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a;
+    /* One 1-D pass over IN(0)..IN(7); the pass argument is not used here. */
+    t0a = ((IN(0) + IN(4)) * 11585 + (1 << 13)) >> 14;
+    t1a = ((IN(0) - IN(4)) * 11585 + (1 << 13)) >> 14;
+    t2a = (IN(2) *  6270 - IN(6) * 15137 + (1 << 13)) >> 14;
+    t3a = (IN(2) * 15137 + IN(6) *  6270 + (1 << 13)) >> 14;
+    t4a = (IN(1) *  3196 - IN(7) * 16069 + (1 << 13)) >> 14;
+    t5a = (IN(5) * 13623 - IN(3) *  9102 + (1 << 13)) >> 14;
+    t6a = (IN(5) *  9102 + IN(3) * 13623 + (1 << 13)) >> 14;
+    t7a = (IN(1) * 16069 + IN(7) *  3196 + (1 << 13)) >> 14;
+    /* sums and differences of the rotated pairs; t5a and t6a are reused */
+    t0  = t0a + t3a;
+    t1  = t1a + t2a;
+    t2  = t1a - t2a;
+    t3  = t0a - t3a;
+    t4  = t4a + t5a;
+    t5a = t4a - t5a;
+    t7  = t7a + t6a;
+    t6a = t7a - t6a;
+
+    t5  = ((t6a - t5a) * 11585 + (1 << 13)) >> 14;
+    t6  = ((t6a + t5a) * 11585 + (1 << 13)) >> 14;
+    /* outputs mirror: out[k] and out[7 - k] share terms, differing in sign */
+    out[0] = t0 + t7;
+    out[1] = t1 + t6;
+    out[2] = t2 + t5;
+    out[3] = t3 + t4;
+    out[4] = t3 - t4;
+    out[5] = t2 - t5;
+    out[6] = t1 - t6;
+    out[7] = t0 - t7;
+}
+
+static av_always_inline void iadst8_1d(const dctcoef *in, ptrdiff_t stride,
+                                       dctcoef *out, int pass)
+{
+    dctint t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a;
+    /* 1-D 8-point inverse ADST: reads in[k * stride] through IN(), writes out[0..7]; pass is unused. */
+    t0a = 16305 * IN(7) +  1606 * IN(0); /* first rotations are kept as unshifted Q14 products */
+    t1a =  1606 * IN(7) - 16305 * IN(0);
+    t2a = 14449 * IN(5) +  7723 * IN(2);
+    t3a =  7723 * IN(5) - 14449 * IN(2);
+    t4a = 10394 * IN(3) + 12665 * IN(4);
+    t5a = 12665 * IN(3) - 10394 * IN(4);
+    t6a =  4756 * IN(1) + 15679 * IN(6);
+    t7a = 15679 * IN(1) -  4756 * IN(6);
+    /* Combine pairs of products, then apply one rounding shift by 14 per term. */
+    t0 = (t0a + t4a + (1 << 13)) >> 14;
+    t1 = (t1a + t5a + (1 << 13)) >> 14;
+    t2 = (t2a + t6a + (1 << 13)) >> 14;
+    t3 = (t3a + t7a + (1 << 13)) >> 14;
+    t4 = (t0a - t4a + (1 << 13)) >> 14;
+    t5 = (t1a - t5a + (1 << 13)) >> 14;
+    t6 = (t2a - t6a + (1 << 13)) >> 14;
+    t7 = (t3a - t7a + (1 << 13)) >> 14;
+    /* Second rotations on t4..t7 (6270/15137 ~= sin/cos(pi/8) in Q14), again unshifted. */
+    t4a = 15137 * t4 +  6270 * t5;
+    t5a =  6270 * t4 - 15137 * t5;
+    t6a = 15137 * t7 -  6270 * t6;
+    t7a =  6270 * t7 + 15137 * t6;
+    /* out[0]/out[7] are final here; t2 and t3 are then reused for the differences. */
+    out[0] =   t0 + t2;
+    out[7] = -(t1 + t3);
+    t2     =   t0 - t2;
+    t3     =   t1 - t3;
+    /* Negated outputs are negated after the rounding shift, not before it. */
+    out[1] = -((t4a + t6a + (1 << 13)) >> 14);
+    out[6] =   (t5a + t7a + (1 << 13)) >> 14;
+    t6     =   (t4a - t6a + (1 << 13)) >> 14;
+    t7     =   (t5a - t7a + (1 << 13)) >> 14;
+    /* Last rotations by 11585 (~= cos(pi/4) in Q14). */
+    out[3] = -(((t2 + t3) * 11585 + (1 << 13)) >> 14);
+    out[4] =   ((t2 - t3) * 11585 + (1 << 13)) >> 14;
+    out[2] =   ((t6 + t7) * 11585 + (1 << 13)) >> 14;
+    out[5] = -(((t6 - t7) * 11585 + (1 << 13)) >> 14);
+}
+
+itxfm_wrap(8, 5) /* sz = 8, bits = 5; presumably yields the *_8x8_add_c functions used in vp9dsp_itxfm_init - wrapper head not visible here */
+
+static av_always_inline void idct16_1d(const dctcoef *in, ptrdiff_t stride,
+                                       dctcoef *out, int pass)
+{
+    dctint t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
+    dctint t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
+    dctint t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
+    /* 1-D 16-point inverse DCT: reads in[k * stride] through IN(), writes out[0..15]; pass is unused. */
+    t0a  = ((IN(0) + IN(8)) * 11585 + (1 << 13)) >> 14; /* every product is Q14, rounded by "+ (1 << 13)) >> 14" */
+    t1a  = ((IN(0) - IN(8)) * 11585 + (1 << 13)) >> 14;
+    t2a  = (IN(4)  *  6270 - IN(12) * 15137 + (1 << 13)) >> 14;
+    t3a  = (IN(4)  * 15137 + IN(12) *  6270 + (1 << 13)) >> 14;
+    t4a  = (IN(2)  *  3196 - IN(14) * 16069 + (1 << 13)) >> 14;
+    t7a  = (IN(2)  * 16069 + IN(14) *  3196 + (1 << 13)) >> 14;
+    t5a  = (IN(10) * 13623 - IN(6)  *  9102 + (1 << 13)) >> 14;
+    t6a  = (IN(10) *  9102 + IN(6)  * 13623 + (1 << 13)) >> 14;
+    t8a  = (IN(1)  *  1606 - IN(15) * 16305 + (1 << 13)) >> 14; /* odd inputs feed t8a..t15a */
+    t15a = (IN(1)  * 16305 + IN(15) *  1606 + (1 << 13)) >> 14;
+    t9a  = (IN(9)  * 12665 - IN(7)  * 10394 + (1 << 13)) >> 14;
+    t14a = (IN(9)  * 10394 + IN(7)  * 12665 + (1 << 13)) >> 14;
+    t10a = (IN(5)  *  7723 - IN(11) * 14449 + (1 << 13)) >> 14;
+    t13a = (IN(5)  * 14449 + IN(11) *  7723 + (1 << 13)) >> 14;
+    t11a = (IN(13) * 15679 - IN(3)  *  4756 + (1 << 13)) >> 14;
+    t12a = (IN(13) *  4756 + IN(3)  * 15679 + (1 << 13)) >> 14;
+    /* Butterflies on the rotated terms. */
+    t0  = t0a  + t3a;
+    t1  = t1a  + t2a;
+    t2  = t1a  - t2a;
+    t3  = t0a  - t3a;
+    t4  = t4a  + t5a;
+    t5  = t4a  - t5a;
+    t6  = t7a  - t6a;
+    t7  = t7a  + t6a;
+    t8  = t8a  + t9a;
+    t9  = t8a  - t9a;
+    t10 = t11a - t10a;
+    t11 = t11a + t10a;
+    t12 = t12a + t13a;
+    t13 = t12a - t13a;
+    t14 = t15a - t14a;
+    t15 = t15a + t14a;
+    /* Second set of rotations; the "a" temporaries from the first stage are reused. */
+    t5a  = ((t6 - t5) * 11585 + (1 << 13)) >> 14;
+    t6a  = ((t6 + t5) * 11585 + (1 << 13)) >> 14;
+    t9a  = (  t14 *  6270 - t9  * 15137  + (1 << 13)) >> 14;
+    t14a = (  t14 * 15137 + t9  *  6270  + (1 << 13)) >> 14;
+    t10a = (-(t13 * 15137 + t10 *  6270) + (1 << 13)) >> 14;
+    t13a = (  t13 *  6270 - t10 * 15137  + (1 << 13)) >> 14;
+    /* Butterflies; several plain names (t4..t7, t9, t10, t13, t14) are overwritten here. */
+    t0a  = t0   + t7;
+    t1a  = t1   + t6a;
+    t2a  = t2   + t5a;
+    t3a  = t3   + t4;
+    t4   = t3   - t4;
+    t5   = t2   - t5a;
+    t6   = t1   - t6a;
+    t7   = t0   - t7;
+    t8a  = t8   + t11;
+    t9   = t9a  + t10a;
+    t10  = t9a  - t10a;
+    t11a = t8   - t11;
+    t12a = t15  - t12;
+    t13  = t14a - t13a;
+    t14  = t14a + t13a;
+    t15a = t15  + t12;
+    /* Final rotations by 11585 (~= cos(pi/4) in Q14). */
+    t10a = ((t13  - t10)  * 11585 + (1 << 13)) >> 14;
+    t13a = ((t13  + t10)  * 11585 + (1 << 13)) >> 14;
+    t11  = ((t12a - t11a) * 11585 + (1 << 13)) >> 14;
+    t12  = ((t12a + t11a) * 11585 + (1 << 13)) >> 14;
+    /* Output butterfly: out[k] and out[15 - k] are the sum and difference of one pair. */
+    out[ 0] = t0a + t15a;
+    out[ 1] = t1a + t14;
+    out[ 2] = t2a + t13a;
+    out[ 3] = t3a + t12;
+    out[ 4] = t4  + t11;
+    out[ 5] = t5  + t10a;
+    out[ 6] = t6  + t9;
+    out[ 7] = t7  + t8a;
+    out[ 8] = t7  - t8a;
+    out[ 9] = t6  - t9;
+    out[10] = t5  - t10a;
+    out[11] = t4  - t11;
+    out[12] = t3a - t12;
+    out[13] = t2a - t13a;
+    out[14] = t1a - t14;
+    out[15] = t0a - t15a;
+}
+
+static av_always_inline void iadst16_1d(const dctcoef *in, ptrdiff_t stride,
+                                        dctcoef *out, int pass)
+{
+    dctint t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
+    dctint t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
+    dctint t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
+    /* 1-D 16-point inverse ADST: reads in[k * stride] through IN(), writes out[0..15]; pass is unused. */
+    t0  = IN(15) * 16364 + IN(0)  *   804; /* first rotations are kept as unshifted Q14 products */
+    t1  = IN(15) *   804 - IN(0)  * 16364;
+    t2  = IN(13) * 15893 + IN(2)  *  3981;
+    t3  = IN(13) *  3981 - IN(2)  * 15893;
+    t4  = IN(11) * 14811 + IN(4)  *  7005;
+    t5  = IN(11) *  7005 - IN(4)  * 14811;
+    t6  = IN(9)  * 13160 + IN(6)  *  9760;
+    t7  = IN(9)  *  9760 - IN(6)  * 13160;
+    t8  = IN(7)  * 11003 + IN(8)  * 12140;
+    t9  = IN(7)  * 12140 - IN(8)  * 11003;
+    t10 = IN(5)  *  8423 + IN(10) * 14053;
+    t11 = IN(5)  * 14053 - IN(10) *  8423;
+    t12 = IN(3)  *  5520 + IN(12) * 15426;
+    t13 = IN(3)  * 15426 - IN(12) *  5520;
+    t14 = IN(1)  *  2404 + IN(14) * 16207;
+    t15 = IN(1)  * 16207 - IN(14) *  2404;
+    /* Combine products k and k + 8, then apply one rounding shift by 14 per term. */
+    t0a  = (t0 + t8  + (1 << 13)) >> 14;
+    t1a  = (t1 + t9  + (1 << 13)) >> 14;
+    t2a  = (t2 + t10 + (1 << 13)) >> 14;
+    t3a  = (t3 + t11 + (1 << 13)) >> 14;
+    t4a  = (t4 + t12 + (1 << 13)) >> 14;
+    t5a  = (t5 + t13 + (1 << 13)) >> 14;
+    t6a  = (t6 + t14 + (1 << 13)) >> 14;
+    t7a  = (t7 + t15 + (1 << 13)) >> 14;
+    t8a  = (t0 - t8  + (1 << 13)) >> 14;
+    t9a  = (t1 - t9  + (1 << 13)) >> 14;
+    t10a = (t2 - t10 + (1 << 13)) >> 14;
+    t11a = (t3 - t11 + (1 << 13)) >> 14;
+    t12a = (t4 - t12 + (1 << 13)) >> 14;
+    t13a = (t5 - t13 + (1 << 13)) >> 14;
+    t14a = (t6 - t14 + (1 << 13)) >> 14;
+    t15a = (t7 - t15 + (1 << 13)) >> 14;
+    /* Second rotations on the difference half (t8a..t15a); products stay unshifted. */
+    t8   = t8a  * 16069 + t9a  *  3196;
+    t9   = t8a  *  3196 - t9a  * 16069;
+    t10  = t10a *  9102 + t11a * 13623;
+    t11  = t10a * 13623 - t11a *  9102;
+    t12  = t13a * 16069 - t12a *  3196;
+    t13  = t13a *  3196 + t12a * 16069;
+    t14  = t15a *  9102 - t14a * 13623;
+    t15  = t15a * 13623 + t14a *  9102;
+    /* The sum half uses plain butterflies; the rotated half gets its rounding shift here. */
+    t0   = t0a + t4a;
+    t1   = t1a + t5a;
+    t2   = t2a + t6a;
+    t3   = t3a + t7a;
+    t4   = t0a - t4a;
+    t5   = t1a - t5a;
+    t6   = t2a - t6a;
+    t7   = t3a - t7a;
+    t8a  = (t8  + t12 + (1 << 13)) >> 14;
+    t9a  = (t9  + t13 + (1 << 13)) >> 14;
+    t10a = (t10 + t14 + (1 << 13)) >> 14;
+    t11a = (t11 + t15 + (1 << 13)) >> 14;
+    t12a = (t8  - t12 + (1 << 13)) >> 14;
+    t13a = (t9  - t13 + (1 << 13)) >> 14;
+    t14a = (t10 - t14 + (1 << 13)) >> 14;
+    t15a = (t11 - t15 + (1 << 13)) >> 14;
+    /* Third rotations (6270/15137 ~= sin/cos(pi/8) in Q14), unshifted. */
+    t4a  = t4 * 15137 + t5 *  6270;
+    t5a  = t4 *  6270 - t5 * 15137;
+    t6a  = t7 * 15137 - t6 *  6270;
+    t7a  = t7 *  6270 + t6 * 15137;
+    t12  = t12a * 15137 + t13a *  6270;
+    t13  = t12a *  6270 - t13a * 15137;
+    t14  = t15a * 15137 - t14a *  6270;
+    t15  = t15a *  6270 + t14a * 15137;
+    /* Eight outputs are final here; the eight differences go through one more rotation below. */
+    out[ 0] =   t0 + t2;
+    out[15] = -(t1 + t3);
+    t2a     =   t0 - t2;
+    t3a     =   t1 - t3;
+    out[ 3] = -((t4a + t6a + (1 << 13)) >> 14);
+    out[12] =   (t5a + t7a + (1 << 13)) >> 14;
+    t6      =   (t4a - t6a + (1 << 13)) >> 14;
+    t7      =   (t5a - t7a + (1 << 13)) >> 14;
+    out[ 1] = -(t8a + t10a);
+    out[14] =   t9a + t11a;
+    t10     =   t8a - t10a;
+    t11     =   t9a - t11a;
+    out[ 2] =   (t12 + t14 + (1 << 13)) >> 14;
+    out[13] = -((t13 + t15 + (1 << 13)) >> 14);
+    t14a    =   (t12 - t14 + (1 << 13)) >> 14;
+    t15a    =   (t13 - t15 + (1 << 13)) >> 14;
+    /* Last rotations; out[7] and out[5] multiply by -11585, so the sign is applied before rounding. */
+    out[ 7] = ((t2a  + t3a)  * -11585 + (1 << 13)) >> 14;
+    out[ 8] = ((t2a  - t3a)  *  11585 + (1 << 13)) >> 14;
+    out[ 4] = ((t7   + t6)   *  11585 + (1 << 13)) >> 14;
+    out[11] = ((t7   - t6)   *  11585 + (1 << 13)) >> 14;
+    out[ 6] = ((t11  + t10)  *  11585 + (1 << 13)) >> 14;
+    out[ 9] = ((t11  - t10)  *  11585 + (1 << 13)) >> 14;
+    out[ 5] = ((t14a + t15a) * -11585 + (1 << 13)) >> 14;
+    out[10] = ((t14a - t15a) *  11585 + (1 << 13)) >> 14;
+}
+
+itxfm_wrap(16, 6) /* sz = 16, bits = 6; presumably yields the *_16x16_add_c functions used in vp9dsp_itxfm_init - wrapper head not visible here */
+
+static av_always_inline void idct32_1d(const dctcoef *in, ptrdiff_t stride,
+                                       dctcoef *out, int pass)
+{ /* 1-D 32-point inverse DCT: reads in[k * stride] through IN(), writes out[0..31]; pass is unused. */
+    dctint t0a  = ((IN(0) + IN(16)) * 11585 + (1 << 13)) >> 14; /* every product is Q14, rounded by "+ (1 << 13)) >> 14" */
+    dctint t1a  = ((IN(0) - IN(16)) * 11585 + (1 << 13)) >> 14;
+    dctint t2a  = (IN( 8) *  6270 - IN(24) * 15137 + (1 << 13)) >> 14;
+    dctint t3a  = (IN( 8) * 15137 + IN(24) *  6270 + (1 << 13)) >> 14;
+    dctint t4a  = (IN( 4) *  3196 - IN(28) * 16069 + (1 << 13)) >> 14;
+    dctint t7a  = (IN( 4) * 16069 + IN(28) *  3196 + (1 << 13)) >> 14;
+    dctint t5a  = (IN(20) * 13623 - IN(12) *  9102 + (1 << 13)) >> 14;
+    dctint t6a  = (IN(20) *  9102 + IN(12) * 13623 + (1 << 13)) >> 14;
+    dctint t8a  = (IN( 2) *  1606 - IN(30) * 16305 + (1 << 13)) >> 14;
+    dctint t15a = (IN( 2) * 16305 + IN(30) *  1606 + (1 << 13)) >> 14;
+    dctint t9a  = (IN(18) * 12665 - IN(14) * 10394 + (1 << 13)) >> 14;
+    dctint t14a = (IN(18) * 10394 + IN(14) * 12665 + (1 << 13)) >> 14;
+    dctint t10a = (IN(10) *  7723 - IN(22) * 14449 + (1 << 13)) >> 14;
+    dctint t13a = (IN(10) * 14449 + IN(22) *  7723 + (1 << 13)) >> 14;
+    dctint t11a = (IN(26) * 15679 - IN( 6) *  4756 + (1 << 13)) >> 14;
+    dctint t12a = (IN(26) *  4756 + IN( 6) * 15679 + (1 << 13)) >> 14;
+    dctint t16a = (IN( 1) *   804 - IN(31) * 16364 + (1 << 13)) >> 14; /* odd inputs feed t16a..t31a */
+    dctint t31a = (IN( 1) * 16364 + IN(31) *   804 + (1 << 13)) >> 14;
+    dctint t17a = (IN(17) * 12140 - IN(15) * 11003 + (1 << 13)) >> 14;
+    dctint t30a = (IN(17) * 11003 + IN(15) * 12140 + (1 << 13)) >> 14;
+    dctint t18a = (IN( 9) *  7005 - IN(23) * 14811 + (1 << 13)) >> 14;
+    dctint t29a = (IN( 9) * 14811 + IN(23) *  7005 + (1 << 13)) >> 14;
+    dctint t19a = (IN(25) * 15426 - IN( 7) *  5520 + (1 << 13)) >> 14;
+    dctint t28a = (IN(25) *  5520 + IN( 7) * 15426 + (1 << 13)) >> 14;
+    dctint t20a = (IN( 5) *  3981 - IN(27) * 15893 + (1 << 13)) >> 14;
+    dctint t27a = (IN( 5) * 15893 + IN(27) *  3981 + (1 << 13)) >> 14;
+    dctint t21a = (IN(21) * 14053 - IN(11) *  8423 + (1 << 13)) >> 14;
+    dctint t26a = (IN(21) *  8423 + IN(11) * 14053 + (1 << 13)) >> 14;
+    dctint t22a = (IN(13) *  9760 - IN(19) * 13160 + (1 << 13)) >> 14;
+    dctint t25a = (IN(13) * 13160 + IN(19) *  9760 + (1 << 13)) >> 14;
+    dctint t23a = (IN(29) * 16207 - IN( 3) *  2404 + (1 << 13)) >> 14;
+    dctint t24a = (IN(29) *  2404 + IN( 3) * 16207 + (1 << 13)) >> 14;
+    /* Butterflies on the rotated terms. */
+    dctint t0  = t0a  + t3a;
+    dctint t1  = t1a  + t2a;
+    dctint t2  = t1a  - t2a;
+    dctint t3  = t0a  - t3a;
+    dctint t4  = t4a  + t5a;
+    dctint t5  = t4a  - t5a;
+    dctint t6  = t7a  - t6a;
+    dctint t7  = t7a  + t6a;
+    dctint t8  = t8a  + t9a;
+    dctint t9  = t8a  - t9a;
+    dctint t10 = t11a - t10a;
+    dctint t11 = t11a + t10a;
+    dctint t12 = t12a + t13a;
+    dctint t13 = t12a - t13a;
+    dctint t14 = t15a - t14a;
+    dctint t15 = t15a + t14a;
+    dctint t16 = t16a + t17a;
+    dctint t17 = t16a - t17a;
+    dctint t18 = t19a - t18a;
+    dctint t19 = t19a + t18a;
+    dctint t20 = t20a + t21a;
+    dctint t21 = t20a - t21a;
+    dctint t22 = t23a - t22a;
+    dctint t23 = t23a + t22a;
+    dctint t24 = t24a + t25a;
+    dctint t25 = t24a - t25a;
+    dctint t26 = t27a - t26a;
+    dctint t27 = t27a + t26a;
+    dctint t28 = t28a + t29a;
+    dctint t29 = t28a - t29a;
+    dctint t30 = t31a - t30a;
+    dctint t31 = t31a + t30a;
+    /* Second set of rotations; from here on both "tN" and "tNa" names are reused, so order matters. */
+    t5a = ((t6 - t5) * 11585 + (1 << 13)) >> 14;
+    t6a = ((t6 + t5) * 11585 + (1 << 13)) >> 14;
+    t9a  = (  t14 *  6270 - t9  * 15137  + (1 << 13)) >> 14;
+    t14a = (  t14 * 15137 + t9  *  6270  + (1 << 13)) >> 14;
+    t10a = (-(t13 * 15137 + t10 *  6270) + (1 << 13)) >> 14;
+    t13a = (  t13 *  6270 - t10 * 15137  + (1 << 13)) >> 14;
+    t17a = (  t30 *  3196 - t17 * 16069  + (1 << 13)) >> 14;
+    t30a = (  t30 * 16069 + t17 *  3196  + (1 << 13)) >> 14;
+    t18a = (-(t29 * 16069 + t18 *  3196) + (1 << 13)) >> 14;
+    t29a = (  t29 *  3196 - t18 * 16069  + (1 << 13)) >> 14;
+    t21a = (  t26 * 13623 - t21 *  9102  + (1 << 13)) >> 14;
+    t26a = (  t26 *  9102 + t21 * 13623  + (1 << 13)) >> 14;
+    t22a = (-(t25 *  9102 + t22 * 13623) + (1 << 13)) >> 14;
+    t25a = (  t25 * 13623 - t22 *  9102  + (1 << 13)) >> 14;
+    /* Butterflies. */
+    t0a  = t0   + t7;
+    t1a  = t1   + t6a;
+    t2a  = t2   + t5a;
+    t3a  = t3   + t4;
+    t4a  = t3   - t4;
+    t5   = t2   - t5a;
+    t6   = t1   - t6a;
+    t7a  = t0   - t7;
+    t8a  = t8   + t11;
+    t9   = t9a  + t10a;
+    t10  = t9a  - t10a;
+    t11a = t8   - t11;
+    t12a = t15  - t12;
+    t13  = t14a - t13a;
+    t14  = t14a + t13a;
+    t15a = t15  + t12;
+    t16a = t16  + t19;
+    t17  = t17a + t18a;
+    t18  = t17a - t18a;
+    t19a = t16  - t19;
+    t20a = t23  - t20;
+    t21  = t22a - t21a;
+    t22  = t22a + t21a;
+    t23a = t23  + t20;
+    t24a = t24  + t27;
+    t25  = t25a + t26a;
+    t26  = t25a - t26a;
+    t27a = t24  - t27;
+    t28a = t31  - t28;
+    t29  = t30a - t29a;
+    t30  = t30a + t29a;
+    t31a = t31  + t28;
+    /* Third set of rotations (11585 ~= cos(pi/4), 6270/15137 ~= sin/cos(pi/8), all Q14). */
+    t10a = ((t13  - t10)  * 11585 + (1 << 13)) >> 14;
+    t13a = ((t13  + t10)  * 11585 + (1 << 13)) >> 14;
+    t11  = ((t12a - t11a) * 11585 + (1 << 13)) >> 14;
+    t12  = ((t12a + t11a) * 11585 + (1 << 13)) >> 14;
+    t18a = (  t29  *  6270 - t18  * 15137  + (1 << 13)) >> 14;
+    t29a = (  t29  * 15137 + t18  *  6270  + (1 << 13)) >> 14;
+    t19  = (  t28a *  6270 - t19a * 15137  + (1 << 13)) >> 14;
+    t28  = (  t28a * 15137 + t19a *  6270  + (1 << 13)) >> 14;
+    t20  = (-(t27a * 15137 + t20a *  6270) + (1 << 13)) >> 14;
+    t27  = (  t27a *  6270 - t20a * 15137  + (1 << 13)) >> 14;
+    t21a = (-(t26  * 15137 + t21  *  6270) + (1 << 13)) >> 14;
+    t26a = (  t26  *  6270 - t21  * 15137  + (1 << 13)) >> 14;
+    /* Butterflies. */
+    t0   = t0a + t15a;
+    t1   = t1a + t14;
+    t2   = t2a + t13a;
+    t3   = t3a + t12;
+    t4   = t4a + t11;
+    t5a  = t5  + t10a;
+    t6a  = t6  + t9;
+    t7   = t7a + t8a;
+    t8   = t7a - t8a;
+    t9a  = t6  - t9;
+    t10  = t5  - t10a;
+    t11a = t4a - t11;
+    t12a = t3a - t12;
+    t13  = t2a - t13a;
+    t14a = t1a - t14;
+    t15  = t0a - t15a;
+    t16  = t16a + t23a;
+    t17a = t17  + t22;
+    t18  = t18a + t21a;
+    t19a = t19  + t20;
+    t20a = t19  - t20;
+    t21  = t18a - t21a;
+    t22a = t17  - t22;
+    t23  = t16a - t23a;
+    t24  = t31a - t24a;
+    t25a = t30  - t25;
+    t26  = t29a - t26a;
+    t27a = t28  - t27;
+    t28a = t28  + t27;
+    t29  = t29a + t26a;
+    t30a = t30  + t25;
+    t31  = t31a + t24a;
+    /* Final rotations by 11585 (~= cos(pi/4) in Q14) on the middle odd terms. */
+    t20  = ((t27a - t20a) * 11585 + (1 << 13)) >> 14;
+    t27  = ((t27a + t20a) * 11585 + (1 << 13)) >> 14;
+    t21a = ((t26  - t21 ) * 11585 + (1 << 13)) >> 14;
+    t26a = ((t26  + t21 ) * 11585 + (1 << 13)) >> 14;
+    t22  = ((t25a - t22a) * 11585 + (1 << 13)) >> 14;
+    t25  = ((t25a + t22a) * 11585 + (1 << 13)) >> 14;
+    t23a = ((t24  - t23 ) * 11585 + (1 << 13)) >> 14;
+    t24a = ((t24  + t23 ) * 11585 + (1 << 13)) >> 14;
+    /* Output butterfly: out[k] and out[31 - k] are the sum and difference of one pair. */
+    out[ 0] = t0   + t31;
+    out[ 1] = t1   + t30a;
+    out[ 2] = t2   + t29;
+    out[ 3] = t3   + t28a;
+    out[ 4] = t4   + t27;
+    out[ 5] = t5a  + t26a;
+    out[ 6] = t6a  + t25;
+    out[ 7] = t7   + t24a;
+    out[ 8] = t8   + t23a;
+    out[ 9] = t9a  + t22;
+    out[10] = t10  + t21a;
+    out[11] = t11a + t20;
+    out[12] = t12a + t19a;
+    out[13] = t13  + t18;
+    out[14] = t14a + t17a;
+    out[15] = t15  + t16;
+    out[16] = t15  - t16;
+    out[17] = t14a - t17a;
+    out[18] = t13  - t18;
+    out[19] = t12a - t19a;
+    out[20] = t11a - t20;
+    out[21] = t10  - t21a;
+    out[22] = t9a  - t22;
+    out[23] = t8   - t23a;
+    out[24] = t7   - t24a;
+    out[25] = t6a  - t25;
+    out[26] = t5a  - t26a;
+    out[27] = t4   - t27;
+    out[28] = t3   - t28a;
+    out[29] = t2   - t29;
+    out[30] = t1   - t30a;
+    out[31] = t0   - t31;
+}
+
+itxfm_wrapper(idct, idct, 32, 6, 1) /* 32-point size is instantiated for idct/idct only (there is no iadst32_1d), with bits = 6 */
+
+static av_always_inline void iwht4_1d(const dctcoef *in, ptrdiff_t stride,
+                                      dctcoef *out, int pass)
+{
+    /*
+     * One 1-D pass of the 4-point "iwht" transform (registered in the
+     * lossless slot by vp9dsp_itxfm_init). The four inputs are read through
+     * IN() and shifted down by 2 bits on pass 0 only; any other pass uses
+     * them as they are.
+     */
+    const int shift = pass == 0 ? 2 : 0;
+    int a = IN(0) >> shift;
+    int b = IN(1) >> shift;
+    int c = IN(2) >> shift;
+    int d = IN(3) >> shift;
+    int half;
+
+    /* In-place update chain: every statement consumes the previous result. */
+    a   += b;
+    c   -= d;
+    half = (a - c) >> 1;
+    d    = half - d;
+    b    = half - b;
+    a   -= d;
+    c   += b;
+
+    /* Output order: the term that started as IN(3) lands in out[1]. */
+    out[0] = a;
+    out[1] = d;
+    out[2] = b;
+    out[3] = c;
+}
+
+itxfm_wrapper(iwht, iwht, 4, 0, 0) /* bits = 0: the wrapper tail then uses out[j] without a final rounding shift */
+
+#undef IN /* the helper macros are file-local scaffolding for the transforms above */
+#undef itxfm_wrapper
+#undef itxfm_wrap
+
+static av_cold void vp9dsp_itxfm_init(VP9DSPContext *dsp)
+{
+#define set_itxfm(tx, dd, da, ad, aa) \
+    dsp->itxfm_add[tx][DCT_DCT]   = dd; \
+    dsp->itxfm_add[tx][DCT_ADST]  = da; \
+    dsp->itxfm_add[tx][ADST_DCT]  = ad; \
+    dsp->itxfm_add[tx][ADST_ADST] = aa
+#define init_itxfm(tx, sz) /* a distinct function for each of the four transform types */ \
+    set_itxfm(tx, idct_idct_##sz##_add_c,  iadst_idct_##sz##_add_c, \
+                  idct_iadst_##sz##_add_c, iadst_iadst_##sz##_add_c)
+#define init_idct(tx, nm) /* the same function in all four transform-type slots */ \
+    set_itxfm(tx, nm##_add_c, nm##_add_c, nm##_add_c, nm##_add_c)
+    /* Fill dsp->itxfm_add[size][type]; row 4 is the lossless (iwht) entry. */
+    init_itxfm(TX_4X4,   4x4);
+    init_itxfm(TX_8X8,   8x8);
+    init_itxfm(TX_16X16, 16x16);
+    init_idct(TX_32X32,  idct_idct_32x32);
+    init_idct(4 /* lossless */, iwht_iwht_4x4);
+
+#undef set_itxfm
+#undef init_itxfm
+#undef init_idct
+}
+
+static av_always_inline void loop_filter(pixel *dst, int E, int I, int H,
+                                         ptrdiff_t stridea, ptrdiff_t strideb,
+                                         int wd)
+{ /* Filter 8 positions of one edge; wd (4, 8 or 16 in this file) caps the filter width that may be used. */
+    int i, F = 1 << (BIT_DEPTH - 8); /* F: flatness threshold, 1 at 8 bits and scaled with bit depth */
+    /* Scale the E, I and H thresholds by the same bit-depth factor. */
+    E <<= (BIT_DEPTH - 8);
+    I <<= (BIT_DEPTH - 8);
+    H <<= (BIT_DEPTH - 8);
+    for (i = 0; i < 8; i++, dst += stridea) { /* stridea steps along the edge, strideb steps across it */
+        int p7, p6, p5, p4;
+        int p3 = dst[strideb * -4], p2 = dst[strideb * -3]; /* p* lie before the edge, q* from dst[0] onwards */
+        int p1 = dst[strideb * -2], p0 = dst[strideb * -1];
+        int q0 = dst[strideb * +0], q1 = dst[strideb * +1];
+        int q2 = dst[strideb * +2], q3 = dst[strideb * +3];
+        int q4, q5, q6, q7;
+        int fm = FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
+                 FFABS(p1 - p0) <= I && FFABS(q1 - q0) <= I &&
+                 FFABS(q2 - q1) <= I && FFABS(q3 - q2) <= I &&
+                 FFABS(p0 - q0) * 2 + (FFABS(p1 - q1) >> 1) <= E;
+        int flat8out, flat8in;
+        /* fm is the filter mask: this position is left untouched when it is not set. */
+        if (!fm)
+            continue;
+        /* The outer four taps per side are only loaded (and flat8out only set) for wd >= 16. */
+        if (wd >= 16) {
+            p7 = dst[strideb * -8];
+            p6 = dst[strideb * -7];
+            p5 = dst[strideb * -6];
+            p4 = dst[strideb * -5];
+            q4 = dst[strideb * +4];
+            q5 = dst[strideb * +5];
+            q6 = dst[strideb * +6];
+            q7 = dst[strideb * +7];
+
+            flat8out = FFABS(p7 - p0) <= F && FFABS(p6 - p0) <= F &&
+                       FFABS(p5 - p0) <= F && FFABS(p4 - p0) <= F &&
+                       FFABS(q4 - q0) <= F && FFABS(q5 - q0) <= F &&
+                       FFABS(q6 - q0) <= F && FFABS(q7 - q0) <= F;
+        }
+        /* flat8in is only set for wd >= 8; every later read is guarded by the same wd test. */
+        if (wd >= 8)
+            flat8in = FFABS(p3 - p0) <= F && FFABS(p2 - p0) <= F &&
+                      FFABS(p1 - p0) <= F && FFABS(q1 - q0) <= F &&
+                      FFABS(q2 - q0) <= F && FFABS(q3 - q0) <= F;
+        /* Widest filter: rewrites the 14 pixels p6..q6, each as (sum of 16 weighted taps + 8) >> 4. */
+        if (wd >= 16 && flat8out && flat8in) {
+            dst[strideb * -7] = (p7 + p7 + p7 + p7 + p7 + p7 + p7 + p6 * 2 +
+                                 p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
+            dst[strideb * -6] = (p7 + p7 + p7 + p7 + p7 + p7 + p6 + p5 * 2 +
+                                 p4 + p3 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
+            dst[strideb * -5] = (p7 + p7 + p7 + p7 + p7 + p6 + p5 + p4 * 2 +
+                                 p3 + p2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
+            dst[strideb * -4] = (p7 + p7 + p7 + p7 + p6 + p5 + p4 + p3 * 2 +
+                                 p2 + p1 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
+            dst[strideb * -3] = (p7 + p7 + p7 + p6 + p5 + p4 + p3 + p2 * 2 +
+                                 p1 + p0 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
+            dst[strideb * -2] = (p7 + p7 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
+                                 p0 + q0 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
+            dst[strideb * -1] = (p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
+                                 q0 + q1 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
+            dst[strideb * +0] = (p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
+                                 q1 + q2 + q3 + q4 + q5 + q6 + q7 + 8) >> 4;
+            dst[strideb * +1] = (p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
+                                 q2 + q3 + q4 + q5 + q6 + q7 + q7 + 8) >> 4;
+            dst[strideb * +2] = (p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
+                                 q3 + q4 + q5 + q6 + q7 + q7 + q7 + 8) >> 4;
+            dst[strideb * +3] = (p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 +
+                                 q4 + q5 + q6 + q7 + q7 + q7 + q7 + 8) >> 4;
+            dst[strideb * +4] = (p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 +
+                                 q5 + q6 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
+            dst[strideb * +5] = (p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 +
+                                 q6 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
+            dst[strideb * +6] = (p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 +
+                                 q7 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
+        } else if (wd >= 8 && flat8in) { /* rewrites p2..q2, each as (sum of 8 weighted taps + 4) >> 3 */
+            dst[strideb * -3] = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
+            dst[strideb * -2] = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
+            dst[strideb * -1] = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
+            dst[strideb * +0] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
+            dst[strideb * +1] = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3;
+            dst[strideb * +2] = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3;
+        } else { /* narrow filter: hev is set when either inner step exceeds H */
+            int hev = FFABS(p1 - p0) > H || FFABS(q1 - q0) > H;
+
+            if (hev) { /* only p0 and q0 are adjusted */
+                int f = av_clip_intp2(p1 - q1, BIT_DEPTH - 1), f1, f2;
+                f = av_clip_intp2(3 * (q0 - p0) + f, BIT_DEPTH - 1);
+
+                f1 = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
+                f2 = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
+
+                dst[strideb * -1] = av_clip_pixel(p0 + f2);
+                dst[strideb * +0] = av_clip_pixel(q0 - f1);
+            } else { /* p0/q0 are adjusted, then p1/q1 by (f1 + 1) >> 1 */
+                int f = av_clip_intp2(3 * (q0 - p0), BIT_DEPTH - 1), f1, f2;
+
+                f1 = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
+                f2 = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
+
+                dst[strideb * -1] = av_clip_pixel(p0 + f2);
+                dst[strideb * +0] = av_clip_pixel(q0 - f1);
+
+                f = (f1 + 1) >> 1;
+                dst[strideb * -2] = av_clip_pixel(p1 + f);
+                dst[strideb * +1] = av_clip_pixel(q1 - f);
+            }
+        }
+    }
+}
+
+#define lf_8_fn(dir, wd, stridea, strideb) /* defines loop_filter_<dir>_<wd>_8_c: one 8-position edge */ \
+static void loop_filter_##dir##_##wd##_8_c(uint8_t *_dst, \
+                                           ptrdiff_t stride, \
+                                           int E, int I, int H) \
+{ \
+    pixel *dst = (pixel *) _dst; \
+    stride /= sizeof(pixel); /* stride is divided by the pixel size before being used as stridea/strideb */ \
+    loop_filter(dst, E, I, H, stridea, strideb, wd); \
+}
+/* h: walk down by stride with taps 1 apart; v: walk along by 1 with taps stride apart. */
+#define lf_8_fns(wd) \
+lf_8_fn(h, wd, stride, 1) \
+lf_8_fn(v, wd, 1, stride)
+/* Instantiate both directions for filter widths 4, 8 and 16. */
+lf_8_fns(4)
+lf_8_fns(8)
+lf_8_fns(16)
+
+#undef lf_8_fn
+#undef lf_8_fns
+
+#define lf_16_fn(dir, stridea) /* defines loop_filter_<dir>_16_16_c: two adjacent 8-position wd = 16 edges */ \
+static void loop_filter_##dir##_16_16_c(uint8_t *dst, \
+                                        ptrdiff_t stride, \
+                                        int E, int I, int H) \
+{ \
+    loop_filter_##dir##_16_8_c(dst, stride, E, I, H); \
+    loop_filter_##dir##_16_8_c(dst + 8 * stridea, stride, E, I, H); /* dst is uint8_t *, so stridea is in bytes */ \
+}
+/* Second half starts 8 rows down (h: 8 * stride) or 8 pixels along (v: 8 * sizeof(pixel)). */
+lf_16_fn(h, stride)
+lf_16_fn(v, sizeof(pixel))
+
+#undef lf_16_fn
+
+#define lf_mix_fn(dir, wd1, wd2, stridea) /* defines loop_filter_<dir>_<wd1><wd2>_16_c: two 8-position edges of different width */ \
+static void loop_filter_##dir##_##wd1##wd2##_16_c(uint8_t *dst, \
+                                                  ptrdiff_t stride, \
+                                                  int E, int I, int H) \
+{ \
+    loop_filter_##dir##_##wd1##_8_c(dst, stride, E & 0xff, I & 0xff, H & 0xff); /* low byte: first half */ \
+    loop_filter_##dir##_##wd2##_8_c(dst + 8 * stridea, stride, E >> 8, I >> 8, H >> 8); /* bits 8 and up: second half */ \
+}
+/* Same byte offsets for the second half as the 16-position functions: 8 * stride (h) or 8 * sizeof(pixel) (v). */
+#define lf_mix_fns(wd1, wd2) \
+lf_mix_fn(h, wd1, wd2, stride) \
+lf_mix_fn(v, wd1, wd2, sizeof(pixel))
+/* Every pairing of widths 4 and 8. */
+lf_mix_fns(4, 4)
+lf_mix_fns(4, 8)
+lf_mix_fns(8, 4)
+lf_mix_fns(8, 8)
+
+#undef lf_mix_fn
+#undef lf_mix_fns
+
+static av_cold void vp9dsp_loopfilter_init(VP9DSPContext *dsp)
+{
+    /* loop_filter_8[width][dir]: width index 0/1/2 = 4/8/16, dir 0 = h, 1 = v */
+    dsp->loop_filter_8[0][1] = loop_filter_v_4_8_c;
+    dsp->loop_filter_8[0][0] = loop_filter_h_4_8_c;
+    dsp->loop_filter_8[1][1] = loop_filter_v_8_8_c;
+    dsp->loop_filter_8[1][0] = loop_filter_h_8_8_c;
+    dsp->loop_filter_8[2][1] = loop_filter_v_16_8_c;
+    dsp->loop_filter_8[2][0] = loop_filter_h_16_8_c;
+    dsp->loop_filter_16[1] = loop_filter_v_16_16_c; /* 16-position, width 16 */
+    dsp->loop_filter_16[0] = loop_filter_h_16_16_c;
+    /* loop_filter_mix2[first width][second width][dir]: width index 0 = 4, 1 = 8 */
+    dsp->loop_filter_mix2[0][0][1] = loop_filter_v_44_16_c;
+    dsp->loop_filter_mix2[0][0][0] = loop_filter_h_44_16_c;
+    dsp->loop_filter_mix2[0][1][1] = loop_filter_v_48_16_c;
+    dsp->loop_filter_mix2[0][1][0] = loop_filter_h_48_16_c;
+    dsp->loop_filter_mix2[1][0][1] = loop_filter_v_84_16_c;
+    dsp->loop_filter_mix2[1][0][0] = loop_filter_h_84_16_c;
+    dsp->loop_filter_mix2[1][1][1] = loop_filter_v_88_16_c;
+    dsp->loop_filter_mix2[1][1][0] = loop_filter_h_88_16_c;
+}
+
+#if BIT_DEPTH != 12
+
+static av_always_inline void copy_c(uint8_t *dst, ptrdiff_t dst_stride,
+                                    const uint8_t *src, ptrdiff_t src_stride,
+                                    int w, int h)
+{
+    /* Copy h rows of w pixels each; both strides are byte offsets. The first row is copied before h is tested. */
+    const size_t row_bytes = w * sizeof(pixel);
+    int row = 0;
+    do
+        memcpy(dst + row * dst_stride, src + row * src_stride, row_bytes);
+    while (++row != h);
+}
+
+static av_always_inline void avg_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                   const uint8_t *_src, ptrdiff_t src_stride,
+                                   int w, int h)
+{
+    /* Merge src into dst through rnd_avg_pixel4(), stepping four pixels at a time over h rows. */
+    const ptrdiff_t dst_step = dst_stride / sizeof(pixel);
+    const ptrdiff_t src_step = src_stride / sizeof(pixel);
+    const pixel *src_row = (const pixel *) _src;
+    pixel *dst_row = (pixel *) _dst;
+    int col;
+
+    do {
+        for (col = 0; col < w; col += 4)
+            AV_WN4PA(&dst_row[col], rnd_avg_pixel4(AV_RN4PA(&dst_row[col]), AV_RN4P(&src_row[col])));
+
+        dst_row += dst_step;
+        src_row += src_step;
+    } while (--h);
+}
+
+#define fpel_fn(type, sz) /* defines <type><sz>_c: fixed-width front end for <type>_c; mx and my are accepted but unused */ \
+static void type##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                         const uint8_t *src, ptrdiff_t src_stride, \
+                         int h, int mx, int my) \
+{ \
+    type##_c(dst, dst_stride, src, src_stride, sz, h); \
+}
+/* One copy and one avg variant per block width. */
+#define copy_avg_fn(sz) \
+fpel_fn(copy, sz) \
+fpel_fn(avg,  sz)
+/* Widths 64 down to 4. */
+copy_avg_fn(64)
+copy_avg_fn(32)
+copy_avg_fn(16)
+copy_avg_fn(8)
+copy_avg_fn(4)
+
+#undef fpel_fn
+#undef copy_avg_fn
+
+#endif /* BIT_DEPTH != 12 */
+
+static const int16_t vp9_subpel_filters[3][16][8] = { /* [filter set][row][tap]; every row sums to 128, matching the "+ 64) >> 7" in FILTER_8TAP */
+    [FILTER_8TAP_REGULAR] = {
+        {  0,  0,   0, 128,   0,   0,  0,  0 }, /* row 0 is the identity: full weight on tap 3 */
+        {  0,  1,  -5, 126,   8,  -3,  1,  0 }, /* rows 1..15: presumably one per 1/16 sub-pixel phase - confirm against callers */
+        { -1,  3, -10, 122,  18,  -6,  2,  0 },
+        { -1,  4, -13, 118,  27,  -9,  3, -1 },
+        { -1,  4, -16, 112,  37, -11,  4, -1 },
+        { -1,  5, -18, 105,  48, -14,  4, -1 },
+        { -1,  5, -19,  97,  58, -16,  5, -1 },
+        { -1,  6, -19,  88,  68, -18,  5, -1 },
+        { -1,  6, -19,  78,  78, -19,  6, -1 },
+        { -1,  5, -18,  68,  88, -19,  6, -1 },
+        { -1,  5, -16,  58,  97, -19,  5, -1 },
+        { -1,  4, -14,  48, 105, -18,  5, -1 },
+        { -1,  4, -11,  37, 112, -16,  4, -1 },
+        { -1,  3,  -9,  27, 118, -13,  4, -1 },
+        {  0,  2,  -6,  18, 122, -10,  3, -1 },
+        {  0,  1,  -3,   8, 126,  -5,  1,  0 },
+    }, [FILTER_8TAP_SHARP] = {
+        {  0,  0,   0, 128,   0,   0,  0,  0 },
+        { -1,  3,  -7, 127,   8,  -3,  1,  0 },
+        { -2,  5, -13, 125,  17,  -6,  3, -1 },
+        { -3,  7, -17, 121,  27, -10,  5, -2 },
+        { -4,  9, -20, 115,  37, -13,  6, -2 },
+        { -4, 10, -23, 108,  48, -16,  8, -3 },
+        { -4, 10, -24, 100,  59, -19,  9, -3 },
+        { -4, 11, -24,  90,  70, -21, 10, -4 },
+        { -4, 11, -23,  80,  80, -23, 11, -4 },
+        { -4, 10, -21,  70,  90, -24, 11, -4 },
+        { -3,  9, -19,  59, 100, -24, 10, -4 },
+        { -3,  8, -16,  48, 108, -23, 10, -4 },
+        { -2,  6, -13,  37, 115, -20,  9, -4 },
+        { -2,  5, -10,  27, 121, -17,  7, -3 },
+        { -1,  3,  -6,  17, 125, -13,  5, -2 },
+        {  0,  1,  -3,   8, 127,  -7,  3, -1 },
+    }, [FILTER_8TAP_SMOOTH] = {
+        {  0,  0,   0, 128,   0,   0,  0,  0 },
+        { -3, -1,  32,  64,  38,   1, -3,  0 },
+        { -2, -2,  29,  63,  41,   2, -3,  0 },
+        { -2, -2,  26,  63,  43,   4, -4,  0 },
+        { -2, -3,  24,  62,  46,   5, -4,  0 },
+        { -2, -3,  21,  60,  49,   7, -4,  0 },
+        { -1, -4,  18,  59,  51,   9, -4,  0 },
+        { -1, -4,  16,  57,  53,  12, -4, -1 },
+        { -1, -4,  14,  55,  55,  14, -4, -1 },
+        { -1, -4,  12,  53,  57,  16, -4, -1 },
+        {  0, -4,   9,  51,  59,  18, -4, -1 },
+        {  0, -4,   7,  49,  60,  21, -3, -2 },
+        {  0, -4,   5,  46,  62,  24, -3, -2 },
+        {  0, -4,   4,  43,  63,  26, -2, -2 },
+        {  0, -3,   2,  41,  63,  29, -2, -2 },
+        {  0, -3,   1,  38,  64,  32, -1, -3 },
+    }
+};
+
+#define FILTER_8TAP(src, x, F, stride) /* 8-tap sum around src[x], taps -3..+4 spaced by stride, rounded (+64) >> 7, clipped */ \
+    av_clip_pixel((F[0] * src[x + -3 * stride] + \
+                   F[1] * src[x + -2 * stride] + \
+                   F[2] * src[x + -1 * stride] + \
+                   F[3] * src[x + +0 * stride] + /* NOTE(review): arguments are not parenthesised; pass plain names or literals */ \
+                   F[4] * src[x + +1 * stride] + \
+                   F[5] * src[x + +2 * stride] + \
+                   F[6] * src[x + +3 * stride] + \
+                   F[7] * src[x + +4 * stride] + 64) >> 7)
+
+static av_always_inline void do_8tap_1d_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                          const uint8_t *_src, ptrdiff_t src_stride,
+                                          int w, int h, ptrdiff_t ds,
+                                          const int16_t *filter, int avg)
+{
+    /*
+     * Run FILTER_8TAP over a w x h block with the taps spaced ds pixels apart.
+     * With avg set the result is merged into dst as (dst + value + 1) >> 1,
+     * otherwise it overwrites dst. Both strides are divided by sizeof(pixel).
+     */
+    const ptrdiff_t dst_step = dst_stride / sizeof(pixel);
+    const ptrdiff_t src_step = src_stride / sizeof(pixel);
+    const pixel *src_row = (const pixel *) _src;
+    pixel *dst_row = (pixel *) _dst;
+    int col;
+
+    do {
+        for (col = 0; col < w; col++)
+            dst_row[col] = avg ? (dst_row[col] + FILTER_8TAP(src_row, col, filter, ds) + 1) >> 1
+                               : FILTER_8TAP(src_row, col, filter, ds);
+        dst_row += dst_step;
+        src_row += src_step;
+    } while (--h);
+}
+/* Emits an av_noinline wrapper named <opn>_8tap_1d_<dir>_c around do_8tap_1d_c(); ds is the tap-spacing expression, evaluated inside the wrapper where src_stride is in scope. */
+#define filter_8tap_1d_fn(opn, opa, dir, ds) \
+static av_noinline void opn##_8tap_1d_##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                const uint8_t *src, ptrdiff_t src_stride, \
+                                                int w, int h, const int16_t *filter) \
+{ \
+    do_8tap_1d_c(dst, dst_stride, src, src_stride, w, h, ds, filter, opa); \
+}
+/* v variants space the taps one row apart (src_stride in pixels), h variants one pixel apart; opa = 1 selects averaging. */
+filter_8tap_1d_fn(put, 0, v, src_stride / sizeof(pixel))
+filter_8tap_1d_fn(put, 0, h, 1)
+filter_8tap_1d_fn(avg, 1, v, src_stride / sizeof(pixel))
+filter_8tap_1d_fn(avg, 1, h, 1)
+
+#undef filter_8tap_1d_fn
+/* Separable 2D 8-tap worker: a horizontal pass with filterx fills a temp buffer, then a vertical pass with filtery reads that buffer and stores (avg == 0) or averages (avg != 0) into dst. */
+static av_always_inline void do_8tap_2d_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                          const uint8_t *_src, ptrdiff_t src_stride,
+                                          int w, int h, const int16_t *filterx,
+                                          const int16_t *filtery, int avg)
+{
+    int tmp_h = h + 7; /* output rows plus 3 rows above and 4 below for the vertical taps */
+    pixel tmp[64 * 71], *tmp_ptr = tmp; /* 64 pixels per row, 71 = 64 + 7 rows; assumes w <= 64 and h <= 64 (not checked here) */
+    pixel *dst = (pixel *) _dst;
+    const pixel *src = (const pixel *) _src;
+    /* Convert strides for pixel-pointer arithmetic, then back up 3 rows so temp row 3 lines up with the first output row. */
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+    src -= src_stride * 3;
+    do {
+        int x;
+        /* Horizontal pass: taps are 1 pixel apart. */
+        for (x = 0; x < w; x++)
+            tmp_ptr[x] = FILTER_8TAP(src, x, filterx, 1);
+
+        tmp_ptr += 64;
+        src += src_stride;
+    } while (--tmp_h);
+    /* Vertical pass: start at temp row 3; taps are one temp row (64 pixels) apart. */
+    tmp_ptr = tmp + 64 * 3;
+    do {
+        int x;
+
+        for (x = 0; x < w; x++)
+            if (avg) {
+                dst[x] = (dst[x] + FILTER_8TAP(tmp_ptr, x, filtery, 64) + 1) >> 1;
+            } else {
+                dst[x] = FILTER_8TAP(tmp_ptr, x, filtery, 64);
+            }
+
+        tmp_ptr += 64;
+        dst += dst_stride;
+    } while (--h);
+}
+/* Emits an av_noinline wrapper named <opn>_8tap_2d_hv_c around do_8tap_2d_c() with the avg flag fixed to opa. */
+#define filter_8tap_2d_fn(opn, opa) \
+static av_noinline void opn##_8tap_2d_hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                           const uint8_t *src, ptrdiff_t src_stride, \
+                                           int w, int h, const int16_t *filterx, \
+                                           const int16_t *filtery) \
+{ \
+    do_8tap_2d_c(dst, dst_stride, src, src_stride, w, h, filterx, filtery, opa); \
+}
+/* put stores the filtered value, avg blends it with the existing destination. */
+filter_8tap_2d_fn(put, 0)
+filter_8tap_2d_fn(avg, 1)
+
+#undef filter_8tap_2d_fn
+/* Generates a fixed-width (sz) 1D entry point; dir_m names the argument (mx or my) that indexes vp9_subpel_filters[type_idx], the other one is unused. */
+#define filter_fn_1d(sz, dir, dir_m, type, type_idx, avg) \
+static void avg##_8tap_##type##_##sz##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                              const uint8_t *src, ptrdiff_t src_stride, \
+                                              int h, int mx, int my) \
+{ \
+    avg##_8tap_1d_##dir##_c(dst, dst_stride, src, src_stride, sz, h, \
+                            vp9_subpel_filters[type_idx][dir_m]); \
+}
+/* Generates a fixed-width (sz) 2D entry point: mx selects the horizontal filter row, my the vertical one. */
+#define filter_fn_2d(sz, type, type_idx, avg) \
+static void avg##_8tap_##type##_##sz##hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                           const uint8_t *src, ptrdiff_t src_stride, \
+                                           int h, int mx, int my) \
+{ \
+    avg##_8tap_2d_hv_c(dst, dst_stride, src, src_stride, sz, h, \
+                       vp9_subpel_filters[type_idx][mx], \
+                       vp9_subpel_filters[type_idx][my]); \
+}
+/* Bilinear code below is compiled only when BIT_DEPTH != 12; the 12-bit init routines call the *_10 init functions instead. */
+#if BIT_DEPTH != 12
+/* Linear interpolation from src[x] towards src[x + stride] with weight mxy / 16, rounded (+8, >> 4). Arguments are not parenthesized. */
+#define FILTER_BILIN(src, x, mxy, stride) \
+    (src[x] + ((mxy * (src[x + stride] - src[x]) + 8) >> 4))
+/* 1D bilinear worker: interpolates between each pixel and its neighbour ds pixels away with weight mxy, storing (avg == 0) or averaging into dst (avg != 0). */
+static av_always_inline void do_bilin_1d_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                           const uint8_t *_src, ptrdiff_t src_stride,
+                                           int w, int h, ptrdiff_t ds, int mxy, int avg)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *src = (const pixel *) _src;
+    /* Strides are divided by sizeof(pixel) so they can step the pixel pointers; ds is used as passed. */
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+    do {
+        int x;
+        /* Each output sample reads src[x] and src[x + ds]. */
+        for (x = 0; x < w; x++)
+            if (avg) {
+                dst[x] = (dst[x] + FILTER_BILIN(src, x, mxy, ds) + 1) >> 1;
+            } else {
+                dst[x] = FILTER_BILIN(src, x, mxy, ds);
+            }
+        /* Next row; the do/while tests h only after the body, so h == 0 is not handled. */
+        dst += dst_stride;
+        src += src_stride;
+    } while (--h);
+}
+/* Emits an av_noinline wrapper named <opn>_bilin_1d_<dir>_c around do_bilin_1d_c(); ds is evaluated inside the wrapper where src_stride is in scope. */
+#define bilin_1d_fn(opn, opa, dir, ds) \
+static av_noinline void opn##_bilin_1d_##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                 const uint8_t *src, ptrdiff_t src_stride, \
+                                                 int w, int h, int mxy) \
+{ \
+    do_bilin_1d_c(dst, dst_stride, src, src_stride, w, h, ds, mxy, opa); \
+}
+/* v variants pair each pixel with the one a row below (src_stride in pixels), h variants with the next pixel; opa = 1 selects averaging. */
+bilin_1d_fn(put, 0, v, src_stride / sizeof(pixel))
+bilin_1d_fn(put, 0, h, 1)
+bilin_1d_fn(avg, 1, v, src_stride / sizeof(pixel))
+bilin_1d_fn(avg, 1, h, 1)
+
+#undef bilin_1d_fn
+/* 2D bilinear worker: a horizontal pass with weight mx fills a temp buffer, then a vertical pass with weight my reads it and stores (avg == 0) or averages (avg != 0) into dst. */
+static av_always_inline void do_bilin_2d_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                           const uint8_t *_src, ptrdiff_t src_stride,
+                                           int w, int h, int mx, int my, int avg)
+{
+    pixel tmp[64 * 65], *tmp_ptr = tmp; /* 64 pixels per row, 65 = 64 + 1 rows; assumes w <= 64 and h <= 64 (not checked here) */
+    int tmp_h = h + 1; /* the vertical pass needs one extra row below the block */
+    pixel *dst = (pixel *) _dst;
+    const pixel *src = (const pixel *) _src;
+    /* Strides are divided by sizeof(pixel) so they can step the pixel pointers. */
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+    do {
+        int x;
+        /* Horizontal pass: blend each pixel with its right-hand neighbour. */
+        for (x = 0; x < w; x++)
+            tmp_ptr[x] = FILTER_BILIN(src, x, mx, 1);
+
+        tmp_ptr += 64;
+        src += src_stride;
+    } while (--tmp_h);
+    /* Vertical pass: blend each temp row with the one 64 pixels further on. */
+    tmp_ptr = tmp;
+    do {
+        int x;
+
+        for (x = 0; x < w; x++)
+            if (avg) {
+                dst[x] = (dst[x] + FILTER_BILIN(tmp_ptr, x, my, 64) + 1) >> 1;
+            } else {
+                dst[x] = FILTER_BILIN(tmp_ptr, x, my, 64);
+            }
+
+        tmp_ptr += 64;
+        dst += dst_stride;
+    } while (--h);
+}
+/* Emits an av_noinline wrapper named <opn>_bilin_2d_hv_c around do_bilin_2d_c() with the avg flag fixed to opa. */
+#define bilin_2d_fn(opn, opa) \
+static av_noinline void opn##_bilin_2d_hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                            const uint8_t *src, ptrdiff_t src_stride, \
+                                            int w, int h, int mx, int my) \
+{ \
+    do_bilin_2d_c(dst, dst_stride, src, src_stride, w, h, mx, my, opa); \
+}
+/* put stores the interpolated value, avg blends it with the existing destination. */
+bilin_2d_fn(put, 0)
+bilin_2d_fn(avg, 1)
+
+#undef bilin_2d_fn
+/* Generates a fixed-width (sz) 1D bilinear entry point; dir_m names the argument (mx or my) passed on as the interpolation weight. */
+#define bilinf_fn_1d(sz, dir, dir_m, avg) \
+static void avg##_bilin_##sz##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                      const uint8_t *src, ptrdiff_t src_stride, \
+                                      int h, int mx, int my) \
+{ \
+    avg##_bilin_1d_##dir##_c(dst, dst_stride, src, src_stride, sz, h, dir_m); \
+}
+/* Generates a fixed-width (sz) 2D bilinear entry point that forwards both mx and my. */
+#define bilinf_fn_2d(sz, avg) \
+static void avg##_bilin_##sz##hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                   const uint8_t *src, ptrdiff_t src_stride, \
+                                   int h, int mx, int my) \
+{ \
+    avg##_bilin_2d_hv_c(dst, dst_stride, src, src_stride, sz, h, mx, my); \
+}
+
+#else
+/* BIT_DEPTH == 12: the generators expand to nothing, so no bilinear functions are emitted. */
+#define bilinf_fn_1d(a, b, c, d)
+#define bilinf_fn_2d(a, b)
+
+#endif
+/* For one block width sz, emits h, v and hv entry points for the regular, smooth and sharp 8-tap filters plus the bilinear ones (the latter expand to nothing when BIT_DEPTH == 12). */
+#define filter_fn(sz, avg) \
+filter_fn_1d(sz, h, mx, regular, FILTER_8TAP_REGULAR, avg) \
+filter_fn_1d(sz, v, my, regular, FILTER_8TAP_REGULAR, avg) \
+filter_fn_2d(sz,        regular, FILTER_8TAP_REGULAR, avg) \
+filter_fn_1d(sz, h, mx, smooth,  FILTER_8TAP_SMOOTH,  avg) \
+filter_fn_1d(sz, v, my, smooth,  FILTER_8TAP_SMOOTH,  avg) \
+filter_fn_2d(sz,        smooth,  FILTER_8TAP_SMOOTH,  avg) \
+filter_fn_1d(sz, h, mx, sharp,   FILTER_8TAP_SHARP,   avg) \
+filter_fn_1d(sz, v, my, sharp,   FILTER_8TAP_SHARP,   avg) \
+filter_fn_2d(sz,        sharp,   FILTER_8TAP_SHARP,   avg) \
+bilinf_fn_1d(sz, h, mx,                               avg) \
+bilinf_fn_1d(sz, v, my,                               avg) \
+bilinf_fn_2d(sz,                                      avg)
+/* Repeats filter_fn for the five block widths 64, 32, 16, 8 and 4. */
+#define filter_fn_set(avg) \
+filter_fn(64, avg) \
+filter_fn(32, avg) \
+filter_fn(16, avg) \
+filter_fn(8,  avg) \
+filter_fn(4,  avg)
+/* Instantiate the full set once for put and once for avg. */
+filter_fn_set(put)
+filter_fn_set(avg)
+
+#undef filter_fn
+#undef filter_fn_set
+#undef filter_fn_1d
+#undef filter_fn_2d
+#undef bilinf_fn_1d
+#undef bilinf_fn_2d
+/* Fills dsp->mc[size][filter][put=0/avg=1][h sub-pel][v sub-pel] with the unscaled motion-compensation functions; size index 0..4 maps to widths 64, 32, 16, 8, 4. */
+#if BIT_DEPTH != 8
+void ff_vp9dsp_mc_init_10(VP9DSPContext *dsp);
+#endif
+#if BIT_DEPTH != 10 /* only the 10-bit instance gets external linkage */
+static
+#endif
+av_cold void FUNC(ff_vp9dsp_mc_init)(VP9DSPContext *dsp)
+{
+#if BIT_DEPTH == 12
+    ff_vp9dsp_mc_init_10(dsp); /* start from the 10-bit table; the sub-pel 8-tap entries are overwritten below */
+#else /* BIT_DEPTH == 12 */
+    /* Full-pel entries ([0][0]) use the same copy<sz>_c / avg<sz>_c function for all four filter types; those functions are defined outside this section. */
+#define init_fpel(idx1, idx2, sz, type) \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = type##sz##_c; \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = type##sz##_c; \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = type##sz##_c; \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = type##sz##_c
+
+#define init_copy_avg(idx, sz) \
+    init_fpel(idx, 0, sz, copy); \
+    init_fpel(idx, 1, sz, avg)
+
+    init_copy_avg(0, 64);
+    init_copy_avg(1, 32);
+    init_copy_avg(2, 16);
+    init_copy_avg(3,  8);
+    init_copy_avg(4,  4);
+
+#undef init_copy_avg
+#undef init_fpel
+
+#endif /* BIT_DEPTH == 12 */
+    /* Sub-pel entries: the three 8-tap types are set for every bit depth; bilinear is added only when BIT_DEPTH != 12. */
+#define init_subpel1_bd_aware(idx1, idx2, idxh, idxv, sz, dir, type) \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_c; \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_c; \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] = type##_8tap_sharp_##sz##dir##_c
+
+#if BIT_DEPTH == 12
+#define init_subpel1 init_subpel1_bd_aware
+#else
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type) \
+    init_subpel1_bd_aware(idx1, idx2, idxh, idxv, sz, dir, type); \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][idxh][idxv] = type##_bilin_##sz##dir##_c
+#endif
+    /* init_subpel2 covers the five block widths for one (idxh, idxv, dir) combination. */
+#define init_subpel2(idx, idxh, idxv, dir, type) \
+    init_subpel1(0, idx, idxh, idxv, 64, dir, type); \
+    init_subpel1(1, idx, idxh, idxv, 32, dir, type); \
+    init_subpel1(2, idx, idxh, idxv, 16, dir, type); \
+    init_subpel1(3, idx, idxh, idxv,  8, dir, type); \
+    init_subpel1(4, idx, idxh, idxv,  4, dir, type)
+    /* init_subpel3 maps [1][1] to hv, [0][1] to v and [1][0] to h. */
+#define init_subpel3(idx, type) \
+    init_subpel2(idx, 1, 1, hv, type); \
+    init_subpel2(idx, 0, 1, v, type); \
+    init_subpel2(idx, 1, 0, h, type)
+
+    init_subpel3(0, put);
+    init_subpel3(1, avg);
+
+#undef init_subpel1
+#undef init_subpel2
+#undef init_subpel3
+#undef init_subpel1_bd_aware
+}
+/* Scaled 2D 8-tap worker: mx/my are the starting positions and dx/dy the per-output-pixel steps, all with 4 fractional bits; the fraction selects a row of filters[] and the integer part advances the source position. */
+static av_always_inline void do_scaled_8tap_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                              const uint8_t *_src, ptrdiff_t src_stride,
+                                              int w, int h, int mx, int my,
+                                              int dx, int dy, int avg,
+                                              const int16_t (*filters)[8])
+{
+    int tmp_h = (((h - 1) * dy + my) >> 4) + 8; /* integer source offset of the last output row, plus 1 row and 7 rows of tap support */
+    pixel tmp[64 * 135], *tmp_ptr = tmp; /* 64 pixels per row; assumes w <= 64 and tmp_h <= 135 (not checked here) */
+    pixel *dst = (pixel *) _dst;
+    const pixel *src = (const pixel *) _src;
+    /* Convert strides for pixel-pointer arithmetic, then back up 3 rows so temp row 3 lines up with the first output row. */
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+    src -= src_stride * 3;
+    do {
+        int x;
+        int imx = mx, ioff = 0;
+        /* Horizontal pass: imx is the fractional phase (filter index), ioff the integer source column. */
+        for (x = 0; x < w; x++) {
+            tmp_ptr[x] = FILTER_8TAP(src, ioff, filters[imx], 1);
+            imx += dx;
+            ioff += imx >> 4;
+            imx &= 0xf;
+        }
+
+        tmp_ptr += 64;
+        src += src_stride;
+    } while (--tmp_h);
+    /* Vertical pass over the temp buffer, one output row per iteration. */
+    tmp_ptr = tmp + 64 * 3;
+    do {
+        int x;
+        const int16_t *filter = filters[my];
+
+        for (x = 0; x < w; x++)
+            if (avg) {
+                dst[x] = (dst[x] + FILTER_8TAP(tmp_ptr, x, filter, 64) + 1) >> 1;
+            } else {
+                dst[x] = FILTER_8TAP(tmp_ptr, x, filter, 64);
+            }
+        /* Step the vertical position: whole rows move tmp_ptr, the fraction stays in my. */
+        my += dy;
+        tmp_ptr += (my >> 4) * 64;
+        my &= 0xf;
+        dst += dst_stride;
+    } while (--h);
+}
+/* Emits an av_noinline wrapper named <opn>_scaled_8tap_c around do_scaled_8tap_c() with the avg flag fixed to opa. */
+#define scaled_filter_8tap_fn(opn, opa) \
+static av_noinline void opn##_scaled_8tap_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                            const uint8_t *src, ptrdiff_t src_stride, \
+                                            int w, int h, int mx, int my, int dx, int dy, \
+                                            const int16_t (*filters)[8]) \
+{ \
+    do_scaled_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, \
+                     opa, filters); \
+}
+/* put stores the filtered value, avg blends it with the existing destination. */
+scaled_filter_8tap_fn(put, 0)
+scaled_filter_8tap_fn(avg, 1)
+
+#undef scaled_filter_8tap_fn
+
+#undef FILTER_8TAP
+/* Generates a fixed-width (sz) scaled entry point that passes the whole vp9_subpel_filters[type_idx] table, since the phase changes per pixel. */
+#define scaled_filter_fn(sz, type, type_idx, avg) \
+static void avg##_scaled_##type##_##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                           const uint8_t *src, ptrdiff_t src_stride, \
+                                           int h, int mx, int my, int dx, int dy) \
+{ \
+    avg##_scaled_8tap_c(dst, dst_stride, src, src_stride, sz, h, mx, my, dx, dy, \
+                        vp9_subpel_filters[type_idx]); \
+}
+/* Scaled bilinear code below is compiled only when BIT_DEPTH != 12. */
+#if BIT_DEPTH != 12
+/* Scaled 2D bilinear worker: mx/my are the starting positions and dx/dy the per-output-pixel steps, all with 4 fractional bits; the fraction is the interpolation weight. */
+static av_always_inline void do_scaled_bilin_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                               const uint8_t *_src, ptrdiff_t src_stride,
+                                               int w, int h, int mx, int my,
+                                               int dx, int dy, int avg)
+{
+    pixel tmp[64 * 129], *tmp_ptr = tmp; /* 64 pixels per row; assumes w <= 64 and tmp_h <= 129 (not checked here) */
+    int tmp_h = (((h - 1) * dy + my) >> 4) + 2; /* integer source offset of the last output row, plus that row and the one below it */
+    pixel *dst = (pixel *) _dst;
+    const pixel *src = (const pixel *) _src;
+    /* Strides are divided by sizeof(pixel) so they can step the pixel pointers. */
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+    do {
+        int x;
+        int imx = mx, ioff = 0;
+        /* Horizontal pass: imx is the fractional weight, ioff the integer source column. */
+        for (x = 0; x < w; x++) {
+            tmp_ptr[x] = FILTER_BILIN(src, ioff, imx, 1);
+            imx += dx;
+            ioff += imx >> 4;
+            imx &= 0xf;
+        }
+
+        tmp_ptr += 64;
+        src += src_stride;
+    } while (--tmp_h);
+    /* Vertical pass over the temp buffer, one output row per iteration. */
+    tmp_ptr = tmp;
+    do {
+        int x;
+
+        for (x = 0; x < w; x++)
+            if (avg) {
+                dst[x] = (dst[x] + FILTER_BILIN(tmp_ptr, x, my, 64) + 1) >> 1;
+            } else {
+                dst[x] = FILTER_BILIN(tmp_ptr, x, my, 64);
+            }
+        /* Step the vertical position: whole rows move tmp_ptr, the fraction stays in my. */
+        my += dy;
+        tmp_ptr += (my >> 4) * 64;
+        my &= 0xf;
+        dst += dst_stride;
+    } while (--h);
+}
+/* Emits an av_noinline wrapper named <opn>_scaled_bilin_c around do_scaled_bilin_c() with the avg flag fixed to opa. */
+#define scaled_bilin_fn(opn, opa) \
+static av_noinline void opn##_scaled_bilin_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                             const uint8_t *src, ptrdiff_t src_stride, \
+                                             int w, int h, int mx, int my, int dx, int dy) \
+{ \
+    do_scaled_bilin_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, opa); \
+}
+/* put stores the interpolated value, avg blends it with the existing destination. */
+scaled_bilin_fn(put, 0)
+scaled_bilin_fn(avg, 1)
+
+#undef scaled_bilin_fn
+
+#undef FILTER_BILIN
+/* Generates a fixed-width (sz) scaled bilinear entry point that forwards to <avg>_scaled_bilin_c(). */
+#define scaled_bilinf_fn(sz, avg) \
+static void avg##_scaled_bilin_##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                        const uint8_t *src, ptrdiff_t src_stride, \
+                                        int h, int mx, int my, int dx, int dy) \
+{ \
+    avg##_scaled_bilin_c(dst, dst_stride, src, src_stride, sz, h, mx, my, dx, dy); \
+}
+
+#else
+/* BIT_DEPTH == 12: the generator expands to nothing, so no scaled bilinear functions are emitted. */
+#define scaled_bilinf_fn(a, b)
+
+#endif
+/* For one block width sz, emits the scaled regular, smooth and sharp 8-tap entry points plus the bilinear one (which expands to nothing when BIT_DEPTH == 12). */
+#define scaled_filter_fns(sz, avg) \
+scaled_filter_fn(sz,        regular, FILTER_8TAP_REGULAR, avg) \
+scaled_filter_fn(sz,        smooth,  FILTER_8TAP_SMOOTH,  avg) \
+scaled_filter_fn(sz,        sharp,   FILTER_8TAP_SHARP,   avg) \
+scaled_bilinf_fn(sz,                                      avg)
+/* Repeats scaled_filter_fns for the five block widths 64, 32, 16, 8 and 4. */
+#define scaled_filter_fn_set(avg) \
+scaled_filter_fns(64, avg) \
+scaled_filter_fns(32, avg) \
+scaled_filter_fns(16, avg) \
+scaled_filter_fns(8,  avg) \
+scaled_filter_fns(4,  avg)
+/* Instantiate the full set once for put and once for avg. */
+scaled_filter_fn_set(put)
+scaled_filter_fn_set(avg)
+
+#undef scaled_filter_fns
+#undef scaled_filter_fn_set
+#undef scaled_filter_fn
+#undef scaled_bilinf_fn
+/* Fills dsp->smc[size][filter][put=0/avg=1] with the scaled motion-compensation functions; size index 0..4 maps to widths 64, 32, 16, 8, 4. */
+#if BIT_DEPTH != 8
+void ff_vp9dsp_scaled_mc_init_10(VP9DSPContext *dsp);
+#endif
+#if BIT_DEPTH != 10 /* only the 10-bit instance gets external linkage */
+static
+#endif
+av_cold void FUNC(ff_vp9dsp_scaled_mc_init)(VP9DSPContext *dsp)
+{
+#define init_scaled_bd_aware(idx1, idx2, sz, type) \
+    dsp->smc[idx1][FILTER_8TAP_SMOOTH ][idx2] = type##_scaled_smooth_##sz##_c; \
+    dsp->smc[idx1][FILTER_8TAP_REGULAR][idx2] = type##_scaled_regular_##sz##_c; \
+    dsp->smc[idx1][FILTER_8TAP_SHARP  ][idx2] = type##_scaled_sharp_##sz##_c
+    /* The three 8-tap types are set for every bit depth; bilinear is added only when BIT_DEPTH != 12. */
+#if BIT_DEPTH == 12
+    ff_vp9dsp_scaled_mc_init_10(dsp); /* start from the 10-bit table; the 8-tap entries are overwritten below */
+#define init_scaled(a,b,c,d) init_scaled_bd_aware(a,b,c,d)
+#else
+#define init_scaled(idx1, idx2, sz, type) \
+    init_scaled_bd_aware(idx1, idx2, sz, type); \
+    dsp->smc[idx1][FILTER_BILINEAR    ][idx2] = type##_scaled_bilin_##sz##_c
+#endif
+    /* Each width gets a put entry (index 0) and an avg entry (index 1). */
+#define init_scaled_put_avg(idx, sz) \
+    init_scaled(idx, 0, sz, put); \
+    init_scaled(idx, 1, sz, avg)
+
+    init_scaled_put_avg(0, 64);
+    init_scaled_put_avg(1, 32);
+    init_scaled_put_avg(2, 16);
+    init_scaled_put_avg(3,  8);
+    init_scaled_put_avg(4,  4);
+
+#undef init_scaled_put_avg
+#undef init_scaled
+#undef init_scaled_bd_aware
+}
+/* Per-bit-depth entry point: passes the same dsp context to the intrapred, itxfm, loopfilter, mc and scaled-mc init routines, in that order. */
+av_cold void FUNC(ff_vp9dsp_init)(VP9DSPContext *dsp)
+{
+    FUNC(ff_vp9dsp_intrapred_init)(dsp);
+    vp9dsp_itxfm_init(dsp);
+    vp9dsp_loopfilter_init(dsp);
+    FUNC(ff_vp9dsp_mc_init)(dsp);
+    FUNC(ff_vp9dsp_scaled_mc_init)(dsp);
+}
diff --git a/libavcodec/vp9mvs.c b/libavcodec/vp9mvs.c
deleted file mode 100644
index 1f65aaa..0000000
--- a/libavcodec/vp9mvs.c
+++ /dev/null
@@ -1,352 +0,0 @@
-/*
- * VP9 compatible video decoder
- *
- * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
- * Copyright (C) 2013 Clément Bœsch <u pkh me>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "internal.h"
-#include "vp56.h"
-#include "vp9.h"
-#include "vp9data.h"
-
-static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
-                                      VP9Context *s)
-{
-    dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
-    dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
-}
-
-static void find_ref_mvs(VP9Context *s,
-                         VP56mv *pmv, int ref, int z, int idx, int sb)
-{
-    static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
-        [BS_64x64] = { {  3, -1 }, { -1,  3 }, {  4, -1 }, { -1,  4 },
-                       { -1, -1 }, {  0, -1 }, { -1,  0 }, {  6, -1 } },
-        [BS_64x32] = { {  0, -1 }, { -1,  0 }, {  4, -1 }, { -1,  2 },
-                       { -1, -1 }, {  0, -3 }, { -3,  0 }, {  2, -1 } },
-        [BS_32x64] = { { -1,  0 }, {  0, -1 }, { -1,  4 }, {  2, -1 },
-                       { -1, -1 }, { -3,  0 }, {  0, -3 }, { -1,  2 } },
-        [BS_32x32] = { {  1, -1 }, { -1,  1 }, {  2, -1 }, { -1,  2 },
-                       { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 } },
-        [BS_32x16] = { {  0, -1 }, { -1,  0 }, {  2, -1 }, { -1, -1 },
-                       { -1,  1 }, {  0, -3 }, { -3,  0 }, { -3, -3 } },
-        [BS_16x32] = { { -1,  0 }, {  0, -1 }, { -1,  2 }, { -1, -1 },
-                       {  1, -1 }, { -3,  0 }, {  0, -3 }, { -3, -3 } },
-        [BS_16x16] = { {  0, -1 }, { -1,  0 }, {  1, -1 }, { -1,  1 },
-                       { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 } },
-        [BS_16x8]  = { {  0, -1 }, { -1,  0 }, {  1, -1 }, { -1, -1 },
-                       {  0, -2 }, { -2,  0 }, { -2, -1 }, { -1, -2 } },
-        [BS_8x16]  = { { -1,  0 }, {  0, -1 }, { -1,  1 }, { -1, -1 },
-                       { -2,  0 }, {  0, -2 }, { -1, -2 }, { -2, -1 } },
-        [BS_8x8]   = { {  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
-                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 } },
-        [BS_8x4]   = { {  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
-                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 } },
-        [BS_4x8]   = { {  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
-                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 } },
-        [BS_4x4]   = { {  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
-                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 } },
-    };
-    VP9Block *const b = &s->b;
-    int row = b->row, col = b->col, row7 = b->row7;
-    const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
-#define INVALID_MV 0x80008000U
-    uint32_t mem = INVALID_MV;
-    int i;
-
-#define RETURN_DIRECT_MV(mv)                    \
-    do {                                        \
-        uint32_t m = AV_RN32A(&mv);             \
-        if (!idx) {                             \
-            AV_WN32A(pmv, m);                   \
-            return;                             \
-        } else if (mem == INVALID_MV) {         \
-            mem = m;                            \
-        } else if (m != mem) {                  \
-            AV_WN32A(pmv, m);                   \
-            return;                             \
-        }                                       \
-    } while (0)
-
-    if (sb >= 0) {
-        if (sb == 2 || sb == 1) {
-            RETURN_DIRECT_MV(b->mv[0][z]);
-        } else if (sb == 3) {
-            RETURN_DIRECT_MV(b->mv[2][z]);
-            RETURN_DIRECT_MV(b->mv[1][z]);
-            RETURN_DIRECT_MV(b->mv[0][z]);
-        }
-
-#define RETURN_MV(mv)                           \
-    do {                                        \
-        if (sb > 0) {                           \
-            VP56mv tmp;                         \
-            uint32_t m;                         \
-            clamp_mv(&tmp, &mv, s);             \
-            m = AV_RN32A(&tmp);                 \
-            if (!idx) {                         \
-                AV_WN32A(pmv, m);               \
-                return;                         \
-            } else if (mem == INVALID_MV) {     \
-                mem = m;                        \
-            } else if (m != mem) {              \
-                AV_WN32A(pmv, m);               \
-                return;                         \
-            }                                   \
-        } else {                                \
-            uint32_t m = AV_RN32A(&mv);         \
-            if (!idx) {                         \
-                clamp_mv(pmv, &mv, s);          \
-                return;                         \
-            } else if (mem == INVALID_MV) {     \
-                mem = m;                        \
-            } else if (m != mem) {              \
-                clamp_mv(pmv, &mv, s);          \
-                return;                         \
-            }                                   \
-        }                                       \
-    } while (0)
-
-        if (row > 0) {
-            VP9MVRefPair *mv = &s->mv[0][(row - 1) * s->sb_cols * 8 + col];
-
-            if (mv->ref[0] == ref)
-                RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
-            else if (mv->ref[1] == ref)
-                RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
-        }
-        if (col > s->tiling.tile_col_start) {
-            VP9MVRefPair *mv = &s->mv[0][row * s->sb_cols * 8 + col - 1];
-
-            if (mv->ref[0] == ref)
-                RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
-            else if (mv->ref[1] == ref)
-                RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
-        }
-        i = 2;
-    } else {
-        i = 0;
-    }
-
-    // previously coded MVs in the neighborhood, using same reference frame
-    for (; i < 8; i++) {
-        int c = p[i][0] + col, r = p[i][1] + row;
-
-        if (c >= s->tiling.tile_col_start && c < s->cols &&
-            r >= 0 && r < s->rows) {
-            VP9MVRefPair *mv = &s->mv[0][r * s->sb_cols * 8 + c];
-
-            if (mv->ref[0] == ref)
-                RETURN_MV(mv->mv[0]);
-            else if (mv->ref[1] == ref)
-                RETURN_MV(mv->mv[1]);
-        }
-    }
-
-    // MV at this position in previous frame, using same reference frame
-    if (s->use_last_frame_mvs) {
-        VP9MVRefPair *mv = &s->mv[1][row * s->sb_cols * 8 + col];
-
-        if (mv->ref[0] == ref)
-            RETURN_MV(mv->mv[0]);
-        else if (mv->ref[1] == ref)
-            RETURN_MV(mv->mv[1]);
-    }
-
-#define RETURN_SCALE_MV(mv, scale)              \
-    do {                                        \
-        if (scale) {                            \
-            VP56mv mv_temp = { -mv.x, -mv.y };  \
-            RETURN_MV(mv_temp);                 \
-        } else {                                \
-            RETURN_MV(mv);                      \
-        }                                       \
-    } while (0)
-
-    // previously coded MVs in the neighborhood, using different reference frame
-    for (i = 0; i < 8; i++) {
-        int c = p[i][0] + col, r = p[i][1] + row;
-
-        if (c >= s->tiling.tile_col_start && c < s->cols &&
-            r >= 0 && r < s->rows) {
-            VP9MVRefPair *mv = &s->mv[0][r * s->sb_cols * 8 + c];
-
-            if (mv->ref[0] != ref && mv->ref[0] >= 0)
-                RETURN_SCALE_MV(mv->mv[0],
-                                s->signbias[mv->ref[0]] != s->signbias[ref]);
-            if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
-                // BUG - libvpx has this condition regardless of whether
-                // we used the first ref MV and pre-scaling
-                AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
-                RETURN_SCALE_MV(mv->mv[1],
-                                s->signbias[mv->ref[1]] != s->signbias[ref]);
-            }
-        }
-    }
-
-    // MV at this position in previous frame, using different reference frame
-    if (s->use_last_frame_mvs) {
-        VP9MVRefPair *mv = &s->mv[1][row * s->sb_cols * 8 + col];
-
-        if (mv->ref[0] != ref && mv->ref[0] >= 0)
-            RETURN_SCALE_MV(mv->mv[0],
-                            s->signbias[mv->ref[0]] != s->signbias[ref]);
-        if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
-            // BUG - libvpx has this condition regardless of whether
-            // we used the first ref MV and pre-scaling
-            AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
-            RETURN_SCALE_MV(mv->mv[1],
-                            s->signbias[mv->ref[1]] != s->signbias[ref]);
-        }
-    }
-
-    AV_ZERO32(pmv);
-#undef INVALID_MV
-#undef RETURN_MV
-#undef RETURN_SCALE_MV
-}
-
-static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
-{
-    int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
-    int n, c = vp8_rac_get_tree(&s->c, ff_vp9_mv_class_tree,
-                                s->prob.p.mv_comp[idx].classes);
-
-    s->counts.mv_comp[idx].sign[sign]++;
-    s->counts.mv_comp[idx].classes[c]++;
-    if (c) {
-        int m;
-
-        for (n = 0, m = 0; m < c; m++) {
-            bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
-            n  |= bit << m;
-            s->counts.mv_comp[idx].bits[m][bit]++;
-        }
-        n <<= 3;
-        bit = vp8_rac_get_tree(&s->c, ff_vp9_mv_fp_tree,
-                               s->prob.p.mv_comp[idx].fp);
-        n  |= bit << 1;
-        s->counts.mv_comp[idx].fp[bit]++;
-        if (hp) {
-            bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
-            s->counts.mv_comp[idx].hp[bit]++;
-            n |= bit;
-        } else {
-            n |= 1;
-            // bug in libvpx - we count for bw entropy purposes even if the
-            // bit wasn't coded
-            s->counts.mv_comp[idx].hp[1]++;
-        }
-        n += 8 << c;
-    } else {
-        n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
-        s->counts.mv_comp[idx].class0[n]++;
-        bit = vp8_rac_get_tree(&s->c, ff_vp9_mv_fp_tree,
-                               s->prob.p.mv_comp[idx].class0_fp[n]);
-        s->counts.mv_comp[idx].class0_fp[n][bit]++;
-        n = (n << 3) | (bit << 1);
-        if (hp) {
-            bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
-            s->counts.mv_comp[idx].class0_hp[bit]++;
-            n |= bit;
-        } else {
-            n |= 1;
-            // bug in libvpx - we count for bw entropy purposes even if the
-            // bit wasn't coded
-            s->counts.mv_comp[idx].class0_hp[1]++;
-        }
-    }
-
-    return sign ? -(n + 1) : (n + 1);
-}
-
-void ff_vp9_fill_mv(VP9Context *s, VP56mv *mv, int mode, int sb)
-{
-    VP9Block *const b = &s->b;
-
-    if (mode == ZEROMV) {
-        memset(mv, 0, sizeof(*mv) * 2);
-    } else {
-        int hp;
-
-        // FIXME cache this value and reuse for other subblocks
-        find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
-                     mode == NEWMV ? -1 : sb);
-        // FIXME maybe move this code into find_ref_mvs()
-        if ((mode == NEWMV || sb == -1) &&
-            !(hp = s->highprecisionmvs &&
-              abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
-            if (mv[0].y & 1) {
-                if (mv[0].y < 0)
-                    mv[0].y++;
-                else
-                    mv[0].y--;
-            }
-            if (mv[0].x & 1) {
-                if (mv[0].x < 0)
-                    mv[0].x++;
-                else
-                    mv[0].x--;
-            }
-        }
-        if (mode == NEWMV) {
-            enum MVJoint j = vp8_rac_get_tree(&s->c, ff_vp9_mv_joint_tree,
-                                              s->prob.p.mv_joint);
-
-            s->counts.mv_joint[j]++;
-            if (j >= MV_JOINT_V)
-                mv[0].y += read_mv_component(s, 0, hp);
-            if (j & 1)
-                mv[0].x += read_mv_component(s, 1, hp);
-        }
-
-        if (b->comp) {
-            // FIXME cache this value and reuse for other subblocks
-            find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
-                         mode == NEWMV ? -1 : sb);
-            if ((mode == NEWMV || sb == -1) &&
-                !(hp = s->highprecisionmvs &&
-                  abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
-                if (mv[1].y & 1) {
-                    if (mv[1].y < 0)
-                        mv[1].y++;
-                    else
-                        mv[1].y--;
-                }
-                if (mv[1].x & 1) {
-                    if (mv[1].x < 0)
-                        mv[1].x++;
-                    else
-                        mv[1].x--;
-                }
-            }
-            if (mode == NEWMV) {
-                enum MVJoint j = vp8_rac_get_tree(&s->c, ff_vp9_mv_joint_tree,
-                                                  s->prob.p.mv_joint);
-
-                s->counts.mv_joint[j]++;
-                if (j >= MV_JOINT_V)
-                    mv[1].y += read_mv_component(s, 0, hp);
-                if (j & 1)
-                    mv[1].x += read_mv_component(s, 1, hp);
-            }
-        }
-    }
-}
diff --git a/libavcodec/vp9prob.c b/libavcodec/vp9prob.c
deleted file mode 100644
index b8a7c22..0000000
--- a/libavcodec/vp9prob.c
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * VP9 compatible video decoder
- *
- * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
- * Copyright (C) 2013 Clément Bœsch <u pkh me>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "vp56.h"
-#include "vp9.h"
-#include "vp9data.h"
-
-static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
-                                        int max_count, int update_factor)
-{
-    unsigned ct = ct0 + ct1, p2, p1;
-
-    if (!ct)
-        return;
-
-    p1 = *p;
-    p2 = ((ct0 << 8) + (ct >> 1)) / ct;
-    p2 = av_clip(p2, 1, 255);
-    ct = FFMIN(ct, max_count);
-    update_factor = FASTDIV(update_factor * ct, max_count);
-
-    // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
-    *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
-}
-
-void ff_vp9_adapt_probs(VP9Context *s)
-{
-    int i, j, k, l, m;
-    ProbContext *p = &s->prob_ctx[s->framectxid].p;
-    int uf = (s->keyframe || s->intraonly || !s->last_keyframe) ? 112 : 128;
-
-    // coefficients
-    for (i = 0; i < 4; i++)
-        for (j = 0; j < 2; j++)
-            for (k = 0; k < 2; k++)
-                for (l = 0; l < 6; l++)
-                    for (m = 0; m < 6; m++) {
-                        uint8_t *pp = s->prob_ctx[s->framectxid].coef[i][j][k][l][m];
-                        unsigned *e = s->counts.eob[i][j][k][l][m];
-                        unsigned *c = s->counts.coef[i][j][k][l][m];
-
-                        if (l == 0 && m >= 3) // dc only has 3 pt
-                            break;
-
-                        adapt_prob(&pp[0], e[0], e[1], 24, uf);
-                        adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
-                        adapt_prob(&pp[2], c[1], c[2], 24, uf);
-                    }
-
-    if (s->keyframe || s->intraonly) {
-        memcpy(p->skip,  s->prob.p.skip,  sizeof(p->skip));
-        memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
-        memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
-        memcpy(p->tx8p,  s->prob.p.tx8p,  sizeof(p->tx8p));
-        return;
-    }
-
-    // skip flag
-    for (i = 0; i < 3; i++)
-        adapt_prob(&p->skip[i], s->counts.skip[i][0],
-                   s->counts.skip[i][1], 20, 128);
-
-    // intra/inter flag
-    for (i = 0; i < 4; i++)
-        adapt_prob(&p->intra[i], s->counts.intra[i][0],
-                   s->counts.intra[i][1], 20, 128);
-
-    // comppred flag
-    if (s->comppredmode == PRED_SWITCHABLE) {
-        for (i = 0; i < 5; i++)
-            adapt_prob(&p->comp[i], s->counts.comp[i][0],
-                       s->counts.comp[i][1], 20, 128);
-    }
-
-    // reference frames
-    if (s->comppredmode != PRED_SINGLEREF) {
-        for (i = 0; i < 5; i++)
-            adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
-                       s->counts.comp_ref[i][1], 20, 128);
-    }
-
-    if (s->comppredmode != PRED_COMPREF) {
-        for (i = 0; i < 5; i++) {
-            uint8_t *pp = p->single_ref[i];
-            unsigned (*c)[2] = s->counts.single_ref[i];
-
-            adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
-            adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
-        }
-    }
-
-    // block partitioning
-    for (i = 0; i < 4; i++)
-        for (j = 0; j < 4; j++) {
-            uint8_t *pp = p->partition[i][j];
-            unsigned *c = s->counts.partition[i][j];
-
-            adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
-            adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
-            adapt_prob(&pp[2], c[2], c[3], 20, 128);
-        }
-
-    // tx size
-    if (s->txfmmode == TX_SWITCHABLE) {
-        for (i = 0; i < 2; i++) {
-            unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
-
-            adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0],
-                       s->counts.tx8p[i][1], 20, 128);
-            adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
-            adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
-            adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
-            adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
-            adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
-        }
-    }
-
-    // interpolation filter
-    if (s->filtermode == FILTER_SWITCHABLE) {
-        for (i = 0; i < 4; i++) {
-            uint8_t *pp = p->filter[i];
-            unsigned *c = s->counts.filter[i];
-
-            adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
-            adapt_prob(&pp[1], c[1], c[2], 20, 128);
-        }
-    }
-
-    // inter modes
-    for (i = 0; i < 7; i++) {
-        uint8_t *pp = p->mv_mode[i];
-        unsigned *c = s->counts.mv_mode[i];
-
-        adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
-        adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
-        adapt_prob(&pp[2], c[1], c[3], 20, 128);
-    }
-
-    // mv joints
-    {
-        uint8_t *pp = p->mv_joint;
-        unsigned *c = s->counts.mv_joint;
-
-        adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
-        adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
-        adapt_prob(&pp[2], c[2], c[3], 20, 128);
-    }
-
-    // mv components
-    for (i = 0; i < 2; i++) {
-        uint8_t *pp;
-        unsigned *c, (*c2)[2], sum;
-
-        adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
-                   s->counts.mv_comp[i].sign[1], 20, 128);
-
-        pp  = p->mv_comp[i].classes;
-        c   = s->counts.mv_comp[i].classes;
-        sum = c[1] + c[2] + c[3] + c[4] + c[5] +
-              c[6] + c[7] + c[8] + c[9] + c[10];
-        adapt_prob(&pp[0], c[0], sum, 20, 128);
-        sum -= c[1];
-        adapt_prob(&pp[1], c[1], sum, 20, 128);
-        sum -= c[2] + c[3];
-        adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
-        adapt_prob(&pp[3], c[2], c[3], 20, 128);
-        sum -= c[4] + c[5];
-        adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
-        adapt_prob(&pp[5], c[4], c[5], 20, 128);
-        sum -= c[6];
-        adapt_prob(&pp[6], c[6], sum, 20, 128);
-        adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
-        adapt_prob(&pp[8], c[7], c[8], 20, 128);
-        adapt_prob(&pp[9], c[9], c[10], 20, 128);
-
-        adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
-                   s->counts.mv_comp[i].class0[1], 20, 128);
-        pp = p->mv_comp[i].bits;
-        c2 = s->counts.mv_comp[i].bits;
-        for (j = 0; j < 10; j++)
-            adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
-
-        for (j = 0; j < 2; j++) {
-            pp = p->mv_comp[i].class0_fp[j];
-            c  = s->counts.mv_comp[i].class0_fp[j];
-            adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
-            adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
-            adapt_prob(&pp[2], c[2], c[3], 20, 128);
-        }
-        pp = p->mv_comp[i].fp;
-        c  = s->counts.mv_comp[i].fp;
-        adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
-        adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
-        adapt_prob(&pp[2], c[2], c[3], 20, 128);
-
-        if (s->highprecisionmvs) {
-            adapt_prob(&p->mv_comp[i].class0_hp,
-                       s->counts.mv_comp[i].class0_hp[0],
-                       s->counts.mv_comp[i].class0_hp[1], 20, 128);
-            adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
-                       s->counts.mv_comp[i].hp[1], 20, 128);
-        }
-    }
-
-    // y intra modes
-    for (i = 0; i < 4; i++) {
-        uint8_t *pp = p->y_mode[i];
-        unsigned *c = s->counts.y_mode[i], sum, s2;
-
-        sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
-        adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
-        sum -= c[TM_VP8_PRED];
-        adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
-        sum -= c[VERT_PRED];
-        adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
-        s2   = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
-        sum -= s2;
-        adapt_prob(&pp[3], s2, sum, 20, 128);
-        s2 -= c[HOR_PRED];
-        adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
-        adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED],
-                   20, 128);
-        sum -= c[DIAG_DOWN_LEFT_PRED];
-        adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
-        sum -= c[VERT_LEFT_PRED];
-        adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
-        adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
-    }
-
-    // uv intra modes
-    for (i = 0; i < 10; i++) {
-        uint8_t *pp = p->uv_mode[i];
-        unsigned *c = s->counts.uv_mode[i], sum, s2;
-
-        sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
-        adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
-        sum -= c[TM_VP8_PRED];
-        adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
-        sum -= c[VERT_PRED];
-        adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
-        s2   = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
-        sum -= s2;
-        adapt_prob(&pp[3], s2, sum, 20, 128);
-        s2 -= c[HOR_PRED];
-        adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
-        adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED],
-                   20, 128);
-        sum -= c[DIAG_DOWN_LEFT_PRED];
-        adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
-        sum -= c[VERT_LEFT_PRED];
-        adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
-        adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
-    }
-}
diff --git a/libavcodec/vqavideo.c b/libavcodec/vqavideo.c
index 0d0d59a..81d50bb 100644
--- a/libavcodec/vqavideo.c
+++ b/libavcodec/vqavideo.c
@@ -2,20 +2,20 @@
  * Westwood Studios VQA Video Decoder
  * Copyright (C) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -128,7 +128,7 @@ static av_cold int vqa_decode_init(AVCodecContext *avctx)
 
     /* make sure the extradata made it */
     if (s->avctx->extradata_size != VQA_HEADER_SIZE) {
-        av_log(s->avctx, AV_LOG_ERROR, "  VQA video: expected extradata size of %d\n", VQA_HEADER_SIZE);
+        av_log(s->avctx, AV_LOG_ERROR, "expected extradata size of %d\n", VQA_HEADER_SIZE);
         return AVERROR(EINVAL);
     }
 
@@ -162,8 +162,7 @@ static av_cold int vqa_decode_init(AVCodecContext *avctx)
         return AVERROR_INVALIDDATA;
     }
 
-    if (s->width  & (s->vector_width  - 1) ||
-        s->height & (s->vector_height - 1)) {
+    if (s->width % s->vector_width || s->height % s->vector_height) {
         av_log(avctx, AV_LOG_ERROR, "Image size not multiple of block size\n");
         return AVERROR_INVALIDDATA;
     }
@@ -180,7 +179,7 @@ static av_cold int vqa_decode_init(AVCodecContext *avctx)
     /* allocate decode buffer */
     s->decode_buffer_size = (s->width / s->vector_width) *
         (s->height / s->vector_height) * 2;
-    s->decode_buffer = av_malloc(s->decode_buffer_size);
+    s->decode_buffer = av_mallocz(s->decode_buffer_size);
     if (!s->decode_buffer)
         goto fail;
 
@@ -208,22 +207,22 @@ fail:
 
 #define CHECK_COUNT() \
     if (dest_index + count > dest_size) { \
-        av_log(NULL, AV_LOG_ERROR, "  VQA video: decode_format80 problem: next op would overflow dest_index\n"); \
-        av_log(NULL, AV_LOG_ERROR, "  VQA video: current dest_index = %d, count = %d, dest_size = %d\n", \
+        av_log(s->avctx, AV_LOG_ERROR, "decode_format80 problem: next op would overflow dest_index\n"); \
+        av_log(s->avctx, AV_LOG_ERROR, "current dest_index = %d, count = %d, dest_size = %d\n", \
             dest_index, count, dest_size); \
         return AVERROR_INVALIDDATA; \
     }
 
 #define CHECK_COPY(idx) \
     if (idx < 0 || idx + count > dest_size) { \
-        av_log(NULL, AV_LOG_ERROR, "  VQA video: decode_format80 problem: next op would overflow dest_index\n"); \
-        av_log(NULL, AV_LOG_ERROR, "  VQA video: current src_pos = %d, count = %d, dest_size = %d\n", \
+        av_log(s->avctx, AV_LOG_ERROR, "decode_format80 problem: next op would overflow dest_index\n"); \
+        av_log(s->avctx, AV_LOG_ERROR, "current src_pos = %d, count = %d, dest_size = %d\n", \
             src_pos, count, dest_size); \
         return AVERROR_INVALIDDATA; \
     }
 
 
-static int decode_format80(GetByteContext *gb, int src_size,
+static int decode_format80(VqaContext *s, int src_size,
     unsigned char *dest, int dest_size, int check_size) {
 
     int dest_index = 0;
@@ -232,26 +231,32 @@ static int decode_format80(GetByteContext *gb, int src_size,
     unsigned char color;
     int i;
 
-    start = bytestream2_tell(gb);
-    while (bytestream2_tell(gb) - start < src_size) {
-        opcode = bytestream2_get_byte(gb);
-        ff_dlog(NULL, "      opcode %02X: ", opcode);
+    if (src_size < 0 || src_size > bytestream2_get_bytes_left(&s->gb)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Chunk size %d is out of range\n",
+               src_size);
+        return AVERROR_INVALIDDATA;
+    }
+
+    start = bytestream2_tell(&s->gb);
+    while (bytestream2_tell(&s->gb) - start < src_size) {
+        opcode = bytestream2_get_byte(&s->gb);
+        ff_tlog(s->avctx, "opcode %02X: ", opcode);
 
         /* 0x80 means that frame is finished */
         if (opcode == 0x80)
-            return 0;
+            break;
 
         if (dest_index >= dest_size) {
-            av_log(NULL, AV_LOG_ERROR, "  VQA video: decode_format80 problem: dest_index (%d) exceeded dest_size (%d)\n",
+            av_log(s->avctx, AV_LOG_ERROR, "decode_format80 problem: dest_index (%d) exceeded dest_size (%d)\n",
                 dest_index, dest_size);
             return AVERROR_INVALIDDATA;
         }
 
         if (opcode == 0xFF) {
 
-            count   = bytestream2_get_le16(gb);
-            src_pos = bytestream2_get_le16(gb);
-            ff_dlog(NULL, "(1) copy %X bytes from absolute pos %X\n", count, src_pos);
+            count   = bytestream2_get_le16(&s->gb);
+            src_pos = bytestream2_get_le16(&s->gb);
+            ff_tlog(s->avctx, "(1) copy %X bytes from absolute pos %X\n", count, src_pos);
             CHECK_COUNT();
             CHECK_COPY(src_pos);
             for (i = 0; i < count; i++)
@@ -260,9 +265,9 @@ static int decode_format80(GetByteContext *gb, int src_size,
 
         } else if (opcode == 0xFE) {
 
-            count = bytestream2_get_le16(gb);
-            color = bytestream2_get_byte(gb);
-            ff_dlog(NULL, "(2) set %X bytes to %02X\n", count, color);
+            count = bytestream2_get_le16(&s->gb);
+            color = bytestream2_get_byte(&s->gb);
+            ff_tlog(s->avctx, "(2) set %X bytes to %02X\n", count, color);
             CHECK_COUNT();
             memset(&dest[dest_index], color, count);
             dest_index += count;
@@ -270,8 +275,8 @@ static int decode_format80(GetByteContext *gb, int src_size,
         } else if ((opcode & 0xC0) == 0xC0) {
 
             count = (opcode & 0x3F) + 3;
-            src_pos = bytestream2_get_le16(gb);
-            ff_dlog(NULL, "(3) copy %X bytes from absolute pos %X\n", count, src_pos);
+            src_pos = bytestream2_get_le16(&s->gb);
+            ff_tlog(s->avctx, "(3) copy %X bytes from absolute pos %X\n", count, src_pos);
             CHECK_COUNT();
             CHECK_COPY(src_pos);
             for (i = 0; i < count; i++)
@@ -281,16 +286,16 @@ static int decode_format80(GetByteContext *gb, int src_size,
         } else if (opcode > 0x80) {
 
             count = opcode & 0x3F;
-            ff_dlog(NULL, "(4) copy %X bytes from source to dest\n", count);
+            ff_tlog(s->avctx, "(4) copy %X bytes from source to dest\n", count);
             CHECK_COUNT();
-            bytestream2_get_buffer(gb, &dest[dest_index], count);
+            bytestream2_get_buffer(&s->gb, &dest[dest_index], count);
             dest_index += count;
 
         } else {
 
             count = ((opcode & 0x70) >> 4) + 3;
-            src_pos = bytestream2_get_byte(gb) | ((opcode & 0x0F) << 8);
-            ff_dlog(NULL, "(5) copy %X bytes from relpos %X\n", count, src_pos);
+            src_pos = bytestream2_get_byte(&s->gb) | ((opcode & 0x0F) << 8);
+            ff_tlog(s->avctx, "(5) copy %X bytes from relpos %X\n", count, src_pos);
             CHECK_COUNT();
             CHECK_COPY(dest_index - src_pos);
             for (i = 0; i < count; i++)
@@ -304,9 +309,11 @@ static int decode_format80(GetByteContext *gb, int src_size,
      * codebook entry; it is not important for compressed codebooks because
      * not every entry needs to be filled */
     if (check_size)
-        if (dest_index < dest_size)
-            av_log(NULL, AV_LOG_ERROR, "  VQA video: decode_format80 problem: decode finished with dest_index (%d) < dest_size (%d)\n",
+        if (dest_index < dest_size) {
+            av_log(s->avctx, AV_LOG_ERROR, "decode_format80 problem: decode finished with dest_index (%d) < dest_size (%d)\n",
                 dest_index, dest_size);
+            memset(dest + dest_index, 0, dest_size - dest_index);
+        }
 
     return 0; // let's display what we decoded anyway
 }
@@ -377,7 +384,7 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
             break;
 
         default:
-            av_log(s->avctx, AV_LOG_ERROR, "  VQA video: Found unknown chunk type: %c%c%c%c (%08X)\n",
+            av_log(s->avctx, AV_LOG_ERROR, "Found unknown chunk type: %c%c%c%c (%08X)\n",
             (chunk_type >> 24) & 0xFF,
             (chunk_type >> 16) & 0xFF,
             (chunk_type >>  8) & 0xFF,
@@ -394,7 +401,7 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
     if ((cpl0_chunk != -1) && (cplz_chunk != -1)) {
 
         /* a chunk should not have both chunk types */
-        av_log(s->avctx, AV_LOG_ERROR, "  VQA video: problem: found both CPL0 and CPLZ chunks\n");
+        av_log(s->avctx, AV_LOG_ERROR, "problem: found both CPL0 and CPLZ chunks\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -412,7 +419,7 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
         chunk_size = bytestream2_get_be32(&s->gb);
         /* sanity check the palette size */
         if (chunk_size / 3 > 256 || chunk_size > bytestream2_get_bytes_left(&s->gb)) {
-            av_log(s->avctx, AV_LOG_ERROR, "  VQA video: problem: found a palette chunk with %d colors\n",
+            av_log(s->avctx, AV_LOG_ERROR, "problem: found a palette chunk with %d colors\n",
                 chunk_size / 3);
             return AVERROR_INVALIDDATA;
         }
@@ -421,7 +428,8 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
             r = bytestream2_get_byteu(&s->gb) * 4;
             g = bytestream2_get_byteu(&s->gb) * 4;
             b = bytestream2_get_byteu(&s->gb) * 4;
-            s->palette[i] = (r << 16) | (g << 8) | (b);
+            s->palette[i] = 0xFFU << 24 | r << 16 | g << 8 | b;
+            s->palette[i] |= s->palette[i] >> 6 & 0x30303;
         }
     }
 
@@ -429,7 +437,7 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
     if ((cbf0_chunk != -1) && (cbfz_chunk != -1)) {
 
         /* a chunk should not have both chunk types */
-        av_log(s->avctx, AV_LOG_ERROR, "  VQA video: problem: found both CBF0 and CBFZ chunks\n");
+        av_log(s->avctx, AV_LOG_ERROR, "problem: found both CBF0 and CBFZ chunks\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -438,7 +446,7 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
 
         bytestream2_seek(&s->gb, cbfz_chunk, SEEK_SET);
         chunk_size = bytestream2_get_be32(&s->gb);
-        if ((res = decode_format80(&s->gb, chunk_size, s->codebook,
+        if ((res = decode_format80(s, chunk_size, s->codebook,
                                    s->codebook_size, 0)) < 0)
             return res;
     }
@@ -450,7 +458,7 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
         chunk_size = bytestream2_get_be32(&s->gb);
         /* sanity check the full codebook size */
         if (chunk_size > MAX_CODEBOOK_SIZE) {
-            av_log(s->avctx, AV_LOG_ERROR, "  VQA video: problem: CBF0 chunk too large (0x%X bytes)\n",
+            av_log(s->avctx, AV_LOG_ERROR, "problem: CBF0 chunk too large (0x%X bytes)\n",
                 chunk_size);
             return AVERROR_INVALIDDATA;
         }
@@ -462,13 +470,13 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
     if (vptz_chunk == -1) {
 
         /* something is wrong if there is no VPTZ chunk */
-        av_log(s->avctx, AV_LOG_ERROR, "  VQA video: problem: no VPTZ chunk found\n");
+        av_log(s->avctx, AV_LOG_ERROR, "problem: no VPTZ chunk found\n");
         return AVERROR_INVALIDDATA;
     }
 
     bytestream2_seek(&s->gb, vptz_chunk, SEEK_SET);
     chunk_size = bytestream2_get_be32(&s->gb);
-    if ((res = decode_format80(&s->gb, chunk_size,
+    if ((res = decode_format80(s, chunk_size,
                                s->decode_buffer, s->decode_buffer_size, 1)) < 0)
         return res;
 
@@ -531,7 +539,7 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
     /* handle partial codebook */
     if ((cbp0_chunk != -1) && (cbpz_chunk != -1)) {
         /* a chunk should not have both chunk types */
-        av_log(s->avctx, AV_LOG_ERROR, "  VQA video: problem: found both CBP0 and CBPZ chunks\n");
+        av_log(s->avctx, AV_LOG_ERROR, "problem: found both CBP0 and CBPZ chunks\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -552,7 +560,7 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
         s->next_codebook_buffer_index += chunk_size;
 
         s->partial_countdown--;
-        if (s->partial_countdown == 0) {
+        if (s->partial_countdown <= 0) {
 
             /* time to replace codebook */
             memcpy(s->codebook, s->next_codebook_buffer,
@@ -581,12 +589,10 @@ static int vqa_decode_chunk(VqaContext *s, AVFrame *frame)
         s->next_codebook_buffer_index += chunk_size;
 
         s->partial_countdown--;
-        if (s->partial_countdown == 0) {
-            GetByteContext gb;
-
-            bytestream2_init(&gb, s->next_codebook_buffer, s->next_codebook_buffer_index);
+        if (s->partial_countdown <= 0) {
+            bytestream2_init(&s->gb, s->next_codebook_buffer, s->next_codebook_buffer_index);
             /* decompress codebook */
-            if ((res = decode_format80(&gb, s->next_codebook_buffer_index,
+            if ((res = decode_format80(s, s->next_codebook_buffer_index,
                                        s->codebook, s->codebook_size, 0)) < 0)
                 return res;
 
@@ -607,10 +613,8 @@ static int vqa_decode_frame(AVCodecContext *avctx,
     AVFrame *frame = data;
     int res;
 
-    if ((res = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "  VQA Video: get_buffer() failed\n");
+    if ((res = ff_get_buffer(avctx, frame, 0)) < 0)
         return res;
-    }
 
     bytestream2_init(&s->gb, avpkt->data, avpkt->size);
     if ((res = vqa_decode_chunk(s, frame)) < 0)
diff --git a/libavcodec/wavpack.c b/libavcodec/wavpack.c
index ab9dec9..b6022f0 100644
--- a/libavcodec/wavpack.c
+++ b/libavcodec/wavpack.c
@@ -2,20 +2,20 @@
  * WavPack lossless audio decoder
  * Copyright (c) 2006,2011 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,60 +25,16 @@
 #include "avcodec.h"
 #include "get_bits.h"
 #include "internal.h"
+#include "thread.h"
 #include "unary.h"
 #include "bytestream.h"
+#include "wavpack.h"
 
 /**
  * @file
  * WavPack lossless audio decoder
  */
 
-#define WV_HEADER_SIZE    32
-
-#define WV_MONO           0x00000004
-#define WV_JOINT_STEREO   0x00000010
-#define WV_FALSE_STEREO   0x40000000
-
-#define WV_HYBRID_MODE    0x00000008
-#define WV_HYBRID_SHAPE   0x00000008
-#define WV_HYBRID_BITRATE 0x00000200
-#define WV_HYBRID_BALANCE 0x00000400
-#define WV_INITIAL_BLOCK  0x00000800
-#define WV_FINAL_BLOCK    0x00001000
-
-#define WV_SINGLE_BLOCK (WV_INITIAL_BLOCK | WV_FINAL_BLOCK)
-
-#define WV_FLT_SHIFT_ONES 0x01
-#define WV_FLT_SHIFT_SAME 0x02
-#define WV_FLT_SHIFT_SENT 0x04
-#define WV_FLT_ZERO_SENT  0x08
-#define WV_FLT_ZERO_SIGN  0x10
-
-enum WP_ID_Flags {
-    WP_IDF_MASK   = 0x3F,
-    WP_IDF_IGNORE = 0x20,
-    WP_IDF_ODD    = 0x40,
-    WP_IDF_LONG   = 0x80
-};
-
-enum WP_ID {
-    WP_ID_DUMMY = 0,
-    WP_ID_ENCINFO,
-    WP_ID_DECTERMS,
-    WP_ID_DECWEIGHTS,
-    WP_ID_DECSAMPLES,
-    WP_ID_ENTROPY,
-    WP_ID_HYBRID,
-    WP_ID_SHAPING,
-    WP_ID_FLOATINFO,
-    WP_ID_INT32INFO,
-    WP_ID_DATA,
-    WP_ID_CORR,
-    WP_ID_EXTRABITS,
-    WP_ID_CHANINFO,
-    WP_ID_SAMPLE_RATE = 0x27,
-};
-
 typedef struct SavedContext {
     int offset;
     int size;
@@ -86,23 +42,6 @@ typedef struct SavedContext {
     uint32_t crc;
 } SavedContext;
 
-#define MAX_TERMS 16
-
-typedef struct Decorr {
-    int delta;
-    int value;
-    int weightA;
-    int weightB;
-    int samplesA[8];
-    int samplesB[8];
-} Decorr;
-
-typedef struct WvChannel {
-    int median[3];
-    int slow_level, error_limit;
-    int bitrate_acc, bitrate_delta;
-} WvChannel;
-
 typedef struct WavpackFrameContext {
     AVCodecContext *avctx;
     int frame_flags;
@@ -144,101 +83,7 @@ typedef struct WavpackContext {
     int ch_offset;
 } WavpackContext;
 
-static const int wv_rates[16] = {
-     6000,  8000,  9600, 11025, 12000, 16000,  22050, 24000,
-    32000, 44100, 48000, 64000, 88200, 96000, 192000,     0
-};
-
-// exponent table copied from WavPack source
-static const uint8_t wp_exp2_table[256] = {
-    0x00, 0x01, 0x01, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x08, 0x09, 0x0a, 0x0b,
-    0x0b, 0x0c, 0x0d, 0x0e, 0x0e, 0x0f, 0x10, 0x10, 0x11, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16, 0x16,
-    0x17, 0x18, 0x19, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1d, 0x1e, 0x1f, 0x20, 0x20, 0x21, 0x22, 0x23,
-    0x24, 0x24, 0x25, 0x26, 0x27, 0x28, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
-    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3a, 0x3b, 0x3c, 0x3d,
-    0x3e, 0x3f, 0x40, 0x41, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x48, 0x49, 0x4a, 0x4b,
-    0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a,
-    0x5b, 0x5c, 0x5d, 0x5e, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
-    0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
-    0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x87, 0x88, 0x89, 0x8a,
-    0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b,
-    0x9c, 0x9d, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad,
-    0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0,
-    0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc8, 0xc9, 0xca, 0xcb, 0xcd, 0xce, 0xcf, 0xd0, 0xd2, 0xd3, 0xd4,
-    0xd6, 0xd7, 0xd8, 0xd9, 0xdb, 0xdc, 0xdd, 0xde, 0xe0, 0xe1, 0xe2, 0xe4, 0xe5, 0xe6, 0xe8, 0xe9,
-    0xea, 0xec, 0xed, 0xee, 0xf0, 0xf1, 0xf2, 0xf4, 0xf5, 0xf6, 0xf8, 0xf9, 0xfa, 0xfc, 0xfd, 0xff
-};
-
-static const uint8_t wp_log2_table [] = {
-    0x00, 0x01, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0b, 0x0d, 0x0e, 0x10, 0x11, 0x12, 0x14, 0x15,
-    0x16, 0x18, 0x19, 0x1a, 0x1c, 0x1d, 0x1e, 0x20, 0x21, 0x22, 0x24, 0x25, 0x26, 0x28, 0x29, 0x2a,
-    0x2c, 0x2d, 0x2e, 0x2f, 0x31, 0x32, 0x33, 0x34, 0x36, 0x37, 0x38, 0x39, 0x3b, 0x3c, 0x3d, 0x3e,
-    0x3f, 0x41, 0x42, 0x43, 0x44, 0x45, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4d, 0x4e, 0x4f, 0x50, 0x51,
-    0x52, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63,
-    0x64, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x74, 0x75,
-    0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85,
-    0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95,
-    0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
-    0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb2,
-    0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc0,
-    0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcb, 0xcc, 0xcd, 0xce,
-    0xcf, 0xd0, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd8, 0xd9, 0xda, 0xdb,
-    0xdc, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe4, 0xe5, 0xe6, 0xe7, 0xe7,
-    0xe8, 0xe9, 0xea, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xee, 0xef, 0xf0, 0xf1, 0xf1, 0xf2, 0xf3, 0xf4,
-    0xf4, 0xf5, 0xf6, 0xf7, 0xf7, 0xf8, 0xf9, 0xf9, 0xfa, 0xfb, 0xfc, 0xfc, 0xfd, 0xfe, 0xff, 0xff
-};
-
-static av_always_inline int wp_exp2(int16_t val)
-{
-    int res, neg = 0;
-
-    if (val < 0) {
-        val = -val;
-        neg = 1;
-    }
-
-    res   = wp_exp2_table[val & 0xFF] | 0x100;
-    val >>= 8;
-    res   = (val > 9) ? (res << (val - 9)) : (res >> (9 - val));
-    return neg ? -res : res;
-}
-
-static av_always_inline int wp_log2(int32_t val)
-{
-    int bits;
-
-    if (!val)
-        return 0;
-    if (val == 1)
-        return 256;
-    val += val >> 9;
-    bits = av_log2(val) + 1;
-    if (bits < 9)
-        return (bits << 8) + wp_log2_table[(val << (9 - bits)) & 0xFF];
-    else
-        return (bits << 8) + wp_log2_table[(val >> (bits - 9)) & 0xFF];
-}
-
-#define LEVEL_DECAY(a)  ((a + 0x80) >> 8)
-
-// macros for manipulating median values
-#define GET_MED(n) ((c->median[n] >> 4) + 1)
-#define DEC_MED(n) c->median[n] -= ((c->median[n] + (128 >> n) - 2) / (128 >> n)) * 2
-#define INC_MED(n) c->median[n] += ((c->median[n] + (128 >> n)    ) / (128 >> n)) * 5
-
-// macros for applying weight
-#define UPDATE_WEIGHT_CLIP(weight, delta, samples, in) \
-    if (samples && in) { \
-        if ((samples ^ in) < 0) { \
-            weight -= delta; \
-            if (weight < -1024) \
-                weight = -1024; \
-        } else { \
-            weight += delta; \
-            if (weight > 1024) \
-                weight = 1024; \
-        } \
-    }
+#define LEVEL_DECAY(a)  (((a) + 0x80) >> 8)
 
 static av_always_inline int get_tail(GetBitContext *gb, int k)
 {
@@ -310,7 +155,7 @@ static int wv_get_value(WavpackFrameContext *ctx, GetBitContext *gb,
             if (t >= 2) {
                 if (get_bits_left(gb) < t - 1)
                     goto error;
-                t = get_bits(gb, t - 1) | (1 << (t - 1));
+                t = get_bits_long(gb, t - 1) | (1 << (t - 1));
             } else {
                 if (get_bits_left(gb) < 0)
                     goto error;
@@ -341,7 +186,7 @@ static int wv_get_value(WavpackFrameContext *ctx, GetBitContext *gb,
             } else {
                 if (get_bits_left(gb) < t2 - 1)
                     goto error;
-                t += get_bits(gb, t2 - 1) | (1 << (t2 - 1));
+                t += get_bits_long(gb, t2 - 1) | (1 << (t2 - 1));
             }
         }
 
@@ -381,6 +226,10 @@ static int wv_get_value(WavpackFrameContext *ctx, GetBitContext *gb,
         INC_MED(2);
     }
     if (!c->error_limit) {
+        if (add >= 0x2000000U) {
+            av_log(ctx->avctx, AV_LOG_ERROR, "k %d is too large\n", add);
+            goto error;
+        }
         ret = base + get_tail(gb, add);
         if (get_bits_left(gb) <= 0)
             goto error;
@@ -404,6 +253,10 @@ static int wv_get_value(WavpackFrameContext *ctx, GetBitContext *gb,
     return sign ? ~ret : ret;
 
 error:
+    ret = get_bits_left(gb);
+    if (ret <= 0) {
+        av_log(ctx->avctx, AV_LOG_ERROR, "Too few bits (%d) left\n", ret);
+    }
     *last = 1;
     return 0;
 }
@@ -418,7 +271,7 @@ static inline int wv_get_value_integer(WavpackFrameContext *s, uint32_t *crc,
 
         if (s->got_extra_bits &&
             get_bits_left(&s->gb_extra_bits) >= s->extra_bits) {
-            S   |= get_bits(&s->gb_extra_bits, s->extra_bits);
+            S   |= get_bits_long(&s->gb_extra_bits, s->extra_bits);
             *crc = *crc * 9 + (S & 0xffff) * 3 + ((unsigned)S >> 16);
         }
     }
@@ -619,6 +472,14 @@ static inline int wv_unpack_stereo(WavpackFrameContext *s, GetBitContext *gb,
                 s->decorr[i].samplesB[0] = L;
             }
         }
+
+        if (type == AV_SAMPLE_FMT_S16P) {
+            if (FFABS(L) + FFABS(R) > (1<<19)) {
+                av_log(s->avctx, AV_LOG_ERROR, "sample %d %d too large\n", L, R);
+                return AVERROR_INVALIDDATA;
+            }
+        }
+
         pos = (pos + 1) & 7;
         if (s->joint)
             L += (R -= (L >> 1));
@@ -638,6 +499,13 @@ static inline int wv_unpack_stereo(WavpackFrameContext *s, GetBitContext *gb,
     } while (!last && count < s->samples);
 
     wv_reset_saved_context(s);
+
+    if (last && count < s->samples) {
+        int size = av_get_bytes_per_sample(type);
+        memset((uint8_t*)dst_l + count*size, 0, (s->samples-count)*size);
+        memset((uint8_t*)dst_r + count*size, 0, (s->samples-count)*size);
+    }
+
     if ((s->avctx->err_recognition & AV_EF_CRCCHECK) &&
         wv_check_crc(s, crc, crc_extra_bits))
         return AVERROR_INVALIDDATA;
@@ -699,6 +567,12 @@ static inline int wv_unpack_mono(WavpackFrameContext *s, GetBitContext *gb,
     } while (!last && count < s->samples);
 
     wv_reset_saved_context(s);
+
+    if (last && count < s->samples) {
+        int size = av_get_bytes_per_sample(type);
+        memset((uint8_t*)dst + count*size, 0, (s->samples-count)*size);
+    }
+
     if (s->avctx->err_recognition & AV_EF_CRCCHECK) {
         int ret = wv_check_crc(s, crc, crc_extra_bits);
         if (ret < 0 && s->avctx->err_recognition & AV_EF_EXPLODE)
@@ -723,6 +597,15 @@ static av_cold int wv_alloc_frame_context(WavpackContext *c)
     return 0;
 }
 
+#if HAVE_THREADS
+static int init_thread_copy(AVCodecContext *avctx)
+{   /* Points the private context back at the AVCodecContext it was reached through; always returns 0. */
+    WavpackContext *s = avctx->priv_data;
+    s->avctx = avctx;
+    return 0;
+}
+#endif
+
 static av_cold int wavpack_decode_init(AVCodecContext *avctx)
 {
     WavpackContext *s = avctx->priv_data;
@@ -750,9 +633,10 @@ static int wavpack_decode_block(AVCodecContext *avctx, int block_no,
                                 AVFrame *frame, const uint8_t *buf, int buf_size)
 {
     WavpackContext *wc = avctx->priv_data;
+    ThreadFrame tframe = { .f = frame };
     WavpackFrameContext *s;
     GetByteContext gb;
-    void *samples_l, *samples_r;
+    void *samples_l = NULL, *samples_r = NULL;
     int ret;
     int got_terms   = 0, got_weights = 0, got_samples = 0,
         got_entropy = 0, got_bs      = 0, got_float   = 0, got_hybrid = 0;
@@ -910,7 +794,7 @@ static int wavpack_decode_block(AVCodecContext *avctx, int block_no,
         case WP_ID_ENTROPY:
             if (size != 6 * (s->stereo_in + 1)) {
                 av_log(avctx, AV_LOG_ERROR,
-                       "Entropy vars size should be %i, got %i",
+                       "Entropy vars size should be %i, got %i.\n",
                        6 * (s->stereo_in + 1), size);
                 bytestream2_skip(&gb, ssize);
                 continue;
@@ -953,7 +837,11 @@ static int wavpack_decode_block(AVCodecContext *avctx, int block_no,
                 continue;
             }
             bytestream2_get_buffer(&gb, val, 4);
-            if (val[0]) {
+            if (val[0] > 32) {
+                av_log(avctx, AV_LOG_ERROR,
+                       "Invalid INT32INFO, extra_bits = %d (> 32)\n", val[0]);
+                continue;
+            } else if (val[0]) {
                 s->extra_bits = val[0];
             } else if (val[1]) {
                 s->shift = val[1];
@@ -990,7 +878,8 @@ static int wavpack_decode_block(AVCodecContext *avctx, int block_no,
         case WP_ID_DATA:
             s->sc.offset = bytestream2_tell(&gb);
             s->sc.size   = size * 8;
-            init_get_bits(&s->gb, gb.buffer, size * 8);
+            if ((ret = init_get_bits8(&s->gb, gb.buffer, size)) < 0)
+                return ret;
             s->data_size = size * 8;
             bytestream2_skip(&gb, size);
             got_bs       = 1;
@@ -1004,7 +893,8 @@ static int wavpack_decode_block(AVCodecContext *avctx, int block_no,
             }
             s->extra_sc.offset = bytestream2_tell(&gb);
             s->extra_sc.size   = size * 8;
-            init_get_bits(&s->gb_extra_bits, gb.buffer, size * 8);
+            if ((ret = init_get_bits8(&s->gb_extra_bits, gb.buffer, size)) < 0)
+                return ret;
             s->crc_extra_bits  = get_bits_long(&s->gb_extra_bits, 32);
             bytestream2_skip(&gb, size);
             s->got_extra_bits  = 1;
@@ -1027,10 +917,13 @@ static int wavpack_decode_block(AVCodecContext *avctx, int block_no,
                 chmask = bytestream2_get_le24(&gb);
                 break;
             case 3:
-                chmask = bytestream2_get_le32(&gb);;
+                chmask = bytestream2_get_le32(&gb);
                 break;
             case 5:
-                bytestream2_skip(&gb, 1);
+                size = bytestream2_get_byte(&gb);
+                if (avctx->channels != size)
+                    av_log(avctx, AV_LOG_WARNING, "%i channels signalled"
+                           " instead of %i.\n", size, avctx->channels);
                 chan  |= (bytestream2_get_byte(&gb) & 0xF) << 8;
                 chmask = bytestream2_get_le16(&gb);
                 break;
@@ -1115,11 +1008,10 @@ static int wavpack_decode_block(AVCodecContext *avctx, int block_no,
         }
 
         /* get output buffer */
-        frame->nb_samples = s->samples;
-        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-            av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        frame->nb_samples = s->samples + 1;
+        if ((ret = ff_thread_get_buffer(avctx, &tframe, 0)) < 0)
             return ret;
-        }
+        frame->nb_samples = s->samples;
     }
 
     if (wc->ch_offset + s->stereo >= avctx->channels) {
@@ -1176,7 +1068,7 @@ static int wavpack_decode_frame(AVCodecContext *avctx, void *data,
     /* determine number of samples */
     s->samples  = AV_RL32(buf + 20);
     frame_flags = AV_RL32(buf + 24);
-    if (s->samples <= 0) {
+    if (s->samples <= 0 || s->samples > WV_MAX_SAMPLES) {
         av_log(avctx, AV_LOG_ERROR, "Invalid number of samples: %d\n",
                s->samples);
         return AVERROR_INVALIDDATA;
@@ -1234,5 +1126,6 @@ AVCodec ff_wavpack_decoder = {
     .close          = wavpack_decode_end,
     .decode         = wavpack_decode_frame,
     .flush          = wavpack_decode_flush,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .init_thread_copy = ONLY_IF_THREADS_ENABLED(init_thread_copy),
+    .capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
 };
diff --git a/libavcodec/wavpack.h b/libavcodec/wavpack.h
new file mode 100644
index 0000000..a1b46d5
--- /dev/null
+++ b/libavcodec/wavpack.h
@@ -0,0 +1,194 @@
+/*
+ * WavPack decoder/encoder common code
+ * Copyright (c) 2006,2011 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_WAVPACK_H
+#define AVCODEC_WAVPACK_H
+
+#include "libavutil/common.h"
+
+#define MAX_TERMS      16 /* size of the arrays of Decorr passes */
+#define MAX_TERM        8 /* length of the samplesA/samplesB histories in Decorr */
+
+#define WV_HEADER_SIZE    32
+
+#define WV_MONO           0x00000004
+#define WV_JOINT_STEREO   0x00000010
+#define WV_CROSS_DECORR   0x00000020
+#define WV_FLOAT_DATA     0x00000080
+#define WV_INT32_DATA     0x00000100
+#define WV_FALSE_STEREO   0x40000000
+
+#define WV_HYBRID_MODE    0x00000008
+#define WV_HYBRID_SHAPE   0x00000008 /* NOTE(review): same bit as WV_HYBRID_MODE - confirm against the WavPack format */
+#define WV_HYBRID_BITRATE 0x00000200
+#define WV_HYBRID_BALANCE 0x00000400
+#define WV_INITIAL_BLOCK  0x00000800
+#define WV_FINAL_BLOCK    0x00001000
+
+#define WV_MONO_DATA    (WV_MONO | WV_FALSE_STEREO) /* mask covering both the mono and the false-stereo bit */
+
+#define WV_SINGLE_BLOCK (WV_INITIAL_BLOCK | WV_FINAL_BLOCK)
+
+#define WV_FLT_SHIFT_ONES 0x01
+#define WV_FLT_SHIFT_SAME 0x02
+#define WV_FLT_SHIFT_SENT 0x04
+#define WV_FLT_ZERO_SENT  0x08
+#define WV_FLT_ZERO_SIGN  0x10
+
+#define WV_MAX_SAMPLES    150000 /* upper bound applied to per-block sample counts and frame sizes */
+
+enum WP_ID_Flags { /* bit layout of a one-byte ID value - presumably the sub-block ID byte; confirm */
+    WP_IDF_MASK   = 0x3F, /* low six bits */
+    WP_IDF_IGNORE = 0x20, /* lies inside WP_IDF_MASK */
+    WP_IDF_ODD    = 0x40, /* first bit above the mask */
+    WP_IDF_LONG   = 0x80  /* second bit above the mask */
+};
+
+enum WP_ID { /* values count up from 0 unless given explicitly */
+    WP_ID_DUMMY = 0,
+    WP_ID_ENCINFO,
+    WP_ID_DECTERMS,
+    WP_ID_DECWEIGHTS,
+    WP_ID_DECSAMPLES,
+    WP_ID_ENTROPY,
+    WP_ID_HYBRID,
+    WP_ID_SHAPING,
+    WP_ID_FLOATINFO,
+    WP_ID_INT32INFO,
+    WP_ID_DATA,
+    WP_ID_CORR,
+    WP_ID_EXTRABITS,
+    WP_ID_CHANINFO, /* = 13, the last implicitly numbered entry */
+    WP_ID_SAMPLE_RATE = 0x27, /* 0x20 | 0x07, i.e. the WP_IDF_IGNORE bit is set */
+};
+
+typedef struct Decorr { /* state of one decorrelation pass */
+    int delta;   /* step handed to the weight-update macros */
+    int value;   /* term selector; decorr_mono branches on value > MAX_TERM and value > 0 */
+    int weightA; /* prediction weight, adapted once per sample */
+    int weightB; /* counterpart of weightA - presumably for the second channel; confirm */
+    int samplesA[MAX_TERM]; /* sample history the prediction is formed from */
+    int samplesB[MAX_TERM]; /* counterpart of samplesA - presumably for the second channel; confirm */
+    int sumA;    /* running sum of weightA, accumulated once per sample in decorr_mono */
+    int sumB;
+} Decorr;
+
+typedef struct WvChannel { /* per-channel coder state - presumably; WavPackWords keeps two of these */
+    int median[3]; /* read and adjusted through GET_MED / DEC_MED / INC_MED */
+    int slow_level, error_limit;
+    int bitrate_acc, bitrate_delta;
+} WvChannel;
+
+// macros for manipulating median values (they expand to accesses through a pointer named c at the use site)
+#define GET_MED(n) ((c->median[n] >> 4) + 1) // median[n] scaled down by 16, plus 1
+#define DEC_MED(n) c->median[n] -= ((c->median[n] + (128 >> (n)) - 2) / (128 >> (n))) * 2 // bare assignment: use as a statement only
+#define INC_MED(n) c->median[n] += ((c->median[n] + (128 >> (n))    ) / (128 >> (n))) * 5 // bare assignment: use as a statement only
+
+// macros for applying weight: when samples and in are both non-zero, step weight by delta (down if their signs differ, up otherwise) and clamp it to [-1024, 1024]
+#define UPDATE_WEIGHT_CLIP(weight, delta, samples, in) \
+    if ((samples) && (in)) { \
+        if (((samples) ^ (in)) < 0) { \
+            (weight) -= (delta); \
+            if ((weight) < -1024) \
+                (weight) = -1024; \
+        } else { \
+            (weight) += (delta); \
+            if ((weight) > 1024) \
+                (weight) = 1024; \
+        } \
+    }
+
+static const int wv_rates[16] = { /* sample rates - presumably in Hz and indexed by a 4-bit sample-rate field of the block flags; confirm */
+     6000,  8000,  9600, 11025, 12000, 16000,  22050, 24000,
+    32000, 44100, 48000, 64000, 88200, 96000, 192000,     0 /* last slot is 0 - presumably meaning the rate is not in the table; confirm */
+};
+
+// exponent table copied from WavPack source; wp_exp2() indexes it with the low 8 bits of its argument
+static const uint8_t wp_exp2_table[256] = {
+    0x00, 0x01, 0x01, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x08, 0x09, 0x0a, 0x0b,
+    0x0b, 0x0c, 0x0d, 0x0e, 0x0e, 0x0f, 0x10, 0x10, 0x11, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16, 0x16,
+    0x17, 0x18, 0x19, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1d, 0x1e, 0x1f, 0x20, 0x20, 0x21, 0x22, 0x23,
+    0x24, 0x24, 0x25, 0x26, 0x27, 0x28, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
+    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3a, 0x3b, 0x3c, 0x3d,
+    0x3e, 0x3f, 0x40, 0x41, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x48, 0x49, 0x4a, 0x4b,
+    0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a,
+    0x5b, 0x5c, 0x5d, 0x5e, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+    0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+    0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x87, 0x88, 0x89, 0x8a,
+    0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b,
+    0x9c, 0x9d, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad,
+    0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0,
+    0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc8, 0xc9, 0xca, 0xcb, 0xcd, 0xce, 0xcf, 0xd0, 0xd2, 0xd3, 0xd4,
+    0xd6, 0xd7, 0xd8, 0xd9, 0xdb, 0xdc, 0xdd, 0xde, 0xe0, 0xe1, 0xe2, 0xe4, 0xe5, 0xe6, 0xe8, 0xe9,
+    0xea, 0xec, 0xed, 0xee, 0xf0, 0xf1, 0xf2, 0xf4, 0xf5, 0xf6, 0xf8, 0xf9, 0xfa, 0xfc, 0xfd, 0xff
+};
+
+static const uint8_t wp_log2_table [] = { /* 256 entries; wp_log2() indexes it with an 8-bit value */
+    0x00, 0x01, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0b, 0x0d, 0x0e, 0x10, 0x11, 0x12, 0x14, 0x15,
+    0x16, 0x18, 0x19, 0x1a, 0x1c, 0x1d, 0x1e, 0x20, 0x21, 0x22, 0x24, 0x25, 0x26, 0x28, 0x29, 0x2a,
+    0x2c, 0x2d, 0x2e, 0x2f, 0x31, 0x32, 0x33, 0x34, 0x36, 0x37, 0x38, 0x39, 0x3b, 0x3c, 0x3d, 0x3e,
+    0x3f, 0x41, 0x42, 0x43, 0x44, 0x45, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4d, 0x4e, 0x4f, 0x50, 0x51,
+    0x52, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63,
+    0x64, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x74, 0x75,
+    0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85,
+    0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95,
+    0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
+    0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb2,
+    0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc0,
+    0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcb, 0xcc, 0xcd, 0xce,
+    0xcf, 0xd0, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd8, 0xd9, 0xda, 0xdb,
+    0xdc, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe4, 0xe5, 0xe6, 0xe7, 0xe7,
+    0xe8, 0xe9, 0xea, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xee, 0xef, 0xf0, 0xf1, 0xf1, 0xf2, 0xf3, 0xf4,
+    0xf4, 0xf5, 0xf6, 0xf7, 0xf7, 0xf8, 0xf9, 0xf9, 0xfa, 0xfb, 0xfc, 0xfc, 0xfd, 0xfe, 0xff, 0xff
+};
+
+static av_always_inline int wp_exp2(int16_t val)
+{   /* Maps a fixed-point log value (8 fractional bits, integer part above them) back to an integer and reapplies the sign of val. */
+    int res, neg = 0;
+
+    if (val < 0) { /* work on the magnitude, remember the sign */
+        val = -val;
+        neg = 1;
+    }
+
+    res   = wp_exp2_table[val & 0xFF] | 0x100; /* 9-bit mantissa: table lookup of the fraction with a leading one on top */
+    val >>= 8; /* integer part */
+    res   = (val > 9) ? (res << (val - 9)) : (res >> (9 - val)); /* NOTE(review): shift counts are unchecked; val can reach 127 here (and stays negative for -32768) - confirm callers bound the input */
+    return neg ? -res : res;
+}
+
+static av_always_inline int wp_log2(int32_t val)
+{   /* Returns a fixed-point log of val: (bit count << 8) plus a table-derived 8-bit fraction; 0 maps to 0 and 1 maps to 256. */
+    int bits;
+
+    if (!val)
+        return 0;
+    if (val == 1)
+        return 256;
+    val += val >> 9;
+    bits = av_log2(val) + 1; /* presumably the number of significant bits in val - confirm av_log2 semantics */
+    if (bits < 9) /* fewer than 9 bits: shift left so the top bit lands on bit 8, then index with the 8 bits below it */
+        return (bits << 8) + wp_log2_table[(val << (9 - bits)) & 0xFF];
+    else /* 9 bits or more: shift right instead */
+        return (bits << 8) + wp_log2_table[(val >> (bits - 9)) & 0xFF];
+}
+
+#endif /* AVCODEC_WAVPACK_H */
diff --git a/libavcodec/wavpackenc.c b/libavcodec/wavpackenc.c
new file mode 100644
index 0000000..979b921
--- /dev/null
+++ b/libavcodec/wavpackenc.c
@@ -0,0 +1,2990 @@
+/*
+ * WavPack lossless audio encoder
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BITSTREAM_WRITER_LE
+
+#include "libavutil/intreadwrite.h"
+#include "libavutil/opt.h"
+#include "avcodec.h"
+#include "internal.h"
+#include "put_bits.h"
+#include "bytestream.h"
+#include "wavpackenc.h"
+#include "wavpack.h"
+
+#define UPDATE_WEIGHT(weight, delta, source, result) /* weight += delta if source and result agree in sign, weight -= delta otherwise; no clamping */ \
+    if ((source) && (result)) { \
+        int32_t s = (int32_t) ((source) ^ (result)) >> 31; /* 0 or -1; NOTE(review): this local shadows any caller variable named s used in the arguments */ \
+        weight = ((delta) ^ s) + ((weight) - s); \
+    }
+
+#define APPLY_WEIGHT_F(weight, sample) ((((((sample) & 0xffff) * (weight)) >> 9) + \
+    ((((sample) & ~0xffff) >> 9) * (weight)) + 1) >> 1) /* multiplies the low 16 bits and the remaining high bits of sample separately, then rounds */
+
+#define APPLY_WEIGHT_I(weight, sample) (((weight) * (sample) + 512) >> 10) /* weight * sample scaled down by 1024, rounded */
+
+#define APPLY_WEIGHT(weight, sample) ((sample) != (short) (sample) ? \
+    APPLY_WEIGHT_F(weight, sample) : APPLY_WEIGHT_I (weight, sample)) /* _I only when sample survives a round trip through short */
+
+#define CLEAR(destin) memset(&destin, 0, sizeof(destin)); /* NOTE(review): the expansion already ends in ';' */
+
+#define SHIFT_LSB       13
+#define SHIFT_MASK      (0x1FU << SHIFT_LSB) /* 5-bit field starting at bit 13 */
+
+#define MAG_LSB         18
+#define MAG_MASK        (0x1FU << MAG_LSB) /* 5-bit field starting at bit 18; the scan_* functions keep a bit count in it inside s->flags */
+
+#define SRATE_LSB       23
+#define SRATE_MASK      (0xFU << SRATE_LSB) /* 4-bit field starting at bit 23 */
+
+#define EXTRA_TRY_DELTAS     1 /* the EXTRA_* values are single-bit flags OR-ed into extra_flags by wavpack_encode_init */
+#define EXTRA_ADJUST_DELTAS  2
+#define EXTRA_SORT_FIRST     4
+#define EXTRA_BRANCHES       8
+#define EXTRA_SORT_LAST     16
+
+typedef struct WavPackExtraInfo { /* bundles up to MAX_TERMS Decorr states with a few search parameters - presumably for the extra-processing passes; confirm */
+    struct Decorr dps[MAX_TERMS];
+    int nterms, log_limit, gt16bit;
+    uint32_t best_bits;
+} WavPackExtraInfo;
+
+typedef struct WavPackWords { /* pending/holding counters plus one WvChannel per channel - presumably the state of the word coder; confirm */
+    int pend_data, holding_one, zeros_acc;
+    int holding_zero, pend_count;
+    WvChannel c[2];
+} WavPackWords;
+
+typedef struct WavPackEncodeContext { /* private encoder state, reached through avctx->priv_data */
+    AVClass *class;
+    AVCodecContext *avctx; /* back-pointer, set in wavpack_encode_init */
+    PutBitContext pb;
+    int block_samples;
+    int buffer_size;
+    int sample_index;
+    int stereo, stereo_in;
+    int ch_offset;
+
+    int32_t *samples[2]; /* each buffer pointer below has a matching *_size field - presumably its allocated size; confirm */
+    int samples_size[2];
+
+    int32_t *sampleptrs[MAX_TERMS+2][2];
+    int sampleptrs_size[MAX_TERMS+2][2];
+
+    int32_t *temp_buffer[2][2];
+    int temp_buffer_size[2][2];
+
+    int32_t *best_buffer[2];
+    int best_buffer_size[2];
+
+    int32_t *js_left, *js_right;
+    int js_left_size, js_right_size;
+
+    int32_t *orig_l, *orig_r;
+    int orig_l_size, orig_r_size;
+
+    unsigned extra_flags; /* OR of EXTRA_* bits, chosen from compression_level */
+    int optimize_mono;
+    int decorr_filter; /* index into decorr_filters / decorr_filter_sizes */
+    int joint;
+    int num_branches;
+
+    uint32_t flags; /* WV_* bits plus the MAG_MASK field maintained by the scan_* functions */
+    uint32_t crc_x; /* checksum computed by scan_float / scan_int32 */
+    WavPackWords w;
+
+    uint8_t int32_sent_bits, int32_zeros, int32_ones, int32_dups; /* results of scan_int23 / scan_int32 */
+    uint8_t float_flags, float_shift, float_max_exp, max_exp; /* results of scan_float */
+    int32_t shifted_ones, shifted_zeros, shifted_both; /* counters filled by process_float */
+    int32_t false_zeros, neg_zeros, ordata; /* likewise; ordata is the OR of all converted values */
+
+    int num_terms, shift, joint_stereo, false_stereo;
+    int num_decorrs, num_passes, best_decorr, mask_decorr;
+    struct Decorr decorr_passes[MAX_TERMS];
+    const WavPackDecorrSpec *decorr_specs;
+    float delta_decay; /* set to 2.0 in wavpack_encode_init */
+} WavPackEncodeContext;
+
+static av_cold int wavpack_encode_init(AVCodecContext *avctx)
+{   /* Validates channel count and frame size, then derives the decorrelation setup from compression_level. Returns 0 or AVERROR(EINVAL). */
+    WavPackEncodeContext *s = avctx->priv_data;
+
+    s->avctx = avctx;
+
+    if (avctx->channels > 255) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid channel count: %d\n", avctx->channels);
+        return AVERROR(EINVAL);
+    }
+
+    if (!avctx->frame_size) { /* no frame size requested: derive one from the sample rate */
+        int block_samples;
+        if (!(avctx->sample_rate & 1)) /* even rate: start from half of it */
+            block_samples = avctx->sample_rate / 2;
+        else
+            block_samples = avctx->sample_rate;
+
+        while (block_samples * avctx->channels > WV_MAX_SAMPLES)
+            block_samples /= 2;
+
+        while (block_samples * avctx->channels < 40000) /* NOTE(review): cannot make progress if sample_rate or channels is 0 - presumably validated before init; confirm */
+            block_samples *= 2;
+        avctx->frame_size = block_samples;
+    } else if (avctx->frame_size && (avctx->frame_size < 128 || /* frame_size is already known to be non-zero in this branch */
+                              avctx->frame_size > WV_MAX_SAMPLES)) {
+        av_log(avctx, AV_LOG_ERROR, "invalid block size: %d\n", avctx->frame_size);
+        return AVERROR(EINVAL);
+    }
+
+    if (avctx->compression_level != FF_COMPRESSION_DEFAULT) { /* with the default level none of the fields below are assigned here */
+        if (avctx->compression_level >= 3) { /* level 3 itself sets no num_branches or extra_flags; levels 4 and up do */
+            s->decorr_filter = 3;
+            s->num_passes = 9;
+            if      (avctx->compression_level >= 8) {
+                s->num_branches = 4;
+                s->extra_flags = EXTRA_TRY_DELTAS|EXTRA_ADJUST_DELTAS|EXTRA_SORT_FIRST|EXTRA_SORT_LAST|EXTRA_BRANCHES;
+            } else if (avctx->compression_level >= 7) {
+                s->num_branches = 3;
+                s->extra_flags = EXTRA_TRY_DELTAS|EXTRA_ADJUST_DELTAS|EXTRA_SORT_FIRST|EXTRA_BRANCHES;
+            } else if (avctx->compression_level >= 6) {
+                s->num_branches = 2;
+                s->extra_flags = EXTRA_TRY_DELTAS|EXTRA_ADJUST_DELTAS|EXTRA_SORT_FIRST|EXTRA_BRANCHES;
+            } else if (avctx->compression_level >= 5) {
+                s->num_branches = 1;
+                s->extra_flags = EXTRA_TRY_DELTAS|EXTRA_ADJUST_DELTAS|EXTRA_SORT_FIRST|EXTRA_BRANCHES;
+            } else if (avctx->compression_level >= 4) {
+                s->num_branches = 1;
+                s->extra_flags = EXTRA_TRY_DELTAS|EXTRA_ADJUST_DELTAS|EXTRA_BRANCHES;
+            }
+        } else if (avctx->compression_level == 2) {
+            s->decorr_filter = 2;
+            s->num_passes = 4;
+        } else if (avctx->compression_level == 1) {
+            s->decorr_filter = 1;
+            s->num_passes = 2;
+        } else if (avctx->compression_level < 1) {
+            s->decorr_filter = 0;
+            s->num_passes = 0;
+        }
+    }
+
+    s->num_decorrs = decorr_filter_sizes[s->decorr_filter]; /* both tables are declared elsewhere - presumably in wavpackenc.h; confirm */
+    s->decorr_specs = decorr_filters[s->decorr_filter];
+
+    s->delta_decay = 2.0;
+
+    return 0;
+}
+
+static void shift_mono(int32_t *samples, int nb_samples, int shift)
+{   /* Right-shifts each of the nb_samples entries of samples in place by shift bits. */
+    int i;
+    for (i = 0; i < nb_samples; i++)
+        samples[i] >>= shift;
+}
+
+static void shift_stereo(int32_t *left, int32_t *right,
+                         int nb_samples, int shift)
+{   /* Right-shifts the first nb_samples entries of both left and right in place by shift bits. */
+    int i;
+    for (i = 0; i < nb_samples; i++) {
+        left [i] >>= shift;
+        right[i] >>= shift;
+    }
+}
+
+#define FLOAT_SHIFT_ONES 1 /* the FLOAT_* values are single-bit flags collected in float_flags by scan_float / process_float */
+#define FLOAT_SHIFT_SAME 2
+#define FLOAT_SHIFT_SENT 4
+#define FLOAT_ZEROS_SENT 8
+#define FLOAT_NEG_ZEROS  0x10
+#define FLOAT_EXCEPTIONS 0x20
+
+#define get_mantissa(f)     ((f) & 0x7fffff) /* low 23 bits */
+#define get_exponent(f)     (((f) >> 23) & 0xff) /* next 8 bits */
+#define get_sign(f)         (((f) >> 31) & 0x1) /* top bit */
+
+static void process_float(WavPackEncodeContext *s, int32_t *sample)
+{   /* Replaces *sample (read as sign/exponent/mantissa fields) by a signed integer aligned to s->max_exp and updates the counters in s. */
+    int32_t shift_count, value, f = *sample;
+
+    if (get_exponent(f) == 255) { /* all-ones exponent: flag it and substitute a fixed value */
+        s->float_flags |= FLOAT_EXCEPTIONS;
+        value = 0x1000000;
+        shift_count = 0;
+    } else if (get_exponent(f)) { /* non-zero exponent: mantissa gets a leading one (0x800000) added */
+        shift_count = s->max_exp - get_exponent(f);
+        value = 0x800000 + get_mantissa(f);
+    } else { /* zero exponent: mantissa is used as is */
+        shift_count = s->max_exp ? s->max_exp - 1 : 0;
+        value = get_mantissa(f);
+    }
+
+    if (shift_count < 25) /* value never needs more than 25 bits, so larger shifts just give 0 */
+        value >>= shift_count;
+    else
+        value = 0;
+
+    if (!value) {
+        if (get_exponent(f) || get_mantissa(f)) /* input fields were not all zero, yet the result is */
+            s->false_zeros++;
+        else if (get_sign(f)) /* all-zero fields with the sign bit set */
+            s->neg_zeros++;
+    } else if (shift_count) {
+        int32_t mask = (1 << shift_count) - 1; /* selects the low shift_count mantissa bits */
+
+        if (!(get_mantissa(f) & mask)) /* those bits were all zero */
+            s->shifted_zeros++;
+        else if ((get_mantissa(f) & mask) == mask) /* those bits were all one */
+            s->shifted_ones++;
+        else
+            s->shifted_both++;
+    }
+
+    s->ordata |= value; /* OR of every converted magnitude */
+    *sample = get_sign(f) ? -value : value;
+}
+
+static int scan_float(WavPackEncodeContext *s,
+                      int32_t *samples_l, int32_t *samples_r,
+                      int nb_samples)
+{   /* Checksums the raw samples, converts them in place via process_float(), and fills the float_* fields and the MAG field of s->flags. */
+    uint32_t crc = 0xffffffffu;
+    int i;
+
+    s->shifted_ones = s->shifted_zeros = s->shifted_both = s->ordata = 0;
+    s->float_shift = s->float_flags = 0;
+    s->false_zeros = s->neg_zeros = 0;
+    s->max_exp = 0;
+
+    if (s->flags & WV_MONO_DATA) { /* pass 1: checksum the raw fields and find the largest exponent below 255 */
+        for (i = 0; i < nb_samples; i++) {
+            int32_t f = samples_l[i];
+            crc = crc * 27 + get_mantissa(f) * 9 + get_exponent(f) * 3 + get_sign(f);
+
+            if (get_exponent(f) > s->max_exp && get_exponent(f) < 255)
+                s->max_exp = get_exponent(f);
+        }
+    } else { /* same for two channels, left sample before right */
+        for (i = 0; i < nb_samples; i++) {
+            int32_t f;
+
+            f = samples_l[i];
+            crc = crc * 27 + get_mantissa(f) * 9 + get_exponent(f) * 3 + get_sign(f);
+            if (get_exponent(f) > s->max_exp && get_exponent(f) < 255)
+                s->max_exp = get_exponent(f);
+
+            f = samples_r[i];
+            crc = crc * 27 + get_mantissa(f) * 9 + get_exponent(f) * 3 + get_sign(f);
+
+            if (get_exponent(f) > s->max_exp && get_exponent(f) < 255)
+                s->max_exp = get_exponent(f);
+        }
+    }
+
+    s->crc_x = crc;
+
+    if (s->flags & WV_MONO_DATA) { /* pass 2: convert every sample in place, now that max_exp is known */
+        for (i = 0; i < nb_samples; i++)
+            process_float(s, &samples_l[i]);
+    } else {
+        for (i = 0; i < nb_samples; i++) {
+            process_float(s, &samples_l[i]);
+            process_float(s, &samples_r[i]);
+        }
+    }
+
+    s->float_max_exp = s->max_exp;
+
+    if (s->shifted_both) /* pick one flag describing the bits that process_float shifted out */
+        s->float_flags |= FLOAT_SHIFT_SENT;
+    else if (s->shifted_ones && !s->shifted_zeros)
+        s->float_flags |= FLOAT_SHIFT_ONES;
+    else if (s->shifted_ones && s->shifted_zeros)
+        s->float_flags |= FLOAT_SHIFT_SAME;
+    else if (s->ordata && !(s->ordata & 1)) { /* every converted value ends in zero bits: count them and shift them away */
+        do {
+            s->float_shift++;
+            s->ordata >>= 1;
+        } while (!(s->ordata & 1));
+
+        if (s->flags & WV_MONO_DATA)
+            shift_mono(samples_l, nb_samples, s->float_shift);
+        else
+            shift_stereo(samples_l, samples_r, nb_samples, s->float_shift);
+    }
+
+    s->flags &= ~MAG_MASK;
+
+    while (s->ordata) { /* store the number of significant bits of ordata in the MAG field */
+        s->flags += 1 << MAG_LSB;
+        s->ordata >>= 1;
+    }
+
+    if (s->false_zeros || s->neg_zeros)
+        s->float_flags |= FLOAT_ZEROS_SENT;
+
+    if (s->neg_zeros)
+        s->float_flags |= FLOAT_NEG_ZEROS;
+
+    return s->float_flags & (FLOAT_EXCEPTIONS | FLOAT_ZEROS_SENT | /* non-zero if any of these four flags is set */
+                             FLOAT_SHIFT_SENT | FLOAT_SHIFT_SAME);
+}
+
+static void scan_int23(WavPackEncodeContext *s,
+                       int32_t *samples_l, int32_t *samples_r,
+                       int nb_samples)
+{   /* Looks for redundant low bits shared by all samples, records them in int32_zeros/ones/dups and the MAG field, and shifts the samples down in place. */
+    uint32_t magdata = 0, ordata = 0, xordata = 0, anddata = ~0;
+    int i, total_shift = 0;
+
+    s->int32_sent_bits = s->int32_zeros = s->int32_ones = s->int32_dups = 0;
+
+    if (s->flags & WV_MONO_DATA) {
+        for (i = 0; i < nb_samples; i++) {
+            int32_t M = samples_l[i];
+
+            magdata |= (M < 0) ? ~M : M; /* OR of magnitudes (one's complement for negatives) */
+            xordata |= M ^ -(M & 1); /* bit k ends up set if bit k ever differs from bit 0 */
+            anddata &= M;
+            ordata  |= M;
+
+            if ((ordata & 1) && !(anddata & 1) && (xordata & 2)) /* none of the three cases tested further down can apply any more; returns before s->flags is touched */
+                return;
+        }
+    } else {
+        for (i = 0; i < nb_samples; i++) {
+            int32_t L = samples_l[i];
+            int32_t R = samples_r[i];
+
+            magdata |= (L < 0) ? ~L : L;
+            magdata |= (R < 0) ? ~R : R;
+            xordata |= L ^ -(L & 1);
+            xordata |= R ^ -(R & 1);
+            anddata &= L & R;
+            ordata  |= L | R;
+
+            if ((ordata & 1) && !(anddata & 1) && (xordata & 2)) /* same early exit as in the mono loop */
+                return;
+        }
+    }
+
+    s->flags &= ~MAG_MASK;
+
+    while (magdata) { /* store the number of significant magnitude bits in the MAG field */
+        s->flags += 1 << MAG_LSB;
+        magdata >>= 1;
+    }
+
+    if (!(s->flags & MAG_MASK))
+        return;
+
+    if (!(ordata & 1)) { /* bit 0 is 0 in every sample: count the common trailing zeros */
+        do {
+            s->flags -= 1 << MAG_LSB;
+            s->int32_zeros++;
+            total_shift++;
+            ordata >>= 1;
+        } while (!(ordata & 1));
+    } else if (anddata & 1) { /* bit 0 is 1 in every sample: count the common trailing ones */
+        do {
+            s->flags -= 1 << MAG_LSB;
+            s->int32_ones++;
+            total_shift++;
+            anddata >>= 1;
+        } while (anddata & 1);
+    } else if (!(xordata & 2)) { /* bit 1 always equals bit 0: count the duplicated low bits */
+        do {
+            s->flags -= 1 << MAG_LSB;
+            s->int32_dups++;
+            total_shift++;
+            xordata >>= 1;
+        } while (!(xordata & 2));
+    }
+
+    if (total_shift) {
+        s->flags |= WV_INT32_DATA;
+
+        if (s->flags & WV_MONO_DATA)
+            shift_mono(samples_l, nb_samples, total_shift);
+        else
+            shift_stereo(samples_l, samples_r, nb_samples, total_shift);
+    }
+}
+
+static int scan_int32(WavPackEncodeContext *s,
+                      int32_t *samples_l, int32_t *samples_r,
+                      int nb_samples)
+{   /* Like scan_int23() but also checksums the samples into crc_x and caps the MAG field at 23 bits; returns int32_sent_bits. */
+    uint32_t magdata = 0, ordata = 0, xordata = 0, anddata = ~0;
+    uint32_t crc = 0xffffffffu;
+    int i, total_shift = 0;
+
+    s->int32_sent_bits = s->int32_zeros = s->int32_ones = s->int32_dups = 0;
+
+    if (s->flags & WV_MONO_DATA) {
+        for (i = 0; i < nb_samples; i++) {
+            int32_t M = samples_l[i];
+
+            crc = crc * 9 + (M & 0xffff) * 3 + ((M >> 16) & 0xffff);
+            magdata |= (M < 0) ? ~M : M; /* OR of magnitudes (one's complement for negatives) */
+            xordata |= M ^ -(M & 1); /* bit k ends up set if bit k ever differs from bit 0 */
+            anddata &= M;
+            ordata  |= M;
+        }
+    } else {
+        for (i = 0; i < nb_samples; i++) {
+            int32_t L = samples_l[i];
+            int32_t R = samples_r[i];
+
+            crc = crc * 9 + (L & 0xffff) * 3 + ((L >> 16) & 0xffff);
+            crc = crc * 9 + (R & 0xffff) * 3 + ((R >> 16) & 0xffff);
+            magdata |= (L < 0) ? ~L : L;
+            magdata |= (R < 0) ? ~R : R;
+            xordata |= L ^ -(L & 1);
+            xordata |= R ^ -(R & 1);
+            anddata &= L & R;
+            ordata  |= L | R;
+        }
+    }
+
+    s->crc_x = crc;
+    s->flags &= ~MAG_MASK;
+
+    while (magdata) { /* store the number of significant magnitude bits in the MAG field */
+        s->flags += 1 << MAG_LSB;
+        magdata >>= 1;
+    }
+
+    if (!((s->flags & MAG_MASK) >> MAG_LSB)) { /* no magnitude bits at all */
+        s->flags &= ~WV_INT32_DATA;
+        return 0;
+    }
+
+    if (!(ordata & 1)) /* bit 0 is 0 in every sample: count the common trailing zeros */
+        do {
+            s->flags -= 1 << MAG_LSB;
+            s->int32_zeros++;
+            total_shift++;
+            ordata >>= 1;
+        } while (!(ordata & 1));
+    else if (anddata & 1) /* bit 0 is 1 in every sample: count the common trailing ones */
+        do {
+            s->flags -= 1 << MAG_LSB;
+            s->int32_ones++;
+            total_shift++;
+            anddata >>= 1;
+        } while (anddata & 1);
+    else if (!(xordata & 2)) /* bit 1 always equals bit 0: count the duplicated low bits */
+        do {
+            s->flags -= 1 << MAG_LSB;
+            s->int32_dups++;
+            total_shift++;
+            xordata >>= 1;
+        } while (!(xordata & 2));
+
+    if (((s->flags & MAG_MASK) >> MAG_LSB) > 23) { /* more than 23 bits left: record the excess in int32_sent_bits and shift it out too */
+        s->int32_sent_bits = (uint8_t)(((s->flags & MAG_MASK) >> MAG_LSB) - 23);
+        total_shift += s->int32_sent_bits;
+        s->flags &= ~MAG_MASK;
+        s->flags += 23 << MAG_LSB;
+    }
+
+    if (total_shift) {
+        s->flags |= WV_INT32_DATA;
+
+        if (s->flags & WV_MONO_DATA)
+            shift_mono(samples_l, nb_samples, total_shift);
+        else
+            shift_stereo(samples_l, samples_r, nb_samples, total_shift);
+    }
+
+    return s->int32_sent_bits;
+}
+
+/**
+ * Quantize a decorrelation weight to a signed 8-bit value.
+ *
+ * The weight is clipped to [-1024, 1024]; positive values are first reduced
+ * by (w + 64) >> 7, then 4 is added and the result is shifted right by 3.
+ */
+static int8_t store_weight(int weight)
+{
+    int w = av_clip(weight, -1024, 1024);
+
+    if (w > 0)
+        w -= (w + 64) >> 7;
+
+    w += 4;
+    return w >> 3;
+}
+
+/**
+ * Expand a signed 8-bit quantized weight back to the working weight scale.
+ *
+ * @param weight  quantized weight
+ * @return weight * 8; positive results are additionally increased by
+ *         (result + 64) >> 7
+ */
+static int restore_weight(int8_t weight)
+{
+    /* Multiply instead of shifting: left-shifting a negative int is
+     * undefined behavior in C, and weight may be negative. */
+    int result = 8 * weight;
+
+    if (result > 0)
+        result += (result + 64) >> 7;
+
+    return result;
+}
+
+/**
+ * Signed wrapper around wp_log2(): the result for a negative value is the
+ * negated wp_log2() of its magnitude.
+ */
+static int log2s(int32_t value)
+{
+    if (value < 0)
+        return -wp_log2(-value);
+
+    return wp_log2(value);
+}
+
+/**
+ * Run one mono decorrelation term over a buffer.
+ *
+ * For each sample a prediction is taken from the term's history, the
+ * weighted prediction is subtracted from the input and the residual is
+ * written to out_samples. The weight is adapted after every sample and
+ * accumulated into dpp->sumA. Terms with value <= 0 process no samples.
+ *
+ * @param in_samples   input samples
+ * @param out_samples  destination for the residuals
+ * @param nb_samples   number of samples to process
+ * @param dpp          term state (value, delta, weightA, samplesA, sumA),
+ *                     updated in place
+ * @param dir          step applied to both pointers after each sample; when
+ *                     negative, processing starts at the last sample
+ */
+static void decorr_mono(int32_t *in_samples, int32_t *out_samples,
+                        int nb_samples, struct Decorr *dpp, int dir)
+{
+    int m = 0, i;
+
+    dpp->sumA = 0;
+
+    if (dir < 0) {
+        out_samples += (nb_samples - 1);
+        in_samples  += (nb_samples - 1);
+    }
+
+    /* Round-trip the weight and history through their stored forms;
+     * presumably so the encoder starts from the precision the stored
+     * values carry -- TODO confirm. */
+    dpp->weightA = restore_weight(store_weight(dpp->weightA));
+
+    for (i = 0; i < MAX_TERM; i++)
+        dpp->samplesA[i] = wp_exp2(log2s(dpp->samplesA[i]));
+
+    if (dpp->value > MAX_TERM) {
+        /* Prediction extrapolated from the two most recent samples:
+         * odd value:  2 * s[0] - s[1]
+         * even value: (3 * s[0] - s[1]) >> 1 */
+        while (nb_samples--) {
+            int32_t left, sam_A;
+
+            sam_A = ((3 - (dpp->value & 1)) * dpp->samplesA[0] - dpp->samplesA[1]) >> !(dpp->value & 1);
+
+            dpp->samplesA[1] = dpp->samplesA[0];
+            dpp->samplesA[0] = left = in_samples[0];
+
+            left -= APPLY_WEIGHT(dpp->weightA, sam_A);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam_A, left);
+            dpp->sumA += dpp->weightA;
+            out_samples[0] = left;
+            in_samples += dir;
+            out_samples += dir;
+        }
+    } else if (dpp->value > 0) {
+        /* samplesA is used as a circular buffer: m is the read index and
+         * k, dpp->value slots ahead of it, is the write index. */
+        while (nb_samples--) {
+            int k = (m + dpp->value) & (MAX_TERM - 1);
+            int32_t left, sam_A;
+
+            sam_A = dpp->samplesA[m];
+            dpp->samplesA[k] = left = in_samples[0];
+            m = (m + 1) & (MAX_TERM - 1);
+
+            left -= APPLY_WEIGHT(dpp->weightA, sam_A);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam_A, left);
+            dpp->sumA += dpp->weightA;
+            out_samples[0] = left;
+            in_samples += dir;
+            out_samples += dir;
+        }
+    }
+
+    /* Rotate the circular buffer so that the next read position ends up
+     * at index 0. */
+    if (m && dpp->value > 0 && dpp->value <= MAX_TERM) {
+        int32_t temp_A[MAX_TERM];
+
+        memcpy(temp_A, dpp->samplesA, sizeof(dpp->samplesA));
+
+        for (i = 0; i < MAX_TERM; i++) {
+            dpp->samplesA[i] = temp_A[m];
+            m = (m + 1) & (MAX_TERM - 1);
+        }
+    }
+}
+
+/**
+ * Reverse the sample history of a mono decorrelation term.
+ *
+ * For terms above MAX_TERM the two history samples are replaced by two
+ * values extrapolated from them; for terms 2..MAX_TERM the first
+ * dpp->value history entries are reversed in place.
+ */
+static void reverse_mono_decorr(struct Decorr *dpp)
+{
+    const int term = dpp->value;
+
+    if (term > MAX_TERM) {
+        const int odd = term & 1;
+        const int32_t newest = dpp->samplesA[0];
+        const int32_t older  = dpp->samplesA[1];
+        int32_t first, second;
+
+        /* first extrapolation step, from the stored pair */
+        if (odd)
+            first = 2 * newest - older;
+        else
+            first = (3 * newest - older) >> 1;
+
+        /* second step, from the new value and the former newest sample */
+        if (odd)
+            second = 2 * first - newest;
+        else
+            second = (3 * first - newest) >> 1;
+
+        dpp->samplesA[0] = first;
+        dpp->samplesA[1] = second;
+    } else if (term > 1) {
+        int lo = 0, hi = term - 1, n;
+
+        for (n = 0; n < term / 2; n++, lo++, hi--) {
+            int32_t tmp;
+
+            lo &= (MAX_TERM - 1);
+            hi &= (MAX_TERM - 1);
+
+            tmp               = dpp->samplesA[lo];
+            dpp->samplesA[lo] = dpp->samplesA[hi];
+            dpp->samplesA[hi] = tmp;
+        }
+    }
+}
+
+/**
+ * Add the table-based log2 measure of one sample magnitude to *result.
+ *
+ * @param v       sample magnitude
+ * @param limit   if non-zero, threshold checked against the per-sample
+ *                measure of values that do not fit the small-value path
+ * @param result  running total, incremented by this sample's measure
+ * @return 1 if limit is non-zero and the measure reached it, 0 otherwise
+ */
+static uint32_t log2sample(uint32_t v, int limit, uint32_t *result)
+{
+    uint32_t dbits;
+
+    v += v >> 9;
+
+    /* small values: single table lookup, never checked against limit */
+    if (v < (1 << 8)) {
+        dbits = nbits_table[v];
+        *result += (dbits << 8) + wp_log2_table[(v << (9 - dbits)) & 0xff];
+        return 0;
+    }
+
+    if (v < (1 << 16))
+        dbits = nbits_table[v >> 8] + 8;
+    else if (v < (1 << 24))
+        dbits = nbits_table[v >> 16] + 16;
+    else
+        dbits = nbits_table[v >> 24] + 24;
+
+    dbits = (dbits << 8) + wp_log2_table[(v >> (dbits - 9)) & 0xff];
+    *result += dbits;
+
+    if (limit && dbits >= limit)
+        return 1;
+
+    return 0;
+}
+
+/**
+ * Sum the log2sample() measure over a mono buffer.
+ *
+ * @return the accumulated total, or UINT32_MAX as soon as log2sample()
+ *         reports that a sample reached the limit
+ */
+static uint32_t log2mono(int32_t *samples, int nb_samples, int limit)
+{
+    uint32_t total = 0;
+
+    for (; nb_samples--; samples++) {
+        if (log2sample(abs(*samples), limit, &total))
+            return UINT32_MAX;
+    }
+
+    return total;
+}
+
+/**
+ * Sum the log2sample() measure over both channels of a stereo buffer.
+ *
+ * @return the accumulated total, or UINT32_MAX as soon as log2sample()
+ *         reports that a sample of either channel reached the limit
+ */
+static uint32_t log2stereo(int32_t *samples_l, int32_t *samples_r,
+                           int nb_samples, int limit)
+{
+    uint32_t total = 0;
+
+    for (; nb_samples--; samples_l++, samples_r++) {
+        if (log2sample(abs(*samples_l), limit, &total))
+            return UINT32_MAX;
+        if (log2sample(abs(*samples_r), limit, &total))
+            return UINT32_MAX;
+    }
+
+    return total;
+}
+
+/**
+ * Decorrelate a whole mono buffer with the term at dpp[tindex].
+ *
+ * A backward warm-up pass over at most 2048 samples primes the weight (and,
+ * for the first term only, the sample history); the resulting start state
+ * is stored in dpp[tindex] before the forward pass over the full buffer.
+ * For delta == 0 an extra forward pass with delta 1 is run and its average
+ * weight is used as the fixed weight.
+ */
+static void decorr_mono_buffer(int32_t *samples, int32_t *outsamples,
+                               int nb_samples, struct Decorr *dpp,
+                               int tindex)
+{
+    struct Decorr *const target = dpp + tindex;
+    struct Decorr work;
+    const int delta = target->delta;
+    const int term  = target->value;
+    int warmup_delta;
+
+    if (delta == 7)
+        warmup_delta = 7;
+    else
+        warmup_delta = delta < 2 ? 3 : delta + 1;
+
+    /* backward warm-up pass on a scratch copy of the term */
+    CLEAR(work);
+    work.value = term;
+    work.delta = warmup_delta;
+    decorr_mono(samples, outsamples, FFMIN(2048, nb_samples), &work, -1);
+    work.delta = delta;
+
+    /* only the first term keeps (reversed) history from the warm-up */
+    if (tindex != 0) {
+        CLEAR(work.samplesA);
+    } else {
+        reverse_mono_decorr(&work);
+    }
+
+    memcpy(target->samplesA, work.samplesA, sizeof(work.samplesA));
+    target->weightA = work.weightA;
+
+    if (!delta) {
+        work.delta = 1;
+        decorr_mono(samples, outsamples, nb_samples, &work, 1);
+        work.delta = 0;
+        memcpy(work.samplesA, target->samplesA, sizeof(work.samplesA));
+        work.weightA    = work.sumA / nb_samples;
+        target->weightA = work.weightA;
+    }
+
+    decorr_mono(samples, outsamples, nb_samples, &work, 1);
+}
+
+/**
+ * Recursively search for a good sequence of mono decorrelation terms.
+ *
+ * At the given depth every candidate term is tried on the output of the
+ * previous depth and scored with log2mono(). Whenever a score beats
+ * info->best_bits, the terms up to this depth and the resulting samples are
+ * saved. The best-scoring candidates are then followed one level deeper.
+ *
+ * @param s           encoder context
+ * @param info        search state (dps, nterms, log_limit, best_bits)
+ * @param depth       index of the term being chosen
+ * @param delta       delta assigned to every candidate term
+ * @param input_bits  score of the input to this depth; a candidate is only
+ *                    followed deeper if it scores below this value
+ */
+static void recurse_mono(WavPackEncodeContext *s, WavPackExtraInfo *info,
+                         int depth, int delta, uint32_t input_bits)
+{
+    int term, branches = s->num_branches - depth;
+    int32_t *samples, *outsamples;
+    uint32_t term_bits[22], bits;
+
+    if (branches < 1 || depth + 1 == info->nterms)
+        branches = 1;
+
+    /* term_bits is indexed by term + 3; a zero entry means "not tried or
+     * already followed" */
+    CLEAR(term_bits);
+    samples = s->sampleptrs[depth][0];
+    outsamples = s->sampleptrs[depth + 1][0];
+
+    for (term = 1; term <= 18; term++) {
+        if (term == 17 && branches == 1 && depth + 1 < info->nterms)
+            continue;
+
+        /* terms 9..16 are never tried */
+        if (term > 8 && term < 17)
+            continue;
+
+        /* with no extra flags set, terms 5..8 are skipped as well */
+        if (!s->extra_flags && (term > 4 && term < 17))
+            continue;
+
+        info->dps[depth].value = term;
+        info->dps[depth].delta = delta;
+        decorr_mono_buffer(samples, outsamples, s->block_samples, info->dps, depth);
+        bits = log2mono(outsamples, s->block_samples, info->log_limit);
+
+        if (bits < info->best_bits) {
+            /* new overall best: keep the terms and the residual samples */
+            info->best_bits = bits;
+            CLEAR(s->decorr_passes);
+            memcpy(s->decorr_passes, info->dps, sizeof(info->dps[0]) * (depth + 1));
+            memcpy(s->sampleptrs[info->nterms + 1][0],
+                   s->sampleptrs[depth + 1][0], s->block_samples * 4);
+        }
+
+        term_bits[term + 3] = bits;
+    }
+
+    while (depth + 1 < info->nterms && branches--) {
+        uint32_t local_best_bits = input_bits;
+        int best_term = 0, i;
+
+        /* pick the lowest remaining score that is below input_bits */
+        for (i = 0; i < 22; i++)
+            if (term_bits[i] && term_bits[i] < local_best_bits) {
+                local_best_bits = term_bits[i];
+                best_term = i - 3;
+            }
+
+        if (!best_term)
+            break;
+
+        /* mark as used so the next branch picks a different term */
+        term_bits[best_term + 3] = 0;
+
+        /* regenerate this depth's output for the chosen term, then recurse */
+        info->dps[depth].value = best_term;
+        info->dps[depth].delta = delta;
+        decorr_mono_buffer(samples, outsamples, s->block_samples, info->dps, depth);
+
+        recurse_mono(s, info, depth + 1, delta, local_best_bits);
+    }
+}
+
+/**
+ * Try to improve the current mono term sequence by swapping adjacent terms.
+ *
+ * Each adjacent pair of active terms is swapped in turn and the whole chain
+ * from that position on is re-run. A swap is kept only if the log2mono()
+ * score drops below info->best_bits. The sweep repeats until a full pass
+ * accepts no swap.
+ */
+static void sort_mono(WavPackEncodeContext *s, WavPackExtraInfo *info)
+{
+    int reversed = 1;
+    uint32_t bits;
+
+    while (reversed) {
+        int ri, i;
+
+        /* start every sweep from the currently accepted sequence */
+        memcpy(info->dps, s->decorr_passes, sizeof(s->decorr_passes));
+        reversed = 0;
+
+        for (ri = 0; ri < info->nterms && s->decorr_passes[ri].value; ri++) {
+
+            /* no following active term to swap with */
+            if (ri + 1 >= info->nterms || !s->decorr_passes[ri+1].value)
+                break;
+
+            /* identical neighbours: swapping changes nothing, just produce
+             * this stage's output for the following stages */
+            if (s->decorr_passes[ri].value == s->decorr_passes[ri+1].value) {
+                decorr_mono_buffer(s->sampleptrs[ri][0], s->sampleptrs[ri+1][0],
+                                   s->block_samples, info->dps, ri);
+                continue;
+            }
+
+            info->dps[ri  ] = s->decorr_passes[ri+1];
+            info->dps[ri+1] = s->decorr_passes[ri  ];
+
+            /* re-run the chain from the swapped position to the last
+             * active term; i ends one past that term */
+            for (i = ri; i < info->nterms && s->decorr_passes[i].value; i++)
+                decorr_mono_buffer(s->sampleptrs[i][0], s->sampleptrs[i+1][0],
+                                   s->block_samples, info->dps, i);
+
+            bits = log2mono(s->sampleptrs[i][0], s->block_samples, info->log_limit);
+            if (bits < info->best_bits) {
+                reversed = 1;
+                info->best_bits = bits;
+                CLEAR(s->decorr_passes);
+                memcpy(s->decorr_passes, info->dps, sizeof(info->dps[0]) * i);
+                memcpy(s->sampleptrs[info->nterms + 1][0], s->sampleptrs[i][0],
+                       s->block_samples * 4);
+            } else {
+                /* swap rejected: restore the pair and this stage's output */
+                info->dps[ri  ] = s->decorr_passes[ri];
+                info->dps[ri+1] = s->decorr_passes[ri+1];
+                decorr_mono_buffer(s->sampleptrs[ri][0], s->sampleptrs[ri+1][0],
+                                   s->block_samples, info->dps, ri);
+            }
+        }
+    }
+}
+
+/**
+ * Try other delta values for the current mono term sequence.
+ *
+ * Starting from the delta of the first term, lower deltas are tried one step
+ * at a time for as long as the log2mono() score keeps dropping below
+ * info->best_bits. Higher deltas (up to 7) are tried the same way only if no
+ * lower delta was accepted. Does nothing if there is no active first term.
+ */
+static void delta_mono(WavPackEncodeContext *s, WavPackExtraInfo *info)
+{
+    int lower = 0, delta, d;
+    uint32_t bits;
+
+    if (!s->decorr_passes[0].value)
+        return;
+    delta = s->decorr_passes[0].delta;
+
+    /* search downwards from delta - 1 */
+    for (d = delta - 1; d >= 0; d--) {
+        int i;
+
+        /* re-run all active terms with delta d; i ends at their count */
+        for (i = 0; i < info->nterms && s->decorr_passes[i].value; i++) {
+            info->dps[i].value = s->decorr_passes[i].value;
+            info->dps[i].delta = d;
+            decorr_mono_buffer(s->sampleptrs[i][0], s->sampleptrs[i+1][0],
+                               s->block_samples, info->dps, i);
+        }
+
+        bits = log2mono(s->sampleptrs[i][0], s->block_samples, info->log_limit);
+        if (bits >= info->best_bits)
+            break;
+
+        lower = 1;
+        info->best_bits = bits;
+        CLEAR(s->decorr_passes);
+        memcpy(s->decorr_passes, info->dps, sizeof(info->dps[0]) * i);
+        memcpy(s->sampleptrs[info->nterms + 1][0],  s->sampleptrs[i][0],
+               s->block_samples * 4);
+    }
+
+    /* search upwards only if no lower delta was an improvement */
+    for (d = delta + 1; !lower && d <= 7; d++) {
+        int i;
+
+        for (i = 0; i < info->nterms && s->decorr_passes[i].value; i++) {
+            info->dps[i].value = s->decorr_passes[i].value;
+            info->dps[i].delta = d;
+            decorr_mono_buffer(s->sampleptrs[i][0], s->sampleptrs[i+1][0],
+                               s->block_samples, info->dps, i);
+        }
+
+        bits = log2mono(s->sampleptrs[i][0], s->block_samples, info->log_limit);
+        if (bits >= info->best_bits)
+            break;
+
+        info->best_bits = bits;
+        CLEAR(s->decorr_passes);
+        memcpy(s->decorr_passes, info->dps, sizeof(info->dps[0]) * i);
+        memcpy(s->sampleptrs[info->nterms + 1][0], s->sampleptrs[i][0],
+               s->block_samples * 4);
+    }
+}
+
+/**
+ * Make sure the per-term sample buffers can hold the current block.
+ *
+ * Buffers 0 .. nterms + 1 are requested with block_samples * 4 bytes each;
+ * the second channel is only requested when WV_MONO_DATA is not set.
+ *
+ * @return 0 on success, AVERROR(ENOMEM) if a buffer is NULL after the request
+ */
+static int allocate_buffers2(WavPackEncodeContext *s, int nterms)
+{
+    const int nb_channels = (s->flags & WV_MONO_DATA) ? 1 : 2;
+    int idx, ch;
+
+    for (idx = 0; idx < nterms + 2; idx++) {
+        for (ch = 0; ch < nb_channels; ch++) {
+            av_fast_padded_malloc(&s->sampleptrs[idx][ch],
+                                  &s->sampleptrs_size[idx][ch],
+                                  s->block_samples * 4);
+            if (!s->sampleptrs[idx][ch])
+                return AVERROR(ENOMEM);
+        }
+    }
+
+    return 0;
+}
+
+/**
+ * Make sure the best-result and temporary sample buffers can hold the
+ * current block.
+ *
+ * One best_buffer per channel and two temp_buffer entries per channel are
+ * requested with block_samples * 4 bytes each; the second channel is only
+ * requested when WV_MONO_DATA is not set.
+ *
+ * @return 0 on success, AVERROR(ENOMEM) if a buffer is NULL after the request
+ */
+static int allocate_buffers(WavPackEncodeContext *s)
+{
+    const int nb_channels = (s->flags & WV_MONO_DATA) ? 1 : 2;
+    int i, ch;
+
+    /* best_buffer does not depend on the temp buffer index, so request it
+     * once per channel instead of once per temp buffer iteration */
+    for (ch = 0; ch < nb_channels; ch++) {
+        av_fast_padded_malloc(&s->best_buffer[ch], &s->best_buffer_size[ch],
+                              s->block_samples * 4);
+        if (!s->best_buffer[ch])
+            return AVERROR(ENOMEM);
+    }
+
+    for (i = 0; i < 2; i++) {
+        for (ch = 0; ch < nb_channels; ch++) {
+            av_fast_padded_malloc(&s->temp_buffer[i][ch],
+                                  &s->temp_buffer_size[i][ch],
+                                  s->block_samples * 4);
+            if (!s->temp_buffer[i][ch])
+                return AVERROR(ENOMEM);
+        }
+    }
+
+    return 0;
+}
+
+/**
+ * Refine the mono decorrelation terms of the current block using the
+ * searches selected by s->extra_flags.
+ *
+ * The current terms are run once to obtain a baseline score, then the
+ * branch search, term sorting and delta search are applied as enabled.
+ * On return s->decorr_passes holds the chosen terms and s->num_terms the
+ * number of leading terms with a non-zero value.
+ *
+ * @param s           encoder context
+ * @param samples     block input samples
+ * @param do_samples  if non-zero, samples is overwritten with the best
+ *                    residual found
+ */
+static void analyze_mono(WavPackEncodeContext *s, int32_t *samples, int do_samples)
+{
+    WavPackExtraInfo info;
+    int i;
+
+    /* limit handed to log2mono() by the searches: (magnitude field + 4) * 256,
+     * capped at 6912 */
+    info.log_limit = (((s->flags & MAG_MASK) >> MAG_LSB) + 4) * 256;
+    info.log_limit = FFMIN(6912, info.log_limit);
+
+    info.nterms = s->num_terms;
+
+    /* NOTE(review): an allocation failure makes this function return
+     * silently; the void return type gives the caller no way to see it */
+    if (allocate_buffers2(s, s->num_terms))
+        return;
+
+    memcpy(info.dps, s->decorr_passes, sizeof(info.dps));
+    memcpy(s->sampleptrs[0][0], samples, s->block_samples * 4);
+
+    /* baseline: run the current terms; i ends at the number of active terms */
+    for (i = 0; i < info.nterms && info.dps[i].value; i++)
+        decorr_mono(s->sampleptrs[i][0], s->sampleptrs[i + 1][0],
+                    s->block_samples, info.dps + i, 1);
+
+    /* NOTE(review): the score is taken from sampleptrs[info.nterms] while the
+     * copy below uses sampleptrs[i]; the two are the same buffer only when
+     * all nterms terms are active -- confirm this always holds here */
+    info.best_bits = log2mono(s->sampleptrs[info.nterms][0], s->block_samples, 0) * 1;
+    memcpy(s->sampleptrs[info.nterms + 1][0], s->sampleptrs[i][0], s->block_samples * 4);
+
+    if (s->extra_flags & EXTRA_BRANCHES)
+        recurse_mono(s, &info, 0, (int) floor(s->delta_decay + 0.5),
+                     log2mono(s->sampleptrs[0][0], s->block_samples, 0));
+
+    if (s->extra_flags & EXTRA_SORT_FIRST)
+        sort_mono(s, &info);
+
+    if (s->extra_flags & EXTRA_TRY_DELTAS) {
+        delta_mono(s, &info);
+
+        /* track the chosen delta as a running average, or reset it to 2 */
+        if ((s->extra_flags & EXTRA_ADJUST_DELTAS) && s->decorr_passes[0].value)
+            s->delta_decay = (float)((s->delta_decay * 2.0 + s->decorr_passes[0].delta) / 3.0);
+        else
+            s->delta_decay = 2.0;
+    }
+
+    if (s->extra_flags & EXTRA_SORT_LAST)
+        sort_mono(s, &info);
+
+    /* sampleptrs[nterms + 1] holds the residual of the best sequence found */
+    if (do_samples)
+        memcpy(samples, s->sampleptrs[info.nterms + 1][0], s->block_samples * 4);
+
+    /* count the leading active terms */
+    for (i = 0; i < info.nterms; i++)
+        if (!s->decorr_passes[i].value)
+            break;
+
+    s->num_terms = i;
+}
+
+/**
+ * Feed a buffer of samples through the GET_MED/INC_MED/DEC_MED macros for
+ * channel c (presumably adapting that channel's three median values).
+ *
+ * @param samples     sample buffer
+ * @param nb_samples  number of samples to scan
+ * @param dir         pointer step per sample; when negative, scanning starts
+ *                    at the last sample
+ */
+static void scan_word(WavPackEncodeContext *s, WvChannel *c,
+                      int32_t *samples, int nb_samples, int dir)
+{
+    if (dir < 0)
+        samples += nb_samples - 1;
+
+    for (; nb_samples--; samples += dir) {
+        uint32_t value = labs(*samples);
+        uint32_t low;
+
+        /* below the first median */
+        if (value < GET_MED(0)) {
+            DEC_MED(0);
+            continue;
+        }
+        low = GET_MED(0);
+        INC_MED(0);
+
+        /* between the first and second median */
+        if (value - low < GET_MED(1)) {
+            DEC_MED(1);
+            continue;
+        }
+        low += GET_MED(1);
+        INC_MED(1);
+
+        if (value - low < GET_MED(2)) {
+            DEC_MED(2);
+        } else {
+            INC_MED(2);
+        }
+    }
+}
+
+/**
+ * Choose the decorrelation setup for a mono block.
+ *
+ * Up to s->num_passes candidate specs from s->decorr_specs are tried; the
+ * one with the smallest log2mono() score is stored in s->decorr_passes,
+ * s->num_terms and s->best_decorr. If s->extra_flags is set the result is
+ * refined further by analyze_mono().
+ *
+ * @param s           encoder context
+ * @param samples     block input samples
+ * @param no_history  if non-zero, the spec search state and s->w are reset
+ * @param do_samples  if non-zero, samples is overwritten with the residual
+ * @return 0 on success, a negative error code if buffer allocation fails
+ */
+static int wv_mono(WavPackEncodeContext *s, int32_t *samples,
+                   int no_history, int do_samples)
+{
+    struct Decorr temp_decorr_pass, save_decorr_passes[MAX_TERMS] = {{0}};
+    int nb_samples = s->block_samples;
+    int buf_size = sizeof(int32_t) * nb_samples;
+    uint32_t best_size = UINT32_MAX, size;
+    int log_limit, pi, i, ret;
+
+    /* an all-zero block needs no decorrelation at all */
+    for (i = 0; i < nb_samples; i++)
+        if (samples[i])
+            break;
+
+    if (i == nb_samples) {
+        CLEAR(s->decorr_passes);
+        CLEAR(s->w);
+        s->num_terms = 0;
+        return 0;
+    }
+
+    /* limit handed to log2mono(): (magnitude field + 4) * 256, capped at 6912 */
+    log_limit = (((s->flags & MAG_MASK) >> MAG_LSB) + 4) * 256;
+    log_limit = FFMIN(6912, log_limit);
+
+    if ((ret = allocate_buffers(s)) < 0)
+        return ret;
+
+    if (no_history || s->num_passes >= 7)
+        s->best_decorr = s->mask_decorr = 0;
+
+    /* pi counts evaluated candidates; it is only advanced at the bottom */
+    for (pi = 0; pi < s->num_passes;) {
+        const WavPackDecorrSpec *wpds;
+        int nterms, c, j;
+
+        if (!pi) {
+            /* first pass re-evaluates the previous best spec */
+            c = s->best_decorr;
+        } else {
+            /* later passes derive a candidate from best_decorr and the mask */
+            if (s->mask_decorr == 0)
+                c = 0;
+            else
+                c = (s->best_decorr & (s->mask_decorr - 1)) | s->mask_decorr;
+
+            /* candidate already evaluated: advance the mask and retry
+             * without consuming a pass */
+            if (c == s->best_decorr) {
+                s->mask_decorr = s->mask_decorr ? ((s->mask_decorr << 1) & (s->num_decorrs - 1)) : 1;
+                continue;
+            }
+        }
+
+        wpds = &s->decorr_specs[c];
+        nterms = decorr_filter_nterms[s->decorr_filter];
+
+        /* evaluate the spec; if log2mono() reports the limit was reached
+         * (UINT32_MAX), retry with half as many terms */
+        while (1) {
+        memcpy(s->temp_buffer[0][0], samples, buf_size);
+        CLEAR(save_decorr_passes);
+
+        /* temp_buffer[0] and temp_buffer[1] alternate as input and output */
+        for (j = 0; j < nterms; j++) {
+            CLEAR(temp_decorr_pass);
+            temp_decorr_pass.delta = wpds->delta;
+            temp_decorr_pass.value = wpds->terms[j];
+
+            /* negative terms in the spec are replaced by term 1 here */
+            if (temp_decorr_pass.value < 0)
+                temp_decorr_pass.value = 1;
+
+            /* backward warm-up pass over at most 2048 samples */
+            decorr_mono(s->temp_buffer[j&1][0], s->temp_buffer[~j&1][0],
+                        FFMIN(nb_samples, 2048), &temp_decorr_pass, -1);
+
+            /* only the first term keeps (reversed) history from the warm-up */
+            if (j) {
+                CLEAR(temp_decorr_pass.samplesA);
+            } else {
+                reverse_mono_decorr(&temp_decorr_pass);
+            }
+
+            /* save the start state, then run the real forward pass */
+            memcpy(save_decorr_passes + j, &temp_decorr_pass, sizeof(struct Decorr));
+            decorr_mono(s->temp_buffer[j&1][0], s->temp_buffer[~j&1][0],
+                        nb_samples, &temp_decorr_pass, 1);
+        }
+
+        /* j == nterms here, so temp_buffer[j&1] is the last output */
+        size = log2mono(s->temp_buffer[j&1][0], nb_samples, log_limit);
+        if (size != UINT32_MAX || !nterms)
+            break;
+        nterms >>= 1;
+        }
+
+        if (size < best_size) {
+            memcpy(s->best_buffer[0], s->temp_buffer[j&1][0], buf_size);
+            memcpy(s->decorr_passes, save_decorr_passes, sizeof(struct Decorr) * MAX_TERMS);
+            s->num_terms = nterms;
+            s->best_decorr = c;
+            best_size = size;
+        }
+
+        /* the mask is left untouched after the very first pass */
+        if (pi++)
+            s->mask_decorr = s->mask_decorr ? ((s->mask_decorr << 1) & (s->num_decorrs - 1)) : 1;
+    }
+
+    if (s->extra_flags)
+        analyze_mono(s, samples, do_samples);
+    else if (do_samples)
+        memcpy(samples, s->best_buffer[0], buf_size);
+
+    /* rebuild the word state from the best residual, scanning backwards */
+    if (no_history || s->extra_flags) {
+        CLEAR(s->w);
+        scan_word(s, &s->w.c[0], s->best_buffer[0], nb_samples, -1);
+    }
+    return 0;
+}
+
+/**
+ * Run one stereo decorrelation term over a pair of channel buffers.
+ *
+ * Residuals are written to out_left/out_right; the weights in dpp are
+ * adapted after every sample and accumulated into dpp->sumA / dpp->sumB.
+ *
+ * Term values handled:
+ *  - 17, 18:      prediction extrapolated from the two previous samples of
+ *                 the same channel
+ *  - 2, default:  prediction taken from same-channel history
+ *  - -1, -2, -3:  prediction taken from the other channel
+ *
+ * @param in_left     left input samples
+ * @param in_right    right input samples
+ * @param out_left    destination for the left residuals
+ * @param out_right   destination for the right residuals
+ * @param nb_samples  number of samples per channel
+ * @param dpp         term state, updated in place
+ * @param dir         pointer step per sample; when negative, processing
+ *                    starts at the last sample of each buffer
+ */
+static void decorr_stereo(int32_t *in_left, int32_t *in_right,
+                          int32_t *out_left, int32_t *out_right,
+                          int nb_samples, struct Decorr *dpp, int dir)
+{
+    int m = 0, i;
+
+    dpp->sumA = dpp->sumB = 0;
+
+    if (dir < 0) {
+        out_left  += nb_samples - 1;
+        out_right += nb_samples - 1;
+        in_left   += nb_samples - 1;
+        in_right  += nb_samples - 1;
+    }
+
+    /* Round-trip weights and history through their stored forms;
+     * presumably so the encoder starts from the precision the stored
+     * values carry -- TODO confirm. */
+    dpp->weightA = restore_weight(store_weight(dpp->weightA));
+    dpp->weightB = restore_weight(store_weight(dpp->weightB));
+
+    for (i = 0; i < MAX_TERM; i++) {
+        dpp->samplesA[i] = wp_exp2(log2s(dpp->samplesA[i]));
+        dpp->samplesB[i] = wp_exp2(log2s(dpp->samplesB[i]));
+    }
+
+    switch (dpp->value) {
+    case 2:
+        /* predict from the sample two positions back; samples[0..1] act as
+         * a two-entry delay line */
+        while (nb_samples--) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[0];
+            dpp->samplesA[0] = dpp->samplesA[1];
+            out_left[0] = tmp = (dpp->samplesA[1] = in_left[0]) - APPLY_WEIGHT(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+            dpp->sumA += dpp->weightA;
+
+            sam = dpp->samplesB[0];
+            dpp->samplesB[0] = dpp->samplesB[1];
+            out_right[0] = tmp = (dpp->samplesB[1] = in_right[0]) - APPLY_WEIGHT(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+            dpp->sumB += dpp->weightB;
+
+            in_left   += dir;
+            out_left  += dir;
+            in_right  += dir;
+            out_right += dir;
+        }
+        break;
+    case 17:
+        /* prediction: 2 * s[0] - s[1] */
+        while (nb_samples--) {
+            int32_t sam, tmp;
+
+            sam = 2 * dpp->samplesA[0] - dpp->samplesA[1];
+            dpp->samplesA[1] = dpp->samplesA[0];
+            out_left[0] = tmp = (dpp->samplesA[0] = in_left[0]) - APPLY_WEIGHT(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+            dpp->sumA += dpp->weightA;
+
+            sam = 2 * dpp->samplesB[0] - dpp->samplesB[1];
+            dpp->samplesB[1] = dpp->samplesB[0];
+            out_right[0] = tmp = (dpp->samplesB[0] = in_right[0]) - APPLY_WEIGHT (dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+            dpp->sumB += dpp->weightB;
+
+            in_left   += dir;
+            out_left  += dir;
+            in_right  += dir;
+            out_right += dir;
+        }
+        break;
+    case 18:
+        /* prediction: s[0] + ((s[0] - s[1]) >> 1) */
+        while (nb_samples--) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[0] + ((dpp->samplesA[0] - dpp->samplesA[1]) >> 1);
+            dpp->samplesA[1] = dpp->samplesA[0];
+            out_left[0] = tmp = (dpp->samplesA[0] = in_left[0]) - APPLY_WEIGHT(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+            dpp->sumA += dpp->weightA;
+
+            sam = dpp->samplesB[0] + ((dpp->samplesB[0] - dpp->samplesB[1]) >> 1);
+            dpp->samplesB[1] = dpp->samplesB[0];
+            out_right[0] = tmp = (dpp->samplesB[0] = in_right[0]) - APPLY_WEIGHT(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+            dpp->sumB += dpp->weightB;
+
+            in_left   += dir;
+            out_left  += dir;
+            in_right  += dir;
+            out_right += dir;
+        }
+        break;
+    default: {
+        /* any other term: the history arrays are circular buffers with
+         * read index m and write index k (dpp->value slots ahead) */
+        int k = dpp->value & (MAX_TERM - 1);
+
+        while (nb_samples--) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[m];
+            out_left[0] = tmp = (dpp->samplesA[k] = in_left[0]) - APPLY_WEIGHT(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+            dpp->sumA += dpp->weightA;
+
+            sam = dpp->samplesB[m];
+            out_right[0] = tmp = (dpp->samplesB[k] = in_right[0]) - APPLY_WEIGHT(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+            dpp->sumB += dpp->weightB;
+
+            in_left   += dir;
+            out_left  += dir;
+            in_right  += dir;
+            out_right += dir;
+            m = (m + 1) & (MAX_TERM - 1);
+            k = (k + 1) & (MAX_TERM - 1);
+        }
+
+        /* rotate both buffers so the next read position is at index 0 */
+        if (m) {
+            int32_t temp_A[MAX_TERM], temp_B[MAX_TERM];
+            int k;
+
+            memcpy(temp_A, dpp->samplesA, sizeof(dpp->samplesA));
+            memcpy(temp_B, dpp->samplesB, sizeof(dpp->samplesB));
+
+            for (k = 0; k < MAX_TERM; k++) {
+                dpp->samplesA[k] = temp_A[m];
+                dpp->samplesB[k] = temp_B[m];
+                m = (m + 1) & (MAX_TERM - 1);
+            }
+        }
+        break;
+        }
+    case -1:
+        /* left is predicted from the previous right input (samplesA[0]),
+         * right from the current left input */
+        while (nb_samples--) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_A = dpp->samplesA[0];
+            out_left[0] = tmp = (sam_B = in_left[0]) - APPLY_WEIGHT(dpp->weightA, sam_A);
+            UPDATE_WEIGHT_CLIP(dpp->weightA, dpp->delta, sam_A, tmp);
+            dpp->sumA += dpp->weightA;
+
+            out_right[0] = tmp = (dpp->samplesA[0] = in_right[0]) - APPLY_WEIGHT(dpp->weightB, sam_B);
+            UPDATE_WEIGHT_CLIP(dpp->weightB, dpp->delta, sam_B, tmp);
+            dpp->sumB += dpp->weightB;
+
+            in_left   += dir;
+            out_left  += dir;
+            in_right  += dir;
+            out_right += dir;
+        }
+        break;
+    case -2:
+        /* right is predicted from the previous left input (samplesB[0]),
+         * left from the current right input */
+        while (nb_samples--) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_B = dpp->samplesB[0];
+            out_right[0] = tmp = (sam_A = in_right[0]) - APPLY_WEIGHT(dpp->weightB, sam_B);
+            UPDATE_WEIGHT_CLIP(dpp->weightB, dpp->delta, sam_B, tmp);
+            dpp->sumB += dpp->weightB;
+
+            out_left[0] = tmp = (dpp->samplesB[0] = in_left[0]) - APPLY_WEIGHT(dpp->weightA, sam_A);
+            UPDATE_WEIGHT_CLIP(dpp->weightA, dpp->delta, sam_A, tmp);
+            dpp->sumA += dpp->weightA;
+
+            in_left   += dir;
+            out_left  += dir;
+            in_right  += dir;
+            out_right += dir;
+        }
+        break;
+    case -3:
+        /* each channel is predicted from the previous input of the other
+         * channel: samplesA[0] holds the last right input, samplesB[0] the
+         * last left input */
+        while (nb_samples--) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_A = dpp->samplesA[0];
+            sam_B = dpp->samplesB[0];
+
+            dpp->samplesA[0] = tmp = in_right[0];
+            out_right[0] = tmp -= APPLY_WEIGHT(dpp->weightB, sam_B);
+            UPDATE_WEIGHT_CLIP(dpp->weightB, dpp->delta, sam_B, tmp);
+            dpp->sumB += dpp->weightB;
+
+            dpp->samplesB[0] = tmp = in_left[0];
+            out_left[0] = tmp -= APPLY_WEIGHT(dpp->weightA, sam_A);
+            UPDATE_WEIGHT_CLIP(dpp->weightA, dpp->delta, sam_A, tmp);
+            dpp->sumA += dpp->weightA;
+
+            in_left   += dir;
+            out_left  += dir;
+            in_right  += dir;
+            out_right += dir;
+        }
+        break;
+    }
+}
+
+/**
+ * Reverse the sample history of a stereo decorrelation term.
+ *
+ * For terms above MAX_TERM the two history samples of each channel are
+ * replaced by two values extrapolated from them; for terms 2..MAX_TERM the
+ * first dpp->value history entries of both channels are reversed in place.
+ */
+static void reverse_decorr(struct Decorr *dpp)
+{
+    const int term = dpp->value;
+
+    if (term > MAX_TERM) {
+        const int odd = term & 1;
+        const int32_t a_newest = dpp->samplesA[0], a_older = dpp->samplesA[1];
+        const int32_t b_newest = dpp->samplesB[0], b_older = dpp->samplesB[1];
+        int32_t a_first, b_first, a_second, b_second;
+
+        /* first extrapolation step, from the stored pairs */
+        if (odd) {
+            a_first = 2 * a_newest - a_older;
+            b_first = 2 * b_newest - b_older;
+        } else {
+            a_first = (3 * a_newest - a_older) >> 1;
+            b_first = (3 * b_newest - b_older) >> 1;
+        }
+
+        /* second step, from the new values and the former newest samples */
+        if (odd) {
+            a_second = 2 * a_first - a_newest;
+            b_second = 2 * b_first - b_newest;
+        } else {
+            a_second = (3 * a_first - a_newest) >> 1;
+            b_second = (3 * b_first - b_newest) >> 1;
+        }
+
+        dpp->samplesA[0] = a_first;
+        dpp->samplesB[0] = b_first;
+        dpp->samplesA[1] = a_second;
+        dpp->samplesB[1] = b_second;
+    } else if (term > 1) {
+        int lo = 0, hi = term - 1, n;
+
+        for (n = 0; n < term / 2; n++, lo++, hi--) {
+            int32_t tmp;
+
+            lo &= (MAX_TERM - 1);
+            hi &= (MAX_TERM - 1);
+
+            tmp               = dpp->samplesA[lo];
+            dpp->samplesA[lo] = dpp->samplesA[hi];
+            dpp->samplesA[hi] = tmp;
+
+            tmp               = dpp->samplesB[lo];
+            dpp->samplesB[lo] = dpp->samplesB[hi];
+            dpp->samplesB[hi] = tmp;
+        }
+    }
+}
+
+/**
+ * Forward-only variant of the stereo decorrelation pass.
+ *
+ * Handles the same term values as decorr_stereo(), but always walks the
+ * buffers from index 0 upwards, applies the weight with APPLY_WEIGHT_I and
+ * does not touch dpp->sumA / dpp->sumB.
+ *
+ * @param in_left     left input samples
+ * @param in_right    right input samples
+ * @param out_left    destination for the left residuals
+ * @param out_right   destination for the right residuals
+ * @param nb_samples  number of samples per channel
+ * @param dpp         term state (value, delta, weights, history), updated
+ *                    in place
+ */
+static void decorr_stereo_quick(int32_t *in_left,  int32_t *in_right,
+                                int32_t *out_left, int32_t *out_right,
+                                int nb_samples, struct Decorr *dpp)
+{
+    int m = 0, i;
+
+    /* Round-trip weights and history through their stored forms;
+     * presumably so the encoder starts from the precision the stored
+     * values carry -- TODO confirm. */
+    dpp->weightA = restore_weight(store_weight(dpp->weightA));
+    dpp->weightB = restore_weight(store_weight(dpp->weightB));
+
+    for (i = 0; i < MAX_TERM; i++) {
+        dpp->samplesA[i] = wp_exp2(log2s(dpp->samplesA[i]));
+        dpp->samplesB[i] = wp_exp2(log2s(dpp->samplesB[i]));
+    }
+
+    switch (dpp->value) {
+    case 2:
+        /* predict from the sample two positions back; samples[0..1] act as
+         * a two-entry delay line */
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[0];
+            dpp->samplesA[0] = dpp->samplesA[1];
+            out_left[i] = tmp = (dpp->samplesA[1] = in_left[i]) - APPLY_WEIGHT_I(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = dpp->samplesB[0];
+            dpp->samplesB[0] = dpp->samplesB[1];
+            out_right[i] = tmp = (dpp->samplesB[1] = in_right[i]) - APPLY_WEIGHT_I(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+        }
+        break;
+    case 17:
+        /* prediction: 2 * s[0] - s[1] */
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = 2 * dpp->samplesA[0] - dpp->samplesA[1];
+            dpp->samplesA[1] = dpp->samplesA[0];
+            out_left[i] = tmp = (dpp->samplesA[0] = in_left[i]) - APPLY_WEIGHT_I(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = 2 * dpp->samplesB[0] - dpp->samplesB[1];
+            dpp->samplesB[1] = dpp->samplesB[0];
+            out_right[i] = tmp = (dpp->samplesB[0] = in_right[i]) - APPLY_WEIGHT_I(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+        }
+        break;
+    case 18:
+        /* prediction: s[0] + ((s[0] - s[1]) >> 1) */
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[0] + ((dpp->samplesA[0] - dpp->samplesA[1]) >> 1);
+            dpp->samplesA[1] = dpp->samplesA[0];
+            out_left[i] = tmp = (dpp->samplesA[0] = in_left[i]) - APPLY_WEIGHT_I(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = dpp->samplesB[0] + ((dpp->samplesB[0] - dpp->samplesB[1]) >> 1);
+            dpp->samplesB[1] = dpp->samplesB[0];
+            out_right[i] = tmp = (dpp->samplesB[0] = in_right[i]) - APPLY_WEIGHT_I(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+        }
+        break;
+    default: {
+        /* any other term: the history arrays are circular buffers with
+         * read index m and write index k (dpp->value slots ahead) */
+        int k = dpp->value & (MAX_TERM - 1);
+
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[m];
+            out_left[i] = tmp = (dpp->samplesA[k] = in_left[i]) - APPLY_WEIGHT_I(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = dpp->samplesB[m];
+            out_right[i] = tmp = (dpp->samplesB[k] = in_right[i]) - APPLY_WEIGHT_I(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+
+            m = (m + 1) & (MAX_TERM - 1);
+            k = (k + 1) & (MAX_TERM - 1);
+        }
+
+        /* rotate both buffers so the next read position is at index 0 */
+        if (m) {
+            int32_t temp_A[MAX_TERM], temp_B[MAX_TERM];
+            int k;
+
+            memcpy(temp_A, dpp->samplesA, sizeof(dpp->samplesA));
+            memcpy(temp_B, dpp->samplesB, sizeof(dpp->samplesB));
+
+            for (k = 0; k < MAX_TERM; k++) {
+                dpp->samplesA[k] = temp_A[m];
+                dpp->samplesB[k] = temp_B[m];
+                m = (m + 1) & (MAX_TERM - 1);
+            }
+        }
+        break;
+    }
+    case -1:
+        /* left is predicted from the previous right input (samplesA[0]),
+         * right from the current left input */
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_A = dpp->samplesA[0];
+            out_left[i] = tmp = (sam_B = in_left[i]) - APPLY_WEIGHT_I(dpp->weightA, sam_A);
+            UPDATE_WEIGHT_CLIP(dpp->weightA, dpp->delta, sam_A, tmp);
+
+            out_right[i] = tmp = (dpp->samplesA[0] = in_right[i]) - APPLY_WEIGHT_I(dpp->weightB, sam_B);
+            UPDATE_WEIGHT_CLIP(dpp->weightB, dpp->delta, sam_B, tmp);
+        }
+        break;
+    case -2:
+        /* right is predicted from the previous left input (samplesB[0]),
+         * left from the current right input */
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_B = dpp->samplesB[0];
+            out_right[i] = tmp = (sam_A = in_right[i]) - APPLY_WEIGHT_I(dpp->weightB, sam_B);
+            UPDATE_WEIGHT_CLIP(dpp->weightB, dpp->delta, sam_B, tmp);
+
+            out_left[i] = tmp = (dpp->samplesB[0] = in_left[i]) - APPLY_WEIGHT_I(dpp->weightA, sam_A);
+            UPDATE_WEIGHT_CLIP(dpp->weightA, dpp->delta, sam_A, tmp);
+        }
+        break;
+    case -3:
+        /* each channel is predicted from the previous input of the other
+         * channel: samplesA[0] holds the last right input, samplesB[0] the
+         * last left input */
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_A = dpp->samplesA[0];
+            sam_B = dpp->samplesB[0];
+
+            dpp->samplesA[0] = tmp = in_right[i];
+            out_right[i] = tmp -= APPLY_WEIGHT_I(dpp->weightB, sam_B);
+            UPDATE_WEIGHT_CLIP(dpp->weightB, dpp->delta, sam_B, tmp);
+
+            dpp->samplesB[0] = tmp = in_left[i];
+            out_left[i] = tmp -= APPLY_WEIGHT_I(dpp->weightA, sam_A);
+            UPDATE_WEIGHT_CLIP(dpp->weightA, dpp->delta, sam_A, tmp);
+        }
+        break;
+    }
+}
+
+/* Prepare decorrelation term tindex of info->dps and filter in_* into out_*. */
+static void decorr_stereo_buffer(WavPackExtraInfo *info,
+                                 int32_t *in_left,  int32_t *in_right,
+                                 int32_t *out_left, int32_t *out_right,
+                                 int nb_samples, int tindex)
+{
+    struct Decorr *const slot = &info->dps[tindex];
+    const int final_delta     = slot->delta;
+    const int warmup_len      = FFMIN(2048, nb_samples);
+    struct Decorr work        = {0};
+
+    /* First pass: at most 2048 samples, dir argument -1, with an adjusted
+     * delta (7 stays 7, anything below 2 becomes 3, otherwise delta + 1). */
+    work.value = slot->value;
+    work.delta = final_delta == 7 ? 7 : final_delta < 2 ? 3 : final_delta + 1;
+    decorr_stereo(in_left, in_right, out_left, out_right, warmup_len, &work, -1);
+    work.delta = final_delta;
+
+    /* Term 0 goes through reverse_decorr(); later terms get CLEAR()ed samples. */
+    if (tindex) {
+        CLEAR(work.samplesA);
+        CLEAR(work.samplesB);
+    } else {
+        reverse_decorr(&work);
+    }
+
+    /* Copy the resulting samples and weights into the info->dps slot. */
+    memcpy(slot->samplesA, work.samplesA, sizeof(work.samplesA));
+    memcpy(slot->samplesB, work.samplesB, sizeof(work.samplesB));
+    slot->weightA = work.weightA;
+    slot->weightB = work.weightB;
+
+    /* delta 0: run once at delta 1, then use sumA/sumB divided by nb_samples
+     * as the weights and restore samplesA/B from the slot. */
+    if (!final_delta) {
+        work.delta = 1;
+        decorr_stereo(in_left, in_right, out_left, out_right, nb_samples, &work, 1);
+        work.delta = 0;
+        memcpy(work.samplesA, slot->samplesA, sizeof(work.samplesA));
+        memcpy(work.samplesB, slot->samplesB, sizeof(work.samplesB));
+        slot->weightA = work.weightA = work.sumA / nb_samples;
+        slot->weightB = work.weightB = work.sumB / nb_samples;
+    }
+
+    /* Full-length pass; gt16bit selects decorr_stereo() over the quick variant. */
+    if (!info->gt16bit) {
+        decorr_stereo_quick(in_left, in_right, out_left, out_right, nb_samples, &work);
+        return;
+    }
+    decorr_stereo(in_left, in_right, out_left, out_right, nb_samples, &work, 1);
+}
+/* Try swapping neighbouring decorrelation terms; a swap is kept when it lowers the log2stereo() result. */
+static void sort_stereo(WavPackEncodeContext *s, WavPackExtraInfo *info)
+{
+    int reversed = 1;
+    uint32_t bits;
+
+    while (reversed) { /* sweep again as long as the previous sweep kept a swap */
+        int ri, i;
+
+        memcpy(info->dps, s->decorr_passes, sizeof(s->decorr_passes));
+        reversed = 0;
+
+        for (ri = 0; ri < info->nterms && s->decorr_passes[ri].value; ri++) {
+            /* stop when there is no active term after ri to swap with */
+            if (ri + 1 >= info->nterms || !s->decorr_passes[ri+1].value)
+                break;
+            /* equal neighbours: nothing to swap, just run term ri and move on */
+            if (s->decorr_passes[ri].value == s->decorr_passes[ri+1].value) {
+                decorr_stereo_buffer(info,
+                                     s->sampleptrs[ri  ][0], s->sampleptrs[ri  ][1],
+                                     s->sampleptrs[ri+1][0], s->sampleptrs[ri+1][1],
+                                     s->block_samples, ri);
+                continue;
+            }
+            /* candidate: terms ri and ri+1 exchanged in info->dps */
+            info->dps[ri  ] = s->decorr_passes[ri+1];
+            info->dps[ri+1] = s->decorr_passes[ri  ];
+            /* re-run the chain from ri; afterwards sampleptrs[i] holds its final output */
+            for (i = ri; i < info->nterms && s->decorr_passes[i].value; i++)
+                decorr_stereo_buffer(info,
+                                     s->sampleptrs[i  ][0], s->sampleptrs[i  ][1],
+                                     s->sampleptrs[i+1][0], s->sampleptrs[i+1][1],
+                                     s->block_samples, i);
+
+            bits = log2stereo(s->sampleptrs[i][0], s->sampleptrs[i][1],
+                              s->block_samples, info->log_limit);
+
+            if (bits < info->best_bits) {
+                reversed = 1; /* keep the swap: adopt info->dps and save the output buffers */
+                info->best_bits = bits;
+                CLEAR(s->decorr_passes);
+                memcpy(s->decorr_passes, info->dps, sizeof(info->dps[0]) * i);
+                memcpy(s->sampleptrs[info->nterms + 1][0],
+                       s->sampleptrs[i][0], s->block_samples * 4);
+                memcpy(s->sampleptrs[info->nterms + 1][1],
+                       s->sampleptrs[i][1], s->block_samples * 4);
+            } else { /* undo the swap and redo term ri in the original order */
+                info->dps[ri  ] = s->decorr_passes[ri  ];
+                info->dps[ri+1] = s->decorr_passes[ri+1];
+                decorr_stereo_buffer(info,
+                                     s->sampleptrs[ri  ][0], s->sampleptrs[ri  ][1],
+                                     s->sampleptrs[ri+1][0], s->sampleptrs[ri+1][1],
+                                     s->block_samples, ri);
+            }
+        }
+    }
+}
+
+/* Run each active term with delta d via decorr_stereo_buffer(); returns the
+ * term count and stores log2stereo() of the final output buffers in *bits. */
+static int delta_stereo_eval(WavPackEncodeContext *s, WavPackExtraInfo *info,
+                             int d, uint32_t *bits)
+{
+    int n = 0;
+
+    while (n < info->nterms && s->decorr_passes[n].value) {
+        info->dps[n].value = s->decorr_passes[n].value;
+        info->dps[n].delta = d;
+        decorr_stereo_buffer(info,
+                             s->sampleptrs[n    ][0], s->sampleptrs[n    ][1],
+                             s->sampleptrs[n + 1][0], s->sampleptrs[n + 1][1],
+                             s->block_samples, n);
+        n++;
+    }
+    *bits = log2stereo(s->sampleptrs[n][0], s->sampleptrs[n][1],
+                       s->block_samples, info->log_limit);
+    return n;
+}
+
+/* Adopt the first n entries of info->dps as the new best passes and save
+ * their output buffers in sampleptrs[info->nterms + 1]. */
+static void delta_stereo_keep(WavPackEncodeContext *s, WavPackExtraInfo *info,
+                              int n, uint32_t bits)
+{
+    info->best_bits = bits;
+    CLEAR(s->decorr_passes);
+    memcpy(s->decorr_passes, info->dps, sizeof(info->dps[0]) * n);
+    memcpy(s->sampleptrs[info->nterms + 1][0], s->sampleptrs[n][0], s->block_samples * 4);
+    memcpy(s->sampleptrs[info->nterms + 1][1], s->sampleptrs[n][1], s->block_samples * 4);
+}
+
+/* Try other delta values for all terms: step down from the current delta
+ * while log2stereo() keeps shrinking; if no lower value won, step up to 7. */
+static void delta_stereo(WavPackEncodeContext *s, WavPackExtraInfo *info)
+{
+    int start, d, n, improved = 0;
+    uint32_t bits;
+
+    if (!s->decorr_passes[0].value)
+        return;
+    start = s->decorr_passes[0].delta;
+
+    for (d = start - 1; d >= 0; d--) {
+        n = delta_stereo_eval(s, info, d, &bits);
+        if (bits >= info->best_bits)
+            break;
+        delta_stereo_keep(s, info, n, bits);
+        improved = 1;
+    }
+
+    for (d = start + 1; !improved && d <= 7; d++) {
+        n = delta_stereo_eval(s, info, d, &bits);
+        if (bits >= info->best_bits)
+            break;
+        delta_stereo_keep(s, info, n, bits);
+    }
+}
+/* Search decorrelation terms for level depth and recurse into the cheapest ones; improvements land in s->decorr_passes and info->best_bits. */
+static void recurse_stereo(WavPackEncodeContext *s, WavPackExtraInfo *info,
+                           int depth, int delta, uint32_t input_bits)
+{
+    int term, branches = s->num_branches - depth;
+    int32_t *in_left, *in_right, *out_left, *out_right;
+    uint32_t term_bits[22], bits; /* term_bits[term + 3]: cost per term; 0 = skipped or already used */
+
+    if (branches < 1 || depth + 1 == info->nterms)
+        branches = 1;
+
+    CLEAR(term_bits);
+    in_left   = s->sampleptrs[depth    ][0];
+    in_right  = s->sampleptrs[depth    ][1];
+    out_left  = s->sampleptrs[depth + 1][0];
+    out_right = s->sampleptrs[depth + 1][1];
+    /* Phase 1: run every permitted term at this level and record its log2stereo() cost. */
+    for (term = -3; term <= 18; term++) {
+        if (!term || (term > 8 && term < 17)) /* 0 and 9..16 are never tried */
+            continue;
+
+        if (term == 17 && branches == 1 && depth + 1 < info->nterms)
+            continue;
+
+        if (term == -1 || term == -2) /* these two are only tried with WV_CROSS_DECORR */
+            if (!(s->flags & WV_CROSS_DECORR))
+                continue;
+
+        if (!s->extra_flags && (term > 4 && term < 17)) /* effectively 5..8, as 9..16 were skipped above */
+            continue;
+
+        info->dps[depth].value = term;
+        info->dps[depth].delta = delta;
+        decorr_stereo_buffer(info, in_left, in_right, out_left, out_right,
+                             s->block_samples, depth);
+        bits = log2stereo(out_left, out_right, s->block_samples, info->log_limit);
+
+        if (bits < info->best_bits) { /* new overall best: keep passes 0..depth and the output */
+            info->best_bits = bits;
+            CLEAR(s->decorr_passes);
+            memcpy(s->decorr_passes, info->dps, sizeof(info->dps[0]) * (depth + 1));
+            memcpy(s->sampleptrs[info->nterms + 1][0], s->sampleptrs[depth + 1][0],
+                   s->block_samples * 4);
+            memcpy(s->sampleptrs[info->nterms + 1][1], s->sampleptrs[depth + 1][1],
+                   s->block_samples * 4);
+        }
+
+        term_bits[term + 3] = bits;
+    }
+    /* Phase 2: descend into at most branches terms, cheapest first, that cost less than input_bits. */
+    while (depth + 1 < info->nterms && branches--) {
+        uint32_t local_best_bits = input_bits;
+        int best_term = 0, i;
+
+        for (i = 0; i < 22; i++)
+            if (term_bits[i] && term_bits[i] < local_best_bits) {
+                local_best_bits = term_bits[i];
+                best_term = i - 3;
+            }
+
+        if (!best_term) /* no remaining term beats input_bits */
+            break;
+
+        term_bits[best_term + 3] = 0; /* do not pick this term again */
+
+        info->dps[depth].value = best_term;
+        info->dps[depth].delta = delta;
+        decorr_stereo_buffer(info, in_left, in_right, out_left, out_right,
+                             s->block_samples, depth);
+
+        recurse_stereo(s, info, depth + 1, delta, local_best_bits);
+    }
+}
+
+/* Refine s->decorr_passes and s->num_terms for one block using the searches
+ * enabled in s->extra_flags; do_samples copies the saved result to in_left/in_right. */
+static void analyze_stereo(WavPackEncodeContext *s,
+                           int32_t *in_left, int32_t *in_right,
+                           int do_samples)
+{
+    WavPackExtraInfo info;
+    int n;
+
+    info.gt16bit   = ((s->flags & MAG_MASK) >> MAG_LSB) >= 16;
+    info.log_limit = (((s->flags & MAG_MASK) >> MAG_LSB) + 4) * 256;
+    info.log_limit = FFMIN(6912, info.log_limit);
+    info.nterms    = s->num_terms;
+
+    if (allocate_buffers2(s, s->num_terms))
+        return;
+
+    memcpy(info.dps, s->decorr_passes, sizeof(info.dps));
+    memcpy(s->sampleptrs[0][0], in_left,  s->block_samples * 4);
+    memcpy(s->sampleptrs[0][1], in_right, s->block_samples * 4);
+
+    /* Run the current passes in sequence: pass n reads sampleptrs[n], writes sampleptrs[n + 1]. */
+    for (n = 0; n < info.nterms && info.dps[n].value; n++) {
+        if (!info.gt16bit) {
+            decorr_stereo_quick(s->sampleptrs[n    ][0], s->sampleptrs[n    ][1],
+                                s->sampleptrs[n + 1][0], s->sampleptrs[n + 1][1],
+                                s->block_samples, &info.dps[n]);
+            continue;
+        }
+        decorr_stereo(s->sampleptrs[n    ][0], s->sampleptrs[n    ][1],
+                      s->sampleptrs[n + 1][0], s->sampleptrs[n + 1][1],
+                      s->block_samples, &info.dps[n], 1);
+    }
+
+    /* NOTE(review): best_bits is measured on sampleptrs[nterms] while sampleptrs[n] is
+     * what gets saved; they differ if the loop stopped before nterms -- confirm intended. */
+    info.best_bits = log2stereo(s->sampleptrs[info.nterms][0], s->sampleptrs[info.nterms][1],
+                                s->block_samples, 0);
+    memcpy(s->sampleptrs[info.nterms + 1][0], s->sampleptrs[n][0], s->block_samples * 4);
+    memcpy(s->sampleptrs[info.nterms + 1][1], s->sampleptrs[n][1], s->block_samples * 4);
+
+    if (s->extra_flags & EXTRA_BRANCHES)
+        recurse_stereo(s, &info, 0, (int) floor(s->delta_decay + 0.5),
+                       log2stereo(s->sampleptrs[0][0], s->sampleptrs[0][1],
+                                  s->block_samples, 0));
+    if (s->extra_flags & EXTRA_SORT_FIRST)
+        sort_stereo(s, &info);
+
+    if (s->extra_flags & EXTRA_TRY_DELTAS) {
+        delta_stereo(s, &info);
+        if (!(s->extra_flags & EXTRA_ADJUST_DELTAS) || !s->decorr_passes[0].value)
+            s->delta_decay = 2.0;
+        else
+            s->delta_decay = (float)((s->delta_decay * 2.0 + s->decorr_passes[0].delta) / 3.0);
+    }
+
+    if (s->extra_flags & EXTRA_SORT_LAST)
+        sort_stereo(s, &info);
+
+    if (do_samples) {
+        memcpy(in_left,  s->sampleptrs[info.nterms + 1][0], s->block_samples * 4);
+        memcpy(in_right, s->sampleptrs[info.nterms + 1][1], s->block_samples * 4);
+    }
+
+    for (n = 0; n < info.nterms && s->decorr_passes[n].value; n++)
+        ; /* count the leading passes whose value is non-zero */
+    s->num_terms = n;
+}
+
+/* Pick the decorrelation spec for one stereo block: results go to
+ * s->decorr_passes, s->num_terms, s->best_decorr and the WV_JOINT_STEREO flag;
+ * with do_samples the sample buffers are overwritten with the chosen output.
+ * Returns 0, or a negative error code when a buffer allocation fails. */
+static int wv_stereo(WavPackEncodeContext *s,
+                     int32_t *samples_l, int32_t *samples_r,
+                     int no_history, int do_samples)
+{
+    struct Decorr temp_decorr_pass, save_decorr_passes[MAX_TERMS] = {{0}};
+    int nb_samples = s->block_samples, ret;
+    int buf_size = sizeof(int32_t) * nb_samples;
+    int log_limit, force_js = 0, force_ts = 0, got_js = 0, pi, i;
+    uint32_t best_size = UINT32_MAX, size;
+    /* A block in which every sample is zero gets no decorrelation passes. */
+    for (i = 0; i < nb_samples; i++)
+        if (samples_l[i] || samples_r[i])
+            break;
+
+    if (i == nb_samples) {
+        s->flags &= ~((uint32_t) WV_JOINT_STEREO);
+        CLEAR(s->decorr_passes);
+        CLEAR(s->w);
+        s->num_terms = 0;
+        return 0;
+    }
+
+    log_limit = (((s->flags & MAG_MASK) >> MAG_LSB) + 4) * 256;
+    log_limit = FFMIN(6912, log_limit);
+
+    if (s->joint != -1) { /* any other value forces joint stereo on (non-zero) or off (zero) */
+        force_js =  s->joint;
+        force_ts = !s->joint;
+    }
+
+    if ((ret = allocate_buffers(s)) < 0)
+        return ret;
+
+    if (no_history || s->num_passes >= 7)
+        s->best_decorr = s->mask_decorr = 0;
+
+    for (pi = 0; pi < s->num_passes;) {
+        const WavPackDecorrSpec *wpds;
+        int nterms, c, j;
+        /* Pass 0 uses s->best_decorr; later passes derive c from best_decorr and mask_decorr. */
+        if (!pi)
+            c = s->best_decorr;
+        else {
+            if (s->mask_decorr == 0)
+                c = 0;
+            else
+                c = (s->best_decorr & (s->mask_decorr - 1)) | s->mask_decorr;
+
+            if (c == s->best_decorr) { /* same as the current best: advance the mask, do not count a pass */
+                s->mask_decorr = s->mask_decorr ? ((s->mask_decorr << 1) & (s->num_decorrs - 1)) : 1;
+                continue;
+            }
+        }
+
+        wpds = &s->decorr_specs[c];
+        nterms = decorr_filter_nterms[s->decorr_filter];
+        /* Evaluate spec c, halving nterms for as long as log2stereo() returns UINT32_MAX. */
+        while (1) {
+            if (force_js || (wpds->joint_stereo && !force_ts)) {
+                if (!got_js) { /* computed once: left = l - r, right = r + ((l - r) >> 1) */
+                    av_fast_padded_malloc(&s->js_left,  &s->js_left_size,  buf_size);
+                    av_fast_padded_malloc(&s->js_right, &s->js_right_size, buf_size);
+                    if (!s->js_left || !s->js_right) /* pointer is NULL after a failed allocation */
+                        return AVERROR(ENOMEM);
+                    memcpy(s->js_left,  samples_l, buf_size);
+                    memcpy(s->js_right, samples_r, buf_size);
+                    for (i = 0; i < nb_samples; i++)
+                        s->js_right[i] += ((s->js_left[i] -= s->js_right[i]) >> 1);
+                    got_js = 1;
+                }
+
+                memcpy(s->temp_buffer[0][0], s->js_left,  buf_size);
+                memcpy(s->temp_buffer[0][1], s->js_right, buf_size);
+            } else {
+                memcpy(s->temp_buffer[0][0], samples_l, buf_size);
+                memcpy(s->temp_buffer[0][1], samples_r, buf_size);
+            }
+
+            CLEAR(save_decorr_passes);
+            for (j = 0; j < nterms; j++) { /* term j reads temp_buffer[j & 1], writes temp_buffer[~j & 1] */
+                CLEAR(temp_decorr_pass);
+                temp_decorr_pass.delta = wpds->delta;
+                temp_decorr_pass.value = wpds->terms[j];
+                if (temp_decorr_pass.value < 0 && !(s->flags & WV_CROSS_DECORR))
+                    temp_decorr_pass.value = -3;
+                decorr_stereo(s->temp_buffer[ j&1][0], s->temp_buffer[ j&1][1],
+                              s->temp_buffer[~j&1][0], s->temp_buffer[~j&1][1],
+                              FFMIN(2048, nb_samples), &temp_decorr_pass, -1);
+
+                if (j) {
+                    CLEAR(temp_decorr_pass.samplesA);
+                    CLEAR(temp_decorr_pass.samplesB);
+                } else {
+                    reverse_decorr(&temp_decorr_pass);
+                }
+
+                memcpy(save_decorr_passes + j, &temp_decorr_pass, sizeof(struct Decorr));
+                if (((s->flags & MAG_MASK) >> MAG_LSB) >= 16)
+                    decorr_stereo(s->temp_buffer[ j&1][0], s->temp_buffer[ j&1][1],
+                                  s->temp_buffer[~j&1][0], s->temp_buffer[~j&1][1],
+                                  nb_samples, &temp_decorr_pass, 1);
+                else
+                    decorr_stereo_quick(s->temp_buffer[ j&1][0], s->temp_buffer[ j&1][1],
+                                        s->temp_buffer[~j&1][0], s->temp_buffer[~j&1][1],
+                                        nb_samples, &temp_decorr_pass);
+            }
+            size = log2stereo(s->temp_buffer[j&1][0], s->temp_buffer[j&1][1],
+                              nb_samples, log_limit);
+            if (size != UINT32_MAX || !nterms)
+                break;
+            nterms >>= 1;
+        }
+
+        if (size < best_size) {
+            memcpy(s->best_buffer[0], s->temp_buffer[j&1][0], buf_size);
+            memcpy(s->best_buffer[1], s->temp_buffer[j&1][1], buf_size);
+            memcpy(s->decorr_passes, save_decorr_passes, sizeof(struct Decorr) * MAX_TERMS);
+            s->num_terms = nterms;
+            s->best_decorr = c;
+            best_size = size;
+        }
+
+        if (pi++)
+            s->mask_decorr = s->mask_decorr ? ((s->mask_decorr << 1) & (s->num_decorrs - 1)) : 1;
+    }
+
+    if (force_js || (s->decorr_specs[s->best_decorr].joint_stereo && !force_ts))
+        s->flags |= WV_JOINT_STEREO;
+    else
+        s->flags &= ~((uint32_t) WV_JOINT_STEREO);
+
+    if (s->extra_flags) {
+        if (s->flags & WV_JOINT_STEREO) {
+            analyze_stereo(s, s->js_left, s->js_right, do_samples);
+
+            if (do_samples) {
+                memcpy(samples_l, s->js_left,  buf_size);
+                memcpy(samples_r, s->js_right, buf_size);
+            }
+        } else
+            analyze_stereo(s, samples_l, samples_r, do_samples);
+    } else if (do_samples) {
+        memcpy(samples_l, s->best_buffer[0], buf_size);
+        memcpy(samples_r, s->best_buffer[1], buf_size);
+    }
+
+    if (s->extra_flags || no_history ||
+        s->joint_stereo != s->decorr_specs[s->best_decorr].joint_stereo) {
+        s->joint_stereo = s->decorr_specs[s->best_decorr].joint_stereo;
+        CLEAR(s->w);
+        scan_word(s, &s->w.c[0], s->best_buffer[0], nb_samples, -1);
+        scan_word(s, &s->w.c[1], s->best_buffer[1], nb_samples, -1);
+    }
+    return 0;
+}
+
+/* nbits_table lookup on the highest non-zero byte of av plus that byte's bit offset; av is evaluated repeatedly. */
+#define count_bits(av) ( \
+    (av) < (1 << 8)  ? nbits_table[av] : \
+    (av) < (1 << 16) ? nbits_table[(av) >> 8] + 8 : \
+    (av) < (1 << 24) ? nbits_table[(av) >> 16] + 16 : \
+                       nbits_table[(av) >> 24] + 24 \
+)
+/* Write out everything held back in s->w: the zero run, held ones, the held zero and the pending data bits. */
+static void encode_flush(WavPackEncodeContext *s)
+{
+    WavPackWords *w = &s->w;
+    PutBitContext *pb = &s->pb;
+
+    if (w->zeros_acc) {
+        int cbits = count_bits(w->zeros_acc);
+        /* cbits one-bits, a zero bit, then the bits of zeros_acc below its top set bit, LSB first */
+        do {
+            if (cbits > 31) {
+                put_bits(pb, 31, 0x7FFFFFFF);
+                cbits -= 31;
+            } else {
+                put_bits(pb, cbits, (1 << cbits) - 1);
+                cbits = 0;
+            }
+        } while (cbits);
+
+        put_bits(pb, 1, 0);
+
+        while (w->zeros_acc > 1) {
+            put_bits(pb, 1, w->zeros_acc & 1);
+            w->zeros_acc >>= 1;
+        }
+
+        w->zeros_acc = 0;
+    }
+
+    if (w->holding_one) {
+        if (w->holding_one >= 16) {
+            int cbits;
+            /* 16 one-bits and a zero, then holding_one - 16 written the same way as zeros_acc above */
+            put_bits(pb, 16, (1 << 16) - 1);
+            put_bits(pb, 1, 0);
+            w->holding_one -= 16;
+            cbits = count_bits(w->holding_one);
+
+            do {
+                if (cbits > 31) {
+                    put_bits(pb, 31, 0x7FFFFFFF);
+                    cbits -= 31;
+                } else {
+                    put_bits(pb, cbits, (1 << cbits) - 1);
+                    cbits = 0;
+                }
+            } while (cbits);
+
+            put_bits(pb, 1, 0);
+
+            while (w->holding_one > 1) {
+                put_bits(pb, 1, w->holding_one & 1);
+                w->holding_one >>= 1;
+            }
+
+            w->holding_zero = 0; /* this path drops a held zero instead of writing it below */
+        } else {
+            put_bits(pb, w->holding_one, (1 << w->holding_one) - 1); /* fewer than 16: that many one-bits */
+        }
+
+        w->holding_one = 0;
+    }
+
+    if (w->holding_zero) {
+        put_bits(pb, 1, 0);
+        w->holding_zero = 0;
+    }
+
+    if (w->pend_count) { /* pend_count bits accumulated in pend_data go out last */
+        put_bits(pb, w->pend_count, w->pend_data);
+        w->pend_data = w->pend_count = 0;
+    }
+}
+/* Encode one sample of channel c; the bits may stay held in s->w until a later call or encode_flush(). */
+static void wavpack_encode_sample(WavPackEncodeContext *s, WvChannel *c, int32_t sample)
+{
+    WavPackWords *w = &s->w;
+    uint32_t ones_count, low, high;
+    int sign = sample < 0;
+    /* Both channels' median[0] below 2 and no held zero: zero samples are counted in zeros_acc. */
+    if (s->w.c[0].median[0] < 2 && !s->w.holding_zero && s->w.c[1].median[0] < 2) {
+        if (w->zeros_acc) {
+            if (sample)
+                encode_flush(s);
+            else {
+                w->zeros_acc++;
+                return;
+            }
+        } else if (sample) {
+            put_bits(&s->pb, 1, 0);
+        } else {
+            CLEAR(s->w.c[0].median);
+            CLEAR(s->w.c[1].median);
+            w->zeros_acc = 1;
+            return;
+        }
+    }
+    /* Negative samples are coded as their one's complement; the sign is appended as a bit at the end. */
+    if (sign)
+        sample = ~sample;
+    /* Find [low, high] containing sample; GET_MED/INC_MED/DEC_MED are defined elsewhere (presumably on c->median -- confirm). */
+    if (sample < (int32_t) GET_MED(0)) {
+        ones_count = low = 0;
+        high = GET_MED(0) - 1;
+        DEC_MED(0);
+    } else {
+        low = GET_MED(0);
+        INC_MED(0);
+
+        if (sample - low < GET_MED(1)) {
+            ones_count = 1;
+            high = low + GET_MED(1) - 1;
+            DEC_MED(1);
+        } else {
+            low += GET_MED(1);
+            INC_MED(1);
+
+            if (sample - low < GET_MED(2)) {
+                ones_count = 2;
+                high = low + GET_MED(2) - 1;
+                DEC_MED(2);
+            } else {
+                ones_count = 2 + (sample - low) / GET_MED(2);
+                low += (ones_count - 2) * GET_MED(2);
+                high = low + GET_MED(2) - 1;
+                INC_MED(2);
+            }
+        }
+    }
+    /* A zero already held triggers a flush here; otherwise a zero is held for the next call. */
+    if (w->holding_zero) {
+        if (ones_count)
+            w->holding_one++;
+
+        encode_flush(s);
+
+        if (ones_count) {
+            w->holding_zero = 1;
+            ones_count--;
+        } else
+            w->holding_zero = 0;
+    } else
+        w->holding_zero = 1;
+
+    w->holding_one = ones_count * 2;
+    /* Queue the offset within [low, high] in pend_data, using bitcount - 1 or bitcount bits. */
+    if (high != low) {
+        uint32_t maxcode = high - low, code = sample - low;
+        int bitcount = count_bits(maxcode);
+        uint32_t extras = (1 << bitcount) - maxcode - 1;
+
+        if (code < extras) {
+            w->pend_data |= code << w->pend_count;
+            w->pend_count += bitcount - 1;
+        } else {
+            w->pend_data |= ((code + extras) >> 1) << w->pend_count;
+            w->pend_count += bitcount - 1;
+            w->pend_data |= ((code + extras) & 1) << w->pend_count++;
+        }
+    }
+
+    w->pend_data |= ((int32_t) sign << w->pend_count++); /* sign bit follows the offset bits */
+
+    if (!w->holding_zero)
+        encode_flush(s);
+}
+
+/* Write int32_sent_bits bits of each sample via put_sbits() after shifting it
+ * right by int32_zeros + int32_ones + int32_dups. Only samples_l is written
+ * when WV_MONO_DATA is set; otherwise left and right alternate per index. */
+static void pack_int32(WavPackEncodeContext *s,
+                       int32_t *samples_l, int32_t *samples_r,
+                       int nb_samples)
+{
+    const int nbits = s->int32_sent_bits;
+    const int shift = s->int32_zeros + s->int32_ones + s->int32_dups;
+    const int mono  = !!(s->flags & WV_MONO_DATA);
+    PutBitContext *const pb = &s->pb;
+    int n;
+
+    /* Zero sent bits means there is nothing to write. */
+    if (!nbits)
+        return;
+
+    /* samples_r is only read in the non-mono case. */
+    for (n = 0; n < nb_samples; n++) {
+        put_sbits(pb, nbits, samples_l[n] >> shift);
+        if (!mono)
+            put_sbits(pb, nbits, samples_r[n] >> shift);
+    }
+}
+
+/* Write the extra bits for one sample, which is taken apart with get_exponent/
+ * get_mantissa/get_sign: flag and mantissa bits for exponent 255, then either
+ * the zero-value bits or the shifted-out mantissa bits, as s->float_flags says. */
+static void pack_float_sample(WavPackEncodeContext *s, int32_t *sample)
+{
+    PutBitContext *const pb = &s->pb;
+    const int     max_exp   = s->float_max_exp;
+    const int32_t f         = *sample;
+    const int     exponent  = get_exponent(f);
+    const int32_t mantissa  = get_mantissa(f);
+    int32_t value, shift_count;
+
+    if (exponent == 255) {
+        /* one flag bit, followed by the 23 mantissa bits when they are non-zero */
+        put_bits(pb, 1, !!mantissa);
+        if (mantissa)
+            put_bits(pb, 23, mantissa);
+        value       = 0x1000000;
+        shift_count = 0;
+    } else if (exponent) {
+        value       = 0x800000 + mantissa;
+        shift_count = max_exp - exponent;
+    } else {
+        value       = mantissa;
+        shift_count = max_exp ? max_exp - 1 : 0;
+    }
+
+    value = shift_count < 25 ? value >> shift_count : 0;
+
+    if (value) {
+        if (!shift_count)
+            return;
+        if (s->float_flags & FLOAT_SHIFT_SENT)
+            put_sbits(pb, shift_count, mantissa);
+        else if (s->float_flags & FLOAT_SHIFT_SAME)
+            put_bits(pb, 1, mantissa & 1);
+        return;
+    }
+
+    if (!(s->float_flags & FLOAT_ZEROS_SENT))
+        return;
+
+    if (exponent || mantissa) {
+        put_bits(pb, 1, 1);
+        put_bits(pb, 23, mantissa);
+        if (max_exp >= 25)
+            put_bits(pb, 8, exponent);
+        put_bits(pb, 1, get_sign(f));
+    } else {
+        put_bits(pb, 1, 0);
+        if (s->float_flags & FLOAT_NEG_ZEROS)
+            put_bits(pb, 1, get_sign(f));
+    }
+}
+
+/* Run pack_float_sample() over nb_samples samples: left only when
+ * WV_MONO_DATA is set, otherwise left then right for each index. */
+static void pack_float(WavPackEncodeContext *s,
+                       int32_t *samples_l, int32_t *samples_r,
+                       int nb_samples)
+{
+    const int stereo = !(s->flags & WV_MONO_DATA);
+    int32_t *left = samples_l, *right = samples_r;
+    int remaining;
+
+    for (remaining = nb_samples; remaining > 0; remaining--) {
+        pack_float_sample(s, left++);
+        if (stereo)
+            pack_float_sample(s, right++);
+    }
+}
+/* Apply one decorrelation pass to samples_l/samples_r in place, updating the samples and weights kept in dpp. */
+static void decorr_stereo_pass2(struct Decorr *dpp,
+                                int32_t *samples_l, int32_t *samples_r,
+                                int nb_samples)
+{
+    int i, m, k;
+
+    switch (dpp->value) {
+    case 17: /* prediction per channel: 2 * samples[0] - samples[1] */
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = 2 * dpp->samplesA[0] - dpp->samplesA[1];
+            dpp->samplesA[1] = dpp->samplesA[0];
+            samples_l[i] = tmp = (dpp->samplesA[0] = samples_l[i]) - APPLY_WEIGHT(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = 2 * dpp->samplesB[0] - dpp->samplesB[1];
+            dpp->samplesB[1] = dpp->samplesB[0];
+            samples_r[i] = tmp = (dpp->samplesB[0] = samples_r[i]) - APPLY_WEIGHT(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+        }
+        break;
+    case 18: /* prediction per channel: samples[0] + ((samples[0] - samples[1]) >> 1) */
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[0] + ((dpp->samplesA[0] - dpp->samplesA[1]) >> 1);
+            dpp->samplesA[1] = dpp->samplesA[0];
+            samples_l[i] = tmp = (dpp->samplesA[0] = samples_l[i]) - APPLY_WEIGHT(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = dpp->samplesB[0] + ((dpp->samplesB[0] - dpp->samplesB[1]) >> 1);
+            dpp->samplesB[1] = dpp->samplesB[0];
+            samples_r[i] = tmp = (dpp->samplesB[0] = samples_r[i]) - APPLY_WEIGHT(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+        }
+        break;
+    default: /* read index m starts at 0, write index k at value & (MAX_TERM - 1); both wrap with that mask */
+        for (m = 0, k = dpp->value & (MAX_TERM - 1), i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[m];
+            samples_l[i] = tmp = (dpp->samplesA[k] = samples_l[i]) - APPLY_WEIGHT(dpp->weightA, sam);
+            UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = dpp->samplesB[m];
+            samples_r[i] = tmp = (dpp->samplesB[k] = samples_r[i]) - APPLY_WEIGHT(dpp->weightB, sam);
+            UPDATE_WEIGHT(dpp->weightB, dpp->delta, sam, tmp);
+
+            m = (m + 1) & (MAX_TERM - 1);
+            k = (k + 1) & (MAX_TERM - 1);
+        }
+        if (m) { /* rotate both arrays so that the entry at index m ends up at index 0 */
+            int32_t temp_A[MAX_TERM], temp_B[MAX_TERM];
+
+            memcpy(temp_A, dpp->samplesA, sizeof (dpp->samplesA));
+            memcpy(temp_B, dpp->samplesB, sizeof (dpp->samplesB));
+
+            for (k = 0; k < MAX_TERM; k++) {
+                dpp->samplesA[k] = temp_A[m];
+                dpp->samplesB[k] = temp_B[m];
+                m = (m + 1) & (MAX_TERM - 1);
+            }
+        }
+        break;
+    case -1: /* left is predicted from samplesA[0] (then set to the right input), right from the current left input */
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_A = dpp->samplesA[0];
+            samples_l[i] = tmp = (sam_B = samples_l[i]) - APPLY_WEIGHT(dpp->weightA, sam_A);
+            UPDATE_WEIGHT_CLIP(dpp->weightA, dpp->delta, sam_A, tmp);
+
+            samples_r[i] = tmp = (dpp->samplesA[0] = samples_r[i]) - APPLY_WEIGHT(dpp->weightB, sam_B);
+            UPDATE_WEIGHT_CLIP(dpp->weightB, dpp->delta, sam_B, tmp);
+        }
+        break;
+    case -2: /* right is predicted from samplesB[0] (then set to the left input), left from the current right input */
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_B = dpp->samplesB[0];
+            samples_r[i] = tmp = (sam_A = samples_r[i]) - APPLY_WEIGHT(dpp->weightB, sam_B);
+            UPDATE_WEIGHT_CLIP(dpp->weightB, dpp->delta, sam_B, tmp);
+
+            samples_l[i] = tmp = (dpp->samplesB[0] = samples_l[i]) - APPLY_WEIGHT(dpp->weightA, sam_A);
+            UPDATE_WEIGHT_CLIP(dpp->weightA, dpp->delta, sam_A, tmp);
+        }
+        break;
+    case -3: /* left is predicted from samplesA[0], right from samplesB[0]; these then store the right and left input */
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_A = dpp->samplesA[0];
+            sam_B = dpp->samplesB[0];
+
+            dpp->samplesA[0] = tmp = samples_r[i];
+            samples_r[i] = tmp -= APPLY_WEIGHT(dpp->weightB, sam_B);
+            UPDATE_WEIGHT_CLIP(dpp->weightB, dpp->delta, sam_B, tmp);
+
+            dpp->samplesB[0] = tmp = samples_l[i];
+            samples_l[i] = tmp -= APPLY_WEIGHT(dpp->weightA, sam_A);
+            UPDATE_WEIGHT_CLIP(dpp->weightA, dpp->delta, sam_A, tmp);
+        }
+        break;
+    }
+}
+
+#define update_weight_d2(weight, delta, source, result) \
+    if (source && result) \
+        weight -= (((source ^ result) >> 29) & 4) - 2;
+
+#define update_weight_clip_d2(weight, delta, source, result) \
+    if (source && result) { \
+        const int32_t s = (source ^ result) >> 31; \
+        if ((weight = (weight ^ s) + (2 - s)) > 1024) weight = 1024; \
+        weight = (weight ^ s) - s; \
+    }
+
+/**
+ * Run one stereo decorrelation pass in place, using the fixed-step weight
+ * update macros (update_weight_d2 / update_weight_clip_d2), which ignore
+ * dpp->delta and always step the weight by 2.
+ *
+ * Each output sample is the input sample minus a weighted prediction; the
+ * prediction source depends on dpp->value (the decorrelation term).
+ * dpp->samplesA/B hold the history and dpp->weightA/B the adaptive weights;
+ * both are updated as the pass runs.
+ *
+ * @param dpp        decorrelation pass state (term, weights, history)
+ * @param samples_l  left channel, replaced by its residual
+ * @param samples_r  right channel, replaced by its residual
+ * @param nb_samples number of samples per channel
+ */
+static void decorr_stereo_pass_id2(struct Decorr *dpp,
+                                   int32_t *samples_l, int32_t *samples_r,
+                                   int nb_samples)
+{
+    int i, m, k;
+
+    switch (dpp->value) {
+    case 17:
+        /* Prediction extrapolated from the two previous samples of the same
+         * channel: 2 * s[0] - s[1]. */
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = 2 * dpp->samplesA[0] - dpp->samplesA[1];
+            dpp->samplesA[1] = dpp->samplesA[0];
+            /* The embedded assignment stores the unmodified input as the
+             * newest history value before the prediction is subtracted. */
+            samples_l[i] = tmp = (dpp->samplesA[0] = samples_l[i]) - APPLY_WEIGHT_I(dpp->weightA, sam);
+            update_weight_d2(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = 2 * dpp->samplesB[0] - dpp->samplesB[1];
+            dpp->samplesB[1] = dpp->samplesB[0];
+            samples_r[i] = tmp = (dpp->samplesB[0] = samples_r[i]) - APPLY_WEIGHT_I(dpp->weightB, sam);
+            update_weight_d2(dpp->weightB, dpp->delta, sam, tmp);
+        }
+        break;
+    case 18:
+        /* Prediction from the two previous samples of the same channel:
+         * s[0] + ((s[0] - s[1]) >> 1). */
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[0] + ((dpp->samplesA[0] - dpp->samplesA[1]) >> 1);
+            dpp->samplesA[1] = dpp->samplesA[0];
+            samples_l[i] = tmp = (dpp->samplesA[0] = samples_l[i]) - APPLY_WEIGHT_I(dpp->weightA, sam);
+            update_weight_d2(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = dpp->samplesB[0] + ((dpp->samplesB[0] - dpp->samplesB[1]) >> 1);
+            dpp->samplesB[1] = dpp->samplesB[0];
+            samples_r[i] = tmp = (dpp->samplesB[0] = samples_r[i]) - APPLY_WEIGHT_I(dpp->weightB, sam);
+            update_weight_d2(dpp->weightB, dpp->delta, sam, tmp);
+        }
+        break;
+    default:
+        /* Remaining terms: the history arrays are used as circular buffers
+         * of MAX_TERM entries.  m is the read index and k the write index,
+         * which starts dpp->value slots ahead of m (modulo MAX_TERM). */
+        for (m = 0, k = dpp->value & (MAX_TERM - 1), i = 0; i < nb_samples; i++) {
+            int32_t sam, tmp;
+
+            sam = dpp->samplesA[m];
+            samples_l[i] = tmp = (dpp->samplesA[k] = samples_l[i]) - APPLY_WEIGHT_I(dpp->weightA, sam);
+            update_weight_d2(dpp->weightA, dpp->delta, sam, tmp);
+
+            sam = dpp->samplesB[m];
+            samples_r[i] = tmp = (dpp->samplesB[k] = samples_r[i]) - APPLY_WEIGHT_I(dpp->weightB, sam);
+            update_weight_d2(dpp->weightB, dpp->delta, sam, tmp);
+
+            m = (m + 1) & (MAX_TERM - 1);
+            k = (k + 1) & (MAX_TERM - 1);
+        }
+
+        /* If the read index did not end on slot 0, rotate both history
+         * arrays so that the entry at index m becomes entry 0. */
+        if (m) {
+            int32_t temp_A[MAX_TERM], temp_B[MAX_TERM];
+
+            memcpy(temp_A, dpp->samplesA, sizeof(dpp->samplesA));
+            memcpy(temp_B, dpp->samplesB, sizeof(dpp->samplesB));
+
+            for (k = 0; k < MAX_TERM; k++) {
+                dpp->samplesA[k] = temp_A[m];
+                dpp->samplesB[k] = temp_B[m];
+                m = (m + 1) & (MAX_TERM - 1);
+            }
+        }
+        break;
+    case -1:
+        /* Cross-channel: left is predicted from samplesA[0], which holds the
+         * previous right input; right is predicted from the current left
+         * input (sam_B). */
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_A = dpp->samplesA[0];
+            samples_l[i] = tmp = (sam_B = samples_l[i]) - APPLY_WEIGHT_I(dpp->weightA, sam_A);
+            update_weight_clip_d2(dpp->weightA, dpp->delta, sam_A, tmp);
+
+            samples_r[i] = tmp = (dpp->samplesA[0] = samples_r[i]) - APPLY_WEIGHT_I(dpp->weightB, sam_B);
+            update_weight_clip_d2(dpp->weightB, dpp->delta, sam_B, tmp);
+        }
+        break;
+    case -2:
+        /* Cross-channel, mirrored: right is predicted from samplesB[0],
+         * which holds the previous left input; left is predicted from the
+         * current right input (sam_A). */
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_B = dpp->samplesB[0];
+            samples_r[i] = tmp = (sam_A = samples_r[i]) - APPLY_WEIGHT_I(dpp->weightB, sam_B);
+            update_weight_clip_d2(dpp->weightB, dpp->delta, sam_B, tmp);
+
+            samples_l[i] = tmp = (dpp->samplesB[0] = samples_l[i]) - APPLY_WEIGHT_I(dpp->weightA, sam_A);
+            update_weight_clip_d2(dpp->weightA, dpp->delta, sam_A, tmp);
+        }
+        break;
+    case -3:
+        /* Cross-channel, both from history: samplesA[0] holds the previous
+         * right input and predicts left; samplesB[0] holds the previous left
+         * input and predicts right. */
+        for (i = 0; i < nb_samples; i++) {
+            int32_t sam_A, sam_B, tmp;
+
+            sam_A = dpp->samplesA[0];
+            sam_B = dpp->samplesB[0];
+
+            dpp->samplesA[0] = tmp = samples_r[i];
+            samples_r[i] = tmp -= APPLY_WEIGHT_I(dpp->weightB, sam_B);
+            update_weight_clip_d2(dpp->weightB, dpp->delta, sam_B, tmp);
+
+            dpp->samplesB[0] = tmp = samples_l[i];
+            samples_l[i] = tmp -= APPLY_WEIGHT_I(dpp->weightA, sam_A);
+            update_weight_clip_d2(dpp->weightA, dpp->delta, sam_A, tmp);
+        }
+        break;
+    }
+}
+
+/**
+ * Emit the two-byte header of a metadata sub-block.
+ *
+ * The first byte is the caller's id/flags value, with WP_IDF_ODD OR-ed in
+ * when size is odd; the second byte is the payload length expressed in
+ * 16-bit words, rounded up.
+ *
+ * @param pb    byte writer positioned where the header should go
+ * @param flags sub-block id, possibly combined with id flags
+ * @param size  payload size in bytes
+ */
+static void put_metadata_block(PutByteContext *pb, int flags, int size)
+{
+    const int id_byte    = (size & 1) ? (flags | WP_IDF_ODD) : flags;
+    const int word_count = (size + 1) >> 1;
+
+    bytestream2_put_byte(pb, id_byte);
+    bytestream2_put_byte(pb, word_count);
+}
+
+/**
+ * Encode one WavPack block (a single channel or a stereo pair) into out.
+ *
+ * The function analyses the samples, optionally searches for decorrelation
+ * terms, writes the block header and the metadata sub-blocks, applies the
+ * decorrelation passes and finally writes the entropy-coded sample data
+ * (plus an extra-bits sub-block when the scan functions report one).
+ *
+ * @param s         encoder context; s->flags, s->block_samples and
+ *                  s->ch_offset describe the block and are updated here
+ * @param samples_l left (or only) channel samples, modified in place
+ * @param samples_r right channel samples for stereo blocks, modified in place
+ * @param out       destination buffer
+ * @param out_size  size of out in bytes
+ * @return number of bytes written to out, or a negative AVERROR code
+ *         (including AVERROR(ENOMEM) if a scratch copy cannot be allocated)
+ */
+static int wavpack_encode_block(WavPackEncodeContext *s,
+                                int32_t *samples_l, int32_t *samples_r,
+                                uint8_t *out, int out_size)
+{
+    int block_size, start, end, data_size, tcount, temp, m = 0;
+    int i, j, ret = 0, got_extra = 0, nb_samples = s->block_samples;
+    uint32_t crc = 0xffffffffu;
+    struct Decorr *dpp;
+    PutByteContext pb;
+
+    if (s->flags & WV_MONO_DATA) {
+        CLEAR(s->w);
+    }
+    /* Optional "false stereo" detection: a stereo block whose channels are
+     * identical (and not all zero) is flagged WV_FALSE_STEREO.  Entering or
+     * leaving that state resets the term selection and entropy state. */
+    if (!(s->flags & WV_MONO) && s->optimize_mono) {
+        int32_t lor = 0, diff = 0;
+
+        for (i = 0; i < nb_samples; i++) {
+            lor  |= samples_l[i] | samples_r[i];
+            diff |= samples_l[i] - samples_r[i];
+
+            if (lor && diff)
+                break;
+        }
+
+        if (i == nb_samples && lor && !diff) {
+            s->flags &= ~(WV_JOINT_STEREO | WV_CROSS_DECORR);
+            s->flags |= WV_FALSE_STEREO;
+
+            if (!s->false_stereo) {
+                s->false_stereo = 1;
+                s->num_terms = 0;
+                CLEAR(s->w);
+            }
+        } else if (s->false_stereo) {
+            s->false_stereo = 0;
+            s->num_terms = 0;
+            CLEAR(s->w);
+        }
+    }
+
+    /* Apply the shift requested in the flags and reduce the magnitude field
+     * by the same amount (clearing it if it would go negative). */
+    if (s->flags & SHIFT_MASK) {
+        int shift = (s->flags & SHIFT_MASK) >> SHIFT_LSB;
+        int mag = (s->flags & MAG_MASK) >> MAG_LSB;
+
+        if (s->flags & WV_MONO_DATA)
+            shift_mono(samples_l, nb_samples, shift);
+        else
+            shift_stereo(samples_l, samples_r, nb_samples, shift);
+
+        if ((mag -= shift) < 0)
+            s->flags &= ~MAG_MASK;
+        else
+            s->flags -= (1 << MAG_LSB) * shift;
+    }
+
+    if ((s->flags & WV_FLOAT_DATA) || (s->flags & MAG_MASK) >> MAG_LSB >= 24) {
+        /* Keep an untouched copy of the input: it is handed to
+         * pack_float()/pack_int32() below when extra bits must be stored.
+         * av_fast_padded_malloc() leaves the pointer NULL on failure, so it
+         * must be checked before the copy. */
+        av_fast_padded_malloc(&s->orig_l, &s->orig_l_size, sizeof(int32_t) * nb_samples);
+        if (!s->orig_l)
+            return AVERROR(ENOMEM);
+        memcpy(s->orig_l, samples_l, sizeof(int32_t) * nb_samples);
+        if (!(s->flags & WV_MONO_DATA)) {
+            av_fast_padded_malloc(&s->orig_r, &s->orig_r_size, sizeof(int32_t) * nb_samples);
+            if (!s->orig_r)
+                return AVERROR(ENOMEM);
+            memcpy(s->orig_r, samples_r, sizeof(int32_t) * nb_samples);
+        }
+
+        if (s->flags & WV_FLOAT_DATA)
+            got_extra = scan_float(s, samples_l, samples_r, nb_samples);
+        else
+            got_extra = scan_int32(s, samples_l, samples_r, nb_samples);
+        s->num_terms = 0;
+    } else {
+        scan_int23(s, samples_l, samples_r, nb_samples);
+        /* A change in the total redundant-bit count invalidates the
+         * previously chosen terms. */
+        if (s->shift != s->int32_zeros + s->int32_ones + s->int32_dups) {
+            s->shift = s->int32_zeros + s->int32_ones + s->int32_dups;
+            s->num_terms = 0;
+        }
+    }
+
+    /* With no analysis passes configured and no terms selected yet, run
+     * wv_mono()/wv_stereo() once with num_passes temporarily set to 1. */
+    if (!s->num_passes && !s->num_terms) {
+        s->num_passes = 1;
+
+        if (s->flags & WV_MONO_DATA)
+            ret = wv_mono(s, samples_l, 1, 0);
+        else
+            ret = wv_stereo(s, samples_l, samples_r, 1, 0);
+
+        s->num_passes = 0;
+    }
+    /* The checksum is accumulated over the samples before decorrelation. */
+    if (s->flags & WV_MONO_DATA) {
+        for (i = 0; i < nb_samples; i++)
+            crc += (crc << 1) + samples_l[i];
+
+        if (s->num_passes)
+            ret = wv_mono(s, samples_l, !s->num_terms, 1);
+    } else {
+        for (i = 0; i < nb_samples; i++)
+            crc += (crc << 3) + (samples_l[i] << 1) + samples_l[i] + samples_r[i];
+
+        if (s->num_passes)
+            ret = wv_stereo(s, samples_l, samples_r, !s->num_terms, 1);
+    }
+    if (ret < 0)
+        return ret;
+
+    if (!s->ch_offset)
+        s->flags |= WV_INITIAL_BLOCK;
+
+    /* Advance by the number of channels carried in this block; the caller's
+     * per-frame loop relies on this to make progress. */
+    s->ch_offset += 1 + !(s->flags & WV_MONO);
+
+    if (s->ch_offset == s->avctx->channels)
+        s->flags |= WV_FINAL_BLOCK;
+
+    /* Block header.  The 32-bit field after the tag is a placeholder for
+     * the block size and is patched with AV_WL32() once the size is known. */
+    bytestream2_init_writer(&pb, out, out_size);
+    bytestream2_put_le32(&pb, MKTAG('w', 'v', 'p', 'k'));
+    bytestream2_put_le32(&pb, 0);
+    bytestream2_put_le16(&pb, 0x410);
+    bytestream2_put_le16(&pb, 0);
+    bytestream2_put_le32(&pb, 0);
+    bytestream2_put_le32(&pb, s->sample_index);
+    bytestream2_put_le32(&pb, nb_samples);
+    bytestream2_put_le32(&pb, s->flags);
+    bytestream2_put_le32(&pb, crc);
+
+    /* Channel info is only written in the initial block, and only for
+     * layouts other than plain mono or stereo. */
+    if (s->flags & WV_INITIAL_BLOCK &&
+        s->avctx->channel_layout != AV_CH_LAYOUT_MONO &&
+        s->avctx->channel_layout != AV_CH_LAYOUT_STEREO) {
+        put_metadata_block(&pb, WP_ID_CHANINFO, 5);
+        bytestream2_put_byte(&pb, s->avctx->channels);
+        bytestream2_put_le32(&pb, s->avctx->channel_layout);
+        bytestream2_put_byte(&pb, 0);
+    }
+
+    /* All sample-rate bits set in the flags: the rate is stored explicitly. */
+    if ((s->flags & SRATE_MASK) == SRATE_MASK) {
+        put_metadata_block(&pb, WP_ID_SAMPLE_RATE, 3);
+        bytestream2_put_le24(&pb, s->avctx->sample_rate);
+        bytestream2_put_byte(&pb, 0);
+    }
+
+    /* Decorrelation terms: one byte per term packing value + 5 in the low
+     * five bits and delta in the top three, padded to an even length. */
+    put_metadata_block(&pb, WP_ID_DECTERMS, s->num_terms);
+    for (i = 0; i < s->num_terms; i++) {
+        struct Decorr *dpp = &s->decorr_passes[i];
+        bytestream2_put_byte(&pb, ((dpp->value + 5) & 0x1f) | ((dpp->delta << 5) & 0xe0));
+    }
+    if (s->num_terms & 1)
+        bytestream2_put_byte(&pb, 0);
+
+/* Store a weight in its one-byte form and replace the in-memory weight with
+ * the value recovered from that byte. */
+#define WRITE_DECWEIGHT(type) do {            \
+        temp = store_weight(type);    \
+        bytestream2_put_byte(&pb, temp);      \
+        type = restore_weight(temp);  \
+    } while (0)
+
+    /* Decorrelation weights.  The two header bytes are placeholders that
+     * are rewritten through out[] once the payload length is known. */
+    bytestream2_put_byte(&pb, WP_ID_DECWEIGHTS);
+    bytestream2_put_byte(&pb, 0);
+    start = bytestream2_tell_p(&pb);
+    /* Find the last pass whose stored weight is non-zero; the weights of
+     * later passes are not written and are reset to zero instead. */
+    for (i = s->num_terms - 1; i >= 0; --i) {
+        struct Decorr *dpp = &s->decorr_passes[i];
+
+        if (store_weight(dpp->weightA) ||
+            (!(s->flags & WV_MONO_DATA) && store_weight(dpp->weightB)))
+                break;
+    }
+    tcount = i + 1;
+    for (i = 0; i < s->num_terms; i++) {
+        struct Decorr *dpp = &s->decorr_passes[i];
+        if (i < tcount) {
+            WRITE_DECWEIGHT(dpp->weightA);
+            if (!(s->flags & WV_MONO_DATA))
+                WRITE_DECWEIGHT(dpp->weightB);
+        } else {
+            dpp->weightA = dpp->weightB = 0;
+        }
+    }
+    end = bytestream2_tell_p(&pb);
+    out[start - 2] = WP_ID_DECWEIGHTS | (((end - start) & 1) ? WP_IDF_ODD: 0);
+    out[start - 1] = (end - start + 1) >> 1;
+    if ((end - start) & 1)
+        bytestream2_put_byte(&pb, 0);
+
+/* Store a history sample as a 16-bit log2s() value and replace the in-memory
+ * sample with the wp_exp2() of that stored value. */
+#define WRITE_DECSAMPLE(type) do {        \
+        temp = log2s(type);               \
+        type = wp_exp2(temp);             \
+        bytestream2_put_le16(&pb, temp);  \
+    } while (0)
+
+    /* Decorrelation history.  Only the first pass has its history written;
+     * the history of every other pass is cleared. */
+    bytestream2_put_byte(&pb, WP_ID_DECSAMPLES);
+    bytestream2_put_byte(&pb, 0);
+    start = bytestream2_tell_p(&pb);
+    for (i = 0; i < s->num_terms; i++) {
+        struct Decorr *dpp = &s->decorr_passes[i];
+        if (i == 0) {
+            if (dpp->value > MAX_TERM) {
+                WRITE_DECSAMPLE(dpp->samplesA[0]);
+                WRITE_DECSAMPLE(dpp->samplesA[1]);
+                if (!(s->flags & WV_MONO_DATA)) {
+                    WRITE_DECSAMPLE(dpp->samplesB[0]);
+                    WRITE_DECSAMPLE(dpp->samplesB[1]);
+                }
+            } else if (dpp->value < 0) {
+                WRITE_DECSAMPLE(dpp->samplesA[0]);
+                WRITE_DECSAMPLE(dpp->samplesB[0]);
+            } else {
+                for (j = 0; j < dpp->value; j++) {
+                    WRITE_DECSAMPLE(dpp->samplesA[j]);
+                    if (!(s->flags & WV_MONO_DATA))
+                        WRITE_DECSAMPLE(dpp->samplesB[j]);
+                }
+            }
+        } else {
+            CLEAR(dpp->samplesA);
+            CLEAR(dpp->samplesB);
+        }
+    }
+    end = bytestream2_tell_p(&pb);
+    /* Every entry is 16 bits, so the payload length is always even. */
+    out[start - 1] = (end - start) >> 1;
+
+/* Store the three medians of one channel as 16-bit wp_log2() values and
+ * replace the in-memory medians with the wp_exp2() of the stored values. */
+#define WRITE_CHAN_ENTROPY(chan) do {               \
+        for (i = 0; i < 3; i++) {                   \
+            temp = wp_log2(s->w.c[chan].median[i]); \
+            bytestream2_put_le16(&pb, temp);        \
+            s->w.c[chan].median[i] = wp_exp2(temp); \
+        }                                           \
+    } while (0)
+
+    put_metadata_block(&pb, WP_ID_ENTROPY, 6 * (1 + (!(s->flags & WV_MONO_DATA))));
+    WRITE_CHAN_ENTROPY(0);
+    if (!(s->flags & WV_MONO_DATA))
+        WRITE_CHAN_ENTROPY(1);
+
+    if (s->flags & WV_FLOAT_DATA) {
+        put_metadata_block(&pb, WP_ID_FLOATINFO, 4);
+        bytestream2_put_byte(&pb, s->float_flags);
+        bytestream2_put_byte(&pb, s->float_shift);
+        bytestream2_put_byte(&pb, s->float_max_exp);
+        bytestream2_put_byte(&pb, 127);
+    }
+
+    if (s->flags & WV_INT32_DATA) {
+        put_metadata_block(&pb, WP_ID_INT32INFO, 4);
+        bytestream2_put_byte(&pb, s->int32_sent_bits);
+        bytestream2_put_byte(&pb, s->int32_zeros);
+        bytestream2_put_byte(&pb, s->int32_ones);
+        bytestream2_put_byte(&pb, s->int32_dups);
+    }
+
+    /* When no analysis passes are configured the decorrelation has not been
+     * applied to the samples yet, so it is done here. */
+    if (s->flags & WV_MONO_DATA && !s->num_passes) {
+        for (i = 0; i < nb_samples; i++) {
+            int32_t code = samples_l[i];
+
+            /* Feed the sample through every pass in order; each pass sees
+             * the residual left by the previous one. */
+            for (tcount = s->num_terms, dpp = s->decorr_passes; tcount--; dpp++) {
+                int32_t sam;
+
+                if (dpp->value > MAX_TERM) {
+                    if (dpp->value & 1)
+                        sam = 2 * dpp->samplesA[0] - dpp->samplesA[1];
+                    else
+                        sam = (3 * dpp->samplesA[0] - dpp->samplesA[1]) >> 1;
+
+                    dpp->samplesA[1] = dpp->samplesA[0];
+                    dpp->samplesA[0] = code;
+                } else {
+                    /* Circular history: read at m, write value slots ahead. */
+                    sam = dpp->samplesA[m];
+                    dpp->samplesA[(m + dpp->value) & (MAX_TERM - 1)] = code;
+                }
+
+                code -= APPLY_WEIGHT(dpp->weightA, sam);
+                UPDATE_WEIGHT(dpp->weightA, dpp->delta, sam, code);
+            }
+
+            m = (m + 1) & (MAX_TERM - 1);
+            samples_l[i] = code;
+        }
+        /* If the circular read index did not end on slot 0, rotate the
+         * history of every pass that uses the circular buffer so that the
+         * entry at index m becomes entry 0.  m is back at its starting
+         * value after each full rotation of MAX_TERM steps. */
+        if (m) {
+            for (tcount = s->num_terms, dpp = s->decorr_passes; tcount--; dpp++) {
+                if (dpp->value > 0 && dpp->value <= MAX_TERM) {
+                    int32_t temp_A[MAX_TERM], temp_B[MAX_TERM];
+                    int k;
+
+                    memcpy(temp_A, dpp->samplesA, sizeof(dpp->samplesA));
+                    memcpy(temp_B, dpp->samplesB, sizeof(dpp->samplesB));
+
+                    for (k = 0; k < MAX_TERM; k++) {
+                        dpp->samplesA[k] = temp_A[m];
+                        dpp->samplesB[k] = temp_B[m];
+                        m = (m + 1) & (MAX_TERM - 1);
+                    }
+                }
+            }
+        }
+    } else if (!s->num_passes) {
+        /* Joint stereo: left becomes the difference (L - R), right becomes
+         * R plus half of that difference. */
+        if (s->flags & WV_JOINT_STEREO) {
+            for (i = 0; i < nb_samples; i++)
+                samples_r[i] += ((samples_l[i] -= samples_r[i]) >> 1);
+        }
+
+        /* The fixed-step variant is used only when delta is 2 and the
+         * magnitude field is below 16. */
+        for (i = 0; i < s->num_terms; i++) {
+            struct Decorr *dpp = &s->decorr_passes[i];
+            if (((s->flags & MAG_MASK) >> MAG_LSB) >= 16 || dpp->delta != 2)
+                decorr_stereo_pass2(dpp, samples_l, samples_r, nb_samples);
+            else
+                decorr_stereo_pass_id2(dpp, samples_l, samples_r, nb_samples);
+        }
+    }
+
+    /* Sample data.  The bit writer starts 3 bytes past the id byte, leaving
+     * room for the 24-bit length that is written once the size is known.
+     * NOTE(review): the bit writer is given the full remaining byte count
+     * although it starts 3 bytes in -- confirm the packet has enough slack. */
+    bytestream2_put_byte(&pb, WP_ID_DATA | WP_IDF_LONG);
+    init_put_bits(&s->pb, pb.buffer + 3, bytestream2_get_bytes_left_p(&pb));
+    if (s->flags & WV_MONO_DATA) {
+        for (i = 0; i < nb_samples; i++)
+            wavpack_encode_sample(s, &s->w.c[0], s->samples[0][i]);
+    } else {
+        for (i = 0; i < nb_samples; i++) {
+            wavpack_encode_sample(s, &s->w.c[0], s->samples[0][i]);
+            wavpack_encode_sample(s, &s->w.c[1], s->samples[1][i]);
+        }
+    }
+    encode_flush(s);
+    flush_put_bits(&s->pb);
+    data_size = put_bits_count(&s->pb) >> 3;
+    bytestream2_put_le24(&pb, (data_size + 1) >> 1);
+    bytestream2_skip_p(&pb, data_size);
+    if (data_size & 1)
+        bytestream2_put_byte(&pb, 0);
+
+    /* Extra bits, taken from the untouched copy made above.  Here the bit
+     * writer starts 7 bytes in: 3 for the length plus 4 for s->crc_x. */
+    if (got_extra) {
+        bytestream2_put_byte(&pb, WP_ID_EXTRABITS | WP_IDF_LONG);
+        init_put_bits(&s->pb, pb.buffer + 7, bytestream2_get_bytes_left_p(&pb));
+        if (s->flags & WV_FLOAT_DATA)
+            pack_float(s, s->orig_l, s->orig_r, nb_samples);
+        else
+            pack_int32(s, s->orig_l, s->orig_r, nb_samples);
+        flush_put_bits(&s->pb);
+        data_size = put_bits_count(&s->pb) >> 3;
+        bytestream2_put_le24(&pb, (data_size + 5) >> 1);
+        bytestream2_put_le32(&pb, s->crc_x);
+        bytestream2_skip_p(&pb, data_size);
+        if (data_size & 1)
+            bytestream2_put_byte(&pb, 0);
+    }
+
+    /* Patch the size placeholder: total size minus the first 8 bytes. */
+    block_size = bytestream2_tell_p(&pb);
+    AV_WL32(out + 4, block_size - 8);
+
+    av_assert0(!bytestream2_get_eof(&pb));
+
+    return block_size;
+}
+
+/**
+ * Convert one plane of input audio into the encoder's int32_t work buffer.
+ *
+ * The conversion depends on s->avctx->sample_fmt:
+ *  - U8P:  unsigned bytes, re-centred by subtracting 0x80;
+ *  - S16P: copied with sign extension;
+ *  - S32P: shifted right by 8 when bits_per_raw_sample <= 24, otherwise
+ *          copied verbatim;
+ *  - FLTP: the 32-bit patterns are copied verbatim.
+ * Any other format leaves dst untouched.
+ *
+ * @param s          encoder context (only s->avctx is read)
+ * @param src        start of the input plane
+ * @param dst        destination buffer with room for nb_samples entries
+ * @param nb_samples number of samples to convert
+ */
+static void fill_buffer(WavPackEncodeContext *s,
+                        const int8_t *src, int32_t *dst,
+                        int nb_samples)
+{
+    int i;
+
+#define COPY_SAMPLES(type, offset, shift) do {            \
+        const type *sptr = (const type *)src;             \
+        for (i = 0; i < nb_samples; i++)                  \
+            dst[i] = (sptr[i] - (offset)) >> (shift);     \
+    } while (0)
+
+    switch (s->avctx->sample_fmt) {
+    case AV_SAMPLE_FMT_U8P:
+        /* The samples are unsigned, so they must be read as uint8_t.
+         * Reading them as int8_t would turn every input >= 0x80 negative
+         * before the offset is subtracted, producing -256..-129 instead of
+         * 0..127. */
+        COPY_SAMPLES(uint8_t, 0x80, 0);
+        break;
+    case AV_SAMPLE_FMT_S16P:
+        COPY_SAMPLES(int16_t, 0, 0);
+        break;
+    case AV_SAMPLE_FMT_S32P:
+        if (s->avctx->bits_per_raw_sample <= 24) {
+            COPY_SAMPLES(int32_t, 0, 8);
+            break;
+        }
+        /* fall through: wider 32-bit samples are copied unchanged */
+    case AV_SAMPLE_FMT_FLTP:
+        memcpy(dst, src, nb_samples * 4);
+        break;
+    default:
+        break;
+    }
+}
+
+/**
+ * Reset s->flags to just the sample-rate index.
+ *
+ * The first 15 entries of wv_rates[] are searched for the configured sample
+ * rate; the index of the match (or 15 when nothing matches) is shifted into
+ * place with SRATE_LSB.  Note that this overwrites every other bit of
+ * s->flags.
+ */
+static void set_samplerate(WavPackEncodeContext *s)
+{
+    int idx = 0;
+
+    while (idx < 15 && wv_rates[idx] != s->avctx->sample_rate)
+        idx++;
+
+    s->flags = idx << SRATE_LSB;
+}
+
+/**
+ * Encode one audio frame into a packet.
+ *
+ * The channels are consumed in order, two at a time while at least two
+ * remain and one at the end otherwise; each group becomes one block written
+ * back to back into the packet.  wavpack_encode_block() advances
+ * s->ch_offset, which is what terminates the loop.
+ *
+ * @param avctx          codec context
+ * @param avpkt          output packet, allocated here
+ * @param frame          input frame (planar samples)
+ * @param got_packet_ptr set to 1 on success
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+static int wavpack_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
+                                const AVFrame *frame, int *got_packet_ptr)
+{
+    WavPackEncodeContext *s = avctx->priv_data;
+    const int nb_work_bufs = avctx->channels > 1 ? 2 : 1;
+    uint8_t *wptr;
+    int space, ret, ch;
+
+    s->block_samples = frame->nb_samples;
+
+    /* One work buffer always, a second one only for multichannel input. */
+    for (ch = 0; ch < nb_work_bufs; ch++) {
+        av_fast_padded_malloc(&s->samples[ch], &s->samples_size[ch],
+                              sizeof(int32_t) * s->block_samples);
+        if (!s->samples[ch])
+            return AVERROR(ENOMEM);
+    }
+
+    /* 8 bytes per sample per channel, plus 200 bytes per channel for
+     * headers. */
+    space = s->block_samples * avctx->channels * 8
+          + 200 * avctx->channels;
+    ret = ff_alloc_packet2(avctx, avpkt, space, 0);
+    if (ret < 0)
+        return ret;
+    wptr = avpkt->data;
+
+    s->ch_offset = 0;
+    while (s->ch_offset < avctx->channels) {
+        const int last_single = avctx->channels - s->ch_offset == 1;
+
+        /* Start from a clean flags word holding only the rate index. */
+        set_samplerate(s);
+
+        /* The low two bits of the flags depend on the sample format. */
+        if (s->avctx->sample_fmt == AV_SAMPLE_FMT_S16P)
+            s->flags |= 1;
+        else if (s->avctx->sample_fmt == AV_SAMPLE_FMT_S32P)
+            s->flags |= 3 - (s->avctx->bits_per_raw_sample <= 24);
+        else if (s->avctx->sample_fmt == AV_SAMPLE_FMT_FLTP)
+            s->flags |= 3 | WV_FLOAT_DATA;
+
+        fill_buffer(s, frame->extended_data[s->ch_offset], s->samples[0], s->block_samples);
+        if (last_single) {
+            s->flags |= WV_MONO;
+        } else {
+            s->flags |= WV_CROSS_DECORR;
+            fill_buffer(s, frame->extended_data[s->ch_offset + 1], s->samples[1], s->block_samples);
+        }
+
+        /* Magnitude field: 8 per unit of the low two flag bits, plus 7. */
+        s->flags += (1 << MAG_LSB) * ((s->flags & 3) * 8 + 7);
+
+        ret = wavpack_encode_block(s, s->samples[0], s->samples[1],
+                                   wptr, space);
+        if (ret < 0)
+            return ret;
+
+        wptr  += ret;
+        space -= ret;
+    }
+    s->sample_index += frame->nb_samples;
+
+    avpkt->pts      = frame->pts;
+    avpkt->size     = wptr - avpkt->data;
+    avpkt->duration = ff_samples_to_time_base(avctx, frame->nb_samples);
+    *got_packet_ptr = 1;
+    return 0;
+}
+
+/**
+ * Release every scratch buffer owned by the encoder context and reset the
+ * recorded sizes to zero.
+ *
+ * @return always 0
+ */
+static av_cold int wavpack_encode_close(AVCodecContext *avctx)
+{
+    WavPackEncodeContext *s = avctx->priv_data;
+    int i, ch;
+
+    /* Per-term sample buffers, two per slot. */
+    for (i = 0; i < MAX_TERMS + 2; i++) {
+        for (ch = 0; ch < 2; ch++) {
+            av_freep(&s->sampleptrs[i][ch]);
+            s->sampleptrs_size[i][ch] = 0;
+        }
+    }
+
+    /* Input, best-result and temporary buffers. */
+    for (i = 0; i < 2; i++) {
+        av_freep(&s->samples[i]);
+        s->samples_size[i] = 0;
+
+        av_freep(&s->best_buffer[i]);
+        s->best_buffer_size[i] = 0;
+
+        for (ch = 0; ch < 2; ch++) {
+            av_freep(&s->temp_buffer[i][ch]);
+            s->temp_buffer_size[i][ch] = 0;
+        }
+    }
+
+    av_freep(&s->js_left);
+    s->js_left_size = 0;
+    av_freep(&s->js_right);
+    s->js_right_size = 0;
+
+    av_freep(&s->orig_l);
+    s->orig_l_size = 0;
+    av_freep(&s->orig_r);
+    s->orig_r_size = 0;
+
+    return 0;
+}
+
+/* Byte offset of an option's backing field inside the private context. */
+#define OFFSET(x) offsetof(WavPackEncodeContext, x)
+/* Parenthesised so the OR-expression stays intact wherever FLAGS expands. */
+#define FLAGS (AV_OPT_FLAG_ENCODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM)
+/* Private encoder options. */
+static const AVOption options[] = {
+    /* Boolean with range -1..1 and default -1.
+     * NOTE(review): -1 presumably means "let the encoder decide" -- confirm
+     * against the code that reads s->joint. */
+    { "joint_stereo",  "", OFFSET(joint), AV_OPT_TYPE_BOOL, {.i64=-1}, -1, 1, FLAGS },
+    /* Off by default; when set, stereo blocks whose two channels are
+     * identical are flagged as false stereo by wavpack_encode_block(). */
+    { "optimize_mono", "", OFFSET(optimize_mono), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS },
+    { NULL },
+};
+
+/* AVClass for the encoder's private context; it exposes the options table
+ * above and is referenced by ff_wavpack_encoder.priv_class. */
+static const AVClass wavpack_encoder_class = {
+    .class_name = "WavPack encoder",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+};
+
+/* Codec registration entry for the WavPack encoder: wires up the init,
+ * encode and close callbacks and lists the planar sample formats that
+ * fill_buffer() knows how to convert. */
+AVCodec ff_wavpack_encoder = {
+    .name           = "wavpack",
+    .long_name      = NULL_IF_CONFIG_SMALL("WavPack"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_WAVPACK,
+    .priv_data_size = sizeof(WavPackEncodeContext),
+    .priv_class     = &wavpack_encoder_class,
+    .init           = wavpack_encode_init,
+    .encode2        = wavpack_encode_frame,
+    .close          = wavpack_encode_close,
+    .capabilities   = AV_CODEC_CAP_SMALL_LAST_FRAME,
+    /* List terminated by AV_SAMPLE_FMT_NONE. */
+    .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_U8P,
+                                                     AV_SAMPLE_FMT_S16P,
+                                                     AV_SAMPLE_FMT_S32P,
+                                                     AV_SAMPLE_FMT_FLTP,
+                                                     AV_SAMPLE_FMT_NONE },
+};
diff --git a/libavcodec/wavpackenc.h b/libavcodec/wavpackenc.h
new file mode 100644
index 0000000..9dd2a01
--- /dev/null
+++ b/libavcodec/wavpackenc.h
@@ -0,0 +1,664 @@
+/*
+ * WavPack lossless audio encoder
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_WAVPACKENC_H
+#define AVCODEC_WAVPACKENC_H
+
+#include "wavpack.h"
+
+/**
+ * One decorrelation preset, as stored in the spec tables of this header.
+ */
+typedef struct WavPackDecorrSpec {
+    int8_t joint_stereo;        /* 0 or 1 in the visible table entries */
+    int8_t delta;               /* delta value associated with the preset */
+    int8_t terms[MAX_TERMS+1];  /* decorrelation terms; entries left out of
+                                 * an initializer are zero */
+} WavPackDecorrSpec;
+
+/*
+ * Decorrelation presets with two terms each.  Every entry is laid out as
+ * { joint_stereo, delta, { terms... } }; the remaining slots of terms[] are
+ * zero-initialised.
+ * NOTE(review): how the table is indexed is not visible in this header --
+ * confirm against the encoder before reordering or resizing it.
+ */
+static const WavPackDecorrSpec fast_specs[] = {
+ { 1, 2, { 18,17 } }, { 1, 1, { 17,17 } }, { 0, 2, { 18,17 } },
+ { 0, 1, { 17,17 } }, { 1, 3, {  1,18 } }, { 1, 1, { 17, 1 } },
+ { 0, 1, {  1,17 } }, { 0, 1, { -2,17 } }, { 0, 2, { -1,17 } },
+ { 1, 1, { 17, 2 } }, { 0, 3, { 18,18 } }, { 0, 1, { 17, 1 } },
+ { 1, 6, {  1, 2 } }, { 1, 1, { 17, 3 } }, { 0, 1, { -2, 3 } },
+ { 0, 1, {  2,17 } }, { 0, 1, { 18,-2 } }, { 0, 1, { -1,17 } },
+ { 0, 1, { 18,17 } }, { 0, 1, { 17, 2 } }, { 1, 2, { 18,-2 } },
+ { 1, 1, {  1,17 } }, { 0, 3, { 18, 2 } }, { 0, 1, { 17,-2 } },
+ { 0, 1, { 18,-2 } }, { 1, 2, { 17,-3 } }, { 0, 1, { 18, 3 } },
+ { 0, 1, { 18,18 } }, { 1, 1, {  1, 3 } }, { 1, 1, { 18, 3 } },
+ { 1, 1, {  1, 3 } }, { 0, 2, { 18,17 } }, { 1, 1, {  1,17 } },
+ { 1, 1, { 17, 3 } }, { 0, 3, { 18,17 } }, { 0, 1, { 18,18 } },
+ { 1, 1, {  1, 3 } }, { 1, 1, {  1,18 } }, { 0, 1, { 18,-2 } },
+ { 0, 2, { 18,17 } }, { 0, 1, { -1,18 } }, { 1, 1, { 17, 3 } },
+ { 0, 1, { 17, 2 } }, { 0, 1, { 17, 3 } }, { 1, 1, { 18, 2 } },
+ { 1, 1, { 17,-2 } }, { 0, 1, {  1,-2 } }, { 0, 2, { 18,17 } },
+ { 0, 1, { 17,-2 } }, { 1, 1, { 17,-2 } }, { 0, 1, { 18, 3 } },
+ { 0, 1, {  2,17 } }, { 1, 2, { 18,-3 } }, { 1, 2, {  1,18 } },
+ { 1, 2, { 18, 2 } }, { 0, 1, { 17,-1 } }, { 0, 1, { 17,-2 } },
+ { 1, 1, { 17,-2 } }, { 1, 1, {  1, 3 } }, { 0, 1, {  1,17 } },
+ { 1, 2, { 18,-2 } }, { 1, 2, { 17,-3 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 1, 1, { 17, 2 } }, { 1, 2, { 18,18 } },
+ { 0, 1, { 17, 2 } }, { 0, 1, { 18,17 } }, { 1, 1, {  1,17 } },
+ { 1, 1, { 17, 2 } }, { 0, 2, { 18,18 } }, { 0, 2, { 18,17 } },
+ { 1, 2, { 17,-3 } }, { 1, 6, {  1, 2 } }, { 0, 3, { 17,17 } },
+ { 0, 1, {  1,18 } }, { 0, 1, {  1,-2 } }, { 1, 1, { 17, 2 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 1, 1, { 18, 3 } },
+ { 1, 2, { 17,-3 } }, { 0, 1, { 17, 2 } }, { 0, 1, { 17, 3 } },
+ { 0, 1, { 18,-2 } }, { 1, 1, { 18,18 } }, { 1, 6, {  1, 2 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 0, 1, { -1,17 } },
+ { 1, 1, { 18, 3 } }, { 0, 1, { 17,18 } }, { 1, 1, { 17, 3 } },
+ { 0, 1, { 18, 3 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 1, 2, { 18, 2 } }, { 0, 1, { -2, 3 } }, { 0, 4, { 18,-1 } },
+ { 0, 2, { 18,18 } }, { 0, 1, { -2, 3 } }, { 1, 1, { 17,-2 } },
+ { 0, 1, { 17, 3 } }, { 0, 2, { 18,17 } }, { 0, 2, { -1,18 } },
+ { 1, 1, {  2,17 } }, { 0, 2, { 17,-2 } }, { 0, 1, { 17, 2 } },
+ { 1, 2, { 18,-3 } }, { 0, 1, { 17,-2 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 1, 1, { 17,-2 } }, { 1, 2, { 17,-3 } },
+ { 1, 1, {  1, 3 } }, { 1, 1, {  2,17 } }, { 1, 2, { 18, 2 } },
+ { 1, 1, {  2,17 } }, { 1, 1, { 18, 2 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 1, { 17,-2 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 17,-1 } }, { 0, 2, { 18,-2 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 1, 1, {  1, 3 } },
+ { 0, 2, { -2,17 } }, { 0, 2, { 18,-2 } }, { 0, 2, { 17,-2 } },
+ { 1, 1, {  2,17 } }, { 1, 1, {  1, 3 } }, { 0, 1, {  2,17 } },
+ { 0, 2, { 18,17 } }, { 0, 3, { -1,17 } }, { 1, 1, {  2,17 } },
+ { 0, 2, { 18,18 } }, { 0, 1, { 17, 2 } }, { 1, 4, { 18,-3 } },
+ { 1, 1, { 18, 1 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 1, 2, { 18,-1 } }, { 0, 1, { -1,18 } }, { 1, 6, {  1, 2 } },
+ { 1, 1, { 17, 2 } }, { 1, 4, { 18, 3 } }, { 0, 1, {  1,17 } },
+ { 0, 1, { 18, 2 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 1, 2, { 17, 2 } }, { 0, 2, { 18,-2 } }, { 0, 1, {  1,18 } },
+ { 1, 2, { 18,-3 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 1, 2, { 18,18 } }, { 1, 3, { 17,17 } },
+ { 0, 1, { -2,17 } }, { 0, 1, { 17,18 } }, { 0, 1, { -1, 3 } },
+ { 1, 1, {  2,17 } }, { 0, 2, { 18,-1 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 1, 1, { 17,-2 } }, { 1, 2, { 17, 2 } },
+ { 1, 1, { 18, 3 } }, { 0, 1, { 18, 2 } }, { 1, 2, { 17,-3 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 0, 1, { -2,17 } },
+ { 0, 1, { 17,-1 } }, { 0, 1, { 18,-1 } }, { 0, 2, { 18,17 } },
+ { 1, 2, { 17,-3 } }, { 1, 1, {  1,18 } }, { 1, 3, { 18, 2 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 3, { 18,18 } }, { 0, 1, {  1,-2 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 1, 2, { 17,-3 } }, { 1, 1, { 18,18 } }, { 0, 2, { 18, 2 } },
+ { 0, 1, { 17,18 } }, { 1, 2, { 18, 2 } }, { 1, 1, { 17,-2 } },
+ { 0, 2, { 17,-1 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 1, {  1,-2 } }, { 0, 1, { 18, 1 } },
+ { 1, 2, { 18,-2 } }, { 0, 1, { 17, 2 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 1, 1, { 17, 3 } }, { 0, 1, { 17,-1 } },
+ { 0, 1, { 18, 2 } }, { 1, 1, { 17, 3 } }, { 1, 1, { 17,-2 } },
+ { 0, 1, { 18,18 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 1, 1, { 17,18 } }, { 0, 1, { -2, 3 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 1, 2, { 18,-3 } },
+ { 0, 2, { 18,17 } }, { 0, 3, { 18, 2 } }, { 0, 1, {  1,18 } },
+ { 0, 2, { 18,17 } }, { 0, 1, { 17,-1 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 0, 1, { -2, 3 } },
+ { 0, 3, { 17,17 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 1, 1, { 17, 2 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 1, 1, { 17, 2 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18, 2 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } }, { 0, 2, { 18,17 } },
+ { 0, 2, { 18,17 } },
+};
+
+static const WavPackDecorrSpec default_specs[] = {
+ { 1, 2, { 18,18, 2,17, 3 } }, { 0, 2, { 18,17,-1, 3, 2 } },
+ { 1, 1, { 17,18,18,-2, 2 } }, { 0, 2, { 18,17, 3,-2,17 } },
+ { 1, 2, { 18,17, 2,17, 3 } }, { 0, 1, { 18,18,-1, 2,17 } },
+ { 0, 1, { 17,17,-2, 2, 3 } }, { 0, 1, { 18,-2,18, 2,17 } },
+ { 1, 2, { 18,18,-1, 2, 3 } }, { 0, 2, { 18,17, 3, 2, 5 } },
+ { 1, 1, { 18,17,18, 2, 5 } }, { 0, 1, { 17,17,-2, 2, 3 } },
+ { 0, 1, { 18,-2,18, 2, 5 } }, { 0, 1, { 17,-2,17, 2,-3 } },
+ { 1, 1, { 17,-2,17, 1, 2 } }, { 0, 1, { 17,17,-2, 2, 3 } },
+ { 1, 1, { 18, 3, 1, 5, 4 } }, { 1, 4, { 18,18, 2, 3,-2 } },
+ { 0, 1, {  1,-1,-1, 2,17 } }, { 0, 2, { 18,17, 3, 2, 5 } },
+ { 0, 1, { 18,18,18, 2,17 } }, { 0, 1, { 18,17,-1, 2,18 } },
+ { 1, 1, { 17, 3, 2, 1, 7 } }, { 0, 2, { 18,-2,18, 2, 3 } },
+ { 1, 3, { 18,-3,18, 2, 3 } }, { 0, 3, { 18,17, 2, 3,17 } },
+ { 1, 1, { 17,17, 2, 1, 4 } }, { 0, 1, { 17,18,-2, 2,17 } },
+ { 1, 1, { 18,18, 3, 5, 2 } }, { 0, 1, { 17,17, 2,18, 4 } },
+ { 0, 1, { 18,17, 1, 4, 6 } }, { 1, 1, {  3,17,18, 2,17 } },
+ { 1, 1, { 17, 3, 2, 1, 7 } }, { 0, 1, { 18,17,-1, 2, 3 } },
+ { 1, 1, { 17,17, 2, 1, 4 } }, { 1, 2, { 18,17,-1,17, 3 } },
+ { 1, 2, { 18,17, 2, 3,-1 } }, { 0, 2, { 18,18,-2, 2,17 } },
+ { 0, 1, { 17,17, 2,18, 4 } }, { 0, 5, { -2,18,18,18, 2 } },
+ { 1, 1, { 18,18,-1, 6, 3 } }, { 0, 1, { 17,17,-2, 2, 3 } },
+ { 1, 1, { 18,17,18, 2,17 } }, { 0, 1, { 18,17, 4, 3, 1 } },
+ { 0, 1, { -2,18, 2, 2,18 } }, { 1, 2, { 18,18,-2, 2,-1 } },
+ { 1, 1, { 17,17, 2, 1, 4 } }, { 0, 1, { 17,18,-2, 2,17 } },
+ { 1, 1, { 17, 3, 2, 1, 7 } }, { 1, 3, { 18,-3,18, 2, 3 } },
+ { 1, 2, { 18,18,-2, 2,-1 } }, { 1, 1, { 18,18, 3, 5, 2 } },
+ { 0, 2, { 18,18,-1, 2,17 } }, { 0, 1, { 18,-1,17,18, 2 } },
+ { 0, 1, { 17,-1, 2, 3, 6 } }, { 0, 1, { 18,-2,18, 2, 5 } },
+ { 1, 2, { 18,18,-2, 2,-1 } }, { 0, 3, { 18,18, 2, 3,17 } },
+ { 0, 1, { 17,17, 2,18, 4 } }, { 1, 1, { 17,-2,17, 1, 2 } },
+ { 0, 1, { -1, 3, 5, 4, 7 } }, { 0, 3, { 18,18, 3, 2, 5 } },
+ { 0, 1, { 17,17, 2,18, 4 } }, { 0, 1, { 18,17,-2,18, 3 } },
+ { 0, 2, { 18,18,-2, 2,17 } }, { 0, 3, { 18,17,-2, 2, 3 } },
+ { 1, 1, { 18,18,-2, 2,17 } }, { 0, 1, { 18,17, 4, 3, 1 } },
+ { 1, 2, {  3,18,17, 2,17 } }, { 1, 2, { 18,18, 2,-2,18 } },
+ { 1, 2, { 18,18,-1,18, 2 } }, { 0, 2, { 18,18,-2, 2,17 } },
+ { 1, 3, { 18,18, 2, 3,-2 } }, { 0, 3, { 18,18, 3, 2, 5 } },
+ { 0, 1, { 18,-2,18, 2, 5 } }, { 1, 1, { 17, 3, 2, 1, 7 } },
+ { 1, 3, { 18,18,-2, 2,18 } }, { 1, 1, { 17,18,18,-2, 2 } },
+ { 0, 1, { 18,-2,18, 2, 5 } }, { 0, 2, { 18,-2,18, 2, 3 } },
+ { 0, 1, { -1, 3, 4, 5, 7 } }, { 1, 1, { 17,17, 2,-1, 7 } },
+ { 0, 1, { 18,-1,-1, 2,-2 } }, { 0, 2, { 18,17, 2, 3,17 } },
+ { 0, 1, { 18,17, 2,18, 2 } }, { 0, 2, { 18,17,-1, 2,17 } },
+ { 0, 1, {  1,18, 3, 2, 5 } }, { 0, 2, { 18,-2, 4,18, 2 } },
+ { 1, 1, { 18, 3, 1, 5, 4 } }, { 0, 1, { 18,17,18, 2, 5 } },
+ { 1, 1, { 18, 3, 1, 5, 4 } }, { 0, 4, { 18,18,-2, 2,18 } },
+ { 1, 1, { 18,18, 3, 2, 5 } }, { 1, 1, { 17,17, 2, 1, 4 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 1, 1, { 18,18, 2, 1, 3 } }, { 1, 1, { 17,17, 2, 1, 4 } },
+ { 1, 2, { 17,17, 2,18, 3 } }, { 0, 1, { 18,17, 1, 4, 6 } },
+ { 1, 2, { 18,18,-2, 2,-1 } }, { 0, 1, { 18,-2,18, 2, 5 } },
+ { 1, 1, { 17, 2,18, 2,17 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 1, { 18,18, 3, 6,-1 } }, { 0, 1, { 18,17, 2,18, 3 } },
+ { 0, 1, { 18,17,-2, 2,17 } }, { 1, 1, {  3,17,18, 2,17 } },
+ { 1, 3, { 18,-3,18, 2, 3 } }, { 1, 3, { 18,18,-3,18, 2 } },
+ { 1, 1, { 18, 3, 1, 5, 4 } }, { 0, 1, { 17,-2,17, 2,-3 } },
+ { 1, 1, { 18,18, 3, 5, 2 } }, { 1, 2, { 18,18,-2, 2,-1 } },
+ { 0, 1, { 18,-1,-1, 2,-2 } }, { 1, 1, { 18, 3, 1, 5, 4 } },
+ { 0, 3, { 18,17,-1, 2,17 } }, { 1, 3, { 18,17, 2,18,-2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 1, 2, { 18,18,-2, 2,-1 } },
+ { 1, 1, { 18, 3, 1, 5, 4 } }, { 0, 4, {  3,18,18, 2,17 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 1, 1, { 18,17,-1,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 1, 1, { 18,18,18, 3, 2 } }, { 0, 1, { 17,-1, 2, 3, 6 } },
+ { 0, 1, { 17,-1, 2, 3, 6 } }, { 0, 2, { 18,17,-2, 3, 2 } },
+ { 1, 3, { 18,17, 2,-2,18 } }, { 0, 2, { 18,18, 2,17, 3 } },
+ { 0, 1, { 18,18, 2,18,-2 } }, { 0, 2, { 18,-2, 4,18, 2 } },
+ { 0, 1, { -2,18, 2, 2,18 } }, { 0, 2, { 18,17, 3, 6, 2 } },
+ { 0, 1, { 18,17,18, 2, 5 } }, { 0, 3, { 18,18,-2, 3, 2 } },
+ { 1, 1, { 18,18, 2,18, 5 } }, { 0, 1, { 17,-1, 2, 3, 6 } },
+ { 1, 4, { 18,18, 2, 3,-2 } }, { 0, 2, { 18,17,18, 2,-2 } },
+ { 0, 1, {  1,18, 3, 2, 5 } }, { 1, 4, { 18,-2,18, 2, 3 } },
+ { 1, 2, { 18, 2,18, 3,-2 } }, { 0, 2, { 18,18,18, 2, 4 } },
+ { 0, 2, {  3,17,18, 2,17 } }, { 1, 1, { 18,-1,18, 2,17 } },
+ { 1, 2, { 17,17, 2,18, 3 } }, { 0, 2, { 18,17,-2, 3, 2 } },
+ { 0, 1, {  1,-1,-1, 2,17 } }, { 0, 3, {  3,18,18, 2,17 } },
+ { 0, 1, { 18,-1,17,18, 2 } }, { 0, 1, { 18,17, 2,18, 3 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 1, { 18,17, 2,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 1, 2, { 17,17, 2,18, 3 } }, { 0, 1, { 18,17,-2, 2, 3 } },
+ { 0, 1, { 18,-2,18, 2, 5 } }, { 1, 4, { 18,-2,18, 2, 3 } },
+ { 1, 3, { 18,17, 2, 3, 6 } }, { 0, 2, { 18,18, 2,17, 3 } },
+ { 0, 2, { 18,17, 2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 1, 1, { 18,18, 3, 5, 2 } }, { 0, 2, { 18,18,-2, 2, 3 } },
+ { 1, 2, { 18,17, 2,17, 3 } }, { 0, 1, { 18,17, 2, 3,18 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 1, 4, { 18,18, 2, 3,-2 } },
+ { 0, 1, { 17,-2,17, 2,-3 } }, { 0, 1, { 17,17, 2,18, 4 } },
+ { 1, 1, { 18,18,18, 2, 4 } }, { 1, 2, { 18, 2,18, 3,-2 } },
+ { 1, 1, { 18,18,-2, 2,17 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 2, { 18,18, 2,17, 3 } }, { 0, 2, { 18,18,18, 2, 4 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,17,-2, 3, 2 } },
+ { 0, 1, {  1,-1,-1, 2,17 } }, { 1, 4, { 18,18, 2, 3,-2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 1, { 18,-2,18, 3, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 1, { 18,18,-2, 2,17 } }, { 0, 3, { 18,17, 2, 3,17 } },
+ { 1, 2, { 18,18, 2,-2,18 } }, { 0, 1, { -1, 3, 5, 4, 7 } },
+ { 1, 1, { 18, 3, 1, 5, 4 } }, { 1, 1, { 18,18,-2,18, 3 } },
+ { 0, 2, { 18,17,18, 2,-2 } }, { 0, 2, { 18,18, 2,17, 3 } },
+ { 1, 2, { 18, 2,18, 3,-2 } }, { 1, 4, { 18,18, 2, 3,-2 } },
+ { 1, 3, { 18,17, 2, 3, 6 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 1, 2, { 18,17,-2,-1,17 } }, { 0, 1, { 17,-1, 2, 3, 6 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2, 2, 3 } },
+ { 1, 1, { 18,18,18, 2, 5 } }, { 0, 1, { 17,17,-2, 2, 3 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,17, 3, 6, 2 } },
+ { 0, 2, { 18,17,18, 2, 3 } }, { 0, 3, { 18,17,-3,18, 2 } },
+ { 0, 1, { 18,18,18, 2, 3 } }, { 0, 1, { 18,-2,-3, 2, 6 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 1, 1, { 18,17,18, 2, 5 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 1, 1, { 18,17,18, 2, 5 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 1, { 18,18,18, 2, 3 } }, { 1, 1, { 17,-2,17, 1, 2 } },
+ { 1, 1, { 17,17, 2,-1, 7 } }, { 0, 1, { 18,17, 4, 3, 1 } },
+ { 1, 3, { 18,-3,18, 2, 3 } }, { 0, 1, {  1,18, 3, 2, 5 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 1, { 18,18, 3, 6, 2 } }, { 0, 1, { 17,17, 2,18, 4 } },
+ { 0, 1, { 17,17, 2,18, 4 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 1, 2, { 18,-2,18, 3, 2 } }, { 1, 1, { 17,-2,17, 1, 2 } },
+ { 1, 1, { 18,18, 3, 2, 5 } }, { 0, 1, { 18,18,-1, 2, 3 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 1, { 18,17,18, 2, 5 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 1, {  3,18,18, 2,17 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+ { 0, 2, { 18,18,-2,18, 2 } }, { 0, 2, { 18,18,-2,18, 2 } },
+};
+
+static const WavPackDecorrSpec high_specs[] = { /* spec list referenced as decorr_filters[2]; each entry is two scalar fields followed by a term list initialized with 10 values (decorr_filter_nterms[2]) */
+ { 1, 2, { 18,18,18,-2, 2, 3, 5,-1,17, 4 } }, { 0, 1, { 18,17,-2, 2,18, 3, 7, 2, 5, 4 } },
+ { 1, 2, {  1,18, 3, 6,-2,18, 2, 3, 4, 5 } }, { 0, 2, { 18,18,-2, 2,18, 3, 6, 2,17, 4 } },
+ { 1, 2, { 18,18, 2,18, 3, 2,-1, 4,18, 5 } }, { 1, 1, {  7, 6, 5, 3, 4, 2, 5, 4, 3, 7 } },
+ { 1, 1, { 17, 3,18, 7, 2, 6, 1, 4, 3, 5 } }, { 1, 1, { -2,18,18,18, 3,-2, 6, 5, 2, 1 } },
+ { 1, 2, { 18,18,-1,18, 2, 3, 6,-2,17, 5 } }, { 0, 1, { 17,17,18, 3, 6, 4, 5, 2,18,-2 } },
+ { 1, 2, {  1,18,-2, 3, 5, 2, 4,-1, 6, 1 } }, { 0, 2, { 18,18, 3, 6,18, 2, 4, 8, 5, 3 } },
+ { 0, 1, { -2, 1,18, 2,-2, 7,18, 2,-1, 5 } }, { 1, 1, {  4, 3, 8, 1, 5, 2, 5, 6, 2, 8 } },
+ { 1, 1, { 17,18, 2, 6, 3, 4,-1, 1, 8, 6 } }, { 0, 1, { 18,18, 3, 6, 3,-2, 2, 5,-1, 1 } },
+ { 0, 1, { 18,18,17,-1, 2,-2,18, 3, 4, 5 } }, { 1, 2, { 18,17, 2,-2,18, 3, 5, 7, 2, 4 } },
+ { 1, 2, { 18,18, 3, 6,-2,18, 2, 5, 8, 3 } }, { 0, 1, { 18,17, 2,18,18, 2, 6, 5,17, 7 } },
+ { 1, 2, { 18,17, 2,18, 3, 2, 6,18,-1, 4 } }, { 1, 1, {  5, 3, 6, 5, 3, 4, 1, 2, 4, 7 } },
+ { 1, 1, {  5, 3, 6, 5, 3, 4, 1, 2, 4, 7 } }, { 0, 1, { -2,18,18,18,-2, 3, 2, 4, 6, 5 } },
+ { 1, 2, { 18,17,-3, 3,-1,18, 2, 3, 6, 5 } }, { 0, 1, { 17,18, 7, 3,-2, 7, 1, 2, 4, 5 } },
+ { 1, 1, {  2,18,18,-2, 2, 4,-1,18, 3, 6 } }, { 0, 3, {  1,18, 4, 3, 5, 2, 4,18, 2, 3 } },
+ { 0, 1, { -2,18, 2,18, 3, 7,18, 2, 6,-2 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 18,18, 5, 4, 6, 4, 5, 1, 4, 3 } }, { 1, 1, { 18, 3, 6, 5, 7, 8, 2, 3, 1,-1 } },
+ { 1, 1, { 18,18,18, 2,-2, 3, 5,18, 2, 8 } }, { 0, 2, { 18,17,-2, 2, 3,18,-3, 5, 2, 7 } },
+ { 1, 1, {  1, 1,-1, 8,17, 3,-2, 2, 6,17 } }, { 0, 2, { 18,18,17, 2,-2, 3, 2, 4,18, 5 } },
+ { 1, 1, { 17,18, 2,-1, 5, 7,18, 3, 4, 6 } }, { 1, 1, {  5, 4, 5,17, 3, 6, 3, 4, 7, 2 } },
+ { 0, 1, { 17, 3, 1, 7, 4, 2, 5,-2,18, 6 } }, { 0, 1, { 17,18, 2,18, 4, 3, 5, 7,-3, 6 } },
+ { 1, 2, { 17,17,-3,-2, 2, 8,18,-1, 3, 5 } }, { 0, 1, { 17,17,18, 2, 3, 6,-2, 8, 1, 7 } },
+ { 1, 1, {  1, 2, 6,-2,18, 2, 5,-3, 7,-2 } }, { 0, 1, { 18,18, 3,18, 6, 8,-2, 2, 3, 5 } },
+ { 0, 1, { 18,17, 2,18,-2, 3, 7, 6, 2, 4 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 18,18, 2,-1, 3, 6, 1, 3, 4, 8 } }, { 0, 1, { 18,18, 3, 6, 5, 3,-2, 2,18,-1 } },
+ { 0, 1, { 18,17,-3,18, 2, 4,-2, 3, 6,17 } }, { 1, 3, {  1, 2,17, 3,18, 7,-1, 5, 2, 4 } },
+ { 1, 1, { 18, 3,18, 6, 8,18,-2, 5, 7, 2 } }, { 0, 1, { 17, 2,18, 6, 3, 2, 5, 4, 8, 1 } },
+ { 0, 1, { 18,17,-1, 2, 3,18,18, 2, 3,17 } }, { 1, 1, { 18, 7, 6, 5, 5, 3, 1, 4, 2, 4 } },
+ { 1, 1, {  6,17, 3, 8, 1, 5, 7,-1, 2, 1 } }, { 1, 1, { 18,-2,18, 3,-2, 2, 7, 4, 6,18 } },
+ { 1, 3, { 18,-3,18, 2, 3,18,-1, 7, 2, 5 } }, { 0, 2, { 18,-2, 7, 1, 3, 2, 4, 6,-3, 7 } },
+ { 1, 1, { 18,-2, 2,-3,18,-2,17,-1, 4, 2 } }, { 0, 3, { 17,17, 2, 5, 3, 7,18, 6, 4, 2 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 18,17, 4, 6, 6, 4, 5, 3, 4, 1 } }, { 0, 1, { 18, 5, 3, 6, 2, 3, 8, 1, 3, 7 } },
+ { 1, 2, { 18,17,-2, 2,18, 3, 5, 7,-1, 2 } }, { 0, 1, {  1,18,18, 3, 6,-1, 4, 8, 5, 2 } },
+ { 1, 1, {  1, 5, 3, 4, 1, 1, 3, 5, 7, 3 } }, { 0, 1, {  3,18,18, 2,18,18,-1, 2, 3,18 } },
+ { 1, 2, { 18,18,-1,18, 2, 3, 4, 6,18, 5 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 18, 3, 1, 4, 5, 2, 7, 1, 3, 6 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 2, { 18,18,-1,18, 2, 3, 5,-2, 6, 8 } }, { 1, 1, { 17,18, 4, 8, 3, 2, 5, 2, 7, 6 } },
+ { 1, 4, {  1, 2, 5,18,-2, 2, 3, 7,-1, 4 } }, { 0, 2, { 18,17,-1, 3, 6,18, 2, 3, 7, 5 } },
+ { 0, 1, { -2,18, 2,-3, 6,18, 4, 3,-2, 5 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { 17,17, 6, 2, 4, 8, 3, 5,-1,17 } }, { 1, 1, { 18, 3,18, 6, 8,18,-2, 5, 7, 2 } },
+ { 1, 2, { 17,17,-3, 2,18,-2, 8, 3, 6,-1 } }, { 1, 1, { 18,-2,17,18, 2, 3,-2, 6, 5, 4 } },
+ { 1, 2, { 18,17,-1, 3,18, 2, 5, 3, 6,-3 } }, { 0, 1, { 18,17, 2,18, 7,18, 2, 4, 3,17 } },
+ { 1, 3, { 18,18, 5, 6, 4, 3, 4,18, 6, 5 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, {  7, 6, 5, 3, 4, 2, 5, 4, 3, 7 } }, { 0, 1, { -2,18,18,18, 3, 6, 4, 2, 5, 2 } },
+ { 0, 3, { 18,17,-3,18, 3, 2, 5,-1,17, 3 } }, { 1, 1, { 17,18, 7, 3, 1, 7, 4, 2, 6, 5 } },
+ { 1, 1, { 18, 2,-2,-1,18, 5, 3,-2, 1, 2 } }, { 0, 3, { 18,18,-1, 3, 2, 7, 5,18, 4, 3 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 17,18, 2,-2, 4, 8,18, 3, 6, 5 } }, { 0, 2, { 18,17, 3, 5,-2, 7, 2,18, 3,-1 } },
+ { 1, 1, { 18, 2,-2,-1,18, 5, 3,-2, 1, 2 } }, { 0, 2, {  3,17,18,18, 2, 5, 7, 6,18, 3 } },
+ { 1, 1, { 17,18,18, 4, 3, 2,18, 7, 8,-1 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { 17, 1, 2, 3, 5, 6, 1, 4, 8,17 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 2, { 18,17,-1,18,-3, 2, 8, 3, 6,17 } }, { 1, 1, { 17,17, 1, 2, 4, 5,-1, 2, 1, 6 } },
+ { 1, 1, {  1, 2, 6,-2,18, 2,-3, 3,-2, 5 } }, { 0, 1, { 18, 3,18, 6,18, 5, 2, 4,-1, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 18,18,-1, 2,18, 3, 6, 4,-2, 7 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 2, { -1,18,18,18, 2,-2, 4, 7, 2, 3 } }, { 0, 3, {  3,17,-2, 5, 2, 7,18, 6, 4, 5 } },
+ { 0, 1, { 17, 6,18, 3, 8, 4, 5, 3, 8,18 } }, { 0, 2, { 18, 2, 6, 2,18, 3, 2, 4, 5, 8 } },
+ { 0, 1, {  3,18,18, 2,18,-1, 2,18, 2,17 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, {  3, 6,17,-2, 5, 1, 2, 7, 4, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 3, {  3,18,17, 5, 6, 2, 7,-2, 8,18 } }, { 1, 1, { 18,-1, 3, 1, 7, 2,-1, 4, 6,17 } },
+ { 1, 1, { 18, 2,-2,-1,18, 5, 3,-2, 1, 2 } }, { 0, 2, { 18, 1, 2,18, 3, 6, 5, 2, 4, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 17,-2, 2,18,18, 8, 5, 3, 2, 6 } }, { 0, 1, { 18,17, 2,18, 3, 2, 7,-2,18, 4 } },
+ { 1, 2, {  1,18, 2, 3,-1, 5, 6, 4, 7,17 } }, { 0, 2, { 18,17, 3, 6,-2, 2, 3, 8, 5,17 } },
+ { 0, 2, { 18,18, 3, 2,18,-1, 2, 4, 3,17 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 2, { 17,-1,18, 2, 3,-2, 5,18, 2, 7 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 2, { 18,-3,18, 2, 3,-2,18, 5, 6,-3 } }, { 0, 2, { 18,17, 3, 5,-2, 7, 2,18, 3,-1 } },
+ { 1, 1, {  1,18,-1, 2, 3, 1,-2, 8, 2, 5 } }, { 0, 1, { 18,18, 3, 6,18, 2, 3, 4, 8, 5 } },
+ { 0, 1, { -2, 1,18, 2,-2, 5, 7,18, 2,-1 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 17,18,-1, 2, 8, 3, 4, 5, 1, 7 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 2, { 18,18,-1, 2,18, 3,-2, 5, 4, 2 } }, { 1, 1, { 18,17, 2,18, 3, 8, 5, 2, 7,17 } },
+ { 0, 1, { 18,18, 3,18, 6, 8,-2, 2, 3, 5 } }, { 0, 1, { 18,18, 2,18, 2, 6,18, 2,17, 7 } },
+ { 1, 3, { 18,17,18, 2, 8,18, 5,-1, 3, 6 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 18, 7, 6, 5, 5, 3, 1, 4, 2, 4 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 2, { 18,17,-1, 3, 6,18, 2, 5, 8, 3 } }, { 0, 1, { 17,18,18, 4, 7, 2, 3,-2,18, 5 } },
+ { 1, 2, { 18, 1, 2, 6, 2, 5,18, 2, 4, 8 } }, { 0, 4, { 18, 4, 1, 2, 3, 5, 4, 1, 2, 6 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 2, { 18,17, 2,-1,18, 3,-3, 5, 2, 4 } },
+ { 0, 1, { 17,17, 3, 6, 3, 5,-2, 2,18,-1 } }, { 0, 2, { 18,18, 3,-2,18, 2,-3, 5, 3, 6 } },
+ { 1, 1, { 17,17, 2, 4, 1, 3, 5, 2, 6,-3 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { 17, 1, 3, 2, 7, 1, 6, 3, 4, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { 17,-1,18, 2, 1, 5, 3, 8,-1,-2 } }, { 1, 1, { 17,18,-1, 8, 2, 5, 3, 4, 1, 6 } },
+ { 1, 2, {  1,18, 3,-1, 5, 1, 2, 4, 7, 6 } }, { 0, 1, { 18,18, 3, 6, 5, 3,-2, 2,18,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, {  1,18,-1, 3, 8, 5, 6, 1, 2, 3 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 2, { 18,18, 2, 3, 6,18,-1, 4, 2, 3 } }, { 1, 1, {  1, 3, 5,18, 2, 6, 7, 2, 3, 1 } },
+ { 1, 1, {  1, 3, 8,18, 5, 2, 7, 1, 3,-2 } }, { 0, 2, { 17, 2,18, 3, 6, 2, 4, 5, 8, 3 } },
+ { 0, 1, { 18,17, 2,18, 3, 2, 7,-2,18, 4 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 2, { 18,-3,18,-1, 3,-2, 5, 7, 1, 2 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 3, { 18,18, 2, 6,18, 5,18, 2, 3,17 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 3, {  1,-1, 1, 3,-2, 2, 5, 7,-3,18 } }, { 1, 2, { 18, 7, 3,-3, 2, 8, 2, 5, 4,17 } },
+ { 1, 1, {  1, 4, 5, 1, 3, 4, 6, 7, 8, 3 } }, { 0, 1, { 18,17, 2,18,-1, 2, 3,18, 2, 4 } },
+ { 0, 2, { 18,18,-2,18, 2, 3, 4, 7, 5,17 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 17,18, 2, 1, 3, 2, 5, 1, 2, 3 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 2, { 18,18,-1, 2, 3, 5, 8, 6, 1,-2 } }, { 0, 1, { 17,18, 8, 3, 4, 6, 5, 2, 8, 7 } },
+ { 1, 2, {  1, 3,-2,18, 2, 5, 1, 7,-1,-2 } }, { 0, 3, { 18,17,-1, 3,18, 2, 3, 6, 4,17 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 2, { 18,18, 4,18, 6, 7, 8, 3,18, 2 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 2, { 17,-3,17, 2,-2, 8, 3,18, 4,-3 } }, { 1, 1, { 18,17, 3, 5, 6, 2, 8, 1, 3, 7 } },
+ { 0, 1, { 18,18, 3, 6, 5, 3,-2, 2,18,-1 } }, { 0, 3, { 18,18, 2, 6,18, 5,18, 2, 3,17 } },
+ { 1, 1, { 18,18, 5, 4, 6, 4, 5, 1, 4, 3 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 2, {  3,17,18,-3, 2, 5,18, 6,-1, 7 } }, { 1, 1, { 17,18, 3, 2, 5,-1, 6, 8, 4, 7 } },
+ { 1, 1, { 18, 1,-2, 3, 2, 1, 7, 6, 3, 4 } }, { 0, 3, {  1, 2,17, 3,18, 2, 7, 5, 4,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, { 17,-2, 2,18,18, 8, 5, 3, 2, 6 } }, { 0, 2, { 18, 5,18, 2, 3, 7,-2, 1, 6, 8 } },
+ { 0, 1, {  2,-1,18,-1, 2, 4,-3, 5,18, 3 } }, { 0, 1, {  3,17,18, 5, 2,18, 7, 3, 6, 5 } },
+ { 1, 4, {  1, 2, 5,18,-2, 2, 3, 7,-1, 4 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, {  1,18, 2, 1, 3, 4, 1, 5, 2, 7 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { 17,17,18, 2, 4, 5,18,-2, 6, 3 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 2, { 18,18,-1, 3, 5, 6, 8,18, 2, 3 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { 18,18, 4, 6, 8,18, 7, 3, 2, 5 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 2, { -1,18,18,18, 2, 4,-2, 2, 3, 6 } }, { 0, 2, { 18,-2, 7, 1, 3, 2, 4, 6,-3, 7 } },
+ { 1, 1, { 17,18, 8, 3, 4, 6,-2, 5, 3, 8 } }, { 0, 2, { 18, 1, 2, 6, 2, 8, 3,18, 5, 4 } },
+ { 1, 1, {  3,18,18, 2,18, 2,18, 3, 2,18 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 1, 1, {  3,17,18, 5, 2, 6, 7, 1, 4, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } }, { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2, 8 } },
+};
+
+static const WavPackDecorrSpec very_high_specs[] = { /* spec list referenced as decorr_filters[3]; each entry is two scalar fields followed by a term list initialized with 16 values (decorr_filter_nterms[3]) */
+ { 1, 2, { 18,18, 2, 3,-2,18, 2, 4, 7, 5, 3, 6, 8,-1,18, 2 } },
+ { 0, 1, { 18,18,-1,18, 2, 3, 4, 6, 5, 7,18,-3, 8, 2,-1, 3 } },
+ { 1, 2, {  1,18,-2, 4,18, 2, 3, 6,-1, 7, 5,-2,18, 8, 2, 4 } },
+ { 0, 1, { 17,17, 2, 3, 4,18,-1, 5, 6, 7,18, 2, 8,17, 3,-2 } },
+ { 1, 1, { 18,18, 2,18, 3, 2,18, 4,-1, 3,18, 2, 6, 8,17, 5 } },
+ { 0, 2, { 18,17, 2, 3,-2, 5,18,-3, 2, 4, 7, 3, 6, 8, 5,17 } },
+ { 1, 1, { 18,-2, 2,-3,18, 5,-2,18, 2, 3, 6, 2,17, 4, 7,-1 } },
+ { 1, 1, { 17, 8,18, 3,-2, 2, 5, 4,18, 6, 3, 8, 7, 2, 5, 4 } },
+ { 0, 2, { 18,17,-2, 2,18, 3, 2, 5,-3, 4, 7,18, 3, 8, 6, 2 } },
+ { 1, 1, {  3, 6, 5, 5, 1, 3, 7, 4, 2, 6, 4,18, 3, 7, 5, 6 } },
+ { 1, 2, {  1,18, 3, 2,-2, 1, 5, 4, 6, 2, 7, 1, 8, 3,-1, 1 } },
+ { 0, 1, { 18,18, 2, 3, 6, 3, 5,-2, 2, 4,18, 3,-2,-1, 6, 7 } },
+ { 0, 1, { -2,18, 2,18, 7, 2, 6,-2, 3, 4,18,18, 2,-3, 8, 5 } },
+ { 0, 2, { 18,18,18, 2, 4, 3,18, 5, 3, 6,-2, 2, 4,18, 8, 7 } },
+ { 0, 1, { -2, 1,18, 2,-2,18,-1, 5, 7, 2, 3, 4,18, 2, 6, 2 } },
+ { 1, 1, { 17,18, 3, 2, 1, 7,-1, 2, 4, 3, 5, 6,-2,18, 7, 8 } },
+ { 1, 1, { 18,18, 2,18, 3, 4, 6,-2,18, 5, 8, 2, 3, 7, 4,-1 } },
+ { 0, 1, { 18,18,18,-1, 2, 3, 4, 6, 8,18, 3, 5, 2, 6, 7, 4 } },
+ { 1, 1, { 17,-2,18,18, 2, 5, 3, 8, 2,-1, 6, 1, 3, 4, 7, 5 } },
+ { 0, 1, { 17,17,18, 2, 3, 6,-2, 8, 1, 7, 5, 2, 3, 1, 4, 8 } },
+ { 1, 1, { 17,17, 3, 2, 7, 1, 4, 3, 6, 2, 5,-2, 8, 7,18, 6 } },
+ { 0, 1, { 18,17,-2, 2,18, 3,-3, 7, 6, 5, 2, 4,-1, 8, 3,17 } },
+ { 1, 1, {  2,18,18,-2, 2, 4,-1, 5,18, 3, 8, 6, 2, 7,17, 4 } },
+ { 0, 1, { 17, 3, 6, 8, 5, 4, 3, 8, 1,18, 7, 2, 4, 5, 6, 3 } },
+ { 1, 2, { 17,18, 4, 8, 3, 2, 5, 7, 6, 8, 2, 7,-2,18, 3, 4 } },
+ { 1, 1, {  6, 5, 5, 3, 4, 7, 3, 2, 4, 6, 3, 7, 1, 5, 2, 4 } },
+ { 1, 1, {  1,18,-1, 2, 1, 3, 8,-2, 2, 5, 6, 3, 8, 7,18, 4 } },
+ { 0, 1, {  1,17,-1,18, 3, 2, 5, 4, 6, 7, 8, 3, 4, 2, 1,-2 } },
+ { 0, 1, { 18, 2,18,18, 2,18, 6,-2,18, 7, 5, 4, 3, 2,18,-2 } },
+ { 0, 3, {  1, 4,18, 3, 2, 4, 1, 5, 2, 3, 6,18, 8, 7, 2, 4 } },
+ { 0, 1, { 17,-2, 1,-3, 2,18, 3,-2, 4,18, 3, 6, 7,-3, 2, 8 } },
+ { 1, 1, { 17,18,18, 4, 2, 3, 7, 6,18, 8, 5,-1, 4, 2, 3,17 } },
+ { 1, 2, { 18,-1,17,18, 2, 3,-2,18, 5, 8, 2, 4, 3, 7, 6,-1 } },
+ { 1, 1, { 18,18,18,-2, 4, 2, 3,18, 5, 8, 2, 4, 6, 7,-2, 3 } },
+ { 1, 2, { 18,18,-2,18,-1, 3, 2, 5,18,-2, 7, 2, 3, 4, 6, 8 } },
+ { 0, 1, { 17,18,-1, 2, 4,18, 8, 3, 6, 5, 7,-3, 2, 4, 3,17 } },
+ { 1, 1, { 18,18,17, 2,-1,18, 3, 2,18, 6, 5, 4,18, 7, 2,-1 } },
+ { 0, 2, {  1,18,-1,18, 3, 2, 4, 6,-3, 7,-1, 5, 1, 2, 3, 8 } },
+ { 1, 1, {  1,17,-2, 2,-3, 6, 3, 5, 1, 2, 7, 6, 8,-2, 4, 1 } },
+ { 0, 1, { 17,-1, 5, 1, 4, 3, 6, 2,-2,18, 3, 2, 4, 5, 8,-1 } },
+ { 0, 2, { 18,18,17, 2, 3,-2, 5,18, 2, 4, 7, 8, 6,17, 3, 5 } },
+ { 1, 1, {  1, 5, 1, 3, 4, 3, 7, 5, 1, 3, 6, 1, 2, 4, 3, 8 } },
+ { 1, 2, {  1,-1, 3, 2,18, 7,-2, 5, 2, 6, 4, 3,-1,18, 8, 7 } },
+ { 0, 2, { 18,17, 3,18, 2, 5, 4, 3, 6, 2, 7, 8,18, 3, 4, 5 } },
+ { 1, 1, {  3, 6,17, 8, 7, 5,18,-1, 1, 2, 3, 4, 2, 6, 8, 1 } },
+ { 0, 2, { 18,18, 3,-3,18, 2, 6, 5, 3, 7,18, 4,-2, 8, 2, 3 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 17, 1, 7, 2, 3,18,-2, 3, 6, 4, 2, 7, 8, 5, 3,17 } },
+ { 1, 1, {  3, 6, 5, 5, 1, 3, 7, 4, 2, 6, 4,18, 3, 7, 5, 6 } },
+ { 0, 1, { 18,18,18, 2, 4,-1,18, 8,-1, 2, 3, 4, 6,-2, 1, 7 } },
+ { 1, 1, { 18,-2,17,18, 2, 6, 3,-2, 5, 4, 7, 1,-3, 8, 2, 6 } },
+ { 0, 1, { 17,18,18, 4, 2, 7, 3, 6,-2,18, 8, 4, 5, 2, 7,17 } },
+ { 1, 1, { 18,18, 5, 4, 6, 4, 1, 5, 4, 3, 2, 5, 6, 1, 4, 5 } },
+ { 0, 1, { 18,18,-2,18, 2,-3, 3, 8, 5,18, 6, 4, 3,-1, 7, 2 } },
+ { 1, 1, { 18, 2,-2,-3,18, 5, 2, 3,-2, 4, 6, 1,-3, 2, 7, 8 } },
+ { 0, 1, { 18, 3, 5, 8, 2, 6, 7, 3, 1, 5, 2,-1, 8, 6, 7, 4 } },
+ { 1, 1, {  4, 3, 8, 1, 5, 6, 2, 5, 8,-2, 2, 7, 3,18, 5, 4 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 17, 3,18,18, 7, 2, 4,18, 6, 2, 3,-1, 8, 5,18,-3 } },
+ { 0, 1, {  3,17,18, 2,18, 6, 7,-3,18, 2, 5, 6, 3, 8, 7,-1 } },
+ { 1, 1, { 18,18, 2,18,18, 2,-1, 7, 3,18, 5, 2, 6, 4,-1,18 } },
+ { 0, 3, { 18, 3, 4, 1, 5, 2,18, 4, 2, 3,18, 7, 6, 1, 2, 4 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 17, 1,18, 2, 3, 6, 4, 5, 7,18, 3, 8, 2, 4,-2,17 } },
+ { 1, 2, { 18,17, 2, 3, 5,18, 6,-2, 7, 3, 2, 4,18, 8,-1, 5 } },
+ { 0, 2, {  1,18,-1,18, 3, 2, 4, 6,-3, 7,-1, 5, 1, 2, 3, 8 } },
+ { 1, 1, {  1,18,-1, 8, 2, 6, 3,-2, 1, 2, 5, 4,-3, 8, 6, 3 } },
+ { 0, 1, { 18,18, 2,18, 2,18, 7, 6,18, 2,-2, 3, 5, 4,18, 8 } },
+ { 1, 2, { 18,17, 2, 3,18,-1, 2, 3, 6,18, 5, 4, 3, 7, 2, 8 } },
+ { 1, 2, { 18,18, 3,-2, 4,18, 5, 7, 6, 2, 4,-3, 8, 5,18, 3 } },
+ { 1, 1, { 17,-2,18,18, 2, 5, 3, 8, 2,-1, 6, 1, 3, 4, 7, 5 } },
+ { 1, 1, {  3,17,18, 5, 7, 2, 4, 6, 1, 8,-1, 3, 7, 4, 1, 2 } },
+ { 0, 2, {  1,-2, 2,18, 3, 5, 2, 4, 7,-1, 2, 3, 5,18,-2, 4 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, {  1, 2,-2, 6,18,-3, 2, 7, 3,-2, 5, 6, 1, 8, 2, 4 } },
+ { 0, 1, { 18,18,18, 3,-2, 6,18, 2, 4, 3, 5, 8, 7, 6, 2,-2 } },
+ { 1, 1, {  1, 5, 1, 3, 4, 3, 7, 5, 1, 3, 6, 1, 2, 4, 3, 8 } },
+ { 0, 1, {  3,17,18, 2, 5,18, 6, 7, 5,-2, 2, 4,18, 3, 6, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 2, { 17,-1,18, 2, 4,-1, 8, 3,18, 7,-3, 4, 5, 1, 2,-2 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 8, 6, 4, 5, 7,-1 } },
+ { 1, 1, { 18,18, 3, 6, 4, 8,-2, 2, 5, 3, 7,18, 6, 8, 4, 2 } },
+ { 1, 1, { 17,18,18,-2, 5, 2, 3, 1, 4,-1, 8, 6, 5, 3, 2,18 } },
+ { 1, 1, { 17,17, 1, 2, 4, 5, 2, 6,-1, 3, 1, 1,-2, 4, 2, 7 } },
+ { 1, 1, { 17, 1, 7, 2, 3,18,-2, 3, 6, 4, 2, 7, 8, 5, 3,17 } },
+ { 0, 1, { 18,17,-2,-3, 1, 2, 3, 2, 5, 4, 7,-3, 6,-2, 2, 1 } },
+ { 1, 1, {  1, 3, 5,18, 1, 2, 7, 3, 6, 2, 5, 8,-1, 1, 4, 7 } },
+ { 1, 1, { 17, 3, 6, 8, 1, 4, 5, 3,-2, 7, 2, 8, 5, 6,18, 3 } },
+ { 1, 1, { 17,18, 2, 4, 8,-2, 3, 1, 5, 6, 7, 1, 2, 3, 4, 7 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, {  3, 1, 8,18, 5, 2, 3,18, 6, 7,-2, 4, 3, 2, 8,18 } },
+ { 0, 1, { 18,17, 2,18, 3, 4,-1,18, 7, 6, 2, 8, 4,18,18, 5 } },
+ { 0, 1, { 18,18, 2,18,18, 2, 7,-2, 6, 5, 4, 3,18, 3, 2,17 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 17, 8,18, 3, 2, 1, 5, 4, 6,-1, 3,-3, 8,18, 7, 2 } },
+ { 1, 2, { 18,17,18, 2, 3, 5,-2,18, 6,-1, 2, 3, 7, 4, 8,17 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 8, 6, 4, 5, 7,-1 } },
+ { 1, 2, { 18,18,-2,17, 2,18, 3, 4,18, 8, 7,-1, 2, 4, 5,17 } },
+ { 0, 2, { 17,-3,17, 3, 2,-2,18, 8, 4,-3, 2,18, 5, 3,-2, 6 } },
+ { 0, 1, { 18,18, 2,18,18, 2, 7,-2, 6, 5, 4, 3,18, 3, 2,17 } },
+ { 0, 2, {  1,18,-1, 3, 5, 2,-3,18, 7, 3,-1, 6, 4, 2,17, 5 } },
+ { 1, 1, { 17,-2,17, 2,-3, 1, 5,-1, 4, 6, 3, 2, 8, 7,-2, 5 } },
+ { 1, 1, {  1,18, 1, 3, 5, 8, 6, 2, 3,-1, 7, 1, 4, 8, 5,-3 } },
+ { 0, 2, {  3,18,18, 2,18,-2, 6, 5, 7, 2, 4,18, 3, 6,-3, 5 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, {  3, 6,17, 8, 7, 5,18,-1, 1, 2, 3, 4, 2, 6, 8, 1 } },
+ { 0, 4, { 18, 2,17, 3,18,-2, 2, 6,18, 2, 7, 3, 5, 4, 8,18 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { 18,18, 2, 3, 6, 3, 5,-2, 2, 4,18, 3,-2,-1, 6, 7 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 17, 1, 2, 5, 3,-2, 1, 4, 3, 7, 6,-3, 2, 1, 1, 2 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 18,18,-2,18,-2, 2, 3, 6,18, 4,-1, 2, 3, 8, 1, 4 } },
+ { 1, 1, { 17,-2,17, 2,-3, 1, 5,-1, 4, 6, 3, 2, 8, 7,-2, 5 } },
+ { 0, 1, { 17,17,18, 3, 2,18,18, 6, 8, 2,-2, 3, 5, 4,17,18 } },
+ { 1, 1, {  1, 5, 1, 3, 4, 3, 7, 5, 1, 3, 6, 1, 2, 4, 3, 8 } },
+ { 1, 1, {  1, 3,-3,18,18, 6, 5,18, 2,-1, 3, 8, 7,-3, 4,17 } },
+ { 1, 1, { 18, 1, 2, 1, 3, 8, 7, 4, 1, 5, 2,-1,-3,18, 6, 2 } },
+ { 0, 1, { 18, 3, 5, 2, 6, 8,18, 5, 7, 2, 3,-1, 6, 7, 8, 5 } },
+ { 0, 2, { 18, 3,-2, 7, 8, 2, 5, 4,-3, 8, 3, 2,18, 5, 4, 6 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 3, {  1, 1, 2, 5, 2, 7, 4, 3,-1,18,-2, 8, 2, 1, 6, 7 } },
+ { 0, 1, {  3,17,18, 5, 2, 6, 7,18, 4, 5, 3, 6,18, 2, 7, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, {  1,18, 1, 2, 3, 5, 1, 2, 6, 7, 4, 3, 8, 1,17, 5 } },
+ { 1, 2, { 17,-1,18,-2, 2, 3, 5,18, 2, 4, 6, 7, 3,-1, 5, 8 } },
+ { 1, 1, { 18,18,-3,18,-2, 2, 3,-2,18, 6, 4, 5, 8, 3,17,-3 } },
+ { 1, 1, { 18, 7, 6, 5, 5, 3, 1, 4, 2, 7, 3, 4,-3, 6,18, 8 } },
+ { 0, 2, { 18,18, 2, 3, 5,18, 2, 4, 3, 6,18, 7, 8,-1, 5, 2 } },
+ { 0, 1, { 18,17,-1, 2,18, 3, 2,18, 4, 3,18, 2, 6, 5, 8,17 } },
+ { 0, 2, { 18,17, 2, 3,18, 5,-1, 6, 7, 8, 2, 3, 4, 5,18, 6 } },
+ { 1, 2, { 18,-3,18, 2, 3,-2,-3, 5,18, 7, 6, 2, 4, 3, 8,-2 } },
+ { 1, 1, { 17,18,18,-2, 2, 3, 5, 4, 8,18,-1, 5, 3, 6,-2, 7 } },
+ { 1, 2, { 18,17, 2,-2,18, 3,-1, 4,18, 2, 7, 5, 3, 8, 6, 4 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, {  1, 5, 1, 3, 4, 3, 7, 5, 1, 3, 6, 1, 2, 4, 3, 8 } },
+ { 0, 2, { 18,18, 3, 3,-2, 2, 5,18, 6, 3,-1, 4, 7,-1, 1, 2 } },
+ { 0, 1, { -2, 1,18, 2,-2, 5, 7,18, 3, 2, 6, 2,-1, 4,-2,17 } },
+ { 0, 2, { 18,18,18, 2, 3,-2,18, 5, 4, 2, 6, 8, 3,-2, 4,18 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 17,18,-1, 3, 2, 5, 1, 3, 2, 8, 4, 7, 6, 2,-1, 5 } },
+ { 1, 1, { 17,18,18, 4, 2, 3, 7, 6,18, 8, 5,-1, 4, 2, 3,17 } },
+ { 0, 1, { 18,18,-2,18, 2, 3, 4, 5, 6,18, 8, 2, 3, 7,-2, 4 } },
+ { 0, 1, { 18,-2,18,18,-3,-2, 2, 3, 5, 8, 1, 2, 6, 4, 7,-1 } },
+ { 0, 1, { 18,17, 2,18, 3,-2, 2, 7, 6, 4,18, 3, 8, 7, 4, 2 } },
+ { 1, 1, { 17,18,18, 4, 2, 3, 7, 6,18, 8, 5,-1, 4, 2, 3,17 } },
+ { 1, 1, { 18,17,18, 2, 5, 3,-2,18, 6, 2, 3, 4, 8, 7, 5,-1 } },
+ { 0, 1, {  2,-1,18,-1, 2, 4,-3,18, 5, 3, 6,18, 2, 4, 7, 8 } },
+ { 1, 1, { 17,18, 8, 3, 6, 4,-1, 5, 2, 7, 3, 8, 6, 5,18, 4 } },
+ { 0, 2, { 18, 3,-2, 7, 8, 2, 5, 4,-3, 8, 3, 2,18, 5, 4, 6 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, {  1,18,-1, 8, 2, 6, 3,-2, 1, 2, 5, 4,-3, 8, 6, 3 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { 17,18,18, 4, 2, 7, 3, 6,-2,18, 8, 4, 5, 2, 7,17 } },
+ { 1, 2, { 18,-1,18, 3,-2,18, 2, 5, 3, 6, 7, 2,-1,18, 8, 4 } },
+ { 1, 2, {  1,18,-2, 4,18, 2, 3, 6,-1, 7, 5,-2,18, 8, 2, 4 } },
+ { 1, 2, {  1,18,-3, 2, 3,18,-1, 5, 6, 2, 8, 3, 4, 1,-2, 7 } },
+ { 0, 1, {  1,17,-1,18, 3, 2, 5, 4, 6, 7, 8, 3, 4, 2, 1,-2 } },
+ { 1, 1, { 18,17,18, 4, 3, 5, 1, 2, 6, 3, 4, 7, 1, 8, 5, 2 } },
+ { 0, 1, { 18,-2, 7, 1, 3, 2,-3, 4, 6,-2, 7, 8, 1, 5, 4, 3 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 2, { 18,18,18,-2, 2, 5, 3, 7,18, 2, 4,-3, 5, 6, 3, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 3, {  3,18,-1, 5, 2, 7,18, 6, 5, 2, 4, 3,-1, 7,18, 6 } },
+ { 0, 2, { 18,18,18, 4, 3, 2, 6, 4, 8,18, 5, 3, 2, 7,-2, 6 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 2, { 18,18,18, 2, 3,-2,18, 5, 4, 2, 6, 8, 3,-2, 4,18 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 17, 8,18, 3, 2, 1, 5, 4, 6,-1, 3,-3, 8,18, 7, 2 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18,18, 2, 4, 6,-2, 2, 8, 3, 4,18, 7,-1, 6 } },
+ { 0, 1, { 18, 1,-2, 2, 4, 1, 3,-1, 2, 5, 7, 1, 6, 8,-2,17 } },
+ { 0, 1, { 17,17,18, 2, 5, 4,18, 3, 8, 7, 4, 6, 8, 1, 5, 2 } },
+ { 1, 2, { 18,18, 5, 4, 6, 3, 4,18, 8, 4,-1, 7, 5, 3, 6, 2 } },
+ { 0, 1, { 18,18,-3,18, 3, 6, 2, 5, 7,18, 3, 8,-1, 4, 5, 2 } },
+ { 1, 1, { 18, 2,-2,-3,18, 5, 2,-2, 4, 3, 6,18, 8,-1, 2, 7 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 17, 1, 7, 2, 3,18,-2, 3, 6, 4, 2, 7, 8, 5, 3,17 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { 17,18, 3,18, 2, 5, 4, 7,-3, 6, 3, 2,18, 4, 7, 3 } },
+ { 1, 1, {  1, 7, 4, 5, 3, 4, 5, 1, 3, 6, 3, 2, 4, 8,-2, 7 } },
+ { 0, 1, {  1,18,-1,-2,18, 3, 2,-1, 6, 7, 4, 5, 3,18, 2,-3 } },
+ { 1, 1, { 18,18,-1, 3, 6,18, 5, 4, 8, 2, 3, 6,18, 7, 4,-2 } },
+ { 0, 2, { 18,18, 2, 6,18, 2,18, 5, 3,18, 2, 4, 7, 8, 3,18 } },
+ { 1, 1, {  3,18,18, 5,18, 6, 2, 4, 7,-2,18, 5, 8, 6, 3, 2 } },
+ { 0, 1, { 18,-2, 7, 1, 3, 2,-3, 4, 6,-2, 7, 8, 1, 5, 4, 3 } },
+ { 1, 1, { 18,-2,18, 2, 5,18, 3,-2, 4, 7, 2,-1, 8, 6, 5, 1 } },
+ { 1, 1, { 17,17, 5,18, 4, 1, 2, 8, 6, 4,-2, 3, 5,-1, 1, 8 } },
+ { 0, 2, {  1, 2,17, 3, 7,18, 2,-1, 4, 5,18, 2, 7, 3, 6, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, {  3, 6,17, 8, 7, 5,18,-1, 1, 2, 3, 4, 2, 6, 8, 1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 2, { 18,18,18, 2,-2, 3, 6, 4, 8,18, 2, 5, 7, 4, 3, 6 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 18, 1, 8, 3, 5, 6, 4,-1, 8, 3, 7,18, 2, 5, 8, 4 } },
+ { 1, 1, { 17,18, 5, 2, 4, 3, 1, 6,-2, 1, 3, 2, 4, 5,-1,17 } },
+ { 1, 1, { 18,17, 2,18, 3,-3, 7, 2, 6, 4, 3, 5,18, 8, 2,-2 } },
+ { 1, 1, { 18,17,18, 4, 3, 5,-1,18, 2, 7, 8, 4, 6, 3,18, 5 } },
+ { 0, 1, { 18,17,18,-2, 2,-3, 3, 4, 8, 5, 2,18, 6, 3, 7,-2 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 17,18, 8, 3, 4, 6,18, 5,-2, 3, 8, 5, 2, 4, 7, 6 } },
+ { 0, 1, { 18,-2, 3, 5, 1, 7, 3, 2, 6,-3, 4, 1, 5, 8, 3,-2 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, {  3,17,18, 5,-1,18, 2, 6, 7,18, 5, 3,-3,-1, 6, 2 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 3, { 18,17,-2, 3,-1,18, 2, 5, 3, 7, 6, 2, 4, 8,18, 5 } },
+ { 0, 1, { 18,-1,18, 2,18, 3, 5,18, 2, 8,18, 5, 4,-1, 6, 2 } },
+ { 1, 2, { 18,-2,18,18, 2, 3, 4,-3, 2, 5,18, 7, 4, 3, 8, 6 } },
+ { 0, 2, { 17,-1,18, 2,-1, 1, 7, 3, 8, 5,-2, 4, 1, 2,-3, 6 } },
+ { 0, 1, { 18,17, 2,18, 2,18, 6, 7, 4, 3,18, 5, 2,-2,17, 8 } },
+ { 0, 3, { 18,17, 2, 3,-3,-1,18, 2, 4, 5,18, 7, 3, 2,-3, 6 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 2, {  3,18,18,18, 2, 6, 5,18, 7, 2, 4, 6,18, 5, 3, 8 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { 18,18, 3, 6, 3,-2, 2,18, 5,-1, 7, 3, 4,-2, 2, 6 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 1, 1, { 18,17,18,18,-2, 2, 3,-3,18, 6, 4, 2,-2, 8, 3, 7 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { 18,18,18, 4, 2, 7, 8,18, 3, 2,-2, 4, 7, 6,17, 5 } },
+ { 1, 1, { 18,18,-1,-2, 8, 3,18, 6, 3, 5, 8, 2, 4, 7, 1, 6 } },
+ { 1, 1, {  1,-3, 3,18,18, 2,-1, 3, 6, 5,18, 4, 7,-2, 8, 3 } },
+ { 1, 1, {  1,18, 4, 2, 5,18, 1, 3,-1, 6, 1, 4, 8, 2, 5, 1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+ { 0, 1, { -1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 } },
+};
+
+static const WavPackDecorrSpec * const decorr_filters[] = { /* first element of each spec table; index order is fast, default, high, very_high */
+    &fast_specs[0], &default_specs[0], &high_specs[0], &very_high_specs[0],
+}; /* decorr_filter_sizes[] and decorr_filter_nterms[] use the same index order */
+
+static const uint16_t decorr_filter_sizes[] = { /* number of entries in each spec table, in decorr_filters[] order */
+    FF_ARRAY_ELEMS(fast_specs),
+    FF_ARRAY_ELEMS(default_specs),
+    FF_ARRAY_ELEMS(high_specs),
+    FF_ARRAY_ELEMS(very_high_specs),
+};
+
+static const uint8_t decorr_filter_nterms[] = { 2, 5, 10, 16 }; /* term values listed per entry, in decorr_filters[] order: 5/10/16 match the default/high/very_high rows; 2 is presumably the fast_specs row length (TODO confirm) */
+
+static const int8_t nbits_table[] = { /* nbits_table[i] = number of bits needed to represent i (0 for i == 0); 8 rows of 32 entries cover i = 0..255 */
+ 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
+};
+
+#endif /* AVCODEC_WAVPACKENC_H */
diff --git a/libavcodec/webp.c b/libavcodec/webp.c
index c475744..e715c4b 100644
--- a/libavcodec/webp.c
+++ b/libavcodec/webp.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2013 Aneesh Dogra <aneesh@sugarlabs.org>
  * Copyright (c) 2013 Justin Ruggles <justin.ruggles@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,16 +31,20 @@
  * Lossless decoder
  * Compressed alpha for lossy
  *
+ * @author James Almer <jamrial@gmail.com>
+ * Exif metadata
+ *
  * Unimplemented:
  *   - Animation
  *   - ICC profile
- *   - Exif and XMP metadata
+ *   - XMP metadata
  */
 
 #define BITSTREAM_READER_LE
 #include "libavutil/imgutils.h"
 #include "avcodec.h"
 #include "bytestream.h"
+#include "exif.h"
 #include "internal.h"
 #include "get_bits.h"
 #include "thread.h"
@@ -191,6 +195,7 @@ typedef struct WebPContext {
     enum AlphaFilter alpha_filter;      /* filtering method for alpha chunk */
     uint8_t *alpha_data;                /* alpha chunk data */
     int alpha_data_size;                /* alpha chunk data size */
+    int has_exif;                       /* set after an EXIF chunk has been processed */
     int width;                          /* image width */
     int height;                         /* image height */
     int lossless;                       /* indicates lossless or lossy */
@@ -303,7 +308,7 @@ static int huff_reader_build_canonical(HuffReader *r, int *code_lengths,
     if (max_code_length == 0 || max_code_length > MAX_HUFFMAN_CODE_LENGTH)
         return AVERROR(EINVAL);
 
-    codes = av_malloc(alphabet_size * sizeof(*codes));
+    codes = av_malloc_array(alphabet_size, sizeof(*codes));
     if (!codes)
         return AVERROR(ENOMEM);
 
@@ -1027,7 +1032,7 @@ static int apply_color_indexing_transform(WebPContext *s)
     ImageContext *img;
     ImageContext *pal;
     int i, x, y;
-    uint8_t *p, *pi;
+    uint8_t *p;
 
     img = &s->image[IMAGE_ROLE_ARGB];
     pal = &s->image[IMAGE_ROLE_COLOR_INDEXING];
@@ -1060,16 +1065,33 @@ static int apply_color_indexing_transform(WebPContext *s)
         av_free(line);
     }
 
-    for (y = 0; y < img->frame->height; y++) {
-        for (x = 0; x < img->frame->width; x++) {
-            p = GET_PIXEL(img->frame, x, y);
-            i = p[2];
-            if (i >= pal->frame->width) {
-                av_log(s->avctx, AV_LOG_ERROR, "invalid palette index %d\n", i);
-                return AVERROR_INVALIDDATA;
+    // switch to local palette if it's worth initializing it
+    if (img->frame->height * img->frame->width > 300) {
+        uint8_t palette[256 * 4];
+        const int size = pal->frame->width * 4;
+        av_assert0(size <= 1024U);
+        memcpy(palette, GET_PIXEL(pal->frame, 0, 0), size);   // copy palette
+        // set extra entries to transparent black
+        memset(palette + size, 0, 256 * 4 - size);
+        for (y = 0; y < img->frame->height; y++) {
+            for (x = 0; x < img->frame->width; x++) {
+                p = GET_PIXEL(img->frame, x, y);
+                i = p[2];
+                AV_COPY32(p, &palette[i * 4]);
+            }
+        }
+    } else {
+        for (y = 0; y < img->frame->height; y++) {
+            for (x = 0; x < img->frame->width; x++) {
+                p = GET_PIXEL(img->frame, x, y);
+                i = p[2];
+                if (i >= pal->frame->width) {
+                    AV_WB32(p, 0x00000000);
+                } else {
+                    const uint8_t *pi = GET_PIXEL(pal->frame, i, 0);
+                    AV_COPY32(p, pi);
+                }
             }
-            pi = GET_PIXEL(pal->frame, i, 0);
-            AV_COPY32(p, pi);
         }
     }
 
@@ -1088,7 +1110,7 @@ static int vp8_lossless_decode_frame(AVCodecContext *avctx, AVFrame *p,
         avctx->pix_fmt = AV_PIX_FMT_ARGB;
     }
 
-    ret = init_get_bits(&s->gb, data_start, data_size * 8);
+    ret = init_get_bits8(&s->gb, data_start, data_size);
     if (ret < 0)
         return ret;
 
@@ -1134,7 +1156,6 @@ static int vp8_lossless_decode_frame(AVCodecContext *avctx, AVFrame *p,
     used = 0;
     while (get_bits1(&s->gb)) {
         enum TransformType transform = get_bits(&s->gb, 2);
-        s->transforms[s->nb_transforms++] = transform;
         if (used & (1 << transform)) {
             av_log(avctx, AV_LOG_ERROR, "Transform %d used more than once\n",
                    transform);
@@ -1142,6 +1163,7 @@ static int vp8_lossless_decode_frame(AVCodecContext *avctx, AVFrame *p,
             goto free_and_return;
         }
         used |= (1 << transform);
+        s->transforms[s->nb_transforms++] = transform;
         switch (transform) {
         case PREDICTOR_TRANSFORM:
             ret = parse_transform_predictor(s);
@@ -1343,6 +1365,7 @@ static int webp_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     s->height    = 0;
     *got_frame   = 0;
     s->has_alpha = 0;
+    s->has_exif  = 0;
     bytestream2_init(&gb, avpkt->data, avpkt->size);
 
     if (bytestream2_get_bytes_left(&gb) < 12)
@@ -1392,6 +1415,7 @@ static int webp_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                                                 chunk_size, 0);
                 if (ret < 0)
                     return ret;
+                avctx->properties |= FF_CODEC_PROPERTY_LOSSLESS;
             }
             bytestream2_skip(&gb, chunk_size);
             break;
@@ -1435,13 +1459,48 @@ static int webp_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
 
             break;
         }
+        case MKTAG('E', 'X', 'I', 'F'): {
+            int le, ifd_offset, exif_offset = bytestream2_tell(&gb);
+            AVDictionary *exif_metadata = NULL;
+            GetByteContext exif_gb;
+
+            if (s->has_exif) {
+                av_log(avctx, AV_LOG_VERBOSE, "Ignoring extra EXIF chunk\n");
+                goto exif_end;
+            }
+            if (!(vp8x_flags & VP8X_FLAG_EXIF_METADATA))
+                av_log(avctx, AV_LOG_WARNING,
+                       "EXIF chunk present, but Exif bit not set in the "
+                       "VP8X header\n");
+
+            s->has_exif = 1;
+            bytestream2_init(&exif_gb, avpkt->data + exif_offset,
+                             avpkt->size - exif_offset);
+            if (ff_tdecode_header(&exif_gb, &le, &ifd_offset) < 0) {
+                av_log(avctx, AV_LOG_ERROR, "invalid TIFF header "
+                       "in Exif data\n");
+                goto exif_end;
+            }
+
+            bytestream2_seek(&exif_gb, ifd_offset, SEEK_SET);
+            if (avpriv_exif_decode_ifd(avctx, &exif_gb, le, 0, &exif_metadata) < 0) {
+                av_log(avctx, AV_LOG_ERROR, "error decoding Exif data\n");
+                goto exif_end;
+            }
+
+            av_dict_copy(avpriv_frame_get_metadatap(data), exif_metadata, 0);
+
+exif_end:
+            av_dict_free(&exif_metadata);
+            bytestream2_skip(&gb, chunk_size);
+            break;
+        }
         case MKTAG('I', 'C', 'C', 'P'):
         case MKTAG('A', 'N', 'I', 'M'):
         case MKTAG('A', 'N', 'M', 'F'):
-        case MKTAG('E', 'X', 'I', 'F'):
         case MKTAG('X', 'M', 'P', ' '):
             AV_WL32(chunk_str, chunk_type);
-            av_log(avctx, AV_LOG_VERBOSE, "skipping unsupported chunk: %s\n",
+            av_log(avctx, AV_LOG_WARNING, "skipping unsupported chunk: %s\n",
                    chunk_str);
             bytestream2_skip(&gb, chunk_size);
             break;
diff --git a/libavcodec/webvttdec.c b/libavcodec/webvttdec.c
new file mode 100644
index 0000000..7b2d175
--- /dev/null
+++ b/libavcodec/webvttdec.c
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2012 Clément Bœsch
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * WebVTT subtitle decoder
+ * @see http://dev.w3.org/html5/webvtt/
+ * @todo need to support extended markups and cue settings
+ */
+
+#include "avcodec.h"
+#include "ass.h"
+#include "libavutil/bprint.h"
+
+static const struct {
+    const char *from; /* text looked for at the current position of the cue */
+    const char *to;   /* what is written to the ASS output instead */
+} webvtt_tag_replace[] = {
+    {"<i>", "{\\i1}"}, {"</i>", "{\\i0}"},
+    {"<b>", "{\\b1}"}, {"</b>", "{\\b0}"},
+    {"<u>", "{\\u1}"}, {"</u>", "{\\u0}"},
+    {"{", "\\{"}, {"}", "\\}"}, // escape to avoid ASS markup conflicts
+    {"&gt;", ">"}, {"&lt;", "<"},
+    {"&lrm;", ""}, {"&rlm;", ""}, // FIXME: properly honor bidi marks
+    {"&amp;", "&"}, {"&nbsp;", "\\h"},
+};
+
+/* Append the ASS rendering of WebVTT cue text p to buf; always returns 0.
+ * webvtt_tag_replace entries are substituted where they match, ordinary
+ * characters between any other '<' and '>' are dropped, '\r' is dropped, and
+ * a '\n' that is followed by more text becomes "\N". */
+static int webvtt_event_to_ass(AVBPrint *buf, const char *p)
+{
+    int in_tag = 0; /* set from an unmatched '<' until the next '>' or substitution */
+
+    while (*p) {
+        int n, replaced = 0;
+
+        for (n = 0; !replaced && n < FF_ARRAY_ELEMS(webvtt_tag_replace); n++) {
+            const char *pattern = webvtt_tag_replace[n].from;
+            const size_t plen = strlen(pattern);
+            if (strncmp(p, pattern, plen) == 0) {
+                av_bprintf(buf, "%s", webvtt_tag_replace[n].to);
+                p += plen;
+                replaced = 1;
+            }
+        }
+        if (replaced) { /* p already points past the match; rescan from there */
+            in_tag = 0;
+            continue;
+        }
+        if (*p == '<')
+            in_tag = 1;
+        else if (*p == '>')
+            in_tag = 0;
+        else if (p[0] == '\n' && p[1])
+            av_bprintf(buf, "\\N");
+        else if (!in_tag && *p != '\r')
+            av_bprint_chars(buf, *p, 1);
+        p++;
+    }
+    return 0;
+}
+
+static int webvtt_decode_frame(AVCodecContext *avctx,
+                               void *data, int *got_sub_ptr, AVPacket *avpkt)
+{   /* Converts the packet text to one ASS rect in the AVSubtitle passed as data. */
+    int ret = 0;
+    AVSubtitle *sub = data;
+    const char *ptr = avpkt->data; /* walked up to the first NUL byte, not avpkt->size */
+    FFASSDecoderContext *s = avctx->priv_data;
+    AVBPrint buf;
+
+    av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
+    if (ptr && avpkt->size > 0 && !webvtt_event_to_ass(&buf, ptr))
+        ret = ff_ass_add_rect(sub, buf.str, s->readorder++, 0, NULL, NULL);
+    av_bprint_finalize(&buf, NULL); /* called before the error check, i.e. on every path */
+    if (ret < 0)
+        return ret;
+    *got_sub_ptr = sub->num_rects > 0;
+    return avpkt->size; /* reports the whole packet as consumed */
+}
+
+AVCodec ff_webvtt_decoder = { /* codec table entry for the WebVTT subtitle decoder */
+    .name           = "webvtt",
+    .long_name      = NULL_IF_CONFIG_SMALL("WebVTT subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_WEBVTT,
+    .decode         = webvtt_decode_frame,
+    .init           = ff_ass_subtitle_header_default, /* shared ASS helper, defined outside this file */
+    .flush          = ff_ass_decoder_flush,           /* shared ASS helper, defined outside this file */
+    .priv_data_size = sizeof(FFASSDecoderContext),    /* private context read as avctx->priv_data */
+};
diff --git a/libavcodec/webvttenc.c b/libavcodec/webvttenc.c
new file mode 100644
index 0000000..c84bbf4
--- /dev/null
+++ b/libavcodec/webvttenc.c
@@ -0,0 +1,236 @@
+/*
+ * WebVTT subtitle encoder
+ * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
+ * Copyright (c) 2014  Aman Gupta <ffmpeg@tmm1.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdarg.h>
+#include "avcodec.h"
+#include "libavutil/avstring.h"
+#include "libavutil/bprint.h"
+#include "ass_split.h"
+#include "ass.h"
+
+#define WEBVTT_STACK_SIZE 64
+typedef struct {
+    AVCodecContext *avctx;          /* owning codec context; used for av_log() */
+    ASSSplitContext *ass_ctx;       /* result of ff_ass_split() on the subtitle header */
+    AVBPrint buffer;                /* WebVTT text built for the current subtitle */
+    unsigned timestamp_end;         /* not referenced by any function in this file */
+    int count;                      /* not referenced by any function in this file */
+    char stack[WEBVTT_STACK_SIZE];  /* currently open tags, one character each */
+    int stack_ptr;                  /* number of entries in use in stack[] */
+} WebVTTContext;
+
+#ifdef __GNUC__
+__attribute__ ((__format__ (__printf__, 2, 3)))
+#endif /* __GNUC__ */
+static void webvtt_print(WebVTTContext *s, const char *str, ...)
+{   /* printf-style formatting of str and its arguments, appended to s->buffer */
+    va_list vargs;
+    va_start(vargs, str);
+    av_vbprintf(&s->buffer, str, vargs);
+    va_end(vargs);
+}
+
+static int webvtt_stack_push(WebVTTContext *s, const char c)
+{   /* Records c as the newest open tag; 0 on success, -1 when the stack is full. */
+    const int full = s->stack_ptr >= WEBVTT_STACK_SIZE;
+    if (!full)
+        s->stack[s->stack_ptr++] = c;
+    return full ? -1 : 0;
+}
+
+/* Remove and return the most recently pushed tag.
+ * Returns 0 (and leaves the stack untouched) when it is already empty. */
+static char webvtt_stack_pop(WebVTTContext *s)
+{
+    return s->stack_ptr > 0 ? s->stack[--s->stack_ptr] : 0;
+}
+
+/* Index of the topmost stack entry equal to c, or -1 when c is not on the stack. */
+static int webvtt_stack_find(WebVTTContext *s, const char c)
+{
+    int pos = s->stack_ptr;
+    while (--pos >= 0 && s->stack[pos] != c)
+        ;
+    return pos;
+}
+
+static void webvtt_close_tag(WebVTTContext *s, char tag)
+{   /* Appends the closing form of the one-character tag name to s->buffer. */
+    webvtt_print(s, "</%c>", tag);
+}
+
+/* close set: pop and print closing tags down to and including the newest c (everything if c is 0, nothing if c is not open); otherwise push c. */
+static void webvtt_stack_push_pop(WebVTTContext *s, const char c, int close)
+{
+    const int base = (close && c) ? webvtt_stack_find(s, c) : 0;
+    if (close) {
+        while (base >= 0 && s->stack_ptr != base)
+            webvtt_close_tag(s, webvtt_stack_pop(s));
+    } else if (webvtt_stack_push(s, c) < 0) {
+        av_log(s->avctx, AV_LOG_ERROR, "tag stack overflow\n");
+    }
+}
+
+static void webvtt_style_apply(WebVTTContext *s, const char *style)
+{   /* Opens <b>/<i>/<u> for each attribute of the named ASS style that differs from its default; no-op if the lookup returns NULL. */
+    ASSStyle *look = ff_ass_style_get(s->ass_ctx, style);
+    if (!look)
+        return;
+    if (look->bold != ASS_DEFAULT_BOLD) {
+        webvtt_print(s, "<b>");
+        webvtt_stack_push(s, 'b');
+    }
+    if (look->italic != ASS_DEFAULT_ITALIC) {
+        webvtt_print(s, "<i>");
+        webvtt_stack_push(s, 'i');
+    }
+    if (look->underline != ASS_DEFAULT_UNDERLINE) {
+        webvtt_print(s, "<u>");
+        webvtt_stack_push(s, 'u');
+    }
+}
+
+static void webvtt_text_cb(void *priv, const char *text, int len)
+{   /* .text callback: appends len bytes of text to the output buffer. */
+    WebVTTContext *s = priv;
+    av_bprint_append_data(&s->buffer, text, len);
+}
+
+static void webvtt_new_line_cb(void *priv, int forced)
+{   /* .new_line callback: emits a plain "\n"; the forced flag is not consulted. */
+    webvtt_print(priv, "\n");
+}
+
+/* Installed as the .style callback: updates the tag stack for style and prints the opening tag when close is 0. */
+static void webvtt_style_cb(void *priv, char style, int close)
+{
+    if (style != 's') { // strikethrough unsupported
+        webvtt_stack_push_pop(priv, style, close);
+        if (!close)
+            webvtt_print(priv, "<%c>", style);
+    }
+}
+
+static void webvtt_cancel_overrides_cb(void *priv, const char *style)
+{   /* .cancel_overrides callback: closes all open tags, then re-applies the named style. */
+    webvtt_stack_push_pop(priv, 0, 1); /* c == 0 with close set unwinds the whole stack */
+    webvtt_style_apply(priv, style);
+}
+
+static void webvtt_end_cb(void *priv)
+{   /* .end callback: closes every tag still on the stack. */
+    webvtt_stack_push_pop(priv, 0, 1);
+}
+
+static const ASSCodesCallbacks webvtt_callbacks = { /* passed to ff_ass_split_override_codes() by webvtt_encode_frame() */
+    .text             = webvtt_text_cb,
+    .new_line         = webvtt_new_line_cb,
+    .style            = webvtt_style_cb,
+    .color            = NULL, /* no handler installed */
+    .font_name        = NULL, /* no handler installed */
+    .font_size        = NULL, /* no handler installed */
+    .alignment        = NULL, /* no handler installed */
+    .cancel_overrides = webvtt_cancel_overrides_cb,
+    .move             = NULL, /* no handler installed */
+    .end              = webvtt_end_cb,
+};
+
+/**
+ * Render every ASS rect of sub as WebVTT cue text and copy the result to buf.
+ * @return bytes written (0 if no text was produced), or a negative AVERROR:
+ *         ENOSYS for a non-ASS rect, ENOMEM, or AVERROR_BUFFER_TOO_SMALL.
+ */
+static int webvtt_encode_frame(AVCodecContext *avctx,
+                               unsigned char *buf, int bufsize, const AVSubtitle *sub)
+{
+    WebVTTContext *s = avctx->priv_data;
+    ASSDialog *dialog;
+    int i;
+
+    av_bprint_clear(&s->buffer);
+    for (i=0; i<sub->num_rects; i++) {
+        const char *ass = sub->rects[i]->ass;
+
+        if (sub->rects[i]->type != SUBTITLE_ASS) {
+            av_log(avctx, AV_LOG_ERROR, "Only SUBTITLE_ASS type supported.\n");
+            return AVERROR(ENOSYS);
+        }
+#if FF_API_ASS_TIMING
+        if (!strncmp(ass, "Dialogue: ", 10)) {
+            int num;
+            dialog = ff_ass_split_dialog(s->ass_ctx, ass, 0, &num);
+            /* events with the "Dialogue: " prefix: walk the num entries reported by the splitter */
+            for (; dialog && num--; dialog++) {
+                webvtt_style_apply(s, dialog->style);
+                ff_ass_split_override_codes(&webvtt_callbacks, s, dialog->text);
+            }
+        } else {
+#endif
+            dialog = ff_ass_split_dialog2(s->ass_ctx, ass);
+            if (!dialog)
+                return AVERROR(ENOMEM);
+            webvtt_style_apply(s, dialog->style);
+            ff_ass_split_override_codes(&webvtt_callbacks, s, dialog->text);
+            ff_ass_free_dialog(&dialog);
+#if FF_API_ASS_TIMING
+        }
+#endif
+    }
+    if (!av_bprint_is_complete(&s->buffer)) /* an incomplete buffer is reported as ENOMEM */
+        return AVERROR(ENOMEM);
+    if (!s->buffer.len)
+        return 0;
+    if (s->buffer.len > bufsize) {
+        av_log(avctx, AV_LOG_ERROR, "Buffer too small for ASS event.\n");
+        return AVERROR_BUFFER_TOO_SMALL; /* was a bare -1; now an AVERROR code like the other failure paths */
+    }
+    memcpy(buf, s->buffer.str, s->buffer.len);
+    return s->buffer.len;
+}
+
+static int webvtt_encode_close(AVCodecContext *avctx)
+{   /* Frees the ASS split context and finalizes the output buffer; always returns 0. */
+    WebVTTContext *s = avctx->priv_data;
+    ff_ass_split_free(s->ass_ctx);
+    av_bprint_finalize(&s->buffer, NULL);
+    return 0;
+}
+
+static av_cold int webvtt_encode_init(AVCodecContext *avctx)
+{   /* Sets up the private context; returns AVERROR_INVALIDDATA if ff_ass_split() yields NULL, else 0. */
+    WebVTTContext *s = avctx->priv_data;
+    s->avctx = avctx;
+    s->ass_ctx = ff_ass_split(avctx->subtitle_header);
+    av_bprint_init(&s->buffer, 0, AV_BPRINT_SIZE_UNLIMITED); /* done even when the split failed */
+    return s->ass_ctx ? 0 : AVERROR_INVALIDDATA;
+}
+
+AVCodec ff_webvtt_encoder = { /* codec table entry for the WebVTT subtitle encoder */
+    .name           = "webvtt",
+    .long_name      = NULL_IF_CONFIG_SMALL("WebVTT subtitle"),
+    .type           = AVMEDIA_TYPE_SUBTITLE,
+    .id             = AV_CODEC_ID_WEBVTT,
+    .priv_data_size = sizeof(WebVTTContext), /* private context read as avctx->priv_data */
+    .init           = webvtt_encode_init,
+    .encode_sub     = webvtt_encode_frame,
+    .close          = webvtt_encode_close,
+};
diff --git a/libavcodec/wma.c b/libavcodec/wma.c
index 85193ff..bc8e2d1 100644
--- a/libavcodec/wma.c
+++ b/libavcodec/wma.c
@@ -1,21 +1,21 @@
 /*
  * WMA compatible codec
- * Copyright (c) 2002-2007 The Libav Project
+ * Copyright (c) 2002-2007 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -45,10 +45,10 @@ static av_cold int init_coef_vlc(VLC *vlc, uint16_t **prun_table,
 
     init_vlc(vlc, VLCBITS, n, table_bits, 1, 1, table_codes, 4, 4, 0);
 
-    run_table    = av_malloc(n * sizeof(uint16_t));
-    level_table  = av_malloc(n * sizeof(uint16_t));
-    flevel_table = av_malloc(n * sizeof(*flevel_table));
-    int_table    = av_malloc(n * sizeof(uint16_t));
+    run_table    = av_malloc_array(n, sizeof(uint16_t));
+    level_table  = av_malloc_array(n, sizeof(uint16_t));
+    flevel_table = av_malloc_array(n, sizeof(*flevel_table));
+    int_table    = av_malloc_array(n, sizeof(uint16_t));
     if (!run_table || !level_table || !flevel_table || !int_table) {
         av_freep(&run_table);
         av_freep(&level_table);
@@ -92,7 +92,6 @@ av_cold int ff_wma_init(AVCodecContext *avctx, int flags2)
         avctx->bit_rate    <= 0)
         return -1;
 
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
 
     if (avctx->codec->id == AV_CODEC_ID_WMAV1)
         s->version = 1;
@@ -141,6 +140,10 @@ av_cold int ff_wma_init(AVCodecContext *avctx, int flags2)
     bps                 = (float) avctx->bit_rate /
                           (float) (avctx->channels * avctx->sample_rate);
     s->byte_offset_bits = av_log2((int) (bps * s->frame_len / 8.0 + 0.5)) + 2;
+    if (s->byte_offset_bits + 3 > MIN_CACHE_BITS) {
+        av_log(avctx, AV_LOG_ERROR, "byte_offset_bits %d is too large\n", s->byte_offset_bits);
+        return AVERROR_PATCHWELCOME;
+    }
 
     /* compute high frequency value and choose if noise coding should
      * be activated */
@@ -182,8 +185,8 @@ av_cold int ff_wma_init(AVCodecContext *avctx, int flags2)
             high_freq = high_freq * 0.5;
     }
     ff_dlog(s->avctx, "flags2=0x%x\n", flags2);
-    ff_dlog(s->avctx, "version=%d channels=%d sample_rate=%d bitrate=%d block_align=%d\n",
-            s->version, avctx->channels, avctx->sample_rate, avctx->bit_rate,
+    ff_dlog(s->avctx, "version=%d channels=%d sample_rate=%d bitrate=%"PRId64" block_align=%d\n",
+            s->version, avctx->channels, avctx->sample_rate, (int64_t)avctx->bit_rate,
             avctx->block_align);
     ff_dlog(s->avctx, "bps=%f bps1=%f high_freq=%f bitoffset=%d\n",
             bps, bps1, high_freq, s->byte_offset_bits);
@@ -335,6 +338,10 @@ av_cold int ff_wma_init(AVCodecContext *avctx, int flags2)
 #endif /* TRACE */
     }
 
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->fdsp)
+        return AVERROR(ENOMEM);
+
     /* choose the VLC tables for the coefficients */
     coef_vlc_table = 2;
     if (avctx->sample_rate >= 32000) {
@@ -382,10 +389,11 @@ int ff_wma_end(AVCodecContext *avctx)
         ff_free_vlc(&s->hgain_vlc);
     for (i = 0; i < 2; i++) {
         ff_free_vlc(&s->coef_vlc[i]);
-        av_free(s->run_table[i]);
-        av_free(s->level_table[i]);
-        av_free(s->int_table[i]);
+        av_freep(&s->run_table[i]);
+        av_freep(&s->level_table[i]);
+        av_freep(&s->int_table[i]);
     }
+    av_freep(&s->fdsp);
 
     return 0;
 }
@@ -444,7 +452,7 @@ int ff_wma_run_level_decode(AVCodecContext *avctx, GetBitContext *gb,
             /** normal code */
             offset                  += run_table[code];
             sign                     = get_bits1(gb) - 1;
-            iptr[offset & coef_mask] = ilvl[code] ^ sign << 31;
+            iptr[offset & coef_mask] = ilvl[code] ^ (sign & 0x80000000);
         } else if (code == 1) {
             /** EOB */
             break;
@@ -476,7 +484,11 @@ int ff_wma_run_level_decode(AVCodecContext *avctx, GetBitContext *gb,
     }
     /** NOTE: EOB can be omitted */
     if (offset > num_coefs) {
-        av_log(avctx, AV_LOG_ERROR, "overflow in spectral RLE, ignoring\n");
+        av_log(avctx, AV_LOG_ERROR,
+               "overflow (%d > %d) in spectral RLE, ignoring\n",
+               offset,
+               num_coefs
+              );
         return -1;
     }
 
diff --git a/libavcodec/wma.h b/libavcodec/wma.h
index c954d71..325f03c 100644
--- a/libavcodec/wma.h
+++ b/libavcodec/wma.h
@@ -1,21 +1,21 @@
 /*
  * WMA compatible codec
- * Copyright (c) 2002-2007 The Libav Project
+ * Copyright (c) 2002-2007 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -42,7 +42,7 @@
 #define NB_LSP_COEFS 10
 
 /* XXX: is it a suitable value ? */
-#define MAX_CODED_SUPERFRAME_SIZE 16384
+#define MAX_CODED_SUPERFRAME_SIZE 32768
 
 #define MAX_CHANNELS 2
 
@@ -116,7 +116,7 @@ typedef struct WMACodecContext {
     DECLARE_ALIGNED(32, float, coefs)[MAX_CHANNELS][BLOCK_MAX_SIZE];
     DECLARE_ALIGNED(32, FFTSample, output)[BLOCK_MAX_SIZE * 2];
     FFTContext mdct_ctx[BLOCK_NB_SIZES];
-    float *windows[BLOCK_NB_SIZES];
+    const float *windows[BLOCK_NB_SIZES];
     /* output buffer for one frame and the last for IMDCT windowing */
     DECLARE_ALIGNED(32, float, frame_out)[MAX_CHANNELS][BLOCK_MAX_SIZE * 2];
     /* last frame info */
@@ -131,7 +131,7 @@ typedef struct WMACodecContext {
     float lsp_pow_e_table[256];
     float lsp_pow_m_table1[(1 << LSP_POW_BITS)];
     float lsp_pow_m_table2[(1 << LSP_POW_BITS)];
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
 
 #ifdef TRACE
     int frame_count;
@@ -144,7 +144,9 @@ extern const float ff_wma_lsp_codebook[NB_LSP_COEFS][16];
 extern const uint32_t ff_aac_scalefactor_code[121];
 extern const uint8_t  ff_aac_scalefactor_bits[121];
 
+av_warn_unused_result
 int ff_wma_init(AVCodecContext *avctx, int flags2);
+
 int ff_wma_total_gain_to_bits(int total_gain);
 int ff_wma_end(AVCodecContext *avctx);
 unsigned int ff_wma_get_large_val(GetBitContext *gb);
diff --git a/libavcodec/wma_common.c b/libavcodec/wma_common.c
index cf76f5c..c01e0f4 100644
--- a/libavcodec/wma_common.c
+++ b/libavcodec/wma_common.c
@@ -1,20 +1,20 @@
 /*
  * common code shared by all WMA variants
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wma_common.h b/libavcodec/wma_common.h
index 61b1a35..55404af 100644
--- a/libavcodec/wma_common.h
+++ b/libavcodec/wma_common.h
@@ -1,20 +1,20 @@
 /*
  * common code shared by all WMA variants
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wma_freqs.c b/libavcodec/wma_freqs.c
index 82cef3b..03a283f 100644
--- a/libavcodec/wma_freqs.c
+++ b/libavcodec/wma_freqs.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wma_freqs.h b/libavcodec/wma_freqs.h
index d40ab65..6fd93e4 100644
--- a/libavcodec/wma_freqs.h
+++ b/libavcodec/wma_freqs.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wmadata.h b/libavcodec/wmadata.h
index 58bffed..641cb18 100644
--- a/libavcodec/wmadata.h
+++ b/libavcodec/wmadata.h
@@ -1,21 +1,21 @@
 /*
  * WMA compatible decoder
- * copyright (c) 2002 The Libav Project
+ * copyright (c) 2002 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wmadec.c b/libavcodec/wmadec.c
index 50c77dd..78b51e5 100644
--- a/libavcodec/wmadec.c
+++ b/libavcodec/wmadec.c
@@ -1,21 +1,21 @@
 /*
  * WMA compatible decoder
- * Copyright (c) 2002 The Libav Project
+ * Copyright (c) 2002 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,6 +34,7 @@
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/ffmath.h"
 
 #include "avcodec.h"
 #include "internal.h"
@@ -91,6 +92,16 @@ static av_cold int wma_decode_init(AVCodecContext *avctx)
     s->use_bit_reservoir      = flags2 & 0x0002;
     s->use_variable_block_len = flags2 & 0x0004;
 
+    if (avctx->codec->id == AV_CODEC_ID_WMAV2 && avctx->extradata_size >= 8){
+        if (AV_RL16(extradata+4)==0xd && s->use_variable_block_len){
+            av_log(avctx, AV_LOG_WARNING, "Disabling use_variable_block_len, if this fails contact the ffmpeg developers and send us the file\n");
+            s->use_variable_block_len= 0; // this fixes issue1503
+        }
+    }
+
+    for (i=0; i<MAX_CHANNELS; i++)
+        s->max_exponent[i] = 1.0;
+
     if (ff_wma_init(avctx, flags2) < 0)
         return -1;
 
@@ -153,7 +164,7 @@ static av_cold void wma_lsp_to_curve_init(WMACodecContext *s, int frame_len)
     /* tables for x^-0.25 computation */
     for (i = 0; i < 256; i++) {
         e                     = i - 126;
-        s->lsp_pow_e_table[i] = pow(2.0, e * -0.25);
+        s->lsp_pow_e_table[i] = exp2f(e * -0.25);
     }
 
     /* NOTE: these two tables are needed to avoid two operations in
@@ -162,7 +173,7 @@ static av_cold void wma_lsp_to_curve_init(WMACodecContext *s, int frame_len)
     for (i = (1 << LSP_POW_BITS) - 1; i >= 0; i--) {
         m                      = (1 << LSP_POW_BITS) + i;
         a                      = (float) m * (0.5 / (1 << LSP_POW_BITS));
-        a                      = pow(a, -0.25);
+        a                      = 1/sqrt(sqrt(a));
         s->lsp_pow_m_table1[i] = 2 * a - b;
         s->lsp_pow_m_table2[i] = b - a;
         b                      = a;
@@ -377,14 +388,14 @@ static void wma_window(WMACodecContext *s, float *out)
         block_len = s->block_len;
         bsize     = s->frame_len_bits - s->block_len_bits;
 
-        s->fdsp.vector_fmul_add(out, in, s->windows[bsize],
+        s->fdsp->vector_fmul_add(out, in, s->windows[bsize],
                                 out, block_len);
     } else {
         block_len = 1 << s->prev_block_len_bits;
         n         = (s->block_len - block_len) / 2;
         bsize     = s->frame_len_bits - s->prev_block_len_bits;
 
-        s->fdsp.vector_fmul_add(out + n, in + n, s->windows[bsize],
+        s->fdsp->vector_fmul_add(out + n, in + n, s->windows[bsize],
                                 out + n, block_len);
 
         memcpy(out + n + block_len, in + n + block_len, n * sizeof(float));
@@ -398,7 +409,7 @@ static void wma_window(WMACodecContext *s, float *out)
         block_len = s->block_len;
         bsize     = s->frame_len_bits - s->block_len_bits;
 
-        s->fdsp.vector_fmul_reverse(out, in, s->windows[bsize], block_len);
+        s->fdsp->vector_fmul_reverse(out, in, s->windows[bsize], block_len);
     } else {
         block_len = 1 << s->next_block_len_bits;
         n         = (s->block_len - block_len) / 2;
@@ -406,7 +417,7 @@ static void wma_window(WMACodecContext *s, float *out)
 
         memcpy(out, in, n * sizeof(float));
 
-        s->fdsp.vector_fmul_reverse(out + n, in + n, s->windows[bsize],
+        s->fdsp->vector_fmul_reverse(out + n, in + n, s->windows[bsize],
                                     block_len);
 
         memset(out + n + block_len, 0, n * sizeof(float));
@@ -472,6 +483,11 @@ static int wma_decode_block(WMACodecContext *s)
         s->block_len_bits      = s->frame_len_bits;
     }
 
+    if (s->frame_len_bits - s->block_len_bits >= s->nb_block_sizes){
+        av_log(s->avctx, AV_LOG_ERROR, "block_len_bits not initialized to a valid value\n");
+        return -1;
+    }
+
     /* now check if the block length is coherent with the frame length */
     s->block_len = 1 << s->block_len_bits;
     if ((s->block_pos + s->block_len) > s->frame_len) {
@@ -499,6 +515,10 @@ static int wma_decode_block(WMACodecContext *s)
      * coef escape coding */
     total_gain = 1;
     for (;;) {
+        if (get_bits_left(&s->gb) < 7) {
+            av_log(s->avctx, AV_LOG_ERROR, "total_gain overread\n");
+            return AVERROR_INVALIDDATA;
+        }
         a           = get_bits(&s->gb, 7);
         total_gain += a;
         if (a != 127)
@@ -607,7 +627,7 @@ static int wma_decode_block(WMACodecContext *s)
             coefs1    = s->coefs1[ch];
             exponents = s->exponents[ch];
             esize     = s->exponents_bsize[ch];
-            mult      = pow(10, total_gain * 0.05) / s->max_exponent[ch];
+            mult      = ff_exp10(total_gain * 0.05) / s->max_exponent[ch];
             mult     *= mdct_norm;
             coefs     = s->coefs[ch];
             if (s->use_noise_coding) {
@@ -655,7 +675,7 @@ static int wma_decode_block(WMACodecContext *s)
                         /* use noise with specified power */
                         mult1 = sqrt(exp_power[j] / exp_power[last_high_band]);
                         /* XXX: use a table */
-                        mult1  = mult1 * pow(10, s->high_band_values[ch][j] * 0.05);
+                        mult1  = mult1 * ff_exp10(s->high_band_values[ch][j] * 0.05);
                         mult1  = mult1 / (s->max_exponent[ch] * s->noise_mult);
                         mult1 *= mdct_norm;
                         for (i = 0; i < n; i++) {
@@ -678,7 +698,7 @@ static int wma_decode_block(WMACodecContext *s)
 
                 /* very high freqs : noise */
                 n     = s->block_len - s->coefs_end[bsize];
-                mult1 = mult * exponents[((-1 << bsize)) >> esize];
+                mult1 = mult * exponents[(-(1 << bsize)) >> esize];
                 for (i = 0; i < n; i++) {
                     *coefs++       = s->noise_table[s->noise_index] * mult1;
                     s->noise_index = (s->noise_index + 1) & (NOISE_TAB_SIZE - 1);
@@ -716,7 +736,7 @@ static int wma_decode_block(WMACodecContext *s)
             s->channel_coded[0] = 1;
         }
 
-        s->fdsp.butterflies_float(s->coefs[0], s->coefs[1], s->block_len);
+        s->fdsp->butterflies_float(s->coefs[0], s->coefs[1], s->block_len);
     }
 
 next:
@@ -808,7 +828,8 @@ static int wma_decode_superframe(AVCodecContext *avctx, void *data,
                buf_size, avctx->block_align);
         return AVERROR_INVALIDDATA;
     }
-    buf_size = avctx->block_align;
+    if (avctx->block_align)
+        buf_size = avctx->block_align;
 
     init_get_bits(&s->gb, buf, buf_size * 8);
 
@@ -816,15 +837,38 @@ static int wma_decode_superframe(AVCodecContext *avctx, void *data,
         /* read super frame header */
         skip_bits(&s->gb, 4); /* super frame index */
         nb_frames = get_bits(&s->gb, 4) - (s->last_superframe_len <= 0);
+        if (nb_frames <= 0) {
+            int is_error = nb_frames < 0 || get_bits_left(&s->gb) <= 8;
+            av_log(avctx, is_error ? AV_LOG_ERROR : AV_LOG_WARNING,
+                   "nb_frames is %d bits left %d\n",
+                   nb_frames, get_bits_left(&s->gb));
+            if (is_error)
+                return AVERROR_INVALIDDATA;
+
+            if ((s->last_superframe_len + buf_size - 1) >
+                MAX_CODED_SUPERFRAME_SIZE)
+                goto fail;
+
+            q   = s->last_superframe + s->last_superframe_len;
+            len = buf_size - 1;
+            while (len > 0) {
+                *q++ = get_bits (&s->gb, 8);
+                len --;
+            }
+            memset(q, 0, AV_INPUT_BUFFER_PADDING_SIZE);
+
+            s->last_superframe_len += 8*buf_size - 8;
+//             s->reset_block_lengths = 1; //XXX is this needed ?
+            *got_frame_ptr = 0;
+            return buf_size;
+        }
     } else
         nb_frames = 1;
 
     /* get output buffer */
     frame->nb_samples = nb_frames * s->frame_len;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples        = (float **) frame->extended_data;
     samples_offset = 0;
 
@@ -901,13 +945,13 @@ static int wma_decode_superframe(AVCodecContext *avctx, void *data,
         samples_offset += s->frame_len;
     }
 
-    ff_dlog(s->avctx, "%d %d %d %d outbytes:%td eaten:%d\n",
+    ff_dlog(s->avctx, "%d %d %d %d outbytes:%"PTRDIFF_SPECIFIER" eaten:%d\n",
             s->frame_len_bits, s->block_len_bits, s->frame_len, s->block_len,
             (int8_t *) samples - (int8_t *) data, avctx->block_align);
 
     *got_frame_ptr = 1;
 
-    return avctx->block_align;
+    return buf_size;
 
 fail:
     /* when error, we reset the bit reservoir */
@@ -923,6 +967,7 @@ static av_cold void flush(AVCodecContext *avctx)
     s->last_superframe_len = 0;
 }
 
+#if CONFIG_WMAV1_DECODER
 AVCodec ff_wmav1_decoder = {
     .name           = "wmav1",
     .long_name      = NULL_IF_CONFIG_SMALL("Windows Media Audio 1"),
@@ -937,7 +982,8 @@ AVCodec ff_wmav1_decoder = {
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
                                                       AV_SAMPLE_FMT_NONE },
 };
-
+#endif
+#if CONFIG_WMAV2_DECODER
 AVCodec ff_wmav2_decoder = {
     .name           = "wmav2",
     .long_name      = NULL_IF_CONFIG_SMALL("Windows Media Audio 2"),
@@ -952,3 +998,4 @@ AVCodec ff_wmav2_decoder = {
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
                                                       AV_SAMPLE_FMT_NONE },
 };
+#endif
diff --git a/libavcodec/wmaenc.c b/libavcodec/wmaenc.c
index e991bb8..c68babd 100644
--- a/libavcodec/wmaenc.c
+++ b/libavcodec/wmaenc.c
@@ -2,31 +2,30 @@
  * WMA compatible encoder
  * Copyright (c) 2007 Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/ffmath.h"
 
 #include "avcodec.h"
 #include "internal.h"
 #include "wma.h"
-
-#undef NDEBUG
-#include <assert.h>
+#include "libavutil/avassert.h"
 
 
 static av_cold int encode_init(AVCodecContext *avctx)
@@ -34,26 +33,27 @@ static av_cold int encode_init(AVCodecContext *avctx)
     WMACodecContext *s = avctx->priv_data;
     int i, flags1, flags2, block_align;
     uint8_t *extradata;
+    int ret;
 
     s->avctx = avctx;
 
     if (avctx->channels > MAX_CHANNELS) {
         av_log(avctx, AV_LOG_ERROR,
-               "too many channels: got %i, need %i or fewer",
+               "too many channels: got %i, need %i or fewer\n",
                avctx->channels, MAX_CHANNELS);
         return AVERROR(EINVAL);
     }
 
     if (avctx->sample_rate > 48000) {
-        av_log(avctx, AV_LOG_ERROR, "sample rate is too high: %d > 48kHz",
+        av_log(avctx, AV_LOG_ERROR, "sample rate is too high: %d > 48kHz\n",
                avctx->sample_rate);
         return AVERROR(EINVAL);
     }
 
     if (avctx->bit_rate < 24 * 1000) {
         av_log(avctx, AV_LOG_ERROR,
-               "bitrate too low: got %i, need 24000 or higher\n",
-               avctx->bit_rate);
+               "bitrate too low: got %"PRId64", need 24000 or higher\n",
+               (int64_t)avctx->bit_rate);
         return AVERROR(EINVAL);
     }
 
@@ -75,7 +75,7 @@ static av_cold int encode_init(AVCodecContext *avctx)
         AV_WL32(extradata, flags1);
         AV_WL16(extradata + 4, flags2);
     } else {
-        assert(0);
+        av_assert0(0);
     }
     avctx->extradata          = extradata;
     s->use_exp_vlc            = flags2 & 0x0001;
@@ -84,7 +84,8 @@ static av_cold int encode_init(AVCodecContext *avctx)
     if (avctx->channels == 2)
         s->ms_stereo = 1;
 
-    ff_wma_init(avctx, flags2);
+    if ((ret = ff_wma_init(avctx, flags2)) < 0)
+        return ret;
 
     /* init MDCT */
     for (i = 0; i < s->nb_block_sizes; i++)
@@ -94,14 +95,12 @@ static av_cold int encode_init(AVCodecContext *avctx)
                          (avctx->sample_rate * 8);
     block_align        = FFMIN(block_align, MAX_CODED_SUPERFRAME_SIZE);
     avctx->block_align = block_align;
-    avctx->bit_rate    = avctx->block_align * 8LL * avctx->sample_rate /
-                         s->frame_len;
     avctx->frame_size = avctx->initial_padding = s->frame_len;
 
     return 0;
 }
 
-static void apply_window_and_mdct(AVCodecContext *avctx, const AVFrame *frame)
+static int apply_window_and_mdct(AVCodecContext *avctx, const AVFrame *frame)
 {
     WMACodecContext *s = avctx->priv_data;
     float **audio      = (float **) frame->extended_data;
@@ -115,12 +114,18 @@ static void apply_window_and_mdct(AVCodecContext *avctx, const AVFrame *frame)
 
     for (ch = 0; ch < avctx->channels; ch++) {
         memcpy(s->output, s->frame_out[ch], window_len * sizeof(*s->output));
-        s->fdsp.vector_fmul_scalar(s->frame_out[ch], audio[ch], n, len);
-        s->fdsp.vector_fmul_reverse(&s->output[window_len], s->frame_out[ch],
+        s->fdsp->vector_fmul_scalar(s->frame_out[ch], audio[ch], n, len);
+        s->fdsp->vector_fmul_reverse(&s->output[window_len], s->frame_out[ch],
                                     win, len);
-        s->fdsp.vector_fmul(s->frame_out[ch], s->frame_out[ch], win, len);
+        s->fdsp->vector_fmul(s->frame_out[ch], s->frame_out[ch], win, len);
         mdct->mdct_calc(mdct, s->coefs[ch], s->output);
+        if (!isfinite(s->coefs[ch][0])) {
+            av_log(avctx, AV_LOG_ERROR, "Input contains NaN/+-Inf\n");
+            return AVERROR(EINVAL);
+        }
     }
+
+    return 0;
 }
 
 // FIXME use for decoding too
@@ -136,7 +141,7 @@ static void init_exp(WMACodecContext *s, int ch, const int *exp_param)
     max_scale = 0;
     while (q < q_end) {
         /* XXX: use a table */
-        v         = pow(10, *exp_param++ *(1.0 / 16.0));
+        v         = ff_exp10(*exp_param++ *(1.0 / 16.0));
         max_scale = FFMAX(max_scale, v);
         n         = *ptr++;
         do {
@@ -157,7 +162,7 @@ static void encode_exp_vlc(WMACodecContext *s, int ch, const int *exp_param)
     q_end = q + s->block_len;
     if (s->version == 1) {
         last_exp = *exp_param++;
-        assert(last_exp - 10 >= 0 && last_exp - 10 < 32);
+        av_assert0(last_exp - 10 >= 0 && last_exp - 10 < 32);
         put_bits(&s->pb, 5, last_exp - 10);
         q += *ptr++;
     } else
@@ -165,7 +170,7 @@ static void encode_exp_vlc(WMACodecContext *s, int ch, const int *exp_param)
     while (q < q_end) {
         int exp  = *exp_param++;
         int code = exp - last_exp + 60;
-        assert(code >= 0 && code < 120);
+        av_assert1(code >= 0 && code < 120);
         put_bits(&s->pb, ff_aac_scalefactor_bits[code],
                  ff_aac_scalefactor_code[code]);
         /* XXX: use a table */
@@ -190,7 +195,7 @@ static int encode_block(WMACodecContext *s, float (*src_coefs)[BLOCK_MAX_SIZE],
 
     // FIXME remove duplication relative to decoder
     if (s->use_variable_block_len) {
-        assert(0); // FIXME not implemented
+        av_assert0(0); // FIXME not implemented
     } else {
         /* fixed block len */
         s->next_block_len_bits = s->frame_len_bits;
@@ -199,7 +204,7 @@ static int encode_block(WMACodecContext *s, float (*src_coefs)[BLOCK_MAX_SIZE],
     }
 
     s->block_len = 1 << s->block_len_bits;
-//     assert((s->block_pos + s->block_len) <= s->frame_len);
+//     av_assert0((s->block_pos + s->block_len) <= s->frame_len);
     bsize = s->frame_len_bits - s->block_len_bits;
 
     // FIXME factor
@@ -231,11 +236,11 @@ static int encode_block(WMACodecContext *s, float (*src_coefs)[BLOCK_MAX_SIZE],
 
             coefs1    = s->coefs1[ch];
             exponents = s->exponents[ch];
-            mult      = pow(10, total_gain * 0.05) / s->max_exponent[ch];
+            mult      = ff_exp10(total_gain * 0.05) / s->max_exponent[ch];
             mult     *= mdct_norm;
             coefs     = src_coefs[ch];
             if (s->use_noise_coding && 0) {
-                assert(0); // FIXME not implemented
+                av_assert0(0); // FIXME not implemented
             } else {
                 coefs += s->coefs_start;
                 n      = nb_coefs[ch];
@@ -290,13 +295,13 @@ static int encode_block(WMACodecContext *s, float (*src_coefs)[BLOCK_MAX_SIZE],
                 if (s->use_exp_vlc) {
                     encode_exp_vlc(s, ch, fixed_exp);
                 } else {
-                    assert(0); // FIXME not implemented
+                    av_assert0(0); // FIXME not implemented
 //                    encode_exp_lsp(s, ch);
                 }
             }
         }
     } else
-        assert(0); // FIXME not implemented
+        av_assert0(0); // FIXME not implemented
 
     for (ch = 0; ch < s->avctx->channels; ch++) {
         if (s->channel_coded[ch]) {
@@ -316,7 +321,7 @@ static int encode_block(WMACodecContext *s, float (*src_coefs)[BLOCK_MAX_SIZE],
                         if (run < s->coef_vlcs[tindex]->levels[abs_level - 1])
                             code = run + s->int_table[tindex][abs_level - 1];
 
-                    assert(code < s->coef_vlcs[tindex]->n);
+                    av_assert2(code < s->coef_vlcs[tindex]->n);
                     put_bits(&s->pb, s->coef_vlcs[tindex]->huffbits[code],
                              s->coef_vlcs[tindex]->huffcodes[code]);
 
@@ -349,7 +354,7 @@ static int encode_frame(WMACodecContext *s, float (*src_coefs)[BLOCK_MAX_SIZE],
     init_put_bits(&s->pb, buf, buf_size);
 
     if (s->use_bit_reservoir)
-        assert(0); // FIXME not implemented
+        av_assert0(0); // FIXME not implemented
     else if (encode_block(s, src_coefs, total_gain) < 0)
         return INT_MAX;
 
@@ -362,12 +367,15 @@ static int encode_superframe(AVCodecContext *avctx, AVPacket *avpkt,
                              const AVFrame *frame, int *got_packet_ptr)
 {
     WMACodecContext *s = avctx->priv_data;
-    int i, total_gain, ret;
+    int i, total_gain, ret, error;
 
     s->block_len_bits = s->frame_len_bits; // required by non variable block len
     s->block_len      = 1 << s->block_len_bits;
 
-    apply_window_and_mdct(avctx, frame);
+    ret = apply_window_and_mdct(avctx, frame);
+
+    if (ret < 0)
+        return ret;
 
     if (s->ms_stereo) {
         float a, b;
@@ -381,46 +389,32 @@ static int encode_superframe(AVCodecContext *avctx, AVPacket *avpkt,
         }
     }
 
-    if ((ret = ff_alloc_packet(avpkt, 2 * MAX_CODED_SUPERFRAME_SIZE))) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+    if ((ret = ff_alloc_packet2(avctx, avpkt, 2 * MAX_CODED_SUPERFRAME_SIZE, 0)) < 0)
         return ret;
-    }
 
-#if 1
     total_gain = 128;
     for (i = 64; i; i >>= 1) {
-        int error = encode_frame(s, s->coefs, avpkt->data, avpkt->size,
+        error = encode_frame(s, s->coefs, avpkt->data, avpkt->size,
                                  total_gain - i);
-        if (error < 0)
-            total_gain -= i;
-    }
-#else
-    total_gain = 90;
-    best = encode_frame(s, s->coefs, avpkt->data, avpkt->size, total_gain);
-    for (i = 32; i; i >>= 1) {
-        int scoreL = encode_frame(s, s->coefs, avpkt->data, avpkt->size, total_gain - i);
-        int scoreR = encode_frame(s, s->coefs, avpkt->data, avpkt->size, total_gain + i);
-        av_log(NULL, AV_LOG_ERROR, "%d %d %d (%d)\n", scoreL, best, scoreR, total_gain);
-        if (scoreL < FFMIN(best, scoreR)) {
-            best        = scoreL;
+        if (error <= 0)
             total_gain -= i;
-        } else if (scoreR < best) {
-            best        = scoreR;
-            total_gain += i;
-        }
     }
-#endif /* 1 */
 
-    if ((i = encode_frame(s, s->coefs, avpkt->data, avpkt->size, total_gain)) >= 0) {
-        av_log(avctx, AV_LOG_ERROR, "required frame size too large. please "
-                                    "use a higher bit rate.\n");
+    while(total_gain <= 128 && error > 0)
+        error = encode_frame(s, s->coefs, avpkt->data, avpkt->size, total_gain++);
+    if (error > 0) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid input data or requested bitrate too low, cannot encode\n");
+        avpkt->size = 0;
         return AVERROR(EINVAL);
     }
-    assert((put_bits_count(&s->pb) & 7) == 0);
-    while (i++)
+    av_assert0((put_bits_count(&s->pb) & 7) == 0);
+    i= avctx->block_align - (put_bits_count(&s->pb)+7)/8;
+    av_assert0(i>=0);
+    while(i--)
         put_bits(&s->pb, 8, 'N');
 
     flush_put_bits(&s->pb);
+    av_assert0(put_bits_ptr(&s->pb) - s->pb.buf == avctx->block_align);
 
     if (frame->pts != AV_NOPTS_VALUE)
         avpkt->pts = frame->pts - ff_samples_to_time_base(avctx, avctx->initial_padding);
@@ -430,6 +424,7 @@ static int encode_superframe(AVCodecContext *avctx, AVPacket *avpkt,
     return 0;
 }
 
+#if CONFIG_WMAV1_ENCODER
 AVCodec ff_wmav1_encoder = {
     .name           = "wmav1",
     .long_name      = NULL_IF_CONFIG_SMALL("Windows Media Audio 1"),
@@ -442,7 +437,8 @@ AVCodec ff_wmav1_encoder = {
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
                                                       AV_SAMPLE_FMT_NONE },
 };
-
+#endif
+#if CONFIG_WMAV2_ENCODER
 AVCodec ff_wmav2_encoder = {
     .name           = "wmav2",
     .long_name      = NULL_IF_CONFIG_SMALL("Windows Media Audio 2"),
@@ -455,3 +451,4 @@ AVCodec ff_wmav2_encoder = {
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
                                                       AV_SAMPLE_FMT_NONE },
 };
+#endif
diff --git a/libavcodec/wmalosslessdec.c b/libavcodec/wmalosslessdec.c
index 231a74d..6b4edfc 100644
--- a/libavcodec/wmalosslessdec.c
+++ b/libavcodec/wmalosslessdec.c
@@ -5,20 +5,20 @@
  * Copyright (c) 2011 Andreas Öman
  * Copyright (c) 2011 - 2012 Mashiat Sarker Shakkhar
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,6 +31,7 @@
 #include "internal.h"
 #include "get_bits.h"
 #include "put_bits.h"
+#include "lossless_audiodsp.h"
 #include "wma.h"
 #include "wma_common.h"
 
@@ -46,6 +47,7 @@
 #define WMALL_BLOCK_MAX_SIZE (1 << WMALL_BLOCK_MAX_BITS)    ///< maximum block size
 #define WMALL_BLOCK_SIZES    (WMALL_BLOCK_MAX_BITS - WMALL_BLOCK_MIN_BITS + 1) ///< possible block sizes
 
+#define WMALL_COEFF_PAD_SIZE   16                       ///< pad coef buffers with 0 for use with SIMD
 
 /**
  * @brief frame-specific decoder context for a single channel
@@ -69,7 +71,9 @@ typedef struct WmallDecodeCtx {
     /* generic decoder variables */
     AVCodecContext  *avctx;
     AVFrame         *frame;
-    uint8_t         frame_data[MAX_FRAMESIZE + AV_INPUT_BUFFER_PADDING_SIZE];  ///< compressed frame data
+    LLAudDSPContext dsp;                           ///< accelerated DSP functions
+    uint8_t         *frame_data;                    ///< compressed frame data
+    int             max_frame_size;                 ///< max bitstream size
     PutBitContext   pb;                             ///< context for filling the frame_data buffer
 
     /* frame size dependent frame information (set during initialization) */
@@ -124,15 +128,15 @@ typedef struct WmallDecodeCtx {
 
     int8_t  acfilter_order;
     int8_t  acfilter_scaling;
-    int64_t acfilter_coeffs[16];
-    int     acfilter_prevvalues[2][16];
+    int16_t acfilter_coeffs[16];
+    int     acfilter_prevvalues[WMALL_MAX_CHANNELS][16];
 
     int8_t  mclms_order;
     int8_t  mclms_scaling;
     int16_t mclms_coeffs[WMALL_MAX_CHANNELS * WMALL_MAX_CHANNELS * 32];
     int16_t mclms_coeffs_cur[WMALL_MAX_CHANNELS * WMALL_MAX_CHANNELS];
-    int16_t mclms_prevvalues[WMALL_MAX_CHANNELS * 2 * 32];
-    int16_t mclms_updates[WMALL_MAX_CHANNELS * 2 * 32];
+    int32_t mclms_prevvalues[WMALL_MAX_CHANNELS * 2 * 32];
+    int32_t mclms_updates[WMALL_MAX_CHANNELS * 2 * 32];
     int     mclms_recent;
 
     int     movave_scaling;
@@ -143,35 +147,35 @@ typedef struct WmallDecodeCtx {
         int scaling;
         int coefsend;
         int bitsend;
-        int16_t coefs[MAX_ORDER];
-        int16_t lms_prevvalues[MAX_ORDER * 2];
-        int16_t lms_updates[MAX_ORDER * 2];
+        DECLARE_ALIGNED(16, int16_t, coefs)[MAX_ORDER + WMALL_COEFF_PAD_SIZE/sizeof(int16_t)];
+        DECLARE_ALIGNED(16, int32_t, lms_prevvalues)[MAX_ORDER * 2 + WMALL_COEFF_PAD_SIZE/sizeof(int16_t)];
+        DECLARE_ALIGNED(16, int16_t, lms_updates)[MAX_ORDER * 2 + WMALL_COEFF_PAD_SIZE/sizeof(int16_t)];
         int recent;
-    } cdlms[2][9];
+    } cdlms[WMALL_MAX_CHANNELS][9];
 
-    int cdlms_ttl[2];
+    int cdlms_ttl[WMALL_MAX_CHANNELS];
 
     int bV3RTM;
 
-    int is_channel_coded[2];
-    int update_speed[2];
+    int is_channel_coded[WMALL_MAX_CHANNELS];
+    int update_speed[WMALL_MAX_CHANNELS];
 
-    int transient[2];
-    int transient_pos[2];
+    int transient[WMALL_MAX_CHANNELS];
+    int transient_pos[WMALL_MAX_CHANNELS];
     int seekable_tile;
 
-    int ave_sum[2];
+    int ave_sum[WMALL_MAX_CHANNELS];
 
-    int channel_residues[2][WMALL_BLOCK_MAX_SIZE];
+    int channel_residues[WMALL_MAX_CHANNELS][WMALL_BLOCK_MAX_SIZE];
 
-    int lpc_coefs[2][40];
+    int lpc_coefs[WMALL_MAX_CHANNELS][40];
     int lpc_order;
     int lpc_scaling;
     int lpc_intbits;
-
-    int channel_coeffs[2][WMALL_BLOCK_MAX_SIZE];
 } WmallDecodeCtx;
 
+/** Get sign of integer (1 for positive, -1 for negative and 0 for zero) */
+#define WMASIGN(x) (((x) > 0) - ((x) < 0))
 
 static av_cold int decode_init(AVCodecContext *avctx)
 {
@@ -180,8 +184,19 @@ static av_cold int decode_init(AVCodecContext *avctx)
     unsigned int channel_mask;
     int i, log2_max_num_subframes;
 
+    if (!avctx->block_align) {
+        av_log(avctx, AV_LOG_ERROR, "block_align is not set\n");
+        return AVERROR(EINVAL);
+    }
+
+    s->max_frame_size = MAX_FRAMESIZE * avctx->channels;
+    s->frame_data = av_mallocz(s->max_frame_size + AV_INPUT_BUFFER_PADDING_SIZE);
+    if (!s->frame_data)
+        return AVERROR(ENOMEM);
+
     s->avctx = avctx;
-    init_put_bits(&s->pb, s->frame_data, MAX_FRAMESIZE);
+    ff_llauddsp_init(&s->dsp);
+    init_put_bits(&s->pb, s->frame_data, s->max_frame_size);
 
     if (avctx->extradata_size >= 18) {
         s->decode_flags    = AV_RL16(edata_ptr + 14);
@@ -190,9 +205,9 @@ static av_cold int decode_init(AVCodecContext *avctx)
         if (s->bits_per_sample == 16)
             avctx->sample_fmt = AV_SAMPLE_FMT_S16P;
         else if (s->bits_per_sample == 24) {
+            av_log(avctx, AV_LOG_WARNING, "Decoding audio at 24 bit-depth\n");
             avctx->sample_fmt = AV_SAMPLE_FMT_S32P;
-            avpriv_report_missing_feature(avctx, "Bit-depth higher than 16");
-            return AVERROR_PATCHWELCOME;
+            avctx->bits_per_raw_sample = 24;
         } else {
             av_log(avctx, AV_LOG_ERROR, "Unknown bit-depth: %"PRIu8"\n",
                    s->bits_per_sample);
@@ -345,11 +360,11 @@ static int decode_tilehdr(WmallDecodeCtx *s)
             if (num_samples[c] == min_channel_len) {
                 if (fixed_channel_layout || channels_for_cur_subframe == 1 ||
                    (min_channel_len == s->samples_per_frame - s->min_samples_per_subframe)) {
-                    contains_subframe[c] = in_use = 1;
+                    contains_subframe[c] = 1;
                 } else {
-                    if (get_bits1(&s->gb))
-                        contains_subframe[c] = in_use = 1;
+                    contains_subframe[c] = get_bits1(&s->gb);
                 }
+                in_use |= contains_subframe[c];
             } else
                 contains_subframe[c] = 0;
         }
@@ -453,6 +468,13 @@ static int decode_cdlms(WmallDecodeCtx *s)
                 s->cdlms[0][0].order = 0;
                 return AVERROR_INVALIDDATA;
             }
+            if(s->cdlms[c][i].order & 8 && s->bits_per_sample == 16) {
+                static int warned;
+                if(!warned)
+                    avpriv_request_sample(s->avctx, "CDLMS of order %d",
+                                          s->cdlms[c][i].order);
+                warned = 1;
+            }
         }
 
         for (i = 0; i < s->cdlms_ttl[c]; i++)
@@ -478,6 +500,10 @@ static int decode_cdlms(WmallDecodeCtx *s)
                         (get_bits(&s->gb, s->cdlms[c][i].bitsend) << shift_l) >> shift_r;
             }
         }
+
+        for (i = 0; i < s->cdlms_ttl[c]; i++)
+            memset(s->cdlms[c][i].coefs + s->cdlms[c][i].order,
+                   0, WMALL_COEFF_PAD_SIZE);
     }
 
     return 0;
@@ -504,9 +530,9 @@ static int decode_channel_residues(WmallDecodeCtx *s, int ch, int tile_size)
 
     if (s->seekable_tile) {
         if (s->do_inter_ch_decorr)
-            s->channel_residues[ch][0] = get_sbits(&s->gb, s->bits_per_sample + 1);
+            s->channel_residues[ch][0] = get_sbits_long(&s->gb, s->bits_per_sample + 1);
         else
-            s->channel_residues[ch][0] = get_sbits(&s->gb, s->bits_per_sample);
+            s->channel_residues[ch][0] = get_sbits_long(&s->gb, s->bits_per_sample);
         i++;
     }
     for (; i < tile_size; i++) {
@@ -524,17 +550,14 @@ static int decode_channel_residues(WmallDecodeCtx *s, int ch, int tile_size)
             residue = quo;
         else {
             rem_bits = av_ceil_log2(ave_mean);
-            rem      = rem_bits ? get_bits_long(&s->gb, rem_bits) : 0;
+            rem      = get_bits_long(&s->gb, rem_bits);
             residue  = (quo << rem_bits) + rem;
         }
 
         s->ave_sum[ch] = residue + s->ave_sum[ch] -
                          (s->ave_sum[ch] >> s->movave_scaling);
 
-        if (residue & 1)
-            residue = -(residue >> 1) - 1;
-        else
-            residue = residue >> 1;
+        residue = (residue >> 1) ^ -(residue & 1);
         s->channel_residues[ch][i] = residue;
     }
 
@@ -611,47 +634,31 @@ static void mclms_update(WmallDecodeCtx *s, int icoef, int *pred)
             for (i = 0; i < order * num_channels; i++)
                 s->mclms_coeffs[i + ich * order * num_channels] +=
                     s->mclms_updates[s->mclms_recent + i];
-            for (j = 0; j < ich; j++) {
-                if (s->channel_residues[j][icoef] > 0)
-                    s->mclms_coeffs_cur[ich * num_channels + j] += 1;
-                else if (s->channel_residues[j][icoef] < 0)
-                    s->mclms_coeffs_cur[ich * num_channels + j] -= 1;
-            }
+            for (j = 0; j < ich; j++)
+                s->mclms_coeffs_cur[ich * num_channels + j] += WMASIGN(s->channel_residues[j][icoef]);
         } else if (pred_error < 0) {
             for (i = 0; i < order * num_channels; i++)
                 s->mclms_coeffs[i + ich * order * num_channels] -=
                     s->mclms_updates[s->mclms_recent + i];
-            for (j = 0; j < ich; j++) {
-                if (s->channel_residues[j][icoef] > 0)
-                    s->mclms_coeffs_cur[ich * num_channels + j] -= 1;
-                else if (s->channel_residues[j][icoef] < 0)
-                    s->mclms_coeffs_cur[ich * num_channels + j] += 1;
-            }
+            for (j = 0; j < ich; j++)
+                s->mclms_coeffs_cur[ich * num_channels + j] -= WMASIGN(s->channel_residues[j][icoef]);
         }
     }
 
     for (ich = num_channels - 1; ich >= 0; ich--) {
         s->mclms_recent--;
-        s->mclms_prevvalues[s->mclms_recent] = s->channel_residues[ich][icoef];
-        if (s->channel_residues[ich][icoef] > range - 1)
-            s->mclms_prevvalues[s->mclms_recent] = range - 1;
-        else if (s->channel_residues[ich][icoef] < -range)
-            s->mclms_prevvalues[s->mclms_recent] = -range;
-
-        s->mclms_updates[s->mclms_recent] = 0;
-        if (s->channel_residues[ich][icoef] > 0)
-            s->mclms_updates[s->mclms_recent] = 1;
-        else if (s->channel_residues[ich][icoef] < 0)
-            s->mclms_updates[s->mclms_recent] = -1;
+        s->mclms_prevvalues[s->mclms_recent] = av_clip(s->channel_residues[ich][icoef],
+            -range, range - 1);
+        s->mclms_updates[s->mclms_recent] = WMASIGN(s->channel_residues[ich][icoef]);
     }
 
     if (s->mclms_recent == 0) {
         memcpy(&s->mclms_prevvalues[order * num_channels],
                s->mclms_prevvalues,
-               2 * order * num_channels);
+               sizeof(int32_t) * order * num_channels);
         memcpy(&s->mclms_updates[order * num_channels],
                s->mclms_updates,
-               2 * order * num_channels);
+               sizeof(int32_t) * order * num_channels);
         s->mclms_recent = num_channels * order;
     }
 }
@@ -667,10 +674,10 @@ static void mclms_predict(WmallDecodeCtx *s, int icoef, int *pred)
         if (!s->is_channel_coded[ich])
             continue;
         for (i = 0; i < order * num_channels; i++)
-            pred[ich] += s->mclms_prevvalues[i + s->mclms_recent] *
+            pred[ich] += (uint32_t)s->mclms_prevvalues[i + s->mclms_recent] *
                          s->mclms_coeffs[i + order * num_channels * ich];
         for (i = 0; i < ich; i++)
-            pred[ich] += s->channel_residues[i][icoef] *
+            pred[ich] += (uint32_t)s->channel_residues[i][icoef] *
                          s->mclms_coeffs_cur[i + num_channels * ich];
         pred[ich] += 1 << s->mclms_scaling - 1;
         pred[ich] >>= s->mclms_scaling;
@@ -687,60 +694,6 @@ static void revert_mclms(WmallDecodeCtx *s, int tile_size)
     }
 }
 
-static int lms_predict(WmallDecodeCtx *s, int ich, int ilms)
-{
-    int pred = 0, icoef;
-    int recent = s->cdlms[ich][ilms].recent;
-
-    for (icoef = 0; icoef < s->cdlms[ich][ilms].order; icoef++)
-        pred += s->cdlms[ich][ilms].coefs[icoef] *
-                s->cdlms[ich][ilms].lms_prevvalues[icoef + recent];
-
-    return pred;
-}
-
-static void lms_update(WmallDecodeCtx *s, int ich, int ilms,
-                       int input, int residue)
-{
-    int icoef;
-    int recent = s->cdlms[ich][ilms].recent;
-    int range  = 1 << s->bits_per_sample - 1;
-
-    if (residue < 0) {
-        for (icoef = 0; icoef < s->cdlms[ich][ilms].order; icoef++)
-            s->cdlms[ich][ilms].coefs[icoef] -=
-                s->cdlms[ich][ilms].lms_updates[icoef + recent];
-    } else if (residue > 0) {
-        for (icoef = 0; icoef < s->cdlms[ich][ilms].order; icoef++)
-            s->cdlms[ich][ilms].coefs[icoef] +=
-                s->cdlms[ich][ilms].lms_updates[icoef + recent];
-    }
-
-    if (recent)
-        recent--;
-    else {
-        memcpy(&s->cdlms[ich][ilms].lms_prevvalues[s->cdlms[ich][ilms].order],
-               s->cdlms[ich][ilms].lms_prevvalues,
-               2 * s->cdlms[ich][ilms].order);
-        memcpy(&s->cdlms[ich][ilms].lms_updates[s->cdlms[ich][ilms].order],
-               s->cdlms[ich][ilms].lms_updates,
-               2 * s->cdlms[ich][ilms].order);
-        recent = s->cdlms[ich][ilms].order - 1;
-    }
-
-    s->cdlms[ich][ilms].lms_prevvalues[recent] = av_clip(input, -range, range - 1);
-    if (!input)
-        s->cdlms[ich][ilms].lms_updates[recent] = 0;
-    else if (input < 0)
-        s->cdlms[ich][ilms].lms_updates[recent] = -s->update_speed[ich];
-    else
-        s->cdlms[ich][ilms].lms_updates[recent] = s->update_speed[ich];
-
-    s->cdlms[ich][ilms].lms_updates[recent + (s->cdlms[ich][ilms].order >> 4)] >>= 2;
-    s->cdlms[ich][ilms].lms_updates[recent + (s->cdlms[ich][ilms].order >> 3)] >>= 1;
-    s->cdlms[ich][ilms].recent = recent;
-}
-
 static void use_high_update_speed(WmallDecodeCtx *s, int ich)
 {
     int ilms, recent, icoef;
@@ -776,24 +729,63 @@ static void use_normal_update_speed(WmallDecodeCtx *s, int ich)
     s->update_speed[ich] = 8;
 }
 
-static void revert_cdlms(WmallDecodeCtx *s, int ch,
-                         int coef_begin, int coef_end)
-{
-    int icoef, pred, ilms, num_lms, residue, input;
-
-    num_lms = s->cdlms_ttl[ch];
-    for (ilms = num_lms - 1; ilms >= 0; ilms--) {
-        for (icoef = coef_begin; icoef < coef_end; icoef++) {
-            pred = 1 << (s->cdlms[ch][ilms].scaling - 1);
-            residue = s->channel_residues[ch][icoef];
-            pred += lms_predict(s, ch, ilms);
-            input = residue + (pred >> s->cdlms[ch][ilms].scaling);
-            lms_update(s, ch, ilms, input, residue);
-            s->channel_residues[ch][icoef] = input;
-        }
-    }
+#define CD_LMS(bits, ROUND) \
+static void lms_update ## bits (WmallDecodeCtx *s, int ich, int ilms, int input) \
+{ \
+    int recent = s->cdlms[ich][ilms].recent; \
+    int range  = 1 << s->bits_per_sample - 1; \
+    int order  = s->cdlms[ich][ilms].order; \
+    int ##bits##_t *prev = (int##bits##_t *)s->cdlms[ich][ilms].lms_prevvalues; \
+ \
+    if (recent) \
+        recent--; \
+    else { \
+        memcpy(prev + order, prev, (bits/8) * order); \
+        memcpy(s->cdlms[ich][ilms].lms_updates + order, \
+               s->cdlms[ich][ilms].lms_updates, \
+               sizeof(*s->cdlms[ich][ilms].lms_updates) * order); \
+        recent = order - 1; \
+    } \
+ \
+    prev[recent] = av_clip(input, -range, range - 1); \
+    s->cdlms[ich][ilms].lms_updates[recent] = WMASIGN(input) * s->update_speed[ich]; \
+ \
+    s->cdlms[ich][ilms].lms_updates[recent + (order >> 4)] >>= 2; \
+    s->cdlms[ich][ilms].lms_updates[recent + (order >> 3)] >>= 1; \
+    s->cdlms[ich][ilms].recent = recent; \
+    memset(s->cdlms[ich][ilms].lms_updates + recent + order, 0, \
+           sizeof(s->cdlms[ich][ilms].lms_updates) - \
+           sizeof(*s->cdlms[ich][ilms].lms_updates)*(recent+order)); \
+} \
+ \
+static void revert_cdlms ## bits (WmallDecodeCtx *s, int ch, \
+                                  int coef_begin, int coef_end) \
+{ \
+    int icoef, pred, ilms, num_lms, residue, input; \
+ \
+    num_lms = s->cdlms_ttl[ch]; \
+    for (ilms = num_lms - 1; ilms >= 0; ilms--) { \
+        for (icoef = coef_begin; icoef < coef_end; icoef++) { \
+            int##bits##_t *prevvalues = (int##bits##_t *)s->cdlms[ch][ilms].lms_prevvalues; \
+            pred = 1 << (s->cdlms[ch][ilms].scaling - 1); \
+            residue = s->channel_residues[ch][icoef]; \
+            pred += s->dsp.scalarproduct_and_madd_int## bits (s->cdlms[ch][ilms].coefs, \
+                                                        prevvalues + s->cdlms[ch][ilms].recent, \
+                                                        s->cdlms[ch][ilms].lms_updates + \
+                                                        s->cdlms[ch][ilms].recent, \
+                                                        FFALIGN(s->cdlms[ch][ilms].order, ROUND), \
+                                                        WMASIGN(residue)); \
+            input = residue + (pred >> s->cdlms[ch][ilms].scaling); \
+            lms_update ## bits(s, ch, ilms, input); \
+            s->channel_residues[ch][icoef] = input; \
+        } \
+    } \
+    if (bits <= 16) emms_c(); \
 }
 
+CD_LMS(16, WMALL_COEFF_PAD_SIZE)
+CD_LMS(32, 8)
+
 static void revert_inter_ch_decorr(WmallDecodeCtx *s, int tile_size)
 {
     if (s->num_channels != 2)
@@ -810,7 +802,7 @@ static void revert_inter_ch_decorr(WmallDecodeCtx *s, int tile_size)
 static void revert_acfilter(WmallDecodeCtx *s, int tile_size)
 {
     int ich, pred, i, j;
-    int64_t *filter_coeffs = s->acfilter_coeffs;
+    int16_t *filter_coeffs = s->acfilter_coeffs;
     int scaling            = s->acfilter_scaling;
     int order              = s->acfilter_order;
 
@@ -830,7 +822,7 @@ static void revert_acfilter(WmallDecodeCtx *s, int tile_size)
         for (i = order; i < tile_size; i++) {
             pred = 0;
             for (j = 0; j < order; j++)
-                pred += s->channel_residues[ich][i - j - 1] * filter_coeffs[j];
+                pred += (uint32_t)s->channel_residues[ich][i - j - 1] * filter_coeffs[j];
             pred >>= scaling;
             s->channel_residues[ich][i] += pred;
         }
@@ -908,14 +900,17 @@ static int decode_subframe(WmallDecodeCtx *s)
         s->quant_stepsize = get_bits(&s->gb, 8) + 1;
 
         reset_codec(s);
-    } else if (!s->cdlms[0][0].order) {
+    }
+
+    rawpcm_tile = get_bits1(&s->gb);
+
+    if (!rawpcm_tile && !s->cdlms[0][0].order) {
         av_log(s->avctx, AV_LOG_DEBUG,
                "Waiting for seekable tile\n");
         av_frame_unref(s->frame);
         return -1;
     }
 
-    rawpcm_tile = get_bits1(&s->gb);
 
     for (i = 0; i < s->num_channels; i++)
         s->is_channel_coded[i] = 1;
@@ -954,32 +949,37 @@ static int decode_subframe(WmallDecodeCtx *s)
                 bits * s->num_channels * subframe_len, get_bits_count(&s->gb));
         for (i = 0; i < s->num_channels; i++)
             for (j = 0; j < subframe_len; j++)
-                s->channel_coeffs[i][j] = get_sbits(&s->gb, bits);
+                s->channel_residues[i][j] = get_sbits_long(&s->gb, bits);
     } else {
-        for (i = 0; i < s->num_channels; i++)
+        for (i = 0; i < s->num_channels; i++) {
             if (s->is_channel_coded[i]) {
                 decode_channel_residues(s, i, subframe_len);
                 if (s->seekable_tile)
                     use_high_update_speed(s, i);
                 else
                     use_normal_update_speed(s, i);
-                revert_cdlms(s, i, 0, subframe_len);
+                if (s->bits_per_sample > 16)
+                    revert_cdlms32(s, i, 0, subframe_len);
+                else
+                    revert_cdlms16(s, i, 0, subframe_len);
             } else {
                 memset(s->channel_residues[i], 0, sizeof(**s->channel_residues) * subframe_len);
             }
+        }
+
+        if (s->do_mclms)
+            revert_mclms(s, subframe_len);
+        if (s->do_inter_ch_decorr)
+            revert_inter_ch_decorr(s, subframe_len);
+        if (s->do_ac_filter)
+            revert_acfilter(s, subframe_len);
+
+        /* Dequantize */
+        if (s->quant_stepsize != 1)
+            for (i = 0; i < s->num_channels; i++)
+                for (j = 0; j < subframe_len; j++)
+                    s->channel_residues[i][j] *= s->quant_stepsize;
     }
-    if (s->do_mclms)
-        revert_mclms(s, subframe_len);
-    if (s->do_inter_ch_decorr)
-        revert_inter_ch_decorr(s, subframe_len);
-    if (s->do_ac_filter)
-        revert_acfilter(s, subframe_len);
-
-    /* Dequantize */
-    if (s->quant_stepsize != 1)
-        for (i = 0; i < s->num_channels; i++)
-            for (j = 0; j < subframe_len; j++)
-                s->channel_residues[i][j] *= s->quant_stepsize;
 
     /* Write to proper output buffer depending on bit-depth */
     for (i = 0; i < s->channels_for_cur_subframe; i++) {
@@ -990,7 +990,7 @@ static int decode_subframe(WmallDecodeCtx *s)
             if (s->bits_per_sample == 16) {
                 *s->samples_16[c]++ = (int16_t) s->channel_residues[c][j] << padding_zeroes;
             } else {
-                *s->samples_32[c]++ = s->channel_residues[c][j] << padding_zeroes;
+                *s->samples_32[c]++ = s->channel_residues[c][j] << (padding_zeroes + 8);
             }
         }
     }
@@ -1021,9 +1021,8 @@ static int decode_frame(WmallDecodeCtx *s)
     s->frame->nb_samples = s->samples_per_frame;
     if ((ret = ff_get_buffer(s->avctx, s->frame, 0)) < 0) {
         /* return an error if no frame could be decoded at all */
-        av_log(s->avctx, AV_LOG_ERROR,
-               "not enough space for the output samples\n");
         s->packet_loss = 1;
+        s->frame->nb_samples = 0;
         return ret;
     }
     for (i = 0; i < s->num_channels; i++) {
@@ -1036,9 +1035,10 @@ static int decode_frame(WmallDecodeCtx *s)
         len = get_bits(gb, s->log2_frame_size);
 
     /* decode tile information */
-    if (decode_tilehdr(s)) {
+    if ((ret = decode_tilehdr(s))) {
         s->packet_loss = 1;
-        return 0;
+        av_frame_unref(s->frame);
+        return ret;
     }
 
     /* read drc info */
@@ -1060,6 +1060,9 @@ static int decode_frame(WmallDecodeCtx *s)
         if (get_bits1(gb)) {
             skip = get_bits(gb, av_log2(s->samples_per_frame * 2));
             ff_dlog(s->avctx, "end skip: %i\n", skip);
+            s->frame->nb_samples -= skip;
+            if (s->frame->nb_samples <= 0)
+                return AVERROR_INVALIDDATA;
         }
 
     }
@@ -1073,16 +1076,18 @@ static int decode_frame(WmallDecodeCtx *s)
 
     /* decode all subframes */
     while (!s->parsed_all_subframes) {
+        int decoded_samples = s->channel[0].decoded_samples;
         if (decode_subframe(s) < 0) {
             s->packet_loss = 1;
+            if (s->frame->nb_samples)
+                s->frame->nb_samples = decoded_samples;
             return 0;
         }
     }
 
     ff_dlog(s->avctx, "Frame done\n");
 
-    if (s->skip_frame)
-        s->skip_frame = 0;
+    s->skip_frame = 0;
 
     if (s->len_prefix) {
         if (len != (get_bits_count(gb) - s->frame_offset) + 2) {
@@ -1136,12 +1141,12 @@ static void save_bits(WmallDecodeCtx *s, GetBitContext* gb, int len,
     if (!append) {
         s->frame_offset   = get_bits_count(gb) & 7;
         s->num_saved_bits = s->frame_offset;
-        init_put_bits(&s->pb, s->frame_data, MAX_FRAMESIZE);
+        init_put_bits(&s->pb, s->frame_data, s->max_frame_size);
     }
 
     buflen = (s->num_saved_bits + len + 8) >> 3;
 
-    if (len <= 0 || buflen > MAX_FRAMESIZE) {
+    if (len <= 0 || buflen > s->max_frame_size) {
         avpriv_request_sample(s->avctx, "Too small input buffer");
         s->packet_loss = 1;
         return;
@@ -1178,15 +1183,18 @@ static int decode_packet(AVCodecContext *avctx, void *data, int *got_frame_ptr,
 
     s->frame->nb_samples = 0;
 
-    if (s->packet_done || s->packet_loss) {
+    if (!buf_size && s->num_saved_bits > get_bits_count(&s->gb)) {
+        s->packet_done = 0;
+        if (!decode_frame(s))
+            s->num_saved_bits = 0;
+    } else if (s->packet_done || s->packet_loss) {
         s->packet_done = 0;
 
-        /* sanity check for the buffer length */
-        if (buf_size < avctx->block_align)
+        if (!buf_size)
             return 0;
 
-        s->next_packet_start = buf_size - avctx->block_align;
-        buf_size             = avctx->block_align;
+        s->next_packet_start = buf_size - FFMIN(avctx->block_align, buf_size);
+        buf_size             = FFMIN(avctx->block_align, buf_size);
         s->buf_bit_size      = buf_size << 3;
 
         /* parse packet header */
@@ -1234,7 +1242,7 @@ static int decode_packet(AVCodecContext *avctx, void *data, int *got_frame_ptr,
              * to decode incomplete frames in the s->len_prefix == 0 case. */
             s->num_saved_bits = 0;
             s->packet_loss    = 0;
-            init_put_bits(&s->pb, s->frame_data, MAX_FRAMESIZE);
+            init_put_bits(&s->pb, s->frame_data, s->max_frame_size);
         }
 
     } else {
@@ -1275,7 +1283,7 @@ static int decode_packet(AVCodecContext *avctx, void *data, int *got_frame_ptr,
 
     s->packet_offset = get_bits_count(gb) & 7;
 
-    return (s->packet_loss) ? AVERROR_INVALIDDATA : get_bits_count(gb) >> 3;
+    return (s->packet_loss) ? AVERROR_INVALIDDATA : buf_size ? get_bits_count(gb) >> 3 : 0;
 }
 
 static void flush(AVCodecContext *avctx)
@@ -1288,7 +1296,7 @@ static void flush(AVCodecContext *avctx)
     s->next_packet_start = 0;
     s->cdlms[0][0].order = 0;
     s->frame->nb_samples = 0;
-    init_put_bits(&s->pb, s->frame_data, MAX_FRAMESIZE);
+    init_put_bits(&s->pb, s->frame_data, s->max_frame_size);
 }
 
 static av_cold int decode_close(AVCodecContext *avctx)
@@ -1296,6 +1304,7 @@ static av_cold int decode_close(AVCodecContext *avctx)
     WmallDecodeCtx *s = avctx->priv_data;
 
     av_frame_free(&s->frame);
+    av_freep(&s->frame_data);
 
     return 0;
 }
diff --git a/libavcodec/wmaprodata.h b/libavcodec/wmaprodata.h
index f8a52bf..5382479 100644
--- a/libavcodec/wmaprodata.h
+++ b/libavcodec/wmaprodata.h
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Baptiste Coudurier, Benjamin Larsson, Ulion
  * Copyright (c) 2008 - 2009 Sascha Sommer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wmaprodec.c b/libavcodec/wmaprodec.c
index daeaa79..2cc1b09 100644
--- a/libavcodec/wmaprodec.c
+++ b/libavcodec/wmaprodec.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2007 Baptiste Coudurier, Benjamin Larsson, Ulion
  * Copyright (c) 2008 - 2011 Sascha Sommer, Benjamin Larsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -88,6 +88,7 @@
 
 #include <inttypes.h>
 
+#include "libavutil/ffmath.h"
 #include "libavutil/float_dsp.h"
 #include "libavutil/intfloat.h"
 #include "libavutil/intreadwrite.h"
@@ -171,13 +172,13 @@ typedef struct WMAProChannelGrp {
 typedef struct WMAProDecodeCtx {
     /* generic decoder variables */
     AVCodecContext*  avctx;                         ///< codec context for av_log
-    AVFloatDSPContext fdsp;
+    AVFloatDSPContext *fdsp;
     uint8_t          frame_data[MAX_FRAMESIZE +
                       AV_INPUT_BUFFER_PADDING_SIZE];///< compressed frame data
     PutBitContext    pb;                            ///< context for filling the frame_data buffer
     FFTContext       mdct_ctx[WMAPRO_BLOCK_SIZES];  ///< MDCT context per block size
     DECLARE_ALIGNED(32, float, tmp)[WMAPRO_BLOCK_MAX_SIZE]; ///< IMDCT output buffer
-    float*           windows[WMAPRO_BLOCK_SIZES];   ///< windows for the different block sizes
+    const float*     windows[WMAPRO_BLOCK_SIZES];   ///< windows for the different block sizes
 
     /* frame size dependent frame information (set during initialization) */
     uint32_t         decode_flags;                  ///< used compression features
@@ -206,9 +207,11 @@ typedef struct WMAProDecodeCtx {
     int              subframe_offset;               ///< subframe offset in the bit reservoir
     uint8_t          packet_loss;                   ///< set in case of bitstream error
     uint8_t          packet_done;                   ///< set when a packet is fully decoded
+    uint8_t          skip_packets;
 
     /* frame decode state */
     uint32_t         frame_num;                     ///< current frame number (not used for decoding)
+    int              num_frames;
     GetBitContext    gb;                            ///< bitstream reader context
     int              buf_bit_size;                  ///< buffer size in bits
     uint8_t          drc_gain;                      ///< gain for the DRC tool
@@ -260,12 +263,29 @@ static av_cold int decode_end(AVCodecContext *avctx)
     WMAProDecodeCtx *s = avctx->priv_data;
     int i;
 
+    av_freep(&s->fdsp);
+
     for (i = 0; i < WMAPRO_BLOCK_SIZES; i++)
         ff_mdct_end(&s->mdct_ctx[i]);
 
     return 0;
 }
 
+static av_cold int get_rate(AVCodecContext *avctx)
+{
+    if (avctx->codec_id != AV_CODEC_ID_WMAPRO) { // XXX: is this really only for XMA?
+        if (avctx->sample_rate > 44100)
+            return 48000;
+        else if (avctx->sample_rate > 32000)
+            return 44100;
+        else if (avctx->sample_rate > 24000)
+            return 32000;
+        return 24000;
+    }
+
+    return avctx->sample_rate;
+}
+
 /**
  *@brief Initialize the decoder.
  *@param avctx codec context
@@ -280,22 +300,51 @@ static av_cold int decode_init(AVCodecContext *avctx)
     int log2_max_num_subframes;
     int num_possible_block_sizes;
 
+    if (avctx->codec_id == AV_CODEC_ID_XMA1 || avctx->codec_id == AV_CODEC_ID_XMA2)
+        avctx->block_align = 2048;
+
     if (!avctx->block_align) {
         av_log(avctx, AV_LOG_ERROR, "block_align is not set\n");
         return AVERROR(EINVAL);
     }
 
     s->avctx = avctx;
-    avpriv_float_dsp_init(&s->fdsp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->fdsp)
+        return AVERROR(ENOMEM);
 
     init_put_bits(&s->pb, s->frame_data, MAX_FRAMESIZE);
 
     avctx->sample_fmt = AV_SAMPLE_FMT_FLTP;
 
-    if (avctx->extradata_size >= 18) {
+    if (avctx->codec_id == AV_CODEC_ID_XMA2 && avctx->extradata_size >= 34) {
+        s->decode_flags    = 0x10d6;
+        channel_mask       = AV_RL32(edata_ptr+2);
+        s->bits_per_sample = 16;
+        /** dump the extradata */
+        for (i = 0; i < avctx->extradata_size; i++)
+            ff_dlog(avctx, "[%x] ", avctx->extradata[i]);
+        ff_dlog(avctx, "\n");
+
+     } else if (avctx->codec_id == AV_CODEC_ID_XMA1 && avctx->extradata_size >= 28) {
+        s->decode_flags    = 0x10d6;
+        s->bits_per_sample = 16;
+        channel_mask       = 0;
+        /** dump the extradata */
+        for (i = 0; i < avctx->extradata_size; i++)
+            ff_dlog(avctx, "[%x] ", avctx->extradata[i]);
+        ff_dlog(avctx, "\n");
+
+     } else if (avctx->extradata_size >= 18) {
         s->decode_flags    = AV_RL16(edata_ptr+14);
         channel_mask       = AV_RL32(edata_ptr+2);
         s->bits_per_sample = AV_RL16(edata_ptr);
+
+        if (s->bits_per_sample > 32 || s->bits_per_sample < 1) {
+            avpriv_request_sample(avctx, "bits per sample is %d", s->bits_per_sample);
+            return AVERROR_PATCHWELCOME;
+        }
+
         /** dump the extradata */
         for (i = 0; i < avctx->extradata_size; i++)
             ff_dlog(avctx, "[%x] ", avctx->extradata[i]);
@@ -306,21 +355,38 @@ static av_cold int decode_init(AVCodecContext *avctx)
         return AVERROR_PATCHWELCOME;
     }
 
+    if (avctx->codec_id != AV_CODEC_ID_WMAPRO && avctx->channels > 2) {
+        avpriv_report_missing_feature(avctx, ">2 channels support");
+        return AVERROR_PATCHWELCOME;
+    }
+
     /** generic init */
     s->log2_frame_size = av_log2(avctx->block_align) + 4;
+    if (s->log2_frame_size > 25) {
+        avpriv_request_sample(avctx, "Large block align");
+        return AVERROR_PATCHWELCOME;
+    }
 
     /** frame info */
-    s->skip_frame  = 1; /* skip first frame */
+    if (avctx->codec_id != AV_CODEC_ID_WMAPRO)
+        s->skip_frame = 0;
+    else
+        s->skip_frame = 1; /* skip first frame */
+
     s->packet_loss = 1;
     s->len_prefix  = (s->decode_flags & 0x40);
 
     /** get frame len */
-    bits = ff_wma_get_frame_len_bits(avctx->sample_rate, 3, s->decode_flags);
-    if (bits > WMAPRO_BLOCK_MAX_BITS) {
-        avpriv_request_sample(avctx, "14-bit block sizes");
-        return AVERROR_PATCHWELCOME;
+    if (avctx->codec_id == AV_CODEC_ID_WMAPRO) {
+        bits = ff_wma_get_frame_len_bits(avctx->sample_rate, 3, s->decode_flags);
+        if (bits > WMAPRO_BLOCK_MAX_BITS) {
+            avpriv_request_sample(avctx, "14-bit block sizes");
+            return AVERROR_PATCHWELCOME;
+        }
+        s->samples_per_frame = 1 << bits;
+    } else {
+        s->samples_per_frame = 512;
     }
-    s->samples_per_frame = 1 << bits;
 
     /** subframe info */
     log2_max_num_subframes       = ((s->decode_flags & 0x38) >> 3);
@@ -340,8 +406,8 @@ static av_cold int decode_init(AVCodecContext *avctx)
     }
 
     if (s->min_samples_per_subframe < WMAPRO_BLOCK_MIN_SIZE) {
-        av_log(avctx, AV_LOG_ERROR, "Invalid minimum block size %"PRId8"\n",
-               s->max_num_subframes);
+        av_log(avctx, AV_LOG_ERROR, "min_samples_per_subframe of %d too small\n",
+               s->min_samples_per_subframe);
         return AVERROR_INVALIDDATA;
     }
 
@@ -409,18 +475,25 @@ static av_cold int decode_init(AVCodecContext *avctx)
         int subframe_len = s->samples_per_frame >> i;
         int x;
         int band = 1;
+        int rate = get_rate(avctx);
 
         s->sfb_offsets[i][0] = 0;
 
         for (x = 0; x < MAX_BANDS-1 && s->sfb_offsets[i][band - 1] < subframe_len; x++) {
-            int offset = (subframe_len * 2 * critical_freq[x])
-                          / s->avctx->sample_rate + 2;
+            int offset = (subframe_len * 2 * critical_freq[x]) / rate + 2;
             offset &= ~3;
             if (offset > s->sfb_offsets[i][band - 1])
                 s->sfb_offsets[i][band++] = offset;
+
+            if (offset >= subframe_len)
+                break;
         }
         s->sfb_offsets[i][band - 1] = subframe_len;
         s->num_sfb[i]               = band - 1;
+        if (s->num_sfb[i] <= 0) {
+            av_log(avctx, AV_LOG_ERROR, "num_sfb invalid\n");
+            return AVERROR_INVALIDDATA;
+        }
     }
 
 
@@ -437,9 +510,10 @@ static av_cold int decode_init(AVCodecContext *avctx)
                            + s->sfb_offsets[i][b + 1] - 1) << i) >> 1;
             for (x = 0; x < num_possible_block_sizes; x++) {
                 int v = 0;
-                while (s->sfb_offsets[x][v + 1] << x < offset)
-                    if (++v >= MAX_BANDS)
-                        return AVERROR_INVALIDDATA;
+                while (s->sfb_offsets[x][v + 1] << x < offset) {
+                    v++;
+                    av_assert0(v < MAX_BANDS);
+                }
                 s->sf_offsets[i][x][b] = v;
             }
         }
@@ -461,7 +535,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
     /** calculate subwoofer cutoff values */
     for (i = 0; i < num_possible_block_sizes; i++) {
         int block_size = s->samples_per_frame >> i;
-        int cutoff = (440*block_size + 3 * (s->avctx->sample_rate >> 1) - 1)
+        int cutoff = (440*block_size + 3LL * (s->avctx->sample_rate >> 1) - 1)
                      / s->avctx->sample_rate;
         s->subwoofer_cutoffs[i] = av_clip(cutoff, 4, block_size);
     }
@@ -493,6 +567,9 @@ static int decode_subframe_length(WMAProDecodeCtx *s, int offset)
     if (offset == s->samples_per_frame - s->min_samples_per_subframe)
         return s->min_samples_per_subframe;
 
+    if (get_bits_left(&s->gb) < 1)
+        return AVERROR_INVALIDDATA;
+
     /** 1 bit indicates if the subframe is of maximum length */
     if (s->max_subframe_len_bit) {
         if (get_bits1(&s->gb))
@@ -671,7 +748,7 @@ static void decode_decorrelation_matrix(WMAProDecodeCtx *s,
 /**
  *@brief Decode channel transformation parameters
  *@param s codec context
- *@return 0 in case of success, < 0 in case of bitstream errors
+ *@return >= 0 in case of success, < 0 in case of bitstream errors
  */
 static int decode_channel_transform(WMAProDecodeCtx* s)
 {
@@ -1022,10 +1099,10 @@ static void inverse_channel_transform(WMAProDecodeCtx *s)
                     }
                 } else if (s->avctx->channels == 2) {
                     int len = FFMIN(sfb[1], s->subframe_len) - sfb[0];
-                    s->fdsp.vector_fmul_scalar(ch_data[0] + sfb[0],
+                    s->fdsp->vector_fmul_scalar(ch_data[0] + sfb[0],
                                                ch_data[0] + sfb[0],
                                                181.0 / 128, len);
-                    s->fdsp.vector_fmul_scalar(ch_data[1] + sfb[0],
+                    s->fdsp->vector_fmul_scalar(ch_data[1] + sfb[0],
                                                ch_data[1] + sfb[0],
                                                181.0 / 128, len);
                 }
@@ -1043,7 +1120,7 @@ static void wmapro_window(WMAProDecodeCtx *s)
     int i;
     for (i = 0; i < s->channels_for_cur_subframe; i++) {
         int c = s->channel_indexes_for_cur_subframe[i];
-        float* window;
+        const float* window;
         int winlen = s->channel[c].prev_block_len;
         float* start = s->channel[c].coeffs - (winlen >> 1);
 
@@ -1056,7 +1133,7 @@ static void wmapro_window(WMAProDecodeCtx *s)
 
         winlen >>= 1;
 
-        s->fdsp.vector_fmul_window(start, start, start + winlen,
+        s->fdsp->vector_fmul_window(start, start, start + winlen,
                                    window, winlen);
 
         s->channel[c].prev_block_len = s->subframe_len;
@@ -1175,6 +1252,7 @@ static int decode_subframe(WMAProDecodeCtx *s)
             transmit_coeffs = 1;
     }
 
+    av_assert0(s->subframe_len <= WMAPRO_BLOCK_MAX_SIZE);
     if (transmit_coeffs) {
         int step;
         int quant_step = 90 * s->bits_per_sample >> 4;
@@ -1185,10 +1263,11 @@ static int decode_subframe(WMAProDecodeCtx *s)
             for (i = 0; i < s->channels_for_cur_subframe; i++) {
                 int c = s->channel_indexes_for_cur_subframe[i];
                 int num_vec_coeffs = get_bits(&s->gb, num_bits) << 2;
-                if (num_vec_coeffs + offset > FF_ARRAY_ELEMS(s->channel[c].out)) {
+                if (num_vec_coeffs > s->subframe_len) {
                     av_log(s->avctx, AV_LOG_ERROR, "num_vec_coeffs %d is too large\n", num_vec_coeffs);
                     return AVERROR_INVALIDDATA;
                 }
+                av_assert0(num_vec_coeffs + offset <= FF_ARRAY_ELEMS(s->channel[c].out));
                 s->channel[c].num_vec_coeffs = num_vec_coeffs;
             }
         } else {
@@ -1272,9 +1351,9 @@ static int decode_subframe(WMAProDecodeCtx *s)
                 const int exp = s->channel[c].quant_step -
                             (s->channel[c].max_scale_factor - *sf++) *
                             s->channel[c].scale_factor_step;
-                const float quant = pow(10.0, exp / 20.0);
+                const float quant = ff_exp10(exp / 20.0);
                 int start = s->cur_sfb_offsets[b];
-                s->fdsp.vector_fmul_scalar(s->tmp + start,
+                s->fdsp->vector_fmul_scalar(s->tmp + start,
                                            s->channel[c].coeffs + start,
                                            quant, end - start);
             }
@@ -1381,7 +1460,6 @@ static int decode_frame(WMAProDecodeCtx *s, AVFrame *frame, int *got_frame_ptr)
     /* get output buffer */
     frame->nb_samples = s->samples_per_frame;
     if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
         s->packet_loss = 1;
         return 0;
     }
@@ -1464,7 +1542,7 @@ static void save_bits(WMAProDecodeCtx *s, GetBitContext* gb, int len,
         init_put_bits(&s->pb, s->frame_data, MAX_FRAMESIZE);
     }
 
-    buflen = (s->num_saved_bits + len + 8) >> 3;
+    buflen = (put_bits_count(&s->pb) + len + 8) >> 3;
 
     if (len <= 0 || buflen > MAX_FRAMESIZE) {
         avpriv_request_sample(s->avctx, "Too small input buffer");
@@ -1472,13 +1550,7 @@ static void save_bits(WMAProDecodeCtx *s, GetBitContext* gb, int len,
         return;
     }
 
-    if (len > put_bits_left(&s->pb)) {
-        av_log(s->avctx, AV_LOG_ERROR,
-               "Cannot append %d bits, only %d bits available.\n",
-               len, put_bits_left(&s->pb));
-        s->packet_loss = 1;
-        return;
-    }
+    av_assert0(len <= put_bits_left(&s->pb));
 
     s->num_saved_bits += len;
     if (!append) {
@@ -1521,32 +1593,52 @@ static int decode_packet(AVCodecContext *avctx, void *data,
 
     *got_frame_ptr = 0;
 
+    if (s->skip_packets > 0) {
+        s->skip_packets--;
+        return FFMIN(avpkt->size, avctx->block_align);
+    }
+
     if (s->packet_done || s->packet_loss) {
         s->packet_done = 0;
 
         /** sanity check for the buffer length */
-        if (buf_size < avctx->block_align) {
+        if (avctx->codec_id == AV_CODEC_ID_WMAPRO && buf_size < avctx->block_align) {
             av_log(avctx, AV_LOG_ERROR, "Input packet too small (%d < %d)\n",
                    buf_size, avctx->block_align);
             return AVERROR_INVALIDDATA;
         }
 
-        s->next_packet_start = buf_size - avctx->block_align;
-        buf_size = avctx->block_align;
+        if (avctx->codec_id == AV_CODEC_ID_WMAPRO) {
+            s->next_packet_start = buf_size - avctx->block_align;
+            buf_size = avctx->block_align;
+        } else {
+            s->next_packet_start = buf_size - FFMIN(buf_size, avctx->block_align);
+            buf_size = FFMIN(buf_size, avctx->block_align);
+        }
         s->buf_bit_size = buf_size << 3;
 
         /** parse packet header */
         init_get_bits(gb, buf, s->buf_bit_size);
-        packet_sequence_number = get_bits(gb, 4);
-        skip_bits(gb, 2);
+        if (avctx->codec_id != AV_CODEC_ID_XMA2) {
+            packet_sequence_number = get_bits(gb, 4);
+            skip_bits(gb, 2);
+        } else {
+            s->num_frames = get_bits(gb, 6);
+            packet_sequence_number = 0;
+        }
 
         /** get number of bits that need to be added to the previous frame */
         num_bits_prev_frame = get_bits(gb, s->log2_frame_size);
+        if (avctx->codec_id != AV_CODEC_ID_WMAPRO) {
+            skip_bits(gb, 3);
+            s->skip_packets = get_bits(gb, 8);
+        }
+
         ff_dlog(avctx, "packet[%d]: nbpf %x\n", avctx->frame_number,
                 num_bits_prev_frame);
 
         /** check for packet loss */
-        if (!s->packet_loss &&
+        if (avctx->codec_id != AV_CODEC_ID_XMA2 && !s->packet_loss &&
             ((s->packet_sequence_number + 1) & 0xF) != packet_sequence_number) {
             s->packet_loss = 1;
             av_log(avctx, AV_LOG_ERROR,
@@ -1593,7 +1685,8 @@ static int decode_packet(AVCodecContext *avctx, void *data,
             (frame_size = show_bits(gb, s->log2_frame_size)) &&
             frame_size <= remaining_bits(s, gb)) {
             save_bits(s, gb, frame_size, 0);
-            s->packet_done = !decode_frame(s, data, got_frame_ptr);
+            if (!s->packet_loss)
+                s->packet_done = !decode_frame(s, data, got_frame_ptr);
         } else if (!s->len_prefix
                    && s->num_saved_bits > get_bits_count(&s->gb)) {
             /** when the frames do not have a length prefix, we don't know
@@ -1608,6 +1701,11 @@ static int decode_packet(AVCodecContext *avctx, void *data,
             s->packet_done = 1;
     }
 
+    if (remaining_bits(s, gb) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Overread %d\n", -remaining_bits(s, gb));
+        s->packet_loss = 1;
+    }
+
     if (s->packet_done && !s->packet_loss &&
         remaining_bits(s, gb) > 0) {
         /** save the rest of the data so that it can be decoded
@@ -1656,3 +1754,33 @@ AVCodec ff_wmapro_decoder = {
     .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
                                                       AV_SAMPLE_FMT_NONE },
 };
+
+AVCodec ff_xma1_decoder = { /* codec descriptor registering the "xma1" audio decoder */
+    .name           = "xma1",
+    .long_name      = NULL_IF_CONFIG_SMALL("Xbox Media Audio 1"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_XMA1,
+    .priv_data_size = sizeof(WMAProDecodeCtx), /* private context is the WMAProDecodeCtx struct */
+    .init           = decode_init,
+    .close          = decode_end,
+    .decode         = decode_packet, /* decode_packet branches on avctx->codec_id when parsing the packet header */
+    .capabilities   = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DR1,
+    .flush          = flush,
+    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP, /* only output format listed */
+                                                      AV_SAMPLE_FMT_NONE }, /* last entry of the list */
+};
+
+AVCodec ff_xma2_decoder = { /* codec descriptor registering the "xma2" audio decoder */
+    .name           = "xma2",
+    .long_name      = NULL_IF_CONFIG_SMALL("Xbox Media Audio 2"),
+    .type           = AVMEDIA_TYPE_AUDIO,
+    .id             = AV_CODEC_ID_XMA2, /* decode_packet reads a 6-bit frame count instead of a sequence number for this id */
+    .priv_data_size = sizeof(WMAProDecodeCtx), /* private context is the WMAProDecodeCtx struct */
+    .init           = decode_init,
+    .close          = decode_end,
+    .decode         = decode_packet, /* decode_packet branches on avctx->codec_id when parsing the packet header */
+    .capabilities   = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DR1,
+    .flush          = flush,
+    .sample_fmts    = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP, /* only output format listed */
+                                                      AV_SAMPLE_FMT_NONE }, /* last entry of the list */
+};
diff --git a/libavcodec/wmavoice.c b/libavcodec/wmavoice.c
index 62b603c..ceac61f 100644
--- a/libavcodec/wmavoice.c
+++ b/libavcodec/wmavoice.c
@@ -2,20 +2,20 @@
  * Windows Media Audio Voice decoder.
  * Copyright (c) 2009 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,8 +25,6 @@
  * @author Ronald S. Bultje <rsbultje@gmail.com>
  */
 
-#define UNCHECKED_BITSTREAM_READER 1
-
 #include <math.h>
 
 #include "libavutil/channel_layout.h"
@@ -520,7 +518,7 @@ static int kalman_smoothen(WMAVoiceContext *s, int pitch,
     float optimal_gain = 0, dot;
     const float *ptr = &in[-FFMAX(s->min_pitch_val, pitch - 3)],
                 *end = &in[-FFMIN(s->max_pitch_val, pitch + 3)],
-                *best_hist_ptr;
+                *best_hist_ptr = NULL;
 
     /* find best fitting point in history */
     do {
@@ -780,7 +778,7 @@ static void postfilter(WMAVoiceContext *s, const float *synth,
           *synth_pf = &s->synth_filter_out_buf[MAX_LSPS_ALIGN16],
           *synth_filter_in = zero_exc_pf;
 
-    assert(size <= MAX_FRAMESIZE / 2);
+    av_assert0(size <= MAX_FRAMESIZE / 2);
 
     /* generate excitation from input signal */
     ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s->lsps);
@@ -1249,7 +1247,7 @@ static void synth_block_hardcoded(WMAVoiceContext *s, GetBitContext *gb,
     float gain;
     int n, r_idx;
 
-    assert(size <= MAX_FRAMESIZE);
+    av_assert0(size <= MAX_FRAMESIZE);
 
     /* Set the offset from which we start reading wmavoice_std_codebook */
     if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
@@ -1285,7 +1283,7 @@ static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb,
     int n, idx, gain_weight;
     AMRFixed fcb;
 
-    assert(size <= MAX_FRAMESIZE / 2);
+    av_assert0(size <= MAX_FRAMESIZE / 2);
     memset(pulses, 0, sizeof(*pulses) * size);
 
     fcb.pitch_lag      = block_pitch_sh2 >> 2;
@@ -1456,8 +1454,8 @@ static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
                        float *excitation, float *synth)
 {
     WMAVoiceContext *s = ctx->priv_data;
-    int n, n_blocks_x2, log_n_blocks_x2, cur_pitch_val;
-    int pitch[MAX_BLOCKS], last_block_pitch;
+    int n, n_blocks_x2, log_n_blocks_x2, av_uninit(cur_pitch_val);
+    int pitch[MAX_BLOCKS], av_uninit(last_block_pitch);
 
     /* Parse frame type ("frame header"), see frame_descs */
     int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)], block_nsamples;
@@ -1674,7 +1672,7 @@ static int check_bits_for_superframe(GetBitContext *orig_gb,
     /* initialize a copy */
     init_get_bits(gb, orig_gb->buffer, orig_gb->size_in_bits);
     skip_bits_long(gb, get_bits_count(orig_gb));
-    assert(get_bits_left(gb) == get_bits_left(orig_gb));
+    av_assert1(get_bits_left(gb) == get_bits_left(orig_gb));
 
     /* superframe header */
     if (get_bits_left(gb) < 14)
@@ -1820,10 +1818,8 @@ static int synth_superframe(AVCodecContext *ctx, AVFrame *frame,
 
     /* get output buffer */
     frame->nb_samples = 480;
-    if ((res = ff_get_buffer(ctx, frame, 0)) < 0) {
-        av_log(ctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((res = ff_get_buffer(ctx, frame, 0)) < 0)
         return res;
-    }
     frame->nb_samples = n_samples;
     samples = (float *)frame->data[0];
 
@@ -1955,7 +1951,7 @@ static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
     int size, res, pos;
 
     /* Packets are sometimes a multiple of ctx->block_align, with a packet
-     * header at each ctx->block_align bytes. However, Libav's ASF demuxer
+     * header at each ctx->block_align bytes. However, FFmpeg's ASF demuxer
      * feeds us ASF packets, which may concatenate multiple "codec" packets
      * in a single "muxer" packet, so we artificially emulate that by
      * capping the packet size at ctx->block_align. */
@@ -1986,7 +1982,14 @@ static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
                     *got_frame_ptr) {
                     cnt += s->spillover_nbits;
                     s->skip_bits_next = cnt & 7;
-                    return cnt >> 3;
+                    res = cnt >> 3;
+                    if (res > avpkt->size) {
+                        av_log(ctx, AV_LOG_ERROR,
+                               "Trying to skip %d bytes in packet of size %d\n",
+                               res, avpkt->size);
+                        return AVERROR_INVALIDDATA;
+                    }
+                    return res;
                 } else
                     skip_bits_long (gb, s->spillover_nbits - cnt +
                                     get_bits_count(gb)); // resync
@@ -2005,12 +2008,19 @@ static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
     } else if (*got_frame_ptr) {
         int cnt = get_bits_count(gb);
         s->skip_bits_next = cnt & 7;
-        return cnt >> 3;
+        res = cnt >> 3;
+        if (res > avpkt->size) {
+            av_log(ctx, AV_LOG_ERROR,
+                   "Trying to skip %d bytes in packet of size %d\n",
+                   res, avpkt->size);
+            return AVERROR_INVALIDDATA;
+        }
+        return res;
     } else if ((s->sframe_cache_size = pos) > 0) {
         /* rewind bit reader to start of last (incomplete) superframe... */
         init_get_bits(gb, avpkt->data, size << 3);
         skip_bits_long(gb, (size << 3) - pos);
-        assert(get_bits_left(gb) == pos);
+        av_assert1(get_bits_left(gb) == pos);
 
         /* ...and cache it for spillover in next packet */
         init_put_bits(&s->pb, s->sframe_cache, SFRAME_CACHE_MAXSIZE);
diff --git a/libavcodec/wmavoice_data.h b/libavcodec/wmavoice_data.h
index 7f14fb8..cbf65b0 100644
--- a/libavcodec/wmavoice_data.h
+++ b/libavcodec/wmavoice_data.h
@@ -2,20 +2,20 @@
  * Windows Media Voice (WMAVoice) tables.
  * Copyright (c) 2009 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wmv2.c b/libavcodec/wmv2.c
index 6542d31..327c5bd 100644
--- a/libavcodec/wmv2.c
+++ b/libavcodec/wmv2.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (c) 2002 The Libav Project
+ * Copyright (c) 2002 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -104,8 +104,8 @@ void ff_mspel_motion(MpegEncContext *s, uint8_t *dest_y,
 {
     Wmv2Context *const w = (Wmv2Context *) s;
     uint8_t *ptr;
-    int dxy, offset, mx, my, src_x, src_y, v_edge_pos;
-    ptrdiff_t linesize, uvlinesize;
+    int dxy, mx, my, src_x, src_y, v_edge_pos;
+    ptrdiff_t offset, linesize, uvlinesize;
     int emu = 0;
 
     dxy   = ((motion_y & 1) << 1) | (motion_x & 1);
@@ -145,21 +145,13 @@ void ff_mspel_motion(MpegEncContext *s, uint8_t *dest_y,
     if (s->avctx->flags & AV_CODEC_FLAG_GRAY)
         return;
 
-    if (s->out_format == FMT_H263) {
-        dxy = 0;
-        if ((motion_x & 3) != 0)
-            dxy |= 1;
-        if ((motion_y & 3) != 0)
-            dxy |= 2;
-        mx = motion_x >> 2;
-        my = motion_y >> 2;
-    } else {
-        mx   = motion_x / 2;
-        my   = motion_y / 2;
-        dxy  = ((my & 1) << 1) | (mx & 1);
-        mx >>= 1;
-        my >>= 1;
-    }
+    dxy = 0;
+    if ((motion_x & 3) != 0)
+        dxy |= 1;
+    if ((motion_y & 3) != 0)
+        dxy |= 2;
+    mx = motion_x >> 2;
+    my = motion_y >> 2;
 
     src_x = s->mb_x * 8 + mx;
     src_y = s->mb_y * 8 + my;
diff --git a/libavcodec/wmv2.h b/libavcodec/wmv2.h
index b77dd98..31593b8 100644
--- a/libavcodec/wmv2.h
+++ b/libavcodec/wmv2.h
@@ -1,20 +1,20 @@
 /*
- * Copyright (c) 2002 The Libav Project
+ * Copyright (c) 2002 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -70,4 +70,16 @@ void ff_mspel_motion(MpegEncContext *s,
                      uint8_t **ref_picture, op_pixels_func (*pix_op)[4],
                      int motion_x, int motion_y, int h);
 
+
+static av_always_inline int wmv2_get_cbp_table_index(MpegEncContext *s, int cbp_index) /* maps a coded cbp_index to a table index based on s->qscale */
+{
+    static const uint8_t map[3][3] = { /* indexed as [qscale range][cbp_index]; cbp_index must be 0..2 (not checked here) */
+        { 0, 2, 1 }, /* qscale <= 10 */
+        { 1, 0, 2 }, /* 10 < qscale <= 20 */
+        { 2, 1, 0 }, /* qscale > 20 */
+    };
+
+    return map[(s->qscale > 10) + (s->qscale > 20)][cbp_index]; /* row = how many of the thresholds 10 and 20 qscale exceeds (0, 1 or 2) */
+}
+
 #endif /* AVCODEC_WMV2_H */
diff --git a/libavcodec/wmv2data.c b/libavcodec/wmv2data.c
index bbb07bb..e858572 100644
--- a/libavcodec/wmv2data.c
+++ b/libavcodec/wmv2data.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wmv2data.h b/libavcodec/wmv2data.h
index 8914e57..178346a 100644
--- a/libavcodec/wmv2data.h
+++ b/libavcodec/wmv2data.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wmv2dec.c b/libavcodec/wmv2dec.c
index e1f86d8..20dbee5 100644
--- a/libavcodec/wmv2dec.c
+++ b/libavcodec/wmv2dec.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (c) 2002 The Libav Project
+ * Copyright (c) 2002 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -108,7 +108,7 @@ static int decode_ext_header(Wmv2Context *w)
 
     if (s->avctx->debug & FF_DEBUG_PICT_INFO)
         av_log(s->avctx, AV_LOG_DEBUG,
-               "fps:%d, br:%d, qpbit:%d, abt_flag:%d, j_type_bit:%d, "
+               "fps:%d, br:%"PRId64", qpbit:%d, abt_flag:%d, j_type_bit:%d, "
                "tl_mv_flag:%d, mbrl_bit:%d, code:%d, loop_filter:%d, "
                "slices:%d\n",
                fps, s->bit_rate, w->mspel_bit, w->abt_flag, w->j_type_bit,
@@ -174,16 +174,7 @@ int ff_wmv2_decode_secondary_picture_header(MpegEncContext *s)
 
         parse_mb_skip(w);
         cbp_index = decode012(&s->gb);
-        if (s->qscale <= 10) {
-            int map[3]         = { 0, 2, 1 };
-            w->cbp_table_index = map[cbp_index];
-        } else if (s->qscale <= 20) {
-            int map[3]         = { 1, 0, 2 };
-            w->cbp_table_index = map[cbp_index];
-        } else {
-            int map[3]         = {2,1,0};
-            w->cbp_table_index = map[cbp_index];
-        }
+        w->cbp_table_index = wmv2_get_cbp_table_index(s, cbp_index);
 
         if (w->mspel_bit)
             s->mspel = get_bits1(&s->gb);
@@ -469,6 +460,10 @@ static av_cold int wmv2_decode_init(AVCodecContext *avctx)
     Wmv2Context *const w = avctx->priv_data;
     int ret;
 
+#if FF_API_EMU_EDGE
+    avctx->flags |= CODEC_FLAG_EMU_EDGE;
+#endif
+
     if ((ret = ff_msmpeg4_decode_init(avctx)) < 0)
         return ret;
 
diff --git a/libavcodec/wmv2dsp.c b/libavcodec/wmv2dsp.c
index 2e3a3ff..40e0bef 100644
--- a/libavcodec/wmv2dsp.c
+++ b/libavcodec/wmv2dsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wmv2dsp.h b/libavcodec/wmv2dsp.h
index f2f258e..0bf9489 100644
--- a/libavcodec/wmv2dsp.h
+++ b/libavcodec/wmv2dsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/wmv2enc.c b/libavcodec/wmv2enc.c
index b09942e..74ae12b 100644
--- a/libavcodec/wmv2enc.c
+++ b/libavcodec/wmv2enc.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (c) 2002 The Libav Project
+ * Copyright (c) 2002 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -62,7 +62,7 @@ static av_cold int wmv2_encode_init(AVCodecContext *avctx)
     ff_wmv2_common_init(w);
 
     avctx->extradata_size = 4;
-    avctx->extradata      = av_mallocz(avctx->extradata_size + 10);
+    avctx->extradata      = av_mallocz(avctx->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE);
     if (!avctx->extradata)
         return AVERROR(ENOMEM);
 
@@ -88,10 +88,10 @@ int ff_wmv2_encode_picture_header(MpegEncContext *s, int picture_number)
     w->abt_type        = 0;
     w->j_type          = 0;
 
-    assert(s->flipflop_rounding);
+    av_assert0(s->flipflop_rounding);
 
     if (s->pict_type == AV_PICTURE_TYPE_I) {
-        assert(s->no_rounding == 1);
+        av_assert0(s->no_rounding == 1);
         if (w->j_type_bit)
             put_bits(&s->pb, 1, w->j_type);
 
@@ -112,16 +112,7 @@ int ff_wmv2_encode_picture_header(MpegEncContext *s, int picture_number)
         put_bits(&s->pb, 2, SKIP_TYPE_NONE);
 
         ff_msmpeg4_code012(&s->pb, cbp_index = 0);
-        if (s->qscale <= 10) {
-            int map[3]         = { 0, 2, 1 };
-            w->cbp_table_index = map[cbp_index];
-        } else if (s->qscale <= 20) {
-            int map[3]         = { 1, 0, 2 };
-            w->cbp_table_index = map[cbp_index];
-        } else {
-            int map[3]         = { 2, 1, 0 };
-            w->cbp_table_index = map[cbp_index];
-        }
+        w->cbp_table_index = wmv2_get_cbp_table_index(s, cbp_index);
 
         if (w->mspel_bit)
             put_bits(&s->pb, 1, s->mspel);
@@ -174,10 +165,12 @@ void ff_wmv2_encode_mb(MpegEncContext *s, int16_t block[6][64],
                  ff_wmv2_inter_table[w->cbp_table_index][cbp + 64][1],
                  ff_wmv2_inter_table[w->cbp_table_index][cbp + 64][0]);
 
+        s->misc_bits += get_bits_diff(s);
         /* motion vector */
         ff_h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
         ff_msmpeg4_encode_motion(s, motion_x - pred_x,
                                  motion_y - pred_y);
+        s->mv_bits += get_bits_diff(s);
     } else {
         /* compute cbp */
         cbp       = 0;
@@ -210,10 +203,15 @@ void ff_wmv2_encode_mb(MpegEncContext *s, int16_t block[6][64],
                      ff_table_inter_intra[s->h263_aic_dir][1],
                      ff_table_inter_intra[s->h263_aic_dir][0]);
         }
+        s->misc_bits += get_bits_diff(s);
     }
 
     for (i = 0; i < 6; i++)
         ff_msmpeg4_encode_block(s, block[i], i);
+    if (s->mb_intra)
+        s->i_tex_bits += get_bits_diff(s);
+    else
+        s->p_tex_bits += get_bits_diff(s);
 }
 
 static const AVClass wmv2_class = {
diff --git a/libavcodec/wnv1.c b/libavcodec/wnv1.c
index d0304c9..9ff99b2 100644
--- a/libavcodec/wnv1.c
+++ b/libavcodec/wnv1.c
@@ -2,20 +2,20 @@
  * Winnov WNV1 codec
  * Copyright (c) 2005 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,8 +31,6 @@
 
 
 typedef struct WNV1Context {
-    AVCodecContext *avctx;
-
     int shift;
     GetBitContext gb;
 } WNV1Context;
@@ -70,8 +68,8 @@ static int decode_frame(AVCodecContext *avctx,
     int prev_y = 0, prev_u = 0, prev_v = 0;
     uint8_t *rbuf;
 
-    if (buf_size < 8) {
-        av_log(avctx, AV_LOG_ERROR, "Packet is too short\n");
+    if (buf_size <= 8) {
+        av_log(avctx, AV_LOG_ERROR, "Packet size %d is too small\n", buf_size);
         return AVERROR_INVALIDDATA;
     }
 
@@ -80,9 +78,9 @@ static int decode_frame(AVCodecContext *avctx,
         av_log(avctx, AV_LOG_ERROR, "Cannot allocate temporary buffer\n");
         return AVERROR(ENOMEM);
     }
+    memset(rbuf + buf_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
 
     if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
         av_free(rbuf);
         return ret;
     }
@@ -90,7 +88,12 @@ static int decode_frame(AVCodecContext *avctx,
 
     for (i = 8; i < buf_size; i++)
         rbuf[i] = ff_reverse[buf[i]];
-    init_get_bits(&l->gb, rbuf + 8, (buf_size - 8) * 8);
+
+    if ((ret = init_get_bits8(&l->gb, rbuf + 8, buf_size - 8)) < 0) {
+        /* rbuf is this function's temporary copy: free it here as the ff_get_buffer() failure path does */
+        av_free(rbuf);
+        return ret;
+    }
 
     if (buf[2] >> 4 == 6)
         l->shift = 2;
@@ -134,10 +137,8 @@ static int decode_frame(AVCodecContext *avctx,
 
 static av_cold int decode_init(AVCodecContext *avctx)
 {
-    WNV1Context * const l = avctx->priv_data;
     static VLC_TYPE code_table[1 << CODE_VLC_BITS][2];
 
-    l->avctx       = avctx;
     avctx->pix_fmt = AV_PIX_FMT_YUV422P;
 
     code_vlc.table           = code_table;
diff --git a/libavcodec/wrapped_avframe.c b/libavcodec/wrapped_avframe.c
index e1273e4..13c8d8a 100644
--- a/libavcodec/wrapped_avframe.c
+++ b/libavcodec/wrapped_avframe.c
@@ -2,20 +2,20 @@
  * AVFrame wrapper
  * Copyright (c) 2015 Luca Barbato
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/ws-snd1.c b/libavcodec/ws-snd1.c
index 11b7289..0f00580 100644
--- a/libavcodec/ws-snd1.c
+++ b/libavcodec/ws-snd1.c
@@ -2,20 +2,20 @@
  * Westwood SNDx codecs
  * Copyright (c) 2005 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -76,15 +76,13 @@ static int ws_snd_decode_frame(AVCodecContext *avctx, void *data,
 
     if (in_size > buf_size) {
         av_log(avctx, AV_LOG_ERROR, "Frame data is larger than input buffer\n");
-        return -1;
+        return AVERROR_INVALIDDATA;
     }
 
     /* get output buffer */
     frame->nb_samples = out_size;
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
         return ret;
-    }
     samples     = frame->data[0];
     samples_end = samples + out_size;
 
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index cdf7758..839b5bc 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -3,11 +3,14 @@ OBJS                                   += x86/constants.o               \
 # subsystems
 OBJS-$(CONFIG_AC3DSP)                  += x86/ac3dsp_init.o
 OBJS-$(CONFIG_AUDIODSP)                += x86/audiodsp_init.o
-OBJS-$(CONFIG_BLOCKDSP)                += x86/blockdsp.o
+OBJS-$(CONFIG_BLOCKDSP)                += x86/blockdsp_init.o
 OBJS-$(CONFIG_BSWAPDSP)                += x86/bswapdsp_init.o
 OBJS-$(CONFIG_DCT)                     += x86/dct_init.o
+OBJS-$(CONFIG_DIRAC_DECODER)           += x86/diracdsp_init.o           \
+                                          x86/dirac_dwt_init.o
 OBJS-$(CONFIG_FDCTDSP)                 += x86/fdctdsp_init.o
 OBJS-$(CONFIG_FFT)                     += x86/fft_init.o
+OBJS-$(CONFIG_FLACDSP)                 += x86/flacdsp_init.o
 OBJS-$(CONFIG_FMTCONVERT)              += x86/fmtconvert_init.o
 OBJS-$(CONFIG_H263DSP)                 += x86/h263dsp_init.o
 OBJS-$(CONFIG_H264CHROMA)              += x86/h264chroma_init.o
@@ -15,11 +18,12 @@ OBJS-$(CONFIG_H264DSP)                 += x86/h264dsp_init.o
 OBJS-$(CONFIG_H264PRED)                += x86/h264_intrapred_init.o
 OBJS-$(CONFIG_H264QPEL)                += x86/h264_qpel.o
 OBJS-$(CONFIG_HPELDSP)                 += x86/hpeldsp_init.o
+OBJS-$(CONFIG_LLAUDDSP)                += x86/lossless_audiodsp_init.o
+OBJS-$(CONFIG_LLVIDDSP)                += x86/lossless_videodsp_init.o
 OBJS-$(CONFIG_HUFFYUVDSP)              += x86/huffyuvdsp_init.o
 OBJS-$(CONFIG_HUFFYUVENCDSP)           += x86/huffyuvencdsp_mmx.o
 OBJS-$(CONFIG_IDCTDSP)                 += x86/idctdsp_init.o
 OBJS-$(CONFIG_LPC)                     += x86/lpc.o
-OBJS-$(CONFIG_MDCT)                    += x86/mdct_init.o
 OBJS-$(CONFIG_ME_CMP)                  += x86/me_cmp_init.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += x86/mpegaudiodsp.o
 OBJS-$(CONFIG_MPEGVIDEO)               += x86/mpegvideo.o              \
@@ -36,43 +40,52 @@ OBJS-$(CONFIG_VP8DSP)                  += x86/vp8dsp_init.o
 OBJS-$(CONFIG_XMM_CLOBBER_TEST)        += x86/w64xmmtest.o
 
 # decoders/encoders
-OBJS-$(CONFIG_AAC_DECODER)             += x86/sbrdsp_init.o
-OBJS-$(CONFIG_APE_DECODER)             += x86/apedsp_init.o
+OBJS-$(CONFIG_AAC_DECODER)             += x86/aacpsdsp_init.o          \
+                                          x86/sbrdsp_init.o
+OBJS-$(CONFIG_ADPCM_G722_DECODER)      += x86/g722dsp_init.o
+OBJS-$(CONFIG_ADPCM_G722_ENCODER)      += x86/g722dsp_init.o
+OBJS-$(CONFIG_ALAC_DECODER)            += x86/alacdsp_init.o
+OBJS-$(CONFIG_APNG_DECODER)            += x86/pngdsp_init.o
 OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
-OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o
+OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o x86/synth_filter_init.o
 OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
 OBJS-$(CONFIG_HEVC_DECODER)            += x86/hevcdsp_init.o
-OBJS-$(CONFIG_MLP_DECODER)             += x86/mlpdsp.o
+OBJS-$(CONFIG_JPEG2000_DECODER)        += x86/jpeg2000dsp_init.o
+OBJS-$(CONFIG_MLP_DECODER)             += x86/mlpdsp_init.o
 OBJS-$(CONFIG_MPEG4_DECODER)           += x86/xvididct_init.o
 OBJS-$(CONFIG_PNG_DECODER)             += x86/pngdsp_init.o
 OBJS-$(CONFIG_PRORES_DECODER)          += x86/proresdsp_init.o
+OBJS-$(CONFIG_PRORES_LGPL_DECODER)     += x86/proresdsp_init.o
 OBJS-$(CONFIG_RV40_DECODER)            += x86/rv40dsp_init.o
-OBJS-$(CONFIG_SVQ1_ENCODER)            += x86/svq1enc.o
-OBJS-$(CONFIG_TRUEHD_DECODER)          += x86/mlpdsp.o
+OBJS-$(CONFIG_SVQ1_ENCODER)            += x86/svq1enc_init.o
+OBJS-$(CONFIG_TAK_DECODER)             += x86/takdsp_init.o
+OBJS-$(CONFIG_TRUEHD_DECODER)          += x86/mlpdsp_init.o
+OBJS-$(CONFIG_TTA_DECODER)             += x86/ttadsp_init.o
+OBJS-$(CONFIG_V210_DECODER)            += x86/v210-init.o
 OBJS-$(CONFIG_V210_ENCODER)            += x86/v210enc_init.o
 OBJS-$(CONFIG_VORBIS_DECODER)          += x86/vorbisdsp_init.o
 OBJS-$(CONFIG_VP6_DECODER)             += x86/vp6dsp_init.o
-OBJS-$(CONFIG_VP9_DECODER)             += x86/vp9dsp_init.o
+OBJS-$(CONFIG_VP9_DECODER)             += x86/vp9dsp_init.o            \
+                                          x86/vp9dsp_init_10bpp.o      \
+                                          x86/vp9dsp_init_12bpp.o      \
+                                          x86/vp9dsp_init_16bpp.o
+OBJS-$(CONFIG_WEBP_DECODER)            += x86/vp8dsp_init.o
 
 
 # GCC inline assembly optimizations
 # subsystems
-MMX-OBJS-$(CONFIG_AUDIODSP)            += x86/audiodsp_mmx.o
 MMX-OBJS-$(CONFIG_FDCTDSP)             += x86/fdct.o
-MMX-OBJS-$(CONFIG_HPELDSP)             += x86/fpel_mmx.o                \
-                                          x86/hpeldsp_mmx.o
-MMX-OBJS-$(CONFIG_IDCTDSP)             += x86/idctdsp_mmx.o             \
-                                          x86/simple_idct.o
-MMX-OBJS-$(CONFIG_QPELDSP)             += x86/fpel_mmx.o
+MMX-OBJS-$(CONFIG_IDCTDSP)             += x86/simple_idct.o
 MMX-OBJS-$(CONFIG_VC1DSP)              += x86/vc1dsp_mmx.o
 
 # decoders/encoders
-MMX-OBJS-$(CONFIG_MPEG4_DECODER)       += x86/xvididct_mmx.o            \
-                                          x86/xvididct_sse2.o
+MMX-OBJS-$(CONFIG_SNOW_DECODER)        += x86/snowdsp.o
+MMX-OBJS-$(CONFIG_SNOW_ENCODER)        += x86/snowdsp.o
 
 # subsystems
 YASM-OBJS-$(CONFIG_AC3DSP)             += x86/ac3dsp.o
 YASM-OBJS-$(CONFIG_AUDIODSP)           += x86/audiodsp.o
+YASM-OBJS-$(CONFIG_BLOCKDSP)           += x86/blockdsp.o
 YASM-OBJS-$(CONFIG_BSWAPDSP)           += x86/bswapdsp.o
 YASM-OBJS-$(CONFIG_DCT)                += x86/dct32.o
 YASM-OBJS-$(CONFIG_FFT)                += x86/fft.o
@@ -95,6 +108,10 @@ YASM-OBJS-$(CONFIG_H264QPEL)           += x86/h264_qpel_8bit.o          \
 YASM-OBJS-$(CONFIG_HPELDSP)            += x86/fpel.o                    \
                                           x86/hpeldsp.o
 YASM-OBJS-$(CONFIG_HUFFYUVDSP)         += x86/huffyuvdsp.o
+YASM-OBJS-$(CONFIG_HUFFYUVENCDSP)      += x86/huffyuvencdsp.o
+YASM-OBJS-$(CONFIG_IDCTDSP)            += x86/idctdsp.o
+YASM-OBJS-$(CONFIG_LLAUDDSP)           += x86/lossless_audiodsp.o
+YASM-OBJS-$(CONFIG_LLVIDDSP)           += x86/lossless_videodsp.o
 YASM-OBJS-$(CONFIG_ME_CMP)             += x86/me_cmp.o
 YASM-OBJS-$(CONFIG_MPEGAUDIODSP)       += x86/imdct36.o
 YASM-OBJS-$(CONFIG_MPEGVIDEOENC)       += x86/mpegvideoencdsp.o
@@ -103,23 +120,56 @@ YASM-OBJS-$(CONFIG_QPELDSP)            += x86/qpeldsp.o                 \
                                           x86/fpel.o                    \
                                           x86/qpel.o
 YASM-OBJS-$(CONFIG_RV34DSP)            += x86/rv34dsp.o
-YASM-OBJS-$(CONFIG_VC1DSP)             += x86/vc1dsp.o
+YASM-OBJS-$(CONFIG_VC1DSP)             += x86/vc1dsp_loopfilter.o       \
+                                          x86/vc1dsp_mc.o
+YASM-OBJS-$(CONFIG_IDCTDSP)            += x86/simple_idct10.o
 YASM-OBJS-$(CONFIG_VIDEODSP)           += x86/videodsp.o
 YASM-OBJS-$(CONFIG_VP3DSP)             += x86/vp3dsp.o
 YASM-OBJS-$(CONFIG_VP8DSP)             += x86/vp8dsp.o                  \
                                           x86/vp8dsp_loopfilter.o
 
 # decoders/encoders
-YASM-OBJS-$(CONFIG_AAC_DECODER)        += x86/sbrdsp.o
-YASM-OBJS-$(CONFIG_APE_DECODER)        += x86/apedsp.o
-YASM-OBJS-$(CONFIG_DCA_DECODER)        += x86/dcadsp.o
+YASM-OBJS-$(CONFIG_AAC_DECODER)        += x86/aacpsdsp.o                \
+                                          x86/sbrdsp.o
+YASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o
+YASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
+YASM-OBJS-$(CONFIG_ALAC_DECODER)       += x86/alacdsp.o
+YASM-OBJS-$(CONFIG_APNG_DECODER)       += x86/pngdsp.o
+YASM-OBJS-$(CONFIG_DCA_DECODER)        += x86/dcadsp.o x86/synth_filter.o
+YASM-OBJS-$(CONFIG_DIRAC_DECODER)      += x86/diracdsp.o                \
+                                          x86/dirac_dwt.o
 YASM-OBJS-$(CONFIG_DNXHD_ENCODER)      += x86/dnxhdenc.o
-YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_deblock.o            \
-                                          x86/hevc_mc.o
+YASM-OBJS-$(CONFIG_FLAC_DECODER)       += x86/flacdsp.o
+ifdef CONFIG_GPL
+YASM-OBJS-$(CONFIG_FLAC_ENCODER)       += x86/flac_dsp_gpl.o
+endif
+YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_mc.o                 \
+                                          x86/hevc_deblock.o            \
+                                          x86/hevc_idct.o               \
+                                          x86/hevc_res_add.o            \
+                                          x86/hevc_sao.o                \
+                                          x86/hevc_sao_10bit.o
+YASM-OBJS-$(CONFIG_JPEG2000_DECODER)   += x86/jpeg2000dsp.o
+YASM-OBJS-$(CONFIG_MLP_DECODER)        += x86/mlpdsp.o
+YASM-OBJS-$(CONFIG_MPEG4_DECODER)      += x86/xvididct.o
 YASM-OBJS-$(CONFIG_PNG_DECODER)        += x86/pngdsp.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER)     += x86/proresdsp.o
+YASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
 YASM-OBJS-$(CONFIG_RV40_DECODER)       += x86/rv40dsp.o
+YASM-OBJS-$(CONFIG_SVQ1_ENCODER)       += x86/svq1enc.o
+YASM-OBJS-$(CONFIG_TAK_DECODER)        += x86/takdsp.o
+YASM-OBJS-$(CONFIG_TRUEHD_DECODER)     += x86/mlpdsp.o
+YASM-OBJS-$(CONFIG_TTA_DECODER)        += x86/ttadsp.o
 YASM-OBJS-$(CONFIG_V210_ENCODER)       += x86/v210enc.o
+YASM-OBJS-$(CONFIG_V210_DECODER)       += x86/v210.o
 YASM-OBJS-$(CONFIG_VORBIS_DECODER)     += x86/vorbisdsp.o
 YASM-OBJS-$(CONFIG_VP6_DECODER)        += x86/vp6dsp.o
-YASM-OBJS-$(CONFIG_VP9_DECODER)        += x86/vp9dsp.o
+YASM-OBJS-$(CONFIG_VP9_DECODER)        += x86/vp9intrapred.o            \
+                                          x86/vp9intrapred_16bpp.o      \
+                                          x86/vp9itxfm.o                \
+                                          x86/vp9itxfm_16bpp.o          \
+                                          x86/vp9lpf.o                  \
+                                          x86/vp9lpf_16bpp.o            \
+                                          x86/vp9mc.o                   \
+                                          x86/vp9mc_16bpp.o
+YASM-OBJS-$(CONFIG_WEBP_DECODER)       += x86/vp8dsp.o
diff --git a/libavcodec/x86/aacpsdsp.asm b/libavcodec/x86/aacpsdsp.asm
new file mode 100644
index 0000000..e92cbbc
--- /dev/null
+++ b/libavcodec/x86/aacpsdsp.asm
@@ -0,0 +1,209 @@
+;******************************************************************************
+;* SIMD optimized MPEG-4 Parametric Stereo decoding functions
+;*
+;* Copyright (C) 2015 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000
+
+SECTION .text
+
+;*************************************************************************
+;void ff_ps_add_squares_<opt>(float *dst, const float (*src)[2], int n);
+;*************************************************************************
+%macro PS_ADD_SQUARES 1 ; %1 = XMM register count forwarded to cglobal
+cglobal ps_add_squares, 3, 3, %1, dst, src, n ; adds the squared components of each src pair onto dst
+    shl    nd, 3 ; n *= 8: byte length of n float pairs
+    add  srcq, nq ; src now points one past the last pair
+    neg    nq ; nq runs from -length up to 0
+
+align 16
+.loop: ; consumes 4 pairs (2*mmsize bytes) and updates 4 dst floats per pass
+    movaps m0, [srcq+nq] ; pairs 0-1 (movaps: src must be 16-byte aligned)
+    movaps m1, [srcq+nq+mmsize] ; pairs 2-3
+    mulps  m0, m0 ; square every component
+    mulps  m1, m1
+    HADDPS m0, m1, m2 ; presumably sums adjacent lanes (re^2 + im^2) with m2 as temp - confirm in x86util.asm
+    addps  m0, [dstq] ; accumulate onto the values already in dst
+    movaps [dstq], m0 ; dst must be 16-byte aligned as well
+    add  dstq, mmsize
+    add    nq, mmsize*2
+    jl .loop ; no n <= 0 check: the body always runs at least once
+    REP_RET
+%endmacro
+
+INIT_XMM sse
+PS_ADD_SQUARES 2
+INIT_XMM sse3
+PS_ADD_SQUARES 3
+
+;*******************************************************************
+;void ff_ps_mul_pair_single_sse(float (*dst)[2], float (*src0)[2],
+;                                   float *src1, int n);
+;*******************************************************************
+INIT_XMM sse
+cglobal ps_mul_pair_single, 4, 5, 4, dst, src1, src2, n ; asm src1/src2 are the prototype's src0/src1
+    xor r4q, r4q ; r4 = byte offset shared by dst and the pair input
+
+.loop: ; 4 pairs times 4 scalars per pass
+    movu     m0, [src1q+r4q] ; pairs 0-1 (unaligned load allowed)
+    movu     m1, [src1q+r4q+mmsize] ; pairs 2-3
+    mova     m2, [src2q] ; 4 scalars (aligned load)
+    mova     m3, m2
+    unpcklps m2, m2 ; (s0, s0, s1, s1): one scalar per (re, im) pair
+    unpckhps m3, m3 ; (s2, s2, s3, s3)
+    mulps    m0, m2 ; scale both components of each pair
+    mulps    m1, m3
+    mova [dstq+r4q], m0 ; aligned stores to dst
+    mova [dstq+r4q+mmsize], m1
+    add   src2q, mmsize
+    add     r4q, mmsize*2
+    sub      nd, mmsize/4 ; 4 elements done
+    jg .loop ; body runs once before n is tested
+    REP_RET
+
+;***********************************************************************
+;void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
+;                                   float h[2][4], float h_step[2][4],
+;                                   int len);
+;***********************************************************************
+INIT_XMM sse3
+cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n ; n is the prototype's len
+    movaps   m0, [hq] ; m0 = first four floats at h (h0..h3); later floats are not read here
+    movaps   m1, [h_stepq] ; m1 = first four floats at h_step
+    cmp      nd, 0
+    jle .ret ; nothing to do for len <= 0
+    shl      nd, 3 ; len -> bytes (8 per float pair)
+    add      lq, nq ; l and r now point past their last pair
+    add      rq, nq
+    neg      nq ; negative offset counting up to 0
+
+align 16
+.loop: ; one (l, r) pair per pass
+    addps    m0, m1 ; advance the coefficients before they are used
+    movddup  m2, [lq+nq] ; (l_re, l_im, l_re, l_im)
+    movddup  m3, [rq+nq] ; (r_re, r_im, r_re, r_im)
+    movaps   m4, m0
+    movaps   m5, m0
+    unpcklps m4, m4 ; (h0, h0, h1, h1)
+    unpckhps m5, m5 ; (h2, h2, h3, h3)
+    mulps    m2, m4
+    mulps    m3, m5
+    addps    m2, m3 ; low half = h0*l + h2*r, high half = h1*l + h3*r
+    movsd  [lq+nq], m2 ; low half becomes the new l
+    movhps [rq+nq], m2 ; high half becomes the new r
+    add      nq, 8
+    jl .loop
+.ret:
+    REP_RET
+
+;*******************************************************************
+;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
+;                                 const float (*filter)[8][2],
+;                                 int stride, int n);
+;*******************************************************************
+%macro PS_HYBRID_ANALYSIS_LOOP 3 ; %1, %2 = result registers, %3 = step index (invoked with 0, 1, 2)
+    movu     %1, [inq+mmsize*%3] ; in[2*%3] and in[2*%3+1]
+    movu     m1, [inq+mmsize*(5-%3)+8] ; in[11-2*%3] and in[12-2*%3], the mirror images around in[6]
+%if cpuflag(sse3)
+    pshufd   %2, %1, q2301 ; copy of %1 with re/im swapped inside each pair
+    pshufd   m4, m1, q0123 ; mirrored input with all four lanes reversed
+    pshufd   m1, m1, q1032 ; mirrored input with its two pairs exchanged
+    pshufd   m2, [filterq+nq+mmsize*%3], q2301 ; filter taps 2*%3 and 2*%3+1, re/im swapped
+    addsubps %2, m4 ; subtracts in even lanes, adds in odd lanes
+    addsubps %1, m1
+%else
+    mova     m2, [filterq+nq+mmsize*%3] ; same taps, shuffled below
+    mova     %2, %1
+    mova     m4, m1
+    shufps   %2, %2, q2301 ; same lane orders as the SSE3 branch
+    shufps   m4, m4, q0123
+    shufps   m1, m1, q1032
+    shufps   m2, m2, q2301
+    xorps    m4, m7 ; m7 is expected to hold ps_p1m1p1m1 (sign bit in lanes 1 and 3),
+    xorps    m1, m7 ; so negating those lanes makes the subps below behave like addsubps
+    subps    %2, m4
+    subps    %1, m1
+%endif
+    mulps    %2, m2 ; weight both combinations by the swapped taps
+    mulps    %1, m2
+%if %3
+    addps    m3, %2 ; non-zero steps accumulate into m3/m0; the step-0 invocation
+    addps    m0, %1 ; passes m0/m3 as %1/%2 and so initialises them directly
+%endif
+%endmacro
+
+%macro PS_HYBRID_ANALYSIS 0 ; emits ps_hybrid_analysis for the current INIT_XMM instruction set
+cglobal ps_hybrid_analysis, 5, 5, 8, out, in, filter, stride, n
+%if cpuflag(sse3)
+%define MOVH movsd
+%else
+%define MOVH movlps
+%endif
+    shl strided, 3 ; stride -> bytes (8 per output float pair)
+    shl nd, 6 ; n -> bytes of filter data (64 per [8][2] float filter)
+    add filterq, nq ; filter now points past the last filter
+    neg nq ; negative offset counting up to 0
+    mova m7, [ps_p1m1p1m1] ; sign mask for lanes 1 and 3
+
+align 16
+.loop: ; one filter in, one output pair out; inq is never advanced, so every pass reads the same input
+    PS_HYBRID_ANALYSIS_LOOP m0, m3, 0 ; in[0..1] with in[11..12], initialises m0/m3
+    PS_HYBRID_ANALYSIS_LOOP m5, m6, 1 ; in[2..3] with in[9..10], accumulated
+    PS_HYBRID_ANALYSIS_LOOP m5, m6, 2 ; in[4..5] with in[7..8], accumulated
+
+%if cpuflag(sse3)
+    pshufd   m3, m3, q2301 ; NOTE(review): this branch appears to fold the m0/m3 partial sums
+    xorps    m0, m7 ; into one (re, im) pair in the low half of m1 - confirm against the C version
+    hsubps   m3, m0
+    pshufd   m1, m3, q0020
+    pshufd   m3, m3, q0031
+    addps    m1, m3
+    movsd    m2, [inq+6*8] ; in[6], the centre sample
+%else
+    mova     m1, m3 ; NOTE(review): shuffle/add equivalent of the SSE3 reduction above - confirm
+    mova     m2, m0
+    shufps   m1, m1, q2301
+    shufps   m2, m2, q2301
+    subps    m1, m3
+    addps    m2, m0
+    unpcklps m3, m1, m2
+    unpckhps m1, m2
+    addps    m1, m3
+    movu     m2, [inq+6*8] ; faster than movlps and no risk of overread
+%endif
+    movss    m3, [filterq+nq+8*6] ; first float of tap 6 of the current filter
+    SPLATD   m3 ; copied to every lane (x86util macro)
+    mulps    m2, m3 ; in[6] scaled by that single coefficient
+    addps    m1, m2
+    MOVH [outq], m1 ; store the low 8 bytes: one output pair
+    add    outq, strideq ; next output is stride pairs further on
+    add      nq, 64 ; next filter
+    jl .loop ; no n <= 0 check: the body always runs at least once
+    REP_RET
+%endmacro
+
+INIT_XMM sse
+PS_HYBRID_ANALYSIS
+INIT_XMM sse3
+PS_HYBRID_ANALYSIS
diff --git a/libavcodec/x86/aacpsdsp_init.c b/libavcodec/x86/aacpsdsp_init.c
new file mode 100644
index 0000000..f6d6c03
--- /dev/null
+++ b/libavcodec/x86/aacpsdsp_init.c
@@ -0,0 +1,55 @@
+/*
+ * SIMD optimized MPEG-4 Parametric Stereo decoding functions
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/x86/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/aacpsdsp.h"
+
+void ff_ps_add_squares_sse  (float *dst, const float (*src)[2], int n);
+void ff_ps_add_squares_sse3 (float *dst, const float (*src)[2], int n);
+void ff_ps_mul_pair_single_sse (float (*dst)[2], float (*src0)[2],
+                                float *src1, int n);
+void ff_ps_hybrid_analysis_sse (float (*out)[2], float (*in)[2],
+                                const float (*filter)[8][2],
+                                int stride, int n);
+void ff_ps_hybrid_analysis_sse3(float (*out)[2], float (*in)[2],
+                                const float (*filter)[8][2],
+                                int stride, int n);
+void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
+                                   float h[2][4], float h_step[2][4],
+                                   int len);
+
+av_cold void ff_psdsp_init_x86(PSDSPContext *s) /* install the x86 SIMD routines into s */
+{
+    int cpu_flags = av_get_cpu_flags(); /* tested by the EXTERNAL_* macros below */
+
+    if (EXTERNAL_SSE(cpu_flags)) { /* baseline: SSE versions */
+        s->add_squares            = ff_ps_add_squares_sse;
+        s->mul_pair_single        = ff_ps_mul_pair_single_sse; /* only an SSE version is declared */
+        s->hybrid_analysis        = ff_ps_hybrid_analysis_sse;
+    }
+    if (EXTERNAL_SSE3(cpu_flags)) { /* checked second, so these overwrite the SSE choices */
+        s->add_squares            = ff_ps_add_squares_sse3;
+        s->stereo_interpolate[0]  = ff_ps_stereo_interpolate_sse3; /* only slot 0 is replaced */
+        s->hybrid_analysis        = ff_ps_hybrid_analysis_sse3;
+    }
+} /* fields not assigned above are left as the caller set them */
diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index 817d5a3..675ade3 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -2,20 +2,20 @@
 ;* x86-optimized AC-3 DSP functions
 ;* Copyright (c) 2011 Justin Ruggles
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -32,7 +32,7 @@ pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
 pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7
 
 ; used in ff_ac3_extract_exponents()
-pd_1:   times 4 dd 1
+cextern pd_1
 pd_151: times 4 dd 151
 
 ; used in ff_apply_window_int16()
diff --git a/libavcodec/x86/ac3dsp_init.c b/libavcodec/x86/ac3dsp_init.c
index 23f0162..07f0d25 100644
--- a/libavcodec/x86/ac3dsp_init.c
+++ b/libavcodec/x86/ac3dsp_init.c
@@ -2,20 +2,20 @@
  * x86-optimized AC-3 DSP functions
  * Copyright (c) 2011 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -63,6 +63,11 @@ void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                       const int16_t *window, unsigned int len);
 
+#if ARCH_X86_32 && defined(__INTEL_COMPILER)
+#       undef HAVE_7REGS
+#       define HAVE_7REGS 0
+#endif
+
 #if HAVE_SSE_INLINE && HAVE_7REGS
 
 #define IF1(x) x
@@ -159,7 +164,7 @@ static void ac3_downmix_sse(float **samples, float (*matrix)[2],
                matrix_cmp[3][0] == matrix_cmp[4][0]) {
         MIX5(IF1, IF0);
     } else {
-        DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
+        LOCAL_ALIGNED(16, float, matrix_simd, [AC3_MAX_CHANNELS], [2][4]);
         float *samp[AC3_MAX_CHANNELS];
 
         for (j = 0; j < in_ch; j++)
diff --git a/libavcodec/x86/alacdsp.asm b/libavcodec/x86/alacdsp.asm
new file mode 100644
index 0000000..bb2069f
--- /dev/null
+++ b/libavcodec/x86/alacdsp.asm
@@ -0,0 +1,133 @@
+;******************************************************************************
+;* ALAC DSP SIMD optimizations
+;*
+;* Copyright (C) 2015 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+INIT_XMM sse4
+%if ARCH_X86_64
+cglobal alac_decorrelate_stereo, 2, 5, 8, buf0, len, shift, weight, buf1 ; buf1 is a scratch register, not an argument
+%else
+cglobal alac_decorrelate_stereo, 2, 3, 8, buf0, len, shift, weight ; only 3 GPRs requested here
+%define  buf1q  r2q
+%endif
+    movd    m6, shiftm ; shift count for psrad
+    movd    m7, weightm ; weight in lane 0
+    SPLATD  m7 ; copied to every lane (x86util macro)
+    shl   lend, 2 ; sample count -> bytes (4 per sample)
+    mov  buf1q, [buf0q + gprsize] ; second pointer of the pointer array passed as argument 0
+    mov  buf0q, [buf0q] ; first pointer of that array
+    add  buf1q, lenq ; both pointers now sit at the end of their data
+    add  buf0q, lenq
+    neg  lenq ; negative offset counting up to 0
+
+align 16
+.loop: ; 8 samples per channel per pass
+    mova    m0, [buf0q + lenq] ; m0/m1 = buf0 samples (aligned access)
+    mova    m1, [buf0q + lenq + mmsize]
+    mova    m2, [buf1q + lenq] ; m2/m3 = buf1 samples
+    mova    m3, [buf1q + lenq + mmsize]
+    pmulld  m4, m2, m7 ; buf1 * weight (low 32 bits of each product)
+    pmulld  m5, m3, m7
+    psrad   m4, m6 ; arithmetic >> shift
+    psrad   m5, m6
+    psubd   m0, m4 ; a = buf0 - ((buf1 * weight) >> shift)
+    psubd   m1, m5
+    paddd   m2, m0 ; b = buf1 + a
+    paddd   m3, m1
+    mova [buf1q + lenq], m0 ; buf1 receives a
+    mova [buf1q + lenq + mmsize], m1
+    mova [buf0q + lenq], m2 ; buf0 receives b
+    mova [buf0q + lenq + mmsize], m3
+
+    add   lenq, mmsize*2
+    jl .loop ; no len <= 0 check: the body always runs at least once
+    RET
+
+INIT_XMM sse2
+cglobal alac_append_extra_bits_stereo, 2, 5, 5, buf0, exbuf0, buf1, exbuf1, len ; r2/r3 are reused as scratch pointers
+    movifnidn lend, lenm ; fetch len unless it is already in its register
+    movd      m4, r2m ; exbits
+    shl     lend, 2 ; sample count -> bytes (4 per sample)
+    mov    buf1q, [buf0q + gprsize] ; second pointer of the array passed as argument 0
+    mov    buf0q, [buf0q] ; first pointer of that array
+    mov  exbuf1q, [exbuf0q + gprsize] ; second pointer of the array passed as argument 1
+    mov  exbuf0q, [exbuf0q] ; first pointer of that array
+    add    buf1q, lenq ; all four pointers now sit at the end of their data
+    add    buf0q, lenq
+    add  exbuf1q, lenq
+    add  exbuf0q, lenq
+    neg lenq ; negative offset counting up to 0
+
+align 16
+.loop: ; 8 samples per channel per pass
+    mova      m0, [buf0q + lenq] ; first channel (aligned access)
+    mova      m1, [buf0q + lenq + mmsize]
+    pslld     m0, m4 ; shift left by exbits
+    pslld     m1, m4
+    mova      m2, [buf1q + lenq] ; second channel
+    mova      m3, [buf1q + lenq + mmsize]
+    pslld     m2, m4
+    pslld     m3, m4
+    por       m0, [exbuf0q + lenq] ; OR in the matching extra-bits words
+    por       m1, [exbuf0q + lenq + mmsize]
+    por       m2, [exbuf1q + lenq]
+    por       m3, [exbuf1q + lenq + mmsize]
+    mova [buf0q + lenq         ], m0 ; results are written back in place
+    mova [buf0q + lenq + mmsize], m1
+    mova [buf1q + lenq         ], m2
+    mova [buf1q + lenq + mmsize], m3
+
+    add     lenq, mmsize*2
+    jl .loop ; no len <= 0 check: the body always runs at least once
+    REP_RET
+
+%if ARCH_X86_64
+cglobal alac_append_extra_bits_mono, 2, 5, 3, buf, exbuf, exbits, ch, len ; all five arguments named
+%else
+cglobal alac_append_extra_bits_mono, 2, 3, 3, buf, exbuf, len ; len lives in r2 here, hence the exbitsm alias below
+%define exbitsm r2m
+%endif
+    movifnidn lend, r4m ; len is argument 4; fetch it unless already in place
+    movd     m2, exbitsm ; shift count
+    shl    lend, 2 ; sample count -> bytes (4 per sample)
+    mov    bufq, [bufq] ; first pointer of the array passed as argument 0
+    mov  exbufq, [exbufq] ; first pointer of the array passed as argument 1
+    add    bufq, lenq ; both pointers now sit at the end of their data
+    add  exbufq, lenq
+    neg lenq ; negative offset counting up to 0
+
+align 16
+.loop: ; 8 samples per pass
+    mova      m0, [bufq + lenq] ; aligned access
+    mova      m1, [bufq + lenq + mmsize]
+    pslld     m0, m2 ; shift left by exbits
+    pslld     m1, m2
+    por       m0, [exbufq + lenq] ; OR in the matching extra-bits words
+    por       m1, [exbufq + lenq + mmsize]
+    mova [bufq + lenq], m0 ; results are written back in place
+    mova [bufq + lenq + mmsize], m1
+
+    add     lenq, mmsize*2
+    jl .loop ; no len <= 0 check: the body always runs at least once
+    REP_RET
diff --git a/libavcodec/x86/alacdsp_init.c b/libavcodec/x86/alacdsp_init.c
new file mode 100644
index 0000000..de5dae6
--- /dev/null
+++ b/libavcodec/x86/alacdsp_init.c
@@ -0,0 +1,44 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/alacdsp.h"
+#include "config.h"
+
+void ff_alac_decorrelate_stereo_sse4(int32_t *buffer[2], int nb_samples, /* prototypes for routines not defined in this C file */
+                                     int decorr_shift, int decorr_left_weight);
+void ff_alac_append_extra_bits_stereo_sse2(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
+                                           int extra_bits, int channels, int nb_samples);
+void ff_alac_append_extra_bits_mono_sse2(int32_t *buffer[2], int32_t *extra_bits_buffer[2],
+                                         int extra_bits, int channels, int nb_samples);
+
+av_cold void ff_alacdsp_init_x86(ALACDSPContext *c) /* install the x86 SIMD routines into c according to the runtime CPU flags */
+{
+#if HAVE_YASM
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->append_extra_bits[0] = ff_alac_append_extra_bits_mono_sse2;   /* slot 0 takes the mono routine */
+        c->append_extra_bits[1] = ff_alac_append_extra_bits_stereo_sse2; /* slot 1 takes the stereo routine */
+    }
+    if (EXTERNAL_SSE4(cpu_flags)) {
+        c->decorrelate_stereo   = ff_alac_decorrelate_stereo_sse4;
+    }
+#endif /* HAVE_YASM */
+} /* without HAVE_YASM the body is empty and c is left untouched */
diff --git a/libavcodec/x86/audiodsp.asm b/libavcodec/x86/audiodsp.asm
index 696a73b..3ffb27f 100644
--- a/libavcodec/x86/audiodsp.asm
+++ b/libavcodec/x86/audiodsp.asm
@@ -2,20 +2,20 @@
 ;* optimized audio functions
 ;* Copyright (c) 2008 Loren Merritt
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -40,15 +40,11 @@ cglobal scalarproduct_int16, 3,3,3, v1, v2, order
     paddd   m2, m1
     add     orderq, mmsize*2
     jl .loop
-%if mmsize == 16
-    movhlps m0, m2
-    paddd   m2, m0
-    pshuflw m0, m2, 0x4e
-%else
-    pshufw  m0, m2, 0x4e
-%endif
-    paddd   m2, m0
+    HADDD   m2, m0
     movd   eax, m2
+%if mmsize == 8
+    emms
+%endif
     RET
 %endmacro
 
@@ -80,17 +76,17 @@ cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
     SPLATD    m4
     SPLATD    m5
 .loop:
-%assign %%i 1
+%assign %%i 0
 %rep %2
-    mova      m0,  [srcq+mmsize*0*%%i]
-    mova      m1,  [srcq+mmsize*1*%%i]
-    mova      m2,  [srcq+mmsize*2*%%i]
-    mova      m3,  [srcq+mmsize*3*%%i]
+    mova      m0,  [srcq+mmsize*(0+%%i)]
+    mova      m1,  [srcq+mmsize*(1+%%i)]
+    mova      m2,  [srcq+mmsize*(2+%%i)]
+    mova      m3,  [srcq+mmsize*(3+%%i)]
 %if %3
-    mova      m7,  [srcq+mmsize*4*%%i]
-    mova      m8,  [srcq+mmsize*5*%%i]
-    mova      m9,  [srcq+mmsize*6*%%i]
-    mova      m10, [srcq+mmsize*7*%%i]
+    mova      m7,  [srcq+mmsize*(4+%%i)]
+    mova      m8,  [srcq+mmsize*(5+%%i)]
+    mova      m9,  [srcq+mmsize*(6+%%i)]
+    mova      m10, [srcq+mmsize*(7+%%i)]
 %endif
     CLIPD  m0,  m4, m5, m6
     CLIPD  m1,  m4, m5, m6
@@ -102,17 +98,17 @@ cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
     CLIPD  m9,  m4, m5, m6
     CLIPD  m10, m4, m5, m6
 %endif
-    mova  [dstq+mmsize*0*%%i], m0
-    mova  [dstq+mmsize*1*%%i], m1
-    mova  [dstq+mmsize*2*%%i], m2
-    mova  [dstq+mmsize*3*%%i], m3
+    mova  [dstq+mmsize*(0+%%i)], m0
+    mova  [dstq+mmsize*(1+%%i)], m1
+    mova  [dstq+mmsize*(2+%%i)], m2
+    mova  [dstq+mmsize*(3+%%i)], m3
 %if %3
-    mova  [dstq+mmsize*4*%%i], m7
-    mova  [dstq+mmsize*5*%%i], m8
-    mova  [dstq+mmsize*6*%%i], m9
-    mova  [dstq+mmsize*7*%%i], m10
+    mova  [dstq+mmsize*(4+%%i)], m7
+    mova  [dstq+mmsize*(5+%%i)], m8
+    mova  [dstq+mmsize*(6+%%i)], m9
+    mova  [dstq+mmsize*(7+%%i)], m10
 %endif
-%assign %%i %%i+1
+%assign %%i %%i+4*(%3+1)
 %endrep
     add     srcq, mmsize*4*(%2+%3)
     add     dstq, mmsize*4*(%2+%3)
@@ -135,3 +131,47 @@ VECTOR_CLIP_INT32 11, 1, 1, 0
 %else
 VECTOR_CLIP_INT32 6, 1, 0, 0
 %endif
+
+;-----------------------------------------------------
+;void ff_vector_clipf(float *dst, const float *src,
+;                     float min, float max, int len)
+;-----------------------------------------------------
+INIT_XMM sse
+%if UNIX64
+cglobal vector_clipf, 3,3,6, dst, src, len ; presumably min/max already sit in xmm0/xmm1 here (float args), leaving dst, src, len as the integer args
+%else
+cglobal vector_clipf, 5,5,6, dst, src, min, max, len
+%endif
+%if WIN64
+    SWAP 0, 2 ; presumably min arrives in xmm2 (3rd positional arg); rename it to m0
+    SWAP 1, 3 ; likewise max: xmm3 becomes m1
+%elif ARCH_X86_32
+    movss   m0, minm ; load min from its stack argument slot
+    movss   m1, maxm ; load max from its stack argument slot
+%endif
+    SPLATD  m0 ; broadcast min to all four lanes
+    SPLATD  m1 ; broadcast max to all four lanes
+        shl lend, 2 ; element count -> byte count (4 bytes per float)
+        add srcq, lenq ; point src and dst at the end of the data ...
+        add dstq, lenq
+        neg lenq ; ... and walk a negative byte offset up towards 0
+.loop: ; 4*mmsize bytes (16 floats) per iteration; NOTE(review): assumes len > 0 and a multiple of 16 - confirm with callers
+    mova    m2,  [srcq+lenq+mmsize*0] ; aligned loads; presumably src and dst are 16-byte aligned - confirm
+    mova    m3,  [srcq+lenq+mmsize*1]
+    mova    m4,  [srcq+lenq+mmsize*2]
+    mova    m5,  [srcq+lenq+mmsize*3]
+    maxps   m2, m0 ; lower clamp: x = max(x, min)
+    maxps   m3, m0
+    maxps   m4, m0
+    maxps   m5, m0
+    minps   m2, m1 ; upper clamp: x = min(x, max)
+    minps   m3, m1
+    minps   m4, m1
+    minps   m5, m1
+    mova    [dstq+lenq+mmsize*0], m2
+    mova    [dstq+lenq+mmsize*1], m3
+    mova    [dstq+lenq+mmsize*2], m4
+    mova    [dstq+lenq+mmsize*3], m5
+    add     lenq, mmsize*4
+    jl .loop ; continue while the offset is still negative
+    REP_RET
diff --git a/libavcodec/x86/audiodsp_init.c b/libavcodec/x86/audiodsp_init.c
index 743f5a3..a2ce231 100644
--- a/libavcodec/x86/audiodsp_init.c
+++ b/libavcodec/x86/audiodsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,7 +24,6 @@
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/audiodsp.h"
-#include "audiodsp.h"
 
 int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
                                       int order);
@@ -39,6 +38,8 @@ void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
                                    int32_t min, int32_t max, unsigned int len);
 void ff_vector_clip_int32_sse4(int32_t *dst, const int32_t *src,
                                int32_t min, int32_t max, unsigned int len);
+void ff_vector_clipf_sse(float *dst, const float *src,
+                         float min, float max, int len);
 
 av_cold void ff_audiodsp_init_x86(AudioDSPContext *c)
 {
@@ -50,7 +51,7 @@ av_cold void ff_audiodsp_init_x86(AudioDSPContext *c)
     if (EXTERNAL_MMXEXT(cpu_flags))
         c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
 
-    if (INLINE_SSE(cpu_flags))
+    if (EXTERNAL_SSE(cpu_flags))
         c->vector_clipf = ff_vector_clipf_sse;
 
     if (EXTERNAL_SSE2(cpu_flags)) {
diff --git a/libavcodec/x86/audiodsp_mmx.c b/libavcodec/x86/audiodsp_mmx.c
deleted file mode 100644
index cb55059..0000000
--- a/libavcodec/x86/audiodsp_mmx.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/x86/asm.h"
-#include "audiodsp.h"
-
-#if HAVE_INLINE_ASM
-
-void ff_vector_clipf_sse(float *dst, const float *src,
-                         float min, float max, int len)
-{
-    x86_reg i = (len - 16) * 4;
-    __asm__ volatile (
-        "movss          %3, %%xmm4      \n\t"
-        "movss          %4, %%xmm5      \n\t"
-        "shufps $0, %%xmm4, %%xmm4      \n\t"
-        "shufps $0, %%xmm5, %%xmm5      \n\t"
-        "1:                             \n\t"
-        "movaps   (%2, %0), %%xmm0      \n\t" // 3/1 on intel
-        "movaps 16(%2, %0), %%xmm1      \n\t"
-        "movaps 32(%2, %0), %%xmm2      \n\t"
-        "movaps 48(%2, %0), %%xmm3      \n\t"
-        "maxps      %%xmm4, %%xmm0      \n\t"
-        "maxps      %%xmm4, %%xmm1      \n\t"
-        "maxps      %%xmm4, %%xmm2      \n\t"
-        "maxps      %%xmm4, %%xmm3      \n\t"
-        "minps      %%xmm5, %%xmm0      \n\t"
-        "minps      %%xmm5, %%xmm1      \n\t"
-        "minps      %%xmm5, %%xmm2      \n\t"
-        "minps      %%xmm5, %%xmm3      \n\t"
-        "movaps     %%xmm0,   (%1, %0)  \n\t"
-        "movaps     %%xmm1, 16(%1, %0)  \n\t"
-        "movaps     %%xmm2, 32(%1, %0)  \n\t"
-        "movaps     %%xmm3, 48(%1, %0)  \n\t"
-        "sub           $64, %0          \n\t"
-        "jge            1b              \n\t"
-        : "+&r" (i)
-        : "r" (dst), "r" (src), "m" (min), "m" (max)
-        : "memory");
-}
-
-#endif /* HAVE_INLINE_ASM */
diff --git a/libavcodec/x86/blockdsp.asm b/libavcodec/x86/blockdsp.asm
new file mode 100644
index 0000000..7cbfa3a
--- /dev/null
+++ b/libavcodec/x86/blockdsp.asm
@@ -0,0 +1,86 @@
+;******************************************************************************
+;* SIMD-optimized clear block functions
+;* Copyright (c) 2002 Michael Niedermayer
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2009 Fiona Glaser
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+;----------------------------------------
+; void ff_clear_block(int16_t *blocks);
+;----------------------------------------
+; %1 = number of xmm registers used
+; %2 = number of unrolled repetitions of the 8-store group
+%macro CLEAR_BLOCK 2
+cglobal clear_block, 1, 1, %1, blocks ; zero-fills a fixed-size region starting at blocks, fully unrolled (no loop)
+    ZERO  m0, m0 ; m0 = 0; ZERO is defined before each instantiation below (pxor or xorps)
+%assign %%i 0
+%rep %2
+    mova  [blocksq+mmsize*(0+%%i)], m0 ; eight consecutive mmsize-byte stores per repetition
+    mova  [blocksq+mmsize*(1+%%i)], m0
+    mova  [blocksq+mmsize*(2+%%i)], m0
+    mova  [blocksq+mmsize*(3+%%i)], m0
+    mova  [blocksq+mmsize*(4+%%i)], m0
+    mova  [blocksq+mmsize*(5+%%i)], m0
+    mova  [blocksq+mmsize*(6+%%i)], m0
+    mova  [blocksq+mmsize*(7+%%i)], m0
+%assign %%i %%i+8
+%endrep
+    RET
+%endmacro
+
+INIT_MMX mmx
+%define ZERO pxor
+CLEAR_BLOCK 0, 2 ; MMX: 2 repetitions x 8 stores x 8 bytes = 128 bytes
+INIT_XMM sse
+%define ZERO xorps
+CLEAR_BLOCK 1, 1 ; SSE: 1 repetition x 8 stores x 16 bytes = 128 bytes
+
+;-----------------------------------------
+; void ff_clear_blocks(int16_t *blocks);
+;-----------------------------------------
+; %1 = number of xmm registers used
+%macro CLEAR_BLOCKS 1
+cglobal clear_blocks, 1, 2, %1, blocks, len ; zero-fills 768 bytes starting at blocks; len is a scratch offset, not an argument
+    add   blocksq, 768 ; point at the end of the region (768 = 6 * 128 bytes, matching the replaced C version)
+    mov      lenq, -768 ; negative byte offset that counts up towards 0
+    ZERO       m0, m0 ; m0 = 0; ZERO is defined before each instantiation below (pxor or xorps)
+.loop:
+    mova  [blocksq+lenq+mmsize*0], m0 ; eight mmsize-byte stores per iteration
+    mova  [blocksq+lenq+mmsize*1], m0
+    mova  [blocksq+lenq+mmsize*2], m0
+    mova  [blocksq+lenq+mmsize*3], m0
+    mova  [blocksq+lenq+mmsize*4], m0
+    mova  [blocksq+lenq+mmsize*5], m0
+    mova  [blocksq+lenq+mmsize*6], m0
+    mova  [blocksq+lenq+mmsize*7], m0
+    add   lenq, mmsize*8
+    js .loop ; continue while the offset is still negative
+    RET
+%endmacro
+
+INIT_MMX mmx
+%define ZERO pxor
+CLEAR_BLOCKS 0 ; MMX: 64 bytes per iteration, 12 iterations
+INIT_XMM sse
+%define ZERO xorps
+CLEAR_BLOCKS 1 ; SSE: 128 bytes per iteration, 6 iterations
diff --git a/libavcodec/x86/blockdsp.c b/libavcodec/x86/blockdsp.c
deleted file mode 100644
index b529424..0000000
--- a/libavcodec/x86/blockdsp.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "config.h"
-#include "libavutil/attributes.h"
-#include "libavutil/internal.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/blockdsp.h"
-#include "libavcodec/version.h"
-
-#if HAVE_INLINE_ASM
-
-#define CLEAR_BLOCKS(name, n)                           \
-static void name(int16_t *blocks)                       \
-{                                                       \
-    __asm__ volatile (                                  \
-        "pxor %%mm7, %%mm7              \n\t"           \
-        "mov     %1,        %%"REG_a"   \n\t"           \
-        "1:                             \n\t"           \
-        "movq %%mm7,   (%0, %%"REG_a")  \n\t"           \
-        "movq %%mm7,  8(%0, %%"REG_a")  \n\t"           \
-        "movq %%mm7, 16(%0, %%"REG_a")  \n\t"           \
-        "movq %%mm7, 24(%0, %%"REG_a")  \n\t"           \
-        "add    $32, %%"REG_a"          \n\t"           \
-        "js      1b                     \n\t"           \
-        :: "r"(((uint8_t *) blocks) + 128 * n),         \
-           "i"(-128 * n)                                \
-        : "%"REG_a);                                    \
-}
-CLEAR_BLOCKS(clear_blocks_mmx, 6)
-CLEAR_BLOCKS(clear_block_mmx, 1)
-
-static void clear_block_sse(int16_t *block)
-{
-    __asm__ volatile (
-        "xorps  %%xmm0, %%xmm0          \n"
-        "movaps %%xmm0,    (%0)         \n"
-        "movaps %%xmm0,  16(%0)         \n"
-        "movaps %%xmm0,  32(%0)         \n"
-        "movaps %%xmm0,  48(%0)         \n"
-        "movaps %%xmm0,  64(%0)         \n"
-        "movaps %%xmm0,  80(%0)         \n"
-        "movaps %%xmm0,  96(%0)         \n"
-        "movaps %%xmm0, 112(%0)         \n"
-        :: "r" (block)
-        : "memory");
-}
-
-static void clear_blocks_sse(int16_t *blocks)
-{
-    __asm__ volatile (
-        "xorps  %%xmm0, %%xmm0              \n"
-        "mov        %1,         %%"REG_a"   \n"
-        "1:                                 \n"
-        "movaps %%xmm0,    (%0, %%"REG_a")  \n"
-        "movaps %%xmm0,  16(%0, %%"REG_a")  \n"
-        "movaps %%xmm0,  32(%0, %%"REG_a")  \n"
-        "movaps %%xmm0,  48(%0, %%"REG_a")  \n"
-        "movaps %%xmm0,  64(%0, %%"REG_a")  \n"
-        "movaps %%xmm0,  80(%0, %%"REG_a")  \n"
-        "movaps %%xmm0,  96(%0, %%"REG_a")  \n"
-        "movaps %%xmm0, 112(%0, %%"REG_a")  \n"
-        "add      $128,         %%"REG_a"   \n"
-        "js         1b                      \n"
-        :: "r"(((uint8_t *) blocks) + 128 * 6), "i"(-128 * 6)
-        : "%"REG_a);
-}
-
-#endif /* HAVE_INLINE_ASM */
-
-#if FF_API_XVMC
-av_cold void ff_blockdsp_init_x86(BlockDSPContext *c, unsigned high_bit_depth,
-                                  AVCodecContext *avctx)
-#else
-av_cold void ff_blockdsp_init_x86(BlockDSPContext *c, unsigned high_bit_depth)
-#endif /* FF_API_XVMC */
-{
-#if HAVE_INLINE_ASM
-    int cpu_flags = av_get_cpu_flags();
-
-    if (!high_bit_depth) {
-        if (INLINE_MMX(cpu_flags)) {
-            c->clear_block  = clear_block_mmx;
-            c->clear_blocks = clear_blocks_mmx;
-        }
-
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
-    /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
-    if (CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)
-        return;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
-
-        if (INLINE_SSE(cpu_flags)) {
-            c->clear_block  = clear_block_sse;
-            c->clear_blocks = clear_blocks_sse;
-        }
-    }
-#endif /* HAVE_INLINE_ASM */
-}
diff --git a/libavcodec/x86/blockdsp_init.c b/libavcodec/x86/blockdsp_init.c
new file mode 100644
index 0000000..2159993
--- /dev/null
+++ b/libavcodec/x86/blockdsp_init.c
@@ -0,0 +1,54 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/internal.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/blockdsp.h"
+#include "libavcodec/version.h"
+
+void ff_clear_block_mmx(int16_t *block);   /* prototypes for routines not defined in this C file */
+void ff_clear_block_sse(int16_t *block);
+void ff_clear_blocks_mmx(int16_t *blocks);
+void ff_clear_blocks_sse(int16_t *blocks);
+
+av_cold void ff_blockdsp_init_x86(BlockDSPContext *c, /* installs x86 clear_block/clear_blocks routines into c by runtime CPU flags */
+                                  AVCodecContext *avctx)
+{
+#if HAVE_YASM
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMX(cpu_flags)) { /* set first so these remain in place if the early return below is taken */
+        c->clear_block  = ff_clear_block_mmx;
+        c->clear_blocks = ff_clear_blocks_mmx;
+    }
+
+    /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
+    if (CONFIG_XVMC && avctx->hwaccel && avctx->hwaccel->decode_mb)
+        return; /* skip the SSE overrides below; whatever was installed above stays */
+
+    if (EXTERNAL_SSE(cpu_flags)) { /* overrides the MMX pointers when SSE is available */
+        c->clear_block  = ff_clear_block_sse;
+        c->clear_blocks = ff_clear_blocks_sse;
+    }
+#endif /* HAVE_YASM */
+} /* without HAVE_YASM the body is empty and c is left untouched */
diff --git a/libavcodec/x86/bswapdsp.asm b/libavcodec/x86/bswapdsp.asm
index 4810867..56d8083 100644
--- a/libavcodec/x86/bswapdsp.asm
+++ b/libavcodec/x86/bswapdsp.asm
@@ -1,21 +1,23 @@
 ;******************************************************************************
 ;* optimized bswap buffer functions
 ;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -24,6 +26,8 @@
 SECTION_RODATA
 pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
 
+cextern pb_80
+
 SECTION .text
 
 ; %1 = aligned/unaligned
@@ -84,11 +88,14 @@ SECTION .text
 %macro BSWAP32_BUF 0
 %if cpuflag(ssse3)
 cglobal bswap32_buf, 3,4,3
+    mov      r3, r1
     mova     m2, [pb_bswap32]
 %else
 cglobal bswap32_buf, 3,4,5
+    mov      r3, r1
 %endif
-    test     r1, 15
+    or       r3, r0
+    test     r3, 15
     jz       .start_align
     BSWAP_LOOPS  u
     jmp      .left
diff --git a/libavcodec/x86/bswapdsp_init.c b/libavcodec/x86/bswapdsp_init.c
index ba40f2d..c042e56 100644
--- a/libavcodec/x86/bswapdsp_init.c
+++ b/libavcodec/x86/bswapdsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 40c2994..4795f5b 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,8 +27,28 @@
 #include "libavutil/x86/asm.h"
 #include "config.h"
 
+#if   (defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\
+   || (                  !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1)\
+   || (defined(__INTEL_COMPILER) && defined(_MSC_VER))
+#       define BROKEN_COMPILER 1
+#else
+#       define BROKEN_COMPILER 0
+#endif
+
 #if HAVE_INLINE_ASM
 
+#ifndef UNCHECKED_BITSTREAM_READER
+#define UNCHECKED_BITSTREAM_READER !CONFIG_SAFE_BITSTREAM_READER
+#endif
+
+#if UNCHECKED_BITSTREAM_READER
+#define END_CHECK(end) ""
+#else
+#define END_CHECK(end) \
+        "cmp    "end"       , %%"REG_c"                                 \n\t"\
+        "jge    1f                                                      \n\t"
+#endif
+
 #ifdef BROKEN_RELOCATIONS
 #define TABLES_ARG , "r"(tables)
 
@@ -73,8 +93,7 @@
         "test   "lowword"   , "lowword"                                 \n\t"\
         "jnz    2f                                                      \n\t"\
         "mov    "byte"      , %%"REG_c"                                 \n\t"\
-        "cmp    "end"       , %%"REG_c"                                 \n\t"\
-        "jge    1f                                                      \n\t"\
+        END_CHECK(end)\
         "add"OPSIZE" $2     , "byte"                                    \n\t"\
         "1:                                                             \n\t"\
         "movzwl (%%"REG_c") , "tmp"                                     \n\t"\
@@ -92,7 +111,8 @@
         "2:                                                             \n\t"
 
 #else /* BROKEN_RELOCATIONS */
-#define TABLES_ARG
+#define TABLES_ARG NAMED_CONSTRAINTS_ARRAY_ADD(ff_h264_cabac_tables)
+#define RIP_ARG
 
 #if HAVE_FAST_CMOV
 #define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
@@ -134,8 +154,7 @@
         "test   "lowword"   , "lowword"                                 \n\t"\
         " jnz   2f                                                      \n\t"\
         "mov    "byte"      , %%"REG_c"                                 \n\t"\
-        "cmp    "end"       , %%"REG_c"                                 \n\t"\
-        "jge    1f                                                      \n\t"\
+        END_CHECK(end)\
         "add"OPSIZE" $2     , "byte"                                    \n\t"\
         "1:                                                             \n\t"\
         "movzwl (%%"REG_c")     , "tmp"                                 \n\t"\
@@ -154,8 +173,7 @@
 
 #endif /* BROKEN_RELOCATIONS */
 
-
-#if HAVE_7REGS
+#if HAVE_7REGS && !BROKEN_COMPILER
 #define get_cabac_inline get_cabac_inline_x86
 static av_always_inline int get_cabac_inline_x86(CABACContext *c,
                                                  uint8_t *const state)
@@ -167,6 +185,7 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c,
     __asm__ volatile(
         "lea    "MANGLE(ff_h264_cabac_tables)", %0      \n\t"
         : "=&r"(tables)
+        : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
     );
 #endif
 
@@ -178,17 +197,19 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c,
                              AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
                              AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
                              "%8")
-        : "=&r"(bit), "+&r"(c->low), "+&r"(c->range), "=&q"(tmp)
+        : "=&r"(bit), "=&r"(c->low), "=&r"(c->range), "=&q"(tmp)
         : "r"(state), "r"(c),
           "i"(offsetof(CABACContext, bytestream)),
           "i"(offsetof(CABACContext, bytestream_end))
           TABLES_ARG
+          ,"1"(c->low), "2"(c->range)
         : "%"REG_c, "memory"
     );
     return bit & 1;
 }
-#endif /* HAVE_7REGS */
+#endif /* HAVE_7REGS && !BROKEN_COMPILER */
 
+#if !BROKEN_COMPILER
 #define get_cabac_bypass_sign get_cabac_bypass_sign_x86
 static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
 {
@@ -199,7 +220,7 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
         "shl             $17, %k1       \n\t"
         "add           %%eax, %%eax     \n\t"
         "sub             %k1, %%eax     \n\t"
-        "cltd                           \n\t"
+        "cdq                            \n\t"
         "and           %%edx, %k1       \n\t"
         "add             %k1, %%eax     \n\t"
         "xor           %%edx, %%ecx     \n\t"
@@ -211,10 +232,16 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
         "movzwl         (%1), %%edx     \n\t"
         "bswap         %%edx            \n\t"
         "shrl            $15, %%edx     \n\t"
+#if UNCHECKED_BITSTREAM_READER
+        "add              $2, %1        \n\t"
+        "addl          %%edx, %%eax     \n\t"
+        "mov              %1, %c4(%2)   \n\t"
+#else
         "addl          %%edx, %%eax     \n\t"
         "cmp         %c5(%2), %1        \n\t"
         "jge              1f            \n\t"
         "add"OPSIZE"      $2, %c4(%2)   \n\t"
+#endif
         "1:                             \n\t"
         "movl          %%eax, %c3(%2)   \n\t"
 
@@ -240,7 +267,7 @@ static av_always_inline int get_cabac_bypass_x86(CABACContext *c)
         "shl             $17, %k1       \n\t"
         "add           %%eax, %%eax     \n\t"
         "sub             %k1, %%eax     \n\t"
-        "cltd                           \n\t"
+        "cdq                            \n\t"
         "and           %%edx, %k1       \n\t"
         "add             %k1, %%eax     \n\t"
         "inc           %%edx            \n\t"
@@ -268,6 +295,7 @@ static av_always_inline int get_cabac_bypass_x86(CABACContext *c)
     );
     return res;
 }
+#endif /* !BROKEN_COMPILER */
 
 #endif /* HAVE_INLINE_ASM */
 #endif /* AVCODEC_X86_CABAC_H */
diff --git a/libavcodec/x86/cavsdsp.c b/libavcodec/x86/cavsdsp.c
index 39eec4b..4b20e65 100644
--- a/libavcodec/x86/cavsdsp.c
+++ b/libavcodec/x86/cavsdsp.c
@@ -5,20 +5,20 @@
  * MMX-optimized DSP functions, based on H.264 optimizations by
  * Michael Niedermayer and Loren Merritt
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -139,7 +139,7 @@ static inline void cavs_idct8_1d(int16_t *block, uint64_t bias)
 static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
 {
     int i;
-    DECLARE_ALIGNED(8, int16_t, b2)[64];
+    LOCAL_ALIGNED(16, int16_t, b2, [64]);
 
     for(i=0; i<2; i++){
         cavs_idct8_1d(block + 4 * i, ff_pw_4.a);
@@ -196,7 +196,7 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
         );
     }
 
-    ff_add_pixels_clamped_mmx(b2, dst, stride);
+    ff_add_pixels_clamped(b2, dst, stride);
 }
 
 #endif /* HAVE_MMX_INLINE */
@@ -210,10 +210,10 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
  ****************************************************************************/
 
 /* vertical filter [-1 -2 96 42 -7  0]  */
-#define QPEL_CAVSV1(A,B,C,D,E,F,OP,MUL2) \
+#define QPEL_CAVSV1(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
         "movd (%0), "#F"            \n\t"\
         "movq "#C", %%mm6           \n\t"\
-        "pmullw %5, %%mm6           \n\t"\
+        "pmullw "MANGLE(MUL1)", %%mm6\n\t"\
         "movq "#D", %%mm7           \n\t"\
         "pmullw "MANGLE(MUL2)", %%mm7\n\t"\
         "psllw $3, "#E"             \n\t"\
@@ -228,35 +228,35 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
         "psubw "#B", %%mm6          \n\t"\
         "psraw $1, "#B"             \n\t"\
         "psubw "#A", %%mm6          \n\t"\
-        "paddw %4, %%mm6            \n\t"\
+        "paddw "MANGLE(ADD)", %%mm6 \n\t"\
         "psraw $7, %%mm6            \n\t"\
         "packuswb %%mm6, %%mm6      \n\t"\
         OP(%%mm6, (%1), A, d)            \
         "add %3, %1                 \n\t"
 
 /* vertical filter [ 0 -1  5  5 -1  0]  */
-#define QPEL_CAVSV2(A,B,C,D,E,F,OP,MUL2) \
+#define QPEL_CAVSV2(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
         "movd (%0), "#F"            \n\t"\
         "movq "#C", %%mm6           \n\t"\
         "paddw "#D", %%mm6          \n\t"\
-        "pmullw %5, %%mm6           \n\t"\
+        "pmullw "MANGLE(MUL1)", %%mm6\n\t"\
         "add %2, %0                 \n\t"\
         "punpcklbw %%mm7, "#F"      \n\t"\
         "psubw "#B", %%mm6          \n\t"\
         "psubw "#E", %%mm6          \n\t"\
-        "paddw %4, %%mm6            \n\t"\
+        "paddw "MANGLE(ADD)", %%mm6 \n\t"\
         "psraw $3, %%mm6            \n\t"\
         "packuswb %%mm6, %%mm6      \n\t"\
         OP(%%mm6, (%1), A, d)            \
         "add %3, %1                 \n\t"
 
 /* vertical filter [ 0 -7 42 96 -2 -1]  */
-#define QPEL_CAVSV3(A,B,C,D,E,F,OP,MUL2) \
+#define QPEL_CAVSV3(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
         "movd (%0), "#F"            \n\t"\
         "movq "#C", %%mm6           \n\t"\
         "pmullw "MANGLE(MUL2)", %%mm6\n\t"\
         "movq "#D", %%mm7           \n\t"\
-        "pmullw %5, %%mm7           \n\t"\
+        "pmullw "MANGLE(MUL1)", %%mm7\n\t"\
         "psllw $3, "#B"             \n\t"\
         "psubw "#B", %%mm6          \n\t"\
         "psraw $3, "#B"             \n\t"\
@@ -269,7 +269,7 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
         "psubw "#E", %%mm6          \n\t"\
         "psraw $1, "#E"             \n\t"\
         "psubw "#F", %%mm6          \n\t"\
-        "paddw %4, %%mm6            \n\t"\
+        "paddw "MANGLE(ADD)", %%mm6 \n\t"\
         "psraw $7, %%mm6            \n\t"\
         "packuswb %%mm6, %%mm6      \n\t"\
         OP(%%mm6, (%1), A, d)            \
@@ -298,32 +298,34 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
         "punpcklbw %%mm7, %%mm2     \n\t"\
         "punpcklbw %%mm7, %%mm3     \n\t"\
         "punpcklbw %%mm7, %%mm4     \n\t"\
-        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
-        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
-        VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
-        VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
-        VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
-        VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
-        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
-        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
+        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
         \
         : "+a"(src), "+c"(dst)\
-        : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\
+        : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\
+          NAMED_CONSTRAINTS_ADD(ADD,MUL1,MUL2)\
         : "memory"\
      );\
      if(h==16){\
         __asm__ volatile(\
-            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
-            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
-            VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
-            VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
-            VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
-            VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
-            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
-            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
+            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
             \
            : "+a"(src), "+c"(dst)\
-           : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD),  "m"(MUL1)\
+           : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\
+             NAMED_CONSTRAINTS_ADD(ADD,MUL1,MUL2)\
            : "memory"\
         );\
      }\
@@ -336,7 +338,7 @@ static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, int
     int h=8;\
     __asm__ volatile(\
         "pxor %%mm7, %%mm7          \n\t"\
-        "movq %5, %%mm6             \n\t"\
+        "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
         "1:                         \n\t"\
         "movq    (%0), %%mm0        \n\t"\
         "movq   1(%0), %%mm2        \n\t"\
@@ -362,7 +364,7 @@ static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, int
         "paddw %%mm3, %%mm5         \n\t"\
         "psubw %%mm2, %%mm0         \n\t"\
         "psubw %%mm5, %%mm1         \n\t"\
-        "movq %6, %%mm5             \n\t"\
+        "movq "MANGLE(ff_pw_4)", %%mm5\n\t"\
         "paddw %%mm5, %%mm0         \n\t"\
         "paddw %%mm5, %%mm1         \n\t"\
         "psraw $3, %%mm0            \n\t"\
@@ -374,7 +376,8 @@ static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, int
         "decl %2                    \n\t"\
         " jnz 1b                    \n\t"\
         : "+a"(src), "+c"(dst), "+m"(h)\
-        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\
+        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
+          NAMED_CONSTRAINTS_ADD(ff_pw_4,ff_pw_5)\
         : "memory"\
     );\
 }\
@@ -384,7 +387,7 @@ static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, const uint8
 }\
 \
 static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h){\
-  QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5)         \
+  QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_42)        \
 }\
 \
 static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h){\
@@ -457,7 +460,7 @@ static void OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uin
 
 #endif /* (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) */
 
-#if HAVE_MMX_INLINE
+#if HAVE_MMX_EXTERNAL
 static void put_cavs_qpel8_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                     ptrdiff_t stride)
 {
@@ -470,6 +473,12 @@ static void avg_cavs_qpel8_mc00_mmx(uint8_t *dst, const uint8_t *src,
     ff_avg_pixels8_mmx(dst, src, stride, 8);
 }
 
+static void avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride)
+{
+    ff_avg_pixels8_mmxext(dst, src, stride, 8);
+}
+
 static void put_cavs_qpel16_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t stride)
 {
@@ -482,18 +491,40 @@ static void avg_cavs_qpel16_mc00_mmx(uint8_t *dst, const uint8_t *src,
     ff_avg_pixels16_mmx(dst, src, stride, 16);
 }
 
+static void avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, const uint8_t *src,
+                                        ptrdiff_t stride)
+{
+    ff_avg_pixels16_mmxext(dst, src, stride, 16);
+}
+
+static void put_cavs_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src,
+                                      ptrdiff_t stride)
+{
+    ff_put_pixels16_sse2(dst, src, stride, 16);
+}
+
+static void avg_cavs_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src,
+                                      ptrdiff_t stride)
+{
+    ff_avg_pixels16_sse2(dst, src, stride, 16);
+}
+#endif
+
 static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c,
                                      AVCodecContext *avctx)
 {
+#if HAVE_MMX_EXTERNAL
     c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_mmx;
     c->put_cavs_qpel_pixels_tab[1][0] = put_cavs_qpel8_mc00_mmx;
     c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmx;
     c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmx;
+#endif
 
+#if HAVE_MMX_INLINE
     c->cavs_idct8_add = cavs_idct8_add_mmx;
     c->idct_perm      = FF_IDCT_PERM_TRANSPOSE;
-}
 #endif /* HAVE_MMX_INLINE */
+}
 
 #define DSPFUNC(PFX, IDX, NUM, EXT)                                                       \
     c->PFX ## _cavs_qpel_pixels_tab[IDX][ 2] = PFX ## _cavs_qpel ## NUM ## _mc20_ ## EXT; \
@@ -509,15 +540,6 @@ CAVS_MC(put_,  8, mmxext)
 CAVS_MC(put_, 16, mmxext)
 CAVS_MC(avg_,  8, mmxext)
 CAVS_MC(avg_, 16, mmxext)
-
-static av_cold void cavsdsp_init_mmxext(CAVSDSPContext *c,
-                                        AVCodecContext *avctx)
-{
-    DSPFUNC(put, 0, 16, mmxext);
-    DSPFUNC(put, 1,  8, mmxext);
-    DSPFUNC(avg, 0, 16, mmxext);
-    DSPFUNC(avg, 1,  8, mmxext);
-}
 #endif /* HAVE_MMXEXT_INLINE */
 
 #if HAVE_AMD3DNOW_INLINE
@@ -541,18 +563,31 @@ static av_cold void cavsdsp_init_3dnow(CAVSDSPContext *c,
 
 av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c, AVCodecContext *avctx)
 {
-#if HAVE_MMX_INLINE
-    int cpu_flags = av_get_cpu_flags();
+    av_unused int cpu_flags = av_get_cpu_flags();
 
-    if (INLINE_MMX(cpu_flags))
-        cavsdsp_init_mmx(c, avctx);
-#endif /* HAVE_MMX_INLINE */
+    cavsdsp_init_mmx(c, avctx);
 #if HAVE_AMD3DNOW_INLINE
     if (INLINE_AMD3DNOW(cpu_flags))
         cavsdsp_init_3dnow(c, avctx);
 #endif /* HAVE_AMD3DNOW_INLINE */
 #if HAVE_MMXEXT_INLINE
-    if (INLINE_MMXEXT(cpu_flags))
-        cavsdsp_init_mmxext(c, avctx);
-#endif /* HAVE_MMXEXT_INLINE */
+    if (INLINE_MMXEXT(cpu_flags)) {
+        DSPFUNC(put, 0, 16, mmxext);
+        DSPFUNC(put, 1,  8, mmxext);
+        DSPFUNC(avg, 0, 16, mmxext);
+        DSPFUNC(avg, 1,  8, mmxext);
+    }
+#endif
+#if HAVE_MMX_EXTERNAL
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmxext;
+        c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmxext;
+    }
+#endif
+#if HAVE_SSE2_EXTERNAL
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_sse2;
+        c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_sse2;
+    }
+#endif
 }
diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c
index 5b8d1b2..11002ee 100644
--- a/libavcodec/x86/constants.c
+++ b/libavcodec/x86/constants.c
@@ -1,20 +1,20 @@
 /*
- * MMX/SSE constants used across x86 dsp optimizations.
+ * MMX/SSE/AVX constants used across x86 dsp optimizations.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -22,12 +22,13 @@
 #include "libavutil/x86/asm.h" // for xmm_reg
 #include "constants.h"
 
-DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
-
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL };
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL,
+                                                    0x0001000100010001ULL, 0x0001000100010001ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL,
+                                                    0x0002000200020002ULL, 0x0002000200020002ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4)    = { 0x0004000400040004ULL, 0x0004000400040004ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_4)    = { 0x0004000400040004ULL, 0x0004000400040004ULL,
+                                                    0x0004000400040004ULL, 0x0004000400040004ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5)    = { 0x0005000500050005ULL, 0x0005000500050005ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8)    = { 0x0008000800080008ULL, 0x0008000800080008ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_9)    = { 0x0009000900090009ULL, 0x0009000900090009ULL };
@@ -35,19 +36,58 @@ DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15)   =   0x000F000F000F000FULL;
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16)   = { 0x0010001000100010ULL, 0x0010001000100010ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17)   = { 0x0011001100110011ULL, 0x0011001100110011ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_18)   = { 0x0012001200120012ULL, 0x0012001200120012ULL };
-DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20)   =   0x0014001400140014ULL;
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_20)   = { 0x0014001400140014ULL, 0x0014001400140014ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32)   = { 0x0020002000200020ULL, 0x0020002000200020ULL };
 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42)   =   0x002A002A002A002AULL;
 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53)   =   0x0035003500350035ULL;
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64)   = { 0x0040004000400040ULL, 0x0040004000400040ULL };
 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96)   =   0x0060006000600060ULL;
 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128)  =   0x0080008000800080ULL;
-DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255)  =   0x00ff00ff00ff00ffULL;
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_255)  = { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
+                                                    0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_256)  = { 0x0100010001000100ULL, 0x0100010001000100ULL,
+                                                    0x0100010001000100ULL, 0x0100010001000100ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL,
+                                                    0x0200020002000200ULL, 0x0200020002000200ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_1023) = { 0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL,
+                                                    0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL};
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_1024) = { 0x0400040004000400ULL, 0x0400040004000400ULL,
+                                                    0x0400040004000400ULL, 0x0400040004000400ULL};
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_2048) = { 0x0800080008000800ULL, 0x0800080008000800ULL,
+                                                    0x0800080008000800ULL, 0x0800080008000800ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_4095) = { 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL,
+                                                    0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_4096) = { 0x1000100010001000ULL, 0x1000100010001000ULL,
+                                                    0x1000100010001000ULL, 0x1000100010001000ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_8192) = { 0x2000200020002000ULL, 0x2000200020002000ULL,
+                                                    0x2000200020002000ULL, 0x2000200020002000ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_m1)   = { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL,
+                                                    0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL };
 
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL };
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL };
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL,
+                                                    0x0000000000000000ULL, 0x0000000000000000ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL,
+                                                    0x0101010101010101ULL, 0x0101010101010101ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_2)    = { 0x0202020202020202ULL, 0x0202020202020202ULL,
+                                                    0x0202020202020202ULL, 0x0202020202020202ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL,
+                                                    0x0303030303030303ULL, 0x0303030303030303ULL };
+DECLARE_ALIGNED(32, const xmm_reg,  ff_pb_15)   = { 0x0F0F0F0F0F0F0F0FULL, 0x0F0F0F0F0F0F0F0FULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80)   = { 0x8080808080808080ULL, 0x8080808080808080ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL,
+                                                    0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC)   =   0xFCFCFCFCFCFCFCFCULL;
+
+DECLARE_ALIGNED(16, const xmm_reg,  ff_ps_neg)  = { 0x8000000080000000ULL, 0x8000000080000000ULL };
+
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_1)    = { 0x0000000100000001ULL, 0x0000000100000001ULL,
+                                                    0x0000000100000001ULL, 0x0000000100000001ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_16)   = { 0x0000001000000010ULL, 0x0000001000000010ULL,
+                                                    0x0000001000000010ULL, 0x0000001000000010ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_32)   = { 0x0000002000000020ULL, 0x0000002000000020ULL,
+                                                    0x0000002000000020ULL, 0x0000002000000020ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_8192) = { 0x0000200000002000ULL, 0x0000200000002000ULL,
+                                                    0x0000200000002000ULL, 0x0000200000002000ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_65535)= { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
+                                                    0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL };
diff --git a/libavcodec/x86/constants.h b/libavcodec/x86/constants.h
index f38fbe3..b82aef9 100644
--- a/libavcodec/x86/constants.h
+++ b/libavcodec/x86/constants.h
@@ -1,20 +1,20 @@
 /*
  * MMX/SSE constants used across x86 dsp optimizations.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,27 +25,47 @@
 
 #include "libavutil/x86/asm.h"
 
-extern const uint64_t ff_wtwo;
-
+extern const ymm_reg  ff_pw_1;
+extern const ymm_reg  ff_pw_2;
 extern const xmm_reg  ff_pw_3;
-extern const xmm_reg  ff_pw_4;
+extern const ymm_reg  ff_pw_4;
 extern const xmm_reg  ff_pw_5;
 extern const xmm_reg  ff_pw_8;
+extern const xmm_reg  ff_pw_9;
 extern const uint64_t ff_pw_15;
 extern const xmm_reg  ff_pw_16;
 extern const xmm_reg  ff_pw_18;
-extern const uint64_t ff_pw_20;
+extern const xmm_reg  ff_pw_20;
 extern const xmm_reg  ff_pw_32;
 extern const uint64_t ff_pw_42;
 extern const uint64_t ff_pw_53;
 extern const xmm_reg  ff_pw_64;
 extern const uint64_t ff_pw_96;
 extern const uint64_t ff_pw_128;
-extern const uint64_t ff_pw_255;
+extern const ymm_reg  ff_pw_255;
+extern const ymm_reg  ff_pw_512;
+extern const ymm_reg  ff_pw_1023;
+extern const ymm_reg  ff_pw_1024;
+extern const ymm_reg  ff_pw_2048;
+extern const ymm_reg  ff_pw_4095;
+extern const ymm_reg  ff_pw_4096;
+extern const ymm_reg  ff_pw_8192;
+extern const ymm_reg  ff_pw_m1;
 
-extern const xmm_reg  ff_pb_1;
-extern const xmm_reg  ff_pb_3;
-extern const xmm_reg  ff_pb_F8;
+extern const ymm_reg  ff_pb_0;
+extern const ymm_reg  ff_pb_1;
+extern const ymm_reg  ff_pb_2;
+extern const ymm_reg  ff_pb_3;
+extern const xmm_reg  ff_pb_80;
+extern const ymm_reg  ff_pb_FE;
 extern const uint64_t ff_pb_FC;
 
+extern const xmm_reg  ff_ps_neg;
+
+extern const ymm_reg  ff_pd_1;
+extern const ymm_reg  ff_pd_16;
+extern const ymm_reg  ff_pd_32;
+extern const ymm_reg  ff_pd_8192;
+extern const ymm_reg  ff_pd_65535;
+
 #endif /* AVCODEC_X86_CONSTANTS_H */
diff --git a/libavcodec/x86/dca.h b/libavcodec/x86/dca.h
deleted file mode 100644
index 11d45ae..0000000
--- a/libavcodec/x86/dca.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_X86_DCA_H
-#define AVCODEC_X86_DCA_H
-
-#include "config.h"
-
-#if ARCH_X86_64 && HAVE_SSE2_INLINE
-# include "libavutil/x86/asm.h"
-# include "libavutil/mem.h"
-#include "libavcodec/dcadsp.h"
-
-# define int8x8_fmul_int32 int8x8_fmul_int32
-static inline void int8x8_fmul_int32(av_unused DCADSPContext *dsp,
-                                     float *dst, const int8_t *src, int scale)
-{
-    DECLARE_ALIGNED(16, static const uint32_t, inverse16) = 0x3D800000;
-    __asm__ volatile (
-        "cvtsi2ss        %2, %%xmm0 \n\t"
-        "mulss           %3, %%xmm0 \n\t"
-        "movq          (%1), %%xmm1 \n\t"
-        "punpcklbw   %%xmm1, %%xmm1 \n\t"
-        "movaps      %%xmm1, %%xmm2 \n\t"
-        "punpcklwd   %%xmm1, %%xmm1 \n\t"
-        "punpckhwd   %%xmm2, %%xmm2 \n\t"
-        "psrad          $24, %%xmm1 \n\t"
-        "psrad          $24, %%xmm2 \n\t"
-        "shufps  $0, %%xmm0, %%xmm0 \n\t"
-        "cvtdq2ps    %%xmm1, %%xmm1 \n\t"
-        "cvtdq2ps    %%xmm2, %%xmm2 \n\t"
-        "mulps       %%xmm0, %%xmm1 \n\t"
-        "mulps       %%xmm0, %%xmm2 \n\t"
-        "movaps      %%xmm1,  0(%0) \n\t"
-        "movaps      %%xmm2, 16(%0) \n\t"
-        :: "r"(dst), "r"(src), "m"(scale), "m"(inverse16)
-        XMM_CLOBBERS_ONLY("xmm0", "xmm1", "xmm2")
-    );
-}
-
-#endif /* ARCH_X86_64 && HAVE_SSE2_INLINE */
-
-#endif /* AVCODEC_X86_DCA_H */
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index 89d4ac4..c5bf21a 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -1,336 +1,282 @@
 ;******************************************************************************
-;* SSE-optimized functions for the DCA decoder
-;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
+;* SIMD-optimized functions for the DCA decoder
+;* Copyright (C) 2016 James Almer
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
-pf_inv16:  times 4 dd 0x3D800000 ; 1/16
-
 SECTION .text
 
-; %1=v0/v1  %2=in1  %3=in2
-%macro FIR_LOOP 2-3
-.loop%1:
-%define va          m1
-%define vb          m2
-%if %1
-%define OFFSET      0
-%else
-%define OFFSET      NUM_COEF*count
-%endif
-; for v0, incrementing and for v1, decrementing
-    mova        va, [cf0q + OFFSET]
-    mova        vb, [cf0q + OFFSET + 4*NUM_COEF]
-%if %0 == 3
-    mova        m4, [cf0q + OFFSET + mmsize]
-    mova        m0, [cf0q + OFFSET + 4*NUM_COEF + mmsize]
-%endif
-    mulps       va, %2
-    mulps       vb, %2
-%if %0 == 3
-    mulps       m4, %3
-    mulps       m0, %3
-    addps       va, m4
-    addps       vb, m0
-%endif
-    ; va = va1 va2 va3 va4
-    ; vb = vb1 vb2 vb3 vb4
-%if %1
-    SWAP        va, vb
-%endif
-    mova        m4, va
-    unpcklps    va, vb ; va3 vb3 va4 vb4
-    unpckhps    m4, vb ; va1 vb1 va2 vb2
-    addps       m4, va ; va1+3 vb1+3 va2+4 vb2+4
-    movhlps     vb, m4 ; va1+3  vb1+3
-    addps       vb, m4 ; va0..4 vb0..4
-    movlps  [outq + count], vb
-%if %1
-    sub       cf0q, 8*NUM_COEF
-%endif
-    add      count, 8
-    jl   .loop%1
-%endmacro
-
-; void dca_lfe_fir(float *out, float *in, float *coefs)
-%macro DCA_LFE_FIR 1
-cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0
-%define IN1       m3
-%define IN2       m5
-%define count     inq
-%define NUM_COEF  4*(2-%1)
-%define NUM_OUT   32*(%1+1)
+%define sizeof_float 4
+%define FMA3_OFFSET (8 * cpuflag(fma3) * ARCH_X86_64)
 
-    movu     IN1, [inq + 4 - 1*mmsize]
-    shufps   IN1, IN1, q0123
-%if %1 == 0
-    movu     IN2, [inq + 4 - 2*mmsize]
-    shufps   IN2, IN2, q0123
-%endif
+%macro LFE_FIR0_FLOAT 0
+cglobal lfe_fir0_float, 4, 6, 12 + cpuflag(fma3)*4, samples, lfe, coeff, nblocks, cnt1, cnt2
+    shr nblocksd, 1
+    sub     lfeq, 7*sizeof_float
+    mov    cnt1d, 32*sizeof_float
+    mov    cnt2d, 32*sizeof_float-8-FMA3_OFFSET
+    lea   coeffq, [coeffq+cnt1q*8]
+    add samplesq, cnt1q
+    neg    cnt1q
 
-    mov    count, -4*NUM_OUT
-    add     cf0q, 4*NUM_COEF*NUM_OUT
-    add     outq, 4*NUM_OUT
-    ; compute v0 first
-%if %1 == 0
-    FIR_LOOP   0, IN1, IN2
-%else
-    FIR_LOOP   0, IN1
-%endif
-    shufps   IN1, IN1, q0123
-    mov    count, -4*NUM_OUT
-    ; cf1 already correctly positioned
-    add     outq, 4*NUM_OUT          ; outq now at out2
-    sub     cf0q, 8*NUM_COEF
-%if %1 == 0
-    shufps   IN2, IN2, q0123
-    FIR_LOOP   1, IN2, IN1
+.loop:
+%if cpuflag(avx)
+    cvtdq2ps  m4, [lfeq+16]
+    cvtdq2ps  m5, [lfeq   ]
+    shufps    m7, m4, m4, q0123
+    shufps    m6, m5, m5, q0123
+%elif cpuflag(sse2)
+    movu      m4, [lfeq+16]
+    movu      m5, [lfeq   ]
+    cvtdq2ps  m4, m4
+    cvtdq2ps  m5, m5
+    pshufd    m7, m4, q0123
+    pshufd    m6, m5, q0123
 %else
-    FIR_LOOP   1, IN1
+    cvtpi2ps  m4, [lfeq+16]
+    cvtpi2ps  m0, [lfeq+24]
+    cvtpi2ps  m5, [lfeq   ]
+    cvtpi2ps  m1, [lfeq+8 ]
+    shufps    m4, m0, q1010
+    shufps    m5, m1, q1010
+    shufps    m7, m4, m4, q0123
+    shufps    m6, m5, m5, q0123
 %endif
-    RET
-%endmacro
 
-INIT_XMM sse
-DCA_LFE_FIR 0
-DCA_LFE_FIR 1
+.inner_loop:
+%if ARCH_X86_64
+    movaps    m8, [coeffq+cnt1q*8   ]
+    movaps    m9, [coeffq+cnt1q*8+16]
+    movaps   m10, [coeffq+cnt1q*8+32]
+    movaps   m11, [coeffq+cnt1q*8+48]
+%if cpuflag(fma3)
+    movaps   m12, [coeffq+cnt1q*8+64]
+    movaps   m13, [coeffq+cnt1q*8+80]
+    movaps   m14, [coeffq+cnt1q*8+96]
+    movaps   m15, [coeffq+cnt1q*8+112]
+    mulps     m0, m7, m8
+    mulps     m1, m7, m10
+    mulps     m2, m7, m12
+    mulps     m3, m7, m14
+    fmaddps   m0, m6, m9, m0
+    fmaddps   m1, m6, m11, m1
+    fmaddps   m2, m6, m13, m2
+    fmaddps   m3, m6, m15, m3
 
-%macro SETZERO 1
-%if cpuflag(sse2) && notcpuflag(avx)
-    pxor          %1, %1
+    haddps    m0, m1
+    haddps    m2, m3
+    haddps    m0, m2
+    movaps [samplesq+cnt1q], m0
 %else
-    xorps         %1, %1, %1
-%endif
-%endmacro
+    mulps     m0, m7, m8
+    mulps     m1, m6, m9
+    mulps     m2, m7, m10
+    mulps     m3, m6, m11
+    addps     m0, m1
+    addps     m2, m3
 
-%macro SHUF 3
-%if cpuflag(avx)
-    mova          %3, [%2 - 16]
-    vperm2f128    %1, %3, %3, 1
-    vshufps       %1, %1, %1, q0123
-%elif cpuflag(sse2)
-    pshufd        %1, [%2], q0123
+    unpckhps  m3, m0, m2
+    unpcklps  m0, m2
+    addps     m3, m0
+    movhlps   m2, m3
+    addps     m2, m3
+    movlps [samplesq+cnt1q], m2
+%endif
+%else ; ARCH_X86_32
+%if cpuflag(fma3)
+    mulps     m0, m7, [coeffq+cnt1q*8   ]
+    movaps        m1, [coeffq+cnt1q*8+16]
+    mulps     m2, m7, [coeffq+cnt1q*8+32]
+    fmaddps   m0, m6, m1, m0
+    fmaddps   m2, m6, [coeffq+cnt1q*8+48], m2
 %else
-    mova          %1, [%2]
-    shufps        %1, %1, q0123
-%endif
-%endmacro
+    mulps     m0, m7, [coeffq+cnt1q*8   ]
+    mulps     m1, m6, [coeffq+cnt1q*8+16]
+    mulps     m2, m7, [coeffq+cnt1q*8+32]
+    mulps     m3, m6, [coeffq+cnt1q*8+48]
+    addps     m0, m1
+    addps     m2, m3
+%endif
+    unpckhps  m3, m0, m2
+    unpcklps  m0, m2
+    addps     m3, m0
+    movhlps   m2, m3
+    addps     m2, m3
+    movlps [samplesq+cnt1q], m2
+%endif; ARCH
 
-%macro INNER_LOOP   1
-    ; reading backwards:  ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
-    ;~ a += window[i + j]      * (-synth_buf[15 - i + j])
-    ;~ b += window[i + j + 16] * (synth_buf[i + j])
-    SHUF          m5,  ptr2 + j + (15 - 3) * 4, m6
-    mova          m6, [ptr1 + j]
 %if ARCH_X86_64
-    SHUF         m11,  ptr2 + j + (15 - 3) * 4 - mmsize, m12
-    mova         m12, [ptr1 + j + mmsize]
-%endif
 %if cpuflag(fma3)
-    fmaddps       m2, m6,  [win + %1 + j + 16 * 4], m2
-    fnmaddps      m1, m5,  [win + %1 + j], m1
-%if ARCH_X86_64
-    fmaddps       m8, m12, [win + %1 + j + mmsize + 16 * 4], m8
-    fnmaddps      m7, m11, [win + %1 + j + mmsize], m7
-%endif
-%else ; non-FMA
-    mulps         m6, m6,  [win + %1 + j + 16 * 4]
-    mulps         m5, m5,  [win + %1 + j]
-%if ARCH_X86_64
-    mulps        m12, m12, [win + %1 + j + mmsize + 16 * 4]
-    mulps        m11, m11, [win + %1 + j + mmsize]
-%endif
-    addps         m2, m2, m6
-    subps         m1, m1, m5
-%if ARCH_X86_64
-    addps         m8, m8, m12
-    subps         m7, m7, m11
-%endif
-%endif ; cpuflag(fma3)
-    ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
-    ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
-    SHUF          m6,  ptr2 + j + (31 - 3) * 4, m5
-    mova          m5, [ptr1 + j + 16 * 4]
-%if ARCH_X86_64
-    SHUF         m12,  ptr2 + j + (31 - 3) * 4 - mmsize, m11
-    mova         m11, [ptr1 + j + mmsize + 16 * 4]
-%endif
-%if cpuflag(fma3)
-    fmaddps       m3, m5,  [win + %1 + j + 32 * 4], m3
-    fmaddps       m4, m6,  [win + %1 + j + 48 * 4], m4
-%if ARCH_X86_64
-    fmaddps       m9, m11, [win + %1 + j + mmsize + 32 * 4], m9
-    fmaddps      m10, m12, [win + %1 + j + mmsize + 48 * 4], m10
-%endif
-%else ; non-FMA
-    mulps         m5, m5,  [win + %1 + j + 32 * 4]
-    mulps         m6, m6,  [win + %1 + j + 48 * 4]
-%if ARCH_X86_64
-    mulps        m11, m11, [win + %1 + j + mmsize + 32 * 4]
-    mulps        m12, m12, [win + %1 + j + mmsize + 48 * 4]
-%endif
-    addps         m3, m3, m5
-    addps         m4, m4, m6
-%if ARCH_X86_64
-    addps         m9, m9, m11
-    addps        m10, m10, m12
-%endif
-%endif ; cpuflag(fma3)
-    sub            j, 64 * 4
-%endmacro
+    mulps     m8, m5
+    mulps    m10, m5
+    mulps    m12, m5
+    mulps    m14, m5
+    fmaddps   m8, m4, m9, m8
+    fmaddps  m10, m4, m11, m10
+    fmaddps  m12, m4, m13, m12
+    fmaddps  m14, m4, m15, m14
 
-; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32],
-;                                  const float window[512], float out[32],
-;                                  intptr_t offset, float scale)
-%macro SYNTH_FILTER 0
-cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
-                              synth_buf, synth_buf2, window, out, off, scale
-%define scale m0
-%if ARCH_X86_32 || WIN64
-%if cpuflag(sse2) && notcpuflag(avx)
-    movd       scale, scalem
-    SPLATD        m0
-%else
-    VBROADCASTSS  m0, scalem
-%endif
-; Make sure offset is in a register and not on the stack
-%define OFFQ  r4q
+    haddps   m10, m8
+    haddps   m14, m12
+    haddps   m14, m10
+    movaps [samplesq+cnt2q], m14
 %else
-    SPLATD      xmm0
-%if cpuflag(avx)
-    vinsertf128   m0, m0, xmm0, 1
-%endif
-%define OFFQ  offq
-%endif
-    ; prepare inner counter limit 1
-    mov          r5q, 480
-    sub          r5q, offmp
-    and          r5q, -64
-    shl          r5q, 2
-%if ARCH_X86_32 || notcpuflag(avx)
-    mov         OFFQ, r5q
-%define i        r5q
-    mov            i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize  ; main loop counter
+    mulps     m8, m5
+    mulps     m9, m4
+    mulps    m10, m5
+    mulps    m11, m4
+    addps     m8, m9
+    addps    m10, m11
+
+    unpckhps m11, m10, m8
+    unpcklps m10, m8
+    addps    m11, m10
+    movhlps   m8, m11
+    addps     m8, m11
+    movlps [samplesq+cnt2q], m8
+%endif
+%else ; ARCH_X86_32
+%if cpuflag(fma3)
+    mulps     m0, m5, [coeffq+cnt1q*8   ]
+    mulps     m2, m5, [coeffq+cnt1q*8+32]
+    fmaddps   m0, m4, m1, m0
+    fmaddps   m2, m4, [coeffq+cnt1q*8+48], m2
 %else
-%define i 0
-%define OFFQ  r5q
-%endif
+    mulps     m0, m5, [coeffq+cnt1q*8   ]
+    mulps     m1, m4, [coeffq+cnt1q*8+16]
+    mulps     m2, m5, [coeffq+cnt1q*8+32]
+    mulps     m3, m4, [coeffq+cnt1q*8+48]
+    addps     m0, m1
+    addps     m2, m3
+%endif
+    unpckhps  m3, m2, m0
+    unpcklps  m2, m0
+    addps     m3, m2
+    movhlps   m0, m3
+    addps     m0, m3
+    movlps [samplesq+cnt2q], m0
+%endif; ARCH
+
+    sub    cnt2d, 8 + FMA3_OFFSET
+    add    cnt1q, 8 + FMA3_OFFSET
+    jl .inner_loop
+
+    add     lfeq, 4
+    add samplesq,  64*sizeof_float
+    mov    cnt1q, -32*sizeof_float
+    mov    cnt2d,  32*sizeof_float-8-FMA3_OFFSET
+    sub nblocksd, 1
+    jg .loop
+    RET
+%endmacro
 
-%define buf2     synth_buf2q
-%if ARCH_X86_32
-    mov         buf2, synth_buf2mp
-%endif
-.mainloop
-    ; m1 = a  m2 = b  m3 = c  m4 = d
-    SETZERO       m3
-    SETZERO       m4
-    mova          m1, [buf2 + i]
-    mova          m2, [buf2 + i + 16 * 4]
 %if ARCH_X86_32
-%define ptr1     r0q
-%define ptr2     r1q
-%define win      r2q
-%define j        r3q
-    mov          win, windowm
-    mov         ptr1, synth_bufm
-%if ARCH_X86_32 || notcpuflag(avx)
-    add          win, i
-    add         ptr1, i
+INIT_XMM sse
+LFE_FIR0_FLOAT
 %endif
-%else ; ARCH_X86_64
-%define ptr1     r6q
-%define ptr2     r7q ; must be loaded
-%define win      r8q
-%define j        r9q
-    SETZERO       m9
-    SETZERO      m10
-    mova          m7, [buf2 + i + mmsize]
-    mova          m8, [buf2 + i + mmsize + 16 * 4]
-    lea          win, [windowq + i]
-    lea         ptr1, [synth_bufq + i]
+INIT_XMM sse2
+LFE_FIR0_FLOAT
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+LFE_FIR0_FLOAT
 %endif
-    mov         ptr2, synth_bufmp
-    ; prepare the inner loop counter
-    mov            j, OFFQ
-%if ARCH_X86_32 || notcpuflag(avx)
-    sub         ptr2, i
+%if HAVE_FMA3_EXTERNAL
+INIT_XMM fma3
+LFE_FIR0_FLOAT
 %endif
-.loop1:
-    INNER_LOOP  0
-    jge       .loop1
 
-    mov            j, 448 * 4
-    sub            j, OFFQ
-    jz          .end
-    sub         ptr1, j
-    sub         ptr2, j
-    add          win, OFFQ ; now at j-64, so define OFFSET
-    sub            j, 64 * 4
-.loop2:
-    INNER_LOOP  64 * 4
-    jge       .loop2
+%macro LFE_FIR1_FLOAT 0
+cglobal lfe_fir1_float, 4, 6, 10, samples, lfe, coeff, nblocks, cnt1, cnt2
+    shr nblocksd, 2                         ; outer loop runs nblocks/4 times, each pass emitting 128 floats
+    sub     lfeq, 3*sizeof_float            ; back up 3 samples so the 4-sample load ends at the original lfe position
+    mov    cnt1d, 64*sizeof_float           ; cnt1 = byte size of 64 floats
+    mov    cnt2d, 64*sizeof_float-16        ; cnt2 = offset of the last 16-byte vector inside a 64-float span
+    lea   coeffq, [coeffq+cnt1q*4]          ; bias coeff so a negative cnt1 indexes from the table base
+    add samplesq, cnt1q                     ; bias samples likewise: samplesq now points 64 floats into the block
+    neg    cnt1q                            ; cnt1 counts upward from -64*sizeof_float towards 0
 
-.end:
-%if ARCH_X86_32
-    mov         buf2, synth_buf2m ; needed for next iteration anyway
-    mov         outq, outmp       ; j, which will be set again during it
-%endif
-    ;~ out[i]      = a * scale;
-    ;~ out[i + 16] = b * scale;
-    mulps         m1, m1, scale
-    mulps         m2, m2, scale
-%if ARCH_X86_64
-    mulps         m7, m7, scale
-    mulps         m8, m8, scale
+.loop:
+%if cpuflag(avx)
+    cvtdq2ps  m4, [lfeq]                    ; m4 = four int32 LFE samples converted to float
+    shufps    m5, m4, m4, q0123             ; m5 = m4 permuted by q0123 (presumably lane-reversed; q0123 is defined outside this chunk)
+%elif cpuflag(sse2)
+    movu      m4, [lfeq]                    ; load first, then convert in-register
+    cvtdq2ps  m4, m4
+    pshufd    m5, m4, q0123                 ; same permutation as the AVX path
 %endif
-    ;~ synth_buf2[i]      = c;
-    ;~ synth_buf2[i + 16] = d;
-    mova   [buf2 + i +  0 * 4], m3
-    mova   [buf2 + i + 16 * 4], m4
+
+.inner_loop:
+    movaps    m6, [coeffq+cnt1q*4   ]       ; coefficient vectors 0 and 1 of the current group of four
+    movaps    m7, [coeffq+cnt1q*4+16]
+    mulps     m0, m5, m6                    ; products against the permuted samples
+    mulps     m1, m5, m7
 %if ARCH_X86_64
-    mova   [buf2 + i +  0 * 4 + mmsize], m9
-    mova   [buf2 + i + 16 * 4 + mmsize], m10
+    movaps    m8, [coeffq+cnt1q*4+32]       ; x86-64: vectors 2 and 3 stay in m8/m9 for reuse below
+    movaps    m9, [coeffq+cnt1q*4+48]
+    mulps     m2, m5, m8
+    mulps     m3, m5, m9
+%else
+    mulps     m2, m5, [coeffq+cnt1q*4+32]   ; x86-32: multiply straight from memory (read again below)
+    mulps     m3, m5, [coeffq+cnt1q*4+48]
 %endif
-    ;~ out[i]      = a;
-    ;~ out[i + 16] = a;
-    mova   [outq + i +  0 * 4], m1
-    mova   [outq + i + 16 * 4], m2
+
+    haddps    m0, m1                        ; three horizontal adds collapse m0..m3 into
+    haddps    m2, m3                        ;   m0 = { sum(m0), sum(m1), sum(m2), sum(m3) }
+    haddps    m0, m2
+    movaps [samplesq+cnt1q], m0             ; four outputs, filling floats 0..63 of the block in ascending order
+
+    mulps     m6, m4                        ; same coefficients against the unpermuted samples
+    mulps     m7, m4
 %if ARCH_X86_64
-    mova   [outq + i +  0 * 4 + mmsize], m7
-    mova   [outq + i + 16 * 4 + mmsize], m8
-%endif
-%if ARCH_X86_32 || notcpuflag(avx)
-    sub            i, (ARCH_X86_64 + 1) * mmsize
-    jge    .mainloop
+    mulps     m8, m4
+    mulps     m9, m4
+
+    haddps    m6, m7                        ; same three-step reduction, result in m6
+    haddps    m8, m9
+    haddps    m6, m8
+%else
+    mulps     m2, m4, [coeffq+cnt1q*4+32]
+    mulps     m3, m4, [coeffq+cnt1q*4+48]
+
+    haddps    m6, m7                        ; same three-step reduction, result in m6
+    haddps    m2, m3
+    haddps    m6, m2
 %endif
+    movaps [samplesq+cnt2q], m6             ; four outputs, filling floats 64..127 of the block from the end backward
+
+    sub    cnt2d, 16
+    add    cnt1q, 16                        ; this add produces the flags tested by jl
+    jl .inner_loop                          ; repeat while cnt1 is still negative
+
+    add     lfeq, sizeof_float              ; advance the input by one LFE sample
+    add samplesq, 128*sizeof_float          ; advance the output by 128 floats
+    mov    cnt1q, -64*sizeof_float          ; reset both offsets for the next pass
+    mov    cnt2d,  64*sizeof_float-16
+    sub nblocksd, 1
+    jg .loop
     RET
 %endmacro
 
-%if ARCH_X86_32
-INIT_XMM sse
-SYNTH_FILTER
+INIT_XMM sse3
+LFE_FIR1_FLOAT
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+LFE_FIR1_FLOAT
 %endif
-INIT_XMM sse2
-SYNTH_FILTER
-INIT_YMM avx
-SYNTH_FILTER
-INIT_YMM fma3
-SYNTH_FILTER
diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c
index 8632c4a..fc10fb8 100644
--- a/libavcodec/x86/dcadsp_init.c
+++ b/libavcodec/x86/dcadsp_init.c
@@ -1,20 +1,18 @@
 /*
- * Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
+ * This file is part of FFmpeg.
  *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,66 +21,32 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/dcadsp.h"
 
-void ff_dca_lfe_fir0_sse(float *out, const float *in, const float *coefs);
-void ff_dca_lfe_fir1_sse(float *out, const float *in, const float *coefs);
-
-av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
-{
-    int cpu_flags = av_get_cpu_flags();
-
-    if (EXTERNAL_SSE(cpu_flags)) {
-        s->lfe_fir[0]        = ff_dca_lfe_fir0_sse;
-        s->lfe_fir[1]        = ff_dca_lfe_fir1_sse;
-    }
-}
-
+#define LFE_FIR_FLOAT_FUNC(opt)                                               \
+void ff_lfe_fir0_float_##opt(float *pcm_samples, int32_t *lfe_samples,         \
+                             const float *filter_coeff, ptrdiff_t npcmblocks); \
+void ff_lfe_fir1_float_##opt(float *pcm_samples, int32_t *lfe_samples,         \
+                             const float *filter_coeff, ptrdiff_t npcmblocks); /* prototypes only: declares the fir0/fir1 pair for one CPU-flavour suffix */
 
-#define SYNTH_FILTER_FUNC(opt)                                                 \
-void ff_synth_filter_inner_##opt(float *synth_buf_ptr, float synth_buf2[32],   \
-                                 const float window[512],                      \
-                                 float out[32], intptr_t offset, float scale); \
-static void synth_filter_##opt(FFTContext *imdct,                              \
-                               float *synth_buf_ptr, int *synth_buf_offset,    \
-                               float synth_buf2[32], const float window[512],  \
-                               float out[32], const float in[32], float scale) \
-{                                                                              \
-    float *synth_buf= synth_buf_ptr + *synth_buf_offset;                       \
-                                                                               \
-    imdct->imdct_half(imdct, synth_buf, in);                                   \
-                                                                               \
-    ff_synth_filter_inner_##opt(synth_buf, synth_buf2, window,                 \
-                                out, *synth_buf_offset, scale);                \
-                                                                               \
-    *synth_buf_offset = (*synth_buf_offset - 32) & 511;                        \
-}                                                                              \
+LFE_FIR_FLOAT_FUNC(sse)
+LFE_FIR_FLOAT_FUNC(sse2)
+LFE_FIR_FLOAT_FUNC(sse3)
+LFE_FIR_FLOAT_FUNC(avx)
+LFE_FIR_FLOAT_FUNC(fma3)
 
-#if HAVE_YASM
-#if ARCH_X86_32
-SYNTH_FILTER_FUNC(sse)
-#endif
-SYNTH_FILTER_FUNC(sse2)
-SYNTH_FILTER_FUNC(avx)
-SYNTH_FILTER_FUNC(fma3)
-#endif /* HAVE_YASM */
-
-av_cold void ff_synth_filter_init_x86(SynthFilterContext *s)
+av_cold void ff_dcadsp_init_x86(DCADSPContext *s) /* install x86 LFE FIR kernels into s->lfe_fir_float[] according to the CPU flags */
 {
-#if HAVE_YASM
     int cpu_flags = av_get_cpu_flags();
 
-#if ARCH_X86_32
-    if (EXTERNAL_SSE(cpu_flags)) {
-        s->synth_filter_float = synth_filter_sse;
-    }
-#endif
-    if (EXTERNAL_SSE2(cpu_flags)) {
-        s->synth_filter_float = synth_filter_sse2;
-    }
-    if (EXTERNAL_AVX_FAST(cpu_flags)) {
-        s->synth_filter_float = synth_filter_avx;
-    }
-    if (EXTERNAL_FMA3(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
-        s->synth_filter_float = synth_filter_fma3;
+    if (ARCH_X86_32 && EXTERNAL_SSE(cpu_flags)) /* plain-SSE fir0 is selected on 32-bit builds only */
+        s->lfe_fir_float[0] = ff_lfe_fir0_float_sse;
+    if (EXTERNAL_SSE2(cpu_flags)) /* checks run in order, so a later match overrides an earlier one */
+        s->lfe_fir_float[0] = ff_lfe_fir0_float_sse2;
+    if (EXTERNAL_SSE3(cpu_flags)) /* slot [1] gets its first candidate here; no SSE/SSE2 version is assigned */
+        s->lfe_fir_float[1] = ff_lfe_fir1_float_sse3;
+    if (EXTERNAL_AVX(cpu_flags)) {
+        s->lfe_fir_float[0] = ff_lfe_fir0_float_avx;
+        s->lfe_fir_float[1] = ff_lfe_fir1_float_avx;
     }
-#endif /* HAVE_YASM */
+    if (EXTERNAL_FMA3(cpu_flags)) /* FMA3 replaces slot [0] only; slot [1] keeps its AVX/SSE3 choice */
+        s->lfe_fir_float[0] = ff_lfe_fir0_float_fma3;
 }
diff --git a/libavcodec/x86/dct-test.c b/libavcodec/x86/dct-test.c
index 9d4aaf5..28ede16 100644
--- a/libavcodec/x86/dct-test.c
+++ b/libavcodec/x86/dct-test.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -22,6 +22,37 @@
 #include "xvididct.h"
 #include "simple_idct.h"
 
+#if (CONFIG_PRORES_DECODER || CONFIG_PRORES_LGPL_DECODER) && ARCH_X86_64 && HAVE_YASM
+void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
+                                int16_t *block, int16_t *qmat);
+
+#define PR_WRAP(INSN) /* generates an in-place, single-pointer wrapper around ff_prores_idct_put_10_<INSN> */ \
+static void ff_prores_idct_put_10_##INSN##_wrap(int16_t *dst){ \
+    LOCAL_ALIGNED(16, int16_t, qmat, [64]); \
+    LOCAL_ALIGNED(16, int16_t, tmp, [64]); \
+    int i; \
+ \
+    for(i=0; i<64; i++){ /* constant quant matrix of 4; copy the input because dst is also the output */ \
+        qmat[i]=4; \
+        tmp[i]= dst[i]; \
+    } \
+    ff_prores_idct_put_10_##INSN (dst, 16, tmp, qmat); /* linesize 16: presumably bytes per row of 8 16-bit samples -- confirm */ \
+ \
+    for(i=0; i<64; i++) { /* subtract 512 from every output; presumably undoes the put routine's 10-bit mid-level bias -- confirm */ \
+         dst[i] -= 512; \
+    } \
+}
+
+PR_WRAP(sse2)
+
+# if HAVE_AVX_EXTERNAL
+void ff_prores_idct_put_10_avx(uint16_t *dst, int linesize,
+                               int16_t *block, int16_t *qmat);
+PR_WRAP(avx)
+# endif
+
+#endif
+
 static const struct algo fdct_tab_arch[] = {
 #if HAVE_MMX_INLINE
     { "MMX",    ff_fdct_mmx,    FF_IDCT_PERM_NONE, AV_CPU_FLAG_MMX },
@@ -39,21 +70,37 @@ static const struct algo idct_tab_arch[] = {
 #if HAVE_MMX_INLINE
     { "SIMPLE-MMX",  ff_simple_idct_mmx,  FF_IDCT_PERM_SIMPLE, AV_CPU_FLAG_MMX },
 #endif
-#if CONFIG_MPEG4_DECODER
-#if HAVE_MMX_INLINE
+#if CONFIG_MPEG4_DECODER && HAVE_YASM
+#if ARCH_X86_32
     { "XVID-MMX",    ff_xvid_idct_mmx,    FF_IDCT_PERM_NONE,   AV_CPU_FLAG_MMX,    1 },
-#endif
-#if HAVE_MMXEXT_INLINE
     { "XVID-MMXEXT", ff_xvid_idct_mmxext, FF_IDCT_PERM_NONE,   AV_CPU_FLAG_MMXEXT, 1 },
 #endif
-#if HAVE_SSE2_INLINE
+#if HAVE_SSE2_EXTERNAL
     { "XVID-SSE2",   ff_xvid_idct_sse2,   FF_IDCT_PERM_SSE2,   AV_CPU_FLAG_SSE2,   1 },
 #endif
-#endif /* CONFIG_MPEG4_DECODER */
+#endif /* CONFIG_MPEG4_DECODER && HAVE_YASM */
+#if (CONFIG_PRORES_DECODER || CONFIG_PRORES_LGPL_DECODER) && ARCH_X86_64 && HAVE_YASM
+    { "PR-SSE2",     ff_prores_idct_put_10_sse2_wrap, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2, 1 },
+# if HAVE_AVX_EXTERNAL
+    { "PR-AVX",      ff_prores_idct_put_10_avx_wrap, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX, 1 },
+# endif
+#endif
+#if HAVE_YASM
+#if ARCH_X86_64
+#if HAVE_SSE2_EXTERNAL
+    { "SIMPLE10-SSE2",  ff_simple_idct10_sse2, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2},
+    { "SIMPLE12-SSE2",  ff_simple_idct12_sse2, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2, 1 },
+#endif
+#if HAVE_AVX_EXTERNAL
+    { "SIMPLE10-AVX",   ff_simple_idct10_avx,  FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX},
+    { "SIMPLE12-AVX",   ff_simple_idct12_avx,  FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX,  1 },
+#endif
+#endif
+#endif
     { 0 }
 };
 
-static short idct_simple_mmx_perm[64] = {
+static const uint8_t idct_simple_mmx_perm[64] = {
     0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
     0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
     0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
diff --git a/libavcodec/x86/dct32.asm b/libavcodec/x86/dct32.asm
index 2c4c32e..4e657b5 100644
--- a/libavcodec/x86/dct32.asm
+++ b/libavcodec/x86/dct32.asm
@@ -2,20 +2,20 @@
 ;* 32 point SSE-optimized DCT transform
 ;* Copyright (c) 2010 Vitor Sessak
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -192,6 +192,7 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
 
 INIT_YMM avx
 SECTION .text
+%if HAVE_AVX_EXTERNAL
 ; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
 cglobal dct32_float, 2,3,8, out, in, tmp
     ; pass 1
@@ -264,6 +265,7 @@ cglobal dct32_float, 2,3,8, out, in, tmp
 INIT_XMM
     PASS6_AND_PERMUTE
     RET
+%endif
 
 %if ARCH_X86_64
 %define SPILL SWAP
diff --git a/libavcodec/x86/dct_init.c b/libavcodec/x86/dct_init.c
index b2e43a9..c31ef92 100644
--- a/libavcodec/x86/dct_init.c
+++ b/libavcodec/x86/dct_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/dirac_dwt.asm b/libavcodec/x86/dirac_dwt.asm
new file mode 100644
index 0000000..8980689
--- /dev/null
+++ b/libavcodec/x86/dirac_dwt.asm
@@ -0,0 +1,307 @@
+;******************************************************************************
+;* x86 optimized discrete wavelet transform
+;* Copyright (c) 2010 David Conrad
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pw_1991: times 4 dw 9,-1
+
+cextern pw_1
+cextern pw_2
+cextern pw_8
+cextern pw_16
+
+section .text
+
+; %1 -= (%2 + %3 + 2)>>2     %4 is pw_2     (%2 is overwritten as scratch)
+%macro COMPOSE_53iL0 4
+    paddw   %2, %3             ; accumulate the third operand into the second (16-bit lanes)
+    paddw   %2, %4             ; add the rounding constant
+    psraw   %2, 2              ; arithmetic shift right by 2
+    psubw   %1, %2             ; subtract the rounded term from the destination
+%endm
+
+; m1 = %1 + (-m0 + 9*m1 + 9*%2 -%3 + 8)>>4
+; if %4 is supplied, %1 is loaded unaligned from there
+; m2: clobbered  m3: pw_8  m4: pw_1991
+%macro COMPOSE_DD97iH0 3-4
+    paddw   m0, %3             ; m0 = sum of the two taps that get weight -1
+    paddw   m1, %2             ; m1 = sum of the two taps that get weight 9
+    psubw   m0, m3             ; pre-subtract 8: the -1 weight below turns it into the +8 rounding term
+    mova    m2, m1
+    punpcklwd m1, m0           ; interleave (m1, m0) word pairs, low half
+    punpckhwd m2, m0           ; same for the high half
+    pmaddwd m1, m4             ; each dword = 9*m1 - m0, using the (9,-1) word pairs held in m4
+    pmaddwd m2, m4
+%if %0 > 3
+    movu    %1, %4             ; optional fourth operand: load the first operand from it
+%endif
+    psrad   m1, 4              ; shift the 32-bit intermediates right by 4
+    psrad   m2, 4
+    packssdw m1, m2            ; narrow back to 16 bits with signed saturation
+    paddw   m1, %1             ; add the first operand; the result is left in m1
+%endm
+
+%macro COMPOSE_VERTICAL 1
+; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+;                                  int width)
+cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width
+    mova    m2, [pw_2]                       ; rounding constant handed to COMPOSE_53iL0
+%if ARCH_X86_64
+    mov     widthd, widthd                   ; 32-bit self-move clears the upper half of widthq (width is an int)
+%endif
+.loop:
+    sub     widthq, mmsize/2                 ; step back one register of 16-bit elements; sets the flags used by jg
+    mova    m1, [b0q+2*widthq]
+    mova    m0, [b1q+2*widthq]
+    COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2 ; b1 -= (b0 + b2 + 2) >> 2
+    mova    [b1q+2*widthq], m0
+    jg      .loop                            ; flags are still those of the sub: the vector ops do not touch them
+    REP_RET
+
+; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+;                                  int width)
+cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width
+    mova    m1, [pw_1]                       ; rounding constant
+%if ARCH_X86_64
+    mov     widthd, widthd                   ; clear the upper half of widthq
+%endif
+.loop:
+    sub     widthq, mmsize/2                 ; walk the row from its end towards element 0
+    mova    m0, [b0q+2*widthq]
+    paddw   m0, [b2q+2*widthq]               ; b0 + b2
+    paddw   m0, m1                           ; + 1
+    psraw   m0, 1                            ; >> 1
+    paddw   m0, [b1q+2*widthq]               ; b1 + ((b0 + b2 + 1) >> 1)
+    mova    [b1q+2*widthq], m0               ; result replaces b1
+    jg      .loop
+    REP_RET
+
+; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+;                               IDWTELEM *b3, IDWTELEM *b4, int width)
+cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
+    mova    m3, [pw_8]                       ; constants expected in m3/m4 by COMPOSE_DD97iH0
+    mova    m4, [pw_1991]
+%if ARCH_X86_64
+    mov     widthd, widthd                   ; clear the upper half of widthq
+%endif
+.loop:
+    sub     widthq, mmsize/2
+    mova    m0, [b0q+2*widthq]               ; the macro takes b0 in m0 and b1 in m1
+    mova    m1, [b1q+2*widthq]
+    COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq] ; m1 = b2 + ((-b0 + 9*b1 + 9*b3 - b4 + 8) >> 4)
+    mova    [b2q+2*widthq], m1               ; result replaces b2
+    jg      .loop
+    REP_RET
+
+; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+;                                IDWTELEM *b3, IDWTELEM *b4, int width)
+cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
+    mova    m3, [pw_16]                      ; rounding term for the >>5 below
+    mova    m4, [pw_1991]                    ; (9,-1) word pairs for pmaddwd
+%if ARCH_X86_64
+    mov     widthd, widthd                   ; clear the upper half of widthq
+%endif
+.loop:
+    sub     widthq, mmsize/2
+    mova    m0, [b0q+2*widthq]
+    mova    m1, [b1q+2*widthq]
+    mova    m5, [b2q+2*widthq]               ; m5 = value being updated
+    paddw   m0, [b4q+2*widthq]               ; b0 + b4 (weight -1)
+    paddw   m1, [b3q+2*widthq]               ; b1 + b3 (weight 9)
+    psubw   m0, m3                           ; pre-subtract 16: becomes +16 after the -1 weight
+    mova    m2, m1
+    punpcklwd m1, m0                         ; interleave (m1, m0) word pairs, low / high halves
+    punpckhwd m2, m0
+    pmaddwd m1, m4                           ; 9*(b1 + b3) - (b0 + b4) + 16 as 32-bit values
+    pmaddwd m2, m4
+    psrad   m1, 5
+    psrad   m2, 5
+    packssdw m1, m2                          ; back to 16 bits with signed saturation
+    psubw   m5, m1                           ; b2 -= (-b0 + 9*b1 + 9*b3 - b4 + 16) >> 5
+    mova    [b2q+2*widthq], m5
+    jg      .loop
+    REP_RET
+
+; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
+cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
+    mova    m3, [pw_1]                       ; rounding constant
+%if ARCH_X86_64
+    mov     widthd, widthd                   ; clear the upper half of widthq
+%endif
+.loop:
+    sub     widthq, mmsize/2
+    mova    m1, [b1q+2*widthq]
+    mova    m0, [b0q+2*widthq]
+    mova    m2, m1                           ; keep an unmodified copy of b1
+    paddw   m1, m3
+    psraw   m1, 1                            ; (b1 + 1) >> 1
+    psubw   m0, m1                           ; b0 -= (b1 + 1) >> 1
+    mova    [b0q+2*widthq], m0
+    paddw   m2, m0                           ; b1 += updated b0
+    mova    [b1q+2*widthq], m2
+    jg      .loop
+    REP_RET
+%endmacro
+
+; extend the left and right edges of the tmp array by %1 and %2 respectively (%3: scratch register)
+%macro EDGE_EXTENSION 3
+    mov     %3, [tmpq]                 ; first element of tmp
+%assign %%i 1
+%rep %1
+    mov     [tmpq-2*%%i], %3           ; replicate it into the slots just before tmp
+    %assign %%i %%i+1
+%endrep
+    mov     %3, [tmpq+2*w2q-2]         ; last element of tmp (index w2-1, 2 bytes per element)
+%assign %%i 0
+%rep %2
+    mov     [tmpq+2*w2q+2*%%i], %3     ; replicate it into the slots from index w2 onward
+    %assign %%i %%i+1
+%endrep
+%endmacro
+
+
+%macro HAAR_HORIZONTAL 2
+; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
+cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
+    mov    w2d, wd                     ; w2 = w / 2 after the shr below
+    xor     xq, xq                     ; x = element index, starts at 0
+    shr    w2d, 1
+    lea  b_w2q, [bq+wq]                ; b + w bytes = &b[w/2] for 2-byte elements. NOTE(review): upper half of wq is not cleared first -- confirm
+    mova    m3, [pw_1]                 ; rounding constant
+.lowpass_loop:
+    movu    m1, [b_w2q + 2*xq]         ; second half of b (unaligned load)
+    mova    m0, [bq    + 2*xq]         ; first half of b
+    paddw   m1, m3
+    psraw   m1, 1
+    psubw   m0, m1                     ; tmp[x] = b[x] - ((b[w2 + x] + 1) >> 1)
+    mova    [tmpq + 2*xq], m0
+    add     xq, mmsize/2               ; advance by one register of 16-bit elements
+    cmp     xq, w2q
+    jl      .lowpass_loop
+
+    xor     xq, xq
+    and    w2q, ~(mmsize/2 - 1)        ; round w2 down to a whole number of registers
+    cmp    w2q, mmsize/2
+    jl      .end                       ; less than one full register: skip the second pass
+
+.highpass_loop:
+    movu    m1, [b_w2q + 2*xq]
+    mova    m0, [tmpq  + 2*xq]
+    paddw   m1, m0                     ; m1 = b[w2 + x] + tmp[x]
+
+    ; shift and interleave
+%if %2 == 1
+    paddw   m0, m3                     ; this variant only: both values become (v + 1) >> 1
+    paddw   m1, m3
+    psraw   m0, 1
+    psraw   m1, 1
+%endif
+    mova    m2, m0
+    punpcklwd m0, m1                   ; interleave m0/m1 words: m0 values land at even output indices, m1 at odd
+    punpckhwd m2, m1
+    mova    [bq+4*xq], m0              ; two registers written per input register
+    mova    [bq+4*xq+mmsize], m2
+
+    add     xq, mmsize/2
+    cmp     xq, w2q
+    jl      .highpass_loop
+.end:
+    REP_RET
+%endmacro
+
+
+INIT_XMM
+; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
+cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
+    mov    w2d, wd                     ; w2 = w / 2 after the shr below
+    xor     xd, xd                     ; x = element index, starts at 0
+    shr    w2d, 1
+    lea  b_w2q, [bq+wq]                ; b + w bytes = &b[w/2] for 2-byte elements. NOTE(review): upper half of wq is not cleared first -- confirm
+    movu    m4, [bq+wq]                ; first register of the second half
+    mova    m7, [pw_2]                 ; rounding constant for COMPOSE_53iL0
+    pslldq  m4, 14                     ; move its word 0 to the top lane: it stands in for the element before index 0
+.lowpass_loop:
+    movu    m1, [b_w2q + 2*xq]         ; second-half elements x .. x+7
+    mova    m0, [bq    + 2*xq]         ; first-half elements x .. x+7
+    mova    m2, m1
+    palignr m1, m4, 14                 ; m1 = second-half elements x-1 .. x+6 (top word of m4 supplies x-1)
+    mova    m4, m2                     ; current register becomes "previous" for the next iteration
+    COMPOSE_53iL0 m0, m1, m2, m7       ; tmp[x] = b[x] - ((h[x-1] + h[x] + 2) >> 2), h = second half of b
+    mova    [tmpq + 2*xq], m0
+    add     xd, mmsize/2
+    cmp     xd, w2d
+    jl      .lowpass_loop
+
+    EDGE_EXTENSION 1, 2, xw            ; pad tmp by 1 element on the left and 2 on the right; xw is scratch (x is re-zeroed next)
+    ; leave the last up to 7 (sse) or 3 (mmx) values for C
+    xor     xd, xd
+    and    w2d, ~(mmsize/2 - 1)        ; round w2 down to a whole number of registers
+    cmp    w2d, mmsize/2
+    jl      .end
+
+    mova    m7, [tmpq-mmsize]          ; register preceding tmp: only its top word (tmp[-1]) is used, via palignr 14
+    mova    m0, [tmpq]                 ; tmp[0 .. 7]
+    mova    m5, [pw_1]                 ; rounding constant for the final >>1
+    mova    m3, [pw_8]                 ; constants expected in m3/m4 by COMPOSE_DD97iH0
+    mova    m4, [pw_1991]
+.highpass_loop:
+    mova    m6, m0                     ; m6 = tmp[x .. x+7]
+    palignr m0, m7, 14                 ; m0 = tmp[x-1 .. x+6]
+    mova    m7, [tmpq + 2*xq + 16]     ; next register: tmp[x+8 .. x+15]
+    mova    m1, m7
+    mova    m2, m7
+    palignr m1, m6, 2                  ; m1 = tmp[x+1 .. x+8]
+    palignr m2, m6, 4                  ; m2 = tmp[x+2 .. x+9]
+    COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq] ; m1 = h[x] + ((-tmp[x-1] + 9*tmp[x] + 9*tmp[x+1] - tmp[x+2] + 8) >> 4)
+    mova    m0, m7                     ; rotate the window: next -> current
+    mova    m7, m6                     ;                    current -> previous
+
+    ; shift and interleave
+    paddw   m6, m5                     ; both outputs become (v + 1) >> 1
+    paddw   m1, m5
+    psraw   m6, 1
+    psraw   m1, 1
+    mova    m2, m6
+    punpcklwd m6, m1                   ; interleave: m6 values at even output indices, m1 values at odd
+    punpckhwd m2, m1
+    mova    [bq+4*xq], m6
+    mova    [bq+4*xq+mmsize], m2
+
+    add     xd, mmsize/2
+    cmp     xd, w2d
+    jl      .highpass_loop
+.end:
+    REP_RET
+
+
+%if ARCH_X86_64 == 0
+INIT_MMX
+COMPOSE_VERTICAL mmx        ; MMX versions are assembled for non-x86-64 builds only
+HAAR_HORIZONTAL mmx, 0      ; second argument 0: without the final (v + 1) >> 1 step
+HAAR_HORIZONTAL mmx, 1      ; second argument 1: with the final (v + 1) >> 1 step
+%endif
+
+;;INIT_XMM
+INIT_XMM
+COMPOSE_VERTICAL sse2       ; SSE2 versions are assembled unconditionally
+HAAR_HORIZONTAL sse2, 0
+HAAR_HORIZONTAL sse2, 1
diff --git a/libavcodec/x86/dirac_dwt_init.c b/libavcodec/x86/dirac_dwt_init.c
new file mode 100644
index 0000000..afdf0a1
--- /dev/null
+++ b/libavcodec/x86/dirac_dwt_init.c
@@ -0,0 +1,229 @@
+/*
+ * x86 optimized discrete wavelet transform
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/dirac_dwt.h"
+
+#define COMPOSE_VERTICAL(ext, align) \
+void ff_vertical_compose53iL0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int width); \
+void ff_vertical_compose_dirac53iH0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int width); \
+void ff_vertical_compose_dd137iL0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int16_t *b3, int16_t *b4, int width); \
+void ff_vertical_compose_dd97iH0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int16_t *b3, int16_t *b4, int width); \
+void ff_vertical_compose_haar##ext(int16_t *b0, int16_t *b1, int width); \
+void ff_horizontal_compose_haar0i##ext(int16_t *b, int16_t *tmp, int w);\
+void ff_horizontal_compose_haar1i##ext(int16_t *b, int16_t *tmp, int w);\
+/* Wrappers: the unaligned tail is computed in C; align is used as a bit mask, so it must be a power of two. */ \
+static void vertical_compose53iL0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+    int16_t *b0 = (int16_t *)_b0; \
+    int16_t *b1 = (int16_t *)_b1; \
+    int16_t *b2 = (int16_t *)_b2; \
+    /* C tail: columns [width_align, width); the ff_ routine below gets the first width_align columns */ \
+    for(i=width_align; i<width; i++) \
+        b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]); \
+\
+    ff_vertical_compose53iL0##ext(b0, b1, b2, width_align); \
+} \
+\
+static void vertical_compose_dirac53iH0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+    int16_t *b0 = (int16_t *)_b0; \
+    int16_t *b1 = (int16_t *)_b1; \
+    int16_t *b2 = (int16_t *)_b2; \
+\
+    for(i=width_align; i<width; i++) \
+        b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); \
+\
+    ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, width_align); \
+} \
+\
+static void vertical_compose_dd137iL0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, \
+                                           uint8_t *_b3, uint8_t *_b4, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+    int16_t *b0 = (int16_t *)_b0; \
+    int16_t *b1 = (int16_t *)_b1; \
+    int16_t *b2 = (int16_t *)_b2; \
+    int16_t *b3 = (int16_t *)_b3; \
+    int16_t *b4 = (int16_t *)_b4; \
+\
+    for(i=width_align; i<width; i++) \
+        b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
+\
+    ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, width_align); \
+} \
+\
+static void vertical_compose_dd97iH0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, \
+                                          uint8_t *_b3, uint8_t *_b4, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+    int16_t *b0 = (int16_t *)_b0; \
+    int16_t *b1 = (int16_t *)_b1; \
+    int16_t *b2 = (int16_t *)_b2; \
+    int16_t *b3 = (int16_t *)_b3; \
+    int16_t *b4 = (int16_t *)_b4; \
+\
+    for(i=width_align; i<width; i++) \
+        b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
+\
+    ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, width_align); \
+} \
+static void vertical_compose_haar##ext(uint8_t *_b0, uint8_t *_b1, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+    int16_t *b0 = (int16_t *)_b0; \
+    int16_t *b1 = (int16_t *)_b1; \
+\
+    for(i=width_align; i<width; i++) { \
+        b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \
+        b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); /* uses the b0[i] written on the previous line */ \
+    } \
+\
+    ff_vertical_compose_haar##ext(b0, b1, width_align); \
+} \
+static void horizontal_compose_haar0i##ext(uint8_t *_b, uint8_t *_tmp, int w)\
+{\
+    int w2= w>>1;\
+    int x= w2 - (w2&(align-1));\
+    int16_t *b = (int16_t *)_b; \
+    int16_t *tmp = (int16_t *)_tmp; \
+\
+    ff_horizontal_compose_haar0i##ext(b, tmp, w);\
+    /* the last (w2 % align) output pairs, starting at x, are written here in C */ \
+    for (; x < w2; x++) {\
+        b[2*x  ] = tmp[x];\
+        b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]);\
+    }\
+}\
+static void horizontal_compose_haar1i##ext(uint8_t *_b, uint8_t *_tmp, int w)\
+{\
+    int w2= w>>1;\
+    int x= w2 - (w2&(align-1));\
+    int16_t *b = (int16_t *)_b; \
+    int16_t *tmp = (int16_t *)_tmp; \
+\
+    ff_horizontal_compose_haar1i##ext(b, tmp, w);\
+    /* same tail as haar0i, but each output is additionally rounded and halved: (v + 1) >> 1 */ \
+    for (; x < w2; x++) {\
+        b[2*x  ] = (tmp[x] + 1)>>1;\
+        b[2*x+1] = (COMPOSE_HAARiH0(b[x+w2], tmp[x]) + 1)>>1;\
+    }\
+}\
+\
+
+#if HAVE_YASM
+#if !ARCH_X86_64
+COMPOSE_VERTICAL(_mmx, 4)
+#endif
+COMPOSE_VERTICAL(_sse2, 8)
+
+
+void ff_horizontal_compose_dd97i_ssse3(int16_t *_b, int16_t *_tmp, int w);
+
+static void horizontal_compose_dd97i_ssse3(uint8_t *_b, uint8_t *_tmp, int w)
+{
+    int16_t *const coef = (int16_t *)_b;
+    int16_t *const low  = (int16_t *)_tmp;
+    const int half = w >> 1;
+    int i;
+
+    ff_horizontal_compose_dd97i_ssse3(coef, low, w);
+    /* the last (half % 8) output pairs are written here in C */
+    for (i = half - (half & 7); i < half; i++) {
+        coef[2 * i]     = (low[i] + 1) >> 1;
+        coef[2 * i + 1] = (COMPOSE_DD97iH0(low[i - 1], low[i], coef[i + half], low[i + 1], low[i + 2]) + 1) >> 1;
+    }
+}
+#endif
+
+void ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type)
+{
+#if HAVE_YASM
+    const int cpu_flags = av_get_cpu_flags();
+
+#if !ARCH_X86_64
+    /* MMX versions are only compiled for 32-bit builds; without MMX nothing is installed. */
+    if (!(cpu_flags & AV_CPU_FLAG_MMX))
+        return;
+    switch (type) {
+    case DWT_DIRAC_DD9_7:
+        d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
+        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
+        break;
+    case DWT_DIRAC_LEGALL5_3:
+        d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
+        d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_mmx;
+        break;
+    case DWT_DIRAC_DD13_7:
+        d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_mmx;
+        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
+        break;
+    case DWT_DIRAC_HAAR0:
+        d->vertical_compose   = (void*)vertical_compose_haar_mmx;
+        d->horizontal_compose = horizontal_compose_haar0i_mmx;
+        break;
+    case DWT_DIRAC_HAAR1:
+        d->vertical_compose   = (void*)vertical_compose_haar_mmx;
+        d->horizontal_compose = horizontal_compose_haar1i_mmx;
+        break;
+    }
+#endif
+
+    if (!(cpu_flags & AV_CPU_FLAG_SSE2))
+        return;
+    /* The SSE2 versions below overwrite any MMX pointers set above. */
+    switch (type) {
+    case DWT_DIRAC_DD9_7:
+        d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
+        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
+        break;
+    case DWT_DIRAC_LEGALL5_3:
+        d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
+        d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_sse2;
+        break;
+    case DWT_DIRAC_DD13_7:
+        d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_sse2;
+        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
+        break;
+    case DWT_DIRAC_HAAR0:
+        d->vertical_compose   = (void*)vertical_compose_haar_sse2;
+        d->horizontal_compose = horizontal_compose_haar0i_sse2;
+        break;
+    case DWT_DIRAC_HAAR1:
+        d->vertical_compose   = (void*)vertical_compose_haar_sse2;
+        d->horizontal_compose = horizontal_compose_haar1i_sse2;
+        break;
+    }
+
+    /*
+     * SSSE3 only provides a horizontal pass, and only for DWT_DIRAC_DD9_7;
+     * every other type keeps what was installed above.
+     */
+    if (!(cpu_flags & AV_CPU_FLAG_SSSE3))
+        return;
+    if (type == DWT_DIRAC_DD9_7)
+        d->horizontal_compose = horizontal_compose_dd97i_ssse3;
+#endif // HAVE_YASM
+}
diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
new file mode 100644
index 0000000..a042413
--- /dev/null
+++ b/libavcodec/x86/diracdsp.asm
@@ -0,0 +1,265 @@
+;******************************************************************************
+;* Copyright (c) 2010 David Conrad
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pw_7: times 8 dw 7
+
+cextern pw_3
+cextern pw_16
+cextern pw_32
+cextern pb_80
+
+section .text
+
+%macro UNPACK_ADD 6
+    mov%5   %1, %3              ; %1 = first source (%3), loaded with mov suffix %5
+    mov%6   m5, %4              ; m5 = second source (%4), loaded with mov suffix %6
+    mova    m4, %1
+    mova    %2, m5
+    punpcklbw %1, m7            ; callers zero m7 first: widen the low bytes to words
+    punpcklbw m5, m7
+    punpckhbw m4, m7            ; ... and the high bytes
+    punpckhbw %2, m7
+    paddw   %1, m5              ; %1 = word sums of the low halves
+    paddw   %2, m4              ; %2 = word sums of the high halves (m4 and m5 are clobbered)
+%endmacro
+
+%macro HPEL_FILTER 1
+; void dirac_hpel_filter_v_<ext>(uint8_t *dst, uint8_t *src, int stride, int width);
+cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
+    mov     src0q, srcq
+    lea     stridex3q, [3*strideq] ; NOTE(review): stride is an int in the C prototype but is not sign-extended here (cf. movsxd in PUT_RECT)
+    sub     src0q, stridex3q    ; src0 = src - 3*stride
+    pxor    m7, m7              ; zero register used by UNPACK_ADD
+.loop:
+    ; 7*(src[0] + src[1])
+    UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
+    pmullw  m0, [pw_7]
+    pmullw  m1, [pw_7]
+
+    ; 3*( ... + src[-2] + src[3])
+    UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
+    paddw   m0, m2
+    paddw   m1, m3
+    pmullw  m0, [pw_3]
+    pmullw  m1, [pw_3]
+
+    ; ... - 7*(src[-1] + src[2])
+    UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
+    pmullw  m2, [pw_7]
+    pmullw  m3, [pw_7]
+    psubw   m0, m2
+    psubw   m1, m3
+
+    ; ... - (src[-3] + src[4])
+    UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
+    psubw   m0, m2
+    psubw   m1, m3
+    ; add pw_16, shift right by 5 and pack the words to bytes with unsigned saturation
+    paddw   m0, [pw_16]
+    paddw   m1, [pw_16]
+    psraw   m0, 5
+    psraw   m1, 5
+    packuswb m0, m1
+    mova    [dstq], m0
+    add     dstq, mmsize
+    add     srcq, mmsize
+    add     src0q, mmsize
+    sub     widthd, mmsize      ; one register of output bytes per iteration
+    jg      .loop
+    RET
+
+; void dirac_hpel_filter_h_<ext>(uint8_t *dst, uint8_t *src, int width);
+cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
+    dec     widthd
+    pxor    m7, m7
+    and     widthd, ~(mmsize-1) ; width = offset of the mmsize block holding pixel width-1
+.loop:
+    ; 7*(src[0] + src[1])
+    UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
+    pmullw  m0, [pw_7]
+    pmullw  m1, [pw_7]
+
+    ; 3*( ... + src[-2] + src[3])
+    UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
+    paddw   m0, m2
+    paddw   m1, m3
+    pmullw  m0, [pw_3]
+    pmullw  m1, [pw_3]
+
+    ; ... - 7*(src[-1] + src[2])
+    UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
+    pmullw  m2, [pw_7]
+    pmullw  m3, [pw_7]
+    psubw   m0, m2
+    psubw   m1, m3
+
+    ; ... - (src[-3] + src[4])
+    UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
+    psubw   m0, m2
+    psubw   m1, m3
+    ; add pw_16, shift right by 5 and pack the words to bytes with unsigned saturation
+    paddw   m0, [pw_16]
+    paddw   m1, [pw_16]
+    psraw   m0, 5
+    psraw   m1, 5
+    packuswb m0, m1
+    mova    [dstq + widthq], m0
+    sub     widthd, mmsize      ; walk the row from the last block down to offset 0
+    jge     .loop
+    RET
+%endmacro
+
+%macro PUT_RECT 1
+; void put_signed_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
+cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2
+    mova    m0, [pb_80]         ; added to every packed byte below
+    add     wd, (mmsize-1)
+    and     wd, ~(mmsize-1)     ; round the width up to a multiple of mmsize
+
+%if ARCH_X86_64
+    movsxd   dst_strideq, dst_strided
+    movsxd   src_strideq, src_strided
+    mov   r7d, r5m              ; height (6th argument)
+    mov   r8d, wd               ; rounded width, reloaded for every row pair
+    %define wspill r8d
+    %define hd r7d
+%else
+    mov    r4m, wd              ; store the rounded width back into its argument slot
+    %define wspill r4m
+    %define hd r5mp
+%endif
+
+.loopy:
+    lea     src2q, [srcq+src_strideq] ; second row of the pair: two rows are handled per .loopy pass
+    lea     dst2q, [dstq+dst_strideq]
+.loopx:
+    sub      wd, mmsize         ; right to left; these flags are consumed by the jg at the end of the loop
+    mova     m1, [srcq +2*wq]
+    mova     m2, [src2q+2*wq]
+    packsswb m1, [srcq +2*wq+mmsize] ; words -> bytes with signed saturation
+    packsswb m2, [src2q+2*wq+mmsize]
+    paddb    m1, m0
+    paddb    m2, m0
+    mova    [dstq +wq], m1
+    mova    [dst2q+wq], m2
+    jg      .loopx
+
+    lea   srcq, [srcq+src_strideq*2]
+    lea   dstq, [dstq+dst_strideq*2]
+    sub     hd, 2
+    mov     wd, wspill          ; mov leaves the flags from "sub hd, 2" intact for the jg
+    jg      .loopy
+    RET
+%endm
+
+%macro ADD_RECT 1
+; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
+cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
+    mova    m0, [pw_32]         ; added before the >> 6 below
+    add     wd, (mmsize-1)
+    and     wd, ~(mmsize-1)     ; round the width up to a multiple of mmsize
+
+%if ARCH_X86_64
+    movsxd   strideq, strided
+    movsxd   idwt_strideq, idwt_strided
+    mov   r8d, wd               ; rounded width, reloaded for every row
+    %define wspill r8d
+%else
+    mov    r5m, wd              ; store the rounded width back into its argument slot
+    %define wspill r5m
+%endif
+
+.loop:
+    sub     wd, mmsize          ; right to left; these flags are consumed by the first "jg .loop"
+    movu    m1, [srcq +2*wq] ; FIXME: ensure alignment
+    paddw   m1, m0
+    psraw   m1, 6
+    movu    m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
+    paddw   m2, m0
+    psraw   m2, 6
+    paddw   m1, [idwtq+2*wq]    ; + idwt
+    paddw   m2, [idwtq+2*wq+mmsize]
+    packuswb m1, m2             ; words -> bytes with unsigned saturation
+    mova    [dstq +wq], m1
+    jg      .loop
+
+    lea   srcq, [srcq + 2*strideq] ; src and idwt rows advance by 2 bytes per stride unit, dst by 1
+    add   dstq, strideq
+    lea  idwtq, [idwtq+ 2*idwt_strideq]
+    sub     hd, 1
+    mov     wd, wspill          ; mov leaves the flags from "sub hd, 1" intact
+    jg      .loop               ; the same label serves as the start of the next row
+    RET
+%endm
+
+%macro ADD_OBMC 2
+; void add_dirac_obmc<%1>_<%2>(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen); %1 = bytes per row
+cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
+    pxor        m4, m4          ; zero, used to widen bytes to words; NOTE(review): 6 args are declared above but only 5 are named
+.loop:
+%assign i 0
+%rep %1 / mmsize
+    mova        m0, [srcq+i]
+    mova        m1, m0
+    punpcklbw   m0, m4
+    punpckhbw   m1, m4
+    mova        m2, [obmcq+i]
+    mova        m3, m2
+   punpcklbw   m2, m4
+    punpckhbw   m3, m4
+    pmullw      m0, m2          ; src * weight (low 16 bits of each product)
+    pmullw      m1, m3
+    movu        m2, [dstq+2*i]
+    movu        m3, [dstq+2*i+mmsize]
+    paddw       m0, m2          ; accumulate into the 16-bit dst
+    paddw       m1, m3
+    movu        [dstq+2*i], m0
+    movu        [dstq+2*i+mmsize], m1
+%assign i i+mmsize
+%endrep
+    lea         srcq, [srcq+strideq] ; NOTE(review): int stride is not sign-extended here (cf. movsxd in ADD_RECT)
+    lea         dstq, [dstq+2*strideq] ; dst rows advance by 2 bytes per stride unit
+    add         obmcq, 32       ; weight rows are 32 bytes apart for every block width
+    sub         yblend, 1
+    jg          .loop
+    RET
+%endm
+
+INIT_MMX
+%if ARCH_X86_64 == 0
+PUT_RECT mmx
+ADD_RECT mmx
+
+HPEL_FILTER mmx
+ADD_OBMC 32, mmx
+ADD_OBMC 16, mmx
+%endif
+ADD_OBMC 8, mmx
+; the 8-wide MMX add_obmc above is built on every architecture; no 8-wide SSE2 version is instantiated
+INIT_XMM
+PUT_RECT sse2
+ADD_RECT sse2
+
+HPEL_FILTER sse2
+ADD_OBMC 32, sse2
+ADD_OBMC 16, sse2
diff --git a/libavcodec/x86/diracdsp_init.c b/libavcodec/x86/diracdsp_init.c
new file mode 100644
index 0000000..5fae798
--- /dev/null
+++ b/libavcodec/x86/diracdsp_init.c
@@ -0,0 +1,187 @@
+/*
+ * Copyright (C) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/diracdsp.h"
+#include "fpel.h"
+
+DECL_DIRAC_PIXOP(put, mmx);
+DECL_DIRAC_PIXOP(avg, mmx);
+DECL_DIRAC_PIXOP(avg, mmxext);
+
+void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+
+void ff_add_rect_clamped_mmx(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
+void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
+
+void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc16_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc32_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+
+void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+
+void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+
+#if HAVE_YASM
+
+#define HPEL_FILTER(MMSIZE, EXT)                                                             \
+    void ff_dirac_hpel_filter_v_ ## EXT(uint8_t *, const uint8_t *, int, int);               \
+    void ff_dirac_hpel_filter_h_ ## EXT(uint8_t *, const uint8_t *, int);                    \
+    /* Per row: v filter into dstv (starting MMSIZE bytes early), then h filter of src and of dstv. */ \
+    static void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,       \
+                                          const uint8_t *src, int stride, int width, int height)   \
+    {                                                                                        \
+        while( height-- )                                                                    \
+        {                                                                                    \
+            ff_dirac_hpel_filter_v_ ## EXT(dstv-MMSIZE, src-MMSIZE, stride, width+MMSIZE+5); \
+            ff_dirac_hpel_filter_h_ ## EXT(dsth, src, width);                                \
+            ff_dirac_hpel_filter_h_ ## EXT(dstc, dstv, width);                               \
+                                                                                             \
+            dsth += stride;                                                                  \
+            dstv += stride;                                                                  \
+            dstc += stride;                                                                  \
+            src  += stride;                                                                  \
+        }                                                                                    \
+    }
+
+#define PIXFUNC(PFX, IDX, EXT) /* installs the 16- and 32-wide entries; the 8-wide slot [0] is left untouched */ \
+    /*MMXDISABLEDc->PFX ## _dirac_pixels_tab[0][IDX] = ff_ ## PFX ## _dirac_pixels8_ ## EXT;*/  \
+    c->PFX ## _dirac_pixels_tab[1][IDX] = ff_ ## PFX ## _dirac_pixels16_ ## EXT; \
+    c->PFX ## _dirac_pixels_tab[2][IDX] = ff_ ## PFX ## _dirac_pixels32_ ## EXT
+
+#define DIRAC_PIXOP(OPNAME2, OPNAME, EXT) /* 8/16/32-wide wrappers: C version unless h is a multiple of 4 */\
+void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    if (h&3)\
+        ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
+    else\
+        OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h); /* only src[0] is passed on */\
+}\
+void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    if (h&3)\
+        ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
+    else\
+        OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
+}\
+void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    if (h&3) {\
+        ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
+    } else {\
+        OPNAME ## _pixels16_ ## EXT(dst   , src[0]   , stride, h); /* 32 wide = two 16-wide calls */\
+        OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
+    }\
+}
+
+DIRAC_PIXOP(put, ff_put, mmx)
+DIRAC_PIXOP(avg, ff_avg, mmx)
+DIRAC_PIXOP(avg, ff_avg, mmxext)
+
+void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+    if (!(h & 3))
+        ff_put_pixels16_sse2(dst, src[0], stride, h);
+    else /* heights that are not a multiple of 4 go to the C version */
+        ff_put_dirac_pixels16_c(dst, src, stride, h);
+}
+void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+    if (!(h & 3))
+        ff_avg_pixels16_sse2(dst, src[0], stride, h);
+    else /* heights that are not a multiple of 4 go to the C version */
+        ff_avg_dirac_pixels16_c(dst, src, stride, h);
+}
+void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+    if (!(h & 3)) {
+        /* 32 pixels wide: two adjacent 16-wide calls */
+        ff_put_pixels16_sse2(dst,      src[0],      stride, h);
+        ff_put_pixels16_sse2(dst + 16, src[0] + 16, stride, h);
+    } else
+        ff_put_dirac_pixels32_c(dst, src, stride, h);
+}
+void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+    if (!(h & 3)) {
+        /* 32 pixels wide: two adjacent 16-wide calls */
+        ff_avg_pixels16_sse2(dst,      src[0],      stride, h);
+        ff_avg_pixels16_sse2(dst + 16, src[0] + 16, stride, h);
+    } else
+        ff_avg_dirac_pixels32_c(dst, src, stride, h);
+}
+
+#else // HAVE_YASM
+
+#define HPEL_FILTER(MMSIZE, EXT) /* no external asm: only declare the name that the init code refers to */ \
+    void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,              \
+                                   const uint8_t *src, int stride, int width, int height);
+/* Without external asm, PIXFUNC expands to an empty statement and installs nothing. */
+#define PIXFUNC(PFX, IDX, EXT) do {} while (0)
+
+#endif // HAVE_YASM
+
+#if !ARCH_X86_64
+HPEL_FILTER(8, mmx)
+#endif
+HPEL_FILTER(16, sse2)
+
+void ff_diracdsp_init_x86(DiracDSPContext* c)
+{
+    const int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMX(cpu_flags)) {
+#if !ARCH_X86_64
+        c->put_signed_rect_clamped[0] = (void *)ff_put_signed_rect_clamped_mmx;
+        c->add_rect_clamped = ff_add_rect_clamped_mmx;
+        c->dirac_hpel_filter = dirac_hpel_filter_mmx;
+        c->add_dirac_obmc[2] = ff_add_dirac_obmc32_mmx;
+        c->add_dirac_obmc[1] = ff_add_dirac_obmc16_mmx;
+#endif
+        c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx;
+        PIXFUNC(avg, 0, mmx);
+        PIXFUNC(put, 0, mmx);
+    }
+
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        PIXFUNC(avg, 0, mmxext);
+    }
+    /* Checked last: the SSE2 versions replace the pointers installed above. */
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->put_dirac_pixels_tab[1][0] = ff_put_dirac_pixels16_sse2;
+        c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2;
+        c->avg_dirac_pixels_tab[1][0] = ff_avg_dirac_pixels16_sse2;
+        c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;
+
+        c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2;
+        c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2;
+
+        c->put_signed_rect_clamped[0] = (void *)ff_put_signed_rect_clamped_sse2;
+        c->add_rect_clamped = ff_add_rect_clamped_sse2;
+        c->dirac_hpel_filter = dirac_hpel_filter_sse2;
+    }
+}
diff --git a/libavcodec/x86/dnxhdenc.asm b/libavcodec/x86/dnxhdenc.asm
index d39b07b..9dd6d51 100644
--- a/libavcodec/x86/dnxhdenc.asm
+++ b/libavcodec/x86/dnxhdenc.asm
@@ -3,20 +3,20 @@
 ;* Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
 ;* Copyright (c) 2014 Tiancheng "Timothy" Gu <timothygu99@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/dnxhdenc_init.c b/libavcodec/x86/dnxhdenc_init.c
index f1ff7bd..fd6f150 100644
--- a/libavcodec/x86/dnxhdenc_init.c
+++ b/libavcodec/x86/dnxhdenc_init.c
@@ -4,20 +4,20 @@
  *
  * VC-3 encoder funded by the British Broadcasting Corporation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/fdct.c b/libavcodec/x86/fdct.c
index 6528b57..112566d 100644
--- a/libavcodec/x86/fdct.c
+++ b/libavcodec/x86/fdct.c
@@ -13,20 +13,20 @@
  * a page about fdct at http://www.geocities.com/ssavekar/dct.htm
  * Skal's fdct at http://skal.planet-d.net/coding/dct.html
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -70,7 +70,7 @@ DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) };
 
 DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW };
 
-static struct
+static const struct
 {
  DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4];
 } fdct_r_row_sse2 =
@@ -153,7 +153,7 @@ DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = {  // forward_dct
   29692,  -12299,   26722,  -31521,
 };
 
-static struct
+static const struct
 {
  DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256];
 } tab_frw_01234567_sse2 =
diff --git a/libavcodec/x86/fdct.h b/libavcodec/x86/fdct.h
index c94a977..648cdc5 100644
--- a/libavcodec/x86/fdct.h
+++ b/libavcodec/x86/fdct.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/fdctdsp_init.c b/libavcodec/x86/fdctdsp_init.c
index 4e8e4eb..0cb5fd6 100644
--- a/libavcodec/x86/fdctdsp_init.c
+++ b/libavcodec/x86/fdctdsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/fft.asm b/libavcodec/x86/fft.asm
index ef007f4..cdbfd66 100644
--- a/libavcodec/x86/fft.asm
+++ b/libavcodec/x86/fft.asm
@@ -6,20 +6,20 @@
 ;* This algorithm (though not any of the implementation details) is
 ;* based on libdjbfft by D. J. Bernstein.
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -51,13 +51,12 @@ struc FFTContext
     .imdcthalf:pointer 1
 endstruc
 
-SECTION_RODATA
+SECTION_RODATA 32
 
 %define M_SQRT1_2 0.70710678118654752440
 %define M_COS_PI_1_8 0.923879532511287
 %define M_COS_PI_3_8 0.38268343236509
 
-align 32
 ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
 ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8
 
@@ -69,11 +68,12 @@ perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
 perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
 ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
 ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
-ps_m1m1m1m1: times 4 dd 1<<31
 ps_m1p1: dd 1<<31, 0
 
+cextern ps_neg
+
 %assign i 16
-%rep 13
+%rep 14
 cextern cos_ %+ i
 %assign i i<<1
 %endrep
@@ -305,6 +305,7 @@ IF%1 mova  Z(1), m5
 
 INIT_YMM avx
 
+%if HAVE_AVX_EXTERNAL
 align 16
 fft8_avx:
     mova      m0, Z(0)
@@ -394,6 +395,8 @@ fft32_interleave_avx:
     jg .deint_loop
     ret
 
+%endif
+
 INIT_XMM sse
 
 align 16
@@ -537,6 +540,7 @@ DEFINE_ARGS zc, w, n, o1, o3
 
 INIT_YMM avx
 
+%if HAVE_AVX_EXTERNAL
 %macro INTERL_AVX 5
     vunpckhps      %3, %2, %1
     vunpcklps      %2, %2, %1
@@ -558,6 +562,7 @@ cglobal fft_calc, 2,5,8
     FFT_DISPATCH _interleave %+ SUFFIX, r1
     REP_RET
 
+%endif
 
 INIT_XMM sse
 
@@ -655,6 +660,68 @@ cglobal fft_permute, 2,7,1
     jl      .loopcopy
     REP_RET
 
+%macro IMDCT_CALC_FUNC 0
+cglobal imdct_calc, 3,5,3
+    mov     r3d, [r0 + FFTContext.mdctsize]  ; r3 = s->mdctsize
+    mov     r4,  [r0 + FFTContext.imdcthalf] ; r4 = s->imdcthalf, called below
+    add     r1,  r3                          ; r1 = output + mdctsize (byte offset)
+    PUSH    r3                               ; keep r3/r1 across the call
+    PUSH    r1
+%if ARCH_X86_32
+    push    r2                               ; stack arguments for the call,
+    push    r1                               ; pushed last-to-first
+    push    r0
+%else
+    sub     rsp, 8+32*WIN64 ; allocate win64 shadow space
+%endif
+    call    r4
+%if ARCH_X86_32
+    add     esp, 12                          ; drop the three pushed arguments
+%else
+    add     rsp, 8+32*WIN64
+%endif
+    POP     r1
+    POP     r3
+    lea     r0, [r1 + 2*r3]
+    mov     r2, r3
+    sub     r3, mmsize                       ; r3: positive offset, walks downwards
+    neg     r2                               ; r2: negative offset, walks up to zero
+    mova    m2, [ps_neg]                     ; XOR mask (presumably the float sign bits)
+.loop:
+%if mmsize == 8
+    PSWAPD  m0, [r1 + r3]
+    PSWAPD  m1, [r0 + r2]
+    pxor    m0, m2
+%else
+    mova    m0, [r1 + r3]
+    mova    m1, [r0 + r2]
+    shufps  m0, m0, 0x1b                     ; 0x1b reverses the four dwords
+    shufps  m1, m1, 0x1b
+    xorps   m0, m2
+%endif
+    mova [r0 + r3], m1
+    mova [r1 + r2], m0
+    sub     r3, mmsize
+    add     r2, mmsize
+    jl      .loop                            ; until r2 has counted up to zero
+%if cpuflag(3dnow)
+    femms
+    RET
+%else
+    REP_RET
+%endif
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX 3dnow
+IMDCT_CALC_FUNC
+INIT_MMX 3dnowext
+IMDCT_CALC_FUNC
+%endif
+
+INIT_XMM sse
+IMDCT_CALC_FUNC
+
 %if ARCH_X86_32
 INIT_MMX 3dnow
 %define mulps pfmul
@@ -689,7 +756,7 @@ DECL_PASS pass_interleave_3dnow, PASS_BIG 0
 %endif
 
 %assign n 1<<%1
-%rep 17-%1
+%rep 18-%1
 %assign n2 n/2
 %assign n4 n/4
 %xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL
@@ -714,9 +781,11 @@ align 8
 dispatch_tab %+ fullsuffix: pointer list_of_fft
 %endmacro ; DECL_FFT
 
+%if HAVE_AVX_EXTERNAL
 INIT_YMM avx
 DECL_FFT 6
 DECL_FFT 6, _interleave
+%endif
 INIT_XMM sse
 DECL_FFT 5
 DECL_FFT 5, _interleave
@@ -729,70 +798,6 @@ DECL_FFT 4
 DECL_FFT 4, _interleave
 %endif
 
-%if CONFIG_MDCT
-
-%macro IMDCT_CALC_FUNC 0
-cglobal imdct_calc, 3,5,3
-    mov     r3d, [r0 + FFTContext.mdctsize]
-    mov     r4,  [r0 + FFTContext.imdcthalf]
-    add     r1,  r3
-    PUSH    r3
-    PUSH    r1
-%if ARCH_X86_32
-    push    r2
-    push    r1
-    push    r0
-%else
-    sub     rsp, 8+32*WIN64 ; allocate win64 shadow space
-%endif
-    call    r4
-%if ARCH_X86_32
-    add     esp, 12
-%else
-    add     rsp, 8+32*WIN64
-%endif
-    POP     r1
-    POP     r3
-    lea     r0, [r1 + 2*r3]
-    mov     r2, r3
-    sub     r3, mmsize
-    neg     r2
-    mova    m2, [ps_m1m1m1m1]
-.loop:
-%if mmsize == 8
-    PSWAPD  m0, [r1 + r3]
-    PSWAPD  m1, [r0 + r2]
-    pxor    m0, m2
-%else
-    mova    m0, [r1 + r3]
-    mova    m1, [r0 + r2]
-    shufps  m0, m0, 0x1b
-    shufps  m1, m1, 0x1b
-    xorps   m0, m2
-%endif
-    mova [r0 + r3], m1
-    mova [r1 + r2], m0
-    sub     r3, mmsize
-    add     r2, mmsize
-    jl      .loop
-%if cpuflag(3dnow)
-    femms
-    RET
-%else
-    REP_RET
-%endif
-%endmacro
-
-%if ARCH_X86_32
-INIT_MMX 3dnow
-IMDCT_CALC_FUNC
-INIT_MMX 3dnowext
-IMDCT_CALC_FUNC
-%endif
-
-INIT_XMM sse
-IMDCT_CALC_FUNC
-
 INIT_XMM sse
 %undef mulps
 %undef addps
@@ -994,7 +999,7 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i
     sub   r4, r3
 %endif
 %if notcpuflag(3dnowext) && mmsize == 8
-    movd  m7, [ps_m1m1m1m1]
+    movd  m7, [ps_neg]
 %endif
 .pre:
 %if ARCH_X86_64 == 0
@@ -1082,6 +1087,7 @@ DECL_IMDCT POSROTATESHUF_3DNOW
 %endif
 
 INIT_YMM avx
-DECL_IMDCT POSROTATESHUF_AVX
 
-%endif ; CONFIG_MDCT
+%if HAVE_AVX_EXTERNAL
+DECL_IMDCT POSROTATESHUF_AVX
+%endif
diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h
index 94405d0..398091e 100644
--- a/libavcodec/x86/fft.h
+++ b/libavcodec/x86/fft.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,4 +27,12 @@ void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_3dnow(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_3dnowext(FFTContext *s, FFTComplex *z);
 
+void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_calc_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
+
 #endif /* AVCODEC_X86_FFT_H */
diff --git a/libavcodec/x86/fft_init.c b/libavcodec/x86/fft_init.c
index ed12909..928f1dc 100644
--- a/libavcodec/x86/fft_init.c
+++ b/libavcodec/x86/fft_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,23 +28,33 @@ av_cold void ff_fft_init_x86(FFTContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
 
+    if (s->nbits > 16)
+        return;
+
 #if ARCH_X86_32
     if (EXTERNAL_AMD3DNOW(cpu_flags)) {
+        s->imdct_calc = ff_imdct_calc_3dnow;
+        s->imdct_half = ff_imdct_half_3dnow;
         s->fft_calc   = ff_fft_calc_3dnow;
     }
 
     if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) {
+        s->imdct_calc = ff_imdct_calc_3dnowext;
+        s->imdct_half = ff_imdct_half_3dnowext;
         s->fft_calc   = ff_fft_calc_3dnowext;
     }
 #endif /* ARCH_X86_32 */
 
     if (EXTERNAL_SSE(cpu_flags)) {
+        s->imdct_calc  = ff_imdct_calc_sse;
+        s->imdct_half  = ff_imdct_half_sse;
         s->fft_permute = ff_fft_permute_sse;
         s->fft_calc    = ff_fft_calc_sse;
         s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
     }
 
     if (EXTERNAL_AVX_FAST(cpu_flags) && s->nbits >= 5) {
+        s->imdct_half      = ff_imdct_half_avx;
         s->fft_calc        = ff_fft_calc_avx;
         s->fft_permutation = FF_FFT_PERM_AVX;
     }
diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
new file mode 100644
index 0000000..e285158
--- /dev/null
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -0,0 +1,101 @@
+;******************************************************************************
+;* FLAC DSP functions
+;*
+;* Copyright (c) 2014 James Darnley <james.darnley@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+INIT_XMM sse4
+%if ARCH_X86_64
+    cglobal flac_enc_lpc_16, 5, 7, 8, 0, res, smp, len, order, coefs
+    DECLARE_REG_TMP 5, 6
+    %define length r2d
+
+    movsxd orderq, orderd ; sign-extend order for use in 64-bit addressing
+%else
+    cglobal flac_enc_lpc_16, 5, 6, 8, 0, res, smp, len, order, coefs
+    DECLARE_REG_TMP 2, 5 ; r2 doubles as a temporary: length is used from r2mp
+    %define length r2mp
+%endif
+
+; Here we assume that the maximum order value is 32.  This means that we only
+; need to copy a maximum of 32 samples.  Therefore we let the preprocessor
+; unroll this loop and copy all 32, whatever the actual order is.
+%assign iter 0
+%rep 32/(mmsize/4)
+    movu  m0,         [smpq+iter]
+    movu [resq+iter],  m0
+    %assign iter iter+mmsize
+%endrep
+
+lea  resq,   [resq+orderq*4]     ; skip the first `order` samples
+lea  smpq,   [smpq+orderq*4]
+lea  coefsq, [coefsq+orderq*4]   ; end of coefs, indexed with negative posj
+sub  length,  orderd             ; samples left to predict
+movd m3,      r5m                ; m3 = shift (sixth argument)
+neg  orderq                      ; -order: start value of posj
+
+%define posj t0q
+%define negj t1q
+
+.looplen:
+    pxor m0,   m0                ; m0/m4/m6: accumulators for 3 vectors of predictions
+    pxor m4,   m4
+    pxor m6,   m6
+    mov  posj, orderq            ; posj counts from -order up to 0
+    xor  negj, negj              ; negj counts from 0 downwards
+
+    .looporder:
+        movd   m2, [coefsq+posj*4] ; c = coefs[j]
+        SPLATD m2                  ; broadcast c to every lane
+        movu   m1, [smpq+negj*4-4] ; s = smp[i-j-1]
+        movu   m5, [smpq+negj*4-4+mmsize]
+        movu   m7, [smpq+negj*4-4+mmsize*2]
+        pmulld m1,  m2
+        pmulld m5,  m2
+        pmulld m7,  m2
+        paddd  m0,  m1             ; p += c * s
+        paddd  m4,  m5
+        paddd  m6,  m7
+
+        dec    negj
+        inc    posj
+    jnz .looporder                 ; flags from `inc posj`: stop when it reaches 0
+
+    psrad  m0,     m3              ; p >>= shift
+    psrad  m4,     m3
+    psrad  m6,     m3
+    movu   m1,    [smpq]
+    movu   m5,    [smpq+mmsize]
+    movu   m7,    [smpq+mmsize*2]
+    psubd  m1,     m0              ; smp[i] - p
+    psubd  m5,     m4
+    psubd  m7,     m6
+    movu  [resq],  m1              ; res[i] = smp[i] - (p >> shift)
+    movu  [resq+mmsize], m5
+    movu  [resq+mmsize*2], m7
+
+    add resq,    3*mmsize
+    add smpq,    3*mmsize
+    sub length, (3*mmsize)/4       ; 3 vectors of mmsize/4 samples per iteration
+jg .looplen
+RET
diff --git a/libavcodec/x86/flacdsp.asm b/libavcodec/x86/flacdsp.asm
new file mode 100644
index 0000000..7138611
--- /dev/null
+++ b/libavcodec/x86/flacdsp.asm
@@ -0,0 +1,313 @@
+;******************************************************************************
+;* FLAC DSP SIMD optimizations
+;*
+;* Copyright (C) 2014 Loren Merritt
+;* Copyright (C) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro PMACSDQL 5
+%if cpuflag(xop)
+    pmacsdql %1, %2, %3, %1 ; %1 += signed 64-bit product of the low dwords of %2, %3
+%else
+    pmuldq   %2, %3         ; same product, but computed in place (clobbers %2)
+    paddq    %1, %2         ; %1 += product
+%endif
+%endmacro ; PMACSDQL (arguments %4 and %5 are accepted but not used by the body)
+
+%macro LPC_32 1
+INIT_XMM %1
+cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
+    sub    lend, pred_orderd                     ; len -= pred_order
+    jle .ret                                     ; nothing left to predict
+    lea    decodedq, [decodedq+pred_orderq*4-8]  ; biased by -8: the loop adds 8 first
+    lea    coeffsq, [coeffsq+pred_orderq*4]      ; end of coeffs, indexed negatively
+    neg    pred_orderq
+    movd   m4, qlevelm                           ; m4 = qlevel (shift count)
+ALIGN 16
+.loop_sample:                                    ; two output samples per iteration
+    movd   m0, [decodedq+pred_orderq*4+8]
+    add    decodedq, 8
+    movd   m1, [coeffsq+pred_orderq*4]
+    pxor   m2, m2                                ; m2: sum for the first sample
+    pxor   m3, m3                                ; m3: sum for the second sample
+    lea    jq, [pred_orderq+1]
+    test   jq, jq
+    jz .end_order                                ; pred_order == 1: no inner loop
+.loop_order:
+    PMACSDQL m2, m0, m1, m2, m0
+    movd   m0, [decodedq+jq*4]
+    PMACSDQL m3, m1, m0, m3, m1
+    movd   m1, [coeffsq+jq*4]
+    inc    jq
+    jl .loop_order                               ; while j < 0
+.end_order:
+    PMACSDQL m2, m0, m1, m2, m0
+    psrlq  m2, m4                                ; sum >>= qlevel; low dword is used
+    movd   m0, [decodedq]
+    paddd  m0, m2
+    movd   [decodedq], m0
+    sub  lend, 2
+    jl .ret                                      ; only one sample was left
+    PMACSDQL m3, m1, m0, m3, m1                  ; last term uses the sample just stored
+    psrlq  m3, m4
+    movd   m1, [decodedq+4]
+    paddd  m1, m3
+    movd   [decodedq+4], m1
+    jg .loop_sample                              ; flags still from `sub lend, 2`
+.ret:
+    REP_RET
+%endmacro
+
+%if HAVE_XOP_EXTERNAL
+LPC_32 xop
+%endif
+LPC_32 sse4
+
+;----------------------------------------------------------------------------------
+;void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels,
+;                                        int len, int shift);
+;----------------------------------------------------------------------------------
+%macro FLAC_DECORRELATE_16 3-4
+cglobal flac_decorrelate_%1_16, 2, 4, 4, out, in0, in1, len
+%if ARCH_X86_32
+    mov      lend, lenm          ; cglobal only loaded the first two arguments
+%endif
+    movd       m3, r4m           ; m3 = shift (fifth argument)
+    shl      lend, 2             ; len *= 4
+    mov      in1q, [in0q + gprsize] ; in1 = in[1]
+    mov      in0q, [in0q]        ; in0 = in[0]
+    mov      outq, [outq]        ; out = out[0]
+    add      in1q, lenq          ; point all three at their end and index them
+    add      in0q, lenq          ; with a negative offset counting up to zero
+    add      outq, lenq
+    neg      lenq
+
+align 16
+.loop:
+    mova       m0, [in0q + lenq]
+    mova       m1, [in1q + lenq]
+%ifidn %1, ms
+    psrad      m2, m1, 1         ; ms only: m0 -= m1 >> 1
+    psubd      m0, m2
+%endif
+%ifnidn %1, indep2
+    p%4d       m2, m0, m1        ; m2 = m0 +/- m1, %4 selects add or sub
+%endif
+    packssdw  m%2, m%2           ; %2/%3: registers of the two output channels,
+    packssdw  m%3, m%3           ; narrowed to 16 bits with signed saturation
+    punpcklwd m%2, m%3           ; interleave the two channels word by word
+    psllw     m%2, m3            ; << shift
+    mova [outq + lenq], m%2
+    add      lenq, 16
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+FLAC_DECORRELATE_16 ls, 0, 2, sub
+FLAC_DECORRELATE_16 rs, 2, 1, add
+FLAC_DECORRELATE_16 ms, 2, 0, add
+
+;----------------------------------------------------------------------------------
+;void ff_flac_decorrelate_[lrm]s_32_sse2(uint8_t **out, int32_t **in, int channels,
+;                                        int len, int shift);
+;----------------------------------------------------------------------------------
+%macro FLAC_DECORRELATE_32 5
+cglobal flac_decorrelate_%1_32, 2, 4, 4, out, in0, in1, len
+%if ARCH_X86_32
+    mov      lend, lenm          ; cglobal only loaded the first two arguments
+%endif
+    movd       m3, r4m           ; m3 = shift (fifth argument)
+    mov      in1q, [in0q + gprsize] ; in1 = in[1]
+    mov      in0q, [in0q]        ; in0 = in[0]
+    mov      outq, [outq]        ; out = out[0]
+    sub      in1q, in0q          ; in1 becomes an offset relative to in0
+
+align 16
+.loop:
+    mova       m0, [in0q]
+    mova       m1, [in0q + in1q]
+%ifidn %1, ms
+    psrad      m2, m1, 1         ; ms only: m0 -= m1 >> 1
+    psubd      m0, m2
+%endif
+    p%5d       m2, m0, m1        ; m2 = m0 +/- m1, %5 selects add or sub
+    pslld     m%2, m3            ; %2/%3: registers of the two output channels
+    pslld     m%3, m3
+
+    SBUTTERFLY dq, %2, %3, %4    ; interleave dwords of m%2/m%3; %4 is a scratch register
+
+    mova  [outq         ], m%2
+    mova  [outq + mmsize], m%3
+
+    add      in0q, mmsize        ; advances both inputs, in1q being an offset
+    add      outq, mmsize*2
+    sub      lend, mmsize/4      ; mmsize/4 samples per channel per iteration
+    jg .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+FLAC_DECORRELATE_32 ls, 0, 2, 1, sub
+FLAC_DECORRELATE_32 rs, 2, 1, 0, add
+FLAC_DECORRELATE_32 ms, 2, 0, 1, add
+
+;-----------------------------------------------------------------------------------------
+;void ff_flac_decorrelate_indep<ch>_<bps>_<opt>(uint8_t **out, int32_t **in, int channels,
+;                                            int len, int shift);
+;-----------------------------------------------------------------------------------------
+;%1 = bps
+;%2 = channels
+;%3 = last xmm reg used
+;%4 = word/dword (shift instruction)
+%macro FLAC_DECORRELATE_INDEP 4
+%define REPCOUNT %2/(32/%1) ; 16bits = channels / 2; 32bits = channels
+cglobal flac_decorrelate_indep%2_%1, 2, %2+2, %3+1, out, in0, in1, len, in2, in3, in4, in5, in6, in7
+%if ARCH_X86_32
+%if %2 == 6
+    DEFINE_ARGS out, in0, in1, in2, in3, in4, in5 ; registers go to out/in0-in5 only
+    %define  lend  dword r3m     ; len is used directly from its stack slot
+%else
+    mov      lend, lenm          ; cglobal only loaded the first two arguments
+%endif
+%endif
+    movd      m%3, r4m           ; m%3 = shift (fifth argument)
+
+%assign %%i 1
+%rep %2-1
+    mov      in %+ %%i %+ q, [in0q+%%i*gprsize] ; in<i> = in[i]
+%assign %%i %%i+1
+%endrep
+
+    mov      in0q, [in0q]        ; in0 = in[0]
+    mov      outq, [outq]        ; out = out[0]
+
+%assign %%i 1
+%rep %2-1
+    sub      in %+ %%i %+ q, in0q ; in<i> becomes an offset relative to in0
+%assign %%i %%i+1
+%endrep
+
+align 16
+.loop:
+    mova       m0, [in0q]
+
+%assign %%i 1
+%rep REPCOUNT-1
+    mova     m %+ %%i, [in0q + in %+ %%i %+ q] ; channels 1..REPCOUNT-1
+%assign %%i %%i+1
+%endrep
+
+%if %1 == 32
+
+%if %2 == 8
+    TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8
+%elif %2 == 6
+    SBUTTERFLY dq, 0, 1, 6
+    SBUTTERFLY dq, 2, 3, 6
+    SBUTTERFLY dq, 4, 5, 6
+
+    punpcklqdq m6, m0, m2
+    punpckhqdq m2, m4
+    shufps     m4, m0, 0xe4
+    punpcklqdq m0, m1, m3
+    punpckhqdq m3, m5
+    shufps     m5, m1, 0xe4
+    SWAP 0,6,1,4,5,3             ; rename registers so m0..m5 are in output order
+%elif %2 == 4
+    TRANSPOSE4x4D 0, 1, 2, 3, 4
+%else ; %2 == 2
+    SBUTTERFLY dq, 0, 1, 2
+%endif
+
+%else ; %1 == 16
+
+%if %2 == 8
+    packssdw   m0, [in0q + in4q] ; remaining channels are read as memory operands
+    packssdw   m1, [in0q + in5q] ; while narrowing to 16 bits (signed saturation)
+    packssdw   m2, [in0q + in6q]
+    packssdw   m3, [in0q + in7q]
+    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
+%elif %2 == 6
+    packssdw   m0, [in0q + in3q]
+    packssdw   m1, [in0q + in4q]
+    packssdw   m2, [in0q + in5q]
+    pshufd     m3, m0,     q1032
+    punpcklwd  m0, m1
+    punpckhwd  m1, m2
+    punpcklwd  m2, m3
+
+    shufps     m3, m0, m2, q2020
+    shufps     m0, m1,     q2031
+    shufps     m2, m1,     q3131
+    shufps     m1, m2, m3, q3120
+    shufps     m3, m0,     q0220
+    shufps     m0, m2,     q3113
+    SWAP 2, 0, 3                 ; rename registers so m0..m2 are in output order
+%else ; %2 == 4
+    packssdw   m0, [in0q + in2q]
+    packssdw   m1, [in0q + in3q]
+    SBUTTERFLY wd, 0, 1, 2
+    SBUTTERFLY dq, 0, 1, 2
+%endif
+
+%endif
+
+%assign %%i 0
+%rep REPCOUNT
+    psll%4   m %+ %%i, m%3       ; << shift, per word or dword (%4)
+%assign %%i %%i+1
+%endrep
+
+%assign %%i 0
+%rep REPCOUNT
+    mova [outq + %%i*mmsize], m %+ %%i
+%assign %%i %%i+1
+%endrep
+
+    add      in0q, mmsize        ; advances every channel, the others being offsets
+    add      outq, mmsize*REPCOUNT
+    sub      lend, mmsize/4      ; mmsize/4 samples per channel per iteration
+    jg .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+FLAC_DECORRELATE_16 indep2, 0, 1 ; Reuse stereo 16bits macro
+FLAC_DECORRELATE_INDEP 32, 2, 3, d
+FLAC_DECORRELATE_INDEP 16, 4, 3, w
+FLAC_DECORRELATE_INDEP 32, 4, 5, d
+FLAC_DECORRELATE_INDEP 16, 6, 4, w
+FLAC_DECORRELATE_INDEP 32, 6, 7, d
+%if ARCH_X86_64
+FLAC_DECORRELATE_INDEP 16, 8, 5, w
+FLAC_DECORRELATE_INDEP 32, 8, 9, d
+%endif
+
+INIT_XMM avx
+FLAC_DECORRELATE_INDEP 32, 4, 5, d
+FLAC_DECORRELATE_INDEP 32, 6, 7, d
+%if ARCH_X86_64
+FLAC_DECORRELATE_INDEP 16, 8, 5, w
+FLAC_DECORRELATE_INDEP 32, 8, 9, d
+%endif
diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
new file mode 100644
index 0000000..e28c5c9
--- /dev/null
+++ b/libavcodec/x86/flacdsp_init.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2014 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/flacdsp.h"
+#include "libavutil/x86/cpu.h"
+#include "config.h"
+
+void ff_flac_lpc_32_sse4(int32_t *samples, const int coeffs[32], int order,
+                         int qlevel, int len);
+void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
+                        int qlevel, int len);
+
+void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const int32_t *,int);
+
+#define DECORRELATE_FUNCS(fmt, opt)                                                      \
+void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
+                                          int len, int shift);                           \
+void ff_flac_decorrelate_rs_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
+                                          int len, int shift);                           \
+void ff_flac_decorrelate_ms_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
+                                          int len, int shift);                           \
+void ff_flac_decorrelate_indep2_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+                                             int len, int shift);                        \
+void ff_flac_decorrelate_indep4_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+                                              int len, int shift);                       \
+void ff_flac_decorrelate_indep6_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+                                              int len, int shift);                       \
+void ff_flac_decorrelate_indep8_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
+                                              int len, int shift)
+
+DECORRELATE_FUNCS(16, sse2);
+DECORRELATE_FUNCS(16,  avx);
+DECORRELATE_FUNCS(32, sse2);
+DECORRELATE_FUNCS(32,  avx);
+
+av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int channels,
+                                 int bps) /* bps is not used in this function */
+{
+#if HAVE_YASM
+    int cpu_flags = av_get_cpu_flags();
+
+#if CONFIG_FLAC_DECODER
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        if (fmt == AV_SAMPLE_FMT_S16) {
+            if (channels == 2) /* decorrelate[0]: one version per channel count */
+                c->decorrelate[0] = ff_flac_decorrelate_indep2_16_sse2;
+            else if (channels == 4)
+                c->decorrelate[0] = ff_flac_decorrelate_indep4_16_sse2;
+            else if (channels == 6)
+                c->decorrelate[0] = ff_flac_decorrelate_indep6_16_sse2;
+            else if (ARCH_X86_64 && channels == 8) /* only assembled for x86-64 */
+                c->decorrelate[0] = ff_flac_decorrelate_indep8_16_sse2;
+            c->decorrelate[1] = ff_flac_decorrelate_ls_16_sse2;
+            c->decorrelate[2] = ff_flac_decorrelate_rs_16_sse2;
+            c->decorrelate[3] = ff_flac_decorrelate_ms_16_sse2;
+        } else if (fmt == AV_SAMPLE_FMT_S32) {
+            if (channels == 2)
+                c->decorrelate[0] = ff_flac_decorrelate_indep2_32_sse2;
+            else if (channels == 4)
+                c->decorrelate[0] = ff_flac_decorrelate_indep4_32_sse2;
+            else if (channels == 6)
+                c->decorrelate[0] = ff_flac_decorrelate_indep6_32_sse2;
+            else if (ARCH_X86_64 && channels == 8) /* only assembled for x86-64 */
+                c->decorrelate[0] = ff_flac_decorrelate_indep8_32_sse2;
+            c->decorrelate[1] = ff_flac_decorrelate_ls_32_sse2;
+            c->decorrelate[2] = ff_flac_decorrelate_rs_32_sse2;
+            c->decorrelate[3] = ff_flac_decorrelate_ms_32_sse2;
+        }
+    }
+    if (EXTERNAL_SSE4(cpu_flags)) {
+        c->lpc32 = ff_flac_lpc_32_sse4;
+    }
+    if (EXTERNAL_AVX(cpu_flags)) { /* overrides the SSE2 choice for the cases below */
+        if (fmt == AV_SAMPLE_FMT_S16) {
+            if (ARCH_X86_64 && channels == 8)
+                c->decorrelate[0] = ff_flac_decorrelate_indep8_16_avx;
+        } else if (fmt == AV_SAMPLE_FMT_S32) {
+            if (channels == 4)
+                c->decorrelate[0] = ff_flac_decorrelate_indep4_32_avx;
+            else if (channels == 6)
+                c->decorrelate[0] = ff_flac_decorrelate_indep6_32_avx;
+            else if (ARCH_X86_64 && channels == 8)
+                c->decorrelate[0] = ff_flac_decorrelate_indep8_32_avx;
+        }
+    }
+    if (EXTERNAL_XOP(cpu_flags)) { /* checked after SSE4, so XOP wins when both are set */
+        c->lpc32 = ff_flac_lpc_32_xop;
+    }
+#endif
+
+#if CONFIG_FLAC_ENCODER
+    if (EXTERNAL_SSE4(cpu_flags)) {
+        if (CONFIG_GPL) /* the SSE4 encoder LPC lives in the GPL-licensed flac_dsp_gpl.asm */
+            c->lpc16_encode = ff_flac_enc_lpc_16_sse4;
+    }
+#endif
+#endif /* HAVE_YASM */
+}
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index 2a3e4a5..8f62a0a 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -2,20 +2,20 @@
 ;* x86 optimized Format Conversion Utils
 ;* Copyright (c) 2008 Loren Merritt
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -75,3 +75,50 @@ INIT_XMM sse
 INT32_TO_FLOAT_FMUL_SCALAR 5
 INIT_XMM sse2
 INT32_TO_FLOAT_FMUL_SCALAR 3
+
+;------------------------------------------------------------------------------
+; void ff_int32_to_float_fmul_array8(FmtConvertContext *c, float *dst, const int32_t *src,
+;                                    const float *mul, int len);
+;------------------------------------------------------------------------------
+%macro INT32_TO_FLOAT_FMUL_ARRAY8 0
+cglobal int32_to_float_fmul_array8, 5, 5, 5, c, dst, src, mul, len
+    shl     lend, 2                ; len *= 4: element count -> byte count
+    add     srcq, lenq             ; point src/dst at their end and index them
+    add     dstq, lenq             ; with a negative offset counting up to zero
+    neg     lenq
+.loop:
+    movss     m0, [mulq]           ; one multiplier per group of 8 samples
+    SPLATD    m0                   ; broadcast it to every lane
+%if cpuflag(sse2)
+    cvtdq2ps  m1, [srcq+lenq   ]
+    cvtdq2ps  m2, [srcq+lenq+16]
+%else
+    cvtpi2ps  m1, [srcq+lenq   ]   ; cvtpi2ps converts two ints at a time,
+    cvtpi2ps  m3, [srcq+lenq+ 8]   ; so each vector is built from two halves
+    cvtpi2ps  m2, [srcq+lenq+16]
+    cvtpi2ps  m4, [srcq+lenq+24]
+    movlhps   m1, m3
+    movlhps   m2, m4
+%endif
+    mulps     m1, m0
+    mulps     m2, m0
+    mova  [dstq+lenq   ], m1
+    mova  [dstq+lenq+16], m2
+    add     mulq, 4                ; next multiplier
+    add     lenq, 32               ; 8 samples (32 bytes) done
+    jl .loop
+%if notcpuflag(sse2)
+    ;; cvtpi2ps is documented to switch to MMX state even if the source is a
+    ;; memory location. This is possibly an error in the documentation, since
+    ;; every tested CPU disagrees with that. Use emms anyway, since the vast
+    ;; majority of machines will use the SSE2 variant
+    emms
+%endif
+    RET
+%endmacro
+
+INIT_XMM sse
+INT32_TO_FLOAT_FMUL_ARRAY8
+INIT_XMM sse2
+INT32_TO_FLOAT_FMUL_ARRAY8
+
diff --git a/libavcodec/x86/fmtconvert_init.c b/libavcodec/x86/fmtconvert_init.c
index 1871b47..e4cbadc 100644
--- a/libavcodec/x86/fmtconvert_init.c
+++ b/libavcodec/x86/fmtconvert_init.c
@@ -5,20 +5,20 @@
  *
  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,10 @@
 
 void ff_int32_to_float_fmul_scalar_sse (float *dst, const int32_t *src, float mul, int len);
 void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int32_t *src, float mul, int len);
+void ff_int32_to_float_fmul_array8_sse (FmtConvertContext *c, float *dst, const int32_t *src,
+                                        const float *mul, int len);
+void ff_int32_to_float_fmul_array8_sse2(FmtConvertContext *c, float *dst, const int32_t *src,
+                                        const float *mul, int len);
 
 #endif /* HAVE_YASM */
 
@@ -42,9 +46,11 @@ av_cold void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx
 
     if (EXTERNAL_SSE(cpu_flags)) {
         c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
+        c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_sse;
     }
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2;
+        c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_sse2;
     }
 #endif /* HAVE_YASM */
 }
diff --git a/libavcodec/x86/fpel.asm b/libavcodec/x86/fpel.asm
index b581471..0e3b444 100644
--- a/libavcodec/x86/fpel.asm
+++ b/libavcodec/x86/fpel.asm
@@ -4,20 +4,20 @@
 ;* Copyright (c) 2003-2013 Michael Niedermayer
 ;* Copyright (c) 2013 Daniel Kang
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -25,85 +25,83 @@
 
 SECTION .text
 
-INIT_MMX mmxext
+%macro PAVGB_MMX 4
+    LOAD   %3, %1
+    por    %3, %2
+    pxor   %2, %1
+    pand   %2, %4
+    psrlq  %2, 1
+    psubb  %3, %2
+    SWAP   %2, %3
+%endmacro
+
 ; void ff_put/avg_pixels(uint8_t *block, const uint8_t *pixels,
 ;                        ptrdiff_t line_size, int h)
-%macro PIXELS48 2
-%if %2 == 4
-%define OP movh
+%macro OP_PIXELS 2
+%if %2 == mmsize/2
+%define LOAD movh
+%define SAVE movh
+%define LEN  mmsize
 %else
-%define OP mova
+%define LOAD movu
+%define SAVE mova
+%define LEN  %2
 %endif
-cglobal %1_pixels%2, 4,5
+cglobal %1_pixels%2, 4,5,4
     movsxdifnidn r2, r2d
     lea          r4, [r2*3]
+%ifidn %1, avg
+%if notcpuflag(mmxext)
+    pcmpeqd      m6, m6
+    paddb        m6, m6
+%endif
+%endif
 .loop:
-    OP           m0, [r1]
-    OP           m1, [r1+r2]
-    OP           m2, [r1+r2*2]
-    OP           m3, [r1+r4]
-    lea          r1, [r1+r2*4]
+%assign %%i 0
+%rep LEN/mmsize
+    LOAD         m0, [r1 + %%i]
+    LOAD         m1, [r1+r2 + %%i]
+    LOAD         m2, [r1+r2*2 + %%i]
+    LOAD         m3, [r1+r4 + %%i]
 %ifidn %1, avg
-    pavgb        m0, [r0]
-    pavgb        m1, [r0+r2]
-    pavgb        m2, [r0+r2*2]
-    pavgb        m3, [r0+r4]
+%if notcpuflag(mmxext)
+    PAVGB_MMX    [r0 + %%i], m0, m4, m6
+    PAVGB_MMX    [r0+r2 + %%i], m1, m5, m6
+    PAVGB_MMX    [r0+r2*2 + %%i], m2, m4, m6
+    PAVGB_MMX    [r0+r4 + %%i], m3, m5, m6
+%else
+    pavgb        m0, [r0 + %%i]
+    pavgb        m1, [r0+r2 + %%i]
+    pavgb        m2, [r0+r2*2 + %%i]
+    pavgb        m3, [r0+r4 + %%i]
+%endif
 %endif
-    OP         [r0], m0
-    OP      [r0+r2], m1
-    OP    [r0+r2*2], m2
-    OP      [r0+r4], m3
+    SAVE       [r0 + %%i], m0
+    SAVE    [r0+r2 + %%i], m1
+    SAVE  [r0+r2*2 + %%i], m2
+    SAVE    [r0+r4 + %%i], m3
+%assign %%i %%i+mmsize
+%endrep
     sub         r3d, 4
+    lea          r1, [r1+r2*4]
     lea          r0, [r0+r2*4]
     jne       .loop
     RET
 %endmacro
 
-PIXELS48 put, 4
-PIXELS48 avg, 4
-PIXELS48 put, 8
-PIXELS48 avg, 8
+INIT_MMX mmx
+OP_PIXELS put, 4
+OP_PIXELS avg, 4
+OP_PIXELS put, 8
+OP_PIXELS avg, 8
+OP_PIXELS put, 16
+OP_PIXELS avg, 16
 
+INIT_MMX mmxext
+OP_PIXELS avg, 4
+OP_PIXELS avg, 8
+OP_PIXELS avg, 16
 
 INIT_XMM sse2
-; void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
-;                           ptrdiff_t line_size, int h)
-cglobal put_pixels16, 4,5,4
-    lea          r4, [r2*3]
-.loop:
-    movu         m0, [r1]
-    movu         m1, [r1+r2]
-    movu         m2, [r1+r2*2]
-    movu         m3, [r1+r4]
-    lea          r1, [r1+r2*4]
-    mova       [r0], m0
-    mova    [r0+r2], m1
-    mova  [r0+r2*2], m2
-    mova    [r0+r4], m3
-    sub         r3d, 4
-    lea          r0, [r0+r2*4]
-    jnz       .loop
-    REP_RET
-
-; void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
-;                           ptrdiff_t line_size, int h)
-cglobal avg_pixels16, 4,5,4
-    lea          r4, [r2*3]
-.loop:
-    movu         m0, [r1]
-    movu         m1, [r1+r2]
-    movu         m2, [r1+r2*2]
-    movu         m3, [r1+r4]
-    lea          r1, [r1+r2*4]
-    pavgb        m0, [r0]
-    pavgb        m1, [r0+r2]
-    pavgb        m2, [r0+r2*2]
-    pavgb        m3, [r0+r4]
-    mova       [r0], m0
-    mova    [r0+r2], m1
-    mova  [r0+r2*2], m2
-    mova    [r0+r4], m3
-    sub         r3d, 4
-    lea          r0, [r0+r2*4]
-    jnz       .loop
-    REP_RET
+OP_PIXELS put, 16
+OP_PIXELS avg, 16
diff --git a/libavcodec/x86/fpel.h b/libavcodec/x86/fpel.h
index 88d1415..4e83cf7 100644
--- a/libavcodec/x86/fpel.h
+++ b/libavcodec/x86/fpel.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -22,18 +22,24 @@
 #include <stddef.h>
 #include <stdint.h>
 
+void ff_avg_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int h);
+void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
+                           ptrdiff_t line_size, int h);
 void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int h);
 void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
 void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
+void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h);
 void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h);
+void ff_put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int h);
 void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int h);
-void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
-                           ptrdiff_t line_size, int h);
 void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
 void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
diff --git a/libavcodec/x86/fpel_mmx.c b/libavcodec/x86/fpel_mmx.c
deleted file mode 100644
index eef05ec..0000000
--- a/libavcodec/x86/fpel_mmx.c
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * MMX-optimized avg/put pixel routines
- *
- * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "config.h"
-#include "fpel.h"
-#include "inline_asm.h"
-
-#if HAVE_MMX_INLINE
-
-// in case more speed is needed - unrolling would certainly help
-void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
-                        ptrdiff_t line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    JUMPALIGN();
-    do {
-        __asm__ volatile(
-             "movq  %0, %%mm0           \n\t"
-             "movq  %1, %%mm1           \n\t"
-             PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
-             "movq  %%mm2, %0           \n\t"
-             :"+m"(*block)
-             :"m"(*pixels)
-             :"memory");
-        pixels += line_size;
-        block += line_size;
-    }
-    while (--h);
-}
-
-void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
-                         ptrdiff_t line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    JUMPALIGN();
-    do {
-        __asm__ volatile(
-             "movq  %0, %%mm0           \n\t"
-             "movq  %1, %%mm1           \n\t"
-             PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
-             "movq  %%mm2, %0           \n\t"
-             "movq  8%0, %%mm0          \n\t"
-             "movq  8%1, %%mm1          \n\t"
-             PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
-             "movq  %%mm2, 8%0          \n\t"
-             :"+m"(*block)
-             :"m"(*pixels)
-             :"memory");
-        pixels += line_size;
-        block += line_size;
-    }
-    while (--h);
-}
-
-void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
-                        ptrdiff_t line_size, int h)
-{
-    __asm__ volatile (
-        "lea   (%3, %3), %%"REG_a"      \n\t"
-        ".p2align     3                 \n\t"
-        "1:                             \n\t"
-        "movq  (%1    ), %%mm0          \n\t"
-        "movq  (%1, %3), %%mm1          \n\t"
-        "movq     %%mm0, (%2)           \n\t"
-        "movq     %%mm1, (%2, %3)       \n\t"
-        "add  %%"REG_a", %1             \n\t"
-        "add  %%"REG_a", %2             \n\t"
-        "movq  (%1    ), %%mm0          \n\t"
-        "movq  (%1, %3), %%mm1          \n\t"
-        "movq     %%mm0, (%2)           \n\t"
-        "movq     %%mm1, (%2, %3)       \n\t"
-        "add  %%"REG_a", %1             \n\t"
-        "add  %%"REG_a", %2             \n\t"
-        "subl        $4, %0             \n\t"
-        "jnz         1b                 \n\t"
-        : "+g"(h), "+r"(pixels),  "+r"(block)
-        : "r"((x86_reg)line_size)
-        : "%"REG_a, "memory"
-        );
-}
-
-void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
-                         ptrdiff_t line_size, int h)
-{
-    __asm__ volatile (
-        "lea   (%3, %3), %%"REG_a"      \n\t"
-        ".p2align     3                 \n\t"
-        "1:                             \n\t"
-        "movq  (%1    ), %%mm0          \n\t"
-        "movq 8(%1    ), %%mm4          \n\t"
-        "movq  (%1, %3), %%mm1          \n\t"
-        "movq 8(%1, %3), %%mm5          \n\t"
-        "movq     %%mm0,  (%2)          \n\t"
-        "movq     %%mm4, 8(%2)          \n\t"
-        "movq     %%mm1,  (%2, %3)      \n\t"
-        "movq     %%mm5, 8(%2, %3)      \n\t"
-        "add  %%"REG_a", %1             \n\t"
-        "add  %%"REG_a", %2             \n\t"
-        "movq  (%1    ), %%mm0          \n\t"
-        "movq 8(%1    ), %%mm4          \n\t"
-        "movq  (%1, %3), %%mm1          \n\t"
-        "movq 8(%1, %3), %%mm5          \n\t"
-        "movq     %%mm0,  (%2)          \n\t"
-        "movq     %%mm4, 8(%2)          \n\t"
-        "movq     %%mm1,  (%2, %3)      \n\t"
-        "movq     %%mm5, 8(%2, %3)      \n\t"
-        "add  %%"REG_a", %1             \n\t"
-        "add  %%"REG_a", %2             \n\t"
-        "subl        $4, %0             \n\t"
-        "jnz         1b                 \n\t"
-        : "+g"(h), "+r"(pixels),  "+r"(block)
-        : "r"((x86_reg)line_size)
-        : "%"REG_a, "memory"
-        );
-}
-
-#endif /* HAVE_MMX_INLINE */
diff --git a/libavcodec/x86/g722dsp.asm b/libavcodec/x86/g722dsp.asm
new file mode 100644
index 0000000..a529422
--- /dev/null
+++ b/libavcodec/x86/g722dsp.asm
@@ -0,0 +1,54 @@
+;******************************************************************************
+;* SIMD optimized DSP functions for G722 coding
+;*
+;* Copyright (c) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_qmf_coeffs:  dw   3, -210,  -11, -805,  -11,  951,  53, 3876
+pw_qmf_coeffs2: dw  12, 3876, -156,  951,   32, -805, 362, -210
+pw_qmf_coeffs3: dw 362,    0 ,  32,    0, -156,    0,  12,    0
+pw_qmf_coeffs4: dw  53,    0,  -11,    0,  -11,    0,   3,    0
+
+SECTION .text
+
+INIT_XMM sse2
+cglobal g722_apply_qmf, 2, 2, 5, prev, out
+    movu m0, [prevq+mmsize*0]
+    movu m1, [prevq+mmsize*1]
+    movu m2, [prevq+mmsize*2]
+    punpcklwd m3, m0, m1
+    punpckhwd m0, m1
+    punpcklwd m4, m2, m2
+    punpckhwd m2, m2
+    pmaddwd   m3, [pw_qmf_coeffs ]
+    pmaddwd   m0, [pw_qmf_coeffs2]
+    pmaddwd   m4, [pw_qmf_coeffs3]
+    pmaddwd   m2, [pw_qmf_coeffs4]
+    paddd     m0, m3
+    paddd     m2, m4
+    paddd     m0, m2
+    pshufd    m2, m0, q0032
+    paddd     m0, m2
+    pshufd    m0, m0, q0001
+    movq  [outq], m0
+    RET
diff --git a/libavcodec/x86/g722dsp_init.c b/libavcodec/x86/g722dsp_init.c
new file mode 100644
index 0000000..6146951
--- /dev/null
+++ b/libavcodec/x86/g722dsp_init.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2014 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/g722dsp.h"
+
+void ff_g722_apply_qmf_sse2(const int16_t *prev_samples, int xout[2]);
+
+av_cold void ff_g722dsp_init_x86(G722DSPContext *dsp)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE2(cpu_flags))
+        dsp->apply_qmf = ff_g722_apply_qmf_sse2;
+}
diff --git a/libavcodec/x86/h263_loopfilter.asm b/libavcodec/x86/h263_loopfilter.asm
index cd726ba..77c8cf1 100644
--- a/libavcodec/x86/h263_loopfilter.asm
+++ b/libavcodec/x86/h263_loopfilter.asm
@@ -1,20 +1,22 @@
 ;******************************************************************************
 ;* MMX-optimized H.263 loop filter
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/h263dsp_init.c b/libavcodec/x86/h263dsp_init.c
index d4fab98..ab81063 100644
--- a/libavcodec/x86/h263dsp_init.c
+++ b/libavcodec/x86/h263dsp_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2013 Diego Biurrun <diego@biurrun.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/h264_chromamc.asm b/libavcodec/x86/h264_chromamc.asm
index 1447940..fa698e5 100644
--- a/libavcodec/x86/h264_chromamc.asm
+++ b/libavcodec/x86/h264_chromamc.asm
@@ -3,20 +3,20 @@
 ;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
 ;*               2005-2008 Loren Merritt
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/h264_chromamc_10bit.asm b/libavcodec/x86/h264_chromamc_10bit.asm
index 7b00351..c358482 100644
--- a/libavcodec/x86/h264_chromamc_10bit.asm
+++ b/libavcodec/x86/h264_chromamc_10bit.asm
@@ -5,20 +5,20 @@
 ;*
 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -252,8 +252,10 @@ cglobal %1_h264_chroma_mc2_10, 6,7
 %define CHROMAMC_AVG  NOTHING
 INIT_XMM sse2
 CHROMA_MC8 put
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 CHROMA_MC8 put
+%endif
 INIT_MMX mmxext
 CHROMA_MC4 put
 CHROMA_MC2 put
@@ -261,8 +263,10 @@ CHROMA_MC2 put
 %define CHROMAMC_AVG  AVG
 INIT_XMM sse2
 CHROMA_MC8 avg
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 CHROMA_MC8 avg
+%endif
 INIT_MMX mmxext
 CHROMA_MC4 avg
 CHROMA_MC2 avg
diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 33fd5a9..4aabbc0 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -7,20 +7,20 @@
 ;*          Fiona Glaser <fiona@x264.com>
 ;*          Oskar Arvidsson <oskar@irock.se>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -384,8 +384,10 @@ cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
 
 INIT_XMM sse2
 DEBLOCK_LUMA
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_LUMA
+%endif
 
 %else
 
@@ -499,8 +501,10 @@ INIT_MMX mmxext
 DEBLOCK_LUMA v8, 8
 INIT_XMM sse2
 DEBLOCK_LUMA v, 16
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_LUMA v, 16
+%endif
 
 %endif ; ARCH
 
@@ -772,8 +776,10 @@ cglobal deblock_h_luma_intra_8, 2,4,8,0x80
 
 INIT_XMM sse2
 DEBLOCK_LUMA_INTRA v
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_LUMA_INTRA v
+%endif
 %if ARCH_X86_64 == 0
 INIT_MMX mmxext
 DEBLOCK_LUMA_INTRA v8
@@ -836,7 +842,11 @@ cglobal deblock_h_chroma_8, 5,7
     TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
     movq  buf0, m0
     movq  buf1, m3
-    call ff_chroma_inter_body_mmxext
+    LOAD_MASK  r2d, r3d
+    movd       m6, [r4] ; tc0
+    punpcklbw  m6, m6
+    pand       m7, m6
+    DEBLOCK_P0_Q0
     movq  m0, buf0
     movq  m3, buf1
     TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
@@ -854,7 +864,52 @@ ff_chroma_inter_body_mmxext:
     DEBLOCK_P0_Q0
     ret
 
+%define t5 r4
+%define t6 r5
+
+cglobal deblock_h_chroma422_8, 5, 6
+    SUB rsp, (1+ARCH_X86_64*2)*mmsize
+    %if ARCH_X86_64
+        %define buf0 [rsp+16]
+        %define buf1 [rsp+8]
+    %else
+        %define buf0 r0m
+        %define buf1 r2m
+    %endif
+
+    movd m6, [r4]
+    punpcklbw m6, m6
+    movq [rsp], m6
+    CHROMA_H_START
+
+    TRANSPOSE4x8B_LOAD PASS8ROWS(t5, r0, r1, t6)
+    movq buf0, m0
+    movq buf1, m3
+    LOAD_MASK r2d, r3d
+    movd m6, [rsp]
+    punpcklwd m6, m6
+    pand m7, m6
+    DEBLOCK_P0_Q0
+    movq m0, buf0
+    movq m3, buf1
+    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
+
+    lea r0, [r0+r1*8]
+    lea t5, [t5+r1*8]
 
+    TRANSPOSE4x8B_LOAD PASS8ROWS(t5, r0, r1, t6)
+    movq buf0, m0
+    movq buf1, m3
+    LOAD_MASK r2d, r3d
+    movd m6, [rsp+4]
+    punpcklwd m6, m6
+    pand m7, m6
+    DEBLOCK_P0_Q0
+    movq m0, buf0
+    movq m3, buf1
+    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
+    ADD rsp, (1+ARCH_X86_64*2)*mmsize
+RET
 
 ; in: %1=p0 %2=p1 %3=q1
 ; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
@@ -867,9 +922,6 @@ ff_chroma_inter_body_mmxext:
     pavgb   %1, %2             ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
 %endmacro
 
-%define t5 r4
-%define t6 r5
-
 ;------------------------------------------------------------------------------
 ; void ff_deblock_v_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
 ;------------------------------------------------------------------------------
diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm
index d049c62..ebf8a3f 100644
--- a/libavcodec/x86/h264_deblock_10bit.asm
+++ b/libavcodec/x86/h264_deblock_10bit.asm
@@ -7,34 +7,32 @@
 ;*          Loren Merritt <lorenm@u.washington.edu>
 ;*          Fiona Glaser <fiona@x264.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
-
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-
 SECTION .text
 
 cextern pw_2
 cextern pw_3
 cextern pw_4
+cextern pw_1023
+%define pw_pixel_max pw_1023
 
 ; out: %4 = |%1-%2|-%3
 ; clobbers: %5
@@ -418,9 +416,11 @@ cglobal deblock_h_luma_10, 5,7,15
 
 INIT_XMM sse2
 DEBLOCK_LUMA_64
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_LUMA_64
 %endif
+%endif
 
 %macro SWAPMOVA 2
 %ifid %1
@@ -715,8 +715,10 @@ cglobal deblock_h_luma_intra_10, 4,7,16
 
 INIT_XMM sse2
 DEBLOCK_LUMA_INTRA_64
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_LUMA_INTRA_64
+%endif
 
 %endif
 
@@ -802,10 +804,12 @@ DEBLOCK_LUMA_INTRA
 INIT_XMM sse2
 DEBLOCK_LUMA
 DEBLOCK_LUMA_INTRA
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_LUMA
 DEBLOCK_LUMA_INTRA
 %endif
+%endif
 
 ; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
 ; out: %1=p0', %2=q0'
@@ -918,5 +922,7 @@ DEBLOCK_CHROMA
 %endif
 INIT_XMM sse2
 DEBLOCK_CHROMA
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_CHROMA
+%endif
diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h
index fb33e40..4dfbc30 100644
--- a/libavcodec/x86/h264_i386.h
+++ b/libavcodec/x86/h264_i386.h
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,9 +36,15 @@
 
 #if HAVE_INLINE_ASM
 
+#if ARCH_X86_64
+#define REG64 "r"
+#else
+#define REG64 "m"
+#endif
+
 //FIXME use some macros to avoid duplicating get_cabac (cannot be done yet
 //as that would make optimization work hard)
-#if HAVE_7REGS
+#if HAVE_7REGS && !BROKEN_COMPILER
 #define decode_significance decode_significance_x86
 static int decode_significance_x86(CABACContext *c, int max_coeff,
                                    uint8_t *significant_coeff_ctx_base,
@@ -55,6 +61,7 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
     __asm__ volatile(
         "lea   "MANGLE(ff_h264_cabac_tables)", %0      \n\t"
         : "=&r"(tables)
+        : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
     );
 #endif
 
@@ -130,6 +137,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
     __asm__ volatile(
         "lea    "MANGLE(ff_h264_cabac_tables)", %0      \n\t"
         : "=&r"(tables)
+        : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
     );
 #endif
 
@@ -138,7 +146,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
         "3:                                     \n\t"
 
         "mov %10, %0                            \n\t"
-        "movzbl (%0, %6), %k6                   \n\t"
+        "movzb (%0, %6), %6                     \n\t"
         "add %9, %6                             \n\t"
 
         BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
@@ -149,14 +157,14 @@ static int decode_significance_8x8_x86(CABACContext *c,
                              AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
                              "%15")
 
-        "mov %1, %k6                            \n\t"
+        "mov %1, %6                             \n\t"
         "test $1, %4                            \n\t"
         " jz 4f                                 \n\t"
 
 #ifdef BROKEN_RELOCATIONS
-        "movzbl %c14(%15, %q6), %k6\n\t"
+        "movzb %c14(%15, %q6), %6\n\t"
 #else
-        "movzbl "MANGLE(ff_h264_cabac_tables)"+%c14(%k6), %k6\n\t"
+        "movzb "MANGLE(ff_h264_cabac_tables)"+%c14(%6), %6\n\t"
 #endif
         "add %11, %6                            \n\t"
 
@@ -169,8 +177,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
                              "%15")
 
         "mov %2, %0                             \n\t"
-        "mov %1, %k6                            \n\t"
-        "movl %k6, (%0)                         \n\t"
+        "mov %1, %6                             \n\t"
+        "mov %k6, (%0)                          \n\t"
 
         "test $1, %4                            \n\t"
         " jnz 5f                                \n\t"
@@ -178,19 +186,19 @@ static int decode_significance_8x8_x86(CABACContext *c,
         "add"OPSIZE"  $4, %2                    \n\t"
 
         "4:                                     \n\t"
-        "addl $1, %k6                           \n\t"
-        "mov %k6, %1                            \n\t"
-        "cmpl $63, %k6                          \n\t"
+        "add $1, %6                             \n\t"
+        "mov %6, %1                             \n\t"
+        "cmp $63, %6                            \n\t"
         " jb 3b                                 \n\t"
         "mov %2, %0                             \n\t"
-        "movl %k6, (%0)                         \n\t"
+        "mov %k6, (%0)                          \n\t"
         "5:                                     \n\t"
         "addl %8, %k0                           \n\t"
         "shr $2, %k0                            \n\t"
-        : "=&q"(coeff_count), "+m"(last), "+m"(index), "+&r"(c->low),
+        : "=&q"(coeff_count), "+"REG64(last), "+"REG64(index), "+&r"(c->low),
           "=&r"(bit), "+&r"(c->range), "=&r"(state)
         : "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base),
-          "m"(sig_off), "m"(last_coeff_ctx_base),
+          REG64(sig_off), REG64(last_coeff_ctx_base),
           "i"(offsetof(CABACContext, bytestream)),
           "i"(offsetof(CABACContext, bytestream_end)),
           "i"(H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET) TABLES_ARG
@@ -198,7 +206,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
     );
     return coeff_count;
 }
-#endif /* HAVE_7REGS && !defined(BROKEN_RELOCATIONS) */
+#endif /* HAVE_7REGS && BROKEN_COMPILER */
 
 #endif /* HAVE_INLINE_ASM */
 #endif /* AVCODEC_X86_H264_I386_H */
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 313791a..7fafe19 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -9,20 +9,20 @@
 ;*          Holger Lubitz <hal@duncan.ol.sub.de>
 ;*          Min Chen <chenm001.163.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;*****************************************************************************
 
diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm
index b7d5105..f1c2c81 100644
--- a/libavcodec/x86/h264_idct_10bit.asm
+++ b/libavcodec/x86/h264_idct_10bit.asm
@@ -5,32 +5,31 @@
 ;*
 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
-
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-pd_32:        times 4 dd 32
-
 SECTION .text
 
+cextern pw_1023
+%define pw_pixel_max pw_1023
+cextern pd_32
+
 ;-----------------------------------------------------------------------------
 ; void ff_h264_idct_add_10(pixel *dst, int16_t *block, int stride)
 ;-----------------------------------------------------------------------------
@@ -83,8 +82,10 @@ cglobal h264_idct_add_10, 3,3
 
 INIT_XMM sse2
 IDCT_ADD_10
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT_ADD_10
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_h264_idct_add16_10(pixel *dst, const int *block_offset,
@@ -117,9 +118,11 @@ add4x4_idct %+ SUFFIX:
 INIT_XMM sse2
 ALIGN 16
 ADD4x4IDCT
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 ALIGN 16
 ADD4x4IDCT
+%endif
 
 %macro ADD16_OP 2
     cmp          byte [r4+%2], 0
@@ -155,8 +158,10 @@ cglobal h264_idct_add16_10, 5,6
 
 INIT_XMM sse2
 IDCT_ADD16_10
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT_ADD16_10
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_h264_idct_dc_add_10(pixel *dst, int16_t *block, int stride)
@@ -220,8 +225,10 @@ cglobal h264_idct8_dc_add_10,3,4,7
 
 INIT_XMM sse2
 IDCT8_DC_ADD
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT8_DC_ADD
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_h264_idct_add16intra_10(pixel *dst, const int *block_offset,
@@ -293,8 +300,10 @@ cglobal h264_idct_add16intra_10,5,7,8
 
 INIT_XMM sse2
 IDCT_ADD16INTRA_10
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT_ADD16INTRA_10
+%endif
 
 %assign last_block 36
 ;-----------------------------------------------------------------------------
@@ -330,8 +339,10 @@ cglobal h264_idct_add8_10,5,8,7
 
 INIT_XMM sse2
 IDCT_ADD8
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT_ADD8
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_h264_idct8_add_10(pixel *dst, int16_t *block, int stride)
@@ -537,8 +548,10 @@ h264_idct8_add1_10 %+ SUFFIX:
 
 INIT_XMM sse2
 IDCT8_ADD
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT8_ADD
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_h264_idct8_add4_10(pixel **dst, const int *block_offset,
@@ -577,5 +590,7 @@ cglobal h264_idct8_add4_10, 0,7,16
 
 INIT_XMM sse2
 IDCT8_ADD4
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT8_ADD4
+%endif
diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm
index df657a4..c88d91b 100644
--- a/libavcodec/x86/h264_intrapred.asm
+++ b/libavcodec/x86/h264_intrapred.asm
@@ -5,20 +5,20 @@
 ;* Copyright (c) 2010 Loren Merritt
 ;* Copyright (c) 2010 Ronald S. Bultje
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -2497,10 +2497,7 @@ cglobal pred4x4_tm_vp8_8, 3,3
     pshufb     mm3, mm6
     pshufb     mm4, mm6
     pshufb     mm5, mm6
-    psubw      mm2, mm7
-    psubw      mm3, mm7
-    psubw      mm4, mm7
-    psubw      mm5, mm7
+    psubw      mm0, mm7
     paddw      mm2, mm0
     paddw      mm3, mm0
     paddw      mm4, mm0
diff --git a/libavcodec/x86/h264_intrapred_10bit.asm b/libavcodec/x86/h264_intrapred_10bit.asm
index 55790a9..9e40cfe 100644
--- a/libavcodec/x86/h264_intrapred_10bit.asm
+++ b/libavcodec/x86/h264_intrapred_10bit.asm
@@ -5,20 +5,20 @@
 ;*
 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -26,18 +26,19 @@
 
 SECTION_RODATA
 
+cextern pw_1023
+%define pw_pixel_max pw_1023
+cextern pw_512
 cextern pw_16
 cextern pw_8
 cextern pw_4
 cextern pw_2
 cextern pw_1
+cextern pd_16
 
 pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
 pw_m3:        times 8 dw -3
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-pw_512:       times 8 dw 512
 pd_17:        times 4 dd 17
-pd_16:        times 4 dd 16
 
 SECTION .text
 
@@ -82,8 +83,10 @@ INIT_XMM sse2
 PRED4x4_DR
 INIT_XMM ssse3
 PRED4x4_DR
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED4x4_DR
+%endif
 
 ;------------------------------------------------------------------------------
 ; void ff_pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
@@ -119,8 +122,10 @@ INIT_XMM sse2
 PRED4x4_VR
 INIT_XMM ssse3
 PRED4x4_VR
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED4x4_VR
+%endif
 
 ;-------------------------------------------------------------------------------
 ; void ff_pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
@@ -159,28 +164,14 @@ INIT_XMM sse2
 PRED4x4_HD
 INIT_XMM ssse3
 PRED4x4_HD
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED4x4_HD
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred4x4_dc(pixel *src, const pixel *topright, int stride)
 ;-----------------------------------------------------------------------------
-%macro HADDD 2 ; sum junk
-%if mmsize == 16
-    movhlps %2, %1
-    paddd   %1, %2
-    pshuflw %2, %1, 0xE
-    paddd   %1, %2
-%else
-    pshufw  %2, %1, 0xE
-    paddd   %1, %2
-%endif
-%endmacro
-
-%macro HADDW 2
-    pmaddwd %1, [pw_1]
-    HADDD   %1, %2
-%endmacro
 
 INIT_MMX mmxext
 cglobal pred4x4_dc_10, 3, 3
@@ -228,8 +219,10 @@ cglobal pred4x4_down_left_10, 3, 3
 
 INIT_XMM sse2
 PRED4x4_DL
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED4x4_DL
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
@@ -255,8 +248,10 @@ cglobal pred4x4_vertical_left_10, 3, 3
 
 INIT_XMM sse2
 PRED4x4_VL
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED4x4_VL
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
@@ -565,8 +560,10 @@ cglobal pred8x8l_top_dc_10, 4, 4, 6
 
 INIT_XMM sse2
 PRED8x8L_TOP_DC
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_TOP_DC
+%endif
 
 ;-------------------------------------------------------------------------------
 ; void ff_pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
@@ -622,8 +619,10 @@ cglobal pred8x8l_dc_10, 4, 6, 6
 
 INIT_XMM sse2
 PRED8x8L_DC
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_DC
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred8x8l_vertical(pixel *src, int has_topleft, int has_topright,
@@ -656,8 +655,10 @@ cglobal pred8x8l_vertical_10, 4, 4, 6
 
 INIT_XMM sse2
 PRED8x8L_VERTICAL
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_VERTICAL
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright,
@@ -711,8 +712,10 @@ INIT_XMM sse2
 PRED8x8L_HORIZONTAL
 INIT_XMM ssse3
 PRED8x8L_HORIZONTAL
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_HORIZONTAL
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred8x8l_down_left(pixel *src, int has_topleft, int has_topright,
@@ -778,8 +781,10 @@ INIT_XMM sse2
 PRED8x8L_DOWN_LEFT
 INIT_XMM ssse3
 PRED8x8L_DOWN_LEFT
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_DOWN_LEFT
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred8x8l_down_right(pixel *src, int has_topleft, int has_topright,
@@ -851,8 +856,10 @@ INIT_XMM sse2
 PRED8x8L_DOWN_RIGHT
 INIT_XMM ssse3
 PRED8x8L_DOWN_RIGHT
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_DOWN_RIGHT
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred8x8l_vertical_right(pixel *src, int has_topleft,
@@ -920,8 +927,10 @@ INIT_XMM sse2
 PRED8x8L_VERTICAL_RIGHT
 INIT_XMM ssse3
 PRED8x8L_VERTICAL_RIGHT
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_VERTICAL_RIGHT
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred8x8l_horizontal_up(pixel *src, int has_topleft,
@@ -980,8 +989,10 @@ INIT_XMM sse2
 PRED8x8L_HORIZONTAL_UP
 INIT_XMM ssse3
 PRED8x8L_HORIZONTAL_UP
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_HORIZONTAL_UP
+%endif
 
 
 ;-----------------------------------------------------------------------------
diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c
index 0e572b1..528b92e 100644
--- a/libavcodec/x86/h264_intrapred_init.c
+++ b/libavcodec/x86/h264_intrapred_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index 76d2ab0..d759e88 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
  * Copyright (c) 2011 Daniel Kang
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,10 +29,6 @@
 #include "fpel.h"
 
 #if HAVE_YASM
-void ff_put_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
-                           ptrdiff_t line_size, int h);
-void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
-                           ptrdiff_t line_size, int h);
 void ff_put_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                               int dstStride, int src1Stride, int h);
 void ff_avg_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
@@ -49,9 +45,9 @@ void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t
 #define ff_avg_pixels8_l2_sse2  ff_avg_pixels8_l2_mmxext
 #define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
 #define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext
-
-CALL_2X_PIXELS(ff_avg_pixels16_mmxext, ff_avg_pixels8_mmxext, 8)
-CALL_2X_PIXELS(ff_put_pixels16_mmxext, ff_put_pixels8_mmxext, 8)
+#define ff_put_pixels16_mmxext  ff_put_pixels16_mmx
+#define ff_put_pixels8_mmxext   ff_put_pixels8_mmx
+#define ff_put_pixels4_mmxext   ff_put_pixels4_mmx
 
 #define DEF_QPEL(OPNAME)\
 void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
@@ -282,7 +278,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, const uin
 #define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
 static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
     ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
     ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
 }\
@@ -294,7 +290,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, const uin
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
     ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
     ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
 }\
@@ -302,74 +298,74 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uin
 #define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
 static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
     ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
     ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
     ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
     ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
     ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
     ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*SIZE]);\
     ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
     ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\
+    LOCAL_ALIGNED(ALIGN, uint16_t, temp, [SIZE*(SIZE<8?12:24)]);\
     ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
     uint8_t * const halfHV= temp;\
     int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
-    assert(((int)temp & 7) == 0);\
+    av_assert2(((int)temp & 7) == 0);\
     ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
     ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
     uint8_t * const halfHV= temp;\
     int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
-    assert(((int)temp & 7) == 0);\
+    av_assert2(((int)temp & 7) == 0);\
     ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
     ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
     uint8_t * const halfHV= temp;\
     int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
-    assert(((int)temp & 7) == 0);\
+    av_assert2(((int)temp & 7) == 0);\
     ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
     ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
 {\
-    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+    LOCAL_ALIGNED(ALIGN, uint8_t, temp, [SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
     uint8_t * const halfHV= temp;\
     int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
-    assert(((int)temp & 7) == 0);\
+    av_assert2(((int)temp & 7) == 0);\
     ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
     ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
 }\
diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm
index f92c4aa..8722683 100644
--- a/libavcodec/x86/h264_qpel_10bit.asm
+++ b/libavcodec/x86/h264_qpel_10bit.asm
@@ -5,20 +5,20 @@
 ;*
 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -26,12 +26,13 @@
 
 SECTION_RODATA 32
 
+cextern pd_65535
+cextern pw_1023
+%define pw_pixel_max pw_1023
 cextern pw_16
 cextern pw_1
 cextern pb_0
 
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-
 pad10: times 8 dw 10*1023
 pad20: times 8 dw 20*1023
 pad30: times 8 dw 30*1023
@@ -42,7 +43,6 @@ unpad: times 8 dw 16*1022/32 ; needs to be mod 16
 tap1: times 4 dw  1, -5
 tap2: times 4 dw 20, 20
 tap3: times 4 dw -5,  1
-pd_0f: times 4 dd 0xffff
 
 SECTION .text
 
@@ -386,7 +386,7 @@ MC_CACHE MC10
 ; void ff_h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
 ;-----------------------------------------------------------------------------
 %macro V_FILT 10
-v_filt%9_%10_10
+v_filt%9_%10_10:
     add    r4, r2
 .no_addr4:
     FILT_V m0, m1, m2, m3, m4, m5, m6, m7
@@ -708,7 +708,7 @@ h%1_loop_op:
     psrad      m1, 10
     psrad      m2, 10
     pslld      m2, 16
-    pand       m1, [pd_0f]
+    pand       m1, [pd_65535]
     por        m1, m2
 %if num_mmregs <= 8
     pxor       m0, m0
diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm
index bc6c725..2d287ba 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -6,20 +6,20 @@
 ;*
 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/h264_weight.asm b/libavcodec/x86/h264_weight.asm
index 9ad26de..6c57d57 100644
--- a/libavcodec/x86/h264_weight.asm
+++ b/libavcodec/x86/h264_weight.asm
@@ -4,20 +4,20 @@
 ;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
 ;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -139,12 +139,12 @@ WEIGHT_FUNC_HALF_MM 8, 8
     je .nonnormal
     cmp        r5, 128
     jne .normal
-.nonnormal
+.nonnormal:
     sar        r5, 1
     sar        r6, 1
     sar  off_regd, 1
     sub        r4, 1
-.normal
+.normal:
 %if cpuflag(ssse3)
     movd       m4, r5d
     movd       m0, r6d
diff --git a/libavcodec/x86/h264_weight_10bit.asm b/libavcodec/x86/h264_weight_10bit.asm
index 961ec8c..f924e55 100644
--- a/libavcodec/x86/h264_weight_10bit.asm
+++ b/libavcodec/x86/h264_weight_10bit.asm
@@ -5,20 +5,20 @@
 ;*
 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -26,11 +26,12 @@
 
 SECTION_RODATA 32
 
-pw_pixel_max: times 8 dw ((1 << 10)-1)
 sq_1: dq 1
       dq 0
 
 cextern pw_1
+cextern pw_1023
+%define pw_pixel_max pw_1023
 
 SECTION .text
 
diff --git a/libavcodec/x86/h264chroma_init.c b/libavcodec/x86/h264chroma_init.c
index 8ec8a79..e08af27 100644
--- a/libavcodec/x86/h264chroma_init.c
+++ b/libavcodec/x86/h264chroma_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 134d594..c8cd065 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -129,6 +129,8 @@ LF_IFUNC(v, chroma_intra, depth, avx)
 LF_FUNCS(uint8_t,   8)
 LF_FUNCS(uint16_t, 10)
 
+void ff_deblock_h_chroma422_8_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
+
 #if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
 LF_FUNC(v8, luma, 8, mmxext)
 static void deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha,
@@ -210,6 +212,7 @@ H264_BIWEIGHT_10_SSE(4,  10)
 av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
                                  const int chroma_format_idc)
 {
+#if HAVE_YASM
     int cpu_flags = av_get_cpu_flags();
 
     if (EXTERNAL_MMXEXT(cpu_flags) && chroma_format_idc <= 1)
@@ -244,6 +247,8 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
             if (chroma_format_idc <= 1) {
                 c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma_8_mmxext;
                 c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext;
+            } else {
+                c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_mmxext;
             }
 #if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
             c->h264_v_loop_filter_luma       = deblock_v_luma_8_mmxext;
@@ -365,4 +370,5 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
 #endif /* HAVE_ALIGNED_STACK */
         }
     }
+#endif
 }
diff --git a/libavcodec/x86/hevc_deblock.asm b/libavcodec/x86/hevc_deblock.asm
index 1e895f0..48a5975 100644
--- a/libavcodec/x86/hevc_deblock.asm
+++ b/libavcodec/x86/hevc_deblock.asm
@@ -5,20 +5,20 @@
 ;*
 ;* Authors: Seppo Tomperi <seppo.tomperi@vtt.fi>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -26,13 +26,15 @@
 
 SECTION_RODATA
 
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-pw_m1:        times 8 dw -1
-pw_m2:        times 8 dw -2
-pd_1 :        times 4 dd  1
+cextern pw_1023
+%define pw_pixel_max_10 pw_1023
+pw_pixel_max_12: times 8 dw ((1 << 12)-1)
+pw_m2:           times 8 dw -2
+pd_1 :           times 4 dd  1
 
 cextern pw_4
 cextern pw_8
+cextern pw_m1
 
 SECTION .text
 INIT_XMM sse2
@@ -57,10 +59,10 @@ INIT_XMM sse2
     movd             m4, %5
     movd             m6, %6
     movd             m5, %7
-    movd             m7, %8
+    movd             m3, %8
 
     punpcklbw        m4, m6
-    punpcklbw        m5, m7
+    punpcklbw        m5, m3
     punpcklwd        m4, m5
 
     punpckhdq        m2, m0, m4
@@ -76,16 +78,10 @@ INIT_XMM sse2
 ; in: 4 rows of 8 words in m0..m3
 ; out: 8 rows of 4 bytes in %1..%8
 %macro TRANSPOSE8x4B_STORE 8
-    packuswb         m0, m0
-    packuswb         m1, m1
-    packuswb         m2, m2
-    packuswb         m3, m3
-
-    punpcklbw        m0, m1
-    punpcklbw        m2, m3
-
-    punpckhwd        m6, m0, m2
-    punpcklwd        m0, m2
+    packuswb         m0, m2
+    packuswb         m1, m3
+    SBUTTERFLY bw, 0, 1, 2
+    SBUTTERFLY wd, 0, 1, 2
 
     movd             %1, m0
     pshufd           m0, m0, 0x39
@@ -95,13 +91,13 @@ INIT_XMM sse2
     pshufd           m0, m0, 0x39
     movd             %4, m0
 
-    movd             %5, m6
-    pshufd           m6, m6, 0x39
-    movd             %6, m6
-    pshufd           m6, m6, 0x39
-    movd             %7, m6
-    pshufd           m6, m6, 0x39
-    movd             %8, m6
+    movd             %5, m1
+    pshufd           m1, m1, 0x39
+    movd             %6, m1
+    pshufd           m1, m1, 0x39
+    movd             %7, m1
+    pshufd           m1, m1, 0x39
+    movd             %8, m1
 %endmacro
 
 ; in: 8 rows of 4 words in %4..%11
@@ -120,10 +116,10 @@ INIT_XMM sse2
     movq             m4, %5
     movq             m6, %6
     movq             m5, %7
-    movq             m7, %8
+    movq             m3, %8
 
     punpcklwd        m4, m6
-    punpcklwd        m5, m7
+    punpcklwd        m5, m3
     punpckhdq        m6, m4, m5
     punpckldq        m4, m5
 
@@ -136,32 +132,23 @@ INIT_XMM sse2
 
 ; in: 4 rows of 8 words in m0..m3
 ; out: 8 rows of 4 words in %1..%8
-%macro TRANSPOSE8x4W_STORE 8
-    pxor             m5, m5; zeros reg
-    CLIPW            m0, m5, [pw_pixel_max]
-    CLIPW            m1, m5, [pw_pixel_max]
-    CLIPW            m2, m5, [pw_pixel_max]
-    CLIPW            m3, m5, [pw_pixel_max]
+%macro TRANSPOSE8x4W_STORE 9 ; %1..%8 = destination rows, %9 = upper clip bound handed to CLIPW
+    TRANSPOSE4x4W     0, 1, 2, 3, 4 ; x86util macro; presumably transposes m0..m3 using m4 as scratch - confirm
 
-    punpckhwd        m4, m0, m1
-    punpcklwd        m0, m1
-    punpckhwd        m5, m2, m3
-    punpcklwd        m2, m3
-    punpckhdq        m6, m0, m2
-    punpckldq        m0, m2
+    pxor             m5, m5; zeros reg
+    CLIPW            m0, m5, %9 ; clip now runs after the transpose, with m5 (zero) and %9 as the bounds
+    CLIPW            m1, m5, %9
+    CLIPW            m2, m5, %9
+    CLIPW            m3, m5, %9
 
     movq             %1, m0
     movhps           %2, m0
-    movq             %3, m6
-    movhps           %4, m6
-
-    punpckhdq        m6, m4, m5
-    punpckldq        m4, m5
-
-    movq             %5, m4
-    movhps           %6, m4
-    movq             %7, m6
-    movhps           %8, m6
+    movq             %3, m1 ; low 8 bytes of m1 go to row %3
+    movhps           %4, m1 ; high 8 bytes of m1 go to row %4
+    movq             %5, m2
+    movhps           %6, m2
+    movq             %7, m3
+    movhps           %8, m3
 %endmacro
 
 ; in: 8 rows of 8 bytes in %1..%8
@@ -212,40 +199,20 @@ INIT_XMM sse2
 ; in: 8 rows of 8 words in m0..m8
 ; out: 8 rows of 8 bytes in %1..%8
 %macro TRANSPOSE8x8B_STORE 8
-    packuswb         m0, m0
-    packuswb         m1, m1
-    packuswb         m2, m2
-    packuswb         m3, m3
-    packuswb         m4, m4
-    packuswb         m5, m5
-    packuswb         m6, m6
-    packuswb         m7, m7
-
-    punpcklbw        m0, m1
-    punpcklbw        m2, m3
-
-    punpckhwd        m8, m0, m2
-    punpcklwd        m0, m2
-
-    punpcklbw        m4, m5
-    punpcklbw        m6, m7
-
-    punpckhwd        m9, m4, m6
-    punpcklwd        m4, m6
+    packuswb         m0, m4 ; saturate words to bytes: m0 fills the low half, m4 the high half
+    packuswb         m1, m5 ; same pairing for m1/m5, m2/m6 and m3/m7 below
+    packuswb         m2, m6
+    packuswb         m3, m7
+    TRANSPOSE2x4x4B   0, 1, 2, 3, 4 ; x86util macro; presumably transposes both 4x4 byte groups with m4 as scratch - confirm
 
-    punpckhdq       m10, m0, m4; 2, 3
-    punpckldq        m0, m4;   0, 1
-
-    punpckldq       m11, m8, m9;  4, 5
-    punpckhdq        m8, m9;   6, 7
     movq             %1, m0
     movhps           %2, m0
-    movq             %3, m10
-    movhps           %4, m10
-    movq             %5, m11
-    movhps           %6, m11
-    movq             %7, m8
-    movhps           %8, m8
+    movq             %3, m1 ; each register now holds two output rows: low half, then high half
+    movhps           %4, m1
+    movq             %5, m2
+    movhps           %6, m2
+    movq             %7, m3
+    movhps           %8, m3
 %endmacro
 
 ; in: 8 rows of 8 words in %1..%8
@@ -264,18 +231,18 @@ INIT_XMM sse2
 
 ; in: 8 rows of 8 words in m0..m8
 ; out: 8 rows of 8 words in %1..%8
-%macro TRANSPOSE8x8W_STORE 8
+%macro TRANSPOSE8x8W_STORE 9
     TRANSPOSE8x8W     0, 1, 2, 3, 4, 5, 6, 7, 8
 
     pxor             m8, m8
-    CLIPW            m0, m8, [pw_pixel_max]
-    CLIPW            m1, m8, [pw_pixel_max]
-    CLIPW            m2, m8, [pw_pixel_max]
-    CLIPW            m3, m8, [pw_pixel_max]
-    CLIPW            m4, m8, [pw_pixel_max]
-    CLIPW            m5, m8, [pw_pixel_max]
-    CLIPW            m6, m8, [pw_pixel_max]
-    CLIPW            m7, m8, [pw_pixel_max]
+    CLIPW            m0, m8, %9
+    CLIPW            m1, m8, %9
+    CLIPW            m2, m8, %9
+    CLIPW            m3, m8, %9
+    CLIPW            m4, m8, %9
+    CLIPW            m5, m8, %9
+    CLIPW            m6, m8, %9
+    CLIPW            m7, m8, %9
 
     movdqu           %1, m0
     movdqu           %2, m1
@@ -318,13 +285,14 @@ ALIGN 16
     paddw            m5, m4;
 
     ;tc calculations
-    movd             m6, [r2]; tc0
-    add              r2, 4;
+    movq             m6, [tcq]; tc0
     punpcklwd        m6, m6
-    movd             m7, [r2]; tc1
-    punpcklwd        m7, m7
-    shufps           m6, m7, 0; tc0, tc1
+    pshufd           m6, m6, 0xA0; tc0, tc1
+%if cpuflag(ssse3)
+    psignw           m4, m6, [pw_m1]; -tc0, -tc1
+%else
     pmullw           m4, m6, [pw_m1]; -tc0, -tc1
+%endif
     ;end tc calculations
 
     paddw            m5, [pw_4]; +4
@@ -362,11 +330,11 @@ ALIGN 16
 
     paddw            m9, m10, m11;   0d0, 0d3  ,  1d0, 1d3
 
-    pshufhw         m14, m9,  q0033 ;0b00001111;  0d3 0d3 0d0 0d0 in high
-    pshuflw         m14, m14, q0033 ;0b00001111;  1d3 1d3 1d0 1d0 in low
+    pshufhw         m14, m9, 0x0f ;0b00001111;  0d3 0d3 0d0 0d0 in high
+    pshuflw         m14, m14, 0x0f ;0b00001111;  1d3 1d3 1d0 1d0 in low
 
-    pshufhw          m9, m9, q3300 ;0b11110000; 0d0 0d0 0d3 0d3
-    pshuflw          m9, m9, q3300 ;0b11110000; 1d0 1d0 1d3 1d3
+    pshufhw          m9, m9, 0xf0 ;0b11110000; 0d0 0d0 0d3 0d3
+    pshuflw          m9, m9, 0xf0 ;0b11110000; 1d0 1d0 1d3 1d3
 
     paddw           m14, m9; 0d0+0d3, 1d0+1d3
 
@@ -380,7 +348,7 @@ ALIGN 16
     psraw           m15, m13, 2;   beta >> 2
     psllw            m8, m9, 1;
     pcmpgtw         m15, m8; (d0 << 1) < beta_2, (d3 << 1) < beta_2
-    movmskps        r14, m15;
+    movmskps        r6, m15;
     ;end weak / strong decision
 
     ; weak filter nd_p/q calculation
@@ -388,19 +356,15 @@ ALIGN 16
     psrld            m8, 16
     paddw            m8, m10
     movd            r7d, m8
-    and              r7, 0xffff; 1dp0 + 1dp3
     pshufd           m8, m8, 0x4E
     movd            r8d, m8
-    and              r8, 0xffff; 0dp0 + 0dp3
 
     pshufd           m8, m11, 0x31
     psrld            m8, 16
     paddw            m8, m11
     movd            r9d, m8
-    and              r9, 0xffff; 1dq0 + 1dq3
     pshufd           m8, m8, 0x4E
     movd           r10d, m8
-    and             r10, 0xffff; 0dq0 + 0dq3
     ; end calc for weak filter
 
     ; filtering mask
@@ -422,14 +386,13 @@ ALIGN 16
     shl             r11, %1 - 8
 %endif
     movd             m8, r11d; tc0
-    add             tcq, 4;
-    mov             r3d, [tcq];
+    mov             r3d, [tcq+4];
 %if %1 > 8
     shl              r3, %1 - 8
 %endif
-    movd             m9, r3d; tc1
     add            r11d, r3d; tc0 + tc1
     jz             .bypassluma
+    movd             m9, r3d; tc1
     punpcklwd        m8, m8
     punpcklwd        m9, m9
     shufps           m8, m9, 0; tc0, tc1
@@ -453,7 +416,7 @@ ALIGN 16
     psraw           m13, 3; beta >> 3
     pcmpgtw         m13, m12;
     movmskps        r11, m13;
-    and             r14, r11; strong mask , beta_2 and beta_3 comparisons
+    and             r6, r11; strong mask , beta_2 and beta_3 comparisons
     ;----beta_3 comparison end-----
     ;----tc25 comparison---
     psubw           m12, m3, m4;      p0 - q0
@@ -464,23 +427,23 @@ ALIGN 16
 
     pcmpgtw          m8, m12; tc25 comparisons
     movmskps        r11, m8;
-    and             r14, r11; strong mask, beta_2, beta_3 and tc25 comparisons
+    and             r6, r11; strong mask, beta_2, beta_3 and tc25 comparisons
     ;----tc25 comparison end---
-    mov             r11, r14;
+    mov             r11, r6;
     shr             r11, 1;
-    and             r14, r11; strong mask, bits 2 and 0
+    and             r6, r11; strong mask, bits 2 and 0
 
     pmullw          m14, m9, [pw_m2]; -tc * 2
     paddw            m9, m9
 
-    and             r14, 5; 0b101
-    mov             r11, r14; strong mask
-    shr             r14, 2;
-    movd            m12, r14d; store to xmm for mask generation
-    shl             r14, 1
+    and             r6, 5; 0b101
+    mov             r11, r6; strong mask
+    shr             r6, 2;
+    movd            m12, r6d; store to xmm for mask generation
+    shl             r6, 1
     and             r11, 1
     movd            m10, r11d; store to xmm for mask generation
-    or              r14, r11; final strong mask, bits 1 and 0
+    or              r6, r11; final strong mask, bits 1 and 0
     jz      .weakfilter
 
     shufps          m10, m12, 0
@@ -565,16 +528,16 @@ ALIGN 16
     MASKED_COPY      m3, m12
 
 .weakfilter:
-    not             r14; strong mask -> weak mask
-    and             r14, r13; final weak filtering mask, bits 0 and 1
+    not             r6; strong mask -> weak mask
+    and             r6, r13; final weak filtering mask, bits 0 and 1
     jz             .store
 
     ; weak filtering mask
-    mov             r11, r14
+    mov             r11, r6
     shr             r11, 1
     movd            m12, r11d
-    and             r14, 1
-    movd            m11, r14d
+    and             r6, 1
+    movd            m11, r6d
     shufps          m11, m12, 0
     pcmpeqd         m11, [pd_1]; filtering mask
 
@@ -609,7 +572,11 @@ ALIGN 16
     pminsw          m12, m9;  av_clip(delta0, -tc, tc)
 
     psraw            m9, 1;   tc -> tc / 2
+%if cpuflag(ssse3)
+    psignw          m14, m9, [pw_m1]; -tc / 2
+%else
     pmullw          m14, m9, [pw_m1]; -tc / 2
+%endif
 
     pavgw           m15, m1, m3;   (p2 + p0 + 1) >> 1
     psubw           m15, m2;  ((p2 + p0 + 1) >> 1) - p1
@@ -658,117 +625,161 @@ ALIGN 16
     MASKED_COPY      m4, m8
 %endmacro
 
-INIT_XMM sse2
 ;-----------------------------------------------------------------------------
-; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int *_tc,
+; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
 ;                                   uint8_t *_no_p, uint8_t *_no_q);
 ;-----------------------------------------------------------------------------
-cglobal hevc_v_loop_filter_chroma_8, 3, 6, 8
-    sub              r0, 2
-    lea              r5, [3 * r1]
-    mov              r4, r0
-    add              r0, r5
-    TRANSPOSE4x8B_LOAD  PASS8ROWS(r4, r0, r1, r5)
+%macro LOOP_FILTER_CHROMA 0 ; emits the 8/10/12-bit v and h chroma loop filters for whichever INIT_* is active
+cglobal hevc_v_loop_filter_chroma_8, 3, 5, 7, pix, stride, tc, pix0, r3stride
+    sub            pixq, 2 ; move pix back by 2 bytes before loading the rows
+    lea       r3strideq, [3*strideq] ; r3stride = 3 * stride
+    mov           pix0q, pixq ; pix0 keeps the address of the first row
+    add            pixq, r3strideq ; pix now sits 3 rows after pix0
+    TRANSPOSE4x8B_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
     CHROMA_DEBLOCK_BODY 8
-    TRANSPOSE8x4B_STORE PASS8ROWS(r4, r0, r1, r5)
+    TRANSPOSE8x4B_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq)
     RET
 
-cglobal hevc_v_loop_filter_chroma_10, 3, 6, 8
-    sub              r0, 4
-    lea              r5, [3 * r1]
-    mov              r4, r0
-    add              r0, r5
-    TRANSPOSE4x8W_LOAD  PASS8ROWS(r4, r0, r1, r5)
+cglobal hevc_v_loop_filter_chroma_10, 3, 5, 7, pix, stride, tc, pix0, r3stride
+    sub            pixq, 4 ; move pix back by 4 bytes (samples are loaded as words here)
+    lea       r3strideq, [3*strideq]
+    mov           pix0q, pixq
+    add            pixq, r3strideq
+    TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
     CHROMA_DEBLOCK_BODY 10
-    TRANSPOSE8x4W_STORE PASS8ROWS(r4, r0, r1, r5)
+    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_10]
+    RET
+
+cglobal hevc_v_loop_filter_chroma_12, 3, 5, 7, pix, stride, tc, pix0, r3stride
+    sub            pixq, 4 ; same address setup as the 10-bit version
+    lea       r3strideq, [3*strideq]
+    mov           pix0q, pixq
+    add            pixq, r3strideq
+    TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
+    CHROMA_DEBLOCK_BODY 12
+    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_12]
     RET
 
 ;-----------------------------------------------------------------------------
-; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int *_tc,
+; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
 ;                                   uint8_t *_no_p, uint8_t *_no_q);
 ;-----------------------------------------------------------------------------
-cglobal hevc_h_loop_filter_chroma_8, 3, 6, 8
-    mov              r5, r0; pix
-    sub              r5, r1
-    sub              r5, r1
-    movh             m0, [r5];      p1
-    movh             m1, [r5 + r1]; p0
-    movh             m2, [r0];      q0
-    movh             m3, [r0 + r1]; q1
+cglobal hevc_h_loop_filter_chroma_8, 3, 4, 7, pix, stride, tc, pix0
+    mov           pix0q, pixq
+    sub           pix0q, strideq
+    sub           pix0q, strideq ; pix0 = pix - 2 * stride, the p1 row
+    movq             m0, [pix0q];    p1
+    movq             m1, [pix0q+strideq]; p0
+    movq             m2, [pixq];    q0
+    movq             m3, [pixq+strideq]; q1
     pxor             m5, m5; zeros reg
     punpcklbw        m0, m5
     punpcklbw        m1, m5
     punpcklbw        m2, m5
     punpcklbw        m3, m5
     CHROMA_DEBLOCK_BODY  8
-    packuswb          m1, m2
-    movh       [r5 + r1], m1
-    movhps          [r0], m1
+    packuswb         m1, m2 ; saturate to bytes: p0 in the low half, q0 in the high half
+    movh[pix0q+strideq], m1 ; write back the p0 row
+    movhps       [pixq], m1 ; write back the q0 row
     RET
 
-cglobal hevc_h_loop_filter_chroma_10, 3, 6, 8
-    mov             r5, r0; pix
-    sub             r5, r1
-    sub             r5, r1
-    movdqu          m0, [r5];      p1
-    movdqu          m1, [r5+r1];   p0
-    movdqu          m2, [r0];      q0
-    movdqu          m3, [r0 + r1]; q1
+cglobal hevc_h_loop_filter_chroma_10, 3, 4, 7, pix, stride, tc, pix0
+    mov          pix0q, pixq
+    sub          pix0q, strideq
+    sub          pix0q, strideq ; pix0 = pix - 2 * stride, the p1 row
+    movu            m0, [pix0q];    p1
+    movu            m1, [pix0q+strideq]; p0
+    movu            m2, [pixq];    q0
+    movu            m3, [pixq+strideq]; q1
     CHROMA_DEBLOCK_BODY 10
     pxor            m5, m5; zeros reg
-    CLIPW           m1, m5, [pw_pixel_max]
-    CLIPW           m2, m5, [pw_pixel_max]
-    movdqu   [r5 + r1], m1
-    movdqu        [r0], m2
+    CLIPW           m1, m5, [pw_pixel_max_10] ; only p0 and q0 are clipped and stored
+    CLIPW           m2, m5, [pw_pixel_max_10]
+    movu [pix0q+strideq], m1
+    movu        [pixq], m2
+    RET
+
+cglobal hevc_h_loop_filter_chroma_12, 3, 4, 7, pix, stride, tc, pix0
+    mov          pix0q, pixq
+    sub          pix0q, strideq
+    sub          pix0q, strideq ; pix0 = pix - 2 * stride, the p1 row
+    movu            m0, [pix0q];    p1
+    movu            m1, [pix0q+strideq]; p0
+    movu            m2, [pixq];    q0
+    movu            m3, [pixq+strideq]; q1
+    CHROMA_DEBLOCK_BODY 12
+    pxor            m5, m5; zeros reg
+    CLIPW           m1, m5, [pw_pixel_max_12] ; same as the 10-bit version, with the 12-bit maximum
+    CLIPW           m2, m5, [pw_pixel_max_12]
+    movu [pix0q+strideq], m1
+    movu        [pixq], m2
     RET
+%endmacro
+
+INIT_XMM sse2 ; first expansion of the chroma filters: SSE2 build
+LOOP_FILTER_CHROMA
+INIT_XMM avx ; second expansion of the same macro: AVX build
+LOOP_FILTER_CHROMA
 
 %if ARCH_X86_64
-INIT_XMM ssse3
+%macro LOOP_FILTER_LUMA 0
 ;-----------------------------------------------------------------------------
 ; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
-;                                 int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
 ;-----------------------------------------------------------------------------
-cglobal hevc_v_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc
-    sub              r0, 4
-    lea              r5, [3 * r1]
-    mov              r6, r0
-    add              r0, r5
-    TRANSPOSE8x8B_LOAD  PASS8ROWS(r6, r0, r1, r5)
+cglobal hevc_v_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+    sub            pixq, 4
+    lea           pix0q, [3 * r1]
+    mov     src3strideq, pixq
+    add            pixq, pix0q
+    TRANSPOSE8x8B_LOAD  PASS8ROWS(src3strideq, pixq, r1, pix0q)
     LUMA_DEBLOCK_BODY 8, v
 .store:
-    TRANSPOSE8x8B_STORE PASS8ROWS(r6, r0, r1, r5)
+    TRANSPOSE8x8B_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q)
 .bypassluma:
     RET
 
-cglobal hevc_v_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc
+cglobal hevc_v_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
     sub            pixq, 8
-    lea              r5, [3 * strideq]
-    mov              r6, pixq
-    add            pixq, r5
-    TRANSPOSE8x8W_LOAD  PASS8ROWS(r6, pixq, strideq, r5)
+    lea           pix0q, [3 * strideq]
+    mov     src3strideq, pixq
+    add            pixq, pix0q
+    TRANSPOSE8x8W_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
     LUMA_DEBLOCK_BODY 10, v
 .store:
-    TRANSPOSE8x8W_STORE PASS8ROWS(r6, r0, r1, r5)
+    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_10]
+.bypassluma:
+    RET
+
+cglobal hevc_v_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+    sub            pixq, 8
+    lea           pix0q, [3 * strideq]
+    mov     src3strideq, pixq
+    add            pixq, pix0q
+    TRANSPOSE8x8W_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
+    LUMA_DEBLOCK_BODY 12, v
+.store:
+    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_12]
 .bypassluma:
     RET
 
 ;-----------------------------------------------------------------------------
 ; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
-;                                 int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
 ;-----------------------------------------------------------------------------
-cglobal hevc_h_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc, count, pix0, src3stride
+cglobal hevc_h_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
     lea     src3strideq, [3 * strideq]
     mov           pix0q, pixq
     sub           pix0q, src3strideq
     sub           pix0q, strideq
-    movdqu           m0, [pix0q];               p3
-    movdqu           m1, [pix0q +     strideq]; p2
-    movdqu           m2, [pix0q + 2 * strideq]; p1
-    movdqu           m3, [pix0q + src3strideq]; p0
-    movdqu           m4, [pixq];                q0
-    movdqu           m5, [pixq +     strideq];  q1
-    movdqu           m6, [pixq + 2 * strideq];  q2
-    movdqu           m7, [pixq + src3strideq];  q3
+    movq             m0, [pix0q];               p3
+    movq             m1, [pix0q +     strideq]; p2
+    movq             m2, [pix0q + 2 * strideq]; p1
+    movq             m3, [pix0q + src3strideq]; p0
+    movq             m4, [pixq];                q0
+    movq             m5, [pixq +     strideq];  q1
+    movq             m6, [pixq + 2 * strideq];  q2
+    movq             m7, [pixq + src3strideq];  q3
     pxor             m8, m8
     punpcklbw        m0, m8
     punpcklbw        m1, m8
@@ -783,16 +794,16 @@ cglobal hevc_h_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc, count, pix0
     packuswb          m1, m2
     packuswb          m3, m4
     packuswb          m5, m6
-    movh   [r5 +     r1], m1
-    movhps [r5 + 2 * r1], m1
-    movh   [r5 +     r6], m3
-    movhps [r0         ], m3
-    movh   [r0 +     r1], m5
-    movhps [r0 + 2 * r1], m5
+    movh   [pix0q +     strideq], m1
+    movhps [pix0q + 2 * strideq], m1
+    movh   [pix0q + src3strideq], m3
+    movhps [pixq               ], m3
+    movh   [pixq  +     strideq], m5
+    movhps [pixq  + 2 * strideq], m5
 .bypassluma:
     RET
 
-cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix0, src3stride
+cglobal hevc_h_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
     lea                  src3strideq, [3 * strideq]
     mov                        pix0q, pixq
     sub                        pix0q, src3strideq
@@ -808,12 +819,43 @@ cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix
     LUMA_DEBLOCK_BODY             10, h
 .store:
     pxor                          m8, m8; zeros reg
-    CLIPW                         m1, m8, [pw_pixel_max]
-    CLIPW                         m2, m8, [pw_pixel_max]
-    CLIPW                         m3, m8, [pw_pixel_max]
-    CLIPW                         m4, m8, [pw_pixel_max]
-    CLIPW                         m5, m8, [pw_pixel_max]
-    CLIPW                         m6, m8, [pw_pixel_max]
+    CLIPW                         m1, m8, [pw_pixel_max_10]
+    CLIPW                         m2, m8, [pw_pixel_max_10]
+    CLIPW                         m3, m8, [pw_pixel_max_10]
+    CLIPW                         m4, m8, [pw_pixel_max_10]
+    CLIPW                         m5, m8, [pw_pixel_max_10]
+    CLIPW                         m6, m8, [pw_pixel_max_10]
+    movdqu     [pix0q +     strideq], m1;  p2
+    movdqu     [pix0q + 2 * strideq], m2;  p1
+    movdqu     [pix0q + src3strideq], m3;  p0
+    movdqu     [pixq               ], m4;  q0
+    movdqu     [pixq  +     strideq], m5;  q1
+    movdqu     [pixq  + 2 * strideq], m6;  q2
+.bypassluma:
+    RET
+
+cglobal hevc_h_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+    lea                  src3strideq, [3 * strideq]
+    mov                        pix0q, pixq
+    sub                        pix0q, src3strideq
+    sub                        pix0q, strideq
+    movdqu                        m0, [pix0q];               p3
+    movdqu                        m1, [pix0q +     strideq]; p2
+    movdqu                        m2, [pix0q + 2 * strideq]; p1
+    movdqu                        m3, [pix0q + src3strideq]; p0
+    movdqu                        m4, [pixq];                q0
+    movdqu                        m5, [pixq  +     strideq]; q1
+    movdqu                        m6, [pixq  + 2 * strideq]; q2
+    movdqu                        m7, [pixq  + src3strideq]; q3
+    LUMA_DEBLOCK_BODY             12, h
+.store:
+    pxor                          m8, m8; zeros reg
+    CLIPW                         m1, m8, [pw_pixel_max_12]
+    CLIPW                         m2, m8, [pw_pixel_max_12]
+    CLIPW                         m3, m8, [pw_pixel_max_12]
+    CLIPW                         m4, m8, [pw_pixel_max_12]
+    CLIPW                         m5, m8, [pw_pixel_max_12]
+    CLIPW                         m6, m8, [pw_pixel_max_12]
     movdqu     [pix0q +     strideq], m1;  p2
     movdqu     [pix0q + 2 * strideq], m2;  p1
     movdqu     [pix0q + src3strideq], m3;  p0
@@ -822,4 +864,13 @@ cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix
     movdqu     [pixq  + 2 * strideq], m6;  q2
 .bypassluma:
     RET
+
+%endmacro
+
+INIT_XMM sse2 ; the luma filters are expanded three times, once per instruction set
+LOOP_FILTER_LUMA
+INIT_XMM ssse3 ; cpuflag(ssse3) selects the psignw paths inside the deblock bodies
+LOOP_FILTER_LUMA
+INIT_XMM avx
+LOOP_FILTER_LUMA
 %endif
diff --git a/libavcodec/x86/hevc_idct.asm b/libavcodec/x86/hevc_idct.asm
new file mode 100644
index 0000000..2edaf9a
--- /dev/null
+++ b/libavcodec/x86/hevc_idct.asm
@@ -0,0 +1,122 @@
+; /*
+; * SIMD optimized idct functions for HEVC decoding
+; * Copyright (c) 2014 Pierre-Edouard LEPERE
+; * Copyright (c) 2014 James Almer
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+; void ff_hevc_idctHxW_dc_{8,10,12}_<opt>(int16_t *coeffs)
+; %1 = HxW
+; %2 = number of loops
+; %3 = bitdepth
+%macro IDCT_DC 3 ; looped variant: fills the coefficient buffer with the rounded first coefficient
+cglobal hevc_idct%1x%1_dc_%3, 1, 2, 1, coeff, tmp
+    movsx             tmpq, word [coeffq] ; sign-extend the first 16-bit coefficient
+    add               tmpw, ((1 << 14-%3) + 1) ; rounding bias; NASM evaluates this as (1 << (14-%3)) + 1
+    sar               tmpw, (15-%3) ; arithmetic shift right by 15 - bitdepth
+    movd               xm0, tmpd
+    SPLATW              m0, xm0 ; x86util macro; presumably broadcasts the low word to every lane - confirm
+    DEFINE_ARGS coeff, cnt ; tmp is dead from here; presumably cnt takes over its register - confirm
+    mov               cntd, %2 ; %2 = number of loop iterations
+.loop:
+    mova [coeffq+mmsize*0], m0 ; eight full-vector stores per iteration = 8*mmsize bytes
+    mova [coeffq+mmsize*1], m0
+    mova [coeffq+mmsize*2], m0
+    mova [coeffq+mmsize*3], m0
+    mova [coeffq+mmsize*4], m0
+    mova [coeffq+mmsize*5], m0
+    mova [coeffq+mmsize*6], m0
+    mova [coeffq+mmsize*7], m0
+    add  coeffq, mmsize*8 ; advance past the bytes just written
+    dec  cntd
+    jg  .loop ; keep going while the counter is still above zero
+    RET
+%endmacro
+
+; %1 = HxW
+; %2 = bitdepth
+%macro IDCT_DC_NL 2 ; No loop: the whole block is written with straight-line stores
+cglobal hevc_idct%1x%1_dc_%2, 1, 2, 1, coeff, tmp
+    movsx             tmpq, word [coeffq] ; sign-extend the first 16-bit coefficient
+    add               tmpw, ((1 << 14-%2) + 1) ; rounding bias; NASM evaluates this as (1 << (14-%2)) + 1
+    sar               tmpw, (15-%2) ; arithmetic shift right by 15 - bitdepth
+    movd                m0, tmpd
+    SPLATW              m0, xm0 ; x86util macro; presumably broadcasts the low word to every lane - confirm
+    mova [coeffq+mmsize*0], m0 ; the first four vectors are always written
+    mova [coeffq+mmsize*1], m0
+    mova [coeffq+mmsize*2], m0
+    mova [coeffq+mmsize*3], m0
+%if mmsize == 16
+    mova [coeffq+mmsize*4], m0 ; 16-byte vectors only: four more stores, 128 bytes in total
+    mova [coeffq+mmsize*5], m0
+    mova [coeffq+mmsize*6], m0
+    mova [coeffq+mmsize*7], m0
+%endif
+    RET
+%endmacro
+
+; 8-bit (IDCT_DC_NL args: size, bitdepth; IDCT_DC args: size, loop count, bitdepth)
+INIT_MMX mmxext
+IDCT_DC_NL  4,      8 ; 4 stores x 8 bytes = 32 bytes = 4x4 int16 coefficients
+IDCT_DC     8,  2,  8 ; 2 loops x 8 stores x 8 bytes = 128 bytes = 8x8 coefficients
+
+INIT_XMM sse2
+IDCT_DC_NL  8,      8 ; 8 stores x 16 bytes = 128 bytes = 8x8 coefficients
+IDCT_DC    16,  4,  8 ; 4 loops x 8 stores x 16 bytes = 512 bytes = 16x16 coefficients
+IDCT_DC    32, 16,  8 ; 16 loops x 8 stores x 16 bytes = 2048 bytes = 32x32 coefficients
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+IDCT_DC    16,  2,  8 ; 32-byte vectors halve the loop counts of the SSE2 versions
+IDCT_DC    32,  8,  8
+%endif ;HAVE_AVX2_EXTERNAL
+
+; 10-bit (same sizes and loop counts as above; only the bitdepth argument changes)
+INIT_MMX mmxext
+IDCT_DC_NL  4,     10
+IDCT_DC     8,  2, 10
+
+INIT_XMM sse2
+IDCT_DC_NL  8,     10
+IDCT_DC    16,  4, 10
+IDCT_DC    32, 16, 10
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+IDCT_DC    16,  2, 10
+IDCT_DC    32,  8, 10
+%endif ;HAVE_AVX2_EXTERNAL
+
+; 12-bit (same sizes and loop counts as above; only the bitdepth argument changes)
+INIT_MMX mmxext
+IDCT_DC_NL  4,     12
+IDCT_DC     8,  2, 12
+
+INIT_XMM sse2
+IDCT_DC_NL  8,     12
+IDCT_DC    16,  4, 12
+IDCT_DC    32, 16, 12
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+IDCT_DC    16,  2, 12
+IDCT_DC    32,  8, 12
+%endif ;HAVE_AVX2_EXTERNAL
diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
index 8444c41..ff6ed07 100644
--- a/libavcodec/x86/hevc_mc.asm
+++ b/libavcodec/x86/hevc_mc.asm
@@ -1,851 +1,1672 @@
-;*****************************************************************************
-;* x86-optimized HEVC MC
-;* Copyright 2015 Anton Khirnov
-;*
-;* This file is part of Libav.
-;*
-;* Libav is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* Libav is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
+; /*
+; * Provide SSE luma and chroma mc functions for HEVC decoding
+; * Copyright (c) 2013 Pierre-Edouard LEPERE
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
+SECTION_RODATA 32
+cextern pw_255
+cextern pw_512
+cextern pw_2048
+cextern pw_8192
+cextern pw_1023
+cextern pw_1024
+cextern pw_4096
+%define pw_8 pw_512
+%define pw_10 pw_2048
+%define pw_12 pw_8192
+%define pw_bi_10 pw_1024
+%define pw_bi_12 pw_4096
+%define max_pixels_8 pw_255
+%define max_pixels_10 pw_1023
+pw_bi_8:                times 16 dw  (1 <<  8)
+max_pixels_12:          times 16 dw ((1 << 12)-1)
+cextern pd_1
+cextern pb_0
+
+%macro EPEL_TABLE 4
+hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
+                        times %2 d%3 10, -2
+                        times %2 d%3 -4, 54
+                        times %2 d%3 16, -2
+                        times %2 d%3 -6, 46
+                        times %2 d%3 28, -4
+                        times %2 d%3 -4, 36
+                        times %2 d%3 36, -4
+                        times %2 d%3 -4, 28
+                        times %2 d%3 46, -6
+                        times %2 d%3 -2, 16
+                        times %2 d%3 54, -4
+                        times %2 d%3 -2, 10
+                        times %2 d%3 58, -2
+%endmacro
 
-pw_1023: times 8 dw 1023
 
-cextern hevc_qpel_coeffs
-cextern hevc_qpel_coeffs8
+EPEL_TABLE  8,16, b, avx2
+EPEL_TABLE 10, 8, w, avx2
+
+EPEL_TABLE  8, 8, b, sse4
+EPEL_TABLE 10, 4, w, sse4
+EPEL_TABLE 12, 4, w, sse4
+
+%macro QPEL_TABLE 4
+hevc_qpel_filters_%4_%1 times %2 d%3  -1,  4
+                        times %2 d%3 -10, 58
+                        times %2 d%3  17, -5
+                        times %2 d%3   1,  0
+                        times %2 d%3  -1,  4
+                        times %2 d%3 -11, 40
+                        times %2 d%3  40,-11
+                        times %2 d%3   4, -1
+                        times %2 d%3   0,  1
+                        times %2 d%3  -5, 17
+                        times %2 d%3  58,-10
+                        times %2 d%3   4, -1
+%endmacro
 
-cextern hevc_epel_coeffs
-cextern hevc_epel_coeffs8
+QPEL_TABLE  8, 8, b, sse4
+QPEL_TABLE 10, 4, w, sse4
+QPEL_TABLE 12, 4, w, sse4
 
-cextern pw_8
-cextern pw_16
-cextern pw_32
-cextern pw_64
+QPEL_TABLE  8,16, b, avx2
+QPEL_TABLE 10, 8, w, avx2
 
 SECTION .text
 
-; %1: width
-; %2: bit depth
-%macro COMMON_DEFS 2
-    %assign blocksize            8
-    %assign nb_blocks            ((%1 + blocksize - 1) / blocksize)
-    %define last_block_truncated (blocksize * nb_blocks > %1)
-    %if %2 > 8
-        %define LOAD_BLOCK     movu
-        %define LOAD_HALFBLOCK movq
-        %assign pixelsize      2
-    %else
-        %define LOAD_BLOCK     movq
-        %define LOAD_HALFBLOCK movd
-        %assign pixelsize      1
-    %endif
-    %define STORE_BLOCK        mova
-    %define STORE_HALFBLOCK    movq
-%endmacro
-
-; %1: block index
-%macro BLOCK_DEFS 1
-    %if last_block_truncated && %1 == nb_blocks - 1
-        %define block_truncated 1
-        %define LOAD            LOAD_HALFBLOCK
-        %define STORE           STORE_HALFBLOCK
-    %else
-        %define block_truncated 0
-        %define LOAD            LOAD_BLOCK
-        %define STORE           STORE_BLOCK
-    %endif
-%endmacro
-
-
-; hevc_get_pixels_<w>_<d>(int16_t *dst, ptrdiff_t dststride,
-;                         pixel   *src, ptrdiff_t srcstride,
-;                         int height, int mx, int my, int *mcbuffer)
-
-; %1: block width
-; %2: bit depth
-; %3: log2 of height unroll
-%macro GET_PIXELS 3
-cglobal hevc_get_pixels_ %+ %1 %+ _ %+ %2, 5, 5, 2, dst, dststride, src, srcstride, height ; rest of the args unused
-
-    %assign shift 14 - %2
-    COMMON_DEFS %1, %2
-
-%if pixelsize == 1
-    pxor      m0, m0
-%endif
-
-    shr       heightd, %3
-
-.loop:
+%define MAX_PB_SIZE  64
 
-%assign i 0
-%rep (1 << %3)
+%define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10
 
-%assign j 0
-%rep nb_blocks
+%define hevc_qpel_filters_avx2_14 hevc_qpel_filters_avx2_10
 
-    BLOCK_DEFS j
+%if ARCH_X86_64
 
-    LOAD       m1, [srcq + j * pixelsize * blocksize]
-%if pixelsize == 1
-    punpcklbw  m1, m0
+%macro SIMPLE_BILOAD 4   ;width, tab, r1, r2
+%if %1 <= 4
+    movq              %3, [%2]                                              ; load data from source2
+%elif %1 <= 8
+    movdqa            %3, [%2]                                              ; load data from source2
+%elif %1 <= 12
+%if cpuflag(avx2)
+    mova              %3, [%2]
+%else
+    movdqa            %3, [%2]                                              ; load data from source2
+    movq              %4, [%2+16]                                           ; load data from source2
+%endif ;avx
+%elif %1 <= 16
+%if cpuflag(avx2)
+    mova              %3, [%2]
+%else
+    movdqa            %3, [%2]                                              ; load data from source2
+    movdqa            %4, [%2+16]                                           ; load data from source2
+%endif ; avx
+%else ; %1 = 32
+    mova              %3, [%2]
+    mova              %4, [%2+32]
 %endif
-    psllw      m1, shift
-    STORE      [dstq + j * 2 * blocksize], m1
-
-%assign j (j + 1)
-%endrep
-
-    add       dstq, dststrideq
-    add       srcq, srcstrideq
-
-%assign i (i + 1)
-%endrep
-
-    dec heightd
-    jg .loop
-    RET
 %endmacro
 
-INIT_XMM sse2
-GET_PIXELS 4,  8, 1
-GET_PIXELS 8,  8, 1
-GET_PIXELS 12, 8, 3
-GET_PIXELS 16, 8, 2
-GET_PIXELS 24, 8, 3
-GET_PIXELS 32, 8, 3
-GET_PIXELS 48, 8, 3
-GET_PIXELS 64, 8, 3
-
-GET_PIXELS 4,  10, 1
-GET_PIXELS 8,  10, 1
-GET_PIXELS 12, 10, 3
-GET_PIXELS 16, 10, 2
-GET_PIXELS 24, 10, 3
-GET_PIXELS 32, 10, 3
-GET_PIXELS 48, 10, 3
-GET_PIXELS 64, 10, 3
-
-; hevc_qpel_h/v_<w>_8(int16_t *dst, ptrdiff_t dststride,
-;                     uint8_t *src, ptrdiff_t srcstride,
-;                     int height, int mx, int my, int *mcbuffer)
-
-; 8-bit qpel interpolation
-; %1: block width
-; %2: 0 - horizontal; 1 - vertical
-%macro QPEL_8 2
-%if %2
-    %define postfix    v
-    %define mvfrac     myq
-    %define coeffsaddr r5q
-    %define pixstride  srcstrideq
-    %define pixstride3 r5q
-    %define src_m3     r6q
-%else
-    %define postfix    h
-    %define mvfrac     mxq
-    %define coeffsaddr r6q
-    %define pixstride  1
-    %define pixstride3 3
-    %define src_m3     (srcq - 3)
-%endif
-
-    COMMON_DEFS %1, 8
-
-cglobal hevc_qpel_ %+ postfix %+ _ %+ %1 %+ _8, 7, 7, 7, dst, dststride, src, srcstride, height, mx, my
-    and       mvfrac, 0x3
-    dec       mvfrac
-    shl       mvfrac, 4
-    lea       coeffsaddr, [hevc_qpel_coeffs8]
-    mova      m0,         [coeffsaddr + mvfrac]
-
-    SPLATW    m1, m0, 1
-    SPLATW    m2, m0, 2
-    SPLATW    m3, m0, 3
-    SPLATW    m0, m0, 0
-
-%if %2
-    lea       pixstride3, [srcstrideq + 2 * srcstrideq]
-    mov       src_m3, srcq
-    sub       src_m3, pixstride3
-%endif
-
-.loop
-
-%assign i 0
-%rep nb_blocks
-
-    BLOCK_DEFS i
-
-    LOAD m4, [src_m3 + i * blocksize]
-    LOAD m5, [src_m3 + i * blocksize + 1 * pixstride]
-    punpcklbw m4, m5
-    pmaddubsw m4, m0
-
-    LOAD m5, [src_m3 + i * blocksize + 2 * pixstride]
-    LOAD m6, [srcq   + i * blocksize]
-    punpcklbw m5, m6
-    pmaddubsw m5, m1
-    paddsw    m4, m5
-
-    LOAD m5, [srcq + i * blocksize + 1 * pixstride]
-    LOAD m6, [srcq + i * blocksize + 2 * pixstride]
-    punpcklbw m5, m6
-    pmaddubsw m5, m2
-    paddsw    m4, m5
-
-    LOAD m5, [srcq + i * blocksize +     pixstride3]
-    LOAD m6, [srcq + i * blocksize + 4 * pixstride]
-    punpcklbw m5, m6
-    pmaddubsw m5, m3
-    paddsw    m4, m5
-
-    STORE [dstq + i * 2 * blocksize], m4
-
-%assign i (i + 1)
-%endrep
-
-    add       dstq,   dststrideq
-    add       srcq,   srcstrideq
-%if %2
-    add       src_m3, srcstrideq
-%endif
-
-    dec heightd
-    jg .loop
-    RET
+%macro SIMPLE_LOAD 4    ;width, bitd, tab, r1
+%if %1 == 2 || (%2 == 8 && %1 <= 4)
+    movd              %4, [%3]                                               ; load data from source
+%elif %1 == 4 || (%2 == 8 && %1 <= 8)
+    movq              %4, [%3]                                               ; load data from source
+%elif notcpuflag(avx)
+    movu              %4, [%3]                                               ; load data from source
+%elif %1 <= 8 || (%2 == 8 && %1 <= 16)
+    movdqu           %4, [%3]
+%else
+    movu              %4, [%3]
+%endif
 %endmacro
 
-INIT_XMM ssse3
-QPEL_8 4,  0
-QPEL_8 8,  0
-QPEL_8 12, 0
-QPEL_8 16, 0
-QPEL_8 24, 0
-QPEL_8 32, 0
-QPEL_8 48, 0
-QPEL_8 64, 0
-
-QPEL_8 4,  1
-QPEL_8 8,  1
-QPEL_8 12, 1
-QPEL_8 16, 1
-QPEL_8 24, 1
-QPEL_8 32, 1
-QPEL_8 48, 1
-QPEL_8 64, 1
-
-; 16-bit qpel interpolation
-; %1: block width
-; %2: shift applied to the result
-; %3: 0 - horizontal; 1 - vertical
-%macro QPEL_16 3
-%if %3
-    %define mvfrac     myq
-    %define pixstride  srcstrideq
-    %define pixstride3 sstride3q
-    %define src_m3     srcm3q
-%else
-    %define mvfrac     mxq
-    %define pixstride  2
-    %define pixstride3 6
-    %define src_m3     (srcq - 6)
-%endif
-
-    COMMON_DEFS %1, 16
-
-    and       mvfrac, 0x3
-    dec       mvfrac
-    shl       mvfrac, 4
-    lea       coeffsregq, [hevc_qpel_coeffs]
-    mova      m0,         [coeffsregq + mvfrac]
-
-    pshufd    m1, m0, 0x55
-    pshufd    m2, m0, 0xaa
-    pshufd    m3, m0, 0xff
-    pshufd    m0, m0, 0x00
-
-%if %3
-    lea       sstride3q, [srcstrideq + 2 * srcstrideq]
-    mov       srcm3q, srcq
-    sub       srcm3q, sstride3q
-%endif
-
-.loop
-
-%assign i 0
-%rep nb_blocks
-
-    BLOCK_DEFS i
-
-    LOAD m4,  [src_m3 + i * 2 * blocksize]
-    LOAD m5,  [src_m3 + i * 2 * blocksize + 1 * pixstride]
-    LOAD m6,  [src_m3 + i * 2 * blocksize + 2 * pixstride]
-    LOAD m7,  [srcq   + i * 2 * blocksize + 0 * pixstride]
-    LOAD m8,  [srcq   + i * 2 * blocksize + 1 * pixstride]
-    LOAD m9,  [srcq   + i * 2 * blocksize + 2 * pixstride]
-    LOAD m10, [srcq   + i * 2 * blocksize +     pixstride3]
-    LOAD m11, [srcq   + i * 2 * blocksize + 4 * pixstride]
-
-    punpcklwd m12, m4, m5
-    pmaddwd   m12, m0
-
-    punpcklwd m13, m6, m7
-    pmaddwd   m13, m1
-    paddd     m12, m13
-
-    punpcklwd m13, m8, m9
-    pmaddwd   m13, m2
-    paddd     m12, m13
-
-    punpcklwd m13, m10, m11
-    pmaddwd   m13, m3
-    paddd     m12, m13
-    psrad     m12, %2
-
-    %if block_truncated == 0
-        punpckhwd m4, m5
-        pmaddwd   m4, m0
-
-        punpckhwd m6, m7
-        pmaddwd   m6, m1
-        paddd     m4, m6
-
-        punpckhwd m8, m9
-        pmaddwd   m8, m2
-        paddd     m4, m8
-
-        punpckhwd m10, m11
-        pmaddwd   m10, m3
-        paddd     m4, m10
-
-        psrad     m4, %2
-    %endif
-    packssdw  m12, m4
-    STORE [dstq + i * 2 * blocksize], m12
-
-%assign i (i + 1)
-%endrep
-
-    add       dstq,   dststrideq
-    add       srcq,   srcstrideq
-%if %3
-    add       srcm3q, srcstrideq
-%endif
 
-    dec heightd
-    jg .loop
-    RET
+%macro EPEL_FILTER 5 ; bit depth, filter index, xmma, xmmb, gprtmp
+%if cpuflag(avx2)
+%assign %%offset 32
+%ifdef PIC
+    lea              %5q, [hevc_epel_filters_avx2_%1]
+    %define FILTER %5q
+%else
+    %define FILTER hevc_epel_filters_avx2_%1
+%endif
+%else
+%assign %%offset 16
+%ifdef PIC
+    lea              %5q, [hevc_epel_filters_sse4_%1]
+    %define FILTER %5q
+%else
+    %define FILTER hevc_epel_filters_sse4_%1
+%endif
+%endif ;cpuflag(avx2)
+    sub              %2q, 1
+%if cpuflag(avx2)
+    shl              %2q, 6                      ; multiply by 64
+  %else
+    shl              %2q, 5                      ; multiply by 32
+%endif
+    mova           %3, [FILTER + %2q]        ; get 2 first values of filters
+    mova           %4, [FILTER + %2q+%%offset]     ; get 2 last values of filters
 %endmacro
 
-%if ARCH_X86_64
-
-%macro QPEL_H_10 1
-cglobal hevc_qpel_h_ %+ %1 %+ _10, 7, 9, 14, dst, dststride, src, srcstride, height, mx, my, mcbuffer, coeffsreg
-QPEL_16 %1, 2, 0
-%endmacro
-
-INIT_XMM avx
-QPEL_H_10 4
-QPEL_H_10 8
-QPEL_H_10 12
-QPEL_H_10 16
-QPEL_H_10 24
-QPEL_H_10 32
-QPEL_H_10 48
-QPEL_H_10 64
-
-%macro QPEL_V_10 1
-cglobal hevc_qpel_v_ %+ %1 %+ _10, 7, 10, 14, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3, coeffsreg
-QPEL_16 %1, 2, 1
-%endmacro
-
-INIT_XMM avx
-QPEL_V_10 4
-QPEL_V_10 8
-QPEL_V_10 12
-QPEL_V_10 16
-QPEL_V_10 24
-QPEL_V_10 32
-QPEL_V_10 48
-QPEL_V_10 64
-
-; hevc_qpel_hv_<w>(int16_t *dst, ptrdiff_t dststride,
-;                  uint8_t *src, ptrdiff_t srcstride,
-;                  int height, int mx, int my, int *mcbuffer)
-
-%macro QPEL_HV 1
-cglobal hevc_qpel_hv_ %+ %1, 7, 10, 14, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3, coeffsreg
-QPEL_16 %1, 6, 1
-%endmacro
-
-INIT_XMM avx
-QPEL_HV 4
-QPEL_HV 8
-QPEL_HV 12
-QPEL_HV 16
-QPEL_HV 24
-QPEL_HV 32
-QPEL_HV 48
-QPEL_HV 64
+%macro EPEL_HV_FILTER 1 ; bit depth (selects the table used for the mx taps)
+%if cpuflag(avx2)
+%assign %%offset 32
+%assign %%shift  6
+%define %%table  hevc_epel_filters_avx2_%1
+%else
+%assign %%offset 16
+%assign %%shift  5
+%define %%table  hevc_epel_filters_sse4_%1
+%endif
 
-%endif ; ARCH_X86_64
+%ifdef PIC
+    lea           r3srcq, [%%table]
+    %define FILTER r3srcq
+%else
+    %define FILTER %%table
+%endif
+    sub              mxq, 1
+    sub              myq, 1
+    shl              mxq, %%shift                ; byte offset of the filter: 32 (SSE4) or 64 (AVX2) bytes each
+    shl              myq, %%shift                ; same scaling for the my index
+    mova             m14, [FILTER + mxq]        ; mx filter: first pair of taps
+    mova             m15, [FILTER + mxq+%%offset]     ; mx filter: last pair of taps
+
+%if cpuflag(avx2)
+%define %%table  hevc_epel_filters_avx2_10
+%else
+%define %%table  hevc_epel_filters_sse4_10
+%endif
+%ifdef PIC
+    lea           r3srcq, [%%table]
+    %define FILTER r3srcq
+%else
+    %define FILTER %%table
+%endif
+    mova             m12, [FILTER + myq]        ; my filter: first pair of taps, always from the word-sized _10 table
+    mova             m13, [FILTER + myq+%%offset]     ; my filter: last pair of taps
+    lea           r3srcq, [srcstrideq*3]         ; r3src is reused: from here on it holds 3*srcstride
+%endmacro
 
-; hevc_epel_h/v_<w>_8(int16_t *dst, ptrdiff_t dststride,
-;                     uint8_t *src, ptrdiff_t srcstride,
-;                     int height, int mx, int my, int *mcbuffer)
+%macro QPEL_FILTER 2 ; bit depth, filter index (register name without size suffix)
 
-; 8-bit epel interpolation
-; %1: block width
-; %2: 0 - horizontal; 1 - vertical
-%macro EPEL_8 2
-%if %2
-    %define postfix    v
-    %define mvfrac     myq
-    %define coeffsaddr r5q
-    %define pixstride  srcstrideq
-    %define pixstride3 r5q
+%if cpuflag(avx2)
+%assign %%offset 32
+%assign %%shift  7
+%define %%table  hevc_qpel_filters_avx2_%1
 %else
-    %define postfix    h
-    %define mvfrac     mxq
-    %define coeffsaddr r6q
-    %define pixstride  1
-    %define pixstride3 3
+%assign %%offset 16
+%assign %%shift  6
+%define %%table  hevc_qpel_filters_sse4_%1
 %endif
 
-    COMMON_DEFS %1, 8
-
-cglobal hevc_epel_ %+ postfix %+ _ %+ %1 %+ _8, 7, 7, 6, dst, dststride, src, srcstride, height, mx, my
-    and       mvfrac, 0x7
-    dec       mvfrac
-    shl       mvfrac, 4
-    lea       coeffsaddr, [hevc_epel_coeffs8]
-    movq      m0,         [coeffsaddr + mvfrac]
+%ifdef PIC
+    lea         rfilterq, [%%table]
+%else
+    %define rfilterq %%table
+%endif
+    sub              %2q, 1
+    shl              %2q, %%shift                        ; byte offset of the filter: 64 (SSE4) or 128 (AVX2) bytes each
+    mova             m12, [rfilterq + %2q]               ; table row 0 of the filter: taps 1-2
+    mova             m13, [rfilterq + %2q +   %%offset]  ; table row 1 of the filter: taps 3-4
+    mova             m14, [rfilterq + %2q + 2*%%offset]  ; table row 2 of the filter: taps 5-6
+    mova             m15, [rfilterq + %2q + 3*%%offset]  ; table row 3 of the filter: taps 7-8
+%endmacro
 
-    SPLATW    m1, m0, 1
-    SPLATW    m0, m0, 0
+%macro EPEL_LOAD 4
+%if (%1 == 8 && %4 <= 4)
+%define %%load movd
+%elif (%1 == 8 && %4 <= 8) || (%1 > 8 && %4 <= 4)
+%define %%load movq
+%else
+%define %%load movdqu
+%endif
 
-%if %2
-    lea       pixstride3, [srcstrideq + 2 * srcstrideq]
+    %%load            m0, [%2q ]
+%ifnum %3
+    %%load            m1, [%2q+  %3]
+    %%load            m2, [%2q+2*%3]
+    %%load            m3, [%2q+3*%3]
+%else
+    %%load            m1, [%2q+  %3q]
+    %%load            m2, [%2q+2*%3q]
+    %%load            m3, [%2q+r3srcq]
+%endif
+%if %1 == 8
+%if %4 > 8
+    SBUTTERFLY        bw, 0, 1, 7
+    SBUTTERFLY        bw, 2, 3, 7
+%else
+    punpcklbw         m0, m1
+    punpcklbw         m2, m3
+%endif
+%else
+%if %4 > 4
+    SBUTTERFLY        wd, 0, 1, 7
+    SBUTTERFLY        wd, 2, 3, 7
+%else
+    punpcklwd         m0, m1
+    punpcklwd         m2, m3
+%endif
 %endif
-    sub       srcq, pixstride
+%endmacro
 
-.loop
 
-%assign i 0
-%rep nb_blocks
+%macro QPEL_H_LOAD 4
+%assign %%stride (%1+7)/8
+%if %1 == 8
+%if %3 <= 4
+%define %%load movd
+%elif %3 == 8
+%define %%load movq
+%else
+%define %%load movu
+%endif
+%else
+%if %3 == 2
+%define %%load movd
+%elif %3 == 4
+%define %%load movq
+%else
+%define %%load movu
+%endif
+%endif
+    %%load            m0, [%2-3*%%stride]        ;load data from source
+    %%load            m1, [%2-2*%%stride]
+    %%load            m2, [%2-%%stride  ]
+    %%load            m3, [%2           ]
+    %%load            m4, [%2+%%stride  ]
+    %%load            m5, [%2+2*%%stride]
+    %%load            m6, [%2+3*%%stride]
+    %%load            m7, [%2+4*%%stride]
+
+%if %1 == 8
+%if %3 > 8
+    SBUTTERFLY        wd, 0, 1, %4
+    SBUTTERFLY        wd, 2, 3, %4
+    SBUTTERFLY        wd, 4, 5, %4
+    SBUTTERFLY        wd, 6, 7, %4
+%else
+    punpcklbw         m0, m1
+    punpcklbw         m2, m3
+    punpcklbw         m4, m5
+    punpcklbw         m6, m7
+%endif
+%else
+%if %3 > 4
+    SBUTTERFLY        dq, 0, 1, %4
+    SBUTTERFLY        dq, 2, 3, %4
+    SBUTTERFLY        dq, 4, 5, %4
+    SBUTTERFLY        dq, 6, 7, %4
+%else
+    punpcklwd         m0, m1
+    punpcklwd         m2, m3
+    punpcklwd         m4, m5
+    punpcklwd         m6, m7
+%endif
+%endif
+%endmacro
 
-    BLOCK_DEFS i
+%macro QPEL_V_LOAD 5
+    lea              %5q, [%2]
+    sub              %5q, r3srcq
+    movu              m0, [%5q            ]      ;load x- 3*srcstride
+    movu              m1, [%5q+   %3q     ]      ;load x- 2*srcstride
+    movu              m2, [%5q+ 2*%3q     ]      ;load x-srcstride
+    movu              m3, [%2       ]      ;load x
+    movu              m4, [%2+   %3q]      ;load x+stride
+    movu              m5, [%2+ 2*%3q]      ;load x+2*stride
+    movu              m6, [%2+r3srcq]      ;load x+3*stride
+    movu              m7, [%2+ 4*%3q]      ;load x+4*stride
+%if %1 == 8
+%if %4 > 8
+    SBUTTERFLY        bw, 0, 1, 8
+    SBUTTERFLY        bw, 2, 3, 8
+    SBUTTERFLY        bw, 4, 5, 8
+    SBUTTERFLY        bw, 6, 7, 8
+%else
+    punpcklbw         m0, m1
+    punpcklbw         m2, m3
+    punpcklbw         m4, m5
+    punpcklbw         m6, m7
+%endif
+%else
+%if %4 > 4
+    SBUTTERFLY        wd, 0, 1, 8
+    SBUTTERFLY        wd, 2, 3, 8
+    SBUTTERFLY        wd, 4, 5, 8
+    SBUTTERFLY        wd, 6, 7, 8
+%else
+    punpcklwd         m0, m1
+    punpcklwd         m2, m3
+    punpcklwd         m4, m5
+    punpcklwd         m6, m7
+%endif
+%endif
+%endmacro
 
-    LOAD m2, [srcq + i * blocksize + 0 * pixstride]
-    LOAD m3, [srcq + i * blocksize + 1 * pixstride]
-    LOAD m4, [srcq + i * blocksize + 2 * pixstride]
-    LOAD m5, [srcq + i * blocksize +     pixstride3]
+%macro PEL_12STORE2 3
+    movd           [%1], %2
+%endmacro
+%macro PEL_12STORE4 3
+    movq           [%1], %2
+%endmacro
+%macro PEL_12STORE6 3
+    movq           [%1], %2
+    psrldq            %2, 8
+    movd         [%1+8], %2
+%endmacro
+%macro PEL_12STORE8 3
+    movdqa         [%1], %2
+%endmacro
+%macro PEL_12STORE12 3
+    movdqa         [%1], %2
+    movq        [%1+16], %3
+%endmacro
+%macro PEL_12STORE16 3
+    PEL_12STORE8      %1, %2, %3
+    movdqa       [%1+16], %3
+%endmacro
 
-    punpcklbw m2, m3
-    punpcklbw m4, m5
+%macro PEL_10STORE2 3
+    movd           [%1], %2
+%endmacro
+%macro PEL_10STORE4 3
+    movq           [%1], %2
+%endmacro
+%macro PEL_10STORE6 3
+    movq           [%1], %2
+    psrldq            %2, 8
+    movd         [%1+8], %2
+%endmacro
+%macro PEL_10STORE8 3
+    movdqa         [%1], %2
+%endmacro
+%macro PEL_10STORE12 3
+    movdqa         [%1], %2
+    movq        [%1+16], %3
+%endmacro
+%macro PEL_10STORE16 3
+%if cpuflag(avx2)
+    movu            [%1], %2
+%else
+    PEL_10STORE8      %1, %2, %3
+    movdqa       [%1+16], %3
+%endif
+%endmacro
 
-    pmaddubsw m2, m0
-    pmaddubsw m4, m1
+%macro PEL_10STORE32 3
+    PEL_10STORE16     %1, %2, %3
+    movu         [%1+32], %3
+%endmacro
 
-    paddsw    m2, m4
+%macro PEL_8STORE2 3
+    pextrw          [%1], %2, 0
+%endmacro
+%macro PEL_8STORE4 3
+    movd            [%1], %2
+%endmacro
+%macro PEL_8STORE6 3
+    movd            [%1], %2
+    pextrw        [%1+4], %2, 2
+%endmacro
+%macro PEL_8STORE8 3
+    movq           [%1], %2
+%endmacro
+%macro PEL_8STORE12 3
+    movq            [%1], %2
+    psrldq            %2, 8
+    movd          [%1+8], %2
+%endmacro
+%macro PEL_8STORE16 3
+%if cpuflag(avx2)
+    movdqu        [%1], %2
+%else
+    mova          [%1], %2
+%endif ; avx
+%endmacro
+%macro PEL_8STORE32 3
+    movu          [%1], %2
+%endmacro
 
-    STORE [dstq + i * 2 * blocksize], m2
+%macro LOOP_END 3 ; dst, src, srcstride (register names without size suffix)
+    add              %1q, 2*MAX_PB_SIZE          ; dst += fixed row stride of 2*MAX_PB_SIZE bytes
+    add              %2q, %3q                    ; src += srcstride
+    dec          heightd                         ; one row done, decrement height
+    jnz               .loop                      ; repeat until height reaches zero
+%endmacro
 
-%assign i (i + 1)
-%endrep
 
-    add       dstq, dststrideq
-    add       srcq, srcstrideq
+%macro MC_PIXEL_COMPUTE 2-3 ; width, bitdepth[, any 3rd argument allows the AVX2 path]
+%if %2 == 8
+%if cpuflag(avx2) && %0 ==3
+%if %1 > 16
+    vextracti128 xm1, m0, 1         ; upper 16 bytes of m0
+    pmovzxbw      m1, xm1           ; zero-extend them to words
+    psllw         m1, 14-%2
+%endif
+    pmovzxbw      m0, xm0           ; zero-extend the lower 16 bytes to words
+%else ; not avx2, or no 3rd argument
+%if %1 > 8
+    punpckhbw     m1, m0, m2        ; high bytes interleaved with m2; the pel_pixels callers clear m2 beforehand
+    psllw         m1, 14-%2
+%endif
+    punpcklbw     m0, m2            ; low bytes interleaved with m2
+%endif ; avx2 path
+%endif ; 8-bit input
+    psllw         m0, 14-%2         ; scale samples by 1 << (14 - bitdepth)
+%endmacro
 
-    dec heightd
-    jg .loop
-    RET
+%macro EPEL_COMPUTE 4-8 ; bitdepth, width, filter1, filter2, HV/m0, m2, m1, m3
+%if %0 == 8
+%define %%reg0 %5
+%define %%reg2 %6
+%define %%reg1 %7
+%define %%reg3 %8
+%else
+%define %%reg0 m0
+%define %%reg2 m2
+%define %%reg1 m1
+%define %%reg3 m3
+%endif
+%if %1 == 8
+%if cpuflag(avx2) && (%0 == 5)
+%if %2 > 16
+    vperm2i128    m10, m0, m1, q0301
+%endif
+    vinserti128    m0, m0, xm1, 1
+    mova           m1, m10
+%if %2 > 16
+    vperm2i128    m10, m2, m3, q0301
+%endif
+    vinserti128    m2, m2, xm3, 1
+    mova           m3, m10
+%endif
+    pmaddubsw      %%reg0, %3   ;x1*c1+x2*c2
+    pmaddubsw      %%reg2, %4   ;x3*c3+x4*c4
+    paddw          %%reg0, %%reg2
+%if %2 > 8
+    pmaddubsw      %%reg1, %3
+    pmaddubsw      %%reg3, %4
+    paddw          %%reg1, %%reg3
+%endif
+%else
+    pmaddwd        %%reg0, %3
+    pmaddwd        %%reg2, %4
+    paddd          %%reg0, %%reg2
+%if %2 > 4
+    pmaddwd        %%reg1, %3
+    pmaddwd        %%reg3, %4
+    paddd          %%reg1, %%reg3
+%if %1 != 8
+    psrad          %%reg1, %1-8
+%endif
+%endif
+%if %1 != 8
+    psrad          %%reg0, %1-8
+%endif
+    packssdw       %%reg0, %%reg1
+%endif
 %endmacro
 
-INIT_XMM ssse3
-EPEL_8 4,  0
-EPEL_8 8,  0
-EPEL_8 12, 0
-EPEL_8 16, 0
-EPEL_8 24, 0
-EPEL_8 32, 0
+%macro QPEL_HV_COMPUTE 4     ; width, bitdepth, filter idx, pack op suffix (forms the final p<suffix> instruction)
 
-EPEL_8 4,  1
-EPEL_8 8,  1
-EPEL_8 12, 1
-EPEL_8 16, 1
-EPEL_8 24, 1
-EPEL_8 32, 1
+%if cpuflag(avx2)
+%assign %%offset 32
+%define %%table  hevc_qpel_filters_avx2_%2
+%else
+%assign %%offset 16
+%define %%table  hevc_qpel_filters_sse4_%2
+%endif
+
+%ifdef PIC
+    lea         rfilterq, [%%table]
+%else
+    %define rfilterq %%table
+%endif
 
-%macro EPEL_16 3
-%if %3
-    %define mvfrac     myq
-    %define pixstride  srcstrideq
-    %define pixstride3 sstride3q
+%if %2 == 8
+    pmaddubsw         m0, [rfilterq + %3q*8   ]   ;x1*c1+x2*c2
+    pmaddubsw         m2, [rfilterq + %3q*8+%%offset]   ;x3*c3+x4*c4
+    pmaddubsw         m4, [rfilterq + %3q*8+2*%%offset]   ;x5*c5+x6*c6
+    pmaddubsw         m6, [rfilterq + %3q*8+3*%%offset]   ;x7*c7+x8*c8
+    paddw             m0, m2
+    paddw             m4, m6
+    paddw             m0, m4                      ; m0 = sum of the four partial products (words)
 %else
-    %define mvfrac     mxq
-    %define pixstride  2
-    %define pixstride3 6
+    pmaddwd           m0, [rfilterq + %3q*8   ]
+    pmaddwd           m2, [rfilterq + %3q*8+%%offset]
+    pmaddwd           m4, [rfilterq + %3q*8+2*%%offset]
+    pmaddwd           m6, [rfilterq + %3q*8+3*%%offset]
+    paddd             m0, m2
+    paddd             m4, m6
+    paddd             m0, m4                      ; m0 = sum of the four partial products (dwords)
+%if %2 != 8
+    psrad             m0, %2-8                    ; shift right by (bitdepth - 8)
 %endif
+%if %1 > 4
+    pmaddwd           m1, [rfilterq + %3q*8   ]
+    pmaddwd           m3, [rfilterq + %3q*8+%%offset]
+    pmaddwd           m5, [rfilterq + %3q*8+2*%%offset]
+    pmaddwd           m7, [rfilterq + %3q*8+3*%%offset]
+    paddd             m1, m3
+    paddd             m5, m7
+    paddd             m1, m5                      ; same accumulation for the odd-numbered registers
+%if %2 != 8
+    psrad             m1, %2-8
+%endif
+%endif
+    p%4               m0, m1                      ; combine m0 and m1 with the instruction named by the 4th argument
+%endif
+%endmacro
+
+%macro QPEL_COMPUTE 2-3     ; width, bitdepth
+%if %2 == 8
+%if cpuflag(avx2) && (%0 == 3)
 
-    COMMON_DEFS %1, 16
+    vperm2i128 m10, m0,  m1, q0301
+    vinserti128 m0, m0, xm1, 1
+    SWAP 1, 10
 
-    and       mvfrac, 0x7
-    dec       mvfrac
-    shl       mvfrac, 5
-    lea       coeffsregq, [hevc_epel_coeffs]
-    mova      m0, [coeffsregq + mvfrac]
+    vperm2i128 m10, m2,  m3, q0301
+    vinserti128 m2, m2, xm3, 1
+    SWAP 3, 10
 
-    pshufd    m1, m0, 0x55
-    pshufd    m0, m0, 0x00
 
-%if %3
-    lea       sstride3q, [srcstrideq + 2 * srcstrideq]
+    vperm2i128 m10, m4,  m5, q0301
+    vinserti128 m4, m4, xm5, 1
+    SWAP 5, 10
+
+    vperm2i128 m10, m6,  m7, q0301
+    vinserti128 m6, m6, xm7, 1
+    SWAP 7, 10
 %endif
-    sub       srcq, pixstride
 
-.loop
+    pmaddubsw         m0, m12   ;x1*c1+x2*c2
+    pmaddubsw         m2, m13   ;x3*c3+x4*c4
+    pmaddubsw         m4, m14   ;x5*c5+x6*c6
+    pmaddubsw         m6, m15   ;x7*c7+x8*c8
+    paddw             m0, m2
+    paddw             m4, m6
+    paddw             m0, m4
+%if %1 > 8
+    pmaddubsw         m1, m12
+    pmaddubsw         m3, m13
+    pmaddubsw         m5, m14
+    pmaddubsw         m7, m15
+    paddw             m1, m3
+    paddw             m5, m7
+    paddw             m1, m5
+%endif
+%else
+    pmaddwd           m0, m12
+    pmaddwd           m2, m13
+    pmaddwd           m4, m14
+    pmaddwd           m6, m15
+    paddd             m0, m2
+    paddd             m4, m6
+    paddd             m0, m4
+%if %2 != 8
+    psrad             m0, %2-8
+%endif
+%if %1 > 4
+    pmaddwd           m1, m12
+    pmaddwd           m3, m13
+    pmaddwd           m5, m14
+    pmaddwd           m7, m15
+    paddd             m1, m3
+    paddd             m5, m7
+    paddd             m1, m5
+%if %2 != 8
+    psrad             m1, %2-8
+%endif
+%endif
+%endif
+%endmacro
 
-%assign i 0
-%rep nb_blocks
+%macro BI_COMPUTE 7-8     ; width, bitd, src1l, src1h, src2l, src2h, pw[, flag: reorder qwords on AVX2 8-bit]
+    paddsw            %3, %5                 ; saturating word add of src1 and src2 (low part)
+%if %1 > 8
+    paddsw            %4, %6                 ; same for the high part
+%endif
+    UNI_COMPUTE       %1, %2, %3, %4, %7
+%if %0 == 8 && cpuflag(avx2) && (%2 == 8)
+    vpermq            %3, %3, 216            ; 216 = 0xD8 selects qwords in the order 0,2,1,3
+    vpermq            %4, %4, 216
+%endif
+%endmacro
 
-    BLOCK_DEFS i
+%macro UNI_COMPUTE 5 ; width, bitd, srcl, srch, pw (argument order as passed by BI_COMPUTE)
+    pmulhrsw          %3, %5                 ; rounded high multiply by the pw constant
+%if %1 > 8 || (%2 > 8 && %1 > 4)
+    pmulhrsw          %4, %5
+%endif
+%if %2 == 8
+    packuswb          %3, %4                 ; 8-bit: pack words to bytes with unsigned saturation
+%else
+    CLIPW             %3, [pb_0], [max_pixels_%2]   ; above 8-bit: CLIPW (x86util) with bounds pb_0 and max_pixels
+%if (%1 > 8 && notcpuflag(avx)) || %1 > 16
+    CLIPW             %4, [pb_0], [max_pixels_%2]
+%endif
+%endif
+%endmacro
 
-    LOAD m2, [srcq + i * 2 * blocksize + 0 * pixstride]
-    LOAD m3, [srcq + i * 2 * blocksize + 1 * pixstride]
-    LOAD m4, [srcq + i * 2 * blocksize + 2 * pixstride]
-    LOAD m5, [srcq + i * 2 * blocksize +     pixstride3]
 
-    punpcklwd m6, m2, m3
-    punpcklwd m7, m4, m5
-    pmaddwd   m6, m0
-    pmaddwd   m7, m1
-    paddd     m6, m7
-    psrad     m6, %2
+; ******************************
+; void put_hevc_mc_pixels(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride,
+;                         int height, int mx, int my)
+; dst has no stride argument: rows are 2*MAX_PB_SIZE bytes apart (see LOOP_END)
+; ******************************
 
-    %if block_truncated == 0
-        punpckhwd m2, m3
-        punpckhwd m4, m5
-        pmaddwd   m2, m0
-        pmaddwd   m4, m1
-        paddd     m2, m4
-        psrad     m2, %2
-    %endif
-    packssdw  m6, m2
-    STORE [dstq + i * 2 * blocksize], m6
+%macro HEVC_PUT_HEVC_PEL_PIXELS 2
+HEVC_PEL_PIXELS     %1, %2
+HEVC_UNI_PEL_PIXELS %1, %2
+HEVC_BI_PEL_PIXELS  %1, %2
+%endmacro
 
-%assign i (i + 1)
-%endrep
+%macro HEVC_PEL_PIXELS 2 ; width, bitdepth
+cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride,height
+    pxor               m2, m2                    ; zero register used by MC_PIXEL_COMPUTE when unpacking bytes
+.loop:
+    SIMPLE_LOAD       %1, %2, srcq, m0
+    MC_PIXEL_COMPUTE  %1, %2, 1
+    PEL_10STORE%1     dstq, m0, m1               ; the PEL_10STORE macros are used for every bitdepth here
+    LOOP_END         dst, src, srcstride
+    RET
+ %endmacro
 
-    add       dstq,   dststrideq
-    add       srcq,   srcstrideq
+%macro HEVC_UNI_PEL_PIXELS 2 ; width, bitdepth
+cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride,height
+.loop:
+    SIMPLE_LOAD       %1, %2, srcq, m0
+    PEL_%2STORE%1   dstq, m0, m1                 ; stored as loaded, no arithmetic in between
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; one row done, decrement height
+    jnz               .loop                      ; repeat until height reaches zero
+    RET
+%endmacro
 
-    dec heightd
-    jg .loop
+%macro HEVC_BI_PEL_PIXELS 2
+cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
+    pxor              m2, m2
+    movdqa            m5, [pw_bi_%2]
+.loop:
+    SIMPLE_LOAD       %1, %2, srcq, m0
+    SIMPLE_BILOAD     %1, src2q, m3, m4
+    MC_PIXEL_COMPUTE  %1, %2, 1
+    BI_COMPUTE        %1, %2, m0, m1, m3, m4, m5, 1
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
     RET
 %endmacro
 
-%if ARCH_X86_64
 
-%macro EPEL_H_10 1
-cglobal hevc_epel_h_ %+ %1 %+ _10, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
-EPEL_16 %1, 2, 0
-%endmacro
+; ******************************
+; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride,
+;                       uint8_t *_src, ptrdiff_t _srcstride,
+;                       int height, int mx, int my, int width);
+; ******************************
 
-INIT_XMM avx
-EPEL_H_10 4
-EPEL_H_10 8
-EPEL_H_10 12
-EPEL_H_10 16
-EPEL_H_10 24
-EPEL_H_10 32
 
-%macro EPEL_V_10 1
-cglobal hevc_epel_v_ %+ %1 %+ _10, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
-EPEL_16 %1, 2, 1
-%endmacro
+%macro HEVC_PUT_HEVC_EPEL 2
+%if cpuflag(avx2)
+%define XMM_REGS  11
+%else
+%define XMM_REGS  8
+%endif
 
-INIT_XMM avx
-EPEL_V_10 4
-EPEL_V_10 8
-EPEL_V_10 12
-EPEL_V_10 16
-EPEL_V_10 24
-EPEL_V_10 32
+cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, XMM_REGS, dst, src, srcstride, height, mx, rfilter
+%assign %%stride ((%2 + 7)/8)
+    EPEL_FILTER       %2, mx, m4, m5, rfilter
+.loop:
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m4, m5, 1
+    PEL_10STORE%1      dstq, m0, m1
+    LOOP_END         dst, src, srcstride
+    RET
 
-; hevc_epel_hv_<w>_8(int16_t *dst, ptrdiff_t dststride,
-;                    int16_t *src, ptrdiff_t srcstride,
-;                    int height, int mx, int my, int *mcbuffer)
+cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, XMM_REGS, dst, dststride, src, srcstride, height, mx, rfilter
+%assign %%stride ((%2 + 7)/8)
+    movdqa            m6, [pw_%2]
+    EPEL_FILTER       %2, mx, m4, m5, rfilter
+.loop:
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m4, m5
+    UNI_COMPUTE       %1, %2, m0, m1, m6
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
 
-%macro EPEL_HV 1
-cglobal hevc_epel_hv_ %+ %1, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
-EPEL_16 %1, 6, 1
-%endmacro
+cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, mx, rfilter
+    movdqa            m6, [pw_bi_%2]
+    EPEL_FILTER       %2, mx, m4, m5, rfilter
+.loop:
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m4, m5, 1
+    SIMPLE_BILOAD     %1, src2q, m2, m3
+    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6, 1
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
 
-INIT_XMM avx
-EPEL_HV 4
-EPEL_HV 8
-EPEL_HV 12
-EPEL_HV 16
-EPEL_HV 24
-EPEL_HV 32
+; ******************************
+; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
+;                      uint8_t *_src, ptrdiff_t _srcstride,
+;                      int height, int mx, int my, int width)
+; ******************************
+
+cglobal hevc_put_hevc_epel_v%1_%2, 4, 6, XMM_REGS, dst, src, srcstride, height, r3src, my
+    movifnidn        myd, mym
+    sub             srcq, srcstrideq
+    EPEL_FILTER       %2, my, m4, m5, r3src
+    lea           r3srcq, [srcstrideq*3]
+.loop:
+    EPEL_LOAD         %2, srcq, srcstride, %1
+    EPEL_COMPUTE      %2, %1, m4, m5, 1
+    PEL_10STORE%1     dstq, m0, m1
+    LOOP_END          dst, src, srcstride
+    RET
 
-%endif ; ARCH_X86_64
+cglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 7, XMM_REGS, dst, dststride, src, srcstride, height, r3src, my
+    movifnidn        myd, mym
+    movdqa            m6, [pw_%2]
+    sub             srcq, srcstrideq
+    EPEL_FILTER       %2, my, m4, m5, r3src
+    lea           r3srcq, [srcstrideq*3]
+.loop:
+    EPEL_LOAD         %2, srcq, srcstride, %1
+    EPEL_COMPUTE      %2, %1, m4, m5
+    UNI_COMPUTE       %1, %2, m0, m1, m6
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
 
-; hevc_put_unweighted_pred_<w>_<d>(pixel   *dst, ptrdiff_t dststride,
-;                                  int16_t *src, ptrdiff_t srcstride,
-;                                  int height)
 
-%macro AVG 5
-    %if %3
-        %if %4 == 4
-            movq %5, %2
-            paddsw %1, %5
-        %else
-            paddsw %1, %2
-        %endif
-    %endif
+cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, r3src, my
+    movifnidn        myd, mym
+    movdqa            m6, [pw_bi_%2]
+    sub             srcq, srcstrideq
+    EPEL_FILTER       %2, my, m4, m5, r3src
+    lea           r3srcq, [srcstrideq*3]
+.loop:
+    EPEL_LOAD         %2, srcq, srcstride, %1
+    EPEL_COMPUTE      %2, %1, m4, m5, 1
+    SIMPLE_BILOAD     %1, src2q, m2, m3
+    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6, 1
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
 %endmacro
 
-; %1: 0 - one source; 1 - two sources
-; %2: width
-; %3: bit depth
-%macro PUT_PRED 3
-%if %1
-cglobal hevc_put_unweighted_pred_avg_ %+ %2 %+ _ %+ %3, 6, 6, 4, dst, dststride, src, src2, srcstride, height
+
+; ******************************
+; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride,
+;                       uint8_t *_src, ptrdiff_t _srcstride,
+;                       int height, int mx, int my, int width)
+; ******************************
+
+%macro HEVC_PUT_HEVC_EPEL_HV 2
+cglobal hevc_put_hevc_epel_hv%1_%2, 6, 7, 16 , dst, src, srcstride, height, mx, my, r3src
+%assign %%stride ((%2 + 7)/8)
+    sub             srcq, srcstrideq
+    EPEL_HV_FILTER    %2
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP              m8, m1
+%endif
+    SWAP              m4, m0
+    add             srcq, srcstrideq
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP              m9, m1
+%endif
+    SWAP              m5, m0
+    add             srcq, srcstrideq
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP             m10, m1
+%endif
+    SWAP              m6, m0
+    add             srcq, srcstrideq
+.loop:
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP             m11, m1
+%endif
+    SWAP              m7, m0
+    punpcklwd         m0, m4, m5
+    punpcklwd         m2, m6, m7
+%if %1 > 4
+    punpckhwd         m1, m4, m5
+    punpckhwd         m3, m6, m7
+%endif
+    EPEL_COMPUTE      14, %1, m12, m13
+%if (%1 > 8 && (%2 == 8))
+    punpcklwd         m4, m8, m9
+    punpcklwd         m2, m10, m11
+    punpckhwd         m8, m8, m9
+    punpckhwd         m3, m10, m11
+    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
+%if cpuflag(avx2)
+    vinserti128       m2, m0, xm4, 1
+    vperm2i128        m3, m0, m4, q0301
+    PEL_10STORE%1     dstq, m2, m3
 %else
-cglobal hevc_put_unweighted_pred_ %+ %2 %+ _ %+ %3, 5, 5, 4, dst, dststride, src, srcstride, height
+    PEL_10STORE%1     dstq, m0, m4
 %endif
+%else
+    PEL_10STORE%1     dstq, m0, m1
+%endif
+    movdqa            m4, m5
+    movdqa            m5, m6
+    movdqa            m6, m7
+%if (%1 > 8 && (%2 == 8))
+    mova              m8, m9
+    mova              m9, m10
+    mova             m10, m11
+%endif
+    LOOP_END         dst, src, srcstride
+    RET
 
-%assign shift       14 + %1 - %3
-%assign offset      (1 << (shift - 1))
-%define offset_data pw_ %+ offset
-
-    mova        m0, [offset_data]
+cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, height, mx, my, r3src
+%assign %%stride ((%2 + 7)/8)
+    sub             srcq, srcstrideq
+    EPEL_HV_FILTER    %2
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP              m8, m1
+%endif
+    SWAP              m4, m0
+    add             srcq, srcstrideq
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP              m9, m1
+%endif
+    SWAP              m5, m0
+    add             srcq, srcstrideq
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP             m10, m1
+%endif
+    SWAP              m6, m0
+    add             srcq, srcstrideq
+.loop:
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP             m11, m1
+%endif
+    mova              m7, m0
+    punpcklwd         m0, m4, m5
+    punpcklwd         m2, m6, m7
+%if %1 > 4
+    punpckhwd         m1, m4, m5
+    punpckhwd         m3, m6, m7
+%endif
+    EPEL_COMPUTE      14, %1, m12, m13
+%if (%1 > 8 && (%2 == 8))
+    punpcklwd         m4, m8, m9
+    punpcklwd         m2, m10, m11
+    punpckhwd         m8, m8, m9
+    punpckhwd         m3, m10, m11
+    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
+    UNI_COMPUTE       %1, %2, m0, m4, [pw_%2]
+%else
+    UNI_COMPUTE       %1, %2, m0, m1, [pw_%2]
+%endif
+    PEL_%2STORE%1   dstq, m0, m1
+    mova              m4, m5
+    mova              m5, m6
+    mova              m6, m7
+%if (%1 > 8 && (%2 == 8))
+    mova              m8, m9
+    mova              m9, m10
+    mova             m10, m11
+%endif
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
 
-%if %3 > 8
-    %define STORE_BLOCK movu
-    %define STORE_HALF  movq
-
-    %assign pixel_max ((1 << %3) - 1)
-    %define pw_pixel_max pw_ %+ pixel_max
-    pxor    m1, m1
-    mova    m2, [pw_pixel_max]
-%else
-    %define STORE_BLOCK movq
-    %define STORE_HALF  movd
-%endif
-
-.loop
-%assign i 0
-%rep (%2 + 7) / 8
-
-    %if (i + 1) * 8 > %2
-        %define LOAD movq
-        %define STORE STORE_HALF
-    %else
-        %define LOAD mova
-        %define STORE STORE_BLOCK
-    %endif
-
-    LOAD m3, [srcq  + 16 * i]
-    AVG  m3, [src2q + 16 * i], %1, %3 - i * 8, m4
-
-    paddsw m3, m0
-    psraw  m3, shift
-
-    %if %3 == 8
-        packuswb m3, m3
-        STORE [dstq + 8 * i], m3
-    %else
-        CLIPW m3, m1, m2
-        STORE [dstq + 16 * i], m3
-    %endif
-%assign i (i + 1)
-%endrep
-
-    add dstq,  dststrideq
-    add srcq,  srcstrideq
-%if %1
-    add src2q, srcstrideq
-%endif
-
-    dec         heightd
-    jg          .loop
+cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 9, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src
+%assign %%stride ((%2 + 7)/8)
+    sub             srcq, srcstrideq
+    EPEL_HV_FILTER    %2
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP              m8, m1
+%endif
+    SWAP              m4, m0
+    add             srcq, srcstrideq
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP              m9, m1
+%endif
+    SWAP              m5, m0
+    add             srcq, srcstrideq
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP             m10, m1
+%endif
+    SWAP              m6, m0
+    add             srcq, srcstrideq
+.loop:
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP             m11, m1
+%endif
+    SWAP              m7, m0
+    punpcklwd         m0, m4, m5
+    punpcklwd         m2, m6, m7
+%if %1 > 4
+    punpckhwd         m1, m4, m5
+    punpckhwd         m3, m6, m7
+%endif
+    EPEL_COMPUTE      14, %1, m12, m13
+%if (%1 > 8 && (%2 == 8))
+    punpcklwd         m4, m8, m9
+    punpcklwd         m2, m10, m11
+    punpckhwd         m8, m8, m9
+    punpckhwd         m3, m10, m11
+    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
+    SIMPLE_BILOAD     %1, src2q, m8, m3
+%if cpuflag(avx2)
+    vinserti128       m1, m8, xm3, 1
+    vperm2i128        m2, m8, m3, q0301
+    BI_COMPUTE        %1, %2, m0, m4, m1, m2, [pw_bi_%2]
+%else
+    BI_COMPUTE        %1, %2, m0, m4, m8, m3, [pw_bi_%2]
+%endif
+%else
+    SIMPLE_BILOAD     %1, src2q, m8, m9
+    BI_COMPUTE        %1, %2, m0, m1, m8, m9, [pw_bi_%2]
+%endif
+    PEL_%2STORE%1   dstq, m0, m4
+    mova              m4, m5
+    mova              m5, m6
+    mova              m6, m7
+%if (%1 > 8 && (%2 == 8))
+    mova              m8, m9
+    mova              m9, m10
+    mova             m10, m11
+%endif
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
     RET
 %endmacro
 
-INIT_XMM sse2
-PUT_PRED 0, 4,  8
-PUT_PRED 1, 4,  8
-PUT_PRED 0, 8,  8
-PUT_PRED 1, 8,  8
-PUT_PRED 0, 12, 8
-PUT_PRED 1, 12, 8
-PUT_PRED 0, 16, 8
-PUT_PRED 1, 16, 8
-PUT_PRED 0, 24, 8
-PUT_PRED 1, 24, 8
-PUT_PRED 0, 32, 8
-PUT_PRED 1, 32, 8
-PUT_PRED 0, 48, 8
-PUT_PRED 1, 48, 8
-PUT_PRED 0, 64, 8
-PUT_PRED 1, 64, 8
-
-PUT_PRED 0, 4,  10
-PUT_PRED 1, 4,  10
-PUT_PRED 0, 8,  10
-PUT_PRED 1, 8,  10
-PUT_PRED 0, 12, 10
-PUT_PRED 1, 12, 10
-PUT_PRED 0, 16, 10
-PUT_PRED 1, 16, 10
-PUT_PRED 0, 24, 10
-PUT_PRED 1, 24, 10
-PUT_PRED 0, 32, 10
-PUT_PRED 1, 32, 10
-PUT_PRED 0, 48, 10
-PUT_PRED 1, 48, 10
-PUT_PRED 0, 64, 10
-PUT_PRED 1, 64, 10
-
-%macro PUT_WEIGHTED_PRED 3
-%if %1
-cglobal hevc_put_weighted_pred_avg_ %+ %2 %+ _ %+ %3, 11, 11, 8, denom, weight0, weight1, offset0, offset1, dst, dststride, src0, src1, srcstride, height
-%else
-cglobal hevc_put_weighted_pred_ %+ %2 %+ _ %+ %3, 8, 8, 8, denom, weight0, offset0, dst, dststride, src0, srcstride, height
-%endif
-
-    and         denomd, 0xff
-    movsx       weight0d, weight0w
-    movsx       offset0d, offset0w
-%if %1
-    movsx       weight1d, weight1w
-    movsx       offset1d, offset1w
-%endif
-
-    add         denomd, 14 + %1 - %3
-    movd        m0, denomd
+; ******************************
+; void put_hevc_qpel_hX_X_X(int16_t *dst, ptrdiff_t dststride,
+;                       uint8_t *_src, ptrdiff_t _srcstride,
+;                       int height, int mx, int my, int width)
+; ******************************
 
-%if %3 > 8
-    %assign     pixel_max ((1 << %3) - 1)
-    %define     pw_pixel_max pw_ %+ pixel_max
-    pxor        m4, m4
-    mova        m5, [pw_pixel_max]
+%macro HEVC_PUT_HEVC_QPEL 2
+cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rfilter
+    QPEL_FILTER       %2, mx
+.loop:
+    QPEL_H_LOAD       %2, srcq, %1, 10
+    QPEL_COMPUTE      %1, %2, 1
+%if %2 > 8
+    packssdw          m0, m1
+%endif
+    PEL_10STORE%1     dstq, m0, m1
+    LOOP_END          dst, src, srcstride
+    RET
 
-    shl         offset0d, %3 - 8
-%if %1
-    shl         offset1d, %3 - 8
+cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 16 , dst, dststride, src, srcstride, height, mx, rfilter
+    mova              m9, [pw_%2]
+    QPEL_FILTER       %2, mx
+.loop:
+    QPEL_H_LOAD       %2, srcq, %1, 10
+    QPEL_COMPUTE      %1, %2
+%if %2 > 8
+    packssdw          m0, m1
 %endif
+    UNI_COMPUTE       %1, %2, m0, m1, m9
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+
+cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, src2, height, mx, rfilter
+    movdqa            m9, [pw_bi_%2]
+    QPEL_FILTER       %2, mx
+.loop:
+    QPEL_H_LOAD       %2, srcq, %1, 10
+    QPEL_COMPUTE      %1, %2, 1
+%if %2 > 8
+    packssdw          m0, m1
 %endif
+    SIMPLE_BILOAD     %1, src2q, m10, m11
+    BI_COMPUTE        %1, %2, m0, m1, m10, m11, m9, 1
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
 
-%if %1
-    lea         offset0d, [offset0d + offset1d + 1]
-%else
-    lea         offset0d, [2 * offset0d + 1]
+
+; ******************************
+; void put_hevc_qpel_vX_X_X(int16_t *dst, ptrdiff_t dststride,
+;                       uint8_t *_src, ptrdiff_t _srcstride,
+;                       int height, int mx, int my, int width)
+; ******************************
+
+cglobal hevc_put_hevc_qpel_v%1_%2, 4, 8, 16, dst, src, srcstride, height, r3src, my, rfilter
+    movifnidn        myd, mym
+    lea           r3srcq, [srcstrideq*3]
+    QPEL_FILTER       %2, my
+.loop:
+    QPEL_V_LOAD       %2, srcq, srcstride, %1, r7
+    QPEL_COMPUTE      %1, %2, 1
+%if %2 > 8
+    packssdw          m0, m1
 %endif
-    movd        m1, offset0d
-    SPLATD      m1
-    pslld       m1, m0
-    psrad       m1, 1
+    PEL_10STORE%1     dstq, m0, m1
+    LOOP_END         dst, src, srcstride
+    RET
 
-    movd        m2, weight0d
-    SPLATD      m2
-%if %1
-    movd        m3, weight1d
-    SPLATD      m3
+cglobal hevc_put_hevc_uni_qpel_v%1_%2, 5, 9, 16, dst, dststride, src, srcstride, height, r3src, my, rfilter
+    movifnidn        myd, mym
+    movdqa            m9, [pw_%2]
+    lea           r3srcq, [srcstrideq*3]
+    QPEL_FILTER       %2, my
+.loop:
+    QPEL_V_LOAD       %2, srcq, srcstride, %1, r8
+    QPEL_COMPUTE      %1, %2
+%if %2 > 8
+    packssdw          m0, m1
 %endif
+    UNI_COMPUTE       %1, %2, m0, m1, m9
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
 
-.loop
-%assign i 0
-%rep (%2 + 3) / 4
+cglobal hevc_put_hevc_bi_qpel_v%1_%2, 6, 10, 16, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
+    movifnidn        myd, mym
+    movdqa            m9, [pw_bi_%2]
+    lea           r3srcq, [srcstrideq*3]
+    QPEL_FILTER       %2, my
+.loop:
+    QPEL_V_LOAD       %2, srcq, srcstride, %1, r9
+    QPEL_COMPUTE      %1, %2, 1
+%if %2 > 8
+    packssdw          m0, m1
+%endif
+    SIMPLE_BILOAD     %1, src2q, m10, m11
+    BI_COMPUTE        %1, %2, m0, m1, m10, m11, m9, 1
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+%endmacro
 
-    pmovsxwd   m6, [src0q + 8 * i]
-    pmulld     m6, m2
 
-%if %1
-    pmovsxwd   m7, [src1q + 8 * i]
-    pmulld     m7, m3
-    paddd      m6, m7
+; ******************************
+; void put_hevc_qpel_hvX_X(int16_t *dst, ptrdiff_t dststride,
+;                       uint8_t *_src, ptrdiff_t _srcstride,
+;                       int height, int mx, int my)
+; ******************************
+%macro HEVC_PUT_HEVC_QPEL_HV 2
+cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, my, r3src, rfilter
+%if cpuflag(avx2)
+%assign %%shift  4
+%else
+%assign %%shift  3
 %endif
+    sub              mxq, 1
+    sub              myq, 1
+    shl              mxq, %%shift                ; (mx - 1) << %%shift: x8, or x16 with avx2
+    shl              myq, %%shift                ; (my - 1) << %%shift: x8, or x16 with avx2
+    lea           r3srcq, [srcstrideq*3]
+    sub             srcq, r3srcq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP              m8, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP              m9, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m10, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m11, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m12, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m13, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m14, m0
+    add             srcq, srcstrideq
+.loop:
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m15, m0
+    punpcklwd         m0, m8, m9
+    punpcklwd         m2, m10, m11
+    punpcklwd         m4, m12, m13
+    punpcklwd         m6, m14, m15
+%if %1 > 4
+    punpckhwd         m1, m8, m9
+    punpckhwd         m3, m10, m11
+    punpckhwd         m5, m12, m13
+    punpckhwd         m7, m14, m15
+%endif
+    QPEL_HV_COMPUTE   %1, 14, my, ackssdw
+    PEL_10STORE%1     dstq, m0, m1
+%if %1 <= 4
+    movq              m8, m9
+    movq              m9, m10
+    movq             m10, m11
+    movq             m11, m12
+    movq             m12, m13
+    movq             m13, m14
+    movq             m14, m15
+%else
+    movdqa            m8, m9
+    movdqa            m9, m10
+    movdqa           m10, m11
+    movdqa           m11, m12
+    movdqa           m12, m13
+    movdqa           m13, m14
+    movdqa           m14, m15
+%endif
+    LOOP_END         dst, src, srcstride
+    RET
 
-    paddd      m6, m1
-    psrad      m6, m0
-
-    packssdw   m6, m6
+cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
+%if cpuflag(avx2)
+%assign %%shift  4
+%else
+%assign %%shift  3
+%endif
+    sub              mxq, 1
+    sub              myq, 1
+    shl              mxq, %%shift                ; (mx - 1) << %%shift: x8, or x16 with avx2
+    shl              myq, %%shift                ; (my - 1) << %%shift: x8, or x16 with avx2
+    lea           r3srcq, [srcstrideq*3]
+    sub             srcq, r3srcq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP              m8, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP              m9, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m10, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m11, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m12, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m13, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m14, m0
+    add             srcq, srcstrideq
+.loop:
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m15, m0
+    punpcklwd         m0, m8, m9
+    punpcklwd         m2, m10, m11
+    punpcklwd         m4, m12, m13
+    punpcklwd         m6, m14, m15
+%if %1 > 4
+    punpckhwd         m1, m8, m9
+    punpckhwd         m3, m10, m11
+    punpckhwd         m5, m12, m13
+    punpckhwd         m7, m14, m15
+%endif
+    QPEL_HV_COMPUTE   %1, 14, my, ackusdw
+    UNI_COMPUTE       %1, %2, m0, m1, [pw_%2]
+    PEL_%2STORE%1   dstq, m0, m1
+
+%if %1 <= 4
+    movq              m8, m9
+    movq              m9, m10
+    movq             m10, m11
+    movq             m11, m12
+    movq             m12, m13
+    movq             m13, m14
+    movq             m14, m15
+%else
+    mova            m8, m9
+    mova            m9, m10
+    mova           m10, m11
+    mova           m11, m12
+    mova           m12, m13
+    mova           m13, m14
+    mova           m14, m15
+%endif
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
 
-%if %3 > 8
-    CLIPW      m6, m4, m5
-    movq       [dstq + 8 * i], m6
+cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
+%if cpuflag(avx2)
+%assign %%shift  4
 %else
-    packuswb   m6, m6
-    movd [dstq + 4 * i], m6
+%assign %%shift  3
 %endif
+    sub              mxq, 1
+    sub              myq, 1
+    shl              mxq, %%shift                ; (mx - 1) << %%shift: x8, or x16 with avx2
+    shl              myq, %%shift                ; (my - 1) << %%shift: x8, or x16 with avx2
+    lea           r3srcq, [srcstrideq*3]
+    sub             srcq, r3srcq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP              m8, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP              m9, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m10, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m11, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m12, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m13, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m14, m0
+    add             srcq, srcstrideq
+.loop:
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m15, m0
+    punpcklwd         m0, m8, m9
+    punpcklwd         m2, m10, m11
+    punpcklwd         m4, m12, m13
+    punpcklwd         m6, m14, m15
+%if %1 > 4
+    punpckhwd         m1, m8, m9
+    punpckhwd         m3, m10, m11
+    punpckhwd         m5, m12, m13
+    punpckhwd         m7, m14, m15
+%endif
+    QPEL_HV_COMPUTE   %1, 14, my, ackssdw
+    SIMPLE_BILOAD     %1, src2q, m8, m9 ;m9 not used in this case
+    BI_COMPUTE        %1, %2, m0, m1, m8, m9, [pw_bi_%2]
+    PEL_%2STORE%1   dstq, m0, m1
+
+%if %1 <= 4
+    movq              m8, m9
+    movq              m9, m10
+    movq             m10, m11
+    movq             m11, m12
+    movq             m12, m13
+    movq             m13, m14
+    movq             m14, m15
+%else
+    movdqa            m8, m9
+    movdqa            m9, m10
+    movdqa           m10, m11
+    movdqa           m11, m12
+    movdqa           m12, m13
+    movdqa           m13, m14
+    movdqa           m14, m15
+%endif
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+%endmacro
 
-%assign i (i + 1)
-%endrep
+%macro WEIGHTING_FUNCS 2
+%if WIN64 || ARCH_X86_32
+cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, height, denom, wx, ox
+    mov             r4d, denomm
+%define SHIFT  r4d
+%else
+cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, height, denom, wx, ox
+%define SHIFT  denomd
+%endif
+    lea           SHIFT, [SHIFT+14-%2]          ; shift = 14 - bitd + denom
+%if %1 <= 4
+    pxor             m1, m1
+%endif
+    movd             m2, wxm        ; WX
+    movd             m4, SHIFT      ; shift
+%if %1 <= 4
+    punpcklwd        m2, m1
+%else
+    punpcklwd        m2, m2
+%endif
+    dec           SHIFT
+    movdqu           m5, [pd_1]
+    movd             m6, SHIFT
+    pshufd           m2, m2, 0
+    mov           SHIFT, oxm
+    pslld            m5, m6
+%if %2 != 8
+    shl           SHIFT, %2-8       ; ox << (bitd - 8)
+%endif
+    movd             m3, SHIFT      ; OX
+    pshufd           m3, m3, 0
+%if WIN64 || ARCH_X86_32
+    mov           SHIFT, heightm
+%endif
+.loop:
+   SIMPLE_LOAD        %1, 10, srcq, m0
+%if %1 <= 4
+    punpcklwd         m0, m1
+    pmaddwd           m0, m2
+    paddd             m0, m5
+    psrad             m0, m4
+    paddd             m0, m3
+%else
+    pmulhw            m6, m0, m2
+    pmullw            m0, m2
+    punpckhwd         m1, m0, m6
+    punpcklwd         m0, m6
+    paddd             m0, m5
+    paddd             m1, m5
+    psrad             m0, m4
+    psrad             m1, m4
+    paddd             m0, m3
+    paddd             m1, m3
+%endif
+    packssdw          m0, m1
+%if %2 == 8
+    packuswb          m0, m0
+%else
+    CLIPW             m0, [pb_0], [max_pixels_%2]
+%endif
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, 2*MAX_PB_SIZE          ; src += 2*MAX_PB_SIZE
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
 
-    add dstq,  dststrideq
-    add src0q, srcstrideq
-%if %1
-    add src1q, srcstrideq
+cglobal hevc_put_hevc_bi_w%1_%2, 4, 6, 10, dst, dststride, src, src2, height, denom, wx0, wx1, ox0, ox1
+    movifnidn        r5d, denomm
+%if %1 <= 4
+    pxor              m1, m1
+%endif
+    movd              m2, wx0m         ; WX0
+    lea              r5d, [r5d+14-%2]  ; shift = 14 - bitd + denom
+    movd              m3, wx1m         ; WX1
+    movd              m0, r5d          ; shift
+%if %1 <= 4
+    punpcklwd         m2, m1
+    punpcklwd         m3, m1
+%else
+    punpcklwd         m2, m2
+    punpcklwd         m3, m3
+%endif
+    inc              r5d
+    movd              m5, r5d          ; shift+1
+    pshufd            m2, m2, 0
+    mov              r5d, ox0m
+    pshufd            m3, m3, 0
+    add              r5d, ox1m
+%if %2 != 8
+    shl              r5d, %2-8         ; ox << (bitd - 8)
+%endif
+    inc              r5d
+    movd              m4, r5d          ; offset
+    pshufd            m4, m4, 0
+%if UNIX64
+%define h heightd
+%else
+    mov              r5d, heightm
+%define h r5d
 %endif
+    pslld             m4, m0
 
-    dec         heightd
-    jg          .loop
+.loop:
+   SIMPLE_LOAD        %1, 10, srcq,  m0
+   SIMPLE_LOAD        %1, 10, src2q, m8
+%if %1 <= 4
+    punpcklwd         m0, m1
+    punpcklwd         m8, m1
+    pmaddwd           m0, m3
+    pmaddwd           m8, m2
+    paddd             m0, m4
+    paddd             m0, m8
+    psrad             m0, m5
+%else
+    pmulhw            m6, m0, m3
+    pmullw            m0, m3
+    pmulhw            m7, m8, m2
+    pmullw            m8, m2
+    punpckhwd         m1, m0, m6
+    punpcklwd         m0, m6
+    punpckhwd         m9, m8, m7
+    punpcklwd         m8, m7
+    paddd             m0, m8
+    paddd             m1, m9
+    paddd             m0, m4
+    paddd             m1, m4
+    psrad             m0, m5
+    psrad             m1, m5
+%endif
+    packssdw          m0, m1
+%if %2 == 8
+    packuswb          m0, m0
+%else
+     CLIPW            m0, [pb_0], [max_pixels_%2]
+%endif
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, 2*MAX_PB_SIZE          ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src2 += srcstride
+    dec                h                         ; cmp height
+    jnz               .loop                      ; height loop
     RET
 %endmacro
 
-%if ARCH_X86_64
-INIT_XMM sse4
-PUT_WEIGHTED_PRED 0, 4,  8
-PUT_WEIGHTED_PRED 1, 4,  8
-PUT_WEIGHTED_PRED 0, 8,  8
-PUT_WEIGHTED_PRED 1, 8,  8
-PUT_WEIGHTED_PRED 0, 12, 8
-PUT_WEIGHTED_PRED 1, 12, 8
-PUT_WEIGHTED_PRED 0, 16, 8
-PUT_WEIGHTED_PRED 1, 16, 8
-PUT_WEIGHTED_PRED 0, 24, 8
-PUT_WEIGHTED_PRED 1, 24, 8
-PUT_WEIGHTED_PRED 0, 32, 8
-PUT_WEIGHTED_PRED 1, 32, 8
-PUT_WEIGHTED_PRED 0, 48, 8
-PUT_WEIGHTED_PRED 1, 48, 8
-PUT_WEIGHTED_PRED 0, 64, 8
-PUT_WEIGHTED_PRED 1, 64, 8
-
-PUT_WEIGHTED_PRED 0, 4,  10
-PUT_WEIGHTED_PRED 1, 4,  10
-PUT_WEIGHTED_PRED 0, 8,  10
-PUT_WEIGHTED_PRED 1, 8,  10
-PUT_WEIGHTED_PRED 0, 12, 10
-PUT_WEIGHTED_PRED 1, 12, 10
-PUT_WEIGHTED_PRED 0, 16, 10
-PUT_WEIGHTED_PRED 1, 16, 10
-PUT_WEIGHTED_PRED 0, 24, 10
-PUT_WEIGHTED_PRED 1, 24, 10
-PUT_WEIGHTED_PRED 0, 32, 10
-PUT_WEIGHTED_PRED 1, 32, 10
-PUT_WEIGHTED_PRED 0, 48, 10
-PUT_WEIGHTED_PRED 1, 48, 10
-PUT_WEIGHTED_PRED 0, 64, 10
-PUT_WEIGHTED_PRED 1, 64, 10
+INIT_XMM sse4                                    ; adds ff_ and _sse4 to function name
+
+WEIGHTING_FUNCS 2, 8
+WEIGHTING_FUNCS 4, 8
+WEIGHTING_FUNCS 6, 8
+WEIGHTING_FUNCS 8, 8
+
+WEIGHTING_FUNCS 2, 10
+WEIGHTING_FUNCS 4, 10
+WEIGHTING_FUNCS 6, 10
+WEIGHTING_FUNCS 8, 10
+
+WEIGHTING_FUNCS 2, 12
+WEIGHTING_FUNCS 4, 12
+WEIGHTING_FUNCS 6, 12
+WEIGHTING_FUNCS 8, 12
+
+HEVC_PUT_HEVC_PEL_PIXELS  2, 8
+HEVC_PUT_HEVC_PEL_PIXELS  4, 8
+HEVC_PUT_HEVC_PEL_PIXELS  6, 8
+HEVC_PUT_HEVC_PEL_PIXELS  8, 8
+HEVC_PUT_HEVC_PEL_PIXELS 12, 8
+HEVC_PUT_HEVC_PEL_PIXELS 16, 8
+
+HEVC_PUT_HEVC_PEL_PIXELS 2, 10
+HEVC_PUT_HEVC_PEL_PIXELS 4, 10
+HEVC_PUT_HEVC_PEL_PIXELS 6, 10
+HEVC_PUT_HEVC_PEL_PIXELS 8, 10
+
+HEVC_PUT_HEVC_PEL_PIXELS 2, 12
+HEVC_PUT_HEVC_PEL_PIXELS 4, 12
+HEVC_PUT_HEVC_PEL_PIXELS 6, 12
+HEVC_PUT_HEVC_PEL_PIXELS 8, 12
+
+HEVC_PUT_HEVC_EPEL 2,  8
+HEVC_PUT_HEVC_EPEL 4,  8
+HEVC_PUT_HEVC_EPEL 6,  8
+HEVC_PUT_HEVC_EPEL 8,  8
+HEVC_PUT_HEVC_EPEL 12, 8
+HEVC_PUT_HEVC_EPEL 16, 8
+
+
+HEVC_PUT_HEVC_EPEL 2, 10
+HEVC_PUT_HEVC_EPEL 4, 10
+HEVC_PUT_HEVC_EPEL 6, 10
+HEVC_PUT_HEVC_EPEL 8, 10
+
+HEVC_PUT_HEVC_EPEL 2, 12
+HEVC_PUT_HEVC_EPEL 4, 12
+HEVC_PUT_HEVC_EPEL 6, 12
+HEVC_PUT_HEVC_EPEL 8, 12
+
+HEVC_PUT_HEVC_EPEL_HV 2,  8
+HEVC_PUT_HEVC_EPEL_HV 4,  8
+HEVC_PUT_HEVC_EPEL_HV 6,  8
+HEVC_PUT_HEVC_EPEL_HV 8,  8
+HEVC_PUT_HEVC_EPEL_HV 16, 8
+
+HEVC_PUT_HEVC_EPEL_HV 2, 10
+HEVC_PUT_HEVC_EPEL_HV 4, 10
+HEVC_PUT_HEVC_EPEL_HV 6, 10
+HEVC_PUT_HEVC_EPEL_HV 8, 10
+
+HEVC_PUT_HEVC_EPEL_HV 2, 12
+HEVC_PUT_HEVC_EPEL_HV 4, 12
+HEVC_PUT_HEVC_EPEL_HV 6, 12
+HEVC_PUT_HEVC_EPEL_HV 8, 12
+
+HEVC_PUT_HEVC_QPEL 4,  8
+HEVC_PUT_HEVC_QPEL 8,  8
+HEVC_PUT_HEVC_QPEL 12, 8
+HEVC_PUT_HEVC_QPEL 16, 8
+
+HEVC_PUT_HEVC_QPEL 4, 10
+HEVC_PUT_HEVC_QPEL 8, 10
+
+HEVC_PUT_HEVC_QPEL 4, 12
+HEVC_PUT_HEVC_QPEL 8, 12
+
+HEVC_PUT_HEVC_QPEL_HV 2, 8
+HEVC_PUT_HEVC_QPEL_HV 4, 8
+HEVC_PUT_HEVC_QPEL_HV 6, 8
+HEVC_PUT_HEVC_QPEL_HV 8, 8
+
+HEVC_PUT_HEVC_QPEL_HV 2, 10
+HEVC_PUT_HEVC_QPEL_HV 4, 10
+HEVC_PUT_HEVC_QPEL_HV 6, 10
+HEVC_PUT_HEVC_QPEL_HV 8, 10
+
+HEVC_PUT_HEVC_QPEL_HV 2, 12
+HEVC_PUT_HEVC_QPEL_HV 4, 12
+HEVC_PUT_HEVC_QPEL_HV 6, 12
+HEVC_PUT_HEVC_QPEL_HV 8, 12
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2  ; adds ff_ and _avx2 to function name & enables 256b registers : m0 for 256b, xm0 for 128b. cpuflag(avx2) = 1 / notcpuflag(avx) = 0
+
+HEVC_PUT_HEVC_PEL_PIXELS 32, 8
+HEVC_PUT_HEVC_PEL_PIXELS 16, 10
+
+HEVC_PUT_HEVC_EPEL 32, 8
+HEVC_PUT_HEVC_EPEL 16, 10
+
+HEVC_PUT_HEVC_EPEL_HV 16, 10
+HEVC_PUT_HEVC_EPEL_HV 32, 8
+
+HEVC_PUT_HEVC_QPEL 32, 8
+
+HEVC_PUT_HEVC_QPEL 16, 10
+
+HEVC_PUT_HEVC_QPEL_HV 16, 10
 
+%endif ;AVX2
 %endif ; ARCH_X86_64
diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm
new file mode 100644
index 0000000..dc3e88a
--- /dev/null
+++ b/libavcodec/x86/hevc_res_add.asm
@@ -0,0 +1,388 @@
+; /*
+; * Provide SIMD optimizations for transform_add functions for HEVC decoding
+; * Copyright (c) 2014 Pierre-Edouard LEPERE
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+cextern pw_1023
+%define max_pixels_10 pw_1023
+
+
+;the tr_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file
+%macro TR_ADD_MMX_4_8 0 ; add two rows of 4 residuals ([r1], [r1+8]) to two rows of 8-bit pixels ([r0], [r0+r2])
+    mova              m2, [r1]                   ; residual row 0 (four 16-bit words)
+    mova              m4, [r1+8]                 ; residual row 1
+    pxor              m3, m3
+    psubw             m3, m2                     ; m3 = -row0
+    packuswb          m2, m2                     ; row0 saturated to unsigned bytes (negative words become 0)
+    packuswb          m3, m3                     ; -row0 saturated to unsigned bytes (magnitude of negative residuals)
+    pxor              m5, m5
+    psubw             m5, m4                     ; m5 = -row1
+    packuswb          m4, m4                     ; row1 saturated to unsigned bytes
+    packuswb          m5, m5                     ; -row1 saturated to unsigned bytes
+
+    movh              m0, [r0     ]              ; dst row 0
+    movh              m1, [r0+r2  ]              ; dst row 1
+    paddusb           m0, m2                     ; saturating add of the positive residual part
+    paddusb           m1, m4
+    psubusb           m0, m3                     ; saturating subtract of the negative residual part
+    psubusb           m1, m5
+    movh       [r0     ], m0
+    movh       [r0+r2  ], m1
+%endmacro
+
+
+INIT_MMX mmxext
+; void ff_hevc_transform_add4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_transform_add4_8, 3, 4, 6
+    TR_ADD_MMX_4_8                               ; rows 0-1
+    add               r1, 16                     ; coeffs += 2 rows * 4 words * 2 bytes
+    lea               r0, [r0+r2*2]              ; dst += 2 * stride
+    TR_ADD_MMX_4_8                               ; rows 2-3
+    RET
+
+%macro TR_ADD_SSE_8_8 0 ; add 4 rows of 8 residuals at [r1] to 4 rows of 8-bit pixels at r0 (r2 = stride; callers set r3 = 3*stride)
+    pxor              m3, m3
+    mova              m4, [r1]                   ; residual row 0
+    mova              m6, [r1+16]                ; residual row 1
+    mova              m0, [r1+32]                ; residual row 2
+    mova              m2, [r1+48]                ; residual row 3
+    psubw             m5, m3, m4                 ; m5 = -row0
+    psubw             m7, m3, m6                 ; m7 = -row1
+    psubw             m1, m3, m0                 ; m1 = -row2
+    packuswb          m4, m0                     ; m4 = row0 | row2, saturated to unsigned bytes
+    packuswb          m5, m1                     ; m5 = -row0 | -row2, saturated to unsigned bytes
+    psubw             m3, m2                     ; m3 = -row3
+    packuswb          m6, m2                     ; m6 = row1 | row3, saturated to unsigned bytes
+    packuswb          m7, m3                     ; m7 = -row1 | -row3, saturated to unsigned bytes
+
+    movq                m0, [r0     ]            ; dst row 0 -> low half of m0
+    movq                m1, [r0+r2  ]            ; dst row 1 -> low half of m1
+    movhps              m0, [r0+r2*2]            ; dst row 2 -> high half of m0
+    movhps              m1, [r0+r3  ]            ; dst row 3 -> high half of m1
+    paddusb             m0, m4                   ; saturating add of the positive residual parts
+    paddusb             m1, m6
+    psubusb             m0, m5                   ; saturating subtract of the negative residual parts
+    psubusb             m1, m7
+    movq         [r0     ], m0
+    movq         [r0+r2  ], m1
+    movhps       [r0+2*r2], m0
+    movhps       [r0+r3  ], m1
+%endmacro
+
+%macro TR_ADD_SSE_16_32_8 3 ; %1 = byte offset into coeffs (r1), %2 / %3 = the two dst addresses updated; callers zero m0 first
+    mova             xm2, [r1+%1   ]             ; first 8 residuals for %2
+    mova             xm6, [r1+%1+16]             ; next 8 residuals for %2
+%if cpuflag(avx2)
+    vinserti128       m2, m2, [r1+%1+32], 1      ; upper lanes: the following 16 residuals
+    vinserti128       m6, m6, [r1+%1+48], 1
+%endif
+%if cpuflag(avx)
+    psubw             m1, m0, m2                 ; m1 = 0 - residuals (three-operand form)
+    psubw             m5, m0, m6
+%else
+    mova              m1, m0                     ; same negation using two-operand instructions
+    mova              m5, m0
+    psubw             m1, m2
+    psubw             m5, m6
+%endif
+    packuswb          m2, m6                     ; residuals saturated to unsigned bytes
+    packuswb          m1, m5                     ; negated residuals saturated to unsigned bytes
+
+    mova             xm4, [r1+%1+mmsize*2   ]    ; residuals for %3 start mmsize*2 bytes further on
+    mova             xm6, [r1+%1+mmsize*2+16]
+%if cpuflag(avx2)
+    vinserti128       m4, m4, [r1+%1+96 ], 1     ; with mmsize = 32: offsets 64+32 and 64+48
+    vinserti128       m6, m6, [r1+%1+112], 1
+%endif
+%if cpuflag(avx)
+    psubw             m3, m0, m4
+    psubw             m5, m0, m6
+%else
+    mova              m3, m0
+    mova              m5, m0
+    psubw             m3, m4
+    psubw             m5, m6
+%endif
+    packuswb          m4, m6
+    packuswb          m3, m5
+
+    paddusb           m2, [%2]                   ; dst + positive part (saturating)
+    paddusb           m4, [%3]
+    psubusb           m2, m1                     ; ... - negative part (saturating)
+    psubusb           m4, m3
+    mova            [%2], m2
+    mova            [%3], m4
+%endmacro
+
+
+%macro TRANSFORM_ADD_8 0 ; emits the 8x8, 16x16 and 32x32 8-bit residual-add functions for the instruction set selected by the preceding INIT_*
+; void ff_hevc_transform_add8_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_transform_add8_8, 3, 4, 8
+    lea               r3, [r2*3]                 ; r3 = 3 * stride
+    TR_ADD_SSE_8_8                               ; rows 0-3
+    add               r1, 64                     ; coeffs += 4 rows * 16 bytes
+    lea               r0, [r0+r2*4]              ; dst += 4 * stride
+    TR_ADD_SSE_8_8                               ; rows 4-7
+    RET
+
+; void ff_hevc_transform_add16_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_transform_add16_8, 3, 4, 7
+    pxor              m0, m0                     ; zero register used by TR_ADD_SSE_16_32_8 for negation
+    lea               r3, [r2*3]                 ; r3 = 3 * stride
+    TR_ADD_SSE_16_32_8  0, r0,      r0+r2        ; rows 0-1
+    TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3        ; rows 2-3
+%rep 3
+    add                r1, 128                   ; coeffs += 4 rows * 32 bytes
+    lea                r0, [r0+r2*4]             ; dst += 4 * stride
+    TR_ADD_SSE_16_32_8  0, r0,      r0+r2
+    TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
+%endrep
+    RET
+
+; void ff_hevc_transform_add32_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_transform_add32_8, 3, 4, 7
+    pxor               m0, m0                    ; zero register used by TR_ADD_SSE_16_32_8 for negation
+    TR_ADD_SSE_16_32_8  0, r0,    r0+16          ; row 0: left and right 16-pixel halves
+    TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16       ; row 1
+%rep 15
+    add                r1, 128                   ; coeffs += 2 rows * 64 bytes
+    lea                r0, [r0+r2*2]             ; dst += 2 * stride
+    TR_ADD_SSE_16_32_8  0, r0,    r0+16
+    TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
+%endrep
+    RET
+%endmacro
+
+INIT_XMM sse2
+TRANSFORM_ADD_8                                  ; sse2 versions of add8_8 / add16_8 / add32_8
+INIT_XMM avx
+TRANSFORM_ADD_8                                  ; avx versions (takes the cpuflag(avx) paths of TR_ADD_SSE_16_32_8)
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+; void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_transform_add32_8, 3, 4, 7
+    pxor              m0, m0                     ; zero register used by TR_ADD_SSE_16_32_8 for negation
+    lea               r3, [r2*3]                 ; r3 = 3 * stride
+    TR_ADD_SSE_16_32_8   0, r0,      r0+r2       ; rows 0-1 (one full 32-pixel row per ymm store)
+    TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3       ; rows 2-3
+%rep 7
+    add                r1, 256                   ; coeffs += 4 rows * 64 bytes
+    lea                r0, [r0+r2*4]             ; dst += 4 * stride
+    TR_ADD_SSE_16_32_8   0, r0,      r0+r2
+    TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
+%endrep
+    RET
+%endif
+
+;-----------------------------------------------------------------------------
+; void ff_hevc_transform_add<N>_10_<opt>(pixel *dst, int16_t *block, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+%macro TR_ADD_SSE_8_10 4 ; %1 = dst, %2 = stride, %3 = 3*stride, %4 = coeffs; m4 / m5 = clip bounds passed to CLIPW (callers load 0 and [max_pixels_10])
+    mova              m0, [%4]                   ; residual rows 0-3, 16 bytes each
+    mova              m1, [%4+16]
+    mova              m2, [%4+32]
+    mova              m3, [%4+48]
+    paddw             m0, [%1+0   ]              ; add the 16-bit dst pixels of each row
+    paddw             m1, [%1+%2  ]
+    paddw             m2, [%1+%2*2]
+    paddw             m3, [%1+%3  ]
+    CLIPW             m0, m4, m5                 ; clip each word using m4 / m5
+    CLIPW             m1, m4, m5
+    CLIPW             m2, m4, m5
+    CLIPW             m3, m4, m5
+    mova       [%1+0   ], m0
+    mova       [%1+%2  ], m1
+    mova       [%1+%2*2], m2
+    mova       [%1+%3  ], m3
+%endmacro
+
+%macro TR_ADD_MMX4_10 3 ; %1 = dst, %2 = stride, %3 = coeffs; processes two rows; m2 / m3 = clip bounds passed to CLIPW
+    mova              m0, [%1+0   ]              ; dst row 0
+    mova              m1, [%1+%2  ]              ; dst row 1
+    paddw             m0, [%3]                   ; + residual row 0
+    paddw             m1, [%3+8]                 ; + residual row 1 (8 bytes further on)
+    CLIPW             m0, m2, m3                 ; clip each word using m2 / m3
+    CLIPW             m1, m2, m3
+    mova       [%1+0   ], m0
+    mova       [%1+%2  ], m1
+%endmacro
+
+%macro TRANS_ADD_SSE_16_10 3 ; %1 = dst, %2 = stride, %3 = coeffs; processes two rows of 32 bytes each; m4 / m5 = clip bounds
+    mova              m0, [%3]                   ; row 0, first 16 bytes of residuals
+    mova              m1, [%3+16]                ; row 0, second 16 bytes
+    mova              m2, [%3+32]                ; row 1, first 16 bytes
+    mova              m3, [%3+48]                ; row 1, second 16 bytes
+    paddw             m0, [%1      ]             ; add the matching dst words
+    paddw             m1, [%1+16   ]
+    paddw             m2, [%1+%2   ]
+    paddw             m3, [%1+%2+16]
+    CLIPW             m0, m4, m5                 ; clip each word using m4 / m5
+    CLIPW             m1, m4, m5
+    CLIPW             m2, m4, m5
+    CLIPW             m3, m4, m5
+    mova      [%1      ], m0
+    mova      [%1+16   ], m1
+    mova      [%1+%2   ], m2
+    mova      [%1+%2+16], m3
+%endmacro
+
+%macro TRANS_ADD_SSE_32_10 2 ; %1 = dst, %2 = coeffs; processes one row of 64 bytes; m4 / m5 = clip bounds
+    mova              m0, [%2]                   ; residuals, bytes 0-15 of the row
+    mova              m1, [%2+16]                ; bytes 16-31
+    mova              m2, [%2+32]                ; bytes 32-47
+    mova              m3, [%2+48]                ; bytes 48-63
+
+    paddw             m0, [%1   ]                ; add the matching dst words
+    paddw             m1, [%1+16]
+    paddw             m2, [%1+32]
+    paddw             m3, [%1+48]
+    CLIPW             m0, m4, m5                 ; clip each word using m4 / m5
+    CLIPW             m1, m4, m5
+    CLIPW             m2, m4, m5
+    CLIPW             m3, m4, m5
+    mova         [%1   ], m0
+    mova         [%1+16], m1
+    mova         [%1+32], m2
+    mova         [%1+48], m3
+%endmacro
+
+%macro TRANS_ADD16_AVX2 4 ; %1 = dst, %2 = stride, %3 = 3*stride, %4 = coeffs; processes four rows of 32 bytes each; m4 / m5 = clip bounds
+    mova              m0, [%4]                   ; residual rows 0-3, one 32-byte register per row
+    mova              m1, [%4+32]
+    mova              m2, [%4+64]
+    mova              m3, [%4+96]
+
+    paddw             m0, [%1+0   ]              ; add the matching dst rows
+    paddw             m1, [%1+%2  ]
+    paddw             m2, [%1+%2*2]
+    paddw             m3, [%1+%3  ]
+
+    CLIPW             m0, m4, m5                 ; clip each word using m4 / m5
+    CLIPW             m1, m4, m5
+    CLIPW             m2, m4, m5
+    CLIPW             m3, m4, m5
+    mova       [%1+0   ], m0
+    mova       [%1+%2  ], m1
+    mova       [%1+%2*2], m2
+    mova       [%1+%3  ], m3
+%endmacro
+
+%macro TRANS_ADD32_AVX2 3 ; %1 = dst, %2 = stride, %3 = coeffs; processes two rows of 64 bytes each; m4 / m5 = clip bounds
+    mova              m0, [%3]                   ; row 0, first 32 bytes of residuals
+    mova              m1, [%3+32]                ; row 0, second 32 bytes
+    mova              m2, [%3+64]                ; row 1, first 32 bytes
+    mova              m3, [%3+96]                ; row 1, second 32 bytes
+
+    paddw             m0, [%1      ]             ; add the matching dst words
+    paddw             m1, [%1+32   ]
+    paddw             m2, [%1+%2   ]
+    paddw             m3, [%1+%2+32]
+
+    CLIPW             m0, m4, m5                 ; clip each word using m4 / m5
+    CLIPW             m1, m4, m5
+    CLIPW             m2, m4, m5
+    CLIPW             m3, m4, m5
+    mova      [%1      ], m0
+    mova      [%1+32   ], m1
+    mova      [%1+%2   ], m2
+    mova      [%1+%2+32], m3
+%endmacro
+
+
+INIT_MMX mmxext
+cglobal hevc_transform_add4_10,3,4, 6
+    pxor              m2, m2                     ; lower clip bound = 0
+    mova              m3, [max_pixels_10]        ; upper clip bound (max_pixels_10 is defined as pw_1023 in this file)
+    TR_ADD_MMX4_10     r0, r2, r1                ; rows 0-1
+    add               r1, 16                     ; coeffs += 2 rows * 8 bytes
+    lea               r0, [r0+2*r2]              ; dst += 2 * stride
+    TR_ADD_MMX4_10     r0, r2, r1                ; rows 2-3
+    RET
+
+;-----------------------------------------------------------------------------
+; void ff_hevc_transform_add<N>_10_<opt>(pixel *dst, int16_t *block, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal hevc_transform_add8_10,3,4,6
+    pxor              m4, m4                     ; lower clip bound = 0
+    mova              m5, [max_pixels_10]        ; upper clip bound
+    lea               r3, [r2*3]                 ; r3 = 3 * stride
+
+    TR_ADD_SSE_8_10      r0, r2, r3, r1          ; rows 0-3
+    lea               r0, [r0+r2*4]              ; dst += 4 * stride
+    add               r1, 64                     ; coeffs += 4 rows * 16 bytes
+    TR_ADD_SSE_8_10      r0, r2, r3, r1          ; rows 4-7
+    RET
+
+cglobal hevc_transform_add16_10,3,4,6
+    pxor              m4, m4                     ; lower clip bound = 0
+    mova              m5, [max_pixels_10]        ; upper clip bound
+
+    TRANS_ADD_SSE_16_10 r0, r2, r1               ; rows 0-1
+%rep 7
+    lea                 r0, [r0+r2*2]            ; dst += 2 * stride
+    add                 r1, 64                   ; coeffs += 2 rows * 32 bytes
+    TRANS_ADD_SSE_16_10 r0, r2, r1
+%endrep
+    RET
+
+cglobal hevc_transform_add32_10,3,4,6
+    pxor              m4, m4                     ; lower clip bound = 0
+    mova              m5, [max_pixels_10]        ; upper clip bound
+
+    TRANS_ADD_SSE_32_10 r0, r1                   ; row 0
+%rep 31
+    lea                 r0, [r0+r2]              ; dst += stride
+    add                 r1, 64                   ; coeffs += 1 row * 64 bytes
+    TRANS_ADD_SSE_32_10 r0, r1
+%endrep
+    RET
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+
+cglobal hevc_transform_add16_10,3,4,6
+    pxor              m4, m4                     ; lower clip bound = 0
+    mova              m5, [max_pixels_10]        ; upper clip bound
+    lea               r3, [r2*3]                 ; r3 = 3 * stride
+
+    TRANS_ADD16_AVX2  r0, r2, r3, r1             ; rows 0-3
+%rep 3
+    lea               r0, [r0+r2*4]              ; dst += 4 * stride
+    add               r1, 128                    ; coeffs += 4 rows * 32 bytes
+    TRANS_ADD16_AVX2  r0, r2, r3, r1
+%endrep
+    RET
+
+cglobal hevc_transform_add32_10,3,4,6
+    pxor              m4, m4                     ; lower clip bound = 0
+    mova              m5, [max_pixels_10]        ; upper clip bound
+
+    TRANS_ADD32_AVX2  r0, r2, r1                 ; rows 0-1
+%rep 15
+    lea               r0, [r0+r2*2]              ; dst += 2 * stride
+    add               r1, 128                    ; coeffs += 2 rows * 64 bytes
+    TRANS_ADD32_AVX2  r0, r2, r1
+%endrep
+    RET
+%endif ;HAVE_AVX2_EXTERNAL
diff --git a/libavcodec/x86/hevc_sao.asm b/libavcodec/x86/hevc_sao.asm
new file mode 100644
index 0000000..888a28a
--- /dev/null
+++ b/libavcodec/x86/hevc_sao.asm
@@ -0,0 +1,340 @@
+;******************************************************************************
+;* SIMD optimized SAO functions for HEVC 8bit decoding
+;*
+;* Copyright (c) 2013 Pierre-Edouard LEPERE
+;* Copyright (c) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pb_edge_shuffle: times 2 db 1, 2, 0, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 ; pshufb control applied to the packed SAO offsets by the edge filter
+pb_eo:                   db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1 ; 4 bytes per eo: a.x, a.y, b.x, b.y (bytes +1/+3 are scaled by the row stride in HEVC_SAO_EDGE_FILTER_INIT)
+cextern pb_1
+cextern pb_2
+
+SECTION .text
+
+;******************************************************************************
+;SAO Band Filter
+;******************************************************************************
+
+%macro HEVC_SAO_BAND_FILTER_INIT 0 ; broadcast 4 band indices (m0-m3) and 4 offsets (m4-m7), set up a zero register (m14), then load height
+    and            leftq, 31                     ; band 0 = left & 31
+    movd             xm0, leftd
+    add            leftq, 1
+    and            leftq, 31                     ; band 1 = (left + 1) & 31
+    movd             xm1, leftd
+    add            leftq, 1
+    and            leftq, 31                     ; band 2
+    movd             xm2, leftd
+    add            leftq, 1
+    and            leftq, 31                     ; band 3
+    movd             xm3, leftd
+
+    SPLATW            m0, xm0                    ; replicate each band index into every word lane
+    SPLATW            m1, xm1
+    SPLATW            m2, xm2
+    SPLATW            m3, xm3
+%if mmsize > 16
+    SPLATW            m4, [offsetq + 2]          ; replicate the 16-bit offsets at byte offsets 2, 4, 6, 8
+    SPLATW            m5, [offsetq + 4]
+    SPLATW            m6, [offsetq + 6]
+    SPLATW            m7, [offsetq + 8]
+%else
+    movq              m7, [offsetq + 2]          ; load the same four 16-bit offsets with one 8-byte load
+    SPLATW            m4, m7, 0
+    SPLATW            m5, m7, 1
+    SPLATW            m6, m7, 2
+    SPLATW            m7, m7, 3
+%endif
+
+%if ARCH_X86_64
+    pxor             m14, m14                    ; m14 = 0, used by the callers to unpack/pack bytes <-> words
+
+%else ; ARCH_X86_32
+    mova  [rsp+mmsize*0], m0                     ; spill band indices m0-m3 to the stack
+    mova  [rsp+mmsize*1], m1
+    mova  [rsp+mmsize*2], m2
+    mova  [rsp+mmsize*3], m3
+    mova  [rsp+mmsize*4], m4                     ; spill offsets m4-m6 (m7 stays in its register)
+    mova  [rsp+mmsize*5], m5
+    mova  [rsp+mmsize*6], m6
+    pxor              m0, m0                     ; m0 now serves as the zero register (aliased as m14 below)
+    %assign MMSIZE mmsize
+    %define m14 m0
+    %define m13 m1
+    %define  m9 m2
+    %define  m8 m3
+%endif ; ARCH
+DEFINE_ARGS dst, src, dststride, srcstride, offset, height
+    mov          heightd, r7m                    ; height takes over the 6th register slot, previously named 'left'
+%endmacro
+
+%macro HEVC_SAO_BAND_FILTER_COMPUTE 2 ; %1 = scratch register, %2 = pixels as words (updated in place with the matching band offset added)
+    psraw             %1, %2, 3                  ; band index of each pixel = pixel >> 3
+%if ARCH_X86_64
+    pcmpeqw          m10, %1, m0                 ; all-ones mask where the pixel falls in band 0
+    pcmpeqw          m11, %1, m1                 ; ... band 1
+    pcmpeqw          m12, %1, m2                 ; ... band 2
+    pcmpeqw           %1, m3                     ; ... band 3
+    pand             m10, m4                     ; keep that band's offset only where the mask is set
+    pand             m11, m5
+    pand             m12, m6
+    pand              %1, m7
+    por              m10, m11                    ; merge the four masked offsets
+    por              m12, %1
+    por              m10, m12
+    paddw             %2, m10                    ; pixel += selected offset (0 if no band matched)
+%else ; ARCH_X86_32
+    pcmpeqw           m4, %1, [rsp+MMSIZE*0]     ; same as above, with band indices and offsets read from the stack slots written by the INIT macro
+    pcmpeqw           m5, %1, [rsp+MMSIZE*1]
+    pcmpeqw           m6, %1, [rsp+MMSIZE*2]
+    pcmpeqw           %1, [rsp+MMSIZE*3]
+    pand              m4, [rsp+MMSIZE*4]
+    pand              m5, [rsp+MMSIZE*5]
+    pand              m6, [rsp+MMSIZE*6]
+    pand              %1, m7                     ; the fourth offset is still in m7
+    por               m4, m5
+    por               m6, %1
+    por               m4, m6
+    paddw             %2, m4
+%endif ; ARCH
+%endmacro
+
+;void ff_hevc_sao_band_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+;                                             int16_t *sao_offset_val, int sao_left_class, int width, int height);
+%macro HEVC_SAO_BAND_FILTER 2 ; %1 = block width in pixels, %2 = number of full mmsize-byte chunks processed per row
+cglobal hevc_sao_band_filter_%1_8, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
+    HEVC_SAO_BAND_FILTER_INIT
+
+align 16
+.loop:
+%if %1 == 8
+    movq              m8, [srcq]                 ; 8 source pixels
+    punpcklbw         m8, m14                    ; widen bytes to words (m14 = 0)
+    HEVC_SAO_BAND_FILTER_COMPUTE m9, m8
+    packuswb          m8, m14                    ; back to bytes with unsigned saturation
+    movq          [dstq], m8
+%endif ; %1 == 8
+
+%assign i 0
+%rep %2
+    mova             m13, [srcq + i]             ; mmsize source pixels
+    punpcklbw         m8, m13, m14               ; low half widened to words
+    HEVC_SAO_BAND_FILTER_COMPUTE m9,  m8
+    punpckhbw        m13, m14                    ; high half widened to words
+    HEVC_SAO_BAND_FILTER_COMPUTE m9, m13
+    packuswb          m8, m13                    ; repack both halves to bytes with unsigned saturation
+    mova      [dstq + i], m8
+%assign i i+mmsize
+%endrep
+
+%if %1 == 48
+INIT_XMM cpuname
+
+    mova             m13, [srcq + i]             ; remaining 16 pixels of the 48-wide row, using xmm registers
+    punpcklbw         m8, m13, m14
+    HEVC_SAO_BAND_FILTER_COMPUTE m9,  m8
+    punpckhbw        m13, m14
+    HEVC_SAO_BAND_FILTER_COMPUTE m9, m13
+    packuswb          m8, m13
+    mova      [dstq + i], m8
+%if cpuflag(avx2)
+INIT_YMM cpuname
+%endif
+%endif ; %1 == 48
+
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    REP_RET
+%endmacro
+
+
+%macro HEVC_SAO_BAND_FILTER_FUNCS 0 ; instantiate all five widths for a 16-byte (xmm) instruction set
+HEVC_SAO_BAND_FILTER  8, 0                       ; handled entirely by the %1 == 8 path
+HEVC_SAO_BAND_FILTER 16, 1
+HEVC_SAO_BAND_FILTER 32, 2
+HEVC_SAO_BAND_FILTER 48, 2                       ; 2 * 16 bytes in the loop + 16 bytes in the %1 == 48 tail
+HEVC_SAO_BAND_FILTER 64, 4
+%endmacro
+
+INIT_XMM sse2
+HEVC_SAO_BAND_FILTER_FUNCS
+INIT_XMM avx
+HEVC_SAO_BAND_FILTER_FUNCS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+HEVC_SAO_BAND_FILTER  8, 0                       ; widths 8 and 16 use xmm registers
+HEVC_SAO_BAND_FILTER 16, 1
+INIT_YMM avx2
+HEVC_SAO_BAND_FILTER 32, 1                       ; one 32-byte chunk per row
+HEVC_SAO_BAND_FILTER 48, 1                       ; 32 bytes in the loop + 16 bytes in the xmm tail
+HEVC_SAO_BAND_FILTER 64, 2
+%endif
+
+;******************************************************************************
+;SAO Edge Filter
+;******************************************************************************
+
+%define MAX_PB_SIZE  64
+%define PADDING_SIZE 32 ; AV_INPUT_BUFFER_PADDING_SIZE
+%define EDGE_SRCSTRIDE (2 * MAX_PB_SIZE + PADDING_SIZE) ; byte step between source rows in the edge filter; parenthesized so it expands safely inside larger expressions
+
+%macro HEVC_SAO_EDGE_FILTER_INIT 0 ; compute a_stride / b_stride: byte displacements from a pixel to its two neighbours for class eo
+%if WIN64
+    movsxd           eoq, dword eom              ; eo is read from its stack slot here
+%elif ARCH_X86_64
+    movsxd           eoq, eod                    ; sign-extend the 32-bit eo held in a register
+%else
+    mov              eoq, r4m
+%endif
+    lea            tmp2q, [pb_eo]                ; pb_eo: 4 signed bytes per eo value
+    movsx      a_strideq, byte [tmp2q+eoq*4+1]   ; row step of neighbour a
+    movsx      b_strideq, byte [tmp2q+eoq*4+3]   ; row step of neighbour b
+    imul       a_strideq, EDGE_SRCSTRIDE         ; rows -> bytes
+    imul       b_strideq, EDGE_SRCSTRIDE
+    movsx           tmpq, byte [tmp2q+eoq*4]     ; column step of neighbour a
+    add        a_strideq, tmpq
+    movsx           tmpq, byte [tmp2q+eoq*4+2]   ; column step of neighbour b
+    add        b_strideq, tmpq
+%endmacro
+
+%macro HEVC_SAO_EDGE_FILTER_COMPUTE 1 ; in: m1 = pixels, m2 / m3 = the two neighbours, m0 = offset table, m6 = [pb_2], m7 = [pb_1]; out: m3 = filtered bytes; %1 = block width
+    pminub            m4, m1, m2                 ; min(pixel, neighbour a)
+    pminub            m5, m1, m3                 ; min(pixel, neighbour b)
+    pcmpeqb           m2, m4                     ; -1 where a <= pixel
+    pcmpeqb           m3, m5                     ; -1 where b <= pixel
+    pcmpeqb           m4, m1                     ; -1 where pixel <= a
+    pcmpeqb           m5, m1                     ; -1 where pixel <= b
+    psubb             m4, m2                     ; -1 / 0 / +1 for pixel < a / == a / > a
+    psubb             m5, m3                     ; same against neighbour b
+    paddb             m4, m6                     ; add the m6 constant to form the table index
+    paddb             m4, m5
+
+    pshufb            m2, m0, m4                 ; per-pixel offset looked up in m0 by that index
+%if %1 > 8
+    punpckhbw         m5, m7, m1                 ; interleave the m7 constant with pixels (high half)
+    punpckhbw         m4, m2, m7                 ; interleave offsets with the m7 constant (high half)
+    punpcklbw         m3, m7, m1                 ; same for the low half
+    punpcklbw         m2, m7
+    pmaddubsw         m5, m4                     ; per word: m7 byte * offset + pixel * m7 byte
+    pmaddubsw         m3, m2
+    packuswb          m3, m5                     ; back to bytes with unsigned saturation
+%else
+    punpcklbw         m3, m7, m1                 ; 8-wide case: only the low half is needed
+    punpcklbw         m2, m7
+    pmaddubsw         m3, m2
+    packuswb          m3, m3
+%endif
+%endmacro
+
+;void ff_hevc_sao_edge_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
+;                                             int eo, int width, int height);
+%macro HEVC_SAO_EDGE_FILTER 2-3 ; %1 = block width in pixels, %2 = full mmsize-byte chunks per row, %3 = store suffix for mov%3 (a = aligned, u = unaligned)
+%if ARCH_X86_64
+cglobal hevc_sao_edge_filter_%1_8, 4, 9, 8, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
+%define tmp2q heightq
+    HEVC_SAO_EDGE_FILTER_INIT
+    mov          heightd, r6m                    ; height is loaded only after INIT, which uses heightq as tmp2q
+
+%else ; ARCH_X86_32
+cglobal hevc_sao_edge_filter_%1_8, 1, 6, 8, dst, src, dststride, a_stride, b_stride, height
+%define eoq   srcq
+%define tmpq  heightq
+%define tmp2q dststrideq
+%define offsetq heightq
+    HEVC_SAO_EDGE_FILTER_INIT
+    mov             srcq, srcm                   ; reload the arguments whose registers INIT borrowed as scratch
+    mov          offsetq, r3m
+    mov       dststrideq, dststridem
+%endif ; ARCH
+
+%if mmsize > 16
+    vbroadcasti128    m0, [offsetq]              ; same 16 bytes of offsets in both 128-bit lanes
+%else
+    movu              m0, [offsetq]
+%endif
+    mova              m1, [pb_edge_shuffle]
+    packsswb          m0, m0                     ; 16-bit offsets -> signed bytes
+    mova              m7, [pb_1]
+    pshufb            m0, m1                     ; reorder the offsets with pb_edge_shuffle to build the lookup table
+    mova              m6, [pb_2]
+%if ARCH_X86_32
+    mov          heightd, r6m                    ; on x86_32 heightq doubled as offsetq until here
+%endif
+
+align 16
+.loop:
+
+%if %1 == 8
+    movq              m1, [srcq]                 ; 8 pixels and their two neighbours
+    movq              m2, [srcq + a_strideq]
+    movq              m3, [srcq + b_strideq]
+    HEVC_SAO_EDGE_FILTER_COMPUTE %1
+    movq          [dstq], m3
+%endif
+
+%assign i 0
+%rep %2
+    mova              m1, [srcq + i]             ; mmsize pixels
+    movu              m2, [srcq + a_strideq + i] ; neighbours, loaded unaligned
+    movu              m3, [srcq + b_strideq + i]
+    HEVC_SAO_EDGE_FILTER_COMPUTE %1
+    mov%3     [dstq + i], m3
+%assign i i+mmsize
+%endrep
+
+%if %1 == 48
+INIT_XMM cpuname
+
+    mova              m1, [srcq + i]             ; remaining 16 pixels of the 48-wide row, using xmm registers
+    movu              m2, [srcq + a_strideq + i]
+    movu              m3, [srcq + b_strideq + i]
+    HEVC_SAO_EDGE_FILTER_COMPUTE %1
+    mova      [dstq + i], m3
+%if cpuflag(avx2)
+INIT_YMM cpuname
+%endif
+%endif
+
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, EDGE_SRCSTRIDE         ; src advances by the fixed EDGE_SRCSTRIDE, not by an argument
+    dec          heightd
+    jg .loop                                     ; loop while rows remain
+    RET
+%endmacro
+
+INIT_XMM ssse3
+HEVC_SAO_EDGE_FILTER  8, 0                       ; handled entirely by the %1 == 8 path; %3 is unused because %rep 0 emits nothing
+HEVC_SAO_EDGE_FILTER 16, 1, a
+HEVC_SAO_EDGE_FILTER 32, 2, a
+HEVC_SAO_EDGE_FILTER 48, 2, a                    ; 2 * 16 bytes in the loop + 16 bytes in the %1 == 48 tail
+HEVC_SAO_EDGE_FILTER 64, 4, a
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+HEVC_SAO_EDGE_FILTER 32, 1, a
+HEVC_SAO_EDGE_FILTER 48, 1, u                    ; the 32-byte store is unaligned (movu); the 16-byte tail store is aligned
+HEVC_SAO_EDGE_FILTER 64, 2, a
+%endif
diff --git a/libavcodec/x86/hevc_sao_10bit.asm b/libavcodec/x86/hevc_sao_10bit.asm
new file mode 100644
index 0000000..f81e2d5
--- /dev/null
+++ b/libavcodec/x86/hevc_sao_10bit.asm
@@ -0,0 +1,370 @@
+;******************************************************************************
+;* SIMD optimized SAO functions for HEVC 10/12bit decoding
+;*
+;* Copyright (c) 2013 Pierre-Edouard LEPERE
+;* Copyright (c) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+; pw_m2 is the -2 comparison value used by the edge classification.
+; pw_mask10 / pw_mask12 are selected with "pw_mask %+ <depth>" and passed to
+; CLIPW as the upper bound.
+pw_m2:     times 16 dw -2
+pw_mask10: times 16 dw 0x03FF
+pw_mask12: times 16 dw 0x0FFF
+; Four signed bytes per eo value, read by HEVC_SAO_EDGE_FILTER_INIT as
+; [pb_eo + eo*4 + 0..3]: bytes 0 and 2 are added directly to a_stride and
+; b_stride, bytes 1 and 3 are first multiplied by EDGE_SRCSTRIDE >> 1.
+pb_eo:              db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
+; Word constants defined outside this file.
+cextern pw_m1
+cextern pw_1
+cextern pw_2
+
+SECTION .text
+
+;******************************************************************************
+;SAO Band Filter
+;******************************************************************************
+
+; HEVC_SAO_BAND_FILTER_INIT depth
+; Prologue shared by all band filter functions.
+;  - Builds four consecutive band indices starting at (left & 31), each wrapped
+;    with "& 31", and broadcasts them to every word of m0-m3.
+;  - Broadcasts the four 16-bit offsets stored at offsetq+2 .. offsetq+8 to
+;    m4-m7.
+;  - x86-64: loads the clip mask pw_mask<depth> into m13 and zero into m14.
+;  - x86-32: spills m0-m6 to the stack (m7 stays in a register), then reuses
+;    m1 for the mask and m0 for zero, and aliases m13/m14/m9/m8 onto
+;    m1/m0/m2/m3 so the loop body can use the same register names.
+;  - Redefines the argument names (left becomes height) and loads height
+;    from r7m.
+%macro HEVC_SAO_BAND_FILTER_INIT 1
+    and            leftq, 31
+    movd             xm0, leftd
+    add            leftq, 1
+    and            leftq, 31
+    movd             xm1, leftd
+    add            leftq, 1
+    and            leftq, 31
+    movd             xm2, leftd
+    add            leftq, 1
+    and            leftq, 31
+    movd             xm3, leftd
+
+    SPLATW            m0, xm0
+    SPLATW            m1, xm1
+    SPLATW            m2, xm2
+    SPLATW            m3, xm3
+%if mmsize > 16
+    SPLATW            m4, [offsetq + 2]
+    SPLATW            m5, [offsetq + 4]
+    SPLATW            m6, [offsetq + 6]
+    SPLATW            m7, [offsetq + 8]
+%else
+    ; one 64-bit load fetches all four offsets; m7 is splatted last because it
+    ; is also the source register
+    movq              m7, [offsetq + 2]
+    SPLATW            m4, m7, 0
+    SPLATW            m5, m7, 1
+    SPLATW            m6, m7, 2
+    SPLATW            m7, m7, 3
+%endif
+
+%if ARCH_X86_64
+    mova             m13, [pw_mask %+ %1]
+    pxor             m14, m14
+
+%else ; ARCH_X86_32
+    mova  [rsp+mmsize*0], m0
+    mova  [rsp+mmsize*1], m1
+    mova  [rsp+mmsize*2], m2
+    mova  [rsp+mmsize*3], m3
+    mova  [rsp+mmsize*4], m4
+    mova  [rsp+mmsize*5], m5
+    mova  [rsp+mmsize*6], m6
+    mova              m1, [pw_mask %+ %1]
+    pxor              m0, m0
+    %define m14 m0
+    %define m13 m1
+    %define  m9 m2
+    %define  m8 m3
+%endif ; ARCH
+DEFINE_ARGS dst, src, dststride, srcstride, offset, height
+    mov          heightd, r7m
+%endmacro
+
+;void ff_hevc_sao_band_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+;                                                   int16_t *sao_offset_val, int sao_left_class, int width, int height);
+; HEVC_SAO_BAND_FILTER depth, width, reps
+; %1 = bit depth (used for the shift amount and to pick pw_mask<depth>),
+; %2 = width, used only in the function name,
+; %3 = number of mmsize-byte chunks processed per row.
+; Reserves 7*mmsize bytes of stack on x86-32 only (the factor ARCH_X86_32 is
+; 0 on x86-64) for the values spilled by HEVC_SAO_BAND_FILTER_INIT.
+%macro HEVC_SAO_BAND_FILTER 3
+cglobal hevc_sao_band_filter_%2_%1, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
+    HEVC_SAO_BAND_FILTER_INIT %1
+
+align 16
+.loop:
+
+%assign i 0
+%assign j 0
+%rep %3
+; k and l swap between 8 and 9 on every chunk, so consecutive chunks use
+; different registers for the samples (m<k>) and their band index (m<l>)
+%assign k 8+(j&1)
+%assign l 9-(j&1)
+    mova          m %+ k, [srcq + i]
+    ; band index = sample >> (depth - 5)
+    psraw         m %+ l, m %+ k, %1-5
+%if ARCH_X86_64
+    ; compare the band index against the four band values, keep the matching
+    ; offset for each, and OR the results together
+    pcmpeqw          m10, m %+ l, m0
+    pcmpeqw          m11, m %+ l, m1
+    pcmpeqw          m12, m %+ l, m2
+    pcmpeqw       m %+ l, m3
+    pand             m10, m4
+    pand             m11, m5
+    pand             m12, m6
+    pand          m %+ l, m7
+    por              m10, m11
+    por              m12, m %+ l
+    por              m10, m12
+    paddw         m %+ k, m10
+%else ; ARCH_X86_32
+    ; same computation with the band values and first three offsets read
+    ; back from the stack; m4-m6 are free to serve as temporaries
+    pcmpeqw           m4, m %+ l, [rsp+mmsize*0]
+    pcmpeqw           m5, m %+ l, [rsp+mmsize*1]
+    pcmpeqw           m6, m %+ l, [rsp+mmsize*2]
+    pcmpeqw       m %+ l, [rsp+mmsize*3]
+    pand              m4, [rsp+mmsize*4]
+    pand              m5, [rsp+mmsize*5]
+    pand              m6, [rsp+mmsize*6]
+    pand          m %+ l, m7
+    por               m4, m5
+    por               m6, m %+ l
+    por               m4, m6
+    paddw         m %+ k, m4
+%endif ; ARCH
+    ; clip with m14 (zero) and m13 (pw_mask<depth>) as bounds
+    CLIPW             m %+ k, m14, m13
+    mova      [dstq + i], m %+ k
+%assign i i+mmsize
+%assign j j+1
+%endrep
+
+    add             dstq, dststrideq
+    add             srcq, srcstrideq
+    dec          heightd
+    jg .loop
+    REP_RET
+%endmacro
+
+; Emits the band filter for every width at both bit depths using XMM-sized
+; chunk counts (third argument = width * 2 bytes / 16).
+%macro HEVC_SAO_BAND_FILTER_FUNCS 0
+HEVC_SAO_BAND_FILTER 10,  8, 1
+HEVC_SAO_BAND_FILTER 10, 16, 2
+HEVC_SAO_BAND_FILTER 10, 32, 4
+HEVC_SAO_BAND_FILTER 10, 48, 6
+HEVC_SAO_BAND_FILTER 10, 64, 8
+
+HEVC_SAO_BAND_FILTER 12,  8, 1
+HEVC_SAO_BAND_FILTER 12, 16, 2
+HEVC_SAO_BAND_FILTER 12, 32, 4
+HEVC_SAO_BAND_FILTER 12, 48, 6
+HEVC_SAO_BAND_FILTER 12, 64, 8
+%endmacro
+
+INIT_XMM sse2
+HEVC_SAO_BAND_FILTER_FUNCS
+INIT_XMM avx
+HEVC_SAO_BAND_FILTER_FUNCS
+
+; AVX2: width 8 is still built with XMM registers (one chunk per row); all
+; wider blocks use YMM registers, so the chunk counts are halved.
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+HEVC_SAO_BAND_FILTER 10,  8, 1
+INIT_YMM avx2
+HEVC_SAO_BAND_FILTER 10, 16, 1
+HEVC_SAO_BAND_FILTER 10, 32, 2
+HEVC_SAO_BAND_FILTER 10, 48, 3
+HEVC_SAO_BAND_FILTER 10, 64, 4
+
+INIT_XMM avx2
+HEVC_SAO_BAND_FILTER 12,  8, 1
+INIT_YMM avx2
+HEVC_SAO_BAND_FILTER 12, 16, 1
+HEVC_SAO_BAND_FILTER 12, 32, 2
+HEVC_SAO_BAND_FILTER 12, 48, 3
+HEVC_SAO_BAND_FILTER 12, 64, 4
+%endif
+
+;******************************************************************************
+;SAO Edge Filter
+;******************************************************************************
+
+; Geometry of the edge filter's source buffer.
+; EDGE_SRCSTRIDE is the value added to srcq after every row; the stride
+; tables are built from EDGE_SRCSTRIDE >> 1.
+; The expansion is parenthesized so that it stays a single value whatever
+; operator follows it (e.g. "EDGE_SRCSTRIDE >> 1" or "EDGE_SRCSTRIDE * n").
+%define MAX_PB_SIZE  64
+%define PADDING_SIZE 32 ; AV_INPUT_BUFFER_PADDING_SIZE
+%define EDGE_SRCSTRIDE (2 * MAX_PB_SIZE + PADDING_SIZE)
+
+; PMINUW dst, a, b, tmp
+; Unsigned 16-bit minimum: dst = min(a, b).
+; With SSE4 this is a single pminuw and tmp is left untouched. Otherwise it is
+; computed as a - saturating_sub(a, b), which clobbers tmp.
+%macro PMINUW 4
+%if cpuflag(sse4)
+    pminuw            %1, %2, %3
+%else
+    psubusw           %4, %2, %3
+    psubw             %1, %2, %4
+%endif
+%endmacro
+
+; HEVC_SAO_EDGE_FILTER_INIT
+; Loads eo (sign-extended on x86-64, read from r4m on x86-32) and uses it to
+; index pb_eo, producing
+;   a_stride = pb_eo[eo*4+1] * (EDGE_SRCSTRIDE >> 1) + pb_eo[eo*4+0]
+;   b_stride = pb_eo[eo*4+3] * (EDGE_SRCSTRIDE >> 1) + pb_eo[eo*4+2]
+; HEVC_SAO_EDGE_FILTER doubles both values after invoking this macro.
+; The invoking macro must provide eoq, tmpq, tmp2q, a_strideq and b_strideq
+; (as real arguments or %define aliases); tmpq and tmp2q are clobbered.
+%macro HEVC_SAO_EDGE_FILTER_INIT 0
+%if WIN64
+    movsxd           eoq, dword eom
+%elif ARCH_X86_64
+    movsxd           eoq, eod
+%else
+    mov              eoq, r4m
+%endif
+    lea            tmp2q, [pb_eo]
+    movsx      a_strideq, byte [tmp2q+eoq*4+1]
+    movsx      b_strideq, byte [tmp2q+eoq*4+3]
+    imul       a_strideq, EDGE_SRCSTRIDE >> 1
+    imul       b_strideq, EDGE_SRCSTRIDE >> 1
+    movsx           tmpq, byte [tmp2q+eoq*4]
+    add        a_strideq, tmpq
+    movsx           tmpq, byte [tmp2q+eoq*4+2]
+    add        b_strideq, tmpq
+%endmacro
+
+;void ff_hevc_sao_edge_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
+;                                                   int eo, int width, int height);
+; HEVC_SAO_EDGE_FILTER depth, width, reps
+; %1 = bit depth (picks pw_mask<depth> for the final clip),
+; %2 = width, used only in the function name,
+; %3 = number of mmsize-byte chunks processed per row.
+; For every 16-bit sample the loop computes sign(cur - a) + sign(cur - b),
+; where a and b are the samples at srcq+a_stride and srcq+b_stride, adds the
+; offset selected by that sum and clips the result.
+; Offset selection: sum -2 -> offset[1], -1 -> offset[2], 0 -> offset[0],
+; 1 -> offset[3], 2 -> offset[4] (broadcast into m8, m9, m10, m11, m12).
+%macro HEVC_SAO_EDGE_FILTER 3
+%if ARCH_X86_64
+cglobal hevc_sao_edge_filter_%2_%1, 4, 9, 16, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
+; the height register doubles as the pb_eo pointer during INIT; height itself
+; is loaded only afterwards
+%define tmp2q heightq
+    HEVC_SAO_EDGE_FILTER_INIT
+    mov          heightd, r6m
+    ; the strides were computed in 16-bit units; double them
+    add        a_strideq, a_strideq
+    add        b_strideq, b_strideq
+
+%else ; ARCH_X86_32
+cglobal hevc_sao_edge_filter_%2_%1, 1, 6, 8, 5*mmsize, dst, src, dststride, a_stride, b_stride, height
+; only six GPRs are available: eo, tmp, tmp2 and offset borrow the src,
+; height and dststride registers, which are (re)loaded once INIT is done.
+; m8-m12 are aliased onto m1-m5 and spilled to the stack before the loop.
+%define eoq   srcq
+%define tmpq  heightq
+%define tmp2q dststrideq
+%define offsetq heightq
+%define m8 m1
+%define m9 m2
+%define m10 m3
+%define m11 m4
+%define m12 m5
+    HEVC_SAO_EDGE_FILTER_INIT
+    mov             srcq, srcm
+    mov          offsetq, r3m
+    mov       dststrideq, dststridem
+    add        a_strideq, a_strideq
+    add        b_strideq, b_strideq
+
+%endif ; ARCH
+
+%if mmsize > 16
+    SPLATW            m8, [offsetq+2]
+    SPLATW            m9, [offsetq+4]
+    SPLATW           m10, [offsetq+0]
+    SPLATW           m11, [offsetq+6]
+    SPLATW           m12, [offsetq+8]
+%else
+    ; m10 receives offset[0..3], m12 receives offset[3..4]; both are splatted
+    ; last because they are also source registers
+    movq             m10, [offsetq+0]
+    movd             m12, [offsetq+6]
+    SPLATW            m8, xm10, 1
+    SPLATW            m9, xm10, 2
+    SPLATW           m10, xm10, 0
+    SPLATW           m11, xm12, 0
+    SPLATW           m12, xm12, 1
+%endif
+    pxor              m0, m0
+%if ARCH_X86_64
+    mova             m13, [pw_m1]
+    mova             m14, [pw_1]
+    mova             m15, [pw_2]
+%else
+    ; offsetq aliases heightq, so height can only be loaded now that the
+    ; offsets have been read
+    mov          heightd, r6m
+    mova  [rsp+mmsize*0], m8
+    mova  [rsp+mmsize*1], m9
+    mova  [rsp+mmsize*2], m10
+    mova  [rsp+mmsize*3], m11
+    mova  [rsp+mmsize*4], m12
+%endif
+
+align 16
+.loop:
+
+%assign i 0
+%rep %3
+    ; m1 = current samples, m2 / m3 = the two neighbours
+    mova              m1, [srcq + i]
+    movu              m2, [srcq+a_strideq + i]
+    movu              m3, [srcq+b_strideq + i]
+    ; m4 = sign(cur - a), m5 = sign(cur - b), each -1, 0 or 1 per word
+    PMINUW            m4, m1, m2, m6
+    PMINUW            m5, m1, m3, m7
+    pcmpeqw           m2, m4
+    pcmpeqw           m3, m5
+    pcmpeqw           m4, m1
+    pcmpeqw           m5, m1
+    psubw             m4, m2
+    psubw             m5, m3
+
+    ; m4 = sum of both signs; build one mask per possible sum and AND it with
+    ; the matching broadcast offset
+    paddw             m4, m5
+    pcmpeqw           m2, m4, [pw_m2]
+%if ARCH_X86_64
+    pcmpeqw           m3, m4, m13
+    pcmpeqw           m5, m4, m0
+    pcmpeqw           m6, m4, m14
+    pcmpeqw           m7, m4, m15
+    pand              m2, m8
+    pand              m3, m9
+    pand              m5, m10
+    pand              m6, m11
+    pand              m7, m12
+%else
+    pcmpeqw           m3, m4, [pw_m1]
+    pcmpeqw           m5, m4, m0
+    pcmpeqw           m6, m4, [pw_1]
+    pcmpeqw           m7, m4, [pw_2]
+    pand              m2, [rsp+mmsize*0]
+    pand              m3, [rsp+mmsize*1]
+    pand              m5, [rsp+mmsize*2]
+    pand              m6, [rsp+mmsize*3]
+    pand              m7, [rsp+mmsize*4]
+%endif
+    ; sum the masked offsets, add the sample, then clip with m0 (zero) and
+    ; pw_mask<depth> as bounds
+    paddw             m2, m3
+    paddw             m5, m6
+    paddw             m2, m7
+    paddw             m2, m1
+    paddw             m2, m5
+    CLIPW             m2, m0, [pw_mask %+ %1]
+    mova      [dstq + i], m2
+%assign i i+mmsize
+%endrep
+
+    add             dstq, dststrideq
+    add             srcq, EDGE_SRCSTRIDE
+    dec          heightd
+    jg .loop
+    RET
+%endmacro
+
+; Edge filter instantiations: arguments are depth, width, chunks per row.
+; With XMM registers the chunk count is width * 2 bytes / 16.
+INIT_XMM sse2
+HEVC_SAO_EDGE_FILTER 10,  8, 1
+HEVC_SAO_EDGE_FILTER 10, 16, 2
+HEVC_SAO_EDGE_FILTER 10, 32, 4
+HEVC_SAO_EDGE_FILTER 10, 48, 6
+HEVC_SAO_EDGE_FILTER 10, 64, 8
+
+HEVC_SAO_EDGE_FILTER 12,  8, 1
+HEVC_SAO_EDGE_FILTER 12, 16, 2
+HEVC_SAO_EDGE_FILTER 12, 32, 4
+HEVC_SAO_EDGE_FILTER 12, 48, 6
+HEVC_SAO_EDGE_FILTER 12, 64, 8
+
+; AVX2: width 8 stays on XMM registers; wider blocks use YMM registers and
+; therefore half as many chunks per row.
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+HEVC_SAO_EDGE_FILTER 10,  8, 1
+INIT_YMM avx2
+HEVC_SAO_EDGE_FILTER 10, 16, 1
+HEVC_SAO_EDGE_FILTER 10, 32, 2
+HEVC_SAO_EDGE_FILTER 10, 48, 3
+HEVC_SAO_EDGE_FILTER 10, 64, 4
+
+INIT_XMM avx2
+HEVC_SAO_EDGE_FILTER 12,  8, 1
+INIT_YMM avx2
+HEVC_SAO_EDGE_FILTER 12, 16, 1
+HEVC_SAO_EDGE_FILTER 12, 32, 2
+HEVC_SAO_EDGE_FILTER 12, 48, 3
+HEVC_SAO_EDGE_FILTER 12, 64, 4
+%endif
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
new file mode 100644
index 0000000..ad8168f
--- /dev/null
+++ b/libavcodec/x86/hevcdsp.h
@@ -0,0 +1,261 @@
+/*
+ * HEVC video decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
+ *
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_HEVCDSP_H
+#define AVCODEC_X86_HEVCDSP_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+
+/* Expands to the prototype of ff_hevc_idct<size>_dc_add_<bitd>_<opt>().
+ * The expansion has no trailing semicolon; the user must supply it. */
+#define idct_dc_proto(size, bitd, opt) \
+                void ff_hevc_idct##size##_dc_add_##bitd##_##opt(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+
+/* Assigns entry [idx1][idx2][idx3] of the five tables dst, dst_bi, dst_uni,
+ * dst_uni_w and dst_bi_w to the matching
+ * ff_hevc_put_hevc[_bi|_uni|_uni_w|_bi_w]_<name>_<D>_<opt> function.
+ * Expands to five statements (the last one without a semicolon), so it must
+ * not be used as the unbraced body of an if/else/for. */
+#define PEL_LINK(dst, idx1, idx2, idx3, name, D, opt) \
+dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \
+dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \
+dst ## _uni[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt; \
+dst ## _uni_w[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt; \
+dst ## _bi_w[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt
+
+
+/* Declares the five variants (plain, bi, uni, uni_w, bi_w) of
+ * ff_hevc_put_hevc_*_<name>_<D>_<opt>, i.e. exactly the symbols that PEL_LINK
+ * references. The last declaration has no trailing semicolon. */
+#define PEL_PROTOTYPE(name, D, opt) \
+void ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); \
+void ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); \
+void ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width); \
+void ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width); \
+void ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, int denom, int wx0, int wx1, int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+
+
+///////////////////////////////////////////////////////////////////////////////
+// MC functions
+///////////////////////////////////////////////////////////////////////////////
+
+/* Declares PEL_PROTOTYPE(fname<W>, bitd, opt) for the widths
+ * W = 4, 6, 8, 12, 16, 24, 32, 48, 64. */
+#define EPEL_PROTOTYPES(fname, bitd, opt) \
+        PEL_PROTOTYPE(fname##4,  bitd, opt); \
+        PEL_PROTOTYPE(fname##6,  bitd, opt); \
+        PEL_PROTOTYPE(fname##8,  bitd, opt); \
+        PEL_PROTOTYPE(fname##12, bitd, opt); \
+        PEL_PROTOTYPE(fname##16, bitd, opt); \
+        PEL_PROTOTYPE(fname##24, bitd, opt); \
+        PEL_PROTOTYPE(fname##32, bitd, opt); \
+        PEL_PROTOTYPE(fname##48, bitd, opt); \
+        PEL_PROTOTYPE(fname##64, bitd, opt)
+
+/* Same as EPEL_PROTOTYPES but without width 6:
+ * W = 4, 8, 12, 16, 24, 32, 48, 64. */
+#define QPEL_PROTOTYPES(fname, bitd, opt) \
+        PEL_PROTOTYPE(fname##4,  bitd, opt); \
+        PEL_PROTOTYPE(fname##8,  bitd, opt); \
+        PEL_PROTOTYPE(fname##12, bitd, opt); \
+        PEL_PROTOTYPE(fname##16, bitd, opt); \
+        PEL_PROTOTYPE(fname##24, bitd, opt); \
+        PEL_PROTOTYPE(fname##32, bitd, opt); \
+        PEL_PROTOTYPE(fname##48, bitd, opt); \
+        PEL_PROTOTYPE(fname##64, bitd, opt)
+
+/* Declares ff_hevc_put_hevc_uni_w<width>_<bitd>_<opt>() and
+ * ff_hevc_put_hevc_bi_w<width>_<bitd>_<opt>(). Unlike the PEL_PROTOTYPE
+ * variants these take int16_t source pointers and no mx/my/width arguments.
+ * The last declaration has no trailing semicolon. */
+#define WEIGHTING_PROTOTYPE(width, bitd, opt) \
+void ff_hevc_put_hevc_uni_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int height, int denom,  int _wx, int _ox); \
+void ff_hevc_put_hevc_bi_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int16_t *_src2, int height, int denom,  int _wx0,  int _wx1, int _ox0, int _ox1)
+
+/* Declares WEIGHTING_PROTOTYPE for the widths
+ * 2, 4, 6, 8, 12, 16, 24, 32, 48, 64. */
+#define WEIGHTING_PROTOTYPES(bitd, opt) \
+        WEIGHTING_PROTOTYPE(2, bitd, opt); \
+        WEIGHTING_PROTOTYPE(4, bitd, opt); \
+        WEIGHTING_PROTOTYPE(6, bitd, opt); \
+        WEIGHTING_PROTOTYPE(8, bitd, opt); \
+        WEIGHTING_PROTOTYPE(12, bitd, opt); \
+        WEIGHTING_PROTOTYPE(16, bitd, opt); \
+        WEIGHTING_PROTOTYPE(24, bitd, opt); \
+        WEIGHTING_PROTOTYPE(32, bitd, opt); \
+        WEIGHTING_PROTOTYPE(48, bitd, opt); \
+        WEIGHTING_PROTOTYPE(64, bitd, opt)
+
+
+///////////////////////////////////////////////////////////////////////////////
+// QPEL_PIXELS EPEL_PIXELS
+///////////////////////////////////////////////////////////////////////////////
+/* SSE4: all five variants for every EPEL width, at 8, 10 and 12 bits. */
+EPEL_PROTOTYPES(pel_pixels ,  8, sse4);
+EPEL_PROTOTYPES(pel_pixels , 10, sse4);
+EPEL_PROTOTYPES(pel_pixels , 12, sse4);
+
+/* AVX2, plain variant only: widths 16-64 at 8 bits. */
+void ff_hevc_put_hevc_pel_pixels16_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels24_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels32_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels48_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels64_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+
+/* AVX2, plain variant only: widths 16-64 at 10 bits. */
+void ff_hevc_put_hevc_pel_pixels16_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels24_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels32_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels48_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels64_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+
+
+
+/* AVX2 uni variant. Only "_8_" symbols are declared; per the trailing
+ * comments, the 96 and 128 entries are the ones used for 10-bit content. */
+void ff_hevc_put_hevc_uni_pel_pixels32_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels48_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels64_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels96_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); //used for 10bit
+void ff_hevc_put_hevc_uni_pel_pixels128_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);//used for 10bit
+
+
+/* AVX2 bi variant: widths 16-64 at 8 bits, then at 10 bits. */
+void ff_hevc_put_hevc_bi_pel_pixels16_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels24_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels32_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels48_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels64_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+
+void ff_hevc_put_hevc_bi_pel_pixels16_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels24_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels32_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels48_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels64_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+
+///////////////////////////////////////////////////////////////////////////////
+// EPEL
+///////////////////////////////////////////////////////////////////////////////
+/* SSE4: h, v and hv filters for every EPEL width at 8, 10 and 12 bits. */
+EPEL_PROTOTYPES(epel_h ,  8, sse4);
+EPEL_PROTOTYPES(epel_h , 10, sse4);
+EPEL_PROTOTYPES(epel_h , 12, sse4);
+
+EPEL_PROTOTYPES(epel_v ,  8, sse4);
+EPEL_PROTOTYPES(epel_v , 10, sse4);
+EPEL_PROTOTYPES(epel_v , 12, sse4);
+
+EPEL_PROTOTYPES(epel_hv ,  8, sse4);
+EPEL_PROTOTYPES(epel_hv , 10, sse4);
+EPEL_PROTOTYPES(epel_hv , 12, sse4);
+
+/* AVX2: declared only for widths 16, 24, 32, 48, 64 at 8 and 10 bits. */
+PEL_PROTOTYPE(epel_h16, 8, avx2);
+PEL_PROTOTYPE(epel_h24, 8, avx2);
+PEL_PROTOTYPE(epel_h32, 8, avx2);
+PEL_PROTOTYPE(epel_h48, 8, avx2);
+PEL_PROTOTYPE(epel_h64, 8, avx2);
+
+PEL_PROTOTYPE(epel_h16,10, avx2);
+PEL_PROTOTYPE(epel_h24,10, avx2);
+PEL_PROTOTYPE(epel_h32,10, avx2);
+PEL_PROTOTYPE(epel_h48,10, avx2);
+PEL_PROTOTYPE(epel_h64,10, avx2);
+
+PEL_PROTOTYPE(epel_v16, 8, avx2);
+PEL_PROTOTYPE(epel_v24, 8, avx2);
+PEL_PROTOTYPE(epel_v32, 8, avx2);
+PEL_PROTOTYPE(epel_v48, 8, avx2);
+PEL_PROTOTYPE(epel_v64, 8, avx2);
+
+PEL_PROTOTYPE(epel_v16,10, avx2);
+PEL_PROTOTYPE(epel_v24,10, avx2);
+PEL_PROTOTYPE(epel_v32,10, avx2);
+PEL_PROTOTYPE(epel_v48,10, avx2);
+PEL_PROTOTYPE(epel_v64,10, avx2);
+
+PEL_PROTOTYPE(epel_hv16, 8, avx2);
+PEL_PROTOTYPE(epel_hv24, 8, avx2);
+PEL_PROTOTYPE(epel_hv32, 8, avx2);
+PEL_PROTOTYPE(epel_hv48, 8, avx2);
+PEL_PROTOTYPE(epel_hv64, 8, avx2);
+
+PEL_PROTOTYPE(epel_hv16,10, avx2);
+PEL_PROTOTYPE(epel_hv24,10, avx2);
+PEL_PROTOTYPE(epel_hv32,10, avx2);
+PEL_PROTOTYPE(epel_hv48,10, avx2);
+PEL_PROTOTYPE(epel_hv64,10, avx2);
+
+///////////////////////////////////////////////////////////////////////////////
+// QPEL
+///////////////////////////////////////////////////////////////////////////////
+/* SSE4: h, v and hv filters for every QPEL width at 8, 10 and 12 bits. */
+QPEL_PROTOTYPES(qpel_h ,  8, sse4);
+QPEL_PROTOTYPES(qpel_h , 10, sse4);
+QPEL_PROTOTYPES(qpel_h , 12, sse4);
+
+QPEL_PROTOTYPES(qpel_v,  8, sse4);
+QPEL_PROTOTYPES(qpel_v, 10, sse4);
+QPEL_PROTOTYPES(qpel_v, 12, sse4);
+
+QPEL_PROTOTYPES(qpel_hv,  8, sse4);
+QPEL_PROTOTYPES(qpel_hv, 10, sse4);
+QPEL_PROTOTYPES(qpel_hv, 12, sse4);
+
+/* AVX2: declared only for widths 16, 24, 32, 48, 64 at 8 and 10 bits. */
+PEL_PROTOTYPE(qpel_h16, 8, avx2);
+PEL_PROTOTYPE(qpel_h24, 8, avx2);
+PEL_PROTOTYPE(qpel_h32, 8, avx2);
+PEL_PROTOTYPE(qpel_h48, 8, avx2);
+PEL_PROTOTYPE(qpel_h64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_h16,10, avx2);
+PEL_PROTOTYPE(qpel_h24,10, avx2);
+PEL_PROTOTYPE(qpel_h32,10, avx2);
+PEL_PROTOTYPE(qpel_h48,10, avx2);
+PEL_PROTOTYPE(qpel_h64,10, avx2);
+
+PEL_PROTOTYPE(qpel_v16, 8, avx2);
+PEL_PROTOTYPE(qpel_v24, 8, avx2);
+PEL_PROTOTYPE(qpel_v32, 8, avx2);
+PEL_PROTOTYPE(qpel_v48, 8, avx2);
+PEL_PROTOTYPE(qpel_v64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_v16,10, avx2);
+PEL_PROTOTYPE(qpel_v24,10, avx2);
+PEL_PROTOTYPE(qpel_v32,10, avx2);
+PEL_PROTOTYPE(qpel_v48,10, avx2);
+PEL_PROTOTYPE(qpel_v64,10, avx2);
+
+PEL_PROTOTYPE(qpel_hv16, 8, avx2);
+PEL_PROTOTYPE(qpel_hv24, 8, avx2);
+PEL_PROTOTYPE(qpel_hv32, 8, avx2);
+PEL_PROTOTYPE(qpel_hv48, 8, avx2);
+PEL_PROTOTYPE(qpel_hv64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_hv16,10, avx2);
+PEL_PROTOTYPE(qpel_hv24,10, avx2);
+PEL_PROTOTYPE(qpel_hv32,10, avx2);
+PEL_PROTOTYPE(qpel_hv48,10, avx2);
+PEL_PROTOTYPE(qpel_hv64,10, avx2);
+
+/* uni_w / bi_w weighting functions: SSE4 only, all widths, 8/10/12 bits. */
+WEIGHTING_PROTOTYPES(8, sse4);
+WEIGHTING_PROTOTYPES(10, sse4);
+WEIGHTING_PROTOTYPES(12, sse4);
+
+///////////////////////////////////////////////////////////////////////////////
+// TRANSFORM_ADD
+///////////////////////////////////////////////////////////////////////////////
+/* Symbols are named ff_hevc_transform_add<size>_<depth>_<opt>; all share the
+ * signature (dst, coeffs, stride). The 8-bit group comes first, then the
+ * 10-bit group; not every size exists for every instruction set. */
+void ff_hevc_transform_add4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
+void ff_hevc_transform_add8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
+void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
+void ff_hevc_transform_add4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add32_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
+void ff_hevc_transform_add16_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add32_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
+#endif // AVCODEC_X86_HEVCDSP_H
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index fd22fc3..09eb06d 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -2,29 +2,31 @@
  * Copyright (c) 2013 Seppo Tomperi
  * Copyright (c) 2013 - 2014 Pierre-Edouard Lepere
  *
- * This file is part of Libav.
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "config.h"
-
 #include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
-
+#include "libavcodec/get_bits.h" /* required for hevcdsp.h GetBitContext */
 #include "libavcodec/hevcdsp.h"
+#include "libavcodec/x86/hevcdsp.h"
 
 #define LFC_FUNC(DIR, DEPTH, OPT) \
 void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, int *tc, uint8_t *no_p, uint8_t *no_q);
@@ -32,277 +34,1081 @@ void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT(uint8_t *pix,
 #define LFL_FUNC(DIR, DEPTH, OPT) \
 void ff_hevc_ ## DIR ## _loop_filter_luma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, int beta, int *tc, uint8_t *no_p, uint8_t *no_q);
 
-#define LFC_FUNCS(type, depth) \
-    LFC_FUNC(h, depth, sse2)   \
-    LFC_FUNC(v, depth, sse2)
-
-#define LFL_FUNCS(type, depth) \
-    LFL_FUNC(h, depth, ssse3)  \
-    LFL_FUNC(v, depth, ssse3)
-
-LFC_FUNCS(uint8_t, 8)
-LFC_FUNCS(uint8_t, 10)
-LFL_FUNCS(uint8_t, 8)
-LFL_FUNCS(uint8_t, 10)
-
-#define GET_PIXELS(width, depth, cf)                                                                      \
-void ff_hevc_get_pixels_ ## width ## _ ## depth ## _ ## cf(int16_t *dst, ptrdiff_t dststride,             \
-                                                           uint8_t *src, ptrdiff_t srcstride,             \
-                                                           int height, int mx, int my, int16_t *mcbuffer);
-
-GET_PIXELS(4,  8, sse2)
-GET_PIXELS(8,  8, sse2)
-GET_PIXELS(12, 8, sse2)
-GET_PIXELS(16, 8, sse2)
-GET_PIXELS(24, 8, sse2)
-GET_PIXELS(32, 8, sse2)
-GET_PIXELS(48, 8, sse2)
-GET_PIXELS(64, 8, sse2)
-
-GET_PIXELS(4,  10, sse2)
-GET_PIXELS(8,  10, sse2)
-GET_PIXELS(12, 10, sse2)
-GET_PIXELS(16, 10, sse2)
-GET_PIXELS(24, 10, sse2)
-GET_PIXELS(32, 10, sse2)
-GET_PIXELS(48, 10, sse2)
-GET_PIXELS(64, 10, sse2)
-
-/* those are independent of the bit depth, so declared separately */
-#define INTERP_HV_FUNC(width, cf)                                                         \
-void ff_hevc_qpel_hv_ ## width ## _ ## cf(int16_t *dst, ptrdiff_t dststride,              \
-                                          int16_t *src, ptrdiff_t srcstride,              \
-                                          int height, int mx, int my, int16_t *mcbuffer); \
-void ff_hevc_epel_hv_ ## width ## _ ## cf(int16_t *dst, ptrdiff_t dststride,              \
-                                          int16_t *src, ptrdiff_t srcstride,              \
-                                          int height, int mx, int my, int16_t *mcbuffer);
-
-INTERP_HV_FUNC(4,  avx)
-INTERP_HV_FUNC(8,  avx)
-INTERP_HV_FUNC(12, avx)
-INTERP_HV_FUNC(16, avx)
-INTERP_HV_FUNC(24, avx)
-INTERP_HV_FUNC(32, avx)
-INTERP_HV_FUNC(48, avx)
-INTERP_HV_FUNC(64, avx)
-
-#if ARCH_X86_64 && HAVE_AVX_EXTERNAL
-#define QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)                                                         \
-static void hevc_qpel_hv_ ## width ## _ ## depth ## _ ## cf_hv(int16_t *dst, ptrdiff_t dststride,             \
-                                                               uint8_t *src, ptrdiff_t srcstride,             \
-                                                               int height, int mx, int my, int16_t *mcbuffer) \
+#define LFC_FUNCS(type, depth, opt) /* chroma loop-filter prototypes; `type` is not used */ \
+    LFC_FUNC(h, depth, opt) /* ff_hevc_h_loop_filter_chroma_<depth>_<opt> */ \
+    LFC_FUNC(v, depth, opt) /* ff_hevc_v_loop_filter_chroma_<depth>_<opt> */
+
+#define LFL_FUNCS(type, depth, opt) /* luma loop-filter prototypes; `type` is not used */ \
+    LFL_FUNC(h, depth, opt) /* ff_hevc_h_loop_filter_luma_<depth>_<opt> */ \
+    LFL_FUNC(v, depth, opt) /* ff_hevc_v_loop_filter_luma_<depth>_<opt> */
+
+LFC_FUNCS(uint8_t,   8, sse2) /* chroma prototypes: sse2 and avx, each at 8/10/12 bit */
+LFC_FUNCS(uint8_t,  10, sse2)
+LFC_FUNCS(uint8_t,  12, sse2)
+LFC_FUNCS(uint8_t,   8, avx)
+LFC_FUNCS(uint8_t,  10, avx)
+LFC_FUNCS(uint8_t,  12, avx)
+LFL_FUNCS(uint8_t,   8, sse2) /* luma prototypes: sse2, ssse3 and avx, each at 8/10/12 bit */
+LFL_FUNCS(uint8_t,  10, sse2)
+LFL_FUNCS(uint8_t,  12, sse2)
+LFL_FUNCS(uint8_t,   8, ssse3)
+LFL_FUNCS(uint8_t,  10, ssse3)
+LFL_FUNCS(uint8_t,  12, ssse3)
+LFL_FUNCS(uint8_t,   8, avx)
+LFL_FUNCS(uint8_t,  10, avx)
+LFL_FUNCS(uint8_t,  12, avx)
+
+#define IDCT_FUNCS(W, opt) /* declares ff_hevc_idct<W>_dc_{8,10,12}_<opt>(int16_t *coeffs) */ \
+void ff_hevc_idct##W##_dc_8_##opt(int16_t *coeffs); \
+void ff_hevc_idct##W##_dc_10_##opt(int16_t *coeffs); \
+void ff_hevc_idct##W##_dc_12_##opt(int16_t *coeffs) /* no ';' here: each use below supplies it */
+
+IDCT_FUNCS(4x4,   mmxext);
+IDCT_FUNCS(8x8,   mmxext);
+IDCT_FUNCS(8x8,   sse2);
+IDCT_FUNCS(16x16, sse2);
+IDCT_FUNCS(32x32, sse2);
+IDCT_FUNCS(16x16, avx2);
+IDCT_FUNCS(32x32, avx2);
+
+#define mc_rep_func(name, bitd, step, W, opt) /* defines the W-wide put by repeating the step-wide version */ \
+void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst,                                                 \
+                                                uint8_t *_src, ptrdiff_t _srcstride, int height,                \
+                                                intptr_t mx, intptr_t my, int width)                            \
+{                                                                                                               \
+    int i;                                                                                                      \
+    uint8_t *src;                                                                                               \
+    int16_t *dst;                                                                                               \
+    for (i = 0; i < W; i += step) { /* one step-wide column strip per iteration */                              \
+        src  = _src + (i * ((bitd + 7) / 8)); /* byte offset: (bitd + 7) / 8 bytes per sample */                \
+        dst = _dst + i; /* int16_t element offset */                                                            \
+        ff_hevc_put_hevc_##name##step##_##bitd##_##opt(dst, src, _srcstride, height, mx, my, width);            \
+    }                                                                                                           \
+}
+#define mc_rep_uni_func(name, bitd, step, W, opt) /* defines the W-wide uni put by repeating the step-wide version */ \
+void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride,                        \
+                                                    uint8_t *_src, ptrdiff_t _srcstride, int height,            \
+                                                    intptr_t mx, intptr_t my, int width)                        \
+{                                                                                                               \
+    int i;                                                                                                      \
+    uint8_t *src;                                                                                               \
+    uint8_t *dst;                                                                                               \
+    for (i = 0; i < W; i += step) { /* one step-wide column strip per iteration */                              \
+        src = _src + (i * ((bitd + 7) / 8));                                                                    \
+        dst = _dst + (i * ((bitd + 7) / 8)); /* dst and src share one byte offset */                            \
+        ff_hevc_put_hevc_uni_##name##step##_##bitd##_##opt(dst, dststride, src, _srcstride,                     \
+                                                          height, mx, my, width);                               \
+    }                                                                                                           \
+}
+#define mc_rep_bi_func(name, bitd, step, W, opt) /* defines the W-wide bi put by repeating the step-wide version */ \
+void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, uint8_t *_src,          \
+                                                   ptrdiff_t _srcstride, int16_t* _src2,                        \
+                                                   int height, intptr_t mx, intptr_t my, int width)             \
+{                                                                                                               \
+    int i;                                                                                                      \
+    uint8_t  *src;                                                                                              \
+    uint8_t  *dst;                                                                                              \
+    int16_t  *src2;                                                                                             \
+    for (i = 0; i < W ; i += step) {                                                                            \
+        src  = _src + (i * ((bitd + 7) / 8));                                                                   \
+        dst  = _dst + (i * ((bitd + 7) / 8));                                                                   \
+        src2 = _src2 + i; /* int16_t element offset */                                                          \
+        ff_hevc_put_hevc_bi_##name##step##_##bitd##_##opt(dst, dststride, src, _srcstride, src2,                \
+                                                          height, mx, my, width);                               \
+    }                                                                                                           \
+}
+
+#define mc_rep_funcs(name, bitd, step, W, opt) /* plain, uni and bi W-wide wrappers in one go */ \
+    mc_rep_func(name, bitd, step, W, opt)            \
+    mc_rep_uni_func(name, bitd, step, W, opt)        \
+    mc_rep_bi_func(name, bitd, step, W, opt)
+
+#define mc_rep_func2(name, bitd, step1, step2, W, opt) /* W-wide put = step1-wide part + step2-wide part at column step1 */ \
+void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *dst,                                                  \
+                                                 uint8_t *src, ptrdiff_t _srcstride, int height,                \
+                                                 intptr_t mx, intptr_t my, int width)                           \
+{ /* W is used only in the function name */                                                                     \
+    ff_hevc_put_hevc_##name##step1##_##bitd##_##opt(dst, src, _srcstride, height, mx, my, width);               \
+    ff_hevc_put_hevc_##name##step2##_##bitd##_##opt(dst + step1, src + (step1 * ((bitd + 7) / 8)),              \
+                                                    _srcstride, height, mx, my, width);                         \
+}
+#define mc_rep_uni_func2(name, bitd, step1, step2, W, opt) /* uni form: step1-wide part + step2-wide part at column step1 */ \
+void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride,                         \
+                                                     uint8_t *src, ptrdiff_t _srcstride, int height,            \
+                                                     intptr_t mx, intptr_t my, int width)                       \
+{ /* dst and src of the second part share one byte offset */                                                    \
+    ff_hevc_put_hevc_uni_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, height, mx, my, width);\
+    ff_hevc_put_hevc_uni_##name##step2##_##bitd##_##opt(dst + (step1 * ((bitd + 7) / 8)), dststride,            \
+                                                        src + (step1 * ((bitd + 7) / 8)), _srcstride,           \
+                                                        height, mx, my, width);                                 \
+}
+#define mc_rep_bi_func2(name, bitd, step1, step2, W, opt) /* bi form: step1-wide part + step2-wide part at column step1 */ \
+void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,            \
+                                                    ptrdiff_t _srcstride, int16_t* src2,                        \
+                                                    int height, intptr_t mx, intptr_t my, int width)            \
+{ /* dst/src advance in bytes, src2 in int16_t elements */                                                      \
+    ff_hevc_put_hevc_bi_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, src2, height, mx, my, width);\
+    ff_hevc_put_hevc_bi_##name##step2##_##bitd##_##opt(dst + (step1 * ((bitd + 7) / 8)), dststride,             \
+                                                       src + (step1 * ((bitd + 7) / 8)), _srcstride,            \
+                                                       src2 + step1, height, mx, my, width);                    \
+}
+
+#define mc_rep_funcs2(name, bitd, step1, step2, W, opt) /* plain, uni and bi two-part wrappers in one go */ \
+    mc_rep_func2(name, bitd, step1, step2, W, opt)      \
+    mc_rep_uni_func2(name, bitd, step1, step2, W, opt)  \
+    mc_rep_bi_func2(name, bitd, step1, step2, W, opt)
+
+#if ARCH_X86_64 && HAVE_SSE4_EXTERNAL
+
+#define mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)                                       \
+void ff_hevc_put_hevc_##name##width1##_10_##opt1(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride,            \
+                                                 int height, intptr_t mx, intptr_t my, int width)             \
+/* width1 = width2 (opt1) + width3 (opt2); width4 = byte offset of the second part */                         \
+{                                                                                                             \
+    ff_hevc_put_hevc_##name##width2##_10_##opt1(dst, src, _srcstride, height, mx, my, width);                 \
+    ff_hevc_put_hevc_##name##width3##_10_##opt2(dst+ width2, src+ width4, _srcstride, height, mx, my, width); \
+}
+
+#define mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) /* bi form of mc_rep_mix_10 */     \
+void ff_hevc_put_hevc_bi_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,          \
+                                                    ptrdiff_t _srcstride, int16_t *src2,                      \
+                                                    int height, intptr_t mx, intptr_t my, int width)          \
+{                                                                                                             \
+    ff_hevc_put_hevc_bi_##name##width2##_10_##opt1(dst, dststride, src, _srcstride, src2,                     \
+                                                   height, mx, my, width);                                    \
+    ff_hevc_put_hevc_bi_##name##width3##_10_##opt2(dst+width4, dststride, src+width4, _srcstride, src2+width2,\
+                                                   height, mx, my, width); /* src2 steps in int16_t units */  \
+}
+
+#define mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) /* uni form of mc_rep_mix_10 */   \
+void ff_hevc_put_hevc_uni_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride,                       \
+                                                     uint8_t *src, ptrdiff_t _srcstride, int height,          \
+                                                     intptr_t mx, intptr_t my, int width)                     \
 {                                                                                                             \
-    const ptrdiff_t stride = FFALIGN(width + 7, 8);                                                           \
-    ff_hevc_qpel_h_ ## width ## _ ## depth ## _ ## cf_h(mcbuffer, 2 * stride, src - 3 * srcstride, srcstride, \
-                                                        height + 7, mx, my, mcbuffer);                        \
-    ff_hevc_qpel_hv_ ## width ## _ ## cf_hv(dst, dststride, mcbuffer + 3 * stride, 2 * stride,                \
-                                            height, mx, my, mcbuffer);                                        \
+    ff_hevc_put_hevc_uni_##name##width2##_10_##opt1(dst, dststride, src, _srcstride,                          \
+                                                      height, mx, my, width); /* opt1 part: width2 wide */    \
+    ff_hevc_put_hevc_uni_##name##width3##_10_##opt2(dst+width4, dststride, src+width4, _srcstride,            \
+                                                      height, mx, my, width); /* opt2 part: width3 wide */    \
 }
-#else
-#define QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
-#endif /* ARCH_X86_64 && HAVE_AVX_EXTERNAL */
-
-#define QPEL_FUNCS(width, depth, cf_h, cf_v, cf_hv)                                                           \
-void ff_hevc_qpel_h_ ## width ## _ ## depth ## _ ## cf_h(int16_t *dst, ptrdiff_t dststride,                   \
-                                                         uint8_t *src, ptrdiff_t srcstride,                   \
-                                                         int height, int mx, int my, int16_t *mcbuffer);      \
-void ff_hevc_qpel_v_ ## width ## _ ## depth ## _ ## cf_v(int16_t *dst, ptrdiff_t dststride,                   \
-                                                         uint8_t *src, ptrdiff_t srcstride,                   \
-                                                         int height, int mx, int my, int16_t *mcbuffer);      \
-QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
-
-QPEL_FUNCS(4,  8, ssse3, ssse3, avx)
-QPEL_FUNCS(8,  8, ssse3, ssse3, avx)
-QPEL_FUNCS(12, 8, ssse3, ssse3, avx)
-QPEL_FUNCS(16, 8, ssse3, ssse3, avx)
-QPEL_FUNCS(24, 8, ssse3, ssse3, avx)
-QPEL_FUNCS(32, 8, ssse3, ssse3, avx)
-QPEL_FUNCS(48, 8, ssse3, ssse3, avx)
-QPEL_FUNCS(64, 8, ssse3, ssse3, avx)
-
-QPEL_FUNCS(4,  10, avx, avx, avx)
-QPEL_FUNCS(8,  10, avx, avx, avx)
-QPEL_FUNCS(12, 10, avx, avx, avx)
-QPEL_FUNCS(16, 10, avx, avx, avx)
-QPEL_FUNCS(24, 10, avx, avx, avx)
-QPEL_FUNCS(32, 10, avx, avx, avx)
-QPEL_FUNCS(48, 10, avx, avx, avx)
-QPEL_FUNCS(64, 10, avx, avx, avx)
-
-#if ARCH_X86_64 && HAVE_AVX_EXTERNAL
-#define EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)                                                         \
-static void hevc_epel_hv_ ## width ## _ ## depth ## _ ## cf_hv(int16_t *dst, ptrdiff_t dststride,             \
-                                                               uint8_t *src, ptrdiff_t srcstride,             \
-                                                               int height, int mx, int my, int16_t *mcbuffer) \
+
+#define mc_rep_mixs_10(name, width1, width2, width3, opt1, opt2, width4) /* plain, bi and uni 10-bit mixes */ \
+mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)            \
+mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)         \
+mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)
+
+#define mc_rep_mix_8(name, width1, width2, width3, opt1, opt2)                                                \
+void ff_hevc_put_hevc_##name##width1##_8_##opt1(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride,             \
+                                                int height, intptr_t mx, intptr_t my, int width)              \
+/* width1 = width2 (opt1) + width3 (opt2); 8-bit, so pixel and byte offsets coincide */                       \
 {                                                                                                             \
-    const ptrdiff_t stride = FFALIGN(width + 3, 8);                                                           \
-    ff_hevc_epel_h_ ## width ## _ ## depth ## _ ## cf_h(mcbuffer, 2 * stride, src - srcstride, srcstride,     \
-                                                        height + 3, mx, my, mcbuffer);                        \
-    ff_hevc_epel_hv_ ## width ## _ ## cf_hv(dst, dststride, mcbuffer + stride, 2 * stride,                    \
-                                            height, mx, my, mcbuffer);                                        \
+    ff_hevc_put_hevc_##name##width2##_8_##opt1(dst, src, _srcstride, height, mx, my, width);                  \
+    ff_hevc_put_hevc_##name##width3##_8_##opt2(dst+ width2, src+ width2, _srcstride, height, mx, my, width);  \
 }
-#else
-#define EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
-#endif /* ARCH_X86_64 && HAVE_AVX_EXTERNAL */
-
-#define EPEL_FUNCS(width, depth, cf_h, cf_v, cf_hv)                                                           \
-void ff_hevc_epel_h_ ## width ## _ ## depth ## _ ## cf_h(int16_t *dst, ptrdiff_t dststride,                   \
-                                                         uint8_t *src, ptrdiff_t srcstride,                   \
-                                                         int height, int mx, int my, int16_t *mcbuffer);      \
-void ff_hevc_epel_v_ ## width ## _ ## depth ## _ ## cf_v(int16_t *dst, ptrdiff_t dststride,                   \
-                                                         uint8_t *src, ptrdiff_t srcstride,                   \
-                                                         int height, int mx, int my, int16_t *mcbuffer);      \
-EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
-
-EPEL_FUNCS(4,  8, ssse3, ssse3, avx)
-EPEL_FUNCS(8,  8, ssse3, ssse3, avx)
-EPEL_FUNCS(12, 8, ssse3, ssse3, avx)
-EPEL_FUNCS(16, 8, ssse3, ssse3, avx)
-EPEL_FUNCS(24, 8, ssse3, ssse3, avx)
-EPEL_FUNCS(32, 8, ssse3, ssse3, avx)
-
-EPEL_FUNCS(4,  10, avx, avx, avx)
-EPEL_FUNCS(8,  10, avx, avx, avx)
-EPEL_FUNCS(12, 10, avx, avx, avx)
-EPEL_FUNCS(16, 10, avx, avx, avx)
-EPEL_FUNCS(24, 10, avx, avx, avx)
-EPEL_FUNCS(32, 10, avx, avx, avx)
-
-#define PUT_PRED(width, depth, cf_uw, cf_w) \
-void ff_hevc_put_unweighted_pred_ ## width ## _ ## depth ## _ ## cf_uw(uint8_t *dst, ptrdiff_t dststride,                   \
-                                                                       int16_t *src, ptrdiff_t srcstride,                   \
-                                                                       int height);                                         \
-void ff_hevc_put_unweighted_pred_avg_ ## width ## _ ## depth ## _ ## cf_uw(uint8_t *dst, ptrdiff_t dststride,               \
-                                                                           int16_t *src1, int16_t *src2,                    \
-                                                                           ptrdiff_t srcstride, int height);                \
-void ff_hevc_put_weighted_pred_ ## width ## _ ## depth ## _ ## cf_w(uint8_t denom, int16_t weight, int16_t offset,          \
-                                                                    uint8_t *dst, ptrdiff_t dststride,                      \
-                                                                    int16_t *src, ptrdiff_t srcstride,                      \
-                                                                    int height);                                            \
-void ff_hevc_put_weighted_pred_avg_ ## width ## _ ## depth ## _ ## cf_w(uint8_t denom, int16_t weight0, int16_t weight1,    \
-                                                                        int16_t offset0, int16_t offset1,                   \
-                                                                        uint8_t *dst, ptrdiff_t dststride,                  \
-                                                                        int16_t *src0, int16_t *src1, ptrdiff_t srcstride,  \
-                                                                        int height);
-
-PUT_PRED(4,  8, sse2, sse4)
-PUT_PRED(8,  8, sse2, sse4)
-PUT_PRED(12, 8, sse2, sse4)
-PUT_PRED(16, 8, sse2, sse4)
-PUT_PRED(24, 8, sse2, sse4)
-PUT_PRED(32, 8, sse2, sse4)
-PUT_PRED(48, 8, sse2, sse4)
-PUT_PRED(64, 8, sse2, sse4)
-
-PUT_PRED(4,  10, sse2, sse4)
-PUT_PRED(8,  10, sse2, sse4)
-PUT_PRED(12, 10, sse2, sse4)
-PUT_PRED(16, 10, sse2, sse4)
-PUT_PRED(24, 10, sse2, sse4)
-PUT_PRED(32, 10, sse2, sse4)
-PUT_PRED(48, 10, sse2, sse4)
-PUT_PRED(64, 10, sse2, sse4)
+
+#define mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2) /* bi form of mc_rep_mix_8 */               \
+void ff_hevc_put_hevc_bi_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,           \
+                                                   ptrdiff_t _srcstride, int16_t* src2,                       \
+                                                   int height, intptr_t mx, intptr_t my, int width)           \
+{ /* opt1 part first, then the opt2 part at column width2 */                                                  \
+    ff_hevc_put_hevc_bi_##name##width2##_8_##opt1(dst, dststride, src, _srcstride,                            \
+                                                  src2, height, mx, my, width);                               \
+    ff_hevc_put_hevc_bi_##name##width3##_8_##opt2(dst+width2, dststride, src+width2, _srcstride,              \
+                                                  src2+width2, height, mx, my, width);                        \
+}
+
+#define mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2) /* uni form of mc_rep_mix_8 */             \
+void ff_hevc_put_hevc_uni_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride,                        \
+                                                    uint8_t *src, ptrdiff_t _srcstride, int height,           \
+                                                    intptr_t mx, intptr_t my, int width)                      \
+{ /* opt1 part first, then the opt2 part at column width2 */                                                  \
+    ff_hevc_put_hevc_uni_##name##width2##_8_##opt1(dst, dststride, src, _srcstride,                           \
+                                                   height, mx, my, width);                                    \
+    ff_hevc_put_hevc_uni_##name##width3##_8_##opt2(dst+width2, dststride, src+width2, _srcstride,             \
+                                                   height, mx, my, width);                                    \
+}
+
+#define mc_rep_mixs_8(name, width1, width2, width3, opt1, opt2) /* plain, bi and uni 8-bit mixes */ \
+mc_rep_mix_8(name, width1, width2, width3, opt1, opt2)            \
+mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2)         \
+mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2)
+
+#if HAVE_AVX2_EXTERNAL
+
+mc_rep_mixs_8(pel_pixels, 48, 32, 16, avx2, sse4) /* 8-bit, 48 wide = 32 (avx2) + 16 (sse4) */
+mc_rep_mixs_8(epel_hv,    48, 32, 16, avx2, sse4)
+mc_rep_mixs_8(epel_h ,    48, 32, 16, avx2, sse4)
+mc_rep_mixs_8(epel_v ,    48, 32, 16, avx2, sse4)
+
+mc_rep_mix_10(pel_pixels, 24, 16, 8, avx2, sse4, 32) /* 10-bit, 24 wide = 16 (avx2) + 8 (sse4) at byte 32 */
+mc_bi_rep_mix_10(pel_pixels,24, 16, 8, avx2, sse4, 32) /* plain and bi only: no uni mix is generated for pel_pixels here */
+mc_rep_mixs_10(epel_hv,   24, 16, 8, avx2, sse4, 32)
+mc_rep_mixs_10(epel_h ,   24, 16, 8, avx2, sse4, 32)
+mc_rep_mixs_10(epel_v ,   24, 16, 8, avx2, sse4, 32)
+
+
+mc_rep_mixs_10(qpel_h ,   24, 16, 8, avx2, sse4, 32)
+mc_rep_mixs_10(qpel_v ,   24, 16, 8, avx2, sse4, 32)
+mc_rep_mixs_10(qpel_hv,   24, 16, 8, avx2, sse4, 32)
+
+
+mc_rep_uni_func(pel_pixels, 8, 64, 128, avx2)// used for 10bit; NOTE(review): presumably 128 bytes = 64 two-byte pixels -- confirm
+mc_rep_uni_func(pel_pixels, 8, 32, 96, avx2) // used for 10bit; NOTE(review): presumably 96 bytes = 48 two-byte pixels -- confirm
+
+mc_rep_funcs(pel_pixels, 8, 32, 64, avx2) /* arguments: name, bitd, step, W, opt */
+
+mc_rep_func(pel_pixels, 10, 16, 32, avx2)
+mc_rep_func(pel_pixels, 10, 16, 48, avx2)
+mc_rep_func(pel_pixels, 10, 32, 64, avx2)
+
+mc_rep_bi_func(pel_pixels, 10, 16, 32, avx2)
+mc_rep_bi_func(pel_pixels, 10, 16, 48, avx2)
+mc_rep_bi_func(pel_pixels, 10, 32, 64, avx2)
+
+mc_rep_funcs(epel_h, 8, 32, 64, avx2)
+
+mc_rep_funcs(epel_v, 8, 32, 64, avx2)
+
+mc_rep_funcs(epel_h, 10, 16, 32, avx2)
+mc_rep_funcs(epel_h, 10, 16, 48, avx2)
+mc_rep_funcs(epel_h, 10, 32, 64, avx2)
+
+mc_rep_funcs(epel_v, 10, 16, 32, avx2)
+mc_rep_funcs(epel_v, 10, 16, 48, avx2)
+mc_rep_funcs(epel_v, 10, 32, 64, avx2)
+
+
+mc_rep_funcs(epel_hv,  8, 32, 64, avx2)
+
+mc_rep_funcs(epel_hv, 10, 16, 32, avx2)
+mc_rep_funcs(epel_hv, 10, 16, 48, avx2)
+mc_rep_funcs(epel_hv, 10, 32, 64, avx2)
+
+mc_rep_funcs(qpel_h, 8, 32, 64, avx2)
+mc_rep_mixs_8(qpel_h ,  48, 32, 16, avx2, sse4)
+
+mc_rep_funcs(qpel_v, 8, 32, 64, avx2)
+mc_rep_mixs_8(qpel_v,  48, 32, 16, avx2, sse4)
+
+mc_rep_funcs(qpel_h, 10, 16, 32, avx2)
+mc_rep_funcs(qpel_h, 10, 16, 48, avx2)
+mc_rep_funcs(qpel_h, 10, 32, 64, avx2)
+
+mc_rep_funcs(qpel_v, 10, 16, 32, avx2)
+mc_rep_funcs(qpel_v, 10, 16, 48, avx2)
+mc_rep_funcs(qpel_v, 10, 32, 64, avx2)
+
+mc_rep_funcs(qpel_hv, 10, 16, 32, avx2) /* qpel_hv gets avx2 wrappers for 10 bit only in this list */
+mc_rep_funcs(qpel_hv, 10, 16, 48, avx2)
+mc_rep_funcs(qpel_hv, 10, 32, 64, avx2)
+
+#endif /* HAVE_AVX2_EXTERNAL */
+
+mc_rep_funcs(pel_pixels, 8, 16, 64, sse4) /* arguments: name, bitd, step, W, opt */
+mc_rep_funcs(pel_pixels, 8, 16, 48, sse4)
+mc_rep_funcs(pel_pixels, 8, 16, 32, sse4)
+mc_rep_funcs(pel_pixels, 8,  8, 24, sse4)
+mc_rep_funcs(pel_pixels,10,  8, 64, sse4) /* 10- and 12-bit widths are built from 8-wide steps ... */
+mc_rep_funcs(pel_pixels,10,  8, 48, sse4)
+mc_rep_funcs(pel_pixels,10,  8, 32, sse4)
+mc_rep_funcs(pel_pixels,10,  8, 24, sse4)
+mc_rep_funcs(pel_pixels,10,  8, 16, sse4)
+mc_rep_funcs(pel_pixels,10,  4, 12, sse4) /* ... except 12 wide, built from three 4-wide steps */
+mc_rep_funcs(pel_pixels,12,  8, 64, sse4)
+mc_rep_funcs(pel_pixels,12,  8, 48, sse4)
+mc_rep_funcs(pel_pixels,12,  8, 32, sse4)
+mc_rep_funcs(pel_pixels,12,  8, 24, sse4)
+mc_rep_funcs(pel_pixels,12,  8, 16, sse4)
+mc_rep_funcs(pel_pixels,12,  4, 12, sse4)
+
+mc_rep_funcs(epel_h, 8, 16, 64, sse4) /* arguments: name, bitd, step, W, opt */
+mc_rep_funcs(epel_h, 8, 16, 48, sse4)
+mc_rep_funcs(epel_h, 8, 16, 32, sse4)
+mc_rep_funcs(epel_h, 8,  8, 24, sse4)
+mc_rep_funcs(epel_h,10,  8, 64, sse4)
+mc_rep_funcs(epel_h,10,  8, 48, sse4)
+mc_rep_funcs(epel_h,10,  8, 32, sse4)
+mc_rep_funcs(epel_h,10,  8, 24, sse4)
+mc_rep_funcs(epel_h,10,  8, 16, sse4)
+mc_rep_funcs(epel_h,10,  4, 12, sse4)
+mc_rep_funcs(epel_h,12,  8, 64, sse4)
+mc_rep_funcs(epel_h,12,  8, 48, sse4)
+mc_rep_funcs(epel_h,12,  8, 32, sse4)
+mc_rep_funcs(epel_h,12,  8, 24, sse4)
+mc_rep_funcs(epel_h,12,  8, 16, sse4)
+mc_rep_funcs(epel_h,12,  4, 12, sse4)
+mc_rep_funcs(epel_v, 8, 16, 64, sse4)
+mc_rep_funcs(epel_v, 8, 16, 48, sse4)
+mc_rep_funcs(epel_v, 8, 16, 32, sse4)
+mc_rep_funcs(epel_v, 8,  8, 24, sse4)
+mc_rep_funcs(epel_v,10,  8, 64, sse4)
+mc_rep_funcs(epel_v,10,  8, 48, sse4)
+mc_rep_funcs(epel_v,10,  8, 32, sse4)
+mc_rep_funcs(epel_v,10,  8, 24, sse4)
+mc_rep_funcs(epel_v,10,  8, 16, sse4)
+mc_rep_funcs(epel_v,10,  4, 12, sse4)
+mc_rep_funcs(epel_v,12,  8, 64, sse4)
+mc_rep_funcs(epel_v,12,  8, 48, sse4)
+mc_rep_funcs(epel_v,12,  8, 32, sse4)
+mc_rep_funcs(epel_v,12,  8, 24, sse4)
+mc_rep_funcs(epel_v,12,  8, 16, sse4)
+mc_rep_funcs(epel_v,12,  4, 12, sse4)
+mc_rep_funcs(epel_hv, 8, 16, 64, sse4)
+mc_rep_funcs(epel_hv, 8, 16, 48, sse4)
+mc_rep_funcs(epel_hv, 8, 16, 32, sse4)
+mc_rep_funcs(epel_hv, 8,  8, 24, sse4)
+mc_rep_funcs2(epel_hv,8,  8,  4, 12, sse4) /* 8-bit, 12 wide = 8-wide part + 4-wide part */
+mc_rep_funcs(epel_hv,10,  8, 64, sse4)
+mc_rep_funcs(epel_hv,10,  8, 48, sse4)
+mc_rep_funcs(epel_hv,10,  8, 32, sse4)
+mc_rep_funcs(epel_hv,10,  8, 24, sse4)
+mc_rep_funcs(epel_hv,10,  8, 16, sse4)
+mc_rep_funcs(epel_hv,10,  4, 12, sse4)
+mc_rep_funcs(epel_hv,12,  8, 64, sse4)
+mc_rep_funcs(epel_hv,12,  8, 48, sse4)
+mc_rep_funcs(epel_hv,12,  8, 32, sse4)
+mc_rep_funcs(epel_hv,12,  8, 24, sse4)
+mc_rep_funcs(epel_hv,12,  8, 16, sse4)
+mc_rep_funcs(epel_hv,12,  4, 12, sse4)
+
+mc_rep_funcs(qpel_h, 8, 16, 64, sse4) /* arguments: name, bitd, step, W, opt */
+mc_rep_funcs(qpel_h, 8, 16, 48, sse4)
+mc_rep_funcs(qpel_h, 8, 16, 32, sse4)
+mc_rep_funcs(qpel_h, 8,  8, 24, sse4)
+mc_rep_funcs(qpel_h,10,  8, 64, sse4)
+mc_rep_funcs(qpel_h,10,  8, 48, sse4)
+mc_rep_funcs(qpel_h,10,  8, 32, sse4)
+mc_rep_funcs(qpel_h,10,  8, 24, sse4)
+mc_rep_funcs(qpel_h,10,  8, 16, sse4)
+mc_rep_funcs(qpel_h,10,  4, 12, sse4)
+mc_rep_funcs(qpel_h,12,  8, 64, sse4)
+mc_rep_funcs(qpel_h,12,  8, 48, sse4)
+mc_rep_funcs(qpel_h,12,  8, 32, sse4)
+mc_rep_funcs(qpel_h,12,  8, 24, sse4)
+mc_rep_funcs(qpel_h,12,  8, 16, sse4)
+mc_rep_funcs(qpel_h,12,  4, 12, sse4)
+mc_rep_funcs(qpel_v, 8, 16, 64, sse4)
+mc_rep_funcs(qpel_v, 8, 16, 48, sse4)
+mc_rep_funcs(qpel_v, 8, 16, 32, sse4)
+mc_rep_funcs(qpel_v, 8,  8, 24, sse4)
+mc_rep_funcs(qpel_v,10,  8, 64, sse4)
+mc_rep_funcs(qpel_v,10,  8, 48, sse4)
+mc_rep_funcs(qpel_v,10,  8, 32, sse4)
+mc_rep_funcs(qpel_v,10,  8, 24, sse4)
+mc_rep_funcs(qpel_v,10,  8, 16, sse4)
+mc_rep_funcs(qpel_v,10,  4, 12, sse4)
+mc_rep_funcs(qpel_v,12,  8, 64, sse4)
+mc_rep_funcs(qpel_v,12,  8, 48, sse4)
+mc_rep_funcs(qpel_v,12,  8, 32, sse4)
+mc_rep_funcs(qpel_v,12,  8, 24, sse4)
+mc_rep_funcs(qpel_v,12,  8, 16, sse4)
+mc_rep_funcs(qpel_v,12,  4, 12, sse4)
+mc_rep_funcs(qpel_hv, 8,  8, 64, sse4) /* unlike qpel_h/qpel_v, 8-bit qpel_hv uses 8-wide steps at every width */
+mc_rep_funcs(qpel_hv, 8,  8, 48, sse4)
+mc_rep_funcs(qpel_hv, 8,  8, 32, sse4)
+mc_rep_funcs(qpel_hv, 8,  8, 24, sse4)
+mc_rep_funcs(qpel_hv, 8,  8, 16, sse4)
+mc_rep_funcs2(qpel_hv,8,  8,  4, 12, sse4) /* 8-bit, 12 wide = 8-wide part + 4-wide part */
+mc_rep_funcs(qpel_hv,10,  8, 64, sse4)
+mc_rep_funcs(qpel_hv,10,  8, 48, sse4)
+mc_rep_funcs(qpel_hv,10,  8, 32, sse4)
+mc_rep_funcs(qpel_hv,10,  8, 24, sse4)
+mc_rep_funcs(qpel_hv,10,  8, 16, sse4)
+mc_rep_funcs(qpel_hv,10,  4, 12, sse4)
+mc_rep_funcs(qpel_hv,12,  8, 64, sse4)
+mc_rep_funcs(qpel_hv,12,  8, 48, sse4)
+mc_rep_funcs(qpel_hv,12,  8, 32, sse4)
+mc_rep_funcs(qpel_hv,12,  8, 24, sse4)
+mc_rep_funcs(qpel_hv,12,  8, 16, sse4)
+mc_rep_funcs(qpel_hv,12,  4, 12, sse4)
+
+#define mc_rep_uni_w(bitd, step, W, opt) /* defines the W-wide uni_w function by repeating the step-wide version */ \
+void ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, \
+                                               int height, int denom,  int _wx, int _ox)                                \
+{                                                                                                                       \
+    int i;                                                                                                              \
+    int16_t *src;                                                                                                       \
+    uint8_t *dst;                                                                                                       \
+    for (i = 0; i < W; i += step) { /* one step-wide column strip per iteration */                                      \
+        src= _src + i; /* int16_t element offset */                                                                     \
+        dst= _dst + (i * ((bitd + 7) / 8)); /* byte offset: (bitd + 7) / 8 bytes per sample */                          \
+        ff_hevc_put_hevc_uni_w##step##_##bitd##_##opt(dst, dststride, src,                                   \
+                                                     height, denom, _wx, _ox);                                          \
+    }                                                                                                                   \
+}
+
+mc_rep_uni_w(8, 6, 12, sse4) /* 8-bit: width 12 is two 6-wide calls; widths 16..64 repeat the 8-wide kernel */
+mc_rep_uni_w(8, 8, 16, sse4)
+mc_rep_uni_w(8, 8, 24, sse4)
+mc_rep_uni_w(8, 8, 32, sse4)
+mc_rep_uni_w(8, 8, 48, sse4)
+mc_rep_uni_w(8, 8, 64, sse4)
+
+mc_rep_uni_w(10, 6, 12, sse4) /* 10-bit: same step/width table as 8-bit */
+mc_rep_uni_w(10, 8, 16, sse4)
+mc_rep_uni_w(10, 8, 24, sse4)
+mc_rep_uni_w(10, 8, 32, sse4)
+mc_rep_uni_w(10, 8, 48, sse4)
+mc_rep_uni_w(10, 8, 64, sse4)
+
+mc_rep_uni_w(12, 6, 12, sse4) /* 12-bit: same step/width table as 8-bit */
+mc_rep_uni_w(12, 8, 16, sse4)
+mc_rep_uni_w(12, 8, 24, sse4)
+mc_rep_uni_w(12, 8, 32, sse4)
+mc_rep_uni_w(12, 8, 48, sse4)
+mc_rep_uni_w(12, 8, 64, sse4)
+
+#define mc_rep_bi_w(bitd, step, W, opt)                                                                                 \
+void ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src,                       \
+                                              int16_t *_src2, int height,                                               \
+                                              int denom,  int _wx0,  int _wx1, int _ox0, int _ox1)                      \
+{                                                                                                                       \
+    /* Covers a W-sample-wide block by calling the step-wide bi_w kernel once per strip. */                             \
+    /* Both int16_t sources advance in samples; _dst advances in bytes. */                                              \
+    const int sample_bytes = (bitd + 7) / 8; /* bytes per destination sample: 1 for 8-bit, 2 for 10/12-bit */           \
+    int col;                                 /* offset of the current strip, in samples */                              \
+                                                                                                                        \
+    for (col = 0; col < W; col += step) {                                                                               \
+        ff_hevc_put_hevc_bi_w##step##_##bitd##_##opt(_dst + col * sample_bytes, dststride,                              \
+                                                     _src + col, _src2 + col,                                           \
+                                                     height, denom,                                                     \
+                                                     _wx0, _wx1, _ox0, _ox1);                                           \
+    }                                                                                                                   \
+}
+
+mc_rep_bi_w(8, 6, 12, sse4) /* 8-bit: width 12 is two 6-wide calls; widths 16..64 repeat the 8-wide kernel */
+mc_rep_bi_w(8, 8, 16, sse4)
+mc_rep_bi_w(8, 8, 24, sse4)
+mc_rep_bi_w(8, 8, 32, sse4)
+mc_rep_bi_w(8, 8, 48, sse4)
+mc_rep_bi_w(8, 8, 64, sse4)
+
+mc_rep_bi_w(10, 6, 12, sse4) /* 10-bit: same step/width table as 8-bit */
+mc_rep_bi_w(10, 8, 16, sse4)
+mc_rep_bi_w(10, 8, 24, sse4)
+mc_rep_bi_w(10, 8, 32, sse4)
+mc_rep_bi_w(10, 8, 48, sse4)
+mc_rep_bi_w(10, 8, 64, sse4)
+
+mc_rep_bi_w(12, 6, 12, sse4) /* 12-bit: same step/width table as 8-bit */
+mc_rep_bi_w(12, 8, 16, sse4)
+mc_rep_bi_w(12, 8, 24, sse4)
+mc_rep_bi_w(12, 8, 32, sse4)
+mc_rep_bi_w(12, 8, 48, sse4)
+mc_rep_bi_w(12, 8, 64, sse4)
+
+#define mc_uni_w_func(name, bitd, W, opt) /* emits ff_hevc_put_hevc_uni_w_<name><W>_<bitd>_<opt> */ \
+void ff_hevc_put_hevc_uni_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride,         \
+                                                      uint8_t *_src, ptrdiff_t _srcstride,          \
+                                                      int height, int denom,                        \
+                                                      int _wx, int _ox,                             \
+                                                      intptr_t mx, intptr_t my, int width)          \
+{   /* temp is passed first to the <name><W> kernel, then to uni_w<W> as its source for _dst */     \
+    LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]); /* NOTE(review): sizing rationale not visible here - confirm */ \
+    ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width);     \
+    ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(_dst, _dststride, temp, height, denom, _wx, _ox);\
+}
+
+#define mc_uni_w_funcs(name, bitd, opt) /* mc_uni_w_func for widths 4, 8, 12, 16, 24, 32, 48, 64 (6 is not included) */ \
+        mc_uni_w_func(name, bitd, 4, opt)    \
+        mc_uni_w_func(name, bitd, 8, opt)    \
+        mc_uni_w_func(name, bitd, 12, opt)   \
+        mc_uni_w_func(name, bitd, 16, opt)   \
+        mc_uni_w_func(name, bitd, 24, opt)   \
+        mc_uni_w_func(name, bitd, 32, opt)   \
+        mc_uni_w_func(name, bitd, 48, opt)   \
+        mc_uni_w_func(name, bitd, 64, opt)
+
+mc_uni_w_funcs(pel_pixels, 8, sse4) /* 8-bit; the extra 6-wide variant is added only for pel_pixels and epel_*, never for qpel_* */
+mc_uni_w_func(pel_pixels, 8, 6, sse4)
+mc_uni_w_funcs(epel_h, 8, sse4)
+mc_uni_w_func(epel_h, 8, 6, sse4)
+mc_uni_w_funcs(epel_v, 8, sse4)
+mc_uni_w_func(epel_v, 8, 6, sse4)
+mc_uni_w_funcs(epel_hv, 8, sse4)
+mc_uni_w_func(epel_hv, 8, 6, sse4)
+mc_uni_w_funcs(qpel_h, 8, sse4)
+mc_uni_w_funcs(qpel_v, 8, sse4)
+mc_uni_w_funcs(qpel_hv, 8, sse4)
+
+mc_uni_w_funcs(pel_pixels, 10, sse4) /* 10-bit: same set of kernels as 8-bit */
+mc_uni_w_func(pel_pixels, 10, 6, sse4)
+mc_uni_w_funcs(epel_h, 10, sse4)
+mc_uni_w_func(epel_h, 10, 6, sse4)
+mc_uni_w_funcs(epel_v, 10, sse4)
+mc_uni_w_func(epel_v, 10, 6, sse4)
+mc_uni_w_funcs(epel_hv, 10, sse4)
+mc_uni_w_func(epel_hv, 10, 6, sse4)
+mc_uni_w_funcs(qpel_h, 10, sse4)
+mc_uni_w_funcs(qpel_v, 10, sse4)
+mc_uni_w_funcs(qpel_hv, 10, sse4)
+
+mc_uni_w_funcs(pel_pixels, 12, sse4) /* 12-bit: same set of kernels as 8-bit */
+mc_uni_w_func(pel_pixels, 12, 6, sse4)
+mc_uni_w_funcs(epel_h, 12, sse4)
+mc_uni_w_func(epel_h, 12, 6, sse4)
+mc_uni_w_funcs(epel_v, 12, sse4)
+mc_uni_w_func(epel_v, 12, 6, sse4)
+mc_uni_w_funcs(epel_hv, 12, sse4)
+mc_uni_w_func(epel_hv, 12, 6, sse4)
+mc_uni_w_funcs(qpel_h, 12, sse4)
+mc_uni_w_funcs(qpel_v, 12, sse4)
+mc_uni_w_funcs(qpel_hv, 12, sse4)
+
+#define mc_bi_w_func(name, bitd, W, opt) /* emits ff_hevc_put_hevc_bi_w_<name><W>_<bitd>_<opt> */ \
+void ff_hevc_put_hevc_bi_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride,           \
+                                                     uint8_t *_src, ptrdiff_t _srcstride,            \
+                                                     int16_t *_src2,                                 \
+                                                     int height, int denom,                          \
+                                                     int _wx0, int _wx1, int _ox0, int _ox1,         \
+                                                     intptr_t mx, intptr_t my, int width)            \
+{   /* temp is passed first to the <name><W> kernel, then to bi_w<W> together with _src2 */          \
+    LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]); /* NOTE(review): sizing rationale not visible here - confirm */ \
+    ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width);      \
+    ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(_dst, _dststride, temp, _src2,                         \
+                                              height, denom, _wx0, _wx1, _ox0, _ox1);                \
+}
+
+#define mc_bi_w_funcs(name, bitd, opt) /* mc_bi_w_func for widths 4, 8, 12, 16, 24, 32, 48, 64 (6 is not included) */ \
+        mc_bi_w_func(name, bitd, 4, opt)    \
+        mc_bi_w_func(name, bitd, 8, opt)    \
+        mc_bi_w_func(name, bitd, 12, opt)   \
+        mc_bi_w_func(name, bitd, 16, opt)   \
+        mc_bi_w_func(name, bitd, 24, opt)   \
+        mc_bi_w_func(name, bitd, 32, opt)   \
+        mc_bi_w_func(name, bitd, 48, opt)   \
+        mc_bi_w_func(name, bitd, 64, opt)
+
+mc_bi_w_funcs(pel_pixels, 8, sse4) /* 8-bit; the extra 6-wide variant is added only for pel_pixels and epel_*, never for qpel_* */
+mc_bi_w_func(pel_pixels, 8, 6, sse4)
+mc_bi_w_funcs(epel_h, 8, sse4)
+mc_bi_w_func(epel_h, 8, 6, sse4)
+mc_bi_w_funcs(epel_v, 8, sse4)
+mc_bi_w_func(epel_v, 8, 6, sse4)
+mc_bi_w_funcs(epel_hv, 8, sse4)
+mc_bi_w_func(epel_hv, 8, 6, sse4)
+mc_bi_w_funcs(qpel_h, 8, sse4)
+mc_bi_w_funcs(qpel_v, 8, sse4)
+mc_bi_w_funcs(qpel_hv, 8, sse4)
+
+mc_bi_w_funcs(pel_pixels, 10, sse4) /* 10-bit: same set of kernels as 8-bit */
+mc_bi_w_func(pel_pixels, 10, 6, sse4)
+mc_bi_w_funcs(epel_h, 10, sse4)
+mc_bi_w_func(epel_h, 10, 6, sse4)
+mc_bi_w_funcs(epel_v, 10, sse4)
+mc_bi_w_func(epel_v, 10, 6, sse4)
+mc_bi_w_funcs(epel_hv, 10, sse4)
+mc_bi_w_func(epel_hv, 10, 6, sse4)
+mc_bi_w_funcs(qpel_h, 10, sse4)
+mc_bi_w_funcs(qpel_v, 10, sse4)
+mc_bi_w_funcs(qpel_hv, 10, sse4)
+
+mc_bi_w_funcs(pel_pixels, 12, sse4) /* 12-bit: same set of kernels as 8-bit */
+mc_bi_w_func(pel_pixels, 12, 6, sse4)
+mc_bi_w_funcs(epel_h, 12, sse4)
+mc_bi_w_func(epel_h, 12, 6, sse4)
+mc_bi_w_funcs(epel_v, 12, sse4)
+mc_bi_w_func(epel_v, 12, 6, sse4)
+mc_bi_w_funcs(epel_hv, 12, sse4)
+mc_bi_w_func(epel_hv, 12, 6, sse4)
+mc_bi_w_funcs(qpel_h, 12, sse4)
+mc_bi_w_funcs(qpel_v, 12, sse4)
+mc_bi_w_funcs(qpel_hv, 12, sse4)
+#endif //ARCH_X86_64 && HAVE_SSE4_EXTERNAL
+
+#define SAO_BAND_FILTER_FUNCS(bitd, opt) /* prototypes only: ff_hevc_sao_band_filter_{8,16,32,48,64}_<bitd>_<opt> */      \
+void ff_hevc_sao_band_filter_8_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,  \
+                                            int16_t *sao_offset_val, int sao_left_class, int width, int height);           \
+void ff_hevc_sao_band_filter_16_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
+                                            int16_t *sao_offset_val, int sao_left_class, int width, int height);           \
+void ff_hevc_sao_band_filter_32_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
+                                            int16_t *sao_offset_val, int sao_left_class, int width, int height);           \
+void ff_hevc_sao_band_filter_48_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
+                                            int16_t *sao_offset_val, int sao_left_class, int width, int height);           \
+void ff_hevc_sao_band_filter_64_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
+                                             int16_t *sao_offset_val, int sao_left_class, int width, int height);
+
+SAO_BAND_FILTER_FUNCS(8,  sse2) /* declares band filters for 8/10/12-bit in sse2, avx and avx2 flavours */
+SAO_BAND_FILTER_FUNCS(10, sse2)
+SAO_BAND_FILTER_FUNCS(12, sse2)
+SAO_BAND_FILTER_FUNCS(8,   avx)
+SAO_BAND_FILTER_FUNCS(10,  avx)
+SAO_BAND_FILTER_FUNCS(12,  avx)
+SAO_BAND_FILTER_FUNCS(8,  avx2)
+SAO_BAND_FILTER_FUNCS(10, avx2)
+SAO_BAND_FILTER_FUNCS(12, avx2)
+
+#define SAO_BAND_INIT(bitd, opt) do { /* sets c->sao_band_filter[0..4] to the 8/16/32/48/64 variants; needs a variable c in scope */ \
+    c->sao_band_filter[0]      = ff_hevc_sao_band_filter_8_##bitd##_##opt;  \
+    c->sao_band_filter[1]      = ff_hevc_sao_band_filter_16_##bitd##_##opt; \
+    c->sao_band_filter[2]      = ff_hevc_sao_band_filter_32_##bitd##_##opt; \
+    c->sao_band_filter[3]      = ff_hevc_sao_band_filter_48_##bitd##_##opt; \
+    c->sao_band_filter[4]      = ff_hevc_sao_band_filter_64_##bitd##_##opt; \
+} while (0)
+
+#define SAO_EDGE_FILTER_FUNCS(bitd, opt) /* prototypes only: ff_hevc_sao_edge_filter_{8,16,32,48,64}_<bitd>_<opt> */       \
+void ff_hevc_sao_edge_filter_8_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,  \
+                                              int eo, int width, int height);                                               \
+void ff_hevc_sao_edge_filter_16_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
+                                               int eo, int width, int height);                                              \
+void ff_hevc_sao_edge_filter_32_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
+                                               int eo, int width, int height);                                              \
+void ff_hevc_sao_edge_filter_48_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
+                                               int eo, int width, int height);                                              \
+void ff_hevc_sao_edge_filter_64_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
+                                               int eo, int width, int height); /* last line: no continuation, so the macro ends here */
+
+SAO_EDGE_FILTER_FUNCS(8, ssse3) /* 8-bit is declared for ssse3 and avx2; 10- and 12-bit for sse2 and avx2 */
+SAO_EDGE_FILTER_FUNCS(8, avx2)
+SAO_EDGE_FILTER_FUNCS(10, sse2)
+SAO_EDGE_FILTER_FUNCS(10, avx2)
+SAO_EDGE_FILTER_FUNCS(12, sse2)
+SAO_EDGE_FILTER_FUNCS(12, avx2)
+
+#define SAO_EDGE_INIT(bitd, opt) do { /* sets c->sao_edge_filter[0..4] to the 8/16/32/48/64 variants; needs a variable c in scope */ \
+    c->sao_edge_filter[0]      = ff_hevc_sao_edge_filter_8_##bitd##_##opt;  \
+    c->sao_edge_filter[1]      = ff_hevc_sao_edge_filter_16_##bitd##_##opt; \
+    c->sao_edge_filter[2]      = ff_hevc_sao_edge_filter_32_##bitd##_##opt; \
+    c->sao_edge_filter[3]      = ff_hevc_sao_edge_filter_48_##bitd##_##opt; \
+    c->sao_edge_filter[4]      = ff_hevc_sao_edge_filter_64_##bitd##_##opt; \
+} while (0)
+
+#define EPEL_LINKS(pointer, my, mx, fname, bitd, opt ) /* PEL_LINK with 2nd arg 1..9 for widths 4,6,8,12,16,24,32,48,64; presumably the table index - confirm */ \
+        PEL_LINK(pointer, 1, my , mx , fname##4 ,  bitd, opt ); \
+        PEL_LINK(pointer, 2, my , mx , fname##6 ,  bitd, opt ); \
+        PEL_LINK(pointer, 3, my , mx , fname##8 ,  bitd, opt ); \
+        PEL_LINK(pointer, 4, my , mx , fname##12,  bitd, opt ); \
+        PEL_LINK(pointer, 5, my , mx , fname##16,  bitd, opt ); \
+        PEL_LINK(pointer, 6, my , mx , fname##24,  bitd, opt ); \
+        PEL_LINK(pointer, 7, my , mx , fname##32,  bitd, opt ); \
+        PEL_LINK(pointer, 8, my , mx , fname##48,  bitd, opt ); \
+        PEL_LINK(pointer, 9, my , mx , fname##64,  bitd, opt ) /* no ';' here: the call site supplies it */
+#define QPEL_LINKS(pointer, my, mx, fname, bitd, opt) /* like EPEL_LINKS but without entry 2 (width 6) */ \
+        PEL_LINK(pointer, 1, my , mx , fname##4 ,  bitd, opt ); \
+        PEL_LINK(pointer, 3, my , mx , fname##8 ,  bitd, opt ); \
+        PEL_LINK(pointer, 4, my , mx , fname##12,  bitd, opt ); \
+        PEL_LINK(pointer, 5, my , mx , fname##16,  bitd, opt ); \
+        PEL_LINK(pointer, 6, my , mx , fname##24,  bitd, opt ); \
+        PEL_LINK(pointer, 7, my , mx , fname##32,  bitd, opt ); \
+        PEL_LINK(pointer, 8, my , mx , fname##48,  bitd, opt ); \
+        PEL_LINK(pointer, 9, my , mx , fname##64,  bitd, opt ) /* no ';' here: the call site supplies it */
 
 void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
 {
     int cpu_flags = av_get_cpu_flags();
 
-#define SET_LUMA_FUNCS(tabname, funcname, depth, cf)      \
-    c->tabname[0] = funcname ## _4_  ## depth ## _ ## cf; \
-    c->tabname[1] = funcname ## _8_  ## depth ## _ ## cf; \
-    c->tabname[2] = funcname ## _12_ ## depth ## _ ## cf; \
-    c->tabname[3] = funcname ## _16_ ## depth ## _ ## cf; \
-    c->tabname[4] = funcname ## _24_ ## depth ## _ ## cf; \
-    c->tabname[5] = funcname ## _32_ ## depth ## _ ## cf; \
-    c->tabname[6] = funcname ## _48_ ## depth ## _ ## cf; \
-    c->tabname[7] = funcname ## _64_ ## depth ## _ ## cf;
-
-#define SET_CHROMA_FUNCS(tabname, funcname, depth, cf)    \
-    c->tabname[1] = funcname ## _4_  ## depth ## _ ## cf; \
-    c->tabname[3] = funcname ## _8_  ## depth ## _ ## cf; \
-    c->tabname[4] = funcname ## _12_ ## depth ## _ ## cf; \
-    c->tabname[5] = funcname ## _16_ ## depth ## _ ## cf; \
-    c->tabname[6] = funcname ## _24_ ## depth ## _ ## cf; \
-    c->tabname[7] = funcname ## _32_ ## depth ## _ ## cf;
-
-#define SET_QPEL_FUNCS(v, h, depth, cf, name) SET_LUMA_FUNCS  (put_hevc_qpel[v][h], name, depth, cf)
-#define SET_EPEL_FUNCS(v, h, depth, cf, name) SET_CHROMA_FUNCS(put_hevc_epel[v][h], name, depth, cf)
-
     if (bit_depth == 8) {
+        if (EXTERNAL_MMXEXT(cpu_flags)) {
+            c->idct_dc[0] = ff_hevc_idct4x4_dc_8_mmxext;
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_8_mmxext;
+            c->transform_add[0]    =  ff_hevc_transform_add4_8_mmxext;
+        }
         if (EXTERNAL_SSE2(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
             c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
+            if (ARCH_X86_64) {
+                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_sse2;
+                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_sse2;
+
+            }
+            SAO_BAND_INIT(8, sse2);
 
-            SET_QPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
-            SET_EPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_8_sse2;
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_8_sse2;
+            c->idct_dc[3] = ff_hevc_idct32x32_dc_8_sse2;
 
-            SET_LUMA_FUNCS(put_unweighted_pred,              ff_hevc_put_unweighted_pred,     8, sse2);
-            SET_LUMA_FUNCS(put_unweighted_pred_avg,          ff_hevc_put_unweighted_pred_avg, 8, sse2);
-            SET_CHROMA_FUNCS(put_unweighted_pred_chroma,     ff_hevc_put_unweighted_pred,     8, sse2);
-            SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 8, sse2);
+            c->transform_add[1]    = ff_hevc_transform_add8_8_sse2;
+            c->transform_add[2]    = ff_hevc_transform_add16_8_sse2;
+            c->transform_add[3]    = ff_hevc_transform_add32_8_sse2;
         }
         if (EXTERNAL_SSSE3(cpu_flags)) {
-            SET_QPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_qpel_h);
-            SET_QPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_qpel_v);
-            SET_EPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_epel_h);
-            SET_EPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_epel_v);
+            if(ARCH_X86_64) {
+                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
+                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
+            }
+            SAO_EDGE_INIT(8, ssse3);
+        }
+        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
+
+            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels,  8, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,      8, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,      8, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,     8, sse4);
+
+            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     8, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     8, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    8, sse4);
+        }
+        if (EXTERNAL_AVX(cpu_flags)) {
+            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_avx;
+            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_avx;
+            if (ARCH_X86_64) {
+                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx;
+                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx;
+            }
+            SAO_BAND_INIT(8, avx);
+
+            c->transform_add[1]    = ff_hevc_transform_add8_8_avx;
+            c->transform_add[2]    = ff_hevc_transform_add16_8_avx;
+            c->transform_add[3]    = ff_hevc_transform_add32_8_avx;
+        }
+        if (EXTERNAL_AVX2(cpu_flags)) {
+            c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2;
+            c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_8_avx2;
+        }
+        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2;
+            c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2;
+            if (ARCH_X86_64) {
+                c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
+                c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
+                c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
+
+                c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
+                c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
+                c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
+
+                c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+                c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+                c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+
+                c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+                c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+                c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+
+                c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
+                c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
+                c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
+
+                c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
+                c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
+                c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
+
+                c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_avx2;
+                c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_8_avx2;
+                c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_8_avx2;
+
+                c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_avx2;
+                c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_8_avx2;
+                c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_8_avx2;
+
+                c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_avx2;
+                c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_avx2;
+                c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_avx2;
+
+                c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_avx2;
+                c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_8_avx2;
+                c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_8_avx2;
+
+                c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_avx2;
+                c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_8_avx2;
+                c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_8_avx2;
+
+                c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_avx2;
+                c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_8_avx2;
+                c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_8_avx2;
+
+                c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_avx2;
+                c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_8_avx2;
+                c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_8_avx2;
+
+                c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_avx2;
+                c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_8_avx2;
+                c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_8_avx2;
+
+                c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_avx2;
+                c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_8_avx2;
+                c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_8_avx2;
+
+                c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_avx2;
+                c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_avx2;
+                c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_avx2;
+
+                c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_avx2;
+                c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_avx2;
+                c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_avx2;
+
+                c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_avx2;
+                c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_avx2;
+                c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_avx2;
+
+                c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_avx2;
+                c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_avx2;
+                c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_avx2;
+
+                c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_avx2;
+                c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_avx2;
+                c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_avx2;
+
+                c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_avx2;
+                c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_avx2;
+                c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_avx2;
+            }
+            SAO_BAND_INIT(8, avx2);
+
+            c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_8_avx2;
+            c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2;
+            c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_8_avx2;
+
+            c->transform_add[3]    = ff_hevc_transform_add32_8_avx2;
         }
     } else if (bit_depth == 10) {
+        if (EXTERNAL_MMXEXT(cpu_flags)) {
+            c->transform_add[0] = ff_hevc_transform_add4_10_mmxext;
+            c->idct_dc[0] = ff_hevc_idct4x4_dc_10_mmxext;
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_10_mmxext;
+        }
         if (EXTERNAL_SSE2(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
             c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2;
+            if (ARCH_X86_64) {
+                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_sse2;
+                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_sse2;
+            }
+            SAO_BAND_INIT(10, sse2);
+            SAO_EDGE_INIT(10, sse2);
 
-            SET_QPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
-            SET_EPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_10_sse2;
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_10_sse2;
+            c->idct_dc[3] = ff_hevc_idct32x32_dc_10_sse2;
 
-            SET_LUMA_FUNCS(put_unweighted_pred,              ff_hevc_put_unweighted_pred,     10, sse2);
-            SET_LUMA_FUNCS(put_unweighted_pred_avg,          ff_hevc_put_unweighted_pred_avg, 10, sse2);
-            SET_CHROMA_FUNCS(put_unweighted_pred_chroma,     ff_hevc_put_unweighted_pred,     10, sse2);
-            SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 10, sse2);
+            c->transform_add[1]    = ff_hevc_transform_add8_10_sse2;
+            c->transform_add[2]    = ff_hevc_transform_add16_10_sse2;
+            c->transform_add[3]    = ff_hevc_transform_add32_10_sse2;
         }
-    }
+        if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
+            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
+            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
+        }
+        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
+            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,     10, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,     10, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    10, sse4);
 
-#if ARCH_X86_64
-    if (bit_depth == 8) {
-        if (EXTERNAL_SSSE3(cpu_flags)) {
-            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
-            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
+            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     10, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     10, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    10, sse4);
+        }
+        if (EXTERNAL_AVX(cpu_flags)) {
+            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_avx;
+            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_avx;
+            if (ARCH_X86_64) {
+                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_avx;
+                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_avx;
+            }
+            SAO_BAND_INIT(10, avx);
         }
+        if (EXTERNAL_AVX2(cpu_flags)) {
+            c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_10_avx2;
+        }
+        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_10_avx2;
+            c->idct_dc[3] = ff_hevc_idct32x32_dc_10_avx2;
+            if (ARCH_X86_64) {
+                c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
+                c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
+                c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
+                c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
+                c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
+
+                c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
+                c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
+                c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
+                c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
+                c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
+
+                c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+                c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+                c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+                c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
+                c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
+
+                c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+                c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+                c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+                c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
+                c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
+
+                c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
+                c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
+                c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
+                c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
+                c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
+                c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
+                c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
+                c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
+                c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
+                c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
+
+                c->put_hevc_epel[5][0][1] = ff_hevc_put_hevc_epel_h16_10_avx2;
+                c->put_hevc_epel[6][0][1] = ff_hevc_put_hevc_epel_h24_10_avx2;
+                c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_10_avx2;
+                c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_10_avx2;
+                c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_10_avx2;
+
+                c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_10_avx2;
+                c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_10_avx2;
+                c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_10_avx2;
+                c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_10_avx2;
+                c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_10_avx2;
+
+                c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_10_avx2;
+                c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_10_avx2;
+                c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_10_avx2;
+                c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_10_avx2;
+                c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_10_avx2;
+
+                c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_10_avx2;
+                c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_10_avx2;
+                c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_10_avx2;
+                c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_10_avx2;
+                c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_10_avx2;
+
+                c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_10_avx2;
+                c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_10_avx2;
+                c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_10_avx2;
+                c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_10_avx2;
+                c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_10_avx2;
+
+                c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_10_avx2;
+                c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_10_avx2;
+                c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_10_avx2;
+                c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_10_avx2;
+                c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_10_avx2;
+
+                c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_10_avx2;
+                c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_10_avx2;
+                c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_10_avx2;
+                c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_10_avx2;
+                c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_10_avx2;
+
+                c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_10_avx2;
+                c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_10_avx2;
+                c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_10_avx2;
+                c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_10_avx2;
+                c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_10_avx2;
+
+                c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_10_avx2;
+                c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_10_avx2;
+                c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_10_avx2;
+                c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_10_avx2;
+                c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_10_avx2;
+
+                c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_10_avx2;
+                c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_10_avx2;
+                c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_10_avx2;
+                c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_10_avx2;
+                c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_10_avx2;
 
-        if (EXTERNAL_SSE4(cpu_flags)) {
-            SET_LUMA_FUNCS(weighted_pred,              ff_hevc_put_weighted_pred,     8, sse4);
-            SET_CHROMA_FUNCS(weighted_pred_chroma,     ff_hevc_put_weighted_pred,     8, sse4);
-            SET_LUMA_FUNCS(weighted_pred_avg,          ff_hevc_put_weighted_pred_avg, 8, sse4);
-            SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 8, sse4);
+                c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_10_avx2;
+                c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_10_avx2;
+                c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_10_avx2;
+                c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_10_avx2;
+                c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_10_avx2;
+
+                c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_10_avx2;
+                c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_10_avx2;
+                c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_10_avx2;
+                c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_10_avx2;
+                c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_10_avx2;
+
+                c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_10_avx2;
+                c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_10_avx2;
+                c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_10_avx2;
+                c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_10_avx2;
+                c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_10_avx2;
+
+                c->put_hevc_qpel_uni[5][1][0] = ff_hevc_put_hevc_uni_qpel_v16_10_avx2;
+                c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_10_avx2;
+                c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_10_avx2;
+                c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_10_avx2;
+                c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_10_avx2;
+
+                c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_10_avx2;
+                c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_10_avx2;
+                c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_10_avx2;
+                c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_10_avx2;
+                c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_10_avx2;
+
+                c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_10_avx2;
+                c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_10_avx2;
+                c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_10_avx2;
+                c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_10_avx2;
+                c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_10_avx2;
+
+                c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_10_avx2;
+                c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_10_avx2;
+                c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_10_avx2;
+                c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_10_avx2;
+                c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_10_avx2;
+
+                c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_10_avx2;
+                c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_10_avx2;
+                c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_10_avx2;
+                c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_10_avx2;
+                c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_10_avx2;
+            }
+            SAO_BAND_INIT(10, avx2);
+            SAO_EDGE_INIT(10, avx2);
+
+            c->transform_add[2] = ff_hevc_transform_add16_10_avx2;
+            c->transform_add[3] = ff_hevc_transform_add32_10_avx2;
+
+        }
+    } else if (bit_depth == 12) {
+        if (EXTERNAL_MMXEXT(cpu_flags)) {
+            c->idct_dc[0] = ff_hevc_idct4x4_dc_12_mmxext;
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_12_mmxext;
         }
+        if (EXTERNAL_SSE2(cpu_flags)) {
+            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
+            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2;
+            if (ARCH_X86_64) {
+                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
+                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
+            }
+            SAO_BAND_INIT(12, sse2);
+            SAO_EDGE_INIT(12, sse2);
 
-        if (EXTERNAL_AVX(cpu_flags)) {
-#if HAVE_AVX_EXTERNAL
-            SET_QPEL_FUNCS(1, 1, 8, avx, hevc_qpel_hv);
-            SET_EPEL_FUNCS(1, 1, 8, avx, hevc_epel_hv);
-#endif /* HAVE_AVX_EXTERNAL */
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_12_sse2;
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_12_sse2;
+            c->idct_dc[3] = ff_hevc_idct32x32_dc_12_sse2;
         }
-    } else if (bit_depth == 10) {
-        if (EXTERNAL_SSSE3(cpu_flags)) {
-            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
-            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
+        if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
+            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
+            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3;
         }
-        if (EXTERNAL_SSE4(cpu_flags)) {
-            SET_LUMA_FUNCS(weighted_pred,              ff_hevc_put_weighted_pred,     10, sse4);
-            SET_CHROMA_FUNCS(weighted_pred_chroma,     ff_hevc_put_weighted_pred,     10, sse4);
-            SET_LUMA_FUNCS(weighted_pred_avg,          ff_hevc_put_weighted_pred_avg, 10, sse4);
-            SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 10, sse4);
+        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
+            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,     12, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,     12, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    12, sse4);
+
+            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     12, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     12, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    12, sse4);
         }
         if (EXTERNAL_AVX(cpu_flags)) {
-#if HAVE_AVX_EXTERNAL
-            SET_QPEL_FUNCS(0, 1, 10, avx, ff_hevc_qpel_h);
-            SET_QPEL_FUNCS(1, 0, 10, avx, ff_hevc_qpel_v);
-            SET_QPEL_FUNCS(1, 1, 10, avx, hevc_qpel_hv);
-            SET_EPEL_FUNCS(0, 1, 10, avx, ff_hevc_epel_h);
-            SET_EPEL_FUNCS(1, 0, 10, avx, ff_hevc_epel_v);
-            SET_EPEL_FUNCS(1, 1, 10, avx, hevc_epel_hv);
-#endif /* HAVE_AVX_EXTERNAL */
+            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_avx;
+            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_avx;
+            if (ARCH_X86_64) {
+                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_avx;
+                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_avx;
+            }
+            SAO_BAND_INIT(12, avx);
+        }
+        if (EXTERNAL_AVX2(cpu_flags)) {
+            c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_12_avx2;
+        }
+        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_12_avx2;
+            c->idct_dc[3] = ff_hevc_idct32x32_dc_12_avx2;
+
+            SAO_BAND_INIT(12, avx2);
+            SAO_EDGE_INIT(12, avx2);
         }
     }
-#endif /* ARCH_X86_64 */
 }
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index b8929b9..82fb893 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -1,20 +1,27 @@
 ;******************************************************************************
+;*
+;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
+;* Copyright (c)      Nick Kurshev <nickols_k@mail.ru>
+;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
+;* Copyright (c) 2013 Daniel Kang
+;*
 ;* SIMD-optimized halfpel functions
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -22,26 +29,49 @@
 
 SECTION_RODATA
 cextern pb_1
+cextern pw_2
+pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+pb_interleave8:  db 0, 4, 1, 5, 2, 6, 3, 7
+
+cextern pw_8192
 
 SECTION .text
 
 ; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 %macro PUT_PIXELS8_X2 0
+%if cpuflag(sse2)
+cglobal put_pixels16_x2, 4,5,4
+%else
 cglobal put_pixels8_x2, 4,5
+%endif
     lea          r4, [r2*2]
 .loop:
-    mova         m0, [r1]
-    mova         m1, [r1+r2]
-    PAVGB        m0, [r1+1]
-    PAVGB        m1, [r1+r2+1]
+    movu         m0, [r1+1]
+    movu         m1, [r1+r2+1]
+%if cpuflag(sse2)
+    movu         m2, [r1]
+    movu         m3, [r1+r2]
+    pavgb        m0, m2
+    pavgb        m1, m3
+%else
+    PAVGB        m0, [r1]
+    PAVGB        m1, [r1+r2]
+%endif
     mova       [r0], m0
     mova    [r0+r2], m1
     add          r1, r4
     add          r0, r4
-    mova         m0, [r1]
-    mova         m1, [r1+r2]
-    PAVGB        m0, [r1+1]
-    PAVGB        m1, [r1+r2+1]
+    movu         m0, [r1+1]
+    movu         m1, [r1+r2+1]
+%if cpuflag(sse2)
+    movu         m2, [r1]
+    movu         m3, [r1+r2]
+    pavgb        m0, m2
+    pavgb        m1, m3
+%else
+    PAVGB        m0, [r1]
+    PAVGB        m1, [r1+r2]
+%endif
     add          r1, r4
     mova       [r0], m0
     mova    [r0+r2], m1
@@ -99,6 +129,9 @@ INIT_MMX mmxext
 PUT_PIXELS_16
 INIT_MMX 3dnow
 PUT_PIXELS_16
+; Reuse PUT_PIXELS8_X2 here: built with cpuflag(sse2) it emits put_pixels16_x2
+INIT_XMM sse2
+PUT_PIXELS8_X2
 
 
 ; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -191,20 +224,24 @@ PUT_NO_RND_PIXELS8_X2_EXACT
 
 ; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 %macro PUT_PIXELS8_Y2 0
+%if cpuflag(sse2)
+cglobal put_pixels16_y2, 4,5,3
+%else
 cglobal put_pixels8_y2, 4,5
+%endif
     lea          r4, [r2*2]
-    mova         m0, [r1]
+    movu         m0, [r1]
     sub          r0, r2
 .loop:
-    mova         m1, [r1+r2]
-    mova         m2, [r1+r4]
+    movu         m1, [r1+r2]
+    movu         m2, [r1+r4]
     add          r1, r4
     PAVGB        m0, m1
     PAVGB        m1, m2
     mova    [r0+r2], m0
     mova    [r0+r4], m1
-    mova         m1, [r1+r2]
-    mova         m0, [r1+r4]
+    movu         m1, [r1+r2]
+    movu         m0, [r1+r4]
     add          r0, r4
     add          r1, r4
     PAVGB        m2, m1
@@ -221,6 +258,9 @@ INIT_MMX mmxext
 PUT_PIXELS8_Y2
 INIT_MMX 3dnow
 PUT_PIXELS8_Y2
+; built with cpuflag(sse2), the macro actually emits put_pixels16_y2 (sse2)
+INIT_XMM sse2
+PUT_PIXELS8_Y2
 
 
 ; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -334,26 +374,48 @@ AVG_PIXELS8
 
 ; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 %macro AVG_PIXELS8_X2 0
+%if cpuflag(sse2)
+cglobal avg_pixels16_x2, 4,5,4
+%else
 cglobal avg_pixels8_x2, 4,5
+%endif
     lea          r4, [r2*2]
+%if notcpuflag(mmxext)
+    pcmpeqd      m5, m5
+    paddb        m5, m5
+%endif
 .loop:
-    mova         m0, [r1]
-    mova         m2, [r1+r2]
-    PAVGB        m0, [r1+1]
-    PAVGB        m2, [r1+r2+1]
-    PAVGB        m0, [r0]
-    PAVGB        m2, [r0+r2]
+    movu         m0, [r1]
+    movu         m2, [r1+r2]
+%if cpuflag(sse2)
+    movu         m1, [r1+1]
+    movu         m3, [r1+r2+1]
+    pavgb        m0, m1
+    pavgb        m2, m3
+%else
+    PAVGB        m0, [r1+1], m3, m5
+    PAVGB        m2, [r1+r2+1], m4, m5
+%endif
+    PAVGB        m0, [r0], m3, m5
+    PAVGB        m2, [r0+r2], m4, m5
     add          r1, r4
     mova       [r0], m0
     mova    [r0+r2], m2
-    mova         m0, [r1]
-    mova         m2, [r1+r2]
-    PAVGB        m0, [r1+1]
-    PAVGB        m2, [r1+r2+1]
+    movu         m0, [r1]
+    movu         m2, [r1+r2]
+%if cpuflag(sse2)
+    movu         m1, [r1+1]
+    movu         m3, [r1+r2+1]
+    pavgb        m0, m1
+    pavgb        m2, m3
+%else
+    PAVGB        m0, [r1+1], m3, m5
+    PAVGB        m2, [r1+r2+1], m4, m5
+%endif
     add          r0, r4
     add          r1, r4
-    PAVGB        m0, [r0]
-    PAVGB        m2, [r0+r2]
+    PAVGB        m0, [r0], m3, m5
+    PAVGB        m2, [r0+r2], m4, m5
     mova       [r0], m0
     mova    [r0+r2], m2
     add          r0, r4
@@ -362,40 +424,45 @@ cglobal avg_pixels8_x2, 4,5
     REP_RET
 %endmacro
 
+INIT_MMX mmx
+AVG_PIXELS8_X2
 INIT_MMX mmxext
 AVG_PIXELS8_X2
 INIT_MMX 3dnow
 AVG_PIXELS8_X2
+; built with cpuflag(sse2), the macro actually emits avg_pixels16_x2
+INIT_XMM sse2
+AVG_PIXELS8_X2
 
 
 ; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 %macro AVG_PIXELS8_Y2 0
+%if cpuflag(sse2)
+cglobal avg_pixels16_y2, 4,5,3
+%else
 cglobal avg_pixels8_y2, 4,5
+%endif
     lea          r4, [r2*2]
-    mova         m0, [r1]
+    movu         m0, [r1]
     sub          r0, r2
 .loop:
-    mova         m1, [r1+r2]
-    mova         m2, [r1+r4]
+    movu         m1, [r1+r2]
+    movu         m2, [r1+r4]
     add          r1, r4
     PAVGB        m0, m1
     PAVGB        m1, m2
-    mova         m3, [r0+r2]
-    mova         m4, [r0+r4]
-    PAVGB        m0, m3
-    PAVGB        m1, m4
+    PAVGB        m0, [r0+r2]
+    PAVGB        m1, [r0+r4]
     mova    [r0+r2], m0
     mova    [r0+r4], m1
-    mova         m1, [r1+r2]
-    mova         m0, [r1+r4]
+    movu         m1, [r1+r2]
+    movu         m0, [r1+r4]
     PAVGB        m2, m1
     PAVGB        m1, m0
     add          r0, r4
     add          r1, r4
-    mova         m3, [r0+r2]
-    mova         m4, [r0+r4]
-    PAVGB        m2, m3
-    PAVGB        m1, m4
+    PAVGB        m2, [r0+r2]
+    PAVGB        m1, [r0+r4]
     mova    [r0+r2], m2
     mova    [r0+r4], m1
     add          r0, r4
@@ -408,11 +475,16 @@ INIT_MMX mmxext
 AVG_PIXELS8_Y2
 INIT_MMX 3dnow
 AVG_PIXELS8_Y2
+; built with cpuflag(sse2), the macro actually emits avg_pixels16_y2
+INIT_XMM sse2
+AVG_PIXELS8_Y2
 
 
 ; void ff_avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-%macro AVG_PIXELS8_XY2 0
-cglobal avg_pixels8_xy2, 4,5
+; Note: this is not correctly rounded and is therefore only used for
+; non-bitexact output
+%macro AVG_APPROX_PIXELS8_XY2 0
+cglobal avg_approx_pixels8_xy2, 4,5
     mova         m6, [pb_1]
     lea          r4, [r2*2]
     mova         m0, [r1]
@@ -449,6 +521,160 @@ cglobal avg_pixels8_xy2, 4,5
 %endmacro
 
 INIT_MMX mmxext
-AVG_PIXELS8_XY2
+AVG_APPROX_PIXELS8_XY2
 INIT_MMX 3dnow
-AVG_PIXELS8_XY2
+AVG_APPROX_PIXELS8_XY2
+
+
+; void ff_{put,avg}_pixels{8,16}_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro SET_PIXELS_XY2 1
+%if cpuflag(sse2)
+cglobal %1_pixels16_xy2, 4,5,8
+%else
+cglobal %1_pixels8_xy2, 4,5
+%endif
+    pxor        m7, m7              ; m7 = 0, used to zero-extend bytes to words
+    mova        m6, [pw_2]          ; presumably words of 2: rounding bias for the >>2 below
+    movu        m0, [r1]            ; first source row
+    movu        m4, [r1+1]          ; same row shifted one byte right
+    mova        m1, m0
+    mova        m5, m4
+    punpcklbw   m0, m7              ; low halves as words
+    punpcklbw   m4, m7
+    punpckhbw   m1, m7              ; high halves as words
+    punpckhbw   m5, m7
+    paddusw     m4, m0              ; m4/m5 = horizontal pair sums of the first row (low/high)
+    paddusw     m5, m1
+    xor         r4, r4              ; r4 = byte offset of the current output row
+    add         r1, r2              ; r1 now points one row below the output row
+.loop:                              ; two output rows per iteration
+    movu        m0, [r1+r4]         ; row below the one being produced
+    movu        m2, [r1+r4+1]
+    mova        m1, m0
+    mova        m3, m2
+    punpcklbw   m0, m7
+    punpcklbw   m2, m7
+    punpckhbw   m1, m7
+    punpckhbw   m3, m7
+    paddusw     m0, m2              ; m0/m1 = horizontal pair sums of the lower row (kept for next half)
+    paddusw     m1, m3
+    paddusw     m4, m6              ; add rounding bias to the upper-row sums
+    paddusw     m5, m6
+    paddusw     m4, m0              ; upper + lower = four-pixel sum
+    paddusw     m5, m1
+    psrlw       m4, 2               ; divide by four
+    psrlw       m5, 2
+%ifidn %1, avg
+    mova        m3, [r0+r4]         ; existing destination pixels
+    packuswb    m4, m5              ; back to bytes
+    PAVGB       m4, m3              ; average with destination
+%else
+    packuswb    m4, m5              ; back to bytes
+%endif
+    mova   [r0+r4], m4
+    add         r4, r2              ; next row
+
+    movu        m2, [r1+r4]         ; second half: same steps with m0/m1 as upper row, m4/m5 as lower row
+    movu        m4, [r1+r4+1]
+    mova        m3, m2
+    mova        m5, m4
+    punpcklbw   m2, m7
+    punpcklbw   m4, m7
+    punpckhbw   m3, m7
+    punpckhbw   m5, m7
+    paddusw     m4, m2              ; m4/m5 = pair sums of the new lower row (carried into next iteration)
+    paddusw     m5, m3
+    paddusw     m0, m6              ; rounding bias
+    paddusw     m1, m6
+    paddusw     m0, m4
+    paddusw     m1, m5
+    psrlw       m0, 2
+    psrlw       m1, 2
+%ifidn %1, avg
+    mova        m3, [r0+r4]
+    packuswb    m0, m1
+    PAVGB       m0, m3
+%else
+    packuswb    m0, m1
+%endif
+    mova   [r0+r4], m0
+    add         r4, r2
+    sub        r3d, 2               ; two rows consumed; NOTE(review): assumes h is a nonzero multiple of 2
+    jnz .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+SET_PIXELS_XY2 avg
+INIT_MMX 3dnow
+SET_PIXELS_XY2 avg
+INIT_XMM sse2
+SET_PIXELS_XY2 put
+SET_PIXELS_XY2 avg
+
+%macro SSSE3_PIXELS_XY2 1-2
+%if %0 == 2 ; two arguments: 16-pixel XMM build (instantiated for ssse3); the 2nd is forwarded to cglobal
+cglobal %1_pixels16_xy2, 4,5,%2
+    mova        m4, [pb_interleave16] ; shuffle mask that interleaves the low and high byte halves
+%else
+cglobal %1_pixels8_xy2, 4,5
+    mova        m4, [pb_interleave8]  ; same interleave for the 8-byte MMX build
+%endif
+    mova        m5, [pb_1]          ; presumably bytes of 1, so pmaddubsw below sums adjacent byte pairs
+    movu        m0, [r1]            ; first source row
+    movu        m1, [r1+1]          ; same row shifted one byte right
+    pmaddubsw   m0, m5              ; pair sums starting at even byte offsets
+    pmaddubsw   m1, m5              ; pair sums starting at odd byte offsets
+    xor         r4, r4              ; r4 = byte offset of the current output row
+    add         r1, r2              ; r1 now points one row below the output row
+.loop:                              ; two output rows per iteration
+    movu        m2, [r1+r4]         ; row below the one being produced
+    movu        m3, [r1+r4+1]
+    pmaddubsw   m2, m5              ; m2/m3 = pair sums of the lower row (kept for next half)
+    pmaddubsw   m3, m5
+    paddusw     m0, m2              ; upper + lower = four-pixel sum
+    paddusw     m1, m3
+    pmulhrsw    m0, [pw_8192]       ; presumably words of 8192, making this a rounded divide by four
+    pmulhrsw    m1, [pw_8192]
+%ifidn %1, avg
+    mova        m6, [r0+r4]         ; existing destination pixels
+    packuswb    m0, m1              ; bytes: even-offset results then odd-offset results
+    pshufb      m0, m4              ; interleave them back into pixel order
+    pavgb       m0, m6              ; average with destination
+%else
+    packuswb    m0, m1              ; bytes: even-offset results then odd-offset results
+    pshufb      m0, m4              ; interleave them back into pixel order
+%endif
+    mova   [r0+r4], m0
+    add         r4, r2              ; next row
+
+    movu        m0, [r1+r4]         ; second half: m2/m3 are now the upper row, m0/m1 the new lower row
+    movu        m1, [r1+r4+1]
+    pmaddubsw   m0, m5              ; m0/m1 = pair sums carried into the next iteration
+    pmaddubsw   m1, m5
+    paddusw     m2, m0
+    paddusw     m3, m1
+    pmulhrsw    m2, [pw_8192]
+    pmulhrsw    m3, [pw_8192]
+%ifidn %1, avg
+    mova        m6, [r0+r4]
+    packuswb    m2, m3
+    pshufb      m2, m4
+    pavgb       m2, m6
+%else
+    packuswb    m2, m3
+    pshufb      m2, m4
+%endif
+    mova   [r0+r4], m2
+    add         r4, r2
+    sub        r3d, 2               ; two rows consumed; NOTE(review): assumes h is a nonzero multiple of 2
+    jnz .loop
+    REP_RET
+%endmacro
+
+INIT_MMX ssse3
+SSSE3_PIXELS_XY2 put
+SSSE3_PIXELS_XY2 avg
+INIT_XMM ssse3
+SSSE3_PIXELS_XY2 put, 6
+SSSE3_PIXELS_XY2 avg, 7
diff --git a/libavcodec/x86/hpeldsp.h b/libavcodec/x86/hpeldsp.h
index 47b0b8b..5fae990 100644
--- a/libavcodec/x86/hpeldsp.h
+++ b/libavcodec/x86/hpeldsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,12 +27,27 @@ void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels,
 
 void ff_avg_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
+void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
+                               ptrdiff_t line_size, int h);
+void ff_avg_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+                               ptrdiff_t line_size, int h);
+
 void ff_avg_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
+void ff_avg_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void ff_avg_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+                               ptrdiff_t line_size, int h);
 
 void ff_put_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
+void ff_put_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
 void ff_put_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
+void ff_put_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void ff_put_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+                               ptrdiff_t line_size, int h);
 
 #endif /* AVCODEC_X86_HPELDSP_H */
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index 59cb5e1..5c5da28 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000, 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  *
  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
@@ -40,6 +40,14 @@ void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                                ptrdiff_t line_size, int h);
 void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
+void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_avg_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
 void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                                      ptrdiff_t line_size, int h);
 void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
@@ -74,10 +82,12 @@ void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
 void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
-void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
-                               ptrdiff_t line_size, int h);
 void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
+void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
+                                      ptrdiff_t line_size, int h);
+void ff_avg_approx_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
+                                     ptrdiff_t line_size, int h);
 
 #define avg_pixels8_mmx         ff_avg_pixels8_mmx
 #define avg_pixels8_x2_mmx      ff_avg_pixels8_x2_mmx
@@ -111,11 +121,13 @@ void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
 #undef PAVGB
 #undef STATIC
 
+#if HAVE_MMX
 CALL_2X_PIXELS(avg_no_rnd_pixels16_y2_mmx, avg_no_rnd_pixels8_y2_mmx, 8)
 CALL_2X_PIXELS(put_no_rnd_pixels16_y2_mmx, put_no_rnd_pixels8_y2_mmx, 8)
 
 CALL_2X_PIXELS(avg_no_rnd_pixels16_xy2_mmx, avg_no_rnd_pixels8_xy2_mmx, 8)
 CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
+#endif
 
 /***********************************/
 /* MMX rounding */
@@ -138,11 +150,13 @@ CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
 #undef PAVGBP
 #undef PAVGB
 
+#if HAVE_MMX
 CALL_2X_PIXELS(avg_pixels16_y2_mmx, avg_pixels8_y2_mmx, 8)
 CALL_2X_PIXELS(put_pixels16_y2_mmx, put_pixels8_y2_mmx, 8)
 
 CALL_2X_PIXELS_EXPORT(ff_avg_pixels16_xy2_mmx, ff_avg_pixels8_xy2_mmx, 8)
 CALL_2X_PIXELS_EXPORT(ff_put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8)
+#endif
 
 #endif /* HAVE_INLINE_ASM */
 
@@ -156,32 +170,49 @@ CALL_2X_PIXELS_EXPORT(ff_put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8)
     CALL_2X_PIXELS(avg_pixels16           ## CPUEXT, ff_avg_pixels8           ## CPUEXT, 8) \
     CALL_2X_PIXELS(avg_pixels16_x2        ## CPUEXT, ff_avg_pixels8_x2        ## CPUEXT, 8) \
     CALL_2X_PIXELS(avg_pixels16_y2        ## CPUEXT, ff_avg_pixels8_y2        ## CPUEXT, 8) \
-    CALL_2X_PIXELS(avg_pixels16_xy2       ## CPUEXT, ff_avg_pixels8_xy2       ## CPUEXT, 8)
+    CALL_2X_PIXELS(avg_pixels16_xy2       ## CPUEXT, ff_avg_pixels8_xy2       ## CPUEXT, 8) \
+    CALL_2X_PIXELS(avg_approx_pixels16_xy2## CPUEXT, ff_avg_approx_pixels8_xy2## CPUEXT, 8)
 
 HPELDSP_AVG_PIXELS16(_3dnow)
 HPELDSP_AVG_PIXELS16(_mmxext)
 
 #endif /* HAVE_YASM */
 
+#define SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU)                             \
+    if (HAVE_MMX_EXTERNAL)                                                  \
+    c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _     ## CPU;
+
+#if HAVE_MMX_INLINE
 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
     do {                                                                        \
-        c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _     ## CPU; \
+        SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU)                                 \
         c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_  ## CPU; \
         c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_  ## CPU; \
         c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
     } while (0)
+#else
+#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
+    do {                                                                        \
+        SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU)                                 \
+    } while (0)
+#endif
 
 static void hpeldsp_init_mmx(HpelDSPContext *c, int flags, int cpu_flags)
 {
-#if HAVE_MMX_INLINE
     SET_HPEL_FUNCS(put,        [0], 16, mmx);
     SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
     SET_HPEL_FUNCS(avg,        [0], 16, mmx);
     SET_HPEL_FUNCS(avg_no_rnd,    , 16, mmx);
     SET_HPEL_FUNCS(put,        [1],  8, mmx);
     SET_HPEL_FUNCS(put_no_rnd, [1],  8, mmx);
-    SET_HPEL_FUNCS(avg,        [1],  8, mmx);
-#endif /* HAVE_MMX_INLINE */
+    if (HAVE_MMX_EXTERNAL) {
+        c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmx;
+        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmx;
+    }
+#if HAVE_MMX_INLINE
+    c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
+    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmx;
+#endif
 }
 
 static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags)
@@ -193,6 +224,7 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags)
     c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
     c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
     c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
+    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
 
     c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
     c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
@@ -200,6 +232,7 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags)
     c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
     c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
     c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
+    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
 
     if (!(flags & AV_CODEC_FLAG_BITEXACT)) {
         c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
@@ -207,11 +240,11 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags)
         c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
         c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
 
-        c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
-        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
+        c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_mmxext;
+        c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_mmxext;
     }
 
-    if (flags & AV_CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
+    if (CONFIG_VP3_DECODER && flags & AV_CODEC_FLAG_BITEXACT) {
         c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
         c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
     }
@@ -227,6 +260,7 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int cpu_flags)
     c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
     c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
     c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
+    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
 
     c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
     c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
@@ -234,6 +268,7 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int cpu_flags)
     c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
     c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
     c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
+    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
 
     if (!(flags & AV_CODEC_FLAG_BITEXACT)){
         c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
@@ -241,11 +276,11 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int cpu_flags)
         c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
         c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
 
-        c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
-        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
+        c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_3dnow;
+        c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_3dnow;
     }
 
-    if (flags & AV_CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
+    if (CONFIG_VP3_DECODER && flags & AV_CODEC_FLAG_BITEXACT) {
         c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
         c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
     }
@@ -259,11 +294,27 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int cpu_flags)
         // these functions are slower than mmx on AMD, but faster on Intel
         c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
         c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
+        c->put_pixels_tab[0][1]        = ff_put_pixels16_x2_sse2;
+        c->put_pixels_tab[0][2]        = ff_put_pixels16_y2_sse2;
+        c->put_pixels_tab[0][3]        = ff_put_pixels16_xy2_sse2;
         c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
+        c->avg_pixels_tab[0][1]        = ff_avg_pixels16_x2_sse2;
+        c->avg_pixels_tab[0][2]        = ff_avg_pixels16_y2_sse2;
+        c->avg_pixels_tab[0][3]        = ff_avg_pixels16_xy2_sse2;
     }
 #endif /* HAVE_SSE2_EXTERNAL */
 }
 
+static void hpeldsp_init_ssse3(HpelDSPContext *c, int flags, int cpu_flags)
+{
+#if HAVE_SSSE3_EXTERNAL
+    c->put_pixels_tab[0][3]            = ff_put_pixels16_xy2_ssse3;
+    c->avg_pixels_tab[0][3]            = ff_avg_pixels16_xy2_ssse3;
+    c->put_pixels_tab[1][3]            = ff_put_pixels8_xy2_ssse3;
+    c->avg_pixels_tab[1][3]            = ff_avg_pixels8_xy2_ssse3;
+#endif
+}
+
 av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -279,4 +330,7 @@ av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
 
     if (EXTERNAL_SSE2(cpu_flags))
         hpeldsp_init_sse2(c, flags, cpu_flags);
+
+    if (EXTERNAL_SSSE3(cpu_flags))
+        hpeldsp_init_ssse3(c, flags, cpu_flags);
 }
diff --git a/libavcodec/x86/hpeldsp_mmx.c b/libavcodec/x86/hpeldsp_mmx.c
deleted file mode 100644
index c93c78e..0000000
--- a/libavcodec/x86/hpeldsp_mmx.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * MMX-optimized avg/put pixel routines
- *
- * Copyright (c) 2001 Fabrice Bellard
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "config.h"
-#include "hpeldsp.h"
-#include "inline_asm.h"
-
-#if HAVE_MMX_INLINE
-
-void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels,
-                           ptrdiff_t line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    JUMPALIGN();
-    do {
-        __asm__ volatile(
-            "movq  %1, %%mm0            \n\t"
-            "movq  1%1, %%mm1           \n\t"
-            "movq  %0, %%mm3            \n\t"
-            PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
-            PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
-            "movq  %%mm0, %0            \n\t"
-            :"+m"(*block)
-            :"m"(*pixels)
-            :"memory");
-        pixels += line_size;
-        block += line_size;
-    } while (--h);
-}
-
-#endif /* HAVE_MMX_INLINE */
diff --git a/libavcodec/x86/hpeldsp_rnd_template.c b/libavcodec/x86/hpeldsp_rnd_template.c
index d854e8a..e20d065 100644
--- a/libavcodec/x86/hpeldsp_rnd_template.c
+++ b/libavcodec/x86/hpeldsp_rnd_template.c
@@ -7,20 +7,20 @@
  * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
  * and improved by Zdenek Kabelac <kabi@users.sf.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,7 +28,7 @@
 #include <stdint.h>
 
 // put_pixels
-static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+av_unused static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 {
     MOVQ_BFE(mm6);
     __asm__ volatile(
@@ -60,7 +60,7 @@ static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_
         :REG_a, "memory");
 }
 
-static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+av_unused static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 {
     MOVQ_BFE(mm6);
     __asm__ volatile(
@@ -106,7 +106,7 @@ static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff
         :REG_a, "memory");
 }
 
-static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+av_unused static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 {
     MOVQ_BFE(mm6);
     __asm__ volatile(
@@ -135,33 +135,34 @@ static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_
         :REG_a, "memory");
 }
 
-static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+av_unused static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 {
     MOVQ_BFE(mm6);
-    JUMPALIGN();
-    do {
         __asm__ volatile(
-            "movq  %1, %%mm0            \n\t"
-            "movq  1%1, %%mm1           \n\t"
-            "movq  %0, %%mm3            \n\t"
+            ".p2align 3                 \n\t"
+            "1:                         \n\t"
+            "movq  (%1), %%mm0          \n\t"
+            "movq  1(%1), %%mm1         \n\t"
+            "movq  (%2), %%mm3          \n\t"
             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
             PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
-            "movq  %%mm0, %0            \n\t"
-            "movq  8%1, %%mm0           \n\t"
-            "movq  9%1, %%mm1           \n\t"
-            "movq  8%0, %%mm3           \n\t"
+            "movq  %%mm0, (%2)          \n\t"
+            "movq  8(%1), %%mm0         \n\t"
+            "movq  9(%1), %%mm1         \n\t"
+            "movq  8(%2), %%mm3         \n\t"
             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
             PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
-            "movq  %%mm0, 8%0           \n\t"
-            :"+m"(*block)
-            :"m"(*pixels)
+            "movq  %%mm0, 8(%2)         \n\t"
+            "add    %3, %1              \n\t"
+            "add    %3, %2              \n\t"
+            "subl   $1, %0              \n\t"
+            "jnz    1b                  \n\t"
+            :"+g"(h), "+S"(pixels), "+D"(block)
+            :"r"((x86_reg)line_size)
             :"memory");
-        pixels += line_size;
-        block += line_size;
-    } while (--h);
 }
 
-static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+av_unused static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 {
     MOVQ_BFE(mm6);
     __asm__ volatile(
diff --git a/libavcodec/x86/huffyuvdsp.asm b/libavcodec/x86/huffyuvdsp.asm
index e7536da..0dbe598 100644
--- a/libavcodec/x86/huffyuvdsp.asm
+++ b/libavcodec/x86/huffyuvdsp.asm
@@ -1,28 +1,29 @@
 ;******************************************************************************
 ;* SIMD-optimized HuffYUV functions
 ;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2014 Christophe Gisquet
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
 %include "libavutil/x86/x86util.asm"
 
 SECTION_RODATA
-pb_f: times 16 db 15
+cextern pb_15
 pb_zzzzzzzz77777777: times 8 db -1
 pb_7: times 8 db 7
 pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
@@ -33,64 +34,72 @@ SECTION .text
 ; void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
 ;                                     const uint8_t *diff, int w,
 ;                                     int *left, int *left_top)
-INIT_MMX mmxext
-cglobal add_hfyu_median_pred, 6,6,0, dst, top, diff, w, left, left_top
-    movq    mm0, [topq]
-    movq    mm2, mm0
-    movd    mm4, [left_topq]
-    psllq   mm2, 8
-    movq    mm1, mm0
-    por     mm4, mm2
-    movd    mm3, [leftq]
-    psubb   mm0, mm4 ; t-tl
+%macro HFYU_MEDIAN 0
+cglobal add_hfyu_median_pred, 6,6,8, dst, top, diff, w, left, left_top
+    movu    m0, [topq]
+    mova    m2, m0
+    movd    m4, [left_topq]
+    LSHIFT  m2, 1
+    mova    m1, m0
+    por     m4, m2
+    movd    m3, [leftq]
+    psubb   m0, m4 ; t-tl
     add    dstq, wq
     add    topq, wq
     add   diffq, wq
     neg      wq
     jmp .skip
 .loop:
-    movq    mm4, [topq+wq]
-    movq    mm0, mm4
-    psllq   mm4, 8
-    por     mm4, mm1
-    movq    mm1, mm0 ; t
-    psubb   mm0, mm4 ; t-tl
+    movu    m4, [topq+wq]
+    mova    m0, m4
+    LSHIFT  m4, 1
+    por     m4, m1
+    mova    m1, m0 ; t
+    psubb   m0, m4 ; t-tl
 .skip:
-    movq    mm2, [diffq+wq]
+    movu    m2, [diffq+wq]
 %assign i 0
-%rep 8
-    movq    mm4, mm0
-    paddb   mm4, mm3 ; t-tl+l
-    movq    mm5, mm3
-    pmaxub  mm3, mm1
-    pminub  mm5, mm1
-    pminub  mm3, mm4
-    pmaxub  mm3, mm5 ; median
-    paddb   mm3, mm2 ; +residual
+%rep mmsize
+    mova    m4, m0
+    paddb   m4, m3 ; t-tl+l
+    mova    m5, m3
+    pmaxub  m3, m1
+    pminub  m5, m1
+    pminub  m3, m4
+    pmaxub  m3, m5 ; median
+    paddb   m3, m2 ; +residual
 %if i==0
-    movq    mm7, mm3
-    psllq   mm7, 56
+    mova    m7, m3
+    LSHIFT  m7, mmsize-1
 %else
-    movq    mm6, mm3
-    psrlq   mm7, 8
-    psllq   mm6, 56
-    por     mm7, mm6
+    mova    m6, m3
+    RSHIFT  m7, 1
+    LSHIFT  m6, mmsize-1
+    por     m7, m6
 %endif
-%if i<7
-    psrlq   mm0, 8
-    psrlq   mm1, 8
-    psrlq   mm2, 8
+%if i<mmsize-1
+    RSHIFT  m0, 1
+    RSHIFT  m1, 1
+    RSHIFT  m2, 1
 %endif
 %assign i i+1
 %endrep
-    movq [dstq+wq], mm7
-    add      wq, 8
+    movu [dstq+wq], m7
+    add      wq, mmsize
     jl .loop
     movzx   r2d, byte [dstq-1]
     mov [leftq], r2d
     movzx   r2d, byte [topq-1]
     mov [left_topq], r2d
     RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmxext
+HFYU_MEDIAN
+%endif
+INIT_XMM sse2
+HFYU_MEDIAN
 
 
 %macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
@@ -148,7 +157,7 @@ cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left
 
 INIT_XMM sse4
 cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left
-    mova    m5, [pb_f]
+    mova    m5, [pb_15]
     mova    m6, [pb_zzzzzzzz77777777]
     mova    m4, [pb_zzzz3333zzzzbbbb]
     mova    m3, [pb_zz11zz55zz99zzdd]
@@ -163,3 +172,82 @@ cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left
     ADD_HFYU_LEFT_LOOP 0, 1
 .src_unaligned:
     ADD_HFYU_LEFT_LOOP 0, 0
+
+%macro ADD_BYTES 0
+cglobal add_bytes, 3,4,2, dst, src, w, size
+    mov  sizeq, wq
+    and  sizeq, -2*mmsize
+    jz  .2
+    add   dstq, sizeq
+    add   srcq, sizeq
+    neg  sizeq
+.1:
+    mova    m0, [srcq + sizeq]
+    mova    m1, [srcq + sizeq + mmsize]
+    paddb   m0, [dstq + sizeq]
+    paddb   m1, [dstq + sizeq + mmsize]
+    mova   [dstq + sizeq], m0
+    mova   [dstq + sizeq + mmsize], m1
+    add  sizeq, 2*mmsize
+    jl .1
+.2:
+    and     wq, 2*mmsize-1
+    jz    .end
+    add   dstq, wq
+    add   srcq, wq
+    neg     wq
+.3:
+    mov  sizeb, [srcq + wq]
+    add [dstq + wq], sizeb
+    inc     wq
+    jl .3
+.end:
+    REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+ADD_BYTES
+%endif
+INIT_XMM sse2
+ADD_BYTES
+
+; void add_hfyu_left_pred_bgr32(uint8_t *dst, const uint8_t *src,
+;                               intptr_t w, uint8_t *left)
+%macro LEFT_BGR32 0
+cglobal add_hfyu_left_pred_bgr32, 4,4,3, dst, src, w, left
+    shl           wq, 2
+    movd          m0, [leftq]
+    lea         dstq, [dstq + wq]
+    lea         srcq, [srcq + wq]
+    LSHIFT        m0, mmsize-4
+    neg           wq
+.loop:
+    movu          m1, [srcq+wq]
+    mova          m2, m1
+%if mmsize == 8
+    punpckhdq     m0, m0
+%endif
+    LSHIFT        m1, 4
+    paddb         m1, m2
+%if mmsize == 16
+    pshufd        m0, m0, q3333
+    mova          m2, m1
+    LSHIFT        m1, 8
+    paddb         m1, m2
+%endif
+    paddb         m0, m1
+    movu   [dstq+wq], m0
+    add           wq, mmsize
+    jl         .loop
+    movd          m0, [dstq-4]
+    movd     [leftq], m0
+    REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+LEFT_BGR32
+%endif
+INIT_XMM sse2
+LEFT_BGR32
diff --git a/libavcodec/x86/huffyuvdsp_init.c b/libavcodec/x86/huffyuvdsp_init.c
index 75537d7..3ced3c0 100644
--- a/libavcodec/x86/huffyuvdsp_init.c
+++ b/libavcodec/x86/huffyuvdsp_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,20 +25,29 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/huffyuvdsp.h"
 
+void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, intptr_t w);
+void ff_add_bytes_sse2(uint8_t *dst, uint8_t *src, intptr_t w);
+
 void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
-                                    const uint8_t *diff, int w,
+                                    const uint8_t *diff, intptr_t w,
                                     int *left, int *left_top);
+void ff_add_hfyu_median_pred_sse2(uint8_t *dst, const uint8_t *top,
+                                  const uint8_t *diff, intptr_t w,
+                                  int *left, int *left_top);
 
 int  ff_add_hfyu_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
-                                 int w, int left);
+                                 intptr_t w, int left);
 int  ff_add_hfyu_left_pred_sse4(uint8_t *dst, const uint8_t *src,
-                                int w, int left);
+                                intptr_t w, int left);
 
-#if HAVE_INLINE_ASM
+void ff_add_hfyu_left_pred_bgr32_mmx(uint8_t *dst, const uint8_t *src,
+                                     intptr_t w, uint8_t *left);
+void ff_add_hfyu_left_pred_bgr32_sse2(uint8_t *dst, const uint8_t *src,
+                                      intptr_t w, uint8_t *left);
 
-#if HAVE_7REGS
+#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
 static void add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
-                                      const uint8_t *diff, int w,
+                                      const uint8_t *diff, intptr_t w,
                                       int *left, int *left_top)
 {
     x86_reg w2 = -w;
@@ -72,56 +81,34 @@ static void add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
     *left     = l;
     *left_top = tl;
 }
-#endif /* HAVE_7REGS */
-
-static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
-{
-    x86_reg i = 0;
-
-    __asm__ volatile (
-        "jmp          2f                \n\t"
-        "1:                             \n\t"
-        "movq   (%1, %0), %%mm0         \n\t"
-        "movq   (%2, %0), %%mm1         \n\t"
-        "paddb     %%mm0, %%mm1         \n\t"
-        "movq      %%mm1, (%2, %0)      \n\t"
-        "movq  8(%1, %0), %%mm0         \n\t"
-        "movq  8(%2, %0), %%mm1         \n\t"
-        "paddb     %%mm0, %%mm1         \n\t"
-        "movq      %%mm1, 8(%2, %0)     \n\t"
-        "add         $16, %0            \n\t"
-        "2:                             \n\t"
-        "cmp          %3, %0            \n\t"
-        "js           1b                \n\t"
-        : "+r" (i)
-        : "r" (src), "r" (dst), "r" ((x86_reg) w - 15));
-
-    for (; i < w; i++)
-        dst[i + 0] += src[i + 0];
-}
-
-#endif /* HAVE_INLINE_ASM */
+#endif
 
 av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
 
-#if HAVE_INLINE_ASM
-#if HAVE_7REGS
+#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
     if (cpu_flags & AV_CPU_FLAG_CMOV)
         c->add_hfyu_median_pred = add_hfyu_median_pred_cmov;
-#endif /* HAVE_7REGS */
+#endif
 
-    if (INLINE_MMX(cpu_flags))
-        c->add_bytes = add_bytes_mmx;
-#endif /* HAVE_INLINE_ASM */
+    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
+        c->add_bytes = ff_add_bytes_mmx;
+        c->add_hfyu_left_pred_bgr32 = ff_add_hfyu_left_pred_bgr32_mmx;
+    }
 
-    if (EXTERNAL_MMXEXT(cpu_flags)) {
+    if (ARCH_X86_32 && EXTERNAL_MMXEXT(cpu_flags)) {
         /* slower than cmov version on AMD */
         if (!(cpu_flags & AV_CPU_FLAG_3DNOW))
             c->add_hfyu_median_pred = ff_add_hfyu_median_pred_mmxext;
     }
 
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->add_bytes            = ff_add_bytes_sse2;
+        c->add_hfyu_median_pred = ff_add_hfyu_median_pred_sse2;
+        c->add_hfyu_left_pred_bgr32 = ff_add_hfyu_left_pred_bgr32_sse2;
+    }
+
     if (EXTERNAL_SSSE3(cpu_flags)) {
         c->add_hfyu_left_pred = ff_add_hfyu_left_pred_ssse3;
         if (cpu_flags & AV_CPU_FLAG_SSE4) // not really SSE4, just slow on Conroe
diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm
new file mode 100644
index 0000000..a55a1de
--- /dev/null
+++ b/libavcodec/x86/huffyuvencdsp.asm
@@ -0,0 +1,150 @@
+;************************************************************************
+;* SIMD-optimized HuffYUV encoding functions
+;* Copyright (c) 2000, 2001 Fabrice Bellard
+;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+;*
+;* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+;* Conversion to NASM format by Tiancheng "Timothy" Gu <timothygu99@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+section .text
+
+; void ff_diff_bytes(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+;                    intptr_t w);
+%macro DIFF_BYTES_PROLOGUE 0
+%if ARCH_X86_32
+cglobal diff_bytes, 3,5,2, dst, src1, src2
+%define wq r4q
+    DECLARE_REG_TMP 3
+    mov               wq, r3mp
+%else
+cglobal diff_bytes, 4,5,2, dst, src1, src2, w
+    DECLARE_REG_TMP 4
+%endif ; ARCH_X86_32
+%define i t0q
+%endmacro
+
+; label to jump to if w < 2 * regsize (no full two-register iteration fits)
+%macro DIFF_BYTES_LOOP_PREP 1
+    mov                i, wq
+    and                i, -2 * regsize
+        jz            %1
+    add             dstq, i
+    add            src1q, i
+    add            src2q, i
+    neg                i
+%endmacro
+
+; mov type used for src1q, dstq, first reg, second reg
+%macro DIFF_BYTES_LOOP_CORE 4
+%if mmsize != 16
+    mov%1             %3, [src1q + i]
+    mov%1             %4, [src1q + i + regsize]
+    psubb             %3, [src2q + i]
+    psubb             %4, [src2q + i + regsize]
+    mov%2           [dstq + i], %3
+    mov%2 [regsize + dstq + i], %4
+%else
+    ; SSE enforces alignment of psubb operand
+    mov%1             %3, [src1q + i]
+    movu              %4, [src2q + i]
+    psubb             %3, %4
+    mov%2     [dstq + i], %3
+    mov%1             %3, [src1q + i + regsize]
+    movu              %4, [src2q + i + regsize]
+    psubb             %3, %4
+    mov%2 [regsize + dstq + i], %3
+%endif
+%endmacro
+
+%macro DIFF_BYTES_BODY 2 ; mov type used for src1q, for dstq
+    %define regsize mmsize
+.loop_%1%2:
+    DIFF_BYTES_LOOP_CORE %1, %2, m0, m1
+    add                i, 2 * regsize
+        jl    .loop_%1%2
+.skip_main_%1%2:
+    and               wq, 2 * regsize - 1
+        jz     .end_%1%2
+%if mmsize > 16
+    ; fall back to narrower xmm
+    %define regsize mmsize / 2
+    DIFF_BYTES_LOOP_PREP .setup_loop_gpr_aa
+.loop2_%1%2:
+    DIFF_BYTES_LOOP_CORE %1, %2, xm0, xm1
+    add                i, 2 * regsize
+        jl   .loop2_%1%2
+.setup_loop_gpr_%1%2:
+    and               wq, 2 * regsize - 1
+        jz     .end_%1%2
+%endif
+    add             dstq, wq
+    add            src1q, wq
+    add            src2q, wq
+    neg               wq
+.loop_gpr_%1%2:
+    mov              t0b, [src1q + wq]
+    sub              t0b, [src2q + wq]
+    mov      [dstq + wq], t0b
+    inc               wq
+        jl .loop_gpr_%1%2
+.end_%1%2:
+    REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+DIFF_BYTES_PROLOGUE
+    %define regsize mmsize
+    DIFF_BYTES_LOOP_PREP .skip_main_aa
+    DIFF_BYTES_BODY    a, a
+%undef i
+%endif
+
+INIT_XMM sse2
+DIFF_BYTES_PROLOGUE
+    %define regsize mmsize
+    DIFF_BYTES_LOOP_PREP .skip_main_aa
+    test            dstq, regsize - 1
+        jnz     .loop_uu
+    test           src1q, regsize - 1
+        jnz     .loop_ua
+    DIFF_BYTES_BODY    a, a
+    DIFF_BYTES_BODY    u, a
+    DIFF_BYTES_BODY    u, u
+%undef i
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+DIFF_BYTES_PROLOGUE
+    %define regsize mmsize
+    ; Directly using unaligned SSE2 version is marginally faster than
+    ; branching based on arguments.
+    DIFF_BYTES_LOOP_PREP .skip_main_uu
+    test            dstq, regsize - 1
+        jnz     .loop_uu
+    test           src1q, regsize - 1
+        jnz     .loop_ua
+    DIFF_BYTES_BODY    a, a
+    DIFF_BYTES_BODY    u, a
+    DIFF_BYTES_BODY    u, u
+%undef i
+%endif
diff --git a/libavcodec/x86/huffyuvencdsp_mmx.c b/libavcodec/x86/huffyuvencdsp_mmx.c
index 8ffaced..9767b21 100644
--- a/libavcodec/x86/huffyuvencdsp_mmx.c
+++ b/libavcodec/x86/huffyuvencdsp_mmx.c
@@ -5,20 +5,20 @@
  *
  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,34 +29,17 @@
 #include "libavcodec/huffyuvencdsp.h"
 #include "libavcodec/mathops.h"
 
-#if HAVE_INLINE_ASM
-
-static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
-{
-    x86_reg i = 0;
-
-    __asm__ volatile (
-        "1:                             \n\t"
-        "movq  (%2, %0), %%mm0          \n\t"
-        "movq  (%1, %0), %%mm1          \n\t"
-        "psubb %%mm0, %%mm1             \n\t"
-        "movq %%mm1, (%3, %0)           \n\t"
-        "movq 8(%2, %0), %%mm0          \n\t"
-        "movq 8(%1, %0), %%mm1          \n\t"
-        "psubb %%mm0, %%mm1             \n\t"
-        "movq %%mm1, 8(%3, %0)          \n\t"
-        "add $16, %0                    \n\t"
-        "cmp %4, %0                     \n\t"
-        " jb 1b                         \n\t"
-        : "+r" (i)
-        : "r" (src1), "r" (src2), "r" (dst), "r" ((x86_reg) w - 15));
+void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+                       intptr_t w);
+void ff_diff_bytes_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+                        intptr_t w);
+void ff_diff_bytes_avx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+                        intptr_t w);
 
-    for (; i < w; i++)
-        dst[i + 0] = src1[i + 0] - src2[i + 0];
-}
+#if HAVE_INLINE_ASM
 
 static void sub_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *src1,
-                                        const uint8_t *src2, int w,
+                                        const uint8_t *src2, intptr_t w,
                                         int *left, int *left_top)
 {
     x86_reg i = 0;
@@ -99,15 +82,23 @@ static void sub_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *src1,
 
 av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c)
 {
-#if HAVE_INLINE_ASM
-    int cpu_flags = av_get_cpu_flags();
+    av_unused int cpu_flags = av_get_cpu_flags();
 
-    if (INLINE_MMX(cpu_flags)) {
-        c->diff_bytes = diff_bytes_mmx;
+    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
+        c->diff_bytes = ff_diff_bytes_mmx;
     }
 
+#if HAVE_INLINE_ASM
     if (INLINE_MMXEXT(cpu_flags)) {
         c->sub_hfyu_median_pred = sub_hfyu_median_pred_mmxext;
     }
 #endif /* HAVE_INLINE_ASM */
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->diff_bytes = ff_diff_bytes_sse2;
+    }
+
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        c->diff_bytes = ff_diff_bytes_avx2;
+    }
 }
diff --git a/libavcodec/x86/idctdsp.asm b/libavcodec/x86/idctdsp.asm
new file mode 100644
index 0000000..089425a
--- /dev/null
+++ b/libavcodec/x86/idctdsp.asm
@@ -0,0 +1,183 @@
+;******************************************************************************
+;* SIMD-optimized IDCT-related routines
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pb_80
+
+SECTION .text
+
+;--------------------------------------------------------------------------
+; void ff_put_signed_pixels_clamped(const int16_t *block, uint8_t *pixels,
+;                                   ptrdiff_t line_size);
+;--------------------------------------------------------------------------
+
+%macro PUT_SIGNED_PIXELS_CLAMPED_HALF 1 ; %1 = byte offset into block; consumes 64 bytes, writes four rows
+    mova     m1, [blockq+mmsize*0+%1]   ; even-numbered mmsize-byte chunks of coefficients
+    mova     m2, [blockq+mmsize*2+%1]
+%if mmsize == 8
+    mova     m3, [blockq+mmsize*4+%1]   ; MMX needs four registers to cover the same 64 bytes
+    mova     m4, [blockq+mmsize*6+%1]
+%endif
+    packsswb m1, [blockq+mmsize*1+%1]   ; saturate int16 -> int8; odd chunks fill the upper half
+    packsswb m2, [blockq+mmsize*3+%1]
+%if mmsize == 8
+    packsswb m3, [blockq+mmsize*5+%1]
+    packsswb m4, [blockq+mmsize*7+%1]
+%endif
+    paddb    m1, m0                     ; add the per-byte constant held in m0 (loaded from pb_80 by the caller)
+    paddb    m2, m0
+%if mmsize == 8
+    paddb    m3, m0
+    paddb    m4, m0
+    movq     [pixelsq+lsizeq*0], m1     ; MMX: each register holds exactly one 8-byte row
+    movq     [pixelsq+lsizeq*1], m2
+    movq     [pixelsq+lsizeq*2], m3
+    movq     [pixelsq+lsize3q ], m4
+%else
+    movq     [pixelsq+lsizeq*0], m1     ; XMM: low qword is one row ...
+    movhps   [pixelsq+lsizeq*1], m1     ; ... and the high qword is the following row
+    movq     [pixelsq+lsizeq*2], m2
+    movhps   [pixelsq+lsize3q ], m2
+%endif
+%endmacro
+
+%macro PUT_SIGNED_PIXELS_CLAMPED 1 ; %1 = forwarded unchanged as cglobal's 4th argument
+cglobal put_signed_pixels_clamped, 3, 4, %1, block, pixels, lsize, lsize3
+    mova     m0, [pb_80]                 ; constant that the HALF macro adds to every output byte
+    lea      lsize3q, [lsizeq*3]         ; 3 * line_size, used to address the fourth row
+    PUT_SIGNED_PIXELS_CLAMPED_HALF 0     ; block bytes 0..63 -> first four rows
+    lea      pixelsq, [pixelsq+lsizeq*4] ; advance the destination by four rows
+    PUT_SIGNED_PIXELS_CLAMPED_HALF 64    ; block bytes 64..127 -> next four rows
+    RET
+%endmacro
+
+INIT_MMX mmx
+PUT_SIGNED_PIXELS_CLAMPED 0
+INIT_XMM sse2
+PUT_SIGNED_PIXELS_CLAMPED 3
+
+;--------------------------------------------------------------------------
+; void ff_put_pixels_clamped(const int16_t *block, uint8_t *pixels,
+;                            ptrdiff_t line_size);
+;--------------------------------------------------------------------------
+; %1 = byte offset into block; each expansion converts 64 bytes into four 8-byte rows
+%macro PUT_PIXELS_CLAMPED_HALF 1
+    mova     m0, [blockq+mmsize*0+%1]   ; even-numbered mmsize-byte chunks of coefficients
+    mova     m1, [blockq+mmsize*2+%1]
+%if mmsize == 8
+    mova     m2, [blockq+mmsize*4+%1]   ; MMX needs four registers to cover the same 64 bytes
+    mova     m3, [blockq+mmsize*6+%1]
+%endif
+    packuswb m0, [blockq+mmsize*1+%1]   ; saturate int16 -> uint8; odd chunks fill the upper half
+    packuswb m1, [blockq+mmsize*3+%1]
+%if mmsize == 8
+    packuswb m2, [blockq+mmsize*5+%1]
+    packuswb m3, [blockq+mmsize*7+%1]
+    movq           [pixelsq], m0        ; MMX: each register holds exactly one row
+    movq    [lsizeq+pixelsq], m1
+    movq  [2*lsizeq+pixelsq], m2
+    movq   [lsize3q+pixelsq], m3
+%else
+    movq           [pixelsq], m0        ; XMM: low qword is one row ...
+    movhps  [lsizeq+pixelsq], m0        ; ... and the high qword is the following row
+    movq  [2*lsizeq+pixelsq], m1
+    movhps [lsize3q+pixelsq], m1
+%endif
+%endmacro
+
+%macro PUT_PIXELS_CLAMPED 0 ; emits put_pixels_clamped for the currently selected instruction set
+cglobal put_pixels_clamped, 3, 4, 2, block, pixels, lsize, lsize3
+    lea lsize3q, [lsizeq*3]             ; 3 * line_size, used to address the fourth row
+    PUT_PIXELS_CLAMPED_HALF 0           ; block bytes 0..63 -> first four rows
+    lea pixelsq, [pixelsq+lsizeq*4]     ; advance the destination by four rows
+    PUT_PIXELS_CLAMPED_HALF 64          ; block bytes 64..127 -> next four rows
+    RET
+%endmacro
+
+INIT_MMX mmx
+PUT_PIXELS_CLAMPED
+INIT_XMM sse2
+PUT_PIXELS_CLAMPED
+
+;--------------------------------------------------------------------------
+; void ff_add_pixels_clamped(const int16_t *block, uint8_t *pixels,
+;                            ptrdiff_t line_size);
+;--------------------------------------------------------------------------
+; %1 = byte offset into block; each expansion adds 32 bytes of coefficients to two rows (expects m4 == 0)
+%macro ADD_PIXELS_CLAMPED 1
+    mova       m0, [blockq+mmsize*0+%1]
+    mova       m1, [blockq+mmsize*1+%1]
+%if mmsize == 8
+    mova       m5, [blockq+mmsize*2+%1] ; MMX: second row's coefficients need two more registers
+    mova       m6, [blockq+mmsize*3+%1]
+%endif
+    movq       m2, [pixelsq]            ; 8 existing pixels of the first row
+    movq       m3, [pixelsq+lsizeq]     ; 8 existing pixels of the second row
+%if mmsize == 8
+    mova       m7, m2                   ; keep a copy: low and high halves are widened separately
+    punpcklbw  m2, m4                   ; interleave with m4 (zero): bytes -> words
+    punpckhbw  m7, m4
+    paddsw     m0, m2                   ; signed saturating add of coefficients and pixels
+    paddsw     m1, m7
+    mova       m7, m3
+    punpcklbw  m3, m4
+    punpckhbw  m7, m4
+    paddsw     m5, m3
+    paddsw     m6, m7
+%else
+    punpcklbw  m2, m4                   ; interleave with m4 (zero): 8 bytes -> 8 words
+    punpcklbw  m3, m4
+    paddsw     m0, m2                   ; signed saturating add of coefficients and pixels
+    paddsw     m1, m3
+%endif
+    packuswb   m0, m1                   ; saturate the sums back to uint8
+%if mmsize == 8
+    packuswb   m5, m6
+    movq       [pixelsq], m0
+    movq       [pixelsq+lsizeq], m5
+%else
+    movq       [pixelsq], m0            ; low qword = first row
+    movhps     [pixelsq+lsizeq], m0     ; high qword = second row
+%endif
+%endmacro
+
+%macro ADD_PIXELS_CLAMPED 0 ; 0-argument wrapper; the invocations below with an argument use the 1-argument helper
+cglobal add_pixels_clamped, 3, 3, 5, block, pixels, lsize
+    pxor       m4, m4                       ; zero register the helper interleaves with to widen bytes
+    ADD_PIXELS_CLAMPED 0                    ; block bytes 0..31, two rows
+    lea        pixelsq, [pixelsq+lsizeq*2]  ; advance the destination by two rows
+    ADD_PIXELS_CLAMPED 32                   ; block bytes 32..63
+    lea        pixelsq, [pixelsq+lsizeq*2]
+    ADD_PIXELS_CLAMPED 64                   ; block bytes 64..95
+    lea        pixelsq, [pixelsq+lsizeq*2]
+    ADD_PIXELS_CLAMPED 96                   ; block bytes 96..127
+    RET
+%endmacro
+
+INIT_MMX mmx
+ADD_PIXELS_CLAMPED
+INIT_XMM sse2
+ADD_PIXELS_CLAMPED
diff --git a/libavcodec/x86/idctdsp.h b/libavcodec/x86/idctdsp.h
index 22df3dd..daa4e79 100644
--- a/libavcodec/x86/idctdsp.h
+++ b/libavcodec/x86/idctdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -20,12 +20,19 @@
 #define AVCODEC_X86_IDCTDSP_H
 
 #include <stdint.h>
+#include <stddef.h>
 
 void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                               int line_size);
+                               ptrdiff_t line_size);
+void ff_add_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
+                                ptrdiff_t line_size);
 void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                               int line_size);
+                               ptrdiff_t line_size);
+void ff_put_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
+                                ptrdiff_t line_size);
 void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                                      int line_size);
+                                      ptrdiff_t line_size);
+void ff_put_signed_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
+                                       ptrdiff_t line_size);
 
 #endif /* AVCODEC_X86_IDCTDSP_H */
diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index 853c6a3..bcf7e5b 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -64,12 +64,10 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
     int cpu_flags = av_get_cpu_flags();
 
     if (INLINE_MMX(cpu_flags)) {
-        c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
-        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
-        c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;
-
         if (!high_bit_depth &&
+            avctx->lowres == 0 &&
             (avctx->idct_algo == FF_IDCT_AUTO ||
+             avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
              avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
                 c->idct_put  = ff_simple_idct_put_mmx;
                 c->idct_add  = ff_simple_idct_add_mmx;
@@ -77,4 +75,52 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
                 c->perm_type = FF_IDCT_PERM_SIMPLE;
         }
     }
+    if (EXTERNAL_MMX(cpu_flags)) {
+        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
+        c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
+        c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;
+    }
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2;
+        c->put_pixels_clamped        = ff_put_pixels_clamped_sse2;
+        c->add_pixels_clamped        = ff_add_pixels_clamped_sse2;
+    }
+
+    if (ARCH_X86_64 && avctx->lowres == 0) {
+        /* 10/12-bit simple IDCT; these paths set idct_add to NULL. */
+        if (avctx->bits_per_raw_sample == 10 &&
+            (avctx->idct_algo == FF_IDCT_AUTO ||
+             avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+             avctx->idct_algo == FF_IDCT_SIMPLE)) {
+            if (EXTERNAL_SSE2(cpu_flags)) {
+                c->idct_put  = ff_simple_idct10_put_sse2;
+                c->idct_add  = NULL;
+                c->idct      = ff_simple_idct10_sse2;
+                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+            }
+            if (EXTERNAL_AVX(cpu_flags)) {
+                c->idct_put  = ff_simple_idct10_put_avx;
+                c->idct_add  = NULL;
+                c->idct      = ff_simple_idct10_avx;
+                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+            }
+        }
+
+        if (avctx->bits_per_raw_sample == 12 &&
+            (avctx->idct_algo == FF_IDCT_AUTO ||
+             avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
+            if (EXTERNAL_SSE2(cpu_flags)) {
+                c->idct_put  = ff_simple_idct12_put_sse2;
+                c->idct_add  = NULL;
+                c->idct      = ff_simple_idct12_sse2;
+                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+            }
+            if (EXTERNAL_AVX(cpu_flags)) {
+                c->idct_put  = ff_simple_idct12_put_avx;
+                c->idct_add  = NULL;
+                c->idct      = ff_simple_idct12_avx;
+                c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+            }
+        }
+    }
 }
diff --git a/libavcodec/x86/idctdsp_mmx.c b/libavcodec/x86/idctdsp_mmx.c
deleted file mode 100644
index 7285b1d..0000000
--- a/libavcodec/x86/idctdsp_mmx.c
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * SIMD-optimized IDCT-related routines
- * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "idctdsp.h"
-#include "inline_asm.h"
-
-#if HAVE_INLINE_ASM
-
-void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                               int line_size)
-{
-    const int16_t *p;
-    uint8_t *pix;
-
-    /* read the pixels */
-    p   = block;
-    pix = pixels;
-    /* unrolled loop */
-    __asm__ volatile (
-        "movq      (%3), %%mm0          \n\t"
-        "movq     8(%3), %%mm1          \n\t"
-        "movq    16(%3), %%mm2          \n\t"
-        "movq    24(%3), %%mm3          \n\t"
-        "movq    32(%3), %%mm4          \n\t"
-        "movq    40(%3), %%mm5          \n\t"
-        "movq    48(%3), %%mm6          \n\t"
-        "movq    56(%3), %%mm7          \n\t"
-        "packuswb %%mm1, %%mm0          \n\t"
-        "packuswb %%mm3, %%mm2          \n\t"
-        "packuswb %%mm5, %%mm4          \n\t"
-        "packuswb %%mm7, %%mm6          \n\t"
-        "movq     %%mm0, (%0)           \n\t"
-        "movq     %%mm2, (%0, %1)       \n\t"
-        "movq     %%mm4, (%0, %1, 2)    \n\t"
-        "movq     %%mm6, (%0, %2)       \n\t"
-        :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
-           "r" (p)
-        : "memory");
-    pix += line_size * 4;
-    p   += 32;
-
-    // if here would be an exact copy of the code above
-    // compiler would generate some very strange code
-    // thus using "r"
-    __asm__ volatile (
-        "movq       (%3), %%mm0         \n\t"
-        "movq      8(%3), %%mm1         \n\t"
-        "movq     16(%3), %%mm2         \n\t"
-        "movq     24(%3), %%mm3         \n\t"
-        "movq     32(%3), %%mm4         \n\t"
-        "movq     40(%3), %%mm5         \n\t"
-        "movq     48(%3), %%mm6         \n\t"
-        "movq     56(%3), %%mm7         \n\t"
-        "packuswb  %%mm1, %%mm0         \n\t"
-        "packuswb  %%mm3, %%mm2         \n\t"
-        "packuswb  %%mm5, %%mm4         \n\t"
-        "packuswb  %%mm7, %%mm6         \n\t"
-        "movq      %%mm0, (%0)          \n\t"
-        "movq      %%mm2, (%0, %1)      \n\t"
-        "movq      %%mm4, (%0, %1, 2)   \n\t"
-        "movq      %%mm6, (%0, %2)      \n\t"
-        :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
-           "r" (p)
-        : "memory");
-}
-
-#define put_signed_pixels_clamped_mmx_half(off)             \
-    "movq          "#off"(%2), %%mm1        \n\t"           \
-    "movq     16 + "#off"(%2), %%mm2        \n\t"           \
-    "movq     32 + "#off"(%2), %%mm3        \n\t"           \
-    "movq     48 + "#off"(%2), %%mm4        \n\t"           \
-    "packsswb  8 + "#off"(%2), %%mm1        \n\t"           \
-    "packsswb 24 + "#off"(%2), %%mm2        \n\t"           \
-    "packsswb 40 + "#off"(%2), %%mm3        \n\t"           \
-    "packsswb 56 + "#off"(%2), %%mm4        \n\t"           \
-    "paddb              %%mm0, %%mm1        \n\t"           \
-    "paddb              %%mm0, %%mm2        \n\t"           \
-    "paddb              %%mm0, %%mm3        \n\t"           \
-    "paddb              %%mm0, %%mm4        \n\t"           \
-    "movq               %%mm1, (%0)         \n\t"           \
-    "movq               %%mm2, (%0, %3)     \n\t"           \
-    "movq               %%mm3, (%0, %3, 2)  \n\t"           \
-    "movq               %%mm4, (%0, %1)     \n\t"
-
-void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                                      int line_size)
-{
-    x86_reg line_skip = line_size;
-    x86_reg line_skip3;
-
-    __asm__ volatile (
-        "movq "MANGLE(ff_pb_80)", %%mm0     \n\t"
-        "lea         (%3, %3, 2), %1        \n\t"
-        put_signed_pixels_clamped_mmx_half(0)
-        "lea         (%0, %3, 4), %0        \n\t"
-        put_signed_pixels_clamped_mmx_half(64)
-        : "+&r" (pixels), "=&r" (line_skip3)
-        : "r" (block), "r" (line_skip)
-        : "memory");
-}
-
-void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                               int line_size)
-{
-    const int16_t *p;
-    uint8_t *pix;
-    int i;
-
-    /* read the pixels */
-    p   = block;
-    pix = pixels;
-    MOVQ_ZERO(mm7);
-    i = 4;
-    do {
-        __asm__ volatile (
-            "movq        (%2), %%mm0    \n\t"
-            "movq       8(%2), %%mm1    \n\t"
-            "movq      16(%2), %%mm2    \n\t"
-            "movq      24(%2), %%mm3    \n\t"
-            "movq          %0, %%mm4    \n\t"
-            "movq          %1, %%mm6    \n\t"
-            "movq       %%mm4, %%mm5    \n\t"
-            "punpcklbw  %%mm7, %%mm4    \n\t"
-            "punpckhbw  %%mm7, %%mm5    \n\t"
-            "paddsw     %%mm4, %%mm0    \n\t"
-            "paddsw     %%mm5, %%mm1    \n\t"
-            "movq       %%mm6, %%mm5    \n\t"
-            "punpcklbw  %%mm7, %%mm6    \n\t"
-            "punpckhbw  %%mm7, %%mm5    \n\t"
-            "paddsw     %%mm6, %%mm2    \n\t"
-            "paddsw     %%mm5, %%mm3    \n\t"
-            "packuswb   %%mm1, %%mm0    \n\t"
-            "packuswb   %%mm3, %%mm2    \n\t"
-            "movq       %%mm0, %0       \n\t"
-            "movq       %%mm2, %1       \n\t"
-            : "+m" (*pix), "+m" (*(pix + line_size))
-            : "r" (p)
-            : "memory");
-        pix += line_size * 2;
-        p   += 16;
-    } while (--i);
-}
-
-#endif /* HAVE_INLINE_ASM */
diff --git a/libavcodec/x86/imdct36.asm b/libavcodec/x86/imdct36.asm
index f85e2e4..409b2c5 100644
--- a/libavcodec/x86/imdct36.asm
+++ b/libavcodec/x86/imdct36.asm
@@ -2,20 +2,20 @@
 ;* 36 point SSE-optimized IMDCT transform
 ;* Copyright (c) 2011 Vitor Sessak
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -50,7 +50,7 @@ ps_cosh_sse3:  dd 1.0, -0.50190991877167369479,  1.0, -5.73685662283492756461
                dd 1.0, -0.51763809020504152469,  1.0, -1.93185165257813657349
                dd 1.0, -0.55168895948124587824, -1.0,  1.18310079157624925896
                dd 1.0, -0.61038729438072803416, -1.0,  0.87172339781054900991
-               dd 1.0,  0.70710678118654752439,  0.0,  0.0
+               dd 1.0, -0.70710678118654752439,  0.0,  0.0
 
 costabs:  times 4 dd  0.98480773
           times 4 dd  0.93969262
@@ -129,7 +129,26 @@ SECTION .text
 %endif
 %endmacro
 
+%macro BUTTERF2 3 ; %1 = in/out register, %2 = scratch register, %3 = byte offset into the cosine table
+%if cpuflag(sse3)
+    mulps    %1, %1, [ps_cosh_sse3 + %3]
+    PSHUFD   %2, %1, 0xe1               ; %2 = %1 with its two lowest elements swapped
+    addsubps %1, %1, %2                 ; subtracts in even lanes, adds in odd lanes
+%else
+    mulps    %1, [ps_cosh + %3]
+    PSHUFD   %2, %1, 0xe1               ; %2 = %1 with its two lowest elements swapped
+    xorps    %1, [ps_p1m1p1m1]          ; presumably a +,-,+,- sign mask (defined outside this hunk) -- confirm
+    addps    %1, %2
+%endif
+%endmacro
+
 %macro STORE 4
+%if cpuflag(sse4)
+    movss     [%3       ], %1
+    extractps [%3 +   %4], %1, 1
+    extractps [%3 + 2*%4], %1, 2
+    extractps [%3 + 3*%4], %1, 3
+%else
     movhlps %2, %1
     movss   [%3       ], %1
     movss   [%3 + 2*%4], %2
@@ -137,6 +156,7 @@ SECTION .text
     movss   [%3 +   %4], %1
     movhlps %2, %1
     movss   [%3 + 3*%4], %2
+%endif
 %endmacro
 
 %macro LOAD 4
@@ -279,11 +299,7 @@ cglobal imdct36_float, 4,4,9, out, buf, in, win
     BUTTERF  m7, m2, 16
     BUTTERF  m3, m6, 32
     BUTTERF  m4, m1, 48
-
-    mulps   m5, m5, [ps_cosh + 64]
-    PSHUFD  m1, m5, 0xe1
-    xorps   m5, m5, [ps_p1m1p1m1]
-    addps   m5, m5, m1
+    BUTTERF2 m5, m1, 64
 
     ; permutates:
     ; m0    0  1  2  3     =>     2  6 10 14   m1
@@ -358,8 +374,10 @@ cglobal imdct36_float, 4,4,9, out, buf, in, win
     RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_XMM sse
 DEFINE_IMDCT
+%endif
 
 INIT_XMM sse2
 DEFINE_IMDCT
@@ -370,8 +388,10 @@ DEFINE_IMDCT
 INIT_XMM ssse3
 DEFINE_IMDCT
 
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEFINE_IMDCT
+%endif
 
 INIT_XMM sse
 
@@ -716,5 +736,7 @@ cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
 INIT_XMM sse
 DEFINE_FOUR_IMDCT
 
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEFINE_FOUR_IMDCT
+%endif
diff --git a/libavcodec/x86/inline_asm.h b/libavcodec/x86/inline_asm.h
index fc554bf..0198746 100644
--- a/libavcodec/x86/inline_asm.h
+++ b/libavcodec/x86/inline_asm.h
@@ -1,20 +1,20 @@
 /*
  * inline assembly helper macros
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,7 +37,7 @@
         "paddb   %%"#regd", %%"#regd"   \n\t" ::)
 
 #ifndef PIC
-#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
+#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_pw_2))
 #else
 // for shared library it's better to use this way for accessing constants
 // pcmpeqd -> -1
diff --git a/libavcodec/x86/jpeg2000dsp.asm b/libavcodec/x86/jpeg2000dsp.asm
new file mode 100644
index 0000000..56b5fbd
--- /dev/null
+++ b/libavcodec/x86/jpeg2000dsp.asm
@@ -0,0 +1,144 @@
+;******************************************************************************
+;* SIMD-optimized JPEG2000 DSP functions
+;* Copyright (c) 2014 Nicolas Bertrand
+;* Copyright (c) 2015 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pf_ict0: times 8 dd 1.402
+pf_ict1: times 8 dd 0.34413
+pf_ict2: times 8 dd 0.71414
+pf_ict3: times 8 dd 1.772
+
+SECTION .text
+
+;***********************************************************************
+; ff_ict_float_<opt>(float *src0, float *src1, float *src2, int csize)
+;***********************************************************************
+%macro ICT_FLOAT 1 ; %1 = forwarded unchanged as cglobal's 4th argument
+cglobal ict_float, 4, 4, %1, src0, src1, src2, csize
+    shl  csized, 2                  ; scale csize by 4 so it can be used as a byte offset
+    add   src0q, csizeq             ; move each pointer to the end of its buffer ...
+    add   src1q, csizeq
+    add   src2q, csizeq
+    neg  csizeq                     ; ... and index with a negative offset that counts up towards 0
+    movaps   m6, [pf_ict0]
+    movaps   m7, [pf_ict1]
+    %define ICT0 m6
+    %define ICT1 m7
+
+%if ARCH_X86_64
+    movaps   m8, [pf_ict2]          ; x86-64: keep the remaining constants in registers m8/m9
+    %define ICT2 m8
+%if cpuflag(avx)
+    movaps   m3, [pf_ict3]          ; m3 is free in the AVX loop body; only the SSE body uses it as a temporary
+    %define ICT3 m3
+%else
+    movaps   m9, [pf_ict3]
+    %define ICT3 m9
+%endif
+
+%else ; ARCH_X86_32
+    %define ICT2 [pf_ict2]
+%if cpuflag(avx)
+    movaps   m3, [pf_ict3]          ; m3 is free in the AVX loop body; only the SSE body uses it as a temporary
+    %define ICT3 m3
+%else
+    %define ICT3 [pf_ict3]
+%endif
+
+%endif ; ARCH
+
+align 16
+.loop:
+    movaps   m0, [src0q+csizeq]     ; aligned loads of mmsize bytes from each buffer
+    movaps   m1, [src1q+csizeq]
+    movaps   m2, [src2q+csizeq]
+
+%if cpuflag(avx)
+    mulps    m5, m1, ICT1           ; m5 = src1 * pf_ict1
+    mulps    m4, m2, ICT0           ; m4 = src2 * pf_ict0
+    mulps    m1, m1, ICT3           ; m1 = src1 * pf_ict3
+    mulps    m2, m2, ICT2           ; m2 = src2 * pf_ict2
+    subps    m5, m0, m5             ; m5 = src0 - src1 * pf_ict1
+%else ; sse
+    movaps   m3, m1                 ; two-operand forms overwrite their destination, so copy first
+    movaps   m4, m2
+    movaps   m5, m0
+    mulps    m3, ICT1               ; m3 = src1 * pf_ict1
+    mulps    m4, ICT0               ; m4 = src2 * pf_ict0
+    mulps    m1, ICT3               ; m1 = src1 * pf_ict3
+    mulps    m2, ICT2               ; m2 = src2 * pf_ict2
+    subps    m5, m3                 ; m5 = src0 - src1 * pf_ict1
+%endif
+    addps    m4, m4, m0             ; m4 = src0 + src2 * pf_ict0
+    addps    m0, m0, m1             ; m0 = src0 + src1 * pf_ict3
+    subps    m5, m5, m2             ; m5 = src0 - src1 * pf_ict1 - src2 * pf_ict2
+
+    movaps   [src0q+csizeq], m4     ; results are written back in place
+    movaps   [src2q+csizeq], m0
+    movaps   [src1q+csizeq], m5
+    add  csizeq, mmsize             ; advance by one vector
+    jl .loop                        ; continue while the offset is still negative
+    REP_RET
+%endmacro
+
+INIT_XMM sse
+ICT_FLOAT 10
+INIT_YMM avx
+ICT_FLOAT 9
+
+;***************************************************************************
+; ff_rct_int_<opt>(int32_t *src0, int32_t *src1, int32_t *src2, int csize)
+;***************************************************************************
+%macro RCT_INT 0 ; emits rct_int for the currently selected instruction set
+cglobal rct_int, 4, 4, 4, src0, src1, src2, csize
+    shl  csized, 2                  ; scale csize by 4 so it can be used as a byte offset
+    add   src0q, csizeq             ; move each pointer to the end of its buffer ...
+    add   src1q, csizeq
+    add   src2q, csizeq
+    neg  csizeq                     ; ... and index with a negative offset that counts up towards 0
+
+align 16
+.loop:
+    mova   m1, [src1q+csizeq]       ; aligned loads of mmsize bytes from each buffer
+    mova   m2, [src2q+csizeq]
+    mova   m0, [src0q+csizeq]
+    paddd  m3, m1, m2               ; m3 = src1 + src2
+    psrad  m3, 2                    ; m3 = (src1 + src2) >> 2, arithmetic shift
+    psubd  m0, m3                   ; m0 = src0 - ((src1 + src2) >> 2)
+    paddd  m1, m0                   ; m1 = src1 + m0
+    paddd  m2, m0                   ; m2 = src2 + m0
+    mova   [src1q+csizeq], m0       ; results are written back in place, to different buffers
+    mova   [src2q+csizeq], m1
+    mova   [src0q+csizeq], m2
+    add  csizeq, mmsize             ; advance by one vector
+    jl .loop                        ; continue while the offset is still negative
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+RCT_INT
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+RCT_INT
+%endif
diff --git a/libavcodec/x86/jpeg2000dsp_init.c b/libavcodec/x86/jpeg2000dsp_init.c
new file mode 100644
index 0000000..baa8138
--- /dev/null
+++ b/libavcodec/x86/jpeg2000dsp_init.c
@@ -0,0 +1,50 @@
+/*
+ * SIMD optimized JPEG 2000 DSP functions
+ * Copyright (c) 2015 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/jpeg2000dsp.h"
+
+void ff_ict_float_sse(void *src0, void *src1, void *src2, int csize);
+void ff_ict_float_avx(void *src0, void *src1, void *src2, int csize);
+void ff_rct_int_sse2 (void *src0, void *src1, void *src2, int csize);
+void ff_rct_int_avx2 (void *src0, void *src1, void *src2, int csize);
+
+av_cold void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c)
+{
+    const int flags = av_get_cpu_flags();
+    /* FF_DWT97 slot: SSE first, then AVX overrides it when available. */
+    if (EXTERNAL_SSE(flags)) {
+        c->mct_decode[FF_DWT97] = ff_ict_float_sse;
+    }
+    if (EXTERNAL_AVX_FAST(flags)) {
+        c->mct_decode[FF_DWT97] = ff_ict_float_avx;
+    }
+
+    /* FF_DWT53 slot: SSE2 first, then AVX2 overrides it when available. */
+    if (EXTERNAL_SSE2(flags)) {
+        c->mct_decode[FF_DWT53] = ff_rct_int_sse2;
+    }
+    if (EXTERNAL_AVX2_FAST(flags)) {
+        c->mct_decode[FF_DWT53] = ff_rct_int_avx2;
+    }
+}
diff --git a/libavcodec/x86/apedsp.asm b/libavcodec/x86/lossless_audiodsp.asm
index d6abd98..063d7b4 100644
--- a/libavcodec/x86/apedsp.asm
+++ b/libavcodec/x86/lossless_audiodsp.asm
@@ -1,20 +1,20 @@
 ;******************************************************************************
 ;* Copyright (c) 2008 Loren Merritt
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -58,14 +58,7 @@ cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
     mova    [v1q + orderq + mmsize], m3
     add     orderq, mmsize*2
     jl .loop
-%if mmsize == 16
-    movhlps m0, m6
-    paddd   m6, m0
-    pshuflw m0, m6, 0x4e
-%else
-    pshufw  m0, m6, 0x4e
-%endif
-    paddd   m6, m0
+    HADDD   m6, m0
     movd   eax, m6
     RET
 %endmacro
@@ -75,6 +68,39 @@ SCALARPRODUCT
 INIT_XMM sse2
 SCALARPRODUCT
 
+INIT_XMM sse4
+; int ff_scalarproduct_and_madd_int32(int16_t *v1, int32_t *v2, int16_t *v3,
+;                                     int order, int mul)
+cglobal scalarproduct_and_madd_int32, 4,4,8, v1, v2, v3, order, mul
+    shl orderq, 1                   ; element count -> byte length of the 16-bit vectors
+    movd    m7, mulm
+    SPLATW  m7, m7                  ; m7 = mul in every 16-bit lane
+    pxor    m6, m6                  ; m6 = dword accumulators for the dot product
+    add v1q, orderq                 ; move all pointers to the end of their vectors ...
+    lea v2q, [v2q + 2*orderq]       ; ... v2 has 32-bit elements, hence twice the bytes
+    add v3q, orderq
+    neg orderq                      ; ... and index with a negative offset rising to 0
+.loop:
+    mova    m3, [v1q + orderq]      ; 8 words of v1 (aligned load)
+    movu    m0, [v2q + 2*orderq]    ; v2 dwords for lanes 0-3
+    pmovsxwd m4, m3                 ; sign-extend v1 words 0-3 to dwords
+    movu    m1, [v2q + 2*orderq + mmsize] ; v2 dwords for lanes 4-7
+    movhlps m5, m3                  ; bring v1 words 4-7 into the low half
+    movu    m2, [v3q + orderq]      ; 8 words of v3
+    pmovsxwd m5, m5                 ; sign-extend v1 words 4-7 to dwords
+    pmullw  m2, m7                  ; v3 * mul, low 16 bits of each product
+    pmulld  m0, m4                  ; v1 * v2, lanes 0-3
+    pmulld  m1, m5                  ; v1 * v2, lanes 4-7
+    paddw   m2, m3                  ; v1 + v3 * mul
+    paddd   m6, m0                  ; accumulate the products (uses v1 before the update)
+    paddd   m6, m1
+    mova    [v1q + orderq], m2      ; write the updated v1 back
+    add     orderq, 16              ; next 8 words
+    jl .loop
+    HADDD   m6, m0                  ; horizontal add of the dword lanes, m0 is scratch
+    movd   eax, m6                  ; return the scalar product
+    RET
+
 %macro SCALARPRODUCT_LOOP 1
 align 16
 .loop%1:
@@ -159,9 +185,6 @@ SCALARPRODUCT_LOOP 4
 SCALARPRODUCT_LOOP 2
 SCALARPRODUCT_LOOP 0
 .end:
-    movhlps m0, m6
-    paddd   m6, m0
-    pshuflw m0, m6, 0x4e
-    paddd   m6, m0
+    HADDD   m6, m0
     movd   eax, m6
     RET
diff --git a/libavcodec/x86/apedsp_init.c b/libavcodec/x86/lossless_audiodsp_init.c
index f692c2b..10b6a65 100644
--- a/libavcodec/x86/apedsp_init.c
+++ b/libavcodec/x86/lossless_audiodsp_init.c
@@ -1,25 +1,25 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/cpu.h"
-#include "libavcodec/apedsp.h"
+#include "libavcodec/lossless_audiodsp.h"
 
 int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
                                                const int16_t *v3,
@@ -31,8 +31,13 @@ int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
                                               const int16_t *v3,
                                               int order, int mul);
 
-av_cold void ff_apedsp_init_x86(APEDSPContext *c)
+int32_t ff_scalarproduct_and_madd_int32_sse4(int16_t *v1, const int32_t *v2,
+                                             const int16_t *v3,
+                                             int order, int mul);
+
+av_cold void ff_llauddsp_init_x86(LLAudDSPContext *c)
 {
+#if HAVE_YASM
     int cpu_flags = av_get_cpu_flags();
 
     if (EXTERNAL_MMXEXT(cpu_flags))
@@ -44,4 +49,8 @@ av_cold void ff_apedsp_init_x86(APEDSPContext *c)
     if (EXTERNAL_SSSE3(cpu_flags) &&
         !(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
         c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
+
+    if (EXTERNAL_SSE4(cpu_flags))
+        c->scalarproduct_and_madd_int32 = ff_scalarproduct_and_madd_int32_sse4;
+#endif
 }
diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
new file mode 100644
index 0000000..f06fcdf
--- /dev/null
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -0,0 +1,294 @@
+;******************************************************************************
+;* SIMD lossless video DSP utils
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2014 Michael Niedermayer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pb_ef: times 8 db 14,15
+pb_67: times 8 db  6, 7
+pb_zzzz2323zzzzabab: db -1,-1,-1,-1, 2, 3, 2, 3,-1,-1,-1,-1,10,11,10,11
+pb_zzzzzzzz67676767: db -1,-1,-1,-1,-1,-1,-1,-1, 6, 7, 6, 7, 6, 7, 6, 7
+
+SECTION .text
+
+%macro INT16_LOOP 2 ; %1 = a/u (aligned/unaligned), %2 = add/sub
+    movd    m4, maskd
+    SPLATW  m4, m4                  ; m4 = mask in every 16-bit lane
+    add     wd, wd                  ; width in words -> length in bytes
+    test    wq, 2*mmsize - 1        ; is the length a whole number of vector pairs?
+    jz %%.tomainloop                ; yes: no scalar tail to process
+    push  tmpq                      ; preserve tmp around the scalar loop
+%%.wordloop:
+    sub     wq, 2                   ; walk backwards from the end, one word at a time
+%ifidn %2, add
+    mov   tmpw, [srcq+wq]
+    add   tmpw, [dstq+wq]           ; dst + src
+%else
+    mov   tmpw, [src1q+wq]
+    sub   tmpw, [src2q+wq]          ; src1 - src2
+%endif
+    and   tmpw, maskw               ; keep only the bits selected by mask
+    mov     [dstq+wq], tmpw
+    test    wq, 2*mmsize - 1        ; stop once the remaining length fits the vector loop
+    jnz %%.wordloop
+    pop   tmpq
+%%.tomainloop:
+%ifidn %2, add
+    add     srcq, wq                ; point past the vector part ...
+%else
+    add     src1q, wq               ; point past the vector part ...
+    add     src2q, wq
+%endif
+    add     dstq, wq
+    neg     wq                      ; ... and index it with a negative offset rising to 0
+    jz      %%.end                  ; nothing left for the vector loop
+%%.loop:
+%ifidn %2, add
+    mov%1   m0, [srcq+wq]           ; two vectors are processed per iteration
+    mov%1   m1, [dstq+wq]
+    mov%1   m2, [srcq+wq+mmsize]
+    mov%1   m3, [dstq+wq+mmsize]
+%else
+    mov%1   m0, [src1q+wq]          ; two vectors are processed per iteration
+    mov%1   m1, [src2q+wq]
+    mov%1   m2, [src1q+wq+mmsize]
+    mov%1   m3, [src2q+wq+mmsize]
+%endif
+    p%2w    m0, m1                  ; wrapping word-wise add or subtract
+    p%2w    m2, m3
+    pand    m0, m4                  ; apply the mask to every lane
+    pand    m2, m4
+    mov%1   [dstq+wq]       , m0
+    mov%1   [dstq+wq+mmsize], m2
+    add     wq, 2*mmsize
+    jl %%.loop
+%%.end:
+    RET                             ; the macro returns from the enclosing function
+%endmacro
+
+INIT_MMX mmx
+cglobal add_int16, 4,4,5, dst, src, mask, w, tmp ; dst[i] = (dst[i] + src[i]) & mask
+    INT16_LOOP a, add
+
+INIT_XMM sse2
+cglobal add_int16, 4,4,5, dst, src, mask, w, tmp ; dst[i] = (dst[i] + src[i]) & mask
+    test srcq, mmsize-1             ; aligned moves need both pointers mmsize-aligned
+    jnz .unaligned
+    test dstq, mmsize-1
+    jnz .unaligned
+    INT16_LOOP a, add
+.unaligned:                         ; not reached by fall-through: INT16_LOOP ends in RET
+    INT16_LOOP u, add
+
+INIT_MMX mmx
+cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp ; dst[i] = (src1[i] - src2[i]) & mask
+    INT16_LOOP a, sub
+
+INIT_XMM sse2
+cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp ; dst[i] = (src1[i] - src2[i]) & mask
+    test src1q, mmsize-1            ; aligned moves need all three pointers mmsize-aligned
+    jnz .unaligned
+    test src2q, mmsize-1
+    jnz .unaligned
+    test dstq, mmsize-1
+    jnz .unaligned
+    INT16_LOOP a, sub
+.unaligned:                         ; not reached by fall-through: INT16_LOOP ends in RET
+    INT16_LOOP u, sub
+
+
+%macro ADD_HFYU_LEFT_LOOP_INT16 2 ; %1 = dst alignment (a/u), %2 = src alignment (a/u)
+    add     wd, wd                  ; width in words -> length in bytes
+    add     srcq, wq                ; point both buffers at the row end ...
+    add     dstq, wq
+    neg     wq                      ; ... and index with a negative offset rising to 0
+%%.loop:
+    mov%2   m1, [srcq+wq]           ; one vector of src words
+    mova    m2, m1
+    pslld   m1, 16                  ; move each even word up to its odd neighbour
+    paddw   m1, m2                  ; odd words now hold the sum of their pair
+    mova    m2, m1
+
+    pshufb  m1, m3                  ; copy word 1 of each qword into its words 2-3, zero the rest
+    paddw   m1, m2                  ; running sums within each group of four words
+    pshufb  m0, m5                  ; broadcast the last word of the previous result (running left)
+%if mmsize == 16
+    mova    m2, m1
+    pshufb  m1, m4                  ; copy word 3 into words 4-7, zero the low half
+    paddw   m1, m2                  ; running sums across the whole vector
+%endif
+    paddw   m0, m1                  ; add the carried-in left value to every running sum
+    pand    m0, m7                  ; apply the mask to every lane
+%ifidn %1, a
+    mova    [dstq+wq], m0
+%else
+    movq    [dstq+wq], m0           ; unaligned dst: store as two 8-byte halves
+    movhps  [dstq+wq+8], m0
+%endif
+    add     wq, mmsize
+    jl %%.loop
+    mov     eax, mmsize-1           ; wq is now the number of bytes the last vector overshot the row
+    sub     eax, wd                 ; eax = byte index of the high byte of the last in-row word
+    mov     wd, eax
+    shl     wd, 8
+    lea     eax, [wd+eax-1]         ; pshufb control: byte 0 <- low byte, byte 1 <- high byte of that word
+    movd    m1, eax
+    pshufb  m0, m1                  ; NOTE(review): control bytes 2+ are 0, so they replicate byte 0 of m0
+    movd    eax, m0                 ; return value; presumably only the low 16 bits matter - confirm
+    RET
+%endmacro
+
+; int add_hfyu_left_pred_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int left)
+INIT_MMX ssse3
+cglobal add_hfyu_left_pred_int16, 4,4,8, dst, src, mask, w, left
+.skip_prologue:
+    mova    m5, [pb_67]                 ; shuffle: broadcast the top word of an 8-byte register
+    mova    m3, [pb_zzzz2323zzzzabab]   ; shuffle: word 1 -> words 2-3 (low 8 bytes of the table)
+    movd    m0, leftm
+    psllq   m0, 48                      ; place left in the top word, where the loop expects it
+    movd    m7, maskm
+    SPLATW  m7 ,m7                      ; m7 = mask in every 16-bit lane
+    ADD_HFYU_LEFT_LOOP_INT16 a, a
+
+INIT_XMM sse4
+cglobal add_hfyu_left_pred_int16, 4,4,8, dst, src, mask, w, left
+    mova    m5, [pb_ef]                 ; shuffle: broadcast the top word of a 16-byte register
+    mova    m4, [pb_zzzzzzzz67676767]   ; shuffle: word 3 -> words 4-7
+    mova    m3, [pb_zzzz2323zzzzabab]   ; shuffle: word 1 -> words 2-3, word 5 -> words 6-7
+    movd    m0, leftm
+    pslldq  m0, 14                      ; place left in the top word, where the loop expects it
+    movd    m7, maskm
+    SPLATW  m7 ,m7                      ; m7 = mask in every 16-bit lane
+    test    srcq, 15                    ; pick the loop variant from the pointer alignment
+    jnz .src_unaligned
+    test    dstq, 15
+    jnz .dst_unaligned
+    ADD_HFYU_LEFT_LOOP_INT16 a, a
+.dst_unaligned:                         ; each loop variant ends in RET, so there is no fall-through
+    ADD_HFYU_LEFT_LOOP_INT16 u, a
+.src_unaligned:                         ; unaligned src: dst is treated as unaligned as well
+    ADD_HFYU_LEFT_LOOP_INT16 u, u
+
+; void add_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top)
+INIT_MMX mmxext
+cglobal add_hfyu_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_top
+    add      wd, wd                 ; width in words -> length in bytes
+    movd    mm6, maskd
+    SPLATW  mm6, mm6                ; mm6 = mask in every 16-bit lane
+    movq    mm0, [topq]             ; first four top words
+    movq    mm2, mm0
+    movd    mm4, [left_topq]        ; assumes *left_top fits in 16 bits - TODO confirm
+    psllq   mm2, 16
+    movq    mm1, mm0                ; t
+    por     mm4, mm2                ; tl = top shifted up one word with *left_top in word 0
+    movd    mm3, [leftq]            ; l
+    psubw   mm0, mm4 ; t-tl
+    add    dstq, wq                 ; point all buffers at the row end ...
+    add    topq, wq
+    add   diffq, wq
+    neg      wq                     ; ... and index with a negative offset rising to 0
+    jmp .skip                       ; t and t-tl for the first group are already set up
+.loop:
+    movq    mm4, [topq+wq]
+    movq    mm0, mm4
+    psllq   mm4, 16
+    por     mm4, mm1                ; tl: word 0 is the last top word of the previous group
+    movq    mm1, mm0 ; t
+    psubw   mm0, mm4 ; t-tl
+.skip:
+    movq    mm2, [diffq+wq]         ; four residual words
+%assign i 0
+%rep 4
+    movq    mm4, mm0                ; one output word per pass: each result is the next l
+    paddw   mm4, mm3 ; t-tl+l
+    pand    mm4, mm6
+    movq    mm5, mm3
+    pmaxsw  mm3, mm1                ; max(l, t)
+    pminsw  mm5, mm1                ; min(l, t)
+    pminsw  mm3, mm4
+    pmaxsw  mm3, mm5 ; median
+    paddw   mm3, mm2 ; +residual
+    pand    mm3, mm6
+%if i==0
+    movq    mm7, mm3
+    psllq   mm7, 48                 ; mm7 collects the results, newest in the top word
+%else
+    movq    mm4, mm3
+    psrlq   mm7, 16                 ; make room: older results move down one word
+    psllq   mm4, 48
+    por     mm7, mm4
+%endif
+%if i<3
+    psrlq   mm0, 16                 ; bring the next word of t-tl, t and diff into lane 0
+    psrlq   mm1, 16
+    psrlq   mm2, 16
+%endif
+%assign i i+1
+%endrep
+    movq [dstq+wq], mm7             ; store the four reconstructed words
+    add      wq, 8
+    jl .loop
+    movzx   r2d, word [dstq-2]      ; r2 (the diff argument) is reused as scratch
+    mov [leftq], r2d                ; *left = last word of dst
+    movzx   r2d, word [topq-2]
+    mov [left_topq], r2d            ; *left_top = last word of top
+    RET
+
+cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_top
+    add      wd, wd                 ; width in words -> length in bytes
+    movd    mm7, maskd
+    SPLATW  mm7, mm7                ; mm7 = mask in every 16-bit lane
+    movq    mm0, [src1q]
+    movq    mm2, [src2q]
+    psllq   mm0, 16                 ; shift both rows up one word to make room in lane 0
+    psllq   mm2, 16
+    movd    mm6, [left_topq]        ; assumes *left_top fits in 16 bits - TODO confirm
+    por     mm0, mm6                ; mm0 = src1 delayed by one word, *left_top first
+    movd    mm6, [leftq]            ; assumes *left fits in 16 bits - TODO confirm
+    por     mm2, mm6                ; mm2 = src2 delayed by one word, *left first
+    xor     maskq, maskq            ; mask is splatted in mm7; its register becomes the byte offset
+.loop:
+    movq    mm1, [src1q + maskq]    ; four words of src1
+    movq    mm3, [src2q + maskq]    ; four words of src2
+    movq    mm4, mm2
+    psubw   mm2, mm0
+    paddw   mm2, mm1                ; delayed src2 - delayed src1 + src1
+    pand    mm2, mm7
+    movq    mm5, mm4
+    pmaxsw  mm4, mm1                ; max(delayed src2, src1)
+    pminsw  mm1, mm5                ; min(delayed src2, src1)
+    pminsw  mm4, mm2
+    pmaxsw  mm4, mm1                ; median of the three candidates
+    psubw   mm3, mm4                ; src2 - median
+    pand    mm3, mm7
+    movq    [dstq + maskq], mm3
+    add     maskq, 8
+    movq    mm0, [src1q + maskq - 2] ; delayed rows for the next group, read one word back
+    movq    mm2, [src2q + maskq - 2]
+    cmp     maskq, wq
+        jb .loop
+    movzx maskd, word [src1q + wq - 2]
+    mov [left_topq], maskd          ; *left_top = last word of src1
+    movzx maskd, word [src2q + wq - 2]
+    mov [leftq], maskd              ; *left = last word of src2
+    RET
diff --git a/libavcodec/x86/lossless_videodsp_init.c b/libavcodec/x86/lossless_videodsp_init.c
new file mode 100644
index 0000000..b0fbcfe
--- /dev/null
+++ b/libavcodec/x86/lossless_videodsp_init.c
@@ -0,0 +1,62 @@
+/*
+ * Lossless video DSP utils
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "../lossless_videodsp.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/x86/cpu.h"
+
+void ff_add_int16_mmx(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
+void ff_add_int16_sse2(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
+void ff_diff_int16_mmx (uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w);
+void ff_diff_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w);
+int ff_add_hfyu_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, int w, unsigned acc);
+int ff_add_hfyu_left_pred_int16_sse4(uint16_t *dst, const uint16_t *src, unsigned mask, int w, unsigned acc);
+void ff_add_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top);
+void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top);
+
+
+/* Install x86 SIMD routines into c according to CPU flags and avctx->pix_fmt. */
+void ff_llviddsp_init_x86(LLVidDSPContext *c, AVCodecContext *avctx)
+{
+    int cpu_flags = av_get_cpu_flags();
+    const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(avctx->pix_fmt);
+
+    if (EXTERNAL_MMX(cpu_flags)) {
+        c->add_int16 = ff_add_int16_mmx;
+        c->diff_int16 = ff_diff_int16_mmx;
+    }
+
+    /* pix_desc is NULL for an unknown or unset pix_fmt: keep the defaults then. */
+    if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) {
+        c->add_hfyu_median_pred_int16 = ff_add_hfyu_median_pred_int16_mmxext;
+        c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_mmxext;
+    }
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->add_int16 = ff_add_int16_sse2;
+        c->diff_int16 = ff_diff_int16_sse2;
+    }
+
+    if (EXTERNAL_SSSE3(cpu_flags))
+        c->add_hfyu_left_pred_int16 = ff_add_hfyu_left_pred_int16_ssse3;
+
+    if (EXTERNAL_SSE4(cpu_flags))
+        c->add_hfyu_left_pred_int16 = ff_add_hfyu_left_pred_int16_sse4;
+}
diff --git a/libavcodec/x86/lpc.c b/libavcodec/x86/lpc.c
index ea5d2ea..3a9493f 100644
--- a/libavcodec/x86/lpc.c
+++ b/libavcodec/x86/lpc.c
@@ -2,26 +2,25 @@
  * SIMD-optimized LPC functions
  * Copyright (c) 2007 Loren Merritt
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
-#include "libavutil/internal.h"
 #include "libavutil/mem.h"
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
@@ -73,6 +72,7 @@ static void lpc_apply_welch_window_sse2(const int32_t *data, int len,
         "3:                                    \n\t"
         :"+&r"(i), "+&r"(j)
         :"r"(w_data+n2), "r"(data+n2), "m"(c), "r"(len)
+         NAMED_CONSTRAINTS_ARRAY_ADD(pd_1,pd_2)
          XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
                                     "%xmm5", "%xmm6", "%xmm7")
     );
@@ -117,6 +117,7 @@ static void lpc_compute_autocorr_sse2(const double *data, int len, int lag,
                 "movsd     %%xmm2, 16(%1)           \n\t"
                 :"+&r"(i)
                 :"r"(autoc+j), "r"(data+len), "r"(data+len-j)
+                 NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
                 :"memory"
             );
         } else {
@@ -140,6 +141,7 @@ static void lpc_compute_autocorr_sse2(const double *data, int len, int lag,
                 "movsd     %%xmm1, %2               \n\t"
                 :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
                 :"r"(data+len), "r"(data+len-j)
+                 NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
             );
         }
     }
@@ -152,7 +154,7 @@ av_cold void ff_lpc_init_x86(LPCContext *c)
 #if HAVE_SSE2_INLINE
     int cpu_flags = av_get_cpu_flags();
 
-    if (INLINE_SSE2(cpu_flags) && (cpu_flags & AV_CPU_FLAG_SSE2SLOW)) {
+    if (HAVE_SSE2_INLINE && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) {
         c->lpc_apply_welch_window = lpc_apply_welch_window_sse2;
         c->lpc_compute_autocorr   = lpc_compute_autocorr_sse2;
     }
diff --git a/libavcodec/x86/mathops.h b/libavcodec/x86/mathops.h
index 2c04d9d..6298f5e 100644
--- a/libavcodec/x86/mathops.h
+++ b/libavcodec/x86/mathops.h
@@ -2,20 +2,20 @@
  * simple math operations
  * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -105,7 +105,7 @@ __asm__ volatile(\
 #endif /* HAVE_I686 */
 
 #define MASK_ABS(mask, level)                   \
-    __asm__ ("cltd                   \n\t"      \
+    __asm__ ("cdq                    \n\t"      \
              "xorl %1, %0            \n\t"      \
              "subl %1, %0            \n\t"      \
              : "+a"(level), "=&d"(mask))
diff --git a/libavcodec/x86/mdct.h b/libavcodec/x86/mdct.h
deleted file mode 100644
index cc107cb..0000000
--- a/libavcodec/x86/mdct.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_X86_MDCT_H
-#define AVCODEC_X86_MDCT_H
-
-#include "libavcodec/fft.h"
-
-void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_calc_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
-
-#endif /* AVCODEC_X86_MDCT_H */
diff --git a/libavcodec/x86/mdct_init.c b/libavcodec/x86/mdct_init.c
deleted file mode 100644
index db642d8..0000000
--- a/libavcodec/x86/mdct_init.c
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/cpu.h"
-
-#include "mdct.h"
-
-av_cold void ff_mdct_init_x86(FFTContext *s)
-{
-    int cpu_flags = av_get_cpu_flags();
-
-#if ARCH_X86_32
-    if (EXTERNAL_AMD3DNOW(cpu_flags)) {
-        s->imdct_calc = ff_imdct_calc_3dnow;
-        s->imdct_half = ff_imdct_half_3dnow;
-    }
-
-    if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) {
-        s->imdct_calc = ff_imdct_calc_3dnowext;
-        s->imdct_half = ff_imdct_half_3dnowext;
-    }
-#endif /* ARCH_X86_32 */
-
-    if (EXTERNAL_SSE(cpu_flags)) {
-        s->imdct_calc  = ff_imdct_calc_sse;
-        s->imdct_half  = ff_imdct_half_sse;
-    }
-
-    if (EXTERNAL_AVX_FAST(cpu_flags) && s->nbits >= 5) {
-        s->imdct_half      = ff_imdct_half_avx;
-    }
-}
diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index 1a87f37..ad06d48 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -4,25 +4,30 @@
 ;* Copyright (c) 2000, 2001 Fabrice Bellard
 ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;*****************************************************************************
 
 %include "libavutil/x86/x86util.asm"
 
+SECTION_RODATA
+
+cextern pb_1
+cextern pb_80
+
 SECTION .text
 
 %macro DIFF_PIXELS_1 4
@@ -210,7 +215,7 @@ hadamard8_16_wrapper %1, 3
 %elif cpuflag(mmx)
 ALIGN 16
 ; int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,
-;                               uint8_t *src2, int stride, int h)
+;                               uint8_t *src2, ptrdiff_t stride, int h)
 ; r0 = void *s = unused, int h = unused (always 8)
 ; note how r1, r2 and r3 are not clobbered in this function, so 16x16
 ; can simply call this 2x2x (and that's why we access rsp+gprsize
@@ -274,19 +279,27 @@ INIT_XMM ssse3
 %define ABS_SUM_8x8 ABS_SUM_8x8_64
 HADAMARD8_DIFF 9
 
-INIT_XMM sse2
-; int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-;                   int line_size, int h);
-cglobal sse16, 5, 5, 8
-    shr      r4d, 1
+; int ff_sse*_*(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+;               ptrdiff_t line_size, int h)
+
+%macro SUM_SQUARED_ERRORS 1
+cglobal sse%1, 5,5,8, v, pix1, pix2, lsize, h
+%if %1 == mmsize
+    shr       hd, 1
+%endif
     pxor      m0, m0         ; mm0 = 0
     pxor      m7, m7         ; mm7 holds the sum
 
 .next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
-    movu      m1, [r1   ]    ; mm1 = pix1[0][0-15]
-    movu      m2, [r2   ]    ; mm2 = pix2[0][0-15]
-    movu      m3, [r1+r3]    ; mm3 = pix1[1][0-15]
-    movu      m4, [r2+r3]    ; mm4 = pix2[1][0-15]
+    movu      m1, [pix1q]    ; m1 = pix1[0][0-15], [0-7] for mmx
+    movu      m2, [pix2q]    ; m2 = pix2[0][0-15], [0-7] for mmx
+%if %1 == mmsize
+    movu      m3, [pix1q+lsizeq] ; m3 = pix1[1][0-15], [0-7] for mmx
+    movu      m4, [pix2q+lsizeq] ; m4 = pix2[1][0-15], [0-7] for mmx
+%else  ; %1 / 2 == mmsize; mmx only
+    mova      m3, [pix1q+8]  ; m3 = pix1[0][8-15]
+    mova      m4, [pix2q+8]  ; m4 = pix2[0][8-15]
+%endif
 
     ; todo: mm1-mm2, mm3-mm4
     ; algo: subtract mm1 from mm2 with saturation and vice versa
@@ -315,22 +328,607 @@ cglobal sse16, 5, 5, 8
     pmaddwd   m1, m1
     pmaddwd   m3, m3
 
-    lea       r1, [r1+r3*2]  ; pix1 += 2*line_size
-    lea       r2, [r2+r3*2]  ; pix2 += 2*line_size
-
     paddd     m1, m2
     paddd     m3, m4
     paddd     m7, m1
     paddd     m7, m3
 
-    dec       r4
+%if %1 == mmsize
+    lea    pix1q, [pix1q + 2*lsizeq]
+    lea    pix2q, [pix2q + 2*lsizeq]
+%else
+    add    pix1q, lsizeq
+    add    pix2q, lsizeq
+%endif
+    dec       hd
     jnz .next2lines
 
-    mova      m1, m7
-    psrldq    m7, 8          ; shift hi qword to lo
-    paddd     m7, m1
-    mova      m1, m7
-    psrldq    m7, 4          ; shift hi dword to lo
-    paddd     m7, m1
+    HADDD     m7, m1
     movd     eax, m7         ; return value
     RET
+%endmacro
+
+INIT_MMX mmx
+SUM_SQUARED_ERRORS 8
+
+INIT_MMX mmx
+SUM_SQUARED_ERRORS 16
+
+INIT_XMM sse2
+SUM_SQUARED_ERRORS 16
+
+;-----------------------------------------------
+;int ff_sum_abs_dctelem(int16_t *block)
+;-----------------------------------------------
+; %1 = number of xmm registers used
+; %2 = number of inline loops
+
+%macro SUM_ABS_DCTELEM 2
+cglobal sum_abs_dctelem, 1, 1, %1, block
+    pxor    m0, m0                  ; two independent word accumulators
+    pxor    m1, m1
+%assign %%i 0
+%rep %2
+    mova      m2, [blockq+mmsize*(0+%%i)] ; four vectors of coefficients per pass
+    mova      m3, [blockq+mmsize*(1+%%i)]
+    mova      m4, [blockq+mmsize*(2+%%i)]
+    mova      m5, [blockq+mmsize*(3+%%i)]
+    ABS1_SUM  m2, m6, m0            ; defined outside this view; presumably adds abs(m2) to m0 with m6 as scratch - confirm
+    ABS1_SUM  m3, m6, m1
+    ABS1_SUM  m4, m6, m0
+    ABS1_SUM  m5, m6, m1
+%assign %%i %%i+4
+%endrep
+    paddusw m0, m1                  ; merge the accumulators with unsigned saturation
+    HSUM    m0, m1, eax             ; defined outside this view; presumably a horizontal sum into eax - confirm
+    and     eax, 0xFFFF             ; only the low 16 bits are returned
+    RET
+%endmacro
+
+INIT_MMX mmx
+SUM_ABS_DCTELEM 0, 4
+INIT_MMX mmxext
+SUM_ABS_DCTELEM 0, 4
+INIT_XMM sse2
+SUM_ABS_DCTELEM 7, 2
+INIT_XMM ssse3
+SUM_ABS_DCTELEM 6, 2
+
+;------------------------------------------------------------------------------
+; int ff_hf_noise*_mmx(uint8_t *pix1, ptrdiff_t lsize, int h)
+;------------------------------------------------------------------------------
+; %1 = 8/16. %2-5=m#
+%macro HF_NOISE_PART1 5
+    mova      m%2, [pix1q]          ; one register of pixels from the current row
+%if %1 == 8
+    mova      m%3, m%2
+    psllq     m%2, 8                ; shift up then down: clears the last pixel ...
+    psrlq     m%3, 8                ; ... while the copy is the row moved down by one pixel
+    psrlq     m%2, 8
+%else
+    mova      m%3, [pix1q+1]        ; same row, one pixel to the right
+%endif
+    mova      m%4, m%2
+    mova      m%5, m%3
+    punpcklbw m%2, m7               ; widen bytes to words; m7 is zeroed by HF_NOISE
+    punpcklbw m%3, m7
+    punpckhbw m%4, m7
+    punpckhbw m%5, m7
+    psubw     m%2, m%3              ; horizontal differences, low half
+    psubw     m%4, m%5              ; horizontal differences, high half
+%endmacro
+
+; %1-4 = m#
+%macro HF_NOISE_PART2 4
+    psubw     m%1, m%3              ; difference between two rows of horizontal differences
+    psubw     m%2, m%4
+    pxor       m3, m3               ; m1 and m3 are scratch for the sign masks
+    pxor       m1, m1
+    pcmpgtw    m3, m%1              ; all-ones in every lane holding a negative value
+    pcmpgtw    m1, m%2
+    pxor      m%1, m3               ; xor with the sign mask, then subtract it: absolute value
+    pxor      m%2, m1
+    psubw     m%1, m3
+    psubw     m%2, m1
+    paddw     m%2, m%1
+    paddw      m6, m%2              ; accumulate into the running word sums in m6
+%endmacro
+
+; %1 = 8/16
+%macro HF_NOISE 1
+cglobal hf_noise%1, 3,3,0, pix1, lsize, h
+    sub        hd, 2                ; two rows are consumed before the loop
+    pxor       m7, m7               ; m7 = 0, used for unpacking
+    pxor       m6, m6               ; m6 = running word sums
+    HF_NOISE_PART1 %1, 0, 1, 2, 3
+    add     pix1q, lsizeq           ; next row
+    HF_NOISE_PART1 %1, 4, 1, 5, 3
+    HF_NOISE_PART2     0, 2, 4, 5
+    add     pix1q, lsizeq
+.loop:
+    HF_NOISE_PART1 %1, 0, 1, 2, 3
+    HF_NOISE_PART2     4, 5, 0, 2
+    add     pix1q, lsizeq           ; the two register sets alternate between rows
+    HF_NOISE_PART1 %1, 4, 1, 5, 3
+    HF_NOISE_PART2     0, 2, 4, 5
+    add     pix1q, lsizeq
+    sub        hd, 2                ; NOTE(review): exits only at exactly 0, so h is presumably even and >= 4 - confirm
+        jne .loop
+
+    mova       m0, m6
+    punpcklwd  m0, m7               ; widen the word sums to dwords
+    punpckhwd  m6, m7
+    paddd      m6, m0
+    mova       m0, m6
+    psrlq      m6, 32               ; fold the upper dword onto the lower one
+    paddd      m0, m6
+    movd      eax, m0   ; eax = accumulated result of this hf_noise variant
+    REP_RET                 ; return eax;
+%endmacro
+
+INIT_MMX mmx
+HF_NOISE 8
+HF_NOISE 16
+
+;---------------------------------------------------------------------------------------
+;int ff_sad{8,16}_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
+;---------------------------------------------------------------------------------------
+;%1 = 8/16
+%macro SAD 1
+cglobal sad%1, 5, 5, 3, v, pix1, pix2, stride, h
+    movu      m2, [pix2q]           ; first two rows are handled before the loop
+    movu      m1, [pix2q+strideq]
+    psadbw    m2, [pix1q]           ; sum of absolute byte differences against pix1
+    psadbw    m1, [pix1q+strideq]
+    paddw     m2, m1                ; m2 = running total
+%if %1 != mmsize
+    movu      m0, [pix2q+8]         ; block wider than one register: second 8-byte half
+    movu      m1, [pix2q+strideq+8]
+    psadbw    m0, [pix1q+8]
+    psadbw    m1, [pix1q+strideq+8]
+    paddw     m2, m0
+    paddw     m2, m1
+%endif
+    sub       hd, 2
+
+align 16
+.loop:
+    lea    pix1q, [pix1q+strideq*2] ; advance both blocks by two rows
+    lea    pix2q, [pix2q+strideq*2]
+    movu      m0, [pix2q]
+    movu      m1, [pix2q+strideq]
+    psadbw    m0, [pix1q]
+    psadbw    m1, [pix1q+strideq]
+    paddw     m2, m0
+    paddw     m2, m1
+%if %1 != mmsize
+    movu      m0, [pix2q+8]         ; second 8-byte half of both rows
+    movu      m1, [pix2q+strideq+8]
+    psadbw    m0, [pix1q+8]
+    psadbw    m1, [pix1q+strideq+8]
+    paddw     m2, m0
+    paddw     m2, m1
+%endif
+    sub       hd, 2                 ; the loop body runs at least once after the two initial rows
+    jg .loop
+%if mmsize == 16
+    movhlps   m0, m2                ; psadbw leaves one partial sum per 8-byte half: add them
+    paddw     m2, m0
+%endif
+    movd     eax, m2                ; return the total
+    RET
+%endmacro
+
+INIT_MMX mmxext
+SAD 8
+SAD 16
+INIT_XMM sse2
+SAD 16
+
+;------------------------------------------------------------------------------------------
+;int ff_sad<8|16>_x2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
+;------------------------------------------------------------------------------------------
+;%1 = block width in bytes (8/16); pix2 is averaged with pix2+1 before the SAD
+%macro SAD_X2 1
+cglobal sad%1_x2, 5, 5, 5, v, pix1, pix2, stride, h
+    movu      m0, [pix2q]           ; rows 0 and 1 are handled ahead of the loop
+    movu      m2, [pix2q+strideq]
+%if mmsize == 16 ; 16-byte build: fetch the +1 neighbours with movu, then average
+    movu      m3, [pix2q+1]
+    movu      m4, [pix2q+strideq+1]
+    pavgb     m0, m3
+    pavgb     m2, m4
+%else ; 8-byte build: average straight from memory
+    pavgb     m0, [pix2q+1]
+    pavgb     m2, [pix2q+strideq+1]
+%endif
+    psadbw    m0, [pix1q]           ; m0 becomes the accumulator for the whole function
+    psadbw    m2, [pix1q+strideq]
+    paddw     m0, m2
+%if %1 != mmsize ; 16-wide block with 8-byte registers: also sum bytes 8..15
+    movu      m1, [pix2q+8]
+    movu      m2, [pix2q+strideq+8]
+    pavgb     m1, [pix2q+9]
+    pavgb     m2, [pix2q+strideq+9]
+    psadbw    m1, [pix1q+8]
+    psadbw    m2, [pix1q+strideq+8]
+    paddw     m0, m1
+    paddw     m0, m2
+%endif
+    sub       hd, 2                 ; two rows done; note .loop is entered unconditionally
+
+align 16
+.loop:
+    lea    pix1q, [pix1q+2*strideq] ; advance both pointers by two rows
+    lea    pix2q, [pix2q+2*strideq]
+    movu      m1, [pix2q]
+    movu      m2, [pix2q+strideq]
+%if mmsize == 16 ; same movu-then-average split as in the prologue
+    movu      m3, [pix2q+1]
+    movu      m4, [pix2q+strideq+1]
+    pavgb     m1, m3
+    pavgb     m2, m4
+%else
+    pavgb     m1, [pix2q+1]
+    pavgb     m2, [pix2q+strideq+1]
+%endif
+    psadbw    m1, [pix1q]
+    psadbw    m2, [pix1q+strideq]
+    paddw     m0, m1
+    paddw     m0, m2
+%if %1 != mmsize ; second 8-byte half of each row (16-wide, 8-byte registers)
+    movu      m1, [pix2q+8]
+    movu      m2, [pix2q+strideq+8]
+    pavgb     m1, [pix2q+9]
+    pavgb     m2, [pix2q+strideq+9]
+    psadbw    m1, [pix1q+8]
+    psadbw    m2, [pix1q+strideq+8]
+    paddw     m0, m1
+    paddw     m0, m2
+%endif
+    sub       hd, 2
+    jg .loop                        ; continue while rows remain
+%if mmsize == 16 ; psadbw leaves one partial sum per 64-bit half: fold high into low
+    movhlps   m1, m0
+    paddw     m0, m1
+%endif
+    movd     eax, m0                ; return value = total in the low dword
+    RET
+%endmacro
+
+INIT_MMX mmxext
+SAD_X2 8
+SAD_X2 16
+INIT_XMM sse2
+SAD_X2 16
+
+;------------------------------------------------------------------------------------------
+;int ff_sad<8|16>_y2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
+;------------------------------------------------------------------------------------------
+;%1 = block width in bytes (8/16); each pix2 row is averaged with the row below it
+%macro SAD_Y2 1
+cglobal sad%1_y2, 5, 5, 4, v, pix1, pix2, stride, h
+    movu      m1, [pix2q]           ; m1 = pix2 row 0
+    movu      m0, [pix2q+strideq]   ; m0 = pix2 row 1
+    movu      m3, [pix2q+2*strideq] ; m3 = pix2 row 2
+    pavgb     m1, m0                ; avg(row 0, row 1)
+    pavgb     m0, m3                ; avg(row 1, row 2)
+    psadbw    m1, [pix1q]
+    psadbw    m0, [pix1q+strideq]
+    paddw     m0, m1                ; m0 becomes the accumulator for the whole function
+    mova      m1, m3                ; keep row 2: it is the "previous row" for the loop
+%if %1 != mmsize ; 16-wide block with 8-byte registers: bytes 8..15 use m4-m6
+    movu      m4, [pix2q+8]
+    movu      m5, [pix2q+strideq+8]
+    movu      m6, [pix2q+2*strideq+8]
+    pavgb     m4, m5
+    pavgb     m5, m6
+    psadbw    m4, [pix1q+8]
+    psadbw    m5, [pix1q+strideq+8]
+    paddw     m0, m4
+    paddw     m0, m5
+    mova      m4, m6                ; carried previous row, right half
+%endif
+    add    pix2q, strideq           ; pix2 now runs one row ahead of pix1
+    sub       hd, 2                 ; two rows done; note .loop is entered unconditionally
+
+align 16
+.loop:
+    lea    pix1q, [pix1q+2*strideq] ; advance both pointers by two rows
+    lea    pix2q, [pix2q+2*strideq]
+    movu      m2, [pix2q]           ; next two pix2 rows; m1 still holds the one before
+    movu      m3, [pix2q+strideq]
+    pavgb     m1, m2
+    pavgb     m2, m3
+    psadbw    m1, [pix1q]
+    psadbw    m2, [pix1q+strideq]
+    paddw     m0, m1
+    paddw     m0, m2
+    mova      m1, m3                ; carry the last loaded row into the next iteration
+%if %1 != mmsize ; second 8-byte half of each row (16-wide, 8-byte registers)
+    movu      m5, [pix2q+8]
+    movu      m6, [pix2q+strideq+8]
+    pavgb     m4, m5
+    pavgb     m5, m6
+    psadbw    m4, [pix1q+8]
+    psadbw    m5, [pix1q+strideq+8]
+    paddw     m0, m4
+    paddw     m0, m5
+    mova      m4, m6                ; carried previous row, right half
+%endif
+    sub       hd, 2
+    jg .loop                        ; continue while rows remain
+%if mmsize == 16 ; psadbw leaves one partial sum per 64-bit half: fold high into low
+    movhlps   m1, m0
+    paddw     m0, m1
+%endif
+    movd     eax, m0                ; return value = total in the low dword
+    RET
+%endmacro
+
+INIT_MMX mmxext
+SAD_Y2 8
+SAD_Y2 16
+INIT_XMM sse2
+SAD_Y2 16
+
+;-------------------------------------------------------------------------------------------
+;int ff_sad<8|16>_approx_xy2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
+;-------------------------------------------------------------------------------------------
+;%1 = block width in bytes (8/16); pix2 is averaged horizontally (+1) then vertically (next row)
+%macro SAD_APPROX_XY2 1
+cglobal sad%1_approx_xy2, 5, 5, 7, v, pix1, pix2, stride, h
+    mova      m4, [pb_1]            ; constant defined elsewhere; presumably every byte = 1
+    movu      m1, [pix2q]           ; pix2 rows 0, 1 and 2
+    movu      m0, [pix2q+strideq]
+    movu      m3, [pix2q+2*strideq]
+%if mmsize == 16 ; 16-byte build: fetch the +1 neighbours with movu, then average
+    movu      m5, [pix2q+1]
+    movu      m6, [pix2q+strideq+1]
+    movu      m2, [pix2q+2*strideq+1]
+    pavgb     m1, m5
+    pavgb     m0, m6
+    pavgb     m3, m2
+%else ; 8-byte build: average straight from memory
+    pavgb     m1, [pix2q+1]
+    pavgb     m0, [pix2q+strideq+1]
+    pavgb     m3, [pix2q+2*strideq+1]
+%endif
+    psubusb   m0, m4                ; row 1 minus pb_1 (saturating); presumably offsets pavgb round-up
+    pavgb     m1, m0                ; vertical average of rows 0 and 1
+    pavgb     m0, m3                ; vertical average of rows 1 and 2
+    psadbw    m1, [pix1q]
+    psadbw    m0, [pix1q+strideq]
+    paddw     m0, m1                ; m0 becomes the accumulator for the whole function
+    mova      m1, m3                ; keep averaged row 2 as the "previous row" for the loop
+%if %1 != mmsize ; 16-wide block with 8-byte registers: bytes 8..15 use m5-m7
+    movu      m5, [pix2q+8]
+    movu      m6, [pix2q+strideq+8]
+    movu      m7, [pix2q+2*strideq+8]
+    pavgb     m5, [pix2q+1+8]
+    pavgb     m6, [pix2q+strideq+1+8]
+    pavgb     m7, [pix2q+2*strideq+1+8]
+    psubusb   m6, m4
+    pavgb     m5, m6
+    pavgb     m6, m7
+    psadbw    m5, [pix1q+8]
+    psadbw    m6, [pix1q+strideq+8]
+    paddw     m0, m5
+    paddw     m0, m6
+    mova      m5, m7                ; carried previous row, right half
+%endif
+    add    pix2q, strideq           ; pix2 now runs one row ahead of pix1
+    sub       hd, 2                 ; two rows done; note .loop is entered unconditionally
+
+align 16
+.loop:
+    lea    pix1q, [pix1q+2*strideq] ; advance both pointers by two rows
+    lea    pix2q, [pix2q+2*strideq]
+    movu      m2, [pix2q]           ; next two pix2 rows; m1 still holds the one before
+    movu      m3, [pix2q+strideq]
+%if mmsize == 16 ; same movu-then-average split as in the prologue
+    movu      m5, [pix2q+1]
+    movu      m6, [pix2q+strideq+1]
+    pavgb     m2, m5
+    pavgb     m3, m6
+%else
+    pavgb     m2, [pix2q+1]
+    pavgb     m3, [pix2q+strideq+1]
+%endif
+    psubusb   m2, m4                ; same pb_1 correction, applied to the first row of the pair
+    pavgb     m1, m2
+    pavgb     m2, m3
+    psadbw    m1, [pix1q]
+    psadbw    m2, [pix1q+strideq]
+    paddw     m0, m1
+    paddw     m0, m2
+    mova      m1, m3                ; carry the last averaged row into the next iteration
+%if %1 != mmsize ; second 8-byte half of each row (16-wide, 8-byte registers)
+    movu      m6, [pix2q+8]
+    movu      m7, [pix2q+strideq+8]
+    pavgb     m6, [pix2q+8+1]
+    pavgb     m7, [pix2q+strideq+8+1]
+    psubusb   m6, m4
+    pavgb     m5, m6
+    pavgb     m6, m7
+    psadbw    m5, [pix1q+8]
+    psadbw    m6, [pix1q+strideq+8]
+    paddw     m0, m5
+    paddw     m0, m6
+    mova      m5, m7                ; carried previous row, right half
+%endif
+    sub       hd, 2
+    jg .loop                        ; continue while rows remain
+%if mmsize == 16 ; psadbw leaves one partial sum per 64-bit half: fold high into low
+    movhlps   m1, m0
+    paddw     m0, m1
+%endif
+    movd     eax, m0                ; return value = total in the low dword
+    RET
+%endmacro
+
+INIT_MMX mmxext
+SAD_APPROX_XY2 8
+SAD_APPROX_XY2 16
+INIT_XMM sse2
+SAD_APPROX_XY2 16
+
+;--------------------------------------------------------------------
+;int ff_vsad_intra<8|16>_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+;                              ptrdiff_t line_size, int h);
+;--------------------------------------------------------------------
+; %1 = block width in bytes (8/16); sums SADs of vertically adjacent pix1 rows, pix2 is never read
+%macro VSAD_INTRA 1
+cglobal vsad_intra%1, 5, 5, 3, v, pix1, pix2, lsize, h
+    mova      m0, [pix1q]           ; row 0; m0 turns into the accumulator below
+%if %1 == mmsize ; one register covers a whole row
+    mova      m2, [pix1q+lsizeq]    ; m2 = row 1, kept as the "previous row"
+    psadbw    m0, m2                ; m0 = SAD(row 0, row 1)
+%else ; 16-wide block with 8-byte registers: m3/m4 hold bytes 8..15
+    mova      m2, [pix1q+lsizeq]
+    mova      m3, [pix1q+8]
+    mova      m4, [pix1q+lsizeq+8]
+    psadbw    m0, m2
+    psadbw    m3, m4
+    paddw     m0, m3
+%endif
+    sub       hd, 2                 ; two rows done; note .loop is entered unconditionally
+
+.loop:
+    lea    pix1q, [pix1q + 2*lsizeq] ; advance by two rows
+%if %1 == mmsize
+    mova      m1, [pix1q]
+    psadbw    m2, m1                ; SAD(previous row, this row)
+    paddw     m0, m2
+    mova      m2, [pix1q+lsizeq]    ; becomes the previous row for the next iteration
+    psadbw    m1, m2                ; SAD(this row, next row)
+    paddw     m0, m1
+%else ; same scheme on both 8-byte halves; m2/m4 carry the previous row
+    mova      m1, [pix1q]
+    mova      m3, [pix1q+8]
+    psadbw    m2, m1
+    psadbw    m4, m3
+    paddw     m0, m2
+    paddw     m0, m4
+    mova      m2, [pix1q+lsizeq]
+    mova      m4, [pix1q+lsizeq+8]
+    psadbw    m1, m2
+    psadbw    m3, m4
+    paddw     m0, m1
+    paddw     m0, m3
+%endif
+    sub       hd, 2
+    jg     .loop
+
+%if mmsize == 16 ; fold the high 64-bit partial sum into the low one
+    pshufd m1, m0, 0xe
+    paddd  m0, m1
+%endif
+    movd eax, m0                    ; return value = total in the low dword
+    RET
+%endmacro
+
+INIT_MMX mmxext
+VSAD_INTRA 8
+VSAD_INTRA 16
+INIT_XMM sse2
+VSAD_INTRA 16
+
+;---------------------------------------------------------------------
+;int ff_vsad<8|16>_approx_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+;                               ptrdiff_t line_size, int h);
+;---------------------------------------------------------------------
+; %1 = block width in bytes (8/16); SAD between consecutive rows of the byte-wise (pix1 - pix2) difference
+%macro VSAD_APPROX 1
+cglobal vsad%1_approx, 5, 5, 5, v, pix1, pix2, lsize, h
+    mova   m1, [pb_80]              ; constant defined elsewhere; presumably every byte = 0x80
+    mova   m0, [pix1q]              ; row 0; m0 turns into the accumulator below
+%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
+    mova   m4, [pix1q+lsizeq]
+%if mmsize == 16
+    movu   m3, [pix2q]
+    movu   m2, [pix2q+lsizeq]
+    psubb  m0, m3
+    psubb  m4, m2
+%else
+    psubb  m0, [pix2q]
+    psubb  m4, [pix2q+lsizeq]
+%endif
+    pxor   m0, m1                   ; flip the top bit of each wrapped byte difference
+    pxor   m4, m1                   ; m4 = biased row-1 difference, kept as "previous row"
+    psadbw m0, m4                   ; m0 = SAD(diff row 0, diff row 1)
+%else ; vsad16_mmxext
+    mova   m3, [pix1q+8]
+    psubb  m0, [pix2q]
+    psubb  m3, [pix2q+8]
+    pxor   m0, m1
+    pxor   m3, m1
+    mova   m4, [pix1q+lsizeq]
+    mova   m5, [pix1q+lsizeq+8]
+    psubb  m4, [pix2q+lsizeq]
+    psubb  m5, [pix2q+lsizeq+8]
+    pxor   m4, m1
+    pxor   m5, m1
+    psadbw m0, m4
+    psadbw m3, m5
+    paddw  m0, m3
+%endif
+    sub    hd, 2                    ; two rows done; note .loop is entered unconditionally
+
+.loop:
+    lea pix1q, [pix1q + 2*lsizeq]   ; advance both pointers by two rows
+    lea pix2q, [pix2q + 2*lsizeq]
+    mova   m2, [pix1q]
+%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
+%if mmsize == 16
+    movu   m3, [pix2q]
+    psubb  m2, m3
+%else
+    psubb  m2, [pix2q]
+%endif
+    pxor   m2, m1
+    psadbw m4, m2                   ; SAD(previous diff row, this diff row)
+    paddw  m0, m4
+    mova   m4, [pix1q+lsizeq]
+    movu   m3, [pix2q+lsizeq]
+    psubb  m4, m3
+    pxor   m4, m1                   ; becomes the previous diff row for the next iteration
+    psadbw m2, m4                   ; SAD(this diff row, next diff row)
+    paddw  m0, m2
+%else ; vsad16_mmxext
+    mova   m3, [pix1q+8]
+    psubb  m2, [pix2q]
+    psubb  m3, [pix2q+8]
+    pxor   m2, m1
+    pxor   m3, m1
+    psadbw m4, m2
+    psadbw m5, m3
+    paddw  m0, m4
+    paddw  m0, m5
+    mova   m4, [pix1q+lsizeq]
+    mova   m5, [pix1q+lsizeq+8]
+    psubb  m4, [pix2q+lsizeq]
+    psubb  m5, [pix2q+lsizeq+8]
+    pxor   m4, m1
+    pxor   m5, m1
+    psadbw m2, m4
+    psadbw m3, m5
+    paddw  m0, m2
+    paddw  m0, m3
+%endif
+    sub    hd, 2
+    jg  .loop
+
+%if mmsize == 16 ; fold the high 64-bit partial sum into the low one
+    pshufd m1, m0, 0xe
+    paddd  m0, m1
+%endif
+    movd  eax, m0                   ; return value = total in the low dword
+    RET
+%endmacro
+
+INIT_MMX mmxext
+VSAD_APPROX 8
+VSAD_APPROX 16
+INIT_XMM sse2
+VSAD_APPROX 16
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index b906bb6..49f50d0 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -5,20 +5,20 @@
  *
  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,382 +29,67 @@
 #include "libavcodec/me_cmp.h"
 #include "libavcodec/mpegvideo.h"
 
-#if HAVE_INLINE_ASM
-
-static int sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                    ptrdiff_t stride, int h)
-{
-    int tmp;
-
-    __asm__ volatile (
-        "movl         %4, %%ecx          \n"
-        "shr          $1, %%ecx          \n"
-        "pxor      %%mm0, %%mm0          \n" /* mm0 = 0 */
-        "pxor      %%mm7, %%mm7          \n" /* mm7 holds the sum */
-        "1:                              \n"
-        "movq       (%0), %%mm1          \n" /* mm1 = pix1[0][0 - 7] */
-        "movq       (%1), %%mm2          \n" /* mm2 = pix2[0][0 - 7] */
-        "movq   (%0, %3), %%mm3          \n" /* mm3 = pix1[1][0 - 7] */
-        "movq   (%1, %3), %%mm4          \n" /* mm4 = pix2[1][0 - 7] */
-
-        /* todo: mm1-mm2, mm3-mm4 */
-        /* algo: subtract mm1 from mm2 with saturation and vice versa */
-        /*       OR the results to get absolute difference */
-        "movq      %%mm1, %%mm5          \n"
-        "movq      %%mm3, %%mm6          \n"
-        "psubusb   %%mm2, %%mm1          \n"
-        "psubusb   %%mm4, %%mm3          \n"
-        "psubusb   %%mm5, %%mm2          \n"
-        "psubusb   %%mm6, %%mm4          \n"
-
-        "por       %%mm1, %%mm2          \n"
-        "por       %%mm3, %%mm4          \n"
-
-        /* now convert to 16-bit vectors so we can square them */
-        "movq      %%mm2, %%mm1          \n"
-        "movq      %%mm4, %%mm3          \n"
-
-        "punpckhbw %%mm0, %%mm2          \n"
-        "punpckhbw %%mm0, %%mm4          \n"
-        "punpcklbw %%mm0, %%mm1          \n" /* mm1 now spread over (mm1, mm2) */
-        "punpcklbw %%mm0, %%mm3          \n" /* mm4 now spread over (mm3, mm4) */
-
-        "pmaddwd   %%mm2, %%mm2          \n"
-        "pmaddwd   %%mm4, %%mm4          \n"
-        "pmaddwd   %%mm1, %%mm1          \n"
-        "pmaddwd   %%mm3, %%mm3          \n"
-
-        "lea (%0, %3, 2), %0             \n" /* pix1 += 2 * stride */
-        "lea (%1, %3, 2), %1             \n" /* pix2 += 2 * stride */
-
-        "paddd     %%mm2, %%mm1          \n"
-        "paddd     %%mm4, %%mm3          \n"
-        "paddd     %%mm1, %%mm7          \n"
-        "paddd     %%mm3, %%mm7          \n"
-
-        "decl      %%ecx                 \n"
-        "jnz       1b                    \n"
-
-        "movq      %%mm7, %%mm1          \n"
-        "psrlq       $32, %%mm7          \n" /* shift hi dword to lo */
-        "paddd     %%mm7, %%mm1          \n"
-        "movd      %%mm1, %2             \n"
-        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
-        : "r" (stride), "m" (h)
-        : "%ecx");
-
-    return tmp;
-}
-
-static int sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                     ptrdiff_t stride, int h)
-{
-    int tmp;
-
-    __asm__ volatile (
-        "movl %4, %%ecx\n"
-        "pxor %%mm0, %%mm0\n"    /* mm0 = 0 */
-        "pxor %%mm7, %%mm7\n"    /* mm7 holds the sum */
-        "1:\n"
-        "movq (%0), %%mm1\n"     /* mm1 = pix1[0 -  7] */
-        "movq (%1), %%mm2\n"     /* mm2 = pix2[0 -  7] */
-        "movq 8(%0), %%mm3\n"    /* mm3 = pix1[8 - 15] */
-        "movq 8(%1), %%mm4\n"    /* mm4 = pix2[8 - 15] */
-
-        /* todo: mm1-mm2, mm3-mm4 */
-        /* algo: subtract mm1 from mm2 with saturation and vice versa */
-        /*       OR the results to get absolute difference */
-        "movq %%mm1, %%mm5\n"
-        "movq %%mm3, %%mm6\n"
-        "psubusb %%mm2, %%mm1\n"
-        "psubusb %%mm4, %%mm3\n"
-        "psubusb %%mm5, %%mm2\n"
-        "psubusb %%mm6, %%mm4\n"
-
-        "por %%mm1, %%mm2\n"
-        "por %%mm3, %%mm4\n"
-
-        /* now convert to 16-bit vectors so we can square them */
-        "movq %%mm2, %%mm1\n"
-        "movq %%mm4, %%mm3\n"
-
-        "punpckhbw %%mm0, %%mm2\n"
-        "punpckhbw %%mm0, %%mm4\n"
-        "punpcklbw %%mm0, %%mm1\n" /* mm1 now spread over (mm1, mm2) */
-        "punpcklbw %%mm0, %%mm3\n" /* mm4 now spread over (mm3, mm4) */
-
-        "pmaddwd %%mm2, %%mm2\n"
-        "pmaddwd %%mm4, %%mm4\n"
-        "pmaddwd %%mm1, %%mm1\n"
-        "pmaddwd %%mm3, %%mm3\n"
-
-        "add %3, %0\n"
-        "add %3, %1\n"
-
-        "paddd %%mm2, %%mm1\n"
-        "paddd %%mm4, %%mm3\n"
-        "paddd %%mm1, %%mm7\n"
-        "paddd %%mm3, %%mm7\n"
-
-        "decl %%ecx\n"
-        "jnz 1b\n"
-
-        "movq %%mm7, %%mm1\n"
-        "psrlq $32, %%mm7\n"    /* shift hi dword to lo */
-        "paddd %%mm7, %%mm1\n"
-        "movd %%mm1, %2\n"
-        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
-        : "r" (stride), "m" (h)
-        : "%ecx");
-
-    return tmp;
-}
-
-static int hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h)
-{
-    int tmp;
-
-    __asm__ volatile (
-        "movl %3, %%ecx\n"
-        "pxor %%mm7, %%mm7\n"
-        "pxor %%mm6, %%mm6\n"
-
-        "movq (%0), %%mm0\n"
-        "movq %%mm0, %%mm1\n"
-        "psllq $8, %%mm0\n"
-        "psrlq $8, %%mm1\n"
-        "psrlq $8, %%mm0\n"
-        "movq %%mm0, %%mm2\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm0\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm2\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm0\n"
-        "psubw %%mm3, %%mm2\n"
-
-        "add %2, %0\n"
-
-        "movq (%0), %%mm4\n"
-        "movq %%mm4, %%mm1\n"
-        "psllq $8, %%mm4\n"
-        "psrlq $8, %%mm1\n"
-        "psrlq $8, %%mm4\n"
-        "movq %%mm4, %%mm5\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm4\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm5\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm4\n"
-        "psubw %%mm3, %%mm5\n"
-        "psubw %%mm4, %%mm0\n"
-        "psubw %%mm5, %%mm2\n"
-        "pxor %%mm3, %%mm3\n"
-        "pxor %%mm1, %%mm1\n"
-        "pcmpgtw %%mm0, %%mm3\n\t"
-        "pcmpgtw %%mm2, %%mm1\n\t"
-        "pxor %%mm3, %%mm0\n"
-        "pxor %%mm1, %%mm2\n"
-        "psubw %%mm3, %%mm0\n"
-        "psubw %%mm1, %%mm2\n"
-        "paddw %%mm0, %%mm2\n"
-        "paddw %%mm2, %%mm6\n"
-
-        "add %2, %0\n"
-        "1:\n"
-
-        "movq (%0), %%mm0\n"
-        "movq %%mm0, %%mm1\n"
-        "psllq $8, %%mm0\n"
-        "psrlq $8, %%mm1\n"
-        "psrlq $8, %%mm0\n"
-        "movq %%mm0, %%mm2\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm0\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm2\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm0\n"
-        "psubw %%mm3, %%mm2\n"
-        "psubw %%mm0, %%mm4\n"
-        "psubw %%mm2, %%mm5\n"
-        "pxor  %%mm3, %%mm3\n"
-        "pxor  %%mm1, %%mm1\n"
-        "pcmpgtw %%mm4, %%mm3\n\t"
-        "pcmpgtw %%mm5, %%mm1\n\t"
-        "pxor  %%mm3, %%mm4\n"
-        "pxor  %%mm1, %%mm5\n"
-        "psubw %%mm3, %%mm4\n"
-        "psubw %%mm1, %%mm5\n"
-        "paddw %%mm4, %%mm5\n"
-        "paddw %%mm5, %%mm6\n"
-
-        "add %2, %0\n"
-
-        "movq (%0), %%mm4\n"
-        "movq      %%mm4, %%mm1\n"
-        "psllq $8, %%mm4\n"
-        "psrlq $8, %%mm1\n"
-        "psrlq $8, %%mm4\n"
-        "movq      %%mm4, %%mm5\n"
-        "movq      %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm4\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm5\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw     %%mm1, %%mm4\n"
-        "psubw     %%mm3, %%mm5\n"
-        "psubw     %%mm4, %%mm0\n"
-        "psubw     %%mm5, %%mm2\n"
-        "pxor      %%mm3, %%mm3\n"
-        "pxor      %%mm1, %%mm1\n"
-        "pcmpgtw   %%mm0, %%mm3\n\t"
-        "pcmpgtw   %%mm2, %%mm1\n\t"
-        "pxor      %%mm3, %%mm0\n"
-        "pxor      %%mm1, %%mm2\n"
-        "psubw     %%mm3, %%mm0\n"
-        "psubw     %%mm1, %%mm2\n"
-        "paddw     %%mm0, %%mm2\n"
-        "paddw     %%mm2, %%mm6\n"
-
-        "add  %2, %0\n"
-        "subl $2, %%ecx\n"
-        " jnz 1b\n"
-
-        "movq      %%mm6, %%mm0\n"
-        "punpcklwd %%mm7, %%mm0\n"
-        "punpckhwd %%mm7, %%mm6\n"
-        "paddd     %%mm0, %%mm6\n"
-
-        "movq  %%mm6, %%mm0\n"
-        "psrlq $32,   %%mm6\n"
-        "paddd %%mm6, %%mm0\n"
-        "movd  %%mm0, %1\n"
-        : "+r" (pix1), "=r" (tmp)
-        : "r" (stride), "g" (h - 2)
-        : "%ecx");
-
-    return tmp;
-}
-
-static int hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h)
-{
-    int tmp;
-    uint8_t *pix = pix1;
-
-    __asm__ volatile (
-        "movl %3, %%ecx\n"
-        "pxor %%mm7, %%mm7\n"
-        "pxor %%mm6, %%mm6\n"
-
-        "movq (%0), %%mm0\n"
-        "movq 1(%0), %%mm1\n"
-        "movq %%mm0, %%mm2\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm0\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm2\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm0\n"
-        "psubw %%mm3, %%mm2\n"
-
-        "add %2, %0\n"
-
-        "movq (%0), %%mm4\n"
-        "movq 1(%0), %%mm1\n"
-        "movq %%mm4, %%mm5\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm4\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm5\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm4\n"
-        "psubw %%mm3, %%mm5\n"
-        "psubw %%mm4, %%mm0\n"
-        "psubw %%mm5, %%mm2\n"
-        "pxor %%mm3, %%mm3\n"
-        "pxor %%mm1, %%mm1\n"
-        "pcmpgtw %%mm0, %%mm3\n\t"
-        "pcmpgtw %%mm2, %%mm1\n\t"
-        "pxor %%mm3, %%mm0\n"
-        "pxor %%mm1, %%mm2\n"
-        "psubw %%mm3, %%mm0\n"
-        "psubw %%mm1, %%mm2\n"
-        "paddw %%mm0, %%mm2\n"
-        "paddw %%mm2, %%mm6\n"
-
-        "add %2, %0\n"
-        "1:\n"
-
-        "movq (%0), %%mm0\n"
-        "movq 1(%0), %%mm1\n"
-        "movq %%mm0, %%mm2\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm0\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm2\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm0\n"
-        "psubw %%mm3, %%mm2\n"
-        "psubw %%mm0, %%mm4\n"
-        "psubw %%mm2, %%mm5\n"
-        "pxor %%mm3, %%mm3\n"
-        "pxor %%mm1, %%mm1\n"
-        "pcmpgtw %%mm4, %%mm3\n\t"
-        "pcmpgtw %%mm5, %%mm1\n\t"
-        "pxor %%mm3, %%mm4\n"
-        "pxor %%mm1, %%mm5\n"
-        "psubw %%mm3, %%mm4\n"
-        "psubw %%mm1, %%mm5\n"
-        "paddw %%mm4, %%mm5\n"
-        "paddw %%mm5, %%mm6\n"
-
-        "add %2, %0\n"
-
-        "movq (%0), %%mm4\n"
-        "movq 1(%0), %%mm1\n"
-        "movq %%mm4, %%mm5\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm4\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm5\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm4\n"
-        "psubw %%mm3, %%mm5\n"
-        "psubw %%mm4, %%mm0\n"
-        "psubw %%mm5, %%mm2\n"
-        "pxor %%mm3, %%mm3\n"
-        "pxor %%mm1, %%mm1\n"
-        "pcmpgtw %%mm0, %%mm3\n\t"
-        "pcmpgtw %%mm2, %%mm1\n\t"
-        "pxor %%mm3, %%mm0\n"
-        "pxor %%mm1, %%mm2\n"
-        "psubw %%mm3, %%mm0\n"
-        "psubw %%mm1, %%mm2\n"
-        "paddw %%mm0, %%mm2\n"
-        "paddw %%mm2, %%mm6\n"
-
-        "add %2, %0\n"
-        "subl $2, %%ecx\n"
-        " jnz 1b\n"
-
-        "movq %%mm6, %%mm0\n"
-        "punpcklwd %%mm7, %%mm0\n"
-        "punpckhwd %%mm7, %%mm6\n"
-        "paddd %%mm0, %%mm6\n"
+int ff_sum_abs_dctelem_mmx(int16_t *block);
+int ff_sum_abs_dctelem_mmxext(int16_t *block);
+int ff_sum_abs_dctelem_sse2(int16_t *block);
+int ff_sum_abs_dctelem_ssse3(int16_t *block);
+int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                ptrdiff_t stride, int h);
+int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                 ptrdiff_t stride, int h);
+int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                  ptrdiff_t stride, int h);
+int ff_hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
+int ff_hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
+int ff_sad8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                   ptrdiff_t stride, int h);
+int ff_sad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                    ptrdiff_t stride, int h);
+int ff_sad16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                  ptrdiff_t stride, int h);
+int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                      ptrdiff_t stride, int h);
+int ff_sad16_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                       ptrdiff_t stride, int h);
+int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                     ptrdiff_t stride, int h);
+int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                      ptrdiff_t stride, int h);
+int ff_sad16_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                       ptrdiff_t stride, int h);
+int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                     ptrdiff_t stride, int h);
+int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                              ptrdiff_t stride, int h);
+int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                               ptrdiff_t stride, int h);
+int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                             ptrdiff_t stride, int h);
+int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                          ptrdiff_t stride, int h);
+int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                           ptrdiff_t stride, int h);
+int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                         ptrdiff_t stride, int h);
+int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                    ptrdiff_t stride, int h);
+int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                     ptrdiff_t stride, int h);
+int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                   ptrdiff_t stride, int h);
 
-        "movq %%mm6, %%mm0\n"
-        "psrlq $32, %%mm6\n"
-        "paddd %%mm6, %%mm0\n"
-        "movd %%mm0, %1\n"
-        : "+r" (pix1), "=r" (tmp)
-        : "r" (stride), "g" (h - 2)
-        : "%ecx");
+#define hadamard_func(cpu)                                                    \
+    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,           \
+                                  uint8_t *src2, ptrdiff_t stride, int h);    \
+    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,         \
+                                    uint8_t *src2, ptrdiff_t stride, int h);
 
-    return tmp + hf_noise8_mmx(pix + 8, stride, h);
-}
+hadamard_func(mmx)
+hadamard_func(mmxext)
+hadamard_func(sse2)
+hadamard_func(ssse3)
 
+#if HAVE_YASM
 static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h)
 {
@@ -413,9 +98,9 @@ static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
     if (c)
         score1 = c->mecc.sse[0](c, pix1, pix2, stride, h);
     else
-        score1 = sse16_mmx(c, pix1, pix2, stride, h);
-    score2 = hf_noise16_mmx(pix1, stride, h) -
-             hf_noise16_mmx(pix2, stride, h);
+        score1 = ff_sse16_mmx(c, pix1, pix2, stride, h);
+    score2 = ff_hf_noise16_mmx(pix1, stride, h) + ff_hf_noise8_mmx(pix1+8, stride, h)
+           - ff_hf_noise16_mmx(pix2, stride, h) - ff_hf_noise8_mmx(pix2+8, stride, h);
 
     if (c)
         return score1 + FFABS(score2) * c->avctx->nsse_weight;
@@ -426,9 +111,9 @@ static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
 static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h)
 {
-    int score1 = sse8_mmx(c, pix1, pix2, stride, h);
-    int score2 = hf_noise8_mmx(pix1, stride, h) -
-                 hf_noise8_mmx(pix2, stride, h);
+    int score1 = ff_sse8_mmx(c, pix1, pix2, stride, h);
+    int score2 = ff_hf_noise8_mmx(pix1, stride, h) -
+                 ff_hf_noise8_mmx(pix2, stride, h);
 
     if (c)
         return score1 + FFABS(score2) * c->avctx->nsse_weight;
@@ -436,13 +121,17 @@ static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
         return score1 + FFABS(score2) * 8;
 }
 
+#endif /* HAVE_YASM */
+
+#if HAVE_INLINE_ASM
+
 static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                             ptrdiff_t stride, int h)
 {
     int tmp;
 
-    assert((((int) pix) & 7) == 0);
-    assert((stride & 7) == 0);
+    av_assert2((((int) pix) & 7) == 0);
+    av_assert2((stride & 7) == 0);
 
 #define SUM(in0, in1, out0, out1)               \
     "movq (%0), %%mm2\n"                        \
@@ -500,57 +189,14 @@ static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
 }
 #undef SUM
 
-static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
-                               ptrdiff_t stride, int h)
-{
-    int tmp;
-
-    assert((((int) pix) & 7) == 0);
-    assert((stride & 7) == 0);
-
-#define SUM(in0, in1, out0, out1)               \
-    "movq (%0), " #out0 "\n"                    \
-    "movq 8(%0), " #out1 "\n"                   \
-    "add %2, %0\n"                              \
-    "psadbw " #out0 ", " #in0 "\n"              \
-    "psadbw " #out1 ", " #in1 "\n"              \
-    "paddw " #in1 ", " #in0 "\n"                \
-    "paddw " #in0 ", %%mm6\n"
-
-    __asm__ volatile (
-        "movl %3, %%ecx\n"
-        "pxor %%mm6, %%mm6\n"
-        "pxor %%mm7, %%mm7\n"
-        "movq (%0), %%mm0\n"
-        "movq 8(%0), %%mm1\n"
-        "add %2, %0\n"
-        "jmp 2f\n"
-        "1:\n"
-
-        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
-        "2:\n"
-        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
-
-        "subl $2, %%ecx\n"
-        "jnz 1b\n"
-
-        "movd %%mm6, %1\n"
-        : "+r" (pix), "=r" (tmp)
-        : "r" (stride), "m" (h)
-        : "%ecx");
-
-    return tmp;
-}
-#undef SUM
-
 static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h)
 {
     int tmp;
 
-    assert((((int) pix1) & 7) == 0);
-    assert((((int) pix2) & 7) == 0);
-    assert((stride & 7) == 0);
+    av_assert2((((int) pix1) & 7) == 0);
+    av_assert2((((int) pix2) & 7) == 0);
+    av_assert2((stride & 7) == 0);
 
 #define SUM(in0, in1, out0, out1)       \
     "movq (%0), %%mm2\n"                \
@@ -624,191 +270,16 @@ static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 }
 #undef SUM
 
-static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                         ptrdiff_t stride, int h)
-{
-    int tmp;
-
-    assert((((int) pix1) & 7) == 0);
-    assert((((int) pix2) & 7) == 0);
-    assert((stride & 7) == 0);
-
-#define SUM(in0, in1, out0, out1)               \
-    "movq (%0), " #out0 "\n"                    \
-    "movq (%1), %%mm2\n"                        \
-    "movq 8(%0), " #out1 "\n"                   \
-    "movq 8(%1), %%mm3\n"                       \
-    "add %3, %0\n"                              \
-    "add %3, %1\n"                              \
-    "psubb %%mm2, " #out0 "\n"                  \
-    "psubb %%mm3, " #out1 "\n"                  \
-    "pxor %%mm7, " #out0 "\n"                   \
-    "pxor %%mm7, " #out1 "\n"                   \
-    "psadbw " #out0 ", " #in0 "\n"              \
-    "psadbw " #out1 ", " #in1 "\n"              \
-    "paddw " #in1 ", " #in0 "\n"                \
-    "paddw " #in0 ", %%mm6\n    "
-
-    __asm__ volatile (
-        "movl %4, %%ecx\n"
-        "pxor %%mm6, %%mm6\n"
-        "pcmpeqw %%mm7, %%mm7\n"
-        "psllw $15, %%mm7\n"
-        "packsswb %%mm7, %%mm7\n"
-        "movq (%0), %%mm0\n"
-        "movq (%1), %%mm2\n"
-        "movq 8(%0), %%mm1\n"
-        "movq 8(%1), %%mm3\n"
-        "add %3, %0\n"
-        "add %3, %1\n"
-        "psubb %%mm2, %%mm0\n"
-        "psubb %%mm3, %%mm1\n"
-        "pxor %%mm7, %%mm0\n"
-        "pxor %%mm7, %%mm1\n"
-        "jmp 2f\n"
-        "1:\n"
-
-        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
-        "2:\n"
-        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
-
-        "subl $2, %%ecx\n"
-        "jnz 1b\n"
-
-        "movd %%mm6, %2\n"
-        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
-        : "r" (stride), "m" (h)
-        : "%ecx");
-
-    return tmp;
-}
-#undef SUM
-
-#define MMABS_MMX(a,z)                          \
-    "pxor "    #z ", " #z "             \n\t"   \
-    "pcmpgtw " #a ", " #z "             \n\t"   \
-    "pxor "    #z ", " #a "             \n\t"   \
-    "psubw "   #z ", " #a "             \n\t"
-
-#define MMABS_MMXEXT(a, z)                      \
-    "pxor "    #z ", " #z "             \n\t"   \
-    "psubw "   #a ", " #z "             \n\t"   \
-    "pmaxsw "  #z ", " #a "             \n\t"
-
-#define MMABS_SSSE3(a,z)                        \
-    "pabsw "   #a ", " #a "             \n\t"
-
-#define MMABS_SUM(a,z, sum)                     \
-    MMABS(a,z)                                  \
-    "paddusw " #a ", " #sum "           \n\t"
-
-/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get
- * up to about 100k on extreme inputs. But that's very unlikely to occur in
- * natural video, and it's even more unlikely to not have any alternative
- * mvs/modes with lower cost. */
-#define HSUM_MMX(a, t, dst)                     \
-    "movq    " #a ", " #t "             \n\t"   \
-    "psrlq      $32, " #a "             \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "movq    " #a ", " #t "             \n\t"   \
-    "psrlq      $16, " #a "             \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "movd    " #a ", " #dst "           \n\t"   \
-
-#define HSUM_MMXEXT(a, t, dst)                  \
-    "pshufw   $0x0E, " #a ", " #t "     \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "pshufw   $0x01, " #a ", " #t "     \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "movd    " #a ", " #dst "           \n\t"   \
-
-#define HSUM_SSE2(a, t, dst)                    \
-    "movhlps " #a ", " #t "             \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "pshuflw  $0x0E, " #a ", " #t "     \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "pshuflw  $0x01, " #a ", " #t "     \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "movd    " #a ", " #dst "           \n\t"   \
-
-#define DCT_SAD4(m, mm, o)                      \
-    "mov"#m" "#o" +  0(%1), " #mm "2    \n\t"   \
-    "mov"#m" "#o" + 16(%1), " #mm "3    \n\t"   \
-    "mov"#m" "#o" + 32(%1), " #mm "4    \n\t"   \
-    "mov"#m" "#o" + 48(%1), " #mm "5    \n\t"   \
-    MMABS_SUM(mm ## 2, mm ## 6, mm ## 0)        \
-    MMABS_SUM(mm ## 3, mm ## 7, mm ## 1)        \
-    MMABS_SUM(mm ## 4, mm ## 6, mm ## 0)        \
-    MMABS_SUM(mm ## 5, mm ## 7, mm ## 1)        \
-
-#define DCT_SAD_MMX                             \
-    "pxor    %%mm0, %%mm0               \n\t"   \
-    "pxor    %%mm1, %%mm1               \n\t"   \
-    DCT_SAD4(q, %%mm, 0)                        \
-    DCT_SAD4(q, %%mm, 8)                        \
-    DCT_SAD4(q, %%mm, 64)                       \
-    DCT_SAD4(q, %%mm, 72)                       \
-    "paddusw %%mm1, %%mm0               \n\t"   \
-    HSUM(%%mm0, %%mm1, %0)
-
-#define DCT_SAD_SSE2                            \
-    "pxor    %%xmm0, %%xmm0             \n\t"   \
-    "pxor    %%xmm1, %%xmm1             \n\t"   \
-    DCT_SAD4(dqa, %%xmm, 0)                     \
-    DCT_SAD4(dqa, %%xmm, 64)                    \
-    "paddusw %%xmm1, %%xmm0             \n\t"   \
-    HSUM(%%xmm0, %%xmm1, %0)
-
-#define DCT_SAD_FUNC(cpu)                           \
-static int sum_abs_dctelem_ ## cpu(int16_t *block)  \
-{                                                   \
-    int sum;                                        \
-    __asm__ volatile (                              \
-        DCT_SAD                                     \
-        :"=r"(sum)                                  \
-        :"r"(block));                               \
-    return sum & 0xFFFF;                            \
-}
-
-#define DCT_SAD         DCT_SAD_MMX
-#define HSUM(a, t, dst) HSUM_MMX(a, t, dst)
-#define MMABS(a, z)     MMABS_MMX(a, z)
-DCT_SAD_FUNC(mmx)
-#undef MMABS
-#undef HSUM
-
-#define HSUM(a, t, dst) HSUM_MMXEXT(a, t, dst)
-#define MMABS(a, z)     MMABS_MMXEXT(a, z)
-DCT_SAD_FUNC(mmxext)
-#undef HSUM
-#undef DCT_SAD
-
-#define DCT_SAD         DCT_SAD_SSE2
-#define HSUM(a, t, dst) HSUM_SSE2(a, t, dst)
-DCT_SAD_FUNC(sse2)
-#undef MMABS
-
-#if HAVE_SSSE3_INLINE
-#define MMABS(a, z)     MMABS_SSSE3(a, z)
-DCT_SAD_FUNC(ssse3)
-#undef MMABS
-#endif
-#undef HSUM
-#undef DCT_SAD
-
-
 DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
     0x0000000000000000ULL,
     0x0001000100010001ULL,
     0x0002000200020002ULL,
 };
 
-DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL;
-
 static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
                               ptrdiff_t stride, int h)
 {
-    x86_reg len = -(stride * h);
+    x86_reg len = -stride * h;
     __asm__ volatile (
         ".p2align 4                     \n\t"
         "1:                             \n\t"
@@ -841,133 +312,10 @@ static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
         : "r" (blk1 - len), "r" (blk2 - len), "r" (stride));
 }
 
-static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
-                                 ptrdiff_t stride, int h)
-{
-    __asm__ volatile (
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movq (%1), %%mm0               \n\t"
-        "movq (%1, %3), %%mm1           \n\t"
-        "psadbw (%2), %%mm0             \n\t"
-        "psadbw (%2, %3), %%mm1         \n\t"
-        "paddw %%mm0, %%mm6             \n\t"
-        "paddw %%mm1, %%mm6             \n\t"
-        "lea (%1,%3,2), %1              \n\t"
-        "lea (%2,%3,2), %2              \n\t"
-        "sub $2, %0                     \n\t"
-        " jg 1b                         \n\t"
-        : "+r" (h), "+r" (blk1), "+r" (blk2)
-        : "r" (stride));
-}
-
-static int sad16_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
-                      ptrdiff_t stride, int h)
-{
-    int ret;
-    __asm__ volatile (
-        "pxor %%xmm2, %%xmm2            \n\t"
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movdqu (%1), %%xmm0            \n\t"
-        "movdqu (%1, %4), %%xmm1        \n\t"
-        "psadbw (%2), %%xmm0            \n\t"
-        "psadbw (%2, %4), %%xmm1        \n\t"
-        "paddw %%xmm0, %%xmm2           \n\t"
-        "paddw %%xmm1, %%xmm2           \n\t"
-        "lea (%1,%4,2), %1              \n\t"
-        "lea (%2,%4,2), %2              \n\t"
-        "sub $2, %0                     \n\t"
-        " jg 1b                         \n\t"
-        "movhlps %%xmm2, %%xmm0         \n\t"
-        "paddw   %%xmm0, %%xmm2         \n\t"
-        "movd    %%xmm2, %3             \n\t"
-        : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret)
-        : "r" (stride));
-    return ret;
-}
-
-static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
-                                   ptrdiff_t stride, int h)
-{
-    __asm__ volatile (
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movq (%1), %%mm0               \n\t"
-        "movq (%1, %3), %%mm1           \n\t"
-        "pavgb 1(%1), %%mm0             \n\t"
-        "pavgb 1(%1, %3), %%mm1         \n\t"
-        "psadbw (%2), %%mm0             \n\t"
-        "psadbw (%2, %3), %%mm1         \n\t"
-        "paddw %%mm0, %%mm6             \n\t"
-        "paddw %%mm1, %%mm6             \n\t"
-        "lea (%1,%3,2), %1              \n\t"
-        "lea (%2,%3,2), %2              \n\t"
-        "sub $2, %0                     \n\t"
-        " jg 1b                         \n\t"
-        : "+r" (h), "+r" (blk1), "+r" (blk2)
-        : "r" (stride));
-}
-
-static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
-                                   ptrdiff_t stride, int h)
-{
-    __asm__ volatile (
-        "movq (%1), %%mm0               \n\t"
-        "add %3, %1                     \n\t"
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movq (%1), %%mm1               \n\t"
-        "movq (%1, %3), %%mm2           \n\t"
-        "pavgb %%mm1, %%mm0             \n\t"
-        "pavgb %%mm2, %%mm1             \n\t"
-        "psadbw (%2), %%mm0             \n\t"
-        "psadbw (%2, %3), %%mm1         \n\t"
-        "paddw %%mm0, %%mm6             \n\t"
-        "paddw %%mm1, %%mm6             \n\t"
-        "movq %%mm2, %%mm0              \n\t"
-        "lea (%1,%3,2), %1              \n\t"
-        "lea (%2,%3,2), %2              \n\t"
-        "sub $2, %0                     \n\t"
-        " jg 1b                         \n\t"
-        : "+r" (h), "+r" (blk1), "+r" (blk2)
-        : "r" (stride));
-}
-
-static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
-                                 ptrdiff_t stride, int h)
-{
-    __asm__ volatile (
-        "movq "MANGLE(bone)", %%mm5     \n\t"
-        "movq (%1), %%mm0               \n\t"
-        "pavgb 1(%1), %%mm0             \n\t"
-        "add %3, %1                     \n\t"
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movq (%1), %%mm1               \n\t"
-        "movq (%1,%3), %%mm2            \n\t"
-        "pavgb 1(%1), %%mm1             \n\t"
-        "pavgb 1(%1,%3), %%mm2          \n\t"
-        "psubusb %%mm5, %%mm1           \n\t"
-        "pavgb %%mm1, %%mm0             \n\t"
-        "pavgb %%mm2, %%mm1             \n\t"
-        "psadbw (%2), %%mm0             \n\t"
-        "psadbw (%2,%3), %%mm1          \n\t"
-        "paddw %%mm0, %%mm6             \n\t"
-        "paddw %%mm1, %%mm6             \n\t"
-        "movq %%mm2, %%mm0              \n\t"
-        "lea (%1,%3,2), %1              \n\t"
-        "lea (%2,%3,2), %2              \n\t"
-        "sub $2, %0                     \n\t"
-        " jg 1b                         \n\t"
-        : "+r" (h), "+r" (blk1), "+r" (blk2)
-        : "r" (stride));
-}
-
 static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
                               ptrdiff_t stride, int h)
 {
-    x86_reg len = -(stride * h);
+    x86_reg len = -stride * h;
     __asm__ volatile (
         ".p2align 4                     \n\t"
         "1:                             \n\t"
@@ -1006,7 +354,7 @@ static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
 static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
                               ptrdiff_t stride, int h)
 {
-    x86_reg len = -(stride * h);
+    x86_reg len = -stride * h;
     __asm__ volatile (
         "movq  (%1, %%"REG_a"), %%mm0   \n\t"
         "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
@@ -1030,7 +378,7 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
         "punpckhbw %%mm7, %%mm5         \n\t"
         "paddw %%mm4, %%mm2             \n\t"
         "paddw %%mm5, %%mm3             \n\t"
-        "movq 16+"MANGLE(round_tab)", %%mm5 \n\t"
+        "movq %5, %%mm5                 \n\t"
         "paddw %%mm2, %%mm0             \n\t"
         "paddw %%mm3, %%mm1             \n\t"
         "paddw %%mm5, %%mm0             \n\t"
@@ -1054,7 +402,7 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
         " js 1b                         \n\t"
         : "+a" (len)
         : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
-          "r" (stride));
+          "r" (stride), "m" (round_tab[2]));
 }
 
 static inline int sum_mmx(void)
@@ -1072,15 +420,6 @@ static inline int sum_mmx(void)
     return ret & 0xFFFF;
 }
 
-static inline int sum_mmxext(void)
-{
-    int ret;
-    __asm__ volatile (
-        "movd %%mm6, %0                 \n\t"
-        : "=r" (ret));
-    return ret;
-}
-
 static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                 ptrdiff_t stride, int h)
 {
@@ -1097,7 +436,7 @@ static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2,
 static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2,               \
                         uint8_t *blk1, ptrdiff_t stride, int h)         \
 {                                                                       \
-    assert(h == 8);                                                     \
+    av_assert2(h == 8);                                                     \
     __asm__ volatile (                                                  \
         "pxor %%mm7, %%mm7     \n\t"                                    \
         "pxor %%mm6, %%mm6     \n\t"                                    \
@@ -1111,7 +450,7 @@ static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2,               \
 static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                            uint8_t *blk1, ptrdiff_t stride, int h)      \
 {                                                                       \
-    assert(h == 8);                                                     \
+    av_assert2(h == 8);                                                     \
     __asm__ volatile (                                                  \
         "pxor %%mm7, %%mm7     \n\t"                                    \
         "pxor %%mm6, %%mm6     \n\t"                                    \
@@ -1126,7 +465,7 @@ static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
 static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                            uint8_t *blk1, ptrdiff_t stride, int h)      \
 {                                                                       \
-    assert(h == 8);                                                     \
+    av_assert2(h == 8);                                                     \
     __asm__ volatile (                                                  \
         "pxor %%mm7, %%mm7     \n\t"                                    \
         "pxor %%mm6, %%mm6     \n\t"                                    \
@@ -1141,7 +480,7 @@ static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
 static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                             uint8_t *blk1, ptrdiff_t stride, int h)     \
 {                                                                       \
-    assert(h == 8);                                                     \
+    av_assert2(h == 8);                                                     \
     __asm__ volatile (                                                  \
         "pxor %%mm7, %%mm7     \n\t"                                    \
         "pxor %%mm6, %%mm6     \n\t"                                    \
@@ -1211,32 +550,15 @@ static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
 }                                                                       \
 
 PIX_SAD(mmx)
-PIX_SAD(mmxext)
 
 #endif /* HAVE_INLINE_ASM */
 
-int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                  ptrdiff_t stride, int h);
-
-#define hadamard_func(cpu)                                                    \
-    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,           \
-                                  uint8_t *src2, ptrdiff_t stride, int h);    \
-    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,         \
-                                    uint8_t *src2, ptrdiff_t stride, int h);
-
-hadamard_func(mmx)
-hadamard_func(mmxext)
-hadamard_func(sse2)
-hadamard_func(ssse3)
-
 av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
 {
     int cpu_flags = av_get_cpu_flags();
 
 #if HAVE_INLINE_ASM
     if (INLINE_MMX(cpu_flags)) {
-        c->sum_abs_dctelem = sum_abs_dctelem_mmx;
-
         c->pix_abs[0][0] = sad16_mmx;
         c->pix_abs[0][1] = sad16_x2_mmx;
         c->pix_abs[0][2] = sad16_y2_mmx;
@@ -1249,77 +571,81 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
         c->sad[0] = sad16_mmx;
         c->sad[1] = sad8_mmx;
 
-        c->sse[0]  = sse16_mmx;
-        c->sse[1]  = sse8_mmx;
         c->vsad[4] = vsad_intra16_mmx;
 
-        c->nsse[0] = nsse16_mmx;
-        c->nsse[1] = nsse8_mmx;
-
         if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
             c->vsad[0] = vsad16_mmx;
         }
     }
 
-    if (INLINE_MMXEXT(cpu_flags)) {
-        c->sum_abs_dctelem = sum_abs_dctelem_mmxext;
-
-        c->vsad[4] = vsad_intra16_mmxext;
-
-        c->pix_abs[0][0] = sad16_mmxext;
-        c->pix_abs[1][0] = sad8_mmxext;
-
-        c->sad[0] = sad16_mmxext;
-        c->sad[1] = sad8_mmxext;
-
-        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
-            c->pix_abs[0][1] = sad16_x2_mmxext;
-            c->pix_abs[0][2] = sad16_y2_mmxext;
-            c->pix_abs[0][3] = sad16_xy2_mmxext;
-            c->pix_abs[1][1] = sad8_x2_mmxext;
-            c->pix_abs[1][2] = sad8_y2_mmxext;
-            c->pix_abs[1][3] = sad8_xy2_mmxext;
-
-            c->vsad[0] = vsad16_mmxext;
-        }
-    }
-
-    if (INLINE_SSE2(cpu_flags)) {
-        c->sum_abs_dctelem = sum_abs_dctelem_sse2;
-    }
-
-    if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_3DNOW)) {
-        c->sad[0] = sad16_sse2;
-    }
-
-#if HAVE_SSSE3_INLINE
-    if (INLINE_SSSE3(cpu_flags)) {
-        c->sum_abs_dctelem = sum_abs_dctelem_ssse3;
-    }
-#endif
 #endif /* HAVE_INLINE_ASM */
 
     if (EXTERNAL_MMX(cpu_flags)) {
         c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
         c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
+        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmx;
+        c->sse[0]            = ff_sse16_mmx;
+        c->sse[1]            = ff_sse8_mmx;
+#if HAVE_YASM
+        c->nsse[0]           = nsse16_mmx;
+        c->nsse[1]           = nsse8_mmx;
+#endif
     }
 
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
         c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
+        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmxext;
+
+        c->sad[0] = ff_sad16_mmxext;
+        c->sad[1] = ff_sad8_mmxext;
+
+        c->pix_abs[0][0] = ff_sad16_mmxext;
+        c->pix_abs[0][1] = ff_sad16_x2_mmxext;
+        c->pix_abs[0][2] = ff_sad16_y2_mmxext;
+        c->pix_abs[1][0] = ff_sad8_mmxext;
+        c->pix_abs[1][1] = ff_sad8_x2_mmxext;
+        c->pix_abs[1][2] = ff_sad8_y2_mmxext;
+
+        c->vsad[4] = ff_vsad_intra16_mmxext;
+        c->vsad[5] = ff_vsad_intra8_mmxext;
+
+        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
+            c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext;
+            c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;
+
+            c->vsad[0] = ff_vsad16_approx_mmxext;
+            c->vsad[1] = ff_vsad8_approx_mmxext;
+        }
     }
 
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->sse[0] = ff_sse16_sse2;
+        c->sum_abs_dctelem   = ff_sum_abs_dctelem_sse2;
 
 #if HAVE_ALIGNED_STACK
         c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
         c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
 #endif
+        if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
+            c->sad[0]        = ff_sad16_sse2;
+            c->pix_abs[0][0] = ff_sad16_sse2;
+            c->pix_abs[0][1] = ff_sad16_x2_sse2;
+            c->pix_abs[0][2] = ff_sad16_y2_sse2;
+
+            c->vsad[4]       = ff_vsad_intra16_sse2;
+            if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
+                c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
+                c->vsad[0]       = ff_vsad16_approx_sse2;
+            }
+        }
     }
 
-    if (EXTERNAL_SSSE3(cpu_flags) && HAVE_ALIGNED_STACK) {
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        c->sum_abs_dctelem   = ff_sum_abs_dctelem_ssse3;
+#if HAVE_ALIGNED_STACK
         c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
         c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
+#endif
     }
 }
diff --git a/libavcodec/x86/mlpdsp.asm b/libavcodec/x86/mlpdsp.asm
new file mode 100644
index 0000000..3dc641e
--- /dev/null
+++ b/libavcodec/x86/mlpdsp.asm
@@ -0,0 +1,196 @@
+;******************************************************************************
+;* SIMD-optimized MLP DSP functions
+;* Copyright (c) 2014 James Almer <jamrial@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%if ARCH_X86_64
+
+%macro SHLX 2 ; shift %1 left in place; %2 = size-less name of the register holding the count
+%if cpuflag(bmi2)
+   shlx %1, %1, %2q                                 ; BMI2 form: the count may live in any gpr
+%else
+   shl  %1, %2b                                     ; legacy form: the count must be in cl, so %2 has to name rcx
+%endif
+%endmacro
+
+%macro REMATRIX 0 ; m0 = 64-bit partial sums of samples[i] * coeffs[i] over 8 dwords (two qwords left in xm0)
+    movdqa        m0, [samplesq]                    ; one full vector of samples (aligned load)
+    movdqa        m1, [coeffsq ]                    ; the matching vector of coefficients
+    pshufd        m2, m0, q2301                     ; swap dwords pairwise so odd samples sit in even slots (q2301 presumably imm 0xB1)
+    pshufd        m3, m1, q2301                     ; same pairwise swap for the coefficients
+    pmuldq        m0, m1                            ; signed 32x32->64 products of the even-indexed pairs
+    pmuldq        m3, m2                            ; signed 32x32->64 products of the odd-indexed pairs
+    paddq         m0, m3                            ; per-qword partial sums
+%if notcpuflag(avx2)
+    movdqa        m1, [samplesq + 16]               ; xmm build: repeat the steps above for the next 16 bytes
+    movdqa        m2, [coeffsq  + 16]
+    pshufd        m3, m1, q2301
+    pshufd        m4, m2, q2301
+    pmuldq        m1, m2                            ; even-indexed products of the second half
+    pmuldq        m4, m3                            ; odd-indexed products of the second half
+    paddq         m0, m1                            ; fold both into the running partial sums
+    paddq         m0, m4
+%else
+    vextracti128 xm1, m0, 1                         ; ymm build: the single load covered all 32 bytes, take the upper lane
+    paddq        xm0, xm1                           ; and fold it into the lower lane
+%endif
+%endmacro
+
+%macro LOOP_END 0 ; finish one sample row without noise; leaves flags from the final cmp for the caller's jne
+    pshufd       xm1, xm0, q0032                    ; bring the high qword down to the low qword (q0032 presumably imm 0x0E)
+    paddq        xm0, xm1                           ; low qword = sum of the two partial sums
+    movq      accumq, xm0                           ; accum = that 64-bit sum
+    movzx     blsbsd, byte [blsbs_ptrq]             ; load *bypassed_lsbs
+    sar       accumq, 14                            ; accum >>= 14
+    and       accumd, maskd                         ; accum &= mask
+    add       accumd, blsbsd                        ; accum += *bypassed_lsbs
+    mov   [samplesq + dest_chq], accumd             ; samples[dest_ch] = accum
+    add   blsbs_ptrq, 8                             ; bypassed_lsbs += MAX_CHANNELS;
+    add     samplesq, 32                            ; samples += MAX_CHANNELS;
+    cmp   blsbs_ptrq, cntq                          ; equal once the end pointer held in cnt is reached
+%endmacro
+
+%macro LOOP_SHIFT_END 0 ; like LOOP_END, but adds shifted noise_buffer[index] to the sum before the >> 14
+    pshufd       xm1, xm0, q0032                    ; bring the high qword down to the low qword (q0032 presumably imm 0x0E)
+    paddq        xm0, xm1                           ; low qword = sum of the two partial sums
+    movq      accumq, xm0                           ; accum = that 64-bit sum
+    and       indexd, auspd                         ; index &= access_unit_size_pow2 - 1 (ausp is decremented once before the loop)
+    movsx     noiseq, byte [noise_bufferq + indexq] ; load noise_buffer[index]
+    add       indexd, index2d                       ; index += index2
+    SHLX      noiseq, mns                           ; noise <<= matrix_noise_shift + 7 (mns is biased by 7 before the loop)
+    add       accumq, noiseq                        ; accum += noise_buffer[index]
+    movzx     noised, byte [blsbs_ptrq]             ; load *bypassed_lsbs (reuse tmp noise register)
+    sar       accumq, 14                            ; accum >>= 14
+    and       accumd, maskd                         ; accum &= mask
+    add       accumd, noised                        ; accum += *bypassed_lsbs
+    mov   [samplesq + dest_chq], accumd             ; samples[dest_ch] = accum
+    add   blsbs_ptrq, 8                             ; bypassed_lsbs += MAX_CHANNELS;
+    add     samplesq, 32                            ; samples += MAX_CHANNELS;
+    cmp   blsbs_ptrq, cntq                          ; equal once the end pointer held in cnt is reached
+%endmacro
+
+;void ff_mlp_rematrix_channel(int32_t *samples, const int32_t *coeffs,
+;                             const uint8_t *bypassed_lsbs, const int8_t *noise_buffer,
+;                             int index, unsigned int dest_ch, uint16_t blockpos,
+;                             unsigned int maxchan, int matrix_noise_shift,
+;                             int access_unit_size_pow2, int32_t mask)
+%macro MLP_REMATRIX_CHANNEL 0 ; emits one mlp_rematrix_channel function for the instruction set currently selected by INIT_*
+cglobal mlp_rematrix_channel, 0, 13, 5, samples, coeffs, blsbs_ptr, blsbs, \
+                                        index, dest_ch, blockpos, maxchan, mns, \
+                                        accum, mask, cnt
+    mov         mnsd, mnsm                          ; load matrix_noise_shift
+    movzx  blockposq, word blockposm                ; load and zero extend blockpos (16bit)
+    mov     maxchand, maxchanm                      ; load maxchan
+    mov        maskd, maskm                         ; load mask
+%if WIN64
+    mov     dest_chd, dest_chm                      ; load dest_ch (not needed on UNIX64)
+%endif
+    shl     dest_chd, 2                             ; dest_ch *= 4: byte offset of the int32 written by the loop macros
+    lea         cntq, [blsbs_ptrq + blockposq*8]    ; cnt = bypassed_lsbs + blockpos * 8, the loop end pointer
+    test        mnsd, mnsd                          ; is matrix_noise_shift != 0?
+    jne .shift                                      ; jump if true
+    cmp     maxchand, 4                             ; is maxchan < 4?
+    jl .loop4                                       ; jump if true
+
+align 16
+.loop8:
+    ; Process 5 or more channels
+    REMATRIX
+    LOOP_END
+    jne .loop8                                      ; repeat until blsbs_ptr == cnt
+    RET
+
+align 16
+.loop4:
+    ; Process up to 4 channels
+    movdqa       xm0, [samplesq]                    ; first 4 samples only
+    movdqa       xm1, [coeffsq ]                    ; first 4 coefficients only
+    pshufd       xm2, xm0, q2301                    ; odd samples into even slots
+    pshufd       xm3, xm1, q2301                    ; odd coefficients into even slots
+    pmuldq       xm0, xm1                           ; even-indexed 64-bit products
+    pmuldq       xm3, xm2                           ; odd-indexed 64-bit products
+    paddq        xm0, xm3                           ; two qword partial sums, as LOOP_END expects
+    LOOP_END
+    jne .loop4                                      ; repeat until blsbs_ptr == cnt
+    RET
+
+.shift:
+%if WIN64
+    mov       indexd, indexm         ; load index (not needed on UNIX64)
+%endif
+    mov          r9d, r9m            ; load access_unit_size_pow2
+%if cpuflag(bmi2)
+    ; bmi2 has shift functions that accept any gpr, not just cl, so keep things in place.
+    DEFINE_ARGS samples, coeffs, blsbs_ptr, noise_buffer, \
+                index, dest_ch, accum, index2, mns, \
+                ausp, mask, cnt, noise
+    add         mnsd, 7              ; matrix_noise_shift += 7
+%else ; sse4
+    mov           r6, rcx            ; move rcx elsewhere so we can use cl for matrix_noise_shift
+%if WIN64
+    ; r0 = rcx
+    DEFINE_ARGS mns, coeffs, blsbs_ptr, noise_buffer, index, dest_ch, samples, \
+                index2, accum, ausp, mask, cnt, noise
+%else ; UNIX64
+    ; r3 = rcx
+    DEFINE_ARGS samples, coeffs, blsbs_ptr, mns, index, dest_ch, noise_buffer, \
+                index2, accum, ausp, mask, cnt, noise
+%endif
+    lea         mnsd, [r8 + 7]       ; rcx = matrix_noise_shift + 7
+%endif ; cpuflag
+    sub        auspd, 1              ; access_unit_size_pow2 -= 1
+    cmp          r7d, 4              ; is maxchan < 4?
+    lea      index2q, [indexq*2 + 1] ; index2 = 2 * index + 1;
+    jl .loop4_shift                  ; jump if maxchan < 4
+
+align 16
+.loop8_shift:
+    ; Process 5 or more channels
+    REMATRIX
+    LOOP_SHIFT_END
+    jne .loop8_shift                 ; repeat until blsbs_ptr == cnt
+    RET
+
+align 16
+.loop4_shift:
+    ; Process up to 4 channels
+    movdqa       xm0, [samplesq]                    ; first 4 samples only
+    movdqa       xm1, [coeffsq ]                    ; first 4 coefficients only
+    pshufd       xm2, xm0, q2301                    ; odd samples into even slots
+    pshufd       xm3, xm1, q2301                    ; odd coefficients into even slots
+    pmuldq       xm0, xm1                           ; even-indexed 64-bit products
+    pmuldq       xm3, xm2                           ; odd-indexed 64-bit products
+    paddq        xm0, xm3                           ; two qword partial sums, as LOOP_SHIFT_END expects
+    LOOP_SHIFT_END
+    jne .loop4_shift                 ; repeat until blsbs_ptr == cnt
+    RET
+%endmacro
+
+INIT_XMM sse4                                       ; 128-bit registers, SSE4 flags: the notcpuflag(avx2) and non-bmi2 paths are assembled
+MLP_REMATRIX_CHANNEL
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2, bmi2                                 ; 256-bit registers with AVX2 and BMI2 flags: the vextracti128 and shlx paths are assembled
+MLP_REMATRIX_CHANNEL
+%endif
+
+%endif ; ARCH_X86_64
diff --git a/libavcodec/x86/mlpdsp.c b/libavcodec/x86/mlpdsp_init.c
index 72fc637..7f5e6b1 100644
--- a/libavcodec/x86/mlpdsp.c
+++ b/libavcodec/x86/mlpdsp_init.c
@@ -2,32 +2,47 @@
  * MLP DSP functions x86-optimized
  * Copyright (c) 2009 Ramiro Polla
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
-#include "libavutil/internal.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/mlpdsp.h"
 #include "libavcodec/mlp.h"
 
-#if HAVE_7REGS && HAVE_INLINE_ASM
+#define REMATRIX_CHANNEL_FUNC(opt) /* prototype of ff_mlp_rematrix_channel_<opt>; no body is defined here */ \
+void ff_mlp_rematrix_channel_##opt(int32_t *samples, \
+                                   const int32_t *coeffs, \
+                                   const uint8_t *bypassed_lsbs, \
+                                   const int8_t *noise_buffer, \
+                                   int index, \
+                                   unsigned int dest_ch, \
+                                   uint16_t blockpos, \
+                                   unsigned int maxchan, \
+                                   int matrix_noise_shift, \
+                                   int access_unit_size_pow2, \
+                                   int32_t mask);
+
+REMATRIX_CHANNEL_FUNC(sse4)      /* declares ff_mlp_rematrix_channel_sse4 (presumably the INIT_XMM sse4 build in mlpdsp.asm) */
+REMATRIX_CHANNEL_FUNC(avx2_bmi2) /* declares ff_mlp_rematrix_channel_avx2_bmi2 (presumably the INIT_YMM avx2, bmi2 build) */
+
+#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
 
 extern char ff_mlp_firorder_8;
 extern char ff_mlp_firorder_7;
@@ -45,12 +60,12 @@ extern char ff_mlp_iirorder_2;
 extern char ff_mlp_iirorder_1;
 extern char ff_mlp_iirorder_0;
 
-static const void *firtable[9] = { &ff_mlp_firorder_0, &ff_mlp_firorder_1,
+static const void * const firtable[9] = { &ff_mlp_firorder_0, &ff_mlp_firorder_1,
                                    &ff_mlp_firorder_2, &ff_mlp_firorder_3,
                                    &ff_mlp_firorder_4, &ff_mlp_firorder_5,
                                    &ff_mlp_firorder_6, &ff_mlp_firorder_7,
                                    &ff_mlp_firorder_8 };
-static const void *iirtable[5] = { &ff_mlp_iirorder_0, &ff_mlp_iirorder_1,
+static const void * const iirtable[5] = { &ff_mlp_iirorder_0, &ff_mlp_iirorder_1,
                                    &ff_mlp_iirorder_2, &ff_mlp_iirorder_3,
                                    &ff_mlp_iirorder_4 };
 
@@ -133,8 +148,8 @@ static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
         FIRMUL   (ff_mlp_firorder_6, 0x14   )
         FIRMUL   (ff_mlp_firorder_5, 0x10   )
         FIRMUL   (ff_mlp_firorder_4, 0x0c   )
-        FIRMULREG(ff_mlp_firorder_3, 0x08,10)
-        FIRMULREG(ff_mlp_firorder_2, 0x04, 9)
+        FIRMUL   (ff_mlp_firorder_3, 0x08   )
+        FIRMUL   (ff_mlp_firorder_2, 0x04   )
         FIRMULREG(ff_mlp_firorder_1, 0x00, 8)
         LABEL_MANGLE(ff_mlp_firorder_0)":\n\t"
         "jmp  *%6                     \n\t"
@@ -163,8 +178,6 @@ static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
         : /* 4*/"r"((x86_reg)mask), /* 5*/"r"(firjump),
           /* 6*/"r"(iirjump)      , /* 7*/"c"(filter_shift)
         , /* 8*/"r"((int64_t)coeff[0])
-        , /* 9*/"r"((int64_t)coeff[1])
-        , /*10*/"r"((int64_t)coeff[2])
         : "rax", "rdx", "rsi"
 #else /* ARCH_X86_32 */
           /* 3*/"+m"(blocksize)
@@ -179,9 +192,13 @@ static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
 
 av_cold void ff_mlpdsp_init_x86(MLPDSPContext *c)
 {
-#if HAVE_7REGS && HAVE_INLINE_ASM
     int cpu_flags = av_get_cpu_flags();
+#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
     if (INLINE_MMX(cpu_flags))
         c->mlp_filter_channel = mlp_filter_channel_x86;
 #endif
+    if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags))
+        c->mlp_rematrix_channel = ff_mlp_rematrix_channel_sse4;
+    if (ARCH_X86_64 && EXTERNAL_AVX2_FAST(cpu_flags) && cpu_flags & AV_CPU_FLAG_BMI2)
+        c->mlp_rematrix_channel = ff_mlp_rematrix_channel_avx2_bmi2;
 }
diff --git a/libavcodec/x86/mpegaudiodsp.c b/libavcodec/x86/mpegaudiodsp.c
index 533b4a7..d969f1d 100644
--- a/libavcodec/x86/mpegaudiodsp.c
+++ b/libavcodec/x86/mpegaudiodsp.c
@@ -2,20 +2,20 @@
  * SIMD-optimized MP3 decoding functions
  * Copyright (c) 2010 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,11 +26,20 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/mpegaudiodsp.h"
 
-void ff_imdct36_float_sse(float *out, float *buf, float *in, float *win);
-void ff_imdct36_float_sse2(float *out, float *buf, float *in, float *win);
-void ff_imdct36_float_sse3(float *out, float *buf, float *in, float *win);
-void ff_imdct36_float_ssse3(float *out, float *buf, float *in, float *win);
-void ff_imdct36_float_avx(float *out, float *buf, float *in, float *win);
+#define DECL(CPU)\
+static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
+void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);
+
+#if HAVE_YASM
+#if ARCH_X86_32
+DECL(sse)
+#endif
+DECL(sse2)
+DECL(sse3)
+DECL(ssse3)
+DECL(avx)
+#endif /* HAVE_YASM */
+
 void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
                                float *tmpbuf);
 void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
@@ -38,7 +47,7 @@ void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
 
 DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
 
-#if HAVE_SSE2_INLINE
+#if HAVE_6REGS && HAVE_SSE_INLINE
 
 #define MACS(rt, ra, rb) rt+=(ra)*(rb)
 #define MLSS(rt, ra, rb) rt-=(ra)*(rb)
@@ -182,7 +191,7 @@ static void apply_window_mp3(float *in, float *win, int *unused, float *out,
     *out = sum;
 }
 
-#endif /* HAVE_SSE2_INLINE */
+#endif /* HAVE_6REGS && HAVE_SSE_INLINE */
 
 #if HAVE_YASM
 #define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                       \
@@ -217,16 +226,22 @@ static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
     }                                                                   \
 }
 
+#if HAVE_SSE
+#if ARCH_X86_32
 DECL_IMDCT_BLOCKS(sse,sse)
+#endif
 DECL_IMDCT_BLOCKS(sse2,sse)
 DECL_IMDCT_BLOCKS(sse3,sse)
 DECL_IMDCT_BLOCKS(ssse3,sse)
+#endif
+#if HAVE_AVX_EXTERNAL
 DECL_IMDCT_BLOCKS(avx,avx)
+#endif
 #endif /* HAVE_YASM */
 
 av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
 {
-    int cpu_flags = av_get_cpu_flags();
+    av_unused int cpu_flags = av_get_cpu_flags();
 
     int i, j;
     for (j = 0; j < 4; j++) {
@@ -242,16 +257,19 @@ av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
         }
     }
 
-#if HAVE_SSE2_INLINE
-    if (INLINE_SSE2(cpu_flags)) {
+#if HAVE_6REGS && HAVE_SSE_INLINE
+    if (INLINE_SSE(cpu_flags)) {
         s->apply_window_float = apply_window_mp3;
     }
-#endif /* HAVE_SSE2_INLINE */
+#endif /* HAVE_6REGS && HAVE_SSE_INLINE */
 
 #if HAVE_YASM
+#if HAVE_SSE
+#if ARCH_X86_32
     if (EXTERNAL_SSE(cpu_flags)) {
         s->imdct36_blocks_float = imdct36_blocks_sse;
     }
+#endif
     if (EXTERNAL_SSE2(cpu_flags)) {
         s->imdct36_blocks_float = imdct36_blocks_sse2;
     }
@@ -261,8 +279,11 @@ av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
     if (EXTERNAL_SSSE3(cpu_flags)) {
         s->imdct36_blocks_float = imdct36_blocks_ssse3;
     }
+#endif
+#if HAVE_AVX_EXTERNAL
     if (EXTERNAL_AVX(cpu_flags)) {
         s->imdct36_blocks_float = imdct36_blocks_avx;
     }
+#endif
 #endif /* HAVE_YASM */
 }
diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index 33d5cd8..1811326 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -2,20 +2,20 @@
  * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru>
  * H.263, MPEG-1, MPEG-2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,8 +25,9 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/mpegvideo.h"
+#include "libavcodec/mpegvideodata.h"
 
-#if HAVE_INLINE_ASM
+#if HAVE_MMX_INLINE
 
 static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
                                   int16_t *block, int n, int qscale)
@@ -35,7 +36,7 @@ static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
 
     qmul = qscale << 1;
 
-    assert(s->block_last_index[n]>=0 || s->h263_aic);
+    av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
 
     if (!s->h263_aic) {
         if (n < 4)
@@ -111,7 +112,7 @@ static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
     qmul = qscale << 1;
     qadd = (qscale - 1) | 1;
 
-    assert(s->block_last_index[n]>=0 || s->h263_aic);
+    av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
 
     nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
 
@@ -171,7 +172,7 @@ static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
     const uint16_t *quant_matrix;
     int block0;
 
-    assert(s->block_last_index[n]>=0);
+    av_assert2(s->block_last_index[n]>=0);
 
     nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
 
@@ -239,7 +240,7 @@ static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
     x86_reg nCoeffs;
     const uint16_t *quant_matrix;
 
-    assert(s->block_last_index[n]>=0);
+    av_assert2(s->block_last_index[n]>=0);
 
     nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
 
@@ -306,7 +307,10 @@ static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
     const uint16_t *quant_matrix;
     int block0;
 
-    assert(s->block_last_index[n]>=0);
+    av_assert2(s->block_last_index[n]>=0);
+
+    if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
+    else                 qscale <<= 1;
 
     if(s->alternate_scan) nCoeffs= 63; //FIXME
     else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
@@ -345,8 +349,8 @@ __asm__ volatile(
                 "pxor %%mm5, %%mm5              \n\t" // FIXME slow
                 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
                 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
-                "psraw $3, %%mm0                \n\t"
-                "psraw $3, %%mm1                \n\t"
+                "psraw $4, %%mm0                \n\t"
+                "psraw $4, %%mm1                \n\t"
                 "pxor %%mm2, %%mm0              \n\t"
                 "pxor %%mm3, %%mm1              \n\t"
                 "psubw %%mm2, %%mm0             \n\t"
@@ -371,7 +375,10 @@ static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
     x86_reg nCoeffs;
     const uint16_t *quant_matrix;
 
-    assert(s->block_last_index[n]>=0);
+    av_assert2(s->block_last_index[n]>=0);
+
+    if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
+    else                 qscale <<= 1;
 
     if(s->alternate_scan) nCoeffs= 63; //FIXME
     else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
@@ -410,8 +417,8 @@ __asm__ volatile(
                 "pxor %%mm5, %%mm5              \n\t" // FIXME slow
                 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
                 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
-                "psrlw $4, %%mm0                \n\t"
-                "psrlw $4, %%mm1                \n\t"
+                "psrlw $5, %%mm0                \n\t"
+                "psrlw $5, %%mm1                \n\t"
                 "pxor %%mm2, %%mm0              \n\t"
                 "pxor %%mm3, %%mm1              \n\t"
                 "psubw %%mm2, %%mm0             \n\t"
@@ -442,11 +449,11 @@ __asm__ volatile(
         );
 }
 
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_MMX_INLINE */
 
 av_cold void ff_mpv_common_init_x86(MpegEncContext *s)
 {
-#if HAVE_INLINE_ASM
+#if HAVE_MMX_INLINE
     int cpu_flags = av_get_cpu_flags();
 
     if (INLINE_MMX(cpu_flags)) {
@@ -458,5 +465,5 @@ av_cold void ff_mpv_common_init_x86(MpegEncContext *s)
             s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
         s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
     }
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_MMX_INLINE */
 }
diff --git a/libavcodec/x86/mpegvideodsp.c b/libavcodec/x86/mpegvideodsp.c
index 0e5dd0f..941a8e2 100644
--- a/libavcodec/x86/mpegvideodsp.c
+++ b/libavcodec/x86/mpegvideodsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -22,6 +22,7 @@
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/mpegvideodsp.h"
+#include "libavcodec/videodsp.h"
 
 #if HAVE_INLINE_ASM
 
@@ -43,20 +44,24 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src,
     const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
     const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
     const uint64_t shift2  = 2 * shift;
+#define MAX_STRIDE 4096U
+#define MAX_H 8U
+    uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
     int x, y;
 
     const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
     const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
     const int dxh = dxy * (h - 1);
     const int dyw = dyx * (w - 1);
+    int need_emu  =  (unsigned) ix >= width  - w ||
+                     (unsigned) iy >= height - h;
 
     if ( // non-constant fullpel offset (3% of blocks)
         ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
          (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift) ||
         // uses more than 16 bits of subpel mv (only at huge resolution)
         (dxx | dxy | dyx | dyy) & 15 ||
-        (unsigned) ix >= width  - w ||
-        (unsigned) iy >= height - h) {
+        (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
         // FIXME could still use mmx for some of the rows
         ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                  shift, r, width, height);
@@ -64,6 +69,10 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src,
     }
 
     src += ix + iy * stride;
+    if (need_emu) {
+        ff_emulated_edge_mc_8(edge_buf, src, stride, stride, w + 1, h + 1, ix, iy, width, height);
+        src = edge_buf;
+    }
 
     __asm__ volatile (
         "movd         %0, %%mm6         \n\t"
@@ -150,4 +159,3 @@ av_cold void ff_mpegvideodsp_init_x86(MpegVideoDSPContext *c)
         c->gmc = gmc_mmx;
 #endif /* HAVE_INLINE_ASM */
 }
-
diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index 47349d1..67b2617 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -2,20 +2,20 @@
  * The simplest mpeg encoder (well, it was the simplest!)
  * Copyright (c) 2000,2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,6 +30,8 @@
 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
 DECLARE_ALIGNED(16, static uint16_t, inv_zigzag_direct16)[64];
 
+#if HAVE_6REGS
+
 #if HAVE_MMX_INLINE
 #define COMPILE_TEMPLATE_MMXEXT 0
 #define COMPILE_TEMPLATE_SSE2   0
@@ -81,7 +83,10 @@ DECLARE_ALIGNED(16, static uint16_t, inv_zigzag_direct16)[64];
 #include "mpegvideoenc_template.c"
 #endif /* HAVE_SSSE3_INLINE */
 
+#endif /* HAVE_6REGS */
+
 #if HAVE_INLINE_ASM
+#if HAVE_MMX_INLINE
 static void  denoise_dct_mmx(MpegEncContext *s, int16_t *block){
     const int intra= s->mb_intra;
     int *sum= s->dct_error_sum[intra];
@@ -135,7 +140,9 @@ static void  denoise_dct_mmx(MpegEncContext *s, int16_t *block){
         : "r"(block+64)
     );
 }
+#endif /* HAVE_MMX_INLINE */
 
+#if HAVE_SSE2_INLINE
 static void  denoise_dct_sse2(MpegEncContext *s, int16_t *block){
     const int intra= s->mb_intra;
     int *sum= s->dct_error_sum[intra];
@@ -191,9 +198,10 @@ static void  denoise_dct_sse2(MpegEncContext *s, int16_t *block){
                             "%xmm4", "%xmm5", "%xmm6", "%xmm7")
     );
 }
+#endif /* HAVE_SSE2_INLINE */
 #endif /* HAVE_INLINE_ASM */
 
-av_cold void ff_mpv_encode_init_x86(MpegEncContext *s)
+av_cold void ff_dct_encode_init_x86(MpegEncContext *s)
 {
     const int dct_algo = s->avctx->dct_algo;
     int i;
@@ -205,21 +213,25 @@ av_cold void ff_mpv_encode_init_x86(MpegEncContext *s)
 #if HAVE_MMX_INLINE
         int cpu_flags = av_get_cpu_flags();
         if (INLINE_MMX(cpu_flags)) {
+#if HAVE_6REGS
             s->dct_quantize = dct_quantize_mmx;
+#endif
             s->denoise_dct  = denoise_dct_mmx;
         }
 #endif
-#if HAVE_MMXEXT_INLINE
+#if HAVE_6REGS && HAVE_MMXEXT_INLINE
         if (INLINE_MMXEXT(cpu_flags))
             s->dct_quantize = dct_quantize_mmxext;
 #endif
 #if HAVE_SSE2_INLINE
         if (INLINE_SSE2(cpu_flags)) {
+#if HAVE_6REGS
             s->dct_quantize = dct_quantize_sse2;
+#endif
             s->denoise_dct  = denoise_dct_sse2;
         }
 #endif
-#if HAVE_SSSE3_INLINE
+#if HAVE_6REGS && HAVE_SSSE3_INLINE
         if (INLINE_SSSE3(cpu_flags))
             s->dct_quantize = dct_quantize_ssse3;
 #endif
diff --git a/libavcodec/x86/mpegvideoenc_qns_template.c b/libavcodec/x86/mpegvideoenc_qns_template.c
index 8d8d687..882d486 100644
--- a/libavcodec/x86/mpegvideoenc_qns_template.c
+++ b/libavcodec/x86/mpegvideoenc_qns_template.c
@@ -5,26 +5,26 @@
  * MMX optimization by Michael Niedermayer <michaelni@gmx.at>
  * 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <assert.h>
 #include <stdint.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/x86/asm.h"
 
@@ -36,7 +36,7 @@ static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[
 {
     x86_reg i=0;
 
-    assert(FFABS(scale) < MAX_ABS);
+    av_assert2(FFABS(scale) < MAX_ABS);
     scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
 
     SET_RND(mm6);
diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c
index a54c904..da76459 100644
--- a/libavcodec/x86/mpegvideoenc_template.c
+++ b/libavcodec/x86/mpegvideoenc_template.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -108,7 +108,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
     const uint16_t *qmat, *bias;
     LOCAL_ALIGNED_16(int16_t, temp_block, [64]);
 
-    assert((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly?
+    av_assert2((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly?
 
     //s->fdct (block);
     RENAME_FDCT(ff_fdct)(block); // cannot be anything else ...
@@ -118,10 +118,15 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
 
     if (s->mb_intra) {
         int dummy;
-        if (n < 4)
+        if (n < 4){
             q = s->y_dc_scale;
-        else
+            bias = s->q_intra_matrix16[qscale][1];
+            qmat = s->q_intra_matrix16[qscale][0];
+        }else{
             q = s->c_dc_scale;
+            bias = s->q_chroma_intra_matrix16[qscale][1];
+            qmat = s->q_chroma_intra_matrix16[qscale][0];
+        }
         /* note: block[0] is assumed to be positive */
         if (!s->h263_aic) {
         __asm__ volatile (
@@ -136,8 +141,6 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
         block[0]=0; //avoid fake overflow
 //        temp_block[0] = (block[0] + (q >> 1)) / q;
         last_non_zero_p1 = 1;
-        bias = s->q_intra_matrix16[qscale][1];
-        qmat = s->q_intra_matrix16[qscale][0];
     } else {
         last_non_zero_p1 = 0;
         bias = s->q_inter_matrix16[qscale][1];
@@ -173,7 +176,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
             " js 1b                             \n\t"
             PMAX(MM"3", MM"0")
             "movd "MM"3, %%"REG_a"              \n\t"
-            "movzb %%al, %%"REG_a"              \n\t" // last_non_zero_p1
+            "movzbl %%al, %%eax                 \n\t" // last_non_zero_p1
             : "+a" (last_non_zero_p1)
             : "r" (block+64), "r" (qmat), "r" (bias),
               "r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
@@ -207,7 +210,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
             " js 1b                             \n\t"
             PMAX(MM"3", MM"0")
             "movd "MM"3, %%"REG_a"              \n\t"
-            "movzb %%al, %%"REG_a"              \n\t" // last_non_zero_p1
+            "movzbl %%al, %%eax                 \n\t" // last_non_zero_p1
             : "+a" (last_non_zero_p1)
             : "r" (block+64), "r" (qmat+64), "r" (bias+64),
               "r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
@@ -221,7 +224,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
         "psubusw "MM"1, "MM"4               \n\t"
         "packuswb "MM"4, "MM"4              \n\t"
 #if COMPILE_TEMPLATE_SSE2
-        "packuswb "MM"4, "MM"4              \n\t"
+        "packsswb "MM"4, "MM"4              \n\t"
 #endif
         "movd "MM"4, %0                     \n\t" // *overflow
         : "=g" (*overflow)
@@ -275,6 +278,50 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
         block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36];
         block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37];
         block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
+    }else if(s->idsp.perm_type == FF_IDCT_PERM_LIBMPEG2){
+        if(last_non_zero_p1 <= 1) goto end;
+        block[0x04] = temp_block[0x01];
+        block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
+        if(last_non_zero_p1 <= 4) goto end;
+        block[0x0C] = temp_block[0x09]; block[0x01] = temp_block[0x02];
+        block[0x05] = temp_block[0x03];
+        if(last_non_zero_p1 <= 7) goto end;
+        block[0x09] = temp_block[0x0A]; block[0x14] = temp_block[0x11];
+        block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
+        if(last_non_zero_p1 <= 11) goto end;
+        block[0x1C] = temp_block[0x19];
+        block[0x11] = temp_block[0x12]; block[0x0D] = temp_block[0x0B];
+        block[0x02] = temp_block[0x04]; block[0x06] = temp_block[0x05];
+        if(last_non_zero_p1 <= 16) goto end;
+        block[0x0A] = temp_block[0x0C]; block[0x15] = temp_block[0x13];
+        block[0x19] = temp_block[0x1A]; block[0x24] = temp_block[0x21];
+        block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
+        block[0x2C] = temp_block[0x29]; block[0x21] = temp_block[0x22];
+        if(last_non_zero_p1 <= 24) goto end;
+        block[0x1D] = temp_block[0x1B]; block[0x12] = temp_block[0x14];
+        block[0x0E] = temp_block[0x0D]; block[0x03] = temp_block[0x06];
+        block[0x07] = temp_block[0x07]; block[0x0B] = temp_block[0x0E];
+        block[0x16] = temp_block[0x15]; block[0x1A] = temp_block[0x1C];
+        if(last_non_zero_p1 <= 32) goto end;
+        block[0x25] = temp_block[0x23]; block[0x29] = temp_block[0x2A];
+        block[0x34] = temp_block[0x31]; block[0x38] = temp_block[0x38];
+        block[0x3C] = temp_block[0x39]; block[0x31] = temp_block[0x32];
+        block[0x2D] = temp_block[0x2B]; block[0x22] = temp_block[0x24];
+        if(last_non_zero_p1 <= 40) goto end;
+        block[0x1E] = temp_block[0x1D]; block[0x13] = temp_block[0x16];
+        block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
+        block[0x1B] = temp_block[0x1E]; block[0x26] = temp_block[0x25];
+        block[0x2A] = temp_block[0x2C]; block[0x35] = temp_block[0x33];
+        if(last_non_zero_p1 <= 48) goto end;
+        block[0x39] = temp_block[0x3A]; block[0x3D] = temp_block[0x3B];
+        block[0x32] = temp_block[0x34]; block[0x2E] = temp_block[0x2D];
+            block[0x23] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
+        block[0x27] = temp_block[0x27]; block[0x2B] = temp_block[0x2E];
+        if(last_non_zero_p1 <= 56) goto end;
+        block[0x36] = temp_block[0x35]; block[0x3A] = temp_block[0x3C];
+        block[0x3E] = temp_block[0x3D]; block[0x33] = temp_block[0x36];
+        block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
+        block[0x3B] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
     }else{
         if(last_non_zero_p1 <= 1) goto end;
         block[0x01] = temp_block[0x01];
diff --git a/libavcodec/x86/mpegvideoencdsp.asm b/libavcodec/x86/mpegvideoencdsp.asm
index 9326ee7..aec73f8 100644
--- a/libavcodec/x86/mpegvideoencdsp.asm
+++ b/libavcodec/x86/mpegvideoencdsp.asm
@@ -4,92 +4,151 @@
 ;* Copyright (c) 2000, 2001 Fabrice Bellard
 ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;*****************************************************************************
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION .text
+SECTION_RODATA
 
-INIT_MMX mmx
+cextern pw_1
+
+SECTION .text
 ; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
-cglobal pix_sum16, 2, 3
+; %1 = number of loops, %2 = number of GPRs used
+; %3 = line_size multiplier used to advance pix per loop (mmxext and later; unused by mmx)
+%macro PIX_SUM16 3
+cglobal pix_sum16, 2, %2, 6
     movsxdifnidn r1, r1d
-    mov          r2, r1
-    neg          r2
-    shl          r2, 4
-    sub          r0, r2
-    pxor         m7, m7
-    pxor         m6, m6
+    mov          r2, %1
+%if mmsize == 16
+    lea          r3, [r1*3]
+%endif
+%if notcpuflag(xop)
+    pxor         m5, m5
+%endif
+    pxor         m4, m4
 .loop:
-    mova         m0, [r0+r2+0]
-    mova         m1, [r0+r2+0]
-    mova         m2, [r0+r2+8]
-    mova         m3, [r0+r2+8]
-    punpcklbw    m0, m7
-    punpckhbw    m1, m7
-    punpcklbw    m2, m7
-    punpckhbw    m3, m7
+%if cpuflag(xop)
+    vphaddubq    m0, [r0]
+    vphaddubq    m1, [r0+r1]
+    vphaddubq    m2, [r0+r1*2]
+    vphaddubq    m3, [r0+r3]
+%else
+    mova         m0, [r0]
+%if mmsize == 8
+    mova         m1, [r0+8]
+%if cpuflag(mmxext)
+    mova         m2, [r0+r1]
+    mova         m3, [r0+r1+8]
+%endif
+%else ; sse2
+    mova         m1, [r0+r1]
+    mova         m2, [r0+r1*2]
+    mova         m3, [r0+r3]
+%endif
+%if cpuflag(mmxext)
+    psadbw       m0, m5
+    psadbw       m1, m5
+    psadbw       m2, m5
+    psadbw       m3, m5
+%else ; mmx
+    punpckhbw    m2, m0, m5
+    punpcklbw    m0, m5
+    punpckhbw    m3, m1, m5
+    punpcklbw    m1, m5
+%endif ; cpuflag(mmxext)
+%endif ; cpuflag(xop)
     paddw        m1, m0
     paddw        m3, m2
     paddw        m3, m1
-    paddw        m6, m3
-    add          r2, r1
-    js .loop
-    mova         m5, m6
-    psrlq        m6, 32
-    paddw        m6, m5
-    mova         m5, m6
-    psrlq        m6, 16
-    paddw        m6, m5
-    movd        eax, m6
-    and         eax, 0xffff
+    paddw        m4, m3
+%if cpuflag(mmxext)
+    lea          r0, [r0+r1*%3]
+%else
+    add          r0, r1
+%endif
+    dec r2
+    jne .loop
+%if mmsize == 16
+    pshufd       m0, m4, q0032
+    paddd        m4, m0
+%elif notcpuflag(mmxext)
+    HADDW        m4, m5
+%endif
+    movd        eax, m4
     RET
+%endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmx
+PIX_SUM16 16, 3, 0
+INIT_MMX mmxext
+PIX_SUM16  8, 4, 2
+%endif
+INIT_XMM sse2
+PIX_SUM16  4, 4, 4
+%if HAVE_XOP_EXTERNAL
+INIT_XMM xop
+PIX_SUM16  4, 4, 4
+%endif
+
 ; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
-cglobal pix_norm1, 2, 4
+; %1 = number of xmm registers used
+; %2 = number of loops
+%macro PIX_NORM1 2
+cglobal pix_norm1, 2, 3, %1
     movsxdifnidn r1, r1d
-    mov          r2, 16
+    mov          r2, %2
     pxor         m0, m0
-    pxor         m7, m7
+    pxor         m5, m5
 .loop:
     mova         m2, [r0+0]
+%if mmsize == 8
     mova         m3, [r0+8]
-    mova         m1, m2
-    punpckhbw    m1, m0
+%else
+    mova         m3, [r0+r1]
+%endif
+    punpckhbw    m1, m2, m0
     punpcklbw    m2, m0
-    mova         m4, m3
-    punpckhbw    m3, m0
-    punpcklbw    m4, m0
+    punpckhbw    m4, m3, m0
+    punpcklbw    m3, m0
     pmaddwd      m1, m1
     pmaddwd      m2, m2
     pmaddwd      m3, m3
     pmaddwd      m4, m4
     paddd        m2, m1
     paddd        m4, m3
-    paddd        m7, m2
+    paddd        m5, m2
+    paddd        m5, m4
+%if mmsize == 8
     add          r0, r1
-    paddd        m7, m4
+%else
+    lea          r0, [r0+r1*2]
+%endif
     dec r2
     jne .loop
-    mova         m1, m7
-    psrlq        m7, 32
-    paddd        m1, m7
-    movd        eax, m1
+    HADDD        m5, m1
+    movd        eax, m5
     RET
+%endmacro
+
+INIT_MMX mmx
+PIX_NORM1 0, 16
+INIT_XMM sse2
+PIX_NORM1 6, 8
 
diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c
index 71fbf28..532836c 100644
--- a/libavcodec/x86/mpegvideoencdsp_init.c
+++ b/libavcodec/x86/mpegvideoencdsp_init.c
@@ -1,29 +1,34 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/mpegvideoencdsp.h"
 
 int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
+int ff_pix_sum16_mmxext(uint8_t *pix, int line_size);
+int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
+int ff_pix_sum16_xop(uint8_t *pix, int line_size);
 int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
+int ff_pix_norm1_sse2(uint8_t *pix, int line_size);
 
 #if HAVE_INLINE_ASM
 
@@ -123,7 +128,7 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
             : "+r" (ptr)
             : "r" ((x86_reg) wrap), "r" ((x86_reg) width),
               "r" (ptr + wrap * height));
-    } else {
+    } else if (w == 16) {
         __asm__ volatile (
             "1:                                 \n\t"
             "movd            (%0), %%mm0        \n\t"
@@ -141,6 +146,25 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
             "add               %1, %0           \n\t"
             "cmp               %3, %0           \n\t"
             "jb                1b               \n\t"
+            : "+r"(ptr)
+            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
+            );
+    } else {
+        av_assert1(w == 4);
+        __asm__ volatile (
+            "1:                             \n\t"
+            "movd            (%0), %%mm0    \n\t"
+            "punpcklbw      %%mm0, %%mm0    \n\t"
+            "punpcklwd      %%mm0, %%mm0    \n\t"
+            "movd           %%mm0, -4(%0)   \n\t"
+            "movd      -4(%0, %2), %%mm1    \n\t"
+            "punpcklbw      %%mm1, %%mm1    \n\t"
+            "punpckhwd      %%mm1, %%mm1    \n\t"
+            "punpckhdq      %%mm1, %%mm1    \n\t"
+            "movd           %%mm1, (%0, %2) \n\t"
+            "add               %1, %0       \n\t"
+            "cmp               %3, %0       \n\t"
+            "jb                1b           \n\t"
             : "+r" (ptr)
             : "r" ((x86_reg) wrap), "r" ((x86_reg) width),
               "r" (ptr + wrap * height));
@@ -195,11 +219,26 @@ av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
 {
     int cpu_flags = av_get_cpu_flags();
 
+#if ARCH_X86_32
     if (EXTERNAL_MMX(cpu_flags)) {
         c->pix_sum   = ff_pix_sum16_mmx;
         c->pix_norm1 = ff_pix_norm1_mmx;
     }
 
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        c->pix_sum     = ff_pix_sum16_mmxext;
+    }
+#endif
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->pix_sum     = ff_pix_sum16_sse2;
+        c->pix_norm1   = ff_pix_norm1_sse2;
+    }
+
+    if (EXTERNAL_XOP(cpu_flags)) {
+        c->pix_sum     = ff_pix_sum16_xop;
+    }
+
 #if HAVE_INLINE_ASM
 
     if (INLINE_MMX(cpu_flags)) {
diff --git a/libavcodec/x86/pixblockdsp.asm b/libavcodec/x86/pixblockdsp.asm
index c8fd1b2..2864d0c 100644
--- a/libavcodec/x86/pixblockdsp.asm
+++ b/libavcodec/x86/pixblockdsp.asm
@@ -4,20 +4,20 @@
 ;* Copyright (c) 2000, 2001 Fabrice Bellard
 ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;*****************************************************************************
 
@@ -26,9 +26,8 @@
 SECTION .text
 
 INIT_MMX mmx
-; void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size)
+; void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, ptrdiff_t line_size)
 cglobal get_pixels, 3,4
-    movsxdifnidn r2, r2d
     add          r0, 128
     mov          r3, -128
     pxor         m7, m7
@@ -51,8 +50,7 @@ cglobal get_pixels, 3,4
     REP_RET
 
 INIT_XMM sse2
-cglobal get_pixels, 3, 4
-    movsxdifnidn r2, r2d
+cglobal get_pixels, 3, 4, 5
     lea          r3, [r2*3]
     pxor         m4, m4
     movh         m0, [r1]
@@ -82,29 +80,50 @@ cglobal get_pixels, 3, 4
     mova  [r0+0x70], m3
     RET
 
-INIT_MMX mmx
 ; void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
 ;                         int stride);
-cglobal diff_pixels, 4,5
+%macro DIFF_PIXELS 0
+cglobal diff_pixels, 4,5,5
     movsxdifnidn r3, r3d
-    pxor         m7, m7
+    pxor         m4, m4
     add          r0,  128
     mov          r4, -128
 .loop:
-    mova         m0, [r1]
-    mova         m2, [r2]
-    mova         m1, m0
-    mova         m3, m2
-    punpcklbw    m0, m7
-    punpckhbw    m1, m7
-    punpcklbw    m2, m7
-    punpckhbw    m3, m7
+    movq         m0, [r1]
+    movq         m2, [r2]
+%if mmsize == 8
+    movq         m1, m0
+    movq         m3, m2
+    punpcklbw    m0, m4
+    punpckhbw    m1, m4
+    punpcklbw    m2, m4
+    punpckhbw    m3, m4
+%else
+    movq         m1, [r1+r3]
+    movq         m3, [r2+r3]
+    punpcklbw    m0, m4
+    punpcklbw    m1, m4
+    punpcklbw    m2, m4
+    punpcklbw    m3, m4
+%endif
     psubw        m0, m2
     psubw        m1, m3
     mova  [r0+r4+0], m0
-    mova  [r0+r4+8], m1
+    mova  [r0+r4+mmsize], m1
+%if mmsize == 8
     add          r1, r3
     add          r2, r3
-    add          r4, 16
+%else
+    lea          r1, [r1+r3*2]
+    lea          r2, [r2+r3*2]
+%endif
+    add          r4, 2 * mmsize
     jne .loop
-    REP_RET
+    RET
+%endmacro
+
+INIT_MMX mmx
+DIFF_PIXELS
+
+INIT_XMM sse2
+DIFF_PIXELS
diff --git a/libavcodec/x86/pixblockdsp_init.c b/libavcodec/x86/pixblockdsp_init.c
index 9582e0b..4d06a44 100644
--- a/libavcodec/x86/pixblockdsp_init.c
+++ b/libavcodec/x86/pixblockdsp_init.c
@@ -1,20 +1,20 @@
 /*
  * SIMD-optimized pixel operations
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,10 +23,12 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/pixblockdsp.h"
 
-void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size);
-void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size);
+void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, ptrdiff_t line_size);
+void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, ptrdiff_t line_size);
 void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
                         int stride);
+void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2,
+                         int stride);
 
 av_cold void ff_pixblockdsp_init_x86(PixblockDSPContext *c,
                                      AVCodecContext *avctx,
@@ -43,5 +45,6 @@ av_cold void ff_pixblockdsp_init_x86(PixblockDSPContext *c,
     if (EXTERNAL_SSE2(cpu_flags)) {
         if (!high_bit_depth)
             c->get_pixels = ff_get_pixels_sse2;
+        c->diff_pixels = ff_diff_pixels_sse2;
     }
 }
diff --git a/libavcodec/x86/pngdsp.asm b/libavcodec/x86/pngdsp.asm
index 722caf0..50e4255 100644
--- a/libavcodec/x86/pngdsp.asm
+++ b/libavcodec/x86/pngdsp.asm
@@ -4,20 +4,20 @@
 ;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu>
 ;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
-;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -42,12 +42,12 @@ cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i
     and                waq, ~(mmsize*2-1)
     jmp .end_v
 .loop_v:
-    mova                m0, [src1q+iq]
-    mova                m1, [src1q+iq+mmsize]
-    paddb               m0, [src2q+iq]
-    paddb               m1, [src2q+iq+mmsize]
-    mova  [dstq+iq       ], m0
-    mova  [dstq+iq+mmsize], m1
+    movu                m0, [src2q+iq]
+    movu                m1, [src2q+iq+mmsize]
+    paddb               m0, [src1q+iq]
+    paddb               m1, [src1q+iq+mmsize]
+    movu  [dstq+iq       ], m0
+    movu  [dstq+iq+mmsize], m1
     add                 iq, mmsize*2
 .end_v:
     cmp                 iq, waq
@@ -157,7 +157,7 @@ cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr
     movh            [dstq], m3
     add               dstq, bppq
     cmp               dstq, endq
-    jle .loop
+    jl .loop
 
     mov               dstq, [rsp]
     dec              cntrq
diff --git a/libavcodec/x86/pngdsp_init.c b/libavcodec/x86/pngdsp_init.c
index 34a3da3..7dca62c 100644
--- a/libavcodec/x86/pngdsp_init.c
+++ b/libavcodec/x86/pngdsp_init.c
@@ -2,20 +2,20 @@
  * x86 PNG optimizations.
- * Copyright (c) 2008 Loren Merrit <lorenm@u.washington.edu>
+ * Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/proresdsp.asm b/libavcodec/x86/proresdsp.asm
index 5a329cb..16fc262 100644
--- a/libavcodec/x86/proresdsp.asm
+++ b/libavcodec/x86/proresdsp.asm
@@ -1,428 +1,66 @@
 ;******************************************************************************
 ;* x86-SIMD-optimized IDCT for prores
-;* this is identical to "simple" IDCT except for the clip range
+;* this is identical to "simple" IDCT written by Michael Niedermayer
+;* except for the clip range
 ;*
 ;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
-;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
 %include "libavutil/x86/x86util.asm"
 
-%define W1sh2 22725 ; W1 = 90901 = 22725<<2 + 1
-%define W2sh2 21407 ; W2 = 85627 = 21407<<2 - 1
-%define W3sh2 19265 ; W3 = 77062 = 19265<<2 + 2
-%define W4sh2 16384 ; W4 = 65535 = 16384<<2 - 1
-%define W5sh2 12873 ; W5 = 51491 = 12873<<2 - 1
-%define W6sh2  8867 ; W6 = 35468 =  8867<<2
-%define W7sh2  4520 ; W7 = 18081 =  4520<<2 + 1
-
 %if ARCH_X86_64
 
 SECTION_RODATA
 
-w4_plus_w2: times 4 dw W4sh2, +W2sh2
-w4_min_w2:  times 4 dw W4sh2, -W2sh2
-w4_plus_w6: times 4 dw W4sh2, +W6sh2
-w4_min_w6:  times 4 dw W4sh2, -W6sh2
-w1_plus_w3: times 4 dw W1sh2, +W3sh2
-w3_min_w1:  times 4 dw W3sh2, -W1sh2
-w7_plus_w3: times 4 dw W7sh2, +W3sh2
-w3_min_w7:  times 4 dw W3sh2, -W7sh2
-w1_plus_w5: times 4 dw W1sh2, +W5sh2
-w5_min_w1:  times 4 dw W5sh2, -W1sh2
-w5_plus_w7: times 4 dw W5sh2, +W7sh2
-w7_min_w5:  times 4 dw W7sh2, -W5sh2
-row_round:  times 8 dw (1<<14)
-
+pw_88:      times 8 dw 0x2008
+cextern pw_1
 cextern pw_4
-cextern pw_8
-cextern pw_512
 cextern pw_1019
+; Below are defined in simple_idct10.asm built from selecting idctdsp
+cextern w4_plus_w2
+cextern w4_min_w2
+cextern w4_plus_w6
+cextern w4_min_w6
+cextern w1_plus_w3
+cextern w3_min_w1
+cextern w7_plus_w3
+cextern w3_min_w7
+cextern w1_plus_w5
+cextern w5_min_w1
+cextern w5_plus_w7
+cextern w7_min_w5
+
+%include "libavcodec/x86/simple_idct10_template.asm"
 
 SECTION .text
 
-; interleave data while maintaining source
-; %1=type, %2=dstlo, %3=dsthi, %4=src, %5=interleave
-%macro SBUTTERFLY3 5
-    punpckl%1   m%2, m%4, m%5
-    punpckh%1   m%3, m%4, m%5
-%endmacro
-
-; %1/%2=src1/dst1, %3/%4=dst2, %5/%6=src2, %7=shift
-; action: %3/%4 = %1/%2 - %5/%6; %1/%2 += %5/%6
-;         %1/%2/%3/%4 >>= %7; dword -> word (in %1/%3)
-%macro SUMSUB_SHPK 7
-    psubd       %3,  %1,  %5       ; { a0 - b0 }[0-3]
-    psubd       %4,  %2,  %6       ; { a0 - b0 }[4-7]
-    paddd       %1,  %5            ; { a0 + b0 }[0-3]
-    paddd       %2,  %6            ; { a0 + b0 }[4-7]
-    psrad       %1,  %7
-    psrad       %2,  %7
-    psrad       %3,  %7
-    psrad       %4,  %7
-    packssdw    %1,  %2            ; row[0]
-    packssdw    %3,  %4            ; row[7]
-%endmacro
-
-; %1 = row or col (for rounding variable)
-; %2 = number of bits to shift at the end
-%macro IDCT_1D 2
-    ; a0 = (W4 * row[0]) + (1 << (15 - 1));
-    ; a1 = a0;
-    ; a2 = a0;
-    ; a3 = a0;
-    ; a0 += W2 * row[2];
-    ; a1 += W6 * row[2];
-    ; a2 -= W6 * row[2];
-    ; a3 -= W2 * row[2];
-%ifidn %1, col
-    paddw       m10,[pw_8]
-%endif
-    SBUTTERFLY3 wd,  0,  1, 10,  8 ; { row[0], row[2] }[0-3]/[4-7]
-%ifidn %1, row
-    psubw       m10,[row_round]
-%endif
-    SIGNEXTEND  m8,  m9,  m14      ; { row[2] }[0-3] / [4-7]
-    SIGNEXTEND  m10, m11, m14      ; { row[0] }[0-3] / [4-7]
-    pmaddwd     m2,  m0, [w4_plus_w6]
-    pmaddwd     m3,  m1, [w4_plus_w6]
-    pmaddwd     m4,  m0, [w4_min_w6]
-    pmaddwd     m5,  m1, [w4_min_w6]
-    pmaddwd     m6,  m0, [w4_min_w2]
-    pmaddwd     m7,  m1, [w4_min_w2]
-    pmaddwd     m0, [w4_plus_w2]
-    pmaddwd     m1, [w4_plus_w2]
-    pslld       m2,  2
-    pslld       m3,  2
-    pslld       m4,  2
-    pslld       m5,  2
-    pslld       m6,  2
-    pslld       m7,  2
-    pslld       m0,  2
-    pslld       m1,  2
-
-    ; a0: -1*row[0]-1*row[2]
-    ; a1: -1*row[0]
-    ; a2: -1*row[0]
-    ; a3: -1*row[0]+1*row[2]
-    psubd       m2,  m10           ; a1[0-3]
-    psubd       m3,  m11           ; a1[4-7]
-    psubd       m4,  m10           ; a2[0-3]
-    psubd       m5,  m11           ; a2[4-7]
-    psubd       m0,  m10
-    psubd       m1,  m11
-    psubd       m6,  m10
-    psubd       m7,  m11
-    psubd       m0,  m8            ; a0[0-3]
-    psubd       m1,  m9            ; a0[4-7]
-    paddd       m6,  m8            ; a3[0-3]
-    paddd       m7,  m9            ; a3[4-7]
-
-    ; a0 +=   W4*row[4] + W6*row[6]; i.e. -1*row[4]
-    ; a1 -=   W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6]
-    ; a2 -=   W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6]
-    ; a3 +=   W4*row[4] - W6*row[6]; i.e. -1*row[4]
-    SBUTTERFLY3 wd,  8,  9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7]
-    SIGNEXTEND  m13, m14, m10      ; { row[4] }[0-3] / [4-7]
-    pmaddwd     m10, m8, [w4_plus_w6]
-    pmaddwd     m11, m9, [w4_plus_w6]
-    pslld       m10, 2
-    pslld       m11, 2
-    psubd       m10,  m13
-    psubd       m11,  m14
-    paddd       m0,  m10            ; a0[0-3]
-    paddd       m1,  m11            ; a0[4-7]
-    pmaddwd     m10, m8, [w4_min_w6]
-    pmaddwd     m11, m9, [w4_min_w6]
-    pslld       m10, 2
-    pslld       m11, 2
-    psubd       m10, m13
-    psubd       m11, m14
-    paddd       m6,  m10           ; a3[0-3]
-    paddd       m7,  m11           ; a3[4-7]
-    pmaddwd     m10, m8, [w4_min_w2]
-    pmaddwd     m11, m9, [w4_min_w2]
-    pmaddwd     m8, [w4_plus_w2]
-    pmaddwd     m9, [w4_plus_w2]
-    pslld       m10, 2
-    pslld       m11, 2
-    pslld       m8,  2
-    pslld       m9,  2
-    psubd       m10, m13
-    psubd       m11, m14
-    psubd       m8,  m13
-    psubd       m9,  m14
-    psubd       m4,  m10           ; a2[0-3] intermediate
-    psubd       m5,  m11           ; a2[4-7] intermediate
-    psubd       m2,  m8            ; a1[0-3] intermediate
-    psubd       m3,  m9            ; a1[4-7] intermediate
-    SIGNEXTEND  m12, m13, m10      ; { row[6] }[0-3] / [4-7]
-    psubd       m4,  m12           ; a2[0-3]
-    psubd       m5,  m13           ; a2[4-7]
-    paddd       m2,  m12           ; a1[0-3]
-    paddd       m3,  m13           ; a1[4-7]
-
-    ; load/store
-    mova   [r2+  0], m0
-    mova   [r2+ 32], m2
-    mova   [r2+ 64], m4
-    mova   [r2+ 96], m6
-    mova        m10,[r2+ 16]       ; { row[1] }[0-7]
-    mova        m8, [r2+ 48]       ; { row[3] }[0-7]
-    mova        m13,[r2+ 80]       ; { row[5] }[0-7]
-    mova        m14,[r2+112]       ; { row[7] }[0-7]
-    mova   [r2+ 16], m1
-    mova   [r2+ 48], m3
-    mova   [r2+ 80], m5
-    mova   [r2+112], m7
-%ifidn %1, row
-    pmullw      m10,[r3+ 16]
-    pmullw      m8, [r3+ 48]
-    pmullw      m13,[r3+ 80]
-    pmullw      m14,[r3+112]
-%endif
-
-    ; b0 = MUL(W1, row[1]);
-    ; MAC(b0, W3, row[3]);
-    ; b1 = MUL(W3, row[1]);
-    ; MAC(b1, -W7, row[3]);
-    ; b2 = MUL(W5, row[1]);
-    ; MAC(b2, -W1, row[3]);
-    ; b3 = MUL(W7, row[1]);
-    ; MAC(b3, -W5, row[3]);
-    SBUTTERFLY3 wd,  0,  1, 10, 8  ; { row[1], row[3] }[0-3]/[4-7]
-    SIGNEXTEND  m10, m11, m12      ; { row[1] }[0-3] / [4-7]
-    SIGNEXTEND  m8,  m9,  m12      ; { row[3] }[0-3] / [4-7]
-    pmaddwd     m2,  m0, [w3_min_w7]
-    pmaddwd     m3,  m1, [w3_min_w7]
-    pmaddwd     m4,  m0, [w5_min_w1]
-    pmaddwd     m5,  m1, [w5_min_w1]
-    pmaddwd     m6,  m0, [w7_min_w5]
-    pmaddwd     m7,  m1, [w7_min_w5]
-    pmaddwd     m0, [w1_plus_w3]
-    pmaddwd     m1, [w1_plus_w3]
-    pslld       m2,  2
-    pslld       m3,  2
-    pslld       m4,  2
-    pslld       m5,  2
-    pslld       m6,  2
-    pslld       m7,  2
-    pslld       m0,  2
-    pslld       m1,  2
-
-    ; b0: +1*row[1]+2*row[3]
-    ; b1: +2*row[1]-1*row[3]
-    ; b2: -1*row[1]-1*row[3]
-    ; b3: +1*row[1]+1*row[3]
-    psubd       m2,  m8
-    psubd       m3,  m9
-    paddd       m0,  m8
-    paddd       m1,  m9
-    paddd       m8,  m10           ; { row[1] + row[3] }[0-3]
-    paddd       m9,  m11           ; { row[1] + row[3] }[4-7]
-    paddd       m10, m10
-    paddd       m11, m11
-    paddd       m0,  m8            ; b0[0-3]
-    paddd       m1,  m9            ; b0[4-7]
-    paddd       m2,  m10           ; b1[0-3]
-    paddd       m3,  m11           ; b2[4-7]
-    psubd       m4,  m8            ; b2[0-3]
-    psubd       m5,  m9            ; b2[4-7]
-    paddd       m6,  m8            ; b3[0-3]
-    paddd       m7,  m9            ; b3[4-7]
-
-    ; MAC(b0,  W5, row[5]);
-    ; MAC(b0,  W7, row[7]);
-    ; MAC(b1, -W1, row[5]);
-    ; MAC(b1, -W5, row[7]);
-    ; MAC(b2,  W7, row[5]);
-    ; MAC(b2,  W3, row[7]);
-    ; MAC(b3,  W3, row[5]);
-    ; MAC(b3, -W1, row[7]);
-    SBUTTERFLY3 wd,  8,  9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7]
-    SIGNEXTEND  m13, m12, m11      ; { row[5] }[0-3] / [4-7]
-    SIGNEXTEND  m14, m11, m10      ; { row[7] }[0-3] / [4-7]
-
-    ; b0: -1*row[5]+1*row[7]
-    ; b1: -1*row[5]+1*row[7]
-    ; b2: +1*row[5]+2*row[7]
-    ; b3: +2*row[5]-1*row[7]
-    paddd       m4,  m13
-    paddd       m5,  m12
-    paddd       m6,  m13
-    paddd       m7,  m12
-    psubd       m13, m14           ; { row[5] - row[7] }[0-3]
-    psubd       m12, m11           ; { row[5] - row[7] }[4-7]
-    paddd       m14, m14
-    paddd       m11, m11
-    psubd       m0,  m13
-    psubd       m1,  m12
-    psubd       m2,  m13
-    psubd       m3,  m12
-    paddd       m4,  m14
-    paddd       m5,  m11
-    paddd       m6,  m13
-    paddd       m7,  m12
-
-    pmaddwd     m10, m8, [w1_plus_w5]
-    pmaddwd     m11, m9, [w1_plus_w5]
-    pmaddwd     m12, m8, [w5_plus_w7]
-    pmaddwd     m13, m9, [w5_plus_w7]
-    pslld       m10, 2
-    pslld       m11, 2
-    pslld       m12,  2
-    pslld       m13,  2
-    psubd       m2,  m10           ; b1[0-3]
-    psubd       m3,  m11           ; b1[4-7]
-    paddd       m0,  m12            ; b0[0-3]
-    paddd       m1,  m13            ; b0[4-7]
-    pmaddwd     m12, m8, [w7_plus_w3]
-    pmaddwd     m13, m9, [w7_plus_w3]
-    pmaddwd     m8, [w3_min_w1]
-    pmaddwd     m9, [w3_min_w1]
-    pslld       m12, 2
-    pslld       m13, 2
-    pslld       m8,  2
-    pslld       m9,  2
-    paddd       m4,  m12           ; b2[0-3]
-    paddd       m5,  m13           ; b2[4-7]
-    paddd       m6,  m8            ; b3[0-3]
-    paddd       m7,  m9            ; b3[4-7]
-
-    ; row[0] = (a0 + b0) >> 15;
-    ; row[7] = (a0 - b0) >> 15;
-    ; row[1] = (a1 + b1) >> 15;
-    ; row[6] = (a1 - b1) >> 15;
-    ; row[2] = (a2 + b2) >> 15;
-    ; row[5] = (a2 - b2) >> 15;
-    ; row[3] = (a3 + b3) >> 15;
-    ; row[4] = (a3 - b3) >> 15;
-    mova        m8, [r2+ 0]        ; a0[0-3]
-    mova        m9, [r2+16]        ; a0[4-7]
-    SUMSUB_SHPK m8,  m9,  m10, m11, m0,  m1,  %2
-    mova        m0, [r2+32]        ; a1[0-3]
-    mova        m1, [r2+48]        ; a1[4-7]
-    SUMSUB_SHPK m0,  m1,  m9,  m11, m2,  m3,  %2
-    mova        m1, [r2+64]        ; a2[0-3]
-    mova        m2, [r2+80]        ; a2[4-7]
-    SUMSUB_SHPK m1,  m2,  m11, m3,  m4,  m5,  %2
-    mova        m2, [r2+96]        ; a3[0-3]
-    mova        m3, [r2+112]       ; a3[4-7]
-    SUMSUB_SHPK m2,  m3,  m4,  m5,  m6,  m7,  %2
-%endmacro
-
-; void ff_prores_idct_put_10_<opt>(uint8_t *pixels, int stride,
-;                                  int16_t *block, const int16_t *qmat);
-%macro idct_put_fn 1
-cglobal prores_idct_put_10, 4, 4, %1
-    movsxd      r1,  r1d
-    pxor        m15, m15           ; zero
-
-    ; for (i = 0; i < 8; i++)
-    ;     idctRowCondDC(block + i*8);
-    mova        m10,[r2+ 0]        ; { row[0] }[0-7]
-    mova        m8, [r2+32]        ; { row[2] }[0-7]
-    mova        m13,[r2+64]        ; { row[4] }[0-7]
-    mova        m12,[r2+96]        ; { row[6] }[0-7]
-
-    pmullw      m10,[r3+ 0]
-    pmullw      m8, [r3+32]
-    pmullw      m13,[r3+64]
-    pmullw      m12,[r3+96]
-
-    IDCT_1D     row, 17
-
-    ; transpose for second part of IDCT
-    TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
-    mova   [r2+ 16], m0
-    mova   [r2+ 48], m2
-    mova   [r2+ 80], m11
-    mova   [r2+112], m10
-    SWAP         8,  10
-    SWAP         1,   8
-    SWAP         4,  13
-    SWAP         9,  12
-
-    ; for (i = 0; i < 8; i++)
-    ;     idctSparseColAdd(dest + i, line_size, block + i);
-    IDCT_1D     col, 20
-
-    ; clip/store
-    mova        m6, [pw_512]
-    mova        m3, [pw_4]
-    mova        m5, [pw_1019]
-    paddw       m8,  m6
-    paddw       m0,  m6
-    paddw       m1,  m6
-    paddw       m2,  m6
-    paddw       m4,  m6
-    paddw       m11, m6
-    paddw       m9,  m6
-    paddw       m10, m6
-    pmaxsw      m8,  m3
-    pmaxsw      m0,  m3
-    pmaxsw      m1,  m3
-    pmaxsw      m2,  m3
-    pmaxsw      m4,  m3
-    pmaxsw      m11, m3
-    pmaxsw      m9,  m3
-    pmaxsw      m10, m3
-    pminsw      m8,  m5
-    pminsw      m0,  m5
-    pminsw      m1,  m5
-    pminsw      m2,  m5
-    pminsw      m4,  m5
-    pminsw      m11, m5
-    pminsw      m9,  m5
-    pminsw      m10, m5
-
-    lea         r2, [r1*3]
-    mova  [r0     ], m8
-    mova  [r0+r1  ], m0
-    mova  [r0+r1*2], m1
-    mova  [r0+r2  ], m2
-    lea         r0, [r0+r1*4]
-    mova  [r0     ], m4
-    mova  [r0+r1  ], m11
-    mova  [r0+r1*2], m9
-    mova  [r0+r2  ], m10
+%macro idct_fn 0
+cglobal prores_idct_put_10, 4, 4, 15
+    IDCT_FN    pw_1, 15, pw_88, 18, pw_4, pw_1019, r3
     RET
 %endmacro
 
-%macro SIGNEXTEND 2-3
-%if cpuflag(sse4) ; dstlow, dsthigh
-    movhlps     %2,  %1
-    pmovsxwd    %1,  %1
-    pmovsxwd    %2,  %2
-%elif cpuflag(sse2) ; dstlow, dsthigh, tmp
-    pxor        %3,  %3
-    pcmpgtw     %3,  %1
-    mova        %2,  %1
-    punpcklwd   %1,  %3
-    punpckhwd   %2,  %3
-%endif
-%endmacro
-
 INIT_XMM sse2
-idct_put_fn 16
-INIT_XMM sse4
-idct_put_fn 16
+idct_fn
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
-idct_put_fn 16
+idct_fn
+%endif
 
 %endif
diff --git a/libavcodec/x86/proresdsp_init.c b/libavcodec/x86/proresdsp_init.c
index e82dac0..ead11ae 100644
--- a/libavcodec/x86/proresdsp_init.c
+++ b/libavcodec/x86/proresdsp_init.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,12 +27,10 @@
 
 void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
                                 int16_t *block, const int16_t *qmat);
-void ff_prores_idct_put_10_sse4(uint16_t *dst, int linesize,
-                                int16_t *block, const int16_t *qmat);
 void ff_prores_idct_put_10_avx (uint16_t *dst, int linesize,
                                 int16_t *block, const int16_t *qmat);
 
-av_cold void ff_proresdsp_init_x86(ProresDSPContext *dsp)
+av_cold void ff_proresdsp_init_x86(ProresDSPContext *dsp, AVCodecContext *avctx)
 {
 #if ARCH_X86_64
     int cpu_flags = av_get_cpu_flags();
@@ -42,11 +40,6 @@ av_cold void ff_proresdsp_init_x86(ProresDSPContext *dsp)
         dsp->idct_put = ff_prores_idct_put_10_sse2;
     }
 
-    if (EXTERNAL_SSE4(cpu_flags)) {
-        dsp->idct_permutation_type = FF_IDCT_PERM_TRANSPOSE;
-        dsp->idct_put = ff_prores_idct_put_10_sse4;
-    }
-
     if (EXTERNAL_AVX(cpu_flags)) {
         dsp->idct_permutation_type = FF_IDCT_PERM_TRANSPOSE;
         dsp->idct_put = ff_prores_idct_put_10_avx;
diff --git a/libavcodec/x86/qpel.asm b/libavcodec/x86/qpel.asm
index 27a1c63..4e72d50 100644
--- a/libavcodec/x86/qpel.asm
+++ b/libavcodec/x86/qpel.asm
@@ -4,20 +4,20 @@
 ;* Copyright (c) 2003-2013 Michael Niedermayer
 ;* Copyright (c) 2013 Daniel Kang
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/qpeldsp.asm b/libavcodec/x86/qpeldsp.asm
index ef5f1d8..282faed 100644
--- a/libavcodec/x86/qpeldsp.asm
+++ b/libavcodec/x86/qpeldsp.asm
@@ -1,22 +1,23 @@
 ;******************************************************************************
-;* quarterpel DSP functions
-;*
+;* mpeg4 qpel
+;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
 ;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2013 Daniel Kang
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/qpeldsp_init.c b/libavcodec/x86/qpeldsp_init.c
index cdefe50..3268d90 100644
--- a/libavcodec/x86/qpeldsp_init.c
+++ b/libavcodec/x86/qpeldsp_init.c
@@ -1,20 +1,22 @@
 /*
  * quarterpel DSP functions
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -77,13 +79,13 @@ void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
 void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride);
-#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
-#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
+#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmx
+#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmx
 
 #if HAVE_YASM
 
-CALL_2X_PIXELS(ff_avg_pixels16_mmxext, ff_avg_pixels8_mmxext, 8)
-CALL_2X_PIXELS(ff_put_pixels16_mmxext, ff_put_pixels8_mmxext, 8)
+#define ff_put_pixels16_mmxext ff_put_pixels16_mmx
+#define ff_put_pixels8_mmxext  ff_put_pixels8_mmx
 
 #define QPEL_OP(OPNAME, RND, MMX)                                       \
 static void OPNAME ## qpel8_mc00_ ## MMX(uint8_t *dst,                  \
diff --git a/libavcodec/x86/rnd_template.c b/libavcodec/x86/rnd_template.c
index a9fb132..ddca4eb 100644
--- a/libavcodec/x86/rnd_template.c
+++ b/libavcodec/x86/rnd_template.c
@@ -7,20 +7,20 @@
  * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
  * and improved by Zdenek Kabelac <kabi@users.sf.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,7 +30,7 @@
 #include "inline_asm.h"
 
 // put_pixels
-STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
+av_unused STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
 {
     MOVQ_ZERO(mm7);
@@ -99,7 +99,7 @@ STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
 
 // avg_pixels
 // this routine is 'slightly' suboptimal but mostly unused
-STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
+av_unused STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
 {
     MOVQ_ZERO(mm7);
diff --git a/libavcodec/x86/rv34dsp.asm b/libavcodec/x86/rv34dsp.asm
index 4d9c35b..7732d65 100644
--- a/libavcodec/x86/rv34dsp.asm
+++ b/libavcodec/x86/rv34dsp.asm
@@ -2,20 +2,20 @@
 ;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
 ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/rv34dsp_init.c b/libavcodec/x86/rv34dsp_init.c
index 586e4e9..99c56f9 100644
--- a/libavcodec/x86/rv34dsp_init.c
+++ b/libavcodec/x86/rv34dsp_init.c
@@ -2,20 +2,20 @@
  * RV30/40 MMX/SSE2 optimizations
  * Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/rv40dsp.asm b/libavcodec/x86/rv40dsp.asm
index 77f6ddb..d0c3af0 100644
--- a/libavcodec/x86/rv40dsp.asm
+++ b/libavcodec/x86/rv40dsp.asm
@@ -4,20 +4,20 @@
 ;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
 ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c
index e006c76..218deb8 100644
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@ -2,20 +2,20 @@
  * RV40 decoder motion compensation functions x86-optimised
  * Copyright (c) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,13 @@
 #include "libavutil/x86/cpu.h"
 #include "hpeldsp.h"
 
+#define DEFINE_FN(op, size, insn) \
+static void op##_rv40_qpel##size##_mc33_##insn(uint8_t *dst, const uint8_t *src, \
+                                               ptrdiff_t stride) \
+{ \
+    ff_##op##_pixels##size##_xy2_##insn(dst, src, stride, size); \
+}
+
 #if HAVE_YASM
 void ff_put_rv40_chroma_mc8_mmx  (uint8_t *dst, uint8_t *src,
                                   int stride, int h, int x, int y);
@@ -75,7 +82,7 @@ static void OP ## rv40_qpel ##SIZE ##_mc ##PH ##PV ##OPT(uint8_t *dst,  \
 {                                                                       \
     int i;                                                              \
     if (PH && PV) {                                                     \
-        DECLARE_ALIGNED(16, uint8_t, tmp)[SIZE * (SIZE + 5)];           \
+        LOCAL_ALIGNED(16, uint8_t, tmp, [SIZE * (SIZE + 5)]);           \
         uint8_t *tmpptr = tmp + SIZE * 2;                               \
         src -= stride * 2;                                              \
                                                                         \
@@ -94,7 +101,7 @@ static void OP ## rv40_qpel ##SIZE ##_mc ##PH ##PV ##OPT(uint8_t *dst,  \
             ff_ ##OP ##rv40_qpel_h ## OPT(dst + i, stride, src + i,     \
                                           stride, SIZE, HCOFF(PH));     \
     }                                                                   \
-};
+}
 
 /** Declare functions for sizes 8 and 16 and given operations
  *  and qpel position. */
@@ -127,8 +134,8 @@ QPEL_FUNCS_DECL(OP, 3, 2, OPT)
 /** @} */
 
 #define LOOPSIZE  8
-#define HCOFF(x)  (32 * (x - 1))
-#define VCOFF(x)  (32 * (x - 1))
+#define HCOFF(x)  (32 * ((x) - 1))
+#define VCOFF(x)  (32 * ((x) - 1))
 QPEL_MC_DECL(put_, _ssse3)
 QPEL_MC_DECL(avg_, _ssse3)
 
@@ -136,8 +143,8 @@ QPEL_MC_DECL(avg_, _ssse3)
 #undef HCOFF
 #undef VCOFF
 #define LOOPSIZE  8
-#define HCOFF(x)  (64 * (x - 1))
-#define VCOFF(x)  (64 * (x - 1))
+#define HCOFF(x)  (64 * ((x) - 1))
+#define VCOFF(x)  (64 * ((x) - 1))
 QPEL_MC_DECL(put_, _sse2)
 QPEL_MC_DECL(avg_, _sse2)
 
@@ -146,8 +153,8 @@ QPEL_MC_DECL(avg_, _sse2)
 #undef HCOFF
 #undef VCOFF
 #define LOOPSIZE  4
-#define HCOFF(x)  (64 * (x - 1))
-#define VCOFF(x)  (64 * (x - 1))
+#define HCOFF(x)  (64 * ((x) - 1))
+#define VCOFF(x)  (64 * ((x) - 1))
 
 QPEL_MC_DECL(put_, _mmx)
 
@@ -186,34 +193,28 @@ QPEL_FUNCS_SET (OP, 3, 1, OPT) \
 QPEL_FUNCS_SET (OP, 3, 2, OPT)
 /** @} */
 
+DEFINE_FN(put, 8, ssse3)
+
+DEFINE_FN(put, 16, sse2)
+DEFINE_FN(put, 16, ssse3)
+
+DEFINE_FN(avg, 8, mmxext)
+DEFINE_FN(avg, 8, ssse3)
+
+DEFINE_FN(avg, 16, sse2)
+DEFINE_FN(avg, 16, ssse3)
 #endif /* HAVE_YASM */
 
 #if HAVE_MMX_INLINE
-static void put_rv40_qpel8_mc33_mmx(uint8_t *dst, const uint8_t *src,
-                                    ptrdiff_t stride)
-{
-    ff_put_pixels8_xy2_mmx(dst, src, stride, 8);
-}
-static void put_rv40_qpel16_mc33_mmx(uint8_t *dst, const uint8_t *src,
-                                     ptrdiff_t stride)
-{
-    ff_put_pixels16_xy2_mmx(dst, src, stride, 16);
-}
-static void avg_rv40_qpel8_mc33_mmx(uint8_t *dst, const uint8_t *src,
-                                    ptrdiff_t stride)
-{
-    ff_avg_pixels8_xy2_mmx(dst, src, stride, 8);
-}
-static void avg_rv40_qpel16_mc33_mmx(uint8_t *dst, const uint8_t *src,
-                                     ptrdiff_t stride)
-{
-    ff_avg_pixels16_xy2_mmx(dst, src, stride, 16);
-}
-#endif /* HAVE_MMX_INLINE */
+DEFINE_FN(put, 8, mmx)
+DEFINE_FN(avg, 8, mmx)
+DEFINE_FN(put, 16, mmx)
+DEFINE_FN(avg, 16, mmx)
+#endif
 
 av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
 {
-    int cpu_flags = av_get_cpu_flags();
+    av_unused int cpu_flags = av_get_cpu_flags();
 
 #if HAVE_MMX_INLINE
     if (INLINE_MMX(cpu_flags)) {
@@ -240,6 +241,7 @@ av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
 #endif
     }
     if (EXTERNAL_MMXEXT(cpu_flags)) {
+        c->avg_pixels_tab[1][15]        = avg_rv40_qpel8_mc33_mmxext;
         c->avg_chroma_pixels_tab[0]     = ff_avg_rv40_chroma_mc8_mmxext;
         c->avg_chroma_pixels_tab[1]     = ff_avg_rv40_chroma_mc4_mmxext;
         c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmxext;
@@ -251,6 +253,8 @@ av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
 #endif
     }
     if (EXTERNAL_SSE2(cpu_flags)) {
+        c->put_pixels_tab[0][15]        = put_rv40_qpel16_mc33_sse2;
+        c->avg_pixels_tab[0][15]        = avg_rv40_qpel16_mc33_sse2;
         c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2;
         c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2;
         c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2;
@@ -259,6 +263,10 @@ av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
         QPEL_MC_SET(avg_, _sse2)
     }
     if (EXTERNAL_SSSE3(cpu_flags)) {
+        c->put_pixels_tab[0][15]        = put_rv40_qpel16_mc33_ssse3;
+        c->put_pixels_tab[1][15]        = put_rv40_qpel8_mc33_ssse3;
+        c->avg_pixels_tab[0][15]        = avg_rv40_qpel16_mc33_ssse3;
+        c->avg_pixels_tab[1][15]        = avg_rv40_qpel8_mc33_ssse3;
         c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3;
         c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3;
         c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3;
diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm
index b449de5..07a412b 100644
--- a/libavcodec/x86/sbrdsp.asm
+++ b/libavcodec/x86/sbrdsp.asm
@@ -2,20 +2,20 @@
 ;* AAC Spectral Band Replication decoding functions
 ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -25,7 +25,14 @@ SECTION_RODATA
 ; mask equivalent for multiply by -1.0 1.0
 ps_mask         times 2 dd 1<<31, 0
 ps_mask2        times 2 dd 0, 1<<31
-ps_neg          times 4 dd 1<<31
+ps_mask3        dd  0, 0, 0, 1<<31             ; sign bit set in the highest dword only (xorps operand in sbr_autocorrelate)
+ps_noise0       times 2 dd  1.0,  0.0          ; multiplier loaded into m0 by sbr_hf_apply_noise_0
+ps_noise2       times 2 dd -1.0,  0.0          ; multiplier loaded into m0 by sbr_hf_apply_noise_2
+ps_noise13      dd  0.0,  1.0, 0.0, -1.0       ; three 16-byte rows, indexed by (kx & 1) * 16:
+                dd  0.0, -1.0, 0.0,  1.0       ; sbr_hf_apply_noise_1 reads row 0 or 1,
+                dd  0.0,  1.0, 0.0, -1.0       ; sbr_hf_apply_noise_3 reads row 1 or 2 (base is ps_noise13+16)
+cextern         sbr_noise_table
+cextern         ps_neg
 
 SECTION .text
 
@@ -136,7 +143,6 @@ cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
     mulps      m2, bw             ; (a1[0] a1[1])*bw*bw = (a0 a1)
     mova       m3, m1
     mova       m4, m2
-    mova       m7, [ps_mask]
 
     ; Set pointers
 %if ARCH_X86_64 == 0 || WIN64
@@ -156,30 +162,28 @@ cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
     shl      start, 3            ; offset from num loops
 
     mova        m0, [X_lowq + start]
-    movlhps     m1, m1           ; (a2 a3 a2 a3)
-    movlhps     m2, m2           ; (a0 a1 a0 a1)
-    shufps      m3, m3, q0101    ; (a3 a2 a3 a2)
-    shufps      m4, m4, q0101    ; (a1 a0 a1 a0)
-    xorps       m3, m7           ; (-a3 a2 -a3 a2)
-    xorps       m4, m7           ; (-a1 a0 -a1 a0)
+    shufps      m3, m3, q1111
+    shufps      m4, m4, q1111
+    xorps       m3, [ps_mask]
+    shufps      m1, m1, q0000
+    shufps      m2, m2, q0000
+    xorps       m4, [ps_mask]
 .loop2:
-    mova        m5, m0
+    movu        m7, [X_lowq + start + 8]        ; BbCc
     mova        m6, m0
-    shufps      m0, m0, q2200    ; {Xl[-2][0],",Xl[-1][0],"}
-    shufps      m5, m5, q3311    ; {Xl[-2][1],",Xl[-1][1],"}
-    mulps       m0, m2
-    mulps       m5, m4
-    mova        m7, m6
-    addps       m5, m0
-    mova        m0, [X_lowq + start + 2*2*4]
-    shufps      m6, m0, q0022    ; {Xl[-1][0],",Xl[0][0],"}
-    shufps      m7, m0, q1133    ; {Xl[-1][1],",Xl[1][1],"}
-    mulps       m6, m1
+    mova        m5, m7
+    shufps      m0, m0, q2301                   ; aAbB
+    shufps      m7, m7, q2301                   ; bBcC
+    mulps       m0, m4
     mulps       m7, m3
-    addps       m5, m6
+    mulps       m6, m2
+    mulps       m5, m1
+    addps       m7, m0
+    mova        m0, [X_lowq + start +16]        ; CcDd
     addps       m7, m0
-    addps       m5, m7
-    mova  [X_highq + start], m5
+    addps       m6, m5
+    addps       m7, m6
+    mova  [X_highq + start], m7
     add     start, 16
     jnz         .loop2
     RET
@@ -246,33 +250,47 @@ cglobal sbr_neg_odd_64, 1,2,4,z
     jne      .loop
     REP_RET
 
-INIT_XMM sse2
 ; void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1)
+%macro SBR_QMF_DEINT_BFLY  0
 cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
     mov               cq, 64*4-2*mmsize
     lea            vrevq, [vq + 64*4]
 .loop:
     mova              m0, [src0q+cq]
     mova              m1, [src1q]
-    mova              m2, [src0q+cq+mmsize]
-    mova              m3, [src1q+mmsize]
-    pshufd            m4, m0, q0123
-    pshufd            m5, m1, q0123
-    pshufd            m6, m2, q0123
-    pshufd            m7, m3, q0123
-    addps             m3, m4
+    mova              m4, [src0q+cq+mmsize]
+    mova              m5, [src1q+mmsize]
+%if cpuflag(sse2)
+    pshufd            m2, m0, q0123
+    pshufd            m3, m1, q0123
+    pshufd            m6, m4, q0123
+    pshufd            m7, m5, q0123
+%else
+    shufps            m2, m0, m0, q0123
+    shufps            m3, m1, m1, q0123
+    shufps            m6, m4, m4, q0123
+    shufps            m7, m5, m5, q0123
+%endif
+    addps             m5, m2
     subps             m0, m7
     addps             m1, m6
-    subps             m2, m5
+    subps             m4, m3
     mova         [vrevq], m1
-    mova  [vrevq+mmsize], m3
+    mova  [vrevq+mmsize], m5
     mova         [vq+cq], m0
-    mova  [vq+cq+mmsize], m2
+    mova  [vq+cq+mmsize], m4
     add            src1q, 2*mmsize
     add            vrevq, 2*mmsize
     sub               cq, 2*mmsize
     jge            .loop
     REP_RET
+%endmacro
+
+INIT_XMM sse
+SBR_QMF_DEINT_BFLY
+
+INIT_XMM sse2
+SBR_QMF_DEINT_BFLY
 
 INIT_XMM sse2
 cglobal sbr_qmf_pre_shuffle, 1,4,6,z
@@ -303,3 +321,228 @@ cglobal sbr_qmf_pre_shuffle, 1,4,6,z
     movq       m2, [zq]
     movq    [r2q], m2
     REP_RET
+
+%ifdef PIC
+%define NREGS 1
+%if UNIX64
+%define NOISE_TABLE r6q ; r5q is m_max
+%else
+%define NOISE_TABLE r5q
+%endif
+%else
+%define NREGS 0
+%define NOISE_TABLE sbr_noise_table
+%endif
+
+%macro LOAD_NST  1
+%ifdef PIC
+    lea  NOISE_TABLE, [%1]
+    mova          m0, [kxq + NOISE_TABLE]
+%else
+    mova          m0, [kxq + %1]
+%endif
+%endmacro
+
+INIT_XMM sse2
+; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
+;                      const float *q_filt, int noise,
+;                      int kx, int m_max)
+cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+    mova       m0, [ps_noise0]
+    jmp apply_noise_main
+
+; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
+;                      const float *q_filt, int noise,
+;                      int kx, int m_max)
+cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+    and       kxq, 1
+    shl       kxq, 4
+    LOAD_NST  ps_noise13
+    jmp apply_noise_main
+
+; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
+;                      const float *q_filt, int noise,
+;                      int kx, int m_max)
+cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+    mova       m0, [ps_noise2]
+    jmp apply_noise_main
+
+; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
+;                      const float *q_filt, int noise,
+;                      int kx, int m_max)
+cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+    and       kxq, 1
+    shl       kxq, 4
+    LOAD_NST  ps_noise13+16
+
+apply_noise_main:
+%if ARCH_X86_64 == 0 || WIN64
+    mov       kxd, m_maxm
+%define count kxq
+%else
+%define count m_maxq
+%endif
+    movsxdifnidn    noiseq, noised
+    dec    noiseq
+    shl    count, 2
+%ifdef PIC
+    lea NOISE_TABLE, [sbr_noise_table]
+%endif
+    lea        Yq, [Yq + 2*count]
+    add      s_mq, count
+    add   q_filtq, count
+    shl    noiseq, 3
+    pxor       m5, m5
+    neg    count
+.loop:
+    mova       m1, [q_filtq + count]
+    movu       m3, [noiseq + NOISE_TABLE + 1*mmsize]
+    movu       m4, [noiseq + NOISE_TABLE + 2*mmsize]
+    add    noiseq, 2*mmsize
+    and    noiseq, 0x1ff<<3
+    punpckhdq  m2, m1, m1
+    punpckldq  m1, m1
+    mulps      m1, m3 ; m1 = q_filt[m] * ff_sbr_noise_table[noise]
+    mulps      m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
+    mova       m3, [s_mq + count]
+    ; TODO: replace by a vpermd in AVX2
+    punpckhdq  m4, m3, m3
+    punpckldq  m3, m3
+    pcmpeqd    m6, m3, m5 ; m6 = mask of lanes where m3 (s_m) == 0 (m5 is all zero)
+    pcmpeqd    m7, m4, m5 ; m7 = mask of lanes where m4 (s_m) == 0 (m5 is all zero)
+    mulps      m3, m0 ; s_m[m] * phi_sign
+    mulps      m4, m0 ; s_m[m] * phi_sign
+    pand       m1, m6
+    pand       m2, m7
+    movu       m6, [Yq + 2*count]
+    movu       m7, [Yq + 2*count + mmsize]
+    addps      m3, m1
+    addps      m4, m2
+    addps      m6, m3
+    addps      m7, m4
+    movu    [Yq + 2*count], m6
+    movu    [Yq + 2*count + mmsize], m7
+    add    count, mmsize
+    jl      .loop
+    RET
+
+INIT_XMM sse                                   ; de-interleave src: odd-indexed floats go reversed to the low part of v, even-indexed floats are xor'ed with ps_neg and go to the high part
+cglobal sbr_qmf_deint_neg, 2,4,4,v,src,vrev,c
+%define COUNT  32*4
+%define OFFSET 32*4
+    mov        cq, -COUNT                       ; c runs from -COUNT up to 0 in steps of mmsize
+    lea     vrevq, [vq + OFFSET + COUNT]        ; base for the ascending store; first write lands at v + OFFSET
+    add        vq, OFFSET-mmsize                ; v starts at the last vector below v + OFFSET and walks downwards
+    add      srcq, 2*COUNT                      ; bias src so that [srcq + 2*cq] starts at the original src
+    mova       m3, [ps_neg]                     ; external constant, presumably a sign-bit mask in every lane - confirm at its definition
+.loop:
+    mova       m0, [srcq + 2*cq + 0*mmsize]     ; src elements 0..3 of this iteration
+    mova       m1, [srcq + 2*cq + 1*mmsize]     ; src elements 4..7 of this iteration
+    shufps     m2, m0, m1, q2020                ; m2 = even-indexed elements: m0[0], m0[2], m1[0], m1[2]
+    shufps     m1, m0, q1313                    ; m1 = odd-indexed elements in reverse order: m1[3], m1[1], m0[3], m0[1]
+    xorps      m2, m3                           ; apply the ps_neg mask to the even-indexed elements
+    mova     [vq], m1                           ; reversed odd elements, stored at descending addresses
+    mova  [vrevq + cq], m2                      ; masked even elements, stored at ascending addresses
+    sub        vq, mmsize
+    add        cq, mmsize
+    jl      .loop                               ; continue while c is still negative
+    REP_RET
+
+%macro SBR_AUTOCORRELATE 0
+cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
+    mov   cntq, 37*8
+    add     xq, cntq
+    neg   cntq
+
+%if cpuflag(sse3)
+%define   MOVH  movsd
+    movddup m5, [xq+cntq]
+%else
+%define   MOVH  movlps
+    movlps  m5, [xq+cntq]
+    movlhps m5, m5
+%endif
+    MOVH    m7, [xq+cntq+8 ]
+    MOVH    m1, [xq+cntq+16]
+    shufps  m7, m7, q0110
+    shufps  m1, m1, q0110
+    mulps   m3, m5, m7   ;              x[0][0] * x[1][0], x[0][1] * x[1][1], x[0][0] * x[1][1], x[0][1] * x[1][0]
+    mulps   m4, m5, m5   ;              x[0][0] * x[0][0], x[0][1] * x[0][1];
+    mulps   m5, m1       ; real_sum2  = x[0][0] * x[2][0], x[0][1] * x[2][1]; imag_sum2 = x[0][0] * x[2][1], x[0][1] * x[2][0]
+    movaps  [rsp   ], m3
+    movaps  [rsp+16], m4
+    add   cntq, 8
+
+    MOVH    m2, [xq+cntq+16]
+    movlhps m7, m7
+    shufps  m2, m2, q0110
+    mulps   m6, m7, m1   ; real_sum1  = x[1][0] * x[2][0], x[1][1] * x[2][1]; imag_sum1  = x[1][0] * x[2][1], x[1][1] * x[2][0]
+    mulps   m4, m7, m2
+    mulps   m7, m7       ; real_sum0  = x[1][0] * x[1][0], x[1][1] * x[1][1];
+    addps   m5, m4       ; real_sum2 += x[1][0] * x[3][0], x[1][1] * x[3][1]; imag_sum2 += x[1][0] * x[3][1], x[1][1] * x[3][0]
+
+align 16
+.loop:
+    add   cntq, 8
+    MOVH    m0, [xq+cntq+16]
+    movlhps m1, m1
+    shufps  m0, m0, q0110
+    mulps   m3, m1, m2
+    mulps   m4, m1, m0
+    mulps   m1, m1
+    addps   m6, m3       ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
+    addps   m5, m4       ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
+    addps   m7, m1       ; real_sum0 += x[i][0] * x[i][0],     x[i][1] * x[i][1];
+    add   cntq, 8
+    MOVH    m1, [xq+cntq+16]
+    movlhps m2, m2
+    shufps  m1, m1, q0110
+    mulps   m3, m2, m0
+    mulps   m4, m2, m1
+    mulps   m2, m2
+    addps   m6, m3       ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
+    addps   m5, m4       ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
+    addps   m7, m2       ; real_sum0 += x[i][0] * x[i][0],     x[i][1] * x[i][1];
+    add   cntq, 8
+    MOVH    m2, [xq+cntq+16]
+    movlhps m0, m0
+    shufps  m2, m2, q0110
+    mulps   m3, m0, m1
+    mulps   m4, m0, m2
+    mulps   m0, m0
+    addps   m6, m3       ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
+    addps   m5, m4       ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
+    addps   m7, m0       ; real_sum0 += x[i][0] * x[i][0],     x[i][1] * x[i][1];
+    jl .loop
+
+    movlhps m1, m1
+    mulps   m2, m1
+    mulps   m1, m1
+    addps   m2, m6       ; real_sum1 + x[38][0] * x[39][0], x[38][1] * x[39][1]; imag_sum1 + x[38][0] * x[39][1], x[38][1] * x[39][0];
+    addps   m1, m7       ; real_sum0 + x[38][0] * x[38][0], x[38][1] * x[38][1];
+    addps   m6, [rsp   ] ; real_sum1 + x[ 0][0] * x[ 1][0], x[ 0][1] * x[ 1][1]; imag_sum1 + x[ 0][0] * x[ 1][1], x[ 0][1] * x[ 1][0];
+    addps   m7, [rsp+16] ; real_sum0 + x[ 0][0] * x[ 0][0], x[ 0][1] * x[ 0][1];
+
+    xorps   m2, [ps_mask3]
+    xorps   m5, [ps_mask3]
+    xorps   m6, [ps_mask3]
+    HADDPS  m2, m5, m3
+    HADDPS  m7, m6, m4
+%if cpuflag(sse3)
+    movshdup m0, m1
+%else
+    movss   m0, m1
+    shufps  m1, m1, q0001
+%endif
+    addss   m1, m0
+    movaps  [phiq     ], m2
+    movhps  [phiq+0x18], m7
+    movss   [phiq+0x28], m7
+    movss   [phiq+0x10], m1
+    RET
+%endmacro
+
+INIT_XMM sse
+SBR_AUTOCORRELATE
+INIT_XMM sse3
+SBR_AUTOCORRELATE
diff --git a/libavcodec/x86/sbrdsp_init.c b/libavcodec/x86/sbrdsp_init.c
index 9600852..6911a1a 100644
--- a/libavcodec/x86/sbrdsp_init.c
+++ b/libavcodec/x86/sbrdsp_init.c
@@ -2,20 +2,20 @@
  * AAC Spectral Band Replication decoding functions
  * Copyright (c) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,9 +34,28 @@ void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
                        float bw, int start, int end);
 void ff_sbr_neg_odd_64_sse(float *z);
 void ff_sbr_qmf_post_shuffle_sse(float W[32][2], const float *z);
+void ff_sbr_qmf_deint_bfly_sse(float *v, const float *src0, const float *src1);
 void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1);
 void ff_sbr_qmf_pre_shuffle_sse2(float *z);
 
+void ff_sbr_hf_apply_noise_0_sse2(float (*Y)[2], const float *s_m,
+                                  const float *q_filt, int noise,
+                                  int kx, int m_max);
+void ff_sbr_hf_apply_noise_1_sse2(float (*Y)[2], const float *s_m,
+                                  const float *q_filt, int noise,
+                                  int kx, int m_max);
+void ff_sbr_hf_apply_noise_2_sse2(float (*Y)[2], const float *s_m,
+                                  const float *q_filt, int noise,
+                                  int kx, int m_max);
+void ff_sbr_hf_apply_noise_3_sse2(float (*Y)[2], const float *s_m,
+                                  const float *q_filt, int noise,
+                                  int kx, int m_max);
+
+void ff_sbr_qmf_deint_neg_sse(float *v, const float *src);
+
+void ff_sbr_autocorrelate_sse (const float x[40][2], float phi[3][2][2]);
+void ff_sbr_autocorrelate_sse3(const float x[40][2], float phi[3][2][2]);
+
 av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -48,10 +67,21 @@ av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
         s->hf_g_filt  = ff_sbr_hf_g_filt_sse;
         s->hf_gen     = ff_sbr_hf_gen_sse;
         s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_sse;
+        s->qmf_deint_bfly   = ff_sbr_qmf_deint_bfly_sse;
+        s->qmf_deint_neg    = ff_sbr_qmf_deint_neg_sse;
+        s->autocorrelate    = ff_sbr_autocorrelate_sse;
     }
 
     if (EXTERNAL_SSE2(cpu_flags)) {
         s->qmf_deint_bfly   = ff_sbr_qmf_deint_bfly_sse2;
         s->qmf_pre_shuffle  = ff_sbr_qmf_pre_shuffle_sse2;
+        s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_sse2;
+        s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_sse2;
+        s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_sse2;
+        s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_sse2;
+    }
+
+    if (EXTERNAL_SSE3(cpu_flags)) {
+        s->autocorrelate = ff_sbr_autocorrelate_sse3;
     }
 }
diff --git a/libavcodec/x86/simple_idct.c b/libavcodec/x86/simple_idct.c
index 71763db..1d46212 100644
--- a/libavcodec/x86/simple_idct.c
+++ b/libavcodec/x86/simple_idct.c
@@ -3,24 +3,23 @@
  *
  * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "libavutil/internal.h"
 #include "libavutil/mem.h"
 #include "libavutil/x86/asm.h"
 
@@ -86,7 +85,7 @@ DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
 
 static inline void idct(int16_t *block)
 {
-        DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
+        LOCAL_ALIGNED_8(int64_t, align_tmp, [16]);
         int16_t * const temp= (int16_t*)align_tmp;
 
         __asm__ volatile(
@@ -1148,6 +1147,7 @@ Temp
 
 "9: \n\t"
                 :: "r" (block), "r" (temp), "r" (coeffs)
+                   NAMED_CONSTRAINTS_ADD(wm1010,d40000)
                 : "%eax"
         );
 }
diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h
index 4fc2914..8eeb31e 100644
--- a/libavcodec/x86/simple_idct.h
+++ b/libavcodec/x86/simple_idct.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,4 +25,16 @@ void ff_simple_idct_mmx(int16_t *block);
 void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block);
 void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block);
 
+void ff_simple_idct10_sse2(int16_t *block);
+void ff_simple_idct10_avx(int16_t *block);
+
+void ff_simple_idct10_put_sse2(uint8_t *dest, int line_size, int16_t *block);
+void ff_simple_idct10_put_avx(uint8_t *dest, int line_size, int16_t *block);
+
+void ff_simple_idct12_sse2(int16_t *block);
+void ff_simple_idct12_avx(int16_t *block);
+
+void ff_simple_idct12_put_sse2(uint8_t *dest, int line_size, int16_t *block);
+void ff_simple_idct12_put_avx(uint8_t *dest, int line_size, int16_t *block);
+
 #endif /* AVCODEC_X86_SIMPLE_IDCT_H */
diff --git a/libavcodec/x86/simple_idct10.asm b/libavcodec/x86/simple_idct10.asm
new file mode 100644
index 0000000..5dee533
--- /dev/null
+++ b/libavcodec/x86/simple_idct10.asm
@@ -0,0 +1,100 @@
+;******************************************************************************
+;* x86-SIMD-optimized IDCT for prores
+;* this is identical to "simple" IDCT written by Michael Niedermayer
+;* except for the clip range
+;*
+;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
+;* Copyright (c) 2015 Christophe Gisquet
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA
+
+cextern pw_2
+cextern pw_16
+cextern pw_1023
+cextern pw_4095
+pd_round_12: times 4 dd 1<<(12-1)
+pd_round_15: times 4 dd 1<<(15-1)
+pd_round_19: times 4 dd 1<<(19-1)
+
+%macro CONST_DEC  3
+const %1
+times 4 dw %2, %3
+%endmacro
+
+%define W1sh2 22725 ; W1 = 90901 = 22725<<2 + 1
+%define W2sh2 21407 ; W2 = 85627 = 21407<<2 - 1
+%define W3sh2 19265 ; W3 = 77062 = 19265<<2 + 2
+%define W4sh2 16384 ; W4 = 65535 = 16384<<2 - 1
+%define W5sh2 12873 ; W5 = 51491 = 12873<<2 - 1
+%define W6sh2  8867 ; W6 = 35468 =  8867<<2
+%define W7sh2  4520 ; W7 = 18081 =  4520<<2 + 1
+
+CONST_DEC  w4_plus_w2,   W4sh2, +W2sh2
+CONST_DEC  w4_min_w2,    W4sh2, -W2sh2
+CONST_DEC  w4_plus_w6,   W4sh2, +W6sh2
+CONST_DEC  w4_min_w6,    W4sh2, -W6sh2
+CONST_DEC  w1_plus_w3,   W1sh2, +W3sh2
+CONST_DEC  w3_min_w1,    W3sh2, -W1sh2
+CONST_DEC  w7_plus_w3,   W7sh2, +W3sh2
+CONST_DEC  w3_min_w7,    W3sh2, -W7sh2
+CONST_DEC  w1_plus_w5,   W1sh2, +W5sh2
+CONST_DEC  w5_min_w1,    W5sh2, -W1sh2
+CONST_DEC  w5_plus_w7,   W5sh2, +W7sh2
+CONST_DEC  w7_min_w5,    W7sh2, -W5sh2
+
+%include "libavcodec/x86/simple_idct10_template.asm"
+
+SECTION .text
+
+%macro idct_fn 0
+cglobal simple_idct10, 1, 1, 16
+    IDCT_FN    "", 12, "", 19
+    RET
+
+cglobal simple_idct10_put, 3, 3, 16
+    IDCT_FN    "", 12, "", 19, 0, pw_1023
+    RET
+
+cglobal simple_idct12, 1, 1, 16
+    ; coeffs are already 15bits, adding the offset would cause
+    ; overflow in the input
+    IDCT_FN    "", 15, pw_2, 16
+    RET
+
+cglobal simple_idct12_put, 3, 3, 16
+    ; range isn't known, so the C simple_idct range is used
+    ; Also, using a bias on input overflows, so use the bias
+    ; on output of the first butterfly instead
+    IDCT_FN    "", 15, pw_2, 16, 0, pw_4095
+    RET
+%endmacro
+
+INIT_XMM sse2
+idct_fn
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+idct_fn
+%endif
+
+%endif
diff --git a/libavcodec/x86/simple_idct10_template.asm b/libavcodec/x86/simple_idct10_template.asm
new file mode 100644
index 0000000..e5deb0f
--- /dev/null
+++ b/libavcodec/x86/simple_idct10_template.asm
@@ -0,0 +1,315 @@
+;******************************************************************************
+;* x86-SIMD-optimized IDCT for prores
+;* this is identical to "simple" IDCT written by Michael Niedermayer
+;* except for the clip range
+;*
+;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+; add SECTION_RODATA and proper include before including this file!
+
+%if ARCH_X86_64
+
+; interleave data while maintaining source
+; %1=type, %2=dstlo, %3=dsthi, %4=src, %5=interleave
+%macro SBUTTERFLY3 5
+    punpckl%1   m%2, m%4, m%5
+    punpckh%1   m%3, m%4, m%5
+%endmacro
+
+; %1/%2=src1/dst1, %3/%4=dst2, %5/%6=src2, %7=shift
+; action: %3/%4 = %1/%2 - %5/%6; %1/%2 += %5/%6
+;         %1/%2/%3/%4 >>= %7; dword -> word (in %1/%3)
+%macro SUMSUB_SHPK 7
+    psubd       %3,  %1,  %5       ; { a0 - b0 }[0-3]
+    psubd       %4,  %2,  %6       ; { a0 - b0 }[4-7]
+    paddd       %1,  %5            ; { a0 + b0 }[0-3]
+    paddd       %2,  %6            ; { a0 + b0 }[4-7]
+    psrad       %1,  %7
+    psrad       %2,  %7
+    psrad       %3,  %7
+    psrad       %4,  %7
+    packssdw    %1,  %2            ; row[0]
+    packssdw    %3,  %4            ; row[7]
+%endmacro
+
+; %1 = initial bias ("" if nop)
+; %2 = number of bits to shift at the end
+; %3 = qmat (for prores)
+%macro IDCT_1D 2-3
+    ; a0 = (W4 * row[0]) + (1 << (15 - 1));
+    ; a1 = a0;
+    ; a2 = a0;
+    ; a3 = a0;
+    ; a0 += W2 * row[2];
+    ; a1 += W6 * row[2];
+    ; a2 -= W6 * row[2];
+    ; a3 -= W2 * row[2];
+%ifstr %1
+    mova        m15, [pd_round_ %+ %2]
+%else
+    paddw       m10, [%1]
+%endif
+    SBUTTERFLY3 wd,  0,  1, 10,  8 ; { row[0], row[2] }[0-3]/[4-7]
+    pmaddwd     m2,  m0, [w4_plus_w6]
+    pmaddwd     m3,  m1, [w4_plus_w6]
+    pmaddwd     m4,  m0, [w4_min_w6]
+    pmaddwd     m5,  m1, [w4_min_w6]
+    pmaddwd     m6,  m0, [w4_min_w2]
+    pmaddwd     m7,  m1, [w4_min_w2]
+    pmaddwd     m0, [w4_plus_w2]
+    pmaddwd     m1, [w4_plus_w2]
+%ifstr %1
+    ; Adding 1<<(%2-1) for >=15 bits values
+    paddd       m2, m15
+    paddd       m3, m15
+    paddd       m4, m15
+    paddd       m5, m15
+    paddd       m6, m15
+    paddd       m7, m15
+    paddd       m0, m15
+    paddd       m1, m15
+%endif
+
+    ; a0: -1*row[0]-1*row[2]
+    ; a1: -1*row[0]
+    ; a2: -1*row[0]
+    ; a3: -1*row[0]+1*row[2]
+
+    ; a0 +=   W4*row[4] + W6*row[6]; i.e. -1*row[4]
+    ; a1 -=   W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6]
+    ; a2 -=   W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6]
+    ; a3 +=   W4*row[4] - W6*row[6]; i.e. -1*row[4]
+    SBUTTERFLY3 wd,  8,  9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7]
+    pmaddwd     m10, m8, [w4_plus_w6]
+    pmaddwd     m11, m9, [w4_plus_w6]
+    paddd       m0,  m10            ; a0[0-3]
+    paddd       m1,  m11            ; a0[4-7]
+    pmaddwd     m10, m8, [w4_min_w6]
+    pmaddwd     m11, m9, [w4_min_w6]
+    paddd       m6,  m10           ; a3[0-3]
+    paddd       m7,  m11           ; a3[4-7]
+    pmaddwd     m10, m8, [w4_min_w2]
+    pmaddwd     m11, m9, [w4_min_w2]
+    pmaddwd     m8, [w4_plus_w2]
+    pmaddwd     m9, [w4_plus_w2]
+    psubd       m4,  m10           ; a2[0-3] intermediate
+    psubd       m5,  m11           ; a2[4-7] intermediate
+    psubd       m2,  m8            ; a1[0-3] intermediate
+    psubd       m3,  m9            ; a1[4-7] intermediate
+
+    ; load/store
+    mova   [COEFFS+  0], m0
+    mova   [COEFFS+ 32], m2
+    mova   [COEFFS+ 64], m4
+    mova   [COEFFS+ 96], m6
+    mova        m10,[COEFFS+ 16]       ; { row[1] }[0-7]
+    mova        m8, [COEFFS+ 48]       ; { row[3] }[0-7]
+    mova        m13,[COEFFS+ 80]       ; { row[5] }[0-7]
+    mova        m14,[COEFFS+112]       ; { row[7] }[0-7]
+    mova   [COEFFS+ 16], m1
+    mova   [COEFFS+ 48], m3
+    mova   [COEFFS+ 80], m5
+    mova   [COEFFS+112], m7
+%if %0 == 3
+    pmullw      m10,[%3+ 16]
+    pmullw      m8, [%3+ 48]
+    pmullw      m13,[%3+ 80]
+    pmullw      m14,[%3+112]
+%endif
+
+    ; b0 = MUL(W1, row[1]);
+    ; MAC(b0, W3, row[3]);
+    ; b1 = MUL(W3, row[1]);
+    ; MAC(b1, -W7, row[3]);
+    ; b2 = MUL(W5, row[1]);
+    ; MAC(b2, -W1, row[3]);
+    ; b3 = MUL(W7, row[1]);
+    ; MAC(b3, -W5, row[3]);
+    SBUTTERFLY3 wd,  0,  1, 10, 8  ; { row[1], row[3] }[0-3]/[4-7]
+    pmaddwd     m2,  m0, [w3_min_w7]
+    pmaddwd     m3,  m1, [w3_min_w7]
+    pmaddwd     m4,  m0, [w5_min_w1]
+    pmaddwd     m5,  m1, [w5_min_w1]
+    pmaddwd     m6,  m0, [w7_min_w5]
+    pmaddwd     m7,  m1, [w7_min_w5]
+    pmaddwd     m0, [w1_plus_w3]
+    pmaddwd     m1, [w1_plus_w3]
+
+    ; b0: +1*row[1]+2*row[3]
+    ; b1: +2*row[1]-1*row[3]
+    ; b2: -1*row[1]-1*row[3]
+    ; b3: +1*row[1]+1*row[3]
+
+    ; MAC(b0,  W5, row[5]);
+    ; MAC(b0,  W7, row[7]);
+    ; MAC(b1, -W1, row[5]);
+    ; MAC(b1, -W5, row[7]);
+    ; MAC(b2,  W7, row[5]);
+    ; MAC(b2,  W3, row[7]);
+    ; MAC(b3,  W3, row[5]);
+    ; MAC(b3, -W1, row[7]);
+    SBUTTERFLY3 wd,  8,  9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7]
+
+    ; b0: -1*row[5]+1*row[7]
+    ; b1: -1*row[5]+1*row[7]
+    ; b2: +1*row[5]+2*row[7]
+    ; b3: +2*row[5]-1*row[7]
+
+    pmaddwd     m10, m8, [w1_plus_w5]
+    pmaddwd     m11, m9, [w1_plus_w5]
+    pmaddwd     m12, m8, [w5_plus_w7]
+    pmaddwd     m13, m9, [w5_plus_w7]
+    psubd       m2,  m10           ; b1[0-3]
+    psubd       m3,  m11           ; b1[4-7]
+    paddd       m0,  m12            ; b0[0-3]
+    paddd       m1,  m13            ; b0[4-7]
+    pmaddwd     m12, m8, [w7_plus_w3]
+    pmaddwd     m13, m9, [w7_plus_w3]
+    pmaddwd     m8, [w3_min_w1]
+    pmaddwd     m9, [w3_min_w1]
+    paddd       m4,  m12           ; b2[0-3]
+    paddd       m5,  m13           ; b2[4-7]
+    paddd       m6,  m8            ; b3[0-3]
+    paddd       m7,  m9            ; b3[4-7]
+
+    ; row[0] = (a0 + b0) >> 15;
+    ; row[7] = (a0 - b0) >> 15;
+    ; row[1] = (a1 + b1) >> 15;
+    ; row[6] = (a1 - b1) >> 15;
+    ; row[2] = (a2 + b2) >> 15;
+    ; row[5] = (a2 - b2) >> 15;
+    ; row[3] = (a3 + b3) >> 15;
+    ; row[4] = (a3 - b3) >> 15;
+    mova        m8, [COEFFS+ 0]        ; a0[0-3]
+    mova        m9, [COEFFS+16]        ; a0[4-7]
+    SUMSUB_SHPK m8,  m9,  m10, m11, m0,  m1,  %2
+    mova        m0, [COEFFS+32]        ; a1[0-3]
+    mova        m1, [COEFFS+48]        ; a1[4-7]
+    SUMSUB_SHPK m0,  m1,  m9,  m11, m2,  m3,  %2
+    mova        m1, [COEFFS+64]        ; a2[0-3]
+    mova        m2, [COEFFS+80]        ; a2[4-7]
+    SUMSUB_SHPK m1,  m2,  m11, m3,  m4,  m5,  %2
+    mova        m2, [COEFFS+96]        ; a3[0-3]
+    mova        m3, [COEFFS+112]       ; a3[4-7]
+    SUMSUB_SHPK m2,  m3,  m4,  m5,  m6,  m7,  %2
+%endmacro
+
+; void ff_prores_idct_put_10_<opt>(uint8_t *pixels, int stride,
+;                                  int16_t *block, const int16_t *qmat);
+
+; %1 = row shift
+; %2 = row bias macro
+; %3 = column shift
+; %4 = column bias macro
+; %5 = min pixel value
+; %6 = max pixel value
+; %7 = qmat (for prores)
+
+%macro IDCT_FN 4-7
+%if %0 == 4
+    ; No clamping, means pure idct
+%xdefine COEFFS r0
+%else
+    movsxd      r1,  r1d
+%xdefine COEFFS r2
+%endif
+
+    ; for (i = 0; i < 8; i++)
+    ;     idctRowCondDC(block + i*8);
+    mova        m10,[COEFFS+ 0]        ; { row[0] }[0-7]
+    mova        m8, [COEFFS+32]        ; { row[2] }[0-7]
+    mova        m13,[COEFFS+64]        ; { row[4] }[0-7]
+    mova        m12,[COEFFS+96]        ; { row[6] }[0-7]
+
+%if %0 == 7
+    pmullw      m10,[%7+ 0]
+    pmullw      m8, [%7+32]
+    pmullw      m13,[%7+64]
+    pmullw      m12,[%7+96]
+
+    IDCT_1D     %1, %2, %7
+%else
+    IDCT_1D     %1, %2
+%endif
+
+    ; transpose for second part of IDCT
+    TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
+    mova   [COEFFS+ 16], m0
+    mova   [COEFFS+ 48], m2
+    mova   [COEFFS+ 80], m11
+    mova   [COEFFS+112], m10
+    SWAP         8,  10
+    SWAP         1,   8
+    SWAP         4,  13
+    SWAP         9,  12
+
+    ; for (i = 0; i < 8; i++)
+    ;     idctSparseColAdd(dest + i, line_size, block + i);
+    IDCT_1D     %3, %4
+
+    ; clip/store
+%if %0 == 4
+    ; No clamping, means pure idct
+    mova  [r0+  0], m8
+    mova  [r0+ 16], m0
+    mova  [r0+ 32], m1
+    mova  [r0+ 48], m2
+    mova  [r0+ 64], m4
+    mova  [r0+ 80], m11
+    mova  [r0+ 96], m9
+    mova  [r0+112], m10
+%else
+%ifidn %5, 0
+    pxor        m3, m3
+%else
+    mova        m3, [%5]
+%endif
+    mova        m5, [%6]
+    pmaxsw      m8,  m3
+    pmaxsw      m0,  m3
+    pmaxsw      m1,  m3
+    pmaxsw      m2,  m3
+    pmaxsw      m4,  m3
+    pmaxsw      m11, m3
+    pmaxsw      m9,  m3
+    pmaxsw      m10, m3
+    pminsw      m8,  m5
+    pminsw      m0,  m5
+    pminsw      m1,  m5
+    pminsw      m2,  m5
+    pminsw      m4,  m5
+    pminsw      m11, m5
+    pminsw      m9,  m5
+    pminsw      m10, m5
+
+    lea         r2, [r1*3]
+    mova  [r0     ], m8
+    mova  [r0+r1  ], m0
+    mova  [r0+r1*2], m1
+    mova  [r0+r2  ], m2
+    lea         r0, [r0+r1*4]
+    mova  [r0     ], m4
+    mova  [r0+r1  ], m11
+    mova  [r0+r1*2], m9
+    mova  [r0+r2  ], m10
+%endif
+%endmacro
+
+%endif
diff --git a/libavcodec/x86/snowdsp.c b/libavcodec/x86/snowdsp.c
new file mode 100644
index 0000000..e2ad511
--- /dev/null
+++ b/libavcodec/x86/snowdsp.c
@@ -0,0 +1,908 @@
+/*
+ * MMX and SSE2 optimized snow DSP utils
+ * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/snow.h"
+#include "libavcodec/snow_dwt.h"
+
+#if HAVE_INLINE_ASM
+
+static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, IDWTELEM *temp, int width){
+    const int w2= (width+1)>>1;
+    const int w_l= (width>>1);
+    const int w_r= w2 - 1;
+    int i;
+
+    { // Lift 0
+        IDWTELEM * const ref = b + w2 - 1;
+        IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice
+        // (the first time erroneously), we allow the SSE2 code to run an extra pass.
+        // The savings in code and time are well worth having to store this value and
+        // calculate b[0] correctly afterwards.
+
+        i = 0;
+        __asm__ volatile(
+            "pcmpeqd   %%xmm7, %%xmm7         \n\t"
+            "pcmpeqd   %%xmm3, %%xmm3         \n\t"
+            "psllw         $1, %%xmm3         \n\t"
+            "paddw     %%xmm7, %%xmm3         \n\t"
+            "psllw        $13, %%xmm3         \n\t"
+        ::);
+        for(; i<w_l-15; i+=16){
+            __asm__ volatile(
+                "movdqu   (%1), %%xmm1        \n\t"
+                "movdqu 16(%1), %%xmm5        \n\t"
+                "movdqu  2(%1), %%xmm2        \n\t"
+                "movdqu 18(%1), %%xmm6        \n\t"
+                "paddw  %%xmm1, %%xmm2        \n\t"
+                "paddw  %%xmm5, %%xmm6        \n\t"
+                "paddw  %%xmm7, %%xmm2        \n\t"
+                "paddw  %%xmm7, %%xmm6        \n\t"
+                "pmulhw %%xmm3, %%xmm2        \n\t"
+                "pmulhw %%xmm3, %%xmm6        \n\t"
+                "paddw    (%0), %%xmm2        \n\t"
+                "paddw  16(%0), %%xmm6        \n\t"
+                "movdqa %%xmm2, (%0)          \n\t"
+                "movdqa %%xmm6, 16(%0)        \n\t"
+                :: "r"(&b[i]), "r"(&ref[i])
+                : "memory"
+            );
+        }
+        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
+        b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
+    }
+
+    { // Lift 1
+        IDWTELEM * const dst = b+w2;
+
+        i = 0;
+        for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){
+            dst[i] = dst[i] - (b[i] + b[i + 1]);
+        }
+        for(; i<w_r-15; i+=16){
+            __asm__ volatile(
+                "movdqu   (%1), %%xmm1        \n\t"
+                "movdqu 16(%1), %%xmm5        \n\t"
+                "movdqu  2(%1), %%xmm2        \n\t"
+                "movdqu 18(%1), %%xmm6        \n\t"
+                "paddw  %%xmm1, %%xmm2        \n\t"
+                "paddw  %%xmm5, %%xmm6        \n\t"
+                "movdqa   (%0), %%xmm0        \n\t"
+                "movdqa 16(%0), %%xmm4        \n\t"
+                "psubw  %%xmm2, %%xmm0        \n\t"
+                "psubw  %%xmm6, %%xmm4        \n\t"
+                "movdqa %%xmm0, (%0)          \n\t"
+                "movdqa %%xmm4, 16(%0)        \n\t"
+                :: "r"(&dst[i]), "r"(&b[i])
+                : "memory"
+            );
+        }
+        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
+    }
+
+    { // Lift 2
+        IDWTELEM * const ref = b+w2 - 1;
+        IDWTELEM b_0 = b[0];
+
+        i = 0;
+        __asm__ volatile(
+            "psllw         $15, %%xmm7        \n\t"
+            "pcmpeqw    %%xmm6, %%xmm6        \n\t"
+            "psrlw         $13, %%xmm6        \n\t"
+            "paddw      %%xmm7, %%xmm6        \n\t"
+        ::);
+        for(; i<w_l-15; i+=16){
+            __asm__ volatile(
+                "movdqu   (%1), %%xmm0        \n\t"
+                "movdqu 16(%1), %%xmm4        \n\t"
+                "movdqu  2(%1), %%xmm1        \n\t"
+                "movdqu 18(%1), %%xmm5        \n\t" //FIXME try aligned reads and shifts
+                "paddw  %%xmm6, %%xmm0        \n\t"
+                "paddw  %%xmm6, %%xmm4        \n\t"
+                "paddw  %%xmm7, %%xmm1        \n\t"
+                "paddw  %%xmm7, %%xmm5        \n\t"
+                "pavgw  %%xmm1, %%xmm0        \n\t"
+                "pavgw  %%xmm5, %%xmm4        \n\t"
+                "psubw  %%xmm7, %%xmm0        \n\t"
+                "psubw  %%xmm7, %%xmm4        \n\t"
+                "psraw      $1, %%xmm0        \n\t"
+                "psraw      $1, %%xmm4        \n\t"
+                "movdqa   (%0), %%xmm1        \n\t"
+                "movdqa 16(%0), %%xmm5        \n\t"
+                "paddw  %%xmm1, %%xmm0        \n\t"
+                "paddw  %%xmm5, %%xmm4        \n\t"
+                "psraw      $2, %%xmm0        \n\t"
+                "psraw      $2, %%xmm4        \n\t"
+                "paddw  %%xmm1, %%xmm0        \n\t"
+                "paddw  %%xmm5, %%xmm4        \n\t"
+                "movdqa %%xmm0, (%0)          \n\t"
+                "movdqa %%xmm4, 16(%0)        \n\t"
+                :: "r"(&b[i]), "r"(&ref[i])
+                : "memory"
+            );
+        }
+        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
+        b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS);
+    }
+
+    { // Lift 3
+        IDWTELEM * const src = b+w2;
+
+        i = 0;
+        for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){
+            temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
+        }
+        for(; i<w_r-7; i+=8){
+            __asm__ volatile(
+                "movdqu  2(%1), %%xmm2        \n\t"
+                "movdqu 18(%1), %%xmm6        \n\t"
+                "paddw    (%1), %%xmm2        \n\t"
+                "paddw  16(%1), %%xmm6        \n\t"
+                "movdqu   (%0), %%xmm0        \n\t"
+                "movdqu 16(%0), %%xmm4        \n\t"
+                "paddw  %%xmm2, %%xmm0        \n\t"
+                "paddw  %%xmm6, %%xmm4        \n\t"
+                "psraw      $1, %%xmm2        \n\t"
+                "psraw      $1, %%xmm6        \n\t"
+                "paddw  %%xmm0, %%xmm2        \n\t"
+                "paddw  %%xmm4, %%xmm6        \n\t"
+                "movdqa %%xmm2, (%2)          \n\t"
+                "movdqa %%xmm6, 16(%2)        \n\t"
+                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
+                 : "memory"
+               );
+        }
+        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
+    }
+
+    {
+        snow_interleave_line_header(&i, width, b, temp);
+
+        for (; (i & 0x3E) != 0x3E; i-=2){
+            b[i+1] = temp[i>>1];
+            b[i] = b[i>>1];
+        }
+        for (i-=62; i>=0; i-=64){
+            __asm__ volatile(
+                "movdqa      (%1), %%xmm0       \n\t"
+                "movdqa    16(%1), %%xmm2       \n\t"
+                "movdqa    32(%1), %%xmm4       \n\t"
+                "movdqa    48(%1), %%xmm6       \n\t"
+                "movdqa      (%1), %%xmm1       \n\t"
+                "movdqa    16(%1), %%xmm3       \n\t"
+                "movdqa    32(%1), %%xmm5       \n\t"
+                "movdqa    48(%1), %%xmm7       \n\t"
+                "punpcklwd   (%2), %%xmm0       \n\t"
+                "punpcklwd 16(%2), %%xmm2       \n\t"
+                "punpcklwd 32(%2), %%xmm4       \n\t"
+                "punpcklwd 48(%2), %%xmm6       \n\t"
+                "movdqa    %%xmm0, (%0)         \n\t"
+                "movdqa    %%xmm2, 32(%0)       \n\t"
+                "movdqa    %%xmm4, 64(%0)       \n\t"
+                "movdqa    %%xmm6, 96(%0)       \n\t"
+                "punpckhwd   (%2), %%xmm1       \n\t"
+                "punpckhwd 16(%2), %%xmm3       \n\t"
+                "punpckhwd 32(%2), %%xmm5       \n\t"
+                "punpckhwd 48(%2), %%xmm7       \n\t"
+                "movdqa    %%xmm1, 16(%0)       \n\t"
+                "movdqa    %%xmm3, 48(%0)       \n\t"
+                "movdqa    %%xmm5, 80(%0)       \n\t"
+                "movdqa    %%xmm7, 112(%0)      \n\t"
+                :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
+                 : "memory"
+               );
+        }
+    }
+}
+
+static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){
+    const int w2= (width+1)>>1;
+    const int w_l= (width>>1);
+    const int w_r= w2 - 1;
+    int i;
+
+    { // Lift 0
+        IDWTELEM * const ref = b + w2 - 1;
+
+        i = 1;
+        b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
+        __asm__ volatile(
+            "pcmpeqw    %%mm7, %%mm7         \n\t"
+            "pcmpeqw    %%mm3, %%mm3         \n\t"
+            "psllw         $1, %%mm3         \n\t"
+            "paddw      %%mm7, %%mm3         \n\t"
+            "psllw        $13, %%mm3         \n\t"
+           ::);
+        for(; i<w_l-7; i+=8){
+            __asm__ volatile(
+                "movq     (%1), %%mm2        \n\t"
+                "movq    8(%1), %%mm6        \n\t"
+                "paddw   2(%1), %%mm2        \n\t"
+                "paddw  10(%1), %%mm6        \n\t"
+                "paddw   %%mm7, %%mm2        \n\t"
+                "paddw   %%mm7, %%mm6        \n\t"
+                "pmulhw  %%mm3, %%mm2        \n\t"
+                "pmulhw  %%mm3, %%mm6        \n\t"
+                "paddw    (%0), %%mm2        \n\t"
+                "paddw   8(%0), %%mm6        \n\t"
+                "movq    %%mm2, (%0)         \n\t"
+                "movq    %%mm6, 8(%0)        \n\t"
+                :: "r"(&b[i]), "r"(&ref[i])
+                 : "memory"
+               );
+        }
+        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
+    }
+
+    { // Lift 1
+        IDWTELEM * const dst = b+w2;
+
+        i = 0;
+        for(; i<w_r-7; i+=8){
+            __asm__ volatile(
+                "movq     (%1), %%mm2        \n\t"
+                "movq    8(%1), %%mm6        \n\t"
+                "paddw   2(%1), %%mm2        \n\t"
+                "paddw  10(%1), %%mm6        \n\t"
+                "movq     (%0), %%mm0        \n\t"
+                "movq    8(%0), %%mm4        \n\t"
+                "psubw   %%mm2, %%mm0        \n\t"
+                "psubw   %%mm6, %%mm4        \n\t"
+                "movq    %%mm0, (%0)         \n\t"
+                "movq    %%mm4, 8(%0)        \n\t"
+                :: "r"(&dst[i]), "r"(&b[i])
+                 : "memory"
+               );
+        }
+        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
+    }
+
+    { // Lift 2
+        IDWTELEM * const ref = b+w2 - 1;
+
+        i = 1;
+        b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
+        __asm__ volatile(
+            "psllw         $15, %%mm7        \n\t"
+            "pcmpeqw     %%mm6, %%mm6        \n\t"
+            "psrlw         $13, %%mm6        \n\t"
+            "paddw       %%mm7, %%mm6        \n\t"
+           ::);
+        for(; i<w_l-7; i+=8){
+            __asm__ volatile(
+                "movq     (%1), %%mm0        \n\t"
+                "movq    8(%1), %%mm4        \n\t"
+                "movq    2(%1), %%mm1        \n\t"
+                "movq   10(%1), %%mm5        \n\t"
+                "paddw   %%mm6, %%mm0        \n\t"
+                "paddw   %%mm6, %%mm4        \n\t"
+                "paddw   %%mm7, %%mm1        \n\t"
+                "paddw   %%mm7, %%mm5        \n\t"
+                "pavgw   %%mm1, %%mm0        \n\t"
+                "pavgw   %%mm5, %%mm4        \n\t"
+                "psubw   %%mm7, %%mm0        \n\t"
+                "psubw   %%mm7, %%mm4        \n\t"
+                "psraw      $1, %%mm0        \n\t"
+                "psraw      $1, %%mm4        \n\t"
+                "movq     (%0), %%mm1        \n\t"
+                "movq    8(%0), %%mm5        \n\t"
+                "paddw   %%mm1, %%mm0        \n\t"
+                "paddw   %%mm5, %%mm4        \n\t"
+                "psraw      $2, %%mm0        \n\t"
+                "psraw      $2, %%mm4        \n\t"
+                "paddw   %%mm1, %%mm0        \n\t"
+                "paddw   %%mm5, %%mm4        \n\t"
+                "movq    %%mm0, (%0)         \n\t"
+                "movq    %%mm4, 8(%0)        \n\t"
+                :: "r"(&b[i]), "r"(&ref[i])
+                 : "memory"
+               );
+        }
+        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
+    }
+
+    { // Lift 3
+        IDWTELEM * const src = b+w2;
+        i = 0;
+
+        for(; i<w_r-7; i+=8){
+            __asm__ volatile(
+                "movq    2(%1), %%mm2        \n\t"
+                "movq   10(%1), %%mm6        \n\t"
+                "paddw    (%1), %%mm2        \n\t"
+                "paddw   8(%1), %%mm6        \n\t"
+                "movq     (%0), %%mm0        \n\t"
+                "movq    8(%0), %%mm4        \n\t"
+                "paddw   %%mm2, %%mm0        \n\t"
+                "paddw   %%mm6, %%mm4        \n\t"
+                "psraw      $1, %%mm2        \n\t"
+                "psraw      $1, %%mm6        \n\t"
+                "paddw   %%mm0, %%mm2        \n\t"
+                "paddw   %%mm4, %%mm6        \n\t"
+                "movq    %%mm2, (%2)         \n\t"
+                "movq    %%mm6, 8(%2)        \n\t"
+                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
+                 : "memory"
+               );
+        }
+        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
+    }
+
+    {
+        snow_interleave_line_header(&i, width, b, temp);
+
+        for (; (i & 0x1E) != 0x1E; i-=2){
+            b[i+1] = temp[i>>1];
+            b[i] = b[i>>1];
+        }
+        for (i-=30; i>=0; i-=32){
+            __asm__ volatile(
+                "movq        (%1), %%mm0       \n\t"
+                "movq       8(%1), %%mm2       \n\t"
+                "movq      16(%1), %%mm4       \n\t"
+                "movq      24(%1), %%mm6       \n\t"
+                "movq        (%1), %%mm1       \n\t"
+                "movq       8(%1), %%mm3       \n\t"
+                "movq      16(%1), %%mm5       \n\t"
+                "movq      24(%1), %%mm7       \n\t"
+                "punpcklwd   (%2), %%mm0       \n\t"
+                "punpcklwd  8(%2), %%mm2       \n\t"
+                "punpcklwd 16(%2), %%mm4       \n\t"
+                "punpcklwd 24(%2), %%mm6       \n\t"
+                "movq       %%mm0, (%0)        \n\t"
+                "movq       %%mm2, 16(%0)      \n\t"
+                "movq       %%mm4, 32(%0)      \n\t"
+                "movq       %%mm6, 48(%0)      \n\t"
+                "punpckhwd   (%2), %%mm1       \n\t"
+                "punpckhwd  8(%2), %%mm3       \n\t"
+                "punpckhwd 16(%2), %%mm5       \n\t"
+                "punpckhwd 24(%2), %%mm7       \n\t"
+                "movq       %%mm1, 8(%0)       \n\t"
+                "movq       %%mm3, 24(%0)      \n\t"
+                "movq       %%mm5, 40(%0)      \n\t"
+                "movq       %%mm7, 56(%0)      \n\t"
+                :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
+                 : "memory"
+               );
+        }
+    }
+}
+
+#if HAVE_7REGS
+#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
+        ""op" ("r",%%"REG_d"), %%"t0"      \n\t"\
+        ""op" 16("r",%%"REG_d"), %%"t1"    \n\t"\
+        ""op" 32("r",%%"REG_d"), %%"t2"    \n\t"\
+        ""op" 48("r",%%"REG_d"), %%"t3"    \n\t"
+/* The macro above applies op to the 16-byte slots at r+REG_d+{0,16,32,48}, one per register; _load below uses movdqa. */
+#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
+        snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)
+/* Add (paddw) the same four 16-byte slots of r into t0..t3. */
+#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
+        snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3)
+/* Word-wise register subtract: t0..t3 -= s0..s3. */
+#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
+        "psubw %%"s0", %%"t0" \n\t"\
+        "psubw %%"s1", %%"t1" \n\t"\
+        "psubw %%"s2", %%"t2" \n\t"\
+        "psubw %%"s3", %%"t3" \n\t"
+/* Store s0..s3 with movdqa to the four 16-byte slots at w+REG_d. */
+#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
+        "movdqa %%"s0", ("w",%%"REG_d")      \n\t"\
+        "movdqa %%"s1", 16("w",%%"REG_d")    \n\t"\
+        "movdqa %%"s2", 32("w",%%"REG_d")    \n\t"\
+        "movdqa %%"s3", 48("w",%%"REG_d")    \n\t"
+/* Arithmetic right shift of every word in t0..t3 by the immediate n (passed as a string). */
+#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\
+        "psraw $"n", %%"t0" \n\t"\
+        "psraw $"n", %%"t1" \n\t"\
+        "psraw $"n", %%"t2" \n\t"\
+        "psraw $"n", %%"t3" \n\t"
+/* Word-wise register add: t0..t3 += s0..s3. */
+#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
+        "paddw %%"s0", %%"t0" \n\t"\
+        "paddw %%"s1", %%"t1" \n\t"\
+        "paddw %%"s2", %%"t2" \n\t"\
+        "paddw %%"s3", %%"t3" \n\t"
+/* Signed word multiply keeping the high 16 bits of each product (pmulhw): t_k = (t_k * s_k) >> 16. */
+#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
+        "pmulhw %%"s0", %%"t0" \n\t"\
+        "pmulhw %%"s1", %%"t1" \n\t"\
+        "pmulhw %%"s2", %%"t2" \n\t"\
+        "pmulhw %%"s3", %%"t3" \n\t"
+/* Register copy with movdqa: t0..t3 = s0..s3. */
+#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
+        "movdqa %%"s0", %%"t0" \n\t"\
+        "movdqa %%"s1", %%"t1" \n\t"\
+        "movdqa %%"s2", %%"t2" \n\t"\
+        "movdqa %%"s3", %%"t3" \n\t"
+
+static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
+    x86_reg i = width;
+    /* In-place vertical lifting over rows b0..b5: scalar code peels the tail, the asm then does 32 elements per pass. */
+    while(i & 0x1F)
+    {
+        i--;
+        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
+        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
+        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
+        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
+    }
+    i+=i; /* element count -> byte offset; the asm adds REG_d straight to the row pointers */
+    /* Operands: %0 = i (REG_d), %1..%6 = b0..b5.  Register use follows ff_snow_vertical_compose97i_mmx. */
+         __asm__ volatile (
+        "jmp 2f                                      \n\t"
+        "1:                                          \n\t"
+        snow_vertical_compose_sse2_load("%4","xmm1","xmm3","xmm5","xmm7")
+        snow_vertical_compose_sse2_add("%6","xmm1","xmm3","xmm5","xmm7")
+        /* b3+b5 is kept in xmm1/3/5/7 because xmm0 and xmm2 are turned into constants next: */
+        /* xmm0 = -1 and xmm2 = -3<<13 in every word, so the pmulhw below yields (-3*(b3+b5-1))>>3. */
+        "pcmpeqw    %%xmm0, %%xmm0                   \n\t"
+        "pcmpeqw    %%xmm2, %%xmm2                   \n\t"
+        "paddw      %%xmm2, %%xmm2                   \n\t"
+        "paddw      %%xmm0, %%xmm2                   \n\t"
+        "psllw         $13, %%xmm2                   \n\t"
+        snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7")
+        snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7")
+        snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7")
+        snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7")
+        snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7")
+        snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6")
+        /* b2 step: 0x8000 (xmm7) is added to both inputs so the unsigned pavgw can average signed words. */
+        "pcmpeqw %%xmm7, %%xmm7                      \n\t"
+        "pcmpeqw %%xmm5, %%xmm5                      \n\t"
+        "psllw $15, %%xmm7                           \n\t"
+        "psrlw $13, %%xmm5                           \n\t"
+        "paddw %%xmm7, %%xmm5                        \n\t"
+        snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
+        "movdqa   (%2,%%"REG_d"), %%xmm1      \n\t" /* full 16-byte b1 slot paired with xmm0 */
+        "movdqa 16(%2,%%"REG_d"), %%xmm3      \n\t" /* ... paired with xmm2 */
+        "paddw %%xmm7, %%xmm1                        \n\t"
+        "paddw %%xmm7, %%xmm3                        \n\t"
+        "pavgw %%xmm1, %%xmm0                        \n\t"
+        "pavgw %%xmm3, %%xmm2                        \n\t"
+        "movdqa 32(%2,%%"REG_d"), %%xmm1      \n\t" /* ... paired with xmm4 */
+        "movdqa 48(%2,%%"REG_d"), %%xmm3      \n\t" /* ... paired with xmm6 */
+        "paddw %%xmm7, %%xmm1                        \n\t"
+        "paddw %%xmm7, %%xmm3                        \n\t"
+        "pavgw %%xmm1, %%xmm4                        \n\t"
+        "pavgw %%xmm3, %%xmm6                        \n\t"
+        snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
+
+        snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
+        snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6")
+        /* Loop control: step REG_d down by 64 bytes (32 elements) and repeat while it stays >= 0. */
+        "2:                                          \n\t"
+        "sub $64, %%"REG_d"                          \n\t"
+        "jge 1b                                      \n\t"
+        :"+d"(i)
+        :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5) : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", ) "memory"); /* rows are written through plain "r" pointers */
+}
+
+#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
+        ""op" ("r",%%"REG_d"), %%"t0"   \n\t"\
+        ""op" 8("r",%%"REG_d"), %%"t1"  \n\t"\
+        ""op" 16("r",%%"REG_d"), %%"t2" \n\t"\
+        ""op" 24("r",%%"REG_d"), %%"t3" \n\t"
+/* The macro above applies op to the 8-byte slots at r+REG_d+{0,8,16,24}, one per register; _load below uses movq. */
+#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
+        snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)
+/* Add (paddw) the same four 8-byte slots of r into t0..t3. */
+#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
+        snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)
+/* Store s0..s3 with movq to the four 8-byte slots at w+REG_d. */
+#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
+        "movq %%"s0", ("w",%%"REG_d")   \n\t"\
+        "movq %%"s1", 8("w",%%"REG_d")  \n\t"\
+        "movq %%"s2", 16("w",%%"REG_d") \n\t"\
+        "movq %%"s3", 24("w",%%"REG_d") \n\t"
+/* Register copy with movq: t0..t3 = s0..s3. */
+#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
+        "movq %%"s0", %%"t0" \n\t"\
+        "movq %%"s1", %%"t1" \n\t"\
+        "movq %%"s2", %%"t2" \n\t"\
+        "movq %%"s3", %%"t3" \n\t"
+
+
+static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
+    x86_reg i = width; /* in-place vertical lifting over rows b0..b5; i counts the elements still to process */
+    while(i & 15) /* scalar tail until the remaining count is a multiple of 16 */
+    {
+        i--;
+        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
+        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
+        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
+        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
+    }
+    i+=i; /* element count -> byte offset; the asm adds REG_d straight to the row pointers */
+    __asm__ volatile(
+        "jmp 2f                                      \n\t"
+        "1:                                          \n\t"
+        /* Operands: %0 = i (REG_d), %1..%6 = b0..b5.  b4 step: mm1/3/5/7 = b3+b5, mm0 = -1, mm2 = -3<<13 per word. */
+        snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
+        snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
+        "pcmpeqw    %%mm0, %%mm0                     \n\t"
+        "pcmpeqw    %%mm2, %%mm2                     \n\t"
+        "paddw      %%mm2, %%mm2                     \n\t"
+        "paddw      %%mm0, %%mm2                     \n\t"
+        "psllw        $13, %%mm2                     \n\t"
+        snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7")
+        snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7")
+        snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7")
+        snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7")
+        snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7")
+        snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6")
+        "pcmpeqw %%mm7, %%mm7                        \n\t" /* b2 step: mm7 = 0x8000 bias so the unsigned pavgw can average signed words */
+        "pcmpeqw %%mm5, %%mm5                        \n\t"
+        "psllw $15, %%mm7                            \n\t"
+        "psrlw $13, %%mm5                            \n\t"
+        "paddw %%mm7, %%mm5                          \n\t"
+        snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
+        "movq   (%2,%%"REG_d"), %%mm1         \n\t"
+        "movq  8(%2,%%"REG_d"), %%mm3         \n\t"
+        "paddw %%mm7, %%mm1                          \n\t"
+        "paddw %%mm7, %%mm3                          \n\t"
+        "pavgw %%mm1, %%mm0                          \n\t"
+        "pavgw %%mm3, %%mm2                          \n\t"
+        "movq 16(%2,%%"REG_d"), %%mm1         \n\t"
+        "movq 24(%2,%%"REG_d"), %%mm3         \n\t"
+        "paddw %%mm7, %%mm1                          \n\t"
+        "paddw %%mm7, %%mm3                          \n\t"
+        "pavgw %%mm1, %%mm4                          \n\t"
+        "pavgw %%mm3, %%mm6                          \n\t"
+        snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
+
+        snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
+        snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6")
+        /* Loop control: step REG_d down by 32 bytes (16 elements) and repeat while it stays >= 0. */
+        "2:                                          \n\t"
+        "sub $32, %%"REG_d"                          \n\t"
+        "jge 1b                                      \n\t"
+        :"+d"(i)
+        :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5) : "memory"); /* rows are written through plain "r" pointers */
+}
+#endif //HAVE_7REGS
+
+#if HAVE_6REGS
+#define snow_inner_add_yblock_sse2_header \
+    IDWTELEM * * dst_array = sb->line + src_y;\
+    x86_reg tmp;\
+    __asm__ volatile(\
+             "mov  %7, %%"REG_c"             \n\t" /* REG_c = src_stride */\
+             "mov  %6, %2                    \n\t" /* tmp = b_h, the row counter */\
+             "mov  %4, %%"REG_S"             \n\t" /* REG_S = obmc */\
+             "pxor %%xmm7, %%xmm7            \n\t" /* 0 */\
+             "pcmpeqd %%xmm3, %%xmm3         \n\t"\
+             "psllw $15, %%xmm3              \n\t"\
+             "psrlw $12, %%xmm3              \n\t" /* FRAC_BITS >> 1 */\
+             "1:                             \n\t"\
+             "mov %1, %%"REG_D"              \n\t" /* REG_D = dst_array[0] + (src_x<<1) */\
+             "mov (%%"REG_D"), %%"REG_D"     \n\t"\
+             "add %3, %%"REG_D"              \n\t"
+/* start_8: REG_d = block[ptr_offset]; 8 bytes of that row and of the row REG_c below it, widened to words, times obmc bytes at s_offset / s_offset+16. */
+#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
+             "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
+             "movq (%%"REG_d"), %%"out_reg1" \n\t"\
+             "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\
+             "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
+             "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
+             "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
+             "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\
+             "punpcklbw %%xmm7, %%xmm0       \n\t"\
+             "punpcklbw %%xmm7, %%xmm4       \n\t"\
+             "pmullw %%xmm0, %%"out_reg1"    \n\t"\
+             "pmullw %%xmm4, %%"out_reg2"    \n\t"
+/* start_16: as start_8, but both halves come from the same row (bytes 0..7 and 8..15) and obmc bytes at s_offset / s_offset+8. */
+#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
+             "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
+             "movq (%%"REG_d"), %%"out_reg1" \n\t"\
+             "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\
+             "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
+             "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
+             "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
+             "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\
+             "punpcklbw %%xmm7, %%xmm0       \n\t"\
+             "punpcklbw %%xmm7, %%xmm4       \n\t"\
+             "pmullw %%xmm0, %%"out_reg1"    \n\t"\
+             "pmullw %%xmm4, %%"out_reg2"    \n\t"
+/* accum_8: another start_8 product in xmm2/xmm6, added to xmm1/xmm5 with unsigned saturation (paddusw). */
+#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
+             snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
+             "paddusw %%xmm2, %%xmm1         \n\t"\
+             "paddusw %%xmm6, %%xmm5         \n\t"
+/* accum_16: same accumulation built on start_16. */
+#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
+             snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
+             "paddusw %%xmm2, %%xmm1         \n\t"\
+             "paddusw %%xmm6, %%xmm5         \n\t"
+/* end_common1: obmc += 32, dst8 (%0) += REG_c, and the four pointers stored in block[0..3] are advanced by REG_c in memory. */
+#define snow_inner_add_yblock_sse2_end_common1\
+             "add $32, %%"REG_S"             \n\t"\
+             "add %%"REG_c", %0              \n\t"\
+             "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
+             "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
+             "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
+             "add %%"REG_c", (%%"REG_a")     \n\t"
+
+#define snow_inner_add_yblock_sse2_end_common2\
+             "jnz 1b                         \n\t" /* loop while the row counter updated just before is non-zero */\
+             :"+m"(dst8),"+m"(dst_array),"=&r"(tmp) /* %0, %1, %2 */\
+             :\
+             "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride): /* %3 .. %7 */\
+             XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", )\
+             "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"","memory"); /* "memory": the asm stores to dst8 and rewrites block[] via registers */
+
+#define snow_inner_add_yblock_sse2_end_8\
+             "sal $1, %%"REG_c"              \n\t" /* two rows were produced: advance by 2*src_stride */\
+             "add"OPSIZE" $"PTR_SIZE"*2, %1  \n\t" /* dst_array += 2 entries */\
+             snow_inner_add_yblock_sse2_end_common1\
+             "sar $1, %%"REG_c"              \n\t" /* restore REG_c = src_stride */\
+             "sub $2, %2                     \n\t" /* row counter -= 2; sets the flags tested by jnz */\
+             snow_inner_add_yblock_sse2_end_common2
+/* end_16: one row per iteration, so dst_array advances by one entry and the row counter by one. */
+#define snow_inner_add_yblock_sse2_end_16\
+             "add"OPSIZE" $"PTR_SIZE"*1, %1  \n\t"\
+             snow_inner_add_yblock_sse2_end_common1\
+             "dec %2                         \n\t"\
+             snow_inner_add_yblock_sse2_end_common2
+
+static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+                      int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_sse2_header
+snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
+snow_inner_add_yblock_sse2_accum_8("2", "8")
+snow_inner_add_yblock_sse2_accum_8("1", "128")
+snow_inner_add_yblock_sse2_accum_8("0", "136")
+             /* xmm1 / xmm5 = weighted sums for two consecutive rows; REG_d = dst8, REG_D still points into the first line. */
+             "mov %0, %%"REG_d"              \n\t"
+             "movdqa (%%"REG_D"), %%xmm0     \n\t"
+             "movdqa %%xmm1, %%xmm2          \n\t"
+             /* Zero-extend the eight word sums to dwords and add them, plus xmm3, to the 32 bytes read at REG_D. */
+             "punpckhwd %%xmm7, %%xmm1       \n\t"
+             "punpcklwd %%xmm7, %%xmm2       \n\t"
+             "paddd %%xmm2, %%xmm0           \n\t"
+             "movdqa 16(%%"REG_D"), %%xmm2   \n\t"
+             "paddd %%xmm1, %%xmm2           \n\t"
+             "paddd %%xmm3, %%xmm0           \n\t"
+             "paddd %%xmm3, %%xmm2           \n\t"
+             /* Second row: REG_D = dst_array[1] + (src_x<<1). */
+             "mov %1, %%"REG_D"              \n\t"
+             "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t"
+             "add %3, %%"REG_D"              \n\t"
+             /* NOTE(review): this path works on dwords (paddd, psrad $8) while the 16-wide kernel uses words (paddw, psraw $4) - confirm the element size assumed for the destination lines. */
+             "movdqa (%%"REG_D"), %%xmm4     \n\t"
+             "movdqa %%xmm5, %%xmm6          \n\t"
+             "punpckhwd %%xmm7, %%xmm5       \n\t"
+             "punpcklwd %%xmm7, %%xmm6       \n\t"
+             "paddd %%xmm6, %%xmm4           \n\t"
+             "movdqa 16(%%"REG_D"), %%xmm6   \n\t"
+             "paddd %%xmm5, %%xmm6           \n\t"
+             "paddd %%xmm3, %%xmm4           \n\t"
+             "paddd %%xmm3, %%xmm6           \n\t"
+             /* Shift right by 8, pack dwords -> words -> bytes with saturation, store 8 bytes at dst8. */
+             "psrad $8, %%xmm0               \n\t" /* FRAC_BITS. */
+             "psrad $8, %%xmm2               \n\t" /* FRAC_BITS. */
+             "packssdw %%xmm2, %%xmm0        \n\t"
+             "packuswb %%xmm7, %%xmm0        \n\t"
+             "movq %%xmm0, (%%"REG_d")       \n\t"
+             /* Same for the second row, stored at dst8 + REG_c (src_stride). */
+             "psrad $8, %%xmm4               \n\t" /* FRAC_BITS. */
+             "psrad $8, %%xmm6               \n\t" /* FRAC_BITS. */
+             "packssdw %%xmm6, %%xmm4        \n\t"
+             "packuswb %%xmm7, %%xmm4        \n\t"
+             "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t"
+snow_inner_add_yblock_sse2_end_8
+}
+
+static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+                      int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_sse2_header
+snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
+snow_inner_add_yblock_sse2_accum_16("2", "16")
+snow_inner_add_yblock_sse2_accum_16("1", "512")
+snow_inner_add_yblock_sse2_accum_16("0", "528")
+             /* xmm1 / xmm5 = weighted word sums for the 16 bytes of one row; REG_d = dst8, REG_D points into the destination line. */
+             "mov %0, %%"REG_d"              \n\t"
+             "psrlw $4, %%xmm1               \n\t"
+             "psrlw $4, %%xmm5               \n\t"
+             "paddw   (%%"REG_D"), %%xmm1    \n\t"
+             "paddw 16(%%"REG_D"), %%xmm5    \n\t"
+             "paddw %%xmm3, %%xmm1           \n\t"
+             "paddw %%xmm3, %%xmm5           \n\t"
+             "psraw $4, %%xmm1               \n\t" /* FRAC_BITS. */
+             "psraw $4, %%xmm5               \n\t" /* FRAC_BITS. */
+             "packuswb %%xmm5, %%xmm1        \n\t"
+             /* Unaligned 16-byte store of the packed bytes to dst8. */
+             "movdqu %%xmm1, (%%"REG_d")       \n\t"
+
+snow_inner_add_yblock_sse2_end_16
+}
+
+#define snow_inner_add_yblock_mmx_header \
+    IDWTELEM * * dst_array = sb->line + src_y;\
+    x86_reg tmp;\
+    __asm__ volatile(\
+             "mov  %7, %%"REG_c"             \n\t" /* REG_c = src_stride */\
+             "mov  %6, %2                    \n\t" /* tmp = b_h, the row counter */\
+             "mov  %4, %%"REG_S"             \n\t" /* REG_S = obmc */\
+             "pxor %%mm7, %%mm7              \n\t" /* 0 */\
+             "pcmpeqd %%mm3, %%mm3           \n\t"\
+             "psllw $15, %%mm3               \n\t"\
+             "psrlw $12, %%mm3               \n\t" /* FRAC_BITS >> 1 */\
+             "1:                             \n\t"\
+             "mov %1, %%"REG_D"              \n\t" /* REG_D = dst_array[0] + (src_x<<1) */\
+             "mov (%%"REG_D"), %%"REG_D"     \n\t"\
+             "add %3, %%"REG_D"              \n\t"
+/* start: REG_d = block[ptr_offset]; bytes d_offset..d_offset+7 of that row, widened to words, times obmc bytes s_offset..s_offset+7. */
+#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
+             "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
+             "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\
+             "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\
+             "punpcklbw %%mm7, %%"out_reg1" \n\t"\
+             "punpcklbw %%mm7, %%"out_reg2" \n\t"\
+             "movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\
+             "movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\
+             "punpcklbw %%mm7, %%mm0       \n\t"\
+             "punpcklbw %%mm7, %%mm4       \n\t"\
+             "pmullw %%mm0, %%"out_reg1"    \n\t"\
+             "pmullw %%mm4, %%"out_reg2"    \n\t"
+/* accum: another start product in mm2/mm6, added to mm1/mm5 with unsigned saturation (paddusw). */
+#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
+             snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
+             "paddusw %%mm2, %%mm1         \n\t"\
+             "paddusw %%mm6, %%mm5         \n\t"
+/* mix: (sums >> 4) + destination-line words at read_offset(REG_D) + mm3, >> 4, packed to 8 bytes stored at write_offset(dst8). */
+#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
+             "mov %0, %%"REG_d"              \n\t"\
+             "psrlw $4, %%mm1                \n\t"\
+             "psrlw $4, %%mm5                \n\t"\
+             "paddw "read_offset"(%%"REG_D"), %%mm1 \n\t"\
+             "paddw "read_offset"+8(%%"REG_D"), %%mm5 \n\t"\
+             "paddw %%mm3, %%mm1             \n\t"\
+             "paddw %%mm3, %%mm5             \n\t"\
+             "psraw $4, %%mm1                \n\t"\
+             "psraw $4, %%mm5                \n\t"\
+             "packuswb %%mm5, %%mm1          \n\t"\
+             "movq %%mm1, "write_offset"(%%"REG_d") \n\t"
+
+#define snow_inner_add_yblock_mmx_end(s_step)\
+             "add $"s_step", %%"REG_S"             \n\t" /* obmc pointer to the next weight row */\
+             "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t" /* block[3..0] += src_stride, updated in memory */\
+             "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
+             "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
+             "add %%"REG_c", (%%"REG_a")     \n\t"\
+             "add"OPSIZE " $"PTR_SIZE"*1, %1 \n\t" /* dst_array += 1 entry */\
+             "add %%"REG_c", %0              \n\t" /* dst8 += src_stride */\
+             "dec %2                         \n\t" /* row counter; jnz loops while it is non-zero */\
+             "jnz 1b                         \n\t"\
+             :"+m"(dst8),"+m"(dst_array),"=&r"(tmp) /* %0, %1, %2 */\
+             :\
+             "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride): /* %3 .. %7 */\
+             "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"","memory"); /* "memory": the asm stores to dst8 and rewrites block[] via registers */
+
+static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+                      int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_mmx_header /* opens the asm statement and the per-row loop; obmc_stride, b_w and add are not referenced */
+snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") /* block[3] row bytes 0..7 times obmc bytes 0..7 */
+snow_inner_add_yblock_mmx_accum("2", "8", "0") /* + block[2] times obmc bytes 8..15 */
+snow_inner_add_yblock_mmx_accum("1", "128", "0") /* + block[1] times obmc bytes 128..135 */
+snow_inner_add_yblock_mmx_accum("0", "136", "0") /* + block[0] times obmc bytes 136..143 */
+snow_inner_add_yblock_mmx_mix("0", "0") /* combine with the destination line and store 8 bytes at dst8 */
+snow_inner_add_yblock_mmx_end("16") /* next row: obmc += 16, pointers += src_stride; closes the asm statement */
+}
+
+static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+                      int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_mmx_header /* opens the asm statement and the per-row loop; obmc_stride, b_w and add are not referenced */
+snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") /* left half: row bytes 0..7 of block[3] times obmc bytes 0..7 */
+snow_inner_add_yblock_mmx_accum("2", "16", "0")
+snow_inner_add_yblock_mmx_accum("1", "512", "0")
+snow_inner_add_yblock_mmx_accum("0", "528", "0")
+snow_inner_add_yblock_mmx_mix("0", "0") /* destination-line bytes 0..15 in, dst8 bytes 0..7 out */
+/* Right half: row bytes 8..15, with every obmc offset moved 8 bytes further. */
+snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
+snow_inner_add_yblock_mmx_accum("2", "24", "8")
+snow_inner_add_yblock_mmx_accum("1", "520", "8")
+snow_inner_add_yblock_mmx_accum("0", "536", "8")
+snow_inner_add_yblock_mmx_mix("16", "8") /* destination-line bytes 16..31 in, dst8 bytes 8..15 out */
+snow_inner_add_yblock_mmx_end("32") /* next row: obmc += 32, pointers += src_stride; closes the asm statement */
+}
+
+static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+                           int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+    /* Dispatch on block geometry; shapes without a dedicated kernel go to ff_snow_inner_add_yblock. */
+    if (b_w == 16) {
+        inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+    } else if (b_w != 8 || obmc_stride != 16) {
+        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+    } else if (b_h & 1) { /* the SSE2 8-wide kernel handles two rows per pass, so odd heights use the MMX one */
+        inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+    } else {
+        inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+    }
+}
+
+static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+                          int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+    if (b_w != 16 && (b_w != 8 || obmc_stride != 16)) /* no dedicated MMX kernel for this geometry */
+        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+    else if (b_w == 16)
+        inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+    else /* b_w == 8 with obmc_stride == 16 */
+        inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+}
+#endif /* HAVE_6REGS */
+
+#endif /* HAVE_INLINE_ASM */
+
+av_cold void ff_dwt_init_x86(SnowDWTContext *c)
+{
+#if HAVE_INLINE_ASM
+    const int cpu_flags = av_get_cpu_flags();
+    /* Without MMX none of the x86 routines is installed. */
+    if (!(cpu_flags & AV_CPU_FLAG_MMX))
+        return;
+    /* The trailing "& 0" forces this test to 0, so the SSE2 routines are never selected. */
+    if (cpu_flags & AV_CPU_FLAG_SSE2 & 0) {
+        c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
+#if HAVE_7REGS
+        c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
+#endif
+#if HAVE_6REGS
+        c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
+#endif
+        return;
+    }
+    if (cpu_flags & AV_CPU_FLAG_MMXEXT) {
+        c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
+#if HAVE_7REGS
+        c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
+#endif
+    }
+#if HAVE_6REGS
+    c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
+#endif
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/libavcodec/x86/svq1enc.asm b/libavcodec/x86/svq1enc.asm
new file mode 100644
index 0000000..a876328
--- /dev/null
+++ b/libavcodec/x86/svq1enc.asm
@@ -0,0 +1,61 @@
+;******************************************************************************
+;* SIMD-optimized SVQ1 encoder functions
+;* Copyright (c) 2007 Loren Merritt
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro SSD_INT8_VS_INT16 0
+cglobal ssd_int8_vs_int16, 3, 3, 3, pix1, pix2, size
+    pxor m0, m0                                  ; m0 accumulates the dword sums of squared differences
+.loop:
+    sub       sizeq, 8                           ; walk backwards, 8 elements per iteration
+    movq      m1, [pix1q + sizeq]                ; 8 bytes from pix1
+    mova      m2, [pix2q + sizeq*2]              ; words from pix2 (2 bytes per element)
+%if mmsize == 8
+    movq      m3, [pix2q + sizeq*2 + mmsize]     ; second group of 4 words from pix2
+    punpckhbw m4, m1                             ; pix1 bytes 4..7 into the high byte of each word
+    punpcklbw m1, m1                             ; pix1 bytes 0..3 into the high byte of each word
+    psraw     m4, 8                              ; arithmetic shift: sign-extended bytes as words
+    psraw     m1, 8
+    psubw     m3, m4                             ; pix2 - pix1
+    psubw     m2, m1
+    pmaddwd   m3, m3                             ; square and add adjacent pairs into dwords
+    pmaddwd   m2, m2
+    paddd     m0, m3
+    paddd     m0, m2
+%else
+    punpcklbw m1, m1                             ; all 8 pix1 bytes into the high byte of each word
+    psraw     m1, 8                              ; sign-extended bytes as words
+    psubw     m2, m1                             ; pix2 - pix1
+    pmaddwd   m2, m2                             ; square and add adjacent pairs into dwords
+    paddd     m0, m2
+%endif
+    jg .loop                                     ; no instruction after the sub above writes flags: loop while sizeq > 0
+    HADDD     m0, m1                             ; presumably the horizontal dword add from x86util.asm - confirm
+    movd     eax, m0                             ; result returned in eax
+    RET
+%endmacro
+; The macro body is assembled twice below: once after INIT_MMX mmx and once after INIT_XMM sse2.
+INIT_MMX mmx
+SSD_INT8_VS_INT16
+INIT_XMM sse2
+SSD_INT8_VS_INT16
diff --git a/libavcodec/x86/svq1enc.c b/libavcodec/x86/svq1enc.c
deleted file mode 100644
index 02b0a84..0000000
--- a/libavcodec/x86/svq1enc.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/svq1enc.h"
-
-#if HAVE_INLINE_ASM
-
-static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2,
-                                 int size)
-{
-    int sum;
-    x86_reg i = size;
-
-    __asm__ volatile (
-        "pxor %%mm4, %%mm4 \n"
-        "1: \n"
-        "sub $8, %0 \n"
-        "movq (%2, %0), %%mm2 \n"
-        "movq (%3, %0, 2), %%mm0 \n"
-        "movq 8(%3, %0, 2), %%mm1 \n"
-        "punpckhbw %%mm2, %%mm3 \n"
-        "punpcklbw %%mm2, %%mm2 \n"
-        "psraw $8, %%mm3 \n"
-        "psraw $8, %%mm2 \n"
-        "psubw %%mm3, %%mm1 \n"
-        "psubw %%mm2, %%mm0 \n"
-        "pmaddwd %%mm1, %%mm1 \n"
-        "pmaddwd %%mm0, %%mm0 \n"
-        "paddd %%mm1, %%mm4 \n"
-        "paddd %%mm0, %%mm4 \n"
-        "jg 1b \n"
-        "movq %%mm4, %%mm3 \n"
-        "psrlq $32, %%mm3 \n"
-        "paddd %%mm3, %%mm4 \n"
-        "movd %%mm4, %1 \n"
-        : "+r" (i), "=r" (sum)
-        : "r" (pix1), "r" (pix2));
-
-    return sum;
-}
-
-#endif /* HAVE_INLINE_ASM */
-
-av_cold void ff_svq1enc_init_x86(SVQ1EncContext *c)
-{
-#if HAVE_INLINE_ASM
-    int cpu_flags = av_get_cpu_flags();
-
-    if (INLINE_MMX(cpu_flags)) {
-        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
-    }
-#endif /* HAVE_INLINE_ASM */
-}
diff --git a/libavcodec/x86/svq1enc_init.c b/libavcodec/x86/svq1enc_init.c
new file mode 100644
index 0000000..40b4b0e
--- /dev/null
+++ b/libavcodec/x86/svq1enc_init.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2007 Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/svq1enc.h"
+
+int ff_ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2,
+                             intptr_t size);
+int ff_ssd_int8_vs_int16_sse2(const int8_t *pix1, const int16_t *pix2,
+                              intptr_t size);
+
+av_cold void ff_svq1enc_init_x86(SVQ1EncContext *c)
+{   /* Point c->ssd_int8_vs_int16 at an external x86 routine picked from the CPU flags. */
+    int cpu_flags = av_get_cpu_flags(); /* if no test below matches, c is left untouched */
+
+    if (EXTERNAL_MMX(cpu_flags)) {
+        c->ssd_int8_vs_int16 = ff_ssd_int8_vs_int16_mmx;
+    }
+    if (EXTERNAL_SSE2(cpu_flags)) { /* tested after MMX, so SSE2 replaces it when both match */
+        c->ssd_int8_vs_int16 = ff_ssd_int8_vs_int16_sse2;
+    }
+}
diff --git a/libavcodec/x86/synth_filter.asm b/libavcodec/x86/synth_filter.asm
new file mode 100644
index 0000000..bc1a48f
--- /dev/null
+++ b/libavcodec/x86/synth_filter.asm
@@ -0,0 +1,246 @@
+;******************************************************************************
+;* SSE-optimized functions for the DCA decoder
+;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro SETZERO 1
+%if cpuflag(sse2) && notcpuflag(avx)
+    pxor          %1, %1
+%else
+    xorps         %1, %1, %1
+%endif
+%endmacro
+
+%macro SHUF 3
+%if cpuflag(avx)
+    mova          %3, [%2 - 16]
+    vperm2f128    %1, %3, %3, 1
+    vshufps       %1, %1, %1, q0123
+%elif cpuflag(sse2)
+    pshufd        %1, [%2], q0123
+%else
+    mova          %1, [%2]
+    shufps        %1, %1, q0123
+%endif
+%endmacro
+
+%macro INNER_LOOP   1
+    ; reading backwards:  ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
+    ;~ a += window[i + j]      * (-synth_buf[15 - i + j])
+    ;~ b += window[i + j + 16] * (synth_buf[i + j])
+    SHUF          m5,  ptr2 + j + (15 - 3) * 4, m6
+    mova          m6, [ptr1 + j]
+%if ARCH_X86_64
+    SHUF         m11,  ptr2 + j + (15 - 3) * 4 - mmsize, m12
+    mova         m12, [ptr1 + j + mmsize]
+%endif
+%if cpuflag(fma3)
+    fmaddps       m2, m6,  [win + %1 + j + 16 * 4], m2
+    fnmaddps      m1, m5,  [win + %1 + j], m1
+%if ARCH_X86_64
+    fmaddps       m8, m12, [win + %1 + j + mmsize + 16 * 4], m8
+    fnmaddps      m7, m11, [win + %1 + j + mmsize], m7
+%endif
+%else ; non-FMA
+    mulps         m6, m6,  [win + %1 + j + 16 * 4]
+    mulps         m5, m5,  [win + %1 + j]
+%if ARCH_X86_64
+    mulps        m12, m12, [win + %1 + j + mmsize + 16 * 4]
+    mulps        m11, m11, [win + %1 + j + mmsize]
+%endif
+    addps         m2, m2, m6
+    subps         m1, m1, m5
+%if ARCH_X86_64
+    addps         m8, m8, m12
+    subps         m7, m7, m11
+%endif
+%endif ; cpuflag(fma3)
+    ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
+    ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
+    SHUF          m6,  ptr2 + j + (31 - 3) * 4, m5
+    mova          m5, [ptr1 + j + 16 * 4]
+%if ARCH_X86_64
+    SHUF         m12,  ptr2 + j + (31 - 3) * 4 - mmsize, m11
+    mova         m11, [ptr1 + j + mmsize + 16 * 4]
+%endif
+%if cpuflag(fma3)
+    fmaddps       m3, m5,  [win + %1 + j + 32 * 4], m3
+    fmaddps       m4, m6,  [win + %1 + j + 48 * 4], m4
+%if ARCH_X86_64
+    fmaddps       m9, m11, [win + %1 + j + mmsize + 32 * 4], m9
+    fmaddps      m10, m12, [win + %1 + j + mmsize + 48 * 4], m10
+%endif
+%else ; non-FMA
+    mulps         m5, m5,  [win + %1 + j + 32 * 4]
+    mulps         m6, m6,  [win + %1 + j + 48 * 4]
+%if ARCH_X86_64
+    mulps        m11, m11, [win + %1 + j + mmsize + 32 * 4]
+    mulps        m12, m12, [win + %1 + j + mmsize + 48 * 4]
+%endif
+    addps         m3, m3, m5
+    addps         m4, m4, m6
+%if ARCH_X86_64
+    addps         m9, m9, m11
+    addps        m10, m10, m12
+%endif
+%endif ; cpuflag(fma3)
+    sub            j, 64 * 4
+%endmacro
+
+; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32],
+;                                  const float window[512], float out[32],
+;                                  intptr_t offset, float scale)
+%macro SYNTH_FILTER 0
+cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
+                              synth_buf, synth_buf2, window, out, off, scale
+%define scale m0
+%if ARCH_X86_32 || WIN64
+%if cpuflag(sse2) && notcpuflag(avx)
+    movd       scale, scalem
+    SPLATD        m0
+%else
+    VBROADCASTSS  m0, scalem
+%endif
+; Make sure offset is in a register and not on the stack
+%define OFFQ  r4q
+%else
+    SPLATD      xmm0
+%if cpuflag(avx)
+    vinsertf128   m0, m0, xmm0, 1
+%endif
+%define OFFQ  offq
+%endif
+    ; prepare inner counter limit 1
+    mov          r5q, 480
+    sub          r5q, offmp
+    and          r5q, -64
+    shl          r5q, 2
+%if ARCH_X86_32 || notcpuflag(avx)
+    mov         OFFQ, r5q
+%define i        r5q
+    mov            i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize  ; main loop counter
+%else
+%define i 0
+%define OFFQ  r5q
+%endif
+
+%define buf2     synth_buf2q
+%if ARCH_X86_32
+    mov         buf2, synth_buf2mp
+%endif
+.mainloop:
+    ; m1 = a  m2 = b  m3 = c  m4 = d
+    SETZERO       m3
+    SETZERO       m4
+    mova          m1, [buf2 + i]
+    mova          m2, [buf2 + i + 16 * 4]
+%if ARCH_X86_32
+%define ptr1     r0q
+%define ptr2     r1q
+%define win      r2q
+%define j        r3q
+    mov          win, windowm
+    mov         ptr1, synth_bufm
+%if ARCH_X86_32 || notcpuflag(avx)
+    add          win, i
+    add         ptr1, i
+%endif
+%else ; ARCH_X86_64
+%define ptr1     r6q
+%define ptr2     r7q ; must be loaded
+%define win      r8q
+%define j        r9q
+    SETZERO       m9
+    SETZERO      m10
+    mova          m7, [buf2 + i + mmsize]
+    mova          m8, [buf2 + i + mmsize + 16 * 4]
+    lea          win, [windowq + i]
+    lea         ptr1, [synth_bufq + i]
+%endif
+    mov         ptr2, synth_bufmp
+    ; prepare the inner loop counter
+    mov            j, OFFQ
+%if ARCH_X86_32 || notcpuflag(avx)
+    sub         ptr2, i
+%endif
+.loop1:
+    INNER_LOOP  0
+    jge       .loop1
+
+    mov            j, 448 * 4
+    sub            j, OFFQ
+    jz          .end
+    sub         ptr1, j
+    sub         ptr2, j
+    add          win, OFFQ ; now at j-64, so define OFFSET
+    sub            j, 64 * 4
+.loop2:
+    INNER_LOOP  64 * 4
+    jge       .loop2
+
+.end:
+%if ARCH_X86_32
+    mov         buf2, synth_buf2m ; buf2 shares r1 with ptr2, so reload it; needed for next iteration anyway
+    mov         outq, outmp       ; out shares r3 with j, which will be set again during the next iteration
+%endif
+    ;~ out[i]      = a * scale;
+    ;~ out[i + 16] = b * scale;
+    mulps         m1, m1, scale
+    mulps         m2, m2, scale
+%if ARCH_X86_64
+    mulps         m7, m7, scale
+    mulps         m8, m8, scale
+%endif
+    ;~ synth_buf2[i]      = c;
+    ;~ synth_buf2[i + 16] = d;
+    mova   [buf2 + i +  0 * 4], m3
+    mova   [buf2 + i + 16 * 4], m4
+%if ARCH_X86_64
+    mova   [buf2 + i +  0 * 4 + mmsize], m9
+    mova   [buf2 + i + 16 * 4 + mmsize], m10
+%endif
+    ;~ out[i]      = a;
+    ;~ out[i + 16] = b;
+    mova   [outq + i +  0 * 4], m1
+    mova   [outq + i + 16 * 4], m2
+%if ARCH_X86_64
+    mova   [outq + i +  0 * 4 + mmsize], m7
+    mova   [outq + i + 16 * 4 + mmsize], m8
+%endif
+%if ARCH_X86_32 || notcpuflag(avx)
+    sub            i, (ARCH_X86_64 + 1) * mmsize
+    jge    .mainloop
+%endif
+    RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_XMM sse
+SYNTH_FILTER
+%endif
+INIT_XMM sse2
+SYNTH_FILTER
+INIT_YMM avx
+SYNTH_FILTER
+INIT_YMM fma3
+SYNTH_FILTER
diff --git a/libavcodec/x86/synth_filter_init.c b/libavcodec/x86/synth_filter_init.c
new file mode 100644
index 0000000..9ef00cd
--- /dev/null
+++ b/libavcodec/x86/synth_filter_init.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/synth_filter.h"
+
+#define SYNTH_FILTER_FUNC(opt)                                                 \
+void ff_synth_filter_inner_##opt(float *synth_buf_ptr, float synth_buf2[32],   \
+                                 const float window[512],                      \
+                                 float out[32], intptr_t offset, float scale); \
+static void synth_filter_##opt(FFTContext *imdct,                              \
+                               float *synth_buf_ptr, int *synth_buf_offset,    \
+                               float synth_buf2[32], const float window[512],  \
+                               float out[32], const float in[32], float scale) \
+{                                                                              \
+    float *synth_buf= synth_buf_ptr + *synth_buf_offset;                       \
+                                                                               \
+    imdct->imdct_half(imdct, synth_buf, in);                                   \
+                                                                               \
+    ff_synth_filter_inner_##opt(synth_buf, synth_buf2, window,                 \
+                                out, *synth_buf_offset, scale);                \
+                                                                               \
+    *synth_buf_offset = (*synth_buf_offset - 32) & 511;                        \
+}                                                                              \
+
+#if HAVE_YASM
+#if ARCH_X86_32
+SYNTH_FILTER_FUNC(sse)
+#endif
+SYNTH_FILTER_FUNC(sse2)
+SYNTH_FILTER_FUNC(avx)
+SYNTH_FILTER_FUNC(fma3)
+#endif /* HAVE_YASM */
+
+av_cold void ff_synth_filter_init_x86(SynthFilterContext *s)
+{   /* Set s->synth_filter_float to the wrapper matching the runtime CPU flags. */
+#if HAVE_YASM
+    int cpu_flags = av_get_cpu_flags();
+
+#if ARCH_X86_32
+    if (EXTERNAL_SSE(cpu_flags)) { /* synth_filter_sse is only instantiated under ARCH_X86_32 */
+        s->synth_filter_float = synth_filter_sse;
+    }
+#endif
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        s->synth_filter_float = synth_filter_sse2;
+    }
+    if (EXTERNAL_AVX_FAST(cpu_flags)) {
+        s->synth_filter_float = synth_filter_avx;
+    }
+    if (EXTERNAL_FMA3_FAST(cpu_flags)) { /* tests run in sequence: the last one that matches wins */
+        s->synth_filter_float = synth_filter_fma3;
+    }
+#endif /* HAVE_YASM */
+}
diff --git a/libavcodec/x86/takdsp.asm b/libavcodec/x86/takdsp.asm
new file mode 100644
index 0000000..5f3ded3
--- /dev/null
+++ b/libavcodec/x86/takdsp.asm
@@ -0,0 +1,116 @@
+;******************************************************************************
+;* TAK DSP SIMD optimizations
+;*
+;* Copyright (C) 2015 Paul B Mahol
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pd_128: times 4 dd 128
+
+SECTION .text
+
+INIT_XMM sse2
+cglobal tak_decorrelate_ls, 3, 3, 2, p1, p2, length ; p2[i] += p1[i] on 32-bit elements
+    shl                     lengthd, 2               ; element count -> byte count
+    add                         p1q, lengthq         ; move both pointers to the end of the data
+    add                         p2q, lengthq
+    neg                     lengthq                  ; negative byte offset that counts up to 0
+.loop:
+    mova                         m0, [p1q+lengthq+mmsize*0]
+    mova                         m1, [p1q+lengthq+mmsize*1]
+    paddd                        m0, [p2q+lengthq+mmsize*0]
+    paddd                        m1, [p2q+lengthq+mmsize*1]
+    mova     [p2q+lengthq+mmsize*0], m0
+    mova     [p2q+lengthq+mmsize*1], m1
+    add                     lengthq, mmsize*2        ; two vectors per pass; NOTE(review): presumably length is a multiple of that - confirm
+    jl .loop                                         ; bottom-tested loop: the body always runs at least once
+    REP_RET
+
+cglobal tak_decorrelate_sr, 3, 3, 2, p1, p2, length ; p1[i] = p2[i] - p1[i] on 32-bit elements
+    shl                     lengthd, 2               ; element count -> byte count
+    add                         p1q, lengthq         ; move both pointers to the end of the data
+    add                         p2q, lengthq
+    neg                     lengthq                  ; negative byte offset that counts up to 0
+
+.loop:
+    mova                         m0, [p2q+lengthq+mmsize*0]
+    mova                         m1, [p2q+lengthq+mmsize*1]
+    psubd                        m0, [p1q+lengthq+mmsize*0] ; p2 - p1
+    psubd                        m1, [p1q+lengthq+mmsize*1]
+    mova     [p1q+lengthq+mmsize*0], m0                     ; result goes back to p1; p2 is only read
+    mova     [p1q+lengthq+mmsize*1], m1
+    add                     lengthq, mmsize*2        ; two vectors per pass; NOTE(review): presumably length is a multiple of that - confirm
+    jl .loop                                         ; bottom-tested loop: the body always runs at least once
+    REP_RET
+
+cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length ; a = p1[i] - (p2[i] >> 1); p1[i] = a; p2[i] += a
+    shl                     lengthd, 2               ; element count -> byte count
+    add                         p1q, lengthq         ; move both pointers to the end of the data
+    add                         p2q, lengthq
+    neg                     lengthq                  ; negative byte offset that counts up to 0
+
+.loop:
+    mova                         m0, [p1q+lengthq]
+    mova                         m1, [p2q+lengthq]
+    mova                         m3, [p1q+lengthq+mmsize]
+    mova                         m4, [p2q+lengthq+mmsize]
+    mova                         m2, m1              ; work on copies so the original p2 values survive
+    mova                         m5, m4
+    psrad                        m2, 1               ; arithmetic shift: p2 >> 1
+    psrad                        m5, 1
+    psubd                        m0, m2              ; a = p1 - (p2 >> 1)
+    psubd                        m3, m5
+    paddd                        m1, m0              ; p2 + a
+    paddd                        m4, m3
+    mova              [p1q+lengthq], m0
+    mova              [p2q+lengthq], m1
+    mova       [p1q+lengthq+mmsize], m3
+    mova       [p2q+lengthq+mmsize], m4
+    add                     lengthq, mmsize*2        ; two vectors per pass; NOTE(review): presumably length is a multiple of that - confirm
+    jl .loop                                         ; bottom-tested loop: the body always runs at least once
+    REP_RET
+
+INIT_XMM sse4
+cglobal tak_decorrelate_sf, 3, 3, 5, p1, p2, length, dshift, dfactor ; p1[i] = ((((p2[i] >> dshift) * dfactor + 128) >> 8) << dshift) - p1[i]
+    shl             lengthd, 2                       ; element count -> byte count
+    add                 p1q, lengthq                 ; move both pointers to the end of the data
+    add                 p2q, lengthq
+    neg             lengthq                          ; negative byte offset that counts up to 0
+
+    movd                 m2, dshiftm                 ; shift count for the psrad/pslld below
+    movd                 m3, dfactorm
+    pshufd               m3, m3, 0                   ; broadcast dfactor to every 32-bit lane
+    mova                 m4, [pd_128]                ; 128 in every lane, added before the >> 8
+
+.loop:
+    mova                 m0, [p1q+lengthq]
+    mova                 m1, [p2q+lengthq]
+    psrad                m1, m2                      ; p2 >> dshift (arithmetic)
+    pmulld               m1, m3                      ; * dfactor
+    paddd                m1, m4                      ; + 128
+    psrad                m1, 8                       ; >> 8
+    pslld                m1, m2                      ; << dshift
+    psubd                m1, m0                      ; - p1
+    mova      [p1q+lengthq], m1                      ; result goes back to p1; p2 is only read
+    add             lengthq, mmsize                  ; one vector per pass, unlike the SSE2 routines in this file
+    jl .loop                                         ; bottom-tested loop: the body always runs at least once
+    REP_RET
diff --git a/libavcodec/x86/takdsp_init.c b/libavcodec/x86/takdsp_init.c
new file mode 100644
index 0000000..555d064
--- /dev/null
+++ b/libavcodec/x86/takdsp_init.c
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2015 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/takdsp.h"
+#include "libavutil/x86/cpu.h"
+#include "config.h"
+
+void ff_tak_decorrelate_ls_sse2(int32_t *p1, int32_t *p2, int length);
+void ff_tak_decorrelate_sr_sse2(int32_t *p1, int32_t *p2, int length);
+void ff_tak_decorrelate_sm_sse2(int32_t *p1, int32_t *p2, int length);
+void ff_tak_decorrelate_sf_sse4(int32_t *p1, int32_t *p2, int length, int dshift, int dfactor);
+
+av_cold void ff_takdsp_init_x86(TAKDSPContext *c)
+{   /* Set the decorrelate_* callbacks of c to external x86 routines allowed by the CPU flags. */
+#if HAVE_YASM
+    int cpu_flags = av_get_cpu_flags(); /* callbacks whose test fails are left untouched */
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->decorrelate_ls = ff_tak_decorrelate_ls_sse2;
+        c->decorrelate_sr = ff_tak_decorrelate_sr_sse2;
+        c->decorrelate_sm = ff_tak_decorrelate_sm_sse2;
+    }
+
+    if (EXTERNAL_SSE4(cpu_flags)) { /* decorrelate_sf has only an SSE4 routine declared in this file */
+        c->decorrelate_sf = ff_tak_decorrelate_sf_sse4;
+    }
+#endif
+}
diff --git a/libavcodec/x86/ttadsp.asm b/libavcodec/x86/ttadsp.asm
new file mode 100644
index 0000000..8f48949
--- /dev/null
+++ b/libavcodec/x86/ttadsp.asm
@@ -0,0 +1,119 @@
+;******************************************************************************
+;* TTA DSP SIMD optimizations
+;*
+;* Copyright (C) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pd_n0113: dd ~0, ~1, ~1, ~3
+pd_1224:  dd 1, 2, 2, 4
+
+SECTION .text
+
+%macro TTA_FILTER 2
+INIT_XMM %1
+cglobal ttafilter_process_dec, 5,5,%2, qm, dx, dl, error, in, shift, round
+    mova       m2, [qmq       ]
+    mova       m3, [qmq + 0x10]
+    mova       m4, [dxq       ]
+    mova       m5, [dxq + 0x10]
+
+    movd       m6, [errorq]         ; if (filter->error < 0) {
+    SPLATD     m6                   ;     for (int i = 0; i < 8; i++)
+    psignd     m0, m4, m6           ;         filter->qm[i] -= filter->dx[i];
+    psignd     m1, m5, m6           ; } else if (filter->error > 0) {
+    paddd      m2, m0               ;     for (int i = 0; i < 8; i++)
+    paddd      m3, m1               ;         filter->qm[i] += filter->dx[i];
+    mova       [qmq       ], m2     ; }
+    mova       [qmq + 0x10], m3     ;
+
+    mova       m0, [dlq       ]
+    mova       m1, [dlq + 0x10]
+
+%if cpuflag(sse4)
+    pmulld     m2, m0
+    pmulld     m3, m1
+%else
+    pshufd     m6, m0, 0xb1
+    pshufd     m7, m2, 0xb1
+    pmuludq    m6, m7
+    pshufd     m6, m6, 0xd8
+    pmuludq    m2, m0
+    pshufd     m2, m2, 0xd8
+    punpckldq  m2, m6
+
+    pshufd     m6, m1, 0xb1
+    pshufd     m7, m3, 0xb1
+    pmuludq    m6, m7
+    pshufd     m6, m6, 0xd8
+    pmuludq    m3, m1
+    pshufd     m3, m3, 0xd8
+    punpckldq  m3, m6
+%endif
+    ; Using horizontal add (phaddd) seems to be slower than shuffling stuff around
+    paddd      m2, m3               ; int sum = filter->round +
+                                    ;           filter->dl[0] * filter->qm[0] +
+    pshufd     m3, m2, 0xe          ;           filter->dl[1] * filter->qm[1] +
+    paddd      m2, m3               ;           filter->dl[2] * filter->qm[2] +
+                                    ;           filter->dl[3] * filter->qm[3] +
+    movd       m6, roundm           ;           filter->dl[4] * filter->qm[4] +
+    paddd      m6, m2               ;           filter->dl[5] * filter->qm[5] +
+    pshufd     m2, m2, 0x1          ;           filter->dl[6] * filter->qm[6] +
+    paddd      m6, m2               ;           filter->dl[7] * filter->qm[7];
+
+    palignr    m5, m4, 4            ; filter->dx[0] = filter->dx[1]; filter->dx[1] = filter->dx[2];
+                                    ; filter->dx[2] = filter->dx[3]; filter->dx[3] = filter->dx[4];
+
+    palignr    m2, m1, m0, 4        ; filter->dl[0] = filter->dl[1]; filter->dl[1] = filter->dl[2];
+                                    ; filter->dl[2] = filter->dl[3]; filter->dl[3] = filter->dl[4];
+
+    psrad      m4, m1, 30           ; filter->dx[4] = ((filter->dl[4] >> 30) | 1);
+    por        m4, [pd_1224 ]       ; filter->dx[5] = ((filter->dl[5] >> 30) | 2) & ~1;
+    pand       m4, [pd_n0113]       ; filter->dx[6] = ((filter->dl[6] >> 30) | 2) & ~1;
+                                    ; filter->dx[7] = ((filter->dl[7] >> 30) | 4) & ~3;
+
+    mova       [dlq       ], m2
+    mova       [dxq       ], m5
+    mova       [dxq + 0x10], m4
+    movd       m0, [inq]            ; filter->error = *in;
+    movd       [errorq], m0         ;
+
+    movd       m2, shiftm           ; *in += (sum >> filter->shift);
+    psrad      m6, m2               ;
+    paddd      m0, m6               ;
+    movd       [inq], m0            ;
+
+    psrldq     m1, 4                ;
+    pslldq     m0, 12               ; filter->dl[4] = -filter->dl[5];
+    pshufd     m0, m0, 0xf0         ; filter->dl[5] = -filter->dl[6];
+    psubd      m0, m1               ; filter->dl[6] = *in - filter->dl[7];
+    psrldq     m1, m0, 4            ; filter->dl[7] = *in;
+    pshufd     m1, m1, 0xf4         ; filter->dl[5] += filter->dl[6];
+    paddd      m0, m1               ; filter->dl[4] += filter->dl[5];
+    psrldq     m1, 4                ;
+    paddd      m0, m1               ;
+    mova       [dlq + 0x10], m0     ;
+    RET
+%endmacro
+
+TTA_FILTER ssse3, 8
+TTA_FILTER sse4,  7
diff --git a/libavcodec/x86/ttadsp_init.c b/libavcodec/x86/ttadsp_init.c
new file mode 100644
index 0000000..47dc87f
--- /dev/null
+++ b/libavcodec/x86/ttadsp_init.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2014 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/ttadsp.h"
+#include "libavutil/x86/cpu.h"
+#include "config.h"
+
+void ff_ttafilter_process_dec_ssse3(int32_t *qm, int32_t *dx, int32_t *dl,
+                                    int32_t *error, int32_t *in, int32_t shift,
+                                    int32_t round);
+void ff_ttafilter_process_dec_sse4(int32_t *qm, int32_t *dx, int32_t *dl,
+                                   int32_t *error, int32_t *in, int32_t shift,
+                                   int32_t round);
+
+av_cold void ff_ttadsp_init_x86(TTADSPContext *c)
+{   /* Set c->ttafilter_process_dec to an external x86 routine allowed by the CPU flags. */
+#if HAVE_YASM
+    int cpu_flags = av_get_cpu_flags(); /* if neither test matches, c is left untouched */
+
+    if (EXTERNAL_SSSE3(cpu_flags))
+        c->ttafilter_process_dec = ff_ttafilter_process_dec_ssse3;
+    if (EXTERNAL_SSE4(cpu_flags)) /* tested last, so it replaces the SSSE3 choice when both match */
+        c->ttafilter_process_dec = ff_ttafilter_process_dec_sse4;
+#endif
+}
diff --git a/libavcodec/x86/v210-init.c b/libavcodec/x86/v210-init.c
new file mode 100644
index 0000000..f579307
--- /dev/null
+++ b/libavcodec/x86/v210-init.c
@@ -0,0 +1,48 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavcodec/v210dec.h"
+
+extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+
+extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+
+av_cold void ff_v210_x86_init(V210DecContext *s)
+{   /* Set s->unpack_frame to an SSSE3 or AVX unpacker; s->aligned_input picks the aligned/unaligned family. */
+#if HAVE_YASM /* NOTE(review): config.h is not included directly by this file - confirm it arrives via another header */
+    int cpu_flags = av_get_cpu_flags(); /* NOTE(review): raw flag tests here, not EXTERNAL_*() as in sibling init files - confirm intended */
+
+    if (s->aligned_input) {
+        if (cpu_flags & AV_CPU_FLAG_SSSE3)
+            s->unpack_frame = ff_v210_planar_unpack_aligned_ssse3;
+
+        if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX) /* '&' binds tighter than '&&'; AVX replaces SSSE3 */
+            s->unpack_frame = ff_v210_planar_unpack_aligned_avx;
+    }
+    else {
+        if (cpu_flags & AV_CPU_FLAG_SSSE3)
+            s->unpack_frame = ff_v210_planar_unpack_unaligned_ssse3;
+
+        if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX) /* '&' binds tighter than '&&'; AVX replaces SSSE3 */
+            s->unpack_frame = ff_v210_planar_unpack_unaligned_avx;
+    }
+#endif
+}
diff --git a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm
new file mode 100644
index 0000000..c24c765
--- /dev/null
+++ b/libavcodec/x86/v210.asm
@@ -0,0 +1,90 @@
+;******************************************************************************
+;* V210 SIMD unpack
+;* Copyright (c) 2011 Loren Merritt <lorenm@u.washington.edu>
+;* Copyright (c) 2011 Kieran Kunhya <kieran@kunhya.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+v210_mask: times 4 dd 0x3ff
+v210_mult: dw 64,4,64,4,64,4,64,4
+v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1
+v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1
+
+SECTION .text
+
+%macro v210_planar_unpack 1
+
+; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
+cglobal v210_planar_unpack_%1, 5, 5, 7
+    movsxdifnidn r4, r4d
+    lea    r1, [r1+2*r4]
+    add    r2, r4
+    add    r3, r4
+    neg    r4
+
+    mova   m3, [v210_mult]
+    mova   m4, [v210_mask]
+    mova   m5, [v210_luma_shuf]
+    mova   m6, [v210_chroma_shuf]
+.loop:
+%ifidn %1, unaligned
+    movu   m0, [r0]
+%else
+    mova   m0, [r0]
+%endif
+
+    pmullw m1, m0, m3
+    psrld  m0, 10
+    psrlw  m1, 6  ; u0 v0 y1 y2 v1 u2 y4 y5
+    pand   m0, m4 ; y0 __ u1 __ y3 __ v2 __
+
+    shufps m2, m1, m0, 0x8d ; y1 y2 y4 y5 y0 __ y3 __
+    pshufb m2, m5 ; y0 y1 y2 y3 y4 y5 __ __
+    movu   [r1+2*r4], m2
+
+    shufps m1, m0, 0xd8 ; u0 v0 v1 u2 u1 __ v2 __
+    pshufb m1, m6 ; u0 u1 u2 __ v0 v1 v2 __
+    movq   [r2+r4], m1
+    movhps [r3+r4], m1
+
+    add r0, mmsize
+    add r4, 6
+    jl  .loop
+
+    REP_RET
+%endmacro
+
+INIT_XMM ssse3
+v210_planar_unpack unaligned
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+v210_planar_unpack unaligned
+%endif
+
+INIT_XMM ssse3
+v210_planar_unpack aligned
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+v210_planar_unpack aligned
+%endif
diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm
index ec4309f..965f2be 100644
--- a/libavcodec/x86/v210enc.asm
+++ b/libavcodec/x86/v210enc.asm
@@ -2,20 +2,20 @@
 ;* V210 SIMD pack
 ;* Copyright (c) 2014 Kieran Kunhya <kierank@obe.tv>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -23,8 +23,9 @@
 
 SECTION_RODATA 32
 
-v210_enc_min_10: times 32 dw 0x4
-v210_enc_max_10: times 32 dw 0x3fb
+cextern pw_4
+%define v210_enc_min_10 pw_4
+v210_enc_max_10: times 16 dw 0x3fb
 
 v210_enc_luma_mult_10: times 2 dw 4,1,16,4,1,16,0,0
 v210_enc_luma_shuf_10: times 2 db -1,0,1,-1,2,3,4,5,-1,6,7,-1,8,9,10,11
@@ -32,16 +33,19 @@ v210_enc_luma_shuf_10: times 2 db -1,0,1,-1,2,3,4,5,-1,6,7,-1,8,9,10,11
 v210_enc_chroma_mult_10: times 2 dw 1,4,16,0,16,1,4,0
 v210_enc_chroma_shuf_10: times 2 db 0,1,8,9,-1,2,3,-1,10,11,4,5,-1,12,13,-1
 
-v210_enc_min_8: times 32 db 0x1
-v210_enc_max_8: times 32 db 0xfe
+cextern pb_1
+%define v210_enc_min_8 pb_1
+cextern pb_FE
+%define v210_enc_max_8 pb_FE
 
-v210_enc_luma_mult_8: times 2 dw 16,4,64,16,4,64,0,0
 v210_enc_luma_shuf_8: times 2 db 6,-1,7,-1,8,-1,9,-1,10,-1,11,-1,-1,-1,-1,-1
+v210_enc_luma_mult_8: times 2 dw 16,4,64,16,4,64,0,0
 
-v210_enc_chroma_mult_8: times 2 dw 4,16,64,0,64,4,16,0
 v210_enc_chroma_shuf1_8: times 2 db 0,-1,1,-1,2,-1,3,-1,8,-1,9,-1,10,-1,11,-1
 v210_enc_chroma_shuf2_8: times 2 db 3,-1,4,-1,5,-1,7,-1,11,-1,12,-1,13,-1,15,-1
 
+v210_enc_chroma_mult_8: times 2 dw 4,16,64,0,64,4,16,0
+
 SECTION .text
 
 %macro v210_planar_pack_10 0
@@ -56,19 +60,19 @@ cglobal v210_planar_pack_10, 5, 5, 4+cpuflag(avx2), y, u, v, dst, width
     mova    m2, [v210_enc_min_10]
     mova    m3, [v210_enc_max_10]
 
-.loop
+.loop:
     movu        xm0, [yq+2*widthq]
 %if cpuflag(avx2)
-    vinserti128 m0, m0, [yq+2*widthq+12], 1
+    vinserti128 m0,   m0, [yq+widthq*2+12], 1
 %endif
     CLIPW   m0, m2, m3
 
-    movq    xm1, [uq+widthq]
-    movhps  xm1, [vq+widthq]
+    movq         xm1, [uq+widthq]
+    movhps       xm1, [vq+widthq]
 %if cpuflag(avx2)
     movq         xm4, [uq+widthq+6]
     movhps       xm4, [vq+widthq+6]
-    vinserti128  m1, m1, xm4, 1
+    vinserti128  m1,   m1, xm4, 1
 %endif
     CLIPW   m1, m2, m3
 
@@ -93,6 +97,7 @@ cglobal v210_planar_pack_10, 5, 5, 4+cpuflag(avx2), y, u, v, dst, width
 INIT_XMM ssse3
 v210_planar_pack_10
 %endif
+
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
 v210_planar_pack_10
@@ -112,10 +117,10 @@ cglobal v210_planar_pack_8, 5, 5, 7, y, u, v, dst, width
     mova    m5, [v210_enc_max_8]
     pxor    m6, m6
 
-.loop
-    movu        xm1, [yq+2*widthq]
+.loop:
+    movu        xm1, [yq+widthq*2]
 %if cpuflag(avx2)
-    vinserti128 m1, m1, [yq+2*widthq+12], 1
+    vinserti128 m1,   m1, [yq+widthq*2+12], 1
 %endif
     CLIPUB  m1, m4, m5
 
@@ -172,6 +177,7 @@ v210_planar_pack_8
 INIT_XMM avx
 v210_planar_pack_8
 %endif
+
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
 v210_planar_pack_8
diff --git a/libavcodec/x86/v210enc_init.c b/libavcodec/x86/v210enc_init.c
index c4d2745..e997b4b 100644
--- a/libavcodec/x86/v210enc_init.c
+++ b/libavcodec/x86/v210enc_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/vc1dsp.h b/libavcodec/x86/vc1dsp.h
index 9b6c8ad..fdd4de1 100644
--- a/libavcodec/x86/vc1dsp.h
+++ b/libavcodec/x86/vc1dsp.h
@@ -1,20 +1,20 @@
 /*
  * VC-1 and WMV3 decoder - X86 DSP init functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/vc1dsp_init.c b/libavcodec/x86/vc1dsp_init.c
index aff4b26..c8943fa 100644
--- a/libavcodec/x86/vc1dsp_init.c
+++ b/libavcodec/x86/vc1dsp_init.c
@@ -27,6 +27,7 @@
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/cpu.h"
+#include "libavutil/x86/asm.h"
 #include "libavcodec/vc1dsp.h"
 #include "fpel.h"
 #include "vc1dsp.h"
@@ -63,11 +64,22 @@ static void vc1_h_loop_filter16_sse4(uint8_t *src, int stride, int pq)
     ff_vc1_h_loop_filter8_sse4(src+8*stride, stride, pq);
 }
 
-static void avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
-                                      ptrdiff_t stride, int rnd)
-{
-    ff_avg_pixels8_mmxext(dst, src, stride, 8);
-}
+#define DECLARE_FUNCTION(OP, DEPTH, INSN)                       \
+    static void OP##vc1_mspel_mc00_##DEPTH##INSN(uint8_t *dst,          \
+                             const uint8_t *src, ptrdiff_t stride, int rnd) \
+    {                                                                       \
+        ff_ ## OP ## pixels ## DEPTH ## INSN(dst, src, stride, DEPTH);     \
+    }
+
+DECLARE_FUNCTION(put_,  8, _mmx)
+DECLARE_FUNCTION(put_, 16, _mmx)
+DECLARE_FUNCTION(avg_,  8, _mmx)
+DECLARE_FUNCTION(avg_, 16, _mmx)
+DECLARE_FUNCTION(avg_,  8, _mmxext)
+DECLARE_FUNCTION(avg_, 16, _mmxext)
+DECLARE_FUNCTION(put_, 16, _sse2)
+DECLARE_FUNCTION(avg_, 16, _sse2)
+
 #endif /* HAVE_YASM */
 
 void ff_put_vc1_chroma_mc8_nornd_mmx  (uint8_t *dst, uint8_t *src,
@@ -80,16 +92,24 @@ void ff_put_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
                                        int stride, int h, int x, int y);
 void ff_avg_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
                                        int stride, int h, int x, int y);
+void ff_vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, int linesize,
+                                    int16_t *block);
+void ff_vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, int linesize,
+                                    int16_t *block);
+void ff_vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, int linesize,
+                                    int16_t *block);
+void ff_vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize,
+                                    int16_t *block);
 
 
 av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
 {
     int cpu_flags = av_get_cpu_flags();
 
-    if (INLINE_MMX(cpu_flags))
+    if (HAVE_6REGS && INLINE_MMX(cpu_flags) && EXTERNAL_MMX(cpu_flags))
         ff_vc1dsp_init_mmx(dsp);
 
-    if (INLINE_MMXEXT(cpu_flags))
+    if (HAVE_6REGS && INLINE_MMXEXT(cpu_flags) && EXTERNAL_MMXEXT(cpu_flags))
         ff_vc1dsp_init_mmxext(dsp);
 
 #define ASSIGN_LF(EXT) \
@@ -103,6 +123,11 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
 #if HAVE_YASM
     if (EXTERNAL_MMX(cpu_flags)) {
         dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_mmx;
+
+        dsp->put_vc1_mspel_pixels_tab[1][0]      = put_vc1_mspel_mc00_8_mmx;
+        dsp->put_vc1_mspel_pixels_tab[0][0]      = put_vc1_mspel_mc00_16_mmx;
+        dsp->avg_vc1_mspel_pixels_tab[1][0]      = avg_vc1_mspel_mc00_8_mmx;
+        dsp->avg_vc1_mspel_pixels_tab[0][0]      = avg_vc1_mspel_mc00_16_mmx;
     }
     if (EXTERNAL_AMD3DNOW(cpu_flags)) {
         dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_3dnow;
@@ -111,13 +136,22 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
         ASSIGN_LF(mmxext);
         dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_mmxext;
 
-        dsp->avg_vc1_mspel_pixels_tab[0]         = avg_vc1_mspel_mc00_mmxext;
+        dsp->avg_vc1_mspel_pixels_tab[1][0]      = avg_vc1_mspel_mc00_8_mmxext;
+        dsp->avg_vc1_mspel_pixels_tab[0][0]      = avg_vc1_mspel_mc00_16_mmxext;
+
+        dsp->vc1_inv_trans_8x8_dc                = ff_vc1_inv_trans_8x8_dc_mmxext;
+        dsp->vc1_inv_trans_4x8_dc                = ff_vc1_inv_trans_4x8_dc_mmxext;
+        dsp->vc1_inv_trans_8x4_dc                = ff_vc1_inv_trans_8x4_dc_mmxext;
+        dsp->vc1_inv_trans_4x4_dc                = ff_vc1_inv_trans_4x4_dc_mmxext;
     }
     if (EXTERNAL_SSE2(cpu_flags)) {
         dsp->vc1_v_loop_filter8  = ff_vc1_v_loop_filter8_sse2;
         dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_sse2;
         dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_sse2;
         dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse2;
+
+        dsp->put_vc1_mspel_pixels_tab[0][0]      = put_vc1_mspel_mc00_16_sse2;
+        dsp->avg_vc1_mspel_pixels_tab[0][0]      = avg_vc1_mspel_mc00_16_sse2;
     }
     if (EXTERNAL_SSSE3(cpu_flags)) {
         ASSIGN_LF(ssse3);
diff --git a/libavcodec/x86/vc1dsp.asm b/libavcodec/x86/vc1dsp_loopfilter.asm
index adf08d7d..1838f6f 100644
--- a/libavcodec/x86/vc1dsp.asm
+++ b/libavcodec/x86/vc1dsp_loopfilter.asm
@@ -1,21 +1,21 @@
 ;******************************************************************************
-;* VC1 deblocking optimizations
+;* VC1 loopfilter optimizations
 ;* Copyright (c) 2009 David Conrad
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/vc1dsp_mc.asm b/libavcodec/x86/vc1dsp_mc.asm
new file mode 100644
index 0000000..7eaf043
--- /dev/null
+++ b/libavcodec/x86/vc1dsp_mc.asm
@@ -0,0 +1,292 @@
+;******************************************************************************
+;* VC1 motion compensation optimizations
+;* Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+cextern pw_9
+cextern pw_128
+
+section .text
+
+%if HAVE_MMX_INLINE
+
+; XXX some of these macros are not used right now, but they will be in the future
+;     when more functions are ported.
+
+%macro OP_PUT 2 ; dst, src
+%endmacro
+
+%macro OP_AVG 2 ; dst, src
+    pavgb           %1, %2
+%endmacro
+
+%macro NORMALIZE_MMX 1 ; shift
+    paddw           m3, m7 ; +bias-r
+    paddw           m4, m7 ; +bias-r
+    psraw           m3, %1
+    psraw           m4, %1
+%endmacro
+
+%macro TRANSFER_DO_PACK 2 ; op, dst
+    packuswb        m3, m4
+    %1              m3, [%2]
+    mova          [%2], m3
+%endmacro
+
+%macro TRANSFER_DONT_PACK 2 ; op, dst -- apply op to m3/m4 against the two mmsize-wide halves at dst, store both unpacked
+    %1              m3, [%2]            ; low half:  m3 = op(m3, dst[0 .. mmsize-1])
+    %1              m4, [%2 + mmsize]   ; high half: m4 = op(m4, dst[mmsize ..]); must be m4, the register stored there below
+    mova          [%2], m3              ; write low half
+    mova [mmsize + %2], m4              ; write high half
+%endmacro
+
+; see MSPEL_FILTER13_CORE for use as UNPACK macro
+%macro DO_UNPACK 1 ; reg
+    punpcklbw       %1, m0
+%endmacro
+%macro DONT_UNPACK 1 ; reg
+%endmacro
+
+; Load the rounder passed as %1 (32-r or 8-r) and broadcast it to every word of m7
+%macro LOAD_ROUNDER_MMX 1 ; round
+    movd      m7, %1     ; low dword of m7 = round
+    punpcklwd m7, m7     ; interleave low words with themselves: w0 w0 w1 w1
+    punpckldq m7, m7     ; duplicate the low dword: w0 w0 w0 w0
+%endmacro
+
+%macro SHIFT2_LINE 5 ; off, r0, r1, r2, r3 -- dst byte offset, then four mm register numbers (r1/r2 in: rows at srcq-stride / srcq)
+    paddw          m%3, m%4                    ; r1 = sum of the two centre rows (r2 is left intact)
+    movh           m%2, [srcq + stride_neg2]   ; r0 = row at srcq + stride_neg2 (the caller sets that to -2*stride)
+    pmullw         m%3, m6                     ; scale centre sum by m6 (the caller loads pw_9 into it)
+    punpcklbw      m%2, m0                     ; widen r0 bytes to words (m0 is zeroed by the caller)
+    movh           m%5, [srcq + strideq]       ; r3 = row one stride below srcq
+    psubw          m%3, m%2                    ; subtract the upper outer row
+    punpcklbw      m%5, m0                     ; widen r3 bytes to words
+    paddw          m%3, m7                     ; add the rounder held in m7
+    psubw          m%3, m%5                    ; subtract the lower outer row
+    psraw          m%3, shift                  ; arithmetic shift by the caller-defined shift operand
+    movu   [dstq + %1], m%3                    ; store the 16-bit results at dst + off
+    add           srcq, strideq                ; advance one row; r2/r3 now hold the rows at srcq-stride / srcq
+%endmacro
+
+INIT_MMX mmx
+; void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst, const uint8_t *src,
+;                                    x86_reg stride, int rnd, int64_t shift)
+; Vertical pass writing 16-bit results; sacrificing m6 (holds pw_9) makes it possible to pipeline loads from src
+%if ARCH_X86_32
+cglobal vc1_put_ver_16b_shift2, 3,6,0, dst, src, stride
+    DECLARE_REG_TMP     3, 4, 5
+    %define rnd r3mp
+    %define shift qword r4m
+%else ; X86_64
+cglobal vc1_put_ver_16b_shift2, 4,7,0, dst, src, stride
+    DECLARE_REG_TMP     4, 5, 6
+    %define   rnd r3d
+    ; We need shift either in memory or in a mm reg as it's used in psraw
+    ; On WIN64, the arg is already on the stack
+    ; On UNIX64, m5 doesn't seem to be used
+%if WIN64
+    %define shift r4mp
+%else ; UNIX64
+    %define shift m5
+    mova shift, r4q
+%endif ; WIN64
+%endif ; X86_32
+%define stride_neg2 t0q
+%define stride_9minus4 t1q
+%define i t2q
+    mov       stride_neg2, strideq             ; stride_neg2 = stride
+    neg       stride_neg2                      ;             = -stride
+    add       stride_neg2, stride_neg2         ;             = -2 * stride
+    lea    stride_9minus4, [strideq * 9 - 4]   ; undoes the 9 row steps of one pass, less a 4-byte step to the right
+    mov                 i, 3                   ; three passes of the loop below
+    LOAD_ROUNDER_MMX  rnd
+    mova               m6, [pw_9]              ; multiplier for the two centre rows
+    pxor               m0, m0                  ; zero register used by punpcklbw to widen bytes to words
+.loop:
+    movh               m2, [srcq]              ; first row of this pass
+    add              srcq, strideq             ; row step 1 of 9 (each SHIFT2_LINE adds one more)
+    movh               m3, [srcq]              ; second row
+    punpcklbw          m2, m0                  ; widen both rows to 16-bit words
+    punpcklbw          m3, m0
+    SHIFT2_LINE         0, 1, 2, 3, 4
+    SHIFT2_LINE        24, 2, 3, 4, 1
+    SHIFT2_LINE        48, 3, 4, 1, 2
+    SHIFT2_LINE        72, 4, 1, 2, 3
+    SHIFT2_LINE        96, 1, 2, 3, 4
+    SHIFT2_LINE       120, 2, 3, 4, 1
+    SHIFT2_LINE       144, 3, 4, 1, 2
+    SHIFT2_LINE       168, 4, 1, 2, 3
+    sub              srcq, stride_9minus4      ; back up 9 rows and move 4 bytes to the right
+    add              dstq, 8                   ; next 8 bytes within each 24-byte-spaced output row
+    dec                 i
+        jnz         .loop
+    REP_RET
+%undef rnd
+%undef shift
+%undef stride_neg2
+%undef stride_9minus4
+%undef i
+
+; void ff_vc1_*_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
+;                                  const int16_t *src, int rnd);
+; Data is already unpacked, so some operations can directly be made from
+; memory.
+%macro HOR_16B_SHIFT2 2 ; op, opname
+cglobal vc1_%2_hor_16b_shift2, 4, 5, 0, dst, stride, src, rnd, h
+    mov                hq, 8                        ; eight output rows
+    sub              srcq, 2                        ; step src back by one 16-bit sample
+    sub              rndd, (-1+9+9-1) * 1024 ; add -1024 bias
+    LOAD_ROUNDER_MMX rndq
+    mova               m5, [pw_9]                   ; multiplier for the two centre taps
+    mova               m6, [pw_128]                 ; added after the shift to remove the bias
+    pxor               m0, m0
+
+.loop:
+    mova               m1, [srcq + 2 * 0]           ; tap 0, first mmsize bytes
+    mova               m2, [srcq + 2 * 0 + mmsize]  ; tap 0, next mmsize bytes
+    mova               m3, [srcq + 2 * 1]           ; tap 1
+    mova               m4, [srcq + 2 * 1 + mmsize]
+    paddw              m3, [srcq + 2 * 2]           ; tap 1 + tap 2 (centre pair)
+    paddw              m4, [srcq + 2 * 2 + mmsize]
+    paddw              m1, [srcq + 2 * 3]           ; tap 0 + tap 3 (outer pair)
+    paddw              m2, [srcq + 2 * 3 + mmsize]
+    pmullw             m3, m5                       ; 9 * centre pair
+    pmullw             m4, m5
+    psubw              m3, m1                       ; minus outer pair
+    psubw              m4, m2
+    NORMALIZE_MMX      7
+    ; remove bias
+    paddw              m3, m6
+    paddw              m4, m6
+    TRANSFER_DO_PACK   %1, dstq
+    add              srcq, 24                       ; next source row, 24 bytes on
+    add              dstq, strideq                  ; next destination row
+    dec                hq
+        jnz         .loop
+
+    RET
+%endmacro
+
+INIT_MMX mmx
+HOR_16B_SHIFT2 OP_PUT, put
+
+INIT_MMX mmxext
+HOR_16B_SHIFT2 OP_AVG, avg
+%endif ; HAVE_MMX_INLINE
+
+%macro INV_TRANS_INIT 0
+    movsxdifnidn linesizeq, linesized
+    movd       m0, blockd              ; scaled dc value the caller left in blockd
+    SPLATW     m0, m0
+    pxor       m1, m1                  ; m1 = 0
+    psubw      m1, m0                  ; m1 = -m0, word-wise
+    packuswb   m0, m0                  ; unsigned-saturating pack: bytes of m0 = dc when dc > 0, else 0
+    packuswb   m1, m1                  ; bytes of m1 = -dc when dc < 0, else 0
+
+    DEFINE_ARGS dest, linesize, linesize3
+    lea    linesize3q, [linesizeq*3]   ; offset of the fourth row; linesize3 is declared in the slot block used
+%endmacro
+
+%macro INV_TRANS_PROCESS 1 ; mov suffix (h or a) selecting the load/store form used for each row
+    mov%1                  m2, [destq+linesizeq*0]   ; load four consecutive rows of dest
+    mov%1                  m3, [destq+linesizeq*1]
+    mov%1                  m4, [destq+linesizeq*2]
+    mov%1                  m5, [destq+linesize3q]
+    paddusb                m2, m0                    ; saturating add of m0 to each byte
+    paddusb                m3, m0
+    paddusb                m4, m0
+    paddusb                m5, m0
+    psubusb                m2, m1                    ; saturating subtract of m1 from each byte
+    psubusb                m3, m1
+    psubusb                m4, m1
+    psubusb                m5, m1
+    mov%1 [linesizeq*0+destq], m2                    ; write the four rows back in place
+    mov%1 [linesizeq*1+destq], m3
+    mov%1 [linesizeq*2+destq], m4
+    mov%1 [linesize3q +destq], m5
+%endmacro
+
+; ff_vc1_inv_trans_?x?_dc_mmxext(uint8_t *dest, int linesize, int16_t *block)
+INIT_MMX mmxext
+cglobal vc1_inv_trans_4x4_dc, 3,4,0, dest, linesize, block
+    movsx         r3d, WORD [blockq]
+    mov        blockd, r3d             ; dc
+    shl        blockd, 4               ; 16 * dc
+    lea        blockd, [blockq+r3+4]   ; 17 * dc + 4
+    sar        blockd, 3               ; >> 3
+    mov           r3d, blockd          ; dc
+    shl        blockd, 4               ; 16 * dc
+    lea        blockd, [blockq+r3+64]  ; 17 * dc + 64
+    sar        blockd, 7               ; >> 7
+
+    INV_TRANS_INIT
+
+    INV_TRANS_PROCESS h
+    RET
+
+INIT_MMX mmxext
+cglobal vc1_inv_trans_4x8_dc, 3,4,0, dest, linesize, block
+    movsx         r3d, WORD [blockq]
+    mov        blockd, r3d             ; dc
+    shl        blockd, 4               ; 16 * dc
+    lea        blockd, [blockq+r3+4]   ; 17 * dc + 4
+    sar        blockd, 3               ; >> 3
+    shl        blockd, 2               ;  4 * dc
+    lea        blockd, [blockq*3+64]   ; 12 * dc + 64
+    sar        blockd, 7               ; >> 7
+
+    INV_TRANS_INIT
+
+    INV_TRANS_PROCESS h
+    lea         destq, [destq+linesizeq*4]
+    INV_TRANS_PROCESS h
+    RET
+
+INIT_MMX mmxext
+cglobal vc1_inv_trans_8x4_dc, 3,4,0, dest, linesize, block
+    movsx      blockd, WORD [blockq]   ; dc
+    lea        blockd, [blockq*3+1]    ;  3 * dc + 1
+    sar        blockd, 1               ; >> 1
+    mov           r3d, blockd          ; dc
+    shl        blockd, 4               ; 16 * dc
+    lea        blockd, [blockq+r3+64]  ; 17 * dc + 64
+    sar        blockd, 7               ; >> 7
+
+    INV_TRANS_INIT
+
+    INV_TRANS_PROCESS a
+    RET
+
+INIT_MMX mmxext
+cglobal vc1_inv_trans_8x8_dc, 3,3,0, dest, linesize, block
+    movsx      blockd, WORD [blockq]   ; dc
+    lea        blockd, [blockq*3+1]    ;  3 * dc + 1
+    sar        blockd, 1               ; >> 1
+    lea        blockd, [blockq*3+16]   ;  3 * dc + 16
+    sar        blockd, 5               ; >> 5
+
+    INV_TRANS_INIT
+
+    INV_TRANS_PROCESS a
+    lea         destq, [destq+linesizeq*4]
+    INV_TRANS_PROCESS a
+    RET
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index 95f5ee4..da32a3e 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -25,7 +25,6 @@
  */
 
 #include "libavutil/cpu.h"
-#include "libavutil/internal.h"
 #include "libavutil/mem.h"
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
@@ -34,7 +33,15 @@
 #include "fpel.h"
 #include "vc1dsp.h"
 
-#if HAVE_INLINE_ASM
+#if HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL
+
+void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst,
+                                   const uint8_t *src, x86_reg stride,
+                                   int rnd, int64_t shift);
+void ff_vc1_put_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
+                                   const int16_t *src, int rnd);
+void ff_vc1_avg_hor_16b_shift2_mmxext(uint8_t *dst, x86_reg stride,
+                                      const int16_t *src, int rnd);
 
 #define OP_PUT(S,D)
 #define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
@@ -67,102 +74,6 @@
      "punpcklwd %%mm7, %%mm7           \n\t"    \
      "punpckldq %%mm7, %%mm7           \n\t"
 
-#define SHIFT2_LINE(OFF, R0,R1,R2,R3)           \
-    "paddw     %%mm"#R2", %%mm"#R1"    \n\t"    \
-    "movd      (%0,%3), %%mm"#R0"      \n\t"    \
-    "pmullw    %%mm6, %%mm"#R1"        \n\t"    \
-    "punpcklbw %%mm0, %%mm"#R0"        \n\t"    \
-    "movd      (%0,%2), %%mm"#R3"      \n\t"    \
-    "psubw     %%mm"#R0", %%mm"#R1"    \n\t"    \
-    "punpcklbw %%mm0, %%mm"#R3"        \n\t"    \
-    "paddw     %%mm7, %%mm"#R1"        \n\t"    \
-    "psubw     %%mm"#R3", %%mm"#R1"    \n\t"    \
-    "psraw     %4, %%mm"#R1"           \n\t"    \
-    "movq      %%mm"#R1", "#OFF"(%1)   \n\t"    \
-    "add       %2, %0                  \n\t"
-
-/** Sacrificing mm6 allows to pipeline loads from src */
-static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
-                                       const uint8_t *src, x86_reg stride,
-                                       int rnd, int64_t shift)
-{
-    __asm__ volatile(
-        "mov       $3, %%"REG_c"           \n\t"
-        LOAD_ROUNDER_MMX("%5")
-        "movq      "MANGLE(ff_pw_9)", %%mm6 \n\t"
-        "1:                                \n\t"
-        "movd      (%0), %%mm2             \n\t"
-        "add       %2, %0                  \n\t"
-        "movd      (%0), %%mm3             \n\t"
-        "punpcklbw %%mm0, %%mm2            \n\t"
-        "punpcklbw %%mm0, %%mm3            \n\t"
-        SHIFT2_LINE(  0, 1, 2, 3, 4)
-        SHIFT2_LINE( 24, 2, 3, 4, 1)
-        SHIFT2_LINE( 48, 3, 4, 1, 2)
-        SHIFT2_LINE( 72, 4, 1, 2, 3)
-        SHIFT2_LINE( 96, 1, 2, 3, 4)
-        SHIFT2_LINE(120, 2, 3, 4, 1)
-        SHIFT2_LINE(144, 3, 4, 1, 2)
-        SHIFT2_LINE(168, 4, 1, 2, 3)
-        "sub       %6, %0                  \n\t"
-        "add       $8, %1                  \n\t"
-        "dec       %%"REG_c"               \n\t"
-        "jnz 1b                            \n\t"
-        : "+r"(src), "+r"(dst)
-        : "r"(stride), "r"(-2*stride),
-          "m"(shift), "m"(rnd), "r"(9*stride-4)
-        : "%"REG_c, "memory"
-    );
-}
-
-/**
- * Data is already unpacked, so some operations can directly be made from
- * memory.
- */
-#define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
-static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
-                                             const int16_t *src, int rnd)\
-{\
-    int h = 8;\
-\
-    src -= 1;\
-    rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\
-    __asm__ volatile(\
-        LOAD_ROUNDER_MMX("%4")\
-        "movq      "MANGLE(ff_pw_128)", %%mm6\n\t"\
-        "movq      "MANGLE(ff_pw_9)", %%mm5 \n\t"\
-        "1:                                \n\t"\
-        "movq      2*0+0(%1), %%mm1        \n\t"\
-        "movq      2*0+8(%1), %%mm2        \n\t"\
-        "movq      2*1+0(%1), %%mm3        \n\t"\
-        "movq      2*1+8(%1), %%mm4        \n\t"\
-        "paddw     2*3+0(%1), %%mm1        \n\t"\
-        "paddw     2*3+8(%1), %%mm2        \n\t"\
-        "paddw     2*2+0(%1), %%mm3        \n\t"\
-        "paddw     2*2+8(%1), %%mm4        \n\t"\
-        "pmullw    %%mm5, %%mm3            \n\t"\
-        "pmullw    %%mm5, %%mm4            \n\t"\
-        "psubw     %%mm1, %%mm3            \n\t"\
-        "psubw     %%mm2, %%mm4            \n\t"\
-        NORMALIZE_MMX("$7")\
-        /* Remove bias */\
-        "paddw     %%mm6, %%mm3            \n\t"\
-        "paddw     %%mm6, %%mm4            \n\t"\
-        TRANSFER_DO_PACK(OP)\
-        "add       $24, %1                 \n\t"\
-        "add       %3, %2                  \n\t"\
-        "decl      %0                      \n\t"\
-        "jnz 1b                            \n\t"\
-        : "+r"(h), "+r" (src),  "+r" (dst)\
-        : "r"(stride), "m"(rnd)\
-        : "memory"\
-    );\
-}
-
-VC1_HOR_16b_SHIFT2(OP_PUT, put_)
-VC1_HOR_16b_SHIFT2(OP_AVG, avg_)
-
-
 /**
  * Purely vertical or horizontal 1/2 shift interpolation.
  * Sacrifice mm6 for *9 factor.
@@ -213,6 +124,7 @@ static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
         : "+r"(src),  "+r"(dst)\
         : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\
           "g"(stride-offset)\
+          NAMED_CONSTRAINTS_ADD(ff_pw_9)\
         : "%"REG_c, "memory"\
     );\
 }
@@ -315,6 +227,7 @@ vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src,      \
         : "+r"(h), "+r" (src),  "+r" (dst)                              \
         : "r"(src_stride), "r"(3*src_stride),                           \
           "m"(rnd), "m"(shift)                                          \
+          NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_53,ff_pw_18)              \
         : "memory"                                                      \
     );                                                                  \
 }
@@ -352,6 +265,7 @@ OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride,    \
         "jnz 1b                    \n\t"                                \
         : "+r"(h), "+r" (src),  "+r" (dst)                              \
         : "r"(stride), "m"(rnd)                                         \
+          NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_18,ff_pw_53,ff_pw_128)    \
         : "memory"                                                      \
     );                                                                  \
 }
@@ -387,6 +301,7 @@ OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src,         \
         "jnz 1b                    \n\t"                                \
         : "+r"(h), "+r" (src),  "+r" (dst)                              \
         : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd)             \
+          NAMED_CONSTRAINTS_ADD(ff_pw_53,ff_pw_18,ff_pw_3)              \
         : "memory"                                                      \
     );                                                                  \
 }
@@ -420,14 +335,14 @@ typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_
  * @param  hmode   Vertical filter.
  * @param  rnd     Rounding bias.
  */
-#define VC1_MSPEL_MC(OP)\
+#define VC1_MSPEL_MC(OP, INSTR)\
 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
                                int hmode, int vmode, int rnd)\
 {\
     static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
-         { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
+         { NULL, vc1_put_ver_16b_shift1_mmx, ff_vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
     static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
-         { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
+         { NULL, OP ## vc1_hor_16b_shift1_mmx, ff_vc1_ ## OP ## hor_16b_shift2_ ## INSTR, OP ## vc1_hor_16b_shift3_mmx };\
     static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
          { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
 \
@@ -441,7 +356,7 @@ static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
             static const int shift_value[] = { 0, 5, 1, 5 };\
             int              shift = (shift_value[hmode]+shift_value[vmode])>>1;\
             int              r;\
-            DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\
+            LOCAL_ALIGNED(16, int16_t, tmp, [12*8]);\
 \
             r = (1<<(shift-1)) + rnd-1;\
             vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\
@@ -457,10 +372,19 @@ static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
 \
     /* Horizontal mode with no vertical mode */\
     vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\
+} \
+static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
+                                  int stride, int hmode, int vmode, int rnd)\
+{ \
+    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
+    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
+    dst += 8*stride; src += 8*stride; \
+    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
+    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
 }
 
-VC1_MSPEL_MC(put_)
-VC1_MSPEL_MC(avg_)
+VC1_MSPEL_MC(put_, mmx)
+VC1_MSPEL_MC(avg_, mmxext)
 
 /** Macro to ease bicubic filter interpolation functions declarations */
 #define DECLARE_FUNCTION(a, b)                                          \
@@ -477,6 +401,20 @@ static void avg_vc1_mspel_mc ## a ## b ## _mmxext(uint8_t *dst,         \
                                                   int rnd)              \
 {                                                                       \
      avg_vc1_mspel_mc(dst, src, stride, a, b, rnd);                     \
+}\
+static void put_vc1_mspel_mc ## a ## b ## _16_mmx(uint8_t *dst,         \
+                                                  const uint8_t *src,   \
+                                                  ptrdiff_t stride,     \
+                                                  int rnd)              \
+{                                                                       \
+     put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                  \
+}\
+static void avg_vc1_mspel_mc ## a ## b ## _16_mmxext(uint8_t *dst,      \
+                                                     const uint8_t *src,\
+                                                     ptrdiff_t stride,  \
+                                                     int rnd)           \
+{                                                                       \
+     avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                  \
 }
 
 DECLARE_FUNCTION(0, 1)
@@ -498,261 +436,51 @@ DECLARE_FUNCTION(3, 1)
 DECLARE_FUNCTION(3, 2)
 DECLARE_FUNCTION(3, 3)
 
-static void vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, int linesize,
-                                        int16_t *block)
-{
-    int dc = block[0];
-    dc = (17 * dc +  4) >> 3;
-    dc = (17 * dc + 64) >> 7;
-    __asm__ volatile(
-        "movd          %0, %%mm0 \n\t"
-        "pshufw $0, %%mm0, %%mm0 \n\t"
-        "pxor       %%mm1, %%mm1 \n\t"
-        "psubw      %%mm0, %%mm1 \n\t"
-        "packuswb   %%mm0, %%mm0 \n\t"
-        "packuswb   %%mm1, %%mm1 \n\t"
-        ::"r"(dc)
-    );
-    __asm__ volatile(
-        "movd          %0, %%mm2 \n\t"
-        "movd          %1, %%mm3 \n\t"
-        "movd          %2, %%mm4 \n\t"
-        "movd          %3, %%mm5 \n\t"
-        "paddusb    %%mm0, %%mm2 \n\t"
-        "paddusb    %%mm0, %%mm3 \n\t"
-        "paddusb    %%mm0, %%mm4 \n\t"
-        "paddusb    %%mm0, %%mm5 \n\t"
-        "psubusb    %%mm1, %%mm2 \n\t"
-        "psubusb    %%mm1, %%mm3 \n\t"
-        "psubusb    %%mm1, %%mm4 \n\t"
-        "psubusb    %%mm1, %%mm5 \n\t"
-        "movd       %%mm2, %0    \n\t"
-        "movd       %%mm3, %1    \n\t"
-        "movd       %%mm4, %2    \n\t"
-        "movd       %%mm5, %3    \n\t"
-        :"+m"(*(uint32_t*)(dest+0*linesize)),
-         "+m"(*(uint32_t*)(dest+1*linesize)),
-         "+m"(*(uint32_t*)(dest+2*linesize)),
-         "+m"(*(uint32_t*)(dest+3*linesize))
-    );
-}
-
-static void vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, int linesize,
-                                        int16_t *block)
-{
-    int dc = block[0];
-    dc = (17 * dc +  4) >> 3;
-    dc = (12 * dc + 64) >> 7;
-    __asm__ volatile(
-        "movd          %0, %%mm0 \n\t"
-        "pshufw $0, %%mm0, %%mm0 \n\t"
-        "pxor       %%mm1, %%mm1 \n\t"
-        "psubw      %%mm0, %%mm1 \n\t"
-        "packuswb   %%mm0, %%mm0 \n\t"
-        "packuswb   %%mm1, %%mm1 \n\t"
-        ::"r"(dc)
-    );
-    __asm__ volatile(
-        "movd          %0, %%mm2 \n\t"
-        "movd          %1, %%mm3 \n\t"
-        "movd          %2, %%mm4 \n\t"
-        "movd          %3, %%mm5 \n\t"
-        "paddusb    %%mm0, %%mm2 \n\t"
-        "paddusb    %%mm0, %%mm3 \n\t"
-        "paddusb    %%mm0, %%mm4 \n\t"
-        "paddusb    %%mm0, %%mm5 \n\t"
-        "psubusb    %%mm1, %%mm2 \n\t"
-        "psubusb    %%mm1, %%mm3 \n\t"
-        "psubusb    %%mm1, %%mm4 \n\t"
-        "psubusb    %%mm1, %%mm5 \n\t"
-        "movd       %%mm2, %0    \n\t"
-        "movd       %%mm3, %1    \n\t"
-        "movd       %%mm4, %2    \n\t"
-        "movd       %%mm5, %3    \n\t"
-        :"+m"(*(uint32_t*)(dest+0*linesize)),
-         "+m"(*(uint32_t*)(dest+1*linesize)),
-         "+m"(*(uint32_t*)(dest+2*linesize)),
-         "+m"(*(uint32_t*)(dest+3*linesize))
-    );
-    dest += 4*linesize;
-    __asm__ volatile(
-        "movd          %0, %%mm2 \n\t"
-        "movd          %1, %%mm3 \n\t"
-        "movd          %2, %%mm4 \n\t"
-        "movd          %3, %%mm5 \n\t"
-        "paddusb    %%mm0, %%mm2 \n\t"
-        "paddusb    %%mm0, %%mm3 \n\t"
-        "paddusb    %%mm0, %%mm4 \n\t"
-        "paddusb    %%mm0, %%mm5 \n\t"
-        "psubusb    %%mm1, %%mm2 \n\t"
-        "psubusb    %%mm1, %%mm3 \n\t"
-        "psubusb    %%mm1, %%mm4 \n\t"
-        "psubusb    %%mm1, %%mm5 \n\t"
-        "movd       %%mm2, %0    \n\t"
-        "movd       %%mm3, %1    \n\t"
-        "movd       %%mm4, %2    \n\t"
-        "movd       %%mm5, %3    \n\t"
-        :"+m"(*(uint32_t*)(dest+0*linesize)),
-         "+m"(*(uint32_t*)(dest+1*linesize)),
-         "+m"(*(uint32_t*)(dest+2*linesize)),
-         "+m"(*(uint32_t*)(dest+3*linesize))
-    );
-}
-
-static void vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, int linesize,
-                                        int16_t *block)
-{
-    int dc = block[0];
-    dc = ( 3 * dc +  1) >> 1;
-    dc = (17 * dc + 64) >> 7;
-    __asm__ volatile(
-        "movd          %0, %%mm0 \n\t"
-        "pshufw $0, %%mm0, %%mm0 \n\t"
-        "pxor       %%mm1, %%mm1 \n\t"
-        "psubw      %%mm0, %%mm1 \n\t"
-        "packuswb   %%mm0, %%mm0 \n\t"
-        "packuswb   %%mm1, %%mm1 \n\t"
-        ::"r"(dc)
-    );
-    __asm__ volatile(
-        "movq          %0, %%mm2 \n\t"
-        "movq          %1, %%mm3 \n\t"
-        "movq          %2, %%mm4 \n\t"
-        "movq          %3, %%mm5 \n\t"
-        "paddusb    %%mm0, %%mm2 \n\t"
-        "paddusb    %%mm0, %%mm3 \n\t"
-        "paddusb    %%mm0, %%mm4 \n\t"
-        "paddusb    %%mm0, %%mm5 \n\t"
-        "psubusb    %%mm1, %%mm2 \n\t"
-        "psubusb    %%mm1, %%mm3 \n\t"
-        "psubusb    %%mm1, %%mm4 \n\t"
-        "psubusb    %%mm1, %%mm5 \n\t"
-        "movq       %%mm2, %0    \n\t"
-        "movq       %%mm3, %1    \n\t"
-        "movq       %%mm4, %2    \n\t"
-        "movq       %%mm5, %3    \n\t"
-        :"+m"(*(uint32_t*)(dest+0*linesize)),
-         "+m"(*(uint32_t*)(dest+1*linesize)),
-         "+m"(*(uint32_t*)(dest+2*linesize)),
-         "+m"(*(uint32_t*)(dest+3*linesize))
-    );
-}
-
-static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize,
-                                        int16_t *block)
-{
-    int dc = block[0];
-    dc = (3 * dc +  1) >> 1;
-    dc = (3 * dc + 16) >> 5;
-    __asm__ volatile(
-        "movd          %0, %%mm0 \n\t"
-        "pshufw $0, %%mm0, %%mm0 \n\t"
-        "pxor       %%mm1, %%mm1 \n\t"
-        "psubw      %%mm0, %%mm1 \n\t"
-        "packuswb   %%mm0, %%mm0 \n\t"
-        "packuswb   %%mm1, %%mm1 \n\t"
-        ::"r"(dc)
-    );
-    __asm__ volatile(
-        "movq          %0, %%mm2 \n\t"
-        "movq          %1, %%mm3 \n\t"
-        "movq          %2, %%mm4 \n\t"
-        "movq          %3, %%mm5 \n\t"
-        "paddusb    %%mm0, %%mm2 \n\t"
-        "paddusb    %%mm0, %%mm3 \n\t"
-        "paddusb    %%mm0, %%mm4 \n\t"
-        "paddusb    %%mm0, %%mm5 \n\t"
-        "psubusb    %%mm1, %%mm2 \n\t"
-        "psubusb    %%mm1, %%mm3 \n\t"
-        "psubusb    %%mm1, %%mm4 \n\t"
-        "psubusb    %%mm1, %%mm5 \n\t"
-        "movq       %%mm2, %0    \n\t"
-        "movq       %%mm3, %1    \n\t"
-        "movq       %%mm4, %2    \n\t"
-        "movq       %%mm5, %3    \n\t"
-        :"+m"(*(uint32_t*)(dest+0*linesize)),
-         "+m"(*(uint32_t*)(dest+1*linesize)),
-         "+m"(*(uint32_t*)(dest+2*linesize)),
-         "+m"(*(uint32_t*)(dest+3*linesize))
-    );
-    dest += 4*linesize;
-    __asm__ volatile(
-        "movq          %0, %%mm2 \n\t"
-        "movq          %1, %%mm3 \n\t"
-        "movq          %2, %%mm4 \n\t"
-        "movq          %3, %%mm5 \n\t"
-        "paddusb    %%mm0, %%mm2 \n\t"
-        "paddusb    %%mm0, %%mm3 \n\t"
-        "paddusb    %%mm0, %%mm4 \n\t"
-        "paddusb    %%mm0, %%mm5 \n\t"
-        "psubusb    %%mm1, %%mm2 \n\t"
-        "psubusb    %%mm1, %%mm3 \n\t"
-        "psubusb    %%mm1, %%mm4 \n\t"
-        "psubusb    %%mm1, %%mm5 \n\t"
-        "movq       %%mm2, %0    \n\t"
-        "movq       %%mm3, %1    \n\t"
-        "movq       %%mm4, %2    \n\t"
-        "movq       %%mm5, %3    \n\t"
-        :"+m"(*(uint32_t*)(dest+0*linesize)),
-         "+m"(*(uint32_t*)(dest+1*linesize)),
-         "+m"(*(uint32_t*)(dest+2*linesize)),
-         "+m"(*(uint32_t*)(dest+3*linesize))
-    );
-}
-
-static void put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
-                                   ptrdiff_t stride, int rnd)
-{
-    ff_put_pixels8_mmx(dst, src, stride, 8);
-}
+#define FN_ASSIGN(OP, X, Y, INSN) do { /* tab[1]: base function, tab[0]: its _16 variant */ \
+    dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = OP##vc1_mspel_mc##X##Y##INSN; \
+    dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = OP##vc1_mspel_mc##X##Y##_16##INSN; } while (0)
 
 av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
 {
-    dsp->put_vc1_mspel_pixels_tab[ 0] = put_vc1_mspel_mc00_mmx;
-    dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx;
-    dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx;
-    dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx;
-
-    dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx;
-    dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx;
-    dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx;
-    dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx;
-
-    dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx;
-    dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx;
-    dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx;
-    dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx;
-
-    dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx;
-    dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx;
-    dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx;
-    dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx;
+    FN_ASSIGN(put_, 0, 1, _mmx);
+    FN_ASSIGN(put_, 0, 2, _mmx);
+    FN_ASSIGN(put_, 0, 3, _mmx);
+
+    FN_ASSIGN(put_, 1, 0, _mmx);
+    FN_ASSIGN(put_, 1, 1, _mmx);
+    FN_ASSIGN(put_, 1, 2, _mmx);
+    FN_ASSIGN(put_, 1, 3, _mmx);
+
+    FN_ASSIGN(put_, 2, 0, _mmx);
+    FN_ASSIGN(put_, 2, 1, _mmx);
+    FN_ASSIGN(put_, 2, 2, _mmx);
+    FN_ASSIGN(put_, 2, 3, _mmx);
+
+    FN_ASSIGN(put_, 3, 0, _mmx);
+    FN_ASSIGN(put_, 3, 1, _mmx);
+    FN_ASSIGN(put_, 3, 2, _mmx);
+    FN_ASSIGN(put_, 3, 3, _mmx);
 }
 
 av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
 {
-    dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmxext;
-
-    dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_mmxext;
-
-    dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_mmxext;
-
-    dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmxext;
-
-    dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmxext;
-    dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmxext;
-    dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext;
-    dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext;
+    FN_ASSIGN(avg_, 0, 1, _mmxext);
+    FN_ASSIGN(avg_, 0, 2, _mmxext);
+    FN_ASSIGN(avg_, 0, 3, _mmxext);
+
+    FN_ASSIGN(avg_, 1, 0, _mmxext);
+    FN_ASSIGN(avg_, 1, 1, _mmxext);
+    FN_ASSIGN(avg_, 1, 2, _mmxext);
+    FN_ASSIGN(avg_, 1, 3, _mmxext);
+
+    FN_ASSIGN(avg_, 2, 0, _mmxext);
+    FN_ASSIGN(avg_, 2, 1, _mmxext);
+    FN_ASSIGN(avg_, 2, 2, _mmxext);
+    FN_ASSIGN(avg_, 2, 3, _mmxext);
+
+    FN_ASSIGN(avg_, 3, 0, _mmxext);
+    FN_ASSIGN(avg_, 3, 1, _mmxext);
+    FN_ASSIGN(avg_, 3, 2, _mmxext);
+    FN_ASSIGN(avg_, 3, 3, _mmxext);
 }
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL */
diff --git a/libavcodec/x86/videodsp.asm b/libavcodec/x86/videodsp.asm
index 53b9e82..a807d3b 100644
--- a/libavcodec/x86/videodsp.asm
+++ b/libavcodec/x86/videodsp.asm
@@ -2,20 +2,20 @@
 ;* Core video DSP functions
 ;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -54,13 +54,13 @@ SECTION .text
 ; |    |    <- bottom is copied from last line in body of source
 ; '----' <- bh
 %if ARCH_X86_64
-cglobal emu_edge_vvar, 7, 8, 1, dst, src, dst_stride, src_stride, \
+cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \
                                 start_y, end_y, bh, w
 %else ; x86-32
 cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w
 %define src_strideq r3mp
-%define dst_strideq r2mp
-    mov            srcq, r1mp
+%define dst_strideq r1mp
+    mov            srcq, r2mp
     mov        start_yq, r4mp
     mov          end_yq, r5mp
     mov             bhq, r6mp
@@ -97,7 +97,10 @@ cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
     neg        n_wordsq
     lea        start_xq, [start_xq+n_wordsq*2]
 .y_loop:                                        ; do {
-    ; FIXME also write a ssse3 version using pshufb
+%if cpuflag(avx2)
+    vpbroadcastb     m0, [dstq+start_xq]
+    mov              wq, n_wordsq               ;   initialize w
+%else
     movzx            wd, byte [dstq+start_xq]   ;   w = read(1)
     imul             wd, 0x01010101             ;   w *= 0x01010101
     movd             m0, wd
@@ -107,6 +110,7 @@ cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
 %else ; mmx
     punpckldq        m0, m0                     ;   splat
 %endif ; mmx/sse
+%endif ; avx2
 .x_loop:                                        ;   do {
     movu    [dstq+wq*2], m0                     ;     write($reg, $mmsize)
     add              wq, mmsize/2               ;     w -= $mmsize/2
@@ -127,6 +131,11 @@ hvar_fn
 INIT_XMM sse2
 hvar_fn
 
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+hvar_fn
+%endif
+
 ; macro to read/write a horizontal number of pixels (%2) to/from registers
 ; on sse, - fills xmm0-15 for consecutive sets of 16 pixels
 ;         - if (%2 & 8)  fills 8 bytes into xmm$next
@@ -184,10 +193,10 @@ hvar_fn
     mov            valb, [srcq+%2-1]
 %elif (%2-%%off) == 2
     mov            valw, [srcq+%2-2]
-%elifidn %1, body
-    mov            vald, [srcq+%2-3]
 %else
-    movd mm %+ %%mmx_idx, [srcq+%2-3]
+    mov            valb, [srcq+%2-1]
+    ror            vald, 16
+    mov            valw, [srcq+%2-3]
 %endif
 %endif ; (%2-%%off) >= 1
 %endmacro ; READ_NUM_BYTES
@@ -240,15 +249,13 @@ hvar_fn
     mov     [dstq+%2-1], valb
 %elif (%2-%%off) == 2
     mov     [dstq+%2-2], valw
-%elifidn %1, body
-    mov     [dstq+%2-3], valw
-    shr            vald, 16
-    mov     [dstq+%2-1], valb
 %else
-    movd           vald, mm %+ %%mmx_idx
     mov     [dstq+%2-3], valw
-    shr            vald, 16
+    ror            vald, 16
     mov     [dstq+%2-1], valb
+%ifnidn %1, body
+    ror            vald, 16
+%endif
 %endif
 %endif ; (%2-%%off) >= 1
 %endmacro ; WRITE_NUM_BYTES
@@ -262,30 +269,30 @@ hvar_fn
 %rep 1+%2-%1
 %if %%n <= 3
 %if ARCH_X86_64
-cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, src, dst_stride, src_stride, \
+cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \
                                        start_y, end_y, val, bh
     mov             bhq, r6mp                   ; r6mp = bhmp
 %else ; x86-32
 cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh
     mov            dstq, r0mp
-    mov            srcq, r1mp
+    mov            srcq, r2mp
     mov        start_yq, r4mp
     mov          end_yq, r5mp
     mov             bhq, r6mp
-%define dst_strideq r2mp
+%define dst_strideq r1mp
 %define src_strideq r3mp
 %endif ; x86-64/32
 %else
 %if ARCH_X86_64
-cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, src, dst_stride, src_stride, \
+cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \
                                        start_y, end_y, bh
 %else ; x86-32
 cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh
-    mov            srcq, r1mp
+    mov            srcq, r2mp
     mov        start_yq, r4mp
     mov          end_yq, r5mp
     mov             bhq, r6mp
-%define dst_strideq r2mp
+%define dst_strideq r1mp
 %define src_strideq r3mp
 %endif ; x86-64/32
 %endif
@@ -344,9 +351,8 @@ VERTICAL_EXTEND 16, 22
 ; obviously not the same on both sides.
 
 %macro READ_V_PIXEL 2
-%if %1 == 2
-    movzx          valw, byte %2
-    imul           valw, 0x0101
+%if cpuflag(avx2)
+    vpbroadcastb     m0, %2
 %else
     movzx          vald, byte %2
     imul           vald, 0x01010101
@@ -356,13 +362,16 @@ VERTICAL_EXTEND 16, 22
     pshufd           m0, m0, q0000
 %else
     punpckldq        m0, m0
-%endif
-%endif ; %1 >= 8
-%endif
+%endif ; mmsize == 16
+%endif ; %1 > 16
+%endif ; avx2
 %endmacro ; READ_V_PIXEL
 
 %macro WRITE_V_PIXEL 2
 %assign %%off 0
+
+%if %1 >= 8
+
 %rep %1/mmsize
     movu     [%2+%%off], m0
 %assign %%off %%off+mmsize
@@ -378,34 +387,44 @@ VERTICAL_EXTEND 16, 22
 %assign %%off %%off+8
 %endif
 %endif ; %1-%%off >= 8
-%endif
+%endif ; mmsize == 16
 
 %if %1-%%off >= 4
 %if %1 > 8 && %1-%%off > 4
     movq      [%2+%1-8], m0
 %assign %%off %1
-%elif %1 >= 8 && %1-%%off >= 4
-    movd     [%2+%%off], m0
-%assign %%off %%off+4
 %else
-    mov      [%2+%%off], vald
+    movd     [%2+%%off], m0
 %assign %%off %%off+4
 %endif
 %endif ; %1-%%off >= 4
 
-%if %1-%%off >= 2
-%if %1 >= 8
-    movd      [%2+%1-4], m0
+%else ; %1 < 8
+
+%rep %1/4
+    mov      [%2+%%off], vald
+%assign %%off %%off+4
+%endrep ; %1/4
+
+%endif ; %1 >=/< 8
+
+%if %1-%%off == 2
+%if cpuflag(avx2)
+    movd     [%2+%%off-2], m0
 %else
     mov      [%2+%%off], valw
-%endif
+%endif ; avx2
 %endif ; (%1-%%off)/2
 %endmacro ; WRITE_V_PIXEL
 
 %macro H_EXTEND 2
 %assign %%n %1
 %rep 1+(%2-%1)/2
+%if cpuflag(avx2)
+cglobal emu_edge_hfix %+ %%n, 4, 4, 1, dst, dst_stride, start_x, bh
+%else
 cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val
+%endif
 .loop_y:                                        ; do {
     READ_V_PIXEL    %%n, [dstq+start_xq]        ;   $variable_regs = read($n)
     WRITE_V_PIXEL   %%n, dstq                   ;   write($variable_regs, $n)
@@ -426,6 +445,11 @@ H_EXTEND 16, 22
 INIT_XMM sse2
 H_EXTEND 16, 22
 
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+H_EXTEND 8, 22
+%endif
+
 %macro PREFETCH_FN 1
 cglobal prefetch, 3, 3, 0, buf, stride, h
 .loop:
diff --git a/libavcodec/x86/videodsp_init.c b/libavcodec/x86/videodsp_init.c
index 8ee8370..26e072b 100644
--- a/libavcodec/x86/videodsp_init.c
+++ b/libavcodec/x86/videodsp_init.c
@@ -1,25 +1,27 @@
 /*
+ * Copyright (C) 2002-2012 Michael Niedermayer
  * Copyright (C) 2012 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "config.h"
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/cpu.h"
 #include "libavutil/mem.h"
@@ -28,11 +30,11 @@
 #include "libavcodec/videodsp.h"
 
 #if HAVE_YASM
-typedef void emu_edge_vfix_func(uint8_t *dst, const uint8_t *src,
-                                x86_reg dst_stride, x86_reg src_stride,
+typedef void emu_edge_vfix_func(uint8_t *dst, x86_reg dst_stride,
+                                const uint8_t *src, x86_reg src_stride,
                                 x86_reg start_y, x86_reg end_y, x86_reg bh);
-typedef void emu_edge_vvar_func(uint8_t *dst, const uint8_t *src,
-                                x86_reg dst_stride, x86_reg src_stride,
+typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride,
+                                const uint8_t *src, x86_reg src_stride,
                                 x86_reg start_y, x86_reg end_y, x86_reg bh,
                                 x86_reg w);
 
@@ -59,7 +61,7 @@ extern emu_edge_vfix_func ff_emu_edge_vfix20_mmx;
 extern emu_edge_vfix_func ff_emu_edge_vfix21_mmx;
 extern emu_edge_vfix_func ff_emu_edge_vfix22_mmx;
 #if ARCH_X86_32
-static emu_edge_vfix_func *vfixtbl_mmx[22] = {
+static emu_edge_vfix_func * const vfixtbl_mmx[22] = {
     &ff_emu_edge_vfix1_mmx,  &ff_emu_edge_vfix2_mmx,  &ff_emu_edge_vfix3_mmx,
     &ff_emu_edge_vfix4_mmx,  &ff_emu_edge_vfix5_mmx,  &ff_emu_edge_vfix6_mmx,
     &ff_emu_edge_vfix7_mmx,  &ff_emu_edge_vfix8_mmx,  &ff_emu_edge_vfix9_mmx,
@@ -78,7 +80,7 @@ extern emu_edge_vfix_func ff_emu_edge_vfix19_sse;
 extern emu_edge_vfix_func ff_emu_edge_vfix20_sse;
 extern emu_edge_vfix_func ff_emu_edge_vfix21_sse;
 extern emu_edge_vfix_func ff_emu_edge_vfix22_sse;
-static emu_edge_vfix_func *vfixtbl_sse[22] = {
+static emu_edge_vfix_func * const vfixtbl_sse[22] = {
     ff_emu_edge_vfix1_mmx,  ff_emu_edge_vfix2_mmx,  ff_emu_edge_vfix3_mmx,
     ff_emu_edge_vfix4_mmx,  ff_emu_edge_vfix5_mmx,  ff_emu_edge_vfix6_mmx,
     ff_emu_edge_vfix7_mmx,  ff_emu_edge_vfix8_mmx,  ff_emu_edge_vfix9_mmx,
@@ -107,7 +109,7 @@ extern emu_edge_hfix_func ff_emu_edge_hfix18_mmx;
 extern emu_edge_hfix_func ff_emu_edge_hfix20_mmx;
 extern emu_edge_hfix_func ff_emu_edge_hfix22_mmx;
 #if ARCH_X86_32
-static emu_edge_hfix_func *hfixtbl_mmx[11] = {
+static emu_edge_hfix_func * const hfixtbl_mmx[11] = {
     ff_emu_edge_hfix2_mmx,  ff_emu_edge_hfix4_mmx,  ff_emu_edge_hfix6_mmx,
     ff_emu_edge_hfix8_mmx,  ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
     ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_mmx, ff_emu_edge_hfix18_mmx,
@@ -119,13 +121,30 @@ extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2;
 extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2;
 extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2;
 extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2;
-static emu_edge_hfix_func *hfixtbl_sse2[11] = {
+static emu_edge_hfix_func * const hfixtbl_sse2[11] = {
     ff_emu_edge_hfix2_mmx,  ff_emu_edge_hfix4_mmx,  ff_emu_edge_hfix6_mmx,
     ff_emu_edge_hfix8_mmx,  ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
     ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2,
     ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2
 };
 extern emu_edge_hvar_func ff_emu_edge_hvar_sse2;
+#if HAVE_AVX2_EXTERNAL
+extern emu_edge_hfix_func ff_emu_edge_hfix8_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix10_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix12_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix14_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix16_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix18_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix20_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix22_avx2;
+static emu_edge_hfix_func * const hfixtbl_avx2[11] = { /* hfix2 .. hfix22, even sizes only */
+    ff_emu_edge_hfix2_mmx,  ff_emu_edge_hfix4_mmx,  ff_emu_edge_hfix6_mmx, /* no AVX2 versions declared for 2-6: MMX ones used */
+    ff_emu_edge_hfix8_avx2,  ff_emu_edge_hfix10_avx2, ff_emu_edge_hfix12_avx2,
+    ff_emu_edge_hfix14_avx2, ff_emu_edge_hfix16_avx2, ff_emu_edge_hfix18_avx2,
+    ff_emu_edge_hfix20_avx2, ff_emu_edge_hfix22_avx2
+};
+extern emu_edge_hvar_func ff_emu_edge_hvar_avx2;
+#endif
 
 static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src,
                                               ptrdiff_t dst_stride,
@@ -133,22 +152,26 @@ static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src,
                                               x86_reg block_w, x86_reg block_h,
                                               x86_reg src_x, x86_reg src_y,
                                               x86_reg w, x86_reg h,
-                                              emu_edge_vfix_func **vfix_tbl,
+                                              emu_edge_vfix_func * const *vfix_tbl,
                                               emu_edge_vvar_func *v_extend_var,
-                                              emu_edge_hfix_func **hfix_tbl,
+                                              emu_edge_hfix_func * const *hfix_tbl,
                                               emu_edge_hvar_func *h_extend_var)
 {
     x86_reg start_y, start_x, end_y, end_x, src_y_add = 0, p;
 
     if (!w || !h)
-         return;
+        return;
+
+    av_assert2(block_w <= FFABS(dst_stride));
 
     if (src_y >= h) {
-        src  -= src_y * src_stride;
-        src_y = src_y_add = h - 1;
+        src -= src_y*src_stride;
+        src_y_add = h - 1;
+        src_y     = h - 1;
     } else if (src_y <= -block_h) {
-        src  -= src_y*src_stride;
-        src_y = src_y_add = 1 - block_h;
+        src -= src_y*src_stride;
+        src_y_add = 1 - block_h;
+        src_y     = 1 - block_h;
     }
     if (src_x >= w) {
         src   += w - 1 - src_x;
@@ -162,18 +185,17 @@ static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src,
     start_x = FFMAX(0, -src_x);
     end_y   = FFMIN(block_h, h-src_y);
     end_x   = FFMIN(block_w, w-src_x);
-    assert(start_x < end_x && block_w > 0);
-    assert(start_y < end_y && block_h > 0);
+    av_assert2(start_x < end_x && block_w > 0);
+    av_assert2(start_y < end_y && block_h > 0);
 
     // fill in the to-be-copied part plus all above/below
     src += (src_y_add + start_y) * src_stride + start_x;
     w = end_x - start_x;
     if (w <= 22) {
-        vfix_tbl[w - 1](dst + start_x, src,
-                        dst_stride, src_stride,
+        vfix_tbl[w - 1](dst + start_x, dst_stride, src, src_stride,
                         start_y, end_y, block_h);
     } else {
-        v_extend_var(dst + start_x, src, dst_stride, src_stride,
+        v_extend_var(dst + start_x, dst_stride, src, src_stride,
                      start_y, end_y, block_h, w);
     }
 
@@ -212,7 +234,7 @@ static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
                      hfixtbl_mmx, &ff_emu_edge_hvar_mmx);
 }
 
-static av_noinline void emulated_edge_mc_sse(uint8_t * buf,const uint8_t *src,
+static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
                                              ptrdiff_t buf_stride,
                                              ptrdiff_t src_stride,
                                              int block_w, int block_h,
@@ -231,10 +253,24 @@ static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src,
                                               int src_x, int src_y, int w,
                                               int h)
 {
-    emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, src_x,
-                     src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
+    emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
+                     src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
                      hfixtbl_sse2, &ff_emu_edge_hvar_sse2);
 }
+
+#if HAVE_AVX2_EXTERNAL
+static av_noinline void emulated_edge_mc_avx2(uint8_t *buf, const uint8_t *src,
+                                              ptrdiff_t buf_stride,
+                                              ptrdiff_t src_stride,
+                                              int block_w, int block_h,
+                                              int src_x, int src_y, int w,
+                                              int h)
+{ /* v-extend deliberately reuses the SSE functions; only h-extend has AVX2 versions */
+    emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
+                     src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
+                     hfixtbl_avx2, &ff_emu_edge_hvar_avx2);
+}
+#endif /* HAVE_AVX2_EXTERNAL */
 #endif /* HAVE_YASM */
 
 void ff_prefetch_mmxext(uint8_t *buf, ptrdiff_t stride, int h);
@@ -264,5 +300,10 @@ av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc)
     if (EXTERNAL_SSE2(cpu_flags) && bpc <= 8) {
         ctx->emulated_edge_mc = emulated_edge_mc_sse2;
     }
+#if HAVE_AVX2_EXTERNAL
+    if (EXTERNAL_AVX2(cpu_flags) && bpc <= 8) {
+        ctx->emulated_edge_mc = emulated_edge_mc_avx2;
+    }
+#endif
 #endif /* HAVE_YASM */
 }
diff --git a/libavcodec/x86/vorbisdsp.asm b/libavcodec/x86/vorbisdsp.asm
index c54650e..b25d838 100644
--- a/libavcodec/x86/vorbisdsp.asm
+++ b/libavcodec/x86/vorbisdsp.asm
@@ -2,20 +2,20 @@
 ;* Vorbis x86 optimizations
 ;* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/vorbisdsp_init.c b/libavcodec/x86/vorbisdsp_init.c
index bbd8319..bc1cc43 100644
--- a/libavcodec/x86/vorbisdsp_init.c
+++ b/libavcodec/x86/vorbisdsp_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index fc8a047..d457cd7 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -2,20 +2,20 @@
 ;* MMX/SSE2-optimized functions for the VP3 decoder
 ;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -40,6 +40,7 @@ pb_81: times 8 db 0x81
 cextern pb_1
 cextern pb_3
 cextern pb_80
+cextern pb_FE
 
 cextern pw_8
 
@@ -147,6 +148,49 @@ cglobal vp3_h_loop_filter, 3, 4
     STORE_4_WORDS m3
     RET
 
+%macro PAVGB_NO_RND 0 ; in: m0,m1 and m2,m3 (two byte-vector pairs), m6 = mask; out: m4, m5
+    mova   m4, m0      ; work on copies so m0 and m2 stay intact
+    mova   m5, m2
+    pand   m4, m1      ; m4 = m0 & m1
+    pand   m5, m3      ; m5 = m2 & m3
+    pxor   m1, m0      ; m1 = m0 ^ m1 (m1 is clobbered)
+    pxor   m3, m2      ; m3 = m2 ^ m3 (m3 is clobbered)
+    pand   m1, m6      ; mask before the 64-bit shift; presumably m6 = 0xFE per byte (pb_FE) so no bit leaks into the next byte -- confirm
+    pand   m3, m6
+    psrlq  m1, 1       ; m1 = ((m0 ^ m1) & m6) >> 1
+    psrlq  m3, 1
+    paddb  m4, m1      ; m4 = (m0 & m1) + (((m0 ^ m1) & m6) >> 1); with a 0xFE mask this is the per-byte average rounded down
+    paddb  m5, m3      ; m5 = same combination of m2 and m3
+%endmacro
+
+INIT_MMX mmx
+cglobal put_vp_no_rnd_pixels8_l2, 5, 6, 0, dst, src1, src2, stride, h, stride3 ; dst = PAVGB_NO_RND(src1, src2), four rows per loop pass
+    mova   m6, [pb_FE]                  ; mask consumed by PAVGB_NO_RND
+    lea    stride3q,[strideq+strideq*2] ; stride3 = 3 * stride
+.loop:                                  ; one stride is shared by dst, src1 and src2
+    mova   m0, [src1q]                  ; row 0 of src1
+    mova   m1, [src2q]                  ; row 0 of src2
+    mova   m2, [src1q+strideq]          ; row 1 of src1
+    mova   m3, [src2q+strideq]          ; row 1 of src2
+    PAVGB_NO_RND                        ; m4 = combined row 0, m5 = combined row 1
+    mova   [dstq], m4
+    mova   [dstq+strideq], m5
+
+    mova   m0, [src1q+strideq*2]        ; row 2 of src1
+    mova   m1, [src2q+strideq*2]        ; row 2 of src2
+    mova   m2, [src1q+stride3q]         ; row 3 of src1
+    mova   m3, [src2q+stride3q]         ; row 3 of src2
+    PAVGB_NO_RND                        ; m4 = combined row 2, m5 = combined row 3
+    mova   [dstq+strideq*2], m4
+    mova   [dstq+stride3q],  m5
+
+    lea    src1q, [src1q+strideq*4]     ; advance all three pointers by four rows
+    lea    src2q, [src2q+strideq*4]
+    lea    dstq,  [dstq+strideq*4]
+    sub    hd, 4                        ; four rows consumed
+    jnz .loop                           ; exits only when h hits exactly 0, so h must be a positive multiple of 4
+    RET
+
 ; from original comments: The Macro does IDct on 4 1-D Dcts
 %macro BeginIDCT 0
     movq          m2, I(3)
diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c
index b320dc5..2ece9ab 100644
--- a/libavcodec/x86/vp3dsp_init.c
+++ b/libavcodec/x86/vp3dsp_init.c
@@ -1,18 +1,20 @@
 /*
- * This file is part of Libav.
+ * Copyright (c) 2009 David Conrad <lessen42@gmail.com>
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,7 +25,6 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/vp3dsp.h"
-#include "config.h"
 
 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block);
 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block);
@@ -39,16 +40,21 @@ void ff_vp3_v_loop_filter_mmxext(uint8_t *src, int stride,
 void ff_vp3_h_loop_filter_mmxext(uint8_t *src, int stride,
                                  int *bounding_values);
 
+void ff_put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a,
+                                     const uint8_t *b, ptrdiff_t stride,
+                                     int h);
+
 av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
 {
     int cpu_flags = av_get_cpu_flags();
 
-#if ARCH_X86_32
     if (EXTERNAL_MMX(cpu_flags)) {
+        c->put_no_rnd_pixels_l2 = ff_put_vp_no_rnd_pixels8_l2_mmx;
+#if ARCH_X86_32
         c->idct_put  = ff_vp3_idct_put_mmx;
         c->idct_add  = ff_vp3_idct_add_mmx;
-    }
 #endif
+    }
 
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         c->idct_dc_add = ff_vp3_idct_dc_add_mmxext;
diff --git a/libavcodec/x86/vp56_arith.h b/libavcodec/x86/vp56_arith.h
index 0a69368..810cc8d 100644
--- a/libavcodec/x86/vp56_arith.h
+++ b/libavcodec/x86/vp56_arith.h
@@ -4,49 +4,46 @@
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  * Copyright (C) 2010  Eli Friedman
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_X86_VP56_ARITH_H
 #define AVCODEC_X86_VP56_ARITH_H
 
-#if HAVE_INLINE_ASM && HAVE_FAST_CMOV
+#if HAVE_INLINE_ASM && HAVE_FAST_CMOV && HAVE_6REGS
 #define vp56_rac_get_prob vp56_rac_get_prob
 static av_always_inline int vp56_rac_get_prob(VP56RangeCoder *c, uint8_t prob)
 {
     unsigned int code_word = vp56_rac_renorm(c);
-    unsigned int high = c->high;
-    unsigned int low = 1 + (((high - 1) * prob) >> 8);
+    unsigned int low = 1 + (((c->high - 1) * prob) >> 8); /* split value, computed directly from c->high and prob */
     unsigned int low_shift = low << 16;
     int bit = 0;
+    c->code_word = code_word; /* store first: the asm updates c->code_word in place as operand %2 */
 
     __asm__(
         "subl  %4, %1      \n\t"
         "subl  %3, %2      \n\t"
-        "leal (%2, %3), %3 \n\t"
         "setae %b0         \n\t"
         "cmovb %4, %1      \n\t"
-        "cmovb %3, %2      \n\t"
-        : "+q"(bit), "+r"(high), "+r"(code_word), "+r"(low_shift)
-        : "r"(low)
+        "cmovb %5, %2      \n\t" /* if the second subl borrowed, put back the unsubtracted code word held in %5 */
+        : "+q"(bit), "+&r"(c->high), "+&r"(c->code_word) /* %0, %1, %2: read-write; %1 and %2 are early-clobber */
+        : "r"(low_shift), "r"(low), "r"(code_word) /* %3, %4, %5: read-only inputs */
     );
 
-    c->high      = high;
-    c->code_word = code_word;
     return bit;
 }
 #endif
diff --git a/libavcodec/x86/vp6dsp.asm b/libavcodec/x86/vp6dsp.asm
index 80f8ca5..3d874ea 100644
--- a/libavcodec/x86/vp6dsp.asm
+++ b/libavcodec/x86/vp6dsp.asm
@@ -3,20 +3,20 @@
 ;* Copyright (C) 2009  Sebastien Lucas <sebastien.lucas@gmail.com>
 ;* Copyright (C) 2009  Zuxy Meng <zuxy.meng@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/vp6dsp_init.c b/libavcodec/x86/vp6dsp_init.c
index cd94f3e..82baee7 100644
--- a/libavcodec/x86/vp6dsp_init.c
+++ b/libavcodec/x86/vp6dsp_init.c
@@ -3,20 +3,20 @@
  * Copyright (C) 2009  Sebastien Lucas <sebastien.lucas@gmail.com>
  * Copyright (C) 2009  Zuxy Meng <zuxy.meng@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index adc9730..538b3f4 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -3,20 +3,20 @@
 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
 ;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -143,13 +143,13 @@ filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11,  7, 12
 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,  7, 7,  8,  8,  9
 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,  9, 9, 10, 10, 11
 
-pw_256:   times 8 dw 256
 pw_20091: times 4 dw 20091
 pw_17734: times 4 dw 17734
 
 cextern pw_3
 cextern pw_4
 cextern pw_64
+cextern pw_256
 
 SECTION .text
 
diff --git a/libavcodec/x86/vp8dsp_init.c b/libavcodec/x86/vp8dsp_init.c
index e5afd49..8d5d033 100644
--- a/libavcodec/x86/vp8dsp_init.c
+++ b/libavcodec/x86/vp8dsp_init.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
  * Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -169,7 +169,7 @@ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT
     uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
     ptrdiff_t srcstride, int height, int mx, int my) \
 { \
-    DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + TAPNUMY - 1)]; \
+    LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + TAPNUMY - 1)]); \
     uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \
     src -= srcstride * (TAPNUMY / 2 - 1); \
     ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \
@@ -214,7 +214,7 @@ static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
     uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
     ptrdiff_t srcstride, int height, int mx, int my) \
 { \
-    DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + 2)]; \
+    LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + 2)]); \
     ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \
         tmp, SIZE,      src, srcstride, height + 1, mx, my); \
     ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \
@@ -347,7 +347,7 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
         c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
     }
 
-    if (EXTERNAL_SSE2(cpu_flags) && (cpu_flags & AV_CPU_FLAG_SSE2SLOW)) {
+    if (HAVE_SSE2_EXTERNAL && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) {
         VP8_LUMA_MC_FUNC(0, 16, sse2);
         VP8_MC_FUNC(1, 8, sse2);
         VP8_BILINEAR_MC_FUNC(0, 16, sse2);
@@ -417,7 +417,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
         c->vp8_luma_dc_wht                      = ff_vp8_luma_dc_wht_sse;
     }
 
-    if (EXTERNAL_SSE2(cpu_flags) && (cpu_flags & AV_CPU_FLAG_SSE2SLOW)) {
+    if (HAVE_SSE2_EXTERNAL && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) {
         c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
 
         c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
@@ -430,7 +430,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->vp8_idct_dc_add4y          = ff_vp8_idct_dc_add4y_sse2;
 
-        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
+        c->vp8_h_loop_filter_simple   = ff_vp8_h_loop_filter_simple_sse2;
 
         c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
         c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
@@ -455,7 +455,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
     }
 
     if (EXTERNAL_SSE4(cpu_flags)) {
-        c->vp8_idct_dc_add                  = ff_vp8_idct_dc_add_sse4;
+        c->vp8_idct_dc_add            = ff_vp8_idct_dc_add_sse4;
 
         c->vp8_h_loop_filter_simple   = ff_vp8_h_loop_filter_simple_sse4;
         c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_sse4;
diff --git a/libavcodec/x86/vp8dsp_loopfilter.asm b/libavcodec/x86/vp8dsp_loopfilter.asm
index 5d792e8..98bb669 100644
--- a/libavcodec/x86/vp8dsp_loopfilter.asm
+++ b/libavcodec/x86/vp8dsp_loopfilter.asm
@@ -3,20 +3,20 @@
 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
 ;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/vp9dsp.asm b/libavcodec/x86/vp9dsp.asm
deleted file mode 100644
index 6488f30..0000000
--- a/libavcodec/x86/vp9dsp.asm
+++ /dev/null
@@ -1,277 +0,0 @@
-;******************************************************************************
-;* VP9 SIMD optimizations
-;*
-;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
-;*
-;* This file is part of Libav.
-;*
-;* Libav is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* Libav is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA
-
-; FIXME share with vp8dsp.asm
-pw_256:   times 8 dw 256
-
-%macro F8_TAPS 8
-times 8 db %1, %2
-times 8 db %3, %4
-times 8 db %5, %6
-times 8 db %7, %8
-%endmacro
-; int8_t ff_filters_ssse3[3][15][4][16]
-const filters_ssse3 ; smooth
-                    F8_TAPS -3, -1,  32,  64,  38,   1, -3,  0
-                    F8_TAPS -2, -2,  29,  63,  41,   2, -3,  0
-                    F8_TAPS -2, -2,  26,  63,  43,   4, -4,  0
-                    F8_TAPS -2, -3,  24,  62,  46,   5, -4,  0
-                    F8_TAPS -2, -3,  21,  60,  49,   7, -4,  0
-                    F8_TAPS -1, -4,  18,  59,  51,   9, -4,  0
-                    F8_TAPS -1, -4,  16,  57,  53,  12, -4, -1
-                    F8_TAPS -1, -4,  14,  55,  55,  14, -4, -1
-                    F8_TAPS -1, -4,  12,  53,  57,  16, -4, -1
-                    F8_TAPS  0, -4,   9,  51,  59,  18, -4, -1
-                    F8_TAPS  0, -4,   7,  49,  60,  21, -3, -2
-                    F8_TAPS  0, -4,   5,  46,  62,  24, -3, -2
-                    F8_TAPS  0, -4,   4,  43,  63,  26, -2, -2
-                    F8_TAPS  0, -3,   2,  41,  63,  29, -2, -2
-                    F8_TAPS  0, -3,   1,  38,  64,  32, -1, -3
-                    ; regular
-                    F8_TAPS  0,  1,  -5, 126,   8,  -3,  1,  0
-                    F8_TAPS -1,  3, -10, 122,  18,  -6,  2,  0
-                    F8_TAPS -1,  4, -13, 118,  27,  -9,  3, -1
-                    F8_TAPS -1,  4, -16, 112,  37, -11,  4, -1
-                    F8_TAPS -1,  5, -18, 105,  48, -14,  4, -1
-                    F8_TAPS -1,  5, -19,  97,  58, -16,  5, -1
-                    F8_TAPS -1,  6, -19,  88,  68, -18,  5, -1
-                    F8_TAPS -1,  6, -19,  78,  78, -19,  6, -1
-                    F8_TAPS -1,  5, -18,  68,  88, -19,  6, -1
-                    F8_TAPS -1,  5, -16,  58,  97, -19,  5, -1
-                    F8_TAPS -1,  4, -14,  48, 105, -18,  5, -1
-                    F8_TAPS -1,  4, -11,  37, 112, -16,  4, -1
-                    F8_TAPS -1,  3,  -9,  27, 118, -13,  4, -1
-                    F8_TAPS  0,  2,  -6,  18, 122, -10,  3, -1
-                    F8_TAPS  0,  1,  -3,   8, 126,  -5,  1,  0
-                    ; sharp
-                    F8_TAPS -1,  3,  -7, 127,   8,  -3,  1,  0
-                    F8_TAPS -2,  5, -13, 125,  17,  -6,  3, -1
-                    F8_TAPS -3,  7, -17, 121,  27, -10,  5, -2
-                    F8_TAPS -4,  9, -20, 115,  37, -13,  6, -2
-                    F8_TAPS -4, 10, -23, 108,  48, -16,  8, -3
-                    F8_TAPS -4, 10, -24, 100,  59, -19,  9, -3
-                    F8_TAPS -4, 11, -24,  90,  70, -21, 10, -4
-                    F8_TAPS -4, 11, -23,  80,  80, -23, 11, -4
-                    F8_TAPS -4, 10, -21,  70,  90, -24, 11, -4
-                    F8_TAPS -3,  9, -19,  59, 100, -24, 10, -4
-                    F8_TAPS -3,  8, -16,  48, 108, -23, 10, -4
-                    F8_TAPS -2,  6, -13,  37, 115, -20,  9, -4
-                    F8_TAPS -2,  5, -10,  27, 121, -17,  7, -3
-                    F8_TAPS -1,  3,  -6,  17, 125, -13,  5, -2
-                    F8_TAPS  0,  1,  -3,   8, 127,  -7,  3, -1
-
-SECTION .text
-
-%macro filter_h_fn 1
-%assign %%px mmsize/2
-cglobal %1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, src, dstride, sstride, h, filtery
-    mova        m6, [pw_256]
-    mova        m7, [filteryq+ 0]
-%if ARCH_X86_64 && mmsize > 8
-    mova        m8, [filteryq+16]
-    mova        m9, [filteryq+32]
-    mova       m10, [filteryq+48]
-%endif
-.loop:
-    movh        m0, [srcq-3]
-    movh        m1, [srcq-2]
-    movh        m2, [srcq-1]
-    movh        m3, [srcq+0]
-    movh        m4, [srcq+1]
-    movh        m5, [srcq+2]
-    punpcklbw   m0, m1
-    punpcklbw   m2, m3
-    movh        m1, [srcq+3]
-    movh        m3, [srcq+4]
-    add       srcq, sstrideq
-    punpcklbw   m4, m5
-    punpcklbw   m1, m3
-    pmaddubsw   m0, m7
-%if ARCH_X86_64 && mmsize > 8
-    pmaddubsw   m2, m8
-    pmaddubsw   m4, m9
-    pmaddubsw   m1, m10
-%else
-    pmaddubsw   m2, [filteryq+16]
-    pmaddubsw   m4, [filteryq+32]
-    pmaddubsw   m1, [filteryq+48]
-%endif
-    paddw       m0, m2
-    paddw       m4, m1
-    paddsw      m0, m4
-    pmulhrsw    m0, m6
-%ifidn %1, avg
-    movh        m1, [dstq]
-%endif
-    packuswb    m0, m0
-%ifidn %1, avg
-    pavgb       m0, m1
-%endif
-    movh    [dstq], m0
-    add       dstq, dstrideq
-    dec         hd
-    jg .loop
-    RET
-%endmacro
-
-INIT_MMX ssse3
-filter_h_fn put
-filter_h_fn avg
-
-INIT_XMM ssse3
-filter_h_fn put
-filter_h_fn avg
-
-%macro filter_v_fn 1
-%assign %%px mmsize/2
-%if ARCH_X86_64
-cglobal %1_8tap_1d_v_ %+ %%px, 6, 8, 11, dst, src, dstride, sstride, h, filtery, src4, sstride3
-%else
-cglobal %1_8tap_1d_v_ %+ %%px, 4, 7, 11, dst, src, dstride, sstride, filtery, src4, sstride3
-    mov   filteryq, r5mp
-%define hd r4mp
-%endif
-    sub       srcq, sstrideq
-    lea  sstride3q, [sstrideq*3]
-    sub       srcq, sstrideq
-    mova        m6, [pw_256]
-    sub       srcq, sstrideq
-    mova        m7, [filteryq+ 0]
-    lea      src4q, [srcq+sstrideq*4]
-%if ARCH_X86_64 && mmsize > 8
-    mova        m8, [filteryq+16]
-    mova        m9, [filteryq+32]
-    mova       m10, [filteryq+48]
-%endif
-.loop:
-    ; FIXME maybe reuse loads from previous rows, or just more generally
-    ; unroll this to prevent multiple loads of the same data?
-    movh        m0, [srcq]
-    movh        m1, [srcq+sstrideq]
-    movh        m2, [srcq+sstrideq*2]
-    movh        m3, [srcq+sstride3q]
-    movh        m4, [src4q]
-    movh        m5, [src4q+sstrideq]
-    punpcklbw   m0, m1
-    punpcklbw   m2, m3
-    movh        m1, [src4q+sstrideq*2]
-    movh        m3, [src4q+sstride3q]
-    add       srcq, sstrideq
-    add      src4q, sstrideq
-    punpcklbw   m4, m5
-    punpcklbw   m1, m3
-    pmaddubsw   m0, m7
-%if ARCH_X86_64 && mmsize > 8
-    pmaddubsw   m2, m8
-    pmaddubsw   m4, m9
-    pmaddubsw   m1, m10
-%else
-    pmaddubsw   m2, [filteryq+16]
-    pmaddubsw   m4, [filteryq+32]
-    pmaddubsw   m1, [filteryq+48]
-%endif
-    paddw       m0, m2
-    paddw       m4, m1
-    paddsw      m0, m4
-    pmulhrsw    m0, m6
-%ifidn %1, avg
-    movh        m1, [dstq]
-%endif
-    packuswb    m0, m0
-%ifidn %1, avg
-    pavgb       m0, m1
-%endif
-    movh    [dstq], m0
-    add       dstq, dstrideq
-    dec         hd
-    jg .loop
-    RET
-%endmacro
-
-INIT_MMX ssse3
-filter_v_fn put
-filter_v_fn avg
-
-INIT_XMM ssse3
-filter_v_fn put
-filter_v_fn avg
-
-%macro fpel_fn 6
-%if %2 == 4
-%define %%srcfn movh
-%define %%dstfn movh
-%else
-%define %%srcfn movu
-%define %%dstfn mova
-%endif
-
-%if %2 <= 16
-cglobal %1%2, 5, 7, 4, dst, src, dstride, sstride, h, dstride3, sstride3
-    lea  sstride3q, [sstrideq*3]
-    lea  dstride3q, [dstrideq*3]
-%else
-cglobal %1%2, 5, 5, 4, dst, src, dstride, sstride, h
-%endif
-.loop:
-    %%srcfn     m0, [srcq]
-    %%srcfn     m1, [srcq+s%3]
-    %%srcfn     m2, [srcq+s%4]
-    %%srcfn     m3, [srcq+s%5]
-    lea       srcq, [srcq+sstrideq*%6]
-%ifidn %1, avg
-    pavgb       m0, [dstq]
-    pavgb       m1, [dstq+d%3]
-    pavgb       m2, [dstq+d%4]
-    pavgb       m3, [dstq+d%5]
-%endif
-    %%dstfn [dstq], m0
-    %%dstfn [dstq+d%3], m1
-    %%dstfn [dstq+d%4], m2
-    %%dstfn [dstq+d%5], m3
-    lea       dstq, [dstq+dstrideq*%6]
-    sub         hd, %6
-    jnz .loop
-    RET
-%endmacro
-
-%define d16 16
-%define s16 16
-INIT_MMX mmx
-fpel_fn put, 4,  strideq, strideq*2, stride3q, 4
-fpel_fn put, 8,  strideq, strideq*2, stride3q, 4
-INIT_MMX sse
-fpel_fn avg, 4,  strideq, strideq*2, stride3q, 4
-fpel_fn avg, 8,  strideq, strideq*2, stride3q, 4
-INIT_XMM sse
-fpel_fn put, 16, strideq, strideq*2, stride3q, 4
-fpel_fn put, 32, mmsize,  strideq,   strideq+mmsize, 2
-fpel_fn put, 64, mmsize,  mmsize*2,  mmsize*3, 1
-INIT_XMM sse2
-fpel_fn avg, 16, strideq, strideq*2, stride3q, 4
-fpel_fn avg, 32, mmsize,  strideq,   strideq+mmsize, 2
-fpel_fn avg, 64, mmsize,  mmsize*2,  mmsize*3, 1
-%undef s16
-%undef d16
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index ce58c08..469a661 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -1,240 +1,394 @@
 /*
  * VP9 SIMD optimizations
  *
- * Copyright (c) 2013 Ronald S. Bultje <rsbultje@gmail.com>
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
-#include "libavutil/internal.h"
 #include "libavutil/mem.h"
-#include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
-#include "libavcodec/vp9.h"
+#include "libavcodec/vp9dsp.h"
+#include "libavcodec/x86/vp9dsp_init.h"
 
 #if HAVE_YASM
 
-#define fpel_func(avg, sz, opt)                                         \
-void ff_ ## avg ## sz ## _ ## opt(uint8_t *dst, const uint8_t *src,     \
-                                  ptrdiff_t dst_stride,                 \
-                                  ptrdiff_t src_stride,                 \
-                                  int h, int mx, int my)
-
-fpel_func(put,  4, mmx);
-fpel_func(put,  8, mmx);
-fpel_func(put, 16, sse);
-fpel_func(put, 32, sse);
-fpel_func(put, 64, sse);
-fpel_func(avg,  4, sse);
-fpel_func(avg,  8, sse);
-fpel_func(avg, 16, sse2);
-fpel_func(avg, 32, sse2);
-fpel_func(avg, 64, sse2);
-#undef fpel_func
-
-#define mc_func(avg, sz, dir, opt)                                          \
-void                                                                        \
-ff_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst,         \
-                                                      const uint8_t *src,   \
-                                                      ptrdiff_t dst_stride, \
-                                                      ptrdiff_t src_stride, \
-                                                      int h,                \
-                                                      const int8_t (*filter)[16])
-
-#define mc_funcs(sz)            \
-    mc_func(put, sz, h, ssse3); \
-    mc_func(avg, sz, h, ssse3); \
-    mc_func(put, sz, v, ssse3); \
-    mc_func(avg, sz, v, ssse3)
-
-mc_funcs(4);
-mc_funcs(8);
-
-#undef mc_funcs
-#undef mc_func
-
-#define mc_rep_func(avg, sz, hsz, dir, opt)                                 \
-static av_always_inline void                                                \
-ff_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst,         \
-                                                      const uint8_t *src,   \
-                                                      ptrdiff_t dst_stride, \
-                                                      ptrdiff_t src_stride, \
-                                                      int h,                \
-                                                      const int8_t (*filter)[16]) \
-{                                                                           \
-    ff_ ## avg ## _8tap_1d_ ## dir ## _ ## hsz ## _ ## opt(dst, src,        \
-                                                           dst_stride,      \
-                                                           src_stride,      \
-                                                           h,               \
-                                                           filter);         \
-    ff_ ## avg ## _8tap_1d_ ## dir ## _ ## hsz ## _ ## opt(dst + hsz,       \
-                                                           src + hsz,       \
-                                                           dst_stride,      \
-                                                           src_stride,      \
-                                                           h, filter);      \
-}
-
-#define mc_rep_funcs(sz, hsz)            \
-    mc_rep_func(put, sz, hsz, h, ssse3); \
-    mc_rep_func(avg, sz, hsz, h, ssse3); \
-    mc_rep_func(put, sz, hsz, v, ssse3); \
-    mc_rep_func(avg, sz, hsz, v, ssse3)
-
-mc_rep_funcs(16, 8);
-mc_rep_funcs(32, 16);
-mc_rep_funcs(64, 32);
-
-#undef mc_rep_funcs
-#undef mc_rep_func
-
-extern const int8_t ff_filters_ssse3[3][15][4][16];
-
-#define filter_8tap_2d_fn(op, sz, f, fname)                             \
-static void                                                             \
-op ## _8tap_ ## fname ## _ ## sz ## hv_ssse3(uint8_t *dst,              \
-                                             const uint8_t *src,        \
-                                             ptrdiff_t dst_stride,      \
-                                             ptrdiff_t src_stride,      \
-                                             int h, int mx, int my)     \
-{                                                                       \
-    LOCAL_ALIGNED_16(uint8_t, temp, [71 * 64]);                         \
-    ff_put_8tap_1d_h_ ## sz ## _ssse3(temp, src - 3 * src_stride,       \
-                                      64, src_stride,                   \
-                                      h + 7,                            \
-                                      ff_filters_ssse3[f][mx - 1]);     \
-    ff_ ## op ## _8tap_1d_v_ ## sz ## _ssse3(dst, temp + 3 * 64,        \
-                                             dst_stride, 64,            \
-                                             h,                         \
-                                             ff_filters_ssse3[f][my - 1]); \
-}
-
-#define filters_8tap_2d_fn(op, sz)                          \
-    filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, regular) \
-    filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, sharp)     \
-    filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth)
-
-#define filters_8tap_2d_fn2(op) \
-    filters_8tap_2d_fn(op, 64)  \
-    filters_8tap_2d_fn(op, 32)  \
-    filters_8tap_2d_fn(op, 16)  \
-    filters_8tap_2d_fn(op, 8)   \
-    filters_8tap_2d_fn(op, 4)
-
-filters_8tap_2d_fn2(put)
-filters_8tap_2d_fn2(avg)
-
-#undef filters_8tap_2d_fn2
-#undef filters_8tap_2d_fn
-#undef filter_8tap_2d_fn
-
-#define filter_8tap_1d_fn(op, sz, f, fname, dir, dvar)                  \
-static void                                                             \
-op ## _8tap_ ## fname ## _ ## sz ## dir ## _ssse3(uint8_t *dst,         \
-                                                  const uint8_t *src,   \
-                                                  ptrdiff_t dst_stride, \
-                                                  ptrdiff_t src_stride, \
-                                                  int h, int mx,        \
-                                                  int my)               \
-{                                                                       \
-    ff_ ## op ## _8tap_1d_ ## dir ## _ ## sz ## _ssse3(dst, src,        \
-                                                       dst_stride,      \
-                                                       src_stride, h,   \
-                                                       ff_filters_ssse3[f][dvar - 1]); \
-}
-
-#define filters_8tap_1d_fn(op, sz, dir, dvar)                          \
-    filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, regular, dir, dvar) \
-    filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, sharp, dir, dvar)     \
-    filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth, dir, dvar)
-
-#define filters_8tap_1d_fn2(op, sz)             \
-    filters_8tap_1d_fn(op, sz, h, mx)           \
-    filters_8tap_1d_fn(op, sz, v, my)
-
-#define filters_8tap_1d_fn3(op) \
-    filters_8tap_1d_fn2(op, 64) \
-    filters_8tap_1d_fn2(op, 32) \
-    filters_8tap_1d_fn2(op, 16) \
-    filters_8tap_1d_fn2(op,  8) \
-    filters_8tap_1d_fn2(op,  4)
-
-filters_8tap_1d_fn3(put)
-filters_8tap_1d_fn3(avg)
-
-#undef filters_8tap_1d_fn
-#undef filters_8tap_1d_fn2
-#undef filters_8tap_1d_fn3
-#undef filter_8tap_1d_fn
+decl_fpel_func(put,  4,   , mmx);
+decl_fpel_func(put,  8,   , mmx);
+decl_fpel_func(put, 16,   , sse);
+decl_fpel_func(put, 32,   , sse);
+decl_fpel_func(put, 64,   , sse);
+decl_fpel_func(avg,  4, _8, mmxext);
+decl_fpel_func(avg,  8, _8, mmxext);
+decl_fpel_func(avg, 16, _8, sse2);
+decl_fpel_func(avg, 32, _8, sse2);
+decl_fpel_func(avg, 64, _8, sse2);
+decl_fpel_func(put, 32,   , avx);
+decl_fpel_func(put, 64,   , avx);
+decl_fpel_func(avg, 32, _8, avx2);
+decl_fpel_func(avg, 64, _8, avx2);
+
+decl_mc_funcs(4, mmxext, int16_t, 8, 8);
+decl_mc_funcs(8, sse2, int16_t,  8, 8);
+decl_mc_funcs(4, ssse3, int8_t, 32, 8);
+decl_mc_funcs(8, ssse3, int8_t, 32, 8);
+#if ARCH_X86_64
+decl_mc_funcs(16, ssse3, int8_t, 32, 8);
+decl_mc_funcs(32, avx2, int8_t, 32, 8);
+#endif
+
+mc_rep_funcs(16,  8,  8,  sse2, int16_t,  8, 8)
+#if ARCH_X86_32
+mc_rep_funcs(16,  8,  8, ssse3, int8_t,  32, 8)
+#endif
+mc_rep_funcs(32, 16, 16, sse2,  int16_t,  8, 8)
+mc_rep_funcs(32, 16, 16, ssse3, int8_t,  32, 8)
+mc_rep_funcs(64, 32, 32, sse2,  int16_t,  8, 8)
+mc_rep_funcs(64, 32, 32, ssse3, int8_t,  32, 8)
+#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+mc_rep_funcs(64, 32, 32, avx2,  int8_t,  32, 8)
+#endif
+
+extern const int8_t ff_filters_ssse3[3][15][4][32];
+extern const int16_t ff_filters_sse2[3][15][8][8];
+
+filters_8tap_2d_fn2(put, 16, 8, 1, mmxext, sse2, sse2)
+filters_8tap_2d_fn2(avg, 16, 8, 1, mmxext, sse2, sse2)
+filters_8tap_2d_fn2(put, 16, 8, 1, ssse3, ssse3, ssse3)
+filters_8tap_2d_fn2(avg, 16, 8, 1, ssse3, ssse3, ssse3)
+#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+filters_8tap_2d_fn(put, 64, 32, 8, 1, avx2, ssse3)
+filters_8tap_2d_fn(put, 32, 32, 8, 1, avx2, ssse3)
+filters_8tap_2d_fn(avg, 64, 32, 8, 1, avx2, ssse3)
+filters_8tap_2d_fn(avg, 32, 32, 8, 1, avx2, ssse3)
+#endif
+
+filters_8tap_1d_fn3(put, 8, mmxext, sse2, sse2)
+filters_8tap_1d_fn3(avg, 8, mmxext, sse2, sse2)
+filters_8tap_1d_fn3(put, 8, ssse3, ssse3, ssse3)
+filters_8tap_1d_fn3(avg, 8, ssse3, ssse3, ssse3)
+#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+filters_8tap_1d_fn2(put, 64, 8, avx2, ssse3)
+filters_8tap_1d_fn2(put, 32, 8, avx2, ssse3)
+filters_8tap_1d_fn2(avg, 64, 8, avx2, ssse3)
+filters_8tap_1d_fn2(avg, 32, 8, avx2, ssse3)
+#endif
+
+#define itxfm_func(typea, typeb, size, opt) \
+void ff_vp9_##typea##_##typeb##_##size##x##size##_add_##opt(uint8_t *dst, ptrdiff_t stride, \
+                                                            int16_t *block, int eob)
+#define itxfm_funcs(size, opt) \
+itxfm_func(idct,  idct,  size, opt); \
+itxfm_func(iadst, idct,  size, opt); \
+itxfm_func(idct,  iadst, size, opt); \
+itxfm_func(iadst, iadst, size, opt)
+
+itxfm_func(idct,  idct,  4, mmxext);
+itxfm_func(idct,  iadst, 4, sse2);
+itxfm_func(iadst, idct,  4, sse2);
+itxfm_func(iadst, iadst, 4, sse2);
+itxfm_funcs(4, ssse3);
+itxfm_funcs(8, sse2);
+itxfm_funcs(8, ssse3);
+itxfm_funcs(8, avx);
+itxfm_funcs(16, sse2);
+itxfm_funcs(16, ssse3);
+itxfm_funcs(16, avx);
+itxfm_func(idct, idct, 32, sse2);
+itxfm_func(idct, idct, 32, ssse3);
+itxfm_func(idct, idct, 32, avx);
+itxfm_func(iwht, iwht, 4, mmx);
+
+#undef itxfm_func
+#undef itxfm_funcs
+
+#define lpf_funcs(size1, size2, opt) \
+void ff_vp9_loop_filter_v_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
+                                                    int E, int I, int H); \
+void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
+                                                    int E, int I, int H)
+
+lpf_funcs(16, 16, sse2);
+lpf_funcs(16, 16, ssse3);
+lpf_funcs(16, 16, avx);
+lpf_funcs(44, 16, sse2);
+lpf_funcs(44, 16, ssse3);
+lpf_funcs(44, 16, avx);
+lpf_funcs(84, 16, sse2);
+lpf_funcs(84, 16, ssse3);
+lpf_funcs(84, 16, avx);
+lpf_funcs(48, 16, sse2);
+lpf_funcs(48, 16, ssse3);
+lpf_funcs(48, 16, avx);
+lpf_funcs(88, 16, sse2);
+lpf_funcs(88, 16, ssse3);
+lpf_funcs(88, 16, avx);
+
+#undef lpf_funcs
+
+#define ipred_func(size, type, opt) \
+void ff_vp9_ipred_##type##_##size##x##size##_##opt(uint8_t *dst, ptrdiff_t stride, \
+                                                   const uint8_t *l, const uint8_t *a)
+
+ipred_func(8, v, mmx);
+
+#define ipred_dc_funcs(size, opt) \
+ipred_func(size, dc, opt); \
+ipred_func(size, dc_left, opt); \
+ipred_func(size, dc_top, opt)
+
+ipred_dc_funcs(4, mmxext);
+ipred_dc_funcs(8, mmxext);
+
+#define ipred_dir_tm_funcs(size, opt) \
+ipred_func(size, tm, opt); \
+ipred_func(size, dl, opt); \
+ipred_func(size, dr, opt); \
+ipred_func(size, hd, opt); \
+ipred_func(size, hu, opt); \
+ipred_func(size, vl, opt); \
+ipred_func(size, vr, opt)
+
+ipred_dir_tm_funcs(4, mmxext);
+
+ipred_func(16, v, sse);
+ipred_func(32, v, sse);
+
+ipred_dc_funcs(16, sse2);
+ipred_dc_funcs(32, sse2);
+
+#define ipred_dir_tm_h_funcs(size, opt) \
+ipred_dir_tm_funcs(size, opt); \
+ipred_func(size, h, opt)
+
+ipred_dir_tm_h_funcs(8, sse2);
+ipred_dir_tm_h_funcs(16, sse2);
+ipred_dir_tm_h_funcs(32, sse2);
+
+ipred_func(4, h, sse2);
+
+#define ipred_all_funcs(size, opt) \
+ipred_dc_funcs(size, opt); \
+ipred_dir_tm_h_funcs(size, opt)
+
+// FIXME hd/vl_4x4_ssse3 do not exist; they are #defined to the mmxext versions in ff_vp9dsp_init_x86()
+ipred_all_funcs(4, ssse3);
+ipred_all_funcs(8, ssse3);
+ipred_all_funcs(16, ssse3);
+ipred_all_funcs(32, ssse3);
+
+ipred_dir_tm_h_funcs(8, avx);
+ipred_dir_tm_h_funcs(16, avx);
+ipred_dir_tm_h_funcs(32, avx);
+
+ipred_func(32, v, avx);
+
+ipred_dc_funcs(32, avx2);
+ipred_func(32, h, avx2);
+ipred_func(32, tm, avx2);
+
+#undef ipred_func
+#undef ipred_dir_tm_h_funcs
+#undef ipred_dir_tm_funcs
+#undef ipred_dc_funcs
 
 #endif /* HAVE_YASM */
 
-av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
+av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
 {
 #if HAVE_YASM
-    int cpu_flags = av_get_cpu_flags();
-
-#define init_fpel(idx1, idx2, sz, type, opt)                            \
-    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] =                    \
-    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] =                    \
-    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] =                    \
-    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_ ## type ## sz ## _ ## opt
-
-
-#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, opt) \
-    dsp->mc[idx1][FILTER_8TAP_SMOOTH][idx2][idxh][idxv]  = type ## _8tap_smooth_  ## sz ## dir ## _ ## opt; \
-    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type ## _8tap_regular_ ## sz ## dir ## _ ## opt; \
-    dsp->mc[idx1][FILTER_8TAP_SHARP][idx2][idxh][idxv]   = type ## _8tap_sharp_   ## sz ## dir ## _ ## opt
-
-#define init_subpel2(idx, idxh, idxv, dir, type, opt)     \
-    init_subpel1(0, idx, idxh, idxv, 64, dir, type, opt); \
-    init_subpel1(1, idx, idxh, idxv, 32, dir, type, opt); \
-    init_subpel1(2, idx, idxh, idxv, 16, dir, type, opt); \
-    init_subpel1(3, idx, idxh, idxv,  8, dir, type, opt); \
-    init_subpel1(4, idx, idxh, idxv,  4, dir, type, opt)
+    int cpu_flags;
+
+    if (bpp == 10) {
+        ff_vp9dsp_init_10bpp_x86(dsp, bitexact);
+        return;
+    } else if (bpp == 12) {
+        ff_vp9dsp_init_12bpp_x86(dsp, bitexact);
+        return;
+    }
 
-#define init_subpel3(idx, type, opt)        \
-    init_subpel2(idx, 1, 1, hv, type, opt); \
-    init_subpel2(idx, 0, 1,  v, type, opt); \
-    init_subpel2(idx, 1, 0,  h, type, opt)
+    cpu_flags = av_get_cpu_flags();
+
+#define init_lpf(opt) do { \
+    dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \
+    dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_##opt; \
+    dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_##opt; \
+    dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_##opt; \
+    dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_##opt; \
+    dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_##opt; \
+    dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_##opt; \
+    dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_##opt; \
+    dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_##opt; \
+    dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_##opt; \
+} while (0)
+
+#define init_ipred(sz, opt, t, e) \
+    dsp->intra_pred[TX_##sz##X##sz][e##_PRED] = ff_vp9_ipred_##t##_##sz##x##sz##_##opt
+
+#define ff_vp9_ipred_hd_4x4_ssse3 ff_vp9_ipred_hd_4x4_mmxext
+#define ff_vp9_ipred_vl_4x4_ssse3 ff_vp9_ipred_vl_4x4_mmxext
+#define init_dir_tm_ipred(sz, opt) do { \
+    init_ipred(sz, opt, dl, DIAG_DOWN_LEFT); \
+    init_ipred(sz, opt, dr, DIAG_DOWN_RIGHT); \
+    init_ipred(sz, opt, hd, HOR_DOWN); \
+    init_ipred(sz, opt, vl, VERT_LEFT); \
+    init_ipred(sz, opt, hu, HOR_UP); \
+    init_ipred(sz, opt, tm, TM_VP8); \
+    init_ipred(sz, opt, vr, VERT_RIGHT); \
+} while (0)
+#define init_dir_tm_h_ipred(sz, opt) do { \
+    init_dir_tm_ipred(sz, opt); \
+    init_ipred(sz, opt, h,  HOR); \
+} while (0)
+#define init_dc_ipred(sz, opt) do { \
+    init_ipred(sz, opt, dc,      DC); \
+    init_ipred(sz, opt, dc_left, LEFT_DC); \
+    init_ipred(sz, opt, dc_top,  TOP_DC); \
+} while (0)
+#define init_all_ipred(sz, opt) do { \
+    init_dc_ipred(sz, opt); \
+    init_dir_tm_h_ipred(sz, opt); \
+} while (0)
 
     if (EXTERNAL_MMX(cpu_flags)) {
-        init_fpel(4, 0,  4, put, mmx);
-        init_fpel(3, 0,  8, put, mmx);
+        init_fpel_func(4, 0,  4, put, , mmx);
+        init_fpel_func(3, 0,  8, put, , mmx);
+        if (!bitexact) {
+            dsp->itxfm_add[4 /* lossless */][DCT_DCT] =
+            dsp->itxfm_add[4 /* lossless */][ADST_DCT] =
+            dsp->itxfm_add[4 /* lossless */][DCT_ADST] =
+            dsp->itxfm_add[4 /* lossless */][ADST_ADST] = ff_vp9_iwht_iwht_4x4_add_mmx;
+        }
+        init_ipred(8, mmx, v, VERT);
+    }
+
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        init_subpel2(4, 0, 4, put, 8, mmxext);
+        init_subpel2(4, 1, 4, avg, 8, mmxext);
+        init_fpel_func(4, 1,  4, avg, _8, mmxext);
+        init_fpel_func(3, 1,  8, avg, _8, mmxext);
+        dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext;
+        init_dc_ipred(4, mmxext);
+        init_dc_ipred(8, mmxext);
+        init_dir_tm_ipred(4, mmxext);
     }
 
     if (EXTERNAL_SSE(cpu_flags)) {
-        init_fpel(2, 0, 16, put, sse);
-        init_fpel(1, 0, 32, put, sse);
-        init_fpel(0, 0, 64, put, sse);
-        init_fpel(4, 1,  4, avg, sse);
-        init_fpel(3, 1,  8, avg, sse);
+        init_fpel_func(2, 0, 16, put, , sse);
+        init_fpel_func(1, 0, 32, put, , sse);
+        init_fpel_func(0, 0, 64, put, , sse);
+        init_ipred(16, sse, v, VERT);
+        init_ipred(32, sse, v, VERT);
     }
 
     if (EXTERNAL_SSE2(cpu_flags)) {
-        init_fpel(2, 1, 16, avg, sse2);
-        init_fpel(1, 1, 32, avg, sse2);
-        init_fpel(0, 1, 64, avg, sse2);
+        init_subpel3_8to64(0, put, 8, sse2);
+        init_subpel3_8to64(1, avg, 8, sse2);
+        init_fpel_func(2, 1, 16, avg,  _8, sse2);
+        init_fpel_func(1, 1, 32, avg,  _8, sse2);
+        init_fpel_func(0, 1, 64, avg,  _8, sse2);
+        init_lpf(sse2);
+        dsp->itxfm_add[TX_4X4][ADST_DCT]  = ff_vp9_idct_iadst_4x4_add_sse2;
+        dsp->itxfm_add[TX_4X4][DCT_ADST]  = ff_vp9_iadst_idct_4x4_add_sse2;
+        dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_sse2;
+        dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_sse2;
+        dsp->itxfm_add[TX_8X8][ADST_DCT]  = ff_vp9_idct_iadst_8x8_add_sse2;
+        dsp->itxfm_add[TX_8X8][DCT_ADST]  = ff_vp9_iadst_idct_8x8_add_sse2;
+        dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_sse2;
+        dsp->itxfm_add[TX_16X16][DCT_DCT]   = ff_vp9_idct_idct_16x16_add_sse2;
+        dsp->itxfm_add[TX_16X16][ADST_DCT]  = ff_vp9_idct_iadst_16x16_add_sse2;
+        dsp->itxfm_add[TX_16X16][DCT_ADST]  = ff_vp9_iadst_idct_16x16_add_sse2;
+        dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_sse2;
+        dsp->itxfm_add[TX_32X32][ADST_ADST] =
+        dsp->itxfm_add[TX_32X32][ADST_DCT] =
+        dsp->itxfm_add[TX_32X32][DCT_ADST] =
+        dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_sse2;
+        init_dc_ipred(16, sse2);
+        init_dc_ipred(32, sse2);
+        init_dir_tm_h_ipred(8, sse2);
+        init_dir_tm_h_ipred(16, sse2);
+        init_dir_tm_h_ipred(32, sse2);
+        init_ipred(4, sse2, h, HOR);
     }
 
     if (EXTERNAL_SSSE3(cpu_flags)) {
-        init_subpel3(0, put, ssse3);
-        init_subpel3(1, avg, ssse3);
+        init_subpel3(0, put, 8, ssse3);
+        init_subpel3(1, avg, 8, ssse3);
+        dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_ssse3;
+        dsp->itxfm_add[TX_4X4][ADST_DCT]  = ff_vp9_idct_iadst_4x4_add_ssse3;
+        dsp->itxfm_add[TX_4X4][DCT_ADST]  = ff_vp9_iadst_idct_4x4_add_ssse3;
+        dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_ssse3;
+        dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_ssse3;
+        dsp->itxfm_add[TX_8X8][ADST_DCT]  = ff_vp9_idct_iadst_8x8_add_ssse3;
+        dsp->itxfm_add[TX_8X8][DCT_ADST]  = ff_vp9_iadst_idct_8x8_add_ssse3;
+        dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_ssse3;
+        dsp->itxfm_add[TX_16X16][DCT_DCT]   = ff_vp9_idct_idct_16x16_add_ssse3;
+        dsp->itxfm_add[TX_16X16][ADST_DCT]  = ff_vp9_idct_iadst_16x16_add_ssse3;
+        dsp->itxfm_add[TX_16X16][DCT_ADST]  = ff_vp9_iadst_idct_16x16_add_ssse3;
+        dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_ssse3;
+        dsp->itxfm_add[TX_32X32][ADST_ADST] =
+        dsp->itxfm_add[TX_32X32][ADST_DCT] =
+        dsp->itxfm_add[TX_32X32][DCT_ADST] =
+        dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3;
+        init_lpf(ssse3);
+        init_all_ipred(4, ssse3);
+        init_all_ipred(8, ssse3);
+        init_all_ipred(16, ssse3);
+        init_all_ipred(32, ssse3);
+    }
+
+    if (EXTERNAL_AVX(cpu_flags)) {
+        dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx;
+        dsp->itxfm_add[TX_8X8][ADST_DCT]  = ff_vp9_idct_iadst_8x8_add_avx;
+        dsp->itxfm_add[TX_8X8][DCT_ADST]  = ff_vp9_iadst_idct_8x8_add_avx;
+        dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_avx;
+        dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx;
+        dsp->itxfm_add[TX_16X16][ADST_DCT]  = ff_vp9_idct_iadst_16x16_add_avx;
+        dsp->itxfm_add[TX_16X16][DCT_ADST]  = ff_vp9_iadst_idct_16x16_add_avx;
+        dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx;
+        dsp->itxfm_add[TX_32X32][ADST_ADST] =
+        dsp->itxfm_add[TX_32X32][ADST_DCT] =
+        dsp->itxfm_add[TX_32X32][DCT_ADST] =
+        dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx;
+        init_lpf(avx);
+        init_dir_tm_h_ipred(8, avx);
+        init_dir_tm_h_ipred(16, avx);
+        init_dir_tm_h_ipred(32, avx);
+    }
+    if (EXTERNAL_AVX_FAST(cpu_flags)) {
+        init_fpel_func(1, 0, 32, put, , avx);
+        init_fpel_func(0, 0, 64, put, , avx);
+        init_ipred(32, avx, v, VERT);
+    }
+
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        init_fpel_func(1, 1, 32, avg, _8, avx2);
+        init_fpel_func(0, 1, 64, avg, _8, avx2);
+        if (ARCH_X86_64) {
+#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+            init_subpel3_32_64(0, put, 8, avx2);
+            init_subpel3_32_64(1, avg, 8, avx2);
+#endif
+        }
+        init_dc_ipred(32, avx2);
+        init_ipred(32, avx2, h,  HOR);
+        init_ipred(32, avx2, tm, TM_VP8);
     }
 
 #undef init_fpel
diff --git a/libavcodec/x86/vp9dsp_init.h b/libavcodec/x86/vp9dsp_init.h
new file mode 100644
index 0000000..e410cab
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init.h
@@ -0,0 +1,189 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_VP9DSP_INIT_H
+#define AVCODEC_X86_VP9DSP_INIT_H
+
+#include "libavcodec/vp9dsp.h"
+
+// hack to force-expand BPC
+#define cat(a, bpp, b) a##bpp##b
+
+#define decl_fpel_func(avg, sz, bpp, opt) \
+void ff_vp9_##avg##sz##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                                   const uint8_t *src, ptrdiff_t src_stride, \
+                                   int h, int mx, int my)
+
+#define decl_mc_func(avg, sz, dir, opt, type, f_sz, bpp) \
+void ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                         const uint8_t *src, ptrdiff_t src_stride, \
+                                                         int h, const type (*filter)[f_sz])
+
+#define decl_mc_funcs(sz, opt, type, fsz, bpp) \
+decl_mc_func(put, sz, h, opt, type, fsz, bpp); \
+decl_mc_func(avg, sz, h, opt, type, fsz, bpp); \
+decl_mc_func(put, sz, v, opt, type, fsz, bpp); \
+decl_mc_func(avg, sz, v, opt, type, fsz, bpp)
+
+#define decl_ipred_fn(type, sz, bpp, opt) \
+void ff_vp9_ipred_##type##_##sz##x##sz##_##bpp##_##opt(uint8_t *dst, \
+                                                       ptrdiff_t stride, \
+                                                       const uint8_t *l, \
+                                                       const uint8_t *a)
+
+#define decl_ipred_fns(type, bpp, opt4, opt8_16_32) \
+decl_ipred_fn(type,  4, bpp, opt4); \
+decl_ipred_fn(type,  8, bpp, opt8_16_32); \
+decl_ipred_fn(type, 16, bpp, opt8_16_32); \
+decl_ipred_fn(type, 32, bpp, opt8_16_32)
+
+#define decl_itxfm_func(typea, typeb, size, bpp, opt) \
+void cat(ff_vp9_##typea##_##typeb##_##size##x##size##_add_, bpp, _##opt)(uint8_t *dst, \
+                                                                         ptrdiff_t stride, \
+                                                                         int16_t *block, \
+                                                                         int eob)
+
+#define decl_itxfm_funcs(size, bpp, opt) \
+decl_itxfm_func(idct,  idct,  size, bpp, opt); \
+decl_itxfm_func(iadst, idct,  size, bpp, opt); \
+decl_itxfm_func(idct,  iadst, size, bpp, opt); \
+decl_itxfm_func(iadst, iadst, size, bpp, opt)
+
+#define mc_rep_func(avg, sz, hsz, hszb, dir, opt, type, f_sz, bpp) \
+static av_always_inline void \
+ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                    const uint8_t *src, ptrdiff_t src_stride, \
+                                                    int h, const type (*filter)[f_sz]) \
+{ \
+    ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##bpp##_##opt(dst,        dst_stride, src, \
+                                                         src_stride, h, filter); \
+    ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##bpp##_##opt(dst + hszb, dst_stride, src + hszb, \
+                                                         src_stride, h, filter); \
+}
+
+#define mc_rep_funcs(sz, hsz, hszb, opt, type, fsz, bpp) \
+mc_rep_func(put, sz, hsz, hszb, h, opt, type, fsz, bpp) \
+mc_rep_func(avg, sz, hsz, hszb, h, opt, type, fsz, bpp) \
+mc_rep_func(put, sz, hsz, hszb, v, opt, type, fsz, bpp) \
+mc_rep_func(avg, sz, hsz, hszb, v, opt, type, fsz, bpp)
+
+#define filter_8tap_1d_fn(op, sz, f, f_opt, fname, dir, dvar, bpp, opt) \
+static void op##_8tap_##fname##_##sz##dir##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                          const uint8_t *src, ptrdiff_t src_stride, \
+                                                          int h, int mx, int my) \
+{ \
+    ff_vp9_##op##_8tap_1d_##dir##_##sz##_##bpp##_##opt(dst, dst_stride, src, src_stride, \
+                                                       h, ff_filters_##f_opt[f][dvar - 1]); \
+}
+
+#define filters_8tap_1d_fn(op, sz, dir, dvar, bpp, opt, f_opt) \
+filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, dir, dvar, bpp, opt) \
+filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP,   f_opt, sharp,   dir, dvar, bpp, opt) \
+filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH,  f_opt, smooth,  dir, dvar, bpp, opt)
+
+#define filters_8tap_1d_fn2(op, sz, bpp, opt, f_opt) \
+filters_8tap_1d_fn(op, sz, h, mx, bpp, opt, f_opt) \
+filters_8tap_1d_fn(op, sz, v, my, bpp, opt, f_opt)
+
+#define filters_8tap_1d_fn3(op, bpp, opt4, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 64, bpp, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 32, bpp, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 16, bpp, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 8, bpp, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 4, bpp, opt4, f_opt)
+
+#define filter_8tap_2d_fn(op, sz, f, f_opt, fname, align, bpp, bytes, opt) \
+static void op##_8tap_##fname##_##sz##hv_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                       const uint8_t *src, ptrdiff_t src_stride, \
+                                                       int h, int mx, int my) \
+{ \
+    LOCAL_ALIGNED_##align(uint8_t, temp, [71 * 64 * bytes]); \
+    ff_vp9_put_8tap_1d_h_##sz##_##bpp##_##opt(temp, 64 * bytes, src - 3 * src_stride, \
+                                              src_stride,  h + 7, \
+                                              ff_filters_##f_opt[f][mx - 1]); \
+    ff_vp9_##op##_8tap_1d_v_##sz##_##bpp##_##opt(dst, dst_stride, temp + 3 * bytes * 64, \
+                                                 64 * bytes, h, \
+                                                 ff_filters_##f_opt[f][my - 1]); \
+}
+
+#define filters_8tap_2d_fn(op, sz, align, bpp, bytes, opt, f_opt) \
+filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, align, bpp, bytes, opt) \
+filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP,   f_opt, sharp, align, bpp, bytes, opt) \
+filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH,  f_opt, smooth, align, bpp, bytes, opt)
+
+#define filters_8tap_2d_fn2(op, align, bpp, bytes, opt4, opt8, f_opt) \
+filters_8tap_2d_fn(op, 64, align, bpp, bytes, opt8, f_opt) \
+filters_8tap_2d_fn(op, 32, align, bpp, bytes, opt8, f_opt) \
+filters_8tap_2d_fn(op, 16, align, bpp, bytes, opt8, f_opt) \
+filters_8tap_2d_fn(op, 8, align, bpp, bytes, opt8, f_opt) \
+filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt)
+
+#define init_fpel_func(idx1, idx2, sz, type, bpp, opt) \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_vp9_##type##sz##bpp##_##opt
+
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, bpp, opt) \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \
+        type##_8tap_smooth_##sz##dir##_##bpp##_##opt; \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \
+        type##_8tap_regular_##sz##dir##_##bpp##_##opt; \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] = \
+        type##_8tap_sharp_##sz##dir##_##bpp##_##opt
+
+#define init_subpel2(idx1, idx2, sz, type, bpp, opt) \
+    init_subpel1(idx1, idx2, 1, 1, sz, hv, type, bpp, opt); \
+    init_subpel1(idx1, idx2, 0, 1, sz, v,  type, bpp, opt); \
+    init_subpel1(idx1, idx2, 1, 0, sz, h,  type, bpp, opt)
+
+#define init_subpel3_32_64(idx, type, bpp, opt) \
+    init_subpel2(0, idx, 64, type, bpp, opt); \
+    init_subpel2(1, idx, 32, type, bpp, opt)
+
+#define init_subpel3_8to64(idx, type, bpp, opt) \
+    init_subpel3_32_64(idx, type, bpp, opt); \
+    init_subpel2(2, idx, 16, type, bpp, opt); \
+    init_subpel2(3, idx,  8, type, bpp, opt)
+
+#define init_subpel3(idx, type, bpp, opt) \
+    init_subpel3_8to64(idx, type, bpp, opt); \
+    init_subpel2(4, idx,  4, type, bpp, opt)
+
+#define init_ipred_func(type, enum, sz, bpp, opt) \
+    dsp->intra_pred[TX_##sz##X##sz][enum##_PRED] = \
+        cat(ff_vp9_ipred_##type##_##sz##x##sz##_, bpp, _##opt)
+
+#define init_8_16_32_ipred_funcs(type, enum, bpp, opt) \
+    init_ipred_func(type, enum,  8, bpp, opt); \
+    init_ipred_func(type, enum, 16, bpp, opt); \
+    init_ipred_func(type, enum, 32, bpp, opt)
+
+#define init_ipred_funcs(type, enum, bpp, opt) \
+    init_ipred_func(type, enum,  4, bpp, opt); \
+    init_8_16_32_ipred_funcs(type, enum, bpp, opt)
+
+void ff_vp9dsp_init_10bpp_x86(VP9DSPContext *dsp, int bitexact);
+void ff_vp9dsp_init_12bpp_x86(VP9DSPContext *dsp, int bitexact);
+void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp);
+
+#endif /* AVCODEC_X86_VP9DSP_INIT_H */
diff --git a/libavcodec/x86/vp9dsp_init_10bpp.c b/libavcodec/x86/vp9dsp_init_10bpp.c
new file mode 100644
index 0000000..2694c06
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init_10bpp.c
@@ -0,0 +1,25 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPC 10                                /* bit depth pasted into the names used by the template */
+#define INIT_FUNC ff_vp9dsp_init_10bpp_x86    /* name the template gives its init function */
+#include "vp9dsp_init_16bpp_template.c"       /* defines INIT_FUNC for this BPC */
diff --git a/libavcodec/x86/vp9dsp_init_12bpp.c b/libavcodec/x86/vp9dsp_init_12bpp.c
new file mode 100644
index 0000000..5da3bc1
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init_12bpp.c
@@ -0,0 +1,25 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPC 12                                /* bit depth pasted into the names used by the template */
+#define INIT_FUNC ff_vp9dsp_init_12bpp_x86    /* name the template gives its init function */
+#include "vp9dsp_init_16bpp_template.c"       /* defines INIT_FUNC for this BPC */
diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c
new file mode 100644
index 0000000..eb67499
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -0,0 +1,139 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/vp9dsp.h"
+#include "libavcodec/x86/vp9dsp_init.h"
+
+#if HAVE_YASM /* declarations used only by the HAVE_YASM part of the init function below */
+
+decl_fpel_func(put,   8,    , mmx);    /* put variants take an empty third argument ... */
+decl_fpel_func(avg,   8, _16, mmxext); /* ... avg variants pass _16 */
+decl_fpel_func(put,  16,    , sse);
+decl_fpel_func(put,  32,    , sse);
+decl_fpel_func(put,  64,    , sse);
+decl_fpel_func(put, 128,    , sse);
+decl_fpel_func(avg,  16, _16, sse2);
+decl_fpel_func(avg,  32, _16, sse2);
+decl_fpel_func(avg,  64, _16, sse2);
+decl_fpel_func(avg, 128, _16, sse2);
+decl_fpel_func(put,  32,    , avx);
+decl_fpel_func(put,  64,    , avx);
+decl_fpel_func(put, 128,    , avx);
+decl_fpel_func(avg,  32, _16, avx2);
+decl_fpel_func(avg,  64, _16, avx2);
+decl_fpel_func(avg, 128, _16, avx2);
+
+decl_ipred_fns(v,       16, mmx,    sse);  /* two extensions per call; the init code uses the first for 4x4, the second for 8/16/32 */
+decl_ipred_fns(h,       16, mmxext, sse2);
+decl_ipred_fns(dc,      16, mmxext, sse2);
+decl_ipred_fns(dc_top,  16, mmxext, sse2);
+decl_ipred_fns(dc_left, 16, mmxext, sse2);
+
+#define decl_ipred_dir_funcs(type) /* directional predictors: declared for sse2, ssse3 and avx */ \
+decl_ipred_fns(type, 16, sse2,  sse2); \
+decl_ipred_fns(type, 16, ssse3, ssse3); \
+decl_ipred_fns(type, 16, avx,   avx)
+
+decl_ipred_dir_funcs(dl);
+decl_ipred_dir_funcs(dr);
+decl_ipred_dir_funcs(vl);
+decl_ipred_dir_funcs(vr);
+decl_ipred_dir_funcs(hu);
+decl_ipred_dir_funcs(hd);
+#endif /* HAVE_YASM */
+
+av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) /* installs the fpel and intra-pred routines matching the CPU flags into dsp */
+{
+#if HAVE_YASM /* without HAVE_YASM the function body is empty and dsp is left untouched */
+    int cpu_flags = av_get_cpu_flags(); /* tested by each EXTERNAL_* check below */
+
+    if (EXTERNAL_MMX(cpu_flags)) {
+        init_fpel_func(4, 0,   8, put, , mmx);
+        init_ipred_func(v, VERT, 4, 16, mmx);
+    }
+
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        init_fpel_func(4, 1,   8, avg, _16, mmxext);
+        init_ipred_func(h, HOR, 4, 16, mmxext);
+        init_ipred_func(dc, DC, 4, 16, mmxext);
+        init_ipred_func(dc_top,  TOP_DC,  4, 16, mmxext);
+        init_ipred_func(dc_left, LEFT_DC, 4, 16, mmxext);
+    }
+
+    if (EXTERNAL_SSE(cpu_flags)) {
+        init_fpel_func(3, 0,  16, put, , sse);
+        init_fpel_func(2, 0,  32, put, , sse);
+        init_fpel_func(1, 0,  64, put, , sse);
+        init_fpel_func(0, 0, 128, put, , sse);
+        init_8_16_32_ipred_funcs(v, VERT, 16, sse);
+    }
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        init_fpel_func(3, 1,  16, avg, _16, sse2);
+        init_fpel_func(2, 1,  32, avg, _16, sse2);
+        init_fpel_func(1, 1,  64, avg, _16, sse2);
+        init_fpel_func(0, 1, 128, avg, _16, sse2);
+        init_8_16_32_ipred_funcs(h, HOR, 16, sse2);
+        init_8_16_32_ipred_funcs(dc, DC, 16, sse2);
+        init_8_16_32_ipred_funcs(dc_top,  TOP_DC,  16, sse2);
+        init_8_16_32_ipred_funcs(dc_left, LEFT_DC, 16, sse2);
+        init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, sse2);
+        init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, sse2);
+        init_ipred_funcs(vl, VERT_LEFT, 16, sse2);
+        init_ipred_funcs(vr, VERT_RIGHT, 16, sse2);
+        init_ipred_funcs(hu, HOR_UP, 16, sse2);
+        init_ipred_funcs(hd, HOR_DOWN, 16, sse2);
+    }
+
+    if (EXTERNAL_SSSE3(cpu_flags)) { /* writes the same intra_pred slots as the SSE2 block: the later assignment wins */
+        init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, ssse3);
+        init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, ssse3);
+        init_ipred_funcs(vl, VERT_LEFT, 16, ssse3);
+        init_ipred_funcs(vr, VERT_RIGHT, 16, ssse3);
+        init_ipred_funcs(hu, HOR_UP, 16, ssse3);
+        init_ipred_funcs(hd, HOR_DOWN, 16, ssse3);
+    }
+
+    if (EXTERNAL_AVX_FAST(cpu_flags)) { /* replaces the SSE put entries for idx 2..0 and the directional predictors again */
+        init_fpel_func(2, 0,  32, put, , avx);
+        init_fpel_func(1, 0,  64, put, , avx);
+        init_fpel_func(0, 0, 128, put, , avx);
+        init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, avx);
+        init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, avx);
+        init_ipred_funcs(vl, VERT_LEFT, 16, avx);
+        init_ipred_funcs(vr, VERT_RIGHT, 16, avx);
+        init_ipred_funcs(hu, HOR_UP, 16, avx);
+        init_ipred_funcs(hd, HOR_DOWN, 16, avx);
+    }
+
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) { /* replaces the SSE2 avg entries for idx 2..0 */
+        init_fpel_func(2, 1,  32, avg, _16, avx2);
+        init_fpel_func(1, 1,  64, avg, _16, avx2);
+        init_fpel_func(0, 1, 128, avg, _16, avx2);
+    }
+
+#endif /* HAVE_YASM */
+}
diff --git a/libavcodec/x86/vp9dsp_init_16bpp_template.c b/libavcodec/x86/vp9dsp_init_16bpp_template.c
new file mode 100644
index 0000000..4840b28
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init_16bpp_template.c
@@ -0,0 +1,240 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/vp9dsp.h"
+#include "libavcodec/x86/vp9dsp_init.h"
+
+#if HAVE_YASM
+
+extern const int16_t ff_filters_16bpp[3][15][4][16]; /* defined elsewhere; presumably used by the filters_8tap_* expansions below - confirm */
+
+decl_mc_funcs(4, sse2, int16_t, 16, BPC);  /* BPC is set by the file that includes this template */
+decl_mc_funcs(8, sse2, int16_t, 16, BPC);
+decl_mc_funcs(16, avx2, int16_t, 16, BPC);
+
+mc_rep_funcs(16,  8, 16, sse2, int16_t, 16, BPC)
+mc_rep_funcs(32, 16, 32, sse2, int16_t, 16, BPC)
+mc_rep_funcs(64, 32, 64, sse2, int16_t, 16, BPC)
+#if HAVE_AVX2_EXTERNAL /* avx2 wrappers are generated only when this is set */
+mc_rep_funcs(32, 16, 32, avx2, int16_t, 16, BPC)
+mc_rep_funcs(64, 32, 64, avx2, int16_t, 16, BPC)
+#endif
+
+filters_8tap_2d_fn2(put, 16, BPC, 2, sse2, sse2, 16bpp)
+filters_8tap_2d_fn2(avg, 16, BPC, 2, sse2, sse2, 16bpp)
+#if HAVE_AVX2_EXTERNAL
+filters_8tap_2d_fn(put, 64, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(avg, 64, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(put, 32, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(avg, 32, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(put, 16, 32, BPC, 2, avx2, 16bpp)
+filters_8tap_2d_fn(avg, 16, 32, BPC, 2, avx2, 16bpp)
+#endif
+
+filters_8tap_1d_fn3(put, BPC, sse2, sse2, 16bpp)
+filters_8tap_1d_fn3(avg, BPC, sse2, sse2, 16bpp)
+#if HAVE_AVX2_EXTERNAL
+filters_8tap_1d_fn2(put, 64, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(avg, 64, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(put, 32, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(avg, 32, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(put, 16, BPC, avx2, 16bpp)
+filters_8tap_1d_fn2(avg, 16, BPC, avx2, 16bpp)
+#endif
+
+#define decl_lpf_func(dir, wd, bpp, opt) /* prototype of one external loop-filter routine */ \
+void ff_vp9_loop_filter_##dir##_##wd##_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
+                                                     int E, int I, int H)
+
+#define decl_lpf_funcs(dir, wd, bpp) /* the same prototype for sse2, ssse3 and avx */ \
+decl_lpf_func(dir, wd, bpp, sse2); \
+decl_lpf_func(dir, wd, bpp, ssse3); \
+decl_lpf_func(dir, wd, bpp, avx)
+
+#define decl_lpf_funcs_wd(dir) /* widths 4, 8 and 16; BPC expands to its number before decl_lpf_func pastes it */ \
+decl_lpf_funcs(dir,  4, BPC); \
+decl_lpf_funcs(dir,  8, BPC); \
+decl_lpf_funcs(dir, 16, BPC)
+
+decl_lpf_funcs_wd(h);
+decl_lpf_funcs_wd(v);
+
+#define lpf_16_wrapper(dir, off, bpp, opt) /* static wrapper: runs the width-16 filter at dst and again at dst + off */ \
+static void loop_filter_##dir##_16_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
+                                                 int E, int I, int H) \
+{ \
+    ff_vp9_loop_filter_##dir##_16_##bpp##_##opt(dst,       stride, E, I, H); \
+    ff_vp9_loop_filter_##dir##_16_##bpp##_##opt(dst + off, stride, E, I, H); \
+}
+
+#define lpf_16_wrappers(bpp, opt) /* second call is 8 rows down for h, 16 bytes along the row for v */ \
+lpf_16_wrapper(h, 8 * stride, bpp, opt) \
+lpf_16_wrapper(v, 16,         bpp, opt)
+
+lpf_16_wrappers(BPC, sse2)  /* the extra macro level lets BPC expand before it is token-pasted */
+lpf_16_wrappers(BPC, ssse3)
+lpf_16_wrappers(BPC, avx)
+
+#define lpf_mix2_wrapper(dir, off, wd1, wd2, bpp, opt) /* static wrapper: width-wd1 filter at dst, width-wd2 filter at dst + off */ \
+static void loop_filter_##dir##_##wd1##wd2##_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
+                                                           int E, int I, int H) \
+{ \
+    ff_vp9_loop_filter_##dir##_##wd1##_##bpp##_##opt(dst,       stride, \
+                                                     E & 0xff, I & 0xff, H & 0xff); /* low byte of each parameter drives the first call */ \
+    ff_vp9_loop_filter_##dir##_##wd2##_##bpp##_##opt(dst + off, stride, \
+                                                     E >> 8,   I >> 8,   H >> 8); /* the bits above it drive the second call */ \
+}
+
+#define lpf_mix2_wrappers(wd1, wd2, bpp, opt) /* same offsets as the width-16 wrappers: 8 rows for h, 16 bytes for v */ \
+lpf_mix2_wrapper(h, 8 * stride, wd1, wd2, bpp, opt) \
+lpf_mix2_wrapper(v, 16,         wd1, wd2, bpp, opt)
+
+#define lpf_mix2_wrappers_set(bpp, opt) /* all four width pairs (4/8 x 4/8) for one extension */ \
+lpf_mix2_wrappers(4, 4, bpp, opt) \
+lpf_mix2_wrappers(4, 8, bpp, opt) \
+lpf_mix2_wrappers(8, 4, bpp, opt) \
+lpf_mix2_wrappers(8, 8, bpp, opt) /* last line: no continuation, so the macro does not depend on a blank line following it */
+
+lpf_mix2_wrappers_set(BPC, sse2)
+lpf_mix2_wrappers_set(BPC, ssse3)
+lpf_mix2_wrappers_set(BPC, avx)
+
+decl_ipred_fns(tm, BPC, mmxext, sse2);
+
+decl_itxfm_func(iwht, iwht, 4, BPC, mmxext);
+#if BPC == 10 /* the 10-bit build declares mmxext and ssse3 4x4 variants that the 12-bit build does not */
+decl_itxfm_func(idct,  idct,  4, BPC, mmxext);
+decl_itxfm_funcs(4, BPC, ssse3);
+#else
+decl_itxfm_func(idct,  idct,  4, BPC, sse2);
+#endif
+decl_itxfm_func(idct,  iadst, 4, BPC, sse2);
+decl_itxfm_func(iadst, idct,  4, BPC, sse2);
+decl_itxfm_func(iadst, iadst, 4, BPC, sse2);
+decl_itxfm_funcs(8, BPC, sse2);
+decl_itxfm_funcs(16, BPC, sse2);
+decl_itxfm_func(idct,  idct, 32, BPC, sse2);
+#endif /* HAVE_YASM */
+
+av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
+{
+#if HAVE_YASM
+    int cpu_flags = av_get_cpu_flags();
+
+#define init_lpf_8_func(idx1, idx2, dir, wd, bpp, opt) \
+    dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_##bpp##_##opt
+#define init_lpf_16_func(idx, dir, bpp, opt) \
+    dsp->loop_filter_16[idx] = loop_filter_##dir##_16_##bpp##_##opt
+#define init_lpf_mix2_func(idx1, idx2, idx3, dir, wd1, wd2, bpp, opt) \
+    dsp->loop_filter_mix2[idx1][idx2][idx3] = loop_filter_##dir##_##wd1##wd2##_##bpp##_##opt
+
+#define init_lpf_funcs(bpp, opt) \
+    init_lpf_8_func(0, 0, h,  4, bpp, opt); \
+    init_lpf_8_func(0, 1, v,  4, bpp, opt); \
+    init_lpf_8_func(1, 0, h,  8, bpp, opt); \
+    init_lpf_8_func(1, 1, v,  8, bpp, opt); \
+    init_lpf_8_func(2, 0, h, 16, bpp, opt); \
+    init_lpf_8_func(2, 1, v, 16, bpp, opt); \
+    init_lpf_16_func(0, h, bpp, opt); \
+    init_lpf_16_func(1, v, bpp, opt); \
+    init_lpf_mix2_func(0, 0, 0, h, 4, 4, bpp, opt); \
+    init_lpf_mix2_func(0, 1, 0, h, 4, 8, bpp, opt); \
+    init_lpf_mix2_func(1, 0, 0, h, 8, 4, bpp, opt); \
+    init_lpf_mix2_func(1, 1, 0, h, 8, 8, bpp, opt); \
+    init_lpf_mix2_func(0, 0, 1, v, 4, 4, bpp, opt); \
+    init_lpf_mix2_func(0, 1, 1, v, 4, 8, bpp, opt); \
+    init_lpf_mix2_func(1, 0, 1, v, 8, 4, bpp, opt); \
+    init_lpf_mix2_func(1, 1, 1, v, 8, 8, bpp, opt)
+
+#define init_itx_func(idxa, idxb, typea, typeb, size, bpp, opt) \
+    dsp->itxfm_add[idxa][idxb] = \
+        cat(ff_vp9_##typea##_##typeb##_##size##x##size##_add_, bpp, _##opt);
+#define init_itx_func_one(idx, typea, typeb, size, bpp, opt) \
+    init_itx_func(idx, DCT_DCT,   typea, typeb, size, bpp, opt); \
+    init_itx_func(idx, ADST_DCT,  typea, typeb, size, bpp, opt); \
+    init_itx_func(idx, DCT_ADST,  typea, typeb, size, bpp, opt); \
+    init_itx_func(idx, ADST_ADST, typea, typeb, size, bpp, opt)
+#define init_itx_funcs(idx, size, bpp, opt) \
+    init_itx_func(idx, DCT_DCT,   idct,  idct,  size, bpp, opt); \
+    init_itx_func(idx, ADST_DCT,  idct,  iadst, size, bpp, opt); \
+    init_itx_func(idx, DCT_ADST,  iadst, idct,  size, bpp, opt); \
+    init_itx_func(idx, ADST_ADST, iadst, iadst, size, bpp, opt); \
+
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        init_ipred_func(tm, TM_VP8, 4, BPC, mmxext);
+        if (!bitexact) {
+            init_itx_func_one(4 /* lossless */, iwht, iwht, 4, BPC, mmxext);
+#if BPC == 10
+            init_itx_func(TX_4X4, DCT_DCT, idct, idct, 4, 10, mmxext);
+#endif
+        }
+    }
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        init_subpel3(0, put, BPC, sse2);
+        init_subpel3(1, avg, BPC, sse2);
+        init_lpf_funcs(BPC, sse2);
+        init_8_16_32_ipred_funcs(tm, TM_VP8, BPC, sse2);
+#if BPC == 10
+        if (!bitexact) {
+            init_itx_func(TX_4X4, ADST_DCT,  idct,  iadst, 4, 10, sse2);
+            init_itx_func(TX_4X4, DCT_ADST,  iadst, idct,  4, 10, sse2);
+            init_itx_func(TX_4X4, ADST_ADST, iadst, iadst, 4, 10, sse2);
+        }
+#else
+        init_itx_funcs(TX_4X4, 4, 12, sse2);
+#endif
+        init_itx_funcs(TX_8X8, 8, BPC, sse2);
+        init_itx_funcs(TX_16X16, 16, BPC, sse2);
+        init_itx_func_one(TX_32X32, idct, idct, 32, BPC, sse2);
+    }
+
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        init_lpf_funcs(BPC, ssse3);
+#if BPC == 10
+        if (!bitexact) {
+            init_itx_funcs(TX_4X4, 4, BPC, ssse3);
+        }
+#endif
+    }
+
+    if (EXTERNAL_AVX(cpu_flags)) {
+        init_lpf_funcs(BPC, avx);
+    }
+
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+#if HAVE_AVX2_EXTERNAL
+        init_subpel3_32_64(0,  put, BPC, avx2);
+        init_subpel3_32_64(1,  avg, BPC, avx2);
+        init_subpel2(2, 0, 16, put, BPC, avx2);
+        init_subpel2(2, 1, 16, avg, BPC, avx2);
+#endif
+    }
+
+#endif /* HAVE_YASM */
+
+    ff_vp9dsp_init_16bpp_x86(dsp);
+}
diff --git a/libavcodec/x86/vp9intrapred.asm b/libavcodec/x86/vp9intrapred.asm
new file mode 100644
index 0000000..31f7d44
--- /dev/null
+++ b/libavcodec/x86/vp9intrapred.asm
@@ -0,0 +1,2044 @@
+;******************************************************************************
+;* VP9 Intra prediction SIMD optimizations
+;*
+;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* Parts based on:
+;* H.264 intra prediction asm optimizations
+;* Copyright (c) 2010 Fiona Glaser
+;* Copyright (c) 2010 Holger Lubitz
+;* Copyright (c) 2010 Loren Merritt
+;* Copyright (c) 2010 Ronald S. Bultje
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pw_m256: times 16 dw -256
+pw_m255: times 16 dw -255
+pw_4096: times 8 dw 4096 ; pmulhrsw by 4096 gives (x + 4) >> 3, see the ssse3 dc paths
+
+pb_4x3_4x2_4x1_4x0: times 4 db 3 ; pb_* names spell out the byte pattern: NxV = N bytes of value V, m1 = -1
+                    times 4 db 2
+                    times 4 db 1
+                    times 4 db 0
+pb_8x1_8x0:   times 8 db 1
+              times 8 db 0
+pb_8x3_8x2:   times 8 db 3
+              times 8 db 2
+pb_0to5_2x7:  db 0, 1, 2, 3, 4, 5, 7, 7
+              times 8 db -1
+pb_0to6_9x7:  db 0, 1, 2, 3, 4, 5, 6
+              times 9 db 7
+pb_1to6_10x7: db 1, 2, 3, 4, 5, 6
+              times 10 db 7
+pb_2to6_3x7: ; second label for the same bytes as pb_2to6_11x7
+pb_2to6_11x7: db 2, 3, 4, 5, 6
+              times 11 db 7
+pb_1toE_2xF:  db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
+pb_2toE_3xF:  db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
+pb_13456_3xm1: db 1, 3, 4, 5, 6
+               times 3 db -1
+pb_6012_4xm1: db 6, 0, 1, 2
+              times 4 db -1
+pb_6xm1_246_8toE: times 6 db -1
+                  db 2, 4, 6, 8, 9, 10, 11, 12, 13, 14
+pb_6xm1_BDF_0to6: times 6 db -1
+                  db 11, 13, 15, 0, 1, 2, 3, 4, 5, 6
+pb_02468ACE_13579BDF: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+
+pb_15x0_1xm1: times 15 db 0
+              db -1
+pb_0to2_5x3: db 0, 1, 2
+             times 5 db 3
+pb_6xm1_2x0: times 6 db -1
+             times 2 db 0
+pb_6x0_2xm1: times 6 db 0
+             times 2 db -1
+
+cextern pb_1
+cextern pb_2
+cextern pb_3
+cextern pb_15
+cextern pw_2
+cextern pw_4
+cextern pw_8
+cextern pw_16
+cextern pw_32
+cextern pw_255
+cextern pw_512
+cextern pw_1024
+cextern pw_2048
+cextern pw_8192
+
+SECTION .text
+
+; dc_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a) - fills the block with the rounded mean of the N l bytes and N a bytes
+
+%macro DC_4to8_FUNCS 0 ; 4x4 and 8x8; instantiated below for mmxext and ssse3
+cglobal vp9_ipred_dc_4x4, 4, 4, 0, dst, stride, l, a
+    movd                    m0, [lq] ; 4 l bytes
+    punpckldq               m0, [aq] ; followed by 4 a bytes
+    pxor                    m1, m1
+    psadbw                  m0, m1 ; sum of absolute differences against zero = sum of the 8 bytes
+%if cpuflag(ssse3)
+    pmulhrsw                m0, [pw_4096] ; (sum * 4096 + 0x4000) >> 15 == (sum + 4) >> 3
+    pshufb                  m0, m1 ; all-zero shuffle mask broadcasts byte 0
+%else
+    paddw                   m0, [pw_4] ; rounding term
+    psraw                   m0, 3 ; divide by 8
+    punpcklbw               m0, m0 ; duplicate the byte, then the word, across the register
+    pshufw                  m0, m0, q0000
+%endif
+    movd      [dstq+strideq*0], m0 ; 4 rows of 4 bytes
+    movd      [dstq+strideq*1], m0
+    lea                   dstq, [dstq+strideq*2]
+    movd      [dstq+strideq*0], m0
+    movd      [dstq+strideq*1], m0
+    RET
+
+cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a
+    movq                    m0, [lq]
+    movq                    m1, [aq]
+    DEFINE_ARGS dst, stride, stride3 ; l and a are loaded, so a register is renamed to hold stride * 3
+    lea               stride3q, [strideq*3]
+    pxor                    m2, m2
+    psadbw                  m0, m2 ; sum of the 8 l bytes
+    psadbw                  m1, m2 ; sum of the 8 a bytes
+    paddw                   m0, m1
+%if cpuflag(ssse3)
+    pmulhrsw                m0, [pw_2048] ; (sum * 2048 + 0x4000) >> 15 == (sum + 8) >> 4
+    pshufb                  m0, m2
+%else
+    paddw                   m0, [pw_8]
+    psraw                   m0, 4
+    punpcklbw               m0, m0
+    pshufw                  m0, m0, q0000
+%endif
+    movq      [dstq+strideq*0], m0 ; 8 rows of 8 bytes
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+strideq*2], m0
+    movq      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    movq      [dstq+strideq*0], m0
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+strideq*2], m0
+    movq      [dstq+stride3q ], m0
+    RET
+%endmacro
+
+INIT_MMX mmxext
+DC_4to8_FUNCS
+INIT_MMX ssse3
+DC_4to8_FUNCS
+
+%macro DC_16to32_FUNCS 0 ; 16x16 and 32x32 dc; instantiated below for sse2 and ssse3
+cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a
+    mova                    m0, [lq]
+    mova                    m1, [aq]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    pxor                    m2, m2
+    psadbw                  m0, m2 ; one partial byte sum per 8-byte half
+    psadbw                  m1, m2
+    paddw                   m0, m1
+    movhlps                 m1, m0 ; bring the upper-half sum down
+    paddw                   m0, m1 ; low word = sum of all 32 edge bytes
+%if cpuflag(ssse3)
+    pmulhrsw                m0, [pw_1024] ; (sum * 1024 + 0x4000) >> 15 == (sum + 16) >> 5
+    pshufb                  m0, m2
+%else
+    paddw                   m0, [pw_16]
+    psraw                   m0, 5
+    punpcklbw               m0, m0
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+%endif
+    mov                   cntd, 4 ; 4 iterations x 4 rows = 16 rows
+.loop:
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a
+    mova                    m0, [lq]
+    mova                    m1, [lq+16]
+    mova                    m2, [aq]
+    mova                    m3, [aq+16]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    pxor                    m4, m4
+    psadbw                  m0, m4
+    psadbw                  m1, m4
+    psadbw                  m2, m4
+    psadbw                  m3, m4
+    paddw                   m0, m1
+    paddw                   m2, m3
+    paddw                   m0, m2
+    movhlps                 m1, m0
+    paddw                   m0, m1 ; low word = sum of all 64 edge bytes
+%if cpuflag(ssse3)
+    pmulhrsw                m0, [pw_512] ; (sum * 512 + 0x4000) >> 15 == (sum + 32) >> 6
+    pshufb                  m0, m4
+%else
+    paddw                   m0, [pw_32]
+    psraw                   m0, 6
+    punpcklbw               m0, m0
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+%endif
+    mov                   cntd, 8 ; 8 iterations x 4 rows = 32 rows, two 16-byte stores per row
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m0
+    mova   [dstq+strideq*2+ 0], m0
+    mova   [dstq+strideq*2+16], m0
+    mova   [dstq+stride3q + 0], m0
+    mova   [dstq+stride3q +16], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+DC_16to32_FUNCS
+INIT_XMM ssse3
+DC_16to32_FUNCS
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a
+    mova                    m0, [lq] ; 32 l bytes in one ymm load
+    mova                    m1, [aq] ; 32 a bytes
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    pxor                    m2, m2
+    psadbw                  m0, m2
+    psadbw                  m1, m2
+    paddw                   m0, m1
+    vextracti128           xm1, m0, 1 ; upper 128-bit lane
+    paddw                  xm0, xm1
+    movhlps                xm1, xm0
+    paddw                  xm0, xm1 ; low word = sum of all 64 edge bytes
+    pmulhrsw               xm0, [pw_512] ; (sum * 512 + 0x4000) >> 15 == (sum + 32) >> 6
+    vpbroadcastb            m0, xm0
+    mov                   cntd, 4 ; 4 iterations x 8 rows = 32 rows
+.loop:
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+%endif
+
+; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a) - fills the block with the rounded mean of one edge only
+
+%macro DC_1D_4to8_FUNCS 2 ; dir (top or left), arg (a or l)
+cglobal vp9_ipred_dc_%1_4x4, 4, 4, 0, dst, stride, l, a
+    movd                    m0, [%2q] ; 4 bytes of the selected edge
+    pxor                    m1, m1
+    psadbw                  m0, m1 ; sum of the 4 bytes
+%if cpuflag(ssse3)
+    pmulhrsw                m0, [pw_8192] ; (sum * 8192 + 0x4000) >> 15 == (sum + 2) >> 2
+    pshufb                  m0, m1
+%else
+    paddw                   m0, [pw_2]
+    psraw                   m0, 2
+    punpcklbw               m0, m0
+    pshufw                  m0, m0, q0000
+%endif
+    movd      [dstq+strideq*0], m0
+    movd      [dstq+strideq*1], m0
+    lea                   dstq, [dstq+strideq*2]
+    movd      [dstq+strideq*0], m0
+    movd      [dstq+strideq*1], m0
+    RET
+
+cglobal vp9_ipred_dc_%1_8x8, 4, 4, 0, dst, stride, l, a
+    movq                    m0, [%2q] ; 8 bytes of the selected edge
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pxor                    m1, m1
+    psadbw                  m0, m1
+%if cpuflag(ssse3)
+    pmulhrsw                m0, [pw_4096] ; (sum * 4096 + 0x4000) >> 15 == (sum + 4) >> 3
+    pshufb                  m0, m1
+%else
+    paddw                   m0, [pw_4]
+    psraw                   m0, 3
+    punpcklbw               m0, m0
+    pshufw                  m0, m0, q0000
+%endif
+    movq      [dstq+strideq*0], m0
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+strideq*2], m0
+    movq      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    movq      [dstq+strideq*0], m0
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+strideq*2], m0
+    movq      [dstq+stride3q ], m0
+    RET
+%endmacro
+
+INIT_MMX mmxext
+DC_1D_4to8_FUNCS top,  a
+DC_1D_4to8_FUNCS left, l
+INIT_MMX ssse3
+DC_1D_4to8_FUNCS top,  a
+DC_1D_4to8_FUNCS left, l
+
+%macro DC_1D_16to32_FUNCS 2; dir (top or left), arg (a or l)
+cglobal vp9_ipred_dc_%1_16x16, 4, 4, 3, dst, stride, l, a
+    mova                    m0, [%2q] ; 16 bytes of the selected edge
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    pxor                    m2, m2
+    psadbw                  m0, m2
+    movhlps                 m1, m0 ; only the low qword of m1 is used afterwards
+    paddw                   m0, m1 ; low word = sum of the 16 bytes
+%if cpuflag(ssse3)
+    pmulhrsw                m0, [pw_2048] ; (sum * 2048 + 0x4000) >> 15 == (sum + 8) >> 4
+    pshufb                  m0, m2
+%else
+    paddw                   m0, [pw_8]
+    psraw                   m0, 4
+    punpcklbw               m0, m0
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+%endif
+    mov                   cntd, 4 ; 4 iterations x 4 rows = 16 rows
+.loop:
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
+    mova                    m0, [%2q] ; 32 bytes of the selected edge in two loads
+    mova                    m1, [%2q+16]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    pxor                    m2, m2
+    psadbw                  m0, m2
+    psadbw                  m1, m2
+    paddw                   m0, m1
+    movhlps                 m1, m0
+    paddw                   m0, m1 ; low word = sum of the 32 bytes
+%if cpuflag(ssse3)
+    pmulhrsw                m0, [pw_1024] ; (sum * 1024 + 0x4000) >> 15 == (sum + 16) >> 5
+    pshufb                  m0, m2
+%else
+    paddw                   m0, [pw_16]
+    psraw                   m0, 5
+    punpcklbw               m0, m0
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+%endif
+    mov                   cntd, 8 ; 8 iterations x 4 rows = 32 rows, two 16-byte stores per row
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m0
+    mova   [dstq+strideq*2+ 0], m0
+    mova   [dstq+strideq*2+16], m0
+    mova   [dstq+stride3q + 0], m0
+    mova   [dstq+stride3q +16], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+DC_1D_16to32_FUNCS top,  a
+DC_1D_16to32_FUNCS left, l
+INIT_XMM ssse3
+DC_1D_16to32_FUNCS top,  a
+DC_1D_16to32_FUNCS left, l
+
+%macro DC_1D_AVX2_FUNCS 2 ; dir (top or left), arg (a or l)
+%if HAVE_AVX2_EXTERNAL ; the macro expands to nothing when this is 0
+cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
+    mova                    m0, [%2q] ; 32 bytes of the selected edge in one ymm load
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    pxor                    m2, m2
+    psadbw                  m0, m2
+    vextracti128           xm1, m0, 1 ; upper 128-bit lane
+    paddw                  xm0, xm1
+    movhlps                xm1, xm0
+    paddw                  xm0, xm1 ; low word = sum of the 32 bytes
+    pmulhrsw               xm0, [pw_1024] ; (sum * 1024 + 0x4000) >> 15 == (sum + 16) >> 5
+    vpbroadcastb            m0, xm0
+    mov                   cntd, 4 ; 4 iterations x 8 rows = 32 rows
+.loop:
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+%endif
+%endmacro
+
+INIT_YMM avx2
+DC_1D_AVX2_FUNCS top,  a
+DC_1D_AVX2_FUNCS left, l
+
+; v - copies the row read from a into every row of the block
+
+INIT_MMX mmx
+cglobal vp9_ipred_v_8x8, 4, 4, 0, dst, stride, l, a
+    movq                    m0, [aq] ; 8 bytes from a
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    movq      [dstq+strideq*0], m0 ; 8 identical rows
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+strideq*2], m0
+    movq      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    movq      [dstq+strideq*0], m0
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+strideq*2], m0
+    movq      [dstq+stride3q ], m0
+    RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_16x16, 4, 4, 1, dst, stride, l, a
+    mova                    m0, [aq] ; 16 bytes from a
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 4 ; 4 iterations x 4 rows = 16 rows
+.loop:
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+INIT_XMM sse
+cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [aq] ; first 16 bytes from a
+    mova                    m1, [aq+16] ; next 16 bytes
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 8 ; 8 iterations x 4 rows = 32 rows
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m1
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m1
+    mova   [dstq+strideq*2+ 0], m0
+    mova   [dstq+strideq*2+16], m1
+    mova   [dstq+stride3q + 0], m0
+    mova   [dstq+stride3q +16], m1
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+INIT_YMM avx
+cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a  ; 32x32 "v" predictor (avx): a whole 32-byte row fits in one ymm register
+    mova                    m0, [aq]  ; m0 = a[0..31]
+    DEFINE_ARGS dst, stride, stride3, cnt  ; l and a registers are reused as stride3 and cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 4  ; 4 iterations x 8 rows = 32 rows
+.loop:
+    mova      [dstq+strideq*0], m0  ; first four rows of this iteration
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    mova      [dstq+strideq*0], m0  ; second four rows of this iteration
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop  ; repeat while cnt > 0
+    RET
+
+; h
+
+%macro H_XMM_FUNCS 2  ; "h" predictors for xmm; %1 = xmm register count for h_8x8, %2 = xmm register count for h_16x16 and h_32x32
+%if notcpuflag(avx)  ; h_4x4 is emitted for every instantiation except avx
+cglobal vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3  ; 4x4: each row is a single byte of l repeated four times
+    movd                    m0, [lq]  ; m0 = l[0..3]
+%if cpuflag(ssse3)
+    pshufb                  m0, [pb_4x3_4x2_4x1_4x0]  ; mask defined outside this chunk; by its name presumably 4x byte3, 4x byte2, 4x byte1, 4x byte0
+%else
+    punpcklbw               m0, m0  ; l0 l0 l1 l1 l2 l2 l3 l3
+    pshuflw                 m0, m0, q0123  ; reverse the four low words: l3 l3 l2 l2 l1 l1 l0 l0
+    punpcklwd               m0, m0  ; 4x l3, 4x l2, 4x l1, 4x l0
+%endif
+    lea               stride3q, [strideq*3]
+    movd      [dstq+strideq*0], m0  ; row 0 <- lowest dword (l[3] in the non-ssse3 path)
+    psrldq                  m0, 4  ; shift the next dword into the low position
+    movd      [dstq+strideq*1], m0  ; row 1
+    psrldq                  m0, 4
+    movd      [dstq+strideq*2], m0  ; row 2
+    psrldq                  m0, 4
+    movd      [dstq+stride3q ], m0  ; row 3
+    RET
+%endif
+
+cglobal vp9_ipred_h_8x8, 3, 5, %1, dst, stride, l, stride3, cnt  ; 8x8: four rows per iteration, one l byte replicated over 8 bytes per row
+%if cpuflag(ssse3)
+    mova                    m2, [pb_8x1_8x0]  ; pshufb masks defined outside this chunk
+    mova                    m3, [pb_8x3_8x2]
+%endif
+    lea               stride3q, [strideq*3]
+    mov                   cntq, 1  ; cnt runs 1, 0: l[4..7] feeds rows 0-3, then l[0..3] feeds rows 4-7
+.loop:
+    movd                    m0, [lq+cntq*4]  ; four left bytes b0..b3
+%if cpuflag(ssse3)
+    pshufb                  m1, m0, m3
+    pshufb                  m0, m2
+%else
+    punpcklbw               m0, m0  ; b0 b0 b1 b1 b2 b2 b3 b3
+    punpcklwd               m0, m0  ; dword i = 4x b_i
+    pshufd                  m1, m0, q2233  ; low qword = 8x b3, high qword = 8x b2
+    pshufd                  m0, m0, q0011  ; low qword = 8x b1, high qword = 8x b0
+%endif
+    movq      [dstq+strideq*0], m1  ; low half of m1
+    movhps    [dstq+strideq*1], m1  ; high half of m1
+    movq      [dstq+strideq*2], m0  ; low half of m0
+    movhps    [dstq+stride3q ], m0  ; high half of m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntq
+    jge .loop  ; continue while cnt >= 0 after the decrement
+    RET
+
+cglobal vp9_ipred_h_16x16, 3, 5, %2, dst, stride, l, stride3, cnt  ; 16x16: four rows per iteration, one l byte broadcast to 16 bytes per row
+%if cpuflag(ssse3)
+    mova                    m5, [pb_1]  ; pb_1/pb_2/pb_3 are defined outside this chunk; used below as pshufb masks
+    mova                    m6, [pb_2]
+    mova                    m7, [pb_3]
+    pxor                    m4, m4  ; all-zero mask: pshufb with it broadcasts byte 0
+%endif
+    lea               stride3q, [strideq*3]
+    mov                   cntq, 3  ; cnt runs 3..0, consuming l in groups of four bytes from the highest group down
+.loop:
+    movd                    m3, [lq+cntq*4]  ; four left bytes b0..b3
+%if cpuflag(ssse3)
+    pshufb                  m0, m3, m7
+    pshufb                  m1, m3, m6
+%else
+    punpcklbw               m3, m3
+    punpcklwd               m3, m3  ; dword i = 4x b_i
+    pshufd                  m0, m3, q3333  ; 16x b3
+    pshufd                  m1, m3, q2222  ; 16x b2
+%endif
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m1
+%if cpuflag(ssse3)
+    pshufb                  m2, m3, m5
+    pshufb                  m3, m4  ; 16x b0
+%else
+    pshufd                  m2, m3, q1111  ; 16x b1
+    pshufd                  m3, m3, q0000  ; 16x b0
+%endif
+    mova      [dstq+strideq*2], m2
+    mova      [dstq+stride3q ], m3
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntq
+    jge .loop  ; continue while cnt >= 0 after the decrement
+    RET
+
+cglobal vp9_ipred_h_32x32, 3, 5, %2, dst, stride, l, stride3, cnt  ; 32x32: same scheme as 16x16, each row stored as two identical 16-byte halves
+%if cpuflag(ssse3)
+    mova                    m5, [pb_1]  ; pshufb masks, defined outside this chunk
+    mova                    m6, [pb_2]
+    mova                    m7, [pb_3]
+    pxor                    m4, m4  ; all-zero mask: pshufb with it broadcasts byte 0
+%endif
+    lea               stride3q, [strideq*3]
+    mov                   cntq, 7  ; cnt runs 7..0: 8 iterations x 4 rows = 32 rows
+.loop:
+    movd                    m3, [lq+cntq*4]  ; four left bytes b0..b3
+%if cpuflag(ssse3)
+    pshufb                  m0, m3, m7
+    pshufb                  m1, m3, m6
+%else
+    punpcklbw               m3, m3
+    punpcklwd               m3, m3  ; dword i = 4x b_i
+    pshufd                  m0, m3, q3333  ; 16x b3
+    pshufd                  m1, m3, q2222  ; 16x b2
+%endif
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*1+ 0], m1
+    mova   [dstq+strideq*1+16], m1
+%if cpuflag(ssse3)
+    pshufb                  m2, m3, m5
+    pshufb                  m3, m4  ; 16x b0
+%else
+    pshufd                  m2, m3, q1111  ; 16x b1
+    pshufd                  m3, m3, q0000  ; 16x b0
+%endif
+    mova   [dstq+strideq*2+ 0], m2
+    mova   [dstq+strideq*2+16], m2
+    mova   [dstq+stride3q + 0], m3
+    mova   [dstq+stride3q +16], m3
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntq
+    jge .loop  ; continue while cnt >= 0 after the decrement
+    RET
+%endmacro
+
+INIT_XMM sse2
+H_XMM_FUNCS 2, 4  ; non-ssse3 paths touch m0-m1 (8x8) and m0-m3 (16x16/32x32)
+INIT_XMM ssse3
+H_XMM_FUNCS 4, 8  ; ssse3 paths additionally hold shuffle masks: m0-m3 (8x8), m0-m7 (16x16/32x32)
+INIT_XMM avx
+H_XMM_FUNCS 4, 8  ; avx takes the ssse3 paths; h_4x4 is skipped by the notcpuflag(avx) guard
+
+%if HAVE_AVX2_EXTERNAL  ; only assembled when the build defines HAVE_AVX2_EXTERNAL as non-zero
+INIT_YMM avx2
+cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt  ; 32x32 "h" predictor (avx2): one 32-byte store per row, four rows per iteration
+    mova                    m5, [pb_1]  ; pb_1/pb_2/pb_3 are defined outside this chunk; used below as pshufb masks
+    mova                    m6, [pb_2]
+    mova                    m7, [pb_3]
+    pxor                    m4, m4  ; all-zero mask: pshufb with it broadcasts byte 0 of each 128-bit lane
+    lea               stride3q, [strideq*3]
+    mov                   cntq, 7  ; cnt runs 7..0: 8 iterations x 4 rows = 32 rows
+.loop:
+    movd                   xm3, [lq+cntq*4]  ; four left bytes into the low lane
+    vinserti128             m3, m3, xm3, 1  ; copy the low lane into the high lane so the per-lane pshufb sees the same bytes
+    pshufb                  m0, m3, m7
+    pshufb                  m1, m3, m6
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m1
+    pshufb                  m2, m3, m5
+    pshufb                  m3, m4
+    mova      [dstq+strideq*2], m2
+    mova      [dstq+stride3q ], m3
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntq
+    jge .loop  ; continue while cnt >= 0 after the decrement
+    RET
+%endif
+
+; tm
+
+%macro TM_MMX_FUNCS 0  ; 4x4 "tm" predictor for mmx registers; instantiated below for mmxext and ssse3
+cglobal vp9_ipred_tm_4x4, 4, 4, 0, dst, stride, l, a  ; per pixel: saturate_u8(left + a[x] - a[-1]), computed in 16-bit words
+    pxor                    m1, m1  ; zero, used for byte->word unpacking
+    movd                    m0, [aq]  ; a[0..3]
+    pinsrw                  m2, [aq-1], 0  ; word 0 of m2 = bytes a[-1], a[0]; the remaining words are not initialised here
+    punpcklbw               m0, m1  ; a[0..3] zero-extended to words
+    DEFINE_ARGS dst, stride, l, cnt  ; a's register becomes the loop counter (a is fully consumed above)
+%if cpuflag(ssse3)
+    mova                    m3, [pw_m256]  ; constants defined outside this chunk, used as pshufb masks;
+    mova                    m1, [pw_m255]  ; presumably they broadcast byte 0 / byte 1 into every word, zero-extended - confirm at their definition
+    pshufb                  m2, m3  ; a[-1] in every word
+%else
+    punpcklbw               m2, m1  ; word 0 = a[-1] zero-extended
+    pshufw                  m2, m2, q0000  ; broadcast word 0
+%endif
+    psubw                   m0, m2  ; m0 = a[x] - a[-1]; loop-invariant
+    mov                   cntq, 1  ; cnt runs 1, 0: two rows per iteration
+.loop:
+    pinsrw                  m2, [lq+cntq*2], 0  ; word 0 = bytes l[2*cnt], l[2*cnt+1]
+%if cpuflag(ssse3)
+    pshufb                  m4, m2, m1
+    pshufb                  m2, m3
+%else
+    punpcklbw               m2, m1  ; m1 is still zero on this path
+    pshufw                  m4, m2, q1111  ; l[2*cnt+1] in every word
+    pshufw                  m2, m2, q0000  ; l[2*cnt] in every word
+%endif
+    paddw                   m4, m0  ; left + (a[x] - a[-1])
+    paddw                   m2, m0
+    packuswb                m4, m4  ; pack words to bytes with unsigned saturation
+    packuswb                m2, m2
+    movd      [dstq+strideq*0], m4  ; upper row of the pair is built from m4
+    movd      [dstq+strideq*1], m2  ; lower row of the pair is built from m2
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntq
+    jge .loop  ; continue while cnt >= 0 after the decrement
+    RET
+%endmacro
+
+INIT_MMX mmxext
+TM_MMX_FUNCS  ; takes the punpcklbw/pshufw path
+INIT_MMX ssse3
+TM_MMX_FUNCS  ; takes the pshufb path
+
+%macro TM_XMM_FUNCS 0  ; 8x8, 16x16 and 32x32 "tm" predictors for xmm registers: saturate_u8(left + a[x] - a[-1])
+cglobal vp9_ipred_tm_8x8, 4, 4, 5, dst, stride, l, a  ; 8x8: two rows per iteration
+    pxor                    m1, m1  ; zero, used for byte->word unpacking
+    movh                    m0, [aq]  ; a[0..7]
+    pinsrw                  m2, [aq-1], 0  ; word 0 = bytes a[-1], a[0]
+    punpcklbw               m0, m1  ; a[0..7] zero-extended to words
+    DEFINE_ARGS dst, stride, l, cnt  ; a's register becomes the loop counter
+%if cpuflag(ssse3)
+    mova                    m3, [pw_m256]  ; constants defined outside this chunk, used as pshufb masks;
+    mova                    m1, [pw_m255]  ; presumably byte 0 / byte 1 broadcast to every word, zero-extended - confirm at their definition
+    pshufb                  m2, m3  ; a[-1] in every word
+%else
+    punpcklbw               m2, m1  ; word 0 = a[-1] zero-extended
+    punpcklwd               m2, m2
+    pshufd                  m2, m2, q0000  ; broadcast a[-1] to all eight words
+%endif
+    psubw                   m0, m2  ; m0 = a[x] - a[-1]; loop-invariant
+    mov                   cntq, 3  ; cnt runs 3..0: 4 iterations x 2 rows
+.loop:
+    pinsrw                  m2, [lq+cntq*2], 0  ; word 0 = bytes l[2*cnt], l[2*cnt+1]
+%if cpuflag(ssse3)
+    pshufb                  m4, m2, m1
+    pshufb                  m2, m3
+%else
+    punpcklbw               m2, m1  ; m1 is still zero on this path
+    punpcklwd               m2, m2
+    pshufd                  m4, m2, q1111  ; l[2*cnt+1] in every word
+    pshufd                  m2, m2, q0000  ; l[2*cnt] in every word
+%endif
+    paddw                   m4, m0
+    paddw                   m2, m0
+    packuswb                m4, m2  ; low 8 bytes = row from m4, high 8 bytes = row from m2, both saturated to u8
+    movh      [dstq+strideq*0], m4
+    movhps    [dstq+strideq*1], m4
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntq
+    jge .loop  ; continue while cnt >= 0 after the decrement
+    RET
+
+cglobal vp9_ipred_tm_16x16, 4, 4, 8, dst, stride, l, a  ; 16x16: two rows per iteration
+    pxor                    m3, m3  ; zero, used for byte->word unpacking
+    mova                    m0, [aq]  ; a[0..15]
+    pinsrw                  m2, [aq-1], 0  ; word 0 = bytes a[-1], a[0]
+    punpckhbw               m1, m0, m3  ; a[8..15] as words
+    punpcklbw               m0, m3  ; a[0..7] as words
+    DEFINE_ARGS dst, stride, l, cnt  ; a's register becomes the loop counter
+%if cpuflag(ssse3)
+    mova                    m4, [pw_m256]  ; pshufb masks defined outside this chunk (see hedge in tm_8x8)
+    mova                    m3, [pw_m255]
+    pshufb                  m2, m4  ; a[-1] in every word
+%else
+    punpcklbw               m2, m3
+    punpcklwd               m2, m2
+    pshufd                  m2, m2, q0000  ; broadcast a[-1] to all eight words
+%endif
+    psubw                   m1, m2  ; m1 = a[8..15] - a[-1]
+    psubw                   m0, m2  ; m0 = a[0..7] - a[-1]
+    mov                   cntq, 7  ; cnt runs 7..0: 8 iterations x 2 rows
+.loop:
+    pinsrw                  m7, [lq+cntq*2], 0  ; word 0 = bytes l[2*cnt], l[2*cnt+1]
+%if cpuflag(ssse3)
+    pshufb                  m5, m7, m3
+    pshufb                  m7, m4
+%else
+    punpcklbw               m7, m3  ; m3 is still zero on this path
+    punpcklwd               m7, m7
+    pshufd                  m5, m7, q1111  ; l[2*cnt+1] in every word
+    pshufd                  m7, m7, q0000  ; l[2*cnt] in every word
+%endif
+    paddw                   m2, m5, m0  ; upper row of the pair, columns 0-7
+    paddw                   m5, m1  ; upper row of the pair, columns 8-15
+    paddw                   m6, m7, m0  ; lower row of the pair, columns 0-7
+    paddw                   m7, m1  ; lower row of the pair, columns 8-15
+    packuswb                m2, m5  ; saturate to u8 and join both halves
+    packuswb                m6, m7
+    mova      [dstq+strideq*0], m2
+    mova      [dstq+strideq*1], m6
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntq
+    jge .loop  ; continue while cnt >= 0 after the decrement
+    RET
+
+%if ARCH_X86_64
+%define mem 0  ; x86-64: m8-m13 are used below, nothing is spilled
+%else
+%define mem 64  ; x86-32: 64 bytes at [rsp] hold the four loop-invariant difference vectors
+%endif
+cglobal vp9_ipred_tm_32x32, 4, 4, 14, mem, dst, stride, l, a  ; 32x32: two rows per iteration; mem is passed in cglobal's stack-size position
+    pxor                    m5, m5  ; zero, used for byte->word unpacking
+    pinsrw                  m4, [aq-1], 0  ; word 0 = bytes a[-1], a[0]
+    mova                    m0, [aq]  ; a[0..15]
+    mova                    m2, [aq+16]  ; a[16..31]
+    DEFINE_ARGS dst, stride, l, cnt  ; a's register becomes the loop counter
+%if cpuflag(ssse3)
+%if ARCH_X86_64
+    mova                   m12, [pw_m256]  ; keep both pshufb masks in registers on x86-64
+    mova                   m13, [pw_m255]
+%define pw_m256_reg m12
+%define pw_m255_reg m13
+%else
+%define pw_m256_reg [pw_m256]
+%define pw_m255_reg [pw_m255]
+%endif
+    pshufb                  m4, pw_m256_reg  ; a[-1] in every word (mask semantics: see hedge in tm_8x8)
+%else
+    punpcklbw               m4, m5
+    punpcklwd               m4, m4
+    pshufd                  m4, m4, q0000  ; broadcast a[-1] to all eight words
+%endif
+    punpckhbw               m1, m0,  m5  ; a[8..15] as words
+    punpckhbw               m3, m2,  m5  ; a[24..31] as words
+    punpcklbw               m0, m5  ; a[0..7] as words
+    punpcklbw               m2, m5  ; a[16..23] as words
+    psubw                   m1, m4  ; subtract a[-1] from all four vectors
+    psubw                   m0, m4
+    psubw                   m3, m4
+    psubw                   m2, m4
+%if ARCH_X86_64
+    SWAP                     0, 8  ; x86inc SWAP renames registers: the four difference vectors are now called m8-m11
+    SWAP                     1, 9
+    SWAP                     2, 10
+    SWAP                     3, 11
+%else
+    mova            [rsp+0*16], m0  ; spill the four difference vectors to the reserved stack area
+    mova            [rsp+1*16], m1
+    mova            [rsp+2*16], m2
+    mova            [rsp+3*16], m3
+%endif
+    mov                   cntq, 15  ; cnt runs 15..0: 16 iterations x 2 rows
+.loop:
+    pinsrw                  m3, [lq+cntq*2], 0  ; word 0 = bytes l[2*cnt], l[2*cnt+1]
+%if cpuflag(ssse3)
+    pshufb                  m7, m3, pw_m255_reg
+    pshufb                  m3, pw_m256_reg
+%else
+    pxor                    m7, m7  ; fresh zero: m5 is overwritten inside the loop
+    punpcklbw               m3, m7
+    punpcklwd               m3, m3
+    pshufd                  m7, m3, q1111  ; l[2*cnt+1] in every word
+    pshufd                  m3, m3, q0000  ; l[2*cnt] in every word
+%endif
+%if ARCH_X86_64
+    paddw                   m4, m7, m8  ; upper row of the pair: m4..m7 = columns 0-7, 8-15, 16-23, 24-31
+    paddw                   m5, m7, m9
+    paddw                   m6, m7, m10
+    paddw                   m7, m11
+    paddw                   m0, m3, m8  ; lower row of the pair: m0..m3
+    paddw                   m1, m3, m9
+    paddw                   m2, m3, m10
+    paddw                   m3, m11
+%else
+    paddw                   m4, m7, [rsp+0*16]  ; same sums, reading the spilled difference vectors
+    paddw                   m5, m7, [rsp+1*16]
+    paddw                   m6, m7, [rsp+2*16]
+    paddw                   m7, [rsp+3*16]
+    paddw                   m0, m3, [rsp+0*16]
+    paddw                   m1, m3, [rsp+1*16]
+    paddw                   m2, m3, [rsp+2*16]
+    paddw                   m3, [rsp+3*16]
+%endif
+    packuswb                m4, m5  ; saturate to u8; each result holds 16 output bytes
+    packuswb                m6, m7
+    packuswb                m0, m1
+    packuswb                m2, m3
+    mova   [dstq+strideq*0+ 0], m4
+    mova   [dstq+strideq*0+16], m6
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m2
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntq
+    jge .loop  ; continue while cnt >= 0 after the decrement
+    RET
+%undef pw_m256_reg
+%undef pw_m255_reg
+%undef mem
+%endmacro
+
+INIT_XMM sse2
+TM_XMM_FUNCS  ; unpack/pshufd broadcast paths
+INIT_XMM ssse3
+TM_XMM_FUNCS  ; pshufb broadcast paths
+INIT_XMM avx
+TM_XMM_FUNCS  ; avx also satisfies cpuflag(ssse3), so it takes the pshufb paths
+
+%if HAVE_AVX2_EXTERNAL  ; only assembled when the build defines HAVE_AVX2_EXTERNAL as non-zero
+INIT_YMM avx2
+cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a  ; 32x32 "tm" predictor (avx2): saturate_u8(left + a[x] - a[-1]), two rows per iteration
+    pxor                    m3, m3  ; zero, used for byte->word unpacking
+    pinsrw                 xm2, [aq-1], 0  ; word 0 of the low lane = bytes a[-1], a[0]
+    vinserti128             m2, m2, xm2, 1  ; duplicate the low lane into the high lane for the per-lane pshufb
+    mova                    m0, [aq]  ; a[0..31]
+    DEFINE_ARGS dst, stride, l, cnt  ; a's register becomes the loop counter
+    mova                    m4, [pw_m256]  ; constants defined outside this chunk, used as pshufb masks;
+    mova                    m5, [pw_m255]  ; presumably byte 0 / byte 1 broadcast to every word, zero-extended - confirm at their definition
+    pshufb                  m2, m4  ; a[-1] in every word
+    punpckhbw               m1, m0, m3  ; per 128-bit lane: high 8 bytes as words (a[8..15] | a[24..31])
+    punpcklbw               m0, m3  ; per 128-bit lane: low 8 bytes as words (a[0..7] | a[16..23])
+    psubw                   m1, m2  ; subtract a[-1]
+    psubw                   m0, m2
+    mov                   cntq, 15  ; cnt runs 15..0: 16 iterations x 2 rows
+.loop:
+    pinsrw                 xm7, [lq+cntq*2], 0  ; word 0 = bytes l[2*cnt], l[2*cnt+1]
+    vinserti128             m7, m7, xm7, 1  ; duplicate into the high lane
+    pshufb                  m3, m7, m5
+    pshufb                  m7, m4
+    paddw                   m2, m3, m0  ; upper row of the pair
+    paddw                   m3, m1
+    paddw                   m6, m7, m0  ; lower row of the pair
+    paddw                   m7, m1
+    packuswb                m2, m3  ; per-lane pack undoes the per-lane unpack, restoring column order
+    packuswb                m6, m7
+    mova      [dstq+strideq*0], m2
+    mova      [dstq+strideq*1], m6
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntq
+    jge .loop  ; continue while cnt >= 0 after the decrement
+    RET
+%endif
+
+; dl
+
+%macro LOWPASS 4 ; left [dst], center, right, tmp
+    pxor                   m%4, m%1, m%3  ; tmp = left ^ right
+    pand                   m%4, [pb_1]  ; keep bit 0 (pb_1 is defined outside this chunk, presumably all bytes = 1): set where left+right is odd
+    pavgb                  m%1, m%3  ; (left + right + 1) >> 1
+    psubusb                m%1, m%4  ; remove the rounding-up -> (left + right) >> 1
+    pavgb                  m%1, m%2  ; dst = (((left + right) >> 1) + center + 1) >> 1, i.e. (left + 2*center + right + 2) >> 2 per byte
+%endmacro
+
+%macro DL_MMX_FUNCS 0  ; 4x4 "dl" predictor for mmx registers; instantiated below for mmxext and ssse3
+cglobal vp9_ipred_dl_4x4, 4, 4, 0, dst, stride, l, a  ; reads only a (8 bytes); row y is the filtered edge starting at byte y
+    movq                    m1, [aq]  ; a[0..7]
+%if cpuflag(ssse3)
+    pshufb                  m0, m1, [pb_0to5_2x7]  ; masks defined outside this chunk; by name they build the same
+    pshufb                  m2, m1, [pb_2to6_3x7]  ; 01234577 / 23456777 vectors as the non-ssse3 path below
+%else
+    punpckhbw               m3, m1, m1              ; 44556677
+    pand                    m0, m1, [pb_6xm1_2x0]   ; 012345__
+    pand                    m3, [pb_6x0_2xm1]       ; ______77
+    psrlq                   m2, m1, 16              ; 234567__
+    por                     m0, m3                  ; 01234577
+    por                     m2, m3                  ; 23456777
+%endif
+    psrlq                   m1, 8  ; 1234567_ : the centre taps
+    LOWPASS                  0, 1, 2, 3  ; m0[i] = 3-tap filter over edge bytes i, i+1, i+2
+
+    pshufw                  m1, m0, q3321  ; m1 = m0 moved down by two bytes (top word repeated)
+    movd      [dstq+strideq*0], m0  ; row 0 = m0 bytes 0-3
+    movd      [dstq+strideq*2], m1  ; row 2 = m0 bytes 2-5
+    psrlq                   m0, 8  ; advance both copies by one byte
+    psrlq                   m1, 8
+    add                   dstq, strideq  ; dst now points at row 1
+    movd      [dstq+strideq*0], m0  ; row 1 = original m0 bytes 1-4
+    movd      [dstq+strideq*2], m1  ; row 3 = original m0 bytes 3-6
+    RET
+%endmacro
+
+INIT_MMX mmxext
+DL_MMX_FUNCS  ; and/or masking path
+INIT_MMX ssse3
+DL_MMX_FUNCS  ; pshufb path
+
+%macro DL_XMM_FUNCS 0  ; 8x8, 16x16 and 32x32 "dl" predictors for xmm registers; all read only the a edge
+cglobal vp9_ipred_dl_8x8, 4, 4, 4, dst, stride, stride5, a  ; the 3rd argument register (l in the sibling functions) is used as stride5
+    movq                    m0, [aq]  ; a[0..7]
+    lea               stride5q, [strideq*5]
+%if cpuflag(ssse3)
+    pshufb                  m1, m0, [pb_1to6_10x7]  ; mask defined outside this chunk; by name: a[1..6] followed by ten copies of a[7]
+%else
+    punpcklbw               m1, m0, m0              ; 0011223344556677
+    punpckhwd               m1, m1                  ; 4x4,4x5,4x6,4x7
+%endif
+    shufps                  m0, m1, q3310  ; m0 = own dwords 0,1 then m1's top dword twice: a[0..7] followed by 8x a[7]
+%if notcpuflag(ssse3)
+    psrldq                  m1, m0, 1  ; non-ssse3: build the centre taps from m0 shifted down one byte
+    shufps                  m1, m0, q3210  ; keep the shifted low half, take the all-a[7] high half from m0
+%endif
+    psrldq                  m2, m1, 1  ; right taps
+    LOWPASS                  0, 1, 2, 3  ; m0 = 3-tap filtered edge
+
+    pshufd                  m1, m0, q3321  ; m1 = m0 moved down by four bytes (top dword repeated)
+    movq      [dstq+strideq*0], m0  ; row 0
+    movq      [dstq+strideq*4], m1  ; row 4
+    psrldq                  m0, 1  ; each further row starts one byte later
+    psrldq                  m1, 1
+    movq      [dstq+strideq*1], m0  ; row 1
+    movq      [dstq+stride5q ], m1  ; row 5
+    lea                   dstq, [dstq+strideq*2]
+    psrldq                  m0, 1
+    psrldq                  m1, 1
+    movq      [dstq+strideq*0], m0  ; row 2
+    movq      [dstq+strideq*4], m1  ; row 6
+    psrldq                  m0, 1
+    psrldq                  m1, 1
+    movq      [dstq+strideq*1], m0  ; row 3
+    movq      [dstq+stride5q ], m1  ; row 7
+    RET
+
+cglobal vp9_ipred_dl_16x16, 4, 4, 6, dst, stride, l, a  ; 16x16: rows y and y+8 are produced together, two such pairs per iteration
+    mova                    m0, [aq]  ; a[0..15]
+%if cpuflag(ssse3)
+    mova                    m5, [pb_1toE_2xF]  ; mask defined outside this chunk; by name: shift down one byte, repeating the last byte
+    pshufb                  m1, m0, m5
+    pshufb                  m2, m1, m5
+    pshufb                  m4, m0, [pb_15]  ; pb_15 defined outside this chunk; presumably broadcasts byte 15
+%else
+    pand                    m5, m0, [pb_15x0_1xm1]      ; _______________F
+    psrldq                  m1, m0, 1                   ; 123456789ABCDEF_
+    por                     m1, m5                      ; 123456789ABCDEFF
+    psrldq                  m2, m1, 1                   ; 23456789ABCDEFF_
+    por                     m2, m5                      ; 23456789ABCDEFFF
+    pshufhw                 m4, m1, q3333               ; xxxxxxxxFFFFFFFF
+%endif
+    LOWPASS                  0, 1, 2, 3  ; m0 = 3-tap filtered edge
+    DEFINE_ARGS dst, stride, cnt, stride9  ; l and a registers are reused as cnt and stride9
+    lea               stride9q, [strideq+strideq*8]
+    mov                   cntd, 4  ; 4 iterations x (2 rows + 2 rows eight below)
+
+.loop:
+    movhlps                 m4, m0  ; m4 = high half of m0 followed by the replicated last byte
+    mova      [dstq+strideq*0], m0  ; row y
+%if cpuflag(ssse3)
+    pshufb                  m0, m5  ; advance one byte, repeating the last byte
+%else
+    psrldq                  m0, 1  ; advance one byte ...
+    por                     m0, m5  ; ... and re-insert the last byte
+%endif
+    mova      [dstq+strideq*8], m4  ; row y+8
+    movhlps                 m4, m0
+    mova      [dstq+strideq*1], m0  ; row y+1
+%if cpuflag(ssse3)
+    pshufb                  m0, m5
+%else
+    psrldq                  m0, 1
+    por                     m0, m5
+%endif
+    mova      [dstq+stride9q ], m4  ; row y+9
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntd
+    jg .loop  ; repeat while cnt > 0
+    RET
+
+cglobal vp9_ipred_dl_32x32, 4, 5, 8, dst, stride, cnt, a, dst16  ; 32x32: each iteration writes rows y, y+8, y+16 and y+24
+    mova                    m0, [aq]  ; a[0..15]
+    mova                    m1, [aq+16]  ; a[16..31]
+    PALIGNR                 m2, m1, m0, 1, m4  ; PALIGNR is defined outside this chunk (presumably the palignr wrapper with a temp for pre-ssse3): a[1..16]
+    PALIGNR                 m3, m1, m0, 2, m4  ; a[2..17]
+    LOWPASS                  0, 2, 3, 4  ; m0 = filtered first 16 edge bytes
+%if cpuflag(ssse3)
+    mova                    m5, [pb_1toE_2xF]  ; shift-down-one-byte mask that repeats the last byte (defined outside this chunk)
+    pshufb                  m2, m1, m5
+    pshufb                  m3, m2, m5
+    pshufb                  m6, m1, [pb_15]  ; m6 = last edge byte in all 16 positions (presumed from the constant's name)
+    mova                    m7, m6
+%else
+    pand                    m5, m1, [pb_15x0_1xm1]      ; _______________F
+    psrldq                  m2, m1, 1                   ; 123456789ABCDEF_
+    por                     m2, m5                      ; 123456789ABCDEFF
+    psrldq                  m3, m2, 1                   ; 23456789ABCDEFF_
+    por                     m3, m5                      ; 23456789ABCDEFFF
+    pshufhw                 m7, m2, q3333               ; xxxxxxxxFFFFFFFF
+    pshufd                  m6, m7, q3333  ; m6 = last byte in all 16 positions
+%endif
+    LOWPASS                  1, 2, 3, 4  ; m1 = filtered second 16 edge bytes
+    lea                 dst16q, [dstq  +strideq*8]
+    mov                   cntd, 8  ; 8 iterations
+    lea                 dst16q, [dst16q+strideq*8]  ; dst16 = dst + 16 rows
+.loop:
+    movhlps                 m7, m1  ; m7 = high half of m1 followed by the replicated last byte
+    mova [dstq  +strideq*0+ 0], m0  ; row y: m0 | m1
+    mova [dstq  +strideq*0+16], m1
+    movhps [dstq+strideq*8+ 0], m0  ; row y+8: high half of m0, low half of m1, then m7
+    movq [dstq  +strideq*8+ 8], m1
+    mova [dstq  +strideq*8+16], m7
+    mova [dst16q+strideq*0+ 0], m1  ; row y+16: m1 | replicated last byte
+    mova [dst16q+strideq*0+16], m6
+    mova [dst16q+strideq*8+ 0], m7  ; row y+24: m7 | replicated last byte
+    mova [dst16q+strideq*8+16], m6
+%if cpuflag(avx)
+    vpalignr                m0, m1, m0, 1  ; advance the 32-byte edge by one byte: m0 pulls in m1's lowest byte
+    pshufb                  m1, m5  ; m1 shifts down, repeating its last byte
+%elif cpuflag(ssse3)
+    palignr                 m2, m1, m0, 1  ; two-operand form needs a temporary
+    pshufb                  m1, m5
+    mova                    m0, m2
+%else
+    mova                    m4, m1
+    psrldq                  m0, 1
+    pslldq                  m4, 15  ; m1's lowest byte moved to position 15
+    psrldq                  m1, 1
+    por                     m0, m4  ; m0 = (m1:m0) >> 1 byte
+    por                     m1, m5  ; re-insert the last byte at the top of m1
+%endif
+    add                   dstq, strideq
+    add                 dst16q, strideq
+    dec                   cntd
+    jg .loop  ; repeat while cnt > 0
+    RET
+%endmacro
+
+INIT_XMM sse2
+DL_XMM_FUNCS  ; shift/or paths
+INIT_XMM ssse3
+DL_XMM_FUNCS  ; pshufb/palignr paths
+INIT_XMM avx
+DL_XMM_FUNCS  ; as ssse3, plus the three-operand vpalignr in dl_32x32
+
+; dr
+
+%macro DR_MMX_FUNCS 0  ; 4x4 "dr" predictor for mmx registers; instantiated below for mmxext and ssse3
+cglobal vp9_ipred_dr_4x4, 4, 4, 0, dst, stride, l, a  ; filters the joined edge l[0..3], a[-1], a[0..] and lays it out diagonally
+    movd                    m0, [lq]  ; l[0..3]
+    punpckldq               m0, [aq-1]  ; m0 = l0 l1 l2 l3 a[-1] a0 a1 a2
+    movd                    m1, [aq+3]  ; a[3..6]
+    DEFINE_ARGS dst, stride, stride3  ; l's register is reused as stride3
+    lea               stride3q, [strideq*3]
+    PALIGNR                 m1, m0, 1, m3  ; PALIGNR is defined outside this chunk; result is the edge advanced one byte: l1 l2 l3 a[-1] a0 a1 a2 a3
+    psrlq                   m2, m1, 8  ; edge advanced two bytes
+    LOWPASS                  0, 1, 2, 3  ; m0[i] = 3-tap filter over edge bytes i, i+1, i+2
+
+    movd      [dstq+stride3q ], m0  ; row 3 = filtered bytes 0-3
+    psrlq                   m0, 8
+    movd      [dstq+strideq*2], m0  ; row 2 = filtered bytes 1-4
+    psrlq                   m0, 8
+    movd      [dstq+strideq*1], m0  ; row 1 = filtered bytes 2-5
+    psrlq                   m0, 8
+    movd      [dstq+strideq*0], m0  ; row 0 = filtered bytes 3-6
+    RET
+%endmacro
+
+INIT_MMX mmxext
+DR_MMX_FUNCS
+INIT_MMX ssse3
+DR_MMX_FUNCS  ; the body has no cpuflag branches; any difference comes from PALIGNR's own expansion
+
+%macro DR_XMM_FUNCS 0  ; 8x8, 16x16 and 32x32 "dr" predictors for xmm registers
+cglobal vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a  ; filters the joined edge l[0..7], a[-1], a[0..] and lays it out diagonally
+    movq                    m1, [lq]  ; l[0..7]
+    movhps                  m1, [aq-1]  ; m1 = l[0..7], a[-1], a[0..6]
+    movd                    m2, [aq+7]  ; a[7..10]
+    DEFINE_ARGS dst, stride, stride3  ; l's register is reused as stride3
+    lea               stride3q, [strideq*3]
+    pslldq                  m0, m1, 1  ; edge delayed one byte (zero shifted in at byte 0)
+    PALIGNR                 m2, m1, 1, m3  ; PALIGNR is defined outside this chunk; edge advanced one byte, a[7] entering at the top
+    LOWPASS                  0, 1, 2, 3  ; m0[i] = 3-tap filter centred on edge byte i
+
+    movhps    [dstq+strideq*0], m0  ; row 0 = filtered bytes 8-15
+    pslldq                  m0, 1  ; each following row starts one edge byte earlier
+    movhps    [dstq+strideq*1], m0
+    pslldq                  m0, 1
+    movhps    [dstq+strideq*2], m0
+    pslldq                  m0, 1
+    movhps    [dstq+stride3q ], m0
+    pslldq                  m0, 1
+    lea                   dstq, [dstq+strideq*4]
+    movhps    [dstq+strideq*0], m0
+    pslldq                  m0, 1
+    movhps    [dstq+strideq*1], m0
+    pslldq                  m0, 1
+    movhps    [dstq+strideq*2], m0
+    pslldq                  m0, 1
+    movhps    [dstq+stride3q ], m0  ; row 7 = filtered bytes 1-8
+    RET
+
+cglobal vp9_ipred_dr_16x16, 4, 4, 6, dst, stride, l, a  ; 16x16: rows y and y+8 are produced together, two such pairs per iteration
+    mova                    m1, [lq]  ; l[0..15]
+    movu                    m2, [aq-1]  ; a[-1..14] (unaligned load)
+    movd                    m4, [aq+15]  ; a[15..18]
+    DEFINE_ARGS dst, stride, stride9, cnt  ; l and a registers are reused as stride9 and cnt
+    lea               stride9q, [strideq *3]
+    mov                   cntd, 4
+    lea               stride9q, [stride9q*3]  ; stride9 = 9 * stride, built as 3 * (3 * stride)
+    PALIGNR                 m4, m2, 1, m5  ; a[0..15]
+    PALIGNR                 m3, m2, m1, 15, m5  ; l[15], a[-1..13]
+    LOWPASS                  3,  2, 4, 5  ; m3 = filtered a-side part of the edge (centres a[-1..14])
+    pslldq                  m0, m1, 1  ; l delayed one byte
+    PALIGNR                 m2, m1, 1, m4  ; l[1..15], a[-1]
+    LOWPASS                  0,  1, 2, 4  ; m0 = filtered l-side part of the edge (centres l[0..15])
+
+.loop:
+    mova    [dstq+strideq*0  ], m3  ; row y
+    movhps  [dstq+strideq*8+0], m0  ; row y+8: high half of m0 ...
+    movq    [dstq+strideq*8+8], m3  ; ... followed by low half of m3
+    PALIGNR                 m3, m0, 15, m1  ; slide the window one edge byte earlier: m3 takes m0's top byte at its bottom
+    pslldq                  m0, 1
+    mova    [dstq+strideq*1  ], m3  ; row y+1
+    movhps  [dstq+stride9q +0], m0  ; row y+9
+    movq    [dstq+stride9q +8], m3
+    PALIGNR                 m3, m0, 15, m1
+    pslldq                  m0, 1
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntd
+    jg .loop  ; repeat while cnt > 0
+    RET
+
+cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a  ; 32x32: rows y and y+16 are produced together, one pair per iteration
+    mova                    m1, [lq]  ; l[0..15]
+    mova                    m2, [lq+16]  ; l[16..31]
+    movu                    m3, [aq-1]  ; a[-1..14]
+    movu                    m4, [aq+15]  ; a[15..30]
+    movd                    m5, [aq+31]  ; a[31..34]
+    DEFINE_ARGS dst, stride, stride8, cnt  ; l and a registers are reused as stride8 and cnt
+    lea               stride8q, [strideq*8]
+    PALIGNR                 m5, m4, 1, m7  ; next-byte neighbours of m4
+    PALIGNR                 m6, m4, m3, 15, m7  ; previous-byte neighbours of m4
+    LOWPASS                  5,  4,  6,  7  ; m5 = filtered segment centred on m4's bytes
+    PALIGNR                 m4, m3, 1, m7
+    PALIGNR                 m6, m3, m2, 15, m7
+    LOWPASS                  4,  3,  6,  7  ; m4 = filtered segment centred on m3's bytes
+    PALIGNR                 m3, m2, 1, m7
+    PALIGNR                 m6, m2, m1, 15, m7
+    LOWPASS                  3,  2,  6,  7  ; m3 = filtered segment centred on m2's bytes
+    PALIGNR                 m2, m1, 1, m6
+    pslldq                  m0, m1, 1
+    LOWPASS                  2,  1,  0,  6  ; m2 = filtered segment centred on m1's bytes
+    mov                   cntd, 16  ; 16 iterations x (row y, row y+16)
+
+    ; out=m2/m3/m4/m5
+.loop:
+    mova  [dstq+stride8q*0+ 0], m4  ; row y: m4 | m5
+    mova  [dstq+stride8q*0+16], m5
+    mova  [dstq+stride8q*2+ 0], m3  ; row y+16: m3 | m4
+    mova  [dstq+stride8q*2+16], m4
+    PALIGNR                 m5, m4, 15, m6  ; slide the whole 64-byte filtered edge one byte earlier
+    PALIGNR                 m4, m3, 15, m6
+    PALIGNR                 m3, m2, 15, m6
+    pslldq                  m2, 1
+    add                   dstq, strideq
+    dec                   cntd
+    jg .loop  ; repeat while cnt > 0
+    RET
+%endmacro
+
+INIT_XMM sse2
+DR_XMM_FUNCS
+INIT_XMM ssse3
+DR_XMM_FUNCS
+INIT_XMM avx
+DR_XMM_FUNCS  ; the bodies have no cpuflag branches; variants differ only through the macros/instructions the flags select
+
+; vl
+
+INIT_MMX mmxext
+cglobal vp9_ipred_vl_4x4, 4, 4, 0, dst, stride, l, a  ; 4x4 "vl" predictor: reads only a; even rows use 2-tap averages, odd rows the 3-tap LOWPASS
+    movq                    m0, [aq]  ; a[0..7]
+    psrlq                   m1, m0, 8  ; a[1..7]
+    psrlq                   m2, m1, 8  ; a[2..7]
+    LOWPASS                  2,  1, 0, 3  ; m2[i] = 3-tap filter over a[i], a[i+1], a[i+2]
+    pavgb                   m1, m0  ; m1[i] = (a[i] + a[i+1] + 1) >> 1
+    movd      [dstq+strideq*0], m1  ; row 0 = averages 0-3
+    movd      [dstq+strideq*1], m2  ; row 1 = filtered 0-3
+    lea                   dstq, [dstq+strideq*2]
+    psrlq                   m1, 8  ; advance both by one byte
+    psrlq                   m2, 8
+    movd      [dstq+strideq*0], m1  ; row 2 = averages 1-4
+    movd      [dstq+strideq*1], m2  ; row 3 = filtered 1-4
+    RET
+
+%macro VL_XMM_FUNCS 0  ; 8x8, 16x16 and 32x32 "vl" predictors for xmm registers; all read only the a edge
+cglobal vp9_ipred_vl_8x8, 4, 4, 4, dst, stride, l, a  ; even rows: 2-tap averages, odd rows: 3-tap LOWPASS, advancing one byte every two rows
+    movq                    m0, [aq]  ; a[0..7]
+%if cpuflag(ssse3)
+    pshufb                  m0, [pb_0to6_9x7]  ; mask defined outside this chunk; by name: a[0..6] followed by nine copies of a[7]
+%else
+    punpcklbw               m1, m0, m0  ; each byte doubled
+    punpckhwd               m1, m1  ; top dword = 4x a[7]
+    shufps                  m0, m1, q3310  ; a[0..7] followed by 8x a[7]
+%endif
+    DEFINE_ARGS dst, stride, stride3  ; l's register is reused as stride3
+    lea               stride3q, [strideq*3]
+    psrldq                  m1, m0, 1  ; edge advanced one byte
+    psrldq                  m2, m0, 2  ; edge advanced two bytes
+    LOWPASS                  2,  1,  0,  3  ; m2 = 3-tap filtered edge
+    pavgb                   m1, m0  ; m1 = 2-tap averaged edge
+
+    movq      [dstq+strideq*0], m1  ; row 0
+    movq      [dstq+strideq*1], m2  ; row 1
+    psrldq                  m1, 1  ; advance one byte for the next row pair
+    psrldq                  m2, 1
+    movq      [dstq+strideq*2], m1  ; row 2
+    movq      [dstq+stride3q ], m2  ; row 3
+    lea                   dstq, [dstq+strideq*4]
+    psrldq                  m1, 1
+    psrldq                  m2, 1
+    movq      [dstq+strideq*0], m1  ; row 4
+    movq      [dstq+strideq*1], m2  ; row 5
+    psrldq                  m1, 1
+    psrldq                  m2, 1
+    movq      [dstq+strideq*2], m1  ; row 6
+    movq      [dstq+stride3q ], m2  ; row 7
+    RET
+
+cglobal vp9_ipred_vl_16x16, 4, 4, 5, dst, stride, l, a  ; 16x16: four rows per iteration
+    mova                    m0, [aq]  ; a[0..15]
+    DEFINE_ARGS dst, stride, stride3, cnt  ; l and a registers are reused as stride3 and cnt
+    lea               stride3q, [strideq*3]
+%if cpuflag(ssse3)
+    mova                    m4, [pb_1toE_2xF]  ; mask defined outside this chunk; by name: shift down one byte, repeating the last byte
+    pshufb                  m1, m0, m4
+    pshufb                  m2, m1, m4
+%else
+    pand                    m4, m0, [pb_15x0_1xm1]  ; _______________F
+    psrldq                  m1, m0, 1               ; 123456789ABCDEF_
+    por                     m1, m4                  ; 123456789ABCDEFF
+    psrldq                  m2, m1, 1               ; 23456789ABCDEFF_
+    por                     m2, m4                  ; 23456789ABCDEFFF
+%endif
+    LOWPASS                  2,  1,  0, 3  ; m2 = 3-tap filtered edge
+    pavgb                   m1, m0  ; m1 = 2-tap averaged edge
+    mov                   cntd, 4  ; 4 iterations x 4 rows
+.loop:
+    mova      [dstq+strideq*0], m1
+    mova      [dstq+strideq*1], m2
+%if cpuflag(ssse3)
+    pshufb                  m1, m4  ; advance one byte, repeating the last byte
+    pshufb                  m2, m4
+%else
+    psrldq                  m1, 1  ; advance one byte ...
+    psrldq                  m2, 1
+    por                     m1, m4  ; ... and re-insert the last a byte at the top
+    por                     m2, m4
+%endif
+    mova      [dstq+strideq*2], m1
+    mova      [dstq+stride3q ], m2
+%if cpuflag(ssse3)
+    pshufb                  m1, m4
+    pshufb                  m2, m4
+%else
+    psrldq                  m1, 1
+    psrldq                  m2, 1
+    por                     m1, m4
+    por                     m2, m4
+%endif
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop  ; repeat while cnt > 0
+    RET
+
+cglobal vp9_ipred_vl_32x32, 4, 4, 7, dst, stride, l, a  ; 32x32: each iteration writes rows y, y+1 and rows y+16, y+17
+    mova                    m0, [aq]  ; a[0..15]
+    mova                    m5, [aq+16]  ; a[16..31]
+    DEFINE_ARGS dst, stride, dst16, cnt  ; l and a registers are reused as dst16 and cnt
+    PALIGNR                 m2, m5, m0, 1, m4  ; PALIGNR is defined outside this chunk: a[1..16]
+    PALIGNR                 m3, m5, m0, 2, m4  ; a[2..17]
+    lea                 dst16q, [dstq  +strideq*8]
+    LOWPASS                  3,  2,  0, 6  ; m3 = 3-tap filtered, first 16 bytes
+    pavgb                   m2, m0  ; m2 = 2-tap averaged, first 16 bytes
+%if cpuflag(ssse3)
+    mova                    m4, [pb_1toE_2xF]  ; shift-down-one-byte mask that repeats the last byte (defined outside this chunk)
+    pshufb                  m0, m5, m4
+    pshufb                  m1, m0, m4
+%else
+    pand                    m4, m5, [pb_15x0_1xm1]  ; _______________F
+    psrldq                  m0, m5, 1               ; 123456789ABCDEF_
+    por                     m0, m4                  ; 123456789ABCDEFF
+    psrldq                  m1, m0, 1               ; 23456789ABCDEFF_
+    por                     m1, m4                  ; 23456789ABCDEFFF
+%endif
+    lea                 dst16q, [dst16q+strideq*8]  ; dst16 = dst + 16 rows
+    LOWPASS                  1,  0,  5, 6  ; m1 = 3-tap filtered, second 16 bytes
+    pavgb                   m0, m5  ; m0 = 2-tap averaged, second 16 bytes
+%if cpuflag(ssse3)
+    pshufb                  m5, [pb_15]  ; m5 = a[31] in all 16 positions (presumed from the constant's name)
+%else
+    punpckhbw               m5, m4, m4  ; m4 holds only the last byte, so the top word becomes that byte twice
+    pshufhw                 m5, m5, q3333  ; spread that word over the high qword
+    punpckhqdq              m5, m5  ; and over the whole register
+%endif
+    mov                   cntd, 8  ; 8 iterations x 2 row pairs
+
+.loop:
+%macro %%write 3  ; local helper: %1 = suffix pasted after "stride" (q*0 or q*1), %2/%3 = low/high 16-byte halves of the row
+    mova    [dstq+stride%1+ 0], %2  ; row y: both halves
+    mova    [dstq+stride%1+16], %3
+    movhps  [dst16q+stride%1 ], %2  ; row y+16: high half of %2 at columns 0-7
+    movu  [dst16q+stride%1+ 8], %3  ; all of %3 at columns 8-23 (unaligned store)
+    movq  [dst16q+stride%1+24], m5  ; replicated last byte at columns 24-31
+%if cpuflag(avx)
+    palignr                 %2, %3, %2, 1  ; advance the 32-byte row by one byte
+    pshufb                  %3, m4  ; high half shifts down, repeating its last byte
+%elif cpuflag(ssse3)
+    palignr                 m6, %3, %2, 1  ; two-operand form needs a temporary
+    pshufb                  %3, m4
+    mova                    %2, m6
+%else
+    pslldq                  m6, %3, 15  ; lowest byte of %3 moved to position 15
+    psrldq                  %3, 1
+    psrldq                  %2, 1
+    por                     %3, m4  ; re-insert the last a byte at the top
+    por                     %2, m6  ; %2 = (%3:%2) >> 1 byte
+%endif
+%endmacro
+
+    %%write                q*0, m2, m0  ; averaged row (even row of the pair)
+    %%write                q*1, m3, m1  ; filtered row (odd row of the pair)
+    lea                   dstq, [dstq  +strideq*2]
+    lea                 dst16q, [dst16q+strideq*2]
+    dec                   cntd
+    jg .loop  ; repeat while cnt > 0
+    RET
+%endmacro
+
+INIT_XMM sse2
+VL_XMM_FUNCS  ; shift/or paths
+INIT_XMM ssse3
+VL_XMM_FUNCS  ; pshufb/palignr paths
+INIT_XMM avx
+VL_XMM_FUNCS  ; as ssse3, plus the three-operand palignr in vl_32x32
+
+; vr
+
+%macro VR_MMX_FUNCS 0  ; 4x4 "vr" predictor for mmx registers; instantiated below for mmxext and ssse3
+cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a  ; row 0: 2-tap averages of the a edge; other rows mix 3-tap filtered a and l values (see layout below)
+    movq                    m1, [aq-1]  ; a[-1..6]
+    punpckldq               m2, [lq]  ; high dword of m2 = l[0..3]; the low dword is whatever m2 held before
+    movd                    m0, [aq]  ; a[0..3]
+    DEFINE_ARGS dst, stride, stride3  ; l's register is reused as stride3
+    lea               stride3q, [strideq*3]
+    pavgb                   m0, m1  ; m0[i] = (a[i] + a[i-1] + 1) >> 1 for i = 0..3
+    PALIGNR                 m1, m2, 5, m3  ; PALIGNR is defined outside this chunk: m1 = l1 l2 l3 a[-1] a0 a1 a2 a3
+    psrlq                   m2, m1, 8  ; next-byte neighbours
+    psllq                   m3, m1, 8  ; previous-byte neighbours
+    LOWPASS                  2,  1, 3, 4  ; m2 = 3-tap filter centred on each byte of m1
+
+    ; ABCD <- for the following predictor:
+    ; EFGH
+    ; IABC  | m0 contains ABCDxxxx
+    ; JEFG  | m2 contains xJIEFGHx
+
+%if cpuflag(ssse3)
+    punpckldq               m0, m2  ; m0 = ABCD followed by the low dword of m2
+    pshufb                  m2, [pb_13456_3xm1]  ; masks defined outside this chunk; by name they pick bytes 1,3,4,5,6 ...
+    movd      [dstq+strideq*0], m0  ; row 0 = ABCD
+    pshufb                  m0, [pb_6012_4xm1]  ; ... and bytes 6,0,1,2
+    movd      [dstq+stride3q ], m2  ; row 3
+    psrlq                   m2, 8
+    movd      [dstq+strideq*2], m0  ; row 2
+    movd      [dstq+strideq*1], m2  ; row 1
+%else
+    psllq                   m1, m2, 40  ; keep the three low bytes of m2, moved to the top of m1
+    psrlq                   m2, 24  ; m2 now starts at its former byte 3
+    movd      [dstq+strideq*0], m0  ; row 0 = ABCD
+    movd      [dstq+strideq*1], m2  ; row 1
+    PALIGNR                 m0, m1, 7, m3  ; prepend m1's top byte to m0
+    psllq                   m1, 8
+    PALIGNR                 m2, m1, 7, m3  ; prepend the next byte to m2
+    movd      [dstq+strideq*2], m0  ; row 2
+    movd      [dstq+stride3q ], m2  ; row 3
+%endif
+    RET
+%endmacro
+
+INIT_MMX mmxext
+VR_MMX_FUNCS  ; shift/PALIGNR path
+INIT_MMX ssse3
+VR_MMX_FUNCS  ; pshufb path
+
+%macro VR_XMM_FUNCS 1 ; n_xmm_regs for 16x16
+cglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a  ; 8x8 "vr" predictor: row 0 is 2-tap averages of a, the rest follows the layout comment below
+    movu                    m1, [aq-1]  ; a[-1..14] (unaligned load)
+    movhps                  m2, [lq]  ; high qword of m2 = l[0..7]; the low qword is whatever m2 held before
+    movq                    m0, [aq]  ; a[0..7]
+    DEFINE_ARGS dst, stride, stride3  ; l's register is reused as stride3
+    lea               stride3q, [strideq*3]
+    pavgb                   m0, m1  ; m0[i] = (a[i] + a[i-1] + 1) >> 1 for i = 0..7
+    PALIGNR                 m1, m2, 9, m3  ; PALIGNR is defined outside this chunk: m1 = l[1..7], a[-1], a[0..7]
+    pslldq                  m2, m1, 1  ; m1 delayed one byte
+    pslldq                  m3, m1, 2  ; m1 delayed two bytes
+    LOWPASS                  1,  2, 3, 4  ; m1 = 3-tap filter over m1, m2 (centre) and m3
+
+    ; ABCDEFGH <- for the following predictor:
+    ; IJKLMNOP
+    ; QABCDEFG  | m0 contains ABCDEFGHxxxxxxxx
+    ; RIJKLMNO  | m1 contains xxVUTSRQIJKLMNOP
+    ; SQABCDEF
+    ; TRIJKLMN
+    ; USQABCDE
+    ; VTRIJKLM
+
+%if cpuflag(ssse3)
+    punpcklqdq              m0, m1 ; ABCDEFGHxxVUTSRQ
+%endif
+    movq      [dstq+strideq*0], m0  ; row 0 = ABCDEFGH
+    movhps    [dstq+strideq*1], m1  ; row 1 = IJKLMNOP
+%if cpuflag(ssse3)
+    pshufb                  m0, [pb_6xm1_BDF_0to6]  ; xxxxxxUSQABCDEFG
+    pshufb                  m1, [pb_6xm1_246_8toE]  ; xxxxxxVTRIJKLMNO
+%else
+    psrlw                   m2, m1, 8               ; x_U_S_Q_xxxxxxxx
+    pand                    m3, m1, [pw_255]        ; x_V_T_R_xxxxxxxx
+    packuswb                m3, m2                  ; xVTRxxxxxUSQxxxx
+    pslldq                  m3, 4                   ; xxxxxVTRxxxxxUSQ
+    PALIGNR                 m0, m3, 7, m4           ; xxxxxxUSQABCDEFG
+    psrldq                  m1, 8
+    pslldq                  m3, 8
+    PALIGNR                 m1, m3, 7, m4           ; xxxxxxVTRIJKLMNO
+%endif
+    movhps    [dstq+strideq*2], m0  ; row 2 = QABCDEFG
+    movhps    [dstq+stride3q ], m1  ; row 3 = RIJKLMNO
+    lea                   dstq, [dstq+strideq*4]
+    pslldq                  m0, 1  ; bring the next left-derived byte into the stored high half
+    pslldq                  m1, 1
+    movhps    [dstq+strideq*0], m0  ; row 4 = SQABCDEF
+    movhps    [dstq+strideq*1], m1  ; row 5 = TRIJKLMN
+    pslldq                  m0, 1
+    pslldq                  m1, 1
+    movhps    [dstq+strideq*2], m0  ; row 6 = USQABCDE
+    movhps    [dstq+stride3q ], m1  ; row 7 = VTRIJKLM
+    RET
+
+cglobal vp9_ipred_vr_16x16, 4, 4, %1, dst, stride, l, a  ; %1 = xmm register count supplied by the VR_XMM_FUNCS caller
+    mova                    m0, [aq]  ; 16 bytes at a (aligned load)
+    movu                    m1, [aq-1]  ; 16 bytes starting one byte before a
+    mova                    m2, [lq]  ; 16 bytes at l (aligned load)
+    DEFINE_ARGS dst, stride, stride3, cnt  ; l and a registers are reused as stride3 and cnt
+    lea               stride3q, [strideq*3]  ; stride3 = 3 * stride
+    PALIGNR                 m3, m1, m2, 15, m6  ; m6 is the scratch operand handed to the PALIGNR macro
+    LOWPASS                  3,  1,  0,  4  ; LOWPASS is defined outside this view; m3 is stored as row 1 in the loop
+    pavgb                   m0, m1  ; rounded byte average of a[x] and a[x-1]; stored as row 0 in the loop
+    PALIGNR                 m1, m2,  1, m6
+    pslldq                  m4, m2,  1
+    LOWPASS                  1,  2,  4,  5
+%if cpuflag(ssse3)
+    pshufb                  m1, [pb_02468ACE_13579BDF]  ; shuffle mask is defined outside this view
+%else  ; no pshufb: split even and odd bytes of m1 into separate halves
+    psrlw                   m5, m1, 8  ; odd-indexed bytes of each word
+    pand                    m1, [pw_255]  ; even-indexed bytes of each word
+    packuswb                m1, m5  ; low half = even bytes, high half = odd bytes
+%endif
+    mov                   cntd, 4  ; 4 iterations, 4 rows each
+
+.loop:
+    movlhps                 m2, m1  ; copy the low half of m1 into the high half of m2
+    mova      [dstq+strideq*0], m0  ; row 4*i + 0
+    mova      [dstq+strideq*1], m3  ; row 4*i + 1
+    PALIGNR                 m4, m0, m1, 15, m6
+    PALIGNR                 m5, m3, m2, 15, m6
+    mova      [dstq+strideq*2], m4  ; row 4*i + 2
+    mova      [dstq+stride3q ], m5  ; row 4*i + 3
+    lea                   dstq, [dstq+strideq*4]  ; advance dst by four rows
+    PALIGNR                 m0, m1, 14, m6  ; prepare the row registers for the next iteration
+    PALIGNR                 m3, m2, 14, m6
+    pslldq                  m1, 2  ; drop the two bytes just consumed
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_vr_32x32, 4, 4, 9, dst, stride, l, a  ; writes a 32x32 byte block; the ninth register (m8) is only touched under ARCH_X86_64
+    mova                    m0, [aq]  ; a[0..15]
+    mova                    m2, [aq+16]  ; a[16..31]
+    movu                    m1, [aq-1]  ; a[-1..14]
+    PALIGNR                 m3, m2, m0, 15, m6  ; m6 is the scratch operand handed to the PALIGNR macro
+    PALIGNR                 m4, m2, m0, 14, m6
+    LOWPASS                  4,  3,  2,  5  ; LOWPASS is defined outside this view
+    pavgb                   m3, m2  ; rounded byte average
+    mova                    m2, [lq+16]  ; l[16..31]
+    PALIGNR                 m5, m1, m2, 15, m6
+    LOWPASS                  5,  1,  0,  6
+    pavgb                   m0, m1  ; rounded byte average of a[x] and a[x-1]
+    mova                    m6, [lq]  ; l[0..15]
+%if ARCH_X86_64
+    SWAP                     0, 8  ; park the current m0 value under the name m8 so m0 can serve as scratch
+%else
+    mova                [dstq], m0  ; no m8 here: spill m0 into the destination buffer; reloaded before the loop
+%endif
+    PALIGNR                 m1, m2,  1, m0  ; m0 is used as PALIGNR scratch from here until it is restored
+    PALIGNR                 m7, m2, m6, 15, m0
+    LOWPASS                  1,  2,  7,  0
+    PALIGNR                 m2, m6,  1, m0
+    pslldq                  m7, m6,  1
+    LOWPASS                  2,  6,  7,  0
+%if cpuflag(ssse3)
+    pshufb                  m1, [pb_02468ACE_13579BDF]  ; shuffle mask is defined outside this view
+    pshufb                  m2, [pb_02468ACE_13579BDF]
+%else  ; no pshufb: split even and odd bytes of m1 and m2 into separate halves
+    psrlw                   m0, m1, 8  ; odd-indexed bytes
+    psrlw                   m6, m2, 8
+    pand                    m1, [pw_255]  ; even-indexed bytes
+    pand                    m2, [pw_255]
+    packuswb                m1, m0  ; low half = even bytes, high half = odd bytes
+    packuswb                m2, m6
+%endif
+    DEFINE_ARGS dst, stride, dst16, cnt  ; l and a registers are reused as dst16 and cnt
+    lea                 dst16q, [dstq  +strideq*8]  ; dst + 8 rows
+    lea                 dst16q, [dst16q+strideq*8]  ; dst16 = dst + 16 rows
+    SBUTTERFLY             qdq,  2,  1,  6  ; SBUTTERFLY is defined outside this view; m6 is its scratch operand
+%if ARCH_X86_64
+    SWAP                     0, 8  ; bring the parked value back under the name m0
+%else
+    mova                    m0, [dstq]  ; reload the value spilled to the destination buffer above
+%endif
+    mov                   cntd, 8  ; 8 iterations, each writing two rows in the top half and two in the bottom half
+
+.loop:
+    ; even lines (0, 2, 4, ...): m1 | m0, m3
+    ;  odd lines (1, 3, 5, ...): m2 | m5, m4
+%macro %%write 4  ; args: stride suffix (q*0 / q*1), left-edge reg, low 16 bytes of the row, high 16 bytes of the row
+    mova    [dstq+stride%1+ 0], %3  ; bytes 0..15 of the row in the upper 16 rows
+    mova    [dstq+stride%1+16], %4  ; bytes 16..31 of the row in the upper 16 rows
+    movhps  [dst16q+stride%1 ], %2  ; row 16 lower: first 8 bytes come from the high half of the left-edge reg
+    movu  [dst16q+stride%1+ 8], %3  ; followed by the upper row's data shifted right by 8 bytes
+    movq  [dst16q+stride%1+24], %4  ; only the low 8 bytes of %4 fit in the row
+    PALIGNR                 %4, %3, 15, m6  ; shift the row right by one byte for the next use of this register set
+    PALIGNR                 %3, %2, 15, m6  ; pulling the next byte in from the left-edge register
+    pslldq                  %2,  1  ; advance the left-edge register by one byte
+%endmacro
+
+    %%write                q*0, m1, m0, m3  ; even row of this pair
+    %%write                q*1, m2, m5, m4  ; odd row of this pair
+    lea                   dstq, [dstq  +strideq*2]  ; advance both write pointers by two rows
+    lea                 dst16q, [dst16q+strideq*2]
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2  ; select XMM registers with the sse2 instruction set
+VR_XMM_FUNCS 7  ; sse2 build: vr_16x16 declares 7 xmm registers (m6 is passed to PALIGNR as scratch)
+INIT_XMM ssse3  ; select XMM registers with the ssse3 instruction set
+VR_XMM_FUNCS 6  ; ssse3 build: vr_16x16 declares 6 xmm registers
+INIT_XMM avx  ; select XMM registers with the avx instruction set
+VR_XMM_FUNCS 6  ; avx build: vr_16x16 declares 6 xmm registers
+
+; hd
+
+INIT_MMX mmxext  ; select MMX registers with the mmxext instruction set
+cglobal vp9_ipred_hd_4x4, 4, 4, 0, dst, stride, l, a  ; writes a 4x4 byte block to dst from the l and a edge buffers
+    movd                    m0, [lq]  ; 4 bytes of l in the low dword
+    punpckldq               m0, [aq-1]  ; 4 bytes starting at a-1 in the high dword
+    DEFINE_ARGS dst, stride, stride3  ; l and a are no longer referenced; third register becomes stride3
+    lea               stride3q, [strideq*3]  ; stride3 = 3 * stride
+    psrlq                   m1, m0, 8  ; edge shifted down by one byte
+    psrlq                   m2, m1, 8  ; edge shifted down by two bytes
+    LOWPASS                  2,  1, 0,  3  ; LOWPASS is defined outside this view; result is read from m2 below
+    pavgb                   m1, m0  ; rounded byte average of neighbouring edge bytes
+
+    ; DHIJ <- for the following predictor:
+    ; CGDH
+    ; BFCG  | m1 contains ABCDxxxx
+    ; AEBF  | m2 contains EFGHIJxx
+
+    punpcklbw               m1, m2  ; interleave the averaged and filtered bytes
+    punpckhdq               m0, m1, m2
+
+    ; m1 contains AEBFCGDH
+    ; m0 contains CGDHIJxx
+
+    movd      [dstq+stride3q ], m1  ; row 3 = AEBF
+    movd      [dstq+strideq*1], m0  ; row 1 = CGDH
+    psrlq                   m1, 16  ; drop two bytes: BFCG...
+    psrlq                   m0, 16  ; drop two bytes: DHIJ...
+    movd      [dstq+strideq*2], m1  ; row 2 = BFCG
+    movd      [dstq+strideq*0], m0  ; row 0 = DHIJ
+    RET
+
+%macro HD_XMM_FUNCS 0  ; emits the 8x8, 16x16 and 32x32 "hd" predictors for the instruction set selected by INIT_XMM
+cglobal vp9_ipred_hd_8x8, 4, 4, 5, dst, stride, l, a  ; writes an 8x8 byte block to dst from the l and a edge buffers
+    movq                    m0, [lq]  ; 8 bytes of l in the low half
+    movhps                  m0, [aq-1]  ; 8 bytes starting at a-1 in the high half
+    DEFINE_ARGS dst, stride, stride3, dst4  ; l and a registers are reused as stride3 and dst4
+    lea               stride3q, [strideq*3]  ; stride3 = 3 * stride
+    lea                  dst4q, [dstq+strideq*4]  ; dst4 = dst + 4 rows
+    psrldq                  m1, m0, 1  ; edge shifted down by one byte
+    psrldq                  m2, m1, 1  ; edge shifted down by two bytes
+    LOWPASS                  2,  1,  0,  3  ; LOWPASS is defined outside this view; result is read from m2 below
+    pavgb                   m1, m0  ; rounded byte average of neighbouring edge bytes
+
+    ; HPQRSTUV <- for the following predictor
+    ; GOHPQRST
+    ; FNGOHPQR  | m1 contains ABCDEFGHxxxxxxxx
+    ; EMFNGOHP  | m2 contains IJKLMNOPQRSTUVxx
+    ; DLEMFNGO
+    ; CKDLEMFN
+    ; BJCKDLEM
+    ; AIBJCKDL
+
+    punpcklbw               m1, m2  ; interleave the averaged and filtered bytes
+    movhlps                 m2, m2  ; move the high half of m2 into its low half
+
+    ; m1 contains AIBJCKDLEMFNGOHP
+    ; m2 contains QRSTUVxxxxxxxxxx
+
+    movhps   [dstq +stride3q ], m1  ; row 3 = high half of m1
+    movq     [dst4q+stride3q ], m1  ; row 7 = low half of m1
+    PALIGNR                 m3, m2, m1, 2, m4  ; m2:m1 shifted down by 2 bytes; m4 is the PALIGNR scratch operand
+    movhps   [dstq +strideq*2], m3  ; row 2
+    movq     [dst4q+strideq*2], m3  ; row 6
+    PALIGNR                 m3, m2, m1, 4, m4  ; m2:m1 shifted down by 4 bytes
+    movhps   [dstq +strideq*1], m3  ; row 1
+    movq     [dst4q+strideq*1], m3  ; row 5
+    PALIGNR                 m2, m1, 6, m4  ; m2:m1 shifted down by 6 bytes
+    movhps   [dstq +strideq*0], m2  ; row 0
+    movq     [dst4q+strideq*0], m2  ; row 4
+    RET
+
+cglobal vp9_ipred_hd_16x16, 4, 6, 7, dst, stride, l, a  ; writes a 16x16 byte block to dst from the l and a edge buffers
+    mova                    m0, [lq]  ; 16 bytes at l (aligned load)
+    movu                    m3, [aq-1]  ; 16 bytes starting one byte before a
+    DEFINE_ARGS dst, stride, stride4, dst4, dst8, dst12  ; l and a registers are reused; two more GPRs hold row pointers
+    lea               stride4q, [strideq*4]  ; stride4 = 4 * stride; doubles as the loop counter below
+    lea                  dst4q, [dstq +stride4q]  ; dst + 4 rows
+    lea                  dst8q, [dst4q+stride4q]  ; dst + 8 rows
+    lea                 dst12q, [dst8q+stride4q]  ; dst + 12 rows
+    psrldq                  m4, m3,  1
+    psrldq                  m5, m3,  2
+    LOWPASS                  5,  4,  3,  6  ; LOWPASS is defined outside this view; result is read from m5 in the loop
+    PALIGNR                 m1, m3, m0,  1, m6  ; m6 is the scratch operand handed to the PALIGNR macro
+    PALIGNR                 m2, m3, m0,  2, m6
+    LOWPASS                  2,  1,  0,  6
+    pavgb                   m1, m0  ; rounded byte average of neighbouring edge bytes
+    SBUTTERFLY              bw,  1,  2,  6  ; SBUTTERFLY is defined outside this view; m6 is its scratch operand
+
+    ; I PROBABLY INVERTED L0 and L16 here
+    ; m1, m2, m5
+.loop:
+    sub               stride4q, strideq  ; row offset steps 3s, 2s, 1s, 0; the flags set here drive the jg at the bottom
+    movhps [dstq +stride4q +0], m2  ; rows 0..3 group: bytes 0..7
+    movq   [dstq +stride4q +8], m5  ; rows 0..3 group: bytes 8..15
+    mova   [dst4q+stride4q   ], m2  ; rows 4..7 group
+    movhps [dst8q+stride4q +0], m1  ; rows 8..11 group: bytes 0..7
+    movq   [dst8q+stride4q +8], m2  ; rows 8..11 group: bytes 8..15
+    mova  [dst12q+stride4q   ], m1  ; rows 12..15 group
+%if cpuflag(avx)  ; three-operand palignr: shift the m5:m2:m1 chain down by two bytes in place
+    palignr                 m1, m2, m1, 2
+    palignr                 m2, m5, m2, 2
+%elif cpuflag(ssse3)  ; two-operand palignr: same shift, built in temporaries and copied back
+    palignr                 m3, m2, m1, 2
+    palignr                 m0, m5, m2, 2
+    mova                    m1, m3
+    mova                    m2, m0
+%else
+    ; slightly modified version of PALIGNR
+    mova                    m6, m2
+    mova                    m4, m5
+    pslldq                  m6, 14  ; low two bytes of m2 moved to the top
+    pslldq                  m4, 14  ; low two bytes of m5 moved to the top
+    psrldq                  m1, 2
+    psrldq                  m2, 2
+    por                     m1, m6  ; m1 = m2:m1 shifted down by two bytes
+    por                     m2, m4  ; m2 = m5:m2 shifted down by two bytes
+%endif
+    psrldq                  m5, 2  ; advance the top of the chain by two bytes
+    jg .loop  ; uses the flags from the sub above; only SIMD instructions sit in between, so they are intact
+    RET
+
+cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a  ; writes a 32x32 byte block to dst from the l and a edge buffers
+    mova                    m0, [lq]  ; l[0..15]
+    mova                    m1, [lq+16]  ; l[16..31]
+    movu                    m2, [aq-1]  ; a[-1..14]
+    movu                    m3, [aq+15]  ; a[15..30]
+    DEFINE_ARGS dst, stride, stride8, dst8, dst16, dst24  ; l and a registers are reused; two more GPRs hold row pointers
+    lea               stride8q, [strideq*8]  ; stride8 = 8 * stride; doubles as the loop counter below
+    lea                  dst8q, [dstq  +stride8q]  ; dst + 8 rows
+    lea                 dst16q, [dst8q +stride8q]  ; dst + 16 rows
+    lea                 dst24q, [dst16q+stride8q]  ; dst + 24 rows
+    psrldq                  m4, m3,  1
+    psrldq                  m5, m3,  2
+    LOWPASS                  5,  4,  3,  6  ; LOWPASS is defined outside this view
+    PALIGNR                 m4, m3, m2,  2, m6  ; m6 is the scratch operand handed to the PALIGNR macro
+    PALIGNR                 m3, m2,  1, m6
+    LOWPASS                  4,  3,  2,  6
+    PALIGNR                 m3, m2, m1,  2, m6
+    PALIGNR                 m2, m1,  1, m6
+    LOWPASS                  3,  2,  1,  6
+    pavgb                   m2, m1  ; rounded byte average
+    PALIGNR                 m6, m1, m0,  1, m7  ; m7 is the scratch operand here because m6 is the destination
+    PALIGNR                 m1, m0,  2, m7
+    LOWPASS                  1,  6,  0,  7
+    pavgb                   m0, m6  ; rounded byte average
+    SBUTTERFLY              bw,  2,  3,  6  ; SBUTTERFLY is defined outside this view; m6 is its scratch operand
+    SBUTTERFLY              bw,  0,  1,  6
+
+    ; m0, m1, m2, m3, m4, m5
+.loop:
+    sub               stride8q, strideq  ; row offset steps 7s .. 0; the flags set here drive the jg at the bottom
+    mova  [dstq  +stride8q+ 0], m3  ; rows 0..7 group, bytes 0..15
+    mova  [dstq  +stride8q+16], m4  ; rows 0..7 group, bytes 16..31
+    mova  [dst8q +stride8q+ 0], m2  ; rows 8..15 group
+    mova  [dst8q +stride8q+16], m3
+    mova  [dst16q+stride8q+ 0], m1  ; rows 16..23 group
+    mova  [dst16q+stride8q+16], m2
+    mova  [dst24q+stride8q+ 0], m0  ; rows 24..31 group
+    mova  [dst24q+stride8q+16], m1
+%if cpuflag(avx)  ; three-operand palignr: shift the m5:m4:m3:m2:m1:m0 chain down by two bytes in place
+    palignr                 m0, m1, m0, 2
+    palignr                 m1, m2, m1, 2
+    palignr                 m2, m3, m2, 2
+    palignr                 m3, m4, m3, 2
+    palignr                 m4, m5, m4, 2
+    psrldq                  m5, 2
+%elif cpuflag(ssse3)  ; two-operand palignr: each result lands one register too high, then everything is moved down
+    psrldq                  m6, m5, 2
+    palignr                 m5, m4, 2
+    palignr                 m4, m3, 2
+    palignr                 m3, m2, 2
+    palignr                 m2, m1, 2
+    palignr                 m1, m0, 2
+    mova                    m0, m1
+    mova                    m1, m2
+    mova                    m2, m3
+    mova                    m3, m4
+    mova                    m4, m5
+    mova                    m5, m6
+%else
+    ; sort of a half-integrated version of PALIGNR
+    pslldq                  m7, m4, 14  ; m6 and m7 alternate as carriers of the two bytes moving down the chain
+    pslldq                  m6, m5, 14
+    psrldq                  m4, 2
+    psrldq                  m5, 2
+    por                     m4, m6  ; m4 = m5:m4 shifted down by two bytes
+    pslldq                  m6, m3, 14
+    psrldq                  m3, 2
+    por                     m3, m7  ; m3 = m4:m3 shifted down by two bytes
+    pslldq                  m7, m2, 14
+    psrldq                  m2, 2
+    por                     m2, m6  ; m2 = m3:m2 shifted down by two bytes
+    pslldq                  m6, m1, 14
+    psrldq                  m1, 2
+    por                     m1, m7  ; m1 = m2:m1 shifted down by two bytes
+    psrldq                  m0, 2
+    por                     m0, m6  ; m0 = m1:m0 shifted down by two bytes
+%endif
+    jg .loop  ; uses the flags from the sub above; only SIMD instructions sit in between, so they are intact
+    RET
+%endmacro
+
+INIT_XMM sse2  ; select XMM registers with the sse2 instruction set
+HD_XMM_FUNCS  ; sse2 versions of hd_8x8 / hd_16x16 / hd_32x32 (loops take the final %else path)
+INIT_XMM ssse3  ; select XMM registers with the ssse3 instruction set
+HD_XMM_FUNCS  ; ssse3 versions (loops take the two-operand palignr path)
+INIT_XMM avx  ; select XMM registers with the avx instruction set
+HD_XMM_FUNCS  ; avx versions (loops take the three-operand palignr path)
+
+%macro HU_MMX_FUNCS 0  ; emits vp9_ipred_hu_4x4 for the instruction set selected by the preceding INIT_MMX
+cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l  ; writes a 4x4 byte block to dst using only the l edge buffer
+    movd                    m0, [lq]  ; 4 bytes of l
+%if cpuflag(ssse3)
+    pshufb                  m0, [pb_0to2_5x3]  ; shuffle mask is defined outside this view; presumably yields the same 01233333 layout as the %else path
+%else
+    punpcklbw               m1, m0, m0          ; 00112233
+    pshufw                  m1, m1, q3333       ; 33333333
+    punpckldq               m0, m1              ; 01233333
+%endif
+    psrlq                   m1, m0, 8  ; edge shifted down by one byte
+    psrlq                   m2, m1, 8  ; edge shifted down by two bytes
+    LOWPASS                  2,  1, 0, 3  ; LOWPASS is defined outside this view; result is read from m2 below
+    pavgb                   m1, m0  ; rounded byte average of neighbouring edge bytes
+    DEFINE_ARGS dst, stride, stride3  ; l is no longer referenced; its register becomes stride3
+    lea               stride3q, [strideq*3]  ; stride3 = 3 * stride
+    SBUTTERFLY              bw,  1, 2, 0  ; SBUTTERFLY is defined outside this view; m0 is its scratch operand
+    PALIGNR                 m2, m1, 2, m0  ; m2:m1 shifted down by two bytes; m0 is the PALIGNR scratch operand
+    movd      [dstq+strideq*0], m1  ; row 0
+    movd      [dstq+strideq*1], m2  ; row 1
+    punpckhdq               m1, m1  ; bring the high dword down
+    punpckhdq               m2, m2
+    movd      [dstq+strideq*2], m1  ; row 2
+    movd      [dstq+stride3q ], m2  ; row 3
+    RET
+%endmacro
+
+INIT_MMX mmxext  ; select MMX registers with the mmxext instruction set
+HU_MMX_FUNCS  ; emits the mmxext version of vp9_ipred_hu_4x4 (punpck/pshufw edge replication)
+INIT_MMX ssse3  ; select MMX registers with the ssse3 instruction set
+HU_MMX_FUNCS  ; emits the ssse3 version of vp9_ipred_hu_4x4 (pshufb edge replication)
+
+%macro HU_XMM_FUNCS 1 ; n_xmm_regs in hu_32x32
+cglobal vp9_ipred_hu_8x8, 3, 4, 4, dst, stride, l  ; writes an 8x8 byte block to dst using only the l edge buffer
+    movq                    m0, [lq]  ; 8 bytes of l
+%if cpuflag(ssse3)
+    pshufb                  m0, [pb_0to6_9x7]  ; shuffle mask is defined outside this view; presumably yields the same layout as the %else path
+%else
+    punpcklbw               m1, m0, m0          ; 0011223344556677
+    punpckhwd               m1, m1              ; 4444555566667777
+    shufps                  m0, m1, q3310       ; 0123456777777777
+%endif
+    psrldq                  m1, m0, 1  ; edge shifted down by one byte
+    psrldq                  m2, m1, 1  ; edge shifted down by two bytes
+    LOWPASS                  2,  1, 0, 3  ; LOWPASS is defined outside this view; result is read from m2 below
+    pavgb                   m1, m0  ; rounded byte average of neighbouring edge bytes
+    DEFINE_ARGS dst, stride, stride3, dst4  ; l register becomes stride3; the fourth GPR holds dst4
+    lea               stride3q, [strideq*3]  ; stride3 = 3 * stride
+    lea                  dst4q, [dstq+strideq*4]  ; dst4 = dst + 4 rows
+    SBUTTERFLY              bw,  1, 2, 0  ; SBUTTERFLY is defined outside this view; m0 is its scratch operand
+    movq     [dstq +strideq*0], m1  ; row 0 = low half of m1
+    movhps   [dst4q+strideq*0], m1  ; row 4 = high half of m1
+    PALIGNR                 m0, m2, m1, 2, m3  ; m2:m1 shifted down by 2 bytes; m3 is the PALIGNR scratch operand
+    movq     [dstq +strideq*1], m0  ; row 1
+    movhps   [dst4q+strideq*1], m0  ; row 5
+    PALIGNR                 m0, m2, m1, 4, m3  ; m2:m1 shifted down by 4 bytes
+    movq     [dstq +strideq*2], m0  ; row 2
+    movhps   [dst4q+strideq*2], m0  ; row 6
+    PALIGNR                 m2, m1, 6, m3  ; m2:m1 shifted down by 6 bytes
+    movq     [dstq +stride3q ], m2  ; row 3
+    movhps   [dst4q+stride3q ], m2  ; row 7
+    RET
+
+cglobal vp9_ipred_hu_16x16, 3, 4, 5, dst, stride, l  ; writes a 16x16 byte block to dst using only the l edge buffer
+    mova                    m0, [lq]  ; 16 bytes at l (aligned load)
+%if cpuflag(ssse3)
+    mova                    m3, [pb_2toE_3xF]  ; shuffle mask defined outside this view; kept in m3 for reuse in the loop
+    pshufb                  m1, m0, [pb_1toE_2xF]  ; shuffle mask defined outside this view
+    pshufb                  m2, m0, m3
+%else  ; no pshufb: build the two shifted copies with shifts and ORs
+    pand                    m3, m0, [pb_15x0_1xm1]  ; mask constant is defined outside this view
+    psrldq                  m1, m0, 1  ; edge shifted down by one byte
+    por                     m1, m3
+    punpckhbw               m3, m3  ; m3 is reused as the OR-in value inside the loop
+    psrldq                  m2, m0, 2  ; edge shifted down by two bytes
+    por                     m2, m3
+%endif
+    LOWPASS                  2,  1,  0,  4  ; LOWPASS is defined outside this view; result is read from m2 below
+    pavgb                   m1, m0  ; rounded byte average of neighbouring edge bytes
+    DEFINE_ARGS dst, stride, stride9, cnt  ; l register becomes stride9; the fourth GPR is the loop counter
+    lea                stride9q, [strideq*8+strideq]  ; stride9 = 9 * stride
+    mov                   cntd,  4  ; 4 iterations, each writing rows 2i, 2i+1, 2i+8 and 2i+9
+    SBUTTERFLY              bw,  1,  2,  0  ; SBUTTERFLY is defined outside this view; m0 is its scratch operand
+
+.loop:
+    mova      [dstq+strideq*0], m1  ; row 2i
+    mova      [dstq+strideq*8], m2  ; row 2i + 8
+    PALIGNR                 m0, m2, m1, 2, m4  ; m2:m1 shifted down by two bytes; m4 is the PALIGNR scratch operand
+%if cpuflag(ssse3)
+    pshufb                  m2, m3  ; advance m2 using the mask loaded before the loop
+%else
+    psrldq                  m2, 2  ; advance m2 by two bytes
+    por                     m2, m3  ; OR in the value prepared in m3 before the loop
+%endif
+    mova      [dstq+strideq*1], m0  ; row 2i + 1
+    mova      [dstq+stride9q ], m2  ; row 2i + 9
+    PALIGNR                 m1, m2, m0, 2, m4  ; next iteration's first row
+%if cpuflag(ssse3)
+    pshufb                  m2, m3
+%else
+    psrldq                  m2, 2
+    por                     m2, m3
+%endif
+    lea                   dstq, [dstq+strideq*2]  ; advance dst by two rows
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_hu_32x32, 3, 7, %1, dst, stride, l  ; %1 = xmm register count supplied by the HU_XMM_FUNCS caller
+    mova                    m1, [lq]  ; l[0..15]
+    mova                    m0, [lq+16]  ; l[16..31]
+    PALIGNR                 m2, m0, m1,  1, m5  ; m5 is the scratch operand handed to the PALIGNR macro
+    PALIGNR                 m3, m0, m1,  2, m5
+    LOWPASS                  3,  2,  1,  5  ; LOWPASS is defined outside this view
+    pavgb                   m2, m1  ; rounded byte average
+%if cpuflag(ssse3)
+    mova                    m4, [pb_2toE_3xF]  ; shuffle mask defined outside this view; kept in m4 for reuse in the loop
+    pshufb                  m5, m0, [pb_1toE_2xF]  ; shuffle mask defined outside this view
+    pshufb                  m1, m0, m4
+%else  ; no pshufb: build the two shifted copies with shifts and ORs
+    pand                    m4, m0, [pb_15x0_1xm1]  ; mask constant is defined outside this view
+    psrldq                  m5, m0, 1
+    por                     m5, m4
+    punpckhbw               m4, m4  ; m4 is reused as the OR-in value inside the loop
+    psrldq                  m1, m0, 2
+    por                     m1, m4
+%endif
+    LOWPASS                  1,  5,  0,  6
+    pavgb                   m0, m5  ; rounded byte average
+    DEFINE_ARGS dst, stride, cnt, stride0, dst8, dst16, dst24  ; l register becomes cnt; four more GPRs hold offset and row pointers
+    mov                   cntd,  8  ; 8 iterations, each writing one row in each of the four 8-row groups
+    xor               stride0q, stride0q  ; running row offset, starts at 0
+    lea                  dst8q, [dstq  +strideq*8]  ; dst + 8 rows
+    lea                 dst16q, [dst8q +strideq*8]  ; dst + 16 rows
+    lea                 dst24q, [dst16q+strideq*8]  ; dst + 24 rows
+    SBUTTERFLY              bw,  0,  1,  5  ; SBUTTERFLY is defined outside this view; m5 is its scratch operand
+    SBUTTERFLY              bw,  2,  3,  5
+%if cpuflag(ssse3)
+    pshufb                  m6, m1, [pb_15]  ; pb_15 is defined outside this view; m6 stays constant through the loop
+%else
+    pshufhw                 m6, m4, q3333  ; replicate the top word of m4 across the high half
+    punpckhqdq              m6, m6  ; then across the whole register; m6 stays constant through the loop
+%endif
+
+.loop:
+    mova  [dstq  +stride0q+ 0], m2  ; rows 0..7 group, bytes 0..15
+    mova  [dstq  +stride0q+16], m3  ; rows 0..7 group, bytes 16..31
+    mova  [dst8q +stride0q+ 0], m3  ; rows 8..15 group
+    mova  [dst8q +stride0q+16], m0
+    mova  [dst16q+stride0q+ 0], m0  ; rows 16..23 group
+    mova  [dst16q+stride0q+16], m1
+    mova  [dst24q+stride0q+ 0], m1  ; rows 24..31 group
+    mova  [dst24q+stride0q+16], m6  ; right half of the bottom group is always the constant m6
+%if cpuflag(avx)  ; three-operand palignr: shift the m1:m0:m3:m2 chain down by two bytes in place
+    palignr                 m2, m3, m2, 2
+    palignr                 m3, m0, m3, 2
+    palignr                 m0, m1, m0, 2
+    pshufb                  m1, m4  ; advance the top of the chain using the mask kept in m4
+%elif cpuflag(ssse3)  ; two-operand palignr: each result lands one register too high, then everything is moved down
+    pshufb                  m5, m1, m4
+    palignr                 m1, m0, 2
+    palignr                 m0, m3, 2
+    palignr                 m3, m2, 2
+    mova                    m2, m3
+    mova                    m3, m0
+    mova                    m0, m1
+    mova                    m1, m5
+%else
+    ; half-integrated version of PALIGNR
+    pslldq                  m5, m1, 14  ; m5 and m7 carry the two bytes moving down the chain
+    pslldq                  m7, m0, 14
+    psrldq                  m1, 2
+    psrldq                  m0, 2
+    por                     m1, m4  ; top of the chain is refilled from the value prepared in m4
+    por                     m0, m5  ; m0 = m1:m0 shifted down by two bytes
+    pslldq                  m5, m3, 14
+    psrldq                  m3, 2
+    por                     m3, m7  ; m3 = m0:m3 shifted down by two bytes
+    psrldq                  m2, 2
+    por                     m2, m5  ; m2 = m3:m2 shifted down by two bytes
+%endif
+    add               stride0q, strideq  ; move the row offset down by one row
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2  ; select XMM registers with the sse2 instruction set
+HU_XMM_FUNCS 8  ; sse2 build: hu_32x32 declares 8 xmm registers (its loop uses m7)
+INIT_XMM ssse3  ; select XMM registers with the ssse3 instruction set
+HU_XMM_FUNCS 7  ; ssse3 build: hu_32x32 declares 7 xmm registers
+INIT_XMM avx  ; select XMM registers with the avx instruction set
+HU_XMM_FUNCS 7  ; avx build: hu_32x32 declares 7 xmm registers
+
+; FIXME 127, 128, 129 ?
diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm
new file mode 100644
index 0000000..c0ac16d
--- /dev/null
+++ b/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -0,0 +1,2135 @@
+;******************************************************************************
+;* VP9 Intra prediction SIMD optimizations
+;*
+;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
+;* Copyright (c) 2015 Henrik Gramner <henrik gramner com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32  ; constants for the 16bpp predictors follow
+
+pd_2: times 8 dd 2  ; eight dwords, each 2 (32 bytes)
+pd_4: times 8 dd 4  ; eight dwords, each 4 (32 bytes)
+pd_8: times 8 dd 8  ; eight dwords, each 8 (32 bytes)
+
+pb_2to15_14_15: db 2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15  ; byte indices 2..15, then 14,15 again
+pb_4_5_8to13_8x0: db 4,5,8,9,10,11,12,13,0,0,0,0,0,0,0,0  ; byte indices 4,5,8..13, then eight zeros
+pb_0to7_67x4: db 0,1,2,3,4,5,6,7,6,7,6,7,6,7,6,7  ; byte indices 0..7, then the pair 6,7 four times
+
+cextern pw_1  ; constants below are defined outside this file
+cextern pw_1023
+cextern pw_4095
+cextern pd_16
+cextern pd_32
+cextern pd_65535;
+
+; FIXME most top-only functions (ddl, vl, v, dc_top) can be modified to take
+; only 3 registers on x86-32, which would make it one cycle faster, but that
+; would make the code quite a bit uglier...
+
+SECTION .text
+
+%macro SCRATCH 3-4  ; args: register number, spare register number, memory slot, optional alias suffix
+%if ARCH_X86_64  ; x86-64: keep the value in a register by swapping register names
+    SWAP                %1, %2
+%if %0 == 4  ; alias requested
+%define reg_%4 m%2
+%endif
+%else  ; otherwise: store the value to the memory slot
+    mova              [%3], m%1
+%if %0 == 4  ; alias requested
+%define reg_%4 [%3]
+%endif
+%endif
+%endmacro
+
+%macro UNSCRATCH 3-4  ; counterpart of SCRATCH; takes the same argument list
+%if ARCH_X86_64  ; x86-64: swap the register names back
+    SWAP                %1, %2
+%else  ; otherwise: reload the value from the memory slot
+    mova               m%1, [%3]
+%endif
+%if %0 == 4  ; alias was given: remove it
+%undef reg_%4
+%endif
+%endmacro
+
+%macro PRELOAD 2-3  ; args: register number, memory operand, optional alias suffix
+%if ARCH_X86_64  ; x86-64: load the operand into a register once
+    mova               m%1, [%2]
+%if %0 == 3  ; alias requested
+%define reg_%3 m%1
+%endif
+%elif %0 == 3  ; otherwise emit no load; the alias refers to memory directly
+%define reg_%3 [%2]
+%endif
+%endmacro
+
+INIT_MMX mmx  ; select MMX registers with the base mmx instruction set
+cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a  ; copies one 8-byte row from a into 4 rows of dst; l is not read
+    movifnidn               aq, amp  ; presumably fetches a where cglobal did not auto-load it (x86inc convention)
+    mova                    m0, [aq]  ; 8 bytes at a
+    DEFINE_ARGS dst, stride, stride3  ; third register becomes stride3
+    lea               stride3q, [strideq*3]  ; stride3 = 3 * stride
+    mova      [dstq+strideq*0], m0  ; row 0
+    mova      [dstq+strideq*1], m0  ; row 1
+    mova      [dstq+strideq*2], m0  ; row 2
+    mova      [dstq+stride3q ], m0  ; row 3
+    RET
+
+INIT_XMM sse  ; select XMM registers with the sse instruction set
+cglobal vp9_ipred_v_8x8_16, 2, 4, 1, dst, stride, l, a  ; copies one 16-byte row from a into 8 rows of dst; l is not read
+    movifnidn               aq, amp  ; presumably fetches a where cglobal did not auto-load it (x86inc convention)
+    mova                    m0, [aq]  ; 16 bytes at a
+    DEFINE_ARGS dst, stride, stride3  ; third register becomes stride3
+    lea               stride3q, [strideq*3]  ; stride3 = 3 * stride
+    mova      [dstq+strideq*0], m0  ; row 0
+    mova      [dstq+strideq*1], m0  ; row 1
+    mova      [dstq+strideq*2], m0  ; row 2
+    mova      [dstq+stride3q ], m0  ; row 3
+    lea                   dstq, [dstq+strideq*4]  ; advance dst by four rows
+    mova      [dstq+strideq*0], m0  ; row 4
+    mova      [dstq+strideq*1], m0  ; row 5
+    mova      [dstq+strideq*2], m0  ; row 6
+    mova      [dstq+stride3q ], m0  ; row 7
+    RET
+
+INIT_XMM sse  ; select XMM registers with the sse instruction set
+cglobal vp9_ipred_v_16x16_16, 2, 4, 2, dst, stride, l, a  ; copies one 32-byte row from a into 16 rows of dst; l is not read
+    movifnidn               aq, amp  ; presumably fetches a where cglobal did not auto-load it (x86inc convention)
+    mova                    m0, [aq]  ; first 16 bytes of the row
+    mova                    m1, [aq+mmsize]  ; second 16 bytes of the row
+    DEFINE_ARGS dst, stride, stride3, cnt  ; l and a registers are reused as stride3 and cnt
+    lea               stride3q, [strideq*3]  ; stride3 = 3 * stride
+    mov                   cntd, 4  ; 4 iterations, 4 rows each
+.loop:
+    mova   [dstq+strideq*0+ 0], m0  ; row 4i
+    mova   [dstq+strideq*0+16], m1
+    mova   [dstq+strideq*1+ 0], m0  ; row 4i + 1
+    mova   [dstq+strideq*1+16], m1
+    mova   [dstq+strideq*2+ 0], m0  ; row 4i + 2
+    mova   [dstq+strideq*2+16], m1
+    mova   [dstq+stride3q + 0], m0  ; row 4i + 3
+    mova   [dstq+stride3q +16], m1
+    lea                   dstq, [dstq+strideq*4]  ; advance dst by four rows
+    dec               cntd
+    jg .loop
+    RET
+
+INIT_XMM sse  ; select XMM registers with the sse instruction set
+cglobal vp9_ipred_v_32x32_16, 2, 4, 4, dst, stride, l, a  ; copies one 64-byte row from a into 32 rows of dst; l is not read
+    movifnidn               aq, amp  ; presumably fetches a where cglobal did not auto-load it (x86inc convention)
+    mova                    m0, [aq+mmsize*0]  ; the 64-byte row is held in four registers
+    mova                    m1, [aq+mmsize*1]
+    mova                    m2, [aq+mmsize*2]
+    mova                    m3, [aq+mmsize*3]
+    DEFINE_ARGS dst, stride, cnt  ; l register is reused as the loop counter
+    mov                   cntd, 16  ; 16 iterations, 2 rows each
+.loop:
+    mova   [dstq+strideq*0+ 0], m0  ; row 2i
+    mova   [dstq+strideq*0+16], m1
+    mova   [dstq+strideq*0+32], m2
+    mova   [dstq+strideq*0+48], m3
+    mova   [dstq+strideq*1+ 0], m0  ; row 2i + 1
+    mova   [dstq+strideq*1+16], m1
+    mova   [dstq+strideq*1+32], m2
+    mova   [dstq+strideq*1+48], m3
+    lea                   dstq, [dstq+strideq*2]  ; advance dst by two rows
+    dec               cntd
+    jg .loop
+    RET
+
+INIT_MMX mmxext  ; select MMX registers with the mmxext instruction set
+cglobal vp9_ipred_h_4x4_16, 3, 3, 4, dst, stride, l, a  ; fills each of 4 rows with one 16-bit value from l; a is not read
+    mova                    m3, [lq]  ; four words of l
+    DEFINE_ARGS dst, stride, stride3  ; l register becomes stride3
+    lea               stride3q, [strideq*3]  ; stride3 = 3 * stride
+    pshufw                  m0, m3, q3333  ; broadcast word 3 of l
+    pshufw                  m1, m3, q2222  ; broadcast word 2 of l
+    pshufw                  m2, m3, q1111  ; broadcast word 1 of l
+    pshufw                  m3, m3, q0000  ; broadcast word 0 of l
+    mova      [dstq+strideq*0], m0  ; row 0 uses the last word of l
+    mova      [dstq+strideq*1], m1  ; row 1
+    mova      [dstq+strideq*2], m2  ; row 2
+    mova      [dstq+stride3q ], m3  ; row 3 uses the first word of l
+    RET
+
+INIT_XMM sse2  ; select XMM registers with the sse2 instruction set
+cglobal vp9_ipred_h_8x8_16, 3, 3, 4, dst, stride, l, a  ; fills each of 8 rows with one 16-bit value from l; a is not read
+    mova                    m2, [lq]  ; eight words of l
+    DEFINE_ARGS dst, stride, stride3  ; l register becomes stride3
+    lea               stride3q, [strideq*3]  ; stride3 = 3 * stride
+    punpckhwd               m3, m2, m2  ; duplicate words 4..7 so each fills one dword
+    pshufd                  m0, m3, q3333  ; broadcast word 7
+    pshufd                  m1, m3, q2222  ; broadcast word 6
+    mova      [dstq+strideq*0], m0  ; row 0 uses the last word of l
+    mova      [dstq+strideq*1], m1  ; row 1
+    pshufd                  m0, m3, q1111  ; broadcast word 5
+    pshufd                  m1, m3, q0000  ; broadcast word 4
+    mova      [dstq+strideq*2], m0  ; row 2
+    mova      [dstq+stride3q ], m1  ; row 3
+    lea                   dstq, [dstq+strideq*4]  ; advance dst by four rows
+    punpcklwd               m2, m2  ; duplicate words 0..3 so each fills one dword
+    pshufd                  m0, m2, q3333  ; broadcast word 3
+    pshufd                  m1, m2, q2222  ; broadcast word 2
+    mova      [dstq+strideq*0], m0  ; row 4
+    mova      [dstq+strideq*1], m1  ; row 5
+    pshufd                  m0, m2, q1111  ; broadcast word 1
+    pshufd                  m1, m2, q0000  ; broadcast word 0
+    mova      [dstq+strideq*2], m0  ; row 6
+    mova      [dstq+stride3q ], m1  ; row 7 uses the first word of l
+    RET
+
+INIT_XMM sse2  ; select XMM registers with the sse2 instruction set
+cglobal vp9_ipred_h_16x16_16, 3, 5, 4, dst, stride, l, stride3, cnt  ; fills each of 16 rows with one 16-bit value from l
+    mov                   cntd, 3  ; cnt runs 3, 2, 1, 0: four iterations of four rows
+    lea               stride3q, [strideq*3]  ; stride3 = 3 * stride
+.loop:
+    movh                    m3, [lq+cntq*8]  ; four words of l, taken from the end of l first
+    punpcklwd               m3, m3  ; duplicate each word so it fills one dword
+    pshufd                  m0, m3, q3333  ; broadcast the highest of the four words
+    pshufd                  m1, m3, q2222
+    pshufd                  m2, m3, q1111
+    pshufd                  m3, m3, q0000  ; broadcast the lowest of the four words
+    mova    [dstq+strideq*0+ 0], m0  ; first row of this group
+    mova    [dstq+strideq*0+16], m0
+    mova    [dstq+strideq*1+ 0], m1  ; second row
+    mova    [dstq+strideq*1+16], m1
+    mova    [dstq+strideq*2+ 0], m2  ; third row
+    mova    [dstq+strideq*2+16], m2
+    mova    [dstq+stride3q + 0], m3  ; fourth row
+    mova    [dstq+stride3q +16], m3
+    lea                   dstq, [dstq+strideq*4]  ; advance dst by four rows
+    dec                   cntd
+    jge .loop  ; jge rather than jg so the cnt == 0 iteration also runs
+    RET
+
+INIT_XMM sse2  ; select XMM registers with the sse2 instruction set
+cglobal vp9_ipred_h_32x32_16, 3, 5, 4, dst, stride, l, stride3, cnt  ; fills each of 32 rows with one 16-bit value from l
+    mov                   cntd, 7  ; cnt runs 7 .. 0: eight iterations of four rows
+    lea               stride3q, [strideq*3]  ; stride3 = 3 * stride
+.loop:
+    movh                    m3, [lq+cntq*8]  ; four words of l, taken from the end of l first
+    punpcklwd               m3, m3  ; duplicate each word so it fills one dword
+    pshufd                  m0, m3, q3333  ; broadcast the highest of the four words
+    pshufd                  m1, m3, q2222
+    pshufd                  m2, m3, q1111
+    pshufd                  m3, m3, q0000  ; broadcast the lowest of the four words
+    mova   [dstq+strideq*0+ 0], m0  ; first row of this group (64 bytes)
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*0+32], m0
+    mova   [dstq+strideq*0+48], m0
+    mova   [dstq+strideq*1+ 0], m1  ; second row
+    mova   [dstq+strideq*1+16], m1
+    mova   [dstq+strideq*1+32], m1
+    mova   [dstq+strideq*1+48], m1
+    mova   [dstq+strideq*2+ 0], m2  ; third row
+    mova   [dstq+strideq*2+16], m2
+    mova   [dstq+strideq*2+32], m2
+    mova   [dstq+strideq*2+48], m2
+    mova   [dstq+stride3q + 0], m3  ; fourth row
+    mova   [dstq+stride3q +16], m3
+    mova   [dstq+stride3q +32], m3
+    mova   [dstq+stride3q +48], m3
+    lea                   dstq, [dstq+strideq*4]  ; advance dst by four rows
+    dec                   cntd
+    jge .loop  ; jge rather than jg so the cnt == 0 iteration also runs
+    RET
+
+INIT_MMX mmxext  ; select MMX registers with the mmxext instruction set
+cglobal vp9_ipred_dc_4x4_16, 4, 4, 2, dst, stride, l, a  ; fills a 4x4 block of words with one value computed from l and a
+    mova                    m0, [lq]  ; four words of l
+    paddw                   m0, [aq]  ; plus four words of a, lane by lane
+    DEFINE_ARGS dst, stride, stride3  ; l and a are no longer referenced; third register becomes stride3
+    lea               stride3q, [strideq*3]  ; stride3 = 3 * stride
+    pmaddwd                 m0, [pw_1]  ; pw_1 is extern (presumably words of 1): sums adjacent word pairs into dwords
+    pshufw                  m1, m0, q3232  ; copy the high dword down
+    paddd                   m0, [pd_4]  ; add 4 before the shift by 3
+    paddd                   m0, m1  ; low dword now holds the total of all lanes plus 4
+    psrad                   m0, 3  ; divide by 8 (4 + 4 input words)
+    pshufw                  m0, m0, q0000  ; broadcast the low word to all four lanes
+    mova      [dstq+strideq*0], m0  ; row 0
+    mova      [dstq+strideq*1], m0  ; row 1
+    mova      [dstq+strideq*2], m0  ; row 2
+    mova      [dstq+stride3q ], m0  ; row 3
+    RET
+
+INIT_XMM sse2  ; select XMM registers with the sse2 instruction set
+cglobal vp9_ipred_dc_8x8_16, 4, 4, 2, dst, stride, l, a  ; fills an 8x8 block of words with one value computed from l and a
+    mova                    m0, [lq]  ; eight words of l
+    paddw                   m0, [aq]  ; plus eight words of a, lane by lane
+    DEFINE_ARGS dst, stride, stride3  ; l and a are no longer referenced; third register becomes stride3
+    lea               stride3q, [strideq*3]  ; stride3 = 3 * stride
+    pmaddwd                 m0, [pw_1]  ; pw_1 is extern (presumably words of 1): sums adjacent word pairs into dwords
+    pshufd                  m1, m0, q3232  ; copy the high two dwords down
+    paddd                   m0, m1  ; fold four dwords into two
+    pshufd                  m1, m0, q1111  ; copy dword 1 down
+    paddd                   m0, [pd_8]  ; add 8 before the shift by 4
+    paddd                   m0, m1  ; low dword now holds the total of all lanes plus 8
+    psrad                   m0, 4  ; divide by 16 (8 + 8 input words)
+    pshuflw                 m0, m0, q0000  ; broadcast the low word across the low half
+    punpcklqdq              m0, m0  ; and across the whole register
+    mova      [dstq+strideq*0], m0  ; row 0
+    mova      [dstq+strideq*1], m0  ; row 1
+    mova      [dstq+strideq*2], m0  ; row 2
+    mova      [dstq+stride3q ], m0  ; row 3
+    lea                   dstq, [dstq+strideq*4]  ; advance dst by four rows
+    mova      [dstq+strideq*0], m0  ; row 4
+    mova      [dstq+strideq*1], m0  ; row 5
+    mova      [dstq+strideq*2], m0  ; row 6
+    mova      [dstq+stride3q ], m0  ; row 7
+    RET
+
+INIT_XMM sse2  ; select XMM registers with the sse2 instruction set
+cglobal vp9_ipred_dc_16x16_16, 4, 4, 2, dst, stride, l, a  ; fills a 16x16 block of words with one value computed from l and a
+    mova                    m0, [lq]  ; accumulate 16 words of l and 16 words of a into eight word lanes
+    paddw                   m0, [lq+mmsize]
+    paddw                   m0, [aq]
+    paddw                   m0, [aq+mmsize]
+    DEFINE_ARGS dst, stride, stride3, cnt  ; l and a registers are reused as stride3 and cnt
+    lea               stride3q, [strideq*3]  ; stride3 = 3 * stride
+    mov                   cntd, 4  ; 4 iterations, 4 rows each
+    pmaddwd                 m0, [pw_1]  ; pw_1 is extern (presumably words of 1): sums adjacent word pairs into dwords
+    pshufd                  m1, m0, q3232  ; copy the high two dwords down
+    paddd                   m0, m1  ; fold four dwords into two
+    pshufd                  m1, m0, q1111  ; copy dword 1 down
+    paddd                   m0, [pd_16]  ; pd_16 is extern (presumably dwords of 16): rounding term for the shift by 5
+    paddd                   m0, m1  ; low dword now holds the total plus the rounding term
+    psrad                   m0, 5  ; divide by 32 (16 + 16 input words)
+    pshuflw                 m0, m0, q0000  ; broadcast the low word across the low half
+    punpcklqdq              m0, m0  ; and across the whole register
+.loop:
+    mova   [dstq+strideq*0+ 0], m0  ; row 4i
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*1+ 0], m0  ; row 4i + 1
+    mova   [dstq+strideq*1+16], m0
+    mova   [dstq+strideq*2+ 0], m0  ; row 4i + 2
+    mova   [dstq+strideq*2+16], m0
+    mova   [dstq+stride3q + 0], m0  ; row 4i + 3
+    mova   [dstq+stride3q +16], m0
+    lea                   dstq, [dstq+strideq*4]  ; advance dst by four rows
+    dec                   cntd
+    jg .loop
+    RET
+
+INIT_XMM sse2  ; select XMM registers with the sse2 instruction set
+cglobal vp9_ipred_dc_32x32_16, 4, 4, 2, dst, stride, l, a  ; fills a 32x32 block of words with one value computed from l and a
+    mova                    m0, [lq+mmsize*0]  ; accumulate 32 words of l and 32 words of a into eight word lanes
+    paddw                   m0, [lq+mmsize*1]
+    paddw                   m0, [lq+mmsize*2]
+    paddw                   m0, [lq+mmsize*3]
+    paddw                   m0, [aq+mmsize*0]
+    paddw                   m0, [aq+mmsize*1]
+    paddw                   m0, [aq+mmsize*2]
+    paddw                   m0, [aq+mmsize*3]
+    DEFINE_ARGS dst, stride, stride3, cnt  ; l and a registers are reused as stride3 and cnt
+    lea               stride3q, [strideq*3]  ; computed here, but the loop below only addresses strideq*0 and strideq*1
+    mov                   cntd, 16  ; 16 iterations, 2 rows each
+    pmaddwd                 m0, [pw_1]  ; pw_1 is extern (presumably words of 1): sums adjacent word pairs into dwords
+    pshufd                  m1, m0, q3232  ; copy the high two dwords down
+    paddd                   m0, m1  ; fold four dwords into two
+    pshufd                  m1, m0, q1111  ; copy dword 1 down
+    paddd                   m0, [pd_32]  ; pd_32 is extern (presumably dwords of 32): rounding term for the shift by 6
+    paddd                   m0, m1  ; low dword now holds the total plus the rounding term
+    psrad                   m0, 6  ; divide by 64 (32 + 32 input words)
+    pshuflw                 m0, m0, q0000  ; broadcast the low word across the low half
+    punpcklqdq              m0, m0  ; and across the whole register
+.loop:
+    mova   [dstq+strideq*0+ 0], m0  ; row 2i (64 bytes)
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*0+32], m0
+    mova   [dstq+strideq*0+48], m0
+    mova   [dstq+strideq*1+ 0], m0  ; row 2i + 1
+    mova   [dstq+strideq*1+16], m0
+    mova   [dstq+strideq*1+32], m0
+    mova   [dstq+strideq*1+48], m0
+    lea                   dstq, [dstq+strideq*2]  ; advance dst by two rows
+    dec                   cntd
+    jg .loop
+    RET
+
+%macro DC_1D_FNS 2
+INIT_MMX mmxext  ; select MMX registers with the mmxext instruction set
+cglobal vp9_ipred_dc_%1_4x4_16, 4, 4, 2, dst, stride, l, a  ; %1 (DC_1D_FNS parameter) is spliced into the function name
+    mova                    m0, [%2]  ; four words from the single edge named by DC_1D_FNS parameter %2
+    DEFINE_ARGS dst, stride, stride3  ; l and a are no longer referenced; third register becomes stride3
+    lea               stride3q, [strideq*3]  ; stride3 = 3 * stride
+    pmaddwd                 m0, [pw_1]  ; pw_1 is extern (presumably words of 1): sums adjacent word pairs into dwords
+    pshufw                  m1, m0, q3232  ; copy the high dword down
+    paddd                   m0, [pd_2]  ; add 2 before the shift by 2
+    paddd                   m0, m1  ; low dword now holds the total of all lanes plus 2
+    psrad                   m0, 2  ; divide by 4 (four input words)
+    pshufw                  m0, m0, q0000  ; broadcast the low word to all four lanes
+    mova      [dstq+strideq*0], m0  ; row 0
+    mova      [dstq+strideq*1], m0  ; row 1
+    mova      [dstq+strideq*2], m0  ; row 2
+    mova      [dstq+stride3q ], m0  ; row 3
+    RET
+
+INIT_XMM sse2  ; select XMM registers with the sse2 instruction set
+cglobal vp9_ipred_dc_%1_8x8_16, 4, 4, 2, dst, stride, l, a  ; %1 (DC_1D_FNS parameter) is spliced into the function name
+    mova                    m0, [%2]  ; eight words from the single edge named by DC_1D_FNS parameter %2
+    DEFINE_ARGS dst, stride, stride3  ; l and a are no longer referenced; third register becomes stride3
+    lea               stride3q, [strideq*3]  ; stride3 = 3 * stride
+    pmaddwd                 m0, [pw_1]  ; pw_1 is extern (presumably words of 1): sums adjacent word pairs into dwords
+    pshufd                  m1, m0, q3232  ; copy the high two dwords down
+    paddd                   m0, m1  ; fold four dwords into two
+    pshufd                  m1, m0, q1111  ; copy dword 1 down
+    paddd                   m0, [pd_4]  ; add 4 before the shift by 3
+    paddd                   m0, m1  ; low dword now holds the total of all lanes plus 4
+    psrad                   m0, 3  ; divide by 8 (eight input words)
+    pshuflw                 m0, m0, q0000  ; broadcast the low word across the low half
+    punpcklqdq              m0, m0  ; and across the whole register
+    mova      [dstq+strideq*0], m0  ; row 0
+    mova      [dstq+strideq*1], m0  ; row 1
+    mova      [dstq+strideq*2], m0  ; row 2
+    mova      [dstq+stride3q ], m0  ; row 3
+    lea                   dstq, [dstq+strideq*4]  ; advance dst by four rows
+    mova      [dstq+strideq*0], m0  ; row 4
+    mova      [dstq+strideq*1], m0  ; row 5
+    mova      [dstq+strideq*2], m0  ; row 6
+    mova      [dstq+stride3q ], m0  ; row 7
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_%1_16x16_16, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [%2]
+    paddw                   m0, [%2+mmsize]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 4
+    pmaddwd                 m0, [pw_1]
+    pshufd                  m1, m0, q3232
+    paddd                   m0, m1
+    pshufd                  m1, m0, q1111
+    paddd                   m0, [pd_8]
+    paddd                   m0, m1
+    psrad                   m0, 4
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m0
+    mova   [dstq+strideq*2+ 0], m0
+    mova   [dstq+strideq*2+16], m0
+    mova   [dstq+stride3q + 0], m0
+    mova   [dstq+stride3q +16], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_dc_%1_32x32_16, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [%2+mmsize*0]
+    paddw                   m0, [%2+mmsize*1]
+    paddw                   m0, [%2+mmsize*2]
+    paddw                   m0, [%2+mmsize*3]
+    DEFINE_ARGS dst, stride, cnt
+    mov                   cntd, 16
+    pmaddwd                 m0, [pw_1]
+    pshufd                  m1, m0, q3232
+    paddd                   m0, m1
+    pshufd                  m1, m0, q1111
+    paddd                   m0, [pd_16]
+    paddd                   m0, m1
+    psrad                   m0, 5
+    pshuflw                 m0, m0, q0000
+    punpcklqdq              m0, m0
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*0+32], m0
+    mova   [dstq+strideq*0+48], m0
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m0
+    mova   [dstq+strideq*1+32], m0
+    mova   [dstq+strideq*1+48], m0
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+DC_1D_FNS top,  aq
+DC_1D_FNS left, lq
+
+INIT_MMX mmxext
+cglobal vp9_ipred_tm_4x4_10, 4, 4, 6, dst, stride, l, a ; 4x4: row = left word + (above - word at a-2), clamped to [0, limit]
+    mova                    m5, [pw_1023]           ; upper clamp (pminsw operand below)
+.body:                                              ; also entered from vp9_ipred_tm_4x4_12 with its own limit in m5
+    mova                    m4, [aq]                ; 4 above words
+    mova                    m3, [lq]                ; 4 left words
+    movd                    m0, [aq-4]              ; dword ending right before the above row
+    pshufw                  m0, m0, q1111           ; broadcast its upper word, i.e. the word at aq-2
+    psubw                   m4, m0                  ; above - that word
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pshufw                  m0, m3, q3333           ; left word 3 feeds row 0 ...
+    pshufw                  m1, m3, q2222
+    pshufw                  m2, m3, q1111
+    pshufw                  m3, m3, q0000           ; ... and left word 0 feeds row 3
+    paddw                   m0, m4
+    paddw                   m1, m4
+    paddw                   m2, m4
+    paddw                   m3, m4
+    pxor                    m4, m4                  ; zero = lower clamp
+    pmaxsw                  m0, m4
+    pmaxsw                  m1, m4
+    pmaxsw                  m2, m4
+    pmaxsw                  m3, m4
+    pminsw                  m0, m5
+    pminsw                  m1, m5
+    pminsw                  m2, m5
+    pminsw                  m3, m5
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m1
+    mova      [dstq+strideq*2], m2
+    mova      [dstq+stride3q ], m3
+    RET
+
+cglobal vp9_ipred_tm_4x4_12, 4, 4, 6, dst, stride, l, a ; same prologue shape as the _10 entry; only the clamp limit differs
+    mova                    m5, [pw_4095]           ; upper clamp for this variant
+    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_4x4_10 %+ SUFFIX).body ; continue in the shared body of the _10 function
+
+INIT_XMM sse2
+cglobal vp9_ipred_tm_8x8_10, 4, 5, 7, dst, stride, l, a ; 8x8: row = left word + (above - word at a-2), clamped to [0, limit]
+    mova                    m4, [pw_1023]           ; upper clamp
+.body:                                              ; also entered from vp9_ipred_tm_8x8_12 with its own limit in m4
+    pxor                    m6, m6                  ; zero = lower clamp
+    mova                    m5, [aq]                ; 8 above words
+    movd                    m0, [aq-4]
+    pshuflw                 m0, m0, q1111           ; select the word at aq-2 ...
+    punpcklqdq              m0, m0                  ; ... and broadcast it to all 8 lanes
+    psubw                   m5, m0                  ; above - that word
+    DEFINE_ARGS dst, stride, l, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 1                   ; cnt = 1, 0: two passes of 4 rows
+.loop:
+    movh                    m3, [lq+cntq*8]         ; 4 left words for this pass (upper 4 first)
+    punpcklwd               m3, m3                  ; duplicate each word so pshufd can broadcast it
+    pshufd                  m0, m3, q3333           ; highest loaded left word -> first row of the pass
+    pshufd                  m1, m3, q2222
+    pshufd                  m2, m3, q1111
+    pshufd                  m3, m3, q0000
+    paddw                   m0, m5
+    paddw                   m1, m5
+    paddw                   m2, m5
+    paddw                   m3, m5
+    pmaxsw                  m0, m6
+    pmaxsw                  m1, m6
+    pmaxsw                  m2, m6
+    pmaxsw                  m3, m6
+    pminsw                  m0, m4
+    pminsw                  m1, m4
+    pminsw                  m2, m4
+    pminsw                  m3, m4
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m1
+    mova      [dstq+strideq*2], m2
+    mova      [dstq+stride3q ], m3
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jge .loop                                       ; repeat while cnt >= 0
+    RET
+
+cglobal vp9_ipred_tm_8x8_12, 4, 5, 7, dst, stride, l, a ; same prologue shape as the _10 entry; only the clamp limit differs
+    mova                    m4, [pw_4095]           ; upper clamp for this variant
+    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_8x8_10 %+ SUFFIX).body ; continue in the shared body of the _10 function
+
+INIT_XMM sse2
+cglobal vp9_ipred_tm_16x16_10, 4, 4, 8, dst, stride, l, a ; 16x16: row = left word + (above - word at a-2), clamped to [0, limit]
+    mova                    m7, [pw_1023]           ; upper clamp
+.body:                                              ; also entered from vp9_ipred_tm_16x16_12 with its own limit in m7
+    pxor                    m6, m6                  ; zero = lower clamp
+    mova                    m4, [aq]                ; above words 0-7
+    mova                    m5, [aq+mmsize]         ; above words 8-15
+    movd                    m0, [aq-4]
+    pshuflw                 m0, m0, q1111           ; select the word at aq-2 ...
+    punpcklqdq              m0, m0                  ; ... and broadcast it
+    psubw                   m4, m0
+    psubw                   m5, m0
+    DEFINE_ARGS dst, stride, l, cnt
+    mov                   cntd, 7                   ; cnt = 7..0: eight passes of 2 rows
+.loop:
+    movd                    m3, [lq+cntq*4]         ; 2 left words for this pass
+    punpcklwd               m3, m3
+    pshufd                  m2, m3, q1111           ; upper of the two -> first row of the pass
+    pshufd                  m3, m3, q0000           ; lower of the two -> second row
+    paddw                   m0, m2, m4              ; first row, left half
+    paddw                   m2, m5                  ; first row, right half
+    paddw                   m1, m3, m4              ; second row, left half
+    paddw                   m3, m5                  ; second row, right half
+    pmaxsw                  m0, m6
+    pmaxsw                  m2, m6
+    pmaxsw                  m1, m6
+    pmaxsw                  m3, m6
+    pminsw                  m0, m7
+    pminsw                  m2, m7
+    pminsw                  m1, m7
+    pminsw                  m3, m7
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m2
+    mova   [dstq+strideq*1+ 0], m1
+    mova   [dstq+strideq*1+16], m3
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntd
+    jge .loop                                       ; repeat while cnt >= 0
+    RET
+
+cglobal vp9_ipred_tm_16x16_12, 4, 4, 8, dst, stride, l, a ; same prologue shape as the _10 entry; only the clamp limit differs
+    mova                    m7, [pw_4095]           ; upper clamp for this variant
+    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_16x16_10 %+ SUFFIX).body ; continue in the shared body of the _10 function
+
+INIT_XMM sse2
+cglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a ; 32x32 TM; 32 bytes of stack requested on x86-32 only
+    mova                    m0, [pw_1023]           ; upper clamp
+.body:                                              ; also entered from vp9_ipred_tm_32x32_12 with its own limit in m0
+    pxor                    m1, m1                  ; zero = lower clamp
+%if ARCH_X86_64
+    SWAP                     0, 8                   ; keep the limit in m8, freeing m0
+    SWAP                     1, 9                   ; keep zero in m9, freeing m1
+%define reg_min m9
+%define reg_max m8
+%else
+    mova              [rsp+ 0], m0                  ; no m8/m9 here: keep the limit on the stack
+    mova              [rsp+16], m1                  ; ... and zero next to it
+%define reg_min [rsp+16]
+%define reg_max [rsp+ 0]
+%endif
+
+    mova                    m4, [aq+mmsize*0]       ; above words 0-7
+    mova                    m5, [aq+mmsize*1]       ; above words 8-15
+    mova                    m6, [aq+mmsize*2]       ; above words 16-23
+    mova                    m7, [aq+mmsize*3]       ; above words 24-31
+    movd                    m0, [aq-4]
+    pshuflw                 m0, m0, q1111           ; select the word at aq-2 ...
+    punpcklqdq              m0, m0                  ; ... and broadcast it
+    psubw                   m4, m0
+    psubw                   m5, m0
+    psubw                   m6, m0
+    psubw                   m7, m0
+    DEFINE_ARGS dst, stride, l, cnt
+    mov                   cntd, 31                  ; cnt = 31..0: one row per pass
+.loop:
+    pinsrw                  m3, [lq+cntq*2], 0      ; left word for this row into lane 0
+    punpcklwd               m3, m3
+    pshufd                  m3, m3, q0000           ; broadcast it to all 8 lanes
+    paddw                   m0, m3, m4
+    paddw                   m1, m3, m5
+    paddw                   m2, m3, m6
+    paddw                   m3, m7
+    pmaxsw                  m0, reg_min
+    pmaxsw                  m1, reg_min
+    pmaxsw                  m2, reg_min
+    pmaxsw                  m3, reg_min
+    pminsw                  m0, reg_max
+    pminsw                  m1, reg_max
+    pminsw                  m2, reg_max
+    pminsw                  m3, reg_max
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m1
+    mova   [dstq+strideq*0+32], m2
+    mova   [dstq+strideq*0+48], m3
+    add                   dstq, strideq
+    dec                   cntd
+    jge .loop                                       ; repeat while cnt >= 0
+    RET
+
+cglobal vp9_ipred_tm_32x32_12, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a ; same prologue as the _10 entry (incl. stack); only the limit differs
+    mova                    m0, [pw_4095]           ; upper clamp for this variant
+    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body ; continue in the shared body of the _10 function
+
+; Directional intra prediction functions
+;
+; in the functions below, 'abcdefgh' refers to above data (sometimes simply
+; abbreviated as a[N-M]). 'stuvwxyz' refers to left data (sometimes simply
+; abbreviated as l[N-M]). * is top-left data. ABCDEFG or A[N-M] is filtered
+; above data, STUVWXYZ or L[N-M] is filtered left data, and # is filtered
+; top-left data.
+
+; left=(left+2*center+right+2)>>2
+%macro LOWPASS 3 ; left [dst], center, right (register numbers, not names)
+    paddw                  m%1, m%3                 ; left + right
+    psraw                  m%1, 1                   ; (left + right) >> 1
+    pavgw                  m%1, m%2                 ; (that + center + 1) >> 1, which equals the formula above
+%endmacro
+
+; abcdefgh (src) -> bcdefghh (dst): drop the first word, repeat the last one
+; dst/src can be the same register
+%macro SHIFT_RIGHT 2-3 [pb_2to15_14_15] ; dst, src, [ssse3_shift_reg]; the third argument is only expanded when ssse3 is enabled
+%if cpuflag(ssse3)
+    pshufb                  %1, %2, %3              ; abcdefgh -> bcdefghh
+%else
+    psrldq                  %1, %2, 2               ; abcdefgh -> bcdefgh.
+    pshufhw                 %1, %1, q2210           ; bcdefgh. -> bcdefghh
+%endif
+%endmacro
+
+; abcdefgh (src) -> bcdefghh (dst1) and cdefghhh (dst2)
+%macro SHIFT_RIGHTx2 3-4 [pb_2to15_14_15] ; dst1, dst2, src, [ssse3_shift_reg]; the fourth argument is only expanded when ssse3 is enabled
+%if cpuflag(ssse3)
+    pshufb                  %1, %3, %4              ; abcdefgh -> bcdefghh
+    pshufb                  %2, %1, %4              ; bcdefghh -> cdefghhh
+%else
+    psrldq                  %1, %3, 2               ; abcdefgh -> bcdefgh.
+    psrldq                  %2, %3, 4               ; abcdefgh -> cdefgh..
+    pshufhw                 %1, %1, q2210           ; bcdefgh. -> bcdefghh
+    pshufhw                 %2, %2, q1110           ; cdefgh.. -> cdefghhh
+%endif
+%endmacro
+
+%macro DL_FUNCS 0 ; emits the dl 4x4/8x8/16x16/32x32 functions for whichever instruction set is active at expansion
+cglobal vp9_ipred_dl_4x4_16, 2, 4, 3, dst, stride, l, a ; 4x4: each row is the filtered above row shifted one more word
+    movifnidn               aq, amp                 ; only 2 args are auto-loaded; fetch 'a' if it is not already in its register
+    movu                    m1, [aq]                ; abcdefgh
+    pshufhw                 m0, m1, q3310           ; abcdefhh
+    SHIFT_RIGHT             m1, m1                  ; bcdefghh
+    psrldq                  m2, m1, 2               ; cdefghh.
+    LOWPASS                  0,  1,  2              ; BCDEFGh.
+    pshufd                  m1, m0, q3321           ; DEFGh...
+    movh      [dstq+strideq*0], m0                  ; row 0
+    movh      [dstq+strideq*2], m1                  ; row 2
+    add                   dstq, strideq
+    psrldq                  m0, 2                   ; CDEFGh..
+    psrldq                  m1, 2                   ; EFGh....
+    movh      [dstq+strideq*0], m0                  ; row 1
+    movh      [dstq+strideq*2], m1                  ; row 3
+    RET
+
+cglobal vp9_ipred_dl_8x8_16, 2, 4, 5, dst, stride, l, a ; 8x8: rows are successive one-word shifts of the filtered above row
+    movifnidn               aq, amp
+    mova                    m0, [aq]                ; abcdefgh
+%if cpuflag(ssse3)
+    mova                    m4, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2           m1, m2, m0, m4          ; bcdefghh/cdefghhh
+    LOWPASS                  0,  1,  2              ; BCDEFGHh
+    shufps                  m1, m0, m2, q3332       ; FGHhhhhh
+    shufps                  m3, m0, m1, q2121       ; DEFGHhhh
+    DEFINE_ARGS dst, stride, stride5
+    lea               stride5q, [strideq*5]
+
+    mova      [dstq+strideq*0], m0                  ; row 0
+    mova      [dstq+strideq*4], m1                  ; row 4
+    SHIFT_RIGHT             m0, m0, m4              ; CDEFGHhh
+    pshuflw                 m1, m1, q3321           ; GHhhhhhh
+    pshufd                  m2, m0, q3321           ; EFGHhhhh
+    mova      [dstq+strideq*1], m0                  ; row 1
+    mova      [dstq+stride5q ], m1                  ; row 5
+    lea                   dstq, [dstq+strideq*2]
+    pshuflw                 m1, m1, q3321           ; Hhhhhhhh
+    mova      [dstq+strideq*0], m3                  ; row 2
+    mova      [dstq+strideq*4], m1                  ; row 6
+    pshuflw                 m1, m1, q3321           ; hhhhhhhh
+    mova      [dstq+strideq*1], m2                  ; row 3
+    mova      [dstq+stride5q ], m1                  ; row 7
+    RET
+
+cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a ; 16x16: a sliding window over the filtered above row, two rows per pass
+    movifnidn               aq, amp
+    mova                    m0, [aq]                ; abcdefgh
+    mova                    m3, [aq+mmsize]         ; ijklmnop
+    PALIGNR                 m1, m3, m0, 2, m4       ; bcdefghi
+    PALIGNR                 m2, m3, m0, 4, m4       ; cdefghij
+    LOWPASS                  0,  1,  2              ; BCDEFGHI
+%if cpuflag(ssse3)
+    mova                    m4, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2           m2, m1, m3, m4          ; jklmnopp/klmnoppp
+    LOWPASS                  1,  2,  3              ; JKLMNOPp
+    pshufd                  m2, m2, q3333           ; pppppppp
+    DEFINE_ARGS dst, stride, cnt
+    mov                   cntd, 8                   ; 8 passes, each writing row i and row i+8
+
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m1
+    mova   [dstq+strideq*8+ 0], m1                  ; row i+8 starts where row i's right half starts
+    mova   [dstq+strideq*8+16], m2                  ; ... and ends with the replicated last sample
+    add                   dstq, strideq
+%if cpuflag(avx)
+    vpalignr                m0, m1, m0, 2           ; slide the left half by one word, pulling from m1
+%else
+    PALIGNR                 m3, m1, m0, 2, m4
+    mova                    m0, m3
+%endif
+    SHIFT_RIGHT             m1, m1, m4              ; slide the right half, repeating its last word
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_dl_32x32_16, 2, 5, 7, dst, stride, l, a ; 32x32: same sliding-window scheme, four rows per pass
+    movifnidn               aq, amp
+    mova                    m0, [aq+mmsize*0]       ; abcdefgh
+    mova                    m1, [aq+mmsize*1]       ; ijklmnop
+    mova                    m2, [aq+mmsize*2]       ; qrstuvwx
+    mova                    m3, [aq+mmsize*3]       ; yz012345
+    PALIGNR                 m4, m1, m0, 2, m6
+    PALIGNR                 m5, m1, m0, 4, m6
+    LOWPASS                  0,  4,  5              ; BCDEFGHI
+    PALIGNR                 m4, m2, m1, 2, m6
+    PALIGNR                 m5, m2, m1, 4, m6
+    LOWPASS                  1,  4,  5              ; JKLMNOPQ
+    PALIGNR                 m4, m3, m2, 2, m6
+    PALIGNR                 m5, m3, m2, 4, m6
+    LOWPASS                  2,  4,  5              ; RSTUVWXY
+%if cpuflag(ssse3)
+    mova                    m6, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2           m4, m5, m3, m6
+    LOWPASS                  3,  4,  5              ; Z0123455
+    pshufd                  m4, m4, q3333           ; 55555555
+    DEFINE_ARGS dst, stride, stride8, stride24, cnt
+    mov                   cntd, 8                   ; 8 passes, each writing rows i, i+8, i+16 and i+24
+    lea               stride8q, [strideq*8]
+    lea              stride24q, [stride8q*3]
+
+.loop:
+    mova  [dstq+stride8q*0+ 0], m0
+    mova  [dstq+stride8q*0+16], m1
+    mova  [dstq+stride8q*0+32], m2
+    mova  [dstq+stride8q*0+48], m3
+    mova  [dstq+stride8q*1+ 0], m1
+    mova  [dstq+stride8q*1+16], m2
+    mova  [dstq+stride8q*1+32], m3
+    mova  [dstq+stride8q*1+48], m4
+    mova  [dstq+stride8q*2+ 0], m2
+    mova  [dstq+stride8q*2+16], m3
+    mova  [dstq+stride8q*2+32], m4
+    mova  [dstq+stride8q*2+48], m4
+    mova  [dstq+stride24q + 0], m3
+    mova  [dstq+stride24q +16], m4
+    mova  [dstq+stride24q +32], m4
+    mova  [dstq+stride24q +48], m4
+    add                   dstq, strideq
+%if cpuflag(avx)
+    vpalignr                m0, m1, m0, 2           ; slide each window register by one word,
+    vpalignr                m1, m2, m1, 2           ; pulling the new word from its right-hand neighbour
+    vpalignr                m2, m3, m2, 2
+%else
+    PALIGNR                 m5, m1, m0, 2, m6
+    mova                    m0, m5
+    PALIGNR                 m5, m2, m1, 2, m6
+    mova                    m1, m5
+    PALIGNR                 m5, m3, m2, 2, m6
+    mova                    m2, m5
+%endif
+    SHIFT_RIGHT             m3, m3, m6              ; last register repeats its final word instead
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+DL_FUNCS
+INIT_XMM ssse3
+DL_FUNCS
+INIT_XMM avx
+DL_FUNCS
+
+%macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function (in units of mmsize; only used by the 32x32 function)
+cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a ; 4x4: filter left+top-left+above once, then emit rows bottom-up
+    movh                    m0, [lq]                ; wxyz....
+    movhps                  m0, [aq-2]              ; wxyz*abc
+    movd                    m1, [aq+6]              ; d.......
+    PALIGNR                 m1, m0, 2, m2           ; xyz*abcd
+    psrldq                  m2, m1, 2               ; yz*abcd.
+    LOWPASS                  0, 1, 2                ; XYZ#ABC.
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+
+    movh      [dstq+stride3q ], m0                  ; row 3 first; each higher row is shifted by one more word
+    psrldq                  m0, 2                   ; YZ#ABC..
+    movh      [dstq+strideq*2], m0
+    psrldq                  m0, 2                   ; Z#ABC...
+    movh      [dstq+strideq*1], m0
+    psrldq                  m0, 2                   ; #ABC....
+    movh      [dstq+strideq*0], m0
+    RET
+
+cglobal vp9_ipred_dr_8x8_16, 4, 4, 5, dst, stride, l, a ; 8x8: rows 3..0 and 7..4 are written in pairs through dst/dst4
+    mova                    m0, [lq]                ; stuvwxyz
+    movu                    m1, [aq-2]              ; *abcdefg
+    mova                    m2, [aq]                ; abcdefgh
+    psrldq                  m3, m2, 2               ; bcdefgh.
+    LOWPASS                  3,  2, 1               ; ABCDEFG.
+    PALIGNR                 m1, m0, 2, m4           ; tuvwxyz*
+    PALIGNR                 m2, m1, 2, m4           ; uvwxyz*a
+    LOWPASS                  2,  1, 0               ; TUVWXYZ#
+    DEFINE_ARGS dst, stride, dst4, stride3
+    lea               stride3q, [strideq*3]
+    lea                  dst4q, [dstq+strideq*4]    ; dst4 = start of row 4
+
+    movhps [dstq +stride3q +0], m2                  ; row 3, left half: upper 4 words of the filtered left/top-left
+    movh   [dstq+ stride3q +8], m3                  ; row 3, right half: first 4 filtered above words
+    mova   [dst4q+stride3q +0], m2                  ; row 7: the whole filtered left/top-left register
+    PALIGNR                 m1, m3, m2, 2, m0       ; slide the window one word towards the above data
+    psrldq                  m3, 2
+    movhps [dstq +strideq*2+0], m1                  ; row 2
+    movh   [dstq+ strideq*2+8], m3
+    mova   [dst4q+strideq*2+0], m1                  ; row 6
+    PALIGNR                 m2, m3, m1, 2, m0
+    psrldq                  m3, 2
+    movhps [dstq +strideq*1+0], m2                  ; row 1
+    movh   [dstq+ strideq*1+8], m3
+    mova   [dst4q+strideq*1+0], m2                  ; row 5
+    PALIGNR                 m1, m3, m2, 2, m0
+    psrldq                  m3, 2
+    movhps [dstq +strideq*0+0], m1                  ; row 0
+    movh   [dstq+ strideq*0+8], m3
+    mova   [dst4q+strideq*0+0], m1                  ; row 4
+    RET
+
+cglobal vp9_ipred_dr_16x16_16, 4, 4, 7, dst, stride, l, a ; 16x16: window over filtered left/top-left/above, written bottom-up
+    mova                    m0, [lq]                ; klmnopqr
+    mova                    m1, [lq+mmsize]         ; stuvwxyz
+    movu                    m2, [aq-2]              ; *abcdefg
+    movu                    m3, [aq+mmsize-2]       ; hijklmno
+    mova                    m4, [aq]                ; abcdefgh
+    mova                    m5, [aq+mmsize]         ; ijklmnop
+    psrldq                  m6, m5, 2               ; jklmnop.
+    LOWPASS                  6,  5, 3               ; IJKLMNO.
+    PALIGNR                 m5, m4, 2, m3           ; bcdefghi
+    LOWPASS                  5,  4, 2               ; ABCDEFGH
+    PALIGNR                 m2, m1, 2, m3           ; tuvwxyz*
+    PALIGNR                 m4, m2, 2, m3           ; uvwxyz*a
+    LOWPASS                  4,  2, 1               ; TUVWXYZ#
+    PALIGNR                 m1, m0, 2, m3           ; lmnopqrs
+    PALIGNR                 m2, m1, 2, m3           ; mnopqrst
+    LOWPASS                  2, 1, 0                ; LMNOPQRS
+    DEFINE_ARGS dst, stride, dst8, cnt
+    lea                  dst8q, [dstq+strideq*8]    ; start one past row 7; the loop pre-decrements
+    mov                   cntd, 8                   ; 8 passes: rows 7..0 together with rows 15..8
+
+.loop:
+    sub                  dst8q, strideq
+    mova  [dst8q+strideq*0+ 0], m4
+    mova  [dst8q+strideq*0+16], m5
+    mova  [dst8q+strideq*8+ 0], m2
+    mova  [dst8q+strideq*8+16], m4
+%if cpuflag(avx)
+    vpalignr                m2, m4, m2, 2           ; slide every window register by one word,
+    vpalignr                m4, m5, m4, 2           ; pulling from the register to its right
+    vpalignr                m5, m6, m5, 2
+%else
+    PALIGNR                 m0, m4, m2, 2, m1
+    mova                    m2, m0
+    PALIGNR                 m0, m5, m4, 2, m1
+    mova                    m4, m0
+    PALIGNR                 m0, m6, m5, 2, m1
+    mova                    m5, m0
+%endif
+    psrldq                  m6, 2                   ; consume one word of the remaining filtered above data
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_dr_32x32_16, 4, 5, 10 + notcpuflag(ssse3), \
+                               %1 * ARCH_X86_32 * -mmsize, dst, stride, l, a ; stack is requested on x86-32 only, %1 slots of mmsize
+    mova                    m0, [aq+mmsize*3]       ; a[24-31]
+    movu                    m1, [aq+mmsize*3-2]     ; a[23-30]
+    psrldq                  m2, m0, 2               ; a[25-31].
+    LOWPASS                  2,  0, 1               ; A[24-30].
+    mova                    m1, [aq+mmsize*2]       ; a[16-23]
+    movu                    m3, [aq+mmsize*2-2]     ; a[15-22]
+    PALIGNR                 m0, m1, 2, m4           ; a[17-24]
+    LOWPASS                  0,  1, 3               ; A[16-23]
+    mova                    m3, [aq+mmsize*1]       ; a[8-15]
+    movu                    m4, [aq+mmsize*1-2]     ; a[7-14]
+    PALIGNR                 m1, m3, 2, m5           ; a[9-16]
+    LOWPASS                  1,  3, 4               ; A[8-15]
+    mova                    m4, [aq+mmsize*0]       ; a[0-7]
+    movu                    m5, [aq+mmsize*0-2]     ; *a[0-6]
+    PALIGNR                 m3, m4, 2, m6           ; a[1-8]
+    LOWPASS                  3,  4, 5               ; A[0-7]
+    SCRATCH                  1,  8, rsp+0*mmsize    ; SCRATCH/UNSCRATCH are defined outside this chunk; presumably they park/restore
+    SCRATCH                  3,  9, rsp+1*mmsize    ; a register in the given high register (x86-64) or stack slot (x86-32) - confirm
+%if notcpuflag(ssse3)
+    SCRATCH                  0, 10, rsp+2*mmsize
+%endif
+    mova                    m6, [lq+mmsize*3]       ; l[24-31]
+    PALIGNR                 m5, m6, 2, m0           ; l[25-31]*
+    PALIGNR                 m4, m5, 2, m0           ; l[26-31]*a
+    LOWPASS                  4,  5, 6               ; L[25-31]#
+    mova                    m7, [lq+mmsize*2]       ; l[16-23]
+    PALIGNR                 m6, m7, 2, m0           ; l[17-24]
+    PALIGNR                 m5, m6, 2, m0           ; l[18-25]
+    LOWPASS                  5,  6, 7               ; L[17-24]
+    mova                    m1, [lq+mmsize*1]       ; l[8-15]
+    PALIGNR                 m7, m1, 2, m0           ; l[9-16]
+    PALIGNR                 m6, m7, 2, m0           ; l[10-17]
+    LOWPASS                  6,  7, 1               ; L[9-16]
+    mova                    m3, [lq+mmsize*0]       ; l[0-7]
+    PALIGNR                 m1, m3, 2, m0           ; l[1-8]
+    PALIGNR                 m7, m1, 2, m0           ; l[2-9]
+    LOWPASS                  7,  1, 3               ; L[1-8]
+%if cpuflag(ssse3)
+%if cpuflag(avx)
+    UNSCRATCH                1,  8, rsp+0*mmsize    ; avx restores once here; the other paths restore inside the loop
+%endif
+    UNSCRATCH                3,  9, rsp+1*mmsize
+%else
+    UNSCRATCH                0, 10, rsp+2*mmsize
+%endif
+    DEFINE_ARGS dst8, stride, stride8, stride24, cnt
+    lea               stride8q, [strideq*8]
+    lea              stride24q, [stride8q*3]
+    lea                  dst8q, [dst8q+strideq*8]   ; start one past row 7; the loop pre-decrements
+    mov                   cntd, 8                   ; 8 passes: rows 7..0 plus the rows 8, 16 and 24 below each
+
+.loop:
+    sub                  dst8q, strideq
+%if notcpuflag(avx)
+    UNSCRATCH                1,  8, rsp+0*mmsize
+%if notcpuflag(ssse3)
+    UNSCRATCH                3,  9, rsp+1*mmsize
+%endif
+%endif
+    mova [dst8q+stride8q*0+ 0], m4
+    mova [dst8q+stride8q*0+16], m3
+    mova [dst8q+stride8q*0+32], m1
+    mova [dst8q+stride8q*0+48], m0
+    mova [dst8q+stride8q*1+ 0], m5
+    mova [dst8q+stride8q*1+16], m4
+    mova [dst8q+stride8q*1+32], m3
+    mova [dst8q+stride8q*1+48], m1
+    mova [dst8q+stride8q*2+ 0], m6
+    mova [dst8q+stride8q*2+16], m5
+    mova [dst8q+stride8q*2+32], m4
+    mova [dst8q+stride8q*2+48], m3
+    mova [dst8q+stride24q + 0], m7
+    mova [dst8q+stride24q +16], m6
+    mova [dst8q+stride24q +32], m5
+    mova [dst8q+stride24q +48], m4
+%if cpuflag(avx)
+    vpalignr                m7, m6, m7, 2           ; slide the 8-register window by one word,
+    vpalignr                m6, m5, m6, 2           ; each register pulling from the next one in the chain
+    vpalignr                m5, m4, m5, 2
+    vpalignr                m4, m3, m4, 2
+    vpalignr                m3, m1, m3, 2
+    vpalignr                m1, m0, m1, 2
+    vpalignr                m0, m2, m0, 2
+%else
+    SCRATCH                  2,  8, rsp+0*mmsize    ; free m2 as the PALIGNR destination
+%if notcpuflag(ssse3)
+    SCRATCH                  0,  9, rsp+1*mmsize    ; sse2 PALIGNR also needs a temporary (m0)
+%endif
+    PALIGNR                 m2, m6, m7, 2, m0
+    mova                    m7, m2
+    PALIGNR                 m2, m5, m6, 2, m0
+    mova                    m6, m2
+    PALIGNR                 m2, m4, m5, 2, m0
+    mova                    m5, m2
+    PALIGNR                 m2, m3, m4, 2, m0
+    mova                    m4, m2
+    PALIGNR                 m2, m1, m3, 2, m0
+    mova                    m3, m2
+%if notcpuflag(ssse3)
+    UNSCRATCH                0,  9, rsp+1*mmsize
+    SCRATCH                  3,  9, rsp+1*mmsize    ; now m3 becomes the temporary
+%endif
+    PALIGNR                 m2, m0, m1, 2, m3
+    mova                    m1, m2
+    UNSCRATCH                2,  8, rsp+0*mmsize
+    SCRATCH                  1,  8, rsp+0*mmsize
+    PALIGNR                 m1, m2, m0, 2, m3
+    mova                    m0, m1
+%endif
+    psrldq                  m2, 2                   ; consume one word of the remaining filtered above data
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+DR_FUNCS 3
+INIT_XMM ssse3
+DR_FUNCS 2
+INIT_XMM avx
+DR_FUNCS 2
+
+%macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function
+cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a
+    movifnidn               aq, amp
+    movu                    m0, [aq]                ; abcdefgh
+    psrldq                  m1, m0, 2               ; bcdefgh.
+    psrldq                  m2, m0, 4               ; cdefgh..
+    LOWPASS                  2,  1, 0               ; BCDEFGH.
+    pavgw                   m1, m0                  ; ABCDEFG.
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+
+    movh      [dstq+strideq*0], m1
+    movh      [dstq+strideq*1], m2
+    psrldq                  m1, 2
+    psrldq                  m2, 2
+    movh      [dstq+strideq*2], m1
+    movh      [dstq+stride3q ], m2
+    RET
+
+cglobal vp9_ipred_vl_8x8_16, 2, 4, 4, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq]                ; abcdefgh
+%if cpuflag(ssse3)
+    mova                    m3, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2           m1, m2, m0, m3          ; bcdefghh/cdefghhh
+    LOWPASS                  2,  1, 0               ; BCDEFGHh
+    pavgw                   m1, m0                  ; ABCDEFGh
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+
+    mova      [dstq+strideq*0], m1
+    mova      [dstq+strideq*1], m2
+    SHIFT_RIGHT             m1, m1, m3
+    SHIFT_RIGHT             m2, m2, m3
+    mova      [dstq+strideq*2], m1
+    mova      [dstq+stride3q ], m2
+    lea                   dstq, [dstq+strideq*4]
+    SHIFT_RIGHT             m1, m1, m3
+    SHIFT_RIGHT             m2, m2, m3
+    mova      [dstq+strideq*0], m1
+    mova      [dstq+strideq*1], m2
+    SHIFT_RIGHT             m1, m1, m3
+    SHIFT_RIGHT             m2, m2, m3
+    mova      [dstq+strideq*2], m1
+    mova      [dstq+stride3q ], m2
+    RET
+
+cglobal vp9_ipred_vl_16x16_16, 2, 4, 6, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq]
+    mova                    m1, [aq+mmsize]
+    PALIGNR                 m2, m1, m0, 2, m3
+    PALIGNR                 m3, m1, m0, 4, m4
+    LOWPASS                  3,  2,  0
+    pavgw                   m2, m0
+%if cpuflag(ssse3)
+    mova                    m4, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2           m5, m0, m1, m4
+    LOWPASS                  0,  5,  1
+    pavgw                   m1, m5
+    DEFINE_ARGS dst, stride, cnt
+    mov                   cntd, 8
+
+.loop:
+    mova   [dstq+strideq*0+ 0], m2
+    mova   [dstq+strideq*0+16], m1
+    mova   [dstq+strideq*1+ 0], m3
+    mova   [dstq+strideq*1+16], m0
+    lea                   dstq, [dstq+strideq*2]
+%if cpuflag(avx)
+    vpalignr                m2, m1, m2, 2
+    vpalignr                m3, m0, m3, 2
+%else
+    PALIGNR                 m5, m1, m2, 2, m4
+    mova                    m2, m5
+    PALIGNR                 m5, m0, m3, 2, m4
+    mova                    m3, m5
+%endif
+    SHIFT_RIGHT             m1, m1, m4
+    SHIFT_RIGHT             m0, m0, m4
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_vl_32x32_16, 2, 5, 11, %1 * mmsize * ARCH_X86_32, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq+mmsize*0]
+    mova                    m1, [aq+mmsize*1]
+    mova                    m2, [aq+mmsize*2]
+    PALIGNR                 m6, m1, m0, 2, m5
+    PALIGNR                 m7, m1, m0, 4, m5
+    LOWPASS                  7,  6,  0
+    pavgw                   m6, m0
+    SCRATCH                  6,  8, rsp+0*mmsize
+    PALIGNR                 m4, m2, m1, 2, m0
+    PALIGNR                 m5, m2, m1, 4, m0
+    LOWPASS                  5,  4,  1
+    pavgw                   m4, m1
+    mova                    m0, [aq+mmsize*3]
+    PALIGNR                 m1, m0, m2, 2, m6
+    PALIGNR                 m3, m0, m2, 4, m6
+    LOWPASS                  3,  1,  2
+    pavgw                   m2, m1
+%if cpuflag(ssse3)
+    PRELOAD                 10, pb_2to15_14_15, shuf
+%endif
+    SHIFT_RIGHTx2           m6, m1, m0, reg_shuf
+    LOWPASS                  1,  6,  0
+    pavgw                   m0, m6
+%if ARCH_X86_64
+    pshufd                  m9, m6, q3333
+%endif
+%if cpuflag(avx)
+    UNSCRATCH                6,  8, rsp+0*mmsize
+%endif
+    DEFINE_ARGS dst, stride, cnt, stride16, stride17
+    mov              stride16q, strideq
+    mov                   cntd, 8
+    shl              stride16q, 4
+    lea              stride17q, [stride16q+strideq]
+
+    ; FIXME m8 is unused for avx, so we could save one register here for win64
+.loop:
+%if notcpuflag(avx)
+    UNSCRATCH                6,  8, rsp+0*mmsize
+%endif
+    mova   [dstq+strideq*0+ 0], m6
+    mova   [dstq+strideq*0+16], m4
+    mova   [dstq+strideq*0+32], m2
+    mova   [dstq+strideq*0+48], m0
+    mova   [dstq+strideq*1+ 0], m7
+    mova   [dstq+strideq*1+16], m5
+    mova   [dstq+strideq*1+32], m3
+    mova   [dstq+strideq*1+48], m1
+    mova   [dstq+stride16q+ 0], m4
+    mova   [dstq+stride16q+16], m2
+    mova   [dstq+stride16q+32], m0
+%if ARCH_X86_64
+    mova   [dstq+stride16q+48], m9
+%endif
+    mova   [dstq+stride17q+ 0], m5
+    mova   [dstq+stride17q+16], m3
+    mova   [dstq+stride17q+32], m1
+%if ARCH_X86_64
+    mova   [dstq+stride17q+48], m9
+%endif
+    lea                   dstq, [dstq+strideq*2]
+%if cpuflag(avx)
+    vpalignr                m6, m4, m6, 2
+    vpalignr                m4, m2, m4, 2
+    vpalignr                m2, m0, m2, 2
+    vpalignr                m7, m5, m7, 2
+    vpalignr                m5, m3, m5, 2
+    vpalignr                m3, m1, m3, 2
+%else
+    SCRATCH                  3,  8, rsp+0*mmsize
+%if notcpuflag(ssse3)
+    SCRATCH                  1, 10, rsp+1*mmsize
+%endif
+    PALIGNR                 m3, m4, m6, 2, m1
+    mova                    m6, m3
+    PALIGNR                 m3, m2, m4, 2, m1
+    mova                    m4, m3
+    PALIGNR                 m3, m0, m2, 2, m1
+    mova                    m2, m3
+    PALIGNR                 m3, m5, m7, 2, m1
+    mova                    m7, m3
+    UNSCRATCH                3,  8, rsp+0*mmsize
+    SCRATCH                  6,  8, rsp+0*mmsize
+%if notcpuflag(ssse3)
+    UNSCRATCH                1, 10, rsp+1*mmsize
+    SCRATCH                  7, 10, rsp+1*mmsize
+%endif
+    PALIGNR                 m6, m3, m5, 2, m7
+    mova                    m5, m6
+    PALIGNR                 m6, m1, m3, 2, m7
+    mova                    m3, m6
+%if notcpuflag(ssse3)
+    UNSCRATCH                7, 10, rsp+1*mmsize
+%endif
+%endif
+    SHIFT_RIGHT             m1, m1, reg_shuf
+    SHIFT_RIGHT             m0, m0, reg_shuf
+    dec                   cntd
+    jg .loop
+
+%if ARCH_X86_32
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+%assign %%n 0
+%rep 4
+    mova   [dstq+strideq*0+48], m0
+    mova   [dstq+strideq*1+48], m0
+    mova   [dstq+strideq*2+48], m0
+    mova   [dstq+stride3q +48], m0
+%if %%n < 3
+    lea                   dstq, [dstq+strideq*4]
+%endif
+%assign %%n (%%n+1)
+%endrep
+%endif
+    RET
+%endmacro
+
+INIT_XMM sse2
+VL_FUNCS 2                 ; NOTE(review): argument presumably counts mmsize stack slots (the macro body touches rsp+1*mmsize only under notcpuflag(ssse3)) - confirm at the macro header
+INIT_XMM ssse3
+VL_FUNCS 1
+INIT_XMM avx
+VL_FUNCS 1
+
+%macro VR_FUNCS 0
+cglobal vp9_ipred_vr_4x4_16, 4, 4, 3, dst, stride, l, a ; 4x4 VR predictor on word samples: builds four 8-byte rows at dst from the edge data at l and a
+    movu                    m0, [aq-2]              ; unaligned load starting one word before a
+    movhps                  m1, [lq]                ; 8 bytes at l go into the high half of m1
+    PALIGNR                 m0, m1, 10, m2          ; xyz*abcd
+    pslldq                  m1, m0, 2               ; .xyz*abc
+    pslldq                  m2, m0, 4               ; ..xyz*ab
+    LOWPASS                  2,  1, 0               ; ..YZ#ABC
+    pavgw                   m1, m0                  ; ....#ABC
+    DEFINE_ARGS dst, stride, stride3                ; l and a are not read again; the third register is renamed to stride3
+    lea               stride3q, [strideq*3]
+
+    movhps    [dstq+strideq*0], m1                  ; row 0: high half of the pavgw result
+    movhps    [dstq+strideq*1], m2                  ; row 1: high half of the LOWPASS result
+    shufps                  m0, m2, m1, q3210       ; m0 = low half of m2 combined with high half of m1
+%if cpuflag(ssse3)
+    pshufb                  m2, [pb_4_5_8to13_8x0]  ; single shuffle builds row 3 in the low half of m2
+%else
+    pshuflw                 m2, m2, q2222           ; broadcast word 2 across the low half
+    psrldq                  m2, 6                   ; then drop three words
+%endif
+    psrldq                  m0, 6                   ; drop three words so row 2 starts at word 3
+    movh      [dstq+strideq*2], m0                  ; row 2
+    movh      [dstq+stride3q ], m2                  ; row 3
+    RET
+
+cglobal vp9_ipred_vr_8x8_16, 4, 4, 5, dst, stride, l, a ; 8x8 VR predictor on word samples: eight 16-byte rows, even rows derived from m0 and odd rows from m3
+    movu                    m1, [aq-2]              ; *abcdefg
+    movu                    m2, [lq]                ; stuvwxyz
+    mova                    m0, [aq]                ; abcdefgh
+    PALIGNR                 m3, m1, m2, 14, m4      ; z*abcdef
+    LOWPASS                  3,  1,  0              ; m3 becomes the first odd row (stored as row 1)
+    pavgw                   m0, m1                  ; m0 becomes the first even row (stored as row 0)
+    PALIGNR                 m1, m2,  2, m4          ; tuvwxyz*
+    pslldq                  m4, m2,  2              ; .stuvwxy
+    LOWPASS                  4,  2,  1              ; m4: filtered l-side words, fed one word at a time into the rows below
+    DEFINE_ARGS dst, stride, stride3                ; l and a are not read again; the third register is renamed to stride3
+    lea               stride3q, [strideq*3]
+
+    mova      [dstq+strideq*0], m0                  ; row 0
+    mova      [dstq+strideq*1], m3                  ; row 1
+    PALIGNR                 m0, m4, 14, m1          ; next even row: top word of m4 enters at word 0, the rest moves up one word
+    pslldq                  m4, 2                   ; expose the next word of m4
+    PALIGNR                 m3, m4, 14, m1          ; next odd row, same scheme
+    pslldq                  m4, 2
+    mova      [dstq+strideq*2], m0                  ; row 2
+    mova      [dstq+stride3q ], m3                  ; row 3
+    lea                   dstq, [dstq+strideq*4]    ; continue with rows 4-7
+    PALIGNR                 m0, m4, 14, m1
+    pslldq                  m4, 2
+    PALIGNR                 m3, m4, 14, m1
+    pslldq                  m4, 2
+    mova      [dstq+strideq*0], m0                  ; row 4
+    mova      [dstq+strideq*1], m3                  ; row 5
+    PALIGNR                 m0, m4, 14, m1
+    pslldq                  m4, 2
+    PALIGNR                 m3, m4, 14, m4          ; m4 is also passed as the scratch operand; its value is not needed afterwards
+    mova      [dstq+strideq*2], m0                  ; row 6
+    mova      [dstq+stride3q ], m3                  ; row 7
+    RET
+
+cglobal vp9_ipred_vr_16x16_16, 4, 4, 8, dst, stride, l, a ; 16x16 VR predictor on word samples: 32-byte rows, two rows per loop iteration
+    movu                    m1, [aq-2]              ; *abcdefg
+    movu                    m2, [aq+mmsize-2]       ; hijklmno
+    mova                    m3, [aq]                ; abcdefgh
+    mova                    m4, [aq+mmsize]         ; ijklmnop
+    mova                    m5, [lq+mmsize]         ; stuvwxyz
+    PALIGNR                 m0, m1, m5, 14, m6      ; z*abcdef
+    movu                    m6, [aq+mmsize-4]       ; ghijklmn
+    LOWPASS                  6,  2,  4              ; m6: right half of the first odd row
+    pavgw                   m2, m4                  ; m2: right half of the first even row
+    LOWPASS                  0,  1,  3              ; m0: left half of the first odd row
+    pavgw                   m3, m1                  ; m3: left half of the first even row
+    PALIGNR                 m1, m5,  2, m7          ; tuvwxyz*
+    movu                    m7, [lq+mmsize-2]       ; rstuvwxy
+    LOWPASS                  1,  5,  7              ; m1: filtered upper 8 l-side words
+    movu                    m5, [lq+2]              ; lmnopqrs
+    pslldq                  m4, m5,  2              ; .lmnopqr
+    pslldq                  m7, m5,  4              ; ..lmnopq
+    LOWPASS                  5,  4,  7              ; m5: filtered lower l-side words
+    psrld                   m4, m1, 16              ; odd-indexed words of m1, one per dword
+    psrld                   m7, m5, 16              ; odd-indexed words of m5, one per dword
+    pand                    m1, [pd_65535]          ; even-indexed words of m1 (presumably a 0xffff-per-dword mask; constant defined elsewhere)
+    pand                    m5, [pd_65535]          ; even-indexed words of m5
+    packssdw                m7, m4                  ; m7 = odd words of m5 (low half) and of m1 (high half); feeds the even rows
+    packssdw                m5, m1                  ; m5 = even words likewise; feeds the odd rows. NOTE(review): signed saturation, assumes values below 0x8000
+    DEFINE_ARGS dst, stride, cnt                    ; l and a are not read again; the third register becomes the loop counter
+    mov                   cntd, 8                   ; 8 iterations x 2 rows
+
+.loop:
+    mova   [dstq+strideq*0+ 0], m3                  ; even row, left half
+    mova   [dstq+strideq*0+16], m2                  ; even row, right half
+    mova   [dstq+strideq*1+ 0], m0                  ; odd row, left half
+    mova   [dstq+strideq*1+16], m6                  ; odd row, right half
+    lea                   dstq, [dstq+strideq*2]
+    PALIGNR                 m2, m3, 14, m4          ; right half takes the top word of the left half
+    PALIGNR                 m3, m7, 14, m4          ; left half takes the top word of m7
+    pslldq                  m7, 2                   ; expose the next word of m7
+    PALIGNR                 m6, m0, 14, m4          ; same chain for the odd row
+    PALIGNR                 m0, m5, 14, m4
+    pslldq                  m5, 2
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_vr_32x32_16, 4, 5, 14, 6 * mmsize * ARCH_X86_32, dst, stride, l, a ; 32x32 VR predictor on word samples; the stack size expression is zero unless ARCH_X86_32
+    movu                    m0, [aq+mmsize*0-2]     ; *a[0-6]
+    movu                    m1, [aq+mmsize*1-2]     ; a[7-14]
+    movu                    m2, [aq+mmsize*2-2]     ; a[15-22]
+    movu                    m3, [aq+mmsize*3-2]     ; a[23-30]
+    mova                    m4, [aq+mmsize*3+0]     ; a[24-31]
+    movu                    m5, [aq+mmsize*3-4]     ; a[22-29]
+    LOWPASS                  5,  3,  4              ; A[23-30]
+    SCRATCH                  5,  8, rsp+0*mmsize    ; park it: the loops below read it from m8 on x86-64 and UNSCRATCH it from this rsp slot on x86-32 (SCRATCH is defined outside this chunk)
+    pavgw                   m3, m4
+    mova                    m4, [aq+mmsize*2+0]     ; a[16-23]
+    movu                    m6, [aq+mmsize*2-4]     ; a[14-21]
+    LOWPASS                  6,  2,  4              ; A[15-22]
+    SCRATCH                  6,  9, rsp+1*mmsize
+    pavgw                   m2, m4
+    mova                    m4, [aq+mmsize*1+0]     ; a[8-15]
+    movu                    m7, [aq+mmsize*1-4]     ; a[6-13]
+    LOWPASS                  7,  1,  4              ; A[7-14]
+    SCRATCH                  7, 10, rsp+2*mmsize
+    pavgw                   m1, m4
+    mova                    m4, [aq+mmsize*0+0]     ; a[0-7]
+    mova                    m5, [lq+mmsize*3+0]     ; l[24-31]
+    PALIGNR                 m6, m0, m5, 14, m7      ; l[31]*a[0-5]
+    LOWPASS                  6,  0,  4              ; #A[0-6]
+    SCRATCH                  6, 11, rsp+3*mmsize
+    pavgw                   m4, m0
+    PALIGNR                 m0, m5,  2, m7          ; l[25-31]*
+    movu                    m7, [lq+mmsize*3-2]     ; l[23-30]
+    LOWPASS                  0,  5,  7              ; L[24-31]
+    movu                    m5, [lq+mmsize*2-2]     ; l[15-22]
+    mova                    m7, [lq+mmsize*2+0]     ; l[16-23]
+    movu                    m6, [lq+mmsize*2+2]     ; l[17-24]
+    LOWPASS                  5,  7,  6              ; L[16-23]
+    psrld                   m7, m0, 16              ; odd-indexed words of L[24-31], one per dword
+    psrld                   m6, m5, 16              ; odd-indexed words of L[16-23]
+    pand                    m0, [pd_65535]          ; even-indexed words (presumably a 0xffff-per-dword mask; constant defined elsewhere)
+    pand                    m5, [pd_65535]
+    packssdw                m6, m7                  ; m6 = odd words of L[16-31]. NOTE(review): signed saturation, assumes values below 0x8000
+    packssdw                m5, m0                  ; m5 = even words of L[16-31]
+    SCRATCH                  5, 12, rsp+4*mmsize
+    SCRATCH                  6, 13, rsp+5*mmsize
+    movu                    m6, [lq+mmsize*1-2]     ; l[7-14]
+    mova                    m0, [lq+mmsize*1+0]     ; l[8-15]
+    movu                    m5, [lq+mmsize*1+2]     ; l[9-16]
+    LOWPASS                  6,  0,  5              ; L[8-15]
+    movu                    m0, [lq+mmsize*0+2]     ; l[1-8]
+    pslldq                  m5, m0,  2              ; .l[1-7]
+    pslldq                  m7, m0,  4              ; ..l[1-6]
+    LOWPASS                  0,  5,  7              ; filtered lower l-side words
+    psrld                   m5, m6, 16              ; same odd/even split as above, for the lower half of l
+    psrld                   m7, m0, 16
+    pand                    m6, [pd_65535]
+    pand                    m0, [pd_65535]
+    packssdw                m7, m5                  ; m7 = odd words; shifted into the row chain after m6 is consumed
+    packssdw                m0, m6                  ; m0 = even words; shifted into the m12 / m5 chain
+    UNSCRATCH                6, 13, rsp+5*mmsize    ; bring the odd words of L[16-31] back into m6
+    DEFINE_ARGS dst, stride, stride16, cnt, stride17
+    mov              stride16q, strideq
+    mov                   cntd, 8
+    shl              stride16q, 4                   ; stride16 = 16 * stride
+%if ARCH_X86_64
+    lea              stride17q, [stride16q+strideq] ; stride17 is only used by the x86-64 stores below
+%endif
+
+.loop:                                              ; 8 iterations, dst advances two rows each: writes rows 2i and 2i+16, plus 2i+1 and 2i+17 on x86-64
+    mova   [dstq+strideq*0+ 0], m4
+    mova   [dstq+strideq*0+16], m1
+    mova   [dstq+strideq*0+32], m2
+    mova   [dstq+strideq*0+48], m3
+%if ARCH_X86_64
+    mova   [dstq+strideq*1+ 0], m11
+    mova   [dstq+strideq*1+16], m10
+    mova   [dstq+strideq*1+32], m9
+    mova   [dstq+strideq*1+48], m8
+%endif
+    mova   [dstq+stride16q+ 0], m6                  ; row 16 reuses the row 0 registers, displaced by one 16-byte column
+    mova   [dstq+stride16q+16], m4
+    mova   [dstq+stride16q+32], m1
+    mova   [dstq+stride16q+48], m2
+%if ARCH_X86_64
+    mova   [dstq+stride17q+ 0], m12
+    mova   [dstq+stride17q+16], m11
+    mova   [dstq+stride17q+32], m10
+    mova   [dstq+stride17q+48], m9
+%endif
+    lea                   dstq, [dstq+strideq*2]
+    PALIGNR                 m3, m2,  14, m5         ; each register takes the top word of its left neighbour
+    PALIGNR                 m2, m1,  14, m5
+    PALIGNR                 m1, m4,  14, m5
+    PALIGNR                 m4, m6,  14, m5
+    PALIGNR                 m6, m7,  14, m5
+    pslldq                  m7, 2
+%if ARCH_X86_64
+    PALIGNR                 m8, m9,  14, m5
+    PALIGNR                 m9, m10, 14, m5
+    PALIGNR                m10, m11, 14, m5
+    PALIGNR                m11, m12, 14, m5
+    PALIGNR                m12, m0,  14, m5
+    pslldq                  m0, 2
+%endif
+    dec                   cntd
+    jg .loop
+
+%if ARCH_X86_32
+    UNSCRATCH                5, 12, rsp+4*mmsize    ; x86-32 only: second pass for the odd rows skipped by the loop above
+    UNSCRATCH                4, 11, rsp+3*mmsize
+    UNSCRATCH                3, 10, rsp+2*mmsize
+    UNSCRATCH                2,  9, rsp+1*mmsize
+    UNSCRATCH                1,  8, rsp+0*mmsize
+    mov                   dstq, dstm                ; presumably reloads the original dst argument (dstm is provided by x86inc)
+    mov                   cntd, 8
+    add                   dstq, strideq             ; start one row below the reloaded dst
+.loop2:
+    mova   [dstq+strideq*0+ 0], m4
+    mova   [dstq+strideq*0+16], m3
+    mova   [dstq+strideq*0+32], m2
+    mova   [dstq+strideq*0+48], m1
+    mova   [dstq+stride16q+ 0], m5
+    mova   [dstq+stride16q+16], m4
+    mova   [dstq+stride16q+32], m3
+    mova   [dstq+stride16q+48], m2
+    lea                   dstq, [dstq+strideq*2]
+    PALIGNR                 m1, m2,  14, m6
+    PALIGNR                 m2, m3,  14, m6
+    PALIGNR                 m3, m4,  14, m6
+    PALIGNR                 m4, m5,  14, m6
+    PALIGNR                 m5, m0,  14, m6         ; m0 was left untouched by the first loop on x86-32
+    pslldq                  m0, 2
+    dec                   cntd
+    jg .loop2
+%endif
+    RET
+%endmacro
+
+INIT_XMM sse2
+VR_FUNCS                   ; expands the four vp9_ipred_vr_*_16 functions with sse2 selected
+INIT_XMM ssse3
+VR_FUNCS                   ; same functions with ssse3 selected (takes the cpuflag(ssse3) branches)
+INIT_XMM avx
+VR_FUNCS                   ; same functions with avx selected
+
+%macro HU_FUNCS 1 ; stack_mem_for_32x32_32bit_function
+cglobal vp9_ipred_hu_4x4_16, 3, 3, 3, dst, stride, l, a ; 4x4 HU predictor on word samples: four 8-byte rows built from l only (aq is never referenced)
+    movh                    m0, [lq]                ; abcd
+%if cpuflag(ssse3)
+    pshufb                  m0, [pb_0to7_67x4]      ; abcddddd
+%else
+    punpcklqdq              m0, m0                  ; duplicate the low qword into the high half
+    pshufhw                 m0, m0, q3333           ; abcddddd
+%endif
+    psrldq                  m1, m0,  2              ; bcddddd.
+    psrldq                  m2, m0,  4              ; cddddd..
+    LOWPASS                  2,  1,  0              ; BCDddd..
+    pavgw                   m1, m0                  ; abcddddd
+    SBUTTERFLY          wd,  1,  2,  0              ; aBbCcDdd, dddddddd
+    PALIGNR                 m2, m1,  4, m0          ; bCcDdddd
+    DEFINE_ARGS dst, stride, stride3                ; l is not read again; the third register is renamed to stride3
+    lea               stride3q, [strideq*3]
+
+    movh      [dstq+strideq*0], m1                  ; aBbC
+    movh      [dstq+strideq*1], m2                  ; bCcD
+    movhps    [dstq+strideq*2], m1                  ; cDdd
+    movhps    [dstq+stride3q ], m2                  ; dddd
+    RET
+
+cglobal vp9_ipred_hu_8x8_16, 3, 3, 4, dst, stride, l, a ; 8x8 HU predictor on word samples: even rows first, then odd rows after stepping dst by one stride (aq is never referenced)
+    mova                    m0, [lq]
+%if cpuflag(ssse3)
+    mova                    m3, [pb_2to15_14_15]    ; constant handed to SHIFT_RIGHTx2 on ssse3 and later
+%endif
+    SHIFT_RIGHTx2           m1, m2, m0, m3          ; NOTE(review): macro defined outside this chunk; by analogy with the 4x4 version presumably m1/m2 = m0 shifted by one/two words with the last word repeated - confirm
+    LOWPASS                  2,  1,  0
+    pavgw                   m1, m0
+    SBUTTERFLY          wd,  1,  2,  0              ; interleave pavgw and LOWPASS results into m1 (low part) and m2 (high part)
+    shufps                  m0, m1, m2, q1032       ; m0 = high half of m1 followed by low half of m2
+    pshufd                  m3, m2, q3332           ; m3 = dwords 2,3,3,3 of m2
+    DEFINE_ARGS dst, stride, stride3                ; l is not read again; the third register is renamed to stride3
+    lea               stride3q, [strideq*3]
+
+    mova     [dstq+strideq *0], m1                  ; row 0
+    mova     [dstq+strideq *2], m0                  ; row 2
+    mova     [dstq+strideq *4], m2                  ; row 4
+    mova     [dstq+stride3q*2], m3                  ; row 6
+    add                   dstq, strideq             ; switch to the odd rows
+%if cpuflag(avx)
+    vpalignr                m1, m2, m1, 4           ; advance the m1:m2 sequence by two words
+%else
+    PALIGNR                 m0, m2, m1, 4, m3
+    mova                    m1, m0
+%endif
+    pshufd                  m2, m2, q3321           ; m2 = dwords 1,2,3,3 of m2
+    shufps                  m0, m1, m2, q1032
+    pshufd                  m3, m2, q3332
+    mova     [dstq+strideq *0], m1                  ; row 1
+    mova     [dstq+strideq *2], m0                  ; row 3
+    mova     [dstq+strideq *4], m2                  ; row 5
+    mova     [dstq+stride3q*2], m3                  ; row 7
+    RET
+
+cglobal vp9_ipred_hu_16x16_16, 3, 4, 6 + notcpuflag(ssse3), dst, stride, l, a ; 16x16 HU predictor on word samples; one more xmm register is requested without ssse3 (m6 appears only as PALIGNR scratch in the non-avx path)
+    mova                    m0, [lq]
+    mova                    m3, [lq+mmsize]
+    movu                    m1, [lq+2]
+    movu                    m2, [lq+4]
+    LOWPASS                  2,  1,  0
+    pavgw                   m1, m0
+    SBUTTERFLY           wd, 1,  2,  0              ; interleave pavgw and LOWPASS results for the first 8 words of l
+%if cpuflag(ssse3)
+    mova                    m5, [pb_2to15_14_15]    ; constant handed to SHIFT_RIGHTx2 on ssse3 and later
+%endif
+    SHIFT_RIGHTx2           m0, m4, m3, m5          ; NOTE(review): macro defined outside this chunk; presumably m0/m4 = m3 shifted by one/two words with the last word repeated - confirm
+    LOWPASS                  4,  0,  3
+    pavgw                   m3, m0
+    SBUTTERFLY           wd, 3,  4,  5              ; same interleave for the second 8 words of l
+    pshufd                  m0, m0, q3333           ; broadcast the top dword of m0; used as the fill vector below
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 4                   ; 4 iterations x 4 rows
+
+.loop:                                              ; iteration i writes rows i, i+4, i+8 and i+12
+    mova  [dstq+strideq *0+ 0], m1
+    mova  [dstq+strideq *0+16], m2
+    mova  [dstq+strideq *4+ 0], m2
+    mova  [dstq+strideq *4+16], m3
+    mova  [dstq+strideq *8+ 0], m3
+    mova  [dstq+strideq *8+16], m4
+    mova  [dstq+stride3q*4+ 0], m4
+    mova  [dstq+stride3q*4+16], m0
+    add                   dstq, strideq
+%if cpuflag(avx)
+    vpalignr                m1, m2, m1, 4           ; advance the m1:m2:m3:m4:m0 sequence by two words
+    vpalignr                m2, m3, m2, 4
+    vpalignr                m3, m4, m3, 4
+    vpalignr                m4, m0, m4, 4
+%else
+    PALIGNR                 m5, m2, m1, 4, m6       ; same advance without a 3-operand palignr: build in m5, then copy back
+    mova                    m1, m5
+    PALIGNR                 m5, m3, m2, 4, m6
+    mova                    m2, m5
+    PALIGNR                 m5, m4, m3, 4, m6
+    mova                    m3, m5
+    PALIGNR                 m5, m0, m4, 4, m6
+    mova                    m4, m5
+%endif
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_hu_32x32_16, 3, 7, 10 + notcpuflag(ssse3), \
+                               %1 * -mmsize * ARCH_X86_32, dst, stride, l, a ; 32x32 HU predictor on word samples; the stack size expression is zero unless ARCH_X86_32 (aq is never referenced)
+    mova                    m2, [lq+mmsize*0+0]
+    movu                    m1, [lq+mmsize*0+2]
+    movu                    m0, [lq+mmsize*0+4]
+    LOWPASS                  0,  1,  2
+    pavgw                   m1, m2
+    SBUTTERFLY           wd, 1,  0,  2              ; interleave pavgw and LOWPASS results for l words 0-7
+    SCRATCH                  1,  8, rsp+0*mmsize    ; park the first vector; the loop fetches it via SWAP 1, 8 on x86-64 or from this rsp slot on x86-32
+    mova                    m4, [lq+mmsize*1+0]
+    movu                    m3, [lq+mmsize*1+2]
+    movu                    m2, [lq+mmsize*1+4]
+    LOWPASS                  2,  3,  4
+    pavgw                   m3, m4
+    SBUTTERFLY           wd, 3,  2,  4              ; same for l words 8-15
+    mova                    m6, [lq+mmsize*2+0]
+    movu                    m5, [lq+mmsize*2+2]
+    movu                    m4, [lq+mmsize*2+4]
+    LOWPASS                  4,  5,  6
+    pavgw                   m5, m6
+    SBUTTERFLY           wd, 5,  4,  6              ; same for l words 16-23
+    mova                    m7, [lq+mmsize*3+0]
+    SCRATCH                  0,  9, rsp+1*mmsize    ; free m0 so it can carry the SHIFT_RIGHTx2 constant / scratch
+%if cpuflag(ssse3)
+    mova                    m0, [pb_2to15_14_15]
+%endif
+    SHIFT_RIGHTx2           m1, m6, m7, m0          ; NOTE(review): macro defined outside this chunk; presumably m1/m6 = m7 shifted by one/two words with the last word repeated - confirm
+    LOWPASS                  6,  1,  7
+    pavgw                   m7, m1
+    SBUTTERFLY           wd, 7,  6,  0              ; same for l words 24-31
+    pshufd                  m1, m1, q3333           ; broadcast the top dword of m1; used as the fill vector below
+    UNSCRATCH                0,  9, rsp+1*mmsize
+    DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
+    lea               stride3q, [strideq*3]
+    lea               stride4q, [strideq*4]
+    lea              stride28q, [stride4q*8]
+    lea              stride20q, [stride4q*5]
+    sub              stride28q, stride4q            ; stride28 = 32*stride - 4*stride
+    mov                   cntd, 4                   ; 4 iterations x 8 rows
+
+.loop:                                              ; iteration i writes rows i, i+4, ..., i+28
+%if ARCH_X86_64
+    SWAP                     1,  8                  ; bring the parked first vector into m1; the fill vector moves to m8
+%else
+    mova        [rsp+1*mmsize], m1                  ; x86-32: same exchange through stack slots 1 and 0
+    mova                    m1, [rsp+0*mmsize]
+%endif
+    mova  [dstq+strideq *0+ 0], m1
+    mova  [dstq+strideq *0+16], m0
+    mova  [dstq+strideq *0+32], m3
+    mova  [dstq+strideq *0+48], m2
+    mova  [dstq+stride4q*1+ 0], m0
+    mova  [dstq+stride4q*1+16], m3
+    mova  [dstq+stride4q*1+32], m2
+    mova  [dstq+stride4q*1+48], m5
+    mova  [dstq+stride4q*2+ 0], m3
+    mova  [dstq+stride4q*2+16], m2
+    mova  [dstq+stride4q*2+32], m5
+    mova  [dstq+stride4q*2+48], m4
+%if cpuflag(avx)
+    vpalignr                m1, m0, m1, 4           ; advance the first three vectors of the sequence by two words
+    vpalignr                m0, m3, m0, 4
+    vpalignr                m3, m2, m3, 4
+%else
+    SCRATCH                  6,  9, rsp+2*mmsize    ; free m6 as the build register
+%if notcpuflag(ssse3)
+    SCRATCH                  7, 10, rsp+3*mmsize    ; free m7 as PALIGNR scratch (only done without ssse3)
+%endif
+    PALIGNR                 m6, m0, m1, 4, m7
+    mova                    m1, m6
+    PALIGNR                 m6, m3, m0, 4, m7
+    mova                    m0, m6
+    PALIGNR                 m6, m2, m3, 4, m7
+    mova                    m3, m6
+    UNSCRATCH                6,  9, rsp+2*mmsize
+    SCRATCH                  0,  9, rsp+2*mmsize    ; park the updated m0/m3: they serve as build/scratch registers in the second half
+%if notcpuflag(ssse3)
+    UNSCRATCH                7, 10, rsp+3*mmsize
+    SCRATCH                  3, 10, rsp+3*mmsize
+%endif
+%endif
+%if ARCH_X86_64
+    SWAP                     1,  8                  ; park the updated first vector again and get the fill vector back in m1
+%else
+    mova        [rsp+0*mmsize], m1
+    mova                    m1, [rsp+1*mmsize]
+%endif
+    mova  [dstq+stride3q*4+ 0], m2
+    mova  [dstq+stride3q*4+16], m5
+    mova  [dstq+stride3q*4+32], m4
+    mova  [dstq+stride3q*4+48], m7
+    mova  [dstq+stride4q*4+ 0], m5
+    mova  [dstq+stride4q*4+16], m4
+    mova  [dstq+stride4q*4+32], m7
+    mova  [dstq+stride4q*4+48], m6
+    mova  [dstq+stride20q + 0], m4
+    mova  [dstq+stride20q +16], m7
+    mova  [dstq+stride20q +32], m6
+    mova  [dstq+stride20q +48], m1
+    mova  [dstq+stride3q*8+ 0], m7
+    mova  [dstq+stride3q*8+16], m6
+    mova  [dstq+stride3q*8+32], m1
+    mova  [dstq+stride3q*8+48], m1
+    mova  [dstq+stride28q + 0], m6
+    mova  [dstq+stride28q +16], m1
+    mova  [dstq+stride28q +32], m1
+    mova  [dstq+stride28q +48], m1
+%if cpuflag(avx)
+    vpalignr                m2, m5, m2, 4           ; advance the remaining vectors; m1 (fill) shifts into m6
+    vpalignr                m5, m4, m5, 4
+    vpalignr                m4, m7, m4, 4
+    vpalignr                m7, m6, m7, 4
+    vpalignr                m6, m1, m6, 4
+%else
+    PALIGNR                 m0, m5, m2, 4, m3
+    mova                    m2, m0
+    PALIGNR                 m0, m4, m5, 4, m3
+    mova                    m5, m0
+    PALIGNR                 m0, m7, m4, 4, m3
+    mova                    m4, m0
+    PALIGNR                 m0, m6, m7, 4, m3
+    mova                    m7, m0
+    PALIGNR                 m0, m1, m6, 4, m3
+    mova                    m6, m0
+    UNSCRATCH                0,  9, rsp+2*mmsize    ; restore the parked m0 (and m3 below) for the next iteration
+%if notcpuflag(ssse3)
+    UNSCRATCH                3, 10, rsp+3*mmsize
+%endif
+%endif
+    add                   dstq, strideq
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+HU_FUNCS 4                 ; hu_32x32 uses stack slots rsp+0*mmsize .. rsp+3*mmsize on x86-32 without ssse3
+INIT_XMM ssse3
+HU_FUNCS 3                 ; the rsp+3*mmsize slot is only touched under notcpuflag(ssse3)
+INIT_XMM avx
+HU_FUNCS 2                 ; the avx path only touches rsp+0*mmsize and rsp+1*mmsize
+
+%macro HD_FUNCS 0
+cglobal vp9_ipred_hd_4x4_16, 4, 4, 4, dst, stride, l, a ; 4x4 HD predictor on word samples: four 8-byte rows, stored in the order 3, 1, 2, 0
+    movh                    m0, [lq]                ; low half: 4 words at l
+    movhps                  m0, [aq-2]              ; high half: 4 words starting one word before a
+    psrldq                  m1, m0, 2               ; m0 shifted down by one word
+    psrldq                  m2, m0, 4               ; m0 shifted down by two words
+    LOWPASS                  2,  1,  0
+    pavgw                   m1, m0
+    punpcklwd               m1, m2                  ; interleave the low 4 pavgw words with the low 4 LOWPASS words
+    DEFINE_ARGS dst, stride, stride3                ; l and a are not read again; the third register is renamed to stride3
+    lea               stride3q, [strideq*3]
+
+    movh      [dstq+stride3q ], m1                  ; row 3: low half of the interleaved vector
+    movhps    [dstq+strideq*1], m1                  ; row 1: high half
+    movhlps                 m2, m2                  ; move the high qword of the LOWPASS result into its low qword
+    PALIGNR                 m2, m1, 4, m0           ; m2:m1 sequence advanced by two words
+    movh      [dstq+strideq*2], m2                  ; row 2
+    movhps    [dstq+strideq*0], m2                  ; row 0
+    RET
+
+cglobal vp9_ipred_hd_8x8_16, 4, 4, 5, dst, stride, l, a ; 8x8 HD predictor on word samples: rows are written bottom-up, two per loop iteration
+    mova                    m0, [lq]
+    movu                    m1, [aq-2]              ; unaligned load starting one word before a
+    PALIGNR                 m2, m1, m0, 2, m3       ; m1:m0 sequence shifted down by one word
+    PALIGNR                 m3, m1, m0, 4, m4       ; m1:m0 sequence shifted down by two words
+    LOWPASS                  3,  2,  0
+    pavgw                   m2, m0
+    SBUTTERFLY           wd, 2,  3,  0              ; interleave pavgw and LOWPASS results into m2 (low part) and m3 (high part)
+    psrldq                  m0, m1,  2
+    psrldq                  m4, m1,  4
+    LOWPASS                  1,  0,  4              ; m1: filtered a-side words, shifted into m3 two words per iteration
+    DEFINE_ARGS dst8, mstride, cnt                  ; same registers as dst/stride, renamed for their new meaning
+    lea                  dst8q, [dst8q+mstrideq*8]  ; point 8 rows past the start
+    neg               mstrideq                      ; stride is negative from here on
+    mov                   cntd, 4                   ; 4 iterations x 2 rows
+
+.loop:                                              ; iteration i writes rows 7-i and 3-i
+    add                  dst8q, mstrideq            ; step one row up
+    mova    [dst8q+mstrideq*0], m2
+    mova    [dst8q+mstrideq*4], m3
+%if cpuflag(avx)
+    vpalignr                m2, m3, m2, 4           ; advance the m2:m3:m1 sequence by two words
+    vpalignr                m3, m1, m3, 4
+%else
+    PALIGNR                 m0, m3, m2, 4, m4
+    mova                    m2, m0
+    PALIGNR                 m0, m1, m3, 4, m4
+    mova                    m3, m0
+%endif
+    psrldq                  m1, 4
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_hd_16x16_16, 4, 4, 8, dst, stride, l, a ; 16x16 HD predictor on word samples: rows are written bottom-up, four per loop iteration
+    mova                    m2, [lq]
+    movu                    m1, [lq+2]
+    movu                    m0, [lq+4]
+    LOWPASS                  0,  1,  2
+    pavgw                   m1, m2
+    mova                    m4, [lq+mmsize]
+    movu                    m5, [aq-2]              ; unaligned load starting one word before a
+    PALIGNR                 m3, m5, m4, 2, m6       ; m5:m4 sequence shifted down by one word
+    PALIGNR                 m2, m5, m4, 4, m6       ; m5:m4 sequence shifted down by two words
+    LOWPASS                  2,  3,  4
+    pavgw                   m3, m4
+    SBUTTERFLY           wd, 1,  0,  4              ; interleave pavgw and LOWPASS results for the first 8 words of l
+    SBUTTERFLY           wd, 3,  2,  4              ; same for the second 8 words of l
+    mova                    m6, [aq]
+    movu                    m4, [aq+2]
+    LOWPASS                  4,  6,  5              ; m4: filtered first 8 a-side words
+    movu                    m5, [aq+mmsize-2]
+    psrldq                  m6, m5,  2
+    psrldq                  m7, m5,  4
+    LOWPASS                  5,  6,  7              ; m5: filtered following a-side words, shifted into m4 in the loop
+    DEFINE_ARGS dst, mstride, mstride3, cnt
+    lea                   dstq, [dstq+mstrideq*8]   ; two steps of 8 rows: point 16 rows past the start
+    lea                   dstq, [dstq+mstrideq*8]
+    neg               mstrideq                      ; stride is negative from here on
+    lea              mstride3q, [mstrideq*3]
+    mov                   cntd, 4                   ; 4 iterations x 4 rows
+
+.loop:                                              ; iteration i writes rows 15-i, 11-i, 7-i and 3-i
+    add                  dstq, mstrideq             ; step one row up
+    mova [dstq+mstride3q*4+ 0], m2
+    mova [dstq+mstride3q*4+16], m4
+    mova [dstq+mstrideq *8+ 0], m3
+    mova [dstq+mstrideq *8+16], m2
+    mova [dstq+mstrideq *4+ 0], m0
+    mova [dstq+mstrideq *4+16], m3
+    mova [dstq+mstrideq *0+ 0], m1
+    mova [dstq+mstrideq *0+16], m0
+%if cpuflag(avx)
+    vpalignr                m1, m0, m1, 4           ; advance the m1:m0:m3:m2:m4:m5 sequence by two words
+    vpalignr                m0, m3, m0, 4
+    vpalignr                m3, m2, m3, 4
+    vpalignr                m2, m4, m2, 4
+    vpalignr                m4, m5, m4, 4
+%else
+    PALIGNR                 m6, m0, m1, 4, m7       ; same advance: build in m6, then copy back
+    mova                    m1, m6
+    PALIGNR                 m6, m3, m0, 4, m7
+    mova                    m0, m6
+    PALIGNR                 m6, m2, m3, 4, m7
+    mova                    m3, m6
+    PALIGNR                 m6, m4, m2, 4, m7
+    mova                    m2, m6
+    PALIGNR                 m6, m5, m4, 4, m7
+    mova                    m4, m6
+%endif
+    psrldq                  m5, 4
+    dec                   cntd
+    jg .loop
+    RET
+
+cglobal vp9_ipred_hd_32x32_16, 4, 4 + 3 * ARCH_X86_64, 14, \
+                               10 * -mmsize * ARCH_X86_32, dst, stride, l, a ; 32x32 HD predictor on word samples; three extra GPRs on x86-64, stack size expression is zero unless ARCH_X86_32
+    mova                    m2, [lq+mmsize*0+0]
+    movu                    m1, [lq+mmsize*0+2]
+    movu                    m0, [lq+mmsize*0+4]
+    LOWPASS                  0,  1,  2
+    pavgw                   m1, m2
+    SBUTTERFLY           wd, 1,  0,  2              ; interleave pavgw and LOWPASS results for l words 0-7
+    mova                    m4, [lq+mmsize*1+0]
+    movu                    m3, [lq+mmsize*1+2]
+    movu                    m2, [lq+mmsize*1+4]
+    LOWPASS                  2,  3,  4
+    pavgw                   m3, m4
+    SBUTTERFLY           wd, 3,  2,  4              ; same for l words 8-15
+    SCRATCH                  0,  8, rsp+0*mmsize    ; park the four l-side vectors: the x86-64 loop reads them as m8-m11, the x86-32 tail reloads them from rsp slots 0-3
+    SCRATCH                  1,  9, rsp+1*mmsize
+    SCRATCH                  2, 10, rsp+2*mmsize
+    SCRATCH                  3, 11, rsp+3*mmsize
+    mova                    m6, [lq+mmsize*2+0]
+    movu                    m5, [lq+mmsize*2+2]
+    movu                    m4, [lq+mmsize*2+4]
+    LOWPASS                  4,  5,  6
+    pavgw                   m5, m6
+    SBUTTERFLY           wd, 5,  4,  6              ; same for l words 16-23
+    mova                    m0, [lq+mmsize*3+0]
+    movu                    m1, [aq+mmsize*0-2]     ; unaligned load starting one word before a
+    PALIGNR                 m7, m1, m0, 2, m2       ; m1:m0 sequence shifted down by one word
+    PALIGNR                 m6, m1, m0, 4, m2       ; m1:m0 sequence shifted down by two words
+    LOWPASS                  6,  7,  0
+    pavgw                   m7, m0
+    SBUTTERFLY           wd, 7,  6,  0              ; same for l words 24-31
+    mova                    m2, [aq+mmsize*0+0]
+    movu                    m0, [aq+mmsize*0+2]
+    LOWPASS                  0,  2,  1              ; m0: filtered a-side vector 0
+    movu                    m1, [aq+mmsize*1-2]
+    mova                    m2, [aq+mmsize*1+0]
+    movu                    m3, [aq+mmsize*1+2]
+    LOWPASS                  1,  2,  3              ; m1: filtered a-side vector 1
+    SCRATCH                  6, 12, rsp+6*mmsize    ; m6/m7 are needed as temporaries below; parked in slots 6 and 7
+    SCRATCH                  7, 13, rsp+7*mmsize
+    movu                    m2, [aq+mmsize*2-2]
+    mova                    m3, [aq+mmsize*2+0]
+    movu                    m6, [aq+mmsize*2+2]
+    LOWPASS                  2,  3,  6              ; m2: filtered a-side vector 2
+    movu                    m3, [aq+mmsize*3-2]
+    psrldq                  m6, m3,  2
+    psrldq                  m7, m3,  4
+    LOWPASS                  3,  6,  7              ; m3: filtered a-side vector 3, shifted into m2 in the loop
+    UNSCRATCH                6, 12, rsp+6*mmsize
+    UNSCRATCH                7, 13, rsp+7*mmsize
+%if ARCH_X86_32
+    mova        [rsp+4*mmsize], m4                  ; keep the initial m4/m5 for the second loop
+    mova        [rsp+5*mmsize], m5
+    ; we already backed up m6/m7 earlier on x86-32 in SCRATCH, so we don't need
+    ; to do it again here
+%endif
+    DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
+    mov                   cntd, 4
+    lea               stride3q, [strideq*3]
+%if ARCH_X86_64
+    lea               stride4q, [strideq*4]
+    lea              stride28q, [stride4q*8]
+    lea              stride20q, [stride4q*5]
+    sub              stride28q, stride4q            ; stride28 = 32*stride - 4*stride
+%endif
+    add                   dstq, stride3q            ; start at row 3; dst moves one row up per iteration
+
+    ; x86-32 doesn't have enough registers, so on that platform, we split
+    ; the loop in 2... Otherwise you spend most of the loop (un)scratching
+.loop:                                              ; iteration i writes rows 3-i, 7-i, 11-i, 15-i (and 19-i .. 31-i on x86-64)
+%if ARCH_X86_64
+    mova  [dstq+stride28q + 0], m9
+    mova  [dstq+stride28q +16], m8
+    mova  [dstq+stride28q +32], m11
+    mova  [dstq+stride28q +48], m10
+    mova  [dstq+stride3q*8+ 0], m8
+    mova  [dstq+stride3q*8+16], m11
+    mova  [dstq+stride3q*8+32], m10
+    mova  [dstq+stride3q*8+48], m5
+    mova  [dstq+stride20q + 0], m11
+    mova  [dstq+stride20q +16], m10
+    mova  [dstq+stride20q +32], m5
+    mova  [dstq+stride20q +48], m4
+    mova  [dstq+stride4q*4+ 0], m10
+    mova  [dstq+stride4q*4+16], m5
+    mova  [dstq+stride4q*4+32], m4
+    mova  [dstq+stride4q*4+48], m7
+%endif
+    mova  [dstq+stride3q*4+ 0], m5
+    mova  [dstq+stride3q*4+16], m4
+    mova  [dstq+stride3q*4+32], m7
+    mova  [dstq+stride3q*4+48], m6
+    mova  [dstq+strideq* 8+ 0], m4
+    mova  [dstq+strideq* 8+16], m7
+    mova  [dstq+strideq* 8+32], m6
+    mova  [dstq+strideq* 8+48], m0
+    mova  [dstq+strideq* 4+ 0], m7
+    mova  [dstq+strideq* 4+16], m6
+    mova  [dstq+strideq* 4+32], m0
+    mova  [dstq+strideq* 4+48], m1
+    mova  [dstq+strideq* 0+ 0], m6
+    mova  [dstq+strideq* 0+16], m0
+    mova  [dstq+strideq* 0+32], m1
+    mova  [dstq+strideq* 0+48], m2
+    sub                   dstq, strideq
+%if cpuflag(avx)
+%if ARCH_X86_64
+    vpalignr                m9, m8,  m9,  4         ; advance the whole vector sequence by two words
+    vpalignr                m8, m11, m8,  4
+    vpalignr               m11, m10, m11, 4
+    vpalignr               m10, m5,  m10, 4
+%endif
+    vpalignr                m5, m4,  m5,  4
+    vpalignr                m4, m7,  m4,  4
+    vpalignr                m7, m6,  m7,  4
+    vpalignr                m6, m0,  m6,  4
+    vpalignr                m0, m1,  m0,  4
+    vpalignr                m1, m2,  m1,  4
+    vpalignr                m2, m3,  m2,  4
+%else
+%if ARCH_X86_64
+    PALIGNR                m12, m8,  m9,  4, m13    ; same advance: build in m12, then copy back
+    mova                    m9, m12
+    PALIGNR                m12, m11, m8,  4, m13
+    mova                    m8, m12
+    PALIGNR                m12, m10, m11, 4, m13
+    mova                   m11, m12
+    PALIGNR                m12, m5,  m10, 4, m13
+    mova                   m10, m12
+%endif
+    SCRATCH                  3, 12, rsp+8*mmsize, sh ; free m3 as the build register; its parked value is referenced as reg_sh below
+%if notcpuflag(ssse3)
+    SCRATCH                  2, 13, rsp+9*mmsize    ; free m2 as PALIGNR scratch (only done without ssse3)
+%endif
+    PALIGNR                 m3, m4,  m5,  4, m2
+    mova                    m5, m3
+    PALIGNR                 m3, m7,  m4,  4, m2
+    mova                    m4, m3
+    PALIGNR                 m3, m6,  m7,  4, m2
+    mova                    m7, m3
+    PALIGNR                 m3, m0,  m6,  4, m2
+    mova                    m6, m3
+    PALIGNR                 m3, m1,  m0,  4, m2
+    mova                    m0, m3
+%if notcpuflag(ssse3)
+    UNSCRATCH                2, 13, rsp+9*mmsize    ; m2 is a data operand from here on, so m0 takes over as scratch
+    SCRATCH                  0, 13, rsp+9*mmsize
+%endif
+    PALIGNR                 m3, m2,  m1,  4, m0
+    mova                    m1, m3
+    PALIGNR                 m3, reg_sh,  m2,  4, m0
+    mova                    m2, m3
+%if notcpuflag(ssse3)
+    UNSCRATCH                0, 13, rsp+9*mmsize
+%endif
+    UNSCRATCH                3, 12, rsp+8*mmsize, sh
+%endif
+    psrldq                  m3, 4
+    dec                   cntd
+    jg .loop
+
+%if ARCH_X86_32
+    UNSCRATCH                0,  8, rsp+0*mmsize    ; x86-32 only: second pass for the 16 rows the first loop skipped
+    UNSCRATCH                1,  9, rsp+1*mmsize
+    UNSCRATCH                2, 10, rsp+2*mmsize
+    UNSCRATCH                3, 11, rsp+3*mmsize
+    mova                    m4, [rsp+4*mmsize]      ; initial (pre-loop) values of m4-m7
+    mova                    m5, [rsp+5*mmsize]
+    mova                    m6, [rsp+6*mmsize]
+    mova                    m7, [rsp+7*mmsize]
+    DEFINE_ARGS dst, stride, stride5, stride3       ; the cnt register is briefly reused as stride5
+    lea               stride5q, [strideq*5]
+    lea                   dstq, [dstq+stride5q*4]   ; the first loop left dst four rows above row 3; adding 20 rows lands on row 19
+    DEFINE_ARGS dst, stride, cnt, stride3
+    mov                   cntd, 4
+.loop_2:                                            ; iteration i writes rows 19-i, 23-i, 27-i and 31-i
+    mova  [dstq+stride3q*4+ 0], m1
+    mova  [dstq+stride3q*4+16], m0
+    mova  [dstq+stride3q*4+32], m3
+    mova  [dstq+stride3q*4+48], m2
+    mova  [dstq+strideq* 8+ 0], m0
+    mova  [dstq+strideq* 8+16], m3
+    mova  [dstq+strideq* 8+32], m2
+    mova  [dstq+strideq* 8+48], m5
+    mova  [dstq+strideq* 4+ 0], m3
+    mova  [dstq+strideq* 4+16], m2
+    mova  [dstq+strideq* 4+32], m5
+    mova  [dstq+strideq* 4+48], m4
+    mova  [dstq+strideq* 0+ 0], m2
+    mova  [dstq+strideq* 0+16], m5
+    mova  [dstq+strideq* 0+32], m4
+    mova  [dstq+strideq* 0+48], m7
+    sub                   dstq, strideq
+%if cpuflag(avx)
+    vpalignr                m1, m0,  m1,  4         ; advance the m1:m0:m3:m2:m5:m4:m7:m6 sequence by two words
+    vpalignr                m0, m3,  m0,  4
+    vpalignr                m3, m2,  m3,  4
+    vpalignr                m2, m5,  m2,  4
+    vpalignr                m5, m4,  m5,  4
+    vpalignr                m4, m7,  m4,  4
+    vpalignr                m7, m6,  m7,  4
+%else
+    SCRATCH                  6, 12, rsp+8*mmsize, sh ; free m6 as the build register; its parked value is referenced as reg_sh below
+%if notcpuflag(ssse3)
+    SCRATCH                  7, 13, rsp+9*mmsize    ; free m7 as PALIGNR scratch (only done without ssse3)
+%endif
+    PALIGNR                 m6, m0,  m1,  4, m7
+    mova                    m1, m6
+    PALIGNR                 m6, m3,  m0,  4, m7
+    mova                    m0, m6
+    PALIGNR                 m6, m2,  m3,  4, m7
+    mova                    m3, m6
+    PALIGNR                 m6, m5,  m2,  4, m7
+    mova                    m2, m6
+    PALIGNR                 m6, m4,  m5,  4, m7
+    mova                    m5, m6
+%if notcpuflag(ssse3)
+    UNSCRATCH                7, 13, rsp+9*mmsize    ; m7 is a data operand from here on, so m5 takes over as scratch
+    SCRATCH                  5, 13, rsp+9*mmsize
+%endif
+    PALIGNR                 m6, m7,  m4,  4, m5
+    mova                    m4, m6
+    PALIGNR                 m6, reg_sh,  m7,  4, m5
+    mova                    m7, m6
+%if notcpuflag(ssse3)
+    UNSCRATCH                5, 13, rsp+9*mmsize
+%endif
+    UNSCRATCH                6, 12, rsp+8*mmsize, sh
+%endif
+    psrldq                  m6, 4
+    dec                   cntd
+    jg .loop_2
+%endif
+    RET
+%endmacro
+
+INIT_XMM sse2
+HD_FUNCS                   ; expands the four vp9_ipred_hd_*_16 functions with sse2 selected
+INIT_XMM ssse3
+HD_FUNCS                   ; same functions with ssse3 selected (skips the notcpuflag(ssse3) spills)
+INIT_XMM avx
+HD_FUNCS                   ; same functions with avx selected (takes the vpalignr branches)
diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm
new file mode 100644
index 0000000..6d5008e
--- /dev/null
+++ b/libavcodec/x86/vp9itxfm.asm
@@ -0,0 +1,2625 @@
+;******************************************************************************
+;* VP9 IDCT SIMD optimizations
+;*
+;* Copyright (C) 2013 Clément Bœsch <u pkh me>
+;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+%include "vp9itxfm_template.asm"
+
+SECTION_RODATA
+
+%macro VP9_IDCT_COEFFS 2-3 0 ; coef1, coef2, emit_extra_variants (default 0)
+const pw_m%1_%2
+times 4 dw -%1,  %2
+const pw_%2_%1
+times 4 dw  %2,  %1
+
+%if %3 == 1 ; optional additional sign/order variants of the word pair
+const pw_m%2_m%1
+times 4 dw -%2, -%1
+%if %1 != %2 ; with equal coefficients these two labels would repeat the ones above
+const pw_m%2_%1
+times 4 dw -%2,  %1
+const pw_%1_%2
+times 4 dw  %1,  %2
+%endif
+%endif
+
+%if %1 < 11585 ; doubled first coefficient: stored negated when it is below 11585
+pw_m%1x2:   times 8 dw -%1*2
+%elif %1 > 11585 ; stored as-is when it is above 11585
+pw_%1x2:    times 8 dw  %1*2
+%else ; exactly 11585: declared through const instead of a plain label
+const pw_%1x2
+times 8 dw %1*2
+%endif
+
+%if %2 != %1 ; doubled second coefficient, skipped when it would redefine the label above
+pw_%2x2:    times 8 dw  %2*2
+%endif
+%endmacro
+
+VP9_IDCT_COEFFS 16364,   804 ; each call emits the paired and doubled word constants for one coefficient pair
+VP9_IDCT_COEFFS 16305,  1606
+VP9_IDCT_COEFFS 16069,  3196, 1 ; third argument 1: also emit the extra sign/order variants
+VP9_IDCT_COEFFS 15893,  3981
+VP9_IDCT_COEFFS 15137,  6270, 1
+VP9_IDCT_COEFFS 14811,  7005
+VP9_IDCT_COEFFS 14449,  7723
+VP9_IDCT_COEFFS 13160,  9760
+VP9_IDCT_COEFFS 11585, 11585, 1
+VP9_IDCT_COEFFS 11003, 12140
+VP9_IDCT_COEFFS 10394, 12665
+VP9_IDCT_COEFFS  9102, 13623, 1
+VP9_IDCT_COEFFS  8423, 14053
+VP9_IDCT_COEFFS  5520, 15426
+VP9_IDCT_COEFFS  4756, 15679
+VP9_IDCT_COEFFS  2404, 16207
+
+const pw_5283_13377
+times 4 dw 5283, 13377 ; hand-written constants: a word pair repeated 4 times (16 bytes)
+const pw_9929_13377
+times 4 dw 9929, 13377
+const pw_15212_m13377
+times 4 dw 15212, -13377
+const pw_15212_9929
+times 4 dw 15212, 9929
+const pw_m5283_m15212
+times 4 dw -5283, -15212
+const pw_13377x2
+times 8 dw 13377*2 ; a single doubled word repeated 8 times (16 bytes)
+const pw_m13377_13377
+times 4 dw -13377, 13377
+const pw_13377_0
+times 4 dw 13377, 0
+
+cextern pw_8
+cextern pw_16
+cextern pw_32
+cextern pw_512
+cextern pw_1024
+cextern pw_2048
+cextern pw_m1
+cextern pd_8192
+
+SECTION .text
+
+%macro VP9_UNPACK_MULSUB_2D_4X 6 ; dst1 [src1], dst2 [src2], dst3, dst4, mul1, mul2
+    punpckhwd          m%4, m%2, m%1 ; interleave the high words as (src2, src1) pairs
+    punpcklwd          m%2, m%1 ; same for the low words
+    pmaddwd            m%3, m%4, [pw_m%5_%6] ; dst3 = src1*mul2 - src2*mul1 (high half, dwords)
+    pmaddwd            m%4, [pw_%6_%5] ; dst4 = src1*mul1 + src2*mul2 (high half, dwords)
+    pmaddwd            m%1, m%2, [pw_m%5_%6] ; dst1 = src1*mul2 - src2*mul1 (low half, dwords)
+    pmaddwd            m%2, [pw_%6_%5] ; dst2 = src1*mul1 + src2*mul2 (low half, dwords)
+%endmacro
+
+%macro VP9_RND_SH_SUMSUB_BA 6 ; dst1 [src1], dst2 [src2], src3, src4, tmp, round
+    SUMSUB_BA            d, %1, %2, %5 ; dword butterfly on the first register pair
+    SUMSUB_BA            d, %3, %4, %5 ; dword butterfly on the second register pair
+    paddd              m%1, %6 ; add the rounding constant to all four dword vectors
+    paddd              m%2, %6
+    paddd              m%3, %6
+    paddd              m%4, %6
+    psrad              m%1, 14 ; arithmetic right shift by 14 bits
+    psrad              m%2, 14
+    psrad              m%3, 14
+    psrad              m%4, 14
+    packssdw           m%1, m%3 ; narrow to words with signed saturation: dst1 = {dst1, src3}
+    packssdw           m%2, m%4 ; dst2 = {dst2, src4}
+%endmacro
+
+%macro VP9_STORE_2X 5-6 dstq ; reg1, reg2, tmp1, tmp2, zero, dst
+    movh               m%3, [%6] ; load one row of destination pixels
+    movh               m%4, [%6+strideq] ; and the row one stride below it
+    punpcklbw          m%3, m%5 ; widen bytes to words by interleaving with the zero register
+    punpcklbw          m%4, m%5
+    paddw              m%3, m%1 ; add the two residual rows
+    paddw              m%4, m%2
+    packuswb           m%3, m%5 ; narrow back to bytes with unsigned saturation
+    packuswb           m%4, m%5
+    movh              [%6], m%3 ; write both rows back
+    movh      [%6+strideq], m%4
+%endmacro
+
+%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg
+%assign %%y 0
+%rep %3 ; one iteration per row, nnzcpl rows in total
+%assign %%x 0
+%rep %3*2/mmsize ; nnzcpl words per row, written mmsize bytes at a time
+    mova      [%1+%%y+%%x], %4
+%assign %%x (%%x+mmsize)
+%endrep
+%assign %%y (%%y+%2)
+%endrep
+%endmacro
+
+;-------------------------------------------------------------------------------------------
+; void vp9_iwht_iwht_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+INIT_MMX mmx
+cglobal vp9_iwht_iwht_4x4_add, 3, 3, 0, dst, stride, block, eob
+    mova                m0, [blockq+0*8] ; four rows of coefficients, 8 bytes (4 words) each
+    mova                m1, [blockq+1*8]
+    mova                m2, [blockq+2*8]
+    mova                m3, [blockq+3*8]
+    psraw               m0, 2 ; arithmetic >>2 on every coefficient before the transform
+    psraw               m1, 2
+    psraw               m2, 2
+    psraw               m3, 2
+
+    VP9_IWHT4_1D
+    TRANSPOSE4x4W        0, 1, 2, 3, 4
+    VP9_IWHT4_1D
+
+    pxor                m4, m4 ; zero register for VP9_STORE_2X and ZERO_BLOCK
+    VP9_STORE_2X         0, 1, 5, 6, 4
+    lea               dstq, [dstq+strideq*2]
+    VP9_STORE_2X         2, 3, 5, 6, 4
+    ZERO_BLOCK      blockq, 8, 4, m4 ; clear the 4x4 coefficient block (8 bytes per row)
+    RET
+
+;-------------------------------------------------------------------------------------------
+; void vp9_idct_idct_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+; 2x2 top left corner: inputs in m0/m1, multipliers in m5/m6/m7 (pw_11585x2, pw_6270x2, pw_15137x2 at the call site)
+%macro VP9_IDCT4_2x2_1D 0
+    pmulhrsw            m0, m5                              ; m0=t1
+    mova                m2, m0                              ; m2=t0
+    mova                m3, m1
+    pmulhrsw            m1, m6                              ; m1=t2
+    pmulhrsw            m3, m7                              ; m3=t3
+    VP9_IDCT4_1D_FINALIZE
+%endmacro
+
+%macro VP9_IDCT4_WRITEOUT 0 ; in: m0-m3 = residual rows, m4 = zero; uses m5-m7; dstq ends two rows further down
+%if cpuflag(ssse3)
+    mova                m5, [pw_2048]
+    pmulhrsw            m0, m5              ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+    pmulhrsw            m1, m5
+%else
+    mova                m5, [pw_8] ; without pmulhrsw: explicit +8 followed by >>4
+    paddw               m0, m5
+    paddw               m1, m5
+    psraw               m0, 4
+    psraw               m1, 4
+%endif
+    VP9_STORE_2X         0,  1,  6,  7,  4
+    lea               dstq, [dstq+2*strideq]
+%if cpuflag(ssse3)
+    pmulhrsw            m2, m5
+    pmulhrsw            m3, m5
+%else
+    paddw               m2, m5
+    paddw               m3, m5
+    psraw               m2, 4
+    psraw               m3, 4
+%endif
+    VP9_STORE_2X         2,  3,  6,  7,  4
+%endmacro
+
+%macro IDCT_4x4_FN 1 ; instruction-set name handed to INIT_MMX
+INIT_MMX %1
+cglobal vp9_idct_idct_4x4_add, 4, 4, 0, dst, stride, block, eob
+
+%if cpuflag(ssse3)
+    cmp eobd, 4 ; 2x2 or smaller
+    jg .idctfull
+
+    cmp eobd, 1 ; faster path for when only DC is set
+    jne .idct2x2
+%else
+    cmp eobd, 1 ; without SSSE3 only the DC-only shortcut is available
+    jg .idctfull
+%endif
+
+%if cpuflag(ssse3)
+    movd                m0, [blockq]
+    mova                m5, [pw_11585x2]
+    pmulhrsw            m0, m5
+    pmulhrsw            m0, m5
+%else
+    DEFINE_ARGS dst, stride, block, coef
+    movsx            coefd, word [blockq]
+    imul             coefd, 11585
+    add              coefd, 8192
+    sar              coefd, 14
+    imul             coefd, 11585
+    add              coefd, (8 << 14) + 8192 ; second-pass rounding merged with the final +8
+    sar              coefd, 14 + 4 ; second-pass >>14 merged with the final >>4
+    movd                m0, coefd
+%endif
+    pshufw              m0, m0, 0 ; broadcast the DC result to all four words
+    pxor                m4, m4
+    movh          [blockq], m4 ; clear the DC coefficient
+%if cpuflag(ssse3)
+    pmulhrsw            m0, [pw_2048]       ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+%endif
+    VP9_STORE_2X         0,  0,  6,  7,  4
+    lea               dstq, [dstq+2*strideq]
+    VP9_STORE_2X         0,  0,  6,  7,  4
+    RET
+
+%if cpuflag(ssse3)
+; faster path for when only top left 2x2 block is set
+.idct2x2:
+    movd                m0, [blockq+0]
+    movd                m1, [blockq+8]
+    mova                m5, [pw_11585x2]
+    mova                m6, [pw_6270x2]
+    mova                m7, [pw_15137x2]
+    VP9_IDCT4_2x2_1D
+    ; partial 2x4 transpose
+    punpcklwd           m0, m1
+    punpcklwd           m2, m3
+    SBUTTERFLY          dq, 0, 2, 1
+    SWAP                1, 2
+    VP9_IDCT4_2x2_1D
+    pxor                m4, m4  ; used for the block reset, and VP9_STORE_2X
+    movh       [blockq+ 0], m4
+    movh       [blockq+ 8], m4
+    VP9_IDCT4_WRITEOUT
+    RET
+%endif
+
+.idctfull: ; generic full 4x4 idct/idct
+    mova                m0, [blockq+ 0]
+    mova                m1, [blockq+ 8]
+    mova                m2, [blockq+16]
+    mova                m3, [blockq+24]
+%if cpuflag(ssse3)
+    mova                m6, [pw_11585x2]
+%endif
+    mova                m7, [pd_8192]       ; rounding
+    VP9_IDCT4_1D
+    TRANSPOSE4x4W  0, 1, 2, 3, 4
+    VP9_IDCT4_1D
+    pxor                m4, m4  ; used for the block reset, and VP9_STORE_2X
+    mova       [blockq+ 0], m4
+    mova       [blockq+ 8], m4
+    mova       [blockq+16], m4
+    mova       [blockq+24], m4
+    VP9_IDCT4_WRITEOUT
+    RET
+%endmacro
+
+IDCT_4x4_FN mmxext
+IDCT_4x4_FN ssse3
+
+;-------------------------------------------------------------------------------------------
+; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+%macro IADST4_FN 5 ; first transform name, its 1-D macro stem, second transform name, its 1-D macro stem, instruction set
+INIT_MMX %5
+cglobal vp9_%1_%3_4x4_add, 3, 3, 0, dst, stride, block, eob
+%if WIN64 && notcpuflag(ssse3)
+    WIN64_SPILL_XMM 8
+%endif
+    movdqa            xmm5, [pd_8192] ; rounding constant is loaded into an XMM register although the function is INIT_MMX
+    mova                m0, [blockq+ 0]
+    mova                m1, [blockq+ 8]
+    mova                m2, [blockq+16]
+    mova                m3, [blockq+24]
+%if cpuflag(ssse3)
+    mova                m6, [pw_11585x2]
+%endif
+%ifnidn %1%3, iadstiadst
+    movdq2q             m7, xmm5 ; an idct pass is involved: copy the low half of the rounding constant into m7
+%endif
+    VP9_%2_1D
+    TRANSPOSE4x4W  0, 1, 2, 3, 4
+    VP9_%4_1D
+    pxor                m4, m4  ; used for the block reset, and VP9_STORE_2X
+    mova       [blockq+ 0], m4
+    mova       [blockq+ 8], m4
+    mova       [blockq+16], m4
+    mova       [blockq+24], m4
+    VP9_IDCT4_WRITEOUT
+    RET
+%endmacro
+
+IADST4_FN idct,  IDCT4,  iadst, IADST4, sse2
+IADST4_FN iadst, IADST4, idct,  IDCT4,  sse2
+IADST4_FN iadst, IADST4, iadst, IADST4, sse2
+
+IADST4_FN idct,  IDCT4,  iadst, IADST4, ssse3
+IADST4_FN iadst, IADST4, idct,  IDCT4,  ssse3
+IADST4_FN iadst, IADST4, iadst, IADST4, ssse3
+
+%macro SCRATCH 3 ; register number, spare register number (x86-64), memory location (x86-32)
+%if ARCH_X86_64
+    SWAP                %1, %2 ; x86-64: exchange the register names, no data is moved
+%else
+    mova              [%3], m%1 ; x86-32: spill the register to memory
+%endif
+%endmacro
+
+%macro UNSCRATCH 3 ; same arguments as SCRATCH; undoes it
+%if ARCH_X86_64
+    SWAP                %1, %2
+%else
+    mova               m%1, [%3] ; x86-32: reload from the spill location
+%endif
+%endmacro
+
+;-------------------------------------------------------------------------------------------
+; void vp9_idct_idct_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+%macro VP9_IDCT8_1D_FINALIZE 0 ; final butterflies; expects t4 in scratch slot 8 / [blockq+0]
+    SUMSUB_BA            w,  3,  6, 5                       ; m3=t0+t7, m6=t0-t7
+    SUMSUB_BA            w,  1,  2, 5                       ; m1=t1+t6, m2=t1-t6
+    SUMSUB_BA            w,  7,  0, 5                       ; m7=t2+t5, m0=t2-t5
+
+    UNSCRATCH            5, 8, blockq+ 0 ; fetch t4 that the caller scratched
+    SCRATCH              2, 8, blockq+ 0 ; park t1-t6 in the same slot
+
+    SUMSUB_BA            w,  5,  4, 2                       ; m5=t3+t4, m4=t3-t4
+    SWAP                 7,  6,  2
+    SWAP                 3,  5,  0
+
+%if ARCH_X86_64
+    SWAP                 6, 8 ; x86-64: bring the parked value back as m6
+%endif
+%endmacro
+
+; x86-32:
+; - in: m0/m4 are in mem ([blockq+0] and [blockq+64])
+; - out: m6 is in mem ([blockq+0])
+; x86-64:
+; - everything is in registers (m0-7)
+%macro VP9_IDCT8_1D 0
+%if ARCH_X86_64
+    SWAP                 0, 8 ; park IN(0)/IN(4) in m8/m9 so m0/m4 can serve as temporaries
+    SWAP                 4, 9
+%endif
+
+    VP9_UNPACK_MULSUB_2W_4X 5,  3,  9102, 13623, D_8192_REG, 0, 4  ; m5=t5a, m3=t6a
+    VP9_UNPACK_MULSUB_2W_4X 1,  7, 16069,  3196, D_8192_REG, 0, 4  ; m1=t4a, m7=t7a
+    SUMSUB_BA            w,  5,  1, 0                       ; m5=t4a+t5a (t4), m1=t4a-t5a (t5a)
+    SUMSUB_BA            w,  3,  7, 0                       ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a)
+%if cpuflag(ssse3)
+    SUMSUB_BA            w,  1,  7, 0                       ; m1=t6a+t5a (t6), m7=t6a-t5a (t5)
+    pmulhrsw            m1, W_11585x2_REG                   ; m1=t6
+    pmulhrsw            m7, W_11585x2_REG                   ; m7=t5
+%else
+    VP9_UNPACK_MULSUB_2W_4X 7,  1, 11585, 11585, D_8192_REG, 0, 4
+%endif
+    VP9_UNPACK_MULSUB_2W_4X 2,  6, 15137,  6270, D_8192_REG, 0, 4  ; m2=t2a, m6=t3a
+
+    UNSCRATCH            0, 8, blockq+ 0    ; IN(0)
+    UNSCRATCH            4, 9, blockq+64    ; IN(4)
+    SCRATCH              5, 8, blockq+ 0 ; park t4 for VP9_IDCT8_1D_FINALIZE
+
+%if cpuflag(ssse3)
+    SUMSUB_BA            w, 4, 0, 5                         ; m4=IN(0)+IN(4) m0=IN(0)-IN(4)
+    pmulhrsw            m4, W_11585x2_REG                   ; m4=t0a
+    pmulhrsw            m0, W_11585x2_REG                   ; m0=t1a
+%else
+    SCRATCH              7, 9, blockq+64
+    VP9_UNPACK_MULSUB_2W_4X 0,  4, 11585, 11585, D_8192_REG, 5, 7
+    UNSCRATCH            7, 9, blockq+64
+%endif
+    SUMSUB_BA            w,  6,  4, 5                       ; m6=t0a+t3a (t0), m4=t0a-t3a (t3)
+    SUMSUB_BA            w,  2,  0, 5                       ; m2=t1a+t2a (t1), m0=t1a-t2a (t2)
+
+    VP9_IDCT8_1D_FINALIZE
+%endmacro
+
+%macro VP9_IDCT8_4x4_1D 0 ; reads only m0-m3 as inputs; m4-m7 are written before they are read
+    pmulhrsw            m0, W_11585x2_REG                   ; m0=t1a/t0a
+    pmulhrsw            m6, m2, [pw_15137x2]                ; m6=t3a
+    pmulhrsw            m2, [pw_6270x2]                     ; m2=t2a
+    pmulhrsw            m7, m1, [pw_16069x2]                ; m7=t7a
+    pmulhrsw            m1, [pw_3196x2]                     ; m1=t4a
+    pmulhrsw            m5, m3, [pw_m9102x2]                ; m5=t5a
+    pmulhrsw            m3, [pw_13623x2]                    ; m3=t6a
+    SUMSUB_BA            w,  5,  1, 4                       ; m5=t4a+t5a (t4), m1=t4a-t5a (t5a)
+    SUMSUB_BA            w,  3,  7, 4                       ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a)
+    SUMSUB_BA            w,  1,  7, 4                       ; m1=t6a+t5a (t6), m7=t6a-t5a (t5)
+    pmulhrsw            m1, W_11585x2_REG                   ; m1=t6
+    pmulhrsw            m7, W_11585x2_REG                   ; m7=t5
+    psubw               m4, m0, m6                          ; m4=t0a-t3a (t3)
+    paddw               m6, m0                              ; m6=t0a+t3a (t0)
+    SCRATCH              5,  8, blockq+ 0 ; park t4 for VP9_IDCT8_1D_FINALIZE
+    SUMSUB_BA            w,  2,  0, 5                       ; m2=t1a+t2a (t1), m0=t1a-t2a (t2)
+    VP9_IDCT8_1D_FINALIZE
+%endmacro
+
+%macro VP9_IDCT8_2x2_1D 1 ; pass number: the callers pass 1 for the first pass and 2 for the second
+    pmulhrsw            m0, W_11585x2_REG                   ; m0=t0
+    pmulhrsw            m3, m1, W_16069x2_REG               ; m3=t7
+    pmulhrsw            m1, W_3196x2_REG                    ; m1=t4
+    psubw               m7, m3, m1                          ; t5 = t7a - t4a
+    paddw               m5, m3, m1                          ; t6 = t7a + t4a
+    pmulhrsw            m7, W_11585x2_REG                   ; m7=t5
+    pmulhrsw            m5, W_11585x2_REG                   ; m5=t6
+    SWAP                 5,  1
+    ; merged VP9_IDCT8_1D_FINALIZE to make register-sharing w/ avx easier
+    psubw               m6, m0, m3                          ; m6=t0-t7
+    paddw               m3, m0                              ; m3=t0+t7
+    psubw               m2, m0, m1                          ; m2=t1-t6
+    paddw               m1, m0                              ; m1=t1+t6
+%if %1 == 1
+    punpcklwd           m3, m1 ; first pass: start the caller's transpose here, which frees m1 as scratch
+%define SCRATCH_REG 1
+%elif ARCH_X86_32
+    mova       [blockq+ 0], m2 ; second pass on x86-32: spill m2 to memory and use it as scratch
+%define SCRATCH_REG 2
+%else
+%define SCRATCH_REG 8
+%endif
+    psubw               m4, m0, m5                          ; m4=t3-t4
+    paddw               m5, m0                              ; m5=t3+t4
+    SUMSUB_BA            w,  7,  0, SCRATCH_REG             ; m7=t2+t5, m0=t2-t5
+    SWAP                 7,  6,  2
+    SWAP                 3,  5,  0
+%undef SCRATCH_REG
+%endmacro
+
+%macro VP9_IDCT8_WRITEx2 6-8 5 ; line1, line2, tmp1, tmp2, zero, pw_1024/pw_16, shift, dst
+%if cpuflag(ssse3)
+    pmulhrsw           m%1, %6              ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5
+    pmulhrsw           m%2, %6
+%else
+    paddw              m%1, %6 ; without pmulhrsw: explicit rounding add followed by the shift
+    paddw              m%2, %6
+    psraw              m%1, %7
+    psraw              m%2, %7
+%endif
+%if %0 <= 7
+    VP9_STORE_2X        %1, %2, %3, %4, %5
+%else
+    VP9_STORE_2X        %1, %2, %3, %4, %5, %8
+%endif
+%endmacro
+
+; x86-32:
+; - m6 (a result row) is in mem at [blockq+0]
+; x86-64:
+; - m8 holds m6 (SWAP)
+; m6 holds zero
+%macro VP9_IDCT8_WRITEOUT 0
+%if ARCH_X86_64
+%if cpuflag(ssse3)
+    mova                m9, [pw_1024]
+%else
+    mova                m9, [pw_16]
+%endif
+%define ROUND_REG m9
+%else
+%if cpuflag(ssse3)
+%define ROUND_REG [pw_1024]
+%else
+%define ROUND_REG [pw_16]
+%endif
+%endif
+    SCRATCH              5, 10, blockq+16 ; free m5/m7 so they can serve as store temporaries
+    SCRATCH              7, 11, blockq+32
+    VP9_IDCT8_WRITEx2    0,  1, 5, 7, 6, ROUND_REG
+    lea               dstq, [dstq+2*strideq]
+    VP9_IDCT8_WRITEx2    2,  3, 5, 7, 6, ROUND_REG
+    lea               dstq, [dstq+2*strideq]
+    UNSCRATCH            5, 10, blockq+16
+    UNSCRATCH            7, 11, blockq+32
+    VP9_IDCT8_WRITEx2    4,  5, 0, 1, 6, ROUND_REG
+    lea               dstq, [dstq+2*strideq]
+    UNSCRATCH            5, 8, blockq+ 0 ; the row kept in m8 / [blockq+0] is written as line 6
+    VP9_IDCT8_WRITEx2    5,  7, 0, 1, 6, ROUND_REG
+
+%undef ROUND_REG
+%endmacro
+
+%macro VP9_IDCT_IDCT_8x8_ADD_XMM 2 ; instruction set, value forwarded as cglobal's third numeric argument
+INIT_XMM %1
+cglobal vp9_idct_idct_8x8_add, 4, 4, %2, dst, stride, block, eob
+
+%if cpuflag(ssse3)
+%if ARCH_X86_64
+    mova               m12, [pw_11585x2]    ; often used
+%define W_11585x2_REG m12
+%else
+%define W_11585x2_REG [pw_11585x2]
+%endif
+
+    cmp eobd, 12 ; top left half or less
+    jg .idctfull
+
+    cmp eobd, 3  ; top left corner or less
+    jg .idcthalf
+
+    cmp eobd, 1 ; faster path for when only DC is set
+    jne .idcttopleftcorner
+%else
+    cmp eobd, 1 ; without SSSE3 only the DC-only shortcut is available
+    jg .idctfull
+%endif
+
+%if cpuflag(ssse3)
+    movd                m0, [blockq]
+    pmulhrsw            m0, W_11585x2_REG
+    pmulhrsw            m0, W_11585x2_REG
+%else
+    DEFINE_ARGS dst, stride, block, coef
+    movsx            coefd, word [blockq]
+    imul             coefd, 11585
+    add              coefd, 8192
+    sar              coefd, 14
+    imul             coefd, 11585
+    add              coefd, (16 << 14) + 8192 ; second-pass rounding merged with the final +16
+    sar              coefd, 14 + 5 ; second-pass >>14 merged with the final >>5
+    movd                m0, coefd
+%endif
+    SPLATW              m0, m0, 0
+    pxor                m4, m4
+    movd          [blockq], m4 ; clear the DC coefficient
+%if cpuflag(ssse3)
+    pmulhrsw            m0, [pw_1024]       ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5
+%endif
+%rep 3
+    VP9_STORE_2X         0,  0,  6,  7,  4
+    lea               dstq, [dstq+2*strideq]
+%endrep
+    VP9_STORE_2X         0,  0,  6,  7,  4
+    RET
+
+%if cpuflag(ssse3)
+; faster path for when only the top left corner is set (3 inputs: DC, right of DC,
+; below DC). Note: this also works with a full 2x2 block
+.idcttopleftcorner:
+    movd                m0, [blockq+0]
+    movd                m1, [blockq+16]
+%if ARCH_X86_64
+    mova               m10, [pw_3196x2]
+    mova               m11, [pw_16069x2]
+%define W_3196x2_REG m10
+%define W_16069x2_REG m11
+%else
+%define W_3196x2_REG [pw_3196x2]
+%define W_16069x2_REG [pw_16069x2]
+%endif
+    VP9_IDCT8_2x2_1D 1
+    ; partial 2x8 transpose
+    ; punpcklwd m0, m1 already done inside idct
+    punpcklwd           m2, m3
+    punpcklwd           m4, m5
+    punpcklwd           m6, m7
+    punpckldq           m0, m2
+    punpckldq           m4, m6
+    SBUTTERFLY         qdq, 0, 4, 1
+    SWAP                 1, 4
+    VP9_IDCT8_2x2_1D 2
+%if ARCH_X86_64
+    SWAP                 6, 8
+%endif
+    pxor                m6, m6  ; used for the block reset, and VP9_STORE_2X
+    VP9_IDCT8_WRITEOUT
+%if ARCH_X86_64
+    movd       [blockq+ 0], m6
+    movd       [blockq+16], m6
+%else
+    mova       [blockq+ 0], m6 ; x86-32: full-width stores, these rows were also used as spill space
+    mova       [blockq+16], m6
+    mova       [blockq+32], m6
+%endif
+    RET
+
+.idcthalf:
+    movh                m0, [blockq + 0]
+    movh                m1, [blockq +16]
+    movh                m2, [blockq +32]
+    movh                m3, [blockq +48]
+    VP9_IDCT8_4x4_1D
+    ; partial 4x8 transpose
+%if ARCH_X86_32
+    mova                m6, [blockq+ 0] ; x86-32: reload the row the 1-D transform left in memory
+%endif
+    punpcklwd           m0, m1
+    punpcklwd           m2, m3
+    punpcklwd           m4, m5
+    punpcklwd           m6, m7
+    SBUTTERFLY          dq, 0, 2, 1
+    SBUTTERFLY          dq, 4, 6, 5
+    SBUTTERFLY         qdq, 0, 4, 1
+    SBUTTERFLY         qdq, 2, 6, 5
+    SWAP                 1, 4
+    SWAP                 3, 6
+    VP9_IDCT8_4x4_1D
+%if ARCH_X86_64
+    SWAP                 6, 8
+%endif
+    pxor                m6, m6
+    VP9_IDCT8_WRITEOUT
+%if ARCH_X86_64
+    movh       [blockq+ 0], m6
+    movh       [blockq+16], m6
+    movh       [blockq+32], m6
+%else
+    mova       [blockq+ 0], m6
+    mova       [blockq+16], m6
+    mova       [blockq+32], m6
+%endif
+    movh       [blockq+48], m6
+    RET
+%endif
+
+.idctfull: ; generic full 8x8 idct/idct
+%if ARCH_X86_64
+    mova                m0, [blockq+  0]    ; IN(0)
+%endif
+    mova                m1, [blockq+ 16]    ; IN(1)
+    mova                m2, [blockq+ 32]    ; IN(2)
+    mova                m3, [blockq+ 48]    ; IN(3)
+%if ARCH_X86_64
+    mova                m4, [blockq+ 64]    ; IN(4)
+%endif
+    mova                m5, [blockq+ 80]    ; IN(5)
+    mova                m6, [blockq+ 96]    ; IN(6)
+    mova                m7, [blockq+112]    ; IN(7)
+%if ARCH_X86_64
+    mova               m11, [pd_8192]       ; rounding
+%define D_8192_REG m11
+%else
+%define D_8192_REG [pd_8192]
+%endif
+    VP9_IDCT8_1D
+%if ARCH_X86_64
+    TRANSPOSE8x8W  0, 1, 2, 3, 4, 5, 6, 7, 8
+%else
+    TRANSPOSE8x8W  0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1
+    mova        [blockq+0], m0 ; x86-32: the second pass expects its m0 input in memory
+%endif
+    VP9_IDCT8_1D
+
+%if ARCH_X86_64
+    SWAP                 6, 8
+%endif
+    pxor                m6, m6  ; used for the block reset, and VP9_STORE_2X
+    VP9_IDCT8_WRITEOUT
+    ZERO_BLOCK      blockq, 16, 8, m6
+    RET
+%undef W_11585x2_REG
+%endmacro
+
+VP9_IDCT_IDCT_8x8_ADD_XMM sse2, 12
+VP9_IDCT_IDCT_8x8_ADD_XMM ssse3, 13
+VP9_IDCT_IDCT_8x8_ADD_XMM avx, 13
+
+;---------------------------------------------------------------------------------------------
+; void vp9_iadst_iadst_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+; x86-32:
+; - in: m0/3/4/7 are in mem [blockq+N*16]
+; - out: m6 is in mem [blockq+0]
+; x86-64:
+; - everything is in registers
+%macro VP9_IADST8_1D 0 ; input/output=m0/1/2/3/4/5/6/7
+%if ARCH_X86_64
+    SWAP                     0, 8
+    SWAP                     3, 9
+    SWAP                     4, 10
+    SWAP                     7, 11
+%endif
+
+    VP9_UNPACK_MULSUB_2D_4X  5,  2,  0,  3, 14449,  7723    ; m5/0=t3[d], m2/3=t2[d]
+    VP9_UNPACK_MULSUB_2D_4X  1,  6,  4,  7,  4756, 15679    ; m1/4=t7[d], m6/7=t6[d]
+    SCRATCH                  4, 12, blockq+1*16
+    VP9_RND_SH_SUMSUB_BA     6,  2,  7,  3, 4, D_8192_REG  ; m6=t2[w], m2=t6[w]
+    UNSCRATCH                4, 12, blockq+1*16
+    VP9_RND_SH_SUMSUB_BA     1,  5,  4,  0, 3, D_8192_REG  ; m1=t3[w], m5=t7[w]
+
+    UNSCRATCH                0,  8, blockq+16*0
+    UNSCRATCH                3,  9, blockq+16*3
+    UNSCRATCH                4, 10, blockq+16*4
+    UNSCRATCH                7, 11, blockq+16*7
+    SCRATCH                  1,  8, blockq+16*1
+    SCRATCH                  2,  9, blockq+16*2
+    SCRATCH                  5, 10, blockq+16*5
+    SCRATCH                  6, 11, blockq+16*6
+
+    VP9_UNPACK_MULSUB_2D_4X  7,  0,  1,  2, 16305,  1606    ; m7/1=t1[d], m0/2=t0[d]
+    VP9_UNPACK_MULSUB_2D_4X  3,  4,  5,  6, 10394, 12665    ; m3/5=t5[d], m4/6=t4[d]
+    SCRATCH                  1, 12, blockq+ 0*16
+    VP9_RND_SH_SUMSUB_BA     4,  0,  6,  2, 1, D_8192_REG  ; m4=t0[w], m0=t4[w]
+    UNSCRATCH                1, 12, blockq+ 0*16
+    VP9_RND_SH_SUMSUB_BA     3,  7,  5,  1, 2, D_8192_REG  ; m3=t1[w], m7=t5[w]
+
+    UNSCRATCH                2,  9, blockq+16*2
+    UNSCRATCH                5, 10, blockq+16*5
+    SCRATCH                  3,  9, blockq+16*3
+    SCRATCH                  4, 10, blockq+16*4
+
+    ; m4=t0, m3=t1, m6=t2, m1=t3, m0=t4, m7=t5, m2=t6, m5=t7
+
+    VP9_UNPACK_MULSUB_2D_4X  0,  7,  1,  3, 15137,  6270    ; m0/1=t5[d], m7/3=t4[d]
+    VP9_UNPACK_MULSUB_2D_4X  5,  2,  4,  6,  6270, 15137    ; m5/4=t6[d], m2/6=t7[d]
+    SCRATCH                  1, 12, blockq+ 0*16
+    VP9_RND_SH_SUMSUB_BA     5,  7,  4,  3, 1, D_8192_REG
+    UNSCRATCH                1, 12, blockq+ 0*16
+    PSIGNW                  m5, W_M1_REG                    ; m5=out1[w], m7=t6[w]
+    VP9_RND_SH_SUMSUB_BA     2,  0,  6,  1, 3, D_8192_REG   ; m2=out6[w], m0=t7[w]
+
+    UNSCRATCH                1,  8, blockq+16*1
+    UNSCRATCH                3,  9, blockq+16*3
+    UNSCRATCH                4, 10, blockq+16*4
+    UNSCRATCH                6, 11, blockq+16*6
+    SCRATCH                  2,  8, blockq+16*0
+
+    SUMSUB_BA                w,  6,  4, 2                   ; m6=out0[w], m4=t2[w]
+    SUMSUB_BA                w,  1,  3, 2
+    PSIGNW                  m1, W_M1_REG                    ; m1=out7[w], m3=t3[w]
+
+    ; m6=out0, m5=out1, m4=t2, m3=t3, m7=t6, m0=t7, m2=out6, m1=out7
+
+    ; unfortunately, the code below overflows in some cases
+%if 0; cpuflag(ssse3)
+    SUMSUB_BA                w,  3,  4,  2
+    SUMSUB_BA                w,  0,  7,  2
+    pmulhrsw                m3, W_11585x2_REG
+    pmulhrsw                m7, W_11585x2_REG
+    pmulhrsw                m4, W_11585x2_REG               ; out4
+    pmulhrsw                m0, W_11585x2_REG               ; out2
+%else
+    SCRATCH                  5,  9, blockq+16*1 ; free m5 as the second temporary for the two multiplies below
+    VP9_UNPACK_MULSUB_2W_4X  4, 3, 11585, 11585, D_8192_REG, 2, 5
+    VP9_UNPACK_MULSUB_2W_4X  7, 0, 11585, 11585, D_8192_REG, 2, 5
+    UNSCRATCH                5,  9, blockq+16*1
+%endif
+    PSIGNW                  m3, W_M1_REG                    ; out3
+    PSIGNW                  m7, W_M1_REG                    ; out5
+
+    ; m6=out0, m5=out1, m0=out2, m3=out3, m4=out4, m7=out5, m2=out6, m1=out7
+
+%if ARCH_X86_64
+    SWAP                     2, 8
+%endif
+    SWAP                     0, 6, 2
+    SWAP                     7, 1, 5
+%endmacro
+
+%macro IADST8_FN 6 ; first transform name, its 1-D macro stem, second transform name, its 1-D macro stem, instruction set, value forwarded as cglobal's third numeric argument
+INIT_XMM %5
+cglobal vp9_%1_%3_8x8_add, 3, 3, %6, dst, stride, block, eob
+
+%ifidn %1, idct
+%define first_is_idct 1
+%else
+%define first_is_idct 0
+%endif
+
+%ifidn %3, idct
+%define second_is_idct 1
+%else
+%define second_is_idct 0
+%endif
+
+%if ARCH_X86_64
+    mova                m0, [blockq+  0]    ; IN(0)
+%endif
+    mova                m1, [blockq+ 16]    ; IN(1)
+    mova                m2, [blockq+ 32]    ; IN(2)
+%if ARCH_X86_64 || first_is_idct ; on x86-32 the iadst first pass reads IN(3)/IN(7) from memory
+    mova                m3, [blockq+ 48]    ; IN(3)
+%endif
+%if ARCH_X86_64
+    mova                m4, [blockq+ 64]    ; IN(4)
+%endif
+    mova                m5, [blockq+ 80]    ; IN(5)
+    mova                m6, [blockq+ 96]    ; IN(6)
+%if ARCH_X86_64 || first_is_idct
+    mova                m7, [blockq+112]    ; IN(7)
+%endif
+%if ARCH_X86_64
+%if cpuflag(ssse3)
+    mova               m15, [pw_11585x2]    ; often used
+%endif
+    mova               m13, [pd_8192]       ; rounding
+    mova               m14, [pw_m1]
+%define W_11585x2_REG m15
+%define D_8192_REG m13
+%define W_M1_REG m14
+%else
+%define W_11585x2_REG [pw_11585x2]
+%define D_8192_REG [pd_8192]
+%define W_M1_REG [pw_m1]
+%endif
+
+    ; note different calling conventions for idct8 vs. iadst8 on x86-32
+    VP9_%2_1D
+%if ARCH_X86_64
+    TRANSPOSE8x8W  0, 1, 2, 3, 4, 5, 6, 7, 8
+%else
+    TRANSPOSE8x8W  0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1
+    mova      [blockq+  0], m0
+%if second_is_idct == 0 ; the iadst second pass also expects m3 and m7 in memory
+    mova      [blockq+ 48], m3
+    mova      [blockq+112], m7
+%endif
+%endif
+    VP9_%4_1D
+
+%if ARCH_X86_64
+    SWAP                 6, 8
+%endif
+    pxor                m6, m6  ; used for the block reset, and VP9_STORE_2X
+    VP9_IDCT8_WRITEOUT
+    ZERO_BLOCK      blockq, 16, 8, m6
+    RET
+
+%undef W_11585x2_REG
+%undef first_is_idct
+%undef second_is_idct
+
+%endmacro
+
+IADST8_FN idct,  IDCT8,  iadst, IADST8, sse2, 15
+IADST8_FN iadst, IADST8, idct,  IDCT8,  sse2, 15
+IADST8_FN iadst, IADST8, iadst, IADST8, sse2, 15
+IADST8_FN idct,  IDCT8,  iadst, IADST8, ssse3, 16
+IADST8_FN idct,  IDCT8,  iadst, IADST8, avx, 16
+IADST8_FN iadst, IADST8, idct,  IDCT8,  ssse3, 16
+IADST8_FN iadst, IADST8, idct,  IDCT8,  avx, 16
+IADST8_FN iadst, IADST8, iadst, IADST8, ssse3, 16
+IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16
+
+;---------------------------------------------------------------------------------------------
+; void vp9_idct_idct_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+; x86-64:
+; at the end of this macro, m7 is stored in [%4+15*%5]
+; everything else (t0-6 and t8-15) is stored in m0-6 and m8-15
+; the following sumsubs have not been done yet:
+;    SUMSUB_BA            w,  6,  9, 15      ; t6, t9
+;    SUMSUB_BA            w,  7,  8, 15      ; t7, t8
+; or (x86-32) t0-t5 are in m0-m5, t10-t15 are in x11/9/7/5/3/1,
+; and the following sumsubs have not been done yet:
+;    SUMSUB_BA            w, x13, x14, 7       ; t6, t9
+;    SUMSUB_BA            w, x15, x12, 7       ; t7, t8
+
+%macro VP9_IDCT16_1D_START 6 ; src, nnzc, stride, scratch, scratch_stride, is_iadst
+%if %2 <= 4
+    mova                m3, [%1+ 1*%3]      ; IN(1)
+    mova                m0, [%1+ 3*%3]      ; IN(3)
+
+    pmulhrsw            m4, m3,  [pw_16305x2]       ; t14-15
+    pmulhrsw            m3, [pw_1606x2]             ; t8-9
+    pmulhrsw            m7, m0,  [pw_m4756x2]       ; t10-11
+    pmulhrsw            m0, [pw_15679x2]            ; t12-13
+
+    ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
+    ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15
+
+    VP9_UNPACK_MULSUB_2W_4X 2, 5, 4, 3, 15137,  6270, [pd_8192], 1, 6 ; t9,  t14
+    SCRATCH              4, 10, %4+ 1*%5
+    SCRATCH              5, 11, %4+ 7*%5
+    VP9_UNPACK_MULSUB_2W_4X 6, 1, 0, 7, 6270, m15137, [pd_8192], 4, 5 ; t10, t13
+    UNSCRATCH            5, 11, %4+ 7*%5
+
+    ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
+    ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
+%else
+    mova                m5, [%1+ 1*%3]      ; IN(1)
+    mova                m4, [%1+ 7*%3]      ; IN(7)
+%if %2 <= 8
+    pmulhrsw            m2, m5,  [pw_16305x2]       ; t15
+    pmulhrsw            m5, [pw_1606x2]             ; t8
+    pmulhrsw            m3, m4,  [pw_m10394x2]      ; t9
+    pmulhrsw            m4, [pw_12665x2]            ; t14
+%else
+    mova                m3, [%1+ 9*%3]      ; IN(9)
+    mova                m2, [%1+15*%3]      ; IN(15)
+
+    ; m10=in0, m5=in1, m14=in2, m6=in3, m9=in4, m7=in5, m15=in6, m4=in7
+    ; m11=in8, m3=in9, m12=in10 m0=in11, m8=in12, m1=in13, m13=in14, m2=in15
+
+    VP9_UNPACK_MULSUB_2W_4X   5,   2, 16305,  1606, [pd_8192], 0, 1 ; t8,  t15
+    VP9_UNPACK_MULSUB_2W_4X   3,   4, 10394, 12665, [pd_8192], 0, 1 ; t9,  t14
+%endif
+
+    SUMSUB_BA            w,  3,  5, 0       ; t8,  t9
+    SUMSUB_BA            w,  4,  2, 0       ; t15, t14
+
+    VP9_UNPACK_MULSUB_2W_4X   2,   5, 15137,  6270, [pd_8192], 0, 1 ; t9,  t14
+
+    SCRATCH              4, 10, %4+ 1*%5
+    SCRATCH              5, 11, %4+ 7*%5
+
+    mova                m6, [%1+ 3*%3]      ; IN(3)
+    mova                m7, [%1+ 5*%3]      ; IN(5)
+%if %2 <= 8
+    pmulhrsw            m0, m7,  [pw_14449x2]       ; t13
+    pmulhrsw            m7, [pw_7723x2]             ; t10
+    pmulhrsw            m1, m6,  [pw_m4756x2]       ; t11
+    pmulhrsw            m6, [pw_15679x2]            ; t12
+%else
+    mova                m0, [%1+11*%3]      ; IN(11)
+    mova                m1, [%1+13*%3]      ; IN(13)
+
+    VP9_UNPACK_MULSUB_2W_4X   7,   0, 14449,  7723, [pd_8192], 4, 5 ; t10, t13
+    VP9_UNPACK_MULSUB_2W_4X   1,   6,  4756, 15679, [pd_8192], 4, 5 ; t11, t12
+%endif
+
+    ; m11=t0, m10=t1, m9=t2, m8=t3, m14=t4, m12=t5, m15=t6, m13=t7
+    ; m5=t8, m3=t9, m7=t10, m1=t11, m6=t12, m0=t13, m4=t14, m2=t15
+
+    SUMSUB_BA            w,  7,  1, 4       ; t11, t10
+    SUMSUB_BA            w,  0,  6, 4       ; t12, t13
+
+    ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
+    ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15
+
+    VP9_UNPACK_MULSUB_2W_4X   6,   1, 6270, m15137, [pd_8192], 4, 5 ; t10, t13
+
+    UNSCRATCH            5, 11, %4+ 7*%5
+%endif
+
+    ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m13=t5, m14=t6, m15=t7
+    ; m3=t8, m2=t9, m6=t10, m7=t11, m0=t12, m1=t13, m5=t14, m4=t15
+
+    SUMSUB_BA            w,  7,  3, 4       ; t8,  t11
+
+    ; backup first register
+    mova        [%4+15*%5], m7
+
+    SUMSUB_BA            w,  6,  2, 7       ; t9,  t10
+    UNSCRATCH            4, 10, %4+ 1*%5
+    SUMSUB_BA            w,  0,  4, 7       ; t15, t12
+    SUMSUB_BA            w,  1,  5, 7       ; t14, t13
+
+    ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
+    ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
+
+%if cpuflag(ssse3) && %6 == 0
+    SUMSUB_BA            w,  2,  5, 7
+    SUMSUB_BA            w,  3,  4, 7
+    pmulhrsw            m5, [pw_11585x2]    ; t10
+    pmulhrsw            m4, [pw_11585x2]    ; t11
+    pmulhrsw            m3, [pw_11585x2]    ; t12
+    pmulhrsw            m2, [pw_11585x2]    ; t13
+%else
+    SCRATCH              6, 10, %4+ 1*%5
+    VP9_UNPACK_MULSUB_2W_4X   5,   2, 11585, 11585, [pd_8192], 6, 7 ; t10, t13
+    VP9_UNPACK_MULSUB_2W_4X   4,   3, 11585, 11585, [pd_8192], 6, 7 ; t11, t12
+    UNSCRATCH            6, 10, %4+ 1*%5
+%endif
+
+    ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
+    ; m7=t8, m6=t9, m5=t10, m4=t11, m3=t12, m2=t13, m1=t14, m0=t15
+
+    SCRATCH              0,  8, %4+ 1*%5
+    SCRATCH              1,  9, %4+ 3*%5
+    SCRATCH              2, 10, %4+ 5*%5
+    SCRATCH              3, 11, %4+ 7*%5
+    SCRATCH              4, 12, %4+ 9*%5
+    SCRATCH              5, 13, %4+11*%5
+    SCRATCH              6, 14, %4+13*%5
+
+    ; even (tx8x8)
+%if %2 <= 4
+    mova                m3, [%1+ 0*%3]      ; IN(0)
+    mova                m4, [%1+ 2*%3]      ; IN(2)
+
+    pmulhrsw            m3, [pw_11585x2]    ; t0-t3
+    pmulhrsw            m7, m4, [pw_16069x2]        ; t6-7
+    pmulhrsw            m4, [pw_3196x2]             ; t4-5
+
+%if 0 ; overflows :(
+    paddw               m6, m7, m4
+    psubw               m5, m7, m4
+    pmulhrsw            m5, [pw_11585x2]            ; t5
+    pmulhrsw            m6, [pw_11585x2]            ; t6
+%else
+    VP9_UNPACK_MULSUB_2W_4X  5, 6, 7, 4, 11585, 11585, [pd_8192], 0, 1 ; t5,  t6
+%endif
+
+    psubw               m0, m3, m7
+    paddw               m7, m3
+    psubw               m1, m3, m6
+    paddw               m6, m3
+    psubw               m2, m3, m5
+    paddw               m5, m3
+
+%if ARCH_X86_32
+    SWAP                 0, 7
+%endif
+    SCRATCH              7, 15, %4+12*%5
+%else
+    mova                m6, [%1+ 2*%3]      ; IN(2)
+    mova                m1, [%1+ 4*%3]      ; IN(4)
+    mova                m7, [%1+ 6*%3]      ; IN(6)
+%if %2 <= 8
+    pmulhrsw            m0, m1,  [pw_15137x2]       ; t3
+    pmulhrsw            m1, [pw_6270x2]             ; t2
+    pmulhrsw            m5, m6, [pw_16069x2]        ; t7
+    pmulhrsw            m6, [pw_3196x2]             ; t4
+    pmulhrsw            m4, m7, [pw_m9102x2]        ; t5
+    pmulhrsw            m7, [pw_13623x2]            ; t6
+%else
+    mova                m4, [%1+10*%3]      ; IN(10)
+    mova                m0, [%1+12*%3]      ; IN(12)
+    mova                m5, [%1+14*%3]      ; IN(14)
+
+    VP9_UNPACK_MULSUB_2W_4X   1,   0, 15137,  6270, [pd_8192], 2, 3 ; t2,  t3
+    VP9_UNPACK_MULSUB_2W_4X   6,   5, 16069,  3196, [pd_8192], 2, 3 ; t4,  t7
+    VP9_UNPACK_MULSUB_2W_4X   4,   7,  9102, 13623, [pd_8192], 2, 3 ; t5,  t6
+%endif
+
+    SUMSUB_BA            w,  4,  6, 2       ; t4,  t5
+    SUMSUB_BA            w,  7,  5, 2       ; t7,  t6
+
+%if cpuflag(ssse3) && %6 == 0
+    SUMSUB_BA            w,  6,  5, 2
+    pmulhrsw            m5, [pw_11585x2]                              ; t5
+    pmulhrsw            m6, [pw_11585x2]                              ; t6
+%else
+    VP9_UNPACK_MULSUB_2W_4X  5,  6, 11585, 11585, [pd_8192], 2, 3 ; t5,  t6
+%endif
+
+    SCRATCH              5, 15, %4+10*%5
+    mova                m2, [%1+ 0*%3]      ; IN(0)
+%if %2 <= 8
+    pmulhrsw            m2, [pw_11585x2]    ; t0 and t1
+    psubw               m3, m2, m0
+    paddw               m0, m2
+
+    SUMSUB_BA            w,  7,  0, 5       ; t0,  t7
+%else
+    mova                m3, [%1+ 8*%3]      ; IN(8)
+
+    ; from 3 stages back
+%if cpuflag(ssse3) && %6 == 0
+    SUMSUB_BA            w,  3,  2, 5
+    pmulhrsw            m3, [pw_11585x2]    ; t0
+    pmulhrsw            m2, [pw_11585x2]    ; t1
+%else
+    mova        [%1+ 0*%3], m0
+    VP9_UNPACK_MULSUB_2W_4X  2,  3, 11585,  11585, [pd_8192], 5, 0 ; t0, t1
+    mova                m0, [%1+ 0*%3]
+%endif
+
+    ; from 2 stages back
+    SUMSUB_BA            w,  0,  3, 5      ; t0,  t3
+
+    SUMSUB_BA            w,  7,  0, 5      ; t0,  t7
+%endif
+    UNSCRATCH            5, 15, %4+10*%5
+%if ARCH_X86_32
+    SWAP                 0, 7
+%endif
+    SCRATCH              7, 15, %4+12*%5
+    SUMSUB_BA            w,  1,  2, 7       ; t1,  t2
+
+    ; from 1 stage back
+    SUMSUB_BA            w,  6,  1, 7       ; t1,  t6
+    SUMSUB_BA            w,  5,  2, 7       ; t2,  t5
+%endif
+    SUMSUB_BA            w,  4,  3, 7       ; t3,  t4
+
+%if ARCH_X86_64
+    SWAP                 0, 8
+    SWAP                 1, 9
+    SWAP                 2, 10
+    SWAP                 3, 11
+    SWAP                 4, 12
+    SWAP                 5, 13
+    SWAP                 6, 14
+
+    SUMSUB_BA            w,  0, 15, 7       ; t0, t15
+    SUMSUB_BA            w,  1, 14, 7       ; t1, t14
+    SUMSUB_BA            w,  2, 13, 7       ; t2, t13
+    SUMSUB_BA            w,  3, 12, 7       ; t3, t12
+    SUMSUB_BA            w,  4, 11, 7       ; t4, t11
+    SUMSUB_BA            w,  5, 10, 7       ; t5, t10
+%else
+    SWAP                 1, 6
+    SWAP                 2, 5
+    SWAP                 3, 4
+    mova        [%4+14*%5], m6
+
+%macro %%SUMSUB_BA_STORE 5 ; reg, from_mem, to_mem, scratch, scratch_stride
+    mova                m6, [%4+%2*%5]
+    SUMSUB_BA            w,  6, %1, 7
+    SWAP                %1, 6
+    mova        [%4+%3*%5], m6
+%endmacro
+
+    %%SUMSUB_BA_STORE    0,  1,  1, %4, %5  ; t0, t15
+    %%SUMSUB_BA_STORE    1,  3,  3, %4, %5  ; t1, t14
+    %%SUMSUB_BA_STORE    2,  5,  5, %4, %5  ; t2, t13
+    %%SUMSUB_BA_STORE    3,  7,  7, %4, %5  ; t3, t12
+    %%SUMSUB_BA_STORE    4,  9,  9, %4, %5  ; t4, t11
+    %%SUMSUB_BA_STORE    5, 11, 11, %4, %5  ; t5, t10
+%endif
+%endmacro
+
+%macro VP9_IDCT16_1D 2-4 16, 1 ; src, pass, nnzc, is_iadst (defaults: nnzc=16, is_iadst=1)
+%if %2 == 1 ; pass 1: results are transposed and stored to tmpq
+    VP9_IDCT16_1D_START %1, %3, 32, tmpq, 16, %4 ; src stride 32; scratch at tmpq, stride 16
+
+%if ARCH_X86_64
+    ; backup a different register
+    mova                m7, [tmpq+15*16]    ; reload what START parked in scratch slot 15
+    mova      [tmpq+ 1*16], m15             ; park m15; reloaded before the second transpose
+
+    SUMSUB_BA            w,  6,  9, 15      ; t6, t9
+    SUMSUB_BA            w,  7,  8, 15      ; t7, t8
+
+    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 15
+    mova        [tmpq+  0], m0              ; m0-m7 land in the even 16-byte slots
+    mova        [tmpq+ 32], m1
+    mova        [tmpq+ 64], m2
+    mova        [tmpq+ 96], m3
+    mova        [tmpq+128], m4
+    mova        [tmpq+160], m5
+    mova        [tmpq+192], m6
+    mova        [tmpq+224], m7
+
+    mova               m15, [tmpq+ 1*16]    ; restore m15 parked above
+    TRANSPOSE8x8W        8, 9, 10, 11, 12, 13, 14, 15, 0
+    mova        [tmpq+ 16], m8              ; m8-m15 land in the odd 16-byte slots
+    mova        [tmpq+ 48], m9
+    mova        [tmpq+ 80], m10
+    mova        [tmpq+112], m11
+    mova        [tmpq+144], m12
+    mova        [tmpq+176], m13
+    mova        [tmpq+208], m14
+    mova        [tmpq+240], m15
+%else ; x86-32 path: works in m0-m7 only, remaining rows are read back from tmpq
+    mova                m6, [tmpq+13*16]
+    mova                m7, [tmpq+14*16]
+    SUMSUB_BA            w, 6, 7                ; t6, t9
+    mova      [tmpq+14*16], m6
+    mova      [tmpq+13*16], m7
+    mova                m7, [tmpq+15*16]
+    mova                m6, [tmpq+12*16]
+    SUMSUB_BA            w, 7, 6                ; t7, t8
+    mova      [tmpq+15*16], m6
+
+    TRANSPOSE8x8W       0, 1, 2, 3, 4, 5, 6, 7, [tmpq+14*16], [tmpq+ 8*16], 1
+    mova     [tmpq+ 0*16], m0
+    mova     [tmpq+ 2*16], m1
+    mova     [tmpq+ 4*16], m2
+    mova     [tmpq+ 6*16], m3
+    mova     [tmpq+10*16], m5   ; NOTE(review): slot 8 is not stored here; presumably TRANSPOSE8x8W wrote it to [tmpq+ 8*16] - confirm
+    mova     [tmpq+12*16], m6
+    mova     [tmpq+14*16], m7
+
+    mova                m0, [tmpq+15*16]    ; load the second group of rows from the odd scratch slots
+    mova                m1, [tmpq+13*16]
+    mova                m2, [tmpq+11*16]
+    mova                m3, [tmpq+ 9*16]
+    mova                m4, [tmpq+ 7*16]
+    mova                m5, [tmpq+ 5*16]
+    mova                m7, [tmpq+ 1*16]
+    TRANSPOSE8x8W       0, 1, 2, 3, 4, 5, 6, 7, [tmpq+ 3*16], [tmpq+ 9*16], 1
+    mova     [tmpq+ 1*16], m0
+    mova     [tmpq+ 3*16], m1
+    mova     [tmpq+ 5*16], m2
+    mova     [tmpq+ 7*16], m3
+    mova     [tmpq+11*16], m5   ; NOTE(review): slot 9 is not stored here; presumably written by TRANSPOSE8x8W - confirm
+    mova     [tmpq+13*16], m6
+    mova     [tmpq+15*16], m7
+%endif
+%else ; %2 == 2
+    VP9_IDCT16_1D_START %1, %3, 32, %1, 32, %4 ; scratch aliases the source buffer (stride 32)
+
+%if cpuflag(ssse3)
+%define ROUND_REG [pw_512]
+%else
+%define ROUND_REG [pw_32]
+%endif
+
+    pxor                m7, m7              ; m7 = 0
+%if ARCH_X86_64
+    ; backup more registers
+    mova        [%1+ 2*32], m8
+    mova        [%1+ 3*32], m9
+
+    VP9_IDCT8_WRITEx2    0,  1, 8, 9, 7, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]    ; advance dst by two rows
+    VP9_IDCT8_WRITEx2    2,  3, 8, 9, 7, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2    4,  5, 8, 9, 7, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+
+    ; restore from cache
+    SWAP                 0, 7               ; move zero from m7 to m0
+    mova                m7, [%1+15*32]      ; value START parked in scratch slot 15
+    mova                m8, [%1+ 2*32]
+    mova                m9, [%1+ 3*32]
+
+    SUMSUB_BA            w,  6,  9, 3       ; t6, t9
+    SUMSUB_BA            w,  7,  8, 3       ; t7, t8
+
+    VP9_IDCT8_WRITEx2    6,  7, 3, 4, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2    8,  9, 3, 4, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2   10, 11, 1, 2, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2   12, 13, 1, 2, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2   14, 15, 1, 2, 0, ROUND_REG, 6
+%else ; x86-32 path; addresses tmpq directly, so it relies on %1 being tmpq (true for the callers visible here)
+    mova      [tmpq+ 0*32], m5
+
+    VP9_IDCT8_WRITEx2    0,  1, 5, 6, 7, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]    ; advance dst by two rows
+    VP9_IDCT8_WRITEx2    2,  3, 5, 6, 7, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+
+    SWAP                 0, 7               ; move zero from m7 to m0
+    mova                m5, [tmpq+ 0*32]    ; restore m5 saved at the top of this branch
+
+    VP9_IDCT8_WRITEx2    4,  5, 1, 2, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+
+    mova                m4, [tmpq+13*32]
+    mova                m7, [tmpq+14*32]
+    mova                m5, [tmpq+15*32]
+    mova                m6, [tmpq+12*32]
+    SUMSUB_BADC w, 4, 7, 5, 6, 1
+
+    VP9_IDCT8_WRITEx2    4,  5, 1, 2, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2    6,  7, 1, 2, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+
+    mova                m4, [tmpq+11*32]
+    mova                m5, [tmpq+ 9*32]
+    mova                m6, [tmpq+ 7*32]
+    mova                m7, [tmpq+ 5*32]
+
+    VP9_IDCT8_WRITEx2    4,  5, 1, 2, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2    6,  7, 1, 2, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+
+    mova                m4, [tmpq+ 3*32]
+    mova                m5, [tmpq+ 1*32]
+
+    VP9_IDCT8_WRITEx2    4,  5, 1, 2, 0, ROUND_REG, 6
+    lea               dstq, [dstq+strideq*2]
+%endif
+
+%undef ROUND_REG
+%endif ; %2 == 1/2
+%endmacro
+
+%macro VP9_STORE_2XFULL 6-7 strideq; dc, tmp1, tmp2, tmp3, tmp4, zero, stride
+    mova               m%3, [dstq]          ; load one full register of pixels from the first row
+    mova               m%5, [dstq+%7]       ; ... and from the row one stride below
+    punpcklbw          m%2, m%3, m%6        ; interleave low bytes with the zero register -> words
+    punpckhbw          m%3, m%6             ; same for the high bytes
+    punpcklbw          m%4, m%5, m%6
+    punpckhbw          m%5, m%6
+    paddw              m%2, m%1             ; add the dc register to every word
+    paddw              m%3, m%1
+    paddw              m%4, m%1
+    paddw              m%5, m%1
+    packuswb           m%2, m%3             ; pack back to bytes with unsigned saturation
+    packuswb           m%4, m%5
+    mova            [dstq], m%2             ; write both rows back in place
+    mova         [dstq+%7], m%4
+%endmacro
+
+%macro VP9_IDCT_IDCT_16x16_ADD_XMM 1 ; %1 = instruction-set name handed to INIT_XMM
+INIT_XMM %1
+cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob ; NOTE(review): 512 is presumably the stack size (tmpq = rsp below) - confirm against x86inc
+%if cpuflag(ssse3)
+    ; 2x2=eob=3, 4x4=eob=10
+    cmp eobd, 38 ; eob > 38 takes the full 16x16 path
+    jg .idctfull
+    cmp eobd, 1 ; faster path for when only DC is set
+    jne .idct8x8
+%else ; without ssse3 there is no .idct8x8 path: anything beyond DC-only goes to .idctfull
+    cmp eobd, 1 ; faster path for when only DC is set
+    jg .idctfull
+%endif
+
+    ; dc-only
+%if cpuflag(ssse3)
+    movd                m0, [blockq]        ; load the DC coefficient
+    mova                m1, [pw_11585x2]
+    pmulhrsw            m0, m1              ; scale by the 11585x2 constant ...
+    pmulhrsw            m0, m1              ; ... twice (presumably once per 1-D pass)
+%else
+    DEFINE_ARGS dst, stride, block, coef
+    movsx            coefd, word [blockq]   ; sign-extended DC coefficient
+    imul             coefd, 11585
+    add              coefd, 8192
+    sar              coefd, 14              ; (dc * 11585 + 8192) >> 14
+    imul             coefd, 11585
+    add              coefd, (32 << 14) + 8192
+    sar              coefd, 14 + 6          ; second scaling merged with the final (x + 32) >> 6
+    movd                m0, coefd
+%endif
+    SPLATW              m0, m0, q0000       ; replicate word 0 across the register
+%if cpuflag(ssse3)
+    pmulhrsw            m0, [pw_512]        ; assuming pw_512 holds words of 512: rounded >> 6
+%endif
+    pxor                m5, m5              ; m5 = 0
+    movd          [blockq], m5              ; zero the first 4 bytes of block (covers the DC coefficient)
+%rep 7 ; 7 x 2 rows here + 2 rows after the loop = 16 rows
+    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5
+    lea               dstq, [dstq+2*strideq]
+%endrep
+    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5
+    RET
+
+    DEFINE_ARGS dst, stride, block, cnt, dst_bak, tmp
+%if cpuflag(ssse3)
+.idct8x8:
+    mov               tmpq, rsp             ; intermediate buffer lives on the stack
+    VP9_IDCT16_1D   blockq, 1, 8, 0         ; single pass-1 call (nnzc = 8)
+
+    mov               cntd, 2               ; pass 2 runs twice, once per 8-pixel-wide half
+    mov           dst_bakq, dstq
+.loop2_8x8:
+    VP9_IDCT16_1D     tmpq, 2, 8, 0
+    lea               dstq, [dst_bakq+8]    ; second iteration targets dst + 8 bytes
+    add               tmpq, 16              ; next 16 bytes of each intermediate row
+    dec               cntd
+    jg .loop2_8x8
+
+    ; at the end of the loop, m0 should still be zero
+    ; use that to zero out block coefficients
+    ZERO_BLOCK      blockq, 32, 8, m0
+    RET
+%endif
+
+.idctfull:
+    mov               cntd, 2               ; pass 1 runs twice, 16 bytes of each block row per call
+    mov               tmpq, rsp
+.loop1_full:
+    VP9_IDCT16_1D   blockq, 1, 16, 0
+    add             blockq, 16
+    add               tmpq, 256
+    dec               cntd
+    jg .loop1_full
+    sub             blockq, 32              ; undo the two 16-byte advances
+
+    mov               cntd, 2
+    mov               tmpq, rsp
+    mov           dst_bakq, dstq
+.loop2_full:
+    VP9_IDCT16_1D     tmpq, 2, 16, 0
+    lea               dstq, [dst_bakq+8]    ; second iteration targets dst + 8 bytes
+    add               tmpq, 16
+    dec               cntd
+    jg .loop2_full
+
+    ; at the end of the loop, m0 should still be zero
+    ; use that to zero out block coefficients
+    ZERO_BLOCK      blockq, 32, 16, m0
+    RET
+%endmacro
+
+VP9_IDCT_IDCT_16x16_ADD_XMM sse2
+VP9_IDCT_IDCT_16x16_ADD_XMM ssse3
+VP9_IDCT_IDCT_16x16_ADD_XMM avx
+
+;---------------------------------------------------------------------------------------------
+; void vp9_iadst_iadst_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+%macro VP9_IADST16_1D 2 ; src, pass
+%assign %%str 16*%2 ; scratch slot size: 16 bytes in pass 1, 32 bytes in pass 2
+    mova                m0, [%1+ 0*32]  ; in0
+    mova                m1, [%1+15*32]  ; in15
+    mova                m2, [%1+ 7*32]  ; in7
+    mova                m3, [%1+ 8*32]  ; in8
+
+    VP9_UNPACK_MULSUB_2D_4X  1,  0,  4,  5, 16364,   804    ; m1/4=t1[d], m0/5=t0[d]
+    VP9_UNPACK_MULSUB_2D_4X  2,  3,  7,  6, 11003, 12140    ; m2/7=t9[d], m3/6=t8[d]
+    SCRATCH              4, 8, tmpq+ 0*%%str
+    VP9_RND_SH_SUMSUB_BA     3,  0,  6,  5,  4, [pd_8192]   ; m3=t0[w], m0=t8[w]
+    UNSCRATCH            4, 8, tmpq+ 0*%%str
+    VP9_RND_SH_SUMSUB_BA     2,  1,  7,  4,  5, [pd_8192]   ; m2=t1[w], m1=t9[w]
+
+    SCRATCH              0, 10, tmpq+ 0*%%str
+    SCRATCH              1, 11, tmpq+15*%%str
+    mova   [tmpq+ 7*%%str], m2
+    mova   [tmpq+ 8*%%str], m3
+
+    mova                m1, [%1+ 2*32]  ; in2
+    mova                m0, [%1+13*32]  ; in13
+    mova                m3, [%1+ 5*32]  ; in5
+    mova                m2, [%1+10*32]  ; in10
+
+    VP9_UNPACK_MULSUB_2D_4X  0,  1,  6,  7, 15893,  3981    ; m0/6=t3[d], m1/7=t2[d]
+    VP9_UNPACK_MULSUB_2D_4X  3,  2,  4,  5,  8423, 14053    ; m3/4=t11[d], m2/5=t10[d]
+    SCRATCH              4, 12, tmpq+ 2*%%str
+    VP9_RND_SH_SUMSUB_BA     2,  1,  5,  7,  4, [pd_8192]   ; m2=t2[w], m1=t10[w]
+    UNSCRATCH            4, 12, tmpq+ 2*%%str
+    VP9_RND_SH_SUMSUB_BA     3,  0,  4,  6,  5, [pd_8192]   ; m3=t3[w], m0=t11[w]
+
+    SCRATCH              0, 12, tmpq+ 2*%%str
+    SCRATCH              1, 13, tmpq+13*%%str
+    mova   [tmpq+ 5*%%str], m2
+    mova   [tmpq+10*%%str], m3
+
+    mova                m2, [%1+ 4*32]  ; in4
+    mova                m3, [%1+11*32]  ; in11
+    mova                m0, [%1+ 3*32]  ; in3
+    mova                m1, [%1+12*32]  ; in12
+
+    VP9_UNPACK_MULSUB_2D_4X  3,  2,  7,  6, 14811,  7005    ; m3/7=t5[d], m2/6=t4[d]
+    VP9_UNPACK_MULSUB_2D_4X  0,  1,  4,  5,  5520, 15426    ; m0/4=t13[d], m1/5=t12[d]
+    SCRATCH              4, 9, tmpq+ 4*%%str
+    VP9_RND_SH_SUMSUB_BA     1,  2,  5,  6,  4, [pd_8192]   ; m1=t4[w], m2=t12[w]
+    UNSCRATCH            4, 9, tmpq+ 4*%%str
+    VP9_RND_SH_SUMSUB_BA     0,  3,  4,  7,  6, [pd_8192]   ; m0=t5[w], m3=t13[w]
+
+    SCRATCH              0,  8, tmpq+ 4*%%str
+    mova   [tmpq+11*%%str], m1          ; t4:m1->r11
+    UNSCRATCH            0, 10, tmpq+ 0*%%str
+    UNSCRATCH            1, 11, tmpq+15*%%str
+
+    ; round 2 interleaved part 1
+    VP9_UNPACK_MULSUB_2D_4X  0,  1,  6,  7, 16069,  3196    ; m1/7=t8[d], m0/6=t9[d]
+    VP9_UNPACK_MULSUB_2D_4X  3,  2,  5,  4,  3196, 16069    ; m3/5=t12[d], m2/4=t13[d]
+    SCRATCH              4, 9, tmpq+ 3*%%str
+    VP9_RND_SH_SUMSUB_BA     3,  1,  5,  7,  4, [pd_8192]   ; m3=t8[w], m1=t12[w]
+    UNSCRATCH            4, 9, tmpq+ 3*%%str
+    VP9_RND_SH_SUMSUB_BA     2,  0,  4,  6,  5, [pd_8192]   ; m2=t9[w], m0=t13[w]
+
+    SCRATCH              0, 10, tmpq+ 0*%%str
+    SCRATCH              1, 11, tmpq+15*%%str
+    SCRATCH              2, 14, tmpq+ 3*%%str
+    SCRATCH              3, 15, tmpq+12*%%str
+
+    mova                m2, [%1+ 6*32]  ; in6
+    mova                m3, [%1+ 9*32]  ; in9
+    mova                m0, [%1+ 1*32]  ; in1
+    mova                m1, [%1+14*32]  ; in14
+
+    VP9_UNPACK_MULSUB_2D_4X  3,  2,  7,  6, 13160,  9760    ; m3/7=t7[d], m2/6=t6[d]
+    VP9_UNPACK_MULSUB_2D_4X  0,  1,  4,  5,  2404, 16207    ; m0/4=t15[d], m1/5=t14[d]
+    SCRATCH              4, 9, tmpq+ 6*%%str
+    VP9_RND_SH_SUMSUB_BA     1,  2,  5,  6,  4, [pd_8192]   ; m1=t6[w], m2=t14[w]
+    UNSCRATCH            4, 9, tmpq+ 6*%%str
+    VP9_RND_SH_SUMSUB_BA     0,  3,  4,  7,  6, [pd_8192]   ; m0=t7[w], m3=t15[w]
+
+    ; r8=t0, r7=t1, r5=t2, r10=t3, r11=t4, m8|r4=t5, m1=t6, m0=t7
+    ; m10|r0=t8, m11|r15=t9, m13|r13=t10, m12|r2=t11, m14|r3=t12, m15|r12=t13, m2=t14, m3=t15
+
+    UNSCRATCH            4, 12, tmpq+ 2*%%str
+    UNSCRATCH            5, 13, tmpq+13*%%str
+    SCRATCH              0, 12, tmpq+ 1*%%str
+    SCRATCH              1, 13, tmpq+14*%%str
+
+    ; remainder of round 2 (rest of t8-15)
+    VP9_UNPACK_MULSUB_2D_4X  5,  4,  6,  7,  9102, 13623    ; m5/6=t11[d], m4/7=t10[d]
+    VP9_UNPACK_MULSUB_2D_4X  3,  2,  1,  0, 13623,  9102    ; m3/1=t14[d], m2/0=t15[d]
+    SCRATCH              0, 9, tmpq+ 6*%%str
+    VP9_RND_SH_SUMSUB_BA     3,  4,  1,  7,  0, [pd_8192]   ; m3=t10[w], m4=t14[w]
+    UNSCRATCH            0, 9, tmpq+ 6*%%str
+    VP9_RND_SH_SUMSUB_BA     2,  5,  0,  6,  1, [pd_8192]   ; m2=t11[w], m5=t15[w]
+
+    ; m15|r12=t8, m14|r3=t9, m3=t10, m2=t11, m11|r15=t12, m10|r0=t13, m4=t14, m5=t15
+
+    UNSCRATCH            6, 14, tmpq+ 3*%%str
+    UNSCRATCH            7, 15, tmpq+12*%%str
+
+    SUMSUB_BA                w,  3,  7,  1
+    PSIGNW                  m3, [pw_m1]                     ; m3=out1[w], m7=t10[w]
+    SUMSUB_BA                w,  2,  6,  1                  ; m2=out14[w], m6=t11[w]
+
+    ; unfortunately, the code below overflows in some cases, e.g.
+    ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8.webm
+%if 0; cpuflag(ssse3)
+    SUMSUB_BA                w,  7,  6,  1
+    pmulhrsw                m7, [pw_11585x2]                ; m7=out6[w]
+    pmulhrsw                m6, [pw_11585x2]                ; m6=out9[w]
+%else
+    VP9_UNPACK_MULSUB_2W_4X  6,  7, 11585, 11585, [pd_8192], 1, 0
+%endif
+
+    mova       [tmpq+ 3*%%str], m6
+    mova       [tmpq+ 6*%%str], m7
+    UNSCRATCH                6, 10, tmpq+ 0*%%str
+    UNSCRATCH                7, 11, tmpq+15*%%str
+    mova       [tmpq+13*%%str], m2
+    SCRATCH                  3, 11, tmpq+ 9*%%str
+
+    VP9_UNPACK_MULSUB_2D_4X  7,  6,  2,  3, 15137,  6270    ; m6/3=t13[d], m7/2=t12[d]
+    VP9_UNPACK_MULSUB_2D_4X  5,  4,  1,  0,  6270, 15137    ; m5/1=t14[d], m4/0=t15[d]
+    SCRATCH              0, 9, tmpq+ 2*%%str
+    VP9_RND_SH_SUMSUB_BA     5,  6,  1,  3,  0, [pd_8192]   ; m5=out2[w], m6=t14[w]
+    UNSCRATCH            0, 9, tmpq+ 2*%%str
+    VP9_RND_SH_SUMSUB_BA     4,  7,  0,  2,  1, [pd_8192]
+    PSIGNW                  m4, [pw_m1]                     ; m4=out13[w], m7=t15[w]
+
+    ; unfortunately, the code below overflows in some cases
+%if 0; cpuflag(ssse3)
+    SUMSUB_BA                w,  7,  6,  1
+    pmulhrsw                m7, [pw_m11585x2]               ; m7=out5[w]
+    pmulhrsw                m6, [pw_11585x2]                ; m6=out10[w]
+%else
+    PSIGNW                  m7, [pw_m1]
+    VP9_UNPACK_MULSUB_2W_4X  7,  6, 11585, 11585, [pd_8192], 1, 0
+%endif
+
+    ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, m6=out10, m4=out13, r2=out14
+
+    mova                    m2, [tmpq+ 8*%%str]
+    mova                    m3, [tmpq+ 7*%%str]
+    mova                    m1, [tmpq+11*%%str]
+    mova       [tmpq+ 7*%%str], m6
+    mova       [tmpq+11*%%str], m4
+    mova                    m4, [tmpq+ 5*%%str]
+    SCRATCH                  5, 14, tmpq+ 5*%%str
+    SCRATCH                  7, 15, tmpq+ 8*%%str
+    UNSCRATCH                6,  8, tmpq+ 4*%%str
+    UNSCRATCH                5, 12, tmpq+ 1*%%str
+    UNSCRATCH                7, 13, tmpq+14*%%str
+
+    ; m2=t0, m3=t1, m9=t2, m0=t3, m1=t4, m8=t5, m13=t6, m12=t7
+    ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14
+
+    SUMSUB_BA                w,  1,  2, 0                   ; m1=t0[w], m2=t4[w]
+    mova                    m0, [tmpq+10*%%str]
+    SCRATCH                  1, 12, tmpq+ 1*%%str
+    SUMSUB_BA                w,  6,  3, 1                   ; m8=t1[w], m3=t5[w]
+    SCRATCH                  6, 13, tmpq+ 4*%%str
+    SUMSUB_BA                w,  7,  4, 1                   ; m13=t2[w], m9=t6[w]
+    SCRATCH                  7,  8, tmpq+10*%%str
+    SUMSUB_BA                w,  5,  0, 1                   ; m12=t3[w], m0=t7[w]
+    SCRATCH                  5,  9, tmpq+14*%%str
+
+    VP9_UNPACK_MULSUB_2D_4X  2,  3,  7,  5, 15137,  6270    ; m2/6=t5[d], m3/10=t4[d]
+    VP9_UNPACK_MULSUB_2D_4X  0,  4,  1,  6,  6270, 15137    ; m0/14=t6[d], m9/15=t7[d]
+    SCRATCH                  6, 10, tmpq+ 0*%%str
+    VP9_RND_SH_SUMSUB_BA     0,  3,  1,  5,  6, [pd_8192]
+    UNSCRATCH                6, 10, tmpq+ 0*%%str
+    PSIGNW                  m0, [pw_m1]                     ; m0=out3[w], m3=t6[w]
+    VP9_RND_SH_SUMSUB_BA     4,  2,  6,  7,  5, [pd_8192]   ; m9=out12[w], m2=t7[w]
+
+    UNSCRATCH                1,  8, tmpq+10*%%str
+    UNSCRATCH                5,  9, tmpq+14*%%str
+    UNSCRATCH                6, 12, tmpq+ 1*%%str
+    UNSCRATCH                7, 13, tmpq+ 4*%%str
+    SCRATCH                  4,  9, tmpq+14*%%str
+
+    SUMSUB_BA                w,  1,  6,  4                  ; m13=out0[w], m1=t2[w]
+    SUMSUB_BA                w,  5,  7,  4
+    PSIGNW                  m5, [pw_m1]                     ; m12=out15[w], m8=t3[w]
+
+    ; unfortunately, the code below overflows in some cases, e.g.
+    ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm
+%if 0 ; cpuflag(ssse3)
+    SUMSUB_BA               w,   7,  6,  4
+    pmulhrsw                m7, [pw_m11585x2]               ; m8=out7[w]
+    pmulhrsw                m6, [pw_11585x2]                ; m1=out8[w]
+    SWAP                     6,  7
+    SUMSUB_BA                w,  3,  2,  4
+    pmulhrsw                m3, [pw_11585x2]                ; m3=out4[w]
+    pmulhrsw                m2, [pw_11585x2]                ; m2=out11[w]
+%else
+    SCRATCH                  5,  8, tmpq+10*%%str
+    VP9_UNPACK_MULSUB_2W_4X  6,  7, 11585, m11585, [pd_8192],  5,  4
+    VP9_UNPACK_MULSUB_2W_4X  2,  3, 11585, 11585, [pd_8192],  5,  4
+    UNSCRATCH                5,  8, tmpq+10*%%str
+%endif
+
+    ; m13=out0, m0=out3, m3=out4, m8=out7, m1=out8, m2=out11, m9=out12, m12=out15
+    ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14
+
+%if %2 == 1 ; pass 1: transpose and store 16-byte rows to tmpq
+%if ARCH_X86_64
+    mova                   m13, [tmpq+ 6*%%str]
+    TRANSPOSE8x8W            1, 11, 14, 0, 3, 15, 13, 6, 10
+    mova          [tmpq+ 0*16], m1
+    mova          [tmpq+ 2*16], m11
+    mova          [tmpq+ 4*16], m14
+    mova          [tmpq+ 6*16], m0
+    mova                    m1, [tmpq+ 3*%%str]
+    mova                   m11, [tmpq+ 7*%%str]
+    mova                   m14, [tmpq+11*%%str]
+    mova                    m0, [tmpq+13*%%str]
+    mova          [tmpq+ 8*16], m3
+    mova          [tmpq+10*16], m15
+    mova          [tmpq+12*16], m13
+    mova          [tmpq+14*16], m6
+
+    TRANSPOSE8x8W            7, 1, 11, 2, 9, 14, 0, 5, 10
+    mova          [tmpq+ 1*16], m7
+    mova          [tmpq+ 3*16], m1
+    mova          [tmpq+ 5*16], m11
+    mova          [tmpq+ 7*16], m2
+    mova          [tmpq+ 9*16], m9
+    mova          [tmpq+11*16], m14
+    mova          [tmpq+13*16], m0
+    mova          [tmpq+15*16], m5
+%else ; x86-32 path: works in m0-m7 only, so rows are shuffled through tmpq
+    mova       [tmpq+12*%%str], m2
+    mova       [tmpq+ 1*%%str], m5
+    mova       [tmpq+15*%%str], m7
+    mova                    m2, [tmpq+ 9*%%str]
+    mova                    m5, [tmpq+ 5*%%str]
+    mova                    m7, [tmpq+ 8*%%str]
+    TRANSPOSE8x8W            1, 2, 5, 0, 3, 7, 4, 6, [tmpq+ 6*%%str], [tmpq+ 8*%%str], 1
+    mova          [tmpq+ 0*16], m1
+    mova          [tmpq+ 2*16], m2
+    mova          [tmpq+ 4*16], m5
+    mova          [tmpq+ 6*16], m0
+    mova          [tmpq+10*16], m7
+    mova                    m3, [tmpq+12*%%str]
+    mova          [tmpq+12*16], m4
+    mova                    m4, [tmpq+14*%%str]
+    mova          [tmpq+14*16], m6
+
+    mova                    m0, [tmpq+15*%%str]
+    mova                    m1, [tmpq+ 3*%%str]
+    mova                    m2, [tmpq+ 7*%%str]
+    mova                    m5, [tmpq+11*%%str]
+    mova                    m7, [tmpq+ 1*%%str]
+    TRANSPOSE8x8W            0, 1, 2, 3, 4, 5, 6, 7, [tmpq+13*%%str], [tmpq+ 9*%%str], 1
+    mova          [tmpq+ 1*16], m0
+    mova          [tmpq+ 3*16], m1
+    mova          [tmpq+ 5*16], m2
+    mova          [tmpq+ 7*16], m3
+    mova          [tmpq+11*16], m5
+    mova          [tmpq+13*16], m6
+    mova          [tmpq+15*16], m7
+%endif
+%else ; pass 2: results are written out through VP9_IDCT8_WRITEx2 at dstq
+    pxor                    m4, m4                          ; m4 = 0
+
+%if cpuflag(ssse3)
+%define ROUND_REG [pw_512]
+%else
+%define ROUND_REG [pw_32]
+%endif
+
+%if ARCH_X86_64
+    mova                   m12, [tmpq+ 6*%%str]
+    VP9_IDCT8_WRITEx2        1, 11, 10,  8,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]            ; advance dst by two rows
+    VP9_IDCT8_WRITEx2       14,  0, 10,  8,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2        3, 15, 10,  8,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2       12,  6, 10,  8,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+
+    mova                    m1, [tmpq+ 3*%%str]
+    mova                   m11, [tmpq+ 7*%%str]
+    mova                   m14, [tmpq+11*%%str]
+    mova                    m0, [tmpq+13*%%str]
+
+    VP9_IDCT8_WRITEx2        7,  1, 10,  8,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2       11,  2, 10,  8,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2        9, 14, 10,  8,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    VP9_IDCT8_WRITEx2        0,  5, 10,  8,  4, ROUND_REG, 6
+%else ; x86-32 path: spill m2/m5/m7 first, then stream the remaining rows from tmpq
+    mova       [tmpq+ 0*%%str], m2
+    mova       [tmpq+ 1*%%str], m5
+    mova       [tmpq+ 2*%%str], m7
+    mova                    m2, [tmpq+ 9*%%str]
+    VP9_IDCT8_WRITEx2        1,  2,  5,  7,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]            ; advance dst by two rows
+    mova                    m5, [tmpq+ 5*%%str]
+    VP9_IDCT8_WRITEx2        5,  0,  1,  2,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    mova                    m5, [tmpq+ 8*%%str]
+    VP9_IDCT8_WRITEx2        3,  5,  1,  2,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    mova                    m5, [tmpq+ 6*%%str]
+    VP9_IDCT8_WRITEx2        5,  6,  1,  2,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+
+    mova                    m0, [tmpq+ 2*%%str]
+    mova                    m3, [tmpq+ 3*%%str]
+    VP9_IDCT8_WRITEx2        0,  3,  1,  2,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    mova                    m0, [tmpq+ 7*%%str]
+    mova                    m3, [tmpq+ 0*%%str]
+    VP9_IDCT8_WRITEx2        0,  3,  1,  2,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    mova                    m0, [tmpq+14*%%str]
+    mova                    m3, [tmpq+11*%%str]
+    VP9_IDCT8_WRITEx2        0,  3,  1,  2,  4, ROUND_REG, 6
+    lea                   dstq, [dstq+strideq*2]
+    mova                    m0, [tmpq+13*%%str]
+    mova                    m3, [tmpq+ 1*%%str]
+    VP9_IDCT8_WRITEx2        0,  3,  1,  2,  4, ROUND_REG, 6
+%endif
+
+    SWAP                     0,  4 ; zero (leaves the zero register in m0 on exit)
+%undef ROUND_REG
+%endif
+%endmacro
+
+%macro IADST16_FN 5 ; %1/%3 = names used in the symbol, %2/%4 = pass-1/pass-2 1-D macro stems, %5 = instruction set
+INIT_XMM %5
+cglobal vp9_%1_%3_16x16_add, 3, 6, 16, 512, dst, stride, block, cnt, dst_bak, tmp
+    mov               cntd, 2               ; pass 1 runs twice, 16 bytes of each block row per call
+    mov               tmpq, rsp             ; intermediate buffer lives on the stack
+.loop1_full:
+    VP9_%2_1D       blockq, 1
+    add             blockq, 16
+    add               tmpq, 256
+    dec               cntd
+    jg .loop1_full
+    sub             blockq, 32              ; undo the two 16-byte advances
+
+    mov               cntd, 2
+    mov               tmpq, rsp
+    mov           dst_bakq, dstq
+.loop2_full:
+    VP9_%4_1D         tmpq, 2
+    lea               dstq, [dst_bakq+8]    ; second iteration targets dst + 8 bytes
+    add               tmpq, 16
+    dec               cntd
+    jg .loop2_full
+
+    ; at the end of the loop, m0 should still be zero
+    ; use that to zero out block coefficients
+    ZERO_BLOCK      blockq, 32, 16, m0
+    RET
+%endmacro
+
+IADST16_FN idct,  IDCT16,  iadst, IADST16, sse2
+IADST16_FN iadst, IADST16, idct,  IDCT16,  sse2
+IADST16_FN iadst, IADST16, iadst, IADST16, sse2
+IADST16_FN idct,  IDCT16,  iadst, IADST16, ssse3
+IADST16_FN iadst, IADST16, idct,  IDCT16,  ssse3
+IADST16_FN iadst, IADST16, iadst, IADST16, ssse3
+IADST16_FN idct,  IDCT16,  iadst, IADST16, avx
+IADST16_FN iadst, IADST16, idct,  IDCT16,  avx
+IADST16_FN iadst, IADST16, iadst, IADST16, avx
+
+;---------------------------------------------------------------------------------------------
+; void vp9_idct_idct_32x32_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+%macro VP9_IDCT32_1D 2-3 32 ; src, pass (1 or 2), nnzc (8, 16 or 32; default 32)
+%assign %%str 16*%2*%2
+    ; first do t0-15, this can be done identical to idct16x16
+    VP9_IDCT16_1D_START %1, %3/2, 64*2, tmpq, 2*%%str, 1
+    ; (str, the tmp slot stride, evaluates to 16 bytes in pass 1 and 64 bytes in pass 2)
+    ; store everything on stack to make space available for t16-31
+    ; we store interleaved with the output of the second half (t16-31)
+    ; so we don't need to allocate extra stack space
+    mova    [tmpq+ 0*%%str], m0     ; t0
+    mova    [tmpq+ 4*%%str], m1     ; t1
+    mova    [tmpq+ 8*%%str], m2     ; t2
+    mova    [tmpq+12*%%str], m3     ; t3
+    mova    [tmpq+16*%%str], m4     ; t4
+    mova    [tmpq+20*%%str], m5     ; t5
+%if ARCH_X86_64
+    mova    [tmpq+22*%%str], m10    ; t10
+    mova    [tmpq+18*%%str], m11    ; t11
+    mova    [tmpq+14*%%str], m12    ; t12
+    mova    [tmpq+10*%%str], m13    ; t13
+    mova    [tmpq+ 6*%%str], m14    ; t14
+    mova    [tmpq+ 2*%%str], m15    ; t15
+%endif
+
+    mova                m0, [tmpq+ 30*%%str]
+    UNSCRATCH            1,  6, tmpq+26*%%str
+    UNSCRATCH            2,  8, tmpq+24*%%str
+    UNSCRATCH            3,  9, tmpq+28*%%str
+    SUMSUB_BA            w,  1,  3, 4       ; t6, t9
+    SUMSUB_BA            w,  0,  2, 4       ; t7, t8
+
+    mova    [tmpq+24*%%str], m1     ; t6
+    mova    [tmpq+28*%%str], m0     ; t7
+    mova    [tmpq+30*%%str], m2     ; t8
+    mova    [tmpq+26*%%str], m3     ; t9
+
+    ; then, secondly, do t16-31
+%if %3 <= 8 ; only input rows 1 and 7 are read in this branch
+    mova                 m4, [%1+ 1*64]
+    mova                 m7, [%1+ 7*64]
+
+    pmulhrsw             m1,  m4, [pw_16364x2] ;t31
+    pmulhrsw             m4, [pw_804x2] ;t16
+
+    VP9_UNPACK_MULSUB_2W_4X   5,  0,  1,  4, 16069,  3196, [pd_8192], 6,  2 ; t17, t30
+
+    pmulhrsw             m3,  m7, [pw_m5520x2] ;t19
+    pmulhrsw             m7, [pw_15426x2] ;t28
+
+    SCRATCH               4, 13, tmpq+ 1*%%str
+    SCRATCH               5, 12, tmpq+15*%%str
+
+    VP9_UNPACK_MULSUB_2W_4X   2,  6,  7,  3, 3196, m16069, [pd_8192], 4,  5 ; t18, t29
+%else
+    mova                 m0, [%1+ 1*64]
+    mova                 m1, [%1+15*64]
+%if %3 <= 16 ; rows 17 and 31 are not read in this case
+    pmulhrsw             m5, m0, [pw_16364x2]
+    pmulhrsw             m0, [pw_804x2]
+    pmulhrsw             m4, m1, [pw_m11003x2]
+    pmulhrsw             m1, [pw_12140x2]
+%else
+    mova                 m4, [%1+17*64]
+    mova                 m5, [%1+31*64]
+
+    VP9_UNPACK_MULSUB_2W_4X   0,  5, 16364,   804, [pd_8192], 2, 3 ; t16, t31
+    VP9_UNPACK_MULSUB_2W_4X   4,  1, 11003, 12140, [pd_8192], 2, 3 ; t17, t30
+%endif
+    SUMSUB_BA             w,  4,  0,  2
+    SUMSUB_BA             w,  1,  5,  2
+
+    VP9_UNPACK_MULSUB_2W_4X   5,  0, 16069,  3196, [pd_8192], 2, 3 ; t17, t30
+
+    SCRATCH               4, 13, tmpq+ 1*%%str
+    SCRATCH               5, 12, tmpq+15*%%str
+
+    mova                 m2, [%1+ 7*64]
+    mova                 m3, [%1+ 9*64]
+%if %3 <= 16 ; rows 23 and 25 are not read in this case
+    pmulhrsw             m7,  m3, [pw_14811x2]
+    pmulhrsw             m3, [pw_7005x2]
+    pmulhrsw             m6,  m2, [pw_m5520x2]
+    pmulhrsw             m2, [pw_15426x2]
+%else
+    mova                 m7, [%1+23*64]
+    mova                 m6, [%1+25*64]
+
+    VP9_UNPACK_MULSUB_2W_4X   3,  7, 14811,  7005, [pd_8192], 4, 5 ; t18, t29
+    VP9_UNPACK_MULSUB_2W_4X   6,  2,  5520, 15426, [pd_8192], 4, 5 ; t19, t28
+%endif
+    SUMSUB_BA             w,  3,  6,  4
+    SUMSUB_BA             w,  7,  2,  4
+
+    VP9_UNPACK_MULSUB_2W_4X   2,  6, 3196, m16069, [pd_8192], 4, 5 ; t18, t29
+%endif
+
+    UNSCRATCH             5, 12, tmpq+15*%%str
+    SUMSUB_BA             w,  6,  0,  4
+    mova    [tmpq+25*%%str], m6             ; t19
+    UNSCRATCH             4, 13, tmpq+ 1*%%str
+    SUMSUB_BA             w,  7,  1,  6
+    SUMSUB_BA             w,  3,  4,  6
+    mova    [tmpq+23*%%str], m3             ; t16
+    SUMSUB_BA             w,  2,  5,  6
+
+    VP9_UNPACK_MULSUB_2W_4X   0,  5, 15137,  6270, [pd_8192], 6, 3 ; t18, t29
+    VP9_UNPACK_MULSUB_2W_4X   1,  4, 15137,  6270, [pd_8192], 6, 3 ; t19, t28
+
+    SCRATCH               0, 10, tmpq+ 1*%%str
+    SCRATCH               1, 11, tmpq+ 7*%%str
+    SCRATCH               2,  9, tmpq+ 9*%%str
+    SCRATCH               4, 14, tmpq+15*%%str
+    SCRATCH               5, 15, tmpq+17*%%str
+    SCRATCH               7, 13, tmpq+31*%%str
+
+%if %3 <= 8 ; only input rows 5 and 3 are read in this branch
+    mova                 m0, [%1+ 5*64]
+    mova                 m3, [%1+ 3*64]
+
+    pmulhrsw             m5,  m0, [pw_15893x2] ;t27
+    pmulhrsw             m0, [pw_3981x2] ;t20
+
+    VP9_UNPACK_MULSUB_2W_4X   1,  4,  5,  0,  9102, 13623, [pd_8192], 7,  2 ; t21, t26
+
+    pmulhrsw             m6,  m3, [pw_m2404x2] ;t23
+    pmulhrsw             m3, [pw_16207x2] ;t24
+
+    SCRATCH               5,  8, tmpq+ 5*%%str
+    SCRATCH               4, 12, tmpq+11*%%str
+
+    VP9_UNPACK_MULSUB_2W_4X   7,  2,  3,  6, 13623, m9102, [pd_8192], 4, 5 ; t22, t25
+%else
+    mova                 m4, [%1+ 5*64]
+    mova                 m5, [%1+11*64]
+%if %3 <= 16 ; rows 21 and 27 are not read in this case
+    pmulhrsw             m1, m4, [pw_15893x2]
+    pmulhrsw             m4, [pw_3981x2]
+    pmulhrsw             m0, m5, [pw_m8423x2]
+    pmulhrsw             m5, [pw_14053x2]
+%else
+    mova                 m0, [%1+21*64]
+    mova                 m1, [%1+27*64]
+
+    VP9_UNPACK_MULSUB_2W_4X   4,  1, 15893,  3981, [pd_8192], 2, 3 ; t20, t27
+    VP9_UNPACK_MULSUB_2W_4X   0,  5,  8423, 14053, [pd_8192], 2, 3 ; t21, t26
+%endif
+    SUMSUB_BA             w,  0,  4,  2
+    SUMSUB_BA             w,  5,  1,  2
+
+    VP9_UNPACK_MULSUB_2W_4X   1,  4,  9102, 13623, [pd_8192], 2, 3 ; t21, t26
+
+    SCRATCH               5,  8, tmpq+ 5*%%str
+    SCRATCH               4, 12, tmpq+11*%%str
+
+    mova                 m7, [%1+ 3*64]
+    mova                 m6, [%1+13*64]
+%if %3 <= 16 ; rows 29 and 19 are not read in this case
+    pmulhrsw             m3, m6, [pw_13160x2]
+    pmulhrsw             m6, [pw_9760x2]
+    pmulhrsw             m2, m7, [pw_m2404x2]
+    pmulhrsw             m7, [pw_16207x2]
+%else
+    mova                 m2, [%1+29*64]
+    mova                 m3, [%1+19*64]
+    VP9_UNPACK_MULSUB_2W_4X   6,  3, 13160,  9760, [pd_8192], 4, 5 ; t22, t25
+    VP9_UNPACK_MULSUB_2W_4X   2,  7,  2404, 16207, [pd_8192], 4, 5 ; t23, t24
+%endif
+    SUMSUB_BA             w,  6,  2,  4
+    SUMSUB_BA             w,  3,  7,  4
+
+    VP9_UNPACK_MULSUB_2W_4X   7,  2, 13623, m9102, [pd_8192], 4, 5 ; t22, t25
+%endif
+
+    ; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23,
+    ; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31
+
+    UNSCRATCH             4, 12, tmpq+11*%%str
+    SUMSUB_BA             w,  0,  6, 5
+    SUMSUB_BA             w,  4,  2, 5
+    UNSCRATCH             5,  8, tmpq+ 5*%%str
+    SCRATCH               4,  8, tmpq+11*%%str
+    SUMSUB_BA             w,  1,  7, 4
+    SUMSUB_BA             w,  5,  3, 4
+    SCRATCH               5, 12, tmpq+ 5*%%str
+
+    VP9_UNPACK_MULSUB_2W_4X   3,  6, 6270, m15137, [pd_8192], 4, 5 ; t20, t27
+    VP9_UNPACK_MULSUB_2W_4X   2,  7, 6270, m15137, [pd_8192], 4, 5 ; t21, t26
+
+    ; m8[s]=t16, m9=t17, m5=t18, m4[s]=t19, m12=t20, m13=t21, m1=t22, m0=t23,
+    ; m15=t24, m14=t25, m2=t26, m3=t27, m11=t28, m10=t29, m6=t30, m7=t31
+
+    UNSCRATCH             5,  9, tmpq+ 9*%%str
+    mova                 m4, [tmpq+23*%%str] ; t16
+%if ARCH_X86_64
+    SUMSUB_BA             w,  1,  5,  9
+    SUMSUB_BA             w,  0,  4,  9
+%else
+    SUMSUB_BADC           w,  1,  5,  0,  4
+%endif
+    mova    [tmpq+29*%%str], m1     ; t17
+    mova    [tmpq+21*%%str], m0     ; t16
+    UNSCRATCH             0, 10, tmpq+ 1*%%str
+    UNSCRATCH             1, 11, tmpq+ 7*%%str
+%if ARCH_X86_64
+    SUMSUB_BA             w,  2,  0,  9
+    SUMSUB_BA             w,  3,  1,  9
+%else
+    SUMSUB_BADC           w,  2,  0,  3,  1
+%endif
+    mova    [tmpq+ 9*%%str], m2     ; t18
+    mova    [tmpq+13*%%str], m3     ; t19
+    SCRATCH               0, 10, tmpq+23*%%str
+    SCRATCH               1, 11, tmpq+27*%%str
+
+    UNSCRATCH             2, 14, tmpq+15*%%str
+    UNSCRATCH             3, 15, tmpq+17*%%str
+    SUMSUB_BA             w,  6,  2, 0
+    SUMSUB_BA             w,  7,  3, 0
+    SCRATCH               6, 14, tmpq+ 3*%%str
+    SCRATCH               7, 15, tmpq+ 7*%%str
+
+    UNSCRATCH             0,  8, tmpq+11*%%str
+    mova                 m1, [tmpq+25*%%str] ; t19
+    UNSCRATCH             6, 12, tmpq+ 5*%%str
+    UNSCRATCH             7, 13, tmpq+31*%%str
+%if ARCH_X86_64
+    SUMSUB_BA             w,  0,  1,  9
+    SUMSUB_BA             w,  6,  7,  9
+%else
+    SUMSUB_BADC           w,  0,  1,  6,  7
+%endif
+
+    ; m0=t16, m1=t17, m2=t18, m3=t19, m11=t20, m10=t21, m9=t22, m8=t23,
+    ; m7=t24, m6=t25, m5=t26, m4=t27, m12=t28, m13=t29, m14=t30, m15=t31
+
+%if 0; cpuflag(ssse3) - disabled: only the else branch below is ever assembled
+%if ARCH_X86_64
+    SUMSUB_BA             w,  4,  7,  8
+    SUMSUB_BA             w,  5,  1,  8
+%else
+    SUMSUB_BADC           w,  4,  7,  5,  1
+%endif
+
+    pmulhrsw             m7, [pw_11585x2]
+    pmulhrsw             m4, [pw_11585x2]
+    pmulhrsw             m1, [pw_11585x2]
+    pmulhrsw             m5, [pw_11585x2]
+
+    mova    [tmpq+ 5*%%str], m7     ; t23
+    SCRATCH               1, 13, tmpq+25*%%str
+    UNSCRATCH             7, 10, tmpq+23*%%str
+    UNSCRATCH             1, 11, tmpq+27*%%str
+
+%if ARCH_X86_64
+    SUMSUB_BA             w,  7,  3, 10
+    SUMSUB_BA             w,  1,  2, 10
+%else
+    SUMSUB_BADC           w,  7,  3,  1,  2
+%endif
+
+    pmulhrsw             m3, [pw_11585x2]
+    pmulhrsw             m7, [pw_11585x2]
+    pmulhrsw             m2, [pw_11585x2]
+    pmulhrsw             m1, [pw_11585x2]
+%else
+    SCRATCH               0,  8, tmpq+15*%%str
+    SCRATCH               6,  9, tmpq+17*%%str
+    VP9_UNPACK_MULSUB_2W_4X  7,  4, 11585, 11585, [pd_8192], 0, 6
+    mova    [tmpq+ 5*%%str], m7     ; t23
+    UNSCRATCH             7, 10, tmpq+23*%%str
+    VP9_UNPACK_MULSUB_2W_4X  1,  5, 11585, 11585, [pd_8192], 0, 6
+    SCRATCH               1, 13, tmpq+25*%%str
+    UNSCRATCH             1, 11, tmpq+27*%%str
+    VP9_UNPACK_MULSUB_2W_4X  3,  7, 11585, 11585, [pd_8192], 0, 6
+    VP9_UNPACK_MULSUB_2W_4X  2,  1, 11585, 11585, [pd_8192], 0, 6
+    UNSCRATCH             0,  8, tmpq+15*%%str
+    UNSCRATCH             6,  9, tmpq+17*%%str
+%endif
+
+    ; m0=t16, m1=t17, m2=t18, m3=t19, m4=t20, m5=t21, m6=t22, m7=t23,
+    ; m8=t24, m9=t25, m10=t26, m11=t27, m12=t28, m13=t29, m14=t30, m15=t31
+
+    ; then do final pass to sumsub+store the two halves
+%if %2 == 1 ; pass 1: sumsub, transpose in 8x8 blocks and store back to tmp
+    mova    [tmpq+17*%%str], m2     ; t20
+    mova    [tmpq+ 1*%%str], m3     ; t21
+%if ARCH_X86_64
+    mova    [tmpq+25*%%str], m13    ; t22
+
+    mova                 m8, [tmpq+ 0*%%str] ; t0
+    mova                 m9, [tmpq+ 4*%%str] ; t1
+    mova                m12, [tmpq+ 8*%%str] ; t2
+    mova                m11, [tmpq+12*%%str] ; t3
+    mova                 m2, [tmpq+16*%%str] ; t4
+    mova                 m3, [tmpq+20*%%str] ; t5
+    mova                m13, [tmpq+24*%%str] ; t6
+
+    SUMSUB_BA             w,  6,  8,  10
+    mova    [tmpq+ 3*%%str], m8              ; t15
+    mova                m10, [tmpq+28*%%str] ; t7
+    SUMSUB_BA             w,  0,  9,  8
+    SUMSUB_BA             w, 15, 12,  8
+    SUMSUB_BA             w, 14, 11,  8
+    SUMSUB_BA             w,  1,  2,  8
+    SUMSUB_BA             w,  7,  3,  8
+    SUMSUB_BA             w,  5, 13,  8
+    SUMSUB_BA             w,  4, 10,  8
+
+    TRANSPOSE8x8W         6, 0, 15, 14, 1, 7, 5, 4, 8
+    mova    [tmpq+ 0*%%str], m6
+    mova    [tmpq+ 4*%%str], m0
+    mova    [tmpq+ 8*%%str], m15
+    mova    [tmpq+12*%%str], m14
+    mova    [tmpq+16*%%str], m1
+    mova    [tmpq+20*%%str], m7
+    mova    [tmpq+24*%%str], m5
+    mova    [tmpq+28*%%str], m4
+
+    mova                  m8, [tmpq+ 3*%%str] ; t15
+    TRANSPOSE8x8W         10, 13, 3, 2, 11, 12, 9, 8, 0
+    mova    [tmpq+ 3*%%str], m10
+    mova    [tmpq+ 7*%%str], m13
+    mova    [tmpq+11*%%str], m3
+    mova    [tmpq+15*%%str], m2
+    mova    [tmpq+19*%%str], m11
+    mova    [tmpq+23*%%str], m12
+    mova    [tmpq+27*%%str], m9
+    mova    [tmpq+31*%%str], m8
+
+    mova                m15, [tmpq+30*%%str] ; t8
+    mova                m14, [tmpq+26*%%str] ; t9
+    mova                m13, [tmpq+22*%%str] ; t10
+    mova                m12, [tmpq+18*%%str] ; t11
+    mova                m11, [tmpq+14*%%str] ; t12
+    mova                m10, [tmpq+10*%%str] ; t13
+    mova                 m9, [tmpq+ 6*%%str] ; t14
+    mova                 m8, [tmpq+ 2*%%str] ; t15
+    mova                 m7, [tmpq+21*%%str] ; t16
+    mova                 m6, [tmpq+29*%%str] ; t17
+    mova                 m5, [tmpq+ 9*%%str] ; t18
+    mova                 m4, [tmpq+13*%%str] ; t19
+    mova                 m3, [tmpq+17*%%str] ; t20
+    mova                 m2, [tmpq+ 1*%%str] ; t21
+    mova                 m1, [tmpq+25*%%str] ; t22
+
+    SUMSUB_BA             w,  7,  8, 0
+    mova    [tmpq+ 2*%%str], m8
+    mova                 m0, [tmpq+ 5*%%str] ; t23
+    SUMSUB_BA             w,  6,  9, 8
+    SUMSUB_BA             w,  5, 10, 8
+    SUMSUB_BA             w,  4, 11, 8
+    SUMSUB_BA             w,  3, 12, 8
+    SUMSUB_BA             w,  2, 13, 8
+    SUMSUB_BA             w,  1, 14, 8
+    SUMSUB_BA             w,  0, 15, 8
+
+    TRANSPOSE8x8W         0, 1, 2, 3, 4, 5, 6, 7, 8
+    mova    [tmpq+ 1*%%str], m0
+    mova    [tmpq+ 5*%%str], m1
+    mova    [tmpq+ 9*%%str], m2
+    mova    [tmpq+13*%%str], m3
+    mova    [tmpq+17*%%str], m4
+    mova    [tmpq+21*%%str], m5
+    mova    [tmpq+25*%%str], m6
+    mova    [tmpq+29*%%str], m7
+
+    mova                 m8, [tmpq+ 2*%%str]
+    TRANSPOSE8x8W         8, 9, 10, 11, 12, 13, 14, 15, 0
+    mova    [tmpq+ 2*%%str], m8
+    mova    [tmpq+ 6*%%str], m9
+    mova    [tmpq+10*%%str], m10
+    mova    [tmpq+14*%%str], m11
+    mova    [tmpq+18*%%str], m12
+    mova    [tmpq+22*%%str], m13
+    mova    [tmpq+26*%%str], m14
+    mova    [tmpq+30*%%str], m15
+%else
+    mova                 m2, [tmpq+24*%%str] ; t6
+    mova                 m3, [tmpq+28*%%str] ; t7
+    SUMSUB_BADC           w,  5,  2,  4,  3
+    mova    [tmpq+24*%%str], m5
+    mova    [tmpq+23*%%str], m2
+    mova    [tmpq+28*%%str], m4
+    mova    [tmpq+19*%%str], m3
+
+    mova                 m2, [tmpq+16*%%str] ; t4
+    mova                 m3, [tmpq+20*%%str] ; t5
+    SUMSUB_BA             w,  1,  2,  5
+    SUMSUB_BA             w,  7,  3,  5
+    mova    [tmpq+15*%%str], m2
+    mova    [tmpq+11*%%str], m3
+
+    mova                 m2, [tmpq+ 0*%%str] ; t0
+    mova                 m3, [tmpq+ 4*%%str] ; t1
+    SUMSUB_BA             w,  6,  2,  5
+    SUMSUB_BA             w,  0,  3,  5
+    mova    [tmpq+31*%%str], m2
+    mova    [tmpq+27*%%str], m3
+
+    mova                 m2, [tmpq+ 8*%%str] ; t2
+    mova                 m3, [tmpq+12*%%str] ; t3
+    mova                 m5, [tmpq+ 7*%%str]
+    mova                 m4, [tmpq+ 3*%%str]
+    SUMSUB_BADC           w,  5,  2,  4,  3
+    mova    [tmpq+ 7*%%str], m2
+    mova    [tmpq+ 3*%%str], m3
+
+    mova                 m3, [tmpq+28*%%str]
+    TRANSPOSE8x8W         6, 0, 5, 4, 1, 7, 2, 3, [tmpq+24*%%str], [tmpq+16*%%str], 1
+    mova    [tmpq+ 0*%%str], m6
+    mova    [tmpq+ 4*%%str], m0
+    mova    [tmpq+ 8*%%str], m5
+    mova    [tmpq+12*%%str], m4
+    mova    [tmpq+20*%%str], m7
+    mova    [tmpq+24*%%str], m2
+    mova    [tmpq+28*%%str], m3
+
+    mova                 m6, [tmpq+19*%%str]
+    mova                 m0, [tmpq+23*%%str]
+    mova                 m5, [tmpq+11*%%str]
+    mova                 m4, [tmpq+15*%%str]
+    mova                 m1, [tmpq+ 3*%%str]
+    mova                 m7, [tmpq+ 7*%%str]
+    mova                 m3, [tmpq+31*%%str]
+    TRANSPOSE8x8W         6, 0, 5, 4, 1, 7, 2, 3, [tmpq+27*%%str], [tmpq+19*%%str], 1
+    mova    [tmpq+ 3*%%str], m6
+    mova    [tmpq+ 7*%%str], m0
+    mova    [tmpq+11*%%str], m5
+    mova    [tmpq+15*%%str], m4
+    mova    [tmpq+23*%%str], m7
+    mova    [tmpq+27*%%str], m2
+    mova    [tmpq+31*%%str], m3
+
+    mova                 m1, [tmpq+ 6*%%str] ; t14
+    mova                 m0, [tmpq+ 2*%%str] ; t15
+    mova                 m7, [tmpq+21*%%str] ; t16
+    mova                 m6, [tmpq+29*%%str] ; t17
+    SUMSUB_BA             w,  7,  0,  2
+    SUMSUB_BA             w,  6,  1,  2
+    mova    [tmpq+29*%%str], m7
+    mova    [tmpq+ 2*%%str], m0
+    mova    [tmpq+21*%%str], m6
+    mova    [tmpq+ 6*%%str], m1
+
+    mova                 m1, [tmpq+14*%%str] ; t12
+    mova                 m0, [tmpq+10*%%str] ; t13
+    mova                 m5, [tmpq+ 9*%%str] ; t18
+    mova                 m4, [tmpq+13*%%str] ; t19
+    SUMSUB_BA             w,  5,  0,  2
+    SUMSUB_BA             w,  4,  1,  2
+    mova     [tmpq+10*%%str], m0
+    mova     [tmpq+14*%%str], m1
+
+    mova                 m1, [tmpq+22*%%str] ; t10
+    mova                 m0, [tmpq+18*%%str] ; t11
+    mova                 m3, [tmpq+17*%%str] ; t20
+    mova                 m2, [tmpq+ 1*%%str] ; t21
+    SUMSUB_BA             w,  3,  0,  6
+    SUMSUB_BA             w,  2,  1,  6
+    mova     [tmpq+18*%%str], m0
+    mova     [tmpq+22*%%str], m1
+
+    mova                 m7, [tmpq+30*%%str] ; t8
+    mova                 m6, [tmpq+26*%%str] ; t9
+    mova                 m1, [tmpq+25*%%str] ; t22
+    mova                 m0, [tmpq+ 5*%%str] ; t23
+    SUMSUB_BADC           w,  1,  6,  0,  7
+    mova     [tmpq+26*%%str], m6
+    mova     [tmpq+30*%%str], m7
+
+    mova                 m7, [tmpq+29*%%str]
+    TRANSPOSE8x8W         0, 1, 2, 3, 4, 5, 6, 7, [tmpq+21*%%str], [tmpq+17*%%str], 1
+    mova    [tmpq+ 1*%%str], m0
+    mova    [tmpq+ 5*%%str], m1
+    mova    [tmpq+ 9*%%str], m2
+    mova    [tmpq+13*%%str], m3
+    mova    [tmpq+21*%%str], m5
+    mova    [tmpq+25*%%str], m6
+    mova    [tmpq+29*%%str], m7
+
+    mova                 m0, [tmpq+ 2*%%str]
+    mova                 m1, [tmpq+ 6*%%str]
+    mova                 m2, [tmpq+10*%%str]
+    mova                 m3, [tmpq+14*%%str]
+    mova                 m4, [tmpq+18*%%str]
+    mova                 m5, [tmpq+22*%%str]
+    mova                 m7, [tmpq+30*%%str]
+    TRANSPOSE8x8W         0, 1, 2, 3, 4, 5, 6, 7, [tmpq+26*%%str], [tmpq+18*%%str], 1
+    mova    [tmpq+ 2*%%str], m0
+    mova    [tmpq+ 6*%%str], m1
+    mova    [tmpq+10*%%str], m2
+    mova    [tmpq+14*%%str], m3
+    mova    [tmpq+22*%%str], m5
+    mova    [tmpq+26*%%str], m6
+    mova    [tmpq+30*%%str], m7
+%endif
+%else
+    ; t0-7 is in [tmpq+{0,4,8,12,16,20,24,28}*%%str]
+    ; t8-15 is in [tmpq+{2,6,10,14,18,22,26,30}*%%str]
+    ; t16-19 and t23 is in [tmpq+{1,5,9,13,29}*%%str]
+    ; t20-22 is in m4-6
+    ; t24-31 is in m8-15
+    ; NOTE(review): the t16+ locations listed above look stale vs. the loads below - confirm
+%if cpuflag(ssse3)
+%define ROUND_REG [pw_512]
+%else
+%define ROUND_REG [pw_32]
+%endif
+
+%macro %%STORE_2X2 7-8 1 ; src[1-4], tmp[1-2], zero, inc_dst_ptrs
+    SUMSUB_BA            w, %4, %1, %5
+    SUMSUB_BA            w, %3, %2, %5
+    VP9_IDCT8_WRITEx2   %4, %3, %5, %6, %7, ROUND_REG, 6
+%if %8 == 1
+    add               dstq, stride2q
+%endif
+    VP9_IDCT8_WRITEx2   %2, %1, %5, %6, %7, ROUND_REG, 6, dst_endq
+%if %8 == 1
+    sub           dst_endq, stride2q
+%endif
+%endmacro
+
+%if ARCH_X86_64
+    pxor               m10, m10             ; m10 = zero operand for STORE_2X2
+
+    ; store t0-1 and t30-31
+    mova                m8, [tmpq+ 0*%%str]
+    mova                m9, [tmpq+ 4*%%str]
+    %%STORE_2X2          8,  9,  0,  6, 12, 11, 10
+
+    ; store t2-3 and t28-29
+    mova                m8, [tmpq+ 8*%%str]
+    mova                m9, [tmpq+12*%%str]
+    %%STORE_2X2          8,  9, 14, 15, 12, 11, 10
+
+    ; store t4-5 and t26-27
+    mova                m8, [tmpq+16*%%str]
+    mova                m9, [tmpq+20*%%str]
+    %%STORE_2X2          8,  9,  7,  1, 12, 11, 10
+
+    ; store t6-7 and t24-25
+    mova                m8, [tmpq+24*%%str]
+    mova                m9, [tmpq+28*%%str]
+    %%STORE_2X2          8,  9,  4,  5, 12, 11, 10
+
+    ; store t8-9 and t22-23
+    mova                m8, [tmpq+30*%%str]
+    mova                m9, [tmpq+26*%%str]
+    mova                m0, [tmpq+ 5*%%str]
+    %%STORE_2X2          8,  9, 13,  0, 12, 11, 10
+
+    ; store t10-11 and t20-21
+    mova                m8, [tmpq+22*%%str]
+    mova                m9, [tmpq+18*%%str]
+    %%STORE_2X2          8,  9,  2,  3, 12, 11, 10
+
+    ; store t12-13 and t18-19
+    mova                m8, [tmpq+14*%%str]
+    mova                m9, [tmpq+10*%%str]
+    mova                m5, [tmpq+13*%%str]
+    mova                m4, [tmpq+ 9*%%str]
+    %%STORE_2X2          8,  9,  4,  5, 12, 11, 10
+
+    ; store t14-17
+    mova                m8, [tmpq+ 6*%%str]
+    mova                m9, [tmpq+ 2*%%str]
+    mova                m5, [tmpq+29*%%str]
+    mova                m4, [tmpq+21*%%str]
+    %%STORE_2X2          8,  9,  4,  5, 12, 11, 10, 0
+
+    SWAP                 1, 10 ; zero (now named m1, which is what the callers' ZERO_BLOCK uses)
+%else
+    mova   [tmpq+ 1*%%str], m1
+    mova   [tmpq+11*%%str], m2
+    mova   [tmpq+15*%%str], m3
+    mova   [tmpq+17*%%str], m4
+    mova   [tmpq+19*%%str], m5
+    pxor                m1, m1              ; m1 = zero operand for STORE_2X2
+
+    ; store t0-1 and t30-31
+    mova                m2, [tmpq+ 0*%%str]
+    mova                m3, [tmpq+ 4*%%str]
+    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
+
+    ; store t2-3 and t28-29
+    mova                m2, [tmpq+ 8*%%str]
+    mova                m3, [tmpq+12*%%str]
+    mova                m0, [tmpq+ 3*%%str]
+    mova                m6, [tmpq+ 7*%%str]
+    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
+
+    ; store t4-5 and t26-27
+    mova                m2, [tmpq+16*%%str]
+    mova                m3, [tmpq+20*%%str]
+    mova                m0, [tmpq+ 1*%%str]
+    %%STORE_2X2          2,  3,  7,  0, 4, 5, 1
+
+    ; store t6-7 and t24-25
+    mova                m2, [tmpq+24*%%str]
+    mova                m3, [tmpq+28*%%str]
+    mova                m0, [tmpq+17*%%str]
+    mova                m6, [tmpq+19*%%str]
+    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
+
+    ; store t8-9 and t22-23
+    mova                m2, [tmpq+30*%%str]
+    mova                m3, [tmpq+26*%%str]
+    mova                m0, [tmpq+25*%%str]
+    mova                m6, [tmpq+ 5*%%str]
+    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
+
+    ; store t10-11 and t20-21
+    mova                m2, [tmpq+22*%%str]
+    mova                m3, [tmpq+18*%%str]
+    mova                m0, [tmpq+11*%%str]
+    mova                m6, [tmpq+15*%%str]
+    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
+
+    ; store t12-13 and t18-19
+    mova                m2, [tmpq+14*%%str]
+    mova                m3, [tmpq+10*%%str]
+    mova                m6, [tmpq+13*%%str]
+    mova                m0, [tmpq+ 9*%%str]
+    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
+
+    ; store t14-17
+    mova                m2, [tmpq+ 6*%%str]
+    mova                m3, [tmpq+ 2*%%str]
+    mova                m6, [tmpq+29*%%str]
+    mova                m0, [tmpq+21*%%str]
+    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1, 0
+%endif
+%undef ROUND_REG
+%endif
+%endmacro
+
+%macro VP9_IDCT_IDCT_32x32_ADD_XMM 1 ; cpu flags, forwarded to INIT_XMM
+INIT_XMM %1
+cglobal vp9_idct_idct_32x32_add, 0, 6 + ARCH_X86_64 * 3, 16, 2048, dst, stride, block, eob
+    movifnidn         eobd, dword eobm
+%if cpuflag(ssse3)
+    cmp eobd, 135
+    jg .idctfull                            ; eob > 135: full path (default nnzc)
+    cmp eobd, 34
+    jg .idct16x16                           ; eob in 35..135: nnzc=16 path
+    cmp eobd, 1
+    jg .idct8x8                             ; eob in 2..34: nnzc=8 path
+%else
+    cmp eobd, 1
+    jg .idctfull                            ; no reduced paths here: eob > 1 is always full
+%endif
+
+    ; dc-only case
+    movifnidn       blockq, blockmp
+    movifnidn         dstq, dstmp
+    movifnidn      strideq, stridemp
+%if cpuflag(ssse3)
+    movd                m0, [blockq]
+    mova                m1, [pw_11585x2]
+    pmulhrsw            m0, m1
+    pmulhrsw            m0, m1
+%else
+    DEFINE_ARGS dst, stride, block, coef
+    movsx            coefd, word [blockq]
+    imul             coefd, 11585
+    add              coefd, 8192
+    sar              coefd, 14
+    imul             coefd, 11585
+    add              coefd, (32 << 14) + 8192 ; second >>14 rounding fused with a +32, >>6 rounding
+    sar              coefd, 14 + 6
+    movd                m0, coefd
+%endif
+    SPLATW              m0, m0, q0000
+%if cpuflag(ssse3)
+    pmulhrsw            m0, [pw_512]
+%endif
+    pxor                m5, m5
+    movd          [blockq], m5              ; clear the first 4 bytes of the block
+%rep 31
+    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5, mmsize
+    add               dstq, strideq
+%endrep
+    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5, mmsize ; 32nd row, no dst advance needed
+    RET
+
+%if ARCH_X86_64
+    DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
+%else
+%define dst_bakq r0mp
+%endif
+%if cpuflag(ssse3)
+.idct8x8:
+%if ARCH_X86_32
+    DEFINE_ARGS block, u1, u2, u3, u4, tmp
+    mov             blockq, r2mp
+%endif
+    mov               tmpq, rsp
+    VP9_IDCT32_1D   blockq, 1, 8            ; pass 1 is invoked once on this path
+
+%if ARCH_X86_32
+    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+    mov            strideq, r1mp
+%define cntd dword r3m
+%endif
+    mov          stride30q, strideq         ; stride
+    lea           stride2q, [strideq*2]     ; stride*2
+    shl          stride30q, 5               ; stride*32
+    mov               cntd, 4
+    sub          stride30q, stride2q        ; stride*30
+.loop2_8x8:
+    mov               dstq, dst_bakq
+    lea           dst_endq, [dstq+stride30q]
+    VP9_IDCT32_1D     tmpq, 2, 8
+    add           dst_bakq, 8
+    add               tmpq, 16
+    dec               cntd
+    jg .loop2_8x8
+
+    ; at the end of the loop, m1 should still be zero
+    ; use that to zero out block coefficients
+%if ARCH_X86_32
+    DEFINE_ARGS block
+    mov             blockq, r2mp
+%endif
+    ZERO_BLOCK      blockq, 64,  8, m1
+    RET
+
+.idct16x16:
+%if ARCH_X86_32
+    DEFINE_ARGS block, tmp, cnt
+    mov             blockq, r2mp
+%endif
+    mov               cntd, 2
+    mov               tmpq, rsp
+.loop1_16x16:
+    VP9_IDCT32_1D   blockq, 1, 16
+    add             blockq, 16
+    add               tmpq, 512
+    dec               cntd
+    jg .loop1_16x16
+
+%if ARCH_X86_64
+    sub             blockq, 32              ; undo the 2 x 16 byte advance of blockq
+%else
+    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+    mov            strideq, r1mp
+%define cntd dword r3m
+%endif
+
+    mov          stride30q, strideq         ; stride
+    lea           stride2q, [strideq*2]     ; stride*2
+    shl          stride30q, 5               ; stride*32
+    mov               cntd, 4
+    mov               tmpq, rsp
+    sub          stride30q, stride2q        ; stride*30
+.loop2_16x16:
+    mov               dstq, dst_bakq
+    lea           dst_endq, [dstq+stride30q]
+    VP9_IDCT32_1D     tmpq, 2, 16
+    add           dst_bakq, 8
+    add               tmpq, 16
+    dec               cntd
+    jg .loop2_16x16
+
+    ; at the end of the loop, m1 should still be zero
+    ; use that to zero out block coefficients
+%if ARCH_X86_32
+    DEFINE_ARGS block
+    mov             blockq, r2mp
+%endif
+    ZERO_BLOCK      blockq, 64, 16, m1
+    RET
+%endif
+
+.idctfull:
+%if ARCH_X86_32
+    DEFINE_ARGS block, tmp, cnt
+    mov             blockq, r2mp
+%endif
+    mov               cntd, 4
+    mov               tmpq, rsp
+.loop1_full:
+    VP9_IDCT32_1D   blockq, 1
+    add             blockq, 16
+    add               tmpq, 512
+    dec               cntd
+    jg .loop1_full
+
+%if ARCH_X86_64
+    sub             blockq, 64              ; undo the 4 x 16 byte advance of blockq
+%else
+    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+    mov            strideq, r1mp
+%define cntd dword r3m
+%endif
+
+    mov          stride30q, strideq         ; stride
+    lea           stride2q, [strideq*2]     ; stride*2
+    shl          stride30q, 5               ; stride*32
+    mov               cntd, 4
+    mov               tmpq, rsp
+    sub          stride30q, stride2q        ; stride*30
+.loop2_full:
+    mov               dstq, dst_bakq
+    lea           dst_endq, [dstq+stride30q]
+    VP9_IDCT32_1D     tmpq, 2
+    add           dst_bakq, 8
+    add               tmpq, 16
+    dec               cntd
+    jg .loop2_full
+
+    ; at the end of the loop, m1 should still be zero
+    ; use that to zero out block coefficients
+%if ARCH_X86_32
+    DEFINE_ARGS block
+    mov             blockq, r2mp
+%endif
+    ZERO_BLOCK      blockq, 64, 32, m1
+    RET
+%endmacro
+
+VP9_IDCT_IDCT_32x32_ADD_XMM sse2
+VP9_IDCT_IDCT_32x32_ADD_XMM ssse3
+VP9_IDCT_IDCT_32x32_ADD_XMM avx
diff --git a/libavcodec/x86/vp9itxfm_16bpp.asm b/libavcodec/x86/vp9itxfm_16bpp.asm
new file mode 100644
index 0000000..902685e
--- /dev/null
+++ b/libavcodec/x86/vp9itxfm_16bpp.asm
@@ -0,0 +1,2044 @@
+;******************************************************************************
+;* VP9 inverse transform x86 SIMD optimizations
+;*
+;* Copyright (C) 2015 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+%include "vp9itxfm_template.asm"
+
+SECTION_RODATA
+
+cextern pw_8
+cextern pw_1023
+cextern pw_2048
+cextern pw_4095
+cextern pw_m1
+cextern pd_1
+cextern pd_16
+cextern pd_32
+cextern pd_8192
+
+pd_8: times 4 dd 8                     ; 4 dwords of 8: rounding bias for the final >>4 of the 12bpp 4x4 writeout
+pd_3fff: times 4 dd 0x3fff             ; 4 dwords with the low 14 bits set: default mask of SUMSUB_MUL etc.
+
+cextern pw_11585x2
+
+cextern pw_5283_13377
+cextern pw_9929_13377
+cextern pw_15212_m13377
+cextern pw_15212_9929
+cextern pw_m5283_m15212
+cextern pw_13377x2
+cextern pw_m13377_13377
+cextern pw_13377_0
+
+pw_9929_m5283: times 4 dw 9929, -5283  ; word pairs (9929, -5283); pmaddwd operand in IADST4_12BPP_1D
+
+%macro COEF_PAIR 2-3 ; a, b[, flag]: declare the external pw_* word-pair constants for coefficients a and b
+cextern pw_m%1_%2
+cextern pw_%2_%1
+%if %0 == 3 ; a third argument requests the extra sign/order variants
+cextern pw_m%1_m%2
+%if %1 != %2 ; when a == b these two would repeat names already declared above
+cextern pw_m%2_%1
+cextern pw_%1_%2
+%endif
+%endif
+%endmacro
+
+COEF_PAIR  2404, 16207
+COEF_PAIR  3196, 16069, 1
+COEF_PAIR  4756, 15679
+COEF_PAIR  5520, 15426
+COEF_PAIR  6270, 15137, 1
+COEF_PAIR  8423, 14053
+COEF_PAIR 10394, 12665
+COEF_PAIR 11003, 12140
+COEF_PAIR 11585, 11585, 1
+COEF_PAIR 13160,  9760
+COEF_PAIR 13623,  9102, 1
+COEF_PAIR 14449,  7723
+COEF_PAIR 14811,  7005
+COEF_PAIR 15893,  3981
+COEF_PAIR 16305,  1606
+COEF_PAIR 16364,   804
+
+default_8x8: ; 64 bytes, read as byte [table+eob-1]; value = number of first-pass iterations (4 columns each)
+times 12 db 1
+times 52 db 2
+row_8x8: ; same layout; selected by the idct/iadst instantiation of IADST8_FN
+times 18 db 1
+times 46 db 2
+col_8x8: ; same layout; selected by the iadst/idct instantiation of IADST8_FN
+times 6 db 1
+times 58 db 2
+default_16x16: ; 256 bytes, values 1..4; presumably the 16x16 equivalent (users not in this part of the file)
+times 10 db 1
+times 28 db 2
+times 51 db 3
+times 167 db 4
+row_16x16: ; 256 bytes, values 1..4
+times 21 db 1
+times 45 db 2
+times 60 db 3
+times 130 db 4
+col_16x16: ; 256 bytes, values 1..4
+times 5 db 1
+times 12 db 2
+times 25 db 3
+times 214 db 4
+default_32x32: ; 1024 bytes, values 1..8; presumably the 32x32 equivalent (users not in this part of the file)
+times 9 db 1
+times 25 db 2
+times 36 db 3
+times 65 db 4
+times 105 db 5
+times 96 db 6
+times 112 db 7
+times 576 db 8
+
+SECTION .text
+
+%macro VP9_STORE_2X 6-7 dstq ; reg1, reg2, tmp1, tmp2, min, max, dst - add two residual rows to dst and clamp
+    mova               m%3, [%7]            ; load pixel row 0
+    mova               m%4, [%7+strideq]    ; load pixel row 1
+    paddw              m%3, m%1             ; row 0 += reg1 (word residuals)
+    paddw              m%4, m%2             ; row 1 += reg2
+    pmaxsw             m%3, m%5             ; clamp below to min
+    pmaxsw             m%4, m%5
+    pminsw             m%3, m%6             ; clamp above to max
+    pminsw             m%4, m%6
+    mova              [%7], m%3             ; write both rows back
+    mova      [%7+strideq], m%4
+%endmacro
+
+%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg - store zero_reg over nnzcpl rows of nnzcpl*4 bytes, rows stride bytes apart
+%assign %%y 0
+%rep %3
+%assign %%x 0
+%rep %3*4/mmsize
+    mova      [%1+%%y+%%x], %4              ; one mmsize-wide store; %%y = row offset, %%x = offset within the row
+%assign %%x (%%x+mmsize)
+%endrep
+%assign %%y (%%y+%2)
+%endrep
+%endmacro
+
+; the input coefficients are scaled up by 2 bits (which we downscale immediately
+; in the iwht), and are otherwise orthonormally increased by 1 bit per iwht_1d.
+; therefore, a diff of 10-12+sign bit will fit in 12-14+sign bit after scaling,
+; i.e. everything can be done in 15+1bpp words. Since the quant fractional bits
+; add 2 bits, we need to scale before converting to word in 12bpp, since the
+; input will be 16+sign bit which doesn't fit in 15+sign words, but in 10bpp
+; we can scale after converting to words (which is half the instructions),
+; since the input is only 14+sign bit, which fits in 15+sign words directly.
+
+%macro IWHT4_FN 2 ; bpp, max - emits vp9_iwht_iwht_4x4_add_<bpp>; max selects the pw_<max> pixel clamp constant
+cglobal vp9_iwht_iwht_4x4_add_%1, 3, 3, 8, dst, stride, block, eob
+    mova                m7, [pw_%2]         ; m7 = upper clamp for the writeout
+    mova                m0, [blockq+0*16+0] ; first half of coefficient row 0 (dwords)
+    mova                m1, [blockq+1*16+0] ; first half of coefficient row 1
+%if %1 >= 12 ; 12bpp: shift the dwords down by 2 before packing to words
+    mova                m4, [blockq+0*16+8]
+    mova                m5, [blockq+1*16+8]
+    psrad               m0, 2
+    psrad               m1, 2
+    psrad               m4, 2
+    psrad               m5, 2
+    packssdw            m0, m4
+    packssdw            m1, m5
+%else ; lower bit depths: pack to words first, then shift the words down by 2
+    packssdw            m0, [blockq+0*16+8]
+    packssdw            m1, [blockq+1*16+8]
+    psraw               m0, 2
+    psraw               m1, 2
+%endif
+    mova                m2, [blockq+2*16+0] ; same treatment for rows 2 and 3
+    mova                m3, [blockq+3*16+0]
+%if %1 >= 12
+    mova                m4, [blockq+2*16+8]
+    mova                m5, [blockq+3*16+8]
+    psrad               m2, 2
+    psrad               m3, 2
+    psrad               m4, 2
+    psrad               m5, 2
+    packssdw            m2, m4
+    packssdw            m3, m5
+%else
+    packssdw            m2, [blockq+2*16+8]
+    packssdw            m3, [blockq+3*16+8]
+    psraw               m2, 2
+    psraw               m3, 2
+%endif
+
+    VP9_IWHT4_1D                            ; first 1-D pass on m0-m3
+    TRANSPOSE4x4W        0, 1, 2, 3, 4
+    VP9_IWHT4_1D                            ; second 1-D pass on the transposed data
+
+    pxor                m6, m6              ; m6 = 0: lower clamp, and zero source for ZERO_BLOCK
+    VP9_STORE_2X         0, 1, 4, 5, 6, 7   ; rows 0-1
+    lea               dstq, [dstq+strideq*2]
+    VP9_STORE_2X         2, 3, 4, 5, 6, 7   ; rows 2-3
+    ZERO_BLOCK      blockq, 16, 4, m6       ; clear the 4x4 dword coefficient block
+    RET
+%endmacro
+
+INIT_MMX mmxext
+IWHT4_FN 10, 1023
+INIT_MMX mmxext
+IWHT4_FN 12, 4095
+
+%macro VP9_IDCT4_WRITEOUT 0 ; round m0-m3 by (x+8)>>4, add to 4 dst rows, clamp to [m4, pw_1023]; m4 must be set by the caller
+%if cpuflag(ssse3)
+    mova                m5, [pw_2048]       ; pmulhrsw by 2048 is (x*2048 + (1<<14))>>15, i.e. (x+8)>>4
+    pmulhrsw            m0, m5
+    pmulhrsw            m1, m5
+    pmulhrsw            m2, m5
+    pmulhrsw            m3, m5
+%else ; no pmulhrsw: explicit add-and-shift
+    mova                m5, [pw_8]
+    paddw               m0, m5
+    paddw               m1, m5
+    paddw               m2, m5
+    paddw               m3, m5
+    psraw               m0, 4
+    psraw               m1, 4
+    psraw               m2, 4
+    psraw               m3, 4
+%endif
+    mova                m5, [pw_1023]       ; upper clamp (10bpp only)
+    VP9_STORE_2X         0,  1,  6,  7,  4,  5
+    lea               dstq, [dstq+2*strideq]
+    VP9_STORE_2X         2,  3,  6,  7,  4,  5
+%endmacro
+
+%macro DC_ONLY 2 ; shift, zero - 32-bit scalar DC path: coefd = DC scaled twice by 11585/2^14, then rounded >> shift
+    mov              coefd, dword [blockq]  ; load the DC coefficient
+    movd          [blockq], %2              ; and clear it in the block using the zero register
+    imul             coefd, 11585           ; first pass: (x * 11585 + 8192) >> 14
+    add              coefd, 8192
+    sar              coefd, 14
+    imul             coefd, 11585           ; second pass, with the final rounding shift folded in
+    add              coefd, ((1 << (%1 - 1)) << 14) + 8192
+    sar              coefd, 14 + %1
+%endmacro
+
+; 4x4 coefficients are 5+depth+sign bits, so for 10bpp, everything still fits
+; in 15+1 words without additional effort, since the coefficients are 15bpp.
+
+%macro IDCT4_10_FN 0 ; emits vp9_idct_idct_4x4_add_10 for the current INIT_* instruction set
+cglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob
+    cmp               eobd, 1
+    jg .idctfull                            ; eob > 1: run the full transform
+
+    ; dc-only
+    pxor                m4, m4              ; m4 = 0: lower clamp and zero source
+%if cpuflag(ssse3)
+    movd                m0, [blockq]
+    movd          [blockq], m4              ; clear the DC coefficient
+    mova                m5, [pw_11585x2]
+    pmulhrsw            m0, m5              ; one multiply per 1-D pass
+    pmulhrsw            m0, m5
+%else
+    DEFINE_ARGS dst, stride, block, coef
+    DC_ONLY              4, m4              ; scalar path, includes the final >>4
+    movd                m0, coefd
+%endif
+    pshufw              m0, m0, 0           ; broadcast the DC word to all four lanes
+    mova                m5, [pw_1023]       ; upper clamp
+%if cpuflag(ssse3)
+    pmulhrsw            m0, [pw_2048]       ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+%endif
+    VP9_STORE_2X         0,  0,  6,  7,  4,  5
+    lea               dstq, [dstq+2*strideq]
+    VP9_STORE_2X         0,  0,  6,  7,  4,  5
+    RET
+
+.idctfull:
+    mova                m0, [blockq+0*16+0] ; load each row's dwords and pack them to words
+    mova                m1, [blockq+1*16+0]
+    packssdw            m0, [blockq+0*16+8]
+    packssdw            m1, [blockq+1*16+8]
+    mova                m2, [blockq+2*16+0]
+    mova                m3, [blockq+3*16+0]
+    packssdw            m2, [blockq+2*16+8]
+    packssdw            m3, [blockq+3*16+8]
+
+%if cpuflag(ssse3)
+    mova                m6, [pw_11585x2]
+%endif
+    mova                m7, [pd_8192]       ; rounding
+    VP9_IDCT4_1D
+    TRANSPOSE4x4W  0, 1, 2, 3, 4
+    VP9_IDCT4_1D
+
+    pxor                m4, m4              ; m4 = 0: zero source here, lower clamp in the writeout
+    ZERO_BLOCK      blockq, 16, 4, m4
+    VP9_IDCT4_WRITEOUT
+    RET
+%endmacro
+
+INIT_MMX mmxext
+IDCT4_10_FN
+INIT_MMX ssse3
+IDCT4_10_FN
+
+%macro IADST4_FN 4 ; name1, 1-D macro stem 1, name2, 1-D macro stem 2 - emits vp9_<name1>_<name2>_4x4_add_10
+cglobal vp9_%1_%3_4x4_add_10, 3, 3, 0, dst, stride, block, eob
+%if WIN64 && notcpuflag(ssse3)
+    WIN64_SPILL_XMM 8
+%endif
+    movdqa            xmm5, [pd_8192]       ; rounding constant, loaded into a full xmm register
+    mova                m0, [blockq+0*16+0] ; load each row's dwords and pack them to words
+    mova                m1, [blockq+1*16+0]
+    packssdw            m0, [blockq+0*16+8]
+    packssdw            m1, [blockq+1*16+8]
+    mova                m2, [blockq+2*16+0]
+    mova                m3, [blockq+3*16+0]
+    packssdw            m2, [blockq+2*16+8]
+    packssdw            m3, [blockq+3*16+8]
+
+%if cpuflag(ssse3)
+    mova                m6, [pw_11585x2]
+%endif
+%ifnidn %1%3, iadstiadst ; skipped only when both passes are iadst
+    movdq2q             m7, xmm5            ; copy the low half of the rounding constant into m7
+%endif
+    VP9_%2_1D                               ; first 1-D pass
+    TRANSPOSE4x4W  0, 1, 2, 3, 4
+    VP9_%4_1D                               ; second 1-D pass
+
+    pxor                m4, m4              ; m4 = 0: zero source here, lower clamp in the writeout
+    ZERO_BLOCK      blockq, 16, 4, m4
+    VP9_IDCT4_WRITEOUT
+    RET
+%endmacro
+
+INIT_MMX sse2
+IADST4_FN idct,  IDCT4,  iadst, IADST4
+IADST4_FN iadst, IADST4, idct,  IDCT4
+IADST4_FN iadst, IADST4, iadst, IADST4
+
+INIT_MMX ssse3
+IADST4_FN idct,  IDCT4,  iadst, IADST4
+IADST4_FN iadst, IADST4, idct,  IDCT4
+IADST4_FN iadst, IADST4, iadst, IADST4
+
+; inputs and outputs are dwords, coefficients are words
+;
+; dst1 = src1 * coef1 + src2 * coef2 + rnd >> 14
+; dst2 = src1 * coef2 - src2 * coef1 + rnd >> 14
+%macro SUMSUB_MUL 6-8 [pd_8192], [pd_3fff] ; src/dst 1-2, tmp1-2, coef1-2, rnd, mask
+    pand               m%3, m%1, %8         ; tmp1 = src1 & mask (low part)
+    pand               m%4, m%2, %8         ; tmp2 = src2 & mask (low part)
+    psrad              m%1, 14              ; src1 >> 14 (high part)
+    psrad              m%2, 14              ; src2 >> 14 (high part)
+    packssdw           m%4, m%2             ; words: src2 low parts | src2 high parts
+    packssdw           m%3, m%1             ; words: src1 low parts | src1 high parts
+    punpckhwd          m%2, m%4, m%3        ; interleaved (src2, src1) high parts
+    punpcklwd          m%4, m%3             ; interleaved (src2, src1) low parts
+    pmaddwd            m%3, m%4, [pw_%6_%5] ; low:  src2*coef2 + src1*coef1
+    pmaddwd            m%1, m%2, [pw_%6_%5] ; high: src2*coef2 + src1*coef1
+    pmaddwd            m%4, [pw_m%5_%6]     ; low:  src1*coef2 - src2*coef1
+    pmaddwd            m%2, [pw_m%5_%6]     ; high: src1*coef2 - src2*coef1
+    paddd              m%3, %7              ; round and shift the low-part products ...
+    paddd              m%4, %7
+    psrad              m%3, 14
+    psrad              m%4, 14
+    paddd              m%1, m%3             ; ... and add them to the high-part products
+    paddd              m%2, m%4
+%endmacro
+
+%macro IDCT4_12BPP_1D 0-8 [pd_8192], [pd_3fff], 0, 1, 2, 3, 4, 5 ; rnd, mask, in/out0-3, tmp0-1
+    SUMSUB_MUL          %3, %5, %7, %8, 11585, 11585, %1, %2 ; in0/in2 pair with coefficients 11585/11585
+    SUMSUB_MUL          %4, %6, %7, %8, 15137,  6270, %1, %2 ; in1/in3 pair with coefficients 15137/6270
+    SUMSUB_BA        d, %4, %3, %7          ; dword butterflies combining the two pairs
+    SUMSUB_BA        d, %6, %5, %7
+    SWAP                %4, %6, %3          ; assemble-time register renaming only, no instructions emitted
+%endmacro
+
+%macro STORE_4x4 6 ; tmp1-2, reg1-2, min, max - add reg1 (rows 0|1) and reg2 (rows 2|3) to four dst rows; needs stride3q
+    movh               m%1, [dstq+strideq*0] ; row 0 -> low half of tmp1
+    movh               m%2, [dstq+strideq*2] ; row 2 -> low half of tmp2
+    movhps             m%1, [dstq+strideq*1] ; row 1 -> high half of tmp1
+    movhps             m%2, [dstq+stride3q ] ; row 3 -> high half of tmp2
+    paddw              m%1, m%3             ; add word residuals
+    paddw              m%2, m%4
+    pmaxsw             m%1, %5              ; clamp below to min
+    pmaxsw             m%2, %5
+    pminsw             m%1, %6              ; clamp above to max
+    pminsw             m%2, %6
+    movh   [dstq+strideq*0], m%1            ; write the four rows back
+    movhps [dstq+strideq*1], m%1
+    movh   [dstq+strideq*2], m%2
+    movhps [dstq+stride3q ], m%2
+%endmacro
+
+%macro ROUND_AND_STORE_4x4 8 ; reg1-4, min, max, rnd, shift - (reg + rnd) >> shift on dwords, pack to words, add to dst
+    paddd              m%1, %7              ; add the rounding bias
+    paddd              m%2, %7
+    paddd              m%3, %7
+    paddd              m%4, %7
+    psrad              m%1, %8              ; arithmetic shift down
+    psrad              m%2, %8
+    psrad              m%3, %8
+    psrad              m%4, %8
+    packssdw           m%1, m%2             ; reg1 = rows 0|1 as saturated words
+    packssdw           m%3, m%4             ; reg3 = rows 2|3 as saturated words
+    STORE_4x4           %2, %4, %1, %3, %5, %6 ; reg2/reg4 are free after packing and serve as temporaries
+%endmacro
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_4x4_add_12, 4, 4, 8, dst, stride, block, eob
+    cmp               eobd, 1
+    jg .idctfull                            ; eob > 1: run the full transform
+
+    ; dc-only - this is special, since for 4x4 12bpp, the max coef size is
+    ; 17+sign bpp. Since the multiply is with 11585, which is 14bpp, the
+    ; result of each multiply is 31+sign bit, i.e. it _exactly_ fits in a
+    ; dword. After the final shift (4), the result is 13+sign bits, so we
+    ; don't need any additional processing to fit it in a word
+    DEFINE_ARGS dst, stride, block, coef
+    pxor                m4, m4              ; m4 = 0: zero source and lower clamp
+    DC_ONLY              4, m4
+    movd                m0, coefd
+    pshuflw             m0, m0, q0000       ; broadcast the DC word across the low half ...
+    punpcklqdq          m0, m0              ; ... and then across the whole register
+    mova                m5, [pw_4095]       ; upper clamp
+    DEFINE_ARGS dst, stride, stride3
+    lea           stride3q, [strideq*3]
+    STORE_4x4            1, 3, 0, 0, m4, m5 ; the same DC value is added to all four rows
+    RET
+
+.idctfull:
+    DEFINE_ARGS dst, stride, block, eob
+    mova                m0, [blockq+0*16]   ; four rows of four dword coefficients
+    mova                m1, [blockq+1*16]
+    mova                m2, [blockq+2*16]
+    mova                m3, [blockq+3*16]
+    mova                m6, [pd_8192]       ; rounding constant kept in a register
+    mova                m7, [pd_3fff]       ; low-14-bit mask kept in a register
+
+    IDCT4_12BPP_1D      m6, m7
+    TRANSPOSE4x4D        0, 1, 2, 3, 4
+    IDCT4_12BPP_1D      m6, m7
+
+    pxor                m4, m4              ; m4 = 0: zero source and lower clamp
+    ZERO_BLOCK      blockq, 16, 4, m4
+
+    ; writeout
+    DEFINE_ARGS dst, stride, stride3
+    lea           stride3q, [strideq*3]
+    mova                m5, [pw_4095]       ; upper clamp
+    mova                m6, [pd_8]          ; rounding bias for the >>4 below
+    ROUND_AND_STORE_4x4  0, 1, 2, 3, m4, m5, m6, 4
+    RET
+
+%macro SCRATCH 3-4 ; reg, x86-64 spare reg, x86-32 memory slot[, name] - park m<reg>; optionally define reg_<name> for it
+%if ARCH_X86_64
+    SWAP                %1, %2              ; x86-64: rename into the spare register, no instruction emitted
+%if %0 == 4
+%define reg_%4 m%2
+%endif
+%else
+    mova              [%3], m%1             ; x86-32: spill to the memory slot
+%if %0 == 4
+%define reg_%4 [%3]
+%endif
+%endif
+%endmacro
+
+%macro UNSCRATCH 3-4 ; reg, x86-64 spare reg, x86-32 memory slot[, name] - counterpart of SCRATCH; undefines reg_<name>
+%if ARCH_X86_64
+    SWAP                %1, %2              ; x86-64: rename back, no instruction emitted
+%else
+    mova               m%1, [%3]            ; x86-32: reload from the memory slot
+%endif
+%if %0 == 4
+%undef reg_%4
+%endif
+%endmacro
+
+%macro PRELOAD 2-3 ; reg, constant[, name] - keep a constant in a register on x86-64, use it from memory on x86-32
+%if ARCH_X86_64
+    mova               m%1, [%2]            ; x86-64: load it; reg_<name> then names the register
+%if %0 == 3
+%define reg_%3 m%1
+%endif
+%elif %0 == 3 ; x86-32: nothing is loaded; reg_<name> is the memory operand itself
+%define reg_%3 [%2]
+%endif
+%endmacro
+
+; out0 =  5283 * in0 + 13377 * in1 + 15212 * in2 +  9929 * in3 + rnd >> 14
+; out1 =  9929 * in0 + 13377 * in1 -  5283 * in2 - 15212 * in3 + rnd >> 14
+; out2 = 13377 * in0               - 13377 * in2 + 13377 * in3 + rnd >> 14
+; out3 = 15212 * in0 - 13377 * in1 +  9929 * in2 -  5283 * in3 + rnd >> 14
+%macro IADST4_12BPP_1D 0-2 [pd_8192], [pd_3fff] ; rnd, mask
+    pand                m4, m0, %2          ; in0 & mask (low part)
+    pand                m5, m1, %2          ; in1 & mask (low part)
+    psrad               m0, 14              ; in0 >> 14 (high part)
+    psrad               m1, 14              ; in1 >> 14 (high part)
+    packssdw            m5, m1
+    packssdw            m4, m0
+    punpckhwd           m1, m4, m5          ; m1 = interleaved (in0, in1) high parts
+    punpcklwd           m4, m5              ; m4 = interleaved (in0, in1) low parts
+    pand                m5, m2, %2          ; same split for in2/in3
+    pand                m6, m3, %2
+    psrad               m2, 14
+    psrad               m3, 14
+    packssdw            m6, m3
+    packssdw            m5, m2
+    punpckhwd           m3, m5, m6          ; m3 = interleaved (in2, in3) high parts
+    punpcklwd           m5, m6              ; m5 = interleaved (in2, in3) low parts
+    SCRATCH              1,  8, rsp+0*mmsize, a ; reg_a = (in0, in1) high parts
+    SCRATCH              5,  9, rsp+1*mmsize, b ; reg_b = (in2, in3) low parts
+
+    ; m1/3 have the high bits of 0,1,2,3
+    ; m4/5 have the low bits of 0,1,2,3
+    ; m0/2/6/7 are free
+
+    mova                m2, [pw_15212_9929]
+    mova                m0, [pw_5283_13377]
+    pmaddwd             m7, m2, reg_b
+    pmaddwd             m6, m4, m0
+    pmaddwd             m2, m3
+    pmaddwd             m0, reg_a
+    paddd               m6, m7              ; out0 sum, low parts
+    paddd               m0, m2              ; out0 sum, high parts
+    mova                m1, [pw_m13377_13377]
+    mova                m5, [pw_13377_0]
+    pmaddwd             m7, m1, reg_b
+    pmaddwd             m2, m4, m5
+    pmaddwd             m1, m3
+    pmaddwd             m5, reg_a
+    paddd               m2, m7              ; out2 sum, low parts
+    paddd               m1, m5              ; out2 sum, high parts
+    paddd               m6, %1              ; round and shift the low-part sums
+    paddd               m2, %1
+    psrad               m6, 14
+    psrad               m2, 14
+    paddd               m0, m6                      ; t0
+    paddd               m2, m1                      ; t2
+
+    mova                m7, [pw_m5283_m15212]
+    mova                m5, [pw_9929_13377]
+    pmaddwd             m1, m7, reg_b
+    pmaddwd             m6, m4, m5
+    pmaddwd             m7, m3
+    pmaddwd             m5, reg_a
+    paddd               m6, m1              ; out1 sum, low parts
+    paddd               m7, m5              ; out1 sum, high parts
+    UNSCRATCH            5,  9, rsp+1*mmsize, b
+    pmaddwd             m5, [pw_9929_m5283]
+    pmaddwd             m4, [pw_15212_m13377]
+    pmaddwd             m3, [pw_9929_m5283]
+    UNSCRATCH            1,  8, rsp+0*mmsize, a
+    pmaddwd             m1, [pw_15212_m13377]
+    paddd               m4, m5              ; out3 sum, low parts
+    paddd               m3, m1              ; out3 sum, high parts
+    paddd               m6, %1              ; round and shift the low-part sums
+    paddd               m4, %1
+    psrad               m6, 14
+    psrad               m4, 14
+    paddd               m7, m6                      ; t1
+    paddd               m3, m4                      ; t3
+
+    SWAP                 1, 7               ; rename so that t1 is in m1
+%endmacro
+
+%macro IADST4_12BPP_FN 4 ; name1, 1-D macro stem 1, name2, 1-D macro stem 2 - emits vp9_<name1>_<name2>_4x4_add_12
+cglobal vp9_%1_%3_4x4_add_12, 3, 3, 12, 2 * ARCH_X86_32 * mmsize, dst, stride, block, eob
+    mova                m0, [blockq+0*16]   ; four rows of four dword coefficients
+    mova                m1, [blockq+1*16]
+    mova                m2, [blockq+2*16]
+    mova                m3, [blockq+3*16]
+
+    PRELOAD             10, pd_8192, rnd
+    PRELOAD             11, pd_3fff, mask
+    %2_12BPP_1D    reg_rnd, reg_mask        ; first 1-D pass
+    TRANSPOSE4x4D        0, 1, 2, 3, 4
+    %4_12BPP_1D    reg_rnd, reg_mask        ; second 1-D pass
+
+    pxor                m4, m4              ; m4 = 0: zero source and lower clamp
+    ZERO_BLOCK      blockq, 16, 4, m4
+
+    ; writeout
+    DEFINE_ARGS dst, stride, stride3
+    lea           stride3q, [strideq*3]
+    mova                m5, [pw_4095]       ; upper clamp
+    mova                m6, [pd_8]          ; rounding bias for the >>4 below
+    ROUND_AND_STORE_4x4  0, 1, 2, 3, m4, m5, m6, 4
+    RET
+%endmacro
+
+INIT_XMM sse2
+IADST4_12BPP_FN idct,  IDCT4,  iadst, IADST4
+IADST4_12BPP_FN iadst, IADST4, idct,  IDCT4
+IADST4_12BPP_FN iadst, IADST4, iadst, IADST4
+
+; the following line has not been executed at the end of this macro:
+; UNSCRATCH            6, 8, rsp+(%5+0)*mmsize
+%macro IDCT8_1D 1-5 [pd_8192], [pd_3fff], 2 * mmsize, 17 ; src, rnd, mask, src_stride, stack_offset
+    mova                m0, [%1+0*%4]       ; even inputs 0/2/4/6
+    mova                m2, [%1+2*%4]
+    mova                m4, [%1+4*%4]
+    mova                m6, [%1+6*%4]
+    IDCT4_12BPP_1D      %2, %3, 0, 2, 4, 6, 1, 3            ; m0/2/4/6 have t0/1/2/3
+    SCRATCH              4, 8, rsp+(%5+0)*mmsize            ; park t2
+    SCRATCH              6, 9, rsp+(%5+1)*mmsize            ; park t3
+    mova                m1, [%1+1*%4]       ; odd inputs 1/3/5/7
+    mova                m3, [%1+3*%4]
+    mova                m5, [%1+5*%4]
+    mova                m7, [%1+7*%4]
+    SUMSUB_MUL           1, 7, 4, 6, 16069,  3196, %2, %3   ; m1=t7a, m7=t4a
+    SUMSUB_MUL           5, 3, 4, 6,  9102, 13623, %2, %3   ; m5=t6a, m3=t5a
+    SUMSUB_BA         d, 3, 7, 4                            ; m3=t4, m7=t5a
+    SUMSUB_BA         d, 5, 1, 4                            ; m5=t7, m1=t6a
+    SUMSUB_MUL           1, 7, 4, 6, 11585, 11585, %2, %3   ; m1=t6, m7=t5
+    SUMSUB_BA         d, 5, 0, 4                            ; m5=out0, m0=out7
+    SUMSUB_BA         d, 1, 2, 4                            ; m1=out1, m2=out6
+    UNSCRATCH            4, 8, rsp+(%5+0)*mmsize            ; restore t2
+    UNSCRATCH            6, 9, rsp+(%5+1)*mmsize            ; restore t3
+    SCRATCH              2, 8, rsp+(%5+0)*mmsize            ; park out6; it is still parked when the macro ends
+    SUMSUB_BA         d, 7, 4, 2                            ; m7=out2, m4=out5
+    SUMSUB_BA         d, 3, 6, 2                            ; m3=out3, m6=out4
+    SWAP                 0, 5, 4, 6, 2, 7                   ; assemble-time register renaming only
+%endmacro
+
+%macro STORE_2x8 5-7 dstq, strideq ; tmp1-2, reg, min, max, dst, stride - add the same reg to two dst rows and clamp
+    mova               m%1, [%6+%7*0]       ; load row 0
+    mova               m%2, [%6+%7*1]       ; load row 1
+    paddw              m%1, m%3             ; both rows get the same word addend
+    paddw              m%2, m%3
+    pmaxsw             m%1, %4              ; clamp below to min
+    pmaxsw             m%2, %4
+    pminsw             m%1, %5              ; clamp above to max
+    pminsw             m%2, %5
+    mova         [%6+%7*0], m%1             ; write both rows back
+    mova         [%6+%7*1], m%2
+%endmacro
+
+; FIXME we can use the intermediate storage (rsp[0-15]) on x86-32 for temp
+; storage also instead of allocating two more stack spaces. This doesn't
+; matter much but it's something...
+INIT_XMM sse2
+cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 14, \
+                                  16 * mmsize + 3 * ARCH_X86_32 * mmsize, \
+                                  dst, stride, block, eob
+    mova                m0, [pw_1023]       ; m0 = upper pixel clamp
+    cmp               eobd, 1
+    jg .idctfull                            ; eob > 1: run the full transform
+
+    ; dc-only - the 10bit version can be done entirely in 32bit, since the max
+    ; coef values are 16+sign bit, and the coef is 14bit, so 30+sign easily
+    ; fits in 32bit
+    DEFINE_ARGS dst, stride, block, coef
+    pxor                m2, m2              ; m2 = 0: zero source and lower clamp
+    DC_ONLY              5, m2
+    movd                m1, coefd
+    pshuflw             m1, m1, q0000       ; broadcast the DC word to all eight lanes
+    punpcklqdq          m1, m1
+    DEFINE_ARGS dst, stride, cnt
+    mov               cntd, 4               ; 4 iterations of 2 rows each
+.loop_dc:
+    STORE_2x8            3, 4, 1, m2, m0
+    lea               dstq, [dstq+strideq*2]
+    dec               cntd
+    jg .loop_dc
+    RET
+
+.idctfull: ; also entered from vp9_idct_idct_8x8_add_12 with m0 = [pw_4095]
+    SCRATCH              0, 12, rsp+16*mmsize, max
+    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
+%if ARCH_X86_64
+    mov            dstbakq, dstq            ; keep the original dst for the second column half
+    movsxd            cntq, cntd            ; cnt shares the eob argument register; widen it for addressing
+%endif
+%ifdef PIC
+    lea               ptrq, [default_8x8]
+    movzx             cntd, byte [ptrq+cntq-1]
+%else
+    movzx             cntd, byte [default_8x8+cntq-1] ; table[eob-1] = number of first-pass iterations
+%endif
+    mov              skipd, 2
+    sub              skipd, cntd            ; skip = 2 - cnt: first-pass iterations that are left out
+    mov               ptrq, rsp             ; intermediate buffer starts at rsp
+    PRELOAD             10, pd_8192, rnd
+    PRELOAD             11, pd_3fff, mask
+    PRELOAD             13, pd_16, srnd
+.loop_1:
+    IDCT8_1D        blockq, reg_rnd, reg_mask
+
+    TRANSPOSE4x4D        0, 1, 2, 3, 6
+    mova  [ptrq+ 0*mmsize], m0
+    mova  [ptrq+ 2*mmsize], m1
+    mova  [ptrq+ 4*mmsize], m2
+    mova  [ptrq+ 6*mmsize], m3
+    UNSCRATCH            6, 8, rsp+17*mmsize ; fetch the output IDCT8_1D left parked
+    TRANSPOSE4x4D        4, 5, 6, 7, 0
+    mova  [ptrq+ 1*mmsize], m4
+    mova  [ptrq+ 3*mmsize], m5
+    mova  [ptrq+ 5*mmsize], m6
+    mova  [ptrq+ 7*mmsize], m7
+    add               ptrq, 8 * mmsize
+    add             blockq, mmsize          ; next 4 dword columns of the input
+    dec               cntd
+    jg .loop_1
+
+    ; zero-pad the remainder (skipped cols)
+    test             skipd, skipd
+    jz .end
+    add              skipd, skipd           ; each .loop_z pass clears half of one iteration's 8*mmsize
+    lea             blockq, [blockq+skipq*(mmsize/2)] ; blockq ends 2*mmsize past its start on every path
+    pxor                m0, m0
+.loop_z:
+    mova   [ptrq+mmsize*0], m0
+    mova   [ptrq+mmsize*1], m0
+    mova   [ptrq+mmsize*2], m0
+    mova   [ptrq+mmsize*3], m0
+    add               ptrq, 4 * mmsize
+    dec              skipd
+    jg .loop_z
+.end:
+
+    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+    lea           stride3q, [strideq*3]
+    mov               cntd, 2               ; second pass: two iterations
+    mov               ptrq, rsp
+.loop_2:
+    IDCT8_1D          ptrq, reg_rnd, reg_mask
+
+    pxor                m6, m6              ; m6 = 0: lower clamp
+    ROUND_AND_STORE_4x4  0, 1, 2, 3, m6, reg_max, reg_srnd, 5
+    lea               dstq, [dstq+strideq*4]
+    UNSCRATCH            0, 8, rsp+17*mmsize ; fetch the output IDCT8_1D left parked
+    UNSCRATCH            1, 12, rsp+16*mmsize, max
+    UNSCRATCH            2, 13, pd_16, srnd
+    ROUND_AND_STORE_4x4  4, 5, 0, 7, m6, m1, m2, 5
+    add               ptrq, 16              ; next 4 dword columns of the intermediate buffer
+%if ARCH_X86_64
+    lea               dstq, [dstbakq+8]     ; back to the top rows, 8 bytes to the right
+%else
+    mov               dstq, dstm
+    add               dstq, 8
+%endif
+    dec               cntd
+    jg .loop_2
+
+    ; m6 is still zero
+    ZERO_BLOCK blockq-2*mmsize, 32, 8, m6
+    RET
+
+%macro DC_ONLY_64BIT 2 ; shift, zero - like DC_ONLY but with intermediates that may exceed 32 bits
+%if ARCH_X86_64
+    movsxd           coefq, dword [blockq]  ; sign-extend the DC coefficient to 64 bits
+    movd          [blockq], %2              ; and clear it in the block
+    imul             coefq, 11585           ; first pass: (x * 11585 + 8192) >> 14
+    add              coefq, 8192
+    sar              coefq, 14
+    imul             coefq, 11585           ; second pass, with the final rounding shift folded in
+    add              coefq, ((1 << (%1 - 1)) << 14) + 8192
+    sar              coefq, 14 + %1
+%else ; x86-32: split each multiply into a high part and a 14-bit low part
+    mov              coefd, dword [blockq]
+    movd          [blockq], %2
+    DEFINE_ARGS dst, stride, cnt, coef, coefl
+    mov               cntd, 2               ; two passes of the 11585 scaling
+.loop_dc_calc:
+    mov             coefld, coefd
+    sar              coefd, 14              ; high part
+    and             coefld, 0x3fff          ; low 14 bits
+    imul             coefd, 11585
+    imul            coefld, 11585
+    add             coefld, 8192            ; only the low-part product is rounded and shifted
+    sar             coefld, 14
+    add              coefd, coefld
+    dec               cntd
+    jg .loop_dc_calc
+    add              coefd, 1 << (%1 - 1)   ; final rounding shift
+    sar              coefd, %1
+%endif
+%endmacro
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 14, \
+                                  16 * mmsize + 3 * ARCH_X86_32 * mmsize, \
+                                  dst, stride, block, eob
+    mova                m0, [pw_4095]       ; m0 = upper pixel clamp
+    cmp               eobd, 1
+    jg mangle(private_prefix %+ _ %+ vp9_idct_idct_8x8_add_10 %+ SUFFIX).idctfull ; share the 10bpp full path
+
+    ; dc-only - unfortunately, this one can overflow, since coefs are 18+sign
+    ; bpp, and 18+14+sign does not fit in 32bit, so we do 2-stage multiplies
+    DEFINE_ARGS dst, stride, block, coef, coefl
+    pxor                m2, m2              ; m2 = 0: zero source and lower clamp
+    DC_ONLY_64BIT        5, m2
+    movd                m1, coefd
+    pshuflw             m1, m1, q0000       ; broadcast the DC word to all eight lanes
+    punpcklqdq          m1, m1
+    DEFINE_ARGS dst, stride, cnt
+    mov               cntd, 4               ; 4 iterations of 2 rows each
+.loop_dc:
+    STORE_2x8            3, 4, 1, m2, m0
+    lea               dstq, [dstq+strideq*2]
+    dec               cntd
+    jg .loop_dc
+    RET
+
+; inputs and outputs are dwords, coefficients are words
+;
+; dst1[hi]:dst3[lo] = src1 * coef1 + src2 * coef2
+; dst2[hi]:dst4[lo] = src1 * coef2 - src2 * coef1
+%macro SUMSUB_MUL_D 6-7 [pd_3fff] ; src/dst 1-2, dst3-4, coef1-2, mask
+    pand               m%3, m%1, %7         ; src1 & mask (low part)
+    pand               m%4, m%2, %7         ; src2 & mask (low part)
+    psrad              m%1, 14              ; src1 >> 14 (high part)
+    psrad              m%2, 14              ; src2 >> 14 (high part)
+    packssdw           m%4, m%2             ; words: src2 low parts | src2 high parts
+    packssdw           m%3, m%1             ; words: src1 low parts | src1 high parts
+    punpckhwd          m%2, m%4, m%3        ; interleaved (src2, src1) high parts
+    punpcklwd          m%4, m%3             ; interleaved (src2, src1) low parts
+    pmaddwd            m%3, m%4, [pw_%6_%5] ; dst3 = low part of the sum; left unrounded and unshifted
+    pmaddwd            m%1, m%2, [pw_%6_%5] ; dst1 = high part of the sum
+    pmaddwd            m%4, [pw_m%5_%6]     ; dst4 = low part of the difference
+    pmaddwd            m%2, [pw_m%5_%6]     ; dst2 = high part of the difference
+%endmacro
+
+; dst1 = src2[hi]:src4[lo] + src1[hi]:src3[lo] + rnd >> 14
+; dst2 = src2[hi]:src4[lo] - src1[hi]:src3[lo] + rnd >> 14
+%macro SUMSUB_PACK_D 5-6 [pd_8192] ; src/dst 1-2, src3-4, tmp, rnd
+    SUMSUB_BA        d, %1, %2, %5          ; butterfly on the high parts
+    SUMSUB_BA        d, %3, %4, %5          ; butterfly on the low parts
+    paddd              m%3, %6              ; round and shift the low parts ...
+    paddd              m%4, %6
+    psrad              m%3, 14
+    psrad              m%4, 14
+    paddd              m%1, m%3             ; ... and fold them into the high parts
+    paddd              m%2, m%4
+%endmacro
+
+%macro NEGD 1 ; reg - negate each dword in place
+%if cpuflag(ssse3)
+    psignd              %1, [pw_m1]         ; all-ones words read as dwords of -1, so psignd negates
+%else
+    pxor                %1, [pw_m1]         ; two's complement by hand: invert all bits ...
+    paddd               %1, [pd_1]          ; ... then add 1
+%endif
+%endmacro
+
+; the following line has not been executed at the end of this macro:
+; UNSCRATCH            6, 8, rsp+17*mmsize
+%macro IADST8_1D 1-3 [pd_8192], [pd_3fff] ; src, rnd, mask
+    mova                m0, [%1+ 0*mmsize]
+    mova                m3, [%1+ 6*mmsize]
+    mova                m4, [%1+ 8*mmsize]
+    mova                m7, [%1+14*mmsize]
+    SUMSUB_MUL_D         7, 0, 1, 2, 16305,  1606, %3   ; m7/1=t0a, m0/2=t1a
+    SUMSUB_MUL_D         3, 4, 5, 6, 10394, 12665, %3   ; m3/5=t4a, m4/6=t5a
+    SCRATCH              0, 8, rsp+17*mmsize            ; park t1a's high part so m0 can be the tmp
+    SUMSUB_PACK_D        3, 7, 5, 1, 0, %2              ; m3=t0, m7=t4
+    UNSCRATCH            0, 8, rsp+17*mmsize
+    SUMSUB_PACK_D        4, 0, 6, 2, 1, %2              ; m4=t1, m0=t5
+
+    SCRATCH              3, 8, rsp+17*mmsize            ; park t0
+    SCRATCH              4, 9, rsp+18*mmsize            ; park t1
+    SCRATCH              7, 10, rsp+19*mmsize           ; park t4
+    SCRATCH              0, 11, rsp+20*mmsize           ; park t5
+
+    mova                m1, [%1+ 2*mmsize]
+    mova                m2, [%1+ 4*mmsize]
+    mova                m5, [%1+10*mmsize]
+    mova                m6, [%1+12*mmsize]
+    SUMSUB_MUL_D         5, 2, 3, 4, 14449,  7723, %3   ; m5/8=t2a, m2/9=t3a
+    SUMSUB_MUL_D         1, 6, 7, 0,  4756, 15679, %3   ; m1/10=t6a, m6/11=t7a
+    SCRATCH              2, 12, rsp+21*mmsize           ; free m2 for use as the tmp
+    SUMSUB_PACK_D        1, 5, 7, 3, 2, %2              ; m1=t2, m5=t6
+    UNSCRATCH            2, 12, rsp+21*mmsize
+    SUMSUB_PACK_D        6, 2, 0, 4, 3, %2              ; m6=t3, m2=t7
+
+    UNSCRATCH            7, 10, rsp+19*mmsize           ; restore t4
+    UNSCRATCH            0, 11, rsp+20*mmsize           ; restore t5
+    SCRATCH              1, 10, rsp+19*mmsize           ; park t2
+    SCRATCH              6, 11, rsp+20*mmsize           ; park t3
+
+    SUMSUB_MUL_D         7, 0, 3, 4, 15137,  6270, %3   ; m7/8=t4a, m0/9=t5a
+    SUMSUB_MUL_D         2, 5, 1, 6,  6270, 15137, %3   ; m2/10=t7a, m5/11=t6a
+    SCRATCH              2, 12, rsp+21*mmsize           ; free m2 for use as the tmp
+    SUMSUB_PACK_D        5, 7, 6, 3, 2, %2              ; m5=-out1, m7=t6
+    UNSCRATCH            2, 12, rsp+21*mmsize
+    NEGD                m5                              ; m5=out1
+    SUMSUB_PACK_D        2, 0, 1, 4, 3, %2              ; m2=out6, m0=t7
+    SUMSUB_MUL           7, 0, 3, 4, 11585, 11585, %2, %3   ; m7=out2, m0=-out5
+    NEGD                m0                              ; m0=out5
+
+    UNSCRATCH            3, 8, rsp+17*mmsize            ; restore t0
+    UNSCRATCH            4, 9, rsp+18*mmsize            ; restore t1
+    UNSCRATCH            1, 10, rsp+19*mmsize           ; restore t2
+    UNSCRATCH            6, 11, rsp+20*mmsize           ; restore t3
+    SCRATCH              2, 8, rsp+17*mmsize            ; park out6; it is still parked when the macro ends
+    SCRATCH              0, 9, rsp+18*mmsize            ; park out5
+
+    SUMSUB_BA         d, 1, 3,  2                       ; m1=out0, m3=t2
+    SUMSUB_BA         d, 6, 4,  2                       ; m6=-out7, m4=t3
+    NEGD                m6                              ; m6=out7
+    SUMSUB_MUL           3, 4,  2,  0, 11585, 11585, %2, %3 ; m3=-out3, m4=out4
+    NEGD                m3                              ; m3=out3
+
+    UNSCRATCH            0, 9, rsp+18*mmsize            ; restore out5
+
+    SWAP                 0, 1, 5                        ; assemble-time register renaming only
+    SWAP                 2, 7, 6
+%endmacro
+
+%macro IADST8_FN 5
+cglobal vp9_%1_%3_8x8_add_10, 4, 6 + ARCH_X86_64, 16, \
+                              16 * mmsize + ARCH_X86_32 * 6 * mmsize, \
+                              dst, stride, block, eob
+    mova                m0, [pw_1023]
+
+.body:
+    SCRATCH              0, 13, rsp+16*mmsize, max
+    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
+%if ARCH_X86_64
+    mov            dstbakq, dstq
+    movsxd            cntq, cntd
+%endif
+%ifdef PIC
+    lea               ptrq, [%5_8x8]
+    movzx             cntd, byte [ptrq+cntq-1]
+%else
+    movzx             cntd, byte [%5_8x8+cntq-1]
+%endif
+    mov              skipd, 2
+    sub              skipd, cntd
+    mov               ptrq, rsp
+    PRELOAD             14, pd_8192, rnd
+    PRELOAD             15, pd_3fff, mask
+.loop_1:
+    %2_1D           blockq, reg_rnd, reg_mask
+
+    TRANSPOSE4x4D        0, 1, 2, 3, 6
+    mova  [ptrq+ 0*mmsize], m0
+    mova  [ptrq+ 2*mmsize], m1
+    mova  [ptrq+ 4*mmsize], m2
+    mova  [ptrq+ 6*mmsize], m3
+    UNSCRATCH            6, 8, rsp+17*mmsize
+    TRANSPOSE4x4D        4, 5, 6, 7, 0
+    mova  [ptrq+ 1*mmsize], m4
+    mova  [ptrq+ 3*mmsize], m5
+    mova  [ptrq+ 5*mmsize], m6
+    mova  [ptrq+ 7*mmsize], m7
+    add               ptrq, 8 * mmsize
+    add             blockq, mmsize
+    dec               cntd
+    jg .loop_1
+
+    ; zero-pad the remainder (skipped cols)
+    test             skipd, skipd
+    jz .end
+    add              skipd, skipd
+    lea             blockq, [blockq+skipq*(mmsize/2)]
+    pxor                m0, m0
+.loop_z:
+    mova   [ptrq+mmsize*0], m0
+    mova   [ptrq+mmsize*1], m0
+    mova   [ptrq+mmsize*2], m0
+    mova   [ptrq+mmsize*3], m0
+    add               ptrq, 4 * mmsize
+    dec              skipd
+    jg .loop_z
+.end:
+
+    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+    lea           stride3q, [strideq*3]
+    mov               cntd, 2
+    mov               ptrq, rsp
+.loop_2:
+    %4_1D             ptrq, reg_rnd, reg_mask
+
+    pxor                m6, m6
+    PRELOAD              9, pd_16, srnd
+    ROUND_AND_STORE_4x4  0, 1, 2, 3, m6, reg_max, reg_srnd, 5
+    lea               dstq, [dstq+strideq*4]
+    UNSCRATCH            0, 8, rsp+17*mmsize
+    UNSCRATCH            1, 13, rsp+16*mmsize, max
+    UNSCRATCH            2, 9, pd_16, srnd
+    ROUND_AND_STORE_4x4  4, 5, 0, 7, m6, m1, m2, 5
+    add               ptrq, 16
+%if ARCH_X86_64
+    lea               dstq, [dstbakq+8]
+%else
+    mov               dstq, dstm
+    add               dstq, 8
+%endif
+    dec               cntd
+    jg .loop_2
+
+    ; m6 is still zero
+    ZERO_BLOCK blockq-2*mmsize, 32, 8, m6
+    RET
+
+cglobal vp9_%1_%3_8x8_add_12, 4, 6 + ARCH_X86_64, 16, \
+                              16 * mmsize + ARCH_X86_32 * 6 * mmsize, \
+                              dst, stride, block, eob
+    mova                m0, [pw_4095]
+    jmp mangle(private_prefix %+ _ %+ vp9_%1_%3_8x8_add_10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2 ; 8x8 mixed transforms. IADST8_FN args: name1, pass-1 1D macro prefix, name2, pass-2 1D macro prefix, eob->count table prefix (<arg5>_8x8)
+IADST8_FN idct,  IDCT8,  iadst, IADST8, row ; defines vp9_idct_iadst_8x8_add_10 and _12
+IADST8_FN iadst, IADST8, idct,  IDCT8,  col ; defines vp9_iadst_idct_8x8_add_10 and _12
+IADST8_FN iadst, IADST8, iadst, IADST8, default ; defines vp9_iadst_iadst_8x8_add_10 and _12
+
+%macro IDCT16_1D 1-4 4 * mmsize, 65, 67 ; one 1D 16-point IDCT pass. args: src, src_stride (default 4*mmsize), stack_offset (default slot 65), mm32bit_stack_offset (default slot 67)
+    IDCT8_1D            %1, [pd_8192], [pd_3fff], %2 * 2, %4    ; m0-3=t0-3a, m4-5/m8|r67/m7=t4-7
+    ; SCRATCH            6, 8, rsp+(%4+0)*mmsize    ; t6 (kept for reference only; presumably already done inside IDCT8_1D - TODO confirm)
+    SCRATCH              0, 15, rsp+(%4+7)*mmsize   ; t0a
+    SCRATCH              1, 14, rsp+(%4+6)*mmsize   ; t1a
+    SCRATCH              2, 13, rsp+(%4+5)*mmsize   ; t2a
+    SCRATCH              3, 12, rsp+(%4+4)*mmsize   ; t3a
+    SCRATCH              4, 11, rsp+(%4+3)*mmsize   ; t4
+    mova [rsp+(%3+0)*mmsize], m5                    ; t5
+    mova [rsp+(%3+1)*mmsize], m7                    ; t7
+    ; odd-indexed inputs, first group: in1/in7/in9/in15
+    mova                m0, [%1+ 1*%2]              ; in1
+    mova                m3, [%1+ 7*%2]              ; in7
+    mova                m4, [%1+ 9*%2]              ; in9
+    mova                m7, [%1+15*%2]              ; in15
+
+    SUMSUB_MUL           0, 7, 1, 2, 16305,  1606   ; m0=t15a, m7=t8a
+    SUMSUB_MUL           4, 3, 1, 2, 10394, 12665   ; m4=t14a, m3=t9a
+    SUMSUB_BA         d, 3, 7, 1                    ; m3=t8, m7=t9
+    SUMSUB_BA         d, 4, 0, 1                    ; m4=t15,m0=t14
+    SUMSUB_MUL           0, 7, 1, 2, 15137,  6270   ; m0=t14a, m7=t9a
+    ; odd-indexed inputs, second group: in3/in5/in11/in13
+    mova                m1, [%1+ 3*%2]              ; in3
+    mova                m2, [%1+ 5*%2]              ; in5
+    mova                m5, [%1+11*%2]              ; in11
+    mova                m6, [%1+13*%2]              ; in13
+    ; park t14a/t9a so that m0/m7 can be passed as the 3rd/4th SUMSUB_MUL operands below
+    SCRATCH              0,  9, rsp+(%4+1)*mmsize
+    SCRATCH              7, 10, rsp+(%4+2)*mmsize
+
+    SUMSUB_MUL           2, 5, 0, 7, 14449,  7723   ; m2=t13a, m5=t10a
+    SUMSUB_MUL           6, 1, 0, 7,  4756, 15679   ; m6=t12a, m1=t11a
+    SUMSUB_BA         d, 5, 1, 0                    ; m5=t11,m1=t10
+    SUMSUB_BA         d, 2, 6, 0                    ; m2=t12,m6=t13
+    NEGD                m1                          ; m1=-t10
+    SUMSUB_MUL           1, 6, 0, 7, 15137,  6270   ; m1=t13a, m6=t10a
+
+    UNSCRATCH            7, 10, rsp+(%4+2)*mmsize
+    SUMSUB_BA         d, 5, 3, 0                    ; m5=t8a, m3=t11a
+    SUMSUB_BA         d, 6, 7, 0                    ; m6=t9,  m7=t10
+    SUMSUB_BA         d, 2, 4, 0                    ; m2=t15a,m4=t12a
+    SCRATCH              5, 10, rsp+(%4+2)*mmsize
+    SUMSUB_MUL           4, 3, 0, 5, 11585, 11585   ; m4=t12, m3=t11
+    UNSCRATCH            0, 9, rsp+(%4+1)*mmsize
+    SUMSUB_BA         d, 1, 0, 5                    ; m1=t14, m0=t13
+    SCRATCH              6, 9, rsp+(%4+1)*mmsize
+    SUMSUB_MUL           0, 7, 6, 5, 11585, 11585   ; m0=t13a,m7=t10a
+
+    ; order: 15|r74,14|r73,13|r72,12|r71,11|r70,r65,8|r67,r66,10|r69,9|r68,7,3,4,0,1,2
+    ; free: 6,5
+    ; last stage: pair each odd-half value with the even-half value parked at the top of the macro
+    UNSCRATCH            5, 15, rsp+(%4+7)*mmsize
+    SUMSUB_BA         d, 2, 5, 6                    ; m2=out0, m5=out15
+    SCRATCH              5, 15, rsp+(%4+7)*mmsize
+    UNSCRATCH            5, 14, rsp+(%4+6)*mmsize
+    SUMSUB_BA         d, 1, 5, 6                    ; m1=out1, m5=out14
+    SCRATCH              5, 14, rsp+(%4+6)*mmsize
+    UNSCRATCH            5, 13, rsp+(%4+5)*mmsize
+    SUMSUB_BA         d, 0, 5, 6                    ; m0=out2, m5=out13
+    SCRATCH              5, 13, rsp+(%4+5)*mmsize
+    UNSCRATCH            5, 12, rsp+(%4+4)*mmsize
+    SUMSUB_BA         d, 4, 5, 6                    ; m4=out3, m5=out12
+    SCRATCH              5, 12, rsp+(%4+4)*mmsize
+    UNSCRATCH            5, 11, rsp+(%4+3)*mmsize
+    SUMSUB_BA         d, 3, 5, 6                    ; m3=out4, m5=out11
+    SCRATCH              4, 11, rsp+(%4+3)*mmsize
+    mova                m4, [rsp+(%3+0)*mmsize]
+    SUMSUB_BA         d, 7, 4, 6                    ; m7=out5, m4=out10
+    mova [rsp+(%3+0)*mmsize], m5
+    UNSCRATCH            5, 8, rsp+(%4+0)*mmsize
+    UNSCRATCH            6, 9, rsp+(%4+1)*mmsize
+    SCRATCH              2, 8, rsp+(%4+0)*mmsize
+    SCRATCH              1, 9, rsp+(%4+1)*mmsize
+    UNSCRATCH            1, 10, rsp+(%4+2)*mmsize
+    SCRATCH              0, 10, rsp+(%4+2)*mmsize
+    mova                m0, [rsp+(%3+1)*mmsize]
+    SUMSUB_BA         d, 6, 5, 2                    ; m6=out6, m5=out9
+    SUMSUB_BA         d, 1, 0, 2                    ; m1=out7, m0=out8
+
+    SWAP                 0, 3, 1, 7, 2, 6, 4 ; rename registers into the output order listed below
+
+    ; output order: 8-11|r67-70=out0-3
+    ;               0-6,r65=out4-11
+    ;               12-15|r71-74=out12-15
+%endmacro
+
+INIT_XMM sse2 ; 16x16 IDCT in both passes, result combined into dst via the store macros; eob <= 1 takes the dc-only shortcut
+cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
+                                    67 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+                                    dst, stride, block, eob
+    mova                m0, [pw_1023] ; presumably the 10-bit pixel maximum, handed to the store macros for clipping - TODO confirm
+    cmp               eobd, 1
+    jg .idctfull ; eob > 1: run the full two-pass transform
+
+    ; dc-only - the 10bit version can be done entirely in 32bit, since the max
+    ; coef values are 17+sign bit, and the coef is 14bit, so 31+sign easily
+    ; fits in 32bit
+    DEFINE_ARGS dst, stride, block, coef
+    pxor                m2, m2
+    DC_ONLY              6, m2
+    movd                m1, coefd
+    pshuflw             m1, m1, q0000 ; replicate the low word of coef across the low 4 words
+    punpcklqdq          m1, m1 ; ... and then across all 8 words of m1
+    DEFINE_ARGS dst, stride, cnt
+    mov               cntd, 8 ; 8 iterations x 2 rows per iteration = 16 rows
+.loop_dc:
+    STORE_2x8            3, 4, 1, m2, m0, dstq,         mmsize
+    STORE_2x8            3, 4, 1, m2, m0, dstq+strideq, mmsize
+    lea               dstq, [dstq+strideq*2]
+    dec               cntd
+    jg .loop_dc
+    RET
+
+.idctfull: ; also entered from vp9_idct_idct_16x16_add_12 with m0 = pw_4095
+    mova   [rsp+64*mmsize], m0 ; keep m0 in slot 64, since IDCT16_1D reuses m0
+    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak ; cnt takes over the register that held eob
+%if ARCH_X86_64
+    mov            dstbakq, dstq
+    movsxd            cntq, cntd
+%endif
+%ifdef PIC
+    lea               ptrq, [default_16x16]
+    movzx             cntd, byte [ptrq+cntq-1] ; cnt = default_16x16[eob-1] = number of pass-1 iterations
+%else
+    movzx             cntd, byte [default_16x16+cntq-1] ; cnt = default_16x16[eob-1] = number of pass-1 iterations
+%endif
+    mov              skipd, 4
+    sub              skipd, cntd ; skip = 4 - cnt: pass-1 iterations replaced by zero-fill below
+    mov               ptrq, rsp ; pass-1 output buffer starts at rsp
+.loop_1: ; pass 1: one IDCT16_1D per iteration, outputs transposed in 4x4 dword tiles into the buffer at ptrq
+    IDCT16_1D       blockq
+
+    TRANSPOSE4x4D        0, 1, 2, 3, 7
+    mova  [ptrq+ 1*mmsize], m0
+    mova  [ptrq+ 5*mmsize], m1
+    mova  [ptrq+ 9*mmsize], m2
+    mova  [ptrq+13*mmsize], m3
+    mova                m7, [rsp+65*mmsize] ; out11 lives in slot 65 (see IDCT16_1D output order)
+    TRANSPOSE4x4D        4, 5, 6, 7, 0
+    mova  [ptrq+ 2*mmsize], m4
+    mova  [ptrq+ 6*mmsize], m5
+    mova  [ptrq+10*mmsize], m6
+    mova  [ptrq+14*mmsize], m7
+    UNSCRATCH               0, 8, rsp+67*mmsize
+    UNSCRATCH               1, 9, rsp+68*mmsize
+    UNSCRATCH               2, 10, rsp+69*mmsize
+    UNSCRATCH               3, 11, rsp+70*mmsize
+    TRANSPOSE4x4D        0, 1, 2, 3, 7
+    mova  [ptrq+ 0*mmsize], m0
+    mova  [ptrq+ 4*mmsize], m1
+    mova  [ptrq+ 8*mmsize], m2
+    mova  [ptrq+12*mmsize], m3
+    UNSCRATCH               4, 12, rsp+71*mmsize
+    UNSCRATCH               5, 13, rsp+72*mmsize
+    UNSCRATCH               6, 14, rsp+73*mmsize
+    UNSCRATCH               7, 15, rsp+74*mmsize
+    TRANSPOSE4x4D        4, 5, 6, 7, 0
+    mova  [ptrq+ 3*mmsize], m4
+    mova  [ptrq+ 7*mmsize], m5
+    mova  [ptrq+11*mmsize], m6
+    mova  [ptrq+15*mmsize], m7
+    add               ptrq, 16 * mmsize
+    add             blockq, mmsize
+    dec               cntd
+    jg .loop_1
+
+    ; zero-pad the remainder (skipped cols)
+    test             skipd, skipd
+    jz .end
+    add              skipd, skipd ; 2*skip iterations of 8*mmsize = skip * 16*mmsize, matching the loop_1 step
+    lea             blockq, [blockq+skipq*(mmsize/2)] ; advance blockq by skip*mmsize, so it ends at block start + 4*mmsize
+    pxor                m0, m0
+.loop_z:
+    mova   [ptrq+mmsize*0], m0
+    mova   [ptrq+mmsize*1], m0
+    mova   [ptrq+mmsize*2], m0
+    mova   [ptrq+mmsize*3], m0
+    mova   [ptrq+mmsize*4], m0
+    mova   [ptrq+mmsize*5], m0
+    mova   [ptrq+mmsize*6], m0
+    mova   [ptrq+mmsize*7], m0
+    add               ptrq, 8 * mmsize
+    dec              skipd
+    jg .loop_z
+.end:
+
+    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+    lea           stride3q, [strideq*3]
+    mov               cntd, 4
+    mov               ptrq, rsp
+.loop_2: ; pass 2: 4 iterations; dst advances by 8 bytes per iteration
+    IDCT16_1D         ptrq
+
+    pxor               m7, m7
+    lea               dstq, [dstq+strideq*4]
+    ROUND_AND_STORE_4x4  0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6 ; out4-7, stored at dst + 4*stride
+    lea               dstq, [dstq+strideq*4]
+    mova                m0, [rsp+65*mmsize]
+    mova                m1, [rsp+64*mmsize]
+    mova                m2, [pd_32]
+    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, m1, m2, 6 ; out8-11, stored at dst + 8*stride
+
+%if ARCH_X86_64
+    DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst ; swap the names so "dstq" refers to the saved base pointer
+%else
+    mov               dstq, dstm
+%endif
+    UNSCRATCH               0, 8, rsp+67*mmsize
+    UNSCRATCH               4, 9, rsp+68*mmsize
+    UNSCRATCH               5, 10, rsp+69*mmsize
+    UNSCRATCH               3, 11, rsp+70*mmsize
+    ROUND_AND_STORE_4x4  0, 4, 5, 3, m7, m1, m2, 6 ; out0-3, stored at the base dst
+%if ARCH_X86_64
+    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+    lea               dstq, [dstbakq+stride3q*4] ; base + 12*stride
+%else
+    lea               dstq, [dstq+stride3q*4]
+%endif
+    UNSCRATCH               4, 12, rsp+71*mmsize
+    UNSCRATCH               5, 13, rsp+72*mmsize
+    UNSCRATCH               6, 14, rsp+73*mmsize
+    UNSCRATCH               0, 15, rsp+74*mmsize
+    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, m1, m2, 6 ; out12-15, stored at dst + 12*stride
+
+    add               ptrq, mmsize
+%if ARCH_X86_64
+    add            dstbakq, 8
+    mov               dstq, dstbakq
+%else
+    add         dword dstm, 8
+    mov               dstq, dstm
+%endif
+    dec               cntd
+    jg .loop_2
+
+    ; m7 is still zero
+    ZERO_BLOCK blockq-4*mmsize, 64, 16, m7 ; blockq sits 4*mmsize past the block start after pass 1 (cnt + skip = 4)
+    RET
+
+INIT_XMM sse2 ; 12-bit entry point: own dc-only path, full transform shared with the _10 function (same stack size in both cglobal lines)
+cglobal vp9_idct_idct_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \
+                                    67 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+                                    dst, stride, block, eob
+    mova                m0, [pw_4095] ; 12-bit counterpart of the pw_1023 loaded by the _10 function
+    cmp               eobd, 1
+    jg mangle(private_prefix %+ _ %+ vp9_idct_idct_16x16_add_10 %+ SUFFIX).idctfull ; eob > 1: reuse the _10 full path; only m0 differs
+
+    ; dc-only - unfortunately, this one can overflow, since coefs are 19+sign
+    ; bpp, and 19+14+sign does not fit in 32bit, so we do 2-stage multiplies
+    DEFINE_ARGS dst, stride, block, coef, coefl
+    pxor                m2, m2
+    DC_ONLY_64BIT        6, m2
+    movd                m1, coefd
+    pshuflw             m1, m1, q0000 ; replicate the low word of coef across the low 4 words
+    punpcklqdq          m1, m1 ; ... and then across all 8 words of m1
+    DEFINE_ARGS dst, stride, cnt
+    mov               cntd, 8 ; 8 iterations x 2 rows per iteration = 16 rows
+.loop_dc:
+    STORE_2x8            3, 4, 1, m2, m0, dstq,         mmsize
+    STORE_2x8            3, 4, 1, m2, m0, dstq+strideq, mmsize
+    lea               dstq, [dstq+strideq*2]
+    dec               cntd
+    jg .loop_dc
+    RET
+
+; IADST16_1D: one 1D 16-input pass (in0..in15 read at a fixed 4*mmsize stride). r65-69 are available for spills
+; r70-77 are available on x86-32 only (x86-64 should use m8-15)
+; output should be in m8-11|r70-73, m0-6,r65 and m12-15|r74-77
+%macro IADST16_1D 1 ; src
+    mova                m0, [%1+ 0*4*mmsize]        ; in0
+    mova                m1, [%1+ 7*4*mmsize]        ; in7
+    mova                m2, [%1+ 8*4*mmsize]        ; in8
+    mova                m3, [%1+15*4*mmsize]        ; in15
+    SUMSUB_MUL_D         3, 0, 4, 5, 16364,  804    ; m3/4=t0, m0/5=t1
+    SUMSUB_MUL_D         1, 2, 6, 7, 11003, 12140   ; m1/6=t8, m2/7=t9
+    SCRATCH              0, 8, rsp+70*mmsize
+    SUMSUB_PACK_D        1, 3, 6, 4, 0              ; m1=t0a, m3=t8a
+    UNSCRATCH            0, 8, rsp+70*mmsize
+    SUMSUB_PACK_D        2, 0, 7, 5, 4              ; m2=t1a, m0=t9a
+    mova   [rsp+67*mmsize], m1
+    SCRATCH              2, 9, rsp+71*mmsize
+    SCRATCH              3, 12, rsp+74*mmsize
+    SCRATCH              0, 13, rsp+75*mmsize
+    ; NOTE(review): "mA/B=tN" above presumably means tN is split across two registers until SUMSUB_PACK_D - confirm in the macro definitions
+    mova                m0, [%1+ 3*4*mmsize]        ; in3
+    mova                m1, [%1+ 4*4*mmsize]        ; in4
+    mova                m2, [%1+11*4*mmsize]        ; in11
+    mova                m3, [%1+12*4*mmsize]        ; in12
+    SUMSUB_MUL_D         2, 1, 4, 5, 14811,  7005   ; m2/4=t4, m1/5=t5
+    SUMSUB_MUL_D         0, 3, 6, 7,  5520, 15426   ; m0/6=t12, m3/7=t13
+    SCRATCH              1, 10, rsp+72*mmsize
+    SUMSUB_PACK_D        0, 2, 6, 4, 1              ; m0=t4a, m2=t12a
+    UNSCRATCH            1, 10, rsp+72*mmsize
+    SUMSUB_PACK_D        3, 1, 7, 5, 4              ; m3=t5a, m1=t13a
+    SCRATCH              0, 15, rsp+77*mmsize
+    SCRATCH              3, 11, rsp+73*mmsize
+
+    UNSCRATCH            0, 12, rsp+74*mmsize       ; t8a
+    UNSCRATCH            3, 13, rsp+75*mmsize       ; t9a
+    SUMSUB_MUL_D         0, 3, 4, 5, 16069,  3196   ; m0/4=t8, m3/5=t9
+    SUMSUB_MUL_D         1, 2, 6, 7,  3196, 16069   ; m1/6=t13, m2/7=t12
+    SCRATCH              1, 12, rsp+74*mmsize
+    SUMSUB_PACK_D        2, 0, 7, 4, 1              ; m2=t8a, m0=t12a
+    UNSCRATCH            1, 12, rsp+74*mmsize
+    SUMSUB_PACK_D        1, 3, 6, 5, 4              ; m1=t9a, m3=t13a
+    mova   [rsp+65*mmsize], m2
+    mova   [rsp+66*mmsize], m1
+    SCRATCH              0, 8, rsp+70*mmsize
+    SCRATCH              3, 12, rsp+74*mmsize
+
+    mova                m0, [%1+ 2*4*mmsize]        ; in2
+    mova                m1, [%1+ 5*4*mmsize]        ; in5
+    mova                m2, [%1+10*4*mmsize]        ; in10
+    mova                m3, [%1+13*4*mmsize]        ; in13
+    SUMSUB_MUL_D         3, 0, 4, 5, 15893,  3981   ; m3/4=t2, m0/5=t3
+    SUMSUB_MUL_D         1, 2, 6, 7,  8423, 14053   ; m1/6=t10, m2/7=t11
+    SCRATCH              0, 10, rsp+72*mmsize
+    SUMSUB_PACK_D        1, 3, 6, 4, 0              ; m1=t2a, m3=t10a
+    UNSCRATCH            0, 10, rsp+72*mmsize
+    SUMSUB_PACK_D        2, 0, 7, 5, 4              ; m2=t3a, m0=t11a
+    mova   [rsp+68*mmsize], m1
+    mova   [rsp+69*mmsize], m2
+    SCRATCH              3, 13, rsp+75*mmsize
+    SCRATCH              0, 14, rsp+76*mmsize
+
+    mova                m0, [%1+ 1*4*mmsize]        ; in1
+    mova                m1, [%1+ 6*4*mmsize]        ; in6
+    mova                m2, [%1+ 9*4*mmsize]        ; in9
+    mova                m3, [%1+14*4*mmsize]        ; in14
+    SUMSUB_MUL_D         2, 1, 4, 5, 13160,  9760   ; m2/4=t6, m1/5=t7
+    SUMSUB_MUL_D         0, 3, 6, 7,  2404, 16207   ; m0/6=t14, m3/7=t15
+    SCRATCH              1, 10, rsp+72*mmsize
+    SUMSUB_PACK_D        0, 2, 6, 4, 1              ; m0=t6a, m2=t14a
+    UNSCRATCH            1, 10, rsp+72*mmsize
+    SUMSUB_PACK_D        3, 1, 7, 5, 4              ; m3=t7a, m1=t15a
+
+    UNSCRATCH            4, 13, rsp+75*mmsize       ; t10a
+    UNSCRATCH            5, 14, rsp+76*mmsize       ; t11a
+    SCRATCH              0, 13, rsp+75*mmsize
+    SCRATCH              3, 14, rsp+76*mmsize
+    SUMSUB_MUL_D         4, 5, 6, 7,  9102, 13623   ; m4/6=t10, m5/7=t11
+    SUMSUB_MUL_D         1, 2, 0, 3, 13623,  9102   ; m1/0=t15, m2/3=t14
+    SCRATCH              0, 10, rsp+72*mmsize
+    SUMSUB_PACK_D        2, 4, 3, 6, 0              ; m2=t10a, m4=t14a
+    UNSCRATCH            0, 10, rsp+72*mmsize
+    SUMSUB_PACK_D        1, 5, 0, 7, 6              ; m1=t11a, m5=t15a
+
+    UNSCRATCH            0, 8, rsp+70*mmsize        ; t12a
+    UNSCRATCH            3, 12, rsp+74*mmsize       ; t13a
+    SCRATCH              2, 8, rsp+70*mmsize
+    SCRATCH              1, 12, rsp+74*mmsize
+    SUMSUB_MUL_D         0, 3, 1, 2, 15137,  6270   ; m0/1=t12, m3/2=t13
+    SUMSUB_MUL_D         5, 4, 7, 6,  6270, 15137   ; m5/7=t15, m4/6=t14
+    SCRATCH              2, 10, rsp+72*mmsize
+    SUMSUB_PACK_D        4, 0, 6, 1, 2              ; m4=out2, m0=t14a
+    UNSCRATCH            2, 10, rsp+72*mmsize
+    SUMSUB_PACK_D        5, 3, 7, 2, 1              ; m5=-out13, m3=t15a
+    NEGD                m5                          ; m5=out13
+
+    UNSCRATCH            1, 9, rsp+71*mmsize        ; t1a
+    mova                m2, [rsp+68*mmsize]         ; t2a
+    UNSCRATCH            6, 13, rsp+75*mmsize       ; t6a
+    UNSCRATCH            7, 14, rsp+76*mmsize       ; t7a
+    SCRATCH              4, 10, rsp+72*mmsize ; out2 parked in its output slot m10|r72
+    SCRATCH              5, 13, rsp+75*mmsize ; out13 parked in its output slot m13|r75
+    UNSCRATCH            4, 15, rsp+77*mmsize       ; t4a
+    UNSCRATCH            5, 11, rsp+73*mmsize       ; t5a
+    SCRATCH              0, 14, rsp+76*mmsize
+    SCRATCH              3, 15, rsp+77*mmsize
+    mova                m0, [rsp+67*mmsize]         ; t0a
+    SUMSUB_BA         d, 4, 0, 3                    ; m4=t0, m0=t4
+    SUMSUB_BA         d, 5, 1, 3                    ; m5=t1, m1=t5
+    SUMSUB_BA         d, 6, 2, 3                    ; m6=t2, m2=t6
+    SCRATCH              4, 9, rsp+71*mmsize
+    mova                m3, [rsp+69*mmsize]         ; t3a
+    SUMSUB_BA         d, 7, 3, 4                    ; m7=t3, m3=t7
+
+    mova   [rsp+67*mmsize], m5
+    mova   [rsp+68*mmsize], m6
+    mova   [rsp+69*mmsize], m7
+    SUMSUB_MUL_D         0, 1, 4, 5, 15137,  6270   ; m0/4=t4a, m1/5=t5a
+    SUMSUB_MUL_D         3, 2, 7, 6,  6270, 15137   ; m3/7=t7a, m2/6=t6a
+    SCRATCH              1, 11, rsp+73*mmsize
+    SUMSUB_PACK_D        2, 0, 6, 4, 1              ; m2=-out3, m0=t6
+    NEGD                m2                          ; m2=out3
+    UNSCRATCH            1, 11, rsp+73*mmsize
+    SUMSUB_PACK_D        3, 1, 7, 5, 4              ; m3=out12, m1=t7
+    SCRATCH              2, 11, rsp+73*mmsize ; out3 parked in its output slot m11|r73
+    UNSCRATCH            2, 12, rsp+74*mmsize       ; t11a
+    SCRATCH              3, 12, rsp+74*mmsize ; out12 parked in its output slot m12|r74
+
+    UNSCRATCH            3, 8, rsp+70*mmsize        ; t10a
+    mova                m4, [rsp+65*mmsize]         ; t8a
+    mova                m5, [rsp+66*mmsize]         ; t9a
+    SUMSUB_BA         d, 3, 4, 6                    ; m3=-out1, m4=t10
+    NEGD                m3                          ; m3=out1
+    SUMSUB_BA         d, 2, 5, 6                    ; m2=out14, m5=t11
+    UNSCRATCH            6, 9, rsp+71*mmsize        ; t0
+    UNSCRATCH            7, 14, rsp+76*mmsize       ; t14a
+    SCRATCH              3, 9, rsp+71*mmsize ; out1 parked in its output slot m9|r71
+    SCRATCH              2, 14, rsp+76*mmsize ; out14 parked in its output slot m14|r76
+
+    SUMSUB_MUL           1, 0, 2, 3, 11585, 11585   ; m1=out4, m0=out11
+    mova   [rsp+65*mmsize], m0
+    SUMSUB_MUL           5, 4, 2, 3, 11585, 11585   ; m5=out6, m4=out9
+    UNSCRATCH            0, 15, rsp+77*mmsize       ; t15a
+    SUMSUB_MUL           7, 0, 2, 3, 11585, m11585  ; m7=out10, m0=out5
+
+    mova                m2, [rsp+68*mmsize]         ; t2
+    SUMSUB_BA         d, 2, 6, 3                    ; m2=out0, m6=t2a
+    SCRATCH              2, 8, rsp+70*mmsize ; out0 parked in its output slot m8|r70
+    mova                m2, [rsp+67*mmsize]         ; t1
+    mova                m3, [rsp+69*mmsize]         ; t3
+    mova   [rsp+67*mmsize], m7 ; out10 is parked in r67 while m7 serves as a temporary
+    SUMSUB_BA         d, 3, 2, 7                    ; m3=-out15, m2=t3a
+    NEGD                m3                          ; m3=out15
+    SCRATCH              3, 15, rsp+77*mmsize ; out15 parked in its output slot m15|r77
+    SUMSUB_MUL           6, 2, 7, 3, 11585, m11585  ; m6=out8, m2=out7
+    mova                m7, [rsp+67*mmsize]
+    ; rename registers so that m0-6 match the output layout given above the macro
+    SWAP                 0, 1
+    SWAP                 2, 5, 4, 6, 7, 3
+%endmacro
+
+%macro IADST16_FN 7 ; name1, pass-1 1D macro prefix, pass-1 spill base slot, name2, pass-2 1D macro prefix, pass-2 spill base slot, eob->count table prefix
+cglobal vp9_%1_%4_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
+                                70 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+                                dst, stride, block, eob
+    mova                m0, [pw_1023] ; presumably the 10-bit pixel maximum used for clipping by ROUND_AND_STORE_4x4 - TODO confirm
+
+.body: ; shared body; the _12 entry point below jumps here with m0 = pw_4095
+    mova   [rsp+64*mmsize], m0 ; keep m0 in slot 64; it is read back from there in pass 2
+    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak ; cnt takes over the register that held eob
+%if ARCH_X86_64
+    mov            dstbakq, dstq
+    movsxd            cntq, cntd
+%endif
+%ifdef PIC
+    lea               ptrq, [%7_16x16]
+    movzx             cntd, byte [ptrq+cntq-1] ; cnt = <table>[eob-1] = number of pass-1 iterations
+%else
+    movzx             cntd, byte [%7_16x16+cntq-1] ; cnt = <table>[eob-1] = number of pass-1 iterations
+%endif
+    mov              skipd, 4
+    sub              skipd, cntd ; skip = 4 - cnt: pass-1 iterations replaced by zero-fill below
+    mov               ptrq, rsp ; pass-1 output buffer starts at rsp
+.loop_1: ; pass 1: one %2_1D per iteration, outputs transposed in 4x4 dword tiles into the buffer at ptrq
+    %2_1D           blockq
+
+    TRANSPOSE4x4D        0, 1, 2, 3, 7
+    mova  [ptrq+ 1*mmsize], m0
+    mova  [ptrq+ 5*mmsize], m1
+    mova  [ptrq+ 9*mmsize], m2
+    mova  [ptrq+13*mmsize], m3
+    mova                m7, [rsp+65*mmsize] ; both 1D macros leave out11 in slot 65
+    TRANSPOSE4x4D        4, 5, 6, 7, 0
+    mova  [ptrq+ 2*mmsize], m4
+    mova  [ptrq+ 6*mmsize], m5
+    mova  [ptrq+10*mmsize], m6
+    mova  [ptrq+14*mmsize], m7
+    UNSCRATCH               0, 8, rsp+(%3+0)*mmsize
+    UNSCRATCH               1, 9, rsp+(%3+1)*mmsize
+    UNSCRATCH               2, 10, rsp+(%3+2)*mmsize
+    UNSCRATCH               3, 11, rsp+(%3+3)*mmsize
+    TRANSPOSE4x4D        0, 1, 2, 3, 7
+    mova  [ptrq+ 0*mmsize], m0
+    mova  [ptrq+ 4*mmsize], m1
+    mova  [ptrq+ 8*mmsize], m2
+    mova  [ptrq+12*mmsize], m3
+    UNSCRATCH               4, 12, rsp+(%3+4)*mmsize
+    UNSCRATCH               5, 13, rsp+(%3+5)*mmsize
+    UNSCRATCH               6, 14, rsp+(%3+6)*mmsize
+    UNSCRATCH               7, 15, rsp+(%3+7)*mmsize
+    TRANSPOSE4x4D        4, 5, 6, 7, 0
+    mova  [ptrq+ 3*mmsize], m4
+    mova  [ptrq+ 7*mmsize], m5
+    mova  [ptrq+11*mmsize], m6
+    mova  [ptrq+15*mmsize], m7
+    add               ptrq, 16 * mmsize
+    add             blockq, mmsize
+    dec               cntd
+    jg .loop_1
+
+    ; zero-pad the remainder (skipped cols)
+    test             skipd, skipd
+    jz .end
+    add              skipd, skipd ; 2*skip iterations of 8*mmsize = skip * 16*mmsize, matching the loop_1 step
+    lea             blockq, [blockq+skipq*(mmsize/2)] ; advance blockq by skip*mmsize, so it ends at block start + 4*mmsize
+    pxor                m0, m0
+.loop_z:
+    mova   [ptrq+mmsize*0], m0
+    mova   [ptrq+mmsize*1], m0
+    mova   [ptrq+mmsize*2], m0
+    mova   [ptrq+mmsize*3], m0
+    mova   [ptrq+mmsize*4], m0
+    mova   [ptrq+mmsize*5], m0
+    mova   [ptrq+mmsize*6], m0
+    mova   [ptrq+mmsize*7], m0
+    add               ptrq, 8 * mmsize
+    dec              skipd
+    jg .loop_z
+.end:
+
+    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+    lea           stride3q, [strideq*3]
+    mov               cntd, 4
+    mov               ptrq, rsp
+.loop_2: ; pass 2: 4 iterations; dst advances by 8 bytes per iteration
+    %5_1D             ptrq
+
+    pxor                m7, m7
+    lea               dstq, [dstq+strideq*4]
+    ROUND_AND_STORE_4x4  0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6 ; out4-7, stored at dst + 4*stride
+    lea               dstq, [dstq+strideq*4]
+    mova                m0, [rsp+65*mmsize]
+    mova                m1, [rsp+64*mmsize]
+    mova                m2, [pd_32]
+    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, m1, m2, 6 ; out8-11, stored at dst + 8*stride
+
+%if ARCH_X86_64
+    DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst ; swap the names so "dstq" refers to the saved base pointer
+%else
+    mov               dstq, dstm
+%endif
+    UNSCRATCH               0, 8, rsp+(%6+0)*mmsize
+    UNSCRATCH               4, 9, rsp+(%6+1)*mmsize
+    UNSCRATCH               5, 10, rsp+(%6+2)*mmsize
+    UNSCRATCH               3, 11, rsp+(%6+3)*mmsize
+    ROUND_AND_STORE_4x4  0, 4, 5, 3, m7, m1, m2, 6 ; out0-3, stored at the base dst
+%if ARCH_X86_64
+    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+    lea               dstq, [dstbakq+stride3q*4] ; base + 12*stride
+%else
+    lea               dstq, [dstq+stride3q*4]
+%endif
+    UNSCRATCH               4, 12, rsp+(%6+4)*mmsize
+    UNSCRATCH               5, 13, rsp+(%6+5)*mmsize
+    UNSCRATCH               6, 14, rsp+(%6+6)*mmsize
+    UNSCRATCH               0, 15, rsp+(%6+7)*mmsize
+    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, m1, m2, 6 ; out12-15, stored at dst + 12*stride
+
+    add               ptrq, mmsize
+%if ARCH_X86_64
+    add            dstbakq, 8
+    mov               dstq, dstbakq
+%else
+    add         dword dstm, 8
+    mov               dstq, dstm
+%endif
+    dec               cntd
+    jg .loop_2
+
+    ; m7 is still zero
+    ZERO_BLOCK blockq-4*mmsize, 64, 16, m7 ; blockq sits 4*mmsize past the block start after pass 1 (cnt + skip = 4)
+    RET
+
+cglobal vp9_%1_%4_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \
+                                70 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+                                dst, stride, block, eob
+    mova                m0, [pw_4095] ; 12-bit counterpart of pw_1023; the only difference from the _10 entry point
+    jmp mangle(private_prefix %+ _ %+ vp9_%1_%4_16x16_add_10 %+ SUFFIX).body ; same stack size as _10, so the body can be shared
+%endmacro
+
+INIT_XMM sse2 ; 16x16 mixed transforms; spill base 67 goes with IDCT16_1D (its default mm32bit_stack_offset), 70 with IADST16_1D (outputs in r70-77)
+IADST16_FN idct,  IDCT16,  67, iadst, IADST16, 70, row ; defines vp9_idct_iadst_16x16_add_10 and _12
+IADST16_FN iadst, IADST16, 70, idct,  IDCT16,  67, col ; defines vp9_iadst_idct_16x16_add_10 and _12
+IADST16_FN iadst, IADST16, 70, iadst, IADST16, 70, default ; defines vp9_iadst_iadst_16x16_add_10 and _12
+
+%macro IDCT32_1D 2-3 8 * mmsize; pass[1/2], src, src_stride
+    IDCT16_1D %2, 2 * %3, 272, 257
+%if ARCH_X86_64
+    mova  [rsp+257*mmsize], m8
+    mova  [rsp+258*mmsize], m9
+    mova  [rsp+259*mmsize], m10
+    mova  [rsp+260*mmsize], m11
+    mova  [rsp+261*mmsize], m12
+    mova  [rsp+262*mmsize], m13
+    mova  [rsp+263*mmsize], m14
+    mova  [rsp+264*mmsize], m15
+%endif
+    mova  [rsp+265*mmsize], m0
+    mova  [rsp+266*mmsize], m1
+    mova  [rsp+267*mmsize], m2
+    mova  [rsp+268*mmsize], m3
+    mova  [rsp+269*mmsize], m4
+    mova  [rsp+270*mmsize], m5
+    mova  [rsp+271*mmsize], m6
+
+    ; r257-260: t0-3
+    ; r265-272: t4/5a/6a/7/8/9a/10/11a
+    ; r261-264: t12a/13/14a/15
+    ; r273-274 is free as scratch space, and 275-282 mirrors m8-15 on 32bit
+
+    mova                m0, [%2+ 1*%3]              ; in1
+    mova                m1, [%2+15*%3]              ; in15
+    mova                m2, [%2+17*%3]              ; in17
+    mova                m3, [%2+31*%3]              ; in31
+    SUMSUB_MUL           0, 3, 4, 5, 16364,  804    ; m0=t31a, m3=t16a
+    SUMSUB_MUL           2, 1, 4, 5, 11003, 12140   ; m2=t30a, m1=t17a
+    SUMSUB_BA         d, 1, 3, 4                    ; m1=t16, m3=t17
+    SUMSUB_BA         d, 2, 0, 4                    ; m2=t31, m0=t30
+    SUMSUB_MUL           0, 3, 4, 5, 16069,  3196   ; m0=t30a, m3=t17a
+    SCRATCH              0, 8, rsp+275*mmsize
+    SCRATCH              2, 9, rsp+276*mmsize
+
+    ; end of stage 1-3 first quart
+
+    mova                m0, [%2+ 7*%3]              ; in7
+    mova                m2, [%2+ 9*%3]              ; in9
+    mova                m4, [%2+23*%3]              ; in23
+    mova                m5, [%2+25*%3]              ; in25
+    SUMSUB_MUL           2, 4, 6, 7, 14811,  7005   ; m2=t29a, m4=t18a
+    SUMSUB_MUL           5, 0, 6, 7,  5520, 15426   ; m5=t28a, m0=t19a
+    SUMSUB_BA         d, 4, 0, 6                    ; m4=t19, m0=t18
+    SUMSUB_BA         d, 2, 5, 6                    ; m2=t28, m5=t29
+    SUMSUB_MUL           5, 0, 6, 7,  3196, m16069  ; m5=t29a, m0=t18a
+
+    ; end of stage 1-3 second quart
+
+    SUMSUB_BA         d, 4, 1, 6                    ; m4=t16a, m1=t19a
+    SUMSUB_BA         d, 0, 3, 6                    ; m0=t17, m3=t18
+    UNSCRATCH            6, 8, rsp+275*mmsize       ; t30a
+    UNSCRATCH            7, 9, rsp+276*mmsize       ; t31
+    mova  [rsp+273*mmsize], m4
+    mova  [rsp+274*mmsize], m0
+    SUMSUB_BA         d, 2, 7, 0                    ; m2=t31a, m7=t28a
+    SUMSUB_BA         d, 5, 6, 0                    ; m5=t30, m6=t29
+    SUMSUB_MUL           6, 3, 0, 4, 15137,  6270   ; m6=t29a, m3=t18a
+    SUMSUB_MUL           7, 1, 0, 4, 15137,  6270   ; m7=t28, m1=t19
+    SCRATCH              3, 10, rsp+277*mmsize
+    SCRATCH              1, 11, rsp+278*mmsize
+    SCRATCH              7, 12, rsp+279*mmsize
+    SCRATCH              6, 13, rsp+280*mmsize
+    SCRATCH              5, 14, rsp+281*mmsize
+    SCRATCH              2, 15, rsp+282*mmsize
+
+    ; end of stage 4-5 first half
+
+    mova                m0, [%2+ 5*%3]              ; in5
+    mova                m1, [%2+11*%3]              ; in11
+    mova                m2, [%2+21*%3]              ; in21
+    mova                m3, [%2+27*%3]              ; in27
+    SUMSUB_MUL           0, 3, 4, 5, 15893,  3981   ; m0=t27a, m3=t20a
+    SUMSUB_MUL           2, 1, 4, 5,  8423, 14053   ; m2=t26a, m1=t21a
+    SUMSUB_BA         d, 1, 3, 4                    ; m1=t20, m3=t21
+    SUMSUB_BA         d, 2, 0, 4                    ; m2=t27, m0=t26
+    SUMSUB_MUL           0, 3, 4, 5,  9102, 13623   ; m0=t26a, m3=t21a
+    SCRATCH              0, 8, rsp+275*mmsize
+    SCRATCH              2, 9, rsp+276*mmsize
+
+    ; end of stage 1-3 third quart
+
+    mova                m0, [%2+ 3*%3]              ; in3
+    mova                m2, [%2+13*%3]              ; in13
+    mova                m4, [%2+19*%3]              ; in19
+    mova                m5, [%2+29*%3]              ; in29
+    SUMSUB_MUL           2, 4, 6, 7, 13160,  9760   ; m2=t25a, m4=t22a
+    SUMSUB_MUL           5, 0, 6, 7,  2404, 16207   ; m5=t24a, m0=t23a
+    SUMSUB_BA         d, 4, 0, 6                    ; m4=t23, m0=t22
+    SUMSUB_BA         d, 2, 5, 6                    ; m2=t24, m5=t25
+    SUMSUB_MUL           5, 0, 6, 7, 13623, m9102   ; m5=t25a, m0=t22a
+
+    ; end of stage 1-3 fourth quart
+
+    SUMSUB_BA         d, 1, 4, 6                    ; m1=t23a, m4=t20a
+    SUMSUB_BA         d, 3, 0, 6                    ; m3=t22, m0=t21
+    UNSCRATCH            6, 8, rsp+275*mmsize       ; t26a
+    UNSCRATCH            7, 9, rsp+276*mmsize       ; t27
+    SCRATCH              3, 8, rsp+275*mmsize
+    SCRATCH              1, 9, rsp+276*mmsize
+    SUMSUB_BA         d, 7, 2, 1                    ; m7=t24a, m2=t27a
+    SUMSUB_BA         d, 6, 5, 1                    ; m6=t25, m5=t26
+    SUMSUB_MUL           2, 4, 1, 3,  6270, m15137  ; m2=t27, m4=t20
+    SUMSUB_MUL           5, 0, 1, 3,  6270, m15137  ; m5=t26a, m0=t21a
+
+    ; end of stage 4-5 second half
+
+    UNSCRATCH            1, 12, rsp+279*mmsize      ; t28
+    UNSCRATCH            3, 13, rsp+280*mmsize      ; t29a
+    SCRATCH              4, 12, rsp+279*mmsize
+    SCRATCH              0, 13, rsp+280*mmsize
+    SUMSUB_BA         d, 5, 3, 0                    ; m5=t29, m3=t26
+    SUMSUB_BA         d, 2, 1, 0                    ; m2=t28a, m1=t27a
+    UNSCRATCH            0, 14, rsp+281*mmsize      ; t30
+    UNSCRATCH            4, 15, rsp+282*mmsize      ; t31a
+    SCRATCH              2, 14, rsp+281*mmsize
+    SCRATCH              5, 15, rsp+282*mmsize
+    SUMSUB_BA         d, 6, 0, 2                    ; m6=t30a, m0=t25a
+    SUMSUB_BA         d, 7, 4, 2                    ; m7=t31, m4=t24
+
+    mova                m2, [rsp+273*mmsize]        ; t16a
+    mova                m5, [rsp+274*mmsize]        ; t17
+    mova  [rsp+273*mmsize], m6
+    mova  [rsp+274*mmsize], m7
+    UNSCRATCH            6, 10, rsp+277*mmsize      ; t18a
+    UNSCRATCH            7, 11, rsp+278*mmsize      ; t19
+    SCRATCH              4, 10, rsp+277*mmsize
+    SCRATCH              0, 11, rsp+278*mmsize
+    UNSCRATCH            4, 12, rsp+279*mmsize      ; t20
+    UNSCRATCH            0, 13, rsp+280*mmsize      ; t21a
+    SCRATCH              3, 12, rsp+279*mmsize
+    SCRATCH              1, 13, rsp+280*mmsize
+    SUMSUB_BA         d, 0, 6, 1                    ; m0=t18, m6=t21
+    SUMSUB_BA         d, 4, 7, 1                    ; m4=t19a, m7=t20a
+    UNSCRATCH            3, 8, rsp+275*mmsize       ; t22
+    UNSCRATCH            1, 9, rsp+276*mmsize       ; t23a
+    SCRATCH              0, 8, rsp+275*mmsize
+    SCRATCH              4, 9, rsp+276*mmsize
+    SUMSUB_BA         d, 3, 5, 0                    ; m3=t17a, m5=t22a
+    SUMSUB_BA         d, 1, 2, 0                    ; m1=t16, m2=t23
+
+    ; end of stage 6
+
+    UNSCRATCH            0, 10, rsp+277*mmsize      ; t24
+    UNSCRATCH            4, 11, rsp+278*mmsize      ; t25a
+    SCRATCH              1, 10, rsp+277*mmsize
+    SCRATCH              3, 11, rsp+278*mmsize
+    SUMSUB_MUL           0, 2, 1, 3, 11585, 11585   ; m0=t24a, m2=t23a
+    SUMSUB_MUL           4, 5, 1, 3, 11585, 11585   ; m4=t25, m5=t22
+    UNSCRATCH            1, 12, rsp+279*mmsize      ; t26
+    UNSCRATCH            3, 13, rsp+280*mmsize      ; t27a
+    SCRATCH              0, 12, rsp+279*mmsize
+    SCRATCH              4, 13, rsp+280*mmsize
+    SUMSUB_MUL           3, 7, 0, 4, 11585, 11585   ; m3=t27, m7=t20
+    SUMSUB_MUL           1, 6, 0, 4, 11585, 11585   ; m1=t26a, m6=t21a
+
+    ; end of stage 7
+
+    mova                m0, [rsp+269*mmsize]        ; t8
+    mova                m4, [rsp+270*mmsize]        ; t9a
+    mova  [rsp+269*mmsize], m1                      ; t26a
+    mova  [rsp+270*mmsize], m3                      ; t27
+    mova                m3, [rsp+271*mmsize]        ; t10
+    SUMSUB_BA         d, 2, 0, 1                    ; m2=out8, m0=out23
+    SUMSUB_BA         d, 5, 4, 1                    ; m5=out9, m4=out22
+    SUMSUB_BA         d, 6, 3, 1                    ; m6=out10, m3=out21
+    mova                m1, [rsp+272*mmsize]        ; t11a
+    mova  [rsp+271*mmsize], m0
+    SUMSUB_BA         d, 7, 1, 0                    ; m7=out11, m1=out20
+
+%if %1 == 1
+    TRANSPOSE4x4D        2, 5, 6, 7, 0
+    mova  [ptrq+ 2*mmsize], m2
+    mova  [ptrq+10*mmsize], m5
+    mova  [ptrq+18*mmsize], m6
+    mova  [ptrq+26*mmsize], m7
+%else ; %1 == 2
+    pxor                m0, m0
+    lea               dstq, [dstq+strideq*8]
+    ROUND_AND_STORE_4x4  2, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6
+%endif
+    mova                m2, [rsp+271*mmsize]
+%if %1 == 1
+    TRANSPOSE4x4D        1, 3, 4, 2, 0
+    mova  [ptrq+ 5*mmsize], m1
+    mova  [ptrq+13*mmsize], m3
+    mova  [ptrq+21*mmsize], m4
+    mova  [ptrq+29*mmsize], m2
+%else ; %1 == 2
+    lea               dstq, [dstq+stride3q*4]
+    ROUND_AND_STORE_4x4  1, 3, 4, 2, m0, [rsp+256*mmsize], [pd_32], 6
+%endif
+
+    ; end of last stage + store for out8-11 and out20-23
+
+    UNSCRATCH            0, 9, rsp+276*mmsize       ; t19a
+    UNSCRATCH            1, 8, rsp+275*mmsize       ; t18
+    UNSCRATCH            2, 11, rsp+278*mmsize      ; t17a
+    UNSCRATCH            3, 10, rsp+277*mmsize      ; t16
+    mova                m7, [rsp+261*mmsize]        ; t12a
+    mova                m6, [rsp+262*mmsize]        ; t13
+    mova                m5, [rsp+263*mmsize]        ; t14a
+    SUMSUB_BA         d, 0, 7, 4                    ; m0=out12, m7=out19
+    SUMSUB_BA         d, 1, 6, 4                    ; m1=out13, m6=out18
+    SUMSUB_BA         d, 2, 5, 4                    ; m2=out14, m5=out17
+    mova                m4, [rsp+264*mmsize]        ; t15
+    SCRATCH              7, 8, rsp+275*mmsize
+    SUMSUB_BA         d, 3, 4, 7                    ; m3=out15, m4=out16
+
+%if %1 == 1
+    TRANSPOSE4x4D        0, 1, 2, 3, 7
+    mova  [ptrq+ 3*mmsize], m0
+    mova  [ptrq+11*mmsize], m1
+    mova  [ptrq+19*mmsize], m2
+    mova  [ptrq+27*mmsize], m3
+%else ; %1 == 2
+%if ARCH_X86_64
+    SWAP                 7, 9
+    lea               dstq, [dstbakq+stride3q*4]
+%else ; x86-32
+    pxor                m7, m7
+    mov               dstq, dstm
+    lea               dstq, [dstq+stride3q*4]
+%endif
+    ROUND_AND_STORE_4x4  0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6
+%endif
+    UNSCRATCH            0, 8, rsp+275*mmsize       ; out19
+%if %1 == 1
+    TRANSPOSE4x4D        4, 5, 6, 0, 7
+    mova  [ptrq+ 4*mmsize], m4
+    mova  [ptrq+12*mmsize], m5
+    mova  [ptrq+20*mmsize], m6
+    mova  [ptrq+28*mmsize], m0
+%else ; %1 == 2
+    lea               dstq, [dstq+strideq*4]
+    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6
+%endif
+
+    ; end of last stage + store for out12-19
+
+%if ARCH_X86_64
+    SWAP                 7, 8
+%endif
+    mova                m7, [rsp+257*mmsize]        ; t0
+    mova                m6, [rsp+258*mmsize]        ; t1
+    mova                m5, [rsp+259*mmsize]        ; t2
+    mova                m4, [rsp+260*mmsize]        ; t3
+    mova                m0, [rsp+274*mmsize]        ; t31
+    mova                m1, [rsp+273*mmsize]        ; t30a
+    UNSCRATCH            2, 15, rsp+282*mmsize      ; t29
+    SUMSUB_BA         d, 0, 7, 3                    ; m0=out0, m7=out31
+    SUMSUB_BA         d, 1, 6, 3                    ; m1=out1, m6=out30
+    SUMSUB_BA         d, 2, 5, 3                    ; m2=out2, m5=out29
+    SCRATCH              0, 9, rsp+276*mmsize
+    UNSCRATCH            3, 14, rsp+281*mmsize      ; t28a
+    SUMSUB_BA         d, 3, 4, 0                    ; m3=out3, m4=out28
+
+%if %1 == 1
+    TRANSPOSE4x4D        4, 5, 6, 7, 0
+    mova  [ptrq+ 7*mmsize], m4
+    mova  [ptrq+15*mmsize], m5
+    mova  [ptrq+23*mmsize], m6
+    mova  [ptrq+31*mmsize], m7
+%else ; %1 == 2
+%if ARCH_X86_64
+    SWAP                 0, 8
+%else ; x86-32
+    pxor                m0, m0
+%endif
+    lea               dstq, [dstq+stride3q*4]
+    ROUND_AND_STORE_4x4  4, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6
+%endif
+    UNSCRATCH            7, 9, rsp+276*mmsize       ; out0
+%if %1 == 1
+    TRANSPOSE4x4D        7, 1, 2, 3, 0
+    mova  [ptrq+ 0*mmsize], m7
+    mova  [ptrq+ 8*mmsize], m1
+    mova  [ptrq+16*mmsize], m2
+    mova  [ptrq+24*mmsize], m3
+%else ; %1 == 2
+%if ARCH_X86_64
+    DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
+%else ; x86-32
+    mov               dstq, dstm
+%endif
+    ROUND_AND_STORE_4x4  7, 1, 2, 3, m0, [rsp+256*mmsize], [pd_32], 6
+%if ARCH_X86_64
+    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+%endif
+%endif
+
+    ; end of last stage + store for out0-3 and out28-31
+
+%if ARCH_X86_64
+    SWAP                 0, 8
+%endif
+    mova                m7, [rsp+265*mmsize]        ; t4
+    mova                m6, [rsp+266*mmsize]        ; t5a
+    mova                m5, [rsp+267*mmsize]        ; t6a
+    mova                m4, [rsp+268*mmsize]        ; t7
+    mova                m0, [rsp+270*mmsize]        ; t27
+    mova                m1, [rsp+269*mmsize]        ; t26a
+    UNSCRATCH            2, 13, rsp+280*mmsize      ; t25
+    SUMSUB_BA         d, 0, 7, 3                    ; m0=out4, m7=out27
+    SUMSUB_BA         d, 1, 6, 3                    ; m1=out5, m6=out26
+    SUMSUB_BA         d, 2, 5, 3                    ; m2=out6, m5=out25
+    UNSCRATCH            3, 12, rsp+279*mmsize      ; t24a
+    SCRATCH              7, 9, rsp+276*mmsize
+    SUMSUB_BA         d, 3, 4, 7                    ; m3=out7, m4=out24
+
+%if %1 == 1
+    TRANSPOSE4x4D        0, 1, 2, 3, 7
+    mova  [ptrq+ 1*mmsize], m0
+    mova  [ptrq+ 9*mmsize], m1
+    mova  [ptrq+17*mmsize], m2
+    mova  [ptrq+25*mmsize], m3
+%else ; %1 == 2
+%if ARCH_X86_64
+    SWAP                 7, 8
+    lea               dstq, [dstbakq+strideq*4]
+%else ; x86-32
+    pxor                m7, m7
+    lea               dstq, [dstq+strideq*4]
+%endif
+    ROUND_AND_STORE_4x4  0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6
+%endif
+    UNSCRATCH            0, 9, rsp+276*mmsize       ; out27
+%if %1 == 1
+    TRANSPOSE4x4D        4, 5, 6, 0, 7
+    mova  [ptrq+ 6*mmsize], m4
+    mova  [ptrq+14*mmsize], m5
+    mova  [ptrq+22*mmsize], m6
+    mova  [ptrq+30*mmsize], m0
+%else ; %1 == 2
+%if ARCH_X86_64
+    lea               dstq, [dstbakq+stride3q*8]
+%else
+    mov               dstq, dstm
+    lea               dstq, [dstq+stride3q*8]
+%endif
+    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6
+%endif
+
+    ; end of last stage + store for out4-7 and out24-27
+%endmacro
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_32x32_add_10, 4, 6 + ARCH_X86_64, 16, \
+                                    275 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+                                    dst, stride, block, eob
+    mova                m0, [pw_1023]                   ; 1023 = (1 << 10) - 1; m0 is handed to the store macros below
+    cmp               eobd, 1
+    jg .idctfull                                        ; eob > 1: run the full two-pass transform
+
+    ; dc-only - the 10bit version can be done entirely in 32bit, since the max
+    ; coef values are 17+sign bit, and the coef is 14bit, so 31+sign easily
+    ; fits in 32bit
+    DEFINE_ARGS dst, stride, block, coef
+    pxor                m2, m2                          ; m2 = 0
+    DC_ONLY              6, m2
+    movd                m1, coefd
+    pshuflw             m1, m1, q0000                   ; replicate word 0 across the low 4 words
+    punpcklqdq          m1, m1                          ; ... and then across all 8 words
+    DEFINE_ARGS dst, stride, cnt
+    mov               cntd, 32                          ; one iteration per row (dst advances by stride each time)
+.loop_dc:
+    STORE_2x8            3, 4, 1, m2, m0, dstq,          mmsize
+    STORE_2x8            3, 4, 1, m2, m0, dstq+mmsize*2, mmsize
+    add               dstq, strideq
+    dec               cntd
+    jg .loop_dc
+    RET
+
+.idctfull:
+    mova  [rsp+256*mmsize], m0                          ; park m0 (pw_1023, or pw_4095 when entered from the 12-bit function) for the pass-2 stores
+    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
+%if ARCH_X86_64
+    mov            dstbakq, dstq                        ; x86-32 has no dstbak register and reloads from dstm instead
+    movsxd            cntq, cntd                        ; cnt reuses the eob argument's register; widen it for indexing
+%endif
+%ifdef PIC
+    lea               ptrq, [default_32x32]
+    movzx             cntd, byte [ptrq+cntq-1]          ; cnt = default_32x32[eob - 1]
+%else
+    movzx             cntd, byte [default_32x32+cntq-1] ; cnt = default_32x32[eob - 1]
+%endif
+    mov              skipd, 8
+    sub              skipd, cntd                        ; skip = 8 - cnt, i.e. the .loop_1 iterations left out
+    mov               ptrq, rsp                         ; pass 1 writes its output to the stack buffer
+.loop_1:
+    IDCT32_1D            1, blockq
+
+    add               ptrq, 32 * mmsize
+    add             blockq, mmsize
+    dec               cntd
+    jg .loop_1
+
+    ; zero-pad the remainder (skipped cols)
+    test             skipd, skipd
+    jz .end
+    shl              skipd, 2                           ; 4 .loop_z rounds of 8*mmsize cover one skipped 32*mmsize chunk
+    lea             blockq, [blockq+skipq*(mmsize/4)]   ; blockq += skip*mmsize, as if .loop_1 had run all 8 times
+    pxor                m0, m0
+.loop_z:
+    mova   [ptrq+mmsize*0], m0
+    mova   [ptrq+mmsize*1], m0
+    mova   [ptrq+mmsize*2], m0
+    mova   [ptrq+mmsize*3], m0
+    mova   [ptrq+mmsize*4], m0
+    mova   [ptrq+mmsize*5], m0
+    mova   [ptrq+mmsize*6], m0
+    mova   [ptrq+mmsize*7], m0
+    add               ptrq, 8 * mmsize
+    dec              skipd
+    jg .loop_z
+.end:
+
+    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+    lea           stride3q, [strideq*3]
+    mov               cntd, 8
+    mov               ptrq, rsp                         ; pass 2 reads the pass-1 buffer back from the stack
+.loop_2:
+    IDCT32_1D            2, ptrq
+
+    add               ptrq, mmsize
+%if ARCH_X86_64
+    add            dstbakq, 8                           ; move the destination base 8 bytes to the right
+    mov               dstq, dstbakq
+%else
+    add         dword dstm, 8                           ; same advance, kept in the stack copy of dst
+    mov               dstq, dstm
+%endif
+    dec               cntd
+    jg .loop_2
+
+    ; m7 is still zero
+    ZERO_BLOCK blockq-8*mmsize, 128, 32, m7
+    RET
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_32x32_add_12, 4, 6 + ARCH_X86_64, 16, \
+                                    275 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+                                    dst, stride, block, eob
+    mova                m0, [pw_4095]                   ; 4095 = (1 << 12) - 1; m0 is handed to the store macros below
+    cmp               eobd, 1                           ; eob > 1 jumps into the 10-bit function's .idctfull, with m0 already set for 12-bit
+    jg mangle(private_prefix %+ _ %+ vp9_idct_idct_32x32_add_10 %+ SUFFIX).idctfull
+
+    ; dc-only - unfortunately, this one can overflow, since coefs are 19+sign
+    ; bpp, and 19+14+sign does not fit in 32bit, so we do 2-stage multiplies
+    DEFINE_ARGS dst, stride, block, coef, coefl
+    pxor                m2, m2                          ; m2 = 0
+    DC_ONLY_64BIT        6, m2
+    movd                m1, coefd
+    pshuflw             m1, m1, q0000                   ; replicate word 0 across the low 4 words
+    punpcklqdq          m1, m1                          ; ... and then across all 8 words
+    DEFINE_ARGS dst, stride, cnt
+    mov               cntd, 32                          ; one iteration per row (dst advances by stride each time)
+.loop_dc:
+    STORE_2x8            3, 4, 1, m2, m0, dstq,          mmsize
+    STORE_2x8            3, 4, 1, m2, m0, dstq+mmsize*2, mmsize
+    add               dstq, strideq
+    dec               cntd
+    jg .loop_dc
+    RET
diff --git a/libavcodec/x86/vp9itxfm_template.asm b/libavcodec/x86/vp9itxfm_template.asm
new file mode 100644
index 0000000..d2f2257
--- /dev/null
+++ b/libavcodec/x86/vp9itxfm_template.asm
@@ -0,0 +1,142 @@
+;******************************************************************************
+;* VP9 IDCT SIMD optimizations
+;*
+;* Copyright (C) 2013 Clément Bœsch <u pkh me>
+;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%macro VP9_IWHT4_1D 0 ; one 1-D pass on m0-m3 (word lanes); the names m4 and m5 are used as temporaries
+    SWAP                 1, 2, 3
+    paddw               m0, m2
+    psubw               m3, m1
+    psubw               m4, m0, m3                      ; m4 = m0 - m3
+    psraw               m4, 1                           ; arithmetic shift right by one
+    psubw               m5, m4, m1                      ; m5 = m4 - m1
+    SWAP                 5, 1
+    psubw               m4, m2
+    SWAP                 4, 2
+    psubw               m0, m1
+    paddw               m3, m2
+    SWAP                 3, 2, 1
+%endmacro
+
+; (a*x + b*y + round) >> shift
+%macro VP9_MULSUB_2W_2X 5 ; dst1, dst2/src, round, coefs1, coefs2
+    pmaddwd            m%1, m%2, %4                     ; dst1 = word pairs of src times coefs1, summed into dwords
+    pmaddwd            m%2,  %5                         ; dst2 = the same with coefs2 (src is overwritten)
+    paddd              m%1,  %3                         ; add the rounding term
+    paddd              m%2,  %3
+    psrad              m%1,  14                         ; the shift is fixed at 14
+    psrad              m%2,  14
+%endmacro
+
+%macro VP9_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1/src, tmp2
+    VP9_MULSUB_2W_2X    %7,  %6,  %5, [pw_m%3_%4], [pw_%4_%3] ; second source (tmp1) -> tmp2, tmp1
+    VP9_MULSUB_2W_2X    %1,  %2,  %5, [pw_m%3_%4], [pw_%4_%3] ; first source (dst2) -> dst1, dst2
+    packssdw           m%1, m%7                         ; narrow to words: dst1 dwords low, tmp2 dwords high
+    packssdw           m%2, m%6                         ; narrow to words: dst2 dwords low, tmp1 dwords high
+%endmacro
+
+%macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2
+%if %0 == 7
+    punpckhwd          m%6, m%2, m%1                    ; in-place form: tmp1 = high words of dst2 and dst1, interleaved
+    punpcklwd          m%2, m%1                         ; dst2 = low words of dst2 and dst1, interleaved
+    VP9_MULSUB_2W_4X   %1, %2, %3, %4, %5, %6, %7
+%else ; 9-argument form; NOTE(review): the 7-9 range also admits 8 arguments, which would land here without a tmp2
+    punpckhwd          m%8, m%4, m%3                    ; tmp1 = high words of src2 and src1, interleaved
+    punpcklwd          m%2, m%4, m%3                    ; dst2 = low words of src2 and src1, interleaved
+    VP9_MULSUB_2W_4X   %1, %2, %5, %6, %7, %8, %9
+%endif
+%endmacro
+
+%macro VP9_IDCT4_1D_FINALIZE 0 ; expects m0=t1, m1=t2, m2=t0, m3=t3 (word lanes)
+    SUMSUB_BA            w, 3, 2, 4                         ; m3=t3+t0, m2=-t3+t0
+    SUMSUB_BA            w, 1, 0, 4                         ; m1=t2+t1, m0=-t2+t1
+    SWAP                 0, 3, 2                            ; 3102 -> 0123
+%endmacro
+
+%macro VP9_IDCT4_1D 0 ; in/out in m0-m3; m6 (ssse3 path) and m7 are read but not set here - presumably caller-loaded constants
+%if cpuflag(ssse3)
+    SUMSUB_BA            w, 2, 0, 4                         ; m2=IN(0)+IN(2) m0=IN(0)-IN(2)
+    pmulhrsw            m2, m6                              ; m2=t0
+    pmulhrsw            m0, m6                              ; m0=t1
+%else ; <= sse2
+    VP9_UNPACK_MULSUB_2W_4X 0, 2, 11585, 11585, m7, 4, 5    ; m0=t1, m2=t0
+%endif
+    VP9_UNPACK_MULSUB_2W_4X 1, 3, 15137, 6270, m7, 4, 5     ; m1=t2, m3=t3
+    VP9_IDCT4_1D_FINALIZE
+%endmacro
+
+%macro VP9_IADST4_1D 0 ; in/out in m0-m3 (MMX); the arithmetic runs in xmm0-xmm7; xmm5 is read but never written here
+    movq2dq           xmm0, m0                  ; copy the four MMX inputs into XMM registers
+    movq2dq           xmm1, m1
+    movq2dq           xmm2, m2
+    movq2dq           xmm3, m3
+%if cpuflag(ssse3)
+    paddw               m3, m0                  ; ssse3 builds out2 on the MMX side: m3 += m0
+%endif
+    punpcklwd         xmm0, xmm1                ; interleave the words of inputs 0 and 1
+    punpcklwd         xmm2, xmm3                ; interleave the words of inputs 2 and 3
+    pmaddwd           xmm1, xmm0, [pw_5283_13377]
+    pmaddwd           xmm4, xmm0, [pw_9929_13377]
+%if notcpuflag(ssse3)
+    pmaddwd           xmm6, xmm0, [pw_13377_0]  ; sse2 computes out2 with multiply-adds instead
+%endif
+    pmaddwd           xmm0, [pw_15212_m13377]
+    pmaddwd           xmm3, xmm2, [pw_15212_9929]
+%if notcpuflag(ssse3)
+    pmaddwd           xmm7, xmm2, [pw_m13377_13377]
+%endif
+    pmaddwd           xmm2, [pw_m5283_m15212]
+%if cpuflag(ssse3)
+    psubw               m3, m2                  ; m3 -= m2, giving m0 + m3 - m2 of the inputs
+%else
+    paddd             xmm6, xmm7
+%endif
+    paddd             xmm0, xmm2
+    paddd             xmm3, xmm5                ; xmm5: presumably the rounding constant, loaded by the caller - confirm
+    paddd             xmm2, xmm5
+%if notcpuflag(ssse3)
+    paddd             xmm6, xmm5
+%endif
+    paddd             xmm1, xmm3
+    paddd             xmm0, xmm3
+    paddd             xmm4, xmm2
+    psrad             xmm1, 14                  ; scale the dword sums back down by 14 bits
+    psrad             xmm0, 14
+    psrad             xmm4, 14
+%if cpuflag(ssse3)
+    pmulhrsw            m3, [pw_13377x2]        ; out2
+%else
+    psrad             xmm6, 14
+%endif
+    packssdw          xmm0, xmm0                ; narrow dwords to words with signed saturation
+    packssdw          xmm1, xmm1
+    packssdw          xmm4, xmm4
+%if notcpuflag(ssse3)
+    packssdw          xmm6, xmm6
+%endif
+    movdq2q             m0, xmm0                ; out3
+    movdq2q             m1, xmm1                ; out0
+    movdq2q             m2, xmm4                ; out1
+%if notcpuflag(ssse3)
+    movdq2q             m3, xmm6                ; out2
+%endif
+    SWAP                 0, 1, 2, 3
+%endmacro
diff --git a/libavcodec/x86/vp9lpf.asm b/libavcodec/x86/vp9lpf.asm
new file mode 100644
index 0000000..2c4fe21
--- /dev/null
+++ b/libavcodec/x86/vp9lpf.asm
@@ -0,0 +1,1139 @@
+;******************************************************************************
+;* VP9 loop filter SIMD optimizations
+;*
+;* Copyright (C) 2013-2014 Clément Bœsch <u pkh me>
+;* Copyright (C) 2014 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pb_3
+cextern pb_80
+
+pb_4:   times 16 db 0x04
+pb_10:  times 16 db 0x10
+pb_40:  times 16 db 0x40
+pb_81:  times 16 db 0x81
+pb_f8:  times 16 db 0xf8
+pb_fe:  times 16 db 0xfe
+pb_ff:  times 16 db 0xff
+
+cextern pw_4
+cextern pw_8
+
+; with mix functions, two 8-bit thresholds are stored in a 16-bit storage,
+; the following mask is used to splat both in the same register
+mask_mix: times 8 db 0                  ; low 8 lanes select source byte 0 (default pshufb control of SPLATB_MIX)
+          times 8 db 1                  ; high 8 lanes select source byte 1
+
+mask_mix84: times 8 db 0xff             ; low 8 bytes all ones
+            times 8 db 0x00             ; high 8 bytes zero
+mask_mix48: times 8 db 0x00             ; low 8 bytes zero
+            times 8 db 0xff             ; high 8 bytes all ones
+
+SECTION .text
+
+%macro SCRATCH 3 ; reg index, spare reg index (x86-64), stack address without brackets (x86-32)
+%if ARCH_X86_64
+    SWAP                %1, %2              ; 64-bit: park the value by exchanging register names
+%else
+    mova              [%3], m%1             ; 32-bit: spill the register to its stack slot
+%endif
+%endmacro
+
+%macro UNSCRATCH 3 ; reg index, spare reg index (x86-64), stack address without brackets (x86-32)
+%if ARCH_X86_64
+    SWAP                %1, %2              ; 64-bit: bring the parked value back by exchanging register names
+%else
+    mova               m%1, [%3]            ; 32-bit: reload the register from its stack slot
+%endif
+%endmacro
+
+; %1 = abs(%2-%3)
+%macro ABSSUB 4 ; dst, src1 (RO), src2 (RO), tmp
+%if ARCH_X86_64
+    psubusb             %1, %3, %2          ; dst = src2 - src1 with unsigned saturation
+    psubusb             %4, %2, %3          ; tmp = src1 - src2 with unsigned saturation
+%else
+    mova                %1, %3              ; 32-bit spells out the copies by hand
+    mova                %4, %2
+    psubusb             %1, %2
+    psubusb             %4, %3
+%endif
+    por                 %1, %4              ; one of the two differences is zero, so OR yields the absolute value
+%endmacro
+
+; %1 = %1>%2
+%macro CMP_GT 2-3 ; src/dst, cmp, pb_80
+%if %0 == 3
+    pxor                %1, %3              ; only src/dst is XORed here; presumably cmp is already biased the same way - confirm at call sites
+%endif
+    pcmpgtb             %1, %2              ; signed per-byte greater-than, result is a 0x00/0xff mask
+%endmacro
+
+; %1 = abs(%2-%3) > %4
+%macro ABSSUB_GT 5-6 [pb_80]; dst, src1, src2, cmp, tmp, [pb_80]
+    ABSSUB              %1, %2, %3, %5      ; dst = abs(src1-src2)
+    CMP_GT              %1, %4, %6          ; dst > cmp (the sixth argument defaults to [pb_80], so CMP_GT always gets three)
+%endmacro
+
+%macro MASK_APPLY 4 ; %1=new_data/dst %2=old_data %3=mask %4=tmp
+    pand                %1, %3              ; new &= mask
+    pandn               %4, %3, %2          ; tmp = ~mask & old
+    por                 %1, %4              ; new&mask | old&~mask (old_data and mask are left unmodified)
+%endmacro
+
+%macro UNPACK 4 ; l/h, dst, src1, src2: dst = punpck{l,h}bw(src1, src2)
+%if ARCH_X86_64
+    punpck%1bw          %2, %3, %4          ; three-operand form
+%else
+    mova                %2, %3              ; 32-bit: copy first, then unpack in place
+    punpck%1bw          %2, %4
+%endif
+%endmacro
+
+%macro FILTER_SUBx2_ADDx2 11 ; %1=dst %2=h/l %3=cache %4=stack_off %5=sub1 %6=sub2 %7=add1
+                             ; %8=add2 %9=rshift, [unpack], [unpack_is_mem_on_x86_32]
+    psubw               %3, [rsp+%4+%5*32]  ; running sum -= tap sub1 (taps sit in 32-byte stack slots)
+    psubw               %3, [rsp+%4+%6*32]  ; running sum -= tap sub2
+    paddw               %3, [rsp+%4+%7*32]  ; running sum += tap add1
+%ifnidn %10, ""
+%if %11 == 0
+    punpck%2bw          %1, %10, m0         ; widen the new tap; m0 supplies the high bytes (presumably zero)
+%else
+    UNPACK          %2, %1, %10, m0
+%endif
+    mova    [rsp+%4+%8*32], %1              ; store the freshly widened tap into slot add2
+    paddw               %3, %1              ; running sum += tap add2
+%else
+    paddw               %3, [rsp+%4+%8*32]  ; tap add2 is already in its slot
+%endif
+    psraw               %1, %3, %9          ; dst = sum >> rshift; the cache keeps the unshifted sum
+%endmacro
+
+; FIXME interleave l/h better (for instruction pairing)
+%macro FILTER_INIT 9 ; tmp1, tmp2, cacheL, cacheH, dstp, stack_off, filterid, mask, source
+    FILTER%7_INIT       %1, l, %3, %6 +  0  ; low half: dispatches to FILTER<filterid>_INIT
+    FILTER%7_INIT       %2, h, %4, %6 + 16  ; high half, 16 bytes further into each stack slot
+    packuswb            %1, %2              ; merge both halves back to bytes with unsigned saturation
+    MASK_APPLY          %1, %9, %8, %2      ; keep the filtered value where mask is set, source elsewhere
+    mova                %5, %1              ; write the result to dstp
+%endmacro
+
+
+%macro FILTER_UPDATE 12-16 "", "", "", 0 ; tmp1, tmp2, cacheL, cacheH, dstp, stack_off, -, -, +, +, rshift,
+                                         ; mask, [source], [unpack + src], [unpack_is_mem_on_x86_32]
+; FIXME interleave this properly with the subx2/addx2
+%ifnidn %15, ""
+%if %16 == 0 || ARCH_X86_64
+    mova               %14, %15             ; load the tap to be unpacked into the unpack register
+%endif
+%endif
+    FILTER_SUBx2_ADDx2  %1, l, %3, %6 +  0, %7, %8, %9, %10, %11, %14, %16 ; low half
+    FILTER_SUBx2_ADDx2  %2, h, %4, %6 + 16, %7, %8, %9, %10, %11, %14, %16 ; high half
+    packuswb            %1, %2              ; merge both halves back to bytes with unsigned saturation
+%ifnidn %13, ""
+    MASK_APPLY          %1, %13, %12, %2    ; blend against the explicit source argument
+%else
+    MASK_APPLY          %1, %5, %12, %2     ; no source given: blend against the current dstp contents
+%endif
+    mova                %5, %1              ; write the result to dstp
+%endmacro
+
+%macro SRSHIFT3B_2X 4 ; reg1, reg2, [pb_10], tmp
+    mova                %4, [pb_f8]
+    pand                %1, %4              ; clear the low 3 bits of every byte ...
+    pand                %2, %4
+    psrlq               %1, 3               ; ... so the 64-bit shift cannot leak bits into the neighbouring byte
+    psrlq               %2, 3
+    pxor                %1, %3              ; with 0x10 as third argument, xor then subtract sign-extends
+    pxor                %2, %3              ; the 5-bit result, i.e. a per-byte arithmetic >> 3
+    psubb               %1, %3
+    psubb               %2, %3
+%endmacro
+
+%macro EXTRACT_POS_NEG 3 ; i8, neg, pos
+    pxor                %3, %3                          ; pos = 0
+    pxor                %2, %2                          ; neg = 0
+    pcmpgtb             %3, %1                          ; i8 < 0 mask
+    psubb               %2, %1                          ; neg values (only the originally - will be kept)
+    pand                %2, %3                          ; negative values of i8 (but stored as +)
+    pandn               %3, %1                          ; positive values of i8
+%endmacro
+
+; clip_u8(u8 + i8)
+%macro SIGN_ADD 4 ; dst, u8, i8, tmp1
+    EXTRACT_POS_NEG     %3, %4, %1                      ; dst = positive part, tmp1 = magnitude of the negative part
+    paddusb             %1, %2                          ; add the positives
+    psubusb             %1, %4                          ; sub the negatives
+%endmacro
+
+; clip_u8(u8 - i8)
+%macro SIGN_SUB 4 ; dst, u8, i8, tmp1
+    EXTRACT_POS_NEG     %3, %1, %4                      ; dst = magnitude of the negative part, tmp1 = positive part
+    paddusb             %1, %2                          ; add the negatives
+    psubusb             %1, %4                          ; sub the positives
+%endmacro
+
+%macro FILTER6_INIT 4 ; %1=dst %2=h/l %3=cache, %4=stack_off
+    UNPACK          %2, %1, rp3, m0                     ; p3: B->W
+    mova     [rsp+%4+0*32], %1                          ; slot 0 = p3 as words
+    paddw               %3, %1, %1                      ; p3*2
+    paddw               %3, %1                          ; p3*3
+    punpck%2bw          %1, m1,  m0                     ; p2: B->W
+    mova     [rsp+%4+1*32], %1                          ; slot 1 = p2 as words
+    paddw               %3, %1                          ; p3*3 + p2
+    paddw               %3, %1                          ; p3*3 + p2*2
+    UNPACK          %2, %1, rp1, m0                     ; p1: B->W
+    mova     [rsp+%4+2*32], %1                          ; slot 2 = p1 as words
+    paddw               %3, %1                          ; p3*3 + p2*2 + p1
+    UNPACK          %2, %1, rp0, m0                     ; p0: B->W
+    mova     [rsp+%4+3*32], %1                          ; slot 3 = p0 as words
+    paddw               %3, %1                          ; p3*3 + p2*2 + p1 + p0
+    UNPACK          %2, %1, rq0, m0                     ; q0: B->W
+    mova     [rsp+%4+4*32], %1                          ; slot 4 = q0 as words
+    paddw               %3, %1                          ; p3*3 + p2*2 + p1 + p0 + q0
+    paddw               %3, [pw_4]                      ; p3*3 + p2*2 + p1 + p0 + q0 + 4
+    psraw               %1, %3, 3                       ; (p3*3 + p2*2 + p1 + p0 + q0 + 4) >> 3
+%endmacro
+
+%macro FILTER14_INIT 4 ; %1=dst %2=h/l %3=cache, %4=stack_off
+    punpck%2bw          %1, m2, m0                      ; p7: B->W
+    mova    [rsp+%4+ 8*32], %1                          ; slot 8 = p7 as words
+    psllw               %3, %1, 3                       ; p7*8
+    psubw               %3, %1                          ; p7*7
+    punpck%2bw          %1, m3, m0                      ; p6: B->W
+    mova    [rsp+%4+ 9*32], %1                          ; slot 9 = p6 as words
+    paddw               %3, %1                          ; p7*7 + p6
+    paddw               %3, %1                          ; p7*7 + p6*2
+    UNPACK          %2, %1, rp5, m0                     ; p5: B->W
+    mova    [rsp+%4+10*32], %1                          ; slot 10 = p5 as words
+    paddw               %3, %1                          ; p7*7 + p6*2 + p5
+    UNPACK          %2, %1, rp4, m0                     ; p4: B->W
+    mova    [rsp+%4+11*32], %1                          ; slot 11 = p4 as words
+    paddw               %3, %1                          ; p7*7 + p6*2 + p5 + p4
+    paddw               %3, [rsp+%4+ 0*32]              ; p7*7 + p6*2 + p5 + p4 + p3
+    paddw               %3, [rsp+%4+ 1*32]              ; p7*7 + p6*2 + p5 + .. + p2
+    paddw               %3, [rsp+%4+ 2*32]              ; p7*7 + p6*2 + p5 + .. + p1
+    paddw               %3, [rsp+%4+ 3*32]              ; p7*7 + p6*2 + p5 + .. + p0
+    paddw               %3, [rsp+%4+ 4*32]              ; p7*7 + p6*2 + p5 + .. + p0 + q0
+    paddw               %3, [pw_8]                      ; p7*7 + p6*2 + p5 + .. + p0 + q0 + 8
+    psraw               %1, %3, 4                       ; (p7*7 + p6*2 + p5 + .. + p0 + q0 + 8) >> 4
+%endmacro
+
+%macro TRANSPOSE16x16B 17 ; 1-16: register indices of the rows, 17: memory scratch for one spilled row
+    mova %17, m%16          ; spill row 16 so that its register can serve as the butterfly temporary
+    SBUTTERFLY bw,  %1,  %2,  %16
+    SBUTTERFLY bw,  %3,  %4,  %16
+    SBUTTERFLY bw,  %5,  %6,  %16
+    SBUTTERFLY bw,  %7,  %8,  %16
+    SBUTTERFLY bw,  %9,  %10, %16
+    SBUTTERFLY bw,  %11, %12, %16
+    SBUTTERFLY bw,  %13, %14, %16
+    mova m%16,  %17         ; restore row 16 ...
+    mova  %17, m%14         ; ... and spill row 14 instead for the remaining steps of this stage
+    SBUTTERFLY bw,  %15, %16, %14
+    SBUTTERFLY wd,  %1,  %3,  %14
+    SBUTTERFLY wd,  %2,  %4,  %14
+    SBUTTERFLY wd,  %5,  %7,  %14
+    SBUTTERFLY wd,  %6,  %8,  %14
+    SBUTTERFLY wd,  %9,  %11, %14
+    SBUTTERFLY wd,  %10, %12, %14
+    SBUTTERFLY wd,  %13, %15, %14
+    mova m%14,  %17         ; same rotation of the spilled row: now row 12
+    mova  %17, m%12
+    SBUTTERFLY wd,  %14, %16, %12
+    SBUTTERFLY dq,  %1,  %5,  %12
+    SBUTTERFLY dq,  %2,  %6,  %12
+    SBUTTERFLY dq,  %3,  %7,  %12
+    SBUTTERFLY dq,  %4,  %8,  %12
+    SBUTTERFLY dq,  %9,  %13, %12
+    SBUTTERFLY dq,  %10, %14, %12
+    SBUTTERFLY dq,  %11, %15, %12
+    mova m%12, %17          ; now row 8
+    mova  %17, m%8
+    SBUTTERFLY dq,  %12, %16, %8
+    SBUTTERFLY qdq, %1,  %9,  %8
+    SBUTTERFLY qdq, %2,  %10, %8
+    SBUTTERFLY qdq, %3,  %11, %8
+    SBUTTERFLY qdq, %4,  %12, %8
+    SBUTTERFLY qdq, %5,  %13, %8
+    SBUTTERFLY qdq, %6,  %14, %8
+    SBUTTERFLY qdq, %7,  %15, %8
+    mova m%8, %17           ; now row 1, for the last butterfly only
+    mova %17, m%1
+    SBUTTERFLY qdq, %8,  %16, %1
+    mova m%1, %17           ; every row is back in a register from here on
+    SWAP %2,  %9            ; the SWAPs below only exchange register names
+    SWAP %3,  %5
+    SWAP %4,  %13
+    SWAP %6,  %11
+    SWAP %8,  %15
+    SWAP %12, %14
+%endmacro
+
+%macro TRANSPOSE8x8B 13 ; 1-8: reg indices, 9: memory source of row 7, 10: a/u load suffix, 11: memory scratch, 12/13: memory outputs
+    SBUTTERFLY bw,  %1, %2, %7      ; row 7 is not loaded yet, so its register is free as the temporary
+    movdq%10 m%7, %9                ; now load row 7 from memory
+    movdqa %11, m%2                 ; spill register 2 so that it can be the temporary below
+    SBUTTERFLY bw,  %3, %4, %2
+    SBUTTERFLY bw,  %5, %6, %2
+    SBUTTERFLY bw,  %7, %8, %2
+    SBUTTERFLY wd,  %1, %3, %2
+    movdqa m%2, %11                 ; restore register 2 ...
+    movdqa %11, m%3                 ; ... and spill register 3 instead
+    SBUTTERFLY wd,  %2, %4, %3
+    SBUTTERFLY wd,  %5, %7, %3
+    SBUTTERFLY wd,  %6, %8, %3
+    SBUTTERFLY dq, %1, %5, %3
+    SBUTTERFLY dq, %2, %6, %3
+    movdqa m%3, %11                 ; restore register 3
+    movh   %12, m%2                 ; write the low 8 bytes of register 2 to output 12 ...
+    movhps %13, m%2                 ; ... and the high 8 bytes to output 13, freeing it as the temporary
+    SBUTTERFLY dq, %3, %7, %2
+    SBUTTERFLY dq, %4, %8, %2
+    SWAP %2, %5
+    SWAP %4, %7
+%endmacro
+
+%macro DEFINE_REAL_P7_TO_Q7 0-1 0 ; optional byte offset (default 0); P7..Q7 become 16 consecutive rows in the picture
+%define P7 dstq  + 4*mstrideq  + %1 ; dstq - 4*stride (mstride is the negated stride)
+%define P6 dstq  +   mstride3q + %1
+%define P5 dstq  + 2*mstrideq  + %1
+%define P4 dstq  +   mstrideq  + %1
+%define P3 dstq                + %1 ; dstq itself is the P3 row
+%define P2 dstq  +    strideq  + %1
+%define P1 dstq  + 2* strideq  + %1
+%define P0 dstq  +    stride3q + %1
+%define Q0 dstq  + 4* strideq  + %1
+%define Q1 dst2q +   mstride3q + %1 ; from here on rows are addressed relative to dst2q
+%define Q2 dst2q + 2*mstrideq  + %1
+%define Q3 dst2q +   mstrideq  + %1
+%define Q4 dst2q               + %1 ; dst2q itself is the Q4 row
+%define Q5 dst2q +    strideq  + %1
+%define Q6 dst2q + 2* strideq  + %1
+%define Q7 dst2q +    stride3q + %1
+%endmacro
+
+%macro DEFINE_TRANSPOSED_P7_TO_Q7 0-1 0 ; optional byte offset (default 0); P7..Q7 become 16-byte slots on the stack
+%define P3 rsp +   0 + %1 ; P3..Q3 occupy the first eight slots
+%define P2 rsp +  16 + %1
+%define P1 rsp +  32 + %1
+%define P0 rsp +  48 + %1
+%define Q0 rsp +  64 + %1
+%define Q1 rsp +  80 + %1
+%define Q2 rsp +  96 + %1
+%define Q3 rsp + 112 + %1
+%define P7 rsp + 128 + %1 ; P7..P4 follow, so slot order is not row order
+%define P6 rsp + 144 + %1
+%define P5 rsp + 160 + %1
+%define P4 rsp + 176 + %1
+%define Q4 rsp + 192 + %1 ; Q4..Q7 take the last four slots
+%define Q5 rsp + 208 + %1
+%define Q6 rsp + 224 + %1
+%define Q7 rsp + 240 + %1
+%endmacro
+
+; ..............AB -> AAAAAAAABBBBBBBB
+%macro SPLATB_MIX 1-2 [mask_mix] ; reg, pshufb control (only used on the ssse3 path)
+%if cpuflag(ssse3)
+    pshufb     %1, %2       ; a single byte shuffle does the whole splat
+%else
+    punpcklbw  %1, %1       ; each of the two low bytes is doubled ...
+    punpcklwd  %1, %1       ; ... then quadrupled ...
+    punpckldq  %1, %1       ; ... then repeated 8 times, filling one half of the register each
+%endif
+%endmacro
+
+%macro LOOPFILTER 5 ; %1=v/h %2=size1 %3+%4=stack, %5=32bit stack only
+%if UNIX64
+cglobal vp9_loop_filter_%1_%2_16, 5, 9, 16, %3 + %4, dst, stride, E, I, H, mstride, dst2, stride3, mstride3
+%else
+%if WIN64
+cglobal vp9_loop_filter_%1_%2_16, 4, 8, 16, %3 + %4, dst, stride, E, I, mstride, dst2, stride3, mstride3
+%else
+cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride, dst2, stride3, mstride3
+%define Ed dword r2m
+%define Id dword r3m
+%endif
+%define Hd dword r4m
+%endif
+
+    mov               mstrideq, strideq
+    neg               mstrideq
+
+    lea               stride3q, [strideq*3]
+    lea              mstride3q, [mstrideq*3]
+
+%ifidn %1, h
+%if %2 > 16
+%define movx movh
+    lea                   dstq, [dstq + 4*strideq - 4]
+%else
+%define movx movu
+    lea                   dstq, [dstq + 4*strideq - 8] ; go from top center (h pos) to center left (v pos)
+%endif
+    lea                  dst2q, [dstq + 8*strideq]
+%else
+    lea                   dstq, [dstq + 4*mstrideq]
+    lea                  dst2q, [dstq + 8*strideq]
+%endif
+
+    DEFINE_REAL_P7_TO_Q7
+
+%ifidn %1, h
+    movx                    m0, [P7]
+    movx                    m1, [P6]
+    movx                    m2, [P5]
+    movx                    m3, [P4]
+    movx                    m4, [P3]
+    movx                    m5, [P2]
+%if ARCH_X86_64 || %2 != 16
+    movx                    m6, [P1]
+%endif
+    movx                    m7, [P0]
+%if ARCH_X86_64
+    movx                    m8, [Q0]
+    movx                    m9, [Q1]
+    movx                   m10, [Q2]
+    movx                   m11, [Q3]
+    movx                   m12, [Q4]
+    movx                   m13, [Q5]
+    movx                   m14, [Q6]
+    movx                   m15, [Q7]
+    DEFINE_TRANSPOSED_P7_TO_Q7
+%if %2 == 16
+    TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp]
+    mova           [P7],  m0
+    mova           [P6],  m1
+    mova           [P5],  m2
+    mova           [P4],  m3
+%else ; %2 == 44/48/84/88
+    ; 8x16 transpose
+    punpcklbw        m0,  m1
+    punpcklbw        m2,  m3
+    punpcklbw        m4,  m5
+    punpcklbw        m6,  m7
+    punpcklbw        m8,  m9
+    punpcklbw       m10, m11
+    punpcklbw       m12, m13
+    punpcklbw       m14, m15
+    TRANSPOSE8x8W     0, 2, 4, 6, 8, 10, 12, 14, 15
+    SWAP              0,  4
+    SWAP              2,  5
+    SWAP              0,  6
+    SWAP              0,  7
+    SWAP             10,  9
+    SWAP             12, 10
+    SWAP             14, 11
+%endif ; %2
+    mova           [P3],  m4
+    mova           [P2],  m5
+    mova           [P1],  m6
+    mova           [P0],  m7
+    mova           [Q0],  m8
+    mova           [Q1],  m9
+    mova           [Q2], m10
+    mova           [Q3], m11
+%if %2 == 16
+    mova           [Q4], m12
+    mova           [Q5], m13
+    mova           [Q6], m14
+    mova           [Q7], m15
+%endif ; %2
+%else ; x86-32
+%if %2 == 16
+    TRANSPOSE8x8B    0, 1, 2, 3, 4, 5, 6, 7, [P1], u, [rsp+%3+%4], [rsp+64], [rsp+80]
+    DEFINE_TRANSPOSED_P7_TO_Q7
+    movh          [P7], m0
+    movh          [P5], m1
+    movh          [P3], m2
+    movh          [P1], m3
+    movh          [Q2], m5
+    movh          [Q4], m6
+    movh          [Q6], m7
+    movhps        [P6], m0
+    movhps        [P4], m1
+    movhps        [P2], m2
+    movhps        [P0], m3
+    movhps        [Q3], m5
+    movhps        [Q5], m6
+    movhps        [Q7], m7
+    DEFINE_REAL_P7_TO_Q7
+    movx                    m0, [Q0]
+    movx                    m1, [Q1]
+    movx                    m2, [Q2]
+    movx                    m3, [Q3]
+    movx                    m4, [Q4]
+    movx                    m5, [Q5]
+    movx                    m7, [Q7]
+    TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [Q6], u, [rsp+%3+%4], [rsp+72], [rsp+88]
+    DEFINE_TRANSPOSED_P7_TO_Q7 8
+    movh          [P7], m0
+    movh          [P5], m1
+    movh          [P3], m2
+    movh          [P1], m3
+    movh          [Q2], m5
+    movh          [Q4], m6
+    movh          [Q6], m7
+    movhps        [P6], m0
+    movhps        [P4], m1
+    movhps        [P2], m2
+    movhps        [P0], m3
+    movhps        [Q3], m5
+    movhps        [Q5], m6
+    movhps        [Q7], m7
+    DEFINE_TRANSPOSED_P7_TO_Q7
+%else ; %2 == 44/48/84/88
+    punpcklbw        m0, m1
+    punpcklbw        m2, m3
+    punpcklbw        m4, m5
+    punpcklbw        m6, m7
+    movx             m1, [Q0]
+    movx             m3, [Q1]
+    movx             m5, [Q2]
+    movx             m7, [Q3]
+    punpcklbw        m1, m3
+    punpcklbw        m5, m7
+    movx             m3, [Q4]
+    movx             m7, [Q5]
+    punpcklbw        m3, m7
+    mova          [rsp], m3
+    movx             m3, [Q6]
+    movx             m7, [Q7]
+    punpcklbw        m3, m7
+    DEFINE_TRANSPOSED_P7_TO_Q7
+    TRANSPOSE8x8W     0, 2, 4, 6, 1, 5, 7, 3, [rsp], [Q0], 1
+    mova           [P3],  m0
+    mova           [P2],  m2
+    mova           [P1],  m4
+    mova           [P0],  m6
+    mova           [Q1],  m5
+    mova           [Q2],  m7
+    mova           [Q3],  m3
+%endif ; %2
+%endif ; x86-32/64
+%endif ; %1 == h
+
+    ; calc fm mask
+%if %2 == 16
+%if cpuflag(ssse3)
+    pxor                m0, m0
+%endif
+    SPLATB_REG          m2, I, m0                       ; I I I I ...
+    SPLATB_REG          m3, E, m0                       ; E E E E ...
+%else
+%if cpuflag(ssse3)
+    mova                m0, [mask_mix]
+%endif
+    movd                m2, Id
+    movd                m3, Ed
+    SPLATB_MIX          m2, m0
+    SPLATB_MIX          m3, m0
+%endif
+    mova                m0, [pb_80]
+    pxor                m2, m0
+    pxor                m3, m0
+%if ARCH_X86_64
+%ifidn %1, v
+    mova                m8, [P3]
+    mova                m9, [P2]
+    mova               m10, [P1]
+    mova               m11, [P0]
+    mova               m12, [Q0]
+    mova               m13, [Q1]
+    mova               m14, [Q2]
+    mova               m15, [Q3]
+%else
+    ; In case of horizontal, P3..Q3 are already present in some registers due
+    ; to the previous transpose, so we just swap registers.
+    SWAP                 8,  4, 12
+    SWAP                 9,  5, 13
+    SWAP                10,  6, 14
+    SWAP                11,  7, 15
+%endif
+%define rp3 m8
+%define rp2 m9
+%define rp1 m10
+%define rp0 m11
+%define rq0 m12
+%define rq1 m13
+%define rq2 m14
+%define rq3 m15
+%else
+%define rp3 [P3]
+%define rp2 [P2]
+%define rp1 [P1]
+%define rp0 [P0]
+%define rq0 [Q0]
+%define rq1 [Q1]
+%define rq2 [Q2]
+%define rq3 [Q3]
+%endif
+    ABSSUB_GT           m5, rp3, rp2, m2, m7, m0        ; m5 = abs(p3-p2) <= I
+    ABSSUB_GT           m1, rp2, rp1, m2, m7, m0        ; m1 = abs(p2-p1) <= I
+    por                 m5, m1
+    ABSSUB_GT           m1, rp1, rp0, m2, m7, m0        ; m1 = abs(p1-p0) <= I
+    por                 m5, m1
+    ABSSUB_GT           m1, rq0, rq1, m2, m7, m0        ; m1 = abs(q1-q0) <= I
+    por                 m5, m1
+    ABSSUB_GT           m1, rq1, rq2, m2, m7, m0        ; m1 = abs(q2-q1) <= I
+    por                 m5, m1
+    ABSSUB_GT           m1, rq2, rq3, m2, m7, m0        ; m1 = abs(q3-q2) <= I
+    por                 m5, m1
+    ABSSUB              m1, rp0, rq0, m7                ; abs(p0-q0)
+    paddusb             m1, m1                          ; abs(p0-q0) * 2
+    ABSSUB              m2, rp1, rq1, m7                ; abs(p1-q1)
+    pand                m2, [pb_fe]                     ; drop lsb so shift can work
+    psrlq               m2, 1                           ; abs(p1-q1)/2
+    paddusb             m1, m2                          ; abs(p0-q0)*2 + abs(p1-q1)/2
+    pxor                m1, m0
+    pcmpgtb             m1, m3
+    por                 m1, m5                          ; fm final value
+    SWAP                 1, 3
+    pxor                m3, [pb_ff]
+
+    ; (m3: fm, m8..15: p3 p2 p1 p0 q0 q1 q2 q3)
+    ; calc flat8in (if not 44_16) and hev masks
+%if %2 != 44
+    mova                m6, [pb_81]                     ; [1 1 1 1 ...] ^ 0x80
+    ABSSUB_GT           m2, rp3, rp0, m6, m5            ; abs(p3 - p0) <= 1
+%if ARCH_X86_64
+    mova                m8, [pb_80]
+%define rb80 m8
+%else
+%define rb80 [pb_80]
+%endif
+    ABSSUB_GT           m1, rp2, rp0, m6, m5, rb80      ; abs(p2 - p0) <= 1
+    por                 m2, m1
+    ABSSUB              m4, rp1, rp0, m5                ; abs(p1 - p0)
+%if %2 == 16
+%if cpuflag(ssse3)
+    pxor                m0, m0
+%endif
+    SPLATB_REG          m7, H, m0                       ; H H H H ...
+%else
+    movd                m7, Hd
+    SPLATB_MIX          m7
+%endif
+    pxor                m7, rb80
+    pxor                m4, rb80
+    pcmpgtb             m0, m4, m7                      ; abs(p1 - p0) > H (1/2 hev condition)
+    CMP_GT              m4, m6                          ; abs(p1 - p0) <= 1
+    por                 m2, m4                          ; (flat8in)
+    ABSSUB              m4, rq1, rq0, m1                ; abs(q1 - q0)
+    pxor                m4, rb80
+    pcmpgtb             m5, m4, m7                      ; abs(q1 - q0) > H (2/2 hev condition)
+    por                 m0, m5                          ; hev final value
+    CMP_GT              m4, m6                          ; abs(q1 - q0) <= 1
+    por                 m2, m4                          ; (flat8in)
+    ABSSUB_GT           m1, rq2, rq0, m6, m5, rb80      ; abs(q2 - q0) <= 1
+    por                 m2, m1
+    ABSSUB_GT           m1, rq3, rq0, m6, m5, rb80      ; abs(q3 - q0) <= 1
+    por                 m2, m1                          ; flat8in final value
+    pxor                m2, [pb_ff]
+%if %2 == 84 || %2 == 48
+    pand                m2, [mask_mix%2]
+%endif
+%else
+    mova                m6, [pb_80]
+    movd                m7, Hd
+    SPLATB_MIX          m7
+    pxor                m7, m6
+    ABSSUB              m4, rp1, rp0, m1                ; abs(p1 - p0)
+    pxor                m4, m6
+    pcmpgtb             m0, m4, m7                      ; abs(p1 - p0) > H (1/2 hev condition)
+    ABSSUB              m4, rq1, rq0, m1                ; abs(q1 - q0)
+    pxor                m4, m6
+    pcmpgtb             m5, m4, m7                      ; abs(q1 - q0) > H (2/2 hev condition)
+    por                 m0, m5                          ; hev final value
+%endif
+
+%if %2 == 16
+    ; (m0: hev, m2: flat8in, m3: fm, m6: pb_81, m9..15: p2 p1 p0 q0 q1 q2 q3)
+    ; calc flat8out mask
+%if ARCH_X86_64
+    mova                m8, [P7]
+    mova                m9, [P6]
+%define rp7 m8
+%define rp6 m9
+%else
+%define rp7 [P7]
+%define rp6 [P6]
+%endif
+    ABSSUB_GT           m1, rp7, rp0, m6, m5            ; abs(p7 - p0) <= 1
+    ABSSUB_GT           m7, rp6, rp0, m6, m5            ; abs(p6 - p0) <= 1
+    por                 m1, m7
+%if ARCH_X86_64
+    mova                m8, [P5]
+    mova                m9, [P4]
+%define rp5 m8
+%define rp4 m9
+%else
+%define rp5 [P5]
+%define rp4 [P4]
+%endif
+    ABSSUB_GT           m7, rp5, rp0, m6, m5            ; abs(p5 - p0) <= 1
+    por                 m1, m7
+    ABSSUB_GT           m7, rp4, rp0, m6, m5            ; abs(p4 - p0) <= 1
+    por                 m1, m7
+%if ARCH_X86_64
+    mova                m14, [Q4]
+    mova                m15, [Q5]
+%define rq4 m14
+%define rq5 m15
+%else
+%define rq4 [Q4]
+%define rq5 [Q5]
+%endif
+    ABSSUB_GT           m7, rq4, rq0, m6, m5            ; abs(q4 - q0) <= 1
+    por                 m1, m7
+    ABSSUB_GT           m7, rq5, rq0, m6, m5            ; abs(q5 - q0) <= 1
+    por                 m1, m7
+%if ARCH_X86_64
+    mova                m14, [Q6]
+    mova                m15, [Q7]
+%define rq6 m14
+%define rq7 m15
+%else
+%define rq6 [Q6]
+%define rq7 [Q7]
+%endif
+    ABSSUB_GT           m7, rq6, rq0, m6, m5            ; abs(q6 - q0) <= 1
+    por                 m1, m7
+    ABSSUB_GT           m7, rq7, rq0, m6, m5            ; abs(q7 - q0) <= 1
+    por                 m1, m7                          ; flat8out final value
+    pxor                m1, [pb_ff]
+%endif
+
+    ; if (fm) {
+    ;     if (out && in) filter_14()
+    ;     else if (in)   filter_6()
+    ;     else if (hev)  filter_2()
+    ;     else           filter_4()
+    ; }
+    ;
+    ; f14:                                                                            fm &  out &  in
+    ; f6:  fm & ~f14 & in        => fm & ~(out & in) & in                          => fm & ~out &  in
+    ; f2:  fm & ~f14 & ~f6 & hev => fm & ~(out & in) & ~(~out & in) & hev          => fm &  ~in &  hev
+    ; f4:  fm & ~f14 & ~f6 & ~f2 => fm & ~(out & in) & ~(~out & in) & ~(~in & hev) => fm &  ~in & ~hev
+
+    ; (m0: hev, [m1: flat8out], [m2: flat8in], m3: fm, m8..15: p5 p4 p1 p0 q0 q1 q6 q7)
+    ; filter2()
+%if %2 != 44
+    mova                m6, [pb_80]                     ; already in m6 if 44_16
+    SCRATCH              2, 15, rsp+%3+%4
+%if %2 == 16
+    SCRATCH              1,  8, rsp+%3+%4+16
+%endif
+%endif
+    pxor                m2, m6, rq0                     ; q0 ^ 0x80
+    pxor                m4, m6, rp0                     ; p0 ^ 0x80
+    psubsb              m2, m4                          ; (signed) q0 - p0
+    pxor                m4, m6, rp1                     ; p1 ^ 0x80
+    pxor                m5, m6, rq1                     ; q1 ^ 0x80
+    psubsb              m4, m5                          ; (signed) p1 - q1
+    paddsb              m4, m2                          ;   (q0 - p0) + (p1 - q1)
+    paddsb              m4, m2                          ; 2*(q0 - p0) + (p1 - q1)
+    paddsb              m4, m2                          ; 3*(q0 - p0) + (p1 - q1)
+    paddsb              m6, m4, [pb_4]                  ; m6: f1 = clip(f + 4, 127)
+    paddsb              m4, [pb_3]                      ; m4: f2 = clip(f + 3, 127)
+%if ARCH_X86_64
+    mova                m14, [pb_10]                    ; will be reused in filter4()
+%define rb10 m14
+%else
+%define rb10 [pb_10]
+%endif
+    SRSHIFT3B_2X        m6, m4, rb10, m7                ; f1 and f2 sign byte shift by 3
+    SIGN_SUB            m7, rq0, m6, m5                 ; m7 = q0 - f1
+    SIGN_ADD            m1, rp0, m4, m5                 ; m1 = p0 + f2
+%if %2 != 44
+%if ARCH_X86_64
+    pandn               m6, m15, m3                     ;  ~mask(in) & mask(fm)
+%else
+    mova                m6, [rsp+%3+%4]
+    pandn               m6, m3
+%endif
+    pand                m6, m0                          ; (~mask(in) & mask(fm)) & mask(hev)
+%else
+    pand                m6, m3, m0
+%endif
+    MASK_APPLY          m7, rq0, m6, m5                 ; m7 = filter2(q0) & mask / we write it in filter4()
+    MASK_APPLY          m1, rp0, m6, m5                 ; m1 = filter2(p0) & mask / we write it in filter4()
+
+    ; (m0: hev, m1: p0', m2: q0-p0, m3: fm, m7: q0', [m8: flat8out], m10..13: p1 p0 q0 q1, m14: pb_10, [m15: flat8in], )
+    ; filter4()
+    mova                m4, m2
+    paddsb              m2, m4                          ; 2 * (q0 - p0)
+    paddsb              m2, m4                          ; 3 * (q0 - p0)
+    paddsb              m6, m2, [pb_4]                  ; m6:  f1 = clip(f + 4, 127)
+    paddsb              m2, [pb_3]                      ; m2: f2 = clip(f + 3, 127)
+    SRSHIFT3B_2X        m6, m2, rb10, m4                ; f1 and f2 sign byte shift by 3
+%if %2 != 44
+%if ARCH_X86_64
+    pandn               m5, m15, m3                     ;               ~mask(in) & mask(fm)
+%else
+    mova                m5, [rsp+%3+%4]
+    pandn               m5, m3
+%endif
+    pandn               m0, m5                          ; ~mask(hev) & (~mask(in) & mask(fm))
+%else
+    pandn               m0, m3
+%endif
+    SIGN_SUB            m5, rq0, m6, m4                 ; q0 - f1
+    MASK_APPLY          m5, m7, m0, m4                  ; filter4(q0) & mask
+    mova                [Q0], m5
+    SIGN_ADD            m7, rp0, m2, m4                 ; p0 + f2
+    MASK_APPLY          m7, m1, m0, m4                  ; filter4(p0) & mask
+    mova                [P0], m7
+    paddb               m6, [pb_80]                     ;
+    pxor                m1, m1                          ;   f=(f1+1)>>1
+    pavgb               m6, m1                          ;
+    psubb               m6, [pb_40]                     ;
+    SIGN_ADD            m1, rp1, m6, m2                 ; p1 + f
+    SIGN_SUB            m4, rq1, m6, m2                 ; q1 - f
+    MASK_APPLY          m1, rp1, m0, m2                 ; m1 = filter4(p1)
+    MASK_APPLY          m4, rq1, m0, m2                 ; m4 = filter4(q1)
+    mova                [P1], m1
+    mova                [Q1], m4
+
+%if %2 != 44
+    UNSCRATCH            2, 15, rsp+%3+%4
+%endif
+
+    ; ([m1: flat8out], m2: flat8in, m3: fm, m10..13: p1 p0 q0 q1)
+    ; filter6()
+%if %2 != 44
+    pxor                m0, m0
+%if %2 > 16
+    pand                m3, m2
+%else
+    pand                m2, m3                          ;               mask(fm) & mask(in)
+%if ARCH_X86_64
+    pandn               m3, m8, m2                      ; ~mask(out) & (mask(fm) & mask(in))
+%else
+    mova                m3, [rsp+%3+%4+16]
+    pandn               m3, m2
+%endif
+%endif
+%if ARCH_X86_64
+    mova               m14, [P3]
+    mova                m9, [Q3]
+%define rp3 m14
+%define rq3 m9
+%else
+%define rp3 [P3]
+%define rq3 [Q3]
+%endif
+    mova                m1, [P2]
+    FILTER_INIT         m4, m5, m6, m7, [P2], %4, 6,             m3,  m1             ; [p2]
+    mova                m1, [Q2]
+    FILTER_UPDATE       m4, m5, m6, m7, [P1], %4, 0, 1, 2, 5, 3, m3,  "", rq1, "", 1 ; [p1] -p3 -p2 +p1 +q1
+    FILTER_UPDATE       m4, m5, m6, m7, [P0], %4, 0, 2, 3, 6, 3, m3,  "", m1         ; [p0] -p3 -p1 +p0 +q2
+    FILTER_UPDATE       m4, m5, m6, m7, [Q0], %4, 0, 3, 4, 7, 3, m3,  "", rq3, "", 1 ; [q0] -p3 -p0 +q0 +q3
+    FILTER_UPDATE       m4, m5, m6, m7, [Q1], %4, 1, 4, 5, 7, 3, m3,  ""             ; [q1] -p2 -q0 +q1 +q3
+    FILTER_UPDATE       m4, m5, m6, m7, [Q2], %4, 2, 5, 6, 7, 3, m3,  m1             ; [q2] -p1 -q1 +q2 +q3
+%endif
+
+%if %2 == 16
+    UNSCRATCH            1,  8, rsp+%3+%4+16
+%endif
+
+    ; (m0: 0, [m1: flat8out], m2: fm & flat8in, m8..15: q2 q3 p1 p0 q0 q1 p3 p2)
+    ; filter14()
+    ;
+    ;                            m2  m3  m8  m9 m14 m15 m10 m11 m12 m13
+    ;
+    ;                                    q2  q3  p3  p2  p1  p0  q0  q1
+    ; p6  -7                     p7  p6  p5  p4   .   .   .   .   .
+    ; p5  -6  -p7 -p6 +p5 +q1     .   .   .                           .
+    ; p4  -5  -p7 -p5 +p4 +q2     .       .   .                      q2
+    ; p3  -4  -p7 -p4 +p3 +q3     .           .   .                  q3
+    ; p2  -3  -p7 -p3 +p2 +q4     .               .   .              q4
+    ; p1  -2  -p7 -p2 +p1 +q5     .                   .   .          q5
+    ; p0  -1  -p7 -p1 +p0 +q6     .                       .   .      q6
+    ; q0  +0  -p7 -p0 +q0 +q7     .                           .   .  q7
+    ; q1  +1  -p6 -q0 +q1 +q7    q1   .                           .   .
+    ; q2  +2  -p5 -q1 +q2 +q7     .  q2   .                           .
+    ; q3  +3  -p4 -q2 +q3 +q7         .  q3   .                       .
+    ; q4  +4  -p3 -q3 +q4 +q7             .  q4   .                   .
+    ; q5  +5  -p2 -q4 +q5 +q7                 .  q5   .               .
+    ; q6  +6  -p1 -q5 +q6 +q7                     .  q6   .           .
+
+%if %2 == 16
+    pand            m1, m2                                                              ; mask(out) & (mask(fm) & mask(in))
+    mova            m2, [P7]
+    mova            m3, [P6]
+%if ARCH_X86_64
+    mova            m8, [P5]
+    mova            m9, [P4]
+%define rp5 m8
+%define rp4 m9
+%define rp5s m8
+%define rp4s m9
+%define rp3s m14
+%define rq4 m8
+%define rq5 m9
+%define rq6 m14
+%define rq7 m15
+%define rq4s m8
+%define rq5s m9
+%define rq6s m14
+%else
+%define rp5 [P5]
+%define rp4 [P4]
+%define rp5s ""
+%define rp4s ""
+%define rp3s ""
+%define rq4 [Q4]
+%define rq5 [Q5]
+%define rq6 [Q6]
+%define rq7 [Q7]
+%define rq4s ""
+%define rq5s ""
+%define rq6s ""
+%endif
+    FILTER_INIT     m4, m5, m6, m7, [P6], %4, 14,                m1,  m3            ; [p6]
+    FILTER_UPDATE   m4, m5, m6, m7, [P5], %4,  8,  9, 10,  5, 4, m1, rp5s           ; [p5] -p7 -p6 +p5 +q1
+    FILTER_UPDATE   m4, m5, m6, m7, [P4], %4,  8, 10, 11,  6, 4, m1, rp4s           ; [p4] -p7 -p5 +p4 +q2
+    FILTER_UPDATE   m4, m5, m6, m7, [P3], %4,  8, 11,  0,  7, 4, m1, rp3s           ; [p3] -p7 -p4 +p3 +q3
+    FILTER_UPDATE   m4, m5, m6, m7, [P2], %4,  8,  0,  1, 12, 4, m1,  "", rq4, [Q4], 1 ; [p2] -p7 -p3 +p2 +q4
+    FILTER_UPDATE   m4, m5, m6, m7, [P1], %4,  8,  1,  2, 13, 4, m1,  "", rq5, [Q5], 1 ; [p1] -p7 -p2 +p1 +q5
+    FILTER_UPDATE   m4, m5, m6, m7, [P0], %4,  8,  2,  3, 14, 4, m1,  "", rq6, [Q6], 1 ; [p0] -p7 -p1 +p0 +q6
+    FILTER_UPDATE   m4, m5, m6, m7, [Q0], %4,  8,  3,  4, 15, 4, m1,  "", rq7, [Q7], 1 ; [q0] -p7 -p0 +q0 +q7
+    FILTER_UPDATE   m4, m5, m6, m7, [Q1], %4,  9,  4,  5, 15, 4, m1,  ""            ; [q1] -p6 -q0 +q1 +q7
+    FILTER_UPDATE   m4, m5, m6, m7, [Q2], %4, 10,  5,  6, 15, 4, m1,  ""            ; [q2] -p5 -q1 +q2 +q7
+    FILTER_UPDATE   m4, m5, m6, m7, [Q3], %4, 11,  6,  7, 15, 4, m1,  ""            ; [q3] -p4 -q2 +q3 +q7
+    FILTER_UPDATE   m4, m5, m6, m7, [Q4], %4,  0,  7, 12, 15, 4, m1, rq4s           ; [q4] -p3 -q3 +q4 +q7
+    FILTER_UPDATE   m4, m5, m6, m7, [Q5], %4,  1, 12, 13, 15, 4, m1, rq5s           ; [q5] -p2 -q4 +q5 +q7
+    FILTER_UPDATE   m4, m5, m6, m7, [Q6], %4,  2, 13, 14, 15, 4, m1, rq6s           ; [q6] -p1 -q5 +q6 +q7
+%endif
+
+%ifidn %1, h
+%if %2 == 16
+    mova                    m0, [P7]
+    mova                    m1, [P6]
+    mova                    m2, [P5]
+    mova                    m3, [P4]
+    mova                    m4, [P3]
+    mova                    m5, [P2]
+%if ARCH_X86_64
+    mova                    m6, [P1]
+%endif
+    mova                    m7, [P0]
+%if ARCH_X86_64
+    mova                    m8, [Q0]
+    mova                    m9, [Q1]
+    mova                   m10, [Q2]
+    mova                   m11, [Q3]
+    mova                   m12, [Q4]
+    mova                   m13, [Q5]
+    mova                   m14, [Q6]
+    mova                   m15, [Q7]
+    TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp]
+    DEFINE_REAL_P7_TO_Q7
+    movu  [P7],  m0
+    movu  [P6],  m1
+    movu  [P5],  m2
+    movu  [P4],  m3
+    movu  [P3],  m4
+    movu  [P2],  m5
+    movu  [P1],  m6
+    movu  [P0],  m7
+    movu  [Q0],  m8
+    movu  [Q1],  m9
+    movu  [Q2], m10
+    movu  [Q3], m11
+    movu  [Q4], m12
+    movu  [Q5], m13
+    movu  [Q6], m14
+    movu  [Q7], m15
+%else
+    DEFINE_REAL_P7_TO_Q7
+    TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [rsp+32], a, [rsp+%3+%4], [Q0], [Q1]
+    movh   [P7],  m0
+    movh   [P5],  m1
+    movh   [P3],  m2
+    movh   [P1],  m3
+    movh   [Q2],  m5
+    movh   [Q4],  m6
+    movh   [Q6],  m7
+    movhps [P6],  m0
+    movhps [P4],  m1
+    movhps [P2],  m2
+    movhps [P0],  m3
+    movhps [Q3],  m5
+    movhps [Q5],  m6
+    movhps [Q7],  m7
+    DEFINE_TRANSPOSED_P7_TO_Q7
+    mova                    m0, [Q0]
+    mova                    m1, [Q1]
+    mova                    m2, [Q2]
+    mova                    m3, [Q3]
+    mova                    m4, [Q4]
+    mova                    m5, [Q5]
+    mova                    m7, [Q7]
+    DEFINE_REAL_P7_TO_Q7 8
+    TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [rsp+224], a, [rsp+%3+%4], [Q0], [Q1]
+    movh   [P7],  m0
+    movh   [P5],  m1
+    movh   [P3],  m2
+    movh   [P1],  m3
+    movh   [Q2],  m5
+    movh   [Q4],  m6
+    movh   [Q6],  m7
+    movhps [P6],  m0
+    movhps [P4],  m1
+    movhps [P2],  m2
+    movhps [P0],  m3
+    movhps [Q3],  m5
+    movhps [Q5],  m6
+    movhps [Q7],  m7
+%endif
+%elif %2 == 44
+    SWAP 0, 1   ; m0 = p1
+    SWAP 1, 7   ; m1 = p0
+    SWAP 2, 5   ; m2 = q0
+    SWAP 3, 4   ; m3 = q1
+    DEFINE_REAL_P7_TO_Q7 2
+    SBUTTERFLY  bw, 0, 1, 4
+    SBUTTERFLY  bw, 2, 3, 4
+    SBUTTERFLY  wd, 0, 2, 4
+    SBUTTERFLY  wd, 1, 3, 4
+    movd  [P7], m0
+    movd  [P3], m2
+    movd  [Q0], m1
+    movd  [Q4], m3
+    psrldq  m0, 4
+    psrldq  m1, 4
+    psrldq  m2, 4
+    psrldq  m3, 4
+    movd  [P6], m0
+    movd  [P2], m2
+    movd  [Q1], m1
+    movd  [Q5], m3
+    psrldq  m0, 4
+    psrldq  m1, 4
+    psrldq  m2, 4
+    psrldq  m3, 4
+    movd  [P5], m0
+    movd  [P1], m2
+    movd  [Q2], m1
+    movd  [Q6], m3
+    psrldq  m0, 4
+    psrldq  m1, 4
+    psrldq  m2, 4
+    psrldq  m3, 4
+    movd  [P4], m0
+    movd  [P0], m2
+    movd  [Q3], m1
+    movd  [Q7], m3
+%else
+    ; the following code does a transpose of 8 full lines to 16 half
+    ; lines (high part). It is inlined to avoid the need for a staging area
+    mova                    m0, [P3]
+    mova                    m1, [P2]
+    mova                    m2, [P1]
+    mova                    m3, [P0]
+    mova                    m4, [Q0]
+    mova                    m5, [Q1]
+%if ARCH_X86_64
+    mova                    m6, [Q2]
+%endif
+    mova                    m7, [Q3]
+    DEFINE_REAL_P7_TO_Q7
+%if ARCH_X86_64
+    SBUTTERFLY  bw,  0,  1, 8
+    SBUTTERFLY  bw,  2,  3, 8
+    SBUTTERFLY  bw,  4,  5, 8
+    SBUTTERFLY  bw,  6,  7, 8
+    SBUTTERFLY  wd,  0,  2, 8
+    SBUTTERFLY  wd,  1,  3, 8
+    SBUTTERFLY  wd,  4,  6, 8
+    SBUTTERFLY  wd,  5,  7, 8
+    SBUTTERFLY  dq,  0,  4, 8
+    SBUTTERFLY  dq,  1,  5, 8
+    SBUTTERFLY  dq,  2,  6, 8
+    SBUTTERFLY  dq,  3,  7, 8
+%else
+    SBUTTERFLY  bw,  0,  1, 6
+    mova  [rsp+64], m1
+    mova        m6, [rsp+96]
+    SBUTTERFLY  bw,  2,  3, 1
+    SBUTTERFLY  bw,  4,  5, 1
+    SBUTTERFLY  bw,  6,  7, 1
+    SBUTTERFLY  wd,  0,  2, 1
+    mova  [rsp+96], m2
+    mova        m1, [rsp+64]
+    SBUTTERFLY  wd,  1,  3, 2
+    SBUTTERFLY  wd,  4,  6, 2
+    SBUTTERFLY  wd,  5,  7, 2
+    SBUTTERFLY  dq,  0,  4, 2
+    SBUTTERFLY  dq,  1,  5, 2
+    movh      [Q0], m1
+    movhps    [Q1], m1
+    mova        m2, [rsp+96]
+    SBUTTERFLY  dq,  2,  6, 1
+    SBUTTERFLY  dq,  3,  7, 1
+%endif
+    SWAP         3, 6
+    SWAP         1, 4
+    movh      [P7], m0
+    movhps    [P6], m0
+    movh      [P5], m1
+    movhps    [P4], m1
+    movh      [P3], m2
+    movhps    [P2], m2
+    movh      [P1], m3
+    movhps    [P0], m3
+%if ARCH_X86_64
+    movh      [Q0], m4
+    movhps    [Q1], m4
+%endif
+    movh      [Q2], m5
+    movhps    [Q3], m5
+    movh      [Q4], m6
+    movhps    [Q5], m6
+    movh      [Q6], m7
+    movhps    [Q7], m7
+%endif
+%endif
+
+    RET
+%endmacro
+
+%macro LPF_16_VH 5 ; %1 = filter size, %2..%4 = values forwarded to LOOPFILTER, %5 = instruction set passed to INIT_XMM; emits the v and the h function
+INIT_XMM %5
+LOOPFILTER v, %1, %2,  0, %4 ; vertical variant: a literal 0 is passed where the h variant passes %3
+LOOPFILTER h, %1, %2, %3, %4 ; horizontal variant: all four values are forwarded unchanged
+%endmacro
+
+%macro LPF_16_VH_ALL_OPTS 4 ; same first four arguments as LPF_16_VH; expands it once each for sse2, ssse3 and avx
+LPF_16_VH %1, %2, %3, %4, sse2
+LPF_16_VH %1, %2, %3, %4, ssse3
+LPF_16_VH %1, %2, %3, %4, avx
+%endmacro
+
+LPF_16_VH_ALL_OPTS 16, 512, 256, 32 ; size 16 (the "%2 == 16" paths in LOOPFILTER); other values are forwarded to LOOPFILTER
+LPF_16_VH_ALL_OPTS 44,   0, 128,  0 ; size 44 (the "%2 == 44" paths: no flat8in/filter6 code is assembled)
+LPF_16_VH_ALL_OPTS 48, 256, 128, 16 ; size 48 (flat8in is additionally masked with mask_mix48)
+LPF_16_VH_ALL_OPTS 84, 256, 128, 16 ; size 84 (flat8in is additionally masked with mask_mix84)
+LPF_16_VH_ALL_OPTS 88, 256, 128, 16 ; size 88
diff --git a/libavcodec/x86/vp9lpf_16bpp.asm b/libavcodec/x86/vp9lpf_16bpp.asm
new file mode 100644
index 0000000..c15437b
--- /dev/null
+++ b/libavcodec/x86/vp9lpf_16bpp.asm
@@ -0,0 +1,823 @@
+;******************************************************************************
+;* VP9 loop filter SIMD optimizations
+;*
+;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_511: times 16 dw 511 ; 16 words of 511; same value as %%maxsgn for 10 bpp in LOOP_FILTER
+pw_2047: times 16 dw 2047 ; 16 words of 2047; same value as %%maxsgn for 12 bpp in LOOP_FILTER
+pw_16384: times 16 dw 16384 ; 16 words of 16384
+pw_m512: times 16 dw -512 ; 16 words of -512; name matches %%minsgn (m512) for 10 bpp
+pw_m2048: times 16 dw -2048 ; 16 words of -2048; name matches %%minsgn (m2048) for 12 bpp
+
+cextern pw_1
+cextern pw_3
+cextern pw_4
+cextern pw_8
+cextern pw_16
+cextern pw_256
+cextern pw_1023
+cextern pw_4095
+cextern pw_m1
+
+SECTION .text
+
+%macro SCRATCH 3-4 ; %1 = register number to free, %2 = x86-64 partner register number, %3 = x86-32 spill address, %4 = optional alias suffix
+%if ARCH_X86_64
+    SWAP                %1, %2                      ; x86-64: exchange register numbers %1 and %2, nothing is stored to memory
+%if %0 == 4                                         ; 4th argument given: reg_<suffix> then refers to register m%2
+%define reg_%4 m%2
+%endif
+%else
+    mova              [%3], m%1                     ; x86-32: spill m%1 to the memory slot at [%3]
+%if %0 == 4                                         ; 4th argument given: reg_<suffix> then refers to that memory slot
+%define reg_%4 [%3]
+%endif
+%endif
+%endmacro
+
+%macro UNSCRATCH 3-4 ; counterpart of SCRATCH with the same argument layout: %1 = register number, %2 = x86-64 partner, %3 = x86-32 spill address, %4 = optional alias suffix
+%if ARCH_X86_64
+    SWAP                %1, %2                      ; x86-64: exchange register numbers %1 and %2 again
+%else
+    mova               m%1, [%3]                    ; x86-32: reload m%1 from the memory slot at [%3]
+%endif
+%if %0 == 4                                         ; 4th argument given: remove the reg_<suffix> alias on both architectures
+%undef reg_%4
+%endif
+%endmacro
+
+%macro PRELOAD 2-3 ; %1 = register number, %2 = memory operand (without brackets), %3 = optional alias suffix
+%if ARCH_X86_64
+    mova               m%1, [%2]                    ; x86-64: load [%2] into m%1 up front
+%if %0 == 3                                         ; 3rd argument given: reg_<suffix> refers to the loaded register
+%define reg_%3 m%1
+%endif
+%elif %0 == 3                                       ; x86-32: no load is emitted; reg_<suffix> refers to the memory operand itself
+%define reg_%3 [%2]
+%endif
+%endmacro
+
+; calculate p or q portion of flat8out (the comments below use the q naming)
+%macro FLAT8OUT_HALF 0 ; in: m0 = q0, m4..m7 = q4..q7, reg_F = threshold; out: m7; m2/m3 are handed to ABS2 as temporaries; m4..m6 are overwritten
+    psubw               m4, m0                      ; q4-q0
+    psubw               m5, m0                      ; q5-q0
+    psubw               m6, m0                      ; q6-q0
+    psubw               m7, m0                      ; q7-q0
+    ABS2                m4, m5, m2, m3              ; abs(q4-q0) | abs(q5-q0)
+    ABS2                m6, m7, m2, m3              ; abs(q6-q0) | abs(q7-q0)
+    pcmpgtw             m4, reg_F                   ; abs(q4-q0) > F
+    pcmpgtw             m5, reg_F                   ; abs(q5-q0) > F
+    pcmpgtw             m6, reg_F                   ; abs(q6-q0) > F
+    pcmpgtw             m7, reg_F                   ; abs(q7-q0) > F
+    por                 m5, m4                      ; q4 or q5 difference exceeds F
+    por                 m7, m6                      ; q6 or q7 difference exceeds F
+    por                 m7, m5                      ; !flat8out, q portion
+%endmacro
+
+; calculate p or q portion of flat8in/hev/fm (excluding mb_edge condition)
+%macro FLAT8IN_HALF 1 ; %1: the F (flat8in) comparisons are assembled only when %1 > 4; in: m0..m3 = q0..q3; out: m2 (> I), m7 (> H), m4 (> F, only if %1 > 4)
+%if %1 > 4
+    psubw               m4, m3, m0                  ; q3-q0
+    psubw               m5, m2, m0                  ; q2-q0
+    ABS2                m4, m5, m6, m7              ; abs(q3-q0) | abs(q2-q0)
+    pcmpgtw             m4, reg_F                   ; abs(q3-q0) > F
+    pcmpgtw             m5, reg_F                   ; abs(q2-q0) > F
+%endif
+    psubw               m3, m2                      ; q3-q2
+    psubw               m2, m1                      ; q2-q1
+    ABS2                m3, m2, m6, m7              ; abs(q3-q2) | abs(q2-q1)
+    pcmpgtw             m3, reg_I                   ; abs(q3-q2) > I
+    pcmpgtw             m2, reg_I                   ; abs(q2-q1) > I
+%if %1 > 4
+    por                 m4, m5                      ; m4 = (abs(q3-q0) > F) | (abs(q2-q0) > F)
+%endif
+    por                 m2, m3                      ; m2 = (abs(q3-q2) > I) | (abs(q2-q1) > I)
+    psubw               m3, m1, m0                  ; q1-q0
+    ABS1                m3, m5                      ; abs(q1-q0)
+%if %1 > 4
+    pcmpgtw             m6, m3, reg_F               ; abs(q1-q0) > F
+%endif
+    pcmpgtw             m7, m3, reg_H               ; abs(q1-q0) > H
+    pcmpgtw             m3, reg_I                   ; abs(q1-q0) > I
+%if %1 > 4
+    por                 m4, m6                      ; m4 = any of the three differences against q0 exceeds F
+%endif
+    por                 m2, m3                      ; m2 = any of the three neighbour differences exceeds I
+%endmacro
+
+; one step in filter_14/filter_6
+;
+; take sum $reg, downshift, apply mask and write into dst
+;
+; if sub2/add1-2 are present, add/sub as appropriate to prepare for the next
+; step's sum $reg. This is omitted for the last row in each filter.
+;
+; if dont_store is set, don't write the result into memory, instead keep the
+; values in register so we can write it out later
+%macro FILTER_STEP 6-10 "", "", "", 0 ; tmp, reg, mask, shift, dst, \
+                                      ; src/sub1, sub2, add1, add2, dont_store
+    psrlw               %1, %2, %4                  ; tmp = sum >> shift (filtered value)
+    psubw               %1, %6                      ; abs->delta
+%ifnidn %7, ""
+    psubw               %2, %6                      ; sub2 was given: sum -= sub1 ...
+    psubw               %2, %7                      ; ... sum -= sub2
+    paddw               %2, %8                      ; ... sum += add1
+    paddw               %2, %9                      ; ... sum += add2, now the sum for the next step
+%endif
+    pand                %1, reg_%3                  ; apply mask
+%if %10 == 1                                        ; dont_store: the result is accumulated into src (%6) itself
+    paddw               %6, %1                      ; delta->abs
+%else
+    paddw               %1, %6                      ; delta->abs
+    mova              [%5], %1                      ; write the result to dst
+%endif
+%endmacro
+
+; FIXME avx2 versions for 16_16 and mix2_{4,8}{4,8}
+
+%macro LOOP_FILTER 3 ; dir[h/v], wd[4/8/16], bpp[10/12]
+
+%if ARCH_X86_64
+%if %2 == 16
+%assign %%num_xmm_regs 16
+%elif %2 == 8
+%assign %%num_xmm_regs 15
+%else ; %2 == 4
+%assign %%num_xmm_regs 14
+%endif ; %2
+%assign %%bak_mem 0
+%else ; ARCH_X86_32
+%assign %%num_xmm_regs 8
+%if %2 == 16
+%assign %%bak_mem 7
+%elif %2 == 8
+%assign %%bak_mem 6
+%else ; %2 == 4
+%assign %%bak_mem 5
+%endif ; %2
+%endif ; ARCH_X86_64/32
+
+%if %2 == 16
+%ifidn %1, v
+%assign %%num_gpr_regs 6
+%else ; %1 == h
+%assign %%num_gpr_regs 5
+%endif ; %1
+%assign %%wd_mem 6
+%else ; %2 == 8/4
+%assign %%num_gpr_regs 5
+%if ARCH_X86_32 && %2 == 8
+%assign %%wd_mem 2
+%else ; ARCH_X86_64 || %2 == 4
+%assign %%wd_mem 0
+%endif ; ARCH_X86_64/32 etc.
+%endif ; %2
+
+%ifidn %1, v
+%assign %%tsp_mem 0
+%elif %2 == 16 ; && %1 == h
+%assign %%tsp_mem 16
+%else ; %1 == h && %2 == 8/4
+%assign %%tsp_mem 8
+%endif ; %1/%2
+
+%assign %%off %%wd_mem
+%assign %%tspoff %%bak_mem+%%wd_mem
+%assign %%stack_mem ((%%bak_mem+%%wd_mem+%%tsp_mem)*mmsize)
+
+%if %3 == 10
+%define %%maxsgn 511
+%define %%minsgn m512
+%define %%maxusgn 1023
+%define %%maxf 4
+%else ; %3 == 12
+%define %%maxsgn 2047
+%define %%minsgn m2048
+%define %%maxusgn 4095
+%define %%maxf 16
+%endif ; %3
+
+cglobal vp9_loop_filter_%1_%2_%3, 5, %%num_gpr_regs, %%num_xmm_regs, %%stack_mem, dst, stride, E, I, H
+    ; prepare E, I and H masks
+    shl                 Ed, %3-8
+    shl                 Id, %3-8
+    shl                 Hd, %3-8
+%if cpuflag(ssse3)
+    mova                m0, [pw_256]
+%endif
+    movd                m1, Ed
+    movd                m2, Id
+    movd                m3, Hd
+%if cpuflag(ssse3)
+    pshufb              m1, m0                      ; E << (bit_depth - 8)
+    pshufb              m2, m0                      ; I << (bit_depth - 8)
+    pshufb              m3, m0                      ; H << (bit_depth - 8)
+%else
+    punpcklwd           m1, m1
+    punpcklwd           m2, m2
+    punpcklwd           m3, m3
+    pshufd              m1, m1, q0000
+    pshufd              m2, m2, q0000
+    pshufd              m3, m3, q0000
+%endif
+    SCRATCH              1,  8, rsp+(%%off+0)*mmsize,  E
+    SCRATCH              2,  9, rsp+(%%off+1)*mmsize,  I
+    SCRATCH              3, 10, rsp+(%%off+2)*mmsize,  H
+%if %2 > 4
+    PRELOAD                 11, pw_ %+ %%maxf, F
+%endif
+
+    ; set up variables to load data
+%ifidn %1, v
+    DEFINE_ARGS dst8, stride, stride3, dst0, dst4, dst12
+    lea           stride3q, [strideq*3]
+    neg            strideq
+%if %2 == 16
+    lea              dst0q, [dst8q+strideq*8]
+%else
+    lea              dst4q, [dst8q+strideq*4]
+%endif
+    neg            strideq
+%if %2 == 16
+    lea             dst12q, [dst8q+strideq*4]
+    lea              dst4q, [dst0q+strideq*4]
+%endif
+
+%if %2 == 16
+%define %%p7 dst0q
+%define %%p6 dst0q+strideq
+%define %%p5 dst0q+strideq*2
+%define %%p4 dst0q+stride3q
+%endif
+%define %%p3 dst4q
+%define %%p2 dst4q+strideq
+%define %%p1 dst4q+strideq*2
+%define %%p0 dst4q+stride3q
+%define %%q0 dst8q
+%define %%q1 dst8q+strideq
+%define %%q2 dst8q+strideq*2
+%define %%q3 dst8q+stride3q
+%if %2 == 16
+%define %%q4 dst12q
+%define %%q5 dst12q+strideq
+%define %%q6 dst12q+strideq*2
+%define %%q7 dst12q+stride3q
+%endif
+%else ; %1 == h
+    DEFINE_ARGS dst0, stride, stride3, dst4
+    lea           stride3q, [strideq*3]
+    lea              dst4q, [dst0q+strideq*4]
+
+%define %%p3 rsp+(%%tspoff+0)*mmsize
+%define %%p2 rsp+(%%tspoff+1)*mmsize
+%define %%p1 rsp+(%%tspoff+2)*mmsize
+%define %%p0 rsp+(%%tspoff+3)*mmsize
+%define %%q0 rsp+(%%tspoff+4)*mmsize
+%define %%q1 rsp+(%%tspoff+5)*mmsize
+%define %%q2 rsp+(%%tspoff+6)*mmsize
+%define %%q3 rsp+(%%tspoff+7)*mmsize
+
+%if %2 < 16
+    movu                m0, [dst0q+strideq*0-8]
+    movu                m1, [dst0q+strideq*1-8]
+    movu                m2, [dst0q+strideq*2-8]
+    movu                m3, [dst0q+stride3q -8]
+    movu                m4, [dst4q+strideq*0-8]
+    movu                m5, [dst4q+strideq*1-8]
+    movu                m6, [dst4q+strideq*2-8]
+    movu                m7, [dst4q+stride3q -8]
+
+%if ARCH_X86_64
+    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
+%else
+    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [%%p0], [%%q0]
+%endif
+
+    mova            [%%p3], m0
+    mova            [%%p2], m1
+    mova            [%%p1], m2
+    mova            [%%p0], m3
+%if ARCH_X86_64
+    mova            [%%q0], m4
+%endif
+    mova            [%%q1], m5
+    mova            [%%q2], m6
+    mova            [%%q3], m7
+
+    ; FIXME investigate if we can _not_ load q0-3 below if h, and adjust register
+    ; order here accordingly
+%else ; %2 == 16
+
+%define %%p7 rsp+(%%tspoff+ 8)*mmsize
+%define %%p6 rsp+(%%tspoff+ 9)*mmsize
+%define %%p5 rsp+(%%tspoff+10)*mmsize
+%define %%p4 rsp+(%%tspoff+11)*mmsize
+%define %%q4 rsp+(%%tspoff+12)*mmsize
+%define %%q5 rsp+(%%tspoff+13)*mmsize
+%define %%q6 rsp+(%%tspoff+14)*mmsize
+%define %%q7 rsp+(%%tspoff+15)*mmsize
+
+    mova                m0, [dst0q+strideq*0-16]
+    mova                m1, [dst0q+strideq*1-16]
+    mova                m2, [dst0q+strideq*2-16]
+    mova                m3, [dst0q+stride3q -16]
+    mova                m4, [dst4q+strideq*0-16]
+    mova                m5, [dst4q+strideq*1-16]
+%if ARCH_X86_64
+    mova                m6, [dst4q+strideq*2-16]
+%endif
+    mova                m7, [dst4q+stride3q -16]
+
+%if ARCH_X86_64
+    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
+%else
+    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2-16], [%%p3], 1
+%endif
+
+    mova            [%%p7], m0
+    mova            [%%p6], m1
+    mova            [%%p5], m2
+    mova            [%%p4], m3
+%if ARCH_X86_64
+    mova            [%%p3], m4
+%endif
+    mova            [%%p2], m5
+    mova            [%%p1], m6
+    mova            [%%p0], m7
+
+    mova                m0, [dst0q+strideq*0]
+    mova                m1, [dst0q+strideq*1]
+    mova                m2, [dst0q+strideq*2]
+    mova                m3, [dst0q+stride3q ]
+    mova                m4, [dst4q+strideq*0]
+    mova                m5, [dst4q+strideq*1]
+%if ARCH_X86_64
+    mova                m6, [dst4q+strideq*2]
+%endif
+    mova                m7, [dst4q+stride3q ]
+
+%if ARCH_X86_64
+    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
+%else
+    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2], [%%q4], 1
+%endif
+
+    mova            [%%q0], m0
+    mova            [%%q1], m1
+    mova            [%%q2], m2
+    mova            [%%q3], m3
+%if ARCH_X86_64
+    mova            [%%q4], m4
+%endif
+    mova            [%%q5], m5
+    mova            [%%q6], m6
+    mova            [%%q7], m7
+
+    ; FIXME investigate if we can _not_ load q0|q4-7 below if h, and adjust register
+    ; order here accordingly
+%endif ; %2
+%endif ; %1
+
+    ; load q0|q4-7 data
+    mova                m0, [%%q0]
+%if %2 == 16
+    mova                m4, [%%q4]
+    mova                m5, [%%q5]
+    mova                m6, [%%q6]
+    mova                m7, [%%q7]
+
+    ; flat8out q portion
+    FLAT8OUT_HALF
+    SCRATCH              7, 15, rsp+(%%off+6)*mmsize, F8O
+%endif
+
+    ; load q1-3 data
+    mova                m1, [%%q1]
+    mova                m2, [%%q2]
+    mova                m3, [%%q3]
+
+    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
+    ; r9[m15]=!flatout[q]
+    ; m12-14=free
+    ; m0-3=q0-q3
+    ; m4-7=free
+
+    ; flat8in|fm|hev q portion
+    FLAT8IN_HALF        %2
+    SCRATCH              7, 13, rsp+(%%off+4)*mmsize, HEV
+%if %2 > 4
+    SCRATCH              4, 14, rsp+(%%off+5)*mmsize, F8I
+%endif
+
+    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
+    ; r9[m15]=!flat8out[q]
+    ; r10[m13]=hev[q]
+    ; r11[m14]=!flat8in[q]
+    ; m2=!fm[q]
+    ; m0,1=q0-q1
+    ; m2-7=free
+    ; m12=free
+
+    ; load p0-1
+    mova                m3, [%%p0]
+    mova                m4, [%%p1]
+
+    ; fm mb_edge portion
+    psubw               m5, m3, m0                  ; q0-p0
+    psubw               m6, m4, m1                  ; q1-p1
+%if ARCH_X86_64
+    ABS2                m5, m6, m7, m12             ; abs(q0-p0) | abs(q1-p1)
+%else
+    ABS1                m5, m7                      ; abs(q0-p0)
+    ABS1                m6, m7                      ; abs(q1-p1)
+%endif
+    paddw               m5, m5
+    psraw               m6, 1
+    paddw               m6, m5                      ; abs(q0-p0)*2+(abs(q1-p1)>>1)
+    pcmpgtw             m6, reg_E
+    por                 m2, m6
+    SCRATCH              2, 12, rsp+(%%off+3)*mmsize, FM
+
+    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
+    ; r9[m15]=!flat8out[q]
+    ; r10[m13]=hev[q]
+    ; r11[m14]=!flat8in[q]
+    ; r12[m12]=!fm[q]
+    ; m3-4=p0-1
+    ; m0-2/5-7=free
+
+    ; load p4-7 data
+    SWAP                 3, 0                       ; p0
+    SWAP                 4, 1                       ; p1
+%if %2 == 16
+    mova                m7, [%%p7]
+    mova                m6, [%%p6]
+    mova                m5, [%%p5]
+    mova                m4, [%%p4]
+
+    ; flat8out p portion
+    FLAT8OUT_HALF
+    por                 m7, reg_F8O
+    SCRATCH              7, 15, rsp+(%%off+6)*mmsize, F8O
+%endif
+
+    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
+    ; r9[m15]=!flat8out
+    ; r10[m13]=hev[q]
+    ; r11[m14]=!flat8in[q]
+    ; r12[m12]=!fm[q]
+    ; m0=p0
+    ; m1-7=free
+
+    ; load p2-3 data
+    mova                m2, [%%p2]
+    mova                m3, [%%p3]
+
+    ; flat8in|fm|hev p portion
+    FLAT8IN_HALF        %2
+    por                 m7, reg_HEV
+%if %2 > 4
+    por                 m4, reg_F8I
+%endif
+    por                 m2, reg_FM
+%if %2 > 4
+    por                 m4, m2                      ; !flat8|!fm
+%if %2 == 16
+    por                 m5, m4, reg_F8O             ; !flat16|!fm
+    pandn               m2, m4                      ; filter4_mask
+    pandn               m4, m5                      ; filter8_mask
+    pxor                m5, [pw_m1]                 ; filter16_mask
+    SCRATCH              5, 15, rsp+(%%off+6)*mmsize, F16M
+%else
+    pandn               m2, m4                      ; filter4_mask
+    pxor                m4, [pw_m1]                 ; filter8_mask
+%endif
+    SCRATCH              4, 14, rsp+(%%off+5)*mmsize, F8M
+%else
+    pxor                m2, [pw_m1]                 ; filter4_mask
+%endif
+    SCRATCH              7, 13, rsp+(%%off+4)*mmsize, HEV
+    SCRATCH              2, 12, rsp+(%%off+3)*mmsize, F4M
+
+    ; r9[m15]=filter16_mask
+    ; r10[m13]=hev
+    ; r11[m14]=filter8_mask
+    ; r12[m12]=filter4_mask
+    ; m0,1=p0-p1
+    ; m2-7=free
+    ; m8-11=free
+
+%if %2 > 4
+%if %2 == 16
+    ; filter_14
+    mova                m2, [%%p7]
+    mova                m3, [%%p6]
+    mova                m6, [%%p5]
+    mova                m7, [%%p4]
+    PRELOAD              8, %%p3, P3
+    PRELOAD              9, %%p2, P2
+%endif
+    PRELOAD             10, %%q0, Q0
+    PRELOAD             11, %%q1, Q1
+%if %2 == 16
+    psllw               m4, m2, 3
+    paddw               m5, m3, m3
+    paddw               m4, m6
+    paddw               m5, m7
+    paddw               m4, reg_P3
+    paddw               m5, reg_P2
+    paddw               m4, m1
+    paddw               m5, m0
+    paddw               m4, reg_Q0                  ; q0+p1+p3+p5+p7*8
+    psubw               m5, m2                      ; p0+p2+p4+p6*2-p7
+    paddw               m4, [pw_8]
+    paddw               m5, m4                      ; q0+p0+p1+p2+p3+p4+p5+p6*2+p7*7+8
+
+    ; below, we use r0-5 for storing pre-filter pixels for subsequent subtraction
+    ; at the end of the filter
+
+    mova    [rsp+0*mmsize], m3
+    FILTER_STEP         m4, m5, F16M, 4, %%p6, m3,     m2,             m6,     reg_Q1
+%endif
+    mova                m3, [%%q2]
+%if %2 == 16
+    mova    [rsp+1*mmsize], m6
+    FILTER_STEP         m4, m5, F16M, 4, %%p5, m6,     m2,             m7,     m3
+%endif
+    mova                m6, [%%q3]
+%if %2 == 16
+    mova    [rsp+2*mmsize], m7
+    FILTER_STEP         m4, m5, F16M, 4, %%p4, m7,     m2,             reg_P3, m6
+    mova                m7, [%%q4]
+%if ARCH_X86_64
+    mova    [rsp+3*mmsize], reg_P3
+%else
+    mova                m4, reg_P3
+    mova    [rsp+3*mmsize], m4
+%endif
+    FILTER_STEP         m4, m5, F16M, 4, %%p3, reg_P3, m2,             reg_P2, m7
+    PRELOAD              8, %%q5, Q5
+%if ARCH_X86_64
+    mova    [rsp+4*mmsize], reg_P2
+%else
+    mova                m4, reg_P2
+    mova    [rsp+4*mmsize], m4
+%endif
+    FILTER_STEP         m4, m5, F16M, 4, %%p2, reg_P2, m2,             m1,     reg_Q5
+    PRELOAD              9, %%q6, Q6
+    mova    [rsp+5*mmsize], m1
+    FILTER_STEP         m4, m5, F16M, 4, %%p1, m1,     m2,             m0,     reg_Q6
+    mova                m1, [%%q7]
+    FILTER_STEP         m4, m5, F16M, 4, %%p0, m0,     m2,             reg_Q0, m1,     1
+    FILTER_STEP         m4, m5, F16M, 4, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m1,     ARCH_X86_64
+    FILTER_STEP         m4, m5, F16M, 4, %%q1, reg_Q1, [rsp+1*mmsize], m3,     m1,     ARCH_X86_64
+    FILTER_STEP         m4, m5, F16M, 4, %%q2, m3,     [rsp+2*mmsize], m6,     m1,     1
+    FILTER_STEP         m4, m5, F16M, 4, %%q3, m6,     [rsp+3*mmsize], m7,     m1
+    FILTER_STEP         m4, m5, F16M, 4, %%q4, m7,     [rsp+4*mmsize], reg_Q5, m1
+    FILTER_STEP         m4, m5, F16M, 4, %%q5, reg_Q5, [rsp+5*mmsize], reg_Q6, m1
+    FILTER_STEP         m4, m5, F16M, 4, %%q6, reg_Q6
+
+    mova                m7, [%%p1]
+%else
+    SWAP                 1, 7
+%endif
+
+    mova                m2, [%%p3]
+    mova                m1, [%%p2]
+
+    ; reg_Q0-1 (m10-m11)
+    ; m0=p0
+    ; m1=p2
+    ; m2=p3
+    ; m3=q2
+    ; m4-5=free
+    ; m6=q3
+    ; m7=p1
+    ; m8-9 unused
+
+    ; filter_6
+    psllw               m4, m2, 2
+    paddw               m5, m1, m1
+    paddw               m4, m7
+    psubw               m5, m2
+    paddw               m4, m0
+    paddw               m5, reg_Q0
+    paddw               m4, [pw_4]
+    paddw               m5, m4
+
+%if ARCH_X86_64
+    mova                m8, m1
+    mova                m9, m7
+%else
+    mova    [rsp+0*mmsize], m1
+    mova    [rsp+1*mmsize], m7
+%endif
+%ifidn %1, v
+    FILTER_STEP         m4, m5, F8M, 3, %%p2, m1,     m2,             m7,     reg_Q1
+%else
+    FILTER_STEP         m4, m5, F8M, 3, %%p2, m1,     m2,             m7,     reg_Q1, 1
+%endif
+    FILTER_STEP         m4, m5, F8M, 3, %%p1, m7,     m2,             m0,     m3, 1
+    FILTER_STEP         m4, m5, F8M, 3, %%p0, m0,     m2,             reg_Q0, m6, 1
+%if ARCH_X86_64
+    FILTER_STEP         m4, m5, F8M, 3, %%q0, reg_Q0, m8,             reg_Q1, m6, ARCH_X86_64
+    FILTER_STEP         m4, m5, F8M, 3, %%q1, reg_Q1, m9,             m3,     m6, ARCH_X86_64
+%else
+    FILTER_STEP         m4, m5, F8M, 3, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m6, ARCH_X86_64
+    FILTER_STEP         m4, m5, F8M, 3, %%q1, reg_Q1, [rsp+1*mmsize], m3,     m6, ARCH_X86_64
+%endif
+    FILTER_STEP         m4, m5, F8M, 3, %%q2, m3
+
+    UNSCRATCH            2, 10, %%q0
+    UNSCRATCH            6, 11, %%q1
+%else
+    SWAP                 1, 7
+    mova                m2, [%%q0]
+    mova                m6, [%%q1]
+%endif
+    UNSCRATCH            3, 13, rsp+(%%off+4)*mmsize, HEV
+
+    ; m0=p0
+    ; m1=p2
+    ; m2=q0
+    ; m3=hev_mask
+    ; m4-5=free
+    ; m6=q1
+    ; m7=p1
+
+    ; filter_4
+    psubw               m4, m7, m6              ; p1-q1
+    psubw               m5, m2, m0              ; q0-p0
+    pand                m4, m3
+    pminsw              m4, [pw_ %+ %%maxsgn]
+    pmaxsw              m4, [pw_ %+ %%minsgn]   ; clip_intp2(p1-q1, 9) -> f
+    paddw               m4, m5
+    paddw               m5, m5
+    paddw               m4, m5                  ; 3*(q0-p0)+f
+    pminsw              m4, [pw_ %+ %%maxsgn]
+    pmaxsw              m4, [pw_ %+ %%minsgn]   ; clip_intp2(3*(q0-p0)+f, 9) -> f
+    pand                m4, reg_F4M
+    paddw               m5, m4, [pw_4]
+    paddw               m4, [pw_3]
+    pminsw              m5, [pw_ %+ %%maxsgn]
+    pminsw              m4, [pw_ %+ %%maxsgn]
+    psraw               m5, 3                   ; min_intp2(f+4, 9)>>3 -> f1
+    psraw               m4, 3                   ; min_intp2(f+3, 9)>>3 -> f2
+    psubw               m2, m5                  ; q0-f1
+    paddw               m0, m4                  ; p0+f2
+    pandn               m3, m5                  ; f1 & !hev (for p1/q1 adj)
+    pxor                m4, m4
+    mova                m5, [pw_ %+ %%maxusgn]
+    pmaxsw              m2, m4
+    pmaxsw              m0, m4
+    pminsw              m2, m5
+    pminsw              m0, m5
+%if cpuflag(ssse3)
+    pmulhrsw            m3, [pw_16384]          ; (f1+1)>>1
+%else
+    paddw               m3, [pw_1]
+    psraw               m3, 1
+%endif
+    paddw               m7, m3                  ; p1+f
+    psubw               m6, m3                  ; q1-f
+    pmaxsw              m7, m4
+    pmaxsw              m6, m4
+    pminsw              m7, m5
+    pminsw              m6, m5
+
+    ; store
+%ifidn %1, v
+    mova            [%%p1], m7
+    mova            [%%p0], m0
+    mova            [%%q0], m2
+    mova            [%%q1], m6
+%else ; %1 == h
+%if %2 == 4
+    TRANSPOSE4x4W        7, 0, 2, 6, 1
+    movh   [dst0q+strideq*0-4], m7
+    movhps [dst0q+strideq*1-4], m7
+    movh   [dst0q+strideq*2-4], m0
+    movhps [dst0q+stride3q -4], m0
+    movh   [dst4q+strideq*0-4], m2
+    movhps [dst4q+strideq*1-4], m2
+    movh   [dst4q+strideq*2-4], m6
+    movhps [dst4q+stride3q -4], m6
+%elif %2 == 8
+    mova                m3, [%%p3]
+    mova                m4, [%%q2]
+    mova                m5, [%%q3]
+
+%if ARCH_X86_64
+    TRANSPOSE8x8W        3, 1, 7, 0, 2, 6, 4, 5, 8
+%else
+    TRANSPOSE8x8W        3, 1, 7, 0, 2, 6, 4, 5, [%%q2], [%%q0], 1
+    mova                m2, [%%q0]
+%endif
+
+    movu [dst0q+strideq*0-8], m3
+    movu [dst0q+strideq*1-8], m1
+    movu [dst0q+strideq*2-8], m7
+    movu [dst0q+stride3q -8], m0
+    movu [dst4q+strideq*0-8], m2
+    movu [dst4q+strideq*1-8], m6
+    movu [dst4q+strideq*2-8], m4
+    movu [dst4q+stride3q -8], m5
+%else ; %2 == 16
+    SCRATCH              2, 8, %%q0
+    SCRATCH              6, 9, %%q1
+    mova                m2, [%%p7]
+    mova                m3, [%%p6]
+    mova                m4, [%%p5]
+    mova                m5, [%%p4]
+    mova                m6, [%%p3]
+
+%if ARCH_X86_64
+    TRANSPOSE8x8W        2, 3, 4, 5, 6, 1, 7, 0, 10
+%else
+    mova            [%%p1], m7
+    TRANSPOSE8x8W        2, 3, 4, 5, 6, 1, 7, 0, [%%p1], [dst4q+strideq*0-16], 1
+%endif
+
+    mova [dst0q+strideq*0-16], m2
+    mova [dst0q+strideq*1-16], m3
+    mova [dst0q+strideq*2-16], m4
+    mova [dst0q+stride3q -16], m5
+%if ARCH_X86_64
+    mova [dst4q+strideq*0-16], m6
+%endif
+    mova [dst4q+strideq*1-16], m1
+    mova [dst4q+strideq*2-16], m7
+    mova [dst4q+stride3q -16], m0
+
+    UNSCRATCH            2, 8, %%q0
+    UNSCRATCH            6, 9, %%q1
+    mova                m0, [%%q2]
+    mova                m1, [%%q3]
+    mova                m3, [%%q4]
+    mova                m4, [%%q5]
+%if ARCH_X86_64
+    mova                m5, [%%q6]
+%endif
+    mova                m7, [%%q7]
+
+%if ARCH_X86_64
+    TRANSPOSE8x8W        2, 6, 0, 1, 3, 4, 5, 7, 8
+%else
+    TRANSPOSE8x8W        2, 6, 0, 1, 3, 4, 5, 7, [%%q6], [dst4q+strideq*0], 1
+%endif
+
+    mova [dst0q+strideq*0], m2
+    mova [dst0q+strideq*1], m6
+    mova [dst0q+strideq*2], m0
+    mova [dst0q+stride3q ], m1
+%if ARCH_X86_64
+    mova [dst4q+strideq*0], m3
+%endif
+    mova [dst4q+strideq*1], m4
+    mova [dst4q+strideq*2], m5
+    mova [dst4q+stride3q ], m7
+%endif ; %2
+%endif ; %1
+    RET
+%endmacro
+
+%macro LOOP_FILTER_CPUSETS 3 ; %1 = direction (h/v), %2 = wd (4/8/16), %3 = bit depth (10/12); expands LOOP_FILTER once per instruction set
+INIT_XMM sse2
+LOOP_FILTER %1, %2, %3 ; sse2 build
+INIT_XMM ssse3
+LOOP_FILTER %1, %2, %3 ; ssse3 build (takes the cpuflag(ssse3) branches inside LOOP_FILTER)
+INIT_XMM avx
+LOOP_FILTER %1, %2, %3 ; avx build
+%endmacro
+
+%macro LOOP_FILTER_WDSETS 2 ; %1 = direction (h/v), %2 = bit depth; expands LOOP_FILTER_CPUSETS for each of the three wd values
+LOOP_FILTER_CPUSETS %1,  4, %2 ; wd = 4
+LOOP_FILTER_CPUSETS %1,  8, %2 ; wd = 8
+LOOP_FILTER_CPUSETS %1, 16, %2 ; wd = 16
+%endmacro
+
+LOOP_FILTER_WDSETS h, 10 ; cglobal vp9_loop_filter_h_{4,8,16}_10, each for sse2/ssse3/avx
+LOOP_FILTER_WDSETS v, 10 ; cglobal vp9_loop_filter_v_{4,8,16}_10, each for sse2/ssse3/avx
+LOOP_FILTER_WDSETS h, 12 ; cglobal vp9_loop_filter_h_{4,8,16}_12, each for sse2/ssse3/avx
+LOOP_FILTER_WDSETS v, 12 ; cglobal vp9_loop_filter_v_{4,8,16}_12, each for sse2/ssse3/avx
diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm
new file mode 100644
index 0000000..9152ba5
--- /dev/null
+++ b/libavcodec/x86/vp9mc.asm
@@ -0,0 +1,676 @@
+;******************************************************************************
+;* VP9 MC SIMD optimizations
+;*
+;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+cextern pw_256
+cextern pw_64
+
+%macro F8_SSSE3_TAPS 8 ; %1-%8 = the eight taps of one filter; emitted as bytes, adjacent taps interleaved in pairs (4 rows x 32 bytes)
+times 16 db %1, %2 ; byte offset  0: (tap 0, tap 1) repeated 16 times
+times 16 db %3, %4 ; byte offset 32: (tap 2, tap 3) repeated 16 times
+times 16 db %5, %6 ; byte offset 64: (tap 4, tap 5) repeated 16 times
+times 16 db %7, %8 ; byte offset 96: (tap 6, tap 7) repeated 16 times
+%endmacro
+
+%macro F8_SSE2_TAPS 8 ; %1-%8 = the eight taps of one filter; each tap is broadcast to 8 words (8 rows x 16 bytes)
+times 8 dw %1 ; byte offset   0: tap 0
+times 8 dw %2 ; byte offset  16: tap 1
+times 8 dw %3 ; byte offset  32: tap 2
+times 8 dw %4 ; byte offset  48: tap 3
+times 8 dw %5 ; byte offset  64: tap 4
+times 8 dw %6 ; byte offset  80: tap 5
+times 8 dw %7 ; byte offset  96: tap 6
+times 8 dw %8 ; byte offset 112: tap 7
+%endmacro
+
+%macro F8_16BPP_TAPS 8 ; %1-%8 = the eight taps of one filter; emitted as words, adjacent taps interleaved in pairs (4 rows x 32 bytes)
+times 8 dw %1, %2 ; byte offset  0: (tap 0, tap 1) repeated 8 times
+times 8 dw %3, %4 ; byte offset 32: (tap 2, tap 3) repeated 8 times
+times 8 dw %5, %6 ; byte offset 64: (tap 4, tap 5) repeated 8 times
+times 8 dw %7, %8 ; byte offset 96: (tap 6, tap 7) repeated 8 times
+%endmacro
+
+%macro FILTER 1 ; %1 = table name suffix; emits filters_%1 as 3 groups x 15 rows through whatever F8_TAPS is %define'd to at expansion time; every row sums to 128
+const filters_%1 ; smooth
+                    F8_TAPS -3, -1,  32,  64,  38,   1, -3,  0
+                    F8_TAPS -2, -2,  29,  63,  41,   2, -3,  0
+                    F8_TAPS -2, -2,  26,  63,  43,   4, -4,  0
+                    F8_TAPS -2, -3,  24,  62,  46,   5, -4,  0
+                    F8_TAPS -2, -3,  21,  60,  49,   7, -4,  0
+                    F8_TAPS -1, -4,  18,  59,  51,   9, -4,  0
+                    F8_TAPS -1, -4,  16,  57,  53,  12, -4, -1
+                    F8_TAPS -1, -4,  14,  55,  55,  14, -4, -1
+                    F8_TAPS -1, -4,  12,  53,  57,  16, -4, -1
+                    F8_TAPS  0, -4,   9,  51,  59,  18, -4, -1
+                    F8_TAPS  0, -4,   7,  49,  60,  21, -3, -2
+                    F8_TAPS  0, -4,   5,  46,  62,  24, -3, -2
+                    F8_TAPS  0, -4,   4,  43,  63,  26, -2, -2
+                    F8_TAPS  0, -3,   2,  41,  63,  29, -2, -2
+                    F8_TAPS  0, -3,   1,  38,  64,  32, -1, -3
+                    ; regular
+                    F8_TAPS  0,  1,  -5, 126,   8,  -3,  1,  0
+                    F8_TAPS -1,  3, -10, 122,  18,  -6,  2,  0
+                    F8_TAPS -1,  4, -13, 118,  27,  -9,  3, -1
+                    F8_TAPS -1,  4, -16, 112,  37, -11,  4, -1
+                    F8_TAPS -1,  5, -18, 105,  48, -14,  4, -1
+                    F8_TAPS -1,  5, -19,  97,  58, -16,  5, -1
+                    F8_TAPS -1,  6, -19,  88,  68, -18,  5, -1
+                    F8_TAPS -1,  6, -19,  78,  78, -19,  6, -1
+                    F8_TAPS -1,  5, -18,  68,  88, -19,  6, -1
+                    F8_TAPS -1,  5, -16,  58,  97, -19,  5, -1
+                    F8_TAPS -1,  4, -14,  48, 105, -18,  5, -1
+                    F8_TAPS -1,  4, -11,  37, 112, -16,  4, -1
+                    F8_TAPS -1,  3,  -9,  27, 118, -13,  4, -1
+                    F8_TAPS  0,  2,  -6,  18, 122, -10,  3, -1
+                    F8_TAPS  0,  1,  -3,   8, 126,  -5,  1,  0
+                    ; sharp
+                    F8_TAPS -1,  3,  -7, 127,   8,  -3,  1,  0
+                    F8_TAPS -2,  5, -13, 125,  17,  -6,  3, -1
+                    F8_TAPS -3,  7, -17, 121,  27, -10,  5, -2
+                    F8_TAPS -4,  9, -20, 115,  37, -13,  6, -2
+                    F8_TAPS -4, 10, -23, 108,  48, -16,  8, -3
+                    F8_TAPS -4, 10, -24, 100,  59, -19,  9, -3
+                    F8_TAPS -4, 11, -24,  90,  70, -21, 10, -4
+                    F8_TAPS -4, 11, -23,  80,  80, -23, 11, -4
+                    F8_TAPS -4, 10, -21,  70,  90, -24, 11, -4
+                    F8_TAPS -3,  9, -19,  59, 100, -24, 10, -4
+                    F8_TAPS -3,  8, -16,  48, 108, -23, 10, -4
+                    F8_TAPS -2,  6, -13,  37, 115, -20,  9, -4
+                    F8_TAPS -2,  5, -10,  27, 121, -17,  7, -3
+                    F8_TAPS -1,  3,  -6,  17, 125, -13,  5, -2
+                    F8_TAPS  0,  1,  -3,   8, 127,  -7,  3, -1
+%endmacro
+
+%define F8_TAPS F8_SSSE3_TAPS
+; int8_t ff_filters_ssse3[3][15][4][32]: [smooth|regular|sharp][row][tap pair][16 x (tap 2k, tap 2k+1) bytes]
+FILTER ssse3
+%define F8_TAPS F8_SSE2_TAPS
+; int16_t ff_filters_sse2[3][15][8][8]: [smooth|regular|sharp][row][tap][8 copies of that tap as words]
+FILTER sse2
+%define F8_TAPS F8_16BPP_TAPS
+; int16_t ff_filters_16bpp[3][15][4][16]: [smooth|regular|sharp][row][tap pair][8 x (tap 2k, tap 2k+1) words]
+FILTER 16bpp
+
+SECTION .text
+
+%macro filter_sse2_h_fn 1 ; %1 = put|avg; horizontal 8-tap filter on 8-bit pixels using word multiplies (pmullw), one row per loop iteration
+%assign %%px mmsize/2
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 15, dst, dstride, src, sstride, h, filtery
+    pxor        m5, m5                  ; m5 = 0, second operand for the byte->word unpacks
+    mova        m6, [pw_64]             ; rounding term added before the >>7 (presumably words of 64)
+    mova        m7, [filteryq+  0]      ; coefficient vector 0 (applied to src[x-3])
+%if ARCH_X86_64 && mmsize > 8           ; 16 vector registers available: keep coefficient vectors 1-7 in m8-m14
+    mova        m8, [filteryq+ 16]
+    mova        m9, [filteryq+ 32]
+    mova       m10, [filteryq+ 48]
+    mova       m11, [filteryq+ 64]
+    mova       m12, [filteryq+ 80]
+    mova       m13, [filteryq+ 96]
+    mova       m14, [filteryq+112]
+%endif
+.loop:                                  ; one output row per iteration
+    movh        m0, [srcq-3]            ; pixels at x-3
+    movh        m1, [srcq-2]            ; pixels at x-2
+    movh        m2, [srcq-1]            ; pixels at x-1
+    movh        m3, [srcq+0]            ; pixels at x
+    movh        m4, [srcq+1]            ; pixels at x+1
+    punpcklbw   m0, m5                  ; widen bytes to words by interleaving with zero
+    punpcklbw   m1, m5
+    punpcklbw   m2, m5
+    punpcklbw   m3, m5
+    punpcklbw   m4, m5
+    pmullw      m0, m7                  ; p0 = src[x-3] * c0
+%if ARCH_X86_64 && mmsize > 8
+    pmullw      m1, m8                  ; p1 = src[x-2] * c1
+    pmullw      m2, m9                  ; p2 = src[x-1] * c2
+    pmullw      m3, m10                 ; p3 = src[x]   * c3
+    pmullw      m4, m11                 ; p4 = src[x+1] * c4
+%else                                   ; x86-32 or mmsize <= 8: coefficients are read from memory every row
+    pmullw      m1, [filteryq+ 16]
+    pmullw      m2, [filteryq+ 32]
+    pmullw      m3, [filteryq+ 48]
+    pmullw      m4, [filteryq+ 64]
+%endif
+    paddw       m0, m1                  ; m0 = p0+p1
+    paddw       m2, m3                  ; m2 = p2+p3
+    paddw       m0, m4                  ; m0 = p0+p1+p4
+    movh        m1, [srcq+2]            ; pixels at x+2
+    movh        m3, [srcq+3]            ; pixels at x+3
+    movh        m4, [srcq+4]            ; pixels at x+4
+    add       srcq, sstrideq            ; all eight loads done: step to the next source row
+    punpcklbw   m1, m5
+    punpcklbw   m3, m5
+    punpcklbw   m4, m5
+%if ARCH_X86_64 && mmsize > 8
+    pmullw      m1, m12                 ; p5 = src[x+2] * c5
+    pmullw      m3, m13                 ; p6 = src[x+3] * c6
+    pmullw      m4, m14                 ; p7 = src[x+4] * c7
+%else
+    pmullw      m1, [filteryq+ 80]
+    pmullw      m3, [filteryq+ 96]
+    pmullw      m4, [filteryq+112]
+%endif
+    paddw       m0, m1                  ; m0 = p0+p1+p4+p5
+    paddw       m3, m4                  ; m3 = p6+p7
+    paddw       m0, m6                  ; m0 += rounding term
+    paddw       m2, m3                  ; m2 = p2+p3+p6+p7
+    paddsw      m0, m2                  ; only the final add of the two partial sums saturates
+    psraw       m0, 7                   ; arithmetic >> 7
+%ifidn %1, avg
+    movh        m1, [dstq]              ; avg only: fetch the pixels already in dst
+%endif
+    packuswb    m0, m0                  ; words -> bytes with unsigned saturation
+%ifidn %1, avg
+    pavgb       m0, m1                  ; avg only: rounded byte average with the old dst pixels
+%endif
+    movh    [dstq], m0                  ; store the row
+    add       dstq, dstrideq            ; step to the next destination row
+    dec         hd                      ; h counts remaining rows
+    jg .loop
+    RET
+%endmacro
+
+INIT_MMX mmxext ; NOTE(review): mmsize is presumably 8 here, so %%px = 4 (the _4_ names) -- confirm in x86inc.asm
+filter_sse2_h_fn put
+filter_sse2_h_fn avg
+
+INIT_XMM sse2 ; NOTE(review): mmsize is presumably 16 here, so %%px = 8 (the _8_ names) -- confirm in x86inc.asm
+filter_sse2_h_fn put
+filter_sse2_h_fn avg
+
+%macro filter_h_fn 1 ; %1 = put|avg; horizontal 8-tap filter on 8-bit pixels using pmaddubsw on interleaved pixel pairs, one row per loop iteration
+%assign %%px mmsize/2
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 11, dst, dstride, src, sstride, h, filtery
+    mova        m6, [pw_256]            ; pmulhrsw multiplier used for the final rounding shift
+    mova        m7, [filteryq+ 0]       ; coefficient pair (c0, c1)
+%if ARCH_X86_64 && mmsize > 8           ; 16 vector registers available: keep the other three coefficient pairs in m8-m10
+    mova        m8, [filteryq+32]
+    mova        m9, [filteryq+64]
+    mova       m10, [filteryq+96]
+%endif
+.loop:                                  ; one output row per iteration
+    movh        m0, [srcq-3]            ; pixels at x-3
+    movh        m1, [srcq-2]            ; pixels at x-2
+    movh        m2, [srcq-1]            ; pixels at x-1
+    movh        m3, [srcq+0]            ; pixels at x
+    movh        m4, [srcq+1]            ; pixels at x+1
+    movh        m5, [srcq+2]            ; pixels at x+2
+    punpcklbw   m0, m1                  ; m0 = interleaved (src[x-3], src[x-2]) bytes
+    punpcklbw   m2, m3                  ; m2 = interleaved (src[x-1], src[x])
+    movh        m1, [srcq+3]            ; pixels at x+3
+    movh        m3, [srcq+4]            ; pixels at x+4
+    add       srcq, sstrideq            ; all eight loads done: step to the next source row
+    punpcklbw   m4, m5                  ; m4 = interleaved (src[x+1], src[x+2])
+    punpcklbw   m1, m3                  ; m1 = interleaved (src[x+3], src[x+4])
+    pmaddubsw   m0, m7                  ; s01 = src[x-3]*c0 + src[x-2]*c1 (unsigned pixel x signed coefficient, pair summed)
+%if ARCH_X86_64 && mmsize > 8
+    pmaddubsw   m2, m8                  ; s23 = src[x-1]*c2 + src[x]*c3
+    pmaddubsw   m4, m9                  ; s45 = src[x+1]*c4 + src[x+2]*c5
+    pmaddubsw   m1, m10                 ; s67 = src[x+3]*c6 + src[x+4]*c7
+%else                                   ; x86-32 or mmsize <= 8: coefficient pairs are read from memory every row
+    pmaddubsw   m2, [filteryq+32]
+    pmaddubsw   m4, [filteryq+64]
+    pmaddubsw   m1, [filteryq+96]
+%endif
+    paddw       m0, m4                  ; m0 = s01+s45
+    paddw       m2, m1                  ; m2 = s23+s67
+    paddsw      m0, m2                  ; only the final add of the two partial sums saturates
+    pmulhrsw    m0, m6                  ; rounded scale; equals (x+64)>>7 assuming pw_256 holds words of 256
+%ifidn %1, avg
+    movh        m1, [dstq]              ; avg only: fetch the pixels already in dst
+%endif
+    packuswb    m0, m0                  ; words -> bytes with unsigned saturation
+%ifidn %1, avg
+    pavgb       m0, m1                  ; avg only: rounded byte average with the old dst pixels
+%endif
+    movh    [dstq], m0                  ; store the row
+    add       dstq, dstrideq            ; step to the next destination row
+    dec         hd                      ; h counts remaining rows
+    jg .loop
+    RET
+%endmacro
+
+INIT_MMX ssse3 ; NOTE(review): mmsize is presumably 8 here, so %%px = 4 (the _4_ names) -- confirm in x86inc.asm
+filter_h_fn put
+filter_h_fn avg
+
+INIT_XMM ssse3 ; NOTE(review): mmsize is presumably 16 here, so %%px = 8 (the _8_ names) -- confirm in x86inc.asm
+filter_h_fn put
+filter_h_fn avg
+
+%if ARCH_X86_64 ; the macro below uses m8-m13, so it is only assembled for x86-64
+%macro filter_hx2_fn 1 ; %1 = put|avg; horizontal 8-tap filter on 8-bit pixels producing a full register (mmsize pixels) per row via pmaddubsw
+%assign %%px mmsize
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 14, dst, dstride, src, sstride, h, filtery
+    mova       m13, [pw_256]            ; pmulhrsw multiplier used for the final rounding shift
+    mova        m8, [filteryq+ 0]       ; coefficient pair (c0, c1)
+    mova        m9, [filteryq+32]       ; coefficient pair (c2, c3)
+    mova       m10, [filteryq+64]       ; coefficient pair (c4, c5)
+    mova       m11, [filteryq+96]       ; coefficient pair (c6, c7)
+.loop:                                  ; one output row per iteration
+    movu        m0, [srcq-3]            ; unaligned loads of the eight shifted source windows x-3 .. x+4
+    movu        m1, [srcq-2]
+    movu        m2, [srcq-1]
+    movu        m3, [srcq+0]
+    movu        m4, [srcq+1]
+    movu        m5, [srcq+2]
+    movu        m6, [srcq+3]
+    movu        m7, [srcq+4]
+    add       srcq, sstrideq            ; step to the next source row
+    SBUTTERFLY  bw, 0, 1, 12            ; byte-interleave m0/m1, m12 = scratch; NOTE(review): presumably low halves end in m0, high halves in m1 -- see SBUTTERFLY in x86util.asm
+    SBUTTERFLY  bw, 2, 3, 12
+    SBUTTERFLY  bw, 4, 5, 12
+    SBUTTERFLY  bw, 6, 7, 12
+    pmaddubsw   m0, m8                  ; taps 0/1 on both halves
+    pmaddubsw   m1, m8
+    pmaddubsw   m2, m9                  ; taps 2/3
+    pmaddubsw   m3, m9
+    pmaddubsw   m4, m10                 ; taps 4/5
+    pmaddubsw   m5, m10
+    pmaddubsw   m6, m11                 ; taps 6/7
+    pmaddubsw   m7, m11
+    paddw       m0, m4                  ; s01+s45
+    paddw       m1, m5
+    paddw       m2, m6                  ; s23+s67
+    paddw       m3, m7
+    paddsw      m0, m2                  ; only the final add of the two partial sums saturates
+    paddsw      m1, m3
+    pmulhrsw    m0, m13                 ; rounded scale; equals (x+64)>>7 assuming pw_256 holds words of 256
+    pmulhrsw    m1, m13
+    packuswb    m0, m1                  ; recombine both halves into bytes with unsigned saturation
+%ifidn %1, avg
+    pavgb       m0, [dstq]              ; avg only: rounded byte average with the pixels already in dst
+%endif
+    mova    [dstq], m0                  ; store the row
+    add       dstq, dstrideq            ; step to the next destination row
+    dec         hd                      ; h counts remaining rows
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM ssse3
+filter_hx2_fn put
+filter_hx2_fn avg
+
+%if HAVE_AVX2_EXTERNAL ; ymm versions are only assembled when HAVE_AVX2_EXTERNAL is set
+INIT_YMM avx2
+filter_hx2_fn put
+filter_hx2_fn avg
+%endif
+
+%endif ; ARCH_X86_64
+
+%macro filter_sse2_v_fn 1 ; %1 = put|avg; vertical 8-tap filter on 8-bit pixels using word multiplies (pmullw), one output row per loop iteration
+%assign %%px mmsize/2
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 15, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else ; x86-32: only 4 arguments are named as registers; h is used through r4mp and filtery is fetched from r5mp below
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 15, dst, dstride, src, sstride, filtery, src4, sstride3
+    mov   filteryq, r5mp                ; NOTE(review): r5mp is presumably the memory slot of argument 5 (filtery) -- confirm in x86inc.asm
+%define hd r4mp
+%endif
+    pxor        m5, m5                  ; m5 = 0, second operand for the byte->word unpacks
+    mova        m6, [pw_64]             ; rounding term added before the >>7 (presumably words of 64)
+    lea  sstride3q, [sstrideq*3]        ; sstride3 = 3 * sstride
+    lea      src4q, [srcq+sstrideq]     ; src4 -> row y+1
+    sub       srcq, sstride3q           ; src  -> row y-3
+    mova        m7, [filteryq+  0]      ; coefficient vector 0 (applied to row y-3)
+%if ARCH_X86_64 && mmsize > 8           ; 16 vector registers available: keep coefficient vectors 1-7 in m8-m14
+    mova        m8, [filteryq+ 16]
+    mova        m9, [filteryq+ 32]
+    mova       m10, [filteryq+ 48]
+    mova       m11, [filteryq+ 64]
+    mova       m12, [filteryq+ 80]
+    mova       m13, [filteryq+ 96]
+    mova       m14, [filteryq+112]
+%endif
+.loop:
+    ; FIXME maybe reuse loads from previous rows, or just
+    ; more generally unroll this to prevent multiple loads of
+    ; the same data?
+    movh        m0, [srcq]              ; row y-3
+    movh        m1, [srcq+sstrideq]     ; row y-2
+    movh        m2, [srcq+sstrideq*2]   ; row y-1
+    movh        m3, [srcq+sstride3q]    ; row y
+    add       srcq, sstrideq            ; src is no longer read this iteration: step it one row down
+    movh        m4, [src4q]             ; row y+1
+    punpcklbw   m0, m5                  ; widen bytes to words by interleaving with zero
+    punpcklbw   m1, m5
+    punpcklbw   m2, m5
+    punpcklbw   m3, m5
+    punpcklbw   m4, m5
+    pmullw      m0, m7                  ; p0 = row[y-3] * c0
+%if ARCH_X86_64 && mmsize > 8
+    pmullw      m1, m8                  ; p1 = row[y-2] * c1
+    pmullw      m2, m9                  ; p2 = row[y-1] * c2
+    pmullw      m3, m10                 ; p3 = row[y]   * c3
+    pmullw      m4, m11                 ; p4 = row[y+1] * c4
+%else                                   ; x86-32 or mmsize <= 8: coefficients are read from memory every row
+    pmullw      m1, [filteryq+ 16]
+    pmullw      m2, [filteryq+ 32]
+    pmullw      m3, [filteryq+ 48]
+    pmullw      m4, [filteryq+ 64]
+%endif
+    paddw       m0, m1                  ; m0 = p0+p1
+    paddw       m2, m3                  ; m2 = p2+p3
+    paddw       m0, m4                  ; m0 = p0+p1+p4
+    movh        m1, [src4q+sstrideq]    ; row y+2
+    movh        m3, [src4q+sstrideq*2]  ; row y+3
+    movh        m4, [src4q+sstride3q]   ; row y+4
+    add      src4q, sstrideq            ; step src4 one row down
+    punpcklbw   m1, m5
+    punpcklbw   m3, m5
+    punpcklbw   m4, m5
+%if ARCH_X86_64 && mmsize > 8
+    pmullw      m1, m12                 ; p5 = row[y+2] * c5
+    pmullw      m3, m13                 ; p6 = row[y+3] * c6
+    pmullw      m4, m14                 ; p7 = row[y+4] * c7
+%else
+    pmullw      m1, [filteryq+ 80]
+    pmullw      m3, [filteryq+ 96]
+    pmullw      m4, [filteryq+112]
+%endif
+    paddw       m0, m1                  ; m0 = p0+p1+p4+p5
+    paddw       m3, m4                  ; m3 = p6+p7
+    paddw       m0, m6                  ; m0 += rounding term
+    paddw       m2, m3                  ; m2 = p2+p3+p6+p7
+    paddsw      m0, m2                  ; only the final add of the two partial sums saturates
+    psraw       m0, 7                   ; arithmetic >> 7
+%ifidn %1, avg
+    movh        m1, [dstq]              ; avg only: fetch the pixels already in dst
+%endif
+    packuswb    m0, m0                  ; words -> bytes with unsigned saturation
+%ifidn %1, avg
+    pavgb       m0, m1                  ; avg only: rounded byte average with the old dst pixels
+%endif
+    movh    [dstq], m0                  ; store the row
+    add       dstq, dstrideq            ; step to the next destination row
+    dec         hd                      ; h counts remaining rows (decremented in memory on x86-32, see %define above)
+    jg .loop
+    RET
+%endmacro
+
+INIT_MMX mmxext ; NOTE(review): mmsize is presumably 8 here, so %%px = 4 (the _4_ names) -- confirm in x86inc.asm
+filter_sse2_v_fn put
+filter_sse2_v_fn avg
+
+INIT_XMM sse2 ; NOTE(review): mmsize is presumably 16 here, so %%px = 8 (the _8_ names) -- confirm in x86inc.asm
+filter_sse2_v_fn put
+filter_sse2_v_fn avg
+
+%macro filter_v_fn 1 ; %1 = put|avg; vertical 8-tap filter on 8-bit pixels using pmaddubsw on interleaved row pairs, one output row per loop iteration
+%assign %%px mmsize/2
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else ; x86-32: only 4 arguments are named as registers; h is used through r4mp and filtery is fetched from r5mp below
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3
+    mov   filteryq, r5mp                ; NOTE(review): r5mp is presumably the memory slot of argument 5 (filtery) -- confirm in x86inc.asm
+%define hd r4mp
+%endif
+    mova        m6, [pw_256]            ; pmulhrsw multiplier used for the final rounding shift
+    lea  sstride3q, [sstrideq*3]        ; sstride3 = 3 * sstride
+    lea      src4q, [srcq+sstrideq]     ; src4 -> row y+1
+    sub       srcq, sstride3q           ; src  -> row y-3
+    mova        m7, [filteryq+ 0]       ; coefficient pair (c0, c1)
+%if ARCH_X86_64 && mmsize > 8           ; 16 vector registers available: keep the other three coefficient pairs in m8-m10
+    mova        m8, [filteryq+32]
+    mova        m9, [filteryq+64]
+    mova       m10, [filteryq+96]
+%endif
+.loop:
+    ; FIXME maybe reuse loads from previous rows, or just
+    ; more generally unroll this to prevent multiple loads of
+    ; the same data?
+    movh        m0, [srcq]              ; row y-3
+    movh        m1, [srcq+sstrideq]     ; row y-2
+    movh        m2, [srcq+sstrideq*2]   ; row y-1
+    movh        m3, [srcq+sstride3q]    ; row y
+    movh        m4, [src4q]             ; row y+1
+    movh        m5, [src4q+sstrideq]    ; row y+2
+    punpcklbw   m0, m1                  ; m0 = interleaved (row[y-3], row[y-2]) bytes
+    punpcklbw   m2, m3                  ; m2 = interleaved (row[y-1], row[y])
+    movh        m1, [src4q+sstrideq*2]  ; row y+3
+    movh        m3, [src4q+sstride3q]   ; row y+4
+    add       srcq, sstrideq            ; all eight loads done: step both row pointers one row down
+    add      src4q, sstrideq
+    punpcklbw   m4, m5                  ; m4 = interleaved (row[y+1], row[y+2])
+    punpcklbw   m1, m3                  ; m1 = interleaved (row[y+3], row[y+4])
+    pmaddubsw   m0, m7                  ; s01 = row[y-3]*c0 + row[y-2]*c1 (unsigned pixel x signed coefficient, pair summed)
+%if ARCH_X86_64 && mmsize > 8
+    pmaddubsw   m2, m8                  ; s23 = row[y-1]*c2 + row[y]*c3
+    pmaddubsw   m4, m9                  ; s45 = row[y+1]*c4 + row[y+2]*c5
+    pmaddubsw   m1, m10                 ; s67 = row[y+3]*c6 + row[y+4]*c7
+%else                                   ; x86-32 or mmsize <= 8: coefficient pairs are read from memory every row
+    pmaddubsw   m2, [filteryq+32]
+    pmaddubsw   m4, [filteryq+64]
+    pmaddubsw   m1, [filteryq+96]
+%endif
+    paddw       m0, m4                  ; m0 = s01+s45
+    paddw       m2, m1                  ; m2 = s23+s67
+    paddsw      m0, m2                  ; only the final add of the two partial sums saturates
+    pmulhrsw    m0, m6                  ; rounded scale; equals (x+64)>>7 assuming pw_256 holds words of 256
+%ifidn %1, avg
+    movh        m1, [dstq]              ; avg only: fetch the pixels already in dst
+%endif
+    packuswb    m0, m0                  ; words -> bytes with unsigned saturation
+%ifidn %1, avg
+    pavgb       m0, m1                  ; avg only: rounded byte average with the old dst pixels
+%endif
+    movh    [dstq], m0                  ; store the row
+    add       dstq, dstrideq            ; step to the next destination row
+    dec         hd                      ; h counts remaining rows (decremented in memory on x86-32, see %define above)
+    jg .loop
+    RET
+%endmacro
+
+INIT_MMX ssse3 ; mm registers: function name gets %%px = mmsize/2
+filter_v_fn put
+filter_v_fn avg
+
+INIT_XMM ssse3 ; xmm registers: twice the pixels per row of the MMX instance
+filter_v_fn put
+filter_v_fn avg
+
+%if ARCH_X86_64
+
+%macro filter_vx2_fn 1 ; 8-bit vertical 8-tap filter on full-register rows; %1 = put or avg
+%assign %%px mmsize ; a whole register of bytes per row (two word vectors are filtered per row)
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3
+    mova       m13, [pw_256] ; pmulhrsw multiplier; assuming words of 256 this computes (x + 64) >> 7
+    lea  sstride3q, [sstrideq*3]
+    lea      src4q, [srcq+sstrideq] ; src4 -> row +1 (used for rows +1..+4)
+    sub       srcq, sstride3q ; src -> row -3 (used for rows -3..0)
+    mova        m8, [filteryq+ 0] ; coefficients applied to rows -3/-2
+    mova        m9, [filteryq+32] ; coefficients applied to rows -1/0
+    mova       m10, [filteryq+64] ; coefficients applied to rows +1/+2
+    mova       m11, [filteryq+96] ; coefficients applied to rows +3/+4
+.loop:
+    ; FIXME maybe reuse loads from previous rows, or just
+    ; more generally unroll this to prevent multiple loads of
+    ; the same data?
+    movu        m0, [srcq] ; row -3
+    movu        m1, [srcq+sstrideq] ; row -2
+    movu        m2, [srcq+sstrideq*2] ; row -1
+    movu        m3, [srcq+sstride3q] ; row 0
+    movu        m4, [src4q] ; row +1
+    movu        m5, [src4q+sstrideq] ; row +2
+    movu        m6, [src4q+sstrideq*2] ; row +3
+    movu        m7, [src4q+sstride3q] ; row +4
+    add       srcq, sstrideq ; both source pointers advance one row per iteration
+    add      src4q, sstrideq
+    SBUTTERFLY  bw, 0, 1, 12 ; byte-interleave each row pair (x86util.asm macro; m12 is its scratch)
+    SBUTTERFLY  bw, 2, 3, 12
+    SBUTTERFLY  bw, 4, 5, 12
+    SBUTTERFLY  bw, 6, 7, 12
+    pmaddubsw   m0, m8 ; both interleaved halves of a row pair use the same coefficients
+    pmaddubsw   m1, m8
+    pmaddubsw   m2, m9
+    pmaddubsw   m3, m9
+    pmaddubsw   m4, m10
+    pmaddubsw   m5, m10
+    pmaddubsw   m6, m11
+    pmaddubsw   m7, m11
+    paddw       m0, m4
+    paddw       m1, m5
+    paddw       m2, m6
+    paddw       m3, m7
+    paddsw      m0, m2 ; only the final adds are saturating
+    paddsw      m1, m3
+    pmulhrsw    m0, m13 ; round and scale the 8-tap sums
+    pmulhrsw    m1, m13
+    packuswb    m0, m1 ; recombine both halves into one register of bytes
+%ifidn %1, avg
+    pavgb       m0, [dstq] ; rounded average with the destination
+%endif
+    mova    [dstq], m0 ; aligned store: dst is required to be register-aligned here
+    add       dstq, dstrideq
+    dec         hd ; one output row per iteration
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM ssse3 ; xmm instance of the full-register vertical filter
+filter_vx2_fn put
+filter_vx2_fn avg
+
+%if HAVE_AVX2_EXTERNAL ; ymm instance only when the assembler supports AVX2
+INIT_YMM avx2
+filter_vx2_fn put
+filter_vx2_fn avg
+%endif
+
+%endif ; ARCH_X86_64
+
+%macro fpel_fn 6-8 0, 4 ; %1=put/avg %2=size in name %3-%5=offsets of accesses 2..4 %6=rows per iteration %7=8/16 picks pavgb/pavgw (default 0) %8=xmm count (default 4)
+%if %2 == 4 ; size 4 uses half-register moves for both load and store
+%define %%srcfn movh
+%define %%dstfn movh
+%else ; otherwise unaligned loads, aligned stores
+%define %%srcfn movu
+%define %%dstfn mova
+%endif
+
+%if %7 == 8 ; averaging element size also selects the function-name suffix
+%define %%pavg pavgb
+%define %%szsuf _8
+%elif %7 == 16
+%define %%pavg pavgw
+%define %%szsuf _16
+%else
+%define %%szsuf
+%endif
+
+%if %2 <= mmsize ; this form also sets up 3*stride for both pointers (the callers pass stride-based offsets)
+cglobal vp9_%1%2 %+ %%szsuf, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
+    lea  sstride3q, [sstrideq*3]
+    lea  dstride3q, [dstrideq*3]
+%else ; wider than one register: no 3*stride registers are set up
+cglobal vp9_%1%2 %+ %%szsuf, 5, 5, %8, dst, dstride, src, sstride, h
+%endif
+.loop:
+    %%srcfn     m0, [srcq]
+    %%srcfn     m1, [srcq+s%3] ; "s" is pasted onto the offset argument (sstrideq, sstride3q, s16, ...)
+    %%srcfn     m2, [srcq+s%4]
+    %%srcfn     m3, [srcq+s%5]
+%if %2/mmsize == 8 ; eight registers per iteration: handle registers 4..7 as well
+    %%srcfn     m4, [srcq+mmsize*4]
+    %%srcfn     m5, [srcq+mmsize*5]
+    %%srcfn     m6, [srcq+mmsize*6]
+    %%srcfn     m7, [srcq+mmsize*7]
+%endif
+    lea       srcq, [srcq+sstrideq*%6] ; advance by the number of rows consumed
+%ifidn %1, avg
+    %%pavg      m0, [dstq] ; rounded average with the destination
+    %%pavg      m1, [dstq+d%3] ; "d" is pasted onto the same offset arguments
+    %%pavg      m2, [dstq+d%4]
+    %%pavg      m3, [dstq+d%5]
+%if %2/mmsize == 8
+    %%pavg      m4, [dstq+mmsize*4]
+    %%pavg      m5, [dstq+mmsize*5]
+    %%pavg      m6, [dstq+mmsize*6]
+    %%pavg      m7, [dstq+mmsize*7]
+%endif
+%endif
+    %%dstfn [dstq], m0
+    %%dstfn [dstq+d%3], m1
+    %%dstfn [dstq+d%4], m2
+    %%dstfn [dstq+d%5], m3
+%if %2/mmsize == 8
+    %%dstfn [dstq+mmsize*4], m4
+    %%dstfn [dstq+mmsize*5], m5
+    %%dstfn [dstq+mmsize*6], m6
+    %%dstfn [dstq+mmsize*7], m7
+%endif
+    lea       dstq, [dstq+dstrideq*%6]
+    sub         hd, %6 ; jnz (not jg): h is expected to be a multiple of %6
+    jnz .loop
+    RET
+%endmacro
+
+%define d16 16 ; d16/s16/d32/s32: presumably let the d%N/s%N pasting in fpel_fn yield a plain number when the offset argument is mmsize
+%define s16 16
+%define d32 32
+%define s32 32
+INIT_MMX mmx
+fpel_fn put, 4,  strideq, strideq*2, stride3q, 4 ; stride offsets, 4 rows per iteration
+fpel_fn put, 8,  strideq, strideq*2, stride3q, 4
+INIT_MMX mmxext
+fpel_fn avg, 4,  strideq, strideq*2, stride3q, 4, 8 ; trailing 8: pavgb, names end in _8
+fpel_fn avg, 8,  strideq, strideq*2, stride3q, 4, 8
+INIT_XMM sse
+fpel_fn put, 16, strideq, strideq*2, stride3q, 4
+fpel_fn put, 32, mmsize,  strideq,   strideq+mmsize, 2 ; offsets 0, mmsize, stride, stride+mmsize; 2 rows per iteration
+fpel_fn put, 64, mmsize,  mmsize*2,  mmsize*3, 1 ; offsets 0..3*mmsize, 1 row per iteration
+fpel_fn put, 128, mmsize, mmsize*2,  mmsize*3, 1, 0, 8 ; 8 xmm registers declared for the 8-register path
+INIT_XMM sse2
+fpel_fn avg, 16, strideq, strideq*2, stride3q, 4, 8
+fpel_fn avg, 32, mmsize,  strideq,   strideq+mmsize, 2, 8
+fpel_fn avg, 64, mmsize,  mmsize*2,  mmsize*3, 1, 8
+INIT_YMM avx
+fpel_fn put, 32, strideq, strideq*2, stride3q, 4
+fpel_fn put, 64, mmsize,  strideq,   strideq+mmsize, 2
+fpel_fn put, 128, mmsize, mmsize*2,     mmsize*3, 1
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+fpel_fn avg, 32, strideq, strideq*2, stride3q, 4, 8
+fpel_fn avg, 64, mmsize,  strideq,   strideq+mmsize, 2, 8
+%endif
+INIT_MMX mmxext
+fpel_fn avg,  8,  strideq, strideq*2, stride3q, 4, 16 ; trailing 16: pavgw, names end in _16
+INIT_XMM sse2
+fpel_fn avg,  16, strideq, strideq*2, stride3q, 4, 16
+fpel_fn avg,  32, mmsize,  strideq,   strideq+mmsize, 2, 16
+fpel_fn avg,  64, mmsize,  mmsize*2,  mmsize*3, 1, 16
+fpel_fn avg, 128, mmsize,  mmsize*2,  mmsize*3, 1, 16, 8
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+fpel_fn avg,  32, strideq, strideq*2, stride3q, 4, 16
+fpel_fn avg,  64, mmsize,  strideq,   strideq+mmsize, 2, 16
+fpel_fn avg, 128, mmsize,  mmsize*2,  mmsize*3, 1, 16
+%endif
+%undef s16 ; helper defines are only meant for the fpel_fn instantiations above
+%undef d16
+%undef s32
+%undef d32
diff --git a/libavcodec/x86/vp9mc_16bpp.asm b/libavcodec/x86/vp9mc_16bpp.asm
new file mode 100644
index 0000000..9a462ea
--- /dev/null
+++ b/libavcodec/x86/vp9mc_16bpp.asm
@@ -0,0 +1,431 @@
+;******************************************************************************
+;* VP9 MC SIMD optimizations
+;*
+;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pd_64: times 8 dd 64 ; rounding bias added before the psrad-by-7 in the filters below (64 = 1 << 6)
+
+cextern pw_1023 ; loaded into m5 by the _10 entry points as the pminsw ceiling
+cextern pw_4095 ; loaded into m5 by the _12 entry points as the pminsw ceiling
+
+SECTION .text
+
+%macro filter_h4_fn 1-2 12 ; 16-bit-sample horizontal 8-tap filter, 4 pixels per row; %1 = put/avg, %2 = xmm count (default 12)
+cglobal vp9_%1_8tap_1d_h_4_10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
+    mova        m5, [pw_1023] ; clamp ceiling for this entry point
+.body: ; shared by the _12 entry point below, which jumps here with its own m5
+%if notcpuflag(sse4) && ARCH_X86_64
+    pxor       m11, m11 ; zero for the pmaxsw clamp needed without packusdw
+%endif
+    mova        m6, [pd_64] ; rounding bias for the >> 7
+    mova        m7, [filteryq+ 0] ; coefficients for pixels -3/-2
+%if ARCH_X86_64 && mmsize > 8
+    mova        m8, [filteryq+32] ; coefficients for pixels -1/0
+    mova        m9, [filteryq+64] ; coefficients for pixels +1/+2
+    mova       m10, [filteryq+96] ; coefficients for pixels +3/+4
+%endif
+.loop:
+    movh        m0, [srcq-6] ; byte offsets are 2 per pixel: this is pixel -3
+    movh        m1, [srcq-4] ; pixel -2
+    movh        m2, [srcq-2] ; pixel -1
+    movh        m3, [srcq+0] ; pixel 0
+    movh        m4, [srcq+2] ; pixel +1
+    punpcklwd   m0, m1 ; interleave words so pmaddwd sees neighbouring-pixel pairs
+    punpcklwd   m2, m3
+    pmaddwd     m0, m7
+%if ARCH_X86_64 && mmsize > 8
+    pmaddwd     m2, m8
+%else
+    pmaddwd     m2, [filteryq+32]
+%endif
+    movu        m1, [srcq+4] ; pixel +2 (full-register load; only the low half is used)
+    movu        m3, [srcq+6] ; pixel +3
+    paddd       m0, m2
+    movu        m2, [srcq+8] ; pixel +4
+    add       srcq, sstrideq
+    punpcklwd   m4, m1
+    punpcklwd   m3, m2
+%if ARCH_X86_64 && mmsize > 8
+    pmaddwd     m4, m9
+    pmaddwd     m3, m10
+%else
+    pmaddwd     m4, [filteryq+64]
+    pmaddwd     m3, [filteryq+96]
+%endif
+    paddd       m0, m4
+    paddd       m0, m3
+    paddd       m0, m6 ; + 64
+    psrad       m0, 7
+%if cpuflag(sse4)
+    packusdw    m0, m0 ; unsigned saturation already clamps negatives to 0
+%else
+    packssdw    m0, m0 ; signed saturation: negatives are clamped by pmaxsw below
+%endif
+%ifidn %1, avg
+    movh        m1, [dstq] ; existing destination pixels
+%endif
+    pminsw      m0, m5 ; clamp to the bit-depth maximum
+%if notcpuflag(sse4)
+%if ARCH_X86_64
+    pmaxsw      m0, m11
+%else ; x86-32 has no spare register: build the zero on the fly
+    pxor        m2, m2
+    pmaxsw      m0, m2
+%endif
+%endif
+%ifidn %1, avg
+    pavgw       m0, m1 ; rounded average with the destination
+%endif
+    movh    [dstq], m0
+    add       dstq, dstrideq
+    dec         hd
+    jg .loop
+    RET
+
+cglobal vp9_%1_8tap_1d_h_4_12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
+    mova        m5, [pw_4095] ; only the clamp ceiling differs from the _10 function
+    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_4_10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2 ; each instantiation emits both the _10 and the _12 entry point
+filter_h4_fn put
+filter_h4_fn avg
+
+%macro filter_h_fn 1-2 12 ; 16-bit-sample horizontal 8-tap filter, full register per row; %1 = put/avg, %2 = xmm count (default 12)
+%assign %%px mmsize/2 ; 16-bit pixels per register
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
+    mova        m5, [pw_1023] ; clamp ceiling for this entry point
+.body: ; shared by the _12 entry point below, which jumps here with its own m5
+%if notcpuflag(sse4) && ARCH_X86_64
+    pxor       m11, m11 ; zero for the pmaxsw clamp needed without packusdw
+%endif
+    mova        m6, [pd_64] ; rounding bias for the >> 7
+    mova        m7, [filteryq+ 0]
+%if ARCH_X86_64 && mmsize > 8
+    mova        m8, [filteryq+32]
+    mova        m9, [filteryq+64]
+    mova       m10, [filteryq+96]
+%endif
+.loop:
+    movu        m0, [srcq-6] ; even-offset loads (-6,-2,+2,+6) accumulate into m0
+    movu        m1, [srcq-4] ; loads shifted by one pixel (-4,0,+4,+8) accumulate into m1
+    movu        m2, [srcq-2]
+    movu        m3, [srcq+0]
+    movu        m4, [srcq+2]
+    pmaddwd     m0, m7 ; each dword = two taps on two neighbouring pixels
+    pmaddwd     m1, m7
+%if ARCH_X86_64 && mmsize > 8
+    pmaddwd     m2, m8
+    pmaddwd     m3, m8
+    pmaddwd     m4, m9
+%else
+    pmaddwd     m2, [filteryq+32]
+    pmaddwd     m3, [filteryq+32]
+    pmaddwd     m4, [filteryq+64]
+%endif
+    paddd       m0, m2
+    paddd       m1, m3
+    paddd       m0, m4
+    movu        m2, [srcq+4]
+    movu        m3, [srcq+6]
+    movu        m4, [srcq+8]
+    add       srcq, sstrideq
+%if ARCH_X86_64 && mmsize > 8
+    pmaddwd     m2, m9
+    pmaddwd     m3, m10
+    pmaddwd     m4, m10
+%else
+    pmaddwd     m2, [filteryq+64]
+    pmaddwd     m3, [filteryq+96]
+    pmaddwd     m4, [filteryq+96]
+%endif
+    paddd       m1, m2
+    paddd       m0, m3
+    paddd       m1, m4
+    paddd       m0, m6 ; + 64
+    paddd       m1, m6
+    psrad       m0, 7
+    psrad       m1, 7
+%if cpuflag(sse4)
+    packusdw    m0, m0 ; unsigned saturation already clamps negatives to 0
+    packusdw    m1, m1
+%else
+    packssdw    m0, m0 ; signed saturation: negatives are clamped by pmaxsw below
+    packssdw    m1, m1
+%endif
+    punpcklwd   m0, m1 ; re-interleave the two accumulators into pixel order
+    pminsw      m0, m5 ; clamp to the bit-depth maximum
+%if notcpuflag(sse4)
+%if ARCH_X86_64
+    pmaxsw      m0, m11
+%else ; x86-32 has no spare register: build the zero on the fly
+    pxor        m2, m2
+    pmaxsw      m0, m2
+%endif
+%endif
+%ifidn %1, avg
+    pavgw       m0, [dstq] ; rounded average with the destination
+%endif
+    mova    [dstq], m0
+    add       dstq, dstrideq
+    dec         hd
+    jg .loop
+    RET
+
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
+    mova        m5, [pw_4095] ; only the clamp ceiling differs from the _10 function
+    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_ %+ %%px %+ _10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2 ; each instantiation emits both the _10 and the _12 entry point
+filter_h_fn put
+filter_h_fn avg
+%if HAVE_AVX2_EXTERNAL ; ymm instances only when the assembler supports AVX2
+INIT_YMM avx2
+filter_h_fn put
+filter_h_fn avg
+%endif
+
+%macro filter_v4_fn 1-2 12 ; 16-bit-sample vertical 8-tap filter, 4 pixels per row; %1 = put/avg, %2 = xmm count (default 12)
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_4_10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else ; x86-32: only 4 args are loaded by cglobal; filtery is fetched by hand, h is used from r4mp
+cglobal vp9_%1_8tap_1d_v_4_10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
+    mov   filteryq, r5mp
+%define hd r4mp
+%endif
+    mova        m5, [pw_1023] ; clamp ceiling for this entry point
+.body: ; shared by the _12 entry point below, which jumps here with its own m5
+%if notcpuflag(sse4) && ARCH_X86_64
+    pxor       m11, m11 ; zero for the pmaxsw clamp needed without packusdw
+%endif
+    mova        m6, [pd_64] ; rounding bias for the >> 7
+    lea  sstride3q, [sstrideq*3]
+    lea      src4q, [srcq+sstrideq] ; src4 -> row +1 (used for rows +1..+4)
+    sub       srcq, sstride3q ; src -> row -3 (used for rows -3..0)
+    mova        m7, [filteryq+  0] ; coefficients for rows -3/-2
+%if ARCH_X86_64 && mmsize > 8
+    mova        m8, [filteryq+ 32] ; coefficients for rows -1/0
+    mova        m9, [filteryq+ 64] ; coefficients for rows +1/+2
+    mova       m10, [filteryq+ 96] ; coefficients for rows +3/+4
+%endif
+.loop:
+    ; FIXME maybe reuse loads from previous rows, or just
+    ; more generally unroll this to prevent multiple loads of
+    ; the same data?
+    movh        m0, [srcq] ; row -3
+    movh        m1, [srcq+sstrideq] ; row -2
+    movh        m2, [srcq+sstrideq*2] ; row -1
+    movh        m3, [srcq+sstride3q] ; row 0
+    add       srcq, sstrideq
+    movh        m4, [src4q] ; row +1
+    punpcklwd   m0, m1 ; interleave words so pmaddwd sees vertical pairs
+    punpcklwd   m2, m3
+    pmaddwd     m0, m7
+%if ARCH_X86_64 && mmsize > 8
+    pmaddwd     m2, m8
+%else
+    pmaddwd     m2, [filteryq+ 32]
+%endif
+    movh        m1, [src4q+sstrideq] ; row +2
+    movh        m3, [src4q+sstrideq*2] ; row +3
+    paddd       m0, m2
+    movh        m2, [src4q+sstride3q] ; row +4
+    add      src4q, sstrideq
+    punpcklwd   m4, m1
+    punpcklwd   m3, m2
+%if ARCH_X86_64 && mmsize > 8
+    pmaddwd     m4, m9
+    pmaddwd     m3, m10
+%else
+    pmaddwd     m4, [filteryq+ 64]
+    pmaddwd     m3, [filteryq+ 96]
+%endif
+    paddd       m0, m4
+    paddd       m0, m3
+    paddd       m0, m6 ; + 64
+    psrad       m0, 7
+%if cpuflag(sse4)
+    packusdw    m0, m0 ; unsigned saturation already clamps negatives to 0
+%else
+    packssdw    m0, m0 ; signed saturation: negatives are clamped by pmaxsw below
+%endif
+%ifidn %1, avg
+    movh        m1, [dstq] ; existing destination pixels
+%endif
+    pminsw      m0, m5 ; clamp to the bit-depth maximum
+%if notcpuflag(sse4)
+%if ARCH_X86_64
+    pmaxsw      m0, m11
+%else ; x86-32 has no spare register: build the zero on the fly
+    pxor        m2, m2
+    pmaxsw      m0, m2
+%endif
+%endif
+%ifidn %1, avg
+    pavgw       m0, m1 ; rounded average with the destination
+%endif
+    movh    [dstq], m0
+    add       dstq, dstrideq
+    dec         hd
+    jg .loop
+    RET
+
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_4_12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else ; same manual filtery load as the _10 prologue, since the jump skips it
+cglobal vp9_%1_8tap_1d_v_4_12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
+    mov   filteryq, r5mp
+%endif
+    mova        m5, [pw_4095] ; only the clamp ceiling differs from the _10 function
+    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_4_10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2 ; each instantiation emits both the _10 and the _12 entry point
+filter_v4_fn put
+filter_v4_fn avg
+
+%macro filter_v_fn 1-2 13 ; 16-bit-sample vertical 8-tap filter, full register per row; %1 = put/avg, %2 = xmm count (default 13)
+%assign %%px mmsize/2 ; 16-bit pixels per register
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else ; x86-32: only 4 args are loaded by cglobal; filtery is fetched by hand, h is used from r4mp
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
+    mov   filteryq, r5mp
+%define hd r4mp
+%endif
+    mova        m5, [pw_1023] ; clamp ceiling for this entry point
+.body: ; shared by the _12 entry point below, which jumps here with its own m5
+%if notcpuflag(sse4) && ARCH_X86_64
+    pxor       m12, m12 ; zero for the pmaxsw clamp needed without packusdw
+%endif
+%if ARCH_X86_64 ; m6 is the SBUTTERFLY scratch here, so the rounding bias lives in m11
+    mova       m11, [pd_64]
+%endif
+    lea  sstride3q, [sstrideq*3]
+    lea      src4q, [srcq+sstrideq] ; src4 -> row +1 (used for rows +1..+4)
+    sub       srcq, sstride3q ; src -> row -3 (used for rows -3..0)
+    mova        m7, [filteryq+  0] ; coefficients for rows -3/-2
+%if ARCH_X86_64 && mmsize > 8
+    mova        m8, [filteryq+ 32] ; coefficients for rows -1/0
+    mova        m9, [filteryq+ 64] ; coefficients for rows +1/+2
+    mova       m10, [filteryq+ 96] ; coefficients for rows +3/+4
+%endif
+.loop:
+    ; FIXME maybe reuse loads from previous rows, or just
+    ; more generally unroll this to prevent multiple loads of
+    ; the same data?
+    movu        m0, [srcq] ; row -3
+    movu        m1, [srcq+sstrideq] ; row -2
+    movu        m2, [srcq+sstrideq*2] ; row -1
+    movu        m3, [srcq+sstride3q] ; row 0
+    add       srcq, sstrideq
+    movu        m4, [src4q] ; row +1
+    SBUTTERFLY  wd, 0, 1, 6 ; word-interleave rows -3/-2 (x86util.asm macro; m6 is its scratch)
+    SBUTTERFLY  wd, 2, 3, 6 ; rows -1/0
+    pmaddwd     m0, m7 ; both interleaved halves of a row pair use the same coefficients
+    pmaddwd     m1, m7
+%if ARCH_X86_64 && mmsize > 8
+    pmaddwd     m2, m8
+    pmaddwd     m3, m8
+%else
+    pmaddwd     m2, [filteryq+ 32]
+    pmaddwd     m3, [filteryq+ 32]
+%endif
+    paddd       m0, m2
+    paddd       m1, m3
+    movu        m2, [src4q+sstrideq] ; row +2
+    movu        m3, [src4q+sstrideq*2] ; row +3
+    SBUTTERFLY  wd, 4, 2, 6 ; rows +1/+2
+%if ARCH_X86_64 && mmsize > 8
+    pmaddwd     m4, m9
+    pmaddwd     m2, m9
+%else
+    pmaddwd     m4, [filteryq+ 64]
+    pmaddwd     m2, [filteryq+ 64]
+%endif
+    paddd       m0, m4
+    paddd       m1, m2
+    movu        m4, [src4q+sstride3q] ; row +4
+    add      src4q, sstrideq
+    SBUTTERFLY  wd, 3, 4, 6 ; rows +3/+4
+%if ARCH_X86_64 && mmsize > 8
+    pmaddwd     m3, m10
+    pmaddwd     m4, m10
+%else
+    pmaddwd     m3, [filteryq+ 96]
+    pmaddwd     m4, [filteryq+ 96]
+%endif
+    paddd       m0, m3
+    paddd       m1, m4
+%if ARCH_X86_64
+    paddd       m0, m11 ; + 64
+    paddd       m1, m11
+%else ; x86-32: the bias is added straight from memory
+    paddd       m0, [pd_64]
+    paddd       m1, [pd_64]
+%endif
+    psrad       m0, 7
+    psrad       m1, 7
+%if cpuflag(sse4)
+    packusdw    m0, m1 ; unsigned saturation already clamps negatives to 0
+%else
+    packssdw    m0, m1 ; signed saturation: negatives are clamped by pmaxsw below
+%endif
+    pminsw      m0, m5 ; clamp to the bit-depth maximum
+%if notcpuflag(sse4)
+%if ARCH_X86_64
+    pmaxsw      m0, m12
+%else ; x86-32 has no spare register: build the zero on the fly
+    pxor        m2, m2
+    pmaxsw      m0, m2
+%endif
+%endif
+%ifidn %1, avg
+    pavgw       m0, [dstq] ; rounded average with the destination
+%endif
+    mova    [dstq], m0
+    add       dstq, dstrideq
+    dec         hd
+    jg .loop
+    RET
+
+%if ARCH_X86_64
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
+%else ; same manual filtery load as the _10 prologue, since the jump skips it
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
+    mov   filteryq, r5mp
+%endif
+    mova        m5, [pw_4095] ; only the clamp ceiling differs from the _10 function
+    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_ %+ %%px %+ _10 %+ SUFFIX).body
+%endmacro
+
+INIT_XMM sse2 ; each instantiation emits both the _10 and the _12 entry point
+filter_v_fn put
+filter_v_fn avg
+%if HAVE_AVX2_EXTERNAL ; ymm instances only when the assembler supports AVX2
+INIT_YMM avx2
+filter_v_fn put
+filter_v_fn avg
+%endif
diff --git a/libavcodec/x86/w64xmmtest.c b/libavcodec/x86/w64xmmtest.c
index 2f064ca..94b3049 100644
--- a/libavcodec/x86/w64xmmtest.c
+++ b/libavcodec/x86/w64xmmtest.c
@@ -2,20 +2,20 @@
  * check XMM registers for clobbers on Win64
  * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/xvididct.asm b/libavcodec/x86/xvididct.asm
new file mode 100644
index 0000000..0220885
--- /dev/null
+++ b/libavcodec/x86/xvididct.asm
@@ -0,0 +1,983 @@
+; XVID MPEG-4 VIDEO CODEC
+;
+; Conversion from gcc syntax to x264asm syntax with modifications
+; by Christophe Gisquet <christophe.gisquet@gmail.com>
+;
+; ===========     SSE2 inverse discrete cosine transform     ===========
+;
+; Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
+;
+; Conversion to gcc syntax with modifications
+; by Alexander Strange <astrange@ithinksw.com>
+;
+; Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
+;
+; Vertical pass is an implementation of the scheme:
+;  Loeffler C., Ligtenberg A., and Moschytz C.S.:
+;  Practical Fast 1D DCT Algorithm with Eleven Multiplications,
+;  Proc. ICASSP 1989, 988-991.
+;
+; Horizontal pass is a double 4x4 vector/matrix multiplication,
+; (see also Intel's Application Note 922:
+;  http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
+;  Copyright (C) 1999 Intel Corporation)
+;
+; More details at http://skal.planet-d.net/coding/dct.html
+;
+; =======     MMX and XMM forward discrete cosine transform     =======
+;
+; Copyright(C) 2001 Peter Ross <pross@xvid.org>
+;
+; Originally provided by Intel at AP-922
+; http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
+; (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm)
+; but in a limited edition.
+; New macro implements a column part for precise iDCT
+; The routine precision now satisfies IEEE standard 1180-1990.
+;
+; Copyright(C) 2000-2001 Peter Gubanov <peter@elecard.net.ru>
+; Rounding trick Copyright(C) 2000 Michel Lespinasse <walken@zoy.org>
+;
+; http://www.elecard.com/peter/idct.html
+; http://www.linuxvideo.org/mpeg2dec/
+;
+; These examples contain code fragments for first stage iDCT 8x8
+; (for rows) and first stage DCT 8x8 (for columns)
+;
+; conversion to gcc syntax by Michael Niedermayer
+;
+; ======================================================================
+;
+; This file is part of FFmpeg.
+;
+; FFmpeg is free software; you can redistribute it and/or
+; modify it under the terms of the GNU Lesser General Public
+; License as published by the Free Software Foundation; either
+; version 2.1 of the License, or (at your option) any later version.
+;
+; FFmpeg is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; Lesser General Public License for more details.
+;
+; You should have received a copy of the GNU Lesser General Public License
+; along with FFmpeg; if not, write to the Free Software Foundation,
+; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+; Similar to tg_1_16 in MMX code
+tan1:   times 8 dw 13036
+tan2:   times 8 dw 27146
+tan3:   times 8 dw 43790
+sqrt2:  times 8 dw 23170
+
+; SSE2 tables
+iTab1:  dw 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d
+        dw 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61
+        dw 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7
+        dw 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
+iTab2:  dw 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5
+        dw 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04
+        dw 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41
+        dw 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
+iTab3:  dw 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf
+        dw 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf
+        dw 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d
+        dw 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
+iTab4:  dw 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746
+        dw 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac
+        dw 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df
+        dw 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
+
+%if ARCH_X86_32
+; -----------------------------------------------------------------------------
+;
+; The first stage iDCT 8x8 - inverse DCTs of rows
+;
+; -----------------------------------------------------------------------------
+; The 8-point inverse DCT direct algorithm
+; -----------------------------------------------------------------------------
+;
+; static const short w[32] = {
+;     FIX(cos_4_16),  FIX(cos_2_16),  FIX(cos_4_16),  FIX(cos_6_16),
+;     FIX(cos_4_16),  FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16),
+;     FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16),  FIX(cos_2_16),
+;     FIX(cos_4_16), -FIX(cos_2_16),  FIX(cos_4_16), -FIX(cos_6_16),
+;     FIX(cos_1_16),  FIX(cos_3_16),  FIX(cos_5_16),  FIX(cos_7_16),
+;     FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16),
+;     FIX(cos_5_16), -FIX(cos_1_16),  FIX(cos_7_16),  FIX(cos_3_16),
+;     FIX(cos_7_16), -FIX(cos_5_16),  FIX(cos_3_16), -FIX(cos_1_16) };
+;
+; #define DCT_8_INV_ROW(x, y)
+; {
+;     int a0, a1, a2, a3, b0, b1, b2, b3;
+;
+;     a0 = x[0] * w[0]  + x[2] * w[1]  + x[4] * w[2]  + x[6] * w[3];
+;     a1 = x[0] * w[4]  + x[2] * w[5]  + x[4] * w[6]  + x[6] * w[7];
+;     a2 = x[0] * w[8]  + x[2] * w[9]  + x[4] * w[10] + x[6] * w[11];
+;     a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15];
+;     b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19];
+;     b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23];
+;     b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27];
+;     b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31];
+;
+;     y[0] = SHIFT_ROUND(a0 + b0);
+;     y[1] = SHIFT_ROUND(a1 + b1);
+;     y[2] = SHIFT_ROUND(a2 + b2);
+;     y[3] = SHIFT_ROUND(a3 + b3);
+;     y[4] = SHIFT_ROUND(a3 - b3);
+;     y[5] = SHIFT_ROUND(a2 - b2);
+;     y[6] = SHIFT_ROUND(a1 - b1);
+;     y[7] = SHIFT_ROUND(a0 - b0);
+; }
+;
+; -----------------------------------------------------------------------------
+;
+; In this implementation the outputs of the iDCT-1D are multiplied
+;     for rows 0,4 - by cos_4_16,
+;     for rows 1,7 - by cos_1_16,
+;     for rows 2,6 - by cos_2_16,
+;     for rows 3,5 - by cos_3_16
+; and are shifted to the left for better accuracy.
+;
+; For the constants used,
+;     FIX(float_const) = (short) (float_const * (1 << 15) + 0.5)
+;
+; -----------------------------------------------------------------------------
+
+; -----------------------------------------------------------------------------
+; Tables for mmx processors
+; -----------------------------------------------------------------------------
+
+; Table for rows 0,4 - constants are multiplied by cos_4_16
+tab_i_04_mmx: dw  16384,  16384,  16384, -16384 ; movq-> w06 w04 w02 w00
+              dw  21407,   8867,   8867, -21407 ; w07 w05 w03 w01
+              dw  16384, -16384,  16384,  16384 ; w14 w12 w10 w08
+              dw  -8867,  21407, -21407,  -8867 ; w15 w13 w11 w09
+              dw  22725,  12873,  19266, -22725 ; w22 w20 w18 w16
+              dw  19266,   4520,  -4520, -12873 ; w23 w21 w19 w17
+              dw  12873,   4520,   4520,  19266 ; w30 w28 w26 w24
+              dw -22725,  19266, -12873, -22725 ; w31 w29 w27 w25
+; Table for rows 1,7 - constants are multiplied by cos_1_16
+              dw  22725,  22725,  22725, -22725 ; movq-> w06 w04 w02 w00
+              dw  29692,  12299,  12299, -29692 ; w07 w05 w03 w01
+              dw  22725, -22725,  22725,  22725 ; w14 w12 w10 w08
+              dw -12299,  29692, -29692, -12299 ; w15 w13 w11 w09
+              dw  31521,  17855,  26722, -31521 ; w22 w20 w18 w16
+              dw  26722,   6270,  -6270, -17855 ; w23 w21 w19 w17
+              dw  17855,   6270,   6270,  26722 ; w30 w28 w26 w24
+              dw -31521,  26722, -17855, -31521 ; w31 w29 w27 w25
+; Table for rows 2,6 - constants are multiplied by cos_2_16
+              dw  21407,  21407,  21407, -21407 ; movq-> w06 w04 w02 w00
+              dw  27969,  11585,  11585, -27969 ; w07 w05 w03 w01
+              dw  21407, -21407,  21407,  21407 ; w14 w12 w10 w08
+              dw -11585,  27969, -27969, -11585 ; w15 w13 w11 w09
+              dw  29692,  16819,  25172, -29692 ; w22 w20 w18 w16
+              dw  25172,   5906,  -5906, -16819 ; w23 w21 w19 w17
+              dw  16819,   5906,   5906,  25172 ; w30 w28 w26 w24
+              dw -29692,  25172, -16819, -29692 ; w31 w29 w27 w25
+; Table for rows 3,5 - constants are multiplied by cos_3_16
+              dw  19266,  19266,  19266, -19266 ; movq-> w06 w04 w02 w00
+              dw  25172,  10426,  10426, -25172 ; w07 w05 w03 w01
+              dw  19266, -19266,  19266,  19266 ; w14 w12 w10 w08
+              dw -10426,  25172, -25172, -10426 ; w15 w13 w11 w09
+              dw  26722,  15137,  22654, -26722 ; w22 w20 w18 w16
+              dw  22654,   5315,  -5315, -15137 ; w23 w21 w19 w17
+              dw  15137,   5315,   5315,  22654 ; w30 w28 w26 w24
+              dw -26722,  22654, -15137, -26722 ; w31 w29 w27 w25
+
+; -----------------------------------------------------------------------------
+; Tables for xmm processors
+; -----------------------------------------------------------------------------
+
+; Table for rows 0,4 - constants are multiplied by cos_4_16
+tab_i_04_xmm: dw  16384,  21407,  16384,   8867 ; movq-> w05 w04 w01 w00
+              dw  16384,   8867, -16384, -21407 ; w07 w06 w03 w02
+              dw  16384,  -8867,  16384, -21407 ; w13 w12 w09 w08
+              dw -16384,  21407,  16384,  -8867 ; w15 w14 w11 w10
+              dw  22725,  19266,  19266,  -4520 ; w21 w20 w17 w16
+              dw  12873,   4520, -22725, -12873 ; w23 w22 w19 w18
+              dw  12873, -22725,   4520, -12873 ; w29 w28 w25 w24
+              dw   4520,  19266,  19266, -22725 ; w31 w30 w27 w26
+; Table for rows 1,7 - constants are multiplied by cos_1_16
+              dw  22725,  29692,  22725,  12299 ; movq-> w05 w04 w01 w00
+              dw  22725,  12299, -22725, -29692 ; w07 w06 w03 w02
+              dw  22725, -12299,  22725, -29692 ; w13 w12 w09 w08
+              dw -22725,  29692,  22725, -12299 ; w15 w14 w11 w10
+              dw  31521,  26722,  26722,  -6270 ; w21 w20 w17 w16
+              dw  17855,   6270, -31521, -17855 ; w23 w22 w19 w18
+              dw  17855, -31521,   6270, -17855 ; w29 w28 w25 w24
+              dw   6270,  26722,  26722, -31521 ; w31 w30 w27 w26
+; Table for rows 2,6 - constants are multiplied by cos_2_16
+              dw  21407,  27969,  21407,  11585 ; movq-> w05 w04 w01 w00
+              dw  21407,  11585, -21407, -27969 ; w07 w06 w03 w02
+              dw  21407, -11585,  21407, -27969 ; w13 w12 w09 w08
+              dw -21407,  27969,  21407, -11585 ; w15 w14 w11 w10
+              dw  29692,  25172,  25172,  -5906 ; w21 w20 w17 w16
+              dw  16819,   5906, -29692, -16819 ; w23 w22 w19 w18
+              dw  16819, -29692,   5906, -16819 ; w29 w28 w25 w24
+              dw   5906,  25172,  25172, -29692 ; w31 w30 w27 w26
+; Table for rows 3,5 - constants are multiplied by cos_3_16
+              dw  19266,  25172,  19266,  10426 ; movq-> w05 w04 w01 w00
+              dw  19266,  10426, -19266, -25172 ; w07 w06 w03 w02
+              dw  19266, -10426,  19266, -25172 ; w13 w12 w09 w08
+              dw -19266,  25172,  19266, -10426 ; w15 w14 w11 w10
+              dw  26722,  22654,  22654,  -5315 ; w21 w20 w17 w16
+              dw  15137,   5315, -26722, -15137 ; w23 w22 w19 w18
+              dw  15137, -26722,   5315, -15137 ; w29 w28 w25 w24
+              dw   5315,  22654,  22654, -26722 ; w31 w30 w27 w26
+%endif ; ~ARCH_X86_32
+
+; Similar to rounder_0 in MMX code
+; 4 first similar, then: 4*8->6*16  5*8->4*16  6/7*8->5*16
+walkenIdctRounders: times 4 dd 65536
+                    times 4 dd  3597
+                    times 4 dd  2260
+                    times 4 dd  1203
+                    times 4 dd   120
+                    times 4 dd   512
+                    times 2 dd     0
+
+pb_127: times 8 db 127
+
+SECTION .text
+
+; Temporary storage before the column pass
+%define ROW1 xmm6
+%define ROW3 xmm4
+%define ROW5 xmm5
+%define ROW7 xmm7
+
+%macro CLEAR_ODD 1 ; %1 = register to zero
+    pxor      %1, %1
+%endmacro
+%macro PUT_ODD 1 ; %1 = destination register for the row result held in xmm2
+    pshufhw   %1, xmm2, 0x1B ; copy xmm2 with its four high words in reversed order
+%endmacro
+
+%macro MOV32 2 ; %1 = source, %2 = destination; expands to nothing on x86-64
+%if ARCH_X86_32
+    movdqa    %2, %1
+%endif
+%endmacro
+
+%macro CLEAR_EVEN 1 ; zero %1 on x86-64 only; expands to nothing on x86-32
+%if ARCH_X86_64
+    CLEAR_ODD %1
+%endif
+%endmacro
+
+%macro PUT_EVEN 1 ; %1 = destination for the row result held in xmm2
+%if ARCH_X86_64
+    PUT_ODD   %1
+%else ; x86-32: reverse the high words in place, then copy to %1 with movdqa
+    pshufhw xmm2, xmm2, 0x1B
+    movdqa    %1, xmm2
+%endif
+%endmacro
+
+%if ARCH_X86_64 ; even rows and the temporaries each get their own register in xmm8-xmm14
+%define ROW0  xmm8
+%define REG0  ROW0
+%define ROW2  xmm9
+%define REG2  ROW2
+%define ROW4  xmm10
+%define REG4  ROW4
+%define ROW6  xmm11
+%define REG6  ROW6
+%define XMMS  xmm12
+%define SREG2 REG2
+%define TAN3  xmm13
+%define TAN1  xmm14
+%else ; x86-32: even rows are memory operands in BLOCK; REGn, XMMS and TAN* share xmm0-xmm7
+%define ROW0  [BLOCK + 0*16]
+%define REG0  xmm4
+%define ROW2  [BLOCK + 2*16]
+%define REG2  xmm4
+%define ROW4  [BLOCK + 4*16]
+%define REG4  xmm6
+%define ROW6  [BLOCK + 6*16]
+%define REG6  xmm6
+%define XMMS  xmm2
+%define SREG2 xmm7
+%define TAN3  xmm0
+%define TAN1  xmm2
+%endif
+
+%macro JZ  2 ; %1 = register to test, %2 = local label name (without the leading dot)
+    test      %1, %1
+    jz       .%2 ; taken when %1 is zero
+%endmacro
+
+%macro JNZ  2 ; %1 = register to test, %2 = local label name (without the leading dot)
+    test      %1, %1
+    jnz      .%2 ; taken when %1 is non-zero
+%endmacro
+
+%macro TEST_ONE_ROW 4 ; src, reg, clear, arg
+    %3        %4 ; run the supplied clear macro on its argument first
+    movq     mm1, [%1]
+    por      mm1, [%1 + 8] ; OR both 8-byte halves of the 16-byte row
+    paddusb  mm1, mm0 ; mm0 is set up by the caller - presumably pb_127, so any non-zero byte sets its top bit
+    pmovmskb  %2, mm1 ; %2 = mask of byte top bits
+%endmacro
+
+;row1, row2, reg1, reg2, clear1, arg1, clear2, arg2
+%macro  TEST_TWO_ROWS  8 ; two-row version of TEST_ONE_ROW
+    %5         %6 ; run both supplied clear macros first
+    %7         %8
+    movq      mm1, [%1 + 0]
+    por       mm1, [%1 + 8] ; OR both 8-byte halves of row1
+    movq      mm2, [%2 + 0]
+    por       mm2, [%2 + 8] ; OR both 8-byte halves of row2
+    paddusb   mm1, mm0 ; mm0 is set up by the caller - presumably pb_127, so any non-zero byte sets its top bit
+    paddusb   mm2, mm0
+    pmovmskb   %3, mm1 ; reg1 = mask of byte top bits for row1
+    pmovmskb   %4, mm2 ; reg2 = mask of byte top bits for row2
+%endmacro
+
+; IDCT pass on rows. Clobbers xmm0-xmm3; the result is left in xmm2 for the put macro.
+%macro iMTX_MULT   4-5 ; src, table, put, arg, rounder (optional byte offset into walkenIdctRounders)
+    movdqa       xmm3, [%1]
+    movdqa       xmm0, xmm3
+    pshufd       xmm1, xmm3, 0x11 ; 4602
+    punpcklqdq   xmm0, xmm0       ; 0246
+    pmaddwd      xmm0, [%2]
+    pmaddwd      xmm1, [%2+16]
+    pshufd       xmm2, xmm3, 0xBB ; 5713
+    punpckhqdq   xmm3, xmm3       ; 1357
+    pmaddwd      xmm2, [%2+32]
+    pmaddwd      xmm3, [%2+48]
+    paddd        xmm0, xmm1 ; sum of the even-coefficient products
+    paddd        xmm2, xmm3 ; sum of the odd-coefficient products
+%if %0 == 5
+    paddd        xmm0, [walkenIdctRounders+%5]
+%endif
+    movdqa       xmm3, xmm2
+    paddd        xmm2, xmm0 ; even + odd
+    psubd        xmm0, xmm3 ; even - odd
+    psrad        xmm2, 11
+    psrad        xmm0, 11
+    packssdw     xmm2, xmm0 ; low 4 words = sums, high 4 words = differences
+    %3           %4
+%endmacro
+
+%macro iLLM_HEAD 0 ; load the tan3/tan1 constants consumed by the column pass
+    movdqa   TAN3, [tan3]
+    movdqa   TAN1, [tan1]
+%endmacro
+
+%macro FIRST_HALF 2  ; %1=dct  %2=type(0=normal, 1=put, 2=add)
+    psraw    xmm5, 6 ; output row 6
+    psraw    REG0, 6 ; output row 5
+    psraw    TAN3, 6 ; output row 1
+    psraw    xmm3, 6 ; output row 2
+    ; dct coeffs must still be written for AC prediction
+%if %2 == 0
+    movdqa   [%1+1*16], TAN3
+    movdqa   [%1+2*16], xmm3
+    movdqa   [%1+5*16], REG0
+    movdqa   [%1+6*16], xmm5
+%else
+    ; Must now load args as gprs are no longer used for masks
+    ; DEST is set to where address of dest was loaded
+    %if ARCH_X86_32
+        %if %2 == 2 ; Not enough xmms, store (SECOND_HALF adds these rows from memory)
+    movdqa   [%1+1*16], TAN3
+    movdqa   [%1+2*16], xmm3
+    movdqa   [%1+5*16], REG0
+    movdqa   [%1+6*16], xmm5
+        %endif
+    %xdefine DEST r2q ; BLOCK is r0, stride r1
+    movifnidn DEST, destm
+    movifnidn strideq, stridem
+    %else
+    %xdefine DEST r0q
+    %endif
+    lea      r3q, [3*strideq] ; r3 = 3*stride, used by SECOND_HALF for rows 3 and 7
+    %if %2 == 1
+    packuswb TAN3, xmm3 ; rows 1 (low half) and 2 (high half) as unsigned bytes
+    packuswb xmm5, REG0 ; rows 6 (low half) and 5 (high half); written by SECOND_HALF
+    movq     [DEST + strideq], TAN3
+    movhps   [DEST + 2*strideq], TAN3
+    ; REG0 and TAN3 are now available (and likely used in second half)
+    %endif
+%endif
+%endmacro
+
+%macro SECOND_HALF 6 ; %1=dct  %2=type(0=normal, 1=put, 2=add) 3-6: xmms holding rows 0, 7, 3, 4
+    psraw    %3, 6 ; output row 0
+    psraw    %4, 6 ; output row 7
+    psraw    %5, 6 ; output row 3
+    psraw    %6, 6 ; output row 4
+    ; dct coeffs must still be written for AC prediction
+%if %2 == 0
+    movdqa   [%1+0*16], %3
+    movdqa   [%1+3*16], %5
+    movdqa   [%1+4*16], %6
+    movdqa   [%1+7*16], %4
+%elif %2 == 1
+    packuswb %3, %5 ; rows 0 (low half) and 3 (high half)
+    packuswb %6, %4 ; rows 4 (low half) and 7 (high half)
+    ; address of dest may have been loaded
+    movq     [DEST], %3
+    movhps   [DEST + r3q], %3
+    lea      DEST, [DEST + 4*strideq] ; DEST now points at row 4
+    movq     [DEST], %6
+    movhps   [DEST + r3q], %6
+    ; and now write remainder of first half
+    movq     [DEST + 2*strideq], xmm5 ; row 6
+    movhps   [DEST + strideq], xmm5 ; row 5
+%elif %2 == 2
+    pxor        xmm0, xmm0 ; zero, for widening dest bytes to words
+    %if ARCH_X86_32
+    ; free: m3 REG0=m4 m5
+    ; input: m1, m7, m2, m6
+    movq        xmm3, [DEST+0*strideq]
+    movq        xmm4, [DEST+1*strideq]
+    punpcklbw   xmm3, xmm0
+    punpcklbw   xmm4, xmm0
+    paddsw      xmm3, %3
+    paddsw      xmm4, [%1 + 1*16] ; row 1 as stored by FIRST_HALF
+    movq          %3, [DEST+2*strideq]
+    movq        xmm5, [DEST+      r3q]
+    punpcklbw     %3, xmm0
+    punpcklbw   xmm5, xmm0
+    paddsw        %3, [%1 + 2*16] ; row 2 as stored by FIRST_HALF
+    paddsw      xmm5, %5
+    packuswb    xmm3, xmm4
+    packuswb      %3, xmm5
+    movq    [DEST+0*strideq], xmm3
+    movhps  [DEST+1*strideq], xmm3
+    movq    [DEST+2*strideq], %3
+    movhps  [DEST+      r3q], %3
+    lea         DEST, [DEST+4*strideq] ; DEST now points at row 4
+    movq        xmm3, [DEST+0*strideq]
+    movq        xmm4, [DEST+1*strideq]
+    movq          %3, [DEST+2*strideq]
+    movq        xmm5, [DEST+      r3q]
+    punpcklbw   xmm3, xmm0
+    punpcklbw   xmm4, xmm0
+    punpcklbw     %3, xmm0
+    punpcklbw   xmm5, xmm0
+    paddsw      xmm3, %6
+    paddsw      xmm4, [%1 + 5*16] ; row 5 as stored by FIRST_HALF
+    paddsw        %3, [%1 + 6*16] ; row 6 as stored by FIRST_HALF
+    paddsw      xmm5, %4
+    packuswb    xmm3, xmm4
+    packuswb      %3, xmm5
+    movq    [DEST+0*strideq], xmm3
+    movhps  [DEST+1*strideq], xmm3
+    movq    [DEST+2*strideq], %3
+    movhps  [DEST+      r3q], %3
+    %else
+    ; l1:TAN3=m13  l2:m3  l5:REG0=m8 l6=m5
+    ; input: m1, m7/SREG2=m9, TAN1=m14, REG4=m10
+    movq        xmm2, [DEST+0*strideq]
+    movq        xmm4, [DEST+1*strideq]
+    movq       xmm12, [DEST+2*strideq]
+    movq       xmm11, [DEST+      r3q]
+    punpcklbw   xmm2, xmm0
+    punpcklbw   xmm4, xmm0
+    punpcklbw  xmm12, xmm0
+    punpcklbw  xmm11, xmm0
+    paddsw      xmm2, %3
+    paddsw      xmm4, TAN3 ; row 1, still in its register from FIRST_HALF
+    paddsw     xmm12, xmm3 ; row 2
+    paddsw     xmm11, %5
+    packuswb    xmm2, xmm4
+    packuswb   xmm12, xmm11
+    movq    [DEST+0*strideq], xmm2
+    movhps  [DEST+1*strideq], xmm2
+    movq    [DEST+2*strideq], xmm12
+    movhps  [DEST+      r3q], xmm12
+    lea         DEST, [DEST+4*strideq] ; DEST now points at row 4
+    movq        xmm2, [DEST+0*strideq]
+    movq        xmm4, [DEST+1*strideq]
+    movq       xmm12, [DEST+2*strideq]
+    movq       xmm11, [DEST+      r3q]
+    punpcklbw   xmm2, xmm0
+    punpcklbw   xmm4, xmm0
+    punpcklbw  xmm12, xmm0
+    punpcklbw  xmm11, xmm0
+    paddsw      xmm2, %6
+    paddsw      xmm4, REG0 ; row 5
+    paddsw     xmm12, xmm5 ; row 6
+    paddsw     xmm11, %4
+    packuswb    xmm2, xmm4
+    packuswb   xmm12, xmm11
+    movq    [DEST+0*strideq], xmm2
+    movhps  [DEST+1*strideq], xmm2
+    movq    [DEST+2*strideq], xmm12
+    movhps  [DEST+      r3q], xmm12
+    %endif
+%endif
+%endmacro
+
+
+; IDCT pass on columns. Expects odd rows in xmm6/xmm4/xmm5/xmm7 (ROW1/3/5/7) and TAN3/TAN1 loaded.
+%macro iLLM_PASS  2  ; %1=dct  %2=type(0=normal, 1=put, 2=add)
+    movdqa   xmm1, TAN3
+    movdqa   xmm3, TAN1
+    pmulhw   TAN3, xmm4
+    pmulhw   xmm1, xmm5
+    paddsw   TAN3, xmm4
+    paddsw   xmm1, xmm5
+    psubsw   TAN3, xmm5 ; x3*tg_3_16-x5 = tm35
+    paddsw   xmm1, xmm4 ; x3+x5*tg_3_16 = tp35
+    pmulhw   xmm3, xmm7
+    pmulhw   TAN1, xmm6
+    paddsw   xmm3, xmm6 ; x1+x7*tg_1_16 = tp17
+    psubsw   TAN1, xmm7 ; x1*tg_1_16-x7 = tm17
+    movdqa   xmm7, xmm3
+    movdqa   xmm6, TAN1
+    psubsw   xmm3, xmm1 ; tp17-tp35 = t1
+    psubsw   TAN1, TAN3 ; tm17-tm35 = b3
+    paddsw   xmm1, xmm7 ; tp17+tp35 = b0
+    paddsw   TAN3, xmm6 ; tm17+tm35 = t2
+    movdqa   xmm6, xmm3
+    psubsw   xmm3, TAN3 ; t1-t2
+    paddsw   TAN3, xmm6 ; t1+t2
+    movdqa   xmm4, [sqrt2]
+    pmulhw   xmm3, xmm4
+    pmulhw   TAN3, xmm4
+    paddsw   TAN3, TAN3 ; b1
+    paddsw   xmm3, xmm3 ; b2
+    movdqa   xmm7, [tan2]
+    MOV32    ROW2, REG2 ; x86-32 only: load row 2 from memory
+    MOV32    ROW6, REG6 ; x86-32 only: load row 6 from memory
+    movdqa   xmm5, xmm7
+    pmulhw   xmm7, REG6
+    pmulhw   xmm5, REG2
+    paddsw   xmm7, REG2 ; x2+x6*tg_2_16 = tp26
+    psubsw   xmm5, REG6 ; x2*tg_2_16-x6 = tm26
+    MOV32    ROW0, REG0 ; x86-32 only: load row 0 from memory
+    MOV32    ROW4, REG4 ; x86-32 only: load row 4 from memory
+    MOV32    TAN1, [BLOCK] ; x86-32 only: spill TAN1 (b3), since XMMS is the same register there
+    movdqa   XMMS, REG0
+    psubsw   REG0, REG4 ; x0-x4 = tm04
+    paddsw   REG4, XMMS ; x0+x4 = tp04
+    movdqa   XMMS, REG4
+    psubsw   REG4, xmm7 ; tp04-tp26 = a3
+    paddsw   xmm7, XMMS ; tp04+tp26 = a0
+    movdqa   XMMS, REG0
+    psubsw   REG0, xmm5 ; tm04-tm26 = a2
+    paddsw   xmm5, XMMS ; tm04+tm26 = a1
+    movdqa   XMMS, xmm5
+    psubsw   xmm5, TAN3 ; a1-b1
+    paddsw   TAN3, XMMS ; a1+b1
+    movdqa   XMMS, REG0
+    psubsw   REG0, xmm3 ; a2-b2
+    paddsw   xmm3, XMMS ; a2+b2
+    MOV32    [BLOCK], TAN1 ; x86-32 only: reload the spilled TAN1 (b3)
+
+    FIRST_HALF %1, %2
+
+    movdqa   xmm0, xmm7
+    movdqa   xmm4, REG4
+    psubsw   xmm7, xmm1 ; a0-b0
+    psubsw   REG4, TAN1 ; a3-b3
+    paddsw   xmm1, xmm0 ; a0+b0
+    paddsw   TAN1, xmm4 ; a3+b3
+
+    SECOND_HALF %1, %2, xmm1, xmm7, TAN1, REG4
+%endmacro
+
+; IDCT pass on columns, assuming rows 4-7 are zero
+%macro iLLM_PASS_SPARSE   2 ; %1=dct   %2=type(0=normal, 1=put, 2=add)
+    pmulhw   TAN3, xmm4
+    paddsw   TAN3, xmm4 ; x3*tg_3_16 (tm35 with x5 = 0)
+    movdqa   xmm3, xmm6
+    pmulhw   TAN1, xmm6 ; x1*tg_1_16 (tm17 with x7 = 0)
+    movdqa   xmm1, xmm4
+    psubsw   xmm3, xmm1 ; x1-x3 = t1
+    paddsw   xmm1, xmm6 ; x1+x3 = b0
+    movdqa   xmm6, TAN1
+    psubsw   TAN1, TAN3 ; tm17-tm35 = b3
+    paddsw   TAN3, xmm6 ; tm17+tm35 = t2
+    movdqa   xmm6, xmm3
+    psubsw   xmm3, TAN3 ; t1-t2
+    paddsw   TAN3, xmm6 ; t1+t2
+    movdqa   xmm4, [sqrt2]
+    pmulhw   xmm3, xmm4
+    pmulhw   TAN3, xmm4
+    paddsw   TAN3, TAN3 ; b1
+    paddsw   xmm3, xmm3 ; b2
+    movdqa   xmm5, [tan2]
+    MOV32    ROW2, SREG2 ; x86-32 only: load row 2 from memory
+    pmulhw   xmm5, SREG2 ; x2*tg_2_16 (tm26 with x6 = 0)
+    MOV32    ROW0, REG0 ; x86-32 only: load row 0 from memory
+    movdqa   xmm6, REG0
+    psubsw   xmm6, SREG2 ; x0-x2 = a3
+    paddsw  SREG2, REG0 ; x0+x2 = a0
+    MOV32    TAN1, [BLOCK] ; x86-32 only: spill TAN1 (b3), since XMMS is the same register there
+    movdqa   XMMS, REG0
+    psubsw   REG0, xmm5 ; x0-tm26 = a2
+    paddsw   xmm5, XMMS ; x0+tm26 = a1
+    movdqa   XMMS, xmm5
+    psubsw   xmm5, TAN3 ; a1-b1
+    paddsw   TAN3, XMMS ; a1+b1
+    movdqa   XMMS, REG0
+    psubsw   REG0, xmm3 ; a2-b2
+    paddsw   xmm3, XMMS ; a2+b2
+    MOV32    [BLOCK], TAN1 ; x86-32 only: reload the spilled TAN1 (b3)
+
+    FIRST_HALF %1, %2
+
+    movdqa   xmm0, SREG2
+    movdqa   xmm4, xmm6
+    psubsw  SREG2, xmm1 ; a0-b0
+    psubsw   xmm6, TAN1 ; a3-b3
+    paddsw   xmm1, xmm0 ; a0+b0
+    paddsw   TAN1, xmm4 ; a3+b3
+
+    SECOND_HALF %1, %2, xmm1, SREG2, TAN1, xmm6
+%endmacro
+
+%macro IDCT_SSE2 1 ; 0=normal  1=put  2=add
+%if %1 == 0 || ARCH_X86_32 ; block is in r0 here, so the row masks go in r1-r4
+    %define GPR0  r1d
+    %define GPR1  r2d
+    %define GPR2  r3d
+    %define GPR3  r4d
+    %define NUM_GPRS 5
+%else ; x86-64 put/add: dest, stride, block are the first three args, masks go in r3-r6
+    %define GPR0  r3d
+    %define GPR1  r4d
+    %define GPR2  r5d
+    %define GPR3  r6d
+    %define NUM_GPRS 7
+%endif
+%if %1 == 0
+cglobal xvid_idct, 1, NUM_GPRS, 8+7*ARCH_X86_64, block
+%xdefine BLOCK blockq
+%else
+    %if %1 == 1
+cglobal xvid_idct_put, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
+    %else
+cglobal xvid_idct_add, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
+    %endif
+    %if ARCH_X86_64
+    %xdefine BLOCK blockq
+    %else
+    mov    r0q, blockm ; dest/stride are loaded later, in FIRST_HALF
+    %xdefine BLOCK r0q
+    %endif
+%endif
+    movq           mm0, [pb_127] ; bias used by the TEST_*_ROW(S) nonzero checks
+    iMTX_MULT      BLOCK + 0*16, iTab1, PUT_EVEN, ROW0, 0*16
+    iMTX_MULT      BLOCK + 1*16, iTab2, PUT_ODD, ROW1,  1*16
+    iMTX_MULT      BLOCK + 2*16, iTab3, PUT_EVEN, ROW2, 2*16
+
+    TEST_TWO_ROWS  BLOCK + 3*16, BLOCK + 4*16, GPR0, GPR1, CLEAR_ODD, ROW3, CLEAR_EVEN, ROW4 ; a, c: GPR0 = row 3 mask, GPR1 = row 4 mask
+    JZ   GPR0, col1 ; row 3 all zero: keep the cleared ROW3
+    iMTX_MULT      BLOCK + 3*16, iTab4, PUT_ODD, ROW3,  3*16
+.col1:
+    TEST_TWO_ROWS  BLOCK + 5*16, BLOCK + 6*16, GPR0, GPR2, CLEAR_ODD, ROW5, CLEAR_EVEN, ROW6 ; a, d: GPR0 = row 5 mask, GPR2 = row 6 mask
+    TEST_ONE_ROW   BLOCK + 7*16, GPR3, CLEAR_ODD, ROW7 ; esi: GPR3 = row 7 mask
+
+    iLLM_HEAD
+    JNZ  GPR1, 2 ; row 4 nonzero
+    JNZ  GPR0, 3 ; row 5 nonzero
+    JNZ  GPR2, 4 ; row 6 nonzero
+    JNZ  GPR3, 5 ; row 7 nonzero
+    iLLM_PASS_SPARSE BLOCK, %1 ; rows 4-7 are all zero
+    jmp .6
+.2:
+    iMTX_MULT     BLOCK + 4*16, iTab1, PUT_EVEN, ROW4 ; row 4 takes no rounder
+.3:
+    iMTX_MULT     BLOCK + 5*16, iTab4, PUT_ODD, ROW5,  4*16
+    JZ   GPR2, col2 ; skip row 6 if it is all zero
+.4:
+    iMTX_MULT     BLOCK + 6*16, iTab3, PUT_EVEN, ROW6, 5*16
+.col2:
+    JZ   GPR3, col3 ; skip row 7 if it is all zero
+.5:
+    iMTX_MULT     BLOCK + 7*16, iTab2, PUT_ODD, ROW7,  5*16
+.col3:
+%if ARCH_X86_32 ; TAN3/TAN1 are xmm0/xmm2 here and were overwritten by iMTX_MULT above
+    iLLM_HEAD
+%endif
+    iLLM_PASS     BLOCK, %1
+.6:
+    RET
+%endmacro
+
+INIT_XMM sse2
+IDCT_SSE2 0 ; xvid_idct: result written back into the block
+IDCT_SSE2 1 ; xvid_idct_put: result stored to dest
+IDCT_SSE2 2 ; xvid_idct_add: result added to dest
+
+%if ARCH_X86_32
+
+; %1=row index (row is at r0+16*%1, transformed in place)  %2=address of the row's coefficient table
+; %3=byte offset into walkenIdctRounders, where 4*8->6*16  5*8->4*16  6/7*8->5*16
+%macro DCT_8_INV_ROW  3
+    movq       mm0, [r0+16*%1+0]  ; 0 ; x3 x2 x1 x0
+    movq       mm1, [r0+16*%1+8]  ; 1 ; x7 x6 x5 x4
+    movq       mm2, mm0       ; 2 ; x3 x2 x1 x0
+    movq       mm3, [%2+ 0]   ; 3 ; w06 w04 w02 w00
+%if cpuflag(mmxext)
+    pshufw     mm0, mm0, 0x88 ; x2 x0 x2 x0
+    movq       mm4, [%2+ 8]   ; 4 ; w07 w06 w03 w02
+    movq       mm5, mm1       ; 5 ; x7 x6 x5 x4
+    pmaddwd    mm3, mm0       ; x2*w05+x0*w04 x2*w01+x0*w00
+    movq       mm6, [%2+32]   ; 6 ; w21 w20 w17 w16
+    pshufw     mm1, mm1, 0x88 ; x6 x4 x6 x4
+    pmaddwd    mm4, mm1       ; x6*w07+x4*w06 x6*w03+x4*w02
+    movq       mm7, [%2+40]   ; 7; w23 w22 w19 w18
+    pshufw     mm2, mm2, 0xdd ; x3 x1 x3 x1
+    pmaddwd    mm6, mm2       ; x3*w21+x1*w20 x3*w17+x1*w16
+    pshufw     mm5, mm5, 0xdd ; x7 x5 x7 x5
+    pmaddwd    mm7, mm5       ; x7*w23+x5*w22 x7*w19+x5*w18
+    paddd      mm3, [walkenIdctRounders + %3]      ; +%3
+    pmaddwd    mm0, [%2+16]   ; x2*w13+x0*w12 x2*w09+x0*w08
+    paddd      mm3, mm4       ; 4 ; a1=sum(even1) a0=sum(even0)
+    pmaddwd    mm1, [%2+24]   ; x6*w15+x4*w14 x6*w11+x4*w10
+    movq       mm4, mm3       ; 4 ; a1 a0
+    pmaddwd    mm2, [%2+48]   ; x3*w29+x1*w28 x3*w25+x1*w24
+    paddd      mm6, mm7       ; 7 ; b1=sum(odd1) b0=sum(odd0)
+    pmaddwd    mm5, [%2+56]   ; x7*w31+x5*w30 x7*w27+x5*w26
+    paddd      mm3, mm6       ; a1+b1 a0+b0
+    paddd      mm0, [walkenIdctRounders + %3]      ; +%3
+    psrad      mm3, 11        ; y1=a1+b1 y0=a0+b0
+    paddd      mm0, mm1       ; 1 ; a3=sum(even3) a2=sum(even2)
+    psubd      mm4, mm6       ; 6 ; a1-b1 a0-b0
+    movq       mm7, mm0       ; 7 ; a3 a2
+    paddd      mm2, mm5       ; 5 ; b3=sum(odd3) b2=sum(odd2)
+    paddd      mm0, mm2       ; a3+b3 a2+b2
+    psrad      mm4, 11        ; y6=a1-b1 y7=a0-b0
+    psubd      mm7, mm2       ; 2 ; a3-b3 a2-b2
+    psrad      mm0, 11        ; y3=a3+b3 y2=a2+b2
+    psrad      mm7, 11        ; y4=a3-b3 y5=a2-b2
+    packssdw   mm3, mm0       ; 0 ; y3 y2 y1 y0
+    packssdw   mm7, mm4       ; 4 ; y6 y7 y4 y5
+    movq  [r0+16*%1+0], mm3       ; 3 ; save y3 y2 y1 y0
+    pshufw     mm7, mm7, 0xb1 ; y7 y6 y5 y4
+%else
+    punpcklwd  mm0, mm1       ; x5 x1 x4 x0
+    movq       mm5, mm0       ; 5 ; x5 x1 x4 x0
+    punpckldq  mm0, mm0       ; x4 x0 x4 x0
+    movq       mm4, [%2+ 8]   ; 4 ; w07 w05 w03 w01
+    punpckhwd  mm2, mm1       ; 1 ; x7 x3 x6 x2
+    pmaddwd    mm3, mm0       ; x4*w06+x0*w04 x4*w02+x0*w00
+    movq       mm6, mm2       ; 6 ; x7 x3 x6 x2
+    movq       mm1, [%2+32]   ; 1 ; w22 w20 w18 w16
+    punpckldq  mm2, mm2       ; x6 x2 x6 x2
+    pmaddwd    mm4, mm2       ; x6*w07+x2*w05 x6*w03+x2*w01
+    punpckhdq  mm5, mm5       ; x5 x1 x5 x1
+    pmaddwd    mm0, [%2+16]   ; x4*w14+x0*w12 x4*w10+x0*w08
+    punpckhdq  mm6, mm6       ; x7 x3 x7 x3
+    movq       mm7, [%2+40]   ; 7 ; w23 w21 w19 w17
+    pmaddwd    mm1, mm5       ; x5*w22+x1*w20 x5*w18+x1*w16
+    paddd      mm3, [walkenIdctRounders + %3]     ; +%3
+    pmaddwd    mm7, mm6       ; x7*w23+x3*w21 x7*w19+x3*w17
+    pmaddwd    mm2, [%2+24]   ; x6*w15+x2*w13 x6*w11+x2*w09
+    paddd      mm3, mm4       ; 4 ; a1=sum(even1) a0=sum(even0)
+    pmaddwd    mm5, [%2+48]   ; x5*w30+x1*w28 x5*w26+x1*w24
+    movq       mm4, mm3       ; 4 ; a1 a0
+    pmaddwd    mm6, [%2+56]   ; x7*w31+x3*w29 x7*w27+x3*w25
+    paddd      mm1, mm7       ; 7 ; b1=sum(odd1) b0=sum(odd0)
+    paddd      mm0, [walkenIdctRounders + %3]     ; +%3
+    psubd      mm3, mm1       ; a1-b1 a0-b0
+    psrad      mm3, 11        ; y6=a1-b1 y7=a0-b0
+    paddd      mm1, mm4       ; 4 ; a1+b1 a0+b0
+    paddd      mm0, mm2       ; 2 ; a3=sum(even3) a2=sum(even2)
+    psrad      mm1, 11        ; y1=a1+b1 y0=a0+b0
+    paddd      mm5, mm6       ; 6 ; b3=sum(odd3) b2=sum(odd2)
+    movq       mm4, mm0       ; 4 ; a3 a2
+    paddd      mm0, mm5       ; a3+b3 a2+b2
+    psubd      mm4, mm5       ; 5 ; a3-b3 a2-b2
+    psrad      mm0, 11        ; y3=a3+b3 y2=a2+b2
+    psrad      mm4, 11        ; y4=a3-b3 y5=a2-b2
+    packssdw   mm1, mm0       ; 0 ; y3 y2 y1 y0
+    packssdw   mm4, mm3       ; 3 ; y6 y7 y4 y5
+    movq       mm7, mm4       ; 7 ; y6 y7 y4 y5
+    psrld      mm4, 16        ; 0 y6 0 y4
+    pslld      mm7, 16        ; y7 0 y5 0
+    movq  [r0+16*%1+0], mm1   ; 1 ; save y3 y2 y1 y0
+    por        mm7, mm4       ; 4 ; y7 y6 y5 y4
+%endif
+    movq  [r0+16*%1+8], mm7   ; 7 ; save y7 y6 y5 y4
+%endmacro
+
+; -----------------------------------------------------------------------------
+;
+; The first stage DCT 8x8 - forward DCTs of columns
+;
+; The outputs are multiplied
+; for rows 0,4 - by cos_4_16,
+; for rows 1,7 - by cos_1_16,
+; for rows 2,6 - by cos_2_16,
+; for rows 3,5 - by cos_3_16
+; and are shifted to the left for better accuracy
+;
+; -----------------------------------------------------------------------------
+;
+; The 8-point scaled forward DCT algorithm (26a8m)
+;
+; -----------------------------------------------------------------------------
+;
+;#define DCT_8_FRW_COL(x, y)
+; {
+;     short t0, t1, t2, t3, t4, t5, t6, t7;
+;     short tp03, tm03, tp12, tm12, tp65, tm65;
+;     short tp465, tm465, tp765, tm765;
+;
+;     t0 = LEFT_SHIFT(x[0] + x[7]);
+;     t1 = LEFT_SHIFT(x[1] + x[6]);
+;     t2 = LEFT_SHIFT(x[2] + x[5]);
+;     t3 = LEFT_SHIFT(x[3] + x[4]);
+;     t4 = LEFT_SHIFT(x[3] - x[4]);
+;     t5 = LEFT_SHIFT(x[2] - x[5]);
+;     t6 = LEFT_SHIFT(x[1] - x[6]);
+;     t7 = LEFT_SHIFT(x[0] - x[7]);
+;
+;     tp03 = t0 + t3;
+;     tm03 = t0 - t3;
+;     tp12 = t1 + t2;
+;     tm12 = t1 - t2;
+;
+;     y[0] = tp03 + tp12;
+;     y[4] = tp03 - tp12;
+;
+;     y[2] = tm03 + tm12 * tg_2_16;
+;     y[6] = tm03 * tg_2_16 - tm12;
+;
+;     tp65 = (t6 + t5) * cos_4_16;
+;     tm65 = (t6 - t5) * cos_4_16;
+;
+;     tp765 = t7 + tp65;
+;     tm765 = t7 - tp65;
+;     tp465 = t4 + tm65;
+;     tm465 = t4 - tm65;
+;
+;     y[1] = tp765 + tp465 * tg_1_16;
+;     y[7] = tp765 * tg_1_16 - tp465;
+;     y[5] = tm765 * tg_3_16 + tm465;
+;     y[3] = tm765 - tm465 * tg_3_16;
+; }
+;
+; -----------------------------------------------------------------------------
+
+; -----------------------------------------------------------------------------
+; DCT_8_INV_COL  %1 = address of row 0 (rows are 16 bytes apart); 4 columns are transformed in place
+; -----------------------------------------------------------------------------
+%macro DCT_8_INV_COL 1
+    movq        mm0, [tan3]
+    movq        mm3, [%1+16*3]
+    movq        mm1, mm0 ; tg_3_16
+    movq        mm5, [%1+16*5]
+    pmulhw      mm0, mm3 ; x3*(tg_3_16-1)
+    movq        mm4, [tan1]
+    pmulhw      mm1, mm5 ; x5*(tg_3_16-1)
+    movq        mm7, [%1+16*7]
+    movq        mm2, mm4 ; tg_1_16
+    movq        mm6, [%1+16*1]
+    pmulhw      mm4, mm7 ; x7*tg_1_16
+    paddsw      mm0, mm3 ; x3*tg_3_16
+    pmulhw      mm2, mm6 ; x1*tg_1_16
+    paddsw      mm1, mm3 ; x3+x5*(tg_3_16-1)
+    psubsw      mm0, mm5 ; x3*tg_3_16-x5 = tm35
+    movq        mm3, [sqrt2]
+    paddsw      mm1, mm5 ; x3+x5*tg_3_16 = tp35
+    paddsw      mm4, mm6 ; x1+tg_1_16*x7 = tp17
+    psubsw      mm2, mm7 ; x1*tg_1_16-x7 = tm17
+    movq        mm5, mm4 ; tp17
+    movq        mm6, mm2 ; tm17
+    paddsw      mm5, mm1 ; tp17+tp35 = b0
+    psubsw      mm6, mm0 ; tm17-tm35 = b3
+    psubsw      mm4, mm1 ; tp17-tp35 = t1
+    paddsw      mm2, mm0 ; tm17+tm35 = t2
+    movq        mm7, [tan2]
+    movq        mm1, mm4 ; t1
+    movq  [%1+3*16], mm5 ; save b0
+    paddsw      mm1, mm2 ; t1+t2
+    movq  [%1+5*16], mm6 ; save b3
+    psubsw      mm4, mm2 ; t1-t2
+    movq        mm5, [%1+2*16]
+    movq        mm0, mm7 ; tg_2_16
+    movq        mm6, [%1+6*16]
+    pmulhw      mm0, mm5 ; x2*tg_2_16
+    pmulhw      mm7, mm6 ; x6*tg_2_16
+    pmulhw      mm1, mm3 ; ocos_4_16*(t1+t2) = b1/2
+    movq        mm2, [%1+0*16]
+    pmulhw      mm4, mm3 ; ocos_4_16*(t1-t2) = b2/2
+    psubsw      mm0, mm6 ; x2*tg_2_16-x6 = tm26
+    movq        mm3, mm2 ; x0
+    movq        mm6, [%1+4*16]
+    paddsw      mm7, mm5 ; x2+x6*tg_2_16 = tp26
+    paddsw      mm2, mm6 ; x0+x4 = tp04
+    psubsw      mm3, mm6 ; x0-x4 = tm04
+    movq        mm5, mm2 ; tp04
+    movq        mm6, mm3 ; tm04
+    psubsw      mm2, mm7 ; tp04-tp26 = a3
+    paddsw      mm3, mm0 ; tm04+tm26 = a1
+    paddsw      mm1, mm1 ; b1
+    paddsw      mm4, mm4 ; b2
+    paddsw      mm5, mm7 ; tp04+tp26 = a0
+    psubsw      mm6, mm0 ; tm04-tm26 = a2
+    movq        mm7, mm3 ; a1
+    movq        mm0, mm6 ; a2
+    paddsw      mm3, mm1 ; a1+b1
+    paddsw      mm6, mm4 ; a2+b2
+    psraw       mm3, 6   ; dst1
+    psubsw      mm7, mm1 ; a1-b1
+    psraw       mm6, 6   ; dst2
+    psubsw      mm0, mm4 ; a2-b2
+    movq        mm1, [%1+3*16] ; load b0
+    psraw       mm7, 6   ; dst6
+    movq        mm4, mm5 ; a0
+    psraw       mm0, 6   ; dst5
+    movq  [%1+1*16], mm3
+    paddsw      mm5, mm1 ; a0+b0
+    movq  [%1+2*16], mm6
+    psubsw      mm4, mm1 ; a0-b0
+    movq        mm3, [%1+5*16] ; load b3
+    psraw       mm5, 6   ; dst0
+    movq        mm6, mm2 ; a3
+    psraw       mm4, 6   ; dst7
+    movq  [%1+5*16], mm0
+    paddsw      mm2, mm3 ; a3+b3
+    movq  [%1+6*16], mm7
+    psubsw      mm6, mm3 ; a3-b3
+    movq  [%1+0*16], mm5
+    psraw       mm2, 6   ; dst3
+    movq  [%1+7*16], mm4
+    psraw       mm6, 6   ; dst4
+    movq  [%1+3*16], mm2
+    movq  [%1+4*16], mm6
+%endmacro
+
+%macro XVID_IDCT_MMX 0 ; emits xvid_idct for the current INIT_MMX cpu flavour; the block at r0 is transformed in place
+cglobal xvid_idct, 1, 1, 0, block
+%if cpuflag(mmxext)
+%define TAB tab_i_04_xmm
+%else
+%define TAB tab_i_04_mmx
+%endif
+    ; Process each row - beware of rounder offset
+    ; (arguments: row index, coefficient table, byte offset into walkenIdctRounders)
+    DCT_8_INV_ROW  0, TAB + 64 * 0, 0*16
+    DCT_8_INV_ROW  1, TAB + 64 * 1, 1*16
+    DCT_8_INV_ROW  2, TAB + 64 * 2, 2*16
+    DCT_8_INV_ROW  3, TAB + 64 * 3, 3*16
+    DCT_8_INV_ROW  4, TAB + 64 * 0, 6*16 ; same table as row 0, zero rounder
+    DCT_8_INV_ROW  5, TAB + 64 * 3, 4*16 ; same table as row 3
+    DCT_8_INV_ROW  6, TAB + 64 * 2, 5*16 ; same table as row 2
+    DCT_8_INV_ROW  7, TAB + 64 * 1, 5*16 ; same table as row 1, same rounder as row 6
+
+    ; Process the columns (4 at a time)
+    DCT_8_INV_COL  r0+0
+    DCT_8_INV_COL  r0+8
+
+    RET
+%endmacro
+
+INIT_MMX mmx
+XVID_IDCT_MMX
+INIT_MMX mmxext
+XVID_IDCT_MMX
+
+%endif ; ~ARCH_X86_32
diff --git a/libavcodec/x86/xvididct.h b/libavcodec/x86/xvididct.h
index 13a4e85..573b25c 100644
--- a/libavcodec/x86/xvididct.h
+++ b/libavcodec/x86/xvididct.h
@@ -1,20 +1,20 @@
 /*
  * XVID MPEG-4 VIDEO CODEC
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,7 +37,7 @@ void ff_xvid_idct_mmxext_put(uint8_t *dest, int line_size, int16_t *block);
 void ff_xvid_idct_mmxext_add(uint8_t *dest, int line_size, int16_t *block);
 
 void ff_xvid_idct_sse2(short *block);
-void ff_xvid_idct_sse2_put(uint8_t *dest, int line_size, short *block);
-void ff_xvid_idct_sse2_add(uint8_t *dest, int line_size, short *block);
+void ff_xvid_idct_put_sse2(uint8_t *dest, int line_size, short *block);
+void ff_xvid_idct_add_sse2(uint8_t *dest, int line_size, short *block);
 
 #endif /* AVCODEC_X86_XVIDIDCT_H */
diff --git a/libavcodec/x86/xvididct_init.c b/libavcodec/x86/xvididct_init.c
index e4f7345..8b9d8de 100644
--- a/libavcodec/x86/xvididct_init.c
+++ b/libavcodec/x86/xvididct_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,9 +26,36 @@
 #include "idctdsp.h"
 #include "xvididct.h"
 
+#if ARCH_X86_32 && HAVE_YASM
+static void xvid_idct_mmx_put(uint8_t *dest, int line_size, short *block)
+{
+    ff_xvid_idct_mmx(block); /* transforms block in place */
+    ff_put_pixels_clamped(block, dest, line_size); /* then hands the result to the put helper */
+}
+
+static void xvid_idct_mmx_add(uint8_t *dest, int line_size, short *block)
+{
+    ff_xvid_idct_mmx(block); /* transforms block in place */
+    ff_add_pixels_clamped(block, dest, line_size); /* then hands the result to the add helper */
+}
+
+static void xvid_idct_mmxext_put(uint8_t *dest, int line_size, short *block)
+{
+    ff_xvid_idct_mmxext(block); /* transforms block in place */
+    ff_put_pixels_clamped(block, dest, line_size); /* then hands the result to the put helper */
+}
+
+static void xvid_idct_mmxext_add(uint8_t *dest, int line_size, short *block)
+{
+    ff_xvid_idct_mmxext(block); /* transforms block in place */
+    ff_add_pixels_clamped(block, dest, line_size); /* then hands the result to the add helper */
+}
+#endif /* ARCH_X86_32 && HAVE_YASM */
+
 av_cold void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
                                    unsigned high_bit_depth)
 {
+#if HAVE_YASM
     int cpu_flags = av_get_cpu_flags();
 
     if (high_bit_depth ||
@@ -36,24 +63,27 @@ av_cold void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
           avctx->idct_algo == FF_IDCT_XVID))
         return;
 
-    if (INLINE_MMX(cpu_flags)) {
-        c->idct_put  = ff_xvid_idct_mmx_put;
-        c->idct_add  = ff_xvid_idct_mmx_add;
+#if ARCH_X86_32
+    if (EXTERNAL_MMX(cpu_flags)) {
+        c->idct_put  = xvid_idct_mmx_put;
+        c->idct_add  = xvid_idct_mmx_add;
         c->idct      = ff_xvid_idct_mmx;
         c->perm_type = FF_IDCT_PERM_NONE;
     }
 
-    if (INLINE_MMXEXT(cpu_flags)) {
-        c->idct_put  = ff_xvid_idct_mmxext_put;
-        c->idct_add  = ff_xvid_idct_mmxext_add;
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        c->idct_put  = xvid_idct_mmxext_put;
+        c->idct_add  = xvid_idct_mmxext_add;
         c->idct      = ff_xvid_idct_mmxext;
         c->perm_type = FF_IDCT_PERM_NONE;
     }
+#endif
 
-    if (INLINE_SSE2(cpu_flags)) {
-        c->idct_put  = ff_xvid_idct_sse2_put;
-        c->idct_add  = ff_xvid_idct_sse2_add;
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->idct_put  = ff_xvid_idct_put_sse2;
+        c->idct_add  = ff_xvid_idct_add_sse2;
         c->idct      = ff_xvid_idct_sse2;
         c->perm_type = FF_IDCT_PERM_SSE2;
     }
+#endif /* HAVE_YASM */
 }
diff --git a/libavcodec/x86/xvididct_mmx.c b/libavcodec/x86/xvididct_mmx.c
deleted file mode 100644
index e371142..0000000
--- a/libavcodec/x86/xvididct_mmx.c
+++ /dev/null
@@ -1,548 +0,0 @@
-/*
- * XVID MPEG-4 VIDEO CODEC
- * - MMX and XMM forward discrete cosine transform -
- *
- * Copyright(C) 2001 Peter Ross <pross@xvid.org>
- *
- * Originally provided by Intel at AP-922
- * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
- * (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm)
- * but in a limited edition.
- * New macro implements a column part for precise iDCT
- * The routine precision now satisfies IEEE standard 1180-1990.
- *
- * Copyright(C) 2000-2001 Peter Gubanov <peter@elecard.net.ru>
- * Rounding trick Copyright(C) 2000 Michel Lespinasse <walken@zoy.org>
- *
- * http://www.elecard.com/peter/idct.html
- * http://www.linuxvideo.org/mpeg2dec/
- *
- * These examples contain code fragments for first stage iDCT 8x8
- * (for rows) and first stage DCT 8x8 (for columns)
- *
- * conversion to gcc syntax by Michael Niedermayer
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with Libav; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <inttypes.h>
-
-#include "config.h"
-
-#include "libavutil/mem.h"
-
-#include "libavcodec/avcodec.h"
-
-#include "idctdsp.h"
-#include "xvididct.h"
-
-#if HAVE_MMX_INLINE
-
-// -----------------------------------------------------------------------------
-// Various memory constants (trigonometric values or rounding values)
-// -----------------------------------------------------------------------------
-
-DECLARE_ALIGNED(8, static const int16_t, tg_1_16)[4 * 4] = {
-     13036,  13036,  13036,  13036, // tg * (2 << 16) + 0.5
-     27146,  27146,  27146,  27146, // tg * (2 << 16) + 0.5
-    -21746, -21746, -21746, -21746, // tg * (2 << 16) + 0.5
-     23170,  23170,  23170,  23170
-};                                  // cos * (2 << 15) + 0.5
-
-DECLARE_ALIGNED(8, static const int32_t, rounder_0)[2 * 8] = {
-    65536, 65536,
-    3597,   3597,
-    2260,   2260,
-    1203,   1203,
-    0,         0,
-    120,     120,
-    512,     512,
-    512, 512
-};
-
-// -----------------------------------------------------------------------------
-//
-// The first stage iDCT 8x8 - inverse DCTs of rows
-//
-// -----------------------------------------------------------------------------
-// The 8-point inverse DCT direct algorithm
-// -----------------------------------------------------------------------------
-//
-// static const short w[32] = {
-//     FIX(cos_4_16),  FIX(cos_2_16),  FIX(cos_4_16),  FIX(cos_6_16),
-//     FIX(cos_4_16),  FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16),
-//     FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16),  FIX(cos_2_16),
-//     FIX(cos_4_16), -FIX(cos_2_16),  FIX(cos_4_16), -FIX(cos_6_16),
-//     FIX(cos_1_16),  FIX(cos_3_16),  FIX(cos_5_16),  FIX(cos_7_16),
-//     FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16),
-//     FIX(cos_5_16), -FIX(cos_1_16),  FIX(cos_7_16),  FIX(cos_3_16),
-//     FIX(cos_7_16), -FIX(cos_5_16),  FIX(cos_3_16), -FIX(cos_1_16) };
-//
-// #define DCT_8_INV_ROW(x, y)
-// {
-//     int a0, a1, a2, a3, b0, b1, b2, b3;
-//
-//     a0 = x[0] * w[0]  + x[2] * w[1]  + x[4] * w[2]  + x[6] * w[3];
-//     a1 = x[0] * w[4]  + x[2] * w[5]  + x[4] * w[6]  + x[6] * w[7];
-//     a2 = x[0] * w[8]  + x[2] * w[9]  + x[4] * w[10] + x[6] * w[11];
-//     a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15];
-//     b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19];
-//     b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23];
-//     b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27];
-//     b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31];
-//
-//     y[0] = SHIFT_ROUND(a0 + b0);
-//     y[1] = SHIFT_ROUND(a1 + b1);
-//     y[2] = SHIFT_ROUND(a2 + b2);
-//     y[3] = SHIFT_ROUND(a3 + b3);
-//     y[4] = SHIFT_ROUND(a3 - b3);
-//     y[5] = SHIFT_ROUND(a2 - b2);
-//     y[6] = SHIFT_ROUND(a1 - b1);
-//     y[7] = SHIFT_ROUND(a0 - b0);
-// }
-//
-// -----------------------------------------------------------------------------
-//
-// In this implementation the outputs of the iDCT-1D are multiplied
-//     for rows 0,4 - by cos_4_16,
-//     for rows 1,7 - by cos_1_16,
-//     for rows 2,6 - by cos_2_16,
-//     for rows 3,5 - by cos_3_16
-// and are shifted to the left for better accuracy.
-//
-// For the constants used,
-//     FIX(float_const) = (short) (float_const * (1 << 15) + 0.5)
-//
-// -----------------------------------------------------------------------------
-
-// -----------------------------------------------------------------------------
-// Tables for mmx processors
-// -----------------------------------------------------------------------------
-
-// Table for rows 0,4 - constants are multiplied by cos_4_16
-DECLARE_ALIGNED(8, static const int16_t, tab_i_04_mmx)[32 * 4] = {
-     16384,  16384,  16384, -16384, // movq-> w06 w04 w02 w00
-     21407,   8867,   8867, -21407, // w07 w05 w03 w01
-     16384, -16384,  16384,  16384, // w14 w12 w10 w08
-     -8867,  21407, -21407,  -8867, // w15 w13 w11 w09
-     22725,  12873,  19266, -22725, // w22 w20 w18 w16
-     19266,   4520,  -4520, -12873, // w23 w21 w19 w17
-     12873,   4520,   4520,  19266, // w30 w28 w26 w24
-    -22725,  19266, -12873, -22725, // w31 w29 w27 w25
-// Table for rows 1,7 - constants are multiplied by cos_1_16
-     22725,  22725,  22725, -22725, // movq-> w06 w04 w02 w00
-     29692,  12299,  12299, -29692, // w07 w05 w03 w01
-     22725, -22725,  22725,  22725, // w14 w12 w10 w08
-    -12299,  29692, -29692, -12299, // w15 w13 w11 w09
-     31521,  17855,  26722, -31521, // w22 w20 w18 w16
-     26722,   6270,  -6270, -17855, // w23 w21 w19 w17
-     17855,   6270,   6270,  26722, // w30 w28 w26 w24
-    -31521,  26722, -17855, -31521, // w31 w29 w27 w25
-// Table for rows 2,6 - constants are multiplied by cos_2_16
-     21407,  21407,  21407, -21407, // movq-> w06 w04 w02 w00
-     27969,  11585,  11585, -27969, // w07 w05 w03 w01
-     21407, -21407,  21407,  21407, // w14 w12 w10 w08
-    -11585,  27969, -27969, -11585, // w15 w13 w11 w09
-     29692,  16819,  25172, -29692, // w22 w20 w18 w16
-     25172,   5906,  -5906, -16819, // w23 w21 w19 w17
-     16819,   5906,   5906,  25172, // w30 w28 w26 w24
-    -29692,  25172, -16819, -29692, // w31 w29 w27 w25
-// Table for rows 3,5 - constants are multiplied by cos_3_16
-     19266,  19266,  19266, -19266, // movq-> w06 w04 w02 w00
-     25172,  10426,  10426, -25172, // w07 w05 w03 w01
-     19266, -19266,  19266,  19266, // w14 w12 w10 w08
-    -10426,  25172, -25172, -10426, // w15 w13 w11 w09
-     26722,  15137,  22654, -26722, // w22 w20 w18 w16
-     22654,   5315,  -5315, -15137, // w23 w21 w19 w17
-     15137,   5315,   5315,  22654, // w30 w28 w26 w24
-    -26722,  22654, -15137, -26722, // w31 w29 w27 w25
-};
-// -----------------------------------------------------------------------------
-// Tables for xmm processors
-// -----------------------------------------------------------------------------
-
-// %3 for rows 0,4 - constants are multiplied by cos_4_16
-DECLARE_ALIGNED(8, static const int16_t, tab_i_04_xmm)[32 * 4] = {
-     16384,  21407,  16384,   8867, // movq-> w05 w04 w01 w00
-     16384,   8867, -16384, -21407, // w07 w06 w03 w02
-     16384,  -8867,  16384, -21407, // w13 w12 w09 w08
-    -16384,  21407,  16384,  -8867, // w15 w14 w11 w10
-     22725,  19266,  19266,  -4520, // w21 w20 w17 w16
-     12873,   4520, -22725, -12873, // w23 w22 w19 w18
-     12873, -22725,   4520, -12873, // w29 w28 w25 w24
-      4520,  19266,  19266, -22725, // w31 w30 w27 w26
-// %3 for rows 1,7 - constants are multiplied by cos_1_16
-     22725,  29692,  22725,  12299, // movq-> w05 w04 w01 w00
-     22725,  12299, -22725, -29692, // w07 w06 w03 w02
-     22725, -12299,  22725, -29692, // w13 w12 w09 w08
-    -22725,  29692,  22725, -12299, // w15 w14 w11 w10
-     31521,  26722,  26722,  -6270, // w21 w20 w17 w16
-     17855,   6270, -31521, -17855, // w23 w22 w19 w18
-     17855, -31521,   6270, -17855, // w29 w28 w25 w24
-      6270,  26722,  26722, -31521, // w31 w30 w27 w26
-// %3 for rows 2,6 - constants are multiplied by cos_2_16
-     21407,  27969,  21407,  11585, // movq-> w05 w04 w01 w00
-     21407,  11585, -21407, -27969, // w07 w06 w03 w02
-     21407, -11585,  21407, -27969, // w13 w12 w09 w08
-    -21407,  27969,  21407, -11585, // w15 w14 w11 w10
-     29692,  25172,  25172,  -5906, // w21 w20 w17 w16
-     16819,   5906, -29692, -16819, // w23 w22 w19 w18
-     16819, -29692,   5906, -16819, // w29 w28 w25 w24
-      5906,  25172,  25172, -29692, // w31 w30 w27 w26
-// %3 for rows 3,5 - constants are multiplied by cos_3_16
-     19266,  25172,  19266,  10426, // movq-> w05 w04 w01 w00
-     19266,  10426, -19266, -25172, // w07 w06 w03 w02
-     19266, -10426,  19266, -25172, // w13 w12 w09 w08
-    -19266,  25172,  19266, -10426, // w15 w14 w11 w10
-     26722,  22654,  22654,  -5315, // w21 w20 w17 w16
-     15137,   5315, -26722, -15137, // w23 w22 w19 w18
-     15137, -26722,   5315, -15137, // w29 w28 w25 w24
-      5315,  22654,  22654, -26722, // w31 w30 w27 w26
-};
-// =============================================================================
-// Helper macros for the code
-// =============================================================================
-
-// -----------------------------------------------------------------------------
-// DCT_8_INV_ROW_MMX( INP, OUT, TABLE, ROUNDER
-// -----------------------------------------------------------------------------
-
-#define DCT_8_INV_ROW_MMX(A1, A2, A3, A4)                                       \
-    "movq       "#A1", %%mm0    \n\t" /* 0 ; x3 x2 x1 x0 */                     \
-    "movq     8+"#A1", %%mm1    \n\t" /* 1 ; x7 x6 x5 x4 */                     \
-    "movq       %%mm0, %%mm2    \n\t" /* 2 ; x3 x2 x1 x0 */                     \
-    "movq       "#A3", %%mm3    \n\t" /* 3 ; w06 w04 w02 w00 */                 \
-    "punpcklwd  %%mm1, %%mm0    \n\t" /* x5 x1 x4 x0 */                         \
-    "movq       %%mm0, %%mm5    \n\t" /* 5 ; x5 x1 x4 x0 */                     \
-    "punpckldq  %%mm0, %%mm0    \n\t" /* x4 x0 x4 x0 */                         \
-    "movq     8+"#A3", %%mm4    \n\t" /* 4 ; w07 w05 w03 w01 */                 \
-    "punpckhwd  %%mm1, %%mm2    \n\t" /* 1 ; x7 x3 x6 x2 */                     \
-    "pmaddwd    %%mm0, %%mm3    \n\t" /* x4*w06+x0*w04 x4*w02+x0*w00 */         \
-    "movq       %%mm2, %%mm6    \n\t" /* 6 ; x7 x3 x6 x2 */                     \
-    "movq    32+"#A3", %%mm1    \n\t" /* 1 ; w22 w20 w18 w16 */                 \
-    "punpckldq  %%mm2, %%mm2    \n\t" /* x6 x2 x6 x2 */                         \
-    "pmaddwd    %%mm2, %%mm4    \n\t" /* x6*w07+x2*w05 x6*w03+x2*w01 */         \
-    "punpckhdq  %%mm5, %%mm5    \n\t" /* x5 x1 x5 x1 */                         \
-    "pmaddwd 16+"#A3", %%mm0    \n\t" /* x4*w14+x0*w12 x4*w10+x0*w08 */         \
-    "punpckhdq  %%mm6, %%mm6    \n\t" /* x7 x3 x7 x3 */                         \
-    "movq 40+   "#A3", %%mm7    \n\t" /* 7 ; w23 w21 w19 w17 */                 \
-    "pmaddwd    %%mm5, %%mm1    \n\t" /* x5*w22+x1*w20 x5*w18+x1*w16 */         \
-    "paddd      "#A4", %%mm3    \n\t" /* +%4 */                                 \
-    "pmaddwd    %%mm6, %%mm7    \n\t" /* x7*w23+x3*w21 x7*w19+x3*w17 */         \
-    "pmaddwd 24+"#A3", %%mm2    \n\t" /* x6*w15+x2*w13 x6*w11+x2*w09 */         \
-    "paddd      %%mm4, %%mm3    \n\t" /* 4 ; a1=sum(even1) a0=sum(even0) */     \
-    "pmaddwd 48+"#A3", %%mm5    \n\t" /* x5*w30+x1*w28 x5*w26+x1*w24 */         \
-    "movq       %%mm3, %%mm4    \n\t" /* 4 ; a1 a0 */                           \
-    "pmaddwd 56+"#A3", %%mm6    \n\t" /* x7*w31+x3*w29 x7*w27+x3*w25 */         \
-    "paddd      %%mm7, %%mm1    \n\t" /* 7 ; b1=sum(odd1) b0=sum(odd0) */       \
-    "paddd      "#A4", %%mm0    \n\t" /* +%4 */                                 \
-    "psubd      %%mm1, %%mm3    \n\t" /* a1-b1 a0-b0 */                         \
-    "psrad        $11, %%mm3    \n\t" /* y6=a1-b1 y7=a0-b0 */                   \
-    "paddd      %%mm4, %%mm1    \n\t" /* 4 ; a1+b1 a0+b0 */                     \
-    "paddd      %%mm2, %%mm0    \n\t" /* 2 ; a3=sum(even3) a2=sum(even2) */     \
-    "psrad        $11, %%mm1    \n\t" /* y1=a1+b1 y0=a0+b0 */                   \
-    "paddd      %%mm6, %%mm5    \n\t" /* 6 ; b3=sum(odd3) b2=sum(odd2) */       \
-    "movq       %%mm0, %%mm4    \n\t" /* 4 ; a3 a2 */                           \
-    "paddd      %%mm5, %%mm0    \n\t" /* a3+b3 a2+b2 */                         \
-    "psubd      %%mm5, %%mm4    \n\t" /* 5 ; a3-b3 a2-b2 */                     \
-    "psrad        $11, %%mm0    \n\t" /* y3=a3+b3 y2=a2+b2 */                   \
-    "psrad        $11, %%mm4    \n\t" /* y4=a3-b3 y5=a2-b2 */                   \
-    "packssdw   %%mm0, %%mm1    \n\t" /* 0 ; y3 y2 y1 y0 */                     \
-    "packssdw   %%mm3, %%mm4    \n\t" /* 3 ; y6 y7 y4 y5 */                     \
-    "movq       %%mm4, %%mm7    \n\t" /* 7 ; y6 y7 y4 y5 */                     \
-    "psrld        $16, %%mm4    \n\t" /* 0 y6 0 y4 */                           \
-    "pslld        $16, %%mm7    \n\t" /* y7 0 y5 0 */                           \
-    "movq       %%mm1, "#A2"    \n\t" /* 1 ; save y3 y2 y1 y0 */                \
-    "por        %%mm4, %%mm7    \n\t" /* 4 ; y7 y6 y5 y4 */                     \
-    "movq       %%mm7, 8+"#A2"  \n\t" /* 7 ; save y7 y6 y5 y4 */                \
-
-
-// -----------------------------------------------------------------------------
-// DCT_8_INV_ROW_XMM( INP, OUT, TABLE, ROUNDER
-// -----------------------------------------------------------------------------
-
-#define DCT_8_INV_ROW_XMM(A1, A2, A3, A4)                                       \
-    "movq       "#A1", %%mm0        \n\t" /* 0 ; x3 x2 x1 x0 */                 \
-    "movq     8+"#A1", %%mm1        \n\t" /* 1 ; x7 x6 x5 x4 */                 \
-    "movq       %%mm0, %%mm2        \n\t" /* 2 ; x3 x2 x1 x0 */                 \
-    "movq       "#A3", %%mm3        \n\t" /* 3 ; w05 w04 w01 w00 */             \
-    "pshufw     $0x88, %%mm0, %%mm0 \n\t" /* x2 x0 x2 x0 */                     \
-    "movq     8+"#A3", %%mm4        \n\t" /* 4 ; w07 w06 w03 w02 */             \
-    "movq       %%mm1, %%mm5        \n\t" /* 5 ; x7 x6 x5 x4 */                 \
-    "pmaddwd    %%mm0, %%mm3        \n\t" /* x2*w05+x0*w04 x2*w01+x0*w00 */     \
-    "movq    32+"#A3", %%mm6        \n\t" /* 6 ; w21 w20 w17 w16 */             \
-    "pshufw     $0x88, %%mm1, %%mm1 \n\t" /* x6 x4 x6 x4 */                     \
-    "pmaddwd    %%mm1, %%mm4        \n\t" /* x6*w07+x4*w06 x6*w03+x4*w02 */     \
-    "movq    40+"#A3", %%mm7        \n\t" /* 7; w23 w22 w19 w18 */              \
-    "pshufw     $0xdd, %%mm2, %%mm2 \n\t" /* x3 x1 x3 x1 */                     \
-    "pmaddwd    %%mm2, %%mm6        \n\t" /* x3*w21+x1*w20 x3*w17+x1*w16 */     \
-    "pshufw     $0xdd, %%mm5, %%mm5 \n\t" /* x7 x5 x7 x5 */                     \
-    "pmaddwd    %%mm5, %%mm7        \n\t" /* x7*w23+x5*w22 x7*w19+x5*w18 */     \
-    "paddd      "#A4", %%mm3        \n\t" /* +%4 */                             \
-    "pmaddwd 16+"#A3", %%mm0        \n\t" /* x2*w13+x0*w12 x2*w09+x0*w08 */     \
-    "paddd      %%mm4, %%mm3        \n\t" /* 4 ; a1=sum(even1) a0=sum(even0) */ \
-    "pmaddwd 24+"#A3", %%mm1        \n\t" /* x6*w15+x4*w14 x6*w11+x4*w10 */     \
-    "movq       %%mm3, %%mm4        \n\t" /* 4 ; a1 a0 */                       \
-    "pmaddwd 48+"#A3", %%mm2        \n\t" /* x3*w29+x1*w28 x3*w25+x1*w24 */     \
-    "paddd      %%mm7, %%mm6        \n\t" /* 7 ; b1=sum(odd1) b0=sum(odd0) */   \
-    "pmaddwd 56+"#A3", %%mm5        \n\t" /* x7*w31+x5*w30 x7*w27+x5*w26 */     \
-    "paddd      %%mm6, %%mm3        \n\t" /* a1+b1 a0+b0 */                     \
-    "paddd      "#A4", %%mm0        \n\t" /* +%4 */                             \
-    "psrad        $11, %%mm3        \n\t" /* y1=a1+b1 y0=a0+b0 */               \
-    "paddd      %%mm1, %%mm0        \n\t" /* 1 ; a3=sum(even3) a2=sum(even2) */ \
-    "psubd      %%mm6, %%mm4        \n\t" /* 6 ; a1-b1 a0-b0 */                 \
-    "movq       %%mm0, %%mm7        \n\t" /* 7 ; a3 a2 */                       \
-    "paddd      %%mm5, %%mm2        \n\t" /* 5 ; b3=sum(odd3) b2=sum(odd2) */   \
-    "paddd      %%mm2, %%mm0        \n\t" /* a3+b3 a2+b2 */                     \
-    "psrad        $11, %%mm4        \n\t" /* y6=a1-b1 y7=a0-b0 */               \
-    "psubd      %%mm2, %%mm7        \n\t" /* 2 ; a3-b3 a2-b2 */                 \
-    "psrad        $11, %%mm0        \n\t" /* y3=a3+b3 y2=a2+b2 */               \
-    "psrad        $11, %%mm7        \n\t" /* y4=a3-b3 y5=a2-b2 */               \
-    "packssdw   %%mm0, %%mm3        \n\t" /* 0 ; y3 y2 y1 y0 */                 \
-    "packssdw   %%mm4, %%mm7        \n\t" /* 4 ; y6 y7 y4 y5 */                 \
-    "movq       %%mm3, "#A2"        \n\t" /* 3 ; save y3 y2 y1 y0 */            \
-    "pshufw     $0xb1, %%mm7, %%mm7 \n\t" /* y7 y6 y5 y4 */                     \
-    "movq       %%mm7, 8+"#A2"      \n\t" /* 7 ; save y7 y6 y5 y4 */            \
-
-
-// -----------------------------------------------------------------------------
-//
-// The first stage DCT 8x8 - forward DCTs of columns
-//
-// The %2puts are multiplied
-// for rows 0,4 - on cos_4_16,
-// for rows 1,7 - on cos_1_16,
-// for rows 2,6 - on cos_2_16,
-// for rows 3,5 - on cos_3_16
-// and are shifted to the left for rise of accuracy
-//
-// -----------------------------------------------------------------------------
-//
-// The 8-point scaled forward DCT algorithm (26a8m)
-//
-// -----------------------------------------------------------------------------
-//
-//#define DCT_8_FRW_COL(x, y)
-// {
-//     short t0, t1, t2, t3, t4, t5, t6, t7;
-//     short tp03, tm03, tp12, tm12, tp65, tm65;
-//     short tp465, tm465, tp765, tm765;
-//
-//     t0 = LEFT_SHIFT(x[0] + x[7]);
-//     t1 = LEFT_SHIFT(x[1] + x[6]);
-//     t2 = LEFT_SHIFT(x[2] + x[5]);
-//     t3 = LEFT_SHIFT(x[3] + x[4]);
-//     t4 = LEFT_SHIFT(x[3] - x[4]);
-//     t5 = LEFT_SHIFT(x[2] - x[5]);
-//     t6 = LEFT_SHIFT(x[1] - x[6]);
-//     t7 = LEFT_SHIFT(x[0] - x[7]);
-//
-//     tp03 = t0 + t3;
-//     tm03 = t0 - t3;
-//     tp12 = t1 + t2;
-//     tm12 = t1 - t2;
-//
-//     y[0] = tp03 + tp12;
-//     y[4] = tp03 - tp12;
-//
-//     y[2] = tm03 + tm12 * tg_2_16;
-//     y[6] = tm03 * tg_2_16 - tm12;
-//
-//     tp65 = (t6 + t5) * cos_4_16;
-//     tm65 = (t6 - t5) * cos_4_16;
-//
-//     tp765 = t7 + tp65;
-//     tm765 = t7 - tp65;
-//     tp465 = t4 + tm65;
-//     tm465 = t4 - tm65;
-//
-//     y[1] = tp765 + tp465 * tg_1_16;
-//     y[7] = tp765 * tg_1_16 - tp465;
-//     y[5] = tm765 * tg_3_16 + tm465;
-//     y[3] = tm765 - tm465 * tg_3_16;
-// }
-//
-// -----------------------------------------------------------------------------
-
-// -----------------------------------------------------------------------------
-// DCT_8_INV_COL_4  INP,OUT
-// -----------------------------------------------------------------------------
-
-#define DCT_8_INV_COL(A1, A2)                                                   \
-    "movq    2*8(%3), %%mm0         \n\t"                                       \
-    "movq 16*3+"#A1", %%mm3         \n\t"                                       \
-    "movq      %%mm0, %%mm1         \n\t" /* tg_3_16 */                         \
-    "movq 16*5+"#A1", %%mm5         \n\t"                                       \
-    "pmulhw    %%mm3, %%mm0         \n\t" /* x3*(tg_3_16-1) */                  \
-    "movq       (%3), %%mm4         \n\t"                                       \
-    "pmulhw    %%mm5, %%mm1         \n\t" /* x5*(tg_3_16-1) */                  \
-    "movq 16*7+"#A1", %%mm7         \n\t"                                       \
-    "movq      %%mm4, %%mm2         \n\t" /* tg_1_16 */                         \
-    "movq 16*1+"#A1", %%mm6         \n\t"                                       \
-    "pmulhw    %%mm7, %%mm4         \n\t" /* x7*tg_1_16 */                      \
-    "paddsw    %%mm3, %%mm0         \n\t" /* x3*tg_3_16 */                      \
-    "pmulhw    %%mm6, %%mm2         \n\t" /* x1*tg_1_16 */                      \
-    "paddsw    %%mm3, %%mm1         \n\t" /* x3+x5*(tg_3_16-1) */               \
-    "psubsw    %%mm5, %%mm0         \n\t" /* x3*tg_3_16-x5 = tm35 */            \
-    "movq    3*8(%3), %%mm3         \n\t"                                       \
-    "paddsw    %%mm5, %%mm1         \n\t" /* x3+x5*tg_3_16 = tp35 */            \
-    "paddsw    %%mm6, %%mm4         \n\t" /* x1+tg_1_16*x7 = tp17 */            \
-    "psubsw    %%mm7, %%mm2         \n\t" /* x1*tg_1_16-x7 = tm17 */            \
-    "movq      %%mm4, %%mm5         \n\t" /* tp17 */                            \
-    "movq      %%mm2, %%mm6         \n\t" /* tm17 */                            \
-    "paddsw    %%mm1, %%mm5         \n\t" /* tp17+tp35 = b0 */                  \
-    "psubsw    %%mm0, %%mm6         \n\t" /* tm17-tm35 = b3 */                  \
-    "psubsw    %%mm1, %%mm4         \n\t" /* tp17-tp35 = t1 */                  \
-    "paddsw    %%mm0, %%mm2         \n\t" /* tm17+tm35 = t2 */                  \
-    "movq    1*8(%3), %%mm7         \n\t"                                       \
-    "movq      %%mm4, %%mm1         \n\t" /* t1 */                              \
-    "movq      %%mm5, 3*16+"#A2"    \n\t" /* save b0 */                         \
-    "paddsw    %%mm2, %%mm1         \n\t" /* t1+t2 */                           \
-    "movq      %%mm6, 5*16+"#A2"    \n\t" /* save b3 */                         \
-    "psubsw    %%mm2, %%mm4         \n\t" /* t1-t2 */                           \
-    "movq 2*16+"#A1", %%mm5         \n\t"                                       \
-    "movq      %%mm7, %%mm0         \n\t" /* tg_2_16 */                         \
-    "movq 6*16+"#A1", %%mm6         \n\t"                                       \
-    "pmulhw    %%mm5, %%mm0         \n\t" /* x2*tg_2_16 */                      \
-    "pmulhw    %%mm6, %%mm7         \n\t" /* x6*tg_2_16 */                      \
-    "pmulhw    %%mm3, %%mm1         \n\t" /* ocos_4_16*(t1+t2) = b1/2 */        \
-    "movq 0*16+"#A1", %%mm2         \n\t"                                       \
-    "pmulhw    %%mm3, %%mm4         \n\t" /* ocos_4_16*(t1-t2) = b2/2 */        \
-    "psubsw    %%mm6, %%mm0         \n\t" /* t2*tg_2_16-x6 = tm26 */            \
-    "movq      %%mm2, %%mm3         \n\t" /* x0 */                              \
-    "movq 4*16+"#A1", %%mm6         \n\t"                                       \
-    "paddsw    %%mm5, %%mm7         \n\t" /* x2+x6*tg_2_16 = tp26 */            \
-    "paddsw    %%mm6, %%mm2         \n\t" /* x0+x4 = tp04 */                    \
-    "psubsw    %%mm6, %%mm3         \n\t" /* x0-x4 = tm04 */                    \
-    "movq      %%mm2, %%mm5         \n\t" /* tp04 */                            \
-    "movq      %%mm3, %%mm6         \n\t" /* tm04 */                            \
-    "psubsw    %%mm7, %%mm2         \n\t" /* tp04-tp26 = a3 */                  \
-    "paddsw    %%mm0, %%mm3         \n\t" /* tm04+tm26 = a1 */                  \
-    "paddsw    %%mm1, %%mm1         \n\t" /* b1 */                              \
-    "paddsw    %%mm4, %%mm4         \n\t" /* b2 */                              \
-    "paddsw    %%mm7, %%mm5         \n\t" /* tp04+tp26 = a0 */                  \
-    "psubsw    %%mm0, %%mm6         \n\t" /* tm04-tm26 = a2 */                  \
-    "movq      %%mm3, %%mm7         \n\t" /* a1 */                              \
-    "movq      %%mm6, %%mm0         \n\t" /* a2 */                              \
-    "paddsw    %%mm1, %%mm3         \n\t" /* a1+b1 */                           \
-    "paddsw    %%mm4, %%mm6         \n\t" /* a2+b2 */                           \
-    "psraw        $6, %%mm3         \n\t" /* dst1 */                            \
-    "psubsw    %%mm1, %%mm7         \n\t" /* a1-b1 */                           \
-    "psraw        $6, %%mm6         \n\t" /* dst2 */                            \
-    "psubsw    %%mm4, %%mm0         \n\t" /* a2-b2 */                           \
-    "movq 3*16+"#A2", %%mm1         \n\t" /* load b0 */                         \
-    "psraw        $6, %%mm7         \n\t" /* dst6 */                            \
-    "movq      %%mm5, %%mm4         \n\t" /* a0 */                              \
-    "psraw        $6, %%mm0         \n\t" /* dst5 */                            \
-    "movq      %%mm3, 1*16+"#A2"    \n\t"                                       \
-    "paddsw    %%mm1, %%mm5         \n\t" /* a0+b0 */                           \
-    "movq      %%mm6, 2*16+"#A2"    \n\t"                                       \
-    "psubsw    %%mm1, %%mm4         \n\t" /* a0-b0 */                           \
-    "movq 5*16+"#A2", %%mm3         \n\t" /* load b3 */                         \
-    "psraw        $6, %%mm5         \n\t" /* dst0 */                            \
-    "movq      %%mm2, %%mm6         \n\t" /* a3 */                              \
-    "psraw        $6, %%mm4         \n\t" /* dst7 */                            \
-    "movq      %%mm0, 5*16+"#A2"    \n\t"                                       \
-    "paddsw    %%mm3, %%mm2         \n\t" /* a3+b3 */                           \
-    "movq      %%mm7, 6*16+"#A2"    \n\t"                                       \
-    "psubsw    %%mm3, %%mm6         \n\t" /* a3-b3 */                           \
-    "movq      %%mm5, 0*16+"#A2"    \n\t"                                       \
-    "psraw        $6, %%mm2         \n\t" /* dst3 */                            \
-    "movq      %%mm4, 7*16+"#A2"    \n\t"                                       \
-    "psraw        $6, %%mm6         \n\t" /* dst4 */                            \
-    "movq      %%mm2, 3*16+"#A2"    \n\t"                                       \
-    "movq      %%mm6, 4*16+"#A2"    \n\t"                                       \
-
-// =============================================================================
-// Code
-// =============================================================================
-
-// -----------------------------------------------------------------------------
-// void idct_mmx(uint16_t block[64]);
-// -----------------------------------------------------------------------------
-
-void ff_xvid_idct_mmx(short *block)
-{
-    __asm__ volatile (
-        // # Process each row
-        DCT_8_INV_ROW_MMX(0 * 16(%0), 0 * 16(%0), 64 * 0(%2), 8 * 0(%1))
-        DCT_8_INV_ROW_MMX(1 * 16(%0), 1 * 16(%0), 64 * 1(%2), 8 * 1(%1))
-        DCT_8_INV_ROW_MMX(2 * 16(%0), 2 * 16(%0), 64 * 2(%2), 8 * 2(%1))
-        DCT_8_INV_ROW_MMX(3 * 16(%0), 3 * 16(%0), 64 * 3(%2), 8 * 3(%1))
-        DCT_8_INV_ROW_MMX(4 * 16(%0), 4 * 16(%0), 64 * 0(%2), 8 * 4(%1))
-        DCT_8_INV_ROW_MMX(5 * 16(%0), 5 * 16(%0), 64 * 3(%2), 8 * 5(%1))
-        DCT_8_INV_ROW_MMX(6 * 16(%0), 6 * 16(%0), 64 * 2(%2), 8 * 6(%1))
-        DCT_8_INV_ROW_MMX(7 * 16(%0), 7 * 16(%0), 64 * 1(%2), 8 * 7(%1))
-
-        // # Process the columns (4 at a time)
-        DCT_8_INV_COL(0(%0), 0(%0))
-        DCT_8_INV_COL(8(%0), 8(%0))
-        :: "r" (block), "r" (rounder_0), "r" (tab_i_04_mmx), "r" (tg_1_16));
-}
-
-void ff_xvid_idct_mmx_put(uint8_t *dest, int line_size, int16_t *block)
-{
-    ff_xvid_idct_mmx(block);
-    ff_put_pixels_clamped_mmx(block, dest, line_size);
-}
-
-void ff_xvid_idct_mmx_add(uint8_t *dest, int line_size, int16_t *block)
-{
-    ff_xvid_idct_mmx(block);
-    ff_add_pixels_clamped_mmx(block, dest, line_size);
-}
-
-#endif /* HAVE_MMX_INLINE */
-
-#if HAVE_MMXEXT_INLINE
-
-// -----------------------------------------------------------------------------
-// void idct_xmm(uint16_t block[64]);
-// -----------------------------------------------------------------------------
-
-void ff_xvid_idct_mmxext(short *block)
-{
-    __asm__ volatile (
-        // # Process each row
-        DCT_8_INV_ROW_XMM(0 * 16(%0), 0 * 16(%0), 64 * 0(%2), 8 * 0(%1))
-        DCT_8_INV_ROW_XMM(1 * 16(%0), 1 * 16(%0), 64 * 1(%2), 8 * 1(%1))
-        DCT_8_INV_ROW_XMM(2 * 16(%0), 2 * 16(%0), 64 * 2(%2), 8 * 2(%1))
-        DCT_8_INV_ROW_XMM(3 * 16(%0), 3 * 16(%0), 64 * 3(%2), 8 * 3(%1))
-        DCT_8_INV_ROW_XMM(4 * 16(%0), 4 * 16(%0), 64 * 0(%2), 8 * 4(%1))
-        DCT_8_INV_ROW_XMM(5 * 16(%0), 5 * 16(%0), 64 * 3(%2), 8 * 5(%1))
-        DCT_8_INV_ROW_XMM(6 * 16(%0), 6 * 16(%0), 64 * 2(%2), 8 * 6(%1))
-        DCT_8_INV_ROW_XMM(7 * 16(%0), 7 * 16(%0), 64 * 1(%2), 8 * 7(%1))
-
-        // # Process the columns (4 at a time)
-        DCT_8_INV_COL(0(%0), 0(%0))
-        DCT_8_INV_COL(8(%0), 8(%0))
-        :: "r" (block), "r" (rounder_0), "r" (tab_i_04_xmm), "r" (tg_1_16));
-}
-
-void ff_xvid_idct_mmxext_put(uint8_t *dest, int line_size, int16_t *block)
-{
-    ff_xvid_idct_mmxext(block);
-    ff_put_pixels_clamped_mmx(block, dest, line_size);
-}
-
-void ff_xvid_idct_mmxext_add(uint8_t *dest, int line_size, int16_t *block)
-{
-    ff_xvid_idct_mmxext(block);
-    ff_add_pixels_clamped_mmx(block, dest, line_size);
-}
-
-#endif /* HAVE_MMXEXT_INLINE */
diff --git a/libavcodec/x86/xvididct_sse2.c b/libavcodec/x86/xvididct_sse2.c
deleted file mode 100644
index d4f0169..0000000
--- a/libavcodec/x86/xvididct_sse2.c
+++ /dev/null
@@ -1,405 +0,0 @@
-/*
- * XVID MPEG-4 VIDEO CODEC
- * - SSE2 inverse discrete cosine transform -
- *
- * Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
- *
- * Conversion to gcc syntax with modifications
- * by Alexander Strange <astrange@ithinksw.com>
- *
- * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
- *
- * This file is part of Libav.
- *
- * Vertical pass is an implementation of the scheme:
- *  Loeffler C., Ligtenberg A., and Moschytz C.S.:
- *  Practical Fast 1D DCT Algorithm with Eleven Multiplications,
- *  Proc. ICASSP 1989, 988-991.
- *
- * Horizontal pass is a double 4x4 vector/matrix multiplication,
- * (see also Intel's Application Note 922:
- *  http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
- *  Copyright (C) 1999 Intel Corporation)
- *
- * More details at http://skal.planet-d.net/coding/dct.html
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with Libav; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/internal.h"
-#include "libavutil/mem.h"
-#include "libavutil/x86/asm.h"
-
-#include "idctdsp.h"
-#include "xvididct.h"
-
-#if HAVE_SSE2_INLINE
-
-/**
- * @file
- * @brief SSE2 IDCT compatible with the Xvid IDCT
- */
-
-#define X8(x) x, x, x, x, x, x, x, x
-
-DECLARE_ASM_CONST(16, int16_t, tan1)[]  = { X8(13036) }; // tan( pi/16)
-DECLARE_ASM_CONST(16, int16_t, tan2)[]  = { X8(27146) }; // tan(2pi/16) = sqrt(2)-1
-DECLARE_ASM_CONST(16, int16_t, tan3)[]  = { X8(43790) }; // tan(3pi/16)-1
-DECLARE_ASM_CONST(16, int16_t, sqrt2)[] = { X8(23170) }; // 0.5/sqrt(2)
-DECLARE_ASM_CONST(8,  uint8_t, m127)[]  = { X8(127) };
-
-DECLARE_ASM_CONST(16, int16_t, iTab1)[] = {
-    0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d,
-    0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61,
-    0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7,
-    0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
-};
-
-DECLARE_ASM_CONST(16, int16_t, iTab2)[] = {
-    0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5,
-    0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04,
-    0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41,
-    0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
-};
-
-DECLARE_ASM_CONST(16, int16_t, iTab3)[] = {
-    0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf,
-    0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf,
-    0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d,
-    0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
-};
-
-DECLARE_ASM_CONST(16, int16_t, iTab4)[] = {
-    0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746,
-    0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac,
-    0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df,
-    0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
-};
-
-DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = {
-    65536, 65536, 65536, 65536,
-     3597,  3597,  3597,  3597,
-     2260,  2260,  2260,  2260,
-     1203,  1203,  1203,  1203,
-      120,   120,   120,   120,
-      512,   512,   512,   512
-};
-
-// Temporary storage before the column pass
-#define ROW1 "%%xmm6"
-#define ROW3 "%%xmm4"
-#define ROW5 "%%xmm5"
-#define ROW7 "%%xmm7"
-
-#define CLEAR_ODD(r) "pxor  "r","r" \n\t"
-#define PUT_ODD(dst) "pshufhw  $0x1B, %%xmm2, "dst"   \n\t"
-
-#if ARCH_X86_64
-
-# define ROW0 "%%xmm8"
-# define REG0 ROW0
-# define ROW2 "%%xmm9"
-# define REG2 ROW2
-# define ROW4 "%%xmm10"
-# define REG4 ROW4
-# define ROW6 "%%xmm11"
-# define REG6 ROW6
-# define CLEAR_EVEN(r) CLEAR_ODD(r)
-# define PUT_EVEN(dst) PUT_ODD(dst)
-# define XMMS "%%xmm12"
-# define MOV_32_ONLY "#"
-# define SREG2 REG2
-# define TAN3 "%%xmm13"
-# define TAN1 "%%xmm14"
-
-#else
-
-# define ROW0 "(%0)"
-# define REG0 "%%xmm4"
-# define ROW2 "2*16(%0)"
-# define REG2 "%%xmm4"
-# define ROW4 "4*16(%0)"
-# define REG4 "%%xmm6"
-# define ROW6 "6*16(%0)"
-# define REG6 "%%xmm6"
-# define CLEAR_EVEN(r)
-# define PUT_EVEN(dst) \
-    "pshufhw  $0x1B, %%xmm2, %%xmm2   \n\t" \
-    "movdqa          %%xmm2, "dst"    \n\t"
-# define XMMS "%%xmm2"
-# define MOV_32_ONLY "movdqa "
-# define SREG2 "%%xmm7"
-# define TAN3 "%%xmm0"
-# define TAN1 "%%xmm2"
-
-#endif
-
-#define ROUND(x) "paddd   "MANGLE(x)
-
-#define JZ(reg, to)                         \
-    "testl     "reg","reg"            \n\t" \
-    "jz        "to"                   \n\t"
-
-#define JNZ(reg, to)                        \
-    "testl     "reg","reg"            \n\t" \
-    "jnz       "to"                   \n\t"
-
-#define TEST_ONE_ROW(src, reg, clear)       \
-    clear                                   \
-    "movq     "src", %%mm1            \n\t" \
-    "por    8+"src", %%mm1            \n\t" \
-    "paddusb  %%mm0, %%mm1            \n\t" \
-    "pmovmskb %%mm1, "reg"            \n\t"
-
-#define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \
-    clear1                                                    \
-    clear2                                                    \
-    "movq     "row1", %%mm1           \n\t"                   \
-    "por    8+"row1", %%mm1           \n\t"                   \
-    "movq     "row2", %%mm2           \n\t"                   \
-    "por    8+"row2", %%mm2           \n\t"                   \
-    "paddusb   %%mm0, %%mm1           \n\t"                   \
-    "paddusb   %%mm0, %%mm2           \n\t"                   \
-    "pmovmskb  %%mm1, "reg1"          \n\t"                   \
-    "pmovmskb  %%mm2, "reg2"          \n\t"
-
-/// IDCT pass on rows.
-#define iMTX_MULT(src, table, rounder, put)            \
-    "movdqa        "src", %%xmm3      \n\t"            \
-    "movdqa       %%xmm3, %%xmm0      \n\t"            \
-    "pshufd   $0x11, %%xmm3, %%xmm1   \n\t" /* 4602 */ \
-    "punpcklqdq   %%xmm0, %%xmm0      \n\t" /* 0246 */ \
-    "pmaddwd     "table", %%xmm0      \n\t"            \
-    "pmaddwd  16+"table", %%xmm1      \n\t"            \
-    "pshufd   $0xBB, %%xmm3, %%xmm2   \n\t" /* 5713 */ \
-    "punpckhqdq   %%xmm3, %%xmm3      \n\t" /* 1357 */ \
-    "pmaddwd  32+"table", %%xmm2      \n\t"            \
-    "pmaddwd  48+"table", %%xmm3      \n\t"            \
-    "paddd        %%xmm1, %%xmm0      \n\t"            \
-    "paddd        %%xmm3, %%xmm2      \n\t"            \
-    rounder",     %%xmm0              \n\t"            \
-    "movdqa       %%xmm2, %%xmm3      \n\t"            \
-    "paddd        %%xmm0, %%xmm2      \n\t"            \
-    "psubd        %%xmm3, %%xmm0      \n\t"            \
-    "psrad           $11, %%xmm2      \n\t"            \
-    "psrad           $11, %%xmm0      \n\t"            \
-    "packssdw     %%xmm0, %%xmm2      \n\t"            \
-    put                                                \
-    "1:                               \n\t"
-
-#define iLLM_HEAD                           \
-    "movdqa   "MANGLE(tan3)", "TAN3"  \n\t" \
-    "movdqa   "MANGLE(tan1)", "TAN1"  \n\t" \
-
-/// IDCT pass on columns.
-#define iLLM_PASS(dct)                      \
-    "movdqa   "TAN3", %%xmm1          \n\t" \
-    "movdqa   "TAN1", %%xmm3          \n\t" \
-    "pmulhw   %%xmm4, "TAN3"          \n\t" \
-    "pmulhw   %%xmm5, %%xmm1          \n\t" \
-    "paddsw   %%xmm4, "TAN3"          \n\t" \
-    "paddsw   %%xmm5, %%xmm1          \n\t" \
-    "psubsw   %%xmm5, "TAN3"          \n\t" \
-    "paddsw   %%xmm4, %%xmm1          \n\t" \
-    "pmulhw   %%xmm7, %%xmm3          \n\t" \
-    "pmulhw   %%xmm6, "TAN1"          \n\t" \
-    "paddsw   %%xmm6, %%xmm3          \n\t" \
-    "psubsw   %%xmm7, "TAN1"          \n\t" \
-    "movdqa   %%xmm3, %%xmm7          \n\t" \
-    "movdqa   "TAN1", %%xmm6          \n\t" \
-    "psubsw   %%xmm1, %%xmm3          \n\t" \
-    "psubsw   "TAN3", "TAN1"          \n\t" \
-    "paddsw   %%xmm7, %%xmm1          \n\t" \
-    "paddsw   %%xmm6, "TAN3"          \n\t" \
-    "movdqa   %%xmm3, %%xmm6          \n\t" \
-    "psubsw   "TAN3", %%xmm3          \n\t" \
-    "paddsw   %%xmm6, "TAN3"          \n\t" \
-    "movdqa   "MANGLE(sqrt2)", %%xmm4 \n\t" \
-    "pmulhw   %%xmm4, %%xmm3          \n\t" \
-    "pmulhw   %%xmm4, "TAN3"          \n\t" \
-    "paddsw   "TAN3", "TAN3"          \n\t" \
-    "paddsw   %%xmm3, %%xmm3          \n\t" \
-    "movdqa   "MANGLE(tan2)", %%xmm7  \n\t" \
-    MOV_32_ONLY ROW2", "REG2"         \n\t" \
-    MOV_32_ONLY ROW6", "REG6"         \n\t" \
-    "movdqa   %%xmm7, %%xmm5          \n\t" \
-    "pmulhw   "REG6", %%xmm7          \n\t" \
-    "pmulhw   "REG2", %%xmm5          \n\t" \
-    "paddsw   "REG2", %%xmm7          \n\t" \
-    "psubsw   "REG6", %%xmm5          \n\t" \
-    MOV_32_ONLY ROW0", "REG0"         \n\t" \
-    MOV_32_ONLY ROW4", "REG4"         \n\t" \
-    MOV_32_ONLY"  "TAN1", (%0)        \n\t" \
-    "movdqa   "REG0", "XMMS"          \n\t" \
-    "psubsw   "REG4", "REG0"          \n\t" \
-    "paddsw   "XMMS", "REG4"          \n\t" \
-    "movdqa   "REG4", "XMMS"          \n\t" \
-    "psubsw   %%xmm7, "REG4"          \n\t" \
-    "paddsw   "XMMS", %%xmm7          \n\t" \
-    "movdqa   "REG0", "XMMS"          \n\t" \
-    "psubsw   %%xmm5, "REG0"          \n\t" \
-    "paddsw   "XMMS", %%xmm5          \n\t" \
-    "movdqa   %%xmm5, "XMMS"          \n\t" \
-    "psubsw   "TAN3", %%xmm5          \n\t" \
-    "paddsw   "XMMS", "TAN3"          \n\t" \
-    "movdqa   "REG0", "XMMS"          \n\t" \
-    "psubsw   %%xmm3, "REG0"          \n\t" \
-    "paddsw   "XMMS", %%xmm3          \n\t" \
-    MOV_32_ONLY"  (%0), "TAN1"        \n\t" \
-    "psraw        $6, %%xmm5          \n\t" \
-    "psraw        $6, "REG0"          \n\t" \
-    "psraw        $6, "TAN3"          \n\t" \
-    "psraw        $6, %%xmm3          \n\t" \
-    "movdqa   "TAN3", 1*16("dct")     \n\t" \
-    "movdqa   %%xmm3, 2*16("dct")     \n\t" \
-    "movdqa   "REG0", 5*16("dct")     \n\t" \
-    "movdqa   %%xmm5, 6*16("dct")     \n\t" \
-    "movdqa   %%xmm7, %%xmm0          \n\t" \
-    "movdqa   "REG4", %%xmm4          \n\t" \
-    "psubsw   %%xmm1, %%xmm7          \n\t" \
-    "psubsw   "TAN1", "REG4"          \n\t" \
-    "paddsw   %%xmm0, %%xmm1          \n\t" \
-    "paddsw   %%xmm4, "TAN1"          \n\t" \
-    "psraw        $6, %%xmm1          \n\t" \
-    "psraw        $6, %%xmm7          \n\t" \
-    "psraw        $6, "TAN1"          \n\t" \
-    "psraw        $6, "REG4"          \n\t" \
-    "movdqa   %%xmm1, ("dct")         \n\t" \
-    "movdqa   "TAN1", 3*16("dct")     \n\t" \
-    "movdqa   "REG4", 4*16("dct")     \n\t" \
-    "movdqa   %%xmm7, 7*16("dct")     \n\t"
-
-/// IDCT pass on columns, assuming rows 4-7 are zero.
-#define iLLM_PASS_SPARSE(dct)               \
-    "pmulhw   %%xmm4, "TAN3"          \n\t" \
-    "paddsw   %%xmm4, "TAN3"          \n\t" \
-    "movdqa   %%xmm6, %%xmm3          \n\t" \
-    "pmulhw   %%xmm6, "TAN1"          \n\t" \
-    "movdqa   %%xmm4, %%xmm1          \n\t" \
-    "psubsw   %%xmm1, %%xmm3          \n\t" \
-    "paddsw   %%xmm6, %%xmm1          \n\t" \
-    "movdqa   "TAN1", %%xmm6          \n\t" \
-    "psubsw   "TAN3", "TAN1"          \n\t" \
-    "paddsw   %%xmm6, "TAN3"          \n\t" \
-    "movdqa   %%xmm3, %%xmm6          \n\t" \
-    "psubsw   "TAN3", %%xmm3          \n\t" \
-    "paddsw   %%xmm6, "TAN3"          \n\t" \
-    "movdqa   "MANGLE(sqrt2)", %%xmm4 \n\t" \
-    "pmulhw   %%xmm4, %%xmm3          \n\t" \
-    "pmulhw   %%xmm4, "TAN3"          \n\t" \
-    "paddsw   "TAN3", "TAN3"          \n\t" \
-    "paddsw   %%xmm3, %%xmm3          \n\t" \
-    "movdqa   "MANGLE(tan2)", %%xmm5  \n\t" \
-    MOV_32_ONLY ROW2", "SREG2"        \n\t" \
-    "pmulhw   "SREG2", %%xmm5         \n\t" \
-    MOV_32_ONLY ROW0", "REG0"         \n\t" \
-    "movdqa   "REG0", %%xmm6          \n\t" \
-    "psubsw   "SREG2", %%xmm6         \n\t" \
-    "paddsw   "REG0", "SREG2"         \n\t" \
-    MOV_32_ONLY"  "TAN1", (%0)        \n\t" \
-    "movdqa   "REG0", "XMMS"          \n\t" \
-    "psubsw   %%xmm5, "REG0"          \n\t" \
-    "paddsw   "XMMS", %%xmm5          \n\t" \
-    "movdqa   %%xmm5, "XMMS"          \n\t" \
-    "psubsw   "TAN3", %%xmm5          \n\t" \
-    "paddsw   "XMMS", "TAN3"          \n\t" \
-    "movdqa   "REG0", "XMMS"          \n\t" \
-    "psubsw   %%xmm3, "REG0"          \n\t" \
-    "paddsw   "XMMS", %%xmm3          \n\t" \
-    MOV_32_ONLY"  (%0), "TAN1"        \n\t" \
-    "psraw        $6, %%xmm5          \n\t" \
-    "psraw        $6, "REG0"          \n\t" \
-    "psraw        $6, "TAN3"          \n\t" \
-    "psraw        $6, %%xmm3          \n\t" \
-    "movdqa   "TAN3", 1*16("dct")     \n\t" \
-    "movdqa   %%xmm3, 2*16("dct")     \n\t" \
-    "movdqa   "REG0", 5*16("dct")     \n\t" \
-    "movdqa   %%xmm5, 6*16("dct")     \n\t" \
-    "movdqa   "SREG2", %%xmm0         \n\t" \
-    "movdqa   %%xmm6, %%xmm4          \n\t" \
-    "psubsw   %%xmm1, "SREG2"         \n\t" \
-    "psubsw   "TAN1", %%xmm6          \n\t" \
-    "paddsw   %%xmm0, %%xmm1          \n\t" \
-    "paddsw   %%xmm4, "TAN1"          \n\t" \
-    "psraw        $6, %%xmm1          \n\t" \
-    "psraw        $6, "SREG2"         \n\t" \
-    "psraw        $6, "TAN1"          \n\t" \
-    "psraw        $6, %%xmm6          \n\t" \
-    "movdqa   %%xmm1, ("dct")         \n\t" \
-    "movdqa   "TAN1", 3*16("dct")     \n\t" \
-    "movdqa   %%xmm6, 4*16("dct")     \n\t" \
-    "movdqa   "SREG2", 7*16("dct")    \n\t"
-
-inline void ff_xvid_idct_sse2(short *block)
-{
-    __asm__ volatile (
-        "movq     "MANGLE (m127) ", %%mm0                              \n\t"
-        iMTX_MULT("(%0)",     MANGLE(iTab1), ROUND(walkenIdctRounders),          PUT_EVEN(ROW0))
-        iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders + 1 * 16), PUT_ODD(ROW1))
-        iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders + 2 * 16), PUT_EVEN(ROW2))
-
-        TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
-        JZ("%%eax", "1f")
-        iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders + 3 * 16), PUT_ODD(ROW3))
-
-        TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
-        TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
-        iLLM_HEAD
-        ".p2align 4 \n\t"
-        JNZ("%%ecx", "2f")
-        JNZ("%%eax", "3f")
-        JNZ("%%edx", "4f")
-        JNZ("%%esi", "5f")
-        iLLM_PASS_SPARSE("%0")
-        "jmp 6f                                                      \n\t"
-        "2:                                                          \n\t"
-        iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4))
-        "3:                                                          \n\t"
-        iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders + 4 * 16), PUT_ODD(ROW5))
-        JZ("%%edx", "1f")
-        "4:                                                          \n\t"
-        iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders + 5 * 16), PUT_EVEN(ROW6))
-        JZ("%%esi", "1f")
-        "5:                                                          \n\t"
-        iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders + 5 * 16), PUT_ODD(ROW7))
-#if ARCH_X86_32
-        iLLM_HEAD
-#endif
-        iLLM_PASS("%0")
-        "6:                                                          \n\t"
-        : "+r" (block)
-        :
-        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",
-                       "%xmm4", "%xmm5", "%xmm6", "%xmm7", )
-#if ARCH_X86_64
-          XMM_CLOBBERS("%xmm8", "%xmm9", "%xmm10", "%xmm11",
-                       "%xmm12", "%xmm13", "%xmm14", )
-#endif
-          "%eax", "%ecx", "%edx", "%esi", "memory");
-}
-
-void ff_xvid_idct_sse2_put(uint8_t *dest, int line_size, short *block)
-{
-    ff_xvid_idct_sse2(block);
-    ff_put_pixels_clamped_mmx(block, dest, line_size);
-}
-
-void ff_xvid_idct_sse2_add(uint8_t *dest, int line_size, short *block)
-{
-    ff_xvid_idct_sse2(block);
-    ff_add_pixels_clamped_mmx(block, dest, line_size);
-}
-
-#endif /* HAVE_SSE2_INLINE */
diff --git a/libavcodec/xan.c b/libavcodec/xan.c
index a46b58c..9609f83 100644
--- a/libavcodec/xan.c
+++ b/libavcodec/xan.c
@@ -2,20 +2,20 @@
  * Wing Commander/Xan Video Decoder
  * Copyright (C) 2003 The FFmpeg project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -54,13 +54,13 @@ typedef struct XanContext {
     AVCodecContext *avctx;
     AVFrame *last_frame;
 
-    const unsigned char *buf;
+    const uint8_t *buf;
     int size;
 
     /* scratch space */
-    unsigned char *buffer1;
+    uint8_t *buffer1;
     int buffer1_size;
-    unsigned char *buffer2;
+    uint8_t *buffer2;
     int buffer2_size;
 
     unsigned *palettes;
@@ -113,22 +113,21 @@ static av_cold int xan_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
-static int xan_huffman_decode(unsigned char *dest, int dest_len,
-                              const unsigned char *src, int src_len)
+static int xan_huffman_decode(uint8_t *dest, int dest_len,
+                              const uint8_t *src, int src_len)
 {
-    unsigned char byte = *src++;
-    unsigned char ival = byte + 0x16;
-    const unsigned char * ptr = src + byte*2;
+    uint8_t byte = *src++;
+    uint8_t ival = byte + 0x16;
+    const uint8_t * ptr = src + byte*2;
     int ptr_len = src_len - 1 - byte*2;
-    unsigned char val = ival;
-    unsigned char *dest_end = dest + dest_len;
-    unsigned char *dest_start = dest;
+    uint8_t val = ival;
+    uint8_t *dest_end = dest + dest_len;
+    uint8_t *dest_start = dest;
+    int ret;
     GetBitContext gb;
 
-    if (ptr_len < 0)
-        return AVERROR_INVALIDDATA;
-
-    init_get_bits(&gb, ptr, ptr_len * 8);
+    if ((ret = init_get_bits8(&gb, ptr, ptr_len)) < 0)
+        return ret;
 
     while (val != 0x16) {
         unsigned idx = val - 0x17 + get_bits1(&gb) * byte;
@@ -152,13 +151,13 @@ static int xan_huffman_decode(unsigned char *dest, int dest_len,
  *
  * @param dest destination buffer of dest_len, must be padded with at least 130 bytes
  */
-static void xan_unpack(unsigned char *dest, int dest_len,
-                       const unsigned char *src, int src_len)
+static void xan_unpack(uint8_t *dest, int dest_len,
+                       const uint8_t *src, int src_len)
 {
-    unsigned char opcode;
+    uint8_t opcode;
     int size;
-    unsigned char *dest_org = dest;
-    unsigned char *dest_end = dest + dest_len;
+    uint8_t *dest_org = dest;
+    uint8_t *dest_end = dest + dest_len;
     GetByteContext ctx;
 
     bytestream2_init(&ctx, src, src_len);
@@ -207,14 +206,14 @@ static void xan_unpack(unsigned char *dest, int dest_len,
 }
 
 static inline void xan_wc3_output_pixel_run(XanContext *s, AVFrame *frame,
-    const unsigned char *pixel_buffer, int x, int y, int pixel_count)
+    const uint8_t *pixel_buffer, int x, int y, int pixel_count)
 {
     int stride;
     int line_inc;
     int index;
     int current_x;
     int width = s->avctx->width;
-    unsigned char *palette_plane;
+    uint8_t *palette_plane;
 
     palette_plane = frame->data[0];
     stride = frame->linesize[0];
@@ -246,7 +245,7 @@ static inline void xan_wc3_copy_pixel_run(XanContext *s, AVFrame *frame,
     int curframe_index, prevframe_index;
     int curframe_x, prevframe_x;
     int width = s->avctx->width;
-    unsigned char *palette_plane, *prev_palette_plane;
+    uint8_t *palette_plane, *prev_palette_plane;
 
     if (y + motion_y < 0 || y + motion_y >= s->avctx->height ||
         x + motion_x < 0 || x + motion_x >= s->avctx->width)
@@ -262,6 +261,12 @@ static inline void xan_wc3_copy_pixel_run(XanContext *s, AVFrame *frame,
     curframe_x = x;
     prevframe_index = (y + motion_y) * stride + x + motion_x;
     prevframe_x = x + motion_x;
+
+    if (prev_palette_plane == palette_plane && FFABS(curframe_index - prevframe_index) < pixel_count) {
+         avpriv_request_sample(s->avctx, "Overlapping copy");
+         return ;
+    }
+
     while (pixel_count &&
            curframe_index  < s->frame_size &&
            prevframe_index < s->frame_size) {
@@ -294,22 +299,22 @@ static int xan_wc3_decode_frame(XanContext *s, AVFrame *frame)
     int width  = s->avctx->width;
     int height = s->avctx->height;
     int total_pixels = width * height;
-    unsigned char opcode;
-    unsigned char flag = 0;
+    uint8_t opcode;
+    uint8_t flag = 0;
     int size = 0;
     int motion_x, motion_y;
     int x, y, ret;
 
-    unsigned char *opcode_buffer = s->buffer1;
-    unsigned char *opcode_buffer_end = s->buffer1 + s->buffer1_size;
+    uint8_t *opcode_buffer = s->buffer1;
+    uint8_t *opcode_buffer_end = s->buffer1 + s->buffer1_size;
     int opcode_buffer_size = s->buffer1_size;
-    const unsigned char *imagedata_buffer = s->buffer2;
+    const uint8_t *imagedata_buffer = s->buffer2;
 
     /* pointers to segments inside the compressed chunk */
-    const unsigned char *huffman_segment;
+    const uint8_t *huffman_segment;
     GetByteContext       size_segment;
     GetByteContext       vector_segment;
-    const unsigned char *imagedata_segment;
+    const uint8_t *imagedata_segment;
     int huffman_offset, size_offset, vector_offset, imagedata_offset,
         imagedata_size;
 
@@ -382,16 +387,28 @@ static int xan_wc3_decode_frame(XanContext *s, AVFrame *frame)
 
         case 9:
         case 19:
+            if (bytestream2_get_bytes_left(&size_segment) < 1) {
+                av_log(s->avctx, AV_LOG_ERROR, "size_segment overread\n");
+                return AVERROR_INVALIDDATA;
+            }
             size = bytestream2_get_byte(&size_segment);
             break;
 
         case 10:
         case 20:
+            if (bytestream2_get_bytes_left(&size_segment) < 2) {
+                av_log(s->avctx, AV_LOG_ERROR, "size_segment overread\n");
+                return AVERROR_INVALIDDATA;
+            }
             size = bytestream2_get_be16(&size_segment);
             break;
 
         case 11:
         case 21:
+            if (bytestream2_get_bytes_left(&size_segment) < 3) {
+                av_log(s->avctx, AV_LOG_ERROR, "size_segment overread\n");
+                return AVERROR_INVALIDDATA;
+            }
             size = bytestream2_get_be24(&size_segment);
             break;
         }
@@ -413,8 +430,13 @@ static int xan_wc3_decode_frame(XanContext *s, AVFrame *frame)
                 imagedata_size -= size;
             }
         } else {
+            uint8_t vector;
+            if (bytestream2_get_bytes_left(&vector_segment) <= 0) {
+                av_log(s->avctx, AV_LOG_ERROR, "vector_segment overread\n");
+                return AVERROR_INVALIDDATA;
+            }
             /* run-based motion compensation from last frame */
-            uint8_t vector = bytestream2_get_byte(&vector_segment);
+            vector = bytestream2_get_byte(&vector_segment);
             motion_x = sign_extend(vector >> 4,  4);
             motion_y = sign_extend(vector & 0xF, 4);
 
@@ -534,6 +556,10 @@ static int xan_decode_frame(AVCodecContext *avctx,
         int i;
         tag  = bytestream2_get_le32(&ctx);
         size = bytestream2_get_be32(&ctx);
+        if (size < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Invalid tag size %d\n", size);
+            return AVERROR_INVALIDDATA;
+        }
         size = FFMIN(size, bytestream2_get_bytes_left(&ctx));
         switch (tag) {
         case PALT_TAG:
@@ -541,8 +567,8 @@ static int xan_decode_frame(AVCodecContext *avctx,
                 return AVERROR_INVALIDDATA;
             if (s->palettes_count >= PALETTES_MAX)
                 return AVERROR_INVALIDDATA;
-            tmpptr = av_realloc(s->palettes,
-                                (s->palettes_count + 1) * AVPALETTE_SIZE);
+            tmpptr = av_realloc_array(s->palettes,
+                                      s->palettes_count + 1, AVPALETTE_SIZE);
             if (!tmpptr)
                 return AVERROR(ENOMEM);
             s->palettes = tmpptr;
@@ -557,7 +583,7 @@ static int xan_decode_frame(AVCodecContext *avctx,
                 int g = gamma_lookup[bytestream2_get_byteu(&ctx)];
                 int b = gamma_lookup[bytestream2_get_byteu(&ctx)];
 #endif
-                *tmpptr++ = (r << 16) | (g << 8) | b;
+                *tmpptr++ = (0xFFU << 24) | (r << 16) | (g << 8) | b;
             }
             s->palettes_count++;
             break;
@@ -584,10 +610,8 @@ static int xan_decode_frame(AVCodecContext *avctx,
         return AVERROR_INVALIDDATA;
     }
 
-    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF))) {
-        av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
         return ret;
-    }
 
     if (!s->frame_size)
         s->frame_size = frame->linesize[0] * s->avctx->height;
diff --git a/libavcodec/xbmdec.c b/libavcodec/xbmdec.c
index 2ce1465..d19bdae 100644
--- a/libavcodec/xbmdec.c
+++ b/libavcodec/xbmdec.c
@@ -1,20 +1,22 @@
 /*
  * XBM image format
  *
- * This file is part of Libav.
+ * Copyright (c) 2012 Paul B Mahol
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,43 +26,54 @@
 #include "internal.h"
 #include "mathops.h"
 
+static int convert(uint8_t x)
+{
+    if (x >= 'a')
+        x -= 87;
+    else if (x >= 'A')
+        x -= 55;
+    else
+        x -= '0';
+    return x;
+}
+
+static int parse_str_int(const uint8_t *p, int len, const uint8_t *key)
+{
+    const uint8_t *end = p + len;
+
+    for(; p<end - strlen(key); p++) {
+        if (!memcmp(p, key, strlen(key)))
+            break;
+    }
+    p += strlen(key);
+    if (p >= end)
+        return INT_MIN;
+
+    for(; p<end; p++) {
+        char *eptr;
+        int64_t ret = strtol(p, &eptr, 10);
+        if ((const uint8_t *)eptr != p)
+            return ret;
+    }
+    return INT_MIN;
+}
+
 static int xbm_decode_frame(AVCodecContext *avctx, void *data,
                             int *got_frame, AVPacket *avpkt)
 {
     AVFrame *p = data;
-    int ret, linesize, i;
+    int ret, linesize, i, j;
     int width  = 0;
     int height = 0;
-    const uint8_t *ptr = avpkt->data;
+    const uint8_t *end, *ptr = avpkt->data;
+    const uint8_t *next;
     uint8_t *dst;
 
     avctx->pix_fmt = AV_PIX_FMT_MONOWHITE;
-    while (!width || !height) {
-        ptr += strcspn(ptr, "#");
-        if (ptr >= avpkt->data + avpkt->size) {
-            av_log(avctx, AV_LOG_ERROR, "End of file reached.\n");
-            return AVERROR_INVALIDDATA;
-        }
-        if (strncmp(ptr, "#define", 7) != 0) {
-            av_log(avctx, AV_LOG_ERROR,
-                   "Unexpected preprocessor directive.\n");
-            return AVERROR_INVALIDDATA;
-        }
-        // skip the name
-        ptr += strcspn(ptr, "_") + 1;
-        // get width or height
-        if (strncmp(ptr, "width", 5) == 0) {
-            ptr += strcspn(ptr, " ");
-            width = strtol(ptr, NULL, 10);
-        } else if (strncmp(ptr, "height", 6) == 0) {
-            ptr += strcspn(ptr, " ");
-            height = strtol(ptr, NULL, 10);
-        } else {
-            // skip offset and unknown variables
-            av_log(avctx, AV_LOG_VERBOSE,
-                   "Ignoring preprocessor directive.\n");
-        }
-    }
+    end = avpkt->data + avpkt->size;
+
+    width  = parse_str_int(avpkt->data, avpkt->size, "_width");
+    height = parse_str_int(avpkt->data, avpkt->size, "_height");
 
     if ((ret = ff_set_dimensions(avctx, width, height)) < 0)
         return ret;
@@ -68,46 +81,48 @@ static int xbm_decode_frame(AVCodecContext *avctx, void *data,
     if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
 
-    // go to start of image data
-    ptr += strcspn(ptr, "{");
+    // goto start of image data
+    next = memchr(ptr, '{', avpkt->size);
+    if (!next)
+        next = memchr(ptr, '(', avpkt->size);
+    if (!next)
+        return AVERROR_INVALIDDATA;
+    ptr = next + 1;
 
     linesize = (avctx->width + 7) / 8;
     for (i = 0; i < avctx->height; i++) {
-        int eol = 0, e = 0;
         dst = p->data[0] + i * p->linesize[0];
-        if (ptr >= avpkt->data + avpkt->size) {
-            av_log(avctx, AV_LOG_ERROR, "End of file reached.\n");
-            return AVERROR_INVALIDDATA;
-        }
-        do {
-            int val;
-            uint8_t *endptr;
+        for (j = 0; j < linesize; j++) {
+            uint8_t val;
 
-            ptr += strcspn(ptr, "x") - 1; // -1 to get 0x
-            val = strtol(ptr, (char **)&endptr, 16);
+            while (ptr < end && *ptr != 'x' && *ptr != '$')
+                ptr++;
 
-            if (endptr - ptr == 4) {
-                // XBM X11 format
+            ptr ++;
+            if (ptr < end && av_isxdigit(*ptr)) {
+                val = convert(*ptr++);
+                if (av_isxdigit(*ptr))
+                    val = (val << 4) + convert(*ptr++);
                 *dst++ = ff_reverse[val];
-                eol = linesize;
-            } else if (endptr - ptr == 6) {
-                // XBM X10 format
-                *dst++ = ff_reverse[val >> 8];
-                *dst++ = ff_reverse[val & 0xFF];
-                eol = linesize / 2; // 2 bytes read
+                if (av_isxdigit(*ptr) && j+1 < linesize) {
+                    j++;
+                    val = convert(*ptr++);
+                    if (av_isxdigit(*ptr))
+                        val = (val << 4) + convert(*ptr++);
+                    *dst++ = ff_reverse[val];
+                }
             } else {
                 av_log(avctx, AV_LOG_ERROR,
                        "Unexpected data at %.8s.\n", ptr);
                 return AVERROR_INVALIDDATA;
             }
-            ptr = endptr;
-        } while (++e < eol);
+        }
     }
 
     p->key_frame = 1;
     p->pict_type = AV_PICTURE_TYPE_I;
 
-    *got_frame = 1;
+    *got_frame       = 1;
 
     return avpkt->size;
 }
diff --git a/libavcodec/xbmenc.c b/libavcodec/xbmenc.c
index 4840050..b25615f 100644
--- a/libavcodec/xbmenc.c
+++ b/libavcodec/xbmenc.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,17 +24,6 @@
 #include "internal.h"
 #include "mathops.h"
 
-static av_cold int xbm_encode_init(AVCodecContext *avctx)
-{
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
-
-    return 0;
-}
-
 static int xbm_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                             const AVFrame *p, int *got_packet)
 {
@@ -43,10 +32,8 @@ static int xbm_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 
     linesize = (avctx->width + 7) / 8;
     size     = avctx->height * (linesize * 7 + 2) + 110;
-    if ((ret = ff_alloc_packet(pkt, size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, size, 0)) < 0)
         return ret;
-    }
 
     buf = pkt->data;
     ptr = p->data[0];
@@ -73,8 +60,7 @@ AVCodec ff_xbm_encoder = {
     .long_name    = NULL_IF_CONFIG_SMALL("XBM (X BitMap) image"),
     .type         = AVMEDIA_TYPE_VIDEO,
     .id           = AV_CODEC_ID_XBM,
-    .init         = xbm_encode_init,
     .encode2      = xbm_encode_frame,
     .pix_fmts     = (const enum AVPixelFormat[]) { AV_PIX_FMT_MONOWHITE,
-                                                 AV_PIX_FMT_NONE },
+                                                   AV_PIX_FMT_NONE },
 };
diff --git a/libavcodec/xface.c b/libavcodec/xface.c
new file mode 100644
index 0000000..8c0cbfd
--- /dev/null
+++ b/libavcodec/xface.c
@@ -0,0 +1,386 @@
+/*
+ * Copyright (c) 1990 James Ashton - Sydney University
+ * Copyright (c) 2012 Stefano Sabatini
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * X-Face common data and utilities definition.
+ */
+
+#include "libavutil/avassert.h"
+
+#include "xface.h"
+
+/* Add the word a to b in place; b grows by one word if the carry leaves the top word. */
+void ff_big_add(BigInt *b, uint8_t a)
+{
+    int idx = 0;
+    uint16_t carry = a & XFACE_WORDMASK;
+
+    /* adding zero changes nothing */
+    if (!carry)
+        return;
+
+    /* ripple the carry upwards through the words already in use */
+    while (idx < b->nb_words && carry) {
+        carry        += b->words[idx];
+        b->words[idx] = carry & XFACE_WORDMASK;
+        carry       >>= XFACE_BITSPERWORD;
+        idx++;
+    }
+    if (carry && idx == b->nb_words) {
+        av_assert0(b->nb_words < XFACE_MAX_WORDS);
+        b->words[b->nb_words++] = carry & XFACE_WORDMASK;
+    }
+}
+
+/* Divide b by the word a in place, storing the remainder in *r.
+ * A zero divisor stands for WORDCARRY (the word base). */
+void ff_big_div(BigInt *b, uint8_t a, uint8_t *r)
+{
+    int idx;
+    uint16_t rem = 0, quot;
+
+    a &= XFACE_WORDMASK;
+
+    /* dividing by one, or an empty b, leaves b untouched and yields remainder 0 */
+    if (a == 1 || b->nb_words == 0) {
+        *r = 0;
+        return;
+    }
+
+    /* treat this as a == WORDCARRY and just shift everything right a WORD */
+    if (a == 0) {
+        *r = b->words[0];
+        b->nb_words--;
+        for (idx = 0; idx < b->nb_words; idx++)
+            b->words[idx] = b->words[idx + 1];
+        b->words[idx] = 0;
+        return;
+    }
+
+    /* schoolbook long division, most significant word first */
+    for (idx = b->nb_words - 1; idx >= 0; idx--) {
+        rem  = (rem << XFACE_BITSPERWORD) + b->words[idx];
+        quot = rem / (uint16_t)a;
+        rem  = rem % (uint16_t)a;
+        b->words[idx] = quot & XFACE_WORDMASK;
+    }
+    *r = rem;
+
+    /* drop the top word if the division emptied it */
+    if (b->words[b->nb_words - 1] == 0)
+        b->nb_words--;
+}
+
+/* Multiply b by the word a in place.
+ * A zero factor stands for WORDCARRY (the word base). */
+void ff_big_mul(BigInt *b, uint8_t a)
+{
+    int idx;
+    uint16_t carry = 0;
+
+    a &= XFACE_WORDMASK;
+
+    /* multiplying by one, or an empty b, changes nothing */
+    if (a == 1 || b->nb_words == 0)
+        return;
+
+    if (a == 0) {
+        /* treat this as a == WORDCARRY and just shift everything left a WORD */
+        av_assert0(b->nb_words < XFACE_MAX_WORDS);
+        for (idx = b->nb_words++; idx > 0; idx--)
+            b->words[idx] = b->words[idx - 1];
+        b->words[0] = 0;
+        return;
+    }
+
+    /* multiply word by word, least significant first, carrying upwards */
+    for (idx = 0; idx < b->nb_words; idx++) {
+        carry        += (uint16_t)b->words[idx] * (uint16_t)a;
+        b->words[idx] = carry & XFACE_WORDMASK;
+        carry       >>= XFACE_BITSPERWORD;
+    }
+
+    /* a leftover carry becomes a new top word */
+    if (carry) {
+        av_assert0(b->nb_words < XFACE_MAX_WORDS);
+        b->words[b->nb_words++] = carry & XFACE_WORDMASK;
+    }
+}
+
+const ProbRange ff_xface_probranges_per_level[4][3] = { /* indexed as [level][colour] */
+    //  black      grey       white      (each entry is {range, offset})
+    { {  1, 255}, {251, 0}, {  4, 251} }, /* Top of tree almost always grey */
+    { {  1, 255}, {200, 0}, { 55, 200} },
+    { { 33, 223}, {159, 0}, { 64, 159} },
+    { {131,   0}, {  0, 0}, {125, 131} }, /* Grey disallowed at bottom */
+};
+
+const ProbRange ff_xface_probranges_2x2[16] = { /* {range, offset}, indexed by the pixel pattern of a 2x2 cell */
+    { 0,   0},  {38,   0}, {38,  38},  {13, 152},
+    {38,  76},  {13, 165}, {13, 178},  { 6, 230},
+    {38, 114},  {13, 191}, {13, 204},  { 6, 236},
+    {13, 217},  { 6, 242}, { 5, 248},  { 3, 253},
+};
+
+/*
+ * The "guess the next pixel" tables follow. Normally there are 12
+ * neighbour pixels, giving 1<<12 cases; as we get closer to the
+ * upper left corner, fewer neighbours are available.
+ *
+ * Each byte in the tables represents 8 boolean values starting from
+ * the most significant bit.
+ */
+
+static const uint8_t g_00[] = {
+    0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0xe3, 0xdf, 0x05, 0x17,
+    0x05, 0x0f, 0x00, 0x1b, 0x0f, 0xdf, 0x00, 0x04, 0x00, 0x00,
+    0x0d, 0x0f, 0x03, 0x7f, 0x00, 0x00, 0x00, 0x01, 0x00, 0x1d,
+    0x45, 0x2f, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x0a, 0xff, 0xff,
+    0x00, 0x04, 0x00, 0x05, 0x01, 0x3f, 0xcf, 0xff, 0x10, 0x01,
+    0x80, 0xc9, 0x0f, 0x0f, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
+    0x1b, 0x1f, 0xff, 0xff, 0x4f, 0x54, 0x07, 0x1f, 0x57, 0x47,
+    0xd7, 0x3d, 0xff, 0xff, 0x5f, 0x1f, 0x7f, 0xff, 0x7f, 0x7f,
+    0x05, 0x0f, 0x01, 0x0f, 0x0f, 0x5f, 0x9b, 0xdf, 0x7f, 0xff,
+    0x5f, 0x1d, 0x5f, 0xff, 0x0f, 0x1f, 0x0f, 0x5f, 0x03, 0x1f,
+    0x4f, 0x5f, 0xf7, 0x7f, 0x7f, 0xff, 0x0d, 0x0f, 0xfb, 0xff,
+    0xf7, 0xbf, 0x0f, 0x4f, 0xd7, 0x3f, 0x4f, 0x7f, 0xff, 0xff,
+    0x67, 0xbf, 0x56, 0x25, 0x1f, 0x7f, 0x9f, 0xff, 0x00, 0x00,
+    0x00, 0x05, 0x5f, 0x7f, 0x01, 0xdf, 0x14, 0x00, 0x05, 0x0f,
+    0x07, 0xa2, 0x09, 0x0f, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x5f,
+    0x18, 0xd7, 0x94, 0x71, 0x00, 0x05, 0x1f, 0xb7, 0x0c, 0x07,
+    0x0f, 0x0f, 0x00, 0x0f, 0x0f, 0x1f, 0x84, 0x8f, 0x05, 0x15,
+    0x05, 0x0f, 0x4f, 0xff, 0x87, 0xdf, 0x05, 0x01, 0x10, 0x00,
+    0x0f, 0x0f, 0x00, 0x08, 0x05, 0x04, 0x04, 0x01, 0x4f, 0xff,
+    0x9f, 0x8f, 0x4a, 0x40, 0x5f, 0x5f, 0xff, 0xfe, 0xdf, 0xff,
+    0x7f, 0xf7, 0xff, 0x7f, 0xff, 0xff, 0x7b, 0xff, 0x0f, 0xfd,
+    0xd7, 0x5f, 0x4f, 0x7f, 0x7f, 0xdf, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0x77, 0xdf, 0x7f, 0x4f, 0xef, 0xff, 0xff, 0x77, 0xff,
+    0xff, 0xff, 0x6f, 0xff, 0x0f, 0x4f, 0xff, 0xff, 0x9d, 0xff,
+    0x0f, 0xef, 0xff, 0xdf, 0x6f, 0xff, 0xff, 0xff, 0x4f, 0xff,
+    0xcd, 0x0f, 0x4f, 0xff, 0xff, 0xdf, 0x00, 0x00, 0x00, 0x0b,
+    0x05, 0x02, 0x02, 0x0f, 0x04, 0x00, 0x00, 0x0c, 0x01, 0x06,
+    0x00, 0x0f, 0x20, 0x03, 0x00, 0x00, 0x05, 0x0f, 0x40, 0x08,
+    0x00, 0x00, 0x00, 0x01, 0x00, 0x01, 0x0c, 0x0f, 0x01, 0x00,
+    0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x14, 0x01, 0x05,
+    0x01, 0x15, 0xaf, 0x0f, 0x00, 0x01, 0x10, 0x00, 0x08, 0x00,
+    0x46, 0x0c, 0x20, 0x00, 0x88, 0x00, 0x0f, 0x15, 0xff, 0xdf,
+    0x02, 0x00, 0x00, 0x0f, 0x7f, 0x5f, 0xdb, 0xff, 0x4f, 0x3e,
+    0x05, 0x0f, 0x7f, 0xf7, 0x95, 0x4f, 0x0d, 0x0f, 0x01, 0x0f,
+    0x4f, 0x5f, 0x9f, 0xdf, 0x25, 0x0e, 0x0d, 0x0d, 0x4f, 0x7f,
+    0x8f, 0x0f, 0x0f, 0xfa, 0x04, 0x4f, 0x4f, 0xff, 0xf7, 0x77,
+    0x47, 0xed, 0x05, 0x0f, 0xff, 0xff, 0xdf, 0xff, 0x4f, 0x6f,
+    0xd8, 0x5f, 0x0f, 0x7f, 0xdf, 0x5f, 0x07, 0x0f, 0x94, 0x0d,
+    0x1f, 0xff, 0xff, 0xff, 0x00, 0x02, 0x00, 0x03, 0x46, 0x57,
+    0x01, 0x0d, 0x01, 0x08, 0x01, 0x0f, 0x47, 0x6c, 0x0d, 0x0f,
+    0x02, 0x00, 0x00, 0x00, 0x0b, 0x4f, 0x00, 0x08, 0x05, 0x00,
+    0x95, 0x01, 0x0f, 0x7f, 0x0c, 0x0f, 0x01, 0x0e, 0x00, 0x00,
+    0x0f, 0x41, 0x00, 0x00, 0x04, 0x24, 0x0d, 0x0f, 0x0f, 0x7f,
+    0xcf, 0xdf, 0x00, 0x00, 0x00, 0x00, 0x04, 0x40, 0x00, 0x00,
+    0x06, 0x26, 0xcf, 0x05, 0xcf, 0x7f, 0xdf, 0xdf, 0x00, 0x00,
+    0x17, 0x5f, 0xff, 0xfd, 0xff, 0xff, 0x46, 0x09, 0x4f, 0x5f,
+    0x7f, 0xfd, 0xdf, 0xff, 0x0a, 0x88, 0xa7, 0x7f, 0x7f, 0xff,
+    0xff, 0xff, 0x0f, 0x04, 0xdf, 0x7f, 0x4f, 0xff, 0x9f, 0xff,
+    0x0e, 0xe6, 0xdf, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x0f, 0xec,
+    0x8f, 0x4f, 0x7f, 0xff, 0xdf, 0xff, 0x0f, 0xcf, 0xdf, 0xff,
+    0x6f, 0x7f, 0xff, 0xff, 0x03, 0x0c, 0x9d, 0x0f, 0x7f, 0xff,
+    0xff, 0xff,
+};
+
+static const uint8_t g_01[] = {
+    0x37, 0x73, 0x00, 0x19, 0x57, 0x7f, 0xf5, 0xfb, 0x70, 0x33,
+    0xf0, 0xf9, 0x7f, 0xff, 0xff, 0xff,
+};
+
+static const uint8_t g_02[] = {
+    0x50,
+};
+
+static const uint8_t g_10[] = {
+    0x00, 0x00, 0x00, 0x00, 0x50, 0x00, 0xf3, 0x5f, 0x84, 0x04,
+    0x17, 0x9f, 0x04, 0x23, 0x05, 0xff, 0x00, 0x00, 0x00, 0x02,
+    0x03, 0x03, 0x33, 0xd7, 0x05, 0x03, 0x5f, 0x3f, 0x17, 0x33,
+    0xff, 0xff, 0x00, 0x80, 0x02, 0x04, 0x12, 0x00, 0x11, 0x57,
+    0x05, 0x25, 0x05, 0x03, 0x35, 0xbf, 0x9f, 0xff, 0x07, 0x6f,
+    0x20, 0x40, 0x17, 0x06, 0xfa, 0xe8, 0x01, 0x07, 0x1f, 0x9f,
+    0x1f, 0xff, 0xff, 0xff,
+};
+
+static const uint8_t g_20[] = {
+    0x04, 0x00, 0x01, 0x01, 0x43, 0x2e, 0xff, 0x3f,
+};
+
+static const uint8_t g_30[] = {
+    0x11, 0x11, 0x11, 0x11, 0x51, 0x11, 0x13, 0x11, 0x11, 0x11,
+    0x13, 0x11, 0x11, 0x11, 0x33, 0x11, 0x13, 0x11, 0x13, 0x13,
+    0x13, 0x13, 0x31, 0x31, 0x11, 0x01, 0x11, 0x11, 0x71, 0x11,
+    0x11, 0x75,
+};
+
+static const uint8_t g_40[] = {
+    0x00, 0x0f, 0x00, 0x09, 0x00, 0x0d, 0x00, 0x0d, 0x00, 0x0f,
+    0x00, 0x4e, 0xe4, 0x0d, 0x10, 0x0f, 0x00, 0x0f, 0x44, 0x4f,
+    0x00, 0x1e, 0x0f, 0x0f, 0xae, 0xaf, 0x45, 0x7f, 0xef, 0xff,
+    0x0f, 0xff, 0x00, 0x09, 0x01, 0x11, 0x00, 0x01, 0x1c, 0xdd,
+    0x00, 0x15, 0x00, 0xff, 0x00, 0x10, 0x00, 0xfd, 0x00, 0x0f,
+    0x4f, 0x5f, 0x3d, 0xff, 0xff, 0xff, 0x4f, 0xff, 0x1c, 0xff,
+    0xdf, 0xff, 0x8f, 0xff, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x15,
+    0x01, 0x07, 0x00, 0x01, 0x02, 0x1f, 0x01, 0x11, 0x05, 0x7f,
+    0x00, 0x1f, 0x41, 0x57, 0x1f, 0xff, 0x05, 0x77, 0x0d, 0x5f,
+    0x4d, 0xff, 0x4f, 0xff, 0x0f, 0xff, 0x00, 0x00, 0x02, 0x05,
+    0x00, 0x11, 0x05, 0x7d, 0x10, 0x15, 0x2f, 0xff, 0x40, 0x50,
+    0x0d, 0xfd, 0x04, 0x0f, 0x07, 0x1f, 0x07, 0x7f, 0x0f, 0xbf,
+    0x0d, 0x7f, 0x0f, 0xff, 0x4d, 0x7d, 0x0f, 0xff,
+};
+
+static const uint8_t g_11[] = {
+    0x01, 0x13, 0x03, 0x7f,
+};
+
+static const uint8_t g_21[] = {
+    0x17,
+};
+
+static const uint8_t g_31[] = {
+    0x55, 0x57, 0x57, 0x7f,
+};
+
+static const uint8_t g_41[] = {
+    0x01, 0x01, 0x01, 0x1f, 0x03, 0x1f, 0x3f, 0xff,
+};
+
+static const uint8_t g_12[] = {
+    0x40,
+};
+
+static const uint8_t g_22[] = {
+    0x00,
+};
+
+static const uint8_t g_32[] = {
+    0x10,
+};
+
+static const uint8_t g_42[] = {
+    0x10,
+};
+
+void ff_xface_generate_face(uint8_t *dst, uint8_t * const src)
+{
+    int h, i, j, k, l, m;
+    /* XOR every pixel of dst with the value guessed from its previously scanned neighbours in src */
+    for (j = 0; j < XFACE_HEIGHT; j++) {
+        for (i = 0; i < XFACE_WIDTH; i++) {
+            h = i + j * XFACE_WIDTH;
+            k = 0;
+
+            /*
+               Compute k, encoding the bits *before* the current one, contained in the
+               image buffer. That is, given the grid:
+
+                l      i
+                |      |
+                v      v
+               +--+--+--+--+--+
+          m -> | 1| 2| 3| 4| 5|
+               +--+--+--+--+--+
+               | 6| 7| 8| 9|10|
+               +--+--+--+--+--+
+          j -> |11|12| *|  |  |
+               +--+--+--+--+--+
+
+               the value k for the pixel marked as "*" will contain the bit encoding of
+               the values in the matrix marked from "1" to "12". In case the pixel is
+               near the border of the grid, the number of values contained within the
+               grid will be fewer than 12.
+             */
+
+            for (l = i - 2; l <= i + 2; l++) {
+                for (m = j - 2; m <= j; m++) {
+                    if (l >= i && m == j)
+                        continue;
+                    if (l > 0 && l <= XFACE_WIDTH && m > 0)
+                        k = 2*k + src[l + m * XFACE_WIDTH];
+                }
+            }
+
+            /*
+              Use the guess for the given position and the computed value of k.
+
+              The following table shows the number of digits in k, depending on
+              the position of the pixel, and shows the corresponding guess table
+              to use:
+
+                 i=1  i=2  i=3       i=w-1 i=w
+               +----+----+----+ ... +----+----+
+           j=1 |  0 |  1 |  2 |     |  2 |  2 |
+               |g22 |g12 |g02 |     |g42 |g32 |
+               +----+----+----+ ... +----+----+
+           j=2 |  3 |  5 |  7 |     |  6 |  5 |
+               |g21 |g11 |g01 |     |g41 |g31 |
+               +----+----+----+ ... +----+----+
+           j=3 |  5 |  9 | 12 |     | 10 |  8 |
+               |g20 |g10 |g00 |     |g40 |g30 |
+               +----+----+----+ ... +----+----+
+            */
+
+#define GEN(table) dst[h] ^= (table[k>>3]>>(7-(k&7)))&1 /* XOR bit k of table (MSB first) into dst[h] */
+
+            switch (i) {
+            case 1:
+                switch (j) {
+                case 1:  GEN(g_22); break;
+                case 2:  GEN(g_21); break;
+                default: GEN(g_20); break;
+                }
+                break;
+            case 2:
+                switch (j) {
+                case 1:  GEN(g_12); break;
+                case 2:  GEN(g_11); break;
+                default: GEN(g_10); break;
+                }
+                break;
+            case XFACE_WIDTH - 1:
+                switch (j) {
+                case 1:  GEN(g_42); break;
+                case 2:  GEN(g_41); break;
+                default: GEN(g_40); break;
+                }
+                break;
+            case XFACE_WIDTH: /* NOTE(review): i stops at XFACE_WIDTH - 1, so this case looks unreachable -- confirm against compface */
+                switch (j) {
+                case 1:  GEN(g_32); break;
+                case 2:  GEN(g_31); break;
+                default: GEN(g_30); break;
+                }
+                break;
+            default:
+                switch (j) {
+                case 1:  GEN(g_02); break;
+                case 2:  GEN(g_01); break;
+                default: GEN(g_00); break;
+                }
+                break;
+            }
+        }
+    }
+}
diff --git a/libavcodec/xface.h b/libavcodec/xface.h
new file mode 100644
index 0000000..d366fdb
--- /dev/null
+++ b/libavcodec/xface.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 1990 James Ashton - Sydney University
+ * Copyright (c) 2012 Stefano Sabatini
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * X-Face common definitions.
+ */
+
+#ifndef AVCODEC_XFACE_H
+#define AVCODEC_XFACE_H
+
+#include <stdint.h>
+
+/* define the face size - 48x48x1 */
+#define XFACE_WIDTH  48
+#define XFACE_HEIGHT 48
+#define XFACE_PIXELS (XFACE_WIDTH * XFACE_HEIGHT)
+
+/* compressed output uses the full range of printable characters.
+ * In ASCII these are in a contiguous block so we just need to know
+ * the first and last. The total number of printables is needed too. */
+#define XFACE_FIRST_PRINT '!'
+#define XFACE_LAST_PRINT '~'
+#define XFACE_PRINTS (XFACE_LAST_PRINT - XFACE_FIRST_PRINT + 1)
+
+/*
+ * Image is encoded as a big integer, using characters from '~' to
+ * '!', for a total of 94 symbols. In order to express
+ * 48x48 pixels with the worst case encoding 666 symbols should
+ * be sufficient.
+ */
+#define XFACE_MAX_DIGITS 666
+
+#define XFACE_BITSPERWORD 8
+#define XFACE_WORDCARRY (1 << XFACE_BITSPERWORD)
+#define XFACE_WORDMASK (XFACE_WORDCARRY - 1)
+
+// This must be greater than or equal to log256(94^XFACE_MAX_DIGITS)
+#define XFACE_MAX_WORDS 546
+
+/* Portable, very large unsigned integer arithmetic is needed.
+ * Implementation uses arrays of WORDs. */
+typedef struct {
+    int nb_words;                   ///< number of words currently in use
+    uint8_t words[XFACE_MAX_WORDS]; ///< words of the number, least significant first
+} BigInt;
+
+/**
+ * Add a to b storing the result in b.
+ */
+void ff_big_add(BigInt *b, uint8_t a);
+
+/**
+ * Divide b by a storing the result in b and the remainder in the word
+ * pointed to by r.
+ */
+void ff_big_div(BigInt *b, uint8_t a, uint8_t *r);
+
+/**
+ * Multiply a by b storing the result in b.
+ */
+void ff_big_mul(BigInt *b, uint8_t a);
+
+/* Each face is encoded using 9 quadtrees of 16x16 each. Each level of the
+ * trees has varying probabilities of being white, grey or black.
+ * The table ff_xface_probranges_per_level is based on sampling many faces. */
+enum XFaceColor { XFACE_COLOR_BLACK = 0, XFACE_COLOR_GREY, XFACE_COLOR_WHITE };
+
+/* Data of varying probabilities are encoded by a value in the range 0 - 255.
+ * The probability of the data determines the range of possible encodings.
+ * Offset gives the first possible encoding of the range. */
+typedef struct {
+    uint8_t range;  ///< number of byte values covered by the range
+    uint8_t offset; ///< first byte value of the range
+} ProbRange;
+
+extern const ProbRange ff_xface_probranges_per_level[4][3];
+
+extern const ProbRange ff_xface_probranges_2x2[16];
+
+void ff_xface_generate_face(uint8_t *dst, uint8_t * const src);
+
+#endif /* AVCODEC_XFACE_H */
diff --git a/libavcodec/xfacedec.c b/libavcodec/xfacedec.c
new file mode 100644
index 0000000..d045cb6
--- /dev/null
+++ b/libavcodec/xfacedec.c
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 1990 James Ashton - Sydney University
+ * Copyright (c) 2012 Stefano Sabatini
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * X-Face decoder, based on libcompface, by James Ashton.
+ */
+
+#include "libavutil/pixdesc.h"
+#include "avcodec.h"
+#include "bytestream.h"
+#include "internal.h"
+#include "xface.h"
+
+/* Pop one symbol off b: return the index of the range in pranges that holds the low byte. */
+static int pop_integer(BigInt *b, const ProbRange *pranges)
+{
+    const ProbRange *pr = pranges;
+    uint8_t low;
+
+    /* extract the last byte into low, and shift right b by 8 bits */
+    ff_big_div(b, 0, &low);
+
+    /* skip ranges until low lies in [offset, offset + range) */
+    while (!(low >= pr->offset && low < pr->range + pr->offset))
+        pr++;
+
+    ff_big_mul(b, pr->range);
+    ff_big_add(b, low - pr->offset);
+    return pr - pranges;
+}
+
+/* Recursively split a w x h block into 2x2 cells and decode the pixels of each cell from b. */
+static void pop_greys(BigInt *b, char *bitmap, int w, int h)
+{
+    int half_w = w / 2, half_h = h / 2, cell;
+    if (w <= 3) { /* 2x2 cell: bit 0 of the popped pattern is the first pixel */
+        cell = pop_integer(b, ff_xface_probranges_2x2);
+        if (cell & 1) bitmap[0]               = 1;
+        if (cell & 2) bitmap[1]               = 1;
+        if (cell & 4) bitmap[XFACE_WIDTH]     = 1;
+        if (cell & 8) bitmap[XFACE_WIDTH + 1] = 1;
+        return;
+    }
+    pop_greys(b, bitmap,                                 half_w, half_h);
+    pop_greys(b, bitmap + half_w,                        half_w, half_h);
+    pop_greys(b, bitmap + XFACE_WIDTH * half_h,          half_w, half_h);
+    pop_greys(b, bitmap + XFACE_WIDTH * half_h + half_w, half_w, half_h);
+}
+
+/* Decode one block: white is left blank, black holds pixel data, grey splits into four sub-blocks. */
+static void decode_block(BigInt *b, char *bitmap, int w, int h, int level)
+{
+    int color = pop_integer(b, ff_xface_probranges_per_level[level]);
+
+    if (color == XFACE_COLOR_WHITE)
+        return;
+
+    if (color == XFACE_COLOR_BLACK) {
+        pop_greys(b, bitmap, w, h);
+        return;
+    }
+
+    /* grey: recurse into the four quadrants at the next level */
+    decode_block(b, bitmap,                                 w / 2, h / 2, level + 1);
+    decode_block(b, bitmap + w / 2,                         w / 2, h / 2, level + 1);
+    decode_block(b, bitmap + (h / 2) * XFACE_WIDTH,         w / 2, h / 2, level + 1);
+    decode_block(b, bitmap + w / 2 + (h / 2) * XFACE_WIDTH, w / 2, h / 2, level + 1);
+}
+
+typedef struct XFaceContext {
+    uint8_t bitmap[XFACE_PIXELS]; ///< image used internally for decoding, one byte per pixel (1=black, 0=white)
+} XFaceContext;
+
+/* Accept an unset size or exactly XFACE_WIDTH x XFACE_HEIGHT, then force size and pixel format. */
+static av_cold int xface_decode_init(AVCodecContext *avctx)
+{
+    const int size_given = avctx->width || avctx->height;
+    const int size_valid = avctx->width == XFACE_WIDTH && avctx->height == XFACE_HEIGHT;
+
+    if (size_given && !size_valid) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Size value %dx%d not supported, only accepts a size of %dx%d\n",
+               avctx->width, avctx->height, XFACE_WIDTH, XFACE_HEIGHT);
+        return AVERROR(EINVAL);
+    }
+    avctx->width   = XFACE_WIDTH;
+    avctx->height  = XFACE_HEIGHT;
+    avctx->pix_fmt = AV_PIX_FMT_MONOWHITE;
+    return 0;
+}
+
+/* Decode one packet of X-Face text into a MONOWHITE frame; returns avpkt->size or a negative error. */
+static int xface_decode_frame(AVCodecContext *avctx, void *data,
+                              int *got_frame, AVPacket *avpkt)
+{
+    XFaceContext *xface = avctx->priv_data;
+    int ret, i, j, k;
+    uint8_t byte;
+    BigInt b = {0};
+    char *buf;
+    int64_t c;
+    AVFrame *frame = data;
+
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+    /* parse the digits into b; check the bound first so data[avpkt->size] is never read */
+    for (i = 0, k = 0; i < avpkt->size && avpkt->data[i]; i++) {
+        c = avpkt->data[i];
+
+        /* ignore invalid digits */
+        if (c < XFACE_FIRST_PRINT || c > XFACE_LAST_PRINT)
+            continue;
+
+        if (++k > XFACE_MAX_DIGITS) {
+            av_log(avctx, AV_LOG_WARNING,
+                   "Buffer is longer than expected, truncating at byte %d\n", i);
+            break;
+        }
+        ff_big_mul(&b, XFACE_PRINTS);
+        ff_big_add(&b, c - XFACE_FIRST_PRINT);
+    }
+
+    /* decode image and put it in bitmap */
+    memset(xface->bitmap, 0, XFACE_PIXELS);
+    buf = xface->bitmap;
+    decode_block(&b, buf,                         16, 16, 0);
+    decode_block(&b, buf + 16,                    16, 16, 0);
+    decode_block(&b, buf + 32,                    16, 16, 0);
+    decode_block(&b, buf + XFACE_WIDTH * 16,      16, 16, 0);
+    decode_block(&b, buf + XFACE_WIDTH * 16 + 16, 16, 16, 0);
+    decode_block(&b, buf + XFACE_WIDTH * 16 + 32, 16, 16, 0);
+    decode_block(&b, buf + XFACE_WIDTH * 32     , 16, 16, 0);
+    decode_block(&b, buf + XFACE_WIDTH * 32 + 16, 16, 16, 0);
+    decode_block(&b, buf + XFACE_WIDTH * 32 + 32, 16, 16, 0);
+
+    ff_xface_generate_face(xface->bitmap, xface->bitmap);
+
+    /* convert image from 1=black 0=white bitmap to MONOWHITE */
+    buf = frame->data[0];
+    for (i = 0, j = 0, k = 0, byte = 0; i < XFACE_PIXELS; i++) {
+        byte += xface->bitmap[i];
+        if (k == 7) {
+            buf[j++] = byte;
+            byte = k = 0;
+        } else {
+            k++;
+            byte <<= 1;
+        }
+        if (j == XFACE_WIDTH/8) {
+            j = 0;
+            buf += frame->linesize[0];
+        }
+    }
+
+    *got_frame = 1;
+
+    return avpkt->size;
+}
+
+AVCodec ff_xface_decoder = {
+    .name           = "xface",
+    .long_name      = NULL_IF_CONFIG_SMALL("X-face image"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_XFACE,
+    .priv_data_size = sizeof(XFaceContext),
+    .init           = xface_decode_init,
+    .decode         = xface_decode_frame,
+    .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_MONOWHITE, AV_PIX_FMT_NONE },
+};
diff --git a/libavcodec/xfaceenc.c b/libavcodec/xfaceenc.c
new file mode 100644
index 0000000..bfb9fb9
--- /dev/null
+++ b/libavcodec/xfaceenc.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 1990 James Ashton - Sydney University
+ * Copyright (c) 2012 Stefano Sabatini
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * X-Face encoder, based on libcompface, by James Ashton.
+ */
+
+#include "xface.h"
+#include "avcodec.h"
+#include "internal.h"
+#include "libavutil/avassert.h"
+
+typedef struct XFaceContext {
+    AVClass *class;
+    uint8_t bitmap[XFACE_PIXELS]; ///< image used internally for encoding
+    int max_line_len;             ///< max line length for compressed data
+    int set_header;               ///< set X-Face header in the output
+} XFaceContext;
+
+/* Return 1 if every pixel of the w x h block at bitmap equals its first pixel, 0 otherwise. */
+static int all_same(char *bitmap, int w, int h)
+{
+    const char first = bitmap[0];
+    int x, y;
+
+    for (y = 0; y < h; y++) {
+        const char *row = bitmap + y * XFACE_WIDTH;
+
+        for (x = 0; x < w; x++) {
+            if (row[x] != first)
+                return 0;
+        }
+    }
+    return 1;
+}
+
+/* Return non-zero if every 2x2 cell of the w x h block contains at least one set pixel. */
+static int all_black(char *bitmap, int w, int h)
+{
+    const int hw = w / 2, hh = h / 2;
+
+    /* a 2x2 cell counts as black when at least one of its pixels is non-zero */
+    if (w <= 3)
+        return bitmap[0] || bitmap[1] ||
+               bitmap[XFACE_WIDTH] || bitmap[XFACE_WIDTH + 1];
+    return all_black(bitmap,                         hw, hh) &&
+           all_black(bitmap + hw,                    hw, hh) &&
+           all_black(bitmap + XFACE_WIDTH * hh,      hw, hh) &&
+           all_black(bitmap + XFACE_WIDTH * hh + hw, hw, hh);
+}
+
+static int all_white(char *bitmap, int w, int h)
+{
+    return !bitmap[0] && all_same(bitmap, w, h); /* first pixel is zero and all others equal it */
+}
+
+typedef struct {
+    ProbRange prob_ranges[XFACE_PIXELS*2]; ///< pushed ranges, stored in push order
+    int prob_ranges_idx;                   ///< number of entries currently stored
+} ProbRangesQueue;
+
+static inline int pq_push(ProbRangesQueue *pq, const ProbRange *p)
+{
+    const int has_room = pq->prob_ranges_idx < XFACE_PIXELS * 2 - 1; /* last slot stays unused */
+    if (has_room)
+        pq->prob_ranges[pq->prob_ranges_idx++] = *p; /* store a copy of *p */
+    return has_room ? 0 : -1;
+}
+
+/* Recursively split a w x h block into 2x2 cells and queue the range of each cell's pixel pattern. */
+static void push_greys(ProbRangesQueue *pq, char *bitmap, int w, int h)
+{
+    const int hw = w / 2, hh = h / 2;
+    if (w <= 3) {
+        /* combine the four pixels into a table index, first pixel weighted 1 */
+        const int pattern =     bitmap[0]               +
+                            2 * bitmap[1]               +
+                            4 * bitmap[XFACE_WIDTH]     +
+                            8 * bitmap[XFACE_WIDTH + 1];
+        pq_push(pq, &ff_xface_probranges_2x2[pattern]);
+        return;
+    }
+    push_greys(pq, bitmap,                         hw, hh);
+    push_greys(pq, bitmap + hw,                    hw, hh);
+    push_greys(pq, bitmap + XFACE_WIDTH * hh,      hw, hh);
+    push_greys(pq, bitmap + XFACE_WIDTH * hh + hw, hw, hh);
+}
+
+/* Queue the colour of a block; a black block also queues its pixels, a grey one recurses. */
+static void encode_block(char *bitmap, int w, int h, int level, ProbRangesQueue *pq)
+{
+    const ProbRange *ranges = ff_xface_probranges_per_level[level];
+    const int hw = w / 2, hh = h / 2, next = level + 1;
+    if (all_white(bitmap, w, h)) {
+        pq_push(pq, ranges + XFACE_COLOR_WHITE);
+    } else if (all_black(bitmap, w, h)) {
+        pq_push(pq, ranges + XFACE_COLOR_BLACK);
+        push_greys(pq, bitmap, w, h);
+    } else {
+        pq_push(pq, ranges + XFACE_COLOR_GREY);
+        encode_block(bitmap,                         hw, hh, next, pq);
+        encode_block(bitmap + hw,                    hw, hh, next, pq);
+        encode_block(bitmap + hh * XFACE_WIDTH,      hw, hh, next, pq);
+        encode_block(bitmap + hw + hh * XFACE_WIDTH, hw, hh, next, pq);
+    }
+}
+
+/* Fold one symbol into b: divide by the range, shift left one word, add remainder + offset. */
+static void push_integer(BigInt *b, const ProbRange *prange)
+{
+    uint8_t rem;
+    ff_big_div(b, prange->range, &rem);  /* b /= range, rem receives the remainder */
+    ff_big_mul(b, 0);                    /* a zero factor shifts b left by one word */
+    ff_big_add(b, rem + prange->offset);
+}
+
+static int xface_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                              const AVFrame *frame, int *got_packet)
+{
+    XFaceContext *xface = avctx->priv_data;
+    ProbRangesQueue pq = {{{ 0 }}, 0};
+    uint8_t bitmap_copy[XFACE_PIXELS];
+    BigInt b = {0};
+    int i, j, k, ret = 0;
+    const uint8_t *buf;
+    uint8_t *p;
+    char intbuf[XFACE_MAX_DIGITS];
+    /* Encode one MONOWHITE frame into X-Face text in pkt; returns 0 or a negative error code. */
+    if (avctx->width || avctx->height) {
+        if (avctx->width != XFACE_WIDTH || avctx->height != XFACE_HEIGHT) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Size value %dx%d not supported, only accepts a size of %dx%d\n",
+                   avctx->width, avctx->height, XFACE_WIDTH, XFACE_HEIGHT);
+            return AVERROR(EINVAL);
+        }
+    }
+    avctx->width  = XFACE_WIDTH;
+    avctx->height = XFACE_HEIGHT;
+
+    /* convert image from MONOWHITE to 1=black 0=white bitmap */
+    buf = frame->data[0];
+    i = j = 0;
+    do {
+        for (k = 0; k < 8; k++)
+            xface->bitmap[i++] = (buf[j]>>(7-k))&1;
+        if (++j == XFACE_WIDTH/8) {
+            buf += frame->linesize[0];
+            j = 0;
+        }
+    } while (i < XFACE_PIXELS);
+
+    /* keep an unmodified copy so ff_xface_generate_face() reads the original pixels while it modifies bitmap */
+    memcpy(bitmap_copy, xface->bitmap, XFACE_PIXELS);
+    ff_xface_generate_face(xface->bitmap, bitmap_copy);
+
+    encode_block(xface->bitmap,                         16, 16, 0, &pq);
+    encode_block(xface->bitmap + 16,                    16, 16, 0, &pq);
+    encode_block(xface->bitmap + 32,                    16, 16, 0, &pq);
+    encode_block(xface->bitmap + XFACE_WIDTH * 16,      16, 16, 0, &pq);
+    encode_block(xface->bitmap + XFACE_WIDTH * 16 + 16, 16, 16, 0, &pq);
+    encode_block(xface->bitmap + XFACE_WIDTH * 16 + 32, 16, 16, 0, &pq);
+    encode_block(xface->bitmap + XFACE_WIDTH * 32,      16, 16, 0, &pq);
+    encode_block(xface->bitmap + XFACE_WIDTH * 32 + 16, 16, 16, 0, &pq);
+    encode_block(xface->bitmap + XFACE_WIDTH * 32 + 32, 16, 16, 0, &pq);
+
+    while (pq.prob_ranges_idx > 0) /* consume the queued ranges last-pushed first */
+        push_integer(&b, &pq.prob_ranges[--pq.prob_ranges_idx]);
+
+    /* write the digits of the big integer b to intbuf, least significant digit first */
+    i = 0;
+    av_assert0(b.nb_words < XFACE_MAX_WORDS);
+    while (b.nb_words) {
+        uint8_t r;
+        ff_big_div(&b, XFACE_PRINTS, &r);
+        av_assert0(i < sizeof(intbuf));
+        intbuf[i++] = r + XFACE_FIRST_PRINT;
+    }
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, i+2, 0)) < 0)
+        return ret;
+
+    /* copy the digits into the packet in reverse order, then append a newline and a NUL */
+    p = pkt->data;
+    while (--i >= 0)
+        *(p++) = intbuf[i];
+    *(p++) = '\n';
+    *(p++) = 0;
+
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+
+    return 0;
+}
+
+AVCodec ff_xface_encoder = {
+    .name           = "xface",
+    .long_name      = NULL_IF_CONFIG_SMALL("X-face image"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_XFACE,
+    .priv_data_size = sizeof(XFaceContext),
+    .encode2        = xface_encode_frame,
+    .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_MONOWHITE, AV_PIX_FMT_NONE },
+    .capabilities   = AV_CODEC_CAP_INTRA_ONLY,
+};
diff --git a/libavcodec/xiph.c b/libavcodec/xiph.c
index 7c3c710..d072224 100644
--- a/libavcodec/xiph.c
+++ b/libavcodec/xiph.c
@@ -1,28 +1,28 @@
 /*
- * Copyright (C) 2007  Libav Project
+ * Copyright (C) 2007 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/intreadwrite.h"
 #include "xiph.h"
 
-int avpriv_split_xiph_headers(uint8_t *extradata, int extradata_size,
-                          int first_header_size, uint8_t *header_start[3],
+int avpriv_split_xiph_headers(const uint8_t *extradata, int extradata_size,
+                          int first_header_size, const uint8_t *header_start[3],
                           int header_len[3])
 {
     int i;
diff --git a/libavcodec/xiph.h b/libavcodec/xiph.h
index afaece7..1741a51 100644
--- a/libavcodec/xiph.h
+++ b/libavcodec/xiph.h
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2007  Libav Project
+ * Copyright (C) 2007 The FFmpeg Project
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -36,8 +36,8 @@
  * @param[out] header_len The sizes of each of the three headers.
  * @return On error a negative value is returned, on success zero.
  */
-int avpriv_split_xiph_headers(uint8_t *extradata, int extradata_size,
-                              int first_header_size, uint8_t *header_start[3],
+int avpriv_split_xiph_headers(const uint8_t *extradata, int extradata_size,
+                              int first_header_size, const uint8_t *header_start[3],
                               int header_len[3]);
 
 #endif /* AVCODEC_XIPH_H */
diff --git a/libavcodec/xl.c b/libavcodec/xl.c
index 7286c14..37ab46e 100644
--- a/libavcodec/xl.c
+++ b/libavcodec/xl.c
@@ -2,20 +2,20 @@
  * Miro VideoXL codec
  * Copyright (c) 2004 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -50,19 +50,16 @@ static int decode_frame(AVCodecContext *avctx,
     int y0, y1, y2, y3 = 0, c0 = 0, c1 = 0;
 
     if (avctx->width % 4) {
-        av_log(avctx, AV_LOG_ERROR, "Width not a multiple of 4.\n");
+        av_log(avctx, AV_LOG_ERROR, "width is not a multiple of 4\n");
         return AVERROR_INVALIDDATA;
     }
-
     if (buf_size < avctx->width * avctx->height) {
         av_log(avctx, AV_LOG_ERROR, "Packet is too small\n");
         return AVERROR_INVALIDDATA;
     }
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
     p->pict_type = AV_PICTURE_TYPE_I;
     p->key_frame = 1;
 
diff --git a/libavcodec/xsubdec.c b/libavcodec/xsubdec.c
index 7e25787..540607a 100644
--- a/libavcodec/xsubdec.c
+++ b/libavcodec/xsubdec.c
@@ -2,20 +2,20 @@
  * XSUB subtitle decoder
  * Copyright (c) 2007 Reimar Döffinger
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -60,11 +60,9 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size,
     AVSubtitleRect *rect;
     int j;
 
-    memset(sub, 0, sizeof(*sub));
-
     // check that at least header fits
     if (buf_size < 27 + 7 * 2 + 4 * (3 + has_alpha)) {
-        av_log(avctx, AV_LOG_ERROR, "coded frame too small\n");
+        av_log(avctx, AV_LOG_ERROR, "coded frame size %d too small\n", buf_size);
         return -1;
     }
 
@@ -99,12 +97,12 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size,
     sub->rects =  av_mallocz(sizeof(*sub->rects));
     if (!sub->rects)
         return AVERROR(ENOMEM);
+
     sub->rects[0] = av_mallocz(sizeof(*sub->rects[0]));
     if (!sub->rects[0]) {
         av_freep(&sub->rects);
         return AVERROR(ENOMEM);
     }
-    sub->num_rects = 1;
     sub->rects[0]->x = x; sub->rects[0]->y = y;
     sub->rects[0]->w = w; sub->rects[0]->h = h;
     sub->rects[0]->type = SUBTITLE_BITMAP;
@@ -119,6 +117,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size,
         av_freep(&sub->rects);
         return AVERROR(ENOMEM);
     }
+    sub->num_rects = 1;
 
     // read palette
     for (i = 0; i < sub->rects[0]->nb_colors; i++)
diff --git a/libavcodec/xsubenc.c b/libavcodec/xsubenc.c
index 5b7e135..b3da909 100644
--- a/libavcodec/xsubenc.c
+++ b/libavcodec/xsubenc.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2005 DivX, Inc.
  * Copyright (c) 2009 Bjorn Axelsson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -128,7 +128,7 @@ static int xsub_encode(AVCodecContext *avctx, unsigned char *buf,
     }
 
     // TODO: support multiple rects
-    if (h->num_rects > 1)
+    if (h->num_rects != 1)
         av_log(avctx, AV_LOG_WARNING, "Only single rects supported (%d in subtitle.)\n", h->num_rects);
 
 #if FF_API_AVPICTURE
@@ -155,7 +155,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
         av_log(avctx, AV_LOG_WARNING, "No more than 4 subtitle colors supported (%d found.)\n", h->rects[0]->nb_colors);
 
     // TODO: Palette swapping if color zero is not transparent
-    if (((uint32_t *)h->rects[0]->data[1])[0] & 0xff)
+    if (((uint32_t *)h->rects[0]->data[1])[0] & 0xff000000)
         av_log(avctx, AV_LOG_WARNING, "Color index 0 is not transparent. Transparency will be messed up.\n");
 
     if (make_tc(startTime, start_tc) || make_tc(endTime, end_tc)) {
@@ -179,8 +179,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
     bytestream_put_le16(&hdr, height);
     bytestream_put_le16(&hdr, h->rects[0]->x);
     bytestream_put_le16(&hdr, h->rects[0]->y);
-    bytestream_put_le16(&hdr, h->rects[0]->x + width);
-    bytestream_put_le16(&hdr, h->rects[0]->y + height);
+    bytestream_put_le16(&hdr, h->rects[0]->x + width -1);
+    bytestream_put_le16(&hdr, h->rects[0]->y + height -1);
 
     rlelenptr = hdr; // Will store length of first field here later.
     hdr+=2;
@@ -203,7 +203,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
                         h->rects[0]->w, h->rects[0]->h >> 1))
         return -1;
 
-    // Enforce total height to be be multiple of 2
+    // Enforce total height to be a multiple of 2
     if (h->rects[0]->h & 1) {
         put_xsub_rle(&pb, h->rects[0]->w, PADDING_COLOR);
         avpriv_align_put_bits(&pb);
@@ -219,6 +219,8 @@ static av_cold int xsub_encoder_init(AVCodecContext *avctx)
     if (!avctx->codec_tag)
         avctx->codec_tag = MKTAG('D','X','S','B');
 
+    avctx->bits_per_coded_sample = 4;
+
     return 0;
 }
 
diff --git a/libavcodec/xvididct.c b/libavcodec/xvididct.c
index ca89703..1f96ccc 100644
--- a/libavcodec/xvididct.c
+++ b/libavcodec/xvididct.c
@@ -3,20 +3,20 @@
  *
  * Copyright (C) 2006-2011 Xvid Solutions GmbH
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -334,9 +334,12 @@ av_cold void ff_xvid_idct_init(IDCTDSPContext *c, AVCodecContext *avctx)
 {
     const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
 
-    if (!high_bit_depth &&
-        (avctx->idct_algo == FF_IDCT_AUTO ||
-         avctx->idct_algo == FF_IDCT_XVID)) {
+    if (high_bit_depth || avctx->lowres ||
+        !(avctx->idct_algo == FF_IDCT_AUTO ||
+          avctx->idct_algo == FF_IDCT_XVID))
+        return;
+
+    if (avctx->idct_algo == FF_IDCT_XVID) {
         c->idct_put  = xvid_idct_put;
         c->idct_add  = xvid_idct_add;
         c->idct      = ff_xvid_idct;
@@ -345,6 +348,8 @@ av_cold void ff_xvid_idct_init(IDCTDSPContext *c, AVCodecContext *avctx)
 
     if (ARCH_X86)
         ff_xvid_idct_init_x86(c, avctx, high_bit_depth);
+    if (ARCH_MIPS)
+        ff_xvid_idct_init_mips(c, avctx, high_bit_depth);
 
     ff_init_scantable_permutation(c->idct_permutation, c->perm_type);
 }
diff --git a/libavcodec/xvididct.h b/libavcodec/xvididct.h
index 499f819..e0bc1a2 100644
--- a/libavcodec/xvididct.h
+++ b/libavcodec/xvididct.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,5 +30,7 @@ void ff_xvid_idct_init(IDCTDSPContext *c, AVCodecContext *avctx);
 
 void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
                            unsigned high_bit_depth);
+void ff_xvid_idct_init_mips(IDCTDSPContext *c, AVCodecContext *avctx,
+                            unsigned high_bit_depth);
 
 #endif /* AVCODEC_XVIDIDCT_H */
diff --git a/libavcodec/xvmc.h b/libavcodec/xvmc.h
index 950ed18..465ee78 100644
--- a/libavcodec/xvmc.h
+++ b/libavcodec/xvmc.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2003 Ivan Kalvachev
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,8 +33,6 @@
 #include "version.h"
 #include "avcodec.h"
 
-#if FF_API_XVMC
-
 /**
  * @defgroup lavc_codec_hwaccel_xvmc XvMC
  * @ingroup lavc_codec_hwaccel
@@ -45,7 +43,7 @@
 #define AV_XVMC_ID                    0x1DC711C0  /**< special value to ensure that regular pixel routines haven't corrupted the struct
                                                        the number is 1337 speak for the letters IDCT MCo (motion compensation) */
 
-attribute_deprecated struct xvmc_pix_fmt {
+struct attribute_deprecated xvmc_pix_fmt {
     /** The field contains the special constant value AV_XVMC_ID.
         It is used as a test that the application correctly uses the API,
         and that there is no corruption caused by pixel routines.
@@ -169,6 +167,4 @@ attribute_deprecated struct xvmc_pix_fmt {
  * @}
  */
 
-#endif /* FF_API_XVMC */
-
 #endif /* AVCODEC_XVMC_H */
diff --git a/libavcodec/xvmc_internal.h b/libavcodec/xvmc_internal.h
index 9018e4a..d365ef0 100644
--- a/libavcodec/xvmc_internal.h
+++ b/libavcodec/xvmc_internal.h
@@ -1,20 +1,20 @@
 /*
  * XVideo Motion Compensation internal functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,14 +25,7 @@
 #include "mpegvideo.h"
 #include "version.h"
 
-#if FF_API_XVMC
-
 void ff_xvmc_init_block(MpegEncContext *s);
 void ff_xvmc_pack_pblocks(MpegEncContext *s, int cbp);
-int  ff_xvmc_field_start(MpegEncContext*s, AVCodecContext *avctx);
-void ff_xvmc_field_end(MpegEncContext *s);
-void ff_xvmc_decode_mb(MpegEncContext *s);
-
-#endif /* FF_API_XVMC */
 
 #endif /* AVCODEC_XVMC_INTERNAL_H */
diff --git a/libavcodec/xwd.h b/libavcodec/xwd.h
index f41e2cd..d046046 100644
--- a/libavcodec/xwd.h
+++ b/libavcodec/xwd.h
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/xwddec.c b/libavcodec/xwddec.c
index 1c9874a..64cd841 100644
--- a/libavcodec/xwddec.c
+++ b/libavcodec/xwddec.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -147,7 +147,7 @@ static int xwd_decode_frame(AVCodecContext *avctx, void *data,
     }
 
     if (pixformat != XWD_Z_PIXMAP) {
-        av_log(avctx, AV_LOG_ERROR, "pixmap format %"PRIu32" unsupported\n", pixformat);
+        avpriv_report_missing_feature(avctx, "Pixmap format %"PRIu32, pixformat);
         return AVERROR_PATCHWELCOME;
     }
 
@@ -155,10 +155,13 @@ static int xwd_decode_frame(AVCodecContext *avctx, void *data,
     switch (vclass) {
     case XWD_STATIC_GRAY:
     case XWD_GRAY_SCALE:
-        if (bpp != 1)
+        if (bpp != 1 && bpp != 8)
             return AVERROR_INVALIDDATA;
-        if (pixdepth == 1)
+        if (pixdepth == 1) {
             avctx->pix_fmt = AV_PIX_FMT_MONOWHITE;
+        } else if (pixdepth == 8) {
+            avctx->pix_fmt = AV_PIX_FMT_GRAY8;
+        }
         break;
     case XWD_STATIC_COLOR:
     case XWD_PSEUDO_COLOR:
@@ -204,10 +207,8 @@ static int xwd_decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR_PATCHWELCOME;
     }
 
-    if ((ret = ff_get_buffer(avctx, p, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_get_buffer(avctx, p, 0)) < 0)
         return ret;
-    }
 
     p->key_frame = 1;
     p->pict_type = AV_PICTURE_TYPE_I;
diff --git a/libavcodec/xwdenc.c b/libavcodec/xwdenc.c
index e346b5c..43bca89 100644
--- a/libavcodec/xwdenc.c
+++ b/libavcodec/xwdenc.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2012 Paul B Mahol
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,7 +31,7 @@
 #define WINDOW_NAME_SIZE    11
 
 static int xwd_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
-                            const AVFrame *p, int *got_packet)
+                            const AVFrame *pict, int *got_packet)
 {
     enum AVPixelFormat pix_fmt = avctx->pix_fmt;
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
@@ -40,6 +40,7 @@ static int xwd_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     uint32_t header_size;
     int i, out_size, ret;
     uint8_t *ptr, *buf;
+    AVFrame * const p = (AVFrame *)pict;
 
     pixdepth = av_get_bits_per_pixel(desc);
     if (desc->flags & AV_PIX_FMT_FLAG_BE)
@@ -124,6 +125,11 @@ static int xwd_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         bpad     = 8;
         ncolors  = 256;
         break;
+    case AV_PIX_FMT_GRAY8:
+        bpp      = 8;
+        bpad     = 8;
+        vclass   = XWD_STATIC_GRAY;
+        break;
     case AV_PIX_FMT_MONOWHITE:
         be       = 1;
         bitorder = 1;
@@ -132,7 +138,7 @@ static int xwd_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
         vclass   = XWD_STATIC_GRAY;
         break;
     default:
-        av_log(avctx, AV_LOG_INFO, "unsupported pixel format\n");
+        av_log(avctx, AV_LOG_ERROR, "unsupported pixel format\n");
         return AVERROR(EINVAL);
     }
 
@@ -140,18 +146,12 @@ static int xwd_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     header_size = XWD_HEADER_SIZE + WINDOW_NAME_SIZE;
     out_size    = header_size + ncolors * XWD_CMAP_SIZE + avctx->height * lsize;
 
-    if ((ret = ff_alloc_packet(pkt, out_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "output buffer too small\n");
+    if ((ret = ff_alloc_packet2(avctx, pkt, out_size, 0)) < 0)
         return ret;
-    }
     buf = pkt->data;
 
-#if FF_API_CODED_FRAME
-FF_DISABLE_DEPRECATION_WARNINGS
-    avctx->coded_frame->key_frame = 1;
-    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
+    p->key_frame = 1;
+    p->pict_type = AV_PICTURE_TYPE_I;
 
     bytestream_put_be32(&buf, header_size);
     bytestream_put_be32(&buf, XWD_VERSION);   // file version
@@ -233,6 +233,7 @@ AVCodec ff_xwd_encoder = {
                                                  AV_PIX_FMT_RGB4_BYTE,
                                                  AV_PIX_FMT_BGR4_BYTE,
                                                  AV_PIX_FMT_PAL8,
+                                                 AV_PIX_FMT_GRAY8,
                                                  AV_PIX_FMT_MONOWHITE,
                                                  AV_PIX_FMT_NONE },
 };
diff --git a/libavcodec/xxan.c b/libavcodec/xxan.c
index 6369b7d..5485296 100644
--- a/libavcodec/xxan.c
+++ b/libavcodec/xxan.c
@@ -3,20 +3,20 @@
  * Copyright (C) 2011 Konstantin Shishkov
  * based on work by Mike Melanson
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -73,7 +73,7 @@ static av_cold int xan_decode_init(AVCodecContext *avctx)
         return AVERROR(ENOMEM);
     s->scratch_buffer = av_malloc(s->buffer_size + 130);
     if (!s->scratch_buffer) {
-        av_freep(&s->y_buffer);
+        xan_decode_end(avctx);
         return AVERROR(ENOMEM);
     }
 
@@ -224,16 +224,18 @@ static int xan_decode_chroma(AVCodecContext *avctx, unsigned chroma_off)
     if (mode) {
         for (j = 0; j < avctx->height >> 1; j++) {
             for (i = 0; i < avctx->width >> 1; i++) {
+                if (src_end - src < 1)
+                    return 0;
                 val = *src++;
-                if (val && val < table_size) {
+                if (val) {
+                    if (val >= table_size)
+                        return AVERROR_INVALIDDATA;
                     val  = AV_RL16(table + (val << 1));
                     uval = (val >> 3) & 0xF8;
                     vval = (val >> 8) & 0xF8;
                     U[i] = uval | (uval >> 5);
                     V[i] = vval | (vval >> 5);
                 }
-                if (src == src_end)
-                    return 0;
             }
             U += s->pic->linesize[1];
             V += s->pic->linesize[2];
@@ -248,8 +250,12 @@ static int xan_decode_chroma(AVCodecContext *avctx, unsigned chroma_off)
 
         for (j = 0; j < avctx->height >> 2; j++) {
             for (i = 0; i < avctx->width >> 1; i += 2) {
+                if (src_end - src < 1)
+                    return 0;
                 val = *src++;
-                if (val && val < table_size) {
+                if (val) {
+                    if (val >= table_size)
+                        return AVERROR_INVALIDDATA;
                     val  = AV_RL16(table + (val << 1));
                     uval = (val >> 3) & 0xF8;
                     vval = (val >> 8) & 0xF8;
@@ -288,7 +294,7 @@ static int xan_decode_frame_type0(AVCodecContext *avctx)
     if ((ret = xan_decode_chroma(avctx, chroma_off)) != 0)
         return ret;
 
-    if (corr_off >= (s->gb.buffer_end - s->gb.buffer_start)) {
+    if (corr_off >= bytestream2_size(&s->gb)) {
         av_log(avctx, AV_LOG_WARNING, "Ignoring invalid correction block position\n");
         corr_off = 0;
     }
@@ -333,6 +339,9 @@ static int xan_decode_frame_type0(AVCodecContext *avctx)
         dec_size = xan_unpack(s, s->scratch_buffer, s->buffer_size / 2);
         if (dec_size < 0)
             dec_size = 0;
+        else
+            dec_size = FFMIN(dec_size, s->buffer_size/2 - 1);
+
         for (i = 0; i < dec_size; i++)
             s->y_buffer[i*2+1] = (s->y_buffer[i*2+1] + (s->scratch_buffer[i] << 1)) & 0x3F;
     }
@@ -402,10 +411,8 @@ static int xan_decode_frame(AVCodecContext *avctx,
     int ftype;
     int ret;
 
-    if ((ret = ff_reget_buffer(avctx, s->pic))) {
-        av_log(s->avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, s->pic)) < 0)
         return ret;
-    }
 
     bytestream2_init(&s->gb, avpkt->data, avpkt->size);
     ftype = bytestream2_get_le32(&s->gb);
diff --git a/libavcodec/y41pdec.c b/libavcodec/y41pdec.c
new file mode 100644
index 0000000..1b177d4
--- /dev/null
+++ b/libavcodec/y41pdec.c
@@ -0,0 +1,92 @@
+/*
+ * y41p decoder
+ *
+ * Copyright (c) 2012 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+
+static av_cold int y41p_decode_init(AVCodecContext *avctx)
+{   /* Decoder init: fixes the output pixel format and raw sample depth; always returns 0. */
+    avctx->pix_fmt             = AV_PIX_FMT_YUV411P;
+    avctx->bits_per_raw_sample = 12;
+
+    if (avctx->width & 7) { /* warn only: init does not fail for such widths */
+        av_log(avctx, AV_LOG_WARNING, "y41p requires width to be divisible by 8.\n");
+    }
+
+    return 0;
+}
+
+static int y41p_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_frame, AVPacket *avpkt)
+{   /* Unpack one y41p packet into the planar frame passed in via data. */
+    AVFrame *pic = data;        /* output frame; buffers are obtained with ff_get_buffer() */
+    uint8_t *src = avpkt->data; /* read cursor; advances 12 bytes per 8-pixel group */
+    uint8_t *y, *u, *v;         /* write cursors into the current row of each plane */
+    int i, j, ret;
+    /* Rows are read in whole 8-pixel groups, so check against the width rounded up to 8. */
+    if (avpkt->size < 3LL * avctx->height * FFALIGN(avctx->width, 8) / 2) {
+        av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
+        return AVERROR(EINVAL); /* packet too short for height x aligned width */
+    }
+
+    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
+        return ret; /* propagate the allocation error unchanged */
+
+    pic->key_frame = 1;
+    pic->pict_type = AV_PICTURE_TYPE_I;
+
+    for (i = avctx->height - 1; i >= 0 ; i--) { /* first packed row lands in the last picture row */
+        y = &pic->data[0][i * pic->linesize[0]];
+        u = &pic->data[1][i * pic->linesize[1]];
+        v = &pic->data[2][i * pic->linesize[2]];
+        for (j = 0; j < avctx->width; j += 8) { /* one 12-byte group yields 8 Y, 2 U, 2 V */
+            *(u++) = *src++; /* bytes 0-3: U Y V Y */
+            *(y++) = *src++;
+            *(v++) = *src++;
+            *(y++) = *src++;
+
+            *(u++) = *src++; /* bytes 4-7: U Y V Y */
+            *(y++) = *src++;
+            *(v++) = *src++;
+            *(y++) = *src++;
+
+            *(y++) = *src++; /* bytes 8-11: four more luma samples */
+            *(y++) = *src++;
+            *(y++) = *src++;
+            *(y++) = *src++;
+        }
+    }
+
+    *got_frame = 1;
+
+    return avpkt->size; /* the whole packet is reported as consumed */
+}
+
+AVCodec ff_y41p_decoder = { /* codec registration entry for the y41p decoder */
+    .name         = "y41p",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed YUV 4:1:1 12-bit"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_Y41P,
+    .init         = y41p_decode_init,  /* sets pix_fmt and bits_per_raw_sample */
+    .decode       = y41p_decode_frame, /* unpacks one packet into one frame */
+    .capabilities = AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/y41penc.c b/libavcodec/y41penc.c
new file mode 100644
index 0000000..94acc34
--- /dev/null
+++ b/libavcodec/y41penc.c
@@ -0,0 +1,93 @@
+/*
+ * y41p encoder
+ *
+ * Copyright (c) 2012 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+
+/* Encoder init: accept only widths divisible by 8, then record 12 bits per coded sample. */
+static av_cold int y41p_encode_init(AVCodecContext *avctx)
+{
+    if (!(avctx->width & 7)) {
+        avctx->bits_per_coded_sample = 12;
+        return 0;
+    }
+
+    av_log(avctx, AV_LOG_ERROR, "y41p requires width to be divisible by 8.\n");
+    return AVERROR_INVALIDDATA;
+}
+
+/* Pack a YUV411P frame into y41p (12 bytes per 8 pixels: U Y V Y  U Y V Y  Y Y Y Y). */
+static int y41p_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                             const AVFrame *pic, int *got_packet)
+{
+    uint8_t *dst;
+    uint8_t *y, *u, *v;
+    int i, j, ret;
+
+    /* 12 bits per pixel; 64-bit integer math keeps the size exact without going through double. */
+    if ((ret = ff_alloc_packet2(avctx, pkt, 3LL * avctx->width * avctx->height / 2, 0)) < 0)
+        return ret;
+
+    dst = pkt->data;
+    /* Picture rows are emitted last-to-first, i.e. bottom-up. */
+    for (i = avctx->height - 1; i >= 0; i--) {
+        y = &pic->data[0][i * pic->linesize[0]];
+        u = &pic->data[1][i * pic->linesize[1]];
+        v = &pic->data[2][i * pic->linesize[2]];
+        for (j = 0; j < avctx->width; j += 8) {
+            *(dst++) = *(u++);
+            *(dst++) = *(y++);
+            *(dst++) = *(v++);
+            *(dst++) = *(y++);
+            *(dst++) = *(u++);
+            *(dst++) = *(y++);
+            *(dst++) = *(v++);
+            *(dst++) = *(y++);
+            *(dst++) = *(y++);
+            *(dst++) = *(y++);
+            *(dst++) = *(y++);
+            *(dst++) = *(y++);
+        }
+    }
+
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+    return 0;
+}
+
+static av_cold int y41p_encode_close(AVCodecContext *avctx)
+{
+    return 0; /* no-op: the visible encoder code allocates nothing that needs releasing */
+}
+
+AVCodec ff_y41p_encoder = { /* y41p encoder descriptor: binds the init, encode and close callbacks defined in this file */
+    .name         = "y41p",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed YUV 4:1:1 12-bit"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_Y41P,
+    .init         = y41p_encode_init,
+    .encode2      = y41p_encode_frame,
+    .close        = y41p_encode_close,
+    .pix_fmts     = (const enum AVPixelFormat[]) { AV_PIX_FMT_YUV411P, /* the only input format listed */
+                                                 AV_PIX_FMT_NONE },
+    .capabilities = AV_CODEC_CAP_INTRA_ONLY,
+};
diff --git a/libavcodec/yop.c b/libavcodec/yop.c
index 5de4ac8..32cfea2 100644
--- a/libavcodec/yop.c
+++ b/libavcodec/yop.c
@@ -5,20 +5,20 @@
  * derived from the code by
  * Copyright (C) 2009 Thomas P. Higdon <thomas.p.higdon@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,6 +33,7 @@
 
 typedef struct YopDecContext {
     AVCodecContext *avctx;
+    AVFrame *frame;
 
     int num_pal_colors;
     int first_color[2];
@@ -80,6 +81,15 @@ static const int8_t motion_vector[16][2] =
      { 4, -2}, {-2,  0},
     };
 
+static av_cold int yop_decode_close(AVCodecContext *avctx)
+{
+    YopDecContext *s = avctx->priv_data;
+    /* Release the persistent frame that yop_decode_init() allocates with av_frame_alloc(). */
+    av_frame_free(&s->frame);
+
+    return 0;
+}
+
 static av_cold int yop_decode_init(AVCodecContext *avctx)
 {
     YopDecContext *s = avctx->priv_data;
@@ -105,10 +115,14 @@ static av_cold int yop_decode_init(AVCodecContext *avctx)
     if (s->num_pal_colors + s->first_color[0] > 256 ||
         s->num_pal_colors + s->first_color[1] > 256) {
         av_log(avctx, AV_LOG_ERROR,
-               "YOP: palette parameters invalid, header probably corrupt\n");
+               "Palette parameters invalid, header probably corrupt\n");
         return AVERROR_INVALIDDATA;
     }
 
+    s->frame = av_frame_alloc();
+    if (!s->frame)
+        return AVERROR(ENOMEM);
+
     return 0;
 }
 
@@ -146,8 +160,7 @@ static int yop_copy_previous_block(YopDecContext *s, int linesize, int copy_tag)
     bufptr = s->dstptr + motion_vector[copy_tag][0] +
              linesize * motion_vector[copy_tag][1];
     if (bufptr < s->dstbuf) {
-        av_log(s->avctx, AV_LOG_ERROR,
-               "YOP: cannot decode, file probably corrupt\n");
+        av_log(s->avctx, AV_LOG_ERROR, "File probably corrupt\n");
         return AVERROR_INVALIDDATA;
     }
 
@@ -181,7 +194,7 @@ static int yop_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                             AVPacket *avpkt)
 {
     YopDecContext *s = avctx->priv_data;
-    AVFrame *frame = data;
+    AVFrame *frame = s->frame;
     int tag, firstcolor, is_odd_frame;
     int ret, i, x, y;
     uint32_t *palette;
@@ -191,11 +204,8 @@ static int yop_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         return AVERROR_INVALIDDATA;
     }
 
-    ret = ff_get_buffer(avctx, frame, 0);
-    if (ret < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    if ((ret = ff_reget_buffer(avctx, frame)) < 0)
         return ret;
-    }
 
     if (!avctx->frame_number)
         memset(frame->data[1], 0, AVPALETTE_SIZE);
@@ -207,13 +217,20 @@ static int yop_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
     s->low_nibble = NULL;
 
     is_odd_frame = avpkt->data[0];
+    if(is_odd_frame>1){
+        av_log(avctx, AV_LOG_ERROR, "frame is too odd %d\n", is_odd_frame);
+        return AVERROR_INVALIDDATA;
+    }
     firstcolor   = s->first_color[is_odd_frame];
     palette      = (uint32_t *)frame->data[1];
 
-    for (i = 0; i < s->num_pal_colors; i++, s->srcptr += 3)
+    for (i = 0; i < s->num_pal_colors; i++, s->srcptr += 3) {
         palette[i + firstcolor] = (s->srcptr[0] << 18) |
                                   (s->srcptr[1] << 10) |
                                   (s->srcptr[2] << 2);
+        palette[i + firstcolor] |= 0xFFU << 24 |
+                                   (palette[i + firstcolor] >> 6) & 0x30303;
+    }
 
     frame->palette_has_changed = 1;
 
@@ -241,6 +258,9 @@ static int yop_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         s->dstptr += 2*frame->linesize[0] - x;
     }
 
+    if ((ret = av_frame_ref(data, s->frame)) < 0)
+        return ret;
+
     *got_frame = 1;
     return avpkt->size;
 }
@@ -252,6 +272,6 @@ AVCodec ff_yop_decoder = {
     .id             = AV_CODEC_ID_YOP,
     .priv_data_size = sizeof(YopDecContext),
     .init           = yop_decode_init,
+    .close          = yop_decode_close,
     .decode         = yop_decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
 };
diff --git a/libavcodec/yuv4dec.c b/libavcodec/yuv4dec.c
new file mode 100644
index 0000000..f89f62d
--- /dev/null
+++ b/libavcodec/yuv4dec.c
@@ -0,0 +1,84 @@
+/*
+ * libquicktime yuv4 decoder
+ *
+ * Copyright (c) 2011 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+
+static av_cold int yuv4_decode_init(AVCodecContext *avctx)
+{
+    avctx->pix_fmt = AV_PIX_FMT_YUV420P; /* yuv4_decode_frame() fills the Y, U and V planes of this format */
+
+    return 0;
+}
+
+/* Unpack libquicktime yuv4: every 6-byte group holds U, V and one 2x2 luma block. */
+static int yuv4_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_frame, AVPacket *avpkt)
+{
+    AVFrame *pic = data;
+    const uint8_t *src = avpkt->data;
+    const int half_w = avctx->width  + 1 >> 1;
+    const int half_h = avctx->height + 1 >> 1;
+    uint8_t *y, *u, *v;
+    int i, j, ret;
+
+    if (avpkt->size < 6 * half_w * half_h) {
+        av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
+        return AVERROR(EINVAL);
+    }
+
+    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
+        return ret;
+
+    pic->key_frame = 1;
+    pic->pict_type = AV_PICTURE_TYPE_I;
+    y = pic->data[0];
+    u = pic->data[1];
+    v = pic->data[2];
+    for (i = 0; i < half_h; i++) {
+        uint8_t *top = y, *bottom = y + pic->linesize[0];
+        for (j = 0; j < half_w; j++) {
+            u[j]      = *src++ ^ 0x80; /* chroma bytes are stored with the top bit flipped */
+            v[j]      = *src++ ^ 0x80;
+            *top++    = *src++;
+            *top++    = *src++;
+            *bottom++ = *src++;
+            *bottom++ = *src++;
+        }
+        y += 2 * pic->linesize[0];
+        u +=     pic->linesize[1];
+        v +=     pic->linesize[2];
+    }
+
+    *got_frame = 1;
+    return avpkt->size;
+}
+
+AVCodec ff_yuv4_decoder = { /* yuv4 decoder descriptor: binds the init and decode callbacks defined in this file */
+    .name         = "yuv4",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed packed 4:2:0"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_YUV4,
+    .init         = yuv4_decode_init,
+    .decode       = yuv4_decode_frame,
+    .capabilities = AV_CODEC_CAP_DR1,
+};
diff --git a/libavcodec/yuv4enc.c b/libavcodec/yuv4enc.c
new file mode 100644
index 0000000..cc8846d
--- /dev/null
+++ b/libavcodec/yuv4enc.c
@@ -0,0 +1,80 @@
+/*
+ * libquicktime yuv4 encoder
+ *
+ * Copyright (c) 2011 Carl Eugen Hoyos
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "internal.h"
+
+static av_cold int yuv4_encode_init(AVCodecContext *avctx)
+{
+    return 0; /* no encoder state is set up here */
+}
+
+/* Pack YUV420P into libquicktime yuv4: U, V, then one 2x2 luma block per 6-byte group. */
+static int yuv4_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                             const AVFrame *pic, int *got_packet)
+{
+    const int half_w = avctx->width  + 1 >> 1;
+    const int half_h = avctx->height + 1 >> 1;
+    uint8_t *y = pic->data[0], *u = pic->data[1], *v = pic->data[2];
+    uint8_t *dst;
+    int i, j, ret;
+
+    if ((ret = ff_alloc_packet2(avctx, pkt, 6 * half_w * half_h, 0)) < 0)
+        return ret;
+    dst = pkt->data;
+
+    for (i = 0; i < half_h; i++) {
+        const uint8_t *top = y, *bottom = y + pic->linesize[0];
+        for (j = 0; j < half_w; j++) {
+            *dst++ = u[j] ^ 0x80; /* chroma bytes are written with the top bit flipped */
+            *dst++ = v[j] ^ 0x80;
+            *dst++ = *top++;
+            *dst++ = *top++;
+            *dst++ = *bottom++;
+            *dst++ = *bottom++;
+        }
+        y += 2 * pic->linesize[0];
+        u +=     pic->linesize[1];
+        v +=     pic->linesize[2];
+    }
+
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+    return 0;
+}
+
+static av_cold int yuv4_encode_close(AVCodecContext *avctx)
+{
+    return 0; /* no-op: yuv4_encode_init() sets up nothing that needs releasing */
+}
+
+AVCodec ff_yuv4_encoder = { /* yuv4 encoder descriptor: binds the init, encode and close callbacks defined in this file */
+    .name         = "yuv4",
+    .long_name    = NULL_IF_CONFIG_SMALL("Uncompressed packed 4:2:0"),
+    .type         = AVMEDIA_TYPE_VIDEO,
+    .id           = AV_CODEC_ID_YUV4,
+    .init         = yuv4_encode_init,
+    .encode2      = yuv4_encode_frame,
+    .close        = yuv4_encode_close,
+    .pix_fmts     = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE }, /* the only input format listed */
+    .capabilities = AV_CODEC_CAP_INTRA_ONLY,
+};
diff --git a/libavcodec/zerocodec.c b/libavcodec/zerocodec.c
index 1419e84..55a9a91 100644
--- a/libavcodec/zerocodec.c
+++ b/libavcodec/zerocodec.c
@@ -59,10 +59,8 @@ static int zerocodec_decode_frame(AVCodecContext *avctx, void *data,
         return AVERROR_INVALIDDATA;
     }
 
-    if (ff_get_buffer(avctx, pic, AV_GET_BUFFER_FLAG_REF) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Could not allocate buffer.\n");
-        return AVERROR(ENOMEM);
-    }
+    if ((ret = ff_get_buffer(avctx, pic, AV_GET_BUFFER_FLAG_REF)) < 0)
+        return ret;
 
     zstream->next_in  = avpkt->data;
     zstream->avail_in = avpkt->size;
diff --git a/libavcodec/zmbv.c b/libavcodec/zmbv.c
index f945aea..f126515 100644
--- a/libavcodec/zmbv.c
+++ b/libavcodec/zmbv.c
@@ -2,20 +2,20 @@
  * Zip Motion Blocks Video (ZMBV) decoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,7 @@
 #include <stdlib.h>
 
 #include "libavutil/common.h"
+#include "libavutil/imgutils.h"
 #include "libavutil/intreadwrite.h"
 #include "avcodec.h"
 #include "internal.h"
@@ -64,6 +65,7 @@ typedef struct ZmbvContext {
     int fmt;
     int comp;
     int flags;
+    int stride;
     int bw, bh, bx, by;
     int decomp_len;
     z_stream zstream;
@@ -143,7 +145,7 @@ static int zmbv_decode_xor_8(ZmbvContext *c)
         prev += c->width * c->bh;
     }
     if (src - c->decomp_buf != c->decomp_len)
-        av_log(c->avctx, AV_LOG_ERROR, "Used %ti of %i bytes\n",
+        av_log(c->avctx, AV_LOG_ERROR, "Used %"PTRDIFF_SPECIFIER" of %i bytes\n",
                src-c->decomp_buf, c->decomp_len);
     return 0;
 }
@@ -217,7 +219,7 @@ static int zmbv_decode_xor_16(ZmbvContext *c)
         prev += c->width * c->bh;
     }
     if (src - c->decomp_buf != c->decomp_len)
-        av_log(c->avctx, AV_LOG_ERROR, "Used %ti of %i bytes\n",
+        av_log(c->avctx, AV_LOG_ERROR, "Used %"PTRDIFF_SPECIFIER" of %i bytes\n",
                src-c->decomp_buf, c->decomp_len);
     return 0;
 }
@@ -375,7 +377,7 @@ static int zmbv_decode_xor_32(ZmbvContext *c)
         prev   += c->width * c->bh;
     }
     if (src - c->decomp_buf != c->decomp_len)
-        av_log(c->avctx, AV_LOG_ERROR, "Used %ti of %i bytes\n",
+        av_log(c->avctx, AV_LOG_ERROR, "Used %"PTRDIFF_SPECIFIER" of %i bytes\n",
                src-c->decomp_buf, c->decomp_len);
     return 0;
 }
@@ -406,17 +408,18 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     int zret = Z_OK; // Zlib return code
     int len = buf_size;
     int hi_ver, lo_ver, ret;
-    uint8_t *tmp;
-
-    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return ret;
-    }
 
     /* parse header */
+    if (len < 1)
+        return AVERROR_INVALIDDATA;
     c->flags = buf[0];
     buf++; len--;
     if (c->flags & ZMBV_KEYFRAME) {
+        void *decode_intra = NULL;
+        c->decode_intra= NULL;
+
+        if (len < 6)
+            return AVERROR_INVALIDDATA;
         hi_ver = buf[0];
         lo_ver = buf[1];
         c->comp = buf[2];
@@ -447,29 +450,39 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
         switch (c->fmt) {
         case ZMBV_FMT_8BPP:
             c->bpp = 8;
-            c->decode_intra = zmbv_decode_intra;
+            decode_intra = zmbv_decode_intra;
             c->decode_xor = zmbv_decode_xor_8;
+            avctx->pix_fmt = AV_PIX_FMT_PAL8;
+            c->stride = c->width;
             break;
         case ZMBV_FMT_15BPP:
         case ZMBV_FMT_16BPP:
             c->bpp = 16;
-            c->decode_intra = zmbv_decode_intra;
+            decode_intra = zmbv_decode_intra;
             c->decode_xor = zmbv_decode_xor_16;
+            if (c->fmt == ZMBV_FMT_15BPP)
+                avctx->pix_fmt = AV_PIX_FMT_RGB555LE;
+            else
+                avctx->pix_fmt = AV_PIX_FMT_RGB565LE;
+            c->stride = c->width * 2;
             break;
 #ifdef ZMBV_ENABLE_24BPP
         case ZMBV_FMT_24BPP:
             c->bpp = 24;
-            c->decode_intra = zmbv_decode_intra;
+            decode_intra = zmbv_decode_intra;
             c->decode_xor = zmbv_decode_xor_24;
+            avctx->pix_fmt = AV_PIX_FMT_RGB24;
+            c->stride = c->width * 3;
             break;
 #endif //ZMBV_ENABLE_24BPP
         case ZMBV_FMT_32BPP:
             c->bpp = 32;
-            c->decode_intra = zmbv_decode_intra;
+            decode_intra = zmbv_decode_intra;
             c->decode_xor = zmbv_decode_xor_32;
+            avctx->pix_fmt = AV_PIX_FMT_BGR0;
+            c->stride = c->width * 4;
             break;
         default:
-            c->decode_intra = NULL;
             c->decode_xor = NULL;
             avpriv_request_sample(avctx, "Format %i", c->fmt);
             return AVERROR_PATCHWELCOME;
@@ -481,16 +494,15 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
             return AVERROR_UNKNOWN;
         }
 
-        tmp = av_realloc(c->cur,  avctx->width * avctx->height * (c->bpp / 8));
-        if (!tmp)
-            return AVERROR(ENOMEM);
-        c->cur = tmp;
-        tmp = av_realloc(c->prev, avctx->width * avctx->height * (c->bpp / 8));
-        if (!tmp)
+        c->cur  = av_realloc_f(c->cur, avctx->width * avctx->height,  (c->bpp / 8));
+        c->prev = av_realloc_f(c->prev, avctx->width * avctx->height,  (c->bpp / 8));
+        c->bx = (c->width + c->bw - 1) / c->bw;
+        c->by = (c->height+ c->bh - 1) / c->bh;
+        if (!c->cur || !c->prev)
             return AVERROR(ENOMEM);
-        c->prev = tmp;
-        c->bx   = (c->width  + c->bw - 1) / c->bw;
-        c->by   = (c->height + c->bh - 1) / c->bh;
+        memset(c->cur, 0, avctx->width * avctx->height * (c->bpp / 8));
+        memset(c->prev, 0, avctx->width * avctx->height * (c->bpp / 8));
+        c->decode_intra= decode_intra;
     }
 
     if (!c->decode_intra) {
@@ -498,6 +510,9 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
         return AVERROR_INVALIDDATA;
     }
 
+    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+        return ret;
+
     if (c->comp == 0) { // uncompressed data
         if (c->decomp_size < len) {
             av_log(avctx, AV_LOG_ERROR, "Buffer too small\n");
@@ -506,7 +521,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
         memcpy(c->decomp_buf, buf, len);
     } else { // ZLIB-compressed data
         c->zstream.total_in = c->zstream.total_out = 0;
-        c->zstream.next_in = buf;
+        c->zstream.next_in = (uint8_t*)buf;
         c->zstream.avail_in = len;
         c->zstream.next_out = c->decomp_buf;
         c->zstream.avail_out = c->decomp_size;
@@ -531,64 +546,22 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPac
     /* update frames */
     {
         uint8_t *out, *src;
-        int i, j;
+        int j;
 
         out = frame->data[0];
         src = c->cur;
         switch (c->fmt) {
         case ZMBV_FMT_8BPP:
-            for (j = 0; j < c->height; j++) {
-                for (i = 0; i < c->width; i++) {
-                    out[i * 3 + 0] = c->pal[(*src) * 3 + 0];
-                    out[i * 3 + 1] = c->pal[(*src) * 3 + 1];
-                    out[i * 3 + 2] = c->pal[(*src) * 3 + 2];
-                    src++;
-                }
-                out += frame->linesize[0];
-            }
-            break;
+            for (j = 0; j < 256; j++)
+                AV_WN32(&frame->data[1][j * 4], 0xFFU << 24 | AV_RB24(&c->pal[j * 3]));
         case ZMBV_FMT_15BPP:
-            for (j = 0; j < c->height; j++) {
-                for (i = 0; i < c->width; i++) {
-                    uint16_t tmp = AV_RL16(src);
-                    src += 2;
-                    out[i * 3 + 0] = (tmp & 0x7C00) >> 7;
-                    out[i * 3 + 1] = (tmp & 0x03E0) >> 2;
-                    out[i * 3 + 2] = (tmp & 0x001F) << 3;
-                }
-                out += frame->linesize[0];
-            }
-            break;
         case ZMBV_FMT_16BPP:
-            for (j = 0; j < c->height; j++) {
-                for (i = 0; i < c->width; i++) {
-                    uint16_t tmp = AV_RL16(src);
-                    src += 2;
-                    out[i * 3 + 0] = (tmp & 0xF800) >> 8;
-                    out[i * 3 + 1] = (tmp & 0x07E0) >> 3;
-                    out[i * 3 + 2] = (tmp & 0x001F) << 3;
-                }
-                out += frame->linesize[0];
-            }
-            break;
 #ifdef ZMBV_ENABLE_24BPP
         case ZMBV_FMT_24BPP:
-            for (j = 0; j < c->height; j++) {
-                memcpy(out, src, c->width * 3);
-                src += c->width * 3;
-                out += frame->linesize[0];
-            }
-            break;
-#endif //ZMBV_ENABLE_24BPP
+#endif
         case ZMBV_FMT_32BPP:
-            for (j = 0; j < c->height; j++) {
-                for (i = 0; i < c->width; i++) {
-                    uint32_t tmp = AV_RL32(src);
-                    src += 4;
-                    AV_WB24(out+(i*3), tmp);
-                }
-                out += frame->linesize[0];
-            }
+            av_image_copy_plane(out, frame->linesize[0], src, c->stride,
+                                c->stride, c->height);
             break;
         default:
             av_log(avctx, AV_LOG_ERROR, "Cannot handle format %i\n", c->fmt);
@@ -616,12 +589,11 @@ static av_cold int decode_init(AVCodecContext *avctx)
     // Needed if zlib unused or init aborted before inflateInit
     memset(&c->zstream, 0, sizeof(z_stream));
 
-    avctx->pix_fmt = AV_PIX_FMT_RGB24;
     c->decomp_size = (avctx->width + 255) * 4 * (avctx->height + 64);
 
     /* Allocate decompression buffer */
     if (c->decomp_size) {
-        if (!(c->decomp_buf = av_malloc(c->decomp_size))) {
+        if (!(c->decomp_buf = av_mallocz(c->decomp_size))) {
             av_log(avctx, AV_LOG_ERROR,
                    "Can't allocate decompression buffer.\n");
             return AVERROR(ENOMEM);
diff --git a/libavcodec/zmbvenc.c b/libavcodec/zmbvenc.c
index 4436bb3..e832bed 100644
--- a/libavcodec/zmbvenc.c
+++ b/libavcodec/zmbvenc.c
@@ -2,20 +2,20 @@
  * Zip Motion Blocks Video (ZMBV) encoder
  * Copyright (c) 2006 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -231,10 +231,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
     }
 
     pkt_size = c->zstream.total_out + 1 + 6*keyframe;
-    if ((ret = ff_alloc_packet(pkt, pkt_size)) < 0) {
-        av_log(avctx, AV_LOG_ERROR, "Error getting packet of size %d.\n", pkt_size);
+    if ((ret = ff_alloc_packet2(avctx, pkt, pkt_size, 0)) < 0)
         return ret;
-    }
     buf = pkt->data;
 
     fl = (keyframe ? ZMBV_KEYFRAME : 0) | (chpal ? ZMBV_DELTAPAL : 0);
@@ -279,7 +277,7 @@ static av_cold int encode_init(AVCodecContext *avctx)
     int lvl = 9;
 
     for(i=1; i<256; i++)
-        score_tab[i]= -i * log(i/(double)(ZMBV_BLOCK*ZMBV_BLOCK)) * (256/M_LN2);
+        score_tab[i]= -i * log2(i/(double)(ZMBV_BLOCK*ZMBV_BLOCK)) * 256;
 
     c->avctx = avctx;